From 009e972f79264ec4330f1da38644f07003e6014f Mon Sep 17 00:00:00 2001
From: sangchengmeng <sangchengmeng@mail.ustc.edu.cn>
Date: Wed, 7 May 2025 17:34:28 +0800
Subject: [PATCH 01/14] add_lightllm_kernel

---
 .../multimodal_model_quickstart.rst           |  11 +
 lightllm-kernel/CMakeLists.txt                |  65 ++++
 lightllm-kernel/Makefile                      |   9 +
 lightllm-kernel/README-CH.md                  |  42 +++
 lightllm-kernel/README.md                     |  39 +++
 lightllm-kernel/csrc/cuda_compat.h            |  49 +++
 lightllm-kernel/csrc/moe/grouped_topk.cu      | 319 ++++++++++++++++++
 .../csrc/moe/grouped_topk_interface.cpp       |  48 +++
 lightllm-kernel/csrc/ops_bindings.cpp         |  56 +++
 lightllm-kernel/include/ops_common.h          |   4 +
 lightllm-kernel/lightllm_kernel/__init__.py   |   3 +
 .../lightllm_kernel/ops/__init__.py           |  44 +++
 .../lightllm_kernel/ops/attention.py          |   0
 lightllm-kernel/lightllm_kernel/ops/moe.py    |   0
 lightllm-kernel/lightllm_kernel/ops/quant.py  |   0
 lightllm-kernel/setup.py                      |  40 +++
 16 files changed, 729 insertions(+)
 create mode 100644 docs/CN/source/getting_started/multimodal_model_quickstart.rst
 create mode 100644 lightllm-kernel/CMakeLists.txt
 create mode 100644 lightllm-kernel/Makefile
 create mode 100644 lightllm-kernel/README-CH.md
 create mode 100644 lightllm-kernel/README.md
 create mode 100644 lightllm-kernel/csrc/cuda_compat.h
 create mode 100644 lightllm-kernel/csrc/moe/grouped_topk.cu
 create mode 100644 lightllm-kernel/csrc/moe/grouped_topk_interface.cpp
 create mode 100644 lightllm-kernel/csrc/ops_bindings.cpp
 create mode 100644 lightllm-kernel/include/ops_common.h
 create mode 100644 lightllm-kernel/lightllm_kernel/__init__.py
 create mode 100644 lightllm-kernel/lightllm_kernel/ops/__init__.py
 create mode 100644 lightllm-kernel/lightllm_kernel/ops/attention.py
 create mode 100644 lightllm-kernel/lightllm_kernel/ops/moe.py
 create mode 100644 lightllm-kernel/lightllm_kernel/ops/quant.py
 create mode 100644 lightllm-kernel/setup.py

diff --git a/docs/CN/source/getting_started/multimodal_model_quickstart.rst b/docs/CN/source/getting_started/multimodal_model_quickstart.rst
new file mode 100644
index 000000000..cc3eaf724
--- /dev/null
+++ b/docs/CN/source/getting_started/multimodal_model_quickstart.rst
@@ -0,0 +1,11 @@
+多模态模型快速开始
+-------------------------
+
+下载多模态模型（如llava系列、internvl系列、qwen_vl系列等）的模型以后，在终端使用下面的代码部署API服务：
+
+.. code-block:: console
+
+    $ python -m lightllm.server.api_server --model_dir ~/models/llava-7b-chat --use_dynamic_prompt_cache --enable_multimodal
+
+.. note::
+    上面代码中的 ``--model_dir`` 参数需要修改为你本机实际的模型路径。
diff --git a/lightllm-kernel/CMakeLists.txt b/lightllm-kernel/CMakeLists.txt
new file mode 100644
index 000000000..c61ed9dd8
--- /dev/null
+++ b/lightllm-kernel/CMakeLists.txt
@@ -0,0 +1,65 @@
+cmake_minimum_required(VERSION 3.22)
+
+# Default GPU architectures: A100 (80), Ampere (86), Ada/L40s/4090 (89) plus PTX for Hopper
+# and newer (90-virtual). This must precede project(): enabling the CUDA language gives
+# CMAKE_CUDA_ARCHITECTURES a compiler default, so a check placed afterwards never fires.
+if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES AND NOT DEFINED ENV{CUDAARCHS})
+  set(CMAKE_CUDA_ARCHITECTURES 80 86 89 90-virtual)
+endif()
+
+project(lightllm_kernel LANGUAGES CXX CUDA)
+
+# Locate PyTorch and Python. The Interpreter component makes FindPython report
+# Python_SITEARCH / Python_SITELIB, which the install rules below rely on.
+find_package(Torch REQUIRED)
+find_package(Python REQUIRED COMPONENTS Interpreter Development)
+
+# Collect the .cpp/.cu sources under csrc/.
+# CONFIGURE_DEPENDS re-checks the glob at build time so newly added files are picked up.
+file(GLOB_RECURSE SRC_CPP   CONFIGURE_DEPENDS "${PROJECT_SOURCE_DIR}/csrc/*.cpp")
+file(GLOB_RECURSE SRC_CUDA  CONFIGURE_DEPENDS "${PROJECT_SOURCE_DIR}/csrc/*.cu")
+
+# Build the Python extension module (_C.so)
+add_library(_C SHARED ${SRC_CPP} ${SRC_CUDA})
+
+# C++17 (convenient for the dispatch macros)
+target_compile_features(_C PRIVATE cxx_std_17)
+target_include_directories(_C PRIVATE ${TORCH_INCLUDE_DIRS})
+target_link_libraries(_C
+    PRIVATE
+      ${TORCH_LIBRARIES}
+      Python::Python)
+
+# Name the output _C.so (no "lib" prefix).
+# The RPATH entries are relative to the module: its own directory and ../torch/lib.
+set_target_properties(_C PROPERTIES
+    PREFIX ""
+    OUTPUT_NAME "_C"
+    BUILD_RPATH "\$ORIGIN;\$ORIGIN/../torch/lib"
+    INSTALL_RPATH "\$ORIGIN;\$ORIGIN/../torch/lib"
+)
+
+# Install _C.so, the Python package and the C++/CUDA sources into site-packages
+include(GNUInstallDirs)
+
+# 1) site-packages locations come straight from FindPython:
+#      Python_SITEARCH = sysconfig "platlib", Python_SITELIB = sysconfig "purelib".
+#    (execute_process() runs no shell, so a "<<EOF" here-document cannot be used to query them.)
+if(NOT Python_SITEARCH OR NOT Python_SITELIB)
+  message(FATAL_ERROR
+    "FindPython did not report the site-packages directories "
+    "(Python_SITEARCH='${Python_SITEARCH}', Python_SITELIB='${Python_SITELIB}')")
+endif()
+
+# 2) Compiled _C.so goes into the lightllm_kernel package directory
+install(TARGETS _C
+        LIBRARY DESTINATION "${Python_SITEARCH}/lightllm_kernel")
+
+# 3) Python package sources
+install(DIRECTORY "${PROJECT_SOURCE_DIR}/lightllm_kernel"
+        DESTINATION "${Python_SITELIB}")
+
+# 4) csrc/ sources for the JIT fallback. include/ is installed next to it because the
+#    sources reach their shared header through the relative path "../include/ops_common.h".
+install(DIRECTORY "${PROJECT_SOURCE_DIR}/csrc" "${PROJECT_SOURCE_DIR}/include"
+        DESTINATION "${Python_SITELIB}/lightllm_kernel")
diff --git a/lightllm-kernel/Makefile b/lightllm-kernel/Makefile
new file mode 100644
index 000000000..c3fc05d52
--- /dev/null
+++ b/lightllm-kernel/Makefile
@@ -0,0 +1,9 @@
+.PHONY: build clean
+
+build:
+	# 8.0-> A100, 8.6-> A10, 8.9-> L40s/4090, 9.0+PTX-> Hopper
+	TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0+PTX" \
+	python -m pip install -v .
+
+clean:
+	rm -rf build dist *.egg-info
\ No newline at end of file
diff --git a/lightllm-kernel/README-CH.md b/lightllm-kernel/README-CH.md
new file mode 100644
index 000000000..647a594b8
--- /dev/null
+++ b/lightllm-kernel/README-CH.md
@@ -0,0 +1,42 @@
+# LightLLM-Kernel
+
+[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
+
+lightllm-kernel 是大模型推理系统 LightLLM 的 CUDA 算子库。它提供了在大型模型推理过程中所需的一系列自定义 GPU 运算算子，以加速关键步骤的计算。
+
+## 功能列表
+
+| Module       | Description                                                                                     |
+|--------------|-------------------------------------------------------------------------------------------------|
+| **Attention** | Optimized Multi-Head Attention kernels with fused QKV operations and efficient softmax         |
+| **MoE**       | Expert routing and computation kernels for Mixture-of-Experts architectures                    |
+| **Quant**     | Low-precision quantization support (INT8/INT4) for weights and activations                      |
+| **Extensions**| Continuous expansion of optimized operations for emerging model architectures                   |
+
+## 安装方法
+
+lightllm_kernel 提供了静态编译以及JIT（Just-In-Time）动态编译的安装方式。推荐使用静态编译安装以获得最佳性能，同时也支持开发者使用可编辑安装进行开发调试。
+
+### System Requirements
+- NVIDIA GPU with Compute Capability ≥ 7.0 (Volta+)
+- CUDA 11.8 or higher
+- Python 3.8+
+
+### Installation Methods
+
+#### Static Compilation (Recommended)
+```bash
+git clone https://github.com/YourUsername/lightllm_kernel.git
+cd lightllm_kernel
+make build
+# Alternative using pip
+pip install .
+```
+
+## 贡献指南
+欢迎社区开发者为 lightllm_kernel 做出贡献！如果您计划新增自定义算子或改进现有功能，请参考以下指南：
+- 新增算子实现：在 csrc/ 目录下添加您的 CUDA/C++ 源码文件，添加时建议参考现有算子的代码风格和结构。
+- 注册Python接口：在 csrc/ops_bindings.cpp中，将新增的算子通过 PyBind11 或 TORCH_LIBRARY 等机制注册到 Python 接口。
+- 导出算子到Python模块：在 lightllm_kernel/ops/__init__.py 中添加相应的导出代码，使新算子包含在 lightllm_kernel.ops 模块中。
+- 本地测试：开发完成后，请在本地对您的更改进行测试。您可以编译安装新的版本并编写简单的脚本调用新算子，检查其功能和性能是否符合预期。如果项目附带了测试用例，也请运行所有测试确保不引入回归。
+- 
\ No newline at end of file
diff --git a/lightllm-kernel/README.md b/lightllm-kernel/README.md
new file mode 100644
index 000000000..9ce4bce41
--- /dev/null
+++ b/lightllm-kernel/README.md
@@ -0,0 +1,39 @@
+# LightLLM-Kernel
+
+[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
+
+LightLLM-Kernel is a high-performance CUDA kernel library powering the LightLLM inference system. It provides optimized GPU implementations for critical operations in large language model (LLM) inference, delivering significant performance improvements through carefully crafted CUDA kernels.
+
+## Project Overview
+
+LightLLM-Kernel serves as the computational backbone for LightLLM framework, offering:
+- **Custom CUDA Kernels**: Highly optimized implementations for transformer-based model operations
+- **Memory Efficiency**: Reduced memory footprint through advanced quantization techniques
+- **Scalability**: Support for large model architectures including MoE (Mixture-of-Experts) models
+
+## Key Features
+
+### Core Modules
+| Module       | Description                                                                                     |
+|--------------|-------------------------------------------------------------------------------------------------|
+| **Attention** | Optimized Multi-Head Attention kernels with fused QKV operations and efficient softmax         |
+| **MoE**       | Expert routing and computation kernels for Mixture-of-Experts architectures                    |
+| **Quant**     | Low-precision quantization support (INT8/INT4) for weights and activations                      |
+| **Extensions**| Continuous expansion of optimized operations for emerging model architectures                   |
+
+## Installation
+
+### System Requirements
+- NVIDIA GPU with Compute Capability ≥ 7.0 (Volta+)
+- CUDA 11.8 or higher
+- Python 3.8+
+
+### Installation Methods
+
+#### Static Compilation (Recommended)
+```bash
+git clone https://github.com/YourUsername/lightllm_kernel.git
+cd lightllm_kernel
+make build
+# Alternative using pip
+pip install .
\ No newline at end of file
diff --git a/lightllm-kernel/csrc/cuda_compat.h b/lightllm-kernel/csrc/cuda_compat.h
new file mode 100644
index 000000000..82e55613d
--- /dev/null
+++ b/lightllm-kernel/csrc/cuda_compat.h
@@ -0,0 +1,49 @@
+#pragma once
+// Portability shims so the same kernels build for CUDA and, when USE_ROCM is defined, for HIP.
+#ifdef USE_ROCM
+  #include <hip/hip_runtime.h>
+#endif
+// WARP_SIZE: the literal 32 on CUDA; the warpSize built-in on ROCm.
+#ifndef USE_ROCM
+  #define WARP_SIZE 32
+#else
+  #define WARP_SIZE warpSize
+#endif
+// VLLM_LDG: load through __ldg on CUDA; plain pointer dereference on ROCm.
+#ifndef USE_ROCM
+  #define VLLM_LDG(arg) __ldg(arg)
+#else
+  #define VLLM_LDG(arg) *(arg)
+#endif
+// XOR warp shuffles (with and without an explicit width); the CUDA forms pass an all-ones mask.
+#ifndef USE_ROCM
+  #define VLLM_SHFL_XOR_SYNC(var, lane_mask) \
+    __shfl_xor_sync(uint32_t(-1), var, lane_mask)
+  #define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \
+    __shfl_xor_sync(uint32_t(-1), var, lane_mask, width)
+#else
+  #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask)
+  #define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \
+    __shfl_xor(var, lane_mask, width)
+#endif
+// Indexed warp shuffle: wraps __shfl_sync (all-ones mask) on CUDA and __shfl on ROCm.
+#ifndef USE_ROCM
+  #define VLLM_SHFL_SYNC(var, src_lane) __shfl_sync(uint32_t(-1), var, src_lane)
+#else
+  #define VLLM_SHFL_SYNC(var, src_lane) __shfl(var, src_lane)
+#endif
+// Shuffle-down: wraps __shfl_down_sync (all-ones mask) on CUDA and __shfl_down on ROCm.
+#ifndef USE_ROCM
+  #define VLLM_SHFL_DOWN_SYNC(var, lane_delta) \
+    __shfl_down_sync(uint32_t(-1), var, lane_delta)
+#else
+  #define VLLM_SHFL_DOWN_SYNC(var, lane_delta) __shfl_down(var, lane_delta)
+#endif
+// Sets a kernel's MaxDynamicSharedMemorySize attribute through the CUDA or HIP runtime call.
+#ifndef USE_ROCM
+  #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \
+    cudaFuncSetAttribute(FUNC, cudaFuncAttributeMaxDynamicSharedMemorySize, VAL)
+#else
+  #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \
+    hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL)
+#endif
diff --git a/lightllm-kernel/csrc/moe/grouped_topk.cu b/lightllm-kernel/csrc/moe/grouped_topk.cu
new file mode 100644
index 000000000..635ca5193
--- /dev/null
+++ b/lightllm-kernel/csrc/moe/grouped_topk.cu
@@ -0,0 +1,319 @@
+#include <cub/cub.cuh>
+#include <torch/extension.h>
+#include <torch/all.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include "../cuda_compat.h"
+
+#ifndef USE_ROCM
+    #include <cub/util_type.cuh>
+    #include <cub/cub.cuh>
+#else
+    #include <hipcub/util_type.hpp>
+    #include <hipcub/hipcub.hpp>
+#endif
+
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+namespace moe {
+
+template <int TPB>
+__launch_bounds__(TPB) 
+__global__ void moeGroupedTopK(
+    const float* input, // gating values; a row starts at blockIdx.x * num_cols
+    const bool* finished, // optional (may be nullptr) per-row flag, indexed by blockIdx.x
+    float* inputs_after_softmax, // workspace receiving the per-expert scores (softmax/sigmoid + bias)
+    const int num_cols, // row length used for the scoring pass
+    const float* correction_bias, // optional (may be nullptr) bias added to each score, indexed like input
+    float* group_scores, // workspace: per row, the maximum score of every expert group
+    float* output, // topk_weights
+    int* indices, // topk_indices
+    int* group_indices, // token_expert_indices; receives topk_group selected group ids per row
+    const int num_experts, 
+    const int num_expert_group, 
+    const int topk_group,
+    const int k,
+    const bool renormalize,
+    const bool softmax_or_sigmoid, // true -> softmax scoring, false -> sigmoid scoring
+    const int start_expert, 
+    const int end_expert)
+{
+    // One block per row: score -> per-group max -> pick topk_group groups -> mask -> pick k experts -> renormalize.
+    const int thread_row_offset = blockIdx.x * num_cols;
+
+    if(softmax_or_sigmoid)
+    {
+        // Softmax over the row; the row maximum is subtracted before exponentiating.
+        using BlockReduce_topk = cub::BlockReduce<float, TPB>;
+        __shared__ typename BlockReduce_topk::TempStorage tmpStorage;
+
+        __shared__ float normalizing_factor;
+        __shared__ float float_max;
+
+        cub::Sum sum;
+        float threadData(-FLT_MAX);
+
+        // Don't touch finished rows.
+        if ((finished != nullptr) && finished[blockIdx.x])
+        {
+            return;
+        }
+        // Pass 1: block-wide maximum of the row.
+        for (int i = threadIdx.x; i < num_cols; i += TPB)
+        {
+            const int idx = thread_row_offset + i;
+            threadData = max(static_cast<float>(input[idx]), threadData);
+        }
+
+        const float maxElem = BlockReduce_topk(tmpStorage).Reduce(threadData, cub::Max());
+        if (threadIdx.x == 0)
+        {
+            float_max = maxElem;
+        }
+        __syncthreads();
+
+        threadData = 0;
+        // Pass 2: block-wide sum of exp(x - max).
+        for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
+        {
+            const int idx = thread_row_offset + ii;
+            threadData += exp((static_cast<float>(input[idx]) - float_max));
+        }
+
+        const auto Z = BlockReduce_topk(tmpStorage).Reduce(threadData, sum);
+
+        if (threadIdx.x == 0)
+        {
+            normalizing_factor = 1.f / Z;
+        }
+        __syncthreads();
+        // Pass 3: store the normalised value plus the optional correction bias.
+        for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
+        {
+            const int idx = thread_row_offset + ii;
+            const float val = exp((static_cast<float>(input[idx]) - float_max)) * normalizing_factor;
+            inputs_after_softmax[idx] = val + (correction_bias ? correction_bias[idx] : 0.f);
+        }
+    } else {
+        // Sigmoid scoring: element-wise, so no block reduction is needed.
+        for (int i = threadIdx.x; i < num_cols; i += TPB)
+        {
+            const int idx = thread_row_offset + i;
+            float val = 1.f / (1.f + expf(-input[idx])); 
+            inputs_after_softmax[idx] = val + (correction_bias ? correction_bias[idx] : 0.f);
+        }
+    }
+    __syncthreads();
+    // Group stage: reduce each expert group to its maximum score.
+    using cub_kvp = cub::KeyValuePair<int, float>;
+    using BlockReduce = cub::BlockReduce<cub_kvp, TPB>;
+    __shared__ typename BlockReduce::TempStorage tmpStorage_kvp;
+
+    int block_row = blockIdx.x;  // row index handled by this block
+    int thread_read_offset = block_row * num_experts;
+
+    int group_size = num_experts / num_expert_group;
+    // Each thread handles whole groups (strided by TPB) and scans their experts sequentially.
+    for(int group_id = threadIdx.x; group_id < num_expert_group; group_id += TPB)
+    {
+        float local_max = -FLT_MAX;
+        const int start = group_id * group_size;
+        const int end   = (group_id + 1) * group_size;
+
+        // find max in this group
+        for(int e = start; e < end; e++)
+        {
+            float val = inputs_after_softmax[thread_read_offset + e];
+            local_max = fmaxf(local_max, val);
+        }
+
+        // store max in group_scores
+        group_scores[block_row * num_expert_group + group_id] = local_max;
+    }
+    __syncthreads();
+    // Group selection: topk_group rounds of block-wide arg-max over group_scores.
+    cub_kvp thread_kvp;
+    cub::ArgMax arg_max;
+
+    const bool row_is_active = finished ? !finished[block_row] : true;
+    thread_read_offset = blockIdx.x * num_expert_group;
+
+    for (int k_idx = 0; k_idx < topk_group; ++k_idx)
+    {
+        thread_kvp.key = 0;
+        thread_kvp.value = -1.f; // NOTE(review): sentinel assumes group scores exceed -1; confirm this holds once correction_bias is added
+
+        // Each thread scans a strided subset of the groups; `expert` is a group id in this loop.
+        cub_kvp inp_kvp;
+        for (int expert = threadIdx.x; expert < num_expert_group; expert += TPB)
+        {
+            const int idx = thread_read_offset + expert;
+            inp_kvp.key = expert;
+            inp_kvp.value = group_scores[idx];
+            // Groups chosen in earlier rounds are replaced by the running best so they cannot win again.
+            for (int prior_k = 0; prior_k < k_idx; ++prior_k)
+            {
+                const int prior_winning_expert = group_indices[topk_group * block_row + prior_k]; 
+
+                if (prior_winning_expert == expert)
+                {
+                    inp_kvp = thread_kvp;
+                }
+            }
+
+            thread_kvp = arg_max(inp_kvp, thread_kvp);
+        }
+
+        const cub_kvp result_kvp = BlockReduce(tmpStorage_kvp).Reduce(thread_kvp, arg_max);
+        if (threadIdx.x == 0)
+        {
+            // Ignore experts the node isn't responsible for with expert parallelism
+            const int expert = result_kvp.key;
+            const bool node_uses_expert = expert >= start_expert && expert < end_expert;
+            const bool should_process_row = row_is_active && node_uses_expert;
+
+            const int idx = topk_group * block_row + k_idx;
+            group_indices[idx] = should_process_row ? (expert - start_expert) : num_expert_group;
+            assert(group_indices[idx] >= 0);
+        }
+        __syncthreads();
+    }
+    // Masking: zero the score of every expert whose group was not selected.
+    int score_offset = block_row * num_experts; 
+    for (int e = threadIdx.x; e < num_experts; e += TPB)
+    {
+        int grp = e / group_size;
+        bool selected = false;
+        // selected = True if e in group_indices[block_row, :]
+        for (int i = 0; i < topk_group; i++) {
+            int sel_grp = group_indices[block_row * topk_group + i];
+            if (sel_grp == grp) {
+                selected = true;
+                break;
+            }
+        }
+        if (!selected) {
+            inputs_after_softmax[score_offset + e] = 0.0f;
+        }
+    }
+    __syncthreads();
+    // Expert selection: k rounds of block-wide arg-max; earlier winners are forced to -FLT_MAX.
+    for (int tk = 0; tk < k; tk++) {
+        thread_kvp.key = -1;
+        thread_kvp.value = -FLT_MAX;
+        for (int e = threadIdx.x; e < num_experts; e += TPB) {
+            bool already_selected = false;
+            for (int prev = 0; prev < tk; prev++) {
+                if (indices[block_row * k + prev] == e) {
+                    already_selected = true;
+                    break;
+                }
+            }
+            float val = already_selected ? -FLT_MAX : inputs_after_softmax[score_offset + e];
+            cub_kvp inp;
+            inp.key = e;
+            inp.value = val;
+            thread_kvp = arg_max(inp, thread_kvp);
+        }
+        cub_kvp result = BlockReduce(tmpStorage_kvp).Reduce(thread_kvp, arg_max);
+        if (threadIdx.x == 0) {
+            output[block_row * k + tk] = result.value;
+            indices[block_row * k + tk] = result.key;
+        }
+        __syncthreads();
+    }
+
+    // Optional renormalisation: thread 0 divides the k selected weights by their sum.
+    if (threadIdx.x == 0 && renormalize) {
+        float sum = 0.0f;
+        int out_offset = block_row * k;
+        for (int j = 0; j < k; j++) {
+            sum += output[out_offset + j];
+        }
+        // avoid division by zero
+        if (sum > 0.0f) {
+            for (int j = 0; j < k; j++) {
+                output[out_offset + j] /= sum;
+            }
+        }
+    }
+    __syncthreads();
+
+}
+
+void GroupedTopKKernelLauncher(  // Launches moeGroupedTopK with one TPB-thread block per token row.
+    const float* gating_output,    // device pointer to the gating values, num_tokens rows of num_experts
+    const float* correction_bias,  // optional device pointer (nullptr = no bias)
+    float* topk_weights,           // out: topk selected weights per token
+    int* topk_indicies,            // out: topk selected expert ids per token
+    int* group_indices,            // out: topk_group selected group ids per token
+    float* softmax_workspace,      // scratch: num_tokens * num_experts floats
+    float* group_scores,           // scratch: num_tokens * num_expert_group floats
+    const int num_tokens,
+    const int num_experts,
+    const int num_expert_group,
+    const int topk_group,
+    const int topk,
+    const bool renormalize,
+    const bool softmax_or_sigmoid,
+    cudaStream_t stream) {
+    if (num_tokens <= 0) return;  // nothing to route; a zero-sized grid is an invalid launch configuration
+    static constexpr int TPB = 256;
+    moeGroupedTopK<TPB><<<num_tokens, TPB, 0, stream>>>(
+        gating_output, nullptr, softmax_workspace, num_experts, correction_bias,  // finished = nullptr, num_cols = num_experts
+        group_scores, topk_weights, topk_indicies, group_indices,
+        num_experts, num_expert_group, topk_group, topk, renormalize, softmax_or_sigmoid, 0, num_experts);  // expert range [0, num_experts)
+}
+
+} // namespace moe
+
+void grouped_topk_cuda(
+    torch::Tensor& topk_weights,                // [num_tokens, topk]
+    torch::Tensor& correction_bias,             // [num_tokens, num_experts]
+    torch::Tensor& topk_indices,                // [num_tokens, topk]
+    torch::Tensor& group_indices,               // [num_tokens, topk_group]
+    torch::Tensor& gating_output,               // [num_tokens, num_experts]
+    const int num_expert_group,
+    const int topk_group,
+    const int topk,
+    const bool renormalize,
+    std::string scoring_func,
+    torch::Tensor group_scores = torch::Tensor() // [num_tokens, num_expert_group]
+    )
+{
+    // Host entry point: prepares scratch buffers and launches the grouped top-k kernel on the
+    // current stream. Pin the device of the inputs first so every allocation below lands on it.
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(gating_output));
+    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+    const int num_experts = gating_output.size(-1);
+    TORCH_CHECK(num_experts > 0, "gating_output must have a non-empty last dimension");
+    const int num_tokens = gating_output.numel() / num_experts;
+    const int64_t workspace_size = static_cast<int64_t>(num_tokens) * num_experts;
+    const bool softmax_or_sigmoid = (scoring_func == "softmax");
+
+    // Scratch buffers are torch tensors, so they are released when they go out of scope. The
+    // group-score buffer used to be a raw cudaMalloc that was never freed (a leak on every call).
+    torch::Tensor softmax_workspace = torch::empty({workspace_size}, gating_output.options());
+    if (!group_scores.defined() || group_scores.numel() == 0) {
+        group_scores = torch::zeros({num_tokens, num_expert_group}, gating_output.options());
+    }
+
+    moe::GroupedTopKKernelLauncher(
+        gating_output.data_ptr<float>(),
+        correction_bias.defined() ? correction_bias.data_ptr<float>() : nullptr,
+        topk_weights.data_ptr<float>(),
+        topk_indices.data_ptr<int>(),
+        group_indices.data_ptr<int>(),
+        softmax_workspace.data_ptr<float>(),
+        group_scores.data_ptr<float>(),
+        num_tokens,
+        num_experts,
+        num_expert_group,
+        topk_group,
+        topk,
+        renormalize,
+        softmax_or_sigmoid,
+        stream);
+}
\ No newline at end of file
diff --git a/lightllm-kernel/csrc/moe/grouped_topk_interface.cpp b/lightllm-kernel/csrc/moe/grouped_topk_interface.cpp
new file mode 100644
index 000000000..f35c92caa
--- /dev/null
+++ b/lightllm-kernel/csrc/moe/grouped_topk_interface.cpp
@@ -0,0 +1,48 @@
+#include <torch/extension.h>
+#include "../../include/ops_common.h"
+
+
+void grouped_topk_cuda(
+    torch::Tensor& topk_weights,
+    torch::Tensor& correction_bias,
+    torch::Tensor& topk_indices,
+    torch::Tensor& group_indices,
+    torch::Tensor& gating_output,
+    int  num_expert_group,
+    int  topk_group,
+    int  topk,
+    bool renormalize,
+    std::string scoring_func,
+    torch::Tensor group_scores = torch::Tensor());
+
+torch::Tensor grouped_topk(
+    torch::Tensor topk_weights,
+    torch::Tensor correction_bias,
+    torch::Tensor topk_indices,
+    torch::Tensor group_indices,
+    torch::Tensor gating_output,
+    int  num_expert_group,
+    int  topk_group,
+    int  topk,
+    bool renormalize,
+    std::string scoring_func,
+    torch::Tensor group_scores /* = {} */) {
+    // Host wrapper: checks that the tensors live on a CUDA device, runs the CUDA routine, returns topk_weights.
+    TORCH_CHECK(topk_weights.is_cuda(),   "topk_weights must be CUDA tensor");
+    TORCH_CHECK(gating_output.is_cuda(),  "gating_output must be CUDA tensor");
+    // Dispatch to grouped_topk_cuda; calling grouped_topk here would recurse into this very function.
+    grouped_topk_cuda(topk_weights,
+                      correction_bias,
+                      topk_indices,
+                      group_indices,
+                      gating_output,
+                      num_expert_group,
+                      topk_group,
+                      topk,
+                      renormalize,
+                      scoring_func,
+                      group_scores);
+
+    // Results are written in place, so topk_weights is returned directly
+    return topk_weights;
+}
diff --git a/lightllm-kernel/csrc/ops_bindings.cpp b/lightllm-kernel/csrc/ops_bindings.cpp
new file mode 100644
index 000000000..e21a9d376
--- /dev/null
+++ b/lightllm-kernel/csrc/ops_bindings.cpp
@@ -0,0 +1,56 @@
+#include <torch/extension.h>
+#include "../include/ops_common.h"
+#include <pybind11/pybind11.h>
+
+void grouped_topk_cuda(
+        torch::Tensor& topk_weights,
+        torch::Tensor& correction_bias,
+        torch::Tensor& topk_indices,
+        torch::Tensor& group_indices,
+        torch::Tensor& gating_output,
+        int  num_expert_group,
+        int  topk_group,
+        int  topk,
+        bool renormalize,
+        std::string scoring_func,
+        torch::Tensor group_scores);
+
+
+torch::Tensor grouped_topk(
+    torch::Tensor topk_weights,
+    torch::Tensor correction_bias,
+    torch::Tensor topk_indices,
+    torch::Tensor group_indices,
+    torch::Tensor gating_output,
+    int64_t num_expert_group,
+    int64_t topk_group,
+    int64_t topk,
+    bool renormalize,
+    std::string scoring_func,
+    torch::Tensor group_scores) {
+    // Python-facing wrapper: grouped_topk_cuda takes plain int, so narrow the 64-bit arguments first.
+    const int n_group = static_cast<int>(num_expert_group);
+    const int n_topk_group = static_cast<int>(topk_group);
+    const int n_topk = static_cast<int>(topk);
+
+    grouped_topk_cuda(topk_weights, correction_bias, topk_indices, group_indices, gating_output,
+                      n_group, n_topk_group, n_topk, renormalize, scoring_func, group_scores);
+    // The same topk_weights tensor that was passed in is handed back to the caller.
+    return topk_weights;
+}
+
+PYBIND11_MODULE(_C, m) {
+    using namespace pybind11::literals;  // "name"_a is shorthand for py::arg("name")
+    m.def("grouped_topk", &grouped_topk, "Grouped Top-K routing (CUDA)",
+          "topk_weights"_a,
+          "correction_bias"_a,
+          "topk_indices"_a,
+          "group_indices"_a,
+          "gating_output"_a,
+          "num_expert_group"_a,
+          "topk_group"_a,
+          "topk"_a,
+          "renormalize"_a,
+          "scoring_func"_a,
+          "group_scores"_a = torch::Tensor());
+}
\ No newline at end of file
diff --git a/lightllm-kernel/include/ops_common.h b/lightllm-kernel/include/ops_common.h
new file mode 100644
index 000000000..3c80fef44
--- /dev/null
+++ b/lightllm-kernel/include/ops_common.h
@@ -0,0 +1,4 @@
+#pragma once
+#include <torch/extension.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <vector>
\ No newline at end of file
diff --git a/lightllm-kernel/lightllm_kernel/__init__.py b/lightllm-kernel/lightllm_kernel/__init__.py
new file mode 100644
index 000000000..23c3bd2b0
--- /dev/null
+++ b/lightllm-kernel/lightllm_kernel/__init__.py
@@ -0,0 +1,3 @@
+from . import ops  # noqa: F401
+
+__all__ = ["ops"]
diff --git a/lightllm-kernel/lightllm_kernel/ops/__init__.py b/lightllm-kernel/lightllm_kernel/ops/__init__.py
new file mode 100644
index 000000000..c3f54642b
--- /dev/null
+++ b/lightllm-kernel/lightllm_kernel/ops/__init__.py
@@ -0,0 +1,44 @@
+import importlib
+import os
+from pathlib import Path
+from torch.utils.cpp_extension import load
+
+PKG = "lightllm_kernel"
+try:
+    _C = importlib.import_module(f"{PKG}._C")
+except ImportError:
+    repo_root = Path(__file__).resolve().parents[2]
+    csrc_dir = repo_root / "csrc"
+    if not csrc_dir.exists():
+        raise ImportError(
+            "Cannot import compiled extension 'lightllm_kernel.ops' and no source "
+            "directory (csrc/) found; please ensure you have run "
+            "'cmake --install' or placed lightllm_kernel.ops.so on PYTHONPATH."
+        )
+
+    sources = (
+        [str(p) for p in (csrc_dir / "moe").glob("*.cpp")]
+        + [str(p) for p in (csrc_dir / "moe").glob("*.cu")]
+        + [str(csrc_dir / "ops_bindings.cpp")]
+    )
+
+    _C = load(
+        name="lightllm_kernel._C",
+        sources=sources,
+        verbose=True,
+        extra_cuda_cflags=[
+            # A100
+            "-gencode=arch=compute_80,code=sm_80",
+            "-gencode=arch=compute_80,code=compute_80",
+            # Ada / L40s / 4090
+            "-gencode=arch=compute_89,code=sm_89",
+            "-gencode=arch=compute_89,code=compute_89",
+            # Hopper / H100 / H200
+            "-gencode=arch=compute_90,code=sm_90",
+            "-gencode=arch=compute_90,code=compute_90",
+        ],
+    )
+
+# 向外暴露 Python 端接口
+grouped_topk = _C.grouped_topk
+__all__ = ["grouped_topk"]
diff --git a/lightllm-kernel/lightllm_kernel/ops/attention.py b/lightllm-kernel/lightllm_kernel/ops/attention.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/lightllm-kernel/lightllm_kernel/ops/moe.py b/lightllm-kernel/lightllm_kernel/ops/moe.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/lightllm-kernel/lightllm_kernel/ops/quant.py b/lightllm-kernel/lightllm_kernel/ops/quant.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/lightllm-kernel/setup.py b/lightllm-kernel/setup.py
new file mode 100644
index 000000000..34f992b73
--- /dev/null
+++ b/lightllm-kernel/setup.py
@@ -0,0 +1,40 @@
+from pathlib import Path
+from setuptools import setup
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+this_dir = Path(__file__).parent
+
+sources = [
+    str(this_dir / "csrc" / "moe" / "grouped_topk_interface.cpp"),
+    str(this_dir / "csrc" / "moe" / "grouped_topk.cu"),
+    str(this_dir / "csrc" / "ops_bindings.cpp"),
+]
+print("---- sources for CUDAExtension ----", *sources, "-----------------------------------", sep="\n")
+
+# Build for the same GPU generations as the JIT fallback in lightllm_kernel/ops/__init__.py:
+# 80 (A100), 89 (Ada / L40s / 4090) and 90 (Hopper / H100 / H200). Every arch gets both
+# code=sm_XX and code=compute_XX, so a wheel built here is not limited to Hopper.
+gencode_flags = [
+    f"-gencode=arch=compute_{arch},code={kind}_{arch}" for arch in (80, 89, 90) for kind in ("sm", "compute")
+]
+
+ext_modules = [
+    CUDAExtension(
+        name="lightllm_kernel._C",
+        sources=sources,
+        extra_compile_args={
+            "cxx": ["-O3"],
+            "nvcc": ["-O3", "--use_fast_math", *gencode_flags],
+        },
+        include_dirs=[str(this_dir / "include")],
+    )
+]
+
+# No package_dir: both packages sit in directories that match their dotted names.
+setup(
+    name="lightllm_kernel",
+    packages=["lightllm_kernel", "lightllm_kernel.ops"],
+    version="0.1",
+    ext_modules=ext_modules,
+    cmdclass={"build_ext": BuildExtension},
+)

From 8b5f18b0f0b26c734d85e3e6cca9bcd1ea387a9f Mon Sep 17 00:00:00 2001
From: Xtra <571889291@qq.com>
Date: Fri, 9 May 2025 12:49:06 +0800
Subject: [PATCH 02/14] feat(vit_cuda_kernels):add norm quant and some fused
 ops (#886)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

# vit fp8w8a8量化推理相关算子优化
## 新增算子
1. rmsnorm_bf16，性能较 PyTorch 实现有较大提升
2. pre_tp_norm，融合了tp_norm的通信前操作
3. post_tp_norm，融合了tp_norm的通信后操作
4. per_token_quant，逐token FP8量化，性能较vllm的quant极大提升，较sgl的quant性能更好
5. gelu_per_token_quant，融合了GELU激活 + 逐token FP8量化
6. add_norm_quant，融合了attention与mlp模块间的，add norm quant操作
7. cutlass_scaled_mm_bias_ls，融合了量化矩阵乘、反量化和可选的bias和ls weight
---
 lightllm-kernel/csrc/fusion/add_norm_quant.cu |   551 +
 .../csrc/fusion/gelu_per_token_quant.cu       |   367 +
 lightllm-kernel/csrc/fusion/post_tp_norm.cu   |   364 +
 lightllm-kernel/csrc/fusion/pre_tp_norm.cu    |   257 +
 lightllm-kernel/csrc/gemm/Epilogues.md        |   147 +
 lightllm-kernel/csrc/gemm/scaled_mm_c3x.cu    |    73 +
 lightllm-kernel/csrc/gemm/scaled_mm_c3x.cuh   |   161 +
 .../gemm/scaled_mm_c3x_sm90_fp8_dispatch.cuh  |    97 +
 lightllm-kernel/csrc/gemm/scaled_mm_entry.cu  |    83 +
 lightllm-kernel/csrc/moe/grouped_topk.cu      |    35 +-
 .../csrc/moe/grouped_topk_interface.cpp       |    48 -
 lightllm-kernel/csrc/norm/rmsnorm_bf16.cu     |   350 +
 lightllm-kernel/csrc/ops_bindings.cpp         |    52 +-
 .../csrc/quant/per_token_quantize_bf16.cu     |   342 +
 .../cutlass/include/cute/algorithm/axpby.hpp  |    95 +
 .../cutlass/include/cute/algorithm/clear.hpp  |    64 +
 .../cute/algorithm/cooperative_copy.hpp       |   333 +
 .../cute/algorithm/cooperative_gemm.hpp       |   512 +
 .../cutlass/include/cute/algorithm/copy.hpp   |   382 +
 .../cutlass/include/cute/algorithm/fill.hpp   |    87 +
 .../include/cute/algorithm/functional.hpp     |   290 +
 .../cutlass/include/cute/algorithm/gemm.hpp   |   500 +
 .../cutlass/include/cute/algorithm/prefer.hpp |    46 +
 .../include/cute/algorithm/prefetch.hpp       |   145 +
 .../cute/algorithm/tensor_algorithms.hpp      |   166 +
 .../cute/algorithm/tuple_algorithms.hpp       |  1073 +
 .../include/cute/arch/cluster_sm90.hpp        |   245 +
 .../cutlass/include/cute/arch/config.hpp      |    50 +
 .../cutlass/include/cute/arch/copy.hpp        |   107 +
 .../cutlass/include/cute/arch/copy_sm50.hpp   |    98 +
 .../cutlass/include/cute/arch/copy_sm75.hpp   |   236 +
 .../cutlass/include/cute/arch/copy_sm80.hpp   |   198 +
 .../cutlass/include/cute/arch/copy_sm90.hpp   |   219 +
 .../include/cute/arch/copy_sm90_desc.hpp      |   440 +
 .../include/cute/arch/copy_sm90_tma.hpp       |  1395 +
 .../cutlass/include/cute/arch/mma.hpp         |    64 +
 .../cutlass/include/cute/arch/mma_sm61.hpp    |    87 +
 .../cutlass/include/cute/arch/mma_sm70.hpp    |   329 +
 .../cutlass/include/cute/arch/mma_sm75.hpp    |   120 +
 .../cutlass/include/cute/arch/mma_sm80.hpp    |  2243 +
 .../cutlass/include/cute/arch/mma_sm90.hpp    |  9331 +++
 .../include/cute/arch/mma_sm90_desc.hpp       |   156 +
 .../include/cute/arch/mma_sm90_gmma.hpp       | 20974 ++++++
 .../include/cute/arch/mma_sm90_gmma_ext.hpp   | 56445 +++++++++++++++
 .../cute/arch/mma_sm90_gmma_sparse.hpp        | 22743 ++++++
 .../cute/arch/mma_sm90_gmma_sparse_ext.hpp    | 60445 ++++++++++++++++
 .../cutlass/include/cute/arch/util.hpp        |   320 +
 .../cutlass/include/cute/atom/copy_atom.hpp   |   764 +
 .../cutlass/include/cute/atom/copy_traits.hpp |   159 +
 .../include/cute/atom/copy_traits_sm50.hpp    |    75 +
 .../include/cute/atom/copy_traits_sm75.hpp    |   143 +
 .../include/cute/atom/copy_traits_sm80.hpp    |   194 +
 .../include/cute/atom/copy_traits_sm90.hpp    |   132 +
 .../cute/atom/copy_traits_sm90_im2col.hpp     |   940 +
 .../cute/atom/copy_traits_sm90_tma.hpp        |  1525 +
 .../atom/copy_traits_sm90_tma_swizzle.hpp     |    93 +
 .../cutlass/include/cute/atom/mma_atom.hpp    |  1117 +
 .../cutlass/include/cute/atom/mma_traits.hpp  |   189 +
 .../include/cute/atom/mma_traits_sm61.hpp     |    73 +
 .../include/cute/atom/mma_traits_sm70.hpp     |   198 +
 .../include/cute/atom/mma_traits_sm75.hpp     |    81 +
 .../include/cute/atom/mma_traits_sm80.hpp     |   489 +
 .../include/cute/atom/mma_traits_sm90.hpp     |   144 +
 .../cute/atom/mma_traits_sm90_gmma.hpp        |  8999 +++
 .../cute/atom/mma_traits_sm90_gmma_ext.hpp    | 20116 +++++
 .../cute/atom/mma_traits_sm90_gmma_sparse.hpp |  7738 ++
 .../atom/mma_traits_sm90_gmma_sparse_ext.hpp  | 17335 +++++
 .../cutlass/include/cute/config.hpp           |   149 +
 .../include/cute/container/alignment.hpp      |    70 +
 .../cutlass/include/cute/container/array.hpp  |   492 +
 .../include/cute/container/array_aligned.hpp  |    42 +
 .../include/cute/container/array_subbyte.hpp  |   643 +
 .../include/cute/container/bit_field.hpp      |   133 +
 .../include/cute/container/cuda_types.hpp     |   183 +
 .../include/cute/container/packed_tuple.hpp   |   254 +
 .../cutlass/include/cute/container/tuple.hpp  |   744 +
 .../include/cute/container/type_list.hpp      |   124 +
 .../cutlass/include/cute/int_tuple.hpp        |   864 +
 .../cutlass/include/cute/layout.hpp           |  2058 +
 .../cutlass/include/cute/layout_composed.hpp  |   652 +
 .../include/cute/numeric/arithmetic_tuple.hpp |   556 +
 .../cutlass/include/cute/numeric/complex.hpp  |    76 +
 .../cutlass/include/cute/numeric/int.hpp      |   106 +
 .../include/cute/numeric/integer_sequence.hpp |   151 +
 .../cute/numeric/integral_constant.hpp        |   517 +
 .../include/cute/numeric/integral_ratio.hpp   |   264 +
 .../cutlass/include/cute/numeric/math.hpp     |   356 +
 .../include/cute/numeric/numeric_types.hpp    |   135 +
 .../cutlass/include/cute/numeric/real.hpp     |    74 +
 .../cutlass/include/cute/pointer.hpp          |   322 +
 .../cutlass/include/cute/pointer_base.hpp     |   246 +
 .../cutlass/include/cute/pointer_flagged.hpp  |   199 +
 .../cutlass/include/cute/pointer_sparse.hpp   |   172 +
 .../cutlass/include/cute/pointer_swizzle.hpp  |   168 +
 .../cutlass/include/cute/stride.hpp           |   598 +
 .../cutlass/include/cute/swizzle.hpp          |   498 +
 .../cutlass/include/cute/swizzle_layout.hpp   |   584 +
 .../cutlass/include/cute/tensor.hpp           |    58 +
 .../cutlass/include/cute/tensor_impl.hpp      |  1193 +
 .../cutlass/include/cute/tensor_predicate.hpp |    78 +
 .../cutlass/include/cute/tensor_zip.hpp       |   243 +
 .../cutlass/include/cute/underscore.hpp       |   194 +
 .../cutlass/include/cute/util/debug.hpp       |   164 +
 .../cutlass/include/cute/util/print.hpp       |   261 +
 .../cutlass/include/cute/util/type_traits.hpp |   292 +
 .../cutlass/include/cutlass/aligned_buffer.h  |   129 +
 .../cutlass/include/cutlass/arch/arch.h       |   109 +
 .../cutlass/include/cutlass/arch/barrier.h    |   630 +
 .../include/cutlass/arch/cache_operation.h    |    66 +
 .../cutlass/include/cutlass/arch/config.h     |    81 +
 .../cutlass/arch/grid_dependency_control.h    |    84 +
 .../cutlass/include/cutlass/arch/memory.h     |   602 +
 .../include/cutlass/arch/memory_sm75.h        |   269 +
 .../include/cutlass/arch/memory_sm80.h        |   472 +
 .../cutlass/include/cutlass/arch/mma.h        |   269 +
 .../cutlass/include/cutlass/arch/mma_sm50.h   |   432 +
 .../cutlass/include/cutlass/arch/mma_sm60.h   |   252 +
 .../cutlass/include/cutlass/arch/mma_sm61.h   |   142 +
 .../cutlass/include/cutlass/arch/mma_sm70.h   |   665 +
 .../cutlass/include/cutlass/arch/mma_sm75.h   |   793 +
 .../cutlass/include/cutlass/arch/mma_sm80.h   |  1505 +
 .../cutlass/include/cutlass/arch/mma_sm89.h   |   367 +
 .../cutlass/include/cutlass/arch/mma_sm90.h   |   245 +
 .../include/cutlass/arch/mma_sparse_sm80.h    |  1238 +
 .../include/cutlass/arch/mma_sparse_sm89.h    |   409 +
 .../include/cutlass/arch/reg_reconfig.h       |    67 +
 .../cutlass/include/cutlass/arch/simd.h       |   125 +
 .../cutlass/include/cutlass/arch/simd_sm60.h  |   104 +
 .../cutlass/include/cutlass/arch/simd_sm61.h  |   147 +
 .../cutlass/include/cutlass/arch/synclog.hpp  |  1324 +
 .../cutlass/include/cutlass/arch/wmma.h       |   223 +
 .../cutlass/include/cutlass/arch/wmma_sm70.h  |   136 +
 .../cutlass/include/cutlass/arch/wmma_sm72.h  |   210 +
 .../cutlass/include/cutlass/arch/wmma_sm75.h  |   207 +
 .../cutlass/include/cutlass/array.h           |  2614 +
 .../include/cutlass/array_planar_complex.h    |    89 +
 .../cutlass/include/cutlass/array_subbyte.h   |   559 +
 .../cutlass/include/cutlass/barrier.h         |   377 +
 .../cutlass/include/cutlass/bfloat16.h        |   679 +
 .../cutlass/include/cutlass/blas3.h           |   143 +
 .../cutlass/include/cutlass/blas3_types.h     |    78 +
 .../cutlass/include/cutlass/block_striped.h   |   267 +
 .../include/cutlass/cluster_launch.hpp        |   275 +
 .../cutlass/include/cutlass/complex.h         |   823 +
 .../cutlass/include/cutlass/constants.h       |  1239 +
 .../conv/collective/builders/sm90_common.inl  |    96 +
 .../collective/builders/sm90_gmma_builder.inl |   257 +
 .../conv/collective/collective_builder.hpp    |    93 +
 .../conv/collective/collective_conv.hpp       |    62 +
 .../cutlass/conv/collective/detail.hpp        |   254 +
 ..._implicit_gemm_gmma_ss_warpspecialized.hpp |   663 +
 .../cutlass/conv/conv2d_problem_size.h        |   654 +
 .../cutlass/conv/conv3d_problem_size.h        |   513 +
 .../cutlass/conv/convnd_problem_shape.hpp     |   561 +
 .../include/cutlass/conv/convolution.h        |   194 +
 .../cutlass/include/cutlass/conv/detail.hpp   |   137 +
 .../conv/device/conv_universal_adapter.hpp    |   421 +
 .../cutlass/conv/device/direct_convolution.h  |   270 +
 .../conv/device/implicit_gemm_convolution.h   |   361 +
 .../device/implicit_gemm_convolution_fusion.h |   269 +
 .../include/cutlass/conv/dispatch_policy.hpp  |    90 +
 .../cutlass/conv/kernel/conv_universal.hpp    |    65 +
 .../cutlass/conv/kernel/default_conv2d.h      |   322 +
 .../conv/kernel/default_conv2d_dgrad.h        |  1927 +
 .../conv/kernel/default_conv2d_fprop.h        |  2007 +
 .../conv/kernel/default_conv2d_fprop_fusion.h |   357 +
 .../kernel/default_conv2d_fprop_with_absmax.h |   127 +
 .../default_conv2d_fprop_with_broadcast.h     |   221 +
 .../default_conv2d_fprop_with_reduction.h     |   130 +
 .../conv/kernel/default_conv2d_group_fprop.h  |   622 +
 .../conv/kernel/default_conv2d_wgrad.h        |  1011 +
 .../conv/kernel/default_conv2d_wgrad_fusion.h |   325 +
 .../conv/kernel/default_conv3d_dgrad.h        |   736 +
 .../conv/kernel/default_conv3d_fprop.h        |   981 +
 .../conv/kernel/default_conv3d_fprop_fusion.h |   360 +
 .../default_conv3d_fprop_with_broadcast.h     |   222 +
 .../conv/kernel/default_conv3d_wgrad.h        |   936 +
 .../cutlass/conv/kernel/default_deconv2d.h    |   999 +
 .../kernel/default_deconv2d_with_broadcast.h  |   305 +
 .../cutlass/conv/kernel/default_deconv3d.h    |   541 +
 .../kernel/default_deconv3d_with_broadcast.h  |   309 +
 .../conv/kernel/default_depthwise_fprop.h     |   588 +
 .../cutlass/conv/kernel/direct_convolution.h  |   505 +
 .../conv/kernel/implicit_gemm_convolution.h   |   455 +
 .../kernel/implicit_gemm_convolution_fusion.h |   461 +
 .../implicit_gemm_convolution_strided_dgrad.h |   492 +
 .../implicit_gemm_convolution_with_absmax.h   |   494 +
 ...cit_gemm_convolution_with_fused_epilogue.h |   499 +
 ...sm90_implicit_gemm_tma_warpspecialized.hpp |    76 +
 .../cutlass/conv/thread/depthwise_mma.h       |   325 +
 ...rad_filter_tile_access_iterator_analytic.h |   485 +
 ...ad_filter_tile_access_iterator_optimized.h |   619 +
 ...t_gradient_tile_access_iterator_analytic.h |   606 +
 ..._gradient_tile_access_iterator_optimized.h |   821 +
 ...activation_tile_access_iterator_analytic.h |   332 +
 ...vation_tile_access_iterator_few_channels.h |   360 +
 ...tion_tile_access_iterator_fixed_channels.h |   353 +
 ...ctivation_tile_access_iterator_optimized.h |   422 +
 ...rop_filter_tile_access_iterator_analytic.h |   330 +
 ...filter_tile_access_iterator_few_channels.h |   289 +
 ...lter_tile_access_iterator_fixed_channels.h |   275 +
 ...op_filter_tile_access_iterator_optimized.h |   322 +
 .../cutlass/conv/threadblock/conv2d_params.h  |   893 +
 .../conv/threadblock/conv2d_tile_iterator.h   |   337 +
 ...activation_tile_access_iterator_analytic.h |   285 +
 ...ctivation_tile_access_iterator_optimized.h |   321 +
 ...t_gradient_tile_access_iterator_analytic.h |   260 +
 ..._gradient_tile_access_iterator_optimized.h |   310 +
 ...rad_filter_tile_access_iterator_analytic.h |   268 +
 ...ad_filter_tile_access_iterator_optimized.h |   289 +
 ...t_gradient_tile_access_iterator_analytic.h |   343 +
 ..._gradient_tile_access_iterator_optimized.h |   489 +
 ...activation_tile_access_iterator_analytic.h |   291 +
 ...ctivation_tile_access_iterator_optimized.h |   478 +
 ...rop_filter_tile_access_iterator_analytic.h |   259 +
 ...op_filter_tile_access_iterator_optimized.h |   279 +
 .../cutlass/conv/threadblock/conv3d_params.h  |   508 +
 ...activation_tile_access_iterator_analytic.h |   289 +
 ...ctivation_tile_access_iterator_optimized.h |   319 +
 ...t_gradient_tile_access_iterator_analytic.h |   267 +
 ..._gradient_tile_access_iterator_optimized.h |   310 +
 .../depthwise_direct_conv_params.h            |   230 +
 ...erator_direct_conv_fixed_stride_dilation.h |   314 +
 ...le_access_iterator_direct_conv_optimized.h |   291 +
 .../depthwise_fprop_direct_conv_multistage.h  |   551 +
 ...le_access_iterator_direct_conv_optimized.h |   261 +
 .../threadblock/depthwise_fprop_pipelined.h   |   336 +
 .../conv/threadblock/depthwise_mma_base.h     |   229 +
 ...depthwise_mma_core_with_lane_access_size.h |   952 +
 .../implicit_gemm_fprop_fusion_multistage.h   |   802 +
 .../threadblock/implicit_gemm_multistage.h    |   539 +
 .../threadblock/implicit_gemm_pipelined.h     |   320 +
 .../implicit_gemm_wgrad_fusion_multistage.h   |   729 +
 ...icated_scale_bias_vector_access_iterator.h |   470 +
 .../predicated_scale_bias_vector_iterator.h   |   371 +
 .../conv/threadblock/threadblock_swizzle.h    |   193 +
 .../cutlass/conv/warp/mma_depthwise_simt.h    |   380 +
 .../warp/mma_depthwise_simt_tile_iterator.h   |   862 +
 .../conv/warp/scale_bias_relu_transform.h     |   221 +
 .../cutlass/include/cutlass/coord.h           |   480 +
 .../cutlass/include/cutlass/core_io.h         |   286 +
 .../include/cutlass/cuda_host_adapter.hpp     |   407 +
 .../cutlass/include/cutlass/cutlass.h         |   160 +
 .../include/cutlass/detail/collective.hpp     |    63 +
 .../cutlass/detail/dependent_false.hpp        |    86 +
 .../include/cutlass/detail/helper_macros.hpp  |   205 +
 .../cutlass/include/cutlass/detail/layout.hpp |   406 +
 .../cutlass/include/cutlass/detail/mma.hpp    |    71 +
 .../cutlass/include/cutlass/device_kernel.h   |   125 +
 .../collective/builders/sm90_builder.inl      |   812 +
 .../collective/builders/sm90_common.inl       |    80 +
 .../collective/collective_builder.hpp         |   120 +
 .../collective/collective_epilogue.hpp        |    71 +
 .../epilogue/collective/default_epilogue.hpp  |   242 +
 .../collective/default_epilogue_array.hpp     |   273 +
 .../cutlass/epilogue/collective/detail.hpp    |   491 +
 .../collective/epilogue_tensor_broadcast.hpp  |   271 +
 .../collective/sm70_epilogue_vectorized.hpp   |   549 +
 .../sm70_epilogue_vectorized_array.hpp        |   412 +
 ...m90_epilogue_array_tma_warpspecialized.hpp |  1191 +
 .../sm90_epilogue_tma_warpspecialized.hpp     |   904 +
 ...e_tma_warpspecialized_bias_elementwise.hpp |   164 +
 .../cutlass/epilogue/dispatch_policy.hpp      |   195 +
 .../cutlass/epilogue/fusion/callbacks.hpp     |    89 +
 .../cutlass/epilogue/fusion/operations.hpp    |   351 +
 .../sm90_callbacks_tma_warpspecialized.hpp    |  1787 +
 ...90_visitor_compute_tma_warpspecialized.hpp |   839 +
 .../sm90_visitor_load_tma_warpspecialized.hpp |  1415 +
 ...sm90_visitor_store_tma_warpspecialized.hpp |  1736 +
 .../sm90_visitor_tma_warpspecialized.hpp      |  1139 +
 .../fusion/sm90_visitor_topk_softmax.hpp      |   759 +
 .../cutlass/epilogue/thread/activation.h      |   758 +
 .../cutlass/epilogue/thread/conversion_op.h   |   132 +
 .../cutlass/epilogue/thread/detail.hpp        |    52 +
 .../epilogue/thread/linear_combination.h      |   523 +
 .../linear_combination_bias_elementwise.h     |   524 +
 .../thread/linear_combination_bias_relu.h     |   610 +
 .../thread/linear_combination_clamp.h         |   685 +
 .../thread/linear_combination_dgelu.h         |   250 +
 .../thread/linear_combination_drelu.h         |   452 +
 .../epilogue/thread/linear_combination_gelu.h |    70 +
 .../thread/linear_combination_generic.h       |   265 +
 .../linear_combination_generic_with_scaling.h |   325 +
 .../thread/linear_combination_hardswish.h     |    69 +
 .../thread/linear_combination_leaky_relu.h    |   231 +
 .../thread/linear_combination_params.h        |    75 +
 .../linear_combination_planar_complex.h       |   236 +
 .../epilogue/thread/linear_combination_relu.h |   572 +
 .../thread/linear_combination_relu0.h         |   543 +
 .../linear_combination_residual_block.h       |   301 +
 .../thread/linear_combination_sigmoid.h       |    70 +
 .../epilogue/thread/linear_combination_silu.h |    69 +
 .../linear_combination_tensor_broadcast.hpp   |   253 +
 .../linear_combination_with_elementwise.h     |   234 +
 .../cutlass/epilogue/thread/reduction_op.h    |    97 +
 .../cutlass/epilogue/thread/scale_type.h      |    66 +
 .../default_epilogue_complex_tensor_op.h      |   255 +
 ...default_epilogue_complex_tensor_op_blas3.h |   264 +
 .../default_epilogue_direct_store.h           |    74 +
 .../default_epilogue_planar_complex.h         |   241 +
 .../threadblock/default_epilogue_simt.h       |   443 +
 .../threadblock/default_epilogue_tensor_op.h  |   904 +
 .../default_epilogue_tensor_op_blas3.h        |   175 +
 .../default_epilogue_volta_tensor_op.h        |   337 +
 .../default_epilogue_with_absmax.h            |   126 +
 .../default_epilogue_with_broadcast.h         |   376 +
 .../default_epilogue_with_reduction.h         |   177 +
 .../default_epilogue_wmma_tensor_op.h         |   165 +
 .../threadblock/default_thread_map_simt.h     |   127 +
 .../default_thread_map_tensor_op.h            |   208 +
 .../default_thread_map_volta_tensor_op.h      |   228 +
 .../default_thread_map_wmma_tensor_op.h       |   113 +
 .../direct_store_epilogue_iterator.h          |   142 +
 .../cutlass/epilogue/threadblock/epilogue.h   |   543 +
 .../epilogue/threadblock/epilogue_base.h      |   240 +
 .../threadblock/epilogue_base_streamk.h       |   197 +
 .../epilogue/threadblock/epilogue_depthwise.h |   335 +
 .../threadblock/epilogue_direct_store.h       |   347 +
 .../threadblock/epilogue_gemm_k_reduction.h   |   212 +
 .../threadblock/epilogue_planar_complex.h     |   401 +
 .../threadblock/epilogue_smem_accumulator.h   |   230 +
 .../epilogue_streamk_with_broadcast.h         |   443 +
 .../epilogue_visitor_with_softmax.h           |   513 +
 .../threadblock/epilogue_with_absmax.h        |   923 +
 .../threadblock/epilogue_with_broadcast.h     |  1718 +
 .../threadblock/epilogue_with_reduction.h     |   823 +
 .../threadblock/epilogue_with_visitor.h       |   409 +
 .../epilogue_with_visitor_callbacks.h         |   504 +
 .../epilogue/threadblock/epilogue_workspace.h |   197 +
 .../threadblock/fusion/visitor_2x.hpp         |   433 +
 .../threadblock/fusion/visitor_compute.hpp    |   109 +
 .../threadblock/fusion/visitor_load.hpp       |   583 +
 .../threadblock/fusion/visitor_store.hpp      |   805 +
 .../epilogue/threadblock/fusion/visitors.hpp  |    38 +
 .../threadblock/interleaved_epilogue.h        |   407 +
 .../threadblock/output_iterator_parameter.h   |   223 +
 .../threadblock/output_tile_thread_map.h      |   628 +
 .../threadblock/predicated_tile_iterator.h    |  1387 +
 .../predicated_tile_iterator_affine.h         |   615 +
 ...cated_tile_iterator_affine_layout_params.h |   156 +
 .../predicated_tile_iterator_blas3.h          |   633 +
 .../predicated_tile_iterator_conv.h           |   562 +
 .../predicated_tile_iterator_direct_conv.h    |   445 +
 .../predicated_tile_iterator_params.h         |   483 +
 .../predicated_tile_iterator_predicates.h     |   309 +
 .../predicated_tile_iterator_strided_dgrad.h  |   479 +
 .../threadblock/shared_load_iterator.h        |   223 +
 .../threadblock/shared_load_iterator_mixed.h  |   594 +
 .../shared_load_iterator_pitch_linear.h       |   194 +
 .../fragment_iterator_complex_tensor_op.h     |   187 +
 ...ment_iterator_gaussian_complex_tensor_op.h |   194 +
 .../epilogue/warp/fragment_iterator_simt.h    |   164 +
 .../warp/fragment_iterator_tensor_op.h        |   378 +
 .../warp/fragment_iterator_volta_tensor_op.h  |   269 +
 .../warp/fragment_iterator_wmma_tensor_op.h   |   164 +
 .../cutlass/epilogue/warp/simt_policy.h       |   107 +
 .../cutlass/epilogue/warp/tensor_op_policy.h  |   189 +
 .../epilogue/warp/tile_iterator_simt.h        |   785 +
 .../epilogue/warp/tile_iterator_tensor_op.h   |   671 +
 .../warp/tile_iterator_tensor_op_mixed.h      |  1081 +
 .../warp/tile_iterator_volta_tensor_op.h      |   440 +
 .../warp/tile_iterator_wmma_tensor_op.h       |   227 +
 .../epilogue/warp/volta_tensor_op_policy.h    |   195 +
 .../epilogue/warp/wmma_tensor_op_policy.h     |   101 +
 .../cutlass/include/cutlass/fast_math.h       |  1067 +
 .../cutlass/include/cutlass/float8.h          |  1284 +
 .../include/cutlass/floating_point_nvrtc.h    |    98 +
 .../cutlass/include/cutlass/functional.h      |   930 +
 .../gemm/collective/builders/sm90_common.inl  |   419 +
 .../collective/builders/sm90_gmma_builder.inl |  1048 +
 .../builders/sm90_sparse_config.inl           |   268 +
 .../builders/sm90_sparse_gmma_builder.inl     |   388 +
 .../gemm/collective/collective_builder.hpp    |    42 +
 .../collective/collective_builder_decl.hpp    |    88 +
 .../gemm/collective/collective_mma.hpp        |    49 +
 .../gemm/collective/collective_mma_decl.hpp   |    64 +
 .../gemm/collective/fp8_accumulation.hpp      |   121 +
 .../gemm/collective/sm70_mma_twostage.hpp     |   597 +
 .../gemm/collective/sm80_mma_multistage.hpp   |   707 +
 ..._mma_array_tma_gmma_ss_warpspecialized.hpp |   759 +
 ...mma_multistage_gmma_rs_warpspecialized.hpp |   677 +
 ...mma_multistage_gmma_ss_warpspecialized.hpp |   509 +
 .../sm90_mma_tma_gmma_rs_warpspecialized.hpp  |   752 +
 ...ma_gmma_rs_warpspecialized_mixed_input.hpp |  1560 +
 .../gemm/collective/sm90_mma_tma_gmma_ss.hpp  |   539 +
 .../sm90_mma_tma_gmma_ss_warpspecialized.hpp  |   582 +
 ...90_mma_tma_gmma_ss_warpspecialized_fp8.hpp |   584 +
 ...sparse_mma_tma_gmma_ss_warpspecialized.hpp |   724 +
 .../cutlass/gemm/device/base_grouped.h        |   478 +
 .../gemm/device/default_gemm_configuration.h  |   955 +
 .../include/cutlass/gemm/device/ell_gemm.h    |   849 +
 .../include/cutlass/gemm/device/gemm.h        |   772 +
 .../include/cutlass/gemm/device/gemm_array.h  |   738 +
 .../cutlass/gemm/device/gemm_batched.h        |   704 +
 .../cutlass/gemm/device/gemm_complex.h        |   718 +
 .../cutlass/gemm/device/gemm_grouped.h        |    61 +
 .../device/gemm_layernorm_mainloop_fusion.h   |   385 +
 .../include/cutlass/gemm/device/gemm_sparse.h |   515 +
 .../gemm/device/gemm_sparse_universal.h       |   211 +
 .../gemm_sparse_universal_with_absmax.h       |   202 +
 .../gemm/device/gemm_sparse_with_absmax.h     |   360 +
 .../gemm/device/gemm_sparse_with_visitor.h    |   342 +
 .../gemm/device/gemm_splitk_parallel.h        |   636 +
 .../cutlass/gemm/device/gemm_universal.h      |   442 +
 .../gemm/device/gemm_universal_adapter.h      |   693 +
 .../cutlass/gemm/device/gemm_universal_base.h |   522 +
 .../gemm_universal_streamk_with_broadcast.h   |   386 +
 .../gemm/device/gemm_universal_with_absmax.h  |   404 +
 .../device/gemm_universal_with_broadcast.h    |   386 +
 .../gemm/device/gemm_with_k_reduction.h       |   415 +
 .../include/cutlass/gemm/device/gemv.h        |   182 +
 .../include/cutlass/gemm/device/rank_2k.h     |   548 +
 .../cutlass/gemm/device/rank_2k_grouped.h     |    63 +
 .../include/cutlass/gemm/device/rank_k.h      |   510 +
 .../include/cutlass/gemm/device/symm.h        |   603 +
 .../include/cutlass/gemm/device/trmm.h        |   759 +
 .../include/cutlass/gemm/dispatch_policy.hpp  |   324 +
 .../cutlass/include/cutlass/gemm/gemm.h       |   133 +
 .../cutlass/gemm/gemm_enumerated_types.h      |    80 +
 .../gemm/group_array_problem_shape.hpp        |   123 +
 .../cutlass/gemm/kernel/default_ell_gemm.h    |   837 +
 .../cutlass/gemm/kernel/default_gemm.h        |  1189 +
 .../gemm/kernel/default_gemm_complex.h        |   404 +
 .../gemm/kernel/default_gemm_grouped.h        |   384 +
 ...ult_gemm_grouped_softmax_mainloop_fusion.h |   164 +
 .../default_gemm_layernorm_mainloop_fusion.h  |   137 +
 .../default_gemm_planar_complex_universal.h   |   352 +
 .../cutlass/gemm/kernel/default_gemm_sparse.h |   252 +
 .../kernel/default_gemm_sparse_universal.h    |   141 +
 ...efault_gemm_sparse_universal_with_absmax.h |   144 +
 .../kernel/default_gemm_sparse_with_absmax.h  |   157 +
 .../kernel/default_gemm_sparse_with_visitor.h |   197 +
 .../kernel/default_gemm_splitk_parallel.h     |   136 +
 .../default_gemm_streamk_with_broadcast.h     |   146 +
 .../gemm/kernel/default_gemm_universal.h      |   396 +
 .../default_gemm_universal_with_visitor.h     |   157 +
 .../gemm/kernel/default_gemm_with_absmax.h    |   143 +
 .../gemm/kernel/default_gemm_with_broadcast.h |   243 +
 .../kernel/default_gemm_with_k_reduction.h    |   150 +
 .../gemm/kernel/default_gemm_with_reduction.h |   246 +
 .../cutlass/gemm/kernel/default_gemv.h        |   132 +
 .../cutlass/gemm/kernel/default_rank_2k.h     |   285 +
 .../gemm/kernel/default_rank_2k_complex.h     |   498 +
 .../gemm/kernel/default_rank_2k_grouped.h     |   355 +
 .../gemm/kernel/default_rank_2k_universal.h   |   346 +
 .../cutlass/gemm/kernel/default_rank_k.h      |   247 +
 .../gemm/kernel/default_rank_k_complex.h      |   429 +
 .../gemm/kernel/default_rank_k_universal.h    |   305 +
 .../cutlass/gemm/kernel/default_symm.h        |   321 +
 .../gemm/kernel/default_symm_complex.h        |   508 +
 .../gemm/kernel/default_symm_universal.h      |   342 +
 .../cutlass/gemm/kernel/default_trmm.h        |   269 +
 .../gemm/kernel/default_trmm_complex.h        |   265 +
 .../gemm/kernel/default_trmm_universal.h      |   359 +
 .../include/cutlass/gemm/kernel/ell_gemm.h    |   824 +
 .../include/cutlass/gemm/kernel/gemm.h        |   380 +
 .../include/cutlass/gemm/kernel/gemm_array.h  |   264 +
 .../cutlass/gemm/kernel/gemm_batched.h        |   273 +
 .../cutlass/gemm/kernel/gemm_grouped.h        |   457 +
 .../kernel/gemm_grouped_problem_visitor.h     |   121 +
 .../gemm_grouped_softmax_mainloop_fusion.h    |   481 +
 .../kernel/gemm_layernorm_mainloop_fusion.h   |   782 +
 .../include/cutlass/gemm/kernel/gemm_params.h |   189 +
 .../cutlass/gemm/kernel/gemm_pipelined.h      |   158 +
 .../cutlass/gemm/kernel/gemm_planar_complex.h |   715 +
 .../gemm/kernel/gemm_planar_complex_array.h   |   609 +
 .../gemm/kernel/gemm_sparse_universal.h       |   804 +
 .../gemm_sparse_universal_with_absmax.h       |   609 +
 .../gemm/kernel/gemm_splitk_parallel.h        |   253 +
 .../kernel/gemm_streamk_with_fused_epilogue.h |  2396 +
 .../gemm/kernel/gemm_transpose_operands.h     |   124 +
 .../cutlass/gemm/kernel/gemm_universal.h      |   702 +
 .../cutlass/gemm/kernel/gemm_universal.hpp    |    66 +
 .../cutlass/gemm/kernel/gemm_universal_decl.h |    61 +
 .../gemm/kernel/gemm_universal_streamk.h      |  1168 +
 .../gemm/kernel/gemm_universal_with_visitor.h |   321 +
 .../gemm_universal_with_visitor_streamk.h     |   895 +
 .../cutlass/gemm/kernel/gemm_with_absmax.h    |   759 +
 .../gemm/kernel/gemm_with_fused_epilogue.h    |  1512 +
 .../gemm/kernel/gemm_with_k_reduction.h       |   704 +
 .../include/cutlass/gemm/kernel/gemv.h        |   638 +
 .../gemm/kernel/gemv_batched_strided.h        |   244 +
 .../gemm/kernel/grouped_problem_visitor.h     |   463 +
 .../cutlass/gemm/kernel/params_sparse_base.h  |   115 +
 .../gemm/kernel/params_universal_base.h       |   264 +
 .../cutlass/gemm/kernel/rank_2k_grouped.h     |   688 +
 .../kernel/rank_2k_grouped_problem_visitor.h  |   376 +
 .../gemm/kernel/rank_2k_transpose_operands.h  |   129 +
 .../cutlass/gemm/kernel/rank_2k_universal.h   |   769 +
 .../cutlass/gemm/kernel/rank_k_universal.h    |   556 +
 .../include/cutlass/gemm/kernel/sm70_gemm.hpp |   270 +
 ..._array_tma_warpspecialized_cooperative.hpp |   881 +
 ...emm_array_tma_warpspecialized_pingpong.hpp |   946 +
 .../cutlass/gemm/kernel/sm90_gemm_tma.hpp     |   306 +
 .../kernel/sm90_gemm_tma_warpspecialized.hpp  |   522 +
 ...0_gemm_tma_warpspecialized_cooperative.hpp |   671 +
 ...sm90_gemm_tma_warpspecialized_pingpong.hpp |   664 +
 .../gemm/kernel/sm90_gemm_warpspecialized.hpp |   417 +
 .../sm90_gemm_warpspecialized_cooperative.hpp |   504 +
 .../sm90_gemm_warpspecialized_pingpong.hpp    |   516 +
 .../gemm/kernel/sm90_tile_scheduler.hpp       |   139 +
 .../gemm/kernel/sm90_tile_scheduler_group.hpp |   510 +
 .../kernel/sm90_tile_scheduler_stream_k.hpp   |   960 +
 .../include/cutlass/gemm/kernel/sparse_gemm.h |   394 +
 .../gemm/kernel/sparse_gemm_with_absmax.h     |   509 +
 .../gemm/kernel/sparse_gemm_with_visitor.h    |   238 +
 .../gemm/kernel/static_tile_scheduler.hpp     |   502 +
 .../cutlass/gemm/kernel/symm_universal.h      |   675 +
 .../cutlass/gemm/kernel/tile_scheduler.hpp    |   149 +
 .../gemm/kernel/tile_scheduler_params.h       |  1535 +
 .../cutlass/gemm/kernel/trmm_universal.h      |   580 +
 .../cutlass/include/cutlass/gemm/thread/mma.h |    90 +
 .../include/cutlass/gemm/thread/mma_sm50.h    |   538 +
 .../include/cutlass/gemm/thread/mma_sm60.h    |  1161 +
 .../include/cutlass/gemm/thread/mma_sm61.h    |   284 +
 .../gemm/threadblock/default_ell_mma.h        |   734 +
 .../gemm/threadblock/default_gemv_core.h      |   151 +
 .../cutlass/gemm/threadblock/default_mma.h    |   823 +
 .../gemm/threadblock/default_mma_core.h       |   116 +
 .../gemm/threadblock/default_mma_core_simt.h  |  1723 +
 .../gemm/threadblock/default_mma_core_sm70.h  |   682 +
 .../gemm/threadblock/default_mma_core_sm75.h  |  1315 +
 .../gemm/threadblock/default_mma_core_sm80.h  |  2951 +
 .../default_mma_core_sparse_sm80.h            |   876 +
 .../default_mma_core_with_access_size.h       |   328 +
 .../default_mma_core_with_reduction.h         |   167 +
 .../gemm/threadblock/default_mma_core_wmma.h  |   712 +
 .../default_mma_layernorm_mainloop_fusion.h   |   178 +
 .../default_mma_planar_complex_multistage.h   |   136 +
 .../default_mma_planar_complex_pipelined.h    |   130 +
 .../default_mma_softmax_mainloop_fusion.h     |   160 +
 .../threadblock/default_mma_with_reduction.h  |   141 +
 .../default_multistage_mma_complex.h          |   159 +
 .../default_multistage_mma_complex_core.h     |   119 +
 ...default_multistage_mma_complex_core_sm80.h |  1808 +
 .../default_multistage_trmm_complex.h         |   556 +
 .../gemm/threadblock/default_sparse_mma.h     |   196 +
 .../cutlass/gemm/threadblock/default_trmm.h   |   445 +
 .../gemm/threadblock/ell_mma_multistage.h     |   648 +
 .../gemm/threadblock/ell_mma_pipelined.h      |   376 +
 .../include/cutlass/gemm/threadblock/gemv.h   |   147 +
 .../cutlass/gemm/threadblock/index_remat.h    |   107 +
 .../cutlass/gemm/threadblock/mma_base.h       |   236 +
 .../gemm/threadblock/mma_blas3_multistage.h   |   707 +
 ...mma_layernorm_mainloop_fusion_multistage.h |   863 +
 .../cutlass/gemm/threadblock/mma_multistage.h |   741 +
 .../cutlass/gemm/threadblock/mma_pipelined.h  |   439 +
 .../threadblock/mma_planar_complex_base.h     |   208 +
 .../mma_planar_complex_multistage.h           |   646 +
 .../mma_planar_complex_pipelined.h            |   424 +
 .../gemm/threadblock/mma_singlestage.h        |   265 +
 .../mma_softmax_mainloop_fusion_multistage.h  |   756 +
 .../gemm/threadblock/mma_sparse_base.h        |   273 +
 .../gemm/threadblock/mma_sparse_multistage.h  |   668 +
 .../mma_with_reduction_multistage.h           |   545 +
 .../gemm/threadblock/threadblock_swizzle.h    |   459 +
 .../threadblock/threadblock_swizzle_streamk.h |   801 +
 .../gemm/warp/default_mma_complex_tensor_op.h |   612 +
 .../gemm/warp/default_mma_sparse_tensor_op.h  |   165 +
 .../cutlass/gemm/warp/default_mma_tensor_op.h |   123 +
 .../gemm/warp/default_mma_tensor_op_sm80.h    |   375 +
 .../default_mma_with_reduction_tensor_op.h    |    92 +
 .../gemm/warp/default_mma_wmma_tensor_op.h    |   130 +
 .../warp/layernorm_scale_bias_transform.h     |   139 +
 .../cutlass/include/cutlass/gemm/warp/mma.h   |    60 +
 .../cutlass/gemm/warp/mma_complex_tensor_op.h |  1168 +
 .../warp/mma_complex_tensor_op_fast_f32.h     |   663 +
 ...mma_complex_tensor_op_tile_iterator_sm80.h |  2485 +
 .../warp/mma_gaussian_complex_tensor_op.h     |   642 +
 ...ian_complex_tensor_op_tile_iterator_sm80.h |   390 +
 .../gemm/warp/mma_mixed_input_tensor_op.h     |   566 +
 .../cutlass/gemm/warp/mma_planar_complex.h    |   182 +
 .../include/cutlass/gemm/warp/mma_simt.h      |   263 +
 .../cutlass/gemm/warp/mma_simt_policy.h       |    69 +
 .../gemm/warp/mma_simt_tile_iterator.h        |  1890 +
 .../cutlass/gemm/warp/mma_sparse_tensor_op.h  |   382 +
 .../include/cutlass/gemm/warp/mma_tensor_op.h |   415 +
 .../gemm/warp/mma_tensor_op_fast_f32.h        |   471 +
 .../warp/mma_tensor_op_fragment_iterator.h    |   559 +
 .../cutlass/gemm/warp/mma_tensor_op_policy.h  |    65 +
 .../cutlass/gemm/warp/mma_tensor_op_sm70.h    |   280 +
 .../warp/mma_tensor_op_tile_access_iterator.h |   362 +
 .../gemm/warp/mma_tensor_op_tile_iterator.h   |  4803 ++
 .../warp/mma_tensor_op_tile_iterator_sm70.h   |  3098 +
 .../warp/mma_tensor_op_tile_iterator_sm80.h   |  2441 +
 .../warp/mma_tensor_op_tile_iterator_sparse.h |   380 +
 .../warp/mma_tensor_op_tile_iterator_wmma.h   |   805 +
 .../cutlass/gemm/warp/mma_tensor_op_wmma.h    |   223 +
 .../gemm/warp/mma_with_reduction_tensor_op.h  |   449 +
 .../gemm/warp/scale_bias_tile_iterator.h      |   572 +
 .../gemm/warp/softmax_scale_bias_transform.h  |   117 +
 .../gemm/warp/tile_iterator_planar_complex.h  |   250 +
 .../cutlass/include/cutlass/gemm_coord.h      |   394 +
 .../cutlass/include/cutlass/gemm_coord.hpp    |    66 +
 .../cutlass/include/cutlass/half.h            |   930 +
 .../cutlass/include/cutlass/integer_subbyte.h |   280 +
 .../include/cutlass/kernel_hardware_info.h    |    76 +
 .../include/cutlass/kernel_hardware_info.hpp  |    35 +
 .../cutlass/include/cutlass/kernel_launch.h   |   141 +
 .../cutlass/include/cutlass/layout/layout.h   |    64 +
 .../cutlass/include/cutlass/layout/matrix.h   |  1349 +
 .../cutlass/include/cutlass/layout/permute.h  |   828 +
 .../include/cutlass/layout/pitch_linear.h     |   149 +
 .../cutlass/include/cutlass/layout/tensor.h   |   648 +
 .../layout/tensor_op_multiplicand_sm70.h      |  1044 +
 .../layout/tensor_op_multiplicand_sm75.h      |  1169 +
 .../layout/tensor_op_multiplicand_sm80.h      |  1139 +
 .../cutlass/include/cutlass/layout/vector.h   |   105 +
 .../cutlass/include/cutlass/matrix.h          | 14129 ++++
 .../cutlass/include/cutlass/matrix_coord.h    |   164 +
 .../cutlass/include/cutlass/matrix_shape.h    |    65 +
 .../include/cutlass/numeric_conversion.h      |  4547 ++
 .../cutlass/include/cutlass/numeric_size.h    |    83 +
 .../cutlass/include/cutlass/numeric_types.h   |    88 +
 .../include/cutlass/pipeline/pipeline.hpp     |    36 +
 .../cutlass/pipeline/sm90_pipeline.hpp        |  1173 +
 .../include/cutlass/pitch_linear_coord.h      |   181 +
 .../include/cutlass/platform/platform.h       |   913 +
 .../include/cutlass/predicate_vector.h        |   547 +
 .../cutlass/include/cutlass/quaternion.h      |   752 +
 .../cutlass/include/cutlass/real.h            |    61 +
 .../cutlass/reduction/device/reduce_split_k.h |   232 +
 .../cutlass/reduction/device/tensor_reduce.h  |   264 +
 .../device/tensor_reduce_affine_contiguous.h  |   374 +
 .../device/tensor_reduce_affine_strided.h     |   362 +
 .../reduction/kernel/reduce_softmax_final.h   |   267 +
 .../cutlass/reduction/kernel/reduce_split_k.h |   248 +
 .../kernel/tensor_reduce_affine_contiguous.h  |   606 +
 .../kernel/tensor_reduce_affine_strided.h     |   641 +
 .../include/cutlass/reduction/thread/reduce.h |   234 +
 .../reduction/thread/reduction_operators.h    |   235 +
 .../cutlass/reduction/threadblock_swizzle.h   |    67 +
 .../include/cutlass/relatively_equal.h        |   275 +
 .../cutlass/include/cutlass/semaphore.h       |   118 +
 .../include/cutlass/subbyte_reference.h       |  1388 +
 .../cutlass/include/cutlass/tensor_coord.h    |   326 +
 .../cutlass/include/cutlass/tensor_ref.h      |   419 +
 .../cutlass/tensor_ref_planar_complex.h       |   374 +
 .../cutlass/include/cutlass/tensor_view.h     |   297 +
 .../cutlass/tensor_view_planar_complex.h      |   301 +
 .../cutlass/include/cutlass/tfloat32.h        |   478 +
 .../cutlass/include/cutlass/thread/matrix.h   |   198 +
 .../cutlass/include/cutlass/trace.h           |    59 +
 .../collective/sm90_wgmma_transpose.hpp       |   754 +
 .../device/transform_universal_adapter.hpp    |   303 +
 .../kernel/filter_format_transformer.hpp      |   223 +
 .../kernel/sm90_sparse_gemm_compressor.hpp    |   578 +
 .../kernel/sparse_gemm_compressor.hpp         |   284 +
 .../transform/pitch_linear_thread_map.h       |   926 +
 .../cutlass/transform/thread/transpose.h      |   107 +
 .../cutlass/transform/thread/unary_op.h       |   105 +
 .../transform/threadblock/ell_iterator.h      |   199 +
 .../ell_predicated_tile_access_iterator.h     |  1350 +
 .../ell_predicated_tile_iterator.h            |  1315 +
 ...icated_scale_bias_vector_access_iterator.h |   375 +
 .../predicated_scale_bias_vector_iterator.h   |   328 +
 .../predicated_tile_access_iterator.h         |  2118 +
 ...icated_tile_access_iterator_2dthreadtile.h |   834 +
 .../predicated_tile_access_iterator_params.h  |   290 +
 ...d_tile_access_iterator_triangular_matrix.h |   892 +
 .../threadblock/predicated_tile_iterator.h    |  1887 +
 .../predicated_tile_iterator_2dthreadtile.h   |   787 +
 ...edicated_tile_iterator_triangular_matrix.h |   818 +
 .../predicated_vector_access_iterator.h       |   417 +
 ...egular_scale_bias_vector_access_iterator.h |   253 +
 .../regular_tile_access_iterator.h            |    58 +
 ...egular_tile_access_iterator_pitch_linear.h |   408 +
 ...access_iterator_pitch_linear_direct_conv.h |   587 +
 .../regular_tile_access_iterator_tensor_op.h  |   821 +
 ...ular_tile_access_iterator_tensor_op_sm80.h |  1532 +
 .../threadblock/regular_tile_iterator.h       |    62 +
 .../regular_tile_iterator_pitch_linear.h      |   552 +
 ..._tile_iterator_pitch_linear_2dthreadtile.h |   509 +
 .../regular_tile_iterator_tensor_op.h         |  1107 +
 .../regular_tile_iterator_tensor_op_sm70.h    |  1460 +
 .../transform/threadblock/vector_iterator.h   |   149 +
 .../transform/warp/vector_fragment_iterator.h |   283 +
 .../cutlass/include/cutlass/uint128.h         |   270 +
 .../cutlass/include/cutlass/version.h         |    80 +
 .../cutlass/include/cutlass/wmma_array.h      |   133 +
 .../cutlass/include/cutlass/workspace.h       |   150 +
 .../include/cutlass_extensions/common.hpp     |    48 +
 .../epilogue/broadcast_load_epilogue_c3x.hpp  |   447 +
 .../epilogue/scaled_mm_epilogues_c3x.hpp      |   286 +
 lightllm-kernel/include/ops_common.h          |    66 +-
 lightllm-kernel/include/reduce/sm70.cuh       |   191 +
 lightllm-kernel/include/utils.h               |   267 +
 .../lightllm_kernel/ops/__init__.py           |    51 +-
 lightllm-kernel/lightllm_kernel/ops/fusion.py |    22 +
 lightllm-kernel/lightllm_kernel/ops/gemm.py   |     8 +
 lightllm-kernel/lightllm_kernel/ops/norm.py   |     7 +
 lightllm-kernel/lightllm_kernel/ops/quant.py  |    10 +
 lightllm-kernel/setup.py                      |    45 +-
 lightllm-kernel/test/__init__.py              |     0
 .../test/fusion/add_norm_quant_test.py        |    70 +
 .../test/fusion/gelu_per_token_quant_test.py  |    50 +
 .../test/fusion/post_tp_norm_test.py          |    54 +
 .../test/fusion/pre_tp_norm_test.py           |    46 +
 .../test/gemm/cutlass_scaled_mm_test.py       |    80 +
 lightllm-kernel/test/norm/rmsnorm_test.py     |    45 +
 lightllm-kernel/test/quant/quant_test.py      |    47 +
 lightllm-kernel/test/utils.py                 |   125 +
 702 files changed, 554067 insertions(+), 112 deletions(-)
 create mode 100755 lightllm-kernel/csrc/fusion/add_norm_quant.cu
 create mode 100755 lightllm-kernel/csrc/fusion/gelu_per_token_quant.cu
 create mode 100755 lightllm-kernel/csrc/fusion/post_tp_norm.cu
 create mode 100755 lightllm-kernel/csrc/fusion/pre_tp_norm.cu
 create mode 100755 lightllm-kernel/csrc/gemm/Epilogues.md
 create mode 100755 lightllm-kernel/csrc/gemm/scaled_mm_c3x.cu
 create mode 100755 lightllm-kernel/csrc/gemm/scaled_mm_c3x.cuh
 create mode 100755 lightllm-kernel/csrc/gemm/scaled_mm_c3x_sm90_fp8_dispatch.cuh
 create mode 100755 lightllm-kernel/csrc/gemm/scaled_mm_entry.cu
 delete mode 100644 lightllm-kernel/csrc/moe/grouped_topk_interface.cpp
 create mode 100755 lightllm-kernel/csrc/norm/rmsnorm_bf16.cu
 create mode 100755 lightllm-kernel/csrc/quant/per_token_quantize_bf16.cu
 create mode 100755 lightllm-kernel/cutlass/include/cute/algorithm/axpby.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/algorithm/clear.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/algorithm/cooperative_copy.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/algorithm/cooperative_gemm.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/algorithm/copy.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/algorithm/fill.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/algorithm/functional.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/algorithm/gemm.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/algorithm/prefer.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/algorithm/prefetch.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/algorithm/tensor_algorithms.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/algorithm/tuple_algorithms.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/arch/cluster_sm90.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/arch/config.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/arch/copy.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/arch/copy_sm50.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/arch/copy_sm75.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/arch/copy_sm80.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/arch/copy_sm90.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/arch/copy_sm90_desc.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/arch/copy_sm90_tma.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/arch/mma.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/arch/mma_sm61.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/arch/mma_sm70.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/arch/mma_sm75.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/arch/mma_sm80.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/arch/mma_sm90.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/arch/mma_sm90_desc.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/arch/mma_sm90_gmma.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/arch/mma_sm90_gmma_ext.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/arch/mma_sm90_gmma_sparse.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/arch/mma_sm90_gmma_sparse_ext.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/arch/util.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/atom/copy_atom.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/atom/copy_traits.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm50.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm75.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm80.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm90.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm90_im2col.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm90_tma.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm90_tma_swizzle.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/atom/mma_atom.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/atom/mma_traits.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm61.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm70.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm75.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm80.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90_gmma.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90_gmma_ext.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90_gmma_sparse.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90_gmma_sparse_ext.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/config.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/container/alignment.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/container/array.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/container/array_aligned.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/container/array_subbyte.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/container/bit_field.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/container/cuda_types.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/container/packed_tuple.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/container/tuple.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/container/type_list.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/int_tuple.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/layout.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/layout_composed.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/numeric/arithmetic_tuple.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/numeric/complex.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/numeric/int.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/numeric/integer_sequence.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/numeric/integral_constant.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/numeric/integral_ratio.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/numeric/math.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/numeric/numeric_types.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/numeric/real.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/pointer.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/pointer_base.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/pointer_flagged.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/pointer_sparse.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/pointer_swizzle.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/stride.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/swizzle.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/swizzle_layout.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/tensor.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/tensor_impl.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/tensor_predicate.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/tensor_zip.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/underscore.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/util/debug.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/util/print.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cute/util/type_traits.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/aligned_buffer.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/arch.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/barrier.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/cache_operation.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/config.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/grid_dependency_control.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/memory.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/memory_sm75.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/memory_sm80.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/mma.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/mma_sm50.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/mma_sm60.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/mma_sm61.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/mma_sm70.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/mma_sm75.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/mma_sm80.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/mma_sm89.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/mma_sm90.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/mma_sparse_sm80.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/mma_sparse_sm89.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/reg_reconfig.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/simd.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/simd_sm60.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/simd_sm61.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/synclog.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/wmma.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/wmma_sm70.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/wmma_sm72.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/wmma_sm75.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/array.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/array_planar_complex.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/array_subbyte.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/barrier.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/bfloat16.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/blas3.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/blas3_types.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/block_striped.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/cluster_launch.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/complex.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/constants.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/collective/builders/sm90_common.inl
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/collective/builders/sm90_gmma_builder.inl
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/collective/collective_builder.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/collective/collective_conv.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/collective/detail.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/collective/sm90_implicit_gemm_gmma_ss_warpspecialized.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/conv2d_problem_size.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/conv3d_problem_size.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/convnd_problem_shape.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/convolution.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/detail.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/device/conv_universal_adapter.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/device/direct_convolution.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/device/implicit_gemm_convolution.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/device/implicit_gemm_convolution_fusion.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/dispatch_policy.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/conv_universal.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_dgrad.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_fusion.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_with_absmax.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_with_broadcast.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_with_reduction.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_group_fprop.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_wgrad.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_wgrad_fusion.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_dgrad.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_fprop.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_fprop_fusion.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_fprop_with_broadcast.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_wgrad.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_deconv2d.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_deconv2d_with_broadcast.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_deconv3d.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_deconv3d_with_broadcast.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_depthwise_fprop.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/direct_convolution.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_fusion.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_strided_dgrad.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_with_absmax.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/sm90_implicit_gemm_tma_warpspecialized.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/thread/depthwise_mma.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_few_channels.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_fixed_channels.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_params.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_tile_iterator.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_optimized.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_params.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_direct_conv_params.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_fixed_stride_dilation.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_optimized.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_direct_conv_multistage.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_filter_tile_access_iterator_direct_conv_optimized.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_pipelined.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_mma_base.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_mma_core_with_lane_access_size.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/implicit_gemm_fprop_fusion_multistage.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/implicit_gemm_multistage.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/implicit_gemm_wgrad_fusion_multistage.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/predicated_scale_bias_vector_access_iterator.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/predicated_scale_bias_vector_iterator.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/threadblock_swizzle.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/warp/mma_depthwise_simt.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/warp/mma_depthwise_simt_tile_iterator.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/warp/scale_bias_relu_transform.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/coord.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/core_io.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/cuda_host_adapter.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/cutlass.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/detail/collective.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/detail/dependent_false.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/detail/helper_macros.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/detail/layout.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/detail/mma.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/device_kernel.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/builders/sm90_builder.inl
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/builders/sm90_common.inl
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/collective_builder.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/collective_epilogue.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/default_epilogue.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/default_epilogue_array.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/detail.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/epilogue_tensor_broadcast.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm70_epilogue_vectorized_array.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized_bias_elementwise.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/dispatch_policy.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/callbacks.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/operations.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_store_tma_warpspecialized.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_topk_softmax.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/activation.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/conversion_op.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/detail.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_bias_elementwise.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_bias_relu.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_clamp.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_dgelu.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_drelu.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_gelu.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_generic.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_generic_with_scaling.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_hardswish.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_leaky_relu.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_params.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_planar_complex.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_relu.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_relu0.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_residual_block.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_sigmoid.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_silu.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_tensor_broadcast.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_with_elementwise.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/reduction_op.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/scale_type.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op_blas3.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_direct_store.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_planar_complex.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_simt.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op_blas3.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_with_absmax.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_with_reduction.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_simt.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_volta_tensor_op.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/direct_store_epilogue_iterator.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_base.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_base_streamk.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_depthwise.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_direct_store.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_gemm_k_reduction.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_planar_complex.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_smem_accumulator.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_streamk_with_broadcast.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_visitor_with_softmax.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_absmax.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_broadcast.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_reduction.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_visitor.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_visitor_callbacks.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_workspace.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitor_2x.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitor_compute.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitor_load.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitor_store.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitors.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/interleaved_epilogue.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/output_iterator_parameter.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/output_tile_thread_map.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_affine.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_affine_layout_params.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_blas3.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_conv.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_direct_conv.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_params.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_predicates.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_strided_dgrad.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/shared_load_iterator.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/shared_load_iterator_mixed.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/shared_load_iterator_pitch_linear.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_simt.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_tensor_op.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_volta_tensor_op.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/simt_policy.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tensor_op_policy.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_simt.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_tensor_op.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_volta_tensor_op.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_wmma_tensor_op.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/volta_tensor_op_policy.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/wmma_tensor_op_policy.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/fast_math.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/float8.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/floating_point_nvrtc.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/functional.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/builders/sm90_common.inl
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/builders/sm90_gmma_builder.inl
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/builders/sm90_sparse_config.inl
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/builders/sm90_sparse_gmma_builder.inl
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/collective_builder.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/collective_builder_decl.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/collective_mma.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/collective_mma_decl.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/fp8_accumulation.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm70_mma_twostage.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm80_mma_multistage.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_rs_warpspecialized.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_ss_warpspecialized.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized_mixed_input.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_sparse_mma_tma_gmma_ss_warpspecialized.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/base_grouped.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/default_gemm_configuration.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/ell_gemm.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_array.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_batched.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_complex.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_grouped.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_layernorm_mainloop_fusion.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse_universal.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse_universal_with_absmax.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse_with_absmax.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse_with_visitor.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_splitk_parallel.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_adapter.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_base.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_streamk_with_broadcast.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_with_absmax.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_with_broadcast.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_with_k_reduction.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemv.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/rank_2k.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/rank_2k_grouped.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/rank_k.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/symm.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/trmm.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/dispatch_policy.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/gemm.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/gemm_enumerated_types.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/group_array_problem_shape.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_ell_gemm.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_complex.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_grouped.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_grouped_softmax_mainloop_fusion.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_layernorm_mainloop_fusion.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_planar_complex_universal.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse_universal.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse_universal_with_absmax.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse_with_absmax.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse_with_visitor.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_splitk_parallel.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_streamk_with_broadcast.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_universal.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_universal_with_visitor.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_with_absmax.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_with_broadcast.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_with_k_reduction.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_with_reduction.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemv.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_2k.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_2k_complex.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_2k_grouped.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_2k_universal.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_k.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_k_complex.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_k_universal.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_symm.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_symm_complex.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_symm_universal.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_trmm.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_trmm_complex.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_trmm_universal.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/ell_gemm.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_array.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_batched.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_grouped.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_grouped_problem_visitor.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_grouped_softmax_mainloop_fusion.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_layernorm_mainloop_fusion.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_params.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_pipelined.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_planar_complex.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_planar_complex_array.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_sparse_universal.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_sparse_universal_with_absmax.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_splitk_parallel.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_streamk_with_fused_epilogue.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_transpose_operands.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal_decl.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal_streamk.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal_with_visitor.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal_with_visitor_streamk.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_with_absmax.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_with_fused_epilogue.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_with_k_reduction.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemv.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemv_batched_strided.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/grouped_problem_visitor.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/params_sparse_base.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/params_universal_base.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_2k_grouped.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_2k_grouped_problem_visitor.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_2k_transpose_operands.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_2k_universal.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_k_universal.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm70_gemm.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_cooperative.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_pingpong.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized_cooperative.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized_pingpong.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_tile_scheduler.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_tile_scheduler_group.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_tile_scheduler_stream_k.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sparse_gemm.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sparse_gemm_with_absmax.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sparse_gemm_with_visitor.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/static_tile_scheduler.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/symm_universal.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/tile_scheduler.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/tile_scheduler_params.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/trmm_universal.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/thread/mma.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/thread/mma_sm50.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/thread/mma_sm60.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/thread/mma_sm61.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_ell_mma.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_gemv_core.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_simt.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_sm70.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_sm75.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_sm80.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_sparse_sm80.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_with_access_size.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_with_reduction.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_wmma.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_layernorm_mainloop_fusion.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_planar_complex_multistage.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_planar_complex_pipelined.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_softmax_mainloop_fusion.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_with_reduction.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_multistage_trmm_complex.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_sparse_mma.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_trmm.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/ell_mma_multistage.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/ell_mma_pipelined.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/gemv.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/index_remat.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_base.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_blas3_multistage.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_layernorm_mainloop_fusion_multistage.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_multistage.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_pipelined.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_planar_complex_base.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_planar_complex_multistage.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_planar_complex_pipelined.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_singlestage.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_softmax_mainloop_fusion_multistage.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_sparse_base.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_sparse_multistage.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_with_reduction_multistage.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/threadblock_swizzle.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/threadblock_swizzle_streamk.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_complex_tensor_op.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_sparse_tensor_op.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_tensor_op.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_tensor_op_sm80.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_with_reduction_tensor_op.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_wmma_tensor_op.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/layernorm_scale_bias_transform.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_complex_tensor_op.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_complex_tensor_op_fast_f32.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op_tile_iterator_sm80.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_mixed_input_tensor_op.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_planar_complex.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_simt.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_simt_policy.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_simt_tile_iterator.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_sparse_tensor_op.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_fast_f32.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_policy.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_sm70.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_access_iterator.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sparse.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_wmma.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_wmma.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_with_reduction_tensor_op.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/scale_bias_tile_iterator.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/softmax_scale_bias_transform.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/tile_iterator_planar_complex.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm_coord.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm_coord.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/half.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/integer_subbyte.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/kernel_hardware_info.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/kernel_hardware_info.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/kernel_launch.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/layout/layout.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/layout/matrix.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/layout/permute.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/layout/pitch_linear.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/layout/tensor.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/layout/tensor_op_multiplicand_sm70.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/layout/tensor_op_multiplicand_sm75.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/layout/tensor_op_multiplicand_sm80.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/layout/vector.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/matrix.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/matrix_coord.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/matrix_shape.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/numeric_conversion.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/numeric_size.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/numeric_types.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/pipeline/pipeline.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/pipeline/sm90_pipeline.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/pitch_linear_coord.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/platform/platform.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/predicate_vector.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/quaternion.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/real.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/reduction/device/reduce_split_k.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/reduction/device/tensor_reduce.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/reduction/device/tensor_reduce_affine_contiguous.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/reduction/device/tensor_reduce_affine_strided.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/reduction/kernel/reduce_softmax_final.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/reduction/kernel/reduce_split_k.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/reduction/kernel/tensor_reduce_affine_contiguous.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/reduction/kernel/tensor_reduce_affine_strided.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/reduction/thread/reduce.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/reduction/thread/reduction_operators.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/reduction/threadblock_swizzle.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/relatively_equal.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/semaphore.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/subbyte_reference.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/tensor_coord.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/tensor_ref.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/tensor_ref_planar_complex.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/tensor_view.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/tensor_view_planar_complex.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/tfloat32.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/thread/matrix.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/trace.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/collective/sm90_wgmma_transpose.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/device/transform_universal_adapter.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/kernel/filter_format_transformer.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/kernel/sm90_sparse_gemm_compressor.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/kernel/sparse_gemm_compressor.hpp
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/pitch_linear_thread_map.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/thread/transpose.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/thread/unary_op.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/ell_iterator.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/ell_predicated_tile_access_iterator.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/ell_predicated_tile_iterator.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_scale_bias_vector_access_iterator.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_scale_bias_vector_iterator.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator_2dthreadtile.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator_params.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator_triangular_matrix.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_iterator.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_iterator_triangular_matrix.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_vector_access_iterator.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_scale_bias_vector_access_iterator.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear_direct_conv.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear_2dthreadtile.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op_sm70.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/vector_iterator.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/warp/vector_fragment_iterator.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/uint128.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/version.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/wmma_array.h
 create mode 100755 lightllm-kernel/cutlass/include/cutlass/workspace.h
 create mode 100755 lightllm-kernel/include/cutlass_extensions/common.hpp
 create mode 100755 lightllm-kernel/include/cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp
 create mode 100755 lightllm-kernel/include/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp
 create mode 100755 lightllm-kernel/include/reduce/sm70.cuh
 create mode 100644 lightllm-kernel/include/utils.h
 create mode 100644 lightllm-kernel/lightllm_kernel/ops/fusion.py
 create mode 100644 lightllm-kernel/lightllm_kernel/ops/gemm.py
 create mode 100644 lightllm-kernel/lightllm_kernel/ops/norm.py
 create mode 100644 lightllm-kernel/test/__init__.py
 create mode 100755 lightllm-kernel/test/fusion/add_norm_quant_test.py
 create mode 100644 lightllm-kernel/test/fusion/gelu_per_token_quant_test.py
 create mode 100755 lightllm-kernel/test/fusion/post_tp_norm_test.py
 create mode 100755 lightllm-kernel/test/fusion/pre_tp_norm_test.py
 create mode 100644 lightllm-kernel/test/gemm/cutlass_scaled_mm_test.py
 create mode 100755 lightllm-kernel/test/norm/rmsnorm_test.py
 create mode 100755 lightllm-kernel/test/quant/quant_test.py
 create mode 100644 lightllm-kernel/test/utils.py

diff --git a/lightllm-kernel/csrc/fusion/add_norm_quant.cu b/lightllm-kernel/csrc/fusion/add_norm_quant.cu
new file mode 100755
index 000000000..3684dffc8
--- /dev/null
+++ b/lightllm-kernel/csrc/fusion/add_norm_quant.cu
@@ -0,0 +1,551 @@
+#include "ops_common.h"
+#include "reduce/sm70.cuh"
+
+namespace lightllm {
+namespace ops {
+
+using namespace lightllm;
+
+/**
+ * @brief Fused residual add + RMSNorm + per-row FP8 (E4M3) quantization, scalar path.
+ *
+ * Each thread block handles one row (blockIdx.x is the row index):
+ *   1. input[row] += residual[row] (rounded to BF16 and written back in place);
+ *   2. the sum is RMS-normalized and multiplied by weight;
+ *   3. the result is quantized to FP8 with scale = max|value| / 448, stored in scales[row].
+ *
+ * Launch with TPB threads per block and N * sizeof(bf16_t) bytes of dynamic shared
+ * memory. This variant makes no assumption about N; M is not read by the kernel.
+ *
+ * @tparam TPB Threads per block; the loops stride by TPB, so it must equal blockDim.x.
+ *
+ * @note The block reductions are assumed to return the reduced value to every thread
+ *       (inv_norm and inv_scale are used by all threads) -- TODO confirm in reduce/sm70.cuh.
+ */
+template<int32_t TPB>
+__global__ void device_add_norm_quant_bf16_general(
+    bf16_t* __restrict__ input,  // Input tensor in BF16 format (overwritten with input + residual)
+    const bf16_t* __restrict__ residual, // Residual tensor in BF16 format
+    const bf16_t* __restrict__ weight, // Weight tensor in BF16 format
+    fp8_e4m3_t* __restrict__ output,   // Output tensor in FP8 format
+    fp32_t* __restrict__ scales,       // Output scale for each row
+    const int64_t M,                   // Number of rows in the input tensor
+    const int32_t N,                   // Number of cols in the input tensor
+    const fp32_t eps                   // Epsilon value for numerical stability
+) {
+    const fp32_t r_N = 1 / (fp32_t)N;       // Reciprocal of N.
+    constexpr fp32_t FP8_E4M3_MAX = 448.0f; // Maximum value representable in FP8 E4M3 format
+
+    const int32_t tid = threadIdx.x;
+    const int32_t bid = blockIdx.x;
+
+    // Each block processes one row of the input tensor. The row offset is computed in
+    // 64 bits: bid * N in int32 overflows once the tensor has 2^31 or more elements.
+    const int64_t row_offset = static_cast<int64_t>(bid) * N;
+    bf16_t* _input = input + row_offset;
+    const bf16_t* _residual = residual + row_offset;
+    fp8_e4m3_t* _output = output + row_offset;
+    fp32_t* _scales = scales + bid;
+
+    // Shared memory workspace holding the row between passes. Every pass uses the same
+    // index mapping (i = tid, tid + TPB, ...), so each thread only re-reads its own writes.
+    extern __shared__ bf16_t workspace1[];
+
+    // Pass 1: add the residual and accumulate this thread's partial sum of squares.
+    fp32_t local_square_sum = 0.0f;
+    for (int32_t i = tid; i < N; i += TPB) {
+        const fp32_t x = cvt_bf16_f32(_input[i]);
+        const fp32_t r = cvt_bf16_f32(_residual[i]);
+        // Round to BF16 before squaring so the statistics match the stored values.
+        const bf16_t sum = cvt_f32_bf16(x + r);
+        const fp32_t tmp = cvt_bf16_f32(sum);
+        local_square_sum += tmp * tmp;
+
+        _input[i] = sum;
+        workspace1[i] = sum;
+    }
+
+    const fp32_t reduced_square_sum = lightllm::reduce::sm70::sync_block_reduce_sum_f32<TPB>(local_square_sum);
+
+    // Compute the mean square and then the inverse RMS normalization factor.
+    // For RMSNorm, the normalization factor is 1/sqrt(mean(x^2)+eps).
+    const fp32_t mean_square = reduced_square_sum * r_N;
+    const fp32_t inv_norm = rsqrtf(mean_square + eps);
+
+    // Pass 2: normalize, apply the weight and track the largest magnitude.
+    fp32_t local_max = -FLT_MAX;
+    for (int32_t i = tid; i < N; i += TPB) {
+        const fp32_t x = cvt_bf16_f32(workspace1[i]);
+        const fp32_t w = cvt_bf16_f32(weight[i]);
+
+        // Round to BF16 so the maximum is taken over the values that get quantized.
+        const bf16_t normed = cvt_f32_bf16(x * inv_norm * w);
+        local_max = fmaxf(local_max, fabsf(cvt_bf16_f32(normed)));
+
+        workspace1[i] = normed;
+    }
+
+    // Reduce the maximum value across the block
+    const fp32_t reduced_max = lightllm::reduce::sm70::sync_block_reduce_max_f32<TPB>(local_max);
+
+    // Compute the scale factor with epsilon to avoid division by zero
+    constexpr fp32_t epsilon = 1e-7f;
+    const fp32_t scale = reduced_max / FP8_E4M3_MAX;
+    const fp32_t inv_scale = 1.0f / (scale + epsilon);
+
+    // Pass 3: quantize the normalized row to FP8.
+    for (int32_t i = tid; i < N; i += TPB) {
+        const fp32_t normed = cvt_bf16_f32(workspace1[i]);
+        _output[i] = fp8_e4m3_t(normed * inv_scale);
+    }
+
+    // A single thread publishes the per-row scale (note: stored without epsilon).
+    if (tid == 0) {
+        *_scales = scale;
+    }
+}
+
+
+
+/**
+ * @brief Fused residual add + RMSNorm + per-row FP8 (E4M3) quantization, vectorized path.
+ *
+ * Each thread block handles one row (blockIdx.x is the row index):
+ *   1. input[row] += residual[row] (rounded to BF16 and written back in place);
+ *   2. the sum is RMS-normalized and multiplied by weight;
+ *   3. the result is quantized to FP8 with scale = max|value| / 448, stored in scales[row].
+ *
+ * Every thread moves VPT = 8 BF16 values per vec_copy, so N must be a multiple of 8.
+ * Launch with TPB threads per block and N * sizeof(bf16_t) bytes of dynamic shared
+ * memory. M is not read by the kernel.
+ *
+ * @tparam TPB Threads per block; the loops stride by TPB * VPT, so it must equal blockDim.x.
+ */
+template<int32_t TPB>
+__global__ void device_add_norm_quant_bf16_vpt(
+    bf16_t* __restrict__ input,  // Input tensor in BF16 format (overwritten with input + residual)
+    const bf16_t* __restrict__ residual, // Residual tensor in BF16 format
+    const bf16_t* __restrict__ weight, // Weight tensor in BF16 format
+    fp8_e4m3_t* __restrict__ output,   // Output tensor in FP8 format
+    fp32_t* __restrict__ scales,       // Output scale for each row
+    const int64_t M,                   // Number of rows in the input tensor
+    const int32_t N,                   // Number of cols in the input tensor
+    const fp32_t eps                   // Epsilon value for numerical stability
+) {
+    constexpr int32_t VPT = 8;              // Number of BF16 values processed per thread.
+    const fp32_t r_N = 1 / (fp32_t)N;       // Reciprocal of N.
+    constexpr fp32_t FP8_E4M3_MAX = 448.0f; // Maximum value representable in FP8 E4M3 format
+
+    const int32_t tid = threadIdx.x;
+    const int32_t bid = blockIdx.x;
+
+    // Each block processes one row of the input tensor. The row offset is computed in
+    // 64 bits: bid * N in int32 overflows once the tensor has 2^31 or more elements.
+    const int64_t row_offset = static_cast<int64_t>(bid) * N;
+    bf16_t* _input = input + row_offset;
+    const bf16_t* _residual = residual + row_offset;
+    fp8_e4m3_t* _output = output + row_offset;
+    fp32_t* _scales = scales + bid;
+
+    // Shared memory workspace to store vectorized (bf16x2_t) data.
+    // Each bf16x2_t holds 2 BF16 values, so the row occupies N/2 entries. Aligned to 16 bytes
+    // because vec_copy moves 16-byte chunks (presumably as one vector access -- TODO confirm).
+    extern __shared__ __align__(16) bf16x2_t workspace2[];
+
+    // Local registers to hold vectorized data.
+    bf16x2_t local_input[VPT / 2];
+    bf16x2_t local_residual[VPT / 2];
+    bf16x2_t local_w[VPT / 2];
+    bf16x2_t local_output[VPT / 2];
+    fp8x4_e4m3_t local_f8[VPT / 4];
+
+    // Each thread computes a partial sum of squares.
+    fp32_t local_square_sum = 0.0f;
+    for (int32_t i = tid * VPT; i < N; i += TPB * VPT) {
+        // Load VPT BF16 elements of the input row and of the residual row into registers.
+        vec_copy<sizeof(bf16_t) * VPT>(_input + i, local_input);
+        vec_copy<sizeof(bf16_t) * VPT>(_residual + i, local_residual);
+
+        # pragma unroll
+        for (int32_t j = 0; j < VPT / 2; j++) {
+            // Convert the bf16x2_t to fp32x2_t for computation.
+            fp32x2_t x = bf16x2_to_fp32x2(local_input[j]);
+            fp32x2_t r = bf16x2_to_fp32x2(local_residual[j]);
+            // Add the residual; round to BF16 first so the statistics match the stored values.
+            local_input[j] = _float22bf162_rn(make_float2(x.x + r.x, x.y + r.y));
+            fp32x2_t tmp = bf16x2_to_fp32x2(local_input[j]);
+            local_square_sum += (tmp.x * tmp.x + tmp.y * tmp.y);
+        }
+
+        // Write the sum back to global memory and keep a copy in shared memory.
+        // Divide index by 2 because 'workspace2' is an array of bf16x2_t.
+        vec_copy<sizeof(bf16_t) * VPT>(local_input, _input + i);
+        vec_copy<sizeof(bf16_t) * VPT>(local_input, workspace2 + (i >> 1));
+    }
+
+    const fp32_t reduced_square_sum = lightllm::reduce::sm70::sync_block_reduce_sum_f32<TPB>(local_square_sum);
+
+    // Compute the mean square and then the inverse RMS normalization factor.
+    // For RMSNorm, the normalization factor is 1/sqrt(mean(x^2)+eps).
+    const fp32_t mean_square = reduced_square_sum * r_N;
+    const fp32_t inv_norm = rsqrtf(mean_square + eps);
+
+    // Normalize each element using the computed normalization factor.
+    fp32_t local_max = -FLT_MAX;
+    for (int32_t i = tid * VPT; i < N; i += TPB * VPT) {
+        // Reload this thread's chunk from shared memory and fetch the matching weights.
+        vec_copy<sizeof(bf16_t) * VPT>(workspace2 + (i >> 1), local_input);
+        vec_copy<sizeof(bf16_t) * VPT>(weight + i, local_w);
+
+        #pragma unroll
+        for (int32_t j = 0; j < VPT / 2; j++) {
+            fp32x2_t x = bf16x2_to_fp32x2(local_input[j]);
+            fp32x2_t w = bf16x2_to_fp32x2(local_w[j]);
+            // Apply normalization: multiply by inv_norm and then scale by the weight.
+            fp32x2_t ret = make_float2(x.x * inv_norm * w.x, x.y * inv_norm * w.y);
+            local_output[j] = _float22bf162_rn(ret);
+            fp32x2_t tmp = bf16x2_to_fp32x2(local_output[j]);
+            local_max = fmaxf(local_max, fmaxf(fabsf(tmp.x), fabsf(tmp.y)));
+        }
+
+        vec_copy<sizeof(bf16_t) * VPT>(local_output, workspace2 + (i >> 1));
+    }
+
+    // Reduce the maximum value across the block
+    const fp32_t reduced_max = lightllm::reduce::sm70::sync_block_reduce_max_f32<TPB>(local_max);
+
+    // Compute the scale factor with epsilon to avoid division by zero
+    constexpr fp32_t epsilon = 1e-7f;
+    const fp32_t scale = reduced_max / FP8_E4M3_MAX;
+    const fp32_t inv_scale = 1.0f / (scale + epsilon);
+
+    for (int32_t i = tid * VPT; i < N; i += TPB * VPT) {
+        vec_copy<sizeof(bf16_t) * VPT>(workspace2 + (i >> 1), local_output);
+        #pragma unroll
+        for (int32_t j = 0; j < VPT/4; j++) {
+            fp32x2_t x = bf16x2_to_fp32x2(local_output[2 * j + 0]);
+            fp32x2_t y = bf16x2_to_fp32x2(local_output[2 * j + 1]);
+            fp32x4_t ret = make_float4(
+                x.x * inv_scale, x.y * inv_scale, y.x * inv_scale, y.y * inv_scale);
+            local_f8[j] = fp8x4_e4m3_t(ret);
+        }
+
+        vec_copy<sizeof(fp8_e4m3_t) * VPT>(local_f8, _output + i);
+    }
+
+    if(tid == 0){
+        *_scales = scale;
+    }
+}
+
+
+/**
+ * @brief Fused residual add + RMSNorm + per-row FP8 (E4M3) quantization, compile-time N.
+ *
+ * Each thread block handles one row (blockIdx.x is the row index):
+ *   1. input[row] += residual[row] (rounded to BF16 and written back in place);
+ *   2. the sum is RMS-normalized and multiplied by weight;
+ *   3. the result is quantized to FP8 with scale = max|value| / 448, stored in scales[row].
+ *
+ * The row length N is a template parameter, so the shared workspace is statically sized
+ * and the loop bound is a compile-time constant. M is not read by the kernel.
+ *
+ * @tparam TPB Threads per block; the loops stride by TPB * VPT, so it must equal blockDim.x.
+ * @tparam N   Number of columns per row; must be a multiple of VPT (= 8).
+ */
+template<int32_t TPB, int32_t N>
+__global__ void device_add_norm_quant_bf16(
+    bf16_t* __restrict__ input,  // Input tensor in BF16 format (overwritten with input + residual)
+    const bf16_t* __restrict__ residual, // Residual tensor in BF16 format
+    const bf16_t* __restrict__ weight, // Weight tensor in BF16 format
+    fp8_e4m3_t* __restrict__ output,   // Output tensor in FP8 format
+    fp32_t* __restrict__ scales,       // Output scale for each row
+    const int64_t M,                   // Number of rows in the input tensor
+    const fp32_t eps                   // Epsilon value for numerical stability
+) {
+    constexpr int32_t VPT = 8;              // Number of BF16 values processed per thread.
+    constexpr fp32_t r_N = 1 / (fp32_t)N;   // Reciprocal of N.
+    constexpr fp32_t FP8_E4M3_MAX = 448.0f; // Maximum value representable in FP8 E4M3 format
+
+    static_assert(N % 2 == 0, "N must be even.");
+    static_assert(N % VPT == 0, "N must be a multiple of VPT.");
+
+    const int32_t tid = threadIdx.x;
+    const int32_t bid = blockIdx.x;
+
+    // Each block processes one row of the input tensor. The row offset is computed in
+    // 64 bits: bid * N in int32 overflows once the tensor has 2^31 or more elements.
+    const int64_t row_offset = static_cast<int64_t>(bid) * N;
+    bf16_t* _input = input + row_offset;
+    const bf16_t* _residual = residual + row_offset;
+    fp8_e4m3_t* _output = output + row_offset;
+    fp32_t* _scales = scales + bid;
+
+    // Shared memory workspace to store vectorized (bf16x2_t) data.
+    // Each bf16x2_t holds 2 BF16 values, so the row occupies N/2 entries. Aligned to 16 bytes
+    // because vec_copy moves 16-byte chunks (presumably as one vector access -- TODO confirm).
+    __shared__ __align__(16) bf16x2_t workspace[N / 2];
+
+    // Local registers to hold vectorized data.
+    bf16x2_t local_input[VPT / 2];
+    bf16x2_t local_residual[VPT / 2];
+    bf16x2_t local_w[VPT / 2];
+    bf16x2_t local_output[VPT / 2];
+    fp8x4_e4m3_t local_f8[VPT / 4];
+
+    // Each thread computes a partial sum of squares.
+    fp32_t local_square_sum = 0.0f;
+    # pragma unroll
+    for (int32_t i = tid * VPT; i < N; i += TPB * VPT) {
+        // Load VPT BF16 elements of the input row and of the residual row into registers.
+        vec_copy<sizeof(bf16_t) * VPT>(_input + i, local_input);
+        vec_copy<sizeof(bf16_t) * VPT>(_residual + i, local_residual);
+
+        # pragma unroll
+        for (int32_t j = 0; j < VPT / 2; j++) {
+            // Convert the bf16x2_t to fp32x2_t for computation.
+            fp32x2_t x = bf16x2_to_fp32x2(local_input[j]);
+            fp32x2_t r = bf16x2_to_fp32x2(local_residual[j]);
+            // Add the residual; round to BF16 first so the statistics match the stored values.
+            local_input[j] = _float22bf162_rn(make_float2(x.x + r.x, x.y + r.y));
+            fp32x2_t tmp = bf16x2_to_fp32x2(local_input[j]);
+            local_square_sum += (tmp.x * tmp.x + tmp.y * tmp.y);
+        }
+
+        // Write the sum back to global memory and keep a copy in shared memory.
+        // Divide index by 2 because 'workspace' is an array of bf16x2_t.
+        vec_copy<sizeof(bf16_t) * VPT>(local_input, _input + i);
+        vec_copy<sizeof(bf16_t) * VPT>(local_input, workspace + (i >> 1));
+    }
+
+    const fp32_t reduced_square_sum = lightllm::reduce::sm70::sync_block_reduce_sum_f32<TPB>(local_square_sum);
+
+    // Compute the mean square and then the inverse RMS normalization factor.
+    // For RMSNorm, the normalization factor is 1/sqrt(mean(x^2)+eps).
+    const fp32_t mean_square = reduced_square_sum * r_N;
+    const fp32_t inv_norm = rsqrtf(mean_square + eps);
+
+    // Normalize each element using the computed normalization factor.
+    fp32_t local_max = -FLT_MAX;
+    #pragma unroll
+    for (int32_t i = tid * VPT; i < N; i += TPB * VPT) {
+        // Reload this thread's chunk from shared memory and fetch the matching weights.
+        vec_copy<sizeof(bf16_t) * VPT>(workspace + (i >> 1), local_input);
+        vec_copy<sizeof(bf16_t) * VPT>(weight + i, local_w);
+
+        #pragma unroll
+        for (int32_t j = 0; j < VPT / 2; j++) {
+            fp32x2_t x = bf16x2_to_fp32x2(local_input[j]);
+            fp32x2_t w = bf16x2_to_fp32x2(local_w[j]);
+            // Apply normalization: multiply by inv_norm and then scale by the weight.
+            fp32x2_t ret = make_float2(x.x * inv_norm * w.x, x.y * inv_norm * w.y);
+            local_output[j] = _float22bf162_rn(ret);
+            fp32x2_t tmp = bf16x2_to_fp32x2(local_output[j]);
+            local_max = fmaxf(local_max, fmaxf(fabsf(tmp.x), fabsf(tmp.y)));
+        }
+
+        vec_copy<sizeof(bf16_t) * VPT>(local_output, workspace + (i >> 1));
+    }
+
+    // Reduce the maximum value across the block
+    const fp32_t reduced_max = lightllm::reduce::sm70::sync_block_reduce_max_f32<TPB>(local_max);
+
+    // Compute the scale factor with epsilon to avoid division by zero
+    constexpr fp32_t epsilon = 1e-7f;
+    const fp32_t scale = reduced_max / FP8_E4M3_MAX;
+    const fp32_t inv_scale = 1.0f / (scale + epsilon);
+
+    #pragma unroll
+    for (int32_t i = tid * VPT; i < N; i += TPB * VPT) {
+        vec_copy<sizeof(bf16_t) * VPT>(workspace + (i >> 1), local_output);
+        #pragma unroll
+        for (int32_t j = 0; j < VPT/4; j++) {
+            fp32x2_t x = bf16x2_to_fp32x2(local_output[2 * j + 0]);
+            fp32x2_t y = bf16x2_to_fp32x2(local_output[2 * j + 1]);
+            fp32x4_t ret = make_float4(
+                x.x * inv_scale, x.y * inv_scale, y.x * inv_scale, y.y * inv_scale);
+            local_f8[j] = fp8x4_e4m3_t(ret);
+        }
+
+        vec_copy<sizeof(fp8_e4m3_t) * VPT>(local_f8, _output + i);
+    }
+
+    if(tid == 0){
+        *_scales = scale;
+    }
+}
+
+/**
+ * @brief Fused residual add + RMSNorm + per-row FP8 (E4M3) quantization for BF16 tensors.
+ *
+ * Adds R to X in place, applies RMSNorm with weight W to the sum and quantizes every
+ * row of the result to FP8 with one FP32 scale per row.
+ *
+ * @param X   [M, N] BF16 CUDA tensor; overwritten with X + R.
+ * @param R   [M, N] BF16 CUDA residual tensor.
+ * @param W   [N] BF16 CUDA RMSNorm weight.
+ * @param eps Value added to the mean square before the reciprocal square root.
+ * @return {output_q, scales}: the [M, N] FP8 E4M3 tensor and the [M, 1] FP32 row scales.
+ */
+std::tuple<Tensor, Tensor> add_norm_quant_bf16_fp8(
+    Tensor& X, const Tensor &R, const Tensor &W,
+    const fp32_t eps
+) {
+    TORCH_CHECK(X.ndimension() == 2, "Input tensor X must be 2D");
+    TORCH_CHECK(R.ndimension() == 2, "Input tensor R must be 2D");
+    TORCH_CHECK(W.ndimension() == 1, "Input tensor W must be 1D");
+
+    TORCH_CHECK(X.is_cuda(), "Input tensor X must be a CUDA tensor.");
+    TORCH_CHECK(R.is_cuda(), "Input tensor R must be a CUDA tensor.");
+    TORCH_CHECK(W.is_cuda(), "Input tensor W must be a CUDA tensor.");
+
+    TORCH_CHECK(X.scalar_type() == c10::ScalarType::BFloat16, "Input tensor X must be BF16.");
+    TORCH_CHECK(R.scalar_type() == c10::ScalarType::BFloat16, "Input tensor R must be BF16.");
+    TORCH_CHECK(W.scalar_type() == c10::ScalarType::BFloat16, "Input tensor W must be BF16.");
+
+    // The kernels index R and W with X's extents; mismatched shapes would read out of bounds.
+    TORCH_CHECK(R.sizes() == X.sizes(), "Input tensor R must have the same shape as X.");
+    TORCH_CHECK(W.size(0) == X.size(1), "Input tensor W must have X.size(1) elements.");
+    // The grid size and the kernels' column count are 32-bit values.
+    constexpr int64_t kInt32Max = 2147483647;
+    TORCH_CHECK(X.size(0) <= kInt32Max && X.size(1) <= kInt32Max, "Input tensor X is too large.");
+
+    Tensor contiguous_X = X.is_contiguous() ? X : X.contiguous();
+    Tensor contiguous_R = R.is_contiguous() ? R : R.contiguous();
+    Tensor contiguous_W = W.is_contiguous() ? W : W.contiguous();
+
+    const int64_t M = contiguous_X.size(0);
+    const int64_t N = contiguous_X.size(1);
+
+    Tensor output_q = torch::empty(
+        {M, N},
+        torch::TensorOptions()
+            .dtype(torch::kFloat8_e4m3fn)
+            .device(contiguous_X.device())
+    );
+    Tensor scales = torch::empty(
+        {M, 1},
+        torch::TensorOptions()
+            .dtype(torch::kFloat32)
+            .device(contiguous_X.device())
+    );
+
+    // No rows: nothing to compute, and a zero-block grid is an invalid launch configuration.
+    if (M == 0) {
+        return {output_q, scales};
+    }
+
+    const int32_t blocks = static_cast<int32_t>(M);
+    const int32_t cols = static_cast<int32_t>(N);
+    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+    auto x_ptr = PTR<bf16_t>(contiguous_X);
+    auto r_ptr = PTR<bf16_t>(contiguous_R);
+    auto w_ptr = PTR<bf16_t>(contiguous_W);
+    auto q_ptr = PTR<fp8_e4m3_t>(output_q);
+    auto s_ptr = PTR<fp32_t>(scales);
+
+    // Listed row lengths use kernels specialized on N; everything else takes the runtime-N path.
+    switch (N) {
+        case 16:
+            device_add_norm_quant_bf16<128, 16>
+            <<<blocks, 128, 0, stream>>>(
+                x_ptr, r_ptr, w_ptr,
+                q_ptr, s_ptr,
+                M, eps
+            );
+            break;
+        case 32:
+            device_add_norm_quant_bf16<128, 32>
+            <<<blocks, 128, 0, stream>>>(
+                x_ptr, r_ptr, w_ptr,
+                q_ptr, s_ptr,
+                M, eps
+            );
+            break;
+        case 64:
+            device_add_norm_quant_bf16<128, 64>
+            <<<blocks, 128, 0, stream>>>(
+                x_ptr, r_ptr, w_ptr,
+                q_ptr, s_ptr,
+                M, eps
+            );
+            break;
+        case 512:
+            device_add_norm_quant_bf16<128, 512>
+            <<<blocks, 128, 0, stream>>>(
+                x_ptr, r_ptr, w_ptr,
+                q_ptr, s_ptr,
+                M, eps
+            );
+            break;
+        case 1024:
+            device_add_norm_quant_bf16<128, 1024>
+            <<<blocks, 128, 0, stream>>>(
+                x_ptr, r_ptr, w_ptr,
+                q_ptr, s_ptr,
+                M, eps
+            );
+            break;
+        case 3200:
+            device_add_norm_quant_bf16<128, 3200>
+            <<<blocks, 128, 0, stream>>>(
+                x_ptr, r_ptr, w_ptr,
+                q_ptr, s_ptr,
+                M, eps
+            );
+            break;
+        case 4096:
+            device_add_norm_quant_bf16<128, 4096>
+            <<<blocks, 128, 0, stream>>>(
+                x_ptr, r_ptr, w_ptr,
+                q_ptr, s_ptr,
+                M, eps
+            );
+            break;
+        case 12800:
+            device_add_norm_quant_bf16<256, 12800>
+            <<<blocks, 256, 0, stream>>>(
+                x_ptr, r_ptr, w_ptr,
+                q_ptr, s_ptr,
+                M, eps
+            );
+            break;
+        default: {
+            static constexpr int32_t TPB = 128;
+            // The runtime-N kernels keep one BF16 row in dynamic shared memory.
+            const int64_t shared_mem_size = N * sizeof(bf16_t);
+            if (N % 8 == 0) {
+                device_add_norm_quant_bf16_vpt<TPB>
+                <<<blocks, TPB, shared_mem_size, stream>>>(
+                    x_ptr, r_ptr, w_ptr,
+                    q_ptr, s_ptr,
+                    M, cols, eps
+                );
+            } else {
+                device_add_norm_quant_bf16_general<TPB>
+                <<<blocks, TPB, shared_mem_size, stream>>>(
+                    x_ptr, r_ptr, w_ptr,
+                    q_ptr, s_ptr,
+                    M, cols, eps
+                );
+            }
+        }
+    }
+
+    // Surface launch failures (e.g. a row too large for the available shared memory)
+    // instead of silently returning uninitialized outputs.
+    const cudaError_t launch_status = cudaGetLastError();
+    TORCH_CHECK(launch_status == cudaSuccess,
+                "add_norm_quant_bf16_fp8 kernel launch failed: ", cudaGetErrorString(launch_status));
+
+    // The kernels update their input in place. If X was not contiguous they ran on a
+    // temporary copy, so propagate the X + R result back to the caller's tensor.
+    if (!X.is_contiguous()) {
+        X.copy_(contiguous_X);
+    }
+
+    return {output_q, scales};
+}
+
+} // namespace ops
+} // namespace lightllm
\ No newline at end of file
diff --git a/lightllm-kernel/csrc/fusion/gelu_per_token_quant.cu b/lightllm-kernel/csrc/fusion/gelu_per_token_quant.cu
new file mode 100755
index 000000000..b204e9737
--- /dev/null
+++ b/lightllm-kernel/csrc/fusion/gelu_per_token_quant.cu
@@ -0,0 +1,367 @@
+#include "ops_common.h"
+#include "reduce/sm70.cuh"
+
+
+namespace lightllm {
+namespace ops {
+
+using namespace lightllm;
+
+/**
+ * @brief Fused GELU + per-token FP8 (E4M3) quantization; one block handles one BF16 row.
+ *
+ * GELU is evaluated as 0.5 * x * (1 + erf(x / sqrt(2))): erf runs in FP32, the
+ * remaining arithmetic in packed BF16. The activated row is staged in shared
+ * memory, its abs-max is reduced across the block, and the row is then
+ * quantized with scale = absmax / 448.
+ * NOTE(review): the runtime-N kernels in this file use the tanh approximation
+ * instead, so results can differ slightly between code paths - confirm intent.
+ *
+ * @tparam TPB  Threads per block (must match the launch configuration).
+ * @tparam N    Row width in elements; must be a multiple of 8.
+ */
+template<int32_t TPB, int32_t N>
+__global__ void device_gelu_per_token_quant_bf16_to_fp8(
+    const bf16_t* __restrict__ input,  // [M, N] input tensor in BF16 format
+    fp8_e4m3_t* __restrict__ output,   // [M, N] output tensor in FP8 format
+    fp32_t* __restrict__ scales,       // [M] output scale for each row
+    const int64_t M                    // Number of rows (not read by the kernel body)
+) {
+    constexpr int32_t VPT = 8;               // Elements handled per thread per iteration
+    constexpr fp32_t FP8_E4M3_MAX = 448.0f;  // Maximum value representable in FP8 E4M3 format
+    constexpr fp32_t epsilon = 1e-7f;        // Avoids division by zero for an all-zero row
+
+    static_assert(N % VPT == 0, "N must be a multiple of VPT.");
+
+    const int32_t bid = blockIdx.x;
+    const int32_t tid = threadIdx.x;
+    const bf16x2_t one      = _float22bf162_rn(make_float2(1.0f, 1.0f));
+    const bf16x2_t one_half = _float22bf162_rn(make_float2(0.5f, 0.5f));
+
+    // 64-bit row offset: bid * N overflows int32 once M * N reaches 2^31.
+    const int64_t row_offset = static_cast<int64_t>(bid) * N;
+    const bf16_t* _input = input + row_offset;
+    fp8_e4m3_t* _output  = output + row_offset;
+
+    fp8x4_e4m3_t local_f8[VPT / 4];
+    bf16x2_t local_bf16[VPT / 2];
+
+    // Activated row, kept so the quantization pass does not recompute GELU.
+    __shared__ bf16x2_t workspace[N / 2];
+
+    // Pass 1: GELU, stash the result, track this thread's abs-max.
+    fp32_t local_max = -FLT_MAX;
+    for (int32_t i = tid * VPT; i < N; i += TPB * VPT) {
+        vec_copy<sizeof(bf16_t) * VPT>(_input + i, local_bf16);
+
+        #pragma unroll
+        for (int32_t j = 0; j < VPT / 2; j++) {
+            fp32x2_t tmp = bf16x2_to_fp32x2(local_bf16[j]);
+            tmp.x = erf(tmp.x * 0.7071067811f);  // 0.7071067811 = 1 / sqrt(2)
+            tmp.y = erf(tmp.y * 0.7071067811f);
+            bf16x2_t act = _float22bf162_rn(tmp);
+            act = __hadd2(act, one);
+            act = __hmul2(act, local_bf16[j]);
+            act = __hmul2(act, one_half);
+            local_bf16[j] = act;
+
+            const fp32x2_t a = bf16x2_to_fp32x2(act);
+            local_max = fmaxf(local_max, fmaxf(fabsf(a.x), fabsf(a.y)));
+        }
+        vec_copy<sizeof(bf16_t) * VPT>(local_bf16, workspace + (i >> 1));
+    }
+
+    // Block-wide abs-max of the activated row, then the per-row scale.
+    const fp32_t reduced_max = lightllm::reduce::sm70::sync_block_reduce_max_f32<TPB>(local_max);
+    const fp32_t scale = reduced_max / FP8_E4M3_MAX;
+    const fp32_t inv_scale = 1.0f / (scale + epsilon);
+
+    // Pass 2: each thread re-reads exactly the slots it wrote and quantizes them.
+    for (int32_t i = tid * VPT; i < N; i += TPB * VPT) {
+        vec_copy<sizeof(bf16_t) * VPT>(workspace + (i >> 1), local_bf16);
+
+        #pragma unroll
+        for (int32_t j = 0; j < VPT / 4; j++) {
+            const fp32x2_t lo = bf16x2_to_fp32x2(local_bf16[2 * j + 0]);
+            const fp32x2_t hi = bf16x2_to_fp32x2(local_bf16[2 * j + 1]);
+            local_f8[j] = fp8x4_e4m3_t(make_float4(
+                lo.x * inv_scale, lo.y * inv_scale, hi.x * inv_scale, hi.y * inv_scale));
+        }
+        vec_copy<sizeof(fp8_e4m3_t) * VPT>(local_f8, _output + i);
+    }
+
+    if (tid == 0) {
+        scales[bid] = scale;
+    }
+}
+
+
+/**
+ * @brief Fused tanh-approximated GELU + per-token FP8 (E4M3) quantization for a
+ *        runtime row width N that is a multiple of 8; one block handles one row.
+ *
+ * Requires N * sizeof(bf16_t) bytes of dynamic shared memory, used to stage the
+ * activated row between the abs-max pass and the quantization pass.
+ *
+ * @tparam TPB    Threads per block (must match the launch configuration).
+ *
+ * @param input   [M, N] input tensor in BF16 format.
+ * @param output  [M, N] output tensor in FP8 format.
+ * @param scales  [M] output scale for each row.
+ * @param M       Number of rows (not read by the kernel body).
+ * @param N       Number of elements in one row.
+ */
+template<int32_t TPB>
+__global__ void gelu_per_token_quant_bf16_to_fp8_vpt(
+    const bf16_t* __restrict__ input,
+    fp8_e4m3_t* __restrict__ output,
+    fp32_t* __restrict__ scales,
+    const int64_t M,
+    const int32_t N
+) {
+    constexpr int32_t VPT = 8;                              // Elements per thread per iteration
+    constexpr fp32_t FP8_E4M3_MAX = 448.0f;                 // Maximum value representable in FP8 E4M3 format
+    constexpr fp32_t sqrt_2_over_pi = 0.7978845608028654f;
+    constexpr fp32_t coeff = 0.044715f;
+    constexpr fp32_t epsilon = 1e-7f;                       // Avoids division by zero for an all-zero row
+
+    const int32_t bid = blockIdx.x;
+    const int32_t tid = threadIdx.x;
+
+    // 64-bit row offset: bid * N overflows int32 once M * N reaches 2^31.
+    const int64_t row_offset = static_cast<int64_t>(bid) * N;
+    const bf16_t* _input = input + row_offset;
+    fp8_e4m3_t* _output  = output + row_offset;
+
+    fp8x4_e4m3_t local_f8[VPT / 4];
+    bf16x2_t local_bf16[VPT / 2];
+
+    extern __shared__ bf16x2_t workspace[];
+
+    // Pass 1: GELU, stash the result, track this thread's abs-max.
+    fp32_t local_max = -FLT_MAX;
+    for (int32_t i = tid * VPT; i < N; i += TPB * VPT) {
+        vec_copy<sizeof(bf16_t) * VPT>(_input + i, local_bf16);
+
+        #pragma unroll
+        for (int32_t j = 0; j < VPT / 2; j++) {
+            fp32x2_t v = bf16x2_to_fp32x2(local_bf16[j]);
+            const fp32_t arg_x = sqrt_2_over_pi * (v.x + coeff * v.x * v.x * v.x);
+            const fp32_t arg_y = sqrt_2_over_pi * (v.y + coeff * v.y * v.y * v.y);
+            v.x = 0.5f * v.x * (1.0f + tanhf(arg_x));
+            v.y = 0.5f * v.y * (1.0f + tanhf(arg_y));
+
+            // Round to BF16 first so the max is taken over the values actually stored.
+            local_bf16[j] = _float22bf162_rn(v);
+            const fp32x2_t r = bf16x2_to_fp32x2(local_bf16[j]);
+            local_max = fmaxf(local_max, fmaxf(fabsf(r.x), fabsf(r.y)));
+        }
+        vec_copy<sizeof(bf16_t) * VPT>(local_bf16, workspace + (i >> 1));
+    }
+
+    // Block-wide abs-max of the activated row, then the per-row scale.
+    const fp32_t reduced_max = lightllm::reduce::sm70::sync_block_reduce_max_f32<TPB>(local_max);
+    const fp32_t scale = reduced_max / FP8_E4M3_MAX;
+    const fp32_t inv_scale = 1.0f / (scale + epsilon);
+
+    // Pass 2: each thread re-reads exactly the slots it wrote and quantizes them.
+    for (int32_t i = tid * VPT; i < N; i += TPB * VPT) {
+        vec_copy<sizeof(bf16_t) * VPT>(workspace + (i >> 1), local_bf16);
+
+        #pragma unroll
+        for (int32_t j = 0; j < VPT / 4; j++) {
+            const fp32x2_t lo = bf16x2_to_fp32x2(local_bf16[2 * j + 0]);
+            const fp32x2_t hi = bf16x2_to_fp32x2(local_bf16[2 * j + 1]);
+            local_f8[j] = fp8x4_e4m3_t(make_float4(
+                lo.x * inv_scale, lo.y * inv_scale, hi.x * inv_scale, hi.y * inv_scale));
+        }
+        vec_copy<sizeof(fp8_e4m3_t) * VPT>(local_f8, _output + i);
+    }
+
+    if (tid == 0) {
+        scales[bid] = scale;
+    }
+}
+
+
+/**
+ * @brief Scalar fallback: tanh-approximated GELU + per-token FP8 (E4M3) quantization
+ *        for any runtime row width N; one block handles one row.
+ *        Requires N * sizeof(bf16_t) bytes of dynamic shared memory.
+ *
+ * @tparam TPB  Threads per block (must match the launch configuration).
+ */
+template<int32_t TPB>
+__global__ void gelu_per_token_quant_bf16_to_fp8_general(
+    const bf16_t* __restrict__ input,  // [M, N] input tensor in BF16 format
+    fp8_e4m3_t* __restrict__ output,   // [M, N] output tensor in FP8 format
+    fp32_t* __restrict__ scales,       // [M] output scale for each row
+    const int64_t M,                   // Number of rows (not read by the kernel body)
+    const int32_t N                    // Number of elements in one row
+) {
+    constexpr fp32_t FP8_E4M3_MAX = 448.0f; // Maximum value representable in FP8 E4M3 format
+    constexpr fp32_t sqrt_2_over_pi = 0.7978845608028654f;
+    constexpr fp32_t coeff = 0.044715f;
+    constexpr fp32_t epsilon = 1e-7f;       // Avoids division by zero for an all-zero row
+
+    const int32_t bid = blockIdx.x;
+    const int32_t tid = threadIdx.x;
+    // 64-bit row offset: bid * N overflows int32 once M * N reaches 2^31.
+    const int64_t row_offset = static_cast<int64_t>(bid) * N;
+    const bf16_t* _input = input + row_offset;
+    fp8_e4m3_t* _output  = output + row_offset;
+
+    extern __shared__ bf16_t workspace_[];
+    // Pass 1: GELU one element at a time; the max is taken before BF16 rounding.
+    fp32_t local_max = -FLT_MAX;
+    for (int32_t i = tid; i < N; i += TPB) {
+        const fp32_t x = cvt_bf16_f32(_input[i]);
+        const fp32_t tanh_arg = sqrt_2_over_pi * (x + coeff * x * x * x);
+        const fp32_t act = 0.5f * x * (1.0f + tanhf(tanh_arg));
+        local_max = fmaxf(local_max, fabsf(act));
+        workspace_[i] = cvt_f32_bf16(act);
+    }
+
+    // Block-wide abs-max of the activated row, then the per-row scale.
+    const fp32_t reduced_max = lightllm::reduce::sm70::sync_block_reduce_max_f32<TPB>(local_max);
+    const fp32_t scale = reduced_max / FP8_E4M3_MAX;
+    const fp32_t inv_scale = 1.0f / (scale + epsilon);
+
+    // Pass 2: each thread re-reads the activations it stored and quantizes them.
+    for (int32_t i = tid; i < N; i += TPB) {
+        _output[i] = fp8_e4m3_t(cvt_bf16_f32(workspace_[i]) * inv_scale);
+    }
+
+    if (tid == 0) {
+        scales[bid] = scale;
+    }
+}
+
+/**
+ * @brief Host launcher: GELU followed by per-token (per-row) FP8 E4M3 quantization.
+ *
+ * Row widths listed in the switch use a kernel specialized at compile time; any
+ * other width falls back to a runtime-N kernel that stages the row in dynamic
+ * shared memory.
+ *
+ * @param output  Destination for the quantized values. It is handed to the kernel
+ *                as a flat pointer, so it must be contiguous.
+ * @param input   2-dimensional BF16 CUDA tensor [M, N].
+ * @param scales  Receives one FP32 scale per row.
+ */
+void gelu_per_token_quant_bf16_fp8 (
+    Tensor& output,
+    const Tensor& input,
+    Tensor& scales
+) {
+    TORCH_CHECK(input.is_cuda(), "Input must be a CUDA tensor");
+    TORCH_CHECK(input.dim() == 2, "Input must be 2-dimensional");
+    TORCH_CHECK(input.scalar_type() == c10::kBFloat16, "Input must be BF16 type");
+    TORCH_CHECK(output.is_contiguous(), "Output must be contiguous");
+
+    Tensor contiguous_input = input.is_contiguous() ? input : input.contiguous();
+    Tensor contiguous_scales = scales.is_contiguous() ? scales : scales.contiguous();
+
+    const int64_t M = input.size(0);
+    const int64_t N = input.size(1);
+    if (M == 0) {
+        return;  // Nothing to quantize; a zero-block launch is an invalid configuration.
+    }
+
+    const int32_t blocks = M;
+
+    switch (N) {
+        case 16:
+            device_gelu_per_token_quant_bf16_to_fp8<64, 16>
+            <<<blocks, 64, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(contiguous_input), PTR<fp8_e4m3_t>(output),
+                PTR<fp32_t>(contiguous_scales), M
+            );
+            break;
+        case 32:
+            device_gelu_per_token_quant_bf16_to_fp8<64, 32>
+            <<<blocks, 64, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(contiguous_input), PTR<fp8_e4m3_t>(output),
+                PTR<fp32_t>(contiguous_scales), M
+            );
+            break;
+        case 64:
+            device_gelu_per_token_quant_bf16_to_fp8<64, 64>
+            <<<blocks, 64, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(contiguous_input), PTR<fp8_e4m3_t>(output),
+                PTR<fp32_t>(contiguous_scales), M
+            );
+            break;
+        case 512:
+            device_gelu_per_token_quant_bf16_to_fp8<64, 512>
+            <<<blocks, 64, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(contiguous_input), PTR<fp8_e4m3_t>(output),
+                PTR<fp32_t>(contiguous_scales), M
+            );
+            break;
+        case 1024:
+            device_gelu_per_token_quant_bf16_to_fp8<128, 1024>
+            <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(contiguous_input), PTR<fp8_e4m3_t>(output),
+                PTR<fp32_t>(contiguous_scales), M
+            );
+            break;
+        case 2048:
+            device_gelu_per_token_quant_bf16_to_fp8<128, 2048>
+            <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(contiguous_input), PTR<fp8_e4m3_t>(output),
+                PTR<fp32_t>(contiguous_scales), M
+            );
+            break;
+        case 3200:
+            device_gelu_per_token_quant_bf16_to_fp8<128, 3200>
+            <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(contiguous_input), PTR<fp8_e4m3_t>(output),
+                PTR<fp32_t>(contiguous_scales), M
+            );
+            break;
+        case 4096:
+            device_gelu_per_token_quant_bf16_to_fp8<256, 4096>
+            <<<blocks, 256, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(contiguous_input), PTR<fp8_e4m3_t>(output),
+                PTR<fp32_t>(contiguous_scales), M
+            );
+            break;
+        case 12800:
+            device_gelu_per_token_quant_bf16_to_fp8<256, 12800>
+            <<<blocks, 256, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(contiguous_input), PTR<fp8_e4m3_t>(output),
+                PTR<fp32_t>(contiguous_scales), M
+            );
+            break;
+        default: {
+            static constexpr int32_t TPB = 128;
+            // One bf16 slot per element. N / 2 * sizeof(bf16x2_t) is two bytes short
+            // for odd N, and only the scalar kernel below is used for odd N.
+            const int32_t sharedmem = N * sizeof(bf16_t);
+            // NOTE(review): no launch-error check follows; a request above the default
+            // 48 KB dynamic shared memory limit would fail without a diagnostic here.
+            if (N % 8 == 0) {
+                gelu_per_token_quant_bf16_to_fp8_vpt<TPB>
+                <<<blocks, TPB, sharedmem, at::cuda::getCurrentCUDAStream()>>>(
+                    PTR<bf16_t>(contiguous_input), PTR<fp8_e4m3_t>(output),
+                    PTR<fp32_t>(contiguous_scales), M, N
+                );
+            } else {
+                gelu_per_token_quant_bf16_to_fp8_general<TPB>
+                <<<blocks, TPB, sharedmem, at::cuda::getCurrentCUDAStream()>>>(
+                    PTR<bf16_t>(contiguous_input), PTR<fp8_e4m3_t>(output),
+                    PTR<fp32_t>(contiguous_scales), M, N
+                );
+            }
+        }
+    }
+
+    // A non-contiguous `scales` was written through a temporary copy; propagate it back.
+    if (!scales.is_contiguous()) {
+        scales.copy_(contiguous_scales);
+    }
+}
+
+} // namespace ops
+} // namespace lightllm
\ No newline at end of file
diff --git a/lightllm-kernel/csrc/fusion/post_tp_norm.cu b/lightllm-kernel/csrc/fusion/post_tp_norm.cu
new file mode 100755
index 000000000..89f711405
--- /dev/null
+++ b/lightllm-kernel/csrc/fusion/post_tp_norm.cu
@@ -0,0 +1,364 @@
+#include "ops_common.h"
+#include "reduce/sm70.cuh"
+
+namespace lightllm {
+namespace ops {
+
+using namespace lightllm;
+
+/**
+ * @brief Scalar fallback kernel for the post-TP RMSNorm apply step on a BF16 tensor.
+ *
+ * Each block processes one row of the input tensor and writes
+ * Y = X * rsqrt(V[row] / embed_dim + eps) * W. The per-row sum of squares V is
+ * an input; it is not computed here.
+ * NOTE(review): X and Y are declared __restrict__, so in-place use (Y == X) is
+ * not supported; the host wrapper in this file always allocates a fresh Y.
+ *
+ * @tparam TPB       Threads per block.
+ *
+ * @param X          Pointer to the input tensor in global memory. [M, N]
+ * @param W          Pointer to the weight tensor in global memory. [N]
+ * @param V          Pointer to the per-row sum of squares in global memory. [M]
+ * @param Y          Pointer to the output tensor in global memory. [M, N]
+ * @param M          Number of rows in the tensor (not read by the kernel body).
+ * @param N          Number of BF16 elements in one row.
+ * @param embed_dim  Divisor that turns V into a mean square; per the parameter
+ *                   note below it differs from N when running on multiple GPUs.
+ * @param eps        Epsilon for numerical stability.
+ */
+template<int32_t TPB>
+__global__
+void  device_post_tp_norm_bf16_general(
+    bf16_t* __restrict__ X,           // [M, N] Input tensor pointer.
+    const bf16_t* __restrict__ W,     // [N] Weight tensor pointer.
+    const fp32_t* __restrict__ V,     // [M] Per-row sum of squares.
+    bf16_t* __restrict__ Y,           // [M, N] Output tensor pointer.
+    const int32_t M,                  // Number of rows.
+    const int32_t N,                  // Row width in elements.
+    const int32_t embed_dim,          // if multiGPUs, embed_dim differs from N
+    const fp32_t eps                  // Epsilon for numerical stability.
+) {
+    const fp32_t r_embed = 1 / (fp32_t)embed_dim;   // Reciprocal of embed_dim.
+
+    const int32_t tid = threadIdx.x;
+    const int32_t bid = blockIdx.x;
+
+    // Each block processes one row. The offset is formed in 64 bits because
+    // bid * N overflows int32 once M * N reaches 2^31.
+    const int64_t row_offset = static_cast<int64_t>(bid) * N;
+    const bf16_t* row_in = X + row_offset;
+    bf16_t* row_out = Y + row_offset;
+
+    const fp32_t reduced_square_sum = V[bid];
+
+    // Compute the mean square and then the inverse RMS normalization factor.
+    // For RMSNorm, the normalization factor is 1/sqrt(mean(x^2)+eps).
+    const fp32_t mean_square = reduced_square_sum * r_embed;
+    const fp32_t inv_norm = rsqrtf(mean_square + eps);
+
+    // Threads stride across the row, one element at a time.
+    for (int32_t i = tid; i < N; i += TPB) {
+        const fp32_t x = cvt_bf16_f32(row_in[i]);
+        const fp32_t w = cvt_bf16_f32(W[i]);
+
+        // Normalize, scale by the weight, and round back to BF16.
+        const fp32_t ret = x * inv_norm * w;
+        row_out[i] = cvt_f32_bf16(ret);
+    }
+}
+
+
+/**
+ * @brief Vectorized kernel for the post-TP RMSNorm apply step on a BF16 tensor.
+ *
+ * Each block processes one row of the input tensor. Data is moved in
+ * 8-element vectors held as bf16x2 pairs; the inverse RMS factor is derived
+ * from the precomputed per-row sum of squares V, and every element is
+ * multiplied by that factor and by its weight.
+ *
+ * @tparam TPB       Threads per block.
+ *
+ * @param X          Pointer to the input tensor in global memory. [M, N]
+ * @param W          Pointer to the weight tensor in global memory. [N]
+ * @param V          Pointer to the per-row sum of squares in global memory. [M]
+ * @param Y          Pointer to the output tensor in global memory. [M, N]
+ * @param M          Number of rows in the tensor (not read by the kernel body).
+ * @param N          Number of BF16 elements in one row (must be a multiple of VPT).
+ * @param embed_dim  Divisor that turns V into a mean square.
+ * @param eps        Epsilon for numerical stability.
+ */
+template<int32_t TPB>
+__global__
+void  device_post_tp_norm_bf16_vpt(
+    bf16_t* __restrict__ X,           // [M, N] Input tensor pointer.
+    const bf16_t* __restrict__ W,     // [N] Weight tensor pointer.
+    const fp32_t* __restrict__ V,     // [M] Per-row sum of squares.
+    bf16_t* __restrict__ Y,           // [M, N] Output tensor pointer.
+    const int32_t M,                  // Number of rows.
+    const int32_t N,                  // Row width in elements.
+    const int32_t embed_dim,          // if multiGPUs, embed_dim differs from N
+    const fp32_t eps                  // Epsilon for numerical stability.
+) {
+    constexpr int32_t VPT = 8;                // Number of bf16 values processed per thread.
+    const fp32_t r_embed = 1 / (fp32_t)embed_dim;   // Reciprocal of embed_dim.
+
+    const int32_t tid = threadIdx.x;
+    const int32_t bid = blockIdx.x;
+
+    // Each block processes one row. The offset is formed in 64 bits because
+    // bid * N overflows int32 once M * N reaches 2^31.
+    const int64_t row_offset = static_cast<int64_t>(bid) * N;
+    const bf16_t* row_in = X + row_offset;
+    bf16_t* row_out = Y + row_offset;
+
+    // Local registers to hold vectorized data.
+    bf16x2_t local_x[VPT / 2];
+    bf16x2_t local_w[VPT / 2];
+    bf16x2_t local_y[VPT / 2];
+
+    const fp32_t reduced_square_sum = V[bid];
+
+    // For RMSNorm, the normalization factor is 1/sqrt(mean(x^2)+eps).
+    const fp32_t mean_square = reduced_square_sum * r_embed;
+    const fp32_t inv_norm = rsqrtf(mean_square + eps);
+
+    // Normalize each element using the computed normalization factor.
+    for (int32_t i = tid * VPT; i < N; i += TPB * VPT) {
+        // Load VPT input values from global memory.
+        vec_copy<sizeof(bf16_t) * VPT>(row_in + i, local_x);
+        // Load the corresponding weight values from global memory.
+        vec_copy<sizeof(bf16_t) * VPT>(W + i, local_w);
+
+        #pragma unroll
+        for (int32_t j = 0; j < VPT / 2; j++) {
+            const fp32x2_t x = bf16x2_to_fp32x2(local_x[j]);
+            const fp32x2_t w = bf16x2_to_fp32x2(local_w[j]);
+            local_y[j] = _float22bf162_rn(make_float2(
+                x.x * inv_norm * w.x,
+                x.y * inv_norm * w.y
+            ));
+        }
+        // Write the normalized vectorized data back to global memory.
+        vec_copy<sizeof(bf16_t) * VPT>(local_y, row_out + i);
+    }
+}
+
+/**
+ * @brief Compile-time-N kernel for the post-TP RMSNorm apply step on a BF16 tensor.
+ *
+ * Each block processes one row of the input tensor. Data is moved in
+ * 8-element vectors held as bf16x2 pairs; the inverse RMS factor is derived
+ * from the precomputed per-row sum of squares V, and every element is
+ * multiplied by that factor and by its weight.
+ *
+ * @tparam TPB       Threads per block.
+ * @tparam N         Number of BF16 elements in one row (must be a multiple of VPT).
+ *
+ * @param X          Pointer to the input tensor in global memory. [M, N]
+ * @param W          Pointer to the weight tensor in global memory. [N]
+ * @param V          Pointer to the per-row sum of squares in global memory. [M]
+ * @param Y          Pointer to the output tensor in global memory. [M, N]
+ * @param M          Number of rows in the tensor (not read by the kernel body).
+ * @param embed_dim  Divisor that turns V into a mean square.
+ * @param eps        Epsilon for numerical stability.
+ */
+template<int32_t TPB, int32_t N>
+__global__
+void  device_post_tp_norm_bf16(
+    bf16_t* __restrict__ X,           // [M, N] Input tensor pointer.
+    const bf16_t* __restrict__ W,     // [N] Weight tensor pointer.
+    const fp32_t* __restrict__ V,     // [M] Per-row sum of squares.
+    bf16_t* __restrict__ Y,           // [M, N] Output tensor pointer.
+    const int32_t M,                  // Number of rows.
+    const int32_t embed_dim,          // if multiGPUs, embed_dim differs from N
+    const fp32_t eps                  // Epsilon for numerical stability.
+) {
+    constexpr int32_t VPT = 8;                // Number of bf16 values processed per thread.
+    const fp32_t r_embed = 1 / (fp32_t)embed_dim;   // Reciprocal of embed_dim.
+
+    static_assert(N % 2 == 0, "N must be even.");
+    static_assert(N % VPT == 0, "N must be a multiple of VPT.");
+
+    const int32_t tid = threadIdx.x;
+    const int32_t bid = blockIdx.x;
+
+    // Each block processes one row. The offset is formed in 64 bits because
+    // bid * N overflows int32 once M * N reaches 2^31.
+    const int64_t row_offset = static_cast<int64_t>(bid) * N;
+    const bf16_t* row_in = X + row_offset;
+    bf16_t* row_out = Y + row_offset;
+
+    // Local registers to hold vectorized data.
+    bf16x2_t local_x[VPT / 2];
+    bf16x2_t local_w[VPT / 2];
+    bf16x2_t local_y[VPT / 2];
+
+    const fp32_t reduced_square_sum = V[bid];
+
+    // For RMSNorm, the normalization factor is 1/sqrt(mean(x^2)+eps).
+    const fp32_t mean_square = reduced_square_sum * r_embed;
+    const fp32_t inv_norm = rsqrtf(mean_square + eps);
+
+    // Normalize each element using the computed normalization factor.
+    #pragma unroll
+    for (int32_t i = tid * VPT; i < N; i += TPB * VPT) {
+        // Load VPT input values from global memory.
+        vec_copy<sizeof(bf16_t) * VPT>(row_in + i, local_x);
+        // Load the corresponding weight values from global memory.
+        vec_copy<sizeof(bf16_t) * VPT>(W + i, local_w);
+
+        #pragma unroll
+        for (int32_t j = 0; j < VPT / 2; j++) {
+            const fp32x2_t x = bf16x2_to_fp32x2(local_x[j]);
+            const fp32x2_t w = bf16x2_to_fp32x2(local_w[j]);
+            local_y[j] = _float22bf162_rn(make_float2(
+                x.x * inv_norm * w.x,
+                x.y * inv_norm * w.y
+            ));
+        }
+        // Write the normalized vectorized data back to global memory.
+        vec_copy<sizeof(bf16_t) * VPT>(local_y, row_out + i);
+    }
+}
+
+/**
+ * @brief Launch the post-TP RMSNorm apply kernel for BF16 tensors.
+ *
+ * This function validates the input tensors, ensures they are contiguous,
+ * selects the appropriate kernel configuration based on the row width N,
+ * and launches the CUDA kernel. A 4D input [d0, d1, d2, d3] is processed as
+ * [d0 * d1, d2 * d3] and the result is reshaped back to the input shape.
+ *
+ * @param X          Input tensor, 2D [M, N] or 4D (BF16, CUDA).
+ * @param W          Weight tensor with N elements (BF16, CUDA).
+ * @param V          Per-row sum of squares with M elements (FP32, CUDA).
+ * @param embed_dim  Divisor that turns V into a mean square; must be positive.
+ * @param eps        Epsilon for numerical stability.
+ * @return           Output tensor with the same shape as X.
+ */
+Tensor post_tp_norm_bf16(Tensor &X, const Tensor &W, const Tensor &V, const int embed_dim, const fp32_t eps) {
+    TORCH_CHECK(X.ndimension() == 2 || X.ndimension() == 4, "Input tensor must be 2D or 4D");
+    TORCH_CHECK(X.is_cuda(), "Input tensor must be a CUDA tensor.");
+    TORCH_CHECK(X.scalar_type() == c10::ScalarType::BFloat16, "Input tensor must be BF16.");
+    TORCH_CHECK(W.is_cuda() && V.is_cuda(), "Weight and variance tensors must be CUDA tensors.");
+    TORCH_CHECK(W.scalar_type() == c10::ScalarType::BFloat16, "Weight tensor must be BF16.");
+    TORCH_CHECK(V.scalar_type() == c10::ScalarType::Float, "Variance tensor must be FP32.");
+    TORCH_CHECK(embed_dim > 0, "embed_dim must be positive.");
+
+    Tensor contiguous_X = X.is_contiguous() ? X : X.contiguous();
+    Tensor contiguous_W = W.is_contiguous() ? W : W.contiguous();
+    Tensor contiguous_V = V.is_contiguous() ? V : V.contiguous();
+
+    Tensor input_tensor;
+    uint32_t M, N;
+
+    if (X.ndimension() == 2) {
+        M = contiguous_X.size(0);
+        N = contiguous_X.size(1);
+        input_tensor = contiguous_X;
+    } else {
+        // Fold [d0, d1, d2, d3] into [d0 * d1, d2 * d3].
+        M = contiguous_X.size(0) * contiguous_X.size(1);
+        N = contiguous_X.size(2) * contiguous_X.size(3);
+        input_tensor = contiguous_X.view({M, N});
+    }
+    Tensor Y = torch::empty_like(input_tensor);
+
+    // The kernels index W over [0, N) and V over [0, M) without bounds checks.
+    TORCH_CHECK(W.numel() == N, "Weight tensor must have one element per column.");
+    TORCH_CHECK(V.numel() == M, "Variance tensor must have one element per row.");
+
+    if (M == 0) {
+        // Nothing to normalize; a zero-block launch is an invalid configuration.
+        return X.ndimension() == 4 ? Y.reshape(X.sizes()) : Y;
+    }
+
+    // Each CUDA block processes one row.
+    const int32_t blocks = M;
+
+    // Kernel dispatch based on the value of N.
+    switch (N) {
+        case 768:
+            device_post_tp_norm_bf16<128, 768>
+            <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(input_tensor), PTR<bf16_t>(contiguous_W),
+                PTR<fp32_t>(contiguous_V), PTR<bf16_t>(Y), M, embed_dim, eps
+            );
+            break;
+        case 1024:
+            device_post_tp_norm_bf16<128, 1024>
+            <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(input_tensor), PTR<bf16_t>(contiguous_W),
+                PTR<fp32_t>(contiguous_V), PTR<bf16_t>(Y), M, embed_dim, eps
+            );
+            break;
+        case 1664:
+            device_post_tp_norm_bf16<128, 1664>
+            <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(input_tensor), PTR<bf16_t>(contiguous_W),
+                PTR<fp32_t>(contiguous_V), PTR<bf16_t>(Y), M, embed_dim, eps
+            );
+            break;
+        case 2048:
+            device_post_tp_norm_bf16<128, 2048>
+            <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(input_tensor), PTR<bf16_t>(contiguous_W),
+                PTR<fp32_t>(contiguous_V), PTR<bf16_t>(Y), M, embed_dim, eps
+            );
+            break;
+        case 3200:
+            device_post_tp_norm_bf16<128, 3200>
+            <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(input_tensor), PTR<bf16_t>(contiguous_W),
+                PTR<fp32_t>(contiguous_V), PTR<bf16_t>(Y), M, embed_dim, eps
+            );
+            break;
+        case 4096:
+            device_post_tp_norm_bf16<256, 4096>
+            <<<blocks, 256, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(input_tensor), PTR<bf16_t>(contiguous_W),
+                PTR<fp32_t>(contiguous_V), PTR<bf16_t>(Y), M, embed_dim, eps
+            );
+            break;
+        case 8192:
+            device_post_tp_norm_bf16<512, 8192>
+            <<<blocks, 512, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(input_tensor), PTR<bf16_t>(contiguous_W),
+                PTR<fp32_t>(contiguous_V), PTR<bf16_t>(Y), M, embed_dim, eps
+            );
+            break;
+        case 10240:
+            device_post_tp_norm_bf16<512, 10240>
+            <<<blocks, 512, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(input_tensor), PTR<bf16_t>(contiguous_W),
+                PTR<fp32_t>(contiguous_V), PTR<bf16_t>(Y), M, embed_dim, eps
+            );
+            break;
+        default: {
+            static constexpr int32_t TPB = 256;
+            if (N % 8 == 0) {
+                device_post_tp_norm_bf16_vpt<TPB>
+                <<<blocks, TPB, 0, at::cuda::getCurrentCUDAStream()>>>(
+                    PTR<bf16_t>(input_tensor), PTR<bf16_t>(contiguous_W),
+                    PTR<fp32_t>(contiguous_V), PTR<bf16_t>(Y), M, N, embed_dim, eps
+                );
+            } else {
+                device_post_tp_norm_bf16_general<TPB>
+                <<<blocks, TPB, 0, at::cuda::getCurrentCUDAStream()>>>(
+                    PTR<bf16_t>(input_tensor), PTR<bf16_t>(contiguous_W),
+                    PTR<fp32_t>(contiguous_V), PTR<bf16_t>(Y), M, N, embed_dim, eps
+                );
+            }
+        }
+    }
+
+    // Reshape Y back to the original 4 dimensions.
+    if (X.ndimension() == 4) {
+        Y = Y.reshape(X.sizes());
+    }
+    return Y;
+}
+
+} // namespace ops
+} // namespace lightllm
\ No newline at end of file
diff --git a/lightllm-kernel/csrc/fusion/pre_tp_norm.cu b/lightllm-kernel/csrc/fusion/pre_tp_norm.cu
new file mode 100755
index 000000000..966cf5ce7
--- /dev/null
+++ b/lightllm-kernel/csrc/fusion/pre_tp_norm.cu
@@ -0,0 +1,257 @@
+#include "ops_common.h"
+#include "reduce/sm70.cuh"
+
+namespace lightllm {
+namespace ops {
+
+using namespace lightllm;
+
+/**
+ * Per-row sum of squares of a bf16 matrix; scalar fallback for any row length.
+ *
+ * @tparam TPB   Threads per block; also the loop stride and the reduction's template argument.
+ * @param X      Input tensor in global memory, row-major. [M, N]
+ * @param V      Output: one fp32 sum of squares per row (not divided by N). [M]
+ * @param M      Number of rows (not read here; block `bid` handles row `bid`).
+ * @param N      Number of bf16 elements in one row.
+ */
+template<int32_t TPB>
+__global__
+void device_pre_tp_norm_bf16_general(
+    bf16_t* __restrict__ X,           // [M, N] Input tensor pointer.
+    fp32_t* __restrict__ V,           // [M] Per-row sum-of-squares output pointer.
+    const int32_t M,                  // Number of rows.
+    const int32_t N                   // Number of elements per row.
+) {
+    const int32_t tid = threadIdx.x;
+    const int32_t bid = blockIdx.x;
+
+    // Each block processes one row; the offset is 64-bit so bid * N cannot wrap.
+    bf16_t* _X = X + static_cast<int64_t>(bid) * N;
+
+    // Thread `tid` accumulates elements tid, tid + TPB, tid + 2 * TPB, ...
+    fp32_t local_square_sum = 0.0f;
+    for (int32_t i = tid; i < N; i += TPB) {
+        const fp32_t tmp = cvt_bf16_f32(_X[i]);
+        local_square_sum += tmp * tmp;
+    }
+
+    // Combine the per-thread partial sums across the block.
+    fp32_t block_square_sum = lightllm::reduce::sm70::sync_block_reduce_sum_f32<TPB>(local_square_sum);
+
+    // A single thread publishes the row result.
+    if (tid == 0) {
+        V[bid] = block_square_sum;
+    }
+}
+
+
+
+/**
+ * Per-row sum of squares of a bf16 matrix, reading VPT = 8 elements per access.
+ *
+ * @tparam TPB   Threads per block; also the template argument of the block reduction.
+ * @param X      Input tensor in global memory, row-major. [M, N]
+ * @param V      Output: one fp32 sum of squares per row (not divided by N). [M]
+ * @param M      Number of rows (not read here; block `bid` handles row `bid`).
+ * @param N      Number of bf16 elements in one row (must be a multiple of VPT).
+ */
+template<int32_t TPB>
+__global__
+void device_pre_tp_norm_bf16_vpt(
+    bf16_t* __restrict__ X,           // [M, N] Input tensor pointer.
+    fp32_t* __restrict__ V,           // [M] Per-row sum-of-squares output pointer.
+    const int32_t M,                  // Number of rows.
+    const int32_t N                   // Number of elements per row.
+) {
+    constexpr int32_t VPT = 8;                // Number of bf16 values processed per thread.
+
+    const int32_t tid = threadIdx.x;
+    const int32_t bid = blockIdx.x;
+
+    // Each block processes one row; the offset is 64-bit so bid * N cannot wrap.
+    bf16_t* _X = X + static_cast<int64_t>(bid) * N;
+
+    // Per-thread staging buffer for one vectorized load (VPT values as bf16 pairs).
+    bf16x2_t local_x[VPT / 2];
+    // Each thread computes a partial sum of squares.
+    fp32_t local_square_sum = 0.0f;
+    for (int32_t i = tid * VPT; i < N; i += TPB * VPT) {
+        // Load VPT bf16 elements from global memory (_X) into local vector (local_x).
+        vec_copy<sizeof(bf16_t) * VPT>(_X + i, local_x);
+        // Accumulate the squares of the VPT elements, two at a time.
+        #pragma unroll
+        for (int32_t j = 0; j < VPT / 2; j++) {
+            fp32x2_t tmp = bf16x2_to_fp32x2(local_x[j]);
+            local_square_sum += (tmp.x * tmp.x + tmp.y * tmp.y);
+        }
+    }
+
+    // Block-wide reduction; as in the general kernel, only thread 0 stores the row result.
+    const fp32_t block_square_sum = lightllm::reduce::sm70::sync_block_reduce_sum_f32<TPB>(local_square_sum);
+    if (tid == 0) { V[bid] = block_square_sum; }
+}
+
+
+/**
+ * Per-row sum of squares for a compile-time row length N (VPT = 8 elements per access).
+ *
+ * @tparam TPB   Threads per block; also the template argument of the block reduction.
+ * @tparam N     Number of bf16 elements in one row (must be a multiple of VPT).
+ * @param X      Input tensor in global memory, row-major. [M, N]
+ * @param V      Output: one fp32 sum of squares per row (not divided by N). [M]
+ * @param M      Number of rows (not read here; block `bid` handles row `bid`).
+ */
+template<int32_t TPB, int32_t N>
+__global__
+void device_pre_tp_norm_bf16(
+    bf16_t* __restrict__ X,           // [M, N] Input tensor pointer.
+    fp32_t* __restrict__ V,           // [M] Per-row sum-of-squares output pointer.
+    const int32_t M                   // Number of rows.
+) {
+    constexpr int32_t VPT = 8;                // Number of bf16 values processed per thread.
+
+    static_assert(N % 2 == 0, "N must be even.");
+    static_assert(N % VPT == 0, "N must be a multiple of VPT.");
+
+    const int32_t tid = threadIdx.x;
+    const int32_t bid = blockIdx.x;
+
+    // Each block processes one row; the offset is 64-bit so bid * N cannot wrap.
+    bf16_t* _X = X + static_cast<int64_t>(bid) * N;
+
+    // Per-thread staging buffer for one vectorized load (VPT values as bf16 pairs).
+    bf16x2_t local_x[VPT / 2];
+    // Each thread computes a partial sum of squares.
+    fp32_t local_square_sum = 0.0f;
+    # pragma unroll
+    for (int32_t i = tid * VPT; i < N; i += TPB * VPT) {
+        // Load VPT bf16 elements from global memory (_X) into local vector (local_x).
+        vec_copy<sizeof(bf16_t) * VPT>(_X + i, local_x);
+        // Accumulate the squares of the VPT elements, two at a time.
+        #pragma unroll
+        for (int32_t j = 0; j < VPT / 2; j++) {
+            fp32x2_t tmp = bf16x2_to_fp32x2(local_x[j]);
+            local_square_sum += (tmp.x * tmp.x + tmp.y * tmp.y);
+        }
+    }
+
+    // Block-wide reduction; as in the general kernel, only thread 0 stores the row result.
+    const fp32_t block_square_sum = lightllm::reduce::sm70::sync_block_reduce_sum_f32<TPB>(local_square_sum);
+    if (tid == 0) { V[bid] = block_square_sum; }
+}
+
+/**
+ * Computes the sum of squares of every row of a bf16 CUDA tensor.
+ *
+ * @param X    Input tensor (bf16, CUDA): 2D [M, N], or 4D [d0, d1, d2, d3] which is
+ *             processed as [d0 * d1, d2 * d3]. Copied first if not contiguous.
+ * @return     fp32 tensor of shape [M] on X's device; element i is the sum of
+ *             squares of row i (no division by N is applied here).
+ */
+Tensor pre_tp_norm_bf16(Tensor &X) {
+    TORCH_CHECK(X.ndimension() == 2 || X.ndimension() == 4, "Input tensor must be 2D or 4D");
+    TORCH_CHECK(X.is_cuda(), "Input tensor must be a CUDA tensor.");
+    TORCH_CHECK(X.scalar_type() == c10::ScalarType::BFloat16, "Input tensor must be BF16.");
+
+    // The kernels address row r at offset r * N, so the data must be contiguous.
+    Tensor contiguous_X = X.is_contiguous() ? X : X.contiguous();
+    Tensor input_tensor;
+    uint32_t M, N;
+    if (X.ndimension() == 2) {
+        M = contiguous_X.size(0);
+        N = contiguous_X.size(1);
+        input_tensor = contiguous_X;
+    } else {
+        // Fold [d0, d1, d2, d3] into [d0 * d1, d2 * d3].
+        M = static_cast<uint32_t>(contiguous_X.size(0)) * static_cast<uint32_t>(contiguous_X.size(1));
+        N = static_cast<uint32_t>(contiguous_X.size(2)) * static_cast<uint32_t>(contiguous_X.size(3));
+        input_tensor = contiguous_X.view({M, N});
+    }
+
+    // One fp32 result per row; allocated once instead of in each branch above.
+    Tensor V = torch::empty(
+        {M},
+        torch::TensorOptions()
+            .dtype(c10::ScalarType::Float)
+            .device(contiguous_X.device())
+    );
+
+    // A grid of zero blocks is an invalid launch configuration; nothing to compute.
+    if (M == 0) {
+        return V;
+    }
+
+    // Each CUDA block processes one row.
+    const int32_t blocks = M;
+
+    // Listed row lengths use kernels specialized on N; any other length takes the default path.
+    switch (N) {
+        case 768:
+            device_pre_tp_norm_bf16<128, 768>
+            <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(input_tensor), PTR<fp32_t>(V), M
+            );
+            break;
+        case 1024:
+            device_pre_tp_norm_bf16<128, 1024>
+            <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(input_tensor), PTR<fp32_t>(V), M
+            );
+            break;
+        case 1664:
+            device_pre_tp_norm_bf16<128, 1664>
+            <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(input_tensor), PTR<fp32_t>(V), M
+            );
+            break;
+        case 2048:
+            device_pre_tp_norm_bf16<128, 2048>
+            <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(input_tensor), PTR<fp32_t>(V), M
+            );
+            break;
+        case 3200:
+            device_pre_tp_norm_bf16<128, 3200>
+            <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(input_tensor), PTR<fp32_t>(V), M
+            );
+            break;
+        case 4096:
+            device_pre_tp_norm_bf16<256, 4096>
+            <<<blocks, 256, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(input_tensor), PTR<fp32_t>(V), M
+            );
+            break;
+        case 8192:
+            device_pre_tp_norm_bf16<512, 8192>
+            <<<blocks, 512, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(input_tensor), PTR<fp32_t>(V), M
+            );
+            break;
+        case 10240:
+            device_pre_tp_norm_bf16<512, 10240>
+            <<<blocks, 512, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(input_tensor), PTR<fp32_t>(V), M
+            );
+            break;
+        default: {
+            static constexpr int32_t TPB = 256;
+            if (N % 8 == 0) {
+                device_pre_tp_norm_bf16_vpt<TPB>
+                <<<blocks, TPB, 0, at::cuda::getCurrentCUDAStream()>>>(
+                    PTR<bf16_t>(input_tensor), PTR<fp32_t>(V), M, N
+                );
+            } else {
+                device_pre_tp_norm_bf16_general<TPB>
+                <<<blocks, TPB, 0, at::cuda::getCurrentCUDAStream()>>>(
+                    PTR<bf16_t>(input_tensor), PTR<fp32_t>(V), M, N
+                );
+            }
+        }
+    }
+    return V;
+}
+
+} // namespace ops
+} // namespace lightllm
\ No newline at end of file
diff --git a/lightllm-kernel/csrc/gemm/Epilogues.md b/lightllm-kernel/csrc/gemm/Epilogues.md
new file mode 100755
index 000000000..aae04157b
--- /dev/null
+++ b/lightllm-kernel/csrc/gemm/Epilogues.md
@@ -0,0 +1,147 @@
+# CUTLASS Epilogues
+
+## Introduction
+This document describes the various CUTLASS epilogues implemented for fusing de-quantization operations onto GEMMs. 
+
+Currently, we only support symmetric quantization for weights,
+and symmetric and asymmetric quantization for activations.
+Both can be quantized per-tensor or per-channel (weights) / per-token (activations).
+
+There are 4 epilogues:
+1. ScaledEpilogue: symmetric quantization for activations, no bias.
+1. ScaledEpilogueBias: symmetric quantization for activations, supports bias.
+1. ScaledEpilogueAzp: asymmetric per-tensor quantization for activations, supports bias.
+1. ScaledEpilogueAzpPerToken: asymmetric per-token quantization for activations, supports bias.
+
+We do not have epilogues for asymmetric quantization of activations without bias in order to reduce final binary size.
+Instead, if no bias is passed, the epilogue will use 0 as the bias.
+That induces a redundant addition operation (and runtime check), but the performance impact is minor.
+
+## Underlying Linear Algebra
+
+More details available in the [Activation Quantization RFC](https://github.com/vllm-project/vllm/issues/3975).
+
+If $` \widehat X `$ is the quantized $` X `$, our matrices become the following
+
+```math
+A = s_a (\widehat A - J_a z_a)
+```
+```math
+B = s_b \widehat B
+```
+```math
+D = A B + C
+```
+```math
+D = s_a s_b \widehat D + C
+```
+
+Here, D is the output of the GEMM, and C is the bias.
+A is the activations and supports asymmetric quantization,
+and B is the weights and only supports symmetric quantization.
+$` s_a `$ and $` s_b `$ are the scales for activations and weights, respectively.
+$` z_a `$ is the zero-point for activations, and $` J_a `$ is the matrix of all ones with dimensions of A.
+Additional epilogues would be required to support asymmetric quantization for weights.
+
+Expanding further, we can calculate $` \widehat D `$ as follows:
+
+```math
+A B = s_a ( \widehat A - J_a z_a ) s_b \widehat B
+```
+```math
+A B = s_a s_b \left( \widehat A \widehat B - J_a z_a \widehat B \right)
+```
+```math
+\widehat D = \widehat A \widehat B - z_a J_a \widehat B
+```
+
+Note that $` \widehat A \widehat B `$ is the raw output of the GEMM,
+and $` J_a \widehat B `$ is known ahead of time.
+Each row of it is equal to $` \mathbf 1 \widehat B `$, which is a row-vector of column sums of $` \widehat B `$.
+
+## Epilogues
+
+### ScaledEpilogue
+This epilogue computes the symmetric quantization for activations without bias, meaning $` C = 0 `$ and $` z_a = 0 `$.
+The output of the GEMM is:
+
+```math
+\widehat D = \widehat A \widehat B
+```
+```math
+D = s_a s_b \widehat D
+```
+```math
+D = s_a s_b \widehat A \widehat B
+```
+
+Epilogue parameters:
+- `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector).
+- `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector).
+
+### ScaledEpilogueBias
+This epilogue computes the symmetric quantization for activations with bias, meaning $` z_a = 0 `$.
+The output of the GEMM is:
+
+```math
+\widehat D = \widehat A \widehat B
+```
+```math
+D = s_a s_b \widehat D + C 
+```
+```math
+D = s_a s_b \widehat A \widehat B + C
+```
+
+
+Epilogue parameters:
+- `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector).
+- `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector).
+- `bias` is the bias, is always per-channel (row-vector).
+
+### ScaledEpilogueAzp
+This epilogue computes the asymmetric per-tensor quantization for activations with bias.
+The output of the GEMM is:
+
+```math
+\widehat D = \widehat A \widehat B - z_a J_a \widehat B
+```
+```math
+D = s_a s_b \widehat D + C 
+```
+```math
+D = s_a s_b \left( \widehat A \widehat B - z_a J_a \widehat B \right) + C
+```
+
+Because $` z_a `$ is a scalar, the zero-point term $` z_a J_a \widehat B `$ has every row equal to $` z_a \mathbf 1 \widehat B `$.
+That is precomputed and stored in `azp_with_adj` as a row-vector.
+
+Epilogue parameters:
+- `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector).
+  - Generally this will be per-tensor as the zero-points are per-tensor.
+- `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector).
+- `azp_with_adj` is the precomputed zero-point term ($` z_a J_a \widehat B `$), is per-channel (row-vector).
+- `bias` is the bias, is always per-channel (row-vector).
+
+To use these kernels efficiently, users must precompute the `azp_with_adj` term offline and pass it to the kernel.
+
+### ScaledEpilogueAzpPerToken
+This epilogue computes the asymmetric per-token quantization for activations with bias.
+
+The output of the GEMM is the same as above, but the $` z_a `$ is a column-vector.
+That means the zero-point term $` z_a J_a \widehat B `$ becomes an outer product of $` z_a `$ and $` \mathbf 1 \widehat B `$.
+
+Epilogue parameters:
+- `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector).
+  - Generally this will be per-token as the zero-points are per-token.
+- `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector).
+- `azp_adj` is the precomputed zero-point adjustment term ($` \mathbf 1 \widehat B `$), is per-channel (row-vector).
+- `azp` is the zero-point (`z_a`), is per-token (column-vector).
+- `bias` is the bias, is always per-channel (row-vector).
+
+To use these kernels efficiently, users must precompute the `azp_adj` term offline and pass it to the kernel.
+
+The epilogue performs the following computation (where `Dq` is the raw quantized output of the GEMM):
+```
+out = scale_a * scale_b * (Dq - azp_adj * azp) + bias
+```
diff --git a/lightllm-kernel/csrc/gemm/scaled_mm_c3x.cu b/lightllm-kernel/csrc/gemm/scaled_mm_c3x.cu
new file mode 100755
index 000000000..55d623755
--- /dev/null
+++ b/lightllm-kernel/csrc/gemm/scaled_mm_c3x.cu
@@ -0,0 +1,73 @@
+#include <cudaTypedefs.h>
+
+#if defined CUDA_VERSION && CUDA_VERSION >= 12000
+
+  #include "scaled_mm_c3x_sm90_fp8_dispatch.cuh"
+  #include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
+
+namespace lightllm {
+namespace ops {
+
+using namespace lightllm;
+/*
+   This file defines quantized GEMM operations using the CUTLASS 3.x API, for
+   NVIDIA GPUs with sm90a (Hopper) or later.
+*/
+
+// Routes an fp8 (e4m3) GEMM to the dispatcher instantiated for out's dtype.
+template <template <typename, typename, typename> typename Epilogue,
+          typename... EpilogueArgs>
+void cutlass_scaled_mm_sm90_epilogue(torch::Tensor& out, torch::Tensor const& a,
+                                     torch::Tensor const& b,
+                                     EpilogueArgs&&... epilogue_args) {
+  // Both operands must be fp8 e4m3.
+  TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
+  TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
+  const bool out_is_bf16 = (out.dtype() == torch::kBFloat16);
+  if (!out_is_bf16) {
+    // fp16 is the only other output dtype with an instantiation here.
+    TORCH_CHECK(out.dtype() == torch::kFloat16);
+    return cutlass_gemm_sm90_fp8_dispatch<cutlass::float_e4m3_t,
+                                          cutlass::half_t, Epilogue>(
+        out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
+  }
+  return cutlass_gemm_sm90_fp8_dispatch<cutlass::float_e4m3_t,
+                                        cutlass::bfloat16_t, Epilogue>(
+      out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
+}
+
+void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
+                            torch::Tensor const& b,
+                            torch::Tensor const& a_scales,
+                            torch::Tensor const& b_scales,
+                            c10::optional<torch::Tensor> const& bias,
+                            c10::optional<torch::Tensor> const& ls) {
+  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
+  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
+  if (bias) {  // an optional operand, when given, must share c's dtype
+    TORCH_CHECK(bias->dtype() == c.dtype(),
+                "currently bias dtype must match output dtype ", c.dtype());
+  }
+  if (ls) {  // same requirement for ls
+    TORCH_CHECK(ls->dtype() == c.dtype(),
+                "currently ls dtype must match output dtype ", c.dtype());
+  }
+  if (bias && ls) {  // pick the epilogue matching the operands supplied
+    return cutlass_scaled_mm_sm90_epilogue<c3x::ScaledEpilogueBiasLs>(
+        c, a, b, a_scales, b_scales, *bias, *ls);
+  } else if (bias) {
+    return cutlass_scaled_mm_sm90_epilogue<c3x::ScaledEpilogueBias>(
+        c, a, b, a_scales, b_scales, *bias);
+  } else if (ls) {
+    return cutlass_scaled_mm_sm90_epilogue<c3x::ScaledEpilogueLs>(
+        c, a, b, a_scales, b_scales, *ls);
+  } else {
+    return cutlass_scaled_mm_sm90_epilogue<c3x::ScaledEpilogue>(
+        c, a, b, a_scales, b_scales);
+  }
+}
+
+} // namespace ops
+} // namespace lightllm
+
+#endif
diff --git a/lightllm-kernel/csrc/gemm/scaled_mm_c3x.cuh b/lightllm-kernel/csrc/gemm/scaled_mm_c3x.cuh
new file mode 100755
index 000000000..93641a157
--- /dev/null
+++ b/lightllm-kernel/csrc/gemm/scaled_mm_c3x.cuh
@@ -0,0 +1,161 @@
+#pragma once
+
+// clang-format will break include orders
+// clang-format off
+#include <torch/all.h>
+
+#include <ATen/cuda/CUDAContext.h>
+
+#include "cutlass/cutlass.h"
+
+#include "cute/tensor.hpp"
+#include "cute/atom/mma_atom.hpp"
+#include "cutlass/numeric_types.h"
+
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+
+#include "cutlass_extensions/common.hpp"
+// clang-format on
+
+/*
+  Epilogues defined in,
+  csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp,
+  must contain a public type named EVTCompute of type Sm90EVT, as well as a
+  static prepare_args function that constructs an EVTCompute::Arguments struct.
+*/
+
+using namespace cute;
+
+namespace lightllm {
+namespace ops {
+
+// Wraps a GEMM kernel so its body is only compiled for architectures that can
+// use it, which keeps the binary small. __CUDA_ARCH__ is not defined in host
+// code, so the check lives inside the device-side call operator: when
+// __CUDA_ARCH__ is below 900 the operator compiles to an empty body, otherwise
+// it forwards its arguments to Kernel::operator().
+template <typename Kernel>
+struct enable_sm90_or_later : Kernel {
+  template <typename... Args>
+  CUTLASS_DEVICE void operator()(Args&&... args) {
+#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 900
+    Kernel::operator()(std::forward<Args>(args)...);
+#endif
+  }
+};
+// Collects the CUTLASS 3.x types (epilogue, mainloop, kernel) for one SM90 GEMM.
+template <typename ElementAB_, typename ElementD_,
+          template <typename, typename, typename> typename Epilogue_,
+          typename TileShape, typename ClusterShape, typename KernelSchedule,
+          typename EpilogueSchedule>
+struct cutlass_3x_gemm {
+  using ElementAB = ElementAB_;
+  using ElementD = ElementD_;
+  using ElementAcc =  // int32_t when ElementAB is int8_t, float otherwise
+      typename std::conditional<std::is_same_v<ElementAB, int8_t>, int32_t,
+                                float>::type;
+
+  using EpilogueDescriptor =
+      cutlass::epilogue::collective::detail::EpilogueDescriptor<
+          TileShape, cutlass::epilogue::collective::EpilogueTileAuto, ElementD,
+          ElementD, EpilogueSchedule>;
+
+  using Epilogue = Epilogue_<ElementAcc, ElementD, EpilogueDescriptor>;
+  // Output stride: runtime leading dimension, unit inner stride, zero last mode.
+  using StrideD = Stride<int64_t, Int<1>, Int<0>>;
+  using ElementC = void;
+  using StrideC = StrideD;
+
+  using EVTCompute = typename Epilogue::EVTCompute;
+
+  using CollectiveEpilogue =
+      typename cutlass::epilogue::collective::CollectiveBuilder<
+          cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, TileShape,
+          ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto,
+          ElementAcc, float, ElementC, StrideC, 4, ElementD, StrideD, 4,
+          EpilogueSchedule, EVTCompute>::CollectiveOp;
+  // Size of the epilogue's SharedStorage, passed to StageCountAutoCarveout.
+  static constexpr size_t CEStorageSize =
+      sizeof(typename CollectiveEpilogue::SharedStorage);
+  using Stages = typename cutlass::gemm::collective::StageCountAutoCarveout<
+      static_cast<int>(CEStorageSize)>;
+
+  // clang-format off
+  using CollectiveMainloop =
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, 
+          ElementAB, cutlass::layout::RowMajor, 16, 
+          ElementAB, cutlass::layout::ColumnMajor, 16, 
+          ElementAcc, TileShape, ClusterShape,
+          Stages,
+          KernelSchedule>::CollectiveOp;
+  // clang-format on
+  // enable_sm90_or_later compiles the kernel body only when __CUDA_ARCH__ >= 900.
+  using KernelType = enable_sm90_or_later<cutlass::gemm::kernel::GemmUniversal<
+      cute::Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue,
+      cutlass::gemm::PersistentScheduler>>;
+
+  struct GemmKernel : public KernelType {};
+};
+// Builds the CUTLASS argument pack from the torch tensors and runs Gemm on a's stream.
+template <typename Gemm, typename... EpilogueArgs>
+void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a,
+                         torch::Tensor const& b,
+                         EpilogueArgs&&... epilogue_params) {
+  using ElementAB = typename Gemm::ElementAB;
+  using ElementD = typename Gemm::ElementD;
+  // Problem sizes: m and k come from a's two dimensions, n from b's second one.
+  int32_t m = a.size(0);
+  int32_t n = b.size(1);
+  int32_t k = a.size(1);
+  // Leading dimensions: a and out step by stride(0), b steps by stride(1).
+  int64_t lda = a.stride(0);
+  int64_t ldb = b.stride(1);
+  int64_t ldc = out.stride(0);
+
+  using StrideA = Stride<int64_t, Int<1>, int64_t>;
+  using StrideB = Stride<int64_t, Int<1>, int64_t>;
+  using StrideC = typename Gemm::StrideC;
+
+  StrideA a_stride{lda, Int<1>{}, 0};
+  StrideB b_stride{ldb, Int<1>{}, 0};
+  StrideC c_stride{ldc, Int<1>{}, Int<0>{}};
+
+  using GemmKernel = typename Gemm::GemmKernel;
+  typename GemmKernel::ProblemShape prob_shape{m, n, k, 1};
+
+  auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
+  auto b_ptr = static_cast<ElementAB*>(b.data_ptr());
+  typename GemmKernel::MainloopArguments mainloop_args{a_ptr, a_stride, b_ptr,
+                                                       b_stride};
+
+  auto c_ptr = static_cast<ElementD*>(out.data_ptr());
+  typename GemmKernel::EpilogueArguments epilogue_args{
+      Gemm::Epilogue::prepare_args(
+          std::forward<EpilogueArgs>(epilogue_params)...),
+      c_ptr, c_stride, c_ptr, c_stride};
+
+  typename GemmKernel::Arguments args{cutlass::gemm::GemmUniversalMode::kGemm,
+                                      prob_shape, mainloop_args, epilogue_args};
+
+  // Launch the CUTLASS GEMM kernel.
+  using GemmOp = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+  GemmOp gemm_op;
+  CUTLASS_CHECK(gemm_op.can_implement(args));
+  // Scratch buffer of the size the kernel asks for, allocated on a's device.
+  size_t workspace_size = gemm_op.get_workspace_size(args);
+  auto const workspace_options =
+      torch::TensorOptions().dtype(torch::kUInt8).device(a.device());
+  auto workspace = torch::empty(workspace_size, workspace_options);
+
+  auto stream = at::cuda::getCurrentCUDAStream(a.get_device());
+
+  cutlass::Status status = gemm_op.run(args, workspace.data_ptr(), stream);
+  CUTLASS_CHECK(status);
+}
+
+} // namespace ops
+} // namespace lightllm
\ No newline at end of file
diff --git a/lightllm-kernel/csrc/gemm/scaled_mm_c3x_sm90_fp8_dispatch.cuh b/lightllm-kernel/csrc/gemm/scaled_mm_c3x_sm90_fp8_dispatch.cuh
new file mode 100755
index 000000000..bbd709ccb
--- /dev/null
+++ b/lightllm-kernel/csrc/gemm/scaled_mm_c3x_sm90_fp8_dispatch.cuh
@@ -0,0 +1,97 @@
+#pragma once
+#include "scaled_mm_c3x.cuh"
+
+/**
+ * This file defines Gemm kernel configurations for SM90 (fp8) based on the Gemm
+ * shape.
+ */
+
+namespace lightllm {
+namespace ops {
+// Default fp8 configuration: 128x128x128 tile, 2x1x1 cluster, pingpong schedule.
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_fp8_config_default {
+  // M in (128, inf)
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule =
+      cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
+  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
+  using TileShape = Shape<_128, _128, _128>;
+  using ClusterShape = Shape<_2, _1, _1>;
+  using Cutlass3xGemm =
+      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                      KernelSchedule, EpilogueSchedule>;
+};
+// Mid-size fp8 configuration: 64x128x128 tile, 2x1x1 cluster, pingpong schedule.
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_fp8_config_M128 {
+  // M in (64, 128]
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule =
+      cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
+  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
+  using TileShape = Shape<_64, _128, _128>;
+  using ClusterShape = Shape<_2, _1, _1>;
+  using Cutlass3xGemm =
+      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                      KernelSchedule, EpilogueSchedule>;
+};
+// Small-M fp8 configuration: 64x64x128 tile, 1x8x1 cluster, pingpong schedule.
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_fp8_config_M64 {
+  // M in [1, 64]
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule =
+      cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
+  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
+  using TileShape = Shape<_64, _64, _128>;
+  using ClusterShape = Shape<_1, _8, _1>;
+
+  using Cutlass3xGemm =
+      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                      KernelSchedule, EpilogueSchedule>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue,
+          typename... EpilogueArgs>
+inline void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out,
+                                           torch::Tensor const& a,
+                                           torch::Tensor const& b,
+                                           EpilogueArgs&&... args) {
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
+  TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
+
+  // Bucket the row count: next power of 2, with 64 as the smallest bucket.
+  uint32_t const rows = a.size(0);
+  uint32_t const bucket =
+      std::max(static_cast<uint32_t>(64), next_pow_2(rows));
+
+  if (bucket <= 64) {
+    // rows in [1, 64]
+    using GemmM64 =
+        typename sm90_fp8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
+    return cutlass_gemm_caller<GemmM64>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  }
+  if (bucket <= 128) {
+    // rows in (64, 128]
+    using GemmM128 =
+        typename sm90_fp8_config_M128<InType, OutType, Epilogue>::Cutlass3xGemm;
+    return cutlass_gemm_caller<GemmM128>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  }
+  // rows in (128, inf)
+  using GemmDefault =
+      typename sm90_fp8_config_default<InType, OutType,
+                                       Epilogue>::Cutlass3xGemm;
+  return cutlass_gemm_caller<GemmDefault>(
+      out, a, b, std::forward<EpilogueArgs>(args)...);
+}
+
+} // namespace ops
+} // namespace lightllm
\ No newline at end of file
diff --git a/lightllm-kernel/csrc/gemm/scaled_mm_entry.cu b/lightllm-kernel/csrc/gemm/scaled_mm_entry.cu
new file mode 100755
index 000000000..6655c3712
--- /dev/null
+++ b/lightllm-kernel/csrc/gemm/scaled_mm_entry.cu
@@ -0,0 +1,83 @@
+#include <cudaTypedefs.h>
+
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/all.h>
+
+#include "ops_common.h"
+#include "cutlass_extensions/common.hpp"
+
+
+
+namespace lightllm {
+namespace ops {
+
+using namespace lightllm;
+
+void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
+                            torch::Tensor const& b,
+                            torch::Tensor const& a_scales,
+                            torch::Tensor const& b_scales,
+                            c10::optional<torch::Tensor> const& bias,
+                            c10::optional<torch::Tensor> const& ls);
+
+bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) {
+  // Reports whether the CUDA toolkit this file was built with is new enough:
+  //   capability >= 90 needs CUDA 12.0, capability 89 needs CUDA 12.4.
+  // Returns false below capability 89 or when CUDA_VERSION is not defined.
+#if defined CUDA_VERSION
+  if (cuda_device_capability >= 90) {
+    return CUDA_VERSION >= 12000;
+  } else if (cuda_device_capability >= 89) {
+    return CUDA_VERSION >= 12040;
+  }
+#endif
+
+  return false;
+}
+// Validates shapes and strides, then runs the scaled GEMM on a's device (SM90+ only).
+void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
+                       torch::Tensor const& b, torch::Tensor const& a_scales,
+                       torch::Tensor const& b_scales,
+                       c10::optional<torch::Tensor> const& bias,
+                       c10::optional<torch::Tensor> const& ls) {
+  // Shapes must conform; each scale is a single value or one per a-row / b-column.
+  TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2);
+  TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) &&
+              b.size(1) == c.size(1));
+  TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0));
+  TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1));
+
+  // Check for strides and alignment
+  TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1);  // Row-major
+  TORCH_CHECK(b.stride(0) == 1);                      // Column-major
+  TORCH_CHECK(c.stride(0) % 16 == 0 &&
+              b.stride(1) % 16 == 0);  // 16 Byte Alignment
+  TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());
+
+  if (bias) {
+    TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous() &&
+                bias->dim() == 1);
+  }
+
+  if (ls) {
+    TORCH_CHECK(ls->numel() == b.size(1) && ls->is_contiguous() &&
+                ls->dim() == 1);
+  }
+  // Make a's device current, then pick a kernel from the SM version number.
+  at::cuda::OptionalCUDAGuard const device_guard(device_of(a));
+  int32_t version_num = get_sm_version_num();
+
+  if (version_num >= 90) {
+    cutlass_scaled_mm_sm90(c, a, b, a_scales, b_scales, bias, ls);
+    return;
+  }
+
+  TORCH_CHECK_NOT_IMPLEMENTED(
+    false,
+    "No compiled cutlass_scaled_mm for a compute capability less than "
+    "CUDA device capability: ",
+    version_num);
+}
+
+} // namespace ops
+} // namespace lightllm
\ No newline at end of file
diff --git a/lightllm-kernel/csrc/moe/grouped_topk.cu b/lightllm-kernel/csrc/moe/grouped_topk.cu
index 635ca5193..83bbee8c7 100644
--- a/lightllm-kernel/csrc/moe/grouped_topk.cu
+++ b/lightllm-kernel/csrc/moe/grouped_topk.cu
@@ -16,7 +16,8 @@
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 
-namespace moe {
+namespace lightllm {
+namespace ops{
 
 template <int TPB>
 __launch_bounds__(TPB) 
@@ -266,8 +267,6 @@ void GroupedTopKKernelLauncher(
         num_experts, num_expert_group, topk_group, topk, renormalize, softmax_or_sigmoid, 0, num_experts);
 }
 
-} // namespace moe
-
 void grouped_topk_cuda(
     torch::Tensor& topk_weights,                // [num_tokens, topk]
     torch::Tensor& correction_bias,             // [num_tokens, num_experts]
@@ -300,7 +299,7 @@ void grouped_topk_cuda(
     const at::cuda::OptionalCUDAGuard device_guard(device_of(gating_output));
     const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
     torch::Tensor softmax_workspace = torch::empty({workspace_size}, gating_output.options());
-    moe::GroupedTopKKernelLauncher(
+    GroupedTopKKernelLauncher(
         gating_output.data_ptr<float>(),
         correction_bias.defined() ? correction_bias.data_ptr<float>() : nullptr,
         topk_weights.data_ptr<float>(),
@@ -316,4 +315,30 @@ void grouped_topk_cuda(
         renormalize,
         softmax_or_sigmoid,
         stream);
-}
\ No newline at end of file
+}
+
+torch::Tensor grouped_topk(
+        torch::Tensor topk_weights,
+        torch::Tensor correction_bias,
+        torch::Tensor topk_indices,
+        torch::Tensor group_indices,
+        torch::Tensor gating_output,
+        int64_t  num_expert_group,
+        int64_t  topk_group,
+        int64_t  topk,
+        bool     renormalize,
+        std::string scoring_func,
+        torch::Tensor group_scores) {
+    // Wrapper exposed through the Python bindings: forwards every argument to grouped_topk_cuda.
+    grouped_topk_cuda(topk_weights, correction_bias, topk_indices, group_indices,
+                      gating_output,
+                      static_cast<int>(num_expert_group),  // int64_t -> int; no range check is done here
+                      static_cast<int>(topk_group),
+                      static_cast<int>(topk),
+                      renormalize, scoring_func, group_scores);
+    // grouped_topk_cuda receives topk_weights by reference; the same tensor handle is returned.
+    return topk_weights;
+}
+
+} // namespace ops
+} // namespace lightllm
\ No newline at end of file
diff --git a/lightllm-kernel/csrc/moe/grouped_topk_interface.cpp b/lightllm-kernel/csrc/moe/grouped_topk_interface.cpp
deleted file mode 100644
index f35c92caa..000000000
--- a/lightllm-kernel/csrc/moe/grouped_topk_interface.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-#include <torch/extension.h>
-#include "../../include/ops_common.h"
-
-
-void grouped_topk_cuda(
-    torch::Tensor& topk_weights,
-    torch::Tensor& correction_bias,
-    torch::Tensor& topk_indices,
-    torch::Tensor& group_indices,
-    torch::Tensor& gating_output,
-    int  num_expert_group,
-    int  topk_group,
-    int  topk,
-    bool renormalize,
-    std::string scoring_func,
-    torch::Tensor group_scores = torch::Tensor());
-
-torch::Tensor grouped_topk(
-    torch::Tensor topk_weights,
-    torch::Tensor correction_bias,
-    torch::Tensor topk_indices,
-    torch::Tensor group_indices,
-    torch::Tensor gating_output,
-    int  num_expert_group,
-    int  topk_group,
-    int  topk,
-    bool renormalize,
-    std::string scoring_func,
-    torch::Tensor group_scores /* = {} */) {
-
-    TORCH_CHECK(topk_weights.is_cuda(),   "topk_weights must be CUDA tensor");
-    TORCH_CHECK(gating_output.is_cuda(),  "gating_output must be CUDA tensor");
-
-    grouped_topk(topk_weights,
-                 correction_bias,
-                 topk_indices,
-                 group_indices,
-                 gating_output,
-                 num_expert_group,
-                 topk_group,
-                 topk,
-                 renormalize,
-                 scoring_func,
-                 group_scores);
-
-    // 就地写结果，所以这里直接返回topk_weights
-    return topk_weights;
-}
diff --git a/lightllm-kernel/csrc/norm/rmsnorm_bf16.cu b/lightllm-kernel/csrc/norm/rmsnorm_bf16.cu
new file mode 100755
index 000000000..95853db0d
--- /dev/null
+++ b/lightllm-kernel/csrc/norm/rmsnorm_bf16.cu
@@ -0,0 +1,350 @@
+#include "ops_common.h"
+#include "reduce/sm70.cuh"
+
+namespace lightllm {
+namespace ops {
+
+using namespace lightllm;
+
+// Scalar RMSNorm fallback for an arbitrary row width N: one block per row, TPB-strided element loops.
+template<int32_t TPB>
+__global__ void device_rmsnorm_align16_bf16_general(
+    bf16_t __restrict__ *X,           // [M, N] Input tensor pointer.
+    const bf16_t __restrict__ *W,     // [N] Weight tensor pointer.
+    bf16_t __restrict__ *Y,           // [M, N] Output tensor pointer.
+    const int32_t M,                  // Number of rows (not read by the kernel body).
+    const int32_t N,                  // Number of elements per row.
+    const fp32_t eps                  // Epsilon for numerical stability.
+) {
+    const fp32_t r_N = 1 / (fp32_t)N;       // Reciprocal of N.
+
+    const int32_t tid = threadIdx.x;
+    const int32_t bid = blockIdx.x;
+
+    // Each block processes one row. Widen before multiplying: bid * N in int32 overflows once M * N reaches 2^31.
+    bf16_t* _X = X + static_cast<int64_t>(bid) * N;
+    bf16_t* _Y = Y + static_cast<int64_t>(bid) * N;
+
+    // Each thread computes a partial sum of squares over its strided share of the row.
+    fp32_t local_square_sum = 0.0f;
+    for (int32_t i = tid; i < N; i += TPB) {
+        fp32_t tmp = cvt_bf16_f32(_X[i]);
+        local_square_sum += tmp * tmp;
+    }
+
+    // Reduce the partial sums across the block. NOTE(review): the helper is defined outside this
+    // file; the original comment states that it calls __syncthreads() internally.
+    fp32_t reduced_square_sum = lightllm::reduce::sm70::sync_block_reduce_sum_f32<TPB>(local_square_sum);
+    // Compute the mean square and then the inverse RMS normalization factor.
+    // For RMSNorm, the normalization factor is 1/sqrt(mean(x^2)+eps).
+    fp32_t mean_square = reduced_square_sum * r_N;
+    fp32_t inv_norm = rsqrtf(mean_square + eps);
+
+    // Normalize each element using the computed normalization factor.
+    for (int32_t i = tid; i < N; i += TPB) {
+        fp32_t x = cvt_bf16_f32(_X[i]);
+        fp32_t w = cvt_bf16_f32(W[i]);
+        // Apply normalization: multiply by inv_norm and then scale by the weight.
+        fp32_t ret = x * inv_norm * w;
+        _Y[i] = cvt_f32_bf16(ret);
+    }
+}
+
+// Vectorized RMSNorm for a runtime row width N; the launcher in this file selects it only when N % 8 == 0.
+template<int32_t TPB>
+__global__ void device_rmsnorm_align16_bf16_vpt(
+    bf16_t __restrict__ *X,           // [M, N] Input tensor pointer.
+    const bf16_t __restrict__ *W,     // [N] Weight tensor pointer.
+    bf16_t __restrict__ *Y,           // [M, N] Output tensor pointer.
+    const int32_t M,                  // Number of rows (not read by the kernel body).
+    const int32_t N,                  // Number of elements per row.
+    const fp32_t eps                  // Epsilon for numerical stability.
+) {
+    constexpr int32_t VPT = 8;              // Number of BF16 values processed per thread per iteration.
+    const fp32_t r_N = 1 / (fp32_t)N;       // Reciprocal of N.
+
+    const int32_t tid = threadIdx.x;
+    const int32_t bid = blockIdx.x;
+
+    // Each block processes one row. Widen before multiplying: bid * N in int32 overflows once M * N reaches 2^31.
+    bf16_t* _X = X + static_cast<int64_t>(bid) * N;
+    bf16_t* _Y = Y + static_cast<int64_t>(bid) * N;
+
+    // Dynamic shared memory that caches the row as bf16x2_t pairs, so X is read from global memory once.
+    // Each bf16x2_t holds 2 values, so N / 2 entries are used; the launcher passes
+    // N * sizeof(bf16_t) bytes as the dynamic shared-memory size.
+    extern __shared__ bf16x2_t workspace2[];
+
+    // Local arrays holding one VPT-wide chunk of input, weight and output.
+    bf16x2_t local_x[VPT / 2];
+    bf16x2_t local_w[VPT / 2];
+    bf16x2_t local_y[VPT / 2];
+
+    // Each thread computes a partial sum of squares.
+    fp32_t local_square_sum = 0.0f;
+    for (int32_t i = tid * VPT; i < N; i += TPB * VPT) {
+        // Load VPT BF16 elements from global memory (_X) into the local chunk (local_x).
+        vec_copy<sizeof(bf16_t) * VPT>(_X + i, local_x);
+        // Store the loaded data into shared memory.
+        // Divide the index by 2 because 'workspace2' is an array of bf16x2_t.
+        vec_copy<sizeof(bf16_t) * VPT>(local_x, workspace2 + (i >> 1));
+
+        // Compute the sum of squares for the VPT elements.
+        #pragma unroll
+        for (int32_t j = 0; j < VPT / 2; j++) {
+            fp32x2_t tmp = bf16x2_to_fp32x2(local_x[j]);
+            local_square_sum += (tmp.x * tmp.x + tmp.y * tmp.y);
+        }
+    }
+
+    // Reduce the partial sums across the block (the helper is expected to call __syncthreads()).
+    fp32_t reduced_square_sum = lightllm::reduce::sm70::sync_block_reduce_sum_f32<TPB>(local_square_sum);
+    // Compute the mean square and then the inverse RMS normalization factor.
+    // For RMSNorm, the normalization factor is 1/sqrt(mean(x^2)+eps).
+    fp32_t mean_square = reduced_square_sum * r_N;
+    fp32_t inv_norm = rsqrtf(mean_square + eps);
+
+    // Normalize each element using the computed normalization factor.
+    for (int32_t i = tid * VPT; i < N; i += TPB * VPT) {
+        // Re-load the cached chunk; this thread visits the same indices i it wrote in the first loop.
+        vec_copy<sizeof(bf16_t) * VPT>(workspace2 + (i >> 1), local_x);
+        // Load the corresponding weight values from global memory.
+        vec_copy<sizeof(bf16_t) * VPT>(W + i, local_w);
+
+        #pragma unroll
+        for (int32_t j = 0; j < VPT / 2; j++) {
+            fp32x2_t x = bf16x2_to_fp32x2(local_x[j]);
+            fp32x2_t w = bf16x2_to_fp32x2(local_w[j]);
+            // Apply normalization: multiply by inv_norm and then scale by the weight.
+            fp32x2_t ret = make_float2(
+                x.x * inv_norm * w.x,
+                x.y * inv_norm * w.y
+            );
+            local_y[j] = _float22bf162_rn(ret);
+        }
+        // Write the normalized chunk back to global memory.
+        vec_copy<sizeof(bf16_t) * VPT>(local_y, _Y + i);
+    }
+}
+
+/**
+ * @brief CUDA kernel to perform RMS normalization on a BF16 tensor.
+ *
+ * Each block processes one row of the input tensor. The kernel loads the
+ * data in a vectorized manner (as bf16x2_t pairs), computes the mean square,
+ * calculates the reciprocal square root (i.e. 1/sqrt(mean_square+eps)),
+ * and then normalizes the input row element-wise while scaling with a weight.
+ *
+ * @tparam TPB   Threads per block.
+ * @tparam N     Number of BF16 elements in one row (must be a multiple of VPT).
+ *
+ * @param X       Pointer to the input tensor in global memory. [M, N]
+ * @param W       Pointer to the weight tensor in global memory. [N]
+ * @param Y       Pointer to the output tensor in global memory. [M, N]
+ * @param M       Number of rows in the tensor (not read by the kernel body).
+ * @param eps     Epsilon for numerical stability.
+ */
+template<int32_t TPB, int32_t N>
+__global__
+void device_rmsnorm_align16_bf16(
+    bf16_t __restrict__ *X,           // [M, N] Input tensor pointer.
+    const bf16_t __restrict__ *W,     // [N] Weight tensor pointer.
+    bf16_t __restrict__ *Y,           // [M, N] Output tensor pointer.
+    const int32_t M,                  // Number of rows.
+    const fp32_t eps                  // Epsilon for numerical stability.
+) {
+    constexpr int32_t VPT = 8;                // Number of BF16 values processed per thread per iteration.
+    constexpr fp32_t r_N = 1 / (fp32_t)N;     // Reciprocal of N.
+
+    static_assert(N % 2 == 0, "N must be even.");
+    static_assert(N % VPT == 0, "N must be a multiple of VPT.");
+
+    const int32_t tid = threadIdx.x;
+    const int32_t bid = blockIdx.x;
+
+    // Each block processes one row. Widen before multiplying: bid * N in int32 overflows once M * N reaches 2^31.
+    bf16_t* _X = X + static_cast<int64_t>(bid) * N;
+    bf16_t* _Y = Y + static_cast<int64_t>(bid) * N;
+
+    // Shared memory workspace that caches the row as bf16x2_t pairs.
+    // Note: since each bf16x2_t holds 2 BF16 values, the workspace size is N/2.
+    __shared__ bf16x2_t workspace[N / 2];
+
+    // Local arrays holding one VPT-wide chunk of input, weight and output.
+    bf16x2_t local_x[VPT / 2];
+    bf16x2_t local_w[VPT / 2];
+    bf16x2_t local_y[VPT / 2];
+
+    // Each thread computes a partial sum of squares.
+    fp32_t local_square_sum = 0.0f;
+    #pragma unroll
+    for (int32_t i = tid * VPT; i < N; i += TPB * VPT) {
+        // Load VPT BF16 elements from global memory (_X) into the local chunk (local_x).
+        vec_copy<sizeof(bf16_t) * VPT>(_X + i, local_x);
+        // Store the loaded data into shared memory.
+        // Divide the index by 2 because 'workspace' is an array of bf16x2_t.
+        vec_copy<sizeof(bf16_t) * VPT>(local_x, workspace + (i >> 1));
+
+        // Compute the sum of squares for the VPT elements.
+        #pragma unroll
+        for (int32_t j = 0; j < VPT / 2; j++) {
+            fp32x2_t tmp = bf16x2_to_fp32x2(local_x[j]);
+            local_square_sum += (tmp.x * tmp.x + tmp.y * tmp.y);
+        }
+    }
+
+    // Reduce the partial sums across the block (the helper is expected to call __syncthreads()).
+    fp32_t reduced_square_sum = lightllm::reduce::sm70::sync_block_reduce_sum_f32<TPB>(local_square_sum);
+    // Compute the mean square and then the inverse RMS normalization factor.
+    // For RMSNorm, the normalization factor is 1/sqrt(mean(x^2)+eps).
+    fp32_t mean_square = reduced_square_sum * r_N;
+    fp32_t inv_norm = rsqrtf(mean_square + eps);
+
+    // Normalize each element using the computed normalization factor.
+    for (int32_t i = tid * VPT; i < N; i += TPB * VPT) {
+        // Re-load the cached chunk; this thread visits the same indices i it wrote in the first loop.
+        vec_copy<sizeof(bf16_t) * VPT>(workspace + (i >> 1), local_x);
+        // Load the corresponding weight values from global memory.
+        vec_copy<sizeof(bf16_t) * VPT>(W + i, local_w);
+
+        #pragma unroll
+        for (int32_t j = 0; j < VPT / 2; j++) {
+            fp32x2_t x = bf16x2_to_fp32x2(local_x[j]);
+            fp32x2_t w = bf16x2_to_fp32x2(local_w[j]);
+            // Apply normalization: multiply by inv_norm and then scale by the weight.
+            fp32x2_t ret = make_float2(
+                x.x * inv_norm * w.x,
+                x.y * inv_norm * w.y
+            );
+            local_y[j] = _float22bf162_rn(ret);
+        }
+        // Write the normalized chunk back to global memory.
+        vec_copy<sizeof(bf16_t) * VPT>(local_y, _Y + i);
+    }
+}
+
+/**
+ * @brief Launch the RMSNorm kernel for BF16 tensors.
+ *
+ * Validates X and W, makes them contiguous, selects a kernel from the row
+ * width N (a 4D X is flattened to [d0 * d1, d2 * d3]) and launches it on
+ * the current CUDA stream with one block per row.
+ *
+ * @param X    Input tensor, 2D [M, N] or 4D (BF16, CUDA).
+ * @param W    Weight tensor with at least N elements (BF16, CUDA).
+ * @param eps  Epsilon for numerical stability.
+ * @return     Newly allocated output tensor with the same shape as X.
+ */
+Tensor rmsnorm_align16_bf16(const Tensor &X, const Tensor &W, const fp32_t eps) {
+    TORCH_CHECK(X.ndimension() == 2 || X.ndimension() == 4, "Input tensor must be 2D or 4D");
+    TORCH_CHECK(X.is_cuda(), "Input tensor must be a CUDA tensor.");
+    TORCH_CHECK(X.scalar_type() == c10::ScalarType::BFloat16, "Input tensor must be BF16.");
+    TORCH_CHECK(W.is_cuda() && W.scalar_type() == c10::ScalarType::BFloat16, "Weight tensor must be a BF16 CUDA tensor.");
+
+    Tensor contiguous_X = X.is_contiguous() ? X : X.contiguous();
+    Tensor contiguous_W = W.is_contiguous() ? W : W.contiguous();
+
+    Tensor input_tensor, Y;
+    uint32_t M, N;
+
+    if (X.ndimension() == 2) {
+        M = contiguous_X.size(0);
+        N = contiguous_X.size(1);
+        input_tensor = contiguous_X;
+        Y = torch::empty_like(input_tensor);
+    } else {
+        const uint32_t d0 = contiguous_X.size(0);
+        const uint32_t d1 = contiguous_X.size(1);
+        const uint32_t d2 = contiguous_X.size(2);
+        const uint32_t d3 = contiguous_X.size(3);
+
+        M = d0 * d1;
+        N = d2 * d3;
+        input_tensor = contiguous_X.view({M, N});
+        Y = torch::empty_like(input_tensor);
+    }
+
+    TORCH_CHECK(W.numel() >= N, "Weight tensor must hold at least N elements; the kernels read W[0..N).");
+    if (M == 0 || N == 0) return Y.reshape(X.sizes());  // Nothing to normalize; a zero-sized grid is an invalid launch.
+    const int32_t blocks = M;  // Each CUDA block processes one row.
+    constexpr int64_t kVptSmemLimit = 47 * 1024;  // Default dynamic smem cap is 48 KB; 1 KB headroom in case the block reduce uses static smem.
+    // Kernel dispatch based on the value of N.
+    switch (N) {
+        case 768:
+            device_rmsnorm_align16_bf16<128, 768>
+            <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(input_tensor), PTR<bf16_t>(contiguous_W), PTR<bf16_t>(Y),
+                M, eps
+            );
+            break;
+        case 1024:
+            device_rmsnorm_align16_bf16<128, 1024>
+            <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(input_tensor), PTR<bf16_t>(contiguous_W), PTR<bf16_t>(Y),
+                M, eps
+            );
+            break;
+        case 2048:
+            device_rmsnorm_align16_bf16<128, 2048>
+            <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(input_tensor), PTR<bf16_t>(contiguous_W), PTR<bf16_t>(Y),
+                M, eps
+            );
+            break;
+        case 3200:
+            device_rmsnorm_align16_bf16<256, 3200>
+            <<<blocks, 256, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(input_tensor), PTR<bf16_t>(contiguous_W), PTR<bf16_t>(Y),
+                M, eps
+            );
+            break;
+        case 4096:
+            device_rmsnorm_align16_bf16<256, 4096>
+            <<<blocks, 256, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(input_tensor), PTR<bf16_t>(contiguous_W), PTR<bf16_t>(Y),
+                M, eps
+            );
+            break;
+        case 8192:
+            device_rmsnorm_align16_bf16<512, 8192>
+            <<<blocks, 512, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(input_tensor), PTR<bf16_t>(contiguous_W), PTR<bf16_t>(Y),
+                M, eps
+            );
+            break;
+        case 10240:
+            device_rmsnorm_align16_bf16<512, 10240>
+            <<<blocks, 512, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(input_tensor), PTR<bf16_t>(contiguous_W), PTR<bf16_t>(Y),
+                M, eps
+            );
+            break;
+        default: {
+            static constexpr int32_t TPB = 256;
+            const int64_t shared_mem_size = N * sizeof(bf16_t);  // Row cache the vectorized kernel keeps in dynamic shared memory.
+            if (N % 8 == 0 && shared_mem_size <= kVptSmemLimit) {
+                device_rmsnorm_align16_bf16_vpt<TPB>
+                <<<blocks, TPB, shared_mem_size, at::cuda::getCurrentCUDAStream()>>>(
+                    PTR<bf16_t>(input_tensor), PTR<bf16_t>(contiguous_W), PTR<bf16_t>(Y),
+                    M, N, eps
+                );
+            } else {  // Unaligned rows, or rows too wide to cache: scalar kernel without a shared-memory row cache.
+                device_rmsnorm_align16_bf16_general<TPB>
+                <<<blocks, TPB, 0, at::cuda::getCurrentCUDAStream()>>>(
+                    PTR<bf16_t>(input_tensor), PTR<bf16_t>(contiguous_W), PTR<bf16_t>(Y),
+                    M, N, eps
+                );
+            }
+        }
+    }
+
+    // Y was produced as [M, N]; restore the caller's 4D shape.
+    if (X.ndimension() == 4) {
+        Y = Y.reshape(X.sizes());
+    }
+
+    return Y;
+}
+
+} // namespace ops
+} // namespace lightllm
\ No newline at end of file
diff --git a/lightllm-kernel/csrc/ops_bindings.cpp b/lightllm-kernel/csrc/ops_bindings.cpp
index e21a9d376..b5ad19e74 100644
--- a/lightllm-kernel/csrc/ops_bindings.cpp
+++ b/lightllm-kernel/csrc/ops_bindings.cpp
@@ -1,43 +1,9 @@
 #include <torch/extension.h>
-#include "../include/ops_common.h"
+#include "ops_common.h"
 #include <pybind11/pybind11.h>
 
-void grouped_topk_cuda(
-        torch::Tensor& topk_weights,
-        torch::Tensor& correction_bias,
-        torch::Tensor& topk_indices,
-        torch::Tensor& group_indices,
-        torch::Tensor& gating_output,
-        int  num_expert_group,
-        int  topk_group,
-        int  topk,
-        bool renormalize,
-        std::string scoring_func,
-        torch::Tensor group_scores);
-
-
-torch::Tensor grouped_topk(
-        torch::Tensor topk_weights,
-        torch::Tensor correction_bias,
-        torch::Tensor topk_indices,
-        torch::Tensor group_indices,
-        torch::Tensor gating_output,
-        int64_t  num_expert_group,
-        int64_t  topk_group,
-        int64_t  topk,
-        bool     renormalize,
-        std::string scoring_func,
-        torch::Tensor group_scores) {
-
-    grouped_topk_cuda(topk_weights, correction_bias, topk_indices, group_indices,
-                      gating_output,
-                      static_cast<int>(num_expert_group),
-                      static_cast<int>(topk_group),
-                      static_cast<int>(topk),
-                      renormalize, scoring_func, group_scores);
-
-    return topk_weights;
-}
+namespace lightllm {
+namespace ops {
 
 PYBIND11_MODULE(_C, m) {
     m.def("grouped_topk", &grouped_topk,
@@ -53,4 +19,14 @@ PYBIND11_MODULE(_C, m) {
           py::arg("renormalize"),
           py::arg("scoring_func"),
           py::arg("group_scores") = torch::Tensor());
-}
\ No newline at end of file
+    m.def("rmsnorm_align16_bf16", &rmsnorm_align16_bf16, "RMSNORM (CUDA)");
+    m.def("pre_tp_norm_bf16", &pre_tp_norm_bf16, "PRE TP NORM (CUDA)");
+    m.def("post_tp_norm_bf16", &post_tp_norm_bf16, "POST TP NORM (CUDA)");
+    m.def("per_token_quant_bf16_fp8", &per_token_quant_bf16_fp8, "PER TOKEN QUANT (CUDA)");
+    m.def("add_norm_quant_bf16_fp8", &add_norm_quant_bf16_fp8, "ADD NORM QUANT FUSED (CUDA)");
+    m.def("gelu_per_token_quant_bf16_fp8", &gelu_per_token_quant_bf16_fp8, "GELU QUANT FUSED (CUDA)");
+    m.def("cutlass_scaled_mm", &cutlass_scaled_mm, "CUTLASS SCALED MM (CUDA)");
+}
+
+} // namespace ops
+} // namespace lightllm
\ No newline at end of file
diff --git a/lightllm-kernel/csrc/quant/per_token_quantize_bf16.cu b/lightllm-kernel/csrc/quant/per_token_quantize_bf16.cu
new file mode 100755
index 000000000..ba9a5877e
--- /dev/null
+++ b/lightllm-kernel/csrc/quant/per_token_quantize_bf16.cu
@@ -0,0 +1,342 @@
+#include "ops_common.h"
+#include "reduce/sm70.cuh"
+
+
+namespace lightllm {
+namespace ops {
+
+using namespace lightllm;
+
+// Scalar per-token BF16 -> FP8 E4M3 quantization for an arbitrary row width N (one block per row).
+template<int32_t TPB>
+__global__ void device_per_token_quant_bf16_to_fp8_general(
+    const bf16_t* __restrict__ input,  // Input tensor in BF16 format
+    fp8_e4m3_t* __restrict__ output,   // Output tensor in FP8 format
+    fp32_t* __restrict__ scales,       // Output scales for each token
+    const int64_t M,                  // Number of rows in the input tensor
+    const int64_t N                   // Number of elements per row
+) {
+    const int32_t bid = blockIdx.x;
+    const int32_t tid = threadIdx.x;
+    constexpr fp32_t FP8_E4M3_MAX = 448.0f; // Maximum value representable in FP8 E4M3 format
+    // One block per token (row); N is 64-bit here, so the row offset is computed in 64-bit.
+    const bf16_t* _input = input + bid * N; // Input pointer for the token
+    fp8_e4m3_t* _output  = output + bid * N; // Output pointer for the token
+
+    // Slot that receives this token's scale.
+    fp32_t* _scales = scales + bid;
+
+    // Per-element temporaries
+    fp8_e4m3_t local_f8;
+    bf16_t local_bf16;
+
+    extern __shared__ bf16_t workspace1[];  // Row cache; the launcher passes N * sizeof(bf16_t) bytes
+
+    fp32_t local_max = -FLT_MAX;
+    for (int32_t i = tid; i < N; i += TPB) {
+        local_bf16 = _input[i];
+        workspace1[i] = local_bf16;
+        // The scale must cover the largest magnitude, so track |x| (as the vectorized kernels do), not the signed value.
+        fp32_t tmp = cvt_bf16_f32(local_bf16);
+        local_max = fmaxf(local_max, fabsf(tmp));
+    }
+
+    // Reduce the maximum value across the block
+    const fp32_t reduced_max = lightllm::reduce::sm70::sync_block_reduce_max_f32<TPB>(local_max);
+
+    // Compute the scale factor with epsilon to avoid division by zero
+    constexpr fp32_t epsilon = 1e-7f;
+    const fp32_t scale = reduced_max / FP8_E4M3_MAX;
+    const fp32_t inv_scale = 1.0f / (scale + epsilon);
+
+    for (int32_t i = tid; i < N; i += TPB) {
+        local_bf16 = workspace1[i];  // Same indices this thread cached in the first loop
+
+        fp32_t tmp = cvt_bf16_f32(local_bf16);
+        fp32_t x = tmp * inv_scale;
+        local_f8 = fp8_e4m3_t(x);
+
+        _output[i] = local_f8;
+    }
+
+    if(tid == 0){
+        *_scales = scale;
+    }
+
+}
+
+// Vectorized per-token BF16 -> FP8 E4M3 quantization for a runtime row width N (the launcher uses it when N % 8 == 0).
+template<int32_t TPB>
+__global__ void device_per_token_quant_bf16_to_fp8_vpt(
+    const bf16_t* __restrict__ input,  // Input tensor in BF16 format
+    fp8_e4m3_t* __restrict__ output,   // Output tensor in FP8 format
+    fp32_t* __restrict__ scales,       // Output scales for each token
+    const int64_t M,                  // Number of rows in the input tensor
+    const int32_t N                   // Number of elements per row
+) {
+    constexpr int32_t VPT = 8;         // Values handled per thread per iteration
+
+    const int32_t bid = blockIdx.x;
+    const int32_t tid = threadIdx.x;
+    constexpr fp32_t FP8_E4M3_MAX = 448.0f; // Maximum value representable in FP8 E4M3 format
+    // One block per token. Widen before multiplying: bid * N in int32 overflows once M * N reaches 2^31.
+    const bf16_t* _input = input + static_cast<int64_t>(bid) * N; // Input pointer for the token
+    fp8_e4m3_t* _output  = output + static_cast<int64_t>(bid) * N; // Output pointer for the token
+
+    // Slot that receives this token's scale.
+    fp32_t* _scales = scales + bid;
+
+    // Local arrays for one VPT-wide chunk
+    fp8x4_e4m3_t local_f8[VPT / 4];
+    bf16x2_t local_bf16[VPT / 2];
+
+    extern __shared__ bf16x2_t workspace2[];  // Row cache; the launcher passes N * sizeof(bf16_t) bytes
+
+    fp32_t local_max = -FLT_MAX;
+    for (int32_t i = tid * VPT; i < N; i += TPB * VPT) {
+        // Load VPT BF16 elements from global memory (_input) into the local chunk (local_bf16).
+        vec_copy<sizeof(bf16_t) * VPT>(_input + i, local_bf16);
+        // Cache the chunk in shared memory; the index is halved because workspace2 holds bf16x2_t pairs.
+        vec_copy<sizeof(bf16_t) * VPT>(local_bf16, workspace2 + (i >> 1));
+
+        // Compute the max magnitude over the VPT elements.
+        #pragma unroll
+        for(int32_t j = 0; j< VPT/2; j++){
+            fp32x2_t tmp = bf16x2_to_fp32x2(local_bf16[j]);
+            fp32_t max = fmaxf(fabsf(tmp.x), fabsf(tmp.y));
+            local_max = fmaxf(local_max, max);
+        }
+    }
+
+    // Reduce the maximum value across the block
+    const fp32_t reduced_max = lightllm::reduce::sm70::sync_block_reduce_max_f32<TPB>(local_max);
+
+    // Compute the scale factor with epsilon to avoid division by zero
+    constexpr fp32_t epsilon = 1e-7f;
+    const fp32_t scale = reduced_max / FP8_E4M3_MAX;
+    const fp32_t inv_scale = 1.0f / (scale + epsilon);
+
+    for (int32_t i = tid * VPT; i < N; i += TPB * VPT) {
+        vec_copy<sizeof(bf16_t) * VPT>(workspace2 + (i >> 1), local_bf16);  // Same chunk this thread cached above
+
+        #pragma unroll
+        for (int32_t j = 0; j < VPT/4; j++) {
+            fp32x2_t x = bf16x2_to_fp32x2(local_bf16[2 * j + 0]);
+            fp32x2_t y = bf16x2_to_fp32x2(local_bf16[2 * j + 1]);
+            fp32x4_t ret = make_float4(
+                x.x * inv_scale,
+                x.y * inv_scale,
+                y.x * inv_scale,
+                y.y * inv_scale
+            );
+            local_f8[j] = fp8x4_e4m3_t(ret);
+        }
+
+        vec_copy<sizeof(fp8_e4m3_t) * VPT>(local_f8, _output + i);
+    }
+
+    if(tid == 0){
+        *_scales = scale;
+    }
+}
+
+
+
+// Vectorized per-token BF16 -> FP8 E4M3 quantization specialised on a compile-time row width N.
+template<int32_t TPB, int32_t N>
+__global__ void device_per_token_quant_bf16_to_fp8(
+    const bf16_t* __restrict__ input,  // Input tensor in BF16 format
+    fp8_e4m3_t* __restrict__ output,   // Output tensor in FP8 format
+    fp32_t* __restrict__ scales,       // Output scales for each token
+    const int64_t M                  // Number of rows in the input tensor
+) {
+    constexpr int32_t VPT = 8;         // Values handled per thread per iteration
+
+    static_assert(N % 2 == 0, "N must be even.");
+    static_assert(N % VPT == 0, "N must be a multiple of VPT.");
+
+    const int32_t bid = blockIdx.x;
+    const int32_t tid = threadIdx.x;
+    constexpr fp32_t FP8_E4M3_MAX = 448.0f; // Maximum value representable in FP8 E4M3 format
+    // One block per token. Widen before multiplying: bid * N in int32 overflows once M * N reaches 2^31.
+    const bf16_t* _input = input + static_cast<int64_t>(bid) * N; // Input pointer for the token
+    fp8_e4m3_t* _output  = output + static_cast<int64_t>(bid) * N; // Output pointer for the token
+
+    // Slot that receives this token's scale.
+    fp32_t* _scales = scales + bid;
+
+    // Local arrays for one VPT-wide chunk
+    fp8x4_e4m3_t local_f8[VPT / 4];
+    bf16x2_t local_bf16[VPT / 2];
+
+    __shared__ bf16x2_t workspace[N / 2];  // Row cache: N BF16 values stored as bf16x2_t pairs
+
+    fp32_t local_max = -FLT_MAX;
+    for (int32_t i = tid * VPT; i < N; i += TPB * VPT) {
+        // Load VPT BF16 elements from global memory (_input) into the local chunk (local_bf16).
+        vec_copy<sizeof(bf16_t) * VPT>(_input + i, local_bf16);
+        // Cache the chunk in shared memory; the index is halved because workspace holds bf16x2_t pairs.
+        vec_copy<sizeof(bf16_t) * VPT>(local_bf16, workspace + (i >> 1));
+
+        // Compute the max magnitude over the VPT elements.
+        #pragma unroll
+        for(int32_t j = 0; j< VPT/2; j++){
+            fp32x2_t tmp = bf16x2_to_fp32x2(local_bf16[j]);
+            fp32_t max = fmaxf(fabsf(tmp.x), fabsf(tmp.y));
+            local_max = fmaxf(local_max, max);
+        }
+    }
+
+    // Reduce the maximum value across the block
+    const fp32_t reduced_max = lightllm::reduce::sm70::sync_block_reduce_max_f32<TPB>(local_max);
+
+    // Compute the scale factor with epsilon to avoid division by zero
+    constexpr fp32_t epsilon = 1e-7f;
+    const fp32_t scale = reduced_max / FP8_E4M3_MAX;
+    const fp32_t inv_scale = 1.0f / (scale + epsilon);
+
+    for (int32_t i = tid * VPT; i < N; i += TPB * VPT) {
+        vec_copy<sizeof(bf16_t) * VPT>(workspace + (i >> 1), local_bf16);  // Same chunk this thread cached above
+
+        #pragma unroll
+        for (int32_t j = 0; j < VPT/4; j++) {
+            fp32x2_t x = bf16x2_to_fp32x2(local_bf16[2 * j + 0]);
+            fp32x2_t y = bf16x2_to_fp32x2(local_bf16[2 * j + 1]);
+            fp32x4_t ret = make_float4(
+                x.x * inv_scale,
+                x.y * inv_scale,
+                y.x * inv_scale,
+                y.y * inv_scale
+            );
+            local_f8[j] = fp8x4_e4m3_t(ret);
+        }
+
+        vec_copy<sizeof(fp8_e4m3_t) * VPT>(local_f8, _output + i);
+    }
+
+    if(tid == 0){
+        *_scales = scale;
+    }
+}
+
+
+// Row-wise ("per token") BF16 -> FP8 E4M3 quantization: writes the quantized rows to `output` and one fp32 scale per row to `scales`.
+void per_token_quant_bf16_fp8 (
+    Tensor& output, const Tensor& input, Tensor& scales
+) {
+    TORCH_CHECK(input.is_cuda(), "Input must be a CUDA tensor");
+    TORCH_CHECK(input.dim() == 2, "Input must be 2-dimensional");
+    TORCH_CHECK(input.scalar_type() == c10::kBFloat16, "Input must be BF16 type");
+    TORCH_CHECK(output.is_cuda() && output.is_contiguous() && output.element_size() == 1 && output.numel() >= input.numel(), "Output must be a contiguous 1-byte-element CUDA tensor at least as large as input");
+    TORCH_CHECK(scales.is_cuda() && scales.scalar_type() == c10::kFloat && scales.numel() >= input.size(0), "Scales must be an FP32 CUDA tensor with at least one element per input row");
+
+    Tensor contiguous_input = input.is_contiguous() ? input : input.contiguous();
+    Tensor contiguous_scales = scales.is_contiguous() ? scales : scales.contiguous();
+    const int64_t M = input.size(0);
+    const int64_t N = input.size(1);
+    if (M == 0 || N == 0) return;  // Empty input: a zero-sized grid is not a valid launch configuration.
+    const int32_t blocks = M;  // One block per row.
+    switch (N) {
+        case 16:
+            device_per_token_quant_bf16_to_fp8<128, 16>
+            <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(contiguous_input),
+                PTR<fp8_e4m3_t>(output),
+                PTR<fp32_t>(contiguous_scales),
+                M
+            );
+            break;
+        case 32:
+            device_per_token_quant_bf16_to_fp8<128, 32>
+            <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(contiguous_input),
+                PTR<fp8_e4m3_t>(output),
+                PTR<fp32_t>(contiguous_scales),
+                M
+            );
+            break;
+        case 64:
+            device_per_token_quant_bf16_to_fp8<128, 64>
+            <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(contiguous_input),
+                PTR<fp8_e4m3_t>(output),
+                PTR<fp32_t>(contiguous_scales),
+                M
+            );
+            break;
+        case 512:
+            device_per_token_quant_bf16_to_fp8<128, 512>
+            <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(contiguous_input),
+                PTR<fp8_e4m3_t>(output),
+                PTR<fp32_t>(contiguous_scales),
+                M
+            );
+            break;
+        case 1024:
+            device_per_token_quant_bf16_to_fp8<128, 1024>
+            <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(contiguous_input),
+                PTR<fp8_e4m3_t>(output),
+                PTR<fp32_t>(contiguous_scales),
+                M
+            );
+            break;
+        case 3200:
+            device_per_token_quant_bf16_to_fp8<128, 3200>
+            <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(contiguous_input),
+                PTR<fp8_e4m3_t>(output),
+                PTR<fp32_t>(contiguous_scales),
+                M
+            );
+            break;
+        case 4096:
+            device_per_token_quant_bf16_to_fp8<128, 4096>
+            <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(contiguous_input),
+                PTR<fp8_e4m3_t>(output),
+                PTR<fp32_t>(contiguous_scales),
+                M
+            );
+            break;
+        case 12800:
+            device_per_token_quant_bf16_to_fp8<256, 12800>
+            <<<blocks, 256, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(contiguous_input),
+                PTR<fp8_e4m3_t>(output),
+                PTR<fp32_t>(contiguous_scales),
+                M
+            );
+            break;
+        default: {
+            static constexpr int TPB = 128;
+            const int64_t shared_mem_size = N * sizeof(bf16_t);  // Both fallback kernels cache the whole row in dynamic shared memory.
+            TORCH_CHECK(shared_mem_size <= 48 * 1024, "per_token_quant_bf16_fp8: a row of ", N, " BF16 values does not fit in 48 KB of shared memory");
+            if (N % 8 == 0) {
+                device_per_token_quant_bf16_to_fp8_vpt<TPB>
+                <<<blocks, TPB, shared_mem_size, at::cuda::getCurrentCUDAStream()>>>(
+                    PTR<bf16_t>(contiguous_input),
+                    PTR<fp8_e4m3_t>(output),
+                    PTR<fp32_t>(contiguous_scales),
+                    M,
+                    N
+                );
+            } else {
+                device_per_token_quant_bf16_to_fp8_general<TPB>
+                <<<blocks, TPB, shared_mem_size, at::cuda::getCurrentCUDAStream()>>>(
+                    PTR<bf16_t>(contiguous_input),
+                    PTR<fp8_e4m3_t>(output),
+                    PTR<fp32_t>(contiguous_scales),
+                    M,
+                    N
+                );
+            }
+        }
+    }
+    // When `scales` was strided the kernels filled a contiguous copy; write the results back to the caller's tensor.
+    if (!scales.is_contiguous()) scales.copy_(contiguous_scales);
+}
+
+} // namespace ops
+} // namespace lightllm
\ No newline at end of file
diff --git a/lightllm-kernel/cutlass/include/cute/algorithm/axpby.hpp b/lightllm-kernel/cutlass/include/cute/algorithm/axpby.hpp
new file mode 100755
index 000000000..339743f49
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/algorithm/axpby.hpp
@@ -0,0 +1,95 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>
+
+#include <cute/tensor_impl.hpp>
+#include <cute/tensor_predicate.hpp>
+
+namespace cute
+{
+
+//
+// Overload accepting a temporary (rvalue) destination tensor.
+// Inside the body the parameter y is a named lvalue, so the call below
+// selects the lvalue-reference overload of axpby, which does the update.
+//
+template <class Alpha, class XEngine, class XLayout,
+          class Beta,  class YEngine, class YLayout,
+          class PrdTensor = TrivialPredTensor>
+CUTE_HOST_DEVICE void
+axpby(Alpha const& alpha,
+      Tensor<XEngine, XLayout> const& x,
+      Beta const& beta,
+      Tensor<YEngine, YLayout>&& y,
+      PrdTensor const& p = {})
+{
+  // Plain call statement: the function is void, so nothing is handed back
+  axpby(alpha, x, beta, y, p);
+}
+
+//
+// AXPBY: for every index i in [0, size(x)) whose predicate p(i) holds,
+//   y(i) = alpha * x(i) + beta * y(i)
+// If beta compares equal to zero, y(i) = alpha * x(i) is stored instead and
+// the previous value of y(i) is not read.
+//
+template <class Alpha, class XEngine, class XLayout,
+          class Beta,  class YEngine, class YLayout,
+          class PrdTensor = TrivialPredTensor>
+CUTE_HOST_DEVICE void
+axpby(Alpha const& alpha,
+      Tensor<XEngine, XLayout> const& x,
+      Beta const& beta,
+      Tensor<YEngine, YLayout>& y,
+      PrdTensor const& p = {})
+{
+  // Zero test on beta; a complex beta is zero only if both parts are zero
+  auto beta_is_zero = [&] () {
+    if constexpr (is_complex<Beta>::value) {
+      return beta.real() == Int<0>{} && beta.imag() == Int<0>{};
+    } else {
+      return beta == Int<0>{};
+    }
+    CUTE_GCC_UNREACHABLE;
+  } ();
+
+  CUTE_UNROLL
+  for (int idx = 0; idx < size(x); ++idx) {
+    // Entries with a false predicate are left untouched
+    if (p(idx)) {
+      y(idx) = (beta_is_zero ? alpha * x(idx) : alpha * x(idx) + beta * y(idx));
+    }
+  }
+}
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/algorithm/clear.hpp b/lightllm-kernel/cutlass/include/cute/algorithm/clear.hpp
new file mode 100755
index 000000000..0b3a8eaa1
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/algorithm/clear.hpp
@@ -0,0 +1,64 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>          // CUTE_HOST_DEVICE
+#include <cute/tensor_impl.hpp>     // cute::Tensor
+#include <cute/algorithm/fill.hpp>  // cute::fill
+
+namespace cute
+{
+
+//
+// Overload for temporary (rvalue) tensors. The named parameter is an lvalue
+// inside the body, so the call selects the lvalue-reference overload of clear.
+//
+template <class Engine, class Layout>
+CUTE_HOST_DEVICE void
+clear(Tensor<Engine, Layout>&& tensor)
+{
+  clear(tensor);
+}
+
+//
+// Overwrite every element of the tensor with a value-initialized element,
+// i.e. value_type{}, by delegating to fill.
+//
+template <class Engine, class Layout>
+CUTE_HOST_DEVICE void
+clear(Tensor<Engine, Layout>& tensor)
+{
+  using ValueType = typename Tensor<Engine,Layout>::value_type;
+  // A value-initialized temporary is the fill value
+  fill(tensor, ValueType{});
+}
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/algorithm/cooperative_copy.hpp b/lightllm-kernel/cutlass/include/cute/algorithm/cooperative_copy.hpp
new file mode 100755
index 000000000..9d080116d
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/algorithm/cooperative_copy.hpp
@@ -0,0 +1,333 @@
+/***************************************************************************************************
+* Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+* SPDX-License-Identifier: BSD-3-Clause
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are met:
+*
+* 1. Redistributions of source code must retain the above copyright notice, this
+* list of conditions and the following disclaimer.
+*
+* 2. Redistributions in binary form must reproduce the above copyright notice,
+* this list of conditions and the following disclaimer in the documentation
+* and/or other materials provided with the distribution.
+*
+* 3. Neither the name of the copyright holder nor the names of its
+* contributors may be used to endorse or promote products derived from
+* this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+**************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>
+#include <cute/layout.hpp>
+#include <cute/layout_composed.hpp> // cute::logical_divide
+#include <cute/swizzle.hpp>         // cute::Swizzle
+#include <cute/swizzle_layout.hpp>  // cute::get_nonswizzle_portion
+#include <cute/tensor_impl.hpp>     // cute::Tensor
+#include <cute/tensor_predicate.hpp>
+#include <cute/algorithm/copy.hpp>
+#include <cute/atom/copy_atom.hpp>
+
+namespace cute
+{
+
+// Element-wise copy of src into dst using a stride of NumThreads between the
+// indices of one thread: tid handles tid, tid + NumThreads, tid + 2*NumThreads, ...
+template <uint32_t NumThreads, class SrcEngine, class SrcLayout,
+          class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE void
+naive_cooperative_copy(uint32_t const& tid,
+                       Tensor<SrcEngine, SrcLayout> const& src,
+                       Tensor<DstEngine, DstLayout>& dst)
+{
+  auto num_elems = size(src);
+  if (!(tid < num_elems)) { return; }                          // No element for this thread
+  uint32_t full_end = (num_elems / NumThreads) * NumThreads;   // End of the full rounds
+  CUTE_UNROLL
+  for (uint32_t base = 0; base < full_end; base += NumThreads) {
+    dst[tid + base] = src[tid + base];
+  }
+  if (num_elems % NumThreads != 0) {                           // Partial last round
+    uint32_t tail_idx = tid + full_end;
+    if (tail_idx < num_elems) {
+      dst[tail_idx] = src[tail_idx];
+    }
+  }
+}
+
+// Accept mutable temporaries: forward a temporary dst to the lvalue overload.
+// NumThreads is not deducible from the arguments, so it is passed explicitly.
+template <uint32_t NumThreads, class SrcEngine, class SrcLayout,
+          class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE void
+naive_cooperative_copy(uint32_t                     const& tid,
+                       Tensor<SrcEngine, SrcLayout> const& src,
+                       Tensor<DstEngine, DstLayout>     && dst)
+{
+  return naive_cooperative_copy<NumThreads>(tid, src, dst);
+}
+
+// Heuristic that picks a "good" permutation of two tensors for later vectorization
+// and thread assignment: the right-inverse of the non-swizzle layout of a or of b.
+template <class AEngine, class ALayout, class BEngine, class BLayout>
+CUTE_HOST_DEVICE constexpr auto
+heuristic_permutation(Tensor<AEngine, ALayout> const& a,
+                      Tensor<BEngine, BLayout> const& b)
+{
+  // Candidate permutations, one per tensor
+  auto a_inv = right_inverse(get_nonswizzle_portion(a.layout()));
+  auto b_inv = right_inverse(get_nonswizzle_portion(b.layout()));
+
+  // A tensor counts as swizzled if its engine or its layout carries swizzle bits
+  constexpr bool a_swizzled = get_swizzle_t<AEngine>::num_bits != 0 ||
+                              get_swizzle_t<ALayout>::num_bits != 0;
+  constexpr bool b_swizzled = get_swizzle_t<BEngine>::num_bits != 0 ||
+                              get_swizzle_t<BLayout>::num_bits != 0;
+
+  // Score bits, most to least significant: swizzled, in smem, larger inverse
+  constexpr uint8_t a_score = (uint8_t(a_swizzled)                 << 2) |
+                              (uint8_t(is_smem<AEngine>::value)    << 1) |
+                              (uint8_t(size(a_inv) > size(b_inv))  << 0);
+  constexpr uint8_t b_score = (uint8_t(b_swizzled)                 << 2) |
+                              (uint8_t(is_smem<BEngine>::value)    << 1) |
+                              (uint8_t(size(b_inv) > size(a_inv))  << 0);
+
+  // Ties go to a
+  if constexpr (a_score >= b_score) { return a_inv; }
+  else                              { return b_inv; }
+}
+
+// cooperative_copy<NumThreads, MaxVecBits>(tid, src, dst)
+// Use NumThreads to copy Tensor src to Tensor dst with element-wise vectorization up to MaxVecBits.
+// @pre 0 <= @a tid < NumThreads
+// @pre Tensors @a src and @a dst are aligned up to MaxVecBits.
+//      That is, pointers and dynamic strides are assumed to be aligned up to MaxVecBits.
+// @pre Shapes of @a src and @a dst are static with equal size, and both tensors have the same value_type.
+template <uint32_t NumThreads, uint32_t MaxVecBits,
+          class SrcEngine, class SrcLayout,
+          class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE
+void
+cooperative_copy(uint32_t                     const& tid,
+                 Tensor<SrcEngine, SrcLayout> const& src,
+                 Tensor<DstEngine, DstLayout>      & dst)
+{
+  // Assumes the shapes are static, can generalize/fallback
+  CUTE_STATIC_ASSERT_V(is_static<decltype(shape(src))>{} && is_static<decltype(shape(dst))>{});
+  CUTE_STATIC_ASSERT_V(size(src) == size(dst));
+  // Assumes the types are the same, can generalize/fallback
+  static_assert(cute::is_same<typename SrcEngine::value_type, typename DstEngine::value_type>::value);
+  static_assert(MaxVecBits == sizeof_bits_v<typename SrcEngine::value_type> ||
+                MaxVecBits == 8 || MaxVecBits == 16 || MaxVecBits == 32 || MaxVecBits == 64 || MaxVecBits == 128,
+                "Expected MaxVecBits to be value size or 8 or 16 or 32 or 64 or 128 for alignment and performance.");
+  // Check that the tensors are likely shared across threads: either gmem or smem
+  static_assert((is_gmem<SrcEngine>::value || is_smem<SrcEngine>::value),
+                "cooperative_copy expects shared gmem or smem source tensor.");
+  static_assert((is_gmem<DstEngine>::value || is_smem<DstEngine>::value),
+                "cooperative_copy expects shared gmem or smem destination tensor.");
+  // Precondition on tid, checked with assert (DEBUG)
+  assert(tid < NumThreads);
+  // Precondition on pointer alignment (MaxVecBits rounded up to whole bytes), checked with assert (DEBUG)
+  assert(is_byte_aligned<ceil_div(MaxVecBits,8u)>(raw_pointer_cast(src.data())));
+  assert(is_byte_aligned<ceil_div(MaxVecBits,8u)>(raw_pointer_cast(dst.data())));
+
+#if 0
+      if (thread0()) {
+        print("   "); print("cooperative_copy\n");
+        print("   "); print("NumThreads: "); print(NumThreads); print("\n");
+        print("   "); print("MaxVecBits: "); print(MaxVecBits); print("\n");
+        print("   "); print("src: "); print(src); print("\n");
+        print("   "); print("dst: "); print(dst); print("\n");
+      }
+#ifdef __CUDA_ARCH__
+      __syncthreads();
+#endif
+#endif
+
+  // The common layout of the two tensors that can be vectorized over elements and threads
+  // vidx -> coord
+  auto common_layout = heuristic_permutation(src, dst);
+
+  // Apply the common layout to both tensors and coalesce the result
+  // (V, rest)
+  Tensor src_a = coalesce(logical_divide(src, common_layout), Shape<_1,_1>{});
+  Tensor dst_a = coalesce(logical_divide(dst, common_layout), Shape<_1,_1>{});
+
+  //
+  // Determine vectorization of elems and thrs based on src/dst size and number of threads
+  // NOTE: This heuristic promotes parallelization over vectorization
+  //
+
+  // The number of elements and number of bits
+  constexpr int  elem_bits = sizeof_bits_v<typename SrcEngine::value_type>;
+  constexpr int total_elem = size(SrcLayout{});
+
+  // The number of elements that can be vectorized in values
+  constexpr int common_elem = decltype(max_common_vector(src_a, dst_a))::value;
+
+#if 0
+      if (thread0()) {
+        print("   "); print("common_layout: "); print(common_layout); print("\n");
+        print("   "); print("src_a: "); print(src_a); print("\n");
+        print("   "); print("dst_a: "); print(dst_a); print("\n");
+      }
+#ifdef __CUDA_ARCH__
+      __syncthreads();
+#endif
+#endif
+
+  // Pick the copy strategy based on whether NumThreads divides total_elem
+  if constexpr (total_elem % NumThreads != 0) {
+    // Not attempting to find a partitioning pattern, fallback to dynamically indexed slowpath
+
+    if constexpr (common_elem > 1 && MaxVecBits > elem_bits) {
+      // Vectorize with the gcd (in bits) of both layouts' max alignment and MaxVecBits
+      constexpr auto max_align_src = elem_bits * decltype(max_alignment(src_a.layout()))::value;
+      constexpr auto max_align_dst = elem_bits * decltype(max_alignment(dst_a.layout()))::value;
+      constexpr auto vec_bits = gcd(max_align_src, max_align_dst, MaxVecBits);
+      using VecType = uint_bit_t<vec_bits>;
+
+      static_assert(vec_bits % elem_bits == 0, "Expected divisibility");
+      static_assert((vec_bits >= 8), "No support for subbyte copying");
+
+      Tensor src_v = recast<VecType const>(src_a);
+      Tensor dst_v = recast<VecType      >(dst_a);
+
+#if 0
+      if (thread0()) {
+        print("   "); print("cooperative_copy -- naive\n");
+        print("   "); print("src_v: "); print(src_v); print("\n");
+        print("   "); print("dst_v: "); print(dst_v); print("\n");
+      }
+#ifdef __CUDA_ARCH__
+      __syncthreads();
+#endif
+#endif
+
+      naive_cooperative_copy<NumThreads>(tid, src_v, dst_v);
+    } else {
+      naive_cooperative_copy<NumThreads>(tid, src_a, dst_a);
+    }
+  } else {
+    // If the tensors can be equally partitioned by the threads,
+    // compute vectorization widths in elements and threads.
+
+    // If there are too many threads to allow a full vectorized copy, truncate the vectorization
+    constexpr int total_bits = total_elem * elem_bits;
+    constexpr int max_bits_per_thr = total_bits / NumThreads;
+    // At least elem_bits, at most common_bits
+    constexpr int common_bits = common_elem * elem_bits;
+    constexpr int vec_bits = cute::max(elem_bits, cute::gcd(common_bits, int(MaxVecBits), max_bits_per_thr));
+
+    // Should account for vec_bits < 8 and/or vec_elem <= 1
+    // And also account for subbyte types, which could cause race conditions
+    // Want to ENFORCE sufficient vectorization in those cases
+    static_assert(vec_bits % elem_bits == 0, "Expected divisibility");
+    static_assert(vec_bits >= 8, "No support for subbyte copying");
+
+    using VecType = uint_bit_t<vec_bits>;
+    constexpr int vec_elem = vec_bits / elem_bits;
+    // Number of threads that take part in the copy (at most NumThreads)
+    constexpr int vec_thrs = cute::min(int(NumThreads), total_elem / vec_elem);
+
+    //
+    // Determine the partitioning patterns for the vec_elems and vec_thrs
+    //
+
+    // Distribute the rest of the V*T to some consistent portion outside of the common_layout, if needed
+    auto common_domain_src = domain_distribute(shape(src_a), Int<vec_elem*vec_thrs>{});
+    auto common_domain_dst = domain_distribute(shape(dst_a), Int<vec_elem*vec_thrs>{});
+
+    // Make sure for now, could fall back here instead
+    CUTE_STATIC_ASSERT_V(size(common_domain_src) == Int<vec_elem*vec_thrs>{});
+    CUTE_STATIC_ASSERT_V(compatible(common_domain_src, common_domain_dst) ||
+                         compatible(common_domain_dst, common_domain_src));
+    // Use the "more specific" domain for the extra elements of V*T
+    auto common_domain = conditional_return(compatible(common_domain_src, common_domain_dst),
+                                            common_domain_dst, common_domain_src);
+
+    // Construct the tiler
+    auto tiler_vt = common_domain.with_shape(Int<vec_elem>{}, Int<vec_thrs>{});
+
+    // Apply the tiler and slice out the portion that belongs to tid
+    Tensor src_v = logical_divide(src_a, tiler_vt)(make_coord(_,tid),_);
+    Tensor dst_v = logical_divide(dst_a, tiler_vt)(make_coord(_,tid),_);
+
+#if 0
+      if (thread0()) {
+        print("   "); print("cooperative_copy -- vec\n");
+        print("   "); print("Used vector: ");  print(vec_elem); print("\n");
+        print("   "); print("Used threads: ");  print(vec_thrs); print("\n");
+        print("   "); print("tiler_vt: "); print(tiler_vt); print("\n");
+        print("   "); print("src_v: "); print(src_v); print("\n");
+        print("   "); print("dst_v: "); print(dst_v); print("\n");
+        print("   "); print("recast<VecType const>(src_v): "); print(recast<VecType const>(src_v)); print("\n");
+        print("   "); print("recast<VecType      >(dst_v): "); print(recast<VecType      >(dst_v)); print("\n");
+      }
+#ifdef __CUDA_ARCH__
+      __syncthreads();
+#endif
+#endif
+
+    // If we're using all threads (static) or the tid is in-range (dynamic)
+    if (vec_thrs == NumThreads or tid < vec_thrs) {
+      return copy_if(TrivialPredTensor{}, recast<VecType const>(src_v), recast<VecType>(dst_v));
+    }
+  }
+}
+
+// Overload without an explicit MaxVecBits: the maximum vectorization width
+// defaults to the bit size of the source value_type.
+template <uint32_t NumThreads, class SrcEngine, class SrcLayout,
+          class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE void
+cooperative_copy(uint32_t const& tid,
+                 Tensor<SrcEngine, SrcLayout> const& src,
+                 Tensor<DstEngine, DstLayout>& dst)
+{
+  // Bit width of one source element, used as MaxVecBits
+  constexpr uint32_t value_bits = sizeof_bits_v<typename SrcEngine::value_type>;
+  cooperative_copy<NumThreads, value_bits>(tid, src, dst);
+}
+
+//
+// Accept mutable temporaries
+//
+
+// Temporary-dst overload with the default vectorization width. dst is a named
+// lvalue inside the body, so the call selects the lvalue-reference overload.
+template <uint32_t NumThreads, class SrcEngine, class SrcLayout,
+          class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE void
+cooperative_copy(uint32_t const& tid,
+                 Tensor<SrcEngine, SrcLayout> const& src,
+                 Tensor<DstEngine, DstLayout>&& dst)
+{
+  cooperative_copy<NumThreads>(tid, src, dst);
+}
+
+// Temporary-dst overload with an explicit MaxVecBits. dst is a named lvalue
+// inside the body, so the call selects the lvalue-reference overload.
+template <uint32_t NumThreads, uint32_t MaxVecBits, class SrcEngine, class SrcLayout,
+          class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE void
+cooperative_copy(uint32_t const& tid,
+                 Tensor<SrcEngine, SrcLayout> const& src,
+                 Tensor<DstEngine, DstLayout>&& dst)
+{
+  cooperative_copy<NumThreads, MaxVecBits>(tid, src, dst);
+}
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/algorithm/cooperative_gemm.hpp b/lightllm-kernel/cutlass/include/cute/algorithm/cooperative_gemm.hpp
new file mode 100755
index 000000000..2c91ce6f4
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/algorithm/cooperative_gemm.hpp
@@ -0,0 +1,512 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>
+#include <cute/util/type_traits.hpp>
+
+#include <cute/atom/mma_atom.hpp>
+
+#include <cute/algorithm/axpby.hpp>
+#include <cute/algorithm/functional.hpp>
+#include <cute/algorithm/gemm.hpp>
+
+#include <cute/tensor_impl.hpp>
+
+namespace cute
+{
+
+//
+// Cooperative Shared-Memory GEMMs
+//
+
+namespace detail {
+
+// Predicated Cooperative GEMM: bounds-predicated loads of sA/sB into register fragments, MMA per k_block, predicated update of sC
+template <class... Args,
+          class Alpha, class TA, class ALayout, class TB, class BLayout,
+          class Beta,  class TC, class CLayout,
+          class ALoadTransformOp, class BLoadTransformOp,
+          class CLoadTransformOp, class CStoreTransformOp,
+          __CUTE_REQUIRES(ALayout::rank == 2 && is_smem<TA>::value &&
+                          BLayout::rank == 2 && is_smem<TB>::value &&
+                          CLayout::rank == 2 && is_smem<TC>::value)>
+CUTE_HOST_DEVICE
+void
+cooperative_gemm_predication(ThrMMA<Args...> const& thr_mma,
+                             Alpha const& alpha,
+                             Tensor<TA, ALayout> sA,
+                             Tensor<TB, BLayout> sB,
+                             Beta  const& beta,
+                             Tensor<TC, CLayout> sC,
+                             ALoadTransformOp  const& sA_load_op,  // transforms A values before use in GEMM
+                             BLoadTransformOp  const& sB_load_op,  // transforms B values before use in GEMM
+                             CLoadTransformOp  const& sC_load_op,  // transforms C values before use in GEMM
+                             CStoreTransformOp const& sC_store_op) // transforms results before they are stored to C
+{
+  using TypeA = typename TA::value_type;
+  using TypeB = typename TB::value_type;
+  using TypeC = typename TC::value_type;
+
+  //
+  // MMA Partitioning
+  //
+
+  // Partition the sA, sB, and sC tiles across the threads for the MMA
+  Tensor tCsA = thr_mma.partition_A(sA);                            // (MMA,MMA_M,MMA_K)
+  Tensor tCsB = thr_mma.partition_B(sB);                            // (MMA,MMA_N,MMA_K)
+  Tensor tCsC = thr_mma.partition_C(sC);                            // (MMA,MMA_M,MMA_N)
+
+  // Create register tensors for the MMA to operate on
+  Tensor tCrA = thr_mma.make_fragment_A(tCsA);                      // (MMA,MMA_M,MMA_K)
+  Tensor tCrB = thr_mma.make_fragment_B(tCsB);                      // (MMA,MMA_N,MMA_K)
+  Tensor tCrC = thr_mma.make_fragment_C(tCsC);                      // (MMA,MMA_M,MMA_N)
+
+#if 0
+  if (thread0()) {
+    print("  sA: "); print(  sA); print("\n");
+    print("  sB: "); print(  sB); print("\n");
+    print("  sC: "); print(  sC); print("\n");
+    print(thr_mma);
+    print("tCsA: "); print(tCsA); print("\n");
+    print("tCsB: "); print(tCsB); print("\n");
+    print("tCsC: "); print(tCsC); print("\n");
+    print("tCrA: "); print(tCrA); print("\n");
+    print("tCrB: "); print(tCrB); print("\n");
+    print("tCrC: "); print(tCrC); print("\n");
+  }
+#endif
+
+  //
+  // PREDICATION
+  //
+
+  // Create coordinate tensors for the problem
+  Tensor cA = make_identity_tensor(shape(sA));                      // (M,K) -> (m,k)
+  Tensor cB = make_identity_tensor(shape(sB));                      // (N,K) -> (n,k)
+
+  // Repeat partitioning with thr_mma
+  Tensor tCcA = thr_mma.partition_A(cA);                            // (MMA,MMA_M,MMA_K) -> (m,k)
+  Tensor tCcB = thr_mma.partition_B(cB);                            // (MMA,MMA_N,MMA_K) -> (n,k)
+
+  // Allocate the preds for MMA- and MMA_MN-modes
+  Tensor tCpA = make_tensor<bool>(make_shape(size<0>(tCsA), size<1>(tCsA)));
+  Tensor tCpB = make_tensor<bool>(make_shape(size<0>(tCsB), size<1>(tCsB)));
+
+  // Populate the predicates on M and N: true where the mode-0 coordinate is less than shape<0> of sA / sB
+  CUTE_UNROLL
+  for (int i = 0; i < size(tCpA); ++i) {
+    tCpA(i) = elem_less(get<0>(tCcA(_,_,Int<0>{})(i)), shape<0>(sA));
+  }
+  CUTE_UNROLL
+  for (int i = 0; i < size(tCpB); ++i) {
+    tCpB(i) = elem_less(get<0>(tCcB(_,_,Int<0>{})(i)), shape<0>(sB));
+  }
+
+#if 0
+  if (thread0()) {
+    print("  cA: "); print(  cA); print("\n");
+    print("  cB: "); print(  cB); print("\n");
+    print("tCcA: "); print(tCcA); print("\n");
+    print("tCcB: "); print(tCcB); print("\n");
+    print_tensor(tCpA);
+    print_tensor(tCpB);
+  }
+#endif
+
+  //
+  // PREFETCH k_block = 0
+  //   Condition the k-predication on (static) k_block == K_BLOCK_MAX-1, the last k_block
+  //   Assumes the MMA-tiling in K is trivial
+  //
+  // Number of k-blocks, taken from mode 2 of the A fragment
+  constexpr int K_BLOCK_MAX = size<2>(tCrA);
+  // Elements that are predicated off are set to TypeA{} / TypeB{} instead of being loaded
+  CUTE_UNROLL
+  for (int m = 0; m < size<1>(tCrA); ++m) {     // Copy MMA_M
+    CUTE_UNROLL
+    for (int i = 0; i < size<0>(tCrA); ++i) {   // Copy MMA_I
+      tCrA(i,m,0) = (tCpA(i,m) && (0 < K_BLOCK_MAX-1 || elem_less(get<1>(tCcA(i,m,0)), shape<1>(sA)))) ? sA_load_op(tCsA(i,m,0)) : TypeA{};
+    }
+  }
+  CUTE_UNROLL
+  for (int n = 0; n < size<1>(tCrB); ++n) {     // Copy MMA_N
+    CUTE_UNROLL
+    for (int i = 0; i < size<0>(tCrB); ++i) {   // Copy MMA_I
+      tCrB(i,n,0) = (tCpB(i,n) && (0 < K_BLOCK_MAX-1 || elem_less(get<1>(tCcB(i,n,0)), shape<1>(sB)))) ? sB_load_op(tCsB(i,n,0)) : TypeB{};
+    }
+  }
+  //
+  // MAINLOOP
+  //
+
+  // Clear accumulators
+  clear(tCrC);
+
+  CUTE_UNROLL
+  for (int k_block = 0; k_block < K_BLOCK_MAX; ++k_block)
+  {
+    if (k_block < K_BLOCK_MAX-1)   // static-if not the last k_block
+    {
+      int k_next = k_block + 1;    // Load k_next block
+
+      //   Condition the k-predication on (static) k_block == K_BLOCK_MAX-1, the last k_block
+      //   Assumes the MMA-tiling in K is trivial
+
+      CUTE_UNROLL
+      for (int m = 0; m < size<1>(tCrA); ++m) {       // Copy MMA_M
+        CUTE_UNROLL
+        for (int i = 0; i < size<0>(tCrA); ++i) {     // Copy MMA_I
+          tCrA(i,m,k_next) = (tCpA(i,m) && (k_next < K_BLOCK_MAX-1 || elem_less(get<1>(tCcA(i,m,k_next)), shape<1>(sA)))) ? sA_load_op(tCsA(i,m,k_next)) : TypeA{};
+        }
+      }
+      CUTE_UNROLL
+      for (int n = 0; n < size<1>(tCrB); ++n) {       // Copy MMA_N
+        CUTE_UNROLL
+        for (int i = 0; i < size<0>(tCrB); ++i) {     // Copy MMA_I
+          tCrB(i,n,k_next) = (tCpB(i,n) && (k_next < K_BLOCK_MAX-1 || elem_less(get<1>(tCcB(i,n,k_next)), shape<1>(sB)))) ? sB_load_op(tCsB(i,n,k_next)) : TypeB{};
+        }
+      }
+    }
+    // GEMM on k_block in registers
+    gemm(thr_mma, tCrA(_,_,k_block), tCrB(_,_,k_block), tCrC);
+  }
+
+  //
+  // Epilogue
+  //
+
+  // Create coordinate tensors for the problem
+  Tensor cC   = make_identity_tensor(shape(sC));                     // (M,N) -> (m,n)
+  // Repeat partitioning with thr_mma
+  Tensor tCcC = thr_mma.partition_C(cC);                             // (MMA,MMA_M,MMA_N) -> (m,n)
+  // When beta equals a value-initialized Beta, the existing sC values are not read below
+  const bool isBetaZero = (beta == Beta{});
+
+  // Custom axpby_if for now
+  CUTE_UNROLL
+  for (int i = 0; i < size(tCrC); ++i)
+  {
+    if (elem_less(tCcC(i), shape(sC)))
+    {
+      tCsC(i) = sC_store_op(isBetaZero ? alpha * static_cast<TypeC>(tCrC(i))
+                                       : alpha * static_cast<TypeC>(tCrC(i)) +
+                                          beta * static_cast<TypeC>(sC_load_op(tCsC(i))));
+    }
+  }
+}
+
+// Slow fallback path: predicated cooperative GEMM addressed by thread index.
+// Takes this thread's slice of tiled_mma and forwards to the ThrMMA overload.
+template <class... Args,
+          class Alpha, class TA, class ALayout, class TB, class BLayout,
+          class Beta,  class TC, class CLayout,
+          class ALoadTransformOp, class BLoadTransformOp,
+          class CLoadTransformOp, class CStoreTransformOp,
+          __CUTE_REQUIRES(ALayout::rank == 2 && is_smem<TA>::value &&
+                          BLayout::rank == 2 && is_smem<TB>::value &&
+                          CLayout::rank == 2 && is_smem<TC>::value)>
+CUTE_HOST_DEVICE void
+cooperative_gemm_predication(uint32_t thread_idx,
+                             TiledMMA<Args...> const& tiled_mma,
+                             Alpha const& alpha,
+                             Tensor<TA, ALayout> sA,
+                             Tensor<TB, BLayout> sB,
+                             Beta const& beta,
+                             Tensor<TC, CLayout> sC,
+                             ALoadTransformOp const& sA_load_op,    // applied to A values on load
+                             BLoadTransformOp const& sB_load_op,    // applied to B values on load
+                             CLoadTransformOp const& sC_load_op,    // applied to C values on load
+                             CStoreTransformOp const& sC_store_op)  // applied to results before the store to C
+{
+  cooperative_gemm_predication(tiled_mma.get_thread_slice(thread_idx),
+                               alpha, sA, sB, beta, sC,
+                               sA_load_op, sB_load_op, sC_load_op, sC_store_op);
+}
+
+// Unpredicated Cooperative GEMM: sC = alpha * (sA * sB) + beta * sC, without any predication of the loads or stores
+template <class SmemCopyOpA, class SmemCopyOpB, class SmemCopyOpC,
+          class... Args,
+          class Alpha, class TA, class ALayout, class TB, class BLayout,
+          class Beta,  class TC, class CLayout,
+          class ALoadTransformOp, class BLoadTransformOp,
+          class CLoadTransformOp, class CStoreTransformOp,
+          __CUTE_REQUIRES(ALayout::rank == 2 && is_smem<TA>::value &&
+                          BLayout::rank == 2 && is_smem<TB>::value &&
+                          CLayout::rank == 2 && is_smem<TC>::value)>
+CUTE_HOST_DEVICE
+void
+cooperative_gemm_no_predication(uint32_t thread_idx,
+                                TiledMMA<Args...> const& tiled_mma,
+                                Alpha const& alpha,
+                                Tensor<TA, ALayout> sA,
+                                Tensor<TB, BLayout> sB,
+                                Beta  const& beta,
+                                Tensor<TC, CLayout> sC,
+                                ALoadTransformOp  const& sA_load_op,  // transforms A values before use in GEMM
+                                BLoadTransformOp  const& sB_load_op,  // transforms B values before use in GEMM
+                                CLoadTransformOp  const& sC_load_op,  // transforms C values before use in GEMM
+                                CStoreTransformOp const& sC_store_op) // transforms results before they are stored to C
+{
+  using TypeA = typename TA::value_type;
+  using TypeB = typename TB::value_type;
+  using TypeC = typename TC::value_type;
+
+  // ThrMMA: the slice of the tiled MMA that belongs to thread_idx
+  auto thr_mma = tiled_mma.get_thread_slice(thread_idx);
+
+  //
+  // MMA Partitioning
+  //
+
+  Tensor tCsC = thr_mma.partition_C(sC);                              // this thread's partition of sC
+  // Create register tensors for the MMA to operate on
+  Tensor tCrA  = thr_mma.partition_fragment_A(sA);                    // (MMA,MMA_M,MMA_K)
+  Tensor tCrB  = thr_mma.partition_fragment_B(sB);                    // (MMA,MMA_N,MMA_K)
+  Tensor tCrC  = thr_mma.make_fragment_C(tCsC);                       // (MMA,MMA_M,MMA_N)
+
+  using CopyOpAType = SmemCopyOpA;                                    // copy op used for sA -> tCrA
+  using CopyOpBType = SmemCopyOpB;                                    // copy op used for sB -> tCrB
+
+  auto smem_tiled_copy_A = make_tiled_copy_A(Copy_Atom<CopyOpAType, TypeA>{}, thr_mma);
+  auto smem_thr_copy_A   = smem_tiled_copy_A.get_thread_slice(thread_idx);
+  Tensor tCsA            = smem_thr_copy_A.partition_S(sA);           // copy source: this thread's part of sA
+  Tensor tCrA_copy_view  = smem_thr_copy_A.retile_D(tCrA);            // copy destination: tCrA retiled for the copy
+  CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(tCrA_copy_view));             // CPY_M
+  CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCrA_copy_view));             // CPY_K
+
+  auto smem_tiled_copy_B = make_tiled_copy_B(Copy_Atom<CopyOpBType, TypeB>{}, thr_mma);
+  auto smem_thr_copy_B   = smem_tiled_copy_B.get_thread_slice(thread_idx);
+  Tensor tCsB            = smem_thr_copy_B.partition_S(sB);           // copy source: this thread's part of sB
+  Tensor tCrB_copy_view  = smem_thr_copy_B.retile_D(tCrB);            // copy destination: tCrB retiled for the copy
+  CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<1>(tCrB_copy_view));            // CPY_N
+  CUTE_STATIC_ASSERT_V(size<2>(tCsB) == size<2>(tCrB_copy_view));            // CPY_K
+
+#if 0
+  if (thread0()) {
+    print("  sA: "); print(sA); print("\n");
+    print("  sB: "); print(sB); print("\n");
+    print("  sC: "); print(sC); print("\n");
+    print(thr_mma); print("\n");
+    print("tCsC: "); print(tCsC); print("\n");
+    print("tCrA: "); print(tCrA); print("\n");
+    print("tCrB: "); print(tCrB); print("\n");
+    print("tCrC: "); print(tCrC); print("\n");
+    print(smem_thr_copy_A); print("\n");
+    print("tCsA: "); print(tCsA); print("\n");
+    print("tCrA_copy_view: "); print(tCrA_copy_view); print("\n");
+    print(smem_thr_copy_B); print("\n");
+    print("tCsB: "); print(tCsB); print("\n");
+    print("tCrB_copy_view: "); print(tCrB_copy_view); print("\n");
+  }
+#endif
+
+  //
+  // PREFETCH: load k_block 0 of A and B before the main loop starts
+  //
+
+  copy(smem_tiled_copy_A, tCsA(_,_,Int<0>{}), tCrA_copy_view(_,_,Int<0>{}));
+  copy(smem_tiled_copy_B, tCsB(_,_,Int<0>{}), tCrB_copy_view(_,_,Int<0>{}));
+  //
+  // MAINLOOP: each iteration loads k_block+1 (if any) and then multiplies k_block
+  //
+
+  // Clear accumulators
+  clear(tCrC);
+
+  constexpr int K_BLOCK_MAX = size<2>(tCrA);     // number of k_blocks = extent of mode 2 of tCrA
+
+  CUTE_UNROLL
+  for (int k_block = 0; k_block < K_BLOCK_MAX; ++k_block)
+  {
+    // static-if load the next k_block. No k-predication required on these loads.
+    if (k_block < K_BLOCK_MAX-1)
+    {
+      // Load the next k_block
+      int k_next = k_block + 1;       // statically unrolled
+      copy(smem_tiled_copy_A, tCsA(_,_,k_next), tCrA_copy_view(_,_,k_next));
+      copy(smem_tiled_copy_B, tCsB(_,_,k_next), tCrB_copy_view(_,_,k_next));
+    }
+
+    // Transform A and B, relying on the compiler to remove in case of identity ops
+    cute::transform(tCrA(_,_,k_block), sA_load_op);
+    cute::transform(tCrB(_,_,k_block), sB_load_op);
+
+    // GEMM on k_block in registers
+    gemm(thr_mma, tCrA(_,_,k_block), tCrB(_,_,k_block), tCrC);
+  }
+
+  //
+  // Epilogue: combine the accumulators with the old C values and write the result back to sC
+  //
+
+  auto isBetaZero = [&] () {          // immediately invoked: yields whether beta compares equal to zero
+    if constexpr (is_complex<Beta>::value) {
+      return beta.real() == Int<0>{} && beta.imag() == Int<0>{};
+    }
+    else {
+      return beta == Int<0>{};
+    }
+    CUTE_GCC_UNREACHABLE;
+  } ();
+
+  using CopyOpCType = SmemCopyOpC;
+  Tensor tCrD = thr_mma.make_fragment_C(tCsC);   // second C fragment: receives the old C values, then the final result
+  if(!isBetaZero) {   // NOTE(review): tCrD is not loaded when beta is zero; presumably axpby does not read it then - confirm
+    copy(CopyOpCType{}, tCsC, tCrD);
+    // Transform C on/after load
+    cute::transform(tCrD, sC_load_op);
+  }
+  // C = alpha * (A * B) + beta * C
+  axpby(alpha, tCrC, beta, tCrD);
+  // Transform C before/on store
+  cute::transform(tCrD, sC_store_op);
+  copy(CopyOpCType{}, tCrD, tCsC);
+}
+
+} // end namespace detail
+
+template <class SmemCopyOpA, class SmemCopyOpB, class SmemCopyOpC,
+          class... Args,
+          class Alpha, class TA, class ALayout, class TB, class BLayout,
+          class Beta,  class TC, class CLayout,
+          class ALoadTransformOp = cute::identity, class BLoadTransformOp  = cute::identity,
+          class CLoadTransformOp = cute::identity, class CStoreTransformOp = cute::identity,
+          __CUTE_REQUIRES(ALayout::rank == 2 && is_smem<TA>::value &&
+                          BLayout::rank == 2 && is_smem<TB>::value &&
+                          CLayout::rank == 2 && is_smem<TC>::value)>
+CUTE_HOST_DEVICE
+void
+cooperative_gemm(uint32_t thread_idx,
+                 TiledMMA<Args...> const& tiled_mma,
+                 Alpha const& alpha,
+                 Tensor<TA, ALayout> sA,
+                 Tensor<TB, BLayout> sB,
+                 Beta  const& beta,
+                 Tensor<TC, CLayout> sC,
+                 ALoadTransformOp  const& sA_load_op  = {}, // applied to A values before they enter the GEMM
+                 BLoadTransformOp  const& sB_load_op  = {}, // applied to B values before they enter the GEMM
+                 CLoadTransformOp  const& sC_load_op  = {}, // applied to C values before they enter the GEMM
+                 CStoreTransformOp const& sC_store_op = {}) // applied to results before they are written to C
+{
+  // The operand extents have to be mutually consistent
+  CUTE_STATIC_ASSERT_V(size<0>(sA) == size<0>(sC));  // M of A == M of C
+  CUTE_STATIC_ASSERT_V(size<0>(sB) == size<1>(sC));  // N of B == N of C
+  CUTE_STATIC_ASSERT_V(size<1>(sA) == size<1>(sB));  // K of A == K of B
+
+  using ElemA = typename TA::value_type;
+  using ElemB = typename TB::value_type;
+  using ElemC = typename TC::value_type;
+
+  // Every transform must be invocable on its operand's value type and yield something convertible back to it
+  static_assert(is_convertible_v<decay_t<invoke_result_t<ALoadTransformOp, ElemA>>, ElemA>,
+    "ALoadTransformOp functor must accept value of type TA::value_type and return value convertible to type TA::value_type");
+  static_assert(is_convertible_v<decay_t<invoke_result_t<BLoadTransformOp, ElemB>>, ElemB>,
+    "BLoadTransformOp functor must accept value of type TB::value_type and return value convertible to type TB::value_type");
+  static_assert(is_convertible_v<decay_t<invoke_result_t<CLoadTransformOp, ElemC>>, ElemC>,
+    "CLoadTransformOp functor must accept value of type TC::value_type and return value convertible to type TC::value_type");
+  static_assert(is_convertible_v<decay_t<invoke_result_t<CStoreTransformOp, ElemC>>, ElemC>,
+    "CStoreTransformOp functor must accept value of type TC::value_type and return value convertible to type TC::value_type");
+
+  // The unpredicated path is taken only when evenly_divides accepts the (M,N,K) extents against the MMA tile shape
+  static constexpr bool tiles_evenly = evenly_divides(make_shape(size<0>(sA), size<0>(sB), size<1>(sA)),
+                                                      tile_shape(TiledMMA<Args...>{}));
+  if constexpr (!tiles_evenly) {
+    detail::cooperative_gemm_predication(thread_idx, tiled_mma, alpha, sA, sB, beta, sC,
+                                         sA_load_op, sB_load_op, sC_load_op, sC_store_op);
+  } else {
+    detail::cooperative_gemm_no_predication<SmemCopyOpA, SmemCopyOpB, SmemCopyOpC>(
+        thread_idx, tiled_mma, alpha, sA, sB, beta, sC,
+        sA_load_op, sB_load_op, sC_load_op, sC_store_op);
+  }
+}
+
+template <class... Args,
+          class Alpha, class TA, class ALayout, class TB, class BLayout,
+          class Beta,  class TC, class CLayout,
+          class ALoadTransformOp = cute::identity, class BLoadTransformOp  = cute::identity,
+          class CLoadTransformOp = cute::identity, class CStoreTransformOp = cute::identity,
+          __CUTE_REQUIRES(ALayout::rank == 2 && is_smem<TA>::value &&
+                          BLayout::rank == 2 && is_smem<TB>::value &&
+                          CLayout::rank == 2 && is_smem<TC>::value)>
+CUTE_HOST_DEVICE
+void
+cooperative_gemm(uint32_t thread_idx,
+                 TiledMMA<Args...> const& tiled_mma,
+                 Alpha const& alpha,
+                 Tensor<TA, ALayout> sA,
+                 Tensor<TB, BLayout> sB,
+                 Beta  const& beta,
+                 Tensor<TC, CLayout> sC,
+                 ALoadTransformOp  const& sA_load_op  = {}, // applied to A values before they enter the GEMM
+                 BLoadTransformOp  const& sB_load_op  = {}, // applied to B values before they enter the GEMM
+                 CLoadTransformOp  const& sC_load_op  = {}, // applied to C values before they enter the GEMM
+                 CStoreTransformOp const& sC_store_op = {}) // applied to results before they are written to C
+{
+  // No copy ops were supplied by the caller: give each operand an auto-vectorizing copy
+  // that is parameterised by the bit-width of that operand's value type.
+  using DefaultCopyA = AutoVectorizingCopyWithAssumedAlignment<sizeof_bits_v<typename TA::value_type>>;
+  using DefaultCopyB = AutoVectorizingCopyWithAssumedAlignment<sizeof_bits_v<typename TB::value_type>>;
+  using DefaultCopyC = AutoVectorizingCopyWithAssumedAlignment<sizeof_bits_v<typename TC::value_type>>;
+  cooperative_gemm<DefaultCopyA, DefaultCopyB, DefaultCopyC>(thread_idx, tiled_mma, alpha, sA, sB, beta, sC,
+                                                            sA_load_op, sB_load_op, sC_load_op, sC_store_op);
+}
+
+// Legacy cute::gemm overload taking a ThrMMA, retained for backwards-compatibility
+template <class... Args,
+          class Alpha, class TA, class ALayout, class TB, class BLayout,
+          class Beta,  class TC, class CLayout,
+          class ALoadTransformOp = cute::identity, class BLoadTransformOp  = cute::identity,
+          class CLoadTransformOp = cute::identity, class CStoreTransformOp = cute::identity,
+          __CUTE_REQUIRES(ALayout::rank == 2 && is_smem<TA>::value &&
+                          BLayout::rank == 2 && is_smem<TB>::value &&
+                          CLayout::rank == 2 && is_smem<TC>::value)>
+CUTE_HOST_DEVICE
+void
+gemm(ThrMMA<Args...> const& thr_mma,
+     Alpha const& alpha,
+     Tensor<TA, ALayout> sA,
+     Tensor<TB, BLayout> sB,
+     Beta  const& beta,
+     Tensor<TC, CLayout> sC,
+     ALoadTransformOp  const& sA_load_op  = {}, // applied to A values before they enter the GEMM
+     BLoadTransformOp  const& sB_load_op  = {}, // applied to B values before they enter the GEMM
+     CLoadTransformOp  const& sC_load_op  = {}, // applied to C values before they enter the GEMM
+     CStoreTransformOp const& sC_store_op = {}) // applied to results before they are written to C
+{
+  // Only a ThrMMA is at hand and no thread index is recovered from it,
+  // so this entry point always takes the slow (predicated) path.
+  detail::cooperative_gemm_predication(
+      thr_mma, alpha, sA, sB, beta, sC,
+      sA_load_op, sB_load_op, sC_load_op, sC_store_op);
+}
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/algorithm/copy.hpp b/lightllm-kernel/cutlass/include/cute/algorithm/copy.hpp
new file mode 100755
index 000000000..c2decd15d
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/algorithm/copy.hpp
@@ -0,0 +1,382 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>            // CUTE_HOST_DEVICE
+#include <cute/tensor_impl.hpp>       // cute::Tensor
+#include <cute/tensor_predicate.hpp>  // cute::TrivialPredTensor
+#include <cute/atom/copy_atom.hpp>    // cute::Copy_Atom
+
+namespace cute
+{
+
+//
+// Accept mutable temporaries
+//
+
+// copy() for a temporary destination tensor: re-dispatches with dst as an lvalue
+template <class SrcEngine, class SrcLayout, class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE
+void
+copy(Tensor<SrcEngine, SrcLayout> const& src,
+     Tensor<DstEngine, DstLayout>     && dst)
+{
+  copy(src, dst);   // the named parameter dst is an lvalue in this expression
+}
+
+// copy_vec() for a temporary destination tensor: re-dispatches with dst as an lvalue
+template <class VecType,
+          class SrcEngine, class SrcLayout, class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE
+void
+copy_vec(Tensor<SrcEngine, SrcLayout> const& src,
+         Tensor<DstEngine, DstLayout>     && dst)
+{
+  copy_vec<VecType>(src, dst);   // the named parameter dst is an lvalue in this expression
+}
+
+// copy_aligned() for a temporary destination tensor: re-dispatches with dst as an lvalue
+template <class SrcEngine, class SrcLayout, class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE
+void
+copy_aligned(Tensor<SrcEngine, SrcLayout> const& src,
+             Tensor<DstEngine, DstLayout>     && dst)
+{
+  copy_aligned(src, dst);   // the named parameter dst is an lvalue in this expression
+}
+
+// Predicated copy_if() for a temporary destination tensor: re-dispatches with dst as an lvalue
+template <class PrdTensor,
+          class SrcEngine, class SrcLayout, class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE
+void
+copy_if(PrdTensor                    const& pred,
+        Tensor<SrcEngine, SrcLayout> const& src,
+        Tensor<DstEngine, DstLayout>     && dst)
+{
+  copy_if(pred, src, dst);   // the named parameter dst is an lvalue in this expression
+}
+
+// Policy-driven copy_if() for a temporary destination tensor: re-dispatches with dst as an lvalue
+template <class CopyPolicy,
+          class PrdTensor,
+          class SrcEngine, class SrcLayout, class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE
+void
+copy_if(CopyPolicy                   const& copy_policy,
+        PrdTensor                    const& pred,
+        Tensor<SrcEngine, SrcLayout> const& src,
+        Tensor<DstEngine, DstLayout>     && dst)
+{
+  copy_if(copy_policy, pred, src, dst);   // the named parameter dst is an lvalue in this expression
+}
+
+// Policy-driven copy() for a temporary destination tensor: re-dispatches with dst as an lvalue
+template <class CopyPolicy,
+          class SrcEngine, class SrcLayout, class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE
+void
+copy(CopyPolicy                   const& copy_policy,
+     Tensor<SrcEngine, SrcLayout> const& src,
+     Tensor<DstEngine, DstLayout>     && dst)
+{
+  copy(copy_policy, src, dst);   // the named parameter dst is an lvalue in this expression
+}
+
+//
+// copy_if -- Predicated Copy
+//
+
+// Element-wise predicated copy: dst(i) is written from src(i) only for indices where pred(i) holds
+template <class PrdTensor,
+          class SrcEngine, class SrcLayout, class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE
+void
+copy_if(PrdTensor                    const& pred,
+        Tensor<SrcEngine, SrcLayout> const& src,
+        Tensor<DstEngine, DstLayout>      & dst)
+{
+  auto elementwise_op = select_elementwise_copy(src, dst);
+
+  CUTE_UNROLL
+  for (int idx = 0; idx < size(src); ++idx) {
+    if (pred(idx)) {
+      elementwise_op.copy(src(idx), dst(idx));
+    }
+  }
+}
+
+//
+// copy_if -- Predicated CopyAtom
+//
+
+namespace detail {
+
+// Trait that detects if atom's traits (the nested type T::Traits) has a member function with(bool)
+template <class, class Enable = void>
+constexpr bool has_with_bool = false;   // primary template: used when the partial specialization is not viable
+// Partial specialization: selected when calling .with(bool) on a T::Traits value is a well-formed expression
+template <class T>
+constexpr bool has_with_bool<T, cute::void_t<decltype(declval<typename T::Traits>().with(declval<bool>()))>> = true;
+
+} // end namespace detail
+
+template <class... CopyArgs,
+          class PredTensor,
+          class SrcEngine, class SrcLayout,
+          class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE
+void
+copy_if(Copy_Atom<CopyArgs...>       const& copy_atom, // invoked once per index of the grouped rest modes
+        PredTensor                   const& pred,      // (Rest...)   queried as pred(i) for each such index
+        Tensor<SrcEngine, SrcLayout> const& src,       // (V,Rest...)
+        Tensor<DstEngine, DstLayout>      & dst)       // (V,Rest...)
+{
+  static_assert(SrcLayout::rank == DstLayout::rank, "CopyAtom rank-mismatch.");
+  if constexpr (SrcLayout::rank == 1) {   // Dispatch the copy
+    copy_atom.call(src, dst);             // NOTE: pred is not consulted on this rank-1 path
+  } else {                                // Loop over all but the first mode
+    constexpr int R = SrcLayout::rank;
+    Tensor src_v = group_modes<1,R>(src); // modes [1,R) grouped into a single mode, indexed by i below
+    Tensor dst_v = group_modes<1,R>(dst); // same grouping applied to the destination
+    CUTE_UNROLL
+    for (int i = 0; i < size<1>(src_v); ++i) {
+      // If copy traits can be transformed with a predicate value, do it, otherwise branch here
+      if constexpr (detail::has_with_bool<Copy_Atom<CopyArgs...>>) {
+        copy_atom.with(pred(i)).call(src_v(_,i), dst_v(_,i));   // predicate value is handed to the atom itself
+      } else {
+        if (pred(i)) {                                          // atom has no with(bool): skip the call instead
+          copy_atom.call(src_v(_,i), dst_v(_,i));
+        }
+      }
+    }
+  }
+}
+
+//
+// copy_vec -- attempt vectorized copy with VecType
+//
+
+template <class VecType,
+          class SrcEngine, class SrcLayout,
+          class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE
+void
+copy_vec(Tensor<SrcEngine, SrcLayout> const& src,
+         Tensor<DstEngine, DstLayout>      & dst)
+{
+  static_assert(sizeof_bits_v<VecType> >= 8 && sizeof_bits_v<VecType> % 8 == 0,   // whole number of bytes
+                "Expected a vectorization type of at least a byte.");
+  using SrcType = typename SrcEngine::value_type;
+  using DstType = typename DstEngine::value_type;
+  if constexpr (cute::is_same<SrcType, DstType>::value &&                // vectorize only for identical value types ...
+                sizeof_bits_v<VecType>  > sizeof_bits_v<DstType>)        // ... and only if VecType is wider than one element
+  {
+    // Preserve volatility of Src/Dst types.
+    using SrcVecType = conditional_t<is_volatile_v<typename SrcEngine::element_type>, VecType const volatile, VecType const>;
+    using DstVecType = conditional_t<is_volatile_v<typename DstEngine::element_type>, VecType       volatile, VecType      >;
+    Tensor src_v = recast<SrcVecType>(src);      // src reinterpreted with VecType elements
+    Tensor dst_v = recast<DstVecType>(dst);      // dst reinterpreted with VecType elements
+
+#if 0
+    if (thread0()) {
+      print("copy_vec<%db> -- vectorizing copy:\n", int(sizeof_bits_v<VecType>));
+      print("   "); print(src); print(" => "); print(src_v); print("\n");
+      print("   "); print(dst); print(" => "); print(dst_v); print("\n");
+    }
+#endif
+
+    return copy_if(TrivialPredTensor{}, src_v, dst_v);   // copy the recast tensors; predicate presumably always true
+  } else {
+#if 0
+  if (thread0()) {
+    print("copy_vec<%db> -- NOT vectorizing copy:\n", int(sizeof_bits_v<VecType>));
+    print("   "); print(src); print("\n");
+    print("   "); print(dst); print("\n");
+  }
+#endif
+
+    return copy_if(TrivialPredTensor{}, src, dst);       // fallback: copy the tensors with their original value types
+  }
+}
+
+//
+// copy -- CopyAtom
+//
+
+// Copy through a Copy_Atom: expressed as copy_if with a TrivialPredTensor predicate
+template <class... CopyArgs,
+          class SrcEngine, class SrcLayout, class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE
+void
+copy(Copy_Atom<CopyArgs...>       const& copy_atom,
+     Tensor<SrcEngine, SrcLayout> const& src,
+     Tensor<DstEngine, DstLayout>      & dst)
+{
+  copy_if(copy_atom, TrivialPredTensor{}, src, dst);
+}
+
+//////////////////////////////////////////
+// Special Auto-Vectorizing Overloads
+//////////////////////////////////////////
+
+// Overload for AutoVectorizingCopyWithAssumedAlignment<MaxVecBits>: derives a vector width, then copy_vec or copy_if
+template <int MaxVecBits, class... Args,
+          class SrcEngine, class SrcLayout,
+          class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE
+void
+copy(AutoVectorizingCopyWithAssumedAlignment<MaxVecBits> const&,
+     Tensor<SrcEngine, SrcLayout>                        const& src,
+     Tensor<DstEngine, DstLayout>                             & dst)
+{
+  constexpr int vec_elem = decltype(max_common_vector(src, dst))::value;        // element count from max_common_vector
+
+  constexpr int max_align_src = decltype(max_alignment(src.layout()))::value;
+  constexpr int max_align_dst = decltype(max_alignment(dst.layout()))::value;
+  constexpr int max_align     = gcd(vec_elem, max_align_src, max_align_dst);    // common divisor of all three limits
+
+  constexpr int src_bits = sizeof_bits<typename SrcEngine::value_type>::value;
+  constexpr int vec_bits = gcd(src_bits * max_align, MaxVecBits);               // resulting width divides MaxVecBits
+
+  if constexpr (vec_elem > 1 && vec_bits >= 8) {
+    // If more than one element vectorizes to 8bits or more, then copy_vec
+#if 0
+    if (thread0()) {
+      print("copy -- found max_common_vector of %d elems and vectorization to %d bits\n", vec_elem, vec_bits);
+      print("   "); print(src); print("\n");
+      print("   "); print(dst); print("\n");
+    }
+#endif
+    return copy_vec<uint_bit_t<vec_bits>>(src, dst);    // vector type is the vec_bits-wide unsigned integer
+  } else {
+    return copy_if(TrivialPredTensor{}, src, dst);      // otherwise copy without attempting to vectorize
+  }
+}
+
+// Auto-vectorizing copy: the assumed alignment depends on whether both layouts are static
+template <class SrcEngine, class SrcLayout,
+          class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE
+void
+copy(Tensor<SrcEngine, SrcLayout> const& src,
+     Tensor<DstEngine, DstLayout>      & dst)
+{
+  // Tensors with static layouts (e.g. registers) are assumed to have 128b aligned pointers;
+  // nothing beyond 8b alignment is assumed as soon as either layout is dynamic.
+  constexpr bool all_static = is_static<SrcLayout>::value && is_static<DstLayout>::value;
+  constexpr int  align_bits = all_static ? 128 : 8;
+
+  // Hand off to the overload that takes the assumed-alignment policy
+  copy(AutoVectorizingCopyWithAssumedAlignment<align_bits>{}, src, dst);
+}
+
+// Auto-vectorizing copy that assumes alignment of up to 128 bits
+template <class SrcEngine, class SrcLayout, class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE
+void
+copy_aligned(Tensor<SrcEngine, SrcLayout> const& src,
+             Tensor<DstEngine, DstLayout>      & dst)
+{
+  using Aligned128 = AutoVectorizingCopyWithAssumedAlignment<128>;
+  copy(Aligned128{}, src, dst);
+}
+
+// Overload for a Copy_Atom wrapping AutoVectorizingCopyWithAssumedAlignment: the atom is dropped, only the policy is kept
+template <int MaxVecBits, class... Args,
+          class SrcEngine, class SrcLayout, class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE
+void
+copy(Copy_Atom<AutoVectorizingCopyWithAssumedAlignment<MaxVecBits>, Args...> const&,
+     Tensor<SrcEngine, SrcLayout>                                            const& src,
+     Tensor<DstEngine, DstLayout>                                                 & dst)
+{
+  using Policy = AutoVectorizingCopyWithAssumedAlignment<MaxVecBits>;
+  copy(Policy{}, src, dst);
+}
+
+#if defined(CUTE_COPY_ATOM_TMA_SM90_ENABLED)
+template <class... CT_Args,
+          class SrcEngine, class SrcLayout,
+          class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE
+void
+copy(Copy_Traits<SM90_BULK_COPY_AUTO, CT_Args...> const& atom,  // Copy_Traits may or may not have the memory barrier in it already
+     Tensor<SrcEngine, SrcLayout>                 const& src,
+     Tensor<DstEngine, DstLayout>                      & dst)
+{
+  using SrcType = typename SrcEngine::value_type;
+  using DstType = typename DstEngine::value_type;
+  static_assert(cute::is_same<SrcType, DstType>::value);        // source and destination value types must match
+  static_assert((is_gmem<SrcEngine>::value && is_smem<DstEngine>::value) ||
+                (is_smem<SrcEngine>::value && is_gmem<DstEngine>::value),
+                "Bulk Copy only supports gmem -> smem or smem -> gmem movement.");
+  // G2S or S2G dispatch, decided by whether the source is gmem
+  using BULK_COPY_OP = conditional_t<is_gmem<SrcEngine>::value,
+                                     SM90_BULK_COPY_G2S,
+                                     SM90_BULK_COPY_S2G>;
+
+  // Find the common subtensor of src and dst
+  auto tiler = max_common_layout(src, dst);
+  constexpr int vec_elem = decltype(size(tiler))::value;        // number of elements covered by the tiler
+  constexpr int vec_bits = vec_elem * sizeof_bits_v<SrcType>;   // the same extent expressed in bits
+  static_assert(vec_bits >= 128, "Expected at least 128-bits for BLKCP");
+
+  // Construct a new concrete Atom of the vector size, built from the arguments stored in atom.opargs_
+  using BulkAtom = Copy_Atom<Copy_Traits<BULK_COPY_OP, Int<vec_bits>, CT_Args...>, SrcType>;
+  auto bulk_atom = apply(atom.opargs_, [](auto const&... args) { return BulkAtom{args...}; });
+
+#if 0
+  if (thread0()) {
+    print("copy blkcp -- found a max_common_layout of "); print(tiler); print("\n");
+    print("   "); print(src); print("\n");
+    print("   "); print(dst); print("\n");
+  }
+#endif
+
+  return copy(bulk_atom, logical_divide(src, tiler), logical_divide(dst, tiler));   // copy src and dst divided by the tiler
+}
+
+// Backwards-compat: view the Copy_Atom as its Copy_Traits, discarding any extra Copy_Atom args
+template <class... CT_Args, class... CA_Args,
+          class SrcEngine, class SrcLayout, class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE
+void
+copy(Copy_Atom<Copy_Traits<SM90_BULK_COPY_AUTO, CT_Args...>, CA_Args...> const& atom,
+     Tensor<SrcEngine, SrcLayout>                const& src,
+     Tensor<DstEngine, DstLayout>                     & dst)
+{
+  auto const& bulk_traits = static_cast<Copy_Traits<SM90_BULK_COPY_AUTO, CT_Args...> const&>(atom);
+  copy(bulk_traits, src, dst);
+}
+#endif // #if defined(CUTE_COPY_ATOM_TMA_SM90_ENABLED)
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/algorithm/fill.hpp b/lightllm-kernel/cutlass/include/cute/algorithm/fill.hpp
new file mode 100755
index 000000000..3f33a42ad
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/algorithm/fill.hpp
@@ -0,0 +1,87 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>
+
+#include <cute/tensor_impl.hpp>
+#include <cute/algorithm/prefer.hpp>
+
+namespace cute
+{
+
+//
+// Accept mutable temporaries: a temporary tensor is re-dispatched to fill() as an lvalue
+//
+template <class Engine, class Layout, class T>
+CUTE_HOST_DEVICE
+void
+fill(Tensor<Engine, Layout>&& tensor, T const& value)
+{
+  fill(tensor, value);   // the named parameter tensor is an lvalue in this expression
+}
+
+namespace detail
+{
+
+// Prefer fill(tensor.data(), value), if possible; the trailing return type removes this overload when that call is ill-formed
+template <class Engine, class Layout, class T>
+CUTE_HOST_DEVICE
+auto
+fill(Tensor<Engine, Layout>& tensor, T const& value, prefer<1>)
+    -> decltype(fill(tensor.data(), value))
+{
+  fill(tensor.data(), value);
+}
+
+// Default implementation: assigns value to every element in turn (presumably the fallback, ranked below prefer<1> - confirm)
+template <class Engine, class Layout, class T>
+CUTE_HOST_DEVICE
+void
+fill(Tensor<Engine, Layout>& tensor, T const& value, prefer<0>)
+{
+  CUTE_UNROLL
+  for (int i = 0; i < size(tensor); ++i) {
+    tensor(i) = value;
+  }
+}
+
+} // end namespace detail
+
+template <class Engine, class Layout, class T>
+CUTE_HOST_DEVICE
+void
+fill(Tensor<Engine, Layout>& tensor, T const& value)
+{
+  return detail::fill(tensor, value, prefer<1>{});   // prefer<1>{} is the tag argument used to pick among the detail::fill overloads
+}
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/algorithm/functional.hpp b/lightllm-kernel/cutlass/include/cute/algorithm/functional.hpp
new file mode 100755
index 000000000..ef80d018d
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/algorithm/functional.hpp
@@ -0,0 +1,290 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>          // CUTE_HOST_DEVICE
+#include <cute/numeric/math.hpp>    // cute::max, cute::min
+#include <cute/numeric/complex.hpp> // cute::conj
+
+/** C++14 <functional> extensions */
+
+namespace cute {
+
+/**************/
+/** Identity **/
+/**************/
+
+struct identity {  // function object that hands its argument back unchanged
+  template <class T>
+  CUTE_HOST_DEVICE constexpr
+  decltype(auto) operator()(T&& arg) const {
+    return static_cast<T&&>(arg);  // cast to T&& keeps the value category the caller passed in
+  }
+};
+
+template <class R>
+struct constant_fn {  // function object that ignores every argument and yields the stored r_
+  template <class... T>
+  CUTE_HOST_DEVICE constexpr
+  decltype(auto) operator()(T&&...) const {
+    return r_;  // unparenthesized member name: decltype(auto) deduces the declared type R
+  }
+  R r_;  // value produced by every call
+};
+
+/***********/
+/** Unary **/
+/***********/
+
+#define CUTE_LEFT_UNARY_OP(NAME,OP)                                  \
+  struct NAME {  /* prefix form: OP applied to the argument */       \
+    template <class T>                                               \
+    CUTE_HOST_DEVICE constexpr                                       \
+    decltype(auto) operator()(T&& arg) const {                       \
+      return OP static_cast<T&&>(arg);                                \
+    }                                                                \
+  }
+#define CUTE_RIGHT_UNARY_OP(NAME,OP)                                 \
+  struct NAME {  /* postfix form: argument followed by OP */         \
+    template <class T>                                               \
+    CUTE_HOST_DEVICE constexpr                                       \
+    decltype(auto) operator()(T&& arg) const {                       \
+      return static_cast<T&&>(arg) OP ;                               \
+    }                                                                \
+  }
+#define CUTE_NAMED_UNARY_OP(NAME,OP)                                 \
+  struct NAME {  /* call form: OP(argument) */                       \
+    template <class T>                                               \
+    CUTE_HOST_DEVICE constexpr                                       \
+    decltype(auto) operator()(T&& arg) const {                       \
+      return OP (static_cast<T&&>(arg));                              \
+    }                                                                \
+  }
+
+CUTE_LEFT_UNARY_OP(unary_plus,       +);
+CUTE_LEFT_UNARY_OP(negate,           -);
+CUTE_LEFT_UNARY_OP(bit_not,          ~);
+CUTE_LEFT_UNARY_OP(logical_not,      !);
+CUTE_LEFT_UNARY_OP(dereference,      *);
+CUTE_LEFT_UNARY_OP(address_of,       &);
+CUTE_LEFT_UNARY_OP(pre_increment,   ++);
+CUTE_LEFT_UNARY_OP(pre_decrement,   --);
+
+CUTE_RIGHT_UNARY_OP(post_increment, ++);
+CUTE_RIGHT_UNARY_OP(post_decrement, --);
+
+CUTE_NAMED_UNARY_OP(abs_fn,           abs);  // expands to an unqualified call to abs
+CUTE_NAMED_UNARY_OP(conjugate, cute::conj);
+
+#undef CUTE_LEFT_UNARY_OP   // the generator macros are not visible past this point
+#undef CUTE_RIGHT_UNARY_OP
+#undef CUTE_NAMED_UNARY_OP
+
+template <int Shift_>
+struct shift_right_const {  // function object: arg >> Shift, with the shift amount fixed as a template argument
+  static constexpr int Shift = Shift_;  // re-exported so the shift amount can be read from the type
+
+  template <class T>
+  CUTE_HOST_DEVICE constexpr
+  decltype(auto) operator()(T&& arg) const {
+    return static_cast<T&&>(arg) >> Shift;
+  }
+};
+
+template <int Shift_>
+struct shift_left_const {  // function object: arg << Shift, with the shift amount fixed as a template argument
+  static constexpr int Shift = Shift_;  // re-exported so the shift amount can be read from the type
+
+  template <class T>
+  CUTE_HOST_DEVICE constexpr
+  decltype(auto) operator()(T&& arg) const {
+    return static_cast<T&&>(arg) << Shift;
+  }
+};
+
+/************/
+/** Binary **/
+/************/
+
+#define CUTE_BINARY_OP(NAME,OP)                                      \
+  struct NAME {  /* infix form: lhs OP rhs, both forwarded */        \
+    template <class T, class U>                                      \
+    CUTE_HOST_DEVICE constexpr                                       \
+    decltype(auto) operator()(T&& lhs, U&& rhs) const {              \
+      return static_cast<T&&>(lhs) OP static_cast<U&&>(rhs);           \
+    }                                                                \
+  }
+#define CUTE_NAMED_BINARY_OP(NAME,OP)                                \
+  struct NAME {  /* call form: OP(lhs, rhs), both forwarded */       \
+    template <class T, class U>                                      \
+    CUTE_HOST_DEVICE constexpr                                       \
+    decltype(auto) operator()(T&& lhs, U&& rhs) const {              \
+      return OP (static_cast<T&&>(lhs), static_cast<U&&>(rhs));        \
+    }                                                                \
+  }
+
+// One function object per operator; the *_assign variants apply the compound-assignment operator to lhs
+CUTE_BINARY_OP(plus,                 +);
+CUTE_BINARY_OP(minus,                -);
+CUTE_BINARY_OP(multiplies,           *);
+CUTE_BINARY_OP(divides,              /);
+CUTE_BINARY_OP(modulus,              %);
+
+CUTE_BINARY_OP(plus_assign,         +=);
+CUTE_BINARY_OP(minus_assign,        -=);
+CUTE_BINARY_OP(multiplies_assign,   *=);
+CUTE_BINARY_OP(divides_assign,      /=);
+CUTE_BINARY_OP(modulus_assign,      %=);
+
+CUTE_BINARY_OP(bit_and,              &);
+CUTE_BINARY_OP(bit_or,               |);
+CUTE_BINARY_OP(bit_xor,              ^);
+CUTE_BINARY_OP(left_shift,          <<);
+CUTE_BINARY_OP(right_shift,         >>);
+
+CUTE_BINARY_OP(bit_and_assign,      &=);
+CUTE_BINARY_OP(bit_or_assign,       |=);
+CUTE_BINARY_OP(bit_xor_assign,      ^=);
+CUTE_BINARY_OP(left_shift_assign,  <<=);
+CUTE_BINARY_OP(right_shift_assign, >>=);
+
+CUTE_BINARY_OP(logical_and,         &&);
+CUTE_BINARY_OP(logical_or,          ||);
+
+CUTE_BINARY_OP(equal_to,            ==);
+CUTE_BINARY_OP(not_equal_to,        !=);
+CUTE_BINARY_OP(greater,              >);
+CUTE_BINARY_OP(less,                 <);
+CUTE_BINARY_OP(greater_equal,       >=);
+CUTE_BINARY_OP(less_equal,          <=);
+
+CUTE_NAMED_BINARY_OP(max_fn, cute::max);
+CUTE_NAMED_BINARY_OP(min_fn, cute::min);
+
+#undef CUTE_BINARY_OP        // the generator macros are not visible past this point
+#undef CUTE_NAMED_BINARY_OP
+
+/**********/
+/** Fold **/
+/**********/
+
+#define CUTE_FOLD_OP(NAME,OP)                                        \
+  struct NAME##_unary_rfold {                                        \
+    template <class... T>                                            \
+    CUTE_HOST_DEVICE constexpr                                       \
+    auto operator()(T&&... t) const {                                \
+      return (t OP ...);     /* t1 OP (t2 OP (... OP tN)) */         \
+    }                                                                \
+  };                                                                 \
+  struct NAME##_unary_lfold {                                        \
+    template <class... T>                                            \
+    CUTE_HOST_DEVICE constexpr                                       \
+    auto operator()(T&&... t) const {                                \
+      return (... OP t);     /* ((t1 OP t2) OP ...) OP tN */         \
+    }                                                                \
+  };                                                                 \
+  struct NAME##_binary_rfold {                                       \
+    template <class U, class... T>                                   \
+    CUTE_HOST_DEVICE constexpr                                       \
+    auto operator()(U&& u, T&&... t) const {                         \
+      return (t OP ... OP u); /* t1 OP (... OP (tN OP u)) */         \
+    }                                                                \
+  };                                                                 \
+  struct NAME##_binary_lfold {                                       \
+    template <class U, class... T>                                   \
+    CUTE_HOST_DEVICE constexpr                                       \
+    auto operator()(U&& u, T&&... t) const {                         \
+      return (u OP ... OP t); /* ((u OP t1) OP ...) OP tN */         \
+    }                                                                \
+  }
+
+CUTE_FOLD_OP(plus,                 +);  // each line defines four structs: NAME_{unary,binary}_{rfold,lfold}
+CUTE_FOLD_OP(minus,                -);
+CUTE_FOLD_OP(multiplies,           *);
+CUTE_FOLD_OP(divides,              /);
+CUTE_FOLD_OP(modulus,              %);
+
+CUTE_FOLD_OP(plus_assign,         +=);
+CUTE_FOLD_OP(minus_assign,        -=);
+CUTE_FOLD_OP(multiplies_assign,   *=);
+CUTE_FOLD_OP(divides_assign,      /=);
+CUTE_FOLD_OP(modulus_assign,      %=);
+
+CUTE_FOLD_OP(bit_and,              &);
+CUTE_FOLD_OP(bit_or,               |);
+CUTE_FOLD_OP(bit_xor,              ^);
+CUTE_FOLD_OP(left_shift,          <<);
+CUTE_FOLD_OP(right_shift,         >>);
+
+CUTE_FOLD_OP(bit_and_assign,      &=);
+CUTE_FOLD_OP(bit_or_assign,       |=);
+CUTE_FOLD_OP(bit_xor_assign,      ^=);
+CUTE_FOLD_OP(left_shift_assign,  <<=);
+CUTE_FOLD_OP(right_shift_assign, >>=);
+
+CUTE_FOLD_OP(logical_and,         &&);
+CUTE_FOLD_OP(logical_or,          ||);
+
+CUTE_FOLD_OP(equal_to,            ==);
+CUTE_FOLD_OP(not_equal_to,        !=);
+CUTE_FOLD_OP(greater,              >);
+CUTE_FOLD_OP(less,                 <);
+CUTE_FOLD_OP(greater_equal,       >=);
+CUTE_FOLD_OP(less_equal,          <=);
+
+#undef CUTE_FOLD_OP  // the generator macro is not visible past this point
+
+/**********/
+/** Meta **/
+/**********/
+
+template <class Fn, class Arg>
+struct bound_fn {  // callable that supplies a stored first argument to a two-argument callable
+
+  template <class T>
+  CUTE_HOST_DEVICE constexpr
+  decltype(auto)
+  operator()(T&& arg) {  // not const-qualified, so it cannot be invoked on a const bound_fn
+    return fn_(arg_, static_cast<T&&>(arg));  // stored arg_ goes first, the call argument is forwarded second
+  }
+
+  Fn fn_;    // the wrapped callable
+  Arg arg_;  // the value passed as its first argument on every call
+};
+
+template <class Fn, class Arg>
+CUTE_HOST_DEVICE constexpr
+auto
+bind(Fn const& fn, Arg const& arg) {  // builds a bound_fn holding copies of fn and arg
+  return bound_fn<Fn,Arg>{fn, arg};
+}
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/algorithm/gemm.hpp b/lightllm-kernel/cutlass/include/cute/algorithm/gemm.hpp
new file mode 100755
index 000000000..c4713838b
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/algorithm/gemm.hpp
@@ -0,0 +1,500 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>
+
+#include <cute/util/type_traits.hpp>
+#include <cute/algorithm/functional.hpp>
+
+#include <cute/tensor_impl.hpp>
+
+#include <cute/atom/mma_atom.hpp>
+
+/** The gemm algorithm takes four (or three) tensors and computes
+ *   D = A * B + C
+ * It dispatches based on the number of modes each tensor has:
+ *
+ * 1. `(V) x (V) => (V)`.
+ *      The element-wise product of vectors. Dispatches to FMA or MMA.
+ * 2. `(M) x (N) => (M,N)`.
+ *      The outer product of vectors. Dispatches to [3] with new mode K=(1).
+ * 3. `(M,K) x (N,K) => (M,N)`.
+ *      The product of matrices. Dispatches to [5] with MMA vector-mode V.
+ * 4. `(V,M) x (V,N) => (V,M,N)`.
+ *      The batched outer product of vectors. Accounts for register reuse and dispatches to [1] for each (m,n).
+ * 5. `(V,M,K) x (V,N,K) => (V,M,N)`.
+ *      The batched product of matrices. Dispatches to [4] for each (k).
+ */
+
+namespace cute
+{
+
+//
+// Three arguments to four: C is passed as both the destination and the addend (C = A * B + C)
+//
+
+template <class TA, class ALayout,
+          class TB, class BLayout,
+          class TC, class CLayout>
+CUTE_HOST_DEVICE
+void
+gemm(Tensor<TA, ALayout> const& A,
+     Tensor<TB, BLayout> const& B,
+     Tensor<TC, CLayout>      & C)
+{
+  return gemm(C, A, B, C);  // four-argument form, with C in both the D and the C position
+}
+
+template <class MMA,
+          class TA, class ALayout,
+          class TB, class BLayout,
+          class TC, class CLayout>
+CUTE_HOST_DEVICE
+void
+gemm(MMA_Atom<MMA>       const& mma,  // caller-chosen atom, passed through unchanged
+     Tensor<TA, ALayout> const& A,
+     Tensor<TB, BLayout> const& B,
+     Tensor<TC, CLayout>      & C)
+{
+  return gemm(mma, C, A, B, C);  // five-argument form, with C in both the D and the C position
+}
+
+//
+// Accept mutable temporaries: an rvalue output tensor is rebound as an lvalue and forwarded
+//
+
+template <class TA, class ALayout,
+          class TB, class BLayout,
+          class TC, class CLayout>
+CUTE_HOST_DEVICE
+void
+gemm(Tensor<TA, ALayout> const& A,
+     Tensor<TB, BLayout> const& B,
+     Tensor<TC, CLayout>     && C)  // temporary used as both destination and addend
+{
+  return gemm(C, A, B, C);  // the named parameter C is an lvalue in this expression
+}
+
+template <class TD, class DLayout,
+          class TA, class ALayout,
+          class TB, class BLayout,
+          class TC, class CLayout>
+CUTE_HOST_DEVICE
+void
+gemm(Tensor<TD, DLayout>     && D,  // temporary destination
+     Tensor<TA, ALayout> const& A,
+     Tensor<TB, BLayout> const& B,
+     Tensor<TC, CLayout> const& C)
+{
+  return gemm(D, A, B, C);  // the named parameter D is an lvalue in this expression
+}
+
+template <class MMA,
+          class TA, class ALayout,
+          class TB, class BLayout,
+          class TC, class CLayout>
+CUTE_HOST_DEVICE
+void
+gemm(MMA_Atom<MMA>       const& mma,
+     Tensor<TA, ALayout> const& A,
+     Tensor<TB, BLayout> const& B,
+     Tensor<TC, CLayout>     && C)  // temporary used as both destination and addend
+{
+  return gemm(mma, C, A, B, C);  // the named parameter C is an lvalue in this expression
+}
+
+template <class MMA,
+          class TD, class DLayout,
+          class TA, class ALayout,
+          class TB, class BLayout,
+          class TC, class CLayout>
+CUTE_HOST_DEVICE
+void
+gemm(MMA_Atom<MMA>       const& mma,
+     Tensor<TD, DLayout>     && D,  // temporary destination, e.g. a slice or re-laid-out view
+     Tensor<TA, ALayout> const& A,
+     Tensor<TB, BLayout> const& B,
+     Tensor<TC, CLayout> const& C)
+{
+  return gemm(mma, D, A, B, C);  // the named parameter D is an lvalue in this expression
+}
+
+//
+// No MMA supplied: build a UniversalFMA atom from the four tensors' value types and dispatch with it
+//
+
+template <class TD, class DLayout,
+          class TA, class ALayout,
+          class TB, class BLayout,
+          class TC, class CLayout>
+CUTE_HOST_DEVICE
+void
+gemm(Tensor<TD, DLayout>      & D,
+     Tensor<TA, ALayout> const& A,
+     Tensor<TB, BLayout> const& B,
+     Tensor<TC, CLayout> const& C)
+{
+  using ValD = typename Tensor<TD,DLayout>::value_type;
+  using ValA = typename Tensor<TA,ALayout>::value_type;
+  using ValB = typename Tensor<TB,BLayout>::value_type;
+  using ValC = typename Tensor<TC,CLayout>::value_type;
+  using DefaultAtom = MMA_Atom<UniversalFMA<ValD, ValA, ValB, ValC>>;  // value types in (D, A, B, C) order
+  gemm(DefaultAtom{}, D, A, B, C);  // hand off to the atom-taking overloads
+}
+
+//
+// Thread-Local Register-Memory GEMMs
+//
+
+// Dispatch [1]: (V) x (V) => (V) -- selected when all four tensors are rank-1 and satisfy is_rmem
+template <class MMA,
+          class TD, class DLayout,
+          class TA, class ALayout,
+          class TB, class BLayout,
+          class TC, class CLayout,
+          __CUTE_REQUIRES(DLayout::rank == 1 && is_rmem<TD>::value &&
+                          ALayout::rank == 1 && is_rmem<TA>::value &&
+                          BLayout::rank == 1 && is_rmem<TB>::value &&
+                          CLayout::rank == 1 && is_rmem<TC>::value)>
+CUTE_HOST_DEVICE
+void
+gemm(MMA_Atom<MMA>       const& mma,
+     Tensor<TD, DLayout>      & D,  // (V) Logical data
+     Tensor<TA, ALayout> const& A,  // (V) Logical data
+     Tensor<TB, BLayout> const& B,  // (V) Logical data
+     Tensor<TC, CLayout> const& C)  // (V) Logical data
+{
+  // No static assertions on (V), MMA checks compatibility
+  mma.call(D, A, B, C);  // end of the dispatch chain: the atom itself is invoked on the four vectors
+}
+
+// Dispatch [2]: (M) x (N) => (M,N) -- rank-1 A and B, rank-2 C and D, all satisfying is_rmem
+template <class MMA,
+          class TD, class DLayout,
+          class TA, class ALayout,
+          class TB, class BLayout,
+          class TC, class CLayout,
+          __CUTE_REQUIRES(DLayout::rank == 2 && is_rmem<TD>::value &&
+                          ALayout::rank == 1 && is_rmem<TA>::value &&
+                          BLayout::rank == 1 && is_rmem<TB>::value &&
+                          CLayout::rank == 2 && is_rmem<TC>::value)>
+CUTE_HOST_DEVICE
+void
+gemm(MMA_Atom<MMA>       const& mma,
+     Tensor<TD, DLayout>      & D,  // (M,N) Logical data
+     Tensor<TA, ALayout> const& A,  // (M)   Logical data
+     Tensor<TB, BLayout> const& B,  // (N)   Logical data
+     Tensor<TC, CLayout> const& C)  // (M,N) Logical data
+{
+  CUTE_STATIC_ASSERT_V(size<0>(A) == size<0>(C));  // AM == CM
+  CUTE_STATIC_ASSERT_V(size<0>(B) == size<1>(C));  // BN == CN
+  CUTE_STATIC_ASSERT_V(size<0>(C) == size<0>(D) && size<1>(C) == size<1>(D));  // C and D extents agree
+  gemm(mma,  // A and B are viewed through rank-2 layouts so the rank-2 overload is selected
+       D,                                                       // (M,N)
+       make_tensor(A.data(), append<2>(A.layout())),            // (M,1)
+       make_tensor(B.data(), append<2>(B.layout())),            // (N,1)
+       C);                                                      // (M,N)
+}
+
+// Dispatch [3]: (M,K) x (N,K) => (M,N) -- all four tensors rank-2 and satisfying is_rmem
+template <class MMA,
+          class TD, class DLayout,
+          class TA, class ALayout,
+          class TB, class BLayout,
+          class TC, class CLayout,
+          __CUTE_REQUIRES(DLayout::rank == 2 && is_rmem<TD>::value &&
+                          ALayout::rank == 2 && is_rmem<TA>::value &&
+                          BLayout::rank == 2 && is_rmem<TB>::value &&
+                          CLayout::rank == 2 && is_rmem<TC>::value)>
+CUTE_HOST_DEVICE
+void
+gemm(MMA_Atom<MMA>       const& mma,
+     Tensor<TD, DLayout>      & D,  // (M,N) Logical data
+     Tensor<TA, ALayout> const& A,  // (M,K) Logical data
+     Tensor<TB, BLayout> const& B,  // (N,K) Logical data
+     Tensor<TC, CLayout> const& C)  // (M,N) Logical data
+{
+  CUTE_STATIC_ASSERT_V(size<0>(A) == size<0>(C));  // AM == CM
+  CUTE_STATIC_ASSERT_V(size<0>(B) == size<1>(C));  // BN == CN
+  CUTE_STATIC_ASSERT_V(size<1>(A) == size<1>(B));  // AK == BK
+  CUTE_STATIC_ASSERT_V(size<0>(C) == size<0>(D) && size<1>(C) == size<1>(D));  // C and D extents agree
+
+  // Assert this is a 1-value MMA: mode 1 of each of the atom's TV layouts must have size 1
+  CUTE_STATIC_ASSERT_V(size<1>(typename MMA_Atom<MMA>::LayoutC_TV{}) == Int<1>{});
+  CUTE_STATIC_ASSERT_V(size<1>(typename MMA_Atom<MMA>::LayoutA_TV{}) == Int<1>{});
+  CUTE_STATIC_ASSERT_V(size<1>(typename MMA_Atom<MMA>::LayoutB_TV{}) == Int<1>{});
+
+  gemm(mma,  // every operand is viewed through a rank-3 layout so the rank-3 overload is selected
+       make_tensor(D.data(), prepend<3>(D.layout())),      // (1,M,N)
+       make_tensor(A.data(), prepend<3>(A.layout())),      // (1,M,K)
+       make_tensor(B.data(), prepend<3>(B.layout())),      // (1,N,K)
+       make_tensor(C.data(), prepend<3>(C.layout())));     // (1,M,N)
+}
+
+// Dispatch [4]: (V,M) x (V,N) => (V,M,N) -- rank-2 A and B, rank-3 C and D, all satisfying is_rmem
+template <class MMA,
+          class TD, class DLayout,
+          class TA, class ALayout,
+          class TB, class BLayout,
+          class TC, class CLayout,
+          __CUTE_REQUIRES(DLayout::rank == 3 && is_rmem<TD>::value &&
+                          ALayout::rank == 2 && is_rmem<TA>::value &&
+                          BLayout::rank == 2 && is_rmem<TB>::value &&
+                          CLayout::rank == 3 && is_rmem<TC>::value)>
+CUTE_HOST_DEVICE
+void
+gemm(MMA_Atom<MMA>       const& mma,
+     Tensor<TD, DLayout>      & D,  // (V,M,N) Logical data
+     Tensor<TA, ALayout> const& A,  // (V,M)   Logical data
+     Tensor<TB, BLayout> const& B,  // (V,N)   Logical data
+     Tensor<TC, CLayout> const& C)  // (V,M,N) Logical data
+{
+  CUTE_STATIC_ASSERT_V(size<1>(A) == size<1>(C));  // AM == CM
+  CUTE_STATIC_ASSERT_V(size<1>(B) == size<2>(C));  // BN == CN
+  CUTE_STATIC_ASSERT_V(size<0>(C) == size<0>(D) && size<1>(C) == size<1>(D) && size<2>(C) == size<2>(D));
+  auto M = size<1>(A);
+  auto N = size<1>(B);
+  // REGISTER .reuse OPTIMIZATIONS: every branch visits each (m,n) pair once; only the visiting order differs
+  // 64-bit traversal specialization -- serpentine path (V-mode of A and of B each occupy 8 bytes)
+  if constexpr (decltype(size<0>(A))::value * sizeof(typename TA::value_type) == 8 &&
+                decltype(size<0>(B))::value * sizeof(typename TB::value_type) == 8)
+  {
+#if 1 // NOTE: Row- vs Col- major could depend on the C-matrix order... (which we can test)
+    // Row-major serpentine iteration
+    CUTE_UNROLL
+    for (int m = 0; m < M; ++m) {
+      CUTE_UNROLL
+      for (int n = 0; n < N; ++n) {
+        int ns = (m & 1) ? N-1-n : n;  // Serpentine coordinate: odd rows walk n in reverse
+        gemm(mma, D(_,m,ns), A(_,m), B(_,ns), C(_,m,ns));
+      }
+    }
+#else  // compiled out by the `#if 1` above
+    // Col-major serpentine iteration
+    CUTE_UNROLL
+    for (int n = 0; n < N; ++n) {
+      CUTE_UNROLL
+      for (int m = 0; m < M; ++m) {
+        int ms = (n & 1) ? M-1-m : m;  // Serpentine coordinate
+        gemm(mma, D(_,ms,n), A(_,ms), B(_,n), C(_,ms,n));
+      }
+    }
+#endif
+  } else
+  // 32-bit traversal specialization -- kinked serpentine path (V-mode of A and of B each occupy 4 bytes)
+  if constexpr (decltype(size<0>(A))::value * sizeof(typename TA::value_type) == 4 &&
+                decltype(size<0>(B))::value * sizeof(typename TB::value_type) == 4)
+  {
+#if 1  // NOTE: Row- vs Col- major could depend on the C-matrix order... (which we can test)
+    // Row-major kinked serpentine iteration
+    CUTE_UNROLL
+    for (int m = 0; m < M; m += 2) {  // rows are handled in pairs (m, m+1)
+      CUTE_UNROLL
+      for (int n = 0; n < N; ++n) {
+        int ns = (m & 2) ? N-1-n : n;  // direction flips once per pair of rows
+        gemm(mma, D(_,m+0,ns), A(_,m+0), B(_,ns), C(_,m+0,ns));
+
+        if (m+1 < M) {  // second row of the pair is absent when M is odd
+          gemm(mma, D(_,m+1,ns), A(_,m+1), B(_,ns), C(_,m+1,ns));
+        }
+      }
+    }
+#else  // compiled out by the `#if 1` above
+    // Col-major kinked serpentine iteration
+    CUTE_UNROLL
+    for (int n = 0; n < N; n += 2) {
+      CUTE_UNROLL
+      for (int m = 0; m < M; ++m) {
+        // Kinked serpentine traversal for maximum register reuse
+        int ms = (n & 2) ? M-1-m : m;
+        gemm(mma, D(_,ms,n+0), A(_,ms), B(_,n+0), C(_,ms,n+0));
+
+        if (n+1 < N) {
+          gemm(mma, D(_,ms,n+1), A(_,ms), B(_,n+1), C(_,ms,n+1));
+        }
+      }
+    }
+#endif
+  } else
+  // 64-bit + 32-bit traversal order -- keep A (64-bit) in the outer loop and serpentine B
+  if constexpr (decltype(size<0>(A))::value * sizeof(typename TA::value_type) == 8 &&
+                decltype(size<0>(B))::value * sizeof(typename TB::value_type) == 4) {
+    // Row-major serpentine iteration (same loop body as the 64-bit/64-bit branch)
+    CUTE_UNROLL
+    for (int m = 0; m < M; ++m) {
+      CUTE_UNROLL
+      for (int n = 0; n < N; ++n) {
+        int ns = (m & 1) ? N-1-n : n;  // Serpentine coordinate
+        gemm(mma, D(_,m,ns), A(_,m), B(_,ns), C(_,m,ns));
+      }
+    }
+  } else
+  // 32-bit + 64-bit traversal order -- keep B (64-bit) in the outer loop and serpentine A
+  if constexpr (decltype(size<0>(A))::value * sizeof(typename TA::value_type) == 4 &&
+                decltype(size<0>(B))::value * sizeof(typename TB::value_type) == 8) {
+    // Col-major serpentine iteration
+    CUTE_UNROLL
+    for (int n = 0; n < N; ++n) {
+      CUTE_UNROLL
+      for (int m = 0; m < M; ++m) {
+        int ms = (n & 1) ? M-1-m : m;  // Serpentine coordinate: odd columns walk m in reverse
+        gemm(mma, D(_,ms,n), A(_,ms), B(_,n), C(_,ms,n));
+      }
+    }
+  } else
+  // Fallback to serpentine loop for every other combination of V-mode byte sizes
+  {
+    // Col-major serpentine iteration (same loop body as the 32-bit/64-bit branch)
+    CUTE_UNROLL
+    for (int n = 0; n < N; ++n) {
+      CUTE_UNROLL
+      for (int m = 0; m < M; ++m) {
+        int ms = (n & 1) ? M-1-m : m;  // Serpentine coordinate
+        gemm(mma, D(_,ms,n), A(_,ms), B(_,n), C(_,ms,n));
+      }
+    }
+  }
+}
+
+// Dispatch [5]: (V,M,K) x (V,N,K) => (V,M,N) -- all four tensors rank-3 and satisfying is_rmem
+template <class MMA,
+          class TD, class DLayout,
+          class TA, class ALayout,
+          class TB, class BLayout,
+          class TC, class CLayout,
+          __CUTE_REQUIRES(DLayout::rank == 3 && is_rmem<TD>::value &&
+                          ALayout::rank == 3 && is_rmem<TA>::value &&
+                          BLayout::rank == 3 && is_rmem<TB>::value &&
+                          CLayout::rank == 3 && is_rmem<TC>::value)>
+CUTE_HOST_DEVICE
+void
+gemm(MMA_Atom<MMA>       const& mma,
+     Tensor<TD, DLayout>      & D,  // (V,M,N) Logical data
+     Tensor<TA, ALayout> const& A,  // (V,M,K) Logical data
+     Tensor<TB, BLayout> const& B,  // (V,N,K) Logical data
+     Tensor<TC, CLayout> const& C)  // (V,M,N) Logical data
+{
+  CUTE_STATIC_ASSERT_V(size<1>(A) == size<1>(C));  // AM == CM
+  CUTE_STATIC_ASSERT_V(size<1>(B) == size<2>(C));  // BN == CN
+  CUTE_STATIC_ASSERT_V(size<2>(A) == size<2>(B));  // AK == BK
+  CUTE_STATIC_ASSERT_V(size<0>(C) == size<0>(D) && size<1>(C) == size<1>(D) && size<2>(C) == size<2>(D));
+  auto K = size<2>(A);
+  // NOTE(review): every k-step passes the same C, so products accumulate across k only if D and C share storage -- confirm
+  CUTE_UNROLL
+  for (int k = 0; k < K; ++k) {
+    gemm(mma, D, A(_,_,k), B(_,_,k), C);  // rank-2 slices of A and B at fixed k
+  }
+}
+
+//
+// Thread-Local Shared-Memory GEMMs
+//
+
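+// Of the five dispatches, only two have a shared-memory A/B overload in this file:
+//   Dispatch [3]: (M,K) x (N,K) => (M,N)
+//   Dispatch [5]: (V,M,K) x (V,N,K) => (V,M,N)
+// Dispatch [1]: (V) x (V) => (V), Dispatch [2]: (M) x (N) => (M,N), and
+// Dispatch [4]: (V,M) x (V,N) => (V,M,N) are not provided for shared-memory operands here.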
+// Dispatch [3]: (M,K) x (N,K) => (M,N) -- rank-2 tensors; A and B satisfy is_smem, C and D satisfy is_rmem
+template <class MMA,
+          class TD, class DLayout,
+          class TA, class ALayout,
+          class TB, class BLayout,
+          class TC, class CLayout,
+          __CUTE_REQUIRES(DLayout::rank == 2 && is_rmem<TD>::value &&
+                          ALayout::rank == 2 && is_smem<TA>::value &&
+                          BLayout::rank == 2 && is_smem<TB>::value &&
+                          CLayout::rank == 2 && is_rmem<TC>::value)>
+CUTE_HOST_DEVICE
+void
+gemm(MMA_Atom<MMA>       const& mma,
+     Tensor<TD, DLayout>      & D,  // (M,N) Logical data
+     Tensor<TA, ALayout> const& A,  // (M,K) Logical data
+     Tensor<TB, BLayout> const& B,  // (N,K) Logical data
+     Tensor<TC, CLayout> const& C)  // (M,N) Logical data
+{
+  CUTE_STATIC_ASSERT_V(size<0>(A) == size<0>(C));  // AM == CM
+  CUTE_STATIC_ASSERT_V(size<0>(B) == size<1>(C));  // BN == CN
+  CUTE_STATIC_ASSERT_V(size<1>(A) == size<1>(B));  // AK == BK
+  CUTE_STATIC_ASSERT_V(size<0>(C) == size<0>(D) && size<1>(C) == size<1>(D));  // C and D extents agree
+
+  // Assert this is a 1-value MMA: mode 1 of each of the atom's TV layouts must have size 1
+  CUTE_STATIC_ASSERT_V(size<1>(typename MMA_Atom<MMA>::LayoutC_TV{}) == Int<1>{});
+  CUTE_STATIC_ASSERT_V(size<1>(typename MMA_Atom<MMA>::LayoutA_TV{}) == Int<1>{});
+  CUTE_STATIC_ASSERT_V(size<1>(typename MMA_Atom<MMA>::LayoutB_TV{}) == Int<1>{});
+
+  gemm(mma,  // every operand is viewed through a rank-3 layout so the rank-3 overload is selected
+       make_tensor(D.data(), prepend<3>(D.layout())),      // (1,M,N)
+       make_tensor(A.data(), prepend<3>(A.layout())),      // (1,M,K)
+       make_tensor(B.data(), prepend<3>(B.layout())),      // (1,N,K)
+       make_tensor(C.data(), prepend<3>(C.layout())));     // (1,M,N)
+}
+
+// Dispatch [5]: (V,M,K) x (V,N,K) => (V,M,N) -- rank-3 tensors; A and B satisfy is_smem, C and D satisfy is_rmem
+template <class MMA,
+          class TD, class DLayout,
+          class TA, class ALayout,
+          class TB, class BLayout,
+          class TC, class CLayout,
+          __CUTE_REQUIRES(DLayout::rank == 3 && is_rmem<TD>::value &&
+                          ALayout::rank == 3 && is_smem<TA>::value &&
+                          BLayout::rank == 3 && is_smem<TB>::value &&
+                          CLayout::rank == 3 && is_rmem<TC>::value)>
+CUTE_HOST_DEVICE
+void
+gemm(MMA_Atom<MMA>       const& mma,
+     Tensor<TD, DLayout>      & D,  // (V,M,N) Logical data
+     Tensor<TA, ALayout> const& A,  // (V,M,K) Logical data
+     Tensor<TB, BLayout> const& B,  // (V,N,K) Logical data
+     Tensor<TC, CLayout> const& C)  // (V,M,N) Logical data
+{
+  CUTE_STATIC_ASSERT_V(size<1>(A) == size<1>(C));  // AM == CM
+  CUTE_STATIC_ASSERT_V(size<1>(B) == size<2>(C));  // BN == CN
+  CUTE_STATIC_ASSERT_V(size<2>(A) == size<2>(B));  // AK == BK
+  CUTE_STATIC_ASSERT_V(size<0>(C) == size<0>(D) && size<1>(C) == size<1>(D) && size<2>(C) == size<2>(D));
+
+  auto rA = MMA_Atom<MMA>::make_fragment_A(A);  // staging tensor for A -- presumably register-backed; confirm in mma_atom.hpp
+  auto rB = MMA_Atom<MMA>::make_fragment_B(B);  // staging tensor for B -- same assumption
+
+  auto K = size<2>(A);
+
+  CUTE_UNROLL
+  for (int k = 0; k < K; ++k)
+  {
+    copy(A(_,_,k), rA(_,_,k));  // stage the k-th slice of A
+    copy(B(_,_,k), rB(_,_,k));  // stage the k-th slice of B
+    // Thread-level register gemm for k; NOTE(review): C is the same each step, so accumulation relies on D aliasing C
+    gemm(mma, D, rA(_,_,k), rB(_,_,k), C);
+  }
+}
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/algorithm/prefer.hpp b/lightllm-kernel/cutlass/include/cute/algorithm/prefer.hpp
new file mode 100755
index 000000000..a69e50429
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/algorithm/prefer.hpp
@@ -0,0 +1,46 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+namespace cute
+{
+
+// Tag hierarchy: prefer<N> derives from prefer<N-1>, and the chain ends at prefer<0>
+template <size_t N>
+struct prefer : prefer<N-1> {};
+
+template <>
+struct prefer<0> {};
+
+// Can be used to preferentially overload implementations.
+// Higher N in prefer<N> have higher priority.
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/algorithm/prefetch.hpp b/lightllm-kernel/cutlass/include/cute/algorithm/prefetch.hpp
new file mode 100755
index 000000000..c39f63acd
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/algorithm/prefetch.hpp
@@ -0,0 +1,145 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>          // CUTE_HOST_DEVICE
+#include <cute/tensor_impl.hpp>     // cute::Tensor
+#include <cute/atom/copy_atom.hpp>  // cute::Copy_Atom
+
+namespace cute
+{
+
+//
+// Prefetch global tensors into L2
+//
+
+template <uint32_t NumThreads, uint32_t FetchBytes = 64,
+          class GEngine, class GLayout>
+CUTE_HOST_DEVICE
+void
+cooperative_prefetch(uint32_t                 const& tid,
+                     Tensor<GEngine, GLayout> const& src)
+{
+  static_assert(is_gmem<GEngine>::value, "Expected global tensor for prefetch");
+
+  using ElemType = typename GEngine::value_type;
+  constexpr int VecLen = decltype(max_common_vector(src, src))::value;
+
+  if constexpr (VecLen <= 1) {
+    CUTE_UNROLL
+    for (int idx = tid; idx < size(src); idx += NumThreads) {
+      prefetch(raw_pointer_cast(&src(idx)));  // element-wise: tid, tid+NumThreads, ...
+    }
+  } else {
+    // L2 sector is 32B, default fetch granularity is 64B
+    constexpr bool FitsInFetch = (VecLen * sizeof_bits_v<ElemType>) < (FetchBytes * 8);
+    using VecType = conditional_t<FitsInFetch, ArrayEngine<ElemType, VecLen>, uint8_t[FetchBytes]>;
+
+    Tensor vec_src = recast<VecType const>(src);
+    CUTE_UNROLL
+    for (int idx = tid; idx < size(vec_src); idx += NumThreads) {
+      prefetch(raw_pointer_cast(&vec_src(idx)));  // one prefetch per recast (vector or FetchBytes-sized) element
+    }
+  }
+}
+
+template <class GEngine, class GLayout>
+CUTE_HOST_DEVICE
+void
+prefetch(Tensor<GEngine, GLayout> const& src)
+{
+  cooperative_prefetch<1>(0, src);  // single participant: NumThreads = 1, tid = 0
+}
+
+// Prefetch with copy atom: detail::has_prefetch<CopyOp> is true exactly when CopyOp names a nested PREFETCH type
+namespace detail {
+
+template <class CopyOp, class = void>
+constexpr bool has_prefetch = false;
+
+template <class CopyOp>
+constexpr bool has_prefetch<CopyOp, void_t<typename CopyOp::PREFETCH>> = true;
+
+} // end namespace detail
+
+template <class CopyOp, class... CT_Args, class... CA_Args,
+          class GEngine, class GLayout>
+CUTE_HOST_DEVICE
+void
+prefetch(Copy_Atom<Copy_Traits<CopyOp, CT_Args...>, CA_Args...> const& atom,
+         Tensor<GEngine, GLayout>                               const& src)
+{
+  if constexpr (not detail::has_prefetch<CopyOp>) {
+    prefetch(src);  // CopyOp names no PREFETCH operation: plain tensor prefetch
+  } else {
+    // Rebuild the atom around CopyOp::PREFETCH, keeping the remaining traits/atom arguments
+    using PrefetchAtom = Copy_Atom<Copy_Traits<typename CopyOp::PREFETCH, CT_Args...>, CA_Args...>;
+    PrefetchAtom pf_atom{atom};
+    auto& ignored_dst = const_cast<Tensor<GEngine, GLayout>&>(src); // dst is ignored for prefetch atoms
+    copy(pf_atom, src, ignored_dst);
+  }
+}
+
+#if defined(CUTE_COPY_ATOM_TMA_SM90_ENABLED)
+template <class... CT_Args,
+          class SrcEngine, class SrcLayout>
+CUTE_HOST_DEVICE
+void
+prefetch(Copy_Traits<SM90_BULK_COPY_AUTO, CT_Args...> const& atom,  // selects this overload; not read in the body
+         Tensor<SrcEngine, SrcLayout>                 const& src)
+{
+  using SrcType = typename SrcEngine::value_type;
+  static_assert(is_gmem<SrcEngine>::value, "Expected global tensor for L2 prefetch");
+
+  auto tiler = max_common_layout(src, src);               // common layout of src with itself
+  constexpr int vec_elem = decltype(size(tiler))::value;  // static element count of that layout
+  constexpr int vec_bits = vec_elem * sizeof_bits_v<SrcType>;
+  static_assert(vec_bits >= 128, "Expected at least 128-bits for BLKCP");
+
+  // Construct a new concrete Atom of the vector size
+  auto bulk_atom = Copy_Atom<Copy_Traits<SM90_BULK_COPY_G2S, Int<vec_bits>>, SrcType>{};
+
+  return prefetch(bulk_atom, logical_divide(src, tiler));  // continues in the Copy_Atom overload of prefetch
+}
+
+// Backwards-compat. Throw out any extra Copy_Atom args (CA_Args) and forward to the Copy_Traits overload.
+template <class... CT_Args, class... CA_Args,
+          class SrcEngine, class SrcLayout>
+CUTE_HOST_DEVICE
+void
+prefetch(Copy_Atom<Copy_Traits<SM90_BULK_COPY_AUTO, CT_Args...>, CA_Args...> const& atom,
+         Tensor<SrcEngine, SrcLayout>                                        const& src)
+{
+  return prefetch(static_cast<Copy_Traits<SM90_BULK_COPY_AUTO, CT_Args...> const&>(atom), src);  // view the atom as its traits type
+}
+#endif // #if defined(CUTE_COPY_ATOM_TMA_SM90_ENABLED)
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/algorithm/tensor_algorithms.hpp b/lightllm-kernel/cutlass/include/cute/algorithm/tensor_algorithms.hpp
new file mode 100755
index 000000000..dbffc6133
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/algorithm/tensor_algorithms.hpp
@@ -0,0 +1,166 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/** Common algorithms on (hierarchical) tensors */
+
+#pragma once
+
+#include <cute/config.hpp>
+#include <cute/tensor_impl.hpp>
+
+namespace cute
+{
+
+//
+// for_each
+//
+
+template <class Engine, class Layout, class UnaryOp>
+CUTE_HOST_DEVICE constexpr
+void
+for_each(Tensor<Engine,Layout> const& tensor, UnaryOp&& op)
+{
+  CUTE_UNROLL
+  for (int idx = 0; idx < size(tensor); ++idx) {
+    op(tensor(idx));  // const tensor: visit elements in linear index order
+  }
+}
+
+template <class Engine, class Layout, class UnaryOp>
+CUTE_HOST_DEVICE constexpr
+void
+for_each(Tensor<Engine,Layout>& tensor, UnaryOp&& op)
+{
+  CUTE_UNROLL
+  for (int idx = 0; idx < size(tensor); ++idx) {
+    op(tensor(idx));  // non-const tensor: same traversal
+  }
+}
+
+// Accept mutable temporaries: the tensor is passed on as a named lvalue
+template <class Engine, class Layout, class UnaryOp>
+CUTE_HOST_DEVICE constexpr
+void
+for_each(Tensor<Engine,Layout>&& tensor, UnaryOp&& op)
+{
+  for_each(tensor, op);
+}
+
+//
+// transform
+//
+
+// In-place counterpart of std::transform: each element is replaced by op(element); nothing is returned
+template <class Engine, class Layout, class UnaryOp>
+CUTE_HOST_DEVICE constexpr
+void
+transform(Tensor<Engine,Layout>& tensor, UnaryOp&& op)
+{
+  CUTE_UNROLL
+  for (int idx = 0; idx < size(tensor); ++idx) {
+    tensor(idx) = op(tensor(idx));
+  }
+}
+
+// Accept mutable temporaries: the tensor is passed on as a named lvalue
+template <class Engine, class Layout, class UnaryOp>
+CUTE_HOST_DEVICE constexpr
+void
+transform(Tensor<Engine,Layout>&& tensor, UnaryOp&& op)
+{
+  transform(tensor, op);
+}
+
+// Similar to std::transform: applies op to each element of one tensor and assigns the result to another
+template <class EngineIn, class LayoutIn,
+          class EngineOut, class LayoutOut,
+          class UnaryOp>
+CUTE_HOST_DEVICE constexpr
+void
+transform(Tensor<EngineIn, LayoutIn > const& tensor_in,
+          Tensor<EngineOut,LayoutOut>      & tensor_out,
+          UnaryOp&& op)
+{
+  CUTE_UNROLL
+  for (int idx = 0; idx < size(tensor_in); ++idx) {  // bound comes from tensor_in; tensor_out's size is not checked here
+    tensor_out(idx) = op(tensor_in(idx));
+  }
+}
+
+// Accept mutable temporaries: the output tensor is passed on as a named lvalue
+template <class EngineIn, class LayoutIn,
+          class EngineOut, class LayoutOut,
+          class UnaryOp>
+CUTE_HOST_DEVICE constexpr
+void
+transform(Tensor<EngineIn, LayoutIn > const& tensor_in,
+          Tensor<EngineOut,LayoutOut>     && tensor_out,
+          UnaryOp&& op)
+{
+  transform(tensor_in, tensor_out, op);
+}
+
+// Binary form, similar to std::transform with a binary operation.
+// Reads two input tensors and writes one output tensor:
+// tensor_out(i) = op(tensor_in1(i), tensor_in2(i))
+// for every linear index i below size(tensor_in1).
+template <class EngineIn1, class LayoutIn1,
+          class EngineIn2, class LayoutIn2,
+          class EngineOut, class LayoutOut,
+          class BinaryOp>
+CUTE_HOST_DEVICE constexpr
+void
+transform(Tensor<EngineIn1,LayoutIn1> const& tensor_in1,
+          Tensor<EngineIn2,LayoutIn2> const& tensor_in2,
+          Tensor<EngineOut,LayoutOut>      & tensor_out,
+          BinaryOp&& op)
+{
+  CUTE_UNROLL
+  for (int idx = 0; idx < size(tensor_in1); ++idx) {  // sizes of tensor_in2 / tensor_out are not checked here
+    tensor_out(idx) = op(tensor_in1(idx), tensor_in2(idx));
+  }
+}
+
+// Accept mutable temporaries: the output tensor is passed on as a named lvalue
+template <class EngineIn1, class LayoutIn1,
+          class EngineIn2, class LayoutIn2,
+          class EngineOut, class LayoutOut,
+          class BinaryOp>
+CUTE_HOST_DEVICE constexpr
+void
+transform(Tensor<EngineIn1,LayoutIn1> const& tensor_in1,
+          Tensor<EngineIn2,LayoutIn2> const& tensor_in2,
+          Tensor<EngineOut,LayoutOut>     && tensor_out,
+          BinaryOp&& op)
+{
+  transform(tensor_in1, tensor_in2, tensor_out, op);
+}
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/algorithm/tuple_algorithms.hpp b/lightllm-kernel/cutlass/include/cute/algorithm/tuple_algorithms.hpp
new file mode 100755
index 000000000..5a70f590b
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/algorithm/tuple_algorithms.hpp
@@ -0,0 +1,1073 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>
+
+#include <cute/util/type_traits.hpp>
+#include <cute/container/tuple.hpp>
+#include <cute/algorithm/functional.hpp>
+#include <cute/numeric/integer_sequence.hpp>
+#include <cute/numeric/integral_constant.hpp>
+
+/// @file tuple_algorithms.hpp
+/// @brief Common algorithms on (hierarchical) tuples
+///
+/// Code guidelines and style preferences:
+///
+/// For perfect forwarding, don't use std::forward, because it may not
+/// be defined in device code when compiling with NVRTC. Instead, use
+/// `static_cast<ParameterType&&>(parameter_name)`.
+///
+/// CuTe generally does not bother forwarding functions, as
+/// reference-qualified member functions are rare in this code base.
+///
+/// Throughout CUTLASS, cute::make_tuple always needs to be called
+/// namespace-qualified, even if inside the cute namespace and/or in
+/// scope of a "using namespace cute" declaration. Otherwise, the
+/// compiler may select std::make_tuple instead of cute::make_tuple,
+/// due to argument-dependent lookup.
+
+namespace cute
+{
+
+//
+// Apply (Unpack)
+// (t, f) => f(t_0,t_1,...,t_n)
+//
+
+namespace detail {
+
+template <class T, class F, int... I>
+CUTE_HOST_DEVICE constexpr
+auto
+apply(T&& t, F&& f, seq<I...>)
+{
+  return f(get<I>(static_cast<T&&>(t))...);  // one argument per index in I..., taken from the forwarded t
+}
+
+} // end namespace detail
+
+template <class T, class F>
+CUTE_HOST_DEVICE constexpr
+auto
+apply(T&& t, F&& f)
+{
+  return detail::apply(static_cast<T&&>(t), f, tuple_seq<T>{});  // unpack over the index sequence tuple_seq<T>
+}
+
+//
+// Transform Apply
+// (t, f, g) => g(f(t_0),f(t_1),...)
+//
+
+namespace detail {
+
+template <class T, class F, class G, int... I>
+CUTE_HOST_DEVICE constexpr
+auto
+tapply(T&& t, F&& f, G&& g, seq<I...>)  // g(f(t_i)...) for i in I...
+{
+  return g(f(get<I>(static_cast<T&&>(t)))...);
+}
+
+template <class T0, class T1, class F, class G, int... I>
+CUTE_HOST_DEVICE constexpr
+auto
+tapply(T0&& t0, T1&& t1, F&& f, G&& g, seq<I...>)  // g(f(t0_i, t1_i)...) for i in I...
+{
+  return g(f(get<I>(static_cast<T0&&>(t0)),
+             get<I>(static_cast<T1&&>(t1)))...);
+}
+
+template <class T0, class T1, class T2, class F, class G, int... I>
+CUTE_HOST_DEVICE constexpr
+auto
+tapply(T0&& t0, T1&& t1, T2&& t2, F&& f, G&& g, seq<I...>)  // g(f(t0_i, t1_i, t2_i)...) for i in I...
+{
+  return g(f(get<I>(static_cast<T0&&>(t0)),
+             get<I>(static_cast<T1&&>(t1)),
+             get<I>(static_cast<T2&&>(t2)))...);
+}
+
+} // end namespace detail
+
+template <class T, class F, class G>
+CUTE_HOST_DEVICE constexpr
+auto
+transform_apply(T&& t, F&& f, G&& g)
+{
+  if constexpr (is_tuple<remove_cvref_t<T>>::value) {
+    return detail::tapply(static_cast<T&&>(t), f, g, tuple_seq<T>{});  // f per element, then g over all results
+  } else {
+    return g(f(static_cast<T&&>(t)));  // non-tuple: t is handled as the single element
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <class T0, class T1, class F, class G>
+CUTE_HOST_DEVICE constexpr
+auto
+transform_apply(T0&& t0, T1&& t1, F&& f, G&& g)
+{
+  if constexpr (is_tuple<remove_cvref_t<T0>>::value) {  // only T0 is inspected; indices come from tuple_seq<T0>
+    return detail::tapply(static_cast<T0&&>(t0), static_cast<T1&&>(t1), f, g, tuple_seq<T0>{});
+  } else {
+    return g(f(static_cast<T0&&>(t0), static_cast<T1&&>(t1)));
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <class T0, class T1, class T2, class F, class G>
+CUTE_HOST_DEVICE constexpr
+auto
+transform_apply(T0&& t0, T1&& t1, T2&& t2, F&& f, G&& g)
+{
+  if constexpr (is_tuple<remove_cvref_t<T0>>::value) {  // only T0 is inspected; indices come from tuple_seq<T0>
+    return detail::tapply(static_cast<T0&&>(t0), static_cast<T1&&>(t1), static_cast<T2&&>(t2), f, g, tuple_seq<T0>{});
+  } else {
+    return g(f(static_cast<T0&&>(t0), static_cast<T1&&>(t1), static_cast<T2&&>(t2)));
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+//
+// For Each
+// (t, f) => f(t_0),f(t_1),...,f(t_n)
+//
+
+template <class T, class F>
+CUTE_HOST_DEVICE constexpr
+void
+for_each(T&& t, F&& f)
+{
+  if constexpr (is_tuple<remove_cvref_t<T>>::value) {
+    return detail::apply(t, [&](auto&&... a) { (f(static_cast<decltype(a)&&>(a)), ...); }, tuple_seq<T>{});  // comma fold: f on each element in index order; t is passed as an lvalue
+  } else {
+    return f(static_cast<T&&>(t));  // non-tuple: f is applied to t itself
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <class T, class F>
+CUTE_HOST_DEVICE constexpr
+auto
+for_each_leaf(T&& t, F&& f)
+{
+  if constexpr (is_tuple<remove_cvref_t<T>>::value) {
+    return detail::apply(static_cast<T&&>(t), [&](auto&&... a){ return (for_each_leaf(static_cast<decltype(a)&&>(a), f), ...); }, tuple_seq<T>{});  // recurse into every element, in index order
+  } else {
+    return f(static_cast<T&&>(t));  // f is only ever invoked on non-tuple values
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+//
+// Transform
+// (t, f) => (f(t_0),f(t_1),...,f(t_n))
+//
+
+template <class T, class F>
+CUTE_HOST_DEVICE constexpr
+auto
+transform(T const& t, F&& f)
+{
+  if constexpr (is_tuple<T>::value) {
+    return detail::tapply(t, f, [](auto const&... a){ return cute::make_tuple(a...); }, tuple_seq<T>{});  // results of f are collected into a new tuple
+  } else {
+    return f(t);  // non-tuple: plain call, the result is not wrapped
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <class T0, class T1, class F>
+CUTE_HOST_DEVICE constexpr
+auto
+transform(T0 const& t0, T1 const& t1, F&& f)
+{
+  if constexpr (is_tuple<T0>::value) {  // the branch is chosen from T0 alone
+    static_assert(tuple_size<T0>::value == tuple_size<T1>::value, "Mismatched tuple_size");
+    return detail::tapply(t0, t1, f, [](auto const&... a){ return cute::make_tuple(a...); }, tuple_seq<T0>{});  // tuple of f(t0_i, t1_i)
+  } else {
+    return f(t0, t1);
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <class T0, class T1, class T2, class F>
+CUTE_HOST_DEVICE constexpr
+auto
+transform(T0 const& t0, T1 const& t1, T2 const& t2, F&& f)
+{
+  if constexpr (is_tuple<T0>::value) {  // the branch is chosen from T0 alone
+    static_assert(tuple_size<T0>::value == tuple_size<T1>::value, "Mismatched tuple_size");
+    static_assert(tuple_size<T0>::value == tuple_size<T2>::value, "Mismatched tuple_size");
+    return detail::tapply(t0, t1, t2, f, [](auto const&... a){ return cute::make_tuple(a...); }, tuple_seq<T0>{});  // tuple of f(t0_i, t1_i, t2_i)
+  } else {
+    return f(t0, t1, t2);
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <class T, class F>
+CUTE_HOST_DEVICE constexpr
+auto
+transform_leaf(T const& t, F&& f)
+{
+  if constexpr (is_tuple<T>::value) {
+    return transform(t, [&](auto const& a) { return transform_leaf(a, f); });  // rebuild this level, recursing into each element
+  } else {
+    return f(t);  // non-tuple leaf
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <class T0, class T1, class F>
+CUTE_HOST_DEVICE constexpr
+auto
+transform_leaf(T0 const& t0, T1 const& t1, F&& f)
+{
+  if constexpr (is_tuple<T0>::value) {  // recursion is steered by T0 only
+    return transform(t0, t1, [&](auto const& a, auto const& b) { return transform_leaf(a, b, f); });
+  } else {
+    return f(t0, t1);
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+//
+// find and find_if
+//
+
+namespace detail {
+
+template <class T, class F, int I, int... Is>
+CUTE_HOST_DEVICE constexpr
+auto
+find_if(T const& t, F&& f, seq<I,Is...>)
+{
+  if constexpr (decltype(f(get<I>(t)))::value) {  // the result type of f must expose a static ::value
+    return cute::C<I>{};                           // first index at which the predicate holds
+  } else
+  if constexpr (sizeof...(Is) == 0) {
+    return cute::C<I+1>{};                         // no index left: one past the last index tried
+  } else {
+    return find_if(t, f, seq<Is...>{});            // keep searching the remaining indices
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+} // end namespace detail
+
+template <class T, class F>
+CUTE_HOST_DEVICE constexpr
+auto
+find_if(T const& t, F&& f)
+{
+  if constexpr (is_tuple<T>::value) {
+    return detail::find_if(t, f, tuple_seq<T>{});
+  } else {
+    return cute::C<decltype(f(t))::value ? 0 : 1>{};  // non-tuple: C<0> on a match, C<1> otherwise
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <class T, class X>
+CUTE_HOST_DEVICE constexpr
+auto
+find(T const& t, X const& x)  // find_if with the predicate "element == x"
+{
+  return find_if(t, [&](auto const& v) { return v == x; });  // This should always return a static true/false
+}
+
+template <class T, class F>
+CUTE_HOST_DEVICE constexpr
+auto
+any_of(T const& t, F&& f)
+{
+  if constexpr (is_tuple<T>::value) {
+    return detail::apply(cute::transform(t, f), [&] (auto const&... a) { return (false_type{} || ... || a); }, tuple_seq<T>{});  // left fold of || over f(t_i), seeded with false_type
+  } else {
+    return f(t);  // non-tuple: the predicate result itself
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <class T, class F>
+CUTE_HOST_DEVICE constexpr
+auto
+all_of(T const& t, F&& f)
+{
+  if constexpr (is_tuple<T>::value) {
+    return detail::apply(cute::transform(t, f), [&] (auto const&... a) { return (true_type{} && ... && a); }, tuple_seq<T>{});  // left fold of && over f(t_i), seeded with true_type
+  } else {
+    return f(t);  // non-tuple: the predicate result itself
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <class T, class F>
+CUTE_HOST_DEVICE constexpr
+auto
+none_of(T const& t, F&& f)
+{
+  return not any_of(t, f);  // logical negation of any_of
+}
+
+//
+// Filter
+// (t, f) => <f(t_0),f(t_1),...,f(t_n)>
+//
+
+template <class T, class F>
+CUTE_HOST_DEVICE constexpr
+auto
+filter_tuple(T const& t, F&& f)
+{
+  return transform_apply(t, f, [](auto const&... a) { return cute::tuple_cat(a...); });  // the results of f are joined with cute::tuple_cat
+}
+
+template <class T0, class T1, class F>
+CUTE_HOST_DEVICE constexpr
+auto
+filter_tuple(T0 const& t0, T1 const& t1, F&& f)
+{
+  return transform_apply(t0, t1, f, [](auto const&... a) { return cute::tuple_cat(a...); });  // f sees (t0_i, t1_i)
+}
+
+template <class T0, class T1, class T2, class F>
+CUTE_HOST_DEVICE constexpr
+auto
+filter_tuple(T0 const& t0, T1 const& t1, T2 const& t2, F&& f)
+{
+  return transform_apply(t0, t1, t2, f, [](auto const&... a) { return cute::tuple_cat(a...); });  // f sees (t0_i, t1_i, t2_i)
+}
+
+//
+// Fold (Reduce, Accumulate)
+// (t, v, f) => f(...f(f(v,t_0),t_1),...,t_n)
+//
+
+namespace detail {
+
+template <class Fn, class Val>
+struct FoldAdaptor {  // pairs a function with the running value so that fold can be written as a fold expression over |
+  template <class X>
+  CUTE_HOST_DEVICE constexpr auto operator|(X&& x) {
+    auto r = fn_(val_, static_cast<X&&>(x));       // one step: fn(current value, next element)
+    return FoldAdaptor<Fn, decltype(r)>{fn_, r};  // the value type may change from step to step
+  }
+  Fn fn_;
+  Val val_;
+};
+
+template <class T, class V, class F, int... Is>
+CUTE_HOST_DEVICE constexpr
+auto
+fold(T&& t, V const& v, F&& f, seq<Is...>)
+{
+  return (FoldAdaptor<F,V>{f,v} | ... | get<Is>(static_cast<T&&>(t))).val_;  // binary left fold: starts from v, consumes elements in Is order
+}
+
+} // end namespace detail
+
+template <class T, class V, class F>
+CUTE_HOST_DEVICE constexpr
+auto
+fold(T&& t, V const& v, F&& f)
+{
+  if constexpr (is_tuple<remove_cvref_t<T>>::value) {
+    return detail::fold(static_cast<T&&>(t), v, f, tuple_seq<T>{});
+  } else {
+    return f(v, static_cast<T&&>(t));  // non-tuple: a single step with t as the only element
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <class T, class F>
+CUTE_HOST_DEVICE constexpr
+auto
+fold_first(T&& t, F&& f)
+{
+  if constexpr (is_tuple<remove_cvref_t<T>>::value) {
+    return detail::fold(static_cast<T&&>(t), get<0>(t), f, make_range<1,tuple_size<remove_cvref_t<T>>::value>{});  // element 0 is the initial value; the fold runs over indices [1, size)
+  } else {
+    return t;  // non-tuple: returned as-is, f is not called
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+//
+// front, back, take, select, unwrap
+//
+
+// Descend through element 0 of each nested tuple until a non-tuple is reached
+template <class T>
+CUTE_HOST_DEVICE constexpr
+decltype(auto)
+front(T&& t)
+{
+  if constexpr (not is_tuple<remove_cvref_t<T>>::value) {
+    return static_cast<T&&>(t);                 // non-tuple: hand the argument back unchanged
+  } else {
+    return front(get<0>(static_cast<T&&>(t)));  // tuple: continue with its first element
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+// Get the last non-tuple element in a hierarchical tuple
+template <class T>
+CUTE_HOST_DEVICE constexpr
+decltype(auto)
+back(T&& t)
+{
+  if constexpr (is_tuple<remove_cvref_t<T>>::value) {
+    constexpr int N = tuple_size<remove_cvref_t<T>>::value;  // the last element has index N - 1
+
+    // MSVC needs a bit of extra help here deducing return types.
+    // We help it by peeling off the nonrecursive case a level "early."
+    if constexpr (! is_tuple<remove_cvref_t<decltype(get<N - 1>(static_cast<T&&>(t)))>>::value) {
+      return get<N - 1>(static_cast<T&&>(t));        // last element is already a non-tuple
+    } else {
+      return back(get<N - 1>(static_cast<T&&>(t)));  // last element is a tuple: recurse into it
+    }
+  } else {
+    return static_cast<T&&>(t);  // non-tuple: hand the argument back unchanged
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+// Takes the elements in the range [B,E); E == -1 stands for "up to the end"
+template <int B, int E, class T>
+CUTE_HOST_DEVICE constexpr
+auto
+take(T const& t)
+{
+  if constexpr (E == -1) {
+    if constexpr (is_tuple<T>::value) {
+      return take<B,tuple_size<T>::value>(t);  // resolve -1 to the tuple size
+    } else {
+      return take<B,1>(t);                     // a non-tuple is handled with an end index of 1
+    }
+  } else
+  if constexpr (B <= E) {
+    return detail::apply(t, [](auto const&... a) { return cute::make_tuple(a...); }, make_range<B,E>{});  // the result is always a tuple
+  } else {
+    static_assert(B <= E);  // reached only when B > E: turns the bad range into a compile error
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+// Select tuple elements with given indices, in the order the indices are listed.
+template <int... I, class T>
+CUTE_HOST_DEVICE constexpr
+auto
+select(T const& t)
+{
+  return cute::make_tuple(get<I>(t)...);  // one get<I> per listed index; the result is always a tuple
+}
+
+// Tuples are returned as they are; anything else becomes a one-element tuple
+template <class T>
+CUTE_HOST_DEVICE constexpr
+auto
+wrap(T const& t)
+{
+  if constexpr (not is_tuple<T>::value) {
+    return cute::make_tuple(t);  // non-tuple: wrap
+  } else {
+    return t;                    // already a tuple: returned unchanged
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+// Peel single-element tuples: stop at the first non-tuple or at a tuple whose size is not 1
+template <class T>
+CUTE_HOST_DEVICE constexpr
+auto
+unwrap(T const& t)
+{
+  if constexpr (not is_tuple<T>::value) {
+    return t;                  // non-tuple: nothing to peel
+  } else
+  // tuple_size is only queried once T is known to be a tuple
+  if constexpr (tuple_size<T>::value != 1) {
+    return t;                  // size is not 1: keep the tuple
+  } else {
+    return unwrap(get<0>(t));  // exactly one element: continue with it
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+//
+// Flatten and Unflatten
+//
+
+template <class T>
+struct is_flat : true_type {};  // primary template: anything that is not a cute::tuple counts as flat
+
+template <class... Ts>
+struct is_flat<tuple<Ts...>> : bool_constant<(true && ... && (not is_tuple<Ts>::value))> {};  // flat iff no element is itself a tuple
+
+// Flatten a hierarchical tuple to a tuple of depth one
+//   and wrap non-tuples into a rank-1 tuple.
+template <class T>
+CUTE_HOST_DEVICE constexpr
+auto
+flatten_to_tuple(T const& t)
+{
+  if constexpr (is_tuple<T>::value) {
+    if constexpr (is_flat<T>::value) {      // Shortcut for perf
+      return t;
+    } else {
+      return filter_tuple(t, [](auto const& a) { return flatten_to_tuple(a); });  // flatten each element, then concatenate the pieces
+    }
+  } else {
+    return cute::make_tuple(t);  // the result is a tuple even for a non-tuple input
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+// Flatten a hierarchical tuple to a tuple of depth one
+//   and leave non-tuple untouched.
+// For tuples this is exactly what flatten_to_tuple does with them:
+//   flat tuples are returned as they are, nested tuples have their
+//   flattened elements concatenated. The two functions differ only
+//   in that a non-tuple argument is not wrapped here.
+template <class T>
+CUTE_HOST_DEVICE constexpr
+auto
+flatten(T const& t)
+{
+  if constexpr (not is_tuple<T>::value) {
+    return t;
+  } else {
+    return flatten_to_tuple(t);
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+namespace detail {
+
+template <class FlatTuple, class TargetProfile>
+CUTE_HOST_DEVICE constexpr
+auto
+unflatten_impl(FlatTuple const& flat_tuple, TargetProfile const& target_profile)  // returns the pair (piece built so far, unconsumed rest of flat_tuple)
+{
+  if constexpr (is_tuple<TargetProfile>::value) {
+    return fold(target_profile, cute::make_tuple(cute::make_tuple(), flat_tuple), [](auto const& v, auto const& t) {  // state starts as (empty result, whole flat_tuple)
+      auto [result, remaining_tuple] = v;
+      auto [sub_result, sub_tuple] = unflatten_impl(remaining_tuple, t);  // build the piece for profile element t
+      return cute::make_tuple(append(result, sub_result), sub_tuple);     // extend the result, carry the rest forward
+    });
+  } else {
+    return cute::make_tuple(get<0>(flat_tuple), take<1, decltype(rank(flat_tuple))::value>(flat_tuple));  // leaf: consume one element, pass the others on
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+}  // end namespace detail
+
+// Unflatten a flat tuple into a hierarchical tuple
+// @pre flatten(@a flat_tuple) == @a flat_tuple
+// @pre rank(flatten(@a target_profile)) == rank(@a flat_tuple)
+// @post congruent(@a result, @a target_profile)
+// @post flatten(@a result) == @a flat_tuple
+template <class FlatTuple, class TargetProfile>
+CUTE_HOST_DEVICE constexpr
+auto
+unflatten(FlatTuple const& flat_tuple, TargetProfile const& target_profile)
+{
+  auto [unflatten_tuple, flat_remainder] = detail::unflatten_impl(flat_tuple, target_profile);
+  CUTE_STATIC_ASSERT_V(rank(flat_remainder) == Int<0>{});  // every element of flat_tuple must have been consumed
+  return unflatten_tuple;
+}
+
+//
+// insert and remove and replace
+//
+
+namespace detail {
+
+// Shortcut around cute::tuple_cat for common insert/remove/repeat cases
+template <class T, class X, int... I, int... J, int... K>
+CUTE_HOST_DEVICE constexpr
+auto
+construct(T const& t, X const& x, seq<I...>, seq<J...>, seq<K...>)
+{
+  return cute::make_tuple(get<I>(t)..., (void(J),x)..., get<K>(t)...);  // t's I-elements, then one copy of x per J (J's values are discarded), then t's K-elements
+}
+
+} // end namespace detail
+
+// Insert x into the Nth position of the tuple
+template <int N, class T, class X>
+CUTE_HOST_DEVICE constexpr
+auto
+insert(T const& t, X const& x)
+{
+  return detail::construct(t, x, make_seq<N>{}, seq<0>{}, make_range<N,tuple_size<T>::value>{});  // elements [0,N), x once, elements [N,size)
+}
+
+// Remove the Nth element of the tuple
+template <int N, class T>
+CUTE_HOST_DEVICE constexpr
+auto
+remove(T const& t)
+{
+  return detail::construct(t, 0, make_seq<N>{}, seq<>{}, make_range<N+1,tuple_size<T>::value>{});  // elements [0,N) and [N+1,size); the 0 is a placeholder that is never emitted (empty J pack)
+}
+
+// Replace the Nth element of the tuple with x (a non-tuple t only accepts N == 0 and is replaced by x itself)
+template <int N, class T, class X>
+CUTE_HOST_DEVICE constexpr
+auto
+replace(T const& t, X const& x)
+{
+  if constexpr (not is_tuple<T>::value) {
+    static_assert(N == 0);
+    return x;
+  } else {
+    return detail::construct(t, x, make_seq<N>{}, seq<0>{}, make_range<N+1,tuple_size<T>::value>{});
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+// Replace the first element of the tuple with x (a non-tuple t is replaced by x itself)
+template <class T, class X>
+CUTE_HOST_DEVICE constexpr
+auto
+replace_front(T const& t, X const& x)
+{
+  if constexpr (not is_tuple<T>::value) {
+    return x;
+  } else {
+    return detail::construct(t, x, seq<>{}, seq<0>{}, make_range<1,tuple_size<T>::value>{});
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+// Replace the last element of the tuple with x (a non-tuple t is replaced by x itself)
+template <class T, class X>
+CUTE_HOST_DEVICE constexpr
+auto
+replace_back(T const& t, X const& x)
+{
+  if constexpr (not is_tuple<T>::value) {
+    return x;
+  } else {
+    return detail::construct(t, x, make_seq<tuple_size<T>::value-1>{}, seq<0>{}, seq<>{});
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+//
+// Make a tuple of Xs of tuple_size N (the result is built by make_tuple for every N, including N == 1)
+//
+
+template <int N, class X>
+CUTE_HOST_DEVICE constexpr
+auto
+tuple_repeat(X const& x)
+{
+  return detail::construct(0, x, seq<>{}, make_seq<N>{}, seq<>{});  // empty first and last sequences: the 0 passed as t is never read
+}
+
+//
+// Make repeated Xs of rank N: x itself when N == 1, otherwise a tuple holding N copies of x
+//
+
+template <int N, class X>
+CUTE_HOST_DEVICE constexpr
+auto
+repeat(X const& x)
+{
+  if constexpr (N != 1) {
+    return detail::construct(0, x, seq<>{}, make_seq<N>{}, seq<>{});
+  } else {
+    return x;
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+//
+// Make a tuple of Xs the same profile as tuple T: recursing through t, every non-tuple leaf becomes x
+//
+
+template <class T, class X>
+CUTE_HOST_DEVICE constexpr
+auto
+repeat_like(T const& t, X const& x)
+{
+  if constexpr (not is_tuple<T>::value) {
+    return x;
+  } else {
+    return transform(t, [&](auto const& sub) { return repeat_like(sub,x); });
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+// Group the elements [B,E) of a T into a single element; E == -1 is shorthand for the end of T
+// e.g. group<2,4>(T<_1,_2,_3,_4,_5,_6>{})
+//              => T<_1,_2,T<_3,_4>,_5,_6>{}
+template <int B, int E, class T>
+CUTE_HOST_DEVICE constexpr
+auto
+group(T const& t)
+{
+  if constexpr (not is_tuple<T>::value) {  // non-tuple t: its end bound is taken to be 1
+    if constexpr (E == -1) {
+      return group<B,1>(t);
+    } else {
+      return detail::construct(t, take<B,E>(t), make_seq<B>{}, make_seq<(B < E)>{}, make_range<E,1>{});
+    }
+  } else
+  if constexpr (E == -1) {
+    return group<B,tuple_size<T>::value>(t);  // resolve the -1 shorthand to the tuple's size
+  } else
+  if constexpr (B <= E) {
+    return detail::construct(t, take<B,E>(t), make_seq<B>{}, make_seq<(B < E)>{}, make_range<E,tuple_size<T>::value>{});  // (B < E) converts to 1 or 0: the group is emitted once, or not at all when B == E
+  } else {
+    static_assert(B <= E);  // only instantiated when B > E, so it always fails with a diagnostic
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+// Extend a T to rank N by appending/prepending an element; the element is repeated until rank N is reached.
+// A T whose rank is already N is returned unchanged, and a non-tuple T is treated as having rank 1.
+// NOTE(review): unlike prepend<N>, the non-tuple branch of append<N> has no static_assert(N > 1) -- confirm this is intended.
+
+template <int N, class T, class X>
+CUTE_HOST_DEVICE constexpr
+auto
+append(T const& a, X const& x)
+{
+  if constexpr (is_tuple<T>::value) {
+    if constexpr (N == tuple_size<T>::value) {
+      return a;
+    } else {
+      static_assert(N > tuple_size<T>::value);  // shrinking is not supported
+      return detail::construct(a, x, make_seq<tuple_size<T>::value>{}, make_seq<N-tuple_size<T>::value>{}, seq<>{});  // all of a, then N - tuple_size<T> copies of x
+    }
+  } else {
+    if constexpr (N == 1) {
+      return a;
+    } else {
+      return detail::construct(cute::make_tuple(a), x, seq<0>{}, make_seq<N-1>{}, seq<>{});  // a wrapped as a 1-tuple, then N-1 copies of x
+    }
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <class T, class X>
+CUTE_HOST_DEVICE constexpr
+auto
+append(T const& a, X const& x)  // x becomes one extra trailing element
+{
+  if constexpr (not is_tuple<T>::value) {
+    return cute::make_tuple(a, x);  // non-tuple a: pair it with x
+  } else {
+    return detail::construct(a, x, make_seq<tuple_size<T>::value>{}, seq<0>{}, seq<>{});
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <int N, class T, class X>
+CUTE_HOST_DEVICE constexpr
+auto
+prepend(T const& a, X const& x)  // extend a to rank N by placing copies of x in front of it
+{
+  if constexpr (is_tuple<T>::value) {
+    if constexpr (N == tuple_size<T>::value) {
+      return a;  // already rank N: nothing to add
+    } else {
+      static_assert(N > tuple_size<T>::value);  // shrinking is not supported
+      return detail::construct(a, x, seq<>{}, make_seq<N-tuple_size<T>::value>{}, make_seq<tuple_size<T>::value>{});  // N - tuple_size<T> copies of x, then all of a
+    }
+  } else {
+    if constexpr (N == 1) {
+      return a;  // a non-tuple counts as rank 1
+    } else {
+      static_assert(N > 1);
+      return detail::construct(cute::make_tuple(a), x, seq<>{}, make_seq<N-1>{}, seq<0>{});  // N-1 copies of x, then a
+    }
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <class T, class X>
+CUTE_HOST_DEVICE constexpr
+auto
+prepend(T const& a, X const& x)  // x becomes one extra leading element
+{
+  if constexpr (not is_tuple<T>::value) {
+    return cute::make_tuple(x, a);  // non-tuple a: pair x with it
+  } else {
+    return detail::construct(a, x, seq<>{}, seq<0>{}, make_seq<tuple_size<T>::value>{});
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+// Inclusive scan (prefix sum): with a running value r seeded by v, each element e is
+//   replaced in index order by r = f(r, e), so an element is included in its own result.
+//   The scan is unrolled through recursion, one step per index of the seq argument.
+
+namespace detail {
+
+template <class T, class V, class F, int I, int... Is>
+CUTE_HOST_DEVICE constexpr
+auto
+iscan(T const& t, V const& v, F&& f, seq<I,Is...>)
+{
+  // Apply the function to v and the element at I
+  auto v_next = f(v, get<I>(t));
+  // Replace I with v_next
+  auto t_next = replace<I>(t, v_next);
+
+#if 0
+  std::cout << "ISCAN i" << I << std::endl;
+  std::cout << "  t      " << t << std::endl;
+  std::cout << "  i      " << v << std::endl;
+  std::cout << "  f(i,t) " << v_next << std::endl;
+  std::cout << "  t_n    " << t_next << std::endl;
+#endif
+
+  if constexpr (sizeof...(Is) == 0) {
+    return t_next;  // I was the last index: the scan is complete
+  } else {
+    return iscan(t_next, v_next, f, seq<Is...>{});  // carry the running value into the remaining indices
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+} // end namespace detail
+
+template <class T, class V, class F>
+CUTE_HOST_DEVICE constexpr
+auto
+iscan(T const& t, V const& v, F&& f)  // public entry point: v is the seed, f the binary combining function
+{
+  return detail::iscan(t, v, f, tuple_seq<T>{});  // tuple_seq<T> supplies the indices to scan -- presumably 0..tuple_size<T>-1, defined outside this view
+}
+
+// Exclusive scan (prefix sum): each element is replaced by the running value *before* that
+//   element is folded in, so the first scanned element becomes the seed v and the last
+//   scanned element of t never contributes to the result.
+
+namespace detail {
+
+template <class T, class V, class F, int I, int... Is>
+CUTE_HOST_DEVICE constexpr
+auto
+escan(T const& t, V const& v, F&& f, seq<I,Is...>)
+{
+  if constexpr (sizeof...(Is) == 0) {
+    // Replace I with v
+    return replace<I>(t, v);
+  } else {
+    // Apply the function to v and the element at I
+    auto v_next = f(v, get<I>(t));
+    // Replace I with v
+    auto t_next = replace<I>(t, v);
+
+#if 0
+    std::cout << "ESCAN i" << I << std::endl;
+    std::cout << "  t      " << t << std::endl;
+    std::cout << "  i      " << v << std::endl;
+    std::cout << "  f(i,t) " << v_next << std::endl;
+    std::cout << "  t_n    " << t_next << std::endl;
+#endif
+
+    // Recurse
+    return escan(t_next, v_next, f, seq<Is...>{});
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+} // end namespace detail
+
+template <class T, class V, class F>
+CUTE_HOST_DEVICE constexpr
+auto
+escan(T const& t, V const& v, F&& f)  // public entry point: v is the seed, f the binary combining function
+{
+  return detail::escan(t, v, f, tuple_seq<T>{});  // tuple_seq<T> supplies the indices to scan -- presumably 0..tuple_size<T>-1, defined outside this view
+}
+
+//
+// Zip (Transpose)
+//
+
+// Take       ((a,b,c,...),(x,y,z,...),...)        rank-R0 x rank-R1 input
+// to produce ((a,x,...),(b,y,...),(c,z,...),...)  rank-R1 x rank-R0 output
+
+namespace detail {
+
+template <int J, class... Ts>
+CUTE_HOST_DEVICE constexpr
+auto
+zip_(Ts const&... ts)  // helper: build one output tuple of the transpose
+{
+  return cute::make_tuple(get<J>(ts)...);  // gather element J of every argument, in argument order
+}
+
+template <class T, int... Is, int... Js>
+CUTE_HOST_DEVICE constexpr
+auto
+zip(T const& t, seq<Is...>, seq<Js...>)  // Is index the elements of t, Js index within each element
+{
+  static_assert(conjunction<bool_constant<tuple_size<tuple_element_t<0,T>>::value == tuple_size<tuple_element_t<Is,T>>::value>...>::value, "Mismatched Ranks");  // every element Is of t must match the tuple_size of element 0
+  return cute::make_tuple(zip_<Js>(get<Is>(t)...)...);  // one output tuple per J, each collecting element J of every get<Is>(t)
+}
+
+} // end namespace detail
+
+template <class T>
+CUTE_HOST_DEVICE constexpr
+auto
+zip(T const& t)  // transpose a tuple of tuples; see the Zip (Transpose) description above
+{
+  if constexpr (is_tuple<T>::value) {
+    if constexpr (is_tuple<tuple_element_t<0,T>>::value) {  // only element 0 is inspected to pick the branch
+      return detail::zip(t, tuple_seq<T>{}, tuple_seq<tuple_element_t<0,T>>{});
+    } else {
+      return cute::make_tuple(t);  // element 0 is not a tuple: wrap t in a one-element tuple
+    }
+  } else {
+    return t;  // non-tuple input is returned unchanged
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+// Convenience overload for two or more arguments: zip(t0, t1, ts...) is zip(make_tuple(t0, t1, ts...))
+template <class T0, class T1, class... Ts>
+CUTE_HOST_DEVICE constexpr
+auto
+zip(T0 const& t0, T1 const& t1, Ts const&... ts)
+{
+  return zip(cute::make_tuple(t0, t1, ts...));
+}
+
+//
+// zip2_by -- A guided zip for rank-2 tuples
+//   Take a tuple like ((A,a),((B,b),(C,c)),d)
+//   and produce a tuple ((A,(B,C)),(a,(b,c),d))
+//   where the rank-2 modes are selected by the terminals of the guide (X,(X,X))
+//
+
+namespace detail {
+
+template <class T, class TG, int... Is, int... Js>
+CUTE_HOST_DEVICE constexpr
+auto
+zip2_by(T const& t, TG const& guide, seq<Is...>, seq<Js...>)
+{
+  // Recurse per guided mode Is; each entry is then read back as a (first,second) pair like (A,a)
+  auto pairs = cute::make_tuple(zip2_by(get<Is>(t), get<Is>(guide))...);
+
+  // Collect the firsts into one tuple, and the seconds followed by the unguided modes Js of t into another
+  return cute::make_tuple(cute::make_tuple(get<0>(get<Is>(pairs))...),
+                          cute::make_tuple(get<1>(get<Is>(pairs))..., get<Js>(t)...));
+}
+
+} // end namespace detail
+
+template <class T, class TG>
+CUTE_HOST_DEVICE constexpr
+auto
+zip2_by(T const& t, TG const& guide)
+{
+  if constexpr (not is_tuple<TG>::value) {
+    // Guide terminal: t must itself be a rank-2 mode and is passed through untouched
+    static_assert(tuple_size<T>::value == 2, "Mismatched ranks");
+    return t;
+  } else {
+    constexpr int RankT = tuple_size<T>::value;
+    constexpr int RankG = tuple_size<TG>::value;
+    static_assert(RankT >= RankG, "Mismatched ranks");
+    // Modes [0,RankG) follow the guide recursively; modes [RankG,RankT) of t are appended as-is
+    return detail::zip2_by(t, guide, make_range<0, RankG>{}, make_range<RankG, RankT>{});
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+/// @return A tuple of the elements of @c t in reverse order (a non-tuple @c t is returned as-is).
+template <class T>
+CUTE_HOST_DEVICE constexpr
+auto
+reverse(T const& t)
+{
+  if constexpr (not is_tuple<T>::value) {
+    return t;
+  } else {
+    return detail::apply(t, [](auto const&... elems){ return cute::make_tuple(elems...); }, tuple_rseq<T>{});
+  }
+}
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/arch/cluster_sm90.hpp b/lightllm-kernel/cutlass/include/cute/arch/cluster_sm90.hpp
new file mode 100755
index 000000000..8fff51be8
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/arch/cluster_sm90.hpp
@@ -0,0 +1,245 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>
+
+// Config: both features require a device compilation pass with __CUDA_ARCH__ >= 900
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && \
+  ((__CUDACC_VER_MAJOR__ >= 12) || ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 8))))
+#  define CUTE_ARCH_CLUSTER_SM90_ENABLED
+#endif // cluster support additionally requires nvcc 11.8 or newer
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && (__CUDACC_VER_MAJOR__ >= 12))
+#  define CUTE_ARCH_ELECT_ONE_SM90_ENABLED
+#endif // elect.sync support additionally requires nvcc major version 12 or newer
+
+namespace cute {
+
+CUTE_DEVICE void cluster_arrive_relaxed()  // issues PTX barrier.cluster.arrive.relaxed.aligned
+{
+#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED)
+  asm volatile("barrier.cluster.arrive.relaxed.aligned;\n" : : );
+#else  // cluster support compiled out: report an invalid control path
+  CUTE_INVALID_CONTROL_PATH("CUTE_ARCH_CLUSTER_SM90_ENABLED is not defined");
+#endif
+}
+
+CUTE_DEVICE void cluster_arrive()  // issues PTX barrier.cluster.arrive.aligned
+{
+#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED)
+  asm volatile("barrier.cluster.arrive.aligned;\n" : : );
+#else  // cluster support compiled out: report an invalid control path
+  CUTE_INVALID_CONTROL_PATH("CUTE_ARCH_CLUSTER_SM90_ENABLED is not defined");
+#endif
+}
+
+CUTE_DEVICE void cluster_wait()  // issues PTX barrier.cluster.wait.aligned
+{
+#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED)
+  asm volatile("barrier.cluster.wait.aligned;\n" : : );
+#else  // cluster support compiled out: report an invalid control path
+  CUTE_INVALID_CONTROL_PATH("CUTE_ARCH_CLUSTER_SM90_ENABLED is not defined");
+#endif
+}
+
+CUTE_DEVICE void cluster_sync()  // cluster_arrive() immediately followed by cluster_wait()
+{
+#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED)
+  cluster_arrive();
+  cluster_wait();
+#else  // cluster support compiled out: report an invalid control path
+  CUTE_INVALID_CONTROL_PATH("CUTE_ARCH_CLUSTER_SM90_ENABLED is not defined");
+#endif
+}
+
+// Returns the dim3 grid size in terms of number of clusters (read from %nclusterid); without cluster support, device code returns gridDim instead.
+CUTE_DEVICE dim3 cluster_grid_dims()
+{
+#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED)
+  uint32_t x, y, z;
+  asm volatile("mov.u32 %0, %%nclusterid.x;\n" : "=r"(x) : );
+  asm volatile("mov.u32 %0, %%nclusterid.y;\n" : "=r"(y) : );
+  asm volatile("mov.u32 %0, %%nclusterid.z;\n" : "=r"(z) : );
+  return {x, y, z};
+#elif defined(__CUDA_ARCH__)
+  // MSVC requires protecting use of gridDim with __CUDA_ARCH__.
+  return gridDim;
+#elif defined(_MSC_VER)
+  CUTE_INVALID_CONTROL_PATH("cluster_grid_dims() can only be called on device");
+  return {0, 0, 0};
+#else  // __CUDA_ARCH__ undefined on a non-MSVC compiler: placeholder value only
+  return {0, 0, 0};
+#endif
+}
+
+// Returns the dim3 cluster rank in the grid (read from %clusterid); without cluster support, device code returns blockIdx instead.
+CUTE_DEVICE dim3 cluster_id_in_grid()
+{
+#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED)
+  uint32_t x, y, z;
+  asm volatile("mov.u32 %0, %%clusterid.x;\n" : "=r"(x) : );
+  asm volatile("mov.u32 %0, %%clusterid.y;\n" : "=r"(y) : );
+  asm volatile("mov.u32 %0, %%clusterid.z;\n" : "=r"(z) : );
+  return {x, y, z};
+#elif defined(__CUDA_ARCH__)
+  // MSVC requires protecting use of blockIdx with __CUDA_ARCH__.
+  return blockIdx;
+#elif defined(_MSC_VER)
+  CUTE_INVALID_CONTROL_PATH("cluster_id_in_grid() can only be called on device");
+  return {0, 0, 0};
+#else  // __CUDA_ARCH__ undefined on a non-MSVC compiler: placeholder value only
+  return {0, 0, 0};
+#endif
+}
+
+// Returns the relative dim3 block rank local to the cluster (read from %cluster_ctaid); {0,0,0} when cluster support is disabled.
+CUTE_DEVICE dim3 block_id_in_cluster()
+{
+#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED)
+  uint32_t x, y, z;
+  asm volatile("mov.u32 %0, %%cluster_ctaid.x;\n" : "=r"(x) : );
+  asm volatile("mov.u32 %0, %%cluster_ctaid.y;\n" : "=r"(y) : );
+  asm volatile("mov.u32 %0, %%cluster_ctaid.z;\n" : "=r"(z) : );
+  return {x, y, z};
+#else  // no diagnostic on this path: callers silently get the origin
+  return {0,0,0};
+#endif
+}
+
+// Returns the dim3 cluster shape (read from %cluster_nctaid); {1,1,1} when cluster support is disabled.
+CUTE_DEVICE dim3 cluster_shape()
+{
+#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED)
+  uint32_t x, y, z;
+  asm volatile("mov.u32 %0, %%cluster_nctaid.x;\n" : "=r"(x) : );
+  asm volatile("mov.u32 %0, %%cluster_nctaid.y;\n" : "=r"(y) : );
+  asm volatile("mov.u32 %0, %%cluster_nctaid.z;\n" : "=r"(z) : );
+  return {x, y, z};
+#else  // no diagnostic on this path: callers silently get a 1x1x1 shape
+  return {1,1,1};
+#endif
+}
+
+// Get 1D ctaid in a cluster (read from %cluster_ctarank); 0 when cluster support is disabled.
+CUTE_DEVICE uint32_t block_rank_in_cluster()
+{
+#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED)
+  uint32_t rank;
+  asm volatile("mov.u32 %0, %%cluster_ctarank;\n" : "=r"(rank) :);
+  return rank;
+#else  // no diagnostic on this path: callers silently get rank 0
+  return 0;
+#endif
+}
+
+// Set the destination block-ID in cluster for a given SMEM Address (PTX mapa.shared::cluster.u32); returns smemAddr unchanged when cluster support is disabled.
+CUTE_DEVICE uint32_t set_block_rank(uint32_t smemAddr, uint32_t rank)
+{
+#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED)
+  uint32_t result;
+  asm volatile("mapa.shared::cluster.u32  %0, %1, %2;\n"
+              : "=r"(result)
+              : "r"(smemAddr), "r"(rank));
+  return result;
+#else  // no diagnostic on this path: the address is passed through as-is
+  return smemAddr;
+#endif
+}
+
+// Elect one thread in the warp. The elected thread gets its predicate set to true, all others obtain false.
+CUTE_HOST_DEVICE uint32_t elect_one_sync()
+{
+#if defined(CUTE_ARCH_ELECT_ONE_SM90_ENABLED)
+  uint32_t pred = 0;
+  uint32_t laneid = 0;  // written by the asm below but not returned by this function
+  asm volatile(
+    "{\n"
+    ".reg .b32 %%rx;\n"
+    ".reg .pred %%px;\n"
+    "     elect.sync %%rx|%%px, %2;\n"
+    "@%%px mov.s32 %1, 1;\n"
+    "     mov.s32 %0, %%rx;\n"
+    "}\n"
+    : "+r"(laneid), "+r"(pred)
+    : "r"(0xFFFFFFFF));  // %0 = laneid, %1 = pred (set to 1 only where %px holds), %2 = all-ones mask operand of elect.sync
+  return pred;
+#elif defined(__CUDA_ARCH__)
+  return (threadIdx.x % 32) == 0;  // device code without elect.sync: threads with threadIdx.x % 32 == 0 report true
+#else
+  return true;  // __CUDA_ARCH__ undefined: always reports true
+#endif
+}
+
+struct ElectOneLaneIdReturnType {  // result of elect_one_leader_sync()
+  uint32_t is_leader;       // nonzero on the thread reported as elected
+  uint32_t leader_lane_id;  // register output of elect.sync; 0 on the fallback paths
+};
+
+CUTE_HOST_DEVICE
+ElectOneLaneIdReturnType
+elect_one_leader_sync()  // like elect_one_sync(), but also returns the register output of elect.sync
+{
+#if defined(CUTE_ARCH_ELECT_ONE_SM90_ENABLED)
+  uint32_t pred = 0;
+  uint32_t laneid = 0;
+  asm volatile(
+    "{\n"
+    ".reg .b32 %%rx;\n"
+    ".reg .pred %%px;\n"
+    "     elect.sync %%rx|%%px, %2;\n"
+    "@%%px mov.s32 %1, 1;\n"
+    "     mov.s32 %0, %%rx;\n"
+    "}\n"
+    : "+r"(laneid), "+r"(pred)
+    : "r"(0xFFFFFFFF));  // %0 = laneid, %1 = pred (set to 1 only where %px holds), %2 = all-ones mask operand of elect.sync
+  return {pred, laneid};
+#elif defined(__CUDA_ARCH__)
+  return {(threadIdx.x % 32) == 0, 0};  // device code without elect.sync: threadIdx.x % 32 == 0 leads, lane id reported as 0
+#else
+  return {true, 0};  // __CUDA_ARCH__ undefined: always leader, lane id 0
+#endif
+}
+
+// Store value to remote shared memory in the cluster; the body is empty (no store, no diagnostic) when cluster support is disabled
+CUTE_DEVICE
+void
+store_shared_remote(uint32_t value, uint32_t smem_addr, uint32_t mbarrier_addr, uint32_t dst_cta_rank)
+{
+#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED)
+  uint32_t dsmem_addr = set_block_rank(smem_addr, dst_cta_rank);  // map the data address to block dst_cta_rank
+  uint32_t remote_barrier_addr = set_block_rank(mbarrier_addr, dst_cta_rank);  // map the mbarrier address the same way
+  asm volatile("st.async.shared::cluster.mbarrier::complete_tx::bytes.u32 [%0], %1, [%2];"
+               : : "r"(dsmem_addr), "r"(value), "r"(remote_barrier_addr));
+#endif
+}
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/arch/config.hpp b/lightllm-kernel/cutlass/include/cute/arch/config.hpp
new file mode 100755
index 000000000..84d7779a3
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/arch/config.hpp
@@ -0,0 +1,50 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cutlass/arch/config.h> // CUTLASS_ARCH_MMA_SMxx_ENABLED
+
+// TMA instructions: enabled whenever CUTLASS_ARCH_MMA_SM90_ENABLED is defined
+#if defined(CUTLASS_ARCH_MMA_SM90_ENABLED)
+#  define CUTE_ARCH_TMA_SM90_ENABLED
+#endif
+
+#if defined(CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90_ENABLED)
+#  define CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED
+#endif // mirrors CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90_ENABLED under a CuTe name
+
+// STSM: gated on the same CUTLASS macro as the TMA instructions
+#if defined(CUTLASS_ARCH_MMA_SM90_ENABLED)
+#  define CUTE_ARCH_STSM_SM90_ENABLED
+#endif
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/lightllm-kernel/cutlass/include/cute/arch/copy.hpp b/lightllm-kernel/cutlass/include/cute/arch/copy.hpp
new file mode 100755
index 000000000..513928999
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/arch/copy.hpp
@@ -0,0 +1,107 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>
+
+#include <cute/arch/util.hpp>
+#include <cute/numeric/numeric_types.hpp>
+
+namespace cute
+{
+
+//
+// Direct Copy for any type: assigns dst = D(S(src)) for a single value
+//
+
+template <class S, class D = S>
+struct UniversalCopy
+{
+  using SRegisters = S[1];  // one source value of type S
+  using DRegisters = D[1];  // one destination value of type D
+
+  template <class S_, class D_>
+  CUTE_HOST_DEVICE static constexpr void
+  copy(S_ const& src,
+       D_      & dst)
+  {
+    dst = static_cast<D>(static_cast<S>(src));  // convert through S first, then to D
+  }
+
+  // Accept mutable temporaries
+  template <class S_, class D_>
+  CUTE_HOST_DEVICE static constexpr void
+  copy(S_ const& src,
+       D_     && dst)
+  {
+    UniversalCopy<S,D>::copy(src, dst);  // dst is a named (lvalue) parameter here, so this resolves to the D_& overload
+  }
+};
+
+//
+// Placeholder for the copy algorithm's stronger auto-vectorizing behavior
+//   that assumes alignment of pointers and dynamic layouts up to MaxVecBits
+//
+
+template <int MaxVecBits = 128>
+struct AutoVectorizingCopyWithAssumedAlignment
+     : UniversalCopy<uint_bit_t<MaxVecBits>>  // copy() is inherited; uint_bit_t is presumably the MaxVecBits-wide unsigned type (defined outside this view)
+{
+  static_assert(MaxVecBits == 8 || MaxVecBits == 16 || MaxVecBits == 32 || MaxVecBits == 64 || MaxVecBits == 128,
+                "Expected MaxVecBits to be 8 or 16 or 32 or 64 or 128 for alignment and performance.");
+};
+
+//
+// AutoVectorizingCopy alias assumes maximal alignment of pointers and dynamic strides.
+//   If this is not the case then AutoVectorizingCopyWithAssumedAlignment should be used instead
+//
+
+using AutoVectorizingCopy = AutoVectorizingCopyWithAssumedAlignment<128>;  // MaxVecBits = 128, the largest value the static_assert accepts
+
+//
+// DefaultCopy alias does not assume alignment of pointers or dynamic strides.
+//
+
+using DefaultCopy = AutoVectorizingCopyWithAssumedAlignment<8>;  // MaxVecBits = 8, the smallest value the static_assert accepts
+
+//
+// Global memory prefetch into L2 (the body is empty when __CUDA_ARCH__ is undefined)
+//
+
+CUTE_HOST_DEVICE static void
+prefetch(void const* gmem_ptr)
+{
+#if defined(__CUDA_ARCH__)
+  asm volatile("prefetch.global.L2 [%0];\n" : : "l"(gmem_ptr) : "memory");  // "l": gmem_ptr is passed as a 64-bit register operand
+#endif
+}
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/arch/copy_sm50.hpp b/lightllm-kernel/cutlass/include/cute/arch/copy_sm50.hpp
new file mode 100755
index 000000000..925d9ebe3
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/arch/copy_sm50.hpp
@@ -0,0 +1,98 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>
+
+#include <cute/arch/copy.hpp>
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 500
+  #define CUTE_ARCH_WARP_SHUFFLE_ENABLED 1
+#endif // the shuffle copies below are enabled only in device passes with __CUDA_ARCH__ >= 500
+
+namespace cute
+{
+// Shuffle data between thread pair (0, 1), (2, 3), etc. Each thread writes only one of its two destinations.
+struct SM50_Shuffle_U32_2x2Trans_XOR1
+{
+  using SRegisters = uint32_t[2];
+  using DRegisters = uint32_t[2];
+
+  CUTE_HOST_DEVICE static void
+  copy(uint32_t const& src0, uint32_t const& src1, uint32_t& dst0, uint32_t& dst1)
+  {
+#if defined(CUTE_ARCH_WARP_SHUFFLE_ENABLED)
+    uint32_t x0 = src0;
+    uint32_t y0 = __shfl_xor_sync(0xffffffff, x0, 1);  // partner's src0
+
+    uint32_t x1 = src1;
+    uint32_t y1 = __shfl_xor_sync(0xffffffff, x1, 1);  // partner's src1
+
+    if (threadIdx.x % 2 == 0) {
+      dst1 = y0;  // even threadIdx.x: dst1 receives the partner's src0; dst0 is left untouched
+    } 
+    else {
+      dst0 = y1;  // odd threadIdx.x: dst0 receives the partner's src1; dst1 is left untouched
+    }
+#else 
+    CUTE_INVALID_CONTROL_PATH("Trying to use __shfl_xor_sync without CUTE_ARCH_WARP_SHUFFLE_ENABLED.");
+#endif
+  }
+};
+
+// Shuffle data between thread pair (0, 4), (1, 5), etc. Each thread writes only one of its two destinations.
+struct SM50_Shuffle_U32_2x2Trans_XOR4
+{
+  using SRegisters = uint32_t[2];
+  using DRegisters = uint32_t[2];
+
+  CUTE_HOST_DEVICE static void
+  copy(uint32_t const& src0, uint32_t const& src1, uint32_t& dst0, uint32_t& dst1)
+  {
+#if defined(CUTE_ARCH_WARP_SHUFFLE_ENABLED)
+    uint32_t x0 = threadIdx.x & 4  ? src0 : src1;  // threads with bit 2 of threadIdx.x set send src0, the others send src1
+    uint32_t y0 = __shfl_xor_sync(0xffffffff, x0, 4);
+
+    // Replace destination register with shuffle result.
+    if (threadIdx.x & 0x4) {
+      dst0 = y0;
+    } 
+    else {
+      dst1 = y0;
+    }
+#else 
+    CUTE_INVALID_CONTROL_PATH("Trying to use __shfl_xor_sync without CUTE_ARCH_WARP_SHUFFLE_ENABLED.");
+#endif
+  }
+};
+
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/arch/copy_sm75.hpp b/lightllm-kernel/cutlass/include/cute/arch/copy_sm75.hpp
new file mode 100755
index 000000000..3d3d37acb
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/arch/copy_sm75.hpp
@@ -0,0 +1,236 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>
+
+#include <cute/arch/copy.hpp>
+
+// Config: derive CUTE_ARCH_LDSM_SM75_ACTIVATED from compiler support for ldmatrix and from the target architecture.
+#if defined(__clang__) && defined(__CUDA__)
+  // ldmatrix PTX instructions added in Clang 14: https://reviews.llvm.org/D107046
+  // ... but will not work until Clang 15:
+  //   * https://reviews.llvm.org/D121666
+  //   * https://reviews.llvm.org/D126846
+  #define CUTE_ARCH_CLANG_SUPPORTS_LDSM_SM75 (__clang_major__ >= 15)
+#endif
+
+#if defined(__NVCC__) || defined(__CUDACC_RTC__)
+  // ldmatrix PTX instruction added in CUDA 10.2+
+  #define CUTE_ARCH_NVCC_SUPPORTS_LDSM_SM75 ((__CUDACC_VER_MAJOR__  == 10 && __CUDACC_VER_MINOR__ >= 2) || __CUDACC_VER_MAJOR__ >= 11)
+#endif
+
+#if ! defined(CUTE_ARCH_LDSM_SM75_SUPPORTED)
+  #define CUTE_ARCH_LDSM_SM75_SUPPORTED (CUTE_ARCH_NVCC_SUPPORTS_LDSM_SM75 || CUTE_ARCH_CLANG_SUPPORTS_LDSM_SM75)
+#endif  // SUPPORTED may be pre-defined by the build to override the compiler detection above
+
+#if ! defined(CUTE_ARCH_LDSM_SM75_ENABLED)
+  #define CUTE_ARCH_LDSM_SM75_ENABLED (CUTE_ARCH_LDSM_SM75_SUPPORTED)
+#endif  // ENABLED defaults to SUPPORTED unless it was pre-defined
+
+#if (CUTE_ARCH_LDSM_SM75_ENABLED) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 750
+  #define CUTE_ARCH_LDSM_SM75_ACTIVATED 1
+#endif  // ACTIVATED additionally requires __CUDA_ARCH__ >= 750; the wrappers below test this macro
+
+namespace cute
+{
+
+struct SM75_U32x1_LDSM_N  // Wrapper around the x1, non-.trans form of the ldmatrix PTX instruction.
+{
+  using SRegisters = uint128_t[1];  // one 128-bit source operand
+  using DRegisters = uint32_t[1];   // one 32-bit destination register
+  // copy(): issues ldmatrix for the address of smem_src (".shared" state space) and writes the result to dst.
+  CUTE_HOST_DEVICE static void
+  copy(uint128_t const& smem_src,
+       uint32_t& dst)
+  {
+#if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED)
+    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_src);  // 32-bit value used as the "r" address operand
+    asm volatile ("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];\n"
+        : "=r"(dst)
+        :  "r"(smem_int_ptr));
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use ldmatrix without CUTE_ARCH_LDSM_SM75_ACTIVATED.");
+#endif
+  }
+};
+
+struct SM75_U32x2_LDSM_N  // Wrapper around the x2, non-.trans form of the ldmatrix PTX instruction.
+{
+  using SRegisters = uint128_t[1];  // one 128-bit source operand
+  using DRegisters = uint32_t[2];   // two 32-bit destination registers
+  // copy(): issues ldmatrix for the address of smem_src (".shared" state space) and writes dst0 and dst1.
+  CUTE_HOST_DEVICE static void
+  copy(uint128_t const& smem_src,
+       uint32_t& dst0, uint32_t& dst1)
+  {
+#if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED)
+    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_src);  // 32-bit value used as the "r" address operand
+    asm volatile ("ldmatrix.sync.aligned.x2.m8n8.shared.b16 {%0, %1}, [%2];\n"
+        : "=r"(dst0), "=r"(dst1)
+        :  "r"(smem_int_ptr));
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use ldmatrix without CUTE_ARCH_LDSM_SM75_ACTIVATED.");
+#endif
+  }
+};
+
+struct SM75_U32x4_LDSM_N  // Wrapper around the x4, non-.trans form of the ldmatrix PTX instruction.
+{
+  using SRegisters = uint128_t[1];  // one 128-bit source operand
+  using DRegisters = uint32_t[4];   // four 32-bit destination registers
+  // copy(): issues ldmatrix for the address of smem_src (".shared" state space) and writes dst0..dst3.
+  CUTE_HOST_DEVICE static void
+  copy(uint128_t const& smem_src,
+       uint32_t& dst0, uint32_t& dst1, uint32_t& dst2, uint32_t& dst3)
+  {
+#if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED)
+    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_src);  // 32-bit value used as the "r" address operand
+    asm volatile ("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];\n"
+        : "=r"(dst0), "=r"(dst1), "=r"(dst2), "=r"(dst3)
+        :  "r"(smem_int_ptr));
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use ldmatrix without CUTE_ARCH_LDSM_SM75_ACTIVATED.");
+#endif
+  }
+};
+
+struct SM75_U16x2_LDSM_T  // Wrapper around the x1 ".trans" form of the ldmatrix PTX instruction.
+{
+  using SRegisters = uint128_t[1];  // one 128-bit source operand
+  using DRegisters = uint32_t[1];   // one 32-bit destination register (presumably two 16-bit values, per the U16x2 name)
+  // copy(): issues ldmatrix.trans for the address of smem_src (".shared" state space) and writes the result to dst.
+  CUTE_HOST_DEVICE static void
+  copy(uint128_t const& smem_src,
+       uint32_t& dst)
+  {
+#if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED)
+    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_src);  // 32-bit value used as the "r" address operand
+    asm volatile ("ldmatrix.sync.aligned.x1.trans.m8n8.shared.b16 {%0}, [%1];\n"
+        : "=r"(dst)
+        :  "r"(smem_int_ptr));
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use ldmatrix without CUTE_ARCH_LDSM_SM75_ACTIVATED.");
+#endif
+  }
+};
+
+struct SM75_U16x4_LDSM_T  // Wrapper around the x2 ".trans" form of the ldmatrix PTX instruction.
+{
+  using SRegisters = uint128_t[1];  // one 128-bit source operand
+  using DRegisters = uint32_t[2];   // two 32-bit destination registers (presumably four 16-bit values, per the U16x4 name)
+  // copy(): issues ldmatrix.trans for the address of smem_src (".shared" state space) and writes dst0 and dst1.
+  CUTE_HOST_DEVICE static void
+  copy(uint128_t const& smem_src,
+       uint32_t& dst0, uint32_t& dst1)
+  {
+#if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED)
+    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_src);  // 32-bit value used as the "r" address operand
+    asm volatile ("ldmatrix.sync.aligned.x2.trans.m8n8.shared.b16 {%0, %1}, [%2];\n"
+        : "=r"(dst0), "=r"(dst1)
+        :  "r"(smem_int_ptr));
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use ldmatrix without CUTE_ARCH_LDSM_SM75_ACTIVATED.");
+#endif
+  }
+};
+
+struct SM75_U16x8_LDSM_T  // Wrapper around the x4 ".trans" form of the ldmatrix PTX instruction.
+{
+  using SRegisters = uint128_t[1];  // one 128-bit source operand
+  using DRegisters = uint32_t[4];   // four 32-bit destination registers (presumably eight 16-bit values, per the U16x8 name)
+  // copy(): issues ldmatrix.trans for the address of smem_src (".shared" state space) and writes dst0..dst3.
+  CUTE_HOST_DEVICE static void
+  copy(uint128_t const& smem_src,
+       uint32_t& dst0, uint32_t& dst1, uint32_t& dst2, uint32_t& dst3)
+  {
+#if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED)
+    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_src);  // 32-bit value used as the "r" address operand
+    asm volatile ("ldmatrix.sync.aligned.x4.trans.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];\n"
+        : "=r"(dst0), "=r"(dst1), "=r"(dst2), "=r"(dst3)
+        :  "r"(smem_int_ptr));
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use ldmatrix without CUTE_ARCH_LDSM_SM75_ACTIVATED.");
+#endif
+  }
+};
+
+//
+// Legacy LDSM interfaces that aren't very useful
+//
+
+template <class T>
+CUTE_HOST_DEVICE
+void
+copy_ldsm(uint128_t const* const smem_ptr,  // source; only smem_ptr[0] is forwarded to the wrapper
+          T* rmem_ptr)                      // destination, reinterpreted as sizeof(T)/4 uint32_t values
+{
+  uint32_t* reg_ptr = reinterpret_cast<uint32_t*>(rmem_ptr);
+  // Legacy helper: selects the non-transposed ldmatrix wrapper from sizeof(T) (4 -> x1, 8 -> x2, 16 -> x4).
+  // The selection is a real compile-time branch, so only the matching wrapper call is instantiated.
+  if constexpr (sizeof(T) == 4) {
+    SM75_U32x1_LDSM_N::copy(smem_ptr[0], reg_ptr[0]);
+  }
+  else if constexpr (sizeof(T) == 8) {
+    SM75_U32x2_LDSM_N::copy(smem_ptr[0], reg_ptr[0], reg_ptr[1]);
+  }
+  else if constexpr (sizeof(T) == 16) {
+    SM75_U32x4_LDSM_N::copy(smem_ptr[0], reg_ptr[0], reg_ptr[1], reg_ptr[2], reg_ptr[3]);
+  }
+  else {
+    static_assert(sizeof(T) == 4 || sizeof(T) == 8 || sizeof(T) == 16, "sizeof(T) is not supported");
+  }
+}
+
+template <class T>
+CUTE_HOST_DEVICE
+void
+copy_ldsm_trans(uint128_t const* const smem_ptr,  // source; only smem_ptr[0] is forwarded to the wrapper
+                T* rmem_ptr)                      // destination, reinterpreted as sizeof(T)/4 uint32_t values
+{
+  uint32_t* reg_ptr = reinterpret_cast<uint32_t*>(rmem_ptr);
+  // Legacy helper: selects the ".trans" ldmatrix wrapper from sizeof(T) (4 -> x1, 8 -> x2, 16 -> x4).
+  // The selection is a real compile-time branch, so only the matching wrapper call is instantiated.
+  if constexpr (sizeof(T) == 4) {
+    SM75_U16x2_LDSM_T::copy(smem_ptr[0], reg_ptr[0]);
+  }
+  else if constexpr (sizeof(T) == 8) {
+    SM75_U16x4_LDSM_T::copy(smem_ptr[0], reg_ptr[0], reg_ptr[1]);
+  }
+  else if constexpr (sizeof(T) == 16) {
+    SM75_U16x8_LDSM_T::copy(smem_ptr[0], reg_ptr[0], reg_ptr[1], reg_ptr[2], reg_ptr[3]);
+  }
+  else {
+    static_assert(sizeof(T) == 4 || sizeof(T) == 8 || sizeof(T) == 16, "sizeof(T) is not supported");
+  }
+}
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/arch/copy_sm80.hpp b/lightllm-kernel/cutlass/include/cute/arch/copy_sm80.hpp
new file mode 100755
index 000000000..e04181bfe
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/arch/copy_sm80.hpp
@@ -0,0 +1,198 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>
+
+#include <cute/arch/copy.hpp>
+
+// Config: CUTE_ARCH_CP_ASYNC_SM80_ENABLED is defined only when __CUDA_ARCH__ is defined and >= 800.
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
+#  define CUTE_ARCH_CP_ASYNC_SM80_ENABLED
+#endif
+
+namespace cute
+{
+
+/// Copy via cp.async.ca (caching at all levels): copies sizeof(TS) bytes (4, 8 or 16) from gmem_src to smem_dst.
+template <class TS, class TD = TS>
+struct SM80_CP_ASYNC_CACHEALWAYS
+{
+  using SRegisters = TS[1];
+  using DRegisters = TD[1];
+  // Source and destination element types must have the same size; only 4-, 8- and 16-byte elements are accepted.
+  static_assert(sizeof(TS) == sizeof(TD), "cp.async requires sizeof(src_value_type) == sizeof(dst_value_type)");
+  static_assert(sizeof(TS) == 4 || sizeof(TS) == 8 || sizeof(TS) == 16, "cp.async sizeof(TS) is not supported");
+
+  CUTE_HOST_DEVICE static void
+  copy(TS const& gmem_src,
+       TD      & smem_dst)
+  {
+#if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED)
+    TS const* gmem_ptr    = &gmem_src;
+    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst);
+    asm volatile("cp.async.ca.shared.global.L2::128B [%0], [%1], %2;\n"
+        :: "r"(smem_int_ptr),   // %0: destination address (32-bit)
+           "l"(gmem_ptr),       // %1: source pointer (64-bit)
+           "n"(sizeof(TS)));    // %2: copy size as an immediate
+#else
+    CUTE_INVALID_CONTROL_PATH("Support for cp.async instructions has not been enabled");
+#endif
+  }
+};
+
+/// Copy via cp.async.cg (caching at global level): copies one 16-byte element from gmem_src to smem_dst.
+template <class TS, class TD = TS>
+struct SM80_CP_ASYNC_CACHEGLOBAL
+{
+  using SRegisters = TS[1];
+  using DRegisters = TD[1];
+  // Source and destination element types must have the same size; this variant accepts 16-byte elements only.
+  static_assert(sizeof(TS) == sizeof(TD), "cp.async requires sizeof(src_value_type) == sizeof(dst_value_type)");
+  static_assert(sizeof(TS) == 16, "cp.async sizeof(TS) is not supported");
+
+  CUTE_HOST_DEVICE static void
+  copy(TS const& gmem_src,
+       TD      & smem_dst)
+  {
+#if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED)
+    TS const* gmem_ptr    = &gmem_src;
+    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst);
+    asm volatile("cp.async.cg.shared.global.L2::128B [%0], [%1], %2;\n"
+        :: "r"(smem_int_ptr),   // %0: destination address (32-bit)
+           "l"(gmem_ptr),       // %1: source pointer (64-bit)
+           "n"(sizeof(TS)));    // %2: copy size as an immediate
+#else
+    CUTE_INVALID_CONTROL_PATH("Support for cp.async instructions has not been enabled");
+#endif
+  }
+};
+
+/// Predicated copy via cp.async.ca (caching at all levels): passes a source size of sizeof(TS) when pred is true, 0 otherwise.
+template <class TS, class TD = TS>
+struct SM80_CP_ASYNC_CACHEALWAYS_ZFILL
+{
+  using SRegisters = TS[1];
+  using DRegisters = TD[1];
+  // Source and destination element types must have the same size; only 4-, 8- and 16-byte elements are accepted.
+  static_assert(sizeof(TS) == sizeof(TD), "cp.async requires sizeof(src_value_type) == sizeof(dst_value_type)");
+  static_assert(sizeof(TS) == 4 || sizeof(TS) == 8 || sizeof(TS) == 16, "cp.async sizeof(TS) is not supported");
+
+  CUTE_HOST_DEVICE static void
+  copy(TS const& gmem_src,
+       TD      & smem_dst,
+       bool      pred)
+  {
+#if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED)
+    TS const* gmem_ptr    = &gmem_src;
+    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst);
+    int src_size = pred ? sizeof(TS) : 0;  // NOTE(review): presumably the destination is zero-filled beyond src_size (hence ZFILL) - confirm in the PTX ISA
+    asm volatile("cp.async.ca.shared.global.L2::128B [%0], [%1], %2, %3;\n"
+        :: "r"(smem_int_ptr),   // %0: destination address (32-bit)
+           "l"(gmem_ptr),       // %1: source pointer (64-bit)
+           "n"(sizeof(TS)),     // %2: copy size as an immediate
+           "r"(src_size));      // %3: run-time source size (sizeof(TS) or 0)
+#else
+    CUTE_INVALID_CONTROL_PATH("Support for cp.async instructions has not been enabled");
+#endif
+  }
+};
+
+/// Predicated copy via cp.async.cg (caching at global level): passes a source size of sizeof(TS) when pred is true, 0 otherwise.
+template <class TS, class TD = TS>
+struct SM80_CP_ASYNC_CACHEGLOBAL_ZFILL
+{
+  using SRegisters = TS[1];
+  using DRegisters = TD[1];
+  // Source and destination element types must have the same size; this variant accepts 16-byte elements only.
+  static_assert(sizeof(TS) == sizeof(TD), "cp.async requires sizeof(src_value_type) == sizeof(dst_value_type)");
+  static_assert(sizeof(TS) == 16, "cp.async sizeof(TS) is not supported");
+
+  CUTE_HOST_DEVICE static void
+  copy(TS const& gmem_src,
+       TD      & smem_dst,
+       bool      pred)
+  {
+#if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED)
+    TS const* gmem_ptr    = &gmem_src;
+    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst);
+    int src_size = pred ? sizeof(TS) : 0;  // NOTE(review): presumably the destination is zero-filled beyond src_size (hence ZFILL) - confirm in the PTX ISA
+    asm volatile("cp.async.cg.shared.global.L2::128B [%0], [%1], %2, %3;\n"
+        :: "r"(smem_int_ptr),   // %0: destination address (32-bit)
+           "l"(gmem_ptr),       // %1: source pointer (64-bit)
+           "n"(sizeof(TS)),     // %2: copy size as an immediate
+           "r"(src_size));      // %3: run-time source size (sizeof(TS) or 0)
+#else
+    CUTE_INVALID_CONTROL_PATH("Support for cp.async instructions has not been enabled");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Establishes an ordering w.r.t. previously issued cp.async instructions by issuing cp.async.commit_group. Does not block.
+CUTE_HOST_DEVICE
+void
+cp_async_fence()
+{
+#if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED)
+  asm volatile("cp.async.commit_group;\n" ::);
+#endif  // no #else branch: the function body is empty when cp.async is not enabled
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Waits on earlier cp.async groups: a non-zero N issues cp.async.wait_group N, N == 0 issues cp.async.wait_all.
+template <int N>
+CUTE_HOST_DEVICE
+void
+cp_async_wait()
+{
+#if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED)
+  if constexpr (N != 0) {
+    asm volatile("cp.async.wait_group %0;\n" :: "n"(N));
+  } else {
+    asm volatile("cp.async.wait_all;\n" ::);
+  }
+#endif
+}
+
+template <int N>
+CUTE_HOST_DEVICE
+void
+cp_async_wait(Int<N>)  // tag overload: N is deduced from the Int<N> argument, whose value is otherwise unused
+{
+  cp_async_wait<N>();
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/arch/copy_sm90.hpp b/lightllm-kernel/cutlass/include/cute/arch/copy_sm90.hpp
new file mode 100755
index 000000000..bcb3b7d19
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/arch/copy_sm90.hpp
@@ -0,0 +1,219 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>      // CUTE_HOST_DEVICE
+#include <cute/arch/config.hpp> // CUTE_ARCH_TMA_SMxx_ENABLED
+#include <cute/arch/copy.hpp>
+
+namespace cute
+{
+
+struct SM90_U32x1_STSM_N  // Wrapper around the x1, non-.trans form of the stmatrix PTX instruction.
+{
+  using SRegisters = uint32_t[1];   // one 32-bit source register
+  using DRegisters = uint128_t[1];  // one 128-bit destination operand
+  // copy(): issues stmatrix with src as the value operand and the address of smem_dst (".shared" state space) as the target.
+  CUTE_HOST_DEVICE static void
+  copy(uint32_t const& src,
+       uint128_t     & smem_dst)
+  {
+#if defined(CUTE_ARCH_STSM_SM90_ENABLED)
+    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst);  // 32-bit value used as the "r" address operand
+    asm volatile ("stmatrix.sync.aligned.x1.m8n8.shared.b16 [%0], {%1};\n"
+        :: "r"(smem_int_ptr),
+           "r"(src));
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use stmatrix without CUTE_ARCH_STSM_SM90_ENABLED.");
+#endif
+  }
+};
+
+struct SM90_U32x2_STSM_N  // Wrapper around the x2, non-.trans form of the stmatrix PTX instruction.
+{
+  using SRegisters = uint32_t[2];   // two 32-bit source registers
+  using DRegisters = uint128_t[1];  // one 128-bit destination operand
+  // copy(): issues stmatrix with src0/src1 as value operands and the address of smem_dst (".shared" state space) as the target.
+  CUTE_HOST_DEVICE static void
+  copy(uint32_t const& src0, uint32_t const& src1,
+       uint128_t& smem_dst)
+  {
+#if defined(CUTE_ARCH_STSM_SM90_ENABLED)
+    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst);  // 32-bit value used as the "r" address operand
+    asm volatile ("stmatrix.sync.aligned.x2.m8n8.shared.b16 [%0], {%1, %2};\n"
+        :: "r"(smem_int_ptr),
+           "r"(src0), "r"(src1));
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use stmatrix without CUTE_ARCH_STSM_SM90_ENABLED.");
+#endif
+  }
+};
+
+struct SM90_U32x4_STSM_N  // Wrapper around the x4, non-.trans form of the stmatrix PTX instruction.
+{
+  using SRegisters = uint32_t[4];   // four 32-bit source registers
+  using DRegisters = uint128_t[1];  // one 128-bit destination operand
+  // copy(): issues stmatrix with src0..src3 as value operands and the address of smem_dst (".shared" state space) as the target.
+  CUTE_HOST_DEVICE static void
+  copy(uint32_t const& src0, uint32_t const& src1, uint32_t const& src2, uint32_t const& src3,
+       uint128_t& smem_dst)
+  {
+#if defined(CUTE_ARCH_STSM_SM90_ENABLED)
+    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst);  // 32-bit value used as the "r" address operand
+    asm volatile ("stmatrix.sync.aligned.x4.m8n8.shared.b16 [%0], {%1, %2, %3, %4};\n"
+        :: "r"(smem_int_ptr),
+          "r"(src0), "r"(src1), "r"(src2), "r"(src3));
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use stmatrix without CUTE_ARCH_STSM_SM90_ENABLED.");
+#endif
+  }
+};
+
+struct SM90_U16x2_STSM_T  // Wrapper around the x1 ".trans" form of the stmatrix PTX instruction.
+{
+  using SRegisters = uint32_t[1];   // one 32-bit source register (presumably two 16-bit values, per the U16x2 name)
+  using DRegisters = uint128_t[1];  // one 128-bit destination operand
+  // copy(): issues stmatrix.trans with src as the value operand and the address of smem_dst (".shared" state space) as the target.
+  CUTE_HOST_DEVICE static void
+  copy(uint32_t const& src,
+       uint128_t& smem_dst)
+  {
+#if defined(CUTE_ARCH_STSM_SM90_ENABLED)
+    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst);  // 32-bit value used as the "r" address operand
+    asm volatile ("stmatrix.sync.aligned.x1.trans.m8n8.shared.b16 [%0], {%1};\n"
+        :: "r"(smem_int_ptr),
+           "r"(src));
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use stmatrix without CUTE_ARCH_STSM_SM90_ENABLED.");
+#endif
+  }
+};
+
+struct SM90_U16x4_STSM_T  // Wrapper around the x2 ".trans" form of the stmatrix PTX instruction.
+{
+  using SRegisters = uint32_t[2];   // two 32-bit source registers (presumably four 16-bit values, per the U16x4 name)
+  using DRegisters = uint128_t[1];  // one 128-bit destination operand
+  // copy(): issues stmatrix.trans with src0/src1 as value operands and the address of smem_dst (".shared" state space) as the target.
+  CUTE_HOST_DEVICE static void
+  copy(uint32_t const& src0, uint32_t const& src1,
+       uint128_t& smem_dst)
+  {
+#if defined(CUTE_ARCH_STSM_SM90_ENABLED)
+    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst);  // 32-bit value used as the "r" address operand
+    asm volatile ("stmatrix.sync.aligned.x2.trans.m8n8.shared.b16 [%0], {%1, %2};\n"
+        :: "r"(smem_int_ptr),
+           "r"(src0), "r"(src1));
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use stmatrix without CUTE_ARCH_STSM_SM90_ENABLED.");
+#endif
+  }
+};
+
+struct SM90_U16x8_STSM_T  // Wrapper around the x4 ".trans" form of the stmatrix PTX instruction.
+{
+  using SRegisters = uint32_t[4];   // four 32-bit source registers (presumably eight 16-bit values, per the U16x8 name)
+  using DRegisters = uint128_t[1];  // one 128-bit destination operand
+  // copy(): issues stmatrix.trans with src0..src3 as value operands and the address of smem_dst (".shared" state space) as the target.
+  CUTE_HOST_DEVICE static void
+  copy(uint32_t const& src0, uint32_t const& src1, uint32_t const& src2, uint32_t const& src3,
+       uint128_t& smem_dst)
+  {
+#if defined(CUTE_ARCH_STSM_SM90_ENABLED)
+    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst);  // 32-bit value used as the "r" address operand
+    asm volatile ("stmatrix.sync.aligned.x4.trans.m8n8.shared.b16 [%0], {%1, %2, %3, %4};\n"
+        :: "r"(smem_int_ptr),
+          "r"(src0), "r"(src1), "r"(src2), "r"(src3));
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use stmatrix without CUTE_ARCH_STSM_SM90_ENABLED.");
+#endif
+  }
+};
+
+//
+// Legacy STSM interfaces that aren't very useful
+//
+
+template <class T>
+CUTE_HOST_DEVICE
+void
+copy_stsm(T const* const rmem_ptr,    // source, reinterpreted as sizeof(T)/4 uint32_t values
+          uint128_t* const smem_ptr)  // destination; only smem_ptr[0] is forwarded to the wrapper
+{
+  uint32_t const* reg_ptr = reinterpret_cast<uint32_t const*>(rmem_ptr);
+  // Legacy helper: selects the non-transposed stmatrix wrapper from sizeof(T) (4 -> x1, 8 -> x2, 16 -> x4).
+  // The selection is a real compile-time branch, so only the matching wrapper call is instantiated.
+  if constexpr (sizeof(T) == 4) {
+    SM90_U32x1_STSM_N::copy(reg_ptr[0], smem_ptr[0]);
+  }
+  else if constexpr (sizeof(T) == 8) {
+    SM90_U32x2_STSM_N::copy(reg_ptr[0], reg_ptr[1], smem_ptr[0]);
+  }
+  else if constexpr (sizeof(T) == 16) {
+    SM90_U32x4_STSM_N::copy(reg_ptr[0], reg_ptr[1], reg_ptr[2], reg_ptr[3], smem_ptr[0]);
+  }
+  else {
+    static_assert(sizeof(T) == 4 || sizeof(T) == 8 || sizeof(T) == 16, "sizeof(T) is not supported");
+  }
+}
+
+template <class T>
+CUTE_HOST_DEVICE
+void
+copy_stsm_trans(T const* const rmem_ptr,    // source, reinterpreted as sizeof(T)/4 uint32_t values
+                uint128_t* const smem_ptr)  // destination; only smem_ptr[0] is forwarded to the wrapper
+{
+  uint32_t const* reg_ptr = reinterpret_cast<uint32_t const*>(rmem_ptr);
+  // Legacy helper: selects the ".trans" stmatrix wrapper from sizeof(T) (4 -> x1, 8 -> x2, 16 -> x4).
+  // The selection is a real compile-time branch, so only the matching wrapper call is instantiated.
+  if constexpr (sizeof(T) == 4) {
+    SM90_U16x2_STSM_T::copy(reg_ptr[0], smem_ptr[0]);
+  }
+  else if constexpr (sizeof(T) == 8) {
+    SM90_U16x4_STSM_T::copy(reg_ptr[0], reg_ptr[1], smem_ptr[0]);
+  }
+  else if constexpr (sizeof(T) == 16) {
+    SM90_U16x8_STSM_T::copy(reg_ptr[0], reg_ptr[1], reg_ptr[2], reg_ptr[3], smem_ptr[0]);
+  }
+  else {
+    static_assert(sizeof(T) == 4 || sizeof(T) == 8 || sizeof(T) == 16, "sizeof(T) is not supported");
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // end namespace cute
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <cute/arch/copy_sm90_desc.hpp>
+#include <cute/arch/copy_sm90_tma.hpp>
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cute/arch/copy_sm90_desc.hpp b/lightllm-kernel/cutlass/include/cute/arch/copy_sm90_desc.hpp
new file mode 100755
index 000000000..cc0bf4a39
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/arch/copy_sm90_desc.hpp
@@ -0,0 +1,440 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/numeric_types.h"
+
+#if !defined(__CUDACC_RTC__)
+#include <cuda.h>
+#include <cinttypes>
+#endif
+
+#include <cute/config.hpp>
+
+#include <cute/arch/util.hpp>   // cute::cast_smem_ptr_to_uint
+#include <cute/arch/config.hpp> // CUTE_ARCH_TMA_SMxx_ENABLED
+#include <cute/arch/copy.hpp>
+#include <cute/arch/copy_sm90.hpp>
+
+#include <cute/container/alignment.hpp>
+#include <cute/container/bit_field.hpp>
+#include <cute/container/array.hpp>
+#include <cute/numeric/numeric_types.hpp>
+
+namespace cute
+{
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////
+/// Barriers are 64 bits of user-managed information used in broadly two types of synchronization patterns:
+/// 1) arrive/wait on threads (usage: cp.async and warp-specialized kernels)
+/// 2) transaction-based (usage: TMA transaction where a CTA issues one transaction)
+//////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Initialize the barrier in shared memory with mbarrier.init; the body is empty unless CUTE_ARCH_TMA_SM90_ENABLED is defined.
+CUTE_HOST_DEVICE
+void
+initialize_barrier(uint64_t& smem_barrier,                 // 64-bit user-managed barrier in smem
+                   int thread_count = 1)                   // Thread count expected to arrive/wait on this barrier
+{
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+  uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_barrier);
+  asm volatile ("mbarrier.init.shared::cta.b64 [%0], %1;\n"
+    :: "r"(smem_int_ptr),
+       "r"(thread_count));
+#endif
+}
+
+// Set the number of bytes transferred per transaction and perform an arrive operation as well (mbarrier.arrive.expect_tx)
+CUTE_HOST_DEVICE
+void
+set_barrier_transaction_bytes(uint64_t& smem_barrier,      // 64-bit user-managed barrier in smem
+                              uint32_t bytes)              // Number of bytes transferred per TMA transaction
+{
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+  uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_barrier);
+  asm volatile ("mbarrier.arrive.expect_tx.shared::cta.b64 _, [%0], %1;\n"
+    :: "r"(smem_int_ptr),
+       "r"(bytes));
+#endif
+}
+
+// Barrier wait: loops on mbarrier.try_wait.parity until it reports success for the given phase bit
+CUTE_HOST_DEVICE
+void
+wait_barrier(uint64_t& smem_barrier,                       // 64-bit user-managed barrier in smem
+             int phase_bit)                                // Current phase bit the barrier waiting to flip
+{
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+  uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_barrier);
+  asm volatile(
+    "{\n"
+    ".reg .pred                P1;\n"                                    // predicate that receives the try_wait result
+    "LAB_WAIT:\n"
+    "mbarrier.try_wait.parity.shared::cta.b64 P1, [%0], %1;\n"
+    "@P1                       bra DONE;\n"                              // leave the loop once P1 is set
+    "bra                   LAB_WAIT;\n"                                  // otherwise retry
+    "DONE:\n"
+    "}\n"
+    :: "r"(smem_int_ptr),
+       "r"(phase_bit));
+
+#endif
+}
+
+// Barrier arrive: issues mbarrier.arrive on the barrier; the returned state is written to a scratch register and not used
+CUTE_HOST_DEVICE
+void
+arrive_barrier(uint64_t& smem_barrier)                      // 64-bit user-managed barrier in smem
+{
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+  uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_barrier);
+  asm volatile(
+    "{\n"
+    ".reg .b64 state; \n"
+    "mbarrier.arrive.shared::cta.b64   state, [%0];\n"
+    "}\n"
+    :: "r"(smem_int_ptr));
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// TMA Descriptor and utilities
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace TMA {
+
+enum class SmemSwizzleBits : uint8_t {  // swizzle selector; translated to CUtensorMapSwizzle by to_CUtensorMapSwizzle
+  DISABLE = 0,  // -> CU_TENSOR_MAP_SWIZZLE_NONE
+  B32 = 1,      // -> CU_TENSOR_MAP_SWIZZLE_32B
+  B64 = 2,      // -> CU_TENSOR_MAP_SWIZZLE_64B
+  B128 = 3,     // -> CU_TENSOR_MAP_SWIZZLE_128B
+};
+
+enum class SmemSwizzleBase : uint8_t {  // swizzle base; to_CUtensorMapSwizzle asserts this is SWIZZLE_BASE_16B
+  SWIZZLE_BASE_16B         = 0,         // the only enumerator
+};
+
+enum class OOBFill : uint8_t {  // out-of-bounds fill selector; translated by to_CUtensorMapFloatOOBfill
+  ZERO = 0,      // -> CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE
+  CONSTANT = 1,  // -> CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA
+};
+
+CUTE_HOST_DEVICE char const* to_string(OOBFill const& t) {
+  // Returns the enumerator's name, or nullptr for a value outside the enum.
+  if (t == OOBFill::ZERO)     { return "ZERO"; }
+  if (t == OOBFill::CONSTANT) { return "CONSTANT"; }
+
+  return nullptr;
+}
+
+enum class L2Promotion : uint8_t {  // L2 promotion selector; translated by to_CUtensorMapL2promotion
+  DISABLE = 0,  // -> CU_TENSOR_MAP_L2_PROMOTION_NONE
+  B64 = 1,      // -> CU_TENSOR_MAP_L2_PROMOTION_L2_64B
+  B128 = 2,     // -> CU_TENSOR_MAP_L2_PROMOTION_L2_128B
+  B256 = 3,     // -> CU_TENSOR_MAP_L2_PROMOTION_L2_256B
+};
+
+CUTE_HOST_DEVICE char const* to_string(L2Promotion const& t) {
+  // Returns the enumerator's name, or nullptr for a value outside the enum.
+  if (t == L2Promotion::DISABLE) { return "DISABLE"; }
+  if (t == L2Promotion::B64)     { return "B64"; }
+  if (t == L2Promotion::B128)    { return "B128"; }
+  if (t == L2Promotion::B256)    { return "B256"; }
+
+  return nullptr;
+}
+
+// Auxiliary descriptor parameters that are independent of the problem size
+struct DescriptorAuxParams {
+  OOBFill     oobfill_     = OOBFill::ZERO;         // out-of-bounds fill selector; defaults to ZERO
+  L2Promotion l2promo_     = L2Promotion::DISABLE;  // L2 promotion selector; defaults to DISABLE
+};
+
+enum class CacheHintSm90 : uint64_t {  // 64-bit cache-hint encodings; NOTE(review): bit layout is not described here - confirm against the consumers
+  EVICT_NORMAL = 0x1000000000000000,
+  EVICT_FIRST = 0x12F0000000000000,
+  EVICT_LAST = 0x14F0000000000000,
+};
+
+#if (__CUDACC_VER_MAJOR__ >= 12)
+
+#if !defined(__CUDACC_RTC__)
+/// @return The TMA descriptor datatype enum corresponding to T; any T not listed below fails to compile.
+template <class T>
+inline CUtensorMapDataType
+to_CUtensorMapDataType() {
+  if constexpr (is_same_v<T,       int8_t>) { return CU_TENSOR_MAP_DATA_TYPE_UINT8;    } else  // signed 8-bit is described as UINT8
+  if constexpr (is_same_v<T,      uint8_t>) { return CU_TENSOR_MAP_DATA_TYPE_UINT8;    } else
+  if constexpr (is_same_v<T, float_e4m3_t>) { return CU_TENSOR_MAP_DATA_TYPE_UINT8;    } else  // float_e4m3_t is also described as UINT8
+  if constexpr (is_same_v<T, float_e5m2_t>) { return CU_TENSOR_MAP_DATA_TYPE_UINT8;    } else  // float_e5m2_t is also described as UINT8
+  if constexpr (is_same_v<T,     uint16_t>) { return CU_TENSOR_MAP_DATA_TYPE_UINT16;   } else
+  if constexpr (is_same_v<T,     uint32_t>) { return CU_TENSOR_MAP_DATA_TYPE_UINT32;   } else
+  if constexpr (is_same_v<T,     uint64_t>) { return CU_TENSOR_MAP_DATA_TYPE_UINT64;   } else
+  if constexpr (is_same_v<T,      int32_t>) { return CU_TENSOR_MAP_DATA_TYPE_INT32;    } else
+  if constexpr (is_same_v<T,      int64_t>) { return CU_TENSOR_MAP_DATA_TYPE_INT64;    } else
+  if constexpr (is_same_v<T,       half_t>) { return CU_TENSOR_MAP_DATA_TYPE_FLOAT16;  } else
+  if constexpr (is_same_v<T,        float>) { return CU_TENSOR_MAP_DATA_TYPE_FLOAT32;  } else
+  if constexpr (is_same_v<T,       double>) { return CU_TENSOR_MAP_DATA_TYPE_FLOAT64;  } else
+  if constexpr (is_same_v<T,   bfloat16_t>) { return CU_TENSOR_MAP_DATA_TYPE_BFLOAT16; } else
+  if constexpr (is_same_v<T,   tfloat32_t>) { return CU_TENSOR_MAP_DATA_TYPE_TFLOAT32; } else
+  { static_assert(sizeof(T) < 0, "Unknown TMA Format!"); }  // condition depends on T, so it only fires when this branch is instantiated
+}
+
+inline CUtensorMapSwizzle
+to_CUtensorMapSwizzle(SmemSwizzleBits const& t, SmemSwizzleBase const& b) {  // maps the swizzle enums to CUtensorMapSwizzle
+  switch (t) {
+    default: assert(false && "Unsupported pair of SmemSwizzleBits and SmemSwizzleBase!");  // no break: if the assert does not abort, control falls through to DISABLE
+    case SmemSwizzleBits::DISABLE: 
+      assert((b == SmemSwizzleBase::SWIZZLE_BASE_16B) && "Expected 16B swizzle base for 0B swizzle bits.");
+      return CU_TENSOR_MAP_SWIZZLE_NONE;
+    case SmemSwizzleBits::B32:
+      assert((b == SmemSwizzleBase::SWIZZLE_BASE_16B) && "Expected 16B swizzle base for 32B swizzle bits.");
+      return CU_TENSOR_MAP_SWIZZLE_32B;
+    case SmemSwizzleBits::B64:
+      assert((b == SmemSwizzleBase::SWIZZLE_BASE_16B) && "Expected 16B swizzle base for 64B swizzle bits.");
+      return CU_TENSOR_MAP_SWIZZLE_64B;
+    case SmemSwizzleBits::B128:
+      assert((b == SmemSwizzleBase::SWIZZLE_BASE_16B) && "Expected 16B swizzle base for 128B swizzle bits.");
+      return CU_TENSOR_MAP_SWIZZLE_128B;
+  }
+}
+
+inline CUtensorMapFloatOOBfill  // Maps an OOBFill value to the matching CU_TENSOR_MAP_FLOAT_OOB_FILL_* value.
+to_CUtensorMapFloatOOBfill(OOBFill const& t) {
+  switch(t) {
+    default:                assert(false && "Unknown OOBFill!");  // no return: falls through to ZERO if the assert is compiled out
+    case OOBFill::ZERO:     return CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE;
+    case OOBFill::CONSTANT: return CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA;
+  }
+}
+
+inline CUtensorMapL2promotion  // Maps an L2Promotion value to the matching CU_TENSOR_MAP_L2_PROMOTION_* value.
+to_CUtensorMapL2promotion(L2Promotion const& t) {
+  switch(t) {
+    default: assert(false && "Unknown L2Promotion!");  // no return: falls through to DISABLE if the assert is compiled out
+    case L2Promotion::DISABLE: return CU_TENSOR_MAP_L2_PROMOTION_NONE;
+    case L2Promotion::B64:     return CU_TENSOR_MAP_L2_PROMOTION_L2_64B;
+    case L2Promotion::B128:    return CU_TENSOR_MAP_L2_PROMOTION_L2_128B;
+    case L2Promotion::B256:    return CU_TENSOR_MAP_L2_PROMOTION_L2_256B;
+  }
+}
+
+#endif // !defined(__CUDACC_RTC__)
+
+#endif // (__CUDACC_VER_MAJOR__ >= 12)
+
+} // end namespace TMA
+
+#if (__CUDACC_VER_MAJOR__ >= 12) && !defined(__CUDACC_RTC__)  // CUDA 12+ and not compiling under __CUDACC_RTC__
+  using TmaDescriptor = CUtensorMap;        // both descriptor names alias CUtensorMap itself
+  using Im2ColTmaDescriptor = CUtensorMap;
+#else
+  using TmaDescriptor = struct alignas(64) { char bytes[128]; };        // stand-in: 128 bytes, 64-byte aligned
+  using Im2ColTmaDescriptor = struct alignas(64) { char bytes[128]; };  // a second, separately declared unnamed struct with the same layout
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////
+/// Initiates a TensorMap Prefetch
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Issues `prefetch.tensormap` on the descriptor at desc_ptr when CUTE_ARCH_TMA_SM90_ENABLED is defined.
+CUTE_HOST_DEVICE void
+prefetch_tma_descriptor(TmaDescriptor const* desc_ptr)
+{
+#if !defined(CUTE_ARCH_TMA_SM90_ENABLED)
+  CUTE_INVALID_CONTROL_PATH("Trying to use TMA Descriptor Prefetch without CUTE_ARCH_TMA_SM90_ENABLED.");
+#else
+  // The operand names no state space (neither const nor param), i.e. it is a generic address.
+  auto const desc_addr = reinterpret_cast<uint64_t>(desc_ptr);
+  asm volatile ("prefetch.tensormap [%0];"
+                :
+                : "l"(desc_addr)
+                : "memory");
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+/// Perform a TensorMap modification (by each field)
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Replace the tensor pointer (global_address field) of the descriptor at desc_ptr directly in GMEM.
+CUTE_HOST_DEVICE
+void
+tma_descriptor_replace_addr_in_global_mem(TmaDescriptor const* desc_ptr,
+                                          void const* const new_tensor_ptr)
+{
+#if defined(CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED)
+  uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);                // descriptor to patch (written through despite the const pointer type)
+  uint64_t const new_desc_addr = reinterpret_cast<uint64_t>(new_tensor_ptr);    // value for the global_address field
+  asm volatile (
+    "tensormap.replace.tile.global_address.global.b1024.b64 [%0], %1;"
+    :: "l"(gmem_int_desc), "l"(new_desc_addr) : "memory");  // "memory": the store must not be reordered with surrounding accesses
+#else
+  CUTE_INVALID_CONTROL_PATH("Using TMA Descriptor modification without CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED and CUDA 12.3");
+#endif
+}
+
+// Replace the tensor pointer (global_address field) of a tensormap copy that resides in shared memory (smem_desc).
+CUTE_HOST_DEVICE
+void
+tma_descriptor_replace_addr_in_shared_mem(TmaDescriptor& smem_desc,
+                                          void const* const new_tensor_ptr)
+{
+#if defined(CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED)
+  uint32_t smem_int_desc = cast_smem_ptr_to_uint(&smem_desc);                   // 32-bit "r" operand for the shared::cta form
+  uint64_t const new_desc_addr = reinterpret_cast<uint64_t>(new_tensor_ptr);    // value for the global_address field
+  asm volatile (
+    "tensormap.replace.tile.global_address.shared::cta.b1024.b64 [%0], %1;"
+    :: "r"(smem_int_desc), "l"(new_desc_addr) : "memory");  // "memory": the store to smem_desc must not be reordered with surrounding accesses
+#else
+  CUTE_INVALID_CONTROL_PATH("Using TMA Descriptor modification without CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED and CUDA 12.3");
+#endif
+}
+
+// Replace all five global_dim fields and global_stride fields 0-3 of a tensormap held in shared memory (smem_desc); prob_stride[0] is never read.
+CUTE_HOST_DEVICE
+void
+tma_descriptor_replace_dims_strides_in_shared_mem(TmaDescriptor                 & smem_desc,
+                                                  cute::array<uint32_t, 5> const& prob_shape,
+                                                  cute::array<uint64_t, 5> const& prob_stride)
+{
+#if defined(CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED)
+  uint32_t smem_int_desc = cast_smem_ptr_to_uint(&smem_desc);
+  uint64_t smem_int64_desc = 0;  // written by the cvt below, so it is bound as an output ("=l") and must not be const
+  asm volatile (
+    "cvt.u64.u32 %0, %1;"
+    : "=l"(smem_int64_desc) : "r"(smem_int_desc));
+  asm volatile (
+    "tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [%0], 0, %1;"
+    :: "l"(smem_int64_desc), "r"(prob_shape[0]) : "memory");
+  asm volatile (
+    "tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [%0], 1, %1;"
+    :: "l"(smem_int64_desc), "r"(prob_shape[1]) : "memory");
+  asm volatile (
+    "tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [%0], 2, %1;"
+    :: "l"(smem_int64_desc), "r"(prob_shape[2]) : "memory");
+  asm volatile (
+    "tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [%0], 3, %1;"
+    :: "l"(smem_int64_desc), "r"(prob_shape[3]) : "memory");
+  asm volatile (
+    "tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [%0], 4, %1;"
+    :: "l"(smem_int64_desc), "r"(prob_shape[4]) : "memory");
+  // Strides must be a multiple of 16. Also, stride for the innermost dimension is implicitly 1 (stride field i takes prob_stride[i+1])
+  #if ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 5)))
+  asm volatile (
+    "tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], 0, %1;"
+    :: "l"(smem_int64_desc), "l"(prob_stride[1]) : "memory");
+  asm volatile (
+    "tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], 1, %1;"
+    :: "l"(smem_int64_desc), "l"(prob_stride[2]) : "memory");
+  asm volatile (
+    "tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], 2, %1;"
+    :: "l"(smem_int64_desc), "l"(prob_stride[3]) : "memory");
+  asm volatile (
+    "tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], 3, %1;"
+    :: "l"(smem_int64_desc), "l"(prob_stride[4]) : "memory");
+  #else
+  // 4 LSBs are not included: before CUDA 12.5 the stride is passed pre-shifted right by 4
+  asm volatile (
+    "tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], 0, %1;"
+    :: "l"(smem_int64_desc), "l"(prob_stride[1] >> 4) : "memory");
+  asm volatile (
+    "tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], 1, %1;"
+    :: "l"(smem_int64_desc), "l"(prob_stride[2] >> 4) : "memory");
+  asm volatile (
+    "tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], 2, %1;"
+    :: "l"(smem_int64_desc), "l"(prob_stride[3] >> 4) : "memory");
+  asm volatile (
+    "tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], 3, %1;"
+    :: "l"(smem_int64_desc), "l"(prob_stride[4] >> 4) : "memory");
+  #endif
+#else
+  CUTE_INVALID_CONTROL_PATH("Using TMA Descriptor modification without CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED and CUDA 12.3");
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+/// Perform a fused copy and fence operation (needed when modifying tensormap in shared memory)
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+CUTE_HOST_DEVICE  // Fused 128-byte tensormap copy from smem_desc to *gmem_desc_ptr plus a release fence (one PTX instruction).
+void
+tma_descriptor_cp_fence_release(TmaDescriptor const* gmem_desc_ptr, TmaDescriptor& smem_desc)
+{
+#if defined(CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED)
+  uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(gmem_desc_ptr);  // [%0]: the .global operand
+  uint32_t smem_int_desc = cast_smem_ptr_to_uint(&smem_desc);          // [%1]: the .shared::cta operand
+  asm volatile (
+    "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.gpu.sync.aligned [%0], [%1], 128;"
+    :: "l"(gmem_int_desc), "r"(smem_int_desc) : "memory");  // "memory": a release must not let earlier writes be moved after it
+#else
+  CUTE_INVALID_CONTROL_PATH("Using TMA Descriptor modification without CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED and CUDA 12.3");
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+/// Perform a release fence operation (needed when modifying tensormap directly in GMEM)
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+CUTE_HOST_DEVICE  // Issues a gpu-scope release fence on the tensormap proxy; takes no operands.
+void
+tma_descriptor_fence_release()
+{
+#if defined(CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED)
+  asm volatile ("fence.proxy.tensormap::generic.release.gpu;" ::: "memory");  // "memory": keep earlier writes ahead of the fence, as the acquire variant already does
+#else
+  CUTE_INVALID_CONTROL_PATH("Using TMA Descriptor modification without CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED and CUDA 12.3");
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+/// Perform an acquire fence operation
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Acquire-side tensormap proxy fence (gpu scope) over the 128 bytes addressed by desc_ptr.
+CUTE_HOST_DEVICE void
+tma_descriptor_fence_acquire(TmaDescriptor const* desc_ptr)
+{
+#if !defined(CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED)
+  CUTE_INVALID_CONTROL_PATH("Using TMA Descriptor modification without CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED and CUDA 12.3");
+#else
+  // The descriptor address is handed to the PTX as a 64-bit integer operand.
+  auto const desc_addr = reinterpret_cast<uint64_t>(desc_ptr);
+  asm volatile ("fence.proxy.tensormap::generic.acquire.gpu [%0], 128;"
+                :
+                : "l"(desc_addr)
+                : "memory");
+#endif
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/arch/copy_sm90_tma.hpp b/lightllm-kernel/cutlass/include/cute/arch/copy_sm90_tma.hpp
new file mode 100755
index 000000000..fb33d63ca
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/arch/copy_sm90_tma.hpp
@@ -0,0 +1,1395 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>
+
+#include <cute/arch/config.hpp> // CUTE_ARCH_TMA_SMxx_ENABLED
+#include <cute/arch/copy.hpp>
+#include <cute/arch/copy_sm90.hpp>
+#include "cutlass/arch/synclog.hpp"
+
+namespace cute
+{
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+/// TMA_LOAD : Initiates a TMA copy from global memory to shared memory
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct SM90_TMA_LOAD_1D  // Inline-PTX wrappers: 1-coordinate tensor-map load (copy) and its L2 prefetch (PREFETCH::copy).
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint64_t cache_hint,
+       void      * smem_ptr,
+       int32_t const& crd0)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);  // descriptor address as a 64-bit "l" operand
+    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr);       // 32-bit "r" operands produced by cast_smem_ptr_to_uint
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr);  // passes this source line (__LINE__) to the synclog hook
+    asm volatile (
+      "cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint"
+      " [%0], [%1, {%3}], [%2], %4;"  // %0 smem_ptr, %1 desc, %2 mbar, %3 crd0, %4 cache_hint
+      :
+      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
+        "r"(crd0), "l"(cache_hint)
+      : "memory");
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
+#endif
+  }
+
+  struct PREFETCH  // Takes only the descriptor and coordinate: no shared-memory destination, mbarrier or cache hint.
+  {
+    CUTE_HOST_DEVICE static void
+    copy(void const* desc_ptr,
+         int32_t const& crd0)
+    {
+  #if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+      uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+      asm volatile (
+        "cp.async.bulk.prefetch.tensor.1d.L2.global"
+        " [%0, {%1}];"  // %0 desc, %1 crd0
+        :
+        : "l"(gmem_int_desc),
+          "r"(crd0)
+        : "memory");
+  #else
+      CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
+  #endif
+    }
+  };
+};
+
+struct SM90_TMA_LOAD_2D  // Inline-PTX wrappers: 2-coordinate tensor-map load (copy) and its L2 prefetch (PREFETCH::copy).
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint64_t cache_hint,
+       void      * smem_ptr,
+       int32_t const& crd0, int32_t const& crd1)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);  // descriptor address as a 64-bit "l" operand
+    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr);       // 32-bit "r" operands produced by cast_smem_ptr_to_uint
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr);  // passes this source line (__LINE__) to the synclog hook
+    asm volatile (
+      "cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint"
+      " [%0], [%1, {%3, %4}], [%2], %5;"  // %0 smem_ptr, %1 desc, %2 mbar, %3-%4 crd0-crd1, %5 cache_hint
+      :
+      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
+        "r"(crd0), "r"(crd1), "l"(cache_hint)
+      : "memory");
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
+#endif
+  }
+
+  struct PREFETCH  // Takes only the descriptor and coordinates: no shared-memory destination, mbarrier or cache hint.
+  {
+    CUTE_HOST_DEVICE static void
+    copy(void const* desc_ptr,
+         int32_t const& crd0, int32_t const& crd1)
+    {
+  #if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+      uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+      asm volatile (
+        "cp.async.bulk.prefetch.tensor.2d.L2.global"
+        " [%0, {%1, %2}];"  // %0 desc, %1-%2 crd0-crd1
+        :
+        : "l"(gmem_int_desc),
+          "r"(crd0), "r"(crd1)
+        : "memory");
+  #else
+      CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
+  #endif
+    }
+  };
+};
+
+struct SM90_TMA_LOAD_3D  // Inline-PTX wrappers: 3-coordinate tensor-map load (copy) and its L2 prefetch (PREFETCH::copy).
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint64_t cache_hint,
+       void      * smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);  // descriptor address as a 64-bit "l" operand
+    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr);       // 32-bit "r" operands produced by cast_smem_ptr_to_uint
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr);  // passes this source line (__LINE__) to the synclog hook
+    asm volatile (
+      "cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint"
+      " [%0], [%1, {%3, %4, %5}], [%2], %6;"  // %0 smem_ptr, %1 desc, %2 mbar, %3-%5 crd0-crd2, %6 cache_hint
+      :
+      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
+        "r"(crd0), "r"(crd1), "r"(crd2), "l"(cache_hint)
+      : "memory");
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
+#endif
+  }
+
+  struct PREFETCH  // Takes only the descriptor and coordinates: no shared-memory destination, mbarrier or cache hint.
+  {
+    CUTE_HOST_DEVICE static void
+    copy(void const* desc_ptr,
+         int32_t const& crd0, int32_t const& crd1, int32_t const& crd2)
+    {
+  #if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+      uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+      asm volatile (
+        "cp.async.bulk.prefetch.tensor.3d.L2.global"
+        " [%0, {%1, %2, %3}];"  // %0 desc, %1-%3 crd0-crd2
+        :
+        : "l"(gmem_int_desc),
+          "r"(crd0), "r"(crd1), "r"(crd2)
+        : "memory");
+  #else
+      CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
+  #endif
+    }
+  };
+};
+
+struct SM90_TMA_LOAD_4D  // Inline-PTX wrappers: 4-coordinate tensor-map load (copy) and its L2 prefetch (PREFETCH::copy).
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint64_t cache_hint,
+       void      * smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);  // descriptor address as a 64-bit "l" operand
+    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr);       // 32-bit "r" operands produced by cast_smem_ptr_to_uint
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr);  // passes this source line (__LINE__) to the synclog hook
+    asm volatile (
+      "cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint"
+      " [%0], [%1, {%3, %4, %5, %6}], [%2], %7;"  // %0 smem_ptr, %1 desc, %2 mbar, %3-%6 crd0-crd3, %7 cache_hint
+      :
+      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
+        "r"(crd0), "r"(crd1), "r"(crd2), "r"(crd3), "l"(cache_hint)
+      : "memory");
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
+#endif
+  }
+
+  struct PREFETCH  // Takes only the descriptor and coordinates: no shared-memory destination, mbarrier or cache hint.
+  {
+    CUTE_HOST_DEVICE static void
+    copy(void const* desc_ptr,
+         int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3)
+    {
+  #if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+      uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+      asm volatile (
+        "cp.async.bulk.prefetch.tensor.4d.L2.global"
+        " [%0, {%1, %2, %3, %4}];"  // %0 desc, %1-%4 crd0-crd3
+        :
+        : "l"(gmem_int_desc),
+          "r"(crd0), "r"(crd1), "r"(crd2), "r"(crd3)
+        : "memory");
+  #else
+      CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
+  #endif
+    }
+  };
+};
+
+struct SM90_TMA_LOAD_5D  // Inline-PTX wrappers: 5-coordinate tensor-map load (copy) and its L2 prefetch (PREFETCH::copy).
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint64_t cache_hint,
+       void      * smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3, int32_t const& crd4)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);  // descriptor address as a 64-bit "l" operand
+    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr);       // 32-bit "r" operands produced by cast_smem_ptr_to_uint
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr);  // passes this source line (__LINE__) to the synclog hook
+    asm volatile (
+      "cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint"
+      " [%0], [%1, {%3, %4, %5, %6, %7}], [%2], %8;"  // %0 smem_ptr, %1 desc, %2 mbar, %3-%7 crd0-crd4, %8 cache_hint
+      :
+      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
+        "r"(crd0), "r"(crd1), "r"(crd2), "r"(crd3), "r"(crd4), "l"(cache_hint)
+      : "memory");
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
+#endif
+  }
+
+  struct PREFETCH  // Takes only the descriptor and coordinates: no shared-memory destination, mbarrier or cache hint.
+  {
+    CUTE_HOST_DEVICE static void
+    copy(void const* desc_ptr,
+         int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3, int32_t const& crd4)
+    {
+  #if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+      uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+      asm volatile (
+        "cp.async.bulk.prefetch.tensor.5d.L2.global"
+        " [%0, {%1, %2, %3, %4, %5}];"  // %0 desc, %1-%5 crd0-crd4
+        :
+        : "l"(gmem_int_desc),
+          "r"(crd0), "r"(crd1), "r"(crd2), "r"(crd3), "r"(crd4)
+        : "memory");
+  #else
+      CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
+  #endif
+    }
+  };
+};
+
+// Rank-generic front end: each overload forwards to the SM90_TMA_LOAD_<N>D struct matching its coordinate count.
+struct SM90_TMA_LOAD
+{
+  // 1 coordinate
+  CUTE_HOST_DEVICE static void
+  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint64_t cache_hint,
+       void* smem_ptr, int32_t const& crd0)
+  {
+    SM90_TMA_LOAD_1D::copy(desc_ptr, mbar_ptr, cache_hint, smem_ptr, crd0);
+  }
+  // 2 coordinates
+  CUTE_HOST_DEVICE static void
+  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint64_t cache_hint,
+       void* smem_ptr, int32_t const& crd0, int32_t const& crd1)
+  {
+    SM90_TMA_LOAD_2D::copy(desc_ptr, mbar_ptr, cache_hint, smem_ptr, crd0, crd1);
+  }
+  // 3 coordinates
+  CUTE_HOST_DEVICE static void
+  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint64_t cache_hint,
+       void* smem_ptr, int32_t const& crd0, int32_t const& crd1, int32_t const& crd2)
+  {
+    SM90_TMA_LOAD_3D::copy(desc_ptr, mbar_ptr, cache_hint, smem_ptr, crd0, crd1, crd2);
+  }
+  // 4 coordinates
+  CUTE_HOST_DEVICE static void
+  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint64_t cache_hint,
+       void* smem_ptr, int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3)
+  {
+    SM90_TMA_LOAD_4D::copy(desc_ptr, mbar_ptr, cache_hint, smem_ptr, crd0, crd1, crd2, crd3);
+  }
+  // 5 coordinates
+  CUTE_HOST_DEVICE static void
+  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint64_t cache_hint, void* smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3, int32_t const& crd4)
+  {
+    SM90_TMA_LOAD_5D::copy(desc_ptr, mbar_ptr, cache_hint, smem_ptr, crd0, crd1, crd2, crd3, crd4);
+  }
+  struct PREFETCH
+  {
+    // 1 coordinate
+    CUTE_HOST_DEVICE static void
+    copy(void const* desc_ptr, int32_t const& crd0)
+    {
+      SM90_TMA_LOAD_1D::PREFETCH::copy(desc_ptr, crd0);
+    }
+    // 2 coordinates
+    CUTE_HOST_DEVICE static void
+    copy(void const* desc_ptr, int32_t const& crd0, int32_t const& crd1)
+    {
+      SM90_TMA_LOAD_2D::PREFETCH::copy(desc_ptr, crd0, crd1);
+    }
+    // 3 coordinates
+    CUTE_HOST_DEVICE static void
+    copy(void const* desc_ptr, int32_t const& crd0, int32_t const& crd1, int32_t const& crd2)
+    {
+      SM90_TMA_LOAD_3D::PREFETCH::copy(desc_ptr, crd0, crd1, crd2);
+    }
+    // 4 coordinates
+    CUTE_HOST_DEVICE static void
+    copy(void const* desc_ptr, int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3)
+    {
+      SM90_TMA_LOAD_4D::PREFETCH::copy(desc_ptr, crd0, crd1, crd2, crd3);
+    }
+    // 5 coordinates
+    CUTE_HOST_DEVICE static void
+    copy(void const* desc_ptr, int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3, int32_t const& crd4)
+    {
+      SM90_TMA_LOAD_5D::PREFETCH::copy(desc_ptr, crd0, crd1, crd2, crd3, crd4);
+    }
+  };
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+/// TMA_LOAD im2col: Initiates a TMA copy, in im2col mode, from global memory to shared memory
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct SM90_TMA_LOAD_IM2COL_3D  // Inline-PTX wrappers: im2col-mode load with 3 coordinates and 1 offset, plus its L2 prefetch.
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* desc_ptr, uint64_t* mbar_ptr,
+       void      * smem_ptr,
+       int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_n,
+       uint16_t const& offset_w)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);  // descriptor address as a 64-bit "l" operand
+    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr);       // 32-bit "r" operands produced by cast_smem_ptr_to_uint
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr);  // passes this source line (__LINE__) to the synclog hook
+    // Copy from global to shared::cluster.
+    asm volatile (
+      "cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes"
+      " [%0], [%1, {%3, %4, %5}], [%2], {%6};"  // %0 smem_ptr, %1 desc, %2 mbar, %3-%5 coord_c/w/n, %6 offset_w
+      :
+      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
+        "r"(coord_c), "r"(coord_w), "r"(coord_n),
+        "h"(offset_w)  // offsets use the 16-bit "h" constraint
+      : "memory");
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
+#endif
+  }
+
+  struct PREFETCH  // Takes only the descriptor, coordinates and offset: no shared-memory destination or mbarrier.
+  {
+    CUTE_HOST_DEVICE static void
+    copy(void const* desc_ptr,
+         int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_n,
+         uint16_t const& offset_w)
+    {
+  #if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+      uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+      asm volatile (
+        "cp.async.bulk.prefetch.tensor.3d.L2.global.im2col"
+        " [%0, {%1, %2, %3}], {%4};"  // %0 desc, %1-%3 coord_c/w/n, %4 offset_w
+        :
+        : "l"(gmem_int_desc),
+          "r"(coord_c), "r"(coord_w), "r"(coord_n),
+          "h"(offset_w)
+        : "memory");
+  #else
+      CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
+  #endif
+    }
+  };
+};
+
+struct SM90_TMA_LOAD_IM2COL_4D  // Inline-PTX wrappers: im2col-mode load with 4 coordinates and 2 offsets, plus its L2 prefetch.
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* desc_ptr, uint64_t* mbar_ptr,
+       void      * smem_ptr,
+       int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_n,
+       uint16_t const& offset_w, uint16_t const& offset_h)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);  // descriptor address as a 64-bit "l" operand
+    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr);       // 32-bit "r" operands produced by cast_smem_ptr_to_uint
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr);  // passes this source line (__LINE__) to the synclog hook
+    // Copy from global to shared::cluster.
+    asm volatile (
+      "cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes"
+      " [%0], [%1, {%3, %4, %5, %6}], [%2], {%7, %8};"  // %0 smem_ptr, %1 desc, %2 mbar, %3-%6 coord_c/w/h/n, %7-%8 offset_w/h
+      :
+      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
+        "r"(coord_c), "r"(coord_w), "r"(coord_h), "r"(coord_n),
+        "h"(offset_w), "h"(offset_h)  // offsets use the 16-bit "h" constraint
+      : "memory");
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
+#endif
+  }
+
+  struct PREFETCH  // Takes only the descriptor, coordinates and offsets: no shared-memory destination or mbarrier.
+  {
+    CUTE_HOST_DEVICE static void
+    copy(void const* desc_ptr,
+         int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_n,
+         uint16_t const& offset_w, uint16_t const& offset_h)
+    {
+  #if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+      uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+      asm volatile (
+        "cp.async.bulk.prefetch.tensor.4d.L2.global.im2col"
+        " [%0, {%1, %2, %3, %4}], {%5, %6};"  // %0 desc, %1-%4 coord_c/w/h/n, %5-%6 offset_w/h
+        :
+        : "l"(gmem_int_desc),
+          "r"(coord_c), "r"(coord_w), "r"(coord_h), "r"(coord_n),
+          "h"(offset_w), "h"(offset_h)
+        : "memory");
+  #else
+      CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
+  #endif
+    }
+  };
+};
+
+struct SM90_TMA_LOAD_IM2COL_5D  // Inline-PTX wrappers: im2col-mode load with 5 coordinates and 3 offsets, plus its L2 prefetch.
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* desc_ptr, uint64_t* mbar_ptr,
+       void      * smem_ptr,
+       int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_d, int32_t const& coord_n,
+       uint16_t const& offset_w, uint16_t const& offset_h, uint16_t const& offset_d)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);  // descriptor address as a 64-bit "l" operand
+    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr);       // 32-bit "r" operands produced by cast_smem_ptr_to_uint
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr);  // passes this source line (__LINE__) to the synclog hook
+    // Copy from global to shared::cluster.
+    asm volatile (
+      "cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes"
+      " [%0], [%1, {%3, %4, %5, %6, %7}], [%2], {%8, %9, %10};"  // %0 smem_ptr, %1 desc, %2 mbar, %3-%7 coord_c/w/h/d/n, %8-%10 offset_w/h/d
+      :
+      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
+        "r"(coord_c), "r"(coord_w), "r"(coord_h), "r"(coord_d), "r"(coord_n),
+        "h"(offset_w), "h"(offset_h), "h"(offset_d)  // offsets use the 16-bit "h" constraint
+      : "memory");
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
+#endif
+  }
+
+  struct PREFETCH  // Takes only the descriptor, coordinates and offsets: no shared-memory destination or mbarrier.
+  {
+    CUTE_HOST_DEVICE static void
+    copy(void const* desc_ptr,
+         int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_d, int32_t const& coord_n,
+         uint16_t const& offset_w, uint16_t const& offset_h, uint16_t const& offset_d)
+    {
+  #if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+      uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+      asm volatile (
+        "cp.async.bulk.prefetch.tensor.5d.L2.global.im2col"
+        " [%0, {%1, %2, %3, %4, %5}], {%6, %7, %8};"  // %0 desc, %1-%5 coord_c/w/h/d/n, %6-%8 offset_w/h/d
+        :
+        : "l"(gmem_int_desc),
+          "r"(coord_c), "r"(coord_w), "r"(coord_h), "r"(coord_d), "r"(coord_n),
+          "h"(offset_w), "h"(offset_h), "h"(offset_d)
+        : "memory");
+  #else
+      CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
+  #endif
+    }
+  };
+};
+
+// Rank-generic im2col front end: each overload forwards to the SM90_TMA_LOAD_IM2COL_<N>D struct matching its coordinate count.
+struct SM90_TMA_LOAD_IM2COL
+{
+  // 3 coordinates (c, w, n) and 1 offset
+  CUTE_HOST_DEVICE static void
+  copy(void const* desc_ptr, uint64_t* mbar_ptr, void* smem_ptr,
+       int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_n,
+       uint16_t const& offset_w)
+  {
+    SM90_TMA_LOAD_IM2COL_3D::copy(desc_ptr, mbar_ptr, smem_ptr,
+                                  coord_c, coord_w, coord_n,
+                                  offset_w);
+  }
+  // 4 coordinates (c, w, h, n) and 2 offsets
+  CUTE_HOST_DEVICE static void
+  copy(void const* desc_ptr, uint64_t* mbar_ptr, void* smem_ptr,
+       int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_n,
+       uint16_t const& offset_w, uint16_t const& offset_h)
+  {
+    SM90_TMA_LOAD_IM2COL_4D::copy(desc_ptr, mbar_ptr, smem_ptr,
+                                  coord_c, coord_w, coord_h, coord_n,
+                                  offset_w, offset_h);
+  }
+  // 5 coordinates (c, w, h, d, n) and 3 offsets
+  CUTE_HOST_DEVICE static void
+  copy(void const* desc_ptr, uint64_t* mbar_ptr, void* smem_ptr,
+       int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_d, int32_t const& coord_n,
+       uint16_t const& offset_w, uint16_t const& offset_h, uint16_t const& offset_d)
+  {
+    SM90_TMA_LOAD_IM2COL_5D::copy(desc_ptr, mbar_ptr, smem_ptr,
+                                  coord_c, coord_w, coord_h, coord_d, coord_n,
+                                  offset_w, offset_h, offset_d);
+  }
+  struct PREFETCH
+  {
+    // 3 coordinates (c, w, n) and 1 offset
+    CUTE_HOST_DEVICE static void
+    copy(void const* desc_ptr, int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_n,
+         uint16_t const& offset_w)
+    {
+      SM90_TMA_LOAD_IM2COL_3D::PREFETCH::copy(desc_ptr,
+                                              coord_c, coord_w, coord_n,
+                                              offset_w);
+    }
+    // 4 coordinates (c, w, h, n) and 2 offsets
+    CUTE_HOST_DEVICE static void
+    copy(void const* desc_ptr, int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_n,
+         uint16_t const& offset_w, uint16_t const& offset_h)
+    {
+      SM90_TMA_LOAD_IM2COL_4D::PREFETCH::copy(desc_ptr,
+                                              coord_c, coord_w, coord_h, coord_n,
+                                              offset_w, offset_h);
+    }
+    // 5 coordinates (c, w, h, d, n) and 3 offsets
+    CUTE_HOST_DEVICE static void
+    copy(void const* desc_ptr, int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_d, int32_t const& coord_n,
+         uint16_t const& offset_w, uint16_t const& offset_h, uint16_t const& offset_d)
+    {
+      SM90_TMA_LOAD_IM2COL_5D::PREFETCH::copy(desc_ptr,
+                                              coord_c, coord_w, coord_h, coord_d, coord_n,
+                                              offset_w, offset_h, offset_d);
+    }
+  };
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+/// TMA_LOAD_MULTICAST: Initiates a multicast TMA copy (.multicast::cluster, with a multicast mask) from global memory to shared memory
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct SM90_TMA_LOAD_MULTICAST_1D  // 1D tensor TMA load (global -> shared::cluster) with multicast mask and L2 cache hint
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask, uint64_t cache_hint,
+       void      * smem_ptr,
+       int32_t const& crd0)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);  // descriptor pointer passed as a 64-bit operand
+    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr);  // mbarrier pointer as a 32-bit operand
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);  // destination pointer as a 32-bit operand
+    cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr);
+    asm volatile (
+      "cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint"
+      " [%0], [%1, {%4}], [%2], %3, %5;"  // %0 dst smem, %1 descriptor, %2 mbarrier, %3 multicast mask, %4 crd0, %5 cache hint
+      :
+      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
+        "h"(multicast_mask),
+        "r"(crd0), "l"(cache_hint)
+      : "memory");
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");  // taken when the SM90 TMA path is not compiled in
+#endif
+  }
+};
+
+struct SM90_TMA_LOAD_MULTICAST_2D  // 2D tensor TMA load (global -> shared::cluster) with multicast mask and L2 cache hint
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask, uint64_t cache_hint,
+       void      * smem_ptr,
+       int32_t const& crd0, int32_t const& crd1)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr);
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr);
+    asm volatile (
+      "cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint"
+      " [%0], [%1, {%4, %5}], [%2], %3, %6;"  // %0 dst smem, %1 descriptor, %2 mbarrier, %3 multicast mask, %4-%5 coords, %6 cache hint
+      :
+      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
+        "h"(multicast_mask),
+        "r"(crd0), "r"(crd1), "l"(cache_hint)
+      : "memory");
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");  // taken when the SM90 TMA path is not compiled in
+#endif
+  }
+};
+
+struct SM90_TMA_LOAD_MULTICAST_3D  // 3D tensor TMA load (global -> shared::cluster) with multicast mask and L2 cache hint
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask, uint64_t cache_hint,
+       void      * smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr);
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr);
+    asm volatile (
+      "cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint"
+      " [%0], [%1, {%4, %5, %6}], [%2], %3, %7;"  // %0 dst smem, %1 descriptor, %2 mbarrier, %3 multicast mask, %4-%6 coords, %7 cache hint
+      :
+      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
+        "h"(multicast_mask),
+        "r"(crd0), "r"(crd1), "r"(crd2), "l"(cache_hint)
+      : "memory");
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");  // taken when the SM90 TMA path is not compiled in
+#endif
+  }
+};
+
+struct SM90_TMA_LOAD_MULTICAST_4D  // 4D tensor TMA load (global -> shared::cluster) with multicast mask and L2 cache hint
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask, uint64_t cache_hint,
+       void      * smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr);
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr);
+    asm volatile (
+      "cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint"
+      " [%0], [%1, {%4, %5, %6, %7}], [%2], %3, %8;"  // %0 dst smem, %1 descriptor, %2 mbarrier, %3 multicast mask, %4-%7 coords, %8 cache hint
+      :
+      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
+        "h"(multicast_mask),
+        "r"(crd0), "r"(crd1), "r"(crd2),  "r"(crd3), "l"(cache_hint)
+      : "memory");
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");  // taken when the SM90 TMA path is not compiled in
+#endif
+  }
+};
+
+struct SM90_TMA_LOAD_MULTICAST_5D  // 5D tensor TMA load (global -> shared::cluster) with multicast mask and L2 cache hint
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask, uint64_t cache_hint,
+       void      * smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3, int32_t const& crd4)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr);
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr);
+    asm volatile (
+      "cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint"
+      " [%0], [%1, {%4, %5, %6, %7, %8}], [%2], %3, %9;"  // %0 dst smem, %1 descriptor, %2 mbarrier, %3 multicast mask, %4-%8 coords, %9 cache hint
+      :
+      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
+        "h"(multicast_mask),
+        "r"(crd0), "r"(crd1), "r"(crd2), "r"(crd3), "r"(crd4), "l"(cache_hint)
+      : "memory");
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");  // taken when the SM90 TMA path is not compiled in
+#endif
+  }
+};
+
+struct SM90_TMA_LOAD_MULTICAST  // rank dispatcher: the number of coordinates picks the _1D.._5D implementation
+{
+  CUTE_HOST_DEVICE static void  // rank 1
+  copy(void const* desc_ptr, uint64_t* mbar_ptr,
+       uint16_t multicast_mask, uint64_t cache_hint, void* smem_ptr,
+       int32_t const& crd0)
+  {
+    SM90_TMA_LOAD_MULTICAST_1D::copy(desc_ptr, mbar_ptr, multicast_mask, cache_hint, smem_ptr, crd0);
+  }
+  CUTE_HOST_DEVICE static void  // rank 2
+  copy(void const* desc_ptr, uint64_t* mbar_ptr,
+       uint16_t multicast_mask, uint64_t cache_hint, void* smem_ptr,
+       int32_t const& crd0, int32_t const& crd1)
+  {
+    SM90_TMA_LOAD_MULTICAST_2D::copy(desc_ptr, mbar_ptr, multicast_mask, cache_hint, smem_ptr, crd0, crd1);
+  }
+  CUTE_HOST_DEVICE static void  // rank 3
+  copy(void const* desc_ptr, uint64_t* mbar_ptr,
+       uint16_t multicast_mask, uint64_t cache_hint, void* smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2)
+  {
+    SM90_TMA_LOAD_MULTICAST_3D::copy(desc_ptr, mbar_ptr, multicast_mask, cache_hint, smem_ptr, crd0, crd1, crd2);
+  }
+  CUTE_HOST_DEVICE static void  // rank 4
+  copy(void const* desc_ptr, uint64_t* mbar_ptr,
+       uint16_t multicast_mask, uint64_t cache_hint, void* smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3)
+  {
+    SM90_TMA_LOAD_MULTICAST_4D::copy(desc_ptr, mbar_ptr, multicast_mask, cache_hint, smem_ptr, crd0, crd1, crd2, crd3);
+  }
+  CUTE_HOST_DEVICE static void  // rank 5
+  copy(void const* desc_ptr, uint64_t* mbar_ptr,
+       uint16_t multicast_mask, uint64_t cache_hint, void* smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3, int32_t const& crd4)
+  {
+    SM90_TMA_LOAD_MULTICAST_5D::copy(desc_ptr, mbar_ptr, multicast_mask, cache_hint, smem_ptr, crd0, crd1, crd2, crd3, crd4);
+  }
+
+  using PREFETCH = typename SM90_TMA_LOAD::PREFETCH;  // prefetch reuses the non-multicast load's PREFETCH
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+/// TMA_LOAD_MULTICAST im2col: Initiates a TMA copy, in im2col mode, from global memory to shared memory
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct SM90_TMA_LOAD_IM2COL_MULTICAST_3D  // 3D im2col-mode TMA load (global -> shared::cluster) with multicast mask; no cache-hint operand
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask,
+       void      * smem_ptr,
+       int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_n,
+       uint16_t const& offset_w)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr);
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr);
+    // Copy from global to shared::cluster.
+    asm volatile (
+      "cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster"
+      " [%0], [%1, {%3, %4, %5}], [%2], {%6}, %7;"  // %0 dst smem, %1 descriptor, %2 mbarrier, %3-%5 coords (c,w,n), %6 offset_w, %7 multicast mask
+      :
+      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
+        "r"(coord_c), "r"(coord_w), "r"(coord_n),
+        "h"(offset_w),
+        "h"(multicast_mask)
+      : "memory");
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");  // taken when the SM90 TMA path is not compiled in
+#endif
+  }
+};
+
+struct SM90_TMA_LOAD_IM2COL_MULTICAST_4D  // 4D im2col-mode TMA load (global -> shared::cluster) with multicast mask; no cache-hint operand
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask,
+       void      * smem_ptr,
+       int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_n,
+       uint16_t const& offset_w, uint16_t const& offset_h)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr);
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr);
+    // Copy from global to shared::cluster.
+    asm volatile (
+      "cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster"
+      " [%0], [%1, {%3, %4, %5, %6}], [%2], {%7, %8}, %9;"  // %0 dst smem, %1 descriptor, %2 mbarrier, %3-%6 coords (c,w,h,n), %7-%8 offsets (w,h), %9 multicast mask
+      :
+      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
+        "r"(coord_c), "r"(coord_w), "r"(coord_h), "r"(coord_n),
+        "h"(offset_w), "h"(offset_h),
+        "h"(multicast_mask)
+      : "memory");
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");  // taken when the SM90 TMA path is not compiled in
+#endif
+  }
+};
+
+struct SM90_TMA_LOAD_IM2COL_MULTICAST_5D  // 5D im2col-mode TMA load (global -> shared::cluster) with multicast mask; no cache-hint operand
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask,
+       void      * smem_ptr,
+       int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_d, int32_t const& coord_n,
+       uint16_t const& offset_w, uint16_t const& offset_h, uint16_t const& offset_d)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr);
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr);
+    // Copy from global to shared::cluster.
+    asm volatile (
+      "cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster"
+      " [%0], [%1, {%3, %4, %5, %6, %7}], [%2], {%8, %9, %10}, %11;"  // %0 dst smem, %1 descriptor, %2 mbarrier, %3-%7 coords (c,w,h,d,n), %8-%10 offsets (w,h,d), %11 multicast mask
+      :
+      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
+        "r"(coord_c), "r"(coord_w), "r"(coord_h), "r"(coord_d), "r"(coord_n),
+        "h"(offset_w), "h"(offset_h), "h"(offset_d),
+        "h"(multicast_mask)
+      : "memory");
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");  // taken when the SM90 TMA path is not compiled in
+#endif
+  }
+};
+
+struct SM90_TMA_LOAD_IM2COL_MULTICAST  // rank dispatcher: the coordinate/offset count picks the _3D.._5D implementation
+{
+  CUTE_HOST_DEVICE static void  // 3D: coords (c, w, n) + offset (w)
+  copy(void const* desc_ptr, uint64_t* mbar_ptr,
+       uint16_t multicast_mask, void* smem_ptr,
+       int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_n,
+       uint16_t const& offset_w)
+  {
+    SM90_TMA_LOAD_IM2COL_MULTICAST_3D::copy(
+        desc_ptr, mbar_ptr, multicast_mask, smem_ptr,
+        coord_c, coord_w, coord_n,
+        offset_w);
+  }
+
+  CUTE_HOST_DEVICE static void  // 4D: coords (c, w, h, n) + offsets (w, h)
+  copy(void const* desc_ptr, uint64_t* mbar_ptr,
+       uint16_t multicast_mask, void* smem_ptr,
+       int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_n,
+       uint16_t const& offset_w, uint16_t const& offset_h)
+  {
+    SM90_TMA_LOAD_IM2COL_MULTICAST_4D::copy(
+        desc_ptr, mbar_ptr, multicast_mask, smem_ptr,
+        coord_c, coord_w, coord_h, coord_n,
+        offset_w, offset_h);
+  }
+
+  CUTE_HOST_DEVICE static void  // 5D: coords (c, w, h, d, n) + offsets (w, h, d)
+  copy(void const* desc_ptr, uint64_t* mbar_ptr,
+       uint16_t multicast_mask, void* smem_ptr,
+       int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_d, int32_t const& coord_n,
+       uint16_t const& offset_w, uint16_t const& offset_h, uint16_t const& offset_d)
+  {
+    SM90_TMA_LOAD_IM2COL_MULTICAST_5D::copy(
+        desc_ptr, mbar_ptr, multicast_mask, smem_ptr,
+        coord_c, coord_w, coord_h, coord_d, coord_n,
+        offset_w, offset_h, offset_d);
+  }
+
+  using PREFETCH = typename SM90_TMA_LOAD_IM2COL::PREFETCH;  // prefetch reuses the non-multicast im2col PREFETCH
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+/// TMA_STORE : Initiates a TMA copy from shared memory to global memory
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct SM90_TMA_STORE_1D  // 1D tensor TMA store (shared::cta -> global), bulk_group completion
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* desc_ptr,
+       void const* smem_ptr,
+       int32_t const& crd0)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);  // descriptor pointer passed as a 64-bit operand
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);  // source pointer as a 32-bit operand
+    cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr);
+    asm volatile (
+      "cp.async.bulk.tensor.1d.global.shared::cta.bulk_group [%0, {%2}], [%1];"  // %0 descriptor, %1 src smem, %2 crd0
+      :
+      : "l"(gmem_int_desc), "r"(smem_int_ptr),
+        "r"(crd0)
+      : "memory");
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");  // taken when the SM90 TMA path is not compiled in
+#endif
+  }
+};
+
+struct SM90_TMA_STORE_2D  // 2D tensor TMA store (shared::cta -> global), bulk_group completion
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* desc_ptr,
+       void const* smem_ptr,
+       int32_t const& crd0, int32_t const& crd1)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr);
+    asm volatile (
+      "cp.async.bulk.tensor.2d.global.shared::cta.bulk_group [%0, {%2, %3}], [%1];"  // %0 descriptor, %1 src smem, %2-%3 coords
+      :
+      : "l"(gmem_int_desc), "r"(smem_int_ptr),
+        "r"(crd0), "r"(crd1)
+      : "memory");
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");  // taken when the SM90 TMA path is not compiled in
+#endif
+  }
+};
+
+struct SM90_TMA_STORE_3D  // 3D tensor TMA store (shared::cta -> global), bulk_group completion
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* desc_ptr,
+       void const* smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr);
+    asm volatile (
+      "cp.async.bulk.tensor.3d.global.shared::cta.bulk_group [%0, {%2, %3, %4}], [%1];"  // %0 descriptor, %1 src smem, %2-%4 coords
+      :
+      : "l"(gmem_int_desc), "r"(smem_int_ptr),
+        "r"(crd0), "r"(crd1), "r"(crd2)
+      : "memory");
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");  // taken when the SM90 TMA path is not compiled in
+#endif
+  }
+};
+
+struct SM90_TMA_STORE_4D  // 4D tensor TMA store (shared::cta -> global), bulk_group completion
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* desc_ptr,
+       void const* smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr);
+    asm volatile (
+      "cp.async.bulk.tensor.4d.global.shared::cta.bulk_group [%0, {%2, %3, %4, %5}], [%1];"  // %0 descriptor, %1 src smem, %2-%5 coords
+      :
+      : "l"(gmem_int_desc), "r"(smem_int_ptr),
+        "r"(crd0), "r"(crd1), "r"(crd2), "r"(crd3)
+      : "memory");
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");  // taken when the SM90 TMA path is not compiled in
+#endif
+  }
+};
+
+struct SM90_TMA_STORE_5D  // 5D tensor TMA store (shared::cta -> global), bulk_group completion
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* desc_ptr,
+       void const* smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3, int32_t const& crd4)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr);
+    asm volatile (
+      "cp.async.bulk.tensor.5d.global.shared::cta.bulk_group [%0, {%2, %3, %4, %5, %6}], [%1];"  // %0 descriptor, %1 src smem, %2-%6 coords
+      :
+      : "l"(gmem_int_desc), "r"(smem_int_ptr),
+        "r"(crd0), "r"(crd1), "r"(crd2), "r"(crd3), "r"(crd4)
+      : "memory");
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");  // taken when the SM90 TMA path is not compiled in
+#endif
+  }
+};
+
+struct SM90_TMA_STORE  // rank dispatcher: the number of coordinates picks the _1D.._5D implementation
+{
+  CUTE_HOST_DEVICE static void  // rank 1
+  copy(void const* desc_ptr, void const* smem_ptr,
+       int32_t const& crd0)
+  {
+    using Impl = SM90_TMA_STORE_1D;
+    Impl::copy(desc_ptr, smem_ptr, crd0);
+  }
+  CUTE_HOST_DEVICE static void  // rank 2
+  copy(void const* desc_ptr, void const* smem_ptr,
+       int32_t const& crd0, int32_t const& crd1)
+  {
+    using Impl = SM90_TMA_STORE_2D;
+    Impl::copy(desc_ptr, smem_ptr, crd0, crd1);
+  }
+  CUTE_HOST_DEVICE static void  // rank 3
+  copy(void const* desc_ptr, void const* smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2)
+  {
+    using Impl = SM90_TMA_STORE_3D;
+    Impl::copy(desc_ptr, smem_ptr, crd0, crd1, crd2);
+  }
+  CUTE_HOST_DEVICE static void  // rank 4
+  copy(void const* desc_ptr, void const* smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3)
+  {
+    using Impl = SM90_TMA_STORE_4D;
+    Impl::copy(desc_ptr, smem_ptr, crd0, crd1, crd2, crd3);
+  }
+  CUTE_HOST_DEVICE static void  // rank 5
+  copy(void const* desc_ptr, void const* smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3, int32_t const& crd4)
+  {
+    using Impl = SM90_TMA_STORE_5D;
+    Impl::copy(desc_ptr, smem_ptr, crd0, crd1, crd2, crd3, crd4);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+/// TMA_STORE im2col: Initiates a TMA copy, in im2col mode, from shared memory to global memory
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct SM90_TMA_STORE_IM2COL_3D  // 3D im2col TMA store (shared::cta -> global); uses the im2col_no_offs form, so no offset operands
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* desc_ptr,
+       void const* smem_ptr,
+       int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_n)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr);
+    asm volatile (
+      "cp.async.bulk.tensor.3d.global.shared::cta.im2col_no_offs.bulk_group"
+      " [%0, {%2, %3, %4}], [%1];"  // %0 descriptor, %1 src smem, %2-%4 coords (c,w,n)
+      :
+      : "l"(gmem_int_desc), "r"(smem_int_ptr),
+        "r"(coord_c), "r"(coord_w), "r"(coord_n)
+      : "memory");
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");  // taken when the SM90 TMA path is not compiled in
+#endif
+  }
+};
+
+struct SM90_TMA_STORE_IM2COL_4D  // 4D im2col TMA store (shared::cta -> global); uses the im2col_no_offs form, so no offset operands
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* desc_ptr,
+       void const* smem_ptr,
+       int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_n)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr);
+    asm volatile (
+      "cp.async.bulk.tensor.4d.global.shared::cta.im2col_no_offs.bulk_group"
+      " [%0, {%2, %3, %4, %5}], [%1];"  // %0 descriptor, %1 src smem, %2-%5 coords (c,w,h,n)
+      :
+      : "l"(gmem_int_desc), "r"(smem_int_ptr),
+        "r"(coord_c), "r"(coord_w), "r"(coord_h), "r"(coord_n)
+      : "memory");
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");  // taken when the SM90 TMA path is not compiled in
+#endif
+  }
+};
+
+struct SM90_TMA_STORE_IM2COL_5D  // 5D im2col TMA store (shared::cta -> global); uses the im2col_no_offs form, so no offset operands
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* desc_ptr,
+       void const* smem_ptr,
+       int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_d, int32_t const& coord_n)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr);
+    asm volatile (
+      "cp.async.bulk.tensor.5d.global.shared::cta.im2col_no_offs.bulk_group"
+      " [%0, {%2, %3, %4, %5, %6}], [%1];"  // %0 descriptor, %1 src smem, %2-%6 coords (c,w,h,d,n)
+      :
+      : "l"(gmem_int_desc), "r"(smem_int_ptr),
+        "r"(coord_c), "r"(coord_w), "r"(coord_h), "r"(coord_d), "r"(coord_n)
+      : "memory");
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");  // taken when the SM90 TMA path is not compiled in
+#endif
+  }
+};
+
+struct SM90_TMA_STORE_IM2COL  // rank dispatcher: the number of coordinates picks the _3D.._5D implementation
+{
+  CUTE_HOST_DEVICE static void  // 3D: coords (c, w, n)
+  copy(void const* desc_ptr, void const* smem_ptr,
+       int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_n)
+  {
+    using Impl = SM90_TMA_STORE_IM2COL_3D;
+    Impl::copy(desc_ptr, smem_ptr, coord_c, coord_w, coord_n);
+  }
+  CUTE_HOST_DEVICE static void  // 4D: coords (c, w, h, n)
+  copy(void const* desc_ptr, void const* smem_ptr,
+       int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_n)
+  {
+    using Impl = SM90_TMA_STORE_IM2COL_4D;
+    Impl::copy(desc_ptr, smem_ptr, coord_c, coord_w, coord_h, coord_n);
+  }
+  CUTE_HOST_DEVICE static void  // 5D: coords (c, w, h, d, n)
+  copy(void const* desc_ptr, void const* smem_ptr,
+       int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_d, int32_t const& coord_n)
+  {
+    using Impl = SM90_TMA_STORE_IM2COL_5D;
+    Impl::copy(desc_ptr, smem_ptr, coord_c, coord_w, coord_h, coord_d, coord_n);
+  }
+};
+
+// Fence for smem stores for subsequent TMA_STORE. Body is empty when neither CUTE_ARCH_TMA_SM90_ENABLED nor __CUDA_ARCH__ is defined.
+CUTE_HOST_DEVICE static void
+tma_store_fence() {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    cutlass::arch::synclog_emit_fence_view_async_shared(__LINE__);
+    asm volatile ("fence.proxy.async.shared::cta;");  // no "memory" clobber here, unlike the copy wrappers in this file
+#elif defined(__CUDA_ARCH__)
+    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");  // only when __CUDA_ARCH__ is defined but the TMA path is not enabled
+#endif
+}
+
+// Indicate arrival of warp issuing TMA_STORE by issuing cp.async.bulk.commit_group.
+CUTE_HOST_DEVICE static void
+tma_store_arrive() {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    cutlass::arch::synclog_emit_tma_store_arrive(__LINE__);
+    asm volatile("cp.async.bulk.commit_group;");  // presumably closes the bulk_group that preceding TMA stores joined - confirm against PTX docs
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");  // unlike tma_store_fence, this branch has no __CUDA_ARCH__ guard
+#endif
+}
+
+// Wait until at most Count committed TMA_STOREs are pending and all prior commits are complete. Count is a compile-time immediate.
+template <int Count>
+CUTE_HOST_DEVICE static void
+tma_store_wait() {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    asm volatile(
+      "cp.async.bulk.wait_group.read %0;"  // NOTE(review): the .read form is used - confirm in the PTX docs exactly what completion it guarantees
+      :
+      : "n"(Count)
+      : "memory");
+    cutlass::arch::synclog_emit_tma_store_wait(__LINE__, Count);  // synclog is emitted after the wait, not before it
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");  // taken when the SM90 TMA path is not compiled in
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+/// TMA_REDUCE_ADD : Initiates a TMA reduce-add from shared memory to global memory
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct SM90_TMA_REDUCE_ADD_1D  // 1D tensor TMA reduce-add (shared::cta -> global), bulk_group completion
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* const desc_ptr,
+       void const* const smem_ptr,
+       int32_t const& crd0)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);  // descriptor pointer passed as a 64-bit operand
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);  // source pointer as a 32-bit operand
+    cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr);  // logged through the tma_store synclog hook
+    asm volatile (
+      "cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.bulk_group [%0, {%2}], [%1];"  // %0 descriptor, %1 src smem, %2 crd0
+      :
+      : "l"(gmem_int_desc), "r"(smem_int_ptr),
+        "r"(crd0)
+      : "memory");
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");  // taken when the SM90 TMA path is not compiled in
+#endif
+  }
+};
+
+struct SM90_TMA_REDUCE_ADD_2D  // 2D tensor TMA reduce-add (shared::cta -> global), bulk_group completion
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* const desc_ptr,
+       void const* const smem_ptr,
+       int32_t const& crd0, int32_t const& crd1)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr);  // logged through the tma_store synclog hook
+    asm volatile (
+      "cp.reduce.async.bulk.tensor.2d.global.shared::cta.add.bulk_group [%0, {%2, %3}], [%1];"  // %0 descriptor, %1 src smem, %2-%3 coords
+      :
+      : "l"(gmem_int_desc), "r"(smem_int_ptr),
+        "r"(crd0), "r"(crd1)
+      : "memory");
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");  // taken when the SM90 TMA path is not compiled in
+#endif
+  }
+};
+
+struct SM90_TMA_REDUCE_ADD_3D  // 3D tensor TMA reduce-add (shared::cta -> global), bulk_group completion
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* const desc_ptr,
+       void const* const smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr);  // logged through the tma_store synclog hook
+    asm volatile (
+      "cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.bulk_group [%0, {%2, %3, %4}], [%1];"  // %0 descriptor, %1 src smem, %2-%4 coords
+      :
+      : "l"(gmem_int_desc), "r"(smem_int_ptr),
+        "r"(crd0), "r"(crd1), "r"(crd2)
+      : "memory");
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");  // taken when the SM90 TMA path is not compiled in
+#endif
+  }
+};
+
+struct SM90_TMA_REDUCE_ADD_4D  // 4D tensor TMA reduce-add (shared::cta -> global), bulk_group completion
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* const desc_ptr,
+       void const* const smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr);  // logged through the tma_store synclog hook
+    asm volatile (
+      "cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.bulk_group [%0, {%2, %3, %4, %5}], [%1];"  // %0 descriptor, %1 src smem, %2-%5 coords
+      :
+      : "l"(gmem_int_desc), "r"(smem_int_ptr),
+        "r"(crd0), "r"(crd1), "r"(crd2), "r"(crd3)
+      : "memory");
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");  // taken when the SM90 TMA path is not compiled in
+#endif
+  }
+};
+
+struct SM90_TMA_REDUCE_ADD_5D  // 5D tensor TMA reduce-add (shared::cta -> global), bulk_group completion
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* const desc_ptr,
+       void const* const smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3, int32_t const& crd4)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
+    cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr);  // logged through the tma_store synclog hook
+    asm volatile (
+      "cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.bulk_group [%0, {%2, %3, %4, %5, %6}], [%1];"  // %0 descriptor, %1 src smem, %2-%6 coords
+      :
+      : "l"(gmem_int_desc), "r"(smem_int_ptr),
+        "r"(crd0), "r"(crd1), "r"(crd2), "r"(crd3), "r"(crd4)
+      : "memory");
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");  // taken when the SM90 TMA path is not compiled in
+#endif
+  }
+};
+
+struct SM90_TMA_REDUCE_ADD  // rank dispatcher: the number of coordinates picks the _1D.._5D implementation
+{
+  CUTE_HOST_DEVICE static void  // rank 1
+  copy(void const* const desc_ptr, void const* const smem_ptr,
+       int32_t const& crd0)
+  {
+    using Impl = SM90_TMA_REDUCE_ADD_1D;
+    Impl::copy(desc_ptr, smem_ptr, crd0);
+  }
+  CUTE_HOST_DEVICE static void  // rank 2
+  copy(void const* const desc_ptr, void const* const smem_ptr,
+       int32_t const& crd0, int32_t const& crd1)
+  {
+    using Impl = SM90_TMA_REDUCE_ADD_2D;
+    Impl::copy(desc_ptr, smem_ptr, crd0, crd1);
+  }
+  CUTE_HOST_DEVICE static void  // rank 3
+  copy(void const* const desc_ptr, void const* const smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2)
+  {
+    using Impl = SM90_TMA_REDUCE_ADD_3D;
+    Impl::copy(desc_ptr, smem_ptr, crd0, crd1, crd2);
+  }
+  CUTE_HOST_DEVICE static void  // rank 4
+  copy(void const* const desc_ptr, void const* const smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3)
+  {
+    using Impl = SM90_TMA_REDUCE_ADD_4D;
+    Impl::copy(desc_ptr, smem_ptr, crd0, crd1, crd2, crd3);
+  }
+  CUTE_HOST_DEVICE static void  // rank 5
+  copy(void const* const desc_ptr, void const* const smem_ptr,
+       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3, int32_t const& crd4)
+  {
+    using Impl = SM90_TMA_REDUCE_ADD_5D;
+    Impl::copy(desc_ptr, smem_ptr, crd0, crd1, crd2, crd3, crd4);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+/// BULK_COPY : Copy a bulk of memory between shared memory and global memory
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct SM90_BULK_COPY_G2S  // non-tensor bulk copy, global -> shared::cluster, completion via mbarrier (complete_tx::bytes)
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* gmem_ptr, uint64_t* mbar_ptr,
+       void      * smem_ptr, int32_t load_bytes)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr);  // mbarrier pointer as a 32-bit operand
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);  // destination pointer as a 32-bit operand
+    asm volatile("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];\n"  // %0 dst smem, %1 src gmem, %2 byte count, %3 mbarrier
+                     :
+                     : "r"(smem_int_ptr), "l"(gmem_ptr), "r"(load_bytes), "r"(smem_int_mbar)
+                     : "memory");
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use BULK_COPY without CUTE_ARCH_TMA_SM90_ENABLED.");  // taken when the SM90 TMA path is not compiled in
+#endif
+  }
+
+  struct PREFETCH  // L2 prefetch of the same global range; takes no smem or mbarrier operand
+  {
+    CUTE_HOST_DEVICE static void
+    copy(void const* gmem_ptr, int32_t load_bytes)
+    {
+  #if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+      asm volatile("cp.async.bulk.prefetch.L2.global [%0], %1;\n"  // %0 src gmem, %1 byte count
+                      :
+                      : "l"(gmem_ptr), "r"(load_bytes)
+                      : "memory");
+  #else
+      CUTE_INVALID_CONTROL_PATH("Trying to use BULK_COPY without CUTE_ARCH_TMA_SM90_ENABLED.");  // taken when the SM90 TMA path is not compiled in
+  #endif
+    }
+  };
+};
+
+struct SM90_BULK_COPY_S2G  // non-tensor bulk copy, shared::cta -> global, bulk_group completion
+{
+  CUTE_HOST_DEVICE static void
+  copy(void const* smem_ptr,
+       void      * gmem_ptr, int32_t store_bytes)
+  {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);  // source pointer as a 32-bit operand
+    asm volatile("cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], %2;\n"  // %0 dst gmem, %1 src smem, %2 byte count
+                     :
+                     : "l"(gmem_ptr), "r"(smem_int_ptr), "r"(store_bytes)
+                     : "memory");
+#else
+    CUTE_INVALID_CONTROL_PATH("Trying to use BULK_COPY without CUTE_ARCH_TMA_SM90_ENABLED.");  // taken when the SM90 TMA path is not compiled in
+#endif
+  }
+};
+
+struct SM90_BULK_COPY_AUTO {};  // empty tag type with no copy(); presumably resolved to G2S/S2G elsewhere - TODO confirm
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/arch/mma.hpp b/lightllm-kernel/cutlass/include/cute/arch/mma.hpp
new file mode 100755
index 000000000..6e06114a6
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/arch/mma.hpp
@@ -0,0 +1,64 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>           // CUTE_HOST_DEVICE
+#include <cute/numeric/complex.hpp>  // cute::fma
+#include <cute/numeric/real.hpp>     // cute::fma
+
+namespace cute
+{
+
+//
+// Direct FMA for any type: UniversalFMA<D,A,B,C>::fma forwards to a free function fma(d, a, b, c).
+//
+
+template <class D, class A = D, class B = A, class C = D>  // A defaults to D, B to A, C to D
+struct UniversalFMA
+{
+  using DRegisters = D[1];  // every operand is described as a one-element array
+  using ARegisters = A[1];
+  using BRegisters = B[1];
+  using CRegisters = C[1];
+
+  CUTE_HOST_DEVICE static constexpr void
+  fma(D      & d,
+      A const& a,
+      B const& b,
+      C const& c)
+  {
+    // Unqualified call: ADL may select a type-specific fma; cute::fma is made visible as the fallback.
+    using cute::fma;
+    fma(d, a, b, c);
+  }
+};
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/arch/mma_sm61.hpp b/lightllm-kernel/cutlass/include/cute/arch/mma_sm61.hpp
new file mode 100755
index 000000000..f7bcb7d19
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/arch/mma_sm61.hpp
@@ -0,0 +1,87 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include <cute/config.hpp>
+#include <cute/arch/mma.hpp>
+
+// Config: the SM61 paths are enabled only when __CUDA_ARCH__ is defined and >= 610.
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 610))
+#  define CUTE_ARCH_MMA_SM61_ENABLED
+#endif
+
+namespace cute
+{
+
+struct SM61_DP4A  // Wraps the dp4a.s32.s32 PTX instruction behind a scalar fma(d, a, b, c) interface.
+{
+  using DRegisters = int32_t[1];
+  using ARegisters = uint32_t[1];
+  using BRegisters = uint32_t[1];
+  using CRegisters = int32_t[1];
+
+  // d is the only output ("=r"); a, b and c are 32-bit register inputs in that operand order.
+  CUTE_HOST_DEVICE static void
+  fma(int32_t& d, uint32_t const& a, uint32_t const& b, int32_t const& c)
+  {
+#if defined(CUTE_ARCH_MMA_SM61_ENABLED)
+    asm volatile("dp4a.s32.s32 %0, %1, %2, %3;"
+                 : "=r"(d)
+                 : "r"(a), "r"(b), "r"(c));
+#else  // CUTE_ARCH_MMA_SM61_ENABLED not defined: the call is rejected via CUTE_INVALID_CONTROL_PATH.
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM61_DP4A without CUTE_ARCH_MMA_SM61_ENABLED");
+#endif
+  }
+};
+
+struct SM61_DP2A  // Wraps a dp2a.s32.s32 PTX instruction behind a scalar fma(d, a, b, c) interface.
+{
+  using DRegisters = int32_t[1];
+  using ARegisters = uint32_t[1];
+  using BRegisters = uint32_t[1];
+  using CRegisters = int32_t[1];
+
+  // NOTE(review): the PTX ISA documents a .lo/.hi mode qualifier on dp2a that this string omits -- confirm it assembles.
+  CUTE_HOST_DEVICE static void
+  fma(int32_t& d, uint32_t const& a, uint32_t const& b, int32_t const& c)
+  {
+#if defined(CUTE_ARCH_MMA_SM61_ENABLED)
+    asm volatile("dp2a.s32.s32 %0, %1, %2, %3;"
+                 : "=r"(d)
+                 : "r"(a), "r"(b), "r"(c));
+#else  // CUTE_ARCH_MMA_SM61_ENABLED not defined: the call is rejected via CUTE_INVALID_CONTROL_PATH.
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM61_DP2A without CUTE_ARCH_MMA_SM61_ENABLED");
+#endif
+  }
+};
+
+} // namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/arch/mma_sm70.hpp b/lightllm-kernel/cutlass/include/cute/arch/mma_sm70.hpp
new file mode 100755
index 000000000..63d96cf5d
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/arch/mma_sm70.hpp
@@ -0,0 +1,329 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>
+
+#include <cute/arch/mma.hpp>
+
+// Config: SUPPORTED requires __CUDACC_VER_* >= 10.1; ENABLED additionally requires __CUDA_ARCH__ >= 700.
+#if ((__CUDACC_VER_MAJOR__ > 10) || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))
+#  define CUTE_ARCH_MMA_SM70_SUPPORTED
+#  if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700))
+#    define CUTE_ARCH_MMA_SM70_ENABLED
+#  endif
+#endif
+
+namespace cute
+{
+
+//
+// SM70 MMA 884 F16F16F16
+//
+
+struct SM70_8x8x4_F16F16F16F16_TN  // mma.sync m8n8k4, A .row / B .col, f16 operands carried in 32-bit registers
+{
+  using DRegisters = uint32_t[4];
+  using ARegisters = uint32_t[2];
+  using BRegisters = uint32_t[2];
+  using CRegisters = uint32_t[4];
+
+  // d0-d3 are outputs ("=r"); a0-a1, b0-b1 and c0-c3 are inputs, bound to %0-%11 in that order.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& a0, uint32_t const& a1,
+      uint32_t const& b0, uint32_t const& b1,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_SM70_ENABLED)
+    asm volatile("mma.sync.aligned.m8n8k4.row.col.f16.f16.f16.f16"  // NOTE(review): literals concatenate with no separator before '{'
+                 "{%0, %1,  %2,  %3},"      // D
+                 "{%4, %5},"                // A
+                 "{%6, %7},"                // B
+                 "{%8, %9, %10, %11};\n"    // C
+        : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+        :  "r"(a0),  "r"(a1),
+           "r"(b0),  "r"(b1),
+           "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
+#else  // CUTE_ARCH_MMA_SM70_ENABLED not defined: the call is rejected via CUTE_INVALID_CONTROL_PATH.
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM70_8x8x4_F16F16F16F16_TN without CUTE_ARCH_MMA_SM70_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct SM70_8x8x4_F16F16F16F16_NT  // mma.sync m8n8k4, A .col / B .row, f16 operands carried in 32-bit registers
+{
+  using DRegisters = uint32_t[4];
+  using ARegisters = uint32_t[2];
+  using BRegisters = uint32_t[2];
+  using CRegisters = uint32_t[4];
+
+  // d0-d3 are outputs ("=r"); a0-a1, b0-b1 and c0-c3 are inputs, bound to %0-%11 in that order.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& a0, uint32_t const& a1,
+      uint32_t const& b0, uint32_t const& b1,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_SM70_ENABLED)
+    asm volatile("mma.sync.aligned.m8n8k4.col.row.f16.f16.f16.f16"
+                 "{%0, %1,  %2,  %3},"      // D
+                 "{%4, %5},"                // A
+                 "{%6, %7},"                // B
+                 "{%8, %9, %10, %11};\n"    // C
+        : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+        :  "r"(a0),  "r"(a1),
+           "r"(b0),  "r"(b1),
+           "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
+#else  // CUTE_ARCH_MMA_SM70_ENABLED not defined: the call is rejected via CUTE_INVALID_CONTROL_PATH.
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM70_8x8x4_F16F16F16F16_NT without CUTE_ARCH_MMA_SM70_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct SM70_8x8x4_F16F16F16F16_NN  // mma.sync m8n8k4, A .col / B .col, f16 operands carried in 32-bit registers
+{
+  using DRegisters = uint32_t[4];
+  using ARegisters = uint32_t[2];
+  using BRegisters = uint32_t[2];
+  using CRegisters = uint32_t[4];
+
+  // d0-d3 are outputs ("=r"); a0-a1, b0-b1 and c0-c3 are inputs, bound to %0-%11 in that order.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& a0, uint32_t const& a1,
+      uint32_t const& b0, uint32_t const& b1,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_SM70_ENABLED)
+    asm volatile("mma.sync.aligned.m8n8k4.col.col.f16.f16.f16.f16"
+                 "{%0, %1,  %2,  %3},"      // D
+                 "{%4, %5},"                // A
+                 "{%6, %7},"                // B
+                 "{%8, %9, %10, %11};\n"    // C
+        : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+        :  "r"(a0),  "r"(a1),
+           "r"(b0),  "r"(b1),
+           "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
+#else  // CUTE_ARCH_MMA_SM70_ENABLED not defined: the call is rejected via CUTE_INVALID_CONTROL_PATH.
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM70_8x8x4_F16F16F16F16_NN without CUTE_ARCH_MMA_SM70_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct SM70_8x8x4_F16F16F16F16_TT  // mma.sync m8n8k4, A .row / B .row, f16 operands carried in 32-bit registers
+{
+  using DRegisters = uint32_t[4];
+  using ARegisters = uint32_t[2];
+  using BRegisters = uint32_t[2];
+  using CRegisters = uint32_t[4];
+
+  // d0-d3 are outputs ("=r"); a0-a1, b0-b1 and c0-c3 are inputs, bound to %0-%11 in that order.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& a0, uint32_t const& a1,
+      uint32_t const& b0, uint32_t const& b1,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_SM70_ENABLED)
+    asm volatile("mma.sync.aligned.m8n8k4.row.row.f16.f16.f16.f16"
+                 "{%0, %1,  %2,  %3},"      // D
+                 "{%4, %5},"                // A
+                 "{%6, %7},"                // B
+                 "{%8, %9, %10, %11};\n"    // C
+        : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+        :  "r"(a0),  "r"(a1),
+           "r"(b0),  "r"(b1),
+           "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
+#else  // CUTE_ARCH_MMA_SM70_ENABLED not defined: the call is rejected via CUTE_INVALID_CONTROL_PATH.
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM70_8x8x4_F16F16F16F16_TT without CUTE_ARCH_MMA_SM70_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+//
+// SM70 MMA 884 F16F16F32
+//
+
+struct SM70_8x8x4_F32F16F16F32_TN  // mma.sync m8n8k4, A .row / B .col, f16 inputs with f32 C/D operands
+{
+  using DRegisters = float[8];
+  using ARegisters = uint32_t[2];
+  using BRegisters = uint32_t[2];
+  using CRegisters = float[8];
+
+  // d0-d7 are float outputs ("=f"); a0-a1, b0-b1 ("r") and c0-c7 ("f") are inputs, bound to %0-%19 in order.
+  CUTE_HOST_DEVICE static void
+  fma(float         & d0, float         & d1, float      & d2, float      & d3,
+      float         & d4, float         & d5, float      & d6, float      & d7,
+      uint32_t const& a0, uint32_t const& a1,
+      uint32_t const& b0, uint32_t const& b1,
+      float    const& c0, float    const& c1, float const& c2, float const& c3,
+      float    const& c4, float    const& c5, float const& c6, float const& c7)
+  {
+#if defined(CUTE_ARCH_MMA_SM70_ENABLED)
+    asm volatile("mma.sync.aligned.m8n8k4.row.col.f32.f16.f16.f32"
+                 "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"        // D
+                 "{%8,  %9},"                                      // A
+                 "{%10, %11},"                                     // B
+                 "{%12, %13, %14, %15, %16, %17, %18, %19};\n"     // C
+        : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3),
+          "=f"(d4), "=f"(d5), "=f"(d6), "=f"(d7)
+        :  "r"(a0),  "r"(a1),
+           "r"(b0),  "r"(b1),
+           "f"(c0),  "f"(c1),  "f"(c2),  "f"(c3),
+           "f"(c4),  "f"(c5),  "f"(c6),  "f"(c7));
+#else  // CUTE_ARCH_MMA_SM70_ENABLED not defined: the call is rejected via CUTE_INVALID_CONTROL_PATH.
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM70_8x8x4_F32F16F16F32_TN without CUTE_ARCH_MMA_SM70_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct SM70_8x8x4_F32F16F16F32_NT  // mma.sync m8n8k4, A .col / B .row, f16 inputs with f32 C/D operands
+{
+  using DRegisters = float[8];
+  using ARegisters = uint32_t[2];
+  using BRegisters = uint32_t[2];
+  using CRegisters = float[8];
+
+  // d0-d7 are float outputs ("=f"); a0-a1, b0-b1 ("r") and c0-c7 ("f") are inputs, bound to %0-%19 in order.
+  CUTE_HOST_DEVICE static void
+  fma(float         & d0, float         & d1, float      & d2, float      & d3,
+      float         & d4, float         & d5, float      & d6, float      & d7,
+      uint32_t const& a0, uint32_t const& a1,
+      uint32_t const& b0, uint32_t const& b1,
+      float    const& c0, float    const& c1, float const& c2, float const& c3,
+      float    const& c4, float    const& c5, float const& c6, float const& c7)
+  {
+#if defined(CUTE_ARCH_MMA_SM70_ENABLED)
+    asm volatile("mma.sync.aligned.m8n8k4.col.row.f32.f16.f16.f32"
+                 "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"        // D
+                 "{%8,  %9},"                                      // A
+                 "{%10, %11},"                                     // B
+                 "{%12, %13, %14, %15, %16, %17, %18, %19};"       // C
+        : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3),
+          "=f"(d4), "=f"(d5), "=f"(d6), "=f"(d7)
+        :  "r"(a0),  "r"(a1),
+           "r"(b0),  "r"(b1),
+           "f"(c0),  "f"(c1),  "f"(c2),  "f"(c3),
+           "f"(c4),  "f"(c5),  "f"(c6),  "f"(c7));
+#else  // CUTE_ARCH_MMA_SM70_ENABLED not defined: the call is rejected via CUTE_INVALID_CONTROL_PATH.
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM70_8x8x4_F32F16F16F32_NT without CUTE_ARCH_MMA_SM70_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct SM70_8x8x4_F32F16F16F32_NN  // mma.sync m8n8k4, A .col / B .col, f16 inputs with f32 C/D operands
+{
+  using DRegisters = float[8];
+  using ARegisters = uint32_t[2];
+  using BRegisters = uint32_t[2];
+  using CRegisters = float[8];
+
+  // d0-d7 are float outputs ("=f"); a0-a1, b0-b1 ("r") and c0-c7 ("f") are inputs, bound to %0-%19 in order.
+  CUTE_HOST_DEVICE static void
+  fma(float         & d0, float         & d1, float      & d2, float      & d3,
+      float         & d4, float         & d5, float      & d6, float      & d7,
+      uint32_t const& a0, uint32_t const& a1,
+      uint32_t const& b0, uint32_t const& b1,
+      float    const& c0, float    const& c1, float const& c2, float const& c3,
+      float    const& c4, float    const& c5, float const& c6, float const& c7)
+  {
+#if defined(CUTE_ARCH_MMA_SM70_ENABLED)
+    asm volatile("mma.sync.aligned.m8n8k4.col.col.f32.f16.f16.f32"
+                 "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"        // D
+                 "{%8,  %9},"                                      // A
+                 "{%10, %11},"                                     // B
+                 "{%12, %13, %14, %15, %16, %17, %18, %19};"       // C
+        : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3),
+          "=f"(d4), "=f"(d5), "=f"(d6), "=f"(d7)
+        :  "r"(a0),  "r"(a1),
+           "r"(b0),  "r"(b1),
+           "f"(c0),  "f"(c1),  "f"(c2),  "f"(c3),
+           "f"(c4),  "f"(c5),  "f"(c6),  "f"(c7));
+#else  // CUTE_ARCH_MMA_SM70_ENABLED not defined: the call is rejected via CUTE_INVALID_CONTROL_PATH.
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM70_8x8x4_F32F16F16F32_NN without CUTE_ARCH_MMA_SM70_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct SM70_8x8x4_F32F16F16F32_TT  // mma.sync m8n8k4, A .row / B .row, f16 inputs with f32 C/D operands
+{
+  using DRegisters = float[8];
+  using ARegisters = uint32_t[2];
+  using BRegisters = uint32_t[2];
+  using CRegisters = float[8];
+
+  // d0-d7 are float outputs ("=f"); a0-a1, b0-b1 ("r") and c0-c7 ("f") are inputs, bound to %0-%19 in order.
+  CUTE_HOST_DEVICE static void
+  fma(float         & d0, float         & d1, float      & d2, float      & d3,
+      float         & d4, float         & d5, float      & d6, float      & d7,
+      uint32_t const& a0, uint32_t const& a1,
+      uint32_t const& b0, uint32_t const& b1,
+      float    const& c0, float    const& c1, float const& c2, float const& c3,
+      float    const& c4, float    const& c5, float const& c6, float const& c7)
+  {
+#if defined(CUTE_ARCH_MMA_SM70_ENABLED)
+    asm volatile("mma.sync.aligned.m8n8k4.row.row.f32.f16.f16.f32"
+                 "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"        // D
+                 "{%8,  %9},"                                      // A
+                 "{%10, %11},"                                     // B
+                 "{%12, %13, %14, %15, %16, %17, %18, %19};"       // C
+        : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3),
+          "=f"(d4), "=f"(d5), "=f"(d6), "=f"(d7)
+        :  "r"(a0),  "r"(a1),
+           "r"(b0),  "r"(b1),
+           "f"(c0),  "f"(c1),  "f"(c2),  "f"(c3),
+           "f"(c4),  "f"(c5),  "f"(c6),  "f"(c7));
+#else  // CUTE_ARCH_MMA_SM70_ENABLED not defined: the call is rejected via CUTE_INVALID_CONTROL_PATH.
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM70_8x8x4_F32F16F16F32_TT without CUTE_ARCH_MMA_SM70_ENABLED");
+#endif
+  }
+
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/arch/mma_sm75.hpp b/lightllm-kernel/cutlass/include/cute/arch/mma_sm75.hpp
new file mode 100755
index 000000000..c33f7b391
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/arch/mma_sm75.hpp
@@ -0,0 +1,120 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>
+
+#include <cute/arch/mma.hpp>
+
+// Config: SUPPORTED requires __CUDACC_VER_* >= 10.2; ENABLED additionally requires __CUDA_ARCH__ >= 750.
+#if ((__CUDACC_VER_MAJOR__ > 10) || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))
+#  define CUTE_ARCH_MMA_SM75_SUPPORTED
+#  if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750))
+#    define CUTE_ARCH_MMA_SM75_ENABLED
+#  endif
+#endif
+
+namespace cute
+{
+
+//
+// SM75 MMA 1688 F16F16F32
+//
+
+struct SM75_16x8x8_F32F16F16F32_TN  // mma.sync m16n8k8, A .row / B .col, f16 inputs with f32 C/D operands
+{
+  using DRegisters = float[4];
+  using ARegisters = uint32_t[2];
+  using BRegisters = uint32_t[1];
+  using CRegisters = float[4];
+
+  // d0-d3 are float outputs ("=f"); a0-a1, b0 ("r") and c0-c3 ("f") are inputs, bound to %0-%10 in order.
+  CUTE_HOST_DEVICE static void
+  fma(float         & d0, float         & d1, float      & d2, float      & d3,
+      uint32_t const& a0, uint32_t const& a1,
+      uint32_t const& b0,
+      float    const& c0, float    const& c1, float const& c2, float const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_SM75_ENABLED)
+    asm volatile("mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32"
+                 "{%0, %1, %2, %3},"      // D
+                 "{%4, %5},"              // A
+                 "{%6},"                  // B
+                 "{%7, %8, %9, %10};\n"   // C
+        : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3)
+        :  "r"(a0),  "r"(a1),
+           "r"(b0),
+           "f"(c0),  "f"(c1),  "f"(c2),  "f"(c3));
+#else  // CUTE_ARCH_MMA_SM75_ENABLED not defined: the call is rejected via CUTE_INVALID_CONTROL_PATH.
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM75_16x8x8_F32F16F16F32_TN without CUTE_ARCH_MMA_SM75_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+//
+// SM75 MMA 8816 S8S8S32
+//
+
+struct SM75_8x8x16_S32S8S8S32_TN  // mma.sync m8n8k16, A .row / B .col, s8 inputs with s32 C/D operands
+{
+  using DRegisters = uint32_t[2];
+  using ARegisters = uint32_t[1];
+  using BRegisters = uint32_t[1];
+  using CRegisters = uint32_t[2];
+
+  // d0-d1 are outputs ("=r"); a0, b0 and c0-c1 are inputs, bound to %0-%5 in that order.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1,
+      uint32_t const& a0,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1)
+  {
+#if defined(CUTE_ARCH_MMA_SM75_ENABLED)
+    asm volatile("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32"
+                 "{%0, %1},"      // D
+                 "{%2},"          // A
+                 "{%3},"          // B
+                 "{%4, %5};\n"    // C
+        : "=r"(d0), "=r"(d1)
+        :  "r"(a0),
+           "r"(b0),
+           "r"(c0),  "r"(c1));
+#else  // CUTE_ARCH_MMA_SM75_ENABLED not defined: the call is rejected via CUTE_INVALID_CONTROL_PATH.
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM75_8x8x16_S32S8S8S32_TN without CUTE_ARCH_MMA_SM75_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/arch/mma_sm80.hpp b/lightllm-kernel/cutlass/include/cute/arch/mma_sm80.hpp
new file mode 100755
index 000000000..60777f220
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/arch/mma_sm80.hpp
@@ -0,0 +1,2243 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include <cute/config.hpp>
+#include <cute/arch/mma.hpp>
+#include <cute/numeric/complex.hpp>
+
+// Config: all three macros below require __CUDA_ARCH__ to be defined and >= 800.
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
+#  define CUTE_ARCH_MMA_SM80_ENABLED
+
+#if (__CUDA_ARCH__ <= 900)  // B1 AND macro: defined for __CUDA_ARCH__ in 800..900 inclusive
+#define CUTE_ARCH_MMA_B1_AND_SM80_ENABLED
+#endif
+
+#if (__CUDA_ARCH__ <= 890)  // B1 XOR macro: defined for __CUDA_ARCH__ in 800..890 inclusive
+#define CUTE_ARCH_MMA_B1_XOR_SM80_ENABLED
+#endif
+
+#endif
+
+
+
+namespace cute {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x8 TN: mma.sync m16n8k8, A .row / B .col, f16 operands carried in 32-bit registers
+struct SM80_16x8x8_F16F16F16F16_TN
+{
+  using DRegisters = uint32_t[2];
+  using ARegisters = uint32_t[2];
+  using BRegisters = uint32_t[1];
+  using CRegisters = uint32_t[2];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1,
+      uint32_t const& a0, uint32_t const& a1,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1)  // d* are outputs; a*, b*, c* are inputs
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 "
+      "{%0, %1},"      // D
+      "{%2, %3},"      // A
+      "{%4},"          // B
+      "{%5, %6};\n"    // C
+      : "=r"(d0), "=r"(d1)
+      :  "r"(a0),  "r"(a1),
+         "r"(b0),
+         "r"(c0),  "r"(c1));
+#else  // CUTE_ARCH_MMA_SM80_ENABLED not defined: the call is rejected via CUTE_INVALID_CONTROL_PATH.
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x8_F16F16F16F16_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x16 TN: mma.sync m16n8k16, A .row / B .col, f16 operands carried in 32-bit registers
+struct SM80_16x8x16_F16F16F16F16_TN
+{
+  using DRegisters = uint32_t[2];
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint32_t[2];
+  using CRegisters = uint32_t[2];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1,
+      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint32_t const& b0, uint32_t const& b1,
+      uint32_t const& c0, uint32_t const& c1)  // d* are outputs; a*, b*, c* are inputs
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 "
+      "{%0,  %1},"              // D
+      "{%2,  %3,  %4,  %5},"    // A
+      "{%6,  %7},"              // B
+      "{%8,  %9};\n"            // C
+      : "=r"(d0), "=r"(d1)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "r"(b0),  "r"(b1),
+         "r"(c0),  "r"(c1));
+#else  // CUTE_ARCH_MMA_SM80_ENABLED not defined: the call is rejected via CUTE_INVALID_CONTROL_PATH.
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x16_F16F16F16F16_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x8 TN: mma.sync m16n8k8, A .row / B .col, f16 inputs with f32 C/D operands
+struct SM80_16x8x8_F32F16F16F32_TN
+{
+  using DRegisters = float[4];
+  using ARegisters = uint32_t[2];
+  using BRegisters = uint32_t[1];
+  using CRegisters = float[4];
+
+  CUTE_HOST_DEVICE static void
+  fma(float         & d0, float         & d1, float         & d2, float         & d3,
+      uint32_t const& a0, uint32_t const& a1,
+      uint32_t const& b0,
+      float const   & c0, float const   & c1, float const   & c2, float const   & c3)  // d* outputs; a*, b*, c* inputs
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 "
+      "{%0,  %1,  %2,  %3},"       // D
+      "{%4,  %5},"                 // A
+      "{%6},"                      // B
+      "{%7,  %8,  %9,  %10};\n"    // C
+      : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3)
+      :  "r"(a0),  "r"(a1),
+         "r"(b0),
+         "f"(c0),  "f"(c1),  "f"(c2),  "f"(c3));
+#else  // CUTE_ARCH_MMA_SM80_ENABLED not defined: the call is rejected via CUTE_INVALID_CONTROL_PATH.
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x8_F32F16F16F32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x16 TN: mma.sync m16n8k16, A .row / B .col, f16 inputs with f32 C/D operands
+struct SM80_16x8x16_F32F16F16F32_TN
+{
+  using DRegisters = float[4];
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint32_t[2];
+  using CRegisters = float[4];
+
+  CUTE_HOST_DEVICE static void
+  fma(float         & d0, float         & d1, float         & d2, float         & d3,
+      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint32_t const& b0, uint32_t const& b1,
+      float const   & c0, float const   & c1, float const   & c2, float const   & c3)  // d* outputs; a*, b*, c* inputs
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
+      "{%0,  %1,  %2,  %3},"       // D
+      "{%4,  %5,  %6,  %7},"       // A
+      "{%8,  %9},"                 // B
+      "{%10, %11, %12, %13};\n"    // C
+      : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "r"(b0),  "r"(b1),
+         "f"(c0),  "f"(c1),  "f"(c2),  "f"(c3));
+#else  // CUTE_ARCH_MMA_SM80_ENABLED not defined: the call is rejected via CUTE_INVALID_CONTROL_PATH.
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x16_F32F16F16F32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x8 TN: mma.sync m16n8k8, A .row / B .col, bf16 inputs with f32 C/D operands
+struct SM80_16x8x8_F32BF16BF16F32_TN
+{
+  using DRegisters = float[4];
+  using ARegisters = uint32_t[2];
+  using BRegisters = uint32_t[1];
+  using CRegisters = float[4];
+
+  CUTE_HOST_DEVICE static void
+  fma(float         & d0, float         & d1, float         & d2, float         & d3,
+      uint32_t const& a0, uint32_t const& a1,
+      uint32_t const& b0,
+      float const   & c0, float const   & c1, float const   & c2, float const   & c3)  // d* outputs; a*, b*, c* inputs
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m16n8k8.row.col.f32.bf16.bf16.f32 "
+      "{%0,  %1,  %2,  %3},"       // D
+      "{%4,  %5},"                 // A
+      "{%6},"                      // B
+      "{%7,  %8,  %9,  %10};\n"    // C
+      : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3)
+      :  "r"(a0),  "r"(a1),
+         "r"(b0),
+         "f"(c0),  "f"(c1),  "f"(c2),  "f"(c3));
+#else  // CUTE_ARCH_MMA_SM80_ENABLED not defined: the call is rejected via CUTE_INVALID_CONTROL_PATH.
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x8_F32BF16BF16F32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x16 TN: mma.sync m16n8k16, A .row / B .col, bf16 inputs with f32 C/D operands
+struct SM80_16x8x16_F32BF16BF16F32_TN
+{
+  using DRegisters = float[4];
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint32_t[2];
+  using CRegisters = float[4];
+
+  CUTE_HOST_DEVICE static void
+  fma(float         & d0, float         & d1, float         & d2, float         & d3,
+      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint32_t const& b0, uint32_t const& b1,
+      float const   & c0, float const   & c1, float const   & c2, float const   & c3)  // d* outputs; a*, b*, c* inputs
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
+      "{%0,  %1,  %2,  %3},"       // D
+      "{%4,  %5,  %6,  %7},"       // A
+      "{%8,  %9},"                 // B
+      "{%10, %11, %12, %13};\n"    // C
+      : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "r"(b0),  "r"(b1),
+         "f"(c0),  "f"(c1),  "f"(c2),  "f"(c3));
+#else  // CUTE_ARCH_MMA_SM80_ENABLED not defined: the call is rejected via CUTE_INVALID_CONTROL_PATH.
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x16_F32BF16BF16F32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// m16n8k4 TN MMA, D = A * B + C, with `.tf32` A/B and `.f32` C/D operands.
+struct SM80_16x8x4_F32TF32TF32F32_TN
+{
+  // Register arrays for the D, A, B and C operands of one fma() call.
+  using DRegisters = float[4];
+  using ARegisters = uint32_t[2];
+  using BRegisters = uint32_t[1];
+  using CRegisters = float[4];
+
+  // Writes d0..d3 from a0..a1, b0 and c0..c3 with one mma.sync; needs CUTE_ARCH_MMA_SM80_ENABLED.
+  CUTE_HOST_DEVICE static void
+  fma(float& d0, float& d1, float& d2, float& d3,
+      uint32_t const& a0, uint32_t const& a1,
+      uint32_t const& b0,
+      float const& c0, float const& c1, float const& c2, float const& c3)
+  {
+#if !defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x4_F32TF32TF32F32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#else
+    asm volatile("mma.sync.aligned.m16n8k4.row.col.f32.tf32.tf32.f32 "
+                 "{%0,  %1,  %2,  %3},"     // D
+                 "{%4,  %5},"               // A
+                 "{%6},"                    // B
+                 "{%7,  %8,  %9,  %10};\n"  // C
+                 : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3)
+                 : "r"(a0), "r"(a1), "r"(b0),
+                   "f"(c0), "f"(c1), "f"(c2), "f"(c3));
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// m16n8k8 TN MMA, D = A * B + C, with `.tf32` A/B and `.f32` C/D operands.
+struct SM80_16x8x8_F32TF32TF32F32_TN
+{
+  // Register arrays for the D, A, B and C operands of one fma() call.
+  using DRegisters = float[4];
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint32_t[2];
+  using CRegisters = float[4];
+
+  // Writes d0..d3 from a0..a3, b0..b1 and c0..c3 with one mma.sync; needs CUTE_ARCH_MMA_SM80_ENABLED.
+  CUTE_HOST_DEVICE static void
+  fma(float& d0, float& d1, float& d2, float& d3,
+      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint32_t const& b0, uint32_t const& b1,
+      float const& c0, float const& c1, float const& c2, float const& c3)
+  {
+#if !defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x8_F32TF32TF32F32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#else
+    asm volatile("mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 "
+                 "{%0,  %1,  %2,  %3},"     // D
+                 "{%4,  %5,  %6,  %7},"     // A
+                 "{%8,  %9},"               // B
+                 "{%10, %11, %12, %13};\n"  // C
+                 : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3)
+                 : "r"(a0), "r"(a1), "r"(a2), "r"(a3), "r"(b0), "r"(b1),
+                   "f"(c0), "f"(c1), "f"(c2), "f"(c3));
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// m8n8k4 TN MMA, D = A * B + C, with `.f64` A, B, C and D operands.
+struct SM80_8x8x4_F64F64F64F64_TN
+{
+  // Register arrays for the D, A, B and C operands of one fma() call.
+  using DRegisters = double[2];
+  using ARegisters = double[1];
+  using BRegisters = double[1];
+  using CRegisters = double[2];
+
+  // Writes d0..d1 from a0, b0 and c0..c1 with one mma.sync; needs CUTE_ARCH_MMA_SM80_ENABLED.
+  CUTE_HOST_DEVICE static void
+  fma(double& d0, double& d1,
+      double const& a0,
+      double const& b0,
+      double const& c0, double const& c1)
+  {
+#if !defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x4_F64F64F64F64_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#else
+    asm volatile("mma.sync.aligned.m8n8k4.row.col.f64.f64.f64.f64 "
+                 "{%0, %1},"    // D
+                 "{%2},"        // A
+                 "{%3},"        // B
+                 "{%4, %5};\n"  // C
+                 : "=d"(d0), "=d"(d1)
+                 : "d"(a0), "d"(b0),
+                   "d"(c0), "d"(c1));
+#endif
+  }
+};
+
+// MMA 8x8x4 TN with Planar Complex multiplication, built from four real SM80_8x8x4_F64F64F64F64_TN MMAs
+struct SM80_8x8x4_C64C64C64C64_TN
+{
+  using DRegisters = complex<double>[2];
+  using ARegisters = complex<double>[1];
+  using BRegisters = complex<double>[1];
+  using CRegisters = complex<double>[2];
+  // fma(): complex d = a * b + c; d0/d1 are written, a0, b0 and c0/c1 are only read.
+  CUTE_HOST_DEVICE static void
+  fma(complex<double>      & d0, complex<double>      & d1,
+      complex<double> const& a0,
+      complex<double> const& b0,
+      complex<double> const& c0, complex<double> const& c1)
+  {
+    // Because thrust::complex does not provide a mutable ref: view each output as double[2] ([0] real, [1] imag)
+    double& rd0 = reinterpret_cast<double(&)[2]>(d0)[0];
+    double& id0 = reinterpret_cast<double(&)[2]>(d0)[1];
+    double& rd1 = reinterpret_cast<double(&)[2]>(d1)[0];
+    double& id1 = reinterpret_cast<double(&)[2]>(d1)[1];
+
+    // Pass 1: d.real() =  a.real() * b.real() + c.real();
+    SM80_8x8x4_F64F64F64F64_TN::fma(
+      rd0, rd1,
+      a0.real(),
+      b0.real(),
+      c0.real(), c1.real());
+
+    // Pass 2: d.imag() =  a.imag() * b.real() + c.imag();
+    SM80_8x8x4_F64F64F64F64_TN::fma(
+      id0, id1,
+      a0.imag(),
+      b0.real(),
+      c0.imag(), c1.imag());
+
+    // Pass 3: d.real() = -a.imag() * b.imag() + d.real();  (accumulates onto the value written by pass 1)
+    SM80_8x8x4_F64F64F64F64_TN::fma(
+      rd0, rd1,
+      -a0.imag(),
+      b0.imag(),
+      d0.real(), d1.real());
+
+    // Pass 4: d.imag() =  a.real() * b.imag() + d.imag();  (accumulates onto the value written by pass 2)
+    SM80_8x8x4_F64F64F64F64_TN::fma(
+      id0, id1,
+      a0.real(),
+      b0.imag(),
+      d0.imag(), d1.imag());
+  }
+};
+
+// MMA 8x8x4 TN with Gaussian Complex multiplication (three real MMAs per call):
+//    (a + bi)*(c + di)
+//  accumulates
+//    t0 += a*c
+//    t1 += b*d
+//    t2 += (a+b)*(c+d)
+//  and the complex value is recovered as
+//    re = t0 - t1
+//    im = t2 - t0 - t1
+struct SM80_8x8x4_GC64C64C64GC64_TN
+{
+  // Accumulator element holding the three partial sums t0, t1, t2 described above.
+  struct GaussComplex {
+    double t0, t1, t2;
+
+    // Collapses the partial sums into (re, im).
+    CUTE_HOST_DEVICE //constexpr
+    operator complex<double>() const { return complex<double>(t0 - t1, t2 - t0 - t1); }
+
+    // Mixed operators: the GaussComplex side is converted to complex<double> first.
+    CUTE_HOST_DEVICE friend //constexpr
+    complex<double> operator*(GaussComplex const& a, complex<double> const& b) { return static_cast<complex<double>>(a) * b; }
+    CUTE_HOST_DEVICE friend //constexpr
+    complex<double> operator*(complex<double> const& a, GaussComplex const& b) { return static_cast<complex<double>>(b) * a; }
+    CUTE_HOST_DEVICE friend //constexpr
+    complex<double> operator+(GaussComplex const& a, complex<double> const& b) { return static_cast<complex<double>>(a) + b; }
+    CUTE_HOST_DEVICE friend //constexpr
+    complex<double> operator+(complex<double> const& a, GaussComplex const& b) { return static_cast<complex<double>>(b) + a; }
+  };
+
+  using DRegisters = GaussComplex[2];
+  using ARegisters = complex<double>[1];
+  using BRegisters = complex<double>[1];
+  using CRegisters = GaussComplex[2];
+
+  // Updates the three partial sums of d0/d1 from a0, b0 and the incoming sums in c0/c1.
+  CUTE_HOST_DEVICE static void
+  fma(GaussComplex& d0, GaussComplex& d1,
+      complex<double> const& a0, complex<double> const& b0,
+      GaussComplex const& c0, GaussComplex const& c1)
+  {
+    // d.t0 = re(a) * re(b) + c.t0
+    SM80_8x8x4_F64F64F64F64_TN::fma(d0.t0, d1.t0, a0.real(), b0.real(), c0.t0, c1.t0);
+
+    // d.t1 = im(a) * im(b) + c.t1
+    SM80_8x8x4_F64F64F64F64_TN::fma(d0.t1, d1.t1, a0.imag(), b0.imag(), c0.t1, c1.t1);
+
+    // d.t2 = (re(a) + im(a)) * (re(b) + im(b)) + c.t2
+    double const a_sum = a0.real() + a0.imag();
+    double const b_sum = b0.real() + b0.imag();
+    SM80_8x8x4_F64F64F64F64_TN::fma(d0.t2, d1.t2, a_sum, b_sum, c0.t2, c1.t2);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// m8n8k16 TN MMA, D = A * B + C, with `.s8` A, `.s8` B and `.s32` C/D operands.
+struct SM80_8x8x16_S32S8S8S32_TN
+{
+  // Register arrays for the D, A, B and C operands of one fma() call.
+  using DRegisters = uint32_t[2];
+  using ARegisters = uint32_t[1];
+  using BRegisters = uint32_t[1];
+  using CRegisters = uint32_t[2];
+
+  // Writes d0..d1 from a0, b0 and c0..c1 with one mma.sync; needs CUTE_ARCH_MMA_SM80_ENABLED.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t& d0, uint32_t& d1,
+      uint32_t const& a0,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1)
+  {
+#if !defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x16_S32S8S8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#else
+    asm volatile("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 "
+                 "{%0, %1},"    // D
+                 "{%2},"        // A
+                 "{%3},"        // B
+                 "{%4, %5};\n"  // C
+                 : "=r"(d0), "=r"(d1)
+                 : "r"(a0), "r"(b0),
+                   "r"(c0), "r"(c1));
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// m8n8k16 TN MMA, D = A * B + C, with `.s8` A, `.s8` B and `.s32` C/D operands; `.satfinite` form.
+struct SM80_8x8x16_S32S8S8S32_TN_SATURATE
+{
+  using DRegisters = uint32_t[2];
+  using ARegisters = uint32_t[1];
+  using BRegisters = uint32_t[1];
+  using CRegisters = uint32_t[2];
+
+  // Writes d0..d1 from a0, b0 and c0..c1 with one mma.sync; needs CUTE_ARCH_MMA_SM80_ENABLED.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1,
+      uint32_t const& a0,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    // PTX syntax puts the optional .satfinite between .row.col and the type list, not at the end.
+    asm volatile("mma.sync.aligned.m8n8k16.row.col.satfinite.s32.s8.s8.s32 "
+                 "{%0, %1},"
+                 "{%2},"
+                 "{%3},"
+                 "{%4, %5};\n"
+                 : "=r"(d0), "=r"(d1)
+                 : "r"(a0), "r"(b0),
+                   "r"(c0), "r"(c1));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x16_S32S8S8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// m16n8k16 TN MMA, D = A * B + C, with `.s8` A, `.s8` B and `.s32` C/D operands.
+struct SM80_16x8x16_S32S8S8S32_TN
+{
+  // Register arrays for the D, A, B and C operands of one fma() call.
+  using DRegisters = uint32_t[4];
+  using ARegisters = uint32_t[2];
+  using BRegisters = uint32_t[1];
+  using CRegisters = uint32_t[4];
+
+  // Writes d0..d3 from a0..a1, b0 and c0..c3 with one mma.sync; needs CUTE_ARCH_MMA_SM80_ENABLED.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t& d0, uint32_t& d1, uint32_t& d2, uint32_t& d3,
+      uint32_t const& a0, uint32_t const& a1,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if !defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x16_S32S8S8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#else
+    asm volatile("mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32 "
+                 "{%0,  %1,  %2,  %3},"     // D
+                 "{%4,  %5},"               // A
+                 "{%6},"                    // B
+                 "{%7,  %8,  %9,  %10};\n"  // C
+                 : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+                 : "r"(a0), "r"(a1), "r"(b0),
+                   "r"(c0), "r"(c1), "r"(c2), "r"(c3));
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// m16n8k16 TN MMA, D = A * B + C, with `.s8` A, `.s8` B and `.s32` C/D operands; `.satfinite` form.
+struct SM80_16x8x16_S32S8S8S32_TN_SATURATE
+{
+  using DRegisters = uint32_t[4];
+  using ARegisters = uint32_t[2];
+  using BRegisters = uint32_t[1];
+  using CRegisters = uint32_t[4];
+
+  // Writes d0..d3 from a0..a1, b0 and c0..c3 with one mma.sync; needs CUTE_ARCH_MMA_SM80_ENABLED.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& a0, uint32_t const& a1,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    // PTX syntax puts the optional .satfinite between .row.col and the type list, not at the end.
+    asm volatile("mma.sync.aligned.m16n8k16.row.col.satfinite.s32.s8.s8.s32 "
+                 "{%0,  %1,  %2,  %3},"
+                 "{%4,  %5},"
+                 "{%6},"
+                 "{%7,  %8,  %9,  %10};\n"
+                 : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+                 : "r"(a0), "r"(a1), "r"(b0),
+                   "r"(c0), "r"(c1), "r"(c2), "r"(c3));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x16_S32S8S8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// m16n8k32 TN MMA, D = A * B + C, with `.s8` A, `.s8` B and `.s32` C/D operands.
+struct SM80_16x8x32_S32S8S8S32_TN
+{
+  // Register arrays for the D, A, B and C operands of one fma() call.
+  using DRegisters = uint32_t[4];
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint32_t[2];
+  using CRegisters = uint32_t[4];
+
+  // Writes d0..d3 from a0..a3, b0..b1 and c0..c3 with one mma.sync; needs CUTE_ARCH_MMA_SM80_ENABLED.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t& d0, uint32_t& d1, uint32_t& d2, uint32_t& d3,
+      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint32_t const& b0, uint32_t const& b1,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if !defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32S8S8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#else
+    asm volatile("mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 "
+                 "{%0,  %1,  %2,  %3},"     // D
+                 "{%4,  %5,  %6,  %7},"     // A
+                 "{%8,  %9},"               // B
+                 "{%10, %11, %12, %13};\n"  // C
+                 : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+                 : "r"(a0), "r"(a1), "r"(a2), "r"(a3), "r"(b0), "r"(b1),
+                   "r"(c0), "r"(c1), "r"(c2), "r"(c3));
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// m16n8k32 TN MMA, D = A * B + C, with `.s8` A, `.s8` B and `.s32` C/D operands; `.satfinite` form.
+struct SM80_16x8x32_S32S8S8S32_TN_SATURATE
+{
+  using DRegisters = uint32_t[4];
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint32_t[2];
+  using CRegisters = uint32_t[4];
+
+  // Writes d0..d3 from a0..a3, b0..b1 and c0..c3 with one mma.sync; needs CUTE_ARCH_MMA_SM80_ENABLED.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint32_t const& b0, uint32_t const& b1,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    // PTX syntax puts the optional .satfinite between .row.col and the type list, not at the end.
+    asm volatile("mma.sync.aligned.m16n8k32.row.col.satfinite.s32.s8.s8.s32 "
+                 "{%0,  %1,  %2,  %3},"
+                 "{%4,  %5,  %6,  %7},"
+                 "{%8,  %9},"
+                 "{%10, %11, %12, %13};\n"
+                 : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+                 : "r"(a0), "r"(a1), "r"(a2), "r"(a3), "r"(b0), "r"(b1),
+                   "r"(c0), "r"(c1), "r"(c2), "r"(c3));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32S8S8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// m8n8k16 TN MMA, D = A * B + C, with `.s8` A, `.u8` B and `.s32` C/D operands.
+struct SM80_8x8x16_S32S8U8S32_TN
+{
+  // Register arrays for the D, A, B and C operands of one fma() call.
+  using DRegisters = uint32_t[2];
+  using ARegisters = uint32_t[1];
+  using BRegisters = uint32_t[1];
+  using CRegisters = uint32_t[2];
+
+  // Writes d0..d1 from a0, b0 and c0..c1 with one mma.sync; needs CUTE_ARCH_MMA_SM80_ENABLED.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t& d0, uint32_t& d1,
+      uint32_t const& a0,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1)
+  {
+#if !defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x16_S32S8U8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#else
+    asm volatile("mma.sync.aligned.m8n8k16.row.col.s32.s8.u8.s32 "
+                 "{%0, %1},"    // D
+                 "{%2},"        // A
+                 "{%3},"        // B
+                 "{%4, %5};\n"  // C
+                 : "=r"(d0), "=r"(d1)
+                 : "r"(a0), "r"(b0),
+                   "r"(c0), "r"(c1));
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// m8n8k16 TN MMA, D = A * B + C, with `.s8` A, `.u8` B and `.s32` C/D operands; `.satfinite` form.
+struct SM80_8x8x16_S32S8U8S32_TN_SATURATE
+{
+  using DRegisters = uint32_t[2];
+  using ARegisters = uint32_t[1];
+  using BRegisters = uint32_t[1];
+  using CRegisters = uint32_t[2];
+
+  // Writes d0..d1 from a0, b0 and c0..c1 with one mma.sync; needs CUTE_ARCH_MMA_SM80_ENABLED.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1,
+      uint32_t const& a0,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    // PTX syntax puts the optional .satfinite between .row.col and the type list, not at the end.
+    asm volatile("mma.sync.aligned.m8n8k16.row.col.satfinite.s32.s8.u8.s32 "
+                 "{%0, %1},"
+                 "{%2},"
+                 "{%3},"
+                 "{%4, %5};\n"
+                 : "=r"(d0), "=r"(d1)
+                 : "r"(a0), "r"(b0),
+                   "r"(c0), "r"(c1));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x16_S32S8U8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// m16n8k16 TN MMA, D = A * B + C, with `.s8` A, `.u8` B and `.s32` C/D operands.
+struct SM80_16x8x16_S32S8U8S32_TN
+{
+  // Register arrays for the D, A, B and C operands of one fma() call.
+  using DRegisters = uint32_t[4];
+  using ARegisters = uint32_t[2];
+  using BRegisters = uint32_t[1];
+  using CRegisters = uint32_t[4];
+
+  // Writes d0..d3 from a0..a1, b0 and c0..c3 with one mma.sync; needs CUTE_ARCH_MMA_SM80_ENABLED.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t& d0, uint32_t& d1, uint32_t& d2, uint32_t& d3,
+      uint32_t const& a0, uint32_t const& a1,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if !defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x16_S32S8U8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#else
+    asm volatile("mma.sync.aligned.m16n8k16.row.col.s32.s8.u8.s32 "
+                 "{%0,  %1,  %2,  %3},"     // D
+                 "{%4,  %5},"               // A
+                 "{%6},"                    // B
+                 "{%7,  %8,  %9,  %10};\n"  // C
+                 : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+                 : "r"(a0), "r"(a1), "r"(b0),
+                   "r"(c0), "r"(c1), "r"(c2), "r"(c3));
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// m16n8k16 TN MMA, D = A * B + C, with `.s8` A, `.u8` B and `.s32` C/D operands; `.satfinite` form.
+struct SM80_16x8x16_S32S8U8S32_TN_SATURATE
+{
+  using DRegisters = uint32_t[4];
+  using ARegisters = uint32_t[2];
+  using BRegisters = uint32_t[1];
+  using CRegisters = uint32_t[4];
+
+  // Writes d0..d3 from a0..a1, b0 and c0..c3 with one mma.sync; needs CUTE_ARCH_MMA_SM80_ENABLED.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& a0, uint32_t const& a1,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    // PTX syntax puts the optional .satfinite between .row.col and the type list, not at the end.
+    asm volatile("mma.sync.aligned.m16n8k16.row.col.satfinite.s32.s8.u8.s32 "
+                 "{%0,  %1,  %2,  %3},"
+                 "{%4,  %5},"
+                 "{%6},"
+                 "{%7,  %8,  %9,  %10};\n"
+                 : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+                 : "r"(a0), "r"(a1), "r"(b0),
+                   "r"(c0), "r"(c1), "r"(c2), "r"(c3));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x16_S32S8U8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// m16n8k32 TN MMA, D = A * B + C, with `.s8` A, `.u8` B and `.s32` C/D operands.
+struct SM80_16x8x32_S32S8U8S32_TN
+{
+  // Register arrays for the D, A, B and C operands of one fma() call.
+  using DRegisters = uint32_t[4];
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint32_t[2];
+  using CRegisters = uint32_t[4];
+
+  // Writes d0..d3 from a0..a3, b0..b1 and c0..c3 with one mma.sync; needs CUTE_ARCH_MMA_SM80_ENABLED.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t& d0, uint32_t& d1, uint32_t& d2, uint32_t& d3,
+      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint32_t const& b0, uint32_t const& b1,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if !defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32S8U8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#else
+    asm volatile("mma.sync.aligned.m16n8k32.row.col.s32.s8.u8.s32 "
+                 "{%0,  %1,  %2,  %3},"     // D
+                 "{%4,  %5,  %6,  %7},"     // A
+                 "{%8,  %9},"               // B
+                 "{%10, %11, %12, %13};\n"  // C
+                 : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+                 : "r"(a0), "r"(a1), "r"(a2), "r"(a3), "r"(b0), "r"(b1),
+                   "r"(c0), "r"(c1), "r"(c2), "r"(c3));
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// m16n8k32 TN MMA, D = A * B + C, with `.s8` A, `.u8` B and `.s32` C/D operands; `.satfinite` form.
+struct SM80_16x8x32_S32S8U8S32_TN_SATURATE
+{
+  using DRegisters = uint32_t[4];
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint32_t[2];
+  using CRegisters = uint32_t[4];
+
+  // Writes d0..d3 from a0..a3, b0..b1 and c0..c3 with one mma.sync; needs CUTE_ARCH_MMA_SM80_ENABLED.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint32_t const& b0, uint32_t const& b1,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    // PTX syntax puts the optional .satfinite between .row.col and the type list, not at the end.
+    asm volatile("mma.sync.aligned.m16n8k32.row.col.satfinite.s32.s8.u8.s32 "
+                 "{%0,  %1,  %2,  %3},"
+                 "{%4,  %5,  %6,  %7},"
+                 "{%8,  %9},"
+                 "{%10, %11, %12, %13};\n"
+                 : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+                 : "r"(a0), "r"(a1), "r"(a2), "r"(a3), "r"(b0), "r"(b1),
+                   "r"(c0), "r"(c1), "r"(c2), "r"(c3));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32S8U8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// m8n8k16 TN MMA, D = A * B + C, with `.u8` A, `.s8` B and `.s32` C/D operands.
+struct SM80_8x8x16_S32U8S8S32_TN
+{
+  // Register arrays for the D, A, B and C operands of one fma() call.
+  using DRegisters = uint32_t[2];
+  using ARegisters = uint32_t[1];
+  using BRegisters = uint32_t[1];
+  using CRegisters = uint32_t[2];
+
+  // Writes d0..d1 from a0, b0 and c0..c1 with one mma.sync; needs CUTE_ARCH_MMA_SM80_ENABLED.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t& d0, uint32_t& d1,
+      uint32_t const& a0,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1)
+  {
+#if !defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x16_S32U8S8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#else
+    asm volatile("mma.sync.aligned.m8n8k16.row.col.s32.u8.s8.s32 "
+                 "{%0, %1},"    // D
+                 "{%2},"        // A
+                 "{%3},"        // B
+                 "{%4, %5};\n"  // C
+                 : "=r"(d0), "=r"(d1)
+                 : "r"(a0), "r"(b0),
+                   "r"(c0), "r"(c1));
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// m8n8k16 TN MMA, D = A * B + C, with `.u8` A, `.s8` B and `.s32` C/D operands; `.satfinite` form.
+struct SM80_8x8x16_S32U8S8S32_TN_SATURATE
+{
+  using DRegisters = uint32_t[2];
+  using ARegisters = uint32_t[1];
+  using BRegisters = uint32_t[1];
+  using CRegisters = uint32_t[2];
+
+  // Writes d0..d1 from a0, b0 and c0..c1 with one mma.sync; needs CUTE_ARCH_MMA_SM80_ENABLED.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1,
+      uint32_t const& a0,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    // PTX syntax puts the optional .satfinite between .row.col and the type list, not at the end.
+    asm volatile("mma.sync.aligned.m8n8k16.row.col.satfinite.s32.u8.s8.s32 "
+                 "{%0, %1},"
+                 "{%2},"
+                 "{%3},"
+                 "{%4, %5};\n"
+                 : "=r"(d0), "=r"(d1)
+                 : "r"(a0), "r"(b0),
+                   "r"(c0), "r"(c1));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x16_S32U8S8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// m16n8k16 TN MMA, D = A * B + C, with `.u8` A, `.s8` B and `.s32` C/D operands.
+struct SM80_16x8x16_S32U8S8S32_TN
+{
+  // Register arrays for the D, A, B and C operands of one fma() call.
+  using DRegisters = uint32_t[4];
+  using ARegisters = uint32_t[2];
+  using BRegisters = uint32_t[1];
+  using CRegisters = uint32_t[4];
+
+  // Writes d0..d3 from a0..a1, b0 and c0..c3 with one mma.sync; needs CUTE_ARCH_MMA_SM80_ENABLED.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t& d0, uint32_t& d1, uint32_t& d2, uint32_t& d3,
+      uint32_t const& a0, uint32_t const& a1,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if !defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x16_S32U8S8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#else
+    asm volatile("mma.sync.aligned.m16n8k16.row.col.s32.u8.s8.s32 "
+                 "{%0,  %1,  %2,  %3},"     // D
+                 "{%4,  %5},"               // A
+                 "{%6},"                    // B
+                 "{%7,  %8,  %9,  %10};\n"  // C
+                 : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+                 : "r"(a0), "r"(a1), "r"(b0),
+                   "r"(c0), "r"(c1), "r"(c2), "r"(c3));
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// m16n8k16 TN MMA, D = A * B + C, with `.u8` A, `.s8` B and `.s32` C/D operands; `.satfinite` form.
+struct SM80_16x8x16_S32U8S8S32_TN_SATURATE
+{
+  using DRegisters = uint32_t[4];
+  using ARegisters = uint32_t[2];
+  using BRegisters = uint32_t[1];
+  using CRegisters = uint32_t[4];
+
+  // Writes d0..d3 from a0..a1, b0 and c0..c3 with one mma.sync; needs CUTE_ARCH_MMA_SM80_ENABLED.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& a0, uint32_t const& a1,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    // PTX syntax puts the optional .satfinite between .row.col and the type list, not at the end.
+    asm volatile("mma.sync.aligned.m16n8k16.row.col.satfinite.s32.u8.s8.s32 "
+                 "{%0,  %1,  %2,  %3},"
+                 "{%4,  %5},"
+                 "{%6},"
+                 "{%7,  %8,  %9,  %10};\n"
+                 : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+                 : "r"(a0), "r"(a1), "r"(b0),
+                   "r"(c0), "r"(c1), "r"(c2), "r"(c3));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x16_S32U8S8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// m16n8k32 TN MMA, D = A * B + C, with `.u8` A, `.s8` B and `.s32` C/D operands.
+struct SM80_16x8x32_S32U8S8S32_TN
+{
+  // Register arrays for the D, A, B and C operands of one fma() call.
+  using DRegisters = uint32_t[4];
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint32_t[2];
+  using CRegisters = uint32_t[4];
+
+  // Writes d0..d3 from a0..a3, b0..b1 and c0..c3 with one mma.sync; needs CUTE_ARCH_MMA_SM80_ENABLED.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t& d0, uint32_t& d1, uint32_t& d2, uint32_t& d3,
+      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint32_t const& b0, uint32_t const& b1,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if !defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32U8S8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#else
+    asm volatile("mma.sync.aligned.m16n8k32.row.col.s32.u8.s8.s32 "
+                 "{%0,  %1,  %2,  %3},"     // D
+                 "{%4,  %5,  %6,  %7},"     // A
+                 "{%8,  %9},"               // B
+                 "{%10, %11, %12, %13};\n"  // C
+                 : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+                 : "r"(a0), "r"(a1), "r"(a2), "r"(a3), "r"(b0), "r"(b1),
+                   "r"(c0), "r"(c1), "r"(c2), "r"(c3));
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// m16n8k32 TN MMA, D = A * B + C, with `.u8` A, `.s8` B and `.s32` C/D operands; `.satfinite` form.
+struct SM80_16x8x32_S32U8S8S32_TN_SATURATE
+{
+  using DRegisters = uint32_t[4];
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint32_t[2];
+  using CRegisters = uint32_t[4];
+
+  // Writes d0..d3 from a0..a3, b0..b1 and c0..c3 with one mma.sync; needs CUTE_ARCH_MMA_SM80_ENABLED.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint32_t const& b0, uint32_t const& b1,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    // PTX syntax puts the optional .satfinite between .row.col and the type list, not at the end.
+    asm volatile("mma.sync.aligned.m16n8k32.row.col.satfinite.s32.u8.s8.s32 "
+                 "{%0,  %1,  %2,  %3},"
+                 "{%4,  %5,  %6,  %7},"
+                 "{%8,  %9},"
+                 "{%10, %11, %12, %13};\n"
+                 : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+                 : "r"(a0), "r"(a1), "r"(a2), "r"(a3), "r"(b0), "r"(b1),
+                   "r"(c0), "r"(c1), "r"(c2), "r"(c3));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32U8S8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// m8n8k16 TN MMA, D = A * B + C, with `.u8` A, `.u8` B and `.s32` C/D operands.
+struct SM80_8x8x16_S32U8U8S32_TN
+{
+  // Register arrays for the D, A, B and C operands of one fma() call.
+  using DRegisters = uint32_t[2];
+  using ARegisters = uint32_t[1];
+  using BRegisters = uint32_t[1];
+  using CRegisters = uint32_t[2];
+
+  // Writes d0..d1 from a0, b0 and c0..c1 with one mma.sync; needs CUTE_ARCH_MMA_SM80_ENABLED.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t& d0, uint32_t& d1,
+      uint32_t const& a0,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1)
+  {
+#if !defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x16_S32U8U8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#else
+    asm volatile("mma.sync.aligned.m8n8k16.row.col.s32.u8.u8.s32 "
+                 "{%0, %1},"    // D
+                 "{%2},"        // A
+                 "{%3},"        // B
+                 "{%4, %5};\n"  // C
+                 : "=r"(d0), "=r"(d1)
+                 : "r"(a0), "r"(b0),
+                   "r"(c0), "r"(c1));
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// m8n8k16 TN MMA, D = A * B + C, with `.u8` A, `.u8` B and `.s32` C/D operands; `.satfinite` form.
+struct SM80_8x8x16_S32U8U8S32_TN_SATURATE
+{
+  using DRegisters = uint32_t[2];
+  using ARegisters = uint32_t[1];
+  using BRegisters = uint32_t[1];
+  using CRegisters = uint32_t[2];
+
+  // Writes d0..d1 from a0, b0 and c0..c1 with one mma.sync; needs CUTE_ARCH_MMA_SM80_ENABLED.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1,
+      uint32_t const& a0,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    // PTX syntax puts the optional .satfinite between .row.col and the type list, not at the end.
+    asm volatile("mma.sync.aligned.m8n8k16.row.col.satfinite.s32.u8.u8.s32 "
+                 "{%0, %1},"
+                 "{%2},"
+                 "{%3},"
+                 "{%4, %5};\n"
+                 : "=r"(d0), "=r"(d1)
+                 : "r"(a0), "r"(b0),
+                   "r"(c0), "r"(c1));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x16_S32U8U8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// m16n8k16 TN MMA, D = A * B + C, with `.u8` A, `.u8` B and `.s32` C/D operands.
+struct SM80_16x8x16_S32U8U8S32_TN
+{
+  // Register arrays for the D, A, B and C operands of one fma() call.
+  using DRegisters = uint32_t[4];
+  using ARegisters = uint32_t[2];
+  using BRegisters = uint32_t[1];
+  using CRegisters = uint32_t[4];
+
+  // Writes d0..d3 from a0..a1, b0 and c0..c3 with one mma.sync; needs CUTE_ARCH_MMA_SM80_ENABLED.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t& d0, uint32_t& d1, uint32_t& d2, uint32_t& d3,
+      uint32_t const& a0, uint32_t const& a1,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if !defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x16_S32U8U8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#else
+    asm volatile("mma.sync.aligned.m16n8k16.row.col.s32.u8.u8.s32 "
+                 "{%0,  %1,  %2,  %3},"     // D
+                 "{%4,  %5},"               // A
+                 "{%6},"                    // B
+                 "{%7,  %8,  %9,  %10};\n"  // C
+                 : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+                 : "r"(a0), "r"(a1), "r"(b0),
+                   "r"(c0), "r"(c1), "r"(c2), "r"(c3));
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// m16n8k16 TN MMA, D = A * B + C, with `.u8` A, `.u8` B and `.s32` C/D operands; `.satfinite` form.
+struct SM80_16x8x16_S32U8U8S32_TN_SATURATE
+{
+  using DRegisters = uint32_t[4];
+  using ARegisters = uint32_t[2];
+  using BRegisters = uint32_t[1];
+  using CRegisters = uint32_t[4];
+
+  // Writes d0..d3 from a0..a1, b0 and c0..c3 with one mma.sync; needs CUTE_ARCH_MMA_SM80_ENABLED.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& a0, uint32_t const& a1,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    // PTX syntax puts the optional .satfinite between .row.col and the type list, not at the end.
+    asm volatile("mma.sync.aligned.m16n8k16.row.col.satfinite.s32.u8.u8.s32 "
+                 "{%0,  %1,  %2,  %3},"
+                 "{%4,  %5},"
+                 "{%6},"
+                 "{%7,  %8,  %9,  %10};\n"
+                 : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+                 : "r"(a0), "r"(a1), "r"(b0),
+                   "r"(c0), "r"(c1), "r"(c2), "r"(c3));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x16_S32U8U8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// m16n8k32 TN MMA, D = A * B + C, with `.u8` A, `.u8` B and `.s32` C/D operands.
+struct SM80_16x8x32_S32U8U8S32_TN
+{
+  // Register arrays for the D, A, B and C operands of one fma() call.
+  using DRegisters = uint32_t[4];
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint32_t[2];
+  using CRegisters = uint32_t[4];
+
+  // Writes d0..d3 from a0..a3, b0..b1 and c0..c3 with one mma.sync; needs CUTE_ARCH_MMA_SM80_ENABLED.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t& d0, uint32_t& d1, uint32_t& d2, uint32_t& d3,
+      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint32_t const& b0, uint32_t const& b1,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if !defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32U8U8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#else
+    asm volatile("mma.sync.aligned.m16n8k32.row.col.s32.u8.u8.s32 "
+                 "{%0,  %1,  %2,  %3},"     // D
+                 "{%4,  %5,  %6,  %7},"     // A
+                 "{%8,  %9},"               // B
+                 "{%10, %11, %12, %13};\n"  // C
+                 : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+                 : "r"(a0), "r"(a1), "r"(a2), "r"(a3), "r"(b0), "r"(b1),
+                   "r"(c0), "r"(c1), "r"(c2), "r"(c3));
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x32 TN, saturating: wraps PTX mma.sync.aligned.m16n8k32.row.col with .satfinite; type qualifiers (D, A, B, C) = s32, u8, u8, s32.
+struct SM80_16x8x32_S32U8U8S32_TN_SATURATE
+{
+  using DRegisters = uint32_t[4];  // d0..d3: outputs ("=r" operands of the asm)
+  using ARegisters = uint32_t[4];  // a0..a3: A inputs
+  using BRegisters = uint32_t[2];  // b0..b1: B inputs
+  using CRegisters = uint32_t[4];  // c0..c3: C inputs
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint32_t const& b0, uint32_t const& b1,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m16n8k32.row.col.s32.u8.u8.s32.satfinite "
+      "{%0,  %1,  %2,  %3},"      // D: d0..d3
+      "{%4,  %5,  %6,  %7},"      // A: a0..a3
+      "{%8,  %9},"                // B: b0..b1
+      "{%10, %11, %12, %13};\n"   // C: c0..c3
+      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "r"(b0),  "r"(b1),
+         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
+#else  // CUTE_ARCH_MMA_SM80_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32U8U8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 8x8x32 TN: wraps PTX mma.sync.aligned.m8n8k32.row.col; type qualifiers (D, A, B, C) = s32, s4, s4, s32.
+struct SM80_8x8x32_S32S4S4S32_TN
+{
+  using DRegisters = uint32_t[2];  // d0..d1: outputs ("=r" operands of the asm)
+  using ARegisters = uint32_t[1];  // a0: A input
+  using BRegisters = uint32_t[1];  // b0: B input
+  using CRegisters = uint32_t[2];  // c0..c1: C inputs
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1,
+      uint32_t const& a0,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m8n8k32.row.col.s32.s4.s4.s32 "
+      "{%0, %1},"      // D: d0..d1
+      "{%2},"          // A: a0
+      "{%3},"          // B: b0
+      "{%4, %5};\n"    // C: c0..c1
+      : "=r"(d0), "=r"(d1)
+      :  "r"(a0),
+         "r"(b0),
+         "r"(c0),  "r"(c1));
+#else  // CUTE_ARCH_MMA_SM80_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x32_S32S4S4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 8x8x32 TN, saturating: wraps PTX mma.sync.aligned.m8n8k32.row.col with .satfinite; type qualifiers (D, A, B, C) = s32, s4, s4, s32.
+struct SM80_8x8x32_S32S4S4S32_TN_SATURATE
+{
+  using DRegisters = uint32_t[2];  // d0..d1: outputs ("=r" operands of the asm)
+  using ARegisters = uint32_t[1];  // a0: A input
+  using BRegisters = uint32_t[1];  // b0: B input
+  using CRegisters = uint32_t[2];  // c0..c1: C inputs
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1,
+      uint32_t const& a0,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m8n8k32.row.col.s32.s4.s4.s32.satfinite "
+      "{%0, %1},"      // D: d0..d1
+      "{%2},"          // A: a0
+      "{%3},"          // B: b0
+      "{%4, %5};\n"    // C: c0..c1
+      : "=r"(d0), "=r"(d1)
+      :  "r"(a0),
+         "r"(b0),
+         "r"(c0),  "r"(c1));
+#else  // CUTE_ARCH_MMA_SM80_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x32_S32S4S4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x32 TN: wraps PTX mma.sync.aligned.m16n8k32.row.col; type qualifiers (D, A, B, C) = s32, s4, s4, s32.
+struct SM80_16x8x32_S32S4S4S32_TN
+{
+  using DRegisters = uint32_t[4];  // d0..d3: outputs ("=r" operands of the asm)
+  using ARegisters = uint32_t[2];  // a0..a1: A inputs
+  using BRegisters = uint32_t[1];  // b0: B input
+  using CRegisters = uint32_t[4];  // c0..c3: C inputs
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& a0, uint32_t const& a1,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m16n8k32.row.col.s32.s4.s4.s32 "
+      "{%0,  %1,  %2,  %3},"      // D: d0..d3
+      "{%4,  %5},"                // A: a0..a1
+      "{%6},"                     // B: b0
+      "{%7,  %8,  %9,  %10};\n"   // C: c0..c3
+      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+      :  "r"(a0),  "r"(a1),
+         "r"(b0),
+         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
+#else  // CUTE_ARCH_MMA_SM80_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32S4S4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x32 TN, saturating: wraps PTX mma.sync.aligned.m16n8k32.row.col with .satfinite; type qualifiers (D, A, B, C) = s32, s4, s4, s32.
+struct SM80_16x8x32_S32S4S4S32_TN_SATURATE
+{
+  using DRegisters = uint32_t[4];  // d0..d3: outputs ("=r" operands of the asm)
+  using ARegisters = uint32_t[2];  // a0..a1: A inputs
+  using BRegisters = uint32_t[1];  // b0: B input
+  using CRegisters = uint32_t[4];  // c0..c3: C inputs
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& a0, uint32_t const& a1,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m16n8k32.row.col.s32.s4.s4.s32.satfinite "
+      "{%0,  %1,  %2,  %3},"      // D: d0..d3
+      "{%4,  %5},"                // A: a0..a1
+      "{%6},"                     // B: b0
+      "{%7,  %8,  %9,  %10};\n"   // C: c0..c3
+      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+      :  "r"(a0),  "r"(a1),
+         "r"(b0),
+         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
+#else  // CUTE_ARCH_MMA_SM80_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32S4S4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x64 TN: wraps PTX mma.sync.aligned.m16n8k64.row.col; type qualifiers (D, A, B, C) = s32, s4, s4, s32.
+struct SM80_16x8x64_S32S4S4S32_TN
+{
+  using DRegisters = uint32_t[4];  // d0..d3: outputs ("=r" operands of the asm)
+  using ARegisters = uint32_t[4];  // a0..a3: A inputs
+  using BRegisters = uint32_t[2];  // b0..b1: B inputs
+  using CRegisters = uint32_t[4];  // c0..c3: C inputs
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint32_t const& b0, uint32_t const& b1,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m16n8k64.row.col.s32.s4.s4.s32 "
+      "{%0,  %1,  %2,  %3},"      // D: d0..d3
+      "{%4,  %5,  %6,  %7},"      // A: a0..a3
+      "{%8,  %9},"                // B: b0..b1
+      "{%10, %11, %12, %13};\n"   // C: c0..c3
+      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "r"(b0),  "r"(b1),
+         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
+#else  // CUTE_ARCH_MMA_SM80_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x64_S32S4S4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x64 TN, saturating: wraps PTX mma.sync.aligned.m16n8k64.row.col with .satfinite; type qualifiers (D, A, B, C) = s32, s4, s4, s32.
+struct SM80_16x8x64_S32S4S4S32_TN_SATURATE
+{
+  using DRegisters = uint32_t[4];  // d0..d3: outputs ("=r" operands of the asm)
+  using ARegisters = uint32_t[4];  // a0..a3: A inputs
+  using BRegisters = uint32_t[2];  // b0..b1: B inputs
+  using CRegisters = uint32_t[4];  // c0..c3: C inputs
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint32_t const& b0, uint32_t const& b1,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m16n8k64.row.col.s32.s4.s4.s32.satfinite "
+      "{%0,  %1,  %2,  %3},"      // D: d0..d3
+      "{%4,  %5,  %6,  %7},"      // A: a0..a3
+      "{%8,  %9},"                // B: b0..b1
+      "{%10, %11, %12, %13};\n"   // C: c0..c3
+      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "r"(b0),  "r"(b1),
+         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
+#else  // CUTE_ARCH_MMA_SM80_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x64_S32S4S4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 8x8x32 TN: wraps PTX mma.sync.aligned.m8n8k32.row.col; type qualifiers (D, A, B, C) = s32, s4, u4, s32.
+struct SM80_8x8x32_S32S4U4S32_TN
+{
+  using DRegisters = uint32_t[2];  // d0..d1: outputs ("=r" operands of the asm)
+  using ARegisters = uint32_t[1];  // a0: A input
+  using BRegisters = uint32_t[1];  // b0: B input
+  using CRegisters = uint32_t[2];  // c0..c1: C inputs
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1,
+      uint32_t const& a0,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m8n8k32.row.col.s32.s4.u4.s32 "
+      "{%0, %1},"      // D: d0..d1
+      "{%2},"          // A: a0
+      "{%3},"          // B: b0
+      "{%4, %5};\n"    // C: c0..c1
+      : "=r"(d0), "=r"(d1)
+      :  "r"(a0),
+         "r"(b0),
+         "r"(c0),  "r"(c1));
+#else  // CUTE_ARCH_MMA_SM80_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x32_S32S4U4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 8x8x32 TN, saturating: wraps PTX mma.sync.aligned.m8n8k32.row.col with .satfinite; type qualifiers (D, A, B, C) = s32, s4, u4, s32.
+struct SM80_8x8x32_S32S4U4S32_TN_SATURATE
+{
+  using DRegisters = uint32_t[2];  // d0..d1: outputs ("=r" operands of the asm)
+  using ARegisters = uint32_t[1];  // a0: A input
+  using BRegisters = uint32_t[1];  // b0: B input
+  using CRegisters = uint32_t[2];  // c0..c1: C inputs
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1,
+      uint32_t const& a0,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m8n8k32.row.col.s32.s4.u4.s32.satfinite "
+      "{%0, %1},"      // D: d0..d1
+      "{%2},"          // A: a0
+      "{%3},"          // B: b0
+      "{%4, %5};\n"    // C: c0..c1
+      : "=r"(d0), "=r"(d1)
+      :  "r"(a0),
+         "r"(b0),
+         "r"(c0),  "r"(c1));
+#else  // CUTE_ARCH_MMA_SM80_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x32_S32S4U4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x32 TN: wraps PTX mma.sync.aligned.m16n8k32.row.col; type qualifiers (D, A, B, C) = s32, s4, u4, s32.
+struct SM80_16x8x32_S32S4U4S32_TN
+{
+  using DRegisters = uint32_t[4];  // d0..d3: outputs ("=r" operands of the asm)
+  using ARegisters = uint32_t[2];  // a0..a1: A inputs
+  using BRegisters = uint32_t[1];  // b0: B input
+  using CRegisters = uint32_t[4];  // c0..c3: C inputs
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& a0, uint32_t const& a1,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m16n8k32.row.col.s32.s4.u4.s32 "
+      "{%0,  %1,  %2,  %3},"      // D: d0..d3
+      "{%4,  %5},"                // A: a0..a1
+      "{%6},"                     // B: b0
+      "{%7,  %8,  %9,  %10};\n"   // C: c0..c3
+      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+      :  "r"(a0),  "r"(a1),
+         "r"(b0),
+         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
+#else  // CUTE_ARCH_MMA_SM80_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32S4U4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x32 TN, saturating: wraps PTX mma.sync.aligned.m16n8k32.row.col with .satfinite; type qualifiers (D, A, B, C) = s32, s4, u4, s32.
+struct SM80_16x8x32_S32S4U4S32_TN_SATURATE
+{
+  using DRegisters = uint32_t[4];  // d0..d3: outputs ("=r" operands of the asm)
+  using ARegisters = uint32_t[2];  // a0..a1: A inputs
+  using BRegisters = uint32_t[1];  // b0: B input
+  using CRegisters = uint32_t[4];  // c0..c3: C inputs
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& a0, uint32_t const& a1,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m16n8k32.row.col.s32.s4.u4.s32.satfinite "
+      "{%0,  %1,  %2,  %3},"      // D: d0..d3
+      "{%4,  %5},"                // A: a0..a1
+      "{%6},"                     // B: b0
+      "{%7,  %8,  %9,  %10};\n"   // C: c0..c3
+      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+      :  "r"(a0),  "r"(a1),
+         "r"(b0),
+         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
+#else  // CUTE_ARCH_MMA_SM80_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32S4U4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x64 TN: wraps PTX mma.sync.aligned.m16n8k64.row.col; type qualifiers (D, A, B, C) = s32, s4, u4, s32.
+struct SM80_16x8x64_S32S4U4S32_TN
+{
+  using DRegisters = uint32_t[4];  // d0..d3: outputs ("=r" operands of the asm)
+  using ARegisters = uint32_t[4];  // a0..a3: A inputs
+  using BRegisters = uint32_t[2];  // b0..b1: B inputs
+  using CRegisters = uint32_t[4];  // c0..c3: C inputs
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint32_t const& b0, uint32_t const& b1,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m16n8k64.row.col.s32.s4.u4.s32 "
+      "{%0,  %1,  %2,  %3},"      // D: d0..d3
+      "{%4,  %5,  %6,  %7},"      // A: a0..a3
+      "{%8,  %9},"                // B: b0..b1
+      "{%10, %11, %12, %13};\n"   // C: c0..c3
+      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "r"(b0),  "r"(b1),
+         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
+#else  // CUTE_ARCH_MMA_SM80_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x64_S32S4U4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x64 TN, saturating: wraps PTX mma.sync.aligned.m16n8k64.row.col with .satfinite; type qualifiers (D, A, B, C) = s32, s4, u4, s32.
+struct SM80_16x8x64_S32S4U4S32_TN_SATURATE
+{
+  using DRegisters = uint32_t[4];  // d0..d3: outputs ("=r" operands of the asm)
+  using ARegisters = uint32_t[4];  // a0..a3: A inputs
+  using BRegisters = uint32_t[2];  // b0..b1: B inputs
+  using CRegisters = uint32_t[4];  // c0..c3: C inputs
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint32_t const& b0, uint32_t const& b1,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m16n8k64.row.col.s32.s4.u4.s32.satfinite "
+      "{%0,  %1,  %2,  %3},"      // D: d0..d3
+      "{%4,  %5,  %6,  %7},"      // A: a0..a3
+      "{%8,  %9},"                // B: b0..b1
+      "{%10, %11, %12, %13};\n"   // C: c0..c3
+      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "r"(b0),  "r"(b1),
+         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
+#else  // CUTE_ARCH_MMA_SM80_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x64_S32S4U4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 8x8x32 TN: wraps PTX mma.sync.aligned.m8n8k32.row.col; type qualifiers (D, A, B, C) = s32, u4, s4, s32.
+struct SM80_8x8x32_S32U4S4S32_TN
+{
+  using DRegisters = uint32_t[2];  // d0..d1: outputs ("=r" operands of the asm)
+  using ARegisters = uint32_t[1];  // a0: A input
+  using BRegisters = uint32_t[1];  // b0: B input
+  using CRegisters = uint32_t[2];  // c0..c1: C inputs
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1,
+      uint32_t const& a0,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m8n8k32.row.col.s32.u4.s4.s32 "
+      "{%0, %1},"      // D: d0..d1
+      "{%2},"          // A: a0
+      "{%3},"          // B: b0
+      "{%4, %5};\n"    // C: c0..c1
+      : "=r"(d0), "=r"(d1)
+      :  "r"(a0),
+         "r"(b0),
+         "r"(c0),  "r"(c1));
+#else  // CUTE_ARCH_MMA_SM80_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x32_S32U4S4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 8x8x32 TN, saturating: wraps PTX mma.sync.aligned.m8n8k32.row.col with .satfinite; type qualifiers (D, A, B, C) = s32, u4, s4, s32.
+struct SM80_8x8x32_S32U4S4S32_TN_SATURATE
+{
+  using DRegisters = uint32_t[2];  // d0..d1: outputs ("=r" operands of the asm)
+  using ARegisters = uint32_t[1];  // a0: A input
+  using BRegisters = uint32_t[1];  // b0: B input
+  using CRegisters = uint32_t[2];  // c0..c1: C inputs
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1,
+      uint32_t const& a0,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m8n8k32.row.col.s32.u4.s4.s32.satfinite "
+      "{%0, %1},"      // D: d0..d1
+      "{%2},"          // A: a0
+      "{%3},"          // B: b0
+      "{%4, %5};\n"    // C: c0..c1
+      : "=r"(d0), "=r"(d1)
+      :  "r"(a0),
+         "r"(b0),
+         "r"(c0),  "r"(c1));
+#else  // CUTE_ARCH_MMA_SM80_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x32_S32U4S4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x32 TN: wraps PTX mma.sync.aligned.m16n8k32.row.col; type qualifiers (D, A, B, C) = s32, u4, s4, s32.
+struct SM80_16x8x32_S32U4S4S32_TN
+{
+  using DRegisters = uint32_t[4];  // d0..d3: outputs ("=r" operands of the asm)
+  using ARegisters = uint32_t[2];  // a0..a1: A inputs
+  using BRegisters = uint32_t[1];  // b0: B input
+  using CRegisters = uint32_t[4];  // c0..c3: C inputs
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& a0, uint32_t const& a1,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m16n8k32.row.col.s32.u4.s4.s32 "
+      "{%0,  %1,  %2,  %3},"      // D: d0..d3
+      "{%4,  %5},"                // A: a0..a1
+      "{%6},"                     // B: b0
+      "{%7,  %8,  %9,  %10};\n"   // C: c0..c3
+      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+      :  "r"(a0),  "r"(a1),
+         "r"(b0),
+         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
+#else  // CUTE_ARCH_MMA_SM80_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32U4S4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x32 TN, saturating: wraps PTX mma.sync.aligned.m16n8k32.row.col with .satfinite; type qualifiers (D, A, B, C) = s32, u4, s4, s32.
+struct SM80_16x8x32_S32U4S4S32_TN_SATURATE
+{
+  using DRegisters = uint32_t[4];  // d0..d3: outputs ("=r" operands of the asm)
+  using ARegisters = uint32_t[2];  // a0..a1: A inputs
+  using BRegisters = uint32_t[1];  // b0: B input
+  using CRegisters = uint32_t[4];  // c0..c3: C inputs
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& a0, uint32_t const& a1,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m16n8k32.row.col.s32.u4.s4.s32.satfinite "
+      "{%0,  %1,  %2,  %3},"      // D: d0..d3
+      "{%4,  %5},"                // A: a0..a1
+      "{%6},"                     // B: b0
+      "{%7,  %8,  %9,  %10};\n"   // C: c0..c3
+      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+      :  "r"(a0),  "r"(a1),
+         "r"(b0),
+         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
+#else  // CUTE_ARCH_MMA_SM80_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32U4S4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x64 TN: wraps PTX mma.sync.aligned.m16n8k64.row.col; type qualifiers (D, A, B, C) = s32, u4, s4, s32.
+struct SM80_16x8x64_S32U4S4S32_TN
+{
+  using DRegisters = uint32_t[4];  // d0..d3: outputs ("=r" operands of the asm)
+  using ARegisters = uint32_t[4];  // a0..a3: A inputs
+  using BRegisters = uint32_t[2];  // b0..b1: B inputs
+  using CRegisters = uint32_t[4];  // c0..c3: C inputs
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint32_t const& b0, uint32_t const& b1,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m16n8k64.row.col.s32.u4.s4.s32 "
+      "{%0,  %1,  %2,  %3},"      // D: d0..d3
+      "{%4,  %5,  %6,  %7},"      // A: a0..a3
+      "{%8,  %9},"                // B: b0..b1
+      "{%10, %11, %12, %13};\n"   // C: c0..c3
+      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "r"(b0),  "r"(b1),
+         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
+#else  // CUTE_ARCH_MMA_SM80_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x64_S32U4S4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x64 TN, saturating: wraps PTX mma.sync.aligned.m16n8k64.row.col with .satfinite; type qualifiers (D, A, B, C) = s32, u4, s4, s32.
+struct SM80_16x8x64_S32U4S4S32_TN_SATURATE
+{
+  using DRegisters = uint32_t[4];  // d0..d3: outputs ("=r" operands of the asm)
+  using ARegisters = uint32_t[4];  // a0..a3: A inputs
+  using BRegisters = uint32_t[2];  // b0..b1: B inputs
+  using CRegisters = uint32_t[4];  // c0..c3: C inputs
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint32_t const& b0, uint32_t const& b1,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m16n8k64.row.col.s32.u4.s4.s32.satfinite "
+      "{%0,  %1,  %2,  %3},"      // D: d0..d3
+      "{%4,  %5,  %6,  %7},"      // A: a0..a3
+      "{%8,  %9},"                // B: b0..b1
+      "{%10, %11, %12, %13};\n"   // C: c0..c3
+      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "r"(b0),  "r"(b1),
+         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
+#else  // CUTE_ARCH_MMA_SM80_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x64_S32U4S4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 8x8x32 TN: wraps PTX mma.sync.aligned.m8n8k32.row.col; type qualifiers (D, A, B, C) = s32, u4, u4, s32.
+struct SM80_8x8x32_S32U4U4S32_TN
+{
+  using DRegisters = uint32_t[2];  // d0..d1: outputs ("=r" operands of the asm)
+  using ARegisters = uint32_t[1];  // a0: A input
+  using BRegisters = uint32_t[1];  // b0: B input
+  using CRegisters = uint32_t[2];  // c0..c1: C inputs
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1,
+      uint32_t const& a0,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m8n8k32.row.col.s32.u4.u4.s32 "
+      "{%0, %1},"      // D: d0..d1
+      "{%2},"          // A: a0
+      "{%3},"          // B: b0
+      "{%4, %5};\n"    // C: c0..c1
+      : "=r"(d0), "=r"(d1)
+      :  "r"(a0),
+         "r"(b0),
+         "r"(c0),  "r"(c1));
+#else  // CUTE_ARCH_MMA_SM80_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x32_S32U4U4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 8x8x32 TN, saturating: wraps PTX mma.sync.aligned.m8n8k32.row.col with .satfinite; type qualifiers (D, A, B, C) = s32, u4, u4, s32.
+struct SM80_8x8x32_S32U4U4S32_TN_SATURATE
+{
+  using DRegisters = uint32_t[2];  // d0..d1: outputs ("=r" operands of the asm)
+  using ARegisters = uint32_t[1];  // a0: A input
+  using BRegisters = uint32_t[1];  // b0: B input
+  using CRegisters = uint32_t[2];  // c0..c1: C inputs
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1,
+      uint32_t const& a0,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m8n8k32.row.col.s32.u4.u4.s32.satfinite "
+      "{%0, %1},"      // D: d0..d1
+      "{%2},"          // A: a0
+      "{%3},"          // B: b0
+      "{%4, %5};\n"    // C: c0..c1
+      : "=r"(d0), "=r"(d1)
+      :  "r"(a0),
+         "r"(b0),
+         "r"(c0),  "r"(c1));
+#else  // CUTE_ARCH_MMA_SM80_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x32_S32U4U4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x32 TN: wraps PTX mma.sync.aligned.m16n8k32.row.col; type qualifiers (D, A, B, C) = s32, u4, u4, s32.
+struct SM80_16x8x32_S32U4U4S32_TN
+{
+  using DRegisters = uint32_t[4];  // d0..d3: outputs ("=r" operands of the asm)
+  using ARegisters = uint32_t[2];  // a0..a1: A inputs
+  using BRegisters = uint32_t[1];  // b0: B input
+  using CRegisters = uint32_t[4];  // c0..c3: C inputs
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& a0, uint32_t const& a1,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m16n8k32.row.col.s32.u4.u4.s32 "
+      "{%0,  %1,  %2,  %3},"      // D: d0..d3
+      "{%4,  %5},"                // A: a0..a1
+      "{%6},"                     // B: b0
+      "{%7,  %8,  %9,  %10};\n"   // C: c0..c3
+      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+      :  "r"(a0),  "r"(a1),
+         "r"(b0),
+         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
+#else  // CUTE_ARCH_MMA_SM80_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32U4U4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x32 TN, saturating: wraps PTX mma.sync.aligned.m16n8k32.row.col with .satfinite; type qualifiers (D, A, B, C) = s32, u4, u4, s32.
+struct SM80_16x8x32_S32U4U4S32_TN_SATURATE
+{
+  using DRegisters = uint32_t[4];  // d0..d3: outputs ("=r" operands of the asm)
+  using ARegisters = uint32_t[2];  // a0..a1: A inputs
+  using BRegisters = uint32_t[1];  // b0: B input
+  using CRegisters = uint32_t[4];  // c0..c3: C inputs
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& a0, uint32_t const& a1,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m16n8k32.row.col.s32.u4.u4.s32.satfinite "
+      "{%0,  %1,  %2,  %3},"      // D: d0..d3
+      "{%4,  %5},"                // A: a0..a1
+      "{%6},"                     // B: b0
+      "{%7,  %8,  %9,  %10};\n"   // C: c0..c3
+      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+      :  "r"(a0),  "r"(a1),
+         "r"(b0),
+         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
+#else  // CUTE_ARCH_MMA_SM80_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32U4U4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x64 TN: wraps PTX mma.sync.aligned.m16n8k64.row.col; type qualifiers (D, A, B, C) = s32, u4, u4, s32.
+struct SM80_16x8x64_S32U4U4S32_TN
+{
+  using DRegisters = uint32_t[4];  // d0..d3: outputs ("=r" operands of the asm)
+  using ARegisters = uint32_t[4];  // a0..a3: A inputs
+  using BRegisters = uint32_t[2];  // b0..b1: B inputs
+  using CRegisters = uint32_t[4];  // c0..c3: C inputs
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint32_t const& b0, uint32_t const& b1,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m16n8k64.row.col.s32.u4.u4.s32 "
+      "{%0,  %1,  %2,  %3},"      // D: d0..d3
+      "{%4,  %5,  %6,  %7},"      // A: a0..a3
+      "{%8,  %9},"                // B: b0..b1
+      "{%10, %11, %12, %13};\n"   // C: c0..c3
+      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "r"(b0),  "r"(b1),
+         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
+#else  // CUTE_ARCH_MMA_SM80_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x64_S32U4U4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x64 TN, saturating: wraps PTX mma.sync.aligned.m16n8k64.row.col with .satfinite; type qualifiers (D, A, B, C) = s32, u4, u4, s32.
+struct SM80_16x8x64_S32U4U4S32_TN_SATURATE
+{
+  using DRegisters = uint32_t[4];  // d0..d3: outputs ("=r" operands of the asm)
+  using ARegisters = uint32_t[4];  // a0..a3: A inputs
+  using BRegisters = uint32_t[2];  // b0..b1: B inputs
+  using CRegisters = uint32_t[4];  // c0..c3: C inputs
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint32_t const& b0, uint32_t const& b1,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m16n8k64.row.col.s32.u4.u4.s32.satfinite "
+      "{%0,  %1,  %2,  %3},"      // D: d0..d3
+      "{%4,  %5,  %6,  %7},"      // A: a0..a3
+      "{%8,  %9},"                // B: b0..b1
+      "{%10, %11, %12, %13};\n"   // C: c0..c3
+      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "r"(b0),  "r"(b1),
+         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
+#else  // CUTE_ARCH_MMA_SM80_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x64_S32U4U4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 8x8x128 TN, XOR+POPC: wraps PTX mma.sync.aligned.m8n8k128.row.col with .xor.popc; type qualifiers (D, A, B, C) = s32, b1, b1, s32.
+struct SM80_8x8x128_S32U1U1S32_TN_XORPOPC
+{
+  using DRegisters = uint32_t[2];  // d0..d1: outputs ("=r" operands of the asm)
+  using ARegisters = uint32_t[1];  // a0: A input
+  using BRegisters = uint32_t[1];  // b0: B input
+  using CRegisters = uint32_t[2];  // c0..c1: C inputs
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1,
+      uint32_t const& a0,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1)
+  {
+#if defined(CUTE_ARCH_MMA_B1_XOR_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m8n8k128.row.col.s32.b1.b1.s32.xor.popc "
+      "{%0, %1},"      // D: d0..d1
+      "{%2},"          // A: a0
+      "{%3},"          // B: b0
+      "{%4, %5};\n"    // C: c0..c1
+      : "=r"(d0), "=r"(d1)
+      :  "r"(a0),
+         "r"(b0),
+         "r"(c0),  "r"(c1));
+#else  // CUTE_ARCH_MMA_B1_XOR_SM80_ENABLED is not defined; the message names the macro that actually guards this path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x128_S32U1U1S32_TN_XORPOPC without CUTE_ARCH_MMA_B1_XOR_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x128 TN, XOR+POPC: wraps PTX mma.sync.aligned.m16n8k128.row.col with .xor.popc; type qualifiers (D, A, B, C) = s32, b1, b1, s32.
+struct SM80_16x8x128_S32U1U1S32_TN_XORPOPC
+{
+  using DRegisters = uint32_t[4];  // d0..d3: outputs ("=r" operands of the asm)
+  using ARegisters = uint32_t[2];  // a0..a1: A inputs
+  using BRegisters = uint32_t[1];  // b0: B input
+  using CRegisters = uint32_t[4];  // c0..c3: C inputs
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& a0, uint32_t const& a1,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_B1_XOR_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m16n8k128.row.col.s32.b1.b1.s32.xor.popc "
+      "{%0,  %1,  %2,  %3},"      // D: d0..d3
+      "{%4,  %5},"                // A: a0..a1
+      "{%6},"                     // B: b0
+      "{%7,  %8,  %9,  %10};\n"   // C: c0..c3
+      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+      :  "r"(a0),  "r"(a1),
+         "r"(b0),
+         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
+#else  // CUTE_ARCH_MMA_B1_XOR_SM80_ENABLED is not defined; the message names the macro that actually guards this path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x128_S32U1U1S32_TN_XORPOPC without CUTE_ARCH_MMA_B1_XOR_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x256 TN, XOR+POPC: wraps PTX mma.sync.aligned.m16n8k256.row.col with .xor.popc; type qualifiers (D, A, B, C) = s32, b1, b1, s32.
+struct SM80_16x8x256_S32U1U1S32_TN_XORPOPC
+{
+  using DRegisters = uint32_t[4];  // d0..d3: outputs ("=r" operands of the asm)
+  using ARegisters = uint32_t[4];  // a0..a3: A inputs
+  using BRegisters = uint32_t[2];  // b0..b1: B inputs
+  using CRegisters = uint32_t[4];  // c0..c3: C inputs
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint32_t const& b0, uint32_t const& b1,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_B1_XOR_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m16n8k256.row.col.s32.b1.b1.s32.xor.popc "
+      "{%0,  %1,  %2,  %3},"      // D: d0..d3
+      "{%4,  %5,  %6,  %7},"      // A: a0..a3
+      "{%8,  %9},"                // B: b0..b1
+      "{%10, %11, %12, %13};\n"   // C: c0..c3
+      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "r"(b0),  "r"(b1),
+         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
+#else  // CUTE_ARCH_MMA_B1_XOR_SM80_ENABLED is not defined; the message names the macro that actually guards this path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x256_S32U1U1S32_TN_XORPOPC without CUTE_ARCH_MMA_B1_XOR_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 8x8x128 TN, AND+POPC: wraps PTX mma.sync.aligned.m8n8k128.row.col with .and.popc; type qualifiers (D, A, B, C) = s32, b1, b1, s32.
+struct SM80_8x8x128_S32U1U1S32_TN_ANDPOPC
+{
+  using DRegisters = uint32_t[2];  // d0..d1: outputs ("=r" operands of the asm)
+  using ARegisters = uint32_t[1];  // a0: A input
+  using BRegisters = uint32_t[1];  // b0: B input
+  using CRegisters = uint32_t[2];  // c0..c1: C inputs
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1,
+      uint32_t const& a0,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1)
+  {
+#if defined(CUTE_ARCH_MMA_B1_AND_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m8n8k128.row.col.s32.b1.b1.s32.and.popc "
+      "{%0, %1},"      // D: d0..d1
+      "{%2},"          // A: a0
+      "{%3},"          // B: b0
+      "{%4, %5};\n"    // C: c0..c1
+      : "=r"(d0), "=r"(d1)
+      :  "r"(a0),
+         "r"(b0),
+         "r"(c0),  "r"(c1));
+#else  // CUTE_ARCH_MMA_B1_AND_SM80_ENABLED is not defined; the message names the macro that actually guards this path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x128_S32U1U1S32_TN_ANDPOPC without CUTE_ARCH_MMA_B1_AND_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x128 TN, AND+POPC: wraps PTX mma.sync.aligned.m16n8k128.row.col with .and.popc; type qualifiers (D, A, B, C) = s32, b1, b1, s32.
+struct SM80_16x8x128_S32U1U1S32_TN_ANDPOPC
+{
+  using DRegisters = uint32_t[4];  // d0..d3: outputs ("=r" operands of the asm)
+  using ARegisters = uint32_t[2];  // a0..a1: A inputs
+  using BRegisters = uint32_t[1];  // b0: B input
+  using CRegisters = uint32_t[4];  // c0..c3: C inputs
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& a0, uint32_t const& a1,
+      uint32_t const& b0,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_B1_AND_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m16n8k128.row.col.s32.b1.b1.s32.and.popc "
+      "{%0,  %1,  %2,  %3},"      // D: d0..d3
+      "{%4,  %5},"                // A: a0..a1
+      "{%6},"                     // B: b0
+      "{%7,  %8,  %9,  %10};\n"   // C: c0..c3
+      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+      :  "r"(a0),  "r"(a1),
+         "r"(b0),
+         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
+#else  // CUTE_ARCH_MMA_B1_AND_SM80_ENABLED is not defined; the message names the macro that actually guards this path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x128_S32U1U1S32_TN_ANDPOPC without CUTE_ARCH_MMA_B1_AND_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x256 TN, AND+POPC: wraps PTX mma.sync.aligned.m16n8k256.row.col with .and.popc; type qualifiers (D, A, B, C) = s32, b1, b1, s32.
+struct SM80_16x8x256_S32U1U1S32_TN_ANDPOPC
+{
+  using DRegisters = uint32_t[4];  // d0..d3: outputs ("=r" operands of the asm)
+  using ARegisters = uint32_t[4];  // a0..a3: A inputs
+  using BRegisters = uint32_t[2];  // b0..b1: B inputs
+  using CRegisters = uint32_t[4];  // c0..c3: C inputs
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint32_t const& b0, uint32_t const& b1,
+      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_B1_AND_SM80_ENABLED)
+    asm volatile(
+      "mma.sync.aligned.m16n8k256.row.col.s32.b1.b1.s32.and.popc "
+      "{%0,  %1,  %2,  %3},"      // D: d0..d3
+      "{%4,  %5,  %6,  %7},"      // A: a0..a3
+      "{%8,  %9},"                // B: b0..b1
+      "{%10, %11, %12, %13};\n"   // C: c0..c3
+      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "r"(b0),  "r"(b1),
+         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
+#else  // CUTE_ARCH_MMA_B1_AND_SM80_ENABLED is not defined; the message names the macro that actually guards this path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x256_S32U1U1S32_TN_ANDPOPC without CUTE_ARCH_MMA_B1_AND_SM80_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/arch/mma_sm90.hpp b/lightllm-kernel/cutlass/include/cute/arch/mma_sm90.hpp
new file mode 100755
index 000000000..51d34563c
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/arch/mma_sm90.hpp
@@ -0,0 +1,9331 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include <cute/config.hpp>
+#include <cute/arch/mma.hpp>
+
+// Config: enable the SM90 MMA code paths only when compiling device code for __CUDA_ARCH__ >= 900.
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+#    define CUTE_ARCH_MMA_SM90_ENABLED
+#    define CUTE_ARCH_MMA_F64_SM90_ENABLED
+#endif  // both macros are defined together; the F64 one guards the double-precision mma.sync wrappers in this file
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cute {
+
+namespace SM90 {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x4 TN: double-precision multiply-accumulate issued as one PTX mma.sync (m16n8k4, A row / B col).
+struct MMA_16x8x4_F64F64F64F64_TN
+{
+  using DRegisters = double[4];  // d0..d3: results written by fma()
+  using ARegisters = double[2];  // a0..a1: A operand values
+  using BRegisters = double[1];  // b0: B operand value
+  using CRegisters = double[4];  // c0..c3: input accumulators
+
+  CUTE_HOST_DEVICE static void  // fma(): d* are outputs; a*, b*, c* are read-only inputs
+  fma(double      & d0, double      & d1, double      & d2, double      & d3,
+      double const& a0, double const& a1,
+      double const& b0,
+      double const& c0, double const& c1, double const& c2, double const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_F64_SM90_ENABLED)  // defined by this file's Config block when __CUDA_ARCH__ >= 900
+    asm volatile(
+      "mma.sync.aligned.m16n8k4.row.col.f64.f64.f64.f64"  // opcode; adjacent literals concatenate with the operand list below
+      "{%0,  %1,  %2,  %3},"  // D: outputs d0..d3
+      "{%4,  %5},"  // A: a0..a1
+      "{%6},"  // B: b0
+      "{%7,  %8,  %9,  %10};\n"  // C: c0..c3
+      : "=d"(d0), "=d"(d1), "=d"(d2), "=d"(d3)  // "=d": write-only 64-bit floating-point register operands
+      :  "d"(a0),  "d"(a1),  // "d": 64-bit floating-point register inputs
+         "d"(b0),
+         "d"(c0),  "d"(c1),  "d"(c2),  "d"(c3));
+#else  // NOTE(review): the message names CUTE_ARCH_MMA_SM90_ENABLED while the guard tests CUTE_ARCH_MMA_F64_SM90_ENABLED
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_16x8x4_F64F64F64F64_TN without CUTE_ARCH_MMA_SM90_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x8 TN: double-precision multiply-accumulate issued as one PTX mma.sync (m16n8k8, A row / B col).
+struct MMA_16x8x8_F64F64F64F64_TN
+{
+  using DRegisters = double[4];  // d0..d3: results written by fma()
+  using ARegisters = double[4];  // a0..a3: A operand values
+  using BRegisters = double[2];  // b0..b1: B operand values
+  using CRegisters = double[4];  // c0..c3: input accumulators
+
+  CUTE_HOST_DEVICE static void  // fma(): d* are outputs; a*, b*, c* are read-only inputs
+  fma(double      & d0, double      & d1, double      & d2, double      & d3,
+      double const& a0, double const& a1, double const& a2, double const& a3,
+      double const& b0, double const& b1,
+      double const& c0, double const& c1, double const& c2, double const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_F64_SM90_ENABLED)  // defined by this file's Config block when __CUDA_ARCH__ >= 900
+    asm volatile(
+      "mma.sync.aligned.m16n8k8.row.col.f64.f64.f64.f64"  // opcode; adjacent literals concatenate with the operand list below
+      "{%0,  %1,  %2,  %3},"  // D: outputs d0..d3
+      "{%4,  %5,  %6,  %7},"  // A: a0..a3
+      "{%8,  %9},"  // B: b0..b1
+      "{%10, %11, %12, %13};\n"  // C: c0..c3
+      : "=d"(d0), "=d"(d1), "=d"(d2), "=d"(d3)  // "=d": write-only 64-bit floating-point register operands
+      :  "d"(a0),  "d"(a1),  "d"(a2),  "d"(a3),  // "d": 64-bit floating-point register inputs
+         "d"(b0),  "d"(b1),
+         "d"(c0),  "d"(c1),  "d"(c2),  "d"(c3));
+#else  // NOTE(review): the message names CUTE_ARCH_MMA_SM90_ENABLED while the guard tests CUTE_ARCH_MMA_F64_SM90_ENABLED
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_16x8x8_F64F64F64F64_TN without CUTE_ARCH_MMA_SM90_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x16 TN: double-precision multiply-accumulate issued as one PTX mma.sync (m16n8k16, A row / B col).
+struct MMA_16x8x16_F64F64F64F64_TN
+{
+  using DRegisters = double[4];  // d0..d3: results written by fma()
+  using ARegisters = double[8];  // a0..a7: A operand values
+  using BRegisters = double[4];  // b0..b3: B operand values
+  using CRegisters = double[4];  // c0..c3: input accumulators
+
+  CUTE_HOST_DEVICE static void  // fma(): d* are outputs; a*, b*, c* are read-only inputs
+  fma(double      & d0, double      & d1, double      & d2, double      & d3,
+      double const& a0, double const& a1, double const& a2, double const& a3,
+      double const& a4, double const& a5, double const& a6, double const& a7,
+      double const& b0, double const& b1, double const& b2, double const& b3,
+      double const& c0, double const& c1, double const& c2, double const& c3)
+  {
+#if defined(CUTE_ARCH_MMA_F64_SM90_ENABLED)  // defined by this file's Config block when __CUDA_ARCH__ >= 900
+    asm volatile(
+      "mma.sync.aligned.m16n8k16.row.col.f64.f64.f64.f64"  // opcode; adjacent literals concatenate with the operand list below
+      "{%0,  %1,  %2,  %3},"  // D: outputs d0..d3
+      "{%4,  %5,  %6,  %7,  %8,  %9,  %10, %11},"  // A: a0..a7
+      "{%12, %13, %14, %15},"  // B: b0..b3
+      "{%16, %17, %18, %19};\n"  // C: c0..c3
+      : "=d"(d0), "=d"(d1), "=d"(d2), "=d"(d3)  // "=d": write-only 64-bit floating-point register operands
+      :  "d"(a0),  "d"(a1),  "d"(a2),  "d"(a3),  // "d": 64-bit floating-point register inputs
+         "d"(a4),  "d"(a5),  "d"(a6),  "d"(a7),
+         "d"(b0),  "d"(b1),  "d"(b2),  "d"(b3),
+         "d"(c0),  "d"(c1),  "d"(c2),  "d"(c3));
+#else  // NOTE(review): the message names CUTE_ARCH_MMA_SM90_ENABLED while the guard tests CUTE_ARCH_MMA_F64_SM90_ENABLED
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_16x8x16_F64F64F64F64_TN without CUTE_ARCH_MMA_SM90_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x4 TN: complex<double> multiply-accumulate (d = a * b + c) built from four real MMA_16x8x4_F64F64F64F64_TN calls.
+struct MMA_16x8x4_C64C64C64C64_TN
+{
+  using DRegisters = complex<double>[4];  // d0..d3: complex results written by fma()
+  using ARegisters = complex<double>[2];  // a0..a1: complex A operand values
+  using BRegisters = complex<double>[1];  // b0: complex B operand value
+  using CRegisters = complex<double>[4];  // c0..c3: complex input accumulators
+
+  CUTE_HOST_DEVICE static void  // fma(): d* are outputs; a*, b*, c* are read-only inputs
+  fma(complex<double>      & d0, complex<double>      & d1,
+      complex<double>      & d2, complex<double>      & d3,
+      complex<double> const& a0, complex<double> const& a1,
+      complex<double> const& b0,
+      complex<double> const& c0, complex<double> const& c1,
+      complex<double> const& c2, complex<double> const& c3)
+  {
+    // Alias each output's parts as plain doubles (rdN = element [0], idN = element [1]) because thrust::complex does not provide a mutable ref. NOTE(review): assumes {real, imag} storage order - confirm for cute's complex.
+    double& rd0 = reinterpret_cast<double(&)[2]>(d0)[0];
+    double& id0 = reinterpret_cast<double(&)[2]>(d0)[1];
+    double& rd1 = reinterpret_cast<double(&)[2]>(d1)[0];
+    double& id1 = reinterpret_cast<double(&)[2]>(d1)[1];
+    double& rd2 = reinterpret_cast<double(&)[2]>(d2)[0];
+    double& id2 = reinterpret_cast<double(&)[2]>(d2)[1];
+    double& rd3 = reinterpret_cast<double(&)[2]>(d3)[0];
+    double& id3 = reinterpret_cast<double(&)[2]>(d3)[1];
+
+    // Step 1 of 4: d.real() =  a.real() * b.real() + c.real();
+    MMA_16x8x4_F64F64F64F64_TN::fma(
+      rd0, rd1, rd2, rd3,
+      a0.real(), a1.real(),
+      b0.real(),
+      c0.real(), c1.real(), c2.real(), c3.real());
+
+    // Step 2 of 4: d.imag() =  a.imag() * b.real() + c.imag();
+    MMA_16x8x4_F64F64F64F64_TN::fma(
+      id0, id1, id2, id3,
+      a0.imag(), a1.imag(),
+      b0.real(),
+      c0.imag(), c1.imag(), c2.imag(), c3.imag());
+
+    // Step 3 of 4 (accumulates onto the step-1 result, so it must follow it): d.real() = -a.imag() * b.imag() + d.real();
+    MMA_16x8x4_F64F64F64F64_TN::fma(
+      rd0, rd1, rd2, rd3,
+      -a0.imag(), -a1.imag(),  // negated temporaries bind to the const& parameters
+      b0.imag(),
+      d0.real(), d1.real(), d2.real(), d3.real());
+
+    // Step 4 of 4 (accumulates onto the step-2 result, so it must follow it): d.imag() =  a.real() * b.imag() + d.imag();
+    MMA_16x8x4_F64F64F64F64_TN::fma(
+      id0, id1, id2, id3,
+      a0.real(), a1.real(),
+      b0.imag(),
+      d0.imag(), d1.imag(), d2.imag(), d3.imag());
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x8 TN: complex<double> multiply-accumulate (d = a * b + c) built from four real MMA_16x8x8_F64F64F64F64_TN calls.
+struct MMA_16x8x8_C64C64C64C64_TN
+{
+  using DRegisters = complex<double>[4];  // d0..d3: complex results written by fma()
+  using ARegisters = complex<double>[4];  // a0..a3: complex A operand values
+  using BRegisters = complex<double>[2];  // b0..b1: complex B operand values
+  using CRegisters = complex<double>[4];  // c0..c3: complex input accumulators
+
+  CUTE_HOST_DEVICE static void  // fma(): d* are outputs; a*, b*, c* are read-only inputs
+  fma(complex<double>      & d0, complex<double>      & d1,
+      complex<double>      & d2, complex<double>      & d3,
+      complex<double> const& a0, complex<double> const& a1,
+      complex<double> const& a2, complex<double> const& a3,
+      complex<double> const& b0, complex<double> const& b1,
+      complex<double> const& c0, complex<double> const& c1,
+      complex<double> const& c2, complex<double> const& c3)
+  {
+    // Alias each output's parts as plain doubles (rdN = element [0], idN = element [1]) because thrust::complex does not provide a mutable ref. NOTE(review): assumes {real, imag} storage order - confirm for cute's complex.
+    double& rd0 = reinterpret_cast<double(&)[2]>(d0)[0];
+    double& id0 = reinterpret_cast<double(&)[2]>(d0)[1];
+    double& rd1 = reinterpret_cast<double(&)[2]>(d1)[0];
+    double& id1 = reinterpret_cast<double(&)[2]>(d1)[1];
+    double& rd2 = reinterpret_cast<double(&)[2]>(d2)[0];
+    double& id2 = reinterpret_cast<double(&)[2]>(d2)[1];
+    double& rd3 = reinterpret_cast<double(&)[2]>(d3)[0];
+    double& id3 = reinterpret_cast<double(&)[2]>(d3)[1];
+
+    // Step 1 of 4: d.real() =  a.real() * b.real() + c.real();
+    MMA_16x8x8_F64F64F64F64_TN::fma(
+      rd0, rd1, rd2, rd3,
+      a0.real(), a1.real(), a2.real(), a3.real(),
+      b0.real(), b1.real(),
+      c0.real(), c1.real(), c2.real(), c3.real());
+
+    // Step 2 of 4: d.imag() =  a.imag() * b.real() + c.imag();
+    MMA_16x8x8_F64F64F64F64_TN::fma(
+      id0, id1, id2, id3,
+      a0.imag(), a1.imag(), a2.imag(), a3.imag(),
+      b0.real(), b1.real(),
+      c0.imag(), c1.imag(), c2.imag(), c3.imag());
+
+    // Step 3 of 4 (accumulates onto the step-1 result, so it must follow it): d.real() = -a.imag() * b.imag() + d.real();
+    MMA_16x8x8_F64F64F64F64_TN::fma(
+      rd0, rd1, rd2, rd3,
+      -a0.imag(), -a1.imag(), -a2.imag(), -a3.imag(),  // negated temporaries bind to the const& parameters
+      b0.imag(), b1.imag(),
+      d0.real(), d1.real(), d2.real(), d3.real());
+
+    // Step 4 of 4 (accumulates onto the step-2 result, so it must follow it): d.imag() =  a.real() * b.imag() + d.imag();
+    MMA_16x8x8_F64F64F64F64_TN::fma(
+      id0, id1, id2, id3,
+      a0.real(), a1.real(), a2.real(), a3.real(),
+      b0.imag(), b1.imag(),
+      d0.imag(), d1.imag(), d2.imag(), d3.imag());
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// MMA 16x8x16 TN: complex<double> multiply-accumulate (d = a * b + c) built from four real MMA_16x8x16_F64F64F64F64_TN calls.
+struct MMA_16x8x16_C64C64C64C64_TN
+{
+  using DRegisters = complex<double>[4];  // d0..d3: complex results written by fma()
+  using ARegisters = complex<double>[8];  // a0..a7: complex A operand values
+  using BRegisters = complex<double>[4];  // b0..b3: complex B operand values
+  using CRegisters = complex<double>[4];  // c0..c3: complex input accumulators
+
+  CUTE_HOST_DEVICE static void  // fma(): d* are outputs; a*, b*, c* are read-only inputs
+  fma(complex<double>      & d0, complex<double>      & d1,
+      complex<double>      & d2, complex<double>      & d3,
+      complex<double> const& a0, complex<double> const& a1,
+      complex<double> const& a2, complex<double> const& a3,
+      complex<double> const& a4, complex<double> const& a5,
+      complex<double> const& a6, complex<double> const& a7,
+      complex<double> const& b0, complex<double> const& b1,
+      complex<double> const& b2, complex<double> const& b3,
+      complex<double> const& c0, complex<double> const& c1,
+      complex<double> const& c2, complex<double> const& c3)
+  {
+    // Alias each output's parts as plain doubles (rdN = element [0], idN = element [1]) because thrust::complex does not provide a mutable ref. NOTE(review): assumes {real, imag} storage order - confirm for cute's complex.
+    double& rd0 = reinterpret_cast<double(&)[2]>(d0)[0];
+    double& id0 = reinterpret_cast<double(&)[2]>(d0)[1];
+    double& rd1 = reinterpret_cast<double(&)[2]>(d1)[0];
+    double& id1 = reinterpret_cast<double(&)[2]>(d1)[1];
+    double& rd2 = reinterpret_cast<double(&)[2]>(d2)[0];
+    double& id2 = reinterpret_cast<double(&)[2]>(d2)[1];
+    double& rd3 = reinterpret_cast<double(&)[2]>(d3)[0];
+    double& id3 = reinterpret_cast<double(&)[2]>(d3)[1];
+
+    // Step 1 of 4: d.real() =  a.real() * b.real() + c.real();
+    MMA_16x8x16_F64F64F64F64_TN::fma(
+      rd0, rd1, rd2, rd3,
+      a0.real(), a1.real(), a2.real(), a3.real(),
+      a4.real(), a5.real(), a6.real(), a7.real(),
+      b0.real(), b1.real(), b2.real(), b3.real(),
+      c0.real(), c1.real(), c2.real(), c3.real());
+
+    // Step 2 of 4: d.imag() =  a.imag() * b.real() + c.imag();
+    MMA_16x8x16_F64F64F64F64_TN::fma(
+      id0, id1, id2, id3,
+      a0.imag(), a1.imag(), a2.imag(), a3.imag(),
+      a4.imag(), a5.imag(), a6.imag(), a7.imag(),
+      b0.real(), b1.real(), b2.real(), b3.real(),
+      c0.imag(), c1.imag(), c2.imag(), c3.imag());
+
+    // Step 3 of 4 (accumulates onto the step-1 result, so it must follow it): d.real() = -a.imag() * b.imag() + d.real();
+    MMA_16x8x16_F64F64F64F64_TN::fma(
+      rd0, rd1, rd2, rd3,
+      -a0.imag(), -a1.imag(), -a2.imag(), -a3.imag(),  // negated temporaries bind to the const& parameters
+      -a4.imag(), -a5.imag(), -a6.imag(), -a7.imag(),
+      b0.imag(), b1.imag(), b2.imag(), b3.imag(),
+      d0.real(), d1.real(), d2.real(), d3.real());
+
+    // Step 4 of 4 (accumulates onto the step-2 result, so it must follow it): d.imag() =  a.real() * b.imag() + d.imag();
+    MMA_16x8x16_F64F64F64F64_TN::fma(
+      id0, id1, id2, id3,
+      a0.real(), a1.real(), a2.real(), a3.real(),
+      a4.real(), a5.real(), a6.real(), a7.real(),
+      b0.imag(), b1.imag(), b2.imag(), b3.imag(),
+      d0.imag(), d1.imag(), d2.imag(), d3.imag());
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+}
+
+} // namespace cute
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <cute/arch/mma_sm90_desc.hpp>
+#include <cute/arch/mma_sm90_gmma.hpp>
+#include <cute/arch/mma_sm90_gmma_sparse.hpp>
+#include <cute/layout.hpp>                     // cute::size
+#include <cute/numeric/integral_constant.hpp>  // cute::is_static
+#include <cute/numeric/numeric_types.hpp>      // cute::half_t, cute::float_e4m3_t, cute::tfloat32_t, etc
+#include <cute/util/type_traits.hpp>           // cute::is_same_v
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cute {
+namespace SM90::GMMA {
+
+template <
+  class ElementA,
+  class ElementB,
+  class ElementC,
+  class TileShape_MNK,
+  GMMA::Major MajorA = GMMA::Major::K,
+  GMMA::Major MajorB = GMMA::Major::K,
+  auto... Args                         // e.g. GMMA::ScaleOut::One, [GMMA::ScaleIn::One, GMMA::ScaleIn::One]
+                                       // But most commonly leave empty for defaults
+>
+CUTE_HOST_DEVICE constexpr
+auto
+ss_op_selector()
+{
+  static_assert(is_static<TileShape_MNK>::value, "TileShape_MNK must be static.");
+  static_assert(rank(TileShape_MNK{}) == 3, "TileShape_MNK must be rank 3.");
+  static_assert(size<0>(TileShape_MNK{}) % 64 == 0, "Tile_M must be a multiple of 64.");
+  auto Tile_N = size<1>(TileShape_MNK{});
+
+  // F16 accumulator
+  if constexpr (is_same_v<ElementC, half_t>) {
+
+    // Input A: half_t ; Input B: half_t
+    if constexpr (is_same_v<ElementA, half_t> && is_same_v<ElementB, half_t>) {
+      static_assert(size<2>(TileShape_MNK{}) % 16 == 0, "Tile_K must be a multiple of 16.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::MMA_64x256x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::MMA_64x248x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::MMA_64x240x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::MMA_64x232x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::MMA_64x224x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::MMA_64x216x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::MMA_64x208x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::MMA_64x200x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::MMA_64x192x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::MMA_64x184x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::MMA_64x176x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::MMA_64x168x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::MMA_64x160x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::MMA_64x152x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::MMA_64x144x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::MMA_64x136x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::MMA_64x128x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::MMA_64x120x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::MMA_64x112x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::MMA_64x104x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::MMA_64x96x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::MMA_64x88x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::MMA_64x80x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::MMA_64x72x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::MMA_64x64x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::MMA_64x56x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::MMA_64x48x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::MMA_64x40x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::MMA_64x32x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::MMA_64x24x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::MMA_64x16x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::MMA_64x8x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: float_e4m3_t ; Input B: float_e4m3_t
+    else if constexpr (is_same_v<ElementA, float_e4m3_t> && is_same_v<ElementB, float_e4m3_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::MMA_64x256x32_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::MMA_64x248x32_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::MMA_64x240x32_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::MMA_64x232x32_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::MMA_64x224x32_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::MMA_64x216x32_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::MMA_64x208x32_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::MMA_64x200x32_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::MMA_64x192x32_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::MMA_64x184x32_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::MMA_64x176x32_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::MMA_64x168x32_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::MMA_64x160x32_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::MMA_64x152x32_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::MMA_64x144x32_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::MMA_64x136x32_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::MMA_64x128x32_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::MMA_64x120x32_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::MMA_64x112x32_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::MMA_64x104x32_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::MMA_64x96x32_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::MMA_64x88x32_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::MMA_64x80x32_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::MMA_64x72x32_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::MMA_64x64x32_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::MMA_64x56x32_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::MMA_64x48x32_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::MMA_64x40x32_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::MMA_64x32x32_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::MMA_64x24x32_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::MMA_64x16x32_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::MMA_64x8x32_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: float_e4m3_t ; Input B: float_e5m2_t
+    else if constexpr (is_same_v<ElementA, float_e4m3_t> && is_same_v<ElementB, float_e5m2_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::MMA_64x256x32_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::MMA_64x248x32_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::MMA_64x240x32_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::MMA_64x232x32_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::MMA_64x224x32_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::MMA_64x216x32_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::MMA_64x208x32_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::MMA_64x200x32_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::MMA_64x192x32_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::MMA_64x184x32_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::MMA_64x176x32_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::MMA_64x168x32_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::MMA_64x160x32_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::MMA_64x152x32_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::MMA_64x144x32_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::MMA_64x136x32_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::MMA_64x128x32_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::MMA_64x120x32_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::MMA_64x112x32_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::MMA_64x104x32_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::MMA_64x96x32_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::MMA_64x88x32_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::MMA_64x80x32_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::MMA_64x72x32_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::MMA_64x64x32_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::MMA_64x56x32_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::MMA_64x48x32_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::MMA_64x40x32_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::MMA_64x32x32_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::MMA_64x24x32_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::MMA_64x16x32_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::MMA_64x8x32_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: float_e5m2_t ; Input B: float_e4m3_t
+    else if constexpr (is_same_v<ElementA, float_e5m2_t> && is_same_v<ElementB, float_e4m3_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::MMA_64x256x32_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::MMA_64x248x32_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::MMA_64x240x32_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::MMA_64x232x32_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::MMA_64x224x32_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::MMA_64x216x32_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::MMA_64x208x32_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::MMA_64x200x32_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::MMA_64x192x32_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::MMA_64x184x32_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::MMA_64x176x32_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::MMA_64x168x32_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::MMA_64x160x32_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::MMA_64x152x32_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::MMA_64x144x32_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::MMA_64x136x32_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::MMA_64x128x32_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::MMA_64x120x32_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::MMA_64x112x32_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::MMA_64x104x32_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::MMA_64x96x32_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::MMA_64x88x32_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::MMA_64x80x32_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::MMA_64x72x32_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::MMA_64x64x32_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::MMA_64x56x32_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::MMA_64x48x32_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::MMA_64x40x32_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::MMA_64x32x32_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::MMA_64x24x32_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::MMA_64x16x32_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::MMA_64x8x32_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: float_e5m2_t ; Input B: float_e5m2_t
+    else if constexpr (is_same_v<ElementA, float_e5m2_t> && is_same_v<ElementB, float_e5m2_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::MMA_64x256x32_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::MMA_64x248x32_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::MMA_64x240x32_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::MMA_64x232x32_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::MMA_64x224x32_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::MMA_64x216x32_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::MMA_64x208x32_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::MMA_64x200x32_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::MMA_64x192x32_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::MMA_64x184x32_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::MMA_64x176x32_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::MMA_64x168x32_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::MMA_64x160x32_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::MMA_64x152x32_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::MMA_64x144x32_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::MMA_64x136x32_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::MMA_64x128x32_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::MMA_64x120x32_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::MMA_64x112x32_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::MMA_64x104x32_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::MMA_64x96x32_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::MMA_64x88x32_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::MMA_64x80x32_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::MMA_64x72x32_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::MMA_64x64x32_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::MMA_64x56x32_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::MMA_64x48x32_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::MMA_64x40x32_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::MMA_64x32x32_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::MMA_64x24x32_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::MMA_64x16x32_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::MMA_64x8x32_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    else {
+      static_assert(sizeof(ElementA) == 0, "No eligible GMMA operator for request configuration.");
+    }
+  }
+
+  // F32 accumulator
+  else if constexpr (is_same_v<ElementC, float>) {
+
+    // Input A: half_t ; Input B: half_t
+    if constexpr (is_same_v<ElementA, half_t> && is_same_v<ElementB, half_t>) {
+      static_assert(size<2>(TileShape_MNK{}) % 16 == 0, "Tile_K must be a multiple of 16.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::MMA_64x256x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::MMA_64x248x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::MMA_64x240x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::MMA_64x232x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::MMA_64x224x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::MMA_64x216x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::MMA_64x208x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::MMA_64x200x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::MMA_64x192x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::MMA_64x184x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::MMA_64x176x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::MMA_64x168x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::MMA_64x160x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::MMA_64x152x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::MMA_64x144x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::MMA_64x136x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::MMA_64x128x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::MMA_64x120x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::MMA_64x112x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::MMA_64x104x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::MMA_64x96x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::MMA_64x88x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::MMA_64x80x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::MMA_64x72x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::MMA_64x64x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::MMA_64x56x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::MMA_64x48x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::MMA_64x40x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::MMA_64x32x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::MMA_64x24x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::MMA_64x16x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::MMA_64x8x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: bfloat16_t ; Input B: bfloat16_t
+    else if constexpr (is_same_v<ElementA, bfloat16_t> && is_same_v<ElementB, bfloat16_t>) {
+      static_assert(size<2>(TileShape_MNK{}) % 16 == 0, "Tile_K must be a multiple of 16.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::MMA_64x256x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::MMA_64x248x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::MMA_64x240x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::MMA_64x232x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::MMA_64x224x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::MMA_64x216x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::MMA_64x208x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::MMA_64x200x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::MMA_64x192x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::MMA_64x184x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::MMA_64x176x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::MMA_64x168x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::MMA_64x160x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::MMA_64x152x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::MMA_64x144x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::MMA_64x136x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::MMA_64x128x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::MMA_64x120x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::MMA_64x112x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::MMA_64x104x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::MMA_64x96x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::MMA_64x88x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::MMA_64x80x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::MMA_64x72x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::MMA_64x64x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::MMA_64x56x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::MMA_64x48x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::MMA_64x40x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::MMA_64x32x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::MMA_64x24x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::MMA_64x16x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::MMA_64x8x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: tfloat32_t ; Input B: tfloat32_t
+    else if constexpr (is_same_v<ElementA, tfloat32_t> && is_same_v<ElementB, tfloat32_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 8 == 0, "Tile_K must be a multiple of 8.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::MMA_64x256x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::MMA_64x248x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::MMA_64x240x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::MMA_64x232x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::MMA_64x224x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::MMA_64x216x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::MMA_64x208x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::MMA_64x200x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::MMA_64x192x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::MMA_64x184x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::MMA_64x176x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::MMA_64x168x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::MMA_64x160x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::MMA_64x152x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::MMA_64x144x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::MMA_64x136x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::MMA_64x128x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::MMA_64x120x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::MMA_64x112x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::MMA_64x104x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::MMA_64x96x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::MMA_64x88x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::MMA_64x80x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::MMA_64x72x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::MMA_64x64x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::MMA_64x56x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::MMA_64x48x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::MMA_64x40x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::MMA_64x32x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::MMA_64x24x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::MMA_64x16x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::MMA_64x8x8_F32TF32TF32_SS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: float_e4m3_t ; Input B: float_e4m3_t
+    else if constexpr (is_same_v<ElementA, float_e4m3_t> && is_same_v<ElementB, float_e4m3_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::MMA_64x256x32_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::MMA_64x248x32_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::MMA_64x240x32_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::MMA_64x232x32_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::MMA_64x224x32_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::MMA_64x216x32_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::MMA_64x208x32_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::MMA_64x200x32_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::MMA_64x192x32_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::MMA_64x184x32_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::MMA_64x176x32_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::MMA_64x168x32_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::MMA_64x160x32_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::MMA_64x152x32_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::MMA_64x144x32_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::MMA_64x136x32_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::MMA_64x128x32_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::MMA_64x120x32_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::MMA_64x112x32_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::MMA_64x104x32_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::MMA_64x96x32_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::MMA_64x88x32_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::MMA_64x80x32_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::MMA_64x72x32_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::MMA_64x64x32_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::MMA_64x56x32_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::MMA_64x48x32_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::MMA_64x40x32_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::MMA_64x32x32_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::MMA_64x24x32_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::MMA_64x16x32_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::MMA_64x8x32_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: float_e4m3_t ; Input B: float_e5m2_t
+    else if constexpr (is_same_v<ElementA, float_e4m3_t> && is_same_v<ElementB, float_e5m2_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::MMA_64x256x32_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::MMA_64x248x32_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::MMA_64x240x32_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::MMA_64x232x32_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::MMA_64x224x32_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::MMA_64x216x32_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::MMA_64x208x32_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::MMA_64x200x32_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::MMA_64x192x32_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::MMA_64x184x32_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::MMA_64x176x32_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::MMA_64x168x32_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::MMA_64x160x32_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::MMA_64x152x32_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::MMA_64x144x32_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::MMA_64x136x32_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::MMA_64x128x32_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::MMA_64x120x32_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::MMA_64x112x32_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::MMA_64x104x32_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::MMA_64x96x32_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::MMA_64x88x32_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::MMA_64x80x32_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::MMA_64x72x32_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::MMA_64x64x32_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::MMA_64x56x32_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::MMA_64x48x32_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::MMA_64x40x32_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::MMA_64x32x32_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::MMA_64x24x32_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::MMA_64x16x32_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::MMA_64x8x32_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: float_e5m2_t ; Input B: float_e4m3_t
+    else if constexpr (is_same_v<ElementA, float_e5m2_t> && is_same_v<ElementB, float_e4m3_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::MMA_64x256x32_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::MMA_64x248x32_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::MMA_64x240x32_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::MMA_64x232x32_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::MMA_64x224x32_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::MMA_64x216x32_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::MMA_64x208x32_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::MMA_64x200x32_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::MMA_64x192x32_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::MMA_64x184x32_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::MMA_64x176x32_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::MMA_64x168x32_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::MMA_64x160x32_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::MMA_64x152x32_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::MMA_64x144x32_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::MMA_64x136x32_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::MMA_64x128x32_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::MMA_64x120x32_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::MMA_64x112x32_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::MMA_64x104x32_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::MMA_64x96x32_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::MMA_64x88x32_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::MMA_64x80x32_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::MMA_64x72x32_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::MMA_64x64x32_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::MMA_64x56x32_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::MMA_64x48x32_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::MMA_64x40x32_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::MMA_64x32x32_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::MMA_64x24x32_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::MMA_64x16x32_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::MMA_64x8x32_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: float_e5m2_t ; Input B: float_e5m2_t
+    else if constexpr (is_same_v<ElementA, float_e5m2_t> && is_same_v<ElementB, float_e5m2_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::MMA_64x256x32_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::MMA_64x248x32_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::MMA_64x240x32_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::MMA_64x232x32_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::MMA_64x224x32_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::MMA_64x216x32_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::MMA_64x208x32_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::MMA_64x200x32_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::MMA_64x192x32_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::MMA_64x184x32_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::MMA_64x176x32_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::MMA_64x168x32_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::MMA_64x160x32_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::MMA_64x152x32_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::MMA_64x144x32_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::MMA_64x136x32_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::MMA_64x128x32_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::MMA_64x120x32_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::MMA_64x112x32_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::MMA_64x104x32_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::MMA_64x96x32_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::MMA_64x88x32_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::MMA_64x80x32_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::MMA_64x72x32_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::MMA_64x64x32_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::MMA_64x56x32_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::MMA_64x48x32_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::MMA_64x40x32_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::MMA_64x32x32_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::MMA_64x24x32_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::MMA_64x16x32_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::MMA_64x8x32_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    else {
+      static_assert(sizeof(ElementA) == 0, "No eligible GMMA operator for request configuration.");
+    }
+  }
+
+  // S32 accumulator
+  else if constexpr (is_same_v<ElementC, int32_t>) {
+
+    // Input A: int8_t ; Input B: int8_t
+    if constexpr (is_same_v<ElementA, int8_t> && is_same_v<ElementB, int8_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::MMA_64x256x32_S32S8S8_SS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::MMA_64x240x32_S32S8S8_SS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::MMA_64x224x32_S32S8S8_SS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::MMA_64x208x32_S32S8S8_SS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::MMA_64x192x32_S32S8S8_SS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::MMA_64x176x32_S32S8S8_SS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::MMA_64x160x32_S32S8S8_SS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::MMA_64x144x32_S32S8S8_SS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::MMA_64x128x32_S32S8S8_SS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::MMA_64x112x32_S32S8S8_SS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::MMA_64x96x32_S32S8S8_SS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::MMA_64x80x32_S32S8S8_SS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::MMA_64x64x32_S32S8S8_SS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::MMA_64x48x32_S32S8S8_SS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::MMA_64x32x32_S32S8S8_SS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::MMA_64x24x32_S32S8S8_SS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::MMA_64x16x32_S32S8S8_SS_TN{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::MMA_64x8x32_S32S8S8_SS_TN{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: int8_t ; Input B: uint8_t
+    else if constexpr (is_same_v<ElementA, int8_t> && is_same_v<ElementB, uint8_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::MMA_64x256x32_S32S8U8_SS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::MMA_64x240x32_S32S8U8_SS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::MMA_64x224x32_S32S8U8_SS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::MMA_64x208x32_S32S8U8_SS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::MMA_64x192x32_S32S8U8_SS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::MMA_64x176x32_S32S8U8_SS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::MMA_64x160x32_S32S8U8_SS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::MMA_64x144x32_S32S8U8_SS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::MMA_64x128x32_S32S8U8_SS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::MMA_64x112x32_S32S8U8_SS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::MMA_64x96x32_S32S8U8_SS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::MMA_64x80x32_S32S8U8_SS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::MMA_64x64x32_S32S8U8_SS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::MMA_64x48x32_S32S8U8_SS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::MMA_64x32x32_S32S8U8_SS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::MMA_64x24x32_S32S8U8_SS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::MMA_64x16x32_S32S8U8_SS_TN{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::MMA_64x8x32_S32S8U8_SS_TN{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: uint8_t ; Input B: int8_t
+    else if constexpr (is_same_v<ElementA, uint8_t> && is_same_v<ElementB, int8_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::MMA_64x256x32_S32U8S8_SS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::MMA_64x240x32_S32U8S8_SS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::MMA_64x224x32_S32U8S8_SS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::MMA_64x208x32_S32U8S8_SS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::MMA_64x192x32_S32U8S8_SS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::MMA_64x176x32_S32U8S8_SS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::MMA_64x160x32_S32U8S8_SS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::MMA_64x144x32_S32U8S8_SS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::MMA_64x128x32_S32U8S8_SS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::MMA_64x112x32_S32U8S8_SS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::MMA_64x96x32_S32U8S8_SS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::MMA_64x80x32_S32U8S8_SS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::MMA_64x64x32_S32U8S8_SS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::MMA_64x48x32_S32U8S8_SS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::MMA_64x32x32_S32U8S8_SS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::MMA_64x24x32_S32U8S8_SS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::MMA_64x16x32_S32U8S8_SS_TN{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::MMA_64x8x32_S32U8S8_SS_TN{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: uint8_t ; Input B: uint8_t
+    else if constexpr (is_same_v<ElementA, uint8_t> && is_same_v<ElementB, uint8_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::MMA_64x256x32_S32U8U8_SS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::MMA_64x240x32_S32U8U8_SS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::MMA_64x224x32_S32U8U8_SS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::MMA_64x208x32_S32U8U8_SS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::MMA_64x192x32_S32U8U8_SS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::MMA_64x176x32_S32U8U8_SS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::MMA_64x160x32_S32U8U8_SS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::MMA_64x144x32_S32U8U8_SS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::MMA_64x128x32_S32U8U8_SS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::MMA_64x112x32_S32U8U8_SS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::MMA_64x96x32_S32U8U8_SS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::MMA_64x80x32_S32U8U8_SS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::MMA_64x64x32_S32U8U8_SS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::MMA_64x48x32_S32U8U8_SS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::MMA_64x32x32_S32U8U8_SS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::MMA_64x24x32_S32U8U8_SS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::MMA_64x16x32_S32U8U8_SS_TN{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::MMA_64x8x32_S32U8U8_SS_TN{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    else {
+      static_assert(sizeof(ElementA) == 0, "No eligible GMMA operator for request configuration.");
+    }
+  }
+
+  // Unknown accumulator type
+  else {
+    static_assert(sizeof(ElementC) == 0, "Unknown ElementC accumulator type.");
+  }
+}
+
+template <
+  class ElementA,
+  class ElementB,
+  class ElementC,
+  class TileShape_MNK,
+  GMMA::Major MajorA = GMMA::Major::K,
+  GMMA::Major MajorB = GMMA::Major::K,
+  auto... Args                         // e.g. GMMA::ScaleOut::One, [GMMA::ScaleIn::One, GMMA::ScaleIn::One]
+                                       // But most commonly leave empty for defaults
+>
+CUTE_HOST_DEVICE constexpr
+auto
+ss_op_selector_sparse()
+{
+  static_assert(is_static<TileShape_MNK>::value, "TileShape_MNK must be static.");
+  static_assert(rank(TileShape_MNK{}) == 3, "TileShape_MNK must be rank 3.");
+  static_assert(size<0>(TileShape_MNK{}) % 64 == 0, "Tile_M must be a multiple of 64.");
+  auto Tile_N = size<1>(TileShape_MNK{});
+
+  // F16 accumulator
+  if constexpr (is_same_v<ElementC, half_t>) {
+
+    // Input A: half_t ; Input B: half_t
+    if constexpr (is_same_v<ElementA, half_t> && is_same_v<ElementB, half_t>) {
+      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x256x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x248x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x240x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x232x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x224x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x216x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x208x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x200x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x192x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x184x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x176x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x168x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x160x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x152x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x144x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x136x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x128x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x120x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x112x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x104x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x96x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x88x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x80x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x72x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x64x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x56x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x48x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x40x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x32x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x24x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x16x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x8x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: float_e4m3_t ; Input B: float_e4m3_t
+    else if constexpr (is_same_v<ElementA, float_e4m3_t> && is_same_v<ElementB, float_e4m3_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x256x64_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x248x64_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x240x64_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x232x64_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x224x64_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x216x64_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x208x64_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x200x64_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x192x64_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x184x64_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x176x64_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x168x64_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x160x64_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x152x64_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x144x64_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x136x64_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x128x64_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x120x64_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x112x64_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x104x64_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x96x64_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x88x64_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x80x64_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x72x64_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x64x64_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x56x64_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x48x64_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x40x64_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x32x64_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x24x64_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x16x64_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x8x64_F16E4M3E4M3_SS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: float_e4m3_t ; Input B: float_e5m2_t
+    else if constexpr (is_same_v<ElementA, float_e4m3_t> && is_same_v<ElementB, float_e5m2_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x256x64_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x248x64_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x240x64_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x232x64_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x224x64_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x216x64_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x208x64_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x200x64_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x192x64_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x184x64_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x176x64_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x168x64_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x160x64_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x152x64_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x144x64_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x136x64_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x128x64_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x120x64_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x112x64_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x104x64_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x96x64_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x88x64_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x80x64_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x72x64_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x64x64_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x56x64_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x48x64_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x40x64_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x32x64_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x24x64_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x16x64_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x8x64_F16E4M3E5M2_SS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: float_e5m2_t ; Input B: float_e4m3_t
+    else if constexpr (is_same_v<ElementA, float_e5m2_t> && is_same_v<ElementB, float_e4m3_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x256x64_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x248x64_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x240x64_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x232x64_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x224x64_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x216x64_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x208x64_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x200x64_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x192x64_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x184x64_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x176x64_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x168x64_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x160x64_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x152x64_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x144x64_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x136x64_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x128x64_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x120x64_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x112x64_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x104x64_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x96x64_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x88x64_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x80x64_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x72x64_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x64x64_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x56x64_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x48x64_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x40x64_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x32x64_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x24x64_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x16x64_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x8x64_F16E5M2E4M3_SS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: float_e5m2_t ; Input B: float_e5m2_t
+    else if constexpr (is_same_v<ElementA, float_e5m2_t> && is_same_v<ElementB, float_e5m2_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x256x64_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x248x64_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x240x64_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x232x64_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x224x64_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x216x64_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x208x64_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x200x64_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x192x64_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x184x64_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x176x64_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x168x64_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x160x64_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x152x64_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x144x64_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x136x64_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x128x64_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x120x64_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x112x64_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x104x64_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x96x64_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x88x64_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x80x64_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x72x64_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x64x64_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x56x64_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x48x64_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x40x64_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x32x64_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x24x64_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x16x64_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x8x64_F16E5M2E5M2_SS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    else {
+      static_assert(sizeof(ElementA) == 0, "No eligible GMMA operator for request configuration.");
+    }
+  }
+
+  // F32 accumulator
+  else if constexpr (is_same_v<ElementC, float>) {
+
+    // Input A: half_t ; Input B: half_t
+    if constexpr (is_same_v<ElementA, half_t> && is_same_v<ElementB, half_t>) {
+      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x256x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x248x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x240x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x232x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x224x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x216x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x208x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x200x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x192x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x184x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x176x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x168x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x160x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x152x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x144x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x136x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x128x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x120x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x112x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x104x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x96x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x88x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x80x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x72x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x64x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x56x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x48x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x40x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x32x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x24x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x16x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x8x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: bfloat16_t ; Input B: bfloat16_t
+    else if constexpr (is_same_v<ElementA, bfloat16_t> && is_same_v<ElementB, bfloat16_t>) {
+      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x256x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x248x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x240x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x232x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x224x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x216x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x208x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x200x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x192x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x184x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x176x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x168x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x160x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x152x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x144x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x136x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x128x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x120x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x112x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x104x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x96x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x88x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x80x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x72x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x64x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x56x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x48x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x40x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x32x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x24x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x16x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x8x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: tfloat32_t ; Input B: tfloat32_t
+    else if constexpr (is_same_v<ElementA, tfloat32_t> && is_same_v<ElementB, tfloat32_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 16 == 0, "Tile_K must be a multiple of 16.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x256x16_F32TF32TF32_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x248x16_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x240x16_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x232x16_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x224x16_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x216x16_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x208x16_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x200x16_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x192x16_F32TF32TF32_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x184x16_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x176x16_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x168x16_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x160x16_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x152x16_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x144x16_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x136x16_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x128x16_F32TF32TF32_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x120x16_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x112x16_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x104x16_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x96x16_F32TF32TF32_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x88x16_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x80x16_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x72x16_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x64x16_F32TF32TF32_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x56x16_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x48x16_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x40x16_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x32x16_F32TF32TF32_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x24x16_F32TF32TF32_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x16x16_F32TF32TF32_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x8x16_F32TF32TF32_SS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: float_e4m3_t ; Input B: float_e4m3_t
+    else if constexpr (is_same_v<ElementA, float_e4m3_t> && is_same_v<ElementB, float_e4m3_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x256x64_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x248x64_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x240x64_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x232x64_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x224x64_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x216x64_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x208x64_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x200x64_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x192x64_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x184x64_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x176x64_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x168x64_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x160x64_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x152x64_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x144x64_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x136x64_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x128x64_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x120x64_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x112x64_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x104x64_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x96x64_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x88x64_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x80x64_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x72x64_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x64x64_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x56x64_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x48x64_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x40x64_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x32x64_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x24x64_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x16x64_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x8x64_F32E4M3E4M3_SS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: float_e4m3_t ; Input B: float_e5m2_t
+    else if constexpr (is_same_v<ElementA, float_e4m3_t> && is_same_v<ElementB, float_e5m2_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x256x64_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x248x64_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x240x64_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x232x64_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x224x64_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x216x64_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x208x64_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x200x64_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x192x64_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x184x64_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x176x64_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x168x64_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x160x64_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x152x64_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x144x64_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x136x64_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x128x64_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x120x64_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x112x64_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x104x64_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x96x64_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x88x64_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x80x64_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x72x64_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x64x64_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x56x64_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x48x64_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x40x64_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x32x64_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x24x64_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x16x64_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x8x64_F32E4M3E5M2_SS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: float_e5m2_t ; Input B: float_e4m3_t
+    else if constexpr (is_same_v<ElementA, float_e5m2_t> && is_same_v<ElementB, float_e4m3_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x256x64_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x248x64_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x240x64_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x232x64_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x224x64_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x216x64_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x208x64_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x200x64_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x192x64_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x184x64_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x176x64_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x168x64_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x160x64_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x152x64_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x144x64_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x136x64_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x128x64_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x120x64_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x112x64_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x104x64_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x96x64_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x88x64_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x80x64_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x72x64_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x64x64_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x56x64_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x48x64_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x40x64_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x32x64_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x24x64_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x16x64_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x8x64_F32E5M2E4M3_SS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: float_e5m2_t ; Input B: float_e5m2_t
+    else if constexpr (is_same_v<ElementA, float_e5m2_t> && is_same_v<ElementB, float_e5m2_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x256x64_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x248x64_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x240x64_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x232x64_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x224x64_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x216x64_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x208x64_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x200x64_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x192x64_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x184x64_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x176x64_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x168x64_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x160x64_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x152x64_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x144x64_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x136x64_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x128x64_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x120x64_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x112x64_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x104x64_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x96x64_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x88x64_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x80x64_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x72x64_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x64x64_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x56x64_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x48x64_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x40x64_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x32x64_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x24x64_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x16x64_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x8x64_F32E5M2E5M2_SS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    else {
+      static_assert(sizeof(ElementA) == 0, "No eligible GMMA operator for request configuration.");
+    }
+  }
+
+  // S32 accumulator
+  else if constexpr (is_same_v<ElementC, int32_t>) {
+
+    // Input A: int8_t ; Input B: int8_t
+    if constexpr (is_same_v<ElementA, int8_t> && is_same_v<ElementB, int8_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8S8_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8S8_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8S8_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8S8_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8S8_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8S8_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8S8_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8S8_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8S8_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8S8_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8S8_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8S8_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8S8_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8S8_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8S8_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8S8_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8S8_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8S8_SS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: int8_t ; Input B: uint8_t
+    else if constexpr (is_same_v<ElementA, int8_t> && is_same_v<ElementB, uint8_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8U8_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8U8_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8U8_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8U8_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8U8_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8U8_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8U8_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8U8_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8U8_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8U8_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8U8_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8U8_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8U8_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8U8_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8U8_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8U8_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8U8_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8U8_SS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: uint8_t ; Input B: int8_t
+    else if constexpr (is_same_v<ElementA, uint8_t> && is_same_v<ElementB, int8_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8S8_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8S8_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8S8_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8S8_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8S8_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8S8_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8S8_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8S8_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8S8_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8S8_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8S8_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8S8_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8S8_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8S8_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8S8_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8S8_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8S8_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8S8_SS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: uint8_t ; Input B: uint8_t
+    else if constexpr (is_same_v<ElementA, uint8_t> && is_same_v<ElementB, uint8_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8U8_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8U8_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8U8_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8U8_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8U8_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8U8_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8U8_SS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8U8_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8U8_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8U8_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8U8_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8U8_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8U8_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8U8_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8U8_SS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8U8_SS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8U8_SS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8U8_SS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    else {
+      static_assert(sizeof(ElementA) == 0, "No eligible GMMA operator for request configuration.");
+    }
+  }
+
+  // Unknown accumulator type
+  else {
+    static_assert(sizeof(ElementC) == 0, "Unknown ElementC accumulator type.");
+  }
+}
+
+template <
+  class ElementA,
+  class ElementB,
+  class ElementC,
+  class TileShape_MNK,
+  GMMA::Major MajorA = GMMA::Major::K,
+  GMMA::Major MajorB = GMMA::Major::K,
+  auto... Args                         // e.g. GMMA::ScaleOut::One, [GMMA::ScaleIn::One, GMMA::ScaleIn::One]
+                                       // But most commonly leave empty for defaults
+>
+CUTE_HOST_DEVICE constexpr
+auto
+rs_op_selector()
+{
+  static_assert(is_static<TileShape_MNK>::value, "TileShape_MNK must be static.");
+  static_assert(rank(TileShape_MNK{}) == 3, "TileShape_MNK must be rank 3.");
+  static_assert(size<0>(TileShape_MNK{}) % 64 == 0, "Tile_M must be a multiple of 64.");
+  static_assert(MajorA == GMMA::Major::K, "Register source A operand GMMAs must have K-major A layout.");
+  auto Tile_N = size<1>(TileShape_MNK{});
+
+  // F16 accumulator
+  if constexpr (is_same_v<ElementC, half_t>) {
+
+    // Input A: half_t ; Input B: half_t
+    if constexpr (is_same_v<ElementA, half_t> && is_same_v<ElementB, half_t>) {
+      static_assert(size<2>(TileShape_MNK{}) % 16 == 0, "Tile_K must be a multiple of 16.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::MMA_64x256x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::MMA_64x248x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::MMA_64x240x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::MMA_64x232x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::MMA_64x224x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::MMA_64x216x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::MMA_64x208x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::MMA_64x200x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::MMA_64x192x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::MMA_64x184x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::MMA_64x176x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::MMA_64x168x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::MMA_64x160x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::MMA_64x152x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::MMA_64x144x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::MMA_64x136x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::MMA_64x128x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::MMA_64x120x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::MMA_64x112x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::MMA_64x104x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::MMA_64x96x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::MMA_64x88x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::MMA_64x80x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::MMA_64x72x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::MMA_64x64x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::MMA_64x56x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::MMA_64x48x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::MMA_64x40x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::MMA_64x32x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::MMA_64x24x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::MMA_64x16x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::MMA_64x8x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: float_e4m3_t ; Input B: float_e4m3_t
+    else if constexpr (is_same_v<ElementA, float_e4m3_t> && is_same_v<ElementB, float_e4m3_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::MMA_64x256x32_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::MMA_64x248x32_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::MMA_64x240x32_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::MMA_64x232x32_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::MMA_64x224x32_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::MMA_64x216x32_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::MMA_64x208x32_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::MMA_64x200x32_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::MMA_64x192x32_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::MMA_64x184x32_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::MMA_64x176x32_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::MMA_64x168x32_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::MMA_64x160x32_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::MMA_64x152x32_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::MMA_64x144x32_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::MMA_64x136x32_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::MMA_64x128x32_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::MMA_64x120x32_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::MMA_64x112x32_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::MMA_64x104x32_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::MMA_64x96x32_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::MMA_64x88x32_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::MMA_64x80x32_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::MMA_64x72x32_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::MMA_64x64x32_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::MMA_64x56x32_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::MMA_64x48x32_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::MMA_64x40x32_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::MMA_64x32x32_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::MMA_64x24x32_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::MMA_64x16x32_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::MMA_64x8x32_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: float_e4m3_t ; Input B: float_e5m2_t
+    else if constexpr (is_same_v<ElementA, float_e4m3_t> && is_same_v<ElementB, float_e5m2_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::MMA_64x256x32_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::MMA_64x248x32_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::MMA_64x240x32_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::MMA_64x232x32_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::MMA_64x224x32_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::MMA_64x216x32_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::MMA_64x208x32_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::MMA_64x200x32_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::MMA_64x192x32_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::MMA_64x184x32_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::MMA_64x176x32_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::MMA_64x168x32_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::MMA_64x160x32_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::MMA_64x152x32_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::MMA_64x144x32_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::MMA_64x136x32_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::MMA_64x128x32_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::MMA_64x120x32_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::MMA_64x112x32_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::MMA_64x104x32_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::MMA_64x96x32_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::MMA_64x88x32_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::MMA_64x80x32_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::MMA_64x72x32_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::MMA_64x64x32_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::MMA_64x56x32_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::MMA_64x48x32_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::MMA_64x40x32_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::MMA_64x32x32_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::MMA_64x24x32_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::MMA_64x16x32_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::MMA_64x8x32_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: float_e5m2_t ; Input B: float_e4m3_t
+    else if constexpr (is_same_v<ElementA, float_e5m2_t> && is_same_v<ElementB, float_e4m3_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::MMA_64x256x32_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::MMA_64x248x32_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::MMA_64x240x32_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::MMA_64x232x32_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::MMA_64x224x32_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::MMA_64x216x32_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::MMA_64x208x32_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::MMA_64x200x32_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::MMA_64x192x32_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::MMA_64x184x32_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::MMA_64x176x32_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::MMA_64x168x32_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::MMA_64x160x32_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::MMA_64x152x32_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::MMA_64x144x32_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::MMA_64x136x32_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::MMA_64x128x32_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::MMA_64x120x32_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::MMA_64x112x32_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::MMA_64x104x32_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::MMA_64x96x32_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::MMA_64x88x32_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::MMA_64x80x32_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::MMA_64x72x32_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::MMA_64x64x32_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::MMA_64x56x32_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::MMA_64x48x32_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::MMA_64x40x32_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::MMA_64x32x32_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::MMA_64x24x32_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::MMA_64x16x32_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::MMA_64x8x32_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: float_e5m2_t ; Input B: float_e5m2_t
+    else if constexpr (is_same_v<ElementA, float_e5m2_t> && is_same_v<ElementB, float_e5m2_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::MMA_64x256x32_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::MMA_64x248x32_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::MMA_64x240x32_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::MMA_64x232x32_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::MMA_64x224x32_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::MMA_64x216x32_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::MMA_64x208x32_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::MMA_64x200x32_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::MMA_64x192x32_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::MMA_64x184x32_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::MMA_64x176x32_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::MMA_64x168x32_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::MMA_64x160x32_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::MMA_64x152x32_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::MMA_64x144x32_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::MMA_64x136x32_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::MMA_64x128x32_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::MMA_64x120x32_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::MMA_64x112x32_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::MMA_64x104x32_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::MMA_64x96x32_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::MMA_64x88x32_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::MMA_64x80x32_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::MMA_64x72x32_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::MMA_64x64x32_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::MMA_64x56x32_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::MMA_64x48x32_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::MMA_64x40x32_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::MMA_64x32x32_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::MMA_64x24x32_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::MMA_64x16x32_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::MMA_64x8x32_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    else {
+      static_assert(sizeof(ElementA) == 0, "No eligible GMMA operator for request configuration.");
+    }
+  }
+
+  // F32 accumulator: dispatch on the input element types, then on the largest available N extent that divides Tile_N
+  else if constexpr (is_same_v<ElementC, float>) {
+
+    // Input A: half_t ; Input B: half_t (MajorA/MajorB are forwarded to the atom; Tile_K must be a multiple of 16)
+    if constexpr (is_same_v<ElementA, half_t> && is_same_v<ElementB, half_t>) {
+      static_assert(size<2>(TileShape_MNK{}) % 16 == 0, "Tile_K must be a multiple of 16.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::MMA_64x256x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::MMA_64x248x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::MMA_64x240x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::MMA_64x232x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::MMA_64x224x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::MMA_64x216x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::MMA_64x208x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::MMA_64x200x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::MMA_64x192x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::MMA_64x184x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::MMA_64x176x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::MMA_64x168x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::MMA_64x160x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::MMA_64x152x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::MMA_64x144x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::MMA_64x136x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::MMA_64x128x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::MMA_64x120x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::MMA_64x112x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::MMA_64x104x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::MMA_64x96x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::MMA_64x88x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::MMA_64x80x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::MMA_64x72x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::MMA_64x64x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::MMA_64x56x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::MMA_64x48x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::MMA_64x40x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::MMA_64x32x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::MMA_64x24x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::MMA_64x16x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::MMA_64x8x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: bfloat16_t ; Input B: bfloat16_t (MajorA/MajorB are forwarded to the atom; Tile_K must be a multiple of 16)
+    else if constexpr (is_same_v<ElementA, bfloat16_t> && is_same_v<ElementB, bfloat16_t>) {
+      static_assert(size<2>(TileShape_MNK{}) % 16 == 0, "Tile_K must be a multiple of 16.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::MMA_64x256x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::MMA_64x248x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::MMA_64x240x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::MMA_64x232x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::MMA_64x224x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::MMA_64x216x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::MMA_64x208x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::MMA_64x200x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::MMA_64x192x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::MMA_64x184x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::MMA_64x176x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::MMA_64x168x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::MMA_64x160x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::MMA_64x152x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::MMA_64x144x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::MMA_64x136x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::MMA_64x128x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::MMA_64x120x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::MMA_64x112x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::MMA_64x104x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::MMA_64x96x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::MMA_64x88x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::MMA_64x80x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::MMA_64x72x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::MMA_64x64x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::MMA_64x56x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::MMA_64x48x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::MMA_64x40x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::MMA_64x32x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::MMA_64x24x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::MMA_64x16x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::MMA_64x8x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: tfloat32_t ; Input B: tfloat32_t (both operands must be K-major; Tile_K must be a multiple of 8)
+    else if constexpr (is_same_v<ElementA, tfloat32_t> && is_same_v<ElementB, tfloat32_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 8 == 0, "Tile_K must be a multiple of 8.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::MMA_64x256x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::MMA_64x248x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::MMA_64x240x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::MMA_64x232x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::MMA_64x224x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::MMA_64x216x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::MMA_64x208x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::MMA_64x200x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::MMA_64x192x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::MMA_64x184x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::MMA_64x176x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::MMA_64x168x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::MMA_64x160x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::MMA_64x152x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::MMA_64x144x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::MMA_64x136x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::MMA_64x128x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::MMA_64x120x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::MMA_64x112x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::MMA_64x104x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::MMA_64x96x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::MMA_64x88x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::MMA_64x80x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::MMA_64x72x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::MMA_64x64x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::MMA_64x56x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::MMA_64x48x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::MMA_64x40x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::MMA_64x32x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::MMA_64x24x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::MMA_64x16x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::MMA_64x8x8_F32TF32TF32_RS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: float_e4m3_t ; Input B: float_e4m3_t (both operands must be K-major; Tile_K must be a multiple of 32)
+    else if constexpr (is_same_v<ElementA, float_e4m3_t> && is_same_v<ElementB, float_e4m3_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::MMA_64x256x32_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::MMA_64x248x32_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::MMA_64x240x32_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::MMA_64x232x32_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::MMA_64x224x32_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::MMA_64x216x32_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::MMA_64x208x32_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::MMA_64x200x32_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::MMA_64x192x32_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::MMA_64x184x32_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::MMA_64x176x32_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::MMA_64x168x32_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::MMA_64x160x32_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::MMA_64x152x32_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::MMA_64x144x32_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::MMA_64x136x32_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::MMA_64x128x32_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::MMA_64x120x32_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::MMA_64x112x32_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::MMA_64x104x32_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::MMA_64x96x32_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::MMA_64x88x32_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::MMA_64x80x32_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::MMA_64x72x32_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::MMA_64x64x32_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::MMA_64x56x32_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::MMA_64x48x32_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::MMA_64x40x32_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::MMA_64x32x32_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::MMA_64x24x32_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::MMA_64x16x32_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::MMA_64x8x32_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: float_e4m3_t ; Input B: float_e5m2_t (both operands must be K-major; Tile_K must be a multiple of 32)
+    else if constexpr (is_same_v<ElementA, float_e4m3_t> && is_same_v<ElementB, float_e5m2_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::MMA_64x256x32_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::MMA_64x248x32_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::MMA_64x240x32_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::MMA_64x232x32_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::MMA_64x224x32_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::MMA_64x216x32_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::MMA_64x208x32_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::MMA_64x200x32_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::MMA_64x192x32_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::MMA_64x184x32_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::MMA_64x176x32_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::MMA_64x168x32_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::MMA_64x160x32_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::MMA_64x152x32_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::MMA_64x144x32_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::MMA_64x136x32_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::MMA_64x128x32_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::MMA_64x120x32_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::MMA_64x112x32_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::MMA_64x104x32_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::MMA_64x96x32_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::MMA_64x88x32_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::MMA_64x80x32_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::MMA_64x72x32_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::MMA_64x64x32_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::MMA_64x56x32_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::MMA_64x48x32_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::MMA_64x40x32_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::MMA_64x32x32_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::MMA_64x24x32_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::MMA_64x16x32_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::MMA_64x8x32_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: float_e5m2_t ; Input B: float_e4m3_t (both operands must be K-major; Tile_K must be a multiple of 32)
+    else if constexpr (is_same_v<ElementA, float_e5m2_t> && is_same_v<ElementB, float_e4m3_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::MMA_64x256x32_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::MMA_64x248x32_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::MMA_64x240x32_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::MMA_64x232x32_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::MMA_64x224x32_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::MMA_64x216x32_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::MMA_64x208x32_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::MMA_64x200x32_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::MMA_64x192x32_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::MMA_64x184x32_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::MMA_64x176x32_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::MMA_64x168x32_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::MMA_64x160x32_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::MMA_64x152x32_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::MMA_64x144x32_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::MMA_64x136x32_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::MMA_64x128x32_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::MMA_64x120x32_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::MMA_64x112x32_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::MMA_64x104x32_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::MMA_64x96x32_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::MMA_64x88x32_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::MMA_64x80x32_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::MMA_64x72x32_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::MMA_64x64x32_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::MMA_64x56x32_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::MMA_64x48x32_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::MMA_64x40x32_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::MMA_64x32x32_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::MMA_64x24x32_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::MMA_64x16x32_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::MMA_64x8x32_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: float_e5m2_t ; Input B: float_e5m2_t
+    else if constexpr (is_same_v<ElementA, float_e5m2_t> && is_same_v<ElementB, float_e5m2_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::MMA_64x256x32_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::MMA_64x248x32_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::MMA_64x240x32_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::MMA_64x232x32_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::MMA_64x224x32_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::MMA_64x216x32_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::MMA_64x208x32_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::MMA_64x200x32_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::MMA_64x192x32_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::MMA_64x184x32_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::MMA_64x176x32_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::MMA_64x168x32_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::MMA_64x160x32_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::MMA_64x152x32_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::MMA_64x144x32_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::MMA_64x136x32_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::MMA_64x128x32_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::MMA_64x120x32_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::MMA_64x112x32_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::MMA_64x104x32_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::MMA_64x96x32_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::MMA_64x88x32_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::MMA_64x80x32_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::MMA_64x72x32_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::MMA_64x64x32_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::MMA_64x56x32_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::MMA_64x48x32_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::MMA_64x40x32_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::MMA_64x32x32_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::MMA_64x24x32_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::MMA_64x16x32_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::MMA_64x8x32_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    else {
+      static_assert(sizeof(ElementA) == 0, "No eligible GMMA operator for request configuration.");
+    }
+  }
+
+  // S32 accumulator
+  else if constexpr (is_same_v<ElementC, int32_t>) {
+
+    // Input A: int8_t ; Input B: int8_t
+    if constexpr (is_same_v<ElementA, int8_t> && is_same_v<ElementB, int8_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::MMA_64x256x32_S32S8S8_RS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::MMA_64x240x32_S32S8S8_RS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::MMA_64x224x32_S32S8S8_RS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::MMA_64x208x32_S32S8S8_RS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::MMA_64x192x32_S32S8S8_RS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::MMA_64x176x32_S32S8S8_RS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::MMA_64x160x32_S32S8S8_RS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::MMA_64x144x32_S32S8S8_RS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::MMA_64x128x32_S32S8S8_RS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::MMA_64x112x32_S32S8S8_RS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::MMA_64x96x32_S32S8S8_RS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::MMA_64x80x32_S32S8S8_RS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::MMA_64x64x32_S32S8S8_RS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::MMA_64x48x32_S32S8S8_RS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::MMA_64x32x32_S32S8S8_RS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::MMA_64x24x32_S32S8S8_RS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::MMA_64x16x32_S32S8S8_RS_TN{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::MMA_64x8x32_S32S8S8_RS_TN{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: int8_t ; Input B: uint8_t
+    else if constexpr (is_same_v<ElementA, int8_t> && is_same_v<ElementB, uint8_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::MMA_64x256x32_S32S8U8_RS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::MMA_64x240x32_S32S8U8_RS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::MMA_64x224x32_S32S8U8_RS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::MMA_64x208x32_S32S8U8_RS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::MMA_64x192x32_S32S8U8_RS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::MMA_64x176x32_S32S8U8_RS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::MMA_64x160x32_S32S8U8_RS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::MMA_64x144x32_S32S8U8_RS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::MMA_64x128x32_S32S8U8_RS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::MMA_64x112x32_S32S8U8_RS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::MMA_64x96x32_S32S8U8_RS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::MMA_64x80x32_S32S8U8_RS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::MMA_64x64x32_S32S8U8_RS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::MMA_64x48x32_S32S8U8_RS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::MMA_64x32x32_S32S8U8_RS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::MMA_64x24x32_S32S8U8_RS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::MMA_64x16x32_S32S8U8_RS_TN{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::MMA_64x8x32_S32S8U8_RS_TN{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: uint8_t ; Input B: int8_t
+    else if constexpr (is_same_v<ElementA, uint8_t> && is_same_v<ElementB, int8_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::MMA_64x256x32_S32U8S8_RS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::MMA_64x240x32_S32U8S8_RS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::MMA_64x224x32_S32U8S8_RS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::MMA_64x208x32_S32U8S8_RS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::MMA_64x192x32_S32U8S8_RS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::MMA_64x176x32_S32U8S8_RS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::MMA_64x160x32_S32U8S8_RS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::MMA_64x144x32_S32U8S8_RS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::MMA_64x128x32_S32U8S8_RS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::MMA_64x112x32_S32U8S8_RS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::MMA_64x96x32_S32U8S8_RS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::MMA_64x80x32_S32U8S8_RS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::MMA_64x64x32_S32U8S8_RS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::MMA_64x48x32_S32U8S8_RS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::MMA_64x32x32_S32U8S8_RS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::MMA_64x24x32_S32U8S8_RS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::MMA_64x16x32_S32U8S8_RS_TN{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::MMA_64x8x32_S32U8S8_RS_TN{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: uint8_t ; Input B: uint8_t
+    else if constexpr (is_same_v<ElementA, uint8_t> && is_same_v<ElementB, uint8_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::MMA_64x256x32_S32U8U8_RS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::MMA_64x240x32_S32U8U8_RS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::MMA_64x224x32_S32U8U8_RS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::MMA_64x208x32_S32U8U8_RS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::MMA_64x192x32_S32U8U8_RS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::MMA_64x176x32_S32U8U8_RS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::MMA_64x160x32_S32U8U8_RS_TN{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::MMA_64x144x32_S32U8U8_RS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::MMA_64x128x32_S32U8U8_RS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::MMA_64x112x32_S32U8U8_RS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::MMA_64x96x32_S32U8U8_RS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::MMA_64x80x32_S32U8U8_RS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::MMA_64x64x32_S32U8U8_RS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::MMA_64x48x32_S32U8U8_RS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::MMA_64x32x32_S32U8U8_RS_TN{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::MMA_64x24x32_S32U8U8_RS_TN{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::MMA_64x16x32_S32U8U8_RS_TN{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::MMA_64x8x32_S32U8U8_RS_TN{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    else {
+      static_assert(sizeof(ElementA) == 0, "No eligible GMMA operator for request configuration.");
+    }
+  }
+
+  // Unknown accumulator type
+  else {
+    static_assert(sizeof(ElementC) == 0, "Unknown ElementC accumulator type.");
+  }
+}
+
+template <
+  class ElementA,
+  class ElementB,
+  class ElementC,
+  class TileShape_MNK,
+  GMMA::Major MajorA = GMMA::Major::K,
+  GMMA::Major MajorB = GMMA::Major::K,
+  auto... Args                         // e.g. GMMA::ScaleOut::One, [GMMA::ScaleIn::One, GMMA::ScaleIn::One]
+                                       // But most commonly leave empty for defaults
+>
+CUTE_HOST_DEVICE constexpr
+auto
+rs_op_selector_sparse()
+{
+  static_assert(is_static<TileShape_MNK>::value, "TileShape_MNK must be static.");
+  static_assert(rank(TileShape_MNK{}) == 3, "TileShape_MNK must be rank 3.");
+  static_assert(size<0>(TileShape_MNK{}) % 64 == 0, "Tile_M must be a multiple of 64.");
+  static_assert(MajorA == GMMA::Major::K, "Register source A operand GMMAs must have K-major A layout.");
+  auto Tile_N = size<1>(TileShape_MNK{});
+
+  // F16 accumulator
+  if constexpr (is_same_v<ElementC, half_t>) {
+
+    // Input A: half_t ; Input B: half_t
+    if constexpr (is_same_v<ElementA, half_t> && is_same_v<ElementB, half_t>) {
+      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x256x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x248x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x240x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x232x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x224x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x216x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x208x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x200x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x192x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x184x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x176x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x168x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x160x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x152x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x144x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x136x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x128x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x120x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x112x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x104x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x96x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x88x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x80x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x72x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x64x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x56x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x48x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x40x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x32x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x24x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x16x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x8x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: float_e4m3_t ; Input B: float_e4m3_t
+    else if constexpr (is_same_v<ElementA, float_e4m3_t> && is_same_v<ElementB, float_e4m3_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x256x64_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x248x64_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x240x64_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x232x64_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x224x64_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x216x64_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x208x64_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x200x64_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x192x64_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x184x64_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x176x64_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x168x64_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x160x64_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x152x64_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x144x64_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x136x64_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x128x64_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x120x64_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x112x64_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x104x64_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x96x64_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x88x64_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x80x64_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x72x64_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x64x64_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x56x64_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x48x64_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x40x64_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x32x64_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x24x64_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x16x64_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x8x64_F16E4M3E4M3_RS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: float_e4m3_t ; Input B: float_e5m2_t
+    else if constexpr (is_same_v<ElementA, float_e4m3_t> && is_same_v<ElementB, float_e5m2_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x256x64_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x248x64_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x240x64_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x232x64_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x224x64_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x216x64_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x208x64_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x200x64_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x192x64_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x184x64_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x176x64_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x168x64_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x160x64_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x152x64_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x144x64_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x136x64_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x128x64_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x120x64_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x112x64_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x104x64_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x96x64_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x88x64_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x80x64_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x72x64_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x64x64_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x56x64_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x48x64_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x40x64_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x32x64_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x24x64_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x16x64_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x8x64_F16E4M3E5M2_RS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: float_e5m2_t ; Input B: float_e4m3_t
+    else if constexpr (is_same_v<ElementA, float_e5m2_t> && is_same_v<ElementB, float_e4m3_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x256x64_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x248x64_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x240x64_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x232x64_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x224x64_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x216x64_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x208x64_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x200x64_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x192x64_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x184x64_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x176x64_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x168x64_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x160x64_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x152x64_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x144x64_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x136x64_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x128x64_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x120x64_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x112x64_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x104x64_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x96x64_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x88x64_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x80x64_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x72x64_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x64x64_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x56x64_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x48x64_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x40x64_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x32x64_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x24x64_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x16x64_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x8x64_F16E5M2E4M3_RS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: float_e5m2_t ; Input B: float_e5m2_t
+    else if constexpr (is_same_v<ElementA, float_e5m2_t> && is_same_v<ElementB, float_e5m2_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x256x64_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x248x64_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x240x64_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x232x64_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x224x64_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x216x64_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x208x64_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x200x64_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x192x64_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x184x64_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x176x64_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x168x64_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x160x64_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x152x64_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x144x64_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x136x64_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x128x64_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x120x64_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x112x64_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x104x64_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x96x64_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x88x64_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x80x64_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x72x64_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x64x64_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x56x64_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x48x64_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x40x64_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x32x64_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x24x64_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x16x64_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x8x64_F16E5M2E5M2_RS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    else {
+      static_assert(sizeof(ElementA) == 0, "No eligible GMMA operator for request configuration.");
+    }
+  }
+
+  // F32 accumulator
+  else if constexpr (is_same_v<ElementC, float>) {
+
+    // Input A: half_t ; Input B: half_t
+    if constexpr (is_same_v<ElementA, half_t> && is_same_v<ElementB, half_t>) {
+      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x256x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x248x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x240x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x232x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x224x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x216x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x208x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x200x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x192x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x184x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x176x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x168x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x160x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x152x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x144x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x136x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x128x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x120x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x112x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x104x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x96x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x88x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x80x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x72x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x64x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x56x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x48x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x40x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x32x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x24x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x16x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x8x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: bfloat16_t ; Input B: bfloat16_t
+    else if constexpr (is_same_v<ElementA, bfloat16_t> && is_same_v<ElementB, bfloat16_t>) {
+      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x256x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x248x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x240x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x232x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x224x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x216x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x208x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x200x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x192x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x184x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x176x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x168x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x160x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x152x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x144x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x136x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x128x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x120x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x112x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x104x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x96x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x88x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x80x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x72x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x64x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x56x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x48x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x40x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x32x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x24x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x16x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x8x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: tfloat32_t ; Input B: tfloat32_t
+    else if constexpr (is_same_v<ElementA, tfloat32_t> && is_same_v<ElementB, tfloat32_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 16 == 0, "Tile_K must be a multiple of 16.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x256x16_F32TF32TF32_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x248x16_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x240x16_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x232x16_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x224x16_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x216x16_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x208x16_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x200x16_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x192x16_F32TF32TF32_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x184x16_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x176x16_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x168x16_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x160x16_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x152x16_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x144x16_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x136x16_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x128x16_F32TF32TF32_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x120x16_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x112x16_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x104x16_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x96x16_F32TF32TF32_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x88x16_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x80x16_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x72x16_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x64x16_F32TF32TF32_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x56x16_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x48x16_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x40x16_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x32x16_F32TF32TF32_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x24x16_F32TF32TF32_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x16x16_F32TF32TF32_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x8x16_F32TF32TF32_RS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: float_e4m3_t ; Input B: float_e4m3_t
+    else if constexpr (is_same_v<ElementA, float_e4m3_t> && is_same_v<ElementB, float_e4m3_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x256x64_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x248x64_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x240x64_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x232x64_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x224x64_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x216x64_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x208x64_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x200x64_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x192x64_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x184x64_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x176x64_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x168x64_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x160x64_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x152x64_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x144x64_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x136x64_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x128x64_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x120x64_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x112x64_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x104x64_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x96x64_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x88x64_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x80x64_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x72x64_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x64x64_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x56x64_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x48x64_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x40x64_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x32x64_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x24x64_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x16x64_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x8x64_F32E4M3E4M3_RS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: float_e4m3_t ; Input B: float_e5m2_t
+    else if constexpr (is_same_v<ElementA, float_e4m3_t> && is_same_v<ElementB, float_e5m2_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x256x64_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x248x64_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x240x64_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x232x64_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x224x64_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x216x64_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x208x64_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x200x64_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x192x64_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x184x64_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x176x64_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x168x64_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x160x64_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x152x64_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x144x64_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x136x64_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x128x64_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x120x64_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x112x64_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x104x64_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x96x64_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x88x64_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x80x64_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x72x64_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x64x64_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x56x64_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x48x64_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x40x64_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x32x64_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x24x64_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x16x64_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x8x64_F32E4M3E5M2_RS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: float_e5m2_t ; Input B: float_e4m3_t
+    else if constexpr (is_same_v<ElementA, float_e5m2_t> && is_same_v<ElementB, float_e4m3_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x256x64_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x248x64_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x240x64_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x232x64_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x224x64_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x216x64_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x208x64_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x200x64_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x192x64_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x184x64_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x176x64_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x168x64_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x160x64_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x152x64_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x144x64_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x136x64_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x128x64_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x120x64_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x112x64_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x104x64_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x96x64_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x88x64_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x80x64_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x72x64_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x64x64_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x56x64_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x48x64_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x40x64_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x32x64_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x24x64_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x16x64_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x8x64_F32E5M2E4M3_RS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: float_e5m2_t ; Input B: float_e5m2_t
+    else if constexpr (is_same_v<ElementA, float_e5m2_t> && is_same_v<ElementB, float_e5m2_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x256x64_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 248 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x248x64_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x240x64_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 232 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x232x64_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x224x64_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 216 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x216x64_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x208x64_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 200 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x200x64_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x192x64_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 184 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x184x64_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x176x64_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 168 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x168x64_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x160x64_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 152 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x152x64_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x144x64_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 136 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x136x64_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x128x64_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 120 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x120x64_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x112x64_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 104 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x104x64_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x96x64_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 88 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x88x64_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x80x64_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 72 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x72x64_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x64x64_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 56 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x56x64_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x48x64_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 40 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x40x64_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x32x64_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x24x64_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x16x64_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x8x64_F32E5M2E5M2_RS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    else {
+      static_assert(sizeof(ElementA) == 0, "No eligible GMMA operator for request configuration.");
+    }
+  }
+
+  // S32 accumulator
+  else if constexpr (is_same_v<ElementC, int32_t>) {
+
+    // Input A: int8_t ; Input B: int8_t
+    if constexpr (is_same_v<ElementA, int8_t> && is_same_v<ElementB, int8_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8S8_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8S8_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8S8_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8S8_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8S8_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8S8_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8S8_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8S8_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8S8_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8S8_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8S8_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8S8_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8S8_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8S8_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8S8_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8S8_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8S8_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8S8_RS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: int8_t ; Input B: uint8_t
+    else if constexpr (is_same_v<ElementA, int8_t> && is_same_v<ElementB, uint8_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8U8_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8U8_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8U8_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8U8_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8U8_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8U8_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8U8_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8U8_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8U8_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8U8_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8U8_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8U8_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8U8_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8U8_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8U8_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8U8_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8U8_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8U8_RS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: uint8_t ; Input B: int8_t
+    else if constexpr (is_same_v<ElementA, uint8_t> && is_same_v<ElementB, int8_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8S8_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8S8_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8S8_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8S8_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8S8_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8S8_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8S8_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8S8_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8S8_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8S8_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8S8_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8S8_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8S8_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8S8_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8S8_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8S8_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8S8_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8S8_RS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    // Input A: uint8_t ; Input B: uint8_t
+    else if constexpr (is_same_v<ElementA, uint8_t> && is_same_v<ElementB, uint8_t>) {
+      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
+      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
+      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
+
+      if constexpr (Tile_N % 256 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8U8_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 240 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8U8_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 224 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8U8_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 208 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8U8_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 192 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8U8_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 176 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8U8_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 160 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8U8_RS_TN<Args...>{};
+      }
+#endif
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 144 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8U8_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 128 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8U8_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 112 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8U8_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 96 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8U8_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 80 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8U8_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 64 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8U8_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 48 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8U8_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 32 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8U8_RS_TN<Args...>{};
+      }
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+      else if constexpr (Tile_N % 24 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8U8_RS_TN<Args...>{};
+      }
+#endif
+      else if constexpr (Tile_N % 16 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8U8_RS_TN<Args...>{};
+      }
+      else if constexpr (Tile_N % 8 == 0) {
+        return SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8U8_RS_TN<Args...>{};
+      }
+      else {
+        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
+      }
+    }
+
+    else {
+      static_assert(sizeof(ElementA) == 0, "No eligible GMMA operator for request configuration.");
+    }
+  }
+
+  // Unknown accumulator type
+  else {
+    static_assert(sizeof(ElementC) == 0, "Unknown ElementC accumulator type.");
+  }
+}
+
+} // end namespace SM90::GMMA
+} // end namespace cute
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cute/arch/mma_sm90_desc.hpp b/lightllm-kernel/cutlass/include/cute/arch/mma_sm90_desc.hpp
new file mode 100755
index 000000000..a53a9748b
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/arch/mma_sm90_desc.hpp
@@ -0,0 +1,156 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include <cute/config.hpp>
+
+#include <cute/arch/mma.hpp>
+
+// Config: define CUTE_ARCH_MMA_SM90A_ENABLED only when __CUDA_ARCH__ >= 900 and __CUDA_ARCH_FEAT_SM90_ALL is defined
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && defined(__CUDA_ARCH_FEAT_SM90_ALL))
+#    define CUTE_ARCH_MMA_SM90A_ENABLED
+#endif
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cute {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// GMMA Descriptor and utilities
+
+// GMMA enums and utilities
+namespace SM90::GMMA {
+
+enum class LayoutType : uint8_t {  // Values of GmmaDescriptor's 2-bit layout_type_ field
+  INTERLEAVE = 0,  // listed as SWIZZLE_NONE in the descriptor's layout-type comment
+  B128 = 1,        // listed as SWIZZLE_128B
+  B64 = 2,         // listed as SWIZZLE_64B
+  B32 = 3,         // listed as SWIZZLE_32B
+};
+
+CUTE_HOST_DEVICE char const* to_string(LayoutType const& t) {  // Returns the enumerator's name as a string literal
+  switch (t) {
+    case LayoutType::INTERLEAVE: return "INTERLEAVE";
+    case LayoutType::B128:       return "B128";
+    case LayoutType::B64:        return "B64";
+    case LayoutType::B32:        return "B32";
+  }
+  return nullptr;  // Reached only when t holds a value that matches none of the cases above
+}
+
+#if !defined(__CUDACC_RTC__)
+// Stream-output operator for LayoutType: writes the to_string() name, or sets failbit on os when that name is nullptr
+CUTE_HOST std::ostream& operator<<(std::ostream& os, LayoutType const& t) {
+  char const* s = to_string(t);
+  if (s) {
+    std::operator<<(os, s);  // Explicit call to avoid ambiguity
+  } else {
+    os.setstate(std::ios_base::failbit);  // No name available for t; report the failure through the stream state
+  }
+  return os;
+}
+#endif // !defined(__CUDACC_RTC__)
+
+} // end namespace SM90::GMMA
+
+union GmmaDescriptor  // One 64-bit value viewable as desc_, reg32_[2], reg16_[4], or the named bitfields below
+{
+  CUTE_HOST_DEVICE constexpr
+  GmmaDescriptor() noexcept : desc_(0) {}  // Default: all 64 bits zero
+  CUTE_HOST_DEVICE constexpr
+  GmmaDescriptor(uint64_t desc) noexcept : desc_(desc) {}  // Wrap a raw 64-bit value (implicit conversion allowed)
+  CUTE_HOST_DEVICE constexpr
+  GmmaDescriptor(GmmaDescriptor const& t) noexcept : desc_(t.desc_) {}
+  CUTE_HOST_DEVICE constexpr
+  GmmaDescriptor(GmmaDescriptor && t) noexcept : desc_(t.desc_) {}  // Move is a plain copy of desc_
+
+  CUTE_HOST_DEVICE constexpr
+  GmmaDescriptor& operator=(GmmaDescriptor const& t) noexcept {
+    desc_ = t.desc_;
+    return *this;
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  GmmaDescriptor& operator=(GmmaDescriptor && t) noexcept {
+    desc_ = t.desc_;
+    return *this;
+  }
+
+  uint64_t desc_;      // The whole descriptor as a single 64-bit value
+  uint32_t reg32_[2];  // The same storage as two 32-bit words
+  uint16_t reg16_[4];  // The same storage as four 16-bit words
+
+  // Bitfield implementation avoids the need for shifts in assignment
+  struct {
+    // start_address, bit [0,14), 4LSB not included
+    uint16_t start_address_ : 14, : 2;        // 14 bits [0,14), 2 bits unused
+    // leading dimension byte offset, bit [16,30), 4LSB not included
+    // For N: This is the stride from the first col to the second col of the 8x2 brick in INTERLEAVED
+    //   Unused for all SWIZZLE_* layouts (and assumed to be 1)
+    // For T: This is the stride from the first 8 rows to the next 8 rows.
+    uint16_t leading_byte_offset_ : 14, : 2;  // 14 bits [0,14), 2 bits unused
+    // stride dimension byte offset, bit [32,46), 4LSB not included
+    // For N: This is the stride from the first 8 rows to the next 8 rows.
+    // For T: This is the stride from the first 8 cols to the next 8 cols.
+    uint16_t stride_byte_offset_ : 14, : 2;   // 14 bits [0,14), 2 bits unused
+    // base_offset, bit [49,52)
+    // Valid only for SWIZZLE_128B and SWIZZLE_64B
+    uint8_t : 1, base_offset_ : 3, : 4;       // 1 bit unused, 3 bits [1,4), 4 bits unused
+    // layout type, bit [62,64)
+    // SWIZZLE_NONE = 0, SWIZZLE_32B = 3, SWIZZLE_64B = 2, SWIZZLE_128B = 1
+    uint8_t : 6, layout_type_ : 2;            // 6 bits unused, 2 bits [6,8)
+  } bitfield;
+
+  // Decay to a uint64_t
+  CUTE_HOST_DEVICE constexpr
+  operator uint64_t() const noexcept { return desc_; }
+};
+
+// Printer: writes the raw 64-bit descriptor and each named bitfield via printf; the body is empty when __CUDACC_RTC__ is defined
+CUTE_HOST_DEVICE void
+print(GmmaDescriptor const& t)
+{
+#if !defined(__CUDACC_RTC__)
+  printf("GmmaDescriptor: 0x%016llx\n",   static_cast<unsigned long long>(t.desc_));
+  printf("  start_addr :  0x%04x\n",      t.bitfield.start_address_);
+  printf("  leading_off:  0x%04x (%d)\n", t.bitfield.leading_byte_offset_, t.bitfield.leading_byte_offset_);
+  printf("  stride_off :  0x%04x (%d)\n", t.bitfield.stride_byte_offset_, t.bitfield.stride_byte_offset_);
+  printf("  base_offset:  0x%01x\n",      t.bitfield.base_offset_);
+  printf("  layout_type:  0x%01x (%s)\n", t.bitfield.layout_type_, to_string(static_cast<SM90::GMMA::LayoutType>(t.bitfield.layout_type_)));
+#endif // !defined(__CUDACC_RTC__)
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cute
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cute/arch/mma_sm90_gmma.hpp b/lightllm-kernel/cutlass/include/cute/arch/mma_sm90_gmma.hpp
new file mode 100755
index 000000000..d809aa4a6
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/arch/mma_sm90_gmma.hpp
@@ -0,0 +1,20974 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>                 // CUTE_HOST_DEVICE
+
+#include "cutlass/arch/synclog.hpp"
+
+// Config
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && defined(__CUDA_ARCH_FEAT_SM90_ALL))
+#  define CUTE_ARCH_MMA_SM90A_ENABLED
+#endif
+
+namespace cute {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Warpgroup sync primitives
+
+CUTE_HOST_DEVICE
+void
+warpgroup_arrive()  // Emits the PTX instruction wgmma.fence.sync.aligned; requires CUTE_ARCH_MMA_SM90A_ENABLED.
+{
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+  cutlass::arch::synclog_emit_warpgroup_arrive(__LINE__);  // synclog hook, tagged with this source line
+  asm volatile ("wgmma.fence.sync.aligned;\n" ::: "memory");  // "memory" clobber: the compiler may not reorder memory accesses across this statement
+#else
+  CUTE_INVALID_CONTROL_PATH("Attempting to use wgmma.fence without CUTE_ARCH_MMA_SM90A_ENABLED");  // fallback when the SM90A feature macro is not defined
+#endif
+}
+
+template <int N>
+CUTE_HOST_DEVICE
+void
+warpgroup_wait()  // Emits wgmma.wait_group.sync.aligned with N as an immediate. NOTE(review): presumably N is the number of committed batches allowed to stay pending -- confirm against the PTX ISA.
+{
+  static_assert(N >= 0 && N <= 7, "WGMMA wait: N must be in range [0, 7]");
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+  cutlass::arch::synclog_emit_warpgroup_wait(__LINE__, N);  // synclog hook, tagged with this source line and N
+  asm volatile("wgmma.wait_group.sync.aligned %0;\n" :: "n"(N) : "memory");  // "n": N is encoded as a compile-time immediate; "memory" clobber blocks compiler reordering
+#else
+  CUTE_INVALID_CONTROL_PATH("Attempting to use wgmma.wait_group<N> without CUTE_ARCH_MMA_SM90A_ENABLED");  // fallback when the SM90A feature macro is not defined
+#endif
+}
+
+// Marks the commit point for a batch of one or more warpgroup MMAs (emits wgmma.commit_group.sync.aligned).
+CUTE_HOST_DEVICE
+void
+warpgroup_commit_batch()
+{
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+  cutlass::arch::synclog_emit_warpgroup_commit_batch(__LINE__);  // synclog hook, tagged with this source line
+  asm volatile("wgmma.commit_group.sync.aligned;\n" ::: "memory");  // "memory" clobber: the compiler may not reorder memory accesses across this statement
+#else
+  CUTE_INVALID_CONTROL_PATH("Attempting to use wgmma.commit_group without CUTE_ARCH_MMA_SM90A_ENABLED");  // fallback when the SM90A feature macro is not defined
+#endif
+}
+
+CUTE_HOST_DEVICE
+void
+warpgroup_fence_operand(uint32_t& reg) {  // Compiler-level fence on one 32-bit integer register operand (empty asm string).
+  // MSVC emits a build error for 'asm volatile'
+  // even if it only occurs in a __device__ function.
+  // The __CUDA_ARCH__ guard below prevents the error by hiding the asm from host compilation.
+#if defined(__CUDA_ARCH__)
+  asm volatile("" : "+r"(reg) :: "memory");  // "+r" marks reg as both read and written here, so the compiler cannot cache or move it across this point
+#endif
+}
+
+CUTE_HOST_DEVICE
+void
+warpgroup_fence_operand(float& reg) {  // Float overload: compiler-level fence on one 32-bit float register operand (empty asm string).
+#if defined(__CUDA_ARCH__)  // asm is only visible to device compilation passes
+  asm volatile("" : "+f"(reg) :: "memory");  // "+f" marks reg as both read and written here, so the compiler cannot cache or move it across this point
+#endif
+}
+
+namespace SM90::GMMA {
+
+enum class Major {  // Operand layout selector; cast to int32_t and passed as the tnspA / tnspB immediates of the wgmma instructions below
+  K  = 0,
+  MN = 1
+};
+
+enum class ScaleOut {  // Type of the runtime scale_D argument of fma(); the asm converts it to the predicate (scale_D != 0)
+  Zero = 0,
+  One  = 1
+};
+
+enum class ScaleIn {  // Type of the scaleA / scaleB template parameters; cast to int32_t and passed as immediates to wgmma
+  Neg = -1,
+  One =  1
+};
+
+enum class SparseSel {  // Two-valued selector (0 or 1). NOTE(review): presumably consumed by sparse GMMA variants defined elsewhere -- confirm
+  Zero = 0,
+  One  = 1
+};
+
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// GMMA PTX definitions:  C = (scaleA * A) * (scaleB * B) + (scaleD * C)
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x16 F16+=F16*F16 -- SS form: A and B are both passed as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x8x16_F16F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[2];  // accumulator registers; fma() updates them in place ("+r" operands)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %4, 0;\n"  // p = (scale_D != 0); %4 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n8k16.f16.f16.f16 "
+      "{%0, %1},"
+      " %2,"
+      " %3,"
+      " p,  %5, %6, %7, %8;\n"
+    "}\n"
+      : "+r"(d0), "+r"(d1)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // scale_D: runtime register ("r"); scaleA/scaleB/tnspA/tnspB: compile-time immediates ("n")
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x16 F16+=F16*F16 -- RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor (desc_b).
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x8x16_F16F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[2];  // accumulator registers; fma() updates them in place ("+r" operands)
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %7, 0;\n"  // p = (scale_D != 0); %7 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n8k16.f16.f16.f16 "
+      "{%0,  %1},"
+      "{%2,  %3,  %4,  %5},"
+      " %6,"
+      " p,   %8,  %9,  %10;\n"
+    "}\n"
+      : "+r"(d0), "+r"(d1)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // scale_D: runtime register ("r"); scaleA/scaleB/tnspB: compile-time immediates ("n"); tnspA is not passed
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x16 F16+=F16*F16 -- SS form: A and B are both passed as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x16x16_F16F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[4];  // accumulator registers; fma() updates them in place ("+r" operands)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %6, 0;\n"  // p = (scale_D != 0); %6 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n16k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3},"
+      " %4,"
+      " %5,"
+      " p,   %7,  %8,  %9,  %10;\n"
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // scale_D: runtime register ("r"); scaleA/scaleB/tnspA/tnspB: compile-time immediates ("n")
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x16 F16+=F16*F16 -- RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor (desc_b).
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x16x16_F16F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[4];  // accumulator registers; fma() updates them in place ("+r" operands)
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %9, 0;\n"  // p = (scale_D != 0); %9 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n16k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3},"
+      "{%4,  %5,  %6,  %7},"
+      " %8,"
+      " p,   %10, %11, %12;\n"
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // scale_D: runtime register ("r"); scaleA/scaleB/tnspB: compile-time immediates ("n"); tnspA is not passed
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x16 F16+=F16*F16 -- SS form: A and B are both passed as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x32x16_F16F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[8];  // accumulator registers; fma() updates them in place ("+r" operands)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %10, 0;\n"  // p = (scale_D != 0); %10 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n32k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
+      " %8,"
+      " %9,"
+      " p,   %11, %12, %13, %14;\n"
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // scale_D: runtime register ("r"); scaleA/scaleB/tnspA/tnspB: compile-time immediates ("n")
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x16 F16+=F16*F16 -- RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor (desc_b).
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x32x16_F16F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[8];  // accumulator registers; fma() updates them in place ("+r" operands)
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %13, 0;\n"  // p = (scale_D != 0); %13 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n32k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
+      "{%8,  %9,  %10, %11},"
+      " %12,"
+      " p,   %14, %15, %16;\n"
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // scale_D: runtime register ("r"); scaleA/scaleB/tnspB: compile-time immediates ("n"); tnspA is not passed
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x16 F16+=F16*F16 -- SS form: A and B are both passed as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x64x16_F16F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[16];  // accumulator registers; fma() updates them in place ("+r" operands)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %18, 0;\n"  // p = (scale_D != 0); %18 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n64k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      " %16,"
+      " %17,"
+      " p,   %19, %20, %21, %22;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // scale_D: runtime register ("r"); scaleA/scaleB/tnspA/tnspB: compile-time immediates ("n")
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x16 F16+=F16*F16 -- RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor (desc_b).
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x64x16_F16F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[16];  // accumulator registers; fma() updates them in place ("+r" operands)
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %21, 0;\n"  // p = (scale_D != 0); %21 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n64k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      "{%16, %17, %18, %19},"
+      " %20,"
+      " p,   %22, %23, %24;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // scale_D: runtime register ("r"); scaleA/scaleB/tnspB: compile-time immediates ("n"); tnspA is not passed
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x16 F16+=F16*F16 -- SS form: A and B are both passed as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x96x16_F16F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[24];  // accumulator registers; fma() updates them in place ("+r" operands)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %26, 0;\n"  // p = (scale_D != 0); %26 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n96k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      " %24,"
+      " %25,"
+      " p,   %27, %28, %29, %30;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // scale_D: runtime register ("r"); scaleA/scaleB/tnspA/tnspB: compile-time immediates ("n")
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x16 F16+=F16*F16 -- RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor (desc_b).
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x96x16_F16F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[24];  // accumulator registers; fma() updates them in place ("+r" operands)
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %29, 0;\n"  // p = (scale_D != 0); %29 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n96k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      "{%24, %25, %26, %27},"
+      " %28,"
+      " p,   %30, %31, %32;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // scale_D: runtime register ("r"); scaleA/scaleB/tnspB: compile-time immediates ("n"); tnspA is not passed
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x16 F16+=F16*F16 -- SS form: A and B are both passed as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x128x16_F16F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[32];  // accumulator registers; fma() updates them in place ("+r" operands)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %34, 0;\n"  // p = (scale_D != 0); %34 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n128k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      " %32,"
+      " %33,"
+      " p,   %35, %36, %37, %38;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // scale_D: runtime register ("r"); scaleA/scaleB/tnspA/tnspB: compile-time immediates ("n")
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x16 F16+=F16*F16 -- RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor (desc_b).
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x128x16_F16F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[32];  // accumulator registers; fma() updates them in place ("+r" operands)
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %37, 0;\n"  // p = (scale_D != 0); %37 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n128k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      "{%32, %33, %34, %35},"
+      " %36,"
+      " p,   %38, %39, %40;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // scale_D: runtime register ("r"); scaleA/scaleB/tnspB: compile-time immediates ("n"); tnspA is not passed
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x16 F16+=F16*F16 -- SS form: A and B are both passed as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x192x16_F16F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[48];  // accumulator registers; fma() updates them in place ("+r" operands)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %50, 0;\n"  // p = (scale_D != 0); %50 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n192k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"
+      " %48,"
+      " %49,"
+      " p,   %51, %52, %53, %54;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // scale_D: runtime register ("r"); scaleA/scaleB/tnspA/tnspB: compile-time immediates ("n")
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x16 F16+=F16*F16 (RS form: A in four 32-bit registers, B via a 64-bit descriptor); wraps wgmma.mma_async.sync.aligned.m64n192k16.f16.f16.f16
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x192x16_F16F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];   // A operand: a00..a03
+  using BRegisters = uint64_t[1];   // B operand: desc_b
+  using CRegisters = uint32_t[48];  // accumulators d00..d47, read and written in place
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma instruction; d00..d47 are both inputs and outputs
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %53, 0;\n"  // p = (scale_D != 0); %53 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n192k16.f16.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
+      "{%48,  %49,  %50,  %51},"  // a00..a03
+      " %52,"  // desc_b
+      " p,    %54,  %55,  %56;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0-%47: accumulators, read-write ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %48-%51
+         "l"(desc_b),  // %52
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %53 in a register; %54-%56 are compile-time immediates ("n")
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x16 F16+=F16*F16 (SS form: A and B each via a 64-bit descriptor); wraps wgmma.mma_async.sync.aligned.m64n256k16.f16.f16.f16
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x256x16_F16F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];   // A operand: desc_a
+  using BRegisters = uint64_t[1];   // B operand: desc_b
+  using CRegisters = uint32_t[64];  // accumulators d00..d63, read and written in place
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma instruction; d00..d63 are both inputs and outputs
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %66, 0;\n"  // p = (scale_D != 0); %66 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n256k16.f16.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      " %64,"  // desc_a
+      " %65,"  // desc_b
+      " p,    %67,  %68,  %69,  %70;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0-%63: accumulators, read-write ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "l"(desc_a),  // %64
+         "l"(desc_b),  // %65
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %66 in a register; %67-%70 are compile-time immediates ("n")
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x16 F16+=F16*F16 (RS form: A in four 32-bit registers, B via a 64-bit descriptor); wraps wgmma.mma_async.sync.aligned.m64n256k16.f16.f16.f16
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x256x16_F16F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];   // A operand: a00..a03
+  using BRegisters = uint64_t[1];   // B operand: desc_b
+  using CRegisters = uint32_t[64];  // accumulators d00..d63, read and written in place
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma instruction; d00..d63 are both inputs and outputs
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %69, 0;\n"  // p = (scale_D != 0); %69 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n256k16.f16.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      "{%64,  %65,  %66,  %67},"  // a00..a03
+      " %68,"  // desc_b
+      " p,    %70,  %71,  %72;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0-%63: accumulators, read-write ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %64-%67
+         "l"(desc_b),  // %68
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %69 in a register; %70-%72 are compile-time immediates ("n")
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x16 F32+=F16*F16 (SS form: A and B each via a 64-bit descriptor); wraps wgmma.mma_async.sync.aligned.m64n8k16.f32.f16.f16
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x8x16_F32F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[4];     // accumulators d0..d3, read and written in place
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma instruction; d0..d3 are both inputs and outputs
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %6, 0;\n"  // p = (scale_D != 0); %6 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n8k16.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3},"
+      " %4,"  // desc_a
+      " %5,"  // desc_b
+      " p,   %7,  %8,  %9,  %10;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)  // %0-%3: accumulators, read-write ("+f")
+      :  "l"(desc_a),  // %4
+         "l"(desc_b),  // %5
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %6 in a register; %7-%10 are compile-time immediates ("n")
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x16 F32+=F16*F16 (RS form: A in four 32-bit registers, B via a 64-bit descriptor); wraps wgmma.mma_async.sync.aligned.m64n8k16.f32.f16.f16
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x8x16_F32F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // A operand: a0..a3
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[4];     // accumulators d0..d3, read and written in place
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma instruction; d0..d3 are both inputs and outputs
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %9, 0;\n"  // p = (scale_D != 0); %9 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n8k16.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3},"
+      "{%4,  %5,  %6,  %7},"  // a0..a3
+      " %8,"  // desc_b
+      " p,   %10, %11, %12;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)  // %0-%3: accumulators, read-write ("+f")
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),  // %4-%7
+         "l"(desc_b),  // %8
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %9 in a register; %10-%12 are compile-time immediates ("n")
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x16 F32+=F16*F16 (SS form: A and B each via a 64-bit descriptor); wraps wgmma.mma_async.sync.aligned.m64n16k16.f32.f16.f16
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x16x16_F32F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[8];     // accumulators d0..d7, read and written in place
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma instruction; d0..d7 are both inputs and outputs
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      float         & d4, float         & d5, float         & d6, float         & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %10, 0;\n"  // p = (scale_D != 0); %10 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n16k16.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
+      " %8,"  // desc_a
+      " %9,"  // desc_b
+      " p,   %11, %12, %13, %14;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),  // %0-%7: accumulators, read-write ("+f")
+        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
+      :  "l"(desc_a),  // %8
+         "l"(desc_b),  // %9
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %10 in a register; %11-%14 are compile-time immediates ("n")
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x16 F32+=F16*F16 (RS form: A in four 32-bit registers, B via a 64-bit descriptor); wraps wgmma.mma_async.sync.aligned.m64n16k16.f32.f16.f16
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x16x16_F32F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // A operand: a0..a3
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[8];     // accumulators d0..d7, read and written in place
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma instruction; d0..d7 are both inputs and outputs
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      float         & d4, float         & d5, float         & d6, float         & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %13, 0;\n"  // p = (scale_D != 0); %13 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n16k16.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
+      "{%8,  %9,  %10, %11},"  // a0..a3
+      " %12,"  // desc_b
+      " p,   %14, %15, %16;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),  // %0-%7: accumulators, read-write ("+f")
+        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),  // %8-%11
+         "l"(desc_b),  // %12
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %13 in a register; %14-%16 are compile-time immediates ("n")
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x16 F32+=F16*F16 (SS form: A and B each via a 64-bit descriptor); wraps wgmma.mma_async.sync.aligned.m64n32k16.f32.f16.f16
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x32x16_F32F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[16];    // accumulators d00..d15, read and written in place
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma instruction; d00..d15 are both inputs and outputs
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %18, 0;\n"  // p = (scale_D != 0); %18 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n32k16.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      " %16,"  // desc_a
+      " %17,"  // desc_b
+      " p,   %19, %20, %21, %22;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0-%15: accumulators, read-write ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
+      :  "l"(desc_a),  // %16
+         "l"(desc_b),  // %17
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %18 in a register; %19-%22 are compile-time immediates ("n")
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x16 F32+=F16*F16 (RS form: A in four 32-bit registers, B via a 64-bit descriptor); wraps wgmma.mma_async.sync.aligned.m64n32k16.f32.f16.f16
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x32x16_F32F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // A operand: a00..a03
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[16];    // accumulators d00..d15, read and written in place
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma instruction; d00..d15 are both inputs and outputs
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %21, 0;\n"  // p = (scale_D != 0); %21 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n32k16.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      "{%16, %17, %18, %19},"  // a00..a03
+      " %20,"  // desc_b
+      " p,   %22, %23, %24;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0-%15: accumulators, read-write ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %16-%19
+         "l"(desc_b),  // %20
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %21 in a register; %22-%24 are compile-time immediates ("n")
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x16 F32+=F16*F16 (SS form: A and B each via a 64-bit descriptor); wraps wgmma.mma_async.sync.aligned.m64n64k16.f32.f16.f16
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x64x16_F32F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[32];    // accumulators d00..d31, read and written in place
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma instruction; d00..d31 are both inputs and outputs
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %34, 0;\n"  // p = (scale_D != 0); %34 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n64k16.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      " %32,"  // desc_a
+      " %33,"  // desc_b
+      " p,   %35, %36, %37, %38;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0-%31: accumulators, read-write ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
+      :  "l"(desc_a),  // %32
+         "l"(desc_b),  // %33
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %34 in a register; %35-%38 are compile-time immediates ("n")
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x16 F32+=F16*F16 (RS form: A in four 32-bit registers, B via a 64-bit descriptor); wraps wgmma.mma_async.sync.aligned.m64n64k16.f32.f16.f16
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x64x16_F32F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // A operand: a00..a03
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[32];    // accumulators d00..d31, read and written in place
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma instruction; d00..d31 are both inputs and outputs
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %37, 0;\n"  // p = (scale_D != 0); %37 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n64k16.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      "{%32, %33, %34, %35},"  // a00..a03
+      " %36,"  // desc_b
+      " p,   %38, %39, %40;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0-%31: accumulators, read-write ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %32-%35
+         "l"(desc_b),  // %36
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %37 in a register; %38-%40 are compile-time immediates ("n")
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x16 F32+=F16*F16 (SS form: A and B each via a 64-bit descriptor); wraps wgmma.mma_async.sync.aligned.m64n96k16.f32.f16.f16
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x96x16_F32F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[48];    // accumulators d00..d47, read and written in place
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma instruction; d00..d47 are both inputs and outputs
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %50, 0;\n"  // p = (scale_D != 0); %50 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n96k16.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"
+      " %48,"  // desc_a
+      " %49,"  // desc_b
+      " p,   %51, %52, %53, %54;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0-%47: accumulators, read-write ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
+      :  "l"(desc_a),  // %48
+         "l"(desc_b),  // %49
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %50 in a register; %51-%54 are compile-time immediates ("n")
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x16 F32+=F16*F16 (RS form: A in four 32-bit registers, B via a 64-bit descriptor); wraps wgmma.mma_async.sync.aligned.m64n96k16.f32.f16.f16
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x96x16_F32F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // A operand: a00..a03
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[48];    // accumulators d00..d47, read and written in place
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma instruction; d00..d47 are both inputs and outputs
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %53, 0;\n"  // p = (scale_D != 0); %53 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n96k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
+      "{%48,  %49,  %50,  %51},"  // a00..a03
+      " %52,"  // desc_b
+      " p,    %54,  %55,  %56;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0-%47: accumulators, read-write ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %48-%51
+         "l"(desc_b),  // %52
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %53 in a register; %54-%56 are compile-time immediates ("n")
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x16 F32+=F16*F16 (SS form: A and B are each given as one 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // layout tag for A; forwarded to the instruction as an immediate
+  GMMA::Major tnspB,  // layout tag for B; forwarded to the instruction as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as an immediate
+>
+struct MMA_64x128x16_F32F16F16_SS
+{
+  using DRegisters = void;         // no separate D operand type; the d* arguments of fma() are read-write
+  using ARegisters = uint64_t[1];  // matches desc_a of fma()
+  using BRegisters = uint64_t[1];  // matches desc_b of fma()
+  using CRegisters = float[64];    // matches d00..d63 of fma()
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async m64n128k16 that updates d00..d63 in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to set predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the call-site line and both descriptors
+    asm volatile(  // operands: %0-%63 = d00..d63, %64 = desc_a, %65 = desc_b, %66 = scale_D, %67-%70 = scaleA, scaleB, tnspA, tnspB
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the braces of this asm body
+      "setp.ne.b32 p, %66, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n128k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      " %64,"  // desc_a
+      " %65,"  // desc_b
+      " p,    %67,  %68,  %69,  %70;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": each accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
+      :  "l"(desc_a),  // input-only 64-bit operand
+         "l"(desc_b),  // input-only 64-bit operand
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // "n" operands are compile-time integer immediates
+#else  // macro not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x16 F32+=F16*F16 (RS form: A given as four 32-bit register values, B as one 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // layout tag for A; must be GMMA::Major::K here (see static_assert below)
+  GMMA::Major tnspB,  // layout tag for B; forwarded to the instruction as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as an immediate
+>
+struct MMA_64x128x16_F32F16F16_RS
+{
+  using DRegisters = void;         // no separate D operand type; the d* arguments of fma() are read-write
+  using ARegisters = uint32_t[4];  // matches a00..a03 of fma()
+  using BRegisters = uint64_t[1];  // matches desc_b of fma()
+  using CRegisters = float[64];    // matches d00..d63 of fma()
+
+  static_assert(tnspA == GMMA::Major::K,  // compile-time rejection of any other A layout tag
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async m64n128k16 that updates d00..d63 in place
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to set predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the call-site line and desc_b
+    asm volatile(  // operands: %0-%63 = d00..d63, %64-%67 = a00..a03, %68 = desc_b, %69 = scale_D, %70-%72 = scaleA, scaleB, tnspB
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the braces of this asm body
+      "setp.ne.b32 p, %69, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n128k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      "{%64,  %65,  %66,  %67},"  // a00..a03
+      " %68,"  // desc_b
+      " p,    %70,  %71,  %72;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": each accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input-only 32-bit operands
+         "l"(desc_b),  // input-only 64-bit operand
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // "n" operands are compile-time integer immediates
+#else  // macro not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x16 F32+=F16*F16 (SS form: A and B are each given as one 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // layout tag for A; forwarded to the instruction as an immediate
+  GMMA::Major tnspB,  // layout tag for B; forwarded to the instruction as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as an immediate
+>
+struct MMA_64x192x16_F32F16F16_SS
+{
+  using DRegisters = void;         // no separate D operand type; the d* arguments of fma() are read-write
+  using ARegisters = uint64_t[1];  // matches desc_a of fma()
+  using BRegisters = uint64_t[1];  // matches desc_b of fma()
+  using CRegisters = float[96];    // matches d00..d95 of fma()
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async m64n192k16 that updates d00..d95 in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      float         & d92, float         & d93, float         & d94, float         & d95,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to set predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the call-site line and both descriptors
+    asm volatile(  // operands: %0-%95 = d00..d95, %96 = desc_a, %97 = desc_b, %98 = scale_D, %99-%102 = scaleA, scaleB, tnspA, tnspB
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the braces of this asm body
+      "setp.ne.b32 p, %98, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n192k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      " %96,"  // desc_a
+      " %97,"  // desc_b
+      " p,    %99,  %100, %101, %102;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": each accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
+        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
+      :  "l"(desc_a),  // input-only 64-bit operand
+         "l"(desc_b),  // input-only 64-bit operand
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // "n" operands are compile-time integer immediates
+#else  // macro not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x16 F32+=F16*F16 (RS form: A given as four 32-bit register values, B as one 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // layout tag for A; must be GMMA::Major::K here (see static_assert below)
+  GMMA::Major tnspB,  // layout tag for B; forwarded to the instruction as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as an immediate
+>
+struct MMA_64x192x16_F32F16F16_RS
+{
+  using DRegisters = void;         // no separate D operand type; the d* arguments of fma() are read-write
+  using ARegisters = uint32_t[4];  // matches a00..a03 of fma()
+  using BRegisters = uint64_t[1];  // matches desc_b of fma()
+  using CRegisters = float[96];    // matches d00..d95 of fma()
+
+  static_assert(tnspA == GMMA::Major::K,  // compile-time rejection of any other A layout tag
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async m64n192k16 that updates d00..d95 in place
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      float         & d92, float         & d93, float         & d94, float         & d95,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to set predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the call-site line and desc_b
+    asm volatile(  // operands: %0-%95 = d00..d95, %96-%99 = a00..a03, %100 = desc_b, %101 = scale_D, %102-%104 = scaleA, scaleB, tnspB
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the braces of this asm body
+      "setp.ne.b32 p, %101, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n192k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      "{%96,  %97,  %98,  %99},"  // a00..a03
+      " %100,"  // desc_b
+      " p,    %102, %103, %104;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": each accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
+        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input-only 32-bit operands
+         "l"(desc_b),  // input-only 64-bit operand
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // "n" operands are compile-time integer immediates
+#else  // macro not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x16 F32+=F16*F16 (SS form: A and B are each given as one 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // layout tag for A; forwarded to the instruction as an immediate
+  GMMA::Major tnspB,  // layout tag for B; forwarded to the instruction as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as an immediate
+>
+struct MMA_64x256x16_F32F16F16_SS
+{
+  using DRegisters = void;         // no separate D operand type; the d* arguments of fma() are read-write
+  using ARegisters = uint64_t[1];  // matches desc_a of fma()
+  using BRegisters = uint64_t[1];  // matches desc_b of fma()
+  using CRegisters = float[128];   // matches d000..d127 of fma()
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async m64n256k16 that updates d000..d127 in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      float         & d124, float         & d125, float         & d126, float         & d127,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to set predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the call-site line and both descriptors
+    asm volatile(  // operands: %0-%127 = d000..d127, %128 = desc_a, %129 = desc_b, %130 = scale_D, %131-%134 = scaleA, scaleB, tnspA, tnspB
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the braces of this asm body
+      "setp.ne.b32 p, %130, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n256k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      " %128,"  // desc_a
+      " %129,"  // desc_b
+      " p,    %131, %132, %133, %134;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // "+f": each accumulator is both read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
+        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
+      :  "l"(desc_a),  // input-only 64-bit operand
+         "l"(desc_b),  // input-only 64-bit operand
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // "n" operands are compile-time integer immediates
+#else  // macro not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x16 F32+=F16*F16
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x256x16_F32F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[128];
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      float         & d124, float         & d125, float         & d126, float         & d127,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %133, 0;\n"
+      "wgmma.mma_async.sync.aligned.m64n256k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      "{%128, %129, %130, %131},"
+      " %132,"
+      " p,    %134, %135, %136;\n"
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
+        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x16 F32+=BF16*BF16 (SS form: fma() hands both A and B to wgmma as 64-bit descriptor operands)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x8x16_F32BF16BF16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value, same count as fma()'s desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value, same count as fma()'s desc_b
+  using CRegisters = float[4];     // 4 floats, same count as fma()'s d0..d3
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %6, 0;\n"  // p = (int32_t(scale_D) != 0); %6 is the only runtime "r" input
+      "wgmma.mma_async.sync.aligned.m64n8k16.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3},"  // d0..d3
+      " %4,"  // desc_a
+      " %5,"  // desc_b
+      " p,   %7,  %8,  %9,  %10;\n"  // p, then immediates scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)  // "+f": every accumulator is both an input and an output
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x16 F32+=BF16*BF16 (RS form: fma() passes A as four 32-bit registers and B as a 64-bit descriptor)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x8x16_F32BF16BF16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // 4 x 32-bit values, same count as fma()'s a0..a3
+  using BRegisters = uint64_t[1];  // one 64-bit value, same count as fma()'s desc_b
+  using CRegisters = float[4];     // 4 floats, same count as fma()'s d0..d3
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %9, 0;\n"  // p = (int32_t(scale_D) != 0); %9 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n8k16.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3},"  // d0..d3
+      "{%4,  %5,  %6,  %7},"  // a0..a3
+      " %8,"  // desc_b
+      " p,   %10, %11, %12;\n"  // p, then immediates scaleA, scaleB, tnspB (tnspA is not passed in this form)
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)  // "+f": every accumulator is both an input and an output
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x16 F32+=BF16*BF16 (SS form: fma() hands both A and B to wgmma as 64-bit descriptor operands)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x16x16_F32BF16BF16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value, same count as fma()'s desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value, same count as fma()'s desc_b
+  using CRegisters = float[8];     // 8 floats, same count as fma()'s d0..d7
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      float         & d4, float         & d5, float         & d6, float         & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %10, 0;\n"  // p = (int32_t(scale_D) != 0); %10 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n16k16.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // d0..d7
+      " %8,"  // desc_a
+      " %9,"  // desc_b
+      " p,   %11, %12, %13, %14;\n"  // p, then immediates scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),  // "+f": every accumulator is both an input and an output
+        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x16 F32+=BF16*BF16 (RS form: fma() passes A as four 32-bit registers and B as a 64-bit descriptor)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x16x16_F32BF16BF16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // 4 x 32-bit values, same count as fma()'s a0..a3
+  using BRegisters = uint64_t[1];  // one 64-bit value, same count as fma()'s desc_b
+  using CRegisters = float[8];     // 8 floats, same count as fma()'s d0..d7
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      float         & d4, float         & d5, float         & d6, float         & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %13, 0;\n"  // p = (int32_t(scale_D) != 0); %13 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n16k16.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // d0..d7
+      "{%8,  %9,  %10, %11},"  // a0..a3
+      " %12,"  // desc_b
+      " p,   %14, %15, %16;\n"  // p, then immediates scaleA, scaleB, tnspB (tnspA is not passed in this form)
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),  // "+f": every accumulator is both an input and an output
+        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x16 F32+=BF16*BF16 (SS form: fma() hands both A and B to wgmma as 64-bit descriptor operands)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x32x16_F32BF16BF16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value, same count as fma()'s desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value, same count as fma()'s desc_b
+  using CRegisters = float[16];    // 16 floats, same count as fma()'s d00..d15
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %18, 0;\n"  // p = (int32_t(scale_D) != 0); %18 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n32k16.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"  // %0..%15 are d00..d15
+      " %16,"  // desc_a
+      " %17,"  // desc_b
+      " p,   %19, %20, %21, %22;\n"  // p, then immediates scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both an input and an output
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x16 F32+=BF16*BF16 (RS form: fma() passes A as four 32-bit registers and B as a 64-bit descriptor)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x32x16_F32BF16BF16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // 4 x 32-bit values, same count as fma()'s a00..a03
+  using BRegisters = uint64_t[1];  // one 64-bit value, same count as fma()'s desc_b
+  using CRegisters = float[16];    // 16 floats, same count as fma()'s d00..d15
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %21, 0;\n"  // p = (int32_t(scale_D) != 0); %21 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n32k16.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"  // %0..%15 are d00..d15
+      "{%16, %17, %18, %19},"  // a00..a03
+      " %20,"  // desc_b
+      " p,   %22, %23, %24;\n"  // p, then immediates scaleA, scaleB, tnspB (tnspA is not passed in this form)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both an input and an output
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x16 F32+=BF16*BF16 (SS form: fma() hands both A and B to wgmma as 64-bit descriptor operands)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x64x16_F32BF16BF16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value, same count as fma()'s desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value, same count as fma()'s desc_b
+  using CRegisters = float[32];    // 32 floats, same count as fma()'s d00..d31
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %34, 0;\n"  // p = (int32_t(scale_D) != 0); %34 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"  // %0..%31 are d00..d31
+      " %32,"  // desc_a
+      " %33,"  // desc_b
+      " p,   %35, %36, %37, %38;\n"  // p, then immediates scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both an input and an output
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x16 F32+=BF16*BF16 (RS form: fma() passes A as four 32-bit registers and B as a 64-bit descriptor)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x64x16_F32BF16BF16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // 4 x 32-bit values, same count as fma()'s a00..a03
+  using BRegisters = uint64_t[1];  // one 64-bit value, same count as fma()'s desc_b
+  using CRegisters = float[32];    // 32 floats, same count as fma()'s d00..d31
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %37, 0;\n"  // p = (int32_t(scale_D) != 0); %37 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"  // %0..%31 are d00..d31
+      "{%32, %33, %34, %35},"  // a00..a03
+      " %36,"  // desc_b
+      " p,   %38, %39, %40;\n"  // p, then immediates scaleA, scaleB, tnspB (tnspA is not passed in this form)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both an input and an output
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x16 F32+=BF16*BF16 (SS form: fma() hands both A and B to wgmma as 64-bit descriptor operands)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x96x16_F32BF16BF16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value, same count as fma()'s desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value, same count as fma()'s desc_b
+  using CRegisters = float[48];    // 48 floats, same count as fma()'s d00..d47
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %50, 0;\n"  // p = (int32_t(scale_D) != 0); %50 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n96k16.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"  // %0..%47 are d00..d47
+      " %48,"  // desc_a
+      " %49,"  // desc_b
+      " p,   %51, %52, %53, %54;\n"  // p, then immediates scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both an input and an output
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x16 F32+=BF16*BF16 (RS form: fma() passes A as four 32-bit registers and B as a 64-bit descriptor)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x96x16_F32BF16BF16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // 4 x 32-bit values, same count as fma()'s a00..a03
+  using BRegisters = uint64_t[1];  // one 64-bit value, same count as fma()'s desc_b
+  using CRegisters = float[48];    // 48 floats, same count as fma()'s d00..d47
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %53, 0;\n"  // p = (int32_t(scale_D) != 0); %53 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n96k16.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"  // %0..%47 are d00..d47
+      "{%48,  %49,  %50,  %51},"  // a00..a03
+      " %52,"  // desc_b
+      " p,    %54,  %55,  %56;\n"  // p, then immediates scaleA, scaleB, tnspB (tnspA is not passed in this form)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both an input and an output
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x16 F32+=BF16*BF16 (SS form: fma() hands both A and B to wgmma as 64-bit descriptor operands)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x128x16_F32BF16BF16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value, same count as fma()'s desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value, same count as fma()'s desc_b
+  using CRegisters = float[64];    // 64 floats, same count as fma()'s d00..d63
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %66, 0;\n"  // p = (int32_t(scale_D) != 0); %66 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"  // %0..%63 are d00..d63
+      " %64,"  // desc_a
+      " %65,"  // desc_b
+      " p,    %67,  %68,  %69,  %70;\n"  // p, then immediates scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both an input and an output
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x16 F32+=BF16*BF16 (RS form: fma() takes A as four 32-bit values and B as one 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not forwarded to the asm
+  GMMA::Major tnspB,  // forwarded to the asm as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x128x16_F32BF16BF16_RS
+{
+  using DRegisters = void;         // no separate D fragment: fma() takes a single in/out accumulator list
+  using ARegisters = uint32_t[4];  // corresponds to a00..a03
+  using BRegisters = uint64_t[1];  // corresponds to desc_b
+  using CRegisters = float[64];    // corresponds to d00..d63
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+  // Emits one wgmma.mma_async; d00..d63 are in/out accumulators ("+f") and scale_D drives predicate p.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %69, 0;\n"  // p = (scale_D != 0); NOTE(review): presumably false means "overwrite D" -- confirm
+      "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      "{%64,  %65,  %66,  %67},"  // a00..a03
+      " %68,"  // desc_b
+      " p,    %70,  %71,  %72;\n"  // predicate, then immediates scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0-%63: accumulators, read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %64-%67
+         "l"(desc_b),  // %68
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %69-%72
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x16 F32+=BF16*BF16 (SS form: fma() takes both A and B as 64-bit descriptors)
+template <
+  GMMA::Major tnspA,  // forwarded to the asm as an immediate
+  GMMA::Major tnspB,  // forwarded to the asm as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x192x16_F32BF16BF16_SS
+{
+  using DRegisters = void;         // no separate D fragment: fma() takes a single in/out accumulator list
+  using ARegisters = uint64_t[1];  // corresponds to desc_a
+  using BRegisters = uint64_t[1];  // corresponds to desc_b
+  using CRegisters = float[96];    // corresponds to d00..d95
+  // Emits one wgmma.mma_async; d00..d95 are in/out accumulators ("+f") and scale_D drives predicate p.
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      float         & d92, float         & d93, float         & d94, float         & d95,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %98, 0;\n"  // p = (scale_D != 0); NOTE(review): presumably false means "overwrite D" -- confirm
+      "wgmma.mma_async.sync.aligned.m64n192k16.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      " %96,"  // desc_a
+      " %97,"  // desc_b
+      " p,    %99,  %100, %101, %102;\n"  // predicate, then immediates scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0-%95: accumulators, read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
+        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
+      :  "l"(desc_a),  // %96
+         "l"(desc_b),  // %97
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %98-%102
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x16 F32+=BF16*BF16 (RS form: fma() takes A as four 32-bit values and B as one 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not forwarded to the asm
+  GMMA::Major tnspB,  // forwarded to the asm as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x192x16_F32BF16BF16_RS
+{
+  using DRegisters = void;         // no separate D fragment: fma() takes a single in/out accumulator list
+  using ARegisters = uint32_t[4];  // corresponds to a00..a03
+  using BRegisters = uint64_t[1];  // corresponds to desc_b
+  using CRegisters = float[96];    // corresponds to d00..d95
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+  // Emits one wgmma.mma_async; d00..d95 are in/out accumulators ("+f") and scale_D drives predicate p.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      float         & d92, float         & d93, float         & d94, float         & d95,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %101, 0;\n"  // p = (scale_D != 0); NOTE(review): presumably false means "overwrite D" -- confirm
+      "wgmma.mma_async.sync.aligned.m64n192k16.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      "{%96,  %97,  %98,  %99},"  // a00..a03
+      " %100,"  // desc_b
+      " p,    %102, %103, %104;\n"  // predicate, then immediates scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0-%95: accumulators, read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
+        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %96-%99
+         "l"(desc_b),  // %100
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %101-%104
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x16 F32+=BF16*BF16 (SS form: fma() takes both A and B as 64-bit descriptors)
+template <
+  GMMA::Major tnspA,  // forwarded to the asm as an immediate
+  GMMA::Major tnspB,  // forwarded to the asm as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x256x16_F32BF16BF16_SS
+{
+  using DRegisters = void;         // no separate D fragment: fma() takes a single in/out accumulator list
+  using ARegisters = uint64_t[1];  // corresponds to desc_a
+  using BRegisters = uint64_t[1];  // corresponds to desc_b
+  using CRegisters = float[128];   // corresponds to d000..d127
+  // Emits one wgmma.mma_async; d000..d127 are in/out accumulators ("+f") and scale_D drives predicate p.
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      float         & d124, float         & d125, float         & d126, float         & d127,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %130, 0;\n"  // p = (scale_D != 0); NOTE(review): presumably false means "overwrite D" -- confirm
+      "wgmma.mma_async.sync.aligned.m64n256k16.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      " %128,"  // desc_a
+      " %129,"  // desc_b
+      " p,    %131, %132, %133, %134;\n"  // predicate, then immediates scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // %0-%127: accumulators, read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
+        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
+      :  "l"(desc_a),  // %128
+         "l"(desc_b),  // %129
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %130-%134
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x16 F32+=BF16*BF16 (RS form: fma() takes A as four 32-bit values and B as one 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not forwarded to the asm
+  GMMA::Major tnspB,  // forwarded to the asm as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x256x16_F32BF16BF16_RS
+{
+  using DRegisters = void;         // no separate D fragment: fma() takes a single in/out accumulator list
+  using ARegisters = uint32_t[4];  // corresponds to a000..a003
+  using BRegisters = uint64_t[1];  // corresponds to desc_b
+  using CRegisters = float[128];   // corresponds to d000..d127
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+  // Emits one wgmma.mma_async; d000..d127 are in/out accumulators ("+f") and scale_D drives predicate p.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      float         & d124, float         & d125, float         & d126, float         & d127,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %133, 0;\n"  // p = (scale_D != 0); NOTE(review): presumably false means "overwrite D" -- confirm
+      "wgmma.mma_async.sync.aligned.m64n256k16.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      "{%128, %129, %130, %131},"  // a000..a003
+      " %132,"  // desc_b
+      " p,    %134, %135, %136;\n"  // predicate, then immediates scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // %0-%127: accumulators, read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
+        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %128-%131
+         "l"(desc_b),  // %132
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %133-%136
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x8 TN F32+=TF32*TF32 (SS form: fma() takes both A and B as 64-bit descriptors; no layout template parameters)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x8x8_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;         // no separate D fragment: fma() takes a single in/out accumulator list
+  using ARegisters = uint64_t[1];  // corresponds to desc_a
+  using BRegisters = uint64_t[1];  // corresponds to desc_b
+  using CRegisters = float[4];     // corresponds to d0..d3
+  // Emits one wgmma.mma_async; d0..d3 are in/out accumulators ("+f") and scale_D drives predicate p.
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %6, 0;\n"  // p = (scale_D != 0); NOTE(review): presumably false means "overwrite D" -- confirm
+      "wgmma.mma_async.sync.aligned.m64n8k8.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3},"
+      " %4,"  // desc_a
+      " %5,"  // desc_b
+      " p,   %7,  %8;\n"  // predicate, then immediates scaleA, scaleB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)  // %0-%3: accumulators, read and written
+      :  "l"(desc_a),  // %4
+         "l"(desc_b),  // %5
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %6-%8
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x8 TN F32+=TF32*TF32 (RS form: fma() takes A as four 32-bit values and B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x8x8_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;         // no separate D fragment: fma() takes a single in/out accumulator list
+  using ARegisters = uint32_t[4];  // corresponds to a0..a3
+  using BRegisters = uint64_t[1];  // corresponds to desc_b
+  using CRegisters = float[4];     // corresponds to d0..d3
+  // Emits one wgmma.mma_async; d0..d3 are in/out accumulators ("+f") and scale_D drives predicate p.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %9, 0;\n"  // p = (scale_D != 0); NOTE(review): presumably false means "overwrite D" -- confirm
+      "wgmma.mma_async.sync.aligned.m64n8k8.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3},"
+      "{%4,  %5,  %6,  %7},"  // a0..a3
+      " %8,"  // desc_b
+      " p,   %10, %11;\n"  // predicate, then immediates scaleA, scaleB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)  // %0-%3: accumulators, read and written
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),  // %4-%7
+         "l"(desc_b),  // %8
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %9-%11
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x8 TN F32+=TF32*TF32 (SS form: fma() takes both A and B as 64-bit descriptors; no layout template parameters)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x16x8_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;         // no separate D fragment: fma() takes a single in/out accumulator list
+  using ARegisters = uint64_t[1];  // corresponds to desc_a
+  using BRegisters = uint64_t[1];  // corresponds to desc_b
+  using CRegisters = float[8];     // corresponds to d0..d7
+  // Emits one wgmma.mma_async; d0..d7 are in/out accumulators ("+f") and scale_D drives predicate p.
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      float         & d4, float         & d5, float         & d6, float         & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %10, 0;\n"  // p = (scale_D != 0); NOTE(review): presumably false means "overwrite D" -- confirm
+      "wgmma.mma_async.sync.aligned.m64n16k8.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
+      " %8,"  // desc_a
+      " %9,"  // desc_b
+      " p,   %11, %12;\n"  // predicate, then immediates scaleA, scaleB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),  // %0-%7: accumulators, read and written
+        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
+      :  "l"(desc_a),  // %8
+         "l"(desc_b),  // %9
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %10-%12
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x8 TN F32+=TF32*TF32 (RS variant: A is passed to fma() as four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; handed to the asm as an immediate (n constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value; handed to the asm as an immediate (n constraint)
+>
+struct MMA_64x16x8_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably void means D reuses the C accumulators -- confirm
+  using ARegisters = uint32_t[4];  // four 32-bit words; fma() receives them as a0..a3
+  using BRegisters = uint64_t[1];  // one 64-bit word; fma() receives it as desc_b
+  using CRegisters = float[8];  // eight floats; fma() receives them as d0..d7
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma.mma_async m64n16k8 instruction
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      float         & d4, float         & d5, float         & d6, float         & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate register
+      "setp.ne.b32 p, %13, 0;\n"  // p = (scale_D != 0); %13 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n16k8.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // %0-%7: d0..d7, read-write (+f)
+      "{%8,  %9,  %10, %11},"  // %8-%11: a0..a3
+      " %12,"  // %12: desc_b
+      " p,   %14, %15;\n"  // predicate p, then immediates scaleA (%14) and scaleB (%15)
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
+        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x8 TN F32+=TF32*TF32 (SS variant: A and B are each passed to fma() as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; handed to the asm as an immediate (n constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value; handed to the asm as an immediate (n constraint)
+>
+struct MMA_64x32x8_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably void means D reuses the C accumulators -- confirm
+  using ARegisters = uint64_t[1];  // one 64-bit word; fma() receives it as desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit word; fma() receives it as desc_b
+  using CRegisters = float[16];  // sixteen floats; fma() receives them as d00..d15
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma.mma_async m64n32k8 instruction
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate register
+      "setp.ne.b32 p, %18, 0;\n"  // p = (scale_D != 0); %18 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n32k8.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"  // %0-%15: d00..d15, read-write (+f)
+      " %16,"  // %16: desc_a
+      " %17,"  // %17: desc_b
+      " p,   %19, %20;\n"  // predicate p, then immediates scaleA (%19) and scaleB (%20)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x8 TN F32+=TF32*TF32 (RS variant: A is passed to fma() as four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; handed to the asm as an immediate (n constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value; handed to the asm as an immediate (n constraint)
+>
+struct MMA_64x32x8_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably void means D reuses the C accumulators -- confirm
+  using ARegisters = uint32_t[4];  // four 32-bit words; fma() receives them as a00..a03
+  using BRegisters = uint64_t[1];  // one 64-bit word; fma() receives it as desc_b
+  using CRegisters = float[16];  // sixteen floats; fma() receives them as d00..d15
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma.mma_async m64n32k8 instruction
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate register
+      "setp.ne.b32 p, %21, 0;\n"  // p = (scale_D != 0); %21 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n32k8.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"  // %0-%15: d00..d15, read-write (+f)
+      "{%16, %17, %18, %19},"  // %16-%19: a00..a03
+      " %20,"  // %20: desc_b
+      " p,   %22, %23;\n"  // predicate p, then immediates scaleA (%22) and scaleB (%23)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x8 TN F32+=TF32*TF32 (SS variant: A and B are each passed to fma() as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; handed to the asm as an immediate (n constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value; handed to the asm as an immediate (n constraint)
+>
+struct MMA_64x64x8_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably void means D reuses the C accumulators -- confirm
+  using ARegisters = uint64_t[1];  // one 64-bit word; fma() receives it as desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit word; fma() receives it as desc_b
+  using CRegisters = float[32];  // 32 floats; fma() receives them as d00..d31
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma.mma_async m64n64k8 instruction
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate register
+      "setp.ne.b32 p, %34, 0;\n"  // p = (scale_D != 0); %34 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n64k8.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"  // %0-%31: d00..d31, read-write (+f)
+      " %32,"  // %32: desc_a
+      " %33,"  // %33: desc_b
+      " p,   %35, %36;\n"  // predicate p, then immediates scaleA (%35) and scaleB (%36)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x8 TN F32+=TF32*TF32 (RS variant: A is passed to fma() as four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; handed to the asm as an immediate (n constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value; handed to the asm as an immediate (n constraint)
+>
+struct MMA_64x64x8_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably void means D reuses the C accumulators -- confirm
+  using ARegisters = uint32_t[4];  // four 32-bit words; fma() receives them as a00..a03
+  using BRegisters = uint64_t[1];  // one 64-bit word; fma() receives it as desc_b
+  using CRegisters = float[32];  // 32 floats; fma() receives them as d00..d31
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma.mma_async m64n64k8 instruction
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate register
+      "setp.ne.b32 p, %37, 0;\n"  // p = (scale_D != 0); %37 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n64k8.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"  // %0-%31: d00..d31, read-write (+f)
+      "{%32, %33, %34, %35},"  // %32-%35: a00..a03
+      " %36,"  // %36: desc_b
+      " p,   %38, %39;\n"  // predicate p, then immediates scaleA (%38) and scaleB (%39)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x8 TN F32+=TF32*TF32 (SS variant: A and B are each passed to fma() as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; handed to the asm as an immediate (n constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value; handed to the asm as an immediate (n constraint)
+>
+struct MMA_64x96x8_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably void means D reuses the C accumulators -- confirm
+  using ARegisters = uint64_t[1];  // one 64-bit word; fma() receives it as desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit word; fma() receives it as desc_b
+  using CRegisters = float[48];  // 48 floats; fma() receives them as d00..d47
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma.mma_async m64n96k8 instruction
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate register
+      "setp.ne.b32 p, %50, 0;\n"  // p = (scale_D != 0); %50 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n96k8.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"  // %0-%47: d00..d47, read-write (+f)
+      " %48,"  // %48: desc_a
+      " %49,"  // %49: desc_b
+      " p,   %51, %52;\n"  // predicate p, then immediates scaleA (%51) and scaleB (%52)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x8 TN F32+=TF32*TF32 (RS variant: A is passed to fma() as four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; handed to the asm as an immediate (n constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value; handed to the asm as an immediate (n constraint)
+>
+struct MMA_64x96x8_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably void means D reuses the C accumulators -- confirm
+  using ARegisters = uint32_t[4];  // four 32-bit words; fma() receives them as a00..a03
+  using BRegisters = uint64_t[1];  // one 64-bit word; fma() receives it as desc_b
+  using CRegisters = float[48];  // 48 floats; fma() receives them as d00..d47
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma.mma_async m64n96k8 instruction
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate register
+      "setp.ne.b32 p, %53, 0;\n"  // p = (scale_D != 0); %53 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n96k8.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"  // %0-%47: d00..d47, read-write (+f)
+      "{%48,  %49,  %50,  %51},"  // %48-%51: a00..a03
+      " %52,"  // %52: desc_b
+      " p,    %54,  %55;\n"  // predicate p, then immediates scaleA (%54) and scaleB (%55)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x8 TN F32+=TF32*TF32 (SS variant: A and B are each passed to fma() as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; handed to the asm as an immediate (n constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value; handed to the asm as an immediate (n constraint)
+>
+struct MMA_64x128x8_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably void means D reuses the C accumulators -- confirm
+  using ARegisters = uint64_t[1];  // one 64-bit word; fma() receives it as desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit word; fma() receives it as desc_b
+  using CRegisters = float[64];  // 64 floats; fma() receives them as d00..d63
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma.mma_async m64n128k8 instruction
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate register
+      "setp.ne.b32 p, %66, 0;\n"  // p = (scale_D != 0); %66 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n128k8.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"  // %0-%63: d00..d63, read-write (+f)
+      " %64,"  // %64: desc_a
+      " %65,"  // %65: desc_b
+      " p,    %67,  %68;\n"  // predicate p, then immediates scaleA (%67) and scaleB (%68)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x8 TN F32+=TF32*TF32 (RS variant: A is passed to fma() as four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; handed to the asm as an immediate (n constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value; handed to the asm as an immediate (n constraint)
+>
+struct MMA_64x128x8_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably void means D reuses the C accumulators -- confirm
+  using ARegisters = uint32_t[4];  // four 32-bit words; fma() receives them as a00..a03
+  using BRegisters = uint64_t[1];  // one 64-bit word; fma() receives it as desc_b
+  using CRegisters = float[64];  // 64 floats; fma() receives them as d00..d63
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma.mma_async m64n128k8 instruction
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate register
+      "setp.ne.b32 p, %69, 0;\n"  // p = (scale_D != 0); %69 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n128k8.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"  // %0-%63: d00..d63, read-write (+f)
+      "{%64,  %65,  %66,  %67},"  // %64-%67: a00..a03
+      " %68,"  // %68: desc_b
+      " p,    %70,  %71;\n"  // predicate p, then immediates scaleA (%70) and scaleB (%71)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x8 TN F32+=TF32*TF32 (SS variant: A and B are each passed to fma() as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; handed to the asm as an immediate (n constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value; handed to the asm as an immediate (n constraint)
+>
+struct MMA_64x192x8_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably void means D reuses the C accumulators -- confirm
+  using ARegisters = uint64_t[1];  // one 64-bit word; fma() receives it as desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit word; fma() receives it as desc_b
+  using CRegisters = float[96];  // 96 floats; fma() receives them as d00..d95
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma.mma_async m64n192k8 instruction
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      float         & d92, float         & d93, float         & d94, float         & d95,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate register
+      "setp.ne.b32 p, %98, 0;\n"  // p = (scale_D != 0); %98 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n192k8.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"  // %0-%95: d00..d95, read-write (+f)
+      " %96,"  // %96: desc_a
+      " %97,"  // %97: desc_b
+      " p,    %99,  %100;\n"  // predicate p, then immediates scaleA (%99) and scaleB (%100)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
+        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x8 TN F32+=TF32*TF32 (RS form: A in four 32-bit registers a00..a03, B via the 64-bit descriptor desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x192x8_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators below are "+f" read-write operands
+  using ARegisters = uint32_t[4];  // matches a00..a03
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = float[96];  // matches d00..d95
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      float         & d92, float         & d93, float         & d94, float         & d95,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %101, 0;\n"  // p = (scale_D != 0); %101 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n192k8.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%95: accumulators d00..d95
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      "{%96,  %97,  %98,  %99},"  // a00..a03
+      " %100,"  // desc_b
+      " p,    %102, %103;\n"  // predicate p, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
+        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D is a runtime register; scaleA/scaleB are compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x8 TN F32+=TF32*TF32 (SS form: A and B both via 64-bit descriptors desc_a / desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x256x8_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators below are "+f" read-write operands
+  using ARegisters = uint64_t[1];  // matches desc_a
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = float[128];  // matches d000..d127
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      float         & d124, float         & d125, float         & d126, float         & d127,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %130, 0;\n"  // p = (scale_D != 0); %130 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n256k8.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%127: accumulators d000..d127
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      " %128,"  // desc_a
+      " %129,"  // desc_b
+      " p,    %131, %132;\n"  // predicate p, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
+        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D is a runtime register; scaleA/scaleB are compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x8 TN F32+=TF32*TF32 (RS form: A in four 32-bit registers a000..a003, B via the 64-bit descriptor desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x256x8_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators below are "+f" read-write operands
+  using ARegisters = uint32_t[4];  // matches a000..a003
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = float[128];  // matches d000..d127
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      float         & d124, float         & d125, float         & d126, float         & d127,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %133, 0;\n"  // p = (scale_D != 0); %133 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n256k8.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%127: accumulators d000..d127
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      "{%128, %129, %130, %131},"  // a000..a003
+      " %132,"  // desc_b
+      " p,    %134, %135;\n"  // predicate p, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
+        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D is a runtime register; scaleA/scaleB are compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x32 TN S32+=S8*S8 (SS form: A and B both via 64-bit descriptors desc_a / desc_b; no .satfinite)
+struct MMA_64x8x32_S32S8S8_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators below are "+r" read-write operands
+  using ARegisters = uint64_t[1];  // matches desc_a
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = uint32_t[4];  // matches d0..d3; the .s32 accumulators are carried as uint32_t
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %6, 0;\n"  // p = (scale_D != 0); %6 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.s8 "
+      "{%0,  %1,  %2,  %3},"  // accumulators d0..d3
+      " %4,"  // desc_a
+      " %5,"  // desc_b
+      " p;\n"  // only the predicate follows; this form passes no scaleA/scaleB operands
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x32 TN S32+=S8*S8, .satfinite variant (SS form: A and B both via 64-bit descriptors desc_a / desc_b)
+struct MMA_64x8x32_S32S8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D operand: the accumulators below are "+r" read-write operands
+  using ARegisters = uint64_t[1];  // matches desc_a
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = uint32_t[4];  // matches d0..d3; the .s32 accumulators are carried as uint32_t
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %6, 0;\n"  // p = (scale_D != 0); %6 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.s8.satfinite "  // the .satfinite suffix is the only difference from the non-SATURATE atom
+      "{%0,  %1,  %2,  %3},"  // accumulators d0..d3
+      " %4,"  // desc_a
+      " %5,"  // desc_b
+      " p;\n"  // only the predicate follows; this form passes no scaleA/scaleB operands
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x32 TN S32+=S8*S8 (SS form: A and B both via 64-bit descriptors desc_a / desc_b; no .satfinite)
+struct MMA_64x16x32_S32S8S8_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators below are "+r" read-write operands
+  using ARegisters = uint64_t[1];  // matches desc_a
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = uint32_t[8];  // matches d0..d7; the .s32 accumulators are carried as uint32_t
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %10, 0;\n"  // p = (scale_D != 0); %10 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n16k32.s32.s8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // accumulators d0..d7
+      " %8,"  // desc_a
+      " %9,"  // desc_b
+      " p;\n"  // only the predicate follows; this form passes no scaleA/scaleB operands
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x32 TN S32+=S8*S8, .satfinite variant (SS form: A and B both via 64-bit descriptors desc_a / desc_b)
+struct MMA_64x16x32_S32S8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D operand: the accumulators below are "+r" read-write operands
+  using ARegisters = uint64_t[1];  // matches desc_a
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = uint32_t[8];  // matches d0..d7; the .s32 accumulators are carried as uint32_t
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %10, 0;\n"  // p = (scale_D != 0); %10 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n16k32.s32.s8.s8.satfinite "  // the .satfinite suffix is the only difference from the non-SATURATE atom
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // accumulators d0..d7
+      " %8,"  // desc_a
+      " %9,"  // desc_b
+      " p;\n"  // only the predicate follows; this form passes no scaleA/scaleB operands
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x32 TN S32+=S8*S8 (SS form: A and B both via 64-bit descriptors desc_a / desc_b; no .satfinite)
+struct MMA_64x32x32_S32S8S8_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators below are "+r" read-write operands
+  using ARegisters = uint64_t[1];  // matches desc_a
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = uint32_t[16];  // matches d00..d15; the .s32 accumulators are carried as uint32_t
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %18, 0;\n"  // p = (scale_D != 0); %18 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n32k32.s32.s8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%15: accumulators d00..d15
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      " %16,"  // desc_a
+      " %17,"  // desc_b
+      " p;\n"  // only the predicate follows; this form passes no scaleA/scaleB operands
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x32 TN S32+=S8*S8, .satfinite variant (SS form: A and B both via 64-bit descriptors desc_a / desc_b)
+struct MMA_64x32x32_S32S8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D operand: the accumulators below are "+r" read-write operands
+  using ARegisters = uint64_t[1];  // matches desc_a
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = uint32_t[16];  // matches d00..d15; the .s32 accumulators are carried as uint32_t
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %18, 0;\n"  // p = (scale_D != 0); %18 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n32k32.s32.s8.s8.satfinite "  // the .satfinite suffix is the only difference from the non-SATURATE atom
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%15: accumulators d00..d15
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      " %16,"  // desc_a
+      " %17,"  // desc_b
+      " p;\n"  // only the predicate follows; this form passes no scaleA/scaleB operands
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x32 TN S32+=S8*S8 (SS form: A and B both via 64-bit descriptors desc_a / desc_b; no .satfinite)
+struct MMA_64x64x32_S32S8S8_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators below are "+r" read-write operands
+  using ARegisters = uint64_t[1];  // matches desc_a
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = uint32_t[32];  // matches d00..d31; the .s32 accumulators are carried as uint32_t
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %34, 0;\n"  // p = (scale_D != 0); %34 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n64k32.s32.s8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%31: accumulators d00..d31
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      " %32,"  // desc_a
+      " %33,"  // desc_b
+      " p;\n"  // only the predicate follows; this form passes no scaleA/scaleB operands
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x32 TN S32+=S8*S8, .satfinite variant (SS form: A and B both via 64-bit descriptors desc_a / desc_b)
+struct MMA_64x64x32_S32S8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D operand: the accumulators below are "+r" read-write operands
+  using ARegisters = uint64_t[1];  // matches desc_a
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = uint32_t[32];  // matches d00..d31; the .s32 accumulators are carried as uint32_t
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %34, 0;\n"  // p = (scale_D != 0); %34 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n64k32.s32.s8.s8.satfinite "  // the .satfinite suffix is the only difference from the non-SATURATE atom
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%31: accumulators d00..d31
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      " %32,"  // desc_a
+      " %33,"  // desc_b
+      " p;\n"  // only the predicate follows; this form passes no scaleA/scaleB operands
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x32 TN S32+=S8*S8 -- wraps PTX wgmma.mma_async m64n96k32.s32.s8.s8 (no .satfinite); A and B are passed as 64-bit descriptors, the 48 accumulators as 32-bit registers.
+struct MMA_64x96x32_S32S8S8_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: results come back through the read-write C registers
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for A
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B
+  using CRegisters = uint32_t[48];  // 48 32-bit accumulator registers (d00..d47 below)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared against 0 inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the call-site line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %50, 0;\n"  // p = (scale_D != 0); %50 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n96k32.s32.s8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"
+      " %48,"  // desc_a
+      " %49,"  // desc_b
+      " p;\n"  // predicate from scale_D; presumably false drops the prior accumulator -- confirm in the PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%47: read-write 32-bit register operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "l"(desc_a),  // %48: 64-bit input
+         "l"(desc_b),  // %49: 64-bit input
+         "r"(int32_t(scale_D)));  // %50: 32-bit input
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // only compiled when the SM90A MMA path is disabled
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x32 TN S32+=S8*S8, SATURATE variant -- wraps PTX wgmma.mma_async m64n96k32.s32.s8.s8 with the .satfinite qualifier (presumably clamps the S32 result on overflow -- confirm in the PTX ISA).
+struct MMA_64x96x32_S32S8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D operand: results come back through the read-write C registers
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for A
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B
+  using CRegisters = uint32_t[48];  // 48 32-bit accumulator registers (d00..d47 below)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared against 0 inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the call-site line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %50, 0;\n"  // p = (scale_D != 0); %50 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n96k32.s32.s8.s8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"
+      " %48,"  // desc_a
+      " %49,"  // desc_b
+      " p;\n"  // predicate from scale_D; presumably false drops the prior accumulator -- confirm in the PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%47: read-write 32-bit register operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "l"(desc_a),  // %48: 64-bit input
+         "l"(desc_b),  // %49: 64-bit input
+         "r"(int32_t(scale_D)));  // %50: 32-bit input
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // only compiled when the SM90A MMA path is disabled
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x32 TN S32+=S8*S8 -- wraps PTX wgmma.mma_async m64n128k32.s32.s8.s8 (no .satfinite); A and B are passed as 64-bit descriptors, the 64 accumulators as 32-bit registers.
+struct MMA_64x128x32_S32S8S8_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: results come back through the read-write C registers
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for A
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B
+  using CRegisters = uint32_t[64];  // 64 32-bit accumulator registers (d00..d63 below)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared against 0 inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the call-site line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %66, 0;\n"  // p = (scale_D != 0); %66 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n128k32.s32.s8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      " %64,"  // desc_a
+      " %65,"  // desc_b
+      " p;\n"  // predicate from scale_D; presumably false drops the prior accumulator -- confirm in the PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%63: read-write 32-bit register operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "l"(desc_a),  // %64: 64-bit input
+         "l"(desc_b),  // %65: 64-bit input
+         "r"(int32_t(scale_D)));  // %66: 32-bit input
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // only compiled when the SM90A MMA path is disabled
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x32 TN S32+=S8*S8, SATURATE variant -- wraps PTX wgmma.mma_async m64n128k32.s32.s8.s8 with the .satfinite qualifier (presumably clamps the S32 result on overflow -- confirm in the PTX ISA).
+struct MMA_64x128x32_S32S8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D operand: results come back through the read-write C registers
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for A
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B
+  using CRegisters = uint32_t[64];  // 64 32-bit accumulator registers (d00..d63 below)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared against 0 inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the call-site line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %66, 0;\n"  // p = (scale_D != 0); %66 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n128k32.s32.s8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      " %64,"  // desc_a
+      " %65,"  // desc_b
+      " p;\n"  // predicate from scale_D; presumably false drops the prior accumulator -- confirm in the PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%63: read-write 32-bit register operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "l"(desc_a),  // %64: 64-bit input
+         "l"(desc_b),  // %65: 64-bit input
+         "r"(int32_t(scale_D)));  // %66: 32-bit input
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // only compiled when the SM90A MMA path is disabled
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x32 TN S32+=S8*S8 -- wraps PTX wgmma.mma_async m64n192k32.s32.s8.s8 (no .satfinite); A and B are passed as 64-bit descriptors, the 96 accumulators as 32-bit registers.
+struct MMA_64x192x32_S32S8S8_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: results come back through the read-write C registers
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for A
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B
+  using CRegisters = uint32_t[96];  // 96 32-bit accumulator registers (d00..d95 below)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
+      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared against 0 inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the call-site line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %98, 0;\n"  // p = (scale_D != 0); %98 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n192k32.s32.s8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      " %96,"  // desc_a
+      " %97,"  // desc_b
+      " p;\n"  // predicate from scale_D; presumably false drops the prior accumulator -- confirm in the PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%95: read-write 32-bit register operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
+        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
+        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
+      :  "l"(desc_a),  // %96: 64-bit input
+         "l"(desc_b),  // %97: 64-bit input
+         "r"(int32_t(scale_D)));  // %98: 32-bit input
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // only compiled when the SM90A MMA path is disabled
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x32 TN S32+=S8*S8, SATURATE variant -- wraps PTX wgmma.mma_async m64n192k32.s32.s8.s8 with the .satfinite qualifier (presumably clamps the S32 result on overflow -- confirm in the PTX ISA).
+struct MMA_64x192x32_S32S8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D operand: results come back through the read-write C registers
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for A
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B
+  using CRegisters = uint32_t[96];  // 96 32-bit accumulator registers (d00..d95 below)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
+      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared against 0 inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the call-site line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %98, 0;\n"  // p = (scale_D != 0); %98 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n192k32.s32.s8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      " %96,"  // desc_a
+      " %97,"  // desc_b
+      " p;\n"  // predicate from scale_D; presumably false drops the prior accumulator -- confirm in the PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%95: read-write 32-bit register operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
+        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
+        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
+      :  "l"(desc_a),  // %96: 64-bit input
+         "l"(desc_b),  // %97: 64-bit input
+         "r"(int32_t(scale_D)));  // %98: 32-bit input
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // only compiled when the SM90A MMA path is disabled
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x32 TN S32+=S8*S8 -- wraps PTX wgmma.mma_async m64n256k32.s32.s8.s8 (no .satfinite); A and B are passed as 64-bit descriptors, the 128 accumulators as 32-bit registers.
+struct MMA_64x256x32_S32S8S8_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: results come back through the read-write C registers
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for A
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B
+  using CRegisters = uint32_t[128];  // 128 32-bit accumulator registers (d000..d127 below)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
+      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared against 0 inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the call-site line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %130, 0;\n"  // p = (scale_D != 0); %130 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n256k32.s32.s8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      " %128,"  // desc_a
+      " %129,"  // desc_b
+      " p;\n"  // predicate from scale_D; presumably false drops the prior accumulator -- confirm in the PTX ISA
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // %0..%127: read-write 32-bit register operands
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
+        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
+        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
+      :  "l"(desc_a),  // %128: 64-bit input
+         "l"(desc_b),  // %129: 64-bit input
+         "r"(int32_t(scale_D)));  // %130: 32-bit input
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // only compiled when the SM90A MMA path is disabled
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x32 TN S32+=S8*S8 -- SS form (A and B both passed as 64-bit descriptors), .satfinite variant
+struct MMA_64x256x32_S32S8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[128];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
+      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %130, 0;\n"  // p = (scale_D != 0); %130 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n256k32.s32.s8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      " %128,"
+      " %129,"
+      " p;\n"  // predicate p is the final operand (presumably PTX scale-d: accumulate into D vs. overwrite -- confirm in PTX ISA)
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // "+r": each d register is both read and written
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
+        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
+        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x32 TN S32+=S8*S8 -- RS form (A in four 32-bit registers, B passed as a 64-bit descriptor)
+struct MMA_64x8x32_S32S8S8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[4];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %9, 0;\n"  // p = (scale_D != 0); %9 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.s8 "
+      "{%0,  %1,  %2,  %3},"
+      "{%4,  %5,  %6,  %7},"
+      " %8,"
+      " p;\n"  // predicate p is the final operand (presumably PTX scale-d: accumulate into D vs. overwrite -- confirm in PTX ISA)
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)  // "+r": each d register is both read and written
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x32 TN S32+=S8*S8 -- RS form (A in four 32-bit registers, B passed as a 64-bit descriptor), .satfinite variant
+struct MMA_64x8x32_S32S8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[4];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %9, 0;\n"  // p = (scale_D != 0); %9 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.s8.satfinite "
+      "{%0,  %1,  %2,  %3},"
+      "{%4,  %5,  %6,  %7},"
+      " %8,"
+      " p;\n"  // predicate p is the final operand (presumably PTX scale-d: accumulate into D vs. overwrite -- confirm in PTX ISA)
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)  // "+r": each d register is both read and written
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x32 TN S32+=S8*S8 -- RS form (A in four 32-bit registers, B passed as a 64-bit descriptor)
+struct MMA_64x16x32_S32S8S8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[8];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %13, 0;\n"  // p = (scale_D != 0); %13 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n16k32.s32.s8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
+      "{%8,  %9,  %10, %11},"
+      " %12,"
+      " p;\n"  // predicate p is the final operand (presumably PTX scale-d: accumulate into D vs. overwrite -- confirm in PTX ISA)
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),  // "+r": each d register is both read and written
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x32 TN S32+=S8*S8 -- RS form (A in four 32-bit registers, B passed as a 64-bit descriptor), .satfinite variant
+struct MMA_64x16x32_S32S8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[8];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %13, 0;\n"  // p = (scale_D != 0); %13 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n16k32.s32.s8.s8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
+      "{%8,  %9,  %10, %11},"
+      " %12,"
+      " p;\n"  // predicate p is the final operand (presumably PTX scale-d: accumulate into D vs. overwrite -- confirm in PTX ISA)
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),  // "+r": each d register is both read and written
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x32 TN S32+=S8*S8 -- RS form (A in four 32-bit registers, B passed as a 64-bit descriptor)
+struct MMA_64x32x32_S32S8S8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[16];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %21, 0;\n"  // p = (scale_D != 0); %21 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n32k32.s32.s8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      "{%16, %17, %18, %19},"
+      " %20,"
+      " p;\n"  // predicate p is the final operand (presumably PTX scale-d: accumulate into D vs. overwrite -- confirm in PTX ISA)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each d register is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x32 TN S32+=S8*S8 -- RS form (A in four 32-bit registers, B passed as a 64-bit descriptor), .satfinite variant
+struct MMA_64x32x32_S32S8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[16];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %21, 0;\n"  // p = (scale_D != 0); %21 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n32k32.s32.s8.s8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      "{%16, %17, %18, %19},"
+      " %20,"
+      " p;\n"  // predicate p is the final operand (presumably PTX scale-d: accumulate into D vs. overwrite -- confirm in PTX ISA)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each d register is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x32 TN S32+=S8*S8 -- RS form (A in four 32-bit registers, B passed as a 64-bit descriptor)
+struct MMA_64x64x32_S32S8S8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[32];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %37, 0;\n"  // p = (scale_D != 0); %37 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n64k32.s32.s8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      "{%32, %33, %34, %35},"
+      " %36,"
+      " p;\n"  // predicate p is the final operand (presumably PTX scale-d: accumulate into D vs. overwrite -- confirm in PTX ISA)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each d register is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x32 TN S32+=S8*S8 -- RS form (A in four 32-bit registers, B passed as a 64-bit descriptor), .satfinite variant
+struct MMA_64x64x32_S32S8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[32];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %37, 0;\n"  // p = (scale_D != 0); %37 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n64k32.s32.s8.s8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      "{%32, %33, %34, %35},"
+      " %36,"
+      " p;\n"  // predicate p is the final operand (presumably PTX scale-d: accumulate into D vs. overwrite -- confirm in PTX ISA)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each d register is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x32 TN S32+=S8*S8 -- RS form (A in four 32-bit registers, B passed as a 64-bit descriptor)
+struct MMA_64x96x32_S32S8S8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[48];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %53, 0;\n"  // p = (scale_D != 0); %53 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n96k32.s32.s8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
+      "{%48,  %49,  %50,  %51},"
+      " %52,"
+      " p;\n"  // predicate p is the final operand (presumably PTX scale-d: accumulate into D vs. overwrite -- confirm in PTX ISA)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each d register is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x32 TN S32+=S8*S8 -- RS form (A in four 32-bit registers, B passed as a 64-bit descriptor), .satfinite variant
+struct MMA_64x96x32_S32S8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[48];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %53, 0;\n"  // p = (scale_D != 0); %53 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n96k32.s32.s8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
+      "{%48,  %49,  %50,  %51},"
+      " %52,"
+      " p;\n"  // predicate p is the final operand (presumably PTX scale-d: accumulate into D vs. overwrite -- confirm in PTX ISA)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each d register is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x32 TN S32+=S8*S8 -- RS form (A in four 32-bit registers, B passed as a 64-bit descriptor)
+struct MMA_64x128x32_S32S8S8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[64];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %69, 0;\n"  // p = (scale_D != 0); %69 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n128k32.s32.s8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      "{%64,  %65,  %66,  %67},"
+      " %68,"
+      " p;\n"  // predicate p is the final operand (presumably PTX scale-d: accumulate into D vs. overwrite -- confirm in PTX ISA)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each d register is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x32 TN S32+=S8*S8 -- RS form (A in four 32-bit registers, B passed as a 64-bit descriptor), .satfinite variant
+struct MMA_64x128x32_S32S8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[64];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %69, 0;\n"  // p = (scale_D != 0); %69 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n128k32.s32.s8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      "{%64,  %65,  %66,  %67},"
+      " %68,"
+      " p;\n"  // predicate p is the final operand (presumably PTX scale-d: accumulate into D vs. overwrite -- confirm in PTX ISA)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each d register is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x32 TN S32+=S8*S8 (A passed as four 32-bit registers, B passed as one 64-bit descriptor)
+struct MMA_64x192x32_S32S8S8_RS_TN
+{
+  using DRegisters = void;          // no separate D operand: the accumulators are read-modify-write ("+r") below
+  using ARegisters = uint32_t[4];   // A: four 32-bit register operands (a00..a03)
+  using BRegisters = uint64_t[1];   // B: one 64-bit operand (desc_b)
+  using CRegisters = uint32_t[96];  // accumulators: 96 32-bit registers (d00..d95)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
+      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably a debug/trace hook - confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %101, 0;\n"  // p = (scale_D != 0); %101 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n192k32.s32.s8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"  // %0..%95 = d00..d95
+      "{%96,  %97,  %98,  %99},"  // a00..a03
+      " %100,"  // desc_b
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
+        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
+        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // macro not defined: any call is reported through CUTE_INVALID_CONTROL_PATH
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x32 TN S32+=S8*S8, .satfinite form (A passed as four 32-bit registers, B passed as one 64-bit descriptor)
+struct MMA_64x192x32_S32S8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;          // no separate D operand: the accumulators are read-modify-write ("+r") below
+  using ARegisters = uint32_t[4];   // A: four 32-bit register operands (a00..a03)
+  using BRegisters = uint64_t[1];   // B: one 64-bit operand (desc_b)
+  using CRegisters = uint32_t[96];  // accumulators: 96 32-bit registers (d00..d95)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
+      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably a debug/trace hook - confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %101, 0;\n"  // p = (scale_D != 0); %101 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n192k32.s32.s8.s8.satfinite "  // note the .satfinite qualifier on the mnemonic
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"  // %0..%95 = d00..d95
+      "{%96,  %97,  %98,  %99},"  // a00..a03
+      " %100,"  // desc_b
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
+        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
+        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // macro not defined: any call is reported through CUTE_INVALID_CONTROL_PATH
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x32 TN S32+=S8*S8 (A passed as four 32-bit registers, B passed as one 64-bit descriptor)
+struct MMA_64x256x32_S32S8S8_RS_TN
+{
+  using DRegisters = void;           // no separate D operand: the accumulators are read-modify-write ("+r") below
+  using ARegisters = uint32_t[4];    // A: four 32-bit register operands (a000..a003)
+  using BRegisters = uint64_t[1];    // B: one 64-bit operand (desc_b)
+  using CRegisters = uint32_t[128];  // accumulators: 128 32-bit registers (d000..d127)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
+      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably a debug/trace hook - confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %133, 0;\n"  // p = (scale_D != 0); %133 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n256k32.s32.s8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"  // %0..%127 = d000..d127
+      "{%128, %129, %130, %131},"  // a000..a003
+      " %132,"  // desc_b
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
+        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
+        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // macro not defined: any call is reported through CUTE_INVALID_CONTROL_PATH
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x32 TN S32+=S8*S8, .satfinite form (A passed as four 32-bit registers, B passed as one 64-bit descriptor)
+struct MMA_64x256x32_S32S8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;           // no separate D operand: the accumulators are read-modify-write ("+r") below
+  using ARegisters = uint32_t[4];    // A: four 32-bit register operands (a000..a003)
+  using BRegisters = uint64_t[1];    // B: one 64-bit operand (desc_b)
+  using CRegisters = uint32_t[128];  // accumulators: 128 32-bit registers (d000..d127)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
+      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably a debug/trace hook - confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %133, 0;\n"  // p = (scale_D != 0); %133 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n256k32.s32.s8.s8.satfinite "  // note the .satfinite qualifier on the mnemonic
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"  // %0..%127 = d000..d127
+      "{%128, %129, %130, %131},"  // a000..a003
+      " %132,"  // desc_b
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
+        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
+        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // macro not defined: any call is reported through CUTE_INVALID_CONTROL_PATH
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x32 TN S32+=S8*U8 (A and B each passed as one 64-bit descriptor)
+struct MMA_64x8x32_S32S8U8_SS_TN
+{
+  using DRegisters = void;         // no separate D operand: the accumulators are read-modify-write ("+r") below
+  using ARegisters = uint64_t[1];  // A: one 64-bit operand (desc_a)
+  using BRegisters = uint64_t[1];  // B: one 64-bit operand (desc_b)
+  using CRegisters = uint32_t[4];  // accumulators: 4 32-bit registers (d0..d3)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): presumably a debug/trace hook - confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %6, 0;\n"  // p = (scale_D != 0); %6 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.u8 "
+      "{%0,  %1,  %2,  %3},"  // d0..d3
+      " %4,"  // desc_a
+      " %5,"  // desc_b
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // macro not defined: any call is reported through CUTE_INVALID_CONTROL_PATH
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x32 TN S32+=S8*U8, .satfinite form (A and B each passed as one 64-bit descriptor)
+struct MMA_64x8x32_S32S8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;         // no separate D operand: the accumulators are read-modify-write ("+r") below
+  using ARegisters = uint64_t[1];  // A: one 64-bit operand (desc_a)
+  using BRegisters = uint64_t[1];  // B: one 64-bit operand (desc_b)
+  using CRegisters = uint32_t[4];  // accumulators: 4 32-bit registers (d0..d3)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): presumably a debug/trace hook - confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %6, 0;\n"  // p = (scale_D != 0); %6 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.u8.satfinite "  // note the .satfinite qualifier on the mnemonic
+      "{%0,  %1,  %2,  %3},"  // d0..d3
+      " %4,"  // desc_a
+      " %5,"  // desc_b
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // macro not defined: any call is reported through CUTE_INVALID_CONTROL_PATH
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x32 TN S32+=S8*U8 (A and B each passed as one 64-bit descriptor)
+struct MMA_64x16x32_S32S8U8_SS_TN
+{
+  using DRegisters = void;         // no separate D operand: the accumulators are read-modify-write ("+r") below
+  using ARegisters = uint64_t[1];  // A: one 64-bit operand (desc_a)
+  using BRegisters = uint64_t[1];  // B: one 64-bit operand (desc_b)
+  using CRegisters = uint32_t[8];  // accumulators: 8 32-bit registers (d0..d7)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): presumably a debug/trace hook - confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %10, 0;\n"  // p = (scale_D != 0); %10 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n16k32.s32.s8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // d0..d7
+      " %8,"  // desc_a
+      " %9,"  // desc_b
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // macro not defined: any call is reported through CUTE_INVALID_CONTROL_PATH
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x32 TN S32+=S8*U8, .satfinite form (A and B each passed as one 64-bit descriptor)
+struct MMA_64x16x32_S32S8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;         // no separate D operand: the accumulators are read-modify-write ("+r") below
+  using ARegisters = uint64_t[1];  // A: one 64-bit operand (desc_a)
+  using BRegisters = uint64_t[1];  // B: one 64-bit operand (desc_b)
+  using CRegisters = uint32_t[8];  // accumulators: 8 32-bit registers (d0..d7)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): presumably a debug/trace hook - confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %10, 0;\n"  // p = (scale_D != 0); %10 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n16k32.s32.s8.u8.satfinite "  // note the .satfinite qualifier on the mnemonic
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // d0..d7
+      " %8,"  // desc_a
+      " %9,"  // desc_b
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // macro not defined: any call is reported through CUTE_INVALID_CONTROL_PATH
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x32 TN S32+=S8*U8 (A and B each passed as one 64-bit descriptor)
+struct MMA_64x32x32_S32S8U8_SS_TN
+{
+  using DRegisters = void;          // no separate D operand: the accumulators are read-modify-write ("+r") below
+  using ARegisters = uint64_t[1];   // A: one 64-bit operand (desc_a)
+  using BRegisters = uint64_t[1];   // B: one 64-bit operand (desc_b)
+  using CRegisters = uint32_t[16];  // accumulators: 16 32-bit registers (d00..d15)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): presumably a debug/trace hook - confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %18, 0;\n"  // p = (scale_D != 0); %18 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n32k32.s32.s8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"  // %0..%15 = d00..d15
+      " %16,"  // desc_a
+      " %17,"  // desc_b
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // macro not defined: any call is reported through CUTE_INVALID_CONTROL_PATH
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x32 TN S32+=S8*U8, .satfinite variant -- wraps one wgmma.mma_async m64n32k32 .s32.s8.u8.satfinite instruction; A and B are passed as 64-bit descriptors
+struct MMA_64x32x32_S32S8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // single 64-bit value, passed to fma() as desc_a
+  using BRegisters = uint64_t[1];  // single 64-bit value, passed to fma() as desc_b
+  using CRegisters = uint32_t[16];  // 16 x 32-bit accumulators, one per dNN parameter of fma()
+
+  CUTE_HOST_DEVICE static void  // issues the instruction; d00..d15 are read and written in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %18, 0;\n"  // p = (scale_D != 0); %18 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n32k32.s32.s8.u8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      " %16,"
+      " %17,"
+      " p;\n"  // p is the final operand of the wgmma instruction
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // +r constraint: read-write operands %0..%15
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "l"(desc_a),  // %16
+         "l"(desc_b),  // %17
+         "r"(int32_t(scale_D)));  // %18, consumed by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x32 TN S32+=S8*U8 -- wraps one wgmma.mma_async m64n64k32 .s32.s8.u8 instruction; A and B are passed as 64-bit descriptors
+struct MMA_64x64x32_S32S8U8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // single 64-bit value, passed to fma() as desc_a
+  using BRegisters = uint64_t[1];  // single 64-bit value, passed to fma() as desc_b
+  using CRegisters = uint32_t[32];  // 32 x 32-bit accumulators, one per dNN parameter of fma()
+
+  CUTE_HOST_DEVICE static void  // issues the instruction; d00..d31 are read and written in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %34, 0;\n"  // p = (scale_D != 0); %34 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n64k32.s32.s8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      " %32,"
+      " %33,"
+      " p;\n"  // p is the final operand of the wgmma instruction
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // +r constraint: read-write operands %0..%31
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "l"(desc_a),  // %32
+         "l"(desc_b),  // %33
+         "r"(int32_t(scale_D)));  // %34, consumed by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x32 TN S32+=S8*U8, .satfinite variant -- wraps one wgmma.mma_async m64n64k32 .s32.s8.u8.satfinite instruction; A and B are passed as 64-bit descriptors
+struct MMA_64x64x32_S32S8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // single 64-bit value, passed to fma() as desc_a
+  using BRegisters = uint64_t[1];  // single 64-bit value, passed to fma() as desc_b
+  using CRegisters = uint32_t[32];  // 32 x 32-bit accumulators, one per dNN parameter of fma()
+
+  CUTE_HOST_DEVICE static void  // issues the instruction; d00..d31 are read and written in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %34, 0;\n"  // p = (scale_D != 0); %34 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n64k32.s32.s8.u8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      " %32,"
+      " %33,"
+      " p;\n"  // p is the final operand of the wgmma instruction
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // +r constraint: read-write operands %0..%31
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "l"(desc_a),  // %32
+         "l"(desc_b),  // %33
+         "r"(int32_t(scale_D)));  // %34, consumed by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x32 TN S32+=S8*U8 -- wraps one wgmma.mma_async m64n96k32 .s32.s8.u8 instruction; A and B are passed as 64-bit descriptors
+struct MMA_64x96x32_S32S8U8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // single 64-bit value, passed to fma() as desc_a
+  using BRegisters = uint64_t[1];  // single 64-bit value, passed to fma() as desc_b
+  using CRegisters = uint32_t[48];  // 48 x 32-bit accumulators, one per dNN parameter of fma()
+
+  CUTE_HOST_DEVICE static void  // issues the instruction; d00..d47 are read and written in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %50, 0;\n"  // p = (scale_D != 0); %50 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n96k32.s32.s8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"
+      " %48,"
+      " %49,"
+      " p;\n"  // p is the final operand of the wgmma instruction
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // +r constraint: read-write operands %0..%47
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "l"(desc_a),  // %48
+         "l"(desc_b),  // %49
+         "r"(int32_t(scale_D)));  // %50, consumed by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x32 TN S32+=S8*U8, .satfinite variant -- wraps one wgmma.mma_async m64n96k32 .s32.s8.u8.satfinite instruction; A and B are passed as 64-bit descriptors
+struct MMA_64x96x32_S32S8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // single 64-bit value, passed to fma() as desc_a
+  using BRegisters = uint64_t[1];  // single 64-bit value, passed to fma() as desc_b
+  using CRegisters = uint32_t[48];  // 48 x 32-bit accumulators, one per dNN parameter of fma()
+
+  CUTE_HOST_DEVICE static void  // issues the instruction; d00..d47 are read and written in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %50, 0;\n"  // p = (scale_D != 0); %50 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n96k32.s32.s8.u8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"
+      " %48,"
+      " %49,"
+      " p;\n"  // p is the final operand of the wgmma instruction
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // +r constraint: read-write operands %0..%47
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "l"(desc_a),  // %48
+         "l"(desc_b),  // %49
+         "r"(int32_t(scale_D)));  // %50, consumed by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x32 TN S32+=S8*U8 -- wraps one wgmma.mma_async m64n128k32 .s32.s8.u8 instruction; A and B are passed as 64-bit descriptors
+struct MMA_64x128x32_S32S8U8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // single 64-bit value, passed to fma() as desc_a
+  using BRegisters = uint64_t[1];  // single 64-bit value, passed to fma() as desc_b
+  using CRegisters = uint32_t[64];  // 64 x 32-bit accumulators, one per dNN parameter of fma()
+
+  CUTE_HOST_DEVICE static void  // issues the instruction; d00..d63 are read and written in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %66, 0;\n"  // p = (scale_D != 0); %66 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n128k32.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      " %64,"
+      " %65,"
+      " p;\n"  // p is the final operand of the wgmma instruction
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // +r constraint: read-write operands %0..%63
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "l"(desc_a),  // %64
+         "l"(desc_b),  // %65
+         "r"(int32_t(scale_D)));  // %66, consumed by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x32 TN S32+=S8*U8, .satfinite variant -- wraps one wgmma.mma_async m64n128k32 .s32.s8.u8.satfinite instruction; A and B are passed as 64-bit descriptors
+struct MMA_64x128x32_S32S8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // single 64-bit value, passed to fma() as desc_a
+  using BRegisters = uint64_t[1];  // single 64-bit value, passed to fma() as desc_b
+  using CRegisters = uint32_t[64];  // 64 x 32-bit accumulators, one per dNN parameter of fma()
+
+  CUTE_HOST_DEVICE static void  // issues the instruction; d00..d63 are read and written in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %66, 0;\n"  // p = (scale_D != 0); %66 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n128k32.s32.s8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      " %64,"
+      " %65,"
+      " p;\n"  // p is the final operand of the wgmma instruction
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // +r constraint: read-write operands %0..%63
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "l"(desc_a),  // %64
+         "l"(desc_b),  // %65
+         "r"(int32_t(scale_D)));  // %66, consumed by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x32 TN S32+=S8*U8 -- wraps one wgmma.mma_async m64n192k32 .s32.s8.u8 instruction; A and B are passed as 64-bit descriptors
+struct MMA_64x192x32_S32S8U8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // single 64-bit value, passed to fma() as desc_a
+  using BRegisters = uint64_t[1];  // single 64-bit value, passed to fma() as desc_b
+  using CRegisters = uint32_t[96];  // 96 x 32-bit accumulators, one per dNN parameter of fma()
+
+  CUTE_HOST_DEVICE static void  // issues the instruction; d00..d95 are read and written in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
+      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %98, 0;\n"  // p = (scale_D != 0); %98 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n192k32.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      " %96,"
+      " %97,"
+      " p;\n"  // p is the final operand of the wgmma instruction
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // +r constraint: read-write operands %0..%95
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
+        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
+        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
+      :  "l"(desc_a),  // %96
+         "l"(desc_b),  // %97
+         "r"(int32_t(scale_D)));  // %98, consumed by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x32 TN S32+=S8*U8, .satfinite variant -- wraps one wgmma.mma_async m64n192k32 .s32.s8.u8.satfinite instruction; A and B are passed as 64-bit descriptors
+struct MMA_64x192x32_S32S8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // single 64-bit value, passed to fma() as desc_a
+  using BRegisters = uint64_t[1];  // single 64-bit value, passed to fma() as desc_b
+  using CRegisters = uint32_t[96];  // 96 x 32-bit accumulators, one per dNN parameter of fma()
+
+  CUTE_HOST_DEVICE static void  // issues the instruction; d00..d95 are read and written in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
+      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %98, 0;\n"  // p = (scale_D != 0); %98 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n192k32.s32.s8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      " %96,"
+      " %97,"
+      " p;\n"  // p is the final operand of the wgmma instruction
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // +r constraint: read-write operands %0..%95
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
+        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
+        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
+      :  "l"(desc_a),  // %96
+         "l"(desc_b),  // %97
+         "r"(int32_t(scale_D)));  // %98, consumed by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x32 TN S32+=S8*U8 (SS: desc_a and desc_b are both 64-bit descriptors); issues PTX wgmma.mma_async m64n256k32 .s32.s8.u8.
+struct MMA_64x256x32_S32S8U8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[128];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
+      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %130, 0;\n"  // p = (scale_D != 0); %130 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n256k32.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      " %128,"
+      " %129,"
+      " p;\n"  // p is the trailing wgmma operand (scale-d in the PTX ISA; presumably p==false drops the prior accumulator value -- confirm)
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // %0-%127: accumulators, read-write ("+r") so they are updated in place
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
+        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
+        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
+      :  "l"(desc_a),  // %128: 64-bit descriptor input for A
+         "l"(desc_b),  // %129: 64-bit descriptor input for B
+         "r"(int32_t(scale_D)));  // %130: scale_D as a 32-bit int, consumed by setp above
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no PTX is emitted, the invalid-control-path macro is invoked instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x32 TN S32+=S8*U8, SATURATE variant (SS: desc_a and desc_b are both 64-bit descriptors); issues PTX wgmma.mma_async m64n256k32 .s32.s8.u8 with .satfinite.
+struct MMA_64x256x32_S32S8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[128];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
+      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %130, 0;\n"  // p = (scale_D != 0); %130 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n256k32.s32.s8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      " %128,"
+      " %129,"
+      " p;\n"  // p is the trailing wgmma operand (scale-d in the PTX ISA; presumably p==false drops the prior accumulator value -- confirm)
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // %0-%127: accumulators, read-write ("+r") so they are updated in place
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
+        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
+        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
+      :  "l"(desc_a),  // %128: 64-bit descriptor input for A
+         "l"(desc_b),  // %129: 64-bit descriptor input for B
+         "r"(int32_t(scale_D)));  // %130: scale_D as a 32-bit int, consumed by setp above
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no PTX is emitted, the invalid-control-path macro is invoked instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x32 TN S32+=S8*U8 (RS: a0-a3 are 32-bit registers, desc_b is a 64-bit descriptor); issues PTX wgmma.mma_async m64n8k32 .s32.s8.u8.
+struct MMA_64x8x32_S32S8U8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[4];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %9, 0;\n"  // p = (scale_D != 0); %9 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.u8 "
+      "{%0,  %1,  %2,  %3},"
+      "{%4,  %5,  %6,  %7},"
+      " %8,"
+      " p;\n"  // p is the trailing wgmma operand (scale-d in the PTX ISA; presumably p==false drops the prior accumulator value -- confirm)
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)  // %0-%3: accumulators, read-write ("+r") so they are updated in place
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),  // %4-%7: a0-a3 passed as 32-bit register inputs
+         "l"(desc_b),  // %8: 64-bit descriptor input for B
+         "r"(int32_t(scale_D)));  // %9: scale_D as a 32-bit int, consumed by setp above
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no PTX is emitted, the invalid-control-path macro is invoked instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x32 TN S32+=S8*U8, SATURATE variant (RS: a0-a3 are 32-bit registers, desc_b is a 64-bit descriptor); issues PTX wgmma.mma_async m64n8k32 .s32.s8.u8 with .satfinite.
+struct MMA_64x8x32_S32S8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[4];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %9, 0;\n"  // p = (scale_D != 0); %9 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.u8.satfinite "
+      "{%0,  %1,  %2,  %3},"
+      "{%4,  %5,  %6,  %7},"
+      " %8,"
+      " p;\n"  // p is the trailing wgmma operand (scale-d in the PTX ISA; presumably p==false drops the prior accumulator value -- confirm)
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)  // %0-%3: accumulators, read-write ("+r") so they are updated in place
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),  // %4-%7: a0-a3 passed as 32-bit register inputs
+         "l"(desc_b),  // %8: 64-bit descriptor input for B
+         "r"(int32_t(scale_D)));  // %9: scale_D as a 32-bit int, consumed by setp above
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no PTX is emitted, the invalid-control-path macro is invoked instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x32 TN S32+=S8*U8 (RS: a0-a3 are 32-bit registers, desc_b is a 64-bit descriptor); issues PTX wgmma.mma_async m64n16k32 .s32.s8.u8.
+struct MMA_64x16x32_S32S8U8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[8];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %13, 0;\n"  // p = (scale_D != 0); %13 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n16k32.s32.s8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
+      "{%8,  %9,  %10, %11},"
+      " %12,"
+      " p;\n"  // p is the trailing wgmma operand (scale-d in the PTX ISA; presumably p==false drops the prior accumulator value -- confirm)
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),  // %0-%7: accumulators, read-write ("+r") so they are updated in place
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),  // %8-%11: a0-a3 passed as 32-bit register inputs
+         "l"(desc_b),  // %12: 64-bit descriptor input for B
+         "r"(int32_t(scale_D)));  // %13: scale_D as a 32-bit int, consumed by setp above
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no PTX is emitted, the invalid-control-path macro is invoked instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x32 TN S32+=S8*U8, SATURATE variant (RS: a0-a3 are 32-bit registers, desc_b is a 64-bit descriptor); issues PTX wgmma.mma_async m64n16k32 .s32.s8.u8 with .satfinite.
+struct MMA_64x16x32_S32S8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[8];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %13, 0;\n"  // p = (scale_D != 0); %13 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n16k32.s32.s8.u8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
+      "{%8,  %9,  %10, %11},"
+      " %12,"
+      " p;\n"  // p is the trailing wgmma operand (scale-d in the PTX ISA; presumably p==false drops the prior accumulator value -- confirm)
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),  // %0-%7: accumulators, read-write ("+r") so they are updated in place
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),  // %8-%11: a0-a3 passed as 32-bit register inputs
+         "l"(desc_b),  // %12: 64-bit descriptor input for B
+         "r"(int32_t(scale_D)));  // %13: scale_D as a 32-bit int, consumed by setp above
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no PTX is emitted, the invalid-control-path macro is invoked instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x32 TN S32+=S8*U8 (RS: a00-a03 are 32-bit registers, desc_b is a 64-bit descriptor); issues PTX wgmma.mma_async m64n32k32 .s32.s8.u8.
+struct MMA_64x32x32_S32S8U8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[16];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %21, 0;\n"  // p = (scale_D != 0); %21 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n32k32.s32.s8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      "{%16, %17, %18, %19},"
+      " %20,"
+      " p;\n"  // p is the trailing wgmma operand (scale-d in the PTX ISA; presumably p==false drops the prior accumulator value -- confirm)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0-%15: accumulators, read-write ("+r") so they are updated in place
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %16-%19: a00-a03 passed as 32-bit register inputs
+         "l"(desc_b),  // %20: 64-bit descriptor input for B
+         "r"(int32_t(scale_D)));  // %21: scale_D as a 32-bit int, consumed by setp above
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no PTX is emitted, the invalid-control-path macro is invoked instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x32 TN S32+=S8*U8, SATURATE variant (RS: a00-a03 are 32-bit registers, desc_b is a 64-bit descriptor); issues PTX wgmma.mma_async m64n32k32 .s32.s8.u8 with .satfinite.
+struct MMA_64x32x32_S32S8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[16];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %21, 0;\n"  // p = (scale_D != 0); %21 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n32k32.s32.s8.u8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      "{%16, %17, %18, %19},"
+      " %20,"
+      " p;\n"  // p is the trailing wgmma operand (scale-d in the PTX ISA; presumably p==false drops the prior accumulator value -- confirm)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0-%15: accumulators, read-write ("+r") so they are updated in place
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %16-%19: a00-a03 passed as 32-bit register inputs
+         "l"(desc_b),  // %20: 64-bit descriptor input for B
+         "r"(int32_t(scale_D)));  // %21: scale_D as a 32-bit int, consumed by setp above
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no PTX is emitted, the invalid-control-path macro is invoked instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x32 TN S32+=S8*U8 (RS: a00-a03 are 32-bit registers, desc_b is a 64-bit descriptor); issues PTX wgmma.mma_async m64n64k32 .s32.s8.u8.
+struct MMA_64x64x32_S32S8U8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[32];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %37, 0;\n"  // p = (scale_D != 0); %37 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n64k32.s32.s8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      "{%32, %33, %34, %35},"
+      " %36,"
+      " p;\n"  // p is the trailing wgmma operand (scale-d in the PTX ISA; presumably p==false drops the prior accumulator value -- confirm)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0-%31: accumulators, read-write ("+r") so they are updated in place
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %32-%35: a00-a03 passed as 32-bit register inputs
+         "l"(desc_b),  // %36: 64-bit descriptor input for B
+         "r"(int32_t(scale_D)));  // %37: scale_D as a 32-bit int, consumed by setp above
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no PTX is emitted, the invalid-control-path macro is invoked instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x32 TN S32+=S8*U8, SATURATE variant (RS: a00-a03 are 32-bit registers, desc_b is a 64-bit descriptor); issues PTX wgmma.mma_async m64n64k32 .s32.s8.u8 with .satfinite.
+struct MMA_64x64x32_S32S8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[32];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %37, 0;\n"  // p = (scale_D != 0); %37 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n64k32.s32.s8.u8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      "{%32, %33, %34, %35},"
+      " %36,"
+      " p;\n"  // p is the trailing wgmma operand (scale-d in the PTX ISA; presumably p==false drops the prior accumulator value -- confirm)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0-%31: accumulators, read-write ("+r") so they are updated in place
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %32-%35: a00-a03 passed as 32-bit register inputs
+         "l"(desc_b),  // %36: 64-bit descriptor input for B
+         "r"(int32_t(scale_D)));  // %37: scale_D as a 32-bit int, consumed by setp above
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no PTX is emitted, the invalid-control-path macro is invoked instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x32 TN S32+=S8*U8 (RS: a00-a03 are 32-bit registers, desc_b is a 64-bit descriptor); issues PTX wgmma.mma_async m64n96k32 .s32.s8.u8.
+struct MMA_64x96x32_S32S8U8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[48];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %53, 0;\n"  // p = (scale_D != 0); %53 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n96k32.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
+      "{%48,  %49,  %50,  %51},"
+      " %52,"
+      " p;\n"  // p is the trailing wgmma operand (scale-d in the PTX ISA; presumably p==false drops the prior accumulator value -- confirm)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0-%47: accumulators, read-write ("+r") so they are updated in place
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %48-%51: a00-a03 passed as 32-bit register inputs
+         "l"(desc_b),  // %52: 64-bit descriptor input for B
+         "r"(int32_t(scale_D)));  // %53: scale_D as a 32-bit int, consumed by setp above
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no PTX is emitted, the invalid-control-path macro is invoked instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x32 TN S32+=S8*U8, SATURATE variant (RS: a00-a03 are 32-bit registers, desc_b is a 64-bit descriptor); issues PTX wgmma.mma_async m64n96k32 .s32.s8.u8 with .satfinite.
+struct MMA_64x96x32_S32S8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[48];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %53, 0;\n"  // p = (scale_D != 0); %53 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n96k32.s32.s8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
+      "{%48,  %49,  %50,  %51},"
+      " %52,"
+      " p;\n"  // p is the trailing wgmma operand (scale-d in the PTX ISA; presumably p==false drops the prior accumulator value -- confirm)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0-%47: accumulators, read-write ("+r") so they are updated in place
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %48-%51: a00-a03 passed as 32-bit register inputs
+         "l"(desc_b),  // %52: 64-bit descriptor input for B
+         "r"(int32_t(scale_D)));  // %53: scale_D as a 32-bit int, consumed by setp above
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no PTX is emitted, the invalid-control-path macro is invoked instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x32 TN S32+=S8*U8 (A: four 32-bit registers, B: 64-bit desc_b operand, accumulators d00..d63 updated in place)
+struct MMA_64x128x32_S32S8U8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[64];  // 64 entries, same count as the d00..d63 parameters of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %69, 0;\n"
+      "wgmma.mma_async.sync.aligned.m64n128k32.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      "{%64,  %65,  %66,  %67},"
+      " %68,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0-%63: read-write ("+r") accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // operands %64-%67: the four A registers
+         "l"(desc_b),  // operand %68: 64-bit desc_b value
+         "r"(int32_t(scale_D)));  // operand %69: compared against 0 by setp to form predicate p, the final wgmma operand
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x32 TN S32+=S8*U8, issued with the .satfinite qualifier (A: four 32-bit registers, B: 64-bit desc_b operand, accumulators d00..d63 updated in place)
+struct MMA_64x128x32_S32S8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[64];  // 64 entries, same count as the d00..d63 parameters of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %69, 0;\n"
+      "wgmma.mma_async.sync.aligned.m64n128k32.s32.s8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      "{%64,  %65,  %66,  %67},"
+      " %68,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0-%63: read-write ("+r") accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // operands %64-%67: the four A registers
+         "l"(desc_b),  // operand %68: 64-bit desc_b value
+         "r"(int32_t(scale_D)));  // operand %69: compared against 0 by setp to form predicate p, the final wgmma operand
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x32 TN S32+=S8*U8 (A: four 32-bit registers, B: 64-bit desc_b operand, accumulators d00..d95 updated in place)
+struct MMA_64x192x32_S32S8U8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[96];  // 96 entries, same count as the d00..d95 parameters of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
+      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %101, 0;\n"
+      "wgmma.mma_async.sync.aligned.m64n192k32.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      "{%96,  %97,  %98,  %99},"
+      " %100,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0-%95: read-write ("+r") accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
+        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
+        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // operands %96-%99: the four A registers
+         "l"(desc_b),  // operand %100: 64-bit desc_b value
+         "r"(int32_t(scale_D)));  // operand %101: compared against 0 by setp to form predicate p, the final wgmma operand
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x32 TN S32+=S8*U8, issued with the .satfinite qualifier (A: four 32-bit registers, B: 64-bit desc_b operand, accumulators d00..d95 updated in place)
+struct MMA_64x192x32_S32S8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[96];  // 96 entries, same count as the d00..d95 parameters of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
+      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %101, 0;\n"
+      "wgmma.mma_async.sync.aligned.m64n192k32.s32.s8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      "{%96,  %97,  %98,  %99},"
+      " %100,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0-%95: read-write ("+r") accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
+        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
+        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // operands %96-%99: the four A registers
+         "l"(desc_b),  // operand %100: 64-bit desc_b value
+         "r"(int32_t(scale_D)));  // operand %101: compared against 0 by setp to form predicate p, the final wgmma operand
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x32 TN S32+=S8*U8 (A: four 32-bit registers, B: 64-bit desc_b operand, accumulators d000..d127 updated in place)
+struct MMA_64x256x32_S32S8U8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[128];  // 128 entries, same count as the d000..d127 parameters of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
+      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %133, 0;\n"
+      "wgmma.mma_async.sync.aligned.m64n256k32.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      "{%128, %129, %130, %131},"
+      " %132,"
+      " p;\n"
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // operands %0-%127: read-write ("+r") accumulators
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
+        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
+        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // operands %128-%131: the four A registers
+         "l"(desc_b),  // operand %132: 64-bit desc_b value
+         "r"(int32_t(scale_D)));  // operand %133: compared against 0 by setp to form predicate p, the final wgmma operand
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x32 TN S32+=S8*U8, issued with the .satfinite qualifier (A: four 32-bit registers, B: 64-bit desc_b operand, accumulators d000..d127 updated in place)
+struct MMA_64x256x32_S32S8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[128];  // 128 entries, same count as the d000..d127 parameters of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
+      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %133, 0;\n"
+      "wgmma.mma_async.sync.aligned.m64n256k32.s32.s8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      "{%128, %129, %130, %131},"
+      " %132,"
+      " p;\n"
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // operands %0-%127: read-write ("+r") accumulators
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
+        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
+        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // operands %128-%131: the four A registers
+         "l"(desc_b),  // operand %132: 64-bit desc_b value
+         "r"(int32_t(scale_D)));  // operand %133: compared against 0 by setp to form predicate p, the final wgmma operand
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x32 TN S32+=U8*S8: issues PTX wgmma.mma_async m64n8k32 (.s32.u8.s8) with A and B supplied as 64-bit descriptors.
+struct MMA_64x8x32_S32U8S8_SS_TN
+{
+  using DRegisters = void;  // no separate D operands; fma() updates the accumulators in place ("+r")
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for A (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = uint32_t[4];  // 4 32-bit accumulator registers (d0..d3)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t below; nonzero sets predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes __LINE__ plus both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %6, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.s8 "
+      "{%0,  %1,  %2,  %3},"  // accumulators d0..d3 (read-write)
+      " %4,"  // desc_a
+      " %5,"  // desc_b
+      " p;\n"  // predicate from scale_D as the final operand (scale-d input per PTX wgmma syntax; TODO confirm)
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x32 TN S32+=U8*S8, .satfinite variant: issues PTX wgmma.mma_async m64n8k32 (.s32.u8.s8.satfinite) with A and B supplied as 64-bit descriptors.
+struct MMA_64x8x32_S32U8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D operands; fma() updates the accumulators in place ("+r")
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for A (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = uint32_t[4];  // 4 32-bit accumulator registers (d0..d3)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t below; nonzero sets predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes __LINE__ plus both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %6, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.s8.satfinite "  // .satfinite form; presumably saturates the s32 result instead of wrapping (confirm in PTX ISA)
+      "{%0,  %1,  %2,  %3},"  // accumulators d0..d3 (read-write)
+      " %4,"  // desc_a
+      " %5,"  // desc_b
+      " p;\n"  // predicate from scale_D as the final operand (scale-d input per PTX wgmma syntax; TODO confirm)
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x32 TN S32+=U8*S8: issues PTX wgmma.mma_async m64n16k32 (.s32.u8.s8) with A and B supplied as 64-bit descriptors.
+struct MMA_64x16x32_S32U8S8_SS_TN
+{
+  using DRegisters = void;  // no separate D operands; fma() updates the accumulators in place ("+r")
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for A (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = uint32_t[8];  // 8 32-bit accumulator registers (d0..d7)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t below; nonzero sets predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes __LINE__ plus both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %10, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n16k32.s32.u8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // accumulators d0..d7 (read-write)
+      " %8,"  // desc_a
+      " %9,"  // desc_b
+      " p;\n"  // predicate from scale_D as the final operand (scale-d input per PTX wgmma syntax; TODO confirm)
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x32 TN S32+=U8*S8, .satfinite variant: issues PTX wgmma.mma_async m64n16k32 (.s32.u8.s8.satfinite) with A and B supplied as 64-bit descriptors.
+struct MMA_64x16x32_S32U8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D operands; fma() updates the accumulators in place ("+r")
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for A (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = uint32_t[8];  // 8 32-bit accumulator registers (d0..d7)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t below; nonzero sets predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes __LINE__ plus both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %10, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n16k32.s32.u8.s8.satfinite "  // .satfinite form; presumably saturates the s32 result instead of wrapping (confirm in PTX ISA)
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // accumulators d0..d7 (read-write)
+      " %8,"  // desc_a
+      " %9,"  // desc_b
+      " p;\n"  // predicate from scale_D as the final operand (scale-d input per PTX wgmma syntax; TODO confirm)
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x32 TN S32+=U8*S8: issues PTX wgmma.mma_async m64n32k32 (.s32.u8.s8) with A and B supplied as 64-bit descriptors.
+struct MMA_64x32x32_S32U8S8_SS_TN
+{
+  using DRegisters = void;  // no separate D operands; fma() updates the accumulators in place ("+r")
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for A (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = uint32_t[16];  // 16 32-bit accumulator registers (d00..d15)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t below; nonzero sets predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes __LINE__ plus both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %18, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n32k32.s32.u8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"  // accumulators d00..d15 (read-write)
+      " %16,"  // desc_a
+      " %17,"  // desc_b
+      " p;\n"  // predicate from scale_D as the final operand (scale-d input per PTX wgmma syntax; TODO confirm)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x32 TN S32+=U8*S8, .satfinite variant: issues PTX wgmma.mma_async m64n32k32 (.s32.u8.s8.satfinite) with A and B supplied as 64-bit descriptors.
+struct MMA_64x32x32_S32U8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D operands; fma() updates the accumulators in place ("+r")
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for A (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = uint32_t[16];  // 16 32-bit accumulator registers (d00..d15)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t below; nonzero sets predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes __LINE__ plus both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %18, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n32k32.s32.u8.s8.satfinite "  // .satfinite form; presumably saturates the s32 result instead of wrapping (confirm in PTX ISA)
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"  // accumulators d00..d15 (read-write)
+      " %16,"  // desc_a
+      " %17,"  // desc_b
+      " p;\n"  // predicate from scale_D as the final operand (scale-d input per PTX wgmma syntax; TODO confirm)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x32 TN S32+=U8*S8: issues PTX wgmma.mma_async m64n64k32 (.s32.u8.s8) with A and B supplied as 64-bit descriptors.
+struct MMA_64x64x32_S32U8S8_SS_TN
+{
+  using DRegisters = void;  // no separate D operands; fma() updates the accumulators in place ("+r")
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for A (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = uint32_t[32];  // 32 32-bit accumulator registers (d00..d31)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t below; nonzero sets predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes __LINE__ plus both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %34, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n64k32.s32.u8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"  // accumulators d00..d31 (read-write)
+      " %32,"  // desc_a
+      " %33,"  // desc_b
+      " p;\n"  // predicate from scale_D as the final operand (scale-d input per PTX wgmma syntax; TODO confirm)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x32 TN S32+=U8*S8, .satfinite variant: issues PTX wgmma.mma_async m64n64k32 (.s32.u8.s8.satfinite) with A and B supplied as 64-bit descriptors.
+struct MMA_64x64x32_S32U8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D operands; fma() updates the accumulators in place ("+r")
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for A (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = uint32_t[32];  // 32 32-bit accumulator registers (d00..d31)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t below; nonzero sets predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes __LINE__ plus both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %34, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n64k32.s32.u8.s8.satfinite "  // .satfinite form; presumably saturates the s32 result instead of wrapping (confirm in PTX ISA)
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"  // accumulators d00..d31 (read-write)
+      " %32,"  // desc_a
+      " %33,"  // desc_b
+      " p;\n"  // predicate from scale_D as the final operand (scale-d input per PTX wgmma syntax; TODO confirm)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x32 TN S32+=U8*S8: issues PTX wgmma.mma_async m64n96k32 (.s32.u8.s8) with A and B supplied as 64-bit descriptors.
+struct MMA_64x96x32_S32U8S8_SS_TN
+{
+  using DRegisters = void;  // no separate D operands; fma() updates the accumulators in place ("+r")
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for A (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = uint32_t[48];  // 48 32-bit accumulator registers (d00..d47)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t below; nonzero sets predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes __LINE__ plus both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %50, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n96k32.s32.u8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"  // accumulators d00..d47 (read-write)
+      " %48,"  // desc_a
+      " %49,"  // desc_b
+      " p;\n"  // predicate from scale_D as the final operand (scale-d input per PTX wgmma syntax; TODO confirm)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x32 TN S32+=U8*S8, .satfinite variant: issues PTX wgmma.mma_async m64n96k32 (.s32.u8.s8.satfinite) with A and B supplied as 64-bit descriptors.
+struct MMA_64x96x32_S32U8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D operands; fma() updates the accumulators in place ("+r")
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for A (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = uint32_t[48];  // 48 32-bit accumulator registers (d00..d47)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t below; nonzero sets predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes __LINE__ plus both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %50, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n96k32.s32.u8.s8.satfinite "  // .satfinite form; presumably saturates the s32 result instead of wrapping (confirm in PTX ISA)
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"  // accumulators d00..d47 (read-write)
+      " %48,"  // desc_a
+      " %49,"  // desc_b
+      " p;\n"  // predicate from scale_D as the final operand (scale-d input per PTX wgmma syntax; TODO confirm)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x32 TN S32+=U8*S8: issues PTX wgmma.mma_async m64n128k32 (.s32.u8.s8) with A and B supplied as 64-bit descriptors.
+struct MMA_64x128x32_S32U8S8_SS_TN
+{
+  using DRegisters = void;  // no separate D operands; fma() updates the accumulators in place ("+r")
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for A (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = uint32_t[64];  // 64 32-bit accumulator registers (d00..d63)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t below; nonzero sets predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes __LINE__ plus both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %66, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n128k32.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"  // accumulators d00..d63 (read-write)
+      " %64,"  // desc_a
+      " %65,"  // desc_b
+      " p;\n"  // predicate from scale_D as the final operand (scale-d input per PTX wgmma syntax; TODO confirm)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x32 TN S32+=U8*S8, .satfinite variant: issues PTX wgmma.mma_async m64n128k32 (.s32.u8.s8.satfinite) with A and B supplied as 64-bit descriptors.
+struct MMA_64x128x32_S32U8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D operands; fma() updates the accumulators in place ("+r")
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for A (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = uint32_t[64];  // 64 32-bit accumulator registers (d00..d63)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t below; nonzero sets predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes __LINE__ plus both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %66, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n128k32.s32.u8.s8.satfinite "  // .satfinite form; presumably saturates the s32 result instead of wrapping (confirm in PTX ISA)
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"  // accumulators d00..d63 (read-write)
+      " %64,"  // desc_a
+      " %65,"  // desc_b
+      " p;\n"  // predicate from scale_D as the final operand (scale-d input per PTX wgmma syntax; TODO confirm)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x32 TN S32+=U8*S8: issues PTX wgmma.mma_async m64n192k32 (.s32.u8.s8) with A and B supplied as 64-bit descriptors.
+struct MMA_64x192x32_S32U8S8_SS_TN
+{
+  using DRegisters = void;  // no separate D operands; fma() updates the accumulators in place ("+r")
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for A (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = uint32_t[96];  // 96 32-bit accumulator registers (d00..d95)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
+      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t below; nonzero sets predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes __LINE__ plus both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %98, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n192k32.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"  // accumulators d00..d95 (read-write)
+      " %96,"  // desc_a
+      " %97,"  // desc_b
+      " p;\n"  // predicate from scale_D as the final operand (scale-d input per PTX wgmma syntax; TODO confirm)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
+        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
+        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x32 TN S32+=U8*S8, saturating (.satfinite) variant; A and B are both passed as 64-bit descriptors (SS).
+struct MMA_64x192x32_S32U8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D register set; results land in the read-write accumulators below
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for operand A
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for operand B
+  using CRegisters = uint32_t[96];  // 96 32-bit accumulator registers (d00..d95)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // descriptor for A, bound as asm operand %96
+      uint64_t const& desc_b,  // descriptor for B, bound as asm operand %97
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
+      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %98, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n192k32.s32.u8.s8.satfinite "  // per the PTX ISA, .satfinite clamps the s32 result on overflow
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      " %96,"  // desc_a
+      " %97,"  // desc_b
+      " p;\n"  // scale-d predicate; per the PTX ISA, true => D = A*B + D, false => D = A*B
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%95: accumulators, read-write ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
+        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
+        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
+      :  "l"(desc_a),  // %96
+         "l"(desc_b),  // %97
+         "r"(int32_t(scale_D)));  // %98
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A macro is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x32 TN S32+=U8*S8 (no .satfinite qualifier); A and B are both passed as 64-bit descriptors (SS).
+struct MMA_64x256x32_S32U8S8_SS_TN
+{
+  using DRegisters = void;  // no separate D register set; results land in the read-write accumulators below
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for operand A
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for operand B
+  using CRegisters = uint32_t[128];  // 128 32-bit accumulator registers (d000..d127)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // descriptor for A, bound as asm operand %128
+      uint64_t const& desc_b,  // descriptor for B, bound as asm operand %129
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
+      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %130, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n256k32.s32.u8.s8 "  // no .satfinite: per the PTX ISA the s32 accumulation wraps on overflow
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      " %128,"  // desc_a
+      " %129,"  // desc_b
+      " p;\n"  // scale-d predicate; per the PTX ISA, true => D = A*B + D, false => D = A*B
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // %0..%127: accumulators, read-write ("+r")
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
+        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
+        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
+      :  "l"(desc_a),  // %128
+         "l"(desc_b),  // %129
+         "r"(int32_t(scale_D)));  // %130
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A macro is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x32 TN S32+=U8*S8, saturating (.satfinite) variant; A and B are both passed as 64-bit descriptors (SS).
+struct MMA_64x256x32_S32U8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D register set; results land in the read-write accumulators below
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for operand A
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for operand B
+  using CRegisters = uint32_t[128];  // 128 32-bit accumulator registers (d000..d127)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // descriptor for A, bound as asm operand %128
+      uint64_t const& desc_b,  // descriptor for B, bound as asm operand %129
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
+      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %130, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n256k32.s32.u8.s8.satfinite "  // per the PTX ISA, .satfinite clamps the s32 result on overflow
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      " %128,"  // desc_a
+      " %129,"  // desc_b
+      " p;\n"  // scale-d predicate; per the PTX ISA, true => D = A*B + D, false => D = A*B
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // %0..%127: accumulators, read-write ("+r")
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
+        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
+        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
+      :  "l"(desc_a),  // %128
+         "l"(desc_b),  // %129
+         "r"(int32_t(scale_D)));  // %130
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A macro is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x32 TN S32+=U8*S8 (no .satfinite qualifier); A is passed in four 32-bit registers, B as a 64-bit descriptor (RS).
+struct MMA_64x8x32_S32U8S8_RS_TN
+{
+  using DRegisters = void;  // no separate D register set; results land in the read-write accumulators below
+  using ARegisters = uint32_t[4];  // four 32-bit registers holding the A fragment (a0..a3)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for operand B
+  using CRegisters = uint32_t[4];  // four 32-bit accumulator registers (d0..d3)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,  // A fragment, asm operands %4..%7
+      uint64_t const& desc_b,  // descriptor for B, asm operand %8
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,  // accumulators, asm operands %0..%3
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the line number and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %9, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.s8 "  // no .satfinite: per the PTX ISA the s32 accumulation wraps on overflow
+      "{%0,  %1,  %2,  %3},"  // accumulators d0..d3
+      "{%4,  %5,  %6,  %7},"  // A registers a0..a3
+      " %8,"  // desc_b
+      " p;\n"  // scale-d predicate; per the PTX ISA, true => D = A*B + D, false => D = A*B
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)  // read-write ("+r") so prior accumulator contents are visible to the instruction
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));  // %9
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A macro is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x32 TN S32+=U8*S8, saturating (.satfinite) variant; A is passed in four 32-bit registers, B as a 64-bit descriptor (RS).
+struct MMA_64x8x32_S32U8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D register set; results land in the read-write accumulators below
+  using ARegisters = uint32_t[4];  // four 32-bit registers holding the A fragment (a0..a3)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for operand B
+  using CRegisters = uint32_t[4];  // four 32-bit accumulator registers (d0..d3)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,  // A fragment, asm operands %4..%7
+      uint64_t const& desc_b,  // descriptor for B, asm operand %8
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,  // accumulators, asm operands %0..%3
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the line number and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %9, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.s8.satfinite "  // per the PTX ISA, .satfinite clamps the s32 result on overflow
+      "{%0,  %1,  %2,  %3},"  // accumulators d0..d3
+      "{%4,  %5,  %6,  %7},"  // A registers a0..a3
+      " %8,"  // desc_b
+      " p;\n"  // scale-d predicate; per the PTX ISA, true => D = A*B + D, false => D = A*B
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)  // read-write ("+r") so prior accumulator contents are visible to the instruction
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));  // %9
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A macro is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x32 TN S32+=U8*S8 (no .satfinite qualifier); A is passed in four 32-bit registers, B as a 64-bit descriptor (RS).
+struct MMA_64x16x32_S32U8S8_RS_TN
+{
+  using DRegisters = void;  // no separate D register set; results land in the read-write accumulators below
+  using ARegisters = uint32_t[4];  // four 32-bit registers holding the A fragment (a0..a3)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for operand B
+  using CRegisters = uint32_t[8];  // eight 32-bit accumulator registers (d0..d7)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,  // A fragment, asm operands %8..%11
+      uint64_t const& desc_b,  // descriptor for B, asm operand %12
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,  // accumulators, asm operands %0..%7
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the line number and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %13, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n16k32.s32.u8.s8 "  // no .satfinite: per the PTX ISA the s32 accumulation wraps on overflow
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // accumulators d0..d7
+      "{%8,  %9,  %10, %11},"  // A registers a0..a3
+      " %12,"  // desc_b
+      " p;\n"  // scale-d predicate; per the PTX ISA, true => D = A*B + D, false => D = A*B
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),  // read-write ("+r") so prior accumulator contents are visible to the instruction
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));  // %13
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A macro is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x32 TN S32+=U8*S8, saturating (.satfinite) variant; A is passed in four 32-bit registers, B as a 64-bit descriptor (RS).
+struct MMA_64x16x32_S32U8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D register set; results land in the read-write accumulators below
+  using ARegisters = uint32_t[4];  // four 32-bit registers holding the A fragment (a0..a3)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for operand B
+  using CRegisters = uint32_t[8];  // eight 32-bit accumulator registers (d0..d7)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,  // A fragment, asm operands %8..%11
+      uint64_t const& desc_b,  // descriptor for B, asm operand %12
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,  // accumulators, asm operands %0..%7
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the line number and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %13, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n16k32.s32.u8.s8.satfinite "  // per the PTX ISA, .satfinite clamps the s32 result on overflow
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // accumulators d0..d7
+      "{%8,  %9,  %10, %11},"  // A registers a0..a3
+      " %12,"  // desc_b
+      " p;\n"  // scale-d predicate; per the PTX ISA, true => D = A*B + D, false => D = A*B
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),  // read-write ("+r") so prior accumulator contents are visible to the instruction
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));  // %13
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A macro is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x32 TN S32+=U8*S8 (no .satfinite qualifier); A is passed in four 32-bit registers, B as a 64-bit descriptor (RS).
+struct MMA_64x32x32_S32U8S8_RS_TN
+{
+  using DRegisters = void;  // no separate D register set; results land in the read-write accumulators below
+  using ARegisters = uint32_t[4];  // four 32-bit registers holding the A fragment (a00..a03)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for operand B
+  using CRegisters = uint32_t[16];  // sixteen 32-bit accumulator registers (d00..d15)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A fragment, asm operands %16..%19
+      uint64_t const& desc_b,  // descriptor for B, asm operand %20
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // accumulators, asm operands %0..%15
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the line number and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %21, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n32k32.s32.u8.s8 "  // no .satfinite: per the PTX ISA the s32 accumulation wraps on overflow
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      "{%16, %17, %18, %19},"  // A registers a00..a03
+      " %20,"  // desc_b
+      " p;\n"  // scale-d predicate; per the PTX ISA, true => D = A*B + D, false => D = A*B
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write ("+r") so prior accumulator contents are visible to the instruction
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));  // %21
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A macro is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x32 TN S32+=U8*S8, saturating (.satfinite) variant; A is passed in four 32-bit registers, B as a 64-bit descriptor (RS).
+struct MMA_64x32x32_S32U8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D register set; results land in the read-write accumulators below
+  using ARegisters = uint32_t[4];  // four 32-bit registers holding the A fragment (a00..a03)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for operand B
+  using CRegisters = uint32_t[16];  // sixteen 32-bit accumulator registers (d00..d15)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A fragment, asm operands %16..%19
+      uint64_t const& desc_b,  // descriptor for B, asm operand %20
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // accumulators, asm operands %0..%15
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the line number and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %21, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n32k32.s32.u8.s8.satfinite "  // per the PTX ISA, .satfinite clamps the s32 result on overflow
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      "{%16, %17, %18, %19},"  // A registers a00..a03
+      " %20,"  // desc_b
+      " p;\n"  // scale-d predicate; per the PTX ISA, true => D = A*B + D, false => D = A*B
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write ("+r") so prior accumulator contents are visible to the instruction
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));  // %21
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A macro is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x32 TN S32+=U8*S8 (no .satfinite qualifier); A is passed in four 32-bit registers, B as a 64-bit descriptor (RS).
+struct MMA_64x64x32_S32U8S8_RS_TN
+{
+  using DRegisters = void;  // no separate D register set; results land in the read-write accumulators below
+  using ARegisters = uint32_t[4];  // four 32-bit registers holding the A fragment (a00..a03)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for operand B
+  using CRegisters = uint32_t[32];  // thirty-two 32-bit accumulator registers (d00..d31)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A fragment, asm operands %32..%35
+      uint64_t const& desc_b,  // descriptor for B, asm operand %36
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // accumulators, asm operands %0..%31
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the line number and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %37, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n64k32.s32.u8.s8 "  // no .satfinite: per the PTX ISA the s32 accumulation wraps on overflow
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      "{%32, %33, %34, %35},"  // A registers a00..a03
+      " %36,"  // desc_b
+      " p;\n"  // scale-d predicate; per the PTX ISA, true => D = A*B + D, false => D = A*B
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write ("+r") so prior accumulator contents are visible to the instruction
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));  // %37
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A macro is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x32 TN S32+=U8*S8, .satfinite variant; A passed in registers (a00..a03), B passed as 64-bit desc_b
+struct MMA_64x64x32_S32U8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;          // no separate D fragment: results land in the C accumulators ("+r" operands below)
+  using ARegisters = uint32_t[4];   // A operand: four 32-bit values (a00..a03)
+  using BRegisters = uint64_t[1];   // B operand: one 64-bit value (desc_b)
+  using CRegisters = uint32_t[32];  // accumulators: 32 32-bit values (d00..d31)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and tested != 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register local to this PTX block
+      "setp.ne.b32 p, %37, 0;\n"  // p = (scale_D != 0); %37 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n64k32.s32.u8.s8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%31 -> d00..d31
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      "{%32, %33, %34, %35},"  // a00..a03
+      " %36,"  // desc_b
+      " p;\n"  // predicate derived from scale_D above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write accumulator operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // read-only inputs: A, then desc_b, then scale_D
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted, only the invalid-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x32 TN S32+=U8*S8; A passed in registers (a00..a03), B passed as 64-bit desc_b
+struct MMA_64x96x32_S32U8S8_RS_TN
+{
+  using DRegisters = void;          // no separate D fragment: results land in the C accumulators ("+r" operands below)
+  using ARegisters = uint32_t[4];   // A operand: four 32-bit values (a00..a03)
+  using BRegisters = uint64_t[1];   // B operand: one 64-bit value (desc_b)
+  using CRegisters = uint32_t[48];  // accumulators: 48 32-bit values (d00..d47)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and tested != 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register local to this PTX block
+      "setp.ne.b32 p, %53, 0;\n"  // p = (scale_D != 0); %53 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n96k32.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%47 -> d00..d47
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
+      "{%48,  %49,  %50,  %51},"  // a00..a03
+      " %52,"  // desc_b
+      " p;\n"  // predicate derived from scale_D above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write accumulator operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // read-only inputs: A, then desc_b, then scale_D
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted, only the invalid-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x32 TN S32+=U8*S8, .satfinite variant; A passed in registers (a00..a03), B passed as 64-bit desc_b
+struct MMA_64x96x32_S32U8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;          // no separate D fragment: results land in the C accumulators ("+r" operands below)
+  using ARegisters = uint32_t[4];   // A operand: four 32-bit values (a00..a03)
+  using BRegisters = uint64_t[1];   // B operand: one 64-bit value (desc_b)
+  using CRegisters = uint32_t[48];  // accumulators: 48 32-bit values (d00..d47)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and tested != 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register local to this PTX block
+      "setp.ne.b32 p, %53, 0;\n"  // p = (scale_D != 0); %53 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n96k32.s32.u8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%47 -> d00..d47
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
+      "{%48,  %49,  %50,  %51},"  // a00..a03
+      " %52,"  // desc_b
+      " p;\n"  // predicate derived from scale_D above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write accumulator operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // read-only inputs: A, then desc_b, then scale_D
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted, only the invalid-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x32 TN S32+=U8*S8; A passed in registers (a00..a03), B passed as 64-bit desc_b
+struct MMA_64x128x32_S32U8S8_RS_TN
+{
+  using DRegisters = void;          // no separate D fragment: results land in the C accumulators ("+r" operands below)
+  using ARegisters = uint32_t[4];   // A operand: four 32-bit values (a00..a03)
+  using BRegisters = uint64_t[1];   // B operand: one 64-bit value (desc_b)
+  using CRegisters = uint32_t[64];  // accumulators: 64 32-bit values (d00..d63)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and tested != 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register local to this PTX block
+      "setp.ne.b32 p, %69, 0;\n"  // p = (scale_D != 0); %69 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n128k32.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%63 -> d00..d63
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      "{%64,  %65,  %66,  %67},"  // a00..a03
+      " %68,"  // desc_b
+      " p;\n"  // predicate derived from scale_D above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write accumulator operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // read-only inputs: A, then desc_b, then scale_D
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted, only the invalid-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x32 TN S32+=U8*S8, .satfinite variant; A passed in registers (a00..a03), B passed as 64-bit desc_b
+struct MMA_64x128x32_S32U8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;          // no separate D fragment: results land in the C accumulators ("+r" operands below)
+  using ARegisters = uint32_t[4];   // A operand: four 32-bit values (a00..a03)
+  using BRegisters = uint64_t[1];   // B operand: one 64-bit value (desc_b)
+  using CRegisters = uint32_t[64];  // accumulators: 64 32-bit values (d00..d63)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and tested != 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register local to this PTX block
+      "setp.ne.b32 p, %69, 0;\n"  // p = (scale_D != 0); %69 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n128k32.s32.u8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%63 -> d00..d63
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      "{%64,  %65,  %66,  %67},"  // a00..a03
+      " %68,"  // desc_b
+      " p;\n"  // predicate derived from scale_D above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write accumulator operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // read-only inputs: A, then desc_b, then scale_D
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted, only the invalid-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x32 TN S32+=U8*S8; A passed in registers (a00..a03), B passed as 64-bit desc_b
+struct MMA_64x192x32_S32U8S8_RS_TN
+{
+  using DRegisters = void;          // no separate D fragment: results land in the C accumulators ("+r" operands below)
+  using ARegisters = uint32_t[4];   // A operand: four 32-bit values (a00..a03)
+  using BRegisters = uint64_t[1];   // B operand: one 64-bit value (desc_b)
+  using CRegisters = uint32_t[96];  // accumulators: 96 32-bit values (d00..d95)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
+      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and tested != 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register local to this PTX block
+      "setp.ne.b32 p, %101, 0;\n"  // p = (scale_D != 0); %101 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n192k32.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%95 -> d00..d95
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      "{%96,  %97,  %98,  %99},"  // a00..a03
+      " %100,"  // desc_b
+      " p;\n"  // predicate derived from scale_D above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write accumulator operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
+        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
+        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // read-only inputs: A, then desc_b, then scale_D
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted, only the invalid-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x32 TN S32+=U8*S8, .satfinite variant; A passed in registers (a00..a03), B passed as 64-bit desc_b
+struct MMA_64x192x32_S32U8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;          // no separate D fragment: results land in the C accumulators ("+r" operands below)
+  using ARegisters = uint32_t[4];   // A operand: four 32-bit values (a00..a03)
+  using BRegisters = uint64_t[1];   // B operand: one 64-bit value (desc_b)
+  using CRegisters = uint32_t[96];  // accumulators: 96 32-bit values (d00..d95)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
+      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and tested != 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register local to this PTX block
+      "setp.ne.b32 p, %101, 0;\n"  // p = (scale_D != 0); %101 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n192k32.s32.u8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%95 -> d00..d95
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      "{%96,  %97,  %98,  %99},"  // a00..a03
+      " %100,"  // desc_b
+      " p;\n"  // predicate derived from scale_D above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write accumulator operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
+        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
+        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // read-only inputs: A, then desc_b, then scale_D
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted, only the invalid-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x32 TN S32+=U8*S8; A passed in registers (a000..a003), B passed as 64-bit desc_b
+struct MMA_64x256x32_S32U8S8_RS_TN
+{
+  using DRegisters = void;           // no separate D fragment: results land in the C accumulators ("+r" operands below)
+  using ARegisters = uint32_t[4];    // A operand: four 32-bit values (a000..a003)
+  using BRegisters = uint64_t[1];    // B operand: one 64-bit value (desc_b)
+  using CRegisters = uint32_t[128];  // accumulators: 128 32-bit values (d000..d127)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
+      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and tested != 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register local to this PTX block
+      "setp.ne.b32 p, %133, 0;\n"  // p = (scale_D != 0); %133 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n256k32.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%127 -> d000..d127
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      "{%128, %129, %130, %131},"  // a000..a003
+      " %132,"  // desc_b
+      " p;\n"  // predicate derived from scale_D above
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // read-write accumulator operands
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
+        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
+        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // read-only inputs: A, then desc_b, then scale_D
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted, only the invalid-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x32 TN S32+=U8*S8 (.satfinite variant; A in four 32-bit registers, B via one 64-bit descriptor)
+struct MMA_64x256x32_S32U8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D operand: accumulators are bound read-write ("+r") below
+  using ARegisters = uint32_t[4];  // A operand: a000..a003
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = uint32_t[128];  // accumulators: d000..d127
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
+      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this line number and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %133, 0;\n"  // p = (scale_D != 0); %133 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n256k32.s32.u8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      "{%128, %129, %130, %131},"  // a000..a003
+      " %132,"  // desc_b
+      " p;\n"  // predicate from scale_D; NOTE(review): PTX ISA scale-d semantics (false => D = A*B, no accumulate) -- confirm
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // %0..%127: accumulators, read-write
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
+        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
+        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %128..%131
+         "l"(desc_b),  // %132
+         "r"(int32_t(scale_D)));  // %133
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled in only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x32 TN S32+=U8*U8 (A and B each via one 64-bit descriptor; 4 x 32-bit accumulators)
+struct MMA_64x8x32_S32U8U8_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: accumulators are bound read-write ("+r") below
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = uint32_t[4];  // accumulators: d0..d3
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %6, 0;\n"  // p = (scale_D != 0); %6 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.u8 "
+      "{%0,  %1,  %2,  %3},"  // d0..d3
+      " %4,"  // desc_a
+      " %5,"  // desc_b
+      " p;\n"  // predicate from scale_D; NOTE(review): PTX ISA scale-d semantics (false => D = A*B, no accumulate) -- confirm
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)  // %0..%3: accumulators, read-write
+      :  "l"(desc_a),  // %4
+         "l"(desc_b),  // %5
+         "r"(int32_t(scale_D)));  // %6
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled in only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x32 TN S32+=U8*U8 (.satfinite variant; A and B each via one 64-bit descriptor; 4 x 32-bit accumulators)
+struct MMA_64x8x32_S32U8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D operand: accumulators are bound read-write ("+r") below
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = uint32_t[4];  // accumulators: d0..d3
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %6, 0;\n"  // p = (scale_D != 0); %6 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.u8.satfinite "
+      "{%0,  %1,  %2,  %3},"  // d0..d3
+      " %4,"  // desc_a
+      " %5,"  // desc_b
+      " p;\n"  // predicate from scale_D; NOTE(review): PTX ISA scale-d semantics (false => D = A*B, no accumulate) -- confirm
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)  // %0..%3: accumulators, read-write
+      :  "l"(desc_a),  // %4
+         "l"(desc_b),  // %5
+         "r"(int32_t(scale_D)));  // %6
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled in only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x32 TN S32+=U8*U8 (A and B each via one 64-bit descriptor; 8 x 32-bit accumulators)
+struct MMA_64x16x32_S32U8U8_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: accumulators are bound read-write ("+r") below
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = uint32_t[8];  // accumulators: d0..d7
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %10, 0;\n"  // p = (scale_D != 0); %10 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n16k32.s32.u8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // d0..d7
+      " %8,"  // desc_a
+      " %9,"  // desc_b
+      " p;\n"  // predicate from scale_D; NOTE(review): PTX ISA scale-d semantics (false => D = A*B, no accumulate) -- confirm
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),  // %0..%7: accumulators, read-write
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "l"(desc_a),  // %8
+         "l"(desc_b),  // %9
+         "r"(int32_t(scale_D)));  // %10
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled in only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x32 TN S32+=U8*U8 (.satfinite variant; A and B each via one 64-bit descriptor; 8 x 32-bit accumulators)
+struct MMA_64x16x32_S32U8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D operand: accumulators are bound read-write ("+r") below
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = uint32_t[8];  // accumulators: d0..d7
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %10, 0;\n"  // p = (scale_D != 0); %10 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n16k32.s32.u8.u8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // d0..d7
+      " %8,"  // desc_a
+      " %9,"  // desc_b
+      " p;\n"  // predicate from scale_D; NOTE(review): PTX ISA scale-d semantics (false => D = A*B, no accumulate) -- confirm
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),  // %0..%7: accumulators, read-write
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "l"(desc_a),  // %8
+         "l"(desc_b),  // %9
+         "r"(int32_t(scale_D)));  // %10
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled in only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x32 TN S32+=U8*U8 (A and B each via one 64-bit descriptor; 16 x 32-bit accumulators)
+struct MMA_64x32x32_S32U8U8_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: accumulators are bound read-write ("+r") below
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = uint32_t[16];  // accumulators: d00..d15
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %18, 0;\n"  // p = (scale_D != 0); %18 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n32k32.s32.u8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      " %16,"  // desc_a
+      " %17,"  // desc_b
+      " p;\n"  // predicate from scale_D; NOTE(review): PTX ISA scale-d semantics (false => D = A*B, no accumulate) -- confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%15: accumulators, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "l"(desc_a),  // %16
+         "l"(desc_b),  // %17
+         "r"(int32_t(scale_D)));  // %18
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled in only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x32 TN S32+=U8*U8 (.satfinite variant; A and B each via one 64-bit descriptor; 16 x 32-bit accumulators)
+struct MMA_64x32x32_S32U8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D operand: accumulators are bound read-write ("+r") below
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = uint32_t[16];  // accumulators: d00..d15
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %18, 0;\n"  // p = (scale_D != 0); %18 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n32k32.s32.u8.u8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      " %16,"  // desc_a
+      " %17,"  // desc_b
+      " p;\n"  // predicate from scale_D; NOTE(review): PTX ISA scale-d semantics (false => D = A*B, no accumulate) -- confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%15: accumulators, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "l"(desc_a),  // %16
+         "l"(desc_b),  // %17
+         "r"(int32_t(scale_D)));  // %18
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled in only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x32 TN S32+=U8*U8 (A and B each via one 64-bit descriptor; 32 x 32-bit accumulators)
+struct MMA_64x64x32_S32U8U8_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: accumulators are bound read-write ("+r") below
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = uint32_t[32];  // accumulators: d00..d31
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %34, 0;\n"  // p = (scale_D != 0); %34 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n64k32.s32.u8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      " %32,"  // desc_a
+      " %33,"  // desc_b
+      " p;\n"  // predicate from scale_D; NOTE(review): PTX ISA scale-d semantics (false => D = A*B, no accumulate) -- confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%31: accumulators, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "l"(desc_a),  // %32
+         "l"(desc_b),  // %33
+         "r"(int32_t(scale_D)));  // %34
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled in only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x32 TN S32+=U8*U8 (.satfinite variant; A and B each via one 64-bit descriptor; 32 x 32-bit accumulators)
+struct MMA_64x64x32_S32U8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D operand: accumulators are bound read-write ("+r") below
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = uint32_t[32];  // accumulators: d00..d31
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %34, 0;\n"  // p = (scale_D != 0); %34 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n64k32.s32.u8.u8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      " %32,"  // desc_a
+      " %33,"  // desc_b
+      " p;\n"  // predicate from scale_D; NOTE(review): PTX ISA scale-d semantics (false => D = A*B, no accumulate) -- confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%31: accumulators, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "l"(desc_a),  // %32
+         "l"(desc_b),  // %33
+         "r"(int32_t(scale_D)));  // %34
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled in only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x32 TN S32+=U8*U8 (A and B each via one 64-bit descriptor; 48 x 32-bit accumulators)
+struct MMA_64x96x32_S32U8U8_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: accumulators are bound read-write ("+r") below
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = uint32_t[48];  // accumulators: d00..d47
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %50, 0;\n"  // p = (scale_D != 0); %50 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n96k32.s32.u8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"
+      " %48,"  // desc_a
+      " %49,"  // desc_b
+      " p;\n"  // predicate from scale_D; NOTE(review): PTX ISA scale-d semantics (false => D = A*B, no accumulate) -- confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%47: accumulators, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "l"(desc_a),  // %48
+         "l"(desc_b),  // %49
+         "r"(int32_t(scale_D)));  // %50
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled in only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x32 TN S32+=U8*U8 (.satfinite variant; A and B each via one 64-bit descriptor; 48 x 32-bit accumulators)
+struct MMA_64x96x32_S32U8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D operand: accumulators are bound read-write ("+r") below
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = uint32_t[48];  // accumulators: d00..d47
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %50, 0;\n"  // p = (scale_D != 0); %50 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n96k32.s32.u8.u8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"
+      " %48,"  // desc_a
+      " %49,"  // desc_b
+      " p;\n"  // predicate from scale_D; NOTE(review): PTX ISA scale-d semantics (false => D = A*B, no accumulate) -- confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%47: accumulators, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "l"(desc_a),  // %48
+         "l"(desc_b),  // %49
+         "r"(int32_t(scale_D)));  // %50
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled in only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x32 TN S32+=U8*U8 (A and B each via one 64-bit descriptor; 64 x 32-bit accumulators)
+struct MMA_64x128x32_S32U8U8_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: accumulators are bound read-write ("+r") below
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = uint32_t[64];  // accumulators: d00..d63
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %66, 0;\n"  // p = (scale_D != 0); %66 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n128k32.s32.u8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      " %64,"  // desc_a
+      " %65,"  // desc_b
+      " p;\n"  // predicate from scale_D; NOTE(review): PTX ISA scale-d semantics (false => D = A*B, no accumulate) -- confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%63: accumulators, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "l"(desc_a),  // %64
+         "l"(desc_b),  // %65
+         "r"(int32_t(scale_D)));  // %66
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled in only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x32 TN S32+=U8*U8, saturating variant: issues the .satfinite form of wgmma m64n128k32 with A and B passed as 64-bit descriptors
+struct MMA_64x128x32_S32U8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;          // fma() takes no separate destination list; the accumulators are updated in place
+  using ARegisters = uint64_t[1];   // one 64-bit value, matching fma()'s desc_a
+  using BRegisters = uint64_t[1];   // one 64-bit value, matching fma()'s desc_b
+  using CRegisters = uint32_t[64];  // 64 32-bit values, matching fma()'s d00..d63
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for operand A (asm operand %64)
+      uint64_t const& desc_b,  // 64-bit descriptor for operand B (asm operand %65)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d63: accumulators, read and written by the asm
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // defaults to One; compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"            // declare predicate register p, scoped to this PTX block
+      "setp.ne.b32 p, %66, 0;\n"   // p = (scale_D != 0); %66 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n128k32.s32.u8.u8.satfinite "  // s32 accumulators, u8 x u8 inputs, .satfinite qualifier
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%63 are the accumulators d00..d63
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      " %64,"    // desc_a
+      " %65,"    // desc_b
+      " p;\n"    // scale-d predicate; NOTE(review): per the PTX wgmma spec a false predicate presumably makes the instruction ignore the incoming accumulator values - confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both an input and an output
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "l"(desc_a),             // %64, 64-bit register operand
+         "l"(desc_b),             // %65, 64-bit register operand
+         "r"(int32_t(scale_D)));  // %66, scale_D converted to a 32-bit integer
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x32 TN S32+=U8*U8: issues wgmma m64n192k32 (no .satfinite) with A and B passed as 64-bit descriptors
+struct MMA_64x192x32_S32U8U8_SS_TN
+{
+  using DRegisters = void;          // fma() takes no separate destination list; the accumulators are updated in place
+  using ARegisters = uint64_t[1];   // one 64-bit value, matching fma()'s desc_a
+  using BRegisters = uint64_t[1];   // one 64-bit value, matching fma()'s desc_b
+  using CRegisters = uint32_t[96];  // 96 32-bit values, matching fma()'s d00..d95
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for operand A (asm operand %96)
+      uint64_t const& desc_b,  // 64-bit descriptor for operand B (asm operand %97)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d95: accumulators, read and written by the asm
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
+      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // defaults to One; compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"            // declare predicate register p, scoped to this PTX block
+      "setp.ne.b32 p, %98, 0;\n"   // p = (scale_D != 0); %98 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n192k32.s32.u8.u8 "  // s32 accumulators, u8 x u8 inputs
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%95 are the accumulators d00..d95
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      " %96,"    // desc_a
+      " %97,"    // desc_b
+      " p;\n"    // scale-d predicate; NOTE(review): per the PTX wgmma spec a false predicate presumably makes the instruction ignore the incoming accumulator values - confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both an input and an output
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
+        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
+        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
+      :  "l"(desc_a),             // %96, 64-bit register operand
+         "l"(desc_b),             // %97, 64-bit register operand
+         "r"(int32_t(scale_D)));  // %98, scale_D converted to a 32-bit integer
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x32 TN S32+=U8*U8, saturating variant: issues the .satfinite form of wgmma m64n192k32 with A and B passed as 64-bit descriptors
+struct MMA_64x192x32_S32U8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;          // fma() takes no separate destination list; the accumulators are updated in place
+  using ARegisters = uint64_t[1];   // one 64-bit value, matching fma()'s desc_a
+  using BRegisters = uint64_t[1];   // one 64-bit value, matching fma()'s desc_b
+  using CRegisters = uint32_t[96];  // 96 32-bit values, matching fma()'s d00..d95
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for operand A (asm operand %96)
+      uint64_t const& desc_b,  // 64-bit descriptor for operand B (asm operand %97)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d95: accumulators, read and written by the asm
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
+      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // defaults to One; compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"            // declare predicate register p, scoped to this PTX block
+      "setp.ne.b32 p, %98, 0;\n"   // p = (scale_D != 0); %98 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n192k32.s32.u8.u8.satfinite "  // s32 accumulators, u8 x u8 inputs, .satfinite qualifier
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%95 are the accumulators d00..d95
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      " %96,"    // desc_a
+      " %97,"    // desc_b
+      " p;\n"    // scale-d predicate; NOTE(review): per the PTX wgmma spec a false predicate presumably makes the instruction ignore the incoming accumulator values - confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both an input and an output
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
+        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
+        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
+      :  "l"(desc_a),             // %96, 64-bit register operand
+         "l"(desc_b),             // %97, 64-bit register operand
+         "r"(int32_t(scale_D)));  // %98, scale_D converted to a 32-bit integer
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x32 TN S32+=U8*U8: issues wgmma m64n256k32 (no .satfinite) with A and B passed as 64-bit descriptors
+struct MMA_64x256x32_S32U8U8_SS_TN
+{
+  using DRegisters = void;           // fma() takes no separate destination list; the accumulators are updated in place
+  using ARegisters = uint64_t[1];    // one 64-bit value, matching fma()'s desc_a
+  using BRegisters = uint64_t[1];    // one 64-bit value, matching fma()'s desc_b
+  using CRegisters = uint32_t[128];  // 128 32-bit values, matching fma()'s d000..d127
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for operand A (asm operand %128)
+      uint64_t const& desc_b,  // 64-bit descriptor for operand B (asm operand %129)
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,  // d000..d127: accumulators, read and written by the asm
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
+      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // defaults to One; compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"             // declare predicate register p, scoped to this PTX block
+      "setp.ne.b32 p, %130, 0;\n"   // p = (scale_D != 0); %130 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n256k32.s32.u8.u8 "  // s32 accumulators, u8 x u8 inputs
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%127 are the accumulators d000..d127
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      " %128,"   // desc_a
+      " %129,"   // desc_b
+      " p;\n"    // scale-d predicate; NOTE(review): per the PTX wgmma spec a false predicate presumably makes the instruction ignore the incoming accumulator values - confirm
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // "+r": each accumulator is both an input and an output
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
+        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
+        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
+      :  "l"(desc_a),             // %128, 64-bit register operand
+         "l"(desc_b),             // %129, 64-bit register operand
+         "r"(int32_t(scale_D)));  // %130, scale_D converted to a 32-bit integer
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x32 TN S32+=U8*U8, saturating variant: issues the .satfinite form of wgmma m64n256k32 with A and B passed as 64-bit descriptors
+struct MMA_64x256x32_S32U8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;           // fma() takes no separate destination list; the accumulators are updated in place
+  using ARegisters = uint64_t[1];    // one 64-bit value, matching fma()'s desc_a
+  using BRegisters = uint64_t[1];    // one 64-bit value, matching fma()'s desc_b
+  using CRegisters = uint32_t[128];  // 128 32-bit values, matching fma()'s d000..d127
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for operand A (asm operand %128)
+      uint64_t const& desc_b,  // 64-bit descriptor for operand B (asm operand %129)
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,  // d000..d127: accumulators, read and written by the asm
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
+      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // defaults to One; compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"             // declare predicate register p, scoped to this PTX block
+      "setp.ne.b32 p, %130, 0;\n"   // p = (scale_D != 0); %130 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n256k32.s32.u8.u8.satfinite "  // s32 accumulators, u8 x u8 inputs, .satfinite qualifier
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%127 are the accumulators d000..d127
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      " %128,"   // desc_a
+      " %129,"   // desc_b
+      " p;\n"    // scale-d predicate; NOTE(review): per the PTX wgmma spec a false predicate presumably makes the instruction ignore the incoming accumulator values - confirm
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // "+r": each accumulator is both an input and an output
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
+        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
+        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
+      :  "l"(desc_a),             // %128, 64-bit register operand
+         "l"(desc_b),             // %129, 64-bit register operand
+         "r"(int32_t(scale_D)));  // %130, scale_D converted to a 32-bit integer
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x32 TN S32+=U8*U8: issues wgmma m64n8k32 (no .satfinite) with A passed in four 32-bit registers and B as a 64-bit descriptor
+struct MMA_64x8x32_S32U8U8_RS_TN
+{
+  using DRegisters = void;         // fma() takes no separate destination list; the accumulators are updated in place
+  using ARegisters = uint32_t[4];  // four 32-bit values, matching fma()'s a0..a3
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma()'s desc_b
+  using CRegisters = uint32_t[4];  // four 32-bit values, matching fma()'s d0..d3
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,  // operand A registers (asm operands %4..%7)
+      uint64_t const& desc_b,  // 64-bit descriptor for operand B (asm operand %8)
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,  // accumulators, read and written by the asm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // defaults to One; compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"          // declare predicate register p, scoped to this PTX block
+      "setp.ne.b32 p, %9, 0;\n"  // p = (scale_D != 0); %9 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.u8 "  // s32 accumulators, u8 x u8 inputs
+      "{%0,  %1,  %2,  %3},"     // accumulators d0..d3
+      "{%4,  %5,  %6,  %7},"     // operand A registers a0..a3
+      " %8,"                     // desc_b
+      " p;\n"                    // scale-d predicate; NOTE(review): per the PTX wgmma spec a false predicate presumably makes the instruction ignore the incoming accumulator values - confirm
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)     // "+r": each accumulator is both an input and an output
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),    // %4..%7, 32-bit register operands
+         "l"(desc_b),                              // %8, 64-bit register operand
+         "r"(int32_t(scale_D)));                   // %9, scale_D converted to a 32-bit integer
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x32 TN S32+=U8*U8, saturating variant: issues the .satfinite form of wgmma m64n8k32 with A in four 32-bit registers and B as a 64-bit descriptor
+struct MMA_64x8x32_S32U8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;         // fma() takes no separate destination list; the accumulators are updated in place
+  using ARegisters = uint32_t[4];  // four 32-bit values, matching fma()'s a0..a3
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma()'s desc_b
+  using CRegisters = uint32_t[4];  // four 32-bit values, matching fma()'s d0..d3
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,  // operand A registers (asm operands %4..%7)
+      uint64_t const& desc_b,  // 64-bit descriptor for operand B (asm operand %8)
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,  // accumulators, read and written by the asm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // defaults to One; compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"          // declare predicate register p, scoped to this PTX block
+      "setp.ne.b32 p, %9, 0;\n"  // p = (scale_D != 0); %9 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.u8.satfinite "  // s32 accumulators, u8 x u8 inputs, .satfinite qualifier
+      "{%0,  %1,  %2,  %3},"     // accumulators d0..d3
+      "{%4,  %5,  %6,  %7},"     // operand A registers a0..a3
+      " %8,"                     // desc_b
+      " p;\n"                    // scale-d predicate; NOTE(review): per the PTX wgmma spec a false predicate presumably makes the instruction ignore the incoming accumulator values - confirm
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)     // "+r": each accumulator is both an input and an output
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),    // %4..%7, 32-bit register operands
+         "l"(desc_b),                              // %8, 64-bit register operand
+         "r"(int32_t(scale_D)));                   // %9, scale_D converted to a 32-bit integer
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x32 TN S32+=U8*U8: issues wgmma m64n16k32 (no .satfinite) with A passed in four 32-bit registers and B as a 64-bit descriptor
+struct MMA_64x16x32_S32U8U8_RS_TN
+{
+  using DRegisters = void;         // fma() takes no separate destination list; the accumulators are updated in place
+  using ARegisters = uint32_t[4];  // four 32-bit values, matching fma()'s a0..a3
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma()'s desc_b
+  using CRegisters = uint32_t[8];  // eight 32-bit values, matching fma()'s d0..d7
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,  // operand A registers (asm operands %8..%11)
+      uint64_t const& desc_b,  // 64-bit descriptor for operand B (asm operand %12)
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,  // d0..d7: accumulators, read and written by the asm
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // defaults to One; compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"           // declare predicate register p, scoped to this PTX block
+      "setp.ne.b32 p, %13, 0;\n"  // p = (scale_D != 0); %13 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n16k32.s32.u8.u8 "  // s32 accumulators, u8 x u8 inputs
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // accumulators d0..d7
+      "{%8,  %9,  %10, %11},"     // operand A registers a0..a3
+      " %12,"                     // desc_b
+      " p;\n"                     // scale-d predicate; NOTE(review): per the PTX wgmma spec a false predicate presumably makes the instruction ignore the incoming accumulator values - confirm
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),    // "+r": each accumulator is both an input and an output
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),    // %8..%11, 32-bit register operands
+         "l"(desc_b),                              // %12, 64-bit register operand
+         "r"(int32_t(scale_D)));                   // %13, scale_D converted to a 32-bit integer
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x32 TN S32+=U8*U8, saturating variant: issues the .satfinite form of wgmma m64n16k32 with A in four 32-bit registers and B as a 64-bit descriptor
+struct MMA_64x16x32_S32U8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;         // fma() takes no separate destination list; the accumulators are updated in place
+  using ARegisters = uint32_t[4];  // four 32-bit values, matching fma()'s a0..a3
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma()'s desc_b
+  using CRegisters = uint32_t[8];  // eight 32-bit values, matching fma()'s d0..d7
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,  // operand A registers (asm operands %8..%11)
+      uint64_t const& desc_b,  // 64-bit descriptor for operand B (asm operand %12)
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,  // d0..d7: accumulators, read and written by the asm
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // defaults to One; compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"           // declare predicate register p, scoped to this PTX block
+      "setp.ne.b32 p, %13, 0;\n"  // p = (scale_D != 0); %13 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n16k32.s32.u8.u8.satfinite "  // s32 accumulators, u8 x u8 inputs, .satfinite qualifier
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // accumulators d0..d7
+      "{%8,  %9,  %10, %11},"     // operand A registers a0..a3
+      " %12,"                     // desc_b
+      " p;\n"                     // scale-d predicate; NOTE(review): per the PTX wgmma spec a false predicate presumably makes the instruction ignore the incoming accumulator values - confirm
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),    // "+r": each accumulator is both an input and an output
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),    // %8..%11, 32-bit register operands
+         "l"(desc_b),                              // %12, 64-bit register operand
+         "r"(int32_t(scale_D)));                   // %13, scale_D converted to a 32-bit integer
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x32 TN S32+=U8*U8: issues wgmma.mma_async m64n32k32 with A in registers and B given by a 64-bit descriptor
+struct MMA_64x32x32_S32U8U8_RS_TN
+{
+  using DRegisters = void;  // no separate D fragment declared; TODO confirm D aliases C in the MMA traits
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[16];  // d00..d15, bound read-write ("+r") in the asm below
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %21, 0;\n"  // p = (int32_t(scale_D) != 0); %21 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n32k32.s32.u8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%15: accumulators d00..d15
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      "{%16, %17, %18, %19},"  // %16-%19: A registers a00..a03
+      " %20,"  // %20: desc_b
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x32 TN S32+=U8*U8, SATURATE variant: as MMA_64x32x32_S32U8U8_RS_TN but the instruction carries .satfinite
+struct MMA_64x32x32_S32U8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D fragment declared; TODO confirm D aliases C in the MMA traits
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[16];  // d00..d15, bound read-write ("+r") in the asm below
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %21, 0;\n"  // p = (int32_t(scale_D) != 0); %21 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n32k32.s32.u8.u8.satfinite "  // .satfinite: presumably saturating accumulation -- confirm in the PTX ISA
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%15: accumulators d00..d15
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      "{%16, %17, %18, %19},"  // %16-%19: A registers a00..a03
+      " %20,"  // %20: desc_b
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x32 TN S32+=U8*U8: issues wgmma.mma_async m64n64k32 with A in registers and B given by a 64-bit descriptor
+struct MMA_64x64x32_S32U8U8_RS_TN
+{
+  using DRegisters = void;  // no separate D fragment declared; TODO confirm D aliases C in the MMA traits
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[32];  // d00..d31, bound read-write ("+r") in the asm below
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %37, 0;\n"  // p = (int32_t(scale_D) != 0); %37 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n64k32.s32.u8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%31: accumulators d00..d31
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      "{%32, %33, %34, %35},"  // %32-%35: A registers a00..a03
+      " %36,"  // %36: desc_b
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x32 TN S32+=U8*U8, SATURATE variant: as MMA_64x64x32_S32U8U8_RS_TN but the instruction carries .satfinite
+struct MMA_64x64x32_S32U8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D fragment declared; TODO confirm D aliases C in the MMA traits
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[32];  // d00..d31, bound read-write ("+r") in the asm below
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %37, 0;\n"  // p = (int32_t(scale_D) != 0); %37 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n64k32.s32.u8.u8.satfinite "  // .satfinite: presumably saturating accumulation -- confirm in the PTX ISA
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%31: accumulators d00..d31
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      "{%32, %33, %34, %35},"  // %32-%35: A registers a00..a03
+      " %36,"  // %36: desc_b
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x32 TN S32+=U8*U8: issues wgmma.mma_async m64n96k32 with A in registers and B given by a 64-bit descriptor
+struct MMA_64x96x32_S32U8U8_RS_TN
+{
+  using DRegisters = void;  // no separate D fragment declared; TODO confirm D aliases C in the MMA traits
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[48];  // d00..d47, bound read-write ("+r") in the asm below
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %53, 0;\n"  // p = (int32_t(scale_D) != 0); %53 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n96k32.s32.u8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%47: accumulators d00..d47
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
+      "{%48,  %49,  %50,  %51},"  // %48-%51: A registers a00..a03
+      " %52,"  // %52: desc_b
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x32 TN S32+=U8*U8, SATURATE variant: as MMA_64x96x32_S32U8U8_RS_TN but the instruction carries .satfinite
+struct MMA_64x96x32_S32U8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D fragment declared; TODO confirm D aliases C in the MMA traits
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[48];  // d00..d47, bound read-write ("+r") in the asm below
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %53, 0;\n"  // p = (int32_t(scale_D) != 0); %53 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n96k32.s32.u8.u8.satfinite "  // .satfinite: presumably saturating accumulation -- confirm in the PTX ISA
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%47: accumulators d00..d47
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
+      "{%48,  %49,  %50,  %51},"  // %48-%51: A registers a00..a03
+      " %52,"  // %52: desc_b
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x32 TN S32+=U8*U8: issues wgmma.mma_async m64n128k32 with A in registers and B given by a 64-bit descriptor
+struct MMA_64x128x32_S32U8U8_RS_TN
+{
+  using DRegisters = void;  // no separate D fragment declared; TODO confirm D aliases C in the MMA traits
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[64];  // d00..d63, bound read-write ("+r") in the asm below
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %69, 0;\n"  // p = (int32_t(scale_D) != 0); %69 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n128k32.s32.u8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%63: accumulators d00..d63
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      "{%64,  %65,  %66,  %67},"  // %64-%67: A registers a00..a03
+      " %68,"  // %68: desc_b
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x32 TN S32+=U8*U8, SATURATE variant: as MMA_64x128x32_S32U8U8_RS_TN but the instruction carries .satfinite
+struct MMA_64x128x32_S32U8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D fragment declared; TODO confirm D aliases C in the MMA traits
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[64];  // d00..d63, bound read-write ("+r") in the asm below
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %69, 0;\n"  // p = (int32_t(scale_D) != 0); %69 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n128k32.s32.u8.u8.satfinite "  // .satfinite: presumably saturating accumulation -- confirm in the PTX ISA
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%63: accumulators d00..d63
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      "{%64,  %65,  %66,  %67},"  // %64-%67: A registers a00..a03
+      " %68,"  // %68: desc_b
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x32 TN S32+=U8*U8: issues wgmma.mma_async m64n192k32 with A in registers and B given by a 64-bit descriptor
+struct MMA_64x192x32_S32U8U8_RS_TN
+{
+  using DRegisters = void;  // no separate D fragment declared; TODO confirm D aliases C in the MMA traits
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[96];  // d00..d95, bound read-write ("+r") in the asm below
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
+      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %101, 0;\n"  // p = (int32_t(scale_D) != 0); %101 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n192k32.s32.u8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%95: accumulators d00..d95
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      "{%96,  %97,  %98,  %99},"  // %96-%99: A registers a00..a03
+      " %100,"  // %100: desc_b
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
+        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
+        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x32 TN S32+=U8*U8
+struct MMA_64x192x32_S32U8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[96];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
+      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %101, 0;\n"
+      "wgmma.mma_async.sync.aligned.m64n192k32.s32.u8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      "{%96,  %97,  %98,  %99},"
+      " %100,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
+        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
+        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x32 TN S32+=U8*U8 -- wraps PTX wgmma.mma_async m64n256k32.s32.u8.u8; A is passed in four 32-bit registers, B as a 64-bit descriptor.
+struct MMA_64x256x32_S32U8U8_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means D aliases the C accumulators -- confirm against the CuTe MMA traits
+  using ARegisters = uint32_t[4];  // matches a000..a003 of fma()
+  using BRegisters = uint64_t[1];  // matches desc_b of fma()
+  using CRegisters = uint32_t[128];  // matches d000..d127 of fma(), which are bound read-write ("+r")
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
+      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %133, 0;\n"  // p = (int32_t(scale_D) != 0); %133 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n256k32.s32.u8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%127: accumulators d000..d127
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      "{%128, %129, %130, %131},"  // A fragment: a000..a003
+      " %132,"  // desc_b, bound with the 64-bit "l" constraint
+      " p;\n"  // final operand: the predicate derived from scale_D
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // outputs: read-write 32-bit registers, operands %0..%127
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
+        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
+        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // inputs: operands %128..%131
+         "l"(desc_b),  // operand %132
+         "r"(int32_t(scale_D)));  // operand %133
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x32 TN S32+=U8*U8, `.satfinite` form -- wraps PTX wgmma.mma_async m64n256k32.s32.u8.u8.satfinite; A in four 32-bit registers, B as a 64-bit descriptor.
+struct MMA_64x256x32_S32U8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // NOTE(review): presumably means D aliases the C accumulators -- confirm against the CuTe MMA traits
+  using ARegisters = uint32_t[4];  // matches a000..a003 of fma()
+  using BRegisters = uint64_t[1];  // matches desc_b of fma()
+  using CRegisters = uint32_t[128];  // matches d000..d127 of fma(), which are bound read-write ("+r")
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
+      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %133, 0;\n"  // p = (int32_t(scale_D) != 0); %133 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n256k32.s32.u8.u8.satfinite "  // NOTE(review): .satfinite presumably saturates the s32 result -- confirm in the PTX ISA
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%127: accumulators d000..d127
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      "{%128, %129, %130, %131},"  // A fragment: a000..a003
+      " %132,"  // desc_b, bound with the 64-bit "l" constraint
+      " p;\n"  // final operand: the predicate derived from scale_D
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // outputs: read-write 32-bit registers, operands %0..%127
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
+        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
+        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // inputs: operands %128..%131
+         "l"(desc_b),  // operand %132
+         "r"(int32_t(scale_D)));  // operand %133
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x32 TN F16+=E4M3*E4M3 -- wraps PTX wgmma.mma_async m64n8k32.f16.e4m3.e4m3; A and B are both passed as 64-bit descriptors.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an immediate ("n") operand
+>
+struct MMA_64x8x32_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means D aliases the C accumulators -- confirm against the CuTe MMA traits
+  using ARegisters = uint64_t[1];  // matches desc_a of fma()
+  using BRegisters = uint64_t[1];  // matches desc_b of fma()
+  using CRegisters = uint32_t[2];  // matches d0..d1 of fma(); presumably packed f16 pairs -- TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %4, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n8k32.f16.e4m3.e4m3 "
+      "{%0, %1},"  // accumulators d0..d1
+      " %2,"  // desc_a
+      " %3,"  // desc_b
+      " p,  %5, %6;\n"  // predicate from scale_D, then immediates scaleA (%5) and scaleB (%6)
+    "}\n"
+      : "+r"(d0), "+r"(d1)  // outputs: read-write 32-bit registers
+      :  "l"(desc_a),  // inputs start here: 64-bit registers for the descriptors
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x32 TN F16+=E4M3*E4M3 -- wraps PTX wgmma.mma_async m64n8k32.f16.e4m3.e4m3; A is passed in four 32-bit registers, B as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an immediate ("n") operand
+>
+struct MMA_64x8x32_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means D aliases the C accumulators -- confirm against the CuTe MMA traits
+  using ARegisters = uint32_t[4];  // matches a0..a3 of fma()
+  using BRegisters = uint64_t[1];  // matches desc_b of fma()
+  using CRegisters = uint32_t[2];  // matches d0..d1 of fma(); presumably packed f16 pairs -- TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %7, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n8k32.f16.e4m3.e4m3 "
+      "{%0,  %1},"  // accumulators d0..d1
+      "{%2,  %3,  %4,  %5},"  // A fragment: a0..a3
+      " %6,"  // desc_b
+      " p,   %8,  %9;\n"  // predicate from scale_D, then immediates scaleA (%8) and scaleB (%9)
+    "}\n"
+      : "+r"(d0), "+r"(d1)  // outputs: read-write 32-bit registers
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),  // inputs start here
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x32 TN F32+=E4M3*E4M3 -- wraps PTX wgmma.mma_async m64n8k32.f32.e4m3.e4m3; A and B are both passed as 64-bit descriptors.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an immediate ("n") operand
+>
+struct MMA_64x8x32_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means D aliases the C accumulators -- confirm against the CuTe MMA traits
+  using ARegisters = uint64_t[1];  // matches desc_a of fma()
+  using BRegisters = uint64_t[1];  // matches desc_b of fma()
+  using CRegisters = float[4];  // matches d0..d3 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %6, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n8k32.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3},"  // accumulators d0..d3
+      " %4,"  // desc_a
+      " %5,"  // desc_b
+      " p,   %7,  %8;\n"  // predicate from scale_D, then immediates scaleA (%7) and scaleB (%8)
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)  // outputs: read-write float registers
+      :  "l"(desc_a),  // inputs start here: 64-bit registers for the descriptors
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x32 TN F32+=E4M3*E4M3 -- wraps PTX wgmma.mma_async m64n8k32.f32.e4m3.e4m3; A is passed in four 32-bit registers, B as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an immediate ("n") operand
+>
+struct MMA_64x8x32_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means D aliases the C accumulators -- confirm against the CuTe MMA traits
+  using ARegisters = uint32_t[4];  // matches a0..a3 of fma()
+  using BRegisters = uint64_t[1];  // matches desc_b of fma()
+  using CRegisters = float[4];  // matches d0..d3 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %9, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n8k32.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3},"  // accumulators d0..d3
+      "{%4,  %5,  %6,  %7},"  // A fragment: a0..a3
+      " %8,"  // desc_b
+      " p,   %10, %11;\n"  // predicate from scale_D, then immediates scaleA (%10) and scaleB (%11)
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)  // outputs: read-write float registers
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),  // inputs start here
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x32 TN F16+=E4M3*E4M3 -- wraps PTX wgmma.mma_async m64n16k32.f16.e4m3.e4m3; A and B are both passed as 64-bit descriptors.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an immediate ("n") operand
+>
+struct MMA_64x16x32_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means D aliases the C accumulators -- confirm against the CuTe MMA traits
+  using ARegisters = uint64_t[1];  // matches desc_a of fma()
+  using BRegisters = uint64_t[1];  // matches desc_b of fma()
+  using CRegisters = uint32_t[4];  // matches d0..d3 of fma(); presumably packed f16 pairs -- TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %6, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n16k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3},"  // accumulators d0..d3
+      " %4,"  // desc_a
+      " %5,"  // desc_b
+      " p,   %7,  %8;\n"  // predicate from scale_D, then immediates scaleA (%7) and scaleB (%8)
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)  // outputs: read-write 32-bit registers
+      :  "l"(desc_a),  // inputs start here: 64-bit registers for the descriptors
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x32 TN F16+=E4M3*E4M3 -- wraps PTX wgmma.mma_async m64n16k32.f16.e4m3.e4m3; A is passed in four 32-bit registers, B as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an immediate ("n") operand
+>
+struct MMA_64x16x32_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means D aliases the C accumulators -- confirm against the CuTe MMA traits
+  using ARegisters = uint32_t[4];  // matches a0..a3 of fma()
+  using BRegisters = uint64_t[1];  // matches desc_b of fma()
+  using CRegisters = uint32_t[4];  // matches d0..d3 of fma(); presumably packed f16 pairs -- TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %9, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n16k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3},"  // accumulators d0..d3
+      "{%4,  %5,  %6,  %7},"  // A fragment: a0..a3
+      " %8,"  // desc_b
+      " p,   %10, %11;\n"  // predicate from scale_D, then immediates scaleA (%10) and scaleB (%11)
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)  // outputs: read-write 32-bit registers
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),  // inputs start here
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x32 TN F32+=E4M3*E4M3 -- wraps PTX wgmma.mma_async m64n16k32.f32.e4m3.e4m3; A and B are both passed as 64-bit descriptors.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an immediate ("n") operand
+>
+struct MMA_64x16x32_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means D aliases the C accumulators -- confirm against the CuTe MMA traits
+  using ARegisters = uint64_t[1];  // matches desc_a of fma()
+  using BRegisters = uint64_t[1];  // matches desc_b of fma()
+  using CRegisters = float[8];  // matches d0..d7 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      float         & d4, float         & d5, float         & d6, float         & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %10, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n16k32.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // accumulators d0..d7
+      " %8,"  // desc_a
+      " %9,"  // desc_b
+      " p,   %11, %12;\n"  // predicate from scale_D, then immediates scaleA (%11) and scaleB (%12)
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),  // outputs: read-write float registers
+        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
+      :  "l"(desc_a),  // inputs start here: 64-bit registers for the descriptors
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x32 TN F32+=E4M3*E4M3 -- wraps PTX wgmma.mma_async m64n16k32.f32.e4m3.e4m3; A is passed in four 32-bit registers, B as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an immediate ("n") operand
+>
+struct MMA_64x16x32_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means D aliases the C accumulators -- confirm against the CuTe MMA traits
+  using ARegisters = uint32_t[4];  // matches a0..a3 of fma()
+  using BRegisters = uint64_t[1];  // matches desc_b of fma()
+  using CRegisters = float[8];  // matches d0..d7 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      float         & d4, float         & d5, float         & d6, float         & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %13, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n16k32.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // accumulators d0..d7
+      "{%8,  %9,  %10, %11},"  // A fragment: a0..a3
+      " %12,"  // desc_b
+      " p,   %14, %15;\n"  // predicate from scale_D, then immediates scaleA (%14) and scaleB (%15)
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),  // outputs: read-write float registers
+        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),  // inputs start here
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x32 TN F16+=E4M3*E4M3 -- wraps PTX wgmma.mma_async m64n32k32.f16.e4m3.e4m3; A and B are both passed as 64-bit descriptors.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an immediate ("n") operand
+>
+struct MMA_64x32x32_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means D aliases the C accumulators -- confirm against the CuTe MMA traits
+  using ARegisters = uint64_t[1];  // matches desc_a of fma()
+  using BRegisters = uint64_t[1];  // matches desc_b of fma()
+  using CRegisters = uint32_t[8];  // matches d0..d7 of fma(); presumably packed f16 pairs -- TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %10, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n32k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // accumulators d0..d7
+      " %8,"  // desc_a
+      " %9,"  // desc_b
+      " p,   %11, %12;\n"  // predicate from scale_D, then immediates scaleA (%11) and scaleB (%12)
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),  // outputs: read-write 32-bit registers
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "l"(desc_a),  // inputs start here: 64-bit registers for the descriptors
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x32 TN F16+=E4M3*E4M3 -- wraps PTX wgmma.mma_async m64n32k32.f16.e4m3.e4m3; A is passed in four 32-bit registers, B as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an immediate ("n") operand
+>
+struct MMA_64x32x32_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means D aliases the C accumulators -- confirm against the CuTe MMA traits
+  using ARegisters = uint32_t[4];  // matches a0..a3 of fma()
+  using BRegisters = uint64_t[1];  // matches desc_b of fma()
+  using CRegisters = uint32_t[8];  // matches d0..d7 of fma(); presumably packed f16 pairs -- TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %13, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n32k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // accumulators d0..d7
+      "{%8,  %9,  %10, %11},"  // A fragment: a0..a3
+      " %12,"  // desc_b
+      " p,   %14, %15;\n"  // predicate from scale_D, then immediates scaleA (%14) and scaleB (%15)
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),  // outputs: read-write 32-bit registers
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),  // inputs start here
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x32 TN F32+=E4M3*E4M3 -- wraps PTX wgmma.mma_async m64n32k32.f32.e4m3.e4m3; A and B are both passed as 64-bit descriptors.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an immediate ("n") operand
+>
+struct MMA_64x32x32_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means D aliases the C accumulators -- confirm against the CuTe MMA traits
+  using ARegisters = uint64_t[1];  // matches desc_a of fma()
+  using BRegisters = uint64_t[1];  // matches desc_b of fma()
+  using CRegisters = float[16];  // matches d00..d15 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %18, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n32k32.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%15: accumulators d00..d15
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      " %16,"  // desc_a
+      " %17,"  // desc_b
+      " p,   %19, %20;\n"  // predicate from scale_D, then immediates scaleA (%19) and scaleB (%20)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs: read-write float registers
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
+      :  "l"(desc_a),  // inputs start here: 64-bit registers for the descriptors
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x32 TN F32+=E4M3*E4M3 (RS: A is passed in four 32-bit registers, B as one 64-bit descriptor operand, desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x32x32_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[16];  // matches the 16 float& accumulator parameters of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %21, 0;\n"  // p = (scale_D != 0); %21 is the int32_t(scale_D) operand
+      "wgmma.mma_async.sync.aligned.m64n32k32.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      "{%16, %17, %18, %19},"  // a00..a03
+      " %20,"  // desc_b
+      " p,   %22, %23;\n"  // %22 / %23 are the scaleA / scaleB compile-time immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x32 TN F16+=E4M3*E4M3 (SS: A and B are each passed as one 64-bit descriptor operand, desc_a / desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x64x32_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[16];  // matches the 16 uint32_t& accumulator parameters of fma(); the .f16 results are carried in 32-bit registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %18, 0;\n"  // p = (scale_D != 0); %18 is the int32_t(scale_D) operand
+      "wgmma.mma_async.sync.aligned.m64n64k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      " %16,"  // desc_a
+      " %17,"  // desc_b
+      " p,   %19, %20;\n"  // %19 / %20 are the scaleA / scaleB compile-time immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x32 TN F16+=E4M3*E4M3 (RS: A is passed in four 32-bit registers, B as one 64-bit descriptor operand, desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x64x32_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[16];  // matches the 16 uint32_t& accumulator parameters of fma(); the .f16 results are carried in 32-bit registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %21, 0;\n"  // p = (scale_D != 0); %21 is the int32_t(scale_D) operand
+      "wgmma.mma_async.sync.aligned.m64n64k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      "{%16, %17, %18, %19},"  // a00..a03
+      " %20,"  // desc_b
+      " p,   %22, %23;\n"  // %22 / %23 are the scaleA / scaleB compile-time immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x32 TN F32+=E4M3*E4M3 (SS: A and B are each passed as one 64-bit descriptor operand, desc_a / desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x64x32_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[32];  // matches the 32 float& accumulator parameters of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %34, 0;\n"  // p = (scale_D != 0); %34 is the int32_t(scale_D) operand
+      "wgmma.mma_async.sync.aligned.m64n64k32.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      " %32,"  // desc_a
+      " %33,"  // desc_b
+      " p,   %35, %36;\n"  // %35 / %36 are the scaleA / scaleB compile-time immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x32 TN F32+=E4M3*E4M3 (RS: A is passed in four 32-bit registers, B as one 64-bit descriptor operand, desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x64x32_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[32];  // matches the 32 float& accumulator parameters of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %37, 0;\n"  // p = (scale_D != 0); %37 is the int32_t(scale_D) operand
+      "wgmma.mma_async.sync.aligned.m64n64k32.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      "{%32, %33, %34, %35},"  // a00..a03
+      " %36,"  // desc_b
+      " p,   %38, %39;\n"  // %38 / %39 are the scaleA / scaleB compile-time immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x32 TN F16+=E4M3*E4M3 (SS: A and B are each passed as one 64-bit descriptor operand, desc_a / desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x96x32_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[24];  // matches the 24 uint32_t& accumulator parameters of fma(); the .f16 results are carried in 32-bit registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %26, 0;\n"  // p = (scale_D != 0); %26 is the int32_t(scale_D) operand
+      "wgmma.mma_async.sync.aligned.m64n96k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      " %24,"  // desc_a
+      " %25,"  // desc_b
+      " p,   %27, %28;\n"  // %27 / %28 are the scaleA / scaleB compile-time immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x32 TN F16+=E4M3*E4M3 (RS: A is passed in four 32-bit registers, B as one 64-bit descriptor operand, desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x96x32_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[24];  // matches the 24 uint32_t& accumulator parameters of fma(); the .f16 results are carried in 32-bit registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %29, 0;\n"  // p = (scale_D != 0); %29 is the int32_t(scale_D) operand
+      "wgmma.mma_async.sync.aligned.m64n96k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      "{%24, %25, %26, %27},"  // a00..a03
+      " %28,"  // desc_b
+      " p,   %30, %31;\n"  // %30 / %31 are the scaleA / scaleB compile-time immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x32 TN F32+=E4M3*E4M3 (SS: A and B are each passed as one 64-bit descriptor operand, desc_a / desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x96x32_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[48];  // matches the 48 float& accumulator parameters of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %50, 0;\n"  // p = (scale_D != 0); %50 is the int32_t(scale_D) operand
+      "wgmma.mma_async.sync.aligned.m64n96k32.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"
+      " %48,"  // desc_a
+      " %49,"  // desc_b
+      " p,   %51, %52;\n"  // %51 / %52 are the scaleA / scaleB compile-time immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x32 TN F32+=E4M3*E4M3 (RS: A is passed in four 32-bit registers, B as one 64-bit descriptor operand, desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x96x32_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[48];  // matches the 48 float& accumulator parameters of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %53, 0;\n"  // p = (scale_D != 0); %53 is the int32_t(scale_D) operand
+      "wgmma.mma_async.sync.aligned.m64n96k32.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
+      "{%48,  %49,  %50,  %51},"  // a00..a03
+      " %52,"  // desc_b
+      " p,    %54,  %55;\n"  // %54 / %55 are the scaleA / scaleB compile-time immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x32 TN F16+=E4M3*E4M3 (SS: A and B are each passed as one 64-bit descriptor operand, desc_a / desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x128x32_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[32];  // matches the 32 uint32_t& accumulator parameters of fma(); the .f16 results are carried in 32-bit registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %34, 0;\n"  // p = (scale_D != 0); %34 is the int32_t(scale_D) operand
+      "wgmma.mma_async.sync.aligned.m64n128k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      " %32,"  // desc_a
+      " %33,"  // desc_b
+      " p,   %35, %36;\n"  // %35 / %36 are the scaleA / scaleB compile-time immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x32 TN F16+=E4M3*E4M3 (RS: A is passed in four 32-bit registers, B as one 64-bit descriptor operand, desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x128x32_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[32];  // matches the 32 uint32_t& accumulator parameters of fma(); the .f16 results are carried in 32-bit registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %37, 0;\n"  // p = (scale_D != 0); %37 is the int32_t(scale_D) operand
+      "wgmma.mma_async.sync.aligned.m64n128k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      "{%32, %33, %34, %35},"  // a00..a03
+      " %36,"  // desc_b
+      " p,   %38, %39;\n"  // %38 / %39 are the scaleA / scaleB compile-time immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x32 TN F32+=E4M3*E4M3 (SS: A and B are each passed as one 64-bit descriptor operand, desc_a / desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x128x32_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[64];  // matches the 64 float& accumulator parameters of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %66, 0;\n"  // p = (scale_D != 0); %66 is the int32_t(scale_D) operand
+      "wgmma.mma_async.sync.aligned.m64n128k32.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      " %64,"  // desc_a
+      " %65,"  // desc_b
+      " p,    %67,  %68;\n"  // %67 / %68 are the scaleA / scaleB compile-time immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x32 TN F32+=E4M3*E4M3 (RS form: A passed as four 32-bit register operands, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as a compile-time immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as a compile-time immediate ("n" operand)
+>
+struct MMA_64x128x32_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches the four a00..a03 arguments of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = float[64];    // matches the 64 accumulator arguments d00..d63 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand, read-only
+      uint64_t const& desc_b,  // 64-bit descriptor for B, read-only
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d63: accumulators, read and written in place
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably a logging hook; confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the braces of this asm block
+      "setp.ne.b32 p, %69, 0;\n"  // p = (scale_D != 0); %69 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n128k32.f32.e4m3.e4m3 "  // operand order below: D list, A list, B, p, scaleA, scaleB
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      "{%64,  %65,  %66,  %67},"  // a00..a03
+      " %68,"  // desc_b
+      " p,    %70,  %71;\n"  // predicate from scale_D, then the scaleA (%70) and scaleB (%71) immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%63: read-write float register operands
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %64..%67
+         "l"(desc_b),  // %68
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %69, %70, %71
+#else  // macro not defined: pass the message below to CUTE_INVALID_CONTROL_PATH instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x32 TN F16+=E4M3*E4M3 (SS form: A and B each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as a compile-time immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as a compile-time immediate ("n" operand)
+>
+struct MMA_64x192x32_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];   // matches the single desc_a argument of fma()
+  using BRegisters = uint64_t[1];   // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[48];  // matches d00..d47 of fma(); presumably two packed f16 per register - TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for A, read-only
+      uint64_t const& desc_b,  // 64-bit descriptor for B, read-only
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d47: accumulators, read and written in place
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): presumably a logging hook; confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the braces of this asm block
+      "setp.ne.b32 p, %50, 0;\n"  // p = (scale_D != 0); %50 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n192k32.f16.e4m3.e4m3 "  // operand order below: D list, A, B, p, scaleA, scaleB
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"
+      " %48,"  // desc_a
+      " %49,"  // desc_b
+      " p,   %51, %52;\n"  // predicate from scale_D, then the scaleA (%51) and scaleB (%52) immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%47: read-write 32-bit register operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "l"(desc_a),  // %48
+         "l"(desc_b),  // %49
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %50, %51, %52
+#else  // macro not defined: pass the message below to CUTE_INVALID_CONTROL_PATH instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x32 TN F16+=E4M3*E4M3 (RS form: A passed as four 32-bit register operands, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as a compile-time immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as a compile-time immediate ("n" operand)
+>
+struct MMA_64x192x32_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];   // matches the four a00..a03 arguments of fma()
+  using BRegisters = uint64_t[1];   // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[48];  // matches d00..d47 of fma(); presumably two packed f16 per register - TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand, read-only
+      uint64_t const& desc_b,  // 64-bit descriptor for B, read-only
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d47: accumulators, read and written in place
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably a logging hook; confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the braces of this asm block
+      "setp.ne.b32 p, %53, 0;\n"  // p = (scale_D != 0); %53 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n192k32.f16.e4m3.e4m3 "  // operand order below: D list, A list, B, p, scaleA, scaleB
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
+      "{%48,  %49,  %50,  %51},"  // a00..a03
+      " %52,"  // desc_b
+      " p,    %54,  %55;\n"  // predicate from scale_D, then the scaleA (%54) and scaleB (%55) immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%47: read-write 32-bit register operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %48..%51
+         "l"(desc_b),  // %52
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %53, %54, %55
+#else  // macro not defined: pass the message below to CUTE_INVALID_CONTROL_PATH instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x32 TN F32+=E4M3*E4M3 (SS form: A and B each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as a compile-time immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as a compile-time immediate ("n" operand)
+>
+struct MMA_64x192x32_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches the single desc_a argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = float[96];    // matches the 96 accumulator arguments d00..d95 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for A, read-only
+      uint64_t const& desc_b,  // 64-bit descriptor for B, read-only
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d95: accumulators, read and written in place
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      float         & d92, float         & d93, float         & d94, float         & d95,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): presumably a logging hook; confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the braces of this asm block
+      "setp.ne.b32 p, %98, 0;\n"  // p = (scale_D != 0); %98 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n192k32.f32.e4m3.e4m3 "  // operand order below: D list, A, B, p, scaleA, scaleB
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      " %96,"  // desc_a
+      " %97,"  // desc_b
+      " p,    %99,  %100;\n"  // predicate from scale_D, then the scaleA (%99) and scaleB (%100) immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%95: read-write float register operands
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
+        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
+      :  "l"(desc_a),  // %96
+         "l"(desc_b),  // %97
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %98, %99, %100
+#else  // macro not defined: pass the message below to CUTE_INVALID_CONTROL_PATH instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x32 TN F32+=E4M3*E4M3 (RS form: A passed as four 32-bit register operands, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as a compile-time immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as a compile-time immediate ("n" operand)
+>
+struct MMA_64x192x32_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches the four a00..a03 arguments of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = float[96];    // matches the 96 accumulator arguments d00..d95 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand, read-only
+      uint64_t const& desc_b,  // 64-bit descriptor for B, read-only
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d95: accumulators, read and written in place
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      float         & d92, float         & d93, float         & d94, float         & d95,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably a logging hook; confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the braces of this asm block
+      "setp.ne.b32 p, %101, 0;\n"  // p = (scale_D != 0); %101 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n192k32.f32.e4m3.e4m3 "  // operand order below: D list, A list, B, p, scaleA, scaleB
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      "{%96,  %97,  %98,  %99},"  // a00..a03
+      " %100,"  // desc_b
+      " p,    %102, %103;\n"  // predicate from scale_D, then the scaleA (%102) and scaleB (%103) immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%95: read-write float register operands
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
+        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %96..%99
+         "l"(desc_b),  // %100
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %101, %102, %103
+#else  // macro not defined: pass the message below to CUTE_INVALID_CONTROL_PATH instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x32 TN F16+=E4M3*E4M3 (SS form: A and B each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as a compile-time immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as a compile-time immediate ("n" operand)
+>
+struct MMA_64x256x32_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];   // matches the single desc_a argument of fma()
+  using BRegisters = uint64_t[1];   // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[64];  // matches d00..d63 of fma(); presumably two packed f16 per register - TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for A, read-only
+      uint64_t const& desc_b,  // 64-bit descriptor for B, read-only
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d63: accumulators, read and written in place
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): presumably a logging hook; confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the braces of this asm block
+      "setp.ne.b32 p, %66, 0;\n"  // p = (scale_D != 0); %66 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n256k32.f16.e4m3.e4m3 "  // operand order below: D list, A, B, p, scaleA, scaleB
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      " %64,"  // desc_a
+      " %65,"  // desc_b
+      " p,    %67,  %68;\n"  // predicate from scale_D, then the scaleA (%67) and scaleB (%68) immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%63: read-write 32-bit register operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "l"(desc_a),  // %64
+         "l"(desc_b),  // %65
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %66, %67, %68
+#else  // macro not defined: pass the message below to CUTE_INVALID_CONTROL_PATH instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x32 TN F16+=E4M3*E4M3 (RS form: A passed as four 32-bit register operands, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as a compile-time immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as a compile-time immediate ("n" operand)
+>
+struct MMA_64x256x32_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];   // matches the four a00..a03 arguments of fma()
+  using BRegisters = uint64_t[1];   // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[64];  // matches d00..d63 of fma(); presumably two packed f16 per register - TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand, read-only
+      uint64_t const& desc_b,  // 64-bit descriptor for B, read-only
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d63: accumulators, read and written in place
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably a logging hook; confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the braces of this asm block
+      "setp.ne.b32 p, %69, 0;\n"  // p = (scale_D != 0); %69 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n256k32.f16.e4m3.e4m3 "  // operand order below: D list, A list, B, p, scaleA, scaleB
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      "{%64,  %65,  %66,  %67},"  // a00..a03
+      " %68,"  // desc_b
+      " p,    %70,  %71;\n"  // predicate from scale_D, then the scaleA (%70) and scaleB (%71) immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%63: read-write 32-bit register operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %64..%67
+         "l"(desc_b),  // %68
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %69, %70, %71
+#else  // macro not defined: pass the message below to CUTE_INVALID_CONTROL_PATH instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x32 TN F32+=E4M3*E4M3
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x256x32_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[128];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      float         & d124, float         & d125, float         & d126, float         & d127,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %130, 0;\n"
+      "wgmma.mma_async.sync.aligned.m64n256k32.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      " %128,"
+      " %129,"
+      " p,    %131, %132;\n"
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
+        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x32 TN F32+=E4M3*E4M3 (RS: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value, emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time value, emitted as an immediate ("n") asm operand
+>
+struct MMA_64x256x32_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches fma's a000..a003 arguments
+  using BRegisters = uint64_t[1];  // matches fma's desc_b argument
+  using CRegisters = float[128];  // matches fma's d000..d127 accumulator arguments
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      float         & d124, float         & d125, float         & d126, float         & d127,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %133, 0;\n"  // p = (scale_D != 0); %133 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n256k32.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%127: accumulators d000..d127, read-write ("+f")
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      "{%128, %129, %130, %131},"  // A operand registers a000..a003
+      " %132,"  // desc_b
+      " p,    %134, %135;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
+        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x32 TN F16+=E4M3*E5M2 (SS: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value, emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time value, emitted as an immediate ("n") asm operand
+>
+struct MMA_64x8x32_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches fma's desc_a argument
+  using BRegisters = uint64_t[1];  // matches fma's desc_b argument
+  using CRegisters = uint32_t[2];  // matches fma's d0..d1 accumulator arguments
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %4, 0;\n"  // p = (scale_D != 0); %4 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n8k32.f16.e4m3.e5m2 "
+      "{%0, %1},"  // accumulators d0..d1, read-write ("+r")
+      " %2,"  // desc_a
+      " %3,"  // desc_b
+      " p,  %5, %6;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x32 TN F16+=E4M3*E5M2 (RS: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value, emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time value, emitted as an immediate ("n") asm operand
+>
+struct MMA_64x8x32_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches fma's a0..a3 arguments
+  using BRegisters = uint64_t[1];  // matches fma's desc_b argument
+  using CRegisters = uint32_t[2];  // matches fma's d0..d1 accumulator arguments
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %7, 0;\n"  // p = (scale_D != 0); %7 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n8k32.f16.e4m3.e5m2 "
+      "{%0,  %1},"  // accumulators d0..d1, read-write ("+r")
+      "{%2,  %3,  %4,  %5},"  // A operand registers a0..a3
+      " %6,"  // desc_b
+      " p,   %8,  %9;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x32 TN F32+=E4M3*E5M2 (SS: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value, emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time value, emitted as an immediate ("n") asm operand
+>
+struct MMA_64x8x32_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches fma's desc_a argument
+  using BRegisters = uint64_t[1];  // matches fma's desc_b argument
+  using CRegisters = float[4];  // matches fma's d0..d3 accumulator arguments
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %6, 0;\n"  // p = (scale_D != 0); %6 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n8k32.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3},"  // accumulators d0..d3, read-write ("+f")
+      " %4,"  // desc_a
+      " %5,"  // desc_b
+      " p,   %7,  %8;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x32 TN F32+=E4M3*E5M2 (RS: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value, emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time value, emitted as an immediate ("n") asm operand
+>
+struct MMA_64x8x32_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches fma's a0..a3 arguments
+  using BRegisters = uint64_t[1];  // matches fma's desc_b argument
+  using CRegisters = float[4];  // matches fma's d0..d3 accumulator arguments
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %9, 0;\n"  // p = (scale_D != 0); %9 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n8k32.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3},"  // accumulators d0..d3, read-write ("+f")
+      "{%4,  %5,  %6,  %7},"  // A operand registers a0..a3
+      " %8,"  // desc_b
+      " p,   %10, %11;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x32 TN F16+=E4M3*E5M2 (SS: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value, emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time value, emitted as an immediate ("n") asm operand
+>
+struct MMA_64x16x32_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches fma's desc_a argument
+  using BRegisters = uint64_t[1];  // matches fma's desc_b argument
+  using CRegisters = uint32_t[4];  // matches fma's d0..d3 accumulator arguments
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %6, 0;\n"  // p = (scale_D != 0); %6 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n16k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3},"  // accumulators d0..d3, read-write ("+r")
+      " %4,"  // desc_a
+      " %5,"  // desc_b
+      " p,   %7,  %8;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x32 TN F16+=E4M3*E5M2 (RS: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value, emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time value, emitted as an immediate ("n") asm operand
+>
+struct MMA_64x16x32_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches fma's a0..a3 arguments
+  using BRegisters = uint64_t[1];  // matches fma's desc_b argument
+  using CRegisters = uint32_t[4];  // matches fma's d0..d3 accumulator arguments
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %9, 0;\n"  // p = (scale_D != 0); %9 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n16k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3},"  // accumulators d0..d3, read-write ("+r")
+      "{%4,  %5,  %6,  %7},"  // A operand registers a0..a3
+      " %8,"  // desc_b
+      " p,   %10, %11;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x32 TN F32+=E4M3*E5M2 (SS: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value, emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time value, emitted as an immediate ("n") asm operand
+>
+struct MMA_64x16x32_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches fma's desc_a argument
+  using BRegisters = uint64_t[1];  // matches fma's desc_b argument
+  using CRegisters = float[8];  // matches fma's d0..d7 accumulator arguments
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      float         & d4, float         & d5, float         & d6, float         & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %10, 0;\n"  // p = (scale_D != 0); %10 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n16k32.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // accumulators d0..d7, read-write ("+f")
+      " %8,"  // desc_a
+      " %9,"  // desc_b
+      " p,   %11, %12;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
+        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x32 TN F32+=E4M3*E5M2 (RS: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value, emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time value, emitted as an immediate ("n") asm operand
+>
+struct MMA_64x16x32_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches fma's a0..a3 arguments
+  using BRegisters = uint64_t[1];  // matches fma's desc_b argument
+  using CRegisters = float[8];  // matches fma's d0..d7 accumulator arguments
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      float         & d4, float         & d5, float         & d6, float         & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %13, 0;\n"  // p = (scale_D != 0); %13 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n16k32.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // accumulators d0..d7, read-write ("+f")
+      "{%8,  %9,  %10, %11},"  // A operand registers a0..a3
+      " %12,"  // desc_b
+      " p,   %14, %15;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
+        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x32 TN F16+=E4M3*E5M2 (SS: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value, emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time value, emitted as an immediate ("n") asm operand
+>
+struct MMA_64x32x32_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches fma's desc_a argument
+  using BRegisters = uint64_t[1];  // matches fma's desc_b argument
+  using CRegisters = uint32_t[8];  // matches fma's d0..d7 accumulator arguments
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %10, 0;\n"  // p = (scale_D != 0); %10 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n32k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // accumulators d0..d7, read-write ("+r")
+      " %8,"  // desc_a
+      " %9,"  // desc_b
+      " p,   %11, %12;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x32 TN F16+=E4M3*E5M2 (RS: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value, emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time value, emitted as an immediate ("n") asm operand
+>
+struct MMA_64x32x32_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches fma's a0..a3 arguments
+  using BRegisters = uint64_t[1];  // matches fma's desc_b argument
+  using CRegisters = uint32_t[8];  // matches fma's d0..d7 accumulator arguments
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %13, 0;\n"  // p = (scale_D != 0); %13 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n32k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // accumulators d0..d7, read-write ("+r")
+      "{%8,  %9,  %10, %11},"  // A operand registers a0..a3
+      " %12,"  // desc_b
+      " p,   %14, %15;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x32 TN F32+=E4M3*E5M2 (SS: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value, emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time value, emitted as an immediate ("n") asm operand
+>
+struct MMA_64x32x32_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches fma's desc_a argument
+  using BRegisters = uint64_t[1];  // matches fma's desc_b argument
+  using CRegisters = float[16];  // matches fma's d00..d15 accumulator arguments
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %18, 0;\n"  // p = (scale_D != 0); %18 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n32k32.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%15: accumulators d00..d15, read-write ("+f")
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      " %16,"  // desc_a
+      " %17,"  // desc_b
+      " p,   %19, %20;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x32 TN F32+=E4M3*E5M2 (RS: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value, emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time value, emitted as an immediate ("n") asm operand
+>
+struct MMA_64x32x32_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches fma's a00..a03 arguments
+  using BRegisters = uint64_t[1];  // matches fma's desc_b argument
+  using CRegisters = float[16];  // matches fma's d00..d15 accumulator arguments
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %21, 0;\n"  // p = (scale_D != 0); %21 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n32k32.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%15: accumulators d00..d15, read-write ("+f")
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      "{%16, %17, %18, %19},"  // A operand registers a00..a03
+      " %20,"  // desc_b
+      " p,   %22, %23;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x32 TN F16+=E4M3*E5M2 (SS: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value, emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time value, emitted as an immediate ("n") asm operand
+>
+struct MMA_64x64x32_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches fma's desc_a argument
+  using BRegisters = uint64_t[1];  // matches fma's desc_b argument
+  using CRegisters = uint32_t[16];  // matches fma's d00..d15 accumulator arguments
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %18, 0;\n"  // p = (scale_D != 0); %18 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n64k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%15: accumulators d00..d15, read-write ("+r")
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      " %16,"  // desc_a
+      " %17,"  // desc_b
+      " p,   %19, %20;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x32 TN F16+=E4M3*E5M2 (RS: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value, emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time value, emitted as an immediate ("n") asm operand
+>
+struct MMA_64x64x32_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches fma's a00..a03 arguments
+  using BRegisters = uint64_t[1];  // matches fma's desc_b argument
+  using CRegisters = uint32_t[16];  // matches fma's d00..d15 accumulator arguments
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %21, 0;\n"  // p = (scale_D != 0); %21 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n64k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%15: accumulators d00..d15, read-write ("+r")
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      "{%16, %17, %18, %19},"  // A operand registers a00..a03
+      " %20,"  // desc_b
+      " p,   %22, %23;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x32 TN F32+=E4M3*E5M2 (SS variant: A and B both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x64x32_F32E4M3E5M2_SS_TN  // fma() wraps one wgmma.mma_async m64n64k32 .f32.e4m3.e5m2 instruction
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor (desc_a)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = float[32];    // accumulator: thirty-two float registers (d00..d31)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %34, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n64k32.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      " %32,"  // desc_a
+      " %33,"  // desc_b
+      " p,   %35, %36;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%31: accumulators, read-write ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %34 runtime register; %35/%36 compile-time immediates
+#else  // macro not defined: the body is only the invalid-control-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x32 TN F32+=E4M3*E5M2 (RS variant: A passed in registers, B passed as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x64x32_F32E4M3E5M2_RS_TN  // fma() wraps one wgmma.mma_async m64n64k32 .f32.e4m3.e5m2 instruction
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = float[32];    // accumulator: thirty-two float registers (d00..d31)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %37, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n64k32.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      "{%32, %33, %34, %35},"  // a00..a03
+      " %36,"  // desc_b
+      " p,   %38, %39;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%31: accumulators, read-write ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %37 runtime register; %38/%39 compile-time immediates
+#else  // macro not defined: the body is only the invalid-control-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x32 TN F16+=E4M3*E5M2 (SS variant: A and B both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x96x32_F16E4M3E5M2_SS_TN  // fma() wraps one wgmma.mma_async m64n96k32 .f16.e4m3.e5m2 instruction
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];   // A operand: one 64-bit descriptor (desc_a)
+  using BRegisters = uint64_t[1];   // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[24];  // accumulator: twenty-four 32-bit registers (d00..d23)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %26, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n96k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      " %24,"  // desc_a
+      " %25,"  // desc_b
+      " p,   %27, %28;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%23: accumulators, read-write ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %26 runtime register; %27/%28 compile-time immediates
+#else  // macro not defined: the body is only the invalid-control-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x32 TN F16+=E4M3*E5M2 (RS variant: A passed in registers, B passed as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x96x32_F16E4M3E5M2_RS_TN  // fma() wraps one wgmma.mma_async m64n96k32 .f16.e4m3.e5m2 instruction
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];   // A operand: four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];   // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[24];  // accumulator: twenty-four 32-bit registers (d00..d23)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %29, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n96k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      "{%24, %25, %26, %27},"  // a00..a03
+      " %28,"  // desc_b
+      " p,   %30, %31;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%23: accumulators, read-write ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %29 runtime register; %30/%31 compile-time immediates
+#else  // macro not defined: the body is only the invalid-control-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x32 TN F32+=E4M3*E5M2 (SS variant: A and B both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x96x32_F32E4M3E5M2_SS_TN  // fma() wraps one wgmma.mma_async m64n96k32 .f32.e4m3.e5m2 instruction
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor (desc_a)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = float[48];    // accumulator: forty-eight float registers (d00..d47)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %50, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n96k32.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"
+      " %48,"  // desc_a
+      " %49,"  // desc_b
+      " p,   %51, %52;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%47: accumulators, read-write ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %50 runtime register; %51/%52 compile-time immediates
+#else  // macro not defined: the body is only the invalid-control-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x32 TN F32+=E4M3*E5M2 (RS variant: A passed in registers, B passed as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x96x32_F32E4M3E5M2_RS_TN  // fma() wraps one wgmma.mma_async m64n96k32 .f32.e4m3.e5m2 instruction
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = float[48];    // accumulator: forty-eight float registers (d00..d47)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %53, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n96k32.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
+      "{%48,  %49,  %50,  %51},"  // a00..a03
+      " %52,"  // desc_b
+      " p,    %54,  %55;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%47: accumulators, read-write ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %53 runtime register; %54/%55 compile-time immediates
+#else  // macro not defined: the body is only the invalid-control-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x32 TN F16+=E4M3*E5M2 (SS variant: A and B both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x128x32_F16E4M3E5M2_SS_TN  // fma() wraps one wgmma.mma_async m64n128k32 .f16.e4m3.e5m2 instruction
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];   // A operand: one 64-bit descriptor (desc_a)
+  using BRegisters = uint64_t[1];   // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[32];  // accumulator: thirty-two 32-bit registers (d00..d31)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %34, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n128k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      " %32,"  // desc_a
+      " %33,"  // desc_b
+      " p,   %35, %36;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%31: accumulators, read-write ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %34 runtime register; %35/%36 compile-time immediates
+#else  // macro not defined: the body is only the invalid-control-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x32 TN F16+=E4M3*E5M2 (RS variant: A passed in registers, B passed as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x128x32_F16E4M3E5M2_RS_TN  // fma() wraps one wgmma.mma_async m64n128k32 .f16.e4m3.e5m2 instruction
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];   // A operand: four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];   // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[32];  // accumulator: thirty-two 32-bit registers (d00..d31)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %37, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n128k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      "{%32, %33, %34, %35},"  // a00..a03
+      " %36,"  // desc_b
+      " p,   %38, %39;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%31: accumulators, read-write ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %37 runtime register; %38/%39 compile-time immediates
+#else  // macro not defined: the body is only the invalid-control-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x32 TN F32+=E4M3*E5M2 (SS variant: A and B both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x128x32_F32E4M3E5M2_SS_TN  // fma() wraps one wgmma.mma_async m64n128k32 .f32.e4m3.e5m2 instruction
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor (desc_a)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = float[64];    // accumulator: sixty-four float registers (d00..d63)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %66, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n128k32.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      " %64,"  // desc_a
+      " %65,"  // desc_b
+      " p,    %67,  %68;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%63: accumulators, read-write ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %66 runtime register; %67/%68 compile-time immediates
+#else  // macro not defined: the body is only the invalid-control-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x32 TN F32+=E4M3*E5M2 (RS variant: A passed in registers, B passed as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x128x32_F32E4M3E5M2_RS_TN  // fma() wraps one wgmma.mma_async m64n128k32 .f32.e4m3.e5m2 instruction
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = float[64];    // accumulator: sixty-four float registers (d00..d63)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %69, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n128k32.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      "{%64,  %65,  %66,  %67},"  // a00..a03
+      " %68,"  // desc_b
+      " p,    %70,  %71;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%63: accumulators, read-write ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %69 runtime register; %70/%71 compile-time immediates
+#else  // macro not defined: the body is only the invalid-control-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x32 TN F16+=E4M3*E5M2 -- SS form: issues one wgmma.mma_async m64n192k32 with A and B both passed as 64-bit descriptors.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as a compile-time immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // emitted as a compile-time immediate ("n" operand)
+>
+struct MMA_64x192x32_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;          // no separate D register list; fma() updates d00..d47 in place
+  using ARegisters = uint64_t[1];   // one 64-bit value: desc_a
+  using BRegisters = uint64_t[1];   // one 64-bit value: desc_b
+  using CRegisters = uint32_t[48];  // 48 x 32-bit accumulator registers (presumably two packed f16 each -- TODO confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand (presumably a shared-memory matrix descriptor, per the synclog call below)
+      uint64_t const& desc_b,  // B operand (same caveat as desc_a)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d47: accumulators, read and written
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // drives predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %50, 0;\n"  // %50 = int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n192k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"
+      " %48,"  // desc_a
+      " %49,"  // desc_b
+      " p,   %51, %52;\n"  // predicate from scale_D, then the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0-%47: read-write accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "l"(desc_a),  // %48
+         "l"(desc_b),  // %49
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %50, %51, %52
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x32 TN F16+=E4M3*E5M2 -- RS form: issues one wgmma.mma_async m64n192k32 with A in four 32-bit registers and B as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as a compile-time immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // emitted as a compile-time immediate ("n" operand)
+>
+struct MMA_64x192x32_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;          // no separate D register list; fma() updates d00..d47 in place
+  using ARegisters = uint32_t[4];   // four 32-bit registers: a00..a03
+  using BRegisters = uint64_t[1];   // one 64-bit value: desc_b
+  using CRegisters = uint32_t[48];  // 48 x 32-bit accumulator registers (presumably two packed f16 each -- TODO confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand, passed in registers
+      uint64_t const& desc_b,  // B operand (presumably a shared-memory matrix descriptor, per the synclog call below)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d47: accumulators, read and written
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // drives predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %53, 0;\n"  // %53 = int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n192k32.f16.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
+      "{%48,  %49,  %50,  %51},"  // a00..a03
+      " %52,"  // desc_b
+      " p,    %54,  %55;\n"  // predicate from scale_D, then the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0-%47: read-write accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %48-%51
+         "l"(desc_b),  // %52
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %53, %54, %55
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x32 TN F32+=E4M3*E5M2 -- SS form: issues one wgmma.mma_async m64n192k32 with A and B both passed as 64-bit descriptors.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as a compile-time immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // emitted as a compile-time immediate ("n" operand)
+>
+struct MMA_64x192x32_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;         // no separate D register list; fma() updates d00..d95 in place
+  using ARegisters = uint64_t[1];  // one 64-bit value: desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = float[96];    // 96 float accumulator registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand (presumably a shared-memory matrix descriptor, per the synclog call below)
+      uint64_t const& desc_b,  // B operand (same caveat as desc_a)
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d95: accumulators, read and written
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      float         & d92, float         & d93, float         & d94, float         & d95,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // drives predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %98, 0;\n"  // %98 = int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n192k32.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      " %96,"  // desc_a
+      " %97,"  // desc_b
+      " p,    %99,  %100;\n"  // predicate from scale_D, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands %0-%95: read-write float accumulators
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
+        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
+      :  "l"(desc_a),  // %96
+         "l"(desc_b),  // %97
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %98, %99, %100
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x32 TN F32+=E4M3*E5M2 -- RS form: issues one wgmma.mma_async m64n192k32 with A in four 32-bit registers and B as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as a compile-time immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // emitted as a compile-time immediate ("n" operand)
+>
+struct MMA_64x192x32_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;         // no separate D register list; fma() updates d00..d95 in place
+  using ARegisters = uint32_t[4];  // four 32-bit registers: a00..a03
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = float[96];    // 96 float accumulator registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand, passed in registers
+      uint64_t const& desc_b,  // B operand (presumably a shared-memory matrix descriptor, per the synclog call below)
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d95: accumulators, read and written
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      float         & d92, float         & d93, float         & d94, float         & d95,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // drives predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %101, 0;\n"  // %101 = int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n192k32.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      "{%96,  %97,  %98,  %99},"  // a00..a03
+      " %100,"  // desc_b
+      " p,    %102, %103;\n"  // predicate from scale_D, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands %0-%95: read-write float accumulators
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
+        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %96-%99
+         "l"(desc_b),  // %100
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %101, %102, %103
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x32 TN F16+=E4M3*E5M2 -- SS form: issues one wgmma.mma_async m64n256k32 with A and B both passed as 64-bit descriptors.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as a compile-time immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // emitted as a compile-time immediate ("n" operand)
+>
+struct MMA_64x256x32_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;          // no separate D register list; fma() updates d00..d63 in place
+  using ARegisters = uint64_t[1];   // one 64-bit value: desc_a
+  using BRegisters = uint64_t[1];   // one 64-bit value: desc_b
+  using CRegisters = uint32_t[64];  // 64 x 32-bit accumulator registers (presumably two packed f16 each -- TODO confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand (presumably a shared-memory matrix descriptor, per the synclog call below)
+      uint64_t const& desc_b,  // B operand (same caveat as desc_a)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d63: accumulators, read and written
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // drives predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %66, 0;\n"  // %66 = int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n256k32.f16.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      " %64,"  // desc_a
+      " %65,"  // desc_b
+      " p,    %67,  %68;\n"  // predicate from scale_D, then the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0-%63: read-write accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "l"(desc_a),  // %64
+         "l"(desc_b),  // %65
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %66, %67, %68
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x32 TN F16+=E4M3*E5M2 -- RS form: issues one wgmma.mma_async m64n256k32 with A in four 32-bit registers and B as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as a compile-time immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // emitted as a compile-time immediate ("n" operand)
+>
+struct MMA_64x256x32_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;          // no separate D register list; fma() updates d00..d63 in place
+  using ARegisters = uint32_t[4];   // four 32-bit registers: a00..a03
+  using BRegisters = uint64_t[1];   // one 64-bit value: desc_b
+  using CRegisters = uint32_t[64];  // 64 x 32-bit accumulator registers (presumably two packed f16 each -- TODO confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand, passed in registers
+      uint64_t const& desc_b,  // B operand (presumably a shared-memory matrix descriptor, per the synclog call below)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d63: accumulators, read and written
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // drives predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %69, 0;\n"  // %69 = int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n256k32.f16.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      "{%64,  %65,  %66,  %67},"  // a00..a03
+      " %68,"  // desc_b
+      " p,    %70,  %71;\n"  // predicate from scale_D, then the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0-%63: read-write accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %64-%67
+         "l"(desc_b),  // %68
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %69, %70, %71
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x32 TN F32+=E4M3*E5M2 -- SS form: issues one wgmma.mma_async m64n256k32 with A and B both passed as 64-bit descriptors.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as a compile-time immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // emitted as a compile-time immediate ("n" operand)
+>
+struct MMA_64x256x32_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;         // no separate D register list; fma() updates d000..d127 in place
+  using ARegisters = uint64_t[1];  // one 64-bit value: desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = float[128];   // 128 float accumulator registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand (presumably a shared-memory matrix descriptor, per the synclog call below)
+      uint64_t const& desc_b,  // B operand (same caveat as desc_a)
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d127: accumulators, read and written
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      float         & d124, float         & d125, float         & d126, float         & d127,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // drives predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %130, 0;\n"  // %130 = int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n256k32.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      " %128,"  // desc_a
+      " %129,"  // desc_b
+      " p,    %131, %132;\n"  // predicate from scale_D, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0-%127: read-write float accumulators
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
+        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
+      :  "l"(desc_a),  // %128
+         "l"(desc_b),  // %129
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %130, %131, %132
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x32 TN F32+=E4M3*E5M2 (RS: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the instruction as a compile-time immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // handed to the instruction as a compile-time immediate operand
+>
+struct MMA_64x256x32_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D operands: fma() updates d000..d127 in place
+  using ARegisters = uint32_t[4];  // a000..a003
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[128];  // d000..d127, bound as read-write float registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      float         & d124, float         & d125, float         & d126, float         & d127,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %133, 0;\n"  // %133 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n256k32.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%127: accumulators d000..d127
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      "{%128, %129, %130, %131},"  // a000..a003
+      " %132,"  // desc_b
+      " p,    %134, %135;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
+        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x32 TN F16+=E5M2*E4M3 (SS: A and B are each passed as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the instruction as a compile-time immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // handed to the instruction as a compile-time immediate operand
+>
+struct MMA_64x8x32_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operands: fma() updates d0..d1 in place
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[2];  // d0..d1, read-write 32-bit registers; NOTE(review): presumably two packed f16 values each - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %4, 0;\n"  // %4 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n8k32.f16.e5m2.e4m3 "
+      "{%0, %1},"  // accumulators d0..d1
+      " %2,"  // desc_a
+      " %3,"  // desc_b
+      " p,  %5, %6;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x32 TN F16+=E5M2*E4M3 (RS: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the instruction as a compile-time immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // handed to the instruction as a compile-time immediate operand
+>
+struct MMA_64x8x32_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operands: fma() updates d0..d1 in place
+  using ARegisters = uint32_t[4];  // a0..a3
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[2];  // d0..d1, read-write 32-bit registers; NOTE(review): presumably two packed f16 values each - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %7, 0;\n"  // %7 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n8k32.f16.e5m2.e4m3 "
+      "{%0,  %1},"  // accumulators d0..d1
+      "{%2,  %3,  %4,  %5},"  // a0..a3
+      " %6,"  // desc_b
+      " p,   %8,  %9;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x32 TN F32+=E5M2*E4M3 (SS: A and B are each passed as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the instruction as a compile-time immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // handed to the instruction as a compile-time immediate operand
+>
+struct MMA_64x8x32_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operands: fma() updates d0..d3 in place
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[4];  // d0..d3, bound as read-write float registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %6, 0;\n"  // %6 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n8k32.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3},"  // accumulators d0..d3
+      " %4,"  // desc_a
+      " %5,"  // desc_b
+      " p,   %7,  %8;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x32 TN F32+=E5M2*E4M3 (RS: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the instruction as a compile-time immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // handed to the instruction as a compile-time immediate operand
+>
+struct MMA_64x8x32_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operands: fma() updates d0..d3 in place
+  using ARegisters = uint32_t[4];  // a0..a3
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[4];  // d0..d3, bound as read-write float registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %9, 0;\n"  // %9 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n8k32.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3},"  // accumulators d0..d3
+      "{%4,  %5,  %6,  %7},"  // a0..a3
+      " %8,"  // desc_b
+      " p,   %10, %11;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x32 TN F16+=E5M2*E4M3 (SS: A and B are each passed as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the instruction as a compile-time immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // handed to the instruction as a compile-time immediate operand
+>
+struct MMA_64x16x32_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operands: fma() updates d0..d3 in place
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[4];  // d0..d3, read-write 32-bit registers; NOTE(review): presumably two packed f16 values each - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %6, 0;\n"  // %6 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n16k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3},"  // accumulators d0..d3
+      " %4,"  // desc_a
+      " %5,"  // desc_b
+      " p,   %7,  %8;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x32 TN F16+=E5M2*E4M3 (RS: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the instruction as a compile-time immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // handed to the instruction as a compile-time immediate operand
+>
+struct MMA_64x16x32_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operands: fma() updates d0..d3 in place
+  using ARegisters = uint32_t[4];  // a0..a3
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[4];  // d0..d3, read-write 32-bit registers; NOTE(review): presumably two packed f16 values each - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %9, 0;\n"  // %9 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n16k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3},"  // accumulators d0..d3
+      "{%4,  %5,  %6,  %7},"  // a0..a3
+      " %8,"  // desc_b
+      " p,   %10, %11;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x32 TN F32+=E5M2*E4M3 (SS: A and B are each passed as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the instruction as a compile-time immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // handed to the instruction as a compile-time immediate operand
+>
+struct MMA_64x16x32_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operands: fma() updates d0..d7 in place
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[8];  // d0..d7, bound as read-write float registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      float         & d4, float         & d5, float         & d6, float         & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %10, 0;\n"  // %10 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n16k32.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // accumulators d0..d7
+      " %8,"  // desc_a
+      " %9,"  // desc_b
+      " p,   %11, %12;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
+        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x32 TN F32+=E5M2*E4M3 (RS: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the instruction as a compile-time immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // handed to the instruction as a compile-time immediate operand
+>
+struct MMA_64x16x32_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operands: fma() updates d0..d7 in place
+  using ARegisters = uint32_t[4];  // a0..a3
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[8];  // d0..d7, bound as read-write float registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      float         & d4, float         & d5, float         & d6, float         & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %13, 0;\n"  // %13 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n16k32.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // accumulators d0..d7
+      "{%8,  %9,  %10, %11},"  // a0..a3
+      " %12,"  // desc_b
+      " p,   %14, %15;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
+        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x32 TN F16+=E5M2*E4M3 (SS: A and B are each passed as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the instruction as a compile-time immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // handed to the instruction as a compile-time immediate operand
+>
+struct MMA_64x32x32_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operands: fma() updates d0..d7 in place
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[8];  // d0..d7, read-write 32-bit registers; NOTE(review): presumably two packed f16 values each - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %10, 0;\n"  // %10 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n32k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // accumulators d0..d7
+      " %8,"  // desc_a
+      " %9,"  // desc_b
+      " p,   %11, %12;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x32 TN F16+=E5M2*E4M3 (RS: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the instruction as a compile-time immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // handed to the instruction as a compile-time immediate operand
+>
+struct MMA_64x32x32_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operands: fma() updates d0..d7 in place
+  using ARegisters = uint32_t[4];  // a0..a3
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[8];  // d0..d7, read-write 32-bit registers; NOTE(review): presumably two packed f16 values each - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %13, 0;\n"  // %13 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n32k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // accumulators d0..d7
+      "{%8,  %9,  %10, %11},"  // a0..a3
+      " %12,"  // desc_b
+      " p,   %14, %15;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x32 TN F32+=E5M2*E4M3 (SS: A and B are each passed as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the instruction as a compile-time immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // handed to the instruction as a compile-time immediate operand
+>
+struct MMA_64x32x32_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operands: fma() updates d00..d15 in place
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[16];  // d00..d15, bound as read-write float registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %18, 0;\n"  // %18 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n32k32.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%15: accumulators d00..d15
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      " %16,"  // desc_a
+      " %17,"  // desc_b
+      " p,   %19, %20;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x32 TN F32+=E5M2*E4M3 (RS: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the instruction as a compile-time immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // handed to the instruction as a compile-time immediate operand
+>
+struct MMA_64x32x32_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operands: fma() updates d00..d15 in place
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[16];  // d00..d15, bound as read-write float registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %21, 0;\n"  // %21 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n32k32.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%15: accumulators d00..d15
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      "{%16, %17, %18, %19},"  // a00..a03
+      " %20,"  // desc_b
+      " p,   %22, %23;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x32 TN F16+=E5M2*E4M3 (SS: A and B are each passed as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the instruction as a compile-time immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // handed to the instruction as a compile-time immediate operand
+>
+struct MMA_64x64x32_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operands: fma() updates d00..d15 in place
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[16];  // d00..d15, read-write 32-bit registers; NOTE(review): presumably two packed f16 values each - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %18, 0;\n"  // %18 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n64k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%15: accumulators d00..d15
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      " %16,"  // desc_a
+      " %17,"  // desc_b
+      " p,   %19, %20;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x32 TN F16+=E5M2*E4M3 (RS: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the instruction as a compile-time immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // handed to the instruction as a compile-time immediate operand
+>
+struct MMA_64x64x32_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operands: fma() updates d00..d15 in place
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[16];  // d00..d15, read-write 32-bit registers; NOTE(review): presumably two packed f16 values each - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %21, 0;\n"  // %21 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n64k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%15: accumulators d00..d15
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      "{%16, %17, %18, %19},"  // a00..a03
+      " %20,"  // desc_b
+      " p,   %22, %23;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x32 TN F32+=E5M2*E4M3 (SS: A and B are each passed as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the instruction as a compile-time immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // handed to the instruction as a compile-time immediate operand
+>
+struct MMA_64x64x32_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operands: fma() updates d00..d31 in place
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[32];  // d00..d31, bound as read-write float registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %34, 0;\n"  // %34 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n64k32.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%31: accumulators d00..d31
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      " %32,"  // desc_a
+      " %33,"  // desc_b
+      " p,   %35, %36;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x32 TN F32+=E5M2*E4M3 -- RS form: A from four 32-bit registers, B from a 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the PTX as an immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // passed to the PTX as an immediate ("n" operand)
+>
+struct MMA_64x64x32_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;         // fma() has no separate D outputs; the d* arguments are updated in place
+  using ARegisters = uint32_t[4];  // matches fma()'s a00..a03 arguments
+  using BRegisters = uint64_t[1];  // matches fma()'s desc_b argument
+  using CRegisters = float[32];    // matches fma()'s d00..d31 arguments
+
+  CUTE_HOST_DEVICE static void     // emits one wgmma.mma_async; d00..d31 are read-write ("+f") operands
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %37, 0;\n"  // %37 is scale_D; p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n64k32.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      "{%32, %33, %34, %35},"  // a00..a03
+      " %36,"  // desc_b
+      " p,   %38, %39;\n"  // p, then the scaleA (%38) and scaleB (%39) immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no PTX is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x32 TN F16+=E5M2*E4M3 -- SS form: A and B each from a 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the PTX as an immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // passed to the PTX as an immediate ("n" operand)
+>
+struct MMA_64x96x32_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;         // fma() has no separate D outputs; the d* arguments are updated in place
+  using ARegisters = uint64_t[1];  // matches fma()'s desc_a argument
+  using BRegisters = uint64_t[1];  // matches fma()'s desc_b argument
+  using CRegisters = uint32_t[24]; // matches d00..d23; NOTE(review): presumably two packed f16 per register -- confirm
+
+  CUTE_HOST_DEVICE static void     // emits one wgmma.mma_async; d00..d23 are read-write ("+r") operands
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %26, 0;\n"  // %26 is scale_D; p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n96k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      " %24,"  // desc_a
+      " %25,"  // desc_b
+      " p,   %27, %28;\n"  // p, then the scaleA (%27) and scaleB (%28) immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no PTX is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x32 TN F16+=E5M2*E4M3 -- RS form: A from four 32-bit registers, B from a 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the PTX as an immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // passed to the PTX as an immediate ("n" operand)
+>
+struct MMA_64x96x32_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;         // fma() has no separate D outputs; the d* arguments are updated in place
+  using ARegisters = uint32_t[4];  // matches fma()'s a00..a03 arguments
+  using BRegisters = uint64_t[1];  // matches fma()'s desc_b argument
+  using CRegisters = uint32_t[24]; // matches d00..d23; NOTE(review): presumably two packed f16 per register -- confirm
+
+  CUTE_HOST_DEVICE static void     // emits one wgmma.mma_async; d00..d23 are read-write ("+r") operands
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %29, 0;\n"  // %29 is scale_D; p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n96k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      "{%24, %25, %26, %27},"  // a00..a03
+      " %28,"  // desc_b
+      " p,   %30, %31;\n"  // p, then the scaleA (%30) and scaleB (%31) immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no PTX is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x32 TN F32+=E5M2*E4M3 -- SS form: A and B each from a 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the PTX as an immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // passed to the PTX as an immediate ("n" operand)
+>
+struct MMA_64x96x32_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;         // fma() has no separate D outputs; the d* arguments are updated in place
+  using ARegisters = uint64_t[1];  // matches fma()'s desc_a argument
+  using BRegisters = uint64_t[1];  // matches fma()'s desc_b argument
+  using CRegisters = float[48];    // matches fma()'s d00..d47 arguments
+
+  CUTE_HOST_DEVICE static void     // emits one wgmma.mma_async; d00..d47 are read-write ("+f") operands
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %50, 0;\n"  // %50 is scale_D; p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n96k32.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"
+      " %48,"  // desc_a
+      " %49,"  // desc_b
+      " p,   %51, %52;\n"  // p, then the scaleA (%51) and scaleB (%52) immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no PTX is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x32 TN F32+=E5M2*E4M3 -- RS form: A from four 32-bit registers, B from a 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the PTX as an immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // passed to the PTX as an immediate ("n" operand)
+>
+struct MMA_64x96x32_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;         // fma() has no separate D outputs; the d* arguments are updated in place
+  using ARegisters = uint32_t[4];  // matches fma()'s a00..a03 arguments
+  using BRegisters = uint64_t[1];  // matches fma()'s desc_b argument
+  using CRegisters = float[48];    // matches fma()'s d00..d47 arguments
+
+  CUTE_HOST_DEVICE static void     // emits one wgmma.mma_async; d00..d47 are read-write ("+f") operands
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %53, 0;\n"  // %53 is scale_D; p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n96k32.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
+      "{%48,  %49,  %50,  %51},"  // a00..a03
+      " %52,"  // desc_b
+      " p,    %54,  %55;\n"  // p, then the scaleA (%54) and scaleB (%55) immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no PTX is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x32 TN F16+=E5M2*E4M3 -- SS form: A and B each from a 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the PTX as an immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // passed to the PTX as an immediate ("n" operand)
+>
+struct MMA_64x128x32_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;         // fma() has no separate D outputs; the d* arguments are updated in place
+  using ARegisters = uint64_t[1];  // matches fma()'s desc_a argument
+  using BRegisters = uint64_t[1];  // matches fma()'s desc_b argument
+  using CRegisters = uint32_t[32]; // matches d00..d31; NOTE(review): presumably two packed f16 per register -- confirm
+
+  CUTE_HOST_DEVICE static void     // emits one wgmma.mma_async; d00..d31 are read-write ("+r") operands
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %34, 0;\n"  // %34 is scale_D; p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n128k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      " %32,"  // desc_a
+      " %33,"  // desc_b
+      " p,   %35, %36;\n"  // p, then the scaleA (%35) and scaleB (%36) immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no PTX is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x32 TN F16+=E5M2*E4M3 -- RS form: A from four 32-bit registers, B from a 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the PTX as an immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // passed to the PTX as an immediate ("n" operand)
+>
+struct MMA_64x128x32_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;         // fma() has no separate D outputs; the d* arguments are updated in place
+  using ARegisters = uint32_t[4];  // matches fma()'s a00..a03 arguments
+  using BRegisters = uint64_t[1];  // matches fma()'s desc_b argument
+  using CRegisters = uint32_t[32]; // matches d00..d31; NOTE(review): presumably two packed f16 per register -- confirm
+
+  CUTE_HOST_DEVICE static void     // emits one wgmma.mma_async; d00..d31 are read-write ("+r") operands
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %37, 0;\n"  // %37 is scale_D; p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n128k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      "{%32, %33, %34, %35},"  // a00..a03
+      " %36,"  // desc_b
+      " p,   %38, %39;\n"  // p, then the scaleA (%38) and scaleB (%39) immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no PTX is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x32 TN F32+=E5M2*E4M3 -- SS form: A and B each from a 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the PTX as an immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // passed to the PTX as an immediate ("n" operand)
+>
+struct MMA_64x128x32_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;         // fma() has no separate D outputs; the d* arguments are updated in place
+  using ARegisters = uint64_t[1];  // matches fma()'s desc_a argument
+  using BRegisters = uint64_t[1];  // matches fma()'s desc_b argument
+  using CRegisters = float[64];    // matches fma()'s d00..d63 arguments
+
+  CUTE_HOST_DEVICE static void     // emits one wgmma.mma_async; d00..d63 are read-write ("+f") operands
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %66, 0;\n"  // %66 is scale_D; p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n128k32.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      " %64,"  // desc_a
+      " %65,"  // desc_b
+      " p,    %67,  %68;\n"  // p, then the scaleA (%67) and scaleB (%68) immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no PTX is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x32 TN F32+=E5M2*E4M3 -- RS form: A from four 32-bit registers, B from a 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the PTX as an immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // passed to the PTX as an immediate ("n" operand)
+>
+struct MMA_64x128x32_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;         // fma() has no separate D outputs; the d* arguments are updated in place
+  using ARegisters = uint32_t[4];  // matches fma()'s a00..a03 arguments
+  using BRegisters = uint64_t[1];  // matches fma()'s desc_b argument
+  using CRegisters = float[64];    // matches fma()'s d00..d63 arguments
+
+  CUTE_HOST_DEVICE static void     // emits one wgmma.mma_async; d00..d63 are read-write ("+f") operands
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %69, 0;\n"  // %69 is scale_D; p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n128k32.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      "{%64,  %65,  %66,  %67},"  // a00..a03
+      " %68,"  // desc_b
+      " p,    %70,  %71;\n"  // p, then the scaleA (%70) and scaleB (%71) immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no PTX is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x32 TN F16+=E5M2*E4M3 -- SS form: A and B each from a 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the PTX as an immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // passed to the PTX as an immediate ("n" operand)
+>
+struct MMA_64x192x32_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;         // fma() has no separate D outputs; the d* arguments are updated in place
+  using ARegisters = uint64_t[1];  // matches fma()'s desc_a argument
+  using BRegisters = uint64_t[1];  // matches fma()'s desc_b argument
+  using CRegisters = uint32_t[48]; // matches d00..d47; NOTE(review): presumably two packed f16 per register -- confirm
+
+  CUTE_HOST_DEVICE static void     // emits one wgmma.mma_async; d00..d47 are read-write ("+r") operands
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %50, 0;\n"  // %50 is scale_D; p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n192k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"
+      " %48,"  // desc_a
+      " %49,"  // desc_b
+      " p,   %51, %52;\n"  // p, then the scaleA (%51) and scaleB (%52) immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no PTX is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x32 TN F16+=E5M2*E4M3 -- RS form: A from four 32-bit registers, B from a 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the PTX as an immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // passed to the PTX as an immediate ("n" operand)
+>
+struct MMA_64x192x32_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;         // fma() has no separate D outputs; the d* arguments are updated in place
+  using ARegisters = uint32_t[4];  // matches fma()'s a00..a03 arguments
+  using BRegisters = uint64_t[1];  // matches fma()'s desc_b argument
+  using CRegisters = uint32_t[48]; // matches d00..d47; NOTE(review): presumably two packed f16 per register -- confirm
+
+  CUTE_HOST_DEVICE static void     // emits one wgmma.mma_async; d00..d47 are read-write ("+r") operands
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %53, 0;\n"  // %53 is scale_D; p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n192k32.f16.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
+      "{%48,  %49,  %50,  %51},"  // a00..a03
+      " %52,"  // desc_b
+      " p,    %54,  %55;\n"  // p, then the scaleA (%54) and scaleB (%55) immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no PTX is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x32 TN F32+=E5M2*E4M3 -- _SS variant: A and B are each passed to wgmma as one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to wgmma as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to wgmma as an immediate ("n") operand
+>
+struct MMA_64x192x32_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches the single desc_a argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = float[96];    // matches the 96 float accumulators d00..d95 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      float         & d92, float         & d93, float         & d94, float         & d95,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the call-site line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %98, 0;\n"  // p = (scale_D != 0); operand 98 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n192k32.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      " %96,"  // desc_a
+      " %97,"  // desc_b
+      " p,    %99,  %100;\n"  // predicate p, then scaleA (operand 99) and scaleB (operand 100)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands 0..95: accumulators, read-write ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
+        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
+      :  "l"(desc_a),  // operand 96
+         "l"(desc_b),  // operand 97
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands 98, 99, 100
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x32 TN F32+=E5M2*E4M3 -- _RS variant: A is passed in four 32-bit registers, B as one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to wgmma as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to wgmma as an immediate ("n") operand
+>
+struct MMA_64x192x32_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches the four arguments a00..a03 of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = float[96];    // matches the 96 float accumulators d00..d95 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      float         & d92, float         & d93, float         & d94, float         & d95,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the call-site line and the B descriptor to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %101, 0;\n"  // p = (scale_D != 0); operand 101 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n192k32.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      "{%96,  %97,  %98,  %99},"  // a00..a03
+      " %100,"  // desc_b
+      " p,    %102, %103;\n"  // predicate p, then scaleA (operand 102) and scaleB (operand 103)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands 0..95: accumulators, read-write ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
+        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // operands 96..99
+         "l"(desc_b),  // operand 100
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands 101, 102, 103
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x32 TN F16+=E5M2*E4M3 -- _SS variant: A and B are each passed to wgmma as one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to wgmma as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to wgmma as an immediate ("n") operand
+>
+struct MMA_64x256x32_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];   // matches the single desc_a argument of fma()
+  using BRegisters = uint64_t[1];   // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[64];  // matches the 64 accumulators d00..d63 of fma(); presumably packed F16 pairs -- TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the call-site line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %66, 0;\n"  // p = (scale_D != 0); operand 66 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n256k32.f16.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      " %64,"  // desc_a
+      " %65,"  // desc_b
+      " p,    %67,  %68;\n"  // predicate p, then scaleA (operand 67) and scaleB (operand 68)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands 0..63: accumulators, read-write ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "l"(desc_a),  // operand 64
+         "l"(desc_b),  // operand 65
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands 66, 67, 68
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x32 TN F16+=E5M2*E4M3 -- _RS variant: A is passed in four 32-bit registers, B as one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to wgmma as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to wgmma as an immediate ("n") operand
+>
+struct MMA_64x256x32_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];   // matches the four arguments a00..a03 of fma()
+  using BRegisters = uint64_t[1];   // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[64];  // matches the 64 accumulators d00..d63 of fma(); presumably packed F16 pairs -- TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the call-site line and the B descriptor to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %69, 0;\n"  // p = (scale_D != 0); operand 69 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n256k32.f16.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      "{%64,  %65,  %66,  %67},"  // a00..a03
+      " %68,"  // desc_b
+      " p,    %70,  %71;\n"  // predicate p, then scaleA (operand 70) and scaleB (operand 71)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands 0..63: accumulators, read-write ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // operands 64..67
+         "l"(desc_b),  // operand 68
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands 69, 70, 71
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x32 TN F32+=E5M2*E4M3 -- _SS variant: A and B are each passed to wgmma as one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to wgmma as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to wgmma as an immediate ("n") operand
+>
+struct MMA_64x256x32_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches the single desc_a argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = float[128];   // matches the 128 float accumulators d000..d127 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      float         & d124, float         & d125, float         & d126, float         & d127,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the call-site line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %130, 0;\n"  // p = (scale_D != 0); operand 130 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n256k32.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      " %128,"  // desc_a
+      " %129,"  // desc_b
+      " p,    %131, %132;\n"  // predicate p, then scaleA (operand 131) and scaleB (operand 132)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands 0..127: accumulators, read-write ("+f")
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
+        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
+      :  "l"(desc_a),  // operand 128
+         "l"(desc_b),  // operand 129
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands 130, 131, 132
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x32 TN F32+=E5M2*E4M3 -- _RS variant: A is passed in four 32-bit registers, B as one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to wgmma as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to wgmma as an immediate ("n") operand
+>
+struct MMA_64x256x32_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches the four arguments a000..a003 of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = float[128];   // matches the 128 float accumulators d000..d127 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      float         & d124, float         & d125, float         & d126, float         & d127,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the call-site line and the B descriptor to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %133, 0;\n"  // p = (scale_D != 0); operand 133 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n256k32.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      "{%128, %129, %130, %131},"  // a000..a003
+      " %132,"  // desc_b
+      " p,    %134, %135;\n"  // predicate p, then scaleA (operand 134) and scaleB (operand 135)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands 0..127: accumulators, read-write ("+f")
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
+        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // operands 128..131
+         "l"(desc_b),  // operand 132
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands 133, 134, 135
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x32 TN F16+=E5M2*E5M2 -- SS variant: fma() takes A and B as 64-bit descriptors (desc_a, desc_b) and wraps one wgmma.mma_async m64n8k32 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an "n" (immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an "n" (immediate) operand
+>
+struct MMA_64x8x32_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_b
+  using CRegisters = uint32_t[2];  // two 32-bit accumulator registers, matching fma's d0..d1
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %4, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n8k32.f16.e5m2.e5m2 "
+      "{%0, %1},"  // accumulators d0..d1
+      " %2,"  // desc_a
+      " %3,"  // desc_b
+      " p,  %5, %6;\n"  // predicate p, then immediates scaleA (%5) and scaleB (%6)
+    "}\n"
+      : "+r"(d0), "+r"(d1)  // "+r": each accumulator is both an input and an output of the asm
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; scaleA/scaleB as immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // reached only when the guard macro is undefined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x32 TN F16+=E5M2*E5M2 -- RS variant: fma() takes A as four 32-bit registers (a0..a3) and B as a 64-bit descriptor (desc_b); wraps one wgmma.mma_async m64n8k32 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an "n" (immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an "n" (immediate) operand
+>
+struct MMA_64x8x32_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // four 32-bit values, matching fma's a0..a3
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_b
+  using CRegisters = uint32_t[2];  // two 32-bit accumulator registers, matching fma's d0..d1
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %7, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n8k32.f16.e5m2.e5m2 "
+      "{%0,  %1},"  // accumulators d0..d1
+      "{%2,  %3,  %4,  %5},"  // A registers a0..a3
+      " %6,"  // desc_b
+      " p,   %8,  %9;\n"  // predicate p, then immediates scaleA (%8) and scaleB (%9)
+    "}\n"
+      : "+r"(d0), "+r"(d1)  // "+r": each accumulator is both an input and an output of the asm
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; scaleA/scaleB as immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // reached only when the guard macro is undefined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x32 TN F32+=E5M2*E5M2 -- SS variant: fma() takes A and B as 64-bit descriptors (desc_a, desc_b) and wraps one wgmma.mma_async m64n8k32 instruction with float accumulators.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an "n" (immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an "n" (immediate) operand
+>
+struct MMA_64x8x32_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_b
+  using CRegisters = float[4];  // four float accumulators, matching fma's d0..d3
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %6, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n8k32.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3},"  // accumulators d0..d3
+      " %4,"  // desc_a
+      " %5,"  // desc_b
+      " p,   %7,  %8;\n"  // predicate p, then immediates scaleA (%7) and scaleB (%8)
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)  // "+f": each float accumulator is both an input and an output of the asm
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; scaleA/scaleB as immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // reached only when the guard macro is undefined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x8x32 TN F32+=E5M2*E5M2 -- RS variant: fma() takes A as four 32-bit registers (a0..a3) and B as a 64-bit descriptor (desc_b); wraps one wgmma.mma_async m64n8k32 instruction with float accumulators.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an "n" (immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an "n" (immediate) operand
+>
+struct MMA_64x8x32_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // four 32-bit values, matching fma's a0..a3
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_b
+  using CRegisters = float[4];  // four float accumulators, matching fma's d0..d3
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %9, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n8k32.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3},"  // accumulators d0..d3
+      "{%4,  %5,  %6,  %7},"  // A registers a0..a3
+      " %8,"  // desc_b
+      " p,   %10, %11;\n"  // predicate p, then immediates scaleA (%10) and scaleB (%11)
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)  // "+f": each float accumulator is both an input and an output of the asm
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; scaleA/scaleB as immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // reached only when the guard macro is undefined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x32 TN F16+=E5M2*E5M2 -- SS variant: fma() takes A and B as 64-bit descriptors (desc_a, desc_b) and wraps one wgmma.mma_async m64n16k32 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an "n" (immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an "n" (immediate) operand
+>
+struct MMA_64x16x32_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_b
+  using CRegisters = uint32_t[4];  // four 32-bit accumulator registers, matching fma's d0..d3
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %6, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n16k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3},"  // accumulators d0..d3
+      " %4,"  // desc_a
+      " %5,"  // desc_b
+      " p,   %7,  %8;\n"  // predicate p, then immediates scaleA (%7) and scaleB (%8)
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)  // "+r": each accumulator is both an input and an output of the asm
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; scaleA/scaleB as immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // reached only when the guard macro is undefined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x32 TN F16+=E5M2*E5M2 -- RS variant: fma() takes A as four 32-bit registers (a0..a3) and B as a 64-bit descriptor (desc_b); wraps one wgmma.mma_async m64n16k32 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an "n" (immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an "n" (immediate) operand
+>
+struct MMA_64x16x32_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // four 32-bit values, matching fma's a0..a3
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_b
+  using CRegisters = uint32_t[4];  // four 32-bit accumulator registers, matching fma's d0..d3
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %9, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n16k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3},"  // accumulators d0..d3
+      "{%4,  %5,  %6,  %7},"  // A registers a0..a3
+      " %8,"  // desc_b
+      " p,   %10, %11;\n"  // predicate p, then immediates scaleA (%10) and scaleB (%11)
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)  // "+r": each accumulator is both an input and an output of the asm
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; scaleA/scaleB as immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // reached only when the guard macro is undefined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x32 TN F32+=E5M2*E5M2 -- SS variant: fma() takes A and B as 64-bit descriptors (desc_a, desc_b) and wraps one wgmma.mma_async m64n16k32 instruction with float accumulators.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an "n" (immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an "n" (immediate) operand
+>
+struct MMA_64x16x32_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_b
+  using CRegisters = float[8];  // eight float accumulators, matching fma's d0..d7
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      float         & d4, float         & d5, float         & d6, float         & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %10, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n16k32.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // accumulators d0..d7
+      " %8,"  // desc_a
+      " %9,"  // desc_b
+      " p,   %11, %12;\n"  // predicate p, then immediates scaleA (%11) and scaleB (%12)
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),  // "+f": each float accumulator is both an input and an output of the asm
+        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; scaleA/scaleB as immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // reached only when the guard macro is undefined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x16x32 TN F32+=E5M2*E5M2 -- RS variant: fma() takes A as four 32-bit registers (a0..a3) and B as a 64-bit descriptor (desc_b); wraps one wgmma.mma_async m64n16k32 instruction with float accumulators.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an "n" (immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an "n" (immediate) operand
+>
+struct MMA_64x16x32_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // four 32-bit values, matching fma's a0..a3
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_b
+  using CRegisters = float[8];  // eight float accumulators, matching fma's d0..d7
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      float         & d4, float         & d5, float         & d6, float         & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %13, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n16k32.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // accumulators d0..d7
+      "{%8,  %9,  %10, %11},"  // A registers a0..a3
+      " %12,"  // desc_b
+      " p,   %14, %15;\n"  // predicate p, then immediates scaleA (%14) and scaleB (%15)
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),  // "+f": each float accumulator is both an input and an output of the asm
+        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; scaleA/scaleB as immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // reached only when the guard macro is undefined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x32 TN F16+=E5M2*E5M2 -- SS variant: fma() takes A and B as 64-bit descriptors (desc_a, desc_b) and wraps one wgmma.mma_async m64n32k32 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an "n" (immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an "n" (immediate) operand
+>
+struct MMA_64x32x32_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_b
+  using CRegisters = uint32_t[8];  // eight 32-bit accumulator registers, matching fma's d0..d7
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %10, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n32k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // accumulators d0..d7
+      " %8,"  // desc_a
+      " %9,"  // desc_b
+      " p,   %11, %12;\n"  // predicate p, then immediates scaleA (%11) and scaleB (%12)
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),  // "+r": each accumulator is both an input and an output of the asm
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; scaleA/scaleB as immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // reached only when the guard macro is undefined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x32 TN F16+=E5M2*E5M2 -- RS variant: fma() takes A as four 32-bit registers (a0..a3) and B as a 64-bit descriptor (desc_b); wraps one wgmma.mma_async m64n32k32 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an "n" (immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an "n" (immediate) operand
+>
+struct MMA_64x32x32_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // four 32-bit values, matching fma's a0..a3
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_b
+  using CRegisters = uint32_t[8];  // eight 32-bit accumulator registers, matching fma's d0..d7
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %13, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n32k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // accumulators d0..d7
+      "{%8,  %9,  %10, %11},"  // A registers a0..a3
+      " %12,"  // desc_b
+      " p,   %14, %15;\n"  // predicate p, then immediates scaleA (%14) and scaleB (%15)
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),  // "+r": each accumulator is both an input and an output of the asm
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; scaleA/scaleB as immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // reached only when the guard macro is undefined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x32 TN F32+=E5M2*E5M2 -- SS variant: fma() takes A and B as 64-bit descriptors (desc_a, desc_b) and wraps one wgmma.mma_async m64n32k32 instruction with float accumulators.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an "n" (immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an "n" (immediate) operand
+>
+struct MMA_64x32x32_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_b
+  using CRegisters = float[16];  // sixteen float accumulators, matching fma's d00..d15
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %18, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n32k32.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // accumulators d00..d15 occupy %0..%15
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      " %16,"  // desc_a
+      " %17,"  // desc_b
+      " p,   %19, %20;\n"  // predicate p, then immediates scaleA (%19) and scaleB (%20)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": each float accumulator is both an input and an output of the asm
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; scaleA/scaleB as immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // reached only when the guard macro is undefined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x32x32 TN F32+=E5M2*E5M2 -- RS variant: fma() takes A as four 32-bit registers (a00..a03) and B as a 64-bit descriptor (desc_b); wraps one wgmma.mma_async m64n32k32 instruction with float accumulators.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an "n" (immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an "n" (immediate) operand
+>
+struct MMA_64x32x32_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // four 32-bit values, matching fma's a00..a03
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_b
+  using CRegisters = float[16];  // sixteen float accumulators, matching fma's d00..d15
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %21, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n32k32.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // accumulators d00..d15 occupy %0..%15
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      "{%16, %17, %18, %19},"  // A registers a00..a03
+      " %20,"  // desc_b
+      " p,   %22, %23;\n"  // predicate p, then immediates scaleA (%22) and scaleB (%23)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": each float accumulator is both an input and an output of the asm
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; scaleA/scaleB as immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // reached only when the guard macro is undefined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x32 TN F16+=E5M2*E5M2 -- SS variant: fma() takes A and B as 64-bit descriptors (desc_a, desc_b) and wraps one wgmma.mma_async m64n64k32 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an "n" (immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an "n" (immediate) operand
+>
+struct MMA_64x64x32_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_b
+  using CRegisters = uint32_t[16];  // sixteen 32-bit accumulator registers, matching fma's d00..d15
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %18, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n64k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // accumulators d00..d15 occupy %0..%15
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      " %16,"  // desc_a
+      " %17,"  // desc_b
+      " p,   %19, %20;\n"  // predicate p, then immediates scaleA (%19) and scaleB (%20)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both an input and an output of the asm
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; scaleA/scaleB as immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // reached only when the guard macro is undefined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x32 TN F16+=E5M2*E5M2 -- RS variant: fma() takes A as four 32-bit registers (a00..a03) and B as a 64-bit descriptor (desc_b); wraps one wgmma.mma_async m64n64k32 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an "n" (immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an "n" (immediate) operand
+>
+struct MMA_64x64x32_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // four 32-bit values, matching fma's a00..a03
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_b
+  using CRegisters = uint32_t[16];  // sixteen 32-bit accumulator registers, matching fma's d00..d15
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %21, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n64k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // accumulators d00..d15 occupy %0..%15
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      "{%16, %17, %18, %19},"  // A registers a00..a03
+      " %20,"  // desc_b
+      " p,   %22, %23;\n"  // predicate p, then immediates scaleA (%22) and scaleB (%23)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both an input and an output of the asm
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; scaleA/scaleB as immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // reached only when the guard macro is undefined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x32 TN F32+=E5M2*E5M2 -- SS variant: fma() takes A and B as 64-bit descriptors (desc_a, desc_b) and wraps one wgmma.mma_async m64n64k32 instruction with float accumulators.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an "n" (immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an "n" (immediate) operand
+>
+struct MMA_64x64x32_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_b
+  using CRegisters = float[32];  // thirty-two float accumulators, matching fma's d00..d31
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %34, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n64k32.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // accumulators d00..d31 occupy %0..%31
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      " %32,"  // desc_a
+      " %33,"  // desc_b
+      " p,   %35, %36;\n"  // predicate p, then immediates scaleA (%35) and scaleB (%36)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": each float accumulator is both an input and an output of the asm
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; scaleA/scaleB as immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // reached only when the guard macro is undefined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x64x32 TN F32+=E5M2*E5M2 -- RS variant: fma() takes A as four 32-bit registers (a00..a03) and B as a 64-bit descriptor (desc_b); wraps one wgmma.mma_async m64n64k32 instruction with float accumulators.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an "n" (immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an "n" (immediate) operand
+>
+struct MMA_64x64x32_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // four 32-bit values, matching fma's a00..a03
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_b
+  using CRegisters = float[32];  // thirty-two float accumulators, matching fma's d00..d31
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %37, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n64k32.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // accumulators d00..d31 occupy %0..%31
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      "{%32, %33, %34, %35},"  // A registers a00..a03
+      " %36,"  // desc_b
+      " p,   %38, %39;\n"  // predicate p, then immediates scaleA (%38) and scaleB (%39)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": each float accumulator is both an input and an output of the asm
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; scaleA/scaleB as immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // reached only when the guard macro is undefined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x32 TN F16+=E5M2*E5M2 -- SS variant: fma() takes A and B as 64-bit descriptors (desc_a, desc_b) and wraps one wgmma.mma_async m64n96k32 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an "n" (immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an "n" (immediate) operand
+>
+struct MMA_64x96x32_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_b
+  using CRegisters = uint32_t[24];  // twenty-four 32-bit accumulator registers, matching fma's d00..d23
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %26, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n96k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // accumulators d00..d23 occupy %0..%23
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      " %24,"  // desc_a
+      " %25,"  // desc_b
+      " p,   %27, %28;\n"  // predicate p, then immediates scaleA (%27) and scaleB (%28)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both an input and an output of the asm
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; scaleA/scaleB as immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // reached only when the guard macro is undefined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x32 TN F16+=E5M2*E5M2 -- RS variant: fma() takes A as four 32-bit registers (a00..a03) and B as a 64-bit descriptor (desc_b); wraps one wgmma.mma_async m64n96k32 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an "n" (immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an "n" (immediate) operand
+>
+struct MMA_64x96x32_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // four 32-bit values, matching fma's a00..a03
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_b
+  using CRegisters = uint32_t[24];  // twenty-four 32-bit accumulator registers, matching fma's d00..d23
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %29, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n96k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // accumulators d00..d23 occupy %0..%23
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      "{%24, %25, %26, %27},"  // A registers a00..a03
+      " %28,"  // desc_b
+      " p,   %30, %31;\n"  // predicate p, then immediates scaleA (%30) and scaleB (%31)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both an input and an output of the asm
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; scaleA/scaleB as immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // reached only when the guard macro is undefined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x32 TN F32+=E5M2*E5M2 (SS: A and B both via 64-bit descriptors desc_a, desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the asm as an "n" (immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // handed to the asm as an "n" (immediate) operand
+>
+struct MMA_64x96x32_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: the d00..d47 arguments are read-write ("+f")
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor (desc_a)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = float[48];  // accumulator: 48 x float (d00..d47)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand descriptor
+      uint64_t const& desc_b,  // B operand descriptor
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d47: accumulators, read and written
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %50, 0;\n"  // p = (scale_D != 0); %50 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n96k32.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%47 = d00..d47
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"
+      " %48,"  // desc_a
+      " %49,"  // desc_b
+      " p,   %51, %52;\n"  // predicate from scale_D, then %51 = scaleA, %52 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands %0..%47, read-write
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
+      :  "l"(desc_a),  // input %48
+         "l"(desc_b),  // input %49
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %50, %51, %52
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x96x32 TN F32+=E5M2*E5M2 (RS: A in registers a00..a03, B via 64-bit descriptor desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the asm as an "n" (immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // handed to the asm as an "n" (immediate) operand
+>
+struct MMA_64x96x32_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: the d00..d47 arguments are read-write ("+f")
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = float[48];  // accumulator: 48 x float (d00..d47)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand registers
+      uint64_t const& desc_b,  // B operand descriptor
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d47: accumulators, read and written
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %53, 0;\n"  // p = (scale_D != 0); %53 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n96k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%47 = d00..d47
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
+      "{%48,  %49,  %50,  %51},"  // a00..a03
+      " %52,"  // desc_b
+      " p,    %54,  %55;\n"  // predicate from scale_D, then %54 = scaleA, %55 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands %0..%47, read-write
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs %48..%51
+         "l"(desc_b),  // input %52
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %53, %54, %55
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x32 TN F16+=E5M2*E5M2 (SS: A and B both via 64-bit descriptors desc_a, desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the asm as an "n" (immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // handed to the asm as an "n" (immediate) operand
+>
+struct MMA_64x128x32_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: the d00..d31 arguments are read-write ("+r")
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor (desc_a)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[32];  // accumulator: 32 x uint32_t (d00..d31)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand descriptor
+      uint64_t const& desc_b,  // B operand descriptor
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d31: accumulators, read and written
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %34, 0;\n"  // p = (scale_D != 0); %34 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n128k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%31 = d00..d31
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      " %32,"  // desc_a
+      " %33,"  // desc_b
+      " p,   %35, %36;\n"  // predicate from scale_D, then %35 = scaleA, %36 = scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%31, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "l"(desc_a),  // input %32
+         "l"(desc_b),  // input %33
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %34, %35, %36
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x32 TN F16+=E5M2*E5M2 (RS: A in registers a00..a03, B via 64-bit descriptor desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the asm as an "n" (immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // handed to the asm as an "n" (immediate) operand
+>
+struct MMA_64x128x32_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: the d00..d31 arguments are read-write ("+r")
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[32];  // accumulator: 32 x uint32_t (d00..d31)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand registers
+      uint64_t const& desc_b,  // B operand descriptor
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d31: accumulators, read and written
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %37, 0;\n"  // p = (scale_D != 0); %37 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n128k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%31 = d00..d31
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      "{%32, %33, %34, %35},"  // a00..a03
+      " %36,"  // desc_b
+      " p,   %38, %39;\n"  // predicate from scale_D, then %38 = scaleA, %39 = scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%31, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs %32..%35
+         "l"(desc_b),  // input %36
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %37, %38, %39
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x32 TN F32+=E5M2*E5M2 (SS: A and B both via 64-bit descriptors desc_a, desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the asm as an "n" (immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // handed to the asm as an "n" (immediate) operand
+>
+struct MMA_64x128x32_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: the d00..d63 arguments are read-write ("+f")
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor (desc_a)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = float[64];  // accumulator: 64 x float (d00..d63)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand descriptor
+      uint64_t const& desc_b,  // B operand descriptor
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d63: accumulators, read and written
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %66, 0;\n"  // p = (scale_D != 0); %66 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n128k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%63 = d00..d63
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      " %64,"  // desc_a
+      " %65,"  // desc_b
+      " p,    %67,  %68;\n"  // predicate from scale_D, then %67 = scaleA, %68 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands %0..%63, read-write
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
+      :  "l"(desc_a),  // input %64
+         "l"(desc_b),  // input %65
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %66, %67, %68
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x128x32 TN F32+=E5M2*E5M2 (RS: A in registers a00..a03, B via 64-bit descriptor desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the asm as an "n" (immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // handed to the asm as an "n" (immediate) operand
+>
+struct MMA_64x128x32_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: the d00..d63 arguments are read-write ("+f")
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = float[64];  // accumulator: 64 x float (d00..d63)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand registers
+      uint64_t const& desc_b,  // B operand descriptor
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d63: accumulators, read and written
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %69, 0;\n"  // p = (scale_D != 0); %69 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n128k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%63 = d00..d63
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      "{%64,  %65,  %66,  %67},"  // a00..a03
+      " %68,"  // desc_b
+      " p,    %70,  %71;\n"  // predicate from scale_D, then %70 = scaleA, %71 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands %0..%63, read-write
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs %64..%67
+         "l"(desc_b),  // input %68
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %69, %70, %71
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x32 TN F16+=E5M2*E5M2 (SS: A and B both via 64-bit descriptors desc_a, desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the asm as an "n" (immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // handed to the asm as an "n" (immediate) operand
+>
+struct MMA_64x192x32_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: the d00..d47 arguments are read-write ("+r")
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor (desc_a)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[48];  // accumulator: 48 x uint32_t (d00..d47)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand descriptor
+      uint64_t const& desc_b,  // B operand descriptor
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d47: accumulators, read and written
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %50, 0;\n"  // p = (scale_D != 0); %50 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n192k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%47 = d00..d47
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"
+      " %48,"  // desc_a
+      " %49,"  // desc_b
+      " p,   %51, %52;\n"  // predicate from scale_D, then %51 = scaleA, %52 = scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%47, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "l"(desc_a),  // input %48
+         "l"(desc_b),  // input %49
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %50, %51, %52
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x32 TN F16+=E5M2*E5M2 (RS: A in registers a00..a03, B via 64-bit descriptor desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the asm as an "n" (immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // handed to the asm as an "n" (immediate) operand
+>
+struct MMA_64x192x32_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: the d00..d47 arguments are read-write ("+r")
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[48];  // accumulator: 48 x uint32_t (d00..d47)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand registers
+      uint64_t const& desc_b,  // B operand descriptor
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d47: accumulators, read and written
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %53, 0;\n"  // p = (scale_D != 0); %53 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n192k32.f16.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%47 = d00..d47
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
+      "{%48,  %49,  %50,  %51},"  // a00..a03
+      " %52,"  // desc_b
+      " p,    %54,  %55;\n"  // predicate from scale_D, then %54 = scaleA, %55 = scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%47, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs %48..%51
+         "l"(desc_b),  // input %52
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %53, %54, %55
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x32 TN F32+=E5M2*E5M2 (SS: A and B both via 64-bit descriptors desc_a, desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the asm as an "n" (immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // handed to the asm as an "n" (immediate) operand
+>
+struct MMA_64x192x32_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: the d00..d95 arguments are read-write ("+f")
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor (desc_a)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = float[96];  // accumulator: 96 x float (d00..d95)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand descriptor
+      uint64_t const& desc_b,  // B operand descriptor
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d95: accumulators, read and written
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      float         & d92, float         & d93, float         & d94, float         & d95,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %98, 0;\n"  // p = (scale_D != 0); %98 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n192k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%95 = d00..d95
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      " %96,"  // desc_a
+      " %97,"  // desc_b
+      " p,    %99,  %100;\n"  // predicate from scale_D, then %99 = scaleA, %100 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands %0..%95, read-write
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
+        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
+      :  "l"(desc_a),  // input %96
+         "l"(desc_b),  // input %97
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %98, %99, %100
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x192x32 TN F32+=E5M2*E5M2 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as an immediate ("n") operand
+>
+struct MMA_64x192x32_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;         // no separate D registers: the d* accumulators below are read-modify-write ("+f")
+  using ARegisters = uint32_t[4];  // A operand: a00..a03
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[96];    // accumulators: d00..d95
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async m64n192k32.f32.e5m2.e5m2 that updates d00..d95 in place
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      float         & d92, float         & d93, float         & d94, float         & d95,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and tested != 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the call-site line and desc_b to the synclog hook
+    asm volatile(  // operand map: %0-%95 = d00..d95, %96-%99 = a00..a03, %100 = desc_b, %101 = scale_D, %102 = scaleA, %103 = scaleB
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %101, 0;\n"  // p = (scale_D != 0); NOTE(review): per the PTX wgmma spec p presumably selects D = A*B + D vs D = A*B - confirm
+      "wgmma.mma_async.sync.aligned.m64n192k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      "{%96,  %97,  %98,  %99},"
+      " %100,"
+      " p,    %102, %103;\n"  // trailing operands: predicate p, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
+        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: any call reports an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x32 TN F16+=E5M2*E5M2 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as an immediate ("n") operand
+>
+struct MMA_64x256x32_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;          // no separate D registers: the d* accumulators below are read-modify-write ("+r")
+  using ARegisters = uint64_t[1];   // A operand: desc_a
+  using BRegisters = uint64_t[1];   // B operand: desc_b
+  using CRegisters = uint32_t[64];  // accumulators: d00..d63 as 32-bit registers; presumably two packed f16 values each - TODO confirm
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async m64n256k32.f16.e5m2.e5m2 that updates d00..d63 in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and tested != 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the call-site line and both descriptors to the synclog hook
+    asm volatile(  // operand map: %0-%63 = d00..d63, %64 = desc_a, %65 = desc_b, %66 = scale_D, %67 = scaleA, %68 = scaleB
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %66, 0;\n"  // p = (scale_D != 0); NOTE(review): per the PTX wgmma spec p presumably selects D = A*B + D vs D = A*B - confirm
+      "wgmma.mma_async.sync.aligned.m64n256k32.f16.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      " %64,"
+      " %65,"
+      " p,    %67,  %68;\n"  // trailing operands: predicate p, then the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: any call reports an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x32 TN F16+=E5M2*E5M2 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as an immediate ("n") operand
+>
+struct MMA_64x256x32_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;          // no separate D registers: the d* accumulators below are read-modify-write ("+r")
+  using ARegisters = uint32_t[4];   // A operand: a00..a03
+  using BRegisters = uint64_t[1];   // B operand: desc_b
+  using CRegisters = uint32_t[64];  // accumulators: d00..d63 as 32-bit registers; presumably two packed f16 values each - TODO confirm
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async m64n256k32.f16.e5m2.e5m2 that updates d00..d63 in place
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and tested != 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the call-site line and desc_b to the synclog hook
+    asm volatile(  // operand map: %0-%63 = d00..d63, %64-%67 = a00..a03, %68 = desc_b, %69 = scale_D, %70 = scaleA, %71 = scaleB
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %69, 0;\n"  // p = (scale_D != 0); NOTE(review): per the PTX wgmma spec p presumably selects D = A*B + D vs D = A*B - confirm
+      "wgmma.mma_async.sync.aligned.m64n256k32.f16.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      "{%64,  %65,  %66,  %67},"
+      " %68,"
+      " p,    %70,  %71;\n"  // trailing operands: predicate p, then the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: any call reports an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x32 TN F32+=E5M2*E5M2 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as an immediate ("n") operand
+>
+struct MMA_64x256x32_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;         // no separate D registers: the d* accumulators below are read-modify-write ("+f")
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[128];   // accumulators: d000..d127
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async m64n256k32.f32.e5m2.e5m2 that updates d000..d127 in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      float         & d124, float         & d125, float         & d126, float         & d127,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and tested != 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the call-site line and both descriptors to the synclog hook
+    asm volatile(  // operand map: %0-%127 = d000..d127, %128 = desc_a, %129 = desc_b, %130 = scale_D, %131 = scaleA, %132 = scaleB
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %130, 0;\n"  // p = (scale_D != 0); NOTE(review): per the PTX wgmma spec p presumably selects D = A*B + D vs D = A*B - confirm
+      "wgmma.mma_async.sync.aligned.m64n256k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      " %128,"
+      " %129,"
+      " p,    %131, %132;\n"  // trailing operands: predicate p, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
+        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: any call reports an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x256x32 TN F32+=E5M2*E5M2 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as an immediate ("n") operand
+>
+struct MMA_64x256x32_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;         // no separate D registers: the d* accumulators below are read-modify-write ("+f")
+  using ARegisters = uint32_t[4];  // A operand: a000..a003
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[128];   // accumulators: d000..d127
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async m64n256k32.f32.e5m2.e5m2 that updates d000..d127 in place
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      float         & d124, float         & d125, float         & d126, float         & d127,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and tested != 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the call-site line and desc_b to the synclog hook
+    asm volatile(  // operand map: %0-%127 = d000..d127, %128-%131 = a000..a003, %132 = desc_b, %133 = scale_D, %134 = scaleA, %135 = scaleB
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %133, 0;\n"  // p = (scale_D != 0); NOTE(review): per the PTX wgmma spec p presumably selects D = A*B + D vs D = A*B - confirm
+      "wgmma.mma_async.sync.aligned.m64n256k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      "{%128, %129, %130, %131},"
+      " %132,"
+      " p,    %134, %135;\n"  // trailing operands: predicate p, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
+        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: any call reports an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace SM90::GMMA
+
+} // namespace cute
+
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+#include "mma_sm90_gmma_ext.hpp"
+#endif
diff --git a/lightllm-kernel/cutlass/include/cute/arch/mma_sm90_gmma_ext.hpp b/lightllm-kernel/cutlass/include/cute/arch/mma_sm90_gmma_ext.hpp
new file mode 100755
index 000000000..10a36aff8
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/arch/mma_sm90_gmma_ext.hpp
@@ -0,0 +1,56445 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+ 
+#pragma once
+  
+#include <cute/config.hpp>                // CUTE_HOST_DEVICE
+
+#include "cutlass/arch/synclog.hpp"
+
+// Config: define CUTE_ARCH_MMA_SM90A_ENABLED only when __CUDA_ARCH__ >= 900 and __CUDA_ARCH_FEAT_SM90_ALL is defined; the fma() bodies below are gated on it.
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && defined(__CUDA_ARCH_FEAT_SM90_ALL))
+#  define CUTE_ARCH_MMA_SM90A_ENABLED
+#endif
+
+namespace cute {
+
+namespace SM90::GMMA {
+
+// GMMA 64x24x16 F16+=F16*F16 -- SS variant: A and B are both passed as uint64_t descriptors (desc_a, desc_b); the 6 uint32_t accumulator registers d0..d5 are updated in place.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x24x16_F16F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[6];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %8, 0;\n"  // p = (scale_D != 0); %8 is the int32_t(scale_D) operand
+      "wgmma.mma_async.sync.aligned.m64n24k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5},"
+      " %6,"
+      " %7,"
+      " p,   %9,  %10, %11, %12;\n"
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x16 F16+=F16*F16 -- RS variant: A is passed in 4 uint32_t registers (a0..a3, K-major only), B as a uint64_t descriptor (desc_b); the 6 uint32_t accumulator registers d0..d5 are updated in place.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x24x16_F16F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[6];
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %11, 0;\n"  // p = (scale_D != 0); %11 is the int32_t(scale_D) operand
+      "wgmma.mma_async.sync.aligned.m64n24k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5},"
+      "{%6,  %7,  %8,  %9},"
+      " %10,"
+      " p,   %12, %13, %14;\n"
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x40x16 F16+=F16*F16 -- SS variant: A and B are both passed as uint64_t descriptors (desc_a, desc_b); the 10 uint32_t accumulator registers d00..d09 are updated in place.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x40x16_F16F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[10];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %12, 0;\n"  // p = (scale_D != 0); %12 is the int32_t(scale_D) operand
+      "wgmma.mma_async.sync.aligned.m64n40k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9},"
+      " %10,"
+      " %11,"
+      " p,   %13, %14, %15, %16;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x40x16 F16+=F16*F16 -- RS variant: A is passed in 4 uint32_t registers (a00..a03, K-major only), B as a uint64_t descriptor (desc_b); the 10 uint32_t accumulator registers d00..d09 are updated in place.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x40x16_F16F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[10];
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %15, 0;\n"  // p = (scale_D != 0); %15 is the int32_t(scale_D) operand
+      "wgmma.mma_async.sync.aligned.m64n40k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9},"
+      "{%10, %11, %12, %13},"
+      " %14,"
+      " p,   %16, %17, %18;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x16 F16+=F16*F16 -- SS variant: A and B are both passed as uint64_t descriptors (desc_a, desc_b); the 12 uint32_t accumulator registers d00..d11 are updated in place.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x48x16_F16F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[12];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %14, 0;\n"  // p = (scale_D != 0); %14 is the int32_t(scale_D) operand
+      "wgmma.mma_async.sync.aligned.m64n48k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      " %12,"
+      " %13,"
+      " p,   %15, %16, %17, %18;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x16 F16+=F16*F16 -- RS variant: A is passed in 4 uint32_t registers (a00..a03, K-major only), B as a uint64_t descriptor (desc_b); the 12 uint32_t accumulator registers d00..d11 are updated in place.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x48x16_F16F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[12];
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %17, 0;\n"  // p = (scale_D != 0); %17 is the int32_t(scale_D) operand
+      "wgmma.mma_async.sync.aligned.m64n48k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      "{%12, %13, %14, %15},"
+      " %16,"
+      " p,   %18, %19, %20;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x56x16 F16+=F16*F16 -- SS variant: A and B are both passed as uint64_t descriptors (desc_a, desc_b); the 14 uint32_t accumulator registers d00..d13 are updated in place.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x56x16_F16F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[14];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %16, 0;\n"  // p = (scale_D != 0); %16 is the int32_t(scale_D) operand
+      "wgmma.mma_async.sync.aligned.m64n56k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13},"
+      " %14,"
+      " %15,"
+      " p,   %17, %18, %19, %20;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x56x16 F16+=F16*F16 -- RS variant: A is passed in 4 uint32_t registers (a00..a03, K-major only), B as a uint64_t descriptor (desc_b); the 14 uint32_t accumulator registers d00..d13 are updated in place.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x56x16_F16F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[14];
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %19, 0;\n"  // p = (scale_D != 0); %19 is the int32_t(scale_D) operand
+      "wgmma.mma_async.sync.aligned.m64n56k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13},"
+      "{%14, %15, %16, %17},"
+      " %18,"
+      " p,   %20, %21, %22;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x72x16 F16+=F16*F16 -- SS variant: A and B are both passed as uint64_t descriptors (desc_a, desc_b); the 18 uint32_t accumulator registers d00..d17 are updated in place.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x72x16_F16F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[18];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %20, 0;\n"  // p = (scale_D != 0); %20 is the int32_t(scale_D) operand
+      "wgmma.mma_async.sync.aligned.m64n72k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17},"
+      " %18,"
+      " %19,"
+      " p,   %21, %22, %23, %24;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x72x16 F16+=F16*F16 -- RS variant: A is passed in 4 uint32_t registers (a00..a03, K-major only), B as a uint64_t descriptor (desc_b); the 18 uint32_t accumulator registers d00..d17 are updated in place.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x72x16_F16F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[18];
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %23, 0;\n"  // p = (scale_D != 0); %23 is the int32_t(scale_D) operand
+      "wgmma.mma_async.sync.aligned.m64n72k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17},"
+      "{%18, %19, %20, %21},"
+      " %22,"
+      " p,   %24, %25, %26;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x16 F16+=F16*F16 -- SS variant: A and B are both passed as uint64_t descriptors (desc_a, desc_b); the 20 uint32_t accumulator registers d00..d19 are updated in place.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x80x16_F16F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[20];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %22, 0;\n"  // p = (scale_D != 0); %22 is the int32_t(scale_D) operand
+      "wgmma.mma_async.sync.aligned.m64n80k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"
+      " %20,"
+      " %21,"
+      " p,   %23, %24, %25, %26;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x16 F16+=F16*F16 -- RS variant: A is passed in 4 uint32_t registers (a00..a03, K-major only), B as a uint64_t descriptor (desc_b); the 20 uint32_t accumulator registers d00..d19 are updated in place.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x80x16_F16F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[20];
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %25, 0;\n"  // p = (scale_D != 0); %25 is the int32_t(scale_D) operand
+      "wgmma.mma_async.sync.aligned.m64n80k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"
+      "{%20, %21, %22, %23},"
+      " %24,"
+      " p,   %26, %27, %28;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x88x16 F16+=F16*F16 -- SS variant: A and B are both passed as uint64_t descriptors (desc_a, desc_b); the 22 uint32_t accumulator registers d00..d21 are updated in place.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x88x16_F16F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[22];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %24, 0;\n"  // p = (scale_D != 0); %24 is the int32_t(scale_D) operand
+      "wgmma.mma_async.sync.aligned.m64n88k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21},"
+      " %22,"
+      " %23,"
+      " p,   %25, %26, %27, %28;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x88x16 F16+=F16*F16 -- RS variant: A is passed in 4 uint32_t registers (a00..a03, K-major only), B as a uint64_t descriptor (desc_b); the 22 uint32_t accumulator registers d00..d21 are updated in place.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x88x16_F16F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[22];
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %27, 0;\n"  // p = (scale_D != 0); %27 is the int32_t(scale_D) operand
+      "wgmma.mma_async.sync.aligned.m64n88k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21},"
+      "{%22, %23, %24, %25},"
+      " %26,"
+      " p,   %28, %29, %30;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x104x16 F16+=F16*F16 -- SS variant: A and B are both passed as uint64_t descriptors (desc_a, desc_b); the 26 uint32_t accumulator registers d00..d25 are updated in place.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x104x16_F16F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[26];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %28, 0;\n"  // p = (scale_D != 0); %28 is the int32_t(scale_D) operand
+      "wgmma.mma_async.sync.aligned.m64n104k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25},"
+      " %26,"
+      " %27,"
+      " p,   %29, %30, %31, %32;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x104x16 F16+=F16*F16, RS form: fma() receives A as four 32-bit registers (a00..a03) and B as a 64-bit descriptor (desc_b).
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not passed to the PTX instruction
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x104x16_F16F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[26];  // 26 entries, the same count as the d00..d25 arguments of fma()
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %31, 0;\n"  // p = (scale_D != 0); %31 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n104k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25},"  // %0..%25 are d00..d25, bound read-write ("+r")
+      "{%26, %27, %28, %29},"  // a00..a03
+      " %30,"  // desc_b
+      " p,   %32, %33, %34;\n"  // p, then the "n" immediates scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x16 F16+=F16*F16, SS form: fma() receives both A and B as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x112x16_F16F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[28];  // 28 entries, the same count as the d00..d27 arguments of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %30, 0;\n"  // p = (scale_D != 0); %30 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n112k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"  // %0..%27 are d00..d27, bound read-write ("+r")
+      " %28,"  // desc_a
+      " %29,"  // desc_b
+      " p,   %31, %32, %33, %34;\n"  // p, then the "n" immediates scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x16 F16+=F16*F16, RS form: fma() receives A as four 32-bit registers (a00..a03) and B as a 64-bit descriptor (desc_b).
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not passed to the PTX instruction
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x112x16_F16F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[28];  // 28 entries, the same count as the d00..d27 arguments of fma()
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %33, 0;\n"  // p = (scale_D != 0); %33 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n112k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"  // %0..%27 are d00..d27, bound read-write ("+r")
+      "{%28, %29, %30, %31},"  // a00..a03
+      " %32,"  // desc_b
+      " p,   %34, %35, %36;\n"  // p, then the "n" immediates scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x120x16 F16+=F16*F16, SS form: fma() receives both A and B as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x120x16_F16F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[30];  // 30 entries, the same count as the d00..d29 arguments of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %32, 0;\n"  // p = (scale_D != 0); %32 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n120k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29},"  // %0..%29 are d00..d29, bound read-write ("+r")
+      " %30,"  // desc_a
+      " %31,"  // desc_b
+      " p,   %33, %34, %35, %36;\n"  // p, then the "n" immediates scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x120x16 F16+=F16*F16, RS form: fma() receives A as four 32-bit registers (a00..a03) and B as a 64-bit descriptor (desc_b).
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not passed to the PTX instruction
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x120x16_F16F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[30];  // 30 entries, the same count as the d00..d29 arguments of fma()
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %35, 0;\n"  // p = (scale_D != 0); %35 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n120k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29},"  // %0..%29 are d00..d29, bound read-write ("+r")
+      "{%30, %31, %32, %33},"  // a00..a03
+      " %34,"  // desc_b
+      " p,   %36, %37, %38;\n"  // p, then the "n" immediates scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x136x16 F16+=F16*F16, SS form: fma() receives both A and B as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x136x16_F16F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[34];  // 34 entries, the same count as the d00..d33 arguments of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %36, 0;\n"  // p = (scale_D != 0); %36 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n136k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33},"  // %0..%33 are d00..d33, bound read-write ("+r")
+      " %34,"  // desc_a
+      " %35,"  // desc_b
+      " p,   %37, %38, %39, %40;\n"  // p, then the "n" immediates scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x136x16 F16+=F16*F16, RS form: fma() receives A as four 32-bit registers (a00..a03) and B as a 64-bit descriptor (desc_b).
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not passed to the PTX instruction
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x136x16_F16F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[34];  // 34 entries, the same count as the d00..d33 arguments of fma()
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %39, 0;\n"  // p = (scale_D != 0); %39 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n136k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33},"  // %0..%33 are d00..d33, bound read-write ("+r")
+      "{%34, %35, %36, %37},"  // a00..a03
+      " %38,"  // desc_b
+      " p,   %40, %41, %42;\n"  // p, then the "n" immediates scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x16 F16+=F16*F16, SS form: fma() receives both A and B as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x144x16_F16F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[36];  // 36 entries, the same count as the d00..d35 arguments of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %38, 0;\n"  // p = (scale_D != 0); %38 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n144k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"  // %0..%35 are d00..d35, bound read-write ("+r")
+      " %36,"  // desc_a
+      " %37,"  // desc_b
+      " p,   %39, %40, %41, %42;\n"  // p, then the "n" immediates scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x16 F16+=F16*F16, RS form: fma() receives A as four 32-bit registers (a00..a03) and B as a 64-bit descriptor (desc_b).
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not passed to the PTX instruction
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x144x16_F16F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[36];  // 36 entries, the same count as the d00..d35 arguments of fma()
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %41, 0;\n"  // p = (scale_D != 0); %41 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n144k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"  // %0..%35 are d00..d35, bound read-write ("+r")
+      "{%36, %37, %38, %39},"  // a00..a03
+      " %40,"  // desc_b
+      " p,   %42, %43, %44;\n"  // p, then the "n" immediates scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x152x16 F16+=F16*F16, SS form: fma() receives both A and B as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x152x16_F16F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[38];  // 38 entries, the same count as the d00..d37 arguments of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %40, 0;\n"  // p = (scale_D != 0); %40 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n152k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37},"  // %0..%37 are d00..d37, bound read-write ("+r")
+      " %38,"  // desc_a
+      " %39,"  // desc_b
+      " p,   %41, %42, %43, %44;\n"  // p, then the "n" immediates scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x152x16 F16+=F16*F16, RS form: fma() receives A as four 32-bit registers (a00..a03) and B as a 64-bit descriptor (desc_b).
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not passed to the PTX instruction
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x152x16_F16F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[38];  // 38 entries, the same count as the d00..d37 arguments of fma()
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %43, 0;\n"  // p = (scale_D != 0); %43 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n152k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37},"  // %0..%37 are d00..d37, bound read-write ("+r")
+      "{%38, %39, %40, %41},"  // a00..a03
+      " %42,"  // desc_b
+      " p,   %44, %45, %46;\n"  // p, then the "n" immediates scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x16 F16+=F16*F16, SS form: fma() receives both A and B as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x160x16_F16F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[40];  // 40 entries, the same count as the d00..d39 arguments of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %42, 0;\n"  // p = (scale_D != 0); %42 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n160k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"  // %0..%39 are d00..d39, bound read-write ("+r")
+      " %40,"  // desc_a
+      " %41,"  // desc_b
+      " p,   %43, %44, %45, %46;\n"  // p, then the "n" immediates scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x16 F16+=F16*F16 (RS variant: A comes from four 32-bit registers, B from a 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // must be GMMA::Major::K (static_assert below); not passed to the instruction
+  GMMA::Major tnspB,  // forwarded to the instruction as an immediate operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded as an immediate operand
+>
+struct MMA_64x160x16_F16F16F16_RS
+{
+  using DRegisters = void;  // NOTE(review): presumably D aliases the C accumulators (in-place update) - confirm
+  using ARegisters = uint32_t[4];  // four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[40];  // 40 x 32-bit accumulators (d00..d39), read and written by fma
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // operand A registers
+      uint64_t const& desc_b,  // 64-bit descriptor for operand B
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %45, 0;\n"  // %45 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n160k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"  // %0..%39 = d00..d39
+      "{%40, %41, %42, %43},"  // a00..a03
+      " %44,"  // desc_b
+      " p,   %46, %47, %48;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // accumulators are read-write (+r) operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands start here; numbering continues at %40
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x168x16 F16+=F16*F16 (SS variant: A and B are both supplied as 64-bit descriptors)
+template <
+  GMMA::Major tnspA,  // forwarded to the instruction as an immediate operand
+  GMMA::Major tnspB,  // forwarded to the instruction as an immediate operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded as an immediate operand
+>
+struct MMA_64x168x16_F16F16F16_SS
+{
+  using DRegisters = void;  // NOTE(review): presumably D aliases the C accumulators (in-place update) - confirm
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[42];  // 42 x 32-bit accumulators (d00..d41), read and written by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for operand A
+      uint64_t const& desc_b,  // 64-bit descriptor for operand B
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %44, 0;\n"  // %44 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n168k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41},"  // %0..%41 = d00..d41
+      " %42,"  // desc_a
+      " %43,"  // desc_b
+      " p,   %45, %46, %47, %48;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // accumulators are read-write (+r) operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41)
+      :  "l"(desc_a),  // input operands start here; numbering continues at %42
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x168x16 F16+=F16*F16 (RS variant: A comes from four 32-bit registers, B from a 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // must be GMMA::Major::K (static_assert below); not passed to the instruction
+  GMMA::Major tnspB,  // forwarded to the instruction as an immediate operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded as an immediate operand
+>
+struct MMA_64x168x16_F16F16F16_RS
+{
+  using DRegisters = void;  // NOTE(review): presumably D aliases the C accumulators (in-place update) - confirm
+  using ARegisters = uint32_t[4];  // four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[42];  // 42 x 32-bit accumulators (d00..d41), read and written by fma
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // operand A registers
+      uint64_t const& desc_b,  // 64-bit descriptor for operand B
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %47, 0;\n"  // %47 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n168k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41},"  // %0..%41 = d00..d41
+      "{%42, %43, %44, %45},"  // a00..a03
+      " %46,"  // desc_b
+      " p,   %48, %49, %50;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // accumulators are read-write (+r) operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands start here; numbering continues at %42
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x16 F16+=F16*F16 (SS variant: A and B are both supplied as 64-bit descriptors)
+template <
+  GMMA::Major tnspA,  // forwarded to the instruction as an immediate operand
+  GMMA::Major tnspB,  // forwarded to the instruction as an immediate operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded as an immediate operand
+>
+struct MMA_64x176x16_F16F16F16_SS
+{
+  using DRegisters = void;  // NOTE(review): presumably D aliases the C accumulators (in-place update) - confirm
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[44];  // 44 x 32-bit accumulators (d00..d43), read and written by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for operand A
+      uint64_t const& desc_b,  // 64-bit descriptor for operand B
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %46, 0;\n"  // %46 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n176k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"  // %0..%43 = d00..d43
+      " %44,"  // desc_a
+      " %45,"  // desc_b
+      " p,   %47, %48, %49, %50;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // accumulators are read-write (+r) operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
+      :  "l"(desc_a),  // input operands start here; numbering continues at %44
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x16 F16+=F16*F16 (RS variant: A comes from four 32-bit registers, B from a 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // must be GMMA::Major::K (static_assert below); not passed to the instruction
+  GMMA::Major tnspB,  // forwarded to the instruction as an immediate operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded as an immediate operand
+>
+struct MMA_64x176x16_F16F16F16_RS
+{
+  using DRegisters = void;  // NOTE(review): presumably D aliases the C accumulators (in-place update) - confirm
+  using ARegisters = uint32_t[4];  // four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[44];  // 44 x 32-bit accumulators (d00..d43), read and written by fma
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // operand A registers
+      uint64_t const& desc_b,  // 64-bit descriptor for operand B
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %49, 0;\n"  // %49 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n176k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"  // %0..%43 = d00..d43
+      "{%44, %45, %46, %47},"  // a00..a03
+      " %48,"  // desc_b
+      " p,   %50, %51, %52;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // accumulators are read-write (+r) operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands start here; numbering continues at %44
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x184x16 F16+=F16*F16 (SS variant: A and B are both supplied as 64-bit descriptors)
+template <
+  GMMA::Major tnspA,  // forwarded to the instruction as an immediate operand
+  GMMA::Major tnspB,  // forwarded to the instruction as an immediate operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded as an immediate operand
+>
+struct MMA_64x184x16_F16F16F16_SS
+{
+  using DRegisters = void;  // NOTE(review): presumably D aliases the C accumulators (in-place update) - confirm
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[46];  // 46 x 32-bit accumulators (d00..d45), read and written by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for operand A
+      uint64_t const& desc_b,  // 64-bit descriptor for operand B
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %48, 0;\n"  // %48 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n184k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45},"  // %0..%45 = d00..d45
+      " %46,"  // desc_a
+      " %47,"  // desc_b
+      " p,   %49, %50, %51, %52;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // accumulators are read-write (+r) operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45)
+      :  "l"(desc_a),  // input operands start here; numbering continues at %46
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x184x16 F16+=F16*F16 (RS variant: A comes from four 32-bit registers, B from a 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // must be GMMA::Major::K (static_assert below); not passed to the instruction
+  GMMA::Major tnspB,  // forwarded to the instruction as an immediate operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded as an immediate operand
+>
+struct MMA_64x184x16_F16F16F16_RS
+{
+  using DRegisters = void;  // NOTE(review): presumably D aliases the C accumulators (in-place update) - confirm
+  using ARegisters = uint32_t[4];  // four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[46];  // 46 x 32-bit accumulators (d00..d45), read and written by fma
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // operand A registers
+      uint64_t const& desc_b,  // 64-bit descriptor for operand B
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %51, 0;\n"  // %51 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n184k16.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45},"  // %0..%45 = d00..d45
+      "{%46, %47, %48, %49},"  // a00..a03
+      " %50,"  // desc_b
+      " p,   %52, %53, %54;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // accumulators are read-write (+r) operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands start here; numbering continues at %46
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x200x16 F16+=F16*F16 (SS variant: A and B are both supplied as 64-bit descriptors)
+template <
+  GMMA::Major tnspA,  // forwarded to the instruction as an immediate operand
+  GMMA::Major tnspB,  // forwarded to the instruction as an immediate operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded as an immediate operand
+>
+struct MMA_64x200x16_F16F16F16_SS
+{
+  using DRegisters = void;  // NOTE(review): presumably D aliases the C accumulators (in-place update) - confirm
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[50];  // 50 x 32-bit accumulators (d00..d49), read and written by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for operand A
+      uint64_t const& desc_b,  // 64-bit descriptor for operand B
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %52, 0;\n"  // %52 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n200k16.f16.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49},"  // %0..%49 = d00..d49
+      " %50,"  // desc_a
+      " %51,"  // desc_b
+      " p,    %53,  %54,  %55,  %56;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // accumulators are read-write (+r) operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49)
+      :  "l"(desc_a),  // input operands start here; numbering continues at %50
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x200x16 F16+=F16*F16 (RS variant: A comes from four 32-bit registers, B from a 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // must be GMMA::Major::K (static_assert below); not passed to the instruction
+  GMMA::Major tnspB,  // forwarded to the instruction as an immediate operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded as an immediate operand
+>
+struct MMA_64x200x16_F16F16F16_RS
+{
+  using DRegisters = void;  // NOTE(review): presumably D aliases the C accumulators (in-place update) - confirm
+  using ARegisters = uint32_t[4];  // four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[50];  // 50 x 32-bit accumulators (d00..d49), read and written by fma
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // operand A registers
+      uint64_t const& desc_b,  // 64-bit descriptor for operand B
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %55, 0;\n"  // %55 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n200k16.f16.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49},"  // %0..%49 = d00..d49
+      "{%50,  %51,  %52,  %53},"  // a00..a03
+      " %54,"  // desc_b
+      " p,    %56,  %57,  %58;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // accumulators are read-write (+r) operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands start here; numbering continues at %50
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x16 F16+=F16*F16 (SS variant: A and B are both passed to the instruction as 64-bit descriptors)
+template <
+  GMMA::Major tnspA,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::Major tnspB,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate, defaults to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time immediate, defaults to One
+>
+struct MMA_64x208x16_F16F16F16_SS
+{
+  using DRegisters = void;  // NOTE(review): presumably void means D shares the C registers -- confirm against the MMA traits
+  using ARegisters = uint64_t[1];  // one 64-bit value: desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = uint32_t[52];  // one entry per accumulator reference d00..d51 taken by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for operand A ("l" constraint)
+      uint64_t const& desc_b,  // 64-bit descriptor for operand B ("l" constraint)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: source line plus both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %54, 0;\n"  // %54 = scale_D: p is true iff scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n208k16.f16.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%51 = d00..d51
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"
+      " %52,"  // %52 = desc_a
+      " %53,"  // %53 = desc_b
+      " p,    %55,  %56,  %57,  %58;\n"  // p, then immediates %55 = scaleA, %56 = scaleB, %57 = tnspA, %58 = tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, only the invalid-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x16 F16+=F16*F16 (RS variant: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not passed to the asm
+  GMMA::Major tnspB,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate, defaults to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time immediate, defaults to One
+>
+struct MMA_64x208x16_F16F16F16_RS
+{
+  using DRegisters = void;  // NOTE(review): presumably void means D shares the C registers -- confirm against the MMA traits
+  using ARegisters = uint32_t[4];  // four 32-bit values: a00..a03
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = uint32_t[52];  // one entry per accumulator reference d00..d51 taken by fma()
+
+  static_assert(tnspA == GMMA::Major::K,  // instantiating with any other tnspA fails to compile
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // operand A, "r" constraints
+      uint64_t const& desc_b,  // 64-bit descriptor for operand B ("l" constraint)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: source line plus the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %57, 0;\n"  // %57 = scale_D: p is true iff scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n208k16.f16.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%51 = d00..d51
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"
+      "{%52,  %53,  %54,  %55},"  // %52..%55 = a00..a03
+      " %56,"  // %56 = desc_b
+      " p,    %58,  %59,  %60;\n"  // p, then immediates %58 = scaleA, %59 = scaleB, %60 = tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, only the invalid-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x216x16 F16+=F16*F16 (SS variant: A and B are both passed to the instruction as 64-bit descriptors)
+template <
+  GMMA::Major tnspA,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::Major tnspB,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate, defaults to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time immediate, defaults to One
+>
+struct MMA_64x216x16_F16F16F16_SS
+{
+  using DRegisters = void;  // NOTE(review): presumably void means D shares the C registers -- confirm against the MMA traits
+  using ARegisters = uint64_t[1];  // one 64-bit value: desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = uint32_t[54];  // one entry per accumulator reference d00..d53 taken by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for operand A ("l" constraint)
+      uint64_t const& desc_b,  // 64-bit descriptor for operand B ("l" constraint)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: source line plus both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %56, 0;\n"  // %56 = scale_D: p is true iff scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n216k16.f16.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%53 = d00..d53
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53},"
+      " %54,"  // %54 = desc_a
+      " %55,"  // %55 = desc_b
+      " p,    %57,  %58,  %59,  %60;\n"  // p, then immediates %57 = scaleA, %58 = scaleB, %59 = tnspA, %60 = tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, only the invalid-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x216x16 F16+=F16*F16 (RS variant: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not passed to the asm
+  GMMA::Major tnspB,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate, defaults to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time immediate, defaults to One
+>
+struct MMA_64x216x16_F16F16F16_RS
+{
+  using DRegisters = void;  // NOTE(review): presumably void means D shares the C registers -- confirm against the MMA traits
+  using ARegisters = uint32_t[4];  // four 32-bit values: a00..a03
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = uint32_t[54];  // one entry per accumulator reference d00..d53 taken by fma()
+
+  static_assert(tnspA == GMMA::Major::K,  // instantiating with any other tnspA fails to compile
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // operand A, "r" constraints
+      uint64_t const& desc_b,  // 64-bit descriptor for operand B ("l" constraint)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: source line plus the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %59, 0;\n"  // %59 = scale_D: p is true iff scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n216k16.f16.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%53 = d00..d53
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53},"
+      "{%54,  %55,  %56,  %57},"  // %54..%57 = a00..a03
+      " %58,"  // %58 = desc_b
+      " p,    %60,  %61,  %62;\n"  // p, then immediates %60 = scaleA, %61 = scaleB, %62 = tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, only the invalid-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x16 F16+=F16*F16 (SS variant: A and B are both passed to the instruction as 64-bit descriptors)
+template <
+  GMMA::Major tnspA,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::Major tnspB,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate, defaults to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time immediate, defaults to One
+>
+struct MMA_64x224x16_F16F16F16_SS
+{
+  using DRegisters = void;  // NOTE(review): presumably void means D shares the C registers -- confirm against the MMA traits
+  using ARegisters = uint64_t[1];  // one 64-bit value: desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = uint32_t[56];  // one entry per accumulator reference d00..d55 taken by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for operand A ("l" constraint)
+      uint64_t const& desc_b,  // 64-bit descriptor for operand B ("l" constraint)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: source line plus both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %58, 0;\n"  // %58 = scale_D: p is true iff scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n224k16.f16.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%55 = d00..d55
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      " %56,"  // %56 = desc_a
+      " %57,"  // %57 = desc_b
+      " p,    %59,  %60,  %61,  %62;\n"  // p, then immediates %59 = scaleA, %60 = scaleB, %61 = tnspA, %62 = tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, only the invalid-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x16 F16+=F16*F16 (RS variant: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not passed to the asm
+  GMMA::Major tnspB,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate, defaults to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time immediate, defaults to One
+>
+struct MMA_64x224x16_F16F16F16_RS
+{
+  using DRegisters = void;  // NOTE(review): presumably void means D shares the C registers -- confirm against the MMA traits
+  using ARegisters = uint32_t[4];  // four 32-bit values: a00..a03
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = uint32_t[56];  // one entry per accumulator reference d00..d55 taken by fma()
+
+  static_assert(tnspA == GMMA::Major::K,  // instantiating with any other tnspA fails to compile
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // operand A, "r" constraints
+      uint64_t const& desc_b,  // 64-bit descriptor for operand B ("l" constraint)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: source line plus the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %61, 0;\n"  // %61 = scale_D: p is true iff scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n224k16.f16.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%55 = d00..d55
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      "{%56,  %57,  %58,  %59},"  // %56..%59 = a00..a03
+      " %60,"  // %60 = desc_b
+      " p,    %62,  %63,  %64;\n"  // p, then immediates %62 = scaleA, %63 = scaleB, %64 = tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, only the invalid-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x232x16 F16+=F16*F16 (SS variant: A and B are both passed to the instruction as 64-bit descriptors)
+template <
+  GMMA::Major tnspA,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::Major tnspB,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate, defaults to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time immediate, defaults to One
+>
+struct MMA_64x232x16_F16F16F16_SS
+{
+  using DRegisters = void;  // NOTE(review): presumably void means D shares the C registers -- confirm against the MMA traits
+  using ARegisters = uint64_t[1];  // one 64-bit value: desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = uint32_t[58];  // one entry per accumulator reference d00..d57 taken by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for operand A ("l" constraint)
+      uint64_t const& desc_b,  // 64-bit descriptor for operand B ("l" constraint)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: source line plus both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %60, 0;\n"  // %60 = scale_D: p is true iff scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n232k16.f16.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%57 = d00..d57
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57},"
+      " %58,"  // %58 = desc_a
+      " %59,"  // %59 = desc_b
+      " p,    %61,  %62,  %63,  %64;\n"  // p, then immediates %61 = scaleA, %62 = scaleB, %63 = tnspA, %64 = tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, only the invalid-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x232x16 F16+=F16*F16 (RS variant: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not passed to the asm
+  GMMA::Major tnspB,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate, defaults to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time immediate, defaults to One
+>
+struct MMA_64x232x16_F16F16F16_RS
+{
+  using DRegisters = void;  // NOTE(review): presumably void means D shares the C registers -- confirm against the MMA traits
+  using ARegisters = uint32_t[4];  // four 32-bit values: a00..a03
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = uint32_t[58];  // one entry per accumulator reference d00..d57 taken by fma()
+
+  static_assert(tnspA == GMMA::Major::K,  // instantiating with any other tnspA fails to compile
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // operand A, "r" constraints
+      uint64_t const& desc_b,  // 64-bit descriptor for operand B ("l" constraint)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: source line plus the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %63, 0;\n"  // %63 = scale_D: p is true iff scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n232k16.f16.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%57 = d00..d57
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57},"
+      "{%58,  %59,  %60,  %61},"  // %58..%61 = a00..a03
+      " %62,"  // %62 = desc_b
+      " p,    %64,  %65,  %66;\n"  // p, then immediates %64 = scaleA, %65 = scaleB, %66 = tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, only the invalid-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x16 F16+=F16*F16 (SS variant: A and B are both passed to the instruction as 64-bit descriptors)
+template <
+  GMMA::Major tnspA,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::Major tnspB,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate, defaults to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time immediate, defaults to One
+>
+struct MMA_64x240x16_F16F16F16_SS
+{
+  using DRegisters = void;  // NOTE(review): presumably void means D shares the C registers -- confirm against the MMA traits
+  using ARegisters = uint64_t[1];  // one 64-bit value: desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = uint32_t[60];  // one entry per accumulator reference d00..d59 taken by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for operand A ("l" constraint)
+      uint64_t const& desc_b,  // 64-bit descriptor for operand B ("l" constraint)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: source line plus both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %62, 0;\n"  // %62 = scale_D: p is true iff scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n240k16.f16.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%59 = d00..d59
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"
+      " %60,"  // %60 = desc_a
+      " %61,"  // %61 = desc_b
+      " p,    %63,  %64,  %65,  %66;\n"  // p, then immediates %63 = scaleA, %64 = scaleB, %65 = tnspA, %66 = tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, only the invalid-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x16 F16+=F16*F16, RS form: A is passed in four 32-bit registers, B as a 64-bit desc_b operand.
+template <
+  GMMA::Major tnspA,  // restricted to GMMA::Major::K by the static_assert below; not forwarded to the asm
+  GMMA::Major tnspB,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the instruction as a compile-time immediate
+>
+struct MMA_64x240x16_F16F16F16_RS
+{
+  using DRegisters = void;  // no separate D operand: d00..d59 are read-write ("+r") in fma()
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[60];  // d00..d59
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // predicate p below is set iff int32_t(scale_D) != 0
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %65, 0;\n"  // %65 = scale_D
+      "wgmma.mma_async.sync.aligned.m64n240k16.f16.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%59 = d00..d59
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"
+      "{%60,  %61,  %62,  %63},"  // a00..a03
+      " %64,"  // desc_b
+      " p,    %66,  %67,  %68;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %60..%63
+         "l"(desc_b),  // %64
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %65..%68
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x248x16 F16+=F16*F16, SS form: A and B are both passed as 64-bit operands (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,  // forwarded to the instruction as a compile-time immediate
+  GMMA::Major tnspB,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the instruction as a compile-time immediate
+>
+struct MMA_64x248x16_F16F16F16_SS
+{
+  using DRegisters = void;  // no separate D operand: d00..d61 are read-write ("+r") in fma()
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[62];  // d00..d61
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // predicate p below is set iff int32_t(scale_D) != 0
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %64, 0;\n"  // %64 = scale_D
+      "wgmma.mma_async.sync.aligned.m64n248k16.f16.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%61 = d00..d61
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61},"
+      " %62,"  // desc_a
+      " %63,"  // desc_b
+      " p,    %65,  %66,  %67,  %68;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61)
+      :  "l"(desc_a),  // %62
+         "l"(desc_b),  // %63
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %64..%68
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x248x16 F16+=F16*F16, RS form: A is passed in four 32-bit registers, B as a 64-bit desc_b operand.
+template <
+  GMMA::Major tnspA,  // restricted to GMMA::Major::K by the static_assert below; not forwarded to the asm
+  GMMA::Major tnspB,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the instruction as a compile-time immediate
+>
+struct MMA_64x248x16_F16F16F16_RS
+{
+  using DRegisters = void;  // no separate D operand: d00..d61 are read-write ("+r") in fma()
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[62];  // d00..d61
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // predicate p below is set iff int32_t(scale_D) != 0
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %67, 0;\n"  // %67 = scale_D
+      "wgmma.mma_async.sync.aligned.m64n248k16.f16.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%61 = d00..d61
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61},"
+      "{%62,  %63,  %64,  %65},"  // a00..a03
+      " %66,"  // desc_b
+      " p,    %68,  %69,  %70;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %62..%65
+         "l"(desc_b),  // %66
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %67..%70
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x16 F32+=F16*F16, SS form: A and B are both passed as 64-bit operands (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,  // forwarded to the instruction as a compile-time immediate
+  GMMA::Major tnspB,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the instruction as a compile-time immediate
+>
+struct MMA_64x24x16_F32F16F16_SS
+{
+  using DRegisters = void;  // no separate D operand: d00..d11 are read-write ("+f") in fma()
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[12];  // d00..d11
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // predicate p below is set iff int32_t(scale_D) != 0
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %14, 0;\n"  // %14 = scale_D
+      "wgmma.mma_async.sync.aligned.m64n24k16.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%11 = d00..d11
+      " %8,  %9,  %10, %11},"
+      " %12,"  // desc_a
+      " %13,"  // desc_b
+      " p,   %15, %16, %17, %18;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
+      :  "l"(desc_a),  // %12
+         "l"(desc_b),  // %13
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %14..%18
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x16 F32+=F16*F16, RS form: A is passed in four 32-bit registers, B as a 64-bit desc_b operand.
+template <
+  GMMA::Major tnspA,  // restricted to GMMA::Major::K by the static_assert below; not forwarded to the asm
+  GMMA::Major tnspB,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the instruction as a compile-time immediate
+>
+struct MMA_64x24x16_F32F16F16_RS
+{
+  using DRegisters = void;  // no separate D operand: d00..d11 are read-write ("+f") in fma()
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[12];  // d00..d11
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // predicate p below is set iff int32_t(scale_D) != 0
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %17, 0;\n"  // %17 = scale_D
+      "wgmma.mma_async.sync.aligned.m64n24k16.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%11 = d00..d11
+      " %8,  %9,  %10, %11},"
+      "{%12, %13, %14, %15},"  // a00..a03
+      " %16,"  // desc_b
+      " p,   %18, %19, %20;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %12..%15
+         "l"(desc_b),  // %16
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %17..%20
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x40x16 F32+=F16*F16, SS form: A and B are both passed as 64-bit operands (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,  // forwarded to the instruction as a compile-time immediate
+  GMMA::Major tnspB,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the instruction as a compile-time immediate
+>
+struct MMA_64x40x16_F32F16F16_SS
+{
+  using DRegisters = void;  // no separate D operand: d00..d19 are read-write ("+f") in fma()
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[20];  // d00..d19
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // predicate p below is set iff int32_t(scale_D) != 0
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %22, 0;\n"  // %22 = scale_D
+      "wgmma.mma_async.sync.aligned.m64n40k16.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%19 = d00..d19
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"
+      " %20,"  // desc_a
+      " %21,"  // desc_b
+      " p,   %23, %24, %25, %26;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
+      :  "l"(desc_a),  // %20
+         "l"(desc_b),  // %21
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %22..%26
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x40x16 F32+=F16*F16, RS form: A is passed in four 32-bit registers, B as a 64-bit desc_b operand.
+template <
+  GMMA::Major tnspA,  // restricted to GMMA::Major::K by the static_assert below; not forwarded to the asm
+  GMMA::Major tnspB,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the instruction as a compile-time immediate
+>
+struct MMA_64x40x16_F32F16F16_RS
+{
+  using DRegisters = void;  // no separate D operand: d00..d19 are read-write ("+f") in fma()
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[20];  // d00..d19
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // predicate p below is set iff int32_t(scale_D) != 0
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %25, 0;\n"  // %25 = scale_D
+      "wgmma.mma_async.sync.aligned.m64n40k16.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%19 = d00..d19
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"
+      "{%20, %21, %22, %23},"  // a00..a03
+      " %24,"  // desc_b
+      " p,   %26, %27, %28;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %20..%23
+         "l"(desc_b),  // %24
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %25..%28
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x16 F32+=F16*F16, SS form: A and B are both passed as 64-bit operands (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,  // forwarded to the instruction as a compile-time immediate
+  GMMA::Major tnspB,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the instruction as a compile-time immediate
+>
+struct MMA_64x48x16_F32F16F16_SS
+{
+  using DRegisters = void;  // no separate D operand: d00..d23 are read-write ("+f") in fma()
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[24];  // d00..d23
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // predicate p below is set iff int32_t(scale_D) != 0
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %26, 0;\n"  // %26 = scale_D
+      "wgmma.mma_async.sync.aligned.m64n48k16.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%23 = d00..d23
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      " %24,"  // desc_a
+      " %25,"  // desc_b
+      " p,   %27, %28, %29, %30;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
+      :  "l"(desc_a),  // %24
+         "l"(desc_b),  // %25
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %26..%30
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x16 F32+=F16*F16, RS form: A is passed in four 32-bit registers, B as a 64-bit desc_b operand.
+template <
+  GMMA::Major tnspA,  // restricted to GMMA::Major::K by the static_assert below; not forwarded to the asm
+  GMMA::Major tnspB,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the instruction as a compile-time immediate
+>
+struct MMA_64x48x16_F32F16F16_RS
+{
+  using DRegisters = void;  // no separate D operand: d00..d23 are read-write ("+f") in fma()
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[24];  // d00..d23
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // predicate p below is set iff int32_t(scale_D) != 0
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %29, 0;\n"  // %29 = scale_D
+      "wgmma.mma_async.sync.aligned.m64n48k16.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%23 = d00..d23
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      "{%24, %25, %26, %27},"  // a00..a03
+      " %28,"  // desc_b
+      " p,   %30, %31, %32;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %24..%27
+         "l"(desc_b),  // %28
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %29..%32
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x56x16 F32+=F16*F16, SS form: A and B are both passed as 64-bit operands (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,  // forwarded to the instruction as a compile-time immediate
+  GMMA::Major tnspB,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the instruction as a compile-time immediate
+>
+struct MMA_64x56x16_F32F16F16_SS
+{
+  using DRegisters = void;  // no separate D operand: d00..d27 are read-write ("+f") in fma()
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[28];  // d00..d27
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // predicate p below is set iff int32_t(scale_D) != 0
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %30, 0;\n"  // %30 = scale_D
+      "wgmma.mma_async.sync.aligned.m64n56k16.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%27 = d00..d27
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"
+      " %28,"  // desc_a
+      " %29,"  // desc_b
+      " p,   %31, %32, %33, %34;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
+      :  "l"(desc_a),  // %28
+         "l"(desc_b),  // %29
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %30..%34
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x56x16 F32+=F16*F16, RS form: A is passed in four 32-bit registers, B as a 64-bit desc_b operand.
+template <
+  GMMA::Major tnspA,  // restricted to GMMA::Major::K by the static_assert below; not forwarded to the asm
+  GMMA::Major tnspB,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the instruction as a compile-time immediate
+>
+struct MMA_64x56x16_F32F16F16_RS
+{
+  using DRegisters = void;  // no separate D operand: d00..d27 are read-write ("+f") in fma()
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[28];  // d00..d27
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // predicate p below is set iff int32_t(scale_D) != 0
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %33, 0;\n"  // %33 = scale_D
+      "wgmma.mma_async.sync.aligned.m64n56k16.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%27 = d00..d27
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"
+      "{%28, %29, %30, %31},"  // a00..a03
+      " %32,"  // desc_b
+      " p,   %34, %35, %36;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %28..%31
+         "l"(desc_b),  // %32
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %33..%36
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x72x16 F32+=F16*F16 -- SS form: A and B are both passed to fma() as 64-bit values (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,  // forwarded to the instruction as a compile-time immediate
+  GMMA::Major tnspB,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand; defaults to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // immediate operand; defaults to One
+>
+struct MMA_64x72x16_F32F16F16_SS
+{
+  using DRegisters = void;         // void here; fma() updates the CRegisters values in place
+  using ARegisters = uint64_t[1];  // matches fma()'s desc_a
+  using BRegisters = uint64_t[1];  // matches fma()'s desc_b
+  using CRegisters = float[36];    // matches fma()'s d00..d35
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX below is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes call-site line and both operand values to the synclog emitter
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %38, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n72k16.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%35: the 36 accumulators d00..d35
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"
+      " %36,"  // desc_a
+      " %37,"  // desc_b
+      " p,   %39, %40, %41, %42;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands %0-%35, read-write ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
+      :  "l"(desc_a),  // %36
+         "l"(desc_b),  // %37
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %38 in a register; %39-%42 are immediates ("n")
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the macro above is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x72x16 F32+=F16*F16 -- RS form: A is passed as four 32-bit registers (a00..a03), B as a 64-bit value (desc_b).
+template <
+  GMMA::Major tnspA,  // must be GMMA::Major::K (static_assert below); not passed to the instruction
+  GMMA::Major tnspB,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand; defaults to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // immediate operand; defaults to One
+>
+struct MMA_64x72x16_F32F16F16_RS
+{
+  using DRegisters = void;         // void here; fma() updates the CRegisters values in place
+  using ARegisters = uint32_t[4];  // matches fma()'s a00..a03
+  using BRegisters = uint64_t[1];  // matches fma()'s desc_b
+  using CRegisters = float[36];    // matches fma()'s d00..d35
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX below is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // passes call-site line and desc_b to the synclog emitter
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %41, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n72k16.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%35: the 36 accumulators d00..d35
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"
+      "{%36, %37, %38, %39},"  // a00..a03
+      " %40,"  // desc_b
+      " p,   %42, %43, %44;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands %0-%35, read-write ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %36-%39
+         "l"(desc_b),  // %40
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %41 in a register; %42-%44 are immediates ("n")
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the macro above is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x16 F32+=F16*F16 -- SS form: A and B are both passed to fma() as 64-bit values (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,  // forwarded to the instruction as a compile-time immediate
+  GMMA::Major tnspB,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand; defaults to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // immediate operand; defaults to One
+>
+struct MMA_64x80x16_F32F16F16_SS
+{
+  using DRegisters = void;         // void here; fma() updates the CRegisters values in place
+  using ARegisters = uint64_t[1];  // matches fma()'s desc_a
+  using BRegisters = uint64_t[1];  // matches fma()'s desc_b
+  using CRegisters = float[40];    // matches fma()'s d00..d39
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX below is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes call-site line and both operand values to the synclog emitter
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %42, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n80k16.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%39: the 40 accumulators d00..d39
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      " %40,"  // desc_a
+      " %41,"  // desc_b
+      " p,   %43, %44, %45, %46;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands %0-%39, read-write ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
+      :  "l"(desc_a),  // %40
+         "l"(desc_b),  // %41
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %42 in a register; %43-%46 are immediates ("n")
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the macro above is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x16 F32+=F16*F16 -- RS form: A is passed as four 32-bit registers (a00..a03), B as a 64-bit value (desc_b).
+template <
+  GMMA::Major tnspA,  // must be GMMA::Major::K (static_assert below); not passed to the instruction
+  GMMA::Major tnspB,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand; defaults to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // immediate operand; defaults to One
+>
+struct MMA_64x80x16_F32F16F16_RS
+{
+  using DRegisters = void;         // void here; fma() updates the CRegisters values in place
+  using ARegisters = uint32_t[4];  // matches fma()'s a00..a03
+  using BRegisters = uint64_t[1];  // matches fma()'s desc_b
+  using CRegisters = float[40];    // matches fma()'s d00..d39
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX below is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // passes call-site line and desc_b to the synclog emitter
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %45, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n80k16.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%39: the 40 accumulators d00..d39
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      "{%40, %41, %42, %43},"  // a00..a03
+      " %44,"  // desc_b
+      " p,   %46, %47, %48;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands %0-%39, read-write ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %40-%43
+         "l"(desc_b),  // %44
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %45 in a register; %46-%48 are immediates ("n")
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the macro above is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x88x16 F32+=F16*F16 -- SS form: A and B are both passed to fma() as 64-bit values (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,  // forwarded to the instruction as a compile-time immediate
+  GMMA::Major tnspB,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand; defaults to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // immediate operand; defaults to One
+>
+struct MMA_64x88x16_F32F16F16_SS
+{
+  using DRegisters = void;         // void here; fma() updates the CRegisters values in place
+  using ARegisters = uint64_t[1];  // matches fma()'s desc_a
+  using BRegisters = uint64_t[1];  // matches fma()'s desc_b
+  using CRegisters = float[44];    // matches fma()'s d00..d43
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX below is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes call-site line and both operand values to the synclog emitter
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %46, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n88k16.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%43: the 44 accumulators d00..d43
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"
+      " %44,"  // desc_a
+      " %45,"  // desc_b
+      " p,   %47, %48, %49, %50;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands %0-%43, read-write ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
+      :  "l"(desc_a),  // %44
+         "l"(desc_b),  // %45
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %46 in a register; %47-%50 are immediates ("n")
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the macro above is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x88x16 F32+=F16*F16 -- RS form: A is passed as four 32-bit registers (a00..a03), B as a 64-bit value (desc_b).
+template <
+  GMMA::Major tnspA,  // must be GMMA::Major::K (static_assert below); not passed to the instruction
+  GMMA::Major tnspB,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand; defaults to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // immediate operand; defaults to One
+>
+struct MMA_64x88x16_F32F16F16_RS
+{
+  using DRegisters = void;         // void here; fma() updates the CRegisters values in place
+  using ARegisters = uint32_t[4];  // matches fma()'s a00..a03
+  using BRegisters = uint64_t[1];  // matches fma()'s desc_b
+  using CRegisters = float[44];    // matches fma()'s d00..d43
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX below is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // passes call-site line and desc_b to the synclog emitter
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %49, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n88k16.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%43: the 44 accumulators d00..d43
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"
+      "{%44, %45, %46, %47},"  // a00..a03
+      " %48,"  // desc_b
+      " p,   %50, %51, %52;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands %0-%43, read-write ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %44-%47
+         "l"(desc_b),  // %48
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %49 in a register; %50-%52 are immediates ("n")
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the macro above is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x104x16 F32+=F16*F16 -- SS form: A and B are both passed to fma() as 64-bit values (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,  // forwarded to the instruction as a compile-time immediate
+  GMMA::Major tnspB,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand; defaults to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // immediate operand; defaults to One
+>
+struct MMA_64x104x16_F32F16F16_SS
+{
+  using DRegisters = void;         // void here; fma() updates the CRegisters values in place
+  using ARegisters = uint64_t[1];  // matches fma()'s desc_a
+  using BRegisters = uint64_t[1];  // matches fma()'s desc_b
+  using CRegisters = float[52];    // matches fma()'s d00..d51
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX below is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes call-site line and both operand values to the synclog emitter
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %54, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n104k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%51: the 52 accumulators d00..d51
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"
+      " %52,"  // desc_a
+      " %53,"  // desc_b
+      " p,    %55,  %56,  %57,  %58;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands %0-%51, read-write ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
+      :  "l"(desc_a),  // %52
+         "l"(desc_b),  // %53
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %54 in a register; %55-%58 are immediates ("n")
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the macro above is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x104x16 F32+=F16*F16 -- RS form: A is passed as four 32-bit registers (a00..a03), B as a 64-bit value (desc_b).
+template <
+  GMMA::Major tnspA,  // must be GMMA::Major::K (static_assert below); not passed to the instruction
+  GMMA::Major tnspB,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand; defaults to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // immediate operand; defaults to One
+>
+struct MMA_64x104x16_F32F16F16_RS
+{
+  using DRegisters = void;         // void here; fma() updates the CRegisters values in place
+  using ARegisters = uint32_t[4];  // matches fma()'s a00..a03
+  using BRegisters = uint64_t[1];  // matches fma()'s desc_b
+  using CRegisters = float[52];    // matches fma()'s d00..d51
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX below is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // passes call-site line and desc_b to the synclog emitter
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %57, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n104k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%51: the 52 accumulators d00..d51
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"
+      "{%52,  %53,  %54,  %55},"  // a00..a03
+      " %56,"  // desc_b
+      " p,    %58,  %59,  %60;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands %0-%51, read-write ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %52-%55
+         "l"(desc_b),  // %56
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %57 in a register; %58-%60 are immediates ("n")
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the macro above is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x16 F32+=F16*F16 -- SS form: A and B are both passed to fma() as 64-bit values (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,  // forwarded to the instruction as a compile-time immediate
+  GMMA::Major tnspB,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand; defaults to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // immediate operand; defaults to One
+>
+struct MMA_64x112x16_F32F16F16_SS
+{
+  using DRegisters = void;         // void here; fma() updates the CRegisters values in place
+  using ARegisters = uint64_t[1];  // matches fma()'s desc_a
+  using BRegisters = uint64_t[1];  // matches fma()'s desc_b
+  using CRegisters = float[56];    // matches fma()'s d00..d55
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX below is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes call-site line and both operand values to the synclog emitter
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %58, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n112k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%55: the 56 accumulators d00..d55
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      " %56,"  // desc_a
+      " %57,"  // desc_b
+      " p,    %59,  %60,  %61,  %62;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands %0-%55, read-write ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
+      :  "l"(desc_a),  // %56
+         "l"(desc_b),  // %57
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %58 in a register; %59-%62 are immediates ("n")
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the macro above is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x16 F32+=F16*F16 -- RS form: A is passed as four 32-bit registers (a00..a03), B as a 64-bit value (desc_b).
+template <
+  GMMA::Major tnspA,  // must be GMMA::Major::K (static_assert below); not passed to the instruction
+  GMMA::Major tnspB,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand; defaults to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // immediate operand; defaults to One
+>
+struct MMA_64x112x16_F32F16F16_RS
+{
+  using DRegisters = void;         // void here; fma() updates the CRegisters values in place
+  using ARegisters = uint32_t[4];  // matches fma()'s a00..a03
+  using BRegisters = uint64_t[1];  // matches fma()'s desc_b
+  using CRegisters = float[56];    // matches fma()'s d00..d55
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX below is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // passes call-site line and desc_b to the synclog emitter
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %61, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n112k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%55: the 56 accumulators d00..d55
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      "{%56,  %57,  %58,  %59},"  // a00..a03
+      " %60,"  // desc_b
+      " p,    %62,  %63,  %64;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands %0-%55, read-write ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %56-%59
+         "l"(desc_b),  // %60
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %61 in a register; %62-%64 are immediates ("n")
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the macro above is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x120x16 F32+=F16*F16 (SS): issues PTX wgmma.mma_async m64n120k16; A and B are both passed as 64-bit descriptors
+template <
+  GMMA::Major tnspA,  // layout selector for A; forwarded to the instruction as an immediate
+  GMMA::Major tnspB,  // layout selector for B; forwarded to the instruction as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // input scale for A; forwarded as an immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // input scale for B; forwarded as an immediate
+>
+struct MMA_64x120x16_F32F16F16_SS
+{
+  using DRegisters = void;  // no separate D operand: fma() updates its accumulator arguments in place
+  using ARegisters = uint64_t[1];  // A: one 64-bit descriptor
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor
+  using CRegisters = float[60];  // 60 float accumulators, one per d00..d59 argument of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // descriptor for operand A
+      uint64_t const& desc_b,  // descriptor for operand B
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // logging hook defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %62, 0;\n"  // p = (scale_D != 0); %62 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n120k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%59 = accumulators d00..d59
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"
+      " %60,"  // desc_a
+      " %61,"  // desc_b
+      " p,    %63,  %64,  %65,  %66;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // scale_D in a register; the rest are immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x120x16 F32+=F16*F16 (RS): issues PTX wgmma.mma_async m64n120k16; A in four 32-bit registers, B as a 64-bit descriptor
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not forwarded to the instruction
+  GMMA::Major tnspB,  // layout selector for B; forwarded to the instruction as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // input scale for A; forwarded as an immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // input scale for B; forwarded as an immediate
+>
+struct MMA_64x120x16_F32F16F16_RS
+{
+  using DRegisters = void;  // no separate D operand: fma() updates its accumulator arguments in place
+  using ARegisters = uint32_t[4];  // A: four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor
+  using CRegisters = float[60];  // 60 float accumulators, one per d00..d59 argument of fma()
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // register operand A
+      uint64_t const& desc_b,  // descriptor for operand B
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // logging hook defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %65, 0;\n"  // p = (scale_D != 0); %65 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n120k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%59 = accumulators d00..d59
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"
+      "{%60,  %61,  %62,  %63},"  // a00..a03
+      " %64,"  // desc_b
+      " p,    %66,  %67,  %68;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // scale_D in a register; the rest are immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x136x16 F32+=F16*F16 (SS): issues PTX wgmma.mma_async m64n136k16; A and B are both passed as 64-bit descriptors
+template <
+  GMMA::Major tnspA,  // layout selector for A; forwarded to the instruction as an immediate
+  GMMA::Major tnspB,  // layout selector for B; forwarded to the instruction as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // input scale for A; forwarded as an immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // input scale for B; forwarded as an immediate
+>
+struct MMA_64x136x16_F32F16F16_SS
+{
+  using DRegisters = void;  // no separate D operand: fma() updates its accumulator arguments in place
+  using ARegisters = uint64_t[1];  // A: one 64-bit descriptor
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor
+  using CRegisters = float[68];  // 68 float accumulators, one per d00..d67 argument of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // descriptor for operand A
+      uint64_t const& desc_b,  // descriptor for operand B
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // logging hook defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %70, 0;\n"  // p = (scale_D != 0); %70 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n136k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%67 = accumulators d00..d67
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67},"
+      " %68,"  // desc_a
+      " %69,"  // desc_b
+      " p,    %71,  %72,  %73,  %74;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // scale_D in a register; the rest are immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x136x16 F32+=F16*F16 (RS): issues PTX wgmma.mma_async m64n136k16; A in four 32-bit registers, B as a 64-bit descriptor
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not forwarded to the instruction
+  GMMA::Major tnspB,  // layout selector for B; forwarded to the instruction as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // input scale for A; forwarded as an immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // input scale for B; forwarded as an immediate
+>
+struct MMA_64x136x16_F32F16F16_RS
+{
+  using DRegisters = void;  // no separate D operand: fma() updates its accumulator arguments in place
+  using ARegisters = uint32_t[4];  // A: four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor
+  using CRegisters = float[68];  // 68 float accumulators, one per d00..d67 argument of fma()
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // register operand A
+      uint64_t const& desc_b,  // descriptor for operand B
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // logging hook defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %73, 0;\n"  // p = (scale_D != 0); %73 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n136k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%67 = accumulators d00..d67
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67},"
+      "{%68,  %69,  %70,  %71},"  // a00..a03
+      " %72,"  // desc_b
+      " p,    %74,  %75,  %76;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // scale_D in a register; the rest are immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x16 F32+=F16*F16 (SS): issues PTX wgmma.mma_async m64n144k16; A and B are both passed as 64-bit descriptors
+template <
+  GMMA::Major tnspA,  // layout selector for A; forwarded to the instruction as an immediate
+  GMMA::Major tnspB,  // layout selector for B; forwarded to the instruction as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // input scale for A; forwarded as an immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // input scale for B; forwarded as an immediate
+>
+struct MMA_64x144x16_F32F16F16_SS
+{
+  using DRegisters = void;  // no separate D operand: fma() updates its accumulator arguments in place
+  using ARegisters = uint64_t[1];  // A: one 64-bit descriptor
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor
+  using CRegisters = float[72];  // 72 float accumulators, one per d00..d71 argument of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // descriptor for operand A
+      uint64_t const& desc_b,  // descriptor for operand B
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // logging hook defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %74, 0;\n"  // p = (scale_D != 0); %74 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n144k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%71 = accumulators d00..d71
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      " %72,"  // desc_a
+      " %73,"  // desc_b
+      " p,    %75,  %76,  %77,  %78;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // scale_D in a register; the rest are immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x16 F32+=F16*F16 (RS): issues PTX wgmma.mma_async m64n144k16; A in four 32-bit registers, B as a 64-bit descriptor
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not forwarded to the instruction
+  GMMA::Major tnspB,  // layout selector for B; forwarded to the instruction as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // input scale for A; forwarded as an immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // input scale for B; forwarded as an immediate
+>
+struct MMA_64x144x16_F32F16F16_RS
+{
+  using DRegisters = void;  // no separate D operand: fma() updates its accumulator arguments in place
+  using ARegisters = uint32_t[4];  // A: four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor
+  using CRegisters = float[72];  // 72 float accumulators, one per d00..d71 argument of fma()
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // register operand A
+      uint64_t const& desc_b,  // descriptor for operand B
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // logging hook defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %77, 0;\n"  // p = (scale_D != 0); %77 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n144k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%71 = accumulators d00..d71
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      "{%72,  %73,  %74,  %75},"  // a00..a03
+      " %76,"  // desc_b
+      " p,    %78,  %79,  %80;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // scale_D in a register; the rest are immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x152x16 F32+=F16*F16 (SS): issues PTX wgmma.mma_async m64n152k16; A and B are both passed as 64-bit descriptors
+template <
+  GMMA::Major tnspA,  // layout selector for A; forwarded to the instruction as an immediate
+  GMMA::Major tnspB,  // layout selector for B; forwarded to the instruction as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // input scale for A; forwarded as an immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // input scale for B; forwarded as an immediate
+>
+struct MMA_64x152x16_F32F16F16_SS
+{
+  using DRegisters = void;  // no separate D operand: fma() updates its accumulator arguments in place
+  using ARegisters = uint64_t[1];  // A: one 64-bit descriptor
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor
+  using CRegisters = float[76];  // 76 float accumulators, one per d00..d75 argument of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // descriptor for operand A
+      uint64_t const& desc_b,  // descriptor for operand B
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // logging hook defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %78, 0;\n"  // p = (scale_D != 0); %78 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n152k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%75 = accumulators d00..d75
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75},"
+      " %76,"  // desc_a
+      " %77,"  // desc_b
+      " p,    %79,  %80,  %81,  %82;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // scale_D in a register; the rest are immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x152x16 F32+=F16*F16 -- one wgmma.mma_async m64n152k16 issue; A comes from four 32-bit registers, B from a 64-bit descriptor.
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not an instruction operand in this variant
+  GMMA::Major tnspB,  // passed to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // immediate ("n") operand
+>
+struct MMA_64x152x16_F32F16F16_RS
+{
+  using DRegisters = void;  // NOTE(review): presumably signals that D aliases the C accumulators -- confirm
+  using ARegisters = uint32_t[4];  // a00..a03 in fma()
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = float[76];  // d00..d75 in fma()
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void  // issues the instruction; d00..d75 are read-write ("+f") operands
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably a debug/trace hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %81, 0;\n"  // p = (scale_D != 0); %81 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n152k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75},"  // %0..%75 = d00..d75
+      "{%76,  %77,  %78,  %79},"  // a00..a03
+      " %80,"  // desc_b
+      " p,    %82,  %83,  %84;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs begin at operand %76
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x16 F32+=F16*F16 -- one wgmma.mma_async m64n160k16 issue; A and B are each passed as a 64-bit descriptor.
+template <
+  GMMA::Major tnspA,  // passed to the instruction as an immediate ("n") operand
+  GMMA::Major tnspB,  // passed to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // immediate ("n") operand
+>
+struct MMA_64x160x16_F32F16F16_SS
+{
+  using DRegisters = void;  // NOTE(review): presumably signals that D aliases the C accumulators -- confirm
+  using ARegisters = uint64_t[1];  // desc_a in fma()
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = float[80];  // d00..d79 in fma()
+
+  CUTE_HOST_DEVICE static void  // issues the instruction; d00..d79 are read-write ("+f") operands
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): presumably a debug/trace hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %82, 0;\n"  // p = (scale_D != 0); %82 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n160k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"  // %0..%79 = d00..d79
+      " %80,"  // desc_a
+      " %81,"  // desc_b
+      " p,    %83,  %84,  %85,  %86;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
+      :  "l"(desc_a),  // inputs begin at operand %80
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x16 F32+=F16*F16 -- one wgmma.mma_async m64n160k16 issue; A comes from four 32-bit registers, B from a 64-bit descriptor.
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not an instruction operand in this variant
+  GMMA::Major tnspB,  // passed to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // immediate ("n") operand
+>
+struct MMA_64x160x16_F32F16F16_RS
+{
+  using DRegisters = void;  // NOTE(review): presumably signals that D aliases the C accumulators -- confirm
+  using ARegisters = uint32_t[4];  // a00..a03 in fma()
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = float[80];  // d00..d79 in fma()
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void  // issues the instruction; d00..d79 are read-write ("+f") operands
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably a debug/trace hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %85, 0;\n"  // p = (scale_D != 0); %85 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n160k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"  // %0..%79 = d00..d79
+      "{%80,  %81,  %82,  %83},"  // a00..a03
+      " %84,"  // desc_b
+      " p,    %86,  %87,  %88;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs begin at operand %80
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x168x16 F32+=F16*F16 -- one wgmma.mma_async m64n168k16 issue; A and B are each passed as a 64-bit descriptor.
+template <
+  GMMA::Major tnspA,  // passed to the instruction as an immediate ("n") operand
+  GMMA::Major tnspB,  // passed to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // immediate ("n") operand
+>
+struct MMA_64x168x16_F32F16F16_SS
+{
+  using DRegisters = void;  // NOTE(review): presumably signals that D aliases the C accumulators -- confirm
+  using ARegisters = uint64_t[1];  // desc_a in fma()
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = float[84];  // d00..d83 in fma()
+
+  CUTE_HOST_DEVICE static void  // issues the instruction; d00..d83 are read-write ("+f") operands
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): presumably a debug/trace hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %86, 0;\n"  // p = (scale_D != 0); %86 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n168k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83},"  // %0..%83 = d00..d83
+      " %84,"  // desc_a
+      " %85,"  // desc_b
+      " p,    %87,  %88,  %89,  %90;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
+      :  "l"(desc_a),  // inputs begin at operand %84
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x168x16 F32+=F16*F16 -- one wgmma.mma_async m64n168k16 issue; A comes from four 32-bit registers, B from a 64-bit descriptor.
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not an instruction operand in this variant
+  GMMA::Major tnspB,  // passed to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // immediate ("n") operand
+>
+struct MMA_64x168x16_F32F16F16_RS
+{
+  using DRegisters = void;  // NOTE(review): presumably signals that D aliases the C accumulators -- confirm
+  using ARegisters = uint32_t[4];  // a00..a03 in fma()
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = float[84];  // d00..d83 in fma()
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void  // issues the instruction; d00..d83 are read-write ("+f") operands
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably a debug/trace hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %89, 0;\n"  // p = (scale_D != 0); %89 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n168k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83},"  // %0..%83 = d00..d83
+      "{%84,  %85,  %86,  %87},"  // a00..a03
+      " %88,"  // desc_b
+      " p,    %90,  %91,  %92;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs begin at operand %84
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x16 F32+=F16*F16 -- one wgmma.mma_async m64n176k16 issue; A and B are each passed as a 64-bit descriptor.
+template <
+  GMMA::Major tnspA,  // passed to the instruction as an immediate ("n") operand
+  GMMA::Major tnspB,  // passed to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // immediate ("n") operand
+>
+struct MMA_64x176x16_F32F16F16_SS
+{
+  using DRegisters = void;  // NOTE(review): presumably signals that D aliases the C accumulators -- confirm
+  using ARegisters = uint64_t[1];  // desc_a in fma()
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = float[88];  // d00..d87 in fma()
+
+  CUTE_HOST_DEVICE static void  // issues the instruction; d00..d87 are read-write ("+f") operands
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): presumably a debug/trace hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %90, 0;\n"  // p = (scale_D != 0); %90 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n176k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"  // %0..%87 = d00..d87
+      " %88,"  // desc_a
+      " %89,"  // desc_b
+      " p,    %91,  %92,  %93,  %94;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
+      :  "l"(desc_a),  // inputs begin at operand %88
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x16 F32+=F16*F16 -- one wgmma.mma_async m64n176k16 issue; A comes from four 32-bit registers, B from a 64-bit descriptor.
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not an instruction operand in this variant
+  GMMA::Major tnspB,  // passed to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // immediate ("n") operand
+>
+struct MMA_64x176x16_F32F16F16_RS
+{
+  using DRegisters = void;  // NOTE(review): presumably signals that D aliases the C accumulators -- confirm
+  using ARegisters = uint32_t[4];  // a00..a03 in fma()
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = float[88];  // d00..d87 in fma()
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void  // issues the instruction; d00..d87 are read-write ("+f") operands
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably a debug/trace hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %93, 0;\n"  // p = (scale_D != 0); %93 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n176k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"  // %0..%87 = d00..d87
+      "{%88,  %89,  %90,  %91},"  // a00..a03
+      " %92,"  // desc_b
+      " p,    %94,  %95,  %96;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs begin at operand %88
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x184x16 F32+=F16*F16 -- SS form: fma() takes A and B each as one 64-bit descriptor (desc_a, desc_b) and updates 92 float accumulators in place.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x184x16_F32F16F16_SS
+{
+  using DRegisters = void;  // no separate D operand: the d* accumulators are read-write ("+f") in fma()
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit value (desc_a)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit value (desc_b)
+  using CRegisters = float[92];  // 92 float accumulators, d00..d91 in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; cast to int32_t and compared against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %94, 0;\n"  // %94 = int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n184k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91},"  // %0..%91 = d00..d91
+      " %92,"  // %92 = desc_a
+      " %93,"  // %93 = desc_b
+      " p,    %95,  %96,  %97,  %98;\n"  // %95..%98 = scaleA, scaleB, tnspA, tnspB, all "n" compile-time immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x184x16 F32+=F16*F16 -- RS form: fma() takes A as four 32-bit registers (a00..a03), B as one 64-bit descriptor (desc_b), and updates 92 float accumulators in place.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x184x16_F32F16F16_RS
+{
+  using DRegisters = void;  // no separate D operand: the d* accumulators are read-write ("+f") in fma()
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit values (a00..a03)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit value (desc_b)
+  using CRegisters = float[92];  // 92 float accumulators, d00..d91 in fma()
+
+  static_assert(tnspA == GMMA::Major::K,  // tnspA is only checked here; it is not forwarded to the asm statement
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; cast to int32_t and compared against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %97, 0;\n"  // %97 = int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n184k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91},"  // %0..%91 = d00..d91
+      "{%92,  %93,  %94,  %95},"  // %92..%95 = a00..a03
+      " %96,"  // %96 = desc_b
+      " p,    %98,  %99,  %100;\n"  // %98..%100 = scaleA, scaleB, tnspB, all "n" compile-time immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x200x16 F32+=F16*F16 -- SS form: fma() takes A and B each as one 64-bit descriptor (desc_a, desc_b) and updates 100 float accumulators in place.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x200x16_F32F16F16_SS
+{
+  using DRegisters = void;  // no separate D operand: the d* accumulators are read-write ("+f") in fma()
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit value (desc_a)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit value (desc_b)
+  using CRegisters = float[100];  // 100 float accumulators, d000..d099 in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; cast to int32_t and compared against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %102, 0;\n"  // %102 = int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n200k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99},"  // %0..%99 = d000..d099
+      " %100,"  // %100 = desc_a
+      " %101,"  // %101 = desc_b
+      " p,    %103, %104, %105, %106;\n"  // %103..%106 = scaleA, scaleB, tnspA, tnspB, all "n" compile-time immediates
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x200x16 F32+=F16*F16 -- RS form: fma() takes A as four 32-bit registers (a000..a003), B as one 64-bit descriptor (desc_b), and updates 100 float accumulators in place.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x200x16_F32F16F16_RS
+{
+  using DRegisters = void;  // no separate D operand: the d* accumulators are read-write ("+f") in fma()
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit values (a000..a003)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit value (desc_b)
+  using CRegisters = float[100];  // 100 float accumulators, d000..d099 in fma()
+
+  static_assert(tnspA == GMMA::Major::K,  // tnspA is only checked here; it is not forwarded to the asm statement
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; cast to int32_t and compared against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %105, 0;\n"  // %105 = int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n200k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99},"  // %0..%99 = d000..d099
+      "{%100, %101, %102, %103},"  // %100..%103 = a000..a003
+      " %104,"  // %104 = desc_b
+      " p,    %106, %107, %108;\n"  // %106..%108 = scaleA, scaleB, tnspB, all "n" compile-time immediates
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x16 F32+=F16*F16 -- SS form: fma() takes A and B each as one 64-bit descriptor (desc_a, desc_b) and updates 104 float accumulators in place.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x208x16_F32F16F16_SS
+{
+  using DRegisters = void;  // no separate D operand: the d* accumulators are read-write ("+f") in fma()
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit value (desc_a)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit value (desc_b)
+  using CRegisters = float[104];  // 104 float accumulators, d000..d103 in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; cast to int32_t and compared against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %106, 0;\n"  // %106 = int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n208k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"  // %0..%103 = d000..d103
+      " %104,"  // %104 = desc_a
+      " %105,"  // %105 = desc_b
+      " p,    %107, %108, %109, %110;\n"  // %107..%110 = scaleA, scaleB, tnspA, tnspB, all "n" compile-time immediates
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x16 F32+=F16*F16
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x208x16_F32F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[104];
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %109, 0;\n"
+      "wgmma.mma_async.sync.aligned.m64n208k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      "{%104, %105, %106, %107},"
+      " %108,"
+      " p,    %110, %111, %112;\n"
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x216x16 F32+=F16*F16 -- SS variant: A and B are each passed as one 64-bit descriptor; wraps one wgmma.mma_async m64n216k16 PTX instruction.
+template <
+  GMMA::Major tnspA,  // forwarded to the asm as an immediate operand (%113)
+  GMMA::Major tnspB,  // forwarded to the asm as an immediate operand (%114)
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand (%111)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an immediate operand (%112)
+>
+struct MMA_64x216x16_F32F16F16_SS
+{
+  using DRegisters = void;  // no separate D operand: the accumulators below are updated in place
+  using ARegisters = uint64_t[1];  // A operand: a single 64-bit value (desc_a)
+  using BRegisters = uint64_t[1];  // B operand: a single 64-bit value (desc_b)
+  using CRegisters = float[108];  // 108 float accumulators (d000..d107)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // descriptor for A, bound to asm operand %108
+      uint64_t const& desc_b,  // descriptor for B, bound to asm operand %109
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d107: read-write accumulators, asm operands %0..%107
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hook defined elsewhere; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %110, 0;\n"  // %110 is int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n216k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%107: accumulators d000..d107
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107},"
+      " %108,"  // desc_a
+      " %109,"  // desc_b
+      " p,    %111, %112, %113, %114;\n"  // p, then scaleA, scaleB, tnspA, tnspB; NOTE(review): p presumably selects accumulate vs. overwrite of D -- confirm against the PTX ISA
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // read-write operands: each accumulator is both input and output
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
+      :  "l"(desc_a),  // input operands start here at %108
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // scale_D is a runtime register; the four template parameters are immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the macro above is undefined; CUTE_INVALID_CONTROL_PATH is defined elsewhere
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x216x16 F32+=F16*F16 -- RS variant: A is passed in four 32-bit registers, B as one 64-bit descriptor; wraps one wgmma.mma_async m64n216k16 PTX instruction.
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not forwarded to the asm
+  GMMA::Major tnspB,  // forwarded to the asm as an immediate operand (%116)
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand (%114)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an immediate operand (%115)
+>
+struct MMA_64x216x16_F32F16F16_RS
+{
+  using DRegisters = void;  // no separate D operand: the accumulators below are updated in place
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a000..a003)
+  using BRegisters = uint64_t[1];  // B operand: a single 64-bit value (desc_b)
+  using CRegisters = float[108];  // 108 float accumulators (d000..d107)
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");  // compile-time rejection of any other tnspA
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A fragment, bound to asm operands %108..%111
+      uint64_t const& desc_b,  // descriptor for B, bound to asm operand %112
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d107: read-write accumulators, asm operands %0..%107
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hook defined elsewhere; receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %113, 0;\n"  // %113 is int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n216k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%107: accumulators d000..d107
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107},"
+      "{%108, %109, %110, %111},"  // a000..a003
+      " %112,"  // desc_b
+      " p,    %114, %115, %116;\n"  // p, then scaleA, scaleB, tnspB; NOTE(review): p presumably selects accumulate vs. overwrite of D -- confirm against the PTX ISA
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // read-write operands: each accumulator is both input and output
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // input operands start here at %108
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // scale_D is a runtime register; the three template parameters are immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the macro above is undefined; CUTE_INVALID_CONTROL_PATH is defined elsewhere
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x16 F32+=F16*F16 -- SS variant: A and B are each passed as one 64-bit descriptor; wraps one wgmma.mma_async m64n224k16 PTX instruction.
+template <
+  GMMA::Major tnspA,  // forwarded to the asm as an immediate operand (%117)
+  GMMA::Major tnspB,  // forwarded to the asm as an immediate operand (%118)
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand (%115)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an immediate operand (%116)
+>
+struct MMA_64x224x16_F32F16F16_SS
+{
+  using DRegisters = void;  // no separate D operand: the accumulators below are updated in place
+  using ARegisters = uint64_t[1];  // A operand: a single 64-bit value (desc_a)
+  using BRegisters = uint64_t[1];  // B operand: a single 64-bit value (desc_b)
+  using CRegisters = float[112];  // 112 float accumulators (d000..d111)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // descriptor for A, bound to asm operand %112
+      uint64_t const& desc_b,  // descriptor for B, bound to asm operand %113
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d111: read-write accumulators, asm operands %0..%111
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hook defined elsewhere; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %114, 0;\n"  // %114 is int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n224k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%111: accumulators d000..d111
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      " %112,"  // desc_a
+      " %113,"  // desc_b
+      " p,    %115, %116, %117, %118;\n"  // p, then scaleA, scaleB, tnspA, tnspB; NOTE(review): p presumably selects accumulate vs. overwrite of D -- confirm against the PTX ISA
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // read-write operands: each accumulator is both input and output
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
+      :  "l"(desc_a),  // input operands start here at %112
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // scale_D is a runtime register; the four template parameters are immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the macro above is undefined; CUTE_INVALID_CONTROL_PATH is defined elsewhere
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x16 F32+=F16*F16 -- RS variant: A is passed in four 32-bit registers, B as one 64-bit descriptor; wraps one wgmma.mma_async m64n224k16 PTX instruction.
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not forwarded to the asm
+  GMMA::Major tnspB,  // forwarded to the asm as an immediate operand (%120)
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand (%118)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an immediate operand (%119)
+>
+struct MMA_64x224x16_F32F16F16_RS
+{
+  using DRegisters = void;  // no separate D operand: the accumulators below are updated in place
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a000..a003)
+  using BRegisters = uint64_t[1];  // B operand: a single 64-bit value (desc_b)
+  using CRegisters = float[112];  // 112 float accumulators (d000..d111)
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");  // compile-time rejection of any other tnspA
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A fragment, bound to asm operands %112..%115
+      uint64_t const& desc_b,  // descriptor for B, bound to asm operand %116
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d111: read-write accumulators, asm operands %0..%111
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hook defined elsewhere; receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %117, 0;\n"  // %117 is int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n224k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%111: accumulators d000..d111
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      "{%112, %113, %114, %115},"  // a000..a003
+      " %116,"  // desc_b
+      " p,    %118, %119, %120;\n"  // p, then scaleA, scaleB, tnspB; NOTE(review): p presumably selects accumulate vs. overwrite of D -- confirm against the PTX ISA
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // read-write operands: each accumulator is both input and output
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // input operands start here at %112
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // scale_D is a runtime register; the three template parameters are immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the macro above is undefined; CUTE_INVALID_CONTROL_PATH is defined elsewhere
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x232x16 F32+=F16*F16
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x232x16_F32F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[116];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %118, 0;\n"
+      "wgmma.mma_async.sync.aligned.m64n232k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115},"
+      " %116,"
+      " %117,"
+      " p,    %119, %120, %121, %122;\n"
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x232x16 F32+=F16*F16 (RS form: A operand in four 32-bit registers, B operand as a 64-bit descriptor)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x232x16_F32F16F16_RS
+{
+  using DRegisters = void;  // NOTE(review): presumably means D reuses the CRegisters accumulators -- confirm
+  using ARegisters = uint32_t[4];  // corresponds to a000..a003 of fma()
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = float[116];  // corresponds to d000..d115 of fma()
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async m64n232k16 instruction; d000..d115 are read-write (+f) operands
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // scale_D is compared against 0 to form predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): looks like a logging hook keyed on source line -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %121, 0;\n"  // %121 is scale_D, so p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n232k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%115 are the accumulators d000..d115
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115},"
+      "{%116, %117, %118, %119},"  // a000..a003
+      " %120,"  // desc_b
+      " p,    %122, %123, %124;\n"  // p, then scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // output list: operands %0..%115
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // input list: operands %116..%119
+         "l"(desc_b),  // operand %120
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // operands %121..%124; the n-constrained ones come from template parameters
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x16 F32+=F16*F16 (SS form: A and B operands are both passed as 64-bit descriptors)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x240x16_F32F16F16_SS
+{
+  using DRegisters = void;  // NOTE(review): presumably means D reuses the CRegisters accumulators -- confirm
+  using ARegisters = uint64_t[1];  // corresponds to desc_a of fma()
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = float[120];  // corresponds to d000..d119 of fma()
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async m64n240k16 instruction; d000..d119 are read-write (+f) operands
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // scale_D is compared against 0 to form predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): looks like a logging hook keyed on source line -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %122, 0;\n"  // %122 is scale_D, so p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n240k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%119 are the accumulators d000..d119
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      " %120,"  // desc_a
+      " %121,"  // desc_b
+      " p,    %123, %124, %125, %126;\n"  // p, then scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // output list: operands %0..%119
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
+      :  "l"(desc_a),  // input list starts here: operand %120
+         "l"(desc_b),  // operand %121
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // operands %122..%126; the n-constrained ones come from template parameters
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x16 F32+=F16*F16 (RS form: A operand in four 32-bit registers, B operand as a 64-bit descriptor)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x240x16_F32F16F16_RS
+{
+  using DRegisters = void;  // NOTE(review): presumably means D reuses the CRegisters accumulators -- confirm
+  using ARegisters = uint32_t[4];  // corresponds to a000..a003 of fma()
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = float[120];  // corresponds to d000..d119 of fma()
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async m64n240k16 instruction; d000..d119 are read-write (+f) operands
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // scale_D is compared against 0 to form predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): looks like a logging hook keyed on source line -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %125, 0;\n"  // %125 is scale_D, so p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n240k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%119 are the accumulators d000..d119
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      "{%120, %121, %122, %123},"  // a000..a003
+      " %124,"  // desc_b
+      " p,    %126, %127, %128;\n"  // p, then scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // output list: operands %0..%119
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // input list: operands %120..%123
+         "l"(desc_b),  // operand %124
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // operands %125..%128; the n-constrained ones come from template parameters
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x248x16 F32+=F16*F16 (SS form: A and B operands are both passed as 64-bit descriptors)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x248x16_F32F16F16_SS
+{
+  using DRegisters = void;  // NOTE(review): presumably means D reuses the CRegisters accumulators -- confirm
+  using ARegisters = uint64_t[1];  // corresponds to desc_a of fma()
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = float[124];  // corresponds to d000..d123 of fma()
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async m64n248k16 instruction; d000..d123 are read-write (+f) operands
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // scale_D is compared against 0 to form predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): looks like a logging hook keyed on source line -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %126, 0;\n"  // %126 is scale_D, so p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n248k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%123 are the accumulators d000..d123
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123},"
+      " %124,"  // desc_a
+      " %125,"  // desc_b
+      " p,    %127, %128, %129, %130;\n"  // p, then scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // output list: operands %0..%123
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
+      :  "l"(desc_a),  // input list starts here: operand %124
+         "l"(desc_b),  // operand %125
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // operands %126..%130; the n-constrained ones come from template parameters
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x248x16 F32+=F16*F16 (RS: A is passed in four 32-bit registers, B as one 64-bit descriptor; emits wgmma m64n248k16.f32.f16.f16)
+template <
+  GMMA::Major tnspA,  // layout of A; must be GMMA::Major::K (static_assert below) and is not forwarded to the asm
+  GMMA::Major tnspB,  // layout of B; forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn for A; forwarded as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // ScaleIn for B; forwarded as an immediate ("n") operand
+>
+struct MMA_64x248x16_F32F16F16_RS
+{
+  using DRegisters = void;  // NOTE(review): void presumably means D aliases C (accumulate in place) - confirm
+  using ARegisters = uint32_t[4];  // a000..a003, bound as "r" operands
+  using BRegisters = uint64_t[1];  // desc_b, bound as an "l" operand
+  using CRegisters = float[124];  // d000..d123, bound as read-write ("+f") operands
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %129, 0;\n"  // %129 is scale_D: p = (scale_D != 0); presumably the PTX scale-d predicate - confirm
+      "wgmma.mma_async.sync.aligned.m64n248k16.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123},"
+      "{%124, %125, %126, %127},"  // a000..a003
+      " %128,"  // desc_b
+      " p,    %130, %131, %132;\n"  // p, then the immediates scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x16 F32+=BF16*BF16 (SS: A and B are each passed as one 64-bit descriptor; emits wgmma m64n24k16.f32.bf16.bf16)
+template <
+  GMMA::Major tnspA,  // layout of A; forwarded to the instruction as an immediate ("n") operand
+  GMMA::Major tnspB,  // layout of B; forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn for A; forwarded as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // ScaleIn for B; forwarded as an immediate ("n") operand
+>
+struct MMA_64x24x16_F32BF16BF16_SS
+{
+  using DRegisters = void;  // NOTE(review): void presumably means D aliases C (accumulate in place) - confirm
+  using ARegisters = uint64_t[1];  // desc_a, bound as an "l" operand
+  using BRegisters = uint64_t[1];  // desc_b, bound as an "l" operand
+  using CRegisters = float[12];  // d00..d11, bound as read-write ("+f") operands
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %14, 0;\n"  // %14 is scale_D: p = (scale_D != 0); presumably the PTX scale-d predicate - confirm
+      "wgmma.mma_async.sync.aligned.m64n24k16.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      " %12,"  // desc_a
+      " %13,"  // desc_b
+      " p,   %15, %16, %17, %18;\n"  // p, then the immediates scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x16 F32+=BF16*BF16 (RS: A is passed in four 32-bit registers, B as one 64-bit descriptor; emits wgmma m64n24k16.f32.bf16.bf16)
+template <
+  GMMA::Major tnspA,  // layout of A; must be GMMA::Major::K (static_assert below) and is not forwarded to the asm
+  GMMA::Major tnspB,  // layout of B; forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn for A; forwarded as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // ScaleIn for B; forwarded as an immediate ("n") operand
+>
+struct MMA_64x24x16_F32BF16BF16_RS
+{
+  using DRegisters = void;  // NOTE(review): void presumably means D aliases C (accumulate in place) - confirm
+  using ARegisters = uint32_t[4];  // a00..a03, bound as "r" operands
+  using BRegisters = uint64_t[1];  // desc_b, bound as an "l" operand
+  using CRegisters = float[12];  // d00..d11, bound as read-write ("+f") operands
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %17, 0;\n"  // %17 is scale_D: p = (scale_D != 0); presumably the PTX scale-d predicate - confirm
+      "wgmma.mma_async.sync.aligned.m64n24k16.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      "{%12, %13, %14, %15},"  // a00..a03
+      " %16,"  // desc_b
+      " p,   %18, %19, %20;\n"  // p, then the immediates scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x40x16 F32+=BF16*BF16 (SS: A and B are each passed as one 64-bit descriptor; emits wgmma m64n40k16.f32.bf16.bf16)
+template <
+  GMMA::Major tnspA,  // layout of A; forwarded to the instruction as an immediate ("n") operand
+  GMMA::Major tnspB,  // layout of B; forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn for A; forwarded as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // ScaleIn for B; forwarded as an immediate ("n") operand
+>
+struct MMA_64x40x16_F32BF16BF16_SS
+{
+  using DRegisters = void;  // NOTE(review): void presumably means D aliases C (accumulate in place) - confirm
+  using ARegisters = uint64_t[1];  // desc_a, bound as an "l" operand
+  using BRegisters = uint64_t[1];  // desc_b, bound as an "l" operand
+  using CRegisters = float[20];  // d00..d19, bound as read-write ("+f") operands
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %22, 0;\n"  // %22 is scale_D: p = (scale_D != 0); presumably the PTX scale-d predicate - confirm
+      "wgmma.mma_async.sync.aligned.m64n40k16.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"
+      " %20,"  // desc_a
+      " %21,"  // desc_b
+      " p,   %23, %24, %25, %26;\n"  // p, then the immediates scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x40x16 F32+=BF16*BF16 (RS: A is passed in four 32-bit registers, B as one 64-bit descriptor; emits wgmma m64n40k16.f32.bf16.bf16)
+template <
+  GMMA::Major tnspA,  // layout of A; must be GMMA::Major::K (static_assert below) and is not forwarded to the asm
+  GMMA::Major tnspB,  // layout of B; forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn for A; forwarded as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // ScaleIn for B; forwarded as an immediate ("n") operand
+>
+struct MMA_64x40x16_F32BF16BF16_RS
+{
+  using DRegisters = void;  // NOTE(review): void presumably means D aliases C (accumulate in place) - confirm
+  using ARegisters = uint32_t[4];  // a00..a03, bound as "r" operands
+  using BRegisters = uint64_t[1];  // desc_b, bound as an "l" operand
+  using CRegisters = float[20];  // d00..d19, bound as read-write ("+f") operands
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %25, 0;\n"  // %25 is scale_D: p = (scale_D != 0); presumably the PTX scale-d predicate - confirm
+      "wgmma.mma_async.sync.aligned.m64n40k16.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"
+      "{%20, %21, %22, %23},"  // a00..a03
+      " %24,"  // desc_b
+      " p,   %26, %27, %28;\n"  // p, then the immediates scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x16 F32+=BF16*BF16 (SS: A and B are each passed as one 64-bit descriptor; emits wgmma m64n48k16.f32.bf16.bf16)
+template <
+  GMMA::Major tnspA,  // layout of A; forwarded to the instruction as an immediate ("n") operand
+  GMMA::Major tnspB,  // layout of B; forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn for A; forwarded as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // ScaleIn for B; forwarded as an immediate ("n") operand
+>
+struct MMA_64x48x16_F32BF16BF16_SS
+{
+  using DRegisters = void;  // NOTE(review): void presumably means D aliases C (accumulate in place) - confirm
+  using ARegisters = uint64_t[1];  // desc_a, bound as an "l" operand
+  using BRegisters = uint64_t[1];  // desc_b, bound as an "l" operand
+  using CRegisters = float[24];  // d00..d23, bound as read-write ("+f") operands
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %26, 0;\n"  // %26 is scale_D: p = (scale_D != 0); presumably the PTX scale-d predicate - confirm
+      "wgmma.mma_async.sync.aligned.m64n48k16.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      " %24,"  // desc_a
+      " %25,"  // desc_b
+      " p,   %27, %28, %29, %30;\n"  // p, then the immediates scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x16 F32+=BF16*BF16 (RS: A is passed in four 32-bit registers, B as one 64-bit descriptor; emits wgmma m64n48k16.f32.bf16.bf16)
+template <
+  GMMA::Major tnspA,  // layout of A; must be GMMA::Major::K (static_assert below) and is not forwarded to the asm
+  GMMA::Major tnspB,  // layout of B; forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn for A; forwarded as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // ScaleIn for B; forwarded as an immediate ("n") operand
+>
+struct MMA_64x48x16_F32BF16BF16_RS
+{
+  using DRegisters = void;  // NOTE(review): void presumably means D aliases C (accumulate in place) - confirm
+  using ARegisters = uint32_t[4];  // a00..a03, bound as "r" operands
+  using BRegisters = uint64_t[1];  // desc_b, bound as an "l" operand
+  using CRegisters = float[24];  // d00..d23, bound as read-write ("+f") operands
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %29, 0;\n"  // %29 is scale_D: p = (scale_D != 0); presumably the PTX scale-d predicate - confirm
+      "wgmma.mma_async.sync.aligned.m64n48k16.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      "{%24, %25, %26, %27},"  // a00..a03
+      " %28,"  // desc_b
+      " p,   %30, %31, %32;\n"  // p, then the immediates scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x56x16 F32+=BF16*BF16 (SS: A and B are each passed as one 64-bit descriptor; emits wgmma m64n56k16.f32.bf16.bf16)
+template <
+  GMMA::Major tnspA,  // layout of A; forwarded to the instruction as an immediate ("n") operand
+  GMMA::Major tnspB,  // layout of B; forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn for A; forwarded as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // ScaleIn for B; forwarded as an immediate ("n") operand
+>
+struct MMA_64x56x16_F32BF16BF16_SS
+{
+  using DRegisters = void;  // NOTE(review): void presumably means D aliases C (accumulate in place) - confirm
+  using ARegisters = uint64_t[1];  // desc_a, bound as an "l" operand
+  using BRegisters = uint64_t[1];  // desc_b, bound as an "l" operand
+  using CRegisters = float[28];  // d00..d27, bound as read-write ("+f") operands
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %30, 0;\n"  // %30 is scale_D: p = (scale_D != 0); presumably the PTX scale-d predicate - confirm
+      "wgmma.mma_async.sync.aligned.m64n56k16.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"
+      " %28,"  // desc_a
+      " %29,"  // desc_b
+      " p,   %31, %32, %33, %34;\n"  // p, then the immediates scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x56x16 F32+=BF16*BF16 (RS: A is passed in four 32-bit registers, B as one 64-bit descriptor; emits wgmma m64n56k16.f32.bf16.bf16)
+template <
+  GMMA::Major tnspA,  // layout of A; must be GMMA::Major::K (static_assert below) and is not forwarded to the asm
+  GMMA::Major tnspB,  // layout of B; forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn for A; forwarded as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // ScaleIn for B; forwarded as an immediate ("n") operand
+>
+struct MMA_64x56x16_F32BF16BF16_RS
+{
+  using DRegisters = void;  // NOTE(review): void presumably means D aliases C (accumulate in place) - confirm
+  using ARegisters = uint32_t[4];  // a00..a03, bound as "r" operands
+  using BRegisters = uint64_t[1];  // desc_b, bound as an "l" operand
+  using CRegisters = float[28];  // d00..d27, bound as read-write ("+f") operands
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %33, 0;\n"  // %33 is scale_D: p = (scale_D != 0); presumably the PTX scale-d predicate - confirm
+      "wgmma.mma_async.sync.aligned.m64n56k16.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"
+      "{%28, %29, %30, %31},"  // a00..a03
+      " %32,"  // desc_b
+      " p,   %34, %35, %36;\n"  // p, then the immediates scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x72x16 F32+=BF16*BF16 (SS: A and B are each passed as one 64-bit descriptor; emits wgmma m64n72k16.f32.bf16.bf16)
+template <
+  GMMA::Major tnspA,  // layout of A; forwarded to the instruction as an immediate ("n") operand
+  GMMA::Major tnspB,  // layout of B; forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn for A; forwarded as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // ScaleIn for B; forwarded as an immediate ("n") operand
+>
+struct MMA_64x72x16_F32BF16BF16_SS
+{
+  using DRegisters = void;  // NOTE(review): void presumably means D aliases C (accumulate in place) - confirm
+  using ARegisters = uint64_t[1];  // desc_a, bound as an "l" operand
+  using BRegisters = uint64_t[1];  // desc_b, bound as an "l" operand
+  using CRegisters = float[36];  // d00..d35, bound as read-write ("+f") operands
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %38, 0;\n"  // %38 is scale_D: p = (scale_D != 0); presumably the PTX scale-d predicate - confirm
+      "wgmma.mma_async.sync.aligned.m64n72k16.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"
+      " %36,"  // desc_a
+      " %37,"  // desc_b
+      " p,   %39, %40, %41, %42;\n"  // p, then the immediates scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x72x16 F32+=BF16*BF16 (RS: A is passed in four 32-bit registers, B as one 64-bit descriptor; emits wgmma m64n72k16.f32.bf16.bf16)
+template <
+  GMMA::Major tnspA,  // layout of A; must be GMMA::Major::K (static_assert below) and is not forwarded to the asm
+  GMMA::Major tnspB,  // layout of B; forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn for A; forwarded as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // ScaleIn for B; forwarded as an immediate ("n") operand
+>
+struct MMA_64x72x16_F32BF16BF16_RS
+{
+  using DRegisters = void;  // NOTE(review): void presumably means D aliases C (accumulate in place) - confirm
+  using ARegisters = uint32_t[4];  // a00..a03, bound as "r" operands
+  using BRegisters = uint64_t[1];  // desc_b, bound as an "l" operand
+  using CRegisters = float[36];  // d00..d35, bound as read-write ("+f") operands
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %41, 0;\n"  // %41 is scale_D: p = (scale_D != 0); presumably the PTX scale-d predicate - confirm
+      "wgmma.mma_async.sync.aligned.m64n72k16.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"
+      "{%36, %37, %38, %39},"  // a00..a03
+      " %40,"  // desc_b
+      " p,   %42, %43, %44;\n"  // p, then the immediates scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x16 F32+=BF16*BF16 (SS: A and B are each passed as one 64-bit descriptor; emits wgmma m64n80k16.f32.bf16.bf16)
+template <
+  GMMA::Major tnspA,  // layout of A; forwarded to the instruction as an immediate ("n") operand
+  GMMA::Major tnspB,  // layout of B; forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn for A; forwarded as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // ScaleIn for B; forwarded as an immediate ("n") operand
+>
+struct MMA_64x80x16_F32BF16BF16_SS
+{
+  using DRegisters = void;  // NOTE(review): void presumably means D aliases C (accumulate in place) - confirm
+  using ARegisters = uint64_t[1];  // desc_a, bound as an "l" operand
+  using BRegisters = uint64_t[1];  // desc_b, bound as an "l" operand
+  using CRegisters = float[40];  // d00..d39, bound as read-write ("+f") operands
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %42, 0;\n"  // %42 is scale_D: p = (scale_D != 0); presumably the PTX scale-d predicate - confirm
+      "wgmma.mma_async.sync.aligned.m64n80k16.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      " %40,"  // desc_a
+      " %41,"  // desc_b
+      " p,   %43, %44, %45, %46;\n"  // p, then the immediates scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x16 F32+=BF16*BF16 -- "RS" form: A is passed as four 32-bit registers (a00..a03), B as a 64-bit descriptor (desc_b).
+template <
+  GMMA::Major tnspA,                            // only checked by the static_assert below; not passed to the asm
+  GMMA::Major tnspB,                            // passed to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,   // immediate operand; defaults to ScaleIn::One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One    // immediate operand; defaults to ScaleIn::One
+>
+struct MMA_64x80x16_F32BF16BF16_RS
+{
+  using DRegisters = void;          // no separate D registers; d00..d39 are read-write in fma()
+  using ARegisters = uint32_t[4];   // four 32-bit values: a00..a03
+  using BRegisters = uint64_t[1];   // one 64-bit value: desc_b
+  using CRegisters = float[40];     // 40 floats, matching d00..d39 of fma()
+
+  static_assert(tnspA == GMMA::Major::K,   // instantiating with any other tnspA is a compile error
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)   // cast to int32_t and compared against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)   // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %45, 0;\n"   // p = (scale_D != 0); per the PTX ISA the scale-d predicate selects D = A*B + D vs. D = A*B
+      "wgmma.mma_async.sync.aligned.m64n80k16.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"   // %0..%39: accumulators d00..d39
+      "{%40, %41, %42, %43},"   // a00..a03
+      " %44,"   // desc_b
+      " p,   %46, %47, %48;\n"   // p, scaleA, scaleB, tnspB (no tnspA operand in this form)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),   // "+f": each accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),   // input operands start at index 40
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));   // %45..%48
+#else   // without the macro, fma() only invokes CUTE_INVALID_CONTROL_PATH
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x88x16 F32+=BF16*BF16 -- "SS" form: A and B are both passed as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,                            // passed to the instruction as an immediate ("n") operand
+  GMMA::Major tnspB,                            // passed to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,   // immediate operand; defaults to ScaleIn::One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One    // immediate operand; defaults to ScaleIn::One
+>
+struct MMA_64x88x16_F32BF16BF16_SS
+{
+  using DRegisters = void;          // no separate D registers; d00..d43 are read-write in fma()
+  using ARegisters = uint64_t[1];   // one 64-bit value: desc_a
+  using BRegisters = uint64_t[1];   // one 64-bit value: desc_b
+  using CRegisters = float[44];     // 44 floats, matching d00..d43 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)   // cast to int32_t and compared against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)   // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %46, 0;\n"   // p = (scale_D != 0); per the PTX ISA the scale-d predicate selects D = A*B + D vs. D = A*B
+      "wgmma.mma_async.sync.aligned.m64n88k16.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"   // %0..%43: accumulators d00..d43
+      " %44,"   // desc_a
+      " %45,"   // desc_b
+      " p,   %47, %48, %49, %50;\n"   // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),   // "+f": each accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
+      :  "l"(desc_a),   // input operands start at index 44
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));   // %46..%50
+#else   // without the macro, fma() only invokes CUTE_INVALID_CONTROL_PATH
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x88x16 F32+=BF16*BF16 -- "RS" form: A is passed as four 32-bit registers (a00..a03), B as a 64-bit descriptor (desc_b).
+template <
+  GMMA::Major tnspA,                            // only checked by the static_assert below; not passed to the asm
+  GMMA::Major tnspB,                            // passed to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,   // immediate operand; defaults to ScaleIn::One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One    // immediate operand; defaults to ScaleIn::One
+>
+struct MMA_64x88x16_F32BF16BF16_RS
+{
+  using DRegisters = void;          // no separate D registers; d00..d43 are read-write in fma()
+  using ARegisters = uint32_t[4];   // four 32-bit values: a00..a03
+  using BRegisters = uint64_t[1];   // one 64-bit value: desc_b
+  using CRegisters = float[44];     // 44 floats, matching d00..d43 of fma()
+
+  static_assert(tnspA == GMMA::Major::K,   // instantiating with any other tnspA is a compile error
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)   // cast to int32_t and compared against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)   // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %49, 0;\n"   // p = (scale_D != 0); per the PTX ISA the scale-d predicate selects D = A*B + D vs. D = A*B
+      "wgmma.mma_async.sync.aligned.m64n88k16.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"   // %0..%43: accumulators d00..d43
+      "{%44, %45, %46, %47},"   // a00..a03
+      " %48,"   // desc_b
+      " p,   %50, %51, %52;\n"   // p, scaleA, scaleB, tnspB (no tnspA operand in this form)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),   // "+f": each accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),   // input operands start at index 44
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));   // %49..%52
+#else   // without the macro, fma() only invokes CUTE_INVALID_CONTROL_PATH
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x104x16 F32+=BF16*BF16 -- "SS" form: A and B are both passed as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,                            // passed to the instruction as an immediate ("n") operand
+  GMMA::Major tnspB,                            // passed to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,   // immediate operand; defaults to ScaleIn::One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One    // immediate operand; defaults to ScaleIn::One
+>
+struct MMA_64x104x16_F32BF16BF16_SS
+{
+  using DRegisters = void;          // no separate D registers; d00..d51 are read-write in fma()
+  using ARegisters = uint64_t[1];   // one 64-bit value: desc_a
+  using BRegisters = uint64_t[1];   // one 64-bit value: desc_b
+  using CRegisters = float[52];     // 52 floats, matching d00..d51 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)   // cast to int32_t and compared against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)   // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %54, 0;\n"   // p = (scale_D != 0); per the PTX ISA the scale-d predicate selects D = A*B + D vs. D = A*B
+      "wgmma.mma_async.sync.aligned.m64n104k16.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"   // %0..%51: accumulators d00..d51
+      " %52,"   // desc_a
+      " %53,"   // desc_b
+      " p,    %55,  %56,  %57,  %58;\n"   // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),   // "+f": each accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
+      :  "l"(desc_a),   // input operands start at index 52
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));   // %54..%58
+#else   // without the macro, fma() only invokes CUTE_INVALID_CONTROL_PATH
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x104x16 F32+=BF16*BF16 -- "RS" form: A is passed as four 32-bit registers (a00..a03), B as a 64-bit descriptor (desc_b).
+template <
+  GMMA::Major tnspA,                            // only checked by the static_assert below; not passed to the asm
+  GMMA::Major tnspB,                            // passed to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,   // immediate operand; defaults to ScaleIn::One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One    // immediate operand; defaults to ScaleIn::One
+>
+struct MMA_64x104x16_F32BF16BF16_RS
+{
+  using DRegisters = void;          // no separate D registers; d00..d51 are read-write in fma()
+  using ARegisters = uint32_t[4];   // four 32-bit values: a00..a03
+  using BRegisters = uint64_t[1];   // one 64-bit value: desc_b
+  using CRegisters = float[52];     // 52 floats, matching d00..d51 of fma()
+
+  static_assert(tnspA == GMMA::Major::K,   // instantiating with any other tnspA is a compile error
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)   // cast to int32_t and compared against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)   // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %57, 0;\n"   // p = (scale_D != 0); per the PTX ISA the scale-d predicate selects D = A*B + D vs. D = A*B
+      "wgmma.mma_async.sync.aligned.m64n104k16.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"   // %0..%51: accumulators d00..d51
+      "{%52,  %53,  %54,  %55},"   // a00..a03
+      " %56,"   // desc_b
+      " p,    %58,  %59,  %60;\n"   // p, scaleA, scaleB, tnspB (no tnspA operand in this form)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),   // "+f": each accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),   // input operands start at index 52
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));   // %57..%60
+#else   // without the macro, fma() only invokes CUTE_INVALID_CONTROL_PATH
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x16 F32+=BF16*BF16 -- "SS" form: A and B are both passed as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,                            // passed to the instruction as an immediate ("n") operand
+  GMMA::Major tnspB,                            // passed to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,   // immediate operand; defaults to ScaleIn::One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One    // immediate operand; defaults to ScaleIn::One
+>
+struct MMA_64x112x16_F32BF16BF16_SS
+{
+  using DRegisters = void;          // no separate D registers; d00..d55 are read-write in fma()
+  using ARegisters = uint64_t[1];   // one 64-bit value: desc_a
+  using BRegisters = uint64_t[1];   // one 64-bit value: desc_b
+  using CRegisters = float[56];     // 56 floats, matching d00..d55 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)   // cast to int32_t and compared against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)   // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %58, 0;\n"   // p = (scale_D != 0); per the PTX ISA the scale-d predicate selects D = A*B + D vs. D = A*B
+      "wgmma.mma_async.sync.aligned.m64n112k16.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"   // %0..%55: accumulators d00..d55
+      " %56,"   // desc_a
+      " %57,"   // desc_b
+      " p,    %59,  %60,  %61,  %62;\n"   // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),   // "+f": each accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
+      :  "l"(desc_a),   // input operands start at index 56
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));   // %58..%62
+#else   // without the macro, fma() only invokes CUTE_INVALID_CONTROL_PATH
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x16 F32+=BF16*BF16 -- "RS" form: A is passed as four 32-bit registers (a00..a03), B as a 64-bit descriptor (desc_b).
+template <
+  GMMA::Major tnspA,                            // only checked by the static_assert below; not passed to the asm
+  GMMA::Major tnspB,                            // passed to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,   // immediate operand; defaults to ScaleIn::One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One    // immediate operand; defaults to ScaleIn::One
+>
+struct MMA_64x112x16_F32BF16BF16_RS
+{
+  using DRegisters = void;          // no separate D registers; d00..d55 are read-write in fma()
+  using ARegisters = uint32_t[4];   // four 32-bit values: a00..a03
+  using BRegisters = uint64_t[1];   // one 64-bit value: desc_b
+  using CRegisters = float[56];     // 56 floats, matching d00..d55 of fma()
+
+  static_assert(tnspA == GMMA::Major::K,   // instantiating with any other tnspA is a compile error
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)   // cast to int32_t and compared against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)   // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %61, 0;\n"   // p = (scale_D != 0); per the PTX ISA the scale-d predicate selects D = A*B + D vs. D = A*B
+      "wgmma.mma_async.sync.aligned.m64n112k16.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"   // %0..%55: accumulators d00..d55
+      "{%56,  %57,  %58,  %59},"   // a00..a03
+      " %60,"   // desc_b
+      " p,    %62,  %63,  %64;\n"   // p, scaleA, scaleB, tnspB (no tnspA operand in this form)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),   // "+f": each accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),   // input operands start at index 56
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));   // %61..%64
+#else   // without the macro, fma() only invokes CUTE_INVALID_CONTROL_PATH
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x120x16 F32+=BF16*BF16 -- "SS" form: A and B are both passed as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,                            // passed to the instruction as an immediate ("n") operand
+  GMMA::Major tnspB,                            // passed to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,   // immediate operand; defaults to ScaleIn::One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One    // immediate operand; defaults to ScaleIn::One
+>
+struct MMA_64x120x16_F32BF16BF16_SS
+{
+  using DRegisters = void;          // no separate D registers; d00..d59 are read-write in fma()
+  using ARegisters = uint64_t[1];   // one 64-bit value: desc_a
+  using BRegisters = uint64_t[1];   // one 64-bit value: desc_b
+  using CRegisters = float[60];     // 60 floats, matching d00..d59 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)   // cast to int32_t and compared against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)   // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %62, 0;\n"   // p = (scale_D != 0); per the PTX ISA the scale-d predicate selects D = A*B + D vs. D = A*B
+      "wgmma.mma_async.sync.aligned.m64n120k16.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"   // %0..%59: accumulators d00..d59
+      " %60,"   // desc_a
+      " %61,"   // desc_b
+      " p,    %63,  %64,  %65,  %66;\n"   // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),   // "+f": each accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
+      :  "l"(desc_a),   // input operands start at index 60
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));   // %62..%66
+#else   // without the macro, fma() only invokes CUTE_INVALID_CONTROL_PATH
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x120x16 F32+=BF16*BF16 -- "RS" form: A is passed as four 32-bit registers (a00..a03), B as a 64-bit descriptor (desc_b).
+template <
+  GMMA::Major tnspA,                            // only checked by the static_assert below; not passed to the asm
+  GMMA::Major tnspB,                            // passed to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,   // immediate operand; defaults to ScaleIn::One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One    // immediate operand; defaults to ScaleIn::One
+>
+struct MMA_64x120x16_F32BF16BF16_RS
+{
+  using DRegisters = void;          // no separate D registers; d00..d59 are read-write in fma()
+  using ARegisters = uint32_t[4];   // four 32-bit values: a00..a03
+  using BRegisters = uint64_t[1];   // one 64-bit value: desc_b
+  using CRegisters = float[60];     // 60 floats, matching d00..d59 of fma()
+
+  static_assert(tnspA == GMMA::Major::K,   // instantiating with any other tnspA is a compile error
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)   // cast to int32_t and compared against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)   // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %65, 0;\n"   // p = (scale_D != 0); per the PTX ISA the scale-d predicate selects D = A*B + D vs. D = A*B
+      "wgmma.mma_async.sync.aligned.m64n120k16.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"   // %0..%59: accumulators d00..d59
+      "{%60,  %61,  %62,  %63},"   // a00..a03
+      " %64,"   // desc_b
+      " p,    %66,  %67,  %68;\n"   // p, scaleA, scaleB, tnspB (no tnspA operand in this form)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),   // "+f": each accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),   // input operands start at index 60
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));   // %65..%68
+#else   // without the macro, fma() only invokes CUTE_INVALID_CONTROL_PATH
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x136x16 F32+=BF16*BF16 -- SS form: wraps one wgmma.mma_async m64n136k16 (f32 accumulators, bf16 inputs) with A and B both passed as 64-bit descriptors
+template <
+  GMMA::Major tnspA,  // forwarded to the instruction as an immediate (last-but-one asm input)
+  GMMA::Major tnspB,  // forwarded to the instruction as an immediate (last asm input)
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as an immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as an immediate
+>
+struct MMA_64x136x16_F32BF16BF16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value: fma()'s desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value: fma()'s desc_b
+  using CRegisters = float[68];    // 68 floats: fma()'s d00..d67
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // run-time value; tested by the setp in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path exists only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes the source line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %70, 0;\n"  // %70 = scale_D; predicate p is set when scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n136k16.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67},"  // %0..%67 = accumulators d00..d67
+      " %68,"  // desc_a
+      " %69,"  // desc_b
+      " p,    %71,  %72,  %73,  %74;\n"  // p (from scale_D), then scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
+      :  "l"(desc_a),  // inputs are numbered after the 68 outputs, so this is %68
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // scale_D goes in a register; the "n" operands are immediates taken from the template parameters
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report misuse instead of issuing the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x136x16 F32+=BF16*BF16 -- RS form: wraps one wgmma.mma_async m64n136k16 (f32 accumulators, bf16 inputs) with A passed in four 32-bit registers and B as a 64-bit descriptor
+template <
+  GMMA::Major tnspA,  // must be GMMA::Major::K (see static_assert); not forwarded to the asm
+  GMMA::Major tnspB,  // forwarded to the instruction as an immediate (last asm input)
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as an immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as an immediate
+>
+struct MMA_64x136x16_F32BF16BF16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // four 32-bit values: fma()'s a00..a03
+  using BRegisters = uint64_t[1];  // one 64-bit value: fma()'s desc_b
+  using CRegisters = float[68];    // 68 floats: fma()'s d00..d67
+
+  static_assert(tnspA == GMMA::Major::K,  // compile-time guard: any other tnspA is rejected
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // run-time value; tested by the setp in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path exists only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // passes the source line and the B descriptor to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %73, 0;\n"  // %73 = scale_D; predicate p is set when scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n136k16.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67},"  // %0..%67 = accumulators d00..d67
+      "{%68,  %69,  %70,  %71},"  // a00..a03
+      " %72,"  // desc_b
+      " p,    %74,  %75,  %76;\n"  // p (from scale_D), then scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs are numbered after the 68 outputs, so these are %68..%71
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // scale_D goes in a register; the "n" operands are immediates taken from the template parameters
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report misuse instead of issuing the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x16 F32+=BF16*BF16 -- SS form: wraps one wgmma.mma_async m64n144k16 (f32 accumulators, bf16 inputs) with A and B both passed as 64-bit descriptors
+template <
+  GMMA::Major tnspA,  // forwarded to the instruction as an immediate (last-but-one asm input)
+  GMMA::Major tnspB,  // forwarded to the instruction as an immediate (last asm input)
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as an immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as an immediate
+>
+struct MMA_64x144x16_F32BF16BF16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value: fma()'s desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value: fma()'s desc_b
+  using CRegisters = float[72];    // 72 floats: fma()'s d00..d71
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // run-time value; tested by the setp in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path exists only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes the source line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %74, 0;\n"  // %74 = scale_D; predicate p is set when scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n144k16.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"  // %0..%71 = accumulators d00..d71
+      " %72,"  // desc_a
+      " %73,"  // desc_b
+      " p,    %75,  %76,  %77,  %78;\n"  // p (from scale_D), then scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
+      :  "l"(desc_a),  // inputs are numbered after the 72 outputs, so this is %72
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // scale_D goes in a register; the "n" operands are immediates taken from the template parameters
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report misuse instead of issuing the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x16 F32+=BF16*BF16 -- RS form: wraps one wgmma.mma_async m64n144k16 (f32 accumulators, bf16 inputs) with A passed in four 32-bit registers and B as a 64-bit descriptor
+template <
+  GMMA::Major tnspA,  // must be GMMA::Major::K (see static_assert); not forwarded to the asm
+  GMMA::Major tnspB,  // forwarded to the instruction as an immediate (last asm input)
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as an immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as an immediate
+>
+struct MMA_64x144x16_F32BF16BF16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // four 32-bit values: fma()'s a00..a03
+  using BRegisters = uint64_t[1];  // one 64-bit value: fma()'s desc_b
+  using CRegisters = float[72];    // 72 floats: fma()'s d00..d71
+
+  static_assert(tnspA == GMMA::Major::K,  // compile-time guard: any other tnspA is rejected
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // run-time value; tested by the setp in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path exists only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // passes the source line and the B descriptor to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %77, 0;\n"  // %77 = scale_D; predicate p is set when scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n144k16.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"  // %0..%71 = accumulators d00..d71
+      "{%72,  %73,  %74,  %75},"  // a00..a03
+      " %76,"  // desc_b
+      " p,    %78,  %79,  %80;\n"  // p (from scale_D), then scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs are numbered after the 72 outputs, so these are %72..%75
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // scale_D goes in a register; the "n" operands are immediates taken from the template parameters
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report misuse instead of issuing the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x152x16 F32+=BF16*BF16 -- SS form: wraps one wgmma.mma_async m64n152k16 (f32 accumulators, bf16 inputs) with A and B both passed as 64-bit descriptors
+template <
+  GMMA::Major tnspA,  // forwarded to the instruction as an immediate (last-but-one asm input)
+  GMMA::Major tnspB,  // forwarded to the instruction as an immediate (last asm input)
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as an immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as an immediate
+>
+struct MMA_64x152x16_F32BF16BF16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value: fma()'s desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value: fma()'s desc_b
+  using CRegisters = float[76];    // 76 floats: fma()'s d00..d75
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // run-time value; tested by the setp in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path exists only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes the source line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %78, 0;\n"  // %78 = scale_D; predicate p is set when scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n152k16.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75},"  // %0..%75 = accumulators d00..d75
+      " %76,"  // desc_a
+      " %77,"  // desc_b
+      " p,    %79,  %80,  %81,  %82;\n"  // p (from scale_D), then scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
+      :  "l"(desc_a),  // inputs are numbered after the 76 outputs, so this is %76
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // scale_D goes in a register; the "n" operands are immediates taken from the template parameters
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report misuse instead of issuing the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x152x16 F32+=BF16*BF16 -- RS form: wraps one wgmma.mma_async m64n152k16 (f32 accumulators, bf16 inputs) with A passed in four 32-bit registers and B as a 64-bit descriptor
+template <
+  GMMA::Major tnspA,  // must be GMMA::Major::K (see static_assert); not forwarded to the asm
+  GMMA::Major tnspB,  // forwarded to the instruction as an immediate (last asm input)
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as an immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as an immediate
+>
+struct MMA_64x152x16_F32BF16BF16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // four 32-bit values: fma()'s a00..a03
+  using BRegisters = uint64_t[1];  // one 64-bit value: fma()'s desc_b
+  using CRegisters = float[76];    // 76 floats: fma()'s d00..d75
+
+  static_assert(tnspA == GMMA::Major::K,  // compile-time guard: any other tnspA is rejected
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // run-time value; tested by the setp in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path exists only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // passes the source line and the B descriptor to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %81, 0;\n"  // %81 = scale_D; predicate p is set when scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n152k16.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75},"  // %0..%75 = accumulators d00..d75
+      "{%76,  %77,  %78,  %79},"  // a00..a03
+      " %80,"  // desc_b
+      " p,    %82,  %83,  %84;\n"  // p (from scale_D), then scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs are numbered after the 76 outputs, so these are %76..%79
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // scale_D goes in a register; the "n" operands are immediates taken from the template parameters
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report misuse instead of issuing the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x16 F32+=BF16*BF16 -- SS form: wraps one wgmma.mma_async m64n160k16 (f32 accumulators, bf16 inputs) with A and B both passed as 64-bit descriptors
+template <
+  GMMA::Major tnspA,  // forwarded to the instruction as an immediate (last-but-one asm input)
+  GMMA::Major tnspB,  // forwarded to the instruction as an immediate (last asm input)
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as an immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as an immediate
+>
+struct MMA_64x160x16_F32BF16BF16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value: fma()'s desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value: fma()'s desc_b
+  using CRegisters = float[80];    // 80 floats: fma()'s d00..d79
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // run-time value; tested by the setp in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path exists only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes the source line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %82, 0;\n"  // %82 = scale_D; predicate p is set when scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n160k16.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"  // %0..%79 = accumulators d00..d79
+      " %80,"  // desc_a
+      " %81,"  // desc_b
+      " p,    %83,  %84,  %85,  %86;\n"  // p (from scale_D), then scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
+      :  "l"(desc_a),  // inputs are numbered after the 80 outputs, so this is %80
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // scale_D goes in a register; the "n" operands are immediates taken from the template parameters
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report misuse instead of issuing the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x16 F32+=BF16*BF16 -- RS variant: A is passed as four 32-bit values (a00..a03), B as one 64-bit desc_b value
+template <
+  GMMA::Major tnspA,                           // must equal GMMA::Major::K (enforced by the static_assert below); not passed to the asm
+  GMMA::Major tnspB,                           // passed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // passed to the asm as an immediate ("n") operand
+>
+struct MMA_64x160x16_F32BF16BF16_RS
+{
+  using DRegisters = void;         // NOTE(review): presumably signals that the result is written back into CRegisters -- confirm against the CuTe MMA traits
+  using ARegisters = uint32_t[4];  // mirrors the four a00..a03 parameters of fma()
+  using BRegisters = uint64_t[1];  // mirrors the single desc_b parameter of fma()
+  using CRegisters = float[80];    // mirrors the 80 accumulator references d00..d79 of fma()
+
+  static_assert(tnspA == GMMA::Major::K,  // compile-time rejection of any other tnspA for this register-A variant
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void  // fma(): emits one wgmma.mma_async m64n160k16 instruction; d00..d79 are read and written in place
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,  // NOTE(review): presumably a shared-memory matrix descriptor for B -- confirm with the caller
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to build predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // logging hook defined outside this chunk; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register local to this asm scope
+      "setp.ne.b32 p, %85, 0;\n"  // p = (scale_D != 0); %85 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n160k16.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%79: accumulators d00..d79
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      "{%80,  %81,  %82,  %83},"  // %80..%83: a00..a03
+      " %84,"  // %84: desc_b
+      " p,    %86,  %87,  %88;\n"  // p (NOTE(review): presumably selects accumulate vs. overwrite -- confirm in the PTX ISA), then immediates scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": each accumulator is a read-write 32-bit float register operand
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs start at %80; "r" = 32-bit register operand
+         "l"(desc_b),  // "l" = 64-bit register operand
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // scale_D in a register; template parameters as "n" immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x168x16 F32+=BF16*BF16 -- SS variant: A and B are each passed as one 64-bit value (desc_a, desc_b)
+template <
+  GMMA::Major tnspA,                           // passed to the asm as an immediate ("n") operand
+  GMMA::Major tnspB,                           // passed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // passed to the asm as an immediate ("n") operand
+>
+struct MMA_64x168x16_F32BF16BF16_SS
+{
+  using DRegisters = void;         // NOTE(review): presumably signals that the result is written back into CRegisters -- confirm against the CuTe MMA traits
+  using ARegisters = uint64_t[1];  // mirrors the single desc_a parameter of fma()
+  using BRegisters = uint64_t[1];  // mirrors the single desc_b parameter of fma()
+  using CRegisters = float[84];    // mirrors the 84 accumulator references d00..d83 of fma()
+
+  CUTE_HOST_DEVICE static void  // fma(): emits one wgmma.mma_async m64n168k16 instruction; d00..d83 are read and written in place
+  fma(uint64_t const& desc_a,  // NOTE(review): presumably a shared-memory matrix descriptor for A -- confirm with the caller
+      uint64_t const& desc_b,  // NOTE(review): presumably a shared-memory matrix descriptor for B -- confirm with the caller
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to build predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // logging hook defined outside this chunk; receives the source line and both desc values
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register local to this asm scope
+      "setp.ne.b32 p, %86, 0;\n"  // p = (scale_D != 0); %86 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n168k16.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%83: accumulators d00..d83
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83},"
+      " %84,"  // %84: desc_a
+      " %85,"  // %85: desc_b
+      " p,    %87,  %88,  %89,  %90;\n"  // p (NOTE(review): presumably selects accumulate vs. overwrite -- confirm in the PTX ISA), then immediates scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": each accumulator is a read-write 32-bit float register operand
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
+      :  "l"(desc_a),  // inputs start at %84; "l" = 64-bit register operand
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // scale_D in a register; template parameters as "n" immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x168x16 F32+=BF16*BF16 -- RS variant: A is passed as four 32-bit values (a00..a03), B as one 64-bit desc_b value
+template <
+  GMMA::Major tnspA,                           // must equal GMMA::Major::K (enforced by the static_assert below); not passed to the asm
+  GMMA::Major tnspB,                           // passed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // passed to the asm as an immediate ("n") operand
+>
+struct MMA_64x168x16_F32BF16BF16_RS
+{
+  using DRegisters = void;         // NOTE(review): presumably signals that the result is written back into CRegisters -- confirm against the CuTe MMA traits
+  using ARegisters = uint32_t[4];  // mirrors the four a00..a03 parameters of fma()
+  using BRegisters = uint64_t[1];  // mirrors the single desc_b parameter of fma()
+  using CRegisters = float[84];    // mirrors the 84 accumulator references d00..d83 of fma()
+
+  static_assert(tnspA == GMMA::Major::K,  // compile-time rejection of any other tnspA for this register-A variant
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void  // fma(): emits one wgmma.mma_async m64n168k16 instruction; d00..d83 are read and written in place
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,  // NOTE(review): presumably a shared-memory matrix descriptor for B -- confirm with the caller
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to build predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // logging hook defined outside this chunk; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register local to this asm scope
+      "setp.ne.b32 p, %89, 0;\n"  // p = (scale_D != 0); %89 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n168k16.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%83: accumulators d00..d83
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83},"
+      "{%84,  %85,  %86,  %87},"  // %84..%87: a00..a03
+      " %88,"  // %88: desc_b
+      " p,    %90,  %91,  %92;\n"  // p (NOTE(review): presumably selects accumulate vs. overwrite -- confirm in the PTX ISA), then immediates scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": each accumulator is a read-write 32-bit float register operand
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs start at %84; "r" = 32-bit register operand
+         "l"(desc_b),  // "l" = 64-bit register operand
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // scale_D in a register; template parameters as "n" immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x16 F32+=BF16*BF16 -- SS variant: A and B are each passed as one 64-bit value (desc_a, desc_b)
+template <
+  GMMA::Major tnspA,                           // passed to the asm as an immediate ("n") operand
+  GMMA::Major tnspB,                           // passed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // passed to the asm as an immediate ("n") operand
+>
+struct MMA_64x176x16_F32BF16BF16_SS
+{
+  using DRegisters = void;         // NOTE(review): presumably signals that the result is written back into CRegisters -- confirm against the CuTe MMA traits
+  using ARegisters = uint64_t[1];  // mirrors the single desc_a parameter of fma()
+  using BRegisters = uint64_t[1];  // mirrors the single desc_b parameter of fma()
+  using CRegisters = float[88];    // mirrors the 88 accumulator references d00..d87 of fma()
+
+  CUTE_HOST_DEVICE static void  // fma(): emits one wgmma.mma_async m64n176k16 instruction; d00..d87 are read and written in place
+  fma(uint64_t const& desc_a,  // NOTE(review): presumably a shared-memory matrix descriptor for A -- confirm with the caller
+      uint64_t const& desc_b,  // NOTE(review): presumably a shared-memory matrix descriptor for B -- confirm with the caller
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to build predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // logging hook defined outside this chunk; receives the source line and both desc values
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register local to this asm scope
+      "setp.ne.b32 p, %90, 0;\n"  // p = (scale_D != 0); %90 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n176k16.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%87: accumulators d00..d87
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      " %88,"  // %88: desc_a
+      " %89,"  // %89: desc_b
+      " p,    %91,  %92,  %93,  %94;\n"  // p (NOTE(review): presumably selects accumulate vs. overwrite -- confirm in the PTX ISA), then immediates scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": each accumulator is a read-write 32-bit float register operand
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
+      :  "l"(desc_a),  // inputs start at %88; "l" = 64-bit register operand
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // scale_D in a register; template parameters as "n" immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x16 F32+=BF16*BF16 -- RS variant: A is passed as four 32-bit values (a00..a03), B as one 64-bit desc_b value
+template <
+  GMMA::Major tnspA,                           // must equal GMMA::Major::K (enforced by the static_assert below); not passed to the asm
+  GMMA::Major tnspB,                           // passed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // passed to the asm as an immediate ("n") operand
+>
+struct MMA_64x176x16_F32BF16BF16_RS
+{
+  using DRegisters = void;         // NOTE(review): presumably signals that the result is written back into CRegisters -- confirm against the CuTe MMA traits
+  using ARegisters = uint32_t[4];  // mirrors the four a00..a03 parameters of fma()
+  using BRegisters = uint64_t[1];  // mirrors the single desc_b parameter of fma()
+  using CRegisters = float[88];    // mirrors the 88 accumulator references d00..d87 of fma()
+
+  static_assert(tnspA == GMMA::Major::K,  // compile-time rejection of any other tnspA for this register-A variant
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void  // fma(): emits one wgmma.mma_async m64n176k16 instruction; d00..d87 are read and written in place
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,  // NOTE(review): presumably a shared-memory matrix descriptor for B -- confirm with the caller
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to build predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // logging hook defined outside this chunk; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register local to this asm scope
+      "setp.ne.b32 p, %93, 0;\n"  // p = (scale_D != 0); %93 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n176k16.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%87: accumulators d00..d87
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      "{%88,  %89,  %90,  %91},"  // %88..%91: a00..a03
+      " %92,"  // %92: desc_b
+      " p,    %94,  %95,  %96;\n"  // p (NOTE(review): presumably selects accumulate vs. overwrite -- confirm in the PTX ISA), then immediates scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": each accumulator is a read-write 32-bit float register operand
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs start at %88; "r" = 32-bit register operand
+         "l"(desc_b),  // "l" = 64-bit register operand
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // scale_D in a register; template parameters as "n" immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x184x16 F32+=BF16*BF16 -- SS variant: A and B are each passed as one 64-bit value (desc_a, desc_b)
+template <
+  GMMA::Major tnspA,                           // passed to the asm as an immediate ("n") operand
+  GMMA::Major tnspB,                           // passed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // passed to the asm as an immediate ("n") operand
+>
+struct MMA_64x184x16_F32BF16BF16_SS
+{
+  using DRegisters = void;         // NOTE(review): presumably signals that the result is written back into CRegisters -- confirm against the CuTe MMA traits
+  using ARegisters = uint64_t[1];  // mirrors the single desc_a parameter of fma()
+  using BRegisters = uint64_t[1];  // mirrors the single desc_b parameter of fma()
+  using CRegisters = float[92];    // mirrors the 92 accumulator references d00..d91 of fma()
+
+  CUTE_HOST_DEVICE static void  // fma(): emits one wgmma.mma_async m64n184k16 instruction; d00..d91 are read and written in place
+  fma(uint64_t const& desc_a,  // NOTE(review): presumably a shared-memory matrix descriptor for A -- confirm with the caller
+      uint64_t const& desc_b,  // NOTE(review): presumably a shared-memory matrix descriptor for B -- confirm with the caller
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to build predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // logging hook defined outside this chunk; receives the source line and both desc values
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register local to this asm scope
+      "setp.ne.b32 p, %94, 0;\n"  // p = (scale_D != 0); %94 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n184k16.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%91: accumulators d00..d91
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91},"
+      " %92,"  // %92: desc_a
+      " %93,"  // %93: desc_b
+      " p,    %95,  %96,  %97,  %98;\n"  // p (NOTE(review): presumably selects accumulate vs. overwrite -- confirm in the PTX ISA), then immediates scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": each accumulator is a read-write 32-bit float register operand
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
+      :  "l"(desc_a),  // inputs start at %92; "l" = 64-bit register operand
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // scale_D in a register; template parameters as "n" immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x184x16 F32+=BF16*BF16 -- RS form: A is passed in four 32-bit registers, B as one 64-bit value (desc_b); wraps wgmma.mma_async m64n184k16.
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not forwarded to the asm
+  GMMA::Major tnspB,  // forwarded as the last immediate operand (%100)
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as immediate operand %98
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded as immediate operand %99
+>
+struct MMA_64x184x16_F32BF16BF16_RS
+{
+  using DRegisters = void;  // fma() takes no separate D arguments; the d* accumulators are read-write
+  using ARegisters = uint32_t[4];  // matches the four uint32_t arguments a00..a03 of fma()
+  using BRegisters = uint64_t[1];  // matches the single uint64_t argument desc_b
+  using CRegisters = float[92];  // matches the 92 float accumulators d00..d91
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");  // any other tnspA fails at compile time
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction; d00..d91 are updated in place
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,  // NOTE(review): presumably a shared-memory matrix descriptor for B -- confirm
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p; NOTE(review): per the PTX wgmma docs p selects accumulate-into-D vs. overwrite -- confirm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // logging hook defined elsewhere; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %97, 0;\n"  // %97 is int32_t(scale_D): p is true iff scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n184k16.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%91: accumulators d00..d91
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91},"
+      "{%92,  %93,  %94,  %95},"  // %92..%95: a00..a03
+      " %96,"  // %96: desc_b
+      " p,    %98,  %99,  %100;\n"  // predicate, then immediates scaleA (%98), scaleB (%99), tnspB (%100)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs: "+f" binds each accumulator as a read-write float register
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs start here: "r" binds 32-bit registers
+         "l"(desc_b),  // "l" binds a 64-bit register
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // "n" operands are compile-time immediates taken from the template parameters
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // NOTE(review): macro defined elsewhere; presumably reports the message and aborts -- confirm
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x200x16 F32+=BF16*BF16 -- SS form: A and B are each passed as one 64-bit value (desc_a, desc_b); wraps wgmma.mma_async m64n200k16.
+template <
+  GMMA::Major tnspA,  // forwarded as immediate operand %105
+  GMMA::Major tnspB,  // forwarded as the last immediate operand (%106)
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as immediate operand %103
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded as immediate operand %104
+>
+struct MMA_64x200x16_F32BF16BF16_SS
+{
+  using DRegisters = void;  // fma() takes no separate D arguments; the d* accumulators are read-write
+  using ARegisters = uint64_t[1];  // matches the single uint64_t argument desc_a
+  using BRegisters = uint64_t[1];  // matches the single uint64_t argument desc_b
+  using CRegisters = float[100];  // matches the 100 float accumulators d000..d099
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction; d000..d099 are updated in place
+  fma(uint64_t const& desc_a,  // NOTE(review): presumably a shared-memory matrix descriptor for A -- confirm
+      uint64_t const& desc_b,  // NOTE(review): presumably a shared-memory matrix descriptor for B -- confirm
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p; NOTE(review): per the PTX wgmma docs p selects accumulate-into-D vs. overwrite -- confirm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // logging hook defined elsewhere; receives the source line and both 64-bit operands
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %102, 0;\n"  // %102 is int32_t(scale_D): p is true iff scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n200k16.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%99: accumulators d000..d099
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99},"
+      " %100,"  // %100: desc_a
+      " %101,"  // %101: desc_b
+      " p,    %103, %104, %105, %106;\n"  // predicate, then immediates scaleA (%103), scaleB (%104), tnspA (%105), tnspB (%106)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // outputs: "+f" binds each accumulator as a read-write float register
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
+      :  "l"(desc_a),  // inputs start here: "l" binds a 64-bit register
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // "r" is a 32-bit register; "n" operands are compile-time immediates taken from the template parameters
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // NOTE(review): macro defined elsewhere; presumably reports the message and aborts -- confirm
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x200x16 F32+=BF16*BF16 -- RS form: A is passed in four 32-bit registers, B as one 64-bit value (desc_b); wraps wgmma.mma_async m64n200k16.
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not forwarded to the asm
+  GMMA::Major tnspB,  // forwarded as the last immediate operand (%108)
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as immediate operand %106
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded as immediate operand %107
+>
+struct MMA_64x200x16_F32BF16BF16_RS
+{
+  using DRegisters = void;  // fma() takes no separate D arguments; the d* accumulators are read-write
+  using ARegisters = uint32_t[4];  // matches the four uint32_t arguments a000..a003 of fma()
+  using BRegisters = uint64_t[1];  // matches the single uint64_t argument desc_b
+  using CRegisters = float[100];  // matches the 100 float accumulators d000..d099
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");  // any other tnspA fails at compile time
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction; d000..d099 are updated in place
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,  // NOTE(review): presumably a shared-memory matrix descriptor for B -- confirm
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p; NOTE(review): per the PTX wgmma docs p selects accumulate-into-D vs. overwrite -- confirm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // logging hook defined elsewhere; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %105, 0;\n"  // %105 is int32_t(scale_D): p is true iff scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n200k16.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%99: accumulators d000..d099
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99},"
+      "{%100, %101, %102, %103},"  // %100..%103: a000..a003
+      " %104,"  // %104: desc_b
+      " p,    %106, %107, %108;\n"  // predicate, then immediates scaleA (%106), scaleB (%107), tnspB (%108)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // outputs: "+f" binds each accumulator as a read-write float register
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // inputs start here: "r" binds 32-bit registers
+         "l"(desc_b),  // "l" binds a 64-bit register
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // "n" operands are compile-time immediates taken from the template parameters
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // NOTE(review): macro defined elsewhere; presumably reports the message and aborts -- confirm
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x16 F32+=BF16*BF16 -- SS form: A and B are each passed as one 64-bit value (desc_a, desc_b); wraps wgmma.mma_async m64n208k16.
+template <
+  GMMA::Major tnspA,  // forwarded as immediate operand %109
+  GMMA::Major tnspB,  // forwarded as the last immediate operand (%110)
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as immediate operand %107
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded as immediate operand %108
+>
+struct MMA_64x208x16_F32BF16BF16_SS
+{
+  using DRegisters = void;  // fma() takes no separate D arguments; the d* accumulators are read-write
+  using ARegisters = uint64_t[1];  // matches the single uint64_t argument desc_a
+  using BRegisters = uint64_t[1];  // matches the single uint64_t argument desc_b
+  using CRegisters = float[104];  // matches the 104 float accumulators d000..d103
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction; d000..d103 are updated in place
+  fma(uint64_t const& desc_a,  // NOTE(review): presumably a shared-memory matrix descriptor for A -- confirm
+      uint64_t const& desc_b,  // NOTE(review): presumably a shared-memory matrix descriptor for B -- confirm
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p; NOTE(review): per the PTX wgmma docs p selects accumulate-into-D vs. overwrite -- confirm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // logging hook defined elsewhere; receives the source line and both 64-bit operands
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %106, 0;\n"  // %106 is int32_t(scale_D): p is true iff scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n208k16.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%103: accumulators d000..d103
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      " %104,"  // %104: desc_a
+      " %105,"  // %105: desc_b
+      " p,    %107, %108, %109, %110;\n"  // predicate, then immediates scaleA (%107), scaleB (%108), tnspA (%109), tnspB (%110)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // outputs: "+f" binds each accumulator as a read-write float register
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
+      :  "l"(desc_a),  // inputs start here: "l" binds a 64-bit register
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // "r" is a 32-bit register; "n" operands are compile-time immediates taken from the template parameters
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // NOTE(review): macro defined elsewhere; presumably reports the message and aborts -- confirm
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x16 F32+=BF16*BF16 -- RS form: A is passed in four 32-bit registers, B as one 64-bit value (desc_b); wraps wgmma.mma_async m64n208k16.
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not forwarded to the asm
+  GMMA::Major tnspB,  // forwarded as the last immediate operand (%112)
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as immediate operand %110
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded as immediate operand %111
+>
+struct MMA_64x208x16_F32BF16BF16_RS
+{
+  using DRegisters = void;  // fma() takes no separate D arguments; the d* accumulators are read-write
+  using ARegisters = uint32_t[4];  // matches the four uint32_t arguments a000..a003 of fma()
+  using BRegisters = uint64_t[1];  // matches the single uint64_t argument desc_b
+  using CRegisters = float[104];  // matches the 104 float accumulators d000..d103
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");  // any other tnspA fails at compile time
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction; d000..d103 are updated in place
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,  // NOTE(review): presumably a shared-memory matrix descriptor for B -- confirm
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p; NOTE(review): per the PTX wgmma docs p selects accumulate-into-D vs. overwrite -- confirm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // logging hook defined elsewhere; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %109, 0;\n"  // %109 is int32_t(scale_D): p is true iff scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n208k16.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%103: accumulators d000..d103
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      "{%104, %105, %106, %107},"  // %104..%107: a000..a003
+      " %108,"  // %108: desc_b
+      " p,    %110, %111, %112;\n"  // predicate, then immediates scaleA (%110), scaleB (%111), tnspB (%112)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // outputs: "+f" binds each accumulator as a read-write float register
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // inputs start here: "r" binds 32-bit registers
+         "l"(desc_b),  // "l" binds a 64-bit register
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // "n" operands are compile-time immediates taken from the template parameters
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // NOTE(review): macro defined elsewhere; presumably reports the message and aborts -- confirm
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x216x16 F32+=BF16*BF16
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x216x16_F32BF16BF16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[108];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %110, 0;\n"
+      "wgmma.mma_async.sync.aligned.m64n216k16.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107},"
+      " %108,"
+      " %109,"
+      " p,    %111, %112, %113, %114;\n"
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x216x16 F32+=BF16*BF16, RS form: emits PTX wgmma.mma_async m64n216k16 with A in four 32-bit registers and B via the 64-bit desc_b operand.
+template <
+  GMMA::Major tnspA,  // must equal GMMA::Major::K (static_assert below); not forwarded to the PTX instruction
+  GMMA::Major tnspB,  // forwarded as the last immediate operand (%116)
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as immediate operand %114
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded as immediate operand %115
+>
+struct MMA_64x216x16_F32BF16BF16_RS
+{
+  using DRegisters = void;  // no separate D operand list; the d### references below are updated in place
+  using ARegisters = uint32_t[4];  // matches a000..a003 in fma()
+  using BRegisters = uint64_t[1];  // matches desc_b in fma()
+  using CRegisters = float[108];  // matches d000..d107 in fma()
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");  // compile-time rejection of any other tnspA
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A operand: bound as "r" inputs %108..%111
+      uint64_t const& desc_b,  // B operand: bound as "l" (64-bit) input %112
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d107: 108 floats bound as read-write "+f" operands %0..%107
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register local to this { } PTX scope
+      "setp.ne.b32 p, %113, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n216k16.f32.bf16.bf16 "  // operands follow in order: {d list}, {a list}, desc_b, p, immediates
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107},"
+      "{%108, %109, %110, %111},"  // a000..a003
+      " %112,"  // desc_b
+      " p,    %114, %115, %116;\n"  // p, then scaleA, scaleB, tnspB (p presumably the PTX scale-d slot; confirm against the PTX ISA)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%107: each accumulator is both read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // operands %108..%111
+         "l"(desc_b),  // operand %112
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %113 in a register; %114..%116 are compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x16 F32+=BF16*BF16, SS form: emits PTX wgmma.mma_async m64n224k16 with both A and B passed as 64-bit operands (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,  // forwarded as immediate operand %117
+  GMMA::Major tnspB,  // forwarded as immediate operand %118
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as immediate operand %115
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded as immediate operand %116
+>
+struct MMA_64x224x16_F32BF16BF16_SS
+{
+  using DRegisters = void;  // no separate D operand list; the d### references below are updated in place
+  using ARegisters = uint64_t[1];  // matches desc_a in fma()
+  using BRegisters = uint64_t[1];  // matches desc_b in fma()
+  using CRegisters = float[112];  // matches d000..d111 in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand: bound as "l" (64-bit) input %112
+      uint64_t const& desc_b,  // B operand: bound as "l" (64-bit) input %113
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d111: 112 floats bound as read-write "+f" operands %0..%111
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the source line and both 64-bit operands
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register local to this { } PTX scope
+      "setp.ne.b32 p, %114, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n224k16.f32.bf16.bf16 "  // operands follow in order: {d list}, desc_a, desc_b, p, immediates
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      " %112,"  // desc_a
+      " %113,"  // desc_b
+      " p,    %115, %116, %117, %118;\n"  // p, then scaleA, scaleB, tnspA, tnspB (p presumably the PTX scale-d slot; confirm against the PTX ISA)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%111: each accumulator is both read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
+      :  "l"(desc_a),  // operand %112
+         "l"(desc_b),  // operand %113
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %114 in a register; %115..%118 are compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x16 F32+=BF16*BF16, RS form: emits PTX wgmma.mma_async m64n224k16 with A in four 32-bit registers and B via the 64-bit desc_b operand.
+template <
+  GMMA::Major tnspA,  // must equal GMMA::Major::K (static_assert below); not forwarded to the PTX instruction
+  GMMA::Major tnspB,  // forwarded as the last immediate operand (%120)
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as immediate operand %118
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded as immediate operand %119
+>
+struct MMA_64x224x16_F32BF16BF16_RS
+{
+  using DRegisters = void;  // no separate D operand list; the d### references below are updated in place
+  using ARegisters = uint32_t[4];  // matches a000..a003 in fma()
+  using BRegisters = uint64_t[1];  // matches desc_b in fma()
+  using CRegisters = float[112];  // matches d000..d111 in fma()
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");  // compile-time rejection of any other tnspA
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A operand: bound as "r" inputs %112..%115
+      uint64_t const& desc_b,  // B operand: bound as "l" (64-bit) input %116
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d111: 112 floats bound as read-write "+f" operands %0..%111
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register local to this { } PTX scope
+      "setp.ne.b32 p, %117, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n224k16.f32.bf16.bf16 "  // operands follow in order: {d list}, {a list}, desc_b, p, immediates
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      "{%112, %113, %114, %115},"  // a000..a003
+      " %116,"  // desc_b
+      " p,    %118, %119, %120;\n"  // p, then scaleA, scaleB, tnspB (p presumably the PTX scale-d slot; confirm against the PTX ISA)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%111: each accumulator is both read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // operands %112..%115
+         "l"(desc_b),  // operand %116
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %117 in a register; %118..%120 are compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x232x16 F32+=BF16*BF16, SS form: emits PTX wgmma.mma_async m64n232k16 with both A and B passed as 64-bit operands (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,  // forwarded as immediate operand %121
+  GMMA::Major tnspB,  // forwarded as immediate operand %122
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as immediate operand %119
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded as immediate operand %120
+>
+struct MMA_64x232x16_F32BF16BF16_SS
+{
+  using DRegisters = void;  // no separate D operand list; the d### references below are updated in place
+  using ARegisters = uint64_t[1];  // matches desc_a in fma()
+  using BRegisters = uint64_t[1];  // matches desc_b in fma()
+  using CRegisters = float[116];  // matches d000..d115 in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand: bound as "l" (64-bit) input %116
+      uint64_t const& desc_b,  // B operand: bound as "l" (64-bit) input %117
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d115: 116 floats bound as read-write "+f" operands %0..%115
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the source line and both 64-bit operands
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register local to this { } PTX scope
+      "setp.ne.b32 p, %118, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n232k16.f32.bf16.bf16 "  // operands follow in order: {d list}, desc_a, desc_b, p, immediates
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115},"
+      " %116,"  // desc_a
+      " %117,"  // desc_b
+      " p,    %119, %120, %121, %122;\n"  // p, then scaleA, scaleB, tnspA, tnspB (p presumably the PTX scale-d slot; confirm against the PTX ISA)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%115: each accumulator is both read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
+      :  "l"(desc_a),  // operand %116
+         "l"(desc_b),  // operand %117
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %118 in a register; %119..%122 are compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x232x16 F32+=BF16*BF16
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x232x16_F32BF16BF16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[116];
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %121, 0;\n"
+      "wgmma.mma_async.sync.aligned.m64n232k16.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115},"
+      "{%116, %117, %118, %119},"
+      " %120,"
+      " p,    %122, %123, %124;\n"
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x16 F32+=BF16*BF16, SS variant: A and B are both passed as 64-bit descriptors; wraps one wgmma.mma_async m64n240k16 instruction.
+template <
+  GMMA::Major tnspA,  // forwarded to the instruction as immediate operand %125
+  GMMA::Major tnspB,  // forwarded to the instruction as immediate operand %126
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as immediate operand %123
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded as immediate operand %124
+>
+struct MMA_64x240x16_F32BF16BF16_SS
+{
+  using DRegisters = void;  // no separate D operands: fma() updates the accumulators in place ("+f")
+  using ARegisters = uint64_t[1];  // one 64-bit value (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit value (desc_b)
+  using CRegisters = float[120];  // 120 float accumulators (= 64*240/128; presumably one 128-thread warpgroup -- confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // operand A descriptor, bound to %120
+      uint64_t const& desc_b,  // operand B descriptor, bound to %121
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d119: accumulators, read and written
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to int32 and tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): hook defined elsewhere; presumably debug logging of this call site -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %122, 0;\n"  // p = (scale_D != 0); %122 is the first input after the two descriptors
+      "wgmma.mma_async.sync.aligned.m64n240k16.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%119 map to d000..d119 in declaration order
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      " %120,"  // desc_a
+      " %121,"  // desc_b
+      " p,    %123, %124, %125, %126;\n"  // predicate from scale_D, then scaleA, scaleB, tnspA, tnspB immediates
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // output list: read-write float registers, numbered from %0
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
+      :  "l"(desc_a),  // input list starts here at %120; "l" = 64-bit register
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A path is not compiled in
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x16 F32+=BF16*BF16, RS variant: A comes from four 32-bit registers, B is a 64-bit descriptor; wraps one wgmma.mma_async m64n240k16 instruction.
+template <
+  GMMA::Major tnspA,  // must be GMMA::Major::K (static_assert in the struct); not passed to the instruction
+  GMMA::Major tnspB,  // forwarded to the instruction as immediate operand %128
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as immediate operand %126
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded as immediate operand %127
+>
+struct MMA_64x240x16_F32BF16BF16_RS
+{
+  using DRegisters = void;  // no separate D operands: fma() updates the accumulators in place ("+f")
+  using ARegisters = uint32_t[4];  // four 32-bit values (a000..a003)
+  using BRegisters = uint64_t[1];  // one 64-bit value (desc_b)
+  using CRegisters = float[120];  // 120 float accumulators (= 64*240/128; presumably one 128-thread warpgroup -- confirm)
+
+  static_assert(tnspA == GMMA::Major::K,  // compile-time rejection of any other A layout for this register-sourced variant
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // operand A registers, bound to %120..%123
+      uint64_t const& desc_b,  // operand B descriptor, bound to %124
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d119: accumulators, read and written
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to int32 and tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): hook defined elsewhere; presumably debug logging of this call site -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %125, 0;\n"  // p = (scale_D != 0); %125 follows the four A registers and desc_b
+      "wgmma.mma_async.sync.aligned.m64n240k16.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%119 map to d000..d119 in declaration order
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      "{%120, %121, %122, %123},"  // a000..a003
+      " %124,"  // desc_b
+      " p,    %126, %127, %128;\n"  // predicate from scale_D, then scaleA, scaleB, tnspB immediates
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // output list: read-write float registers, numbered from %0
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // input list starts here at %120; "r" = 32-bit register
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A path is not compiled in
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x248x16 F32+=BF16*BF16, SS variant: A and B are both passed as 64-bit descriptors; wraps one wgmma.mma_async m64n248k16 instruction.
+template <
+  GMMA::Major tnspA,  // forwarded to the instruction as immediate operand %129
+  GMMA::Major tnspB,  // forwarded to the instruction as immediate operand %130
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as immediate operand %127
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded as immediate operand %128
+>
+struct MMA_64x248x16_F32BF16BF16_SS
+{
+  using DRegisters = void;  // no separate D operands: fma() updates the accumulators in place ("+f")
+  using ARegisters = uint64_t[1];  // one 64-bit value (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit value (desc_b)
+  using CRegisters = float[124];  // 124 float accumulators (= 64*248/128; presumably one 128-thread warpgroup -- confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // operand A descriptor, bound to %124
+      uint64_t const& desc_b,  // operand B descriptor, bound to %125
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d123: accumulators, read and written
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to int32 and tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): hook defined elsewhere; presumably debug logging of this call site -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %126, 0;\n"  // p = (scale_D != 0); %126 is the first input after the two descriptors
+      "wgmma.mma_async.sync.aligned.m64n248k16.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%123 map to d000..d123 in declaration order
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123},"
+      " %124,"  // desc_a
+      " %125,"  // desc_b
+      " p,    %127, %128, %129, %130;\n"  // predicate from scale_D, then scaleA, scaleB, tnspA, tnspB immediates
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // output list: read-write float registers, numbered from %0
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
+      :  "l"(desc_a),  // input list starts here at %124; "l" = 64-bit register
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A path is not compiled in
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x248x16 F32+=BF16*BF16, RS variant: A comes from four 32-bit registers, B is a 64-bit descriptor; wraps one wgmma.mma_async m64n248k16 instruction.
+template <
+  GMMA::Major tnspA,  // must be GMMA::Major::K (static_assert in the struct); not passed to the instruction
+  GMMA::Major tnspB,  // forwarded to the instruction as immediate operand %132
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as immediate operand %130
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded as immediate operand %131
+>
+struct MMA_64x248x16_F32BF16BF16_RS
+{
+  using DRegisters = void;  // no separate D operands: fma() updates the accumulators in place ("+f")
+  using ARegisters = uint32_t[4];  // four 32-bit values (a000..a003)
+  using BRegisters = uint64_t[1];  // one 64-bit value (desc_b)
+  using CRegisters = float[124];  // 124 float accumulators (= 64*248/128; presumably one 128-thread warpgroup -- confirm)
+
+  static_assert(tnspA == GMMA::Major::K,  // compile-time rejection of any other A layout for this register-sourced variant
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // operand A registers, bound to %124..%127
+      uint64_t const& desc_b,  // operand B descriptor, bound to %128
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d123: accumulators, read and written
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to int32 and tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): hook defined elsewhere; presumably debug logging of this call site -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %129, 0;\n"  // p = (scale_D != 0); %129 follows the four A registers and desc_b
+      "wgmma.mma_async.sync.aligned.m64n248k16.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%123 map to d000..d123 in declaration order
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123},"
+      "{%124, %125, %126, %127},"  // a000..a003
+      " %128,"  // desc_b
+      " p,    %130, %131, %132;\n"  // predicate from scale_D, then scaleA, scaleB, tnspB immediates
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // output list: read-write float registers, numbered from %0
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // input list starts here at %124; "r" = 32-bit register
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A path is not compiled in
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x8 TN F32+=TF32*TF32, SS variant: A and B are both passed as 64-bit descriptors; wraps one wgmma.mma_async m64n24k8 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as immediate operand %15
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the instruction as immediate operand %16
+>
+struct MMA_64x24x8_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;  // no separate D operands: fma() updates the accumulators in place ("+f")
+  using ARegisters = uint64_t[1];  // one 64-bit value (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit value (desc_b)
+  using CRegisters = float[12];  // 12 float accumulators (= 64*24/128; presumably one 128-thread warpgroup -- confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // operand A descriptor, bound to %12
+      uint64_t const& desc_b,  // operand B descriptor, bound to %13
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d11: accumulators, read and written
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to int32 and tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): hook defined elsewhere; presumably debug logging of this call site -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %14, 0;\n"  // p = (scale_D != 0); %14 is the first input after the two descriptors
+      "wgmma.mma_async.sync.aligned.m64n24k8.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%11 map to d00..d11 in declaration order
+      " %8,  %9,  %10, %11},"
+      " %12,"  // desc_a
+      " %13,"  // desc_b
+      " p,   %15, %16;\n"  // predicate from scale_D, then scaleA, scaleB immediates (no transpose operands in this variant)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // output list: read-write float registers, numbered from %0
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
+      :  "l"(desc_a),  // input list starts here at %12; "l" = 64-bit register
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A path is not compiled in
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x8 TN F32+=TF32*TF32 (RS form: A is passed to fma as four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an "n" (compile-time immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as an "n" (compile-time immediate) operand
+>
+struct MMA_64x24x8_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // four 32-bit values, matching fma's a00..a03
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_b
+  using CRegisters = float[12];    // 12 floats, matching fma's accumulators d00..d11
+
+  CUTE_HOST_DEVICE static void  // issues a single wgmma m64n24k8; d00..d11 are read-write ("+f") operands
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a nonzero value sets the wgmma predicate operand p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %17, 0;\n"  // p = (scale_D != 0); %17 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n24k8.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"  // %0-%11: accumulators d00..d11
+      "{%12, %13, %14, %15},"  // a00..a03
+      " %16,"  // desc_b
+      " p,   %18, %19;\n"  // %18 = scaleA, %19 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // macro not defined: any call is reported as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x40x8 TN F32+=TF32*TF32 (SS form: A and B are each passed to fma as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an "n" (compile-time immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as an "n" (compile-time immediate) operand
+>
+struct MMA_64x40x8_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_b
+  using CRegisters = float[20];    // 20 floats, matching fma's accumulators d00..d19
+
+  CUTE_HOST_DEVICE static void  // issues a single wgmma m64n40k8; d00..d19 are read-write ("+f") operands
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a nonzero value sets the wgmma predicate operand p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %22, 0;\n"  // p = (scale_D != 0); %22 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n40k8.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"  // %0-%19: accumulators d00..d19
+      " %20,"  // desc_a
+      " %21,"  // desc_b
+      " p,   %23, %24;\n"  // %23 = scaleA, %24 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // macro not defined: any call is reported as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x40x8 TN F32+=TF32*TF32 (RS form: A is passed to fma as four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an "n" (compile-time immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as an "n" (compile-time immediate) operand
+>
+struct MMA_64x40x8_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // four 32-bit values, matching fma's a00..a03
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_b
+  using CRegisters = float[20];    // 20 floats, matching fma's accumulators d00..d19
+
+  CUTE_HOST_DEVICE static void  // issues a single wgmma m64n40k8; d00..d19 are read-write ("+f") operands
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a nonzero value sets the wgmma predicate operand p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %25, 0;\n"  // p = (scale_D != 0); %25 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n40k8.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"  // %0-%19: accumulators d00..d19
+      "{%20, %21, %22, %23},"  // a00..a03
+      " %24,"  // desc_b
+      " p,   %26, %27;\n"  // %26 = scaleA, %27 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // macro not defined: any call is reported as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x8 TN F32+=TF32*TF32 (SS form: A and B are each passed to fma as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an "n" (compile-time immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as an "n" (compile-time immediate) operand
+>
+struct MMA_64x48x8_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_b
+  using CRegisters = float[24];    // 24 floats, matching fma's accumulators d00..d23
+
+  CUTE_HOST_DEVICE static void  // issues a single wgmma m64n48k8; d00..d23 are read-write ("+f") operands
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a nonzero value sets the wgmma predicate operand p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %26, 0;\n"  // p = (scale_D != 0); %26 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n48k8.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"  // %0-%23: accumulators d00..d23
+      " %24,"  // desc_a
+      " %25,"  // desc_b
+      " p,   %27, %28;\n"  // %27 = scaleA, %28 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // macro not defined: any call is reported as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x8 TN F32+=TF32*TF32 (RS form: A is passed to fma as four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an "n" (compile-time immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as an "n" (compile-time immediate) operand
+>
+struct MMA_64x48x8_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // four 32-bit values, matching fma's a00..a03
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_b
+  using CRegisters = float[24];    // 24 floats, matching fma's accumulators d00..d23
+
+  CUTE_HOST_DEVICE static void  // issues a single wgmma m64n48k8; d00..d23 are read-write ("+f") operands
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a nonzero value sets the wgmma predicate operand p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %29, 0;\n"  // p = (scale_D != 0); %29 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n48k8.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"  // %0-%23: accumulators d00..d23
+      "{%24, %25, %26, %27},"  // a00..a03
+      " %28,"  // desc_b
+      " p,   %30, %31;\n"  // %30 = scaleA, %31 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // macro not defined: any call is reported as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x56x8 TN F32+=TF32*TF32 (SS form: A and B are each passed to fma as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an "n" (compile-time immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as an "n" (compile-time immediate) operand
+>
+struct MMA_64x56x8_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_b
+  using CRegisters = float[28];    // 28 floats, matching fma's accumulators d00..d27
+
+  CUTE_HOST_DEVICE static void  // issues a single wgmma m64n56k8; d00..d27 are read-write ("+f") operands
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a nonzero value sets the wgmma predicate operand p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %30, 0;\n"  // p = (scale_D != 0); %30 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n56k8.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"  // %0-%27: accumulators d00..d27
+      " %28,"  // desc_a
+      " %29,"  // desc_b
+      " p,   %31, %32;\n"  // %31 = scaleA, %32 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // macro not defined: any call is reported as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x56x8 TN F32+=TF32*TF32 (RS form: A is passed to fma as four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an "n" (compile-time immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as an "n" (compile-time immediate) operand
+>
+struct MMA_64x56x8_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // four 32-bit values, matching fma's a00..a03
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_b
+  using CRegisters = float[28];    // 28 floats, matching fma's accumulators d00..d27
+
+  CUTE_HOST_DEVICE static void  // issues a single wgmma m64n56k8; d00..d27 are read-write ("+f") operands
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a nonzero value sets the wgmma predicate operand p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %33, 0;\n"  // p = (scale_D != 0); %33 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n56k8.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"  // %0-%27: accumulators d00..d27
+      "{%28, %29, %30, %31},"  // a00..a03
+      " %32,"  // desc_b
+      " p,   %34, %35;\n"  // %34 = scaleA, %35 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // macro not defined: any call is reported as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x72x8 TN F32+=TF32*TF32 (SS form: A and B are each passed to fma as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an "n" (compile-time immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as an "n" (compile-time immediate) operand
+>
+struct MMA_64x72x8_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_b
+  using CRegisters = float[36];    // 36 floats, matching fma's accumulators d00..d35
+
+  CUTE_HOST_DEVICE static void  // issues a single wgmma m64n72k8; d00..d35 are read-write ("+f") operands
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a nonzero value sets the wgmma predicate operand p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %38, 0;\n"  // p = (scale_D != 0); %38 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n72k8.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"  // %0-%35: accumulators d00..d35
+      " %36,"  // desc_a
+      " %37,"  // desc_b
+      " p,   %39, %40;\n"  // %39 = scaleA, %40 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // macro not defined: any call is reported as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x72x8 TN F32+=TF32*TF32 (RS form: A is passed to fma as four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an "n" (compile-time immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as an "n" (compile-time immediate) operand
+>
+struct MMA_64x72x8_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // four 32-bit values, matching fma's a00..a03
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_b
+  using CRegisters = float[36];    // 36 floats, matching fma's accumulators d00..d35
+
+  CUTE_HOST_DEVICE static void  // issues a single wgmma m64n72k8; d00..d35 are read-write ("+f") operands
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a nonzero value sets the wgmma predicate operand p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %41, 0;\n"  // p = (scale_D != 0); %41 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n72k8.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"  // %0-%35: accumulators d00..d35
+      "{%36, %37, %38, %39},"  // a00..a03
+      " %40,"  // desc_b
+      " p,   %42, %43;\n"  // %42 = scaleA, %43 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // macro not defined: any call is reported as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x8 TN F32+=TF32*TF32 (SS form: A and B are each passed to fma as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an "n" (compile-time immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as an "n" (compile-time immediate) operand
+>
+struct MMA_64x80x8_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_b
+  using CRegisters = float[40];    // 40 floats, matching fma's accumulators d00..d39
+
+  CUTE_HOST_DEVICE static void  // issues a single wgmma m64n80k8; d00..d39 are read-write ("+f") operands
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a nonzero value sets the wgmma predicate operand p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %42, 0;\n"  // p = (scale_D != 0); %42 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n80k8.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"  // %0-%39: accumulators d00..d39
+      " %40,"  // desc_a
+      " %41,"  // desc_b
+      " p,   %43, %44;\n"  // %43 = scaleA, %44 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // macro not defined: any call is reported as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x8 TN F32+=TF32*TF32 (RS form: A is passed to fma as four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an "n" (compile-time immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as an "n" (compile-time immediate) operand
+>
+struct MMA_64x80x8_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // four 32-bit values, matching fma's a00..a03
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_b
+  using CRegisters = float[40];    // 40 floats, matching fma's accumulators d00..d39
+
+  CUTE_HOST_DEVICE static void  // issues a single wgmma m64n80k8; d00..d39 are read-write ("+f") operands
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a nonzero value sets the wgmma predicate operand p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %45, 0;\n"  // p = (scale_D != 0); %45 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n80k8.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"  // %0-%39: accumulators d00..d39
+      "{%40, %41, %42, %43},"  // a00..a03
+      " %44,"  // desc_b
+      " p,   %46, %47;\n"  // %46 = scaleA, %47 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // macro not defined: any call is reported as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x88x8 TN F32+=TF32*TF32 (SS form: A and B are each passed to fma as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an "n" (compile-time immediate) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as an "n" (compile-time immediate) operand
+>
+struct MMA_64x88x8_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma's desc_b
+  using CRegisters = float[44];    // 44 floats, matching fma's accumulators d00..d43
+
+  CUTE_HOST_DEVICE static void  // issues a single wgmma m64n88k8; d00..d43 are read-write ("+f") operands
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a nonzero value sets the wgmma predicate operand p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %46, 0;\n"  // p = (scale_D != 0); %46 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n88k8.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"  // %0-%43: accumulators d00..d43
+      " %44,"  // desc_a
+      " %45,"  // desc_b
+      " p,   %47, %48;\n"  // %47 = scaleA, %48 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // macro not defined: any call is reported as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x88x8 TN F32+=TF32*TF32, RS form: A is passed as four 32-bit registers, B as one 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x88x8_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // corresponds to fma's a00..a03
+  using BRegisters = uint64_t[1];  // corresponds to fma's desc_b
+  using CRegisters = float[44];  // corresponds to fma's d00..d43, which the asm reads and writes ("+f")
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %49, 0;\n"  // %49 is scale_D; p is set when scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n88k8.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%43 are the accumulators d00..d43
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"
+      "{%44, %45, %46, %47},"  // a00..a03
+      " %48,"  // desc_b
+      " p,   %50, %51;\n"  // predicate derived from scale_D, then immediates scaleA (%50) and scaleB (%51)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operand order fixes the %N numbering above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x104x8 TN F32+=TF32*TF32, SS form: A and B are each passed as one 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x104x8_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // corresponds to fma's desc_a
+  using BRegisters = uint64_t[1];  // corresponds to fma's desc_b
+  using CRegisters = float[52];  // corresponds to fma's d00..d51, which the asm reads and writes ("+f")
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %54, 0;\n"  // %54 is scale_D; p is set when scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n104k8.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%51 are the accumulators d00..d51
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"
+      " %52,"  // desc_a
+      " %53,"  // desc_b
+      " p,    %55,  %56;\n"  // predicate derived from scale_D, then immediates scaleA (%55) and scaleB (%56)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operand order fixes the %N numbering above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x104x8 TN F32+=TF32*TF32, RS form: A is passed as four 32-bit registers, B as one 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x104x8_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // corresponds to fma's a00..a03
+  using BRegisters = uint64_t[1];  // corresponds to fma's desc_b
+  using CRegisters = float[52];  // corresponds to fma's d00..d51, which the asm reads and writes ("+f")
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %57, 0;\n"  // %57 is scale_D; p is set when scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n104k8.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%51 are the accumulators d00..d51
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"
+      "{%52,  %53,  %54,  %55},"  // a00..a03
+      " %56,"  // desc_b
+      " p,    %58,  %59;\n"  // predicate derived from scale_D, then immediates scaleA (%58) and scaleB (%59)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operand order fixes the %N numbering above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x8 TN F32+=TF32*TF32, SS form: A and B are each passed as one 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x112x8_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // corresponds to fma's desc_a
+  using BRegisters = uint64_t[1];  // corresponds to fma's desc_b
+  using CRegisters = float[56];  // corresponds to fma's d00..d55, which the asm reads and writes ("+f")
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %58, 0;\n"  // %58 is scale_D; p is set when scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n112k8.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%55 are the accumulators d00..d55
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      " %56,"  // desc_a
+      " %57,"  // desc_b
+      " p,    %59,  %60;\n"  // predicate derived from scale_D, then immediates scaleA (%59) and scaleB (%60)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operand order fixes the %N numbering above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x8 TN F32+=TF32*TF32, RS form: A is passed as four 32-bit registers, B as one 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x112x8_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // corresponds to fma's a00..a03
+  using BRegisters = uint64_t[1];  // corresponds to fma's desc_b
+  using CRegisters = float[56];  // corresponds to fma's d00..d55, which the asm reads and writes ("+f")
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %61, 0;\n"  // %61 is scale_D; p is set when scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n112k8.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%55 are the accumulators d00..d55
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      "{%56,  %57,  %58,  %59},"  // a00..a03
+      " %60,"  // desc_b
+      " p,    %62,  %63;\n"  // predicate derived from scale_D, then immediates scaleA (%62) and scaleB (%63)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operand order fixes the %N numbering above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x120x8 TN F32+=TF32*TF32, SS form: A and B are each passed as one 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x120x8_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // corresponds to fma's desc_a
+  using BRegisters = uint64_t[1];  // corresponds to fma's desc_b
+  using CRegisters = float[60];  // corresponds to fma's d00..d59, which the asm reads and writes ("+f")
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %62, 0;\n"  // %62 is scale_D; p is set when scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n120k8.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%59 are the accumulators d00..d59
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"
+      " %60,"  // desc_a
+      " %61,"  // desc_b
+      " p,    %63,  %64;\n"  // predicate derived from scale_D, then immediates scaleA (%63) and scaleB (%64)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operand order fixes the %N numbering above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x120x8 TN F32+=TF32*TF32, RS form: A is passed as four 32-bit registers, B as one 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x120x8_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // corresponds to fma's a00..a03
+  using BRegisters = uint64_t[1];  // corresponds to fma's desc_b
+  using CRegisters = float[60];  // corresponds to fma's d00..d59, which the asm reads and writes ("+f")
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %65, 0;\n"  // %65 is scale_D; p is set when scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n120k8.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%59 are the accumulators d00..d59
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"
+      "{%60,  %61,  %62,  %63},"  // a00..a03
+      " %64,"  // desc_b
+      " p,    %66,  %67;\n"  // predicate derived from scale_D, then immediates scaleA (%66) and scaleB (%67)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operand order fixes the %N numbering above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x136x8 TN F32+=TF32*TF32, SS form: A and B are each passed as one 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x136x8_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // corresponds to fma's desc_a
+  using BRegisters = uint64_t[1];  // corresponds to fma's desc_b
+  using CRegisters = float[68];  // corresponds to fma's d00..d67, which the asm reads and writes ("+f")
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %70, 0;\n"  // %70 is scale_D; p is set when scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n136k8.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%67 are the accumulators d00..d67
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67},"
+      " %68,"  // desc_a
+      " %69,"  // desc_b
+      " p,    %71,  %72;\n"  // predicate derived from scale_D, then immediates scaleA (%71) and scaleB (%72)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operand order fixes the %N numbering above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x136x8 TN F32+=TF32*TF32, RS form: A is passed as four 32-bit registers, B as one 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x136x8_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // corresponds to fma's a00..a03
+  using BRegisters = uint64_t[1];  // corresponds to fma's desc_b
+  using CRegisters = float[68];  // corresponds to fma's d00..d67, which the asm reads and writes ("+f")
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %73, 0;\n"  // %73 is scale_D; p is set when scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n136k8.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%67 are the accumulators d00..d67
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67},"
+      "{%68,  %69,  %70,  %71},"  // a00..a03
+      " %72,"  // desc_b
+      " p,    %74,  %75;\n"  // predicate derived from scale_D, then immediates scaleA (%74) and scaleB (%75)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operand order fixes the %N numbering above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x8 TN F32+=TF32*TF32 -- SS form: A and B are each passed as one 64-bit descriptor; 72 float accumulators per call.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as an immediate ("n") operand
+>
+struct MMA_64x144x8_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;         // no separate D operands: fma() updates the accumulators in place ("+f")
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit value (desc_a)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit value (desc_b)
+  using CRegisters = float[72];    // accumulators d00..d71
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives this line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %74, 0;\n"  // p = (%74 != 0), where %74 is scale_D
+      "wgmma.mma_async.sync.aligned.m64n144k8.f32.tf32.tf32 "  // operand list follows
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      " %72,"  // desc_a
+      " %73,"  // desc_b
+      " p,    %75,  %76;\n"  // predicate p, then immediates scaleA (%75) and scaleB (%76)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0..%71: read-write float accumulators
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
+      :  "l"(desc_a),  // %72: 64-bit input
+         "l"(desc_b),  // %73: 64-bit input
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %74 (register), %75 and %76 (compile-time constants from the template arguments)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x8 TN F32+=TF32*TF32 -- RS form: A is passed as four 32-bit registers, B as one 64-bit descriptor; 72 float accumulators per call.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as an immediate ("n") operand
+>
+struct MMA_64x144x8_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;         // no separate D operands: fma() updates the accumulators in place ("+f")
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit values (a00..a03)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit value (desc_b)
+  using CRegisters = float[72];    // accumulators d00..d71
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives this line number and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %77, 0;\n"  // p = (%77 != 0), where %77 is scale_D
+      "wgmma.mma_async.sync.aligned.m64n144k8.f32.tf32.tf32 "  // operand list follows
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      "{%72,  %73,  %74,  %75},"  // a00..a03
+      " %76,"  // desc_b
+      " p,    %78,  %79;\n"  // predicate p, then immediates scaleA (%78) and scaleB (%79)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0..%71: read-write float accumulators
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %72..%75: 32-bit register inputs
+         "l"(desc_b),  // %76: 64-bit input
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %77 (register), %78 and %79 (compile-time constants from the template arguments)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x152x8 TN F32+=TF32*TF32 -- SS form: A and B are each passed as one 64-bit descriptor; 76 float accumulators per call.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as an immediate ("n") operand
+>
+struct MMA_64x152x8_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;         // no separate D operands: fma() updates the accumulators in place ("+f")
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit value (desc_a)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit value (desc_b)
+  using CRegisters = float[76];    // accumulators d00..d75
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives this line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %78, 0;\n"  // p = (%78 != 0), where %78 is scale_D
+      "wgmma.mma_async.sync.aligned.m64n152k8.f32.tf32.tf32 "  // operand list follows
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75},"
+      " %76,"  // desc_a
+      " %77,"  // desc_b
+      " p,    %79,  %80;\n"  // predicate p, then immediates scaleA (%79) and scaleB (%80)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0..%75: read-write float accumulators
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
+      :  "l"(desc_a),  // %76: 64-bit input
+         "l"(desc_b),  // %77: 64-bit input
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %78 (register), %79 and %80 (compile-time constants from the template arguments)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x152x8 TN F32+=TF32*TF32 -- RS form: A is passed as four 32-bit registers, B as one 64-bit descriptor; 76 float accumulators per call.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as an immediate ("n") operand
+>
+struct MMA_64x152x8_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;         // no separate D operands: fma() updates the accumulators in place ("+f")
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit values (a00..a03)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit value (desc_b)
+  using CRegisters = float[76];    // accumulators d00..d75
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives this line number and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %81, 0;\n"  // p = (%81 != 0), where %81 is scale_D
+      "wgmma.mma_async.sync.aligned.m64n152k8.f32.tf32.tf32 "  // operand list follows
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75},"
+      "{%76,  %77,  %78,  %79},"  // a00..a03
+      " %80,"  // desc_b
+      " p,    %82,  %83;\n"  // predicate p, then immediates scaleA (%82) and scaleB (%83)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0..%75: read-write float accumulators
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %76..%79: 32-bit register inputs
+         "l"(desc_b),  // %80: 64-bit input
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %81 (register), %82 and %83 (compile-time constants from the template arguments)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x8 TN F32+=TF32*TF32 -- SS form: A and B are each passed as one 64-bit descriptor; 80 float accumulators per call.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as an immediate ("n") operand
+>
+struct MMA_64x160x8_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;         // no separate D operands: fma() updates the accumulators in place ("+f")
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit value (desc_a)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit value (desc_b)
+  using CRegisters = float[80];    // accumulators d00..d79
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives this line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %82, 0;\n"  // p = (%82 != 0), where %82 is scale_D
+      "wgmma.mma_async.sync.aligned.m64n160k8.f32.tf32.tf32 "  // operand list follows
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      " %80,"  // desc_a
+      " %81,"  // desc_b
+      " p,    %83,  %84;\n"  // predicate p, then immediates scaleA (%83) and scaleB (%84)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0..%79: read-write float accumulators
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
+      :  "l"(desc_a),  // %80: 64-bit input
+         "l"(desc_b),  // %81: 64-bit input
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %82 (register), %83 and %84 (compile-time constants from the template arguments)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x8 TN F32+=TF32*TF32 -- RS form: A is passed as four 32-bit registers, B as one 64-bit descriptor; 80 float accumulators per call.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as an immediate ("n") operand
+>
+struct MMA_64x160x8_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;         // no separate D operands: fma() updates the accumulators in place ("+f")
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit values (a00..a03)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit value (desc_b)
+  using CRegisters = float[80];    // accumulators d00..d79
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives this line number and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %85, 0;\n"  // p = (%85 != 0), where %85 is scale_D
+      "wgmma.mma_async.sync.aligned.m64n160k8.f32.tf32.tf32 "  // operand list follows
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      "{%80,  %81,  %82,  %83},"  // a00..a03
+      " %84,"  // desc_b
+      " p,    %86,  %87;\n"  // predicate p, then immediates scaleA (%86) and scaleB (%87)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0..%79: read-write float accumulators
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %80..%83: 32-bit register inputs
+         "l"(desc_b),  // %84: 64-bit input
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %85 (register), %86 and %87 (compile-time constants from the template arguments)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x168x8 TN F32+=TF32*TF32 -- SS form: A and B are each passed as one 64-bit descriptor; 84 float accumulators per call.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as an immediate ("n") operand
+>
+struct MMA_64x168x8_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;         // no separate D operands: fma() updates the accumulators in place ("+f")
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit value (desc_a)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit value (desc_b)
+  using CRegisters = float[84];    // accumulators d00..d83
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives this line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %86, 0;\n"  // p = (%86 != 0), where %86 is scale_D
+      "wgmma.mma_async.sync.aligned.m64n168k8.f32.tf32.tf32 "  // operand list follows
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83},"
+      " %84,"  // desc_a
+      " %85,"  // desc_b
+      " p,    %87,  %88;\n"  // predicate p, then immediates scaleA (%87) and scaleB (%88)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0..%83: read-write float accumulators
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
+      :  "l"(desc_a),  // %84: 64-bit input
+         "l"(desc_b),  // %85: 64-bit input
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %86 (register), %87 and %88 (compile-time constants from the template arguments)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x168x8 TN F32+=TF32*TF32 -- RS form: A is passed as four 32-bit registers, B as one 64-bit descriptor; wraps one wgmma.mma_async m64n168k8 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // scaleA / scaleB reach the asm as immediate (n) operands
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x168x8_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;  // no separate D list: the d* arguments of fma are read-write (+f) operands
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[84];  // one float per accumulator argument d00..d83
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand: four 32-bit register inputs (r)
+      uint64_t const& desc_b,  // B operand: one 64-bit input (l)
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared with 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook, given this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %89, 0;\n"  // p = (scale_D != 0); %89 is int32_t(scale_D). NOTE(review): presumably selects accumulate vs. overwrite of D -- confirm in the PTX wgmma docs
+      "wgmma.mma_async.sync.aligned.m64n168k8.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%83: accumulators d00..d83
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83},"
+      "{%84,  %85,  %86,  %87},"  // a00..a03
+      " %88,"  // desc_b
+      " p,    %90,  %91;\n"  // predicate p, then scaleA (%90) and scaleB (%91)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // without CUTE_ARCH_MMA_SM90A_ENABLED no asm is emitted; the macro below receives the error text
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x8 TN F32+=TF32*TF32 -- SS form: A and B are each passed as one 64-bit descriptor; wraps one wgmma.mma_async m64n176k8 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // scaleA / scaleB reach the asm as immediate (n) operands
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x176x8_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;  // no separate D list: the d* arguments of fma are read-write (+f) operands
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[88];  // one float per accumulator argument d00..d87
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand: one 64-bit input (l)
+      uint64_t const& desc_b,  // B operand: one 64-bit input (l)
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared with 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, given this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %90, 0;\n"  // p = (scale_D != 0); %90 is int32_t(scale_D). NOTE(review): presumably selects accumulate vs. overwrite of D -- confirm in the PTX wgmma docs
+      "wgmma.mma_async.sync.aligned.m64n176k8.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%87: accumulators d00..d87
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      " %88,"  // desc_a
+      " %89,"  // desc_b
+      " p,    %91,  %92;\n"  // predicate p, then scaleA (%91) and scaleB (%92)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // without CUTE_ARCH_MMA_SM90A_ENABLED no asm is emitted; the macro below receives the error text
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x8 TN F32+=TF32*TF32 -- RS form: A is passed as four 32-bit registers, B as one 64-bit descriptor; wraps one wgmma.mma_async m64n176k8 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // scaleA / scaleB reach the asm as immediate (n) operands
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x176x8_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;  // no separate D list: the d* arguments of fma are read-write (+f) operands
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[88];  // one float per accumulator argument d00..d87
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand: four 32-bit register inputs (r)
+      uint64_t const& desc_b,  // B operand: one 64-bit input (l)
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared with 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook, given this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %93, 0;\n"  // p = (scale_D != 0); %93 is int32_t(scale_D). NOTE(review): presumably selects accumulate vs. overwrite of D -- confirm in the PTX wgmma docs
+      "wgmma.mma_async.sync.aligned.m64n176k8.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%87: accumulators d00..d87
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      "{%88,  %89,  %90,  %91},"  // a00..a03
+      " %92,"  // desc_b
+      " p,    %94,  %95;\n"  // predicate p, then scaleA (%94) and scaleB (%95)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // without CUTE_ARCH_MMA_SM90A_ENABLED no asm is emitted; the macro below receives the error text
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x184x8 TN F32+=TF32*TF32 -- SS form: A and B are each passed as one 64-bit descriptor; wraps one wgmma.mma_async m64n184k8 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // scaleA / scaleB reach the asm as immediate (n) operands
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x184x8_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;  // no separate D list: the d* arguments of fma are read-write (+f) operands
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[92];  // one float per accumulator argument d00..d91
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand: one 64-bit input (l)
+      uint64_t const& desc_b,  // B operand: one 64-bit input (l)
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared with 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, given this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %94, 0;\n"  // p = (scale_D != 0); %94 is int32_t(scale_D). NOTE(review): presumably selects accumulate vs. overwrite of D -- confirm in the PTX wgmma docs
+      "wgmma.mma_async.sync.aligned.m64n184k8.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%91: accumulators d00..d91
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91},"
+      " %92,"  // desc_a
+      " %93,"  // desc_b
+      " p,    %95,  %96;\n"  // predicate p, then scaleA (%95) and scaleB (%96)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // without CUTE_ARCH_MMA_SM90A_ENABLED no asm is emitted; the macro below receives the error text
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x184x8 TN F32+=TF32*TF32 -- RS form: A is passed as four 32-bit registers, B as one 64-bit descriptor; wraps one wgmma.mma_async m64n184k8 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // scaleA / scaleB reach the asm as immediate (n) operands
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x184x8_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;  // no separate D list: the d* arguments of fma are read-write (+f) operands
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[92];  // one float per accumulator argument d00..d91
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand: four 32-bit register inputs (r)
+      uint64_t const& desc_b,  // B operand: one 64-bit input (l)
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared with 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook, given this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %97, 0;\n"  // p = (scale_D != 0); %97 is int32_t(scale_D). NOTE(review): presumably selects accumulate vs. overwrite of D -- confirm in the PTX wgmma docs
+      "wgmma.mma_async.sync.aligned.m64n184k8.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%91: accumulators d00..d91
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91},"
+      "{%92,  %93,  %94,  %95},"  // a00..a03
+      " %96,"  // desc_b
+      " p,    %98,  %99;\n"  // predicate p, then scaleA (%98) and scaleB (%99)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // without CUTE_ARCH_MMA_SM90A_ENABLED no asm is emitted; the macro below receives the error text
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x200x8 TN F32+=TF32*TF32 -- SS form: A and B are each passed as one 64-bit descriptor; wraps one wgmma.mma_async m64n200k8 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // scaleA / scaleB reach the asm as immediate (n) operands
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x200x8_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;  // no separate D list: the d* arguments of fma are read-write (+f) operands
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[100];  // one float per accumulator argument d000..d099
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand: one 64-bit input (l)
+      uint64_t const& desc_b,  // B operand: one 64-bit input (l)
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared with 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, given this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %102, 0;\n"  // p = (scale_D != 0); %102 is int32_t(scale_D). NOTE(review): presumably selects accumulate vs. overwrite of D -- confirm in the PTX wgmma docs
+      "wgmma.mma_async.sync.aligned.m64n200k8.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%99: accumulators d000..d099
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99},"
+      " %100,"  // desc_a
+      " %101,"  // desc_b
+      " p,    %103, %104;\n"  // predicate p, then scaleA (%103) and scaleB (%104)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // without CUTE_ARCH_MMA_SM90A_ENABLED no asm is emitted; the macro below receives the error text
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x200x8 TN F32+=TF32*TF32 -- RS form: A is passed as four 32-bit register operands, B as one 64-bit descriptor operand
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x200x8_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // mirrors the four a000..a003 arguments of fma()
+  using BRegisters = uint64_t[1];  // mirrors the single desc_b argument of fma()
+  using CRegisters = float[100];  // mirrors the 100 accumulators d000..d099 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): helper defined elsewhere; presumably a sync-log/debug hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %105, 0;\n"  // %105 is int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n200k8.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99},"
+      "{%100, %101, %102, %103},"  // a000..a003
+      " %104,"  // desc_b
+      " p,    %106, %107;\n"  // %106 is scaleA, %107 is scaleB (immediate operands)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // outputs %0..%99: accumulators, both read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // inputs %100..%103
+         "l"(desc_b),  // input %104
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %105, %106, %107
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x8 TN F32+=TF32*TF32 -- SS form: A and B are each passed as one 64-bit descriptor operand
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x208x8_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // mirrors the single desc_a argument of fma()
+  using BRegisters = uint64_t[1];  // mirrors the single desc_b argument of fma()
+  using CRegisters = float[104];  // mirrors the 104 accumulators d000..d103 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): helper defined elsewhere; presumably a sync-log/debug hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %106, 0;\n"  // %106 is int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n208k8.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      " %104,"  // desc_a
+      " %105,"  // desc_b
+      " p,    %107, %108;\n"  // %107 is scaleA, %108 is scaleB (immediate operands)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // outputs %0..%103: accumulators, both read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
+      :  "l"(desc_a),  // input %104
+         "l"(desc_b),  // input %105
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %106, %107, %108
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x8 TN F32+=TF32*TF32 -- RS form: A is passed as four 32-bit register operands, B as one 64-bit descriptor operand
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x208x8_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // mirrors the four a000..a003 arguments of fma()
+  using BRegisters = uint64_t[1];  // mirrors the single desc_b argument of fma()
+  using CRegisters = float[104];  // mirrors the 104 accumulators d000..d103 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): helper defined elsewhere; presumably a sync-log/debug hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %109, 0;\n"  // %109 is int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n208k8.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      "{%104, %105, %106, %107},"  // a000..a003
+      " %108,"  // desc_b
+      " p,    %110, %111;\n"  // %110 is scaleA, %111 is scaleB (immediate operands)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // outputs %0..%103: accumulators, both read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // inputs %104..%107
+         "l"(desc_b),  // input %108
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %109, %110, %111
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x216x8 TN F32+=TF32*TF32 -- SS form: A and B are each passed as one 64-bit descriptor operand
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x216x8_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // mirrors the single desc_a argument of fma()
+  using BRegisters = uint64_t[1];  // mirrors the single desc_b argument of fma()
+  using CRegisters = float[108];  // mirrors the 108 accumulators d000..d107 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): helper defined elsewhere; presumably a sync-log/debug hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %110, 0;\n"  // %110 is int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n216k8.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107},"
+      " %108,"  // desc_a
+      " %109,"  // desc_b
+      " p,    %111, %112;\n"  // %111 is scaleA, %112 is scaleB (immediate operands)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // outputs %0..%107: accumulators, both read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
+      :  "l"(desc_a),  // input %108
+         "l"(desc_b),  // input %109
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %110, %111, %112
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x216x8 TN F32+=TF32*TF32 -- RS form: A is passed as four 32-bit register operands, B as one 64-bit descriptor operand
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x216x8_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // mirrors the four a000..a003 arguments of fma()
+  using BRegisters = uint64_t[1];  // mirrors the single desc_b argument of fma()
+  using CRegisters = float[108];  // mirrors the 108 accumulators d000..d107 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): helper defined elsewhere; presumably a sync-log/debug hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %113, 0;\n"  // %113 is int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n216k8.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107},"
+      "{%108, %109, %110, %111},"  // a000..a003
+      " %112,"  // desc_b
+      " p,    %114, %115;\n"  // %114 is scaleA, %115 is scaleB (immediate operands)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // outputs %0..%107: accumulators, both read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // inputs %108..%111
+         "l"(desc_b),  // input %112
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %113, %114, %115
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x8 TN F32+=TF32*TF32 (SS variant: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as an immediate ("n" operand)
+>
+struct MMA_64x224x8_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor (desc_a)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = float[112];   // 112 float accumulators (d000..d111), read and written by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %114, 0;\n"  // p = (int32_t(scale_D) != 0); %114 is the scale_D input operand
+      "wgmma.mma_async.sync.aligned.m64n224k8.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      " %112,"  // desc_a
+      " %113,"  // desc_b
+      " p,    %115, %116;\n"  // predicate p, then immediates scaleA (%115) and scaleB (%116)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%111: accumulators, read-write ("+f")
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
+      :  "l"(desc_a),  // %112
+         "l"(desc_b),  // %113
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %114, %115, %116
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted for this path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x8 TN F32+=TF32*TF32 (RS variant: A in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as an immediate ("n" operand)
+>
+struct MMA_64x224x8_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a000..a003)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = float[112];   // 112 float accumulators (d000..d111), read and written by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %117, 0;\n"  // p = (int32_t(scale_D) != 0); %117 is the scale_D input operand
+      "wgmma.mma_async.sync.aligned.m64n224k8.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      "{%112, %113, %114, %115},"  // a000..a003
+      " %116,"  // desc_b
+      " p,    %118, %119;\n"  // predicate p, then immediates scaleA (%118) and scaleB (%119)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%111: accumulators, read-write ("+f")
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %112..%115
+         "l"(desc_b),  // %116
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %117, %118, %119
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted for this path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x232x8 TN F32+=TF32*TF32 (SS variant: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as an immediate ("n" operand)
+>
+struct MMA_64x232x8_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor (desc_a)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = float[116];   // 116 float accumulators (d000..d115), read and written by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %118, 0;\n"  // p = (int32_t(scale_D) != 0); %118 is the scale_D input operand
+      "wgmma.mma_async.sync.aligned.m64n232k8.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115},"
+      " %116,"  // desc_a
+      " %117,"  // desc_b
+      " p,    %119, %120;\n"  // predicate p, then immediates scaleA (%119) and scaleB (%120)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%115: accumulators, read-write ("+f")
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
+      :  "l"(desc_a),  // %116
+         "l"(desc_b),  // %117
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %118, %119, %120
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted for this path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x232x8 TN F32+=TF32*TF32 (RS variant: A in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as an immediate ("n" operand)
+>
+struct MMA_64x232x8_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a000..a003)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = float[116];   // 116 float accumulators (d000..d115), read and written by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %121, 0;\n"  // p = (int32_t(scale_D) != 0); %121 is the scale_D input operand
+      "wgmma.mma_async.sync.aligned.m64n232k8.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115},"
+      "{%116, %117, %118, %119},"  // a000..a003
+      " %120,"  // desc_b
+      " p,    %122, %123;\n"  // predicate p, then immediates scaleA (%122) and scaleB (%123)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%115: accumulators, read-write ("+f")
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %116..%119
+         "l"(desc_b),  // %120
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %121, %122, %123
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted for this path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x8 TN F32+=TF32*TF32 (SS variant: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as an immediate ("n" operand)
+>
+struct MMA_64x240x8_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor (desc_a)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = float[120];   // 120 float accumulators (d000..d119), read and written by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %122, 0;\n"  // p = (int32_t(scale_D) != 0); %122 is the scale_D input operand
+      "wgmma.mma_async.sync.aligned.m64n240k8.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      " %120,"  // desc_a
+      " %121,"  // desc_b
+      " p,    %123, %124;\n"  // predicate p, then immediates scaleA (%123) and scaleB (%124)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%119: accumulators, read-write ("+f")
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
+      :  "l"(desc_a),  // %120
+         "l"(desc_b),  // %121
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %122, %123, %124
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted for this path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x8 TN F32+=TF32*TF32 (RS form: A passed in registers, B passed as a 64-bit descriptor); wraps wgmma.mma_async.sync.aligned.m64n240k8.f32.tf32.tf32
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an immediate ("n") operand
+>
+struct MMA_64x240x8_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;  // no separate D list; the accumulators are read-modify-write operands
+  using ARegisters = uint32_t[4];  // A operand: a000..a003
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[120];  // accumulators: d000..d119
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A fragment registers
+      uint64_t const& desc_b,  // 64-bit descriptor for B
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d119: accumulators, updated in place
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a non-zero value sets predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the call-site line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %125, 0;\n"  // p = (scale_D != 0); %125 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n240k8.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%119: accumulators d000..d119
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      "{%120, %121, %122, %123},"  // A registers a000..a003
+      " %124,"  // desc_b
+      " p,    %126, %127;\n"  // predicate p, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%119: read-write float accumulators
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // operands %120..%123
+         "l"(desc_b),  // operand %124
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands %125, %126, %127
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x248x8 TN F32+=TF32*TF32 (SS form: A and B both passed as 64-bit descriptors); wraps wgmma.mma_async.sync.aligned.m64n248k8.f32.tf32.tf32
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an immediate ("n") operand
+>
+struct MMA_64x248x8_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;  // no separate D list; the accumulators are read-modify-write operands
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[124];  // accumulators: d000..d123
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for A
+      uint64_t const& desc_b,  // 64-bit descriptor for B
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d123: accumulators, updated in place
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a non-zero value sets predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the call-site line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %126, 0;\n"  // p = (scale_D != 0); %126 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n248k8.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%123: accumulators d000..d123
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123},"
+      " %124,"  // desc_a
+      " %125,"  // desc_b
+      " p,    %127, %128;\n"  // predicate p, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%123: read-write float accumulators
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
+      :  "l"(desc_a),  // operand %124
+         "l"(desc_b),  // operand %125
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands %126, %127, %128
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x248x8 TN F32+=TF32*TF32 (RS form: A passed in registers, B passed as a 64-bit descriptor); wraps wgmma.mma_async.sync.aligned.m64n248k8.f32.tf32.tf32
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an immediate ("n") operand
+>
+struct MMA_64x248x8_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;  // no separate D list; the accumulators are read-modify-write operands
+  using ARegisters = uint32_t[4];  // A operand: a000..a003
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[124];  // accumulators: d000..d123
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A fragment registers
+      uint64_t const& desc_b,  // 64-bit descriptor for B
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d123: accumulators, updated in place
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a non-zero value sets predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the call-site line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %129, 0;\n"  // p = (scale_D != 0); %129 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n248k8.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%123: accumulators d000..d123
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123},"
+      "{%124, %125, %126, %127},"  // A registers a000..a003
+      " %128,"  // desc_b
+      " p,    %130, %131;\n"  // predicate p, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%123: read-write float accumulators
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // operands %124..%127
+         "l"(desc_b),  // operand %128
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands %129, %130, %131
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x32 TN S32+=S8*S8 (SS form: A and B both passed as 64-bit descriptors); wraps wgmma.mma_async.sync.aligned.m64n24k32.s32.s8.s8
+struct MMA_64x24x32_S32S8S8_SS_TN
+{
+  using DRegisters = void;  // no separate D list; the accumulators are read-modify-write operands
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = uint32_t[12];  // accumulators: d00..d11, held in 32-bit registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for A
+      uint64_t const& desc_b,  // 64-bit descriptor for B
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d11: accumulators, updated in place
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a non-zero value sets predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the call-site line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %14, 0;\n"  // p = (scale_D != 0); %14 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n24k32.s32.s8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%11: accumulators d00..d11
+      " %8,  %9,  %10, %11},"
+      " %12,"  // desc_a
+      " %13,"  // desc_b
+      " p;\n"  // predicate p is the last instruction operand
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%11: read-write 32-bit accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "l"(desc_a),  // operand %12
+         "l"(desc_b),  // operand %13
+         "r"(int32_t(scale_D)));  // operand %14
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x32 TN S32+=S8*S8 (SS form, .satfinite variant: A and B both passed as 64-bit descriptors); wraps wgmma.mma_async.sync.aligned.m64n24k32.s32.s8.s8.satfinite
+struct MMA_64x24x32_S32S8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D list; the accumulators are read-modify-write operands
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = uint32_t[12];  // accumulators: d00..d11, held in 32-bit registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for A
+      uint64_t const& desc_b,  // 64-bit descriptor for B
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d11: accumulators, updated in place
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a non-zero value sets predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the call-site line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %14, 0;\n"  // p = (scale_D != 0); %14 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n24k32.s32.s8.s8.satfinite "  // only difference from the non-SATURATE struct: the .satfinite qualifier
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%11: accumulators d00..d11
+      " %8,  %9,  %10, %11},"
+      " %12,"  // desc_a
+      " %13,"  // desc_b
+      " p;\n"  // predicate p is the last instruction operand
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%11: read-write 32-bit accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "l"(desc_a),  // operand %12
+         "l"(desc_b),  // operand %13
+         "r"(int32_t(scale_D)));  // operand %14
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x32 TN S32+=S8*S8 (SS form: A and B both passed as 64-bit descriptors); wraps wgmma.mma_async.sync.aligned.m64n48k32.s32.s8.s8
+struct MMA_64x48x32_S32S8S8_SS_TN
+{
+  using DRegisters = void;  // no separate D list; the accumulators are read-modify-write operands
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = uint32_t[24];  // accumulators: d00..d23, held in 32-bit registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for A
+      uint64_t const& desc_b,  // 64-bit descriptor for B
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d23: accumulators, updated in place
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a non-zero value sets predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the call-site line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %26, 0;\n"  // p = (scale_D != 0); %26 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n48k32.s32.s8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%23: accumulators d00..d23
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      " %24,"  // desc_a
+      " %25,"  // desc_b
+      " p;\n"  // predicate p is the last instruction operand
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%23: read-write 32-bit accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "l"(desc_a),  // operand %24
+         "l"(desc_b),  // operand %25
+         "r"(int32_t(scale_D)));  // operand %26
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x32 TN S32+=S8*S8 (SS form, .satfinite variant: A and B both passed as 64-bit descriptors); wraps wgmma.mma_async.sync.aligned.m64n48k32.s32.s8.s8.satfinite
+struct MMA_64x48x32_S32S8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D list; the accumulators are read-modify-write operands
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = uint32_t[24];  // accumulators: d00..d23, held in 32-bit registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for A
+      uint64_t const& desc_b,  // 64-bit descriptor for B
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d23: accumulators, updated in place
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a non-zero value sets predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the call-site line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %26, 0;\n"  // p = (scale_D != 0); %26 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n48k32.s32.s8.s8.satfinite "  // only difference from the non-SATURATE struct: the .satfinite qualifier
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%23: accumulators d00..d23
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      " %24,"  // desc_a
+      " %25,"  // desc_b
+      " p;\n"  // predicate p is the last instruction operand
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%23: read-write 32-bit accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "l"(desc_a),  // operand %24
+         "l"(desc_b),  // operand %25
+         "r"(int32_t(scale_D)));  // operand %26
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x32 TN S32+=S8*S8 (SS form: A and B both passed as 64-bit descriptors); wraps wgmma.mma_async.sync.aligned.m64n80k32.s32.s8.s8
+struct MMA_64x80x32_S32S8S8_SS_TN
+{
+  using DRegisters = void;  // no separate D list; the accumulators are read-modify-write operands
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = uint32_t[40];  // accumulators: d00..d39, held in 32-bit registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for A
+      uint64_t const& desc_b,  // 64-bit descriptor for B
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d39: accumulators, updated in place
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a non-zero value sets predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the call-site line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %42, 0;\n"  // p = (scale_D != 0); %42 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n80k32.s32.s8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%39: accumulators d00..d39
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      " %40,"  // desc_a
+      " %41,"  // desc_b
+      " p;\n"  // predicate p is the last instruction operand
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%39: read-write 32-bit accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "l"(desc_a),  // operand %40
+         "l"(desc_b),  // operand %41
+         "r"(int32_t(scale_D)));  // operand %42
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x32 TN S32+=S8*S8 (SS form, .satfinite variant: A and B both passed as 64-bit descriptors); wraps wgmma.mma_async.sync.aligned.m64n80k32.s32.s8.s8.satfinite
+struct MMA_64x80x32_S32S8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D list; the accumulators are read-modify-write operands
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = uint32_t[40];  // accumulators: d00..d39, held in 32-bit registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for A
+      uint64_t const& desc_b,  // 64-bit descriptor for B
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d39: accumulators, updated in place
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a non-zero value sets predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the call-site line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %42, 0;\n"  // p = (scale_D != 0); %42 is the scale_D operand
+      "wgmma.mma_async.sync.aligned.m64n80k32.s32.s8.s8.satfinite "  // only difference from the non-SATURATE struct: the .satfinite qualifier
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%39: accumulators d00..d39
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      " %40,"  // desc_a
+      " %41,"  // desc_b
+      " p;\n"  // predicate p is the last instruction operand
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%39: read-write 32-bit accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "l"(desc_a),  // operand %40
+         "l"(desc_b),  // operand %41
+         "r"(int32_t(scale_D)));  // operand %42
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x32 TN S32+=S8*S8: wraps one wgmma.mma_async m64n112k32.s32.s8.s8 with A and B passed as 64-bit descriptors
+struct MMA_64x112x32_S32S8S8_SS_TN
+{
+  using DRegisters = void;  // no separate destination register list is declared
+  using ARegisters = uint64_t[1];  // one 64-bit operand for A (desc_a in fma)
+  using BRegisters = uint64_t[1];  // one 64-bit operand for B (desc_b in fma)
+  using CRegisters = uint32_t[56];  // 56 32-bit accumulators (d00..d55 in fma), updated in place
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma instruction through inline PTX
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t below; a nonzero value sets predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, given this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare PTX predicate register p
+      "setp.ne.b32 p, %58, 0;\n"  // p = (scale_D != 0); %58 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n112k32.s32.s8.s8 "  // m64n112k32, s32 accumulators, s8 A and B
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      " %56,"  // desc_a
+      " %57,"  // desc_b
+      " p;\n"  // trailing predicate operand p (scale-d in the PTX ISA wgmma syntax)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%55: accumulators, read-write (+r) 32-bit registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "l"(desc_a),  // %56: 64-bit input operand
+         "l"(desc_b),  // %57: 64-bit input operand
+         "r"(int32_t(scale_D)));  // %58: scale_D as a 32-bit integer, consumed by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x32 TN S32+=S8*S8, saturating variant: wraps one wgmma.mma_async m64n112k32.s32.s8.s8.satfinite with A and B passed as 64-bit descriptors
+struct MMA_64x112x32_S32S8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate destination register list is declared
+  using ARegisters = uint64_t[1];  // one 64-bit operand for A (desc_a in fma)
+  using BRegisters = uint64_t[1];  // one 64-bit operand for B (desc_b in fma)
+  using CRegisters = uint32_t[56];  // 56 32-bit accumulators (d00..d55 in fma), updated in place
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma instruction through inline PTX
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t below; a nonzero value sets predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, given this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare PTX predicate register p
+      "setp.ne.b32 p, %58, 0;\n"  // p = (scale_D != 0); %58 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n112k32.s32.s8.s8.satfinite "  // m64n112k32, s32 accumulators, s8 A and B, with the .satfinite qualifier
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      " %56,"  // desc_a
+      " %57,"  // desc_b
+      " p;\n"  // trailing predicate operand p (scale-d in the PTX ISA wgmma syntax)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%55: accumulators, read-write (+r) 32-bit registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "l"(desc_a),  // %56: 64-bit input operand
+         "l"(desc_b),  // %57: 64-bit input operand
+         "r"(int32_t(scale_D)));  // %58: scale_D as a 32-bit integer, consumed by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x32 TN S32+=S8*S8: wraps one wgmma.mma_async m64n144k32.s32.s8.s8 with A and B passed as 64-bit descriptors
+struct MMA_64x144x32_S32S8S8_SS_TN
+{
+  using DRegisters = void;  // no separate destination register list is declared
+  using ARegisters = uint64_t[1];  // one 64-bit operand for A (desc_a in fma)
+  using BRegisters = uint64_t[1];  // one 64-bit operand for B (desc_b in fma)
+  using CRegisters = uint32_t[72];  // 72 32-bit accumulators (d00..d71 in fma), updated in place
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma instruction through inline PTX
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t below; a nonzero value sets predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, given this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare PTX predicate register p
+      "setp.ne.b32 p, %74, 0;\n"  // p = (scale_D != 0); %74 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n144k32.s32.s8.s8 "  // m64n144k32, s32 accumulators, s8 A and B
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      " %72,"  // desc_a
+      " %73,"  // desc_b
+      " p;\n"  // trailing predicate operand p (scale-d in the PTX ISA wgmma syntax)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%71: accumulators, read-write (+r) 32-bit registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
+      :  "l"(desc_a),  // %72: 64-bit input operand
+         "l"(desc_b),  // %73: 64-bit input operand
+         "r"(int32_t(scale_D)));  // %74: scale_D as a 32-bit integer, consumed by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x32 TN S32+=S8*S8, saturating variant: wraps one wgmma.mma_async m64n144k32.s32.s8.s8.satfinite with A and B passed as 64-bit descriptors
+struct MMA_64x144x32_S32S8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate destination register list is declared
+  using ARegisters = uint64_t[1];  // one 64-bit operand for A (desc_a in fma)
+  using BRegisters = uint64_t[1];  // one 64-bit operand for B (desc_b in fma)
+  using CRegisters = uint32_t[72];  // 72 32-bit accumulators (d00..d71 in fma), updated in place
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma instruction through inline PTX
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t below; a nonzero value sets predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, given this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare PTX predicate register p
+      "setp.ne.b32 p, %74, 0;\n"  // p = (scale_D != 0); %74 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n144k32.s32.s8.s8.satfinite "  // m64n144k32, s32 accumulators, s8 A and B, with the .satfinite qualifier
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      " %72,"  // desc_a
+      " %73,"  // desc_b
+      " p;\n"  // trailing predicate operand p (scale-d in the PTX ISA wgmma syntax)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%71: accumulators, read-write (+r) 32-bit registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
+      :  "l"(desc_a),  // %72: 64-bit input operand
+         "l"(desc_b),  // %73: 64-bit input operand
+         "r"(int32_t(scale_D)));  // %74: scale_D as a 32-bit integer, consumed by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x32 TN S32+=S8*S8: wraps one wgmma.mma_async m64n160k32.s32.s8.s8 with A and B passed as 64-bit descriptors
+struct MMA_64x160x32_S32S8S8_SS_TN
+{
+  using DRegisters = void;  // no separate destination register list is declared
+  using ARegisters = uint64_t[1];  // one 64-bit operand for A (desc_a in fma)
+  using BRegisters = uint64_t[1];  // one 64-bit operand for B (desc_b in fma)
+  using CRegisters = uint32_t[80];  // 80 32-bit accumulators (d00..d79 in fma), updated in place
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma instruction through inline PTX
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t below; a nonzero value sets predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, given this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare PTX predicate register p
+      "setp.ne.b32 p, %82, 0;\n"  // p = (scale_D != 0); %82 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n160k32.s32.s8.s8 "  // m64n160k32, s32 accumulators, s8 A and B
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      " %80,"  // desc_a
+      " %81,"  // desc_b
+      " p;\n"  // trailing predicate operand p (scale-d in the PTX ISA wgmma syntax)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%79: accumulators, read-write (+r) 32-bit registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
+      :  "l"(desc_a),  // %80: 64-bit input operand
+         "l"(desc_b),  // %81: 64-bit input operand
+         "r"(int32_t(scale_D)));  // %82: scale_D as a 32-bit integer, consumed by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x32 TN S32+=S8*S8, saturating variant: wraps one wgmma.mma_async m64n160k32.s32.s8.s8.satfinite with A and B passed as 64-bit descriptors
+struct MMA_64x160x32_S32S8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate destination register list is declared
+  using ARegisters = uint64_t[1];  // one 64-bit operand for A (desc_a in fma)
+  using BRegisters = uint64_t[1];  // one 64-bit operand for B (desc_b in fma)
+  using CRegisters = uint32_t[80];  // 80 32-bit accumulators (d00..d79 in fma), updated in place
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma instruction through inline PTX
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t below; a nonzero value sets predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, given this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare PTX predicate register p
+      "setp.ne.b32 p, %82, 0;\n"  // p = (scale_D != 0); %82 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n160k32.s32.s8.s8.satfinite "  // m64n160k32, s32 accumulators, s8 A and B, with the .satfinite qualifier
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      " %80,"  // desc_a
+      " %81,"  // desc_b
+      " p;\n"  // trailing predicate operand p (scale-d in the PTX ISA wgmma syntax)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%79: accumulators, read-write (+r) 32-bit registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
+      :  "l"(desc_a),  // %80: 64-bit input operand
+         "l"(desc_b),  // %81: 64-bit input operand
+         "r"(int32_t(scale_D)));  // %82: scale_D as a 32-bit integer, consumed by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x32 TN S32+=S8*S8: wraps one wgmma.mma_async m64n176k32.s32.s8.s8 with A and B passed as 64-bit descriptors
+struct MMA_64x176x32_S32S8S8_SS_TN
+{
+  using DRegisters = void;  // no separate destination register list is declared
+  using ARegisters = uint64_t[1];  // one 64-bit operand for A (desc_a in fma)
+  using BRegisters = uint64_t[1];  // one 64-bit operand for B (desc_b in fma)
+  using CRegisters = uint32_t[88];  // 88 32-bit accumulators (d00..d87 in fma), updated in place
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma instruction through inline PTX
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t below; a nonzero value sets predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, given this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare PTX predicate register p
+      "setp.ne.b32 p, %90, 0;\n"  // p = (scale_D != 0); %90 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n176k32.s32.s8.s8 "  // m64n176k32, s32 accumulators, s8 A and B
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      " %88,"  // desc_a
+      " %89,"  // desc_b
+      " p;\n"  // trailing predicate operand p (scale-d in the PTX ISA wgmma syntax)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%87: accumulators, read-write (+r) 32-bit registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
+      :  "l"(desc_a),  // %88: 64-bit input operand
+         "l"(desc_b),  // %89: 64-bit input operand
+         "r"(int32_t(scale_D)));  // %90: scale_D as a 32-bit integer, consumed by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x32 TN S32+=S8*S8, saturating variant: wraps one wgmma.mma_async m64n176k32.s32.s8.s8.satfinite with A and B passed as 64-bit descriptors
+struct MMA_64x176x32_S32S8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate destination register list is declared
+  using ARegisters = uint64_t[1];  // one 64-bit operand for A (desc_a in fma)
+  using BRegisters = uint64_t[1];  // one 64-bit operand for B (desc_b in fma)
+  using CRegisters = uint32_t[88];  // 88 32-bit accumulators (d00..d87 in fma), updated in place
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma instruction through inline PTX
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t below; a nonzero value sets predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, given this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare PTX predicate register p
+      "setp.ne.b32 p, %90, 0;\n"  // p = (scale_D != 0); %90 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n176k32.s32.s8.s8.satfinite "  // m64n176k32, s32 accumulators, s8 A and B, with the .satfinite qualifier
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      " %88,"  // desc_a
+      " %89,"  // desc_b
+      " p;\n"  // trailing predicate operand p (scale-d in the PTX ISA wgmma syntax)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%87: accumulators, read-write (+r) 32-bit registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
+      :  "l"(desc_a),  // %88: 64-bit input operand
+         "l"(desc_b),  // %89: 64-bit input operand
+         "r"(int32_t(scale_D)));  // %90: scale_D as a 32-bit integer, consumed by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x32 TN S32+=S8*S8: A and B are passed as 64-bit descriptors, the 104 accumulators as read-write 32-bit registers
+struct MMA_64x208x32_S32S8S8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[104];  // matches the 104 accumulator references d000..d103 taken by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and tested against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %106, 0;\n"  // %106 is int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n208k32.s32.s8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      " %104,"  // desc_a
+      " %105,"  // desc_b
+      " p;\n"  // predicate computed above, passed as the final wgmma operand
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // operands %0..%103: read-write ("+r") accumulators
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
+      :  "l"(desc_a),  // operand %104
+         "l"(desc_b),  // operand %105
+         "r"(int32_t(scale_D)));  // operand %106
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, fma only invokes the invalid-control-path macro
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x32 TN S32+=S8*S8, saturating variant (.satfinite): A and B are passed as 64-bit descriptors, the 104 accumulators as read-write 32-bit registers
+struct MMA_64x208x32_S32S8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[104];  // matches the 104 accumulator references d000..d103 taken by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and tested against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %106, 0;\n"  // %106 is int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n208k32.s32.s8.s8.satfinite "  // only difference from the non-SATURATE struct: the .satfinite qualifier
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      " %104,"  // desc_a
+      " %105,"  // desc_b
+      " p;\n"  // predicate computed above, passed as the final wgmma operand
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // operands %0..%103: read-write ("+r") accumulators
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
+      :  "l"(desc_a),  // operand %104
+         "l"(desc_b),  // operand %105
+         "r"(int32_t(scale_D)));  // operand %106
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, fma only invokes the invalid-control-path macro
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x32 TN S32+=S8*S8: A and B are passed as 64-bit descriptors, the 112 accumulators as read-write 32-bit registers
+struct MMA_64x224x32_S32S8S8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[112];  // matches the 112 accumulator references d000..d111 taken by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and tested against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %114, 0;\n"  // %114 is int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n224k32.s32.s8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      " %112,"  // desc_a
+      " %113,"  // desc_b
+      " p;\n"  // predicate computed above, passed as the final wgmma operand
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // operands %0..%111: read-write ("+r") accumulators
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
+      :  "l"(desc_a),  // operand %112
+         "l"(desc_b),  // operand %113
+         "r"(int32_t(scale_D)));  // operand %114
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, fma only invokes the invalid-control-path macro
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x32 TN S32+=S8*S8, saturating variant (.satfinite): A and B are passed as 64-bit descriptors, the 112 accumulators as read-write 32-bit registers
+struct MMA_64x224x32_S32S8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[112];  // matches the 112 accumulator references d000..d111 taken by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and tested against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %114, 0;\n"  // %114 is int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n224k32.s32.s8.s8.satfinite "  // only difference from the non-SATURATE struct: the .satfinite qualifier
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      " %112,"  // desc_a
+      " %113,"  // desc_b
+      " p;\n"  // predicate computed above, passed as the final wgmma operand
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // operands %0..%111: read-write ("+r") accumulators
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
+      :  "l"(desc_a),  // operand %112
+         "l"(desc_b),  // operand %113
+         "r"(int32_t(scale_D)));  // operand %114
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, fma only invokes the invalid-control-path macro
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x32 TN S32+=S8*S8: A and B are passed as 64-bit descriptors, the 120 accumulators as read-write 32-bit registers
+struct MMA_64x240x32_S32S8S8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[120];  // matches the 120 accumulator references d000..d119 taken by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and tested against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %122, 0;\n"  // %122 is int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n240k32.s32.s8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      " %120,"  // desc_a
+      " %121,"  // desc_b
+      " p;\n"  // predicate computed above, passed as the final wgmma operand
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // operands %0..%119: read-write ("+r") accumulators
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
+      :  "l"(desc_a),  // operand %120
+         "l"(desc_b),  // operand %121
+         "r"(int32_t(scale_D)));  // operand %122
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, fma only invokes the invalid-control-path macro
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x32 TN S32+=S8*S8, saturating (.satfinite); A and B are both passed as 64-bit descriptors (SS)
+struct MMA_64x240x32_S32S8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D registers: the d* operands are read-write ("+r")
+  using ARegisters = uint64_t[1];  // A: one 64-bit descriptor (desc_a)
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[120];  // accumulator: 120 32-bit values (d000..d119)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // only "scale_D != 0" is tested below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate, local to this { } scope
+      "setp.ne.b32 p, %122, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n240k32.s32.s8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      " %120,"  // desc_a
+      " %121,"  // desc_b
+      " p;\n"  // predicate operand; see the PTX wgmma scale-d description for its effect on the accumulator
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // %0..%119: accumulators, read-write
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
+      :  "l"(desc_a),  // %120
+         "l"(desc_b),  // %121
+         "r"(int32_t(scale_D)));  // %122
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled only when CUTE_ARCH_MMA_SM90A_ENABLED is undefined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x32 TN S32+=S8*S8; A is passed in registers, B as a 64-bit descriptor (RS)
+struct MMA_64x24x32_S32S8S8_RS_TN
+{
+  using DRegisters = void;  // no separate D registers: the d* operands are read-write ("+r")
+  using ARegisters = uint32_t[4];  // A: four 32-bit register operands (a00..a03)
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[12];  // accumulator: 12 32-bit values (d00..d11)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // only "scale_D != 0" is tested below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate, local to this { } scope
+      "setp.ne.b32 p, %17, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n24k32.s32.s8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      "{%12, %13, %14, %15},"  // A registers a00..a03
+      " %16,"  // desc_b
+      " p;\n"  // predicate operand; see the PTX wgmma scale-d description for its effect on the accumulator
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%11: accumulators, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %12..%15
+         "l"(desc_b),  // %16
+         "r"(int32_t(scale_D)));  // %17
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled only when CUTE_ARCH_MMA_SM90A_ENABLED is undefined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x32 TN S32+=S8*S8, saturating (.satfinite); A is passed in registers, B as a 64-bit descriptor (RS)
+struct MMA_64x24x32_S32S8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D registers: the d* operands are read-write ("+r")
+  using ARegisters = uint32_t[4];  // A: four 32-bit register operands (a00..a03)
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[12];  // accumulator: 12 32-bit values (d00..d11)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // only "scale_D != 0" is tested below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate, local to this { } scope
+      "setp.ne.b32 p, %17, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n24k32.s32.s8.s8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      "{%12, %13, %14, %15},"  // A registers a00..a03
+      " %16,"  // desc_b
+      " p;\n"  // predicate operand; see the PTX wgmma scale-d description for its effect on the accumulator
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%11: accumulators, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %12..%15
+         "l"(desc_b),  // %16
+         "r"(int32_t(scale_D)));  // %17
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled only when CUTE_ARCH_MMA_SM90A_ENABLED is undefined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x32 TN S32+=S8*S8; A is passed in registers, B as a 64-bit descriptor (RS)
+struct MMA_64x48x32_S32S8S8_RS_TN
+{
+  using DRegisters = void;  // no separate D registers: the d* operands are read-write ("+r")
+  using ARegisters = uint32_t[4];  // A: four 32-bit register operands (a00..a03)
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[24];  // accumulator: 24 32-bit values (d00..d23)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // only "scale_D != 0" is tested below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate, local to this { } scope
+      "setp.ne.b32 p, %29, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n48k32.s32.s8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      "{%24, %25, %26, %27},"  // A registers a00..a03
+      " %28,"  // desc_b
+      " p;\n"  // predicate operand; see the PTX wgmma scale-d description for its effect on the accumulator
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%23: accumulators, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %24..%27
+         "l"(desc_b),  // %28
+         "r"(int32_t(scale_D)));  // %29
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled only when CUTE_ARCH_MMA_SM90A_ENABLED is undefined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x32 TN S32+=S8*S8, saturating (.satfinite); A is passed in registers, B as a 64-bit descriptor (RS)
+struct MMA_64x48x32_S32S8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D registers: the d* operands are read-write ("+r")
+  using ARegisters = uint32_t[4];  // A: four 32-bit register operands (a00..a03)
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[24];  // accumulator: 24 32-bit values (d00..d23)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // only "scale_D != 0" is tested below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate, local to this { } scope
+      "setp.ne.b32 p, %29, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n48k32.s32.s8.s8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      "{%24, %25, %26, %27},"  // A registers a00..a03
+      " %28,"  // desc_b
+      " p;\n"  // predicate operand; see the PTX wgmma scale-d description for its effect on the accumulator
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%23: accumulators, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %24..%27
+         "l"(desc_b),  // %28
+         "r"(int32_t(scale_D)));  // %29
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled only when CUTE_ARCH_MMA_SM90A_ENABLED is undefined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x32 TN S32+=S8*S8; A is passed in registers, B as a 64-bit descriptor (RS)
+struct MMA_64x80x32_S32S8S8_RS_TN
+{
+  using DRegisters = void;  // no separate D registers: the d* operands are read-write ("+r")
+  using ARegisters = uint32_t[4];  // A: four 32-bit register operands (a00..a03)
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[40];  // accumulator: 40 32-bit values (d00..d39)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // only "scale_D != 0" is tested below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate, local to this { } scope
+      "setp.ne.b32 p, %45, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n80k32.s32.s8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      "{%40, %41, %42, %43},"  // A registers a00..a03
+      " %44,"  // desc_b
+      " p;\n"  // predicate operand; see the PTX wgmma scale-d description for its effect on the accumulator
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%39: accumulators, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %40..%43
+         "l"(desc_b),  // %44
+         "r"(int32_t(scale_D)));  // %45
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled only when CUTE_ARCH_MMA_SM90A_ENABLED is undefined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x32 TN S32+=S8*S8, saturating (.satfinite); A is passed in registers, B as a 64-bit descriptor (RS)
+struct MMA_64x80x32_S32S8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D registers: the d* operands are read-write ("+r")
+  using ARegisters = uint32_t[4];  // A: four 32-bit register operands (a00..a03)
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[40];  // accumulator: 40 32-bit values (d00..d39)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // only "scale_D != 0" is tested below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate, local to this { } scope
+      "setp.ne.b32 p, %45, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n80k32.s32.s8.s8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      "{%40, %41, %42, %43},"  // A registers a00..a03
+      " %44,"  // desc_b
+      " p;\n"  // predicate operand; see the PTX wgmma scale-d description for its effect on the accumulator
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%39: accumulators, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %40..%43
+         "l"(desc_b),  // %44
+         "r"(int32_t(scale_D)));  // %45
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled only when CUTE_ARCH_MMA_SM90A_ENABLED is undefined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x32 TN S32+=S8*S8; A is passed in registers, B as a 64-bit descriptor (RS)
+struct MMA_64x112x32_S32S8S8_RS_TN
+{
+  using DRegisters = void;  // no separate D registers: the d* operands are read-write ("+r")
+  using ARegisters = uint32_t[4];  // A: four 32-bit register operands (a00..a03)
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[56];  // accumulator: 56 32-bit values (d00..d55)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // only "scale_D != 0" is tested below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate, local to this { } scope
+      "setp.ne.b32 p, %61, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n112k32.s32.s8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      "{%56,  %57,  %58,  %59},"  // A registers a00..a03
+      " %60,"  // desc_b
+      " p;\n"  // predicate operand; see the PTX wgmma scale-d description for its effect on the accumulator
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%55: accumulators, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %56..%59
+         "l"(desc_b),  // %60
+         "r"(int32_t(scale_D)));  // %61
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled only when CUTE_ARCH_MMA_SM90A_ENABLED is undefined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x32 TN S32+=S8*S8, saturating (.satfinite); A is passed in registers, B as a 64-bit descriptor (RS)
+struct MMA_64x112x32_S32S8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D registers: the d* operands are read-write ("+r")
+  using ARegisters = uint32_t[4];  // A: four 32-bit register operands (a00..a03)
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[56];  // accumulator: 56 32-bit values (d00..d55)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // only "scale_D != 0" is tested below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate, local to this { } scope
+      "setp.ne.b32 p, %61, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n112k32.s32.s8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      "{%56,  %57,  %58,  %59},"  // A registers a00..a03
+      " %60,"  // desc_b
+      " p;\n"  // predicate operand; see the PTX wgmma scale-d description for its effect on the accumulator
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%55: accumulators, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %56..%59
+         "l"(desc_b),  // %60
+         "r"(int32_t(scale_D)));  // %61
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled only when CUTE_ARCH_MMA_SM90A_ENABLED is undefined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x32 TN S32+=S8*S8; A is passed in registers, B as a 64-bit descriptor (RS)
+struct MMA_64x144x32_S32S8S8_RS_TN
+{
+  using DRegisters = void;  // no separate D registers: the d* operands are read-write ("+r")
+  using ARegisters = uint32_t[4];  // A: four 32-bit register operands (a00..a03)
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[72];  // accumulator: 72 32-bit values (d00..d71)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // only "scale_D != 0" is tested below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate, local to this { } scope
+      "setp.ne.b32 p, %77, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n144k32.s32.s8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      "{%72,  %73,  %74,  %75},"  // A registers a00..a03
+      " %76,"  // desc_b
+      " p;\n"  // predicate operand; see the PTX wgmma scale-d description for its effect on the accumulator
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%71: accumulators, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %72..%75
+         "l"(desc_b),  // %76
+         "r"(int32_t(scale_D)));  // %77
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled only when CUTE_ARCH_MMA_SM90A_ENABLED is undefined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x32 TN S32+=S8*S8, .satfinite variant (A: four 32-bit registers, B: one 64-bit descriptor)
+struct MMA_64x144x32_S32S8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;          // no separate D list declared; the d* operands below are "+r" (read and written in place)
+  using ARegisters = uint32_t[4];   // a00..a03, bound as "r" inputs
+  using BRegisters = uint64_t[1];   // desc_b, bound as an "l" (64-bit) input
+  using CRegisters = uint32_t[72];  // d00..d71, the accumulator operands
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction; parameters follow the A, B, C lists above, then scale_D
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to int32 below; a nonzero value sets predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): helper defined elsewhere; receives this source line and desc_b - presumably a debug/trace hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p, local to this asm scope
+      "setp.ne.b32 p, %77, 0;\n"  // p = (operand %77 != 0), i.e. int32_t(scale_D) != 0
+      "wgmma.mma_async.sync.aligned.m64n144k32.s32.s8.s8.satfinite "  // PTX mnemonic: m64n144k32 shape, s32 result, s8 x s8 inputs, .satfinite
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"  // %0..%71 map to d00..d71
+      "{%72,  %73,  %74,  %75},"  // %72..%75 map to a00..a03
+      " %76,"  // %76 is desc_b
+      " p;\n"  // predicate from setp above; NOTE(review): presumably the wgmma scale-d input (false => prior D not accumulated) - confirm in PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write operands %0..%71 start here
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands %72..%75
+         "l"(desc_b),  // input operand %76
+         "r"(int32_t(scale_D)));  // input operand %77, tested by setp above
+#else  // macro not defined: report the misuse instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x32 TN S32+=S8*S8, no .satfinite (A: four 32-bit registers, B: one 64-bit descriptor)
+struct MMA_64x160x32_S32S8S8_RS_TN
+{
+  using DRegisters = void;          // no separate D list declared; the d* operands below are "+r" (read and written in place)
+  using ARegisters = uint32_t[4];   // a00..a03, bound as "r" inputs
+  using BRegisters = uint64_t[1];   // desc_b, bound as an "l" (64-bit) input
+  using CRegisters = uint32_t[80];  // d00..d79, the accumulator operands
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction; parameters follow the A, B, C lists above, then scale_D
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to int32 below; a nonzero value sets predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): helper defined elsewhere; receives this source line and desc_b - presumably a debug/trace hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p, local to this asm scope
+      "setp.ne.b32 p, %85, 0;\n"  // p = (operand %85 != 0), i.e. int32_t(scale_D) != 0
+      "wgmma.mma_async.sync.aligned.m64n160k32.s32.s8.s8 "  // PTX mnemonic: m64n160k32 shape, s32 result, s8 x s8 inputs
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"  // %0..%79 map to d00..d79
+      "{%80,  %81,  %82,  %83},"  // %80..%83 map to a00..a03
+      " %84,"  // %84 is desc_b
+      " p;\n"  // predicate from setp above; NOTE(review): presumably the wgmma scale-d input (false => prior D not accumulated) - confirm in PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write operands %0..%79 start here
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands %80..%83
+         "l"(desc_b),  // input operand %84
+         "r"(int32_t(scale_D)));  // input operand %85, tested by setp above
+#else  // macro not defined: report the misuse instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x32 TN S32+=S8*S8, .satfinite variant (A: four 32-bit registers, B: one 64-bit descriptor)
+struct MMA_64x160x32_S32S8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;          // no separate D list declared; the d* operands below are "+r" (read and written in place)
+  using ARegisters = uint32_t[4];   // a00..a03, bound as "r" inputs
+  using BRegisters = uint64_t[1];   // desc_b, bound as an "l" (64-bit) input
+  using CRegisters = uint32_t[80];  // d00..d79, the accumulator operands
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction; parameters follow the A, B, C lists above, then scale_D
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to int32 below; a nonzero value sets predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): helper defined elsewhere; receives this source line and desc_b - presumably a debug/trace hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p, local to this asm scope
+      "setp.ne.b32 p, %85, 0;\n"  // p = (operand %85 != 0), i.e. int32_t(scale_D) != 0
+      "wgmma.mma_async.sync.aligned.m64n160k32.s32.s8.s8.satfinite "  // PTX mnemonic: m64n160k32 shape, s32 result, s8 x s8 inputs, .satfinite
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"  // %0..%79 map to d00..d79
+      "{%80,  %81,  %82,  %83},"  // %80..%83 map to a00..a03
+      " %84,"  // %84 is desc_b
+      " p;\n"  // predicate from setp above; NOTE(review): presumably the wgmma scale-d input (false => prior D not accumulated) - confirm in PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write operands %0..%79 start here
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands %80..%83
+         "l"(desc_b),  // input operand %84
+         "r"(int32_t(scale_D)));  // input operand %85, tested by setp above
+#else  // macro not defined: report the misuse instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x32 TN S32+=S8*S8, no .satfinite (A: four 32-bit registers, B: one 64-bit descriptor)
+struct MMA_64x176x32_S32S8S8_RS_TN
+{
+  using DRegisters = void;          // no separate D list declared; the d* operands below are "+r" (read and written in place)
+  using ARegisters = uint32_t[4];   // a00..a03, bound as "r" inputs
+  using BRegisters = uint64_t[1];   // desc_b, bound as an "l" (64-bit) input
+  using CRegisters = uint32_t[88];  // d00..d87, the accumulator operands
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction; parameters follow the A, B, C lists above, then scale_D
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to int32 below; a nonzero value sets predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): helper defined elsewhere; receives this source line and desc_b - presumably a debug/trace hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p, local to this asm scope
+      "setp.ne.b32 p, %93, 0;\n"  // p = (operand %93 != 0), i.e. int32_t(scale_D) != 0
+      "wgmma.mma_async.sync.aligned.m64n176k32.s32.s8.s8 "  // PTX mnemonic: m64n176k32 shape, s32 result, s8 x s8 inputs
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"  // %0..%87 map to d00..d87
+      "{%88,  %89,  %90,  %91},"  // %88..%91 map to a00..a03
+      " %92,"  // %92 is desc_b
+      " p;\n"  // predicate from setp above; NOTE(review): presumably the wgmma scale-d input (false => prior D not accumulated) - confirm in PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write operands %0..%87 start here
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands %88..%91
+         "l"(desc_b),  // input operand %92
+         "r"(int32_t(scale_D)));  // input operand %93, tested by setp above
+#else  // macro not defined: report the misuse instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x32 TN S32+=S8*S8, .satfinite variant (A: four 32-bit registers, B: one 64-bit descriptor)
+struct MMA_64x176x32_S32S8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;          // no separate D list declared; the d* operands below are "+r" (read and written in place)
+  using ARegisters = uint32_t[4];   // a00..a03, bound as "r" inputs
+  using BRegisters = uint64_t[1];   // desc_b, bound as an "l" (64-bit) input
+  using CRegisters = uint32_t[88];  // d00..d87, the accumulator operands
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction; parameters follow the A, B, C lists above, then scale_D
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to int32 below; a nonzero value sets predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): helper defined elsewhere; receives this source line and desc_b - presumably a debug/trace hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p, local to this asm scope
+      "setp.ne.b32 p, %93, 0;\n"  // p = (operand %93 != 0), i.e. int32_t(scale_D) != 0
+      "wgmma.mma_async.sync.aligned.m64n176k32.s32.s8.s8.satfinite "  // PTX mnemonic: m64n176k32 shape, s32 result, s8 x s8 inputs, .satfinite
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"  // %0..%87 map to d00..d87
+      "{%88,  %89,  %90,  %91},"  // %88..%91 map to a00..a03
+      " %92,"  // %92 is desc_b
+      " p;\n"  // predicate from setp above; NOTE(review): presumably the wgmma scale-d input (false => prior D not accumulated) - confirm in PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write operands %0..%87 start here
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands %88..%91
+         "l"(desc_b),  // input operand %92
+         "r"(int32_t(scale_D)));  // input operand %93, tested by setp above
+#else  // macro not defined: report the misuse instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x32 TN S32+=S8*S8, no .satfinite (A: four 32-bit registers, B: one 64-bit descriptor)
+struct MMA_64x208x32_S32S8S8_RS_TN
+{
+  using DRegisters = void;           // no separate D list declared; the d* operands below are "+r" (read and written in place)
+  using ARegisters = uint32_t[4];    // a000..a003, bound as "r" inputs
+  using BRegisters = uint64_t[1];    // desc_b, bound as an "l" (64-bit) input
+  using CRegisters = uint32_t[104];  // d000..d103, the accumulator operands
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction; parameters follow the A, B, C lists above, then scale_D
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to int32 below; a nonzero value sets predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): helper defined elsewhere; receives this source line and desc_b - presumably a debug/trace hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p, local to this asm scope
+      "setp.ne.b32 p, %109, 0;\n"  // p = (operand %109 != 0), i.e. int32_t(scale_D) != 0
+      "wgmma.mma_async.sync.aligned.m64n208k32.s32.s8.s8 "  // PTX mnemonic: m64n208k32 shape, s32 result, s8 x s8 inputs
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"  // %0..%103 map to d000..d103
+      "{%104, %105, %106, %107},"  // %104..%107 map to a000..a003
+      " %108,"  // %108 is desc_b
+      " p;\n"  // predicate from setp above; NOTE(review): presumably the wgmma scale-d input (false => prior D not accumulated) - confirm in PTX ISA
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // read-write operands %0..%103 start here
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // input operands %104..%107
+         "l"(desc_b),  // input operand %108
+         "r"(int32_t(scale_D)));  // input operand %109, tested by setp above
+#else  // macro not defined: report the misuse instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x32 TN S32+=S8*S8, .satfinite variant (A: four 32-bit registers, B: one 64-bit descriptor)
+struct MMA_64x208x32_S32S8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;           // no separate D list declared; the d* operands below are "+r" (read and written in place)
+  using ARegisters = uint32_t[4];    // a000..a003, bound as "r" inputs
+  using BRegisters = uint64_t[1];    // desc_b, bound as an "l" (64-bit) input
+  using CRegisters = uint32_t[104];  // d000..d103, the accumulator operands
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction; parameters follow the A, B, C lists above, then scale_D
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to int32 below; a nonzero value sets predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): helper defined elsewhere; receives this source line and desc_b - presumably a debug/trace hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p, local to this asm scope
+      "setp.ne.b32 p, %109, 0;\n"  // p = (operand %109 != 0), i.e. int32_t(scale_D) != 0
+      "wgmma.mma_async.sync.aligned.m64n208k32.s32.s8.s8.satfinite "  // PTX mnemonic: m64n208k32 shape, s32 result, s8 x s8 inputs, .satfinite
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"  // %0..%103 map to d000..d103
+      "{%104, %105, %106, %107},"  // %104..%107 map to a000..a003
+      " %108,"  // %108 is desc_b
+      " p;\n"  // predicate from setp above; NOTE(review): presumably the wgmma scale-d input (false => prior D not accumulated) - confirm in PTX ISA
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // read-write operands %0..%103 start here
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // input operands %104..%107
+         "l"(desc_b),  // input operand %108
+         "r"(int32_t(scale_D)));  // input operand %109, tested by setp above
+#else  // macro not defined: report the misuse instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x32 TN S32+=S8*S8 (RS: A in 4 registers, B as 64-bit descriptor); emits wgmma m64n224k32.s32.s8.s8 without .satfinite
+struct MMA_64x224x32_S32S8S8_RS_TN
+{
+  using DRegisters = void;  // no separate D fragment: the d* arguments are read-write ("+r") and updated in place
+  using ARegisters = uint32_t[4];  // a000..a003
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[112];  // d000..d111
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %117, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n224k32.s32.s8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      "{%112, %113, %114, %115},"  // a000..a003
+      " %116,"  // desc_b
+      " p;\n"  // predicate computed from scale_D above
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // operands %0..%111: accumulators, read-write
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // operands %112..%115
+         "l"(desc_b),  // operand %116
+         "r"(int32_t(scale_D)));  // operand %117
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x32 TN S32+=S8*S8, saturating variant (RS: A in 4 registers, B as 64-bit descriptor); emits wgmma m64n224k32.s32.s8.s8.satfinite
+struct MMA_64x224x32_S32S8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D fragment: the d* arguments are read-write ("+r") and updated in place
+  using ARegisters = uint32_t[4];  // a000..a003
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[112];  // d000..d111
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %117, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n224k32.s32.s8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      "{%112, %113, %114, %115},"  // a000..a003
+      " %116,"  // desc_b
+      " p;\n"  // predicate computed from scale_D above
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // operands %0..%111: accumulators, read-write
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // operands %112..%115
+         "l"(desc_b),  // operand %116
+         "r"(int32_t(scale_D)));  // operand %117
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x32 TN S32+=S8*S8 (RS: A in 4 registers, B as 64-bit descriptor); emits wgmma m64n240k32.s32.s8.s8 without .satfinite
+struct MMA_64x240x32_S32S8S8_RS_TN
+{
+  using DRegisters = void;  // no separate D fragment: the d* arguments are read-write ("+r") and updated in place
+  using ARegisters = uint32_t[4];  // a000..a003
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[120];  // d000..d119
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %125, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n240k32.s32.s8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      "{%120, %121, %122, %123},"  // a000..a003
+      " %124,"  // desc_b
+      " p;\n"  // predicate computed from scale_D above
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // operands %0..%119: accumulators, read-write
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // operands %120..%123
+         "l"(desc_b),  // operand %124
+         "r"(int32_t(scale_D)));  // operand %125
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x32 TN S32+=S8*S8, saturating variant (RS: A in 4 registers, B as 64-bit descriptor); emits wgmma m64n240k32.s32.s8.s8.satfinite
+struct MMA_64x240x32_S32S8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D fragment: the d* arguments are read-write ("+r") and updated in place
+  using ARegisters = uint32_t[4];  // a000..a003
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[120];  // d000..d119
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %125, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n240k32.s32.s8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      "{%120, %121, %122, %123},"  // a000..a003
+      " %124,"  // desc_b
+      " p;\n"  // predicate computed from scale_D above
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // operands %0..%119: accumulators, read-write
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // operands %120..%123
+         "l"(desc_b),  // operand %124
+         "r"(int32_t(scale_D)));  // operand %125
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x32 TN S32+=S8*U8 (SS: A and B both passed as 64-bit descriptors); emits wgmma m64n24k32.s32.s8.u8 without .satfinite
+struct MMA_64x24x32_S32S8U8_SS_TN
+{
+  using DRegisters = void;  // no separate D fragment: the d* arguments are read-write ("+r") and updated in place
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[12];  // d00..d11
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %14, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n24k32.s32.s8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      " %12,"  // desc_a
+      " %13,"  // desc_b
+      " p;\n"  // predicate computed from scale_D above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%11: accumulators, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "l"(desc_a),  // operand %12
+         "l"(desc_b),  // operand %13
+         "r"(int32_t(scale_D)));  // operand %14
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x32 TN S32+=S8*U8, saturating variant (SS: A and B both passed as 64-bit descriptors); emits wgmma m64n24k32.s32.s8.u8.satfinite
+struct MMA_64x24x32_S32S8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D fragment: the d* arguments are read-write ("+r") and updated in place
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[12];  // d00..d11
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %14, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n24k32.s32.s8.u8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      " %12,"  // desc_a
+      " %13,"  // desc_b
+      " p;\n"  // predicate computed from scale_D above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%11: accumulators, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "l"(desc_a),  // operand %12
+         "l"(desc_b),  // operand %13
+         "r"(int32_t(scale_D)));  // operand %14
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x32 TN S32+=S8*U8 (SS: A and B both passed as 64-bit descriptors); emits wgmma m64n48k32.s32.s8.u8 without .satfinite
+struct MMA_64x48x32_S32S8U8_SS_TN
+{
+  using DRegisters = void;  // no separate D fragment: the d* arguments are read-write ("+r") and updated in place
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[24];  // d00..d23
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %26, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n48k32.s32.s8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      " %24,"  // desc_a
+      " %25,"  // desc_b
+      " p;\n"  // predicate computed from scale_D above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%23: accumulators, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "l"(desc_a),  // operand %24
+         "l"(desc_b),  // operand %25
+         "r"(int32_t(scale_D)));  // operand %26
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x32 TN S32+=S8*U8, .satfinite form -- wraps PTX wgmma.mma_async m64n48k32 (.s32.s8.u8.satfinite); A and B are passed as 64-bit descriptors
+struct MMA_64x48x32_S32S8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;          // fma() takes no separate destination operands; d00..d23 are updated in place
+  using ARegisters = uint64_t[1];   // one 64-bit value: desc_a
+  using BRegisters = uint64_t[1];   // one 64-bit value: desc_b
+  using CRegisters = uint32_t[24];  // 24 x 32-bit accumulators: d00..d23
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and tested != 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %26, 0;\n"  // p = (scale_D != 0); %26 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n48k32.s32.s8.u8.satfinite "  // same mnemonic as the non-SATURATE struct plus .satfinite
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      " %24,"  // desc_a
+      " %25,"  // desc_b
+      " p;\n"  // NOTE(review): presumably the PTX scale-d predicate (false => old accumulator not added) -- confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%23: read-write ("+r") accumulator registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "l"(desc_a),  // input %24 (64-bit)
+         "l"(desc_b),  // input %25 (64-bit)
+         "r"(int32_t(scale_D)));  // input %26 (32-bit)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x32 TN S32+=S8*U8 -- wraps PTX wgmma.mma_async m64n80k32 (.s32.s8.u8); A and B are passed as 64-bit descriptors
+struct MMA_64x80x32_S32S8U8_SS_TN
+{
+  using DRegisters = void;          // fma() takes no separate destination operands; d00..d39 are updated in place
+  using ARegisters = uint64_t[1];   // one 64-bit value: desc_a
+  using BRegisters = uint64_t[1];   // one 64-bit value: desc_b
+  using CRegisters = uint32_t[40];  // 40 x 32-bit accumulators: d00..d39
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and tested != 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %42, 0;\n"  // p = (scale_D != 0); %42 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n80k32.s32.s8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      " %40,"  // desc_a
+      " %41,"  // desc_b
+      " p;\n"  // NOTE(review): presumably the PTX scale-d predicate (false => old accumulator not added) -- confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%39: read-write ("+r") accumulator registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "l"(desc_a),  // input %40 (64-bit)
+         "l"(desc_b),  // input %41 (64-bit)
+         "r"(int32_t(scale_D)));  // input %42 (32-bit)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x32 TN S32+=S8*U8, .satfinite form -- wraps PTX wgmma.mma_async m64n80k32 (.s32.s8.u8.satfinite); A and B are passed as 64-bit descriptors
+struct MMA_64x80x32_S32S8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;          // fma() takes no separate destination operands; d00..d39 are updated in place
+  using ARegisters = uint64_t[1];   // one 64-bit value: desc_a
+  using BRegisters = uint64_t[1];   // one 64-bit value: desc_b
+  using CRegisters = uint32_t[40];  // 40 x 32-bit accumulators: d00..d39
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and tested != 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %42, 0;\n"  // p = (scale_D != 0); %42 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n80k32.s32.s8.u8.satfinite "  // same mnemonic as the non-SATURATE struct plus .satfinite
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      " %40,"  // desc_a
+      " %41,"  // desc_b
+      " p;\n"  // NOTE(review): presumably the PTX scale-d predicate (false => old accumulator not added) -- confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%39: read-write ("+r") accumulator registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "l"(desc_a),  // input %40 (64-bit)
+         "l"(desc_b),  // input %41 (64-bit)
+         "r"(int32_t(scale_D)));  // input %42 (32-bit)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x32 TN S32+=S8*U8 -- wraps PTX wgmma.mma_async m64n112k32 (.s32.s8.u8); A and B are passed as 64-bit descriptors
+struct MMA_64x112x32_S32S8U8_SS_TN
+{
+  using DRegisters = void;          // fma() takes no separate destination operands; d00..d55 are updated in place
+  using ARegisters = uint64_t[1];   // one 64-bit value: desc_a
+  using BRegisters = uint64_t[1];   // one 64-bit value: desc_b
+  using CRegisters = uint32_t[56];  // 56 x 32-bit accumulators: d00..d55
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and tested != 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %58, 0;\n"  // p = (scale_D != 0); %58 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n112k32.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      " %56,"  // desc_a
+      " %57,"  // desc_b
+      " p;\n"  // NOTE(review): presumably the PTX scale-d predicate (false => old accumulator not added) -- confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%55: read-write ("+r") accumulator registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "l"(desc_a),  // input %56 (64-bit)
+         "l"(desc_b),  // input %57 (64-bit)
+         "r"(int32_t(scale_D)));  // input %58 (32-bit)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x32 TN S32+=S8*U8, .satfinite form -- wraps PTX wgmma.mma_async m64n112k32 (.s32.s8.u8.satfinite); A and B are passed as 64-bit descriptors
+struct MMA_64x112x32_S32S8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;          // fma() takes no separate destination operands; d00..d55 are updated in place
+  using ARegisters = uint64_t[1];   // one 64-bit value: desc_a
+  using BRegisters = uint64_t[1];   // one 64-bit value: desc_b
+  using CRegisters = uint32_t[56];  // 56 x 32-bit accumulators: d00..d55
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and tested != 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %58, 0;\n"  // p = (scale_D != 0); %58 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n112k32.s32.s8.u8.satfinite "  // same mnemonic as the non-SATURATE struct plus .satfinite
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      " %56,"  // desc_a
+      " %57,"  // desc_b
+      " p;\n"  // NOTE(review): presumably the PTX scale-d predicate (false => old accumulator not added) -- confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%55: read-write ("+r") accumulator registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "l"(desc_a),  // input %56 (64-bit)
+         "l"(desc_b),  // input %57 (64-bit)
+         "r"(int32_t(scale_D)));  // input %58 (32-bit)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x32 TN S32+=S8*U8 -- wraps PTX wgmma.mma_async m64n144k32 (.s32.s8.u8); A and B are passed as 64-bit descriptors
+struct MMA_64x144x32_S32S8U8_SS_TN
+{
+  using DRegisters = void;          // fma() takes no separate destination operands; d00..d71 are updated in place
+  using ARegisters = uint64_t[1];   // one 64-bit value: desc_a
+  using BRegisters = uint64_t[1];   // one 64-bit value: desc_b
+  using CRegisters = uint32_t[72];  // 72 x 32-bit accumulators: d00..d71
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and tested != 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %74, 0;\n"  // p = (scale_D != 0); %74 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n144k32.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      " %72,"  // desc_a
+      " %73,"  // desc_b
+      " p;\n"  // NOTE(review): presumably the PTX scale-d predicate (false => old accumulator not added) -- confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%71: read-write ("+r") accumulator registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
+      :  "l"(desc_a),  // input %72 (64-bit)
+         "l"(desc_b),  // input %73 (64-bit)
+         "r"(int32_t(scale_D)));  // input %74 (32-bit)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x32 TN S32+=S8*U8, .satfinite form -- wraps PTX wgmma.mma_async m64n144k32 (.s32.s8.u8.satfinite); A and B are passed as 64-bit descriptors
+struct MMA_64x144x32_S32S8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;          // fma() takes no separate destination operands; d00..d71 are updated in place
+  using ARegisters = uint64_t[1];   // one 64-bit value: desc_a
+  using BRegisters = uint64_t[1];   // one 64-bit value: desc_b
+  using CRegisters = uint32_t[72];  // 72 x 32-bit accumulators: d00..d71
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and tested != 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %74, 0;\n"  // p = (scale_D != 0); %74 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n144k32.s32.s8.u8.satfinite "  // same mnemonic as the non-SATURATE struct plus .satfinite
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      " %72,"  // desc_a
+      " %73,"  // desc_b
+      " p;\n"  // NOTE(review): presumably the PTX scale-d predicate (false => old accumulator not added) -- confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%71: read-write ("+r") accumulator registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
+      :  "l"(desc_a),  // input %72 (64-bit)
+         "l"(desc_b),  // input %73 (64-bit)
+         "r"(int32_t(scale_D)));  // input %74 (32-bit)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x32 TN S32+=S8*U8 -- wraps PTX wgmma.mma_async m64n160k32 (.s32.s8.u8); A and B are passed as 64-bit descriptors
+struct MMA_64x160x32_S32S8U8_SS_TN
+{
+  using DRegisters = void;          // fma() takes no separate destination operands; d00..d79 are updated in place
+  using ARegisters = uint64_t[1];   // one 64-bit value: desc_a
+  using BRegisters = uint64_t[1];   // one 64-bit value: desc_b
+  using CRegisters = uint32_t[80];  // 80 x 32-bit accumulators: d00..d79
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and tested != 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %82, 0;\n"  // p = (scale_D != 0); %82 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n160k32.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      " %80,"  // desc_a
+      " %81,"  // desc_b
+      " p;\n"  // NOTE(review): presumably the PTX scale-d predicate (false => old accumulator not added) -- confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%79: read-write ("+r") accumulator registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
+      :  "l"(desc_a),  // input %80 (64-bit)
+         "l"(desc_b),  // input %81 (64-bit)
+         "r"(int32_t(scale_D)));  // input %82 (32-bit)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x32 TN S32+=S8*U8, .satfinite form -- wraps PTX wgmma.mma_async m64n160k32 (.s32.s8.u8.satfinite); A and B are passed as 64-bit descriptors
+struct MMA_64x160x32_S32S8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;          // fma() takes no separate destination operands; d00..d79 are updated in place
+  using ARegisters = uint64_t[1];   // one 64-bit value: desc_a
+  using BRegisters = uint64_t[1];   // one 64-bit value: desc_b
+  using CRegisters = uint32_t[80];  // 80 x 32-bit accumulators: d00..d79
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and tested != 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %82, 0;\n"  // p = (scale_D != 0); %82 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n160k32.s32.s8.u8.satfinite "  // same mnemonic as the non-SATURATE struct plus .satfinite
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      " %80,"  // desc_a
+      " %81,"  // desc_b
+      " p;\n"  // NOTE(review): presumably the PTX scale-d predicate (false => old accumulator not added) -- confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%79: read-write ("+r") accumulator registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
+      :  "l"(desc_a),  // input %80 (64-bit)
+         "l"(desc_b),  // input %81 (64-bit)
+         "r"(int32_t(scale_D)));  // input %82 (32-bit)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x32 TN S32+=S8*U8 -- wraps PTX wgmma.mma_async.sync.aligned.m64n176k32.s32.s8.u8 (no .satfinite qualifier)
+struct MMA_64x176x32_S32S8U8_SS_TN
+{
+  using DRegisters = void;  // no separate D registers: the C registers below are bound read-write ("+r") in the asm
+  using ARegisters = uint64_t[1];  // A is passed as a single 64-bit value (desc_a)
+  using BRegisters = uint64_t[1];  // B is passed as a single 64-bit value (desc_b)
+  using CRegisters = uint32_t[88];  // 88 32-bit accumulator registers (d00..d87)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // NOTE(review): presumably a shared-memory matrix descriptor for A (cf. synclog_emit_wgmma_smem_smem) - confirm
+      uint64_t const& desc_b,  // NOTE(review): presumably a shared-memory matrix descriptor for B - confirm
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t below; a non-zero value sets predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, called before the instruction is issued
+    asm volatile(
+    "{\n"  // PTX scope so the predicate declared below stays local to this asm statement
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %90, 0;\n"  // p = (scale_D != 0); %90 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n176k32.s32.s8.u8 "  // operands: {88 accumulators}, desc_a, desc_b, p
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      " %88,"  // desc_a
+      " %89,"  // desc_b
+      " p;\n"  // scale-d predicate; per the PTX ISA, false means the prior accumulator contents are not added
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%87: accumulators, bound as read-write 32-bit registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
+      :  "l"(desc_a),  // %88, 64-bit register
+         "l"(desc_b),  // %89, 64-bit register
+         "r"(int32_t(scale_D)));  // %90, 32-bit register
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x32 TN S32+=S8*U8, saturating -- wraps PTX wgmma.mma_async.sync.aligned.m64n176k32.s32.s8.u8.satfinite
+struct MMA_64x176x32_S32S8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D registers: the C registers below are bound read-write ("+r") in the asm
+  using ARegisters = uint64_t[1];  // A is passed as a single 64-bit value (desc_a)
+  using BRegisters = uint64_t[1];  // B is passed as a single 64-bit value (desc_b)
+  using CRegisters = uint32_t[88];  // 88 32-bit accumulator registers (d00..d87)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // NOTE(review): presumably a shared-memory matrix descriptor for A (cf. synclog_emit_wgmma_smem_smem) - confirm
+      uint64_t const& desc_b,  // NOTE(review): presumably a shared-memory matrix descriptor for B - confirm
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t below; a non-zero value sets predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, called before the instruction is issued
+    asm volatile(
+    "{\n"  // PTX scope so the predicate declared below stays local to this asm statement
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %90, 0;\n"  // p = (scale_D != 0); %90 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n176k32.s32.s8.u8.satfinite "  // .satfinite: per the PTX ISA, the s32 result is clamped on overflow
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      " %88,"  // desc_a
+      " %89,"  // desc_b
+      " p;\n"  // scale-d predicate; per the PTX ISA, false means the prior accumulator contents are not added
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%87: accumulators, bound as read-write 32-bit registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
+      :  "l"(desc_a),  // %88, 64-bit register
+         "l"(desc_b),  // %89, 64-bit register
+         "r"(int32_t(scale_D)));  // %90, 32-bit register
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x32 TN S32+=S8*U8 -- wraps PTX wgmma.mma_async.sync.aligned.m64n208k32.s32.s8.u8 (no .satfinite qualifier)
+struct MMA_64x208x32_S32S8U8_SS_TN
+{
+  using DRegisters = void;  // no separate D registers: the C registers below are bound read-write ("+r") in the asm
+  using ARegisters = uint64_t[1];  // A is passed as a single 64-bit value (desc_a)
+  using BRegisters = uint64_t[1];  // B is passed as a single 64-bit value (desc_b)
+  using CRegisters = uint32_t[104];  // 104 32-bit accumulator registers (d000..d103)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // NOTE(review): presumably a shared-memory matrix descriptor for A (cf. synclog_emit_wgmma_smem_smem) - confirm
+      uint64_t const& desc_b,  // NOTE(review): presumably a shared-memory matrix descriptor for B - confirm
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t below; a non-zero value sets predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, called before the instruction is issued
+    asm volatile(
+    "{\n"  // PTX scope so the predicate declared below stays local to this asm statement
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %106, 0;\n"  // p = (scale_D != 0); %106 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n208k32.s32.s8.u8 "  // operands: {104 accumulators}, desc_a, desc_b, p
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      " %104,"  // desc_a
+      " %105,"  // desc_b
+      " p;\n"  // scale-d predicate; per the PTX ISA, false means the prior accumulator contents are not added
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // %0..%103: accumulators, bound as read-write 32-bit registers
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
+      :  "l"(desc_a),  // %104, 64-bit register
+         "l"(desc_b),  // %105, 64-bit register
+         "r"(int32_t(scale_D)));  // %106, 32-bit register
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x32 TN S32+=S8*U8, saturating -- wraps PTX wgmma.mma_async.sync.aligned.m64n208k32.s32.s8.u8.satfinite
+struct MMA_64x208x32_S32S8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D registers: the C registers below are bound read-write ("+r") in the asm
+  using ARegisters = uint64_t[1];  // A is passed as a single 64-bit value (desc_a)
+  using BRegisters = uint64_t[1];  // B is passed as a single 64-bit value (desc_b)
+  using CRegisters = uint32_t[104];  // 104 32-bit accumulator registers (d000..d103)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // NOTE(review): presumably a shared-memory matrix descriptor for A (cf. synclog_emit_wgmma_smem_smem) - confirm
+      uint64_t const& desc_b,  // NOTE(review): presumably a shared-memory matrix descriptor for B - confirm
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t below; a non-zero value sets predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, called before the instruction is issued
+    asm volatile(
+    "{\n"  // PTX scope so the predicate declared below stays local to this asm statement
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %106, 0;\n"  // p = (scale_D != 0); %106 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n208k32.s32.s8.u8.satfinite "  // .satfinite: per the PTX ISA, the s32 result is clamped on overflow
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      " %104,"  // desc_a
+      " %105,"  // desc_b
+      " p;\n"  // scale-d predicate; per the PTX ISA, false means the prior accumulator contents are not added
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // %0..%103: accumulators, bound as read-write 32-bit registers
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
+      :  "l"(desc_a),  // %104, 64-bit register
+         "l"(desc_b),  // %105, 64-bit register
+         "r"(int32_t(scale_D)));  // %106, 32-bit register
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x32 TN S32+=S8*U8 -- wraps PTX wgmma.mma_async.sync.aligned.m64n224k32.s32.s8.u8 (no .satfinite qualifier)
+struct MMA_64x224x32_S32S8U8_SS_TN
+{
+  using DRegisters = void;  // no separate D registers: the C registers below are bound read-write ("+r") in the asm
+  using ARegisters = uint64_t[1];  // A is passed as a single 64-bit value (desc_a)
+  using BRegisters = uint64_t[1];  // B is passed as a single 64-bit value (desc_b)
+  using CRegisters = uint32_t[112];  // 112 32-bit accumulator registers (d000..d111)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // NOTE(review): presumably a shared-memory matrix descriptor for A (cf. synclog_emit_wgmma_smem_smem) - confirm
+      uint64_t const& desc_b,  // NOTE(review): presumably a shared-memory matrix descriptor for B - confirm
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t below; a non-zero value sets predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, called before the instruction is issued
+    asm volatile(
+    "{\n"  // PTX scope so the predicate declared below stays local to this asm statement
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %114, 0;\n"  // p = (scale_D != 0); %114 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n224k32.s32.s8.u8 "  // operands: {112 accumulators}, desc_a, desc_b, p
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      " %112,"  // desc_a
+      " %113,"  // desc_b
+      " p;\n"  // scale-d predicate; per the PTX ISA, false means the prior accumulator contents are not added
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // %0..%111: accumulators, bound as read-write 32-bit registers
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
+      :  "l"(desc_a),  // %112, 64-bit register
+         "l"(desc_b),  // %113, 64-bit register
+         "r"(int32_t(scale_D)));  // %114, 32-bit register
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x32 TN S32+=S8*U8, saturating -- wraps PTX wgmma.mma_async.sync.aligned.m64n224k32.s32.s8.u8.satfinite
+struct MMA_64x224x32_S32S8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D registers: the C registers below are bound read-write ("+r") in the asm
+  using ARegisters = uint64_t[1];  // A is passed as a single 64-bit value (desc_a)
+  using BRegisters = uint64_t[1];  // B is passed as a single 64-bit value (desc_b)
+  using CRegisters = uint32_t[112];  // 112 32-bit accumulator registers (d000..d111)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // NOTE(review): presumably a shared-memory matrix descriptor for A (cf. synclog_emit_wgmma_smem_smem) - confirm
+      uint64_t const& desc_b,  // NOTE(review): presumably a shared-memory matrix descriptor for B - confirm
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t below; a non-zero value sets predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, called before the instruction is issued
+    asm volatile(
+    "{\n"  // PTX scope so the predicate declared below stays local to this asm statement
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %114, 0;\n"  // p = (scale_D != 0); %114 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n224k32.s32.s8.u8.satfinite "  // .satfinite: per the PTX ISA, the s32 result is clamped on overflow
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      " %112,"  // desc_a
+      " %113,"  // desc_b
+      " p;\n"  // scale-d predicate; per the PTX ISA, false means the prior accumulator contents are not added
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // %0..%111: accumulators, bound as read-write 32-bit registers
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
+      :  "l"(desc_a),  // %112, 64-bit register
+         "l"(desc_b),  // %113, 64-bit register
+         "r"(int32_t(scale_D)));  // %114, 32-bit register
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x32 TN S32+=S8*U8 -- SS form (A and B each given as one 64-bit descriptor), non-saturating opcode
+struct MMA_64x240x32_S32S8U8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches the single desc_a argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[120];  // matches the 120 accumulator arguments d000..d119 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this source line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the braced PTX scope
+      "setp.ne.b32 p, %122, 0;\n"  // p = (scale_D != 0); %122 is the final input operand
+      "wgmma.mma_async.sync.aligned.m64n240k32.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"  // %0..%119 are the accumulators d000..d119
+      " %120,"  // desc_a
+      " %121,"  // desc_b
+      " p;\n"  // predicate formed from scale_D
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // +r constraint: each accumulator is a read-write operand
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
+      :  "l"(desc_a),  // %120
+         "l"(desc_b),  // %121
+         "r"(int32_t(scale_D)));  // %122
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x32 TN S32+=S8*U8 -- SS form (A and B each given as one 64-bit descriptor), .satfinite opcode
+struct MMA_64x240x32_S32S8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches the single desc_a argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[120];  // matches the 120 accumulator arguments d000..d119 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this source line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the braced PTX scope
+      "setp.ne.b32 p, %122, 0;\n"  // p = (scale_D != 0); %122 is the final input operand
+      "wgmma.mma_async.sync.aligned.m64n240k32.s32.s8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"  // %0..%119 are the accumulators d000..d119
+      " %120,"  // desc_a
+      " %121,"  // desc_b
+      " p;\n"  // predicate formed from scale_D
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // +r constraint: each accumulator is a read-write operand
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
+      :  "l"(desc_a),  // %120
+         "l"(desc_b),  // %121
+         "r"(int32_t(scale_D)));  // %122
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x32 TN S32+=S8*U8 -- RS form (A in four 32-bit registers, B as one 64-bit descriptor), non-saturating opcode
+struct MMA_64x24x32_S32S8U8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches the four arguments a00..a03 of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[12];  // matches the 12 accumulator arguments d00..d11 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line number and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the braced PTX scope
+      "setp.ne.b32 p, %17, 0;\n"  // p = (scale_D != 0); %17 is the final input operand
+      "wgmma.mma_async.sync.aligned.m64n24k32.s32.s8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"  // %0..%11 are the accumulators d00..d11
+      "{%12, %13, %14, %15},"  // a00..a03
+      " %16,"  // desc_b
+      " p;\n"  // predicate formed from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // +r constraint: each accumulator is a read-write operand
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %12..%15
+         "l"(desc_b),  // %16
+         "r"(int32_t(scale_D)));  // %17
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x32 TN S32+=S8*U8 -- RS form (A in four 32-bit registers, B as one 64-bit descriptor), .satfinite opcode
+struct MMA_64x24x32_S32S8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches the four arguments a00..a03 of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[12];  // matches the 12 accumulator arguments d00..d11 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line number and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the braced PTX scope
+      "setp.ne.b32 p, %17, 0;\n"  // p = (scale_D != 0); %17 is the final input operand
+      "wgmma.mma_async.sync.aligned.m64n24k32.s32.s8.u8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"  // %0..%11 are the accumulators d00..d11
+      "{%12, %13, %14, %15},"  // a00..a03
+      " %16,"  // desc_b
+      " p;\n"  // predicate formed from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // +r constraint: each accumulator is a read-write operand
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %12..%15
+         "l"(desc_b),  // %16
+         "r"(int32_t(scale_D)));  // %17
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x32 TN S32+=S8*U8 -- RS form (A in four 32-bit registers, B as one 64-bit descriptor), non-saturating opcode
+struct MMA_64x48x32_S32S8U8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches the four arguments a00..a03 of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[24];  // matches the 24 accumulator arguments d00..d23 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line number and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the braced PTX scope
+      "setp.ne.b32 p, %29, 0;\n"  // p = (scale_D != 0); %29 is the final input operand
+      "wgmma.mma_async.sync.aligned.m64n48k32.s32.s8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"  // %0..%23 are the accumulators d00..d23
+      "{%24, %25, %26, %27},"  // a00..a03
+      " %28,"  // desc_b
+      " p;\n"  // predicate formed from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // +r constraint: each accumulator is a read-write operand
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %24..%27
+         "l"(desc_b),  // %28
+         "r"(int32_t(scale_D)));  // %29
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x32 TN S32+=S8*U8 -- RS form (A in four 32-bit registers, B as one 64-bit descriptor), .satfinite opcode
+struct MMA_64x48x32_S32S8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches the four arguments a00..a03 of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[24];  // matches the 24 accumulator arguments d00..d23 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line number and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the braced PTX scope
+      "setp.ne.b32 p, %29, 0;\n"  // p = (scale_D != 0); %29 is the final input operand
+      "wgmma.mma_async.sync.aligned.m64n48k32.s32.s8.u8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"  // %0..%23 are the accumulators d00..d23
+      "{%24, %25, %26, %27},"  // a00..a03
+      " %28,"  // desc_b
+      " p;\n"  // predicate formed from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // +r constraint: each accumulator is a read-write operand
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %24..%27
+         "l"(desc_b),  // %28
+         "r"(int32_t(scale_D)));  // %29
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x32 TN S32+=S8*U8 -- RS form (A in four 32-bit registers, B as one 64-bit descriptor), non-saturating opcode
+struct MMA_64x80x32_S32S8U8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches the four arguments a00..a03 of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[40];  // matches the 40 accumulator arguments d00..d39 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line number and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the braced PTX scope
+      "setp.ne.b32 p, %45, 0;\n"  // p = (scale_D != 0); %45 is the final input operand
+      "wgmma.mma_async.sync.aligned.m64n80k32.s32.s8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"  // %0..%39 are the accumulators d00..d39
+      "{%40, %41, %42, %43},"  // a00..a03
+      " %44,"  // desc_b
+      " p;\n"  // predicate formed from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // +r constraint: each accumulator is a read-write operand
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %40..%43
+         "l"(desc_b),  // %44
+         "r"(int32_t(scale_D)));  // %45
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x32 TN S32+=S8*U8 -- RS form (A in four 32-bit registers, B as one 64-bit descriptor), .satfinite opcode
+struct MMA_64x80x32_S32S8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches the four arguments a00..a03 of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[40];  // matches the 40 accumulator arguments d00..d39 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line number and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the braced PTX scope
+      "setp.ne.b32 p, %45, 0;\n"  // p = (scale_D != 0); %45 is the final input operand
+      "wgmma.mma_async.sync.aligned.m64n80k32.s32.s8.u8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"  // %0..%39 are the accumulators d00..d39
+      "{%40, %41, %42, %43},"  // a00..a03
+      " %44,"  // desc_b
+      " p;\n"  // predicate formed from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // +r constraint: each accumulator is a read-write operand
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %40..%43
+         "l"(desc_b),  // %44
+         "r"(int32_t(scale_D)));  // %45
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x32 TN S32+=S8*U8 -- RS form (A in four 32-bit registers, B as one 64-bit descriptor), non-saturating opcode
+struct MMA_64x112x32_S32S8U8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches the four arguments a00..a03 of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[56];  // matches the 56 accumulator arguments d00..d55 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line number and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the braced PTX scope
+      "setp.ne.b32 p, %61, 0;\n"  // p = (scale_D != 0); %61 is the final input operand
+      "wgmma.mma_async.sync.aligned.m64n112k32.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"  // %0..%55 are the accumulators d00..d55
+      "{%56,  %57,  %58,  %59},"  // a00..a03
+      " %60,"  // desc_b
+      " p;\n"  // predicate formed from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // +r constraint: each accumulator is a read-write operand
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %56..%59
+         "l"(desc_b),  // %60
+         "r"(int32_t(scale_D)));  // %61
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x32 TN S32+=S8*U8 -- RS form (A in four 32-bit registers, B as one 64-bit descriptor), .satfinite opcode
+struct MMA_64x112x32_S32S8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[56];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %61, 0;\n"
+      "wgmma.mma_async.sync.aligned.m64n112k32.s32.s8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      "{%56,  %57,  %58,  %59},"
+      " %60,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x32 TN S32+=S8*U8, non-saturating form (no .satfinite qualifier in the PTX below)
+struct MMA_64x144x32_S32S8U8_RS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably "D aliases C" (d* updated in place) -- confirm
+  using ARegisters = uint32_t[4];   // same count as fma() arguments a00..a03
+  using BRegisters = uint64_t[1];   // same count as fma() argument desc_b
+  using CRegisters = uint32_t[72];  // same count as fma() arguments d00..d71
+  // Emits one wgmma.mma_async m64n144k32 instruction; d00..d71 are read-write ("+r") asm operands.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %77, 0;\n"  // %77 = int32_t(scale_D): 72 outputs, then a00..a03, desc_b, scale_D
+      "wgmma.mma_async.sync.aligned.m64n144k32.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      "{%72,  %73,  %74,  %75},"  // a00..a03
+      " %76,"                     // desc_b
+      " p;\n"                     // predicate computed by setp above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x32 TN S32+=S8*U8, saturating form (the PTX below carries the .satfinite qualifier)
+struct MMA_64x144x32_S32S8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;          // NOTE(review): presumably "D aliases C" (d* updated in place) -- confirm
+  using ARegisters = uint32_t[4];   // same count as fma() arguments a00..a03
+  using BRegisters = uint64_t[1];   // same count as fma() argument desc_b
+  using CRegisters = uint32_t[72];  // same count as fma() arguments d00..d71
+  // Emits one wgmma.mma_async m64n144k32 instruction; d00..d71 are read-write ("+r") asm operands.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %77, 0;\n"  // %77 = int32_t(scale_D): 72 outputs, then a00..a03, desc_b, scale_D
+      "wgmma.mma_async.sync.aligned.m64n144k32.s32.s8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      "{%72,  %73,  %74,  %75},"  // a00..a03
+      " %76,"                     // desc_b
+      " p;\n"                     // predicate computed by setp above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x32 TN S32+=S8*U8, non-saturating form (no .satfinite qualifier in the PTX below)
+struct MMA_64x160x32_S32S8U8_RS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably "D aliases C" (d* updated in place) -- confirm
+  using ARegisters = uint32_t[4];   // same count as fma() arguments a00..a03
+  using BRegisters = uint64_t[1];   // same count as fma() argument desc_b
+  using CRegisters = uint32_t[80];  // same count as fma() arguments d00..d79
+  // Emits one wgmma.mma_async m64n160k32 instruction; d00..d79 are read-write ("+r") asm operands.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %85, 0;\n"  // %85 = int32_t(scale_D): 80 outputs, then a00..a03, desc_b, scale_D
+      "wgmma.mma_async.sync.aligned.m64n160k32.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      "{%80,  %81,  %82,  %83},"  // a00..a03
+      " %84,"                     // desc_b
+      " p;\n"                     // predicate computed by setp above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x32 TN S32+=S8*U8, saturating form (the PTX below carries the .satfinite qualifier)
+struct MMA_64x160x32_S32S8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;          // NOTE(review): presumably "D aliases C" (d* updated in place) -- confirm
+  using ARegisters = uint32_t[4];   // same count as fma() arguments a00..a03
+  using BRegisters = uint64_t[1];   // same count as fma() argument desc_b
+  using CRegisters = uint32_t[80];  // same count as fma() arguments d00..d79
+  // Emits one wgmma.mma_async m64n160k32 instruction; d00..d79 are read-write ("+r") asm operands.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %85, 0;\n"  // %85 = int32_t(scale_D): 80 outputs, then a00..a03, desc_b, scale_D
+      "wgmma.mma_async.sync.aligned.m64n160k32.s32.s8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      "{%80,  %81,  %82,  %83},"  // a00..a03
+      " %84,"                     // desc_b
+      " p;\n"                     // predicate computed by setp above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x32 TN S32+=S8*U8, non-saturating form (no .satfinite qualifier in the PTX below)
+struct MMA_64x176x32_S32S8U8_RS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably "D aliases C" (d* updated in place) -- confirm
+  using ARegisters = uint32_t[4];   // same count as fma() arguments a00..a03
+  using BRegisters = uint64_t[1];   // same count as fma() argument desc_b
+  using CRegisters = uint32_t[88];  // same count as fma() arguments d00..d87
+  // Emits one wgmma.mma_async m64n176k32 instruction; d00..d87 are read-write ("+r") asm operands.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %93, 0;\n"  // %93 = int32_t(scale_D): 88 outputs, then a00..a03, desc_b, scale_D
+      "wgmma.mma_async.sync.aligned.m64n176k32.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      "{%88,  %89,  %90,  %91},"  // a00..a03
+      " %92,"                     // desc_b
+      " p;\n"                     // predicate computed by setp above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x32 TN S32+=S8*U8, saturating form (the PTX below carries the .satfinite qualifier)
+struct MMA_64x176x32_S32S8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;          // NOTE(review): presumably "D aliases C" (d* updated in place) -- confirm
+  using ARegisters = uint32_t[4];   // same count as fma() arguments a00..a03
+  using BRegisters = uint64_t[1];   // same count as fma() argument desc_b
+  using CRegisters = uint32_t[88];  // same count as fma() arguments d00..d87
+  // Emits one wgmma.mma_async m64n176k32 instruction; d00..d87 are read-write ("+r") asm operands.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %93, 0;\n"  // %93 = int32_t(scale_D): 88 outputs, then a00..a03, desc_b, scale_D
+      "wgmma.mma_async.sync.aligned.m64n176k32.s32.s8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      "{%88,  %89,  %90,  %91},"  // a00..a03
+      " %92,"                     // desc_b
+      " p;\n"                     // predicate computed by setp above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x32 TN S32+=S8*U8
+struct MMA_64x208x32_S32S8U8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[104];
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma m64n208k32 (.s32.s8.u8) through inline PTX
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A operands: four 32-bit registers
+      uint64_t const& desc_b,  // B operand: passed to the instruction as one 64-bit value
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,  // d000..d103: read-write accumulators
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to int32_t and tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the { } block
+      "setp.ne.b32 p, %109, 0;\n"  // p = (%109 != 0), where %109 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n208k32.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%103 = d000..d103
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      "{%104, %105, %106, %107},"  // a000..a003
+      " %108,"  // desc_b
+      " p;\n"  // predicate computed by the setp above
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // "+r": each accumulator is both input and output
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %104..%107
+         "l"(desc_b),  // %108, 64-bit operand
+         "r"(int32_t(scale_D)));  // %109
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x32 TN S32+=S8*U8, .satfinite variant (A in 32-bit registers, B as a 64-bit descriptor value)
+struct MMA_64x208x32_S32S8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // fma takes no separate output operands; d000.. are updated in place
+  using ARegisters = uint32_t[4];  // four 32-bit A operands (a000..a003 in fma)
+  using BRegisters = uint64_t[1];  // one 64-bit B operand (desc_b in fma)
+  using CRegisters = uint32_t[104];  // 104 32-bit accumulators (d000..d103 in fma)
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma m64n208k32 (.s32.s8.u8.satfinite) through inline PTX
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A operands: four 32-bit registers
+      uint64_t const& desc_b,  // B operand: passed to the instruction as one 64-bit value
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,  // d000..d103: read-write accumulators
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to int32_t and tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the { } block
+      "setp.ne.b32 p, %109, 0;\n"  // p = (%109 != 0), where %109 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n208k32.s32.s8.u8.satfinite "  // only textual difference from the non-SATURATE opcode is .satfinite
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%103 = d000..d103
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      "{%104, %105, %106, %107},"  // a000..a003
+      " %108,"  // desc_b
+      " p;\n"  // predicate computed by the setp above
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // "+r": each accumulator is both input and output
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %104..%107
+         "l"(desc_b),  // %108, 64-bit operand
+         "r"(int32_t(scale_D)));  // %109
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x32 TN S32+=S8*U8 (A in 32-bit registers, B as a 64-bit descriptor value; no .satfinite)
+struct MMA_64x224x32_S32S8U8_RS_TN
+{
+  using DRegisters = void;  // fma takes no separate output operands; d000.. are updated in place
+  using ARegisters = uint32_t[4];  // four 32-bit A operands (a000..a003 in fma)
+  using BRegisters = uint64_t[1];  // one 64-bit B operand (desc_b in fma)
+  using CRegisters = uint32_t[112];  // 112 32-bit accumulators (d000..d111 in fma)
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma m64n224k32 (.s32.s8.u8) through inline PTX
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A operands: four 32-bit registers
+      uint64_t const& desc_b,  // B operand: passed to the instruction as one 64-bit value
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,  // d000..d111: read-write accumulators
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to int32_t and tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the { } block
+      "setp.ne.b32 p, %117, 0;\n"  // p = (%117 != 0), where %117 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n224k32.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%111 = d000..d111
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      "{%112, %113, %114, %115},"  // a000..a003
+      " %116,"  // desc_b
+      " p;\n"  // predicate computed by the setp above
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // "+r": each accumulator is both input and output
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %112..%115
+         "l"(desc_b),  // %116, 64-bit operand
+         "r"(int32_t(scale_D)));  // %117
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x32 TN S32+=S8*U8, .satfinite variant (A in 32-bit registers, B as a 64-bit descriptor value)
+struct MMA_64x224x32_S32S8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // fma takes no separate output operands; d000.. are updated in place
+  using ARegisters = uint32_t[4];  // four 32-bit A operands (a000..a003 in fma)
+  using BRegisters = uint64_t[1];  // one 64-bit B operand (desc_b in fma)
+  using CRegisters = uint32_t[112];  // 112 32-bit accumulators (d000..d111 in fma)
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma m64n224k32 (.s32.s8.u8.satfinite) through inline PTX
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A operands: four 32-bit registers
+      uint64_t const& desc_b,  // B operand: passed to the instruction as one 64-bit value
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,  // d000..d111: read-write accumulators
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to int32_t and tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the { } block
+      "setp.ne.b32 p, %117, 0;\n"  // p = (%117 != 0), where %117 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n224k32.s32.s8.u8.satfinite "  // only textual difference from the non-SATURATE opcode is .satfinite
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%111 = d000..d111
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      "{%112, %113, %114, %115},"  // a000..a003
+      " %116,"  // desc_b
+      " p;\n"  // predicate computed by the setp above
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // "+r": each accumulator is both input and output
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %112..%115
+         "l"(desc_b),  // %116, 64-bit operand
+         "r"(int32_t(scale_D)));  // %117
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x32 TN S32+=S8*U8 (A in 32-bit registers, B as a 64-bit descriptor value; no .satfinite)
+struct MMA_64x240x32_S32S8U8_RS_TN
+{
+  using DRegisters = void;  // fma takes no separate output operands; d000.. are updated in place
+  using ARegisters = uint32_t[4];  // four 32-bit A operands (a000..a003 in fma)
+  using BRegisters = uint64_t[1];  // one 64-bit B operand (desc_b in fma)
+  using CRegisters = uint32_t[120];  // 120 32-bit accumulators (d000..d119 in fma)
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma m64n240k32 (.s32.s8.u8) through inline PTX
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A operands: four 32-bit registers
+      uint64_t const& desc_b,  // B operand: passed to the instruction as one 64-bit value
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,  // d000..d119: read-write accumulators
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to int32_t and tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the { } block
+      "setp.ne.b32 p, %125, 0;\n"  // p = (%125 != 0), where %125 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n240k32.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%119 = d000..d119
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      "{%120, %121, %122, %123},"  // a000..a003
+      " %124,"  // desc_b
+      " p;\n"  // predicate computed by the setp above
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // "+r": each accumulator is both input and output
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %120..%123
+         "l"(desc_b),  // %124, 64-bit operand
+         "r"(int32_t(scale_D)));  // %125
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x32 TN S32+=S8*U8
+struct MMA_64x240x32_S32S8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[120];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %125, 0;\n"
+      "wgmma.mma_async.sync.aligned.m64n240k32.s32.s8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      "{%120, %121, %122, %123},"
+      " %124,"
+      " p;\n"
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x32 TN S32+=U8*S8 (SS: A and B are each passed as one 64-bit descriptor)
+struct MMA_64x24x32_S32U8S8_SS_TN
+{
+  using DRegisters = void;  // no separate D type: fma() takes the d.. registers by non-const reference
+  using ARegisters = uint64_t[1];  // matches fma()'s single uint64_t desc_a
+  using BRegisters = uint64_t[1];  // matches fma()'s single uint64_t desc_b
+  using CRegisters = uint32_t[12];  // one 32-bit slot per accumulator argument d00..d11
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %14, 0;\n"  // p = (scale_D != 0); %14 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n24k32.s32.u8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"  // %0..%11: d00..d11, bound as "+r" (read and written)
+      " %12,"  // desc_a
+      " %13,"  // desc_b
+      " p;\n"  // predicate set by setp above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x32 TN S32+=U8*S8, SS operands, issued with the .satfinite qualifier
+struct MMA_64x24x32_S32U8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D type: fma() takes the d.. registers by non-const reference
+  using ARegisters = uint64_t[1];  // matches fma()'s single uint64_t desc_a
+  using BRegisters = uint64_t[1];  // matches fma()'s single uint64_t desc_b
+  using CRegisters = uint32_t[12];  // one 32-bit slot per accumulator argument d00..d11
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %14, 0;\n"  // p = (scale_D != 0); %14 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n24k32.s32.u8.s8.satfinite "  // only textual difference from the non-SATURATE struct: .satfinite
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"  // %0..%11: d00..d11, bound as "+r" (read and written)
+      " %12,"  // desc_a
+      " %13,"  // desc_b
+      " p;\n"  // predicate set by setp above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x32 TN S32+=U8*S8 (SS: A and B are each passed as one 64-bit descriptor)
+struct MMA_64x48x32_S32U8S8_SS_TN
+{
+  using DRegisters = void;  // no separate D type: fma() takes the d.. registers by non-const reference
+  using ARegisters = uint64_t[1];  // matches fma()'s single uint64_t desc_a
+  using BRegisters = uint64_t[1];  // matches fma()'s single uint64_t desc_b
+  using CRegisters = uint32_t[24];  // one 32-bit slot per accumulator argument d00..d23
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %26, 0;\n"  // p = (scale_D != 0); %26 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n48k32.s32.u8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"  // %0..%23: d00..d23, bound as "+r" (read and written)
+      " %24,"  // desc_a
+      " %25,"  // desc_b
+      " p;\n"  // predicate set by setp above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x32 TN S32+=U8*S8, SS operands, issued with the .satfinite qualifier
+struct MMA_64x48x32_S32U8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D type: fma() takes the d.. registers by non-const reference
+  using ARegisters = uint64_t[1];  // matches fma()'s single uint64_t desc_a
+  using BRegisters = uint64_t[1];  // matches fma()'s single uint64_t desc_b
+  using CRegisters = uint32_t[24];  // one 32-bit slot per accumulator argument d00..d23
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %26, 0;\n"  // p = (scale_D != 0); %26 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n48k32.s32.u8.s8.satfinite "  // only textual difference from the non-SATURATE struct: .satfinite
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"  // %0..%23: d00..d23, bound as "+r" (read and written)
+      " %24,"  // desc_a
+      " %25,"  // desc_b
+      " p;\n"  // predicate set by setp above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x32 TN S32+=U8*S8 (SS: A and B are each passed as one 64-bit descriptor)
+struct MMA_64x80x32_S32U8S8_SS_TN
+{
+  using DRegisters = void;  // no separate D type: fma() takes the d.. registers by non-const reference
+  using ARegisters = uint64_t[1];  // matches fma()'s single uint64_t desc_a
+  using BRegisters = uint64_t[1];  // matches fma()'s single uint64_t desc_b
+  using CRegisters = uint32_t[40];  // one 32-bit slot per accumulator argument d00..d39
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %42, 0;\n"  // p = (scale_D != 0); %42 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n80k32.s32.u8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"  // %0..%39: d00..d39, bound as "+r" (read and written)
+      " %40,"  // desc_a
+      " %41,"  // desc_b
+      " p;\n"  // predicate set by setp above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x32 TN S32+=U8*S8, SS operands, issued with the .satfinite qualifier
+struct MMA_64x80x32_S32U8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D type: fma() takes the d.. registers by non-const reference
+  using ARegisters = uint64_t[1];  // matches fma()'s single uint64_t desc_a
+  using BRegisters = uint64_t[1];  // matches fma()'s single uint64_t desc_b
+  using CRegisters = uint32_t[40];  // one 32-bit slot per accumulator argument d00..d39
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %42, 0;\n"  // p = (scale_D != 0); %42 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n80k32.s32.u8.s8.satfinite "  // only textual difference from the non-SATURATE struct: .satfinite
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"  // %0..%39: d00..d39, bound as "+r" (read and written)
+      " %40,"  // desc_a
+      " %41,"  // desc_b
+      " p;\n"  // predicate set by setp above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x32 TN S32+=U8*S8 (SS: A and B are each passed as one 64-bit descriptor)
+struct MMA_64x112x32_S32U8S8_SS_TN
+{
+  using DRegisters = void;  // no separate D type: fma() takes the d.. registers by non-const reference
+  using ARegisters = uint64_t[1];  // matches fma()'s single uint64_t desc_a
+  using BRegisters = uint64_t[1];  // matches fma()'s single uint64_t desc_b
+  using CRegisters = uint32_t[56];  // one 32-bit slot per accumulator argument d00..d55
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %58, 0;\n"  // p = (scale_D != 0); %58 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n112k32.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"  // %0..%55: d00..d55, bound as "+r" (read and written)
+      " %56,"  // desc_a
+      " %57,"  // desc_b
+      " p;\n"  // predicate set by setp above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x32 TN S32+=U8*S8, SS operands, issued with the .satfinite qualifier
+struct MMA_64x112x32_S32U8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D type: fma() takes the d.. registers by non-const reference
+  using ARegisters = uint64_t[1];  // matches fma()'s single uint64_t desc_a
+  using BRegisters = uint64_t[1];  // matches fma()'s single uint64_t desc_b
+  using CRegisters = uint32_t[56];  // one 32-bit slot per accumulator argument d00..d55
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %58, 0;\n"  // p = (scale_D != 0); %58 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n112k32.s32.u8.s8.satfinite "  // only textual difference from the non-SATURATE struct: .satfinite
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"  // %0..%55: d00..d55, bound as "+r" (read and written)
+      " %56,"  // desc_a
+      " %57,"  // desc_b
+      " p;\n"  // predicate set by setp above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x32 TN S32+=U8*S8 (SS: A and B are each passed as one 64-bit descriptor)
+struct MMA_64x144x32_S32U8S8_SS_TN
+{
+  using DRegisters = void;  // no separate D type: fma() takes the d.. registers by non-const reference
+  using ARegisters = uint64_t[1];  // matches fma()'s single uint64_t desc_a
+  using BRegisters = uint64_t[1];  // matches fma()'s single uint64_t desc_b
+  using CRegisters = uint32_t[72];  // one 32-bit slot per accumulator argument d00..d71
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %74, 0;\n"  // p = (scale_D != 0); %74 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n144k32.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"  // %0..%71: d00..d71, bound as "+r" (read and written)
+      " %72,"  // desc_a
+      " %73,"  // desc_b
+      " p;\n"  // predicate set by setp above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x32 TN S32+=U8*S8, SS operands, issued with the .satfinite qualifier
+struct MMA_64x144x32_S32U8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D type: fma() takes the d.. registers by non-const reference
+  using ARegisters = uint64_t[1];  // matches fma()'s single uint64_t desc_a
+  using BRegisters = uint64_t[1];  // matches fma()'s single uint64_t desc_b
+  using CRegisters = uint32_t[72];  // one 32-bit slot per accumulator argument d00..d71
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %74, 0;\n"  // p = (scale_D != 0); %74 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n144k32.s32.u8.s8.satfinite "  // only textual difference from the non-SATURATE struct: .satfinite
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"  // %0..%71: d00..d71, bound as "+r" (read and written)
+      " %72,"  // desc_a
+      " %73,"  // desc_b
+      " p;\n"  // predicate set by setp above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x32 TN S32+=U8*S8 -- wraps PTX wgmma.mma_async.sync.aligned.m64n160k32.s32.u8.s8; A and B are each passed as one 64-bit descriptor value.
+struct MMA_64x160x32_S32U8S8_SS_TN
+{
+  using DRegisters = void;           // no separate D operand list: the accumulators are bound read-write ("+r") and updated in place
+  using ARegisters = uint64_t[1];    // one 64-bit value for A (desc_a)
+  using BRegisters = uint64_t[1];    // one 64-bit value for B (desc_b)
+  using CRegisters = uint32_t[80];   // 80 32-bit accumulator slots, one per d00..d79 argument of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,        // becomes asm operand %80
+      uint64_t const& desc_b,        // becomes asm operand %81
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d79: accumulators, asm operands %0..%79
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared with 0 to form predicate p (asm operand %82)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes the source line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %82, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n160k32.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      " %80,"  // desc_a
+      " %81,"  // desc_b
+      " p;\n"  // trailing predicate operand; per the PTX wgmma spec this is scale-d (false => accumulators are not added in) -- confirm against the PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%79: read-write 32-bit registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
+      :  "l"(desc_a),  // input %80: 64-bit register
+         "l"(desc_b),  // input %81: 64-bit register
+         "r"(int32_t(scale_D)));  // input %82: 32-bit register read by setp above
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report the misuse instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x32 TN S32+=U8*S8 (saturating) -- wraps PTX wgmma.mma_async.sync.aligned.m64n160k32.s32.u8.s8.satfinite; A and B are each passed as one 64-bit descriptor value.
+struct MMA_64x160x32_S32U8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;           // no separate D operand list: the accumulators are bound read-write ("+r") and updated in place
+  using ARegisters = uint64_t[1];    // one 64-bit value for A (desc_a)
+  using BRegisters = uint64_t[1];    // one 64-bit value for B (desc_b)
+  using CRegisters = uint32_t[80];   // 80 32-bit accumulator slots, one per d00..d79 argument of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,        // becomes asm operand %80
+      uint64_t const& desc_b,        // becomes asm operand %81
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d79: accumulators, asm operands %0..%79
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared with 0 to form predicate p (asm operand %82)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes the source line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %82, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n160k32.s32.u8.s8.satfinite "  // .satfinite is the only difference from the non-SATURATE struct; per the PTX spec it clamps s32 results on overflow -- confirm against the PTX ISA
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      " %80,"  // desc_a
+      " %81,"  // desc_b
+      " p;\n"  // trailing predicate operand; per the PTX wgmma spec this is scale-d (false => accumulators are not added in) -- confirm against the PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%79: read-write 32-bit registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
+      :  "l"(desc_a),  // input %80: 64-bit register
+         "l"(desc_b),  // input %81: 64-bit register
+         "r"(int32_t(scale_D)));  // input %82: 32-bit register read by setp above
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report the misuse instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x32 TN S32+=U8*S8 -- wraps PTX wgmma.mma_async.sync.aligned.m64n176k32.s32.u8.s8; A and B are each passed as one 64-bit descriptor value.
+struct MMA_64x176x32_S32U8S8_SS_TN
+{
+  using DRegisters = void;           // no separate D operand list: the accumulators are bound read-write ("+r") and updated in place
+  using ARegisters = uint64_t[1];    // one 64-bit value for A (desc_a)
+  using BRegisters = uint64_t[1];    // one 64-bit value for B (desc_b)
+  using CRegisters = uint32_t[88];   // 88 32-bit accumulator slots, one per d00..d87 argument of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,        // becomes asm operand %88
+      uint64_t const& desc_b,        // becomes asm operand %89
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d87: accumulators, asm operands %0..%87
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared with 0 to form predicate p (asm operand %90)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes the source line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %90, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n176k32.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      " %88,"  // desc_a
+      " %89,"  // desc_b
+      " p;\n"  // trailing predicate operand; per the PTX wgmma spec this is scale-d (false => accumulators are not added in) -- confirm against the PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%87: read-write 32-bit registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
+      :  "l"(desc_a),  // input %88: 64-bit register
+         "l"(desc_b),  // input %89: 64-bit register
+         "r"(int32_t(scale_D)));  // input %90: 32-bit register read by setp above
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report the misuse instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x32 TN S32+=U8*S8 (saturating) -- wraps PTX wgmma.mma_async.sync.aligned.m64n176k32.s32.u8.s8.satfinite; A and B are each passed as one 64-bit descriptor value.
+struct MMA_64x176x32_S32U8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;           // no separate D operand list: the accumulators are bound read-write ("+r") and updated in place
+  using ARegisters = uint64_t[1];    // one 64-bit value for A (desc_a)
+  using BRegisters = uint64_t[1];    // one 64-bit value for B (desc_b)
+  using CRegisters = uint32_t[88];   // 88 32-bit accumulator slots, one per d00..d87 argument of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,        // becomes asm operand %88
+      uint64_t const& desc_b,        // becomes asm operand %89
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d87: accumulators, asm operands %0..%87
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared with 0 to form predicate p (asm operand %90)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes the source line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %90, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n176k32.s32.u8.s8.satfinite "  // .satfinite is the only difference from the non-SATURATE struct; per the PTX spec it clamps s32 results on overflow -- confirm against the PTX ISA
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      " %88,"  // desc_a
+      " %89,"  // desc_b
+      " p;\n"  // trailing predicate operand; per the PTX wgmma spec this is scale-d (false => accumulators are not added in) -- confirm against the PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%87: read-write 32-bit registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
+      :  "l"(desc_a),  // input %88: 64-bit register
+         "l"(desc_b),  // input %89: 64-bit register
+         "r"(int32_t(scale_D)));  // input %90: 32-bit register read by setp above
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report the misuse instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x32 TN S32+=U8*S8 -- wraps PTX wgmma.mma_async.sync.aligned.m64n208k32.s32.u8.s8; A and B are each passed as one 64-bit descriptor value.
+struct MMA_64x208x32_S32U8S8_SS_TN
+{
+  using DRegisters = void;            // no separate D operand list: the accumulators are bound read-write ("+r") and updated in place
+  using ARegisters = uint64_t[1];     // one 64-bit value for A (desc_a)
+  using BRegisters = uint64_t[1];     // one 64-bit value for B (desc_b)
+  using CRegisters = uint32_t[104];   // 104 32-bit accumulator slots, one per d000..d103 argument of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,         // becomes asm operand %104
+      uint64_t const& desc_b,         // becomes asm operand %105
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,  // d000..d103: accumulators, asm operands %0..%103
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared with 0 to form predicate p (asm operand %106)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes the source line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %106, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n208k32.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      " %104,"  // desc_a
+      " %105,"  // desc_b
+      " p;\n"  // trailing predicate operand; per the PTX wgmma spec this is scale-d (false => accumulators are not added in) -- confirm against the PTX ISA
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // outputs %0..%103: read-write 32-bit registers
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
+      :  "l"(desc_a),  // input %104: 64-bit register
+         "l"(desc_b),  // input %105: 64-bit register
+         "r"(int32_t(scale_D)));  // input %106: 32-bit register read by setp above
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report the misuse instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x32 TN S32+=U8*S8 (saturating) -- wraps PTX wgmma.mma_async.sync.aligned.m64n208k32.s32.u8.s8.satfinite; A and B are each passed as one 64-bit descriptor value.
+struct MMA_64x208x32_S32U8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;            // no separate D operand list: the accumulators are bound read-write ("+r") and updated in place
+  using ARegisters = uint64_t[1];     // one 64-bit value for A (desc_a)
+  using BRegisters = uint64_t[1];     // one 64-bit value for B (desc_b)
+  using CRegisters = uint32_t[104];   // 104 32-bit accumulator slots, one per d000..d103 argument of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,         // becomes asm operand %104
+      uint64_t const& desc_b,         // becomes asm operand %105
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,  // d000..d103: accumulators, asm operands %0..%103
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared with 0 to form predicate p (asm operand %106)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes the source line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %106, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n208k32.s32.u8.s8.satfinite "  // .satfinite is the only difference from the non-SATURATE struct; per the PTX spec it clamps s32 results on overflow -- confirm against the PTX ISA
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      " %104,"  // desc_a
+      " %105,"  // desc_b
+      " p;\n"  // trailing predicate operand; per the PTX wgmma spec this is scale-d (false => accumulators are not added in) -- confirm against the PTX ISA
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // outputs %0..%103: read-write 32-bit registers
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
+      :  "l"(desc_a),  // input %104: 64-bit register
+         "l"(desc_b),  // input %105: 64-bit register
+         "r"(int32_t(scale_D)));  // input %106: 32-bit register read by setp above
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report the misuse instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x32 TN S32+=U8*S8
+struct MMA_64x224x32_S32U8S8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[112];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %114, 0;\n"
+      "wgmma.mma_async.sync.aligned.m64n224k32.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      " %112,"
+      " %113,"
+      " p;\n"
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x32 TN S32+=U8*S8 -- saturating variant: issues the `.satfinite` form; A and B are passed as 64-bit descriptors
+struct MMA_64x224x32_S32U8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D operand; the accumulators below are updated in place
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor (desc_a)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[112];  // accumulators: 112 32-bit registers (d000..d111)
+
+  CUTE_HOST_DEVICE static void  // issues one m64n224k32 wgmma.mma_async through inline PTX
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p inside the asm block
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the source line and both descriptors to the synclog helper
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %114, 0;\n"  // p = (scale_D != 0); %114 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n224k32.s32.u8.s8.satfinite "  // same mnemonic as the non-saturating struct plus `.satfinite`
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      " %112,"  // desc_a
+      " %113,"  // desc_b
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // %0..%111: accumulators, read-write ("+r")
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
+      :  "l"(desc_a),  // %112
+         "l"(desc_b),  // %113
+         "r"(int32_t(scale_D)));  // %114
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x32 TN S32+=U8*S8 -- A and B are both passed as 64-bit descriptors
+struct MMA_64x240x32_S32U8S8_SS_TN
+{
+  using DRegisters = void;  // no separate D operand; the accumulators below are updated in place
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor (desc_a)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[120];  // accumulators: 120 32-bit registers (d000..d119)
+
+  CUTE_HOST_DEVICE static void  // issues one m64n240k32 wgmma.mma_async through inline PTX
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p inside the asm block
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the source line and both descriptors to the synclog helper
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %122, 0;\n"  // p = (scale_D != 0); %122 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n240k32.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      " %120,"  // desc_a
+      " %121,"  // desc_b
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // %0..%119: accumulators, read-write ("+r")
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
+      :  "l"(desc_a),  // %120
+         "l"(desc_b),  // %121
+         "r"(int32_t(scale_D)));  // %122
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x32 TN S32+=U8*S8 -- saturating variant: issues the `.satfinite` form; A and B are passed as 64-bit descriptors
+struct MMA_64x240x32_S32U8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D operand; the accumulators below are updated in place
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor (desc_a)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[120];  // accumulators: 120 32-bit registers (d000..d119)
+
+  CUTE_HOST_DEVICE static void  // issues one m64n240k32 wgmma.mma_async through inline PTX
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p inside the asm block
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the source line and both descriptors to the synclog helper
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %122, 0;\n"  // p = (scale_D != 0); %122 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n240k32.s32.u8.s8.satfinite "  // same mnemonic as the non-saturating struct plus `.satfinite`
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      " %120,"  // desc_a
+      " %121,"  // desc_b
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // %0..%119: accumulators, read-write ("+r")
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
+      :  "l"(desc_a),  // %120
+         "l"(desc_b),  // %121
+         "r"(int32_t(scale_D)));  // %122
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x32 TN S32+=U8*S8 -- A is passed in four 32-bit registers, B as a 64-bit descriptor
+struct MMA_64x24x32_S32U8S8_RS_TN
+{
+  using DRegisters = void;  // no separate D operand; the accumulators below are updated in place
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[12];  // accumulators: 12 32-bit registers (d00..d11)
+
+  CUTE_HOST_DEVICE static void  // issues one m64n24k32 wgmma.mma_async through inline PTX
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p inside the asm block
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the source line and the B descriptor to the synclog helper
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %17, 0;\n"  // p = (scale_D != 0); %17 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n24k32.s32.u8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      "{%12, %13, %14, %15},"  // a00..a03
+      " %16,"  // desc_b
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%11: accumulators, read-write ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %12..%15
+         "l"(desc_b),  // %16
+         "r"(int32_t(scale_D)));  // %17
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x32 TN S32+=U8*S8 -- saturating variant: issues the `.satfinite` form; A in four 32-bit registers, B as a 64-bit descriptor
+struct MMA_64x24x32_S32U8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D operand; the accumulators below are updated in place
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[12];  // accumulators: 12 32-bit registers (d00..d11)
+
+  CUTE_HOST_DEVICE static void  // issues one m64n24k32 wgmma.mma_async through inline PTX
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p inside the asm block
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the source line and the B descriptor to the synclog helper
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %17, 0;\n"  // p = (scale_D != 0); %17 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n24k32.s32.u8.s8.satfinite "  // same mnemonic as the non-saturating struct plus `.satfinite`
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      "{%12, %13, %14, %15},"  // a00..a03
+      " %16,"  // desc_b
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%11: accumulators, read-write ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %12..%15
+         "l"(desc_b),  // %16
+         "r"(int32_t(scale_D)));  // %17
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x32 TN S32+=U8*S8 -- A is passed in four 32-bit registers, B as a 64-bit descriptor
+struct MMA_64x48x32_S32U8S8_RS_TN
+{
+  using DRegisters = void;  // no separate D operand; the accumulators below are updated in place
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[24];  // accumulators: 24 32-bit registers (d00..d23)
+
+  CUTE_HOST_DEVICE static void  // issues one m64n48k32 wgmma.mma_async through inline PTX
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p inside the asm block
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the source line and the B descriptor to the synclog helper
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %29, 0;\n"  // p = (scale_D != 0); %29 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n48k32.s32.u8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      "{%24, %25, %26, %27},"  // a00..a03
+      " %28,"  // desc_b
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%23: accumulators, read-write ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %24..%27
+         "l"(desc_b),  // %28
+         "r"(int32_t(scale_D)));  // %29
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x32 TN S32+=U8*S8 -- saturating variant: issues the `.satfinite` form; A in four 32-bit registers, B as a 64-bit descriptor
+struct MMA_64x48x32_S32U8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D operand; the accumulators below are updated in place
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[24];  // accumulators: 24 32-bit registers (d00..d23)
+
+  CUTE_HOST_DEVICE static void  // issues one m64n48k32 wgmma.mma_async through inline PTX
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p inside the asm block
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the source line and the B descriptor to the synclog helper
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %29, 0;\n"  // p = (scale_D != 0); %29 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n48k32.s32.u8.s8.satfinite "  // same mnemonic as the non-saturating struct plus `.satfinite`
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      "{%24, %25, %26, %27},"  // a00..a03
+      " %28,"  // desc_b
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%23: accumulators, read-write ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %24..%27
+         "l"(desc_b),  // %28
+         "r"(int32_t(scale_D)));  // %29
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x32 TN S32+=U8*S8 -- A is passed in four 32-bit registers, B as a 64-bit descriptor
+struct MMA_64x80x32_S32U8S8_RS_TN
+{
+  using DRegisters = void;  // no separate D operand; the accumulators below are updated in place
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[40];  // accumulators: 40 32-bit registers (d00..d39)
+
+  CUTE_HOST_DEVICE static void  // issues one m64n80k32 wgmma.mma_async through inline PTX
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p inside the asm block
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the source line and the B descriptor to the synclog helper
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %45, 0;\n"  // p = (scale_D != 0); %45 is the last input operand
+      "wgmma.mma_async.sync.aligned.m64n80k32.s32.u8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      "{%40, %41, %42, %43},"  // a00..a03
+      " %44,"  // desc_b
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%39: accumulators, read-write ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %40..%43
+         "l"(desc_b),  // %44
+         "r"(int32_t(scale_D)));  // %45
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x32 TN S32+=U8*S8 with .satfinite: wraps one wgmma.mma_async PTX op; A = 4 x 32-bit registers, B = 64-bit descriptor, 40 in/out 32-bit accumulators.
+struct MMA_64x80x32_S32U8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // NOTE(review): presumably D aliases C; fma() takes the accumulators as in/out - confirm
+  using ARegisters = uint32_t[4];  // a00..a03 in fma()
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = uint32_t[40];  // d00..d39 in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the call-site line and the descriptor to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate declared inside the braces of this PTX block
+      "setp.ne.b32 p, %45, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n80k32.s32.u8.s8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      "{%40, %41, %42, %43},"  // a00..a03
+      " %44,"  // desc_b
+      " p;\n"  // last operand: the scale_D predicate (see the PTX wgmma docs for its effect on D)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%39: accumulators, both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %40..%43
+         "l"(desc_b),  // %44 (64-bit operand)
+         "r"(int32_t(scale_D)));  // %45
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x32 TN S32+=U8*S8: wraps one wgmma.mma_async PTX op; A = 4 x 32-bit registers, B = 64-bit descriptor, 56 in/out 32-bit accumulators.
+struct MMA_64x112x32_S32U8S8_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably D aliases C; fma() takes the accumulators as in/out - confirm
+  using ARegisters = uint32_t[4];  // a00..a03 in fma()
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = uint32_t[56];  // d00..d55 in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the call-site line and the descriptor to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate declared inside the braces of this PTX block
+      "setp.ne.b32 p, %61, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n112k32.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      "{%56,  %57,  %58,  %59},"  // a00..a03
+      " %60,"  // desc_b
+      " p;\n"  // last operand: the scale_D predicate (see the PTX wgmma docs for its effect on D)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%55: accumulators, both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %56..%59
+         "l"(desc_b),  // %60 (64-bit operand)
+         "r"(int32_t(scale_D)));  // %61
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x32 TN S32+=U8*S8 with .satfinite: wraps one wgmma.mma_async PTX op; A = 4 x 32-bit registers, B = 64-bit descriptor, 56 in/out 32-bit accumulators.
+struct MMA_64x112x32_S32U8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // NOTE(review): presumably D aliases C; fma() takes the accumulators as in/out - confirm
+  using ARegisters = uint32_t[4];  // a00..a03 in fma()
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = uint32_t[56];  // d00..d55 in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the call-site line and the descriptor to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate declared inside the braces of this PTX block
+      "setp.ne.b32 p, %61, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n112k32.s32.u8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      "{%56,  %57,  %58,  %59},"  // a00..a03
+      " %60,"  // desc_b
+      " p;\n"  // last operand: the scale_D predicate (see the PTX wgmma docs for its effect on D)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%55: accumulators, both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %56..%59
+         "l"(desc_b),  // %60 (64-bit operand)
+         "r"(int32_t(scale_D)));  // %61
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x32 TN S32+=U8*S8: wraps one wgmma.mma_async PTX op; A = 4 x 32-bit registers, B = 64-bit descriptor, 72 in/out 32-bit accumulators.
+struct MMA_64x144x32_S32U8S8_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably D aliases C; fma() takes the accumulators as in/out - confirm
+  using ARegisters = uint32_t[4];  // a00..a03 in fma()
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = uint32_t[72];  // d00..d71 in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the call-site line and the descriptor to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate declared inside the braces of this PTX block
+      "setp.ne.b32 p, %77, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n144k32.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      "{%72,  %73,  %74,  %75},"  // a00..a03
+      " %76,"  // desc_b
+      " p;\n"  // last operand: the scale_D predicate (see the PTX wgmma docs for its effect on D)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%71: accumulators, both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %72..%75
+         "l"(desc_b),  // %76 (64-bit operand)
+         "r"(int32_t(scale_D)));  // %77
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x32 TN S32+=U8*S8 with .satfinite: wraps one wgmma.mma_async PTX op; A = 4 x 32-bit registers, B = 64-bit descriptor, 72 in/out 32-bit accumulators.
+struct MMA_64x144x32_S32U8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // NOTE(review): presumably D aliases C; fma() takes the accumulators as in/out - confirm
+  using ARegisters = uint32_t[4];  // a00..a03 in fma()
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = uint32_t[72];  // d00..d71 in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the call-site line and the descriptor to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate declared inside the braces of this PTX block
+      "setp.ne.b32 p, %77, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n144k32.s32.u8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      "{%72,  %73,  %74,  %75},"  // a00..a03
+      " %76,"  // desc_b
+      " p;\n"  // last operand: the scale_D predicate (see the PTX wgmma docs for its effect on D)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%71: accumulators, both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %72..%75
+         "l"(desc_b),  // %76 (64-bit operand)
+         "r"(int32_t(scale_D)));  // %77
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x32 TN S32+=U8*S8: wraps one wgmma.mma_async PTX op; A = 4 x 32-bit registers, B = 64-bit descriptor, 80 in/out 32-bit accumulators.
+struct MMA_64x160x32_S32U8S8_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably D aliases C; fma() takes the accumulators as in/out - confirm
+  using ARegisters = uint32_t[4];  // a00..a03 in fma()
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = uint32_t[80];  // d00..d79 in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the call-site line and the descriptor to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate declared inside the braces of this PTX block
+      "setp.ne.b32 p, %85, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n160k32.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      "{%80,  %81,  %82,  %83},"  // a00..a03
+      " %84,"  // desc_b
+      " p;\n"  // last operand: the scale_D predicate (see the PTX wgmma docs for its effect on D)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%79: accumulators, both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %80..%83
+         "l"(desc_b),  // %84 (64-bit operand)
+         "r"(int32_t(scale_D)));  // %85
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x32 TN S32+=U8*S8 with .satfinite: wraps one wgmma.mma_async PTX op; A = 4 x 32-bit registers, B = 64-bit descriptor, 80 in/out 32-bit accumulators.
+struct MMA_64x160x32_S32U8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // NOTE(review): presumably D aliases C; fma() takes the accumulators as in/out - confirm
+  using ARegisters = uint32_t[4];  // a00..a03 in fma()
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = uint32_t[80];  // d00..d79 in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the call-site line and the descriptor to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate declared inside the braces of this PTX block
+      "setp.ne.b32 p, %85, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n160k32.s32.u8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      "{%80,  %81,  %82,  %83},"  // a00..a03
+      " %84,"  // desc_b
+      " p;\n"  // last operand: the scale_D predicate (see the PTX wgmma docs for its effect on D)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%79: accumulators, both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %80..%83
+         "l"(desc_b),  // %84 (64-bit operand)
+         "r"(int32_t(scale_D)));  // %85
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x32 TN S32+=U8*S8: wraps one wgmma.mma_async PTX op; A = 4 x 32-bit registers, B = 64-bit descriptor, 88 in/out 32-bit accumulators.
+struct MMA_64x176x32_S32U8S8_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably D aliases C; fma() takes the accumulators as in/out - confirm
+  using ARegisters = uint32_t[4];  // a00..a03 in fma()
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = uint32_t[88];  // d00..d87 in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the call-site line and the descriptor to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate declared inside the braces of this PTX block
+      "setp.ne.b32 p, %93, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n176k32.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      "{%88,  %89,  %90,  %91},"  // a00..a03
+      " %92,"  // desc_b
+      " p;\n"  // last operand: the scale_D predicate (see the PTX wgmma docs for its effect on D)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%87: accumulators, both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %88..%91
+         "l"(desc_b),  // %92 (64-bit operand)
+         "r"(int32_t(scale_D)));  // %93
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x32 TN S32+=U8*S8, saturating variant: emits wgmma m64n176k32.s32.u8.s8 with .satfinite; A in four 32-bit registers, B via a 64-bit descriptor
+struct MMA_64x176x32_S32U8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches fma() arguments a00..a03
+  using BRegisters = uint64_t[1];  // matches fma() argument desc_b
+  using CRegisters = uint32_t[88];  // matches fma() arguments d00..d87 (read-write accumulators)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %93, 0;\n"  // %93 is int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n176k32.s32.u8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      "{%88,  %89,  %90,  %91},"  // a00..a03
+      " %92,"  // desc_b
+      " p;\n"  // NOTE(review): presumably the PTX scale-d predicate (accumulate into D when true) - confirm against the PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x32 TN S32+=U8*S8: emits wgmma m64n208k32.s32.u8.s8 (no .satfinite); A in four 32-bit registers, B via a 64-bit descriptor
+struct MMA_64x208x32_S32U8S8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches fma() arguments a000..a003
+  using BRegisters = uint64_t[1];  // matches fma() argument desc_b
+  using CRegisters = uint32_t[104];  // matches fma() arguments d000..d103 (read-write accumulators)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %109, 0;\n"  // %109 is int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n208k32.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      "{%104, %105, %106, %107},"  // a000..a003
+      " %108,"  // desc_b
+      " p;\n"  // NOTE(review): presumably the PTX scale-d predicate (accumulate into D when true) - confirm against the PTX ISA
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x32 TN S32+=U8*S8, saturating variant: emits wgmma m64n208k32.s32.u8.s8 with .satfinite; A in four 32-bit registers, B via a 64-bit descriptor
+struct MMA_64x208x32_S32U8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches fma() arguments a000..a003
+  using BRegisters = uint64_t[1];  // matches fma() argument desc_b
+  using CRegisters = uint32_t[104];  // matches fma() arguments d000..d103 (read-write accumulators)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %109, 0;\n"  // %109 is int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n208k32.s32.u8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      "{%104, %105, %106, %107},"  // a000..a003
+      " %108,"  // desc_b
+      " p;\n"  // NOTE(review): presumably the PTX scale-d predicate (accumulate into D when true) - confirm against the PTX ISA
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x32 TN S32+=U8*S8: emits wgmma m64n224k32.s32.u8.s8 (no .satfinite); A in four 32-bit registers, B via a 64-bit descriptor
+struct MMA_64x224x32_S32U8S8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches fma() arguments a000..a003
+  using BRegisters = uint64_t[1];  // matches fma() argument desc_b
+  using CRegisters = uint32_t[112];  // matches fma() arguments d000..d111 (read-write accumulators)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %117, 0;\n"  // %117 is int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n224k32.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      "{%112, %113, %114, %115},"  // a000..a003
+      " %116,"  // desc_b
+      " p;\n"  // NOTE(review): presumably the PTX scale-d predicate (accumulate into D when true) - confirm against the PTX ISA
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x32 TN S32+=U8*S8, saturating variant: emits wgmma m64n224k32.s32.u8.s8 with .satfinite; A in four 32-bit registers, B via a 64-bit descriptor
+struct MMA_64x224x32_S32U8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches fma() arguments a000..a003
+  using BRegisters = uint64_t[1];  // matches fma() argument desc_b
+  using CRegisters = uint32_t[112];  // matches fma() arguments d000..d111 (read-write accumulators)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %117, 0;\n"  // %117 is int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n224k32.s32.u8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      "{%112, %113, %114, %115},"  // a000..a003
+      " %116,"  // desc_b
+      " p;\n"  // NOTE(review): presumably the PTX scale-d predicate (accumulate into D when true) - confirm against the PTX ISA
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x32 TN S32+=U8*S8 (A: four 32-bit register operands, B: one 64-bit descriptor operand)
+struct MMA_64x240x32_S32U8S8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[120];  // one entry per accumulator reference d000..d119 taken by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted into the predicate operand p of the wgmma below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook, invoked before the instruction
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %125, 0;\n"  // p = (scale_D != 0); %125 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n240k32.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      "{%120, %121, %122, %123},"  // a000..a003
+      " %124,"  // desc_b
+      " p;\n"  // NOTE(review): per the PTX ISA this is the scale-d predicate (false => old D is not added) -- confirm
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // %0..%119: accumulators, read and written in place
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // fallback when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x32 TN S32+=U8*S8, .satfinite form (A: four 32-bit register operands, B: one 64-bit descriptor operand)
+struct MMA_64x240x32_S32U8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[120];  // one entry per accumulator reference d000..d119 taken by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted into the predicate operand p of the wgmma below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook, invoked before the instruction
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %125, 0;\n"  // p = (scale_D != 0); %125 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n240k32.s32.u8.s8.satfinite "  // the SATURATE struct issues the .satfinite-qualified instruction
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      "{%120, %121, %122, %123},"  // a000..a003
+      " %124,"  // desc_b
+      " p;\n"  // NOTE(review): per the PTX ISA this is the scale-d predicate (false => old D is not added) -- confirm
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // %0..%119: accumulators, read and written in place
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // fallback when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x32 TN S32+=U8*U8 (A and B are each passed as one 64-bit descriptor operand)
+struct MMA_64x24x32_S32U8U8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[12];  // one entry per accumulator reference d00..d11 taken by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted into the predicate operand p of the wgmma below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, invoked before the instruction
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %14, 0;\n"  // p = (scale_D != 0); %14 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n24k32.s32.u8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      " %12,"  // desc_a
+      " %13,"  // desc_b
+      " p;\n"  // NOTE(review): per the PTX ISA this is the scale-d predicate (false => old D is not added) -- confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%11: accumulators, read and written in place
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // fallback when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x32 TN S32+=U8*U8, .satfinite form (A and B are each passed as one 64-bit descriptor operand)
+struct MMA_64x24x32_S32U8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[12];  // one entry per accumulator reference d00..d11 taken by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted into the predicate operand p of the wgmma below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, invoked before the instruction
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %14, 0;\n"  // p = (scale_D != 0); %14 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n24k32.s32.u8.u8.satfinite "  // the SATURATE struct issues the .satfinite-qualified instruction
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      " %12,"  // desc_a
+      " %13,"  // desc_b
+      " p;\n"  // NOTE(review): per the PTX ISA this is the scale-d predicate (false => old D is not added) -- confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%11: accumulators, read and written in place
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // fallback when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x32 TN S32+=U8*U8 (A and B are each passed as one 64-bit descriptor operand)
+struct MMA_64x48x32_S32U8U8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[24];  // one entry per accumulator reference d00..d23 taken by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted into the predicate operand p of the wgmma below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, invoked before the instruction
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %26, 0;\n"  // p = (scale_D != 0); %26 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n48k32.s32.u8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      " %24,"  // desc_a
+      " %25,"  // desc_b
+      " p;\n"  // NOTE(review): per the PTX ISA this is the scale-d predicate (false => old D is not added) -- confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%23: accumulators, read and written in place
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // fallback when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x32 TN S32+=U8*U8, .satfinite form (A and B are each passed as one 64-bit descriptor operand)
+struct MMA_64x48x32_S32U8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[24];  // one entry per accumulator reference d00..d23 taken by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted into the predicate operand p of the wgmma below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, invoked before the instruction
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %26, 0;\n"  // p = (scale_D != 0); %26 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n48k32.s32.u8.u8.satfinite "  // the SATURATE struct issues the .satfinite-qualified instruction
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      " %24,"  // desc_a
+      " %25,"  // desc_b
+      " p;\n"  // NOTE(review): per the PTX ISA this is the scale-d predicate (false => old D is not added) -- confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%23: accumulators, read and written in place
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // fallback when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x32 TN S32+=U8*U8 (A and B are each passed as one 64-bit descriptor operand)
+struct MMA_64x80x32_S32U8U8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[40];  // one entry per accumulator reference d00..d39 taken by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted into the predicate operand p of the wgmma below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, invoked before the instruction
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %42, 0;\n"  // p = (scale_D != 0); %42 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n80k32.s32.u8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      " %40,"  // desc_a
+      " %41,"  // desc_b
+      " p;\n"  // NOTE(review): per the PTX ISA this is the scale-d predicate (false => old D is not added) -- confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%39: accumulators, read and written in place
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // fallback when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x32 TN S32+=U8*U8, .satfinite form (A and B are each passed as one 64-bit descriptor operand)
+struct MMA_64x80x32_S32U8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[40];  // one entry per accumulator reference d00..d39 taken by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted into the predicate operand p of the wgmma below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, invoked before the instruction
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %42, 0;\n"  // p = (scale_D != 0); %42 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n80k32.s32.u8.u8.satfinite "  // the SATURATE struct issues the .satfinite-qualified instruction
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      " %40,"  // desc_a
+      " %41,"  // desc_b
+      " p;\n"  // NOTE(review): per the PTX ISA this is the scale-d predicate (false => old D is not added) -- confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%39: accumulators, read and written in place
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // fallback when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x32 TN S32+=U8*U8 (A and B are each passed as one 64-bit descriptor operand)
+struct MMA_64x112x32_S32U8U8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[56];  // one entry per accumulator reference d00..d55 taken by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted into the predicate operand p of the wgmma below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, invoked before the instruction
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %58, 0;\n"  // p = (scale_D != 0); %58 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n112k32.s32.u8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      " %56,"  // desc_a
+      " %57,"  // desc_b
+      " p;\n"  // NOTE(review): per the PTX ISA this is the scale-d predicate (false => old D is not added) -- confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%55: accumulators, read and written in place
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // fallback when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x32 TN S32+=U8*U8, .satfinite form (A and B are each passed as one 64-bit descriptor operand)
+struct MMA_64x112x32_S32U8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[56];  // one entry per accumulator reference d00..d55 taken by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted into the predicate operand p of the wgmma below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, invoked before the instruction
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %58, 0;\n"  // p = (scale_D != 0); %58 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n112k32.s32.u8.u8.satfinite "  // the SATURATE struct issues the .satfinite-qualified instruction
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      " %56,"  // desc_a
+      " %57,"  // desc_b
+      " p;\n"  // NOTE(review): per the PTX ISA this is the scale-d predicate (false => old D is not added) -- confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%55: accumulators, read and written in place
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else  // fallback when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x32 TN S32+=U8*U8: static fma() wrapper around the PTX instruction wgmma.mma_async.sync.aligned.m64n144k32.s32.u8.u8.
+struct MMA_64x144x32_S32U8U8_SS_TN
+{
+  using DRegisters = void;  // void here; fma() updates the accumulators in place through the "+r" operands
+  using ARegisters = uint64_t[1];  // matches the single uint64_t desc_a argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single uint64_t desc_b argument of fma()
+  using CRegisters = uint32_t[72];  // one entry per accumulator reference d00..d71 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the source line and both descriptors to the synclog hook (defined elsewhere)
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %74, 0;\n"  // %74 is int32_t(scale_D), the last input operand; p is true when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n144k32.s32.u8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      " %72,"  // desc_a
+      " %73,"  // desc_b
+      " p;\n"  // predicate derived from scale_D. NOTE(review): per the PTX ISA this looks like the scale-d input (false => D = A*B, no accumulation) - confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%71: accumulators, both read and written ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
+      :  "l"(desc_a),  // operand %72
+         "l"(desc_b),  // operand %73
+         "r"(int32_t(scale_D)));  // operand %74
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x32 TN S32+=U8*U8 (saturating variant): static fma() wrapper around wgmma.mma_async.sync.aligned.m64n144k32.s32.u8.u8.satfinite.
+struct MMA_64x144x32_S32U8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // void here; fma() updates the accumulators in place through the "+r" operands
+  using ARegisters = uint64_t[1];  // matches the single uint64_t desc_a argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single uint64_t desc_b argument of fma()
+  using CRegisters = uint32_t[72];  // one entry per accumulator reference d00..d71 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the source line and both descriptors to the synclog hook (defined elsewhere)
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %74, 0;\n"  // %74 is int32_t(scale_D), the last input operand; p is true when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n144k32.s32.u8.u8.satfinite "  // NOTE(review): .satfinite presumably clamps the s32 result on overflow - confirm against the PTX ISA
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      " %72,"  // desc_a
+      " %73,"  // desc_b
+      " p;\n"  // predicate derived from scale_D. NOTE(review): per the PTX ISA this looks like the scale-d input (false => D = A*B, no accumulation) - confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%71: accumulators, both read and written ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
+      :  "l"(desc_a),  // operand %72
+         "l"(desc_b),  // operand %73
+         "r"(int32_t(scale_D)));  // operand %74
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x32 TN S32+=U8*U8: static fma() wrapper around the PTX instruction wgmma.mma_async.sync.aligned.m64n160k32.s32.u8.u8.
+struct MMA_64x160x32_S32U8U8_SS_TN
+{
+  using DRegisters = void;  // void here; fma() updates the accumulators in place through the "+r" operands
+  using ARegisters = uint64_t[1];  // matches the single uint64_t desc_a argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single uint64_t desc_b argument of fma()
+  using CRegisters = uint32_t[80];  // one entry per accumulator reference d00..d79 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the source line and both descriptors to the synclog hook (defined elsewhere)
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %82, 0;\n"  // %82 is int32_t(scale_D), the last input operand; p is true when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n160k32.s32.u8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      " %80,"  // desc_a
+      " %81,"  // desc_b
+      " p;\n"  // predicate derived from scale_D. NOTE(review): per the PTX ISA this looks like the scale-d input (false => D = A*B, no accumulation) - confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%79: accumulators, both read and written ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
+      :  "l"(desc_a),  // operand %80
+         "l"(desc_b),  // operand %81
+         "r"(int32_t(scale_D)));  // operand %82
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x32 TN S32+=U8*U8 (saturating variant): static fma() wrapper around wgmma.mma_async.sync.aligned.m64n160k32.s32.u8.u8.satfinite.
+struct MMA_64x160x32_S32U8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // void here; fma() updates the accumulators in place through the "+r" operands
+  using ARegisters = uint64_t[1];  // matches the single uint64_t desc_a argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single uint64_t desc_b argument of fma()
+  using CRegisters = uint32_t[80];  // one entry per accumulator reference d00..d79 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the source line and both descriptors to the synclog hook (defined elsewhere)
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %82, 0;\n"  // %82 is int32_t(scale_D), the last input operand; p is true when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n160k32.s32.u8.u8.satfinite "  // NOTE(review): .satfinite presumably clamps the s32 result on overflow - confirm against the PTX ISA
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      " %80,"  // desc_a
+      " %81,"  // desc_b
+      " p;\n"  // predicate derived from scale_D. NOTE(review): per the PTX ISA this looks like the scale-d input (false => D = A*B, no accumulation) - confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%79: accumulators, both read and written ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
+      :  "l"(desc_a),  // operand %80
+         "l"(desc_b),  // operand %81
+         "r"(int32_t(scale_D)));  // operand %82
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x32 TN S32+=U8*U8: static fma() wrapper around the PTX instruction wgmma.mma_async.sync.aligned.m64n176k32.s32.u8.u8.
+struct MMA_64x176x32_S32U8U8_SS_TN
+{
+  using DRegisters = void;  // void here; fma() updates the accumulators in place through the "+r" operands
+  using ARegisters = uint64_t[1];  // matches the single uint64_t desc_a argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single uint64_t desc_b argument of fma()
+  using CRegisters = uint32_t[88];  // one entry per accumulator reference d00..d87 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the source line and both descriptors to the synclog hook (defined elsewhere)
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %90, 0;\n"  // %90 is int32_t(scale_D), the last input operand; p is true when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n176k32.s32.u8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      " %88,"  // desc_a
+      " %89,"  // desc_b
+      " p;\n"  // predicate derived from scale_D. NOTE(review): per the PTX ISA this looks like the scale-d input (false => D = A*B, no accumulation) - confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%87: accumulators, both read and written ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
+      :  "l"(desc_a),  // operand %88
+         "l"(desc_b),  // operand %89
+         "r"(int32_t(scale_D)));  // operand %90
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x32 TN S32+=U8*U8 (saturating variant): static fma() wrapper around wgmma.mma_async.sync.aligned.m64n176k32.s32.u8.u8.satfinite.
+struct MMA_64x176x32_S32U8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // void here; fma() updates the accumulators in place through the "+r" operands
+  using ARegisters = uint64_t[1];  // matches the single uint64_t desc_a argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single uint64_t desc_b argument of fma()
+  using CRegisters = uint32_t[88];  // one entry per accumulator reference d00..d87 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the source line and both descriptors to the synclog hook (defined elsewhere)
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %90, 0;\n"  // %90 is int32_t(scale_D), the last input operand; p is true when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n176k32.s32.u8.u8.satfinite "  // NOTE(review): .satfinite presumably clamps the s32 result on overflow - confirm against the PTX ISA
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      " %88,"  // desc_a
+      " %89,"  // desc_b
+      " p;\n"  // predicate derived from scale_D. NOTE(review): per the PTX ISA this looks like the scale-d input (false => D = A*B, no accumulation) - confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%87: accumulators, both read and written ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
+      :  "l"(desc_a),  // operand %88
+         "l"(desc_b),  // operand %89
+         "r"(int32_t(scale_D)));  // operand %90
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x32 TN S32+=U8*U8: static fma() wrapper around the PTX instruction wgmma.mma_async.sync.aligned.m64n208k32.s32.u8.u8.
+struct MMA_64x208x32_S32U8U8_SS_TN
+{
+  using DRegisters = void;  // void here; fma() updates the accumulators in place through the "+r" operands
+  using ARegisters = uint64_t[1];  // matches the single uint64_t desc_a argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single uint64_t desc_b argument of fma()
+  using CRegisters = uint32_t[104];  // one entry per accumulator reference d000..d103 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the source line and both descriptors to the synclog hook (defined elsewhere)
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %106, 0;\n"  // %106 is int32_t(scale_D), the last input operand; p is true when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n208k32.s32.u8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      " %104,"  // desc_a
+      " %105,"  // desc_b
+      " p;\n"  // predicate derived from scale_D. NOTE(review): per the PTX ISA this looks like the scale-d input (false => D = A*B, no accumulation) - confirm
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // operands %0..%103: accumulators, both read and written ("+r")
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
+      :  "l"(desc_a),  // operand %104
+         "l"(desc_b),  // operand %105
+         "r"(int32_t(scale_D)));  // operand %106
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x32 TN S32+=U8*U8 with saturation (.satfinite); A and B are both passed as 64-bit descriptors (SS)
+struct MMA_64x208x32_S32U8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // NOTE(review): presumably void because the accumulators are updated in place ("+r" below) - confirm
+  using ARegisters = uint64_t[1];  // matches the single desc_a argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[104];  // matches the 104 accumulator arguments d000..d103 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX below is only emitted when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the call-site line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %106, 0;\n"  // p = (scale_D != 0); %106 is the last input operand, int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n208k32.s32.u8.u8.satfinite "  // saturating form: note the .satfinite qualifier
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      " %104,"  // desc_a
+      " %105,"  // desc_b
+      " p;\n"  // predicate computed by the setp above
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // operands %0..%103: accumulators, read-write 32-bit registers
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
+      :  "l"(desc_a),  // operand %104: 64-bit register input
+         "l"(desc_b),  // operand %105: 64-bit register input
+         "r"(int32_t(scale_D)));  // operand %106: scale_D converted to a 32-bit integer
+#else  // macro not defined: the call is reported through CUTE_INVALID_CONTROL_PATH
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x32 TN S32+=U8*U8 (no .satfinite); A and B are both passed as 64-bit descriptors (SS)
+struct MMA_64x224x32_S32U8U8_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably void because the accumulators are updated in place ("+r" below) - confirm
+  using ARegisters = uint64_t[1];  // matches the single desc_a argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[112];  // matches the 112 accumulator arguments d000..d111 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX below is only emitted when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the call-site line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %114, 0;\n"  // p = (scale_D != 0); %114 is the last input operand, int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n224k32.s32.u8.u8 "  // no .satfinite qualifier on this variant
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      " %112,"  // desc_a
+      " %113,"  // desc_b
+      " p;\n"  // predicate computed by the setp above
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // operands %0..%111: accumulators, read-write 32-bit registers
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
+      :  "l"(desc_a),  // operand %112: 64-bit register input
+         "l"(desc_b),  // operand %113: 64-bit register input
+         "r"(int32_t(scale_D)));  // operand %114: scale_D converted to a 32-bit integer
+#else  // macro not defined: the call is reported through CUTE_INVALID_CONTROL_PATH
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x32 TN S32+=U8*U8 with saturation (.satfinite); A and B are both passed as 64-bit descriptors (SS)
+struct MMA_64x224x32_S32U8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // NOTE(review): presumably void because the accumulators are updated in place ("+r" below) - confirm
+  using ARegisters = uint64_t[1];  // matches the single desc_a argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[112];  // matches the 112 accumulator arguments d000..d111 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX below is only emitted when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the call-site line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %114, 0;\n"  // p = (scale_D != 0); %114 is the last input operand, int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n224k32.s32.u8.u8.satfinite "  // saturating form: note the .satfinite qualifier
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      " %112,"  // desc_a
+      " %113,"  // desc_b
+      " p;\n"  // predicate computed by the setp above
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // operands %0..%111: accumulators, read-write 32-bit registers
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
+      :  "l"(desc_a),  // operand %112: 64-bit register input
+         "l"(desc_b),  // operand %113: 64-bit register input
+         "r"(int32_t(scale_D)));  // operand %114: scale_D converted to a 32-bit integer
+#else  // macro not defined: the call is reported through CUTE_INVALID_CONTROL_PATH
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x32 TN S32+=U8*U8 (no .satfinite); A and B are both passed as 64-bit descriptors (SS)
+struct MMA_64x240x32_S32U8U8_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably void because the accumulators are updated in place ("+r" below) - confirm
+  using ARegisters = uint64_t[1];  // matches the single desc_a argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[120];  // matches the 120 accumulator arguments d000..d119 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX below is only emitted when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the call-site line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %122, 0;\n"  // p = (scale_D != 0); %122 is the last input operand, int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n240k32.s32.u8.u8 "  // no .satfinite qualifier on this variant
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      " %120,"  // desc_a
+      " %121,"  // desc_b
+      " p;\n"  // predicate computed by the setp above
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // operands %0..%119: accumulators, read-write 32-bit registers
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
+      :  "l"(desc_a),  // operand %120: 64-bit register input
+         "l"(desc_b),  // operand %121: 64-bit register input
+         "r"(int32_t(scale_D)));  // operand %122: scale_D converted to a 32-bit integer
+#else  // macro not defined: the call is reported through CUTE_INVALID_CONTROL_PATH
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x32 TN S32+=U8*U8 with saturation (.satfinite); A and B are both passed as 64-bit descriptors (SS)
+struct MMA_64x240x32_S32U8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // NOTE(review): presumably void because the accumulators are updated in place ("+r" below) - confirm
+  using ARegisters = uint64_t[1];  // matches the single desc_a argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[120];  // matches the 120 accumulator arguments d000..d119 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX below is only emitted when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the call-site line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %122, 0;\n"  // p = (scale_D != 0); %122 is the last input operand, int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n240k32.s32.u8.u8.satfinite "  // saturating form: note the .satfinite qualifier
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      " %120,"  // desc_a
+      " %121,"  // desc_b
+      " p;\n"  // predicate computed by the setp above
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // operands %0..%119: accumulators, read-write 32-bit registers
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
+      :  "l"(desc_a),  // operand %120: 64-bit register input
+         "l"(desc_b),  // operand %121: 64-bit register input
+         "r"(int32_t(scale_D)));  // operand %122: scale_D converted to a 32-bit integer
+#else  // macro not defined: the call is reported through CUTE_INVALID_CONTROL_PATH
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x32 TN S32+=U8*U8 (no .satfinite); A is passed in four 32-bit registers, B as a 64-bit descriptor (RS)
+struct MMA_64x24x32_S32U8U8_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably void because the accumulators are updated in place ("+r" below) - confirm
+  using ARegisters = uint32_t[4];  // matches the four A arguments a00..a03 of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[12];  // matches the 12 accumulator arguments d00..d11 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX below is only emitted when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the call-site line and the B descriptor to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %17, 0;\n"  // p = (scale_D != 0); %17 is the last input operand, int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n24k32.s32.u8.u8 "  // no .satfinite qualifier on this variant
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      "{%12, %13, %14, %15},"  // a00..a03
+      " %16,"  // desc_b
+      " p;\n"  // predicate computed by the setp above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%11: accumulators, read-write 32-bit registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // operands %12..%15: 32-bit register inputs
+         "l"(desc_b),  // operand %16: 64-bit register input
+         "r"(int32_t(scale_D)));  // operand %17: scale_D converted to a 32-bit integer
+#else  // macro not defined: the call is reported through CUTE_INVALID_CONTROL_PATH
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x32 TN S32+=U8*U8, .satfinite variant (RS form: A is passed in registers a00..a03, B as the 64-bit descriptor desc_b)
+struct MMA_64x24x32_S32U8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[12];  // d00..d11: accumulators, both read and written by fma() ("+r" operands)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %17, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n24k32.s32.u8.u8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      "{%12, %13, %14, %15},"  // a00..a03
+      " %16,"  // desc_b
+      " p;\n"  // scale_D predicate, the instruction's last operand
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x32 TN S32+=U8*U8 (RS form: A is passed in registers a00..a03, B as the 64-bit descriptor desc_b)
+struct MMA_64x48x32_S32U8U8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[24];  // d00..d23: accumulators, both read and written by fma() ("+r" operands)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %29, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n48k32.s32.u8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      "{%24, %25, %26, %27},"  // a00..a03
+      " %28,"  // desc_b
+      " p;\n"  // scale_D predicate, the instruction's last operand
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x32 TN S32+=U8*U8, .satfinite variant (RS form: A is passed in registers a00..a03, B as the 64-bit descriptor desc_b)
+struct MMA_64x48x32_S32U8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[24];  // d00..d23: accumulators, both read and written by fma() ("+r" operands)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %29, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n48k32.s32.u8.u8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      "{%24, %25, %26, %27},"  // a00..a03
+      " %28,"  // desc_b
+      " p;\n"  // scale_D predicate, the instruction's last operand
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x32 TN S32+=U8*U8 (RS form: A is passed in registers a00..a03, B as the 64-bit descriptor desc_b)
+struct MMA_64x80x32_S32U8U8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[40];  // d00..d39: accumulators, both read and written by fma() ("+r" operands)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %45, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n80k32.s32.u8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      "{%40, %41, %42, %43},"  // a00..a03
+      " %44,"  // desc_b
+      " p;\n"  // scale_D predicate, the instruction's last operand
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x32 TN S32+=U8*U8, .satfinite variant (RS form: A is passed in registers a00..a03, B as the 64-bit descriptor desc_b)
+struct MMA_64x80x32_S32U8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[40];  // d00..d39: accumulators, both read and written by fma() ("+r" operands)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %45, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n80k32.s32.u8.u8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      "{%40, %41, %42, %43},"  // a00..a03
+      " %44,"  // desc_b
+      " p;\n"  // scale_D predicate, the instruction's last operand
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x32 TN S32+=U8*U8 (RS form: A is passed in registers a00..a03, B as the 64-bit descriptor desc_b)
+struct MMA_64x112x32_S32U8U8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[56];  // d00..d55: accumulators, both read and written by fma() ("+r" operands)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %61, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n112k32.s32.u8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      "{%56,  %57,  %58,  %59},"  // a00..a03
+      " %60,"  // desc_b
+      " p;\n"  // scale_D predicate, the instruction's last operand
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x32 TN S32+=U8*U8, .satfinite variant (RS form: A is passed in registers a00..a03, B as the 64-bit descriptor desc_b)
+struct MMA_64x112x32_S32U8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[56];  // d00..d55: accumulators, both read and written by fma() ("+r" operands)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %61, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n112k32.s32.u8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      "{%56,  %57,  %58,  %59},"  // a00..a03
+      " %60,"  // desc_b
+      " p;\n"  // scale_D predicate, the instruction's last operand
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x32 TN S32+=U8*U8 (RS form: A is passed in registers a00..a03, B as the 64-bit descriptor desc_b)
+struct MMA_64x144x32_S32U8U8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[72];  // d00..d71: accumulators, both read and written by fma() ("+r" operands)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %77, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n144k32.s32.u8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      "{%72,  %73,  %74,  %75},"  // a00..a03
+      " %76,"  // desc_b
+      " p;\n"  // scale_D predicate, the instruction's last operand
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x32 TN S32+=U8*U8, .satfinite variant (RS form: A is passed in registers a00..a03, B as the 64-bit descriptor desc_b)
+struct MMA_64x144x32_S32U8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[72];  // d00..d71: accumulators, both read and written by fma() ("+r" operands)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %77, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n144k32.s32.u8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      "{%72,  %73,  %74,  %75},"  // a00..a03
+      " %76,"  // desc_b
+      " p;\n"  // scale_D predicate, the instruction's last operand
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x32 TN S32+=U8*U8 (RS form: A is passed in registers a00..a03, B as the 64-bit descriptor desc_b)
+struct MMA_64x160x32_S32U8U8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[80];  // d00..d79: accumulators, both read and written by fma() ("+r" operands)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %85, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n160k32.s32.u8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      "{%80,  %81,  %82,  %83},"  // a00..a03
+      " %84,"  // desc_b
+      " p;\n"  // scale_D predicate, the instruction's last operand
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x32 TN S32+=U8*U8, SATURATE variant: wraps wgmma m64n160k32.s32.u8.u8.satfinite (A passed in registers, B as a 64-bit descriptor)
+struct MMA_64x160x32_S32U8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;                 // fma() takes no separate output operands
+  using ARegisters = uint32_t[4];          // a00..a03 of fma()
+  using BRegisters = uint64_t[1];          // desc_b of fma()
+  using CRegisters = uint32_t[80];         // d00..d79 of fma(), bound as read-write ("+r") below
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and tested against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // forwards the source line and descriptor to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %85, 0;\n"  // %85 is int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n160k32.s32.u8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      "{%80,  %81,  %82,  %83},"  // a00..a03
+      " %84,"  // desc_b
+      " p;\n"  // final operand: the predicate set by setp above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%79: d00..d79, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // operands %80..%83
+         "l"(desc_b),  // operand %84, 64-bit
+         "r"(int32_t(scale_D)));  // operand %85
+#else  // macro not defined: report the misuse instead of emitting PTX
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x32 TN S32+=U8*U8: wraps wgmma m64n176k32.s32.u8.u8, no .satfinite (A passed in registers, B as a 64-bit descriptor)
+struct MMA_64x176x32_S32U8U8_RS_TN
+{
+  using DRegisters = void;                 // fma() takes no separate output operands
+  using ARegisters = uint32_t[4];          // a00..a03 of fma()
+  using BRegisters = uint64_t[1];          // desc_b of fma()
+  using CRegisters = uint32_t[88];         // d00..d87 of fma(), bound as read-write ("+r") below
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and tested against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // forwards the source line and descriptor to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %93, 0;\n"  // %93 is int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n176k32.s32.u8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      "{%88,  %89,  %90,  %91},"  // a00..a03
+      " %92,"  // desc_b
+      " p;\n"  // final operand: the predicate set by setp above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%87: d00..d87, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // operands %88..%91
+         "l"(desc_b),  // operand %92, 64-bit
+         "r"(int32_t(scale_D)));  // operand %93
+#else  // macro not defined: report the misuse instead of emitting PTX
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x32 TN S32+=U8*U8, SATURATE variant: wraps wgmma m64n176k32.s32.u8.u8.satfinite (A passed in registers, B as a 64-bit descriptor)
+struct MMA_64x176x32_S32U8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;                 // fma() takes no separate output operands
+  using ARegisters = uint32_t[4];          // a00..a03 of fma()
+  using BRegisters = uint64_t[1];          // desc_b of fma()
+  using CRegisters = uint32_t[88];         // d00..d87 of fma(), bound as read-write ("+r") below
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and tested against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // forwards the source line and descriptor to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %93, 0;\n"  // %93 is int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n176k32.s32.u8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      "{%88,  %89,  %90,  %91},"  // a00..a03
+      " %92,"  // desc_b
+      " p;\n"  // final operand: the predicate set by setp above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%87: d00..d87, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // operands %88..%91
+         "l"(desc_b),  // operand %92, 64-bit
+         "r"(int32_t(scale_D)));  // operand %93
+#else  // macro not defined: report the misuse instead of emitting PTX
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x32 TN S32+=U8*U8: wraps wgmma m64n208k32.s32.u8.u8, no .satfinite (A passed in registers, B as a 64-bit descriptor)
+struct MMA_64x208x32_S32U8U8_RS_TN
+{
+  using DRegisters = void;                 // fma() takes no separate output operands
+  using ARegisters = uint32_t[4];          // a000..a003 of fma()
+  using BRegisters = uint64_t[1];          // desc_b of fma()
+  using CRegisters = uint32_t[104];        // d000..d103 of fma(), bound as read-write ("+r") below
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and tested against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // forwards the source line and descriptor to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %109, 0;\n"  // %109 is int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n208k32.s32.u8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      "{%104, %105, %106, %107},"  // a000..a003
+      " %108,"  // desc_b
+      " p;\n"  // final operand: the predicate set by setp above
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // operands %0..%103: d000..d103, read-write
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // operands %104..%107
+         "l"(desc_b),  // operand %108, 64-bit
+         "r"(int32_t(scale_D)));  // operand %109
+#else  // macro not defined: report the misuse instead of emitting PTX
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x32 TN S32+=U8*U8, SATURATE variant: wraps wgmma m64n208k32.s32.u8.u8.satfinite (A passed in registers, B as a 64-bit descriptor)
+struct MMA_64x208x32_S32U8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;                 // fma() takes no separate output operands
+  using ARegisters = uint32_t[4];          // a000..a003 of fma()
+  using BRegisters = uint64_t[1];          // desc_b of fma()
+  using CRegisters = uint32_t[104];        // d000..d103 of fma(), bound as read-write ("+r") below
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and tested against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // forwards the source line and descriptor to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %109, 0;\n"  // %109 is int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n208k32.s32.u8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      "{%104, %105, %106, %107},"  // a000..a003
+      " %108,"  // desc_b
+      " p;\n"  // final operand: the predicate set by setp above
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // operands %0..%103: d000..d103, read-write
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // operands %104..%107
+         "l"(desc_b),  // operand %108, 64-bit
+         "r"(int32_t(scale_D)));  // operand %109
+#else  // macro not defined: report the misuse instead of emitting PTX
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x32 TN S32+=U8*U8: wraps wgmma m64n224k32.s32.u8.u8, no .satfinite (A passed in registers, B as a 64-bit descriptor)
+struct MMA_64x224x32_S32U8U8_RS_TN
+{
+  using DRegisters = void;                 // fma() takes no separate output operands
+  using ARegisters = uint32_t[4];          // a000..a003 of fma()
+  using BRegisters = uint64_t[1];          // desc_b of fma()
+  using CRegisters = uint32_t[112];        // d000..d111 of fma(), bound as read-write ("+r") below
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and tested against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline PTX is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // forwards the source line and descriptor to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %117, 0;\n"  // %117 is int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n224k32.s32.u8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      "{%112, %113, %114, %115},"  // a000..a003
+      " %116,"  // desc_b
+      " p;\n"  // final operand: the predicate set by setp above
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // operands %0..%111: d000..d111, read-write
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // operands %112..%115
+         "l"(desc_b),  // operand %116, 64-bit
+         "r"(int32_t(scale_D)));  // operand %117
+#else  // macro not defined: report the misuse instead of emitting PTX
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x32 TN S32+=U8*U8 -- .satfinite form of the wgmma instruction; A in four 32-bit registers, B via one 64-bit descriptor
+struct MMA_64x224x32_S32U8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;            // void: fma() updates the d### accumulators in place
+  using ARegisters = uint32_t[4];     // the four a### arguments of fma()
+  using BRegisters = uint64_t[1];     // the single desc_b argument of fma()
+  using CRegisters = uint32_t[112];   // the 112 d### arguments of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A operand words (asm %112..%115)
+      uint64_t const& desc_b,  // B operand (asm %116); presumably a shared-memory matrix descriptor -- confirm
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,  // accumulators, read and written (asm %0..%111)
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this call site's line number and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %117, 0;\n"  // %117 is int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n224k32.s32.u8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%111: accumulators d000..d111
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      "{%112, %113, %114, %115},"  // a000..a003
+      " %116,"  // desc_b
+      " p;\n"  // predicate from setp above; presumably the PTX scale-d flag -- confirm against the PTX ISA
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // read-write register operands, numbered in declaration order
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // inputs %112..%115
+         "l"(desc_b),  // input %116, 64-bit
+         "r"(int32_t(scale_D)));  // input %117
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: report an invalid control path instead of issuing the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x32 TN S32+=U8*U8 -- A in four 32-bit registers, B via one 64-bit descriptor
+struct MMA_64x240x32_S32U8U8_RS_TN
+{
+  using DRegisters = void;            // void: fma() updates the d### accumulators in place
+  using ARegisters = uint32_t[4];     // the four a### arguments of fma()
+  using BRegisters = uint64_t[1];     // the single desc_b argument of fma()
+  using CRegisters = uint32_t[120];   // the 120 d### arguments of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A operand words (asm %120..%123)
+      uint64_t const& desc_b,  // B operand (asm %124); presumably a shared-memory matrix descriptor -- confirm
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,  // accumulators, read and written (asm %0..%119)
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this call site's line number and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %125, 0;\n"  // %125 is int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n240k32.s32.u8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%119: accumulators d000..d119
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      "{%120, %121, %122, %123},"  // a000..a003
+      " %124,"  // desc_b
+      " p;\n"  // predicate from setp above; presumably the PTX scale-d flag -- confirm against the PTX ISA
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // read-write register operands, numbered in declaration order
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // inputs %120..%123
+         "l"(desc_b),  // input %124, 64-bit
+         "r"(int32_t(scale_D)));  // input %125
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: report an invalid control path instead of issuing the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x32 TN S32+=U8*U8 -- .satfinite form of the wgmma instruction; A in four 32-bit registers, B via one 64-bit descriptor
+struct MMA_64x240x32_S32U8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;            // void: fma() updates the d### accumulators in place
+  using ARegisters = uint32_t[4];     // the four a### arguments of fma()
+  using BRegisters = uint64_t[1];     // the single desc_b argument of fma()
+  using CRegisters = uint32_t[120];   // the 120 d### arguments of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A operand words (asm %120..%123)
+      uint64_t const& desc_b,  // B operand (asm %124); presumably a shared-memory matrix descriptor -- confirm
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,  // accumulators, read and written (asm %0..%119)
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this call site's line number and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %125, 0;\n"  // %125 is int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n240k32.s32.u8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%119: accumulators d000..d119
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      "{%120, %121, %122, %123},"  // a000..a003
+      " %124,"  // desc_b
+      " p;\n"  // predicate from setp above; presumably the PTX scale-d flag -- confirm against the PTX ISA
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // read-write register operands, numbered in declaration order
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // inputs %120..%123
+         "l"(desc_b),  // input %124, 64-bit
+         "r"(int32_t(scale_D)));  // input %125
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: report an invalid control path instead of issuing the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x32 TN F16+=E4M3*E4M3 -- SS form: A and B are each passed as one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as immediate operand %9
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as immediate operand %10
+>
+struct MMA_64x24x32_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;          // void: fma() updates the d# accumulators in place
+  using ARegisters = uint64_t[1];   // the single desc_a argument of fma()
+  using BRegisters = uint64_t[1];   // the single desc_b argument of fma()
+  using CRegisters = uint32_t[6];   // the six d# arguments of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand (asm %6); presumably a shared-memory matrix descriptor -- confirm
+      uint64_t const& desc_b,  // B operand (asm %7); presumably a shared-memory matrix descriptor -- confirm
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,  // accumulators, read and written (asm %0..%5)
+      uint32_t      & d4, uint32_t      & d5,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this call site's line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %8, 0;\n"  // %8 is int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n24k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5},"  // accumulators d0..d5
+      " %6,"  // desc_a
+      " %7,"  // desc_b
+      " p,   %9,  %10;\n"  // predicate from setp above, then the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),  // read-write register operands, numbered in declaration order
+        "+r"(d4), "+r"(d5)
+      :  "l"(desc_a),  // input %6, 64-bit
+         "l"(desc_b),  // input %7, 64-bit
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %8 in a register; %9 and %10 as compile-time immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: report an invalid control path instead of issuing the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x32 TN F16+=E4M3*E4M3 -- RS form: A in four 32-bit registers, B via one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as immediate operand %12
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as immediate operand %13
+>
+struct MMA_64x24x32_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;          // void: fma() updates the d# accumulators in place
+  using ARegisters = uint32_t[4];   // the four a# arguments of fma()
+  using BRegisters = uint64_t[1];   // the single desc_b argument of fma()
+  using CRegisters = uint32_t[6];   // the six d# arguments of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,  // A operand words (asm %6..%9)
+      uint64_t const& desc_b,  // B operand (asm %10); presumably a shared-memory matrix descriptor -- confirm
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,  // accumulators, read and written (asm %0..%5)
+      uint32_t      & d4, uint32_t      & d5,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this call site's line number and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %11, 0;\n"  // %11 is int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n24k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5},"  // accumulators d0..d5
+      "{%6,  %7,  %8,  %9},"  // a0..a3
+      " %10,"  // desc_b
+      " p,   %12, %13;\n"  // predicate from setp above, then the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),  // read-write register operands, numbered in declaration order
+        "+r"(d4), "+r"(d5)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),  // inputs %6..%9
+         "l"(desc_b),  // input %10, 64-bit
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %11 in a register; %12 and %13 as compile-time immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: report an invalid control path instead of issuing the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x32 TN F32+=E4M3*E4M3 -- SS form: A and B are each passed as one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as immediate operand %15
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as immediate operand %16
+>
+struct MMA_64x24x32_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;          // void: fma() updates the d## accumulators in place
+  using ARegisters = uint64_t[1];   // the single desc_a argument of fma()
+  using BRegisters = uint64_t[1];   // the single desc_b argument of fma()
+  using CRegisters = float[12];     // the twelve d## arguments of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand (asm %12); presumably a shared-memory matrix descriptor -- confirm
+      uint64_t const& desc_b,  // B operand (asm %13); presumably a shared-memory matrix descriptor -- confirm
+      float         & d00, float         & d01, float         & d02, float         & d03,  // accumulators, read and written (asm %0..%11)
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this call site's line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %14, 0;\n"  // %14 is int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n24k32.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%11: accumulators d00..d11
+      " %8,  %9,  %10, %11},"
+      " %12,"  // desc_a
+      " %13,"  // desc_b
+      " p,   %15, %16;\n"  // predicate from setp above, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // read-write float register operands, numbered in declaration order
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
+      :  "l"(desc_a),  // input %12, 64-bit
+         "l"(desc_b),  // input %13, 64-bit
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %14 in a register; %15 and %16 as compile-time immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: report an invalid control path instead of issuing the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x32 TN F32+=E4M3*E4M3 -- RS form: A in four 32-bit registers, B via one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as immediate operand %18
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as immediate operand %19
+>
+struct MMA_64x24x32_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;          // void: fma() updates the d## accumulators in place
+  using ARegisters = uint32_t[4];   // the four a## arguments of fma()
+  using BRegisters = uint64_t[1];   // the single desc_b argument of fma()
+  using CRegisters = float[12];     // the twelve d## arguments of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand words (asm %12..%15)
+      uint64_t const& desc_b,  // B operand (asm %16); presumably a shared-memory matrix descriptor -- confirm
+      float         & d00, float         & d01, float         & d02, float         & d03,  // accumulators, read and written (asm %0..%11)
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this call site's line number and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %17, 0;\n"  // %17 is int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n24k32.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%11: accumulators d00..d11
+      " %8,  %9,  %10, %11},"
+      "{%12, %13, %14, %15},"  // a00..a03
+      " %16,"  // desc_b
+      " p,   %18, %19;\n"  // predicate from setp above, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // read-write float register operands, numbered in declaration order
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs %12..%15
+         "l"(desc_b),  // input %16, 64-bit
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %17 in a register; %18 and %19 as compile-time immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: report an invalid control path instead of issuing the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x40x32 TN F16+=E4M3*E4M3 -- SS form: A and B are each passed as one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as immediate operand %13
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as immediate operand %14
+>
+struct MMA_64x40x32_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;           // void: fma() updates the d## accumulators in place
+  using ARegisters = uint64_t[1];    // the single desc_a argument of fma()
+  using BRegisters = uint64_t[1];    // the single desc_b argument of fma()
+  using CRegisters = uint32_t[10];   // the ten d## arguments of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand (asm %10); presumably a shared-memory matrix descriptor -- confirm
+      uint64_t const& desc_b,  // B operand (asm %11); presumably a shared-memory matrix descriptor -- confirm
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // accumulators, read and written (asm %0..%9)
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this call site's line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %12, 0;\n"  // %12 is int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n40k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%9: accumulators d00..d09
+      " %8,  %9},"
+      " %10,"  // desc_a
+      " %11,"  // desc_b
+      " p,   %13, %14;\n"  // predicate from setp above, then the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write register operands, numbered in declaration order
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09)
+      :  "l"(desc_a),  // input %10, 64-bit
+         "l"(desc_b),  // input %11, 64-bit
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %12 in a register; %13 and %14 as compile-time immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: report an invalid control path instead of issuing the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x40x32 TN F16+=E4M3*E4M3 -- RS form: A in four 32-bit registers, B via one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as immediate operand %16
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as immediate operand %17
+>
+struct MMA_64x40x32_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;           // void: fma() updates the d## accumulators in place
+  using ARegisters = uint32_t[4];    // the four a## arguments of fma()
+  using BRegisters = uint64_t[1];    // the single desc_b argument of fma()
+  using CRegisters = uint32_t[10];   // the ten d## arguments of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand words (asm %10..%13)
+      uint64_t const& desc_b,  // B operand (asm %14); presumably a shared-memory matrix descriptor -- confirm
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // accumulators, read and written (asm %0..%9)
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this call site's line number and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %15, 0;\n"  // %15 is int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n40k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%9: accumulators d00..d09
+      " %8,  %9},"
+      "{%10, %11, %12, %13},"  // a00..a03
+      " %14,"  // desc_b
+      " p,   %16, %17;\n"  // predicate from setp above, then the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write register operands, numbered in declaration order
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs %10..%13
+         "l"(desc_b),  // input %14, 64-bit
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %15 in a register; %16 and %17 as compile-time immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: report an invalid control path instead of issuing the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x40x32 TN F32+=E4M3*E4M3 -- SS form: A and B are each passed as one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as immediate operand %23
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as immediate operand %24
+>
+struct MMA_64x40x32_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;          // void: fma() updates the d## accumulators in place
+  using ARegisters = uint64_t[1];   // the single desc_a argument of fma()
+  using BRegisters = uint64_t[1];   // the single desc_b argument of fma()
+  using CRegisters = float[20];     // the twenty d## arguments of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand (asm %20); presumably a shared-memory matrix descriptor -- confirm
+      uint64_t const& desc_b,  // B operand (asm %21); presumably a shared-memory matrix descriptor -- confirm
+      float         & d00, float         & d01, float         & d02, float         & d03,  // accumulators, read and written (asm %0..%19)
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this call site's line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %22, 0;\n"  // %22 is int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n40k32.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%19: accumulators d00..d19
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"
+      " %20,"  // desc_a
+      " %21,"  // desc_b
+      " p,   %23, %24;\n"  // predicate from setp above, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // read-write float register operands, numbered in declaration order
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
+      :  "l"(desc_a),  // input %20, 64-bit
+         "l"(desc_b),  // input %21, 64-bit
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %22 in a register; %23 and %24 as compile-time immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: report an invalid control path instead of issuing the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x40x32 TN F32+=E4M3*E4M3 -- RS form: A in four 32-bit registers, B via one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as immediate operand %26
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as immediate operand %27
+>
+struct MMA_64x40x32_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;          // void: fma() updates the d## accumulators in place
+  using ARegisters = uint32_t[4];   // the four a## arguments of fma()
+  using BRegisters = uint64_t[1];   // the single desc_b argument of fma()
+  using CRegisters = float[20];     // the twenty d## arguments of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand words (asm %20..%23)
+      uint64_t const& desc_b,  // B operand (asm %24); presumably a shared-memory matrix descriptor -- confirm
+      float         & d00, float         & d01, float         & d02, float         & d03,  // accumulators, read and written (asm %0..%19)
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this call site's line number and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %25, 0;\n"  // %25 is int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n40k32.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%19: accumulators d00..d19
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"
+      "{%20, %21, %22, %23},"  // a00..a03
+      " %24,"  // desc_b
+      " p,   %26, %27;\n"  // predicate from setp above, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // read-write float register operands, numbered in declaration order
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs %20..%23
+         "l"(desc_b),  // input %24, 64-bit
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %25 in a register; %26 and %27 as compile-time immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: report an invalid control path instead of issuing the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x32 TN F16+=E4M3*E4M3 (SS form: A and B are both passed as 64-bit operands, desc_a and desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x48x32_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operand; fma() reads and writes the d## accumulators in place
+  using ARegisters = uint64_t[1];  // A: one 64-bit value (desc_a)
+  using BRegisters = uint64_t[1];  // B: one 64-bit value (desc_b)
+  using CRegisters = uint32_t[12];  // accumulator: twelve 32-bit registers (d00..d11); NOTE(review): presumably packed f16 pairs - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %14, 0;\n"  // predicate p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n48k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      " %12,"
+      " %13,"
+      " p,   %15, %16;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x32 TN F16+=E4M3*E4M3 (RS form: A is passed in four 32-bit registers, B as the 64-bit desc_b operand)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x48x32_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operand; fma() reads and writes the d## accumulators in place
+  using ARegisters = uint32_t[4];  // A: four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];  // B: one 64-bit value (desc_b)
+  using CRegisters = uint32_t[12];  // accumulator: twelve 32-bit registers (d00..d11); NOTE(review): presumably packed f16 pairs - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %17, 0;\n"  // predicate p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n48k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      "{%12, %13, %14, %15},"
+      " %16,"
+      " p,   %18, %19;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x32 TN F32+=E4M3*E4M3 (SS form: A and B are both passed as 64-bit operands, desc_a and desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x48x32_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operand; fma() reads and writes the d## accumulators in place
+  using ARegisters = uint64_t[1];  // A: one 64-bit value (desc_a)
+  using BRegisters = uint64_t[1];  // B: one 64-bit value (desc_b)
+  using CRegisters = float[24];  // accumulator: 24 floats (d00..d23)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %26, 0;\n"  // predicate p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n48k32.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      " %24,"
+      " %25,"
+      " p,   %27, %28;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x32 TN F32+=E4M3*E4M3 (RS form: A is passed in four 32-bit registers, B as the 64-bit desc_b operand)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x48x32_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operand; fma() reads and writes the d## accumulators in place
+  using ARegisters = uint32_t[4];  // A: four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];  // B: one 64-bit value (desc_b)
+  using CRegisters = float[24];  // accumulator: 24 floats (d00..d23)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %29, 0;\n"  // predicate p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n48k32.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      "{%24, %25, %26, %27},"
+      " %28,"
+      " p,   %30, %31;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x56x32 TN F16+=E4M3*E4M3 (SS form: A and B are both passed as 64-bit operands, desc_a and desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x56x32_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operand; fma() reads and writes the d## accumulators in place
+  using ARegisters = uint64_t[1];  // A: one 64-bit value (desc_a)
+  using BRegisters = uint64_t[1];  // B: one 64-bit value (desc_b)
+  using CRegisters = uint32_t[14];  // accumulator: fourteen 32-bit registers (d00..d13); NOTE(review): presumably packed f16 pairs - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %16, 0;\n"  // predicate p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n56k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13},"
+      " %14,"
+      " %15,"
+      " p,   %17, %18;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x56x32 TN F16+=E4M3*E4M3 (RS form: A is passed in four 32-bit registers, B as the 64-bit desc_b operand)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x56x32_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operand; fma() reads and writes the d## accumulators in place
+  using ARegisters = uint32_t[4];  // A: four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];  // B: one 64-bit value (desc_b)
+  using CRegisters = uint32_t[14];  // accumulator: fourteen 32-bit registers (d00..d13); NOTE(review): presumably packed f16 pairs - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %19, 0;\n"  // predicate p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n56k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13},"
+      "{%14, %15, %16, %17},"
+      " %18,"
+      " p,   %20, %21;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x56x32 TN F32+=E4M3*E4M3 (SS form: A and B are both passed as 64-bit operands, desc_a and desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x56x32_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operand; fma() reads and writes the d## accumulators in place
+  using ARegisters = uint64_t[1];  // A: one 64-bit value (desc_a)
+  using BRegisters = uint64_t[1];  // B: one 64-bit value (desc_b)
+  using CRegisters = float[28];  // accumulator: 28 floats (d00..d27)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %30, 0;\n"  // predicate p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n56k32.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"
+      " %28,"
+      " %29,"
+      " p,   %31, %32;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x56x32 TN F32+=E4M3*E4M3 (RS form: A is passed in four 32-bit registers, B as the 64-bit desc_b operand)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x56x32_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operand; fma() reads and writes the d## accumulators in place
+  using ARegisters = uint32_t[4];  // A: four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];  // B: one 64-bit value (desc_b)
+  using CRegisters = float[28];  // accumulator: 28 floats (d00..d27)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %33, 0;\n"  // predicate p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n56k32.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"
+      "{%28, %29, %30, %31},"
+      " %32,"
+      " p,   %34, %35;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x72x32 TN F16+=E4M3*E4M3 (SS form: A and B are both passed as 64-bit operands, desc_a and desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x72x32_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operand; fma() reads and writes the d## accumulators in place
+  using ARegisters = uint64_t[1];  // A: one 64-bit value (desc_a)
+  using BRegisters = uint64_t[1];  // B: one 64-bit value (desc_b)
+  using CRegisters = uint32_t[18];  // accumulator: eighteen 32-bit registers (d00..d17); NOTE(review): presumably packed f16 pairs - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %20, 0;\n"  // predicate p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n72k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17},"
+      " %18,"
+      " %19,"
+      " p,   %21, %22;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x72x32 TN F16+=E4M3*E4M3 (RS form: A is passed in four 32-bit registers, B as the 64-bit desc_b operand)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x72x32_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operand; fma() reads and writes the d## accumulators in place
+  using ARegisters = uint32_t[4];  // A: four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];  // B: one 64-bit value (desc_b)
+  using CRegisters = uint32_t[18];  // accumulator: eighteen 32-bit registers (d00..d17); NOTE(review): presumably packed f16 pairs - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %23, 0;\n"  // predicate p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n72k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17},"
+      "{%18, %19, %20, %21},"
+      " %22,"
+      " p,   %24, %25;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x72x32 TN F32+=E4M3*E4M3 (SS form: A and B are both passed as 64-bit operands, desc_a and desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x72x32_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operand; fma() reads and writes the d## accumulators in place
+  using ARegisters = uint64_t[1];  // A: one 64-bit value (desc_a)
+  using BRegisters = uint64_t[1];  // B: one 64-bit value (desc_b)
+  using CRegisters = float[36];  // accumulator: 36 floats (d00..d35)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %38, 0;\n"  // predicate p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n72k32.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"
+      " %36,"
+      " %37,"
+      " p,   %39, %40;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x72x32 TN F32+=E4M3*E4M3 (RS form: A is passed in four 32-bit registers, B as the 64-bit desc_b operand)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x72x32_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operand; fma() reads and writes the d## accumulators in place
+  using ARegisters = uint32_t[4];  // A: four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];  // B: one 64-bit value (desc_b)
+  using CRegisters = float[36];  // accumulator: 36 floats (d00..d35)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %41, 0;\n"  // predicate p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n72k32.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"
+      "{%36, %37, %38, %39},"
+      " %40,"
+      " p,   %42, %43;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x32 TN F16+=E4M3*E4M3 (SS form: A and B are both passed as 64-bit operands, desc_a and desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x80x32_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operand; fma() reads and writes the d## accumulators in place
+  using ARegisters = uint64_t[1];  // A: one 64-bit value (desc_a)
+  using BRegisters = uint64_t[1];  // B: one 64-bit value (desc_b)
+  using CRegisters = uint32_t[20];  // accumulator: twenty 32-bit registers (d00..d19); NOTE(review): presumably packed f16 pairs - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %22, 0;\n"  // predicate p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n80k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"
+      " %20,"
+      " %21,"
+      " p,   %23, %24;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x32 TN F16+=E4M3*E4M3 (RS form: A is passed in four 32-bit registers, B as the 64-bit desc_b operand)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x80x32_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operand; fma() reads and writes the d## accumulators in place
+  using ARegisters = uint32_t[4];  // A: four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];  // B: one 64-bit value (desc_b)
+  using CRegisters = uint32_t[20];  // accumulator: twenty 32-bit registers (d00..d19); NOTE(review): presumably packed f16 pairs - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %25, 0;\n"  // predicate p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n80k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"
+      "{%20, %21, %22, %23},"
+      " %24,"
+      " p,   %26, %27;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x32 TN F32+=E4M3*E4M3 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; passed to the instruction as an immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time; passed to the instruction as an immediate ("n")
+>
+struct MMA_64x80x32_F32E4M3E4M3_SS_TN  // wraps wgmma.mma_async m64n80k32 .f32.e4m3.e4m3
+{
+  using DRegisters = void;         // fma() takes no separate D operands; d00..d39 are updated in place
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[40];    // d00..d39
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction; d00..d39 are both read and written ("+f")
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; a nonzero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this line number and both descriptors
+    asm volatile(  // only the wgmma itself is emitted; no fence/commit/wait is issued in this function
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %42, 0;\n"  // %42 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n80k32.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"  // %0-%39 = d00..d39
+      " %40,"  // desc_a
+      " %41,"  // desc_b
+      " p,   %43, %44;\n"  // predicate p, then immediates %43 = scaleA, %44 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0-%39: read-write float registers
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
+      :  "l"(desc_a),  // inputs begin: %40
+         "l"(desc_b),  // %41
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %42, %43, %44
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x32 TN F32+=E4M3*E4M3 (RS form: A is passed as four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; passed to the instruction as an immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time; passed to the instruction as an immediate ("n")
+>
+struct MMA_64x80x32_F32E4M3E4M3_RS_TN  // wraps wgmma.mma_async m64n80k32 .f32.e4m3.e4m3
+{
+  using DRegisters = void;         // fma() takes no separate D operands; d00..d39 are updated in place
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[40];    // d00..d39
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction; d00..d39 are both read and written ("+f")
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; a nonzero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this line number and desc_b
+    asm volatile(  // only the wgmma itself is emitted; no fence/commit/wait is issued in this function
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %45, 0;\n"  // %45 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n80k32.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"  // %0-%39 = d00..d39
+      "{%40, %41, %42, %43},"  // a00..a03
+      " %44,"  // desc_b
+      " p,   %46, %47;\n"  // predicate p, then immediates %46 = scaleA, %47 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0-%39: read-write float registers
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs begin: %40-%43
+         "l"(desc_b),  // %44
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %45, %46, %47
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x88x32 TN F16+=E4M3*E4M3 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; passed to the instruction as an immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time; passed to the instruction as an immediate ("n")
+>
+struct MMA_64x88x32_F16E4M3E4M3_SS_TN  // wraps wgmma.mma_async m64n88k32 .f16.e4m3.e4m3
+{
+  using DRegisters = void;          // fma() takes no separate D operands; d00..d21 are updated in place
+  using ARegisters = uint64_t[1];   // desc_a
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = uint32_t[22];  // d00..d21 (32-bit regs; presumably two packed f16 each - TODO confirm)
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction; d00..d21 are both read and written ("+r")
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; a nonzero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this line number and both descriptors
+    asm volatile(  // only the wgmma itself is emitted; no fence/commit/wait is issued in this function
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %24, 0;\n"  // %24 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n88k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21},"  // %0-%21 = d00..d21
+      " %22,"  // desc_a
+      " %23,"  // desc_b
+      " p,   %25, %26;\n"  // predicate p, then immediates %25 = scaleA, %26 = scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0-%21: read-write 32-bit registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21)
+      :  "l"(desc_a),  // inputs begin: %22
+         "l"(desc_b),  // %23
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %24, %25, %26
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x88x32 TN F16+=E4M3*E4M3 (RS form: A is passed as four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; passed to the instruction as an immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time; passed to the instruction as an immediate ("n")
+>
+struct MMA_64x88x32_F16E4M3E4M3_RS_TN  // wraps wgmma.mma_async m64n88k32 .f16.e4m3.e4m3
+{
+  using DRegisters = void;          // fma() takes no separate D operands; d00..d21 are updated in place
+  using ARegisters = uint32_t[4];   // a00..a03
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = uint32_t[22];  // d00..d21 (32-bit regs; presumably two packed f16 each - TODO confirm)
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction; d00..d21 are both read and written ("+r")
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; a nonzero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this line number and desc_b
+    asm volatile(  // only the wgmma itself is emitted; no fence/commit/wait is issued in this function
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %27, 0;\n"  // %27 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n88k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21},"  // %0-%21 = d00..d21
+      "{%22, %23, %24, %25},"  // a00..a03
+      " %26,"  // desc_b
+      " p,   %28, %29;\n"  // predicate p, then immediates %28 = scaleA, %29 = scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0-%21: read-write 32-bit registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs begin: %22-%25
+         "l"(desc_b),  // %26
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %27, %28, %29
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x88x32 TN F32+=E4M3*E4M3 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; passed to the instruction as an immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time; passed to the instruction as an immediate ("n")
+>
+struct MMA_64x88x32_F32E4M3E4M3_SS_TN  // wraps wgmma.mma_async m64n88k32 .f32.e4m3.e4m3
+{
+  using DRegisters = void;         // fma() takes no separate D operands; d00..d43 are updated in place
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[44];    // d00..d43
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction; d00..d43 are both read and written ("+f")
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; a nonzero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this line number and both descriptors
+    asm volatile(  // only the wgmma itself is emitted; no fence/commit/wait is issued in this function
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %46, 0;\n"  // %46 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n88k32.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"  // %0-%43 = d00..d43
+      " %44,"  // desc_a
+      " %45,"  // desc_b
+      " p,   %47, %48;\n"  // predicate p, then immediates %47 = scaleA, %48 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0-%43: read-write float registers
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
+      :  "l"(desc_a),  // inputs begin: %44
+         "l"(desc_b),  // %45
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %46, %47, %48
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x88x32 TN F32+=E4M3*E4M3 (RS form: A is passed as four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; passed to the instruction as an immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time; passed to the instruction as an immediate ("n")
+>
+struct MMA_64x88x32_F32E4M3E4M3_RS_TN  // wraps wgmma.mma_async m64n88k32 .f32.e4m3.e4m3
+{
+  using DRegisters = void;         // fma() takes no separate D operands; d00..d43 are updated in place
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[44];    // d00..d43
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction; d00..d43 are both read and written ("+f")
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; a nonzero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this line number and desc_b
+    asm volatile(  // only the wgmma itself is emitted; no fence/commit/wait is issued in this function
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %49, 0;\n"  // %49 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n88k32.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"  // %0-%43 = d00..d43
+      "{%44, %45, %46, %47},"  // a00..a03
+      " %48,"  // desc_b
+      " p,   %50, %51;\n"  // predicate p, then immediates %50 = scaleA, %51 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0-%43: read-write float registers
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs begin: %44-%47
+         "l"(desc_b),  // %48
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %49, %50, %51
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x104x32 TN F16+=E4M3*E4M3 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; passed to the instruction as an immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time; passed to the instruction as an immediate ("n")
+>
+struct MMA_64x104x32_F16E4M3E4M3_SS_TN  // wraps wgmma.mma_async m64n104k32 .f16.e4m3.e4m3
+{
+  using DRegisters = void;          // fma() takes no separate D operands; d00..d25 are updated in place
+  using ARegisters = uint64_t[1];   // desc_a
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = uint32_t[26];  // d00..d25 (32-bit regs; presumably two packed f16 each - TODO confirm)
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction; d00..d25 are both read and written ("+r")
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; a nonzero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this line number and both descriptors
+    asm volatile(  // only the wgmma itself is emitted; no fence/commit/wait is issued in this function
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %28, 0;\n"  // %28 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n104k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25},"  // %0-%25 = d00..d25
+      " %26,"  // desc_a
+      " %27,"  // desc_b
+      " p,   %29, %30;\n"  // predicate p, then immediates %29 = scaleA, %30 = scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0-%25: read-write 32-bit registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25)
+      :  "l"(desc_a),  // inputs begin: %26
+         "l"(desc_b),  // %27
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %28, %29, %30
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x104x32 TN F16+=E4M3*E4M3 (RS form: A is passed as four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; passed to the instruction as an immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time; passed to the instruction as an immediate ("n")
+>
+struct MMA_64x104x32_F16E4M3E4M3_RS_TN  // wraps wgmma.mma_async m64n104k32 .f16.e4m3.e4m3
+{
+  using DRegisters = void;          // fma() takes no separate D operands; d00..d25 are updated in place
+  using ARegisters = uint32_t[4];   // a00..a03
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = uint32_t[26];  // d00..d25 (32-bit regs; presumably two packed f16 each - TODO confirm)
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction; d00..d25 are both read and written ("+r")
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; a nonzero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this line number and desc_b
+    asm volatile(  // only the wgmma itself is emitted; no fence/commit/wait is issued in this function
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %31, 0;\n"  // %31 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n104k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25},"  // %0-%25 = d00..d25
+      "{%26, %27, %28, %29},"  // a00..a03
+      " %30,"  // desc_b
+      " p,   %32, %33;\n"  // predicate p, then immediates %32 = scaleA, %33 = scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0-%25: read-write 32-bit registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs begin: %26-%29
+         "l"(desc_b),  // %30
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %31, %32, %33
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x104x32 TN F32+=E4M3*E4M3 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; passed to the instruction as an immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time; passed to the instruction as an immediate ("n")
+>
+struct MMA_64x104x32_F32E4M3E4M3_SS_TN  // wraps wgmma.mma_async m64n104k32 .f32.e4m3.e4m3
+{
+  using DRegisters = void;         // fma() takes no separate D operands; d00..d51 are updated in place
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[52];    // d00..d51
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction; d00..d51 are both read and written ("+f")
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; a nonzero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this line number and both descriptors
+    asm volatile(  // only the wgmma itself is emitted; no fence/commit/wait is issued in this function
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %54, 0;\n"  // %54 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n104k32.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"  // %0-%51 = d00..d51
+      " %52,"  // desc_a
+      " %53,"  // desc_b
+      " p,    %55,  %56;\n"  // predicate p, then immediates %55 = scaleA, %56 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0-%51: read-write float registers
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
+      :  "l"(desc_a),  // inputs begin: %52
+         "l"(desc_b),  // %53
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %54, %55, %56
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x104x32 TN F32+=E4M3*E4M3 (RS form: A is passed as four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; passed to the instruction as an immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time; passed to the instruction as an immediate ("n")
+>
+struct MMA_64x104x32_F32E4M3E4M3_RS_TN  // wraps wgmma.mma_async m64n104k32 .f32.e4m3.e4m3
+{
+  using DRegisters = void;         // fma() takes no separate D operands; d00..d51 are updated in place
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[52];    // d00..d51
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction; d00..d51 are both read and written ("+f")
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; a nonzero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this line number and desc_b
+    asm volatile(  // only the wgmma itself is emitted; no fence/commit/wait is issued in this function
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %57, 0;\n"  // %57 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n104k32.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"  // %0-%51 = d00..d51
+      "{%52,  %53,  %54,  %55},"  // a00..a03
+      " %56,"  // desc_b
+      " p,    %58,  %59;\n"  // predicate p, then immediates %58 = scaleA, %59 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0-%51: read-write float registers
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs begin: %52-%55
+         "l"(desc_b),  // %56
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %57, %58, %59
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x32 TN F16+=E4M3*E4M3 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; passed to the instruction as an immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time; passed to the instruction as an immediate ("n")
+>
+struct MMA_64x112x32_F16E4M3E4M3_SS_TN  // wraps wgmma.mma_async m64n112k32 .f16.e4m3.e4m3
+{
+  using DRegisters = void;          // fma() takes no separate D operands; d00..d27 are updated in place
+  using ARegisters = uint64_t[1];   // desc_a
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = uint32_t[28];  // d00..d27 (32-bit regs; presumably two packed f16 each - TODO confirm)
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction; d00..d27 are both read and written ("+r")
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; a nonzero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this line number and both descriptors
+    asm volatile(  // only the wgmma itself is emitted; no fence/commit/wait is issued in this function
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %30, 0;\n"  // %30 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n112k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"  // %0-%27 = d00..d27
+      " %28,"  // desc_a
+      " %29,"  // desc_b
+      " p,   %31, %32;\n"  // predicate p, then immediates %31 = scaleA, %32 = scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0-%27: read-write 32-bit registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
+      :  "l"(desc_a),  // inputs begin: %28
+         "l"(desc_b),  // %29
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %30, %31, %32
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x32 TN F16+=E4M3*E4M3 (RS form: A is passed as four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; passed to the instruction as an immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time; passed to the instruction as an immediate ("n")
+>
+struct MMA_64x112x32_F16E4M3E4M3_RS_TN  // wraps wgmma.mma_async m64n112k32 .f16.e4m3.e4m3
+{
+  using DRegisters = void;          // fma() takes no separate D operands; d00..d27 are updated in place
+  using ARegisters = uint32_t[4];   // a00..a03
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = uint32_t[28];  // d00..d27 (32-bit regs; presumably two packed f16 each - TODO confirm)
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction; d00..d27 are both read and written ("+r")
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; a nonzero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this line number and desc_b
+    asm volatile(  // only the wgmma itself is emitted; no fence/commit/wait is issued in this function
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %33, 0;\n"  // %33 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n112k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"  // %0-%27 = d00..d27
+      "{%28, %29, %30, %31},"  // a00..a03
+      " %32,"  // desc_b
+      " p,   %34, %35;\n"  // predicate p, then immediates %34 = scaleA, %35 = scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0-%27: read-write 32-bit registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs begin: %28-%31
+         "l"(desc_b),  // %32
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %33, %34, %35
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x32 TN F32+=E4M3*E4M3 (SS form: A and B are each passed as one 64-bit descriptor; wraps wgmma.mma_async m64n112k32.f32.e4m3.e4m3)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x112x32_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches desc_a of fma()
+  using BRegisters = uint64_t[1];  // matches desc_b of fma()
+  using CRegisters = float[56];  // matches d00..d55 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // predicate p in the asm below is set iff int32_t(scale_D) != 0
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): presumably debug/trace logging only - confirm; receives this source line number
+    asm volatile(  // operand map: %0-%55 = d00-d55 (read-write), %56 = desc_a, %57 = desc_b, %58 = scale_D, %59/%60 = scaleA/scaleB immediates
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %58, 0;\n"
+      "wgmma.mma_async.sync.aligned.m64n112k32.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      " %56,"
+      " %57,"
+      " p,    %59,  %60;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x32 TN F32+=E4M3*E4M3 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor; wraps wgmma.mma_async m64n112k32.f32.e4m3.e4m3)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x112x32_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches a00..a03 of fma()
+  using BRegisters = uint64_t[1];  // matches desc_b of fma()
+  using CRegisters = float[56];  // matches d00..d55 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // predicate p in the asm below is set iff int32_t(scale_D) != 0
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably debug/trace logging only - confirm; receives this source line number
+    asm volatile(  // operand map: %0-%55 = d00-d55 (read-write), %56-%59 = a00-a03, %60 = desc_b, %61 = scale_D, %62/%63 = scaleA/scaleB immediates
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %61, 0;\n"
+      "wgmma.mma_async.sync.aligned.m64n112k32.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      "{%56,  %57,  %58,  %59},"
+      " %60,"
+      " p,    %62,  %63;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x120x32 TN F16+=E4M3*E4M3 (SS form: A and B are each passed as one 64-bit descriptor; wraps wgmma.mma_async m64n120k32.f16.e4m3.e4m3)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x120x32_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches desc_a of fma()
+  using BRegisters = uint64_t[1];  // matches desc_b of fma()
+  using CRegisters = uint32_t[30];  // matches d00..d29 of fma(); NOTE(review): presumably two F16 values per register - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // predicate p in the asm below is set iff int32_t(scale_D) != 0
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): presumably debug/trace logging only - confirm; receives this source line number
+    asm volatile(  // operand map: %0-%29 = d00-d29 (read-write), %30 = desc_a, %31 = desc_b, %32 = scale_D, %33/%34 = scaleA/scaleB immediates
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %32, 0;\n"
+      "wgmma.mma_async.sync.aligned.m64n120k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29},"
+      " %30,"
+      " %31,"
+      " p,   %33, %34;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x120x32 TN F16+=E4M3*E4M3 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor; wraps wgmma.mma_async m64n120k32.f16.e4m3.e4m3)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x120x32_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches a00..a03 of fma()
+  using BRegisters = uint64_t[1];  // matches desc_b of fma()
+  using CRegisters = uint32_t[30];  // matches d00..d29 of fma(); NOTE(review): presumably two F16 values per register - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // predicate p in the asm below is set iff int32_t(scale_D) != 0
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably debug/trace logging only - confirm; receives this source line number
+    asm volatile(  // operand map: %0-%29 = d00-d29 (read-write), %30-%33 = a00-a03, %34 = desc_b, %35 = scale_D, %36/%37 = scaleA/scaleB immediates
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %35, 0;\n"
+      "wgmma.mma_async.sync.aligned.m64n120k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29},"
+      "{%30, %31, %32, %33},"
+      " %34,"
+      " p,   %36, %37;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x120x32 TN F32+=E4M3*E4M3 (SS form: A and B are each passed as one 64-bit descriptor; wraps wgmma.mma_async m64n120k32.f32.e4m3.e4m3)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x120x32_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches desc_a of fma()
+  using BRegisters = uint64_t[1];  // matches desc_b of fma()
+  using CRegisters = float[60];  // matches d00..d59 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // predicate p in the asm below is set iff int32_t(scale_D) != 0
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): presumably debug/trace logging only - confirm; receives this source line number
+    asm volatile(  // operand map: %0-%59 = d00-d59 (read-write), %60 = desc_a, %61 = desc_b, %62 = scale_D, %63/%64 = scaleA/scaleB immediates
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %62, 0;\n"
+      "wgmma.mma_async.sync.aligned.m64n120k32.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"
+      " %60,"
+      " %61,"
+      " p,    %63,  %64;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x120x32 TN F32+=E4M3*E4M3 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor; wraps wgmma.mma_async m64n120k32.f32.e4m3.e4m3)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x120x32_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches a00..a03 of fma()
+  using BRegisters = uint64_t[1];  // matches desc_b of fma()
+  using CRegisters = float[60];  // matches d00..d59 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // predicate p in the asm below is set iff int32_t(scale_D) != 0
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably debug/trace logging only - confirm; receives this source line number
+    asm volatile(  // operand map: %0-%59 = d00-d59 (read-write), %60-%63 = a00-a03, %64 = desc_b, %65 = scale_D, %66/%67 = scaleA/scaleB immediates
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %65, 0;\n"
+      "wgmma.mma_async.sync.aligned.m64n120k32.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"
+      "{%60,  %61,  %62,  %63},"
+      " %64,"
+      " p,    %66,  %67;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x136x32 TN F16+=E4M3*E4M3 (SS form: A and B are each passed as one 64-bit descriptor; wraps wgmma.mma_async m64n136k32.f16.e4m3.e4m3)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x136x32_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches desc_a of fma()
+  using BRegisters = uint64_t[1];  // matches desc_b of fma()
+  using CRegisters = uint32_t[34];  // matches d00..d33 of fma(); NOTE(review): presumably two F16 values per register - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // predicate p in the asm below is set iff int32_t(scale_D) != 0
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): presumably debug/trace logging only - confirm; receives this source line number
+    asm volatile(  // operand map: %0-%33 = d00-d33 (read-write), %34 = desc_a, %35 = desc_b, %36 = scale_D, %37/%38 = scaleA/scaleB immediates
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %36, 0;\n"
+      "wgmma.mma_async.sync.aligned.m64n136k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33},"
+      " %34,"
+      " %35,"
+      " p,   %37, %38;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x136x32 TN F16+=E4M3*E4M3 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor; wraps wgmma.mma_async m64n136k32.f16.e4m3.e4m3)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x136x32_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches a00..a03 of fma()
+  using BRegisters = uint64_t[1];  // matches desc_b of fma()
+  using CRegisters = uint32_t[34];  // matches d00..d33 of fma(); NOTE(review): presumably two F16 values per register - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // predicate p in the asm below is set iff int32_t(scale_D) != 0
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably debug/trace logging only - confirm; receives this source line number
+    asm volatile(  // operand map: %0-%33 = d00-d33 (read-write), %34-%37 = a00-a03, %38 = desc_b, %39 = scale_D, %40/%41 = scaleA/scaleB immediates
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %39, 0;\n"
+      "wgmma.mma_async.sync.aligned.m64n136k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33},"
+      "{%34, %35, %36, %37},"
+      " %38,"
+      " p,   %40, %41;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x136x32 TN F32+=E4M3*E4M3 (SS form: A and B are each passed as one 64-bit descriptor; wraps wgmma.mma_async m64n136k32.f32.e4m3.e4m3)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x136x32_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches desc_a of fma()
+  using BRegisters = uint64_t[1];  // matches desc_b of fma()
+  using CRegisters = float[68];  // matches d00..d67 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // predicate p in the asm below is set iff int32_t(scale_D) != 0
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): presumably debug/trace logging only - confirm; receives this source line number
+    asm volatile(  // operand map: %0-%67 = d00-d67 (read-write), %68 = desc_a, %69 = desc_b, %70 = scale_D, %71/%72 = scaleA/scaleB immediates
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %70, 0;\n"
+      "wgmma.mma_async.sync.aligned.m64n136k32.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67},"
+      " %68,"
+      " %69,"
+      " p,    %71,  %72;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x136x32 TN F32+=E4M3*E4M3 -- RS variant: A is passed as four 32-bit register operands (a00..a03), B as one 64-bit operand (desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // bound below as an "n" (immediate) asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // bound below as an "n" (immediate) asm operand
+>
+struct MMA_64x136x32_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03 in fma
+  using BRegisters = uint64_t[1];  // desc_b in fma
+  using CRegisters = float[68];    // d00..d67 in fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline asm is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): helper defined outside this chunk; presumably a debug log hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %73, 0;\n"  // p = (%73 != 0); %73 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n136k32.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%67: accumulators d00..d67
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67},"
+      "{%68,  %69,  %70,  %71},"  // %68-%71: a00..a03
+      " %72,"  // %72: desc_b
+      " p,    %74,  %75;\n"  // predicate p, then %74 = scaleA, %75 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is a read-write operand
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // without the macro, any call reaches CUTE_INVALID_CONTROL_PATH with the message below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x32 TN F16+=E4M3*E4M3 -- SS variant: A and B are each passed as one 64-bit operand (desc_a, desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // bound below as an "n" (immediate) asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // bound below as an "n" (immediate) asm operand
+>
+struct MMA_64x144x32_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];   // desc_a in fma
+  using BRegisters = uint64_t[1];   // desc_b in fma
+  using CRegisters = uint32_t[36];  // d00..d35 in fma; NOTE(review): presumably two f16 values packed per 32-bit register -- confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline asm is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): helper defined outside this chunk; presumably a debug log hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %38, 0;\n"  // p = (%38 != 0); %38 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n144k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%35: accumulators d00..d35
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"
+      " %36,"  // %36: desc_a
+      " %37,"  // %37: desc_b
+      " p,   %39, %40;\n"  // predicate p, then %39 = scaleA, %40 = scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": every accumulator is a read-write operand
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // without the macro, any call reaches CUTE_INVALID_CONTROL_PATH with the message below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x32 TN F16+=E4M3*E4M3 -- RS variant: A is passed as four 32-bit register operands (a00..a03), B as one 64-bit operand (desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // bound below as an "n" (immediate) asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // bound below as an "n" (immediate) asm operand
+>
+struct MMA_64x144x32_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];   // a00..a03 in fma
+  using BRegisters = uint64_t[1];   // desc_b in fma
+  using CRegisters = uint32_t[36];  // d00..d35 in fma; NOTE(review): presumably two f16 values packed per 32-bit register -- confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline asm is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): helper defined outside this chunk; presumably a debug log hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %41, 0;\n"  // p = (%41 != 0); %41 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n144k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%35: accumulators d00..d35
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"
+      "{%36, %37, %38, %39},"  // %36-%39: a00..a03
+      " %40,"  // %40: desc_b
+      " p,   %42, %43;\n"  // predicate p, then %42 = scaleA, %43 = scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": every accumulator is a read-write operand
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // without the macro, any call reaches CUTE_INVALID_CONTROL_PATH with the message below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x32 TN F32+=E4M3*E4M3 -- SS variant: A and B are each passed as one 64-bit operand (desc_a, desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // bound below as an "n" (immediate) asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // bound below as an "n" (immediate) asm operand
+>
+struct MMA_64x144x32_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a in fma
+  using BRegisters = uint64_t[1];  // desc_b in fma
+  using CRegisters = float[72];    // d00..d71 in fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline asm is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): helper defined outside this chunk; presumably a debug log hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %74, 0;\n"  // p = (%74 != 0); %74 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n144k32.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%71: accumulators d00..d71
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      " %72,"  // %72: desc_a
+      " %73,"  // %73: desc_b
+      " p,    %75,  %76;\n"  // predicate p, then %75 = scaleA, %76 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is a read-write operand
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // without the macro, any call reaches CUTE_INVALID_CONTROL_PATH with the message below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x32 TN F32+=E4M3*E4M3 -- RS variant: A is passed as four 32-bit register operands (a00..a03), B as one 64-bit operand (desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // bound below as an "n" (immediate) asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // bound below as an "n" (immediate) asm operand
+>
+struct MMA_64x144x32_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03 in fma
+  using BRegisters = uint64_t[1];  // desc_b in fma
+  using CRegisters = float[72];    // d00..d71 in fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline asm is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): helper defined outside this chunk; presumably a debug log hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %77, 0;\n"  // p = (%77 != 0); %77 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n144k32.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%71: accumulators d00..d71
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      "{%72,  %73,  %74,  %75},"  // %72-%75: a00..a03
+      " %76,"  // %76: desc_b
+      " p,    %78,  %79;\n"  // predicate p, then %78 = scaleA, %79 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is a read-write operand
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // without the macro, any call reaches CUTE_INVALID_CONTROL_PATH with the message below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x152x32 TN F16+=E4M3*E4M3 -- SS variant: A and B are each passed as one 64-bit operand (desc_a, desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // bound below as an "n" (immediate) asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // bound below as an "n" (immediate) asm operand
+>
+struct MMA_64x152x32_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];   // desc_a in fma
+  using BRegisters = uint64_t[1];   // desc_b in fma
+  using CRegisters = uint32_t[38];  // d00..d37 in fma; NOTE(review): presumably two f16 values packed per 32-bit register -- confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline asm is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): helper defined outside this chunk; presumably a debug log hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %40, 0;\n"  // p = (%40 != 0); %40 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n152k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%37: accumulators d00..d37
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37},"
+      " %38,"  // %38: desc_a
+      " %39,"  // %39: desc_b
+      " p,   %41, %42;\n"  // predicate p, then %41 = scaleA, %42 = scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": every accumulator is a read-write operand
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // without the macro, any call reaches CUTE_INVALID_CONTROL_PATH with the message below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x152x32 TN F16+=E4M3*E4M3 -- RS variant: A is passed as four 32-bit register operands (a00..a03), B as one 64-bit operand (desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // bound below as an "n" (immediate) asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // bound below as an "n" (immediate) asm operand
+>
+struct MMA_64x152x32_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];   // a00..a03 in fma
+  using BRegisters = uint64_t[1];   // desc_b in fma
+  using CRegisters = uint32_t[38];  // d00..d37 in fma; NOTE(review): presumably two f16 values packed per 32-bit register -- confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline asm is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): helper defined outside this chunk; presumably a debug log hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %43, 0;\n"  // p = (%43 != 0); %43 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n152k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%37: accumulators d00..d37
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37},"
+      "{%38, %39, %40, %41},"  // %38-%41: a00..a03
+      " %42,"  // %42: desc_b
+      " p,   %44, %45;\n"  // predicate p, then %44 = scaleA, %45 = scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": every accumulator is a read-write operand
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // without the macro, any call reaches CUTE_INVALID_CONTROL_PATH with the message below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x152x32 TN F32+=E4M3*E4M3 -- SS variant: A and B are each passed as one 64-bit operand (desc_a, desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // bound below as an "n" (immediate) asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // bound below as an "n" (immediate) asm operand
+>
+struct MMA_64x152x32_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a in fma
+  using BRegisters = uint64_t[1];  // desc_b in fma
+  using CRegisters = float[76];    // d00..d75 in fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline asm is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): helper defined outside this chunk; presumably a debug log hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %78, 0;\n"  // p = (%78 != 0); %78 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n152k32.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%75: accumulators d00..d75
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75},"
+      " %76,"  // %76: desc_a
+      " %77,"  // %77: desc_b
+      " p,    %79,  %80;\n"  // predicate p, then %79 = scaleA, %80 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is a read-write operand
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // without the macro, any call reaches CUTE_INVALID_CONTROL_PATH with the message below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x152x32 TN F32+=E4M3*E4M3 -- RS variant: A is passed as four 32-bit register operands (a00..a03), B as one 64-bit operand (desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // bound below as an "n" (immediate) asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // bound below as an "n" (immediate) asm operand
+>
+struct MMA_64x152x32_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03 in fma
+  using BRegisters = uint64_t[1];  // desc_b in fma
+  using CRegisters = float[76];    // d00..d75 in fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the inline asm is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): helper defined outside this chunk; presumably a debug log hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %81, 0;\n"  // p = (%81 != 0); %81 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n152k32.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%75: accumulators d00..d75
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75},"
+      "{%76,  %77,  %78,  %79},"  // %76-%79: a00..a03
+      " %80,"  // %80: desc_b
+      " p,    %82,  %83;\n"  // predicate p, then %82 = scaleA, %83 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is a read-write operand
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // without the macro, any call reaches CUTE_INVALID_CONTROL_PATH with the message below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x32 TN F16+=E4M3*E4M3 (SS form: A and B are the 64-bit desc_a/desc_b operands; 40 uint32_t accumulator registers)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x160x32_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D operands; d00..d39 are read-write ("+r") in the asm below
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[40];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %42, 0;\n"  // %42 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n160k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      " %40,"  // %40 = desc_a
+      " %41,"  // %41 = desc_b
+      " p,   %43, %44;\n"  // %43 = scaleA, %44 = scaleB ("n" immediates taken from the template arguments)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x32 TN F16+=E4M3*E4M3 (RS form: A is four 32-bit registers a00..a03, B is the 64-bit desc_b operand; 40 uint32_t accumulator registers)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x160x32_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D operands; d00..d39 are read-write ("+r") in the asm below
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[40];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %45, 0;\n"  // %45 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n160k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      "{%40, %41, %42, %43},"  // %40..%43 = a00..a03
+      " %44,"  // %44 = desc_b
+      " p,   %46, %47;\n"  // %46 = scaleA, %47 = scaleB ("n" immediates taken from the template arguments)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x32 TN F32+=E4M3*E4M3 (SS form: A and B are the 64-bit desc_a/desc_b operands; 80 float accumulator registers)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x160x32_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D operands; d00..d79 are read-write ("+f") in the asm below
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[80];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %82, 0;\n"  // %82 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n160k32.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      " %80,"  // %80 = desc_a
+      " %81,"  // %81 = desc_b
+      " p,    %83,  %84;\n"  // %83 = scaleA, %84 = scaleB ("n" immediates taken from the template arguments)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x32 TN F32+=E4M3*E4M3 (RS form: A is four 32-bit registers a00..a03, B is the 64-bit desc_b operand; 80 float accumulator registers)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x160x32_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D operands; d00..d79 are read-write ("+f") in the asm below
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[80];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %85, 0;\n"  // %85 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n160k32.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      "{%80,  %81,  %82,  %83},"  // %80..%83 = a00..a03
+      " %84,"  // %84 = desc_b
+      " p,    %86,  %87;\n"  // %86 = scaleA, %87 = scaleB ("n" immediates taken from the template arguments)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x168x32 TN F16+=E4M3*E4M3 (SS form: A and B are the 64-bit desc_a/desc_b operands; 42 uint32_t accumulator registers)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x168x32_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D operands; d00..d41 are read-write ("+r") in the asm below
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[42];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %44, 0;\n"  // %44 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n168k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41},"
+      " %42,"  // %42 = desc_a
+      " %43,"  // %43 = desc_b
+      " p,   %45, %46;\n"  // %45 = scaleA, %46 = scaleB ("n" immediates taken from the template arguments)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x168x32 TN F16+=E4M3*E4M3 (RS form: A is four 32-bit registers a00..a03, B is the 64-bit desc_b operand; 42 uint32_t accumulator registers)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x168x32_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D operands; d00..d41 are read-write ("+r") in the asm below
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[42];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %47, 0;\n"  // %47 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n168k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41},"
+      "{%42, %43, %44, %45},"  // %42..%45 = a00..a03
+      " %46,"  // %46 = desc_b
+      " p,   %48, %49;\n"  // %48 = scaleA, %49 = scaleB ("n" immediates taken from the template arguments)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x168x32 TN F32+=E4M3*E4M3 (SS form: A and B are the 64-bit desc_a/desc_b operands; 84 float accumulator registers)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x168x32_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D operands; d00..d83 are read-write ("+f") in the asm below
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[84];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %86, 0;\n"  // %86 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n168k32.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83},"
+      " %84,"  // %84 = desc_a
+      " %85,"  // %85 = desc_b
+      " p,    %87,  %88;\n"  // %87 = scaleA, %88 = scaleB ("n" immediates taken from the template arguments)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x168x32 TN F32+=E4M3*E4M3 (RS form: A is four 32-bit registers a00..a03, B is the 64-bit desc_b operand; 84 float accumulator registers)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x168x32_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D operands; d00..d83 are read-write ("+f") in the asm below
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[84];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %89, 0;\n"  // %89 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n168k32.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83},"
+      "{%84,  %85,  %86,  %87},"  // %84..%87 = a00..a03
+      " %88,"  // %88 = desc_b
+      " p,    %90,  %91;\n"  // %90 = scaleA, %91 = scaleB ("n" immediates taken from the template arguments)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x32 TN F16+=E4M3*E4M3 (SS form: A and B are both passed as 64-bit descriptors, desc_a / desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the asm as a compile-time immediate ("n" constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // handed to the asm as a compile-time immediate ("n" constraint)
+>
+struct MMA_64x176x32_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably means D aliases C (in-place accumulate) -- confirm against the MMA traits
+  using ARegisters = uint64_t[1];   // A operand: one 64-bit value (desc_a)
+  using BRegisters = uint64_t[1];   // B operand: one 64-bit value (desc_b)
+  using CRegisters = uint32_t[44];  // accumulator: 44 32-bit registers (d00..d43)
+
+  CUTE_HOST_DEVICE static void      // emits a single wgmma.mma_async; d00..d43 are read and written in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to predicate p below: p = (scale_D != 0)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"                // declare predicate register p
+      "setp.ne.b32 p, %46, 0;\n"       // %46 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n176k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%43 = d00..d43 (accumulator registers)
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"
+      " %44,"                          // %44 = desc_a
+      " %45,"                          // %45 = desc_b
+      " p,   %47, %48;\n"              // p = scale-d predicate; %47 = scaleA, %48 = scaleB (immediates)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%43, "+r": 32-bit registers, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
+      :  "l"(desc_a),                  // input %44 (64-bit)
+         "l"(desc_b),                  // input %45 (64-bit)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %46, %47, %48
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x32 TN F16+=E4M3*E4M3 (RS form: A is passed in four 32-bit registers a00..a03, B as a 64-bit descriptor desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the asm as a compile-time immediate ("n" constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // handed to the asm as a compile-time immediate ("n" constraint)
+>
+struct MMA_64x176x32_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably means D aliases C (in-place accumulate) -- confirm against the MMA traits
+  using ARegisters = uint32_t[4];   // A operand: four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];   // B operand: one 64-bit value (desc_b)
+  using CRegisters = uint32_t[44];  // accumulator: 44 32-bit registers (d00..d43)
+
+  CUTE_HOST_DEVICE static void      // emits a single wgmma.mma_async; d00..d43 are read and written in place
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to predicate p below: p = (scale_D != 0)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"                // declare predicate register p
+      "setp.ne.b32 p, %49, 0;\n"       // %49 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n176k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%43 = d00..d43 (accumulator registers)
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"
+      "{%44, %45, %46, %47},"          // %44..%47 = a00..a03 (A operand registers)
+      " %48,"                          // %48 = desc_b
+      " p,   %50, %51;\n"              // p = scale-d predicate; %50 = scaleA, %51 = scaleB (immediates)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%43, "+r": 32-bit registers, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs %44..%47 (32-bit)
+         "l"(desc_b),                  // input %48 (64-bit)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %49, %50, %51
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x32 TN F32+=E4M3*E4M3 (SS form: A and B are both passed as 64-bit descriptors, desc_a / desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the asm as a compile-time immediate ("n" constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // handed to the asm as a compile-time immediate ("n" constraint)
+>
+struct MMA_64x176x32_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably means D aliases C (in-place accumulate) -- confirm against the MMA traits
+  using ARegisters = uint64_t[1];   // A operand: one 64-bit value (desc_a)
+  using BRegisters = uint64_t[1];   // B operand: one 64-bit value (desc_b)
+  using CRegisters = float[88];     // accumulator: 88 float registers (d00..d87)
+
+  CUTE_HOST_DEVICE static void      // emits a single wgmma.mma_async; d00..d87 are read and written in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to predicate p below: p = (scale_D != 0)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"                // declare predicate register p
+      "setp.ne.b32 p, %90, 0;\n"       // %90 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n176k32.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%87 = d00..d87 (accumulator registers)
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      " %88,"                          // %88 = desc_a
+      " %89,"                          // %89 = desc_b
+      " p,    %91,  %92;\n"            // p = scale-d predicate; %91 = scaleA, %92 = scaleB (immediates)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0..%87, "+f": float registers, read-write
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
+      :  "l"(desc_a),                  // input %88 (64-bit)
+         "l"(desc_b),                  // input %89 (64-bit)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %90, %91, %92
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x32 TN F32+=E4M3*E4M3 (RS form: A is passed in four 32-bit registers a00..a03, B as a 64-bit descriptor desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the asm as a compile-time immediate ("n" constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // handed to the asm as a compile-time immediate ("n" constraint)
+>
+struct MMA_64x176x32_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably means D aliases C (in-place accumulate) -- confirm against the MMA traits
+  using ARegisters = uint32_t[4];   // A operand: four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];   // B operand: one 64-bit value (desc_b)
+  using CRegisters = float[88];     // accumulator: 88 float registers (d00..d87)
+
+  CUTE_HOST_DEVICE static void      // emits a single wgmma.mma_async; d00..d87 are read and written in place
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to predicate p below: p = (scale_D != 0)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"                // declare predicate register p
+      "setp.ne.b32 p, %93, 0;\n"       // %93 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n176k32.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%87 = d00..d87 (accumulator registers)
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      "{%88,  %89,  %90,  %91},"       // %88..%91 = a00..a03 (A operand registers)
+      " %92,"                          // %92 = desc_b
+      " p,    %94,  %95;\n"            // p = scale-d predicate; %94 = scaleA, %95 = scaleB (immediates)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0..%87, "+f": float registers, read-write
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs %88..%91 (32-bit)
+         "l"(desc_b),                  // input %92 (64-bit)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %93, %94, %95
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x184x32 TN F16+=E4M3*E4M3 (SS form: A and B are both passed as 64-bit descriptors, desc_a / desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the asm as a compile-time immediate ("n" constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // handed to the asm as a compile-time immediate ("n" constraint)
+>
+struct MMA_64x184x32_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably means D aliases C (in-place accumulate) -- confirm against the MMA traits
+  using ARegisters = uint64_t[1];   // A operand: one 64-bit value (desc_a)
+  using BRegisters = uint64_t[1];   // B operand: one 64-bit value (desc_b)
+  using CRegisters = uint32_t[46];  // accumulator: 46 32-bit registers (d00..d45)
+
+  CUTE_HOST_DEVICE static void      // emits a single wgmma.mma_async; d00..d45 are read and written in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to predicate p below: p = (scale_D != 0)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"                // declare predicate register p
+      "setp.ne.b32 p, %48, 0;\n"       // %48 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n184k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%45 = d00..d45 (accumulator registers)
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45},"
+      " %46,"                          // %46 = desc_a
+      " %47,"                          // %47 = desc_b
+      " p,   %49, %50;\n"              // p = scale-d predicate; %49 = scaleA, %50 = scaleB (immediates)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%45, "+r": 32-bit registers, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45)
+      :  "l"(desc_a),                  // input %46 (64-bit)
+         "l"(desc_b),                  // input %47 (64-bit)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %48, %49, %50
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x184x32 TN F16+=E4M3*E4M3 (RS form: A is passed in four 32-bit registers a00..a03, B as a 64-bit descriptor desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the asm as a compile-time immediate ("n" constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // handed to the asm as a compile-time immediate ("n" constraint)
+>
+struct MMA_64x184x32_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably means D aliases C (in-place accumulate) -- confirm against the MMA traits
+  using ARegisters = uint32_t[4];   // A operand: four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];   // B operand: one 64-bit value (desc_b)
+  using CRegisters = uint32_t[46];  // accumulator: 46 32-bit registers (d00..d45)
+
+  CUTE_HOST_DEVICE static void      // emits a single wgmma.mma_async; d00..d45 are read and written in place
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to predicate p below: p = (scale_D != 0)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"                // declare predicate register p
+      "setp.ne.b32 p, %51, 0;\n"       // %51 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n184k32.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%45 = d00..d45 (accumulator registers)
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45},"
+      "{%46, %47, %48, %49},"          // %46..%49 = a00..a03 (A operand registers)
+      " %50,"                          // %50 = desc_b
+      " p,   %52, %53;\n"              // p = scale-d predicate; %52 = scaleA, %53 = scaleB (immediates)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%45, "+r": 32-bit registers, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs %46..%49 (32-bit)
+         "l"(desc_b),                  // input %50 (64-bit)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %51, %52, %53
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x184x32 TN F32+=E4M3*E4M3 (SS form: A and B are both passed as 64-bit descriptors, desc_a / desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the asm as a compile-time immediate ("n" constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // handed to the asm as a compile-time immediate ("n" constraint)
+>
+struct MMA_64x184x32_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably means D aliases C (in-place accumulate) -- confirm against the MMA traits
+  using ARegisters = uint64_t[1];   // A operand: one 64-bit value (desc_a)
+  using BRegisters = uint64_t[1];   // B operand: one 64-bit value (desc_b)
+  using CRegisters = float[92];     // accumulator: 92 float registers (d00..d91)
+
+  CUTE_HOST_DEVICE static void      // emits a single wgmma.mma_async; d00..d91 are read and written in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to predicate p below: p = (scale_D != 0)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"                // declare predicate register p
+      "setp.ne.b32 p, %94, 0;\n"       // %94 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n184k32.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%91 = d00..d91 (accumulator registers)
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91},"
+      " %92,"                          // %92 = desc_a
+      " %93,"                          // %93 = desc_b
+      " p,    %95,  %96;\n"            // p = scale-d predicate; %95 = scaleA, %96 = scaleB (immediates)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0..%91, "+f": float registers, read-write
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
+      :  "l"(desc_a),                  // input %92 (64-bit)
+         "l"(desc_b),                  // input %93 (64-bit)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %94, %95, %96
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x184x32 TN F32+=E4M3*E4M3 (RS form: A is passed in four 32-bit registers a00..a03, B as a 64-bit descriptor desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the asm as a compile-time immediate ("n" constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // handed to the asm as a compile-time immediate ("n" constraint)
+>
+struct MMA_64x184x32_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably means D aliases C (in-place accumulate) -- confirm against the MMA traits
+  using ARegisters = uint32_t[4];   // A operand: four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];   // B operand: one 64-bit value (desc_b)
+  using CRegisters = float[92];     // accumulator: 92 float registers (d00..d91)
+
+  CUTE_HOST_DEVICE static void      // emits a single wgmma.mma_async; d00..d91 are read and written in place
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to predicate p below: p = (scale_D != 0)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"                // declare predicate register p
+      "setp.ne.b32 p, %97, 0;\n"       // %97 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n184k32.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%91 = d00..d91 (accumulator registers)
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91},"
+      "{%92,  %93,  %94,  %95},"       // %92..%95 = a00..a03 (A operand registers)
+      " %96,"                          // %96 = desc_b
+      " p,    %98,  %99;\n"            // p = scale-d predicate; %98 = scaleA, %99 = scaleB (immediates)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0..%91, "+f": float registers, read-write
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs %92..%95 (32-bit)
+         "l"(desc_b),                  // input %96 (64-bit)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %97, %98, %99
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x200x32 TN F16+=E4M3*E4M3 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // handed to the asm as an immediate ("n") operand
+>
+struct MMA_64x200x32_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[50];  // d00..d49, read and written by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %52, 0;\n"  // %52 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n200k32.f16.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49},"
+      " %50,"  // desc_a
+      " %51,"  // desc_b
+      " p,    %53,  %54;\n"  // predicate p, then immediates scaleA (%53) and scaleB (%54)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%49: "+r" marks each accumulator read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // fma is an invalid control path when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x200x32 TN F16+=E4M3*E4M3 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // handed to the asm as an immediate ("n") operand
+>
+struct MMA_64x200x32_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[50];  // d00..d49, read and written by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %55, 0;\n"  // %55 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n200k32.f16.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49},"
+      "{%50,  %51,  %52,  %53},"  // a00..a03
+      " %54,"  // desc_b
+      " p,    %56,  %57;\n"  // predicate p, then immediates scaleA (%56) and scaleB (%57)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%49: "+r" marks each accumulator read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // fma is an invalid control path when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x200x32 TN F32+=E4M3*E4M3 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // handed to the asm as an immediate ("n") operand
+>
+struct MMA_64x200x32_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[100];  // d000..d099, read and written by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %102, 0;\n"  // %102 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n200k32.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99},"
+      " %100,"  // desc_a
+      " %101,"  // desc_b
+      " p,    %103, %104;\n"  // predicate p, then immediates scaleA (%103) and scaleB (%104)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // %0..%99: "+f" marks each accumulator read-write
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // fma is an invalid control path when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x200x32 TN F32+=E4M3*E4M3 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // handed to the asm as an immediate ("n") operand
+>
+struct MMA_64x200x32_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a000..a003
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[100];  // d000..d099, read and written by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %105, 0;\n"  // %105 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n200k32.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99},"
+      "{%100, %101, %102, %103},"  // a000..a003
+      " %104,"  // desc_b
+      " p,    %106, %107;\n"  // predicate p, then immediates scaleA (%106) and scaleB (%107)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // %0..%99: "+f" marks each accumulator read-write
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // fma is an invalid control path when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x32 TN F16+=E4M3*E4M3 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // handed to the asm as an immediate ("n") operand
+>
+struct MMA_64x208x32_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[52];  // d00..d51, read and written by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %54, 0;\n"  // %54 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n208k32.f16.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"
+      " %52,"  // desc_a
+      " %53,"  // desc_b
+      " p,    %55,  %56;\n"  // predicate p, then immediates scaleA (%55) and scaleB (%56)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%51: "+r" marks each accumulator read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // fma is an invalid control path when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x32 TN F16+=E4M3*E4M3 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // handed to the asm as an immediate ("n") operand
+>
+struct MMA_64x208x32_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[52];  // d00..d51, read and written by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %57, 0;\n"  // %57 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n208k32.f16.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"
+      "{%52,  %53,  %54,  %55},"  // a00..a03
+      " %56,"  // desc_b
+      " p,    %58,  %59;\n"  // predicate p, then immediates scaleA (%58) and scaleB (%59)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%51: "+r" marks each accumulator read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // fma is an invalid control path when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x32 TN F32+=E4M3*E4M3 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // handed to the asm as an immediate ("n") operand
+>
+struct MMA_64x208x32_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[104];  // d000..d103, read and written by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %106, 0;\n"  // %106 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n208k32.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      " %104,"  // desc_a
+      " %105,"  // desc_b
+      " p,    %107, %108;\n"  // predicate p, then immediates scaleA (%107) and scaleB (%108)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // %0..%103: "+f" marks each accumulator read-write
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // fma is an invalid control path when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x32 TN F32+=E4M3*E4M3 -- RS variant: A is passed in four 32-bit registers, B through the 64-bit descriptor desc_b.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the instruction as an immediate ("n") operand
+>
+struct MMA_64x208x32_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // fma() has no separate output operands; the accumulators below are read-write
+  using ARegisters = uint32_t[4];  // A operand: a000..a003
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[104];  // accumulators: d000..d103
+
+  CUTE_HOST_DEVICE static void  // wraps a single wgmma.mma_async; every dNNN is bound "+f" (input and output)
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,  // presumably a shared-memory matrix descriptor (see the reg_smem synclog call) -- TODO confirm
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the wgmma path is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the call-site line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %109, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n208k32.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%103: accumulators d000..d103
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      "{%104, %105, %106, %107},"  // A registers a000..a003
+      " %108,"  // desc_b
+      " p,    %110, %111;\n"  // predicate from scale_D, then scaleA (%110) and scaleB (%111); NOTE(review): p == false presumably drops the incoming accumulator values -- confirm against PTX
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register at run time; scaleA/scaleB as compile-time immediates
+#else  // macro not defined: hand the message to CUTE_INVALID_CONTROL_PATH instead of emitting the wgmma
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x216x32 TN F16+=E4M3*E4M3 -- SS variant: A and B are both passed as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the instruction as an immediate ("n") operand
+>
+struct MMA_64x216x32_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // fma() has no separate output operands; the accumulators below are read-write
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = uint32_t[54];  // accumulators d00..d53 as 32-bit registers; presumably two packed f16 values each -- TODO confirm
+
+  CUTE_HOST_DEVICE static void  // wraps a single wgmma.mma_async; every dNN is bound "+r" (input and output)
+  fma(uint64_t const& desc_a,  // presumably a shared-memory matrix descriptor (see the smem_smem synclog call) -- TODO confirm
+      uint64_t const& desc_b,  // same kind of descriptor as desc_a, for the B operand
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the wgmma path is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the call-site line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %56, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n216k32.f16.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%53: accumulators d00..d53
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53},"
+      " %54,"  // desc_a
+      " %55,"  // desc_b
+      " p,    %57,  %58;\n"  // predicate from scale_D, then scaleA (%57) and scaleB (%58); NOTE(review): p == false presumably drops the incoming accumulator values -- confirm against PTX
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register at run time; scaleA/scaleB as compile-time immediates
+#else  // macro not defined: hand the message to CUTE_INVALID_CONTROL_PATH instead of emitting the wgmma
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x216x32 TN F16+=E4M3*E4M3 -- RS variant: A is passed in four 32-bit registers, B through the 64-bit descriptor desc_b.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the instruction as an immediate ("n") operand
+>
+struct MMA_64x216x32_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // fma() has no separate output operands; the accumulators below are read-write
+  using ARegisters = uint32_t[4];  // A operand: a00..a03
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = uint32_t[54];  // accumulators d00..d53 as 32-bit registers; presumably two packed f16 values each -- TODO confirm
+
+  CUTE_HOST_DEVICE static void  // wraps a single wgmma.mma_async; every dNN is bound "+r" (input and output)
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,  // presumably a shared-memory matrix descriptor (see the reg_smem synclog call) -- TODO confirm
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the wgmma path is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the call-site line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %59, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n216k32.f16.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%53: accumulators d00..d53
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53},"
+      "{%54,  %55,  %56,  %57},"  // A registers a00..a03
+      " %58,"  // desc_b
+      " p,    %60,  %61;\n"  // predicate from scale_D, then scaleA (%60) and scaleB (%61); NOTE(review): p == false presumably drops the incoming accumulator values -- confirm against PTX
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register at run time; scaleA/scaleB as compile-time immediates
+#else  // macro not defined: hand the message to CUTE_INVALID_CONTROL_PATH instead of emitting the wgmma
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x216x32 TN F32+=E4M3*E4M3 -- SS variant: A and B are both passed as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the instruction as an immediate ("n") operand
+>
+struct MMA_64x216x32_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // fma() has no separate output operands; the accumulators below are read-write
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[108];  // accumulators: d000..d107
+
+  CUTE_HOST_DEVICE static void  // wraps a single wgmma.mma_async; every dNNN is bound "+f" (input and output)
+  fma(uint64_t const& desc_a,  // presumably a shared-memory matrix descriptor (see the smem_smem synclog call) -- TODO confirm
+      uint64_t const& desc_b,  // same kind of descriptor as desc_a, for the B operand
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the wgmma path is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the call-site line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %110, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n216k32.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%107: accumulators d000..d107
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107},"
+      " %108,"  // desc_a
+      " %109,"  // desc_b
+      " p,    %111, %112;\n"  // predicate from scale_D, then scaleA (%111) and scaleB (%112); NOTE(review): p == false presumably drops the incoming accumulator values -- confirm against PTX
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register at run time; scaleA/scaleB as compile-time immediates
+#else  // macro not defined: hand the message to CUTE_INVALID_CONTROL_PATH instead of emitting the wgmma
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x216x32 TN F32+=E4M3*E4M3 -- RS variant: A is passed in four 32-bit registers, B through the 64-bit descriptor desc_b.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the instruction as an immediate ("n") operand
+>
+struct MMA_64x216x32_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // fma() has no separate output operands; the accumulators below are read-write
+  using ARegisters = uint32_t[4];  // A operand: a000..a003
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[108];  // accumulators: d000..d107
+
+  CUTE_HOST_DEVICE static void  // wraps a single wgmma.mma_async; every dNNN is bound "+f" (input and output)
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,  // presumably a shared-memory matrix descriptor (see the reg_smem synclog call) -- TODO confirm
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the wgmma path is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the call-site line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %113, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n216k32.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%107: accumulators d000..d107
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107},"
+      "{%108, %109, %110, %111},"  // A registers a000..a003
+      " %112,"  // desc_b
+      " p,    %114, %115;\n"  // predicate from scale_D, then scaleA (%114) and scaleB (%115); NOTE(review): p == false presumably drops the incoming accumulator values -- confirm against PTX
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register at run time; scaleA/scaleB as compile-time immediates
+#else  // macro not defined: hand the message to CUTE_INVALID_CONTROL_PATH instead of emitting the wgmma
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x32 TN F16+=E4M3*E4M3 -- SS variant: A and B are both passed as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the instruction as an immediate ("n") operand
+>
+struct MMA_64x224x32_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // fma() has no separate output operands; the accumulators below are read-write
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = uint32_t[56];  // accumulators d00..d55 as 32-bit registers; presumably two packed f16 values each -- TODO confirm
+
+  CUTE_HOST_DEVICE static void  // wraps a single wgmma.mma_async; every dNN is bound "+r" (input and output)
+  fma(uint64_t const& desc_a,  // presumably a shared-memory matrix descriptor (see the smem_smem synclog call) -- TODO confirm
+      uint64_t const& desc_b,  // same kind of descriptor as desc_a, for the B operand
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the wgmma path is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the call-site line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %58, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n224k32.f16.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%55: accumulators d00..d55
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      " %56,"  // desc_a
+      " %57,"  // desc_b
+      " p,    %59,  %60;\n"  // predicate from scale_D, then scaleA (%59) and scaleB (%60); NOTE(review): p == false presumably drops the incoming accumulator values -- confirm against PTX
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register at run time; scaleA/scaleB as compile-time immediates
+#else  // macro not defined: hand the message to CUTE_INVALID_CONTROL_PATH instead of emitting the wgmma
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x32 TN F16+=E4M3*E4M3 -- RS variant: A is passed in four 32-bit registers, B through the 64-bit descriptor desc_b.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the instruction as an immediate ("n") operand
+>
+struct MMA_64x224x32_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // fma() has no separate output operands; the accumulators below are read-write
+  using ARegisters = uint32_t[4];  // A operand: a00..a03
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = uint32_t[56];  // accumulators d00..d55 as 32-bit registers; presumably two packed f16 values each -- TODO confirm
+
+  CUTE_HOST_DEVICE static void  // wraps a single wgmma.mma_async; every dNN is bound "+r" (input and output)
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,  // presumably a shared-memory matrix descriptor (see the reg_smem synclog call) -- TODO confirm
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the wgmma path is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the call-site line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %61, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sync.aligned.m64n224k32.f16.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%55: accumulators d00..d55
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      "{%56,  %57,  %58,  %59},"  // A registers a00..a03
+      " %60,"  // desc_b
+      " p,    %62,  %63;\n"  // predicate from scale_D, then scaleA (%62) and scaleB (%63); NOTE(review): p == false presumably drops the incoming accumulator values -- confirm against PTX
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register at run time; scaleA/scaleB as compile-time immediates
+#else  // macro not defined: hand the message to CUTE_INVALID_CONTROL_PATH instead of emitting the wgmma
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x32 TN F32+=E4M3*E4M3 -- wraps one wgmma.mma_async m64n224k32 PTX instruction; A and B are each passed as a 64-bit descriptor and the 112 float accumulators are read-modify-write operands.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x224x32_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for A (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = float[112];  // accumulators d000..d111, bound to asm operands %0..%111
+
+  CUTE_HOST_DEVICE static void  // fma(): emits the instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register p, declared inside this braced PTX scope
+      "setp.ne.b32 p, %114, 0;\n"  // p = (scale_D != 0); %114 is the input that follows the two descriptors
+      "wgmma.mma_async.sync.aligned.m64n224k32.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      " %112,"  // desc_a
+      " %113,"  // desc_b
+      " p,    %115, %116;\n"  // p, then the immediates scaleA (%115) and scaleB (%116)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // outputs: "+f" = float register, both read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
+      :  "l"(desc_a),  // inputs: "l" = 64-bit register operand (%112)
+         "l"(desc_b),  // %113
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %114 in a 32-bit register; %115/%116 are "n" integer immediates from the template parameters
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted, CUTE_INVALID_CONTROL_PATH is invoked instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x32 TN F32+=E4M3*E4M3 -- wraps one wgmma.mma_async m64n224k32 PTX instruction; A is passed in four 32-bit registers, B as a 64-bit descriptor, and the 112 float accumulators are read-modify-write operands.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x224x32_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // A fragment: four 32-bit registers (a000..a003)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = float[112];  // accumulators d000..d111, bound to asm operands %0..%111
+
+  CUTE_HOST_DEVICE static void  // fma(): emits the instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register p, declared inside this braced PTX scope
+      "setp.ne.b32 p, %117, 0;\n"  // p = (scale_D != 0); %117 is the input that follows desc_b
+      "wgmma.mma_async.sync.aligned.m64n224k32.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      "{%112, %113, %114, %115},"  // a000..a003
+      " %116,"  // desc_b
+      " p,    %118, %119;\n"  // p, then the immediates scaleA (%118) and scaleB (%119)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // outputs: "+f" = float register, both read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // inputs: "r" = 32-bit register operands (%112..%115)
+         "l"(desc_b),  // "l" = 64-bit register operand (%116)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %117 in a 32-bit register; %118/%119 are "n" integer immediates from the template parameters
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted, CUTE_INVALID_CONTROL_PATH is invoked instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x232x32 TN F16+=E4M3*E4M3 -- wraps one wgmma.mma_async m64n232k32 PTX instruction; A and B are each passed as a 64-bit descriptor and the 58 32-bit accumulator registers are read-modify-write operands.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x232x32_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for A (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = uint32_t[58];  // accumulators d00..d57, asm operands %0..%57 (presumably packed f16 pairs -- TODO confirm)
+
+  CUTE_HOST_DEVICE static void  // fma(): emits the instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register p, declared inside this braced PTX scope
+      "setp.ne.b32 p, %60, 0;\n"  // p = (scale_D != 0); %60 is the input that follows the two descriptors
+      "wgmma.mma_async.sync.aligned.m64n232k32.f16.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57},"
+      " %58,"  // desc_a
+      " %59,"  // desc_b
+      " p,    %61,  %62;\n"  // p, then the immediates scaleA (%61) and scaleB (%62)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs: "+r" = 32-bit register, both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57)
+      :  "l"(desc_a),  // inputs: "l" = 64-bit register operand (%58)
+         "l"(desc_b),  // %59
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %60 in a 32-bit register; %61/%62 are "n" integer immediates from the template parameters
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted, CUTE_INVALID_CONTROL_PATH is invoked instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x232x32 TN F16+=E4M3*E4M3 -- wraps one wgmma.mma_async m64n232k32 PTX instruction; A is passed in four 32-bit registers, B as a 64-bit descriptor, and the 58 32-bit accumulator registers are read-modify-write operands.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x232x32_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // A fragment: four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = uint32_t[58];  // accumulators d00..d57, asm operands %0..%57 (presumably packed f16 pairs -- TODO confirm)
+
+  CUTE_HOST_DEVICE static void  // fma(): emits the instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register p, declared inside this braced PTX scope
+      "setp.ne.b32 p, %63, 0;\n"  // p = (scale_D != 0); %63 is the input that follows desc_b
+      "wgmma.mma_async.sync.aligned.m64n232k32.f16.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57},"
+      "{%58,  %59,  %60,  %61},"  // a00..a03
+      " %62,"  // desc_b
+      " p,    %64,  %65;\n"  // p, then the immediates scaleA (%64) and scaleB (%65)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs: "+r" = 32-bit register, both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs: "r" = 32-bit register operands (%58..%61)
+         "l"(desc_b),  // "l" = 64-bit register operand (%62)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %63 in a 32-bit register; %64/%65 are "n" integer immediates from the template parameters
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted, CUTE_INVALID_CONTROL_PATH is invoked instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x232x32 TN F32+=E4M3*E4M3 -- wraps one wgmma.mma_async m64n232k32 PTX instruction; A and B are each passed as a 64-bit descriptor and the 116 float accumulators are read-modify-write operands.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x232x32_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for A (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = float[116];  // accumulators d000..d115, bound to asm operands %0..%115
+
+  CUTE_HOST_DEVICE static void  // fma(): emits the instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register p, declared inside this braced PTX scope
+      "setp.ne.b32 p, %118, 0;\n"  // p = (scale_D != 0); %118 is the input that follows the two descriptors
+      "wgmma.mma_async.sync.aligned.m64n232k32.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115},"
+      " %116,"  // desc_a
+      " %117,"  // desc_b
+      " p,    %119, %120;\n"  // p, then the immediates scaleA (%119) and scaleB (%120)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // outputs: "+f" = float register, both read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
+      :  "l"(desc_a),  // inputs: "l" = 64-bit register operand (%116)
+         "l"(desc_b),  // %117
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %118 in a 32-bit register; %119/%120 are "n" integer immediates from the template parameters
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted, CUTE_INVALID_CONTROL_PATH is invoked instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x232x32 TN F32+=E4M3*E4M3 -- wraps one wgmma.mma_async m64n232k32 PTX instruction; A is passed in four 32-bit registers, B as a 64-bit descriptor, and the 116 float accumulators are read-modify-write operands.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x232x32_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // A fragment: four 32-bit registers (a000..a003)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = float[116];  // accumulators d000..d115, bound to asm operands %0..%115
+
+  CUTE_HOST_DEVICE static void  // fma(): emits the instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register p, declared inside this braced PTX scope
+      "setp.ne.b32 p, %121, 0;\n"  // p = (scale_D != 0); %121 is the input that follows desc_b
+      "wgmma.mma_async.sync.aligned.m64n232k32.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115},"
+      "{%116, %117, %118, %119},"  // a000..a003
+      " %120,"  // desc_b
+      " p,    %122, %123;\n"  // p, then the immediates scaleA (%122) and scaleB (%123)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // outputs: "+f" = float register, both read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // inputs: "r" = 32-bit register operands (%116..%119)
+         "l"(desc_b),  // "l" = 64-bit register operand (%120)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %121 in a 32-bit register; %122/%123 are "n" integer immediates from the template parameters
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted, CUTE_INVALID_CONTROL_PATH is invoked instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x32 TN F16+=E4M3*E4M3 (SS form: A and B are each passed as one 64-bit descriptor; wraps a single wgmma.mma_async m64n240k32 instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as an immediate operand
+>
+struct MMA_64x240x32_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;          // no separate D operand: the d* arguments of fma() are read-write accumulators
+  using ARegisters = uint64_t[1];   // A: one 64-bit descriptor (desc_a)
+  using BRegisters = uint64_t[1];   // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[60];  // accumulator: 60 32-bit registers (d00..d59); presumably two packed f16 values each - TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // descriptor for operand A
+      uint64_t const& desc_b,  // descriptor for operand B
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the call-site line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate local to this asm scope
+      "setp.ne.b32 p, %62, 0;\n"  // p = (scale_D != 0); %62 is the first input after 60 accumulators + 2 descriptors
+      "wgmma.mma_async.sync.aligned.m64n240k32.f16.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"
+      " %60,"  // desc_a
+      " %61,"  // desc_b
+      " p,    %63,  %64;\n"  // predicate from scale_D, then the scaleA (%63) and scaleB (%64) immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%59: read-write accumulator registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
+      :  "l"(desc_a),  // %60
+         "l"(desc_b),  // %61
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %62 in a register; %63 and %64 are compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x32 TN F16+=E4M3*E4M3 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor; wraps a single wgmma.mma_async m64n240k32 instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as an immediate operand
+>
+struct MMA_64x240x32_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;          // no separate D operand: the d* arguments of fma() are read-write accumulators
+  using ARegisters = uint32_t[4];   // A: four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];   // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[60];  // accumulator: 60 32-bit registers (d00..d59); presumably two packed f16 values each - TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A fragment held in registers
+      uint64_t const& desc_b,  // descriptor for operand B
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the call-site line and the B descriptor to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate local to this asm scope
+      "setp.ne.b32 p, %65, 0;\n"  // p = (scale_D != 0); %65 follows 60 accumulators + 4 A registers + desc_b
+      "wgmma.mma_async.sync.aligned.m64n240k32.f16.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"
+      "{%60,  %61,  %62,  %63},"  // a00..a03
+      " %64,"  // desc_b
+      " p,    %66,  %67;\n"  // predicate from scale_D, then the scaleA (%66) and scaleB (%67) immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%59: read-write accumulator registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %60..%63
+         "l"(desc_b),  // %64
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %65 in a register; %66 and %67 are compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x32 TN F32+=E4M3*E4M3 (SS form: A and B are each passed as one 64-bit descriptor; wraps a single wgmma.mma_async m64n240k32 instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as an immediate operand
+>
+struct MMA_64x240x32_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;         // no separate D operand: the d* arguments of fma() are read-write accumulators
+  using ARegisters = uint64_t[1];  // A: one 64-bit descriptor (desc_a)
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = float[120];   // accumulator: 120 float registers (d000..d119)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // descriptor for operand A
+      uint64_t const& desc_b,  // descriptor for operand B
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the call-site line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate local to this asm scope
+      "setp.ne.b32 p, %122, 0;\n"  // p = (scale_D != 0); %122 is the first input after 120 accumulators + 2 descriptors
+      "wgmma.mma_async.sync.aligned.m64n240k32.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      " %120,"  // desc_a
+      " %121,"  // desc_b
+      " p,    %123, %124;\n"  // predicate from scale_D, then the scaleA (%123) and scaleB (%124) immediates
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // %0..%119: read-write accumulator registers
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
+      :  "l"(desc_a),  // %120
+         "l"(desc_b),  // %121
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %122 in a register; %123 and %124 are compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x32 TN F32+=E4M3*E4M3 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor; wraps a single wgmma.mma_async m64n240k32 instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as an immediate operand
+>
+struct MMA_64x240x32_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;         // no separate D operand: the d* arguments of fma() are read-write accumulators
+  using ARegisters = uint32_t[4];  // A: four 32-bit registers (a000..a003)
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = float[120];   // accumulator: 120 float registers (d000..d119)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A fragment held in registers
+      uint64_t const& desc_b,  // descriptor for operand B
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the call-site line and the B descriptor to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate local to this asm scope
+      "setp.ne.b32 p, %125, 0;\n"  // p = (scale_D != 0); %125 follows 120 accumulators + 4 A registers + desc_b
+      "wgmma.mma_async.sync.aligned.m64n240k32.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      "{%120, %121, %122, %123},"  // a000..a003
+      " %124,"  // desc_b
+      " p,    %126, %127;\n"  // predicate from scale_D, then the scaleA (%126) and scaleB (%127) immediates
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // %0..%119: read-write accumulator registers
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %120..%123
+         "l"(desc_b),  // %124
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %125 in a register; %126 and %127 are compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x248x32 TN F16+=E4M3*E4M3 (SS form: A and B are each passed as one 64-bit descriptor; wraps a single wgmma.mma_async m64n248k32 instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as an immediate operand
+>
+struct MMA_64x248x32_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;          // no separate D operand: the d* arguments of fma() are read-write accumulators
+  using ARegisters = uint64_t[1];   // A: one 64-bit descriptor (desc_a)
+  using BRegisters = uint64_t[1];   // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[62];  // accumulator: 62 32-bit registers (d00..d61); presumably two packed f16 values each - TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // descriptor for operand A
+      uint64_t const& desc_b,  // descriptor for operand B
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the call-site line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate local to this asm scope
+      "setp.ne.b32 p, %64, 0;\n"  // p = (scale_D != 0); %64 is the first input after 62 accumulators + 2 descriptors
+      "wgmma.mma_async.sync.aligned.m64n248k32.f16.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61},"
+      " %62,"  // desc_a
+      " %63,"  // desc_b
+      " p,    %65,  %66;\n"  // predicate from scale_D, then the scaleA (%65) and scaleB (%66) immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%61: read-write accumulator registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61)
+      :  "l"(desc_a),  // %62
+         "l"(desc_b),  // %63
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %64 in a register; %65 and %66 are compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x248x32 TN F16+=E4M3*E4M3 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor; wraps a single wgmma.mma_async m64n248k32 instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as an immediate operand
+>
+struct MMA_64x248x32_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;          // no separate D operand: the d* arguments of fma() are read-write accumulators
+  using ARegisters = uint32_t[4];   // A: four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];   // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[62];  // accumulator: 62 32-bit registers (d00..d61); presumably two packed f16 values each - TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A fragment held in registers
+      uint64_t const& desc_b,  // descriptor for operand B
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the call-site line and the B descriptor to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate local to this asm scope
+      "setp.ne.b32 p, %67, 0;\n"  // p = (scale_D != 0); %67 follows 62 accumulators + 4 A registers + desc_b
+      "wgmma.mma_async.sync.aligned.m64n248k32.f16.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61},"
+      "{%62,  %63,  %64,  %65},"  // a00..a03
+      " %66,"  // desc_b
+      " p,    %68,  %69;\n"  // predicate from scale_D, then the scaleA (%68) and scaleB (%69) immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%61: read-write accumulator registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %62..%65
+         "l"(desc_b),  // %66
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %67 in a register; %68 and %69 are compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x248x32 TN F32+=E4M3*E4M3 (SS: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // passed to the asm as an immediate ("n") operand
+>
+struct MMA_64x248x32_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;         // no separate D operand; fma() updates the accumulators in place
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[124];   // d000..d123, read-write ("+f") asm operands %0..%123
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to set predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %126, 0;\n"  // %126 is scale_D; p is true when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n248k32.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123},"
+      " %124,"  // desc_a
+      " %125,"  // desc_b
+      " p,    %127, %128;\n"  // %127 = scaleA, %128 = scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no wgmma is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x248x32 TN F32+=E4M3*E4M3 (RS: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // passed to the asm as an immediate ("n") operand
+>
+struct MMA_64x248x32_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;         // no separate D operand; fma() updates the accumulators in place
+  using ARegisters = uint32_t[4];  // a000..a003
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[124];   // d000..d123, read-write ("+f") asm operands %0..%123
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to set predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %129, 0;\n"  // %129 is scale_D; p is true when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n248k32.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123},"
+      "{%124, %125, %126, %127},"  // a000..a003
+      " %128,"  // desc_b
+      " p,    %130, %131;\n"  // %130 = scaleA, %131 = scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no wgmma is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x32 TN F16+=E4M3*E5M2 (SS: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // passed to the asm as an immediate ("n") operand
+>
+struct MMA_64x24x32_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;         // no separate D operand; fma() updates the accumulators in place
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[6];  // d0..d5, read-write ("+r"); NOTE(review): presumably two packed F16 per word - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to set predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %8, 0;\n"  // %8 is scale_D; p is true when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n24k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5},"
+      " %6,"  // desc_a
+      " %7,"  // desc_b
+      " p,   %9,  %10;\n"  // %9 = scaleA, %10 = scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no wgmma is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x32 TN F16+=E4M3*E5M2 (RS: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // passed to the asm as an immediate ("n") operand
+>
+struct MMA_64x24x32_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;         // no separate D operand; fma() updates the accumulators in place
+  using ARegisters = uint32_t[4];  // a0..a3
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[6];  // d0..d5, read-write ("+r"); NOTE(review): presumably two packed F16 per word - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to set predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %11, 0;\n"  // %11 is scale_D; p is true when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n24k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5},"
+      "{%6,  %7,  %8,  %9},"  // a0..a3
+      " %10,"  // desc_b
+      " p,   %12, %13;\n"  // %12 = scaleA, %13 = scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no wgmma is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x32 TN F32+=E4M3*E5M2 (SS: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // passed to the asm as an immediate ("n") operand
+>
+struct MMA_64x24x32_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;         // no separate D operand; fma() updates the accumulators in place
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[12];    // d00..d11, read-write ("+f") asm operands %0..%11
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to set predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %14, 0;\n"  // %14 is scale_D; p is true when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n24k32.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      " %12,"  // desc_a
+      " %13,"  // desc_b
+      " p,   %15, %16;\n"  // %15 = scaleA, %16 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no wgmma is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x32 TN F32+=E4M3*E5M2 (RS: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // passed to the asm as an immediate ("n") operand
+>
+struct MMA_64x24x32_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;         // no separate D operand; fma() updates the accumulators in place
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[12];    // d00..d11, read-write ("+f") asm operands %0..%11
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to set predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %17, 0;\n"  // %17 is scale_D; p is true when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n24k32.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      "{%12, %13, %14, %15},"  // a00..a03
+      " %16,"  // desc_b
+      " p,   %18, %19;\n"  // %18 = scaleA, %19 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no wgmma is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x40x32 TN F16+=E4M3*E5M2 (SS: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // passed to the asm as an immediate ("n") operand
+>
+struct MMA_64x40x32_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;          // no separate D operand; fma() updates the accumulators in place
+  using ARegisters = uint64_t[1];   // desc_a
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = uint32_t[10];  // d00..d09, read-write ("+r"); NOTE(review): presumably two packed F16 per word - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to set predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %12, 0;\n"  // %12 is scale_D; p is true when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n40k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9},"
+      " %10,"  // desc_a
+      " %11,"  // desc_b
+      " p,   %13, %14;\n"  // %13 = scaleA, %14 = scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no wgmma is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x40x32 TN F16+=E4M3*E5M2 (RS: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // passed to the asm as an immediate ("n") operand
+>
+struct MMA_64x40x32_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;          // no separate D operand; fma() updates the accumulators in place
+  using ARegisters = uint32_t[4];   // a00..a03
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = uint32_t[10];  // d00..d09, read-write ("+r"); NOTE(review): presumably two packed F16 per word - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to set predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %15, 0;\n"  // %15 is scale_D; p is true when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n40k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9},"
+      "{%10, %11, %12, %13},"  // a00..a03
+      " %14,"  // desc_b
+      " p,   %16, %17;\n"  // %16 = scaleA, %17 = scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no wgmma is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x40x32 TN F32+=E4M3*E5M2 (SS: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // passed to the asm as an immediate ("n") operand
+>
+struct MMA_64x40x32_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;         // no separate D operand; fma() updates the accumulators in place
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[20];    // d00..d19, read-write ("+f") asm operands %0..%19
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to set predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %22, 0;\n"  // %22 is scale_D; p is true when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n40k32.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"
+      " %20,"  // desc_a
+      " %21,"  // desc_b
+      " p,   %23, %24;\n"  // %23 = scaleA, %24 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no wgmma is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x40x32 TN F32+=E4M3*E5M2 (RS: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // passed to the asm as an immediate ("n") operand
+>
+struct MMA_64x40x32_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;         // no separate D operand; fma() updates the accumulators in place
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[20];    // d00..d19, read-write ("+f") asm operands %0..%19
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to set predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %25, 0;\n"  // %25 is scale_D; p is true when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n40k32.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"
+      "{%20, %21, %22, %23},"  // a00..a03
+      " %24,"  // desc_b
+      " p,   %26, %27;\n"  // %26 = scaleA, %27 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no wgmma is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x32 TN F16+=E4M3*E5M2 (SS: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // passed to the asm as an immediate ("n") operand
+>
+struct MMA_64x48x32_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;          // no separate D operand; fma() updates the accumulators in place
+  using ARegisters = uint64_t[1];   // desc_a
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = uint32_t[12];  // d00..d11, read-write ("+r"); NOTE(review): presumably two packed F16 per word - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to set predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %14, 0;\n"  // %14 is scale_D; p is true when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n48k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      " %12,"  // desc_a
+      " %13,"  // desc_b
+      " p,   %15, %16;\n"  // %15 = scaleA, %16 = scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no wgmma is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x32 TN F16+=E4M3*E5M2 (RS: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // passed to the asm as an immediate ("n") operand
+>
+struct MMA_64x48x32_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;          // no separate D operand; fma() updates the accumulators in place
+  using ARegisters = uint32_t[4];   // a00..a03
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = uint32_t[12];  // d00..d11, read-write ("+r"); NOTE(review): presumably two packed F16 per word - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to set predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %17, 0;\n"  // %17 is scale_D; p is true when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n48k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      "{%12, %13, %14, %15},"  // a00..a03
+      " %16,"  // desc_b
+      " p,   %18, %19;\n"  // %18 = scaleA, %19 = scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no wgmma is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x32 TN F32+=E4M3*E5M2 (SS: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // passed to the asm as an immediate ("n") operand
+>
+struct MMA_64x48x32_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;         // no separate D operand; fma() updates the accumulators in place
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[24];    // d00..d23, read-write ("+f") asm operands %0..%23
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to set predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %26, 0;\n"  // %26 is scale_D; p is true when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n48k32.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      " %24,"  // desc_a
+      " %25,"  // desc_b
+      " p,   %27, %28;\n"  // %27 = scaleA, %28 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no wgmma is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x32 TN F32+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x48x32_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // A supplied directly as four 32-bit values (a00..a03)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = float[24];  // same count/type as the accumulators d00..d23 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); given this line number and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %29, 0;\n"  // p = (scale_D != 0); %29 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n48k32.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      "{%24, %25, %26, %27},"  // a00..a03
+      " %28,"  // desc_b
+      " p,   %30, %31;\n"  // predicate p, then the scaleA (%30) and scaleB (%31) immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%23: accumulators, read-write
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %24..%27
+         "l"(desc_b),  // %28
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %29 in a register; %30 and %31 as compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x56x32 TN F16+=E4M3*E5M2 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x56x32_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for A (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = uint32_t[14];  // same count/type as d00..d13 of fma(); presumably two packed f16 values each -- TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); given this line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %16, 0;\n"  // p = (scale_D != 0); %16 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n56k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13},"
+      " %14,"  // desc_a
+      " %15,"  // desc_b
+      " p,   %17, %18;\n"  // predicate p, then the scaleA (%17) and scaleB (%18) immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%13: accumulators, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13)
+      :  "l"(desc_a),  // %14
+         "l"(desc_b),  // %15
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %16 in a register; %17 and %18 as compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x56x32 TN F16+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x56x32_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // A supplied directly as four 32-bit values (a00..a03)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = uint32_t[14];  // same count/type as d00..d13 of fma(); presumably two packed f16 values each -- TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); given this line number and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %19, 0;\n"  // p = (scale_D != 0); %19 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n56k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13},"
+      "{%14, %15, %16, %17},"  // a00..a03
+      " %18,"  // desc_b
+      " p,   %20, %21;\n"  // predicate p, then the scaleA (%20) and scaleB (%21) immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%13: accumulators, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %14..%17
+         "l"(desc_b),  // %18
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %19 in a register; %20 and %21 as compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x56x32 TN F32+=E4M3*E5M2 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x56x32_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for A (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = float[28];  // same count/type as the accumulators d00..d27 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); given this line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %30, 0;\n"  // p = (scale_D != 0); %30 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n56k32.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"
+      " %28,"  // desc_a
+      " %29,"  // desc_b
+      " p,   %31, %32;\n"  // predicate p, then the scaleA (%31) and scaleB (%32) immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%27: accumulators, read-write
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
+      :  "l"(desc_a),  // %28
+         "l"(desc_b),  // %29
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %30 in a register; %31 and %32 as compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x56x32 TN F32+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x56x32_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // A supplied directly as four 32-bit values (a00..a03)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = float[28];  // same count/type as the accumulators d00..d27 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); given this line number and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %33, 0;\n"  // p = (scale_D != 0); %33 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n56k32.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"
+      "{%28, %29, %30, %31},"  // a00..a03
+      " %32,"  // desc_b
+      " p,   %34, %35;\n"  // predicate p, then the scaleA (%34) and scaleB (%35) immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%27: accumulators, read-write
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %28..%31
+         "l"(desc_b),  // %32
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %33 in a register; %34 and %35 as compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x72x32 TN F16+=E4M3*E5M2 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x72x32_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for A (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = uint32_t[18];  // same count/type as d00..d17 of fma(); presumably two packed f16 values each -- TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); given this line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %20, 0;\n"  // p = (scale_D != 0); %20 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n72k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17},"
+      " %18,"  // desc_a
+      " %19,"  // desc_b
+      " p,   %21, %22;\n"  // predicate p, then the scaleA (%21) and scaleB (%22) immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%17: accumulators, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17)
+      :  "l"(desc_a),  // %18
+         "l"(desc_b),  // %19
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %20 in a register; %21 and %22 as compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x72x32 TN F16+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x72x32_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // A supplied directly as four 32-bit values (a00..a03)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = uint32_t[18];  // same count/type as d00..d17 of fma(); presumably two packed f16 values each -- TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); given this line number and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %23, 0;\n"  // p = (scale_D != 0); %23 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n72k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17},"
+      "{%18, %19, %20, %21},"  // a00..a03
+      " %22,"  // desc_b
+      " p,   %24, %25;\n"  // predicate p, then the scaleA (%24) and scaleB (%25) immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%17: accumulators, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %18..%21
+         "l"(desc_b),  // %22
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %23 in a register; %24 and %25 as compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x72x32 TN F32+=E4M3*E5M2 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x72x32_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for A (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = float[36];  // same count/type as the accumulators d00..d35 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); given this line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %38, 0;\n"  // p = (scale_D != 0); %38 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n72k32.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"
+      " %36,"  // desc_a
+      " %37,"  // desc_b
+      " p,   %39, %40;\n"  // predicate p, then the scaleA (%39) and scaleB (%40) immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%35: accumulators, read-write
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
+      :  "l"(desc_a),  // %36
+         "l"(desc_b),  // %37
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %38 in a register; %39 and %40 as compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x72x32 TN F32+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x72x32_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // A supplied directly as four 32-bit values (a00..a03)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = float[36];  // same count/type as the accumulators d00..d35 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); given this line number and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %41, 0;\n"  // p = (scale_D != 0); %41 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n72k32.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"
+      "{%36, %37, %38, %39},"  // a00..a03
+      " %40,"  // desc_b
+      " p,   %42, %43;\n"  // predicate p, then the scaleA (%42) and scaleB (%43) immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%35: accumulators, read-write
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %36..%39
+         "l"(desc_b),  // %40
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %41 in a register; %42 and %43 as compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x32 TN F16+=E4M3*E5M2 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x80x32_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for A (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = uint32_t[20];  // same count/type as d00..d19 of fma(); presumably two packed f16 values each -- TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); given this line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %22, 0;\n"  // p = (scale_D != 0); %22 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n80k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"
+      " %20,"  // desc_a
+      " %21,"  // desc_b
+      " p,   %23, %24;\n"  // predicate p, then the scaleA (%23) and scaleB (%24) immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%19: accumulators, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
+      :  "l"(desc_a),  // %20
+         "l"(desc_b),  // %21
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %22 in a register; %23 and %24 as compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x32 TN F16+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x80x32_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // A supplied directly as four 32-bit values (a00..a03)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = uint32_t[20];  // same count/type as d00..d19 of fma(); presumably two packed f16 values each -- TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); given this line number and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %25, 0;\n"  // p = (scale_D != 0); %25 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n80k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"
+      "{%20, %21, %22, %23},"  // a00..a03
+      " %24,"  // desc_b
+      " p,   %26, %27;\n"  // predicate p, then the scaleA (%26) and scaleB (%27) immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%19: accumulators, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %20..%23
+         "l"(desc_b),  // %24
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %25 in a register; %26 and %27 as compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x32 TN F32+=E4M3*E5M2 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x80x32_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for A (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = float[40];  // same count/type as the accumulators d00..d39 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); given this line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %42, 0;\n"  // p = (scale_D != 0); %42 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n80k32.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      " %40,"  // desc_a
+      " %41,"  // desc_b
+      " p,   %43, %44;\n"  // predicate p, then the scaleA (%43) and scaleB (%44) immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%39: accumulators, read-write
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
+      :  "l"(desc_a),  // %40
+         "l"(desc_b),  // %41
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %42 in a register; %43 and %44 as compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x32 TN F32+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x80x32_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // A supplied directly as four 32-bit values (a00..a03)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = float[40];  // same count/type as the accumulators d00..d39 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); given this line number and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %45, 0;\n"  // p = (scale_D != 0); %45 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n80k32.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      "{%40, %41, %42, %43},"  // a00..a03
+      " %44,"  // desc_b
+      " p,   %46, %47;\n"  // predicate p, then the scaleA (%46) and scaleB (%47) immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%39: accumulators, read-write
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %40..%43
+         "l"(desc_b),  // %44
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %45 in a register; %46 and %47 as compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x88x32 TN F16+=E4M3*E5M2 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x88x32_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for A (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = uint32_t[22];  // same count/type as d00..d21 of fma(); presumably two packed f16 values each -- TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); given this line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %24, 0;\n"  // p = (scale_D != 0); %24 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n88k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21},"
+      " %22,"  // desc_a
+      " %23,"  // desc_b
+      " p,   %25, %26;\n"  // predicate p, then the scaleA (%25) and scaleB (%26) immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%21: accumulators, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21)
+      :  "l"(desc_a),  // %22
+         "l"(desc_b),  // %23
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %24 in a register; %25 and %26 as compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x88x32 TN F16+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value, emitted as immediate operand %28
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value, emitted as immediate operand %29
+>
+struct MMA_64x88x32_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D register list is declared
+  using ARegisters = uint32_t[4];  // same type and count as fma's a00..a03
+  using BRegisters = uint64_t[1];  // same type and count as fma's desc_b
+  using CRegisters = uint32_t[22];  // same type and count as fma's d00..d21
+
+  CUTE_HOST_DEVICE static void  // fma: issues one wgmma.mma_async m64n88k32.f16.e4m3.e5m2
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d21 are read and written by the asm ("+r")
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined outside this block)
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %27, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n88k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%21: d00..d21
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21},"
+      "{%22, %23, %24, %25},"  // a00..a03
+      " %26,"  // desc_b
+      " p,   %28, %29;\n"  // predicate p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write 32-bit register operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // 32-bit register inputs
+         "l"(desc_b),  // 64-bit register input
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "r": register, "n": immediate constant
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x88x32 TN F32+=E4M3*E5M2 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value, emitted as immediate operand %47
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value, emitted as immediate operand %48
+>
+struct MMA_64x88x32_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D register list is declared
+  using ARegisters = uint64_t[1];  // same type and count as fma's desc_a
+  using BRegisters = uint64_t[1];  // same type and count as fma's desc_b
+  using CRegisters = float[44];  // same type and count as fma's d00..d43
+
+  CUTE_HOST_DEVICE static void  // fma: issues one wgmma.mma_async m64n88k32.f32.e4m3.e5m2
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d43 are read and written by the asm ("+f")
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined outside this block)
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %46, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n88k32.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%43: d00..d43
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"
+      " %44,"  // desc_a
+      " %45,"  // desc_b
+      " p,   %47, %48;\n"  // predicate p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // read-write 32-bit float register operands
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
+      :  "l"(desc_a),  // 64-bit register inputs
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "r": register, "n": immediate constant
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x88x32 TN F32+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value, emitted as immediate operand %50
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value, emitted as immediate operand %51
+>
+struct MMA_64x88x32_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D register list is declared
+  using ARegisters = uint32_t[4];  // same type and count as fma's a00..a03
+  using BRegisters = uint64_t[1];  // same type and count as fma's desc_b
+  using CRegisters = float[44];  // same type and count as fma's d00..d43
+
+  CUTE_HOST_DEVICE static void  // fma: issues one wgmma.mma_async m64n88k32.f32.e4m3.e5m2
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d43 are read and written by the asm ("+f")
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined outside this block)
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %49, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n88k32.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%43: d00..d43
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"
+      "{%44, %45, %46, %47},"  // a00..a03
+      " %48,"  // desc_b
+      " p,   %50, %51;\n"  // predicate p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // read-write 32-bit float register operands
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // 32-bit register inputs
+         "l"(desc_b),  // 64-bit register input
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "r": register, "n": immediate constant
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x104x32 TN F16+=E4M3*E5M2 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value, emitted as immediate operand %29
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value, emitted as immediate operand %30
+>
+struct MMA_64x104x32_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D register list is declared
+  using ARegisters = uint64_t[1];  // same type and count as fma's desc_a
+  using BRegisters = uint64_t[1];  // same type and count as fma's desc_b
+  using CRegisters = uint32_t[26];  // same type and count as fma's d00..d25
+
+  CUTE_HOST_DEVICE static void  // fma: issues one wgmma.mma_async m64n104k32.f16.e4m3.e5m2
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d25 are read and written by the asm ("+r")
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined outside this block)
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %28, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n104k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%25: d00..d25
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25},"
+      " %26,"  // desc_a
+      " %27,"  // desc_b
+      " p,   %29, %30;\n"  // predicate p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write 32-bit register operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25)
+      :  "l"(desc_a),  // 64-bit register inputs
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "r": register, "n": immediate constant
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x104x32 TN F16+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value, emitted as immediate operand %32
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value, emitted as immediate operand %33
+>
+struct MMA_64x104x32_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D register list is declared
+  using ARegisters = uint32_t[4];  // same type and count as fma's a00..a03
+  using BRegisters = uint64_t[1];  // same type and count as fma's desc_b
+  using CRegisters = uint32_t[26];  // same type and count as fma's d00..d25
+
+  CUTE_HOST_DEVICE static void  // fma: issues one wgmma.mma_async m64n104k32.f16.e4m3.e5m2
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d25 are read and written by the asm ("+r")
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined outside this block)
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %31, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n104k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%25: d00..d25
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25},"
+      "{%26, %27, %28, %29},"  // a00..a03
+      " %30,"  // desc_b
+      " p,   %32, %33;\n"  // predicate p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write 32-bit register operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // 32-bit register inputs
+         "l"(desc_b),  // 64-bit register input
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "r": register, "n": immediate constant
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x104x32 TN F32+=E4M3*E5M2 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value, emitted as immediate operand %55
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value, emitted as immediate operand %56
+>
+struct MMA_64x104x32_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D register list is declared
+  using ARegisters = uint64_t[1];  // same type and count as fma's desc_a
+  using BRegisters = uint64_t[1];  // same type and count as fma's desc_b
+  using CRegisters = float[52];  // same type and count as fma's d00..d51
+
+  CUTE_HOST_DEVICE static void  // fma: issues one wgmma.mma_async m64n104k32.f32.e4m3.e5m2
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d51 are read and written by the asm ("+f")
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined outside this block)
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %54, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n104k32.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%51: d00..d51
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"
+      " %52,"  // desc_a
+      " %53,"  // desc_b
+      " p,    %55,  %56;\n"  // predicate p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // read-write 32-bit float register operands
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
+      :  "l"(desc_a),  // 64-bit register inputs
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "r": register, "n": immediate constant
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x104x32 TN F32+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value, emitted as immediate operand %58
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value, emitted as immediate operand %59
+>
+struct MMA_64x104x32_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D register list is declared
+  using ARegisters = uint32_t[4];  // same type and count as fma's a00..a03
+  using BRegisters = uint64_t[1];  // same type and count as fma's desc_b
+  using CRegisters = float[52];  // same type and count as fma's d00..d51
+
+  CUTE_HOST_DEVICE static void  // fma: issues one wgmma.mma_async m64n104k32.f32.e4m3.e5m2
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d51 are read and written by the asm ("+f")
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined outside this block)
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %57, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n104k32.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%51: d00..d51
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"
+      "{%52,  %53,  %54,  %55},"  // a00..a03
+      " %56,"  // desc_b
+      " p,    %58,  %59;\n"  // predicate p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // read-write 32-bit float register operands
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // 32-bit register inputs
+         "l"(desc_b),  // 64-bit register input
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "r": register, "n": immediate constant
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x32 TN F16+=E4M3*E5M2 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value, emitted as immediate operand %31
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value, emitted as immediate operand %32
+>
+struct MMA_64x112x32_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D register list is declared
+  using ARegisters = uint64_t[1];  // same type and count as fma's desc_a
+  using BRegisters = uint64_t[1];  // same type and count as fma's desc_b
+  using CRegisters = uint32_t[28];  // same type and count as fma's d00..d27
+
+  CUTE_HOST_DEVICE static void  // fma: issues one wgmma.mma_async m64n112k32.f16.e4m3.e5m2
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d27 are read and written by the asm ("+r")
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined outside this block)
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %30, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n112k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%27: d00..d27
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"
+      " %28,"  // desc_a
+      " %29,"  // desc_b
+      " p,   %31, %32;\n"  // predicate p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write 32-bit register operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
+      :  "l"(desc_a),  // 64-bit register inputs
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "r": register, "n": immediate constant
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x32 TN F16+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value, emitted as immediate operand %34
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value, emitted as immediate operand %35
+>
+struct MMA_64x112x32_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D register list is declared
+  using ARegisters = uint32_t[4];  // same type and count as fma's a00..a03
+  using BRegisters = uint64_t[1];  // same type and count as fma's desc_b
+  using CRegisters = uint32_t[28];  // same type and count as fma's d00..d27
+
+  CUTE_HOST_DEVICE static void  // fma: issues one wgmma.mma_async m64n112k32.f16.e4m3.e5m2
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d27 are read and written by the asm ("+r")
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined outside this block)
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %33, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n112k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%27: d00..d27
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"
+      "{%28, %29, %30, %31},"  // a00..a03
+      " %32,"  // desc_b
+      " p,   %34, %35;\n"  // predicate p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write 32-bit register operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // 32-bit register inputs
+         "l"(desc_b),  // 64-bit register input
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "r": register, "n": immediate constant
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x32 TN F32+=E4M3*E5M2 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value, emitted as immediate operand %59
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value, emitted as immediate operand %60
+>
+struct MMA_64x112x32_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D register list is declared
+  using ARegisters = uint64_t[1];  // same type and count as fma's desc_a
+  using BRegisters = uint64_t[1];  // same type and count as fma's desc_b
+  using CRegisters = float[56];  // same type and count as fma's d00..d55
+
+  CUTE_HOST_DEVICE static void  // fma: issues one wgmma.mma_async m64n112k32.f32.e4m3.e5m2
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d55 are read and written by the asm ("+f")
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined outside this block)
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %58, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n112k32.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%55: d00..d55
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      " %56,"  // desc_a
+      " %57,"  // desc_b
+      " p,    %59,  %60;\n"  // predicate p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // read-write 32-bit float register operands
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
+      :  "l"(desc_a),  // 64-bit register inputs
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "r": register, "n": immediate constant
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x32 TN F32+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value, emitted as immediate operand %62
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value, emitted as immediate operand %63
+>
+struct MMA_64x112x32_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D register list is declared
+  using ARegisters = uint32_t[4];  // same type and count as fma's a00..a03
+  using BRegisters = uint64_t[1];  // same type and count as fma's desc_b
+  using CRegisters = float[56];  // same type and count as fma's d00..d55
+
+  CUTE_HOST_DEVICE static void  // fma: issues one wgmma.mma_async m64n112k32.f32.e4m3.e5m2
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d55 are read and written by the asm ("+f")
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined outside this block)
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %61, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n112k32.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%55: d00..d55
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      "{%56,  %57,  %58,  %59},"  // a00..a03
+      " %60,"  // desc_b
+      " p,    %62,  %63;\n"  // predicate p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // read-write 32-bit float register operands
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // 32-bit register inputs
+         "l"(desc_b),  // 64-bit register input
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "r": register, "n": immediate constant
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x120x32 TN F16+=E4M3*E5M2 (SS variant: A and B are each passed as a single 64-bit operand, desc_a / desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an "n" immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as an "n" immediate
+>
+struct MMA_64x120x32_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches the single desc_a argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[30]; // one entry per accumulator reference d00..d29 taken by fma()
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async; d00..d29 are updated in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside this braced PTX scope
+      "setp.ne.b32 p, %32, 0;\n"  // p = (scale_D != 0); %32 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n120k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29},"
+      " %30,"  // desc_a
+      " %31,"  // desc_b
+      " p,   %33, %34;\n"  // predicate, then scaleA (%33) and scaleB (%34)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%29: accumulators, both read and written ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29)
+      :  "l"(desc_a),  // %30
+         "l"(desc_b),  // %31
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %32 is a runtime register; %33/%34 are compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x120x32 TN F16+=E4M3*E5M2 (RS variant: A is passed as four 32-bit registers a00..a03, B as a single 64-bit operand desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an "n" immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as an "n" immediate
+>
+struct MMA_64x120x32_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches the a00..a03 arguments of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[30]; // one entry per accumulator reference d00..d29 taken by fma()
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async; d00..d29 are updated in place
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside this braced PTX scope
+      "setp.ne.b32 p, %35, 0;\n"  // p = (scale_D != 0); %35 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n120k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29},"
+      "{%30, %31, %32, %33},"  // a00..a03
+      " %34,"  // desc_b
+      " p,   %36, %37;\n"  // predicate, then scaleA (%36) and scaleB (%37)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%29: accumulators, both read and written ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %30..%33
+         "l"(desc_b),  // %34
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %35 is a runtime register; %36/%37 are compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x120x32 TN F32+=E4M3*E5M2 (SS variant: A and B are each passed as a single 64-bit operand, desc_a / desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an "n" immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as an "n" immediate
+>
+struct MMA_64x120x32_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches the single desc_a argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = float[60];    // one entry per accumulator reference d00..d59 taken by fma()
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async; d00..d59 are updated in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside this braced PTX scope
+      "setp.ne.b32 p, %62, 0;\n"  // p = (scale_D != 0); %62 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n120k32.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"
+      " %60,"  // desc_a
+      " %61,"  // desc_b
+      " p,    %63,  %64;\n"  // predicate, then scaleA (%63) and scaleB (%64)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%59: accumulators, both read and written ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
+      :  "l"(desc_a),  // %60
+         "l"(desc_b),  // %61
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %62 is a runtime register; %63/%64 are compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x120x32 TN F32+=E4M3*E5M2 (RS variant: A is passed as four 32-bit registers a00..a03, B as a single 64-bit operand desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an "n" immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as an "n" immediate
+>
+struct MMA_64x120x32_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches the a00..a03 arguments of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = float[60];    // one entry per accumulator reference d00..d59 taken by fma()
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async; d00..d59 are updated in place
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside this braced PTX scope
+      "setp.ne.b32 p, %65, 0;\n"  // p = (scale_D != 0); %65 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n120k32.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"
+      "{%60,  %61,  %62,  %63},"  // a00..a03
+      " %64,"  // desc_b
+      " p,    %66,  %67;\n"  // predicate, then scaleA (%66) and scaleB (%67)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%59: accumulators, both read and written ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %60..%63
+         "l"(desc_b),  // %64
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %65 is a runtime register; %66/%67 are compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x136x32 TN F16+=E4M3*E5M2 (SS variant: A and B are each passed as a single 64-bit operand, desc_a / desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an "n" immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as an "n" immediate
+>
+struct MMA_64x136x32_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches the single desc_a argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[34]; // one entry per accumulator reference d00..d33 taken by fma()
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async; d00..d33 are updated in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside this braced PTX scope
+      "setp.ne.b32 p, %36, 0;\n"  // p = (scale_D != 0); %36 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n136k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33},"
+      " %34,"  // desc_a
+      " %35,"  // desc_b
+      " p,   %37, %38;\n"  // predicate, then scaleA (%37) and scaleB (%38)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%33: accumulators, both read and written ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33)
+      :  "l"(desc_a),  // %34
+         "l"(desc_b),  // %35
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %36 is a runtime register; %37/%38 are compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x136x32 TN F16+=E4M3*E5M2 (RS variant: A is passed as four 32-bit registers a00..a03, B as a single 64-bit operand desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an "n" immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as an "n" immediate
+>
+struct MMA_64x136x32_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches the a00..a03 arguments of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[34]; // one entry per accumulator reference d00..d33 taken by fma()
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async; d00..d33 are updated in place
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside this braced PTX scope
+      "setp.ne.b32 p, %39, 0;\n"  // p = (scale_D != 0); %39 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n136k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33},"
+      "{%34, %35, %36, %37},"  // a00..a03
+      " %38,"  // desc_b
+      " p,   %40, %41;\n"  // predicate, then scaleA (%40) and scaleB (%41)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%33: accumulators, both read and written ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %34..%37
+         "l"(desc_b),  // %38
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %39 is a runtime register; %40/%41 are compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x136x32 TN F32+=E4M3*E5M2 (SS variant: A and B are each passed as a single 64-bit operand, desc_a / desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an "n" immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as an "n" immediate
+>
+struct MMA_64x136x32_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches the single desc_a argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = float[68];    // one entry per accumulator reference d00..d67 taken by fma()
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async; d00..d67 are updated in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside this braced PTX scope
+      "setp.ne.b32 p, %70, 0;\n"  // p = (scale_D != 0); %70 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n136k32.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67},"
+      " %68,"  // desc_a
+      " %69,"  // desc_b
+      " p,    %71,  %72;\n"  // predicate, then scaleA (%71) and scaleB (%72)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%67: accumulators, both read and written ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
+      :  "l"(desc_a),  // %68
+         "l"(desc_b),  // %69
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %70 is a runtime register; %71/%72 are compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x136x32 TN F32+=E4M3*E5M2 (RS variant: A is passed as four 32-bit registers a00..a03, B as a single 64-bit operand desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an "n" immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as an "n" immediate
+>
+struct MMA_64x136x32_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches the a00..a03 arguments of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = float[68];    // one entry per accumulator reference d00..d67 taken by fma()
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async; d00..d67 are updated in place
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside this braced PTX scope
+      "setp.ne.b32 p, %73, 0;\n"  // p = (scale_D != 0); %73 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n136k32.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67},"
+      "{%68,  %69,  %70,  %71},"  // a00..a03
+      " %72,"  // desc_b
+      " p,    %74,  %75;\n"  // predicate, then scaleA (%74) and scaleB (%75)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%67: accumulators, both read and written ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %68..%71
+         "l"(desc_b),  // %72
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %73 is a runtime register; %74/%75 are compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x32 TN F16+=E4M3*E5M2 (SS variant: A and B are each passed as a single 64-bit operand, desc_a / desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an "n" immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as an "n" immediate
+>
+struct MMA_64x144x32_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches the single desc_a argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[36]; // one entry per accumulator reference d00..d35 taken by fma()
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async; d00..d35 are updated in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside this braced PTX scope
+      "setp.ne.b32 p, %38, 0;\n"  // p = (scale_D != 0); %38 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n144k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"
+      " %36,"  // desc_a
+      " %37,"  // desc_b
+      " p,   %39, %40;\n"  // predicate, then scaleA (%39) and scaleB (%40)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%35: accumulators, both read and written ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
+      :  "l"(desc_a),  // %36
+         "l"(desc_b),  // %37
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %38 is a runtime register; %39/%40 are compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x32 TN F16+=E4M3*E5M2 (RS variant: A is passed as four 32-bit registers a00..a03, B as a single 64-bit operand desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an "n" immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as an "n" immediate
+>
+struct MMA_64x144x32_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches the a00..a03 arguments of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[36]; // one entry per accumulator reference d00..d35 taken by fma()
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async; d00..d35 are updated in place
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside this braced PTX scope
+      "setp.ne.b32 p, %41, 0;\n"  // p = (scale_D != 0); %41 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n144k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"
+      "{%36, %37, %38, %39},"  // a00..a03
+      " %40,"  // desc_b
+      " p,   %42, %43;\n"  // predicate, then scaleA (%42) and scaleB (%43)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%35: accumulators, both read and written ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %36..%39
+         "l"(desc_b),  // %40
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %41 is a runtime register; %42/%43 are compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x32 TN F32+=E4M3*E5M2 (SS form: A and B each passed as a 64-bit descriptor; wraps one wgmma.mma_async PTX instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x144x32_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;         // no separate D list: fma() updates the C registers in place
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[72];    // accumulators d00..d71
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %74, 0;\n"  // %74 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n144k32.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"  // %0..%71 = d00..d71
+      " %72,"  // desc_a
+      " %73,"  // desc_b
+      " p,    %75,  %76;\n"  // predicate, then immediates scaleA (%75) and scaleB (%76)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "n": template args become immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead of emitting PTX
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x32 TN F32+=E4M3*E5M2 (RS form: A in four 32-bit registers, B as a 64-bit descriptor; wraps one wgmma.mma_async PTX instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x144x32_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;         // no separate D list: fma() updates the C registers in place
+  using ARegisters = uint32_t[4];  // A operand: a00..a03
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[72];    // accumulators d00..d71
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %77, 0;\n"  // %77 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n144k32.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"  // %0..%71 = d00..d71
+      "{%72,  %73,  %74,  %75},"  // a00..a03
+      " %76,"  // desc_b
+      " p,    %78,  %79;\n"  // predicate, then immediates scaleA (%78) and scaleB (%79)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "n": template args become immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead of emitting PTX
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x152x32 TN F16+=E4M3*E5M2 (SS form: A and B each passed as a 64-bit descriptor; wraps one wgmma.mma_async PTX instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x152x32_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;          // no separate D list: fma() updates the C registers in place
+  using ARegisters = uint64_t[1];   // A operand: desc_a
+  using BRegisters = uint64_t[1];   // B operand: desc_b
+  using CRegisters = uint32_t[38];  // accumulators d00..d37; presumably two packed f16 per register -- TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %40, 0;\n"  // %40 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n152k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37},"  // %0..%37 = d00..d37
+      " %38,"  // desc_a
+      " %39,"  // desc_b
+      " p,   %41, %42;\n"  // predicate, then immediates scaleA (%41) and scaleB (%42)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "n": template args become immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead of emitting PTX
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x152x32 TN F16+=E4M3*E5M2 (RS form: A in four 32-bit registers, B as a 64-bit descriptor; wraps one wgmma.mma_async PTX instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x152x32_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;          // no separate D list: fma() updates the C registers in place
+  using ARegisters = uint32_t[4];   // A operand: a00..a03
+  using BRegisters = uint64_t[1];   // B operand: desc_b
+  using CRegisters = uint32_t[38];  // accumulators d00..d37; presumably two packed f16 per register -- TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %43, 0;\n"  // %43 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n152k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37},"  // %0..%37 = d00..d37
+      "{%38, %39, %40, %41},"  // a00..a03
+      " %42,"  // desc_b
+      " p,   %44, %45;\n"  // predicate, then immediates scaleA (%44) and scaleB (%45)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "n": template args become immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead of emitting PTX
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x152x32 TN F32+=E4M3*E5M2 (SS form: A and B each passed as a 64-bit descriptor; wraps one wgmma.mma_async PTX instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x152x32_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;         // no separate D list: fma() updates the C registers in place
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[76];    // accumulators d00..d75
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %78, 0;\n"  // %78 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n152k32.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75},"  // %0..%75 = d00..d75
+      " %76,"  // desc_a
+      " %77,"  // desc_b
+      " p,    %79,  %80;\n"  // predicate, then immediates scaleA (%79) and scaleB (%80)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "n": template args become immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead of emitting PTX
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x152x32 TN F32+=E4M3*E5M2 (RS form: A in four 32-bit registers, B as a 64-bit descriptor; wraps one wgmma.mma_async PTX instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x152x32_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;         // no separate D list: fma() updates the C registers in place
+  using ARegisters = uint32_t[4];  // A operand: a00..a03
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[76];    // accumulators d00..d75
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %81, 0;\n"  // %81 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n152k32.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75},"  // %0..%75 = d00..d75
+      "{%76,  %77,  %78,  %79},"  // a00..a03
+      " %80,"  // desc_b
+      " p,    %82,  %83;\n"  // predicate, then immediates scaleA (%82) and scaleB (%83)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "n": template args become immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead of emitting PTX
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x32 TN F16+=E4M3*E5M2 (SS form: A and B each passed as a 64-bit descriptor; wraps one wgmma.mma_async PTX instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x160x32_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;          // no separate D list: fma() updates the C registers in place
+  using ARegisters = uint64_t[1];   // A operand: desc_a
+  using BRegisters = uint64_t[1];   // B operand: desc_b
+  using CRegisters = uint32_t[40];  // accumulators d00..d39; presumably two packed f16 per register -- TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %42, 0;\n"  // %42 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n160k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"  // %0..%39 = d00..d39
+      " %40,"  // desc_a
+      " %41,"  // desc_b
+      " p,   %43, %44;\n"  // predicate, then immediates scaleA (%43) and scaleB (%44)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "n": template args become immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead of emitting PTX
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x32 TN F16+=E4M3*E5M2 (RS form: A in four 32-bit registers, B as a 64-bit descriptor; wraps one wgmma.mma_async PTX instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x160x32_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;          // no separate D list: fma() updates the C registers in place
+  using ARegisters = uint32_t[4];   // A operand: a00..a03
+  using BRegisters = uint64_t[1];   // B operand: desc_b
+  using CRegisters = uint32_t[40];  // accumulators d00..d39; presumably two packed f16 per register -- TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %45, 0;\n"  // %45 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sync.aligned.m64n160k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"  // %0..%39 = d00..d39
+      "{%40, %41, %42, %43},"  // a00..a03
+      " %44,"  // desc_b
+      " p,   %46, %47;\n"  // predicate, then immediates scaleA (%46) and scaleB (%47)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "n": template args become immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead of emitting PTX
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x32 TN F32+=E4M3*E5M2 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as compile-time immediate %83
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as compile-time immediate %84
+>
+struct MMA_64x160x32_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;         // no separate D parameters: fma() updates d00..d79 in place
+  using ARegisters = uint64_t[1];  // one 64-bit value: desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = float[80];    // 80 float accumulators: d00..d79
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // bound to asm operand %80 ("l" = 64-bit register)
+      uint64_t const& desc_b,  // bound to asm operand %81 ("l" = 64-bit register)
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %82, 0;\n"  // %82 = int32_t(scale_D); p is true iff it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n160k32.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%79 = d00..d79
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      " %80,"  // desc_a
+      " %81,"  // desc_b
+      " p,    %83,  %84;\n"  // predicate from scale_D, then scaleA and scaleB; NOTE(review): presumably p selects accumulate vs. overwrite of D -- confirm against the PTX ISA
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": each accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
+      :  "l"(desc_a),  // %80
+         "l"(desc_b),  // %81
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %82 (runtime register), %83 and %84 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A macro is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x32 TN F32+=E4M3*E5M2 (RS form: A is passed as four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as compile-time immediate %86
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as compile-time immediate %87
+>
+struct MMA_64x160x32_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;         // no separate D parameters: fma() updates d00..d79 in place
+  using ARegisters = uint32_t[4];  // four 32-bit values: a00..a03
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = float[80];    // 80 float accumulators: d00..d79
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // bound to asm operands %80..%83
+      uint64_t const& desc_b,  // bound to asm operand %84 ("l" = 64-bit register)
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %85, 0;\n"  // %85 = int32_t(scale_D); p is true iff it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n160k32.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%79 = d00..d79
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      "{%80,  %81,  %82,  %83},"  // a00..a03
+      " %84,"  // desc_b
+      " p,    %86,  %87;\n"  // predicate from scale_D, then scaleA and scaleB; NOTE(review): presumably p selects accumulate vs. overwrite of D -- confirm against the PTX ISA
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": each accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %80..%83
+         "l"(desc_b),  // %84
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %85 (runtime register), %86 and %87 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A macro is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x168x32 TN F16+=E4M3*E5M2 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as compile-time immediate %45
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as compile-time immediate %46
+>
+struct MMA_64x168x32_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;         // no separate D parameters: fma() updates d00..d41 in place
+  using ARegisters = uint64_t[1];  // one 64-bit value: desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = uint32_t[42]; // 42 32-bit accumulators: d00..d41 (presumably two packed F16 each -- TODO confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // bound to asm operand %42 ("l" = 64-bit register)
+      uint64_t const& desc_b,  // bound to asm operand %43 ("l" = 64-bit register)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %44, 0;\n"  // %44 = int32_t(scale_D); p is true iff it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n168k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%41 = d00..d41
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41},"
+      " %42,"  // desc_a
+      " %43,"  // desc_b
+      " p,   %45, %46;\n"  // predicate from scale_D, then scaleA and scaleB; NOTE(review): presumably p selects accumulate vs. overwrite of D -- confirm against the PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator register is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41)
+      :  "l"(desc_a),  // %42
+         "l"(desc_b),  // %43
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %44 (runtime register), %45 and %46 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A macro is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x168x32 TN F16+=E4M3*E5M2 (RS form: A is passed as four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as compile-time immediate %48
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as compile-time immediate %49
+>
+struct MMA_64x168x32_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;         // no separate D parameters: fma() updates d00..d41 in place
+  using ARegisters = uint32_t[4];  // four 32-bit values: a00..a03
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = uint32_t[42]; // 42 32-bit accumulators: d00..d41 (presumably two packed F16 each -- TODO confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // bound to asm operands %42..%45
+      uint64_t const& desc_b,  // bound to asm operand %46 ("l" = 64-bit register)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %47, 0;\n"  // %47 = int32_t(scale_D); p is true iff it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n168k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%41 = d00..d41
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41},"
+      "{%42, %43, %44, %45},"  // a00..a03
+      " %46,"  // desc_b
+      " p,   %48, %49;\n"  // predicate from scale_D, then scaleA and scaleB; NOTE(review): presumably p selects accumulate vs. overwrite of D -- confirm against the PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator register is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %42..%45
+         "l"(desc_b),  // %46
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %47 (runtime register), %48 and %49 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A macro is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x168x32 TN F32+=E4M3*E5M2 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as compile-time immediate %87
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as compile-time immediate %88
+>
+struct MMA_64x168x32_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;         // no separate D parameters: fma() updates d00..d83 in place
+  using ARegisters = uint64_t[1];  // one 64-bit value: desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = float[84];    // 84 float accumulators: d00..d83
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // bound to asm operand %84 ("l" = 64-bit register)
+      uint64_t const& desc_b,  // bound to asm operand %85 ("l" = 64-bit register)
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %86, 0;\n"  // %86 = int32_t(scale_D); p is true iff it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n168k32.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%83 = d00..d83
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83},"
+      " %84,"  // desc_a
+      " %85,"  // desc_b
+      " p,    %87,  %88;\n"  // predicate from scale_D, then scaleA and scaleB; NOTE(review): presumably p selects accumulate vs. overwrite of D -- confirm against the PTX ISA
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": each accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
+      :  "l"(desc_a),  // %84
+         "l"(desc_b),  // %85
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %86 (runtime register), %87 and %88 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A macro is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x168x32 TN F32+=E4M3*E5M2 (RS form: A is passed as four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as compile-time immediate %90
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as compile-time immediate %91
+>
+struct MMA_64x168x32_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;         // no separate D parameters: fma() updates d00..d83 in place
+  using ARegisters = uint32_t[4];  // four 32-bit values: a00..a03
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = float[84];    // 84 float accumulators: d00..d83
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // bound to asm operands %84..%87
+      uint64_t const& desc_b,  // bound to asm operand %88 ("l" = 64-bit register)
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %89, 0;\n"  // %89 = int32_t(scale_D); p is true iff it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n168k32.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%83 = d00..d83
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83},"
+      "{%84,  %85,  %86,  %87},"  // a00..a03
+      " %88,"  // desc_b
+      " p,    %90,  %91;\n"  // predicate from scale_D, then scaleA and scaleB; NOTE(review): presumably p selects accumulate vs. overwrite of D -- confirm against the PTX ISA
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": each accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %84..%87
+         "l"(desc_b),  // %88
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %89 (runtime register), %90 and %91 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A macro is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x32 TN F16+=E4M3*E5M2 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as compile-time immediate %47
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as compile-time immediate %48
+>
+struct MMA_64x176x32_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;         // no separate D parameters: fma() updates d00..d43 in place
+  using ARegisters = uint64_t[1];  // one 64-bit value: desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = uint32_t[44]; // 44 32-bit accumulators: d00..d43 (presumably two packed F16 each -- TODO confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // bound to asm operand %44 ("l" = 64-bit register)
+      uint64_t const& desc_b,  // bound to asm operand %45 ("l" = 64-bit register)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %46, 0;\n"  // %46 = int32_t(scale_D); p is true iff it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n176k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%43 = d00..d43
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"
+      " %44,"  // desc_a
+      " %45,"  // desc_b
+      " p,   %47, %48;\n"  // predicate from scale_D, then scaleA and scaleB; NOTE(review): presumably p selects accumulate vs. overwrite of D -- confirm against the PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator register is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
+      :  "l"(desc_a),  // %44
+         "l"(desc_b),  // %45
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %46 (runtime register), %47 and %48 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A macro is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x32 TN F16+=E4M3*E5M2 (RS form: A is passed as four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as compile-time immediate %50
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as compile-time immediate %51
+>
+struct MMA_64x176x32_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;         // no separate D parameters: fma() updates d00..d43 in place
+  using ARegisters = uint32_t[4];  // four 32-bit values: a00..a03
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = uint32_t[44]; // 44 32-bit accumulators: d00..d43 (presumably two packed F16 each -- TODO confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // bound to asm operands %44..%47
+      uint64_t const& desc_b,  // bound to asm operand %48 ("l" = 64-bit register)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %49, 0;\n"  // %49 = int32_t(scale_D); p is true iff it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n176k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%43 = d00..d43
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"
+      "{%44, %45, %46, %47},"  // a00..a03
+      " %48,"  // desc_b
+      " p,   %50, %51;\n"  // predicate from scale_D, then scaleA and scaleB; NOTE(review): presumably p selects accumulate vs. overwrite of D -- confirm against the PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator register is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %44..%47
+         "l"(desc_b),  // %48
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %49 (runtime register), %50 and %51 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A macro is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x32 TN F32+=E4M3*E5M2 -- SS form: A and B are both passed as 64-bit descriptors; fma() issues one wgmma.mma_async m64n176k32 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate ("n" operand %91)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as a compile-time immediate ("n" operand %92)
+>
+struct MMA_64x176x32_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;         // no separate D operand list: the accumulators below are read-write ("+f")
+  using ARegisters = uint64_t[1];  // corresponds to the single desc_a argument of fma()
+  using BRegisters = uint64_t[1];  // corresponds to the single desc_b argument of fma()
+  using CRegisters = float[88];    // corresponds to the 88 float& accumulators d00..d87
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the call-site line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %90, 0;\n"  // p = (scale_D != 0); %90 is the "r"(int32_t(scale_D)) input
+      "wgmma.mma_async.sync.aligned.m64n176k32.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      " %88,"  // desc_a
+      " %89,"  // desc_b
+      " p,    %91,  %92;\n"  // predicate p, then scaleA (%91) and scaleB (%92); NOTE(review): p presumably selects D=A*B+D vs D=A*B -- confirm in PTX ISA
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
+      :  "l"(desc_a),  // input operand %88 (operand numbering continues after the 88 accumulators)
+         "l"(desc_b),  // input operand %89
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %90, %91, %92
+#else  // macro not defined: no PTX is emitted, the invalid-control-path macro is invoked with a diagnostic string
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x32 TN F32+=E4M3*E5M2 -- RS form: A is passed as four 32-bit registers, B as a 64-bit descriptor; fma() issues one wgmma.mma_async m64n176k32 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate ("n" operand %94)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as a compile-time immediate ("n" operand %95)
+>
+struct MMA_64x176x32_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;         // no separate D operand list: the accumulators below are read-write ("+f")
+  using ARegisters = uint32_t[4];  // corresponds to the a00..a03 arguments of fma()
+  using BRegisters = uint64_t[1];  // corresponds to the single desc_b argument of fma()
+  using CRegisters = float[88];    // corresponds to the 88 float& accumulators d00..d87
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the call-site line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %93, 0;\n"  // p = (scale_D != 0); %93 is the "r"(int32_t(scale_D)) input
+      "wgmma.mma_async.sync.aligned.m64n176k32.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      "{%88,  %89,  %90,  %91},"  // a00..a03
+      " %92,"  // desc_b
+      " p,    %94,  %95;\n"  // predicate p, then scaleA (%94) and scaleB (%95); NOTE(review): p presumably selects D=A*B+D vs D=A*B -- confirm in PTX ISA
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands %88..%91 (numbering continues after the 88 accumulators)
+         "l"(desc_b),  // input operand %92
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %93, %94, %95
+#else  // macro not defined: no PTX is emitted, the invalid-control-path macro is invoked with a diagnostic string
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x184x32 TN F16+=E4M3*E5M2 -- SS form: A and B are both passed as 64-bit descriptors; fma() issues one wgmma.mma_async m64n184k32 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate ("n" operand %49)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as a compile-time immediate ("n" operand %50)
+>
+struct MMA_64x184x32_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;         // no separate D operand list: the accumulators below are read-write ("+r")
+  using ARegisters = uint64_t[1];  // corresponds to the single desc_a argument of fma()
+  using BRegisters = uint64_t[1];  // corresponds to the single desc_b argument of fma()
+  using CRegisters = uint32_t[46]; // corresponds to the 46 uint32_t& accumulators d00..d45 (presumably two packed f16 each -- TODO confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the call-site line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %48, 0;\n"  // p = (scale_D != 0); %48 is the "r"(int32_t(scale_D)) input
+      "wgmma.mma_async.sync.aligned.m64n184k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45},"
+      " %46,"  // desc_a
+      " %47,"  // desc_b
+      " p,   %49, %50;\n"  // predicate p, then scaleA (%49) and scaleB (%50); NOTE(review): p presumably selects D=A*B+D vs D=A*B -- confirm in PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45)
+      :  "l"(desc_a),  // input operand %46 (operand numbering continues after the 46 accumulators)
+         "l"(desc_b),  // input operand %47
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %48, %49, %50
+#else  // macro not defined: no PTX is emitted, the invalid-control-path macro is invoked with a diagnostic string
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x184x32 TN F16+=E4M3*E5M2 -- RS form: A is passed as four 32-bit registers, B as a 64-bit descriptor; fma() issues one wgmma.mma_async m64n184k32 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate ("n" operand %52)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as a compile-time immediate ("n" operand %53)
+>
+struct MMA_64x184x32_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;         // no separate D operand list: the accumulators below are read-write ("+r")
+  using ARegisters = uint32_t[4];  // corresponds to the a00..a03 arguments of fma()
+  using BRegisters = uint64_t[1];  // corresponds to the single desc_b argument of fma()
+  using CRegisters = uint32_t[46]; // corresponds to the 46 uint32_t& accumulators d00..d45 (presumably two packed f16 each -- TODO confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the call-site line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %51, 0;\n"  // p = (scale_D != 0); %51 is the "r"(int32_t(scale_D)) input
+      "wgmma.mma_async.sync.aligned.m64n184k32.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45},"
+      "{%46, %47, %48, %49},"  // a00..a03
+      " %50,"  // desc_b
+      " p,   %52, %53;\n"  // predicate p, then scaleA (%52) and scaleB (%53); NOTE(review): p presumably selects D=A*B+D vs D=A*B -- confirm in PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands %46..%49 (numbering continues after the 46 accumulators)
+         "l"(desc_b),  // input operand %50
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %51, %52, %53
+#else  // macro not defined: no PTX is emitted, the invalid-control-path macro is invoked with a diagnostic string
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x184x32 TN F32+=E4M3*E5M2 -- SS form: A and B are both passed as 64-bit descriptors; fma() issues one wgmma.mma_async m64n184k32 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate ("n" operand %95)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as a compile-time immediate ("n" operand %96)
+>
+struct MMA_64x184x32_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;         // no separate D operand list: the accumulators below are read-write ("+f")
+  using ARegisters = uint64_t[1];  // corresponds to the single desc_a argument of fma()
+  using BRegisters = uint64_t[1];  // corresponds to the single desc_b argument of fma()
+  using CRegisters = float[92];    // corresponds to the 92 float& accumulators d00..d91
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the call-site line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %94, 0;\n"  // p = (scale_D != 0); %94 is the "r"(int32_t(scale_D)) input
+      "wgmma.mma_async.sync.aligned.m64n184k32.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91},"
+      " %92,"  // desc_a
+      " %93,"  // desc_b
+      " p,    %95,  %96;\n"  // predicate p, then scaleA (%95) and scaleB (%96); NOTE(review): p presumably selects D=A*B+D vs D=A*B -- confirm in PTX ISA
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
+      :  "l"(desc_a),  // input operand %92 (operand numbering continues after the 92 accumulators)
+         "l"(desc_b),  // input operand %93
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %94, %95, %96
+#else  // macro not defined: no PTX is emitted, the invalid-control-path macro is invoked with a diagnostic string
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x184x32 TN F32+=E4M3*E5M2 -- RS form: A is passed as four 32-bit registers, B as a 64-bit descriptor; fma() issues one wgmma.mma_async m64n184k32 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate ("n" operand %98)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as a compile-time immediate ("n" operand %99)
+>
+struct MMA_64x184x32_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;         // no separate D operand list: the accumulators below are read-write ("+f")
+  using ARegisters = uint32_t[4];  // corresponds to the a00..a03 arguments of fma()
+  using BRegisters = uint64_t[1];  // corresponds to the single desc_b argument of fma()
+  using CRegisters = float[92];    // corresponds to the 92 float& accumulators d00..d91
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the call-site line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %97, 0;\n"  // p = (scale_D != 0); %97 is the "r"(int32_t(scale_D)) input
+      "wgmma.mma_async.sync.aligned.m64n184k32.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91},"
+      "{%92,  %93,  %94,  %95},"  // a00..a03
+      " %96,"  // desc_b
+      " p,    %98,  %99;\n"  // predicate p, then scaleA (%98) and scaleB (%99); NOTE(review): p presumably selects D=A*B+D vs D=A*B -- confirm in PTX ISA
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands %92..%95 (numbering continues after the 92 accumulators)
+         "l"(desc_b),  // input operand %96
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %97, %98, %99
+#else  // macro not defined: no PTX is emitted, the invalid-control-path macro is invoked with a diagnostic string
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x200x32 TN F16+=E4M3*E5M2 -- SS form: A and B are both passed as 64-bit descriptors; fma() issues one wgmma.mma_async m64n200k32 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate ("n" operand %53)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as a compile-time immediate ("n" operand %54)
+>
+struct MMA_64x200x32_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;         // no separate D operand list: the accumulators below are read-write ("+r")
+  using ARegisters = uint64_t[1];  // corresponds to the single desc_a argument of fma()
+  using BRegisters = uint64_t[1];  // corresponds to the single desc_b argument of fma()
+  using CRegisters = uint32_t[50]; // corresponds to the 50 uint32_t& accumulators d00..d49 (presumably two packed f16 each -- TODO confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the call-site line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %52, 0;\n"  // p = (scale_D != 0); %52 is the "r"(int32_t(scale_D)) input
+      "wgmma.mma_async.sync.aligned.m64n200k32.f16.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49},"
+      " %50,"  // desc_a
+      " %51,"  // desc_b
+      " p,    %53,  %54;\n"  // predicate p, then scaleA (%53) and scaleB (%54); NOTE(review): p presumably selects D=A*B+D vs D=A*B -- confirm in PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49)
+      :  "l"(desc_a),  // input operand %50 (operand numbering continues after the 50 accumulators)
+         "l"(desc_b),  // input operand %51
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %52, %53, %54
+#else  // macro not defined: no PTX is emitted, the invalid-control-path macro is invoked with a diagnostic string
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x200x32 TN F16+=E4M3*E5M2 -- RS form: A is passed as four 32-bit registers, B as a 64-bit descriptor; fma() issues one wgmma.mma_async m64n200k32 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate ("n" operand %56)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as a compile-time immediate ("n" operand %57)
+>
+struct MMA_64x200x32_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;         // no separate D operand list: the accumulators below are read-write ("+r")
+  using ARegisters = uint32_t[4];  // corresponds to the a00..a03 arguments of fma()
+  using BRegisters = uint64_t[1];  // corresponds to the single desc_b argument of fma()
+  using CRegisters = uint32_t[50]; // corresponds to the 50 uint32_t& accumulators d00..d49 (presumably two packed f16 each -- TODO confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the call-site line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %55, 0;\n"  // p = (scale_D != 0); %55 is the "r"(int32_t(scale_D)) input
+      "wgmma.mma_async.sync.aligned.m64n200k32.f16.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49},"
+      "{%50,  %51,  %52,  %53},"  // a00..a03
+      " %54,"  // desc_b
+      " p,    %56,  %57;\n"  // predicate p, then scaleA (%56) and scaleB (%57); NOTE(review): p presumably selects D=A*B+D vs D=A*B -- confirm in PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands %50..%53 (numbering continues after the 50 accumulators)
+         "l"(desc_b),  // input operand %54
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %55, %56, %57
+#else  // macro not defined: no PTX is emitted, the invalid-control-path macro is invoked with a diagnostic string
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x200x32 TN F32+=E4M3*E5M2 -- SS variant: A and B are each passed as a single 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as a compile-time immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // passed to the asm as a compile-time immediate ("n")
+>
+struct MMA_64x200x32_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;         // no separate D operands: the d* arguments are read-write ("+f")
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[100];   // d000..d099
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for A ("l" operand %100)
+      uint64_t const& desc_b,  // 64-bit descriptor for B ("l" operand %101)
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives this line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %102, 0;\n"  // p = (scale_D != 0); %102 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n200k32.f32.e4m3.e5m2 "  // shape m64n200k32, F32 += E4M3 * E5M2
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99},"  // %0..%99: accumulators d000..d099
+      " %100,"  // desc_a
+      " %101,"  // desc_b
+      " p,    %103, %104;\n"  // predicate p, then scaleA (%103) and scaleB (%104)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
+      :  "l"(desc_a),  // %100
+         "l"(desc_b),  // %101
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %102 (register), %103 and %104 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x200x32 TN F32+=E4M3*E5M2 -- RS variant: A is passed in four 32-bit registers, B as a single 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as a compile-time immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // passed to the asm as a compile-time immediate ("n")
+>
+struct MMA_64x200x32_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;         // no separate D operands: the d* arguments are read-write ("+f")
+  using ARegisters = uint32_t[4];  // a000..a003
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[100];   // d000..d099
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A operand registers ("r" operands %100..%103)
+      uint64_t const& desc_b,  // 64-bit descriptor for B ("l" operand %104)
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives this line number and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %105, 0;\n"  // p = (scale_D != 0); %105 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n200k32.f32.e4m3.e5m2 "  // shape m64n200k32, F32 += E4M3 * E5M2
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99},"  // %0..%99: accumulators d000..d099
+      "{%100, %101, %102, %103},"  // a000..a003
+      " %104,"  // desc_b
+      " p,    %106, %107;\n"  // predicate p, then scaleA (%106) and scaleB (%107)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %100..%103
+         "l"(desc_b),  // %104
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %105 (register), %106 and %107 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x32 TN F16+=E4M3*E5M2 -- SS variant: A and B are each passed as a single 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as a compile-time immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // passed to the asm as a compile-time immediate ("n")
+>
+struct MMA_64x208x32_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;          // no separate D operands: the d* arguments are read-write ("+r")
+  using ARegisters = uint64_t[1];   // desc_a
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = uint32_t[52];  // d00..d51 (32-bit registers; presumably two packed f16 values each -- TODO confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for A ("l" operand %52)
+      uint64_t const& desc_b,  // 64-bit descriptor for B ("l" operand %53)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives this line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %54, 0;\n"  // p = (scale_D != 0); %54 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n208k32.f16.e4m3.e5m2 "  // shape m64n208k32, F16 += E4M3 * E5M2
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"  // %0..%51: accumulators d00..d51
+      " %52,"  // desc_a
+      " %53,"  // desc_b
+      " p,    %55,  %56;\n"  // predicate p, then scaleA (%55) and scaleB (%56)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
+      :  "l"(desc_a),  // %52
+         "l"(desc_b),  // %53
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %54 (register), %55 and %56 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x32 TN F16+=E4M3*E5M2 -- RS variant: A is passed in four 32-bit registers, B as a single 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as a compile-time immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // passed to the asm as a compile-time immediate ("n")
+>
+struct MMA_64x208x32_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;          // no separate D operands: the d* arguments are read-write ("+r")
+  using ARegisters = uint32_t[4];   // a00..a03
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = uint32_t[52];  // d00..d51 (32-bit registers; presumably two packed f16 values each -- TODO confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand registers ("r" operands %52..%55)
+      uint64_t const& desc_b,  // 64-bit descriptor for B ("l" operand %56)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives this line number and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %57, 0;\n"  // p = (scale_D != 0); %57 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n208k32.f16.e4m3.e5m2 "  // shape m64n208k32, F16 += E4M3 * E5M2
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"  // %0..%51: accumulators d00..d51
+      "{%52,  %53,  %54,  %55},"  // a00..a03
+      " %56,"  // desc_b
+      " p,    %58,  %59;\n"  // predicate p, then scaleA (%58) and scaleB (%59)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %52..%55
+         "l"(desc_b),  // %56
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %57 (register), %58 and %59 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x32 TN F32+=E4M3*E5M2 -- SS variant: A and B are each passed as a single 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as a compile-time immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // passed to the asm as a compile-time immediate ("n")
+>
+struct MMA_64x208x32_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;         // no separate D operands: the d* arguments are read-write ("+f")
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[104];   // d000..d103
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for A ("l" operand %104)
+      uint64_t const& desc_b,  // 64-bit descriptor for B ("l" operand %105)
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives this line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %106, 0;\n"  // p = (scale_D != 0); %106 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n208k32.f32.e4m3.e5m2 "  // shape m64n208k32, F32 += E4M3 * E5M2
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"  // %0..%103: accumulators d000..d103
+      " %104,"  // desc_a
+      " %105,"  // desc_b
+      " p,    %107, %108;\n"  // predicate p, then scaleA (%107) and scaleB (%108)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
+      :  "l"(desc_a),  // %104
+         "l"(desc_b),  // %105
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %106 (register), %107 and %108 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x32 TN F32+=E4M3*E5M2 -- RS variant: A is passed in four 32-bit registers, B as a single 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as a compile-time immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // passed to the asm as a compile-time immediate ("n")
+>
+struct MMA_64x208x32_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;         // no separate D operands: the d* arguments are read-write ("+f")
+  using ARegisters = uint32_t[4];  // a000..a003
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[104];   // d000..d103
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A operand registers ("r" operands %104..%107)
+      uint64_t const& desc_b,  // 64-bit descriptor for B ("l" operand %108)
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives this line number and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %109, 0;\n"  // p = (scale_D != 0); %109 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n208k32.f32.e4m3.e5m2 "  // shape m64n208k32, F32 += E4M3 * E5M2
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"  // %0..%103: accumulators d000..d103
+      "{%104, %105, %106, %107},"  // a000..a003
+      " %108,"  // desc_b
+      " p,    %110, %111;\n"  // predicate p, then scaleA (%110) and scaleB (%111)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %104..%107
+         "l"(desc_b),  // %108
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %109 (register), %110 and %111 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x216x32 TN F16+=E4M3*E5M2 -- SS variant: A and B are each passed as a single 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as a compile-time immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // passed to the asm as a compile-time immediate ("n")
+>
+struct MMA_64x216x32_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;          // no separate D operands: the d* arguments are read-write ("+r")
+  using ARegisters = uint64_t[1];   // desc_a
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = uint32_t[54];  // d00..d53 (32-bit registers; presumably two packed f16 values each -- TODO confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for A ("l" operand %54)
+      uint64_t const& desc_b,  // 64-bit descriptor for B ("l" operand %55)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives this line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %56, 0;\n"  // p = (scale_D != 0); %56 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n216k32.f16.e4m3.e5m2 "  // shape m64n216k32, F16 += E4M3 * E5M2
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53},"  // %0..%53: accumulators d00..d53
+      " %54,"  // desc_a
+      " %55,"  // desc_b
+      " p,    %57,  %58;\n"  // predicate p, then scaleA (%57) and scaleB (%58)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53)
+      :  "l"(desc_a),  // %54
+         "l"(desc_b),  // %55
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %56 (register), %57 and %58 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x216x32 TN F16+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time A scale; passed to the asm as an immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time B scale; passed to the asm as an immediate ("n")
+>
+struct MMA_64x216x32_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means D aliases C (in-place accumulate) - confirm against the MMA traits
+  using ARegisters = uint32_t[4];  // A operand: a00..a03
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = uint32_t[54];  // accumulators d00..d53 (presumably two packed f16 per register - TODO confirm)
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async m64n216k32 (f16 <- e4m3 x e5m2) on the d* accumulators
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); gets the source line and the B descriptor
+    asm volatile(
+    "{\n"  // braces delimit a PTX block so the predicate p declared below is local to it
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %59, 0;\n"  // p = (scale_D != 0); %59 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n216k32.f16.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53},"
+      "{%54,  %55,  %56,  %57},"  // a00..a03
+      " %58,"  // desc_b
+      " p,    %60,  %61;\n"  // predicate derived from scale_D, then the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%53: accumulators, read and written ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // operands %54..%57
+         "l"(desc_b),  // operand %58
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands %59, %60, %61
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no asm is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x216x32 TN F32+=E4M3*E5M2 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time A scale; passed to the asm as an immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time B scale; passed to the asm as an immediate ("n")
+>
+struct MMA_64x216x32_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means D aliases C (in-place accumulate) - confirm against the MMA traits
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[108];  // accumulators d000..d107
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async m64n216k32 (f32 <- e4m3 x e5m2) on the d* accumulators
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); gets the source line and both descriptors
+    asm volatile(
+    "{\n"  // braces delimit a PTX block so the predicate p declared below is local to it
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %110, 0;\n"  // p = (scale_D != 0); %110 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n216k32.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107},"
+      " %108,"  // desc_a
+      " %109,"  // desc_b
+      " p,    %111, %112;\n"  // predicate derived from scale_D, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%107: accumulators, read and written ("+f")
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
+      :  "l"(desc_a),  // operand %108
+         "l"(desc_b),  // operand %109
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands %110, %111, %112
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no asm is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x216x32 TN F32+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time A scale; passed to the asm as an immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time B scale; passed to the asm as an immediate ("n")
+>
+struct MMA_64x216x32_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means D aliases C (in-place accumulate) - confirm against the MMA traits
+  using ARegisters = uint32_t[4];  // A operand: a000..a003
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[108];  // accumulators d000..d107
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async m64n216k32 (f32 <- e4m3 x e5m2) on the d* accumulators
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); gets the source line and the B descriptor
+    asm volatile(
+    "{\n"  // braces delimit a PTX block so the predicate p declared below is local to it
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %113, 0;\n"  // p = (scale_D != 0); %113 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n216k32.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107},"
+      "{%108, %109, %110, %111},"  // a000..a003
+      " %112,"  // desc_b
+      " p,    %114, %115;\n"  // predicate derived from scale_D, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%107: accumulators, read and written ("+f")
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // operands %108..%111
+         "l"(desc_b),  // operand %112
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands %113, %114, %115
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no asm is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x32 TN F16+=E4M3*E5M2 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time A scale; passed to the asm as an immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time B scale; passed to the asm as an immediate ("n")
+>
+struct MMA_64x224x32_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means D aliases C (in-place accumulate) - confirm against the MMA traits
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = uint32_t[56];  // accumulators d00..d55 (presumably two packed f16 per register - TODO confirm)
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async m64n224k32 (f16 <- e4m3 x e5m2) on the d* accumulators
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); gets the source line and both descriptors
+    asm volatile(
+    "{\n"  // braces delimit a PTX block so the predicate p declared below is local to it
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %58, 0;\n"  // p = (scale_D != 0); %58 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n224k32.f16.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      " %56,"  // desc_a
+      " %57,"  // desc_b
+      " p,    %59,  %60;\n"  // predicate derived from scale_D, then the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%55: accumulators, read and written ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "l"(desc_a),  // operand %56
+         "l"(desc_b),  // operand %57
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands %58, %59, %60
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no asm is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x32 TN F16+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time A scale; passed to the asm as an immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time B scale; passed to the asm as an immediate ("n")
+>
+struct MMA_64x224x32_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means D aliases C (in-place accumulate) - confirm against the MMA traits
+  using ARegisters = uint32_t[4];  // A operand: a00..a03
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = uint32_t[56];  // accumulators d00..d55 (presumably two packed f16 per register - TODO confirm)
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async m64n224k32 (f16 <- e4m3 x e5m2) on the d* accumulators
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); gets the source line and the B descriptor
+    asm volatile(
+    "{\n"  // braces delimit a PTX block so the predicate p declared below is local to it
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %61, 0;\n"  // p = (scale_D != 0); %61 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n224k32.f16.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      "{%56,  %57,  %58,  %59},"  // a00..a03
+      " %60,"  // desc_b
+      " p,    %62,  %63;\n"  // predicate derived from scale_D, then the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%55: accumulators, read and written ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // operands %56..%59
+         "l"(desc_b),  // operand %60
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands %61, %62, %63
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no asm is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x32 TN F32+=E4M3*E5M2 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time A scale; passed to the asm as an immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time B scale; passed to the asm as an immediate ("n")
+>
+struct MMA_64x224x32_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means D aliases C (in-place accumulate) - confirm against the MMA traits
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[112];  // accumulators d000..d111
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async m64n224k32 (f32 <- e4m3 x e5m2) on the d* accumulators
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); gets the source line and both descriptors
+    asm volatile(
+    "{\n"  // braces delimit a PTX block so the predicate p declared below is local to it
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %114, 0;\n"  // p = (scale_D != 0); %114 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n224k32.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      " %112,"  // desc_a
+      " %113,"  // desc_b
+      " p,    %115, %116;\n"  // predicate derived from scale_D, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%111: accumulators, read and written ("+f")
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
+      :  "l"(desc_a),  // operand %112
+         "l"(desc_b),  // operand %113
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands %114, %115, %116
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no asm is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x32 TN F32+=E4M3*E5M2 -- RS form: A in four 32-bit registers, B as one 64-bit desc_b; wraps wgmma.mma_async m64n224k32.f32.e4m3.e5m2
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; forwarded to the asm as an immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time; forwarded to the asm as an immediate ("n")
+>
+struct MMA_64x224x32_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D list; the d### arguments are read-modify-write
+  using ARegisters = uint32_t[4];  // a000..a003, bound with "r" constraints
+  using BRegisters = uint64_t[1];  // desc_b, bound with an "l" constraint
+  using CRegisters = float[112];  // d000..d111, bound with "+f" constraints
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the asm's own { } block
+      "setp.ne.b32 p, %117, 0;\n"  // p = (scale_D != 0); %117 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n224k32.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      "{%112, %113, %114, %115},"  // a000..a003
+      " %116,"  // desc_b
+      " p,    %118, %119;\n"  // predicate from scale_D, then the scaleA / scaleB immediates
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // %0..%111: accumulators, read and written in place
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %112..%115
+         "l"(desc_b),  // %116
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %117 (runtime), %118 and %119 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x232x32 TN F16+=E4M3*E5M2 -- SS form: A and B each passed as one 64-bit value (desc_a, desc_b); wraps wgmma.mma_async m64n232k32.f16.e4m3.e5m2
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; forwarded to the asm as an immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time; forwarded to the asm as an immediate ("n")
+>
+struct MMA_64x232x32_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D list; the d## arguments are read-modify-write
+  using ARegisters = uint64_t[1];  // desc_a, bound with an "l" constraint
+  using BRegisters = uint64_t[1];  // desc_b, bound with an "l" constraint
+  using CRegisters = uint32_t[58];  // d00..d57, bound with "+r"; presumably two packed f16 per register -- TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the asm's own { } block
+      "setp.ne.b32 p, %60, 0;\n"  // p = (scale_D != 0); %60 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n232k32.f16.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57},"
+      " %58,"  // desc_a
+      " %59,"  // desc_b
+      " p,    %61,  %62;\n"  // predicate from scale_D, then the scaleA / scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%57: accumulators, read and written in place
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57)
+      :  "l"(desc_a),  // %58
+         "l"(desc_b),  // %59
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %60 (runtime), %61 and %62 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x232x32 TN F16+=E4M3*E5M2 -- RS form: A in four 32-bit registers, B as one 64-bit desc_b; wraps wgmma.mma_async m64n232k32.f16.e4m3.e5m2
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; forwarded to the asm as an immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time; forwarded to the asm as an immediate ("n")
+>
+struct MMA_64x232x32_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D list; the d## arguments are read-modify-write
+  using ARegisters = uint32_t[4];  // a00..a03, bound with "r" constraints
+  using BRegisters = uint64_t[1];  // desc_b, bound with an "l" constraint
+  using CRegisters = uint32_t[58];  // d00..d57, bound with "+r"; presumably two packed f16 per register -- TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the asm's own { } block
+      "setp.ne.b32 p, %63, 0;\n"  // p = (scale_D != 0); %63 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n232k32.f16.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57},"
+      "{%58,  %59,  %60,  %61},"  // a00..a03
+      " %62,"  // desc_b
+      " p,    %64,  %65;\n"  // predicate from scale_D, then the scaleA / scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%57: accumulators, read and written in place
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %58..%61
+         "l"(desc_b),  // %62
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %63 (runtime), %64 and %65 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x232x32 TN F32+=E4M3*E5M2 -- SS form: A and B each passed as one 64-bit value (desc_a, desc_b); wraps wgmma.mma_async m64n232k32.f32.e4m3.e5m2
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; forwarded to the asm as an immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time; forwarded to the asm as an immediate ("n")
+>
+struct MMA_64x232x32_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D list; the d### arguments are read-modify-write
+  using ARegisters = uint64_t[1];  // desc_a, bound with an "l" constraint
+  using BRegisters = uint64_t[1];  // desc_b, bound with an "l" constraint
+  using CRegisters = float[116];  // d000..d115, bound with "+f" constraints
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the asm's own { } block
+      "setp.ne.b32 p, %118, 0;\n"  // p = (scale_D != 0); %118 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n232k32.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115},"
+      " %116,"  // desc_a
+      " %117,"  // desc_b
+      " p,    %119, %120;\n"  // predicate from scale_D, then the scaleA / scaleB immediates
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // %0..%115: accumulators, read and written in place
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
+      :  "l"(desc_a),  // %116
+         "l"(desc_b),  // %117
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %118 (runtime), %119 and %120 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x232x32 TN F32+=E4M3*E5M2 -- RS form: A in four 32-bit registers, B as one 64-bit desc_b; wraps wgmma.mma_async m64n232k32.f32.e4m3.e5m2
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; forwarded to the asm as an immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time; forwarded to the asm as an immediate ("n")
+>
+struct MMA_64x232x32_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D list; the d### arguments are read-modify-write
+  using ARegisters = uint32_t[4];  // a000..a003, bound with "r" constraints
+  using BRegisters = uint64_t[1];  // desc_b, bound with an "l" constraint
+  using CRegisters = float[116];  // d000..d115, bound with "+f" constraints
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the asm's own { } block
+      "setp.ne.b32 p, %121, 0;\n"  // p = (scale_D != 0); %121 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n232k32.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115},"
+      "{%116, %117, %118, %119},"  // a000..a003
+      " %120,"  // desc_b
+      " p,    %122, %123;\n"  // predicate from scale_D, then the scaleA / scaleB immediates
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // %0..%115: accumulators, read and written in place
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %116..%119
+         "l"(desc_b),  // %120
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %121 (runtime), %122 and %123 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x32 TN F16+=E4M3*E5M2 -- SS form: A and B each passed as one 64-bit value (desc_a, desc_b); wraps wgmma.mma_async m64n240k32.f16.e4m3.e5m2
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; forwarded to the asm as an immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time; forwarded to the asm as an immediate ("n")
+>
+struct MMA_64x240x32_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D list; the d## arguments are read-modify-write
+  using ARegisters = uint64_t[1];  // desc_a, bound with an "l" constraint
+  using BRegisters = uint64_t[1];  // desc_b, bound with an "l" constraint
+  using CRegisters = uint32_t[60];  // d00..d59, bound with "+r"; presumably two packed f16 per register -- TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the asm's own { } block
+      "setp.ne.b32 p, %62, 0;\n"  // p = (scale_D != 0); %62 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n240k32.f16.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"
+      " %60,"  // desc_a
+      " %61,"  // desc_b
+      " p,    %63,  %64;\n"  // predicate from scale_D, then the scaleA / scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%59: accumulators, read and written in place
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
+      :  "l"(desc_a),  // %60
+         "l"(desc_b),  // %61
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %62 (runtime), %63 and %64 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x32 TN F16+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as an immediate operand
+>
+struct MMA_64x240x32_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];   // A operand: a00..a03
+  using BRegisters = uint64_t[1];   // B operand: desc_b
+  using CRegisters = uint32_t[60];  // accumulators d00..d59 (presumably two packed f16 values each; TODO confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // NOTE(review): presumably selects D = A*B + D vs D = A*B; confirm against the PTX wgmma docs
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %65, 0;\n"  // p = (scale_D != 0); %65 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n240k32.f16.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%59: accumulators d00..d59
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"
+      "{%60,  %61,  %62,  %63},"  // %60-%63: a00..a03
+      " %64,"  // %64: desc_b
+      " p,    %66,  %67;\n"  // predicate p, then scaleA (%66) and scaleB (%67)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write ("+r") operands: accumulators are updated in place
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands start here; order fixes the %N numbering above
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; scaleA/scaleB as compile-time immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x32 TN F32+=E4M3*E5M2 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as an immediate operand
+>
+struct MMA_64x240x32_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[120];   // accumulators d000..d119, read and written by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // NOTE(review): presumably selects D = A*B + D vs D = A*B; confirm against the PTX wgmma docs
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %122, 0;\n"  // p = (scale_D != 0); %122 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n240k32.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%119: accumulators d000..d119
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      " %120,"  // %120: desc_a
+      " %121,"  // %121: desc_b
+      " p,    %123, %124;\n"  // predicate p, then scaleA (%123) and scaleB (%124)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // read-write ("+f") operands: accumulators are updated in place
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
+      :  "l"(desc_a),  // input operands start here; order fixes the %N numbering above
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; scaleA/scaleB as compile-time immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x32 TN F32+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as an immediate operand
+>
+struct MMA_64x240x32_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // A operand: a000..a003
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[120];   // accumulators d000..d119, read and written by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // NOTE(review): presumably selects D = A*B + D vs D = A*B; confirm against the PTX wgmma docs
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %125, 0;\n"  // p = (scale_D != 0); %125 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n240k32.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%119: accumulators d000..d119
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      "{%120, %121, %122, %123},"  // %120-%123: a000..a003
+      " %124,"  // %124: desc_b
+      " p,    %126, %127;\n"  // predicate p, then scaleA (%126) and scaleB (%127)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // read-write ("+f") operands: accumulators are updated in place
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // input operands start here; order fixes the %N numbering above
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; scaleA/scaleB as compile-time immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x248x32 TN F16+=E4M3*E5M2 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as an immediate operand
+>
+struct MMA_64x248x32_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];   // A operand: desc_a
+  using BRegisters = uint64_t[1];   // B operand: desc_b
+  using CRegisters = uint32_t[62];  // accumulators d00..d61 (presumably two packed f16 values each; TODO confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // NOTE(review): presumably selects D = A*B + D vs D = A*B; confirm against the PTX wgmma docs
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %64, 0;\n"  // p = (scale_D != 0); %64 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n248k32.f16.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%61: accumulators d00..d61
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61},"
+      " %62,"  // %62: desc_a
+      " %63,"  // %63: desc_b
+      " p,    %65,  %66;\n"  // predicate p, then scaleA (%65) and scaleB (%66)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write ("+r") operands: accumulators are updated in place
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61)
+      :  "l"(desc_a),  // input operands start here; order fixes the %N numbering above
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; scaleA/scaleB as compile-time immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x248x32 TN F16+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as an immediate operand
+>
+struct MMA_64x248x32_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];   // A operand: a00..a03
+  using BRegisters = uint64_t[1];   // B operand: desc_b
+  using CRegisters = uint32_t[62];  // accumulators d00..d61 (presumably two packed f16 values each; TODO confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // NOTE(review): presumably selects D = A*B + D vs D = A*B; confirm against the PTX wgmma docs
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %67, 0;\n"  // p = (scale_D != 0); %67 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n248k32.f16.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%61: accumulators d00..d61
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61},"
+      "{%62,  %63,  %64,  %65},"  // %62-%65: a00..a03
+      " %66,"  // %66: desc_b
+      " p,    %68,  %69;\n"  // predicate p, then scaleA (%68) and scaleB (%69)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write ("+r") operands: accumulators are updated in place
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands start here; order fixes the %N numbering above
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; scaleA/scaleB as compile-time immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x248x32 TN F32+=E4M3*E5M2 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as an immediate operand
+>
+struct MMA_64x248x32_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[124];   // accumulators d000..d123, read and written by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // NOTE(review): presumably selects D = A*B + D vs D = A*B; confirm against the PTX wgmma docs
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %126, 0;\n"  // p = (scale_D != 0); %126 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n248k32.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%123: accumulators d000..d123
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123},"
+      " %124,"  // %124: desc_a
+      " %125,"  // %125: desc_b
+      " p,    %127, %128;\n"  // predicate p, then scaleA (%127) and scaleB (%128)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // read-write ("+f") operands: accumulators are updated in place
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
+      :  "l"(desc_a),  // input operands start here; order fixes the %N numbering above
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; scaleA/scaleB as compile-time immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x248x32 TN F32+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x248x32_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are read-write ("+f") asm operands
+  using ARegisters = uint32_t[4];  // a000..a003
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[124];  // d000..d123
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate declared inside the PTX braces opened above
+      "setp.ne.b32 p, %129, 0;\n"  // p = (scale_D != 0); NOTE(review): presumably the wgmma scale-d predicate - confirm in the PTX ISA
+      "wgmma.mma_async.sync.aligned.m64n248k32.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123},"
+      "{%124, %125, %126, %127},"  // a000..a003
+      " %128,"  // desc_b
+      " p,    %130, %131;\n"  // predicate p, then scaleA (%130) and scaleB (%131)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %129 in a register; %130/%131 are "n" immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x32 TN F16+=E5M2*E4M3 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x24x32_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are read-write ("+r") asm operands
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[6];  // d0..d5
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate declared inside the PTX braces opened above
+      "setp.ne.b32 p, %8, 0;\n"  // p = (scale_D != 0); NOTE(review): presumably the wgmma scale-d predicate - confirm in the PTX ISA
+      "wgmma.mma_async.sync.aligned.m64n24k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5},"
+      " %6,"  // desc_a
+      " %7,"  // desc_b
+      " p,   %9,  %10;\n"  // predicate p, then scaleA (%9) and scaleB (%10)
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %8 in a register; %9/%10 are "n" immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x32 TN F16+=E5M2*E4M3 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x24x32_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are read-write ("+r") asm operands
+  using ARegisters = uint32_t[4];  // a0..a3
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[6];  // d0..d5
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate declared inside the PTX braces opened above
+      "setp.ne.b32 p, %11, 0;\n"  // p = (scale_D != 0); NOTE(review): presumably the wgmma scale-d predicate - confirm in the PTX ISA
+      "wgmma.mma_async.sync.aligned.m64n24k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5},"
+      "{%6,  %7,  %8,  %9},"  // a0..a3
+      " %10,"  // desc_b
+      " p,   %12, %13;\n"  // predicate p, then scaleA (%12) and scaleB (%13)
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %11 in a register; %12/%13 are "n" immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x32 TN F32+=E5M2*E4M3 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x24x32_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are read-write ("+f") asm operands
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[12];  // d00..d11
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate declared inside the PTX braces opened above
+      "setp.ne.b32 p, %14, 0;\n"  // p = (scale_D != 0); NOTE(review): presumably the wgmma scale-d predicate - confirm in the PTX ISA
+      "wgmma.mma_async.sync.aligned.m64n24k32.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      " %12,"  // desc_a
+      " %13,"  // desc_b
+      " p,   %15, %16;\n"  // predicate p, then scaleA (%15) and scaleB (%16)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %14 in a register; %15/%16 are "n" immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x32 TN F32+=E5M2*E4M3 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x24x32_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are read-write ("+f") asm operands
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[12];  // d00..d11
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate declared inside the PTX braces opened above
+      "setp.ne.b32 p, %17, 0;\n"  // p = (scale_D != 0); NOTE(review): presumably the wgmma scale-d predicate - confirm in the PTX ISA
+      "wgmma.mma_async.sync.aligned.m64n24k32.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      "{%12, %13, %14, %15},"  // a00..a03
+      " %16,"  // desc_b
+      " p,   %18, %19;\n"  // predicate p, then scaleA (%18) and scaleB (%19)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %17 in a register; %18/%19 are "n" immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x40x32 TN F16+=E5M2*E4M3 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x40x32_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are read-write ("+r") asm operands
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[10];  // d00..d09
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate declared inside the PTX braces opened above
+      "setp.ne.b32 p, %12, 0;\n"  // p = (scale_D != 0); NOTE(review): presumably the wgmma scale-d predicate - confirm in the PTX ISA
+      "wgmma.mma_async.sync.aligned.m64n40k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9},"
+      " %10,"  // desc_a
+      " %11,"  // desc_b
+      " p,   %13, %14;\n"  // predicate p, then scaleA (%13) and scaleB (%14)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %12 in a register; %13/%14 are "n" immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x40x32 TN F16+=E5M2*E4M3 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x40x32_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are read-write ("+r") asm operands
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[10];  // d00..d09
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate declared inside the PTX braces opened above
+      "setp.ne.b32 p, %15, 0;\n"  // p = (scale_D != 0); NOTE(review): presumably the wgmma scale-d predicate - confirm in the PTX ISA
+      "wgmma.mma_async.sync.aligned.m64n40k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9},"
+      "{%10, %11, %12, %13},"  // a00..a03
+      " %14,"  // desc_b
+      " p,   %16, %17;\n"  // predicate p, then scaleA (%16) and scaleB (%17)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %15 in a register; %16/%17 are "n" immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x40x32 TN F32+=E5M2*E4M3 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x40x32_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are read-write ("+f") asm operands
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[20];  // d00..d19
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate declared inside the PTX braces opened above
+      "setp.ne.b32 p, %22, 0;\n"  // p = (scale_D != 0); NOTE(review): presumably the wgmma scale-d predicate - confirm in the PTX ISA
+      "wgmma.mma_async.sync.aligned.m64n40k32.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"
+      " %20,"  // desc_a
+      " %21,"  // desc_b
+      " p,   %23, %24;\n"  // predicate p, then scaleA (%23) and scaleB (%24)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %22 in a register; %23/%24 are "n" immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x40x32 TN F32+=E5M2*E4M3 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x40x32_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are read-write ("+f") asm operands
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[20];  // d00..d19
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate declared inside the PTX braces opened above
+      "setp.ne.b32 p, %25, 0;\n"  // p = (scale_D != 0); NOTE(review): presumably the wgmma scale-d predicate - confirm in the PTX ISA
+      "wgmma.mma_async.sync.aligned.m64n40k32.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"
+      "{%20, %21, %22, %23},"  // a00..a03
+      " %24,"  // desc_b
+      " p,   %26, %27;\n"  // predicate p, then scaleA (%26) and scaleB (%27)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %25 in a register; %26/%27 are "n" immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x32 TN F16+=E5M2*E4M3 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x48x32_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are read-write ("+r") asm operands
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[12];  // d00..d11
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate declared inside the PTX braces opened above
+      "setp.ne.b32 p, %14, 0;\n"  // p = (scale_D != 0); NOTE(review): presumably the wgmma scale-d predicate - confirm in the PTX ISA
+      "wgmma.mma_async.sync.aligned.m64n48k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      " %12,"  // desc_a
+      " %13,"  // desc_b
+      " p,   %15, %16;\n"  // predicate p, then scaleA (%15) and scaleB (%16)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %14 in a register; %15/%16 are "n" immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x32 TN F16+=E5M2*E4M3 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x48x32_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are read-write ("+r") asm operands
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[12];  // d00..d11
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate declared inside the PTX braces opened above
+      "setp.ne.b32 p, %17, 0;\n"  // p = (scale_D != 0); NOTE(review): presumably the wgmma scale-d predicate - confirm in the PTX ISA
+      "wgmma.mma_async.sync.aligned.m64n48k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      "{%12, %13, %14, %15},"  // a00..a03
+      " %16,"  // desc_b
+      " p,   %18, %19;\n"  // predicate p, then scaleA (%18) and scaleB (%19)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %17 in a register; %18/%19 are "n" immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x32 TN F32+=E5M2*E4M3 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x48x32_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are read-write ("+f") asm operands
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[24];  // d00..d23
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate declared inside the PTX braces opened above
+      "setp.ne.b32 p, %26, 0;\n"  // p = (scale_D != 0); NOTE(review): presumably the wgmma scale-d predicate - confirm in the PTX ISA
+      "wgmma.mma_async.sync.aligned.m64n48k32.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      " %24,"  // desc_a
+      " %25,"  // desc_b
+      " p,   %27, %28;\n"  // predicate p, then scaleA (%27) and scaleB (%28)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %26 in a register; %27/%28 are "n" immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x32 TN F32+=E5M2*E4M3 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x48x32_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are read-write ("+f") asm operands
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[24];  // d00..d23
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate declared inside the PTX braces opened above
+      "setp.ne.b32 p, %29, 0;\n"  // p = (scale_D != 0); NOTE(review): presumably the wgmma scale-d predicate - confirm in the PTX ISA
+      "wgmma.mma_async.sync.aligned.m64n48k32.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      "{%24, %25, %26, %27},"  // a00..a03
+      " %28,"  // desc_b
+      " p,   %30, %31;\n"  // predicate p, then scaleA (%30) and scaleB (%31)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %29 in a register; %30/%31 are "n" immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x56x32 TN F16+=E5M2*E4M3 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x56x32_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are read-write ("+r") asm operands
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[14];  // d00..d13
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate declared inside the PTX braces opened above
+      "setp.ne.b32 p, %16, 0;\n"  // p = (scale_D != 0); NOTE(review): presumably the wgmma scale-d predicate - confirm in the PTX ISA
+      "wgmma.mma_async.sync.aligned.m64n56k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13},"
+      " %14,"  // desc_a
+      " %15,"  // desc_b
+      " p,   %17, %18;\n"  // predicate p, then scaleA (%17) and scaleB (%18)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %16 in a register; %17/%18 are "n" immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x56x32 TN F16+=E5M2*E4M3 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x56x32_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are read-write ("+r") asm operands
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[14];  // d00..d13
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate declared inside the PTX braces opened above
+      "setp.ne.b32 p, %19, 0;\n"  // p = (scale_D != 0); NOTE(review): presumably the wgmma scale-d predicate - confirm in the PTX ISA
+      "wgmma.mma_async.sync.aligned.m64n56k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13},"
+      "{%14, %15, %16, %17},"  // a00..a03
+      " %18,"  // desc_b
+      " p,   %20, %21;\n"  // predicate p, then scaleA (%20) and scaleB (%21)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %19 in a register; %20/%21 are "n" immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x56x32 TN F32+=E5M2*E4M3 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x56x32_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are read-write ("+f") asm operands
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[28];  // d00..d27
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate declared inside the PTX braces opened above
+      "setp.ne.b32 p, %30, 0;\n"  // p = (scale_D != 0); NOTE(review): presumably the wgmma scale-d predicate - confirm in the PTX ISA
+      "wgmma.mma_async.sync.aligned.m64n56k32.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"
+      " %28,"  // desc_a
+      " %29,"  // desc_b
+      " p,   %31, %32;\n"  // predicate p, then scaleA (%31) and scaleB (%32)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %30 in a register; %31/%32 are "n" immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x56x32 TN F32+=E5M2*E4M3 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor; 28 float accumulators are updated in place)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x56x32_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operands: fma() reads and writes d00..d27 (+f constraints)
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[28];  // d00..d27
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %33, 0;\n"  // %33 = int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n56k32.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"
+      "{%28, %29, %30, %31},"  // a00..a03
+      " %32,"  // desc_b
+      " p,   %34, %35;\n"  // %34 = scaleA, %35 = scaleB (compile-time immediates, n constraints)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x72x32 TN F16+=E5M2*E4M3 (SS form: A and B are both passed as 64-bit descriptors; 18 32-bit accumulator registers are updated in place)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x72x32_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operands: fma() reads and writes d00..d17 (+r constraints)
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[18];  // d00..d17; presumably two packed f16 values per register - TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %20, 0;\n"  // %20 = int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n72k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17},"
+      " %18,"  // desc_a
+      " %19,"  // desc_b
+      " p,   %21, %22;\n"  // %21 = scaleA, %22 = scaleB (compile-time immediates, n constraints)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x72x32 TN F16+=E5M2*E4M3 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor; 18 32-bit accumulator registers are updated in place)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x72x32_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operands: fma() reads and writes d00..d17 (+r constraints)
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[18];  // d00..d17; presumably two packed f16 values per register - TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %23, 0;\n"  // %23 = int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n72k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17},"
+      "{%18, %19, %20, %21},"  // a00..a03
+      " %22,"  // desc_b
+      " p,   %24, %25;\n"  // %24 = scaleA, %25 = scaleB (compile-time immediates, n constraints)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x72x32 TN F32+=E5M2*E4M3 (SS form: A and B are both passed as 64-bit descriptors; 36 float accumulators are updated in place)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x72x32_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operands: fma() reads and writes d00..d35 (+f constraints)
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[36];  // d00..d35
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %38, 0;\n"  // %38 = int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n72k32.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"
+      " %36,"  // desc_a
+      " %37,"  // desc_b
+      " p,   %39, %40;\n"  // %39 = scaleA, %40 = scaleB (compile-time immediates, n constraints)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x72x32 TN F32+=E5M2*E4M3 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor; 36 float accumulators are updated in place)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x72x32_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operands: fma() reads and writes d00..d35 (+f constraints)
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[36];  // d00..d35
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %41, 0;\n"  // %41 = int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n72k32.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"
+      "{%36, %37, %38, %39},"  // a00..a03
+      " %40,"  // desc_b
+      " p,   %42, %43;\n"  // %42 = scaleA, %43 = scaleB (compile-time immediates, n constraints)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x32 TN F16+=E5M2*E4M3 (SS form: A and B are both passed as 64-bit descriptors; 20 32-bit accumulator registers are updated in place)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x80x32_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operands: fma() reads and writes d00..d19 (+r constraints)
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[20];  // d00..d19; presumably two packed f16 values per register - TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %22, 0;\n"  // %22 = int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n80k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"
+      " %20,"  // desc_a
+      " %21,"  // desc_b
+      " p,   %23, %24;\n"  // %23 = scaleA, %24 = scaleB (compile-time immediates, n constraints)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x32 TN F16+=E5M2*E4M3 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor; 20 32-bit accumulator registers are updated in place)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x80x32_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operands: fma() reads and writes d00..d19 (+r constraints)
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[20];  // d00..d19; presumably two packed f16 values per register - TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %25, 0;\n"  // %25 = int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n80k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"
+      "{%20, %21, %22, %23},"  // a00..a03
+      " %24,"  // desc_b
+      " p,   %26, %27;\n"  // %26 = scaleA, %27 = scaleB (compile-time immediates, n constraints)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x32 TN F32+=E5M2*E4M3 (SS form: A and B are both passed as 64-bit descriptors; 40 float accumulators are updated in place)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x80x32_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operands: fma() reads and writes d00..d39 (+f constraints)
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[40];  // d00..d39
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %42, 0;\n"  // %42 = int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n80k32.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      " %40,"  // desc_a
+      " %41,"  // desc_b
+      " p,   %43, %44;\n"  // %43 = scaleA, %44 = scaleB (compile-time immediates, n constraints)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x32 TN F32+=E5M2*E4M3 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor; 40 float accumulators are updated in place)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x80x32_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operands: fma() reads and writes d00..d39 (+f constraints)
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[40];  // d00..d39
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %45, 0;\n"  // %45 = int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n80k32.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      "{%40, %41, %42, %43},"  // a00..a03
+      " %44,"  // desc_b
+      " p,   %46, %47;\n"  // %46 = scaleA, %47 = scaleB (compile-time immediates, n constraints)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x88x32 TN F16+=E5M2*E4M3 (SS form: A and B are both passed as 64-bit descriptors; 22 32-bit accumulator registers are updated in place)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x88x32_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operands: fma() reads and writes d00..d21 (+r constraints)
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[22];  // d00..d21; presumably two packed f16 values per register - TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %24, 0;\n"  // %24 = int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n88k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21},"
+      " %22,"  // desc_a
+      " %23,"  // desc_b
+      " p,   %25, %26;\n"  // %25 = scaleA, %26 = scaleB (compile-time immediates, n constraints)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x88x32 TN F16+=E5M2*E4M3 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor; 22 32-bit accumulator registers are updated in place)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x88x32_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operands: fma() reads and writes d00..d21 (+r constraints)
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[22];  // d00..d21; presumably two packed f16 values per register - TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %27, 0;\n"  // %27 = int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n88k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21},"
+      "{%22, %23, %24, %25},"  // a00..a03
+      " %26,"  // desc_b
+      " p,   %28, %29;\n"  // %28 = scaleA, %29 = scaleB (compile-time immediates, n constraints)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x88x32 TN F32+=E5M2*E4M3 (SS form: A and B are both passed as 64-bit descriptors; 44 float accumulators are updated in place)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x88x32_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operands: fma() reads and writes d00..d43 (+f constraints)
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[44];  // d00..d43
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %46, 0;\n"  // %46 = int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n88k32.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"
+      " %44,"  // desc_a
+      " %45,"  // desc_b
+      " p,   %47, %48;\n"  // %47 = scaleA, %48 = scaleB (compile-time immediates, n constraints)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x88x32 TN F32+=E5M2*E4M3 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor; 44 float accumulators are updated in place)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x88x32_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operands: fma() reads and writes d00..d43 (+f constraints)
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[44];  // d00..d43
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // tested against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %49, 0;\n"  // %49 = int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n88k32.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"
+      "{%44, %45, %46, %47},"  // a00..a03
+      " %48,"  // desc_b
+      " p,   %50, %51;\n"  // %50 = scaleA, %51 = scaleB (compile-time immediates, n constraints)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x104x32 TN F16+=E5M2*E4M3 (SS form: A and B each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as an immediate ("n") operand
+>
+struct MMA_64x104x32_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably D aliases the C registers (in-place accumulate) - confirm
+  using ARegisters = uint64_t[1];   // matches desc_a of fma()
+  using BRegisters = uint64_t[1];   // matches desc_b of fma()
+  using CRegisters = uint32_t[26];  // matches d00..d25 of fma(); presumably two packed f16 per register - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand, passed as a 64-bit value
+      uint64_t const& desc_b,  // B operand, passed as a 64-bit value
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d25: read-write accumulators
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); gets call-site line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %28, 0;\n"  // p = (scale_D != 0); %28 is the "r"(int32_t(scale_D)) input
+      "wgmma.mma_async.sync.aligned.m64n104k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%25: d00..d25
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25},"
+      " %26,"  // %26: desc_a
+      " %27,"  // %27: desc_b
+      " p,   %29, %30;\n"  // predicate p, then %29 = scaleA and %30 = scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25)
+      :  "l"(desc_a),  // input order here fixes the %26..%30 numbering above
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A path is not compiled in
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x104x32 TN F16+=E5M2*E4M3 (RS form: A in four 32-bit registers, B via one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as an immediate ("n") operand
+>
+struct MMA_64x104x32_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably D aliases the C registers (in-place accumulate) - confirm
+  using ARegisters = uint32_t[4];   // matches a00..a03 of fma()
+  using BRegisters = uint64_t[1];   // matches desc_b of fma()
+  using CRegisters = uint32_t[26];  // matches d00..d25 of fma(); presumably two packed f16 per register - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand, read-only
+      uint64_t const& desc_b,  // B operand, passed as a 64-bit value
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d25: read-write accumulators
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); gets call-site line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %31, 0;\n"  // p = (scale_D != 0); %31 is the "r"(int32_t(scale_D)) input
+      "wgmma.mma_async.sync.aligned.m64n104k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%25: d00..d25
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25},"
+      "{%26, %27, %28, %29},"  // %26..%29: a00..a03
+      " %30,"  // %30: desc_b
+      " p,   %32, %33;\n"  // predicate p, then %32 = scaleA and %33 = scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input order here fixes the %26..%33 numbering above
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A path is not compiled in
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x104x32 TN F32+=E5M2*E4M3 (SS form: A and B each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as an immediate ("n") operand
+>
+struct MMA_64x104x32_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably D aliases the C registers (in-place accumulate) - confirm
+  using ARegisters = uint64_t[1];   // matches desc_a of fma()
+  using BRegisters = uint64_t[1];   // matches desc_b of fma()
+  using CRegisters = float[52];     // matches d00..d51 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand, passed as a 64-bit value
+      uint64_t const& desc_b,  // B operand, passed as a 64-bit value
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d51: read-write accumulators
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); gets call-site line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %54, 0;\n"  // p = (scale_D != 0); %54 is the "r"(int32_t(scale_D)) input
+      "wgmma.mma_async.sync.aligned.m64n104k32.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%51: d00..d51
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"
+      " %52,"  // %52: desc_a
+      " %53,"  // %53: desc_b
+      " p,    %55,  %56;\n"  // predicate p, then %55 = scaleA and %56 = scaleB immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": each accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
+      :  "l"(desc_a),  // input order here fixes the %52..%56 numbering above
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A path is not compiled in
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x104x32 TN F32+=E5M2*E4M3 (RS form: A in four 32-bit registers, B via one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as an immediate ("n") operand
+>
+struct MMA_64x104x32_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably D aliases the C registers (in-place accumulate) - confirm
+  using ARegisters = uint32_t[4];   // matches a00..a03 of fma()
+  using BRegisters = uint64_t[1];   // matches desc_b of fma()
+  using CRegisters = float[52];     // matches d00..d51 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand, read-only
+      uint64_t const& desc_b,  // B operand, passed as a 64-bit value
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d51: read-write accumulators
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); gets call-site line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %57, 0;\n"  // p = (scale_D != 0); %57 is the "r"(int32_t(scale_D)) input
+      "wgmma.mma_async.sync.aligned.m64n104k32.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%51: d00..d51
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"
+      "{%52,  %53,  %54,  %55},"  // %52..%55: a00..a03
+      " %56,"  // %56: desc_b
+      " p,    %58,  %59;\n"  // predicate p, then %58 = scaleA and %59 = scaleB immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": each accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input order here fixes the %52..%59 numbering above
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A path is not compiled in
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x32 TN F16+=E5M2*E4M3 (SS form: A and B each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as an immediate ("n") operand
+>
+struct MMA_64x112x32_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably D aliases the C registers (in-place accumulate) - confirm
+  using ARegisters = uint64_t[1];   // matches desc_a of fma()
+  using BRegisters = uint64_t[1];   // matches desc_b of fma()
+  using CRegisters = uint32_t[28];  // matches d00..d27 of fma(); presumably two packed f16 per register - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand, passed as a 64-bit value
+      uint64_t const& desc_b,  // B operand, passed as a 64-bit value
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d27: read-write accumulators
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); gets call-site line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %30, 0;\n"  // p = (scale_D != 0); %30 is the "r"(int32_t(scale_D)) input
+      "wgmma.mma_async.sync.aligned.m64n112k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%27: d00..d27
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"
+      " %28,"  // %28: desc_a
+      " %29,"  // %29: desc_b
+      " p,   %31, %32;\n"  // predicate p, then %31 = scaleA and %32 = scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
+      :  "l"(desc_a),  // input order here fixes the %28..%32 numbering above
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A path is not compiled in
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x32 TN F16+=E5M2*E4M3 (RS form: A in four 32-bit registers, B via one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as an immediate ("n") operand
+>
+struct MMA_64x112x32_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably D aliases the C registers (in-place accumulate) - confirm
+  using ARegisters = uint32_t[4];   // matches a00..a03 of fma()
+  using BRegisters = uint64_t[1];   // matches desc_b of fma()
+  using CRegisters = uint32_t[28];  // matches d00..d27 of fma(); presumably two packed f16 per register - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand, read-only
+      uint64_t const& desc_b,  // B operand, passed as a 64-bit value
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d27: read-write accumulators
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); gets call-site line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %33, 0;\n"  // p = (scale_D != 0); %33 is the "r"(int32_t(scale_D)) input
+      "wgmma.mma_async.sync.aligned.m64n112k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%27: d00..d27
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"
+      "{%28, %29, %30, %31},"  // %28..%31: a00..a03
+      " %32,"  // %32: desc_b
+      " p,   %34, %35;\n"  // predicate p, then %34 = scaleA and %35 = scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input order here fixes the %28..%35 numbering above
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A path is not compiled in
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x32 TN F32+=E5M2*E4M3 (SS form: A and B each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as an immediate ("n") operand
+>
+struct MMA_64x112x32_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably D aliases the C registers (in-place accumulate) - confirm
+  using ARegisters = uint64_t[1];   // matches desc_a of fma()
+  using BRegisters = uint64_t[1];   // matches desc_b of fma()
+  using CRegisters = float[56];     // matches d00..d55 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand, passed as a 64-bit value
+      uint64_t const& desc_b,  // B operand, passed as a 64-bit value
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d55: read-write accumulators
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); gets call-site line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %58, 0;\n"  // p = (scale_D != 0); %58 is the "r"(int32_t(scale_D)) input
+      "wgmma.mma_async.sync.aligned.m64n112k32.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%55: d00..d55
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      " %56,"  // %56: desc_a
+      " %57,"  // %57: desc_b
+      " p,    %59,  %60;\n"  // predicate p, then %59 = scaleA and %60 = scaleB immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": each accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
+      :  "l"(desc_a),  // input order here fixes the %56..%60 numbering above
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A path is not compiled in
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x32 TN F32+=E5M2*E4M3 (RS form: A in four 32-bit registers, B via one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as an immediate ("n") operand
+>
+struct MMA_64x112x32_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably D aliases the C registers (in-place accumulate) - confirm
+  using ARegisters = uint32_t[4];   // matches a00..a03 of fma()
+  using BRegisters = uint64_t[1];   // matches desc_b of fma()
+  using CRegisters = float[56];     // matches d00..d55 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand, read-only
+      uint64_t const& desc_b,  // B operand, passed as a 64-bit value
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d55: read-write accumulators
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); gets call-site line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %61, 0;\n"  // p = (scale_D != 0); %61 is the "r"(int32_t(scale_D)) input
+      "wgmma.mma_async.sync.aligned.m64n112k32.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%55: d00..d55
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      "{%56,  %57,  %58,  %59},"  // %56..%59: a00..a03
+      " %60,"  // %60: desc_b
+      " p,    %62,  %63;\n"  // predicate p, then %62 = scaleA and %63 = scaleB immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": each accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input order here fixes the %56..%63 numbering above
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A path is not compiled in
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x120x32 TN F16+=E5M2*E4M3 (SS form: A and B each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as an immediate ("n") operand
+>
+struct MMA_64x120x32_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably D aliases the C registers (in-place accumulate) - confirm
+  using ARegisters = uint64_t[1];   // matches desc_a of fma()
+  using BRegisters = uint64_t[1];   // matches desc_b of fma()
+  using CRegisters = uint32_t[30];  // matches d00..d29 of fma(); presumably two packed f16 per register - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand, passed as a 64-bit value
+      uint64_t const& desc_b,  // B operand, passed as a 64-bit value
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d29: read-write accumulators
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); gets call-site line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %32, 0;\n"  // p = (scale_D != 0); %32 is the "r"(int32_t(scale_D)) input
+      "wgmma.mma_async.sync.aligned.m64n120k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%29: d00..d29
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29},"
+      " %30,"  // %30: desc_a
+      " %31,"  // %31: desc_b
+      " p,   %33, %34;\n"  // predicate p, then %33 = scaleA and %34 = scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29)
+      :  "l"(desc_a),  // input order here fixes the %30..%34 numbering above
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A path is not compiled in
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x120x32 TN F16+=E5M2*E4M3 (RS form: A in four 32-bit registers, B via one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the asm as an immediate ("n") operand
+>
+struct MMA_64x120x32_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably D aliases the C registers (in-place accumulate) - confirm
+  using ARegisters = uint32_t[4];   // matches a00..a03 of fma()
+  using BRegisters = uint64_t[1];   // matches desc_b of fma()
+  using CRegisters = uint32_t[30];  // matches d00..d29 of fma(); presumably two packed f16 per register - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand, read-only
+      uint64_t const& desc_b,  // B operand, passed as a 64-bit value
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d29: read-write accumulators
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); gets call-site line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %35, 0;\n"  // p = (scale_D != 0); %35 is the "r"(int32_t(scale_D)) input
+      "wgmma.mma_async.sync.aligned.m64n120k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%29: d00..d29
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29},"
+      "{%30, %31, %32, %33},"  // %30..%33: a00..a03
+      " %34,"  // %34: desc_b
+      " p,   %36, %37;\n"  // predicate p, then %36 = scaleA and %37 = scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input order here fixes the %30..%37 numbering above
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A path is not compiled in
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x120x32 TN F32+=E5M2*E4M3 (SS form: A and B both enter the PTX as 64-bit "l" operands desc_a/desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; bound as immediate operand %63
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value; bound as immediate operand %64
+>
+struct MMA_64x120x32_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // void here; the accumulators are bound read-write ("+f") inside fma()
+  using ARegisters = uint64_t[1];  // one 64-bit value for A (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit value for B (desc_b)
+  using CRegisters = float[60];  // 60 float accumulators, d00..d59
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand, bound as %60
+      uint64_t const& desc_b,  // B operand, bound as %61
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the call-site line and both 64-bit operands
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate, local to this PTX block
+      "setp.ne.b32 p, %62, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n120k32.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"
+      " %60,"  // desc_a
+      " %61,"  // desc_b
+      " p,    %63,  %64;\n"  // predicate p, then scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%59: accumulators, read-write
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
+      :  "l"(desc_a),  // %60
+         "l"(desc_b),  // %61
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %62 (register), %63 and %64 (compile-time immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled in only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x120x32 TN F32+=E5M2*E4M3 (RS form: A enters the PTX as four 32-bit registers a00..a03, B as the 64-bit operand desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; bound as immediate operand %66
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value; bound as immediate operand %67
+>
+struct MMA_64x120x32_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // void here; the accumulators are bound read-write ("+f") inside fma()
+  using ARegisters = uint32_t[4];  // four 32-bit values for A (a00..a03)
+  using BRegisters = uint64_t[1];  // one 64-bit value for B (desc_b)
+  using CRegisters = float[60];  // 60 float accumulators, d00..d59
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand, bound as %60..%63
+      uint64_t const& desc_b,  // B operand, bound as %64
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the call-site line and the 64-bit B operand
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate, local to this PTX block
+      "setp.ne.b32 p, %65, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n120k32.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"
+      "{%60,  %61,  %62,  %63},"  // a00..a03
+      " %64,"  // desc_b
+      " p,    %66,  %67;\n"  // predicate p, then scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%59: accumulators, read-write
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %60..%63
+         "l"(desc_b),  // %64
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %65 (register), %66 and %67 (compile-time immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled in only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x136x32 TN F16+=E5M2*E4M3 (SS form: A and B both enter the PTX as 64-bit "l" operands desc_a/desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; bound as immediate operand %37
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value; bound as immediate operand %38
+>
+struct MMA_64x136x32_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // void here; the accumulators are bound read-write ("+r") inside fma()
+  using ARegisters = uint64_t[1];  // one 64-bit value for A (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit value for B (desc_b)
+  using CRegisters = uint32_t[34];  // 34 x 32-bit accumulator registers d00..d33 (presumably packed f16 pairs -- TODO confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand, bound as %34
+      uint64_t const& desc_b,  // B operand, bound as %35
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the call-site line and both 64-bit operands
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate, local to this PTX block
+      "setp.ne.b32 p, %36, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n136k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33},"
+      " %34,"  // desc_a
+      " %35,"  // desc_b
+      " p,   %37, %38;\n"  // predicate p, then scaleA and scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%33: accumulators, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33)
+      :  "l"(desc_a),  // %34
+         "l"(desc_b),  // %35
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %36 (register), %37 and %38 (compile-time immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled in only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x136x32 TN F16+=E5M2*E4M3 (RS form: A enters the PTX as four 32-bit registers a00..a03, B as the 64-bit operand desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; bound as immediate operand %40
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value; bound as immediate operand %41
+>
+struct MMA_64x136x32_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // void here; the accumulators are bound read-write ("+r") inside fma()
+  using ARegisters = uint32_t[4];  // four 32-bit values for A (a00..a03)
+  using BRegisters = uint64_t[1];  // one 64-bit value for B (desc_b)
+  using CRegisters = uint32_t[34];  // 34 x 32-bit accumulator registers d00..d33 (presumably packed f16 pairs -- TODO confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand, bound as %34..%37
+      uint64_t const& desc_b,  // B operand, bound as %38
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the call-site line and the 64-bit B operand
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate, local to this PTX block
+      "setp.ne.b32 p, %39, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n136k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33},"
+      "{%34, %35, %36, %37},"  // a00..a03
+      " %38,"  // desc_b
+      " p,   %40, %41;\n"  // predicate p, then scaleA and scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%33: accumulators, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %34..%37
+         "l"(desc_b),  // %38
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %39 (register), %40 and %41 (compile-time immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled in only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x136x32 TN F32+=E5M2*E4M3 (SS form: A and B both enter the PTX as 64-bit "l" operands desc_a/desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; bound as immediate operand %71
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value; bound as immediate operand %72
+>
+struct MMA_64x136x32_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // void here; the accumulators are bound read-write ("+f") inside fma()
+  using ARegisters = uint64_t[1];  // one 64-bit value for A (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit value for B (desc_b)
+  using CRegisters = float[68];  // 68 float accumulators, d00..d67
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand, bound as %68
+      uint64_t const& desc_b,  // B operand, bound as %69
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the call-site line and both 64-bit operands
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate, local to this PTX block
+      "setp.ne.b32 p, %70, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n136k32.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67},"
+      " %68,"  // desc_a
+      " %69,"  // desc_b
+      " p,    %71,  %72;\n"  // predicate p, then scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%67: accumulators, read-write
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
+      :  "l"(desc_a),  // %68
+         "l"(desc_b),  // %69
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %70 (register), %71 and %72 (compile-time immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled in only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x136x32 TN F32+=E5M2*E4M3 (RS form: A enters the PTX as four 32-bit registers a00..a03, B as the 64-bit operand desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; bound as immediate operand %74
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value; bound as immediate operand %75
+>
+struct MMA_64x136x32_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // void here; the accumulators are bound read-write ("+f") inside fma()
+  using ARegisters = uint32_t[4];  // four 32-bit values for A (a00..a03)
+  using BRegisters = uint64_t[1];  // one 64-bit value for B (desc_b)
+  using CRegisters = float[68];  // 68 float accumulators, d00..d67
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand, bound as %68..%71
+      uint64_t const& desc_b,  // B operand, bound as %72
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the call-site line and the 64-bit B operand
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate, local to this PTX block
+      "setp.ne.b32 p, %73, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n136k32.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67},"
+      "{%68,  %69,  %70,  %71},"  // a00..a03
+      " %72,"  // desc_b
+      " p,    %74,  %75;\n"  // predicate p, then scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%67: accumulators, read-write
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %68..%71
+         "l"(desc_b),  // %72
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %73 (register), %74 and %75 (compile-time immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled in only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x32 TN F16+=E5M2*E4M3 (SS form: A and B both enter the PTX as 64-bit "l" operands desc_a/desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; bound as immediate operand %39
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value; bound as immediate operand %40
+>
+struct MMA_64x144x32_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // void here; the accumulators are bound read-write ("+r") inside fma()
+  using ARegisters = uint64_t[1];  // one 64-bit value for A (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit value for B (desc_b)
+  using CRegisters = uint32_t[36];  // 36 x 32-bit accumulator registers d00..d35 (presumably packed f16 pairs -- TODO confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand, bound as %36
+      uint64_t const& desc_b,  // B operand, bound as %37
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the call-site line and both 64-bit operands
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate, local to this PTX block
+      "setp.ne.b32 p, %38, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n144k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"
+      " %36,"  // desc_a
+      " %37,"  // desc_b
+      " p,   %39, %40;\n"  // predicate p, then scaleA and scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%35: accumulators, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
+      :  "l"(desc_a),  // %36
+         "l"(desc_b),  // %37
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %38 (register), %39 and %40 (compile-time immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled in only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x32 TN F16+=E5M2*E4M3 (RS form: A enters the PTX as four 32-bit registers a00..a03, B as the 64-bit operand desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; bound as immediate operand %42
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value; bound as immediate operand %43
+>
+struct MMA_64x144x32_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // void here; the accumulators are bound read-write ("+r") inside fma()
+  using ARegisters = uint32_t[4];  // four 32-bit values for A (a00..a03)
+  using BRegisters = uint64_t[1];  // one 64-bit value for B (desc_b)
+  using CRegisters = uint32_t[36];  // 36 x 32-bit accumulator registers d00..d35 (presumably packed f16 pairs -- TODO confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand, bound as %36..%39
+      uint64_t const& desc_b,  // B operand, bound as %40
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the call-site line and the 64-bit B operand
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate, local to this PTX block
+      "setp.ne.b32 p, %41, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n144k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"
+      "{%36, %37, %38, %39},"  // a00..a03
+      " %40,"  // desc_b
+      " p,   %42, %43;\n"  // predicate p, then scaleA and scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%35: accumulators, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %36..%39
+         "l"(desc_b),  // %40
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %41 (register), %42 and %43 (compile-time immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled in only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x32 TN F32+=E5M2*E4M3 (SS form: A and B both enter the PTX as 64-bit "l" operands desc_a/desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; bound as immediate operand %75
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value; bound as immediate operand %76
+>
+struct MMA_64x144x32_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // void here; the accumulators are bound read-write ("+f") inside fma()
+  using ARegisters = uint64_t[1];  // one 64-bit value for A (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit value for B (desc_b)
+  using CRegisters = float[72];  // 72 float accumulators, d00..d71
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand, bound as %72
+      uint64_t const& desc_b,  // B operand, bound as %73
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the call-site line and both 64-bit operands
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate, local to this PTX block
+      "setp.ne.b32 p, %74, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n144k32.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      " %72,"  // desc_a
+      " %73,"  // desc_b
+      " p,    %75,  %76;\n"  // predicate p, then scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%71: accumulators, read-write
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
+      :  "l"(desc_a),  // %72
+         "l"(desc_b),  // %73
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %74 (register), %75 and %76 (compile-time immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled in only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x32 TN F32+=E5M2*E4M3 (RS form: A in four 32-bit registers, B through a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as a compile-time immediate ("n" operand)
+>
+struct MMA_64x144x32_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;         // no separate D list; d00..d71 of fma() are read-write ("+f") accumulators
+  using ARegisters = uint32_t[4];  // a00..a03 of fma()
+  using BRegisters = uint64_t[1];  // desc_b of fma()
+  using CRegisters = float[72];    // d00..d71 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand words
+      uint64_t const& desc_b,  // B operand descriptor
+      float         & d00, float         & d01, float         & d02, float         & d03,  // accumulators, updated in place
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): helper defined outside this chunk; presumably sync logging - confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate, local to this asm scope
+      "setp.ne.b32 p, %77, 0;\n"  // p = (scale_D != 0); %77 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n144k32.f32.e5m2.e4m3 "  // shape m64n144k32; suffixes follow the F32+=E5M2*E4M3 naming
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%71: accumulators d00..d71
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      "{%72,  %73,  %74,  %75},"  // a00..a03
+      " %76,"  // desc_b
+      " p,    %78,  %79;\n"  // predicate p, then immediates scaleA (%78) and scaleB (%79)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0..%71, read and written by the instruction
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs %72..%75
+         "l"(desc_b),  // input %76
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %77 in a register; %78 and %79 as immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted, only the invalid-control-path macro below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x152x32 TN F16+=E5M2*E4M3 (SS form: A and B each through a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as a compile-time immediate ("n" operand)
+>
+struct MMA_64x152x32_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;          // no separate D list; d00..d37 of fma() are read-write ("+r") accumulators
+  using ARegisters = uint64_t[1];   // desc_a of fma()
+  using BRegisters = uint64_t[1];   // desc_b of fma()
+  using CRegisters = uint32_t[38];  // d00..d37; NOTE(review): presumably two packed f16 values per 32-bit register - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand descriptor
+      uint64_t const& desc_b,  // B operand descriptor
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // accumulators, updated in place
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): helper defined outside this chunk; presumably sync logging - confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate, local to this asm scope
+      "setp.ne.b32 p, %40, 0;\n"  // p = (scale_D != 0); %40 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n152k32.f16.e5m2.e4m3 "  // shape m64n152k32; suffixes follow the F16+=E5M2*E4M3 naming
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%37: accumulators d00..d37
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37},"
+      " %38,"  // desc_a
+      " %39,"  // desc_b
+      " p,   %41, %42;\n"  // predicate p, then immediates scaleA (%41) and scaleB (%42)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%37, read and written by the instruction
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37)
+      :  "l"(desc_a),  // input %38
+         "l"(desc_b),  // input %39
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %40 in a register; %41 and %42 as immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted, only the invalid-control-path macro below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x152x32 TN F16+=E5M2*E4M3 (RS form: A in four 32-bit registers, B through a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as a compile-time immediate ("n" operand)
+>
+struct MMA_64x152x32_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;          // no separate D list; d00..d37 of fma() are read-write ("+r") accumulators
+  using ARegisters = uint32_t[4];   // a00..a03 of fma()
+  using BRegisters = uint64_t[1];   // desc_b of fma()
+  using CRegisters = uint32_t[38];  // d00..d37; NOTE(review): presumably two packed f16 values per 32-bit register - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand words
+      uint64_t const& desc_b,  // B operand descriptor
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // accumulators, updated in place
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): helper defined outside this chunk; presumably sync logging - confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate, local to this asm scope
+      "setp.ne.b32 p, %43, 0;\n"  // p = (scale_D != 0); %43 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n152k32.f16.e5m2.e4m3 "  // shape m64n152k32; suffixes follow the F16+=E5M2*E4M3 naming
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%37: accumulators d00..d37
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37},"
+      "{%38, %39, %40, %41},"  // a00..a03
+      " %42,"  // desc_b
+      " p,   %44, %45;\n"  // predicate p, then immediates scaleA (%44) and scaleB (%45)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%37, read and written by the instruction
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs %38..%41
+         "l"(desc_b),  // input %42
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %43 in a register; %44 and %45 as immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted, only the invalid-control-path macro below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x152x32 TN F32+=E5M2*E4M3 (SS form: A and B each through a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as a compile-time immediate ("n" operand)
+>
+struct MMA_64x152x32_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;         // no separate D list; d00..d75 of fma() are read-write ("+f") accumulators
+  using ARegisters = uint64_t[1];  // desc_a of fma()
+  using BRegisters = uint64_t[1];  // desc_b of fma()
+  using CRegisters = float[76];    // d00..d75 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand descriptor
+      uint64_t const& desc_b,  // B operand descriptor
+      float         & d00, float         & d01, float         & d02, float         & d03,  // accumulators, updated in place
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): helper defined outside this chunk; presumably sync logging - confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate, local to this asm scope
+      "setp.ne.b32 p, %78, 0;\n"  // p = (scale_D != 0); %78 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n152k32.f32.e5m2.e4m3 "  // shape m64n152k32; suffixes follow the F32+=E5M2*E4M3 naming
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%75: accumulators d00..d75
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75},"
+      " %76,"  // desc_a
+      " %77,"  // desc_b
+      " p,    %79,  %80;\n"  // predicate p, then immediates scaleA (%79) and scaleB (%80)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0..%75, read and written by the instruction
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
+      :  "l"(desc_a),  // input %76
+         "l"(desc_b),  // input %77
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %78 in a register; %79 and %80 as immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted, only the invalid-control-path macro below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x152x32 TN F32+=E5M2*E4M3 (RS form: A in four 32-bit registers, B through a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as a compile-time immediate ("n" operand)
+>
+struct MMA_64x152x32_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;         // no separate D list; d00..d75 of fma() are read-write ("+f") accumulators
+  using ARegisters = uint32_t[4];  // a00..a03 of fma()
+  using BRegisters = uint64_t[1];  // desc_b of fma()
+  using CRegisters = float[76];    // d00..d75 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand words
+      uint64_t const& desc_b,  // B operand descriptor
+      float         & d00, float         & d01, float         & d02, float         & d03,  // accumulators, updated in place
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): helper defined outside this chunk; presumably sync logging - confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate, local to this asm scope
+      "setp.ne.b32 p, %81, 0;\n"  // p = (scale_D != 0); %81 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n152k32.f32.e5m2.e4m3 "  // shape m64n152k32; suffixes follow the F32+=E5M2*E4M3 naming
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%75: accumulators d00..d75
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75},"
+      "{%76,  %77,  %78,  %79},"  // a00..a03
+      " %80,"  // desc_b
+      " p,    %82,  %83;\n"  // predicate p, then immediates scaleA (%82) and scaleB (%83)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0..%75, read and written by the instruction
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs %76..%79
+         "l"(desc_b),  // input %80
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %81 in a register; %82 and %83 as immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted, only the invalid-control-path macro below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x32 TN F16+=E5M2*E4M3 (SS form: A and B each through a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as a compile-time immediate ("n" operand)
+>
+struct MMA_64x160x32_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;          // no separate D list; d00..d39 of fma() are read-write ("+r") accumulators
+  using ARegisters = uint64_t[1];   // desc_a of fma()
+  using BRegisters = uint64_t[1];   // desc_b of fma()
+  using CRegisters = uint32_t[40];  // d00..d39; NOTE(review): presumably two packed f16 values per 32-bit register - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand descriptor
+      uint64_t const& desc_b,  // B operand descriptor
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // accumulators, updated in place
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): helper defined outside this chunk; presumably sync logging - confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate, local to this asm scope
+      "setp.ne.b32 p, %42, 0;\n"  // p = (scale_D != 0); %42 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n160k32.f16.e5m2.e4m3 "  // shape m64n160k32; suffixes follow the F16+=E5M2*E4M3 naming
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%39: accumulators d00..d39
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      " %40,"  // desc_a
+      " %41,"  // desc_b
+      " p,   %43, %44;\n"  // predicate p, then immediates scaleA (%43) and scaleB (%44)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%39, read and written by the instruction
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "l"(desc_a),  // input %40
+         "l"(desc_b),  // input %41
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %42 in a register; %43 and %44 as immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted, only the invalid-control-path macro below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x32 TN F16+=E5M2*E4M3 (RS form: A in four 32-bit registers, B through a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as a compile-time immediate ("n" operand)
+>
+struct MMA_64x160x32_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;          // no separate D list; d00..d39 of fma() are read-write ("+r") accumulators
+  using ARegisters = uint32_t[4];   // a00..a03 of fma()
+  using BRegisters = uint64_t[1];   // desc_b of fma()
+  using CRegisters = uint32_t[40];  // d00..d39; NOTE(review): presumably two packed f16 values per 32-bit register - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand words
+      uint64_t const& desc_b,  // B operand descriptor
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // accumulators, updated in place
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): helper defined outside this chunk; presumably sync logging - confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate, local to this asm scope
+      "setp.ne.b32 p, %45, 0;\n"  // p = (scale_D != 0); %45 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n160k32.f16.e5m2.e4m3 "  // shape m64n160k32; suffixes follow the F16+=E5M2*E4M3 naming
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%39: accumulators d00..d39
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      "{%40, %41, %42, %43},"  // a00..a03
+      " %44,"  // desc_b
+      " p,   %46, %47;\n"  // predicate p, then immediates scaleA (%46) and scaleB (%47)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%39, read and written by the instruction
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs %40..%43
+         "l"(desc_b),  // input %44
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %45 in a register; %46 and %47 as immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted, only the invalid-control-path macro below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x32 TN F32+=E5M2*E4M3 (SS form: A and B each through a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as a compile-time immediate ("n" operand)
+>
+struct MMA_64x160x32_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;         // no separate D list; d00..d79 of fma() are read-write ("+f") accumulators
+  using ARegisters = uint64_t[1];  // desc_a of fma()
+  using BRegisters = uint64_t[1];  // desc_b of fma()
+  using CRegisters = float[80];    // d00..d79 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand descriptor
+      uint64_t const& desc_b,  // B operand descriptor
+      float         & d00, float         & d01, float         & d02, float         & d03,  // accumulators, updated in place
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): helper defined outside this chunk; presumably sync logging - confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate, local to this asm scope
+      "setp.ne.b32 p, %82, 0;\n"  // p = (scale_D != 0); %82 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n160k32.f32.e5m2.e4m3 "  // shape m64n160k32; suffixes follow the F32+=E5M2*E4M3 naming
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%79: accumulators d00..d79
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      " %80,"  // desc_a
+      " %81,"  // desc_b
+      " p,    %83,  %84;\n"  // predicate p, then immediates scaleA (%83) and scaleB (%84)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0..%79, read and written by the instruction
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
+      :  "l"(desc_a),  // input %80
+         "l"(desc_b),  // input %81
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %82 in a register; %83 and %84 as immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted, only the invalid-control-path macro below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x32 TN F32+=E5M2*E4M3 (RS form: A in four 32-bit registers, B through a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate ("n" operand)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as a compile-time immediate ("n" operand)
+>
+struct MMA_64x160x32_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;         // no separate D list; d00..d79 of fma() are read-write ("+f") accumulators
+  using ARegisters = uint32_t[4];  // a00..a03 of fma()
+  using BRegisters = uint64_t[1];  // desc_b of fma()
+  using CRegisters = float[80];    // d00..d79 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand words
+      uint64_t const& desc_b,  // B operand descriptor
+      float         & d00, float         & d01, float         & d02, float         & d03,  // accumulators, updated in place
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): helper defined outside this chunk; presumably sync logging - confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate, local to this asm scope
+      "setp.ne.b32 p, %85, 0;\n"  // p = (scale_D != 0); %85 is the scale_D input
+      "wgmma.mma_async.sync.aligned.m64n160k32.f32.e5m2.e4m3 "  // shape m64n160k32; suffixes follow the F32+=E5M2*E4M3 naming
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%79: accumulators d00..d79
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      "{%80,  %81,  %82,  %83},"  // a00..a03
+      " %84,"  // desc_b
+      " p,    %86,  %87;\n"  // predicate p, then immediates scaleA (%86) and scaleB (%87)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0..%79, read and written by the instruction
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs %80..%83
+         "l"(desc_b),  // input %84
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %85 in a register; %86 and %87 as immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted, only the invalid-control-path macro below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x168x32 TN F16+=E5M2*E4M3 -- SS variant: A and B are each passed as one 64-bit descriptor (desc_a, desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; reaches the asm as immediate operand %45
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value; reaches the asm as immediate operand %46
+>
+struct MMA_64x168x32_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D arguments; the accumulators are updated in place
+  using ARegisters = uint64_t[1];  // one 64-bit value: desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = uint32_t[42];  // 42 accumulator registers d00..d41 (presumably two packed f16 each -- TODO confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; only compared against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // forwards this line number and both descriptors to synclog
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %44, 0;\n"  // p = (scale_D != 0); %44 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n168k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41},"  // accumulators d00..d41
+      " %42,"  // desc_a
+      " %43,"  // desc_b
+      " p,   %45, %46;\n"  // predicate p, then the immediates scaleA and scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%41: read-write ("+r") accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41)
+      :  "l"(desc_a),  // %42
+         "l"(desc_b),  // %43
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %44, %45, %46
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x168x32 TN F16+=E5M2*E4M3 -- RS variant: A is passed in four 32-bit registers (a00..a03), B as one 64-bit descriptor (desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; reaches the asm as immediate operand %48
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value; reaches the asm as immediate operand %49
+>
+struct MMA_64x168x32_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D arguments; the accumulators are updated in place
+  using ARegisters = uint32_t[4];  // four 32-bit values: a00..a03
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = uint32_t[42];  // 42 accumulator registers d00..d41 (presumably two packed f16 each -- TODO confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; only compared against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // forwards this line number and desc_b to synclog
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %47, 0;\n"  // p = (scale_D != 0); %47 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n168k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41},"  // accumulators d00..d41
+      "{%42, %43, %44, %45},"  // a00..a03
+      " %46,"  // desc_b
+      " p,   %48, %49;\n"  // predicate p, then the immediates scaleA and scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%41: read-write ("+r") accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %42..%45
+         "l"(desc_b),  // %46
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %47, %48, %49
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x168x32 TN F32+=E5M2*E4M3 -- SS variant: A and B are each passed as one 64-bit descriptor (desc_a, desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; reaches the asm as immediate operand %87
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value; reaches the asm as immediate operand %88
+>
+struct MMA_64x168x32_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D arguments; the accumulators are updated in place
+  using ARegisters = uint64_t[1];  // one 64-bit value: desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = float[84];  // 84 float accumulators d00..d83
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; only compared against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // forwards this line number and both descriptors to synclog
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %86, 0;\n"  // p = (scale_D != 0); %86 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n168k32.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83},"  // accumulators d00..d83
+      " %84,"  // desc_a
+      " %85,"  // desc_b
+      " p,    %87,  %88;\n"  // predicate p, then the immediates scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands %0..%83: read-write ("+f") float accumulators
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
+      :  "l"(desc_a),  // %84
+         "l"(desc_b),  // %85
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %86, %87, %88
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x168x32 TN F32+=E5M2*E4M3 -- RS variant: A is passed in four 32-bit registers (a00..a03), B as one 64-bit descriptor (desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; reaches the asm as immediate operand %90
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value; reaches the asm as immediate operand %91
+>
+struct MMA_64x168x32_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D arguments; the accumulators are updated in place
+  using ARegisters = uint32_t[4];  // four 32-bit values: a00..a03
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = float[84];  // 84 float accumulators d00..d83
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; only compared against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // forwards this line number and desc_b to synclog
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %89, 0;\n"  // p = (scale_D != 0); %89 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n168k32.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83},"  // accumulators d00..d83
+      "{%84,  %85,  %86,  %87},"  // a00..a03
+      " %88,"  // desc_b
+      " p,    %90,  %91;\n"  // predicate p, then the immediates scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands %0..%83: read-write ("+f") float accumulators
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %84..%87
+         "l"(desc_b),  // %88
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %89, %90, %91
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x32 TN F16+=E5M2*E4M3 -- SS variant: A and B are each passed as one 64-bit descriptor (desc_a, desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; reaches the asm as immediate operand %47
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value; reaches the asm as immediate operand %48
+>
+struct MMA_64x176x32_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D arguments; the accumulators are updated in place
+  using ARegisters = uint64_t[1];  // one 64-bit value: desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = uint32_t[44];  // 44 accumulator registers d00..d43 (presumably two packed f16 each -- TODO confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; only compared against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // forwards this line number and both descriptors to synclog
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %46, 0;\n"  // p = (scale_D != 0); %46 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n176k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"  // accumulators d00..d43
+      " %44,"  // desc_a
+      " %45,"  // desc_b
+      " p,   %47, %48;\n"  // predicate p, then the immediates scaleA and scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%43: read-write ("+r") accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
+      :  "l"(desc_a),  // %44
+         "l"(desc_b),  // %45
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %46, %47, %48
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x32 TN F16+=E5M2*E4M3 -- RS variant: A is passed in four 32-bit registers (a00..a03), B as one 64-bit descriptor (desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; reaches the asm as immediate operand %50
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value; reaches the asm as immediate operand %51
+>
+struct MMA_64x176x32_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D arguments; the accumulators are updated in place
+  using ARegisters = uint32_t[4];  // four 32-bit values: a00..a03
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = uint32_t[44];  // 44 accumulator registers d00..d43 (presumably two packed f16 each -- TODO confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; only compared against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // forwards this line number and desc_b to synclog
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %49, 0;\n"  // p = (scale_D != 0); %49 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n176k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"  // accumulators d00..d43
+      "{%44, %45, %46, %47},"  // a00..a03
+      " %48,"  // desc_b
+      " p,   %50, %51;\n"  // predicate p, then the immediates scaleA and scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%43: read-write ("+r") accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %44..%47
+         "l"(desc_b),  // %48
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %49, %50, %51
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x32 TN F32+=E5M2*E4M3 -- SS variant: A and B are each passed as one 64-bit descriptor (desc_a, desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; reaches the asm as immediate operand %91
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value; reaches the asm as immediate operand %92
+>
+struct MMA_64x176x32_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D arguments; the accumulators are updated in place
+  using ARegisters = uint64_t[1];  // one 64-bit value: desc_a
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = float[88];  // 88 float accumulators d00..d87
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; only compared against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // forwards this line number and both descriptors to synclog
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %90, 0;\n"  // p = (scale_D != 0); %90 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n176k32.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"  // accumulators d00..d87
+      " %88,"  // desc_a
+      " %89,"  // desc_b
+      " p,    %91,  %92;\n"  // predicate p, then the immediates scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands %0..%87: read-write ("+f") float accumulators
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
+      :  "l"(desc_a),  // %88
+         "l"(desc_b),  // %89
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %90, %91, %92
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x32 TN F32+=E5M2*E4M3 -- RS variant: A is passed in four 32-bit registers (a00..a03), B as one 64-bit descriptor (desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; reaches the asm as immediate operand %94
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value; reaches the asm as immediate operand %95
+>
+struct MMA_64x176x32_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D arguments; the accumulators are updated in place
+  using ARegisters = uint32_t[4];  // four 32-bit values: a00..a03
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = float[88];  // 88 float accumulators d00..d87
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; only compared against 0 in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // forwards this line number and desc_b to synclog
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %93, 0;\n"  // p = (scale_D != 0); %93 is the int32_t(scale_D) input
+      "wgmma.mma_async.sync.aligned.m64n176k32.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"  // accumulators d00..d87
+      "{%88,  %89,  %90,  %91},"  // a00..a03
+      " %92,"  // desc_b
+      " p,    %94,  %95;\n"  // predicate p, then the immediates scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands %0..%87: read-write ("+f") float accumulators
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %88..%91
+         "l"(desc_b),  // %92
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %93, %94, %95
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x184x32 TN F16+=E5M2*E4M3 -- SS form: A and B are each passed as one 64-bit descriptor; wraps wgmma m64n184k32.f16.e5m2.e4m3
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x184x32_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operands: the accumulators are bound read-write ("+r") in fma()
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor (desc_a)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[46];  // 46 32-bit accumulator registers (d00..d45)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hook defined elsewhere in cutlass::arch (not visible here)
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %48, 0;\n"  // p = (scale_D != 0); %48 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n184k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45},"  // %0..%45 = accumulators d00..d45
+      " %46,"  // desc_a
+      " %47,"  // desc_b
+      " p,   %49, %50;\n"  // predicate p, then immediates scaleA (%49) and scaleB (%50)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "n": template constants passed as immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x184x32 TN F16+=E5M2*E4M3 -- RS form: A is passed as four 32-bit registers, B as one 64-bit descriptor; wraps wgmma m64n184k32.f16.e5m2.e4m3
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x184x32_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operands: the accumulators are bound read-write ("+r") in fma()
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[46];  // 46 32-bit accumulator registers (d00..d45)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hook defined elsewhere in cutlass::arch (not visible here)
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %51, 0;\n"  // p = (scale_D != 0); %51 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n184k32.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45},"  // %0..%45 = accumulators d00..d45
+      "{%46, %47, %48, %49},"  // a00..a03
+      " %50,"  // desc_b
+      " p,   %52, %53;\n"  // predicate p, then immediates scaleA (%52) and scaleB (%53)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "n": template constants passed as immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x184x32 TN F32+=E5M2*E4M3 -- SS form: A and B are each passed as one 64-bit descriptor; wraps wgmma m64n184k32.f32.e5m2.e4m3
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x184x32_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operands: the accumulators are bound read-write ("+f") in fma()
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor (desc_a)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = float[92];  // 92 float accumulator registers (d00..d91)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hook defined elsewhere in cutlass::arch (not visible here)
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %94, 0;\n"  // p = (scale_D != 0); %94 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n184k32.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91},"  // %0..%91 = accumulators d00..d91
+      " %92,"  // desc_a
+      " %93,"  // desc_b
+      " p,    %95,  %96;\n"  // predicate p, then immediates scaleA (%95) and scaleB (%96)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "n": template constants passed as immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x184x32 TN F32+=E5M2*E4M3 -- RS form: A is passed as four 32-bit registers, B as one 64-bit descriptor; wraps wgmma m64n184k32.f32.e5m2.e4m3
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x184x32_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operands: the accumulators are bound read-write ("+f") in fma()
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = float[92];  // 92 float accumulator registers (d00..d91)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hook defined elsewhere in cutlass::arch (not visible here)
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %97, 0;\n"  // p = (scale_D != 0); %97 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n184k32.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91},"  // %0..%91 = accumulators d00..d91
+      "{%92,  %93,  %94,  %95},"  // a00..a03
+      " %96,"  // desc_b
+      " p,    %98,  %99;\n"  // predicate p, then immediates scaleA (%98) and scaleB (%99)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "n": template constants passed as immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x200x32 TN F16+=E5M2*E4M3 -- SS form: A and B are each passed as one 64-bit descriptor; wraps wgmma m64n200k32.f16.e5m2.e4m3
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x200x32_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operands: the accumulators are bound read-write ("+r") in fma()
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor (desc_a)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[50];  // 50 32-bit accumulator registers (d00..d49)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hook defined elsewhere in cutlass::arch (not visible here)
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %52, 0;\n"  // p = (scale_D != 0); %52 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n200k32.f16.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49},"  // %0..%49 = accumulators d00..d49
+      " %50,"  // desc_a
+      " %51,"  // desc_b
+      " p,    %53,  %54;\n"  // predicate p, then immediates scaleA (%53) and scaleB (%54)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "n": template constants passed as immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x200x32 TN F16+=E5M2*E4M3 -- RS form: A is passed as four 32-bit registers, B as one 64-bit descriptor; wraps wgmma m64n200k32.f16.e5m2.e4m3
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x200x32_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operands: the accumulators are bound read-write ("+r") in fma()
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[50];  // 50 32-bit accumulator registers (d00..d49)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hook defined elsewhere in cutlass::arch (not visible here)
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %55, 0;\n"  // p = (scale_D != 0); %55 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n200k32.f16.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49},"  // %0..%49 = accumulators d00..d49
+      "{%50,  %51,  %52,  %53},"  // a00..a03
+      " %54,"  // desc_b
+      " p,    %56,  %57;\n"  // predicate p, then immediates scaleA (%56) and scaleB (%57)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "n": template constants passed as immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x200x32 TN F32+=E5M2*E4M3 -- SS form: A and B are each passed as one 64-bit descriptor; wraps wgmma m64n200k32.f32.e5m2.e4m3
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x200x32_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operands: the accumulators are bound read-write ("+f") in fma()
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor (desc_a)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = float[100];  // 100 float accumulator registers (d000..d099)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hook defined elsewhere in cutlass::arch (not visible here)
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %102, 0;\n"  // p = (scale_D != 0); %102 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n200k32.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99},"  // %0..%99 = accumulators d000..d099
+      " %100,"  // desc_a
+      " %101,"  // desc_b
+      " p,    %103, %104;\n"  // predicate p, then immediates scaleA (%103) and scaleB (%104)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "n": template constants passed as immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x200x32 TN F32+=E5M2*E4M3 -- RS form: A is passed as four 32-bit register operands, B as one 64-bit descriptor operand.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as immediate asm operand %106
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as immediate asm operand %107
+>
+struct MMA_64x200x32_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably void because the accumulators are updated in place -- confirm
+  using ARegisters = uint32_t[4];   // mirrors fma() arguments a000..a003
+  using BRegisters = uint64_t[1];   // mirrors fma() argument desc_b
+  using CRegisters = float[100];    // mirrors fma() arguments d000..d099
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sync.aligned.m64n200k32.f32.e5m2.e4m3; d000..d099 are read-write
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A operand (asm inputs %100..%103)
+      uint64_t const& desc_b,  // B operand (asm input %104); presumably a shared-memory matrix descriptor -- confirm
+      float         & d000, float         & d001, float         & d002, float         & d003,  // accumulators (asm operands %0..%99)
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form the predicate given to wgmma
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %105, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n200k32.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99},"
+      "{%100, %101, %102, %103},"  // a000..a003
+      " %104,"  // desc_b
+      " p,    %106, %107;\n"  // predicate from scale_D, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // outputs %0..%99: "+f" binds each accumulator as read-write
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // inputs %100..%103
+         "l"(desc_b),  // input %104
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %105 (register), %106 and %107 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x32 TN F16+=E5M2*E4M3 -- SS form: A and B are each passed as one 64-bit descriptor operand.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as immediate asm operand %55
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as immediate asm operand %56
+>
+struct MMA_64x208x32_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably void because the accumulators are updated in place -- confirm
+  using ARegisters = uint64_t[1];   // mirrors fma() argument desc_a
+  using BRegisters = uint64_t[1];   // mirrors fma() argument desc_b
+  using CRegisters = uint32_t[52];  // mirrors fma() arguments d00..d51; presumably two packed f16 values per register -- confirm
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sync.aligned.m64n208k32.f16.e5m2.e4m3; d00..d51 are read-write
+  fma(uint64_t const& desc_a,  // A operand (asm input %52); presumably a shared-memory matrix descriptor -- confirm
+      uint64_t const& desc_b,  // B operand (asm input %53); presumably a shared-memory matrix descriptor -- confirm
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // accumulators (asm operands %0..%51)
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form the predicate given to wgmma
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %54, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n208k32.f16.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"
+      " %52,"  // desc_a
+      " %53,"  // desc_b
+      " p,    %55,  %56;\n"  // predicate from scale_D, then the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%51: "+r" binds each accumulator as read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
+      :  "l"(desc_a),  // input %52
+         "l"(desc_b),  // input %53
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %54 (register), %55 and %56 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x32 TN F16+=E5M2*E4M3 -- RS form: A is passed as four 32-bit register operands, B as one 64-bit descriptor operand.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as immediate asm operand %58
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as immediate asm operand %59
+>
+struct MMA_64x208x32_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably void because the accumulators are updated in place -- confirm
+  using ARegisters = uint32_t[4];   // mirrors fma() arguments a00..a03
+  using BRegisters = uint64_t[1];   // mirrors fma() argument desc_b
+  using CRegisters = uint32_t[52];  // mirrors fma() arguments d00..d51; presumably two packed f16 values per register -- confirm
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sync.aligned.m64n208k32.f16.e5m2.e4m3; d00..d51 are read-write
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand (asm inputs %52..%55)
+      uint64_t const& desc_b,  // B operand (asm input %56); presumably a shared-memory matrix descriptor -- confirm
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // accumulators (asm operands %0..%51)
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form the predicate given to wgmma
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %57, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n208k32.f16.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"
+      "{%52,  %53,  %54,  %55},"  // a00..a03
+      " %56,"  // desc_b
+      " p,    %58,  %59;\n"  // predicate from scale_D, then the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%51: "+r" binds each accumulator as read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs %52..%55
+         "l"(desc_b),  // input %56
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %57 (register), %58 and %59 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x32 TN F32+=E5M2*E4M3 -- SS form: A and B are each passed as one 64-bit descriptor operand.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as immediate asm operand %107
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as immediate asm operand %108
+>
+struct MMA_64x208x32_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably void because the accumulators are updated in place -- confirm
+  using ARegisters = uint64_t[1];   // mirrors fma() argument desc_a
+  using BRegisters = uint64_t[1];   // mirrors fma() argument desc_b
+  using CRegisters = float[104];    // mirrors fma() arguments d000..d103
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sync.aligned.m64n208k32.f32.e5m2.e4m3; d000..d103 are read-write
+  fma(uint64_t const& desc_a,  // A operand (asm input %104); presumably a shared-memory matrix descriptor -- confirm
+      uint64_t const& desc_b,  // B operand (asm input %105); presumably a shared-memory matrix descriptor -- confirm
+      float         & d000, float         & d001, float         & d002, float         & d003,  // accumulators (asm operands %0..%103)
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form the predicate given to wgmma
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %106, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n208k32.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      " %104,"  // desc_a
+      " %105,"  // desc_b
+      " p,    %107, %108;\n"  // predicate from scale_D, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // outputs %0..%103: "+f" binds each accumulator as read-write
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
+      :  "l"(desc_a),  // input %104
+         "l"(desc_b),  // input %105
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %106 (register), %107 and %108 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x32 TN F32+=E5M2*E4M3 -- RS form: A is passed as four 32-bit register operands, B as one 64-bit descriptor operand.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as immediate asm operand %110
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as immediate asm operand %111
+>
+struct MMA_64x208x32_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably void because the accumulators are updated in place -- confirm
+  using ARegisters = uint32_t[4];   // mirrors fma() arguments a000..a003
+  using BRegisters = uint64_t[1];   // mirrors fma() argument desc_b
+  using CRegisters = float[104];    // mirrors fma() arguments d000..d103
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sync.aligned.m64n208k32.f32.e5m2.e4m3; d000..d103 are read-write
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A operand (asm inputs %104..%107)
+      uint64_t const& desc_b,  // B operand (asm input %108); presumably a shared-memory matrix descriptor -- confirm
+      float         & d000, float         & d001, float         & d002, float         & d003,  // accumulators (asm operands %0..%103)
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form the predicate given to wgmma
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %109, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n208k32.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      "{%104, %105, %106, %107},"  // a000..a003
+      " %108,"  // desc_b
+      " p,    %110, %111;\n"  // predicate from scale_D, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // outputs %0..%103: "+f" binds each accumulator as read-write
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // inputs %104..%107
+         "l"(desc_b),  // input %108
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %109 (register), %110 and %111 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x216x32 TN F16+=E5M2*E4M3 -- SS form: A and B are each passed as one 64-bit descriptor operand.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as immediate asm operand %57
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as immediate asm operand %58
+>
+struct MMA_64x216x32_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably void because the accumulators are updated in place -- confirm
+  using ARegisters = uint64_t[1];   // mirrors fma() argument desc_a
+  using BRegisters = uint64_t[1];   // mirrors fma() argument desc_b
+  using CRegisters = uint32_t[54];  // mirrors fma() arguments d00..d53; presumably two packed f16 values per register -- confirm
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sync.aligned.m64n216k32.f16.e5m2.e4m3; d00..d53 are read-write
+  fma(uint64_t const& desc_a,  // A operand (asm input %54); presumably a shared-memory matrix descriptor -- confirm
+      uint64_t const& desc_b,  // B operand (asm input %55); presumably a shared-memory matrix descriptor -- confirm
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // accumulators (asm operands %0..%53)
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form the predicate given to wgmma
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %56, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n216k32.f16.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53},"
+      " %54,"  // desc_a
+      " %55,"  // desc_b
+      " p,    %57,  %58;\n"  // predicate from scale_D, then the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%53: "+r" binds each accumulator as read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53)
+      :  "l"(desc_a),  // input %54
+         "l"(desc_b),  // input %55
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %56 (register), %57 and %58 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x216x32 TN F16+=E5M2*E4M3 -- RS form: A is passed as four 32-bit register operands, B as one 64-bit descriptor operand.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as immediate asm operand %60
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded to the instruction as immediate asm operand %61
+>
+struct MMA_64x216x32_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably void because the accumulators are updated in place -- confirm
+  using ARegisters = uint32_t[4];   // mirrors fma() arguments a00..a03
+  using BRegisters = uint64_t[1];   // mirrors fma() argument desc_b
+  using CRegisters = uint32_t[54];  // mirrors fma() arguments d00..d53; presumably two packed f16 values per register -- confirm
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sync.aligned.m64n216k32.f16.e5m2.e4m3; d00..d53 are read-write
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand (asm inputs %54..%57)
+      uint64_t const& desc_b,  // B operand (asm input %58); presumably a shared-memory matrix descriptor -- confirm
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // accumulators (asm operands %0..%53)
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form the predicate given to wgmma
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %59, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n216k32.f16.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53},"
+      "{%54,  %55,  %56,  %57},"  // a00..a03
+      " %58,"  // desc_b
+      " p,    %60,  %61;\n"  // predicate from scale_D, then the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%53: "+r" binds each accumulator as read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs %54..%57
+         "l"(desc_b),  // input %58
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %59 (register), %60 and %61 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x216x32 TN F32+=E5M2*E4M3 (SS form): wraps a single wgmma.mma_async m64n216k32 instruction in which A and B are both passed as 64-bit descriptor operands.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value, passed to the asm as immediate operand %111
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value, passed to the asm as immediate operand %112
+>
+struct MMA_64x216x32_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means D reuses the C registers (in-place accumulate) -- confirm against the CuTe MMA traits
+  using ARegisters = uint64_t[1];  // one 64-bit value (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit value (desc_b)
+  using CRegisters = float[108];  // 108 float accumulators (d000..d107)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit ("l") asm operand %108
+      uint64_t const& desc_b,  // 64-bit ("l") asm operand %109
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d107: read-write ("+f") accumulators, asm operands %0..%107
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // asm operand %110; a non-zero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %110, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n216k32.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107},"
+      " %108,"  // desc_a
+      " %109,"  // desc_b
+      " p,    %111, %112;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // macro not defined: report an invalid control path instead of issuing the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x216x32 TN F32+=E5M2*E4M3 (RS form): wraps a single wgmma.mma_async m64n216k32 instruction in which A is passed as four 32-bit registers and B as a 64-bit descriptor operand.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value, passed to the asm as immediate operand %114
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value, passed to the asm as immediate operand %115
+>
+struct MMA_64x216x32_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means D reuses the C registers (in-place accumulate) -- confirm against the CuTe MMA traits
+  using ARegisters = uint32_t[4];  // four 32-bit values (a000..a003)
+  using BRegisters = uint64_t[1];  // one 64-bit value (desc_b)
+  using CRegisters = float[108];  // 108 float accumulators (d000..d107)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // 32-bit ("r") asm operands %108..%111
+      uint64_t const& desc_b,  // 64-bit ("l") asm operand %112
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d107: read-write ("+f") accumulators, asm operands %0..%107
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // asm operand %113; a non-zero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %113, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n216k32.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107},"
+      "{%108, %109, %110, %111},"  // a000..a003
+      " %112,"  // desc_b
+      " p,    %114, %115;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // macro not defined: report an invalid control path instead of issuing the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x32 TN F16+=E5M2*E4M3 (SS form): wraps a single wgmma.mma_async m64n224k32 instruction in which A and B are both passed as 64-bit descriptor operands.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value, passed to the asm as immediate operand %59
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value, passed to the asm as immediate operand %60
+>
+struct MMA_64x224x32_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means D reuses the C registers (in-place accumulate) -- confirm against the CuTe MMA traits
+  using ARegisters = uint64_t[1];  // one 64-bit value (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit value (desc_b)
+  using CRegisters = uint32_t[56];  // 56 32-bit accumulators (d00..d55); presumably two packed f16 values each -- TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit ("l") asm operand %56
+      uint64_t const& desc_b,  // 64-bit ("l") asm operand %57
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d55: read-write ("+r") accumulators, asm operands %0..%55
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // asm operand %58; a non-zero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %58, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n224k32.f16.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      " %56,"  // desc_a
+      " %57,"  // desc_b
+      " p,    %59,  %60;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // macro not defined: report an invalid control path instead of issuing the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x32 TN F16+=E5M2*E4M3 (RS form): wraps a single wgmma.mma_async m64n224k32 instruction in which A is passed as four 32-bit registers and B as a 64-bit descriptor operand.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value, passed to the asm as immediate operand %62
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value, passed to the asm as immediate operand %63
+>
+struct MMA_64x224x32_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means D reuses the C registers (in-place accumulate) -- confirm against the CuTe MMA traits
+  using ARegisters = uint32_t[4];  // four 32-bit values (a00..a03)
+  using BRegisters = uint64_t[1];  // one 64-bit value (desc_b)
+  using CRegisters = uint32_t[56];  // 56 32-bit accumulators (d00..d55); presumably two packed f16 values each -- TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // 32-bit ("r") asm operands %56..%59
+      uint64_t const& desc_b,  // 64-bit ("l") asm operand %60
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d55: read-write ("+r") accumulators, asm operands %0..%55
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // asm operand %61; a non-zero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %61, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n224k32.f16.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      "{%56,  %57,  %58,  %59},"  // a00..a03
+      " %60,"  // desc_b
+      " p,    %62,  %63;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // macro not defined: report an invalid control path instead of issuing the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x32 TN F32+=E5M2*E4M3 (SS form): wraps a single wgmma.mma_async m64n224k32 instruction in which A and B are both passed as 64-bit descriptor operands.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value, passed to the asm as immediate operand %115
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value, passed to the asm as immediate operand %116
+>
+struct MMA_64x224x32_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means D reuses the C registers (in-place accumulate) -- confirm against the CuTe MMA traits
+  using ARegisters = uint64_t[1];  // one 64-bit value (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit value (desc_b)
+  using CRegisters = float[112];  // 112 float accumulators (d000..d111)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit ("l") asm operand %112
+      uint64_t const& desc_b,  // 64-bit ("l") asm operand %113
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d111: read-write ("+f") accumulators, asm operands %0..%111
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // asm operand %114; a non-zero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %114, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n224k32.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      " %112,"  // desc_a
+      " %113,"  // desc_b
+      " p,    %115, %116;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // macro not defined: report an invalid control path instead of issuing the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x32 TN F32+=E5M2*E4M3 (RS form): wraps a single wgmma.mma_async m64n224k32 instruction in which A is passed as four 32-bit registers and B as a 64-bit descriptor operand.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value, passed to the asm as immediate operand %118
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value, passed to the asm as immediate operand %119
+>
+struct MMA_64x224x32_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means D reuses the C registers (in-place accumulate) -- confirm against the CuTe MMA traits
+  using ARegisters = uint32_t[4];  // four 32-bit values (a000..a003)
+  using BRegisters = uint64_t[1];  // one 64-bit value (desc_b)
+  using CRegisters = float[112];  // 112 float accumulators (d000..d111)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // 32-bit ("r") asm operands %112..%115
+      uint64_t const& desc_b,  // 64-bit ("l") asm operand %116
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d111: read-write ("+f") accumulators, asm operands %0..%111
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // asm operand %117; a non-zero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %117, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n224k32.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      "{%112, %113, %114, %115},"  // a000..a003
+      " %116,"  // desc_b
+      " p,    %118, %119;\n"  // predicate p, then immediates scaleA and scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // macro not defined: report an invalid control path instead of issuing the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x232x32 TN F16+=E5M2*E4M3 (SS: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as an "n" immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time; emitted as an "n" immediate operand
+>
+struct MMA_64x232x32_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;          // no separate D arguments: d00..d57 are read-write ("+r")
+  using ARegisters = uint64_t[1];   // desc_a
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = uint32_t[58];  // d00..d57
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog call; implementation not visible in this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %60, 0;\n"  // %60 = int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n232k32.f16.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%57 = d00..d57
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57},"
+      " %58,"  // %58 = desc_a
+      " %59,"  // %59 = desc_b
+      " p,    %61,  %62;\n"  // %61 = scaleA, %62 = scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // input order must match %58..%62 above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the macro above is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x232x32 TN F16+=E5M2*E4M3 (RS: A is four 32-bit registers, B is one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as an "n" immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time; emitted as an "n" immediate operand
+>
+struct MMA_64x232x32_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;          // no separate D arguments: d00..d57 are read-write ("+r")
+  using ARegisters = uint32_t[4];   // a00..a03
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = uint32_t[58];  // d00..d57
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog call; implementation not visible in this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %63, 0;\n"  // %63 = int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n232k32.f16.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%57 = d00..d57
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57},"
+      "{%58,  %59,  %60,  %61},"  // %58..%61 = a00..a03
+      " %62,"  // %62 = desc_b
+      " p,    %64,  %65;\n"  // %64 = scaleA, %65 = scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // input order must match %58..%65 above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the macro above is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x232x32 TN F32+=E5M2*E4M3 (SS: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as an "n" immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time; emitted as an "n" immediate operand
+>
+struct MMA_64x232x32_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;         // no separate D arguments: d000..d115 are read-write ("+f")
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[116];   // d000..d115
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog call; implementation not visible in this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %118, 0;\n"  // %118 = int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n232k32.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%115 = d000..d115
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115},"
+      " %116,"  // %116 = desc_a
+      " %117,"  // %117 = desc_b
+      " p,    %119, %120;\n"  // %119 = scaleA, %120 = scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // input order must match %116..%120 above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the macro above is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x232x32 TN F32+=E5M2*E4M3 (RS: A is four 32-bit registers, B is one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as an "n" immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time; emitted as an "n" immediate operand
+>
+struct MMA_64x232x32_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;         // no separate D arguments: d000..d115 are read-write ("+f")
+  using ARegisters = uint32_t[4];  // a000..a003
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[116];   // d000..d115
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog call; implementation not visible in this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %121, 0;\n"  // %121 = int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n232k32.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%115 = d000..d115
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115},"
+      "{%116, %117, %118, %119},"  // %116..%119 = a000..a003
+      " %120,"  // %120 = desc_b
+      " p,    %122, %123;\n"  // %122 = scaleA, %123 = scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // input order must match %116..%123 above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the macro above is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x32 TN F16+=E5M2*E4M3 (SS: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as an "n" immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time; emitted as an "n" immediate operand
+>
+struct MMA_64x240x32_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;          // no separate D arguments: d00..d59 are read-write ("+r")
+  using ARegisters = uint64_t[1];   // desc_a
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = uint32_t[60];  // d00..d59
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog call; implementation not visible in this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %62, 0;\n"  // %62 = int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n240k32.f16.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%59 = d00..d59
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"
+      " %60,"  // %60 = desc_a
+      " %61,"  // %61 = desc_b
+      " p,    %63,  %64;\n"  // %63 = scaleA, %64 = scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // input order must match %60..%64 above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the macro above is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x32 TN F16+=E5M2*E4M3 (RS: A is four 32-bit registers, B is one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as an "n" immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time; emitted as an "n" immediate operand
+>
+struct MMA_64x240x32_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;          // no separate D arguments: d00..d59 are read-write ("+r")
+  using ARegisters = uint32_t[4];   // a00..a03
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = uint32_t[60];  // d00..d59
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog call; implementation not visible in this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %65, 0;\n"  // %65 = int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n240k32.f16.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%59 = d00..d59
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"
+      "{%60,  %61,  %62,  %63},"  // %60..%63 = a00..a03
+      " %64,"  // %64 = desc_b
+      " p,    %66,  %67;\n"  // %66 = scaleA, %67 = scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // input order must match %60..%67 above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the macro above is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x32 TN F32+=E5M2*E4M3 (SS: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as an "n" immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time; emitted as an "n" immediate operand
+>
+struct MMA_64x240x32_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;         // no separate D arguments: d000..d119 are read-write ("+f")
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[120];   // d000..d119
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog call; implementation not visible in this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %122, 0;\n"  // %122 = int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n240k32.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%119 = d000..d119
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      " %120,"  // %120 = desc_a
+      " %121,"  // %121 = desc_b
+      " p,    %123, %124;\n"  // %123 = scaleA, %124 = scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // input order must match %120..%124 above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the macro above is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x32 TN F32+=E5M2*E4M3 -- RS form: A is passed as four 32-bit register operands, B as one 64-bit operand (desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // cast to int32_t and emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // cast to int32_t and emitted as an immediate ("n") asm operand
+>
+struct MMA_64x240x32_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches the four a000..a003 arguments of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = float[120];  // matches the 120 read-write accumulator arguments d000..d119 of fma()
+
+  CUTE_HOST_DEVICE static void  // wraps one wgmma.mma_async.m64n240k32.f32.e5m2.e4m3; d000..d119 are read-write ("+f")
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // passes __LINE__ and desc_b to the synclog helper before the asm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %125, 0;\n"  // %125 is int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n240k32.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      "{%120, %121, %122, %123},"  // A operand: a000..a003
+      " %124,"  // B operand: desc_b
+      " p,    %126, %127;\n"  // %126 / %127 are the immediates int32_t(scaleA) / int32_t(scaleB)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // only compiled when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x248x32 TN F16+=E5M2*E4M3 -- SS form: A and B are each passed as one 64-bit operand (desc_a, desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // cast to int32_t and emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // cast to int32_t and emitted as an immediate ("n") asm operand
+>
+struct MMA_64x248x32_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches the single desc_a argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[62];  // matches the 62 read-write accumulator arguments d00..d61 of fma()
+
+  CUTE_HOST_DEVICE static void  // wraps one wgmma.mma_async.m64n248k32.f16.e5m2.e4m3; d00..d61 are read-write ("+r")
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes __LINE__ and both descriptors to the synclog helper before the asm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %64, 0;\n"  // %64 is int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n248k32.f16.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61},"
+      " %62,"  // A operand: desc_a
+      " %63,"  // B operand: desc_b
+      " p,    %65,  %66;\n"  // %65 / %66 are the immediates int32_t(scaleA) / int32_t(scaleB)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // only compiled when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x248x32 TN F16+=E5M2*E4M3 -- RS form: A is passed as four 32-bit register operands, B as one 64-bit operand (desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // cast to int32_t and emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // cast to int32_t and emitted as an immediate ("n") asm operand
+>
+struct MMA_64x248x32_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches the four a00..a03 arguments of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[62];  // matches the 62 read-write accumulator arguments d00..d61 of fma()
+
+  CUTE_HOST_DEVICE static void  // wraps one wgmma.mma_async.m64n248k32.f16.e5m2.e4m3; d00..d61 are read-write ("+r")
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // passes __LINE__ and desc_b to the synclog helper before the asm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %67, 0;\n"  // %67 is int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n248k32.f16.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61},"
+      "{%62,  %63,  %64,  %65},"  // A operand: a00..a03
+      " %66,"  // B operand: desc_b
+      " p,    %68,  %69;\n"  // %68 / %69 are the immediates int32_t(scaleA) / int32_t(scaleB)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // only compiled when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x248x32 TN F32+=E5M2*E4M3 -- SS form: A and B are each passed as one 64-bit operand (desc_a, desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // cast to int32_t and emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // cast to int32_t and emitted as an immediate ("n") asm operand
+>
+struct MMA_64x248x32_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches the single desc_a argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = float[124];  // matches the 124 read-write accumulator arguments d000..d123 of fma()
+
+  CUTE_HOST_DEVICE static void  // wraps one wgmma.mma_async.m64n248k32.f32.e5m2.e4m3; d000..d123 are read-write ("+f")
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes __LINE__ and both descriptors to the synclog helper before the asm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %126, 0;\n"  // %126 is int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n248k32.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123},"
+      " %124,"  // A operand: desc_a
+      " %125,"  // B operand: desc_b
+      " p,    %127, %128;\n"  // %127 / %128 are the immediates int32_t(scaleA) / int32_t(scaleB)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // only compiled when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x248x32 TN F32+=E5M2*E4M3 -- RS form: A is passed as four 32-bit register operands, B as one 64-bit operand (desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // cast to int32_t and emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // cast to int32_t and emitted as an immediate ("n") asm operand
+>
+struct MMA_64x248x32_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches the four a000..a003 arguments of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = float[124];  // matches the 124 read-write accumulator arguments d000..d123 of fma()
+
+  CUTE_HOST_DEVICE static void  // wraps one wgmma.mma_async.m64n248k32.f32.e5m2.e4m3; d000..d123 are read-write ("+f")
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // passes __LINE__ and desc_b to the synclog helper before the asm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %129, 0;\n"  // %129 is int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n248k32.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123},"
+      "{%124, %125, %126, %127},"  // A operand: a000..a003
+      " %128,"  // B operand: desc_b
+      " p,    %130, %131;\n"  // %130 / %131 are the immediates int32_t(scaleA) / int32_t(scaleB)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // only compiled when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x32 TN F16+=E5M2*E5M2 -- SS form: A and B are each passed as one 64-bit operand (desc_a, desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // cast to int32_t and emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // cast to int32_t and emitted as an immediate ("n") asm operand
+>
+struct MMA_64x24x32_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches the single desc_a argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[6];  // matches the six read-write accumulator arguments d0..d5 of fma()
+
+  CUTE_HOST_DEVICE static void  // wraps one wgmma.mma_async.m64n24k32.f16.e5m2.e5m2; d0..d5 are read-write ("+r")
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes __LINE__ and both descriptors to the synclog helper before the asm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %8, 0;\n"  // %8 is int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n24k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5},"  // accumulators d0..d5
+      " %6,"  // A operand: desc_a
+      " %7,"  // B operand: desc_b
+      " p,   %9,  %10;\n"  // %9 / %10 are the immediates int32_t(scaleA) / int32_t(scaleB)
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // only compiled when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x32 TN F16+=E5M2*E5M2 -- RS form: A is passed as four 32-bit register operands, B as one 64-bit operand (desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // cast to int32_t and emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // cast to int32_t and emitted as an immediate ("n") asm operand
+>
+struct MMA_64x24x32_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches the four a0..a3 arguments of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[6];  // matches the six read-write accumulator arguments d0..d5 of fma()
+
+  CUTE_HOST_DEVICE static void  // wraps one wgmma.mma_async.m64n24k32.f16.e5m2.e5m2; d0..d5 are read-write ("+r")
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // passes __LINE__ and desc_b to the synclog helper before the asm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %11, 0;\n"  // %11 is int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sync.aligned.m64n24k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5},"  // accumulators d0..d5
+      "{%6,  %7,  %8,  %9},"  // A operand: a0..a3
+      " %10,"  // B operand: desc_b
+      " p,   %12, %13;\n"  // %12 / %13 are the immediates int32_t(scaleA) / int32_t(scaleB)
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // only compiled when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x32 TN F32+=E5M2*E5M2 (SS: A and B operands both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x24x32_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[12];  // accumulators; fma() reads and writes them in place
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // drives predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %14, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n24k32.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"  // accumulators d00..d11
+      " %12,"  // desc_a
+      " %13,"  // desc_b
+      " p,   %15, %16;\n"  // predicate, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scaleA/scaleB are compile-time immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x24x32 TN F32+=E5M2*E5M2 (RS: A operand in four 32-bit registers, B operand via 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x24x32_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[12];  // accumulators; fma() reads and writes them in place
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // drives predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %17, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n24k32.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"  // accumulators d00..d11
+      "{%12, %13, %14, %15},"  // A registers a00..a03
+      " %16,"  // desc_b
+      " p,   %18, %19;\n"  // predicate, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scaleA/scaleB are compile-time immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x40x32 TN F16+=E5M2*E5M2 (SS: A and B operands both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x40x32_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[10];  // accumulators; fma() reads and writes them in place
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // drives predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %12, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n40k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9},"  // accumulators d00..d09
+      " %10,"  // desc_a
+      " %11,"  // desc_b
+      " p,   %13, %14;\n"  // predicate, then the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scaleA/scaleB are compile-time immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x40x32 TN F16+=E5M2*E5M2 (RS: A operand in four 32-bit registers, B operand via 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x40x32_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[10];  // accumulators; fma() reads and writes them in place
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // drives predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %15, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n40k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9},"  // accumulators d00..d09
+      "{%10, %11, %12, %13},"  // A registers a00..a03
+      " %14,"  // desc_b
+      " p,   %16, %17;\n"  // predicate, then the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scaleA/scaleB are compile-time immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x40x32 TN F32+=E5M2*E5M2 (SS: A and B operands both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x40x32_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[20];  // accumulators; fma() reads and writes them in place
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // drives predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %22, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n40k32.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"  // accumulators d00..d19
+      " %20,"  // desc_a
+      " %21,"  // desc_b
+      " p,   %23, %24;\n"  // predicate, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scaleA/scaleB are compile-time immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x40x32 TN F32+=E5M2*E5M2 (RS: A operand in four 32-bit registers, B operand via 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x40x32_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[20];  // accumulators; fma() reads and writes them in place
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // drives predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %25, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n40k32.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"  // accumulators d00..d19
+      "{%20, %21, %22, %23},"  // A registers a00..a03
+      " %24,"  // desc_b
+      " p,   %26, %27;\n"  // predicate, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scaleA/scaleB are compile-time immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x32 TN F16+=E5M2*E5M2 (SS: A and B operands both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x48x32_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[12];  // accumulators; fma() reads and writes them in place
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // drives predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %14, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n48k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"  // accumulators d00..d11
+      " %12,"  // desc_a
+      " %13,"  // desc_b
+      " p,   %15, %16;\n"  // predicate, then the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scaleA/scaleB are compile-time immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x32 TN F16+=E5M2*E5M2 (RS: A operand in four 32-bit registers, B operand via 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x48x32_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[12];  // accumulators; fma() reads and writes them in place
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // drives predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %17, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n48k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"  // accumulators d00..d11
+      "{%12, %13, %14, %15},"  // A registers a00..a03
+      " %16,"  // desc_b
+      " p,   %18, %19;\n"  // predicate, then the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scaleA/scaleB are compile-time immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x32 TN F32+=E5M2*E5M2 (SS: A and B operands both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x48x32_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[24];  // accumulators; fma() reads and writes them in place
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // drives predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %26, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n48k32.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"  // accumulators d00..d23
+      " %24,"  // desc_a
+      " %25,"  // desc_b
+      " p,   %27, %28;\n"  // predicate, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scaleA/scaleB are compile-time immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x48x32 TN F32+=E5M2*E5M2 (RS: A operand in four 32-bit registers, B operand via 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x48x32_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[24];  // accumulators; fma() reads and writes them in place
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // drives predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %29, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n48k32.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"  // accumulators d00..d23
+      "{%24, %25, %26, %27},"  // A registers a00..a03
+      " %28,"  // desc_b
+      " p,   %30, %31;\n"  // predicate, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scaleA/scaleB are compile-time immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x56x32 TN F16+=E5M2*E5M2 (SS: A and B operands both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x56x32_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[14];  // accumulators; fma() reads and writes them in place
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // drives predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %16, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n56k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13},"  // accumulators d00..d13
+      " %14,"  // desc_a
+      " %15,"  // desc_b
+      " p,   %17, %18;\n"  // predicate, then the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scaleA/scaleB are compile-time immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x56x32 TN F16+=E5M2*E5M2 (RS: A operand in four 32-bit registers, B operand via 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x56x32_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[14];  // accumulators; fma() reads and writes them in place
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // drives predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %19, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n56k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13},"  // accumulators d00..d13
+      "{%14, %15, %16, %17},"  // A registers a00..a03
+      " %18,"  // desc_b
+      " p,   %20, %21;\n"  // predicate, then the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scaleA/scaleB are compile-time immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x56x32 TN F32+=E5M2*E5M2 (SS: A and B operands both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x56x32_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[28];  // accumulators; fma() reads and writes them in place
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // drives predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %30, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n56k32.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"  // accumulators d00..d27
+      " %28,"  // desc_a
+      " %29,"  // desc_b
+      " p,   %31, %32;\n"  // predicate, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scaleA/scaleB are compile-time immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x56x32 TN F32+=E5M2*E5M2 (RS: A operand in four 32-bit registers, B operand via 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x56x32_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[28];  // accumulators; fma() reads and writes them in place
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // drives predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %33, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n56k32.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"  // accumulators d00..d27
+      "{%28, %29, %30, %31},"  // A registers a00..a03
+      " %32,"  // desc_b
+      " p,   %34, %35;\n"  // predicate, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scaleA/scaleB are compile-time immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x72x32 TN F16+=E5M2*E5M2 (SS: A and B operands both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x72x32_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[18];  // accumulators; fma() reads and writes them in place
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // drives predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %20, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n72k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17},"  // accumulators d00..d17
+      " %18,"  // desc_a
+      " %19,"  // desc_b
+      " p,   %21, %22;\n"  // predicate, then the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scaleA/scaleB are compile-time immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x72x32 TN F16+=E5M2*E5M2 (RS: A operand in four 32-bit registers, B operand via 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x72x32_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[18];  // accumulators; fma() reads and writes them in place
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // drives predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %23, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n72k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17},"  // accumulators d00..d17
+      "{%18, %19, %20, %21},"  // A registers a00..a03
+      " %22,"  // desc_b
+      " p,   %24, %25;\n"  // predicate, then the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scaleA/scaleB are compile-time immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x72x32 TN F32+=E5M2*E5M2 (SS variant: A and B are each supplied as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; bound below as an "n" (immediate) asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time; bound below as an "n" (immediate) asm operand
+>
+struct MMA_64x72x32_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // accumulators are updated in place through the read-write d* operands
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[36];  // one entry per accumulator reference d00..d35 taken by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // run-time value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(  // emits only setp + one wgmma.mma_async; no fence/commit/wait is issued here
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %38, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n72k32.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"  // %0..%35: accumulators d00..d35
+      " %36,"  // desc_a
+      " %37,"  // desc_b
+      " p,   %39, %40;\n"  // scale_D predicate, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": read-write 32-bit float register operands
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
+      :  "l"(desc_a),  // "l": 64-bit operand
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operand order must match the %N indices above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x72x32 TN F32+=E5M2*E5M2 (RS variant: A supplied as four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; bound below as an "n" (immediate) asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time; bound below as an "n" (immediate) asm operand
+>
+struct MMA_64x72x32_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // accumulators are updated in place through the read-write d* operands
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[36];  // one entry per accumulator reference d00..d35 taken by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // run-time value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(  // emits only setp + one wgmma.mma_async; no fence/commit/wait is issued here
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %41, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n72k32.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"  // %0..%35: accumulators d00..d35
+      "{%36, %37, %38, %39},"  // A fragment a00..a03
+      " %40,"  // desc_b
+      " p,   %42, %43;\n"  // scale_D predicate, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": read-write 32-bit float register operands
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),  // "l": 64-bit operand
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operand order must match the %N indices above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x32 TN F16+=E5M2*E5M2 (SS variant: A and B are each supplied as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; bound below as an "n" (immediate) asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time; bound below as an "n" (immediate) asm operand
+>
+struct MMA_64x80x32_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // accumulators are updated in place through the read-write d* operands
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[20];  // one entry per accumulator reference d00..d19 taken by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // run-time value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(  // emits only setp + one wgmma.mma_async; no fence/commit/wait is issued here
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %22, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n80k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"  // %0..%19: accumulators d00..d19
+      " %20,"  // desc_a
+      " %21,"  // desc_b
+      " p,   %23, %24;\n"  // scale_D predicate, then the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": read-write 32-bit register operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
+      :  "l"(desc_a),  // "l": 64-bit operand
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operand order must match the %N indices above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x32 TN F16+=E5M2*E5M2 (RS variant: A supplied as four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; bound below as an "n" (immediate) asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time; bound below as an "n" (immediate) asm operand
+>
+struct MMA_64x80x32_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // accumulators are updated in place through the read-write d* operands
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[20];  // one entry per accumulator reference d00..d19 taken by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // run-time value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(  // emits only setp + one wgmma.mma_async; no fence/commit/wait is issued here
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %25, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n80k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"  // %0..%19: accumulators d00..d19
+      "{%20, %21, %22, %23},"  // A fragment a00..a03
+      " %24,"  // desc_b
+      " p,   %26, %27;\n"  // scale_D predicate, then the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": read-write 32-bit register operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),  // "l": 64-bit operand
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operand order must match the %N indices above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x32 TN F32+=E5M2*E5M2 (SS variant: A and B are each supplied as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; bound below as an "n" (immediate) asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time; bound below as an "n" (immediate) asm operand
+>
+struct MMA_64x80x32_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // accumulators are updated in place through the read-write d* operands
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[40];  // one entry per accumulator reference d00..d39 taken by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // run-time value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(  // emits only setp + one wgmma.mma_async; no fence/commit/wait is issued here
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %42, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n80k32.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"  // %0..%39: accumulators d00..d39
+      " %40,"  // desc_a
+      " %41,"  // desc_b
+      " p,   %43, %44;\n"  // scale_D predicate, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": read-write 32-bit float register operands
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
+      :  "l"(desc_a),  // "l": 64-bit operand
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operand order must match the %N indices above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x80x32 TN F32+=E5M2*E5M2 (RS variant: A supplied as four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; bound below as an "n" (immediate) asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time; bound below as an "n" (immediate) asm operand
+>
+struct MMA_64x80x32_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // accumulators are updated in place through the read-write d* operands
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[40];  // one entry per accumulator reference d00..d39 taken by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // run-time value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(  // emits only setp + one wgmma.mma_async; no fence/commit/wait is issued here
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %45, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n80k32.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"  // %0..%39: accumulators d00..d39
+      "{%40, %41, %42, %43},"  // A fragment a00..a03
+      " %44,"  // desc_b
+      " p,   %46, %47;\n"  // scale_D predicate, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": read-write 32-bit float register operands
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),  // "l": 64-bit operand
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operand order must match the %N indices above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x88x32 TN F16+=E5M2*E5M2 (SS variant: A and B are each supplied as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; bound below as an "n" (immediate) asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time; bound below as an "n" (immediate) asm operand
+>
+struct MMA_64x88x32_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // accumulators are updated in place through the read-write d* operands
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[22];  // one entry per accumulator reference d00..d21 taken by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // run-time value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(  // emits only setp + one wgmma.mma_async; no fence/commit/wait is issued here
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %24, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n88k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21},"  // %0..%21: accumulators d00..d21
+      " %22,"  // desc_a
+      " %23,"  // desc_b
+      " p,   %25, %26;\n"  // scale_D predicate, then the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": read-write 32-bit register operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21)
+      :  "l"(desc_a),  // "l": 64-bit operand
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operand order must match the %N indices above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x88x32 TN F16+=E5M2*E5M2 (RS variant: A supplied as four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; bound below as an "n" (immediate) asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time; bound below as an "n" (immediate) asm operand
+>
+struct MMA_64x88x32_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // accumulators are updated in place through the read-write d* operands
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[22];  // one entry per accumulator reference d00..d21 taken by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // run-time value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(  // emits only setp + one wgmma.mma_async; no fence/commit/wait is issued here
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %27, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n88k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21},"  // %0..%21: accumulators d00..d21
+      "{%22, %23, %24, %25},"  // A fragment a00..a03
+      " %26,"  // desc_b
+      " p,   %28, %29;\n"  // scale_D predicate, then the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": read-write 32-bit register operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),  // "l": 64-bit operand
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operand order must match the %N indices above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x88x32 TN F32+=E5M2*E5M2 (SS variant: A and B are each supplied as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; bound below as an "n" (immediate) asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time; bound below as an "n" (immediate) asm operand
+>
+struct MMA_64x88x32_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // accumulators are updated in place through the read-write d* operands
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[44];  // one entry per accumulator reference d00..d43 taken by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // run-time value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(  // emits only setp + one wgmma.mma_async; no fence/commit/wait is issued here
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %46, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n88k32.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"  // %0..%43: accumulators d00..d43
+      " %44,"  // desc_a
+      " %45,"  // desc_b
+      " p,   %47, %48;\n"  // scale_D predicate, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": read-write 32-bit float register operands
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
+      :  "l"(desc_a),  // "l": 64-bit operand
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operand order must match the %N indices above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x88x32 TN F32+=E5M2*E5M2 (RS variant: A supplied as four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; bound below as an "n" (immediate) asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time; bound below as an "n" (immediate) asm operand
+>
+struct MMA_64x88x32_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // accumulators are updated in place through the read-write d* operands
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[44];  // one entry per accumulator reference d00..d43 taken by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // run-time value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(  // emits only setp + one wgmma.mma_async; no fence/commit/wait is issued here
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %49, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n88k32.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"  // %0..%43: accumulators d00..d43
+      "{%44, %45, %46, %47},"  // A fragment a00..a03
+      " %48,"  // desc_b
+      " p,   %50, %51;\n"  // scale_D predicate, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": read-write 32-bit float register operands
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),  // "l": 64-bit operand
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operand order must match the %N indices above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x104x32 TN F16+=E5M2*E5M2 (SS variant: A and B are each supplied as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; bound below as an "n" (immediate) asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time; bound below as an "n" (immediate) asm operand
+>
+struct MMA_64x104x32_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // accumulators are updated in place through the read-write d* operands
+  using ARegisters = uint64_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[26];  // one entry per accumulator reference d00..d25 taken by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // run-time value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(  // emits only setp + one wgmma.mma_async; no fence/commit/wait is issued here
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %28, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n104k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25},"  // %0..%25: accumulators d00..d25
+      " %26,"  // desc_a
+      " %27,"  // desc_b
+      " p,   %29, %30;\n"  // scale_D predicate, then the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": read-write 32-bit register operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25)
+      :  "l"(desc_a),  // "l": 64-bit operand
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operand order must match the %N indices above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x104x32 TN F16+=E5M2*E5M2 (RS variant: A supplied as four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; bound below as an "n" (immediate) asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time; bound below as an "n" (immediate) asm operand
+>
+struct MMA_64x104x32_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // accumulators are updated in place through the read-write d* operands
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[26];  // one entry per accumulator reference d00..d25 taken by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // run-time value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(  // emits only setp + one wgmma.mma_async; no fence/commit/wait is issued here
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %31, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n104k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25},"  // %0..%25: accumulators d00..d25
+      "{%26, %27, %28, %29},"  // A fragment a00..a03
+      " %30,"  // desc_b
+      " p,   %32, %33;\n"  // scale_D predicate, then the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": read-write 32-bit register operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),  // "l": 64-bit operand
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operand order must match the %N indices above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x104x32 TN F32+=E5M2*E5M2 -- SS form: A and B are each passed as one uint64_t (desc_a, desc_b); 52 float accumulators are updated in place.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x104x32_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a in fma()
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = float[52];  // d00..d51 in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 by the setp below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %54, 0;\n"  // p = (scale_D != 0); %54 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n104k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"
+      " %52,"  // desc_a
+      " %53,"  // desc_b
+      " p,    %55,  %56;\n"  // p, then scaleA (%55) and scaleB (%56)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%51: accumulators, read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "n": template parameters passed as immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x104x32 TN F32+=E5M2*E5M2 -- RS form: A is passed as four uint32_t values (a00..a03), B as one uint64_t (desc_b); 52 float accumulators are updated in place.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x104x32_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03 in fma()
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = float[52];  // d00..d51 in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 by the setp below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %57, 0;\n"  // p = (scale_D != 0); %57 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n104k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"
+      "{%52,  %53,  %54,  %55},"  // a00..a03
+      " %56,"  // desc_b
+      " p,    %58,  %59;\n"  // p, then scaleA (%58) and scaleB (%59)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%51: accumulators, read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "n": template parameters passed as immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x32 TN F16+=E5M2*E5M2 -- SS form: A and B are each passed as one uint64_t (desc_a, desc_b); 28 uint32_t accumulator registers are updated in place.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x112x32_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a in fma()
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = uint32_t[28];  // d00..d27 in fma(); NOTE(review): presumably two f16 values per register -- confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 by the setp below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %30, 0;\n"  // p = (scale_D != 0); %30 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n112k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"
+      " %28,"  // desc_a
+      " %29,"  // desc_b
+      " p,   %31, %32;\n"  // p, then scaleA (%31) and scaleB (%32)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%27: accumulators, read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "n": template parameters passed as immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x32 TN F16+=E5M2*E5M2 -- RS form: A is passed as four uint32_t values (a00..a03), B as one uint64_t (desc_b); 28 uint32_t accumulator registers are updated in place.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x112x32_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03 in fma()
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = uint32_t[28];  // d00..d27 in fma(); NOTE(review): presumably two f16 values per register -- confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 by the setp below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %33, 0;\n"  // p = (scale_D != 0); %33 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n112k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"
+      "{%28, %29, %30, %31},"  // a00..a03
+      " %32,"  // desc_b
+      " p,   %34, %35;\n"  // p, then scaleA (%34) and scaleB (%35)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%27: accumulators, read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "n": template parameters passed as immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x32 TN F32+=E5M2*E5M2 -- SS form: A and B are each passed as one uint64_t (desc_a, desc_b); 56 float accumulators are updated in place.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x112x32_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a in fma()
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = float[56];  // d00..d55 in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 by the setp below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %58, 0;\n"  // p = (scale_D != 0); %58 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n112k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      " %56,"  // desc_a
+      " %57,"  // desc_b
+      " p,    %59,  %60;\n"  // p, then scaleA (%59) and scaleB (%60)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%55: accumulators, read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "n": template parameters passed as immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x112x32 TN F32+=E5M2*E5M2 -- RS form: A is passed as four uint32_t values (a00..a03), B as one uint64_t (desc_b); 56 float accumulators are updated in place.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x112x32_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03 in fma()
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = float[56];  // d00..d55 in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 by the setp below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %61, 0;\n"  // p = (scale_D != 0); %61 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n112k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      "{%56,  %57,  %58,  %59},"  // a00..a03
+      " %60,"  // desc_b
+      " p,    %62,  %63;\n"  // p, then scaleA (%62) and scaleB (%63)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%55: accumulators, read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "n": template parameters passed as immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x120x32 TN F16+=E5M2*E5M2 -- SS form: A and B are each passed as one uint64_t (desc_a, desc_b); 30 uint32_t accumulator registers are updated in place.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x120x32_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a in fma()
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = uint32_t[30];  // d00..d29 in fma(); NOTE(review): presumably two f16 values per register -- confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 by the setp below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %32, 0;\n"  // p = (scale_D != 0); %32 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n120k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29},"
+      " %30,"  // desc_a
+      " %31,"  // desc_b
+      " p,   %33, %34;\n"  // p, then scaleA (%33) and scaleB (%34)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%29: accumulators, read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "n": template parameters passed as immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x120x32 TN F16+=E5M2*E5M2 -- RS form: A is passed as four uint32_t values (a00..a03), B as one uint64_t (desc_b); 30 uint32_t accumulator registers are updated in place.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x120x32_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03 in fma()
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = uint32_t[30];  // d00..d29 in fma(); NOTE(review): presumably two f16 values per register -- confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 by the setp below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %35, 0;\n"  // p = (scale_D != 0); %35 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n120k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29},"
+      "{%30, %31, %32, %33},"  // a00..a03
+      " %34,"  // desc_b
+      " p,   %36, %37;\n"  // p, then scaleA (%36) and scaleB (%37)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%29: accumulators, read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "n": template parameters passed as immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x120x32 TN F32+=E5M2*E5M2 -- SS form: A and B are each passed as one uint64_t (desc_a, desc_b); 60 float accumulators are updated in place.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x120x32_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a in fma()
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = float[60];  // d00..d59 in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 by the setp below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %62, 0;\n"  // p = (scale_D != 0); %62 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n120k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"
+      " %60,"  // desc_a
+      " %61,"  // desc_b
+      " p,    %63,  %64;\n"  // p, then scaleA (%63) and scaleB (%64)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%59: accumulators, read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "n": template parameters passed as immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x120x32 TN F32+=E5M2*E5M2 -- RS form: A is passed as four uint32_t values (a00..a03), B as one uint64_t (desc_b); 60 float accumulators are updated in place.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x120x32_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03 in fma()
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = float[60];  // d00..d59 in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 by the setp below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %65, 0;\n"  // p = (scale_D != 0); %65 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n120k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"
+      "{%60,  %61,  %62,  %63},"  // a00..a03
+      " %64,"  // desc_b
+      " p,    %66,  %67;\n"  // p, then scaleA (%66) and scaleB (%67)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%59: accumulators, read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "n": template parameters passed as immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x136x32 TN F16+=E5M2*E5M2 -- SS form: A and B are each passed as one uint64_t (desc_a, desc_b); 34 uint32_t accumulator registers are updated in place.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x136x32_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a in fma()
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = uint32_t[34];  // d00..d33 in fma(); NOTE(review): presumably two f16 values per register -- confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 by the setp below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %36, 0;\n"  // p = (scale_D != 0); %36 is int32_t(scale_D)
+      "wgmma.mma_async.sync.aligned.m64n136k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33},"
+      " %34,"  // desc_a
+      " %35,"  // desc_b
+      " p,   %37, %38;\n"  // p, then scaleA (%37) and scaleB (%38)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%33: accumulators, read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "n": template parameters passed as immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x136x32 TN F16+=E5M2*E5M2 (RS form): fma() issues one wgmma.mma_async m64n136k32.f16.e5m2.e5m2 with A passed as four 32-bit registers and B as a 64-bit descriptor; without CUTE_ARCH_MMA_SM90A_ENABLED it expands to CUTE_INVALID_CONTROL_PATH instead.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x136x32_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[34];  // d00..d33, read and written in place ("+r" operands)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %39, 0;\n"  // p = (scale_D != 0); %39 is the scale_D input operand
+      "wgmma.mma_async.sync.aligned.m64n136k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33},"  // %0..%33 = d00..d33
+      "{%34, %35, %36, %37},"  // a00..a03
+      " %38,"  // desc_b
+      " p,   %40, %41;\n"  // predicate p, then scaleA (%40) and scaleB (%41) as compile-time immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x136x32 TN F32+=E5M2*E5M2 (SS form): fma() issues one wgmma.mma_async m64n136k32.f32.e5m2.e5m2 with A and B both passed as 64-bit descriptors; without CUTE_ARCH_MMA_SM90A_ENABLED it expands to CUTE_INVALID_CONTROL_PATH instead.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x136x32_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[68];  // d00..d67, read and written in place ("+f" operands)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %70, 0;\n"  // p = (scale_D != 0); %70 is the scale_D input operand
+      "wgmma.mma_async.sync.aligned.m64n136k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67},"  // %0..%67 = d00..d67
+      " %68,"  // desc_a
+      " %69,"  // desc_b
+      " p,    %71,  %72;\n"  // predicate p, then scaleA (%71) and scaleB (%72) as compile-time immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x136x32 TN F32+=E5M2*E5M2 (RS form): fma() issues one wgmma.mma_async m64n136k32.f32.e5m2.e5m2 with A passed as four 32-bit registers and B as a 64-bit descriptor; without CUTE_ARCH_MMA_SM90A_ENABLED it expands to CUTE_INVALID_CONTROL_PATH instead.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x136x32_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[68];  // d00..d67, read and written in place ("+f" operands)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %73, 0;\n"  // p = (scale_D != 0); %73 is the scale_D input operand
+      "wgmma.mma_async.sync.aligned.m64n136k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67},"  // %0..%67 = d00..d67
+      "{%68,  %69,  %70,  %71},"  // a00..a03
+      " %72,"  // desc_b
+      " p,    %74,  %75;\n"  // predicate p, then scaleA (%74) and scaleB (%75) as compile-time immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x32 TN F16+=E5M2*E5M2 (SS form): fma() issues one wgmma.mma_async m64n144k32.f16.e5m2.e5m2 with A and B both passed as 64-bit descriptors; without CUTE_ARCH_MMA_SM90A_ENABLED it expands to CUTE_INVALID_CONTROL_PATH instead.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x144x32_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[36];  // d00..d35, read and written in place ("+r" operands)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %38, 0;\n"  // p = (scale_D != 0); %38 is the scale_D input operand
+      "wgmma.mma_async.sync.aligned.m64n144k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"  // %0..%35 = d00..d35
+      " %36,"  // desc_a
+      " %37,"  // desc_b
+      " p,   %39, %40;\n"  // predicate p, then scaleA (%39) and scaleB (%40) as compile-time immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x32 TN F16+=E5M2*E5M2 (RS form): fma() issues one wgmma.mma_async m64n144k32.f16.e5m2.e5m2 with A passed as four 32-bit registers and B as a 64-bit descriptor; without CUTE_ARCH_MMA_SM90A_ENABLED it expands to CUTE_INVALID_CONTROL_PATH instead.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x144x32_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[36];  // d00..d35, read and written in place ("+r" operands)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %41, 0;\n"  // p = (scale_D != 0); %41 is the scale_D input operand
+      "wgmma.mma_async.sync.aligned.m64n144k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"  // %0..%35 = d00..d35
+      "{%36, %37, %38, %39},"  // a00..a03
+      " %40,"  // desc_b
+      " p,   %42, %43;\n"  // predicate p, then scaleA (%42) and scaleB (%43) as compile-time immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x32 TN F32+=E5M2*E5M2 (SS form): fma() issues one wgmma.mma_async m64n144k32.f32.e5m2.e5m2 with A and B both passed as 64-bit descriptors; without CUTE_ARCH_MMA_SM90A_ENABLED it expands to CUTE_INVALID_CONTROL_PATH instead.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x144x32_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[72];  // d00..d71, read and written in place ("+f" operands)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %74, 0;\n"  // p = (scale_D != 0); %74 is the scale_D input operand
+      "wgmma.mma_async.sync.aligned.m64n144k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"  // %0..%71 = d00..d71
+      " %72,"  // desc_a
+      " %73,"  // desc_b
+      " p,    %75,  %76;\n"  // predicate p, then scaleA (%75) and scaleB (%76) as compile-time immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x144x32 TN F32+=E5M2*E5M2 (RS form): fma() issues one wgmma.mma_async m64n144k32.f32.e5m2.e5m2 with A passed as four 32-bit registers and B as a 64-bit descriptor; without CUTE_ARCH_MMA_SM90A_ENABLED it expands to CUTE_INVALID_CONTROL_PATH instead.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x144x32_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[72];  // d00..d71, read and written in place ("+f" operands)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %77, 0;\n"  // p = (scale_D != 0); %77 is the scale_D input operand
+      "wgmma.mma_async.sync.aligned.m64n144k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"  // %0..%71 = d00..d71
+      "{%72,  %73,  %74,  %75},"  // a00..a03
+      " %76,"  // desc_b
+      " p,    %78,  %79;\n"  // predicate p, then scaleA (%78) and scaleB (%79) as compile-time immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x152x32 TN F16+=E5M2*E5M2 (SS form): fma() issues one wgmma.mma_async m64n152k32.f16.e5m2.e5m2 with A and B both passed as 64-bit descriptors; without CUTE_ARCH_MMA_SM90A_ENABLED it expands to CUTE_INVALID_CONTROL_PATH instead.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x152x32_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[38];  // d00..d37, read and written in place ("+r" operands)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %40, 0;\n"  // p = (scale_D != 0); %40 is the scale_D input operand
+      "wgmma.mma_async.sync.aligned.m64n152k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37},"  // %0..%37 = d00..d37
+      " %38,"  // desc_a
+      " %39,"  // desc_b
+      " p,   %41, %42;\n"  // predicate p, then scaleA (%41) and scaleB (%42) as compile-time immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x152x32 TN F16+=E5M2*E5M2 (RS form): fma() issues one wgmma.mma_async m64n152k32.f16.e5m2.e5m2 with A passed as four 32-bit registers and B as a 64-bit descriptor; without CUTE_ARCH_MMA_SM90A_ENABLED it expands to CUTE_INVALID_CONTROL_PATH instead.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x152x32_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[38];  // d00..d37, read and written in place ("+r" operands)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %43, 0;\n"  // p = (scale_D != 0); %43 is the scale_D input operand
+      "wgmma.mma_async.sync.aligned.m64n152k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37},"  // %0..%37 = d00..d37
+      "{%38, %39, %40, %41},"  // a00..a03
+      " %42,"  // desc_b
+      " p,   %44, %45;\n"  // predicate p, then scaleA (%44) and scaleB (%45) as compile-time immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x152x32 TN F32+=E5M2*E5M2 (SS form: A and B are each supplied as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // emitted as an immediate ("n") asm operand
+>
+struct MMA_64x152x32_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // no distinct D operand; d00..d75 are bound read-write ("+f") in fma()
+  using ARegisters = uint64_t[1];  // corresponds to desc_a of fma()
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = float[76];  // corresponds to d00..d75 of fma()
+
+  CUTE_HOST_DEVICE static void  // wraps a single wgmma.mma_async m64n152k32 instruction; d00..d75 are in/out operands
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook receives the call-site line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %78, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n152k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%75: accumulators d00..d75
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75},"
+      " %76,"  // desc_a
+      " %77,"  // desc_b
+      " p,    %79,  %80;\n"  // predicate derived from scale_D, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
+      :  "l"(desc_a),  // operand %76
+         "l"(desc_b),  // operand %77
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands %78, %79, %80
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no wgmma is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x152x32 TN F32+=E5M2*E5M2 (RS form: A supplied as four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // emitted as an immediate ("n") asm operand
+>
+struct MMA_64x152x32_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // no distinct D operand; d00..d75 are bound read-write ("+f") in fma()
+  using ARegisters = uint32_t[4];  // corresponds to a00..a03 of fma()
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = float[76];  // corresponds to d00..d75 of fma()
+
+  CUTE_HOST_DEVICE static void  // wraps a single wgmma.mma_async m64n152k32 instruction; d00..d75 are in/out operands
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook receives the call-site line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %81, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n152k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%75: accumulators d00..d75
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75},"
+      "{%76,  %77,  %78,  %79},"  // A registers a00..a03
+      " %80,"  // desc_b
+      " p,    %82,  %83;\n"  // predicate derived from scale_D, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // operands %76-%79
+         "l"(desc_b),  // operand %80
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands %81, %82, %83
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no wgmma is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x32 TN F16+=E5M2*E5M2 (SS form: A and B are each supplied as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // emitted as an immediate ("n") asm operand
+>
+struct MMA_64x160x32_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // no distinct D operand; d00..d39 are bound read-write ("+r") in fma()
+  using ARegisters = uint64_t[1];  // corresponds to desc_a of fma()
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = uint32_t[40];  // corresponds to d00..d39 of fma(); presumably two packed F16 per register -- TODO confirm
+
+  CUTE_HOST_DEVICE static void  // wraps a single wgmma.mma_async m64n160k32 instruction; d00..d39 are in/out operands
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook receives the call-site line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %42, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n160k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%39: accumulators d00..d39
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      " %40,"  // desc_a
+      " %41,"  // desc_b
+      " p,   %43, %44;\n"  // predicate derived from scale_D, then the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "l"(desc_a),  // operand %40
+         "l"(desc_b),  // operand %41
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands %42, %43, %44
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no wgmma is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x32 TN F16+=E5M2*E5M2 (RS form: A supplied as four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // emitted as an immediate ("n") asm operand
+>
+struct MMA_64x160x32_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // no distinct D operand; d00..d39 are bound read-write ("+r") in fma()
+  using ARegisters = uint32_t[4];  // corresponds to a00..a03 of fma()
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = uint32_t[40];  // corresponds to d00..d39 of fma(); presumably two packed F16 per register -- TODO confirm
+
+  CUTE_HOST_DEVICE static void  // wraps a single wgmma.mma_async m64n160k32 instruction; d00..d39 are in/out operands
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook receives the call-site line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %45, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n160k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%39: accumulators d00..d39
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      "{%40, %41, %42, %43},"  // A registers a00..a03
+      " %44,"  // desc_b
+      " p,   %46, %47;\n"  // predicate derived from scale_D, then the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // operands %40-%43
+         "l"(desc_b),  // operand %44
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands %45, %46, %47
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no wgmma is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x32 TN F32+=E5M2*E5M2 (SS form: A and B are each supplied as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // emitted as an immediate ("n") asm operand
+>
+struct MMA_64x160x32_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // no distinct D operand; d00..d79 are bound read-write ("+f") in fma()
+  using ARegisters = uint64_t[1];  // corresponds to desc_a of fma()
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = float[80];  // corresponds to d00..d79 of fma()
+
+  CUTE_HOST_DEVICE static void  // wraps a single wgmma.mma_async m64n160k32 instruction; d00..d79 are in/out operands
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook receives the call-site line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %82, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n160k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%79: accumulators d00..d79
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      " %80,"  // desc_a
+      " %81,"  // desc_b
+      " p,    %83,  %84;\n"  // predicate derived from scale_D, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
+      :  "l"(desc_a),  // operand %80
+         "l"(desc_b),  // operand %81
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands %82, %83, %84
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no wgmma is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x160x32 TN F32+=E5M2*E5M2 (RS form: A supplied as four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // emitted as an immediate ("n") asm operand
+>
+struct MMA_64x160x32_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // no distinct D operand; d00..d79 are bound read-write ("+f") in fma()
+  using ARegisters = uint32_t[4];  // corresponds to a00..a03 of fma()
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = float[80];  // corresponds to d00..d79 of fma()
+
+  CUTE_HOST_DEVICE static void  // wraps a single wgmma.mma_async m64n160k32 instruction; d00..d79 are in/out operands
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook receives the call-site line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %85, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n160k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%79: accumulators d00..d79
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      "{%80,  %81,  %82,  %83},"  // A registers a00..a03
+      " %84,"  // desc_b
+      " p,    %86,  %87;\n"  // predicate derived from scale_D, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // operands %80-%83
+         "l"(desc_b),  // operand %84
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands %85, %86, %87
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no wgmma is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x168x32 TN F16+=E5M2*E5M2 (SS form: A and B are each supplied as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // emitted as an immediate ("n") asm operand
+>
+struct MMA_64x168x32_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // no distinct D operand; d00..d41 are bound read-write ("+r") in fma()
+  using ARegisters = uint64_t[1];  // corresponds to desc_a of fma()
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = uint32_t[42];  // corresponds to d00..d41 of fma(); presumably two packed F16 per register -- TODO confirm
+
+  CUTE_HOST_DEVICE static void  // wraps a single wgmma.mma_async m64n168k32 instruction; d00..d41 are in/out operands
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook receives the call-site line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %44, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n168k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%41: accumulators d00..d41
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41},"
+      " %42,"  // desc_a
+      " %43,"  // desc_b
+      " p,   %45, %46;\n"  // predicate derived from scale_D, then the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41)
+      :  "l"(desc_a),  // operand %42
+         "l"(desc_b),  // operand %43
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands %44, %45, %46
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no wgmma is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x168x32 TN F16+=E5M2*E5M2 (RS form: A supplied as four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // emitted as an immediate ("n") asm operand
+>
+struct MMA_64x168x32_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // no distinct D operand; d00..d41 are bound read-write ("+r") in fma()
+  using ARegisters = uint32_t[4];  // corresponds to a00..a03 of fma()
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = uint32_t[42];  // corresponds to d00..d41 of fma(); presumably two packed F16 per register -- TODO confirm
+
+  CUTE_HOST_DEVICE static void  // wraps a single wgmma.mma_async m64n168k32 instruction; d00..d41 are in/out operands
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook receives the call-site line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %47, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n168k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%41: accumulators d00..d41
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41},"
+      "{%42, %43, %44, %45},"  // A registers a00..a03
+      " %46,"  // desc_b
+      " p,   %48, %49;\n"  // predicate derived from scale_D, then the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // operands %42-%45
+         "l"(desc_b),  // operand %46
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands %47, %48, %49
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no wgmma is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x168x32 TN F32+=E5M2*E5M2 (SS variant: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an immediate ("n") operand
+>
+struct MMA_64x168x32_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // declared void; results come back through the read-write accumulators below
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor (desc_a)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = float[84];  // 84 float accumulators, one per d00..d83 argument of fma()
+
+  CUTE_HOST_DEVICE static void  // fma(): issues one wgmma m64n168k32 instruction; d00..d83 are updated in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when SM90A MMA support is enabled
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined outside this chunk); gets the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %86, 0;\n"  // p = (scale_D != 0); %86 is the scale_D input operand
+      "wgmma.mma_async.sync.aligned.m64n168k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%83: accumulators d00..d83
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83},"
+      " %84,"  // desc_a
+      " %85,"  // desc_b
+      " p,    %87,  %88;\n"  // predicate from scale_D, then immediates scaleA (%87) and scaleB (%88)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": each accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
+      :  "l"(desc_a),  // %84
+         "l"(desc_b),  // %85
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %86, %87, %88
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x168x32 TN F32+=E5M2*E5M2 (RS variant: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an immediate ("n") operand
+>
+struct MMA_64x168x32_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // declared void; results come back through the read-write accumulators below
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = float[84];  // 84 float accumulators, one per d00..d83 argument of fma()
+
+  CUTE_HOST_DEVICE static void  // fma(): issues one wgmma m64n168k32 instruction; d00..d83 are updated in place
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when SM90A MMA support is enabled
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined outside this chunk); gets the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %89, 0;\n"  // p = (scale_D != 0); %89 is the scale_D input operand
+      "wgmma.mma_async.sync.aligned.m64n168k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%83: accumulators d00..d83
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83},"
+      "{%84,  %85,  %86,  %87},"  // A registers a00..a03
+      " %88,"  // desc_b
+      " p,    %90,  %91;\n"  // predicate from scale_D, then immediates scaleA (%90) and scaleB (%91)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": each accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %84..%87
+         "l"(desc_b),  // %88
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %89, %90, %91
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x32 TN F16+=E5M2*E5M2 (SS variant: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an immediate ("n") operand
+>
+struct MMA_64x176x32_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // declared void; results come back through the read-write accumulators below
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor (desc_a)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[44];  // 44 32-bit accumulator registers (d00..d43); presumably two packed f16 values each -- TODO confirm
+
+  CUTE_HOST_DEVICE static void  // fma(): issues one wgmma m64n176k32 instruction; d00..d43 are updated in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when SM90A MMA support is enabled
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined outside this chunk); gets the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %46, 0;\n"  // p = (scale_D != 0); %46 is the scale_D input operand
+      "wgmma.mma_async.sync.aligned.m64n176k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%43: accumulators d00..d43
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"
+      " %44,"  // desc_a
+      " %45,"  // desc_b
+      " p,   %47, %48;\n"  // predicate from scale_D, then immediates scaleA (%47) and scaleB (%48)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
+      :  "l"(desc_a),  // %44
+         "l"(desc_b),  // %45
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %46, %47, %48
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x32 TN F16+=E5M2*E5M2 (RS variant: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an immediate ("n") operand
+>
+struct MMA_64x176x32_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // declared void; results come back through the read-write accumulators below
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[44];  // 44 32-bit accumulator registers (d00..d43); presumably two packed f16 values each -- TODO confirm
+
+  CUTE_HOST_DEVICE static void  // fma(): issues one wgmma m64n176k32 instruction; d00..d43 are updated in place
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when SM90A MMA support is enabled
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined outside this chunk); gets the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %49, 0;\n"  // p = (scale_D != 0); %49 is the scale_D input operand
+      "wgmma.mma_async.sync.aligned.m64n176k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%43: accumulators d00..d43
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"
+      "{%44, %45, %46, %47},"  // A registers a00..a03
+      " %48,"  // desc_b
+      " p,   %50, %51;\n"  // predicate from scale_D, then immediates scaleA (%50) and scaleB (%51)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %44..%47
+         "l"(desc_b),  // %48
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %49, %50, %51
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x32 TN F32+=E5M2*E5M2 (SS variant: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an immediate ("n") operand
+>
+struct MMA_64x176x32_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // declared void; results come back through the read-write accumulators below
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor (desc_a)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = float[88];  // 88 float accumulators, one per d00..d87 argument of fma()
+
+  CUTE_HOST_DEVICE static void  // fma(): issues one wgmma m64n176k32 instruction; d00..d87 are updated in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when SM90A MMA support is enabled
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined outside this chunk); gets the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %90, 0;\n"  // p = (scale_D != 0); %90 is the scale_D input operand
+      "wgmma.mma_async.sync.aligned.m64n176k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%87: accumulators d00..d87
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      " %88,"  // desc_a
+      " %89,"  // desc_b
+      " p,    %91,  %92;\n"  // predicate from scale_D, then immediates scaleA (%91) and scaleB (%92)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": each accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
+      :  "l"(desc_a),  // %88
+         "l"(desc_b),  // %89
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %90, %91, %92
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x176x32 TN F32+=E5M2*E5M2 (RS variant: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an immediate ("n") operand
+>
+struct MMA_64x176x32_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // declared void; results come back through the read-write accumulators below
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = float[88];  // 88 float accumulators, one per d00..d87 argument of fma()
+
+  CUTE_HOST_DEVICE static void  // fma(): issues one wgmma m64n176k32 instruction; d00..d87 are updated in place
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when SM90A MMA support is enabled
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined outside this chunk); gets the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %93, 0;\n"  // p = (scale_D != 0); %93 is the scale_D input operand
+      "wgmma.mma_async.sync.aligned.m64n176k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%87: accumulators d00..d87
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      "{%88,  %89,  %90,  %91},"  // A registers a00..a03
+      " %92,"  // desc_b
+      " p,    %94,  %95;\n"  // predicate from scale_D, then immediates scaleA (%94) and scaleB (%95)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": each accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %88..%91
+         "l"(desc_b),  // %92
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %93, %94, %95
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x184x32 TN F16+=E5M2*E5M2 (SS variant: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an immediate ("n") operand
+>
+struct MMA_64x184x32_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // declared void; results come back through the read-write accumulators below
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor (desc_a)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[46];  // 46 32-bit accumulator registers (d00..d45); presumably two packed f16 values each -- TODO confirm
+
+  CUTE_HOST_DEVICE static void  // fma(): issues one wgmma m64n184k32 instruction; d00..d45 are updated in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when SM90A MMA support is enabled
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined outside this chunk); gets the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %48, 0;\n"  // p = (scale_D != 0); %48 is the scale_D input operand
+      "wgmma.mma_async.sync.aligned.m64n184k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%45: accumulators d00..d45
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45},"
+      " %46,"  // desc_a
+      " %47,"  // desc_b
+      " p,   %49, %50;\n"  // predicate from scale_D, then immediates scaleA (%49) and scaleB (%50)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45)
+      :  "l"(desc_a),  // %46
+         "l"(desc_b),  // %47
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %48, %49, %50
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x184x32 TN F16+=E5M2*E5M2 (RS variant: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // forwarded to the asm as an immediate ("n") operand
+>
+struct MMA_64x184x32_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // declared void; results come back through the read-write accumulators below
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[46];  // 46 32-bit accumulator registers (d00..d45); presumably two packed f16 values each -- TODO confirm
+
+  CUTE_HOST_DEVICE static void  // fma(): issues one wgmma m64n184k32 instruction; d00..d45 are updated in place
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when SM90A MMA support is enabled
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined outside this chunk); gets the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %51, 0;\n"  // p = (scale_D != 0); %51 is the scale_D input operand
+      "wgmma.mma_async.sync.aligned.m64n184k32.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%45: accumulators d00..d45
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45},"
+      "{%46, %47, %48, %49},"  // A registers a00..a03
+      " %50,"  // desc_b
+      " p,   %52, %53;\n"  // predicate from scale_D, then immediates scaleA (%52) and scaleB (%53)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %46..%49
+         "l"(desc_b),  // %50
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %51, %52, %53
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x184x32 TN F32+=E5M2*E5M2 (SS form: A and B are each passed as one 64-bit operand, desc_a/desc_b; 92 float accumulators are read and written in place)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x184x32_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit A operand (desc_a in fma)
+  using BRegisters = uint64_t[1];  // one 64-bit B operand (desc_b in fma)
+  using CRegisters = float[92];    // accumulators d00..d91, bound read-write ("+f") in the asm below
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %94, 0;\n"  // p = (scale_D != 0); %94 is the "r" input after the two "l" inputs
+      "wgmma.mma_async.sync.aligned.m64n184k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91},"
+      " %92,"  // desc_a
+      " %93,"  // desc_b
+      " p,    %95,  %96;\n"  // p, scaleA, scaleB (the latter two are "n" compile-time immediates)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the guard macro is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x184x32 TN F32+=E5M2*E5M2 (RS form: A is passed as four 32-bit registers a00..a03, B as one 64-bit operand desc_b; 92 float accumulators are read and written in place)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x184x32_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // four 32-bit A registers (a00..a03 in fma)
+  using BRegisters = uint64_t[1];  // one 64-bit B operand (desc_b in fma)
+  using CRegisters = float[92];    // accumulators d00..d91, bound read-write ("+f") in the asm below
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %97, 0;\n"  // p = (scale_D != 0); %97 follows the four A registers and desc_b
+      "wgmma.mma_async.sync.aligned.m64n184k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91},"
+      "{%92,  %93,  %94,  %95},"  // a00..a03
+      " %96,"  // desc_b
+      " p,    %98,  %99;\n"  // p, scaleA, scaleB (the latter two are "n" compile-time immediates)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the guard macro is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x200x32 TN F16+=E5M2*E5M2 (SS form: A and B are each passed as one 64-bit operand, desc_a/desc_b; 50 32-bit accumulator registers are read and written in place)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x200x32_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];   // one 64-bit A operand (desc_a in fma)
+  using BRegisters = uint64_t[1];   // one 64-bit B operand (desc_b in fma)
+  using CRegisters = uint32_t[50];  // accumulators d00..d49, bound read-write ("+r"); presumably two packed f16 each - TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %52, 0;\n"  // p = (scale_D != 0); %52 is the "r" input after the two "l" inputs
+      "wgmma.mma_async.sync.aligned.m64n200k32.f16.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49},"
+      " %50,"  // desc_a
+      " %51,"  // desc_b
+      " p,    %53,  %54;\n"  // p, scaleA, scaleB (the latter two are "n" compile-time immediates)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the guard macro is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x200x32 TN F16+=E5M2*E5M2 (RS form: A is passed as four 32-bit registers a00..a03, B as one 64-bit operand desc_b; 50 32-bit accumulator registers are read and written in place)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x200x32_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];   // four 32-bit A registers (a00..a03 in fma)
+  using BRegisters = uint64_t[1];   // one 64-bit B operand (desc_b in fma)
+  using CRegisters = uint32_t[50];  // accumulators d00..d49, bound read-write ("+r"); presumably two packed f16 each - TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %55, 0;\n"  // p = (scale_D != 0); %55 follows the four A registers and desc_b
+      "wgmma.mma_async.sync.aligned.m64n200k32.f16.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49},"
+      "{%50,  %51,  %52,  %53},"  // a00..a03
+      " %54,"  // desc_b
+      " p,    %56,  %57;\n"  // p, scaleA, scaleB (the latter two are "n" compile-time immediates)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the guard macro is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x200x32 TN F32+=E5M2*E5M2 (SS form: A and B are each passed as one 64-bit operand, desc_a/desc_b; 100 float accumulators are read and written in place)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x200x32_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit A operand (desc_a in fma)
+  using BRegisters = uint64_t[1];  // one 64-bit B operand (desc_b in fma)
+  using CRegisters = float[100];   // accumulators d000..d099, bound read-write ("+f") in the asm below
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %102, 0;\n"  // p = (scale_D != 0); %102 is the "r" input after the two "l" inputs
+      "wgmma.mma_async.sync.aligned.m64n200k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99},"
+      " %100,"  // desc_a
+      " %101,"  // desc_b
+      " p,    %103, %104;\n"  // p, scaleA, scaleB (the latter two are "n" compile-time immediates)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the guard macro is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x200x32 TN F32+=E5M2*E5M2 (RS form: A is passed as four 32-bit registers a000..a003, B as one 64-bit operand desc_b; 100 float accumulators are read and written in place)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x200x32_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // four 32-bit A registers (a000..a003 in fma)
+  using BRegisters = uint64_t[1];  // one 64-bit B operand (desc_b in fma)
+  using CRegisters = float[100];   // accumulators d000..d099, bound read-write ("+f") in the asm below
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %105, 0;\n"  // p = (scale_D != 0); %105 follows the four A registers and desc_b
+      "wgmma.mma_async.sync.aligned.m64n200k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99},"
+      "{%100, %101, %102, %103},"  // a000..a003
+      " %104,"  // desc_b
+      " p,    %106, %107;\n"  // p, scaleA, scaleB (the latter two are "n" compile-time immediates)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the guard macro is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x32 TN F16+=E5M2*E5M2 (SS form: A and B are each passed as one 64-bit operand, desc_a/desc_b; 52 32-bit accumulator registers are read and written in place)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x208x32_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];   // one 64-bit A operand (desc_a in fma)
+  using BRegisters = uint64_t[1];   // one 64-bit B operand (desc_b in fma)
+  using CRegisters = uint32_t[52];  // accumulators d00..d51, bound read-write ("+r"); presumably two packed f16 each - TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %54, 0;\n"  // p = (scale_D != 0); %54 is the "r" input after the two "l" inputs
+      "wgmma.mma_async.sync.aligned.m64n208k32.f16.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"
+      " %52,"  // desc_a
+      " %53,"  // desc_b
+      " p,    %55,  %56;\n"  // p, scaleA, scaleB (the latter two are "n" compile-time immediates)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the guard macro is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x32 TN F16+=E5M2*E5M2 -- A passed in four 32-bit registers, B passed as a 64-bit descriptor, 52 uint32_t accumulators
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // bound as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // bound as an immediate ("n") asm operand
+>
+struct MMA_64x208x32_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];   // a00..a03
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = uint32_t[52];  // d00..d51, both read and written by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %57, 0;\n"  // %57 = int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n208k32.f16.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"  // %0..%51 = d00..d51 ("+r": in/out)
+      "{%52,  %53,  %54,  %55},"  // a00..a03
+      " %56,"  // desc_b
+      " p,    %58,  %59;\n"  // %58 = scaleA, %59 = scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // no SM90A support compiled in: any call is flagged as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x32 TN F32+=E5M2*E5M2 -- A and B both passed as 64-bit descriptors, 104 float accumulators
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // bound as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // bound as an immediate ("n") asm operand
+>
+struct MMA_64x208x32_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[104];   // d000..d103, both read and written by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %106, 0;\n"  // %106 = int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n208k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"  // %0..%103 = d000..d103 ("+f": in/out)
+      " %104,"  // desc_a
+      " %105,"  // desc_b
+      " p,    %107, %108;\n"  // %107 = scaleA, %108 = scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // no SM90A support compiled in: any call is flagged as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x208x32 TN F32+=E5M2*E5M2 -- A passed in four 32-bit registers, B passed as a 64-bit descriptor, 104 float accumulators
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // bound as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // bound as an immediate ("n") asm operand
+>
+struct MMA_64x208x32_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a000..a003
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[104];   // d000..d103, both read and written by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %109, 0;\n"  // %109 = int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n208k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"  // %0..%103 = d000..d103 ("+f": in/out)
+      "{%104, %105, %106, %107},"  // a000..a003
+      " %108,"  // desc_b
+      " p,    %110, %111;\n"  // %110 = scaleA, %111 = scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // no SM90A support compiled in: any call is flagged as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x216x32 TN F16+=E5M2*E5M2 -- A and B both passed as 64-bit descriptors, 54 uint32_t accumulators
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // bound as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // bound as an immediate ("n") asm operand
+>
+struct MMA_64x216x32_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];   // desc_a
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = uint32_t[54];  // d00..d53, both read and written by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %56, 0;\n"  // %56 = int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n216k32.f16.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53},"  // %0..%53 = d00..d53 ("+r": in/out)
+      " %54,"  // desc_a
+      " %55,"  // desc_b
+      " p,    %57,  %58;\n"  // %57 = scaleA, %58 = scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // no SM90A support compiled in: any call is flagged as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x216x32 TN F16+=E5M2*E5M2 -- A passed in four 32-bit registers, B passed as a 64-bit descriptor, 54 uint32_t accumulators
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // bound as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // bound as an immediate ("n") asm operand
+>
+struct MMA_64x216x32_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];   // a00..a03
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = uint32_t[54];  // d00..d53, both read and written by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %59, 0;\n"  // %59 = int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n216k32.f16.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53},"  // %0..%53 = d00..d53 ("+r": in/out)
+      "{%54,  %55,  %56,  %57},"  // a00..a03
+      " %58,"  // desc_b
+      " p,    %60,  %61;\n"  // %60 = scaleA, %61 = scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // no SM90A support compiled in: any call is flagged as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x216x32 TN F32+=E5M2*E5M2 -- A and B both passed as 64-bit descriptors, 108 float accumulators
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // bound as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // bound as an immediate ("n") asm operand
+>
+struct MMA_64x216x32_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[108];   // d000..d107, both read and written by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %110, 0;\n"  // %110 = int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sync.aligned.m64n216k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107},"  // %0..%107 = d000..d107 ("+f": in/out)
+      " %108,"  // desc_a
+      " %109,"  // desc_b
+      " p,    %111, %112;\n"  // %111 = scaleA, %112 = scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // no SM90A support compiled in: any call is flagged as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x216x32 TN F32+=E5M2*E5M2
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x216x32_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[108];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %113, 0;\n"
+      "wgmma.mma_async.sync.aligned.m64n216k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107},"
+      "{%108, %109, %110, %111},"
+      " %112,"
+      " p,    %114, %115;\n"
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x32 TN F16+=E5M2*E5M2 (SS: A and B are each supplied as one 64-bit descriptor). Thin wrapper around a single wgmma.mma_async m64n224k32 PTX instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; emitted as "n" immediate operand %59
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time value; emitted as "n" immediate operand %60
+>
+struct MMA_64x224x32_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;          // no separate D operand: the d.. registers below are read-write ("+r")
+  using ARegisters = uint64_t[1];   // A is passed as one 64-bit value (desc_a)
+  using BRegisters = uint64_t[1];   // B is passed as one 64-bit value (desc_b)
+  using CRegisters = uint32_t[56];  // 56 x 32-bit accumulators d00..d55; NOTE(review): presumably two packed f16 per register -- confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // asm operand %56
+      uint64_t const& desc_b,  // asm operand %57
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; converted to predicate p in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scoped predicate register
+      "setp.ne.b32 p, %58, 0;\n"  // p = (scale_D != 0); %58 is the "r" input that follows the two descriptors
+      "wgmma.mma_async.sync.aligned.m64n224k32.f16.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      " %56,"  // desc_a
+      " %57,"  // desc_b
+      " p,    %59,  %60;\n"  // scale-D predicate, then scaleA / scaleB immediates; NOTE(review): PTX ISA says p==false drops the old accumulator -- confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%55: accumulators, read and written in place
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "l"(desc_a),  // %56
+         "l"(desc_b),  // %57
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %58 (runtime register), %59 and %60 (compile-time immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: calling fma() is reported as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x32 TN F16+=E5M2*E5M2 (RS: A is supplied in four 32-bit registers, B as one 64-bit descriptor). Thin wrapper around a single wgmma.mma_async m64n224k32 PTX instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; emitted as "n" immediate operand %62
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time value; emitted as "n" immediate operand %63
+>
+struct MMA_64x224x32_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;          // no separate D operand: the d.. registers below are read-write ("+r")
+  using ARegisters = uint32_t[4];   // A is passed as four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];   // B is passed as one 64-bit value (desc_b)
+  using CRegisters = uint32_t[56];  // 56 x 32-bit accumulators d00..d55; NOTE(review): presumably two packed f16 per register -- confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // asm operands %56..%59
+      uint64_t const& desc_b,  // asm operand %60
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; converted to predicate p in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives this source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scoped predicate register
+      "setp.ne.b32 p, %61, 0;\n"  // p = (scale_D != 0); %61 is the "r" input that follows desc_b
+      "wgmma.mma_async.sync.aligned.m64n224k32.f16.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      "{%56,  %57,  %58,  %59},"  // a00..a03
+      " %60,"  // desc_b
+      " p,    %62,  %63;\n"  // scale-D predicate, then scaleA / scaleB immediates; NOTE(review): PTX ISA says p==false drops the old accumulator -- confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%55: accumulators, read and written in place
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %56..%59
+         "l"(desc_b),  // %60
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %61 (runtime register), %62 and %63 (compile-time immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: calling fma() is reported as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x32 TN F32+=E5M2*E5M2 (SS: A and B are each supplied as one 64-bit descriptor). Thin wrapper around a single wgmma.mma_async m64n224k32 PTX instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; emitted as "n" immediate operand %115
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time value; emitted as "n" immediate operand %116
+>
+struct MMA_64x224x32_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;         // no separate D operand: the d.. registers below are read-write ("+f")
+  using ARegisters = uint64_t[1];  // A is passed as one 64-bit value (desc_a)
+  using BRegisters = uint64_t[1];  // B is passed as one 64-bit value (desc_b)
+  using CRegisters = float[112];   // 112 float accumulators d000..d111
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // asm operand %112
+      uint64_t const& desc_b,  // asm operand %113
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; converted to predicate p in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scoped predicate register
+      "setp.ne.b32 p, %114, 0;\n"  // p = (scale_D != 0); %114 is the "r" input that follows the two descriptors
+      "wgmma.mma_async.sync.aligned.m64n224k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      " %112,"  // desc_a
+      " %113,"  // desc_b
+      " p,    %115, %116;\n"  // scale-D predicate, then scaleA / scaleB immediates; NOTE(review): PTX ISA says p==false drops the old accumulator -- confirm
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // %0..%111: accumulators, read and written in place
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
+      :  "l"(desc_a),  // %112
+         "l"(desc_b),  // %113
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %114 (runtime register), %115 and %116 (compile-time immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: calling fma() is reported as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x224x32 TN F32+=E5M2*E5M2 (RS: A is supplied in four 32-bit registers, B as one 64-bit descriptor). Thin wrapper around a single wgmma.mma_async m64n224k32 PTX instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; emitted as "n" immediate operand %118
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time value; emitted as "n" immediate operand %119
+>
+struct MMA_64x224x32_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;         // no separate D operand: the d.. registers below are read-write ("+f")
+  using ARegisters = uint32_t[4];  // A is passed as four 32-bit registers (a000..a003)
+  using BRegisters = uint64_t[1];  // B is passed as one 64-bit value (desc_b)
+  using CRegisters = float[112];   // 112 float accumulators d000..d111
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // asm operands %112..%115
+      uint64_t const& desc_b,  // asm operand %116
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; converted to predicate p in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives this source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scoped predicate register
+      "setp.ne.b32 p, %117, 0;\n"  // p = (scale_D != 0); %117 is the "r" input that follows desc_b
+      "wgmma.mma_async.sync.aligned.m64n224k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      "{%112, %113, %114, %115},"  // a000..a003
+      " %116,"  // desc_b
+      " p,    %118, %119;\n"  // scale-D predicate, then scaleA / scaleB immediates; NOTE(review): PTX ISA says p==false drops the old accumulator -- confirm
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // %0..%111: accumulators, read and written in place
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %112..%115
+         "l"(desc_b),  // %116
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %117 (runtime register), %118 and %119 (compile-time immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: calling fma() is reported as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x232x32 TN F16+=E5M2*E5M2 (SS: A and B are each supplied as one 64-bit descriptor). Thin wrapper around a single wgmma.mma_async m64n232k32 PTX instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; emitted as "n" immediate operand %61
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time value; emitted as "n" immediate operand %62
+>
+struct MMA_64x232x32_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;          // no separate D operand: the d.. registers below are read-write ("+r")
+  using ARegisters = uint64_t[1];   // A is passed as one 64-bit value (desc_a)
+  using BRegisters = uint64_t[1];   // B is passed as one 64-bit value (desc_b)
+  using CRegisters = uint32_t[58];  // 58 x 32-bit accumulators d00..d57; NOTE(review): presumably two packed f16 per register -- confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // asm operand %58
+      uint64_t const& desc_b,  // asm operand %59
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; converted to predicate p in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scoped predicate register
+      "setp.ne.b32 p, %60, 0;\n"  // p = (scale_D != 0); %60 is the "r" input that follows the two descriptors
+      "wgmma.mma_async.sync.aligned.m64n232k32.f16.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57},"
+      " %58,"  // desc_a
+      " %59,"  // desc_b
+      " p,    %61,  %62;\n"  // scale-D predicate, then scaleA / scaleB immediates; NOTE(review): PTX ISA says p==false drops the old accumulator -- confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%57: accumulators, read and written in place
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57)
+      :  "l"(desc_a),  // %58
+         "l"(desc_b),  // %59
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %60 (runtime register), %61 and %62 (compile-time immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: calling fma() is reported as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x232x32 TN F16+=E5M2*E5M2 (RS: A is supplied in four 32-bit registers, B as one 64-bit descriptor). Thin wrapper around a single wgmma.mma_async m64n232k32 PTX instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; emitted as "n" immediate operand %64
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // compile-time value; emitted as "n" immediate operand %65
+>
+struct MMA_64x232x32_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;          // no separate D operand: the d.. registers below are read-write ("+r")
+  using ARegisters = uint32_t[4];   // A is passed as four 32-bit registers (a00..a03)
+  using BRegisters = uint64_t[1];   // B is passed as one 64-bit value (desc_b)
+  using CRegisters = uint32_t[58];  // 58 x 32-bit accumulators d00..d57; NOTE(review): presumably two packed f16 per register -- confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // asm operands %58..%61
+      uint64_t const& desc_b,  // asm operand %62
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; converted to predicate p in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives this source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scoped predicate register
+      "setp.ne.b32 p, %63, 0;\n"  // p = (scale_D != 0); %63 is the "r" input that follows desc_b
+      "wgmma.mma_async.sync.aligned.m64n232k32.f16.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57},"
+      "{%58,  %59,  %60,  %61},"  // a00..a03
+      " %62,"  // desc_b
+      " p,    %64,  %65;\n"  // scale-D predicate, then scaleA / scaleB immediates; NOTE(review): PTX ISA says p==false drops the old accumulator -- confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%57: accumulators, read and written in place
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %58..%61
+         "l"(desc_b),  // %62
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %63 (runtime register), %64 and %65 (compile-time immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: calling fma() is reported as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x232x32 TN F32+=E5M2*E5M2 (SS: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as compile-time immediate operand %119
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // emitted as compile-time immediate operand %120
+>
+struct MMA_64x232x32_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;         // no separate D list; fma() updates the C accumulators in place ("+f")
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for A (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = float[116];   // 116 float accumulators, one per d000..d115 parameter of fma()
+
+  CUTE_HOST_DEVICE static void  // wraps a single wgmma.mma_async m64n232k32 f32.e5m2.e5m2 instruction
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %118, 0;\n"  // p = (int32_t(scale_D) != 0); %118 is the first operand after desc_b
+      "wgmma.mma_async.sync.aligned.m64n232k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115},"
+      " %116,"  // desc_a
+      " %117,"  // desc_b
+      " p,    %119, %120;\n"  // predicate p, then immediates scaleA and scaleB; NOTE(review): p is the PTX scale-d operand — confirm semantics in the PTX ISA
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%115: read-write float accumulators
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
+      :  "l"(desc_a),  // %116: 64-bit input
+         "l"(desc_b),  // %117: 64-bit input
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %118 runtime register; %119, %120 compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // branch used when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x232x32 TN F32+=E5M2*E5M2 (RS: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as compile-time immediate operand %122
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // emitted as compile-time immediate operand %123
+>
+struct MMA_64x232x32_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;         // no separate D list; fma() updates the C accumulators in place ("+f")
+  using ARegisters = uint32_t[4];  // four 32-bit registers for A (a000..a003)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = float[116];   // 116 float accumulators, one per d000..d115 parameter of fma()
+
+  CUTE_HOST_DEVICE static void  // wraps a single wgmma.mma_async m64n232k32 f32.e5m2.e5m2 instruction
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %121, 0;\n"  // p = (int32_t(scale_D) != 0); %121 is the first operand after desc_b
+      "wgmma.mma_async.sync.aligned.m64n232k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115},"
+      "{%116, %117, %118, %119},"  // a000..a003
+      " %120,"  // desc_b
+      " p,    %122, %123;\n"  // predicate p, then immediates scaleA and scaleB; NOTE(review): p is the PTX scale-d operand — confirm semantics in the PTX ISA
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%115: read-write float accumulators
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %116..%119: 32-bit register inputs
+         "l"(desc_b),  // %120: 64-bit input
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %121 runtime register; %122, %123 compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // branch used when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x32 TN F16+=E5M2*E5M2 (SS: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as compile-time immediate operand %63
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // emitted as compile-time immediate operand %64
+>
+struct MMA_64x240x32_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;           // no separate D list; fma() updates the C accumulators in place ("+r")
+  using ARegisters = uint64_t[1];    // one 64-bit descriptor for A (desc_a)
+  using BRegisters = uint64_t[1];    // one 64-bit descriptor for B (desc_b)
+  using CRegisters = uint32_t[60];   // 60 32-bit accumulator registers (d00..d59); presumably two packed f16 each — TODO confirm
+
+  CUTE_HOST_DEVICE static void  // wraps a single wgmma.mma_async m64n240k32 f16.e5m2.e5m2 instruction
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %62, 0;\n"  // p = (int32_t(scale_D) != 0); %62 is the first operand after desc_b
+      "wgmma.mma_async.sync.aligned.m64n240k32.f16.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"
+      " %60,"  // desc_a
+      " %61,"  // desc_b
+      " p,    %63,  %64;\n"  // predicate p, then immediates scaleA and scaleB; NOTE(review): p is the PTX scale-d operand — confirm semantics in the PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%59: read-write 32-bit accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
+      :  "l"(desc_a),  // %60: 64-bit input
+         "l"(desc_b),  // %61: 64-bit input
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %62 runtime register; %63, %64 compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // branch used when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x32 TN F16+=E5M2*E5M2 (RS: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as compile-time immediate operand %66
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // emitted as compile-time immediate operand %67
+>
+struct MMA_64x240x32_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;           // no separate D list; fma() updates the C accumulators in place ("+r")
+  using ARegisters = uint32_t[4];    // four 32-bit registers for A (a00..a03)
+  using BRegisters = uint64_t[1];    // one 64-bit descriptor for B (desc_b)
+  using CRegisters = uint32_t[60];   // 60 32-bit accumulator registers (d00..d59); presumably two packed f16 each — TODO confirm
+
+  CUTE_HOST_DEVICE static void  // wraps a single wgmma.mma_async m64n240k32 f16.e5m2.e5m2 instruction
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %65, 0;\n"  // p = (int32_t(scale_D) != 0); %65 is the first operand after desc_b
+      "wgmma.mma_async.sync.aligned.m64n240k32.f16.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"
+      "{%60,  %61,  %62,  %63},"  // a00..a03
+      " %64,"  // desc_b
+      " p,    %66,  %67;\n"  // predicate p, then immediates scaleA and scaleB; NOTE(review): p is the PTX scale-d operand — confirm semantics in the PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%59: read-write 32-bit accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %60..%63: 32-bit register inputs
+         "l"(desc_b),  // %64: 64-bit input
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %65 runtime register; %66, %67 compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // branch used when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x32 TN F32+=E5M2*E5M2 (SS: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as compile-time immediate operand %123
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // emitted as compile-time immediate operand %124
+>
+struct MMA_64x240x32_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;         // no separate D list; fma() updates the C accumulators in place ("+f")
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for A (desc_a)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = float[120];   // 120 float accumulators, one per d000..d119 parameter of fma()
+
+  CUTE_HOST_DEVICE static void  // wraps a single wgmma.mma_async m64n240k32 f32.e5m2.e5m2 instruction
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %122, 0;\n"  // p = (int32_t(scale_D) != 0); %122 is the first operand after desc_b
+      "wgmma.mma_async.sync.aligned.m64n240k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      " %120,"  // desc_a
+      " %121,"  // desc_b
+      " p,    %123, %124;\n"  // predicate p, then immediates scaleA and scaleB; NOTE(review): p is the PTX scale-d operand — confirm semantics in the PTX ISA
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%119: read-write float accumulators
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
+      :  "l"(desc_a),  // %120: 64-bit input
+         "l"(desc_b),  // %121: 64-bit input
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %122 runtime register; %123, %124 compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // branch used when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x240x32 TN F32+=E5M2*E5M2 (RS: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as compile-time immediate operand %126
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // emitted as compile-time immediate operand %127
+>
+struct MMA_64x240x32_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;         // no separate D list; fma() updates the C accumulators in place ("+f")
+  using ARegisters = uint32_t[4];  // four 32-bit registers for A (a000..a003)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = float[120];   // 120 float accumulators, one per d000..d119 parameter of fma()
+
+  CUTE_HOST_DEVICE static void  // wraps a single wgmma.mma_async m64n240k32 f32.e5m2.e5m2 instruction
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %125, 0;\n"  // p = (int32_t(scale_D) != 0); %125 is the first operand after desc_b
+      "wgmma.mma_async.sync.aligned.m64n240k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      "{%120, %121, %122, %123},"  // a000..a003
+      " %124,"  // desc_b
+      " p,    %126, %127;\n"  // predicate p, then immediates scaleA and scaleB; NOTE(review): p is the PTX scale-d operand — confirm semantics in the PTX ISA
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%119: read-write float accumulators
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %120..%123: 32-bit register inputs
+         "l"(desc_b),  // %124: 64-bit input
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %125 runtime register; %126, %127 compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // branch used when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x248x32 TN F16+=E5M2*E5M2 -- SS variant: A and B are both passed as 64-bit descriptors.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x248x32_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: the d* accumulators are read-write ("+r") asm operands
+  using ARegisters = uint64_t[1];  // desc_a (presumably a shared-memory matrix descriptor -- confirm)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[62];  // 62 x 32-bit accumulators (presumably two packed f16 values each -- confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // __LINE__ is forwarded: inserting lines above changes the value passed
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %64, 0;\n"  // p = (scale_D != 0); %64 is the first operand after the two descriptors
+      "wgmma.mma_async.sync.aligned.m64n248k32.f16.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61},"
+      " %62,"  // desc_a
+      " %63,"  // desc_b
+      " p,    %65,  %66;\n"  // predicate from scale_D, then scaleA and scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D is a runtime register; scaleA/scaleB are immediates from the template parameters
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x248x32 TN F16+=E5M2*E5M2 -- RS variant: A is passed in four 32-bit registers, B as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x248x32_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: the d* accumulators are read-write ("+r") asm operands
+  using ARegisters = uint32_t[4];  // a00..a03, passed as "r" register operands
+  using BRegisters = uint64_t[1];  // desc_b (presumably a shared-memory matrix descriptor -- confirm)
+  using CRegisters = uint32_t[62];  // 62 x 32-bit accumulators (presumably two packed f16 values each -- confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // __LINE__ is forwarded: inserting lines above changes the value passed
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %67, 0;\n"  // p = (scale_D != 0); %67 follows the 62 accumulators, 4 A registers and desc_b
+      "wgmma.mma_async.sync.aligned.m64n248k32.f16.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61},"
+      "{%62,  %63,  %64,  %65},"  // a00..a03
+      " %66,"  // desc_b
+      " p,    %68,  %69;\n"  // predicate from scale_D, then scaleA and scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D is a runtime register; scaleA/scaleB are immediates from the template parameters
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x248x32 TN F32+=E5M2*E5M2 -- SS variant: A and B are both passed as 64-bit descriptors.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x248x32_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: the d* accumulators are read-write ("+f") asm operands
+  using ARegisters = uint64_t[1];  // desc_a (presumably a shared-memory matrix descriptor -- confirm)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[124];  // 124 float accumulators, d000..d123
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // __LINE__ is forwarded: inserting lines above changes the value passed
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %126, 0;\n"  // p = (scale_D != 0); %126 follows the 124 accumulators and two descriptors
+      "wgmma.mma_async.sync.aligned.m64n248k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123},"
+      " %124,"  // desc_a
+      " %125,"  // desc_b
+      " p,    %127, %128;\n"  // predicate from scale_D, then scaleA and scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D is a runtime register; scaleA/scaleB are immediates from the template parameters
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA 64x248x32 TN F32+=E5M2*E5M2 -- RS variant: A is passed in four 32-bit registers, B as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+struct MMA_64x248x32_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: the d* accumulators are read-write ("+f") asm operands
+  using ARegisters = uint32_t[4];  // a000..a003, passed as "r" register operands
+  using BRegisters = uint64_t[1];  // desc_b (presumably a shared-memory matrix descriptor -- confirm)
+  using CRegisters = float[124];  // 124 float accumulators, d000..d123
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // __LINE__ is forwarded: inserting lines above changes the value passed
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %129, 0;\n"  // p = (scale_D != 0); %129 follows the 124 accumulators, 4 A registers and desc_b
+      "wgmma.mma_async.sync.aligned.m64n248k32.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123},"
+      "{%124, %125, %126, %127},"  // a000..a003
+      " %128,"  // desc_b
+      " p,    %130, %131;\n"  // predicate from scale_D, then scaleA and scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D is a runtime register; scaleA/scaleB are immediates from the template parameters
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace SM90::GMMA
+
+} // namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/arch/mma_sm90_gmma_sparse.hpp b/lightllm-kernel/cutlass/include/cute/arch/mma_sm90_gmma_sparse.hpp
new file mode 100755
index 000000000..ecca91b93
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/arch/mma_sm90_gmma_sparse.hpp
@@ -0,0 +1,22743 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include <cute/config.hpp>                 // CUTE_HOST_DEVICE
+#include <cute/arch/mma_sm90_gmma.hpp>     // GMMA::Major, etc.
+
+namespace cute {
+
+namespace SM90::GMMA::SPARSE {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// GMMA PTX definitions:  C = (scaleA * A) * (scaleB * B) + (scaleD * C)
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x32 F16+=F16*F16 -- SS variant: A and B are passed as 64-bit descriptors, plus the 32-bit operand e.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x8x32_F16F16F16_SS
+{
+  using DRegisters = void;  // no separate D operand: d0/d1 are read-write ("+r") asm operands
+  using ARegisters = uint64_t[1];  // desc_a (presumably a shared-memory matrix descriptor -- confirm)
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- confirm)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[2];  // 2 x 32-bit accumulators (presumably two packed f16 values each -- confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // __LINE__ is forwarded: inserting lines above changes the value passed
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %6, 0;\n"  // p = (scale_D != 0); %6 follows d0/d1, both descriptors, e and spsel
+      "wgmma.mma_async.sp.sync.aligned.m64n8k32.f16.f16.f16 "
+      "{%0, %1},"
+      " %2,"  // desc_a
+      " %3,"  // desc_b
+      " %4, %5,"  // e, spsel
+      " p,  %7, %8, %9, %10;\n"  // predicate from scale_D, then scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d0), "+r"(d1)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // only scale_D is a runtime register; the rest are immediates from the template parameters
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x32 F16+=F16*F16 -- RS variant: A is passed in four 32-bit registers, B as a 64-bit descriptor, plus the 32-bit operand e.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x8x32_F16F16F16_RS
+{
+  using DRegisters = void;  // no separate D operand: d0/d1 are read-write ("+r") asm operands
+  using ARegisters = uint32_t[4];  // a0..a3, passed as "r" register operands
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- confirm)
+  using BRegisters = uint64_t[1];  // desc_b (presumably a shared-memory matrix descriptor -- confirm)
+  using CRegisters = uint32_t[2];  // 2 x 32-bit accumulators (presumably two packed f16 values each -- confirm)
+
+  static_assert(tnspA == GMMA::Major::K,  // tnspA is only validated here; it is not forwarded to the asm below
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // __LINE__ is forwarded: inserting lines above changes the value passed
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %9, 0;\n"  // p = (scale_D != 0); %9 follows d0/d1, a0..a3, desc_b, e and spsel
+      "wgmma.mma_async.sp.sync.aligned.m64n8k32.f16.f16.f16 "
+      "{%0,  %1},"
+      "{%2,  %3,  %4,  %5},"  // a0..a3
+      " %6,"  // desc_b
+      " %7, %8,"  // e, spsel
+      " p,   %10, %11, %12;\n"  // predicate from scale_D, then scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d0), "+r"(d1)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // only scale_D is a runtime register; the rest are immediates from the template parameters
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x32 F16+=F16*F16 -- SS variant: A and B are passed as 64-bit descriptors, plus the 32-bit operand e.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x16x32_F16F16F16_SS
+{
+  using DRegisters = void;  // no separate D operand: d0..d3 are read-write ("+r") asm operands
+  using ARegisters = uint64_t[1];  // desc_a (presumably a shared-memory matrix descriptor -- confirm)
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- confirm)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[4];  // 4 x 32-bit accumulators (presumably two packed f16 values each -- confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // __LINE__ is forwarded: inserting lines above changes the value passed
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %8, 0;\n"  // p = (scale_D != 0); %8 follows d0..d3, both descriptors, e and spsel
+      "wgmma.mma_async.sp.sync.aligned.m64n16k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3},"
+      " %4,"  // desc_a
+      " %5,"  // desc_b
+      " %6, %7,"  // e, spsel
+      " p,   %9,  %10, %11, %12;\n"  // predicate from scale_D, then scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // only scale_D is a runtime register; the rest are immediates from the template parameters
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x32 F16+=F16*F16 -- RS variant: A is passed in four 32-bit registers, B as a 64-bit descriptor, plus the 32-bit operand e.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x16x32_F16F16F16_RS
+{
+  using DRegisters = void;  // no separate D operand: d0..d3 are read-write ("+r") asm operands
+  using ARegisters = uint32_t[4];  // a0..a3, passed as "r" register operands
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- confirm)
+  using BRegisters = uint64_t[1];  // desc_b (presumably a shared-memory matrix descriptor -- confirm)
+  using CRegisters = uint32_t[4];  // 4 x 32-bit accumulators (presumably two packed f16 values each -- confirm)
+
+  static_assert(tnspA == GMMA::Major::K,  // tnspA is only validated here; it is not forwarded to the asm below
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // __LINE__ is forwarded: inserting lines above changes the value passed
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %11, 0;\n"  // p = (scale_D != 0); %11 follows d0..d3, a0..a3, desc_b, e and spsel
+      "wgmma.mma_async.sp.sync.aligned.m64n16k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3},"
+      "{%4,  %5,  %6,  %7},"  // a0..a3
+      " %8,"  // desc_b
+      " %9, %10,"  // e, spsel
+      " p,   %12, %13, %14;\n"  // predicate from scale_D, then scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // only scale_D is a runtime register; the rest are immediates from the template parameters
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x32 F16+=F16*F16 -- SS variant: A and B are passed as 64-bit descriptors, plus the 32-bit operand e.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x32x32_F16F16F16_SS
+{
+  using DRegisters = void;  // no separate D operand: d0..d7 are read-write ("+r") asm operands
+  using ARegisters = uint64_t[1];  // desc_a (presumably a shared-memory matrix descriptor -- confirm)
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- confirm)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[8];  // 8 x 32-bit accumulators (presumably two packed f16 values each -- confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // __LINE__ is forwarded: inserting lines above changes the value passed
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %12, 0;\n"  // p = (scale_D != 0); %12 follows d0..d7, both descriptors, e and spsel
+      "wgmma.mma_async.sp.sync.aligned.m64n32k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
+      " %8,"  // desc_a
+      " %9,"  // desc_b
+      " %10, %11,"  // e, spsel
+      " p,   %13, %14, %15, %16;\n"  // predicate from scale_D, then scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // only scale_D is a runtime register; the rest are immediates from the template parameters
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x32 F16+=F16*F16 (RS form: A in registers a0..a3, B via the 64-bit descriptor desc_b; issues wgmma.mma_async.sp.sync.aligned.m64n32k32.f16.f16.f16)
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not forwarded to the asm
+  GMMA::Major tnspB,  // forwarded as the last immediate asm operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate asm operand placed right after e
+>
+struct GMMA_64x32x32_F16F16F16_RS
+{
+  using DRegisters = void;  // no separate D list: fma() updates the accumulators d0..d7 in place
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers
+  using ERegisters = uint32_t[1];  // e operand: one 32-bit register (presumably the sparsity metadata -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor
+  using CRegisters = uint32_t[8];  // accumulators: eight 32-bit registers
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; becomes predicate p in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %15, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n32k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // d0..d7
+      "{%8,  %9,  %10, %11},"  // a0..a3
+      " %12,"  // desc_b
+      " %13, %14,"  // e, spsel
+      " p,   %16, %17, %18;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),  // "+r": each accumulator is both read and written
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: report an invalid control path instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x32 F16+=F16*F16 (SS form: A and B both via 64-bit descriptors desc_a/desc_b; issues wgmma.mma_async.sp.sync.aligned.m64n64k32.f16.f16.f16)
+template <
+  GMMA::Major tnspA,  // forwarded as an immediate asm operand
+  GMMA::Major tnspB,  // forwarded as the last immediate asm operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate asm operand placed right after e
+>
+struct GMMA_64x64x32_F16F16F16_SS
+{
+  using DRegisters = void;  // no separate D list: fma() updates the accumulators d00..d15 in place
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor
+  using ERegisters = uint32_t[1];  // e operand: one 32-bit register (presumably the sparsity metadata -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor
+  using CRegisters = uint32_t[16];  // accumulators: sixteen 32-bit registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; becomes predicate p in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %20, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n64k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"  // d00..d15
+      " %16,"  // desc_a
+      " %17,"  // desc_b
+      " %18, %19,"  // e, spsel
+      " p,   %21, %22, %23, %24;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: report an invalid control path instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x32 F16+=F16*F16 (RS form: A in registers a00..a03, B via the 64-bit descriptor desc_b; issues wgmma.mma_async.sp.sync.aligned.m64n64k32.f16.f16.f16)
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not forwarded to the asm
+  GMMA::Major tnspB,  // forwarded as the last immediate asm operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate asm operand placed right after e
+>
+struct GMMA_64x64x32_F16F16F16_RS
+{
+  using DRegisters = void;  // no separate D list: fma() updates the accumulators d00..d15 in place
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers
+  using ERegisters = uint32_t[1];  // e operand: one 32-bit register (presumably the sparsity metadata -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor
+  using CRegisters = uint32_t[16];  // accumulators: sixteen 32-bit registers
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; becomes predicate p in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %23, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n64k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"  // d00..d15
+      "{%16, %17, %18, %19},"  // a00..a03
+      " %20,"  // desc_b
+      " %21, %22,"  // e, spsel
+      " p,   %24, %25, %26;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: report an invalid control path instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x32 F16+=F16*F16 (SS form: A and B both via 64-bit descriptors desc_a/desc_b; issues wgmma.mma_async.sp.sync.aligned.m64n96k32.f16.f16.f16)
+template <
+  GMMA::Major tnspA,  // forwarded as an immediate asm operand
+  GMMA::Major tnspB,  // forwarded as the last immediate asm operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate asm operand placed right after e
+>
+struct GMMA_64x96x32_F16F16F16_SS
+{
+  using DRegisters = void;  // no separate D list: fma() updates the accumulators d00..d23 in place
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor
+  using ERegisters = uint32_t[1];  // e operand: one 32-bit register (presumably the sparsity metadata -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor
+  using CRegisters = uint32_t[24];  // accumulators: 24 32-bit registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; becomes predicate p in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %28, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n96k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"  // d00..d23
+      " %24,"  // desc_a
+      " %25,"  // desc_b
+      " %26, %27,"  // e, spsel
+      " p,   %29, %30, %31, %32;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: report an invalid control path instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x32 F16+=F16*F16 (RS form: A in registers a00..a03, B via the 64-bit descriptor desc_b; issues wgmma.mma_async.sp.sync.aligned.m64n96k32.f16.f16.f16)
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not forwarded to the asm
+  GMMA::Major tnspB,  // forwarded as the last immediate asm operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate asm operand placed right after e
+>
+struct GMMA_64x96x32_F16F16F16_RS
+{
+  using DRegisters = void;  // no separate D list: fma() updates the accumulators d00..d23 in place
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers
+  using ERegisters = uint32_t[1];  // e operand: one 32-bit register (presumably the sparsity metadata -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor
+  using CRegisters = uint32_t[24];  // accumulators: 24 32-bit registers
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; becomes predicate p in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %31, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n96k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"  // d00..d23
+      "{%24, %25, %26, %27},"  // a00..a03
+      " %28,"  // desc_b
+      " %29, %30,"  // e, spsel
+      " p,   %32, %33, %34;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: report an invalid control path instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x32 F16+=F16*F16 (SS form: A and B both via 64-bit descriptors desc_a/desc_b; issues wgmma.mma_async.sp.sync.aligned.m64n128k32.f16.f16.f16)
+template <
+  GMMA::Major tnspA,  // forwarded as an immediate asm operand
+  GMMA::Major tnspB,  // forwarded as the last immediate asm operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate asm operand placed right after e
+>
+struct GMMA_64x128x32_F16F16F16_SS
+{
+  using DRegisters = void;  // no separate D list: fma() updates the accumulators d00..d31 in place
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor
+  using ERegisters = uint32_t[1];  // e operand: one 32-bit register (presumably the sparsity metadata -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor
+  using CRegisters = uint32_t[32];  // accumulators: 32 32-bit registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; becomes predicate p in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %36, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n128k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"  // d00..d31
+      " %32,"  // desc_a
+      " %33,"  // desc_b
+      " %34, %35,"  // e, spsel
+      " p,   %37, %38, %39, %40;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: report an invalid control path instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x32 F16+=F16*F16 (RS form: A in registers a00..a03, B via the 64-bit descriptor desc_b; issues wgmma.mma_async.sp.sync.aligned.m64n128k32.f16.f16.f16)
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not forwarded to the asm
+  GMMA::Major tnspB,  // forwarded as the last immediate asm operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate asm operand placed right after e
+>
+struct GMMA_64x128x32_F16F16F16_RS
+{
+  using DRegisters = void;  // no separate D list: fma() updates the accumulators d00..d31 in place
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers
+  using ERegisters = uint32_t[1];  // e operand: one 32-bit register (presumably the sparsity metadata -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor
+  using CRegisters = uint32_t[32];  // accumulators: 32 32-bit registers
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; becomes predicate p in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %39, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n128k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"  // d00..d31
+      "{%32, %33, %34, %35},"  // a00..a03
+      " %36,"  // desc_b
+      " %37, %38,"  // e, spsel
+      " p,   %40, %41, %42;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: report an invalid control path instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x32 F16+=F16*F16 (SS form: A and B both via 64-bit descriptors desc_a/desc_b; issues wgmma.mma_async.sp.sync.aligned.m64n192k32.f16.f16.f16)
+template <
+  GMMA::Major tnspA,  // forwarded as an immediate asm operand
+  GMMA::Major tnspB,  // forwarded as the last immediate asm operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate asm operand placed right after e
+>
+struct GMMA_64x192x32_F16F16F16_SS
+{
+  using DRegisters = void;  // no separate D list: fma() updates the accumulators d00..d47 in place
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor
+  using ERegisters = uint32_t[1];  // e operand: one 32-bit register (presumably the sparsity metadata -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor
+  using CRegisters = uint32_t[48];  // accumulators: 48 32-bit registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; becomes predicate p in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %52, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n192k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"  // d00..d47
+      " %48,"  // desc_a
+      " %49,"  // desc_b
+      " %50, %51,"  // e, spsel
+      " p,   %53, %54, %55, %56;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: report an invalid control path instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x32 F16+=F16*F16 (RS form: A in registers a00..a03, B via the 64-bit descriptor desc_b; issues wgmma.mma_async.sp.sync.aligned.m64n192k32.f16.f16.f16)
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not forwarded to the asm
+  GMMA::Major tnspB,  // forwarded as the last immediate asm operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate asm operand placed right after e
+>
+struct GMMA_64x192x32_F16F16F16_RS
+{
+  using DRegisters = void;  // no separate D list: fma() updates the accumulators d00..d47 in place
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers
+  using ERegisters = uint32_t[1];  // e operand: one 32-bit register (presumably the sparsity metadata -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor
+  using CRegisters = uint32_t[48];  // accumulators: 48 32-bit registers
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; becomes predicate p in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %55, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n192k32.f16.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"  // d00..d47
+      "{%48,  %49,  %50,  %51},"  // a00..a03
+      " %52,"  // desc_b
+      " %53, %54,"  // e, spsel
+      " p,    %56,  %57,  %58;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: report an invalid control path instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x32 F16+=F16*F16 (SS form: A and B both via 64-bit descriptors desc_a/desc_b; issues wgmma.mma_async.sp.sync.aligned.m64n256k32.f16.f16.f16)
+template <
+  GMMA::Major tnspA,  // forwarded as an immediate asm operand
+  GMMA::Major tnspB,  // forwarded as the last immediate asm operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate asm operand placed right after e
+>
+struct GMMA_64x256x32_F16F16F16_SS
+{
+  using DRegisters = void;  // no separate D list: fma() updates the accumulators d00..d63 in place
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor
+  using ERegisters = uint32_t[1];  // e operand: one 32-bit register (presumably the sparsity metadata -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor
+  using CRegisters = uint32_t[64];  // accumulators: 64 32-bit registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; becomes predicate p in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %68, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n256k32.f16.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"  // d00..d63
+      " %64,"  // desc_a
+      " %65,"  // desc_b
+      " %66, %67,"  // e, spsel
+      " p,    %69,  %70,  %71,  %72;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: report an invalid control path instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x32 F16+=F16*F16 (RS form: A in four 32-bit registers, B as one 64-bit descriptor value)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x256x32_F16F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e: 32-bit operand placed right before spsel (presumably the sparsity metadata - confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[64];  // d00..d63, bound as "+r" so they are both read and written
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void  // emits the wgmma only when CUTE_ARCH_MMA_SM90A_ENABLED is defined; otherwise CUTE_INVALID_CONTROL_PATH
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %71, 0;\n"  // %71 is int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n256k32.f16.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      "{%64,  %65,  %66,  %67},"  // a00..a03
+      " %68,"  // desc_b
+      " %69, %70,"  // e, spsel
+      " p,    %72,  %73,  %74;\n"  // p, scaleA, scaleB, tnspB (tnspA is not passed in this form)
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x32 F32+=F16*F16 (SS form: A and B are each passed as one 64-bit descriptor value)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x8x32_F32F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e: 32-bit operand placed right before spsel (presumably the sparsity metadata - confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[4];  // d0..d3, bound as "+f" so they are both read and written
+
+  CUTE_HOST_DEVICE static void  // emits the wgmma only when CUTE_ARCH_MMA_SM90A_ENABLED is defined; otherwise CUTE_INVALID_CONTROL_PATH
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %8, 0;\n"  // %8 is int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n8k32.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3},"
+      " %4,"  // desc_a
+      " %5,"  // desc_b
+      " %6, %7,"  // e, spsel
+      " p,   %9,  %10, %11, %12;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x32 F32+=F16*F16 (RS form: A in four 32-bit registers, B as one 64-bit descriptor value)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x8x32_F32F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a0..a3
+  using ERegisters = uint32_t[1];  // e: 32-bit operand placed right before spsel (presumably the sparsity metadata - confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[4];  // d0..d3, bound as "+f" so they are both read and written
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void  // emits the wgmma only when CUTE_ARCH_MMA_SM90A_ENABLED is defined; otherwise CUTE_INVALID_CONTROL_PATH
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %11, 0;\n"  // %11 is int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n8k32.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3},"
+      "{%4,  %5,  %6,  %7},"  // a0..a3
+      " %8,"  // desc_b
+      " %9, %10,"  // e, spsel
+      " p,   %12, %13, %14;\n"  // p, scaleA, scaleB, tnspB (tnspA is not passed in this form)
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x32 F32+=F16*F16 (SS form: A and B are each passed as one 64-bit descriptor value)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x16x32_F32F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e: 32-bit operand placed right before spsel (presumably the sparsity metadata - confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[8];  // d0..d7, bound as "+f" so they are both read and written
+
+  CUTE_HOST_DEVICE static void  // emits the wgmma only when CUTE_ARCH_MMA_SM90A_ENABLED is defined; otherwise CUTE_INVALID_CONTROL_PATH
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      float         & d4, float         & d5, float         & d6, float         & d7,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %12, 0;\n"  // %12 is int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n16k32.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
+      " %8,"  // desc_a
+      " %9,"  // desc_b
+      " %10, %11,"  // e, spsel
+      " p,   %13, %14, %15, %16;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
+        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x32 F32+=F16*F16 (RS form: A in four 32-bit registers, B as one 64-bit descriptor value)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x16x32_F32F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a0..a3
+  using ERegisters = uint32_t[1];  // e: 32-bit operand placed right before spsel (presumably the sparsity metadata - confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[8];  // d0..d7, bound as "+f" so they are both read and written
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void  // emits the wgmma only when CUTE_ARCH_MMA_SM90A_ENABLED is defined; otherwise CUTE_INVALID_CONTROL_PATH
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      float         & d4, float         & d5, float         & d6, float         & d7,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %15, 0;\n"  // %15 is int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n16k32.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
+      "{%8,  %9,  %10, %11},"  // a0..a3
+      " %12,"  // desc_b
+      " %13, %14,"  // e, spsel
+      " p,   %16, %17, %18;\n"  // p, scaleA, scaleB, tnspB (tnspA is not passed in this form)
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
+        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x32 F32+=F16*F16 (SS form: A and B are each passed as one 64-bit descriptor value)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x32x32_F32F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e: 32-bit operand placed right before spsel (presumably the sparsity metadata - confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[16];  // d00..d15, bound as "+f" so they are both read and written
+
+  CUTE_HOST_DEVICE static void  // emits the wgmma only when CUTE_ARCH_MMA_SM90A_ENABLED is defined; otherwise CUTE_INVALID_CONTROL_PATH
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %20, 0;\n"  // %20 is int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n32k32.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      " %16,"  // desc_a
+      " %17,"  // desc_b
+      " %18, %19,"  // e, spsel
+      " p,   %21, %22, %23, %24;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x32 F32+=F16*F16 (RS form: A in four 32-bit registers, B as one 64-bit descriptor value)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x32x32_F32F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e: 32-bit operand placed right before spsel (presumably the sparsity metadata - confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[16];  // d00..d15, bound as "+f" so they are both read and written
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void  // emits the wgmma only when CUTE_ARCH_MMA_SM90A_ENABLED is defined; otherwise CUTE_INVALID_CONTROL_PATH
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %23, 0;\n"  // %23 is int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n32k32.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      "{%16, %17, %18, %19},"  // a00..a03
+      " %20,"  // desc_b
+      " %21, %22,"  // e, spsel
+      " p,   %24, %25, %26;\n"  // p, scaleA, scaleB, tnspB (tnspA is not passed in this form)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x32 F32+=F16*F16 (SS form: A and B are each passed as one 64-bit descriptor value)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x64x32_F32F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e: 32-bit operand placed right before spsel (presumably the sparsity metadata - confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[32];  // d00..d31, bound as "+f" so they are both read and written
+
+  CUTE_HOST_DEVICE static void  // emits the wgmma only when CUTE_ARCH_MMA_SM90A_ENABLED is defined; otherwise CUTE_INVALID_CONTROL_PATH
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %36, 0;\n"  // %36 is int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n64k32.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      " %32,"  // desc_a
+      " %33,"  // desc_b
+      " %34, %35,"  // e, spsel
+      " p,   %37, %38, %39, %40;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x32 F32+=F16*F16 (RS form: A in four 32-bit registers, B as one 64-bit descriptor value)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x64x32_F32F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e: 32-bit operand placed right before spsel (presumably the sparsity metadata - confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[32];  // d00..d31, bound as "+f" so they are both read and written
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void  // emits the wgmma only when CUTE_ARCH_MMA_SM90A_ENABLED is defined; otherwise CUTE_INVALID_CONTROL_PATH
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %39, 0;\n"  // %39 is int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n64k32.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      "{%32, %33, %34, %35},"  // a00..a03
+      " %36,"  // desc_b
+      " %37, %38,"  // e, spsel
+      " p,   %40, %41, %42;\n"  // p, scaleA, scaleB, tnspB (tnspA is not passed in this form)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x32 F32+=F16*F16 (SS form: A and B are each passed as one 64-bit descriptor value)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x96x32_F32F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e: 32-bit operand placed right before spsel (presumably the sparsity metadata - confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[48];  // d00..d47, bound as "+f" so they are both read and written
+
+  CUTE_HOST_DEVICE static void  // emits the wgmma only when CUTE_ARCH_MMA_SM90A_ENABLED is defined; otherwise CUTE_INVALID_CONTROL_PATH
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %52, 0;\n"  // %52 is int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n96k32.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"
+      " %48,"  // desc_a
+      " %49,"  // desc_b
+      " %50, %51,"  // e, spsel
+      " p,   %53, %54, %55, %56;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x32 F32+=F16*F16 (RS form: A in four 32-bit registers, B as one 64-bit descriptor value)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x96x32_F32F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e: 32-bit operand placed right before spsel (presumably the sparsity metadata - confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[48];  // d00..d47, bound as "+f" so they are both read and written
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void  // emits the wgmma only when CUTE_ARCH_MMA_SM90A_ENABLED is defined; otherwise CUTE_INVALID_CONTROL_PATH
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %55, 0;\n"  // %55 is int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n96k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
+      "{%48,  %49,  %50,  %51},"  // a00..a03
+      " %52,"  // desc_b
+      " %53, %54,"  // e, spsel
+      " p,    %56,  %57,  %58;\n"  // p, scaleA, scaleB, tnspB (tnspA is not passed in this form)
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x32 F32+=F16*F16 (SS form: A and B are each passed as one 64-bit descriptor value)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x128x32_F32F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e: 32-bit operand placed right before spsel (presumably the sparsity metadata - confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[64];  // d00..d63, bound as "+f" so they are both read and written
+
+  CUTE_HOST_DEVICE static void  // emits the wgmma only when CUTE_ARCH_MMA_SM90A_ENABLED is defined; otherwise CUTE_INVALID_CONTROL_PATH
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %68, 0;\n"  // %68 is int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n128k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      " %64,"  // desc_a
+      " %65,"  // desc_b
+      " %66, %67,"  // e, spsel
+      " p,    %69,  %70,  %71,  %72;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x32 F32+=F16*F16 (RS form: A operand in four 32-bit registers, B operand as a 64-bit descriptor)
+template <
+  GMMA::Major tnspA,                              // must be GMMA::Major::K (static_assert below); not passed to the asm
+  GMMA::Major tnspB,                              // passed to the instruction as its last immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,     // immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,     // immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero   // immediate ("n") operand placed right after `e`
+>
+struct GMMA_64x128x32_F32F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[64];    // d00..d63
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand registers ("r")
+      uint64_t const& desc_b,  // 64-bit descriptor for the B operand ("l")
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d63: accumulators, read and written in place ("+f")
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      uint32_t const& e,  // 32-bit register operand passed together with spsel; presumably the sparsity metadata -- TODO confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %71, 0;\n"  // p = (scale_D != 0); %71 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n128k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%63: d00..d63
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      "{%64,  %65,  %66,  %67},"  // a00..a03
+      " %68,"  // desc_b
+      " %69, %70,"  // e, spsel
+      " p,    %72,  %73,  %74;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x32 F32+=F16*F16 (SS form: A and B operands are both given as 64-bit descriptors)
+template <
+  GMMA::Major tnspA,                              // passed to the instruction as an immediate ("n") operand
+  GMMA::Major tnspB,                              // passed to the instruction as its last immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,     // immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,     // immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero   // immediate ("n") operand placed right after `e`
+>
+struct GMMA_64x192x32_F32F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[96];    // d00..d95
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for the A operand ("l")
+      uint64_t const& desc_b,  // 64-bit descriptor for the B operand ("l")
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d95: accumulators, read and written in place ("+f")
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      float         & d92, float         & d93, float         & d94, float         & d95,
+      uint32_t const& e,  // 32-bit register operand passed together with spsel; presumably the sparsity metadata -- TODO confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %100, 0;\n"  // p = (scale_D != 0); %100 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n192k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%95: d00..d95
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      " %96,"  // desc_a
+      " %97,"  // desc_b
+      " %98, %99,"  // e, spsel
+      " p,    %101, %102, %103, %104;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
+        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x32 F32+=F16*F16 (RS form: A operand in four 32-bit registers, B operand as a 64-bit descriptor)
+template <
+  GMMA::Major tnspA,                              // must be GMMA::Major::K (static_assert below); not passed to the asm
+  GMMA::Major tnspB,                              // passed to the instruction as its last immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,     // immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,     // immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero   // immediate ("n") operand placed right after `e`
+>
+struct GMMA_64x192x32_F32F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[96];    // d00..d95
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand registers ("r")
+      uint64_t const& desc_b,  // 64-bit descriptor for the B operand ("l")
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d95: accumulators, read and written in place ("+f")
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      float         & d92, float         & d93, float         & d94, float         & d95,
+      uint32_t const& e,  // 32-bit register operand passed together with spsel; presumably the sparsity metadata -- TODO confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %103, 0;\n"  // p = (scale_D != 0); %103 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n192k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%95: d00..d95
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      "{%96,  %97,  %98,  %99},"  // a00..a03
+      " %100,"  // desc_b
+      " %101, %102,"  // e, spsel
+      " p,    %104, %105, %106;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
+        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x32 F32+=F16*F16 (SS form: A and B operands are both given as 64-bit descriptors)
+template <
+  GMMA::Major tnspA,                              // passed to the instruction as an immediate ("n") operand
+  GMMA::Major tnspB,                              // passed to the instruction as its last immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,     // immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,     // immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero   // immediate ("n") operand placed right after `e`
+>
+struct GMMA_64x256x32_F32F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[128];   // d000..d127
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for the A operand ("l")
+      uint64_t const& desc_b,  // 64-bit descriptor for the B operand ("l")
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d127: accumulators, read and written in place ("+f")
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      float         & d124, float         & d125, float         & d126, float         & d127,
+      uint32_t const& e,  // 32-bit register operand passed together with spsel; presumably the sparsity metadata -- TODO confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %132, 0;\n"  // p = (scale_D != 0); %132 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n256k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%127: d000..d127
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      " %128,"  // desc_a
+      " %129,"  // desc_b
+      " %130, %131,"  // e, spsel
+      " p,    %133, %134, %135, %136;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
+        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x32 F32+=F16*F16 (RS form: A operand in four 32-bit registers, B operand as a 64-bit descriptor)
+template <
+  GMMA::Major tnspA,                              // must be GMMA::Major::K (static_assert below); not passed to the asm
+  GMMA::Major tnspB,                              // passed to the instruction as its last immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,     // immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,     // immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero   // immediate ("n") operand placed right after `e`
+>
+struct GMMA_64x256x32_F32F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a000..a003
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[128];   // d000..d127
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A operand registers ("r")
+      uint64_t const& desc_b,  // 64-bit descriptor for the B operand ("l")
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d127: accumulators, read and written in place ("+f")
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      float         & d124, float         & d125, float         & d126, float         & d127,
+      uint32_t const& e,  // 32-bit register operand passed together with spsel; presumably the sparsity metadata -- TODO confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %135, 0;\n"  // p = (scale_D != 0); %135 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n256k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%127: d000..d127
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      "{%128, %129, %130, %131},"  // a000..a003
+      " %132,"  // desc_b
+      " %133, %134,"  // e, spsel
+      " p,    %136, %137, %138;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
+        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x32 F32+=BF16*BF16 (SS form: A and B operands are both given as 64-bit descriptors)
+template <
+  GMMA::Major tnspA,                              // passed to the instruction as an immediate ("n") operand
+  GMMA::Major tnspB,                              // passed to the instruction as its last immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,     // immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,     // immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero   // immediate ("n") operand placed right after `e`
+>
+struct GMMA_64x8x32_F32BF16BF16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[4];     // d0..d3
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for the A operand ("l")
+      uint64_t const& desc_b,  // 64-bit descriptor for the B operand ("l")
+      float         & d0, float         & d1, float         & d2, float         & d3,  // accumulators, read and written in place ("+f")
+      uint32_t const& e,  // 32-bit register operand passed together with spsel; presumably the sparsity metadata -- TODO confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %8, 0;\n"  // p = (scale_D != 0); %8 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n8k32.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3},"  // d0..d3
+      " %4,"  // desc_a
+      " %5,"  // desc_b
+      " %6, %7,"  // e, spsel
+      " p,   %9,  %10, %11, %12;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x32 F32+=BF16*BF16 (RS form: A is passed to fma() in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // must equal GMMA::Major::K (static_assert below); not forwarded to the asm
+  GMMA::Major tnspB,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, emitted right after `e`
+>
+struct GMMA_64x8x32_F32BF16BF16_RS
+{
+  using DRegisters = void;  // fma() has no separate destination parameters; d0..d3 are updated in place
+  using ARegisters = uint32_t[4];  // matches the four A parameters a0..a3
+  using ERegisters = uint32_t[1];  // matches the single e parameter
+  using BRegisters = uint64_t[1];  // matches the single desc_b parameter
+  using CRegisters = float[4];  // matches the four float accumulators d0..d3
+
+  static_assert(tnspA == GMMA::Major::K,  // compile-time rejection of any other A layout
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,  // A operand words
+      uint64_t const& desc_b,  // 64-bit B operand
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; only compared against 0 below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); gets the call-site line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %11, 0;\n"  // %11 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n8k32.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3},"  // d0..d3
+      "{%4,  %5,  %6,  %7},"  // a0..a3
+      " %8,"  // desc_b
+      " %9, %10,"  // e, spsel
+      " p,   %12, %13, %14;\n"  // p (from scale_D), scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)  // %0..%3: read-write float operands
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),  // %4..%7
+         "l"(desc_b),  // %8
+         "r"(e), "n"(int32_t(spsel)),  // %9 (register), %10 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %11 (register), %12..%14 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x32 F32+=BF16*BF16 (SS form: A and B are each passed to fma() as one 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // forwarded to the instruction as a compile-time immediate
+  GMMA::Major tnspB,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, emitted right after `e`
+>
+struct GMMA_64x16x32_F32BF16BF16_SS
+{
+  using DRegisters = void;  // fma() has no separate destination parameters; d0..d7 are updated in place
+  using ARegisters = uint64_t[1];  // matches the single desc_a parameter
+  using ERegisters = uint32_t[1];  // matches the single e parameter
+  using BRegisters = uint64_t[1];  // matches the single desc_b parameter
+  using CRegisters = float[8];  // matches the eight float accumulators d0..d7
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit A operand
+      uint64_t const& desc_b,  // 64-bit B operand
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      float         & d4, float         & d5, float         & d6, float         & d7,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; only compared against 0 below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); gets the call-site line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %12, 0;\n"  // %12 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n16k32.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // d0..d7
+      " %8,"  // desc_a
+      " %9,"  // desc_b
+      " %10, %11,"  // e, spsel
+      " p,   %13, %14, %15, %16;\n"  // p (from scale_D), scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),  // %0..%7: read-write float operands
+        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
+      :  "l"(desc_a),  // %8
+         "l"(desc_b),  // %9
+         "r"(e), "n"(int32_t(spsel)),  // %10 (register), %11 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %12 (register), %13..%16 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x32 F32+=BF16*BF16 (RS form: A is passed to fma() in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // must equal GMMA::Major::K (static_assert below); not forwarded to the asm
+  GMMA::Major tnspB,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, emitted right after `e`
+>
+struct GMMA_64x16x32_F32BF16BF16_RS
+{
+  using DRegisters = void;  // fma() has no separate destination parameters; d0..d7 are updated in place
+  using ARegisters = uint32_t[4];  // matches the four A parameters a0..a3
+  using ERegisters = uint32_t[1];  // matches the single e parameter
+  using BRegisters = uint64_t[1];  // matches the single desc_b parameter
+  using CRegisters = float[8];  // matches the eight float accumulators d0..d7
+
+  static_assert(tnspA == GMMA::Major::K,  // compile-time rejection of any other A layout
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,  // A operand words
+      uint64_t const& desc_b,  // 64-bit B operand
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      float         & d4, float         & d5, float         & d6, float         & d7,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; only compared against 0 below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); gets the call-site line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %15, 0;\n"  // %15 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n16k32.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // d0..d7
+      "{%8,  %9,  %10, %11},"  // a0..a3
+      " %12,"  // desc_b
+      " %13, %14,"  // e, spsel
+      " p,   %16, %17, %18;\n"  // p (from scale_D), scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),  // %0..%7: read-write float operands
+        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),  // %8..%11
+         "l"(desc_b),  // %12
+         "r"(e), "n"(int32_t(spsel)),  // %13 (register), %14 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %15 (register), %16..%18 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x32 F32+=BF16*BF16 (SS form: A and B are each passed to fma() as one 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // forwarded to the instruction as a compile-time immediate
+  GMMA::Major tnspB,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, emitted right after `e`
+>
+struct GMMA_64x32x32_F32BF16BF16_SS
+{
+  using DRegisters = void;  // fma() has no separate destination parameters; d00..d15 are updated in place
+  using ARegisters = uint64_t[1];  // matches the single desc_a parameter
+  using ERegisters = uint32_t[1];  // matches the single e parameter
+  using BRegisters = uint64_t[1];  // matches the single desc_b parameter
+  using CRegisters = float[16];  // matches the sixteen float accumulators d00..d15
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit A operand
+      uint64_t const& desc_b,  // 64-bit B operand
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; only compared against 0 below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); gets the call-site line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %20, 0;\n"  // %20 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n32k32.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // d00..d15 (two lines)
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      " %16,"  // desc_a
+      " %17,"  // desc_b
+      " %18, %19,"  // e, spsel
+      " p,   %21, %22, %23, %24;\n"  // p (from scale_D), scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%15: read-write float operands
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
+      :  "l"(desc_a),  // %16
+         "l"(desc_b),  // %17
+         "r"(e), "n"(int32_t(spsel)),  // %18 (register), %19 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %20 (register), %21..%24 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x32 F32+=BF16*BF16 (RS form: A is passed to fma() in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // must equal GMMA::Major::K (static_assert below); not forwarded to the asm
+  GMMA::Major tnspB,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, emitted right after `e`
+>
+struct GMMA_64x32x32_F32BF16BF16_RS
+{
+  using DRegisters = void;  // fma() has no separate destination parameters; d00..d15 are updated in place
+  using ARegisters = uint32_t[4];  // matches the four A parameters a00..a03
+  using ERegisters = uint32_t[1];  // matches the single e parameter
+  using BRegisters = uint64_t[1];  // matches the single desc_b parameter
+  using CRegisters = float[16];  // matches the sixteen float accumulators d00..d15
+
+  static_assert(tnspA == GMMA::Major::K,  // compile-time rejection of any other A layout
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand words
+      uint64_t const& desc_b,  // 64-bit B operand
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; only compared against 0 below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); gets the call-site line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %23, 0;\n"  // %23 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n32k32.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // d00..d15 (two lines)
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      "{%16, %17, %18, %19},"  // a00..a03
+      " %20,"  // desc_b
+      " %21, %22,"  // e, spsel
+      " p,   %24, %25, %26;\n"  // p (from scale_D), scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%15: read-write float operands
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %16..%19
+         "l"(desc_b),  // %20
+         "r"(e), "n"(int32_t(spsel)),  // %21 (register), %22 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %23 (register), %24..%26 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x32 F32+=BF16*BF16 (SS form: A and B are each passed to fma() as one 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // forwarded to the instruction as a compile-time immediate
+  GMMA::Major tnspB,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, emitted right after `e`
+>
+struct GMMA_64x64x32_F32BF16BF16_SS
+{
+  using DRegisters = void;  // fma() has no separate destination parameters; d00..d31 are updated in place
+  using ARegisters = uint64_t[1];  // matches the single desc_a parameter
+  using ERegisters = uint32_t[1];  // matches the single e parameter
+  using BRegisters = uint64_t[1];  // matches the single desc_b parameter
+  using CRegisters = float[32];  // matches the 32 float accumulators d00..d31
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit A operand
+      uint64_t const& desc_b,  // 64-bit B operand
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; only compared against 0 below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); gets the call-site line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %36, 0;\n"  // %36 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n64k32.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // d00..d31 (four lines)
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      " %32,"  // desc_a
+      " %33,"  // desc_b
+      " %34, %35,"  // e, spsel
+      " p,   %37, %38, %39, %40;\n"  // p (from scale_D), scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%31: read-write float operands
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
+      :  "l"(desc_a),  // %32
+         "l"(desc_b),  // %33
+         "r"(e), "n"(int32_t(spsel)),  // %34 (register), %35 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %36 (register), %37..%40 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x32 F32+=BF16*BF16 (RS form: A is passed to fma() in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // must equal GMMA::Major::K (static_assert below); not forwarded to the asm
+  GMMA::Major tnspB,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, emitted right after `e`
+>
+struct GMMA_64x64x32_F32BF16BF16_RS
+{
+  using DRegisters = void;  // fma() has no separate destination parameters; d00..d31 are updated in place
+  using ARegisters = uint32_t[4];  // matches the four A parameters a00..a03
+  using ERegisters = uint32_t[1];  // matches the single e parameter
+  using BRegisters = uint64_t[1];  // matches the single desc_b parameter
+  using CRegisters = float[32];  // matches the 32 float accumulators d00..d31
+
+  static_assert(tnspA == GMMA::Major::K,  // compile-time rejection of any other A layout
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand words
+      uint64_t const& desc_b,  // 64-bit B operand
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; only compared against 0 below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); gets the call-site line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %39, 0;\n"  // %39 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n64k32.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // d00..d31 (four lines)
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      "{%32, %33, %34, %35},"  // a00..a03
+      " %36,"  // desc_b
+      " %37, %38,"  // e, spsel
+      " p,   %40, %41, %42;\n"  // p (from scale_D), scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%31: read-write float operands
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %32..%35
+         "l"(desc_b),  // %36
+         "r"(e), "n"(int32_t(spsel)),  // %37 (register), %38 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %39 (register), %40..%42 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x32 F32+=BF16*BF16 (SS form: A and B are each passed to fma() as one 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // forwarded to the instruction as a compile-time immediate
+  GMMA::Major tnspB,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, emitted right after `e`
+>
+struct GMMA_64x96x32_F32BF16BF16_SS
+{
+  using DRegisters = void;  // fma() has no separate destination parameters; d00..d47 are updated in place
+  using ARegisters = uint64_t[1];  // matches the single desc_a parameter
+  using ERegisters = uint32_t[1];  // matches the single e parameter
+  using BRegisters = uint64_t[1];  // matches the single desc_b parameter
+  using CRegisters = float[48];  // matches the 48 float accumulators d00..d47
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit A operand
+      uint64_t const& desc_b,  // 64-bit B operand
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; only compared against 0 below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); gets the call-site line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %52, 0;\n"  // %52 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n96k32.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // d00..d47 (six lines)
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"
+      " %48,"  // desc_a
+      " %49,"  // desc_b
+      " %50, %51,"  // e, spsel
+      " p,   %53, %54, %55, %56;\n"  // p (from scale_D), scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%47: read-write float operands
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
+      :  "l"(desc_a),  // %48
+         "l"(desc_b),  // %49
+         "r"(e), "n"(int32_t(spsel)),  // %50 (register), %51 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %52 (register), %53..%56 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x32 F32+=BF16*BF16 (RS form: A is passed to fma() in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // must equal GMMA::Major::K (static_assert below); not forwarded to the asm
+  GMMA::Major tnspB,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, emitted right after `e`
+>
+struct GMMA_64x96x32_F32BF16BF16_RS
+{
+  using DRegisters = void;  // fma() has no separate destination parameters; d00..d47 are updated in place
+  using ARegisters = uint32_t[4];  // matches the four A parameters a00..a03
+  using ERegisters = uint32_t[1];  // matches the single e parameter
+  using BRegisters = uint64_t[1];  // matches the single desc_b parameter
+  using CRegisters = float[48];  // matches the 48 float accumulators d00..d47
+
+  static_assert(tnspA == GMMA::Major::K,  // compile-time rejection of any other A layout
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand words
+      uint64_t const& desc_b,  // 64-bit B operand
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; only compared against 0 below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); gets the call-site line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %55, 0;\n"  // %55 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n96k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // d00..d47 (six lines)
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
+      "{%48,  %49,  %50,  %51},"  // a00..a03
+      " %52,"  // desc_b
+      " %53, %54,"  // e, spsel
+      " p,    %56,  %57,  %58;\n"  // p (from scale_D), scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%47: read-write float operands
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %48..%51
+         "l"(desc_b),  // %52
+         "r"(e), "n"(int32_t(spsel)),  // %53 (register), %54 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %55 (register), %56..%58 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x32 F32+=BF16*BF16 (SS form: A and B are each passed to fma() as one 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // forwarded to the instruction as a compile-time immediate
+  GMMA::Major tnspB,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, emitted right after `e`
+>
+struct GMMA_64x128x32_F32BF16BF16_SS
+{
+  using DRegisters = void;  // fma() has no separate destination parameters; d00..d63 are updated in place
+  using ARegisters = uint64_t[1];  // matches the single desc_a parameter
+  using ERegisters = uint32_t[1];  // matches the single e parameter
+  using BRegisters = uint64_t[1];  // matches the single desc_b parameter
+  using CRegisters = float[64];  // matches the 64 float accumulators d00..d63
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit A operand
+      uint64_t const& desc_b,  // 64-bit B operand
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; only compared against 0 below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); gets the call-site line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %68, 0;\n"  // %68 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n128k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // d00..d63 (eight lines)
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      " %64,"  // desc_a
+      " %65,"  // desc_b
+      " %66, %67,"  // e, spsel
+      " p,    %69,  %70,  %71,  %72;\n"  // p (from scale_D), scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%63: read-write float operands
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
+      :  "l"(desc_a),  // %64
+         "l"(desc_b),  // %65
+         "r"(e), "n"(int32_t(spsel)),  // %66 (register), %67 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %68 (register), %69..%72 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x32 F32+=BF16*BF16 (RS form: A is passed to fma() in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // must equal GMMA::Major::K (static_assert below); not forwarded to the asm
+  GMMA::Major tnspB,  // forwarded to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, emitted right after `e`
+>
+struct GMMA_64x128x32_F32BF16BF16_RS
+{
+  using DRegisters = void;  // fma() has no separate destination parameters; d00..d63 are updated in place
+  using ARegisters = uint32_t[4];  // matches the four A parameters a00..a03
+  using ERegisters = uint32_t[1];  // matches the single e parameter
+  using BRegisters = uint64_t[1];  // matches the single desc_b parameter
+  using CRegisters = float[64];  // matches the 64 float accumulators d00..d63
+
+  static_assert(tnspA == GMMA::Major::K,  // compile-time rejection of any other A layout
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand words
+      uint64_t const& desc_b,  // 64-bit B operand
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; only compared against 0 below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); gets the call-site line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %71, 0;\n"  // %71 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n128k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // d00..d63 (eight lines)
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      "{%64,  %65,  %66,  %67},"  // a00..a03
+      " %68,"  // desc_b
+      " %69, %70,"  // e, spsel
+      " p,    %72,  %73,  %74;\n"  // p (from scale_D), scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%63: read-write float operands
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %64..%67
+         "l"(desc_b),  // %68
+         "r"(e), "n"(int32_t(spsel)),  // %69 (register), %70 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %71 (register), %72..%74 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x32 F32+=BF16*BF16 (SS: fma() receives operands A and B as 64-bit descriptors and issues one wgmma.mma_async.sp m64n192k32 instruction)
+template <  // every parameter below is forwarded to the asm as a compile-time immediate ("n" constraint)
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x192x32_F32BF16BF16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches the single desc_a argument of fma()
+  using ERegisters = uint32_t[1];  // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = float[96];  // one float per accumulator argument d00..d95 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      float         & d92, float         & d93, float         & d94, float         & d95,
+      uint32_t const& e,  // bound as the "r" operand right after desc_b; presumably the sparsity metadata -- TODO confirm against the PTX wgmma.sp operand list
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %100, 0;\n"  // p = (scale_D != 0); %100 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n192k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      " %96,"  // desc_a
+      " %97,"  // desc_b
+      " %98, %99,"  // e, spsel
+      " p,    %101, %102, %103, %104;\n"  // predicate from scale_D, then scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
+        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)  // %0..%95: "+f" makes every accumulator both an input and an output
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, fma() only invokes CUTE_INVALID_CONTROL_PATH with the message below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x32 F32+=BF16*BF16 (RS: fma() receives operand A in four 32-bit registers and operand B as a 64-bit descriptor, and issues one wgmma.mma_async.sp m64n192k32 instruction)
+template <  // tnspA is only checked by the static_assert below; the other parameters are forwarded to the asm as "n" immediates
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x192x32_F32BF16BF16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches the a00..a03 arguments of fma()
+  using ERegisters = uint32_t[1];  // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = float[96];  // one float per accumulator argument d00..d95 of fma()
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      float         & d92, float         & d93, float         & d94, float         & d95,
+      uint32_t const& e,  // bound as the "r" operand right after desc_b; presumably the sparsity metadata -- TODO confirm against the PTX wgmma.sp operand list
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %103, 0;\n"  // p = (scale_D != 0); %103 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n192k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      "{%96,  %97,  %98,  %99},"  // a00..a03
+      " %100,"  // desc_b
+      " %101, %102,"  // e, spsel
+      " p,    %104, %105, %106;\n"  // predicate from scale_D, then scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
+        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)  // %0..%95: "+f" makes every accumulator both an input and an output
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, fma() only invokes CUTE_INVALID_CONTROL_PATH with the message below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x32 F32+=BF16*BF16 (SS: fma() receives operands A and B as 64-bit descriptors and issues one wgmma.mma_async.sp m64n256k32 instruction)
+template <  // every parameter below is forwarded to the asm as a compile-time immediate ("n" constraint)
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x256x32_F32BF16BF16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches the single desc_a argument of fma()
+  using ERegisters = uint32_t[1];  // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = float[128];  // one float per accumulator argument d000..d127 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      float         & d124, float         & d125, float         & d126, float         & d127,
+      uint32_t const& e,  // bound as the "r" operand right after desc_b; presumably the sparsity metadata -- TODO confirm against the PTX wgmma.sp operand list
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %132, 0;\n"  // p = (scale_D != 0); %132 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n256k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      " %128,"  // desc_a
+      " %129,"  // desc_b
+      " %130, %131,"  // e, spsel
+      " p,    %133, %134, %135, %136;\n"  // predicate from scale_D, then scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
+        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)  // %0..%127: "+f" makes every accumulator both an input and an output
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, fma() only invokes CUTE_INVALID_CONTROL_PATH with the message below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x32 F32+=BF16*BF16 (RS: fma() receives operand A in four 32-bit registers and operand B as a 64-bit descriptor, and issues one wgmma.mma_async.sp m64n256k32 instruction)
+template <  // tnspA is only checked by the static_assert below; the other parameters are forwarded to the asm as "n" immediates
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x256x32_F32BF16BF16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches the a000..a003 arguments of fma()
+  using ERegisters = uint32_t[1];  // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = float[128];  // one float per accumulator argument d000..d127 of fma()
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      float         & d124, float         & d125, float         & d126, float         & d127,
+      uint32_t const& e,  // bound as the "r" operand right after desc_b; presumably the sparsity metadata -- TODO confirm against the PTX wgmma.sp operand list
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %135, 0;\n"  // p = (scale_D != 0); %135 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n256k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      "{%128, %129, %130, %131},"  // a000..a003
+      " %132,"  // desc_b
+      " %133, %134,"  // e, spsel
+      " p,    %136, %137, %138;\n"  // predicate from scale_D, then scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
+        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)  // %0..%127: "+f" makes every accumulator both an input and an output
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, fma() only invokes CUTE_INVALID_CONTROL_PATH with the message below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x16 TN F32+=TF32*TF32 (SS: fma() receives operands A and B as 64-bit descriptors and issues one wgmma.mma_async.sp m64n8k16 instruction)
+template <  // every parameter below is forwarded to the asm as a compile-time immediate ("n" constraint)
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x8x16_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches the single desc_a argument of fma()
+  using ERegisters = uint32_t[1];  // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = float[4];  // one float per accumulator argument d0..d3 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      uint32_t const& e,  // bound as the "r" operand right after desc_b; presumably the sparsity metadata -- TODO confirm against the PTX wgmma.sp operand list
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %8, 0;\n"  // p = (scale_D != 0); %8 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n8k16.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3},"  // d0..d3
+      " %4,"  // desc_a
+      " %5,"  // desc_b
+      " %6, %7,"  // e, spsel
+      " p,   %9,  %10;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)  // "+f" makes every accumulator both an input and an output
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, fma() only invokes CUTE_INVALID_CONTROL_PATH with the message below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x16 TN F32+=TF32*TF32 (RS: fma() receives operand A in four 32-bit registers and operand B as a 64-bit descriptor, and issues one wgmma.mma_async.sp m64n8k16 instruction)
+template <  // every parameter below is forwarded to the asm as a compile-time immediate ("n" constraint)
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x8x16_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches the a0..a3 arguments of fma()
+  using ERegisters = uint32_t[1];  // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = float[4];  // one float per accumulator argument d0..d3 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      uint32_t const& e,  // bound as the "r" operand right after desc_b; presumably the sparsity metadata -- TODO confirm against the PTX wgmma.sp operand list
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %11, 0;\n"  // p = (scale_D != 0); %11 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n8k16.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3},"  // d0..d3
+      "{%4,  %5,  %6,  %7},"  // a0..a3
+      " %8,"  // desc_b
+      " %9, %10,"  // e, spsel
+      " p,   %12, %13;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)  // "+f" makes every accumulator both an input and an output
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, fma() only invokes CUTE_INVALID_CONTROL_PATH with the message below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x16 TN F32+=TF32*TF32 (SS: fma() receives operands A and B as 64-bit descriptors and issues one wgmma.mma_async.sp m64n16k16 instruction)
+template <  // every parameter below is forwarded to the asm as a compile-time immediate ("n" constraint)
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x16x16_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches the single desc_a argument of fma()
+  using ERegisters = uint32_t[1];  // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = float[8];  // one float per accumulator argument d0..d7 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      float         & d4, float         & d5, float         & d6, float         & d7,
+      uint32_t const& e,  // bound as the "r" operand right after desc_b; presumably the sparsity metadata -- TODO confirm against the PTX wgmma.sp operand list
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %12, 0;\n"  // p = (scale_D != 0); %12 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n16k16.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // d0..d7
+      " %8,"  // desc_a
+      " %9,"  // desc_b
+      " %10, %11,"  // e, spsel
+      " p,   %13, %14;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
+        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)  // "+f" makes every accumulator both an input and an output
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, fma() only invokes CUTE_INVALID_CONTROL_PATH with the message below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x16 TN F32+=TF32*TF32 (RS: fma() receives operand A in four 32-bit registers and operand B as a 64-bit descriptor, and issues one wgmma.mma_async.sp m64n16k16 instruction)
+template <  // every parameter below is forwarded to the asm as a compile-time immediate ("n" constraint)
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x16x16_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches the a0..a3 arguments of fma()
+  using ERegisters = uint32_t[1];  // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = float[8];  // one float per accumulator argument d0..d7 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      float         & d4, float         & d5, float         & d6, float         & d7,
+      uint32_t const& e,  // bound as the "r" operand right after desc_b; presumably the sparsity metadata -- TODO confirm against the PTX wgmma.sp operand list
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %15, 0;\n"  // p = (scale_D != 0); %15 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n16k16.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // d0..d7
+      "{%8,  %9,  %10, %11},"  // a0..a3
+      " %12,"  // desc_b
+      " %13, %14,"  // e, spsel
+      " p,   %16, %17;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
+        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)  // "+f" makes every accumulator both an input and an output
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, fma() only invokes CUTE_INVALID_CONTROL_PATH with the message below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x16 TN F32+=TF32*TF32 -- SS form: A and B are both passed as 64-bit descriptor operands
+template <  // all three parameters are bound below as immediate ("n") asm operands
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x32x16_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;  // no separate D list; the d* accumulators are read-write ("+f") asm operands
+  using ARegisters = uint64_t[1];  // same count/type as fma's desc_a
+  using ERegisters = uint32_t[1];  // same count/type as fma's e
+  using BRegisters = uint64_t[1];  // same count/type as fma's desc_b
+  using CRegisters = float[16];  // same count/type as fma's d00..d15
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      uint32_t const& e,  // occupies the operand slot right after B; presumably the sparsity metadata of wgmma.sp -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime operand; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // wgmma path only when this macro is defined; otherwise CUTE_INVALID_CONTROL_PATH
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %20, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n32k16.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      " %16,"
+      " %17,"
+      " %18, %19,"
+      " p,   %21, %22;\n"  // p, then the immediates scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0-%15: accumulators (read-write)
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
+      :  "l"(desc_a),  // %16
+         "l"(desc_b),  // %17
+         "r"(e), "n"(int32_t(spsel)),  // %18, %19
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %20 (tested by setp), %21, %22
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x16 TN F32+=TF32*TF32 -- RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor operand
+template <  // all three parameters are bound below as immediate ("n") asm operands
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x32x16_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;  // no separate D list; the d* accumulators are read-write ("+f") asm operands
+  using ARegisters = uint32_t[4];  // same count/type as fma's a00..a03
+  using ERegisters = uint32_t[1];  // same count/type as fma's e
+  using BRegisters = uint64_t[1];  // same count/type as fma's desc_b
+  using CRegisters = float[16];  // same count/type as fma's d00..d15
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      uint32_t const& e,  // occupies the operand slot right after B; presumably the sparsity metadata of wgmma.sp -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime operand; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // wgmma path only when this macro is defined; otherwise CUTE_INVALID_CONTROL_PATH
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %23, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n32k16.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      "{%16, %17, %18, %19},"
+      " %20,"
+      " %21, %22,"
+      " p,   %24, %25;\n"  // p, then the immediates scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0-%15: accumulators (read-write)
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %16-%19
+         "l"(desc_b),  // %20
+         "r"(e), "n"(int32_t(spsel)),  // %21, %22
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %23 (tested by setp), %24, %25
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x16 TN F32+=TF32*TF32 -- SS form: A and B are both passed as 64-bit descriptor operands
+template <  // all three parameters are bound below as immediate ("n") asm operands
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x64x16_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;  // no separate D list; the d* accumulators are read-write ("+f") asm operands
+  using ARegisters = uint64_t[1];  // same count/type as fma's desc_a
+  using ERegisters = uint32_t[1];  // same count/type as fma's e
+  using BRegisters = uint64_t[1];  // same count/type as fma's desc_b
+  using CRegisters = float[32];  // same count/type as fma's d00..d31
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      uint32_t const& e,  // occupies the operand slot right after B; presumably the sparsity metadata of wgmma.sp -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime operand; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // wgmma path only when this macro is defined; otherwise CUTE_INVALID_CONTROL_PATH
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %36, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n64k16.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      " %32,"
+      " %33,"
+      " %34, %35,"
+      " p,   %37, %38;\n"  // p, then the immediates scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0-%31: accumulators (read-write)
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
+      :  "l"(desc_a),  // %32
+         "l"(desc_b),  // %33
+         "r"(e), "n"(int32_t(spsel)),  // %34, %35
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %36 (tested by setp), %37, %38
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x16 TN F32+=TF32*TF32 -- RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor operand
+template <  // all three parameters are bound below as immediate ("n") asm operands
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x64x16_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;  // no separate D list; the d* accumulators are read-write ("+f") asm operands
+  using ARegisters = uint32_t[4];  // same count/type as fma's a00..a03
+  using ERegisters = uint32_t[1];  // same count/type as fma's e
+  using BRegisters = uint64_t[1];  // same count/type as fma's desc_b
+  using CRegisters = float[32];  // same count/type as fma's d00..d31
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      uint32_t const& e,  // occupies the operand slot right after B; presumably the sparsity metadata of wgmma.sp -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime operand; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // wgmma path only when this macro is defined; otherwise CUTE_INVALID_CONTROL_PATH
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %39, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n64k16.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      "{%32, %33, %34, %35},"
+      " %36,"
+      " %37, %38,"
+      " p,   %40, %41;\n"  // p, then the immediates scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0-%31: accumulators (read-write)
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %32-%35
+         "l"(desc_b),  // %36
+         "r"(e), "n"(int32_t(spsel)),  // %37, %38
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %39 (tested by setp), %40, %41
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x16 TN F32+=TF32*TF32 -- SS form: A and B are both passed as 64-bit descriptor operands
+template <  // all three parameters are bound below as immediate ("n") asm operands
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x96x16_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;  // no separate D list; the d* accumulators are read-write ("+f") asm operands
+  using ARegisters = uint64_t[1];  // same count/type as fma's desc_a
+  using ERegisters = uint32_t[1];  // same count/type as fma's e
+  using BRegisters = uint64_t[1];  // same count/type as fma's desc_b
+  using CRegisters = float[48];  // same count/type as fma's d00..d47
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      uint32_t const& e,  // occupies the operand slot right after B; presumably the sparsity metadata of wgmma.sp -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime operand; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // wgmma path only when this macro is defined; otherwise CUTE_INVALID_CONTROL_PATH
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %52, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n96k16.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"
+      " %48,"
+      " %49,"
+      " %50, %51,"
+      " p,   %53, %54;\n"  // p, then the immediates scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0-%47: accumulators (read-write)
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
+      :  "l"(desc_a),  // %48
+         "l"(desc_b),  // %49
+         "r"(e), "n"(int32_t(spsel)),  // %50, %51
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %52 (tested by setp), %53, %54
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x16 TN F32+=TF32*TF32 -- RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor operand
+template <  // all three parameters are bound below as immediate ("n") asm operands
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x96x16_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;  // no separate D list; the d* accumulators are read-write ("+f") asm operands
+  using ARegisters = uint32_t[4];  // same count/type as fma's a00..a03
+  using ERegisters = uint32_t[1];  // same count/type as fma's e
+  using BRegisters = uint64_t[1];  // same count/type as fma's desc_b
+  using CRegisters = float[48];  // same count/type as fma's d00..d47
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      uint32_t const& e,  // occupies the operand slot right after B; presumably the sparsity metadata of wgmma.sp -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime operand; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // wgmma path only when this macro is defined; otherwise CUTE_INVALID_CONTROL_PATH
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %55, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n96k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
+      "{%48,  %49,  %50,  %51},"
+      " %52,"
+      " %53, %54,"
+      " p,    %56,  %57;\n"  // p, then the immediates scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0-%47: accumulators (read-write)
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %48-%51
+         "l"(desc_b),  // %52
+         "r"(e), "n"(int32_t(spsel)),  // %53, %54
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %55 (tested by setp), %56, %57
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x16 TN F32+=TF32*TF32 -- SS form: A and B are both passed as 64-bit descriptor operands
+template <  // all three parameters are bound below as immediate ("n") asm operands
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x128x16_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;  // no separate D list; the d* accumulators are read-write ("+f") asm operands
+  using ARegisters = uint64_t[1];  // same count/type as fma's desc_a
+  using ERegisters = uint32_t[1];  // same count/type as fma's e
+  using BRegisters = uint64_t[1];  // same count/type as fma's desc_b
+  using CRegisters = float[64];  // same count/type as fma's d00..d63
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      uint32_t const& e,  // occupies the operand slot right after B; presumably the sparsity metadata of wgmma.sp -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime operand; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // wgmma path only when this macro is defined; otherwise CUTE_INVALID_CONTROL_PATH
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %68, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n128k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      " %64,"
+      " %65,"
+      " %66, %67,"
+      " p,    %69,  %70;\n"  // p, then the immediates scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0-%63: accumulators (read-write)
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
+      :  "l"(desc_a),  // %64
+         "l"(desc_b),  // %65
+         "r"(e), "n"(int32_t(spsel)),  // %66, %67
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %68 (tested by setp), %69, %70
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x16 TN F32+=TF32*TF32 -- RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor operand
+template <  // all three parameters are bound below as immediate ("n") asm operands
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x128x16_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;  // no separate D list; the d* accumulators are read-write ("+f") asm operands
+  using ARegisters = uint32_t[4];  // same count/type as fma's a00..a03
+  using ERegisters = uint32_t[1];  // same count/type as fma's e
+  using BRegisters = uint64_t[1];  // same count/type as fma's desc_b
+  using CRegisters = float[64];  // same count/type as fma's d00..d63
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      uint32_t const& e,  // occupies the operand slot right after B; presumably the sparsity metadata of wgmma.sp -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime operand; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // wgmma path only when this macro is defined; otherwise CUTE_INVALID_CONTROL_PATH
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %71, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n128k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      "{%64,  %65,  %66,  %67},"
+      " %68,"
+      " %69, %70,"
+      " p,    %72,  %73;\n"  // p, then the immediates scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0-%63: accumulators (read-write)
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %64-%67
+         "l"(desc_b),  // %68
+         "r"(e), "n"(int32_t(spsel)),  // %69, %70
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %71 (tested by setp), %72, %73
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x16 TN F32+=TF32*TF32 -- SS form: A and B are both passed as 64-bit descriptor operands
+template <  // all three parameters are bound below as immediate ("n") asm operands
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x192x16_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;  // no separate D list; the d* accumulators are read-write ("+f") asm operands
+  using ARegisters = uint64_t[1];  // same count/type as fma's desc_a
+  using ERegisters = uint32_t[1];  // same count/type as fma's e
+  using BRegisters = uint64_t[1];  // same count/type as fma's desc_b
+  using CRegisters = float[96];  // same count/type as fma's d00..d95
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      float         & d92, float         & d93, float         & d94, float         & d95,
+      uint32_t const& e,  // occupies the operand slot right after B; presumably the sparsity metadata of wgmma.sp -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime operand; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // wgmma path only when this macro is defined; otherwise CUTE_INVALID_CONTROL_PATH
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %100, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n192k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      " %96,"
+      " %97,"
+      " %98, %99,"
+      " p,    %101, %102;\n"  // p, then the immediates scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0-%95: accumulators (read-write)
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
+        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
+      :  "l"(desc_a),  // %96
+         "l"(desc_b),  // %97
+         "r"(e), "n"(int32_t(spsel)),  // %98, %99
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %100 (tested by setp), %101, %102
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x16 TN F32+=TF32*TF32 (RS form: A in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,      // emitted as immediate operand %104
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,      // emitted as immediate operand %105
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero    // emitted as immediate operand %102
+>
+struct GMMA_64x192x16_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];   // a00..a03
+  using ERegisters = uint32_t[1];   // e
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = float[96];     // d00..d95, bound read-write ("+f") in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      float         & d92, float         & d93, float         & d94, float         & d95,
+      uint32_t const& e,   // presumably the sparsity metadata word - confirm against the PTX wgmma.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)   // runtime value; compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);   // synclog hook; defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"                // predicate register local to this asm scope
+      "setp.ne.b32 p, %103, 0;\n"      // p = (scale_D != 0); %103 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n192k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      "{%96,  %97,  %98,  %99},"       // a00..a03
+      " %100,"                         // desc_b
+      " %101, %102,"                   // e, spsel
+      " p,    %104, %105;\n"           // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),   // %0..%95: accumulators, read-write
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
+        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),   // %96..%99
+         "l"(desc_b),                                 // %100
+         "r"(e), "n"(int32_t(spsel)),                 // %101 (register), %102 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));   // %103 (register), %104, %105 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");   // branch compiled when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x16 TN F32+=TF32*TF32 (SS form: A and B each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,      // emitted as immediate operand %133
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,      // emitted as immediate operand %134
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero    // emitted as immediate operand %131
+>
+struct GMMA_64x256x16_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];   // desc_a
+  using ERegisters = uint32_t[1];   // e
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = float[128];    // d000..d127, bound read-write ("+f") in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      float         & d124, float         & d125, float         & d126, float         & d127,
+      uint32_t const& e,   // presumably the sparsity metadata word - confirm against the PTX wgmma.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)   // runtime value; compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);   // synclog hook; defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"                // predicate register local to this asm scope
+      "setp.ne.b32 p, %132, 0;\n"      // p = (scale_D != 0); %132 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n256k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      " %128,"                         // desc_a
+      " %129,"                         // desc_b
+      " %130, %131,"                   // e, spsel
+      " p,    %133, %134;\n"           // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),   // %0..%127: accumulators, read-write
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
+        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
+      :  "l"(desc_a),                                 // %128
+         "l"(desc_b),                                 // %129
+         "r"(e), "n"(int32_t(spsel)),                 // %130 (register), %131 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));   // %132 (register), %133, %134 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");   // branch compiled when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x16 TN F32+=TF32*TF32 (RS form: A in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,      // emitted as immediate operand %136
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,      // emitted as immediate operand %137
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero    // emitted as immediate operand %134
+>
+struct GMMA_64x256x16_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];   // a000..a003
+  using ERegisters = uint32_t[1];   // e
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = float[128];    // d000..d127, bound read-write ("+f") in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      float         & d124, float         & d125, float         & d126, float         & d127,
+      uint32_t const& e,   // presumably the sparsity metadata word - confirm against the PTX wgmma.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)   // runtime value; compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);   // synclog hook; defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"                // predicate register local to this asm scope
+      "setp.ne.b32 p, %135, 0;\n"      // p = (scale_D != 0); %135 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n256k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      "{%128, %129, %130, %131},"      // a000..a003
+      " %132,"                         // desc_b
+      " %133, %134,"                   // e, spsel
+      " p,    %136, %137;\n"           // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),   // %0..%127: accumulators, read-write
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
+        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),   // %128..%131
+         "l"(desc_b),                                 // %132
+         "r"(e), "n"(int32_t(spsel)),                 // %133 (register), %134 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));   // %135 (register), %136, %137 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");   // branch compiled when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x64 TN S32+=S8*S8 (SS form: A and B each passed as one 64-bit descriptor)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero    // emitted as immediate operand %7
+>
+struct GMMA_64x8x64_S32S8S8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];   // desc_a
+  using ERegisters = uint32_t[1];   // e
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = uint32_t[4];   // d0..d3, bound read-write ("+r") in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& e,   // presumably the sparsity metadata word - confirm against the PTX wgmma.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)   // runtime value; compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);   // synclog hook; defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"              // predicate register local to this asm scope
+      "setp.ne.b32 p, %8, 0;\n"      // p = (scale_D != 0); %8 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.s8.s8 "
+      "{%0,  %1,  %2,  %3},"         // d0..d3
+      " %4,"                         // desc_a
+      " %5,"                         // desc_b
+      " %6, %7,"                     // e, spsel
+      " p;\n"                        // no scaleA/scaleB operands in this integer form
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)   // %0..%3: accumulators, read-write
+      :  "l"(desc_a),                            // %4
+         "l"(desc_b),                            // %5
+         "r"(e), "n"(int32_t(spsel)),            // %6 (register), %7 (immediate)
+         "r"(int32_t(scale_D)));                 // %8 (register)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");   // branch compiled when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x64 TN S32+=S8*S8 (SS form, both operands as 64-bit descriptors; emits the .satfinite variant)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero    // emitted as immediate operand %7
+>
+struct GMMA_64x8x64_S32S8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];   // desc_a
+  using ERegisters = uint32_t[1];   // e
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = uint32_t[4];   // d0..d3, bound read-write ("+r") in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& e,   // presumably the sparsity metadata word - confirm against the PTX wgmma.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)   // runtime value; compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);   // synclog hook; defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"              // predicate register local to this asm scope
+      "setp.ne.b32 p, %8, 0;\n"      // p = (scale_D != 0); %8 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.s8.s8.satfinite "
+      "{%0,  %1,  %2,  %3},"         // d0..d3
+      " %4,"                         // desc_a
+      " %5,"                         // desc_b
+      " %6, %7,"                     // e, spsel
+      " p;\n"                        // no scaleA/scaleB operands in this integer form
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)   // %0..%3: accumulators, read-write
+      :  "l"(desc_a),                            // %4
+         "l"(desc_b),                            // %5
+         "r"(e), "n"(int32_t(spsel)),            // %6 (register), %7 (immediate)
+         "r"(int32_t(scale_D)));                 // %8 (register)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");   // branch compiled when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x64 TN S32+=S8*S8 (SS form: A and B each passed as one 64-bit descriptor)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero    // emitted as immediate operand %11
+>
+struct GMMA_64x16x64_S32S8S8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];   // desc_a
+  using ERegisters = uint32_t[1];   // e
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = uint32_t[8];   // d0..d7, bound read-write ("+r") in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      uint32_t const& e,   // presumably the sparsity metadata word - confirm against the PTX wgmma.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)   // runtime value; compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);   // synclog hook; defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"               // predicate register local to this asm scope
+      "setp.ne.b32 p, %12, 0;\n"      // p = (scale_D != 0); %12 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.s8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"   // d0..d7
+      " %8,"                          // desc_a
+      " %9,"                          // desc_b
+      " %10, %11,"                    // e, spsel
+      " p;\n"                         // no scaleA/scaleB operands in this integer form
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),   // %0..%7: accumulators, read-write
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "l"(desc_a),                             // %8
+         "l"(desc_b),                             // %9
+         "r"(e), "n"(int32_t(spsel)),             // %10 (register), %11 (immediate)
+         "r"(int32_t(scale_D)));                  // %12 (register)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");   // branch compiled when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x64 TN S32+=S8*S8 (SS form, both operands as 64-bit descriptors; emits the .satfinite variant)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero    // emitted as immediate operand %11
+>
+struct GMMA_64x16x64_S32S8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];   // desc_a
+  using ERegisters = uint32_t[1];   // e
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = uint32_t[8];   // d0..d7, bound read-write ("+r") in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      uint32_t const& e,   // presumably the sparsity metadata word - confirm against the PTX wgmma.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)   // runtime value; compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);   // synclog hook; defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"               // predicate register local to this asm scope
+      "setp.ne.b32 p, %12, 0;\n"      // p = (scale_D != 0); %12 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.s8.s8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"   // d0..d7
+      " %8,"                          // desc_a
+      " %9,"                          // desc_b
+      " %10, %11,"                    // e, spsel
+      " p;\n"                         // no scaleA/scaleB operands in this integer form
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),   // %0..%7: accumulators, read-write
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "l"(desc_a),                             // %8
+         "l"(desc_b),                             // %9
+         "r"(e), "n"(int32_t(spsel)),             // %10 (register), %11 (immediate)
+         "r"(int32_t(scale_D)));                  // %12 (register)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");   // branch compiled when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x64 TN S32+=S8*S8 (SS form: A and B each passed as one 64-bit descriptor)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero    // emitted as immediate operand %19
+>
+struct GMMA_64x32x64_S32S8S8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];    // desc_a
+  using ERegisters = uint32_t[1];    // e
+  using BRegisters = uint64_t[1];    // desc_b
+  using CRegisters = uint32_t[16];   // d00..d15, bound read-write ("+r") in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t const& e,   // presumably the sparsity metadata word - confirm against the PTX wgmma.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)   // runtime value; compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);   // synclog hook; defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"               // predicate register local to this asm scope
+      "setp.ne.b32 p, %20, 0;\n"      // p = (scale_D != 0); %20 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.s8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      " %16,"                         // desc_a
+      " %17,"                         // desc_b
+      " %18, %19,"                    // e, spsel
+      " p;\n"                         // no scaleA/scaleB operands in this integer form
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),   // %0..%15: accumulators, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "l"(desc_a),                             // %16
+         "l"(desc_b),                             // %17
+         "r"(e), "n"(int32_t(spsel)),             // %18 (register), %19 (immediate)
+         "r"(int32_t(scale_D)));                  // %20 (register)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");   // branch compiled when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x64 TN S32+=S8*S8 (SS form, both operands as 64-bit descriptors; emits the .satfinite variant)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero    // emitted as immediate operand %19
+>
+struct GMMA_64x32x64_S32S8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];    // desc_a
+  using ERegisters = uint32_t[1];    // e
+  using BRegisters = uint64_t[1];    // desc_b
+  using CRegisters = uint32_t[16];   // d00..d15, bound read-write ("+r") in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t const& e,   // presumably the sparsity metadata word - confirm against the PTX wgmma.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)   // runtime value; compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);   // synclog hook; defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"               // predicate register local to this asm scope
+      "setp.ne.b32 p, %20, 0;\n"      // p = (scale_D != 0); %20 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.s8.s8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      " %16,"                         // desc_a
+      " %17,"                         // desc_b
+      " %18, %19,"                    // e, spsel
+      " p;\n"                         // no scaleA/scaleB operands in this integer form
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),   // %0..%15: accumulators, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "l"(desc_a),                             // %16
+         "l"(desc_b),                             // %17
+         "r"(e), "n"(int32_t(spsel)),             // %18 (register), %19 (immediate)
+         "r"(int32_t(scale_D)));                  // %20 (register)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");   // branch compiled when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x64 TN S32+=S8*S8 (SS form: A and B each passed as one 64-bit descriptor)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero    // emitted as immediate operand %35
+>
+struct GMMA_64x64x64_S32S8S8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];    // desc_a
+  using ERegisters = uint32_t[1];    // e
+  using BRegisters = uint64_t[1];    // desc_b
+  using CRegisters = uint32_t[32];   // d00..d31, bound read-write ("+r") in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t const& e,   // presumably the sparsity metadata word - confirm against the PTX wgmma.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)   // runtime value; compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);   // synclog hook; defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"               // predicate register local to this asm scope
+      "setp.ne.b32 p, %36, 0;\n"      // p = (scale_D != 0); %36 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.s8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      " %32,"                         // desc_a
+      " %33,"                         // desc_b
+      " %34, %35,"                    // e, spsel
+      " p;\n"                         // no scaleA/scaleB operands in this integer form
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),   // %0..%31: accumulators, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "l"(desc_a),                             // %32
+         "l"(desc_b),                             // %33
+         "r"(e), "n"(int32_t(spsel)),             // %34 (register), %35 (immediate)
+         "r"(int32_t(scale_D)));                  // %36 (register)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");   // branch compiled when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x64 TN S32+=S8*S8, saturating variant (the wgmma mnemonic below carries .satfinite)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as a compile-time immediate ("n" operand)
+>
+struct GMMA_64x64x64_S32S8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;          // NOTE(review): presumably void because the result overwrites the C registers ("+r" operands in fma) - confirm
+  using ARegisters = uint64_t[1];   // one 64-bit value; corresponds to fma's desc_a
+  using ERegisters = uint32_t[1];   // one 32-bit value; corresponds to fma's e
+  using BRegisters = uint64_t[1];   // one 64-bit value; corresponds to fma's desc_b
+  using CRegisters = uint32_t[32];  // 32 x 32-bit accumulator values; correspond to fma's d00..d31
+
+  CUTE_HOST_DEVICE static void  // issues a single sparse wgmma instruction; d00..d31 are read and written in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the call-site line and both descriptors
+    asm volatile(  // operand numbering: %0-%31 = d00..d31, %32 = desc_a, %33 = desc_b, %34 = e, %35 = spsel, %36 = scale_D
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %36, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.s8.s8.satfinite "  // NOTE(review): per the PTX ISA, .satfinite should clamp the s32 results on overflow - confirm
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      " %32,"
+      " %33,"
+      " %34, %35,"
+      " p;\n"  // last operand is predicate p; NOTE(review): per the PTX ISA this is the scale-d input (false should discard the previous accumulator) - confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, only the diagnostic macro below is invoked
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x64 TN S32+=S8*S8, non-saturating variant (no .satfinite on the wgmma mnemonic below)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as a compile-time immediate ("n" operand)
+>
+struct GMMA_64x96x64_S32S8S8_SS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably void because the result overwrites the C registers ("+r" operands in fma) - confirm
+  using ARegisters = uint64_t[1];   // one 64-bit value; corresponds to fma's desc_a
+  using ERegisters = uint32_t[1];   // one 32-bit value; corresponds to fma's e
+  using BRegisters = uint64_t[1];   // one 64-bit value; corresponds to fma's desc_b
+  using CRegisters = uint32_t[48];  // 48 x 32-bit accumulator values; correspond to fma's d00..d47
+
+  CUTE_HOST_DEVICE static void  // issues a single sparse wgmma instruction; d00..d47 are read and written in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the call-site line and both descriptors
+    asm volatile(  // operand numbering: %0-%47 = d00..d47, %48 = desc_a, %49 = desc_b, %50 = e, %51 = spsel, %52 = scale_D
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %52, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.s8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"
+      " %48,"
+      " %49,"
+      " %50, %51,"
+      " p;\n"  // last operand is predicate p; NOTE(review): per the PTX ISA this is the scale-d input (false should discard the previous accumulator) - confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, only the diagnostic macro below is invoked
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x64 TN S32+=S8*S8, saturating variant (the wgmma mnemonic below carries .satfinite)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as a compile-time immediate ("n" operand)
+>
+struct GMMA_64x96x64_S32S8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;          // NOTE(review): presumably void because the result overwrites the C registers ("+r" operands in fma) - confirm
+  using ARegisters = uint64_t[1];   // one 64-bit value; corresponds to fma's desc_a
+  using ERegisters = uint32_t[1];   // one 32-bit value; corresponds to fma's e
+  using BRegisters = uint64_t[1];   // one 64-bit value; corresponds to fma's desc_b
+  using CRegisters = uint32_t[48];  // 48 x 32-bit accumulator values; correspond to fma's d00..d47
+
+  CUTE_HOST_DEVICE static void  // issues a single sparse wgmma instruction; d00..d47 are read and written in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the call-site line and both descriptors
+    asm volatile(  // operand numbering: %0-%47 = d00..d47, %48 = desc_a, %49 = desc_b, %50 = e, %51 = spsel, %52 = scale_D
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %52, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.s8.s8.satfinite "  // NOTE(review): per the PTX ISA, .satfinite should clamp the s32 results on overflow - confirm
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"
+      " %48,"
+      " %49,"
+      " %50, %51,"
+      " p;\n"  // last operand is predicate p; NOTE(review): per the PTX ISA this is the scale-d input (false should discard the previous accumulator) - confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, only the diagnostic macro below is invoked
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x64 TN S32+=S8*S8, non-saturating variant (no .satfinite on the wgmma mnemonic below)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as a compile-time immediate ("n" operand)
+>
+struct GMMA_64x128x64_S32S8S8_SS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably void because the result overwrites the C registers ("+r" operands in fma) - confirm
+  using ARegisters = uint64_t[1];   // one 64-bit value; corresponds to fma's desc_a
+  using ERegisters = uint32_t[1];   // one 32-bit value; corresponds to fma's e
+  using BRegisters = uint64_t[1];   // one 64-bit value; corresponds to fma's desc_b
+  using CRegisters = uint32_t[64];  // 64 x 32-bit accumulator values; correspond to fma's d00..d63
+
+  CUTE_HOST_DEVICE static void  // issues a single sparse wgmma instruction; d00..d63 are read and written in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the call-site line and both descriptors
+    asm volatile(  // operand numbering: %0-%63 = d00..d63, %64 = desc_a, %65 = desc_b, %66 = e, %67 = spsel, %68 = scale_D
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %68, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.s8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      " %64,"
+      " %65,"
+      " %66, %67,"
+      " p;\n"  // last operand is predicate p; NOTE(review): per the PTX ISA this is the scale-d input (false should discard the previous accumulator) - confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, only the diagnostic macro below is invoked
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x64 TN S32+=S8*S8, saturating variant (the wgmma mnemonic below carries .satfinite)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as a compile-time immediate ("n" operand)
+>
+struct GMMA_64x128x64_S32S8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;          // NOTE(review): presumably void because the result overwrites the C registers ("+r" operands in fma) - confirm
+  using ARegisters = uint64_t[1];   // one 64-bit value; corresponds to fma's desc_a
+  using ERegisters = uint32_t[1];   // one 32-bit value; corresponds to fma's e
+  using BRegisters = uint64_t[1];   // one 64-bit value; corresponds to fma's desc_b
+  using CRegisters = uint32_t[64];  // 64 x 32-bit accumulator values; correspond to fma's d00..d63
+
+  CUTE_HOST_DEVICE static void  // issues a single sparse wgmma instruction; d00..d63 are read and written in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the call-site line and both descriptors
+    asm volatile(  // operand numbering: %0-%63 = d00..d63, %64 = desc_a, %65 = desc_b, %66 = e, %67 = spsel, %68 = scale_D
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %68, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.s8.s8.satfinite "  // NOTE(review): per the PTX ISA, .satfinite should clamp the s32 results on overflow - confirm
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      " %64,"
+      " %65,"
+      " %66, %67,"
+      " p;\n"  // last operand is predicate p; NOTE(review): per the PTX ISA this is the scale-d input (false should discard the previous accumulator) - confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, only the diagnostic macro below is invoked
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x64 TN S32+=S8*S8, non-saturating variant (no .satfinite on the wgmma mnemonic below)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as a compile-time immediate ("n" operand)
+>
+struct GMMA_64x192x64_S32S8S8_SS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably void because the result overwrites the C registers ("+r" operands in fma) - confirm
+  using ARegisters = uint64_t[1];   // one 64-bit value; corresponds to fma's desc_a
+  using ERegisters = uint32_t[1];   // one 32-bit value; corresponds to fma's e
+  using BRegisters = uint64_t[1];   // one 64-bit value; corresponds to fma's desc_b
+  using CRegisters = uint32_t[96];  // 96 x 32-bit accumulator values; correspond to fma's d00..d95
+
+  CUTE_HOST_DEVICE static void  // issues a single sparse wgmma instruction; d00..d95 are read and written in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
+      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the call-site line and both descriptors
+    asm volatile(  // operand numbering: %0-%95 = d00..d95, %96 = desc_a, %97 = desc_b, %98 = e, %99 = spsel, %100 = scale_D
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %100, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.s8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      " %96,"
+      " %97,"
+      " %98, %99,"
+      " p;\n"  // last operand is predicate p; NOTE(review): per the PTX ISA this is the scale-d input (false should discard the previous accumulator) - confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
+        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
+        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, only the diagnostic macro below is invoked
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x64 TN S32+=S8*S8, saturating variant (the wgmma mnemonic below carries .satfinite)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as a compile-time immediate ("n" operand)
+>
+struct GMMA_64x192x64_S32S8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;          // NOTE(review): presumably void because the result overwrites the C registers ("+r" operands in fma) - confirm
+  using ARegisters = uint64_t[1];   // one 64-bit value; corresponds to fma's desc_a
+  using ERegisters = uint32_t[1];   // one 32-bit value; corresponds to fma's e
+  using BRegisters = uint64_t[1];   // one 64-bit value; corresponds to fma's desc_b
+  using CRegisters = uint32_t[96];  // 96 x 32-bit accumulator values; correspond to fma's d00..d95
+
+  CUTE_HOST_DEVICE static void  // issues a single sparse wgmma instruction; d00..d95 are read and written in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
+      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the call-site line and both descriptors
+    asm volatile(  // operand numbering: %0-%95 = d00..d95, %96 = desc_a, %97 = desc_b, %98 = e, %99 = spsel, %100 = scale_D
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %100, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.s8.s8.satfinite "  // NOTE(review): per the PTX ISA, .satfinite should clamp the s32 results on overflow - confirm
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      " %96,"
+      " %97,"
+      " %98, %99,"
+      " p;\n"  // last operand is predicate p; NOTE(review): per the PTX ISA this is the scale-d input (false should discard the previous accumulator) - confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
+        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
+        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, only the diagnostic macro below is invoked
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x64 TN S32+=S8*S8
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x256x64_S32S8S8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[128];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
+      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %132, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.s8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      " %128,"
+      " %129,"
+      " %130, %131,"
+      " p;\n"
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
+        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
+        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x64 TN S32+=S8*S8 -- SS form (A and B each passed as a 64-bit descriptor), emitted with .satfinite
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector; reaches the PTX as an immediate ("n") operand
+>
+struct GMMA_64x256x64_S32S8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[128];  // one fma() reference parameter per accumulator register (d000..d127)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
+      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata word (PTX sp-meta) -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %132, 0;\n"  // p = (scale_D != 0); %132 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.s8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      " %128,"
+      " %129,"
+      " %130, %131,"
+      " p;\n"
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
+        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
+        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
+      :  "l"(desc_a),  // %128: A descriptor
+         "l"(desc_b),  // %129: B descriptor
+         "r"(e), "n"(int32_t(spsel)),  // %130: e, %131: spsel as an immediate
+         "r"(int32_t(scale_D)));  // %132: tested by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x64 TN S32+=S8*S8 -- RS form (A in four 32-bit registers, B passed as a 64-bit descriptor)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector; reaches the PTX as an immediate ("n") operand
+>
+struct GMMA_64x8x64_S32S8S8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[4];  // one fma() reference parameter per accumulator register (d0..d3)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata word (PTX sp-meta) -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %11, 0;\n"  // p = (scale_D != 0); %11 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.s8.s8 "
+      "{%0,  %1,  %2,  %3},"
+      "{%4,  %5,  %6,  %7},"
+      " %8,"
+      " %9, %10,"
+      " p;\n"
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),  // %4-%7: A operand registers
+         "l"(desc_b),  // %8: B descriptor
+         "r"(e), "n"(int32_t(spsel)),  // %9: e, %10: spsel as an immediate
+         "r"(int32_t(scale_D)));  // %11: tested by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x64 TN S32+=S8*S8 -- RS form (A in four 32-bit registers, B passed as a 64-bit descriptor), emitted with .satfinite
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector; reaches the PTX as an immediate ("n") operand
+>
+struct GMMA_64x8x64_S32S8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[4];  // one fma() reference parameter per accumulator register (d0..d3)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata word (PTX sp-meta) -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %11, 0;\n"  // p = (scale_D != 0); %11 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.s8.s8.satfinite "
+      "{%0,  %1,  %2,  %3},"
+      "{%4,  %5,  %6,  %7},"
+      " %8,"
+      " %9, %10,"
+      " p;\n"
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),  // %4-%7: A operand registers
+         "l"(desc_b),  // %8: B descriptor
+         "r"(e), "n"(int32_t(spsel)),  // %9: e, %10: spsel as an immediate
+         "r"(int32_t(scale_D)));  // %11: tested by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x64 TN S32+=S8*S8 -- RS form (A in four 32-bit registers, B passed as a 64-bit descriptor)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector; reaches the PTX as an immediate ("n") operand
+>
+struct GMMA_64x16x64_S32S8S8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[8];  // one fma() reference parameter per accumulator register (d0..d7)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata word (PTX sp-meta) -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %15, 0;\n"  // p = (scale_D != 0); %15 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.s8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
+      "{%8,  %9,  %10, %11},"
+      " %12,"
+      " %13, %14,"
+      " p;\n"
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),  // %8-%11: A operand registers
+         "l"(desc_b),  // %12: B descriptor
+         "r"(e), "n"(int32_t(spsel)),  // %13: e, %14: spsel as an immediate
+         "r"(int32_t(scale_D)));  // %15: tested by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x64 TN S32+=S8*S8 -- RS form (A in four 32-bit registers, B passed as a 64-bit descriptor), emitted with .satfinite
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector; reaches the PTX as an immediate ("n") operand
+>
+struct GMMA_64x16x64_S32S8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[8];  // one fma() reference parameter per accumulator register (d0..d7)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata word (PTX sp-meta) -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %15, 0;\n"  // p = (scale_D != 0); %15 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.s8.s8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
+      "{%8,  %9,  %10, %11},"
+      " %12,"
+      " %13, %14,"
+      " p;\n"
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),  // %8-%11: A operand registers
+         "l"(desc_b),  // %12: B descriptor
+         "r"(e), "n"(int32_t(spsel)),  // %13: e, %14: spsel as an immediate
+         "r"(int32_t(scale_D)));  // %15: tested by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x64 TN S32+=S8*S8 -- RS form (A in four 32-bit registers, B passed as a 64-bit descriptor)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector; reaches the PTX as an immediate ("n") operand
+>
+struct GMMA_64x32x64_S32S8S8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[16];  // one fma() reference parameter per accumulator register (d00..d15)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata word (PTX sp-meta) -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %23, 0;\n"  // p = (scale_D != 0); %23 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.s8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      "{%16, %17, %18, %19},"
+      " %20,"
+      " %21, %22,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %16-%19: A operand registers
+         "l"(desc_b),  // %20: B descriptor
+         "r"(e), "n"(int32_t(spsel)),  // %21: e, %22: spsel as an immediate
+         "r"(int32_t(scale_D)));  // %23: tested by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x64 TN S32+=S8*S8 -- RS form (A in four 32-bit registers, B passed as a 64-bit descriptor), emitted with .satfinite
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector; reaches the PTX as an immediate ("n") operand
+>
+struct GMMA_64x32x64_S32S8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[16];  // one fma() reference parameter per accumulator register (d00..d15)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata word (PTX sp-meta) -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %23, 0;\n"  // p = (scale_D != 0); %23 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.s8.s8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      "{%16, %17, %18, %19},"
+      " %20,"
+      " %21, %22,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %16-%19: A operand registers
+         "l"(desc_b),  // %20: B descriptor
+         "r"(e), "n"(int32_t(spsel)),  // %21: e, %22: spsel as an immediate
+         "r"(int32_t(scale_D)));  // %23: tested by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x64 TN S32+=S8*S8 -- RS form (A in four 32-bit registers, B passed as a 64-bit descriptor)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector; reaches the PTX as an immediate ("n") operand
+>
+struct GMMA_64x64x64_S32S8S8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[32];  // one fma() reference parameter per accumulator register (d00..d31)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata word (PTX sp-meta) -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %39, 0;\n"  // p = (scale_D != 0); %39 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.s8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      "{%32, %33, %34, %35},"
+      " %36,"
+      " %37, %38,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %32-%35: A operand registers
+         "l"(desc_b),  // %36: B descriptor
+         "r"(e), "n"(int32_t(spsel)),  // %37: e, %38: spsel as an immediate
+         "r"(int32_t(scale_D)));  // %39: tested by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x64 TN S32+=S8*S8 -- RS form (A in four 32-bit registers, B passed as a 64-bit descriptor), emitted with .satfinite
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector; reaches the PTX as an immediate ("n") operand
+>
+struct GMMA_64x64x64_S32S8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[32];  // one fma() reference parameter per accumulator register (d00..d31)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata word (PTX sp-meta) -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %39, 0;\n"  // p = (scale_D != 0); %39 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.s8.s8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      "{%32, %33, %34, %35},"
+      " %36,"
+      " %37, %38,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %32-%35: A operand registers
+         "l"(desc_b),  // %36: B descriptor
+         "r"(e), "n"(int32_t(spsel)),  // %37: e, %38: spsel as an immediate
+         "r"(int32_t(scale_D)));  // %39: tested by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x64 TN S32+=S8*S8 -- RS form (A in four 32-bit registers, B passed as a 64-bit descriptor)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector; reaches the PTX as an immediate ("n") operand
+>
+struct GMMA_64x96x64_S32S8S8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[48];  // one fma() reference parameter per accumulator register (d00..d47)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata word (PTX sp-meta) -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %55, 0;\n"  // p = (scale_D != 0); %55 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.s8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
+      "{%48,  %49,  %50,  %51},"
+      " %52,"
+      " %53, %54,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %48-%51: A operand registers
+         "l"(desc_b),  // %52: B descriptor
+         "r"(e), "n"(int32_t(spsel)),  // %53: e, %54: spsel as an immediate
+         "r"(int32_t(scale_D)));  // %55: tested by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x64 TN S32+=S8*S8 -- RS form (A in four 32-bit registers, B passed as a 64-bit descriptor), emitted with .satfinite
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector; reaches the PTX as an immediate ("n") operand
+>
+struct GMMA_64x96x64_S32S8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[48];  // one fma() reference parameter per accumulator register (d00..d47)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata word (PTX sp-meta) -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %55, 0;\n"  // p = (scale_D != 0); %55 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.s8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
+      "{%48,  %49,  %50,  %51},"
+      " %52,"
+      " %53, %54,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %48-%51: A operand registers
+         "l"(desc_b),  // %52: B descriptor
+         "r"(e), "n"(int32_t(spsel)),  // %53: e, %54: spsel as an immediate
+         "r"(int32_t(scale_D)));  // %55: tested by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x64 TN S32+=S8*S8
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector, forwarded to the PTX instruction as an immediate ("n") operand
+>
+struct GMMA_64x128x64_S32S8S8_RS_TN  // wraps one wgmma.mma_async.sp m64n128k64 .s32.s8.s8 instruction (form without .satfinite)
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the accumulators in place
+  using ARegisters = uint32_t[4];  // A is passed as four 32-bit registers
+  using ERegisters = uint32_t[1];  // one 32-bit register `e` (presumably the sparsity metadata -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // B is passed as a single 64-bit value (desc_b)
+  using CRegisters = uint32_t[64];  // 64 32-bit accumulator registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand registers (inputs)
+      uint64_t const& desc_b,  // 64-bit B operand (input; presumably a shared-memory matrix descriptor -- confirm)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d63: accumulators, read and written ("+r")
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t const& e,  // 32-bit input bound to the asm operand that follows desc_b
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p below (p = scale_D != 0)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // sync-log hook; receives this source line number and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %71, 0;\n"  // %71 is int32_t(scale_D): p is true iff scale_D is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.s8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%63: accumulators d00..d63
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      "{%64,  %65,  %66,  %67},"  // %64..%67: a00..a03
+      " %68,"  // %68: desc_b
+      " %69, %70,"  // %69: e, %70: spsel immediate
+      " p;\n"  // predicate derived from scale_D above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands 0..63: every accumulator is both input and output
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // operands 64..67
+         "l"(desc_b),  // operand 68 (64-bit register)
+         "r"(e), "n"(int32_t(spsel)),  // operands 69 and 70; spsel must be a compile-time constant
+         "r"(int32_t(scale_D)));  // operand 71, consumed only by setp
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x64 TN S32+=S8*S8 (saturating variant, .satfinite)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector, forwarded to the PTX instruction as an immediate ("n") operand
+>
+struct GMMA_64x128x64_S32S8S8_RS_TN_SATURATE  // wraps one wgmma.mma_async.sp m64n128k64 .s32.s8.s8.satfinite instruction
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the accumulators in place
+  using ARegisters = uint32_t[4];  // A is passed as four 32-bit registers
+  using ERegisters = uint32_t[1];  // one 32-bit register `e` (presumably the sparsity metadata -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // B is passed as a single 64-bit value (desc_b)
+  using CRegisters = uint32_t[64];  // 64 32-bit accumulator registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand registers (inputs)
+      uint64_t const& desc_b,  // 64-bit B operand (input; presumably a shared-memory matrix descriptor -- confirm)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d63: accumulators, read and written ("+r")
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t const& e,  // 32-bit input bound to the asm operand that follows desc_b
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p below (p = scale_D != 0)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // sync-log hook; receives this source line number and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %71, 0;\n"  // %71 is int32_t(scale_D): p is true iff scale_D is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.s8.s8.satfinite "  // .satfinite qualifier: saturating form per the PTX ISA (confirm exact semantics there)
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%63: accumulators d00..d63
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      "{%64,  %65,  %66,  %67},"  // %64..%67: a00..a03
+      " %68,"  // %68: desc_b
+      " %69, %70,"  // %69: e, %70: spsel immediate
+      " p;\n"  // predicate derived from scale_D above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands 0..63: every accumulator is both input and output
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // operands 64..67
+         "l"(desc_b),  // operand 68 (64-bit register)
+         "r"(e), "n"(int32_t(spsel)),  // operands 69 and 70; spsel must be a compile-time constant
+         "r"(int32_t(scale_D)));  // operand 71, consumed only by setp
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x64 TN S32+=S8*S8
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector, forwarded to the PTX instruction as an immediate ("n") operand
+>
+struct GMMA_64x192x64_S32S8S8_RS_TN  // wraps one wgmma.mma_async.sp m64n192k64 .s32.s8.s8 instruction (form without .satfinite)
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the accumulators in place
+  using ARegisters = uint32_t[4];  // A is passed as four 32-bit registers
+  using ERegisters = uint32_t[1];  // one 32-bit register `e` (presumably the sparsity metadata -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // B is passed as a single 64-bit value (desc_b)
+  using CRegisters = uint32_t[96];  // 96 32-bit accumulator registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand registers (inputs)
+      uint64_t const& desc_b,  // 64-bit B operand (input; presumably a shared-memory matrix descriptor -- confirm)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d95: accumulators, read and written ("+r")
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
+      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
+      uint32_t const& e,  // 32-bit input bound to the asm operand that follows desc_b
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p below (p = scale_D != 0)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // sync-log hook; receives this source line number and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %103, 0;\n"  // %103 is int32_t(scale_D): p is true iff scale_D is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.s8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%95: accumulators d00..d95
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      "{%96,  %97,  %98,  %99},"  // %96..%99: a00..a03
+      " %100,"  // %100: desc_b
+      " %101, %102,"  // %101: e, %102: spsel immediate
+      " p;\n"  // predicate derived from scale_D above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands 0..95: every accumulator is both input and output
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
+        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
+        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // operands 96..99
+         "l"(desc_b),  // operand 100 (64-bit register)
+         "r"(e), "n"(int32_t(spsel)),  // operands 101 and 102; spsel must be a compile-time constant
+         "r"(int32_t(scale_D)));  // operand 103, consumed only by setp
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x64 TN S32+=S8*S8 (saturating variant, .satfinite)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector, forwarded to the PTX instruction as an immediate ("n") operand
+>
+struct GMMA_64x192x64_S32S8S8_RS_TN_SATURATE  // wraps one wgmma.mma_async.sp m64n192k64 .s32.s8.s8.satfinite instruction
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the accumulators in place
+  using ARegisters = uint32_t[4];  // A is passed as four 32-bit registers
+  using ERegisters = uint32_t[1];  // one 32-bit register `e` (presumably the sparsity metadata -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // B is passed as a single 64-bit value (desc_b)
+  using CRegisters = uint32_t[96];  // 96 32-bit accumulator registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand registers (inputs)
+      uint64_t const& desc_b,  // 64-bit B operand (input; presumably a shared-memory matrix descriptor -- confirm)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d95: accumulators, read and written ("+r")
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
+      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
+      uint32_t const& e,  // 32-bit input bound to the asm operand that follows desc_b
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p below (p = scale_D != 0)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // sync-log hook; receives this source line number and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %103, 0;\n"  // %103 is int32_t(scale_D): p is true iff scale_D is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.s8.s8.satfinite "  // .satfinite qualifier: saturating form per the PTX ISA (confirm exact semantics there)
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%95: accumulators d00..d95
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      "{%96,  %97,  %98,  %99},"  // %96..%99: a00..a03
+      " %100,"  // %100: desc_b
+      " %101, %102,"  // %101: e, %102: spsel immediate
+      " p;\n"  // predicate derived from scale_D above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands 0..95: every accumulator is both input and output
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
+        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
+        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // operands 96..99
+         "l"(desc_b),  // operand 100 (64-bit register)
+         "r"(e), "n"(int32_t(spsel)),  // operands 101 and 102; spsel must be a compile-time constant
+         "r"(int32_t(scale_D)));  // operand 103, consumed only by setp
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x64 TN S32+=S8*S8
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector, forwarded to the PTX instruction as an immediate ("n") operand
+>
+struct GMMA_64x256x64_S32S8S8_RS_TN  // wraps one wgmma.mma_async.sp m64n256k64 .s32.s8.s8 instruction (form without .satfinite)
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the accumulators in place
+  using ARegisters = uint32_t[4];  // A is passed as four 32-bit registers
+  using ERegisters = uint32_t[1];  // one 32-bit register `e` (presumably the sparsity metadata -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // B is passed as a single 64-bit value (desc_b)
+  using CRegisters = uint32_t[128];  // 128 32-bit accumulator registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A operand registers (inputs)
+      uint64_t const& desc_b,  // 64-bit B operand (input; presumably a shared-memory matrix descriptor -- confirm)
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,  // d000..d127: accumulators, read and written ("+r")
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
+      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
+      uint32_t const& e,  // 32-bit input bound to the asm operand that follows desc_b
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p below (p = scale_D != 0)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // sync-log hook; receives this source line number and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %135, 0;\n"  // %135 is int32_t(scale_D): p is true iff scale_D is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.s8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%127: accumulators d000..d127
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      "{%128, %129, %130, %131},"  // %128..%131: a000..a003
+      " %132,"  // %132: desc_b
+      " %133, %134,"  // %133: e, %134: spsel immediate
+      " p;\n"  // predicate derived from scale_D above
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // operands 0..127: every accumulator is both input and output
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
+        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
+        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // operands 128..131
+         "l"(desc_b),  // operand 132 (64-bit register)
+         "r"(e), "n"(int32_t(spsel)),  // operands 133 and 134; spsel must be a compile-time constant
+         "r"(int32_t(scale_D)));  // operand 135, consumed only by setp
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x64 TN S32+=S8*S8 (saturating variant, .satfinite)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x256x64_S32S8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[128];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
+      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %135, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.s8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      "{%128, %129, %130, %131},"
+      " %132,"
+      " %133, %134,"
+      " p;\n"
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
+        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
+        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x64 TN S32+=S8*U8 (SS form: A and B are both passed as 64-bit descriptors; no .satfinite qualifier)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the PTX instruction as an immediate ("n") operand
+>
+struct GMMA_64x8x64_S32S8U8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value: desc_a
+  using ERegisters = uint32_t[1];  // one 32-bit value: e
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = uint32_t[4];  // four 32-bit accumulators: d0..d3
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,  // read-modify-write ("+r") accumulators
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word (PTX sp-meta operand) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a nonzero value makes predicate p true in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %8, 0;\n"  // p = (scale_D != 0); %8 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.s8.u8 "
+      "{%0,  %1,  %2,  %3},"  // accumulators d0..d3
+      " %4,"  // desc_a
+      " %5,"  // desc_b
+      " %6, %7,"  // e, spsel
+      " p;\n"
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x64 TN S32+=S8*U8 (SS form: A and B are both passed as 64-bit descriptors; issues the .satfinite variant)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the PTX instruction as an immediate ("n") operand
+>
+struct GMMA_64x8x64_S32S8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value: desc_a
+  using ERegisters = uint32_t[1];  // one 32-bit value: e
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = uint32_t[4];  // four 32-bit accumulators: d0..d3
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,  // read-modify-write ("+r") accumulators
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word (PTX sp-meta operand) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a nonzero value makes predicate p true in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %8, 0;\n"  // p = (scale_D != 0); %8 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.s8.u8.satfinite "
+      "{%0,  %1,  %2,  %3},"  // accumulators d0..d3
+      " %4,"  // desc_a
+      " %5,"  // desc_b
+      " %6, %7,"  // e, spsel
+      " p;\n"
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x64 TN S32+=S8*U8 (SS form: A and B are both passed as 64-bit descriptors; no .satfinite qualifier)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the PTX instruction as an immediate ("n") operand
+>
+struct GMMA_64x16x64_S32S8U8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value: desc_a
+  using ERegisters = uint32_t[1];  // one 32-bit value: e
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = uint32_t[8];  // eight 32-bit accumulators: d0..d7
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,  // d0..d7: read-modify-write ("+r") accumulators
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word (PTX sp-meta operand) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a nonzero value makes predicate p true in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %12, 0;\n"  // p = (scale_D != 0); %12 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.s8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // accumulators d0..d7
+      " %8,"  // desc_a
+      " %9,"  // desc_b
+      " %10, %11,"  // e, spsel
+      " p;\n"
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x64 TN S32+=S8*U8 (SS form: A and B are both passed as 64-bit descriptors; issues the .satfinite variant)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the PTX instruction as an immediate ("n") operand
+>
+struct GMMA_64x16x64_S32S8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value: desc_a
+  using ERegisters = uint32_t[1];  // one 32-bit value: e
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = uint32_t[8];  // eight 32-bit accumulators: d0..d7
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,  // d0..d7: read-modify-write ("+r") accumulators
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word (PTX sp-meta operand) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a nonzero value makes predicate p true in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %12, 0;\n"  // p = (scale_D != 0); %12 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.s8.u8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // accumulators d0..d7
+      " %8,"  // desc_a
+      " %9,"  // desc_b
+      " %10, %11,"  // e, spsel
+      " p;\n"
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x64 TN S32+=S8*U8 (SS form: A and B are both passed as 64-bit descriptors; no .satfinite qualifier)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the PTX instruction as an immediate ("n") operand
+>
+struct GMMA_64x32x64_S32S8U8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value: desc_a
+  using ERegisters = uint32_t[1];  // one 32-bit value: e
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = uint32_t[16];  // sixteen 32-bit accumulators: d00..d15
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,  // d00..d15: read-modify-write ("+r") accumulators
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word (PTX sp-meta operand) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a nonzero value makes predicate p true in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %20, 0;\n"  // p = (scale_D != 0); %20 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.s8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"  // accumulators d00..d15
+      " %16,"  // desc_a
+      " %17,"  // desc_b
+      " %18, %19,"  // e, spsel
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x64 TN S32+=S8*U8 (SS form: A and B are both passed as 64-bit descriptors; issues the .satfinite variant)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the PTX instruction as an immediate ("n") operand
+>
+struct GMMA_64x32x64_S32S8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value: desc_a
+  using ERegisters = uint32_t[1];  // one 32-bit value: e
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = uint32_t[16];  // sixteen 32-bit accumulators: d00..d15
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,  // d00..d15: read-modify-write ("+r") accumulators
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word (PTX sp-meta operand) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a nonzero value makes predicate p true in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %20, 0;\n"  // p = (scale_D != 0); %20 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.s8.u8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"  // accumulators d00..d15
+      " %16,"  // desc_a
+      " %17,"  // desc_b
+      " %18, %19,"  // e, spsel
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x64 TN S32+=S8*U8 (SS form: A and B are both passed as 64-bit descriptors; no .satfinite qualifier)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the PTX instruction as an immediate ("n") operand
+>
+struct GMMA_64x64x64_S32S8U8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value: desc_a
+  using ERegisters = uint32_t[1];  // one 32-bit value: e
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = uint32_t[32];  // 32 32-bit accumulators: d00..d31
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,  // d00..d31: read-modify-write ("+r") accumulators
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word (PTX sp-meta operand) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a nonzero value makes predicate p true in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %36, 0;\n"  // p = (scale_D != 0); %36 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.s8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"  // accumulators d00..d31
+      " %32,"  // desc_a
+      " %33,"  // desc_b
+      " %34, %35,"  // e, spsel
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x64 TN S32+=S8*U8 (SS form: A and B are both passed as 64-bit descriptors; issues the .satfinite variant)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the PTX instruction as an immediate ("n") operand
+>
+struct GMMA_64x64x64_S32S8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value: desc_a
+  using ERegisters = uint32_t[1];  // one 32-bit value: e
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = uint32_t[32];  // 32 32-bit accumulators: d00..d31
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,  // d00..d31: read-modify-write ("+r") accumulators
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word (PTX sp-meta operand) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a nonzero value makes predicate p true in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %36, 0;\n"  // p = (scale_D != 0); %36 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.s8.u8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"  // accumulators d00..d31
+      " %32,"  // desc_a
+      " %33,"  // desc_b
+      " %34, %35,"  // e, spsel
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x64 TN S32+=S8*U8 (SS form: A and B are both passed as 64-bit descriptors; no .satfinite qualifier)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the PTX instruction as an immediate ("n") operand
+>
+struct GMMA_64x96x64_S32S8U8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value: desc_a
+  using ERegisters = uint32_t[1];  // one 32-bit value: e
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = uint32_t[48];  // 48 32-bit accumulators: d00..d47
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,  // d00..d47: read-modify-write ("+r") accumulators
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word (PTX sp-meta operand) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a nonzero value makes predicate p true in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %52, 0;\n"  // p = (scale_D != 0); %52 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.s8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"  // accumulators d00..d47
+      " %48,"  // desc_a
+      " %49,"  // desc_b
+      " %50, %51,"  // e, spsel
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x64 TN S32+=S8*U8 (SS form: A and B are both passed as 64-bit descriptors; issues the .satfinite variant)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the PTX instruction as an immediate ("n") operand
+>
+struct GMMA_64x96x64_S32S8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value: desc_a
+  using ERegisters = uint32_t[1];  // one 32-bit value: e
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = uint32_t[48];  // 48 32-bit accumulators: d00..d47
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,  // d00..d47: read-modify-write ("+r") accumulators
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word (PTX sp-meta operand) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a nonzero value makes predicate p true in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %52, 0;\n"  // p = (scale_D != 0); %52 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.s8.u8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"  // accumulators d00..d47
+      " %48,"  // desc_a
+      " %49,"  // desc_b
+      " %50, %51,"  // e, spsel
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x64 TN S32+=S8*U8 (SS form: A and B are both passed as 64-bit descriptors; no .satfinite qualifier)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the PTX instruction as an immediate ("n") operand
+>
+struct GMMA_64x128x64_S32S8U8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value: desc_a
+  using ERegisters = uint32_t[1];  // one 32-bit value: e
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = uint32_t[64];  // 64 32-bit accumulators: d00..d63
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,  // d00..d63: read-modify-write ("+r") accumulators
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word (PTX sp-meta operand) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a nonzero value makes predicate p true in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %68, 0;\n"  // p = (scale_D != 0); %68 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"  // accumulators d00..d63
+      " %64,"  // desc_a
+      " %65,"  // desc_b
+      " %66, %67,"  // e, spsel
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x64 TN S32+=S8*U8 (SS form: A and B are both passed as 64-bit descriptors; issues the .satfinite variant)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the PTX instruction as an immediate ("n") operand
+>
+struct GMMA_64x128x64_S32S8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value: desc_a
+  using ERegisters = uint32_t[1];  // one 32-bit value: e
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = uint32_t[64];  // 64 32-bit accumulators: d00..d63
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,  // d00..d63: read-modify-write ("+r") accumulators
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word (PTX sp-meta operand) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a nonzero value makes predicate p true in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %68, 0;\n"  // p = (scale_D != 0); %68 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.s8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"  // accumulators d00..d63
+      " %64,"  // desc_a
+      " %65,"  // desc_b
+      " %66, %67,"  // e, spsel
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x64 TN S32+=S8*U8 (SS form: A and B are each passed as a 64-bit descriptor)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x192x64_S32S8U8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[96];  // one entry per accumulator argument d00..d95 of fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
+      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %100, 0;\n"  // %100 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      " %96,"  // desc_a
+      " %97,"  // desc_b
+      " %98, %99,"  // e, spsel
+      " p;\n"  // predicate computed from scale_D above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": accumulators are read-write operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
+        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
+        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // spsel (template parameter) is bound as an immediate ("n")
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x64 TN S32+=S8*U8, .satfinite variant (SS form: A and B are each passed as a 64-bit descriptor)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x192x64_S32S8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[96];  // one entry per accumulator argument d00..d95 of fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
+      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %100, 0;\n"  // %100 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.s8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      " %96,"  // desc_a
+      " %97,"  // desc_b
+      " %98, %99,"  // e, spsel
+      " p;\n"  // predicate computed from scale_D above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": accumulators are read-write operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
+        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
+        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // spsel (template parameter) is bound as an immediate ("n")
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x64 TN S32+=S8*U8 (SS form: A and B are each passed as a 64-bit descriptor)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x256x64_S32S8U8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[128];  // one entry per accumulator argument d000..d127 of fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
+      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %132, 0;\n"  // %132 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      " %128,"  // desc_a
+      " %129,"  // desc_b
+      " %130, %131,"  // e, spsel
+      " p;\n"  // predicate computed from scale_D above
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // "+r": accumulators are read-write operands
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
+        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
+        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // spsel (template parameter) is bound as an immediate ("n")
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x64 TN S32+=S8*U8, .satfinite variant (SS form: A and B are each passed as a 64-bit descriptor)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x256x64_S32S8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[128];  // one entry per accumulator argument d000..d127 of fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
+      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %132, 0;\n"  // %132 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.s8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      " %128,"  // desc_a
+      " %129,"  // desc_b
+      " %130, %131,"  // e, spsel
+      " p;\n"  // predicate computed from scale_D above
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // "+r": accumulators are read-write operands
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
+        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
+        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // spsel (template parameter) is bound as an immediate ("n")
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x64 TN S32+=S8*U8 (RS form: A in four 32-bit registers, B passed as a 64-bit descriptor)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x8x64_S32S8U8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[4];  // one entry per accumulator argument d0..d3 of fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %11, 0;\n"  // %11 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.s8.u8 "
+      "{%0,  %1,  %2,  %3},"  // d0..d3
+      "{%4,  %5,  %6,  %7},"  // a0..a3
+      " %8,"  // desc_b
+      " %9, %10,"  // e, spsel
+      " p;\n"  // predicate computed from scale_D above
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)  // "+r": accumulators are read-write operands
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // spsel (template parameter) is bound as an immediate ("n")
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x64 TN S32+=S8*U8, .satfinite variant (RS form: A in four 32-bit registers, B passed as a 64-bit descriptor)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x8x64_S32S8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[4];  // one entry per accumulator argument d0..d3 of fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %11, 0;\n"  // %11 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.s8.u8.satfinite "
+      "{%0,  %1,  %2,  %3},"  // d0..d3
+      "{%4,  %5,  %6,  %7},"  // a0..a3
+      " %8,"  // desc_b
+      " %9, %10,"  // e, spsel
+      " p;\n"  // predicate computed from scale_D above
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)  // "+r": accumulators are read-write operands
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // spsel (template parameter) is bound as an immediate ("n")
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x64 TN S32+=S8*U8 (RS form: A in four 32-bit registers, B passed as a 64-bit descriptor)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x16x64_S32S8U8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[8];  // one entry per accumulator argument d0..d7 of fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %15, 0;\n"  // %15 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.s8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // d0..d7
+      "{%8,  %9,  %10, %11},"  // a0..a3
+      " %12,"  // desc_b
+      " %13, %14,"  // e, spsel
+      " p;\n"  // predicate computed from scale_D above
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),  // "+r": accumulators are read-write operands
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // spsel (template parameter) is bound as an immediate ("n")
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x64 TN S32+=S8*U8, .satfinite variant (RS form: A in four 32-bit registers, B passed as a 64-bit descriptor)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x16x64_S32S8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[8];  // one entry per accumulator argument d0..d7 of fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %15, 0;\n"  // %15 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.s8.u8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // d0..d7
+      "{%8,  %9,  %10, %11},"  // a0..a3
+      " %12,"  // desc_b
+      " %13, %14,"  // e, spsel
+      " p;\n"  // predicate computed from scale_D above
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),  // "+r": accumulators are read-write operands
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // spsel (template parameter) is bound as an immediate ("n")
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x64 TN S32+=S8*U8 (RS form: A in four 32-bit registers, B passed as a 64-bit descriptor)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x32x64_S32S8U8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[16];  // one entry per accumulator argument d00..d15 of fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %23, 0;\n"  // %23 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.s8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"  // d00..d15
+      "{%16, %17, %18, %19},"  // a00..a03
+      " %20,"  // desc_b
+      " %21, %22,"  // e, spsel
+      " p;\n"  // predicate computed from scale_D above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": accumulators are read-write operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // spsel (template parameter) is bound as an immediate ("n")
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x64 TN S32+=S8*U8, saturating (.satfinite) variant -- A (a00..a03) and e from registers, B is a 64-bit descriptor, d00..d15 updated in place
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the instruction as a compile-time immediate ("n" constraint)
+>
+struct GMMA_64x32x64_S32S8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];   // same count as a00..a03 in fma()
+  using ERegisters = uint32_t[1];   // same count as e in fma()
+  using BRegisters = uint64_t[1];   // same count as desc_b in fma()
+  using CRegisters = uint32_t[16];  // same count as d00..d15 in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %23, 0;\n"  // p = (scale_D != 0); %23 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.s8.u8.satfinite "  // only difference from the non-saturating struct: .satfinite
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"  // %0..%15  = d00..d15
+      "{%16, %17, %18, %19},"  // %16..%19 = a00..a03
+      " %20,"  // %20 = desc_b
+      " %21, %22,"  // %21 = e, %22 = spsel
+      " p;\n"  // predicate computed from scale_D above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": accumulators are both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),  // "l": 64-bit register operand
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted, the call is reported as invalid
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x64 TN S32+=S8*U8 -- A (a00..a03) and e come from registers, B is a 64-bit descriptor, d00..d31 are updated in place
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the instruction as a compile-time immediate ("n" constraint)
+>
+struct GMMA_64x64x64_S32S8U8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];   // same count as a00..a03 in fma()
+  using ERegisters = uint32_t[1];   // same count as e in fma()
+  using BRegisters = uint64_t[1];   // same count as desc_b in fma()
+  using CRegisters = uint32_t[32];  // same count as d00..d31 in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %39, 0;\n"  // p = (scale_D != 0); %39 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.s8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"  // %0..%31  = d00..d31
+      "{%32, %33, %34, %35},"  // %32..%35 = a00..a03
+      " %36,"  // %36 = desc_b
+      " %37, %38,"  // %37 = e, %38 = spsel
+      " p;\n"  // predicate computed from scale_D above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": accumulators are both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),  // "l": 64-bit register operand
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted, the call is reported as invalid
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x64 TN S32+=S8*U8, saturating (.satfinite) variant -- A (a00..a03) and e from registers, B is a 64-bit descriptor, d00..d31 updated in place
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the instruction as a compile-time immediate ("n" constraint)
+>
+struct GMMA_64x64x64_S32S8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];   // same count as a00..a03 in fma()
+  using ERegisters = uint32_t[1];   // same count as e in fma()
+  using BRegisters = uint64_t[1];   // same count as desc_b in fma()
+  using CRegisters = uint32_t[32];  // same count as d00..d31 in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %39, 0;\n"  // p = (scale_D != 0); %39 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.s8.u8.satfinite "  // only difference from the non-saturating struct: .satfinite
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"  // %0..%31  = d00..d31
+      "{%32, %33, %34, %35},"  // %32..%35 = a00..a03
+      " %36,"  // %36 = desc_b
+      " %37, %38,"  // %37 = e, %38 = spsel
+      " p;\n"  // predicate computed from scale_D above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": accumulators are both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),  // "l": 64-bit register operand
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted, the call is reported as invalid
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x64 TN S32+=S8*U8 -- A (a00..a03) and e come from registers, B is a 64-bit descriptor, d00..d47 are updated in place
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the instruction as a compile-time immediate ("n" constraint)
+>
+struct GMMA_64x96x64_S32S8U8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];   // same count as a00..a03 in fma()
+  using ERegisters = uint32_t[1];   // same count as e in fma()
+  using BRegisters = uint64_t[1];   // same count as desc_b in fma()
+  using CRegisters = uint32_t[48];  // same count as d00..d47 in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %55, 0;\n"  // p = (scale_D != 0); %55 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"  // %0..%47  = d00..d47
+      "{%48,  %49,  %50,  %51},"  // %48..%51 = a00..a03
+      " %52,"  // %52 = desc_b
+      " %53, %54,"  // %53 = e, %54 = spsel
+      " p;\n"  // predicate computed from scale_D above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": accumulators are both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),  // "l": 64-bit register operand
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted, the call is reported as invalid
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x64 TN S32+=S8*U8, saturating (.satfinite) variant -- A (a00..a03) and e from registers, B is a 64-bit descriptor, d00..d47 updated in place
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the instruction as a compile-time immediate ("n" constraint)
+>
+struct GMMA_64x96x64_S32S8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];   // same count as a00..a03 in fma()
+  using ERegisters = uint32_t[1];   // same count as e in fma()
+  using BRegisters = uint64_t[1];   // same count as desc_b in fma()
+  using CRegisters = uint32_t[48];  // same count as d00..d47 in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %55, 0;\n"  // p = (scale_D != 0); %55 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.s8.u8.satfinite "  // only difference from the non-saturating struct: .satfinite
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"  // %0..%47  = d00..d47
+      "{%48,  %49,  %50,  %51},"  // %48..%51 = a00..a03
+      " %52,"  // %52 = desc_b
+      " %53, %54,"  // %53 = e, %54 = spsel
+      " p;\n"  // predicate computed from scale_D above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": accumulators are both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),  // "l": 64-bit register operand
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted, the call is reported as invalid
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x64 TN S32+=S8*U8 -- A (a00..a03) and e come from registers, B is a 64-bit descriptor, d00..d63 are updated in place
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the instruction as a compile-time immediate ("n" constraint)
+>
+struct GMMA_64x128x64_S32S8U8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];   // same count as a00..a03 in fma()
+  using ERegisters = uint32_t[1];   // same count as e in fma()
+  using BRegisters = uint64_t[1];   // same count as desc_b in fma()
+  using CRegisters = uint32_t[64];  // same count as d00..d63 in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %71, 0;\n"  // p = (scale_D != 0); %71 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"  // %0..%63  = d00..d63
+      "{%64,  %65,  %66,  %67},"  // %64..%67 = a00..a03
+      " %68,"  // %68 = desc_b
+      " %69, %70,"  // %69 = e, %70 = spsel
+      " p;\n"  // predicate computed from scale_D above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": accumulators are both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),  // "l": 64-bit register operand
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted, the call is reported as invalid
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x64 TN S32+=S8*U8, saturating (.satfinite) variant -- A (a00..a03) and e from registers, B is a 64-bit descriptor, d00..d63 updated in place
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the instruction as a compile-time immediate ("n" constraint)
+>
+struct GMMA_64x128x64_S32S8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];   // same count as a00..a03 in fma()
+  using ERegisters = uint32_t[1];   // same count as e in fma()
+  using BRegisters = uint64_t[1];   // same count as desc_b in fma()
+  using CRegisters = uint32_t[64];  // same count as d00..d63 in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %71, 0;\n"  // p = (scale_D != 0); %71 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.s8.u8.satfinite "  // only difference from the non-saturating struct: .satfinite
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"  // %0..%63  = d00..d63
+      "{%64,  %65,  %66,  %67},"  // %64..%67 = a00..a03
+      " %68,"  // %68 = desc_b
+      " %69, %70,"  // %69 = e, %70 = spsel
+      " p;\n"  // predicate computed from scale_D above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": accumulators are both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),  // "l": 64-bit register operand
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted, the call is reported as invalid
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x64 TN S32+=S8*U8 -- A (a00..a03) and e come from registers, B is a 64-bit descriptor, d00..d95 are updated in place
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the instruction as a compile-time immediate ("n" constraint)
+>
+struct GMMA_64x192x64_S32S8U8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];   // same count as a00..a03 in fma()
+  using ERegisters = uint32_t[1];   // same count as e in fma()
+  using BRegisters = uint64_t[1];   // same count as desc_b in fma()
+  using CRegisters = uint32_t[96];  // same count as d00..d95 in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
+      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %103, 0;\n"  // p = (scale_D != 0); %103 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"  // %0..%95  = d00..d95
+      "{%96,  %97,  %98,  %99},"  // %96..%99 = a00..a03
+      " %100,"  // %100 = desc_b
+      " %101, %102,"  // %101 = e, %102 = spsel
+      " p;\n"  // predicate computed from scale_D above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": accumulators are both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
+        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
+        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),  // "l": 64-bit register operand
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no PTX is emitted, the call is reported as invalid
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x64 TN S32+=S8*U8
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x192x64_S32S8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[96];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
+      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %103, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.s8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      "{%96,  %97,  %98,  %99},"
+      " %100,"
+      " %101, %102,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
+        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
+        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x64 TN S32+=S8*U8 (A in four 32-bit registers, B via 64-bit desc_b; no .satfinite qualifier)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand
+>
+struct GMMA_64x256x64_S32S8U8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];    // same count as the a000..a003 arguments of fma()
+  using ERegisters = uint32_t[1];    // same count as the single `e` argument of fma()
+  using BRegisters = uint64_t[1];    // same count as the single desc_b argument of fma()
+  using CRegisters = uint32_t[128];  // same count as the d000..d127 accumulators of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
+      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata of the .sp instruction -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %135, 0;\n"  // %135 is scale_D: p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      "{%128, %129, %130, %131},"  // a000..a003
+      " %132,"  // desc_b
+      " %133, %134,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // "+r": each accumulator is both read and written
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
+        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
+        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x64 TN S32+=S8*U8, SATURATE (A in four 32-bit registers, B via 64-bit desc_b; instruction carries .satfinite)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand
+>
+struct GMMA_64x256x64_S32S8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];    // same count as the a000..a003 arguments of fma()
+  using ERegisters = uint32_t[1];    // same count as the single `e` argument of fma()
+  using BRegisters = uint64_t[1];    // same count as the single desc_b argument of fma()
+  using CRegisters = uint32_t[128];  // same count as the d000..d127 accumulators of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
+      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata of the .sp instruction -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %135, 0;\n"  // %135 is scale_D: p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.s8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      "{%128, %129, %130, %131},"  // a000..a003
+      " %132,"  // desc_b
+      " %133, %134,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // "+r": each accumulator is both read and written
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
+        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
+        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x64 TN S32+=U8*S8 (A and B both given as 64-bit operands desc_a/desc_b; no .satfinite qualifier)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand
+>
+struct GMMA_64x8x64_S32U8S8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // same count as the single desc_a argument of fma()
+  using ERegisters = uint32_t[1];  // same count as the single `e` argument of fma()
+  using BRegisters = uint64_t[1];  // same count as the single desc_b argument of fma()
+  using CRegisters = uint32_t[4];  // same count as the d0..d3 accumulators of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata of the .sp instruction -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %8, 0;\n"  // %8 is scale_D: p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.u8.s8 "
+      "{%0,  %1,  %2,  %3},"  // d0..d3
+      " %4,"  // desc_a
+      " %5,"  // desc_b
+      " %6, %7,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)  // "+r": each accumulator is both read and written
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x64 TN S32+=U8*S8, SATURATE (A and B both given as 64-bit operands desc_a/desc_b; instruction carries .satfinite)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand
+>
+struct GMMA_64x8x64_S32U8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // same count as the single desc_a argument of fma()
+  using ERegisters = uint32_t[1];  // same count as the single `e` argument of fma()
+  using BRegisters = uint64_t[1];  // same count as the single desc_b argument of fma()
+  using CRegisters = uint32_t[4];  // same count as the d0..d3 accumulators of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata of the .sp instruction -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %8, 0;\n"  // %8 is scale_D: p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.u8.s8.satfinite "
+      "{%0,  %1,  %2,  %3},"  // d0..d3
+      " %4,"  // desc_a
+      " %5,"  // desc_b
+      " %6, %7,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)  // "+r": each accumulator is both read and written
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x64 TN S32+=U8*S8 (A and B both given as 64-bit operands desc_a/desc_b; no .satfinite qualifier)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand
+>
+struct GMMA_64x16x64_S32U8S8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // same count as the single desc_a argument of fma()
+  using ERegisters = uint32_t[1];  // same count as the single `e` argument of fma()
+  using BRegisters = uint64_t[1];  // same count as the single desc_b argument of fma()
+  using CRegisters = uint32_t[8];  // same count as the d0..d7 accumulators of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata of the .sp instruction -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %12, 0;\n"  // %12 is scale_D: p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.u8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // d0..d7
+      " %8,"  // desc_a
+      " %9,"  // desc_b
+      " %10, %11,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),  // "+r": each accumulator is both read and written
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x64 TN S32+=U8*S8, SATURATE (A and B both given as 64-bit operands desc_a/desc_b; instruction carries .satfinite)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand
+>
+struct GMMA_64x16x64_S32U8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // same count as the single desc_a argument of fma()
+  using ERegisters = uint32_t[1];  // same count as the single `e` argument of fma()
+  using BRegisters = uint64_t[1];  // same count as the single desc_b argument of fma()
+  using CRegisters = uint32_t[8];  // same count as the d0..d7 accumulators of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata of the .sp instruction -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %12, 0;\n"  // %12 is scale_D: p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.u8.s8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // d0..d7
+      " %8,"  // desc_a
+      " %9,"  // desc_b
+      " %10, %11,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),  // "+r": each accumulator is both read and written
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x64 TN S32+=U8*S8 (A and B both given as 64-bit operands desc_a/desc_b; no .satfinite qualifier)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand
+>
+struct GMMA_64x32x64_S32U8S8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];   // same count as the single desc_a argument of fma()
+  using ERegisters = uint32_t[1];   // same count as the single `e` argument of fma()
+  using BRegisters = uint64_t[1];   // same count as the single desc_b argument of fma()
+  using CRegisters = uint32_t[16];  // same count as the d00..d15 accumulators of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata of the .sp instruction -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %20, 0;\n"  // %20 is scale_D: p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.u8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      " %16,"  // desc_a
+      " %17,"  // desc_b
+      " %18, %19,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x64 TN S32+=U8*S8, SATURATE (A and B both given as 64-bit operands desc_a/desc_b; instruction carries .satfinite)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand
+>
+struct GMMA_64x32x64_S32U8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];   // same count as the single desc_a argument of fma()
+  using ERegisters = uint32_t[1];   // same count as the single `e` argument of fma()
+  using BRegisters = uint64_t[1];   // same count as the single desc_b argument of fma()
+  using CRegisters = uint32_t[16];  // same count as the d00..d15 accumulators of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata of the .sp instruction -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %20, 0;\n"  // %20 is scale_D: p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.u8.s8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      " %16,"  // desc_a
+      " %17,"  // desc_b
+      " %18, %19,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x64 TN S32+=U8*S8 (A and B both given as 64-bit operands desc_a/desc_b; no .satfinite qualifier)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand
+>
+struct GMMA_64x64x64_S32U8S8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];   // same count as the single desc_a argument of fma()
+  using ERegisters = uint32_t[1];   // same count as the single `e` argument of fma()
+  using BRegisters = uint64_t[1];   // same count as the single desc_b argument of fma()
+  using CRegisters = uint32_t[32];  // same count as the d00..d31 accumulators of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata of the .sp instruction -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %36, 0;\n"  // %36 is scale_D: p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.u8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      " %32,"  // desc_a
+      " %33,"  // desc_b
+      " %34, %35,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x64 TN S32+=U8*S8, SATURATE (A and B both given as 64-bit operands desc_a/desc_b; instruction carries .satfinite)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand
+>
+struct GMMA_64x64x64_S32U8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];   // same count as the single desc_a argument of fma()
+  using ERegisters = uint32_t[1];   // same count as the single `e` argument of fma()
+  using BRegisters = uint64_t[1];   // same count as the single desc_b argument of fma()
+  using CRegisters = uint32_t[32];  // same count as the d00..d31 accumulators of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata of the .sp instruction -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %36, 0;\n"  // %36 is scale_D: p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.u8.s8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      " %32,"  // desc_a
+      " %33,"  // desc_b
+      " %34, %35,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x64 TN S32+=U8*S8 -- wraps one sparse wgmma.mma_async (m64n96k64, s32 += u8 * s8); A and B are both passed as 64-bit descriptors.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted into the instruction as an immediate (n constraint)
+>
+struct GMMA_64x96x64_S32U8S8_SS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably means D is accumulated in place in the C registers -- confirm against CuTe MMA traits
+  using ARegisters = uint64_t[1];   // matches the single desc_a argument of fma()
+  using ERegisters = uint32_t[1];   // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];   // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[48];  // matches accumulator arguments d00..d47 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // bound to a 64-bit (l) asm input operand
+      uint64_t const& desc_b,  // bound to a 64-bit (l) asm input operand
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word (sits between desc_b and spsel) -- confirm against PTX docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // nonzero sets predicate p; NOTE(review): p presumably selects D = A*B + D versus D = A*B -- confirm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %52, 0;\n"  // p = (scale_D != 0); %52 is the last input operand below
+      "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.u8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"
+      " %48,"  // desc_a
+      " %49,"  // desc_b
+      " %50, %51,"  // e, spsel
+      " p;\n"  // predicate computed from scale_D above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%47: accumulators, read and written (+r)
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "l"(desc_a),  // %48
+         "l"(desc_b),  // %49
+         "r"(e), "n"(int32_t(spsel)),  // %50, %51
+         "r"(int32_t(scale_D)));  // %52
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x64 TN S32+=U8*S8, saturating variant -- one sparse wgmma.mma_async (m64n96k64, s32 += u8 * s8) issued with the .satfinite qualifier; A and B are both passed as 64-bit descriptors.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted into the instruction as an immediate (n constraint)
+>
+struct GMMA_64x96x64_S32U8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;          // NOTE(review): presumably means D is accumulated in place in the C registers -- confirm against CuTe MMA traits
+  using ARegisters = uint64_t[1];   // matches the single desc_a argument of fma()
+  using ERegisters = uint32_t[1];   // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];   // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[48];  // matches accumulator arguments d00..d47 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // bound to a 64-bit (l) asm input operand
+      uint64_t const& desc_b,  // bound to a 64-bit (l) asm input operand
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word (sits between desc_b and spsel) -- confirm against PTX docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // nonzero sets predicate p; NOTE(review): p presumably selects D = A*B + D versus D = A*B -- confirm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %52, 0;\n"  // p = (scale_D != 0); %52 is the last input operand below
+      "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.u8.s8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"
+      " %48,"  // desc_a
+      " %49,"  // desc_b
+      " %50, %51,"  // e, spsel
+      " p;\n"  // predicate computed from scale_D above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%47: accumulators, read and written (+r)
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "l"(desc_a),  // %48
+         "l"(desc_b),  // %49
+         "r"(e), "n"(int32_t(spsel)),  // %50, %51
+         "r"(int32_t(scale_D)));  // %52
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x64 TN S32+=U8*S8 -- wraps one sparse wgmma.mma_async (m64n128k64, s32 += u8 * s8); A and B are both passed as 64-bit descriptors.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted into the instruction as an immediate (n constraint)
+>
+struct GMMA_64x128x64_S32U8S8_SS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably means D is accumulated in place in the C registers -- confirm against CuTe MMA traits
+  using ARegisters = uint64_t[1];   // matches the single desc_a argument of fma()
+  using ERegisters = uint32_t[1];   // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];   // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[64];  // matches accumulator arguments d00..d63 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // bound to a 64-bit (l) asm input operand
+      uint64_t const& desc_b,  // bound to a 64-bit (l) asm input operand
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word (sits between desc_b and spsel) -- confirm against PTX docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // nonzero sets predicate p; NOTE(review): p presumably selects D = A*B + D versus D = A*B -- confirm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %68, 0;\n"  // p = (scale_D != 0); %68 is the last input operand below
+      "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      " %64,"  // desc_a
+      " %65,"  // desc_b
+      " %66, %67,"  // e, spsel
+      " p;\n"  // predicate computed from scale_D above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%63: accumulators, read and written (+r)
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "l"(desc_a),  // %64
+         "l"(desc_b),  // %65
+         "r"(e), "n"(int32_t(spsel)),  // %66, %67
+         "r"(int32_t(scale_D)));  // %68
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x64 TN S32+=U8*S8, saturating variant -- one sparse wgmma.mma_async (m64n128k64, s32 += u8 * s8) issued with the .satfinite qualifier; A and B are both passed as 64-bit descriptors.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted into the instruction as an immediate (n constraint)
+>
+struct GMMA_64x128x64_S32U8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;          // NOTE(review): presumably means D is accumulated in place in the C registers -- confirm against CuTe MMA traits
+  using ARegisters = uint64_t[1];   // matches the single desc_a argument of fma()
+  using ERegisters = uint32_t[1];   // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];   // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[64];  // matches accumulator arguments d00..d63 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // bound to a 64-bit (l) asm input operand
+      uint64_t const& desc_b,  // bound to a 64-bit (l) asm input operand
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word (sits between desc_b and spsel) -- confirm against PTX docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // nonzero sets predicate p; NOTE(review): p presumably selects D = A*B + D versus D = A*B -- confirm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %68, 0;\n"  // p = (scale_D != 0); %68 is the last input operand below
+      "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.u8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      " %64,"  // desc_a
+      " %65,"  // desc_b
+      " %66, %67,"  // e, spsel
+      " p;\n"  // predicate computed from scale_D above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%63: accumulators, read and written (+r)
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "l"(desc_a),  // %64
+         "l"(desc_b),  // %65
+         "r"(e), "n"(int32_t(spsel)),  // %66, %67
+         "r"(int32_t(scale_D)));  // %68
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x64 TN S32+=U8*S8 -- wraps one sparse wgmma.mma_async (m64n192k64, s32 += u8 * s8); A and B are both passed as 64-bit descriptors.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted into the instruction as an immediate (n constraint)
+>
+struct GMMA_64x192x64_S32U8S8_SS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably means D is accumulated in place in the C registers -- confirm against CuTe MMA traits
+  using ARegisters = uint64_t[1];   // matches the single desc_a argument of fma()
+  using ERegisters = uint32_t[1];   // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];   // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[96];  // matches accumulator arguments d00..d95 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // bound to a 64-bit (l) asm input operand
+      uint64_t const& desc_b,  // bound to a 64-bit (l) asm input operand
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
+      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word (sits between desc_b and spsel) -- confirm against PTX docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // nonzero sets predicate p; NOTE(review): p presumably selects D = A*B + D versus D = A*B -- confirm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %100, 0;\n"  // p = (scale_D != 0); %100 is the last input operand below
+      "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      " %96,"  // desc_a
+      " %97,"  // desc_b
+      " %98, %99,"  // e, spsel
+      " p;\n"  // predicate computed from scale_D above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%95: accumulators, read and written (+r)
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
+        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
+        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
+      :  "l"(desc_a),  // %96
+         "l"(desc_b),  // %97
+         "r"(e), "n"(int32_t(spsel)),  // %98, %99
+         "r"(int32_t(scale_D)));  // %100
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x64 TN S32+=U8*S8, saturating variant -- one sparse wgmma.mma_async (m64n192k64, s32 += u8 * s8) issued with the .satfinite qualifier; A and B are both passed as 64-bit descriptors.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted into the instruction as an immediate (n constraint)
+>
+struct GMMA_64x192x64_S32U8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;          // NOTE(review): presumably means D is accumulated in place in the C registers -- confirm against CuTe MMA traits
+  using ARegisters = uint64_t[1];   // matches the single desc_a argument of fma()
+  using ERegisters = uint32_t[1];   // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];   // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[96];  // matches accumulator arguments d00..d95 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // bound to a 64-bit (l) asm input operand
+      uint64_t const& desc_b,  // bound to a 64-bit (l) asm input operand
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
+      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word (sits between desc_b and spsel) -- confirm against PTX docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // nonzero sets predicate p; NOTE(review): p presumably selects D = A*B + D versus D = A*B -- confirm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %100, 0;\n"  // p = (scale_D != 0); %100 is the last input operand below
+      "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.u8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      " %96,"  // desc_a
+      " %97,"  // desc_b
+      " %98, %99,"  // e, spsel
+      " p;\n"  // predicate computed from scale_D above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%95: accumulators, read and written (+r)
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
+        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
+        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
+      :  "l"(desc_a),  // %96
+         "l"(desc_b),  // %97
+         "r"(e), "n"(int32_t(spsel)),  // %98, %99
+         "r"(int32_t(scale_D)));  // %100
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x64 TN S32+=U8*S8 -- wraps one sparse wgmma.mma_async (m64n256k64, s32 += u8 * s8); A and B are both passed as 64-bit descriptors.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted into the instruction as an immediate (n constraint)
+>
+struct GMMA_64x256x64_S32U8S8_SS_TN
+{
+  using DRegisters = void;           // NOTE(review): presumably means D is accumulated in place in the C registers -- confirm against CuTe MMA traits
+  using ARegisters = uint64_t[1];    // matches the single desc_a argument of fma()
+  using ERegisters = uint32_t[1];    // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];    // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[128];  // matches accumulator arguments d000..d127 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // bound to a 64-bit (l) asm input operand
+      uint64_t const& desc_b,  // bound to a 64-bit (l) asm input operand
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
+      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word (sits between desc_b and spsel) -- confirm against PTX docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // nonzero sets predicate p; NOTE(review): p presumably selects D = A*B + D versus D = A*B -- confirm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %132, 0;\n"  // p = (scale_D != 0); %132 is the last input operand below
+      "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      " %128,"  // desc_a
+      " %129,"  // desc_b
+      " %130, %131,"  // e, spsel
+      " p;\n"  // predicate computed from scale_D above
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // operands %0..%127: accumulators, read and written (+r)
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
+        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
+        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
+      :  "l"(desc_a),  // %128
+         "l"(desc_b),  // %129
+         "r"(e), "n"(int32_t(spsel)),  // %130, %131
+         "r"(int32_t(scale_D)));  // %132
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x64 TN S32+=U8*S8, saturating: wraps the .satfinite form of wgmma.mma_async.sp m64n256k64.s32.u8.s8; A and B are both passed as 64-bit descriptors
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand
+>
+struct GMMA_64x256x64_S32U8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value, matching fma()'s desc_a argument
+  using ERegisters = uint32_t[1];  // one 32-bit word, matching fma()'s e argument
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma()'s desc_b argument
+  using CRegisters = uint32_t[128];  // matches the 128 accumulator references d000..d127 taken by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
+      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %132, 0;\n"  // %132 = scale_D (last input operand); p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.u8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      " %128,"
+      " %129,"
+      " %130, %131,"
+      " p;\n"
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // %0-%127: accumulators, read-write ("+r")
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
+        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
+        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
+      :  "l"(desc_a),  // %128: 64-bit descriptor for A
+         "l"(desc_b),  // %129: 64-bit descriptor for B
+         "r"(e), "n"(int32_t(spsel)),  // %130 = e (register), %131 = spsel (compile-time immediate)
+         "r"(int32_t(scale_D)));  // %132: consumed only by the setp above
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x64 TN S32+=U8*S8: wraps wgmma.mma_async.sp m64n8k64.s32.u8.s8; A is passed in four 32-bit registers, B as a 64-bit descriptor
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand
+>
+struct GMMA_64x8x64_S32U8S8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches fma()'s a0..a3 arguments
+  using ERegisters = uint32_t[1];  // one 32-bit word, matching fma()'s e argument
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma()'s desc_b argument
+  using CRegisters = uint32_t[4];  // matches the 4 accumulator references d0..d3 taken by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %11, 0;\n"  // %11 = scale_D (last input operand); p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.u8.s8 "
+      "{%0,  %1,  %2,  %3},"
+      "{%4,  %5,  %6,  %7},"
+      " %8,"
+      " %9, %10,"
+      " p;\n"
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)  // %0-%3: accumulators, read-write ("+r")
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),  // %4-%7: A fragment registers
+         "l"(desc_b),  // %8: 64-bit descriptor for B
+         "r"(e), "n"(int32_t(spsel)),  // %9 = e (register), %10 = spsel (compile-time immediate)
+         "r"(int32_t(scale_D)));  // %11: consumed only by the setp above
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x64 TN S32+=U8*S8, saturating: wraps the .satfinite form of wgmma.mma_async.sp m64n8k64.s32.u8.s8; A is passed in four 32-bit registers, B as a 64-bit descriptor
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand
+>
+struct GMMA_64x8x64_S32U8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches fma()'s a0..a3 arguments
+  using ERegisters = uint32_t[1];  // one 32-bit word, matching fma()'s e argument
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma()'s desc_b argument
+  using CRegisters = uint32_t[4];  // matches the 4 accumulator references d0..d3 taken by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %11, 0;\n"  // %11 = scale_D (last input operand); p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.u8.s8.satfinite "
+      "{%0,  %1,  %2,  %3},"
+      "{%4,  %5,  %6,  %7},"
+      " %8,"
+      " %9, %10,"
+      " p;\n"
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)  // %0-%3: accumulators, read-write ("+r")
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),  // %4-%7: A fragment registers
+         "l"(desc_b),  // %8: 64-bit descriptor for B
+         "r"(e), "n"(int32_t(spsel)),  // %9 = e (register), %10 = spsel (compile-time immediate)
+         "r"(int32_t(scale_D)));  // %11: consumed only by the setp above
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x64 TN S32+=U8*S8: wraps wgmma.mma_async.sp m64n16k64.s32.u8.s8; A is passed in four 32-bit registers, B as a 64-bit descriptor
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand
+>
+struct GMMA_64x16x64_S32U8S8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches fma()'s a0..a3 arguments
+  using ERegisters = uint32_t[1];  // one 32-bit word, matching fma()'s e argument
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma()'s desc_b argument
+  using CRegisters = uint32_t[8];  // matches the 8 accumulator references d0..d7 taken by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %15, 0;\n"  // %15 = scale_D (last input operand); p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.u8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
+      "{%8,  %9,  %10, %11},"
+      " %12,"
+      " %13, %14,"
+      " p;\n"
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),  // %0-%7: accumulators, read-write ("+r")
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),  // %8-%11: A fragment registers
+         "l"(desc_b),  // %12: 64-bit descriptor for B
+         "r"(e), "n"(int32_t(spsel)),  // %13 = e (register), %14 = spsel (compile-time immediate)
+         "r"(int32_t(scale_D)));  // %15: consumed only by the setp above
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x64 TN S32+=U8*S8, saturating: wraps the .satfinite form of wgmma.mma_async.sp m64n16k64.s32.u8.s8; A is passed in four 32-bit registers, B as a 64-bit descriptor
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand
+>
+struct GMMA_64x16x64_S32U8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches fma()'s a0..a3 arguments
+  using ERegisters = uint32_t[1];  // one 32-bit word, matching fma()'s e argument
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma()'s desc_b argument
+  using CRegisters = uint32_t[8];  // matches the 8 accumulator references d0..d7 taken by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %15, 0;\n"  // %15 = scale_D (last input operand); p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.u8.s8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
+      "{%8,  %9,  %10, %11},"
+      " %12,"
+      " %13, %14,"
+      " p;\n"
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),  // %0-%7: accumulators, read-write ("+r")
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),  // %8-%11: A fragment registers
+         "l"(desc_b),  // %12: 64-bit descriptor for B
+         "r"(e), "n"(int32_t(spsel)),  // %13 = e (register), %14 = spsel (compile-time immediate)
+         "r"(int32_t(scale_D)));  // %15: consumed only by the setp above
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x64 TN S32+=U8*S8: wraps wgmma.mma_async.sp m64n32k64.s32.u8.s8; A is passed in four 32-bit registers, B as a 64-bit descriptor
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand
+>
+struct GMMA_64x32x64_S32U8S8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches fma()'s a00..a03 arguments
+  using ERegisters = uint32_t[1];  // one 32-bit word, matching fma()'s e argument
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma()'s desc_b argument
+  using CRegisters = uint32_t[16];  // matches the 16 accumulator references d00..d15 taken by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %23, 0;\n"  // %23 = scale_D (last input operand); p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.u8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      "{%16, %17, %18, %19},"
+      " %20,"
+      " %21, %22,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0-%15: accumulators, read-write ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %16-%19: A fragment registers
+         "l"(desc_b),  // %20: 64-bit descriptor for B
+         "r"(e), "n"(int32_t(spsel)),  // %21 = e (register), %22 = spsel (compile-time immediate)
+         "r"(int32_t(scale_D)));  // %23: consumed only by the setp above
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x64 TN S32+=U8*S8, saturating: wraps the .satfinite form of wgmma.mma_async.sp m64n32k64.s32.u8.s8; A is passed in four 32-bit registers, B as a 64-bit descriptor
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand
+>
+struct GMMA_64x32x64_S32U8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches fma()'s a00..a03 arguments
+  using ERegisters = uint32_t[1];  // one 32-bit word, matching fma()'s e argument
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma()'s desc_b argument
+  using CRegisters = uint32_t[16];  // matches the 16 accumulator references d00..d15 taken by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %23, 0;\n"  // %23 = scale_D (last input operand); p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.u8.s8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      "{%16, %17, %18, %19},"
+      " %20,"
+      " %21, %22,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0-%15: accumulators, read-write ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %16-%19: A fragment registers
+         "l"(desc_b),  // %20: 64-bit descriptor for B
+         "r"(e), "n"(int32_t(spsel)),  // %21 = e (register), %22 = spsel (compile-time immediate)
+         "r"(int32_t(scale_D)));  // %23: consumed only by the setp above
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x64 TN S32+=U8*S8: wraps wgmma.mma_async.sp m64n64k64.s32.u8.s8; A is passed in four 32-bit registers, B as a 64-bit descriptor
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand
+>
+struct GMMA_64x64x64_S32U8S8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches fma()'s a00..a03 arguments
+  using ERegisters = uint32_t[1];  // one 32-bit word, matching fma()'s e argument
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma()'s desc_b argument
+  using CRegisters = uint32_t[32];  // matches the 32 accumulator references d00..d31 taken by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %39, 0;\n"  // %39 = scale_D (last input operand); p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.u8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      "{%32, %33, %34, %35},"
+      " %36,"
+      " %37, %38,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0-%31: accumulators, read-write ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %32-%35: A fragment registers
+         "l"(desc_b),  // %36: 64-bit descriptor for B
+         "r"(e), "n"(int32_t(spsel)),  // %37 = e (register), %38 = spsel (compile-time immediate)
+         "r"(int32_t(scale_D)));  // %39: consumed only by the setp above
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x64 TN S32+=U8*S8, saturating: wraps the .satfinite form of wgmma.mma_async.sp m64n64k64.s32.u8.s8; A is passed in four 32-bit registers, B as a 64-bit descriptor
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand
+>
+struct GMMA_64x64x64_S32U8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches fma()'s a00..a03 arguments
+  using ERegisters = uint32_t[1];  // one 32-bit word, matching fma()'s e argument
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma()'s desc_b argument
+  using CRegisters = uint32_t[32];  // matches the 32 accumulator references d00..d31 taken by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %39, 0;\n"  // %39 = scale_D (last input operand); p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.u8.s8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      "{%32, %33, %34, %35},"
+      " %36,"
+      " %37, %38,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0-%31: accumulators, read-write ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %32-%35: A fragment registers
+         "l"(desc_b),  // %36: 64-bit descriptor for B
+         "r"(e), "n"(int32_t(spsel)),  // %37 = e (register), %38 = spsel (compile-time immediate)
+         "r"(int32_t(scale_D)));  // %39: consumed only by the setp above
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x64 TN S32+=U8*S8: wraps wgmma.mma_async.sp m64n96k64.s32.u8.s8; A is passed in four 32-bit registers, B as a 64-bit descriptor
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand
+>
+struct GMMA_64x96x64_S32U8S8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches fma()'s a00..a03 arguments
+  using ERegisters = uint32_t[1];  // one 32-bit word, matching fma()'s e argument
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma()'s desc_b argument
+  using CRegisters = uint32_t[48];  // matches the 48 accumulator references d00..d47 taken by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %55, 0;\n"  // %55 = scale_D (last input operand); p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
+      "{%48,  %49,  %50,  %51},"
+      " %52,"
+      " %53, %54,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0-%47: accumulators, read-write ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %48-%51: A fragment registers
+         "l"(desc_b),  // %52: 64-bit descriptor for B
+         "r"(e), "n"(int32_t(spsel)),  // %53 = e (register), %54 = spsel (compile-time immediate)
+         "r"(int32_t(scale_D)));  // %55: consumed only by the setp above
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x64 TN S32+=U8*S8, saturating: wraps the .satfinite form of wgmma.mma_async.sp m64n96k64.s32.u8.s8; A is passed in four 32-bit registers, B as a 64-bit descriptor
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand
+>
+struct GMMA_64x96x64_S32U8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches fma()'s a00..a03 arguments
+  using ERegisters = uint32_t[1];  // one 32-bit word, matching fma()'s e argument
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma()'s desc_b argument
+  using CRegisters = uint32_t[48];  // matches the 48 accumulator references d00..d47 taken by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %55, 0;\n"  // %55 = scale_D (last input operand); p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.u8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
+      "{%48,  %49,  %50,  %51},"
+      " %52,"
+      " %53, %54,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0-%47: accumulators, read-write ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %48-%51: A fragment registers
+         "l"(desc_b),  // %52: 64-bit descriptor for B
+         "r"(e), "n"(int32_t(spsel)),  // %53 = e (register), %54 = spsel (compile-time immediate)
+         "r"(int32_t(scale_D)));  // %55: consumed only by the setp above
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x64 TN S32+=U8*S8: wraps wgmma.mma_async.sp m64n128k64.s32.u8.s8; A is passed in four 32-bit registers, B as a 64-bit descriptor
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand
+>
+struct GMMA_64x128x64_S32U8S8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches fma()'s a00..a03 arguments
+  using ERegisters = uint32_t[1];  // one 32-bit word, matching fma()'s e argument
+  using BRegisters = uint64_t[1];  // one 64-bit value, matching fma()'s desc_b argument
+  using CRegisters = uint32_t[64];  // matches the 64 accumulator references d00..d63 taken by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %71, 0;\n"  // %71 = scale_D (last input operand); p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      "{%64,  %65,  %66,  %67},"
+      " %68,"
+      " %69, %70,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0-%63: accumulators, read-write ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %64-%67: A fragment registers
+         "l"(desc_b),  // %68: 64-bit descriptor for B
+         "r"(e), "n"(int32_t(spsel)),  // %69 = e (register), %70 = spsel (compile-time immediate)
+         "r"(int32_t(scale_D)));  // %71: consumed only by the setp above
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x64 TN S32+=U8*S8, SATURATE variant (.satfinite); A in registers (a00..a03), B via 64-bit descriptor (desc_b)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as an immediate ("n") operand
+>
+struct GMMA_64x128x64_S32U8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;          // no separate D set: d00..d63 are read-write ("+r") operands
+  using ARegisters = uint32_t[4];   // a00..a03
+  using ERegisters = uint32_t[1];   // e -- NOTE(review): presumably the sparsity metadata word; confirm against the PTX ISA
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = uint32_t[64];  // d00..d63
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %71, 0;\n"  // %71 = int32_t(scale_D), the last input operand; p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.u8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%63 = d00..d63 (read-write)
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      "{%64,  %65,  %66,  %67},"  // a00..a03
+      " %68,"  // desc_b
+      " %69, %70,"  // e, spsel
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x64 TN S32+=U8*S8; A in registers (a00..a03), B via 64-bit descriptor (desc_b)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as an immediate ("n") operand
+>
+struct GMMA_64x192x64_S32U8S8_RS_TN
+{
+  using DRegisters = void;          // no separate D set: d00..d95 are read-write ("+r") operands
+  using ARegisters = uint32_t[4];   // a00..a03
+  using ERegisters = uint32_t[1];   // e -- NOTE(review): presumably the sparsity metadata word; confirm against the PTX ISA
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = uint32_t[96];  // d00..d95
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
+      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %103, 0;\n"  // %103 = int32_t(scale_D), the last input operand; p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%95 = d00..d95 (read-write)
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      "{%96,  %97,  %98,  %99},"  // a00..a03
+      " %100,"  // desc_b
+      " %101, %102,"  // e, spsel
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
+        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
+        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x64 TN S32+=U8*S8, SATURATE variant (.satfinite); A in registers (a00..a03), B via 64-bit descriptor (desc_b)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as an immediate ("n") operand
+>
+struct GMMA_64x192x64_S32U8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;          // no separate D set: d00..d95 are read-write ("+r") operands
+  using ARegisters = uint32_t[4];   // a00..a03
+  using ERegisters = uint32_t[1];   // e -- NOTE(review): presumably the sparsity metadata word; confirm against the PTX ISA
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = uint32_t[96];  // d00..d95
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
+      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %103, 0;\n"  // %103 = int32_t(scale_D), the last input operand; p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.u8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%95 = d00..d95 (read-write)
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      "{%96,  %97,  %98,  %99},"  // a00..a03
+      " %100,"  // desc_b
+      " %101, %102,"  // e, spsel
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
+        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
+        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x64 TN S32+=U8*S8; A in registers (a000..a003), B via 64-bit descriptor (desc_b)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as an immediate ("n") operand
+>
+struct GMMA_64x256x64_S32U8S8_RS_TN
+{
+  using DRegisters = void;           // no separate D set: d000..d127 are read-write ("+r") operands
+  using ARegisters = uint32_t[4];    // a000..a003
+  using ERegisters = uint32_t[1];    // e -- NOTE(review): presumably the sparsity metadata word; confirm against the PTX ISA
+  using BRegisters = uint64_t[1];    // desc_b
+  using CRegisters = uint32_t[128];  // d000..d127
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
+      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %135, 0;\n"  // %135 = int32_t(scale_D), the last input operand; p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%127 = d000..d127 (read-write)
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      "{%128, %129, %130, %131},"  // a000..a003
+      " %132,"  // desc_b
+      " %133, %134,"  // e, spsel
+      " p;\n"
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
+        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
+        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x64 TN S32+=U8*S8, SATURATE variant (.satfinite); A in registers (a000..a003), B via 64-bit descriptor (desc_b)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as an immediate ("n") operand
+>
+struct GMMA_64x256x64_S32U8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;           // no separate D set: d000..d127 are read-write ("+r") operands
+  using ARegisters = uint32_t[4];    // a000..a003
+  using ERegisters = uint32_t[1];    // e -- NOTE(review): presumably the sparsity metadata word; confirm against the PTX ISA
+  using BRegisters = uint64_t[1];    // desc_b
+  using CRegisters = uint32_t[128];  // d000..d127
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
+      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %135, 0;\n"  // %135 = int32_t(scale_D), the last input operand; p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.u8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%127 = d000..d127 (read-write)
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      "{%128, %129, %130, %131},"  // a000..a003
+      " %132,"  // desc_b
+      " %133, %134,"  // e, spsel
+      " p;\n"
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
+        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
+        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x64 TN S32+=U8*U8; A and B both via 64-bit descriptors (desc_a, desc_b)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as an immediate ("n") operand
+>
+struct GMMA_64x8x64_S32U8U8_SS_TN
+{
+  using DRegisters = void;         // no separate D set: d0..d3 are read-write ("+r") operands
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e -- NOTE(review): presumably the sparsity metadata word; confirm against the PTX ISA
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[4];  // d0..d3
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %8, 0;\n"  // %8 = int32_t(scale_D), the last input operand; p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.u8.u8 "
+      "{%0,  %1,  %2,  %3},"  // d0..d3 (read-write)
+      " %4,"  // desc_a
+      " %5,"  // desc_b
+      " %6, %7,"  // e, spsel
+      " p;\n"
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x64 TN S32+=U8*U8, .satfinite variant (SS: A and B passed as 64-bit descriptors; 4 accumulators read-modify-written; asm predicate p = (scale_D != 0))
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x8x64_S32U8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[4];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %8, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.u8.u8.satfinite "
+      "{%0,  %1,  %2,  %3},"
+      " %4,"
+      " %5,"
+      " %6, %7,"
+      " p;\n"
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x64 TN S32+=U8*U8 (SS: A and B passed as 64-bit descriptors; 8 accumulators read-modify-written; asm predicate p = (scale_D != 0))
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x16x64_S32U8U8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[8];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %12, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.u8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
+      " %8,"
+      " %9,"
+      " %10, %11,"
+      " p;\n"
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x64 TN S32+=U8*U8, .satfinite variant (SS: A and B passed as 64-bit descriptors; 8 accumulators read-modify-written; asm predicate p = (scale_D != 0))
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x16x64_S32U8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[8];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %12, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.u8.u8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
+      " %8,"
+      " %9,"
+      " %10, %11,"
+      " p;\n"
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x64 TN S32+=U8*U8 (SS: A and B passed as 64-bit descriptors; 16 accumulators read-modify-written; asm predicate p = (scale_D != 0))
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x32x64_S32U8U8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[16];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %20, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.u8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      " %16,"
+      " %17,"
+      " %18, %19,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x64 TN S32+=U8*U8, .satfinite variant (SS: A and B passed as 64-bit descriptors; 16 accumulators read-modify-written; asm predicate p = (scale_D != 0))
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x32x64_S32U8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[16];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %20, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.u8.u8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      " %16,"
+      " %17,"
+      " %18, %19,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x64 TN S32+=U8*U8 (SS: A and B passed as 64-bit descriptors; 32 accumulators read-modify-written; asm predicate p = (scale_D != 0))
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x64x64_S32U8U8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[32];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %36, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.u8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      " %32,"
+      " %33,"
+      " %34, %35,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x64 TN S32+=U8*U8, .satfinite variant (SS: A and B passed as 64-bit descriptors; 32 accumulators read-modify-written; asm predicate p = (scale_D != 0))
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x64x64_S32U8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[32];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %36, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.u8.u8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      " %32,"
+      " %33,"
+      " %34, %35,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x64 TN S32+=U8*U8 (SS: A and B passed as 64-bit descriptors; 48 accumulators read-modify-written; asm predicate p = (scale_D != 0))
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x96x64_S32U8U8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[48];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %52, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.u8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"
+      " %48,"
+      " %49,"
+      " %50, %51,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x64 TN S32+=U8*U8, .satfinite variant (SS: A and B passed as 64-bit descriptors; 48 accumulators read-modify-written; asm predicate p = (scale_D != 0))
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x96x64_S32U8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[48];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %52, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.u8.u8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"
+      " %48,"
+      " %49,"
+      " %50, %51,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x64 TN S32+=U8*U8 (SS: A and B passed as 64-bit descriptors; 64 accumulators read-modify-written; asm predicate p = (scale_D != 0))
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x128x64_S32U8U8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[64];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %68, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.u8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      " %64,"
+      " %65,"
+      " %66, %67,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x64 TN S32+=U8*U8, .satfinite variant (SS: A and B passed as 64-bit descriptors; 64 accumulators read-modify-written; asm predicate p = (scale_D != 0))
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x128x64_S32U8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[64];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %68, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.u8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      " %64,"
+      " %65,"
+      " %66, %67,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x64 TN S32+=U8*U8
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x192x64_S32U8U8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[96];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
+      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %100, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.u8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      " %96,"
+      " %97,"
+      " %98, %99,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
+        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
+        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x64 TN S32+=U8*U8, saturating (.satfinite); SS form: A and B are uint64_t descriptors, 96 in/out accumulators, then e (register) and spsel (immediate); p = (scale_D != 0).
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x192x64_S32U8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[96];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
+      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %100, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.u8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      " %96,"
+      " %97,"
+      " %98, %99,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
+        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
+        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x64 TN S32+=U8*U8 (no .satfinite); SS form: A and B are uint64_t descriptors, 128 in/out accumulators, then e (register) and spsel (immediate); p = (scale_D != 0).
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x256x64_S32U8U8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[128];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
+      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %132, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.u8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      " %128,"
+      " %129,"
+      " %130, %131,"
+      " p;\n"
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
+        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
+        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x64 TN S32+=U8*U8, saturating (.satfinite); SS form: A and B are uint64_t descriptors, 128 in/out accumulators, then e (register) and spsel (immediate); p = (scale_D != 0).
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x256x64_S32U8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[128];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
+      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %132, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.u8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      " %128,"
+      " %129,"
+      " %130, %131,"
+      " p;\n"
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
+        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
+        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x64 TN S32+=U8*U8 (no .satfinite); RS form: A is 4 uint32_t registers, B is a uint64_t descriptor, 4 in/out accumulators, then e (register) and spsel (immediate); p = (scale_D != 0).
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x8x64_S32U8U8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[4];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %11, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.u8.u8 "
+      "{%0,  %1,  %2,  %3},"
+      "{%4,  %5,  %6,  %7},"
+      " %8,"
+      " %9, %10,"
+      " p;\n"
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x64 TN S32+=U8*U8, saturating (.satfinite); RS form: A is 4 uint32_t registers, B is a uint64_t descriptor, 4 in/out accumulators, then e (register) and spsel (immediate); p = (scale_D != 0).
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x8x64_S32U8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[4];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %11, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.u8.u8.satfinite "
+      "{%0,  %1,  %2,  %3},"
+      "{%4,  %5,  %6,  %7},"
+      " %8,"
+      " %9, %10,"
+      " p;\n"
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x64 TN S32+=U8*U8 (no .satfinite); RS form: A is 4 uint32_t registers, B is a uint64_t descriptor, 8 in/out accumulators, then e (register) and spsel (immediate); p = (scale_D != 0).
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x16x64_S32U8U8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[8];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %15, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.u8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
+      "{%8,  %9,  %10, %11},"
+      " %12,"
+      " %13, %14,"
+      " p;\n"
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x64 TN S32+=U8*U8, saturating (.satfinite); RS form: A is 4 uint32_t registers, B is a uint64_t descriptor, 8 in/out accumulators, then e (register) and spsel (immediate); p = (scale_D != 0).
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x16x64_S32U8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[8];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %15, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.u8.u8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
+      "{%8,  %9,  %10, %11},"
+      " %12,"
+      " %13, %14,"
+      " p;\n"
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x64 TN S32+=U8*U8 (no .satfinite); RS form: A is 4 uint32_t registers, B is a uint64_t descriptor, 16 in/out accumulators, then e (register) and spsel (immediate); p = (scale_D != 0).
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x32x64_S32U8U8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[16];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %23, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.u8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      "{%16, %17, %18, %19},"
+      " %20,"
+      " %21, %22,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x64 TN S32+=U8*U8, saturating (.satfinite); RS form: A is 4 uint32_t registers, B is a uint64_t descriptor, 16 in/out accumulators, then e (register) and spsel (immediate); p = (scale_D != 0).
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x32x64_S32U8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[16];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %23, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.u8.u8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      "{%16, %17, %18, %19},"
+      " %20,"
+      " %21, %22,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x64 TN S32+=U8*U8 (no .satfinite): A from registers, B from a 64-bit descriptor, accumulators updated in place.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the PTX as an "n" (immediate) operand
+>
+struct GMMA_64x64x64_S32U8U8_RS_TN
+{
+  using DRegisters = void;  // no separate D registers; the CRegisters operands are read-write
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[32];  // d00..d31, bound as "+r" so they are both read and written
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the source line number and desc_b to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %39, 0;\n"  // %39 = scale_D; p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.u8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      "{%32, %33, %34, %35},"  // a00..a03
+      " %36,"  // desc_b
+      " %37, %38,"  // e, spsel
+      " p;\n"  // predicate computed from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x64 TN S32+=U8*U8, .satfinite form: A from registers, B from a 64-bit descriptor, accumulators updated in place.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the PTX as an "n" (immediate) operand
+>
+struct GMMA_64x64x64_S32U8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D registers; the CRegisters operands are read-write
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[32];  // d00..d31, bound as "+r" so they are both read and written
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the source line number and desc_b to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %39, 0;\n"  // %39 = scale_D; p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.u8.u8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      "{%32, %33, %34, %35},"  // a00..a03
+      " %36,"  // desc_b
+      " %37, %38,"  // e, spsel
+      " p;\n"  // predicate computed from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x64 TN S32+=U8*U8 (no .satfinite): A from registers, B from a 64-bit descriptor, accumulators updated in place.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the PTX as an "n" (immediate) operand
+>
+struct GMMA_64x96x64_S32U8U8_RS_TN
+{
+  using DRegisters = void;  // no separate D registers; the CRegisters operands are read-write
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[48];  // d00..d47, bound as "+r" so they are both read and written
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the source line number and desc_b to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %55, 0;\n"  // %55 = scale_D; p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.u8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
+      "{%48,  %49,  %50,  %51},"  // a00..a03
+      " %52,"  // desc_b
+      " %53, %54,"  // e, spsel
+      " p;\n"  // predicate computed from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x64 TN S32+=U8*U8, .satfinite form: A from registers, B from a 64-bit descriptor, accumulators updated in place.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the PTX as an "n" (immediate) operand
+>
+struct GMMA_64x96x64_S32U8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D registers; the CRegisters operands are read-write
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[48];  // d00..d47, bound as "+r" so they are both read and written
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the source line number and desc_b to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %55, 0;\n"  // %55 = scale_D; p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.u8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
+      "{%48,  %49,  %50,  %51},"  // a00..a03
+      " %52,"  // desc_b
+      " %53, %54,"  // e, spsel
+      " p;\n"  // predicate computed from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x64 TN S32+=U8*U8 (no .satfinite): A from registers, B from a 64-bit descriptor, accumulators updated in place.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the PTX as an "n" (immediate) operand
+>
+struct GMMA_64x128x64_S32U8U8_RS_TN
+{
+  using DRegisters = void;  // no separate D registers; the CRegisters operands are read-write
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[64];  // d00..d63, bound as "+r" so they are both read and written
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the source line number and desc_b to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %71, 0;\n"  // %71 = scale_D; p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.u8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      "{%64,  %65,  %66,  %67},"  // a00..a03
+      " %68,"  // desc_b
+      " %69, %70,"  // e, spsel
+      " p;\n"  // predicate computed from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x64 TN S32+=U8*U8, .satfinite form: A from registers, B from a 64-bit descriptor, accumulators updated in place.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the PTX as an "n" (immediate) operand
+>
+struct GMMA_64x128x64_S32U8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D registers; the CRegisters operands are read-write
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[64];  // d00..d63, bound as "+r" so they are both read and written
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the source line number and desc_b to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %71, 0;\n"  // %71 = scale_D; p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.u8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      "{%64,  %65,  %66,  %67},"  // a00..a03
+      " %68,"  // desc_b
+      " %69, %70,"  // e, spsel
+      " p;\n"  // predicate computed from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x64 TN S32+=U8*U8 (no .satfinite): A from registers, B from a 64-bit descriptor, accumulators updated in place.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the PTX as an "n" (immediate) operand
+>
+struct GMMA_64x192x64_S32U8U8_RS_TN
+{
+  using DRegisters = void;  // no separate D registers; the CRegisters operands are read-write
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[96];  // d00..d95, bound as "+r" so they are both read and written
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
+      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the source line number and desc_b to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %103, 0;\n"  // %103 = scale_D; p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.u8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      "{%96,  %97,  %98,  %99},"  // a00..a03
+      " %100,"  // desc_b
+      " %101, %102,"  // e, spsel
+      " p;\n"  // predicate computed from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
+        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
+        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x64 TN S32+=U8*U8, .satfinite form: A from registers, B from a 64-bit descriptor, accumulators updated in place.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the PTX as an "n" (immediate) operand
+>
+struct GMMA_64x192x64_S32U8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D registers; the CRegisters operands are read-write
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[96];  // d00..d95, bound as "+r" so they are both read and written
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
+      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the source line number and desc_b to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %103, 0;\n"  // %103 = scale_D; p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.u8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      "{%96,  %97,  %98,  %99},"  // a00..a03
+      " %100,"  // desc_b
+      " %101, %102,"  // e, spsel
+      " p;\n"  // predicate computed from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
+        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
+        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x64 TN S32+=U8*U8 -- RS form: A in four 32-bit registers, B through the 64-bit descriptor desc_b
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x256x64_S32U8U8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a000..a003
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- TODO confirm)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[128];  // d000..d127, bound as read-write asm operands
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
+      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %135, 0;\n"  // %135 is int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.u8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      "{%128, %129, %130, %131},"  // a000..a003
+      " %132,"  // desc_b
+      " %133, %134,"  // e, then spsel as a compile-time immediate
+      " p;\n"  // predicate derived from scale_D above
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
+        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
+        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x64 TN S32+=U8*U8 -- RS form issued with the .satfinite qualifier: A in four 32-bit registers, B through the 64-bit descriptor desc_b
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x256x64_S32U8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a000..a003
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- TODO confirm)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[128];  // d000..d127, bound as read-write asm operands
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
+      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %135, 0;\n"  // %135 is int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.u8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      "{%128, %129, %130, %131},"  // a000..a003
+      " %132,"  // desc_b
+      " %133, %134,"  // e, then spsel as a compile-time immediate
+      " p;\n"  // predicate derived from scale_D above
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
+        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
+        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x64 TN F16+=E4M3*E4M3 -- SS form: A and B are both passed as 64-bit descriptors (desc_a, desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x8x64_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- TODO confirm)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[2];  // d0, d1, bound as read-write asm operands
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %6, 0;\n"  // %6 is int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n8k64.f16.e4m3.e4m3 "
+      "{%0, %1},"  // d0, d1
+      " %2,"  // desc_a
+      " %3,"  // desc_b
+      " %4, %5,"  // e, then spsel as a compile-time immediate
+      " p,  %7, %8;\n"  // p from scale_D; %7 and %8 are the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d0), "+r"(d1)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x64 TN F16+=E4M3*E4M3 -- RS form: A in four 32-bit registers, B through the 64-bit descriptor desc_b
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x8x64_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a0..a3
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- TODO confirm)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[2];  // d0, d1, bound as read-write asm operands
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %9, 0;\n"  // %9 is int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n8k64.f16.e4m3.e4m3 "
+      "{%0,  %1},"  // d0, d1
+      "{%2,  %3,  %4,  %5},"  // a0..a3
+      " %6,"  // desc_b
+      " %7, %8,"  // e, then spsel as a compile-time immediate
+      " p,   %10, %11;\n"  // p from scale_D; %10 and %11 are the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d0), "+r"(d1)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x64 TN F32+=E4M3*E4M3 -- SS form: A and B are both passed as 64-bit descriptors (desc_a, desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x8x64_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- TODO confirm)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[4];  // d0..d3, bound as read-write float asm operands
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %8, 0;\n"  // %8 is int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n8k64.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3},"  // d0..d3
+      " %4,"  // desc_a
+      " %5,"  // desc_b
+      " %6, %7,"  // e, then spsel as a compile-time immediate
+      " p,   %9,  %10;\n"  // p from scale_D; %9 and %10 are the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x64 TN F32+=E4M3*E4M3 -- RS form: A in four 32-bit registers, B through the 64-bit descriptor desc_b
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x8x64_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a0..a3
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- TODO confirm)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[4];  // d0..d3, bound as read-write float asm operands
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %11, 0;\n"  // %11 is int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n8k64.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3},"  // d0..d3
+      "{%4,  %5,  %6,  %7},"  // a0..a3
+      " %8,"  // desc_b
+      " %9, %10,"  // e, then spsel as a compile-time immediate
+      " p,   %12, %13;\n"  // p from scale_D; %12 and %13 are the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x64 TN F16+=E4M3*E4M3 -- SS form: A and B are both passed as 64-bit descriptors (desc_a, desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x16x64_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- TODO confirm)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[4];  // d0..d3, bound as read-write asm operands
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %8, 0;\n"  // %8 is int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n16k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3},"  // d0..d3
+      " %4,"  // desc_a
+      " %5,"  // desc_b
+      " %6, %7,"  // e, then spsel as a compile-time immediate
+      " p,   %9,  %10;\n"  // p from scale_D; %9 and %10 are the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x64 TN F16+=E4M3*E4M3 -- RS form: A in four 32-bit registers, B through the 64-bit descriptor desc_b
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x16x64_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a0..a3
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- TODO confirm)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[4];  // d0..d3, bound as read-write asm operands
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %11, 0;\n"  // %11 is int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n16k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3},"  // d0..d3
+      "{%4,  %5,  %6,  %7},"  // a0..a3
+      " %8,"  // desc_b
+      " %9, %10,"  // e, then spsel as a compile-time immediate
+      " p,   %12, %13;\n"  // p from scale_D; %12 and %13 are the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x64 TN F32+=E4M3*E4M3 -- SS form: A and B are both passed as 64-bit descriptors (desc_a, desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x16x64_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- TODO confirm)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[8];  // d0..d7, bound as read-write float asm operands
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      float         & d4, float         & d5, float         & d6, float         & d7,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %12, 0;\n"  // %12 is int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n16k64.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // d0..d7
+      " %8,"  // desc_a
+      " %9,"  // desc_b
+      " %10, %11,"  // e, then spsel as a compile-time immediate
+      " p,   %13, %14;\n"  // p from scale_D; %13 and %14 are the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
+        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x64 TN F32+=E4M3*E4M3 -- RS form: A in four 32-bit registers, B through the 64-bit descriptor desc_b
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x16x64_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a0..a3
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- TODO confirm)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[8];  // d0..d7, bound as read-write float asm operands
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,
+      float         & d4, float         & d5, float         & d6, float         & d7,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %15, 0;\n"  // %15 is int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n16k64.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // d0..d7
+      "{%8,  %9,  %10, %11},"  // a0..a3
+      " %12,"  // desc_b
+      " %13, %14,"  // e, then spsel as a compile-time immediate
+      " p,   %16, %17;\n"  // p from scale_D; %16 and %17 are the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
+        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x64 TN F16+=E4M3*E4M3 -- SS form: A and B are both passed as 64-bit descriptors (desc_a, desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x32x64_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- TODO confirm)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[8];  // d0..d7, bound as read-write asm operands
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %12, 0;\n"  // %12 is int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n32k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // d0..d7
+      " %8,"  // desc_a
+      " %9,"  // desc_b
+      " %10, %11,"  // e, then spsel as a compile-time immediate
+      " p,   %13, %14;\n"  // p from scale_D; %13 and %14 are the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x64 TN F16+=E4M3*E4M3 -- RS form: A in four 32-bit registers, B through the 64-bit descriptor desc_b
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x32x64_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a0..a3
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- TODO confirm)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[8];  // d0..d7, bound as read-write asm operands
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %15, 0;\n"  // %15 is int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n32k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // d0..d7
+      "{%8,  %9,  %10, %11},"  // a0..a3
+      " %12,"  // desc_b
+      " %13, %14,"  // e, then spsel as a compile-time immediate
+      " p,   %16, %17;\n"  // p from scale_D; %16 and %17 are the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x64 TN F32+=E4M3*E4M3 -- SS form: A and B are both passed as 64-bit descriptors (desc_a, desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x32x64_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- TODO confirm)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[16];  // d00..d15, bound as read-write float asm operands
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %20, 0;\n"  // %20 is int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n32k64.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      " %16,"  // desc_a
+      " %17,"  // desc_b
+      " %18, %19,"  // e, then spsel as a compile-time immediate
+      " p,   %21, %22;\n"  // p from scale_D; %21 and %22 are the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x64 TN F32+=E4M3*E4M3 (RS form: A is passed in four 32-bit registers a00..a03, B as a 64-bit value desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x32x64_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are read-write ("+f") asm operands
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a00..a03)
+  using ERegisters = uint32_t[1];  // one 32-bit operand (e); presumably the sparsity metadata -- TODO confirm
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit value (desc_b)
+  using CRegisters = float[16];  // 16 float accumulators (d00..d15)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %23, 0;\n"  // %23 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n32k64.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%15 = d00..d15
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      "{%16, %17, %18, %19},"  // %16..%19 = a00..a03
+      " %20,"  // %20 = desc_b
+      " %21, %22,"  // %21 = e, %22 = spsel (compile-time immediate, "n")
+      " p,   %24, %25;\n"  // predicate p, then immediates %24 = scaleA, %25 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: the asm above is not compiled
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x64 TN F16+=E4M3*E4M3 (SS form: A and B are both passed as 64-bit values, desc_a and desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x64x64_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are read-write ("+r") asm operands
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit value (desc_a)
+  using ERegisters = uint32_t[1];  // one 32-bit operand (e); presumably the sparsity metadata -- TODO confirm
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit value (desc_b)
+  using CRegisters = uint32_t[16];  // 16 32-bit accumulator registers (d00..d15); presumably two packed f16 each -- TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %20, 0;\n"  // %20 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n64k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%15 = d00..d15
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      " %16,"  // %16 = desc_a
+      " %17,"  // %17 = desc_b
+      " %18, %19,"  // %18 = e, %19 = spsel (compile-time immediate, "n")
+      " p,   %21, %22;\n"  // predicate p, then immediates %21 = scaleA, %22 = scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: the asm above is not compiled
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x64 TN F16+=E4M3*E4M3 (RS form: A is passed in four 32-bit registers a00..a03, B as a 64-bit value desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x64x64_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are read-write ("+r") asm operands
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a00..a03)
+  using ERegisters = uint32_t[1];  // one 32-bit operand (e); presumably the sparsity metadata -- TODO confirm
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit value (desc_b)
+  using CRegisters = uint32_t[16];  // 16 32-bit accumulator registers (d00..d15); presumably two packed f16 each -- TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %23, 0;\n"  // %23 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n64k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%15 = d00..d15
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      "{%16, %17, %18, %19},"  // %16..%19 = a00..a03
+      " %20,"  // %20 = desc_b
+      " %21, %22,"  // %21 = e, %22 = spsel (compile-time immediate, "n")
+      " p,   %24, %25;\n"  // predicate p, then immediates %24 = scaleA, %25 = scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: the asm above is not compiled
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x64 TN F32+=E4M3*E4M3 (SS form: A and B are both passed as 64-bit values, desc_a and desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x64x64_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are read-write ("+f") asm operands
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit value (desc_a)
+  using ERegisters = uint32_t[1];  // one 32-bit operand (e); presumably the sparsity metadata -- TODO confirm
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit value (desc_b)
+  using CRegisters = float[32];  // 32 float accumulators (d00..d31)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %36, 0;\n"  // %36 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n64k64.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%31 = d00..d31
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      " %32,"  // %32 = desc_a
+      " %33,"  // %33 = desc_b
+      " %34, %35,"  // %34 = e, %35 = spsel (compile-time immediate, "n")
+      " p,   %37, %38;\n"  // predicate p, then immediates %37 = scaleA, %38 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: the asm above is not compiled
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x64 TN F32+=E4M3*E4M3 (RS form: A is passed in four 32-bit registers a00..a03, B as a 64-bit value desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x64x64_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are read-write ("+f") asm operands
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a00..a03)
+  using ERegisters = uint32_t[1];  // one 32-bit operand (e); presumably the sparsity metadata -- TODO confirm
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit value (desc_b)
+  using CRegisters = float[32];  // 32 float accumulators (d00..d31)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %39, 0;\n"  // %39 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n64k64.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%31 = d00..d31
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      "{%32, %33, %34, %35},"  // %32..%35 = a00..a03
+      " %36,"  // %36 = desc_b
+      " %37, %38,"  // %37 = e, %38 = spsel (compile-time immediate, "n")
+      " p,   %40, %41;\n"  // predicate p, then immediates %40 = scaleA, %41 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: the asm above is not compiled
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x64 TN F16+=E4M3*E4M3 (SS form: A and B are both passed as 64-bit values, desc_a and desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x96x64_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are read-write ("+r") asm operands
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit value (desc_a)
+  using ERegisters = uint32_t[1];  // one 32-bit operand (e); presumably the sparsity metadata -- TODO confirm
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit value (desc_b)
+  using CRegisters = uint32_t[24];  // 24 32-bit accumulator registers (d00..d23); presumably two packed f16 each -- TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %28, 0;\n"  // %28 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n96k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%23 = d00..d23
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      " %24,"  // %24 = desc_a
+      " %25,"  // %25 = desc_b
+      " %26, %27,"  // %26 = e, %27 = spsel (compile-time immediate, "n")
+      " p,   %29, %30;\n"  // predicate p, then immediates %29 = scaleA, %30 = scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: the asm above is not compiled
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x64 TN F16+=E4M3*E4M3 (RS form: A is passed in four 32-bit registers a00..a03, B as a 64-bit value desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x96x64_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are read-write ("+r") asm operands
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a00..a03)
+  using ERegisters = uint32_t[1];  // one 32-bit operand (e); presumably the sparsity metadata -- TODO confirm
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit value (desc_b)
+  using CRegisters = uint32_t[24];  // 24 32-bit accumulator registers (d00..d23); presumably two packed f16 each -- TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %31, 0;\n"  // %31 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n96k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%23 = d00..d23
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      "{%24, %25, %26, %27},"  // %24..%27 = a00..a03
+      " %28,"  // %28 = desc_b
+      " %29, %30,"  // %29 = e, %30 = spsel (compile-time immediate, "n")
+      " p,   %32, %33;\n"  // predicate p, then immediates %32 = scaleA, %33 = scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: the asm above is not compiled
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x64 TN F32+=E4M3*E4M3 (SS form: A and B are both passed as 64-bit values, desc_a and desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x96x64_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are read-write ("+f") asm operands
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit value (desc_a)
+  using ERegisters = uint32_t[1];  // one 32-bit operand (e); presumably the sparsity metadata -- TODO confirm
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit value (desc_b)
+  using CRegisters = float[48];  // 48 float accumulators (d00..d47)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %52, 0;\n"  // %52 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n96k64.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%47 = d00..d47
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"
+      " %48,"  // %48 = desc_a
+      " %49,"  // %49 = desc_b
+      " %50, %51,"  // %50 = e, %51 = spsel (compile-time immediate, "n")
+      " p,   %53, %54;\n"  // predicate p, then immediates %53 = scaleA, %54 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: the asm above is not compiled
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x64 TN F32+=E4M3*E4M3 (RS form: A is passed in four 32-bit registers a00..a03, B as a 64-bit value desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x96x64_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are read-write ("+f") asm operands
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a00..a03)
+  using ERegisters = uint32_t[1];  // one 32-bit operand (e); presumably the sparsity metadata -- TODO confirm
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit value (desc_b)
+  using CRegisters = float[48];  // 48 float accumulators (d00..d47)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %55, 0;\n"  // %55 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n96k64.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%47 = d00..d47
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
+      "{%48,  %49,  %50,  %51},"  // %48..%51 = a00..a03
+      " %52,"  // %52 = desc_b
+      " %53, %54,"  // %53 = e, %54 = spsel (compile-time immediate, "n")
+      " p,    %56,  %57;\n"  // predicate p, then immediates %56 = scaleA, %57 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: the asm above is not compiled
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x64 TN F16+=E4M3*E4M3 (SS form: A and B are both passed as 64-bit values, desc_a and desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x128x64_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are read-write ("+r") asm operands
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit value (desc_a)
+  using ERegisters = uint32_t[1];  // one 32-bit operand (e); presumably the sparsity metadata -- TODO confirm
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit value (desc_b)
+  using CRegisters = uint32_t[32];  // 32 32-bit accumulator registers (d00..d31); presumably two packed f16 each -- TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %36, 0;\n"  // %36 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n128k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%31 = d00..d31
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      " %32,"  // %32 = desc_a
+      " %33,"  // %33 = desc_b
+      " %34, %35,"  // %34 = e, %35 = spsel (compile-time immediate, "n")
+      " p,   %37, %38;\n"  // predicate p, then immediates %37 = scaleA, %38 = scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: the asm above is not compiled
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x64 TN F16+=E4M3*E4M3 (RS form: A is passed in four 32-bit registers a00..a03, B as a 64-bit value desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x128x64_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are read-write ("+r") asm operands
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a00..a03)
+  using ERegisters = uint32_t[1];  // one 32-bit operand (e); presumably the sparsity metadata -- TODO confirm
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit value (desc_b)
+  using CRegisters = uint32_t[32];  // 32 32-bit accumulator registers (d00..d31); presumably two packed f16 each -- TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %39, 0;\n"  // %39 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n128k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%31 = d00..d31
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      "{%32, %33, %34, %35},"  // %32..%35 = a00..a03
+      " %36,"  // %36 = desc_b
+      " %37, %38,"  // %37 = e, %38 = spsel (compile-time immediate, "n")
+      " p,   %40, %41;\n"  // predicate p, then immediates %40 = scaleA, %41 = scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: the asm above is not compiled
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x64 TN F32+=E4M3*E4M3 (SS form: A and B are both passed as 64-bit values, desc_a and desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x128x64_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are read-write ("+f") asm operands
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit value (desc_a)
+  using ERegisters = uint32_t[1];  // one 32-bit operand (e); presumably the sparsity metadata -- TODO confirm
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit value (desc_b)
+  using CRegisters = float[64];  // 64 float accumulators (d00..d63)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %68, 0;\n"  // %68 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n128k64.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%63 = d00..d63
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      " %64,"  // %64 = desc_a
+      " %65,"  // %65 = desc_b
+      " %66, %67,"  // %66 = e, %67 = spsel (compile-time immediate, "n")
+      " p,    %69,  %70;\n"  // predicate p, then immediates %69 = scaleA, %70 = scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: the asm above is not compiled
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x64 TN F32+=E4M3*E4M3 -- RS form: A is passed in four 32-bit registers, B as the 64-bit desc_b; fma() issues one wgmma.mma_async.sp m64n128k64 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x128x64_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;          // no separate D register list; the accumulators are read-write operands
+  using ARegisters = uint32_t[4];   // A operand: a00..a03
+  using ERegisters = uint32_t[1];   // operand e (presumably the sparsity metadata -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];   // B operand: desc_b
+  using CRegisters = float[64];     // accumulators d00..d63
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook, called before the instruction is issued
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %71, 0;\n"  // p = (scale_D != 0); %71 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n128k64.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      "{%64,  %65,  %66,  %67},"  // a00..a03
+      " %68,"  // desc_b
+      " %69, %70,"  // e, spsel
+      " p,    %72,  %73;\n"  // predicate from scale_D, then scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // e in a register; spsel as a compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; scaleA/scaleB as immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x64 TN F16+=E4M3*E4M3 -- SS form: A and B are both passed as 64-bit values (desc_a, desc_b); fma() issues one wgmma.mma_async.sp m64n192k64 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x192x64_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;          // no separate D register list; the accumulators are read-write operands
+  using ARegisters = uint64_t[1];   // A operand: desc_a
+  using ERegisters = uint32_t[1];   // operand e (presumably the sparsity metadata -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];   // B operand: desc_b
+  using CRegisters = uint32_t[48];  // accumulators d00..d47 as 32-bit registers (presumably two f16 values each -- confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, called before the instruction is issued
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %52, 0;\n"  // p = (scale_D != 0); %52 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n192k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"
+      " %48,"  // desc_a
+      " %49,"  // desc_b
+      " %50, %51,"  // e, spsel
+      " p,   %53, %54;\n"  // predicate from scale_D, then scaleA and scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // e in a register; spsel as a compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; scaleA/scaleB as immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x64 TN F16+=E4M3*E4M3 -- RS form: A is passed in four 32-bit registers, B as the 64-bit desc_b; fma() issues one wgmma.mma_async.sp m64n192k64 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x192x64_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;          // no separate D register list; the accumulators are read-write operands
+  using ARegisters = uint32_t[4];   // A operand: a00..a03
+  using ERegisters = uint32_t[1];   // operand e (presumably the sparsity metadata -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];   // B operand: desc_b
+  using CRegisters = uint32_t[48];  // accumulators d00..d47 as 32-bit registers (presumably two f16 values each -- confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook, called before the instruction is issued
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %55, 0;\n"  // p = (scale_D != 0); %55 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n192k64.f16.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
+      "{%48,  %49,  %50,  %51},"  // a00..a03
+      " %52,"  // desc_b
+      " %53, %54,"  // e, spsel
+      " p,    %56,  %57;\n"  // predicate from scale_D, then scaleA and scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // e in a register; spsel as a compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; scaleA/scaleB as immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x64 TN F32+=E4M3*E4M3 -- SS form: A and B are both passed as 64-bit values (desc_a, desc_b); fma() issues one wgmma.mma_async.sp m64n192k64 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x192x64_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;          // no separate D register list; the accumulators are read-write operands
+  using ARegisters = uint64_t[1];   // A operand: desc_a
+  using ERegisters = uint32_t[1];   // operand e (presumably the sparsity metadata -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];   // B operand: desc_b
+  using CRegisters = float[96];     // accumulators d00..d95
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      float         & d92, float         & d93, float         & d94, float         & d95,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, called before the instruction is issued
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %100, 0;\n"  // p = (scale_D != 0); %100 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n192k64.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      " %96,"  // desc_a
+      " %97,"  // desc_b
+      " %98, %99,"  // e, spsel
+      " p,    %101, %102;\n"  // predicate from scale_D, then scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
+        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // e in a register; spsel as a compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; scaleA/scaleB as immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x64 TN F32+=E4M3*E4M3 -- RS form: A is passed in four 32-bit registers, B as the 64-bit desc_b; fma() issues one wgmma.mma_async.sp m64n192k64 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x192x64_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;          // no separate D register list; the accumulators are read-write operands
+  using ARegisters = uint32_t[4];   // A operand: a00..a03
+  using ERegisters = uint32_t[1];   // operand e (presumably the sparsity metadata -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];   // B operand: desc_b
+  using CRegisters = float[96];     // accumulators d00..d95
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      float         & d92, float         & d93, float         & d94, float         & d95,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook, called before the instruction is issued
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %103, 0;\n"  // p = (scale_D != 0); %103 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n192k64.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      "{%96,  %97,  %98,  %99},"  // a00..a03
+      " %100,"  // desc_b
+      " %101, %102,"  // e, spsel
+      " p,    %104, %105;\n"  // predicate from scale_D, then scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
+        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // e in a register; spsel as a compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; scaleA/scaleB as immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x64 TN F16+=E4M3*E4M3 -- SS form: A and B are both passed as 64-bit values (desc_a, desc_b); fma() issues one wgmma.mma_async.sp m64n256k64 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x256x64_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;          // no separate D register list; the accumulators are read-write operands
+  using ARegisters = uint64_t[1];   // A operand: desc_a
+  using ERegisters = uint32_t[1];   // operand e (presumably the sparsity metadata -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];   // B operand: desc_b
+  using CRegisters = uint32_t[64];  // accumulators d00..d63 as 32-bit registers (presumably two f16 values each -- confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, called before the instruction is issued
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %68, 0;\n"  // p = (scale_D != 0); %68 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n256k64.f16.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      " %64,"  // desc_a
+      " %65,"  // desc_b
+      " %66, %67,"  // e, spsel
+      " p,    %69,  %70;\n"  // predicate from scale_D, then scaleA and scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // e in a register; spsel as a compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; scaleA/scaleB as immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x64 TN F16+=E4M3*E4M3 -- RS form: A is passed in four 32-bit registers, B as the 64-bit desc_b; fma() issues one wgmma.mma_async.sp m64n256k64 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x256x64_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;          // no separate D register list; the accumulators are read-write operands
+  using ARegisters = uint32_t[4];   // A operand: a00..a03
+  using ERegisters = uint32_t[1];   // operand e (presumably the sparsity metadata -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];   // B operand: desc_b
+  using CRegisters = uint32_t[64];  // accumulators d00..d63 as 32-bit registers (presumably two f16 values each -- confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook, called before the instruction is issued
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %71, 0;\n"  // p = (scale_D != 0); %71 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n256k64.f16.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      "{%64,  %65,  %66,  %67},"  // a00..a03
+      " %68,"  // desc_b
+      " %69, %70,"  // e, spsel
+      " p,    %72,  %73;\n"  // predicate from scale_D, then scaleA and scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // e in a register; spsel as a compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; scaleA/scaleB as immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x64 TN F32+=E4M3*E4M3 (SS form: A and B are both passed as 64-bit descriptors; 128 float accumulators)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as immediate asm operand %133
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as immediate asm operand %134
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; emitted as immediate asm operand %131
+>
+struct GMMA_64x256x64_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D registers: fma updates the accumulators in place
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];  // e operand: one 32-bit word; presumably sparsity metadata -- confirm against PTX docs
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = float[128];  // accumulators: 128 float registers (d000..d127)
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma.mma_async.sp PTX instruction via inline asm
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d127 are read and written by the instruction
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      float         & d124, float         & d125, float         & d126, float         & d127,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog call; passes the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %132, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n256k64.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%127: accumulators d000..d127
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      " %128,"  // desc_a
+      " %129,"  // desc_b
+      " %130, %131,"  // e, spsel
+      " p,    %133, %134;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // %0-%127: read-write float registers
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
+        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
+      :  "l"(desc_a),  // %128: 64-bit register
+         "l"(desc_b),  // %129: 64-bit register
+         "r"(e), "n"(int32_t(spsel)),  // %130: 32-bit register, %131: immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %132: 32-bit register, %133-%134: immediates
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // no asm is emitted in this build; a call is flagged with this message
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x64 TN F32+=E4M3*E4M3 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor; 128 float accumulators)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as immediate asm operand %136
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as immediate asm operand %137
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; emitted as immediate asm operand %134
+>
+struct GMMA_64x256x64_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D registers: fma updates the accumulators in place
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a000..a003)
+  using ERegisters = uint32_t[1];  // e operand: one 32-bit word; presumably sparsity metadata -- confirm against PTX docs
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = float[128];  // accumulators: 128 float registers (d000..d127)
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma.mma_async.sp PTX instruction via inline asm
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d127 are read and written by the instruction
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      float         & d124, float         & d125, float         & d126, float         & d127,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog call; passes the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %135, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n256k64.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%127: accumulators d000..d127
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      "{%128, %129, %130, %131},"  // A registers a000..a003
+      " %132,"  // desc_b
+      " %133, %134,"  // e, spsel
+      " p,    %136, %137;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // %0-%127: read-write float registers
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
+        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %128-%131: 32-bit registers
+         "l"(desc_b),  // %132: 64-bit register
+         "r"(e), "n"(int32_t(spsel)),  // %133: 32-bit register, %134: immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %135: 32-bit register, %136-%137: immediates
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // no asm is emitted in this build; a call is flagged with this message
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x64 TN F16+=E4M3*E5M2 (SS form: A and B are both passed as 64-bit descriptors; 2 accumulator registers)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as immediate asm operand %7
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as immediate asm operand %8
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; emitted as immediate asm operand %5
+>
+struct GMMA_64x8x64_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D registers: fma updates the accumulators in place
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];  // e operand: one 32-bit word; presumably sparsity metadata -- confirm against PTX docs
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[2];  // accumulators: two 32-bit registers (d0, d1)
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma.mma_async.sp PTX instruction via inline asm
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1,  // read and written by the instruction
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog call; passes the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %6, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n8k64.f16.e4m3.e5m2 "
+      "{%0, %1},"  // accumulators d0, d1
+      " %2,"  // desc_a
+      " %3,"  // desc_b
+      " %4, %5,"  // e, spsel
+      " p,  %7, %8;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1)  // %0-%1: read-write 32-bit registers
+      :  "l"(desc_a),  // %2: 64-bit register
+         "l"(desc_b),  // %3: 64-bit register
+         "r"(e), "n"(int32_t(spsel)),  // %4: 32-bit register, %5: immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %6: 32-bit register, %7-%8: immediates
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // no asm is emitted in this build; a call is flagged with this message
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x64 TN F16+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor; 2 accumulator registers)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as immediate asm operand %10
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as immediate asm operand %11
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; emitted as immediate asm operand %8
+>
+struct GMMA_64x8x64_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D registers: fma updates the accumulators in place
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a0..a3)
+  using ERegisters = uint32_t[1];  // e operand: one 32-bit word; presumably sparsity metadata -- confirm against PTX docs
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[2];  // accumulators: two 32-bit registers (d0, d1)
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma.mma_async.sp PTX instruction via inline asm
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1,  // read and written by the instruction
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog call; passes the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %9, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n8k64.f16.e4m3.e5m2 "
+      "{%0,  %1},"  // accumulators d0, d1
+      "{%2,  %3,  %4,  %5},"  // A registers a0..a3
+      " %6,"  // desc_b
+      " %7, %8,"  // e, spsel
+      " p,   %10, %11;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1)  // %0-%1: read-write 32-bit registers
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),  // %2-%5: 32-bit registers
+         "l"(desc_b),  // %6: 64-bit register
+         "r"(e), "n"(int32_t(spsel)),  // %7: 32-bit register, %8: immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %9: 32-bit register, %10-%11: immediates
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // no asm is emitted in this build; a call is flagged with this message
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x64 TN F32+=E4M3*E5M2 (SS form: A and B are both passed as 64-bit descriptors; 4 float accumulators)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as immediate asm operand %9
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as immediate asm operand %10
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; emitted as immediate asm operand %7
+>
+struct GMMA_64x8x64_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D registers: fma updates the accumulators in place
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];  // e operand: one 32-bit word; presumably sparsity metadata -- confirm against PTX docs
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = float[4];  // accumulators: four float registers (d0..d3)
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma.mma_async.sp PTX instruction via inline asm
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,  // read and written by the instruction
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog call; passes the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %8, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n8k64.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3},"  // accumulators d0..d3
+      " %4,"  // desc_a
+      " %5,"  // desc_b
+      " %6, %7,"  // e, spsel
+      " p,   %9,  %10;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)  // %0-%3: read-write float registers
+      :  "l"(desc_a),  // %4: 64-bit register
+         "l"(desc_b),  // %5: 64-bit register
+         "r"(e), "n"(int32_t(spsel)),  // %6: 32-bit register, %7: immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %8: 32-bit register, %9-%10: immediates
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // no asm is emitted in this build; a call is flagged with this message
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x64 TN F32+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor; 4 float accumulators)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as immediate asm operand %12
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as immediate asm operand %13
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; emitted as immediate asm operand %10
+>
+struct GMMA_64x8x64_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D registers: fma updates the accumulators in place
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a0..a3)
+  using ERegisters = uint32_t[1];  // e operand: one 32-bit word; presumably sparsity metadata -- confirm against PTX docs
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = float[4];  // accumulators: four float registers (d0..d3)
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma.mma_async.sp PTX instruction via inline asm
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,  // read and written by the instruction
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog call; passes the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %11, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n8k64.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3},"  // accumulators d0..d3
+      "{%4,  %5,  %6,  %7},"  // A registers a0..a3
+      " %8,"  // desc_b
+      " %9, %10,"  // e, spsel
+      " p,   %12, %13;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)  // %0-%3: read-write float registers
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),  // %4-%7: 32-bit registers
+         "l"(desc_b),  // %8: 64-bit register
+         "r"(e), "n"(int32_t(spsel)),  // %9: 32-bit register, %10: immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %11: 32-bit register, %12-%13: immediates
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // no asm is emitted in this build; a call is flagged with this message
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x64 TN F16+=E4M3*E5M2 (SS form: A and B are both passed as 64-bit descriptors; 4 accumulator registers)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as immediate asm operand %9
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as immediate asm operand %10
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; emitted as immediate asm operand %7
+>
+struct GMMA_64x16x64_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D registers: fma updates the accumulators in place
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];  // e operand: one 32-bit word; presumably sparsity metadata -- confirm against PTX docs
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[4];  // accumulators: four 32-bit registers (d0..d3)
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma.mma_async.sp PTX instruction via inline asm
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,  // read and written by the instruction
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog call; passes the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %8, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n16k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3},"  // accumulators d0..d3
+      " %4,"  // desc_a
+      " %5,"  // desc_b
+      " %6, %7,"  // e, spsel
+      " p,   %9,  %10;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)  // %0-%3: read-write 32-bit registers
+      :  "l"(desc_a),  // %4: 64-bit register
+         "l"(desc_b),  // %5: 64-bit register
+         "r"(e), "n"(int32_t(spsel)),  // %6: 32-bit register, %7: immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %8: 32-bit register, %9-%10: immediates
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // no asm is emitted in this build; a call is flagged with this message
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x64 TN F16+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor; 4 accumulator registers)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as immediate asm operand %12
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as immediate asm operand %13
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; emitted as immediate asm operand %10
+>
+struct GMMA_64x16x64_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D registers: fma updates the accumulators in place
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a0..a3)
+  using ERegisters = uint32_t[1];  // e operand: one 32-bit word; presumably sparsity metadata -- confirm against PTX docs
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[4];  // accumulators: four 32-bit registers (d0..d3)
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma.mma_async.sp PTX instruction via inline asm
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,  // read and written by the instruction
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog call; passes the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %11, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n16k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3},"  // accumulators d0..d3
+      "{%4,  %5,  %6,  %7},"  // A registers a0..a3
+      " %8,"  // desc_b
+      " %9, %10,"  // e, spsel
+      " p,   %12, %13;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)  // %0-%3: read-write 32-bit registers
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),  // %4-%7: 32-bit registers
+         "l"(desc_b),  // %8: 64-bit register
+         "r"(e), "n"(int32_t(spsel)),  // %9: 32-bit register, %10: immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %11: 32-bit register, %12-%13: immediates
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // no asm is emitted in this build; a call is flagged with this message
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x64 TN F32+=E4M3*E5M2 (SS form: A and B are both passed as 64-bit descriptors; 8 float accumulators)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as immediate asm operand %13
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as immediate asm operand %14
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; emitted as immediate asm operand %11
+>
+struct GMMA_64x16x64_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D registers: fma updates the accumulators in place
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];  // e operand: one 32-bit word; presumably sparsity metadata -- confirm against PTX docs
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = float[8];  // accumulators: eight float registers (d0..d7)
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma.mma_async.sp PTX instruction via inline asm
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,  // d0..d7 are read and written by the instruction
+      float         & d4, float         & d5, float         & d6, float         & d7,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog call; passes the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %12, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n16k64.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // accumulators d0..d7
+      " %8,"  // desc_a
+      " %9,"  // desc_b
+      " %10, %11,"  // e, spsel
+      " p,   %13, %14;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),  // %0-%7: read-write float registers
+        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
+      :  "l"(desc_a),  // %8: 64-bit register
+         "l"(desc_b),  // %9: 64-bit register
+         "r"(e), "n"(int32_t(spsel)),  // %10: 32-bit register, %11: immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %12: 32-bit register, %13-%14: immediates
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // no asm is emitted in this build; a call is flagged with this message
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x64 TN F32+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor; 8 float accumulators)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as immediate asm operand %16
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as immediate asm operand %17
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; emitted as immediate asm operand %14
+>
+struct GMMA_64x16x64_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D registers: fma updates the accumulators in place
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a0..a3)
+  using ERegisters = uint32_t[1];  // e operand: one 32-bit word; presumably sparsity metadata -- confirm against PTX docs
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = float[8];  // accumulators: eight float registers (d0..d7)
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma.mma_async.sp PTX instruction via inline asm
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      float         & d0, float         & d1, float         & d2, float         & d3,  // d0..d7 are read and written by the instruction
+      float         & d4, float         & d5, float         & d6, float         & d7,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog call; passes the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %15, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n16k64.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // accumulators d0..d7
+      "{%8,  %9,  %10, %11},"  // A registers a0..a3
+      " %12,"  // desc_b
+      " %13, %14,"  // e, spsel
+      " p,   %16, %17;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),  // %0-%7: read-write float registers
+        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),  // %8-%11: 32-bit registers
+         "l"(desc_b),  // %12: 64-bit register
+         "r"(e), "n"(int32_t(spsel)),  // %13: 32-bit register, %14: immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %15: 32-bit register, %16-%17: immediates
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // no asm is emitted in this build; a call is flagged with this message
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x64 TN F16+=E4M3*E5M2 (SS form: A and B are both passed as 64-bit descriptors; 8 accumulator registers)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as immediate asm operand %13
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as immediate asm operand %14
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; emitted as immediate asm operand %11
+>
+struct GMMA_64x32x64_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D registers: fma updates the accumulators in place
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];  // e operand: one 32-bit word; presumably sparsity metadata -- confirm against PTX docs
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[8];  // accumulators: eight 32-bit registers (d0..d7)
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma.mma_async.sp PTX instruction via inline asm
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,  // d0..d7 are read and written by the instruction
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog call; passes the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %12, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n32k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // accumulators d0..d7
+      " %8,"  // desc_a
+      " %9,"  // desc_b
+      " %10, %11,"  // e, spsel
+      " p,   %13, %14;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),  // %0-%7: read-write 32-bit registers
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "l"(desc_a),  // %8: 64-bit register
+         "l"(desc_b),  // %9: 64-bit register
+         "r"(e), "n"(int32_t(spsel)),  // %10: 32-bit register, %11: immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %12: 32-bit register, %13-%14: immediates
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // no asm is emitted in this build; a call is flagged with this message
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x64 TN F16+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor; 8 accumulator registers)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as immediate asm operand %16
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as immediate asm operand %17
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; emitted as immediate asm operand %14
+>
+struct GMMA_64x32x64_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D registers: fma updates the accumulators in place
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a0..a3)
+  using ERegisters = uint32_t[1];  // e operand: one 32-bit word; presumably sparsity metadata -- confirm against PTX docs
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[8];  // accumulators: eight 32-bit registers (d0..d7)
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma.mma_async.sp PTX instruction via inline asm
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,  // d0..d7 are read and written by the instruction
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog call; passes the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %15, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n32k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // accumulators d0..d7
+      "{%8,  %9,  %10, %11},"  // A registers a0..a3
+      " %12,"  // desc_b
+      " %13, %14,"  // e, spsel
+      " p,   %16, %17;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),  // %0-%7: read-write 32-bit registers
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),  // %8-%11: 32-bit registers
+         "l"(desc_b),  // %12: 64-bit register
+         "r"(e), "n"(int32_t(spsel)),  // %13: 32-bit register, %14: immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %15: 32-bit register, %16-%17: immediates
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // no asm is emitted in this build; a call is flagged with this message
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x64 TN F32+=E4M3*E5M2 (SS form: A and B are both passed as 64-bit descriptors; 16 float accumulators)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as immediate asm operand %21
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as immediate asm operand %22
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; emitted as immediate asm operand %19
+>
+struct GMMA_64x32x64_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D registers: fma updates the accumulators in place
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];  // e operand: one 32-bit word; presumably sparsity metadata -- confirm against PTX docs
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = float[16];  // accumulators: sixteen float registers (d00..d15)
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma.mma_async.sp PTX instruction via inline asm
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d15 are read and written by the instruction
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog call; passes the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %20, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n32k64.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%15: accumulators d00..d15
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      " %16,"  // desc_a
+      " %17,"  // desc_b
+      " %18, %19,"  // e, spsel
+      " p,   %21, %22;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0-%15: read-write float registers
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
+      :  "l"(desc_a),  // %16: 64-bit register
+         "l"(desc_b),  // %17: 64-bit register
+         "r"(e), "n"(int32_t(spsel)),  // %18: 32-bit register, %19: immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %20: 32-bit register, %21-%22: immediates
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // no asm is emitted in this build; a call is flagged with this message
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x64 TN F32+=E4M3*E5M2 (RS variant: A in four 32-bit registers, B via one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x32x64_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // A fragment: four 32-bit words, bound as "r" inputs in fma()
+  using ERegisters = uint32_t[1];  // e: one 32-bit word bound as "r"; NOTE(review): presumably sparsity metadata -- confirm against PTX wgmma.sp
+  using BRegisters = uint64_t[1];  // B operand: desc_b, a single 64-bit value bound as an "l" input
+  using CRegisters = float[16];  // accumulators d00..d15, read-modify-write ("+f") operands of the asm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %23, 0;\n"  // p = (scale_D != 0); %23 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n32k64.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      "{%16, %17, %18, %19},"
+      " %20,"
+      " %21, %22,"
+      " p,   %24, %25;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // %21 = e, %22 = spsel (compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %23 = scale_D (runtime), %24/%25 = scaleA/scaleB (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // fallback when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x64 TN F16+=E4M3*E5M2 (SS variant: A and B each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x64x64_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // A operand: desc_a, a single 64-bit value bound as an "l" input
+  using ERegisters = uint32_t[1];  // e: one 32-bit word bound as "r"; NOTE(review): presumably sparsity metadata -- confirm against PTX wgmma.sp
+  using BRegisters = uint64_t[1];  // B operand: desc_b, a single 64-bit value bound as an "l" input
+  using CRegisters = uint32_t[16];  // accumulators d00..d15, read-modify-write ("+r") operands of the asm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %20, 0;\n"  // p = (scale_D != 0); %20 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n64k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      " %16,"
+      " %17,"
+      " %18, %19,"
+      " p,   %21, %22;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // %18 = e, %19 = spsel (compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %20 = scale_D (runtime), %21/%22 = scaleA/scaleB (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // fallback when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x64 TN F16+=E4M3*E5M2 (RS variant: A in four 32-bit registers, B via one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x64x64_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // A fragment: four 32-bit words, bound as "r" inputs in fma()
+  using ERegisters = uint32_t[1];  // e: one 32-bit word bound as "r"; NOTE(review): presumably sparsity metadata -- confirm against PTX wgmma.sp
+  using BRegisters = uint64_t[1];  // B operand: desc_b, a single 64-bit value bound as an "l" input
+  using CRegisters = uint32_t[16];  // accumulators d00..d15, read-modify-write ("+r") operands of the asm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %23, 0;\n"  // p = (scale_D != 0); %23 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n64k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      "{%16, %17, %18, %19},"
+      " %20,"
+      " %21, %22,"
+      " p,   %24, %25;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // %21 = e, %22 = spsel (compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %23 = scale_D (runtime), %24/%25 = scaleA/scaleB (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // fallback when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x64 TN F32+=E4M3*E5M2 (SS variant: A and B each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x64x64_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // A operand: desc_a, a single 64-bit value bound as an "l" input
+  using ERegisters = uint32_t[1];  // e: one 32-bit word bound as "r"; NOTE(review): presumably sparsity metadata -- confirm against PTX wgmma.sp
+  using BRegisters = uint64_t[1];  // B operand: desc_b, a single 64-bit value bound as an "l" input
+  using CRegisters = float[32];  // accumulators d00..d31, read-modify-write ("+f") operands of the asm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %36, 0;\n"  // p = (scale_D != 0); %36 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n64k64.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      " %32,"
+      " %33,"
+      " %34, %35,"
+      " p,   %37, %38;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // %34 = e, %35 = spsel (compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %36 = scale_D (runtime), %37/%38 = scaleA/scaleB (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // fallback when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x64 TN F32+=E4M3*E5M2 (RS variant: A in four 32-bit registers, B via one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x64x64_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // A fragment: four 32-bit words, bound as "r" inputs in fma()
+  using ERegisters = uint32_t[1];  // e: one 32-bit word bound as "r"; NOTE(review): presumably sparsity metadata -- confirm against PTX wgmma.sp
+  using BRegisters = uint64_t[1];  // B operand: desc_b, a single 64-bit value bound as an "l" input
+  using CRegisters = float[32];  // accumulators d00..d31, read-modify-write ("+f") operands of the asm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %39, 0;\n"  // p = (scale_D != 0); %39 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n64k64.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      "{%32, %33, %34, %35},"
+      " %36,"
+      " %37, %38,"
+      " p,   %40, %41;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // %37 = e, %38 = spsel (compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %39 = scale_D (runtime), %40/%41 = scaleA/scaleB (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // fallback when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x64 TN F16+=E4M3*E5M2 (SS variant: A and B each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x96x64_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // A operand: desc_a, a single 64-bit value bound as an "l" input
+  using ERegisters = uint32_t[1];  // e: one 32-bit word bound as "r"; NOTE(review): presumably sparsity metadata -- confirm against PTX wgmma.sp
+  using BRegisters = uint64_t[1];  // B operand: desc_b, a single 64-bit value bound as an "l" input
+  using CRegisters = uint32_t[24];  // accumulators d00..d23, read-modify-write ("+r") operands of the asm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %28, 0;\n"  // p = (scale_D != 0); %28 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n96k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      " %24,"
+      " %25,"
+      " %26, %27,"
+      " p,   %29, %30;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // %26 = e, %27 = spsel (compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %28 = scale_D (runtime), %29/%30 = scaleA/scaleB (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // fallback when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x64 TN F16+=E4M3*E5M2 (RS variant: A in four 32-bit registers, B via one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x96x64_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // A fragment: four 32-bit words, bound as "r" inputs in fma()
+  using ERegisters = uint32_t[1];  // e: one 32-bit word bound as "r"; NOTE(review): presumably sparsity metadata -- confirm against PTX wgmma.sp
+  using BRegisters = uint64_t[1];  // B operand: desc_b, a single 64-bit value bound as an "l" input
+  using CRegisters = uint32_t[24];  // accumulators d00..d23, read-modify-write ("+r") operands of the asm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %31, 0;\n"  // p = (scale_D != 0); %31 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n96k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      "{%24, %25, %26, %27},"
+      " %28,"
+      " %29, %30,"
+      " p,   %32, %33;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // %29 = e, %30 = spsel (compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %31 = scale_D (runtime), %32/%33 = scaleA/scaleB (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // fallback when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x64 TN F32+=E4M3*E5M2 (SS variant: A and B each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x96x64_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // A operand: desc_a, a single 64-bit value bound as an "l" input
+  using ERegisters = uint32_t[1];  // e: one 32-bit word bound as "r"; NOTE(review): presumably sparsity metadata -- confirm against PTX wgmma.sp
+  using BRegisters = uint64_t[1];  // B operand: desc_b, a single 64-bit value bound as an "l" input
+  using CRegisters = float[48];  // accumulators d00..d47, read-modify-write ("+f") operands of the asm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %52, 0;\n"  // p = (scale_D != 0); %52 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n96k64.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"
+      " %48,"
+      " %49,"
+      " %50, %51,"
+      " p,   %53, %54;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // %50 = e, %51 = spsel (compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %52 = scale_D (runtime), %53/%54 = scaleA/scaleB (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // fallback when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x64 TN F32+=E4M3*E5M2 (RS variant: A in four 32-bit registers, B via one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x96x64_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // A fragment: four 32-bit words, bound as "r" inputs in fma()
+  using ERegisters = uint32_t[1];  // e: one 32-bit word bound as "r"; NOTE(review): presumably sparsity metadata -- confirm against PTX wgmma.sp
+  using BRegisters = uint64_t[1];  // B operand: desc_b, a single 64-bit value bound as an "l" input
+  using CRegisters = float[48];  // accumulators d00..d47, read-modify-write ("+f") operands of the asm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %55, 0;\n"  // p = (scale_D != 0); %55 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n96k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
+      "{%48,  %49,  %50,  %51},"
+      " %52,"
+      " %53, %54,"
+      " p,    %56,  %57;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // %53 = e, %54 = spsel (compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %55 = scale_D (runtime), %56/%57 = scaleA/scaleB (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // fallback when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x64 TN F16+=E4M3*E5M2 (SS variant: A and B each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x128x64_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // A operand: desc_a, a single 64-bit value bound as an "l" input
+  using ERegisters = uint32_t[1];  // e: one 32-bit word bound as "r"; NOTE(review): presumably sparsity metadata -- confirm against PTX wgmma.sp
+  using BRegisters = uint64_t[1];  // B operand: desc_b, a single 64-bit value bound as an "l" input
+  using CRegisters = uint32_t[32];  // accumulators d00..d31, read-modify-write ("+r") operands of the asm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %36, 0;\n"  // p = (scale_D != 0); %36 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n128k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      " %32,"
+      " %33,"
+      " %34, %35,"
+      " p,   %37, %38;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // %34 = e, %35 = spsel (compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %36 = scale_D (runtime), %37/%38 = scaleA/scaleB (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // fallback when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x64 TN F16+=E4M3*E5M2 (RS variant: A in four 32-bit registers, B via one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x128x64_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // A fragment: four 32-bit words, bound as "r" inputs in fma()
+  using ERegisters = uint32_t[1];  // e: one 32-bit word bound as "r"; NOTE(review): presumably sparsity metadata -- confirm against PTX wgmma.sp
+  using BRegisters = uint64_t[1];  // B operand: desc_b, a single 64-bit value bound as an "l" input
+  using CRegisters = uint32_t[32];  // accumulators d00..d31, read-modify-write ("+r") operands of the asm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %39, 0;\n"  // p = (scale_D != 0); %39 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n128k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      "{%32, %33, %34, %35},"
+      " %36,"
+      " %37, %38,"
+      " p,   %40, %41;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // %37 = e, %38 = spsel (compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %39 = scale_D (runtime), %40/%41 = scaleA/scaleB (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // fallback when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x64 TN F32+=E4M3*E5M2 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,    // compile-time immediate, asm operand %69
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,    // compile-time immediate, asm operand %70
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, asm operand %67
+>
+struct GMMA_64x128x64_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;         // NOTE(review): no separate D list; accumulators are "+f" read-write below, so D presumably aliases C
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for A (desc_a)
+  using ERegisters = uint32_t[1];  // one 32-bit word for operand e
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = float[64];    // 64 float accumulators (d00..d63)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      uint32_t const& e,  // 32-bit operand passed next to spsel; presumably the sparsity metadata of the .sp instruction - confirm against PTX docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 in the asm to build predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %68, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n128k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      " %64,"  // desc_a
+      " %65,"  // desc_b
+      " %66, %67,"  // e, spsel
+      " p,    %69,  %70;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%63: accumulators, read and written in place
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
+      :  "l"(desc_a),  // %64: 64-bit register input
+         "l"(desc_b),  // %65: 64-bit register input
+         "r"(e), "n"(int32_t(spsel)),  // %66: 32-bit register; %67: immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %68: register; %69, %70: immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x64 TN F32+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,    // compile-time immediate, asm operand %72
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,    // compile-time immediate, asm operand %73
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, asm operand %70
+>
+struct GMMA_64x128x64_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;         // NOTE(review): no separate D list; accumulators are "+f" read-write below, so D presumably aliases C
+  using ARegisters = uint32_t[4];  // four 32-bit registers for A (a00..a03)
+  using ERegisters = uint32_t[1];  // one 32-bit word for operand e
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = float[64];    // 64 float accumulators (d00..d63)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      uint32_t const& e,  // 32-bit operand passed next to spsel; presumably the sparsity metadata of the .sp instruction - confirm against PTX docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 in the asm to build predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %71, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n128k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      "{%64,  %65,  %66,  %67},"  // a00..a03
+      " %68,"  // desc_b
+      " %69, %70,"  // e, spsel
+      " p,    %72,  %73;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%63: accumulators, read and written in place
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %64..%67: 32-bit register inputs
+         "l"(desc_b),  // %68: 64-bit register input
+         "r"(e), "n"(int32_t(spsel)),  // %69: 32-bit register; %70: immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %71: register; %72, %73: immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x64 TN F16+=E4M3*E5M2 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,    // compile-time immediate, asm operand %53
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,    // compile-time immediate, asm operand %54
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, asm operand %51
+>
+struct GMMA_64x192x64_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;         // NOTE(review): no separate D list; accumulators are "+r" read-write below, so D presumably aliases C
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for A (desc_a)
+  using ERegisters = uint32_t[1];  // one 32-bit word for operand e
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = uint32_t[48]; // 48 32-bit accumulator registers (d00..d47); presumably two packed f16 values each - TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t const& e,  // 32-bit operand passed next to spsel; presumably the sparsity metadata of the .sp instruction - confirm against PTX docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 in the asm to build predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %52, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n192k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"
+      " %48,"  // desc_a
+      " %49,"  // desc_b
+      " %50, %51,"  // e, spsel
+      " p,   %53, %54;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%47: accumulators, read and written in place
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "l"(desc_a),  // %48: 64-bit register input
+         "l"(desc_b),  // %49: 64-bit register input
+         "r"(e), "n"(int32_t(spsel)),  // %50: 32-bit register; %51: immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %52: register; %53, %54: immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x64 TN F16+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,    // compile-time immediate, asm operand %56
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,    // compile-time immediate, asm operand %57
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, asm operand %54
+>
+struct GMMA_64x192x64_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;         // NOTE(review): no separate D list; accumulators are "+r" read-write below, so D presumably aliases C
+  using ARegisters = uint32_t[4];  // four 32-bit registers for A (a00..a03)
+  using ERegisters = uint32_t[1];  // one 32-bit word for operand e
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = uint32_t[48]; // 48 32-bit accumulator registers (d00..d47); presumably two packed f16 values each - TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t const& e,  // 32-bit operand passed next to spsel; presumably the sparsity metadata of the .sp instruction - confirm against PTX docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 in the asm to build predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %55, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n192k64.f16.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
+      "{%48,  %49,  %50,  %51},"  // a00..a03
+      " %52,"  // desc_b
+      " %53, %54,"  // e, spsel
+      " p,    %56,  %57;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%47: accumulators, read and written in place
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %48..%51: 32-bit register inputs
+         "l"(desc_b),  // %52: 64-bit register input
+         "r"(e), "n"(int32_t(spsel)),  // %53: 32-bit register; %54: immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %55: register; %56, %57: immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x64 TN F32+=E4M3*E5M2 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,    // compile-time immediate, asm operand %101
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,    // compile-time immediate, asm operand %102
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, asm operand %99
+>
+struct GMMA_64x192x64_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;         // NOTE(review): no separate D list; accumulators are "+f" read-write below, so D presumably aliases C
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for A (desc_a)
+  using ERegisters = uint32_t[1];  // one 32-bit word for operand e
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = float[96];    // 96 float accumulators (d00..d95)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      float         & d92, float         & d93, float         & d94, float         & d95,
+      uint32_t const& e,  // 32-bit operand passed next to spsel; presumably the sparsity metadata of the .sp instruction - confirm against PTX docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 in the asm to build predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %100, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n192k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      " %96,"  // desc_a
+      " %97,"  // desc_b
+      " %98, %99,"  // e, spsel
+      " p,    %101, %102;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%95: accumulators, read and written in place
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
+        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
+      :  "l"(desc_a),  // %96: 64-bit register input
+         "l"(desc_b),  // %97: 64-bit register input
+         "r"(e), "n"(int32_t(spsel)),  // %98: 32-bit register; %99: immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %100: register; %101, %102: immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x64 TN F32+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,    // compile-time immediate, asm operand %104
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,    // compile-time immediate, asm operand %105
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, asm operand %102
+>
+struct GMMA_64x192x64_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;         // NOTE(review): no separate D list; accumulators are "+f" read-write below, so D presumably aliases C
+  using ARegisters = uint32_t[4];  // four 32-bit registers for A (a00..a03)
+  using ERegisters = uint32_t[1];  // one 32-bit word for operand e
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = float[96];    // 96 float accumulators (d00..d95)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      float         & d92, float         & d93, float         & d94, float         & d95,
+      uint32_t const& e,  // 32-bit operand passed next to spsel; presumably the sparsity metadata of the .sp instruction - confirm against PTX docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 in the asm to build predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %103, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n192k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      "{%96,  %97,  %98,  %99},"  // a00..a03
+      " %100,"  // desc_b
+      " %101, %102,"  // e, spsel
+      " p,    %104, %105;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%95: accumulators, read and written in place
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
+        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %96..%99: 32-bit register inputs
+         "l"(desc_b),  // %100: 64-bit register input
+         "r"(e), "n"(int32_t(spsel)),  // %101: 32-bit register; %102: immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %103: register; %104, %105: immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x64 TN F16+=E4M3*E5M2 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,    // compile-time immediate, asm operand %69
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,    // compile-time immediate, asm operand %70
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, asm operand %67
+>
+struct GMMA_64x256x64_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;         // NOTE(review): no separate D list; accumulators are "+r" read-write below, so D presumably aliases C
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor for A (desc_a)
+  using ERegisters = uint32_t[1];  // one 32-bit word for operand e
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor for B (desc_b)
+  using CRegisters = uint32_t[64]; // 64 32-bit accumulator registers (d00..d63); presumably two packed f16 values each - TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t const& e,  // 32-bit operand passed next to spsel; presumably the sparsity metadata of the .sp instruction - confirm against PTX docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 in the asm to build predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %68, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n256k64.f16.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      " %64,"  // desc_a
+      " %65,"  // desc_b
+      " %66, %67,"  // e, spsel
+      " p,    %69,  %70;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%63: accumulators, read and written in place
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "l"(desc_a),  // %64: 64-bit register input
+         "l"(desc_b),  // %65: 64-bit register input
+         "r"(e), "n"(int32_t(spsel)),  // %66: 32-bit register; %67: immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %68: register; %69, %70: immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x64 TN F16+=E4M3*E5M2 -- RS form: A is passed as four 32-bit register operands, B as one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded as an immediate ("n") operand, directly after `e`
+>
+struct GMMA_64x256x64_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the accumulator references in place
+  using ARegisters = uint32_t[4];  // corresponds to a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // corresponds to `e` of fma()
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = uint32_t[64];  // corresponds to d00..d63 of fma(); NOTE(review): presumably two packed f16 per register -- confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operands, read-only
+      uint64_t const& desc_b,  // B descriptor, read-only
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d63: accumulators, both read and written by the asm ("+r")
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t const& e,  // placed right after the B descriptor in the instruction; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; predicate p below is set to (scale_D != 0)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this { } asm block
+      "setp.ne.b32 p, %71, 0;\n"  // p = (%71 != 0), where %71 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n256k64.f16.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%63: accumulators d00..d63
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      "{%64,  %65,  %66,  %67},"  // a00..a03
+      " %68,"  // desc_b
+      " %69, %70,"  // e, spsel
+      " p,    %72,  %73;\n"  // p (derived from scale_D), scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write operands %0..%63
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs begin here: %64..%67
+         "l"(desc_b),  // %68 (64-bit operand)
+         "r"(e), "n"(int32_t(spsel)),  // %69, %70
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %71, %72, %73 -- order must stay in sync with the %N placeholders above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x64 TN F32+=E4M3*E5M2 -- SS form: A and B are each passed as one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded as an immediate ("n") operand, directly after `e`
+>
+struct GMMA_64x256x64_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the accumulator references in place
+  using ARegisters = uint64_t[1];  // corresponds to desc_a of fma()
+  using ERegisters = uint32_t[1];  // corresponds to `e` of fma()
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = float[128];  // corresponds to d000..d127 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A descriptor, read-only
+      uint64_t const& desc_b,  // B descriptor, read-only
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d127: accumulators, both read and written by the asm ("+f")
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      float         & d124, float         & d125, float         & d126, float         & d127,
+      uint32_t const& e,  // placed right after the B descriptor in the instruction; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; predicate p below is set to (scale_D != 0)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this { } asm block
+      "setp.ne.b32 p, %132, 0;\n"  // p = (%132 != 0), where %132 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n256k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%127: accumulators d000..d127
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      " %128,"  // desc_a
+      " %129,"  // desc_b
+      " %130, %131,"  // e, spsel
+      " p,    %133, %134;\n"  // p (derived from scale_D), scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // read-write operands %0..%127
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
+        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
+      :  "l"(desc_a),  // inputs begin here: %128 (64-bit operand)
+         "l"(desc_b),  // %129 (64-bit operand)
+         "r"(e), "n"(int32_t(spsel)),  // %130, %131
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %132, %133, %134 -- order must stay in sync with the %N placeholders above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x64 TN F32+=E4M3*E5M2 -- RS form: A is passed as four 32-bit register operands, B as one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded as an immediate ("n") operand, directly after `e`
+>
+struct GMMA_64x256x64_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the accumulator references in place
+  using ARegisters = uint32_t[4];  // corresponds to a000..a003 of fma()
+  using ERegisters = uint32_t[1];  // corresponds to `e` of fma()
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = float[128];  // corresponds to d000..d127 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A operands, read-only
+      uint64_t const& desc_b,  // B descriptor, read-only
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d127: accumulators, both read and written by the asm ("+f")
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      float         & d124, float         & d125, float         & d126, float         & d127,
+      uint32_t const& e,  // placed right after the B descriptor in the instruction; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; predicate p below is set to (scale_D != 0)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this { } asm block
+      "setp.ne.b32 p, %135, 0;\n"  // p = (%135 != 0), where %135 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n256k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%127: accumulators d000..d127
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      "{%128, %129, %130, %131},"  // a000..a003
+      " %132,"  // desc_b
+      " %133, %134,"  // e, spsel
+      " p,    %136, %137;\n"  // p (derived from scale_D), scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // read-write operands %0..%127
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
+        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // inputs begin here: %128..%131
+         "l"(desc_b),  // %132 (64-bit operand)
+         "r"(e), "n"(int32_t(spsel)),  // %133, %134
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %135, %136, %137 -- order must stay in sync with the %N placeholders above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x64 TN F16+=E5M2*E4M3 -- SS form: A and B are each passed as one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded as an immediate ("n") operand, directly after `e`
+>
+struct GMMA_64x8x64_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the accumulator references in place
+  using ARegisters = uint64_t[1];  // corresponds to desc_a of fma()
+  using ERegisters = uint32_t[1];  // corresponds to `e` of fma()
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = uint32_t[2];  // corresponds to d0..d1 of fma(); NOTE(review): presumably two packed f16 per register -- confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A descriptor, read-only
+      uint64_t const& desc_b,  // B descriptor, read-only
+      uint32_t      & d0, uint32_t      & d1,  // accumulators, both read and written by the asm ("+r")
+      uint32_t const& e,  // placed right after the B descriptor in the instruction; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; predicate p below is set to (scale_D != 0)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this { } asm block
+      "setp.ne.b32 p, %6, 0;\n"  // p = (%6 != 0), where %6 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n8k64.f16.e5m2.e4m3 "
+      "{%0, %1},"  // accumulators d0..d1
+      " %2,"  // desc_a
+      " %3,"  // desc_b
+      " %4, %5,"  // e, spsel
+      " p,  %7, %8;\n"  // p (derived from scale_D), scaleA, scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1)  // read-write operands %0..%1
+      :  "l"(desc_a),  // inputs begin here: %2 (64-bit operand)
+         "l"(desc_b),  // %3 (64-bit operand)
+         "r"(e), "n"(int32_t(spsel)),  // %4, %5
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %6, %7, %8 -- order must stay in sync with the %N placeholders above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x64 TN F16+=E5M2*E4M3 -- RS form: A is passed as four 32-bit register operands, B as one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded as an immediate ("n") operand, directly after `e`
+>
+struct GMMA_64x8x64_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the accumulator references in place
+  using ARegisters = uint32_t[4];  // corresponds to a0..a3 of fma()
+  using ERegisters = uint32_t[1];  // corresponds to `e` of fma()
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = uint32_t[2];  // corresponds to d0..d1 of fma(); NOTE(review): presumably two packed f16 per register -- confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,  // A operands, read-only
+      uint64_t const& desc_b,  // B descriptor, read-only
+      uint32_t      & d0, uint32_t      & d1,  // accumulators, both read and written by the asm ("+r")
+      uint32_t const& e,  // placed right after the B descriptor in the instruction; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; predicate p below is set to (scale_D != 0)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this { } asm block
+      "setp.ne.b32 p, %9, 0;\n"  // p = (%9 != 0), where %9 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n8k64.f16.e5m2.e4m3 "
+      "{%0,  %1},"  // accumulators d0..d1
+      "{%2,  %3,  %4,  %5},"  // a0..a3
+      " %6,"  // desc_b
+      " %7, %8,"  // e, spsel
+      " p,   %10, %11;\n"  // p (derived from scale_D), scaleA, scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1)  // read-write operands %0..%1
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),  // inputs begin here: %2..%5
+         "l"(desc_b),  // %6 (64-bit operand)
+         "r"(e), "n"(int32_t(spsel)),  // %7, %8
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %9, %10, %11 -- order must stay in sync with the %N placeholders above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x64 TN F32+=E5M2*E4M3 -- SS form: A and B are each passed as one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded as an immediate ("n") operand, directly after `e`
+>
+struct GMMA_64x8x64_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the accumulator references in place
+  using ARegisters = uint64_t[1];  // corresponds to desc_a of fma()
+  using ERegisters = uint32_t[1];  // corresponds to `e` of fma()
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = float[4];  // corresponds to d0..d3 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A descriptor, read-only
+      uint64_t const& desc_b,  // B descriptor, read-only
+      float         & d0, float         & d1, float         & d2, float         & d3,  // accumulators, both read and written by the asm ("+f")
+      uint32_t const& e,  // placed right after the B descriptor in the instruction; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; predicate p below is set to (scale_D != 0)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this { } asm block
+      "setp.ne.b32 p, %8, 0;\n"  // p = (%8 != 0), where %8 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n8k64.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3},"  // accumulators d0..d3
+      " %4,"  // desc_a
+      " %5,"  // desc_b
+      " %6, %7,"  // e, spsel
+      " p,   %9,  %10;\n"  // p (derived from scale_D), scaleA, scaleB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)  // read-write operands %0..%3
+      :  "l"(desc_a),  // inputs begin here: %4 (64-bit operand)
+         "l"(desc_b),  // %5 (64-bit operand)
+         "r"(e), "n"(int32_t(spsel)),  // %6, %7
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %8, %9, %10 -- order must stay in sync with the %N placeholders above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x64 TN F32+=E5M2*E4M3 -- RS form: A is passed as four 32-bit register operands, B as one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded as an immediate ("n") operand, directly after `e`
+>
+struct GMMA_64x8x64_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the accumulator references in place
+  using ARegisters = uint32_t[4];  // corresponds to a0..a3 of fma()
+  using ERegisters = uint32_t[1];  // corresponds to `e` of fma()
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = float[4];  // corresponds to d0..d3 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,  // A operands, read-only
+      uint64_t const& desc_b,  // B descriptor, read-only
+      float         & d0, float         & d1, float         & d2, float         & d3,  // accumulators, both read and written by the asm ("+f")
+      uint32_t const& e,  // placed right after the B descriptor in the instruction; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; predicate p below is set to (scale_D != 0)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this { } asm block
+      "setp.ne.b32 p, %11, 0;\n"  // p = (%11 != 0), where %11 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n8k64.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3},"  // accumulators d0..d3
+      "{%4,  %5,  %6,  %7},"  // a0..a3
+      " %8,"  // desc_b
+      " %9, %10,"  // e, spsel
+      " p,   %12, %13;\n"  // p (derived from scale_D), scaleA, scaleB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)  // read-write operands %0..%3
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),  // inputs begin here: %4..%7
+         "l"(desc_b),  // %8 (64-bit operand)
+         "r"(e), "n"(int32_t(spsel)),  // %9, %10
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %11, %12, %13 -- order must stay in sync with the %N placeholders above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x64 TN F16+=E5M2*E4M3 -- SS form: A and B are each passed as one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded as an immediate ("n") operand, directly after `e`
+>
+struct GMMA_64x16x64_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the accumulator references in place
+  using ARegisters = uint64_t[1];  // corresponds to desc_a of fma()
+  using ERegisters = uint32_t[1];  // corresponds to `e` of fma()
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = uint32_t[4];  // corresponds to d0..d3 of fma(); NOTE(review): presumably two packed f16 per register -- confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A descriptor, read-only
+      uint64_t const& desc_b,  // B descriptor, read-only
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,  // accumulators, both read and written by the asm ("+r")
+      uint32_t const& e,  // placed right after the B descriptor in the instruction; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; predicate p below is set to (scale_D != 0)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this { } asm block
+      "setp.ne.b32 p, %8, 0;\n"  // p = (%8 != 0), where %8 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n16k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3},"  // accumulators d0..d3
+      " %4,"  // desc_a
+      " %5,"  // desc_b
+      " %6, %7,"  // e, spsel
+      " p,   %9,  %10;\n"  // p (derived from scale_D), scaleA, scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)  // read-write operands %0..%3
+      :  "l"(desc_a),  // inputs begin here: %4 (64-bit operand)
+         "l"(desc_b),  // %5 (64-bit operand)
+         "r"(e), "n"(int32_t(spsel)),  // %6, %7
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %8, %9, %10 -- order must stay in sync with the %N placeholders above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x64 TN F16+=E5M2*E4M3 -- RS form: A is passed as four 32-bit register operands, B as one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded as an immediate ("n") operand, directly after `e`
+>
+struct GMMA_64x16x64_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the accumulator references in place
+  using ARegisters = uint32_t[4];  // corresponds to a0..a3 of fma()
+  using ERegisters = uint32_t[1];  // corresponds to `e` of fma()
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = uint32_t[4];  // corresponds to d0..d3 of fma(); NOTE(review): presumably two packed f16 per register -- confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,  // A operands, read-only
+      uint64_t const& desc_b,  // B descriptor, read-only
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,  // accumulators, both read and written by the asm ("+r")
+      uint32_t const& e,  // placed right after the B descriptor in the instruction; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; predicate p below is set to (scale_D != 0)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this { } asm block
+      "setp.ne.b32 p, %11, 0;\n"  // p = (%11 != 0), where %11 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n16k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3},"  // accumulators d0..d3
+      "{%4,  %5,  %6,  %7},"  // a0..a3
+      " %8,"  // desc_b
+      " %9, %10,"  // e, spsel
+      " p,   %12, %13;\n"  // p (derived from scale_D), scaleA, scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)  // read-write operands %0..%3
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),  // inputs begin here: %4..%7
+         "l"(desc_b),  // %8 (64-bit operand)
+         "r"(e), "n"(int32_t(spsel)),  // %9, %10
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %11, %12, %13 -- order must stay in sync with the %N placeholders above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x64 TN F32+=E5M2*E4M3 -- SS form: A and B are each passed as one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded as an immediate ("n") operand, directly after `e`
+>
+struct GMMA_64x16x64_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the accumulator references in place
+  using ARegisters = uint64_t[1];  // corresponds to desc_a of fma()
+  using ERegisters = uint32_t[1];  // corresponds to `e` of fma()
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = float[8];  // corresponds to d0..d7 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A descriptor, read-only
+      uint64_t const& desc_b,  // B descriptor, read-only
+      float         & d0, float         & d1, float         & d2, float         & d3,  // d0..d7: accumulators, both read and written by the asm ("+f")
+      float         & d4, float         & d5, float         & d6, float         & d7,
+      uint32_t const& e,  // placed right after the B descriptor in the instruction; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; predicate p below is set to (scale_D != 0)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this { } asm block
+      "setp.ne.b32 p, %12, 0;\n"  // p = (%12 != 0), where %12 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n16k64.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // accumulators d0..d7
+      " %8,"  // desc_a
+      " %9,"  // desc_b
+      " %10, %11,"  // e, spsel
+      " p,   %13, %14;\n"  // p (derived from scale_D), scaleA, scaleB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),  // read-write operands %0..%7
+        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
+      :  "l"(desc_a),  // inputs begin here: %8 (64-bit operand)
+         "l"(desc_b),  // %9 (64-bit operand)
+         "r"(e), "n"(int32_t(spsel)),  // %10, %11
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %12, %13, %14 -- order must stay in sync with the %N placeholders above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x64 TN F32+=E5M2*E4M3 -- RS form: A is passed as four 32-bit register operands, B as one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded as an immediate ("n") operand, directly after `e`
+>
+struct GMMA_64x16x64_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the accumulator references in place
+  using ARegisters = uint32_t[4];  // corresponds to a0..a3 of fma()
+  using ERegisters = uint32_t[1];  // corresponds to `e` of fma()
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = float[8];  // corresponds to d0..d7 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,  // A operands, read-only
+      uint64_t const& desc_b,  // B descriptor, read-only
+      float         & d0, float         & d1, float         & d2, float         & d3,  // d0..d7: accumulators, both read and written by the asm ("+f")
+      float         & d4, float         & d5, float         & d6, float         & d7,
+      uint32_t const& e,  // placed right after the B descriptor in the instruction; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; predicate p below is set to (scale_D != 0)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this { } asm block
+      "setp.ne.b32 p, %15, 0;\n"  // p = (%15 != 0), where %15 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n16k64.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // accumulators d0..d7
+      "{%8,  %9,  %10, %11},"  // a0..a3
+      " %12,"  // desc_b
+      " %13, %14,"  // e, spsel
+      " p,   %16, %17;\n"  // p (derived from scale_D), scaleA, scaleB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),  // read-write operands %0..%7
+        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),  // inputs begin here: %8..%11
+         "l"(desc_b),  // %12 (64-bit operand)
+         "r"(e), "n"(int32_t(spsel)),  // %13, %14
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %15, %16, %17 -- order must stay in sync with the %N placeholders above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x64 TN F16+=E5M2*E4M3 -- SS form: A and B are each passed as one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded as an immediate ("n") operand, directly after `e`
+>
+struct GMMA_64x32x64_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the accumulator references in place
+  using ARegisters = uint64_t[1];  // corresponds to desc_a of fma()
+  using ERegisters = uint32_t[1];  // corresponds to `e` of fma()
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = uint32_t[8];  // corresponds to d0..d7 of fma(); NOTE(review): presumably two packed f16 per register -- confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A descriptor, read-only
+      uint64_t const& desc_b,  // B descriptor, read-only
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,  // d0..d7: accumulators, both read and written by the asm ("+r")
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      uint32_t const& e,  // placed right after the B descriptor in the instruction; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; predicate p below is set to (scale_D != 0)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this { } asm block
+      "setp.ne.b32 p, %12, 0;\n"  // p = (%12 != 0), where %12 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n32k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // accumulators d0..d7
+      " %8,"  // desc_a
+      " %9,"  // desc_b
+      " %10, %11,"  // e, spsel
+      " p,   %13, %14;\n"  // p (derived from scale_D), scaleA, scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),  // read-write operands %0..%7
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "l"(desc_a),  // inputs begin here: %8 (64-bit operand)
+         "l"(desc_b),  // %9 (64-bit operand)
+         "r"(e), "n"(int32_t(spsel)),  // %10, %11
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %12, %13, %14 -- order must stay in sync with the %N placeholders above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x64 TN F16+=E5M2*E4M3 -- RS form: A is passed as four 32-bit register operands, B as one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded as an immediate ("n") operand, directly after `e`
+>
+struct GMMA_64x32x64_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the accumulator references in place
+  using ARegisters = uint32_t[4];  // corresponds to a0..a3 of fma()
+  using ERegisters = uint32_t[1];  // corresponds to `e` of fma()
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = uint32_t[8];  // corresponds to d0..d7 of fma(); NOTE(review): presumably two packed f16 per register -- confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,  // A operands, read-only
+      uint64_t const& desc_b,  // B descriptor, read-only
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,  // d0..d7: accumulators, both read and written by the asm ("+r")
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      uint32_t const& e,  // placed right after the B descriptor in the instruction; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; predicate p below is set to (scale_D != 0)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this { } asm block
+      "setp.ne.b32 p, %15, 0;\n"  // p = (%15 != 0), where %15 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n32k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // accumulators d0..d7
+      "{%8,  %9,  %10, %11},"  // a0..a3
+      " %12,"  // desc_b
+      " %13, %14,"  // e, spsel
+      " p,   %16, %17;\n"  // p (derived from scale_D), scaleA, scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),  // read-write operands %0..%7
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),  // inputs begin here: %8..%11
+         "l"(desc_b),  // %12 (64-bit operand)
+         "r"(e), "n"(int32_t(spsel)),  // %13, %14
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %15, %16, %17 -- order must stay in sync with the %N placeholders above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x64 TN F32+=E5M2*E4M3 (SS form: A and B are each passed as one 64-bit descriptor; e is the extra 32-bit operand of the .sp instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x32x64_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[16];  // d00..d15, passed to the asm as in/out ("+f") operands
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %20, 0;\n"  // p = (scale_D != 0); %20 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n32k64.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      " %16,"
+      " %17,"
+      " %18, %19,"
+      " p,   %21, %22;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x64 TN F32+=E5M2*E4M3 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor; e is the extra 32-bit operand of the .sp instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x32x64_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[16];  // d00..d15, passed to the asm as in/out ("+f") operands
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %23, 0;\n"  // p = (scale_D != 0); %23 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n32k64.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      "{%16, %17, %18, %19},"
+      " %20,"
+      " %21, %22,"
+      " p,   %24, %25;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x64 TN F16+=E5M2*E4M3 (SS form: A and B are each passed as one 64-bit descriptor; e is the extra 32-bit operand of the .sp instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x64x64_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[16];  // d00..d15, passed to the asm as in/out ("+r") operands
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %20, 0;\n"  // p = (scale_D != 0); %20 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n64k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      " %16,"
+      " %17,"
+      " %18, %19,"
+      " p,   %21, %22;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x64 TN F16+=E5M2*E4M3 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor; e is the extra 32-bit operand of the .sp instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x64x64_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[16];  // d00..d15, passed to the asm as in/out ("+r") operands
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %23, 0;\n"  // p = (scale_D != 0); %23 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n64k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
+      "{%16, %17, %18, %19},"
+      " %20,"
+      " %21, %22,"
+      " p,   %24, %25;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x64 TN F32+=E5M2*E4M3 (SS form: A and B are each passed as one 64-bit descriptor; e is the extra 32-bit operand of the .sp instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x64x64_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[32];  // d00..d31, passed to the asm as in/out ("+f") operands
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %36, 0;\n"  // p = (scale_D != 0); %36 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n64k64.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      " %32,"
+      " %33,"
+      " %34, %35,"
+      " p,   %37, %38;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x64 TN F32+=E5M2*E4M3 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor; e is the extra 32-bit operand of the .sp instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x64x64_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[32];  // d00..d31, passed to the asm as in/out ("+f") operands
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %39, 0;\n"  // p = (scale_D != 0); %39 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n64k64.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      "{%32, %33, %34, %35},"
+      " %36,"
+      " %37, %38,"
+      " p,   %40, %41;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x64 TN F16+=E5M2*E4M3 (SS form: A and B are each passed as one 64-bit descriptor; e is the extra 32-bit operand of the .sp instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x96x64_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[24];  // d00..d23, passed to the asm as in/out ("+r") operands
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %28, 0;\n"  // p = (scale_D != 0); %28 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n96k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      " %24,"
+      " %25,"
+      " %26, %27,"
+      " p,   %29, %30;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x64 TN F16+=E5M2*E4M3 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor; e is the extra 32-bit operand of the .sp instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x96x64_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[24];  // d00..d23, passed to the asm as in/out ("+r") operands
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %31, 0;\n"  // p = (scale_D != 0); %31 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n96k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      "{%24, %25, %26, %27},"
+      " %28,"
+      " %29, %30,"
+      " p,   %32, %33;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x64 TN F32+=E5M2*E4M3 (SS form: A and B are each passed as one 64-bit descriptor; e is the extra 32-bit operand of the .sp instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x96x64_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[48];  // d00..d47, passed to the asm as in/out ("+f") operands
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %52, 0;\n"  // p = (scale_D != 0); %52 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n96k64.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"
+      " %48,"
+      " %49,"
+      " %50, %51,"
+      " p,   %53, %54;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x64 TN F32+=E5M2*E4M3 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor; e is the extra 32-bit operand of the .sp instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x96x64_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[48];  // d00..d47, passed to the asm as in/out ("+f") operands
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %55, 0;\n"  // p = (scale_D != 0); %55 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n96k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
+      "{%48,  %49,  %50,  %51},"
+      " %52,"
+      " %53, %54,"
+      " p,    %56,  %57;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x64 TN F16+=E5M2*E4M3 (SS form: A and B are each passed as one 64-bit descriptor; e is the extra 32-bit operand of the .sp instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x128x64_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[32];  // d00..d31, passed to the asm as in/out ("+r") operands
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %36, 0;\n"  // p = (scale_D != 0); %36 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n128k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      " %32,"
+      " %33,"
+      " %34, %35,"
+      " p,   %37, %38;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x64 TN F16+=E5M2*E4M3 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor; e is the extra 32-bit operand of the .sp instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x128x64_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[32];  // d00..d31, passed to the asm as in/out ("+r") operands
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %39, 0;\n"  // p = (scale_D != 0); %39 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n128k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      "{%32, %33, %34, %35},"
+      " %36,"
+      " %37, %38,"
+      " p,   %40, %41;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x64 TN F32+=E5M2*E4M3 -- SS variant: A and B are both passed as 64-bit descriptors.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,    // handed to the instruction as an immediate ("n", operand %69)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,    // handed to the instruction as an immediate ("n", operand %70)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // handed to the instruction as an immediate ("n", operand %67)
+>
+struct GMMA_64x128x64_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably means D aliases C; fma() only takes in/out d** registers - confirm
+  using ARegisters = uint64_t[1];   // desc_a in fma()
+  using ERegisters = uint32_t[1];   // e in fma()
+  using BRegisters = uint64_t[1];   // desc_b in fma()
+  using CRegisters = float[64];     // d00..d63 in fma()
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for A (input %64)
+      uint64_t const& desc_b,  // 64-bit descriptor for B (input %65)
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d63: float accumulators, read and written in place
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word (placed right after desc_b in the instruction) - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared with 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined outside this chunk), given this line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"           // scratch predicate register
+      "setp.ne.b32 p, %68, 0;\n"  // p = (scale_D != 0); %68 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n128k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%63: accumulators d00..d63
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      " %64,"                 // desc_a
+      " %65,"                 // desc_b
+      " %66, %67,"            // e, spsel
+      " p,    %69,  %70;\n"   // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // in/out operands %0-%63
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
+      :  "l"(desc_a),                  // input %64
+         "l"(desc_b),                  // input %65
+         "r"(e), "n"(int32_t(spsel)),  // inputs %66, %67
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %68, %69, %70
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x64 TN F32+=E5M2*E4M3 -- RS variant: A is passed as four 32-bit registers, B as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,    // handed to the instruction as an immediate ("n", operand %72)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,    // handed to the instruction as an immediate ("n", operand %73)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // handed to the instruction as an immediate ("n", operand %70)
+>
+struct GMMA_64x128x64_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably means D aliases C; fma() only takes in/out d** registers - confirm
+  using ARegisters = uint32_t[4];   // a00..a03 in fma()
+  using ERegisters = uint32_t[1];   // e in fma()
+  using BRegisters = uint64_t[1];   // desc_b in fma()
+  using CRegisters = float[64];     // d00..d63 in fma()
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand registers (inputs %64-%67)
+      uint64_t const& desc_b,  // 64-bit descriptor for B (input %68)
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d63: float accumulators, read and written in place
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word (placed right after desc_b in the instruction) - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared with 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined outside this chunk), given this line number and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"           // scratch predicate register
+      "setp.ne.b32 p, %71, 0;\n"  // p = (scale_D != 0); %71 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n128k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%63: accumulators d00..d63
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      "{%64,  %65,  %66,  %67},"  // a00..a03
+      " %68,"                     // desc_b
+      " %69, %70,"                // e, spsel
+      " p,    %72,  %73;\n"       // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // in/out operands %0-%63
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs %64-%67
+         "l"(desc_b),                                // input %68
+         "r"(e), "n"(int32_t(spsel)),                // inputs %69, %70
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %71, %72, %73
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x64 TN F16+=E5M2*E4M3 -- SS variant: A and B are both passed as 64-bit descriptors.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,    // handed to the instruction as an immediate ("n", operand %53)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,    // handed to the instruction as an immediate ("n", operand %54)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // handed to the instruction as an immediate ("n", operand %51)
+>
+struct GMMA_64x192x64_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably means D aliases C; fma() only takes in/out d** registers - confirm
+  using ARegisters = uint64_t[1];   // desc_a in fma()
+  using ERegisters = uint32_t[1];   // e in fma()
+  using BRegisters = uint64_t[1];   // desc_b in fma()
+  using CRegisters = uint32_t[48];  // d00..d47 in fma()
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for A (input %48)
+      uint64_t const& desc_b,  // 64-bit descriptor for B (input %49)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d47: accumulators, read and written in place
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word (placed right after desc_b in the instruction) - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared with 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined outside this chunk), given this line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"           // scratch predicate register
+      "setp.ne.b32 p, %52, 0;\n"  // p = (scale_D != 0); %52 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n192k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%47: accumulators d00..d47
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"
+      " %48,"               // desc_a
+      " %49,"               // desc_b
+      " %50, %51,"          // e, spsel
+      " p,   %53, %54;\n"   // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // in/out operands %0-%47
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "l"(desc_a),                  // input %48
+         "l"(desc_b),                  // input %49
+         "r"(e), "n"(int32_t(spsel)),  // inputs %50, %51
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %52, %53, %54
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x64 TN F16+=E5M2*E4M3 -- RS variant: A is passed as four 32-bit registers, B as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,    // handed to the instruction as an immediate ("n", operand %56)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,    // handed to the instruction as an immediate ("n", operand %57)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // handed to the instruction as an immediate ("n", operand %54)
+>
+struct GMMA_64x192x64_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably means D aliases C; fma() only takes in/out d** registers - confirm
+  using ARegisters = uint32_t[4];   // a00..a03 in fma()
+  using ERegisters = uint32_t[1];   // e in fma()
+  using BRegisters = uint64_t[1];   // desc_b in fma()
+  using CRegisters = uint32_t[48];  // d00..d47 in fma()
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand registers (inputs %48-%51)
+      uint64_t const& desc_b,  // 64-bit descriptor for B (input %52)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d47: accumulators, read and written in place
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word (placed right after desc_b in the instruction) - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared with 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined outside this chunk), given this line number and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"           // scratch predicate register
+      "setp.ne.b32 p, %55, 0;\n"  // p = (scale_D != 0); %55 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n192k64.f16.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%47: accumulators d00..d47
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
+      "{%48,  %49,  %50,  %51},"  // a00..a03
+      " %52,"                     // desc_b
+      " %53, %54,"                // e, spsel
+      " p,    %56,  %57;\n"       // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // in/out operands %0-%47
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs %48-%51
+         "l"(desc_b),                                // input %52
+         "r"(e), "n"(int32_t(spsel)),                // inputs %53, %54
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %55, %56, %57
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x64 TN F32+=E5M2*E4M3 -- SS variant: A and B are both passed as 64-bit descriptors.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,    // handed to the instruction as an immediate ("n", operand %101)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,    // handed to the instruction as an immediate ("n", operand %102)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // handed to the instruction as an immediate ("n", operand %99)
+>
+struct GMMA_64x192x64_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably means D aliases C; fma() only takes in/out d** registers - confirm
+  using ARegisters = uint64_t[1];   // desc_a in fma()
+  using ERegisters = uint32_t[1];   // e in fma()
+  using BRegisters = uint64_t[1];   // desc_b in fma()
+  using CRegisters = float[96];     // d00..d95 in fma()
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for A (input %96)
+      uint64_t const& desc_b,  // 64-bit descriptor for B (input %97)
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d95: float accumulators, read and written in place
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      float         & d92, float         & d93, float         & d94, float         & d95,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word (placed right after desc_b in the instruction) - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared with 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined outside this chunk), given this line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"            // scratch predicate register
+      "setp.ne.b32 p, %100, 0;\n"  // p = (scale_D != 0); %100 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n192k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%95: accumulators d00..d95
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      " %96,"                 // desc_a
+      " %97,"                 // desc_b
+      " %98, %99,"            // e, spsel
+      " p,    %101, %102;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // in/out operands %0-%95
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
+        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
+      :  "l"(desc_a),                  // input %96
+         "l"(desc_b),                  // input %97
+         "r"(e), "n"(int32_t(spsel)),  // inputs %98, %99
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %100, %101, %102
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x64 TN F32+=E5M2*E4M3 -- RS variant: A is passed as four 32-bit registers, B as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,    // handed to the instruction as an immediate ("n", operand %104)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,    // handed to the instruction as an immediate ("n", operand %105)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // handed to the instruction as an immediate ("n", operand %102)
+>
+struct GMMA_64x192x64_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably means D aliases C; fma() only takes in/out d** registers - confirm
+  using ARegisters = uint32_t[4];   // a00..a03 in fma()
+  using ERegisters = uint32_t[1];   // e in fma()
+  using BRegisters = uint64_t[1];   // desc_b in fma()
+  using CRegisters = float[96];     // d00..d95 in fma()
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand registers (inputs %96-%99)
+      uint64_t const& desc_b,  // 64-bit descriptor for B (input %100)
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d95: float accumulators, read and written in place
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      float         & d92, float         & d93, float         & d94, float         & d95,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word (placed right after desc_b in the instruction) - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared with 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined outside this chunk), given this line number and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"            // scratch predicate register
+      "setp.ne.b32 p, %103, 0;\n"  // p = (scale_D != 0); %103 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n192k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%95: accumulators d00..d95
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      "{%96,  %97,  %98,  %99},"  // a00..a03
+      " %100,"                    // desc_b
+      " %101, %102,"              // e, spsel
+      " p,    %104, %105;\n"      // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // in/out operands %0-%95
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
+        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs %96-%99
+         "l"(desc_b),                                // input %100
+         "r"(e), "n"(int32_t(spsel)),                // inputs %101, %102
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %103, %104, %105
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x64 TN F16+=E5M2*E4M3 -- SS variant: A and B are both passed as 64-bit descriptors.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,    // handed to the instruction as an immediate ("n", operand %69)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,    // handed to the instruction as an immediate ("n", operand %70)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // handed to the instruction as an immediate ("n", operand %67)
+>
+struct GMMA_64x256x64_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably means D aliases C; fma() only takes in/out d** registers - confirm
+  using ARegisters = uint64_t[1];   // desc_a in fma()
+  using ERegisters = uint32_t[1];   // e in fma()
+  using BRegisters = uint64_t[1];   // desc_b in fma()
+  using CRegisters = uint32_t[64];  // d00..d63 in fma()
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for A (input %64)
+      uint64_t const& desc_b,  // 64-bit descriptor for B (input %65)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d63: accumulators, read and written in place
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word (placed right after desc_b in the instruction) - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared with 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined outside this chunk), given this line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"           // scratch predicate register
+      "setp.ne.b32 p, %68, 0;\n"  // p = (scale_D != 0); %68 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n256k64.f16.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%63: accumulators d00..d63
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      " %64,"                 // desc_a
+      " %65,"                 // desc_b
+      " %66, %67,"            // e, spsel
+      " p,    %69,  %70;\n"   // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // in/out operands %0-%63
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "l"(desc_a),                  // input %64
+         "l"(desc_b),                  // input %65
+         "r"(e), "n"(int32_t(spsel)),  // inputs %66, %67
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %68, %69, %70
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x64 TN F16+=E5M2*E4M3 (RS variant: A is passed as four 32-bit register operands, B as one 64-bit descriptor operand)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand, listed right after e
+>
+struct GMMA_64x256x64_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // same count/type as fma()'s a00..a03
+  using ERegisters = uint32_t[1];  // same count/type as fma()'s e
+  using BRegisters = uint64_t[1];  // same count/type as fma()'s desc_b
+  using CRegisters = uint32_t[64];  // same count/type as fma()'s d00..d63
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand words, input only
+      uint64_t const& desc_b,  // B operand, a single 64-bit value
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d63 are read and written by the asm ("+r")
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t const& e,  // 32-bit input listed just before spsel; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %71, 0;\n"  // p = (scale_D != 0); %71 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n256k64.f16.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%63 = d00..d63
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      "{%64,  %65,  %66,  %67},"  // a00..a03
+      " %68,"  // desc_b
+      " %69, %70,"  // e, spsel
+      " p,    %72,  %73;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%63, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // operands %64..%67
+         "l"(desc_b),  // operand %68
+         "r"(e), "n"(int32_t(spsel)),  // operands %69, %70
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands %71, %72, %73
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x64 TN F32+=E5M2*E4M3 (SS variant: A and B are each passed as one 64-bit descriptor operand)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand, listed right after e
+>
+struct GMMA_64x256x64_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // same count/type as fma()'s desc_a
+  using ERegisters = uint32_t[1];  // same count/type as fma()'s e
+  using BRegisters = uint64_t[1];  // same count/type as fma()'s desc_b
+  using CRegisters = float[128];  // same count/type as fma()'s d000..d127
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand, a single 64-bit value
+      uint64_t const& desc_b,  // B operand, a single 64-bit value
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d127 are read and written by the asm ("+f")
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      float         & d124, float         & d125, float         & d126, float         & d127,
+      uint32_t const& e,  // 32-bit input listed just before spsel; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %132, 0;\n"  // p = (scale_D != 0); %132 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n256k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%127 = d000..d127
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      " %128,"  // desc_a
+      " %129,"  // desc_b
+      " %130, %131,"  // e, spsel
+      " p,    %133, %134;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%127, read-write
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
+        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
+      :  "l"(desc_a),  // operand %128
+         "l"(desc_b),  // operand %129
+         "r"(e), "n"(int32_t(spsel)),  // operands %130, %131
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands %132, %133, %134
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x64 TN F32+=E5M2*E4M3 (RS variant: A is passed as four 32-bit register operands, B as one 64-bit descriptor operand)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand, listed right after e
+>
+struct GMMA_64x256x64_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // same count/type as fma()'s a000..a003
+  using ERegisters = uint32_t[1];  // same count/type as fma()'s e
+  using BRegisters = uint64_t[1];  // same count/type as fma()'s desc_b
+  using CRegisters = float[128];  // same count/type as fma()'s d000..d127
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A operand words, input only
+      uint64_t const& desc_b,  // B operand, a single 64-bit value
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d127 are read and written by the asm ("+f")
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      float         & d124, float         & d125, float         & d126, float         & d127,
+      uint32_t const& e,  // 32-bit input listed just before spsel; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %135, 0;\n"  // p = (scale_D != 0); %135 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n256k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%127 = d000..d127
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      "{%128, %129, %130, %131},"  // a000..a003
+      " %132,"  // desc_b
+      " %133, %134,"  // e, spsel
+      " p,    %136, %137;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%127, read-write
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
+        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // operands %128..%131
+         "l"(desc_b),  // operand %132
+         "r"(e), "n"(int32_t(spsel)),  // operands %133, %134
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands %135, %136, %137
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x64 TN F16+=E5M2*E5M2 (SS variant: A and B are each passed as one 64-bit descriptor operand)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand, listed right after e
+>
+struct GMMA_64x8x64_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // same count/type as fma()'s desc_a
+  using ERegisters = uint32_t[1];  // same count/type as fma()'s e
+  using BRegisters = uint64_t[1];  // same count/type as fma()'s desc_b
+  using CRegisters = uint32_t[2];  // same count/type as fma()'s d0..d1
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand, a single 64-bit value
+      uint64_t const& desc_b,  // B operand, a single 64-bit value
+      uint32_t      & d0, uint32_t      & d1,  // read and written by the asm ("+r")
+      uint32_t const& e,  // 32-bit input listed just before spsel; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %6, 0;\n"  // p = (scale_D != 0); %6 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n8k64.f16.e5m2.e5m2 "
+      "{%0, %1},"  // d0, d1
+      " %2,"  // desc_a
+      " %3,"  // desc_b
+      " %4, %5,"  // e, spsel
+      " p,  %7, %8;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1)  // operands %0, %1, read-write
+      :  "l"(desc_a),  // operand %2
+         "l"(desc_b),  // operand %3
+         "r"(e), "n"(int32_t(spsel)),  // operands %4, %5
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands %6, %7, %8
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x64 TN F16+=E5M2*E5M2 (RS variant: A is passed as four 32-bit register operands, B as one 64-bit descriptor operand)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand, listed right after e
+>
+struct GMMA_64x8x64_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // same count/type as fma()'s a0..a3
+  using ERegisters = uint32_t[1];  // same count/type as fma()'s e
+  using BRegisters = uint64_t[1];  // same count/type as fma()'s desc_b
+  using CRegisters = uint32_t[2];  // same count/type as fma()'s d0..d1
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,  // A operand words, input only
+      uint64_t const& desc_b,  // B operand, a single 64-bit value
+      uint32_t      & d0, uint32_t      & d1,  // read and written by the asm ("+r")
+      uint32_t const& e,  // 32-bit input listed just before spsel; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %9, 0;\n"  // p = (scale_D != 0); %9 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n8k64.f16.e5m2.e5m2 "
+      "{%0,  %1},"  // d0, d1
+      "{%2,  %3,  %4,  %5},"  // a0..a3
+      " %6,"  // desc_b
+      " %7, %8,"  // e, spsel
+      " p,   %10, %11;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1)  // operands %0, %1, read-write
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),  // operands %2..%5
+         "l"(desc_b),  // operand %6
+         "r"(e), "n"(int32_t(spsel)),  // operands %7, %8
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands %9, %10, %11
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x64 TN F32+=E5M2*E5M2 (SS variant: A and B are each passed as one 64-bit descriptor operand)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand, listed right after e
+>
+struct GMMA_64x8x64_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // same count/type as fma()'s desc_a
+  using ERegisters = uint32_t[1];  // same count/type as fma()'s e
+  using BRegisters = uint64_t[1];  // same count/type as fma()'s desc_b
+  using CRegisters = float[4];  // same count/type as fma()'s d0..d3
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand, a single 64-bit value
+      uint64_t const& desc_b,  // B operand, a single 64-bit value
+      float         & d0, float         & d1, float         & d2, float         & d3,  // read and written by the asm ("+f")
+      uint32_t const& e,  // 32-bit input listed just before spsel; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %8, 0;\n"  // p = (scale_D != 0); %8 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n8k64.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3},"  // d0..d3
+      " %4,"  // desc_a
+      " %5,"  // desc_b
+      " %6, %7,"  // e, spsel
+      " p,   %9,  %10;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)  // operands %0..%3, read-write
+      :  "l"(desc_a),  // operand %4
+         "l"(desc_b),  // operand %5
+         "r"(e), "n"(int32_t(spsel)),  // operands %6, %7
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands %8, %9, %10
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x8x64 TN F32+=E5M2*E5M2 (RS variant: A is passed as four 32-bit register operands, B as one 64-bit descriptor operand)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand, listed right after e
+>
+struct GMMA_64x8x64_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // same count/type as fma()'s a0..a3
+  using ERegisters = uint32_t[1];  // same count/type as fma()'s e
+  using BRegisters = uint64_t[1];  // same count/type as fma()'s desc_b
+  using CRegisters = float[4];  // same count/type as fma()'s d0..d3
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,  // A operand words, input only
+      uint64_t const& desc_b,  // B operand, a single 64-bit value
+      float         & d0, float         & d1, float         & d2, float         & d3,  // read and written by the asm ("+f")
+      uint32_t const& e,  // 32-bit input listed just before spsel; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %11, 0;\n"  // p = (scale_D != 0); %11 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n8k64.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3},"  // d0..d3
+      "{%4,  %5,  %6,  %7},"  // a0..a3
+      " %8,"  // desc_b
+      " %9, %10,"  // e, spsel
+      " p,   %12, %13;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)  // operands %0..%3, read-write
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),  // operands %4..%7
+         "l"(desc_b),  // operand %8
+         "r"(e), "n"(int32_t(spsel)),  // operands %9, %10
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands %11, %12, %13
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x64 TN F16+=E5M2*E5M2 (SS variant: A and B are each passed as one 64-bit descriptor operand)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand, listed right after e
+>
+struct GMMA_64x16x64_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // same count/type as fma()'s desc_a
+  using ERegisters = uint32_t[1];  // same count/type as fma()'s e
+  using BRegisters = uint64_t[1];  // same count/type as fma()'s desc_b
+  using CRegisters = uint32_t[4];  // same count/type as fma()'s d0..d3
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand, a single 64-bit value
+      uint64_t const& desc_b,  // B operand, a single 64-bit value
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,  // read and written by the asm ("+r")
+      uint32_t const& e,  // 32-bit input listed just before spsel; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %8, 0;\n"  // p = (scale_D != 0); %8 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n16k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3},"  // d0..d3
+      " %4,"  // desc_a
+      " %5,"  // desc_b
+      " %6, %7,"  // e, spsel
+      " p,   %9,  %10;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)  // operands %0..%3, read-write
+      :  "l"(desc_a),  // operand %4
+         "l"(desc_b),  // operand %5
+         "r"(e), "n"(int32_t(spsel)),  // operands %6, %7
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands %8, %9, %10
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x64 TN F16+=E5M2*E5M2 (RS variant: A is passed as four 32-bit register operands, B as one 64-bit descriptor operand)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand, listed right after e
+>
+struct GMMA_64x16x64_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // same count/type as fma()'s a0..a3
+  using ERegisters = uint32_t[1];  // same count/type as fma()'s e
+  using BRegisters = uint64_t[1];  // same count/type as fma()'s desc_b
+  using CRegisters = uint32_t[4];  // same count/type as fma()'s d0..d3
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,  // A operand words, input only
+      uint64_t const& desc_b,  // B operand, a single 64-bit value
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,  // read and written by the asm ("+r")
+      uint32_t const& e,  // 32-bit input listed just before spsel; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %11, 0;\n"  // p = (scale_D != 0); %11 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n16k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3},"  // d0..d3
+      "{%4,  %5,  %6,  %7},"  // a0..a3
+      " %8,"  // desc_b
+      " %9, %10,"  // e, spsel
+      " p,   %12, %13;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)  // operands %0..%3, read-write
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),  // operands %4..%7
+         "l"(desc_b),  // operand %8
+         "r"(e), "n"(int32_t(spsel)),  // operands %9, %10
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands %11, %12, %13
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x64 TN F32+=E5M2*E5M2 (SS variant: A and B are each passed as one 64-bit descriptor operand)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand, listed right after e
+>
+struct GMMA_64x16x64_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // same count/type as fma()'s desc_a
+  using ERegisters = uint32_t[1];  // same count/type as fma()'s e
+  using BRegisters = uint64_t[1];  // same count/type as fma()'s desc_b
+  using CRegisters = float[8];  // same count/type as fma()'s d0..d7
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand, a single 64-bit value
+      uint64_t const& desc_b,  // B operand, a single 64-bit value
+      float         & d0, float         & d1, float         & d2, float         & d3,  // d0..d7 are read and written by the asm ("+f")
+      float         & d4, float         & d5, float         & d6, float         & d7,
+      uint32_t const& e,  // 32-bit input listed just before spsel; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %12, 0;\n"  // p = (scale_D != 0); %12 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n16k64.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // d0..d7
+      " %8,"  // desc_a
+      " %9,"  // desc_b
+      " %10, %11,"  // e, spsel
+      " p,   %13, %14;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),  // operands %0..%7, read-write
+        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
+      :  "l"(desc_a),  // operand %8
+         "l"(desc_b),  // operand %9
+         "r"(e), "n"(int32_t(spsel)),  // operands %10, %11
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands %12, %13, %14
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x16x64 TN F32+=E5M2*E5M2 (RS variant: A is passed as four 32-bit register operands, B as one 64-bit descriptor operand)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand, listed right after e
+>
+struct GMMA_64x16x64_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // same count/type as fma()'s a0..a3
+  using ERegisters = uint32_t[1];  // same count/type as fma()'s e
+  using BRegisters = uint64_t[1];  // same count/type as fma()'s desc_b
+  using CRegisters = float[8];  // same count/type as fma()'s d0..d7
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,  // A operand words, input only
+      uint64_t const& desc_b,  // B operand, a single 64-bit value
+      float         & d0, float         & d1, float         & d2, float         & d3,  // d0..d7 are read and written by the asm ("+f")
+      float         & d4, float         & d5, float         & d6, float         & d7,
+      uint32_t const& e,  // 32-bit input listed just before spsel; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %15, 0;\n"  // p = (scale_D != 0); %15 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n16k64.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // d0..d7
+      "{%8,  %9,  %10, %11},"  // a0..a3
+      " %12,"  // desc_b
+      " %13, %14,"  // e, spsel
+      " p,   %16, %17;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),  // operands %0..%7, read-write
+        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),  // operands %8..%11
+         "l"(desc_b),  // operand %12
+         "r"(e), "n"(int32_t(spsel)),  // operands %13, %14
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands %15, %16, %17
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x64 TN F16+=E5M2*E5M2 (SS variant: A and B are each passed as one 64-bit descriptor operand)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand, listed right after e
+>
+struct GMMA_64x32x64_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // same count/type as fma()'s desc_a
+  using ERegisters = uint32_t[1];  // same count/type as fma()'s e
+  using BRegisters = uint64_t[1];  // same count/type as fma()'s desc_b
+  using CRegisters = uint32_t[8];  // same count/type as fma()'s d0..d7
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand, a single 64-bit value
+      uint64_t const& desc_b,  // B operand, a single 64-bit value
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,  // d0..d7 are read and written by the asm ("+r")
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      uint32_t const& e,  // 32-bit input listed just before spsel; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %12, 0;\n"  // p = (scale_D != 0); %12 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n32k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"  // d0..d7
+      " %8,"  // desc_a
+      " %9,"  // desc_b
+      " %10, %11,"  // e, spsel
+      " p,   %13, %14;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),  // operands %0..%7, read-write
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "l"(desc_a),  // operand %8
+         "l"(desc_b),  // operand %9
+         "r"(e), "n"(int32_t(spsel)),  // operands %10, %11
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands %12, %13, %14
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x64 TN F16+=E5M2*E5M2 -- RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x32x64_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a0..a3
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[8];  // d0..d7; presumably packed f16 pairs -- confirm against the PTX ISA
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %15, 0;\n"                    // %15 = scale_D, so p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n32k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"    // d0..d7
+      "{%8,  %9,  %10, %11},"                       // a0..a3
+      " %12,"                                       // desc_b
+      " %13, %14,"                                  // e, spsel
+      " p,   %16, %17;\n"                           // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),     // "+r": accumulators are both read and written
+        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),               // "n": compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x64 TN F32+=E5M2*E5M2 -- SS form: A and B are each passed as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x32x64_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[16];    // d00..d15
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %20, 0;\n"                    // %20 = scale_D, so p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n32k64.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"   // d00..d15
+      " %16,"                                       // desc_a
+      " %17,"                                       // desc_b
+      " %18, %19,"                                  // e, spsel
+      " p,   %21, %22;\n"                           // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), // "+f": accumulators are both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),               // "n": compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x32x64 TN F32+=E5M2*E5M2 -- RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x32x64_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[16];    // d00..d15
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %23, 0;\n"                    // %23 = scale_D, so p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n32k64.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"   // d00..d15
+      "{%16, %17, %18, %19},"                       // a00..a03
+      " %20,"                                       // desc_b
+      " %21, %22,"                                  // e, spsel
+      " p,   %24, %25;\n"                           // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), // "+f": accumulators are both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),               // "n": compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x64 TN F16+=E5M2*E5M2 -- SS form: A and B are each passed as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x64x64_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[16]; // d00..d15; presumably packed f16 pairs -- confirm against the PTX ISA
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %20, 0;\n"                    // %20 = scale_D, so p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n64k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"   // d00..d15
+      " %16,"                                       // desc_a
+      " %17,"                                       // desc_b
+      " %18, %19,"                                  // e, spsel
+      " p,   %21, %22;\n"                           // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), // "+r": accumulators are both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),               // "n": compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x64 TN F16+=E5M2*E5M2 -- RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x64x64_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[16]; // d00..d15; presumably packed f16 pairs -- confirm against the PTX ISA
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %23, 0;\n"                    // %23 = scale_D, so p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n64k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15},"   // d00..d15
+      "{%16, %17, %18, %19},"                       // a00..a03
+      " %20,"                                       // desc_b
+      " %21, %22,"                                  // e, spsel
+      " p,   %24, %25;\n"                           // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), // "+r": accumulators are both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),               // "n": compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x64 TN F32+=E5M2*E5M2 -- SS form: A and B are each passed as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x64x64_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[32];    // d00..d31
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %36, 0;\n"                    // %36 = scale_D, so p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n64k64.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"   // d00..d31
+      " %32,"                                       // desc_a
+      " %33,"                                       // desc_b
+      " %34, %35,"                                  // e, spsel
+      " p,   %37, %38;\n"                           // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), // "+f": accumulators are both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),               // "n": compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x64x64 TN F32+=E5M2*E5M2 -- RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x64x64_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[32];    // d00..d31
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %39, 0;\n"                    // %39 = scale_D, so p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n64k64.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"   // d00..d31
+      "{%32, %33, %34, %35},"                       // a00..a03
+      " %36,"                                       // desc_b
+      " %37, %38,"                                  // e, spsel
+      " p,   %40, %41;\n"                           // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), // "+f": accumulators are both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),               // "n": compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x64 TN F16+=E5M2*E5M2 -- SS form: A and B are each passed as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x96x64_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[24]; // d00..d23; presumably packed f16 pairs -- confirm against the PTX ISA
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %28, 0;\n"                    // %28 = scale_D, so p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n96k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"   // d00..d23
+      " %24,"                                       // desc_a
+      " %25,"                                       // desc_b
+      " %26, %27,"                                  // e, spsel
+      " p,   %29, %30;\n"                           // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), // "+r": accumulators are both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),               // "n": compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x64 TN F16+=E5M2*E5M2 -- RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x96x64_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[24]; // d00..d23; presumably packed f16 pairs -- confirm against the PTX ISA
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %31, 0;\n"                    // %31 = scale_D, so p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n96k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"   // d00..d23
+      "{%24, %25, %26, %27},"                       // a00..a03
+      " %28,"                                       // desc_b
+      " %29, %30,"                                  // e, spsel
+      " p,   %32, %33;\n"                           // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), // "+r": accumulators are both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),               // "n": compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x64 TN F32+=E5M2*E5M2 -- SS form: A and B are each passed as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x96x64_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[48];    // d00..d47
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %52, 0;\n"                    // %52 = scale_D, so p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n96k64.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"   // d00..d47
+      " %48,"                                       // desc_a
+      " %49,"                                       // desc_b
+      " %50, %51,"                                  // e, spsel
+      " p,   %53, %54;\n"                           // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03), // "+f": accumulators are both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),               // "n": compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x96x64 TN F32+=E5M2*E5M2 -- RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x96x64_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[48];    // d00..d47
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %55, 0;\n"                              // %55 = scale_D, so p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n96k64.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"      // d00..d47
+      "{%48,  %49,  %50,  %51},"                              // a00..a03
+      " %52,"                                                 // desc_b
+      " %53, %54,"                                            // e, spsel
+      " p,    %56,  %57;\n"                                   // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),           // "+f": accumulators are both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),                         // "n": compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x64 TN F16+=E5M2*E5M2 -- SS form: A and B are each passed as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x128x64_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[32]; // d00..d31; presumably packed f16 pairs -- confirm against the PTX ISA
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %36, 0;\n"                    // %36 = scale_D, so p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n128k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"   // d00..d31
+      " %32,"                                       // desc_a
+      " %33,"                                       // desc_b
+      " %34, %35,"                                  // e, spsel
+      " p,   %37, %38;\n"                           // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03), // "+r": accumulators are both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),               // "n": compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x64 TN F16+=E5M2*E5M2 -- RS form: A supplied in registers (a00-a03), B via the 64-bit desc_b operand
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as immediate ("n") operand %40
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as immediate ("n") operand %41
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as immediate ("n") operand %38
+>
+struct GMMA_64x128x64_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // fma() has no separate D parameters; d00-d31 are read and written in place ("+r")
+  using ARegisters = uint32_t[4];  // matches fma() parameters a00-a03
+  using ERegisters = uint32_t[1];  // matches fma() parameter e
+  using BRegisters = uint64_t[1];  // matches fma() parameter desc_b
+  using CRegisters = uint32_t[32];  // matches d00-d31 (presumably two packed F16 values per register -- TODO confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand registers (asm %32-%35)
+      uint64_t const& desc_b,  // 64-bit B operand (asm %36); presumably a shared-memory matrix descriptor -- TODO confirm
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t const& e,  // asm %37; presumably the sparsity metadata consumed by the .sp instruction -- TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog call tagged with this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // local PTX predicate register
+      "setp.ne.b32 p, %39, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n128k64.f16.e5m2.e5m2 "  // sparse (.sp) wgmma, shape m64n128k64, f16 result type, e5m2 A and B types
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%31: d00-d31
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31},"
+      "{%32, %33, %34, %35},"  // a00-a03
+      " %36,"  // desc_b
+      " %37, %38,"  // e, spsel
+      " p,   %40, %41;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0-%31: read-write 32-bit registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs %32-%35
+         "l"(desc_b),  // input %36 (64-bit)
+         "r"(e), "n"(int32_t(spsel)),  // inputs %37 (register), %38 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %39 (register), %40, %41 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no wgmma instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x64 TN F32+=E5M2*E5M2 -- SS form: A and B both supplied via 64-bit operands (desc_a, desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as immediate ("n") operand %69
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as immediate ("n") operand %70
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as immediate ("n") operand %67
+>
+struct GMMA_64x128x64_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // fma() has no separate D parameters; d00-d63 are read and written in place ("+f")
+  using ARegisters = uint64_t[1];  // matches fma() parameter desc_a
+  using ERegisters = uint32_t[1];  // matches fma() parameter e
+  using BRegisters = uint64_t[1];  // matches fma() parameter desc_b
+  using CRegisters = float[64];  // matches fma() parameters d00-d63
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit A operand (asm %64); presumably a shared-memory matrix descriptor -- TODO confirm
+      uint64_t const& desc_b,  // 64-bit B operand (asm %65); presumably a shared-memory matrix descriptor -- TODO confirm
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      uint32_t const& e,  // asm %66; presumably the sparsity metadata consumed by the .sp instruction -- TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog call tagged with this source line and both 64-bit operands
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // local PTX predicate register
+      "setp.ne.b32 p, %68, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n128k64.f32.e5m2.e5m2 "  // sparse (.sp) wgmma, shape m64n128k64, f32 result type, e5m2 A and B types
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%63: d00-d63
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      " %64,"  // desc_a
+      " %65,"  // desc_b
+      " %66, %67,"  // e, spsel
+      " p,    %69,  %70;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0-%63: read-write float registers
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
+      :  "l"(desc_a),  // input %64 (64-bit)
+         "l"(desc_b),  // input %65 (64-bit)
+         "r"(e), "n"(int32_t(spsel)),  // inputs %66 (register), %67 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %68 (register), %69, %70 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no wgmma instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x128x64 TN F32+=E5M2*E5M2 -- RS form: A supplied in registers (a00-a03), B via the 64-bit desc_b operand
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as immediate ("n") operand %72
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as immediate ("n") operand %73
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as immediate ("n") operand %70
+>
+struct GMMA_64x128x64_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // fma() has no separate D parameters; d00-d63 are read and written in place ("+f")
+  using ARegisters = uint32_t[4];  // matches fma() parameters a00-a03
+  using ERegisters = uint32_t[1];  // matches fma() parameter e
+  using BRegisters = uint64_t[1];  // matches fma() parameter desc_b
+  using CRegisters = float[64];  // matches fma() parameters d00-d63
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand registers (asm %64-%67)
+      uint64_t const& desc_b,  // 64-bit B operand (asm %68); presumably a shared-memory matrix descriptor -- TODO confirm
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      uint32_t const& e,  // asm %69; presumably the sparsity metadata consumed by the .sp instruction -- TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog call tagged with this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // local PTX predicate register
+      "setp.ne.b32 p, %71, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n128k64.f32.e5m2.e5m2 "  // sparse (.sp) wgmma, shape m64n128k64, f32 result type, e5m2 A and B types
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%63: d00-d63
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      "{%64,  %65,  %66,  %67},"  // a00-a03
+      " %68,"  // desc_b
+      " %69, %70,"  // e, spsel
+      " p,    %72,  %73;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0-%63: read-write float registers
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs %64-%67
+         "l"(desc_b),  // input %68 (64-bit)
+         "r"(e), "n"(int32_t(spsel)),  // inputs %69 (register), %70 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %71 (register), %72, %73 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no wgmma instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x64 TN F16+=E5M2*E5M2 -- SS form: A and B both supplied via 64-bit operands (desc_a, desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as immediate ("n") operand %53
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as immediate ("n") operand %54
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as immediate ("n") operand %51
+>
+struct GMMA_64x192x64_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // fma() has no separate D parameters; d00-d47 are read and written in place ("+r")
+  using ARegisters = uint64_t[1];  // matches fma() parameter desc_a
+  using ERegisters = uint32_t[1];  // matches fma() parameter e
+  using BRegisters = uint64_t[1];  // matches fma() parameter desc_b
+  using CRegisters = uint32_t[48];  // matches d00-d47 (presumably two packed F16 values per register -- TODO confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit A operand (asm %48); presumably a shared-memory matrix descriptor -- TODO confirm
+      uint64_t const& desc_b,  // 64-bit B operand (asm %49); presumably a shared-memory matrix descriptor -- TODO confirm
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t const& e,  // asm %50; presumably the sparsity metadata consumed by the .sp instruction -- TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog call tagged with this source line and both 64-bit operands
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // local PTX predicate register
+      "setp.ne.b32 p, %52, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n192k64.f16.e5m2.e5m2 "  // sparse (.sp) wgmma, shape m64n192k64, f16 result type, e5m2 A and B types
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%47: d00-d47
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45, %46, %47},"
+      " %48,"  // desc_a
+      " %49,"  // desc_b
+      " %50, %51,"  // e, spsel
+      " p,   %53, %54;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0-%47: read-write 32-bit registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "l"(desc_a),  // input %48 (64-bit)
+         "l"(desc_b),  // input %49 (64-bit)
+         "r"(e), "n"(int32_t(spsel)),  // inputs %50 (register), %51 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %52 (register), %53, %54 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no wgmma instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x64 TN F16+=E5M2*E5M2 -- RS form: A supplied in registers (a00-a03), B via the 64-bit desc_b operand
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as immediate ("n") operand %56
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as immediate ("n") operand %57
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as immediate ("n") operand %54
+>
+struct GMMA_64x192x64_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // fma() has no separate D parameters; d00-d47 are read and written in place ("+r")
+  using ARegisters = uint32_t[4];  // matches fma() parameters a00-a03
+  using ERegisters = uint32_t[1];  // matches fma() parameter e
+  using BRegisters = uint64_t[1];  // matches fma() parameter desc_b
+  using CRegisters = uint32_t[48];  // matches d00-d47 (presumably two packed F16 values per register -- TODO confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand registers (asm %48-%51)
+      uint64_t const& desc_b,  // 64-bit B operand (asm %52); presumably a shared-memory matrix descriptor -- TODO confirm
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t const& e,  // asm %53; presumably the sparsity metadata consumed by the .sp instruction -- TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog call tagged with this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // local PTX predicate register
+      "setp.ne.b32 p, %55, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n192k64.f16.e5m2.e5m2 "  // sparse (.sp) wgmma, shape m64n192k64, f16 result type, e5m2 A and B types
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%47: d00-d47
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
+      "{%48,  %49,  %50,  %51},"  // a00-a03
+      " %52,"  // desc_b
+      " %53, %54,"  // e, spsel
+      " p,    %56,  %57;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0-%47: read-write 32-bit registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs %48-%51
+         "l"(desc_b),  // input %52 (64-bit)
+         "r"(e), "n"(int32_t(spsel)),  // inputs %53 (register), %54 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %55 (register), %56, %57 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no wgmma instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x64 TN F32+=E5M2*E5M2 -- SS form: A and B both supplied via 64-bit operands (desc_a, desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as immediate ("n") operand %101
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as immediate ("n") operand %102
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as immediate ("n") operand %99
+>
+struct GMMA_64x192x64_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // fma() has no separate D parameters; d00-d95 are read and written in place ("+f")
+  using ARegisters = uint64_t[1];  // matches fma() parameter desc_a
+  using ERegisters = uint32_t[1];  // matches fma() parameter e
+  using BRegisters = uint64_t[1];  // matches fma() parameter desc_b
+  using CRegisters = float[96];  // matches fma() parameters d00-d95
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit A operand (asm %96); presumably a shared-memory matrix descriptor -- TODO confirm
+      uint64_t const& desc_b,  // 64-bit B operand (asm %97); presumably a shared-memory matrix descriptor -- TODO confirm
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      float         & d92, float         & d93, float         & d94, float         & d95,
+      uint32_t const& e,  // asm %98; presumably the sparsity metadata consumed by the .sp instruction -- TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog call tagged with this source line and both 64-bit operands
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // local PTX predicate register
+      "setp.ne.b32 p, %100, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n192k64.f32.e5m2.e5m2 "  // sparse (.sp) wgmma, shape m64n192k64, f32 result type, e5m2 A and B types
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%95: d00-d95
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      " %96,"  // desc_a
+      " %97,"  // desc_b
+      " %98, %99,"  // e, spsel
+      " p,    %101, %102;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0-%95: read-write float registers
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
+        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
+      :  "l"(desc_a),  // input %96 (64-bit)
+         "l"(desc_b),  // input %97 (64-bit)
+         "r"(e), "n"(int32_t(spsel)),  // inputs %98 (register), %99 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %100 (register), %101, %102 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no wgmma instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x192x64 TN F32+=E5M2*E5M2 -- RS form: A supplied in registers (a00-a03), B via the 64-bit desc_b operand
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as immediate ("n") operand %104
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as immediate ("n") operand %105
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as immediate ("n") operand %102
+>
+struct GMMA_64x192x64_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // fma() has no separate D parameters; d00-d95 are read and written in place ("+f")
+  using ARegisters = uint32_t[4];  // matches fma() parameters a00-a03
+  using ERegisters = uint32_t[1];  // matches fma() parameter e
+  using BRegisters = uint64_t[1];  // matches fma() parameter desc_b
+  using CRegisters = float[96];  // matches fma() parameters d00-d95
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand registers (asm %96-%99)
+      uint64_t const& desc_b,  // 64-bit B operand (asm %100); presumably a shared-memory matrix descriptor -- TODO confirm
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      float         & d92, float         & d93, float         & d94, float         & d95,
+      uint32_t const& e,  // asm %101; presumably the sparsity metadata consumed by the .sp instruction -- TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog call tagged with this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // local PTX predicate register
+      "setp.ne.b32 p, %103, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n192k64.f32.e5m2.e5m2 "  // sparse (.sp) wgmma, shape m64n192k64, f32 result type, e5m2 A and B types
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%95: d00-d95
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
+      "{%96,  %97,  %98,  %99},"  // a00-a03
+      " %100,"  // desc_b
+      " %101, %102,"  // e, spsel
+      " p,    %104, %105;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0-%95: read-write float registers
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
+        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs %96-%99
+         "l"(desc_b),  // input %100 (64-bit)
+         "r"(e), "n"(int32_t(spsel)),  // inputs %101 (register), %102 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %103 (register), %104, %105 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no wgmma instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x64 TN F16+=E5M2*E5M2 -- SS form: A and B both supplied via 64-bit operands (desc_a, desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as immediate ("n") operand %69
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as immediate ("n") operand %70
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as immediate ("n") operand %67
+>
+struct GMMA_64x256x64_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // fma() has no separate D parameters; d00-d63 are read and written in place ("+r")
+  using ARegisters = uint64_t[1];  // matches fma() parameter desc_a
+  using ERegisters = uint32_t[1];  // matches fma() parameter e
+  using BRegisters = uint64_t[1];  // matches fma() parameter desc_b
+  using CRegisters = uint32_t[64];  // matches d00-d63 (presumably two packed F16 values per register -- TODO confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit A operand (asm %64); presumably a shared-memory matrix descriptor -- TODO confirm
+      uint64_t const& desc_b,  // 64-bit B operand (asm %65); presumably a shared-memory matrix descriptor -- TODO confirm
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t const& e,  // asm %66; presumably the sparsity metadata consumed by the .sp instruction -- TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog call tagged with this source line and both 64-bit operands
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // local PTX predicate register
+      "setp.ne.b32 p, %68, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n256k64.f16.e5m2.e5m2 "  // sparse (.sp) wgmma, shape m64n256k64, f16 result type, e5m2 A and B types
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%63: d00-d63
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      " %64,"  // desc_a
+      " %65,"  // desc_b
+      " %66, %67,"  // e, spsel
+      " p,    %69,  %70;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0-%63: read-write 32-bit registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "l"(desc_a),  // input %64 (64-bit)
+         "l"(desc_b),  // input %65 (64-bit)
+         "r"(e), "n"(int32_t(spsel)),  // inputs %66 (register), %67 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // inputs %68 (register), %69, %70 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no wgmma instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x64 TN F16+=E5M2*E5M2 (RS form: A in four 32-bit registers, B via 64-bit descriptor; d00-d63 are read-write 32-bit accumulator registers)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x256x64_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[64];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %71, 0;\n" // operand %71 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n256k64.f16.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
+      "{%64,  %65,  %66,  %67},"
+      " %68,"
+      " %69, %70,"
+      " p,    %72,  %73;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x64 TN F32+=E5M2*E5M2 (SS form: A and B both via 64-bit descriptors; d000-d127 are read-write float accumulators)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x256x64_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[128];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      float         & d124, float         & d125, float         & d126, float         & d127,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %132, 0;\n" // operand %132 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n256k64.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      " %128,"
+      " %129,"
+      " %130, %131,"
+      " p,    %133, %134;\n"
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
+        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x256x64 TN F32+=E5M2*E5M2 (RS form: A in four 32-bit registers, B via 64-bit descriptor; d000-d127 are read-write float accumulators)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x256x64_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[128];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      float         & d124, float         & d125, float         & d126, float         & d127,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %135, 0;\n" // operand %135 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n256k64.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123, %124, %125, %126, %127},"
+      "{%128, %129, %130, %131},"
+      " %132,"
+      " %133, %134,"
+      " p,    %136, %137;\n"
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
+        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace SM90::GMMA::SPARSE
+
+} // namespace cute
+
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+#include "mma_sm90_gmma_sparse_ext.hpp"
+#endif
diff --git a/lightllm-kernel/cutlass/include/cute/arch/mma_sm90_gmma_sparse_ext.hpp b/lightllm-kernel/cutlass/include/cute/arch/mma_sm90_gmma_sparse_ext.hpp
new file mode 100755
index 000000000..c224e4034
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/arch/mma_sm90_gmma_sparse_ext.hpp
@@ -0,0 +1,60445 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+ 
+#pragma once
+  
+#include <cute/config.hpp>                // CUTE_HOST_DEVICE
+
+#include "cutlass/arch/synclog.hpp"
+
+// Config
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && defined(__CUDA_ARCH_FEAT_SM90_ALL))
+#  define CUTE_ARCH_MMA_SM90A_ENABLED
+#endif
+
+namespace cute {
+
+namespace SM90::GMMA::SPARSE {
+
+// SPARSE GMMA 64x24x32 F16+=F16*F16 (SS form: A and B both via 64-bit descriptors; tnspA and tnspB are forwarded to the instruction as immediates)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x24x32_F16F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[6];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %10, 0;\n" // operand %10 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n24k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5},"
+      " %6,"
+      " %7,"
+      " %8, %9,"
+      " p,   %11, %12, %13, %14;\n"
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x24x32 F16+=F16*F16 (RS form: A in four 32-bit registers and statically required to be K-major, B via 64-bit descriptor; only tnspB is forwarded)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x24x32_F16F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[6];
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %13, 0;\n" // operand %13 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n24k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5},"
+      "{%6,  %7,  %8,  %9},"
+      " %10,"
+      " %11, %12,"
+      " p,   %14, %15, %16;\n"
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x40x32 F16+=F16*F16 (SS form: A and B both via 64-bit descriptors; tnspA and tnspB are forwarded to the instruction as immediates)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x40x32_F16F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[10];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %14, 0;\n" // operand %14 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n40k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9},"
+      " %10,"
+      " %11,"
+      " %12, %13,"
+      " p,   %15, %16, %17, %18;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x40x32 F16+=F16*F16 (RS form: A in four 32-bit registers and statically required to be K-major, B via 64-bit descriptor; only tnspB is forwarded)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x40x32_F16F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[10];
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %17, 0;\n" // operand %17 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n40k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9},"
+      "{%10, %11, %12, %13},"
+      " %14,"
+      " %15, %16,"
+      " p,   %18, %19, %20;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x48x32 F16+=F16*F16 (SS form: A and B both via 64-bit descriptors; tnspA and tnspB are forwarded to the instruction as immediates)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x48x32_F16F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[12];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %16, 0;\n" // operand %16 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n48k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      " %12,"
+      " %13,"
+      " %14, %15,"
+      " p,   %17, %18, %19, %20;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x48x32 F16+=F16*F16 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not passed to the instruction
+  GMMA::Major tnspB,  // passed to the instruction as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the instruction as an immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the instruction as an immediate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the instruction as an immediate, right after e
+>
+struct GMMA_64x48x32_F16F16F16_RS
+{
+  using DRegisters = void;  // NOTE(review): presumably means D shares the C registers -- confirm against the CuTe MMA traits
+  using ARegisters = uint32_t[4];  // matches a00..a03
+  using ERegisters = uint32_t[1];  // matches e
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = uint32_t[12];  // matches d00..d11
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably a logging hook -- confirm in the synclog header
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %19, 0;\n"  // %19 is scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n48k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"  // d00..d11
+      "{%12, %13, %14, %15},"  // a00..a03
+      " %16,"  // desc_b
+      " %17, %18,"  // e, spsel
+      " p,   %20, %21, %22;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x56x32 F16+=F16*F16 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::Major tnspA,  // passed to the instruction as an immediate
+  GMMA::Major tnspB,  // passed to the instruction as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the instruction as an immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the instruction as an immediate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the instruction as an immediate, right after e
+>
+struct GMMA_64x56x32_F16F16F16_SS
+{
+  using DRegisters = void;  // NOTE(review): presumably means D shares the C registers -- confirm against the CuTe MMA traits
+  using ARegisters = uint64_t[1];  // matches desc_a
+  using ERegisters = uint32_t[1];  // matches e
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = uint32_t[14];  // matches d00..d13
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): presumably a logging hook -- confirm in the synclog header
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %18, 0;\n"  // %18 is scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n56k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13},"  // d00..d13
+      " %14,"  // desc_a
+      " %15,"  // desc_b
+      " %16, %17,"  // e, spsel
+      " p,   %19, %20, %21, %22;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x56x32 F16+=F16*F16 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not passed to the instruction
+  GMMA::Major tnspB,  // passed to the instruction as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the instruction as an immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the instruction as an immediate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the instruction as an immediate, right after e
+>
+struct GMMA_64x56x32_F16F16F16_RS
+{
+  using DRegisters = void;  // NOTE(review): presumably means D shares the C registers -- confirm against the CuTe MMA traits
+  using ARegisters = uint32_t[4];  // matches a00..a03
+  using ERegisters = uint32_t[1];  // matches e
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = uint32_t[14];  // matches d00..d13
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably a logging hook -- confirm in the synclog header
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %21, 0;\n"  // %21 is scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n56k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13},"  // d00..d13
+      "{%14, %15, %16, %17},"  // a00..a03
+      " %18,"  // desc_b
+      " %19, %20,"  // e, spsel
+      " p,   %22, %23, %24;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x72x32 F16+=F16*F16 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::Major tnspA,  // passed to the instruction as an immediate
+  GMMA::Major tnspB,  // passed to the instruction as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the instruction as an immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the instruction as an immediate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the instruction as an immediate, right after e
+>
+struct GMMA_64x72x32_F16F16F16_SS
+{
+  using DRegisters = void;  // NOTE(review): presumably means D shares the C registers -- confirm against the CuTe MMA traits
+  using ARegisters = uint64_t[1];  // matches desc_a
+  using ERegisters = uint32_t[1];  // matches e
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = uint32_t[18];  // matches d00..d17
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): presumably a logging hook -- confirm in the synclog header
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %22, 0;\n"  // %22 is scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n72k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17},"  // d00..d17
+      " %18,"  // desc_a
+      " %19,"  // desc_b
+      " %20, %21,"  // e, spsel
+      " p,   %23, %24, %25, %26;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x72x32 F16+=F16*F16 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not passed to the instruction
+  GMMA::Major tnspB,  // passed to the instruction as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the instruction as an immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the instruction as an immediate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the instruction as an immediate, right after e
+>
+struct GMMA_64x72x32_F16F16F16_RS
+{
+  using DRegisters = void;  // NOTE(review): presumably means D shares the C registers -- confirm against the CuTe MMA traits
+  using ARegisters = uint32_t[4];  // matches a00..a03
+  using ERegisters = uint32_t[1];  // matches e
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = uint32_t[18];  // matches d00..d17
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably a logging hook -- confirm in the synclog header
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %25, 0;\n"  // %25 is scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n72k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17},"  // d00..d17
+      "{%18, %19, %20, %21},"  // a00..a03
+      " %22,"  // desc_b
+      " %23, %24,"  // e, spsel
+      " p,   %26, %27, %28;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x32 F16+=F16*F16 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::Major tnspA,  // passed to the instruction as an immediate
+  GMMA::Major tnspB,  // passed to the instruction as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the instruction as an immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the instruction as an immediate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the instruction as an immediate, right after e
+>
+struct GMMA_64x80x32_F16F16F16_SS
+{
+  using DRegisters = void;  // NOTE(review): presumably means D shares the C registers -- confirm against the CuTe MMA traits
+  using ARegisters = uint64_t[1];  // matches desc_a
+  using ERegisters = uint32_t[1];  // matches e
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = uint32_t[20];  // matches d00..d19
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): presumably a logging hook -- confirm in the synclog header
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %24, 0;\n"  // %24 is scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n80k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"  // d00..d19
+      " %20,"  // desc_a
+      " %21,"  // desc_b
+      " %22, %23,"  // e, spsel
+      " p,   %25, %26, %27, %28;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x32 F16+=F16*F16 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not passed to the instruction
+  GMMA::Major tnspB,  // passed to the instruction as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the instruction as an immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the instruction as an immediate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the instruction as an immediate, right after e
+>
+struct GMMA_64x80x32_F16F16F16_RS
+{
+  using DRegisters = void;  // NOTE(review): presumably means D shares the C registers -- confirm against the CuTe MMA traits
+  using ARegisters = uint32_t[4];  // matches a00..a03
+  using ERegisters = uint32_t[1];  // matches e
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = uint32_t[20];  // matches d00..d19
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably a logging hook -- confirm in the synclog header
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %27, 0;\n"  // %27 is scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n80k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"  // d00..d19
+      "{%20, %21, %22, %23},"  // a00..a03
+      " %24,"  // desc_b
+      " %25, %26,"  // e, spsel
+      " p,   %28, %29, %30;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x88x32 F16+=F16*F16 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::Major tnspA,  // passed to the instruction as an immediate
+  GMMA::Major tnspB,  // passed to the instruction as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the instruction as an immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the instruction as an immediate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the instruction as an immediate, right after e
+>
+struct GMMA_64x88x32_F16F16F16_SS
+{
+  using DRegisters = void;  // NOTE(review): presumably means D shares the C registers -- confirm against the CuTe MMA traits
+  using ARegisters = uint64_t[1];  // matches desc_a
+  using ERegisters = uint32_t[1];  // matches e
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = uint32_t[22];  // matches d00..d21
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): presumably a logging hook -- confirm in the synclog header
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %26, 0;\n"  // %26 is scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n88k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21},"  // d00..d21
+      " %22,"  // desc_a
+      " %23,"  // desc_b
+      " %24, %25,"  // e, spsel
+      " p,   %27, %28, %29, %30;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x88x32 F16+=F16*F16 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not passed to the instruction
+  GMMA::Major tnspB,  // passed to the instruction as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the instruction as an immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the instruction as an immediate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the instruction as an immediate, right after e
+>
+struct GMMA_64x88x32_F16F16F16_RS
+{
+  using DRegisters = void;  // NOTE(review): presumably means D shares the C registers -- confirm against the CuTe MMA traits
+  using ARegisters = uint32_t[4];  // matches a00..a03
+  using ERegisters = uint32_t[1];  // matches e
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = uint32_t[22];  // matches d00..d21
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably a logging hook -- confirm in the synclog header
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %29, 0;\n"  // %29 is scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n88k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21},"  // d00..d21
+      "{%22, %23, %24, %25},"  // a00..a03
+      " %26,"  // desc_b
+      " %27, %28,"  // e, spsel
+      " p,   %30, %31, %32;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x104x32 F16+=F16*F16 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::Major tnspA,  // passed to the instruction as an immediate
+  GMMA::Major tnspB,  // passed to the instruction as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the instruction as an immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the instruction as an immediate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the instruction as an immediate, right after e
+>
+struct GMMA_64x104x32_F16F16F16_SS
+{
+  using DRegisters = void;  // NOTE(review): presumably means D shares the C registers -- confirm against the CuTe MMA traits
+  using ARegisters = uint64_t[1];  // matches desc_a
+  using ERegisters = uint32_t[1];  // matches e
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = uint32_t[26];  // matches d00..d25
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): presumably a logging hook -- confirm in the synclog header
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %30, 0;\n"  // %30 is scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n104k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25},"  // d00..d25
+      " %26,"  // desc_a
+      " %27,"  // desc_b
+      " %28, %29,"  // e, spsel
+      " p,   %31, %32, %33, %34;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x104x32 F16+=F16*F16 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not passed to the instruction
+  GMMA::Major tnspB,  // passed to the instruction as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the instruction as an immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the instruction as an immediate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the instruction as an immediate, right after e
+>
+struct GMMA_64x104x32_F16F16F16_RS
+{
+  using DRegisters = void;  // NOTE(review): presumably means D shares the C registers -- confirm against the CuTe MMA traits
+  using ARegisters = uint32_t[4];  // matches a00..a03
+  using ERegisters = uint32_t[1];  // matches e
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = uint32_t[26];  // matches d00..d25
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably a logging hook -- confirm in the synclog header
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %33, 0;\n"  // %33 is scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n104k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25},"  // d00..d25
+      "{%26, %27, %28, %29},"  // a00..a03
+      " %30,"  // desc_b
+      " %31, %32,"  // e, spsel
+      " p,   %34, %35, %36;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x32 F16+=F16*F16 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::Major tnspA,  // passed to the instruction as an immediate
+  GMMA::Major tnspB,  // passed to the instruction as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the instruction as an immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the instruction as an immediate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the instruction as an immediate, right after e
+>
+struct GMMA_64x112x32_F16F16F16_SS
+{
+  using DRegisters = void;  // NOTE(review): presumably means D shares the C registers -- confirm against the CuTe MMA traits
+  using ARegisters = uint64_t[1];  // matches desc_a
+  using ERegisters = uint32_t[1];  // matches e
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = uint32_t[28];  // matches d00..d27
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): presumably a logging hook -- confirm in the synclog header
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %32, 0;\n"  // %32 is scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n112k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"  // d00..d27
+      " %28,"  // desc_a
+      " %29,"  // desc_b
+      " %30, %31,"  // e, spsel
+      " p,   %33, %34, %35, %36;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x32 F16+=F16*F16 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not passed to the instruction
+  GMMA::Major tnspB,  // passed to the instruction as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the instruction as an immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the instruction as an immediate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the instruction as an immediate, right after e
+>
+struct GMMA_64x112x32_F16F16F16_RS
+{
+  using DRegisters = void;  // NOTE(review): presumably means D shares the C registers -- confirm against the CuTe MMA traits
+  using ARegisters = uint32_t[4];  // matches a00..a03
+  using ERegisters = uint32_t[1];  // matches e
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = uint32_t[28];  // matches d00..d27
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably a logging hook -- confirm in the synclog header
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %35, 0;\n"  // %35 is scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n112k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"  // d00..d27
+      "{%28, %29, %30, %31},"  // a00..a03
+      " %32,"  // desc_b
+      " %33, %34,"  // e, spsel
+      " p,   %36, %37, %38;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x120x32 F16+=F16*F16 (SS form: A and B are each passed as one 64-bit descriptor; wraps PTX wgmma.mma_async.sp m64n120k32)
+template <
+  GMMA::Major tnspA,  // passed to the instruction as a compile-time immediate
+  GMMA::Major tnspB,  // passed to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand emitted right after e
+>
+struct GMMA_64x120x32_F16F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // A: one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];  // E: one 32-bit word (e)
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[30];  // accumulators: one 32-bit register per dNN argument of fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata (PTX sp-meta slot) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // nonzero sets predicate p (PTX scale-d slot)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX below is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate local to this PTX { } scope
+      "setp.ne.b32 p, %34, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n120k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29},"  // %0-%29: d00-d29
+      " %30,"  // desc_a
+      " %31,"  // desc_b
+      " %32, %33,"  // e, spsel
+      " p,   %35, %36, %37, %38;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs: "+r" = read-write 32-bit register
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29)
+      :  "l"(desc_a),  // inputs: "l" = 64-bit register, "r" = 32-bit register, "n" = immediate
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // macro not defined: any call reaches the invalid-control-path macro
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x120x32 F16+=F16*F16 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor; wraps PTX wgmma.mma_async.sp m64n120k32)
+template <
+  GMMA::Major tnspA,  // must be Major::K (static_assert below); not passed to the instruction
+  GMMA::Major tnspB,  // passed to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand emitted right after e
+>
+struct GMMA_64x120x32_F16F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // A: four 32-bit registers (a00..a03)
+  using ERegisters = uint32_t[1];  // E: one 32-bit word (e)
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[30];  // accumulators: one 32-bit register per dNN argument of fma
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata (PTX sp-meta slot) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // nonzero sets predicate p (PTX scale-d slot)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX below is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate local to this PTX { } scope
+      "setp.ne.b32 p, %37, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n120k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29},"  // %0-%29: d00-d29
+      "{%30, %31, %32, %33},"  // a00-a03
+      " %34,"  // desc_b
+      " %35, %36,"  // e, spsel
+      " p,   %38, %39, %40;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs: "+r" = read-write 32-bit register
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs: "r" = 32-bit register, "l" = 64-bit register, "n" = immediate
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // macro not defined: any call reaches the invalid-control-path macro
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x136x32 F16+=F16*F16 (SS form: A and B are each passed as one 64-bit descriptor; wraps PTX wgmma.mma_async.sp m64n136k32)
+template <
+  GMMA::Major tnspA,  // passed to the instruction as a compile-time immediate
+  GMMA::Major tnspB,  // passed to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand emitted right after e
+>
+struct GMMA_64x136x32_F16F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // A: one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];  // E: one 32-bit word (e)
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[34];  // accumulators: one 32-bit register per dNN argument of fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata (PTX sp-meta slot) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // nonzero sets predicate p (PTX scale-d slot)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX below is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate local to this PTX { } scope
+      "setp.ne.b32 p, %38, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n136k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33},"  // %0-%33: d00-d33
+      " %34,"  // desc_a
+      " %35,"  // desc_b
+      " %36, %37,"  // e, spsel
+      " p,   %39, %40, %41, %42;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs: "+r" = read-write 32-bit register
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33)
+      :  "l"(desc_a),  // inputs: "l" = 64-bit register, "r" = 32-bit register, "n" = immediate
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // macro not defined: any call reaches the invalid-control-path macro
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x136x32 F16+=F16*F16 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor; wraps PTX wgmma.mma_async.sp m64n136k32)
+template <
+  GMMA::Major tnspA,  // must be Major::K (static_assert below); not passed to the instruction
+  GMMA::Major tnspB,  // passed to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand emitted right after e
+>
+struct GMMA_64x136x32_F16F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // A: four 32-bit registers (a00..a03)
+  using ERegisters = uint32_t[1];  // E: one 32-bit word (e)
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[34];  // accumulators: one 32-bit register per dNN argument of fma
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata (PTX sp-meta slot) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // nonzero sets predicate p (PTX scale-d slot)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX below is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate local to this PTX { } scope
+      "setp.ne.b32 p, %41, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n136k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33},"  // %0-%33: d00-d33
+      "{%34, %35, %36, %37},"  // a00-a03
+      " %38,"  // desc_b
+      " %39, %40,"  // e, spsel
+      " p,   %42, %43, %44;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs: "+r" = read-write 32-bit register
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs: "r" = 32-bit register, "l" = 64-bit register, "n" = immediate
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // macro not defined: any call reaches the invalid-control-path macro
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x32 F16+=F16*F16 (SS form: A and B are each passed as one 64-bit descriptor; wraps PTX wgmma.mma_async.sp m64n144k32)
+template <
+  GMMA::Major tnspA,  // passed to the instruction as a compile-time immediate
+  GMMA::Major tnspB,  // passed to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand emitted right after e
+>
+struct GMMA_64x144x32_F16F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // A: one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];  // E: one 32-bit word (e)
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[36];  // accumulators: one 32-bit register per dNN argument of fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata (PTX sp-meta slot) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // nonzero sets predicate p (PTX scale-d slot)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX below is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate local to this PTX { } scope
+      "setp.ne.b32 p, %40, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n144k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"  // %0-%35: d00-d35
+      " %36,"  // desc_a
+      " %37,"  // desc_b
+      " %38, %39,"  // e, spsel
+      " p,   %41, %42, %43, %44;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs: "+r" = read-write 32-bit register
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
+      :  "l"(desc_a),  // inputs: "l" = 64-bit register, "r" = 32-bit register, "n" = immediate
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // macro not defined: any call reaches the invalid-control-path macro
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x32 F16+=F16*F16 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor; wraps PTX wgmma.mma_async.sp m64n144k32)
+template <
+  GMMA::Major tnspA,  // must be Major::K (static_assert below); not passed to the instruction
+  GMMA::Major tnspB,  // passed to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand emitted right after e
+>
+struct GMMA_64x144x32_F16F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // A: four 32-bit registers (a00..a03)
+  using ERegisters = uint32_t[1];  // E: one 32-bit word (e)
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[36];  // accumulators: one 32-bit register per dNN argument of fma
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata (PTX sp-meta slot) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // nonzero sets predicate p (PTX scale-d slot)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX below is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate local to this PTX { } scope
+      "setp.ne.b32 p, %43, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n144k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"  // %0-%35: d00-d35
+      "{%36, %37, %38, %39},"  // a00-a03
+      " %40,"  // desc_b
+      " %41, %42,"  // e, spsel
+      " p,   %44, %45, %46;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs: "+r" = read-write 32-bit register
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs: "r" = 32-bit register, "l" = 64-bit register, "n" = immediate
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // macro not defined: any call reaches the invalid-control-path macro
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x152x32 F16+=F16*F16 (SS form: A and B are each passed as one 64-bit descriptor; wraps PTX wgmma.mma_async.sp m64n152k32)
+template <
+  GMMA::Major tnspA,  // passed to the instruction as a compile-time immediate
+  GMMA::Major tnspB,  // passed to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand emitted right after e
+>
+struct GMMA_64x152x32_F16F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // A: one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];  // E: one 32-bit word (e)
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[38];  // accumulators: one 32-bit register per dNN argument of fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata (PTX sp-meta slot) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // nonzero sets predicate p (PTX scale-d slot)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX below is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate local to this PTX { } scope
+      "setp.ne.b32 p, %42, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n152k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37},"  // %0-%37: d00-d37
+      " %38,"  // desc_a
+      " %39,"  // desc_b
+      " %40, %41,"  // e, spsel
+      " p,   %43, %44, %45, %46;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs: "+r" = read-write 32-bit register
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37)
+      :  "l"(desc_a),  // inputs: "l" = 64-bit register, "r" = 32-bit register, "n" = immediate
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // macro not defined: any call reaches the invalid-control-path macro
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x152x32 F16+=F16*F16 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor; wraps PTX wgmma.mma_async.sp m64n152k32)
+template <
+  GMMA::Major tnspA,  // must be Major::K (static_assert below); not passed to the instruction
+  GMMA::Major tnspB,  // passed to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand emitted right after e
+>
+struct GMMA_64x152x32_F16F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // A: four 32-bit registers (a00..a03)
+  using ERegisters = uint32_t[1];  // E: one 32-bit word (e)
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[38];  // accumulators: one 32-bit register per dNN argument of fma
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata (PTX sp-meta slot) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // nonzero sets predicate p (PTX scale-d slot)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX below is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate local to this PTX { } scope
+      "setp.ne.b32 p, %45, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n152k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37},"  // %0-%37: d00-d37
+      "{%38, %39, %40, %41},"  // a00-a03
+      " %42,"  // desc_b
+      " %43, %44,"  // e, spsel
+      " p,   %46, %47, %48;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs: "+r" = read-write 32-bit register
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs: "r" = 32-bit register, "l" = 64-bit register, "n" = immediate
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // macro not defined: any call reaches the invalid-control-path macro
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x32 F16+=F16*F16 (SS form: A and B are each passed as one 64-bit descriptor; wraps PTX wgmma.mma_async.sp m64n160k32)
+template <
+  GMMA::Major tnspA,  // passed to the instruction as a compile-time immediate
+  GMMA::Major tnspB,  // passed to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand emitted right after e
+>
+struct GMMA_64x160x32_F16F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // A: one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];  // E: one 32-bit word (e)
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[40];  // accumulators: one 32-bit register per dNN argument of fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata (PTX sp-meta slot) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // nonzero sets predicate p (PTX scale-d slot)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX below is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate local to this PTX { } scope
+      "setp.ne.b32 p, %44, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n160k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"  // %0-%39: d00-d39
+      " %40,"  // desc_a
+      " %41,"  // desc_b
+      " %42, %43,"  // e, spsel
+      " p,   %45, %46, %47, %48;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs: "+r" = read-write 32-bit register
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "l"(desc_a),  // inputs: "l" = 64-bit register, "r" = 32-bit register, "n" = immediate
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // macro not defined: any call reaches the invalid-control-path macro
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x32 F16+=F16*F16 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor; wraps PTX wgmma.mma_async.sp m64n160k32)
+template <
+  GMMA::Major tnspA,  // must be Major::K (static_assert below); not passed to the instruction
+  GMMA::Major tnspB,  // passed to the instruction as a compile-time immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand emitted right after e
+>
+struct GMMA_64x160x32_F16F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // A: four 32-bit registers (a00..a03)
+  using ERegisters = uint32_t[1];  // E: one 32-bit word (e)
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[40];  // accumulators: one 32-bit register per dNN argument of fma
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata (PTX sp-meta slot) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // nonzero sets predicate p (PTX scale-d slot)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX below is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate local to this PTX { } scope
+      "setp.ne.b32 p, %47, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n160k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"  // %0-%39: d00-d39
+      "{%40, %41, %42, %43},"  // a00-a03
+      " %44,"  // desc_b
+      " %45, %46,"  // e, spsel
+      " p,   %48, %49, %50;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs: "+r" = read-write 32-bit register
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs: "r" = 32-bit register, "l" = 64-bit register, "n" = immediate
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // macro not defined: any call reaches the invalid-control-path macro
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x168x32 F16+=F16*F16 -- SS variant: A and B are both passed as 64-bit operands (desc_a, desc_b); wraps PTX wgmma.mma_async.sp.sync.aligned.m64n168k32.f16.f16.f16
+template <
+  GMMA::Major tnspA,                              // immediate asm operand %49
+  GMMA::Major tnspB,                              // immediate asm operand %50
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,     // immediate asm operand %47
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,     // immediate asm operand %48
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero   // immediate asm operand %45
+>
+struct GMMA_64x168x32_F16F16F16_SS
+{
+  using DRegisters = void;          // no separate output type: the d* accumulators are updated in place ("+r")
+  using ARegisters = uint64_t[1];   // one 64-bit value, matching desc_a
+  using ERegisters = uint32_t[1];   // one 32-bit value, matching e
+  using BRegisters = uint64_t[1];   // one 64-bit value, matching desc_b
+  using CRegisters = uint32_t[42];  // 42 32-bit values, matching d00..d41
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate p
+      "setp.ne.b32 p, %46, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n168k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%41: d00..d41
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41},"
+      " %42,"  // desc_a
+      " %43,"  // desc_b
+      " %44, %45,"  // e, spsel
+      " p,   %47, %48, %49, %50;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write operands %0..%41
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41)
+      :  "l"(desc_a),  // input operands start at %42
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x168x32 F16+=F16*F16 -- RS variant: A is passed in four 32-bit registers (a00..a03), B as the 64-bit operand desc_b; wraps PTX wgmma.mma_async.sp.sync.aligned.m64n168k32.f16.f16.f16
+template <
+  GMMA::Major tnspA,                              // must be GMMA::Major::K (static_assert below); not passed to the asm
+  GMMA::Major tnspB,                              // immediate asm operand %52
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,     // immediate asm operand %50
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,     // immediate asm operand %51
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero   // immediate asm operand %48
+>
+struct GMMA_64x168x32_F16F16F16_RS
+{
+  using DRegisters = void;          // no separate output type: the d* accumulators are updated in place ("+r")
+  using ARegisters = uint32_t[4];   // four 32-bit values, matching a00..a03
+  using ERegisters = uint32_t[1];   // one 32-bit value, matching e
+  using BRegisters = uint64_t[1];   // one 64-bit value, matching desc_b
+  using CRegisters = uint32_t[42];  // 42 32-bit values, matching d00..d41
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate p
+      "setp.ne.b32 p, %49, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n168k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%41: d00..d41
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41},"
+      "{%42, %43, %44, %45},"  // a00..a03
+      " %46,"  // desc_b
+      " %47, %48,"  // e, spsel
+      " p,   %50, %51, %52;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write operands %0..%41
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands start at %42
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x32 F16+=F16*F16 -- SS variant: A and B are both passed as 64-bit operands (desc_a, desc_b); wraps PTX wgmma.mma_async.sp.sync.aligned.m64n176k32.f16.f16.f16
+template <
+  GMMA::Major tnspA,                              // immediate asm operand %51
+  GMMA::Major tnspB,                              // immediate asm operand %52
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,     // immediate asm operand %49
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,     // immediate asm operand %50
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero   // immediate asm operand %47
+>
+struct GMMA_64x176x32_F16F16F16_SS
+{
+  using DRegisters = void;          // no separate output type: the d* accumulators are updated in place ("+r")
+  using ARegisters = uint64_t[1];   // one 64-bit value, matching desc_a
+  using ERegisters = uint32_t[1];   // one 32-bit value, matching e
+  using BRegisters = uint64_t[1];   // one 64-bit value, matching desc_b
+  using CRegisters = uint32_t[44];  // 44 32-bit values, matching d00..d43
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate p
+      "setp.ne.b32 p, %48, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n176k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%43: d00..d43
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"
+      " %44,"  // desc_a
+      " %45,"  // desc_b
+      " %46, %47,"  // e, spsel
+      " p,   %49, %50, %51, %52;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write operands %0..%43
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
+      :  "l"(desc_a),  // input operands start at %44
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x32 F16+=F16*F16 -- RS variant: A is passed in four 32-bit registers (a00..a03), B as the 64-bit operand desc_b; wraps PTX wgmma.mma_async.sp.sync.aligned.m64n176k32.f16.f16.f16
+template <
+  GMMA::Major tnspA,                              // must be GMMA::Major::K (static_assert below); not passed to the asm
+  GMMA::Major tnspB,                              // immediate asm operand %54
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,     // immediate asm operand %52
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,     // immediate asm operand %53
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero   // immediate asm operand %50
+>
+struct GMMA_64x176x32_F16F16F16_RS
+{
+  using DRegisters = void;          // no separate output type: the d* accumulators are updated in place ("+r")
+  using ARegisters = uint32_t[4];   // four 32-bit values, matching a00..a03
+  using ERegisters = uint32_t[1];   // one 32-bit value, matching e
+  using BRegisters = uint64_t[1];   // one 64-bit value, matching desc_b
+  using CRegisters = uint32_t[44];  // 44 32-bit values, matching d00..d43
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate p
+      "setp.ne.b32 p, %51, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n176k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%43: d00..d43
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"
+      "{%44, %45, %46, %47},"  // a00..a03
+      " %48,"  // desc_b
+      " %49, %50,"  // e, spsel
+      " p,   %52, %53, %54;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write operands %0..%43
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands start at %44
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x184x32 F16+=F16*F16 -- SS variant: A and B are both passed as 64-bit operands (desc_a, desc_b); wraps PTX wgmma.mma_async.sp.sync.aligned.m64n184k32.f16.f16.f16
+template <
+  GMMA::Major tnspA,                              // immediate asm operand %53
+  GMMA::Major tnspB,                              // immediate asm operand %54
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,     // immediate asm operand %51
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,     // immediate asm operand %52
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero   // immediate asm operand %49
+>
+struct GMMA_64x184x32_F16F16F16_SS
+{
+  using DRegisters = void;          // no separate output type: the d* accumulators are updated in place ("+r")
+  using ARegisters = uint64_t[1];   // one 64-bit value, matching desc_a
+  using ERegisters = uint32_t[1];   // one 32-bit value, matching e
+  using BRegisters = uint64_t[1];   // one 64-bit value, matching desc_b
+  using CRegisters = uint32_t[46];  // 46 32-bit values, matching d00..d45
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate p
+      "setp.ne.b32 p, %50, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n184k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%45: d00..d45
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45},"
+      " %46,"  // desc_a
+      " %47,"  // desc_b
+      " %48, %49,"  // e, spsel
+      " p,   %51, %52, %53, %54;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write operands %0..%45
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45)
+      :  "l"(desc_a),  // input operands start at %46
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x184x32 F16+=F16*F16 -- RS variant: A is passed in four 32-bit registers (a00..a03), B as the 64-bit operand desc_b; wraps PTX wgmma.mma_async.sp.sync.aligned.m64n184k32.f16.f16.f16
+template <
+  GMMA::Major tnspA,                              // must be GMMA::Major::K (static_assert below); not passed to the asm
+  GMMA::Major tnspB,                              // immediate asm operand %56
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,     // immediate asm operand %54
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,     // immediate asm operand %55
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero   // immediate asm operand %52
+>
+struct GMMA_64x184x32_F16F16F16_RS
+{
+  using DRegisters = void;          // no separate output type: the d* accumulators are updated in place ("+r")
+  using ARegisters = uint32_t[4];   // four 32-bit values, matching a00..a03
+  using ERegisters = uint32_t[1];   // one 32-bit value, matching e
+  using BRegisters = uint64_t[1];   // one 64-bit value, matching desc_b
+  using CRegisters = uint32_t[46];  // 46 32-bit values, matching d00..d45
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate p
+      "setp.ne.b32 p, %53, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n184k32.f16.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%45: d00..d45
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45},"
+      "{%46, %47, %48, %49},"  // a00..a03
+      " %50,"  // desc_b
+      " %51, %52,"  // e, spsel
+      " p,   %54, %55, %56;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write operands %0..%45
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands start at %46
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x200x32 F16+=F16*F16 -- SS variant: A and B are both passed as 64-bit operands (desc_a, desc_b); wraps PTX wgmma.mma_async.sp.sync.aligned.m64n200k32.f16.f16.f16
+template <
+  GMMA::Major tnspA,                              // immediate asm operand %57
+  GMMA::Major tnspB,                              // immediate asm operand %58
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,     // immediate asm operand %55
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,     // immediate asm operand %56
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero   // immediate asm operand %53
+>
+struct GMMA_64x200x32_F16F16F16_SS
+{
+  using DRegisters = void;          // no separate output type: the d* accumulators are updated in place ("+r")
+  using ARegisters = uint64_t[1];   // one 64-bit value, matching desc_a
+  using ERegisters = uint32_t[1];   // one 32-bit value, matching e
+  using BRegisters = uint64_t[1];   // one 64-bit value, matching desc_b
+  using CRegisters = uint32_t[50];  // 50 32-bit values, matching d00..d49
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate p
+      "setp.ne.b32 p, %54, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n200k32.f16.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%49: d00..d49
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49},"
+      " %50,"  // desc_a
+      " %51,"  // desc_b
+      " %52, %53,"  // e, spsel
+      " p,    %55,  %56,  %57,  %58;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write operands %0..%49
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49)
+      :  "l"(desc_a),  // input operands start at %50
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x200x32 F16+=F16*F16 -- RS variant: A is passed in four 32-bit registers (a00..a03), B as the 64-bit operand desc_b; wraps PTX wgmma.mma_async.sp.sync.aligned.m64n200k32.f16.f16.f16
+template <
+  GMMA::Major tnspA,                              // must be GMMA::Major::K (static_assert below); not passed to the asm
+  GMMA::Major tnspB,                              // immediate asm operand %60
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,     // immediate asm operand %58
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,     // immediate asm operand %59
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero   // immediate asm operand %56
+>
+struct GMMA_64x200x32_F16F16F16_RS
+{
+  using DRegisters = void;          // no separate output type: the d* accumulators are updated in place ("+r")
+  using ARegisters = uint32_t[4];   // four 32-bit values, matching a00..a03
+  using ERegisters = uint32_t[1];   // one 32-bit value, matching e
+  using BRegisters = uint64_t[1];   // one 64-bit value, matching desc_b
+  using CRegisters = uint32_t[50];  // 50 32-bit values, matching d00..d49
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate p
+      "setp.ne.b32 p, %57, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n200k32.f16.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%49: d00..d49
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49},"
+      "{%50,  %51,  %52,  %53},"  // a00..a03
+      " %54,"  // desc_b
+      " %55, %56,"  // e, spsel
+      " p,    %58,  %59,  %60;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write operands %0..%49
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands start at %50
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x32 F16+=F16*F16 -- SS variant: A and B are both passed as 64-bit operands (desc_a, desc_b); wraps PTX wgmma.mma_async.sp.sync.aligned.m64n208k32.f16.f16.f16
+template <
+  GMMA::Major tnspA,                              // immediate asm operand %59
+  GMMA::Major tnspB,                              // immediate asm operand %60
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,     // immediate asm operand %57
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,     // immediate asm operand %58
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero   // immediate asm operand %55
+>
+struct GMMA_64x208x32_F16F16F16_SS
+{
+  using DRegisters = void;          // no separate output type: the d* accumulators are updated in place ("+r")
+  using ARegisters = uint64_t[1];   // one 64-bit value, matching desc_a
+  using ERegisters = uint32_t[1];   // one 32-bit value, matching e
+  using BRegisters = uint64_t[1];   // one 64-bit value, matching desc_b
+  using CRegisters = uint32_t[52];  // 52 32-bit values, matching d00..d51
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate p
+      "setp.ne.b32 p, %56, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n208k32.f16.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%51: d00..d51
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"
+      " %52,"  // desc_a
+      " %53,"  // desc_b
+      " %54, %55,"  // e, spsel
+      " p,    %57,  %58,  %59,  %60;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write operands %0..%51
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
+      :  "l"(desc_a),  // input operands start at %52
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x32 F16+=F16*F16 -- RS form: A is passed in four 32-bit registers, B as the 64-bit operand desc_b; emits PTX wgmma.mma_async.sp m64n208k32.
+template <
+  GMMA::Major tnspA,  // A layout tag; in this struct only the static_assert below reads it
+  GMMA::Major tnspB,  // B layout tag; forwarded as the last immediate operand of the instruction
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the immediate right after predicate p
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded as the immediate right after scaleA
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded as the immediate right after `e`
+>
+struct GMMA_64x208x32_F16F16F16_RS
+{
+  using DRegisters = void;  // no separate output list: fma updates the accumulators in place
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // `e` (presumably the sparsity metadata word; confirm against the PTX operand list)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[52];  // d00..d51
+
+  static_assert(tnspA == GMMA::Major::K,  // any other A layout is rejected at compile time
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand registers
+      uint64_t const& desc_b,  // 64-bit B operand (presumably a shared-memory matrix descriptor)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d51: accumulators, read and written
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t const& e,  // 32-bit operand placed right after desc_b in the instruction
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // only tested against zero to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // logging hook defined elsewhere; gets the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside this PTX brace scope
+      "setp.ne.b32 p, %59, 0;\n"  // %59 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n208k32.f16.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"
+      "{%52,  %53,  %54,  %55},"  // a00..a03
+      " %56,"  // desc_b
+      " %57, %58,"  // e, spsel
+      " p,    %60,  %61,  %62;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%51, all read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs are numbered after the 52 outputs, starting at %52
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n" operands must be compile-time integer constants
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted, only the invalid-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x216x32 F16+=F16*F16 -- SS form: A and B are both passed as 64-bit operands (desc_a, desc_b); emits PTX wgmma.mma_async.sp m64n216k32.
+template <
+  GMMA::Major tnspA,  // A layout tag; forwarded as the immediate just before tnspB
+  GMMA::Major tnspB,  // B layout tag; forwarded as the last immediate operand of the instruction
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the immediate right after predicate p
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded as the immediate right after scaleA
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded as the immediate right after `e`
+>
+struct GMMA_64x216x32_F16F16F16_SS
+{
+  using DRegisters = void;  // no separate output list: fma updates the accumulators in place
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // `e` (presumably the sparsity metadata word; confirm against the PTX operand list)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[54];  // d00..d53
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit A operand (presumably a shared-memory matrix descriptor)
+      uint64_t const& desc_b,  // 64-bit B operand (presumably a shared-memory matrix descriptor)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d53: accumulators, read and written
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53,
+      uint32_t const& e,  // 32-bit operand placed right after desc_b in the instruction
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // only tested against zero to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // logging hook defined elsewhere; gets the source line and both 64-bit operands
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside this PTX brace scope
+      "setp.ne.b32 p, %58, 0;\n"  // %58 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n216k32.f16.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53},"
+      " %54,"  // desc_a
+      " %55,"  // desc_b
+      " %56, %57,"  // e, spsel
+      " p,    %59,  %60,  %61,  %62;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%53, all read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53)
+      :  "l"(desc_a),  // inputs are numbered after the 54 outputs, starting at %54
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n" operands must be compile-time integer constants
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted, only the invalid-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x216x32 F16+=F16*F16 -- RS form: A is passed in four 32-bit registers, B as the 64-bit operand desc_b; emits PTX wgmma.mma_async.sp m64n216k32.
+template <
+  GMMA::Major tnspA,  // A layout tag; in this struct only the static_assert below reads it
+  GMMA::Major tnspB,  // B layout tag; forwarded as the last immediate operand of the instruction
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the immediate right after predicate p
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded as the immediate right after scaleA
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded as the immediate right after `e`
+>
+struct GMMA_64x216x32_F16F16F16_RS
+{
+  using DRegisters = void;  // no separate output list: fma updates the accumulators in place
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // `e` (presumably the sparsity metadata word; confirm against the PTX operand list)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[54];  // d00..d53
+
+  static_assert(tnspA == GMMA::Major::K,  // any other A layout is rejected at compile time
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand registers
+      uint64_t const& desc_b,  // 64-bit B operand (presumably a shared-memory matrix descriptor)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d53: accumulators, read and written
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53,
+      uint32_t const& e,  // 32-bit operand placed right after desc_b in the instruction
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // only tested against zero to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // logging hook defined elsewhere; gets the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside this PTX brace scope
+      "setp.ne.b32 p, %61, 0;\n"  // %61 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n216k32.f16.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53},"
+      "{%54,  %55,  %56,  %57},"  // a00..a03
+      " %58,"  // desc_b
+      " %59, %60,"  // e, spsel
+      " p,    %62,  %63,  %64;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%53, all read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs are numbered after the 54 outputs, starting at %54
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n" operands must be compile-time integer constants
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted, only the invalid-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x32 F16+=F16*F16 -- SS form: A and B are both passed as 64-bit operands (desc_a, desc_b); emits PTX wgmma.mma_async.sp m64n224k32.
+template <
+  GMMA::Major tnspA,  // A layout tag; forwarded as the immediate just before tnspB
+  GMMA::Major tnspB,  // B layout tag; forwarded as the last immediate operand of the instruction
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the immediate right after predicate p
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded as the immediate right after scaleA
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded as the immediate right after `e`
+>
+struct GMMA_64x224x32_F16F16F16_SS
+{
+  using DRegisters = void;  // no separate output list: fma updates the accumulators in place
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // `e` (presumably the sparsity metadata word; confirm against the PTX operand list)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[56];  // d00..d55
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit A operand (presumably a shared-memory matrix descriptor)
+      uint64_t const& desc_b,  // 64-bit B operand (presumably a shared-memory matrix descriptor)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d55: accumulators, read and written
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t const& e,  // 32-bit operand placed right after desc_b in the instruction
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // only tested against zero to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // logging hook defined elsewhere; gets the source line and both 64-bit operands
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside this PTX brace scope
+      "setp.ne.b32 p, %60, 0;\n"  // %60 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n224k32.f16.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      " %56,"  // desc_a
+      " %57,"  // desc_b
+      " %58, %59,"  // e, spsel
+      " p,    %61,  %62,  %63,  %64;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%55, all read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "l"(desc_a),  // inputs are numbered after the 56 outputs, starting at %56
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n" operands must be compile-time integer constants
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted, only the invalid-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x32 F16+=F16*F16 -- RS form: A is passed in four 32-bit registers, B as the 64-bit operand desc_b; emits PTX wgmma.mma_async.sp m64n224k32.
+template <
+  GMMA::Major tnspA,  // A layout tag; in this struct only the static_assert below reads it
+  GMMA::Major tnspB,  // B layout tag; forwarded as the last immediate operand of the instruction
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the immediate right after predicate p
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded as the immediate right after scaleA
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded as the immediate right after `e`
+>
+struct GMMA_64x224x32_F16F16F16_RS
+{
+  using DRegisters = void;  // no separate output list: fma updates the accumulators in place
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // `e` (presumably the sparsity metadata word; confirm against the PTX operand list)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[56];  // d00..d55
+
+  static_assert(tnspA == GMMA::Major::K,  // any other A layout is rejected at compile time
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand registers
+      uint64_t const& desc_b,  // 64-bit B operand (presumably a shared-memory matrix descriptor)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d55: accumulators, read and written
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t const& e,  // 32-bit operand placed right after desc_b in the instruction
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // only tested against zero to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // logging hook defined elsewhere; gets the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside this PTX brace scope
+      "setp.ne.b32 p, %63, 0;\n"  // %63 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n224k32.f16.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      "{%56,  %57,  %58,  %59},"  // a00..a03
+      " %60,"  // desc_b
+      " %61, %62,"  // e, spsel
+      " p,    %64,  %65,  %66;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%55, all read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs are numbered after the 56 outputs, starting at %56
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n" operands must be compile-time integer constants
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted, only the invalid-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x232x32 F16+=F16*F16 -- SS form: A and B are both passed as 64-bit operands (desc_a, desc_b); emits PTX wgmma.mma_async.sp m64n232k32.
+template <
+  GMMA::Major tnspA,  // A layout tag; forwarded as the immediate just before tnspB
+  GMMA::Major tnspB,  // B layout tag; forwarded as the last immediate operand of the instruction
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the immediate right after predicate p
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded as the immediate right after scaleA
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded as the immediate right after `e`
+>
+struct GMMA_64x232x32_F16F16F16_SS
+{
+  using DRegisters = void;  // no separate output list: fma updates the accumulators in place
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // `e` (presumably the sparsity metadata word; confirm against the PTX operand list)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[58];  // d00..d57
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit A operand (presumably a shared-memory matrix descriptor)
+      uint64_t const& desc_b,  // 64-bit B operand (presumably a shared-memory matrix descriptor)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d57: accumulators, read and written
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57,
+      uint32_t const& e,  // 32-bit operand placed right after desc_b in the instruction
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // only tested against zero to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // logging hook defined elsewhere; gets the source line and both 64-bit operands
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside this PTX brace scope
+      "setp.ne.b32 p, %62, 0;\n"  // %62 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n232k32.f16.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57},"
+      " %58,"  // desc_a
+      " %59,"  // desc_b
+      " %60, %61,"  // e, spsel
+      " p,    %63,  %64,  %65,  %66;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%57, all read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57)
+      :  "l"(desc_a),  // inputs are numbered after the 58 outputs, starting at %58
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n" operands must be compile-time integer constants
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted, only the invalid-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x232x32 F16+=F16*F16 -- RS form: A is passed in four 32-bit registers, B as the 64-bit operand desc_b; emits PTX wgmma.mma_async.sp m64n232k32.
+template <
+  GMMA::Major tnspA,  // A layout tag; in this struct only the static_assert below reads it
+  GMMA::Major tnspB,  // B layout tag; forwarded as the last immediate operand of the instruction
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the immediate right after predicate p
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded as the immediate right after scaleA
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded as the immediate right after `e`
+>
+struct GMMA_64x232x32_F16F16F16_RS
+{
+  using DRegisters = void;  // no separate output list: fma updates the accumulators in place
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // `e` (presumably the sparsity metadata word; confirm against the PTX operand list)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[58];  // d00..d57
+
+  static_assert(tnspA == GMMA::Major::K,  // any other A layout is rejected at compile time
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand registers
+      uint64_t const& desc_b,  // 64-bit B operand (presumably a shared-memory matrix descriptor)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d57: accumulators, read and written
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57,
+      uint32_t const& e,  // 32-bit operand placed right after desc_b in the instruction
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // only tested against zero to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // logging hook defined elsewhere; gets the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside this PTX brace scope
+      "setp.ne.b32 p, %65, 0;\n"  // %65 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n232k32.f16.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57},"
+      "{%58,  %59,  %60,  %61},"  // a00..a03
+      " %62,"  // desc_b
+      " %63, %64,"  // e, spsel
+      " p,    %66,  %67,  %68;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%57, all read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs are numbered after the 58 outputs, starting at %58
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n" operands must be compile-time integer constants
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted, only the invalid-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x32 F16+=F16*F16 -- SS form: A and B are both passed as 64-bit operands (desc_a, desc_b); emits PTX wgmma.mma_async.sp m64n240k32.
+template <
+  GMMA::Major tnspA,  // A layout tag; forwarded as the immediate just before tnspB
+  GMMA::Major tnspB,  // B layout tag; forwarded as the last immediate operand of the instruction
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the immediate right after predicate p
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded as the immediate right after scaleA
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded as the immediate right after `e`
+>
+struct GMMA_64x240x32_F16F16F16_SS
+{
+  using DRegisters = void;  // no separate output list: fma updates the accumulators in place
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // `e` (presumably the sparsity metadata word; confirm against the PTX operand list)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[60];  // d00..d59
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit A operand (presumably a shared-memory matrix descriptor)
+      uint64_t const& desc_b,  // 64-bit B operand (presumably a shared-memory matrix descriptor)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d59: accumulators, read and written
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t const& e,  // 32-bit operand placed right after desc_b in the instruction
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // only tested against zero to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // logging hook defined elsewhere; gets the source line and both 64-bit operands
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside this PTX brace scope
+      "setp.ne.b32 p, %64, 0;\n"  // %64 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n240k32.f16.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"
+      " %60,"  // desc_a
+      " %61,"  // desc_b
+      " %62, %63,"  // e, spsel
+      " p,    %65,  %66,  %67,  %68;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%59, all read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
+      :  "l"(desc_a),  // inputs are numbered after the 60 outputs, starting at %60
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n" operands must be compile-time integer constants
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted, only the invalid-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x32 F16+=F16*F16 (RS form: A in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not passed to the asm
+  GMMA::Major tnspB,  // passed to the instruction as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // scaleA, scaleB and spsel are passed as immediates too
+>
+struct GMMA_64x240x32_F16F16F16_RS
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the C registers in place
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- TODO confirm)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[60];  // d00..d59
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // NOTE(review): p presumably selects accumulate vs. overwrite of D -- confirm against the PTX wgmma docs
+      "setp.ne.b32 p, %67, 0;\n"  // %67 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n240k32.f16.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"
+      "{%60,  %61,  %62,  %63},"
+      " %64,"
+      " %65, %66,"
+      " p,    %68,  %69,  %70;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0-%59: accumulators, bound read-write ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %60-%63: A registers
+         "l"(desc_b),  // %64
+         "r"(e), "n"(int32_t(spsel)),  // %65, %66
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %67 (feeds p), %68-%70 immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x248x32 F16+=F16*F16 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::Major tnspA,  // passed to the instruction as an immediate
+  GMMA::Major tnspB,  // passed to the instruction as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // scaleA, scaleB and spsel are passed as immediates too
+>
+struct GMMA_64x248x32_F16F16F16_SS
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the C registers in place
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- TODO confirm)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[62];  // d00..d61
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // NOTE(review): p presumably selects accumulate vs. overwrite of D -- confirm against the PTX wgmma docs
+      "setp.ne.b32 p, %66, 0;\n"  // %66 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n248k32.f16.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61},"
+      " %62,"
+      " %63,"
+      " %64, %65,"
+      " p,    %67,  %68,  %69,  %70;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0-%61: accumulators, bound read-write ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61)
+      :  "l"(desc_a),  // %62
+         "l"(desc_b),  // %63
+         "r"(e), "n"(int32_t(spsel)),  // %64, %65
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %66 (feeds p), %67-%70 immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x248x32 F16+=F16*F16 (RS form: A in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not passed to the asm
+  GMMA::Major tnspB,  // passed to the instruction as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // scaleA, scaleB and spsel are passed as immediates too
+>
+struct GMMA_64x248x32_F16F16F16_RS
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the C registers in place
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- TODO confirm)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[62];  // d00..d61
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // NOTE(review): p presumably selects accumulate vs. overwrite of D -- confirm against the PTX wgmma docs
+      "setp.ne.b32 p, %69, 0;\n"  // %69 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n248k32.f16.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61},"
+      "{%62,  %63,  %64,  %65},"
+      " %66,"
+      " %67, %68,"
+      " p,    %70,  %71,  %72;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0-%61: accumulators, bound read-write ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %62-%65: A registers
+         "l"(desc_b),  // %66
+         "r"(e), "n"(int32_t(spsel)),  // %67, %68
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %69 (feeds p), %70-%72 immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x24x32 F32+=F16*F16 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::Major tnspA,  // passed to the instruction as an immediate
+  GMMA::Major tnspB,  // passed to the instruction as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // scaleA, scaleB and spsel are passed as immediates too
+>
+struct GMMA_64x24x32_F32F16F16_SS
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the C registers in place
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- TODO confirm)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[12];  // d00..d11
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // NOTE(review): p presumably selects accumulate vs. overwrite of D -- confirm against the PTX wgmma docs
+      "setp.ne.b32 p, %16, 0;\n"  // %16 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n24k32.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      " %12,"
+      " %13,"
+      " %14, %15,"
+      " p,   %17, %18, %19, %20;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0-%11: accumulators, bound read-write ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
+      :  "l"(desc_a),  // %12
+         "l"(desc_b),  // %13
+         "r"(e), "n"(int32_t(spsel)),  // %14, %15
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %16 (feeds p), %17-%20 immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x24x32 F32+=F16*F16 (RS form: A in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not passed to the asm
+  GMMA::Major tnspB,  // passed to the instruction as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // scaleA, scaleB and spsel are passed as immediates too
+>
+struct GMMA_64x24x32_F32F16F16_RS
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the C registers in place
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- TODO confirm)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[12];  // d00..d11
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // NOTE(review): p presumably selects accumulate vs. overwrite of D -- confirm against the PTX wgmma docs
+      "setp.ne.b32 p, %19, 0;\n"  // %19 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n24k32.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      "{%12, %13, %14, %15},"
+      " %16,"
+      " %17, %18,"
+      " p,   %20, %21, %22;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0-%11: accumulators, bound read-write ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %12-%15: A registers
+         "l"(desc_b),  // %16
+         "r"(e), "n"(int32_t(spsel)),  // %17, %18
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %19 (feeds p), %20-%22 immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x40x32 F32+=F16*F16 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::Major tnspA,  // passed to the instruction as an immediate
+  GMMA::Major tnspB,  // passed to the instruction as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // scaleA, scaleB and spsel are passed as immediates too
+>
+struct GMMA_64x40x32_F32F16F16_SS
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the C registers in place
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- TODO confirm)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[20];  // d00..d19
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // NOTE(review): p presumably selects accumulate vs. overwrite of D -- confirm against the PTX wgmma docs
+      "setp.ne.b32 p, %24, 0;\n"  // %24 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n40k32.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"
+      " %20,"
+      " %21,"
+      " %22, %23,"
+      " p,   %25, %26, %27, %28;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0-%19: accumulators, bound read-write ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
+      :  "l"(desc_a),  // %20
+         "l"(desc_b),  // %21
+         "r"(e), "n"(int32_t(spsel)),  // %22, %23
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %24 (feeds p), %25-%28 immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x40x32 F32+=F16*F16 (RS form: A in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not passed to the asm
+  GMMA::Major tnspB,  // passed to the instruction as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // scaleA, scaleB and spsel are passed as immediates too
+>
+struct GMMA_64x40x32_F32F16F16_RS
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the C registers in place
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- TODO confirm)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[20];  // d00..d19
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // NOTE(review): p presumably selects accumulate vs. overwrite of D -- confirm against the PTX wgmma docs
+      "setp.ne.b32 p, %27, 0;\n"  // %27 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n40k32.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"
+      "{%20, %21, %22, %23},"
+      " %24,"
+      " %25, %26,"
+      " p,   %28, %29, %30;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0-%19: accumulators, bound read-write ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %20-%23: A registers
+         "l"(desc_b),  // %24
+         "r"(e), "n"(int32_t(spsel)),  // %25, %26
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %27 (feeds p), %28-%30 immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x48x32 F32+=F16*F16 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::Major tnspA,  // passed to the instruction as an immediate
+  GMMA::Major tnspB,  // passed to the instruction as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // scaleA, scaleB and spsel are passed as immediates too
+>
+struct GMMA_64x48x32_F32F16F16_SS
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the C registers in place
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- TODO confirm)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[24];  // d00..d23
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // NOTE(review): p presumably selects accumulate vs. overwrite of D -- confirm against the PTX wgmma docs
+      "setp.ne.b32 p, %28, 0;\n"  // %28 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n48k32.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      " %24,"
+      " %25,"
+      " %26, %27,"
+      " p,   %29, %30, %31, %32;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0-%23: accumulators, bound read-write ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
+      :  "l"(desc_a),  // %24
+         "l"(desc_b),  // %25
+         "r"(e), "n"(int32_t(spsel)),  // %26, %27
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %28 (feeds p), %29-%32 immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x48x32 F32+=F16*F16 (RS form: A in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not passed to the asm
+  GMMA::Major tnspB,  // passed to the instruction as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // scaleA, scaleB and spsel are passed as immediates too
+>
+struct GMMA_64x48x32_F32F16F16_RS
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the C registers in place
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- TODO confirm)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[24];  // d00..d23
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // NOTE(review): p presumably selects accumulate vs. overwrite of D -- confirm against the PTX wgmma docs
+      "setp.ne.b32 p, %31, 0;\n"  // %31 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n48k32.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      "{%24, %25, %26, %27},"
+      " %28,"
+      " %29, %30,"
+      " p,   %32, %33, %34;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0-%23: accumulators, bound read-write ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %24-%27: A registers
+         "l"(desc_b),  // %28
+         "r"(e), "n"(int32_t(spsel)),  // %29, %30
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %31 (feeds p), %32-%34 immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x56x32 F32+=F16*F16 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::Major tnspA,  // passed to the instruction as an immediate
+  GMMA::Major tnspB,  // passed to the instruction as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // scaleA, scaleB and spsel are passed as immediates too
+>
+struct GMMA_64x56x32_F32F16F16_SS
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the C registers in place
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- TODO confirm)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[28];  // d00..d27
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // NOTE(review): p presumably selects accumulate vs. overwrite of D -- confirm against the PTX wgmma docs
+      "setp.ne.b32 p, %32, 0;\n"  // %32 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n56k32.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"
+      " %28,"
+      " %29,"
+      " %30, %31,"
+      " p,   %33, %34, %35, %36;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0-%27: accumulators, bound read-write ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
+      :  "l"(desc_a),  // %28
+         "l"(desc_b),  // %29
+         "r"(e), "n"(int32_t(spsel)),  // %30, %31
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %32 (feeds p), %33-%36 immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x56x32 F32+=F16*F16 (RS form: A in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not passed to the asm
+  GMMA::Major tnspB,  // passed to the instruction as an immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // scaleA, scaleB and spsel are passed as immediates too
+>
+struct GMMA_64x56x32_F32F16F16_RS
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the C registers in place
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- TODO confirm)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[28];  // d00..d27
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // NOTE(review): p presumably selects accumulate vs. overwrite of D -- confirm against the PTX wgmma docs
+      "setp.ne.b32 p, %35, 0;\n"  // %35 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n56k32.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"
+      "{%28, %29, %30, %31},"
+      " %32,"
+      " %33, %34,"
+      " p,   %36, %37, %38;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0-%27: accumulators, bound read-write ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %28-%31: A registers
+         "l"(desc_b),  // %32
+         "r"(e), "n"(int32_t(spsel)),  // %33, %34
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %35 (feeds p), %36-%38 immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x72x32 F32+=F16*F16 -- SS form: A and B are both passed as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::Major tnspB,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand, emitted right after e
+>
+struct GMMA_64x72x32_F32F16F16_SS
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are updated in place ("+f")
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e -- presumably the sparsity metadata word; confirm against the PTX wgmma.sp docs
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[36];  // d00..d35
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %40, 0;\n"  // p = (scale_D != 0); %40 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n72k32.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"  // %0..%35: d00..d35
+      " %36,"  // desc_a
+      " %37,"  // desc_b
+      " %38, %39,"  // e, spsel
+      " p,   %41, %42, %43, %44;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)  // read-write operands occupy %0..%35
+      :  "l"(desc_a),  // %36
+         "l"(desc_b),  // %37
+         "r"(e), "n"(int32_t(spsel)),  // %38, %39
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %40..%44
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x72x32 F32+=F16*F16 -- RS form: A is passed in four 32-bit registers (a00..a03), B as a 64-bit descriptor (desc_b).
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not passed to the asm
+  GMMA::Major tnspB,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand, emitted right after e
+>
+struct GMMA_64x72x32_F32F16F16_RS
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are updated in place ("+f")
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e -- presumably the sparsity metadata word; confirm against the PTX wgmma.sp docs
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[36];  // d00..d35
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %43, 0;\n"  // p = (scale_D != 0); %43 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n72k32.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"  // %0..%35: d00..d35
+      "{%36, %37, %38, %39},"  // a00..a03
+      " %40,"  // desc_b
+      " %41, %42,"  // e, spsel
+      " p,   %44, %45, %46;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)  // read-write operands occupy %0..%35
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %36..%39
+         "l"(desc_b),  // %40
+         "r"(e), "n"(int32_t(spsel)),  // %41, %42
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %43..%46; tnspA is fixed to Major::K by the static_assert
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x32 F32+=F16*F16 -- SS form: A and B are both passed as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::Major tnspB,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand, emitted right after e
+>
+struct GMMA_64x80x32_F32F16F16_SS
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are updated in place ("+f")
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e -- presumably the sparsity metadata word; confirm against the PTX wgmma.sp docs
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[40];  // d00..d39
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %44, 0;\n"  // p = (scale_D != 0); %44 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n80k32.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"  // %0..%39: d00..d39
+      " %40,"  // desc_a
+      " %41,"  // desc_b
+      " %42, %43,"  // e, spsel
+      " p,   %45, %46, %47, %48;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)  // read-write operands occupy %0..%39
+      :  "l"(desc_a),  // %40
+         "l"(desc_b),  // %41
+         "r"(e), "n"(int32_t(spsel)),  // %42, %43
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %44..%48
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x32 F32+=F16*F16 -- RS form: A is passed in four 32-bit registers (a00..a03), B as a 64-bit descriptor (desc_b).
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not passed to the asm
+  GMMA::Major tnspB,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand, emitted right after e
+>
+struct GMMA_64x80x32_F32F16F16_RS
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are updated in place ("+f")
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e -- presumably the sparsity metadata word; confirm against the PTX wgmma.sp docs
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[40];  // d00..d39
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %47, 0;\n"  // p = (scale_D != 0); %47 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n80k32.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"  // %0..%39: d00..d39
+      "{%40, %41, %42, %43},"  // a00..a03
+      " %44,"  // desc_b
+      " %45, %46,"  // e, spsel
+      " p,   %48, %49, %50;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)  // read-write operands occupy %0..%39
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %40..%43
+         "l"(desc_b),  // %44
+         "r"(e), "n"(int32_t(spsel)),  // %45, %46
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %47..%50; tnspA is fixed to Major::K by the static_assert
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x88x32 F32+=F16*F16 -- SS form: A and B are both passed as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::Major tnspB,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand, emitted right after e
+>
+struct GMMA_64x88x32_F32F16F16_SS
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are updated in place ("+f")
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e -- presumably the sparsity metadata word; confirm against the PTX wgmma.sp docs
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[44];  // d00..d43
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %48, 0;\n"  // p = (scale_D != 0); %48 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n88k32.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"  // %0..%43: d00..d43
+      " %44,"  // desc_a
+      " %45,"  // desc_b
+      " %46, %47,"  // e, spsel
+      " p,   %49, %50, %51, %52;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)  // read-write operands occupy %0..%43
+      :  "l"(desc_a),  // %44
+         "l"(desc_b),  // %45
+         "r"(e), "n"(int32_t(spsel)),  // %46, %47
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %48..%52
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x88x32 F32+=F16*F16 -- RS form: A is passed in four 32-bit registers (a00..a03), B as a 64-bit descriptor (desc_b).
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not passed to the asm
+  GMMA::Major tnspB,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand, emitted right after e
+>
+struct GMMA_64x88x32_F32F16F16_RS
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are updated in place ("+f")
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e -- presumably the sparsity metadata word; confirm against the PTX wgmma.sp docs
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[44];  // d00..d43
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %51, 0;\n"  // p = (scale_D != 0); %51 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n88k32.f32.f16.f16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"  // %0..%43: d00..d43
+      "{%44, %45, %46, %47},"  // a00..a03
+      " %48,"  // desc_b
+      " %49, %50,"  // e, spsel
+      " p,   %52, %53, %54;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)  // read-write operands occupy %0..%43
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %44..%47
+         "l"(desc_b),  // %48
+         "r"(e), "n"(int32_t(spsel)),  // %49, %50
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %51..%54; tnspA is fixed to Major::K by the static_assert
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x104x32 F32+=F16*F16 -- SS form: A and B are both passed as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::Major tnspB,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand, emitted right after e
+>
+struct GMMA_64x104x32_F32F16F16_SS
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are updated in place ("+f")
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e -- presumably the sparsity metadata word; confirm against the PTX wgmma.sp docs
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[52];  // d00..d51
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %56, 0;\n"  // p = (scale_D != 0); %56 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n104k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"  // %0..%51: d00..d51
+      " %52,"  // desc_a
+      " %53,"  // desc_b
+      " %54, %55,"  // e, spsel
+      " p,    %57,  %58,  %59,  %60;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)  // read-write operands occupy %0..%51
+      :  "l"(desc_a),  // %52
+         "l"(desc_b),  // %53
+         "r"(e), "n"(int32_t(spsel)),  // %54, %55
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %56..%60
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x104x32 F32+=F16*F16 -- RS form: A is passed in four 32-bit registers (a00..a03), B as a 64-bit descriptor (desc_b).
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not passed to the asm
+  GMMA::Major tnspB,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand, emitted right after e
+>
+struct GMMA_64x104x32_F32F16F16_RS
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are updated in place ("+f")
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e -- presumably the sparsity metadata word; confirm against the PTX wgmma.sp docs
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[52];  // d00..d51
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %59, 0;\n"  // p = (scale_D != 0); %59 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n104k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"  // %0..%51: d00..d51
+      "{%52,  %53,  %54,  %55},"  // a00..a03
+      " %56,"  // desc_b
+      " %57, %58,"  // e, spsel
+      " p,    %60,  %61,  %62;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)  // read-write operands occupy %0..%51
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %52..%55
+         "l"(desc_b),  // %56
+         "r"(e), "n"(int32_t(spsel)),  // %57, %58
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %59..%62; tnspA is fixed to Major::K by the static_assert
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x32 F32+=F16*F16 -- SS form: A and B are both passed as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::Major tnspB,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand, emitted right after e
+>
+struct GMMA_64x112x32_F32F16F16_SS
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are updated in place ("+f")
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e -- presumably the sparsity metadata word; confirm against the PTX wgmma.sp docs
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[56];  // d00..d55
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %60, 0;\n"  // p = (scale_D != 0); %60 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n112k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"  // %0..%55: d00..d55
+      " %56,"  // desc_a
+      " %57,"  // desc_b
+      " %58, %59,"  // e, spsel
+      " p,    %61,  %62,  %63,  %64;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)  // read-write operands occupy %0..%55
+      :  "l"(desc_a),  // %56
+         "l"(desc_b),  // %57
+         "r"(e), "n"(int32_t(spsel)),  // %58, %59
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %60..%64
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x32 F32+=F16*F16 (RS variant: A passed in registers, B passed as a 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // layout tag for A; the static_assert below requires GMMA::Major::K
+  GMMA::Major tnspB,  // layout tag for B; emitted as the last immediate operand of the instruction
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate scale operand for A
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate scale operand for B
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate emitted right after the register operand e
+>
+struct GMMA_64x112x32_F32F16F16_RS  // wraps wgmma.mma_async.sp.sync.aligned.m64n112k32.f32.f16.f16
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are updated in place
+  using ARegisters = uint32_t[4];  // a00..a03 in fma()
+  using ERegisters = uint32_t[1];  // e in fma(); presumably the sparsity metadata -- confirm against the PTX ISA
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = float[56];  // d00..d55 in fma()
+
+  static_assert(tnspA == GMMA::Major::K,  // compile-time check; tnspA is not forwarded to the instruction in this variant
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operands, bound as "r"
+      uint64_t const& desc_b,  // B operand, bound as a 64-bit "l" value
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d55: accumulators, read and written ("+f")
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      uint32_t const& e,  // 32-bit register operand emitted just before spsel
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; nonzero sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %63, 0;\n"  // p = (scale_D != 0); %63 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n112k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%55: accumulators d00..d55
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      "{%56,  %57,  %58,  %59},"  // a00..a03
+      " %60,"  // desc_b
+      " %61, %62,"  // e, spsel
+      " p,    %64,  %65,  %66;\n"  // predicate from scale_D, then scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs: 56 read-write float accumulators
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs: numbering continues at %56 after the accumulators
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // "n" operands are compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x120x32 F32+=F16*F16 (SS variant: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::Major tnspA,  // layout tag for A; emitted as an immediate operand of the instruction
+  GMMA::Major tnspB,  // layout tag for B; emitted as the last immediate operand of the instruction
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate scale operand for A
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate scale operand for B
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate emitted right after the register operand e
+>
+struct GMMA_64x120x32_F32F16F16_SS  // wraps wgmma.mma_async.sp.sync.aligned.m64n120k32.f32.f16.f16
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are updated in place
+  using ARegisters = uint64_t[1];  // desc_a in fma()
+  using ERegisters = uint32_t[1];  // e in fma(); presumably the sparsity metadata -- confirm against the PTX ISA
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = float[60];  // d00..d59 in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand, bound as a 64-bit "l" value
+      uint64_t const& desc_b,  // B operand, bound as a 64-bit "l" value
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d59: accumulators, read and written ("+f")
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      uint32_t const& e,  // 32-bit register operand emitted just before spsel
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; nonzero sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %64, 0;\n"  // p = (scale_D != 0); %64 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n120k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%59: accumulators d00..d59
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"
+      " %60,"  // desc_a
+      " %61,"  // desc_b
+      " %62, %63,"  // e, spsel
+      " p,    %65,  %66,  %67,  %68;\n"  // predicate from scale_D, then scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs: 60 read-write float accumulators
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
+      :  "l"(desc_a),  // inputs: numbering continues at %60 after the accumulators
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // "n" operands are compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x120x32 F32+=F16*F16 (RS variant: A passed in registers, B passed as a 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // layout tag for A; the static_assert below requires GMMA::Major::K
+  GMMA::Major tnspB,  // layout tag for B; emitted as the last immediate operand of the instruction
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate scale operand for A
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate scale operand for B
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate emitted right after the register operand e
+>
+struct GMMA_64x120x32_F32F16F16_RS  // wraps wgmma.mma_async.sp.sync.aligned.m64n120k32.f32.f16.f16
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are updated in place
+  using ARegisters = uint32_t[4];  // a00..a03 in fma()
+  using ERegisters = uint32_t[1];  // e in fma(); presumably the sparsity metadata -- confirm against the PTX ISA
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = float[60];  // d00..d59 in fma()
+
+  static_assert(tnspA == GMMA::Major::K,  // compile-time check; tnspA is not forwarded to the instruction in this variant
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operands, bound as "r"
+      uint64_t const& desc_b,  // B operand, bound as a 64-bit "l" value
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d59: accumulators, read and written ("+f")
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      uint32_t const& e,  // 32-bit register operand emitted just before spsel
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; nonzero sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %67, 0;\n"  // p = (scale_D != 0); %67 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n120k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%59: accumulators d00..d59
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"
+      "{%60,  %61,  %62,  %63},"  // a00..a03
+      " %64,"  // desc_b
+      " %65, %66,"  // e, spsel
+      " p,    %68,  %69,  %70;\n"  // predicate from scale_D, then scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs: 60 read-write float accumulators
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs: numbering continues at %60 after the accumulators
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // "n" operands are compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x136x32 F32+=F16*F16 (SS variant: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::Major tnspA,  // layout tag for A; emitted as an immediate operand of the instruction
+  GMMA::Major tnspB,  // layout tag for B; emitted as the last immediate operand of the instruction
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate scale operand for A
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate scale operand for B
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate emitted right after the register operand e
+>
+struct GMMA_64x136x32_F32F16F16_SS  // wraps wgmma.mma_async.sp.sync.aligned.m64n136k32.f32.f16.f16
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are updated in place
+  using ARegisters = uint64_t[1];  // desc_a in fma()
+  using ERegisters = uint32_t[1];  // e in fma(); presumably the sparsity metadata -- confirm against the PTX ISA
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = float[68];  // d00..d67 in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand, bound as a 64-bit "l" value
+      uint64_t const& desc_b,  // B operand, bound as a 64-bit "l" value
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d67: accumulators, read and written ("+f")
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      uint32_t const& e,  // 32-bit register operand emitted just before spsel
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; nonzero sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %72, 0;\n"  // p = (scale_D != 0); %72 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n136k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%67: accumulators d00..d67
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67},"
+      " %68,"  // desc_a
+      " %69,"  // desc_b
+      " %70, %71,"  // e, spsel
+      " p,    %73,  %74,  %75,  %76;\n"  // predicate from scale_D, then scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs: 68 read-write float accumulators
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
+      :  "l"(desc_a),  // inputs: numbering continues at %68 after the accumulators
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // "n" operands are compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x136x32 F32+=F16*F16 (RS variant: A passed in registers, B passed as a 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // layout tag for A; the static_assert below requires GMMA::Major::K
+  GMMA::Major tnspB,  // layout tag for B; emitted as the last immediate operand of the instruction
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate scale operand for A
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate scale operand for B
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate emitted right after the register operand e
+>
+struct GMMA_64x136x32_F32F16F16_RS  // wraps wgmma.mma_async.sp.sync.aligned.m64n136k32.f32.f16.f16
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are updated in place
+  using ARegisters = uint32_t[4];  // a00..a03 in fma()
+  using ERegisters = uint32_t[1];  // e in fma(); presumably the sparsity metadata -- confirm against the PTX ISA
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = float[68];  // d00..d67 in fma()
+
+  static_assert(tnspA == GMMA::Major::K,  // compile-time check; tnspA is not forwarded to the instruction in this variant
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operands, bound as "r"
+      uint64_t const& desc_b,  // B operand, bound as a 64-bit "l" value
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d67: accumulators, read and written ("+f")
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      uint32_t const& e,  // 32-bit register operand emitted just before spsel
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; nonzero sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %75, 0;\n"  // p = (scale_D != 0); %75 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n136k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%67: accumulators d00..d67
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67},"
+      "{%68,  %69,  %70,  %71},"  // a00..a03
+      " %72,"  // desc_b
+      " %73, %74,"  // e, spsel
+      " p,    %76,  %77,  %78;\n"  // predicate from scale_D, then scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs: 68 read-write float accumulators
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs: numbering continues at %68 after the accumulators
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // "n" operands are compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x32 F32+=F16*F16 (SS variant: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::Major tnspA,  // layout tag for A; emitted as an immediate operand of the instruction
+  GMMA::Major tnspB,  // layout tag for B; emitted as the last immediate operand of the instruction
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate scale operand for A
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate scale operand for B
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate emitted right after the register operand e
+>
+struct GMMA_64x144x32_F32F16F16_SS  // wraps wgmma.mma_async.sp.sync.aligned.m64n144k32.f32.f16.f16
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are updated in place
+  using ARegisters = uint64_t[1];  // desc_a in fma()
+  using ERegisters = uint32_t[1];  // e in fma(); presumably the sparsity metadata -- confirm against the PTX ISA
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = float[72];  // d00..d71 in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand, bound as a 64-bit "l" value
+      uint64_t const& desc_b,  // B operand, bound as a 64-bit "l" value
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d71: accumulators, read and written ("+f")
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      uint32_t const& e,  // 32-bit register operand emitted just before spsel
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; nonzero sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %76, 0;\n"  // p = (scale_D != 0); %76 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n144k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%71: accumulators d00..d71
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      " %72,"  // desc_a
+      " %73,"  // desc_b
+      " %74, %75,"  // e, spsel
+      " p,    %77,  %78,  %79,  %80;\n"  // predicate from scale_D, then scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs: 72 read-write float accumulators
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
+      :  "l"(desc_a),  // inputs: numbering continues at %72 after the accumulators
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // "n" operands are compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x32 F32+=F16*F16 (RS variant: A passed in registers, B passed as a 64-bit descriptor)
+template <
+  GMMA::Major tnspA,  // layout tag for A; the static_assert below requires GMMA::Major::K
+  GMMA::Major tnspB,  // layout tag for B; emitted as the last immediate operand of the instruction
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate scale operand for A
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate scale operand for B
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate emitted right after the register operand e
+>
+struct GMMA_64x144x32_F32F16F16_RS  // wraps wgmma.mma_async.sp.sync.aligned.m64n144k32.f32.f16.f16
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are updated in place
+  using ARegisters = uint32_t[4];  // a00..a03 in fma()
+  using ERegisters = uint32_t[1];  // e in fma(); presumably the sparsity metadata -- confirm against the PTX ISA
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = float[72];  // d00..d71 in fma()
+
+  static_assert(tnspA == GMMA::Major::K,  // compile-time check; tnspA is not forwarded to the instruction in this variant
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operands, bound as "r"
+      uint64_t const& desc_b,  // B operand, bound as a 64-bit "l" value
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d71: accumulators, read and written ("+f")
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      uint32_t const& e,  // 32-bit register operand emitted just before spsel
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; nonzero sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %79, 0;\n"  // p = (scale_D != 0); %79 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n144k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%71: accumulators d00..d71
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      "{%72,  %73,  %74,  %75},"  // a00..a03
+      " %76,"  // desc_b
+      " %77, %78,"  // e, spsel
+      " p,    %80,  %81,  %82;\n"  // predicate from scale_D, then scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs: 72 read-write float accumulators
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs: numbering continues at %72 after the accumulators
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // "n" operands are compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x152x32 F32+=F16*F16 (SS variant: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::Major tnspA,  // layout tag for A; emitted as an immediate operand of the instruction
+  GMMA::Major tnspB,  // layout tag for B; emitted as the last immediate operand of the instruction
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate scale operand for A
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate scale operand for B
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate emitted right after the register operand e
+>
+struct GMMA_64x152x32_F32F16F16_SS  // wraps wgmma.mma_async.sp.sync.aligned.m64n152k32.f32.f16.f16
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are updated in place
+  using ARegisters = uint64_t[1];  // desc_a in fma()
+  using ERegisters = uint32_t[1];  // e in fma(); presumably the sparsity metadata -- confirm against the PTX ISA
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = float[76];  // d00..d75 in fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand, bound as a 64-bit "l" value
+      uint64_t const& desc_b,  // B operand, bound as a 64-bit "l" value
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d75: accumulators, read and written ("+f")
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      uint32_t const& e,  // 32-bit register operand emitted just before spsel
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; nonzero sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %80, 0;\n"  // p = (scale_D != 0); %80 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n152k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%75: accumulators d00..d75
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75},"
+      " %76,"  // desc_a
+      " %77,"  // desc_b
+      " %78, %79,"  // e, spsel
+      " p,    %81,  %82,  %83,  %84;\n"  // predicate from scale_D, then scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs: 76 read-write float accumulators
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
+      :  "l"(desc_a),  // inputs: numbering continues at %76 after the accumulators
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // "n" operands are compile-time immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x152x32 F32+=F16*F16 -- RS variant: A is passed as four 32-bit registers, B as a 64-bit descriptor.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x152x32_F32F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03 in fma()
+  using ERegisters = uint32_t[1];  // e in fma()
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = float[76];    // d00..d75 in fma()
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void  // fma(): issues one wgmma.mma_async.sp instruction; d00..d75 are in/out operands.
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %83, 0;\n"  // p = (scale_D != 0); %83 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n152k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75},"  // %0..%75: accumulators d00..d75
+      "{%76,  %77,  %78,  %79},"  // a00..a03
+      " %80,"  // desc_b
+      " %81, %82,"  // e, spsel
+      " p,    %84,  %85,  %86;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": each accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": template parameters are passed as immediate constants
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted, only the invalid-path macro below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x32 F32+=F16*F16 -- SS variant: A and B are both passed as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x160x32_F32F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a in fma()
+  using ERegisters = uint32_t[1];  // e in fma()
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = float[80];    // d00..d79 in fma()
+
+  CUTE_HOST_DEVICE static void  // fma(): issues one wgmma.mma_async.sp instruction; d00..d79 are in/out operands.
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %84, 0;\n"  // p = (scale_D != 0); %84 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n160k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"  // %0..%79: accumulators d00..d79
+      " %80,"  // desc_a
+      " %81,"  // desc_b
+      " %82, %83,"  // e, spsel
+      " p,    %85,  %86,  %87,  %88;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": each accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": template parameters are passed as immediate constants
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted, only the invalid-path macro below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x32 F32+=F16*F16 -- RS variant: A is passed as four 32-bit registers, B as a 64-bit descriptor.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x160x32_F32F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03 in fma()
+  using ERegisters = uint32_t[1];  // e in fma()
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = float[80];    // d00..d79 in fma()
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void  // fma(): issues one wgmma.mma_async.sp instruction; d00..d79 are in/out operands.
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %87, 0;\n"  // p = (scale_D != 0); %87 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n160k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"  // %0..%79: accumulators d00..d79
+      "{%80,  %81,  %82,  %83},"  // a00..a03
+      " %84,"  // desc_b
+      " %85, %86,"  // e, spsel
+      " p,    %88,  %89,  %90;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": each accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": template parameters are passed as immediate constants
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted, only the invalid-path macro below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x168x32 F32+=F16*F16 -- SS variant: A and B are both passed as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x168x32_F32F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a in fma()
+  using ERegisters = uint32_t[1];  // e in fma()
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = float[84];    // d00..d83 in fma()
+
+  CUTE_HOST_DEVICE static void  // fma(): issues one wgmma.mma_async.sp instruction; d00..d83 are in/out operands.
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %88, 0;\n"  // p = (scale_D != 0); %88 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n168k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83},"  // %0..%83: accumulators d00..d83
+      " %84,"  // desc_a
+      " %85,"  // desc_b
+      " %86, %87,"  // e, spsel
+      " p,    %89,  %90,  %91,  %92;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": each accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": template parameters are passed as immediate constants
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted, only the invalid-path macro below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x168x32 F32+=F16*F16 -- RS variant: A is passed as four 32-bit registers, B as a 64-bit descriptor.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x168x32_F32F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03 in fma()
+  using ERegisters = uint32_t[1];  // e in fma()
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = float[84];    // d00..d83 in fma()
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void  // fma(): issues one wgmma.mma_async.sp instruction; d00..d83 are in/out operands.
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %91, 0;\n"  // p = (scale_D != 0); %91 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n168k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83},"  // %0..%83: accumulators d00..d83
+      "{%84,  %85,  %86,  %87},"  // a00..a03
+      " %88,"  // desc_b
+      " %89, %90,"  // e, spsel
+      " p,    %92,  %93,  %94;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": each accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": template parameters are passed as immediate constants
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted, only the invalid-path macro below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x32 F32+=F16*F16 -- SS variant: A and B are both passed as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x176x32_F32F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a in fma()
+  using ERegisters = uint32_t[1];  // e in fma()
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = float[88];    // d00..d87 in fma()
+
+  CUTE_HOST_DEVICE static void  // fma(): issues one wgmma.mma_async.sp instruction; d00..d87 are in/out operands.
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %92, 0;\n"  // p = (scale_D != 0); %92 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n176k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"  // %0..%87: accumulators d00..d87
+      " %88,"  // desc_a
+      " %89,"  // desc_b
+      " %90, %91,"  // e, spsel
+      " p,    %93,  %94,  %95,  %96;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": each accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": template parameters are passed as immediate constants
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted, only the invalid-path macro below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x32 F32+=F16*F16 -- RS form: A comes from four 32-bit registers, B from the 64-bit desc_b operand; fma() issues one wgmma.mma_async.sp instruction.
+template <
+  GMMA::Major tnspA,  // must be GMMA::Major::K (static_assert below); not forwarded to the asm
+  GMMA::Major tnspB,  // forwarded as the last immediate operand of the instruction
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand, first after the scale-D predicate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand, second after the scale-D predicate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand placed right after e
+>
+struct GMMA_64x176x32_F32F16F16_RS
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are "+f" read-write operands
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[88];  // d00..d87
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand registers
+      uint64_t const& desc_b,  // B operand, passed as a 64-bit value
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d87: accumulators, updated in place
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      uint32_t const& e,  // single 32-bit register operand placed after desc_b in the instruction
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %95, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n176k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%87: d00..d87
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      "{%88,  %89,  %90,  %91},"  // a00..a03
+      " %92,"  // desc_b
+      " %93, %94,"  // e, spsel
+      " p,    %96,  %97,  %98;\n"  // scale-D predicate, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands %0..%87: read and written ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // operands %88..%91
+         "l"(desc_b),  // operand %92
+         "r"(e), "n"(int32_t(spsel)),  // operands %93, %94
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // operands %95..%98
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x184x32 F32+=F16*F16 -- SS form: A and B are both passed as 64-bit operands (desc_a, desc_b); fma() issues one wgmma.mma_async.sp instruction.
+template <
+  GMMA::Major tnspA,  // forwarded as the second-to-last immediate operand of the instruction
+  GMMA::Major tnspB,  // forwarded as the last immediate operand of the instruction
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand, first after the scale-D predicate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand, second after the scale-D predicate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand placed right after e
+>
+struct GMMA_64x184x32_F32F16F16_SS
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are "+f" read-write operands
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[92];  // d00..d91
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand, passed as a 64-bit value
+      uint64_t const& desc_b,  // B operand, passed as a 64-bit value
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d91: accumulators, updated in place
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      uint32_t const& e,  // single 32-bit register operand placed after desc_b in the instruction
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %96, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n184k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%91: d00..d91
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91},"
+      " %92,"  // desc_a
+      " %93,"  // desc_b
+      " %94, %95,"  // e, spsel
+      " p,    %97,  %98,  %99,  %100;\n"  // scale-D predicate, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands %0..%91: read and written ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
+      :  "l"(desc_a),  // operand %92
+         "l"(desc_b),  // operand %93
+         "r"(e), "n"(int32_t(spsel)),  // operands %94, %95
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // operands %96..%100
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x184x32 F32+=F16*F16 -- RS form: A comes from four 32-bit registers, B from the 64-bit desc_b operand; fma() issues one wgmma.mma_async.sp instruction.
+template <
+  GMMA::Major tnspA,  // must be GMMA::Major::K (static_assert below); not forwarded to the asm
+  GMMA::Major tnspB,  // forwarded as the last immediate operand of the instruction
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand, first after the scale-D predicate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand, second after the scale-D predicate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand placed right after e
+>
+struct GMMA_64x184x32_F32F16F16_RS
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are "+f" read-write operands
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[92];  // d00..d91
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand registers
+      uint64_t const& desc_b,  // B operand, passed as a 64-bit value
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d91: accumulators, updated in place
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      uint32_t const& e,  // single 32-bit register operand placed after desc_b in the instruction
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %99, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n184k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%91: d00..d91
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91},"
+      "{%92,  %93,  %94,  %95},"  // a00..a03
+      " %96,"  // desc_b
+      " %97, %98,"  // e, spsel
+      " p,    %100, %101, %102;\n"  // scale-D predicate, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands %0..%91: read and written ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // operands %92..%95
+         "l"(desc_b),  // operand %96
+         "r"(e), "n"(int32_t(spsel)),  // operands %97, %98
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // operands %99..%102
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x200x32 F32+=F16*F16 -- SS form: A and B are both passed as 64-bit operands (desc_a, desc_b); fma() issues one wgmma.mma_async.sp instruction.
+template <
+  GMMA::Major tnspA,  // forwarded as the second-to-last immediate operand of the instruction
+  GMMA::Major tnspB,  // forwarded as the last immediate operand of the instruction
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand, first after the scale-D predicate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand, second after the scale-D predicate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand placed right after e
+>
+struct GMMA_64x200x32_F32F16F16_SS
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are "+f" read-write operands
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[100];  // d000..d099
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand, passed as a 64-bit value
+      uint64_t const& desc_b,  // B operand, passed as a 64-bit value
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d099: accumulators, updated in place
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      uint32_t const& e,  // single 32-bit register operand placed after desc_b in the instruction
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %104, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n200k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%99: d000..d099
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99},"
+      " %100,"  // desc_a
+      " %101,"  // desc_b
+      " %102, %103,"  // e, spsel
+      " p,    %105, %106, %107, %108;\n"  // scale-D predicate, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%99: read and written ("+f")
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
+      :  "l"(desc_a),  // operand %100
+         "l"(desc_b),  // operand %101
+         "r"(e), "n"(int32_t(spsel)),  // operands %102, %103
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // operands %104..%108
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x200x32 F32+=F16*F16 -- RS form: A comes from four 32-bit registers, B from the 64-bit desc_b operand; fma() issues one wgmma.mma_async.sp instruction.
+template <
+  GMMA::Major tnspA,  // must be GMMA::Major::K (static_assert below); not forwarded to the asm
+  GMMA::Major tnspB,  // forwarded as the last immediate operand of the instruction
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand, first after the scale-D predicate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand, second after the scale-D predicate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand placed right after e
+>
+struct GMMA_64x200x32_F32F16F16_RS
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are "+f" read-write operands
+  using ARegisters = uint32_t[4];  // a000..a003
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[100];  // d000..d099
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A operand registers
+      uint64_t const& desc_b,  // B operand, passed as a 64-bit value
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d099: accumulators, updated in place
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      uint32_t const& e,  // single 32-bit register operand placed after desc_b in the instruction
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %107, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n200k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%99: d000..d099
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99},"
+      "{%100, %101, %102, %103},"  // a000..a003
+      " %104,"  // desc_b
+      " %105, %106,"  // e, spsel
+      " p,    %108, %109, %110;\n"  // scale-D predicate, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%99: read and written ("+f")
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // operands %100..%103
+         "l"(desc_b),  // operand %104
+         "r"(e), "n"(int32_t(spsel)),  // operands %105, %106
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // operands %107..%110
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x32 F32+=F16*F16
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x208x32_F32F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[104];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %108, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n208k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      " %104,"
+      " %105,"
+      " %106, %107,"
+      " p,    %109, %110, %111, %112;\n"
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x32 F32+=F16*F16 -- RS form: A is passed as four 32-bit registers, B as a single 64-bit desc_b operand
+template <
+  GMMA::Major tnspA,                              // only checked by the static_assert below; not forwarded to the asm in this RS form
+  GMMA::Major tnspB,                              // forwarded to the asm as the last immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,     // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,     // forwarded to the asm as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero   // forwarded to the asm as an immediate ("n") operand right after `e`
+>
+struct GMMA_64x208x32_F32F16F16_RS
+{
+  using DRegisters = void;          // fma() takes no separate D outputs; d000..d103 are both read and written
+  using ARegisters = uint32_t[4];   // matches fma() arguments a000..a003
+  using ERegisters = uint32_t[1];   // matches fma() argument `e` (presumably the sparsity metadata word -- confirm against PTX wgmma.sp docs)
+  using BRegisters = uint64_t[1];   // matches fma() argument desc_b
+  using CRegisters = float[104];    // matches fma() arguments d000..d103
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A operand -> asm inputs %104..%107
+      uint64_t const& desc_b,  // B operand -> asm input %108
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d103: accumulators, asm operands %0..%103 ("+f", in/out)
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      uint32_t const& e,  // -> asm input %109
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // -> asm input %111; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined; see the #else branch otherwise
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // logging hook defined outside this chunk; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %111, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n208k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%103 = d000..d103
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      "{%104, %105, %106, %107},"  // a000..a003
+      " %108,"  // desc_b
+      " %109, %110,"  // e, spsel
+      " p,    %112, %113, %114;\n"  // predicate p, then scaleA, scaleB, tnspB (%111 is consumed by setp above)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%103: read-write float accumulators
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %104..%107
+         "l"(desc_b),  // %108
+         "r"(e), "n"(int32_t(spsel)),  // %109, %110
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %111..%114
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report invalid use instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x216x32 F32+=F16*F16 -- SS form: A and B are each passed as a single 64-bit operand (desc_a, desc_b)
+template <
+  GMMA::Major tnspA,                              // forwarded to the asm as an immediate ("n") operand
+  GMMA::Major tnspB,                              // forwarded to the asm as the last immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,     // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,     // forwarded to the asm as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero   // forwarded to the asm as an immediate ("n") operand right after `e`
+>
+struct GMMA_64x216x32_F32F16F16_SS
+{
+  using DRegisters = void;          // fma() takes no separate D outputs; d000..d107 are both read and written
+  using ARegisters = uint64_t[1];   // matches fma() argument desc_a
+  using ERegisters = uint32_t[1];   // matches fma() argument `e` (presumably the sparsity metadata word -- confirm against PTX wgmma.sp docs)
+  using BRegisters = uint64_t[1];   // matches fma() argument desc_b
+  using CRegisters = float[108];    // matches fma() arguments d000..d107
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand -> asm input %108
+      uint64_t const& desc_b,  // B operand -> asm input %109
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d107: accumulators, asm operands %0..%107 ("+f", in/out)
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      uint32_t const& e,  // -> asm input %110
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // -> asm input %112; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined; see the #else branch otherwise
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // logging hook defined outside this chunk; receives the source line and both 64-bit operands
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %112, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n216k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%107 = d000..d107
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107},"
+      " %108,"  // desc_a
+      " %109,"  // desc_b
+      " %110, %111,"  // e, spsel
+      " p,    %113, %114, %115, %116;\n"  // predicate p, then scaleA, scaleB, tnspA, tnspB (%112 is consumed by setp above)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%107: read-write float accumulators
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
+      :  "l"(desc_a),  // %108
+         "l"(desc_b),  // %109
+         "r"(e), "n"(int32_t(spsel)),  // %110, %111
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %112..%116
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report invalid use instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x216x32 F32+=F16*F16 -- RS form: A is passed as four 32-bit registers, B as a single 64-bit desc_b operand
+template <
+  GMMA::Major tnspA,                              // only checked by the static_assert below; not forwarded to the asm in this RS form
+  GMMA::Major tnspB,                              // forwarded to the asm as the last immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,     // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,     // forwarded to the asm as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero   // forwarded to the asm as an immediate ("n") operand right after `e`
+>
+struct GMMA_64x216x32_F32F16F16_RS
+{
+  using DRegisters = void;          // fma() takes no separate D outputs; d000..d107 are both read and written
+  using ARegisters = uint32_t[4];   // matches fma() arguments a000..a003
+  using ERegisters = uint32_t[1];   // matches fma() argument `e` (presumably the sparsity metadata word -- confirm against PTX wgmma.sp docs)
+  using BRegisters = uint64_t[1];   // matches fma() argument desc_b
+  using CRegisters = float[108];    // matches fma() arguments d000..d107
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A operand -> asm inputs %108..%111
+      uint64_t const& desc_b,  // B operand -> asm input %112
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d107: accumulators, asm operands %0..%107 ("+f", in/out)
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      uint32_t const& e,  // -> asm input %113
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // -> asm input %115; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined; see the #else branch otherwise
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // logging hook defined outside this chunk; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %115, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n216k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%107 = d000..d107
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107},"
+      "{%108, %109, %110, %111},"  // a000..a003
+      " %112,"  // desc_b
+      " %113, %114,"  // e, spsel
+      " p,    %116, %117, %118;\n"  // predicate p, then scaleA, scaleB, tnspB (%115 is consumed by setp above)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%107: read-write float accumulators
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %108..%111
+         "l"(desc_b),  // %112
+         "r"(e), "n"(int32_t(spsel)),  // %113, %114
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %115..%118
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report invalid use instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x32 F32+=F16*F16 -- SS form: A and B are each passed as a single 64-bit operand (desc_a, desc_b)
+template <
+  GMMA::Major tnspA,                              // forwarded to the asm as an immediate ("n") operand
+  GMMA::Major tnspB,                              // forwarded to the asm as the last immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,     // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,     // forwarded to the asm as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero   // forwarded to the asm as an immediate ("n") operand right after `e`
+>
+struct GMMA_64x224x32_F32F16F16_SS
+{
+  using DRegisters = void;          // fma() takes no separate D outputs; d000..d111 are both read and written
+  using ARegisters = uint64_t[1];   // matches fma() argument desc_a
+  using ERegisters = uint32_t[1];   // matches fma() argument `e` (presumably the sparsity metadata word -- confirm against PTX wgmma.sp docs)
+  using BRegisters = uint64_t[1];   // matches fma() argument desc_b
+  using CRegisters = float[112];    // matches fma() arguments d000..d111
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand -> asm input %112
+      uint64_t const& desc_b,  // B operand -> asm input %113
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d111: accumulators, asm operands %0..%111 ("+f", in/out)
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      uint32_t const& e,  // -> asm input %114
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // -> asm input %116; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined; see the #else branch otherwise
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // logging hook defined outside this chunk; receives the source line and both 64-bit operands
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %116, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n224k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%111 = d000..d111
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      " %112,"  // desc_a
+      " %113,"  // desc_b
+      " %114, %115,"  // e, spsel
+      " p,    %117, %118, %119, %120;\n"  // predicate p, then scaleA, scaleB, tnspA, tnspB (%116 is consumed by setp above)
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%111: read-write float accumulators
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
+      :  "l"(desc_a),  // %112
+         "l"(desc_b),  // %113
+         "r"(e), "n"(int32_t(spsel)),  // %114, %115
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %116..%120
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report invalid use instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x32 F32+=F16*F16
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x224x32_F32F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[112];
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %119, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n224k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      "{%112, %113, %114, %115},"
+      " %116,"
+      " %117, %118,"
+      " p,    %120, %121, %122;\n"
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x232x32 F32+=F16*F16 -- SS variant: A and B are each passed to fma() as one 64-bit descriptor (desc_a, desc_b)
+template <
+  GMMA::Major tnspA,                             // forwarded to the asm as immediate operand %123
+  GMMA::Major tnspB,                             // forwarded to the asm as immediate operand %124
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,    // immediate operand %121
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,    // immediate operand %122
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand %119
+>
+struct GMMA_64x232x32_F32F16F16_SS
+{
+  using DRegisters = void;         // no separate output array: the d### arguments of fma() are read-write ("+f")
+  using ARegisters = uint64_t[1];  // matches desc_a
+  using ERegisters = uint32_t[1];  // matches e
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = float[116];   // matches d000..d115
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      uint32_t const& e,  // asm operand %118, placed between desc_b and spsel; presumably the sparsity metadata word -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined outside this chunk); gets the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %120, 0;\n"  // %120 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n232k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%115: d000..d115
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115},"
+      " %116,"  // desc_a
+      " %117,"  // desc_b
+      " %118, %119,"  // e, spsel
+      " p,    %121, %122, %123, %124;\n"  // p (from scale_D), scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // outputs %0..%115, read-write so prior accumulator values are visible to the asm
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
+      :  "l"(desc_a),  // %116
+         "l"(desc_b),  // %117
+         "r"(e), "n"(int32_t(spsel)),  // %118, %119 ("n": compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %120..%124
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // only compiled when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x232x32 F32+=F16*F16 -- RS variant: A is passed as four 32-bit registers (a000..a003), B as one 64-bit descriptor (desc_b)
+template <
+  GMMA::Major tnspA,                             // must be GMMA::Major::K (static_assert below); not forwarded to the asm
+  GMMA::Major tnspB,                             // forwarded to the asm as immediate operand %126
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,    // immediate operand %124
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,    // immediate operand %125
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand %122
+>
+struct GMMA_64x232x32_F32F16F16_RS
+{
+  using DRegisters = void;         // no separate output array: the d### arguments of fma() are read-write ("+f")
+  using ARegisters = uint32_t[4];  // matches a000..a003
+  using ERegisters = uint32_t[1];  // matches e
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = float[116];   // matches d000..d115
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      uint32_t const& e,  // asm operand %121, placed between desc_b and spsel; presumably the sparsity metadata word -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined outside this chunk); gets the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %123, 0;\n"  // %123 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n232k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%115: d000..d115
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115},"
+      "{%116, %117, %118, %119},"  // a000..a003
+      " %120,"  // desc_b
+      " %121, %122,"  // e, spsel
+      " p,    %124, %125, %126;\n"  // p (from scale_D), scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // outputs %0..%115, read-write so prior accumulator values are visible to the asm
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %116..%119
+         "l"(desc_b),  // %120
+         "r"(e), "n"(int32_t(spsel)),  // %121, %122 ("n": compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %123..%126
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // only compiled when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x32 F32+=F16*F16 -- SS variant: A and B are each passed to fma() as one 64-bit descriptor (desc_a, desc_b)
+template <
+  GMMA::Major tnspA,                             // forwarded to the asm as immediate operand %127
+  GMMA::Major tnspB,                             // forwarded to the asm as immediate operand %128
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,    // immediate operand %125
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,    // immediate operand %126
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand %123
+>
+struct GMMA_64x240x32_F32F16F16_SS
+{
+  using DRegisters = void;         // no separate output array: the d### arguments of fma() are read-write ("+f")
+  using ARegisters = uint64_t[1];  // matches desc_a
+  using ERegisters = uint32_t[1];  // matches e
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = float[120];   // matches d000..d119
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      uint32_t const& e,  // asm operand %122, placed between desc_b and spsel; presumably the sparsity metadata word -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined outside this chunk); gets the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %124, 0;\n"  // %124 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n240k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%119: d000..d119
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      " %120,"  // desc_a
+      " %121,"  // desc_b
+      " %122, %123,"  // e, spsel
+      " p,    %125, %126, %127, %128;\n"  // p (from scale_D), scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // outputs %0..%119, read-write so prior accumulator values are visible to the asm
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
+      :  "l"(desc_a),  // %120
+         "l"(desc_b),  // %121
+         "r"(e), "n"(int32_t(spsel)),  // %122, %123 ("n": compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %124..%128
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // only compiled when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x32 F32+=F16*F16 -- RS variant: A is passed as four 32-bit registers (a000..a003), B as one 64-bit descriptor (desc_b)
+template <
+  GMMA::Major tnspA,                             // must be GMMA::Major::K (static_assert below); not forwarded to the asm
+  GMMA::Major tnspB,                             // forwarded to the asm as immediate operand %130
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,    // immediate operand %128
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,    // immediate operand %129
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand %126
+>
+struct GMMA_64x240x32_F32F16F16_RS
+{
+  using DRegisters = void;         // no separate output array: the d### arguments of fma() are read-write ("+f")
+  using ARegisters = uint32_t[4];  // matches a000..a003
+  using ERegisters = uint32_t[1];  // matches e
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = float[120];   // matches d000..d119
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      uint32_t const& e,  // asm operand %125, placed between desc_b and spsel; presumably the sparsity metadata word -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined outside this chunk); gets the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %127, 0;\n"  // %127 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n240k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%119: d000..d119
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      "{%120, %121, %122, %123},"  // a000..a003
+      " %124,"  // desc_b
+      " %125, %126,"  // e, spsel
+      " p,    %128, %129, %130;\n"  // p (from scale_D), scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // outputs %0..%119, read-write so prior accumulator values are visible to the asm
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %120..%123
+         "l"(desc_b),  // %124
+         "r"(e), "n"(int32_t(spsel)),  // %125, %126 ("n": compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %127..%130
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // only compiled when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x248x32 F32+=F16*F16 (SS: A and B both passed as 64-bit operands desc_a/desc_b). fma() issues one wgmma.mma_async.sp m64n248k32 that reads and writes the 124 float accumulators d000..d123; e and spsel are forwarded as the two operands after desc_b, and predicate p = (scale_D != 0). Without CUTE_ARCH_MMA_SM90A_ENABLED it hits CUTE_INVALID_CONTROL_PATH.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x248x32_F32F16F16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[124];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %128, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n248k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123},"
+      " %124,"
+      " %125,"
+      " %126, %127,"
+      " p,    %129, %130, %131, %132;\n"
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x248x32 F32+=F16*F16 (RS: A passed in four 32-bit registers a000..a003, B as the 64-bit operand desc_b; tnspA must be Major::K). fma() issues one wgmma.mma_async.sp m64n248k32 that reads and writes the 124 float accumulators d000..d123; e and spsel follow desc_b, and predicate p = (scale_D != 0). Without CUTE_ARCH_MMA_SM90A_ENABLED it hits CUTE_INVALID_CONTROL_PATH.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x248x32_F32F16F16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[124];
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %131, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n248k32.f32.f16.f16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123},"
+      "{%124, %125, %126, %127},"
+      " %128,"
+      " %129, %130,"
+      " p,    %132, %133, %134;\n"
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x24x32 F32+=BF16*BF16 (SS: A and B both passed as 64-bit operands desc_a/desc_b). fma() issues one wgmma.mma_async.sp m64n24k32 that reads and writes the 12 float accumulators d00..d11; e and spsel are forwarded as the two operands after desc_b, and predicate p = (scale_D != 0). Without CUTE_ARCH_MMA_SM90A_ENABLED it hits CUTE_INVALID_CONTROL_PATH.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x24x32_F32BF16BF16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[12];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %16, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n24k32.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      " %12,"
+      " %13,"
+      " %14, %15,"
+      " p,   %17, %18, %19, %20;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x24x32 F32+=BF16*BF16 (RS: A passed in four 32-bit registers a00..a03, B as the 64-bit operand desc_b; tnspA must be Major::K). fma() issues one wgmma.mma_async.sp m64n24k32 that reads and writes the 12 float accumulators d00..d11; e and spsel follow desc_b, and predicate p = (scale_D != 0). Without CUTE_ARCH_MMA_SM90A_ENABLED it hits CUTE_INVALID_CONTROL_PATH.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x24x32_F32BF16BF16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[12];
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %19, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n24k32.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      "{%12, %13, %14, %15},"
+      " %16,"
+      " %17, %18,"
+      " p,   %20, %21, %22;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x40x32 F32+=BF16*BF16 (SS: A and B both passed as 64-bit operands desc_a/desc_b). fma() issues one wgmma.mma_async.sp m64n40k32 that reads and writes the 20 float accumulators d00..d19; e and spsel are forwarded as the two operands after desc_b, and predicate p = (scale_D != 0). Without CUTE_ARCH_MMA_SM90A_ENABLED it hits CUTE_INVALID_CONTROL_PATH.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x40x32_F32BF16BF16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[20];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %24, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n40k32.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"
+      " %20,"
+      " %21,"
+      " %22, %23,"
+      " p,   %25, %26, %27, %28;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x40x32 F32+=BF16*BF16 (RS: A passed in four 32-bit registers a00..a03, B as the 64-bit operand desc_b; tnspA must be Major::K). fma() issues one wgmma.mma_async.sp m64n40k32 that reads and writes the 20 float accumulators d00..d19; e and spsel follow desc_b, and predicate p = (scale_D != 0). Without CUTE_ARCH_MMA_SM90A_ENABLED it hits CUTE_INVALID_CONTROL_PATH.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x40x32_F32BF16BF16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[20];
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %27, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n40k32.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"
+      "{%20, %21, %22, %23},"
+      " %24,"
+      " %25, %26,"
+      " p,   %28, %29, %30;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x48x32 F32+=BF16*BF16 (SS: A and B both passed as 64-bit operands desc_a/desc_b). fma() issues one wgmma.mma_async.sp m64n48k32 that reads and writes the 24 float accumulators d00..d23; e and spsel are forwarded as the two operands after desc_b, and predicate p = (scale_D != 0). Without CUTE_ARCH_MMA_SM90A_ENABLED it hits CUTE_INVALID_CONTROL_PATH.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x48x32_F32BF16BF16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[24];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %28, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n48k32.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      " %24,"
+      " %25,"
+      " %26, %27,"
+      " p,   %29, %30, %31, %32;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x48x32 F32+=BF16*BF16 (RS: A passed in four 32-bit registers a00..a03, B as the 64-bit operand desc_b; tnspA must be Major::K). fma() issues one wgmma.mma_async.sp m64n48k32 that reads and writes the 24 float accumulators d00..d23; e and spsel follow desc_b, and predicate p = (scale_D != 0). Without CUTE_ARCH_MMA_SM90A_ENABLED it hits CUTE_INVALID_CONTROL_PATH.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x48x32_F32BF16BF16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[24];
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %31, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n48k32.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      "{%24, %25, %26, %27},"
+      " %28,"
+      " %29, %30,"
+      " p,   %32, %33, %34;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x56x32 F32+=BF16*BF16 (SS: A and B both passed as 64-bit operands desc_a/desc_b). fma() issues one wgmma.mma_async.sp m64n56k32 that reads and writes the 28 float accumulators d00..d27; e and spsel are forwarded as the two operands after desc_b, and predicate p = (scale_D != 0). Without CUTE_ARCH_MMA_SM90A_ENABLED it hits CUTE_INVALID_CONTROL_PATH.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x56x32_F32BF16BF16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[28];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %32, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n56k32.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"
+      " %28,"
+      " %29,"
+      " %30, %31,"
+      " p,   %33, %34, %35, %36;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x56x32 F32+=BF16*BF16 (RS: A passed in four 32-bit registers a00..a03, B as the 64-bit operand desc_b; tnspA must be Major::K). fma() issues one wgmma.mma_async.sp m64n56k32 that reads and writes the 28 float accumulators d00..d27; e and spsel follow desc_b, and predicate p = (scale_D != 0). Without CUTE_ARCH_MMA_SM90A_ENABLED it hits CUTE_INVALID_CONTROL_PATH.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x56x32_F32BF16BF16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[28];
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %35, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n56k32.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"
+      "{%28, %29, %30, %31},"
+      " %32,"
+      " %33, %34,"
+      " p,   %36, %37, %38;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x72x32 F32+=BF16*BF16 -- SS variant: both A and B are passed as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x72x32_F32BF16BF16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // mirrors desc_a of fma()
+  using ERegisters = uint32_t[1];  // mirrors the single `e` argument of fma()
+  using BRegisters = uint64_t[1];  // mirrors desc_b of fma()
+  using CRegisters = float[36];  // mirrors d00..d35 of fma()
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma instruction; d00..d35 are read and written in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word for the .sp instruction -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): presumably a logging hook defined elsewhere -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %40, 0;\n"  // %40 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n72k32.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"  // %0..%35: accumulators d00..d35
+      " %36,"  // desc_a
+      " %37,"  // desc_b
+      " %38, %39,"  // e, spsel
+      " p,   %41, %42, %43, %44;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both an input and an output
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // e in a register; spsel as a compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // only scale_D is a runtime value
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x72x32 F32+=BF16*BF16 -- RS variant: A is passed in four 32-bit registers, B as a 64-bit descriptor.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x72x32_F32BF16BF16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // mirrors a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // mirrors the single `e` argument of fma()
+  using BRegisters = uint64_t[1];  // mirrors desc_b of fma()
+  using CRegisters = float[36];  // mirrors d00..d35 of fma()
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma instruction; d00..d35 are read and written in place
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word for the .sp instruction -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably a logging hook defined elsewhere -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %43, 0;\n"  // %43 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n72k32.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"  // %0..%35: accumulators d00..d35
+      "{%36, %37, %38, %39},"  // a00..a03
+      " %40,"  // desc_b
+      " %41, %42,"  // e, spsel
+      " p,   %44, %45, %46;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both an input and an output
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // e in a register; spsel as a compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // only scale_D is a runtime value
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x32 F32+=BF16*BF16 -- SS variant: both A and B are passed as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x80x32_F32BF16BF16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // mirrors desc_a of fma()
+  using ERegisters = uint32_t[1];  // mirrors the single `e` argument of fma()
+  using BRegisters = uint64_t[1];  // mirrors desc_b of fma()
+  using CRegisters = float[40];  // mirrors d00..d39 of fma()
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma instruction; d00..d39 are read and written in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word for the .sp instruction -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): presumably a logging hook defined elsewhere -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %44, 0;\n"  // %44 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n80k32.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"  // %0..%39: accumulators d00..d39
+      " %40,"  // desc_a
+      " %41,"  // desc_b
+      " %42, %43,"  // e, spsel
+      " p,   %45, %46, %47, %48;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both an input and an output
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // e in a register; spsel as a compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // only scale_D is a runtime value
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x32 F32+=BF16*BF16 -- RS variant: A is passed in four 32-bit registers, B as a 64-bit descriptor.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x80x32_F32BF16BF16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // mirrors a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // mirrors the single `e` argument of fma()
+  using BRegisters = uint64_t[1];  // mirrors desc_b of fma()
+  using CRegisters = float[40];  // mirrors d00..d39 of fma()
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma instruction; d00..d39 are read and written in place
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word for the .sp instruction -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably a logging hook defined elsewhere -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %47, 0;\n"  // %47 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n80k32.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"  // %0..%39: accumulators d00..d39
+      "{%40, %41, %42, %43},"  // a00..a03
+      " %44,"  // desc_b
+      " %45, %46,"  // e, spsel
+      " p,   %48, %49, %50;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both an input and an output
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // e in a register; spsel as a compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // only scale_D is a runtime value
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x88x32 F32+=BF16*BF16 -- SS variant: both A and B are passed as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x88x32_F32BF16BF16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // mirrors desc_a of fma()
+  using ERegisters = uint32_t[1];  // mirrors the single `e` argument of fma()
+  using BRegisters = uint64_t[1];  // mirrors desc_b of fma()
+  using CRegisters = float[44];  // mirrors d00..d43 of fma()
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma instruction; d00..d43 are read and written in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word for the .sp instruction -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): presumably a logging hook defined elsewhere -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %48, 0;\n"  // %48 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n88k32.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"  // %0..%43: accumulators d00..d43
+      " %44,"  // desc_a
+      " %45,"  // desc_b
+      " %46, %47,"  // e, spsel
+      " p,   %49, %50, %51, %52;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both an input and an output
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // e in a register; spsel as a compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // only scale_D is a runtime value
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x88x32 F32+=BF16*BF16 -- RS variant: A is passed in four 32-bit registers, B as a 64-bit descriptor.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x88x32_F32BF16BF16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // mirrors a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // mirrors the single `e` argument of fma()
+  using BRegisters = uint64_t[1];  // mirrors desc_b of fma()
+  using CRegisters = float[44];  // mirrors d00..d43 of fma()
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma instruction; d00..d43 are read and written in place
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word for the .sp instruction -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably a logging hook defined elsewhere -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %51, 0;\n"  // %51 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n88k32.f32.bf16.bf16 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"  // %0..%43: accumulators d00..d43
+      "{%44, %45, %46, %47},"  // a00..a03
+      " %48,"  // desc_b
+      " %49, %50,"  // e, spsel
+      " p,   %52, %53, %54;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both an input and an output
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // e in a register; spsel as a compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // only scale_D is a runtime value
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x104x32 F32+=BF16*BF16 -- SS variant: both A and B are passed as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x104x32_F32BF16BF16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // mirrors desc_a of fma()
+  using ERegisters = uint32_t[1];  // mirrors the single `e` argument of fma()
+  using BRegisters = uint64_t[1];  // mirrors desc_b of fma()
+  using CRegisters = float[52];  // mirrors d00..d51 of fma()
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma instruction; d00..d51 are read and written in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word for the .sp instruction -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): presumably a logging hook defined elsewhere -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %56, 0;\n"  // %56 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n104k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"  // %0..%51: accumulators d00..d51
+      " %52,"  // desc_a
+      " %53,"  // desc_b
+      " %54, %55,"  // e, spsel
+      " p,    %57,  %58,  %59,  %60;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both an input and an output
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // e in a register; spsel as a compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // only scale_D is a runtime value
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x104x32 F32+=BF16*BF16 -- RS variant: A is passed in four 32-bit registers, B as a 64-bit descriptor.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x104x32_F32BF16BF16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // mirrors a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // mirrors the single `e` argument of fma()
+  using BRegisters = uint64_t[1];  // mirrors desc_b of fma()
+  using CRegisters = float[52];  // mirrors d00..d51 of fma()
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma instruction; d00..d51 are read and written in place
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word for the .sp instruction -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably a logging hook defined elsewhere -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %59, 0;\n"  // %59 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n104k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"  // %0..%51: accumulators d00..d51
+      "{%52,  %53,  %54,  %55},"  // a00..a03
+      " %56,"  // desc_b
+      " %57, %58,"  // e, spsel
+      " p,    %60,  %61,  %62;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both an input and an output
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // e in a register; spsel as a compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // only scale_D is a runtime value
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x32 F32+=BF16*BF16 -- SS variant: both A and B are passed as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x112x32_F32BF16BF16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // mirrors desc_a of fma()
+  using ERegisters = uint32_t[1];  // mirrors the single `e` argument of fma()
+  using BRegisters = uint64_t[1];  // mirrors desc_b of fma()
+  using CRegisters = float[56];  // mirrors d00..d55 of fma()
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma instruction; d00..d55 are read and written in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word for the .sp instruction -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): presumably a logging hook defined elsewhere -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %60, 0;\n"  // %60 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n112k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"  // %0..%55: accumulators d00..d55
+      " %56,"  // desc_a
+      " %57,"  // desc_b
+      " %58, %59,"  // e, spsel
+      " p,    %61,  %62,  %63,  %64;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both an input and an output
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // e in a register; spsel as a compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // only scale_D is a runtime value
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x32 F32+=BF16*BF16 (RS form: A is passed in four 32-bit registers, B as the 64-bit descriptor desc_b)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x112x32_F32BF16BF16_RS
+{
+  using DRegisters = void;  // no separate D operand: the accumulators below are read-write ("+f")
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- confirm against the PTX wgmma.sp docs)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[56];  // d00..d55
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");  // tnspA is checked here only; it is not passed to the asm
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp m64n112k32 instruction; d00..d55 are its read-write accumulator operands
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %63, 0;\n"  // p = (scale_D != 0); %63 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n112k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%55: d00..d55
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      "{%56,  %57,  %58,  %59},"  // a00..a03
+      " %60,"  // desc_b
+      " %61, %62,"  // e, spsel
+      " p,    %64,  %65,  %66;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // "n" operands are compile-time immediates taken from the template parameters
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x120x32 F32+=BF16*BF16 (SS form: A and B are both passed as 64-bit descriptors, desc_a and desc_b)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x120x32_F32BF16BF16_SS
+{
+  using DRegisters = void;  // no separate D operand: the accumulators below are read-write ("+f")
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- confirm against the PTX wgmma.sp docs)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[60];  // d00..d59
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp m64n120k32 instruction; d00..d59 are its read-write accumulator operands
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %64, 0;\n"  // p = (scale_D != 0); %64 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n120k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%59: d00..d59
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"
+      " %60,"  // desc_a
+      " %61,"  // desc_b
+      " %62, %63,"  // e, spsel
+      " p,    %65,  %66,  %67,  %68;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // "n" operands are compile-time immediates taken from the template parameters
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x120x32 F32+=BF16*BF16 (RS form: A is passed in four 32-bit registers, B as the 64-bit descriptor desc_b)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x120x32_F32BF16BF16_RS
+{
+  using DRegisters = void;  // no separate D operand: the accumulators below are read-write ("+f")
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- confirm against the PTX wgmma.sp docs)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[60];  // d00..d59
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");  // tnspA is checked here only; it is not passed to the asm
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp m64n120k32 instruction; d00..d59 are its read-write accumulator operands
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %67, 0;\n"  // p = (scale_D != 0); %67 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n120k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%59: d00..d59
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"
+      "{%60,  %61,  %62,  %63},"  // a00..a03
+      " %64,"  // desc_b
+      " %65, %66,"  // e, spsel
+      " p,    %68,  %69,  %70;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // "n" operands are compile-time immediates taken from the template parameters
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x136x32 F32+=BF16*BF16 (SS form: A and B are both passed as 64-bit descriptors, desc_a and desc_b)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x136x32_F32BF16BF16_SS
+{
+  using DRegisters = void;  // no separate D operand: the accumulators below are read-write ("+f")
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- confirm against the PTX wgmma.sp docs)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[68];  // d00..d67
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp m64n136k32 instruction; d00..d67 are its read-write accumulator operands
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %72, 0;\n"  // p = (scale_D != 0); %72 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n136k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%67: d00..d67
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67},"
+      " %68,"  // desc_a
+      " %69,"  // desc_b
+      " %70, %71,"  // e, spsel
+      " p,    %73,  %74,  %75,  %76;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // "n" operands are compile-time immediates taken from the template parameters
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x136x32 F32+=BF16*BF16 (RS form: A is passed in four 32-bit registers, B as the 64-bit descriptor desc_b)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x136x32_F32BF16BF16_RS
+{
+  using DRegisters = void;  // no separate D operand: the accumulators below are read-write ("+f")
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- confirm against the PTX wgmma.sp docs)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[68];  // d00..d67
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");  // tnspA is checked here only; it is not passed to the asm
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp m64n136k32 instruction; d00..d67 are its read-write accumulator operands
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %75, 0;\n"  // p = (scale_D != 0); %75 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n136k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%67: d00..d67
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67},"
+      "{%68,  %69,  %70,  %71},"  // a00..a03
+      " %72,"  // desc_b
+      " %73, %74,"  // e, spsel
+      " p,    %76,  %77,  %78;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // "n" operands are compile-time immediates taken from the template parameters
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x32 F32+=BF16*BF16 (SS form: A and B are both passed as 64-bit descriptors, desc_a and desc_b)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x144x32_F32BF16BF16_SS
+{
+  using DRegisters = void;  // no separate D operand: the accumulators below are read-write ("+f")
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- confirm against the PTX wgmma.sp docs)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[72];  // d00..d71
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp m64n144k32 instruction; d00..d71 are its read-write accumulator operands
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %76, 0;\n"  // p = (scale_D != 0); %76 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n144k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%71: d00..d71
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      " %72,"  // desc_a
+      " %73,"  // desc_b
+      " %74, %75,"  // e, spsel
+      " p,    %77,  %78,  %79,  %80;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // "n" operands are compile-time immediates taken from the template parameters
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x32 F32+=BF16*BF16 (RS form: A is passed in four 32-bit registers, B as the 64-bit descriptor desc_b)
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x144x32_F32BF16BF16_RS
+{
+  using DRegisters = void;  // no separate D operand: the accumulators below are read-write ("+f")
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- confirm against the PTX wgmma.sp docs)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[72];  // d00..d71
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");  // tnspA is checked here only; it is not passed to the asm
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp m64n144k32 instruction; d00..d71 are its read-write accumulator operands
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %79, 0;\n"  // p = (scale_D != 0); %79 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n144k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%71: d00..d71
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      "{%72,  %73,  %74,  %75},"  // a00..a03
+      " %76,"  // desc_b
+      " %77, %78,"  // e, spsel
+      " p,    %80,  %81,  %82;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // "n" operands are compile-time immediates taken from the template parameters
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x152x32 F32+=BF16*BF16 (SS form: A and B are each passed as one 64-bit value, desc_a/desc_b; wraps wgmma.mma_async.sp m64n152k32 with 76 in-place float accumulators)
+template <
+  GMMA::Major tnspA,  // forwarded to the asm as immediate operand %83
+  GMMA::Major tnspB,  // forwarded to the asm as immediate operand %84
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand %81
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand %82
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand %79, placed right after e
+>
+struct GMMA_64x152x32_F32BF16BF16_SS
+{
+  using DRegisters = void;  // no separate destination: accumulators are read-write ("+f") operands, see CRegisters
+  using ARegisters = uint64_t[1];  // matches fma's single desc_a argument
+  using ERegisters = uint32_t[1];  // matches fma's single e argument
+  using BRegisters = uint64_t[1];  // matches fma's single desc_b argument
+  using CRegisters = float[76];  // matches fma's d00..d75 arguments
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // asm operand %76 (64-bit "l" constraint)
+      uint64_t const& desc_b,  // asm operand %77 (64-bit "l" constraint)
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      uint32_t const& e,  // asm operand %78 (32-bit register); presumably the sparsity metadata - TODO confirm against the PTX wgmma.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 (%80) and compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // asm path only when this macro is defined; otherwise see #else
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook defined elsewhere; receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register local to this PTX block
+      "setp.ne.b32 p, %80, 0;\n"  // %80 = int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n152k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%75: accumulators d00..d75
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75},"
+      " %76,"  // desc_a
+      " %77,"  // desc_b
+      " %78, %79,"  // e, spsel
+      " p,    %81,  %82,  %83,  %84;\n"  // p (from scale_D), scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs: "+f" = read-write float register
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
+      :  "l"(desc_a),  // %76
+         "l"(desc_b),  // %77
+         "r"(e), "n"(int32_t(spsel)),  // %78 (register), %79 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %80 (register), %81..%84 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x152x32 F32+=BF16*BF16 (RS form: A is passed in four 32-bit registers a00..a03, B as one 64-bit value desc_b; wraps wgmma.mma_async.sp m64n152k32 with 76 in-place float accumulators)
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not passed to the asm
+  GMMA::Major tnspB,  // forwarded to the asm as immediate operand %86
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand %84
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand %85
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand %82, placed right after e
+>
+struct GMMA_64x152x32_F32BF16BF16_RS
+{
+  using DRegisters = void;  // no separate destination: accumulators are read-write ("+f") operands, see CRegisters
+  using ARegisters = uint32_t[4];  // matches fma's a00..a03 arguments
+  using ERegisters = uint32_t[1];  // matches fma's single e argument
+  using BRegisters = uint64_t[1];  // matches fma's single desc_b argument
+  using CRegisters = float[76];  // matches fma's d00..d75 arguments
+
+  static_assert(tnspA == GMMA::Major::K,  // compile-time rejection of any other A layout
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // asm operands %76..%79 (32-bit "r" constraint)
+      uint64_t const& desc_b,  // asm operand %80 (64-bit "l" constraint)
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      uint32_t const& e,  // asm operand %81 (32-bit register); presumably the sparsity metadata - TODO confirm against the PTX wgmma.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 (%83) and compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // asm path only when this macro is defined; otherwise see #else
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook defined elsewhere; receives this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register local to this PTX block
+      "setp.ne.b32 p, %83, 0;\n"  // %83 = int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n152k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%75: accumulators d00..d75
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75},"
+      "{%76,  %77,  %78,  %79},"  // a00..a03
+      " %80,"  // desc_b
+      " %81, %82,"  // e, spsel
+      " p,    %84,  %85,  %86;\n"  // p (from scale_D), scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs: "+f" = read-write float register
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %76..%79
+         "l"(desc_b),  // %80
+         "r"(e), "n"(int32_t(spsel)),  // %81 (register), %82 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %83 (register), %84..%86 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x32 F32+=BF16*BF16 (SS form: A and B are each passed as one 64-bit value, desc_a/desc_b; wraps wgmma.mma_async.sp m64n160k32 with 80 in-place float accumulators)
+template <
+  GMMA::Major tnspA,  // forwarded to the asm as immediate operand %87
+  GMMA::Major tnspB,  // forwarded to the asm as immediate operand %88
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand %85
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand %86
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand %83, placed right after e
+>
+struct GMMA_64x160x32_F32BF16BF16_SS
+{
+  using DRegisters = void;  // no separate destination: accumulators are read-write ("+f") operands, see CRegisters
+  using ARegisters = uint64_t[1];  // matches fma's single desc_a argument
+  using ERegisters = uint32_t[1];  // matches fma's single e argument
+  using BRegisters = uint64_t[1];  // matches fma's single desc_b argument
+  using CRegisters = float[80];  // matches fma's d00..d79 arguments
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // asm operand %80 (64-bit "l" constraint)
+      uint64_t const& desc_b,  // asm operand %81 (64-bit "l" constraint)
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      uint32_t const& e,  // asm operand %82 (32-bit register); presumably the sparsity metadata - TODO confirm against the PTX wgmma.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 (%84) and compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // asm path only when this macro is defined; otherwise see #else
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook defined elsewhere; receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register local to this PTX block
+      "setp.ne.b32 p, %84, 0;\n"  // %84 = int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n160k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%79: accumulators d00..d79
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      " %80,"  // desc_a
+      " %81,"  // desc_b
+      " %82, %83,"  // e, spsel
+      " p,    %85,  %86,  %87,  %88;\n"  // p (from scale_D), scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs: "+f" = read-write float register
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
+      :  "l"(desc_a),  // %80
+         "l"(desc_b),  // %81
+         "r"(e), "n"(int32_t(spsel)),  // %82 (register), %83 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %84 (register), %85..%88 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x32 F32+=BF16*BF16 (RS form: A is passed in four 32-bit registers a00..a03, B as one 64-bit value desc_b; wraps wgmma.mma_async.sp m64n160k32 with 80 in-place float accumulators)
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not passed to the asm
+  GMMA::Major tnspB,  // forwarded to the asm as immediate operand %90
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand %88
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand %89
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand %86, placed right after e
+>
+struct GMMA_64x160x32_F32BF16BF16_RS
+{
+  using DRegisters = void;  // no separate destination: accumulators are read-write ("+f") operands, see CRegisters
+  using ARegisters = uint32_t[4];  // matches fma's a00..a03 arguments
+  using ERegisters = uint32_t[1];  // matches fma's single e argument
+  using BRegisters = uint64_t[1];  // matches fma's single desc_b argument
+  using CRegisters = float[80];  // matches fma's d00..d79 arguments
+
+  static_assert(tnspA == GMMA::Major::K,  // compile-time rejection of any other A layout
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // asm operands %80..%83 (32-bit "r" constraint)
+      uint64_t const& desc_b,  // asm operand %84 (64-bit "l" constraint)
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      uint32_t const& e,  // asm operand %85 (32-bit register); presumably the sparsity metadata - TODO confirm against the PTX wgmma.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 (%87) and compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // asm path only when this macro is defined; otherwise see #else
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook defined elsewhere; receives this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register local to this PTX block
+      "setp.ne.b32 p, %87, 0;\n"  // %87 = int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n160k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%79: accumulators d00..d79
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      "{%80,  %81,  %82,  %83},"  // a00..a03
+      " %84,"  // desc_b
+      " %85, %86,"  // e, spsel
+      " p,    %88,  %89,  %90;\n"  // p (from scale_D), scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs: "+f" = read-write float register
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %80..%83
+         "l"(desc_b),  // %84
+         "r"(e), "n"(int32_t(spsel)),  // %85 (register), %86 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %87 (register), %88..%90 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x168x32 F32+=BF16*BF16 (SS form: A and B are each passed as one 64-bit value, desc_a/desc_b; wraps wgmma.mma_async.sp m64n168k32 with 84 in-place float accumulators)
+template <
+  GMMA::Major tnspA,  // forwarded to the asm as immediate operand %91
+  GMMA::Major tnspB,  // forwarded to the asm as immediate operand %92
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand %89
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand %90
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand %87, placed right after e
+>
+struct GMMA_64x168x32_F32BF16BF16_SS
+{
+  using DRegisters = void;  // no separate destination: accumulators are read-write ("+f") operands, see CRegisters
+  using ARegisters = uint64_t[1];  // matches fma's single desc_a argument
+  using ERegisters = uint32_t[1];  // matches fma's single e argument
+  using BRegisters = uint64_t[1];  // matches fma's single desc_b argument
+  using CRegisters = float[84];  // matches fma's d00..d83 arguments
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // asm operand %84 (64-bit "l" constraint)
+      uint64_t const& desc_b,  // asm operand %85 (64-bit "l" constraint)
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      uint32_t const& e,  // asm operand %86 (32-bit register); presumably the sparsity metadata - TODO confirm against the PTX wgmma.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 (%88) and compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // asm path only when this macro is defined; otherwise see #else
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook defined elsewhere; receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register local to this PTX block
+      "setp.ne.b32 p, %88, 0;\n"  // %88 = int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n168k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%83: accumulators d00..d83
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83},"
+      " %84,"  // desc_a
+      " %85,"  // desc_b
+      " %86, %87,"  // e, spsel
+      " p,    %89,  %90,  %91,  %92;\n"  // p (from scale_D), scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs: "+f" = read-write float register
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
+      :  "l"(desc_a),  // %84
+         "l"(desc_b),  // %85
+         "r"(e), "n"(int32_t(spsel)),  // %86 (register), %87 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %88 (register), %89..%92 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x168x32 F32+=BF16*BF16 (RS form: A is passed in four 32-bit registers a00..a03, B as one 64-bit value desc_b; wraps wgmma.mma_async.sp m64n168k32 with 84 in-place float accumulators)
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not passed to the asm
+  GMMA::Major tnspB,  // forwarded to the asm as immediate operand %94
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand %92
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand %93
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand %90, placed right after e
+>
+struct GMMA_64x168x32_F32BF16BF16_RS
+{
+  using DRegisters = void;  // no separate destination: accumulators are read-write ("+f") operands, see CRegisters
+  using ARegisters = uint32_t[4];  // matches fma's a00..a03 arguments
+  using ERegisters = uint32_t[1];  // matches fma's single e argument
+  using BRegisters = uint64_t[1];  // matches fma's single desc_b argument
+  using CRegisters = float[84];  // matches fma's d00..d83 arguments
+
+  static_assert(tnspA == GMMA::Major::K,  // compile-time rejection of any other A layout
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // asm operands %84..%87 (32-bit "r" constraint)
+      uint64_t const& desc_b,  // asm operand %88 (64-bit "l" constraint)
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      uint32_t const& e,  // asm operand %89 (32-bit register); presumably the sparsity metadata - TODO confirm against the PTX wgmma.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 (%91) and compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // asm path only when this macro is defined; otherwise see #else
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook defined elsewhere; receives this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register local to this PTX block
+      "setp.ne.b32 p, %91, 0;\n"  // %91 = int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n168k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%83: accumulators d00..d83
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83},"
+      "{%84,  %85,  %86,  %87},"  // a00..a03
+      " %88,"  // desc_b
+      " %89, %90,"  // e, spsel
+      " p,    %92,  %93,  %94;\n"  // p (from scale_D), scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs: "+f" = read-write float register
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %84..%87
+         "l"(desc_b),  // %88
+         "r"(e), "n"(int32_t(spsel)),  // %89 (register), %90 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %91 (register), %92..%94 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x32 F32+=BF16*BF16 (_SS variant: A and B are each passed to fma() as one 64-bit descriptor)
+template <  // every parameter below is forwarded to the instruction as an "n" (compile-time immediate) operand
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x176x32_F32BF16BF16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // corresponds to the single desc_a argument of fma()
+  using ERegisters = uint32_t[1];  // corresponds to the single `e` argument of fma() (presumably sparsity metadata -- TODO confirm)
+  using BRegisters = uint64_t[1];  // corresponds to the single desc_b argument of fma()
+  using CRegisters = float[88];    // corresponds to the 88 accumulator arguments d00..d87 of fma()
+
+  CUTE_HOST_DEVICE static void  // wraps one wgmma.mma_async.sp m64n176k32 instruction; d00..d87 are updated in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %92, 0;\n"  // p = (scale_D != 0); %92 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n176k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      " %88,"  // desc_a
+      " %89,"  // desc_b
+      " %90, %91,"  // e, spsel
+      " p,    %93,  %94,  %95,  %96;\n"  // scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%87: accumulators; "+f" means each is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
+      :  "l"(desc_a),  // %88
+         "l"(desc_b),  // %89
+         "r"(e), "n"(int32_t(spsel)),  // %90, %91
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %92..%96
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x32 F32+=BF16*BF16 (_RS variant: A is passed to fma() in four 32-bit registers, B as one 64-bit descriptor)
+template <  // tnspA is only checked by the static_assert below; the other parameters become "n" (compile-time immediate) operands
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x176x32_F32BF16BF16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // corresponds to the four arguments a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // corresponds to the single `e` argument of fma() (presumably sparsity metadata -- TODO confirm)
+  using BRegisters = uint64_t[1];  // corresponds to the single desc_b argument of fma()
+  using CRegisters = float[88];    // corresponds to the 88 accumulator arguments d00..d87 of fma()
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void  // wraps one wgmma.mma_async.sp m64n176k32 instruction; d00..d87 are updated in place
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %95, 0;\n"  // p = (scale_D != 0); %95 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n176k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      "{%88,  %89,  %90,  %91},"  // a00..a03
+      " %92,"  // desc_b
+      " %93, %94,"  // e, spsel
+      " p,    %96,  %97,  %98;\n"  // scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%87: accumulators; "+f" means each is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %88..%91
+         "l"(desc_b),  // %92
+         "r"(e), "n"(int32_t(spsel)),  // %93, %94
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %95..%98
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x184x32 F32+=BF16*BF16 (_SS variant: A and B are each passed to fma() as one 64-bit descriptor)
+template <  // every parameter below is forwarded to the instruction as an "n" (compile-time immediate) operand
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x184x32_F32BF16BF16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // corresponds to the single desc_a argument of fma()
+  using ERegisters = uint32_t[1];  // corresponds to the single `e` argument of fma() (presumably sparsity metadata -- TODO confirm)
+  using BRegisters = uint64_t[1];  // corresponds to the single desc_b argument of fma()
+  using CRegisters = float[92];    // corresponds to the 92 accumulator arguments d00..d91 of fma()
+
+  CUTE_HOST_DEVICE static void  // wraps one wgmma.mma_async.sp m64n184k32 instruction; d00..d91 are updated in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %96, 0;\n"  // p = (scale_D != 0); %96 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n184k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91},"
+      " %92,"  // desc_a
+      " %93,"  // desc_b
+      " %94, %95,"  // e, spsel
+      " p,    %97,  %98,  %99,  %100;\n"  // scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%91: accumulators; "+f" means each is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
+      :  "l"(desc_a),  // %92
+         "l"(desc_b),  // %93
+         "r"(e), "n"(int32_t(spsel)),  // %94, %95
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %96..%100
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x184x32 F32+=BF16*BF16 (_RS variant: A is passed to fma() in four 32-bit registers, B as one 64-bit descriptor)
+template <  // tnspA is only checked by the static_assert below; the other parameters become "n" (compile-time immediate) operands
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x184x32_F32BF16BF16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // corresponds to the four arguments a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // corresponds to the single `e` argument of fma() (presumably sparsity metadata -- TODO confirm)
+  using BRegisters = uint64_t[1];  // corresponds to the single desc_b argument of fma()
+  using CRegisters = float[92];    // corresponds to the 92 accumulator arguments d00..d91 of fma()
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void  // wraps one wgmma.mma_async.sp m64n184k32 instruction; d00..d91 are updated in place
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %99, 0;\n"  // p = (scale_D != 0); %99 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n184k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91},"
+      "{%92,  %93,  %94,  %95},"  // a00..a03
+      " %96,"  // desc_b
+      " %97, %98,"  // e, spsel
+      " p,    %100, %101, %102;\n"  // scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%91: accumulators; "+f" means each is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %92..%95
+         "l"(desc_b),  // %96
+         "r"(e), "n"(int32_t(spsel)),  // %97, %98
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %99..%102
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x200x32 F32+=BF16*BF16 (_SS variant: A and B are each passed to fma() as one 64-bit descriptor)
+template <  // every parameter below is forwarded to the instruction as an "n" (compile-time immediate) operand
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x200x32_F32BF16BF16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // corresponds to the single desc_a argument of fma()
+  using ERegisters = uint32_t[1];  // corresponds to the single `e` argument of fma() (presumably sparsity metadata -- TODO confirm)
+  using BRegisters = uint64_t[1];  // corresponds to the single desc_b argument of fma()
+  using CRegisters = float[100];   // corresponds to the 100 accumulator arguments d000..d099 of fma()
+
+  CUTE_HOST_DEVICE static void  // wraps one wgmma.mma_async.sp m64n200k32 instruction; d000..d099 are updated in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %104, 0;\n"  // p = (scale_D != 0); %104 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n200k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99},"
+      " %100,"  // desc_a
+      " %101,"  // desc_b
+      " %102, %103,"  // e, spsel
+      " p,    %105, %106, %107, %108;\n"  // scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // %0..%99: accumulators; "+f" means each is both read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
+      :  "l"(desc_a),  // %100
+         "l"(desc_b),  // %101
+         "r"(e), "n"(int32_t(spsel)),  // %102, %103
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %104..%108
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x200x32 F32+=BF16*BF16 (_RS variant: A is passed to fma() in four 32-bit registers, B as one 64-bit descriptor)
+template <  // tnspA is only checked by the static_assert below; the other parameters become "n" (compile-time immediate) operands
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x200x32_F32BF16BF16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // corresponds to the four arguments a000..a003 of fma()
+  using ERegisters = uint32_t[1];  // corresponds to the single `e` argument of fma() (presumably sparsity metadata -- TODO confirm)
+  using BRegisters = uint64_t[1];  // corresponds to the single desc_b argument of fma()
+  using CRegisters = float[100];   // corresponds to the 100 accumulator arguments d000..d099 of fma()
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void  // wraps one wgmma.mma_async.sp m64n200k32 instruction; d000..d099 are updated in place
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %107, 0;\n"  // p = (scale_D != 0); %107 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n200k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99},"
+      "{%100, %101, %102, %103},"  // a000..a003
+      " %104,"  // desc_b
+      " %105, %106,"  // e, spsel
+      " p,    %108, %109, %110;\n"  // scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // %0..%99: accumulators; "+f" means each is both read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %100..%103
+         "l"(desc_b),  // %104
+         "r"(e), "n"(int32_t(spsel)),  // %105, %106
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %107..%110
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x32 F32+=BF16*BF16 -- SS form: A and B are each passed as one 64-bit descriptor; wraps PTX wgmma.mma_async.sp m64n208k32.
+template <
+  GMMA::Major tnspA,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::Major tnspB,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand; defaults to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand; defaults to One
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand placed right after `e`
+>
+struct GMMA_64x208x32_F32BF16BF16_SS
+{
+  using DRegisters = void;  // no separate D operand: fma updates the C registers in place ("+f")
+  using ARegisters = uint64_t[1];  // A is one 64-bit value (desc_a)
+  using ERegisters = uint32_t[1];  // one 32-bit word `e` -- presumably the sparsity metadata; confirm against the PTX spec
+  using BRegisters = uint64_t[1];  // B is one 64-bit value (desc_b)
+  using CRegisters = float[104];  // 104 float accumulators, d000..d103
+
+  // Issues a single wgmma.mma_async.sp instruction; d000..d103 are read and written.
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // asm operand %104
+      uint64_t const& desc_b,  // asm operand %105
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      uint32_t const& e,  // asm operand %106
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // asm path exists only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): helper defined outside this chunk; presumably a sync-log hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %108, 0;\n"  // p = (int32_t(scale_D) != 0); %108 is the first input after spsel
+      "wgmma.mma_async.sp.sync.aligned.m64n208k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      " %104,"  // desc_a
+      " %105,"  // desc_b
+      " %106, %107,"  // e, spsel
+      " p,    %109, %110, %111, %112;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // read-write float operands %0..%103
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
+      :  "l"(desc_a),  // inputs start here at %104; their order fixes the indices used in the asm text
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // macro not defined: calling fma is reported as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x32 F32+=BF16*BF16 -- RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor; wraps PTX wgmma.mma_async.sp m64n208k32.
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not passed to the instruction
+  GMMA::Major tnspB,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand; defaults to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand; defaults to One
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand placed right after `e`
+>
+struct GMMA_64x208x32_F32BF16BF16_RS
+{
+  using DRegisters = void;  // no separate D operand: fma updates the C registers in place ("+f")
+  using ARegisters = uint32_t[4];  // A is four 32-bit values, a000..a003
+  using ERegisters = uint32_t[1];  // one 32-bit word `e` -- presumably the sparsity metadata; confirm against the PTX spec
+  using BRegisters = uint64_t[1];  // B is one 64-bit value (desc_b)
+  using CRegisters = float[104];  // 104 float accumulators, d000..d103
+
+  static_assert(tnspA == GMMA::Major::K,  // compile-time rejection of any other A layout
+      "Register source operand A must have K major layout.");
+
+  // Issues a single wgmma.mma_async.sp instruction; d000..d103 are read and written.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // asm operands %104..%107
+      uint64_t const& desc_b,  // asm operand %108
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      uint32_t const& e,  // asm operand %109
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // asm path exists only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): helper defined outside this chunk; presumably a sync-log hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %111, 0;\n"  // p = (int32_t(scale_D) != 0); %111 is the first input after spsel
+      "wgmma.mma_async.sp.sync.aligned.m64n208k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      "{%104, %105, %106, %107},"  // a000..a003
+      " %108,"  // desc_b
+      " %109, %110,"  // e, spsel
+      " p,    %112, %113, %114;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // read-write float operands %0..%103
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // inputs start here at %104; their order fixes the indices used in the asm text
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // macro not defined: calling fma is reported as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x216x32 F32+=BF16*BF16 -- SS form: A and B are each passed as one 64-bit descriptor; wraps PTX wgmma.mma_async.sp m64n216k32.
+template <
+  GMMA::Major tnspA,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::Major tnspB,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand; defaults to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand; defaults to One
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand placed right after `e`
+>
+struct GMMA_64x216x32_F32BF16BF16_SS
+{
+  using DRegisters = void;  // no separate D operand: fma updates the C registers in place ("+f")
+  using ARegisters = uint64_t[1];  // A is one 64-bit value (desc_a)
+  using ERegisters = uint32_t[1];  // one 32-bit word `e` -- presumably the sparsity metadata; confirm against the PTX spec
+  using BRegisters = uint64_t[1];  // B is one 64-bit value (desc_b)
+  using CRegisters = float[108];  // 108 float accumulators, d000..d107
+
+  // Issues a single wgmma.mma_async.sp instruction; d000..d107 are read and written.
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // asm operand %108
+      uint64_t const& desc_b,  // asm operand %109
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      uint32_t const& e,  // asm operand %110
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // asm path exists only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): helper defined outside this chunk; presumably a sync-log hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %112, 0;\n"  // p = (int32_t(scale_D) != 0); %112 is the first input after spsel
+      "wgmma.mma_async.sp.sync.aligned.m64n216k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107},"
+      " %108,"  // desc_a
+      " %109,"  // desc_b
+      " %110, %111,"  // e, spsel
+      " p,    %113, %114, %115, %116;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // read-write float operands %0..%107
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
+      :  "l"(desc_a),  // inputs start here at %108; their order fixes the indices used in the asm text
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // macro not defined: calling fma is reported as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x216x32 F32+=BF16*BF16 -- RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor; wraps PTX wgmma.mma_async.sp m64n216k32.
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not passed to the instruction
+  GMMA::Major tnspB,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand; defaults to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand; defaults to One
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand placed right after `e`
+>
+struct GMMA_64x216x32_F32BF16BF16_RS
+{
+  using DRegisters = void;  // no separate D operand: fma updates the C registers in place ("+f")
+  using ARegisters = uint32_t[4];  // A is four 32-bit values, a000..a003
+  using ERegisters = uint32_t[1];  // one 32-bit word `e` -- presumably the sparsity metadata; confirm against the PTX spec
+  using BRegisters = uint64_t[1];  // B is one 64-bit value (desc_b)
+  using CRegisters = float[108];  // 108 float accumulators, d000..d107
+
+  static_assert(tnspA == GMMA::Major::K,  // compile-time rejection of any other A layout
+      "Register source operand A must have K major layout.");
+
+  // Issues a single wgmma.mma_async.sp instruction; d000..d107 are read and written.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // asm operands %108..%111
+      uint64_t const& desc_b,  // asm operand %112
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      uint32_t const& e,  // asm operand %113
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // asm path exists only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): helper defined outside this chunk; presumably a sync-log hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %115, 0;\n"  // p = (int32_t(scale_D) != 0); %115 is the first input after spsel
+      "wgmma.mma_async.sp.sync.aligned.m64n216k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107},"
+      "{%108, %109, %110, %111},"  // a000..a003
+      " %112,"  // desc_b
+      " %113, %114,"  // e, spsel
+      " p,    %116, %117, %118;\n"  // p, scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // read-write float operands %0..%107
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // inputs start here at %108; their order fixes the indices used in the asm text
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else  // macro not defined: calling fma is reported as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x32 F32+=BF16*BF16 -- SS form: A and B are each passed as one 64-bit descriptor; wraps PTX wgmma.mma_async.sp m64n224k32.
+template <
+  GMMA::Major tnspA,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::Major tnspB,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand; defaults to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand; defaults to One
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand placed right after `e`
+>
+struct GMMA_64x224x32_F32BF16BF16_SS
+{
+  using DRegisters = void;  // no separate D operand: fma updates the C registers in place ("+f")
+  using ARegisters = uint64_t[1];  // A is one 64-bit value (desc_a)
+  using ERegisters = uint32_t[1];  // one 32-bit word `e` -- presumably the sparsity metadata; confirm against the PTX spec
+  using BRegisters = uint64_t[1];  // B is one 64-bit value (desc_b)
+  using CRegisters = float[112];  // 112 float accumulators, d000..d111
+
+  // Issues a single wgmma.mma_async.sp instruction; d000..d111 are read and written.
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // asm operand %112
+      uint64_t const& desc_b,  // asm operand %113
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      uint32_t const& e,  // asm operand %114
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // asm path exists only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): helper defined outside this chunk; presumably a sync-log hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declare predicate register p
+      "setp.ne.b32 p, %116, 0;\n"  // p = (int32_t(scale_D) != 0); %116 is the first input after spsel
+      "wgmma.mma_async.sp.sync.aligned.m64n224k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      " %112,"  // desc_a
+      " %113,"  // desc_b
+      " %114, %115,"  // e, spsel
+      " p,    %117, %118, %119, %120;\n"  // p, scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // read-write float operands %0..%111
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
+      :  "l"(desc_a),  // inputs start here at %112; their order fixes the indices used in the asm text
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else  // macro not defined: calling fma is reported as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x32 F32+=BF16*BF16 (RS form: A in four 32-bit registers, B via a 64-bit descriptor; wraps wgmma.mma_async.sp m64n224k32)
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not forwarded to the asm in this RS form
+  GMMA::Major tnspB,  // forwarded to the asm as the last "n" immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as an "n" immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded as an "n" immediate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded as an "n" immediate right after `e`
+>
+struct GMMA_64x224x32_F32BF16BF16_RS
+{
+  using DRegisters = void;         // no separate D operand: the d### accumulators are read-write ("+f")
+  using ARegisters = uint32_t[4];  // matches a000..a003 of fma
+  using ERegisters = uint32_t[1];  // matches the single `e` argument of fma
+  using BRegisters = uint64_t[1];  // matches desc_b of fma
+  using CRegisters = float[112];   // matches d000..d111 of fma
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A operand, bound to %112..%115
+      uint64_t const& desc_b,  // B operand, bound to %116 as a 64-bit "l" value
+      float         & d000, float         & d001, float         & d002, float         & d003,  // accumulators d000..d111, bound to %0..%111
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      uint32_t const& e,  // bound to %117; presumably the sparsity metadata of the .sp instruction -- TODO confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value, bound to %119 and turned into predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register local to this braced PTX scope
+      "setp.ne.b32 p, %119, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n224k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%111: d000..d111
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      "{%112, %113, %114, %115},"  // a000..a003
+      " %116,"  // desc_b
+      " %117, %118,"  // e, spsel
+      " p,    %120, %121, %122;\n"  // p (from scale_D), scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // outputs: every accumulator is read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // inputs start at %112
+         "l"(desc_b),  // %116
+         "r"(e), "n"(int32_t(spsel)),  // %117, %118
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %119 runtime register; %120..%122 compile-time immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x232x32 F32+=BF16*BF16 (SS form: A and B both passed as 64-bit descriptors; wraps wgmma.mma_async.sp m64n232k32)
+template <
+  GMMA::Major tnspA,  // forwarded to the asm as an "n" immediate
+  GMMA::Major tnspB,  // forwarded to the asm as the last "n" immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as an "n" immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded as an "n" immediate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded as an "n" immediate right after `e`
+>
+struct GMMA_64x232x32_F32BF16BF16_SS
+{
+  using DRegisters = void;         // no separate D operand: the d### accumulators are read-write ("+f")
+  using ARegisters = uint64_t[1];  // matches desc_a of fma
+  using ERegisters = uint32_t[1];  // matches the single `e` argument of fma
+  using BRegisters = uint64_t[1];  // matches desc_b of fma
+  using CRegisters = float[116];   // matches d000..d115 of fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand, bound to %116 as a 64-bit "l" value
+      uint64_t const& desc_b,  // B operand, bound to %117 as a 64-bit "l" value
+      float         & d000, float         & d001, float         & d002, float         & d003,  // accumulators d000..d115, bound to %0..%115
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      uint32_t const& e,  // bound to %118; presumably the sparsity metadata of the .sp instruction -- TODO confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value, bound to %120 and turned into predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register local to this braced PTX scope
+      "setp.ne.b32 p, %120, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n232k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%115: d000..d115
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115},"
+      " %116,"  // desc_a
+      " %117,"  // desc_b
+      " %118, %119,"  // e, spsel
+      " p,    %121, %122, %123, %124;\n"  // p (from scale_D), scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // outputs: every accumulator is read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
+      :  "l"(desc_a),  // inputs start at %116
+         "l"(desc_b),  // %117
+         "r"(e), "n"(int32_t(spsel)),  // %118, %119
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %120 runtime register; %121..%124 compile-time immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x232x32 F32+=BF16*BF16 (RS form: A in four 32-bit registers, B via a 64-bit descriptor; wraps wgmma.mma_async.sp m64n232k32)
+template <
+  GMMA::Major tnspA,  // only checked by the static_assert below; not forwarded to the asm in this RS form
+  GMMA::Major tnspB,  // forwarded to the asm as the last "n" immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as an "n" immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded as an "n" immediate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded as an "n" immediate right after `e`
+>
+struct GMMA_64x232x32_F32BF16BF16_RS
+{
+  using DRegisters = void;         // no separate D operand: the d### accumulators are read-write ("+f")
+  using ARegisters = uint32_t[4];  // matches a000..a003 of fma
+  using ERegisters = uint32_t[1];  // matches the single `e` argument of fma
+  using BRegisters = uint64_t[1];  // matches desc_b of fma
+  using CRegisters = float[116];   // matches d000..d115 of fma
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A operand, bound to %116..%119
+      uint64_t const& desc_b,  // B operand, bound to %120 as a 64-bit "l" value
+      float         & d000, float         & d001, float         & d002, float         & d003,  // accumulators d000..d115, bound to %0..%115
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      uint32_t const& e,  // bound to %121; presumably the sparsity metadata of the .sp instruction -- TODO confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value, bound to %123 and turned into predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register local to this braced PTX scope
+      "setp.ne.b32 p, %123, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n232k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%115: d000..d115
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115},"
+      "{%116, %117, %118, %119},"  // a000..a003
+      " %120,"  // desc_b
+      " %121, %122,"  // e, spsel
+      " p,    %124, %125, %126;\n"  // p (from scale_D), scaleA, scaleB, tnspB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // outputs: every accumulator is read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // inputs start at %116
+         "l"(desc_b),  // %120
+         "r"(e), "n"(int32_t(spsel)),  // %121, %122
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %123 runtime register; %124..%126 compile-time immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x32 F32+=BF16*BF16 (SS form: A and B both passed as 64-bit descriptors; wraps wgmma.mma_async.sp m64n240k32)
+template <
+  GMMA::Major tnspA,  // forwarded to the asm as an "n" immediate
+  GMMA::Major tnspB,  // forwarded to the asm as the last "n" immediate
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as an "n" immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded as an "n" immediate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded as an "n" immediate right after `e`
+>
+struct GMMA_64x240x32_F32BF16BF16_SS
+{
+  using DRegisters = void;         // no separate D operand: the d### accumulators are read-write ("+f")
+  using ARegisters = uint64_t[1];  // matches desc_a of fma
+  using ERegisters = uint32_t[1];  // matches the single `e` argument of fma
+  using BRegisters = uint64_t[1];  // matches desc_b of fma
+  using CRegisters = float[120];   // matches d000..d119 of fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand, bound to %120 as a 64-bit "l" value
+      uint64_t const& desc_b,  // B operand, bound to %121 as a 64-bit "l" value
+      float         & d000, float         & d001, float         & d002, float         & d003,  // accumulators d000..d119, bound to %0..%119
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      uint32_t const& e,  // bound to %122; presumably the sparsity metadata of the .sp instruction -- TODO confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value, bound to %124 and turned into predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register local to this braced PTX scope
+      "setp.ne.b32 p, %124, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n240k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%119: d000..d119
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      " %120,"  // desc_a
+      " %121,"  // desc_b
+      " %122, %123,"  // e, spsel
+      " p,    %125, %126, %127, %128;\n"  // p (from scale_D), scaleA, scaleB, tnspA, tnspB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // outputs: every accumulator is read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
+      :  "l"(desc_a),  // inputs start at %120
+         "l"(desc_b),  // %121
+         "r"(e), "n"(int32_t(spsel)),  // %122, %123
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %124 runtime register; %125..%128 compile-time immediates
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x32 F32+=BF16*BF16
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x240x32_F32BF16BF16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[120];
+
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %127, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n240k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      "{%120, %121, %122, %123},"
+      " %124,"
+      " %125, %126,"
+      " p,    %128, %129, %130;\n"
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SPARSE GMMA 64x248x32 F32+=BF16*BF16
+// SS form: wraps one wgmma.mma_async.sp m64n248k32 f32.bf16.bf16 instruction with A and B both given as 64-bit descriptors; tnspA/tnspB, scaleA/scaleB and spsel are passed as immediates. Builds without CUTE_ARCH_MMA_SM90A_ENABLED reach CUTE_INVALID_CONTROL_PATH instead.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x248x32_F32BF16BF16_SS
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[124];
+  // fma operand map: %0-%123 = d000..d123 (read-write accumulators), %124 = desc_a, %125 = desc_b, %126 = e, %127 = spsel, %128 = scale_D (predicate p is set when it is non-zero), %129-%132 = scaleA, scaleB, tnspA, tnspB. NOTE(review): e is presumably the wgmma.sp sparsity metadata -- confirm against the PTX ISA.
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %128, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n248k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123},"
+      " %124,"
+      " %125,"
+      " %126, %127,"
+      " p,    %129, %130, %131, %132;\n"
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SPARSE GMMA 64x248x32 F32+=BF16*BF16
+// RS form: wraps one wgmma.mma_async.sp m64n248k32 f32.bf16.bf16 instruction with A in four 32-bit registers and B as a 64-bit descriptor; tnspB, scaleA/scaleB and spsel are passed as immediates. Builds without CUTE_ARCH_MMA_SM90A_ENABLED reach CUTE_INVALID_CONTROL_PATH instead.
+template <
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x248x32_F32BF16BF16_RS
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[124];
+  // A is a register operand here, so only K-major A is accepted; tnspA is checked below and is not forwarded to the instruction.
+  static_assert(tnspA == GMMA::Major::K,
+      "Register source operand A must have K major layout.");
+  // fma operand map: %0-%123 = d000..d123 (read-write accumulators), %124-%127 = a000..a003, %128 = desc_b, %129 = e, %130 = spsel, %131 = scale_D (predicate p is set when it is non-zero), %132-%134 = scaleA, scaleB, tnspB. NOTE(review): e is presumably the wgmma.sp sparsity metadata -- confirm against the PTX ISA.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %131, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n248k32.f32.bf16.bf16 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123},"
+      "{%124, %125, %126, %127},"
+      " %128,"
+      " %129, %130,"
+      " p,    %132, %133, %134;\n"
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SPARSE GMMA 64x24x16 TN F32+=TF32*TF32
+// SS form: wraps one wgmma.mma_async.sp m64n24k16 f32.tf32.tf32 instruction with A and B both given as 64-bit descriptors; only scaleA/scaleB and spsel are passed as immediates (no transpose operands). Builds without CUTE_ARCH_MMA_SM90A_ENABLED reach CUTE_INVALID_CONTROL_PATH instead.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x24x16_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[12];
+  // fma operand map: %0-%11 = d00..d11 (read-write accumulators), %12 = desc_a, %13 = desc_b, %14 = e, %15 = spsel, %16 = scale_D (predicate p is set when it is non-zero), %17-%18 = scaleA, scaleB. NOTE(review): e is presumably the wgmma.sp sparsity metadata -- confirm against the PTX ISA.
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %16, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n24k16.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      " %12,"
+      " %13,"
+      " %14, %15,"
+      " p,   %17, %18;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SPARSE GMMA 64x24x16 TN F32+=TF32*TF32
+// RS form: wraps one wgmma.mma_async.sp m64n24k16 f32.tf32.tf32 instruction with A in four 32-bit registers and B as a 64-bit descriptor; only scaleA/scaleB and spsel are passed as immediates (no transpose operands). Builds without CUTE_ARCH_MMA_SM90A_ENABLED reach CUTE_INVALID_CONTROL_PATH instead.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x24x16_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[12];
+  // fma operand map: %0-%11 = d00..d11 (read-write accumulators), %12-%15 = a00..a03, %16 = desc_b, %17 = e, %18 = spsel, %19 = scale_D (predicate p is set when it is non-zero), %20-%21 = scaleA, scaleB. NOTE(review): e is presumably the wgmma.sp sparsity metadata -- confirm against the PTX ISA.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %19, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n24k16.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      "{%12, %13, %14, %15},"
+      " %16,"
+      " %17, %18,"
+      " p,   %20, %21;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SPARSE GMMA 64x40x16 TN F32+=TF32*TF32
+// SS form: wraps one wgmma.mma_async.sp m64n40k16 f32.tf32.tf32 instruction with A and B both given as 64-bit descriptors; only scaleA/scaleB and spsel are passed as immediates (no transpose operands). Builds without CUTE_ARCH_MMA_SM90A_ENABLED reach CUTE_INVALID_CONTROL_PATH instead.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x40x16_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[20];
+  // fma operand map: %0-%19 = d00..d19 (read-write accumulators), %20 = desc_a, %21 = desc_b, %22 = e, %23 = spsel, %24 = scale_D (predicate p is set when it is non-zero), %25-%26 = scaleA, scaleB. NOTE(review): e is presumably the wgmma.sp sparsity metadata -- confirm against the PTX ISA.
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %24, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n40k16.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"
+      " %20,"
+      " %21,"
+      " %22, %23,"
+      " p,   %25, %26;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SPARSE GMMA 64x40x16 TN F32+=TF32*TF32
+// RS form: wraps one wgmma.mma_async.sp m64n40k16 f32.tf32.tf32 instruction with A in four 32-bit registers and B as a 64-bit descriptor; only scaleA/scaleB and spsel are passed as immediates (no transpose operands). Builds without CUTE_ARCH_MMA_SM90A_ENABLED reach CUTE_INVALID_CONTROL_PATH instead.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x40x16_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[20];
+  // fma operand map: %0-%19 = d00..d19 (read-write accumulators), %20-%23 = a00..a03, %24 = desc_b, %25 = e, %26 = spsel, %27 = scale_D (predicate p is set when it is non-zero), %28-%29 = scaleA, scaleB. NOTE(review): e is presumably the wgmma.sp sparsity metadata -- confirm against the PTX ISA.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %27, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n40k16.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"
+      "{%20, %21, %22, %23},"
+      " %24,"
+      " %25, %26,"
+      " p,   %28, %29;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SPARSE GMMA 64x48x16 TN F32+=TF32*TF32
+// SS form: wraps one wgmma.mma_async.sp m64n48k16 f32.tf32.tf32 instruction with A and B both given as 64-bit descriptors; only scaleA/scaleB and spsel are passed as immediates (no transpose operands). Builds without CUTE_ARCH_MMA_SM90A_ENABLED reach CUTE_INVALID_CONTROL_PATH instead.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x48x16_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[24];
+  // fma operand map: %0-%23 = d00..d23 (read-write accumulators), %24 = desc_a, %25 = desc_b, %26 = e, %27 = spsel, %28 = scale_D (predicate p is set when it is non-zero), %29-%30 = scaleA, scaleB. NOTE(review): e is presumably the wgmma.sp sparsity metadata -- confirm against the PTX ISA.
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %28, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n48k16.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      " %24,"
+      " %25,"
+      " %26, %27,"
+      " p,   %29, %30;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SPARSE GMMA 64x48x16 TN F32+=TF32*TF32
+// RS form: wraps one wgmma.mma_async.sp m64n48k16 f32.tf32.tf32 instruction with A in four 32-bit registers and B as a 64-bit descriptor; only scaleA/scaleB and spsel are passed as immediates (no transpose operands). Builds without CUTE_ARCH_MMA_SM90A_ENABLED reach CUTE_INVALID_CONTROL_PATH instead.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x48x16_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[24];
+  // fma operand map: %0-%23 = d00..d23 (read-write accumulators), %24-%27 = a00..a03, %28 = desc_b, %29 = e, %30 = spsel, %31 = scale_D (predicate p is set when it is non-zero), %32-%33 = scaleA, scaleB. NOTE(review): e is presumably the wgmma.sp sparsity metadata -- confirm against the PTX ISA.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %31, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n48k16.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      "{%24, %25, %26, %27},"
+      " %28,"
+      " %29, %30,"
+      " p,   %32, %33;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SPARSE GMMA 64x56x16 TN F32+=TF32*TF32
+// SS form: wraps one wgmma.mma_async.sp m64n56k16 f32.tf32.tf32 instruction with A and B both given as 64-bit descriptors; only scaleA/scaleB and spsel are passed as immediates (no transpose operands). Builds without CUTE_ARCH_MMA_SM90A_ENABLED reach CUTE_INVALID_CONTROL_PATH instead.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x56x16_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[28];
+  // fma operand map: %0-%27 = d00..d27 (read-write accumulators), %28 = desc_a, %29 = desc_b, %30 = e, %31 = spsel, %32 = scale_D (predicate p is set when it is non-zero), %33-%34 = scaleA, scaleB. NOTE(review): e is presumably the wgmma.sp sparsity metadata -- confirm against the PTX ISA.
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %32, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n56k16.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"
+      " %28,"
+      " %29,"
+      " %30, %31,"
+      " p,   %33, %34;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x56x16 TN F32+=TF32*TF32 (RS variant: A is passed as four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x56x16_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;  // fma() takes no separate destination; d00..d27 are updated in place
+  using ARegisters = uint32_t[4];  // A operand: a00..a03
+  using ERegisters = uint32_t[1];  // E operand: e (presumably the sparsity metadata word -- confirm)
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[28];  // accumulators: d00..d27
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %35, 0;\n"  // p = (scale_D != 0); %35 is the scale_D input below
+      "wgmma.mma_async.sp.sync.aligned.m64n56k16.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"
+      "{%28, %29, %30, %31},"  // a00..a03
+      " %32,"  // desc_b
+      " %33, %34,"  // e, spsel
+      " p,   %36, %37;\n"  // p (derived from scale_D), scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%27 are d00..d27; "+f" = read-write float register
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %28..%31
+         "l"(desc_b),  // %32
+         "r"(e), "n"(int32_t(spsel)),  // %33, %34 ("n" = compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %35, %36, %37
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x72x16 TN F32+=TF32*TF32 (SS variant: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x72x16_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;  // fma() takes no separate destination; d00..d35 are updated in place
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using ERegisters = uint32_t[1];  // E operand: e (presumably the sparsity metadata word -- confirm)
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[36];  // accumulators: d00..d35
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %40, 0;\n"  // p = (scale_D != 0); %40 is the scale_D input below
+      "wgmma.mma_async.sp.sync.aligned.m64n72k16.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"
+      " %36,"  // desc_a
+      " %37,"  // desc_b
+      " %38, %39,"  // e, spsel
+      " p,   %41, %42;\n"  // p (derived from scale_D), scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%35 are d00..d35; "+f" = read-write float register
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
+      :  "l"(desc_a),  // %36
+         "l"(desc_b),  // %37
+         "r"(e), "n"(int32_t(spsel)),  // %38, %39 ("n" = compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %40, %41, %42
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x72x16 TN F32+=TF32*TF32 (RS variant: A is passed as four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x72x16_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;  // fma() takes no separate destination; d00..d35 are updated in place
+  using ARegisters = uint32_t[4];  // A operand: a00..a03
+  using ERegisters = uint32_t[1];  // E operand: e (presumably the sparsity metadata word -- confirm)
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[36];  // accumulators: d00..d35
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %43, 0;\n"  // p = (scale_D != 0); %43 is the scale_D input below
+      "wgmma.mma_async.sp.sync.aligned.m64n72k16.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"
+      "{%36, %37, %38, %39},"  // a00..a03
+      " %40,"  // desc_b
+      " %41, %42,"  // e, spsel
+      " p,   %44, %45;\n"  // p (derived from scale_D), scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%35 are d00..d35; "+f" = read-write float register
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %36..%39
+         "l"(desc_b),  // %40
+         "r"(e), "n"(int32_t(spsel)),  // %41, %42 ("n" = compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %43, %44, %45
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x16 TN F32+=TF32*TF32 (SS variant: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x80x16_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;  // fma() takes no separate destination; d00..d39 are updated in place
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using ERegisters = uint32_t[1];  // E operand: e (presumably the sparsity metadata word -- confirm)
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[40];  // accumulators: d00..d39
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %44, 0;\n"  // p = (scale_D != 0); %44 is the scale_D input below
+      "wgmma.mma_async.sp.sync.aligned.m64n80k16.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      " %40,"  // desc_a
+      " %41,"  // desc_b
+      " %42, %43,"  // e, spsel
+      " p,   %45, %46;\n"  // p (derived from scale_D), scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%39 are d00..d39; "+f" = read-write float register
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
+      :  "l"(desc_a),  // %40
+         "l"(desc_b),  // %41
+         "r"(e), "n"(int32_t(spsel)),  // %42, %43 ("n" = compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %44, %45, %46
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x16 TN F32+=TF32*TF32 (RS variant: A is passed as four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x80x16_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;  // fma() takes no separate destination; d00..d39 are updated in place
+  using ARegisters = uint32_t[4];  // A operand: a00..a03
+  using ERegisters = uint32_t[1];  // E operand: e (presumably the sparsity metadata word -- confirm)
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[40];  // accumulators: d00..d39
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %47, 0;\n"  // p = (scale_D != 0); %47 is the scale_D input below
+      "wgmma.mma_async.sp.sync.aligned.m64n80k16.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      "{%40, %41, %42, %43},"  // a00..a03
+      " %44,"  // desc_b
+      " %45, %46,"  // e, spsel
+      " p,   %48, %49;\n"  // p (derived from scale_D), scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%39 are d00..d39; "+f" = read-write float register
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %40..%43
+         "l"(desc_b),  // %44
+         "r"(e), "n"(int32_t(spsel)),  // %45, %46 ("n" = compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %47, %48, %49
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x88x16 TN F32+=TF32*TF32 (SS variant: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x88x16_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;  // fma() takes no separate destination; d00..d43 are updated in place
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using ERegisters = uint32_t[1];  // E operand: e (presumably the sparsity metadata word -- confirm)
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[44];  // accumulators: d00..d43
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %48, 0;\n"  // p = (scale_D != 0); %48 is the scale_D input below
+      "wgmma.mma_async.sp.sync.aligned.m64n88k16.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"
+      " %44,"  // desc_a
+      " %45,"  // desc_b
+      " %46, %47,"  // e, spsel
+      " p,   %49, %50;\n"  // p (derived from scale_D), scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%43 are d00..d43; "+f" = read-write float register
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
+      :  "l"(desc_a),  // %44
+         "l"(desc_b),  // %45
+         "r"(e), "n"(int32_t(spsel)),  // %46, %47 ("n" = compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %48, %49, %50
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x88x16 TN F32+=TF32*TF32 (RS variant: A is passed as four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x88x16_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;  // fma() takes no separate destination; d00..d43 are updated in place
+  using ARegisters = uint32_t[4];  // A operand: a00..a03
+  using ERegisters = uint32_t[1];  // E operand: e (presumably the sparsity metadata word -- confirm)
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[44];  // accumulators: d00..d43
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %51, 0;\n"  // p = (scale_D != 0); %51 is the scale_D input below
+      "wgmma.mma_async.sp.sync.aligned.m64n88k16.f32.tf32.tf32 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"
+      "{%44, %45, %46, %47},"  // a00..a03
+      " %48,"  // desc_b
+      " %49, %50,"  // e, spsel
+      " p,   %52, %53;\n"  // p (derived from scale_D), scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%43 are d00..d43; "+f" = read-write float register
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %44..%47
+         "l"(desc_b),  // %48
+         "r"(e), "n"(int32_t(spsel)),  // %49, %50 ("n" = compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %51, %52, %53
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x104x16 TN F32+=TF32*TF32 (SS variant: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x104x16_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;  // fma() takes no separate destination; d00..d51 are updated in place
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using ERegisters = uint32_t[1];  // E operand: e (presumably the sparsity metadata word -- confirm)
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[52];  // accumulators: d00..d51
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %56, 0;\n"  // p = (scale_D != 0); %56 is the scale_D input below
+      "wgmma.mma_async.sp.sync.aligned.m64n104k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"
+      " %52,"  // desc_a
+      " %53,"  // desc_b
+      " %54, %55,"  // e, spsel
+      " p,    %57,  %58;\n"  // p (derived from scale_D), scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%51 are d00..d51; "+f" = read-write float register
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
+      :  "l"(desc_a),  // %52
+         "l"(desc_b),  // %53
+         "r"(e), "n"(int32_t(spsel)),  // %54, %55 ("n" = compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %56, %57, %58
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x104x16 TN F32+=TF32*TF32 (RS variant: A is passed as four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x104x16_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;  // fma() takes no separate destination; d00..d51 are updated in place
+  using ARegisters = uint32_t[4];  // A operand: a00..a03
+  using ERegisters = uint32_t[1];  // E operand: e (presumably the sparsity metadata word -- confirm)
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[52];  // accumulators: d00..d51
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %59, 0;\n"  // p = (scale_D != 0); %59 is the scale_D input below
+      "wgmma.mma_async.sp.sync.aligned.m64n104k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"
+      "{%52,  %53,  %54,  %55},"  // a00..a03
+      " %56,"  // desc_b
+      " %57, %58,"  // e, spsel
+      " p,    %60,  %61;\n"  // p (derived from scale_D), scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%51 are d00..d51; "+f" = read-write float register
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %52..%55
+         "l"(desc_b),  // %56
+         "r"(e), "n"(int32_t(spsel)),  // %57, %58 ("n" = compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %59, %60, %61
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x16 TN F32+=TF32*TF32 -- SS form: fma() takes A and B as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x112x16_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;         // no separate D arguments: d00..d55 are updated in place
+  using ARegisters = uint64_t[1];  // matches the single uint64_t desc_a argument of fma()
+  using ERegisters = uint32_t[1];  // matches the single uint32_t e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single uint64_t desc_b argument of fma()
+  using CRegisters = float[56];    // matches the 56 float arguments d00..d55 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      uint32_t const& e,  // placed right after desc_b in the instruction; presumably the sparsity metadata -- confirm against the PTX wgmma.sp syntax
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // only tested against 0 (see setp below)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // without this macro only the CUTE_INVALID_CONTROL_PATH branch is compiled
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %60, 0;\n"  // p = (scale_D != 0); %60 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n112k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      " %56,"  // desc_a
+      " %57,"  // desc_b
+      " %58, %59,"  // e, spsel
+      " p,    %61,  %62;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
+      :  "l"(desc_a),  // %56
+         "l"(desc_b),  // %57
+         "r"(e), "n"(int32_t(spsel)),  // %58, %59 ("n": template parameter passed as an immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %60, %61, %62
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x16 TN F32+=TF32*TF32 -- RS form: fma() takes A as four 32-bit registers (a00..a03) and B as a 64-bit descriptor (desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x112x16_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;         // no separate D arguments: d00..d55 are updated in place
+  using ARegisters = uint32_t[4];  // matches the four uint32_t arguments a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // matches the single uint32_t e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single uint64_t desc_b argument of fma()
+  using CRegisters = float[56];    // matches the 56 float arguments d00..d55 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      uint32_t const& e,  // placed right after desc_b in the instruction; presumably the sparsity metadata -- confirm against the PTX wgmma.sp syntax
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // only tested against 0 (see setp below)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // without this macro only the CUTE_INVALID_CONTROL_PATH branch is compiled
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %63, 0;\n"  // p = (scale_D != 0); %63 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n112k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      "{%56,  %57,  %58,  %59},"  // a00..a03
+      " %60,"  // desc_b
+      " %61, %62,"  // e, spsel
+      " p,    %64,  %65;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %56..%59
+         "l"(desc_b),  // %60
+         "r"(e), "n"(int32_t(spsel)),  // %61, %62 ("n": template parameter passed as an immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %63, %64, %65
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x120x16 TN F32+=TF32*TF32 -- SS form: fma() takes A and B as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x120x16_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;         // no separate D arguments: d00..d59 are updated in place
+  using ARegisters = uint64_t[1];  // matches the single uint64_t desc_a argument of fma()
+  using ERegisters = uint32_t[1];  // matches the single uint32_t e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single uint64_t desc_b argument of fma()
+  using CRegisters = float[60];    // matches the 60 float arguments d00..d59 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      uint32_t const& e,  // placed right after desc_b in the instruction; presumably the sparsity metadata -- confirm against the PTX wgmma.sp syntax
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // only tested against 0 (see setp below)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // without this macro only the CUTE_INVALID_CONTROL_PATH branch is compiled
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %64, 0;\n"  // p = (scale_D != 0); %64 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n120k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"
+      " %60,"  // desc_a
+      " %61,"  // desc_b
+      " %62, %63,"  // e, spsel
+      " p,    %65,  %66;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
+      :  "l"(desc_a),  // %60
+         "l"(desc_b),  // %61
+         "r"(e), "n"(int32_t(spsel)),  // %62, %63 ("n": template parameter passed as an immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %64, %65, %66
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x120x16 TN F32+=TF32*TF32 -- RS form: fma() takes A as four 32-bit registers (a00..a03) and B as a 64-bit descriptor (desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x120x16_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;         // no separate D arguments: d00..d59 are updated in place
+  using ARegisters = uint32_t[4];  // matches the four uint32_t arguments a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // matches the single uint32_t e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single uint64_t desc_b argument of fma()
+  using CRegisters = float[60];    // matches the 60 float arguments d00..d59 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      uint32_t const& e,  // placed right after desc_b in the instruction; presumably the sparsity metadata -- confirm against the PTX wgmma.sp syntax
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // only tested against 0 (see setp below)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // without this macro only the CUTE_INVALID_CONTROL_PATH branch is compiled
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %67, 0;\n"  // p = (scale_D != 0); %67 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n120k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"
+      "{%60,  %61,  %62,  %63},"  // a00..a03
+      " %64,"  // desc_b
+      " %65, %66,"  // e, spsel
+      " p,    %68,  %69;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %60..%63
+         "l"(desc_b),  // %64
+         "r"(e), "n"(int32_t(spsel)),  // %65, %66 ("n": template parameter passed as an immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %67, %68, %69
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x136x16 TN F32+=TF32*TF32 -- SS form: fma() takes A and B as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x136x16_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;         // no separate D arguments: d00..d67 are updated in place
+  using ARegisters = uint64_t[1];  // matches the single uint64_t desc_a argument of fma()
+  using ERegisters = uint32_t[1];  // matches the single uint32_t e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single uint64_t desc_b argument of fma()
+  using CRegisters = float[68];    // matches the 68 float arguments d00..d67 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      uint32_t const& e,  // placed right after desc_b in the instruction; presumably the sparsity metadata -- confirm against the PTX wgmma.sp syntax
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // only tested against 0 (see setp below)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // without this macro only the CUTE_INVALID_CONTROL_PATH branch is compiled
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %72, 0;\n"  // p = (scale_D != 0); %72 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n136k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67},"
+      " %68,"  // desc_a
+      " %69,"  // desc_b
+      " %70, %71,"  // e, spsel
+      " p,    %73,  %74;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
+      :  "l"(desc_a),  // %68
+         "l"(desc_b),  // %69
+         "r"(e), "n"(int32_t(spsel)),  // %70, %71 ("n": template parameter passed as an immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %72, %73, %74
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x136x16 TN F32+=TF32*TF32 -- RS form: fma() takes A as four 32-bit registers (a00..a03) and B as a 64-bit descriptor (desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x136x16_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;         // no separate D arguments: d00..d67 are updated in place
+  using ARegisters = uint32_t[4];  // matches the four uint32_t arguments a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // matches the single uint32_t e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single uint64_t desc_b argument of fma()
+  using CRegisters = float[68];    // matches the 68 float arguments d00..d67 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      uint32_t const& e,  // placed right after desc_b in the instruction; presumably the sparsity metadata -- confirm against the PTX wgmma.sp syntax
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // only tested against 0 (see setp below)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // without this macro only the CUTE_INVALID_CONTROL_PATH branch is compiled
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %75, 0;\n"  // p = (scale_D != 0); %75 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n136k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67},"
+      "{%68,  %69,  %70,  %71},"  // a00..a03
+      " %72,"  // desc_b
+      " %73, %74,"  // e, spsel
+      " p,    %76,  %77;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %68..%71
+         "l"(desc_b),  // %72
+         "r"(e), "n"(int32_t(spsel)),  // %73, %74 ("n": template parameter passed as an immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %75, %76, %77
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x16 TN F32+=TF32*TF32 -- SS form: fma() takes A and B as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x144x16_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;         // no separate D arguments: d00..d71 are updated in place
+  using ARegisters = uint64_t[1];  // matches the single uint64_t desc_a argument of fma()
+  using ERegisters = uint32_t[1];  // matches the single uint32_t e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single uint64_t desc_b argument of fma()
+  using CRegisters = float[72];    // matches the 72 float arguments d00..d71 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      uint32_t const& e,  // placed right after desc_b in the instruction; presumably the sparsity metadata -- confirm against the PTX wgmma.sp syntax
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // only tested against 0 (see setp below)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // without this macro only the CUTE_INVALID_CONTROL_PATH branch is compiled
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %76, 0;\n"  // p = (scale_D != 0); %76 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n144k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      " %72,"  // desc_a
+      " %73,"  // desc_b
+      " %74, %75,"  // e, spsel
+      " p,    %77,  %78;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
+      :  "l"(desc_a),  // %72
+         "l"(desc_b),  // %73
+         "r"(e), "n"(int32_t(spsel)),  // %74, %75 ("n": template parameter passed as an immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %76, %77, %78
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x16 TN F32+=TF32*TF32 -- RS form: fma() takes A as four 32-bit registers (a00..a03) and B as a 64-bit descriptor (desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x144x16_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;         // no separate D arguments: d00..d71 are updated in place
+  using ARegisters = uint32_t[4];  // matches the four uint32_t arguments a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // matches the single uint32_t e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single uint64_t desc_b argument of fma()
+  using CRegisters = float[72];    // matches the 72 float arguments d00..d71 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      uint32_t const& e,  // placed right after desc_b in the instruction; presumably the sparsity metadata -- confirm against the PTX wgmma.sp syntax
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // only tested against 0 (see setp below)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // without this macro only the CUTE_INVALID_CONTROL_PATH branch is compiled
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %79, 0;\n"  // p = (scale_D != 0); %79 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n144k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      "{%72,  %73,  %74,  %75},"  // a00..a03
+      " %76,"  // desc_b
+      " %77, %78,"  // e, spsel
+      " p,    %80,  %81;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %72..%75
+         "l"(desc_b),  // %76
+         "r"(e), "n"(int32_t(spsel)),  // %77, %78 ("n": template parameter passed as an immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %79, %80, %81
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x152x16 TN F32+=TF32*TF32 (SS: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as immediate ("n") operand %81
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the asm as immediate ("n") operand %82
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the asm as immediate ("n") operand %79
+>
+struct GMMA_64x152x16_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // same type/count as fma's desc_a parameter
+  using ERegisters = uint32_t[1];  // same type/count as fma's e parameter
+  using BRegisters = uint64_t[1];  // same type/count as fma's desc_b parameter
+  using CRegisters = float[76];  // same type/count as fma's d00..d75 parameters
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // asm input %76 ("l")
+      uint64_t const& desc_b,  // asm input %77 ("l")
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d75: asm operands %0..%75, read and written ("+f")
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      uint32_t const& e,  // asm input %78 ("r"); presumably the sparsity metadata of the .sp instruction - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // asm input %80 ("r"); compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm below is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %80, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n152k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75},"  // %0..%75: accumulators d00..d75
+      " %76,"  // desc_a
+      " %77,"  // desc_b
+      " %78, %79,"  // e, spsel
+      " p,    %81,  %82;\n"  // p (derived from scale_D), scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands are numbered in list order, starting at %0
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
+      :  "l"(desc_a),  // inputs continue the numbering at %76
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // macro not defined: report an invalid control path instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x152x16 TN F32+=TF32*TF32 (RS: A in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as immediate ("n") operand %84
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the asm as immediate ("n") operand %85
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the asm as immediate ("n") operand %82
+>
+struct GMMA_64x152x16_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // same type/count as fma's a00..a03 parameters
+  using ERegisters = uint32_t[1];  // same type/count as fma's e parameter
+  using BRegisters = uint64_t[1];  // same type/count as fma's desc_b parameter
+  using CRegisters = float[76];  // same type/count as fma's d00..d75 parameters
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // asm inputs %76..%79 ("r")
+      uint64_t const& desc_b,  // asm input %80 ("l")
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d75: asm operands %0..%75, read and written ("+f")
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      uint32_t const& e,  // asm input %81 ("r"); presumably the sparsity metadata of the .sp instruction - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // asm input %83 ("r"); compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm below is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %83, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n152k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75},"  // %0..%75: accumulators d00..d75
+      "{%76,  %77,  %78,  %79},"  // a00..a03
+      " %80,"  // desc_b
+      " %81, %82,"  // e, spsel
+      " p,    %84,  %85;\n"  // p (derived from scale_D), scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands are numbered in list order, starting at %0
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs continue the numbering at %76
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // macro not defined: report an invalid control path instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x16 TN F32+=TF32*TF32 (SS: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as immediate ("n") operand %85
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the asm as immediate ("n") operand %86
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the asm as immediate ("n") operand %83
+>
+struct GMMA_64x160x16_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // same type/count as fma's desc_a parameter
+  using ERegisters = uint32_t[1];  // same type/count as fma's e parameter
+  using BRegisters = uint64_t[1];  // same type/count as fma's desc_b parameter
+  using CRegisters = float[80];  // same type/count as fma's d00..d79 parameters
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // asm input %80 ("l")
+      uint64_t const& desc_b,  // asm input %81 ("l")
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d79: asm operands %0..%79, read and written ("+f")
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      uint32_t const& e,  // asm input %82 ("r"); presumably the sparsity metadata of the .sp instruction - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // asm input %84 ("r"); compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm below is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %84, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n160k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"  // %0..%79: accumulators d00..d79
+      " %80,"  // desc_a
+      " %81,"  // desc_b
+      " %82, %83,"  // e, spsel
+      " p,    %85,  %86;\n"  // p (derived from scale_D), scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands are numbered in list order, starting at %0
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
+      :  "l"(desc_a),  // inputs continue the numbering at %80
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // macro not defined: report an invalid control path instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x16 TN F32+=TF32*TF32 (RS: A in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as immediate ("n") operand %88
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the asm as immediate ("n") operand %89
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the asm as immediate ("n") operand %86
+>
+struct GMMA_64x160x16_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // same type/count as fma's a00..a03 parameters
+  using ERegisters = uint32_t[1];  // same type/count as fma's e parameter
+  using BRegisters = uint64_t[1];  // same type/count as fma's desc_b parameter
+  using CRegisters = float[80];  // same type/count as fma's d00..d79 parameters
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // asm inputs %80..%83 ("r")
+      uint64_t const& desc_b,  // asm input %84 ("l")
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d79: asm operands %0..%79, read and written ("+f")
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      uint32_t const& e,  // asm input %85 ("r"); presumably the sparsity metadata of the .sp instruction - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // asm input %87 ("r"); compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm below is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %87, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n160k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"  // %0..%79: accumulators d00..d79
+      "{%80,  %81,  %82,  %83},"  // a00..a03
+      " %84,"  // desc_b
+      " %85, %86,"  // e, spsel
+      " p,    %88,  %89;\n"  // p (derived from scale_D), scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands are numbered in list order, starting at %0
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs continue the numbering at %80
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // macro not defined: report an invalid control path instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x168x16 TN F32+=TF32*TF32 (SS: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as immediate ("n") operand %89
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the asm as immediate ("n") operand %90
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the asm as immediate ("n") operand %87
+>
+struct GMMA_64x168x16_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // same type/count as fma's desc_a parameter
+  using ERegisters = uint32_t[1];  // same type/count as fma's e parameter
+  using BRegisters = uint64_t[1];  // same type/count as fma's desc_b parameter
+  using CRegisters = float[84];  // same type/count as fma's d00..d83 parameters
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // asm input %84 ("l")
+      uint64_t const& desc_b,  // asm input %85 ("l")
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d83: asm operands %0..%83, read and written ("+f")
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      uint32_t const& e,  // asm input %86 ("r"); presumably the sparsity metadata of the .sp instruction - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // asm input %88 ("r"); compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm below is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %88, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n168k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83},"  // %0..%83: accumulators d00..d83
+      " %84,"  // desc_a
+      " %85,"  // desc_b
+      " %86, %87,"  // e, spsel
+      " p,    %89,  %90;\n"  // p (derived from scale_D), scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands are numbered in list order, starting at %0
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
+      :  "l"(desc_a),  // inputs continue the numbering at %84
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // macro not defined: report an invalid control path instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x168x16 TN F32+=TF32*TF32 (RS: A in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as immediate ("n") operand %92
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the asm as immediate ("n") operand %93
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the asm as immediate ("n") operand %90
+>
+struct GMMA_64x168x16_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // same type/count as fma's a00..a03 parameters
+  using ERegisters = uint32_t[1];  // same type/count as fma's e parameter
+  using BRegisters = uint64_t[1];  // same type/count as fma's desc_b parameter
+  using CRegisters = float[84];  // same type/count as fma's d00..d83 parameters
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // asm inputs %84..%87 ("r")
+      uint64_t const& desc_b,  // asm input %88 ("l")
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d83: asm operands %0..%83, read and written ("+f")
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      uint32_t const& e,  // asm input %89 ("r"); presumably the sparsity metadata of the .sp instruction - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // asm input %91 ("r"); compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm below is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %91, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n168k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83},"  // %0..%83: accumulators d00..d83
+      "{%84,  %85,  %86,  %87},"  // a00..a03
+      " %88,"  // desc_b
+      " %89, %90,"  // e, spsel
+      " p,    %92,  %93;\n"  // p (derived from scale_D), scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands are numbered in list order, starting at %0
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs continue the numbering at %84
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // macro not defined: report an invalid control path instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x16 TN F32+=TF32*TF32
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x176x16_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[88];
+
+  CUTE_HOST_DEVICE static void  // SS form: issues one sparse wgmma m64n176k16 (f32 += tf32 * tf32) with A and B both given as 64-bit descriptors
+  fma(uint64_t const& desc_a,  // asm operand %88 ("l": 64-bit register)
+      uint64_t const& desc_b,  // asm operand %89 ("l": 64-bit register)
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d87: 88 float accumulators, bound read-write ("+f") as %0..%87
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      uint32_t const& e,  // 32-bit register operand %90; presumably the sparsity metadata -- TODO confirm against the PTX wgmma.mma_async.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere), given this line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %92, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n176k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%87: accumulators d00..d87
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      " %88,"  // desc_a
+      " %89,"  // desc_b
+      " %90, %91,"  // e, spsel
+      " p,    %93,  %94;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // read-write operands %0..%87
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
+      :  "l"(desc_a),  // %88
+         "l"(desc_b),  // %89
+         "r"(e), "n"(int32_t(spsel)),  // %90 (register), %91 ("n": compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %92 (register), %93 and %94 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no wgmma is emitted, only the macro below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x16 TN F32+=TF32*TF32 (RS form: A from four 32-bit registers, B from a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand, right after e
+>
+struct GMMA_64x176x16_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;  // no separate D operand type; fma() updates the CRegisters accumulators in place ("+f")
+  using ARegisters = uint32_t[4];  // matches fma()'s a00..a03
+  using ERegisters = uint32_t[1];  // matches fma()'s e
+  using BRegisters = uint64_t[1];  // matches fma()'s desc_b
+  using CRegisters = float[88];  // matches fma()'s d00..d87
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp m64n176k16 (f32 += tf32 * tf32) via inline PTX
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand: asm operands %88..%91 ("r")
+      uint64_t const& desc_b,  // asm operand %92 ("l": 64-bit register)
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d87: 88 float accumulators, bound read-write ("+f") as %0..%87
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      uint32_t const& e,  // 32-bit register operand %93; presumably the sparsity metadata -- TODO confirm against the PTX wgmma.mma_async.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere), given this line number and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %95, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n176k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%87: accumulators d00..d87
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      "{%88,  %89,  %90,  %91},"  // a00..a03
+      " %92,"  // desc_b
+      " %93, %94,"  // e, spsel
+      " p,    %96,  %97;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // read-write operands %0..%87
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %88..%91
+         "l"(desc_b),  // %92
+         "r"(e), "n"(int32_t(spsel)),  // %93 (register), %94 ("n": compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %95 (register), %96 and %97 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no wgmma is emitted, only the macro below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x184x16 TN F32+=TF32*TF32 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand, right after e
+>
+struct GMMA_64x184x16_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;  // no separate D operand type; fma() updates the CRegisters accumulators in place ("+f")
+  using ARegisters = uint64_t[1];  // matches fma()'s desc_a
+  using ERegisters = uint32_t[1];  // matches fma()'s e
+  using BRegisters = uint64_t[1];  // matches fma()'s desc_b
+  using CRegisters = float[92];  // matches fma()'s d00..d91
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp m64n184k16 (f32 += tf32 * tf32) via inline PTX
+  fma(uint64_t const& desc_a,  // asm operand %92 ("l": 64-bit register)
+      uint64_t const& desc_b,  // asm operand %93 ("l": 64-bit register)
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d91: 92 float accumulators, bound read-write ("+f") as %0..%91
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      uint32_t const& e,  // 32-bit register operand %94; presumably the sparsity metadata -- TODO confirm against the PTX wgmma.mma_async.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere), given this line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %96, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n184k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%91: accumulators d00..d91
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91},"
+      " %92,"  // desc_a
+      " %93,"  // desc_b
+      " %94, %95,"  // e, spsel
+      " p,    %97,  %98;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // read-write operands %0..%91
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
+      :  "l"(desc_a),  // %92
+         "l"(desc_b),  // %93
+         "r"(e), "n"(int32_t(spsel)),  // %94 (register), %95 ("n": compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %96 (register), %97 and %98 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no wgmma is emitted, only the macro below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x184x16 TN F32+=TF32*TF32 (RS form: A from four 32-bit registers, B from a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand, right after e
+>
+struct GMMA_64x184x16_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;  // no separate D operand type; fma() updates the CRegisters accumulators in place ("+f")
+  using ARegisters = uint32_t[4];  // matches fma()'s a00..a03
+  using ERegisters = uint32_t[1];  // matches fma()'s e
+  using BRegisters = uint64_t[1];  // matches fma()'s desc_b
+  using CRegisters = float[92];  // matches fma()'s d00..d91
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp m64n184k16 (f32 += tf32 * tf32) via inline PTX
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand: asm operands %92..%95 ("r")
+      uint64_t const& desc_b,  // asm operand %96 ("l": 64-bit register)
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d91: 92 float accumulators, bound read-write ("+f") as %0..%91
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      uint32_t const& e,  // 32-bit register operand %97; presumably the sparsity metadata -- TODO confirm against the PTX wgmma.mma_async.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere), given this line number and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %99, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n184k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%91: accumulators d00..d91
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91},"
+      "{%92,  %93,  %94,  %95},"  // a00..a03
+      " %96,"  // desc_b
+      " %97, %98,"  // e, spsel
+      " p,    %100, %101;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // read-write operands %0..%91
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %92..%95
+         "l"(desc_b),  // %96
+         "r"(e), "n"(int32_t(spsel)),  // %97 (register), %98 ("n": compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %99 (register), %100 and %101 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no wgmma is emitted, only the macro below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x200x16 TN F32+=TF32*TF32 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand, right after e
+>
+struct GMMA_64x200x16_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;  // no separate D operand type; fma() updates the CRegisters accumulators in place ("+f")
+  using ARegisters = uint64_t[1];  // matches fma()'s desc_a
+  using ERegisters = uint32_t[1];  // matches fma()'s e
+  using BRegisters = uint64_t[1];  // matches fma()'s desc_b
+  using CRegisters = float[100];  // matches fma()'s d000..d099
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp m64n200k16 (f32 += tf32 * tf32) via inline PTX
+  fma(uint64_t const& desc_a,  // asm operand %100 ("l": 64-bit register)
+      uint64_t const& desc_b,  // asm operand %101 ("l": 64-bit register)
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d099: 100 float accumulators, bound read-write ("+f") as %0..%99
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      uint32_t const& e,  // 32-bit register operand %102; presumably the sparsity metadata -- TODO confirm against the PTX wgmma.mma_async.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere), given this line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %104, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n200k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%99: accumulators d000..d099
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99},"
+      " %100,"  // desc_a
+      " %101,"  // desc_b
+      " %102, %103,"  // e, spsel
+      " p,    %105, %106;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // read-write operands %0..%99
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
+      :  "l"(desc_a),  // %100
+         "l"(desc_b),  // %101
+         "r"(e), "n"(int32_t(spsel)),  // %102 (register), %103 ("n": compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %104 (register), %105 and %106 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no wgmma is emitted, only the macro below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x200x16 TN F32+=TF32*TF32 (RS form: A from four 32-bit registers, B from a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate ("n") operand, right after e
+>
+struct GMMA_64x200x16_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;  // no separate D operand type; fma() updates the CRegisters accumulators in place ("+f")
+  using ARegisters = uint32_t[4];  // matches fma()'s a000..a003
+  using ERegisters = uint32_t[1];  // matches fma()'s e
+  using BRegisters = uint64_t[1];  // matches fma()'s desc_b
+  using CRegisters = float[100];  // matches fma()'s d000..d099
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp m64n200k16 (f32 += tf32 * tf32) via inline PTX
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A operand: asm operands %100..%103 ("r")
+      uint64_t const& desc_b,  // asm operand %104 ("l": 64-bit register)
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d099: 100 float accumulators, bound read-write ("+f") as %0..%99
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      uint32_t const& e,  // 32-bit register operand %105; presumably the sparsity metadata -- TODO confirm against the PTX wgmma.mma_async.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere), given this line number and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %107, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n200k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%99: accumulators d000..d099
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99},"
+      "{%100, %101, %102, %103},"  // a000..a003
+      " %104,"  // desc_b
+      " %105, %106,"  // e, spsel
+      " p,    %108, %109;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // read-write operands %0..%99
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %100..%103
+         "l"(desc_b),  // %104
+         "r"(e), "n"(int32_t(spsel)),  // %105 (register), %106 ("n": compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %107 (register), %108 and %109 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no wgmma is emitted, only the macro below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x16 TN F32+=TF32*TF32 (SS form: fma() receives both A and B as 64-bit descriptor values)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the wgmma instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the wgmma instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate ("n") operand emitted right after e
+>
+struct GMMA_64x208x16_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are updated in place ("+f")
+  using ARegisters = uint64_t[1];  // one 64-bit value: desc_a
+  using ERegisters = uint32_t[1];  // one 32-bit value: e
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = float[104];  // 104 floats: d000..d103
+
+  // Issues one wgmma.mma_async.sp m64n208k16 instruction that reads and writes d000..d103.
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand ("l"); presumably a shared-memory matrix descriptor -- confirm
+      uint64_t const& desc_b,  // B operand ("l"); presumably a shared-memory matrix descriptor -- confirm
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d103: accumulators
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      uint32_t const& e,  // 32-bit register operand placed after desc_b; presumably the sparsity metadata -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); gets the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %108, 0;\n"  // %108 = int32_t(scale_D); p is true when it is nonzero
+      "wgmma.mma_async.sp.sync.aligned.m64n208k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%103 = d000..d103
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      " %104,"  // desc_a
+      " %105,"  // desc_b
+      " %106, %107,"  // e, spsel
+      " p,    %109, %110;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // "+f": each accumulator is both an input and an output
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
+      :  "l"(desc_a),  // %104
+         "l"(desc_b),  // %105
+         "r"(e), "n"(int32_t(spsel)),  // %106, %107
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %108, %109, %110
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: report an invalid control path instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x16 TN F32+=TF32*TF32 (RS form: fma() receives A as four 32-bit registers and B as a 64-bit descriptor value)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the wgmma instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the wgmma instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate ("n") operand emitted right after e
+>
+struct GMMA_64x208x16_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are updated in place ("+f")
+  using ARegisters = uint32_t[4];  // four 32-bit values: a000..a003
+  using ERegisters = uint32_t[1];  // one 32-bit value: e
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = float[104];  // 104 floats: d000..d103
+
+  // Issues one wgmma.mma_async.sp m64n208k16 instruction that reads and writes d000..d103.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A operand, passed in registers ("r")
+      uint64_t const& desc_b,  // B operand ("l"); presumably a shared-memory matrix descriptor -- confirm
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d103: accumulators
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      uint32_t const& e,  // 32-bit register operand placed after desc_b; presumably the sparsity metadata -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); gets the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %111, 0;\n"  // %111 = int32_t(scale_D); p is true when it is nonzero
+      "wgmma.mma_async.sp.sync.aligned.m64n208k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%103 = d000..d103
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      "{%104, %105, %106, %107},"  // a000..a003
+      " %108,"  // desc_b
+      " %109, %110,"  // e, spsel
+      " p,    %112, %113;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // "+f": each accumulator is both an input and an output
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %104..%107
+         "l"(desc_b),  // %108
+         "r"(e), "n"(int32_t(spsel)),  // %109, %110
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %111, %112, %113
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: report an invalid control path instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x216x16 TN F32+=TF32*TF32 (SS form: fma() receives both A and B as 64-bit descriptor values)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the wgmma instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the wgmma instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate ("n") operand emitted right after e
+>
+struct GMMA_64x216x16_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are updated in place ("+f")
+  using ARegisters = uint64_t[1];  // one 64-bit value: desc_a
+  using ERegisters = uint32_t[1];  // one 32-bit value: e
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = float[108];  // 108 floats: d000..d107
+
+  // Issues one wgmma.mma_async.sp m64n216k16 instruction that reads and writes d000..d107.
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand ("l"); presumably a shared-memory matrix descriptor -- confirm
+      uint64_t const& desc_b,  // B operand ("l"); presumably a shared-memory matrix descriptor -- confirm
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d107: accumulators
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      uint32_t const& e,  // 32-bit register operand placed after desc_b; presumably the sparsity metadata -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); gets the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %112, 0;\n"  // %112 = int32_t(scale_D); p is true when it is nonzero
+      "wgmma.mma_async.sp.sync.aligned.m64n216k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%107 = d000..d107
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107},"
+      " %108,"  // desc_a
+      " %109,"  // desc_b
+      " %110, %111,"  // e, spsel
+      " p,    %113, %114;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // "+f": each accumulator is both an input and an output
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
+      :  "l"(desc_a),  // %108
+         "l"(desc_b),  // %109
+         "r"(e), "n"(int32_t(spsel)),  // %110, %111
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %112, %113, %114
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: report an invalid control path instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x216x16 TN F32+=TF32*TF32 (RS form: fma() receives A as four 32-bit registers and B as a 64-bit descriptor value)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the wgmma instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the wgmma instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate ("n") operand emitted right after e
+>
+struct GMMA_64x216x16_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are updated in place ("+f")
+  using ARegisters = uint32_t[4];  // four 32-bit values: a000..a003
+  using ERegisters = uint32_t[1];  // one 32-bit value: e
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = float[108];  // 108 floats: d000..d107
+
+  // Issues one wgmma.mma_async.sp m64n216k16 instruction that reads and writes d000..d107.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A operand, passed in registers ("r")
+      uint64_t const& desc_b,  // B operand ("l"); presumably a shared-memory matrix descriptor -- confirm
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d107: accumulators
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      uint32_t const& e,  // 32-bit register operand placed after desc_b; presumably the sparsity metadata -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); gets the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %115, 0;\n"  // %115 = int32_t(scale_D); p is true when it is nonzero
+      "wgmma.mma_async.sp.sync.aligned.m64n216k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%107 = d000..d107
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107},"
+      "{%108, %109, %110, %111},"  // a000..a003
+      " %112,"  // desc_b
+      " %113, %114,"  // e, spsel
+      " p,    %116, %117;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // "+f": each accumulator is both an input and an output
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %108..%111
+         "l"(desc_b),  // %112
+         "r"(e), "n"(int32_t(spsel)),  // %113, %114
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %115, %116, %117
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: report an invalid control path instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x16 TN F32+=TF32*TF32 (SS form: fma() receives both A and B as 64-bit descriptor values)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the wgmma instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the wgmma instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate ("n") operand emitted right after e
+>
+struct GMMA_64x224x16_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are updated in place ("+f")
+  using ARegisters = uint64_t[1];  // one 64-bit value: desc_a
+  using ERegisters = uint32_t[1];  // one 32-bit value: e
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = float[112];  // 112 floats: d000..d111
+
+  // Issues one wgmma.mma_async.sp m64n224k16 instruction that reads and writes d000..d111.
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand ("l"); presumably a shared-memory matrix descriptor -- confirm
+      uint64_t const& desc_b,  // B operand ("l"); presumably a shared-memory matrix descriptor -- confirm
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d111: accumulators
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      uint32_t const& e,  // 32-bit register operand placed after desc_b; presumably the sparsity metadata -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); gets the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %116, 0;\n"  // %116 = int32_t(scale_D); p is true when it is nonzero
+      "wgmma.mma_async.sp.sync.aligned.m64n224k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%111 = d000..d111
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      " %112,"  // desc_a
+      " %113,"  // desc_b
+      " %114, %115,"  // e, spsel
+      " p,    %117, %118;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // "+f": each accumulator is both an input and an output
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
+      :  "l"(desc_a),  // %112
+         "l"(desc_b),  // %113
+         "r"(e), "n"(int32_t(spsel)),  // %114, %115
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %116, %117, %118
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: report an invalid control path instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x16 TN F32+=TF32*TF32 (RS variant: A passed as four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,     // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,     // forwarded to the instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero   // forwarded to the instruction as an immediate ("n") operand
+>
+struct GMMA_64x224x16_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;          // no separate D operand: the d### accumulators are read-write ("+f")
+  using ARegisters = uint32_t[4];   // a000..a003
+  using ERegisters = uint32_t[1];   // operand `e`, passed next to spsel (presumably the sparsity metadata -- confirm)
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = float[112];    // d000..d111
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)   // run-time value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %119, 0;\n"   // %119 = scale_D; p is set when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n224k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "   // %0..%111 = d000..d111
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      "{%112, %113, %114, %115},"   // a000..a003
+      " %116,"                      // desc_b
+      " %117, %118,"                // e, spsel
+      " p,    %120, %121;\n"        // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else   // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x232x16 TN F32+=TF32*TF32 (SS variant: A and B are each passed as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,     // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,     // forwarded to the instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero   // forwarded to the instruction as an immediate ("n") operand
+>
+struct GMMA_64x232x16_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;          // no separate D operand: the d### accumulators are read-write ("+f")
+  using ARegisters = uint64_t[1];   // desc_a
+  using ERegisters = uint32_t[1];   // operand `e`, passed next to spsel (presumably the sparsity metadata -- confirm)
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = float[116];    // d000..d115
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)   // run-time value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %120, 0;\n"   // %120 = scale_D; p is set when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n232k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "   // %0..%115 = d000..d115
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115},"
+      " %116,"                      // desc_a
+      " %117,"                      // desc_b
+      " %118, %119,"                // e, spsel
+      " p,    %121, %122;\n"        // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else   // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x232x16 TN F32+=TF32*TF32 (RS variant: A passed as four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,     // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,     // forwarded to the instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero   // forwarded to the instruction as an immediate ("n") operand
+>
+struct GMMA_64x232x16_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;          // no separate D operand: the d### accumulators are read-write ("+f")
+  using ARegisters = uint32_t[4];   // a000..a003
+  using ERegisters = uint32_t[1];   // operand `e`, passed next to spsel (presumably the sparsity metadata -- confirm)
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = float[116];    // d000..d115
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)   // run-time value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %123, 0;\n"   // %123 = scale_D; p is set when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n232k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "   // %0..%115 = d000..d115
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115},"
+      "{%116, %117, %118, %119},"   // a000..a003
+      " %120,"                      // desc_b
+      " %121, %122,"                // e, spsel
+      " p,    %124, %125;\n"        // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else   // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x16 TN F32+=TF32*TF32 (SS variant: A and B are each passed as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,     // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,     // forwarded to the instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero   // forwarded to the instruction as an immediate ("n") operand
+>
+struct GMMA_64x240x16_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;          // no separate D operand: the d### accumulators are read-write ("+f")
+  using ARegisters = uint64_t[1];   // desc_a
+  using ERegisters = uint32_t[1];   // operand `e`, passed next to spsel (presumably the sparsity metadata -- confirm)
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = float[120];    // d000..d119
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)   // run-time value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %124, 0;\n"   // %124 = scale_D; p is set when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n240k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "   // %0..%119 = d000..d119
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      " %120,"                      // desc_a
+      " %121,"                      // desc_b
+      " %122, %123,"                // e, spsel
+      " p,    %125, %126;\n"        // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else   // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x16 TN F32+=TF32*TF32 (RS variant: A passed as four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,     // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,     // forwarded to the instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero   // forwarded to the instruction as an immediate ("n") operand
+>
+struct GMMA_64x240x16_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;          // no separate D operand: the d### accumulators are read-write ("+f")
+  using ARegisters = uint32_t[4];   // a000..a003
+  using ERegisters = uint32_t[1];   // operand `e`, passed next to spsel (presumably the sparsity metadata -- confirm)
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = float[120];    // d000..d119
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)   // run-time value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %127, 0;\n"   // %127 = scale_D; p is set when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n240k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "   // %0..%119 = d000..d119
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      "{%120, %121, %122, %123},"   // a000..a003
+      " %124,"                      // desc_b
+      " %125, %126,"                // e, spsel
+      " p,    %128, %129;\n"        // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else   // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x248x16 TN F32+=TF32*TF32 (SS: A and B are both passed as 64-bit matrix descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,     // compile-time immediate, bound as "n" operand %129
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,     // compile-time immediate, bound as "n" operand %130
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero   // compile-time immediate, bound as "n" operand %127
+>
+struct GMMA_64x248x16_F32TF32TF32_SS_TN
+{
+  using DRegisters = void;          // no separate D: the d### accumulators are read-write ("+f") operands
+  using ARegisters = uint64_t[1];   // A: one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];   // E: one 32-bit word (e), the sp-meta operand of wgmma.mma_async.sp
+  using BRegisters = uint64_t[1];   // B: one 64-bit descriptor (desc_b)
+  using CRegisters = float[124];    // accumulators d000..d123
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,       // -> %124
+      uint64_t const& desc_b,       // -> %125
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d123 -> %0..%123, updated in place
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      uint32_t const& e,            // -> %126
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // -> %128, only compared against 0 below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm below is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): helper defined outside this view; presumably a logging hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"             // predicate register local to this asm scope
+      "setp.ne.b32 p, %128, 0;\n"   // p = (scale_D != 0); per the PTX ISA, scale-d == false computes D = A*B without adding the old D
+      "wgmma.mma_async.sp.sync.aligned.m64n248k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123},"   // end of the accumulator register list
+      " %124,"                      // desc_a
+      " %125,"                      // desc_b
+      " %126, %127,"                // e (sp-meta), spsel (sp-sel)
+      " p,    %129, %130;\n"        // scale-d predicate, then the scaleA / scaleB immediates
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%123, read-write
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
+      :  "l"(desc_a),               // %124
+         "l"(desc_b),               // %125
+         "r"(e), "n"(int32_t(spsel)),  // %126, %127
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %128, %129, %130
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x248x16 TN F32+=TF32*TF32 (RS: A is passed in four 32-bit registers, B as a 64-bit matrix descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,     // compile-time immediate, bound as "n" operand %132
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,     // compile-time immediate, bound as "n" operand %133
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero   // compile-time immediate, bound as "n" operand %130
+>
+struct GMMA_64x248x16_F32TF32TF32_RS_TN
+{
+  using DRegisters = void;          // no separate D: the d### accumulators are read-write ("+f") operands
+  using ARegisters = uint32_t[4];   // A: four 32-bit registers (a000..a003)
+  using ERegisters = uint32_t[1];   // E: one 32-bit word (e), the sp-meta operand of wgmma.mma_async.sp
+  using BRegisters = uint64_t[1];   // B: one 64-bit descriptor (desc_b)
+  using CRegisters = float[124];    // accumulators d000..d123
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // -> %124..%127
+      uint64_t const& desc_b,       // -> %128
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d123 -> %0..%123, updated in place
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      uint32_t const& e,            // -> %129
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // -> %131, only compared against 0 below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm below is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): helper defined outside this view; presumably a logging hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"             // predicate register local to this asm scope
+      "setp.ne.b32 p, %131, 0;\n"   // p = (scale_D != 0); per the PTX ISA, scale-d == false computes D = A*B without adding the old D
+      "wgmma.mma_async.sp.sync.aligned.m64n248k16.f32.tf32.tf32 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123},"   // end of the accumulator register list
+      "{%124, %125, %126, %127},"   // a000..a003
+      " %128,"                      // desc_b
+      " %129, %130,"                // e (sp-meta), spsel (sp-sel)
+      " p,    %132, %133;\n"        // scale-d predicate, then the scaleA / scaleB immediates
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%123, read-write
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %124..%127
+         "l"(desc_b),               // %128
+         "r"(e), "n"(int32_t(spsel)),  // %129, %130
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %131, %132, %133
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x24x64 TN S32+=S8*S8 (SS: A and B are both passed as 64-bit matrix descriptors)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero   // compile-time immediate, bound as "n" operand %15
+>
+struct GMMA_64x24x64_S32S8S8_SS_TN
+{
+  using DRegisters = void;          // no separate D: the d## accumulators are read-write ("+r") operands
+  using ARegisters = uint64_t[1];   // A: one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];   // E: one 32-bit word (e), the sp-meta operand of wgmma.mma_async.sp
+  using BRegisters = uint64_t[1];   // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[12];  // accumulators d00..d11, carried in 32-bit registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,       // -> %12
+      uint64_t const& desc_b,       // -> %13
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d11 -> %0..%11, updated in place
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t const& e,            // -> %14
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // -> %16, only compared against 0 below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm below is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): helper defined outside this view; presumably a logging hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"             // predicate register local to this asm scope
+      "setp.ne.b32 p, %16, 0;\n"    // p = (scale_D != 0); per the PTX ISA, scale-d == false computes D = A*B without adding the old D
+      "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.s8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"       // end of the accumulator register list
+      " %12,"                       // desc_a
+      " %13,"                       // desc_b
+      " %14, %15,"                  // e (sp-meta), spsel (sp-sel)
+      " p;\n"                       // scale-d predicate; this form takes no scaleA / scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%11, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "l"(desc_a),               // %12
+         "l"(desc_b),               // %13
+         "r"(e), "n"(int32_t(spsel)),  // %14, %15
+         "r"(int32_t(scale_D)));    // %16
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x24x64 TN S32+=S8*S8 (SS descriptors; instruction carries the .satfinite qualifier)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero   // compile-time immediate, bound as "n" operand %15
+>
+struct GMMA_64x24x64_S32S8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;          // no separate D: the d## accumulators are read-write ("+r") operands
+  using ARegisters = uint64_t[1];   // A: one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];   // E: one 32-bit word (e), the sp-meta operand of wgmma.mma_async.sp
+  using BRegisters = uint64_t[1];   // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[12];  // accumulators d00..d11, carried in 32-bit registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,       // -> %12
+      uint64_t const& desc_b,       // -> %13
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d11 -> %0..%11, updated in place
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t const& e,            // -> %14
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // -> %16, only compared against 0 below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm below is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): helper defined outside this view; presumably a logging hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"             // predicate register local to this asm scope
+      "setp.ne.b32 p, %16, 0;\n"    // p = (scale_D != 0); per the PTX ISA, scale-d == false computes D = A*B without adding the old D
+      "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.s8.s8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"       // end of the accumulator register list
+      " %12,"                       // desc_a
+      " %13,"                       // desc_b
+      " %14, %15,"                  // e (sp-meta), spsel (sp-sel)
+      " p;\n"                       // scale-d predicate; this form takes no scaleA / scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%11, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "l"(desc_a),               // %12
+         "l"(desc_b),               // %13
+         "r"(e), "n"(int32_t(spsel)),  // %14, %15
+         "r"(int32_t(scale_D)));    // %16
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x48x64 TN S32+=S8*S8 (SS: A and B are both passed as 64-bit matrix descriptors)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero   // compile-time immediate, bound as "n" operand %27
+>
+struct GMMA_64x48x64_S32S8S8_SS_TN
+{
+  using DRegisters = void;          // no separate D: the d## accumulators are read-write ("+r") operands
+  using ARegisters = uint64_t[1];   // A: one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];   // E: one 32-bit word (e), the sp-meta operand of wgmma.mma_async.sp
+  using BRegisters = uint64_t[1];   // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[24];  // accumulators d00..d23, carried in 32-bit registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,       // -> %24
+      uint64_t const& desc_b,       // -> %25
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d23 -> %0..%23, updated in place
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t const& e,            // -> %26
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // -> %28, only compared against 0 below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm below is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): helper defined outside this view; presumably a logging hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"             // predicate register local to this asm scope
+      "setp.ne.b32 p, %28, 0;\n"    // p = (scale_D != 0); per the PTX ISA, scale-d == false computes D = A*B without adding the old D
+      "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.s8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"  // end of the accumulator register list
+      " %24,"                       // desc_a
+      " %25,"                       // desc_b
+      " %26, %27,"                  // e (sp-meta), spsel (sp-sel)
+      " p;\n"                       // scale-d predicate; this form takes no scaleA / scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%23, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "l"(desc_a),               // %24
+         "l"(desc_b),               // %25
+         "r"(e), "n"(int32_t(spsel)),  // %26, %27
+         "r"(int32_t(scale_D)));    // %28
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x48x64 TN S32+=S8*S8 (SS descriptors; instruction carries the .satfinite qualifier)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero   // compile-time immediate, bound as "n" operand %27
+>
+struct GMMA_64x48x64_S32S8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;          // no separate D: the d## accumulators are read-write ("+r") operands
+  using ARegisters = uint64_t[1];   // A: one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];   // E: one 32-bit word (e), the sp-meta operand of wgmma.mma_async.sp
+  using BRegisters = uint64_t[1];   // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[24];  // accumulators d00..d23, carried in 32-bit registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,       // -> %24
+      uint64_t const& desc_b,       // -> %25
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d23 -> %0..%23, updated in place
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t const& e,            // -> %26
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // -> %28, only compared against 0 below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm below is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): helper defined outside this view; presumably a logging hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"             // predicate register local to this asm scope
+      "setp.ne.b32 p, %28, 0;\n"    // p = (scale_D != 0); per the PTX ISA, scale-d == false computes D = A*B without adding the old D
+      "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.s8.s8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"  // end of the accumulator register list
+      " %24,"                       // desc_a
+      " %25,"                       // desc_b
+      " %26, %27,"                  // e (sp-meta), spsel (sp-sel)
+      " p;\n"                       // scale-d predicate; this form takes no scaleA / scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%23, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "l"(desc_a),               // %24
+         "l"(desc_b),               // %25
+         "r"(e), "n"(int32_t(spsel)),  // %26, %27
+         "r"(int32_t(scale_D)));    // %28
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x64 TN S32+=S8*S8 (SS: A and B are both passed as 64-bit matrix descriptors)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero   // compile-time immediate, bound as "n" operand %43
+>
+struct GMMA_64x80x64_S32S8S8_SS_TN
+{
+  using DRegisters = void;          // no separate D: the d## accumulators are read-write ("+r") operands
+  using ARegisters = uint64_t[1];   // A: one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];   // E: one 32-bit word (e), the sp-meta operand of wgmma.mma_async.sp
+  using BRegisters = uint64_t[1];   // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[40];  // accumulators d00..d39, carried in 32-bit registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,       // -> %40
+      uint64_t const& desc_b,       // -> %41
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d39 -> %0..%39, updated in place
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t const& e,            // -> %42
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // -> %44, only compared against 0 below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm below is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): helper defined outside this view; presumably a logging hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"             // predicate register local to this asm scope
+      "setp.ne.b32 p, %44, 0;\n"    // p = (scale_D != 0); per the PTX ISA, scale-d == false computes D = A*B without adding the old D
+      "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.s8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"  // end of the accumulator register list
+      " %40,"                       // desc_a
+      " %41,"                       // desc_b
+      " %42, %43,"                  // e (sp-meta), spsel (sp-sel)
+      " p;\n"                       // scale-d predicate; this form takes no scaleA / scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%39, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "l"(desc_a),               // %40
+         "l"(desc_b),               // %41
+         "r"(e), "n"(int32_t(spsel)),  // %42, %43
+         "r"(int32_t(scale_D)));    // %44
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x64 TN S32+=S8*S8 (SS descriptors; instruction carries the .satfinite qualifier)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero   // compile-time immediate, bound as "n" operand %43
+>
+struct GMMA_64x80x64_S32S8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;          // no separate D: the d## accumulators are read-write ("+r") operands
+  using ARegisters = uint64_t[1];   // A: one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];   // E: one 32-bit word (e), the sp-meta operand of wgmma.mma_async.sp
+  using BRegisters = uint64_t[1];   // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[40];  // accumulators d00..d39, carried in 32-bit registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,       // -> %40
+      uint64_t const& desc_b,       // -> %41
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d39 -> %0..%39, updated in place
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t const& e,            // -> %42
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // -> %44, only compared against 0 below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm below is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): helper defined outside this view; presumably a logging hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"             // predicate register local to this asm scope
+      "setp.ne.b32 p, %44, 0;\n"    // p = (scale_D != 0); per the PTX ISA, scale-d == false computes D = A*B without adding the old D
+      "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.s8.s8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"  // end of the accumulator register list
+      " %40,"                       // desc_a
+      " %41,"                       // desc_b
+      " %42, %43,"                  // e (sp-meta), spsel (sp-sel)
+      " p;\n"                       // scale-d predicate; this form takes no scaleA / scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%39, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "l"(desc_a),               // %40
+         "l"(desc_b),               // %41
+         "r"(e), "n"(int32_t(spsel)),  // %42, %43
+         "r"(int32_t(scale_D)));    // %44
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x64 TN S32+=S8*S8 (SS: A and B are both passed as 64-bit matrix descriptors)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero   // compile-time immediate, bound as "n" operand %59
+>
+struct GMMA_64x112x64_S32S8S8_SS_TN
+{
+  using DRegisters = void;          // no separate D: the d## accumulators are read-write ("+r") operands
+  using ARegisters = uint64_t[1];   // A: one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];   // E: one 32-bit word (e), the sp-meta operand of wgmma.mma_async.sp
+  using BRegisters = uint64_t[1];   // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[56];  // accumulators d00..d55, carried in 32-bit registers
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,       // -> %56
+      uint64_t const& desc_b,       // -> %57
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d55 -> %0..%55, updated in place
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t const& e,            // -> %58
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // -> %60, only compared against 0 below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm below is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): helper defined outside this view; presumably a logging hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"             // predicate register local to this asm scope
+      "setp.ne.b32 p, %60, 0;\n"    // p = (scale_D != 0); per the PTX ISA, scale-d == false computes D = A*B without adding the old D
+      "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.s8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"  // end of the accumulator register list
+      " %56,"                       // desc_a
+      " %57,"                       // desc_b
+      " %58, %59,"                  // e (sp-meta), spsel (sp-sel)
+      " p;\n"                       // scale-d predicate; this form takes no scaleA / scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%55, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "l"(desc_a),               // %56
+         "l"(desc_b),               // %57
+         "r"(e), "n"(int32_t(spsel)),  // %58, %59
+         "r"(int32_t(scale_D)));    // %60
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x64 TN S32+=S8*S8, saturating variant (emits the .satfinite form of wgmma.mma_async.sp)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as immediate ("n") operand %59
+>
+struct GMMA_64x112x64_S32S8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;          // no separate D list: fma updates the d* arguments in place ("+r")
+  using ARegisters = uint64_t[1];   // one 64-bit value, matching fma's desc_a argument
+  using ERegisters = uint32_t[1];   // one 32-bit value, matching fma's e argument
+  using BRegisters = uint64_t[1];   // one 64-bit value, matching fma's desc_b argument
+  using CRegisters = uint32_t[56];  // 56 32-bit accumulators, matching fma's d00..d55 arguments
+
+  // Issues one wgmma.mma_async.sp m64n112k64 s32.s8.s8.satfinite instruction; d00..d55 are read and written.
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata register; confirm against callers
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to the predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %60, 0;\n"  // p = (scale_D != 0); %60 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.s8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      " %56,"       // desc_a
+      " %57,"       // desc_b
+      " %58, %59,"  // e, spsel
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted, only the error hook below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x64 TN S32+=S8*S8, plain variant (wgmma.mma_async.sp without the .satfinite qualifier)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as immediate ("n") operand %75
+>
+struct GMMA_64x144x64_S32S8S8_SS_TN
+{
+  using DRegisters = void;          // no separate D list: fma updates the d* arguments in place ("+r")
+  using ARegisters = uint64_t[1];   // one 64-bit value, matching fma's desc_a argument
+  using ERegisters = uint32_t[1];   // one 32-bit value, matching fma's e argument
+  using BRegisters = uint64_t[1];   // one 64-bit value, matching fma's desc_b argument
+  using CRegisters = uint32_t[72];  // 72 32-bit accumulators, matching fma's d00..d71 arguments
+
+  // Issues one wgmma.mma_async.sp m64n144k64 s32.s8.s8 instruction; d00..d71 are read and written.
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata register; confirm against callers
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to the predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %76, 0;\n"  // p = (scale_D != 0); %76 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.s8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      " %72,"       // desc_a
+      " %73,"       // desc_b
+      " %74, %75,"  // e, spsel
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted, only the error hook below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x64 TN S32+=S8*S8, saturating variant (emits the .satfinite form of wgmma.mma_async.sp)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as immediate ("n") operand %75
+>
+struct GMMA_64x144x64_S32S8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;          // no separate D list: fma updates the d* arguments in place ("+r")
+  using ARegisters = uint64_t[1];   // one 64-bit value, matching fma's desc_a argument
+  using ERegisters = uint32_t[1];   // one 32-bit value, matching fma's e argument
+  using BRegisters = uint64_t[1];   // one 64-bit value, matching fma's desc_b argument
+  using CRegisters = uint32_t[72];  // 72 32-bit accumulators, matching fma's d00..d71 arguments
+
+  // Issues one wgmma.mma_async.sp m64n144k64 s32.s8.s8.satfinite instruction; d00..d71 are read and written.
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata register; confirm against callers
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to the predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %76, 0;\n"  // p = (scale_D != 0); %76 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.s8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      " %72,"       // desc_a
+      " %73,"       // desc_b
+      " %74, %75,"  // e, spsel
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted, only the error hook below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x64 TN S32+=S8*S8, plain variant (wgmma.mma_async.sp without the .satfinite qualifier)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as immediate ("n") operand %83
+>
+struct GMMA_64x160x64_S32S8S8_SS_TN
+{
+  using DRegisters = void;          // no separate D list: fma updates the d* arguments in place ("+r")
+  using ARegisters = uint64_t[1];   // one 64-bit value, matching fma's desc_a argument
+  using ERegisters = uint32_t[1];   // one 32-bit value, matching fma's e argument
+  using BRegisters = uint64_t[1];   // one 64-bit value, matching fma's desc_b argument
+  using CRegisters = uint32_t[80];  // 80 32-bit accumulators, matching fma's d00..d79 arguments
+
+  // Issues one wgmma.mma_async.sp m64n160k64 s32.s8.s8 instruction; d00..d79 are read and written.
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata register; confirm against callers
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to the predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %84, 0;\n"  // p = (scale_D != 0); %84 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.s8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      " %80,"       // desc_a
+      " %81,"       // desc_b
+      " %82, %83,"  // e, spsel
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted, only the error hook below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x64 TN S32+=S8*S8, saturating variant (emits the .satfinite form of wgmma.mma_async.sp)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as immediate ("n") operand %83
+>
+struct GMMA_64x160x64_S32S8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;          // no separate D list: fma updates the d* arguments in place ("+r")
+  using ARegisters = uint64_t[1];   // one 64-bit value, matching fma's desc_a argument
+  using ERegisters = uint32_t[1];   // one 32-bit value, matching fma's e argument
+  using BRegisters = uint64_t[1];   // one 64-bit value, matching fma's desc_b argument
+  using CRegisters = uint32_t[80];  // 80 32-bit accumulators, matching fma's d00..d79 arguments
+
+  // Issues one wgmma.mma_async.sp m64n160k64 s32.s8.s8.satfinite instruction; d00..d79 are read and written.
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata register; confirm against callers
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to the predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %84, 0;\n"  // p = (scale_D != 0); %84 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.s8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      " %80,"       // desc_a
+      " %81,"       // desc_b
+      " %82, %83,"  // e, spsel
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted, only the error hook below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x64 TN S32+=S8*S8, plain variant (wgmma.mma_async.sp without the .satfinite qualifier)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as immediate ("n") operand %91
+>
+struct GMMA_64x176x64_S32S8S8_SS_TN
+{
+  using DRegisters = void;          // no separate D list: fma updates the d* arguments in place ("+r")
+  using ARegisters = uint64_t[1];   // one 64-bit value, matching fma's desc_a argument
+  using ERegisters = uint32_t[1];   // one 32-bit value, matching fma's e argument
+  using BRegisters = uint64_t[1];   // one 64-bit value, matching fma's desc_b argument
+  using CRegisters = uint32_t[88];  // 88 32-bit accumulators, matching fma's d00..d87 arguments
+
+  // Issues one wgmma.mma_async.sp m64n176k64 s32.s8.s8 instruction; d00..d87 are read and written.
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata register; confirm against callers
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to the predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %92, 0;\n"  // p = (scale_D != 0); %92 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.s8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      " %88,"       // desc_a
+      " %89,"       // desc_b
+      " %90, %91,"  // e, spsel
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted, only the error hook below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x64 TN S32+=S8*S8, saturating variant (emits the .satfinite form of wgmma.mma_async.sp)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as immediate ("n") operand %91
+>
+struct GMMA_64x176x64_S32S8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;          // no separate D list: fma updates the d* arguments in place ("+r")
+  using ARegisters = uint64_t[1];   // one 64-bit value, matching fma's desc_a argument
+  using ERegisters = uint32_t[1];   // one 32-bit value, matching fma's e argument
+  using BRegisters = uint64_t[1];   // one 64-bit value, matching fma's desc_b argument
+  using CRegisters = uint32_t[88];  // 88 32-bit accumulators, matching fma's d00..d87 arguments
+
+  // Issues one wgmma.mma_async.sp m64n176k64 s32.s8.s8.satfinite instruction; d00..d87 are read and written.
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata register; confirm against callers
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to the predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %92, 0;\n"  // p = (scale_D != 0); %92 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.s8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      " %88,"       // desc_a
+      " %89,"       // desc_b
+      " %90, %91,"  // e, spsel
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted, only the error hook below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x64 TN S32+=S8*S8 -- wraps PTX wgmma.mma_async.sp m64n208k64 (.s32.s8.s8); A and B are passed as 64-bit descriptors.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector, passed to the asm as an immediate ("n") operand
+>
+struct GMMA_64x208x64_S32S8S8_SS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably means D reuses the C registers -- confirm against the MMA traits
+  using ARegisters = uint64_t[1];   // one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];   // one 32-bit word (e); presumably the sparsity metadata -- confirm against PTX wgmma.sp
+  using BRegisters = uint64_t[1];   // one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[104]; // 104 32-bit accumulators (d000..d103), read and written by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared with 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the source line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %108, 0;\n"  // p = (scale_D != 0); %108 is scale_D, the last operand
+      "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.s8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      " %104,"  // desc_a
+      " %105,"  // desc_b
+      " %106, %107,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // operands %0..%103: accumulators, read-write ("+r")
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
+      :  "l"(desc_a),  // operand %104
+         "l"(desc_b),  // operand %105
+         "r"(e), "n"(int32_t(spsel)),  // operands %106 and %107
+         "r"(int32_t(scale_D)));  // operand %108
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x64 TN S32+=S8*S8, saturating -- wraps PTX wgmma.mma_async.sp m64n208k64 (.s32.s8.s8.satfinite); A and B are passed as 64-bit descriptors.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector, passed to the asm as an immediate ("n") operand
+>
+struct GMMA_64x208x64_S32S8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;          // NOTE(review): presumably means D reuses the C registers -- confirm against the MMA traits
+  using ARegisters = uint64_t[1];   // one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];   // one 32-bit word (e); presumably the sparsity metadata -- confirm against PTX wgmma.sp
+  using BRegisters = uint64_t[1];   // one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[104]; // 104 32-bit accumulators (d000..d103), read and written by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared with 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the source line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %108, 0;\n"  // p = (scale_D != 0); %108 is scale_D, the last operand
+      "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.s8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      " %104,"  // desc_a
+      " %105,"  // desc_b
+      " %106, %107,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // operands %0..%103: accumulators, read-write ("+r")
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
+      :  "l"(desc_a),  // operand %104
+         "l"(desc_b),  // operand %105
+         "r"(e), "n"(int32_t(spsel)),  // operands %106 and %107
+         "r"(int32_t(scale_D)));  // operand %108
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x64 TN S32+=S8*S8 -- wraps PTX wgmma.mma_async.sp m64n224k64 (.s32.s8.s8); A and B are passed as 64-bit descriptors.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector, passed to the asm as an immediate ("n") operand
+>
+struct GMMA_64x224x64_S32S8S8_SS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably means D reuses the C registers -- confirm against the MMA traits
+  using ARegisters = uint64_t[1];   // one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];   // one 32-bit word (e); presumably the sparsity metadata -- confirm against PTX wgmma.sp
+  using BRegisters = uint64_t[1];   // one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[112]; // 112 32-bit accumulators (d000..d111), read and written by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared with 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the source line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %116, 0;\n"  // p = (scale_D != 0); %116 is scale_D, the last operand
+      "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.s8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      " %112,"  // desc_a
+      " %113,"  // desc_b
+      " %114, %115,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // operands %0..%111: accumulators, read-write ("+r")
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
+      :  "l"(desc_a),  // operand %112
+         "l"(desc_b),  // operand %113
+         "r"(e), "n"(int32_t(spsel)),  // operands %114 and %115
+         "r"(int32_t(scale_D)));  // operand %116
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x64 TN S32+=S8*S8, saturating -- wraps PTX wgmma.mma_async.sp m64n224k64 (.s32.s8.s8.satfinite); A and B are passed as 64-bit descriptors.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector, passed to the asm as an immediate ("n") operand
+>
+struct GMMA_64x224x64_S32S8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;          // NOTE(review): presumably means D reuses the C registers -- confirm against the MMA traits
+  using ARegisters = uint64_t[1];   // one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];   // one 32-bit word (e); presumably the sparsity metadata -- confirm against PTX wgmma.sp
+  using BRegisters = uint64_t[1];   // one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[112]; // 112 32-bit accumulators (d000..d111), read and written by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared with 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the source line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %116, 0;\n"  // p = (scale_D != 0); %116 is scale_D, the last operand
+      "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.s8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      " %112,"  // desc_a
+      " %113,"  // desc_b
+      " %114, %115,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // operands %0..%111: accumulators, read-write ("+r")
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
+      :  "l"(desc_a),  // operand %112
+         "l"(desc_b),  // operand %113
+         "r"(e), "n"(int32_t(spsel)),  // operands %114 and %115
+         "r"(int32_t(scale_D)));  // operand %116
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x64 TN S32+=S8*S8 -- wraps PTX wgmma.mma_async.sp m64n240k64 (.s32.s8.s8); A and B are passed as 64-bit descriptors.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector, passed to the asm as an immediate ("n") operand
+>
+struct GMMA_64x240x64_S32S8S8_SS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably means D reuses the C registers -- confirm against the MMA traits
+  using ARegisters = uint64_t[1];   // one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];   // one 32-bit word (e); presumably the sparsity metadata -- confirm against PTX wgmma.sp
+  using BRegisters = uint64_t[1];   // one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[120]; // 120 32-bit accumulators (d000..d119), read and written by fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared with 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the source line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %124, 0;\n"  // p = (scale_D != 0); %124 is scale_D, the last operand
+      "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.s8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      " %120,"  // desc_a
+      " %121,"  // desc_b
+      " %122, %123,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // operands %0..%119: accumulators, read-write ("+r")
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
+      :  "l"(desc_a),  // operand %120
+         "l"(desc_b),  // operand %121
+         "r"(e), "n"(int32_t(spsel)),  // operands %122 and %123
+         "r"(int32_t(scale_D)));  // operand %124
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x64 TN S32+=S8*S8 (SS: A and B given as 64-bit descriptors; .satfinite variant of the wgmma instruction)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x240x64_S32S8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[120];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata register for wgmma .sp - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %124, 0;\n"  // p = (scale_D != 0); scale_D is the last input operand, %124
+      "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.s8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      " %120,"
+      " %121,"
+      " %122, %123,"
+      " p;\n"
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // e is passed in a register; spsel as a compile-time immediate ("n")
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x24x64 TN S32+=S8*S8 (RS: A in four 32-bit registers, B given as a 64-bit descriptor)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x24x64_S32S8S8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[12];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata register for wgmma .sp - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %19, 0;\n"  // p = (scale_D != 0); scale_D is the last input operand, %19
+      "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.s8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      "{%12, %13, %14, %15},"
+      " %16,"
+      " %17, %18,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // e is passed in a register; spsel as a compile-time immediate ("n")
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x24x64 TN S32+=S8*S8 (RS: A in four 32-bit registers, B given as a 64-bit descriptor; .satfinite variant of the wgmma instruction)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x24x64_S32S8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[12];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata register for wgmma .sp - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %19, 0;\n"  // p = (scale_D != 0); scale_D is the last input operand, %19
+      "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.s8.s8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      "{%12, %13, %14, %15},"
+      " %16,"
+      " %17, %18,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // e is passed in a register; spsel as a compile-time immediate ("n")
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x48x64 TN S32+=S8*S8 (RS: A in four 32-bit registers, B given as a 64-bit descriptor)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x48x64_S32S8S8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[24];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata register for wgmma .sp - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %31, 0;\n"  // p = (scale_D != 0); scale_D is the last input operand, %31
+      "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.s8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      "{%24, %25, %26, %27},"
+      " %28,"
+      " %29, %30,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // e is passed in a register; spsel as a compile-time immediate ("n")
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x48x64 TN S32+=S8*S8 (RS: A in four 32-bit registers, B given as a 64-bit descriptor; .satfinite variant of the wgmma instruction)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x48x64_S32S8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[24];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata register for wgmma .sp - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %31, 0;\n"  // p = (scale_D != 0); scale_D is the last input operand, %31
+      "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.s8.s8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      "{%24, %25, %26, %27},"
+      " %28,"
+      " %29, %30,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // e is passed in a register; spsel as a compile-time immediate ("n")
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x64 TN S32+=S8*S8 (RS: A in four 32-bit registers, B given as a 64-bit descriptor)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x80x64_S32S8S8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[40];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata register for wgmma .sp - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %47, 0;\n"  // p = (scale_D != 0); scale_D is the last input operand, %47
+      "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.s8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      "{%40, %41, %42, %43},"
+      " %44,"
+      " %45, %46,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // e is passed in a register; spsel as a compile-time immediate ("n")
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x64 TN S32+=S8*S8 (RS: A in four 32-bit registers, B given as a 64-bit descriptor; .satfinite variant of the wgmma instruction)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x80x64_S32S8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[40];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata register for wgmma .sp - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %47, 0;\n"  // p = (scale_D != 0); scale_D is the last input operand, %47
+      "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.s8.s8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      "{%40, %41, %42, %43},"
+      " %44,"
+      " %45, %46,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // e is passed in a register; spsel as a compile-time immediate ("n")
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x64 TN S32+=S8*S8 (RS: A in four 32-bit registers, B given as a 64-bit descriptor)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x112x64_S32S8S8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[56];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata register for wgmma .sp - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %63, 0;\n"  // p = (scale_D != 0); scale_D is the last input operand, %63
+      "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.s8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      "{%56,  %57,  %58,  %59},"
+      " %60,"
+      " %61, %62,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // e is passed in a register; spsel as a compile-time immediate ("n")
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x64 TN S32+=S8*S8 (RS: A in four 32-bit registers, B given as a 64-bit descriptor; .satfinite variant of the wgmma instruction)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x112x64_S32S8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[56];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata register for wgmma .sp - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %63, 0;\n"  // p = (scale_D != 0); scale_D is the last input operand, %63
+      "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.s8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      "{%56,  %57,  %58,  %59},"
+      " %60,"
+      " %61, %62,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // e is passed in a register; spsel as a compile-time immediate ("n")
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x64 TN S32+=S8*S8 (RS: A in four 32-bit registers, B given as a 64-bit descriptor)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x144x64_S32S8S8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[72];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata register for wgmma .sp - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %79, 0;\n"  // p = (scale_D != 0); scale_D is the last input operand, %79
+      "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.s8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      "{%72,  %73,  %74,  %75},"
+      " %76,"
+      " %77, %78,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // e is passed in a register; spsel as a compile-time immediate ("n")
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x64 TN S32+=S8*S8, saturating variant: wraps the .satfinite form of wgmma.mma_async.sp m64n144k64 (.s32.s8.s8)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector; reaches the asm as an immediate ("n") operand
+>
+struct GMMA_64x144x64_S32S8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];   // matches the four a00..a03 parameters of fma()
+  using ERegisters = uint32_t[1];   // matches the single `e` parameter of fma()
+  using BRegisters = uint64_t[1];   // matches the single 64-bit desc_b parameter of fma()
+  using CRegisters = uint32_t[72];  // matches the 72 accumulator parameters d00..d71 of fma()
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction; d00..d71 are read and written in place
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand: four 32-bit register inputs
+      uint64_t const& desc_b,  // B operand: a single 64-bit value ("l" constraint below)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d71: in/out accumulators
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t const& e,  // read-only 32-bit input; presumably the sparsity metadata word -- TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to predicate p below (non-zero => true)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the source line and desc_b to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate, declared inside the enclosing { } block
+      "setp.ne.b32 p, %79, 0;\n"  // %79 is int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.s8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%71: accumulators d00..d71
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      "{%72,  %73,  %74,  %75},"  // a00..a03
+      " %76,"  // desc_b
+      " %77, %78,"  // e, then the spsel immediate
+      " p;\n"  // predicate derived from scale_D; presumably selects accumulate vs. overwrite -- TODO confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs: "+r" marks each accumulator as a read-write register
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs: numbered %72 onward, after the 72 accumulators
+         "l"(desc_b),  // %76
+         "r"(e), "n"(int32_t(spsel)),  // %77, %78
+         "r"(int32_t(scale_D)));  // %79, consumed by the setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x64 TN S32+=S8*S8: wraps wgmma.mma_async.sp m64n160k64 (.s32.s8.s8, without .satfinite)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector; reaches the asm as an immediate ("n") operand
+>
+struct GMMA_64x160x64_S32S8S8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];   // matches the four a00..a03 parameters of fma()
+  using ERegisters = uint32_t[1];   // matches the single `e` parameter of fma()
+  using BRegisters = uint64_t[1];   // matches the single 64-bit desc_b parameter of fma()
+  using CRegisters = uint32_t[80];  // matches the 80 accumulator parameters d00..d79 of fma()
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction; d00..d79 are read and written in place
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand: four 32-bit register inputs
+      uint64_t const& desc_b,  // B operand: a single 64-bit value ("l" constraint below)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d79: in/out accumulators
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t const& e,  // read-only 32-bit input; presumably the sparsity metadata word -- TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to predicate p below (non-zero => true)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the source line and desc_b to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate, declared inside the enclosing { } block
+      "setp.ne.b32 p, %87, 0;\n"  // %87 is int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.s8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%79: accumulators d00..d79
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      "{%80,  %81,  %82,  %83},"  // a00..a03
+      " %84,"  // desc_b
+      " %85, %86,"  // e, then the spsel immediate
+      " p;\n"  // predicate derived from scale_D; presumably selects accumulate vs. overwrite -- TODO confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs: "+r" marks each accumulator as a read-write register
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs: numbered %80 onward, after the 80 accumulators
+         "l"(desc_b),  // %84
+         "r"(e), "n"(int32_t(spsel)),  // %85, %86
+         "r"(int32_t(scale_D)));  // %87, consumed by the setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x64 TN S32+=S8*S8, saturating variant: wraps the .satfinite form of wgmma.mma_async.sp m64n160k64 (.s32.s8.s8)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector; reaches the asm as an immediate ("n") operand
+>
+struct GMMA_64x160x64_S32S8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];   // matches the four a00..a03 parameters of fma()
+  using ERegisters = uint32_t[1];   // matches the single `e` parameter of fma()
+  using BRegisters = uint64_t[1];   // matches the single 64-bit desc_b parameter of fma()
+  using CRegisters = uint32_t[80];  // matches the 80 accumulator parameters d00..d79 of fma()
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction; d00..d79 are read and written in place
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand: four 32-bit register inputs
+      uint64_t const& desc_b,  // B operand: a single 64-bit value ("l" constraint below)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d79: in/out accumulators
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t const& e,  // read-only 32-bit input; presumably the sparsity metadata word -- TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to predicate p below (non-zero => true)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the source line and desc_b to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate, declared inside the enclosing { } block
+      "setp.ne.b32 p, %87, 0;\n"  // %87 is int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.s8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%79: accumulators d00..d79
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      "{%80,  %81,  %82,  %83},"  // a00..a03
+      " %84,"  // desc_b
+      " %85, %86,"  // e, then the spsel immediate
+      " p;\n"  // predicate derived from scale_D; presumably selects accumulate vs. overwrite -- TODO confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs: "+r" marks each accumulator as a read-write register
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs: numbered %80 onward, after the 80 accumulators
+         "l"(desc_b),  // %84
+         "r"(e), "n"(int32_t(spsel)),  // %85, %86
+         "r"(int32_t(scale_D)));  // %87, consumed by the setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x64 TN S32+=S8*S8: wraps wgmma.mma_async.sp m64n176k64 (.s32.s8.s8, without .satfinite)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector; reaches the asm as an immediate ("n") operand
+>
+struct GMMA_64x176x64_S32S8S8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];   // matches the four a00..a03 parameters of fma()
+  using ERegisters = uint32_t[1];   // matches the single `e` parameter of fma()
+  using BRegisters = uint64_t[1];   // matches the single 64-bit desc_b parameter of fma()
+  using CRegisters = uint32_t[88];  // matches the 88 accumulator parameters d00..d87 of fma()
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction; d00..d87 are read and written in place
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand: four 32-bit register inputs
+      uint64_t const& desc_b,  // B operand: a single 64-bit value ("l" constraint below)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d87: in/out accumulators
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t const& e,  // read-only 32-bit input; presumably the sparsity metadata word -- TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to predicate p below (non-zero => true)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the source line and desc_b to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate, declared inside the enclosing { } block
+      "setp.ne.b32 p, %95, 0;\n"  // %95 is int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.s8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%87: accumulators d00..d87
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      "{%88,  %89,  %90,  %91},"  // a00..a03
+      " %92,"  // desc_b
+      " %93, %94,"  // e, then the spsel immediate
+      " p;\n"  // predicate derived from scale_D; presumably selects accumulate vs. overwrite -- TODO confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs: "+r" marks each accumulator as a read-write register
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs: numbered %88 onward, after the 88 accumulators
+         "l"(desc_b),  // %92
+         "r"(e), "n"(int32_t(spsel)),  // %93, %94
+         "r"(int32_t(scale_D)));  // %95, consumed by the setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x64 TN S32+=S8*S8, saturating variant: wraps the .satfinite form of wgmma.mma_async.sp m64n176k64 (.s32.s8.s8)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector; reaches the asm as an immediate ("n") operand
+>
+struct GMMA_64x176x64_S32S8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];   // matches the four a00..a03 parameters of fma()
+  using ERegisters = uint32_t[1];   // matches the single `e` parameter of fma()
+  using BRegisters = uint64_t[1];   // matches the single 64-bit desc_b parameter of fma()
+  using CRegisters = uint32_t[88];  // matches the 88 accumulator parameters d00..d87 of fma()
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction; d00..d87 are read and written in place
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand: four 32-bit register inputs
+      uint64_t const& desc_b,  // B operand: a single 64-bit value ("l" constraint below)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d87: in/out accumulators
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t const& e,  // read-only 32-bit input; presumably the sparsity metadata word -- TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to predicate p below (non-zero => true)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the source line and desc_b to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate, declared inside the enclosing { } block
+      "setp.ne.b32 p, %95, 0;\n"  // %95 is int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.s8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%87: accumulators d00..d87
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      "{%88,  %89,  %90,  %91},"  // a00..a03
+      " %92,"  // desc_b
+      " %93, %94,"  // e, then the spsel immediate
+      " p;\n"  // predicate derived from scale_D; presumably selects accumulate vs. overwrite -- TODO confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs: "+r" marks each accumulator as a read-write register
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs: numbered %88 onward, after the 88 accumulators
+         "l"(desc_b),  // %92
+         "r"(e), "n"(int32_t(spsel)),  // %93, %94
+         "r"(int32_t(scale_D)));  // %95, consumed by the setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x64 TN S32+=S8*S8: wraps wgmma.mma_async.sp m64n208k64 (.s32.s8.s8, without .satfinite)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector; reaches the asm as an immediate ("n") operand
+>
+struct GMMA_64x208x64_S32S8S8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];    // matches the four a000..a003 parameters of fma()
+  using ERegisters = uint32_t[1];    // matches the single `e` parameter of fma()
+  using BRegisters = uint64_t[1];    // matches the single 64-bit desc_b parameter of fma()
+  using CRegisters = uint32_t[104];  // matches the 104 accumulator parameters d000..d103 of fma()
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction; d000..d103 are read and written in place
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A operand: four 32-bit register inputs
+      uint64_t const& desc_b,  // B operand: a single 64-bit value ("l" constraint below)
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,  // d000..d103: in/out accumulators
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t const& e,  // read-only 32-bit input; presumably the sparsity metadata word -- TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to predicate p below (non-zero => true)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the source line and desc_b to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate, declared inside the enclosing { } block
+      "setp.ne.b32 p, %111, 0;\n"  // %111 is int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.s8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%103: accumulators d000..d103
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      "{%104, %105, %106, %107},"  // a000..a003
+      " %108,"  // desc_b
+      " %109, %110,"  // e, then the spsel immediate
+      " p;\n"  // predicate derived from scale_D; presumably selects accumulate vs. overwrite -- TODO confirm
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // outputs: "+r" marks each accumulator as a read-write register
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // inputs: numbered %104 onward, after the 104 accumulators
+         "l"(desc_b),  // %108
+         "r"(e), "n"(int32_t(spsel)),  // %109, %110
+         "r"(int32_t(scale_D)));  // %111, consumed by the setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x64 TN S32+=S8*S8, saturating (.satfinite); A in registers, B as a 64-bit descriptor
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector, emitted below as an immediate ("n") asm operand
+>
+struct GMMA_64x208x64_S32S8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];    // a000..a003
+  using ERegisters = uint32_t[1];    // e
+  using BRegisters = uint64_t[1];    // desc_b
+  using CRegisters = uint32_t[104];  // d000..d103, bound read-write ("+r") in the asm
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma.mma_async.sp instruction; d000..d103 are updated in place
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook, called before the instruction is emitted
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the braces
+      "setp.ne.b32 p, %111, 0;\n"  // p = (scale_D != 0); %111 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.s8.s8.satfinite "  // saturating form (.satfinite suffix)
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%103: accumulators d000..d103
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      "{%104, %105, %106, %107},"  // a000..a003
+      " %108,"  // desc_b
+      " %109, %110,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // read-write operands: order must match %0..%103 above
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // input operands %104..%107
+         "l"(desc_b),  // %108, 64-bit operand
+         "r"(e), "n"(int32_t(spsel)),  // %109, %110 (spsel must be a compile-time constant)
+         "r"(int32_t(scale_D)));  // %111
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A MMA path is compiled out
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x64 TN S32+=S8*S8 (no .satfinite suffix); A in registers, B as a 64-bit descriptor
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector, emitted below as an immediate ("n") asm operand
+>
+struct GMMA_64x224x64_S32S8S8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];    // a000..a003
+  using ERegisters = uint32_t[1];    // e
+  using BRegisters = uint64_t[1];    // desc_b
+  using CRegisters = uint32_t[112];  // d000..d111, bound read-write ("+r") in the asm
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma.mma_async.sp instruction; d000..d111 are updated in place
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook, called before the instruction is emitted
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the braces
+      "setp.ne.b32 p, %119, 0;\n"  // p = (scale_D != 0); %119 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.s8.s8 "  // non-saturating form; see the _SATURATE sibling for .satfinite
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%111: accumulators d000..d111
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      "{%112, %113, %114, %115},"  // a000..a003
+      " %116,"  // desc_b
+      " %117, %118,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // read-write operands: order must match %0..%111 above
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // input operands %112..%115
+         "l"(desc_b),  // %116, 64-bit operand
+         "r"(e), "n"(int32_t(spsel)),  // %117, %118 (spsel must be a compile-time constant)
+         "r"(int32_t(scale_D)));  // %119
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A MMA path is compiled out
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x64 TN S32+=S8*S8, saturating (.satfinite); A in registers, B as a 64-bit descriptor
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector, emitted below as an immediate ("n") asm operand
+>
+struct GMMA_64x224x64_S32S8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];    // a000..a003
+  using ERegisters = uint32_t[1];    // e
+  using BRegisters = uint64_t[1];    // desc_b
+  using CRegisters = uint32_t[112];  // d000..d111, bound read-write ("+r") in the asm
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma.mma_async.sp instruction; d000..d111 are updated in place
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook, called before the instruction is emitted
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the braces
+      "setp.ne.b32 p, %119, 0;\n"  // p = (scale_D != 0); %119 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.s8.s8.satfinite "  // .satfinite is what distinguishes this from GMMA_64x224x64_S32S8S8_RS_TN
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%111: accumulators d000..d111
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      "{%112, %113, %114, %115},"  // a000..a003
+      " %116,"  // desc_b
+      " %117, %118,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // read-write operands: order must match %0..%111 above
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // input operands %112..%115
+         "l"(desc_b),  // %116, 64-bit operand
+         "r"(e), "n"(int32_t(spsel)),  // %117, %118 (spsel must be a compile-time constant)
+         "r"(int32_t(scale_D)));  // %119
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A MMA path is compiled out
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x64 TN S32+=S8*S8 (no .satfinite suffix); A in registers, B as a 64-bit descriptor
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector, emitted below as an immediate ("n") asm operand
+>
+struct GMMA_64x240x64_S32S8S8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];    // a000..a003
+  using ERegisters = uint32_t[1];    // e
+  using BRegisters = uint64_t[1];    // desc_b
+  using CRegisters = uint32_t[120];  // d000..d119, bound read-write ("+r") in the asm
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma.mma_async.sp instruction; d000..d119 are updated in place
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook, called before the instruction is emitted
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the braces
+      "setp.ne.b32 p, %127, 0;\n"  // p = (scale_D != 0); %127 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.s8.s8 "  // non-saturating form; see the _SATURATE sibling for .satfinite
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%119: accumulators d000..d119
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      "{%120, %121, %122, %123},"  // a000..a003
+      " %124,"  // desc_b
+      " %125, %126,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // read-write operands: order must match %0..%119 above
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // input operands %120..%123
+         "l"(desc_b),  // %124, 64-bit operand
+         "r"(e), "n"(int32_t(spsel)),  // %125, %126 (spsel must be a compile-time constant)
+         "r"(int32_t(scale_D)));  // %127
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A MMA path is compiled out
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x64 TN S32+=S8*S8, saturating (.satfinite); A in registers, B as a 64-bit descriptor
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector, emitted below as an immediate ("n") asm operand
+>
+struct GMMA_64x240x64_S32S8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];    // a000..a003
+  using ERegisters = uint32_t[1];    // e
+  using BRegisters = uint64_t[1];    // desc_b
+  using CRegisters = uint32_t[120];  // d000..d119, bound read-write ("+r") in the asm
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma.mma_async.sp instruction; d000..d119 are updated in place
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook, called before the instruction is emitted
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the braces
+      "setp.ne.b32 p, %127, 0;\n"  // p = (scale_D != 0); %127 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.s8.s8.satfinite "  // .satfinite is what distinguishes this from GMMA_64x240x64_S32S8S8_RS_TN
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%119: accumulators d000..d119
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      "{%120, %121, %122, %123},"  // a000..a003
+      " %124,"  // desc_b
+      " %125, %126,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // read-write operands: order must match %0..%119 above
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // input operands %120..%123
+         "l"(desc_b),  // %124, 64-bit operand
+         "r"(e), "n"(int32_t(spsel)),  // %125, %126 (spsel must be a compile-time constant)
+         "r"(int32_t(scale_D)));  // %127
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A MMA path is compiled out
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x24x64 TN S32+=S8*U8 (no .satfinite suffix); A and B both passed as 64-bit descriptors
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector, emitted below as an immediate ("n") asm operand
+>
+struct GMMA_64x24x64_S32S8U8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];   // desc_a
+  using ERegisters = uint32_t[1];   // e
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = uint32_t[12];  // d00..d11, bound read-write ("+r") in the asm
+
+  CUTE_HOST_DEVICE static void  // emits a single wgmma.mma_async.sp instruction; d00..d11 are updated in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, called before the instruction is emitted
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the braces
+      "setp.ne.b32 p, %16, 0;\n"  // p = (scale_D != 0); %16 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.s8.u8 "  // non-saturating form (no .satfinite suffix)
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%11: accumulators d00..d11
+      " %8,  %9,  %10, %11},"
+      " %12,"  // desc_a
+      " %13,"  // desc_b
+      " %14, %15,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write operands: order must match %0..%11 above
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "l"(desc_a),  // %12, 64-bit operand
+         "l"(desc_b),  // %13, 64-bit operand
+         "r"(e), "n"(int32_t(spsel)),  // %14, %15 (spsel must be a compile-time constant)
+         "r"(int32_t(scale_D)));  // %16
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A MMA path is compiled out
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x24x64 TN S32+=S8*U8, .satfinite form; A and B are both given as 64-bit descriptors, d00..d11 are read-modify-write accumulators.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x24x64_S32S8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[12];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %16, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.s8.u8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      " %12,"
+      " %13,"
+      " %14, %15,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "l"(desc_a),  // %12
+         "l"(desc_b),  // %13
+         "r"(e), "n"(int32_t(spsel)),  // %14 = e, %15 = spsel (immediate, fixed at compile time)
+         "r"(int32_t(scale_D)));  // %16 = scale_D, consumed by the setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x48x64 TN S32+=S8*U8; A and B are both given as 64-bit descriptors, d00..d23 are read-modify-write accumulators.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x48x64_S32S8U8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[24];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %28, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.s8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      " %24,"
+      " %25,"
+      " %26, %27,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "l"(desc_a),  // %24
+         "l"(desc_b),  // %25
+         "r"(e), "n"(int32_t(spsel)),  // %26 = e, %27 = spsel (immediate, fixed at compile time)
+         "r"(int32_t(scale_D)));  // %28 = scale_D, consumed by the setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x48x64 TN S32+=S8*U8, .satfinite form; A and B are both given as 64-bit descriptors, d00..d23 are read-modify-write accumulators.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x48x64_S32S8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[24];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %28, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.s8.u8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      " %24,"
+      " %25,"
+      " %26, %27,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "l"(desc_a),  // %24
+         "l"(desc_b),  // %25
+         "r"(e), "n"(int32_t(spsel)),  // %26 = e, %27 = spsel (immediate, fixed at compile time)
+         "r"(int32_t(scale_D)));  // %28 = scale_D, consumed by the setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x64 TN S32+=S8*U8; A and B are both given as 64-bit descriptors, d00..d39 are read-modify-write accumulators.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x80x64_S32S8U8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[40];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %44, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.s8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      " %40,"
+      " %41,"
+      " %42, %43,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "l"(desc_a),  // %40
+         "l"(desc_b),  // %41
+         "r"(e), "n"(int32_t(spsel)),  // %42 = e, %43 = spsel (immediate, fixed at compile time)
+         "r"(int32_t(scale_D)));  // %44 = scale_D, consumed by the setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x64 TN S32+=S8*U8, .satfinite form; A and B are both given as 64-bit descriptors, d00..d39 are read-modify-write accumulators.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x80x64_S32S8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[40];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %44, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.s8.u8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      " %40,"
+      " %41,"
+      " %42, %43,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "l"(desc_a),  // %40
+         "l"(desc_b),  // %41
+         "r"(e), "n"(int32_t(spsel)),  // %42 = e, %43 = spsel (immediate, fixed at compile time)
+         "r"(int32_t(scale_D)));  // %44 = scale_D, consumed by the setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x64 TN S32+=S8*U8; A and B are both given as 64-bit descriptors, d00..d55 are read-modify-write accumulators.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x112x64_S32S8U8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[56];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %60, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      " %56,"
+      " %57,"
+      " %58, %59,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "l"(desc_a),  // %56
+         "l"(desc_b),  // %57
+         "r"(e), "n"(int32_t(spsel)),  // %58 = e, %59 = spsel (immediate, fixed at compile time)
+         "r"(int32_t(scale_D)));  // %60 = scale_D, consumed by the setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x64 TN S32+=S8*U8, .satfinite form; A and B are both given as 64-bit descriptors, d00..d55 are read-modify-write accumulators.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x112x64_S32S8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[56];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %60, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.s8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      " %56,"
+      " %57,"
+      " %58, %59,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "l"(desc_a),  // %56
+         "l"(desc_b),  // %57
+         "r"(e), "n"(int32_t(spsel)),  // %58 = e, %59 = spsel (immediate, fixed at compile time)
+         "r"(int32_t(scale_D)));  // %60 = scale_D, consumed by the setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x64 TN S32+=S8*U8; A and B are both given as 64-bit descriptors, d00..d71 are read-modify-write accumulators.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x144x64_S32S8U8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[72];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %76, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      " %72,"
+      " %73,"
+      " %74, %75,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
+      :  "l"(desc_a),  // %72
+         "l"(desc_b),  // %73
+         "r"(e), "n"(int32_t(spsel)),  // %74 = e, %75 = spsel (immediate, fixed at compile time)
+         "r"(int32_t(scale_D)));  // %76 = scale_D, consumed by the setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x64 TN S32+=S8*U8, .satfinite form; A and B are both given as 64-bit descriptors, d00..d71 are read-modify-write accumulators.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x144x64_S32S8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[72];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %76, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.s8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      " %72,"
+      " %73,"
+      " %74, %75,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
+      :  "l"(desc_a),  // %72
+         "l"(desc_b),  // %73
+         "r"(e), "n"(int32_t(spsel)),  // %74 = e, %75 = spsel (immediate, fixed at compile time)
+         "r"(int32_t(scale_D)));  // %76 = scale_D, consumed by the setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x64 TN S32+=S8*U8; A and B are both given as 64-bit descriptors, d00..d79 are read-modify-write accumulators.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x160x64_S32S8U8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[80];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %84, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      " %80,"
+      " %81,"
+      " %82, %83,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
+      :  "l"(desc_a),  // %80
+         "l"(desc_b),  // %81
+         "r"(e), "n"(int32_t(spsel)),  // %82 = e, %83 = spsel (immediate, fixed at compile time)
+         "r"(int32_t(scale_D)));  // %84 = scale_D, consumed by the setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x64 TN S32+=S8*U8, saturating form (.satfinite); A and B are both passed as 64-bit descriptors
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as an immediate operand
+>
+struct GMMA_64x160x64_S32S8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D registers; fma() updates the C registers in place
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];  // one 32-bit word (e)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[80];  // 80 32-bit accumulator registers (d00..d79)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // issues one wgmma.mma_async.sp instruction; operands are wired positionally below
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t const& e,  // NOTE(review): occupies the sp-meta operand slot; presumably the sparsity metadata word - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %84, 0;\n"  // p = (scale_D != 0); %84 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.s8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      " %80,"  // desc_a
+      " %81,"  // desc_b
+      " %82, %83,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0-%79: accumulators, read-write ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
+      :  "l"(desc_a),  // %80
+         "l"(desc_b),  // %81
+         "r"(e), "n"(int32_t(spsel)),  // %82, %83 ("n": compile-time immediate)
+         "r"(int32_t(scale_D)));  // %84
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x64 TN S32+=S8*U8, form without .satfinite; A and B are both passed as 64-bit descriptors
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as an immediate operand
+>
+struct GMMA_64x176x64_S32S8U8_SS_TN
+{
+  using DRegisters = void;  // no separate D registers; fma() updates the C registers in place
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];  // one 32-bit word (e)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[88];  // 88 32-bit accumulator registers (d00..d87)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // issues one wgmma.mma_async.sp instruction; operands are wired positionally below
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t const& e,  // NOTE(review): occupies the sp-meta operand slot; presumably the sparsity metadata word - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %92, 0;\n"  // p = (scale_D != 0); %92 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      " %88,"  // desc_a
+      " %89,"  // desc_b
+      " %90, %91,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0-%87: accumulators, read-write ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
+      :  "l"(desc_a),  // %88
+         "l"(desc_b),  // %89
+         "r"(e), "n"(int32_t(spsel)),  // %90, %91 ("n": compile-time immediate)
+         "r"(int32_t(scale_D)));  // %92
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x64 TN S32+=S8*U8, saturating form (.satfinite); A and B are both passed as 64-bit descriptors
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as an immediate operand
+>
+struct GMMA_64x176x64_S32S8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D registers; fma() updates the C registers in place
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];  // one 32-bit word (e)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[88];  // 88 32-bit accumulator registers (d00..d87)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // issues one wgmma.mma_async.sp instruction; operands are wired positionally below
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t const& e,  // NOTE(review): occupies the sp-meta operand slot; presumably the sparsity metadata word - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %92, 0;\n"  // p = (scale_D != 0); %92 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.s8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      " %88,"  // desc_a
+      " %89,"  // desc_b
+      " %90, %91,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0-%87: accumulators, read-write ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
+      :  "l"(desc_a),  // %88
+         "l"(desc_b),  // %89
+         "r"(e), "n"(int32_t(spsel)),  // %90, %91 ("n": compile-time immediate)
+         "r"(int32_t(scale_D)));  // %92
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x64 TN S32+=S8*U8, form without .satfinite; A and B are both passed as 64-bit descriptors
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as an immediate operand
+>
+struct GMMA_64x208x64_S32S8U8_SS_TN
+{
+  using DRegisters = void;  // no separate D registers; fma() updates the C registers in place
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];  // one 32-bit word (e)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[104];  // 104 32-bit accumulator registers (d000..d103)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // issues one wgmma.mma_async.sp instruction; operands are wired positionally below
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t const& e,  // NOTE(review): occupies the sp-meta operand slot; presumably the sparsity metadata word - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %108, 0;\n"  // p = (scale_D != 0); %108 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      " %104,"  // desc_a
+      " %105,"  // desc_b
+      " %106, %107,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // %0-%103: accumulators, read-write ("+r")
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
+      :  "l"(desc_a),  // %104
+         "l"(desc_b),  // %105
+         "r"(e), "n"(int32_t(spsel)),  // %106, %107 ("n": compile-time immediate)
+         "r"(int32_t(scale_D)));  // %108
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x64 TN S32+=S8*U8, saturating form (.satfinite); A and B are both passed as 64-bit descriptors
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as an immediate operand
+>
+struct GMMA_64x208x64_S32S8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D registers; fma() updates the C registers in place
+  using ARegisters = uint64_t[1];  // one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];  // one 32-bit word (e)
+  using BRegisters = uint64_t[1];  // one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[104];  // 104 32-bit accumulator registers (d000..d103)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // issues one wgmma.mma_async.sp instruction; operands are wired positionally below
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t const& e,  // NOTE(review): occupies the sp-meta operand slot; presumably the sparsity metadata word - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %108, 0;\n"  // p = (scale_D != 0); %108 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.s8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      " %104,"  // desc_a
+      " %105,"  // desc_b
+      " %106, %107,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // %0-%103: accumulators, read-write ("+r")
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
+      :  "l"(desc_a),  // %104
+         "l"(desc_b),  // %105
+         "r"(e), "n"(int32_t(spsel)),  // %106, %107 ("n": compile-time immediate)
+         "r"(int32_t(scale_D)));  // %108
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x64 TN S32+=S8*U8
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x224x64_S32S8U8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[112];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %116, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      " %112,"
+      " %113,"
+      " %114, %115,"
+      " p;\n"
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x64 TN S32+=S8*U8 -- .satfinite variant; A and B are both passed as 64-bit descriptors; wraps one wgmma.mma_async.sp m64n224k64 instruction
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as a compile-time immediate ("n" operand)
+>
+struct GMMA_64x224x64_S32S8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D list: the accumulators are updated in place through "+r" operands
+  using ARegisters = uint64_t[1];  // A: one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];  // E: one 32-bit word (e); presumably the sparsity metadata (PTX sp-meta) -- TODO confirm
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[112];  // 112 32-bit accumulator registers (d000..d111)
+
+  CUTE_HOST_DEVICE static void  // returns nothing; results are written back into d000..d111
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // without this macro the call lands in CUTE_INVALID_CONTROL_PATH below
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): presumably a debug sync-log hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %116, 0;\n"  // %116 is the last input operand, int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.s8.u8.satfinite "  // .satfinite: per the PTX ISA the s32 result saturates on overflow -- confirm
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%111: accumulators d000..d111
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      " %112,"  // desc_a
+      " %113,"  // desc_b
+      " %114, %115,"  // e, then the spsel immediate
+      " p;\n"  // presumably the PTX scale-d predicate (false => old accumulator values are not added in) -- verify against the PTX docs
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
+      :  "l"(desc_a),  // inputs are numbered after the 112 read-write operands, so this is %112
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x64 TN S32+=S8*U8 -- A and B are both passed as 64-bit descriptors; wraps one wgmma.mma_async.sp m64n240k64 instruction
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as a compile-time immediate ("n" operand)
+>
+struct GMMA_64x240x64_S32S8U8_SS_TN
+{
+  using DRegisters = void;  // no separate D list: the accumulators are updated in place through "+r" operands
+  using ARegisters = uint64_t[1];  // A: one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];  // E: one 32-bit word (e); presumably the sparsity metadata (PTX sp-meta) -- TODO confirm
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[120];  // 120 32-bit accumulator registers (d000..d119)
+
+  CUTE_HOST_DEVICE static void  // returns nothing; results are written back into d000..d119
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // without this macro the call lands in CUTE_INVALID_CONTROL_PATH below
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): presumably a debug sync-log hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %124, 0;\n"  // %124 is the last input operand, int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%119: accumulators d000..d119
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      " %120,"  // desc_a
+      " %121,"  // desc_b
+      " %122, %123,"  // e, then the spsel immediate
+      " p;\n"  // presumably the PTX scale-d predicate (false => old accumulator values are not added in) -- verify against the PTX docs
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
+      :  "l"(desc_a),  // inputs are numbered after the 120 read-write operands, so this is %120
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x64 TN S32+=S8*U8 -- .satfinite variant; A and B are both passed as 64-bit descriptors; wraps one wgmma.mma_async.sp m64n240k64 instruction
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as a compile-time immediate ("n" operand)
+>
+struct GMMA_64x240x64_S32S8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D list: the accumulators are updated in place through "+r" operands
+  using ARegisters = uint64_t[1];  // A: one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];  // E: one 32-bit word (e); presumably the sparsity metadata (PTX sp-meta) -- TODO confirm
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[120];  // 120 32-bit accumulator registers (d000..d119)
+
+  CUTE_HOST_DEVICE static void  // returns nothing; results are written back into d000..d119
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // without this macro the call lands in CUTE_INVALID_CONTROL_PATH below
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): presumably a debug sync-log hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %124, 0;\n"  // %124 is the last input operand, int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.s8.u8.satfinite "  // .satfinite: per the PTX ISA the s32 result saturates on overflow -- confirm
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%119: accumulators d000..d119
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      " %120,"  // desc_a
+      " %121,"  // desc_b
+      " %122, %123,"  // e, then the spsel immediate
+      " p;\n"  // presumably the PTX scale-d predicate (false => old accumulator values are not added in) -- verify against the PTX docs
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
+      :  "l"(desc_a),  // inputs are numbered after the 120 read-write operands, so this is %120
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x24x64 TN S32+=S8*U8 -- A is passed in four 32-bit registers, B as a 64-bit descriptor; wraps one wgmma.mma_async.sp m64n24k64 instruction
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as a compile-time immediate ("n" operand)
+>
+struct GMMA_64x24x64_S32S8U8_RS_TN
+{
+  using DRegisters = void;  // no separate D list: the accumulators are updated in place through "+r" operands
+  using ARegisters = uint32_t[4];  // A: four 32-bit registers (a00..a03)
+  using ERegisters = uint32_t[1];  // E: one 32-bit word (e); presumably the sparsity metadata (PTX sp-meta) -- TODO confirm
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[12];  // 12 32-bit accumulator registers (d00..d11)
+
+  CUTE_HOST_DEVICE static void  // returns nothing; results are written back into d00..d11
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // without this macro the call lands in CUTE_INVALID_CONTROL_PATH below
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably a debug sync-log hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %19, 0;\n"  // %19 is the last input operand, int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.s8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%11: accumulators d00..d11
+      " %8,  %9,  %10, %11},"
+      "{%12, %13, %14, %15},"  // a00..a03
+      " %16,"  // desc_b
+      " %17, %18,"  // e, then the spsel immediate
+      " p;\n"  // presumably the PTX scale-d predicate (false => old accumulator values are not added in) -- verify against the PTX docs
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs are numbered after the 12 read-write operands, so a00 is %12
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x24x64 TN S32+=S8*U8 -- .satfinite variant; A is passed in four 32-bit registers, B as a 64-bit descriptor; wraps one wgmma.mma_async.sp m64n24k64 instruction
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as a compile-time immediate ("n" operand)
+>
+struct GMMA_64x24x64_S32S8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D list: the accumulators are updated in place through "+r" operands
+  using ARegisters = uint32_t[4];  // A: four 32-bit registers (a00..a03)
+  using ERegisters = uint32_t[1];  // E: one 32-bit word (e); presumably the sparsity metadata (PTX sp-meta) -- TODO confirm
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[12];  // 12 32-bit accumulator registers (d00..d11)
+
+  CUTE_HOST_DEVICE static void  // returns nothing; results are written back into d00..d11
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // without this macro the call lands in CUTE_INVALID_CONTROL_PATH below
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably a debug sync-log hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %19, 0;\n"  // %19 is the last input operand, int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.s8.u8.satfinite "  // .satfinite: per the PTX ISA the s32 result saturates on overflow -- confirm
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%11: accumulators d00..d11
+      " %8,  %9,  %10, %11},"
+      "{%12, %13, %14, %15},"  // a00..a03
+      " %16,"  // desc_b
+      " %17, %18,"  // e, then the spsel immediate
+      " p;\n"  // presumably the PTX scale-d predicate (false => old accumulator values are not added in) -- verify against the PTX docs
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs are numbered after the 12 read-write operands, so a00 is %12
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x48x64 TN S32+=S8*U8 -- A is passed in four 32-bit registers, B as a 64-bit descriptor; wraps one wgmma.mma_async.sp m64n48k64 instruction
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as a compile-time immediate ("n" operand)
+>
+struct GMMA_64x48x64_S32S8U8_RS_TN
+{
+  using DRegisters = void;  // no separate D list: the accumulators are updated in place through "+r" operands
+  using ARegisters = uint32_t[4];  // A: four 32-bit registers (a00..a03)
+  using ERegisters = uint32_t[1];  // E: one 32-bit word (e); presumably the sparsity metadata (PTX sp-meta) -- TODO confirm
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[24];  // 24 32-bit accumulator registers (d00..d23)
+
+  CUTE_HOST_DEVICE static void  // returns nothing; results are written back into d00..d23
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // without this macro the call lands in CUTE_INVALID_CONTROL_PATH below
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably a debug sync-log hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %31, 0;\n"  // %31 is the last input operand, int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.s8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%23: accumulators d00..d23
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      "{%24, %25, %26, %27},"  // a00..a03
+      " %28,"  // desc_b
+      " %29, %30,"  // e, then the spsel immediate
+      " p;\n"  // presumably the PTX scale-d predicate (false => old accumulator values are not added in) -- verify against the PTX docs
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs are numbered after the 24 read-write operands, so a00 is %24
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x48x64 TN S32+=S8*U8 -- .satfinite variant; A is passed in four 32-bit registers, B as a 64-bit descriptor; wraps one wgmma.mma_async.sp m64n48k64 instruction
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as a compile-time immediate ("n" operand)
+>
+struct GMMA_64x48x64_S32S8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D list: the accumulators are updated in place through "+r" operands
+  using ARegisters = uint32_t[4];  // A: four 32-bit registers (a00..a03)
+  using ERegisters = uint32_t[1];  // E: one 32-bit word (e); presumably the sparsity metadata (PTX sp-meta) -- TODO confirm
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[24];  // 24 32-bit accumulator registers (d00..d23)
+
+  CUTE_HOST_DEVICE static void  // returns nothing; results are written back into d00..d23
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // without this macro the call lands in CUTE_INVALID_CONTROL_PATH below
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably a debug sync-log hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %31, 0;\n"  // %31 is the last input operand, int32_t(scale_D); p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.s8.u8.satfinite "  // .satfinite: per the PTX ISA the s32 result saturates on overflow -- confirm
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%23: accumulators d00..d23
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      "{%24, %25, %26, %27},"  // a00..a03
+      " %28,"  // desc_b
+      " %29, %30,"  // e, then the spsel immediate
+      " p;\n"  // presumably the PTX scale-d predicate (false => old accumulator values are not added in) -- verify against the PTX docs
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs are numbered after the 24 read-write operands, so a00 is %24
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x64 TN S32+=S8*U8: A (a00..a03) and e in 32-bit registers, B as 64-bit desc_b, 40 read-write accumulators d00..d39.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x80x64_S32S8U8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[40];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %47, 0;\n"  // %47 is scale_D (last input operand): p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.s8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      "{%40, %41, %42, %43},"  // a00..a03
+      " %44,"  // desc_b
+      " %45, %46,"  // e, spsel (compile-time immediate via "n")
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x64 TN S32+=S8*U8, .satfinite variant: A (a00..a03) and e in 32-bit registers, B as 64-bit desc_b, 40 read-write accumulators d00..d39.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x80x64_S32S8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[40];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %47, 0;\n"  // %47 is scale_D (last input operand): p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.s8.u8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      "{%40, %41, %42, %43},"  // a00..a03
+      " %44,"  // desc_b
+      " %45, %46,"  // e, spsel (compile-time immediate via "n")
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x64 TN S32+=S8*U8: A (a00..a03) and e in 32-bit registers, B as 64-bit desc_b, 56 read-write accumulators d00..d55.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x112x64_S32S8U8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[56];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %63, 0;\n"  // %63 is scale_D (last input operand): p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      "{%56,  %57,  %58,  %59},"  // a00..a03
+      " %60,"  // desc_b
+      " %61, %62,"  // e, spsel (compile-time immediate via "n")
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x64 TN S32+=S8*U8, .satfinite variant: A (a00..a03) and e in 32-bit registers, B as 64-bit desc_b, 56 read-write accumulators d00..d55.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x112x64_S32S8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[56];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %63, 0;\n"  // %63 is scale_D (last input operand): p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.s8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      "{%56,  %57,  %58,  %59},"  // a00..a03
+      " %60,"  // desc_b
+      " %61, %62,"  // e, spsel (compile-time immediate via "n")
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x64 TN S32+=S8*U8: A (a00..a03) and e in 32-bit registers, B as 64-bit desc_b, 72 read-write accumulators d00..d71.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x144x64_S32S8U8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[72];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %79, 0;\n"  // %79 is scale_D (last input operand): p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      "{%72,  %73,  %74,  %75},"  // a00..a03
+      " %76,"  // desc_b
+      " %77, %78,"  // e, spsel (compile-time immediate via "n")
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x64 TN S32+=S8*U8, .satfinite variant: A (a00..a03) and e in 32-bit registers, B as 64-bit desc_b, 72 read-write accumulators d00..d71.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x144x64_S32S8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[72];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %79, 0;\n"  // %79 is scale_D (last input operand): p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.s8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      "{%72,  %73,  %74,  %75},"  // a00..a03
+      " %76,"  // desc_b
+      " %77, %78,"  // e, spsel (compile-time immediate via "n")
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x64 TN S32+=S8*U8: A (a00..a03) and e in 32-bit registers, B as 64-bit desc_b, 80 read-write accumulators d00..d79.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x160x64_S32S8U8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[80];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %87, 0;\n"  // %87 is scale_D (last input operand): p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      "{%80,  %81,  %82,  %83},"  // a00..a03
+      " %84,"  // desc_b
+      " %85, %86,"  // e, spsel (compile-time immediate via "n")
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x64 TN S32+=S8*U8, .satfinite variant: A (a00..a03) and e in 32-bit registers, B as 64-bit desc_b, 80 read-write accumulators d00..d79.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x160x64_S32S8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[80];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %87, 0;\n"  // %87 is scale_D (last input operand): p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.s8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      "{%80,  %81,  %82,  %83},"  // a00..a03
+      " %84,"  // desc_b
+      " %85, %86,"  // e, spsel (compile-time immediate via "n")
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x64 TN S32+=S8*U8 -- wraps one wgmma.mma_async.sp m64n176k64 instruction; A in registers, B via 64-bit descriptor, 88 accumulators updated in place
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector; emitted as immediate operand %94
+>
+struct GMMA_64x176x64_S32S8U8_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: the C registers are read-write ("+r") in fma()
+  using ARegisters = uint32_t[4];  // a00..a03, passed as 32-bit register inputs
+  using ERegisters = uint32_t[1];  // e; presumably the sparsity metadata word -- confirm against the PTX wgmma .sp operand list
+  using BRegisters = uint64_t[1];  // desc_b, passed as a single 64-bit operand
+  using CRegisters = uint32_t[88];  // d00..d87
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %95, 0;\n"  // p = (int32_t(scale_D) != 0); %95 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      "{%88,  %89,  %90,  %91},"  // a00..a03
+      " %92,"  // desc_b
+      " %93, %94,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x64 TN S32+=S8*U8, .satfinite variant -- wraps one wgmma.mma_async.sp m64n176k64 instruction; A in registers, B via 64-bit descriptor, 88 accumulators updated in place
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector; emitted as immediate operand %94
+>
+struct GMMA_64x176x64_S32S8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D operand: the C registers are read-write ("+r") in fma()
+  using ARegisters = uint32_t[4];  // a00..a03, passed as 32-bit register inputs
+  using ERegisters = uint32_t[1];  // e; presumably the sparsity metadata word -- confirm against the PTX wgmma .sp operand list
+  using BRegisters = uint64_t[1];  // desc_b, passed as a single 64-bit operand
+  using CRegisters = uint32_t[88];  // d00..d87
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %95, 0;\n"  // p = (int32_t(scale_D) != 0); %95 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.s8.u8.satfinite "  // only textual difference from the non-SATURATE struct
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      "{%88,  %89,  %90,  %91},"  // a00..a03
+      " %92,"  // desc_b
+      " %93, %94,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x64 TN S32+=S8*U8 -- wraps one wgmma.mma_async.sp m64n208k64 instruction; A in registers, B via 64-bit descriptor, 104 accumulators updated in place
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector; emitted as immediate operand %110
+>
+struct GMMA_64x208x64_S32S8U8_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: the C registers are read-write ("+r") in fma()
+  using ARegisters = uint32_t[4];  // a000..a003, passed as 32-bit register inputs
+  using ERegisters = uint32_t[1];  // e; presumably the sparsity metadata word -- confirm against the PTX wgmma .sp operand list
+  using BRegisters = uint64_t[1];  // desc_b, passed as a single 64-bit operand
+  using CRegisters = uint32_t[104];  // d000..d103
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %111, 0;\n"  // p = (int32_t(scale_D) != 0); %111 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      "{%104, %105, %106, %107},"  // a000..a003
+      " %108,"  // desc_b
+      " %109, %110,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x64 TN S32+=S8*U8, .satfinite variant -- wraps one wgmma.mma_async.sp m64n208k64 instruction; A in registers, B via 64-bit descriptor, 104 accumulators updated in place
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector; emitted as immediate operand %110
+>
+struct GMMA_64x208x64_S32S8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D operand: the C registers are read-write ("+r") in fma()
+  using ARegisters = uint32_t[4];  // a000..a003, passed as 32-bit register inputs
+  using ERegisters = uint32_t[1];  // e; presumably the sparsity metadata word -- confirm against the PTX wgmma .sp operand list
+  using BRegisters = uint64_t[1];  // desc_b, passed as a single 64-bit operand
+  using CRegisters = uint32_t[104];  // d000..d103
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %111, 0;\n"  // p = (int32_t(scale_D) != 0); %111 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.s8.u8.satfinite "  // only textual difference from the non-SATURATE struct
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      "{%104, %105, %106, %107},"  // a000..a003
+      " %108,"  // desc_b
+      " %109, %110,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x64 TN S32+=S8*U8 -- wraps one wgmma.mma_async.sp m64n224k64 instruction; A in registers, B via 64-bit descriptor, 112 accumulators updated in place
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector; emitted as immediate operand %118
+>
+struct GMMA_64x224x64_S32S8U8_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: the C registers are read-write ("+r") in fma()
+  using ARegisters = uint32_t[4];  // a000..a003, passed as 32-bit register inputs
+  using ERegisters = uint32_t[1];  // e; presumably the sparsity metadata word -- confirm against the PTX wgmma .sp operand list
+  using BRegisters = uint64_t[1];  // desc_b, passed as a single 64-bit operand
+  using CRegisters = uint32_t[112];  // d000..d111
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %119, 0;\n"  // p = (int32_t(scale_D) != 0); %119 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      "{%112, %113, %114, %115},"  // a000..a003
+      " %116,"  // desc_b
+      " %117, %118,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x64 TN S32+=S8*U8
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x224x64_S32S8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[112];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %119, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.s8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      "{%112, %113, %114, %115},"
+      " %116,"
+      " %117, %118,"
+      " p;\n"
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x64 TN S32+=S8*U8 -- wraps one wgmma.mma_async.sp m64n240k64.s32.s8.u8 instruction; A is passed in four 32-bit registers, B as a 64-bit descriptor
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as a compile-time immediate ("n" constraint)
+>
+struct GMMA_64x240x64_S32S8U8_RS_TN
+{
+  using DRegisters = void;           // NOTE(review): presumably void because fma() updates the C registers in place -- confirm against the MMA traits
+  using ARegisters = uint32_t[4];    // matches the a000..a003 parameters of fma()
+  using ERegisters = uint32_t[1];    // matches the e parameter of fma()
+  using BRegisters = uint64_t[1];    // matches the desc_b parameter of fma()
+  using CRegisters = uint32_t[120];  // matches the d000..d119 parameters of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t const& e,  // operand placed right after desc_b; presumably the sparsity metadata word (PTX sp-meta) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form the instruction's predicate operand
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %127, 0;\n"  // %127 is int32_t(scale_D); p is true when it is nonzero
+      "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.s8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%119: accumulators d000..d119
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      "{%120, %121, %122, %123},"  // a000..a003
+      " %124,"  // desc_b
+      " %125, %126,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D above
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // "+r": every accumulator is both read and written
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %120..%123
+         "l"(desc_b),  // %124, 64-bit operand
+         "r"(e), "n"(int32_t(spsel)),  // %125, %126
+         "r"(int32_t(scale_D)));  // %127, consumed only by the setp above
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted, only the invalid-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x64 TN S32+=S8*U8 -- same operands as GMMA_64x240x64_S32S8U8_RS_TN, but the instruction carries the .satfinite qualifier (see the PTX ISA for its overflow behavior)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as a compile-time immediate ("n" constraint)
+>
+struct GMMA_64x240x64_S32S8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;           // NOTE(review): presumably void because fma() updates the C registers in place -- confirm against the MMA traits
+  using ARegisters = uint32_t[4];    // matches the a000..a003 parameters of fma()
+  using ERegisters = uint32_t[1];    // matches the e parameter of fma()
+  using BRegisters = uint64_t[1];    // matches the desc_b parameter of fma()
+  using CRegisters = uint32_t[120];  // matches the d000..d119 parameters of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t const& e,  // operand placed right after desc_b; presumably the sparsity metadata word (PTX sp-meta) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form the instruction's predicate operand
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %127, 0;\n"  // %127 is int32_t(scale_D); p is true when it is nonzero
+      "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.s8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%119: accumulators d000..d119
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      "{%120, %121, %122, %123},"  // a000..a003
+      " %124,"  // desc_b
+      " %125, %126,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D above
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // "+r": every accumulator is both read and written
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %120..%123
+         "l"(desc_b),  // %124, 64-bit operand
+         "r"(e), "n"(int32_t(spsel)),  // %125, %126
+         "r"(int32_t(scale_D)));  // %127, consumed only by the setp above
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted, only the invalid-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x24x64 TN S32+=U8*S8 -- wraps one wgmma.mma_async.sp m64n24k64.s32.u8.s8 instruction; A and B are both passed as 64-bit descriptors
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as a compile-time immediate ("n" constraint)
+>
+struct GMMA_64x24x64_S32U8S8_SS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably void because fma() updates the C registers in place -- confirm against the MMA traits
+  using ARegisters = uint64_t[1];   // matches the desc_a parameter of fma()
+  using ERegisters = uint32_t[1];   // matches the e parameter of fma()
+  using BRegisters = uint64_t[1];   // matches the desc_b parameter of fma()
+  using CRegisters = uint32_t[12];  // matches the d00..d11 parameters of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t const& e,  // operand placed right after desc_b; presumably the sparsity metadata word (PTX sp-meta) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form the instruction's predicate operand
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %16, 0;\n"  // %16 is int32_t(scale_D); p is true when it is nonzero
+      "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.u8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%11: accumulators d00..d11
+      " %8,  %9,  %10, %11},"
+      " %12,"  // desc_a
+      " %13,"  // desc_b
+      " %14, %15,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": every accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "l"(desc_a),  // %12, 64-bit operand
+         "l"(desc_b),  // %13, 64-bit operand
+         "r"(e), "n"(int32_t(spsel)),  // %14, %15
+         "r"(int32_t(scale_D)));  // %16, consumed only by the setp above
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted, only the invalid-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x24x64 TN S32+=U8*S8 -- same operands as GMMA_64x24x64_S32U8S8_SS_TN, but the instruction carries the .satfinite qualifier (see the PTX ISA for its overflow behavior)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as a compile-time immediate ("n" constraint)
+>
+struct GMMA_64x24x64_S32U8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;          // NOTE(review): presumably void because fma() updates the C registers in place -- confirm against the MMA traits
+  using ARegisters = uint64_t[1];   // matches the desc_a parameter of fma()
+  using ERegisters = uint32_t[1];   // matches the e parameter of fma()
+  using BRegisters = uint64_t[1];   // matches the desc_b parameter of fma()
+  using CRegisters = uint32_t[12];  // matches the d00..d11 parameters of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t const& e,  // operand placed right after desc_b; presumably the sparsity metadata word (PTX sp-meta) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form the instruction's predicate operand
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %16, 0;\n"  // %16 is int32_t(scale_D); p is true when it is nonzero
+      "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.u8.s8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%11: accumulators d00..d11
+      " %8,  %9,  %10, %11},"
+      " %12,"  // desc_a
+      " %13,"  // desc_b
+      " %14, %15,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": every accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "l"(desc_a),  // %12, 64-bit operand
+         "l"(desc_b),  // %13, 64-bit operand
+         "r"(e), "n"(int32_t(spsel)),  // %14, %15
+         "r"(int32_t(scale_D)));  // %16, consumed only by the setp above
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted, only the invalid-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x48x64 TN S32+=U8*S8 -- wraps one wgmma.mma_async.sp m64n48k64.s32.u8.s8 instruction; A and B are both passed as 64-bit descriptors
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as a compile-time immediate ("n" constraint)
+>
+struct GMMA_64x48x64_S32U8S8_SS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably void because fma() updates the C registers in place -- confirm against the MMA traits
+  using ARegisters = uint64_t[1];   // matches the desc_a parameter of fma()
+  using ERegisters = uint32_t[1];   // matches the e parameter of fma()
+  using BRegisters = uint64_t[1];   // matches the desc_b parameter of fma()
+  using CRegisters = uint32_t[24];  // matches the d00..d23 parameters of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t const& e,  // operand placed right after desc_b; presumably the sparsity metadata word (PTX sp-meta) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form the instruction's predicate operand
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %28, 0;\n"  // %28 is int32_t(scale_D); p is true when it is nonzero
+      "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.u8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%23: accumulators d00..d23
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      " %24,"  // desc_a
+      " %25,"  // desc_b
+      " %26, %27,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": every accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "l"(desc_a),  // %24, 64-bit operand
+         "l"(desc_b),  // %25, 64-bit operand
+         "r"(e), "n"(int32_t(spsel)),  // %26, %27
+         "r"(int32_t(scale_D)));  // %28, consumed only by the setp above
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted, only the invalid-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x48x64 TN S32+=U8*S8 -- same operands as GMMA_64x48x64_S32U8S8_SS_TN, but the instruction carries the .satfinite qualifier (see the PTX ISA for its overflow behavior)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as a compile-time immediate ("n" constraint)
+>
+struct GMMA_64x48x64_S32U8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;          // NOTE(review): presumably void because fma() updates the C registers in place -- confirm against the MMA traits
+  using ARegisters = uint64_t[1];   // matches the desc_a parameter of fma()
+  using ERegisters = uint32_t[1];   // matches the e parameter of fma()
+  using BRegisters = uint64_t[1];   // matches the desc_b parameter of fma()
+  using CRegisters = uint32_t[24];  // matches the d00..d23 parameters of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t const& e,  // operand placed right after desc_b; presumably the sparsity metadata word (PTX sp-meta) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form the instruction's predicate operand
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %28, 0;\n"  // %28 is int32_t(scale_D); p is true when it is nonzero
+      "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.u8.s8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%23: accumulators d00..d23
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      " %24,"  // desc_a
+      " %25,"  // desc_b
+      " %26, %27,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": every accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "l"(desc_a),  // %24, 64-bit operand
+         "l"(desc_b),  // %25, 64-bit operand
+         "r"(e), "n"(int32_t(spsel)),  // %26, %27
+         "r"(int32_t(scale_D)));  // %28, consumed only by the setp above
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted, only the invalid-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x64 TN S32+=U8*S8 -- wraps one wgmma.mma_async.sp m64n80k64.s32.u8.s8 instruction; A and B are both passed as 64-bit descriptors
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as a compile-time immediate ("n" constraint)
+>
+struct GMMA_64x80x64_S32U8S8_SS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably void because fma() updates the C registers in place -- confirm against the MMA traits
+  using ARegisters = uint64_t[1];   // matches the desc_a parameter of fma()
+  using ERegisters = uint32_t[1];   // matches the e parameter of fma()
+  using BRegisters = uint64_t[1];   // matches the desc_b parameter of fma()
+  using CRegisters = uint32_t[40];  // matches the d00..d39 parameters of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t const& e,  // operand placed right after desc_b; presumably the sparsity metadata word (PTX sp-meta) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form the instruction's predicate operand
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %44, 0;\n"  // %44 is int32_t(scale_D); p is true when it is nonzero
+      "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.u8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%39: accumulators d00..d39
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      " %40,"  // desc_a
+      " %41,"  // desc_b
+      " %42, %43,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": every accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "l"(desc_a),  // %40, 64-bit operand
+         "l"(desc_b),  // %41, 64-bit operand
+         "r"(e), "n"(int32_t(spsel)),  // %42, %43
+         "r"(int32_t(scale_D)));  // %44, consumed only by the setp above
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted, only the invalid-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x64 TN S32+=U8*S8 -- same operands as GMMA_64x80x64_S32U8S8_SS_TN, but the instruction carries the .satfinite qualifier (see the PTX ISA for its overflow behavior)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as a compile-time immediate ("n" constraint)
+>
+struct GMMA_64x80x64_S32U8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;          // NOTE(review): presumably void because fma() updates the C registers in place -- confirm against the MMA traits
+  using ARegisters = uint64_t[1];   // matches the desc_a parameter of fma()
+  using ERegisters = uint32_t[1];   // matches the e parameter of fma()
+  using BRegisters = uint64_t[1];   // matches the desc_b parameter of fma()
+  using CRegisters = uint32_t[40];  // matches the d00..d39 parameters of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t const& e,  // operand placed right after desc_b; presumably the sparsity metadata word (PTX sp-meta) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form the instruction's predicate operand
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %44, 0;\n"  // %44 is int32_t(scale_D); p is true when it is nonzero
+      "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.u8.s8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%39: accumulators d00..d39
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      " %40,"  // desc_a
+      " %41,"  // desc_b
+      " %42, %43,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": every accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "l"(desc_a),  // %40, 64-bit operand
+         "l"(desc_b),  // %41, 64-bit operand
+         "r"(e), "n"(int32_t(spsel)),  // %42, %43
+         "r"(int32_t(scale_D)));  // %44, consumed only by the setp above
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted, only the invalid-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x64 TN S32+=U8*S8 -- wraps one wgmma.mma_async.sp m64n112k64.s32.u8.s8 instruction; A and B are both passed as 64-bit descriptors
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as a compile-time immediate ("n" constraint)
+>
+struct GMMA_64x112x64_S32U8S8_SS_TN
+{
+  using DRegisters = void;          // NOTE(review): presumably void because fma() updates the C registers in place -- confirm against the MMA traits
+  using ARegisters = uint64_t[1];   // matches the desc_a parameter of fma()
+  using ERegisters = uint32_t[1];   // matches the e parameter of fma()
+  using BRegisters = uint64_t[1];   // matches the desc_b parameter of fma()
+  using CRegisters = uint32_t[56];  // matches the d00..d55 parameters of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t const& e,  // operand placed right after desc_b; presumably the sparsity metadata word (PTX sp-meta) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form the instruction's predicate operand
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %60, 0;\n"  // %60 is int32_t(scale_D); p is true when it is nonzero
+      "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%55: accumulators d00..d55
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      " %56,"  // desc_a
+      " %57,"  // desc_b
+      " %58, %59,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": every accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "l"(desc_a),  // %56, 64-bit operand
+         "l"(desc_b),  // %57, 64-bit operand
+         "r"(e), "n"(int32_t(spsel)),  // %58, %59
+         "r"(int32_t(scale_D)));  // %60, consumed only by the setp above
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted, only the invalid-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x64 TN S32+=U8*S8 with .satfinite -- issues wgmma.mma_async.sp m64n112k64 with A and B given as 64-bit descriptors
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x112x64_S32U8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[56];  // same count as the 56 accumulator references d00..d55 taken by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %60, 0;\n"  // %60 = int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.u8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      " %56,"  // desc_a
+      " %57,"  // desc_b
+      " %58, %59,"  // e, then spsel as an immediate ("n" constraint)
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": every accumulator is an in/out operand
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x64 TN S32+=U8*S8 -- issues wgmma.mma_async.sp m64n144k64 with A and B given as 64-bit descriptors
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x144x64_S32U8S8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[72];  // same count as the 72 accumulator references d00..d71 taken by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %76, 0;\n"  // %76 = int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      " %72,"  // desc_a
+      " %73,"  // desc_b
+      " %74, %75,"  // e, then spsel as an immediate ("n" constraint)
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": every accumulator is an in/out operand
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x64 TN S32+=U8*S8 with .satfinite -- issues wgmma.mma_async.sp m64n144k64 with A and B given as 64-bit descriptors
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x144x64_S32U8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[72];  // same count as the 72 accumulator references d00..d71 taken by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %76, 0;\n"  // %76 = int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.u8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      " %72,"  // desc_a
+      " %73,"  // desc_b
+      " %74, %75,"  // e, then spsel as an immediate ("n" constraint)
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": every accumulator is an in/out operand
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x64 TN S32+=U8*S8 -- issues wgmma.mma_async.sp m64n160k64 with A and B given as 64-bit descriptors
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x160x64_S32U8S8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[80];  // same count as the 80 accumulator references d00..d79 taken by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %84, 0;\n"  // %84 = int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      " %80,"  // desc_a
+      " %81,"  // desc_b
+      " %82, %83,"  // e, then spsel as an immediate ("n" constraint)
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": every accumulator is an in/out operand
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x64 TN S32+=U8*S8 with .satfinite -- issues wgmma.mma_async.sp m64n160k64 with A and B given as 64-bit descriptors
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x160x64_S32U8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[80];  // same count as the 80 accumulator references d00..d79 taken by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %84, 0;\n"  // %84 = int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.u8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      " %80,"  // desc_a
+      " %81,"  // desc_b
+      " %82, %83,"  // e, then spsel as an immediate ("n" constraint)
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": every accumulator is an in/out operand
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x64 TN S32+=U8*S8 -- issues wgmma.mma_async.sp m64n176k64 with A and B given as 64-bit descriptors
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x176x64_S32U8S8_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[88];  // same count as the 88 accumulator references d00..d87 taken by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %92, 0;\n"  // %92 = int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      " %88,"  // desc_a
+      " %89,"  // desc_b
+      " %90, %91,"  // e, then spsel as an immediate ("n" constraint)
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": every accumulator is an in/out operand
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x64 TN S32+=U8*S8 with .satfinite -- issues wgmma.mma_async.sp m64n176k64 with A and B given as 64-bit descriptors
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x176x64_S32U8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[88];  // same count as the 88 accumulator references d00..d87 taken by fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %92, 0;\n"  // %92 = int32_t(scale_D); p is set when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.u8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      " %88,"  // desc_a
+      " %89,"  // desc_b
+      " %90, %91,"  // e, then spsel as an immediate ("n" constraint)
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": every accumulator is an in/out operand
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x64 TN S32+=U8*S8 -- wraps a single wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.u8.s8 instruction issued through inline PTX.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as a compile-time immediate ("n" constraint, operand %107)
+>
+struct GMMA_64x208x64_S32U8S8_SS_TN
+{
+  using DRegisters = void;  // no separate D operand list; fma() updates the d* arguments in place ("+r" constraints)
+  using ARegisters = uint64_t[1];  // matches the single 64-bit desc_a argument of fma()
+  using ERegisters = uint32_t[1];  // matches the single 32-bit e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single 64-bit desc_b argument of fma()
+  using CRegisters = uint32_t[104];  // matches the 104 arguments d000..d103 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit A operand (%104); presumably a shared-memory matrix descriptor (cf. "_SS_" and the smem_smem synclog call) -- confirm
+      uint64_t const& desc_b,  // 64-bit B operand (%105); same caveat as desc_a
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,  // d000..d103: 32-bit accumulators, read and written by the instruction (%0..%103)
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t const& e,  // 32-bit input register operand (%106); presumably the sparsity metadata for A -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p, the instruction's final operand
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the source line and both descriptors to the synclog hook (defined elsewhere)
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %108, 0;\n"  // p = (int32_t(scale_D) != 0); %108 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%103: accumulators d000..d103
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      " %104,"  // desc_a
+      " %105,"  // desc_b
+      " %106, %107,"  // e, then the spsel immediate
+      " p;\n"  // predicate derived from scale_D above
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // output list: every accumulator is both read and written
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
+      :  "l"(desc_a),  // input list starts at operand index 104
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no instruction is emitted, only the invalid-path macro below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x64 TN S32+=U8*S8, SATURATE variant -- wraps a single wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.u8.s8.satfinite instruction issued through inline PTX.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as a compile-time immediate ("n" constraint, operand %107)
+>
+struct GMMA_64x208x64_S32U8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D operand list; fma() updates the d* arguments in place ("+r" constraints)
+  using ARegisters = uint64_t[1];  // matches the single 64-bit desc_a argument of fma()
+  using ERegisters = uint32_t[1];  // matches the single 32-bit e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single 64-bit desc_b argument of fma()
+  using CRegisters = uint32_t[104];  // matches the 104 arguments d000..d103 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit A operand (%104); presumably a shared-memory matrix descriptor (cf. "_SS_" and the smem_smem synclog call) -- confirm
+      uint64_t const& desc_b,  // 64-bit B operand (%105); same caveat as desc_a
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,  // d000..d103: 32-bit accumulators, read and written by the instruction (%0..%103)
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t const& e,  // 32-bit input register operand (%106); presumably the sparsity metadata for A -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p, the instruction's final operand
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the source line and both descriptors to the synclog hook (defined elsewhere)
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %108, 0;\n"  // p = (int32_t(scale_D) != 0); %108 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.u8.s8.satfinite "  // the .satfinite suffix is the only difference from the non-SATURATE wrapper's instruction
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%103: accumulators d000..d103
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      " %104,"  // desc_a
+      " %105,"  // desc_b
+      " %106, %107,"  // e, then the spsel immediate
+      " p;\n"  // predicate derived from scale_D above
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // output list: every accumulator is both read and written
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
+      :  "l"(desc_a),  // input list starts at operand index 104
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no instruction is emitted, only the invalid-path macro below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x64 TN S32+=U8*S8 -- wraps a single wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.u8.s8 instruction issued through inline PTX.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as a compile-time immediate ("n" constraint, operand %115)
+>
+struct GMMA_64x224x64_S32U8S8_SS_TN
+{
+  using DRegisters = void;  // no separate D operand list; fma() updates the d* arguments in place ("+r" constraints)
+  using ARegisters = uint64_t[1];  // matches the single 64-bit desc_a argument of fma()
+  using ERegisters = uint32_t[1];  // matches the single 32-bit e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single 64-bit desc_b argument of fma()
+  using CRegisters = uint32_t[112];  // matches the 112 arguments d000..d111 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit A operand (%112); presumably a shared-memory matrix descriptor (cf. "_SS_" and the smem_smem synclog call) -- confirm
+      uint64_t const& desc_b,  // 64-bit B operand (%113); same caveat as desc_a
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,  // d000..d111: 32-bit accumulators, read and written by the instruction (%0..%111)
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t const& e,  // 32-bit input register operand (%114); presumably the sparsity metadata for A -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p, the instruction's final operand
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the source line and both descriptors to the synclog hook (defined elsewhere)
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %116, 0;\n"  // p = (int32_t(scale_D) != 0); %116 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%111: accumulators d000..d111
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      " %112,"  // desc_a
+      " %113,"  // desc_b
+      " %114, %115,"  // e, then the spsel immediate
+      " p;\n"  // predicate derived from scale_D above
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // output list: every accumulator is both read and written
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
+      :  "l"(desc_a),  // input list starts at operand index 112
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no instruction is emitted, only the invalid-path macro below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x64 TN S32+=U8*S8, SATURATE variant -- wraps a single wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.u8.s8.satfinite instruction issued through inline PTX.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as a compile-time immediate ("n" constraint, operand %115)
+>
+struct GMMA_64x224x64_S32U8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D operand list; fma() updates the d* arguments in place ("+r" constraints)
+  using ARegisters = uint64_t[1];  // matches the single 64-bit desc_a argument of fma()
+  using ERegisters = uint32_t[1];  // matches the single 32-bit e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single 64-bit desc_b argument of fma()
+  using CRegisters = uint32_t[112];  // matches the 112 arguments d000..d111 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit A operand (%112); presumably a shared-memory matrix descriptor (cf. "_SS_" and the smem_smem synclog call) -- confirm
+      uint64_t const& desc_b,  // 64-bit B operand (%113); same caveat as desc_a
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,  // d000..d111: 32-bit accumulators, read and written by the instruction (%0..%111)
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t const& e,  // 32-bit input register operand (%114); presumably the sparsity metadata for A -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p, the instruction's final operand
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the source line and both descriptors to the synclog hook (defined elsewhere)
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %116, 0;\n"  // p = (int32_t(scale_D) != 0); %116 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.u8.s8.satfinite "  // the .satfinite suffix is the only difference from the non-SATURATE wrapper's instruction
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%111: accumulators d000..d111
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      " %112,"  // desc_a
+      " %113,"  // desc_b
+      " %114, %115,"  // e, then the spsel immediate
+      " p;\n"  // predicate derived from scale_D above
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // output list: every accumulator is both read and written
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
+      :  "l"(desc_a),  // input list starts at operand index 112
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no instruction is emitted, only the invalid-path macro below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x64 TN S32+=U8*S8 -- wraps a single wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.u8.s8 instruction issued through inline PTX.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as a compile-time immediate ("n" constraint, operand %123)
+>
+struct GMMA_64x240x64_S32U8S8_SS_TN
+{
+  using DRegisters = void;  // no separate D operand list; fma() updates the d* arguments in place ("+r" constraints)
+  using ARegisters = uint64_t[1];  // matches the single 64-bit desc_a argument of fma()
+  using ERegisters = uint32_t[1];  // matches the single 32-bit e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single 64-bit desc_b argument of fma()
+  using CRegisters = uint32_t[120];  // matches the 120 arguments d000..d119 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit A operand (%120); presumably a shared-memory matrix descriptor (cf. "_SS_" and the smem_smem synclog call) -- confirm
+      uint64_t const& desc_b,  // 64-bit B operand (%121); same caveat as desc_a
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,  // d000..d119: 32-bit accumulators, read and written by the instruction (%0..%119)
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t const& e,  // 32-bit input register operand (%122); presumably the sparsity metadata for A -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p, the instruction's final operand
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the source line and both descriptors to the synclog hook (defined elsewhere)
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %124, 0;\n"  // p = (int32_t(scale_D) != 0); %124 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%119: accumulators d000..d119
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      " %120,"  // desc_a
+      " %121,"  // desc_b
+      " %122, %123,"  // e, then the spsel immediate
+      " p;\n"  // predicate derived from scale_D above
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // output list: every accumulator is both read and written
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
+      :  "l"(desc_a),  // input list starts at operand index 120
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined: no instruction is emitted, only the invalid-path macro below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x64 TN S32+=U8*S8 -- SS form (A and B each passed as one 64-bit descriptor operand), .satfinite variant
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted below as an immediate ("n") asm operand
+>
+struct GMMA_64x240x64_S32U8S8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate output list; the d* arguments of fma() are read-write
+  using ARegisters = uint64_t[1];  // matches the single desc_a argument of fma()
+  using ERegisters = uint32_t[1];  // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[120];  // matches d000..d119 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t const& e,  // 32-bit operand paired with the immediate spsel (presumably sparse metadata -- TODO confirm against PTX docs)
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %124, 0;\n"  // p = (scale_D != 0); %124 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.u8.s8.satfinite "  // note the .satfinite qualifier
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      " %120,"  // desc_a
+      " %121,"  // desc_b
+      " %122, %123,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // %0..%119: accumulator operands, read and written in place
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
+      :  "l"(desc_a),  // %120
+         "l"(desc_b),  // %121
+         "r"(e), "n"(int32_t(spsel)),  // %122, %123 ("n" requires a compile-time constant)
+         "r"(int32_t(scale_D)));  // %124, consumed by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x24x64 TN S32+=U8*S8 -- RS form (A passed in four 32-bit registers, B as one 64-bit descriptor operand)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted below as an immediate ("n") asm operand
+>
+struct GMMA_64x24x64_S32U8S8_RS_TN
+{
+  using DRegisters = void;  // no separate output list; the d* arguments of fma() are read-write
+  using ARegisters = uint32_t[4];  // matches a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[12];  // matches d00..d11 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t const& e,  // 32-bit operand paired with the immediate spsel (presumably sparse metadata -- TODO confirm against PTX docs)
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %19, 0;\n"  // p = (scale_D != 0); %19 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.u8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      "{%12, %13, %14, %15},"  // a00..a03
+      " %16,"  // desc_b
+      " %17, %18,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%11: accumulator operands, read and written in place
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %12..%15
+         "l"(desc_b),  // %16
+         "r"(e), "n"(int32_t(spsel)),  // %17, %18 ("n" requires a compile-time constant)
+         "r"(int32_t(scale_D)));  // %19, consumed by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x24x64 TN S32+=U8*S8 -- RS form (A passed in four 32-bit registers, B as one 64-bit descriptor operand), .satfinite variant
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted below as an immediate ("n") asm operand
+>
+struct GMMA_64x24x64_S32U8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate output list; the d* arguments of fma() are read-write
+  using ARegisters = uint32_t[4];  // matches a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[12];  // matches d00..d11 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t const& e,  // 32-bit operand paired with the immediate spsel (presumably sparse metadata -- TODO confirm against PTX docs)
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %19, 0;\n"  // p = (scale_D != 0); %19 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.u8.s8.satfinite "  // note the .satfinite qualifier
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      "{%12, %13, %14, %15},"  // a00..a03
+      " %16,"  // desc_b
+      " %17, %18,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%11: accumulator operands, read and written in place
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %12..%15
+         "l"(desc_b),  // %16
+         "r"(e), "n"(int32_t(spsel)),  // %17, %18 ("n" requires a compile-time constant)
+         "r"(int32_t(scale_D)));  // %19, consumed by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x48x64 TN S32+=U8*S8 -- RS form (A passed in four 32-bit registers, B as one 64-bit descriptor operand)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted below as an immediate ("n") asm operand
+>
+struct GMMA_64x48x64_S32U8S8_RS_TN
+{
+  using DRegisters = void;  // no separate output list; the d* arguments of fma() are read-write
+  using ARegisters = uint32_t[4];  // matches a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[24];  // matches d00..d23 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t const& e,  // 32-bit operand paired with the immediate spsel (presumably sparse metadata -- TODO confirm against PTX docs)
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %31, 0;\n"  // p = (scale_D != 0); %31 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.u8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      "{%24, %25, %26, %27},"  // a00..a03
+      " %28,"  // desc_b
+      " %29, %30,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%23: accumulator operands, read and written in place
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %24..%27
+         "l"(desc_b),  // %28
+         "r"(e), "n"(int32_t(spsel)),  // %29, %30 ("n" requires a compile-time constant)
+         "r"(int32_t(scale_D)));  // %31, consumed by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x48x64 TN S32+=U8*S8 -- RS form (A passed in four 32-bit registers, B as one 64-bit descriptor operand), .satfinite variant
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted below as an immediate ("n") asm operand
+>
+struct GMMA_64x48x64_S32U8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate output list; the d* arguments of fma() are read-write
+  using ARegisters = uint32_t[4];  // matches a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[24];  // matches d00..d23 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t const& e,  // 32-bit operand paired with the immediate spsel (presumably sparse metadata -- TODO confirm against PTX docs)
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %31, 0;\n"  // p = (scale_D != 0); %31 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.u8.s8.satfinite "  // note the .satfinite qualifier
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      "{%24, %25, %26, %27},"  // a00..a03
+      " %28,"  // desc_b
+      " %29, %30,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%23: accumulator operands, read and written in place
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %24..%27
+         "l"(desc_b),  // %28
+         "r"(e), "n"(int32_t(spsel)),  // %29, %30 ("n" requires a compile-time constant)
+         "r"(int32_t(scale_D)));  // %31, consumed by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x64 TN S32+=U8*S8 -- RS form (A passed in four 32-bit registers, B as one 64-bit descriptor operand)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted below as an immediate ("n") asm operand
+>
+struct GMMA_64x80x64_S32U8S8_RS_TN
+{
+  using DRegisters = void;  // no separate output list; the d* arguments of fma() are read-write
+  using ARegisters = uint32_t[4];  // matches a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[40];  // matches d00..d39 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t const& e,  // 32-bit operand paired with the immediate spsel (presumably sparse metadata -- TODO confirm against PTX docs)
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %47, 0;\n"  // p = (scale_D != 0); %47 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.u8.s8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      "{%40, %41, %42, %43},"  // a00..a03
+      " %44,"  // desc_b
+      " %45, %46,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%39: accumulator operands, read and written in place
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %40..%43
+         "l"(desc_b),  // %44
+         "r"(e), "n"(int32_t(spsel)),  // %45, %46 ("n" requires a compile-time constant)
+         "r"(int32_t(scale_D)));  // %47, consumed by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x64 TN S32+=U8*S8 -- RS form (A passed in four 32-bit registers, B as one 64-bit descriptor operand), .satfinite variant
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted below as an immediate ("n") asm operand
+>
+struct GMMA_64x80x64_S32U8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate output list; the d* arguments of fma() are read-write
+  using ARegisters = uint32_t[4];  // matches a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[40];  // matches d00..d39 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t const& e,  // 32-bit operand paired with the immediate spsel (presumably sparse metadata -- TODO confirm against PTX docs)
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %47, 0;\n"  // p = (scale_D != 0); %47 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.u8.s8.satfinite "  // note the .satfinite qualifier
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      "{%40, %41, %42, %43},"  // a00..a03
+      " %44,"  // desc_b
+      " %45, %46,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%39: accumulator operands, read and written in place
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %40..%43
+         "l"(desc_b),  // %44
+         "r"(e), "n"(int32_t(spsel)),  // %45, %46 ("n" requires a compile-time constant)
+         "r"(int32_t(scale_D)));  // %47, consumed by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x64 TN S32+=U8*S8 -- RS form (A passed in four 32-bit registers, B as one 64-bit descriptor operand)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted below as an immediate ("n") asm operand
+>
+struct GMMA_64x112x64_S32U8S8_RS_TN
+{
+  using DRegisters = void;  // no separate output list; the d* arguments of fma() are read-write
+  using ARegisters = uint32_t[4];  // matches a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[56];  // matches d00..d55 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t const& e,  // 32-bit operand paired with the immediate spsel (presumably sparse metadata -- TODO confirm against PTX docs)
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %63, 0;\n"  // p = (scale_D != 0); %63 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      "{%56,  %57,  %58,  %59},"  // a00..a03
+      " %60,"  // desc_b
+      " %61, %62,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%55: accumulator operands, read and written in place
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %56..%59
+         "l"(desc_b),  // %60
+         "r"(e), "n"(int32_t(spsel)),  // %61, %62 ("n" requires a compile-time constant)
+         "r"(int32_t(scale_D)));  // %63, consumed by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x64 TN S32+=U8*S8 -- RS form (A passed in four 32-bit registers, B as one 64-bit descriptor operand), .satfinite variant
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted below as an immediate ("n") asm operand
+>
+struct GMMA_64x112x64_S32U8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate output list; the d* arguments of fma() are read-write
+  using ARegisters = uint32_t[4];  // matches a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[56];  // matches d00..d55 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t const& e,  // 32-bit operand paired with the immediate spsel (presumably sparse metadata -- TODO confirm against PTX docs)
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %63, 0;\n"  // p = (scale_D != 0); %63 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.u8.s8.satfinite "  // note the .satfinite qualifier
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      "{%56,  %57,  %58,  %59},"  // a00..a03
+      " %60,"  // desc_b
+      " %61, %62,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%55: accumulator operands, read and written in place
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %56..%59
+         "l"(desc_b),  // %60
+         "r"(e), "n"(int32_t(spsel)),  // %61, %62 ("n" requires a compile-time constant)
+         "r"(int32_t(scale_D)));  // %63, consumed by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x64 TN S32+=U8*S8
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x144x64_S32U8S8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[72];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %79, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      "{%72,  %73,  %74,  %75},"
+      " %76,"
+      " %77, %78,"
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x64 TN S32+=U8*S8, saturating variant (the PTX instruction below carries .satfinite)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as an immediate ("n") operand
+>
+struct GMMA_64x144x64_S32U8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the d* references in place
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[72];  // d00..d71
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p (scale_D != 0) inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %79, 0;\n"  // p = (scale_D != 0); %79 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.u8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      "{%72,  %73,  %74,  %75},"  // a00..a03
+      " %76,"  // desc_b
+      " %77, %78,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%71: read-write ("+r") accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // operands %72..%75
+         "l"(desc_b),  // operand %76 (64-bit)
+         "r"(e), "n"(int32_t(spsel)),  // operands %77 and %78 (%78 is a compile-time immediate)
+         "r"(int32_t(scale_D)));  // operand %79
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // only compiled when the SM90A path above is disabled
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x64 TN S32+=U8*S8 (A passed in four 32-bit registers, B as a 64-bit descriptor; no .satfinite)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as an immediate ("n") operand
+>
+struct GMMA_64x160x64_S32U8S8_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the d* references in place
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[80];  // d00..d79
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p (scale_D != 0) inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %87, 0;\n"  // p = (scale_D != 0); %87 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      "{%80,  %81,  %82,  %83},"  // a00..a03
+      " %84,"  // desc_b
+      " %85, %86,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%79: read-write ("+r") accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // operands %80..%83
+         "l"(desc_b),  // operand %84 (64-bit)
+         "r"(e), "n"(int32_t(spsel)),  // operands %85 and %86 (%86 is a compile-time immediate)
+         "r"(int32_t(scale_D)));  // operand %87
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // only compiled when the SM90A path above is disabled
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x64 TN S32+=U8*S8, saturating variant (the PTX instruction below carries .satfinite)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as an immediate ("n") operand
+>
+struct GMMA_64x160x64_S32U8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the d* references in place
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[80];  // d00..d79
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p (scale_D != 0) inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %87, 0;\n"  // p = (scale_D != 0); %87 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.u8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      "{%80,  %81,  %82,  %83},"  // a00..a03
+      " %84,"  // desc_b
+      " %85, %86,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%79: read-write ("+r") accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // operands %80..%83
+         "l"(desc_b),  // operand %84 (64-bit)
+         "r"(e), "n"(int32_t(spsel)),  // operands %85 and %86 (%86 is a compile-time immediate)
+         "r"(int32_t(scale_D)));  // operand %87
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // only compiled when the SM90A path above is disabled
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x64 TN S32+=U8*S8 (A passed in four 32-bit registers, B as a 64-bit descriptor; no .satfinite)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as an immediate ("n") operand
+>
+struct GMMA_64x176x64_S32U8S8_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the d* references in place
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[88];  // d00..d87
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p (scale_D != 0) inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %95, 0;\n"  // p = (scale_D != 0); %95 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      "{%88,  %89,  %90,  %91},"  // a00..a03
+      " %92,"  // desc_b
+      " %93, %94,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%87: read-write ("+r") accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // operands %88..%91
+         "l"(desc_b),  // operand %92 (64-bit)
+         "r"(e), "n"(int32_t(spsel)),  // operands %93 and %94 (%94 is a compile-time immediate)
+         "r"(int32_t(scale_D)));  // operand %95
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // only compiled when the SM90A path above is disabled
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x64 TN S32+=U8*S8, saturating variant (the PTX instruction below carries .satfinite)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as an immediate ("n") operand
+>
+struct GMMA_64x176x64_S32U8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the d* references in place
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[88];  // d00..d87
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p (scale_D != 0) inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %95, 0;\n"  // p = (scale_D != 0); %95 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.u8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      "{%88,  %89,  %90,  %91},"  // a00..a03
+      " %92,"  // desc_b
+      " %93, %94,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%87: read-write ("+r") accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // operands %88..%91
+         "l"(desc_b),  // operand %92 (64-bit)
+         "r"(e), "n"(int32_t(spsel)),  // operands %93 and %94 (%94 is a compile-time immediate)
+         "r"(int32_t(scale_D)));  // operand %95
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // only compiled when the SM90A path above is disabled
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x64 TN S32+=U8*S8 (A passed in four 32-bit registers, B as a 64-bit descriptor; no .satfinite)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as an immediate ("n") operand
+>
+struct GMMA_64x208x64_S32U8S8_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the d* references in place
+  using ARegisters = uint32_t[4];  // a000..a003
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata word -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[104];  // d000..d103
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p (scale_D != 0) inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %111, 0;\n"  // p = (scale_D != 0); %111 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      "{%104, %105, %106, %107},"  // a000..a003
+      " %108,"  // desc_b
+      " %109, %110,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // operands %0..%103: read-write ("+r") accumulators
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // operands %104..%107
+         "l"(desc_b),  // operand %108 (64-bit)
+         "r"(e), "n"(int32_t(spsel)),  // operands %109 and %110 (%110 is a compile-time immediate)
+         "r"(int32_t(scale_D)));  // operand %111
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // only compiled when the SM90A path above is disabled
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x64 TN S32+=U8*S8, saturating variant (the emitted instruction carries the .satfinite suffix)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // handed to the asm as an integer immediate (n constraint)
+>
+struct GMMA_64x208x64_S32U8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D operand: fma updates d000..d103 in place
+  using ARegisters = uint32_t[4];  // a000..a003
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[104];  // d000..d103
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction, or hits CUTE_INVALID_CONTROL_PATH when SM90A MMA is not enabled
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,  // 64-bit B operand, also reported to synclog
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,  // d000..d103: read-write accumulator registers
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word - confirm against the PTX wgmma.mma_async.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog call; receives this source line number and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %111, 0;\n"  // p = (operand 111 != 0); operand 111 is int32_t(scale_D), the last input
+      "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.u8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // operands 0..103: d000..d103
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      "{%104, %105, %106, %107},"  // a000..a003
+      " %108,"  // desc_b
+      " %109, %110,"  // e, then the spsel immediate
+      " p;\n"  // final operand: the predicate computed by setp above
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // read-write operands 0..103
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // input operands 104..107
+         "l"(desc_b),  // operand 108
+         "r"(e), "n"(int32_t(spsel)),  // operands 109 and 110
+         "r"(int32_t(scale_D)));  // operand 111, read by setp
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x64 TN S32+=U8*S8, non-saturating variant (the emitted instruction has no .satfinite suffix)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // handed to the asm as an integer immediate (n constraint)
+>
+struct GMMA_64x224x64_S32U8S8_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: fma updates d000..d111 in place
+  using ARegisters = uint32_t[4];  // a000..a003
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[112];  // d000..d111
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction, or hits CUTE_INVALID_CONTROL_PATH when SM90A MMA is not enabled
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,  // 64-bit B operand, also reported to synclog
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,  // d000..d111: read-write accumulator registers
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word - confirm against the PTX wgmma.mma_async.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog call; receives this source line number and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %119, 0;\n"  // p = (operand 119 != 0); operand 119 is int32_t(scale_D), the last input
+      "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // operands 0..111: d000..d111
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      "{%112, %113, %114, %115},"  // a000..a003
+      " %116,"  // desc_b
+      " %117, %118,"  // e, then the spsel immediate
+      " p;\n"  // final operand: the predicate computed by setp above
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // read-write operands 0..111
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // input operands 112..115
+         "l"(desc_b),  // operand 116
+         "r"(e), "n"(int32_t(spsel)),  // operands 117 and 118
+         "r"(int32_t(scale_D)));  // operand 119, read by setp
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x64 TN S32+=U8*S8, saturating variant (the emitted instruction carries the .satfinite suffix)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // handed to the asm as an integer immediate (n constraint)
+>
+struct GMMA_64x224x64_S32U8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D operand: fma updates d000..d111 in place
+  using ARegisters = uint32_t[4];  // a000..a003
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[112];  // d000..d111
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction, or hits CUTE_INVALID_CONTROL_PATH when SM90A MMA is not enabled
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,  // 64-bit B operand, also reported to synclog
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,  // d000..d111: read-write accumulator registers
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word - confirm against the PTX wgmma.mma_async.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog call; receives this source line number and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %119, 0;\n"  // p = (operand 119 != 0); operand 119 is int32_t(scale_D), the last input
+      "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.u8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // operands 0..111: d000..d111
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      "{%112, %113, %114, %115},"  // a000..a003
+      " %116,"  // desc_b
+      " %117, %118,"  // e, then the spsel immediate
+      " p;\n"  // final operand: the predicate computed by setp above
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // read-write operands 0..111
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // input operands 112..115
+         "l"(desc_b),  // operand 116
+         "r"(e), "n"(int32_t(spsel)),  // operands 117 and 118
+         "r"(int32_t(scale_D)));  // operand 119, read by setp
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x64 TN S32+=U8*S8, non-saturating variant (the emitted instruction has no .satfinite suffix)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // handed to the asm as an integer immediate (n constraint)
+>
+struct GMMA_64x240x64_S32U8S8_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: fma updates d000..d119 in place
+  using ARegisters = uint32_t[4];  // a000..a003
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[120];  // d000..d119
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction, or hits CUTE_INVALID_CONTROL_PATH when SM90A MMA is not enabled
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,  // 64-bit B operand, also reported to synclog
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,  // d000..d119: read-write accumulator registers
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word - confirm against the PTX wgmma.mma_async.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog call; receives this source line number and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %127, 0;\n"  // p = (operand 127 != 0); operand 127 is int32_t(scale_D), the last input
+      "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.u8.s8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // operands 0..119: d000..d119
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      "{%120, %121, %122, %123},"  // a000..a003
+      " %124,"  // desc_b
+      " %125, %126,"  // e, then the spsel immediate
+      " p;\n"  // final operand: the predicate computed by setp above
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // read-write operands 0..119
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // input operands 120..123
+         "l"(desc_b),  // operand 124
+         "r"(e), "n"(int32_t(spsel)),  // operands 125 and 126
+         "r"(int32_t(scale_D)));  // operand 127, read by setp
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x64 TN S32+=U8*S8, saturating variant (the emitted instruction carries the .satfinite suffix)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // handed to the asm as an integer immediate (n constraint)
+>
+struct GMMA_64x240x64_S32U8S8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D operand: fma updates d000..d119 in place
+  using ARegisters = uint32_t[4];  // a000..a003
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[120];  // d000..d119
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction, or hits CUTE_INVALID_CONTROL_PATH when SM90A MMA is not enabled
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,  // 64-bit B operand, also reported to synclog
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,  // d000..d119: read-write accumulator registers
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word - confirm against the PTX wgmma.mma_async.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t and compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog call; receives this source line number and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %127, 0;\n"  // p = (operand 127 != 0); operand 127 is int32_t(scale_D), the last input
+      "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.u8.s8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // operands 0..119: d000..d119
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      "{%120, %121, %122, %123},"  // a000..a003
+      " %124,"  // desc_b
+      " %125, %126,"  // e, then the spsel immediate
+      " p;\n"  // final operand: the predicate computed by setp above
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // read-write operands 0..119
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // input operands 120..123
+         "l"(desc_b),  // operand 124
+         "r"(e), "n"(int32_t(spsel)),  // operands 125 and 126
+         "r"(int32_t(scale_D)));  // operand 127, read by setp
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x24x64 TN S32+=U8*U8: wraps PTX wgmma.mma_async.sp m64n24k64 (.s32.u8.u8); A and B are both passed as 64-bit descriptors.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector, emitted as an immediate ("n") asm operand
+>
+struct GMMA_64x24x64_S32U8U8_SS_TN
+{
+  using DRegisters = void;          // void; fma's d arguments are read-write ("+r") and serve as both input and output
+  using ARegisters = uint64_t[1];   // same shape as fma's single uint64_t desc_a argument
+  using ERegisters = uint32_t[1];   // same shape as fma's single uint32_t e argument
+  using BRegisters = uint64_t[1];   // same shape as fma's single uint64_t desc_b argument
+  using CRegisters = uint32_t[12];  // same shape as fma's d00..d11 accumulator arguments
+
+  CUTE_HOST_DEVICE static void  // one sparse wgmma issue: d00..d11 are read-write, all other arguments are inputs
+  fma(uint64_t const& desc_a,  // asm operand %12 ("l", 64-bit)
+      uint64_t const& desc_b,  // asm operand %13 ("l", 64-bit)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t const& e,  // asm operand %14 ("r"); NOTE(review): presumably the sparsity metadata word -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // asm operand %16; converted to predicate p in the asm body
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes this source line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %16, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.u8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"  // %0..%11: accumulators d00..d11
+      " %12,"  // desc_a
+      " %13,"  // desc_b
+      " %14, %15,"  // e, then spsel as an immediate
+      " p;\n"  // predicate p is the final operand; NOTE(review): presumably the scale-D (accumulate vs. overwrite) flag -- confirm in the PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x24x64 TN S32+=U8*U8 (SATURATE): wraps PTX wgmma.mma_async.sp m64n24k64 (.s32.u8.u8.satfinite); A and B are both passed as 64-bit descriptors.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector, emitted as an immediate ("n") asm operand
+>
+struct GMMA_64x24x64_S32U8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;          // void; fma's d arguments are read-write ("+r") and serve as both input and output
+  using ARegisters = uint64_t[1];   // same shape as fma's single uint64_t desc_a argument
+  using ERegisters = uint32_t[1];   // same shape as fma's single uint32_t e argument
+  using BRegisters = uint64_t[1];   // same shape as fma's single uint64_t desc_b argument
+  using CRegisters = uint32_t[12];  // same shape as fma's d00..d11 accumulator arguments
+
+  CUTE_HOST_DEVICE static void  // one sparse wgmma issue: d00..d11 are read-write, all other arguments are inputs
+  fma(uint64_t const& desc_a,  // asm operand %12 ("l", 64-bit)
+      uint64_t const& desc_b,  // asm operand %13 ("l", 64-bit)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t const& e,  // asm operand %14 ("r"); NOTE(review): presumably the sparsity metadata word -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // asm operand %16; converted to predicate p in the asm body
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes this source line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %16, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.u8.u8.satfinite "  // .satfinite is the only opcode difference from the non-SATURATE struct
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"  // %0..%11: accumulators d00..d11
+      " %12,"  // desc_a
+      " %13,"  // desc_b
+      " %14, %15,"  // e, then spsel as an immediate
+      " p;\n"  // predicate p is the final operand; NOTE(review): presumably the scale-D (accumulate vs. overwrite) flag -- confirm in the PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x48x64 TN S32+=U8*U8: wraps PTX wgmma.mma_async.sp m64n48k64 (.s32.u8.u8); A and B are both passed as 64-bit descriptors.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector, emitted as an immediate ("n") asm operand
+>
+struct GMMA_64x48x64_S32U8U8_SS_TN
+{
+  using DRegisters = void;          // void; fma's d arguments are read-write ("+r") and serve as both input and output
+  using ARegisters = uint64_t[1];   // same shape as fma's single uint64_t desc_a argument
+  using ERegisters = uint32_t[1];   // same shape as fma's single uint32_t e argument
+  using BRegisters = uint64_t[1];   // same shape as fma's single uint64_t desc_b argument
+  using CRegisters = uint32_t[24];  // same shape as fma's d00..d23 accumulator arguments
+
+  CUTE_HOST_DEVICE static void  // one sparse wgmma issue: d00..d23 are read-write, all other arguments are inputs
+  fma(uint64_t const& desc_a,  // asm operand %24 ("l", 64-bit)
+      uint64_t const& desc_b,  // asm operand %25 ("l", 64-bit)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t const& e,  // asm operand %26 ("r"); NOTE(review): presumably the sparsity metadata word -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // asm operand %28; converted to predicate p in the asm body
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes this source line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %28, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.u8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"  // %0..%23: accumulators d00..d23
+      " %24,"  // desc_a
+      " %25,"  // desc_b
+      " %26, %27,"  // e, then spsel as an immediate
+      " p;\n"  // predicate p is the final operand; NOTE(review): presumably the scale-D (accumulate vs. overwrite) flag -- confirm in the PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x48x64 TN S32+=U8*U8 (SATURATE): wraps PTX wgmma.mma_async.sp m64n48k64 (.s32.u8.u8.satfinite); A and B are both passed as 64-bit descriptors.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector, emitted as an immediate ("n") asm operand
+>
+struct GMMA_64x48x64_S32U8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;          // void; fma's d arguments are read-write ("+r") and serve as both input and output
+  using ARegisters = uint64_t[1];   // same shape as fma's single uint64_t desc_a argument
+  using ERegisters = uint32_t[1];   // same shape as fma's single uint32_t e argument
+  using BRegisters = uint64_t[1];   // same shape as fma's single uint64_t desc_b argument
+  using CRegisters = uint32_t[24];  // same shape as fma's d00..d23 accumulator arguments
+
+  CUTE_HOST_DEVICE static void  // one sparse wgmma issue: d00..d23 are read-write, all other arguments are inputs
+  fma(uint64_t const& desc_a,  // asm operand %24 ("l", 64-bit)
+      uint64_t const& desc_b,  // asm operand %25 ("l", 64-bit)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t const& e,  // asm operand %26 ("r"); NOTE(review): presumably the sparsity metadata word -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // asm operand %28; converted to predicate p in the asm body
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes this source line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %28, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.u8.u8.satfinite "  // .satfinite is the only opcode difference from the non-SATURATE struct
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"  // %0..%23: accumulators d00..d23
+      " %24,"  // desc_a
+      " %25,"  // desc_b
+      " %26, %27,"  // e, then spsel as an immediate
+      " p;\n"  // predicate p is the final operand; NOTE(review): presumably the scale-D (accumulate vs. overwrite) flag -- confirm in the PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x64 TN S32+=U8*U8: wraps PTX wgmma.mma_async.sp m64n80k64 (.s32.u8.u8); A and B are both passed as 64-bit descriptors.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector, emitted as an immediate ("n") asm operand
+>
+struct GMMA_64x80x64_S32U8U8_SS_TN
+{
+  using DRegisters = void;          // void; fma's d arguments are read-write ("+r") and serve as both input and output
+  using ARegisters = uint64_t[1];   // same shape as fma's single uint64_t desc_a argument
+  using ERegisters = uint32_t[1];   // same shape as fma's single uint32_t e argument
+  using BRegisters = uint64_t[1];   // same shape as fma's single uint64_t desc_b argument
+  using CRegisters = uint32_t[40];  // same shape as fma's d00..d39 accumulator arguments
+
+  CUTE_HOST_DEVICE static void  // one sparse wgmma issue: d00..d39 are read-write, all other arguments are inputs
+  fma(uint64_t const& desc_a,  // asm operand %40 ("l", 64-bit)
+      uint64_t const& desc_b,  // asm operand %41 ("l", 64-bit)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t const& e,  // asm operand %42 ("r"); NOTE(review): presumably the sparsity metadata word -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // asm operand %44; converted to predicate p in the asm body
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes this source line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %44, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.u8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"  // %0..%39: accumulators d00..d39
+      " %40,"  // desc_a
+      " %41,"  // desc_b
+      " %42, %43,"  // e, then spsel as an immediate
+      " p;\n"  // predicate p is the final operand; NOTE(review): presumably the scale-D (accumulate vs. overwrite) flag -- confirm in the PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x64 TN S32+=U8*U8 (SATURATE): wraps PTX wgmma.mma_async.sp m64n80k64 (.s32.u8.u8.satfinite); A and B are both passed as 64-bit descriptors.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector, emitted as an immediate ("n") asm operand
+>
+struct GMMA_64x80x64_S32U8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;          // void; fma's d arguments are read-write ("+r") and serve as both input and output
+  using ARegisters = uint64_t[1];   // same shape as fma's single uint64_t desc_a argument
+  using ERegisters = uint32_t[1];   // same shape as fma's single uint32_t e argument
+  using BRegisters = uint64_t[1];   // same shape as fma's single uint64_t desc_b argument
+  using CRegisters = uint32_t[40];  // same shape as fma's d00..d39 accumulator arguments
+
+  CUTE_HOST_DEVICE static void  // one sparse wgmma issue: d00..d39 are read-write, all other arguments are inputs
+  fma(uint64_t const& desc_a,  // asm operand %40 ("l", 64-bit)
+      uint64_t const& desc_b,  // asm operand %41 ("l", 64-bit)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t const& e,  // asm operand %42 ("r"); NOTE(review): presumably the sparsity metadata word -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // asm operand %44; converted to predicate p in the asm body
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes this source line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %44, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.u8.u8.satfinite "  // .satfinite is the only opcode difference from the non-SATURATE struct
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"  // %0..%39: accumulators d00..d39
+      " %40,"  // desc_a
+      " %41,"  // desc_b
+      " %42, %43,"  // e, then spsel as an immediate
+      " p;\n"  // predicate p is the final operand; NOTE(review): presumably the scale-D (accumulate vs. overwrite) flag -- confirm in the PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x64 TN S32+=U8*U8: wraps PTX wgmma.mma_async.sp m64n112k64 (.s32.u8.u8); A and B are both passed as 64-bit descriptors.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector, emitted as an immediate ("n") asm operand
+>
+struct GMMA_64x112x64_S32U8U8_SS_TN
+{
+  using DRegisters = void;          // void; fma's d arguments are read-write ("+r") and serve as both input and output
+  using ARegisters = uint64_t[1];   // same shape as fma's single uint64_t desc_a argument
+  using ERegisters = uint32_t[1];   // same shape as fma's single uint32_t e argument
+  using BRegisters = uint64_t[1];   // same shape as fma's single uint64_t desc_b argument
+  using CRegisters = uint32_t[56];  // same shape as fma's d00..d55 accumulator arguments
+
+  CUTE_HOST_DEVICE static void  // one sparse wgmma issue: d00..d55 are read-write, all other arguments are inputs
+  fma(uint64_t const& desc_a,  // asm operand %56 ("l", 64-bit)
+      uint64_t const& desc_b,  // asm operand %57 ("l", 64-bit)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t const& e,  // asm operand %58 ("r"); NOTE(review): presumably the sparsity metadata word -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // asm operand %60; converted to predicate p in the asm body
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes this source line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %60, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.u8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"  // %0..%55: accumulators d00..d55
+      " %56,"  // desc_a
+      " %57,"  // desc_b
+      " %58, %59,"  // e, then spsel as an immediate
+      " p;\n"  // predicate p is the final operand; NOTE(review): presumably the scale-D (accumulate vs. overwrite) flag -- confirm in the PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x64 TN S32+=U8*U8 (SATURATE): wraps PTX wgmma.mma_async.sp m64n112k64 (.s32.u8.u8.satfinite); A and B are both passed as 64-bit descriptors.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector, emitted as an immediate ("n") asm operand
+>
+struct GMMA_64x112x64_S32U8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;          // void; fma's d arguments are read-write ("+r") and serve as both input and output
+  using ARegisters = uint64_t[1];   // same shape as fma's single uint64_t desc_a argument
+  using ERegisters = uint32_t[1];   // same shape as fma's single uint32_t e argument
+  using BRegisters = uint64_t[1];   // same shape as fma's single uint64_t desc_b argument
+  using CRegisters = uint32_t[56];  // same shape as fma's d00..d55 accumulator arguments
+
+  CUTE_HOST_DEVICE static void  // one sparse wgmma issue: d00..d55 are read-write, all other arguments are inputs
+  fma(uint64_t const& desc_a,  // asm operand %56 ("l", 64-bit)
+      uint64_t const& desc_b,  // asm operand %57 ("l", 64-bit)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t const& e,  // asm operand %58 ("r"); NOTE(review): presumably the sparsity metadata word -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // asm operand %60; converted to predicate p in the asm body
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes this source line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %60, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.u8.u8.satfinite "  // .satfinite is the only opcode difference from the non-SATURATE struct
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"  // %0..%55: accumulators d00..d55
+      " %56,"  // desc_a
+      " %57,"  // desc_b
+      " %58, %59,"  // e, then spsel as an immediate
+      " p;\n"  // predicate p is the final operand; NOTE(review): presumably the scale-D (accumulate vs. overwrite) flag -- confirm in the PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x64 TN S32+=U8*U8: wraps PTX wgmma.mma_async.sp m64n144k64 (.s32.u8.u8); A and B are both passed as 64-bit descriptors.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector, emitted as an immediate ("n") asm operand
+>
+struct GMMA_64x144x64_S32U8U8_SS_TN
+{
+  using DRegisters = void;          // void; fma's d arguments are read-write ("+r") and serve as both input and output
+  using ARegisters = uint64_t[1];   // same shape as fma's single uint64_t desc_a argument
+  using ERegisters = uint32_t[1];   // same shape as fma's single uint32_t e argument
+  using BRegisters = uint64_t[1];   // same shape as fma's single uint64_t desc_b argument
+  using CRegisters = uint32_t[72];  // same shape as fma's d00..d71 accumulator arguments
+
+  CUTE_HOST_DEVICE static void  // one sparse wgmma issue: d00..d71 are read-write, all other arguments are inputs
+  fma(uint64_t const& desc_a,  // asm operand %72 ("l", 64-bit)
+      uint64_t const& desc_b,  // asm operand %73 ("l", 64-bit)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t const& e,  // asm operand %74 ("r"); NOTE(review): presumably the sparsity metadata word -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // asm operand %76; converted to predicate p in the asm body
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes this source line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %76, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.u8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"  // %0..%71: accumulators d00..d71
+      " %72,"  // desc_a
+      " %73,"  // desc_b
+      " %74, %75,"  // e, then spsel as an immediate
+      " p;\n"  // predicate p is the final operand; NOTE(review): presumably the scale-D (accumulate vs. overwrite) flag -- confirm in the PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x64 TN S32+=U8*U8 (SATURATE): wraps PTX wgmma.mma_async.sp m64n144k64 (.s32.u8.u8.satfinite); A and B are both passed as 64-bit descriptors.
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time selector, emitted as an immediate ("n") asm operand
+>
+struct GMMA_64x144x64_S32U8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;          // void; fma's d arguments are read-write ("+r") and serve as both input and output
+  using ARegisters = uint64_t[1];   // same shape as fma's single uint64_t desc_a argument
+  using ERegisters = uint32_t[1];   // same shape as fma's single uint32_t e argument
+  using BRegisters = uint64_t[1];   // same shape as fma's single uint64_t desc_b argument
+  using CRegisters = uint32_t[72];  // same shape as fma's d00..d71 accumulator arguments
+
+  CUTE_HOST_DEVICE static void  // one sparse wgmma issue: d00..d71 are read-write, all other arguments are inputs
+  fma(uint64_t const& desc_a,  // asm operand %72 ("l", 64-bit)
+      uint64_t const& desc_b,  // asm operand %73 ("l", 64-bit)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t const& e,  // asm operand %74 ("r"); NOTE(review): presumably the sparsity metadata word -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // asm operand %76; converted to predicate p in the asm body
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes this source line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %76, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.u8.u8.satfinite "  // .satfinite is the only opcode difference from the non-SATURATE struct
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"  // %0..%71: accumulators d00..d71
+      " %72,"  // desc_a
+      " %73,"  // desc_b
+      " %74, %75,"  // e, then spsel as an immediate
+      " p;\n"  // predicate p is the final operand; NOTE(review): presumably the scale-D (accumulate vs. overwrite) flag -- confirm in the PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x64 TN S32+=U8*U8 (A and B passed as 64-bit descriptors; issued without the .satfinite qualifier)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as a compile-time immediate (n constraint)
+>
+struct GMMA_64x160x64_S32U8U8_SS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D outputs; the accumulators are read-write (+r) operands
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // operand e (presumably the sparsity metadata; confirm against the PTX wgmma.sp docs)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[80];  // d00..d79
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %84, 0;\n"  // %84 is int32_t(scale_D), the last input operand: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.u8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      " %80,"  // desc_a
+      " %81,"  // desc_b
+      " %82, %83,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D by the setp above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x64 TN S32+=U8*U8 (A and B passed as 64-bit descriptors; issued with the .satfinite qualifier)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as a compile-time immediate (n constraint)
+>
+struct GMMA_64x160x64_S32U8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // fma() takes no separate D outputs; the accumulators are read-write (+r) operands
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // operand e (presumably the sparsity metadata; confirm against the PTX wgmma.sp docs)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[80];  // d00..d79
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %84, 0;\n"  // %84 is int32_t(scale_D), the last input operand: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.u8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      " %80,"  // desc_a
+      " %81,"  // desc_b
+      " %82, %83,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D by the setp above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x64 TN S32+=U8*U8 (A and B passed as 64-bit descriptors; issued without the .satfinite qualifier)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as a compile-time immediate (n constraint)
+>
+struct GMMA_64x176x64_S32U8U8_SS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D outputs; the accumulators are read-write (+r) operands
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // operand e (presumably the sparsity metadata; confirm against the PTX wgmma.sp docs)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[88];  // d00..d87
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %92, 0;\n"  // %92 is int32_t(scale_D), the last input operand: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.u8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      " %88,"  // desc_a
+      " %89,"  // desc_b
+      " %90, %91,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D by the setp above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x64 TN S32+=U8*U8 (A and B passed as 64-bit descriptors; issued with the .satfinite qualifier)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as a compile-time immediate (n constraint)
+>
+struct GMMA_64x176x64_S32U8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // fma() takes no separate D outputs; the accumulators are read-write (+r) operands
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // operand e (presumably the sparsity metadata; confirm against the PTX wgmma.sp docs)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[88];  // d00..d87
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %92, 0;\n"  // %92 is int32_t(scale_D), the last input operand: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.u8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      " %88,"  // desc_a
+      " %89,"  // desc_b
+      " %90, %91,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D by the setp above
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x64 TN S32+=U8*U8 (A and B passed as 64-bit descriptors; issued without the .satfinite qualifier)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as a compile-time immediate (n constraint)
+>
+struct GMMA_64x208x64_S32U8U8_SS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D outputs; the accumulators are read-write (+r) operands
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // operand e (presumably the sparsity metadata; confirm against the PTX wgmma.sp docs)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[104];  // d000..d103
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %108, 0;\n"  // %108 is int32_t(scale_D), the last input operand: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.u8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      " %104,"  // desc_a
+      " %105,"  // desc_b
+      " %106, %107,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D by the setp above
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x64 TN S32+=U8*U8 (A and B passed as 64-bit descriptors; issued with the .satfinite qualifier)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as a compile-time immediate (n constraint)
+>
+struct GMMA_64x208x64_S32U8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;  // fma() takes no separate D outputs; the accumulators are read-write (+r) operands
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // operand e (presumably the sparsity metadata; confirm against the PTX wgmma.sp docs)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[104];  // d000..d103
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %108, 0;\n"  // %108 is int32_t(scale_D), the last input operand: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.u8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      " %104,"  // desc_a
+      " %105,"  // desc_b
+      " %106, %107,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D by the setp above
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x64 TN S32+=U8*U8 (SS: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the PTX as an immediate ("n") operand
+>
+struct GMMA_64x224x64_S32U8U8_SS_TN
+{
+  using DRegisters = void;           // no separate D list; the d* arguments of fma are read-write
+  using ARegisters = uint64_t[1];    // one 64-bit value, cf. desc_a
+  using ERegisters = uint32_t[1];    // one 32-bit value, cf. e
+  using BRegisters = uint64_t[1];    // one 64-bit value, cf. desc_b
+  using CRegisters = uint32_t[112];  // 112 32-bit values, cf. d000..d111
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %116, 0;\n"  // p = (scale_D != 0); %116 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.u8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      " %112,"  // desc_a
+      " %113,"  // desc_b
+      " %114, %115,"  // e, spsel
+      " p;\n"
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // %0..%111: d000..d111, read-write
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
+      :  "l"(desc_a),  // %112
+         "l"(desc_b),  // %113
+         "r"(e), "n"(int32_t(spsel)),  // %114 = e, %115 = spsel (immediate)
+         "r"(int32_t(scale_D)));  // %116 = scale_D, tested by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x64 TN S32+=U8*U8, .satfinite variant (SS: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the PTX as an immediate ("n") operand
+>
+struct GMMA_64x224x64_S32U8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;           // no separate D list; the d* arguments of fma are read-write
+  using ARegisters = uint64_t[1];    // one 64-bit value, cf. desc_a
+  using ERegisters = uint32_t[1];    // one 32-bit value, cf. e
+  using BRegisters = uint64_t[1];    // one 64-bit value, cf. desc_b
+  using CRegisters = uint32_t[112];  // 112 32-bit values, cf. d000..d111
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %116, 0;\n"  // p = (scale_D != 0); %116 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.u8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      " %112,"  // desc_a
+      " %113,"  // desc_b
+      " %114, %115,"  // e, spsel
+      " p;\n"
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // %0..%111: d000..d111, read-write
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
+      :  "l"(desc_a),  // %112
+         "l"(desc_b),  // %113
+         "r"(e), "n"(int32_t(spsel)),  // %114 = e, %115 = spsel (immediate)
+         "r"(int32_t(scale_D)));  // %116 = scale_D, tested by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x64 TN S32+=U8*U8 (SS: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the PTX as an immediate ("n") operand
+>
+struct GMMA_64x240x64_S32U8U8_SS_TN
+{
+  using DRegisters = void;           // no separate D list; the d* arguments of fma are read-write
+  using ARegisters = uint64_t[1];    // one 64-bit value, cf. desc_a
+  using ERegisters = uint32_t[1];    // one 32-bit value, cf. e
+  using BRegisters = uint64_t[1];    // one 64-bit value, cf. desc_b
+  using CRegisters = uint32_t[120];  // 120 32-bit values, cf. d000..d119
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %124, 0;\n"  // p = (scale_D != 0); %124 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.u8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      " %120,"  // desc_a
+      " %121,"  // desc_b
+      " %122, %123,"  // e, spsel
+      " p;\n"
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // %0..%119: d000..d119, read-write
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
+      :  "l"(desc_a),  // %120
+         "l"(desc_b),  // %121
+         "r"(e), "n"(int32_t(spsel)),  // %122 = e, %123 = spsel (immediate)
+         "r"(int32_t(scale_D)));  // %124 = scale_D, tested by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x64 TN S32+=U8*U8, .satfinite variant (SS: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the PTX as an immediate ("n") operand
+>
+struct GMMA_64x240x64_S32U8U8_SS_TN_SATURATE
+{
+  using DRegisters = void;           // no separate D list; the d* arguments of fma are read-write
+  using ARegisters = uint64_t[1];    // one 64-bit value, cf. desc_a
+  using ERegisters = uint32_t[1];    // one 32-bit value, cf. e
+  using BRegisters = uint64_t[1];    // one 64-bit value, cf. desc_b
+  using CRegisters = uint32_t[120];  // 120 32-bit values, cf. d000..d119
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %124, 0;\n"  // p = (scale_D != 0); %124 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.u8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      " %120,"  // desc_a
+      " %121,"  // desc_b
+      " %122, %123,"  // e, spsel
+      " p;\n"
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // %0..%119: d000..d119, read-write
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
+      :  "l"(desc_a),  // %120
+         "l"(desc_b),  // %121
+         "r"(e), "n"(int32_t(spsel)),  // %122 = e, %123 = spsel (immediate)
+         "r"(int32_t(scale_D)));  // %124 = scale_D, tested by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x24x64 TN S32+=U8*U8 (RS: A is passed as four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the PTX as an immediate ("n") operand
+>
+struct GMMA_64x24x64_S32U8U8_RS_TN
+{
+  using DRegisters = void;          // no separate D list; the d* arguments of fma are read-write
+  using ARegisters = uint32_t[4];   // four 32-bit values, cf. a00..a03
+  using ERegisters = uint32_t[1];   // one 32-bit value, cf. e
+  using BRegisters = uint64_t[1];   // one 64-bit value, cf. desc_b
+  using CRegisters = uint32_t[12];  // 12 32-bit values, cf. d00..d11
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %19, 0;\n"  // p = (scale_D != 0); %19 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.u8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      "{%12, %13, %14, %15},"  // a00..a03
+      " %16,"  // desc_b
+      " %17, %18,"  // e, spsel
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%11: d00..d11, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %12..%15
+         "l"(desc_b),  // %16
+         "r"(e), "n"(int32_t(spsel)),  // %17 = e, %18 = spsel (immediate)
+         "r"(int32_t(scale_D)));  // %19 = scale_D, tested by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x24x64 TN S32+=U8*U8, .satfinite variant (RS: A is passed as four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the PTX as an immediate ("n") operand
+>
+struct GMMA_64x24x64_S32U8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;          // no separate D list; the d* arguments of fma are read-write
+  using ARegisters = uint32_t[4];   // four 32-bit values, cf. a00..a03
+  using ERegisters = uint32_t[1];   // one 32-bit value, cf. e
+  using BRegisters = uint64_t[1];   // one 64-bit value, cf. desc_b
+  using CRegisters = uint32_t[12];  // 12 32-bit values, cf. d00..d11
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %19, 0;\n"  // p = (scale_D != 0); %19 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.u8.u8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      "{%12, %13, %14, %15},"  // a00..a03
+      " %16,"  // desc_b
+      " %17, %18,"  // e, spsel
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%11: d00..d11, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %12..%15
+         "l"(desc_b),  // %16
+         "r"(e), "n"(int32_t(spsel)),  // %17 = e, %18 = spsel (immediate)
+         "r"(int32_t(scale_D)));  // %19 = scale_D, tested by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x48x64 TN S32+=U8*U8 (RS: A is passed as four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the PTX as an immediate ("n") operand
+>
+struct GMMA_64x48x64_S32U8U8_RS_TN
+{
+  using DRegisters = void;          // no separate D list; the d* arguments of fma are read-write
+  using ARegisters = uint32_t[4];   // four 32-bit values, cf. a00..a03
+  using ERegisters = uint32_t[1];   // one 32-bit value, cf. e
+  using BRegisters = uint64_t[1];   // one 64-bit value, cf. desc_b
+  using CRegisters = uint32_t[24];  // 24 32-bit values, cf. d00..d23
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %31, 0;\n"  // p = (scale_D != 0); %31 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.u8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      "{%24, %25, %26, %27},"  // a00..a03
+      " %28,"  // desc_b
+      " %29, %30,"  // e, spsel
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%23: d00..d23, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %24..%27
+         "l"(desc_b),  // %28
+         "r"(e), "n"(int32_t(spsel)),  // %29 = e, %30 = spsel (immediate)
+         "r"(int32_t(scale_D)));  // %31 = scale_D, tested by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x48x64 TN S32+=U8*U8, .satfinite variant (RS: A is passed as four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the PTX as an immediate ("n") operand
+>
+struct GMMA_64x48x64_S32U8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;          // no separate D list; the d* arguments of fma are read-write
+  using ARegisters = uint32_t[4];   // four 32-bit values, cf. a00..a03
+  using ERegisters = uint32_t[1];   // one 32-bit value, cf. e
+  using BRegisters = uint64_t[1];   // one 64-bit value, cf. desc_b
+  using CRegisters = uint32_t[24];  // 24 32-bit values, cf. d00..d23
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %31, 0;\n"  // p = (scale_D != 0); %31 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.u8.u8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      "{%24, %25, %26, %27},"  // a00..a03
+      " %28,"  // desc_b
+      " %29, %30,"  // e, spsel
+      " p;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%23: d00..d23, read-write
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %24..%27
+         "l"(desc_b),  // %28
+         "r"(e), "n"(int32_t(spsel)),  // %29 = e, %30 = spsel (immediate)
+         "r"(int32_t(scale_D)));  // %31 = scale_D, tested by setp above
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x64 TN S32+=U8*U8
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as immediate operand %46 ("n" constraint)
+>
+struct GMMA_64x80x64_S32U8U8_RS_TN  // wraps the m64n80k64 sparse wgmma (s32.u8.u8), no .satfinite qualifier
+{
+  using DRegisters = void;  // no separate D list; d00..d39 are read-write ("+r") in the asm below
+  using ARegisters = uint32_t[4];  // matches a00..a03
+  using ERegisters = uint32_t[1];  // matches e
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = uint32_t[40];  // matches d00..d39
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand: four 32-bit registers
+      uint64_t const& desc_b,  // 64-bit B descriptor ("l" operand); "RS" presumably means A in registers, B via smem descriptor - confirm
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,  // 40 accumulators, updated in place
+      uint32_t const& e,  // fills the sp-meta slot (%45); presumably the sparsity metadata - confirm against PTX wgmma.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared with 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // logging hook (by name); receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %47, 0;\n"  // p = (scale_D != 0); %47 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.u8.u8 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"  // %0..%39 = d00..d39
+      "{%40, %41, %42, %43},"  // a00..a03
+      " %44,"  // desc_b
+      " %45, %46,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),  // %44
+         "r"(e), "n"(int32_t(spsel)),  // %45, %46; "n" requires a compile-time integer constant
+         "r"(int32_t(scale_D)));  // %47
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x64 TN S32+=U8*U8 (saturating variant: emits .satfinite)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as immediate operand %46 ("n" constraint)
+>
+struct GMMA_64x80x64_S32U8U8_RS_TN_SATURATE  // wraps the m64n80k64 sparse wgmma (s32.u8.u8) with the .satfinite qualifier
+{
+  using DRegisters = void;  // no separate D list; d00..d39 are read-write ("+r") in the asm below
+  using ARegisters = uint32_t[4];  // matches a00..a03
+  using ERegisters = uint32_t[1];  // matches e
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = uint32_t[40];  // matches d00..d39
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand: four 32-bit registers
+      uint64_t const& desc_b,  // 64-bit B descriptor ("l" operand); "RS" presumably means A in registers, B via smem descriptor - confirm
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,  // 40 accumulators, updated in place
+      uint32_t const& e,  // fills the sp-meta slot (%45); presumably the sparsity metadata - confirm against PTX wgmma.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared with 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // logging hook (by name); receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %47, 0;\n"  // p = (scale_D != 0); %47 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.u8.u8.satfinite "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"  // %0..%39 = d00..d39
+      "{%40, %41, %42, %43},"  // a00..a03
+      " %44,"  // desc_b
+      " %45, %46,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),  // %44
+         "r"(e), "n"(int32_t(spsel)),  // %45, %46; "n" requires a compile-time integer constant
+         "r"(int32_t(scale_D)));  // %47
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x64 TN S32+=U8*U8
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as immediate operand %62 ("n" constraint)
+>
+struct GMMA_64x112x64_S32U8U8_RS_TN  // wraps the m64n112k64 sparse wgmma (s32.u8.u8), no .satfinite qualifier
+{
+  using DRegisters = void;  // no separate D list; d00..d55 are read-write ("+r") in the asm below
+  using ARegisters = uint32_t[4];  // matches a00..a03
+  using ERegisters = uint32_t[1];  // matches e
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = uint32_t[56];  // matches d00..d55
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand: four 32-bit registers
+      uint64_t const& desc_b,  // 64-bit B descriptor ("l" operand); "RS" presumably means A in registers, B via smem descriptor - confirm
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,  // 56 accumulators, updated in place
+      uint32_t const& e,  // fills the sp-meta slot (%61); presumably the sparsity metadata - confirm against PTX wgmma.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared with 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // logging hook (by name); receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %63, 0;\n"  // p = (scale_D != 0); %63 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.u8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"  // %0..%55 = d00..d55
+      "{%56,  %57,  %58,  %59},"  // a00..a03
+      " %60,"  // desc_b
+      " %61, %62,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),  // %60
+         "r"(e), "n"(int32_t(spsel)),  // %61, %62; "n" requires a compile-time integer constant
+         "r"(int32_t(scale_D)));  // %63
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x64 TN S32+=U8*U8 (saturating variant: emits .satfinite)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as immediate operand %62 ("n" constraint)
+>
+struct GMMA_64x112x64_S32U8U8_RS_TN_SATURATE  // wraps the m64n112k64 sparse wgmma (s32.u8.u8) with the .satfinite qualifier
+{
+  using DRegisters = void;  // no separate D list; d00..d55 are read-write ("+r") in the asm below
+  using ARegisters = uint32_t[4];  // matches a00..a03
+  using ERegisters = uint32_t[1];  // matches e
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = uint32_t[56];  // matches d00..d55
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand: four 32-bit registers
+      uint64_t const& desc_b,  // 64-bit B descriptor ("l" operand); "RS" presumably means A in registers, B via smem descriptor - confirm
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,  // 56 accumulators, updated in place
+      uint32_t const& e,  // fills the sp-meta slot (%61); presumably the sparsity metadata - confirm against PTX wgmma.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared with 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // logging hook (by name); receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %63, 0;\n"  // p = (scale_D != 0); %63 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.u8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"  // %0..%55 = d00..d55
+      "{%56,  %57,  %58,  %59},"  // a00..a03
+      " %60,"  // desc_b
+      " %61, %62,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),  // %60
+         "r"(e), "n"(int32_t(spsel)),  // %61, %62; "n" requires a compile-time integer constant
+         "r"(int32_t(scale_D)));  // %63
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x64 TN S32+=U8*U8
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as immediate operand %78 ("n" constraint)
+>
+struct GMMA_64x144x64_S32U8U8_RS_TN  // wraps the m64n144k64 sparse wgmma (s32.u8.u8), no .satfinite qualifier
+{
+  using DRegisters = void;  // no separate D list; d00..d71 are read-write ("+r") in the asm below
+  using ARegisters = uint32_t[4];  // matches a00..a03
+  using ERegisters = uint32_t[1];  // matches e
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = uint32_t[72];  // matches d00..d71
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand: four 32-bit registers
+      uint64_t const& desc_b,  // 64-bit B descriptor ("l" operand); "RS" presumably means A in registers, B via smem descriptor - confirm
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,  // 72 accumulators, updated in place
+      uint32_t const& e,  // fills the sp-meta slot (%77); presumably the sparsity metadata - confirm against PTX wgmma.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared with 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // logging hook (by name); receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %79, 0;\n"  // p = (scale_D != 0); %79 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.u8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"  // %0..%71 = d00..d71
+      "{%72,  %73,  %74,  %75},"  // a00..a03
+      " %76,"  // desc_b
+      " %77, %78,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),  // %76
+         "r"(e), "n"(int32_t(spsel)),  // %77, %78; "n" requires a compile-time integer constant
+         "r"(int32_t(scale_D)));  // %79
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x64 TN S32+=U8*U8 (saturating variant: emits .satfinite)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as immediate operand %78 ("n" constraint)
+>
+struct GMMA_64x144x64_S32U8U8_RS_TN_SATURATE  // wraps the m64n144k64 sparse wgmma (s32.u8.u8) with the .satfinite qualifier
+{
+  using DRegisters = void;  // no separate D list; d00..d71 are read-write ("+r") in the asm below
+  using ARegisters = uint32_t[4];  // matches a00..a03
+  using ERegisters = uint32_t[1];  // matches e
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = uint32_t[72];  // matches d00..d71
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand: four 32-bit registers
+      uint64_t const& desc_b,  // 64-bit B descriptor ("l" operand); "RS" presumably means A in registers, B via smem descriptor - confirm
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,  // 72 accumulators, updated in place
+      uint32_t const& e,  // fills the sp-meta slot (%77); presumably the sparsity metadata - confirm against PTX wgmma.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared with 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // logging hook (by name); receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %79, 0;\n"  // p = (scale_D != 0); %79 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.u8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"  // %0..%71 = d00..d71
+      "{%72,  %73,  %74,  %75},"  // a00..a03
+      " %76,"  // desc_b
+      " %77, %78,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),  // %76
+         "r"(e), "n"(int32_t(spsel)),  // %77, %78; "n" requires a compile-time integer constant
+         "r"(int32_t(scale_D)));  // %79
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x64 TN S32+=U8*U8
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as immediate operand %86 ("n" constraint)
+>
+struct GMMA_64x160x64_S32U8U8_RS_TN  // wraps the m64n160k64 sparse wgmma (s32.u8.u8), no .satfinite qualifier
+{
+  using DRegisters = void;  // no separate D list; d00..d79 are read-write ("+r") in the asm below
+  using ARegisters = uint32_t[4];  // matches a00..a03
+  using ERegisters = uint32_t[1];  // matches e
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = uint32_t[80];  // matches d00..d79
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand: four 32-bit registers
+      uint64_t const& desc_b,  // 64-bit B descriptor ("l" operand); "RS" presumably means A in registers, B via smem descriptor - confirm
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,  // 80 accumulators, updated in place
+      uint32_t const& e,  // fills the sp-meta slot (%85); presumably the sparsity metadata - confirm against PTX wgmma.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared with 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // logging hook (by name); receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %87, 0;\n"  // p = (scale_D != 0); %87 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.u8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"  // %0..%79 = d00..d79
+      "{%80,  %81,  %82,  %83},"  // a00..a03
+      " %84,"  // desc_b
+      " %85, %86,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),  // %84
+         "r"(e), "n"(int32_t(spsel)),  // %85, %86; "n" requires a compile-time integer constant
+         "r"(int32_t(scale_D)));  // %87
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x64 TN S32+=U8*U8 (saturating variant: emits .satfinite)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as immediate operand %86 ("n" constraint)
+>
+struct GMMA_64x160x64_S32U8U8_RS_TN_SATURATE  // wraps the m64n160k64 sparse wgmma (s32.u8.u8) with the .satfinite qualifier
+{
+  using DRegisters = void;  // no separate D list; d00..d79 are read-write ("+r") in the asm below
+  using ARegisters = uint32_t[4];  // matches a00..a03
+  using ERegisters = uint32_t[1];  // matches e
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = uint32_t[80];  // matches d00..d79
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand: four 32-bit registers
+      uint64_t const& desc_b,  // 64-bit B descriptor ("l" operand); "RS" presumably means A in registers, B via smem descriptor - confirm
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,  // 80 accumulators, updated in place
+      uint32_t const& e,  // fills the sp-meta slot (%85); presumably the sparsity metadata - confirm against PTX wgmma.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared with 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // logging hook (by name); receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %87, 0;\n"  // p = (scale_D != 0); %87 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.u8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"  // %0..%79 = d00..d79
+      "{%80,  %81,  %82,  %83},"  // a00..a03
+      " %84,"  // desc_b
+      " %85, %86,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),  // %84
+         "r"(e), "n"(int32_t(spsel)),  // %85, %86; "n" requires a compile-time integer constant
+         "r"(int32_t(scale_D)));  // %87
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x64 TN S32+=U8*U8 -- wraps PTX wgmma.mma_async.sp m64n176k64.s32.u8.u8 (A in registers, B via 64-bit descriptor)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as an immediate ("n") operand
+>
+struct GMMA_64x176x64_S32U8U8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03 in fma()
+  using ERegisters = uint32_t[1];  // e in fma()
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = uint32_t[88];  // d00..d87 in fma(); read and written in place ("+r")
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand: four 32-bit input registers
+      uint64_t const& desc_b,  // 64-bit B operand; NOTE(review): presumably a shared-memory matrix descriptor -- confirm
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata register (sp-meta in the PTX ISA) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to int32 and turned into predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %95, 0;\n"  // p = (scale_D != 0); %95 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.u8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%87: accumulators d00..d87
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      "{%88,  %89,  %90,  %91},"  // a00..a03
+      " %92,"  // desc_b
+      " %93, %94,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: calling fma() is reported as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x64 TN S32+=U8*U8, .satfinite form -- wraps PTX wgmma.mma_async.sp m64n176k64.s32.u8.u8.satfinite (A in registers, B via 64-bit descriptor)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as an immediate ("n") operand
+>
+struct GMMA_64x176x64_S32U8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03 in fma()
+  using ERegisters = uint32_t[1];  // e in fma()
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = uint32_t[88];  // d00..d87 in fma(); read and written in place ("+r")
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand: four 32-bit input registers
+      uint64_t const& desc_b,  // 64-bit B operand; NOTE(review): presumably a shared-memory matrix descriptor -- confirm
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
+      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
+      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
+      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
+      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
+      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
+      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata register (sp-meta in the PTX ISA) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to int32 and turned into predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %95, 0;\n"  // p = (scale_D != 0); %95 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.u8.u8.satfinite "  // NOTE(review): .satfinite presumably clamps the S32 result on overflow -- confirm in the PTX ISA
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%87: accumulators d00..d87
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      "{%88,  %89,  %90,  %91},"  // a00..a03
+      " %92,"  // desc_b
+      " %93, %94,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
+        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
+        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
+        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
+        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
+        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
+        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: calling fma() is reported as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x64 TN S32+=U8*U8 -- wraps PTX wgmma.mma_async.sp m64n208k64.s32.u8.u8 (A in registers, B via 64-bit descriptor)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as an immediate ("n") operand
+>
+struct GMMA_64x208x64_S32U8U8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a000..a003 in fma()
+  using ERegisters = uint32_t[1];  // e in fma()
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = uint32_t[104];  // d000..d103 in fma(); read and written in place ("+r")
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A operand: four 32-bit input registers
+      uint64_t const& desc_b,  // 64-bit B operand; NOTE(review): presumably a shared-memory matrix descriptor -- confirm
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata register (sp-meta in the PTX ISA) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to int32 and turned into predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %111, 0;\n"  // p = (scale_D != 0); %111 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.u8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%103: accumulators d000..d103
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      "{%104, %105, %106, %107},"  // a000..a003
+      " %108,"  // desc_b
+      " %109, %110,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: calling fma() is reported as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x64 TN S32+=U8*U8, .satfinite form -- wraps PTX wgmma.mma_async.sp m64n208k64.s32.u8.u8.satfinite (A in registers, B via 64-bit descriptor)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as an immediate ("n") operand
+>
+struct GMMA_64x208x64_S32U8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a000..a003 in fma()
+  using ERegisters = uint32_t[1];  // e in fma()
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = uint32_t[104];  // d000..d103 in fma(); read and written in place ("+r")
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A operand: four 32-bit input registers
+      uint64_t const& desc_b,  // 64-bit B operand; NOTE(review): presumably a shared-memory matrix descriptor -- confirm
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata register (sp-meta in the PTX ISA) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to int32 and turned into predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %111, 0;\n"  // p = (scale_D != 0); %111 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.u8.u8.satfinite "  // NOTE(review): .satfinite presumably clamps the S32 result on overflow -- confirm in the PTX ISA
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%103: accumulators d000..d103
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      "{%104, %105, %106, %107},"  // a000..a003
+      " %108,"  // desc_b
+      " %109, %110,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: calling fma() is reported as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x64 TN S32+=U8*U8 -- wraps PTX wgmma.mma_async.sp m64n224k64.s32.u8.u8 (A in registers, B via 64-bit descriptor)
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the instruction as an immediate ("n") operand
+>
+struct GMMA_64x224x64_S32U8U8_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a000..a003 in fma()
+  using ERegisters = uint32_t[1];  // e in fma()
+  using BRegisters = uint64_t[1];  // desc_b in fma()
+  using CRegisters = uint32_t[112];  // d000..d111 in fma(); read and written in place ("+r")
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A operand: four 32-bit input registers
+      uint64_t const& desc_b,  // 64-bit B operand; NOTE(review): presumably a shared-memory matrix descriptor -- confirm
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata register (sp-meta in the PTX ISA) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to int32 and turned into predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %119, 0;\n"  // p = (scale_D != 0); %119 is the last input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.u8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%111: accumulators d000..d111
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      "{%112, %113, %114, %115},"  // a000..a003
+      " %116,"  // desc_b
+      " %117, %118,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: calling fma() is reported as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x64 TN S32+=U8*U8 -- RS form (A passed in registers, B as a 64-bit descriptor), .satfinite variant
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as immediate operand %118
+>
+struct GMMA_64x224x64_S32U8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D list; fma() updates the accumulators in place ("+r")
+  using ARegisters = uint32_t[4];  // a000..a003
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[112];  // d000..d111
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,  // 64-bit descriptor for B, bound to %116
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata word (PTX sp-meta) -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the source line and desc_b to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %119, 0;\n"  // %119 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.u8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      "{%112, %113, %114, %115},"  // a000..a003
+      " %116,"  // desc_b
+      " %117, %118,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // %117 (register), %118 (immediate)
+         "r"(int32_t(scale_D)));  // %119
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x64 TN S32+=U8*U8 -- RS form (A passed in registers, B as a 64-bit descriptor), without .satfinite
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as immediate operand %126
+>
+struct GMMA_64x240x64_S32U8U8_RS_TN
+{
+  using DRegisters = void;  // no separate D list; fma() updates the accumulators in place ("+r")
+  using ARegisters = uint32_t[4];  // a000..a003
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[120];  // d000..d119
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,  // 64-bit descriptor for B, bound to %124
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata word (PTX sp-meta) -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the source line and desc_b to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %127, 0;\n"  // %127 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.u8.u8 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      "{%120, %121, %122, %123},"  // a000..a003
+      " %124,"  // desc_b
+      " %125, %126,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // %125 (register), %126 (immediate)
+         "r"(int32_t(scale_D)));  // %127
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x64 TN S32+=U8*U8 -- RS form (A passed in registers, B as a 64-bit descriptor), .satfinite variant
+template <
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as immediate operand %126
+>
+struct GMMA_64x240x64_S32U8U8_RS_TN_SATURATE
+{
+  using DRegisters = void;  // no separate D list; fma() updates the accumulators in place ("+r")
+  using ARegisters = uint32_t[4];  // a000..a003
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[120];  // d000..d119
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,  // 64-bit descriptor for B, bound to %124
+      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
+      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
+      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
+      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
+      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
+      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
+      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
+      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
+      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
+      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
+      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
+      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
+      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
+      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
+      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
+      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
+      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
+      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
+      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
+      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
+      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
+      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
+      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
+      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
+      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
+      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
+      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
+      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
+      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
+      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata word (PTX sp-meta) -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the source line and desc_b to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %127, 0;\n"  // %127 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.u8.u8.satfinite "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      "{%120, %121, %122, %123},"  // a000..a003
+      " %124,"  // desc_b
+      " %125, %126,"  // e, spsel
+      " p;\n"  // predicate derived from scale_D
+    "}\n"
+      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
+        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
+        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
+        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
+        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
+        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
+        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
+        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
+        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
+        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
+        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
+        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
+        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
+        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
+        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
+        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
+        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
+        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
+        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
+        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
+        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
+        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
+        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
+        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
+        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
+        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
+        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
+        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
+        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
+        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // %125 (register), %126 (immediate)
+         "r"(int32_t(scale_D)));  // %127
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x24x64 TN F16+=E4M3*E4M3 -- SS form (A and B both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand %11
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand %12
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand %9
+>
+struct GMMA_64x24x64_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D list; fma() updates the accumulators in place ("+r")
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[6];  // d0..d5; NOTE(review): presumably two packed F16 values per register -- confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for A, bound to %6
+      uint64_t const& desc_b,  // 64-bit descriptor for B, bound to %7
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata word (PTX sp-meta) -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the source line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %10, 0;\n"  // %10 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n24k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5},"
+      " %6,"  // desc_a
+      " %7,"  // desc_b
+      " %8, %9,"  // e, spsel
+      " p,   %11, %12;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // %8 (register), %9 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %10, %11, %12
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x24x64 TN F16+=E4M3*E4M3 -- RS form (A passed in registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand %14
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand %15
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand %12
+>
+struct GMMA_64x24x64_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D list; fma() updates the accumulators in place ("+r")
+  using ARegisters = uint32_t[4];  // a0..a3
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[6];  // d0..d5; NOTE(review): presumably two packed F16 values per register -- confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,  // 64-bit descriptor for B, bound to %10
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata word (PTX sp-meta) -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the source line and desc_b to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %13, 0;\n"  // %13 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n24k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5},"
+      "{%6,  %7,  %8,  %9},"  // a0..a3
+      " %10,"  // desc_b
+      " %11, %12,"  // e, spsel
+      " p,   %14, %15;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // %11 (register), %12 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %13, %14, %15
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x24x64 TN F32+=E4M3*E4M3 -- SS form (A and B both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand %17
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand %18
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand %15
+>
+struct GMMA_64x24x64_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D list; fma() updates the accumulators in place ("+f")
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[12];  // d00..d11
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for A, bound to %12
+      uint64_t const& desc_b,  // 64-bit descriptor for B, bound to %13
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata word (PTX sp-meta) -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the source line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %16, 0;\n"  // %16 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n24k64.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      " %12,"  // desc_a
+      " %13,"  // desc_b
+      " %14, %15,"  // e, spsel
+      " p,   %17, %18;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // %14 (register), %15 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %16, %17, %18
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x24x64 TN F32+=E4M3*E4M3 -- RS form (A passed in registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand %20
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand %21
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand %18
+>
+struct GMMA_64x24x64_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D list; fma() updates the accumulators in place ("+f")
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[12];  // d00..d11
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,  // 64-bit descriptor for B, bound to %16
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata word (PTX sp-meta) -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the source line and desc_b to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %19, 0;\n"  // %19 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n24k64.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"
+      "{%12, %13, %14, %15},"  // a00..a03
+      " %16,"  // desc_b
+      " %17, %18,"  // e, spsel
+      " p,   %20, %21;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // %17 (register), %18 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %19, %20, %21
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x40x64 TN F16+=E4M3*E4M3 -- SS form (A and B both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand %15
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand %16
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand %13
+>
+struct GMMA_64x40x64_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D list; fma() updates the accumulators in place ("+r")
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[10];  // d00..d09; NOTE(review): presumably two packed F16 values per register -- confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for A, bound to %10
+      uint64_t const& desc_b,  // 64-bit descriptor for B, bound to %11
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata word (PTX sp-meta) -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the source line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %14, 0;\n"  // %14 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n40k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9},"
+      " %10,"  // desc_a
+      " %11,"  // desc_b
+      " %12, %13,"  // e, spsel
+      " p,   %15, %16;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // %12 (register), %13 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %14, %15, %16
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x40x64 TN F16+=E4M3*E4M3 -- RS form (A passed in registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand %18
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand %19
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand %16
+>
+struct GMMA_64x40x64_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D list; fma() updates the accumulators in place ("+r")
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[10];  // d00..d09; NOTE(review): presumably two packed F16 values per register -- confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,  // 64-bit descriptor for B, bound to %14
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata word (PTX sp-meta) -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the source line and desc_b to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %17, 0;\n"  // %17 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n40k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9},"
+      "{%10, %11, %12, %13},"  // a00..a03
+      " %14,"  // desc_b
+      " %15, %16,"  // e, spsel
+      " p,   %18, %19;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // %15 (register), %16 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %17, %18, %19
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x40x64 TN F32+=E4M3*E4M3 -- SS form (A and B both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand %25
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand %26
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand %23
+>
+struct GMMA_64x40x64_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D list; fma() updates the accumulators in place ("+f")
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[20];  // d00..d19
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for A, bound to %20
+      uint64_t const& desc_b,  // 64-bit descriptor for B, bound to %21
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata word (PTX sp-meta) -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the source line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %24, 0;\n"  // %24 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n40k64.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"
+      " %20,"  // desc_a
+      " %21,"  // desc_b
+      " %22, %23,"  // e, spsel
+      " p,   %25, %26;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // %22 (register), %23 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %24, %25, %26
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x40x64 TN F32+=E4M3*E4M3 -- RS form (A passed in registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand %28
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // immediate operand %29
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand %26
+>
+struct GMMA_64x40x64_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D list; fma() updates the accumulators in place ("+f")
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[20];  // d00..d19
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,  // 64-bit descriptor for B, bound to %24
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity-metadata word (PTX sp-meta) -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the source line and desc_b to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %27, 0;\n"  // %27 = scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n40k64.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"
+      "{%20, %21, %22, %23},"  // a00..a03
+      " %24,"  // desc_b
+      " %25, %26,"  // e, spsel
+      " p,   %28, %29;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // %25 (register), %26 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %27, %28, %29
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x48x64 TN F16+=E4M3*E4M3 -- SS form: fma() takes A and B as the 64-bit operands desc_a and desc_b.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x48x64_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // 1 entry, matching fma's desc_a
+  using ERegisters = uint32_t[1];  // 1 entry, matching fma's e
+  using BRegisters = uint64_t[1];  // 1 entry, matching fma's desc_b
+  using CRegisters = uint32_t[12];  // 12 entries, matching fma's d00..d11
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %16, 0;\n"  // p = (scale_D != 0); %16 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n48k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%11: d00..d11
+      " %8,  %9,  %10, %11},"
+      " %12,"  // desc_a
+      " %13,"  // desc_b
+      " %14, %15,"  // e, spsel
+      " p,   %17, %18;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write operands %0..%11
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "l"(desc_a),  // %12
+         "l"(desc_b),  // %13
+         "r"(e), "n"(int32_t(spsel)),  // %14, %15 (spsel is a compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %16, %17, %18 (scaleA/scaleB are compile-time immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x48x64 TN F16+=E4M3*E4M3 -- RS form: fma() takes A as four 32-bit registers a00..a03 and B as the 64-bit operand desc_b.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x48x64_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // 4 entries, matching fma's a00..a03
+  using ERegisters = uint32_t[1];  // 1 entry, matching fma's e
+  using BRegisters = uint64_t[1];  // 1 entry, matching fma's desc_b
+  using CRegisters = uint32_t[12];  // 12 entries, matching fma's d00..d11
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %19, 0;\n"  // p = (scale_D != 0); %19 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n48k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%11: d00..d11
+      " %8,  %9,  %10, %11},"
+      "{%12, %13, %14, %15},"  // a00..a03
+      " %16,"  // desc_b
+      " %17, %18,"  // e, spsel
+      " p,   %20, %21;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write operands %0..%11
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %12..%15
+         "l"(desc_b),  // %16
+         "r"(e), "n"(int32_t(spsel)),  // %17, %18 (spsel is a compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %19, %20, %21 (scaleA/scaleB are compile-time immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x48x64 TN F32+=E4M3*E4M3 -- SS form: fma() takes A and B as the 64-bit operands desc_a and desc_b.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x48x64_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // 1 entry, matching fma's desc_a
+  using ERegisters = uint32_t[1];  // 1 entry, matching fma's e
+  using BRegisters = uint64_t[1];  // 1 entry, matching fma's desc_b
+  using CRegisters = float[24];  // 24 entries, matching fma's d00..d23
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %28, 0;\n"  // p = (scale_D != 0); %28 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n48k64.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%23: d00..d23
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      " %24,"  // desc_a
+      " %25,"  // desc_b
+      " %26, %27,"  // e, spsel
+      " p,   %29, %30;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // read-write operands %0..%23
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
+      :  "l"(desc_a),  // %24
+         "l"(desc_b),  // %25
+         "r"(e), "n"(int32_t(spsel)),  // %26, %27 (spsel is a compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %28, %29, %30 (scaleA/scaleB are compile-time immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x48x64 TN F32+=E4M3*E4M3 -- RS form: fma() takes A as four 32-bit registers a00..a03 and B as the 64-bit operand desc_b.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x48x64_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // 4 entries, matching fma's a00..a03
+  using ERegisters = uint32_t[1];  // 1 entry, matching fma's e
+  using BRegisters = uint64_t[1];  // 1 entry, matching fma's desc_b
+  using CRegisters = float[24];  // 24 entries, matching fma's d00..d23
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %31, 0;\n"  // p = (scale_D != 0); %31 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n48k64.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%23: d00..d23
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      "{%24, %25, %26, %27},"  // a00..a03
+      " %28,"  // desc_b
+      " %29, %30,"  // e, spsel
+      " p,   %32, %33;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // read-write operands %0..%23
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %24..%27
+         "l"(desc_b),  // %28
+         "r"(e), "n"(int32_t(spsel)),  // %29, %30 (spsel is a compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %31, %32, %33 (scaleA/scaleB are compile-time immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x56x64 TN F16+=E4M3*E4M3 -- SS form: fma() takes A and B as the 64-bit operands desc_a and desc_b.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x56x64_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // 1 entry, matching fma's desc_a
+  using ERegisters = uint32_t[1];  // 1 entry, matching fma's e
+  using BRegisters = uint64_t[1];  // 1 entry, matching fma's desc_b
+  using CRegisters = uint32_t[14];  // 14 entries, matching fma's d00..d13
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %18, 0;\n"  // p = (scale_D != 0); %18 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n56k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%13: d00..d13
+      " %8,  %9,  %10, %11, %12, %13},"
+      " %14,"  // desc_a
+      " %15,"  // desc_b
+      " %16, %17,"  // e, spsel
+      " p,   %19, %20;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write operands %0..%13
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13)
+      :  "l"(desc_a),  // %14
+         "l"(desc_b),  // %15
+         "r"(e), "n"(int32_t(spsel)),  // %16, %17 (spsel is a compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %18, %19, %20 (scaleA/scaleB are compile-time immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x56x64 TN F16+=E4M3*E4M3 -- RS form: fma() takes A as four 32-bit registers a00..a03 and B as the 64-bit operand desc_b.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x56x64_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // 4 entries, matching fma's a00..a03
+  using ERegisters = uint32_t[1];  // 1 entry, matching fma's e
+  using BRegisters = uint64_t[1];  // 1 entry, matching fma's desc_b
+  using CRegisters = uint32_t[14];  // 14 entries, matching fma's d00..d13
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %21, 0;\n"  // p = (scale_D != 0); %21 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n56k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%13: d00..d13
+      " %8,  %9,  %10, %11, %12, %13},"
+      "{%14, %15, %16, %17},"  // a00..a03
+      " %18,"  // desc_b
+      " %19, %20,"  // e, spsel
+      " p,   %22, %23;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write operands %0..%13
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %14..%17
+         "l"(desc_b),  // %18
+         "r"(e), "n"(int32_t(spsel)),  // %19, %20 (spsel is a compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %21, %22, %23 (scaleA/scaleB are compile-time immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x56x64 TN F32+=E4M3*E4M3 -- SS form: fma() takes A and B as the 64-bit operands desc_a and desc_b.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x56x64_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // 1 entry, matching fma's desc_a
+  using ERegisters = uint32_t[1];  // 1 entry, matching fma's e
+  using BRegisters = uint64_t[1];  // 1 entry, matching fma's desc_b
+  using CRegisters = float[28];  // 28 entries, matching fma's d00..d27
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %32, 0;\n"  // p = (scale_D != 0); %32 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n56k64.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%27: d00..d27
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"
+      " %28,"  // desc_a
+      " %29,"  // desc_b
+      " %30, %31,"  // e, spsel
+      " p,   %33, %34;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // read-write operands %0..%27
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
+      :  "l"(desc_a),  // %28
+         "l"(desc_b),  // %29
+         "r"(e), "n"(int32_t(spsel)),  // %30, %31 (spsel is a compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %32, %33, %34 (scaleA/scaleB are compile-time immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x56x64 TN F32+=E4M3*E4M3 -- RS form: fma() takes A as four 32-bit registers a00..a03 and B as the 64-bit operand desc_b.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x56x64_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // 4 entries, matching fma's a00..a03
+  using ERegisters = uint32_t[1];  // 1 entry, matching fma's e
+  using BRegisters = uint64_t[1];  // 1 entry, matching fma's desc_b
+  using CRegisters = float[28];  // 28 entries, matching fma's d00..d27
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %35, 0;\n"  // p = (scale_D != 0); %35 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n56k64.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%27: d00..d27
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"
+      "{%28, %29, %30, %31},"  // a00..a03
+      " %32,"  // desc_b
+      " %33, %34,"  // e, spsel
+      " p,   %36, %37;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // read-write operands %0..%27
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %28..%31
+         "l"(desc_b),  // %32
+         "r"(e), "n"(int32_t(spsel)),  // %33, %34 (spsel is a compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %35, %36, %37 (scaleA/scaleB are compile-time immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x72x64 TN F16+=E4M3*E4M3 -- SS form: fma() takes A and B as the 64-bit operands desc_a and desc_b.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x72x64_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // 1 entry, matching fma's desc_a
+  using ERegisters = uint32_t[1];  // 1 entry, matching fma's e
+  using BRegisters = uint64_t[1];  // 1 entry, matching fma's desc_b
+  using CRegisters = uint32_t[18];  // 18 entries, matching fma's d00..d17
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %22, 0;\n"  // p = (scale_D != 0); %22 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n72k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%17: d00..d17
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17},"
+      " %18,"  // desc_a
+      " %19,"  // desc_b
+      " %20, %21,"  // e, spsel
+      " p,   %23, %24;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write operands %0..%17
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17)
+      :  "l"(desc_a),  // %18
+         "l"(desc_b),  // %19
+         "r"(e), "n"(int32_t(spsel)),  // %20, %21 (spsel is a compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %22, %23, %24 (scaleA/scaleB are compile-time immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x72x64 TN F16+=E4M3*E4M3 -- RS form: fma() takes A as four 32-bit registers a00..a03 and B as the 64-bit operand desc_b.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x72x64_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // 4 entries, matching fma's a00..a03
+  using ERegisters = uint32_t[1];  // 1 entry, matching fma's e
+  using BRegisters = uint64_t[1];  // 1 entry, matching fma's desc_b
+  using CRegisters = uint32_t[18];  // 18 entries, matching fma's d00..d17
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %25, 0;\n"  // p = (scale_D != 0); %25 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n72k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%17: d00..d17
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17},"
+      "{%18, %19, %20, %21},"  // a00..a03
+      " %22,"  // desc_b
+      " %23, %24,"  // e, spsel
+      " p,   %26, %27;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write operands %0..%17
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %18..%21
+         "l"(desc_b),  // %22
+         "r"(e), "n"(int32_t(spsel)),  // %23, %24 (spsel is a compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %25, %26, %27 (scaleA/scaleB are compile-time immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x72x64 TN F32+=E4M3*E4M3 -- SS form: fma() takes A and B as the 64-bit operands desc_a and desc_b.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x72x64_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // 1 entry, matching fma's desc_a
+  using ERegisters = uint32_t[1];  // 1 entry, matching fma's e
+  using BRegisters = uint64_t[1];  // 1 entry, matching fma's desc_b
+  using CRegisters = float[36];  // 36 entries, matching fma's d00..d35
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %40, 0;\n"  // p = (scale_D != 0); %40 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n72k64.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%35: d00..d35
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"
+      " %36,"  // desc_a
+      " %37,"  // desc_b
+      " %38, %39,"  // e, spsel
+      " p,   %41, %42;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // read-write operands %0..%35
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
+      :  "l"(desc_a),  // %36
+         "l"(desc_b),  // %37
+         "r"(e), "n"(int32_t(spsel)),  // %38, %39 (spsel is a compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %40, %41, %42 (scaleA/scaleB are compile-time immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x72x64 TN F32+=E4M3*E4M3 -- RS form: fma() takes A as four 32-bit registers a00..a03 and B as the 64-bit operand desc_b.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x72x64_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // 4 entries, matching fma's a00..a03
+  using ERegisters = uint32_t[1];  // 1 entry, matching fma's e
+  using BRegisters = uint64_t[1];  // 1 entry, matching fma's desc_b
+  using CRegisters = float[36];  // 36 entries, matching fma's d00..d35
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %43, 0;\n"  // p = (scale_D != 0); %43 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n72k64.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%35: d00..d35
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"
+      "{%36, %37, %38, %39},"  // a00..a03
+      " %40,"  // desc_b
+      " %41, %42,"  // e, spsel
+      " p,   %44, %45;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // read-write operands %0..%35
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %36..%39
+         "l"(desc_b),  // %40
+         "r"(e), "n"(int32_t(spsel)),  // %41, %42 (spsel is a compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %43, %44, %45 (scaleA/scaleB are compile-time immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x64 TN F16+=E4M3*E4M3 -- SS form: fma() takes A and B as the 64-bit operands desc_a and desc_b.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x80x64_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // 1 entry, matching fma's desc_a
+  using ERegisters = uint32_t[1];  // 1 entry, matching fma's e
+  using BRegisters = uint64_t[1];  // 1 entry, matching fma's desc_b
+  using CRegisters = uint32_t[20];  // 20 entries, matching fma's d00..d19
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %24, 0;\n"  // p = (scale_D != 0); %24 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n80k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%19: d00..d19
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"
+      " %20,"  // desc_a
+      " %21,"  // desc_b
+      " %22, %23,"  // e, spsel
+      " p,   %25, %26;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write operands %0..%19
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
+      :  "l"(desc_a),  // %20
+         "l"(desc_b),  // %21
+         "r"(e), "n"(int32_t(spsel)),  // %22, %23 (spsel is a compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %24, %25, %26 (scaleA/scaleB are compile-time immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x64 TN F16+=E4M3*E4M3 -- RS form: A in four 32-bit registers (a00..a03), B as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x80x64_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[20];  // d00..d19, bound read-write ("+r") below
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp instruction with d00..d19 as read-write operands
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %27, 0;\n"  // p = (scale_D != 0); %27 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n80k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"  // d00..d19
+      "{%20, %21, %22, %23},"  // a00..a03
+      " %24,"  // desc_b
+      " %25, %26,"  // e, spsel
+      " p,   %28, %29;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // e is a runtime register ("r"); spsel is an immediate ("n")
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D is a runtime register ("r"); scaleA/scaleB are immediates ("n")
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: only the CUTE_INVALID_CONTROL_PATH call below is compiled
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x64 TN F32+=E4M3*E4M3 -- SS form: A and B are each passed as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x80x64_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[40];  // d00..d39, bound read-write ("+f") below
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp instruction with d00..d39 as read-write operands
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %44, 0;\n"  // p = (scale_D != 0); %44 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n80k64.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"  // d00..d39
+      " %40,"  // desc_a
+      " %41,"  // desc_b
+      " %42, %43,"  // e, spsel
+      " p,   %45, %46;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // e is a runtime register ("r"); spsel is an immediate ("n")
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D is a runtime register ("r"); scaleA/scaleB are immediates ("n")
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: only the CUTE_INVALID_CONTROL_PATH call below is compiled
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x64 TN F32+=E4M3*E4M3 -- RS form: A in four 32-bit registers (a00..a03), B as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x80x64_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[40];  // d00..d39, bound read-write ("+f") below
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp instruction with d00..d39 as read-write operands
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %47, 0;\n"  // p = (scale_D != 0); %47 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n80k64.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"  // d00..d39
+      "{%40, %41, %42, %43},"  // a00..a03
+      " %44,"  // desc_b
+      " %45, %46,"  // e, spsel
+      " p,   %48, %49;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // e is a runtime register ("r"); spsel is an immediate ("n")
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D is a runtime register ("r"); scaleA/scaleB are immediates ("n")
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: only the CUTE_INVALID_CONTROL_PATH call below is compiled
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x88x64 TN F16+=E4M3*E4M3 -- SS form: A and B are each passed as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x88x64_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[22];  // d00..d21, bound read-write ("+r") below
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp instruction with d00..d21 as read-write operands
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %26, 0;\n"  // p = (scale_D != 0); %26 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n88k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21},"  // d00..d21
+      " %22,"  // desc_a
+      " %23,"  // desc_b
+      " %24, %25,"  // e, spsel
+      " p,   %27, %28;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // e is a runtime register ("r"); spsel is an immediate ("n")
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D is a runtime register ("r"); scaleA/scaleB are immediates ("n")
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: only the CUTE_INVALID_CONTROL_PATH call below is compiled
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x88x64 TN F16+=E4M3*E4M3 -- RS form: A in four 32-bit registers (a00..a03), B as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x88x64_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[22];  // d00..d21, bound read-write ("+r") below
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp instruction with d00..d21 as read-write operands
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %29, 0;\n"  // p = (scale_D != 0); %29 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n88k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21},"  // d00..d21
+      "{%22, %23, %24, %25},"  // a00..a03
+      " %26,"  // desc_b
+      " %27, %28,"  // e, spsel
+      " p,   %30, %31;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // e is a runtime register ("r"); spsel is an immediate ("n")
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D is a runtime register ("r"); scaleA/scaleB are immediates ("n")
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: only the CUTE_INVALID_CONTROL_PATH call below is compiled
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x88x64 TN F32+=E4M3*E4M3 -- SS form: A and B are each passed as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x88x64_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[44];  // d00..d43, bound read-write ("+f") below
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp instruction with d00..d43 as read-write operands
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %48, 0;\n"  // p = (scale_D != 0); %48 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n88k64.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"  // d00..d43
+      " %44,"  // desc_a
+      " %45,"  // desc_b
+      " %46, %47,"  // e, spsel
+      " p,   %49, %50;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // e is a runtime register ("r"); spsel is an immediate ("n")
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D is a runtime register ("r"); scaleA/scaleB are immediates ("n")
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: only the CUTE_INVALID_CONTROL_PATH call below is compiled
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x88x64 TN F32+=E4M3*E4M3 -- RS form: A in four 32-bit registers (a00..a03), B as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x88x64_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[44];  // d00..d43, bound read-write ("+f") below
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp instruction with d00..d43 as read-write operands
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %51, 0;\n"  // p = (scale_D != 0); %51 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n88k64.f32.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"  // d00..d43
+      "{%44, %45, %46, %47},"  // a00..a03
+      " %48,"  // desc_b
+      " %49, %50,"  // e, spsel
+      " p,   %52, %53;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // e is a runtime register ("r"); spsel is an immediate ("n")
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D is a runtime register ("r"); scaleA/scaleB are immediates ("n")
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: only the CUTE_INVALID_CONTROL_PATH call below is compiled
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x104x64 TN F16+=E4M3*E4M3 -- SS form: A and B are each passed as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x104x64_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[26];  // d00..d25, bound read-write ("+r") below
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp instruction with d00..d25 as read-write operands
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %30, 0;\n"  // p = (scale_D != 0); %30 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n104k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25},"  // d00..d25
+      " %26,"  // desc_a
+      " %27,"  // desc_b
+      " %28, %29,"  // e, spsel
+      " p,   %31, %32;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // e is a runtime register ("r"); spsel is an immediate ("n")
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D is a runtime register ("r"); scaleA/scaleB are immediates ("n")
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: only the CUTE_INVALID_CONTROL_PATH call below is compiled
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x104x64 TN F16+=E4M3*E4M3 -- RS form: A in four 32-bit registers (a00..a03), B as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x104x64_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[26];  // d00..d25, bound read-write ("+r") below
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp instruction with d00..d25 as read-write operands
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %33, 0;\n"  // p = (scale_D != 0); %33 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n104k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25},"  // d00..d25
+      "{%26, %27, %28, %29},"  // a00..a03
+      " %30,"  // desc_b
+      " %31, %32,"  // e, spsel
+      " p,   %34, %35;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // e is a runtime register ("r"); spsel is an immediate ("n")
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D is a runtime register ("r"); scaleA/scaleB are immediates ("n")
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: only the CUTE_INVALID_CONTROL_PATH call below is compiled
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x104x64 TN F32+=E4M3*E4M3 -- SS form: A and B are each passed as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x104x64_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[52];  // d00..d51, bound read-write ("+f") below
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp instruction with d00..d51 as read-write operands
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %56, 0;\n"  // p = (scale_D != 0); %56 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n104k64.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"  // d00..d51
+      " %52,"  // desc_a
+      " %53,"  // desc_b
+      " %54, %55,"  // e, spsel
+      " p,    %57,  %58;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // e is a runtime register ("r"); spsel is an immediate ("n")
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D is a runtime register ("r"); scaleA/scaleB are immediates ("n")
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: only the CUTE_INVALID_CONTROL_PATH call below is compiled
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x104x64 TN F32+=E4M3*E4M3 -- RS form: A in four 32-bit registers (a00..a03), B as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x104x64_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[52];  // d00..d51, bound read-write ("+f") below
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp instruction with d00..d51 as read-write operands
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %59, 0;\n"  // p = (scale_D != 0); %59 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n104k64.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"  // d00..d51
+      "{%52,  %53,  %54,  %55},"  // a00..a03
+      " %56,"  // desc_b
+      " %57, %58,"  // e, spsel
+      " p,    %60,  %61;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // e is a runtime register ("r"); spsel is an immediate ("n")
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D is a runtime register ("r"); scaleA/scaleB are immediates ("n")
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: only the CUTE_INVALID_CONTROL_PATH call below is compiled
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x64 TN F16+=E4M3*E4M3 (SS form: A and B are both passed as 64-bit values desc_a / desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand, %33
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand, %34
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate ("n") operand emitted right after e, %31
+>
+struct GMMA_64x112x64_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably "no separate D operand" because d00..d27 are updated in place - confirm
+  using ARegisters = uint64_t[1];  // one 64-bit value: desc_a
+  using ERegisters = uint32_t[1];  // one 32-bit value: e
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = uint32_t[28];  // d00..d27; presumably two f16 results per 32-bit register - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // asm operand %28 ("l")
+      uint64_t const& desc_b,  // asm operand %29 ("l")
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d27: read-write ("+r") operands %0..%27
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t const& e,  // asm operand %30 ("r"); presumably the sparsity metadata word - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value (%32); compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the call-site line and both 64-bit operands
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this asm block
+      "setp.ne.b32 p, %32, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n112k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // accumulator list d00..d27
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"
+      " %28,"  // desc_a
+      " %29,"  // desc_b
+      " %30, %31,"  // e, spsel
+      " p,   %33, %34;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%27 (also read as inputs)
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
+      :  "l"(desc_a),  // %28
+         "l"(desc_b),  // %29
+         "r"(e), "n"(int32_t(spsel)),  // %30, %31
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %32, %33, %34
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: fma() only reports an invalid control path with the message below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x64 TN F16+=E4M3*E4M3 (RS form: A is passed in four 32-bit registers a00..a03, B as the 64-bit value desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand, %36
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand, %37
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate ("n") operand emitted right after e, %34
+>
+struct GMMA_64x112x64_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably "no separate D operand" because d00..d27 are updated in place - confirm
+  using ARegisters = uint32_t[4];  // four 32-bit values: a00..a03
+  using ERegisters = uint32_t[1];  // one 32-bit value: e
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = uint32_t[28];  // d00..d27; presumably two f16 results per 32-bit register - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // asm operands %28..%31 ("r")
+      uint64_t const& desc_b,  // asm operand %32 ("l")
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d27: read-write ("+r") operands %0..%27
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t const& e,  // asm operand %33 ("r"); presumably the sparsity metadata word - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value (%35); compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the call-site line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this asm block
+      "setp.ne.b32 p, %35, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n112k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // accumulator list d00..d27
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"
+      "{%28, %29, %30, %31},"  // a00..a03
+      " %32,"  // desc_b
+      " %33, %34,"  // e, spsel
+      " p,   %36, %37;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%27 (also read as inputs)
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %28..%31
+         "l"(desc_b),  // %32
+         "r"(e), "n"(int32_t(spsel)),  // %33, %34
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %35, %36, %37
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: fma() only reports an invalid control path with the message below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x64 TN F32+=E4M3*E4M3 (SS form: A and B are both passed as 64-bit values desc_a / desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand, %61
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand, %62
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate ("n") operand emitted right after e, %59
+>
+struct GMMA_64x112x64_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably "no separate D operand" because d00..d55 are updated in place - confirm
+  using ARegisters = uint64_t[1];  // one 64-bit value: desc_a
+  using ERegisters = uint32_t[1];  // one 32-bit value: e
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = float[56];  // d00..d55, one float accumulator each
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // asm operand %56 ("l")
+      uint64_t const& desc_b,  // asm operand %57 ("l")
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d55: read-write ("+f") operands %0..%55
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      uint32_t const& e,  // asm operand %58 ("r"); presumably the sparsity metadata word - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value (%60); compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the call-site line and both 64-bit operands
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this asm block
+      "setp.ne.b32 p, %60, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n112k64.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // accumulator list d00..d55
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      " %56,"  // desc_a
+      " %57,"  // desc_b
+      " %58, %59,"  // e, spsel
+      " p,    %61,  %62;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0..%55 (also read as inputs)
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
+      :  "l"(desc_a),  // %56
+         "l"(desc_b),  // %57
+         "r"(e), "n"(int32_t(spsel)),  // %58, %59
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %60, %61, %62
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: fma() only reports an invalid control path with the message below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x64 TN F32+=E4M3*E4M3 (RS form: A is passed in four 32-bit registers a00..a03, B as the 64-bit value desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand, %64
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand, %65
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate ("n") operand emitted right after e, %62
+>
+struct GMMA_64x112x64_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably "no separate D operand" because d00..d55 are updated in place - confirm
+  using ARegisters = uint32_t[4];  // four 32-bit values: a00..a03
+  using ERegisters = uint32_t[1];  // one 32-bit value: e
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = float[56];  // d00..d55, one float accumulator each
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // asm operands %56..%59 ("r")
+      uint64_t const& desc_b,  // asm operand %60 ("l")
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d55: read-write ("+f") operands %0..%55
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      uint32_t const& e,  // asm operand %61 ("r"); presumably the sparsity metadata word - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value (%63); compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the call-site line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this asm block
+      "setp.ne.b32 p, %63, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n112k64.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // accumulator list d00..d55
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      "{%56,  %57,  %58,  %59},"  // a00..a03
+      " %60,"  // desc_b
+      " %61, %62,"  // e, spsel
+      " p,    %64,  %65;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0..%55 (also read as inputs)
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %56..%59
+         "l"(desc_b),  // %60
+         "r"(e), "n"(int32_t(spsel)),  // %61, %62
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %63, %64, %65
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: fma() only reports an invalid control path with the message below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x120x64 TN F16+=E4M3*E4M3 (SS form: A and B are both passed as 64-bit values desc_a / desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand, %35
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand, %36
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate ("n") operand emitted right after e, %33
+>
+struct GMMA_64x120x64_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably "no separate D operand" because d00..d29 are updated in place - confirm
+  using ARegisters = uint64_t[1];  // one 64-bit value: desc_a
+  using ERegisters = uint32_t[1];  // one 32-bit value: e
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = uint32_t[30];  // d00..d29; presumably two f16 results per 32-bit register - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // asm operand %30 ("l")
+      uint64_t const& desc_b,  // asm operand %31 ("l")
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d29: read-write ("+r") operands %0..%29
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29,
+      uint32_t const& e,  // asm operand %32 ("r"); presumably the sparsity metadata word - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value (%34); compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the call-site line and both 64-bit operands
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this asm block
+      "setp.ne.b32 p, %34, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n120k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // accumulator list d00..d29
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29},"
+      " %30,"  // desc_a
+      " %31,"  // desc_b
+      " %32, %33,"  // e, spsel
+      " p,   %35, %36;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%29 (also read as inputs)
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29)
+      :  "l"(desc_a),  // %30
+         "l"(desc_b),  // %31
+         "r"(e), "n"(int32_t(spsel)),  // %32, %33
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %34, %35, %36
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: fma() only reports an invalid control path with the message below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x120x64 TN F16+=E4M3*E4M3 (RS form: A is passed in four 32-bit registers a00..a03, B as the 64-bit value desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand, %38
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand, %39
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate ("n") operand emitted right after e, %36
+>
+struct GMMA_64x120x64_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably "no separate D operand" because d00..d29 are updated in place - confirm
+  using ARegisters = uint32_t[4];  // four 32-bit values: a00..a03
+  using ERegisters = uint32_t[1];  // one 32-bit value: e
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = uint32_t[30];  // d00..d29; presumably two f16 results per 32-bit register - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // asm operands %30..%33 ("r")
+      uint64_t const& desc_b,  // asm operand %34 ("l")
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d29: read-write ("+r") operands %0..%29
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29,
+      uint32_t const& e,  // asm operand %35 ("r"); presumably the sparsity metadata word - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value (%37); compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the call-site line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this asm block
+      "setp.ne.b32 p, %37, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n120k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // accumulator list d00..d29
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29},"
+      "{%30, %31, %32, %33},"  // a00..a03
+      " %34,"  // desc_b
+      " %35, %36,"  // e, spsel
+      " p,   %38, %39;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%29 (also read as inputs)
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %30..%33
+         "l"(desc_b),  // %34
+         "r"(e), "n"(int32_t(spsel)),  // %35, %36
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %37, %38, %39
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: fma() only reports an invalid control path with the message below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x120x64 TN F32+=E4M3*E4M3 (SS form: A and B are both passed as 64-bit values desc_a / desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand, %65
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand, %66
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate ("n") operand emitted right after e, %63
+>
+struct GMMA_64x120x64_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably "no separate D operand" because d00..d59 are updated in place - confirm
+  using ARegisters = uint64_t[1];  // one 64-bit value: desc_a
+  using ERegisters = uint32_t[1];  // one 32-bit value: e
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = float[60];  // d00..d59, one float accumulator each
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // asm operand %60 ("l")
+      uint64_t const& desc_b,  // asm operand %61 ("l")
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d59: read-write ("+f") operands %0..%59
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      uint32_t const& e,  // asm operand %62 ("r"); presumably the sparsity metadata word - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value (%64); compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the call-site line and both 64-bit operands
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this asm block
+      "setp.ne.b32 p, %64, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n120k64.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // accumulator list d00..d59
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"
+      " %60,"  // desc_a
+      " %61,"  // desc_b
+      " %62, %63,"  // e, spsel
+      " p,    %65,  %66;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0..%59 (also read as inputs)
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
+      :  "l"(desc_a),  // %60
+         "l"(desc_b),  // %61
+         "r"(e), "n"(int32_t(spsel)),  // %62, %63
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %64, %65, %66
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: fma() only reports an invalid control path with the message below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x120x64 TN F32+=E4M3*E4M3 (RS form: A is passed in four 32-bit registers a00..a03, B as the 64-bit value desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand, %68
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand, %69
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate ("n") operand emitted right after e, %66
+>
+struct GMMA_64x120x64_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably "no separate D operand" because d00..d59 are updated in place - confirm
+  using ARegisters = uint32_t[4];  // four 32-bit values: a00..a03
+  using ERegisters = uint32_t[1];  // one 32-bit value: e
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = float[60];  // d00..d59, one float accumulator each
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // asm operands %60..%63 ("r")
+      uint64_t const& desc_b,  // asm operand %64 ("l")
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d59: read-write ("+f") operands %0..%59
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      uint32_t const& e,  // asm operand %65 ("r"); presumably the sparsity metadata word - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value (%67); compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the call-site line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this asm block
+      "setp.ne.b32 p, %67, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n120k64.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // accumulator list d00..d59
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"
+      "{%60,  %61,  %62,  %63},"  // a00..a03
+      " %64,"  // desc_b
+      " %65, %66,"  // e, spsel
+      " p,    %68,  %69;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0..%59 (also read as inputs)
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %60..%63
+         "l"(desc_b),  // %64
+         "r"(e), "n"(int32_t(spsel)),  // %65, %66
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %67, %68, %69
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: fma() only reports an invalid control path with the message below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x136x64 TN F16+=E4M3*E4M3 (SS form: A and B are both passed as 64-bit values desc_a / desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand, %39
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand, %40
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate ("n") operand emitted right after e, %37
+>
+struct GMMA_64x136x64_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably "no separate D operand" because d00..d33 are updated in place - confirm
+  using ARegisters = uint64_t[1];  // one 64-bit value: desc_a
+  using ERegisters = uint32_t[1];  // one 32-bit value: e
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = uint32_t[34];  // d00..d33; presumably two f16 results per 32-bit register - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // asm operand %34 ("l")
+      uint64_t const& desc_b,  // asm operand %35 ("l")
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d33: read-write ("+r") operands %0..%33
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33,
+      uint32_t const& e,  // asm operand %36 ("r"); presumably the sparsity metadata word - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value (%38); compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the call-site line and both 64-bit operands
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this asm block
+      "setp.ne.b32 p, %38, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n136k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // accumulator list d00..d33
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33},"
+      " %34,"  // desc_a
+      " %35,"  // desc_b
+      " %36, %37,"  // e, spsel
+      " p,   %39, %40;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%33 (also read as inputs)
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33)
+      :  "l"(desc_a),  // %34
+         "l"(desc_b),  // %35
+         "r"(e), "n"(int32_t(spsel)),  // %36, %37
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %38, %39, %40
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: fma() only reports an invalid control path with the message below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x136x64 TN F16+=E4M3*E4M3 (RS form: A is passed in four 32-bit registers a00..a03, B as the 64-bit value desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand, %42
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand, %43
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate ("n") operand emitted right after e, %40
+>
+struct GMMA_64x136x64_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably "no separate D operand" because d00..d33 are updated in place - confirm
+  using ARegisters = uint32_t[4];  // four 32-bit values: a00..a03
+  using ERegisters = uint32_t[1];  // one 32-bit value: e
+  using BRegisters = uint64_t[1];  // one 64-bit value: desc_b
+  using CRegisters = uint32_t[34];  // d00..d33; presumably two f16 results per 32-bit register - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // asm operands %34..%37 ("r")
+      uint64_t const& desc_b,  // asm operand %38 ("l")
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d33: read-write ("+r") operands %0..%33
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33,
+      uint32_t const& e,  // asm operand %39 ("r"); presumably the sparsity metadata word - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value (%41); compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the call-site line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this asm block
+      "setp.ne.b32 p, %41, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n136k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // accumulator list d00..d33
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33},"
+      "{%34, %35, %36, %37},"  // a00..a03
+      " %38,"  // desc_b
+      " %39, %40,"  // e, spsel
+      " p,   %42, %43;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%33 (also read as inputs)
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %34..%37
+         "l"(desc_b),  // %38
+         "r"(e), "n"(int32_t(spsel)),  // %39, %40
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %41, %42, %43
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: fma() only reports an invalid control path with the message below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x136x64 TN F32+=E4M3*E4M3 (SS form: A and B both via 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x136x64_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;          // fma takes no separate D arguments; d00..d67 are read-write
+  using ARegisters = uint64_t[1];   // desc_a
+  using ERegisters = uint32_t[1];   // e
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = float[68];     // d00..d67
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime operand, compared against 0 in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %72, 0;\n"  // p = (scale_D != 0); %72 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n136k64.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%67: d00..d67
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67},"
+      " %68,"  // desc_a
+      " %69,"  // desc_b
+      " %70, %71,"  // e, spsel
+      " p,    %73,  %74;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": spsel is a compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x136x64 TN F32+=E4M3*E4M3 (RS form: A in four 32-bit registers, B via a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x136x64_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;          // fma takes no separate D arguments; d00..d67 are read-write
+  using ARegisters = uint32_t[4];   // a00..a03
+  using ERegisters = uint32_t[1];   // e
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = float[68];     // d00..d67
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime operand, compared against 0 in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %75, 0;\n"  // p = (scale_D != 0); %75 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n136k64.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%67: d00..d67
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67},"
+      "{%68,  %69,  %70,  %71},"  // a00..a03
+      " %72,"  // desc_b
+      " %73, %74,"  // e, spsel
+      " p,    %76,  %77;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": spsel is a compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x64 TN F16+=E4M3*E4M3 (SS form: A and B both via 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x144x64_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;          // fma takes no separate D arguments; d00..d35 are read-write
+  using ARegisters = uint64_t[1];   // desc_a
+  using ERegisters = uint32_t[1];   // e
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = uint32_t[36];  // d00..d35
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime operand, compared against 0 in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %40, 0;\n"  // p = (scale_D != 0); %40 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n144k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%35: d00..d35
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"
+      " %36,"  // desc_a
+      " %37,"  // desc_b
+      " %38, %39,"  // e, spsel
+      " p,   %41, %42;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": spsel is a compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x64 TN F16+=E4M3*E4M3 (RS form: A in four 32-bit registers, B via a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x144x64_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;          // fma takes no separate D arguments; d00..d35 are read-write
+  using ARegisters = uint32_t[4];   // a00..a03
+  using ERegisters = uint32_t[1];   // e
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = uint32_t[36];  // d00..d35
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime operand, compared against 0 in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %43, 0;\n"  // p = (scale_D != 0); %43 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n144k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%35: d00..d35
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"
+      "{%36, %37, %38, %39},"  // a00..a03
+      " %40,"  // desc_b
+      " %41, %42,"  // e, spsel
+      " p,   %44, %45;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": spsel is a compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x64 TN F32+=E4M3*E4M3 (SS form: A and B both via 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x144x64_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;          // fma takes no separate D arguments; d00..d71 are read-write
+  using ARegisters = uint64_t[1];   // desc_a
+  using ERegisters = uint32_t[1];   // e
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = float[72];     // d00..d71
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime operand, compared against 0 in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %76, 0;\n"  // p = (scale_D != 0); %76 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n144k64.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%71: d00..d71
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      " %72,"  // desc_a
+      " %73,"  // desc_b
+      " %74, %75,"  // e, spsel
+      " p,    %77,  %78;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": spsel is a compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x64 TN F32+=E4M3*E4M3 (RS form: A in four 32-bit registers, B via a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x144x64_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;          // fma takes no separate D arguments; d00..d71 are read-write
+  using ARegisters = uint32_t[4];   // a00..a03
+  using ERegisters = uint32_t[1];   // e
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = float[72];     // d00..d71
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime operand, compared against 0 in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %79, 0;\n"  // p = (scale_D != 0); %79 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n144k64.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%71: d00..d71
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      "{%72,  %73,  %74,  %75},"  // a00..a03
+      " %76,"  // desc_b
+      " %77, %78,"  // e, spsel
+      " p,    %80,  %81;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": spsel is a compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x152x64 TN F16+=E4M3*E4M3 (SS form: A and B both via 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x152x64_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;          // fma takes no separate D arguments; d00..d37 are read-write
+  using ARegisters = uint64_t[1];   // desc_a
+  using ERegisters = uint32_t[1];   // e
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = uint32_t[38];  // d00..d37
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime operand, compared against 0 in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %42, 0;\n"  // p = (scale_D != 0); %42 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n152k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%37: d00..d37
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37},"
+      " %38,"  // desc_a
+      " %39,"  // desc_b
+      " %40, %41,"  // e, spsel
+      " p,   %43, %44;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": spsel is a compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x152x64 TN F16+=E4M3*E4M3 (RS form: A in four 32-bit registers, B via a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x152x64_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;          // fma takes no separate D arguments; d00..d37 are read-write
+  using ARegisters = uint32_t[4];   // a00..a03
+  using ERegisters = uint32_t[1];   // e
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = uint32_t[38];  // d00..d37
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime operand, compared against 0 in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %45, 0;\n"  // p = (scale_D != 0); %45 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n152k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%37: d00..d37
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37},"
+      "{%38, %39, %40, %41},"  // a00..a03
+      " %42,"  // desc_b
+      " %43, %44,"  // e, spsel
+      " p,   %46, %47;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": spsel is a compile-time immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x152x64 TN F32+=E4M3*E4M3 -- SS form: A and B are both passed as 64-bit descriptors; fma() issues one wgmma.mma_async.sp instruction
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the instruction as an immediate (n constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the instruction as an immediate (n constraint)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // sparse selector, passed as an immediate (n constraint)
+>
+struct GMMA_64x152x64_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // declared void; fma() updates the accumulators in place through its reference arguments
+  using ARegisters = uint64_t[1];  // A: one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];  // E: one 32-bit word (e); presumably the sparsity metadata -- confirm against the PTX ISA
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = float[76];  // C: 76 float accumulators (d00..d75)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // in: 64-bit descriptor for A
+      uint64_t const& desc_b,  // in: 64-bit descriptor for B
+      float         & d00, float         & d01, float         & d02, float         & d03,  // in/out: accumulators d00..d75, read and written by the instruction
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      uint32_t const& e,  // in: 32-bit E operand
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // in: compared against zero to form the predicate handed to the instruction
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the PTX braces
+      "setp.ne.b32 p, %80, 0;\n"  // p = (scale_D != 0); %80 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n152k64.f32.e4m3.e4m3 "  // mnemonic only; the operand list follows in the next literals
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%75: accumulators d00..d75
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75},"
+      " %76,"  // desc_a
+      " %77,"  // desc_b
+      " %78, %79,"  // e, spsel
+      " p,    %81,  %82;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs: read-write (+) float registers, numbered from %0
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
+      :  "l"(desc_a),  // inputs start here: %76 (64-bit)
+         "l"(desc_b),  // %77 (64-bit)
+         "r"(e), "n"(int32_t(spsel)),  // %78 (32-bit register), %79 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %80 (register), %81 and %82 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x152x64 TN F32+=E4M3*E4M3 -- RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor; fma() issues one wgmma.mma_async.sp instruction
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the instruction as an immediate (n constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the instruction as an immediate (n constraint)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // sparse selector, passed as an immediate (n constraint)
+>
+struct GMMA_64x152x64_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // declared void; fma() updates the accumulators in place through its reference arguments
+  using ARegisters = uint32_t[4];  // A: four 32-bit registers (a00..a03)
+  using ERegisters = uint32_t[1];  // E: one 32-bit word (e); presumably the sparsity metadata -- confirm against the PTX ISA
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = float[76];  // C: 76 float accumulators (d00..d75)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // in: A operand registers
+      uint64_t const& desc_b,  // in: 64-bit descriptor for B
+      float         & d00, float         & d01, float         & d02, float         & d03,  // in/out: accumulators d00..d75, read and written by the instruction
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      uint32_t const& e,  // in: 32-bit E operand
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // in: compared against zero to form the predicate handed to the instruction
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the PTX braces
+      "setp.ne.b32 p, %83, 0;\n"  // p = (scale_D != 0); %83 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n152k64.f32.e4m3.e4m3 "  // mnemonic only; the operand list follows in the next literals
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%75: accumulators d00..d75
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75},"
+      "{%76,  %77,  %78,  %79},"  // a00..a03
+      " %80,"  // desc_b
+      " %81, %82,"  // e, spsel
+      " p,    %84,  %85;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs: read-write (+) float registers, numbered from %0
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs start here: %76..%79 (32-bit registers)
+         "l"(desc_b),  // %80 (64-bit)
+         "r"(e), "n"(int32_t(spsel)),  // %81 (32-bit register), %82 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %83 (register), %84 and %85 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x64 TN F16+=E4M3*E4M3 -- SS form: A and B are both passed as 64-bit descriptors; fma() issues one wgmma.mma_async.sp instruction
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the instruction as an immediate (n constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the instruction as an immediate (n constraint)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // sparse selector, passed as an immediate (n constraint)
+>
+struct GMMA_64x160x64_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // declared void; fma() updates the accumulators in place through its reference arguments
+  using ARegisters = uint64_t[1];  // A: one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];  // E: one 32-bit word (e); presumably the sparsity metadata -- confirm against the PTX ISA
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[40];  // C: 40 32-bit accumulator registers (d00..d39); presumably two packed f16 values each -- confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // in: 64-bit descriptor for A
+      uint64_t const& desc_b,  // in: 64-bit descriptor for B
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // in/out: accumulators d00..d39, read and written by the instruction
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t const& e,  // in: 32-bit E operand
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // in: compared against zero to form the predicate handed to the instruction
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the PTX braces
+      "setp.ne.b32 p, %44, 0;\n"  // p = (scale_D != 0); %44 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n160k64.f16.e4m3.e4m3 "  // mnemonic only; the operand list follows in the next literals
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%39: accumulators d00..d39
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      " %40,"  // desc_a
+      " %41,"  // desc_b
+      " %42, %43,"  // e, spsel
+      " p,   %45, %46;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs: read-write (+) 32-bit registers, numbered from %0
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "l"(desc_a),  // inputs start here: %40 (64-bit)
+         "l"(desc_b),  // %41 (64-bit)
+         "r"(e), "n"(int32_t(spsel)),  // %42 (32-bit register), %43 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %44 (register), %45 and %46 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x64 TN F16+=E4M3*E4M3 -- RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor; fma() issues one wgmma.mma_async.sp instruction
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the instruction as an immediate (n constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the instruction as an immediate (n constraint)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // sparse selector, passed as an immediate (n constraint)
+>
+struct GMMA_64x160x64_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // declared void; fma() updates the accumulators in place through its reference arguments
+  using ARegisters = uint32_t[4];  // A: four 32-bit registers (a00..a03)
+  using ERegisters = uint32_t[1];  // E: one 32-bit word (e); presumably the sparsity metadata -- confirm against the PTX ISA
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[40];  // C: 40 32-bit accumulator registers (d00..d39); presumably two packed f16 values each -- confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // in: A operand registers
+      uint64_t const& desc_b,  // in: 64-bit descriptor for B
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // in/out: accumulators d00..d39, read and written by the instruction
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t const& e,  // in: 32-bit E operand
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // in: compared against zero to form the predicate handed to the instruction
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the PTX braces
+      "setp.ne.b32 p, %47, 0;\n"  // p = (scale_D != 0); %47 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n160k64.f16.e4m3.e4m3 "  // mnemonic only; the operand list follows in the next literals
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%39: accumulators d00..d39
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      "{%40, %41, %42, %43},"  // a00..a03
+      " %44,"  // desc_b
+      " %45, %46,"  // e, spsel
+      " p,   %48, %49;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs: read-write (+) 32-bit registers, numbered from %0
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs start here: %40..%43 (32-bit registers)
+         "l"(desc_b),  // %44 (64-bit)
+         "r"(e), "n"(int32_t(spsel)),  // %45 (32-bit register), %46 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %47 (register), %48 and %49 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x64 TN F32+=E4M3*E4M3 -- SS form: A and B are both passed as 64-bit descriptors; fma() issues one wgmma.mma_async.sp instruction
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the instruction as an immediate (n constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the instruction as an immediate (n constraint)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // sparse selector, passed as an immediate (n constraint)
+>
+struct GMMA_64x160x64_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // declared void; fma() updates the accumulators in place through its reference arguments
+  using ARegisters = uint64_t[1];  // A: one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];  // E: one 32-bit word (e); presumably the sparsity metadata -- confirm against the PTX ISA
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = float[80];  // C: 80 float accumulators (d00..d79)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // in: 64-bit descriptor for A
+      uint64_t const& desc_b,  // in: 64-bit descriptor for B
+      float         & d00, float         & d01, float         & d02, float         & d03,  // in/out: accumulators d00..d79, read and written by the instruction
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      uint32_t const& e,  // in: 32-bit E operand
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // in: compared against zero to form the predicate handed to the instruction
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the PTX braces
+      "setp.ne.b32 p, %84, 0;\n"  // p = (scale_D != 0); %84 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n160k64.f32.e4m3.e4m3 "  // mnemonic only; the operand list follows in the next literals
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%79: accumulators d00..d79
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      " %80,"  // desc_a
+      " %81,"  // desc_b
+      " %82, %83,"  // e, spsel
+      " p,    %85,  %86;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs: read-write (+) float registers, numbered from %0
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
+      :  "l"(desc_a),  // inputs start here: %80 (64-bit)
+         "l"(desc_b),  // %81 (64-bit)
+         "r"(e), "n"(int32_t(spsel)),  // %82 (32-bit register), %83 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %84 (register), %85 and %86 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x64 TN F32+=E4M3*E4M3 -- RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor; fma() issues one wgmma.mma_async.sp instruction
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the instruction as an immediate (n constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the instruction as an immediate (n constraint)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // sparse selector, passed as an immediate (n constraint)
+>
+struct GMMA_64x160x64_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // declared void; fma() updates the accumulators in place through its reference arguments
+  using ARegisters = uint32_t[4];  // A: four 32-bit registers (a00..a03)
+  using ERegisters = uint32_t[1];  // E: one 32-bit word (e); presumably the sparsity metadata -- confirm against the PTX ISA
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = float[80];  // C: 80 float accumulators (d00..d79)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // in: A operand registers
+      uint64_t const& desc_b,  // in: 64-bit descriptor for B
+      float         & d00, float         & d01, float         & d02, float         & d03,  // in/out: accumulators d00..d79, read and written by the instruction
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      uint32_t const& e,  // in: 32-bit E operand
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // in: compared against zero to form the predicate handed to the instruction
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the PTX braces
+      "setp.ne.b32 p, %87, 0;\n"  // p = (scale_D != 0); %87 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n160k64.f32.e4m3.e4m3 "  // mnemonic only; the operand list follows in the next literals
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%79: accumulators d00..d79
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      "{%80,  %81,  %82,  %83},"  // a00..a03
+      " %84,"  // desc_b
+      " %85, %86,"  // e, spsel
+      " p,    %88,  %89;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs: read-write (+) float registers, numbered from %0
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs start here: %80..%83 (32-bit registers)
+         "l"(desc_b),  // %84 (64-bit)
+         "r"(e), "n"(int32_t(spsel)),  // %85 (32-bit register), %86 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %87 (register), %88 and %89 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x168x64 TN F16+=E4M3*E4M3 -- SS form: A and B are both passed as 64-bit descriptors; fma() issues one wgmma.mma_async.sp instruction
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the instruction as an immediate (n constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the instruction as an immediate (n constraint)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // sparse selector, passed as an immediate (n constraint)
+>
+struct GMMA_64x168x64_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // declared void; fma() updates the accumulators in place through its reference arguments
+  using ARegisters = uint64_t[1];  // A: one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];  // E: one 32-bit word (e); presumably the sparsity metadata -- confirm against the PTX ISA
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[42];  // C: 42 32-bit accumulator registers (d00..d41); presumably two packed f16 values each -- confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // in: 64-bit descriptor for A
+      uint64_t const& desc_b,  // in: 64-bit descriptor for B
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // in/out: accumulators d00..d41, read and written by the instruction
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41,
+      uint32_t const& e,  // in: 32-bit E operand
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // in: compared against zero to form the predicate handed to the instruction
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the PTX braces
+      "setp.ne.b32 p, %46, 0;\n"  // p = (scale_D != 0); %46 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n168k64.f16.e4m3.e4m3 "  // mnemonic only; the operand list follows in the next literals
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%41: accumulators d00..d41
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41},"
+      " %42,"  // desc_a
+      " %43,"  // desc_b
+      " %44, %45,"  // e, spsel
+      " p,   %47, %48;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs: read-write (+) 32-bit registers, numbered from %0
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41)
+      :  "l"(desc_a),  // inputs start here: %42 (64-bit)
+         "l"(desc_b),  // %43 (64-bit)
+         "r"(e), "n"(int32_t(spsel)),  // %44 (32-bit register), %45 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %46 (register), %47 and %48 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x168x64 TN F16+=E4M3*E4M3 -- RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor; fma() issues one wgmma.mma_async.sp instruction
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the instruction as an immediate (n constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the instruction as an immediate (n constraint)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // sparse selector, passed as an immediate (n constraint)
+>
+struct GMMA_64x168x64_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // declared void; fma() updates the accumulators in place through its reference arguments
+  using ARegisters = uint32_t[4];  // A: four 32-bit registers (a00..a03)
+  using ERegisters = uint32_t[1];  // E: one 32-bit word (e); presumably the sparsity metadata -- confirm against the PTX ISA
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[42];  // C: 42 32-bit accumulator registers (d00..d41); presumably two packed f16 values each -- confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // in: A operand registers
+      uint64_t const& desc_b,  // in: 64-bit descriptor for B
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // in/out: accumulators d00..d41, read and written by the instruction
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41,
+      uint32_t const& e,  // in: 32-bit E operand
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // in: compared against zero to form the predicate handed to the instruction
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the PTX braces
+      "setp.ne.b32 p, %49, 0;\n"  // p = (scale_D != 0); %49 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n168k64.f16.e4m3.e4m3 "  // mnemonic only; the operand list follows in the next literals
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%41: accumulators d00..d41
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41},"
+      "{%42, %43, %44, %45},"  // a00..a03
+      " %46,"  // desc_b
+      " %47, %48,"  // e, spsel
+      " p,   %50, %51;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs: read-write (+) 32-bit registers, numbered from %0
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs start here: %42..%45 (32-bit registers)
+         "l"(desc_b),  // %46 (64-bit)
+         "r"(e), "n"(int32_t(spsel)),  // %47 (32-bit register), %48 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %49 (register), %50 and %51 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x168x64 TN F32+=E4M3*E4M3 (SS form: A and B are each passed as one 64-bit descriptor; 84 float accumulators)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as immediate asm operand %89
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted as immediate asm operand %90
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted as immediate asm operand %87
+>
+struct GMMA_64x168x64_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // mirrors fma's single desc_a argument
+  using ERegisters = uint32_t[1];  // mirrors fma's single e argument
+  using BRegisters = uint64_t[1];  // mirrors fma's single desc_b argument
+  using CRegisters = float[84];  // mirrors fma's 84 accumulator references d00..d83
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      uint32_t const& e,  // asm operand %86; presumably the sparsity metadata word -- TODO confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // asm operand %88; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %88, 0;\n"  // %88 = int32_t(scale_D); p is true iff scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n168k64.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83},"  // %0..%83: accumulators d00..d83
+      " %84,"  // desc_a
+      " %85,"  // desc_b
+      " %86, %87,"  // e, spsel
+      " p,    %89,  %90;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0..%83: "+f" = read-write float operands
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
+      :  "l"(desc_a),  // %84
+         "l"(desc_b),  // %85
+         "r"(e), "n"(int32_t(spsel)),  // %86, %87 ("n" = compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %88, %89, %90
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x168x64 TN F32+=E4M3*E4M3 (RS form: A is passed as four 32-bit register operands, B as one 64-bit descriptor; 84 float accumulators)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as immediate asm operand %92
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted as immediate asm operand %93
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted as immediate asm operand %90
+>
+struct GMMA_64x168x64_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // mirrors fma's a00..a03 arguments
+  using ERegisters = uint32_t[1];  // mirrors fma's single e argument
+  using BRegisters = uint64_t[1];  // mirrors fma's single desc_b argument
+  using CRegisters = float[84];  // mirrors fma's 84 accumulator references d00..d83
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      uint32_t const& e,  // asm operand %89; presumably the sparsity metadata word -- TODO confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // asm operand %91; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %91, 0;\n"  // %91 = int32_t(scale_D); p is true iff scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n168k64.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83},"  // %0..%83: accumulators d00..d83
+      "{%84,  %85,  %86,  %87},"  // a00..a03
+      " %88,"  // desc_b
+      " %89, %90,"  // e, spsel
+      " p,    %92,  %93;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0..%83: "+f" = read-write float operands
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %84..%87
+         "l"(desc_b),  // %88
+         "r"(e), "n"(int32_t(spsel)),  // %89, %90 ("n" = compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %91, %92, %93
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x64 TN F16+=E4M3*E4M3 (SS form: A and B are each passed as one 64-bit descriptor; 44 uint32_t accumulator registers)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as immediate asm operand %49
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted as immediate asm operand %50
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted as immediate asm operand %47
+>
+struct GMMA_64x176x64_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // mirrors fma's single desc_a argument
+  using ERegisters = uint32_t[1];  // mirrors fma's single e argument
+  using BRegisters = uint64_t[1];  // mirrors fma's single desc_b argument
+  using CRegisters = uint32_t[44];  // mirrors fma's 44 accumulator references d00..d43
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t const& e,  // asm operand %46; presumably the sparsity metadata word -- TODO confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // asm operand %48; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %48, 0;\n"  // %48 = int32_t(scale_D); p is true iff scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n176k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"  // %0..%43: accumulators d00..d43
+      " %44,"  // desc_a
+      " %45,"  // desc_b
+      " %46, %47,"  // e, spsel
+      " p,   %49, %50;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%43: "+r" = read-write 32-bit register operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
+      :  "l"(desc_a),  // %44
+         "l"(desc_b),  // %45
+         "r"(e), "n"(int32_t(spsel)),  // %46, %47 ("n" = compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %48, %49, %50
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x64 TN F16+=E4M3*E4M3 (RS form: A is passed as four 32-bit register operands, B as one 64-bit descriptor; 44 uint32_t accumulator registers)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as immediate asm operand %52
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted as immediate asm operand %53
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted as immediate asm operand %50
+>
+struct GMMA_64x176x64_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // mirrors fma's a00..a03 arguments
+  using ERegisters = uint32_t[1];  // mirrors fma's single e argument
+  using BRegisters = uint64_t[1];  // mirrors fma's single desc_b argument
+  using CRegisters = uint32_t[44];  // mirrors fma's 44 accumulator references d00..d43
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t const& e,  // asm operand %49; presumably the sparsity metadata word -- TODO confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // asm operand %51; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %51, 0;\n"  // %51 = int32_t(scale_D); p is true iff scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n176k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"  // %0..%43: accumulators d00..d43
+      "{%44, %45, %46, %47},"  // a00..a03
+      " %48,"  // desc_b
+      " %49, %50,"  // e, spsel
+      " p,   %52, %53;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%43: "+r" = read-write 32-bit register operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %44..%47
+         "l"(desc_b),  // %48
+         "r"(e), "n"(int32_t(spsel)),  // %49, %50 ("n" = compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %51, %52, %53
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x64 TN F32+=E4M3*E4M3 (SS form: A and B are each passed as one 64-bit descriptor; 88 float accumulators)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as immediate asm operand %93
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted as immediate asm operand %94
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted as immediate asm operand %91
+>
+struct GMMA_64x176x64_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // mirrors fma's single desc_a argument
+  using ERegisters = uint32_t[1];  // mirrors fma's single e argument
+  using BRegisters = uint64_t[1];  // mirrors fma's single desc_b argument
+  using CRegisters = float[88];  // mirrors fma's 88 accumulator references d00..d87
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      uint32_t const& e,  // asm operand %90; presumably the sparsity metadata word -- TODO confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // asm operand %92; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %92, 0;\n"  // %92 = int32_t(scale_D); p is true iff scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n176k64.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"  // %0..%87: accumulators d00..d87
+      " %88,"  // desc_a
+      " %89,"  // desc_b
+      " %90, %91,"  // e, spsel
+      " p,    %93,  %94;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0..%87: "+f" = read-write float operands
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
+      :  "l"(desc_a),  // %88
+         "l"(desc_b),  // %89
+         "r"(e), "n"(int32_t(spsel)),  // %90, %91 ("n" = compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %92, %93, %94
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x64 TN F32+=E4M3*E4M3 (RS form: A is passed as four 32-bit register operands, B as one 64-bit descriptor; 88 float accumulators)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as immediate asm operand %96
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted as immediate asm operand %97
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted as immediate asm operand %94
+>
+struct GMMA_64x176x64_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // mirrors fma's a00..a03 arguments
+  using ERegisters = uint32_t[1];  // mirrors fma's single e argument
+  using BRegisters = uint64_t[1];  // mirrors fma's single desc_b argument
+  using CRegisters = float[88];  // mirrors fma's 88 accumulator references d00..d87
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      uint32_t const& e,  // asm operand %93; presumably the sparsity metadata word -- TODO confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // asm operand %95; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %95, 0;\n"  // %95 = int32_t(scale_D); p is true iff scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n176k64.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"  // %0..%87: accumulators d00..d87
+      "{%88,  %89,  %90,  %91},"  // a00..a03
+      " %92,"  // desc_b
+      " %93, %94,"  // e, spsel
+      " p,    %96,  %97;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0..%87: "+f" = read-write float operands
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %88..%91
+         "l"(desc_b),  // %92
+         "r"(e), "n"(int32_t(spsel)),  // %93, %94 ("n" = compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %95, %96, %97
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x184x64 TN F16+=E4M3*E4M3 (SS form: A and B are each passed as one 64-bit descriptor; 46 uint32_t accumulator registers)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as immediate asm operand %51
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted as immediate asm operand %52
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted as immediate asm operand %49
+>
+struct GMMA_64x184x64_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // mirrors fma's single desc_a argument
+  using ERegisters = uint32_t[1];  // mirrors fma's single e argument
+  using BRegisters = uint64_t[1];  // mirrors fma's single desc_b argument
+  using CRegisters = uint32_t[46];  // mirrors fma's 46 accumulator references d00..d45
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45,
+      uint32_t const& e,  // asm operand %48; presumably the sparsity metadata word -- TODO confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // asm operand %50; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %50, 0;\n"  // %50 = int32_t(scale_D); p is true iff scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n184k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45},"  // %0..%45: accumulators d00..d45
+      " %46,"  // desc_a
+      " %47,"  // desc_b
+      " %48, %49,"  // e, spsel
+      " p,   %51, %52;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%45: "+r" = read-write 32-bit register operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45)
+      :  "l"(desc_a),  // %46
+         "l"(desc_b),  // %47
+         "r"(e), "n"(int32_t(spsel)),  // %48, %49 ("n" = compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %50, %51, %52
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x184x64 TN F16+=E4M3*E4M3 (RS form: A is passed as four 32-bit register operands, B as one 64-bit descriptor; 46 uint32_t accumulator registers)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as immediate asm operand %54
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted as immediate asm operand %55
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted as immediate asm operand %52
+>
+struct GMMA_64x184x64_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // mirrors fma's a00..a03 arguments
+  using ERegisters = uint32_t[1];  // mirrors fma's single e argument
+  using BRegisters = uint64_t[1];  // mirrors fma's single desc_b argument
+  using CRegisters = uint32_t[46];  // mirrors fma's 46 accumulator references d00..d45
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45,
+      uint32_t const& e,  // asm operand %51; presumably the sparsity metadata word -- TODO confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // asm operand %53; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %53, 0;\n"  // %53 = int32_t(scale_D); p is true iff scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n184k64.f16.e4m3.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45},"  // %0..%45: accumulators d00..d45
+      "{%46, %47, %48, %49},"  // a00..a03
+      " %50,"  // desc_b
+      " %51, %52,"  // e, spsel
+      " p,   %54, %55;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%45: "+r" = read-write 32-bit register operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %46..%49
+         "l"(desc_b),  // %50
+         "r"(e), "n"(int32_t(spsel)),  // %51, %52 ("n" = compile-time immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %53, %54, %55
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x184x64 TN F32+=E4M3*E4M3 (SS form: A and B are each passed as one 64-bit descriptor; accumulates in place into 92 floats)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,   // compile-time; passed to the PTX as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,   // compile-time; passed to the PTX as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero // compile-time; immediate operand listed right after `e`
+>
+struct GMMA_64x184x64_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;         // no separate destination list: the d* arguments of fma() are read-modify-write
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e -- presumably the sparsity metadata word; TODO confirm against the PTX wgmma.mma_async.sp spec
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[92];    // d00..d91
+
+  CUTE_HOST_DEVICE static void     // emits the inline PTX below; argument order is A, B, accumulators, e, scale_D
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %96, 0;\n"                                      // p = (scale_D != 0); %96 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n184k64.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "             // %0..%91: accumulators d00..d91
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91},"
+      " %92,"                                                         // desc_a
+      " %93,"                                                         // desc_b
+      " %94, %95,"                                                    // e, spsel
+      " p,    %97,  %98;\n"                                           // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),                   // operands %0..%91, read-write ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
+      :  "l"(desc_a),                                                 // %92
+         "l"(desc_b),                                                 // %93
+         "r"(e), "n"(int32_t(spsel)),                                 // %94, %95
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %96, %97, %98
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x184x64 TN F32+=E4M3*E4M3 (RS form: A is passed as four 32-bit registers, B as one 64-bit descriptor; accumulates in place into 92 floats)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,   // compile-time; passed to the PTX as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,   // compile-time; passed to the PTX as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero // compile-time; immediate operand listed right after `e`
+>
+struct GMMA_64x184x64_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;         // no separate destination list: the d* arguments of fma() are read-modify-write
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e -- presumably the sparsity metadata word; TODO confirm against the PTX wgmma.mma_async.sp spec
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[92];    // d00..d91
+
+  CUTE_HOST_DEVICE static void     // emits the inline PTX below; argument order is A, B, accumulators, e, scale_D
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %99, 0;\n"                                      // p = (scale_D != 0); %99 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n184k64.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "             // %0..%91: accumulators d00..d91
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91},"
+      "{%92,  %93,  %94,  %95},"                                      // a00..a03
+      " %96,"                                                         // desc_b
+      " %97, %98,"                                                    // e, spsel
+      " p,    %100, %101;\n"                                          // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),                   // operands %0..%91, read-write ("+f")
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),                   // %92..%95
+         "l"(desc_b),                                                 // %96
+         "r"(e), "n"(int32_t(spsel)),                                 // %97, %98
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %99, %100, %101
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x200x64 TN F16+=E4M3*E4M3 (SS form: A and B are each passed as one 64-bit descriptor; accumulates in place into 50 32-bit registers)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,   // compile-time; passed to the PTX as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,   // compile-time; passed to the PTX as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero // compile-time; immediate operand listed right after `e`
+>
+struct GMMA_64x200x64_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;         // no separate destination list: the d* arguments of fma() are read-modify-write
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e -- presumably the sparsity metadata word; TODO confirm against the PTX wgmma.mma_async.sp spec
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[50]; // d00..d49; presumably two packed F16 values per register -- TODO confirm
+
+  CUTE_HOST_DEVICE static void     // emits the inline PTX below; argument order is A, B, accumulators, e, scale_D
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %54, 0;\n"                                      // p = (scale_D != 0); %54 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n200k64.f16.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "             // %0..%49: accumulators d00..d49
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49},"
+      " %50,"                                                         // desc_a
+      " %51,"                                                         // desc_b
+      " %52, %53,"                                                    // e, spsel
+      " p,    %55,  %56;\n"                                           // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),                   // operands %0..%49, read-write ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49)
+      :  "l"(desc_a),                                                 // %50
+         "l"(desc_b),                                                 // %51
+         "r"(e), "n"(int32_t(spsel)),                                 // %52, %53
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %54, %55, %56
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x200x64 TN F16+=E4M3*E4M3 (RS form: A is passed as four 32-bit registers, B as one 64-bit descriptor; accumulates in place into 50 32-bit registers)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,   // compile-time; passed to the PTX as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,   // compile-time; passed to the PTX as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero // compile-time; immediate operand listed right after `e`
+>
+struct GMMA_64x200x64_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;         // no separate destination list: the d* arguments of fma() are read-modify-write
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e -- presumably the sparsity metadata word; TODO confirm against the PTX wgmma.mma_async.sp spec
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[50]; // d00..d49; presumably two packed F16 values per register -- TODO confirm
+
+  CUTE_HOST_DEVICE static void     // emits the inline PTX below; argument order is A, B, accumulators, e, scale_D
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %57, 0;\n"                                      // p = (scale_D != 0); %57 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n200k64.f16.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "             // %0..%49: accumulators d00..d49
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49},"
+      "{%50,  %51,  %52,  %53},"                                      // a00..a03
+      " %54,"                                                         // desc_b
+      " %55, %56,"                                                    // e, spsel
+      " p,    %58,  %59;\n"                                           // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),                   // operands %0..%49, read-write ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),                   // %50..%53
+         "l"(desc_b),                                                 // %54
+         "r"(e), "n"(int32_t(spsel)),                                 // %55, %56
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %57, %58, %59
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x200x64 TN F32+=E4M3*E4M3 (SS form: A and B are each passed as one 64-bit descriptor; accumulates in place into 100 floats)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,   // compile-time; passed to the PTX as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,   // compile-time; passed to the PTX as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero // compile-time; immediate operand listed right after `e`
+>
+struct GMMA_64x200x64_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;         // no separate destination list: the d* arguments of fma() are read-modify-write
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e -- presumably the sparsity metadata word; TODO confirm against the PTX wgmma.mma_async.sp spec
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[100];   // d000..d099
+
+  CUTE_HOST_DEVICE static void     // emits the inline PTX below; argument order is A, B, accumulators, e, scale_D
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %104, 0;\n"                                     // p = (scale_D != 0); %104 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n200k64.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "             // %0..%99: accumulators d000..d099
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99},"
+      " %100,"                                                        // desc_a
+      " %101,"                                                        // desc_b
+      " %102, %103,"                                                  // e, spsel
+      " p,    %105, %106;\n"                                          // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),               // operands %0..%99, read-write ("+f")
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
+      :  "l"(desc_a),                                                 // %100
+         "l"(desc_b),                                                 // %101
+         "r"(e), "n"(int32_t(spsel)),                                 // %102, %103
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %104, %105, %106
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x200x64 TN F32+=E4M3*E4M3 (RS form: A is passed as four 32-bit registers, B as one 64-bit descriptor; accumulates in place into 100 floats)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,   // compile-time; passed to the PTX as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,   // compile-time; passed to the PTX as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero // compile-time; immediate operand listed right after `e`
+>
+struct GMMA_64x200x64_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;         // no separate destination list: the d* arguments of fma() are read-modify-write
+  using ARegisters = uint32_t[4];  // a000..a003
+  using ERegisters = uint32_t[1];  // e -- presumably the sparsity metadata word; TODO confirm against the PTX wgmma.mma_async.sp spec
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[100];   // d000..d099
+
+  CUTE_HOST_DEVICE static void     // emits the inline PTX below; argument order is A, B, accumulators, e, scale_D
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %107, 0;\n"                                     // p = (scale_D != 0); %107 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n200k64.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "             // %0..%99: accumulators d000..d099
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99},"
+      "{%100, %101, %102, %103},"                                     // a000..a003
+      " %104,"                                                        // desc_b
+      " %105, %106,"                                                  // e, spsel
+      " p,    %108, %109;\n"                                          // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),               // operands %0..%99, read-write ("+f")
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),               // %100..%103
+         "l"(desc_b),                                                 // %104
+         "r"(e), "n"(int32_t(spsel)),                                 // %105, %106
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %107, %108, %109
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x64 TN F16+=E4M3*E4M3, SS variant: A and B are each passed as one 64-bit descriptor; wraps wgmma.mma_async.sp m64n208k64.f16.e4m3.e4m3.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand that follows e in the instruction
+>
+struct GMMA_64x208x64_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D arguments; the d.. references are read-write
+  using ARegisters = uint64_t[1];  // A: one 64-bit operand (desc_a)
+  using ERegisters = uint32_t[1];  // E: one 32-bit operand (e) - presumably sparsity metadata; confirm against the PTX ISA
+  using BRegisters = uint64_t[1];  // B: one 64-bit operand (desc_b)
+  using CRegisters = uint32_t[52];  // 52 32-bit accumulator registers (d00..d51) - presumably two f16 values each; TODO confirm
+
+  CUTE_HOST_DEVICE static void  // Emits one wgmma instruction; d00..d51 are read-write ("+r") asm operands.
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %56, 0;\n"  // p = (int32_t(scale_D) != 0); %56 is the scale_D operand
+      "wgmma.mma_async.sp.sync.aligned.m64n208k64.f16.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"
+      " %52,"  // desc_a
+      " %53,"  // desc_b
+      " %54, %55,"  // e, spsel
+      " p,    %57,  %58;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%51: read-write accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
+      :  "l"(desc_a),  // %52
+         "l"(desc_b),  // %53
+         "r"(e), "n"(int32_t(spsel)),  // %54, %55
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %56, %57, %58
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x64 TN F16+=E4M3*E4M3, RS variant: A is passed in four 32-bit registers, B as one 64-bit descriptor; wraps wgmma.mma_async.sp m64n208k64.f16.e4m3.e4m3.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand that follows e in the instruction
+>
+struct GMMA_64x208x64_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D arguments; the d.. references are read-write
+  using ARegisters = uint32_t[4];  // A: four 32-bit register operands (a00..a03)
+  using ERegisters = uint32_t[1];  // E: one 32-bit operand (e) - presumably sparsity metadata; confirm against the PTX ISA
+  using BRegisters = uint64_t[1];  // B: one 64-bit operand (desc_b)
+  using CRegisters = uint32_t[52];  // 52 32-bit accumulator registers (d00..d51) - presumably two f16 values each; TODO confirm
+
+  CUTE_HOST_DEVICE static void  // Emits one wgmma instruction; d00..d51 are read-write ("+r") asm operands.
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the line number and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %59, 0;\n"  // p = (int32_t(scale_D) != 0); %59 is the scale_D operand
+      "wgmma.mma_async.sp.sync.aligned.m64n208k64.f16.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"
+      "{%52,  %53,  %54,  %55},"  // a00..a03
+      " %56,"  // desc_b
+      " %57, %58,"  // e, spsel
+      " p,    %60,  %61;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%51: read-write accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %52..%55
+         "l"(desc_b),  // %56
+         "r"(e), "n"(int32_t(spsel)),  // %57, %58
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %59, %60, %61
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x64 TN F32+=E4M3*E4M3, SS variant: A and B are each passed as one 64-bit descriptor; wraps wgmma.mma_async.sp m64n208k64.f32.e4m3.e4m3.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand that follows e in the instruction
+>
+struct GMMA_64x208x64_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D arguments; the d... references are read-write
+  using ARegisters = uint64_t[1];  // A: one 64-bit operand (desc_a)
+  using ERegisters = uint32_t[1];  // E: one 32-bit operand (e) - presumably sparsity metadata; confirm against the PTX ISA
+  using BRegisters = uint64_t[1];  // B: one 64-bit operand (desc_b)
+  using CRegisters = float[104];  // 104 float accumulator registers (d000..d103)
+
+  CUTE_HOST_DEVICE static void  // Emits one wgmma instruction; d000..d103 are read-write ("+f") asm operands.
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %108, 0;\n"  // p = (int32_t(scale_D) != 0); %108 is the scale_D operand
+      "wgmma.mma_async.sp.sync.aligned.m64n208k64.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      " %104,"  // desc_a
+      " %105,"  // desc_b
+      " %106, %107,"  // e, spsel
+      " p,    %109, %110;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%103: read-write accumulators
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
+      :  "l"(desc_a),  // %104
+         "l"(desc_b),  // %105
+         "r"(e), "n"(int32_t(spsel)),  // %106, %107
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %108, %109, %110
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x64 TN F32+=E4M3*E4M3, RS variant: A is passed in four 32-bit registers, B as one 64-bit descriptor; wraps wgmma.mma_async.sp m64n208k64.f32.e4m3.e4m3.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand that follows e in the instruction
+>
+struct GMMA_64x208x64_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D arguments; the d... references are read-write
+  using ARegisters = uint32_t[4];  // A: four 32-bit register operands (a000..a003)
+  using ERegisters = uint32_t[1];  // E: one 32-bit operand (e) - presumably sparsity metadata; confirm against the PTX ISA
+  using BRegisters = uint64_t[1];  // B: one 64-bit operand (desc_b)
+  using CRegisters = float[104];  // 104 float accumulator registers (d000..d103)
+
+  CUTE_HOST_DEVICE static void  // Emits one wgmma instruction; d000..d103 are read-write ("+f") asm operands.
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the line number and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %111, 0;\n"  // p = (int32_t(scale_D) != 0); %111 is the scale_D operand
+      "wgmma.mma_async.sp.sync.aligned.m64n208k64.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      "{%104, %105, %106, %107},"  // a000..a003
+      " %108,"  // desc_b
+      " %109, %110,"  // e, spsel
+      " p,    %112, %113;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%103: read-write accumulators
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %104..%107
+         "l"(desc_b),  // %108
+         "r"(e), "n"(int32_t(spsel)),  // %109, %110
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %111, %112, %113
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x216x64 TN F16+=E4M3*E4M3, SS variant: A and B are each passed as one 64-bit descriptor; wraps wgmma.mma_async.sp m64n216k64.f16.e4m3.e4m3.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand that follows e in the instruction
+>
+struct GMMA_64x216x64_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D arguments; the d.. references are read-write
+  using ARegisters = uint64_t[1];  // A: one 64-bit operand (desc_a)
+  using ERegisters = uint32_t[1];  // E: one 32-bit operand (e) - presumably sparsity metadata; confirm against the PTX ISA
+  using BRegisters = uint64_t[1];  // B: one 64-bit operand (desc_b)
+  using CRegisters = uint32_t[54];  // 54 32-bit accumulator registers (d00..d53) - presumably two f16 values each; TODO confirm
+
+  CUTE_HOST_DEVICE static void  // Emits one wgmma instruction; d00..d53 are read-write ("+r") asm operands.
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %58, 0;\n"  // p = (int32_t(scale_D) != 0); %58 is the scale_D operand
+      "wgmma.mma_async.sp.sync.aligned.m64n216k64.f16.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53},"
+      " %54,"  // desc_a
+      " %55,"  // desc_b
+      " %56, %57,"  // e, spsel
+      " p,    %59,  %60;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%53: read-write accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53)
+      :  "l"(desc_a),  // %54
+         "l"(desc_b),  // %55
+         "r"(e), "n"(int32_t(spsel)),  // %56, %57
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %58, %59, %60
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x216x64 TN F16+=E4M3*E4M3, RS variant: A is passed in four 32-bit registers, B as one 64-bit descriptor; wraps wgmma.mma_async.sp m64n216k64.f16.e4m3.e4m3.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand that follows e in the instruction
+>
+struct GMMA_64x216x64_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D arguments; the d.. references are read-write
+  using ARegisters = uint32_t[4];  // A: four 32-bit register operands (a00..a03)
+  using ERegisters = uint32_t[1];  // E: one 32-bit operand (e) - presumably sparsity metadata; confirm against the PTX ISA
+  using BRegisters = uint64_t[1];  // B: one 64-bit operand (desc_b)
+  using CRegisters = uint32_t[54];  // 54 32-bit accumulator registers (d00..d53) - presumably two f16 values each; TODO confirm
+
+  CUTE_HOST_DEVICE static void  // Emits one wgmma instruction; d00..d53 are read-write ("+r") asm operands.
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the line number and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %61, 0;\n"  // p = (int32_t(scale_D) != 0); %61 is the scale_D operand
+      "wgmma.mma_async.sp.sync.aligned.m64n216k64.f16.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53},"
+      "{%54,  %55,  %56,  %57},"  // a00..a03
+      " %58,"  // desc_b
+      " %59, %60,"  // e, spsel
+      " p,    %62,  %63;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%53: read-write accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %54..%57
+         "l"(desc_b),  // %58
+         "r"(e), "n"(int32_t(spsel)),  // %59, %60
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %61, %62, %63
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x216x64 TN F32+=E4M3*E4M3, SS variant: A and B are each passed as one 64-bit descriptor; wraps wgmma.mma_async.sp m64n216k64.f32.e4m3.e4m3.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand that follows e in the instruction
+>
+struct GMMA_64x216x64_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D arguments; the d... references are read-write
+  using ARegisters = uint64_t[1];  // A: one 64-bit operand (desc_a)
+  using ERegisters = uint32_t[1];  // E: one 32-bit operand (e) - presumably sparsity metadata; confirm against the PTX ISA
+  using BRegisters = uint64_t[1];  // B: one 64-bit operand (desc_b)
+  using CRegisters = float[108];  // 108 float accumulator registers (d000..d107)
+
+  CUTE_HOST_DEVICE static void  // Emits one wgmma instruction; d000..d107 are read-write ("+f") asm operands.
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %112, 0;\n"  // p = (int32_t(scale_D) != 0); %112 is the scale_D operand
+      "wgmma.mma_async.sp.sync.aligned.m64n216k64.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107},"
+      " %108,"  // desc_a
+      " %109,"  // desc_b
+      " %110, %111,"  // e, spsel
+      " p,    %113, %114;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%107: read-write accumulators
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
+      :  "l"(desc_a),  // %108
+         "l"(desc_b),  // %109
+         "r"(e), "n"(int32_t(spsel)),  // %110, %111
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %112, %113, %114
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x216x64 TN F32+=E4M3*E4M3 (RS form: A in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate ("n"), asm operand %116
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate ("n"), asm operand %117
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate ("n"), asm operand %114
+>
+struct GMMA_64x216x64_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a000..a003 -> asm operands %108..%111
+  using ERegisters = uint32_t[1];  // e -> asm operand %113 (presumably the sparsity metadata word -- confirm)
+  using BRegisters = uint64_t[1];  // desc_b -> asm operand %112
+  using CRegisters = float[108];  // d000..d107 -> read-write ("+f") asm operands %0..%107
+
+  CUTE_HOST_DEVICE static void  // wraps one inline-PTX wgmma.mma_async.sp instruction; accumulators are updated in place
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this { } PTX block
+      "setp.ne.b32 p, %115, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n216k64.f32.e4m3.e4m3 "  // sparse wgmma, m64n216k64, f32 result, e4m3 x e4m3 inputs
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107},"  // end of the 108 accumulator operands d000..d107
+      "{%108, %109, %110, %111},"  // a000..a003
+      " %112,"  // desc_b
+      " %113, %114,"  // e, spsel
+      " p,    %116, %117;\n"  // predicate p, then immediates scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // operands %108..%111
+         "l"(desc_b),  // operand %112
+         "r"(e), "n"(int32_t(spsel)),  // operands %113, %114
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands %115, %116, %117
+#else  // macro not defined: calling this wrapper is flagged as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x64 TN F16+=E4M3*E4M3 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate ("n"), asm operand %61
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate ("n"), asm operand %62
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate ("n"), asm operand %59
+>
+struct GMMA_64x224x64_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a -> asm operand %56
+  using ERegisters = uint32_t[1];  // e -> asm operand %58 (presumably the sparsity metadata word -- confirm)
+  using BRegisters = uint64_t[1];  // desc_b -> asm operand %57
+  using CRegisters = uint32_t[56];  // d00..d55 -> read-write ("+r") operands %0..%55; presumably two packed f16 each -- confirm
+
+  CUTE_HOST_DEVICE static void  // wraps one inline-PTX wgmma.mma_async.sp instruction; accumulators are updated in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this { } PTX block
+      "setp.ne.b32 p, %60, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n224k64.f16.e4m3.e4m3 "  // sparse wgmma, m64n224k64, f16 result, e4m3 x e4m3 inputs
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"  // end of the 56 accumulator operands d00..d55
+      " %56,"  // desc_a
+      " %57,"  // desc_b
+      " %58, %59,"  // e, spsel
+      " p,    %61,  %62;\n"  // predicate p, then immediates scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "l"(desc_a),  // operand %56
+         "l"(desc_b),  // operand %57
+         "r"(e), "n"(int32_t(spsel)),  // operands %58, %59
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands %60, %61, %62
+#else  // macro not defined: calling this wrapper is flagged as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x64 TN F16+=E4M3*E4M3 (RS form: A in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate ("n"), asm operand %64
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate ("n"), asm operand %65
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate ("n"), asm operand %62
+>
+struct GMMA_64x224x64_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03 -> asm operands %56..%59
+  using ERegisters = uint32_t[1];  // e -> asm operand %61 (presumably the sparsity metadata word -- confirm)
+  using BRegisters = uint64_t[1];  // desc_b -> asm operand %60
+  using CRegisters = uint32_t[56];  // d00..d55 -> read-write ("+r") operands %0..%55; presumably two packed f16 each -- confirm
+
+  CUTE_HOST_DEVICE static void  // wraps one inline-PTX wgmma.mma_async.sp instruction; accumulators are updated in place
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this { } PTX block
+      "setp.ne.b32 p, %63, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n224k64.f16.e4m3.e4m3 "  // sparse wgmma, m64n224k64, f16 result, e4m3 x e4m3 inputs
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"  // end of the 56 accumulator operands d00..d55
+      "{%56,  %57,  %58,  %59},"  // a00..a03
+      " %60,"  // desc_b
+      " %61, %62,"  // e, spsel
+      " p,    %64,  %65;\n"  // predicate p, then immediates scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // operands %56..%59
+         "l"(desc_b),  // operand %60
+         "r"(e), "n"(int32_t(spsel)),  // operands %61, %62
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands %63, %64, %65
+#else  // macro not defined: calling this wrapper is flagged as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x64 TN F32+=E4M3*E4M3 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate ("n"), asm operand %117
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate ("n"), asm operand %118
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate ("n"), asm operand %115
+>
+struct GMMA_64x224x64_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a -> asm operand %112
+  using ERegisters = uint32_t[1];  // e -> asm operand %114 (presumably the sparsity metadata word -- confirm)
+  using BRegisters = uint64_t[1];  // desc_b -> asm operand %113
+  using CRegisters = float[112];  // d000..d111 -> read-write ("+f") asm operands %0..%111
+
+  CUTE_HOST_DEVICE static void  // wraps one inline-PTX wgmma.mma_async.sp instruction; accumulators are updated in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this { } PTX block
+      "setp.ne.b32 p, %116, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n224k64.f32.e4m3.e4m3 "  // sparse wgmma, m64n224k64, f32 result, e4m3 x e4m3 inputs
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"  // end of the 112 accumulator operands d000..d111
+      " %112,"  // desc_a
+      " %113,"  // desc_b
+      " %114, %115,"  // e, spsel
+      " p,    %117, %118;\n"  // predicate p, then immediates scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
+      :  "l"(desc_a),  // operand %112
+         "l"(desc_b),  // operand %113
+         "r"(e), "n"(int32_t(spsel)),  // operands %114, %115
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands %116, %117, %118
+#else  // macro not defined: calling this wrapper is flagged as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x64 TN F32+=E4M3*E4M3 (RS form: A in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate ("n"), asm operand %120
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate ("n"), asm operand %121
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate ("n"), asm operand %118
+>
+struct GMMA_64x224x64_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a000..a003 -> asm operands %112..%115
+  using ERegisters = uint32_t[1];  // e -> asm operand %117 (presumably the sparsity metadata word -- confirm)
+  using BRegisters = uint64_t[1];  // desc_b -> asm operand %116
+  using CRegisters = float[112];  // d000..d111 -> read-write ("+f") asm operands %0..%111
+
+  CUTE_HOST_DEVICE static void  // wraps one inline-PTX wgmma.mma_async.sp instruction; accumulators are updated in place
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this { } PTX block
+      "setp.ne.b32 p, %119, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n224k64.f32.e4m3.e4m3 "  // sparse wgmma, m64n224k64, f32 result, e4m3 x e4m3 inputs
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"  // end of the 112 accumulator operands d000..d111
+      "{%112, %113, %114, %115},"  // a000..a003
+      " %116,"  // desc_b
+      " %117, %118,"  // e, spsel
+      " p,    %120, %121;\n"  // predicate p, then immediates scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // operands %112..%115
+         "l"(desc_b),  // operand %116
+         "r"(e), "n"(int32_t(spsel)),  // operands %117, %118
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands %119, %120, %121
+#else  // macro not defined: calling this wrapper is flagged as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x232x64 TN F16+=E4M3*E4M3 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate ("n"), asm operand %63
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate ("n"), asm operand %64
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate ("n"), asm operand %61
+>
+struct GMMA_64x232x64_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a -> asm operand %58
+  using ERegisters = uint32_t[1];  // e -> asm operand %60 (presumably the sparsity metadata word -- confirm)
+  using BRegisters = uint64_t[1];  // desc_b -> asm operand %59
+  using CRegisters = uint32_t[58];  // d00..d57 -> read-write ("+r") operands %0..%57; presumably two packed f16 each -- confirm
+
+  CUTE_HOST_DEVICE static void  // wraps one inline-PTX wgmma.mma_async.sp instruction; accumulators are updated in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p inside the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this { } PTX block
+      "setp.ne.b32 p, %62, 0;\n"  // p = (int32_t(scale_D) != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n232k64.f16.e4m3.e4m3 "  // sparse wgmma, m64n232k64, f16 result, e4m3 x e4m3 inputs
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57},"  // end of the 58 accumulator operands d00..d57
+      " %58,"  // desc_a
+      " %59,"  // desc_b
+      " %60, %61,"  // e, spsel
+      " p,    %63,  %64;\n"  // predicate p, then immediates scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57)
+      :  "l"(desc_a),  // operand %58
+         "l"(desc_b),  // operand %59
+         "r"(e), "n"(int32_t(spsel)),  // operands %60, %61
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands %62, %63, %64
+#else  // macro not defined: calling this wrapper is flagged as an invalid control path
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x232x64 TN F16+=E4M3*E4M3 (RS variant: A is passed as four 32-bit operands a00..a03, B as the 64-bit desc_b operand)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x232x64_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // one entry per a00..a03 argument of fma()
+  using ERegisters = uint32_t[1];  // the single `e` argument of fma()
+  using BRegisters = uint64_t[1];  // the single `desc_b` argument of fma()
+  using CRegisters = uint32_t[58];  // one entry per d00..d57 argument of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %65, 0;\n"  // %65 is scale_D, so p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n232k64.f16.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57},"  // %0..%57 are d00..d57
+      "{%58,  %59,  %60,  %61},"  // a00..a03
+      " %62,"  // desc_b
+      " %63, %64,"  // e, spsel
+      " p,    %66,  %67;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+" constraint: each dNN is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %58..%61
+         "l"(desc_b),  // %62
+         "r"(e), "n"(int32_t(spsel)),  // %63, %64 ("n": template parameter passed as an immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %65, %66, %67
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x232x64 TN F32+=E4M3*E4M3 (SS variant: A and B are each passed as one 64-bit operand, desc_a and desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x232x64_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // the single `desc_a` argument of fma()
+  using ERegisters = uint32_t[1];  // the single `e` argument of fma()
+  using BRegisters = uint64_t[1];  // the single `desc_b` argument of fma()
+  using CRegisters = float[116];  // one entry per d000..d115 argument of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %120, 0;\n"  // %120 is scale_D, so p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n232k64.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115},"  // %0..%115 are d000..d115
+      " %116,"  // desc_a
+      " %117,"  // desc_b
+      " %118, %119,"  // e, spsel
+      " p,    %121, %122;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // "+" constraint: each dNNN is both read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
+      :  "l"(desc_a),  // %116
+         "l"(desc_b),  // %117
+         "r"(e), "n"(int32_t(spsel)),  // %118, %119 ("n": template parameter passed as an immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %120, %121, %122
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x232x64 TN F32+=E4M3*E4M3 (RS variant: A is passed as four 32-bit operands a000..a003, B as the 64-bit desc_b operand)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x232x64_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // one entry per a000..a003 argument of fma()
+  using ERegisters = uint32_t[1];  // the single `e` argument of fma()
+  using BRegisters = uint64_t[1];  // the single `desc_b` argument of fma()
+  using CRegisters = float[116];  // one entry per d000..d115 argument of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %123, 0;\n"  // %123 is scale_D, so p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n232k64.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115},"  // %0..%115 are d000..d115
+      "{%116, %117, %118, %119},"  // a000..a003
+      " %120,"  // desc_b
+      " %121, %122,"  // e, spsel
+      " p,    %124, %125;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // "+" constraint: each dNNN is both read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %116..%119
+         "l"(desc_b),  // %120
+         "r"(e), "n"(int32_t(spsel)),  // %121, %122 ("n": template parameter passed as an immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %123, %124, %125
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x64 TN F16+=E4M3*E4M3 (SS variant: A and B are each passed as one 64-bit operand, desc_a and desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x240x64_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // the single `desc_a` argument of fma()
+  using ERegisters = uint32_t[1];  // the single `e` argument of fma()
+  using BRegisters = uint64_t[1];  // the single `desc_b` argument of fma()
+  using CRegisters = uint32_t[60];  // one entry per d00..d59 argument of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %64, 0;\n"  // %64 is scale_D, so p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n240k64.f16.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"  // %0..%59 are d00..d59
+      " %60,"  // desc_a
+      " %61,"  // desc_b
+      " %62, %63,"  // e, spsel
+      " p,    %65,  %66;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+" constraint: each dNN is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
+      :  "l"(desc_a),  // %60
+         "l"(desc_b),  // %61
+         "r"(e), "n"(int32_t(spsel)),  // %62, %63 ("n": template parameter passed as an immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %64, %65, %66
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x64 TN F16+=E4M3*E4M3 (RS variant: A is passed as four 32-bit operands a00..a03, B as the 64-bit desc_b operand)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x240x64_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // one entry per a00..a03 argument of fma()
+  using ERegisters = uint32_t[1];  // the single `e` argument of fma()
+  using BRegisters = uint64_t[1];  // the single `desc_b` argument of fma()
+  using CRegisters = uint32_t[60];  // one entry per d00..d59 argument of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %67, 0;\n"  // %67 is scale_D, so p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n240k64.f16.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"  // %0..%59 are d00..d59
+      "{%60,  %61,  %62,  %63},"  // a00..a03
+      " %64,"  // desc_b
+      " %65, %66,"  // e, spsel
+      " p,    %68,  %69;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+" constraint: each dNN is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %60..%63
+         "l"(desc_b),  // %64
+         "r"(e), "n"(int32_t(spsel)),  // %65, %66 ("n": template parameter passed as an immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %67, %68, %69
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x64 TN F32+=E4M3*E4M3 (SS variant: A and B are each passed as one 64-bit operand, desc_a and desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x240x64_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // the single `desc_a` argument of fma()
+  using ERegisters = uint32_t[1];  // the single `e` argument of fma()
+  using BRegisters = uint64_t[1];  // the single `desc_b` argument of fma()
+  using CRegisters = float[120];  // one entry per d000..d119 argument of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %124, 0;\n"  // %124 is scale_D, so p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n240k64.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"  // %0..%119 are d000..d119
+      " %120,"  // desc_a
+      " %121,"  // desc_b
+      " %122, %123,"  // e, spsel
+      " p,    %125, %126;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // "+" constraint: each dNNN is both read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
+      :  "l"(desc_a),  // %120
+         "l"(desc_b),  // %121
+         "r"(e), "n"(int32_t(spsel)),  // %122, %123 ("n": template parameter passed as an immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %124, %125, %126
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x64 TN F32+=E4M3*E4M3 (RS form: A passed as four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as immediate asm operand %128
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted as immediate asm operand %129
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted as immediate asm operand %126
+>
+struct GMMA_64x240x64_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D fragment; the d* arguments of fma are read-write
+  using ARegisters = uint32_t[4];  // same count/type as fma's a000..a003
+  using ERegisters = uint32_t[1];  // same count/type as fma's e
+  using BRegisters = uint64_t[1];  // same count/type as fma's desc_b
+  using CRegisters = float[120];  // same count/type as fma's d000..d119
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp instruction; d000..d119 are updated in place
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      uint32_t const& e,  // occupies the sp-meta operand slot of the PTX instruction (sparsity metadata per the PTX ISA -- confirm encoding with callers)
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %127, 0;\n"  // %127 = int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n240k64.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%119 = d000..d119 (read-write)
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      "{%120, %121, %122, %123},"  // a000..a003
+      " %124,"  // desc_b
+      " %125, %126,"  // e, spsel
+      " p,    %128, %129;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x248x64 TN F16+=E4M3*E4M3 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as immediate asm operand %67
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted as immediate asm operand %68
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted as immediate asm operand %65
+>
+struct GMMA_64x248x64_F16E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D fragment; the d* arguments of fma are read-write
+  using ARegisters = uint64_t[1];  // same count/type as fma's desc_a
+  using ERegisters = uint32_t[1];  // same count/type as fma's e
+  using BRegisters = uint64_t[1];  // same count/type as fma's desc_b
+  using CRegisters = uint32_t[62];  // same count/type as fma's d00..d61 (presumably two packed f16 values per register -- confirm)
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp instruction; d00..d61 are updated in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61,
+      uint32_t const& e,  // occupies the sp-meta operand slot of the PTX instruction (sparsity metadata per the PTX ISA -- confirm encoding with callers)
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %66, 0;\n"  // %66 = int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n248k64.f16.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%61 = d00..d61 (read-write)
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61},"
+      " %62,"  // desc_a
+      " %63,"  // desc_b
+      " %64, %65,"  // e, spsel
+      " p,    %67,  %68;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x248x64 TN F16+=E4M3*E4M3 (RS form: A passed as four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as immediate asm operand %70
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted as immediate asm operand %71
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted as immediate asm operand %68
+>
+struct GMMA_64x248x64_F16E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D fragment; the d* arguments of fma are read-write
+  using ARegisters = uint32_t[4];  // same count/type as fma's a00..a03
+  using ERegisters = uint32_t[1];  // same count/type as fma's e
+  using BRegisters = uint64_t[1];  // same count/type as fma's desc_b
+  using CRegisters = uint32_t[62];  // same count/type as fma's d00..d61 (presumably two packed f16 values per register -- confirm)
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp instruction; d00..d61 are updated in place
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61,
+      uint32_t const& e,  // occupies the sp-meta operand slot of the PTX instruction (sparsity metadata per the PTX ISA -- confirm encoding with callers)
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %69, 0;\n"  // %69 = int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n248k64.f16.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%61 = d00..d61 (read-write)
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61},"
+      "{%62,  %63,  %64,  %65},"  // a00..a03
+      " %66,"  // desc_b
+      " %67, %68,"  // e, spsel
+      " p,    %70,  %71;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x248x64 TN F32+=E4M3*E4M3 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as immediate asm operand %129
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted as immediate asm operand %130
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted as immediate asm operand %127
+>
+struct GMMA_64x248x64_F32E4M3E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D fragment; the d* arguments of fma are read-write
+  using ARegisters = uint64_t[1];  // same count/type as fma's desc_a
+  using ERegisters = uint32_t[1];  // same count/type as fma's e
+  using BRegisters = uint64_t[1];  // same count/type as fma's desc_b
+  using CRegisters = float[124];  // same count/type as fma's d000..d123
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp instruction; d000..d123 are updated in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      uint32_t const& e,  // occupies the sp-meta operand slot of the PTX instruction (sparsity metadata per the PTX ISA -- confirm encoding with callers)
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %128, 0;\n"  // %128 = int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n248k64.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%123 = d000..d123 (read-write)
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123},"
+      " %124,"  // desc_a
+      " %125,"  // desc_b
+      " %126, %127,"  // e, spsel
+      " p,    %129, %130;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x248x64 TN F32+=E4M3*E4M3 (RS form: A passed as four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as immediate asm operand %132
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted as immediate asm operand %133
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted as immediate asm operand %130
+>
+struct GMMA_64x248x64_F32E4M3E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D fragment; the d* arguments of fma are read-write
+  using ARegisters = uint32_t[4];  // same count/type as fma's a000..a003
+  using ERegisters = uint32_t[1];  // same count/type as fma's e
+  using BRegisters = uint64_t[1];  // same count/type as fma's desc_b
+  using CRegisters = float[124];  // same count/type as fma's d000..d123
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp instruction; d000..d123 are updated in place
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      uint32_t const& e,  // occupies the sp-meta operand slot of the PTX instruction (sparsity metadata per the PTX ISA -- confirm encoding with callers)
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %131, 0;\n"  // %131 = int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n248k64.f32.e4m3.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%123 = d000..d123 (read-write)
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123},"
+      "{%124, %125, %126, %127},"  // a000..a003
+      " %128,"  // desc_b
+      " %129, %130,"  // e, spsel
+      " p,    %132, %133;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x24x64 TN F16+=E4M3*E5M2 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as immediate asm operand %11
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted as immediate asm operand %12
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted as immediate asm operand %9
+>
+struct GMMA_64x24x64_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D fragment; the d* arguments of fma are read-write
+  using ARegisters = uint64_t[1];  // same count/type as fma's desc_a
+  using ERegisters = uint32_t[1];  // same count/type as fma's e
+  using BRegisters = uint64_t[1];  // same count/type as fma's desc_b
+  using CRegisters = uint32_t[6];  // same count/type as fma's d0..d5 (presumably two packed f16 values per register -- confirm)
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp instruction; d0..d5 are updated in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5,
+      uint32_t const& e,  // occupies the sp-meta operand slot of the PTX instruction (sparsity metadata per the PTX ISA -- confirm encoding with callers)
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %10, 0;\n"  // %10 = int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n24k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5},"  // d0..d5 (read-write)
+      " %6,"  // desc_a
+      " %7,"  // desc_b
+      " %8, %9,"  // e, spsel
+      " p,   %11, %12;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x24x64 TN F16+=E4M3*E5M2 (RS form: A passed as four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as immediate asm operand %14
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted as immediate asm operand %15
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted as immediate asm operand %12
+>
+struct GMMA_64x24x64_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D fragment; the d* arguments of fma are read-write
+  using ARegisters = uint32_t[4];  // same count/type as fma's a0..a3
+  using ERegisters = uint32_t[1];  // same count/type as fma's e
+  using BRegisters = uint64_t[1];  // same count/type as fma's desc_b
+  using CRegisters = uint32_t[6];  // same count/type as fma's d0..d5 (presumably two packed f16 values per register -- confirm)
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp instruction; d0..d5 are updated in place
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5,
+      uint32_t const& e,  // occupies the sp-meta operand slot of the PTX instruction (sparsity metadata per the PTX ISA -- confirm encoding with callers)
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %13, 0;\n"  // %13 = int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n24k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5},"  // d0..d5 (read-write)
+      "{%6,  %7,  %8,  %9},"  // a0..a3
+      " %10,"  // desc_b
+      " %11, %12,"  // e, spsel
+      " p,   %14, %15;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x24x64 TN F32+=E4M3*E5M2 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; immediate operand placed right after e
+>
+struct GMMA_64x24x64_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate destination list: the accumulators are updated in place
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[12];  // d00..d11
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata for A - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime operand; compared against 0 to build predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %16, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n24k64.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"  // accumulators d00..d11
+      " %12,"  // desc_a
+      " %13,"  // desc_b
+      " %14, %15,"  // e, spsel
+      " p,   %17, %18;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)  // %0..%11, read-write
+      :  "l"(desc_a),  // %12
+         "l"(desc_b),  // %13
+         "r"(e), "n"(int32_t(spsel)),  // %14, %15
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %16, %17, %18
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x24x64 TN F32+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; immediate operand placed right after e
+>
+struct GMMA_64x24x64_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate destination list: the accumulators are updated in place
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[12];  // d00..d11
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata for A - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime operand; compared against 0 to build predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %19, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n24k64.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"  // accumulators d00..d11
+      "{%12, %13, %14, %15},"  // a00..a03
+      " %16,"  // desc_b
+      " %17, %18,"  // e, spsel
+      " p,   %20, %21;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)  // %0..%11, read-write
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %12..%15
+         "l"(desc_b),  // %16
+         "r"(e), "n"(int32_t(spsel)),  // %17, %18
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %19, %20, %21
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x40x64 TN F16+=E4M3*E5M2 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; immediate operand placed right after e
+>
+struct GMMA_64x40x64_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate destination list: the accumulators are updated in place
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[10];  // d00..d09 (NOTE(review): presumably two f16 values packed per register - confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata for A - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime operand; compared against 0 to build predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %14, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n40k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9},"  // accumulators d00..d09
+      " %10,"  // desc_a
+      " %11,"  // desc_b
+      " %12, %13,"  // e, spsel
+      " p,   %15, %16;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09)  // %0..%9, read-write
+      :  "l"(desc_a),  // %10
+         "l"(desc_b),  // %11
+         "r"(e), "n"(int32_t(spsel)),  // %12, %13
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %14, %15, %16
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x40x64 TN F16+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; immediate operand placed right after e
+>
+struct GMMA_64x40x64_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate destination list: the accumulators are updated in place
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[10];  // d00..d09 (NOTE(review): presumably two f16 values packed per register - confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata for A - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime operand; compared against 0 to build predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %17, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n40k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9},"  // accumulators d00..d09
+      "{%10, %11, %12, %13},"  // a00..a03
+      " %14,"  // desc_b
+      " %15, %16,"  // e, spsel
+      " p,   %18, %19;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09)  // %0..%9, read-write
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %10..%13
+         "l"(desc_b),  // %14
+         "r"(e), "n"(int32_t(spsel)),  // %15, %16
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %17, %18, %19
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x40x64 TN F32+=E4M3*E5M2 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; immediate operand placed right after e
+>
+struct GMMA_64x40x64_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate destination list: the accumulators are updated in place
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[20];  // d00..d19
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata for A - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime operand; compared against 0 to build predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %24, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n40k64.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"  // accumulators d00..d19
+      " %20,"  // desc_a
+      " %21,"  // desc_b
+      " %22, %23,"  // e, spsel
+      " p,   %25, %26;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)  // %0..%19, read-write
+      :  "l"(desc_a),  // %20
+         "l"(desc_b),  // %21
+         "r"(e), "n"(int32_t(spsel)),  // %22, %23
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %24, %25, %26
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x40x64 TN F32+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; immediate operand placed right after e
+>
+struct GMMA_64x40x64_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate destination list: the accumulators are updated in place
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[20];  // d00..d19
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata for A - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime operand; compared against 0 to build predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %27, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n40k64.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"  // accumulators d00..d19
+      "{%20, %21, %22, %23},"  // a00..a03
+      " %24,"  // desc_b
+      " %25, %26,"  // e, spsel
+      " p,   %28, %29;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)  // %0..%19, read-write
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %20..%23
+         "l"(desc_b),  // %24
+         "r"(e), "n"(int32_t(spsel)),  // %25, %26
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %27, %28, %29
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x48x64 TN F16+=E4M3*E5M2 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; immediate operand placed right after e
+>
+struct GMMA_64x48x64_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate destination list: the accumulators are updated in place
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[12];  // d00..d11 (NOTE(review): presumably two f16 values packed per register - confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata for A - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime operand; compared against 0 to build predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %16, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n48k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"  // accumulators d00..d11
+      " %12,"  // desc_a
+      " %13,"  // desc_b
+      " %14, %15,"  // e, spsel
+      " p,   %17, %18;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)  // %0..%11, read-write
+      :  "l"(desc_a),  // %12
+         "l"(desc_b),  // %13
+         "r"(e), "n"(int32_t(spsel)),  // %14, %15
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %16, %17, %18
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x48x64 TN F16+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; immediate operand placed right after e
+>
+struct GMMA_64x48x64_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate destination list: the accumulators are updated in place
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[12];  // d00..d11 (NOTE(review): presumably two f16 values packed per register - confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata for A - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime operand; compared against 0 to build predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %19, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n48k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"  // accumulators d00..d11
+      "{%12, %13, %14, %15},"  // a00..a03
+      " %16,"  // desc_b
+      " %17, %18,"  // e, spsel
+      " p,   %20, %21;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)  // %0..%11, read-write
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %12..%15
+         "l"(desc_b),  // %16
+         "r"(e), "n"(int32_t(spsel)),  // %17, %18
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %19, %20, %21
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x48x64 TN F32+=E4M3*E5M2 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; immediate operand placed right after e
+>
+struct GMMA_64x48x64_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate destination list: the accumulators are updated in place
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[24];  // d00..d23
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata for A - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime operand; compared against 0 to build predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %28, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n48k64.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"  // accumulators d00..d23
+      " %24,"  // desc_a
+      " %25,"  // desc_b
+      " %26, %27,"  // e, spsel
+      " p,   %29, %30;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)  // %0..%23, read-write
+      :  "l"(desc_a),  // %24
+         "l"(desc_b),  // %25
+         "r"(e), "n"(int32_t(spsel)),  // %26, %27
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %28, %29, %30
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x48x64 TN F32+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; immediate operand placed right after e
+>
+struct GMMA_64x48x64_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate destination list: the accumulators are updated in place
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[24];  // d00..d23
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata for A - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime operand; compared against 0 to build predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %31, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n48k64.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"  // accumulators d00..d23
+      "{%24, %25, %26, %27},"  // a00..a03
+      " %28,"  // desc_b
+      " %29, %30,"  // e, spsel
+      " p,   %32, %33;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)  // %0..%23, read-write
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %24..%27
+         "l"(desc_b),  // %28
+         "r"(e), "n"(int32_t(spsel)),  // %29, %30
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %31, %32, %33
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x56x64 TN F16+=E4M3*E5M2 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; immediate operand placed right after e
+>
+struct GMMA_64x56x64_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate destination list: the accumulators are updated in place
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[14];  // d00..d13 (NOTE(review): presumably two f16 values packed per register - confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata for A - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime operand; compared against 0 to build predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %18, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n56k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13},"  // accumulators d00..d13
+      " %14,"  // desc_a
+      " %15,"  // desc_b
+      " %16, %17,"  // e, spsel
+      " p,   %19, %20;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13)  // %0..%13, read-write
+      :  "l"(desc_a),  // %14
+         "l"(desc_b),  // %15
+         "r"(e), "n"(int32_t(spsel)),  // %16, %17
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %18, %19, %20
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x56x64 TN F16+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; immediate operand placed right after e
+>
+struct GMMA_64x56x64_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate destination list: the accumulators are updated in place
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[14];  // d00..d13 (NOTE(review): presumably two f16 values packed per register - confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata for A - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime operand; compared against 0 to build predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %21, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n56k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13},"  // accumulators d00..d13
+      "{%14, %15, %16, %17},"  // a00..a03
+      " %18,"  // desc_b
+      " %19, %20,"  // e, spsel
+      " p,   %22, %23;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13)  // %0..%13, read-write
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %14..%17
+         "l"(desc_b),  // %18
+         "r"(e), "n"(int32_t(spsel)),  // %19, %20
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %21, %22, %23
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x56x64 TN F32+=E4M3*E5M2 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; immediate operand placed right after e
+>
+struct GMMA_64x56x64_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate destination list: the accumulators are updated in place
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[28];  // d00..d27
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata for A - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime operand; compared against 0 to build predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %32, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n56k64.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"  // accumulators d00..d27
+      " %28,"  // desc_a
+      " %29,"  // desc_b
+      " %30, %31,"  // e, spsel
+      " p,   %33, %34;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)  // %0..%27, read-write
+      :  "l"(desc_a),  // %28
+         "l"(desc_b),  // %29
+         "r"(e), "n"(int32_t(spsel)),  // %30, %31
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %32, %33, %34
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x56x64 TN F32+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; immediate operand placed right after e
+>
+struct GMMA_64x56x64_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate destination list: the accumulators are updated in place
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[28];  // d00..d27
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata for A - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime operand; compared against 0 to build predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %35, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n56k64.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"  // accumulators d00..d27
+      "{%28, %29, %30, %31},"  // a00..a03
+      " %32,"  // desc_b
+      " %33, %34,"  // e, spsel
+      " p,   %36, %37;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)  // %0..%27, read-write
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %28..%31
+         "l"(desc_b),  // %32
+         "r"(e), "n"(int32_t(spsel)),  // %33, %34
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %35, %36, %37
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x72x64 TN F16+=E4M3*E5M2 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; immediate operand placed right after e
+>
+struct GMMA_64x72x64_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate destination list: the accumulators are updated in place
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[18];  // d00..d17 (NOTE(review): presumably two f16 values packed per register - confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata for A - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime operand; compared against 0 to build predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %22, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n72k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17},"  // accumulators d00..d17
+      " %18,"  // desc_a
+      " %19,"  // desc_b
+      " %20, %21,"  // e, spsel
+      " p,   %23, %24;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17)  // %0..%17, read-write
+      :  "l"(desc_a),  // %18
+         "l"(desc_b),  // %19
+         "r"(e), "n"(int32_t(spsel)),  // %20, %21
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %22, %23, %24
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x72x64 TN F16+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor; wraps one wgmma.mma_async.sp instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate operand, right after e
+>
+struct GMMA_64x72x64_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches fma() parameters a00..a03
+  using ERegisters = uint32_t[1];  // matches fma() parameter e
+  using BRegisters = uint64_t[1];  // matches fma() parameter desc_b
+  using CRegisters = uint32_t[18];  // matches fma() parameters d00..d17, which are updated in place
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word that spsel indexes into -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p inside the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %25, 0;\n"  // p = (scale_D != 0); %25 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n72k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17},"  // d00..d17 (read-write accumulators)
+      "{%18, %19, %20, %21},"  // a00..a03
+      " %22,"  // desc_b
+      " %23, %24,"  // e, spsel
+      " p,   %26, %27;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted, the call is flagged as invalid
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x72x64 TN F32+=E4M3*E5M2 (SS form: A and B are both passed as 64-bit descriptors; wraps one wgmma.mma_async.sp instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate operand, right after e
+>
+struct GMMA_64x72x64_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches fma() parameter desc_a
+  using ERegisters = uint32_t[1];  // matches fma() parameter e
+  using BRegisters = uint64_t[1];  // matches fma() parameter desc_b
+  using CRegisters = float[36];  // matches fma() parameters d00..d35, which are updated in place
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word that spsel indexes into -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p inside the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %40, 0;\n"  // p = (scale_D != 0); %40 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n72k64.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"  // d00..d35 (read-write accumulators)
+      " %36,"  // desc_a
+      " %37,"  // desc_b
+      " %38, %39,"  // e, spsel
+      " p,   %41, %42;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted, the call is flagged as invalid
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x72x64 TN F32+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor; wraps one wgmma.mma_async.sp instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate operand, right after e
+>
+struct GMMA_64x72x64_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches fma() parameters a00..a03
+  using ERegisters = uint32_t[1];  // matches fma() parameter e
+  using BRegisters = uint64_t[1];  // matches fma() parameter desc_b
+  using CRegisters = float[36];  // matches fma() parameters d00..d35, which are updated in place
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word that spsel indexes into -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p inside the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %43, 0;\n"  // p = (scale_D != 0); %43 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n72k64.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"  // d00..d35 (read-write accumulators)
+      "{%36, %37, %38, %39},"  // a00..a03
+      " %40,"  // desc_b
+      " %41, %42,"  // e, spsel
+      " p,   %44, %45;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted, the call is flagged as invalid
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x64 TN F16+=E4M3*E5M2 (SS form: A and B are both passed as 64-bit descriptors; wraps one wgmma.mma_async.sp instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate operand, right after e
+>
+struct GMMA_64x80x64_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches fma() parameter desc_a
+  using ERegisters = uint32_t[1];  // matches fma() parameter e
+  using BRegisters = uint64_t[1];  // matches fma() parameter desc_b
+  using CRegisters = uint32_t[20];  // matches fma() parameters d00..d19, which are updated in place
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word that spsel indexes into -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p inside the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %24, 0;\n"  // p = (scale_D != 0); %24 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n80k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"  // d00..d19 (read-write accumulators)
+      " %20,"  // desc_a
+      " %21,"  // desc_b
+      " %22, %23,"  // e, spsel
+      " p,   %25, %26;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted, the call is flagged as invalid
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x64 TN F16+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor; wraps one wgmma.mma_async.sp instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate operand, right after e
+>
+struct GMMA_64x80x64_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches fma() parameters a00..a03
+  using ERegisters = uint32_t[1];  // matches fma() parameter e
+  using BRegisters = uint64_t[1];  // matches fma() parameter desc_b
+  using CRegisters = uint32_t[20];  // matches fma() parameters d00..d19, which are updated in place
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word that spsel indexes into -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p inside the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %27, 0;\n"  // p = (scale_D != 0); %27 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n80k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"  // d00..d19 (read-write accumulators)
+      "{%20, %21, %22, %23},"  // a00..a03
+      " %24,"  // desc_b
+      " %25, %26,"  // e, spsel
+      " p,   %28, %29;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted, the call is flagged as invalid
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x64 TN F32+=E4M3*E5M2 (SS form: A and B are both passed as 64-bit descriptors; wraps one wgmma.mma_async.sp instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate operand, right after e
+>
+struct GMMA_64x80x64_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches fma() parameter desc_a
+  using ERegisters = uint32_t[1];  // matches fma() parameter e
+  using BRegisters = uint64_t[1];  // matches fma() parameter desc_b
+  using CRegisters = float[40];  // matches fma() parameters d00..d39, which are updated in place
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word that spsel indexes into -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p inside the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %44, 0;\n"  // p = (scale_D != 0); %44 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n80k64.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"  // d00..d39 (read-write accumulators)
+      " %40,"  // desc_a
+      " %41,"  // desc_b
+      " %42, %43,"  // e, spsel
+      " p,   %45, %46;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted, the call is flagged as invalid
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x64 TN F32+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor; wraps one wgmma.mma_async.sp instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate operand, right after e
+>
+struct GMMA_64x80x64_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches fma() parameters a00..a03
+  using ERegisters = uint32_t[1];  // matches fma() parameter e
+  using BRegisters = uint64_t[1];  // matches fma() parameter desc_b
+  using CRegisters = float[40];  // matches fma() parameters d00..d39, which are updated in place
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word that spsel indexes into -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p inside the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %47, 0;\n"  // p = (scale_D != 0); %47 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n80k64.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"  // d00..d39 (read-write accumulators)
+      "{%40, %41, %42, %43},"  // a00..a03
+      " %44,"  // desc_b
+      " %45, %46,"  // e, spsel
+      " p,   %48, %49;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted, the call is flagged as invalid
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x88x64 TN F16+=E4M3*E5M2 (SS form: A and B are both passed as 64-bit descriptors; wraps one wgmma.mma_async.sp instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate operand, right after e
+>
+struct GMMA_64x88x64_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches fma() parameter desc_a
+  using ERegisters = uint32_t[1];  // matches fma() parameter e
+  using BRegisters = uint64_t[1];  // matches fma() parameter desc_b
+  using CRegisters = uint32_t[22];  // matches fma() parameters d00..d21, which are updated in place
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word that spsel indexes into -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p inside the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %26, 0;\n"  // p = (scale_D != 0); %26 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n88k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21},"  // d00..d21 (read-write accumulators)
+      " %22,"  // desc_a
+      " %23,"  // desc_b
+      " %24, %25,"  // e, spsel
+      " p,   %27, %28;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted, the call is flagged as invalid
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x88x64 TN F16+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor; wraps one wgmma.mma_async.sp instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate operand, right after e
+>
+struct GMMA_64x88x64_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches fma() parameters a00..a03
+  using ERegisters = uint32_t[1];  // matches fma() parameter e
+  using BRegisters = uint64_t[1];  // matches fma() parameter desc_b
+  using CRegisters = uint32_t[22];  // matches fma() parameters d00..d21, which are updated in place
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word that spsel indexes into -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p inside the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %29, 0;\n"  // p = (scale_D != 0); %29 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n88k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21},"  // d00..d21 (read-write accumulators)
+      "{%22, %23, %24, %25},"  // a00..a03
+      " %26,"  // desc_b
+      " %27, %28,"  // e, spsel
+      " p,   %30, %31;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted, the call is flagged as invalid
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x88x64 TN F32+=E4M3*E5M2 (SS form: A and B are both passed as 64-bit descriptors; wraps one wgmma.mma_async.sp instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate operand, right after e
+>
+struct GMMA_64x88x64_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches fma() parameter desc_a
+  using ERegisters = uint32_t[1];  // matches fma() parameter e
+  using BRegisters = uint64_t[1];  // matches fma() parameter desc_b
+  using CRegisters = float[44];  // matches fma() parameters d00..d43, which are updated in place
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word that spsel indexes into -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p inside the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %48, 0;\n"  // p = (scale_D != 0); %48 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n88k64.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"  // d00..d43 (read-write accumulators)
+      " %44,"  // desc_a
+      " %45,"  // desc_b
+      " %46, %47,"  // e, spsel
+      " p,   %49, %50;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted, the call is flagged as invalid
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x88x64 TN F32+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor; wraps one wgmma.mma_async.sp instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as an immediate operand, right after e
+>
+struct GMMA_64x88x64_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches fma() parameters a00..a03
+  using ERegisters = uint32_t[1];  // matches fma() parameter e
+  using BRegisters = uint64_t[1];  // matches fma() parameter desc_b
+  using CRegisters = float[44];  // matches fma() parameters d00..d43, which are updated in place
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word that spsel indexes into -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // turned into predicate p inside the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %51, 0;\n"  // p = (scale_D != 0); %51 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n88k64.f32.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"  // d00..d43 (read-write accumulators)
+      "{%44, %45, %46, %47},"  // a00..a03
+      " %48,"  // desc_b
+      " %49, %50,"  // e, spsel
+      " p,   %52, %53;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted, the call is flagged as invalid
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x104x64 TN F16+=E4M3*E5M2 (SS variant: A and B are each passed as one 64-bit descriptor); wraps a single wgmma.mma_async.sp instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate emitted right after the `e` operand
+>
+struct GMMA_64x104x64_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D operands; fma() updates the accumulators in place ("+r")
+  using ARegisters = uint64_t[1];  // mirrors fma()'s desc_a argument
+  using ERegisters = uint32_t[1];  // mirrors fma()'s e argument
+  using BRegisters = uint64_t[1];  // mirrors fma()'s desc_b argument
+  using CRegisters = uint32_t[26];  // mirrors fma()'s d00..d25; presumably two packed f16 per register - TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25,
+      uint32_t const& e,  // presumably the sparsity metadata consumed by the .sp instruction - TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a nonzero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: gets the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %30, 0;\n"  // p = (scale_D != 0); %30 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n104k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%25: accumulators d00..d25
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25},"
+      " %26,"  // desc_a
+      " %27,"  // desc_b
+      " %28, %29,"  // e, spsel
+      " p,   %31, %32;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write ("+") accumulator operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25)
+      :  "l"(desc_a),  // input operands; numbering continues after the accumulators
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x104x64 TN F16+=E4M3*E5M2 (RS variant: A is passed as four 32-bit registers, B as one 64-bit descriptor); wraps a single wgmma.mma_async.sp instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate emitted right after the `e` operand
+>
+struct GMMA_64x104x64_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D operands; fma() updates the accumulators in place ("+r")
+  using ARegisters = uint32_t[4];  // mirrors fma()'s a00..a03 arguments
+  using ERegisters = uint32_t[1];  // mirrors fma()'s e argument
+  using BRegisters = uint64_t[1];  // mirrors fma()'s desc_b argument
+  using CRegisters = uint32_t[26];  // mirrors fma()'s d00..d25; presumably two packed f16 per register - TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25,
+      uint32_t const& e,  // presumably the sparsity metadata consumed by the .sp instruction - TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a nonzero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: gets the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %33, 0;\n"  // p = (scale_D != 0); %33 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n104k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%25: accumulators d00..d25
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25},"
+      "{%26, %27, %28, %29},"  // a00..a03
+      " %30,"  // desc_b
+      " %31, %32,"  // e, spsel
+      " p,   %34, %35;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write ("+") accumulator operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands; numbering continues after the accumulators
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x104x64 TN F32+=E4M3*E5M2 (SS variant: A and B are each passed as one 64-bit descriptor); wraps a single wgmma.mma_async.sp instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate emitted right after the `e` operand
+>
+struct GMMA_64x104x64_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D operands; fma() updates the accumulators in place ("+f")
+  using ARegisters = uint64_t[1];  // mirrors fma()'s desc_a argument
+  using ERegisters = uint32_t[1];  // mirrors fma()'s e argument
+  using BRegisters = uint64_t[1];  // mirrors fma()'s desc_b argument
+  using CRegisters = float[52];  // mirrors fma()'s d00..d51 float accumulators
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      uint32_t const& e,  // presumably the sparsity metadata consumed by the .sp instruction - TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a nonzero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: gets the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %56, 0;\n"  // p = (scale_D != 0); %56 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n104k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%51: accumulators d00..d51
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"
+      " %52,"  // desc_a
+      " %53,"  // desc_b
+      " %54, %55,"  // e, spsel
+      " p,    %57,  %58;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // read-write ("+") accumulator operands
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
+      :  "l"(desc_a),  // input operands; numbering continues after the accumulators
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x104x64 TN F32+=E4M3*E5M2 (RS variant: A is passed as four 32-bit registers, B as one 64-bit descriptor); wraps a single wgmma.mma_async.sp instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate emitted right after the `e` operand
+>
+struct GMMA_64x104x64_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D operands; fma() updates the accumulators in place ("+f")
+  using ARegisters = uint32_t[4];  // mirrors fma()'s a00..a03 arguments
+  using ERegisters = uint32_t[1];  // mirrors fma()'s e argument
+  using BRegisters = uint64_t[1];  // mirrors fma()'s desc_b argument
+  using CRegisters = float[52];  // mirrors fma()'s d00..d51 float accumulators
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      uint32_t const& e,  // presumably the sparsity metadata consumed by the .sp instruction - TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a nonzero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: gets the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %59, 0;\n"  // p = (scale_D != 0); %59 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n104k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%51: accumulators d00..d51
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"
+      "{%52,  %53,  %54,  %55},"  // a00..a03
+      " %56,"  // desc_b
+      " %57, %58,"  // e, spsel
+      " p,    %60,  %61;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // read-write ("+") accumulator operands
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands; numbering continues after the accumulators
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x64 TN F16+=E4M3*E5M2 (SS variant: A and B are each passed as one 64-bit descriptor); wraps a single wgmma.mma_async.sp instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate emitted right after the `e` operand
+>
+struct GMMA_64x112x64_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D operands; fma() updates the accumulators in place ("+r")
+  using ARegisters = uint64_t[1];  // mirrors fma()'s desc_a argument
+  using ERegisters = uint32_t[1];  // mirrors fma()'s e argument
+  using BRegisters = uint64_t[1];  // mirrors fma()'s desc_b argument
+  using CRegisters = uint32_t[28];  // mirrors fma()'s d00..d27; presumably two packed f16 per register - TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t const& e,  // presumably the sparsity metadata consumed by the .sp instruction - TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a nonzero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: gets the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %32, 0;\n"  // p = (scale_D != 0); %32 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n112k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%27: accumulators d00..d27
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"
+      " %28,"  // desc_a
+      " %29,"  // desc_b
+      " %30, %31,"  // e, spsel
+      " p,   %33, %34;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write ("+") accumulator operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
+      :  "l"(desc_a),  // input operands; numbering continues after the accumulators
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x64 TN F16+=E4M3*E5M2 (RS variant: A is passed as four 32-bit registers, B as one 64-bit descriptor); wraps a single wgmma.mma_async.sp instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate emitted right after the `e` operand
+>
+struct GMMA_64x112x64_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D operands; fma() updates the accumulators in place ("+r")
+  using ARegisters = uint32_t[4];  // mirrors fma()'s a00..a03 arguments
+  using ERegisters = uint32_t[1];  // mirrors fma()'s e argument
+  using BRegisters = uint64_t[1];  // mirrors fma()'s desc_b argument
+  using CRegisters = uint32_t[28];  // mirrors fma()'s d00..d27; presumably two packed f16 per register - TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t const& e,  // presumably the sparsity metadata consumed by the .sp instruction - TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a nonzero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: gets the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %35, 0;\n"  // p = (scale_D != 0); %35 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n112k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%27: accumulators d00..d27
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"
+      "{%28, %29, %30, %31},"  // a00..a03
+      " %32,"  // desc_b
+      " %33, %34,"  // e, spsel
+      " p,   %36, %37;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write ("+") accumulator operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands; numbering continues after the accumulators
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x64 TN F32+=E4M3*E5M2 (SS variant: A and B are each passed as one 64-bit descriptor); wraps a single wgmma.mma_async.sp instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate emitted right after the `e` operand
+>
+struct GMMA_64x112x64_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D operands; fma() updates the accumulators in place ("+f")
+  using ARegisters = uint64_t[1];  // mirrors fma()'s desc_a argument
+  using ERegisters = uint32_t[1];  // mirrors fma()'s e argument
+  using BRegisters = uint64_t[1];  // mirrors fma()'s desc_b argument
+  using CRegisters = float[56];  // mirrors fma()'s d00..d55 float accumulators
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      uint32_t const& e,  // presumably the sparsity metadata consumed by the .sp instruction - TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a nonzero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: gets the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %60, 0;\n"  // p = (scale_D != 0); %60 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n112k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%55: accumulators d00..d55
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      " %56,"  // desc_a
+      " %57,"  // desc_b
+      " %58, %59,"  // e, spsel
+      " p,    %61,  %62;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // read-write ("+") accumulator operands
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
+      :  "l"(desc_a),  // input operands; numbering continues after the accumulators
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x64 TN F32+=E4M3*E5M2 (RS variant: A is passed as four 32-bit registers, B as one 64-bit descriptor); wraps a single wgmma.mma_async.sp instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate emitted right after the `e` operand
+>
+struct GMMA_64x112x64_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D operands; fma() updates the accumulators in place ("+f")
+  using ARegisters = uint32_t[4];  // mirrors fma()'s a00..a03 arguments
+  using ERegisters = uint32_t[1];  // mirrors fma()'s e argument
+  using BRegisters = uint64_t[1];  // mirrors fma()'s desc_b argument
+  using CRegisters = float[56];  // mirrors fma()'s d00..d55 float accumulators
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      uint32_t const& e,  // presumably the sparsity metadata consumed by the .sp instruction - TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a nonzero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: gets the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %63, 0;\n"  // p = (scale_D != 0); %63 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n112k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%55: accumulators d00..d55
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      "{%56,  %57,  %58,  %59},"  // a00..a03
+      " %60,"  // desc_b
+      " %61, %62,"  // e, spsel
+      " p,    %64,  %65;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // read-write ("+") accumulator operands
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands; numbering continues after the accumulators
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x120x64 TN F16+=E4M3*E5M2 (SS variant: A and B are each passed as one 64-bit descriptor); wraps a single wgmma.mma_async.sp instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate emitted right after the `e` operand
+>
+struct GMMA_64x120x64_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D operands; fma() updates the accumulators in place ("+r")
+  using ARegisters = uint64_t[1];  // mirrors fma()'s desc_a argument
+  using ERegisters = uint32_t[1];  // mirrors fma()'s e argument
+  using BRegisters = uint64_t[1];  // mirrors fma()'s desc_b argument
+  using CRegisters = uint32_t[30];  // mirrors fma()'s d00..d29; presumably two packed f16 per register - TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29,
+      uint32_t const& e,  // presumably the sparsity metadata consumed by the .sp instruction - TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a nonzero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: gets the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %34, 0;\n"  // p = (scale_D != 0); %34 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n120k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%29: accumulators d00..d29
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29},"
+      " %30,"  // desc_a
+      " %31,"  // desc_b
+      " %32, %33,"  // e, spsel
+      " p,   %35, %36;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write ("+") accumulator operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29)
+      :  "l"(desc_a),  // input operands; numbering continues after the accumulators
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x120x64 TN F16+=E4M3*E5M2 (RS variant: A is passed as four 32-bit registers, B as one 64-bit descriptor); wraps a single wgmma.mma_async.sp instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate emitted right after the `e` operand
+>
+struct GMMA_64x120x64_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D operands; fma() updates the accumulators in place ("+r")
+  using ARegisters = uint32_t[4];  // mirrors fma()'s a00..a03 arguments
+  using ERegisters = uint32_t[1];  // mirrors fma()'s e argument
+  using BRegisters = uint64_t[1];  // mirrors fma()'s desc_b argument
+  using CRegisters = uint32_t[30];  // mirrors fma()'s d00..d29; presumably two packed f16 per register - TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29,
+      uint32_t const& e,  // presumably the sparsity metadata consumed by the .sp instruction - TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // a nonzero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: gets the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %37, 0;\n"  // p = (scale_D != 0); %37 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n120k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%29: accumulators d00..d29
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29},"
+      "{%30, %31, %32, %33},"  // a00..a03
+      " %34,"  // desc_b
+      " %35, %36,"  // e, spsel
+      " p,   %38, %39;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write ("+") accumulator operands
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands; numbering continues after the accumulators
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x120x64 TN F32+=E4M3*E5M2 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate, bound to asm operand %65
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate, bound to asm operand %66
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, bound to asm operand %63
+>
+struct GMMA_64x120x64_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D list; the accumulators are read-write ("+f") asm operands
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e; presumably the sparsity-metadata word (PTX sp-meta slot) - confirm
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[60];  // d00..d59
+
+  CUTE_HOST_DEVICE static void  // wraps one wgmma.mma_async.sp inline-asm statement
+  fma(uint64_t const& desc_a,  // -> %60
+      uint64_t const& desc_b,  // -> %61
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      uint32_t const& e,  // -> %62
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value -> %64, tested by setp below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate declared inside the braced PTX scope
+      "setp.ne.b32 p, %64, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n120k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"
+      " %60,"  // desc_a
+      " %61,"  // desc_b
+      " %62, %63,"  // e, spsel
+      " p,    %65,  %66;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%59: accumulators, read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
+      :  "l"(desc_a),  // %60
+         "l"(desc_b),  // %61
+         "r"(e), "n"(int32_t(spsel)),  // %62, %63
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %64, %65, %66
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x120x64 TN F32+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate, bound to asm operand %68
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate, bound to asm operand %69
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, bound to asm operand %66
+>
+struct GMMA_64x120x64_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D list; the accumulators are read-write ("+f") asm operands
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e; presumably the sparsity-metadata word (PTX sp-meta slot) - confirm
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[60];  // d00..d59
+
+  CUTE_HOST_DEVICE static void  // wraps one wgmma.mma_async.sp inline-asm statement
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // -> %60..%63
+      uint64_t const& desc_b,  // -> %64
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      uint32_t const& e,  // -> %65
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value -> %67, tested by setp below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate declared inside the braced PTX scope
+      "setp.ne.b32 p, %67, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n120k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"
+      "{%60,  %61,  %62,  %63},"  // a00..a03
+      " %64,"  // desc_b
+      " %65, %66,"  // e, spsel
+      " p,    %68,  %69;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%59: accumulators, read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %60..%63
+         "l"(desc_b),  // %64
+         "r"(e), "n"(int32_t(spsel)),  // %65, %66
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %67, %68, %69
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x136x64 TN F16+=E4M3*E5M2 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate, bound to asm operand %39
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate, bound to asm operand %40
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, bound to asm operand %37
+>
+struct GMMA_64x136x64_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D list; the accumulators are read-write ("+r") asm operands
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e; presumably the sparsity-metadata word (PTX sp-meta slot) - confirm
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[34];  // d00..d33; presumably two packed f16 values per register - confirm
+
+  CUTE_HOST_DEVICE static void  // wraps one wgmma.mma_async.sp inline-asm statement
+  fma(uint64_t const& desc_a,  // -> %34
+      uint64_t const& desc_b,  // -> %35
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33,
+      uint32_t const& e,  // -> %36
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value -> %38, tested by setp below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate declared inside the braced PTX scope
+      "setp.ne.b32 p, %38, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n136k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33},"
+      " %34,"  // desc_a
+      " %35,"  // desc_b
+      " %36, %37,"  // e, spsel
+      " p,   %39, %40;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%33: accumulators, read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33)
+      :  "l"(desc_a),  // %34
+         "l"(desc_b),  // %35
+         "r"(e), "n"(int32_t(spsel)),  // %36, %37
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %38, %39, %40
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x136x64 TN F16+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate, bound to asm operand %42
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate, bound to asm operand %43
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, bound to asm operand %40
+>
+struct GMMA_64x136x64_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D list; the accumulators are read-write ("+r") asm operands
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e; presumably the sparsity-metadata word (PTX sp-meta slot) - confirm
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[34];  // d00..d33; presumably two packed f16 values per register - confirm
+
+  CUTE_HOST_DEVICE static void  // wraps one wgmma.mma_async.sp inline-asm statement
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // -> %34..%37
+      uint64_t const& desc_b,  // -> %38
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33,
+      uint32_t const& e,  // -> %39
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value -> %41, tested by setp below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate declared inside the braced PTX scope
+      "setp.ne.b32 p, %41, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n136k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33},"
+      "{%34, %35, %36, %37},"  // a00..a03
+      " %38,"  // desc_b
+      " %39, %40,"  // e, spsel
+      " p,   %42, %43;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%33: accumulators, read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %34..%37
+         "l"(desc_b),  // %38
+         "r"(e), "n"(int32_t(spsel)),  // %39, %40
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %41, %42, %43
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x136x64 TN F32+=E4M3*E5M2 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate, bound to asm operand %73
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate, bound to asm operand %74
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, bound to asm operand %71
+>
+struct GMMA_64x136x64_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D list; the accumulators are read-write ("+f") asm operands
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e; presumably the sparsity-metadata word (PTX sp-meta slot) - confirm
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[68];  // d00..d67
+
+  CUTE_HOST_DEVICE static void  // wraps one wgmma.mma_async.sp inline-asm statement
+  fma(uint64_t const& desc_a,  // -> %68
+      uint64_t const& desc_b,  // -> %69
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      uint32_t const& e,  // -> %70
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value -> %72, tested by setp below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate declared inside the braced PTX scope
+      "setp.ne.b32 p, %72, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n136k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67},"
+      " %68,"  // desc_a
+      " %69,"  // desc_b
+      " %70, %71,"  // e, spsel
+      " p,    %73,  %74;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%67: accumulators, read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
+      :  "l"(desc_a),  // %68
+         "l"(desc_b),  // %69
+         "r"(e), "n"(int32_t(spsel)),  // %70, %71
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %72, %73, %74
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x136x64 TN F32+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate, bound to asm operand %76
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate, bound to asm operand %77
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, bound to asm operand %74
+>
+struct GMMA_64x136x64_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D list; the accumulators are read-write ("+f") asm operands
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e; presumably the sparsity-metadata word (PTX sp-meta slot) - confirm
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[68];  // d00..d67
+
+  CUTE_HOST_DEVICE static void  // wraps one wgmma.mma_async.sp inline-asm statement
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // -> %68..%71
+      uint64_t const& desc_b,  // -> %72
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      uint32_t const& e,  // -> %73
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value -> %75, tested by setp below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate declared inside the braced PTX scope
+      "setp.ne.b32 p, %75, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n136k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67},"
+      "{%68,  %69,  %70,  %71},"  // a00..a03
+      " %72,"  // desc_b
+      " %73, %74,"  // e, spsel
+      " p,    %76,  %77;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%67: accumulators, read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %68..%71
+         "l"(desc_b),  // %72
+         "r"(e), "n"(int32_t(spsel)),  // %73, %74
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %75, %76, %77
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x64 TN F16+=E4M3*E5M2 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate, bound to asm operand %41
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate, bound to asm operand %42
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, bound to asm operand %39
+>
+struct GMMA_64x144x64_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D list; the accumulators are read-write ("+r") asm operands
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e; presumably the sparsity-metadata word (PTX sp-meta slot) - confirm
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[36];  // d00..d35; presumably two packed f16 values per register - confirm
+
+  CUTE_HOST_DEVICE static void  // wraps one wgmma.mma_async.sp inline-asm statement
+  fma(uint64_t const& desc_a,  // -> %36
+      uint64_t const& desc_b,  // -> %37
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t const& e,  // -> %38
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value -> %40, tested by setp below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate declared inside the braced PTX scope
+      "setp.ne.b32 p, %40, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n144k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"
+      " %36,"  // desc_a
+      " %37,"  // desc_b
+      " %38, %39,"  // e, spsel
+      " p,   %41, %42;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%35: accumulators, read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
+      :  "l"(desc_a),  // %36
+         "l"(desc_b),  // %37
+         "r"(e), "n"(int32_t(spsel)),  // %38, %39
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %40, %41, %42
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x64 TN F16+=E4M3*E5M2 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate, bound to asm operand %44
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate, bound to asm operand %45
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, bound to asm operand %42
+>
+struct GMMA_64x144x64_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D list; the accumulators are read-write ("+r") asm operands
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e; presumably the sparsity-metadata word (PTX sp-meta slot) - confirm
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[36];  // d00..d35; presumably two packed f16 values per register - confirm
+
+  CUTE_HOST_DEVICE static void  // wraps one wgmma.mma_async.sp inline-asm statement
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // -> %36..%39
+      uint64_t const& desc_b,  // -> %40
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t const& e,  // -> %41
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value -> %43, tested by setp below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate declared inside the braced PTX scope
+      "setp.ne.b32 p, %43, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n144k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"
+      "{%36, %37, %38, %39},"  // a00..a03
+      " %40,"  // desc_b
+      " %41, %42,"  // e, spsel
+      " p,   %44, %45;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%35: accumulators, read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %36..%39
+         "l"(desc_b),  // %40
+         "r"(e), "n"(int32_t(spsel)),  // %41, %42
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %43, %44, %45
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x64 TN F32+=E4M3*E5M2 (SS form: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate, bound to asm operand %77
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate, bound to asm operand %78
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, bound to asm operand %75
+>
+struct GMMA_64x144x64_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D list; the accumulators are read-write ("+f") asm operands
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e; presumably the sparsity-metadata word (PTX sp-meta slot) - confirm
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[72];  // d00..d71
+
+  CUTE_HOST_DEVICE static void  // wraps one wgmma.mma_async.sp inline-asm statement
+  fma(uint64_t const& desc_a,  // -> %72
+      uint64_t const& desc_b,  // -> %73
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      uint32_t const& e,  // -> %74
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value -> %76, tested by setp below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate declared inside the braced PTX scope
+      "setp.ne.b32 p, %76, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n144k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      " %72,"  // desc_a
+      " %73,"  // desc_b
+      " %74, %75,"  // e, spsel
+      " p,    %77,  %78;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%71: accumulators, read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
+      :  "l"(desc_a),  // %72
+         "l"(desc_b),  // %73
+         "r"(e), "n"(int32_t(spsel)),  // %74, %75
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %76, %77, %78
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x64 TN F32+=E4M3*E5M2 -- RS form: A arrives in four 32-bit registers, B as a 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand emitted right after e
+>
+struct GMMA_64x144x64_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably D aliases the C registers -- confirm in the MMA traits
+  using ARegisters = uint32_t[4];  // corresponds to a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // corresponds to e of fma()
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = float[72];  // corresponds to d00..d71 of fma()
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp PTX instruction; d00..d71 are read and written
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      uint32_t const& e,  // 32-bit operand emitted after the B descriptor; presumably sparsity metadata -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %79, 0;\n"  // %79 is scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n144k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
+      "{%72,  %73,  %74,  %75},"  // a00..a03
+      " %76,"  // desc_b
+      " %77, %78,"  // e, spsel
+      " p,    %80,  %81;\n"  // predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x152x64 TN F16+=E4M3*E5M2 -- SS form: A and B are both passed as 64-bit descriptors
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand emitted right after e
+>
+struct GMMA_64x152x64_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably D aliases the C registers -- confirm in the MMA traits
+  using ARegisters = uint64_t[1];  // corresponds to desc_a of fma()
+  using ERegisters = uint32_t[1];  // corresponds to e of fma()
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = uint32_t[38];  // corresponds to d00..d37; presumably two packed f16 values per register -- confirm
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp PTX instruction; d00..d37 are read and written
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37,
+      uint32_t const& e,  // 32-bit operand emitted after the B descriptor; presumably sparsity metadata -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %42, 0;\n"  // %42 is scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n152k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37},"
+      " %38,"  // desc_a
+      " %39,"  // desc_b
+      " %40, %41,"  // e, spsel
+      " p,   %43, %44;\n"  // predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x152x64 TN F16+=E4M3*E5M2 -- RS form: A arrives in four 32-bit registers, B as a 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand emitted right after e
+>
+struct GMMA_64x152x64_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably D aliases the C registers -- confirm in the MMA traits
+  using ARegisters = uint32_t[4];  // corresponds to a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // corresponds to e of fma()
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = uint32_t[38];  // corresponds to d00..d37; presumably two packed f16 values per register -- confirm
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp PTX instruction; d00..d37 are read and written
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37,
+      uint32_t const& e,  // 32-bit operand emitted after the B descriptor; presumably sparsity metadata -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %45, 0;\n"  // %45 is scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n152k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37},"
+      "{%38, %39, %40, %41},"  // a00..a03
+      " %42,"  // desc_b
+      " %43, %44,"  // e, spsel
+      " p,   %46, %47;\n"  // predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x152x64 TN F32+=E4M3*E5M2 -- SS form: A and B are both passed as 64-bit descriptors
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand emitted right after e
+>
+struct GMMA_64x152x64_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably D aliases the C registers -- confirm in the MMA traits
+  using ARegisters = uint64_t[1];  // corresponds to desc_a of fma()
+  using ERegisters = uint32_t[1];  // corresponds to e of fma()
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = float[76];  // corresponds to d00..d75 of fma()
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp PTX instruction; d00..d75 are read and written
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      uint32_t const& e,  // 32-bit operand emitted after the B descriptor; presumably sparsity metadata -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %80, 0;\n"  // %80 is scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n152k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75},"
+      " %76,"  // desc_a
+      " %77,"  // desc_b
+      " %78, %79,"  // e, spsel
+      " p,    %81,  %82;\n"  // predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x152x64 TN F32+=E4M3*E5M2 -- RS form: A arrives in four 32-bit registers, B as a 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand emitted right after e
+>
+struct GMMA_64x152x64_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably D aliases the C registers -- confirm in the MMA traits
+  using ARegisters = uint32_t[4];  // corresponds to a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // corresponds to e of fma()
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = float[76];  // corresponds to d00..d75 of fma()
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp PTX instruction; d00..d75 are read and written
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      uint32_t const& e,  // 32-bit operand emitted after the B descriptor; presumably sparsity metadata -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %83, 0;\n"  // %83 is scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n152k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75},"
+      "{%76,  %77,  %78,  %79},"  // a00..a03
+      " %80,"  // desc_b
+      " %81, %82,"  // e, spsel
+      " p,    %84,  %85;\n"  // predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x64 TN F16+=E4M3*E5M2 -- SS form: A and B are both passed as 64-bit descriptors
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand emitted right after e
+>
+struct GMMA_64x160x64_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably D aliases the C registers -- confirm in the MMA traits
+  using ARegisters = uint64_t[1];  // corresponds to desc_a of fma()
+  using ERegisters = uint32_t[1];  // corresponds to e of fma()
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = uint32_t[40];  // corresponds to d00..d39; presumably two packed f16 values per register -- confirm
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp PTX instruction; d00..d39 are read and written
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t const& e,  // 32-bit operand emitted after the B descriptor; presumably sparsity metadata -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %44, 0;\n"  // %44 is scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n160k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      " %40,"  // desc_a
+      " %41,"  // desc_b
+      " %42, %43,"  // e, spsel
+      " p,   %45, %46;\n"  // predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x64 TN F16+=E4M3*E5M2 -- RS form: A arrives in four 32-bit registers, B as a 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand emitted right after e
+>
+struct GMMA_64x160x64_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably D aliases the C registers -- confirm in the MMA traits
+  using ARegisters = uint32_t[4];  // corresponds to a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // corresponds to e of fma()
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = uint32_t[40];  // corresponds to d00..d39; presumably two packed f16 values per register -- confirm
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp PTX instruction; d00..d39 are read and written
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t const& e,  // 32-bit operand emitted after the B descriptor; presumably sparsity metadata -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %47, 0;\n"  // %47 is scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n160k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      "{%40, %41, %42, %43},"  // a00..a03
+      " %44,"  // desc_b
+      " %45, %46,"  // e, spsel
+      " p,   %48, %49;\n"  // predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x64 TN F32+=E4M3*E5M2 -- SS form: A and B are both passed as 64-bit descriptors
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand emitted right after e
+>
+struct GMMA_64x160x64_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably D aliases the C registers -- confirm in the MMA traits
+  using ARegisters = uint64_t[1];  // corresponds to desc_a of fma()
+  using ERegisters = uint32_t[1];  // corresponds to e of fma()
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = float[80];  // corresponds to d00..d79 of fma()
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp PTX instruction; d00..d79 are read and written
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      uint32_t const& e,  // 32-bit operand emitted after the B descriptor; presumably sparsity metadata -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %84, 0;\n"  // %84 is scale_D; p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n160k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      " %80,"  // desc_a
+      " %81,"  // desc_b
+      " %82, %83,"  // e, spsel
+      " p,    %85,  %86;\n"  // predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x64 TN F32+=E4M3*E5M2 (RS form: A passed as four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // bound as a compile-time immediate ("n") in the asm below
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // bound as a compile-time immediate ("n") in the asm below
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // bound as a compile-time immediate ("n") right after e
+>
+struct GMMA_64x160x64_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D operand; the accumulators are bound read-write ("+f")
+  using ARegisters = uint32_t[4];  // a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // e of fma()
+  using BRegisters = uint64_t[1];  // desc_b of fma()
+  using CRegisters = float[80];  // d00..d79 of fma()
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp instruction; d00..d79 are read and written through their references
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      uint32_t const& e,  // occupies the operand slot right after desc_b; presumably the sparsity metadata - TODO confirm against PTX wgmma.sp
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook, given this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %87, 0;\n"  // p = (scale_D != 0); %87 is the scale_D operand
+      "wgmma.mma_async.sp.sync.aligned.m64n160k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      "{%80,  %81,  %82,  %83},"  // a00..a03
+      " %84,"  // desc_b
+      " %85, %86,"  // e, spsel
+      " p,    %88,  %89;\n"  // predicate from scale_D, then scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands %0..%79: read-write accumulators
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %80..%83
+         "l"(desc_b),  // %84
+         "r"(e), "n"(int32_t(spsel)),  // %85, %86
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %87, %88, %89
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x168x64 TN F16+=E4M3*E5M2 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // bound as a compile-time immediate ("n") in the asm below
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // bound as a compile-time immediate ("n") in the asm below
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // bound as a compile-time immediate ("n") right after e
+>
+struct GMMA_64x168x64_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D operand; the accumulators are bound read-write ("+r")
+  using ARegisters = uint64_t[1];  // desc_a of fma()
+  using ERegisters = uint32_t[1];  // e of fma()
+  using BRegisters = uint64_t[1];  // desc_b of fma()
+  using CRegisters = uint32_t[42];  // d00..d41 of fma(); presumably two packed f16 values per register - TODO confirm
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp instruction; d00..d41 are read and written through their references
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41,
+      uint32_t const& e,  // occupies the operand slot right after desc_b; presumably the sparsity metadata - TODO confirm against PTX wgmma.sp
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, given this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %46, 0;\n"  // p = (scale_D != 0); %46 is the scale_D operand
+      "wgmma.mma_async.sp.sync.aligned.m64n168k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41},"
+      " %42,"  // desc_a
+      " %43,"  // desc_b
+      " %44, %45,"  // e, spsel
+      " p,   %47, %48;\n"  // predicate from scale_D, then scaleA and scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%41: read-write accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41)
+      :  "l"(desc_a),  // %42
+         "l"(desc_b),  // %43
+         "r"(e), "n"(int32_t(spsel)),  // %44, %45
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %46, %47, %48
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x168x64 TN F16+=E4M3*E5M2 (RS form: A passed as four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // bound as a compile-time immediate ("n") in the asm below
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // bound as a compile-time immediate ("n") in the asm below
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // bound as a compile-time immediate ("n") right after e
+>
+struct GMMA_64x168x64_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D operand; the accumulators are bound read-write ("+r")
+  using ARegisters = uint32_t[4];  // a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // e of fma()
+  using BRegisters = uint64_t[1];  // desc_b of fma()
+  using CRegisters = uint32_t[42];  // d00..d41 of fma(); presumably two packed f16 values per register - TODO confirm
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp instruction; d00..d41 are read and written through their references
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41,
+      uint32_t const& e,  // occupies the operand slot right after desc_b; presumably the sparsity metadata - TODO confirm against PTX wgmma.sp
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook, given this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %49, 0;\n"  // p = (scale_D != 0); %49 is the scale_D operand
+      "wgmma.mma_async.sp.sync.aligned.m64n168k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41},"
+      "{%42, %43, %44, %45},"  // a00..a03
+      " %46,"  // desc_b
+      " %47, %48,"  // e, spsel
+      " p,   %50, %51;\n"  // predicate from scale_D, then scaleA and scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%41: read-write accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %42..%45
+         "l"(desc_b),  // %46
+         "r"(e), "n"(int32_t(spsel)),  // %47, %48
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %49, %50, %51
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x168x64 TN F32+=E4M3*E5M2 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // bound as a compile-time immediate ("n") in the asm below
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // bound as a compile-time immediate ("n") in the asm below
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // bound as a compile-time immediate ("n") right after e
+>
+struct GMMA_64x168x64_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D operand; the accumulators are bound read-write ("+f")
+  using ARegisters = uint64_t[1];  // desc_a of fma()
+  using ERegisters = uint32_t[1];  // e of fma()
+  using BRegisters = uint64_t[1];  // desc_b of fma()
+  using CRegisters = float[84];  // d00..d83 of fma()
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp instruction; d00..d83 are read and written through their references
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      uint32_t const& e,  // occupies the operand slot right after desc_b; presumably the sparsity metadata - TODO confirm against PTX wgmma.sp
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, given this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %88, 0;\n"  // p = (scale_D != 0); %88 is the scale_D operand
+      "wgmma.mma_async.sp.sync.aligned.m64n168k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83},"
+      " %84,"  // desc_a
+      " %85,"  // desc_b
+      " %86, %87,"  // e, spsel
+      " p,    %89,  %90;\n"  // predicate from scale_D, then scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands %0..%83: read-write accumulators
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
+      :  "l"(desc_a),  // %84
+         "l"(desc_b),  // %85
+         "r"(e), "n"(int32_t(spsel)),  // %86, %87
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %88, %89, %90
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x168x64 TN F32+=E4M3*E5M2 (RS form: A passed as four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // bound as a compile-time immediate ("n") in the asm below
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // bound as a compile-time immediate ("n") in the asm below
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // bound as a compile-time immediate ("n") right after e
+>
+struct GMMA_64x168x64_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D operand; the accumulators are bound read-write ("+f")
+  using ARegisters = uint32_t[4];  // a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // e of fma()
+  using BRegisters = uint64_t[1];  // desc_b of fma()
+  using CRegisters = float[84];  // d00..d83 of fma()
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp instruction; d00..d83 are read and written through their references
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      uint32_t const& e,  // occupies the operand slot right after desc_b; presumably the sparsity metadata - TODO confirm against PTX wgmma.sp
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook, given this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %91, 0;\n"  // p = (scale_D != 0); %91 is the scale_D operand
+      "wgmma.mma_async.sp.sync.aligned.m64n168k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83},"
+      "{%84,  %85,  %86,  %87},"  // a00..a03
+      " %88,"  // desc_b
+      " %89, %90,"  // e, spsel
+      " p,    %92,  %93;\n"  // predicate from scale_D, then scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands %0..%83: read-write accumulators
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %84..%87
+         "l"(desc_b),  // %88
+         "r"(e), "n"(int32_t(spsel)),  // %89, %90
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %91, %92, %93
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x64 TN F16+=E4M3*E5M2 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // bound as a compile-time immediate ("n") in the asm below
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // bound as a compile-time immediate ("n") in the asm below
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // bound as a compile-time immediate ("n") right after e
+>
+struct GMMA_64x176x64_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D operand; the accumulators are bound read-write ("+r")
+  using ARegisters = uint64_t[1];  // desc_a of fma()
+  using ERegisters = uint32_t[1];  // e of fma()
+  using BRegisters = uint64_t[1];  // desc_b of fma()
+  using CRegisters = uint32_t[44];  // d00..d43 of fma(); presumably two packed f16 values per register - TODO confirm
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp instruction; d00..d43 are read and written through their references
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t const& e,  // occupies the operand slot right after desc_b; presumably the sparsity metadata - TODO confirm against PTX wgmma.sp
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, given this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %48, 0;\n"  // p = (scale_D != 0); %48 is the scale_D operand
+      "wgmma.mma_async.sp.sync.aligned.m64n176k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"
+      " %44,"  // desc_a
+      " %45,"  // desc_b
+      " %46, %47,"  // e, spsel
+      " p,   %49, %50;\n"  // predicate from scale_D, then scaleA and scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%43: read-write accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
+      :  "l"(desc_a),  // %44
+         "l"(desc_b),  // %45
+         "r"(e), "n"(int32_t(spsel)),  // %46, %47
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %48, %49, %50
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x64 TN F16+=E4M3*E5M2 (RS form: A passed as four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // bound as a compile-time immediate ("n") in the asm below
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // bound as a compile-time immediate ("n") in the asm below
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // bound as a compile-time immediate ("n") right after e
+>
+struct GMMA_64x176x64_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D operand; the accumulators are bound read-write ("+r")
+  using ARegisters = uint32_t[4];  // a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // e of fma()
+  using BRegisters = uint64_t[1];  // desc_b of fma()
+  using CRegisters = uint32_t[44];  // d00..d43 of fma(); presumably two packed f16 values per register - TODO confirm
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp instruction; d00..d43 are read and written through their references
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t const& e,  // occupies the operand slot right after desc_b; presumably the sparsity metadata - TODO confirm against PTX wgmma.sp
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook, given this source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %51, 0;\n"  // p = (scale_D != 0); %51 is the scale_D operand
+      "wgmma.mma_async.sp.sync.aligned.m64n176k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"
+      "{%44, %45, %46, %47},"  // a00..a03
+      " %48,"  // desc_b
+      " %49, %50,"  // e, spsel
+      " p,   %52, %53;\n"  // predicate from scale_D, then scaleA and scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%43: read-write accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %44..%47
+         "l"(desc_b),  // %48
+         "r"(e), "n"(int32_t(spsel)),  // %49, %50
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %51, %52, %53
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x64 TN F32+=E4M3*E5M2 (SS form: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // bound as a compile-time immediate ("n") in the asm below
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // bound as a compile-time immediate ("n") in the asm below
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // bound as a compile-time immediate ("n") right after e
+>
+struct GMMA_64x176x64_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D operand; the accumulators are bound read-write ("+f")
+  using ARegisters = uint64_t[1];  // desc_a of fma()
+  using ERegisters = uint32_t[1];  // e of fma()
+  using BRegisters = uint64_t[1];  // desc_b of fma()
+  using CRegisters = float[88];  // d00..d87 of fma()
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma.mma_async.sp instruction; d00..d87 are read and written through their references
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      uint32_t const& e,  // occupies the operand slot right after desc_b; presumably the sparsity metadata - TODO confirm against PTX wgmma.sp
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, given this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %92, 0;\n"  // p = (scale_D != 0); %92 is the scale_D operand
+      "wgmma.mma_async.sp.sync.aligned.m64n176k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      " %88,"  // desc_a
+      " %89,"  // desc_b
+      " %90, %91,"  // e, spsel
+      " p,    %93,  %94;\n"  // predicate from scale_D, then scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands %0..%87: read-write accumulators
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
+      :  "l"(desc_a),  // %88
+         "l"(desc_b),  // %89
+         "r"(e), "n"(int32_t(spsel)),  // %90, %91
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %92, %93, %94
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x64 TN F32+=E4M3*E5M2 (RS: A operand in four 32-bit registers, B operand as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand placed right after the register operand e
+>
+struct GMMA_64x176x64_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate destination: fma() reads and writes the same accumulator references
+  using ARegisters = uint32_t[4];  // matches fma()'s a00..a03
+  using ERegisters = uint32_t[1];  // matches fma()'s e
+  using BRegisters = uint64_t[1];  // matches fma()'s desc_b
+  using CRegisters = float[88];  // matches fma()'s d00..d87
+  // fma(): emits a setp plus one wgmma.mma_async.sp m64n176k64 (f32 += e4m3 * e5m2) instruction; without CUTE_ARCH_MMA_SM90A_ENABLED it expands to CUTE_INVALID_CONTROL_PATH instead.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      uint32_t const& e,  // bound to the operand slot after desc_b (the sparsity-metadata slot per the PTX wgmma.sp operand order)
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to predicate p below: p is true when scale_D != 0
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %95, 0;\n"  // %95 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n176k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      "{%88,  %89,  %90,  %91},"  // a00..a03
+      " %92,"  // desc_b
+      " %93, %94,"  // e, spsel
+      " p,    %96,  %97;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%87: read-write ("+f") float accumulators
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %88..%91
+         "l"(desc_b),  // %92
+         "r"(e), "n"(int32_t(spsel)),  // %93, %94
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %95, %96, %97
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x184x64 TN F16+=E4M3*E5M2 (SS: A and B operands both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand placed right after the register operand e
+>
+struct GMMA_64x184x64_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate destination: fma() reads and writes the same accumulator references
+  using ARegisters = uint64_t[1];  // matches fma()'s desc_a
+  using ERegisters = uint32_t[1];  // matches fma()'s e
+  using BRegisters = uint64_t[1];  // matches fma()'s desc_b
+  using CRegisters = uint32_t[46];  // matches fma()'s d00..d45 (32-bit registers; presumably packed f16 pairs -- TODO confirm)
+  // fma(): emits a setp plus one wgmma.mma_async.sp m64n184k64 (f16 += e4m3 * e5m2) instruction; without CUTE_ARCH_MMA_SM90A_ENABLED it expands to CUTE_INVALID_CONTROL_PATH instead.
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45,
+      uint32_t const& e,  // bound to the operand slot after desc_b (the sparsity-metadata slot per the PTX wgmma.sp operand order)
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to predicate p below: p is true when scale_D != 0
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %50, 0;\n"  // %50 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n184k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45},"
+      " %46,"  // desc_a
+      " %47,"  // desc_b
+      " %48, %49,"  // e, spsel
+      " p,   %51, %52;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%45: read-write ("+r") 32-bit accumulator registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45)
+      :  "l"(desc_a),  // %46
+         "l"(desc_b),  // %47
+         "r"(e), "n"(int32_t(spsel)),  // %48, %49
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %50, %51, %52
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x184x64 TN F16+=E4M3*E5M2 (RS: A operand in four 32-bit registers, B operand as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand placed right after the register operand e
+>
+struct GMMA_64x184x64_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate destination: fma() reads and writes the same accumulator references
+  using ARegisters = uint32_t[4];  // matches fma()'s a00..a03
+  using ERegisters = uint32_t[1];  // matches fma()'s e
+  using BRegisters = uint64_t[1];  // matches fma()'s desc_b
+  using CRegisters = uint32_t[46];  // matches fma()'s d00..d45 (32-bit registers; presumably packed f16 pairs -- TODO confirm)
+  // fma(): emits a setp plus one wgmma.mma_async.sp m64n184k64 (f16 += e4m3 * e5m2) instruction; without CUTE_ARCH_MMA_SM90A_ENABLED it expands to CUTE_INVALID_CONTROL_PATH instead.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45,
+      uint32_t const& e,  // bound to the operand slot after desc_b (the sparsity-metadata slot per the PTX wgmma.sp operand order)
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to predicate p below: p is true when scale_D != 0
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %53, 0;\n"  // %53 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n184k64.f16.e4m3.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45},"
+      "{%46, %47, %48, %49},"  // a00..a03
+      " %50,"  // desc_b
+      " %51, %52,"  // e, spsel
+      " p,   %54, %55;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%45: read-write ("+r") 32-bit accumulator registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %46..%49
+         "l"(desc_b),  // %50
+         "r"(e), "n"(int32_t(spsel)),  // %51, %52
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %53, %54, %55
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x184x64 TN F32+=E4M3*E5M2 (SS: A and B operands both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand placed right after the register operand e
+>
+struct GMMA_64x184x64_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate destination: fma() reads and writes the same accumulator references
+  using ARegisters = uint64_t[1];  // matches fma()'s desc_a
+  using ERegisters = uint32_t[1];  // matches fma()'s e
+  using BRegisters = uint64_t[1];  // matches fma()'s desc_b
+  using CRegisters = float[92];  // matches fma()'s d00..d91
+  // fma(): emits a setp plus one wgmma.mma_async.sp m64n184k64 (f32 += e4m3 * e5m2) instruction; without CUTE_ARCH_MMA_SM90A_ENABLED it expands to CUTE_INVALID_CONTROL_PATH instead.
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      uint32_t const& e,  // bound to the operand slot after desc_b (the sparsity-metadata slot per the PTX wgmma.sp operand order)
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to predicate p below: p is true when scale_D != 0
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %96, 0;\n"  // %96 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n184k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91},"
+      " %92,"  // desc_a
+      " %93,"  // desc_b
+      " %94, %95,"  // e, spsel
+      " p,    %97,  %98;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%91: read-write ("+f") float accumulators
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
+      :  "l"(desc_a),  // %92
+         "l"(desc_b),  // %93
+         "r"(e), "n"(int32_t(spsel)),  // %94, %95
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %96, %97, %98
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x184x64 TN F32+=E4M3*E5M2 (RS: A operand in four 32-bit registers, B operand as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand placed right after the register operand e
+>
+struct GMMA_64x184x64_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate destination: fma() reads and writes the same accumulator references
+  using ARegisters = uint32_t[4];  // matches fma()'s a00..a03
+  using ERegisters = uint32_t[1];  // matches fma()'s e
+  using BRegisters = uint64_t[1];  // matches fma()'s desc_b
+  using CRegisters = float[92];  // matches fma()'s d00..d91
+  // fma(): emits a setp plus one wgmma.mma_async.sp m64n184k64 (f32 += e4m3 * e5m2) instruction; without CUTE_ARCH_MMA_SM90A_ENABLED it expands to CUTE_INVALID_CONTROL_PATH instead.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      uint32_t const& e,  // bound to the operand slot after desc_b (the sparsity-metadata slot per the PTX wgmma.sp operand order)
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to predicate p below: p is true when scale_D != 0
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %99, 0;\n"  // %99 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n184k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91},"
+      "{%92,  %93,  %94,  %95},"  // a00..a03
+      " %96,"  // desc_b
+      " %97, %98,"  // e, spsel
+      " p,    %100, %101;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0..%91: read-write ("+f") float accumulators
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %92..%95
+         "l"(desc_b),  // %96
+         "r"(e), "n"(int32_t(spsel)),  // %97, %98
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %99, %100, %101
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x200x64 TN F16+=E4M3*E5M2 (SS: A and B operands both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand placed right after the register operand e
+>
+struct GMMA_64x200x64_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate destination: fma() reads and writes the same accumulator references
+  using ARegisters = uint64_t[1];  // matches fma()'s desc_a
+  using ERegisters = uint32_t[1];  // matches fma()'s e
+  using BRegisters = uint64_t[1];  // matches fma()'s desc_b
+  using CRegisters = uint32_t[50];  // matches fma()'s d00..d49 (32-bit registers; presumably packed f16 pairs -- TODO confirm)
+  // fma(): emits a setp plus one wgmma.mma_async.sp m64n200k64 (f16 += e4m3 * e5m2) instruction; without CUTE_ARCH_MMA_SM90A_ENABLED it expands to CUTE_INVALID_CONTROL_PATH instead.
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49,
+      uint32_t const& e,  // bound to the operand slot after desc_b (the sparsity-metadata slot per the PTX wgmma.sp operand order)
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to predicate p below: p is true when scale_D != 0
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %54, 0;\n"  // %54 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n200k64.f16.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49},"
+      " %50,"  // desc_a
+      " %51,"  // desc_b
+      " %52, %53,"  // e, spsel
+      " p,    %55,  %56;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%49: read-write ("+r") 32-bit accumulator registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49)
+      :  "l"(desc_a),  // %50
+         "l"(desc_b),  // %51
+         "r"(e), "n"(int32_t(spsel)),  // %52, %53
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %54, %55, %56
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x200x64 TN F16+=E4M3*E5M2 (RS: A operand in four 32-bit registers, B operand as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand placed right after the register operand e
+>
+struct GMMA_64x200x64_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate destination: fma() reads and writes the same accumulator references
+  using ARegisters = uint32_t[4];  // matches fma()'s a00..a03
+  using ERegisters = uint32_t[1];  // matches fma()'s e
+  using BRegisters = uint64_t[1];  // matches fma()'s desc_b
+  using CRegisters = uint32_t[50];  // matches fma()'s d00..d49 (32-bit registers; presumably packed f16 pairs -- TODO confirm)
+  // fma(): emits a setp plus one wgmma.mma_async.sp m64n200k64 (f16 += e4m3 * e5m2) instruction; without CUTE_ARCH_MMA_SM90A_ENABLED it expands to CUTE_INVALID_CONTROL_PATH instead.
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49,
+      uint32_t const& e,  // bound to the operand slot after desc_b (the sparsity-metadata slot per the PTX wgmma.sp operand order)
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to predicate p below: p is true when scale_D != 0
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %57, 0;\n"  // %57 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n200k64.f16.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49},"
+      "{%50,  %51,  %52,  %53},"  // a00..a03
+      " %54,"  // desc_b
+      " %55, %56,"  // e, spsel
+      " p,    %58,  %59;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%49: read-write ("+r") 32-bit accumulator registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %50..%53
+         "l"(desc_b),  // %54
+         "r"(e), "n"(int32_t(spsel)),  // %55, %56
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %57, %58, %59
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x200x64 TN F32+=E4M3*E5M2 -- SS form: A and B are both passed as 64-bit descriptors; wraps one wgmma.mma_async.sp m64n200k64 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as immediate ("n") operand %105
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as immediate ("n") operand %106
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as immediate ("n") operand %103, right after e
+>
+struct GMMA_64x200x64_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the accumulators in place
+  using ARegisters = uint64_t[1];  // matches the single desc_a argument of fma()
+  using ERegisters = uint32_t[1];  // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = float[100];  // matches the accumulator arguments d000..d099 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for operand A, bound with constraint "l" as %100
+      uint64_t const& desc_b,  // 64-bit descriptor for operand B, bound with constraint "l" as %101
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d099: read-write ("+f") accumulators, %0..%99
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      uint32_t const& e,  // 32-bit register operand %102; presumably the sparsity metadata word -- TODO confirm against the PTX wgmma.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value (%104) turned into predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %104, 0;\n"  // p = (scale_D != 0); NOTE(review): per PTX a false p should overwrite instead of accumulate -- confirm
+      "wgmma.mma_async.sp.sync.aligned.m64n200k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%99: accumulators d000..d099
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99},"
+      " %100,"  // desc_a
+      " %101,"  // desc_b
+      " %102, %103,"  // e, spsel
+      " p,    %105, %106;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // outputs: every accumulator is both read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
+      :  "l"(desc_a),  // inputs start at %100
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // macro not defined: no PTX is emitted, only the invalid-control-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x200x64 TN F32+=E4M3*E5M2 -- RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor; wraps one wgmma.mma_async.sp m64n200k64 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as immediate ("n") operand %108
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as immediate ("n") operand %109
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as immediate ("n") operand %106, right after e
+>
+struct GMMA_64x200x64_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the accumulators in place
+  using ARegisters = uint32_t[4];  // matches the arguments a000..a003 of fma()
+  using ERegisters = uint32_t[1];  // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = float[100];  // matches the accumulator arguments d000..d099 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // operand A as four "r" registers, %100..%103
+      uint64_t const& desc_b,  // 64-bit descriptor for operand B, bound with constraint "l" as %104
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d099: read-write ("+f") accumulators, %0..%99
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      uint32_t const& e,  // 32-bit register operand %105; presumably the sparsity metadata word -- TODO confirm against the PTX wgmma.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value (%107) turned into predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %107, 0;\n"  // p = (scale_D != 0); NOTE(review): per PTX a false p should overwrite instead of accumulate -- confirm
+      "wgmma.mma_async.sp.sync.aligned.m64n200k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%99: accumulators d000..d099
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99},"
+      "{%100, %101, %102, %103},"  // a000..a003
+      " %104,"  // desc_b
+      " %105, %106,"  // e, spsel
+      " p,    %108, %109;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // outputs: every accumulator is both read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // inputs start at %100
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // macro not defined: no PTX is emitted, only the invalid-control-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x64 TN F16+=E4M3*E5M2 -- SS form: A and B are both passed as 64-bit descriptors; wraps one wgmma.mma_async.sp m64n208k64 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as immediate ("n") operand %57
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as immediate ("n") operand %58
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as immediate ("n") operand %55, right after e
+>
+struct GMMA_64x208x64_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the accumulators in place
+  using ARegisters = uint64_t[1];  // matches the single desc_a argument of fma()
+  using ERegisters = uint32_t[1];  // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[52];  // matches d00..d51; presumably each 32-bit register packs two f16 values -- TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for operand A, bound with constraint "l" as %52
+      uint64_t const& desc_b,  // 64-bit descriptor for operand B, bound with constraint "l" as %53
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d51: read-write ("+r") accumulators, %0..%51
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t const& e,  // 32-bit register operand %54; presumably the sparsity metadata word -- TODO confirm against the PTX wgmma.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value (%56) turned into predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %56, 0;\n"  // p = (scale_D != 0); NOTE(review): per PTX a false p should overwrite instead of accumulate -- confirm
+      "wgmma.mma_async.sp.sync.aligned.m64n208k64.f16.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%51: accumulators d00..d51
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"
+      " %52,"  // desc_a
+      " %53,"  // desc_b
+      " %54, %55,"  // e, spsel
+      " p,    %57,  %58;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs: every accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
+      :  "l"(desc_a),  // inputs start at %52
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // macro not defined: no PTX is emitted, only the invalid-control-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x64 TN F16+=E4M3*E5M2 -- RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor; wraps one wgmma.mma_async.sp m64n208k64 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as immediate ("n") operand %60
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as immediate ("n") operand %61
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as immediate ("n") operand %58, right after e
+>
+struct GMMA_64x208x64_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the accumulators in place
+  using ARegisters = uint32_t[4];  // matches the arguments a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[52];  // matches d00..d51; presumably each 32-bit register packs two f16 values -- TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // operand A as four "r" registers, %52..%55
+      uint64_t const& desc_b,  // 64-bit descriptor for operand B, bound with constraint "l" as %56
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d51: read-write ("+r") accumulators, %0..%51
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t const& e,  // 32-bit register operand %57; presumably the sparsity metadata word -- TODO confirm against the PTX wgmma.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value (%59) turned into predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %59, 0;\n"  // p = (scale_D != 0); NOTE(review): per PTX a false p should overwrite instead of accumulate -- confirm
+      "wgmma.mma_async.sp.sync.aligned.m64n208k64.f16.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%51: accumulators d00..d51
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"
+      "{%52,  %53,  %54,  %55},"  // a00..a03
+      " %56,"  // desc_b
+      " %57, %58,"  // e, spsel
+      " p,    %60,  %61;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs: every accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs start at %52
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // macro not defined: no PTX is emitted, only the invalid-control-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x64 TN F32+=E4M3*E5M2 -- SS form: A and B are both passed as 64-bit descriptors; wraps one wgmma.mma_async.sp m64n208k64 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as immediate ("n") operand %109
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as immediate ("n") operand %110
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as immediate ("n") operand %107, right after e
+>
+struct GMMA_64x208x64_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the accumulators in place
+  using ARegisters = uint64_t[1];  // matches the single desc_a argument of fma()
+  using ERegisters = uint32_t[1];  // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = float[104];  // matches the accumulator arguments d000..d103 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for operand A, bound with constraint "l" as %104
+      uint64_t const& desc_b,  // 64-bit descriptor for operand B, bound with constraint "l" as %105
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d103: read-write ("+f") accumulators, %0..%103
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      uint32_t const& e,  // 32-bit register operand %106; presumably the sparsity metadata word -- TODO confirm against the PTX wgmma.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value (%108) turned into predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %108, 0;\n"  // p = (scale_D != 0); NOTE(review): per PTX a false p should overwrite instead of accumulate -- confirm
+      "wgmma.mma_async.sp.sync.aligned.m64n208k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%103: accumulators d000..d103
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      " %104,"  // desc_a
+      " %105,"  // desc_b
+      " %106, %107,"  // e, spsel
+      " p,    %109, %110;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // outputs: every accumulator is both read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
+      :  "l"(desc_a),  // inputs start at %104
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // macro not defined: no PTX is emitted, only the invalid-control-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x64 TN F32+=E4M3*E5M2 -- RS form: A is passed in four 32-bit registers, B as a 64-bit descriptor; wraps one wgmma.mma_async.sp m64n208k64 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as immediate ("n") operand %112
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as immediate ("n") operand %113
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as immediate ("n") operand %110, right after e
+>
+struct GMMA_64x208x64_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: fma() updates the accumulators in place
+  using ARegisters = uint32_t[4];  // matches the arguments a000..a003 of fma()
+  using ERegisters = uint32_t[1];  // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = float[104];  // matches the accumulator arguments d000..d103 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // operand A as four "r" registers, %104..%107
+      uint64_t const& desc_b,  // 64-bit descriptor for operand B, bound with constraint "l" as %108
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d103: read-write ("+f") accumulators, %0..%103
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      uint32_t const& e,  // 32-bit register operand %109; presumably the sparsity metadata word -- TODO confirm against the PTX wgmma.sp docs
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value (%111) turned into predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %111, 0;\n"  // p = (scale_D != 0); NOTE(review): per PTX a false p should overwrite instead of accumulate -- confirm
+      "wgmma.mma_async.sp.sync.aligned.m64n208k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%103: accumulators d000..d103
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      "{%104, %105, %106, %107},"  // a000..a003
+      " %108,"  // desc_b
+      " %109, %110,"  // e, spsel
+      " p,    %112, %113;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // outputs: every accumulator is both read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // inputs start at %104
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // macro not defined: no PTX is emitted, only the invalid-control-path report below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x216x64 TN F16+=E4M3*E5M2 -- SS form: A and B are both passed as 64-bit descriptors; wraps the m64n216k64 .f16.e4m3.e5m2 variant of wgmma.mma_async.sp.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,    // forwarded to the asm as a compile-time immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,    // forwarded to the asm as a compile-time immediate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as a compile-time immediate, right after e
+>
+struct GMMA_64x216x64_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];   // same element type and count as fma's desc_a
+  using ERegisters = uint32_t[1];   // same element type and count as fma's e
+  using BRegisters = uint64_t[1];   // same element type and count as fma's desc_b
+  using CRegisters = uint32_t[54];  // same element type and count as fma's d00..d53
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined; d00..d53 are read-write operands
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX wgmma.sp operand list
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; the asm derives predicate p = (scale_D != 0)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): presumably a logging hook; defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"            // declare predicate register p
+      "setp.ne.b32 p, %58, 0;\n"   // p = (scale_D != 0); operand 58 is scale_D
+      "wgmma.mma_async.sp.sync.aligned.m64n216k64.f16.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53},"   // operands 0-53: d00..d53
+      " %54,"                                   // desc_a
+      " %55,"                                   // desc_b
+      " %56, %57,"                              // e, spsel
+      " p,    %59,  %60;\n"                     // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53)
+      :  "l"(desc_a),  // input operands are numbered after the 54 accumulators, starting at 54
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x216x64 TN F16+=E4M3*E5M2 -- RS form: A is passed in four 32-bit registers and B as a 64-bit descriptor; wraps the m64n216k64 .f16.e4m3.e5m2 variant of wgmma.mma_async.sp.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,    // forwarded to the asm as a compile-time immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,    // forwarded to the asm as a compile-time immediate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as a compile-time immediate, right after e
+>
+struct GMMA_64x216x64_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];   // same element type and count as fma's a00..a03
+  using ERegisters = uint32_t[1];   // same element type and count as fma's e
+  using BRegisters = uint64_t[1];   // same element type and count as fma's desc_b
+  using CRegisters = uint32_t[54];  // same element type and count as fma's d00..d53
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined; d00..d53 are read-write operands
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX wgmma.sp operand list
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; the asm derives predicate p = (scale_D != 0)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably a logging hook; defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"            // declare predicate register p
+      "setp.ne.b32 p, %61, 0;\n"   // p = (scale_D != 0); operand 61 is scale_D
+      "wgmma.mma_async.sp.sync.aligned.m64n216k64.f16.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53},"   // operands 0-53: d00..d53
+      "{%54,  %55,  %56,  %57},"                // a00..a03
+      " %58,"                                   // desc_b
+      " %59, %60,"                              // e, spsel
+      " p,    %62,  %63;\n"                     // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands are numbered after the 54 accumulators, starting at 54
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x216x64 TN F32+=E4M3*E5M2 -- SS form: A and B are both passed as 64-bit descriptors; wraps the m64n216k64 .f32.e4m3.e5m2 variant of wgmma.mma_async.sp.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,    // forwarded to the asm as a compile-time immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,    // forwarded to the asm as a compile-time immediate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as a compile-time immediate, right after e
+>
+struct GMMA_64x216x64_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];   // same element type and count as fma's desc_a
+  using ERegisters = uint32_t[1];   // same element type and count as fma's e
+  using BRegisters = uint64_t[1];   // same element type and count as fma's desc_b
+  using CRegisters = float[108];    // same element type and count as fma's d000..d107
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined; d000..d107 are read-write operands
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX wgmma.sp operand list
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; the asm derives predicate p = (scale_D != 0)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): presumably a logging hook; defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"             // declare predicate register p
+      "setp.ne.b32 p, %112, 0;\n"   // p = (scale_D != 0); operand 112 is scale_D
+      "wgmma.mma_async.sp.sync.aligned.m64n216k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107},"   // operands 0-107: d000..d107
+      " %108,"                      // desc_a
+      " %109,"                      // desc_b
+      " %110, %111,"                // e, spsel
+      " p,    %113, %114;\n"        // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
+      :  "l"(desc_a),  // input operands are numbered after the 108 accumulators, starting at 108
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x216x64 TN F32+=E4M3*E5M2 -- RS form: A is passed in four 32-bit registers and B as a 64-bit descriptor; wraps the m64n216k64 .f32.e4m3.e5m2 variant of wgmma.mma_async.sp.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,    // forwarded to the asm as a compile-time immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,    // forwarded to the asm as a compile-time immediate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as a compile-time immediate, right after e
+>
+struct GMMA_64x216x64_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];   // same element type and count as fma's a000..a003
+  using ERegisters = uint32_t[1];   // same element type and count as fma's e
+  using BRegisters = uint64_t[1];   // same element type and count as fma's desc_b
+  using CRegisters = float[108];    // same element type and count as fma's d000..d107
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined; d000..d107 are read-write operands
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX wgmma.sp operand list
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; the asm derives predicate p = (scale_D != 0)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably a logging hook; defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"             // declare predicate register p
+      "setp.ne.b32 p, %115, 0;\n"   // p = (scale_D != 0); operand 115 is scale_D
+      "wgmma.mma_async.sp.sync.aligned.m64n216k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107},"   // operands 0-107: d000..d107
+      "{%108, %109, %110, %111},"   // a000..a003
+      " %112,"                      // desc_b
+      " %113, %114,"                // e, spsel
+      " p,    %116, %117;\n"        // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // input operands are numbered after the 108 accumulators, starting at 108
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x64 TN F16+=E4M3*E5M2 -- SS form: A and B are both passed as 64-bit descriptors; wraps the m64n224k64 .f16.e4m3.e5m2 variant of wgmma.mma_async.sp.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,    // forwarded to the asm as a compile-time immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,    // forwarded to the asm as a compile-time immediate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as a compile-time immediate, right after e
+>
+struct GMMA_64x224x64_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];   // same element type and count as fma's desc_a
+  using ERegisters = uint32_t[1];   // same element type and count as fma's e
+  using BRegisters = uint64_t[1];   // same element type and count as fma's desc_b
+  using CRegisters = uint32_t[56];  // same element type and count as fma's d00..d55
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined; d00..d55 are read-write operands
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX wgmma.sp operand list
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; the asm derives predicate p = (scale_D != 0)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): presumably a logging hook; defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"            // declare predicate register p
+      "setp.ne.b32 p, %60, 0;\n"   // p = (scale_D != 0); operand 60 is scale_D
+      "wgmma.mma_async.sp.sync.aligned.m64n224k64.f16.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"   // operands 0-55: d00..d55
+      " %56,"                                               // desc_a
+      " %57,"                                               // desc_b
+      " %58, %59,"                                          // e, spsel
+      " p,    %61,  %62;\n"                                 // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "l"(desc_a),  // input operands are numbered after the 56 accumulators, starting at 56
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x64 TN F16+=E4M3*E5M2 -- RS form: A is passed in four 32-bit registers and B as a 64-bit descriptor; wraps the m64n224k64 .f16.e4m3.e5m2 variant of wgmma.mma_async.sp.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,    // forwarded to the asm as a compile-time immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,    // forwarded to the asm as a compile-time immediate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as a compile-time immediate, right after e
+>
+struct GMMA_64x224x64_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];   // same element type and count as fma's a00..a03
+  using ERegisters = uint32_t[1];   // same element type and count as fma's e
+  using BRegisters = uint64_t[1];   // same element type and count as fma's desc_b
+  using CRegisters = uint32_t[56];  // same element type and count as fma's d00..d55
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined; d00..d55 are read-write operands
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX wgmma.sp operand list
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; the asm derives predicate p = (scale_D != 0)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably a logging hook; defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"            // declare predicate register p
+      "setp.ne.b32 p, %63, 0;\n"   // p = (scale_D != 0); operand 63 is scale_D
+      "wgmma.mma_async.sp.sync.aligned.m64n224k64.f16.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"   // operands 0-55: d00..d55
+      "{%56,  %57,  %58,  %59},"                            // a00..a03
+      " %60,"                                               // desc_b
+      " %61, %62,"                                          // e, spsel
+      " p,    %64,  %65;\n"                                 // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands are numbered after the 56 accumulators, starting at 56
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x64 TN F32+=E4M3*E5M2 -- SS form: A and B are both passed as 64-bit descriptors; wraps the m64n224k64 .f32.e4m3.e5m2 variant of wgmma.mma_async.sp.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,    // forwarded to the asm as a compile-time immediate
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,    // forwarded to the asm as a compile-time immediate
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as a compile-time immediate, right after e
+>
+struct GMMA_64x224x64_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];   // same element type and count as fma's desc_a
+  using ERegisters = uint32_t[1];   // same element type and count as fma's e
+  using BRegisters = uint64_t[1];   // same element type and count as fma's desc_b
+  using CRegisters = float[112];    // same element type and count as fma's d000..d111
+
+  CUTE_HOST_DEVICE static void  // emits one wgmma instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined; d000..d111 are read-write operands
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word -- confirm against the PTX wgmma.sp operand list
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; the asm derives predicate p = (scale_D != 0)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): presumably a logging hook; defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"             // declare predicate register p
+      "setp.ne.b32 p, %116, 0;\n"   // p = (scale_D != 0); operand 116 is scale_D
+      "wgmma.mma_async.sp.sync.aligned.m64n224k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"   // operands 0-111: d000..d111
+      " %112,"                                              // desc_a
+      " %113,"                                              // desc_b
+      " %114, %115,"                                        // e, spsel
+      " p,    %117, %118;\n"                                // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
+      :  "l"(desc_a),  // input operands are numbered after the 112 accumulators, starting at 112
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x64 TN F32+=E4M3*E5M2 -- RS form: A arrives in four 32-bit registers, B as the 64-bit desc_b; wraps PTX wgmma.mma_async.sp m64n224k64.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate (n constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate (n constraint)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate emitted right after e in the operand list
+>
+struct GMMA_64x224x64_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means D aliases the C accumulators (they are bound read-write below) -- confirm against the CuTe MMA traits
+  using ARegisters = uint32_t[4];  // same count as the a000..a003 arguments of fma()
+  using ERegisters = uint32_t[1];  // same count as the single e argument of fma()
+  using BRegisters = uint64_t[1];  // same count as the single desc_b argument of fma()
+  using CRegisters = float[112];  // same count as the d000..d111 arguments of fma()
+
+  CUTE_HOST_DEVICE static void  // emits a single inline-asm wgmma; the d references are bound as read-write operands
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A operand, four 32-bit registers
+      uint64_t const& desc_b,  // B operand, 64-bit (l constraint)
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d111: accumulators, read and written by the instruction
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      uint32_t const& e,  // 32-bit operand placed between B and spsel (NOTE(review): presumably the sparsity metadata -- confirm against the PTX ISA)
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the call-site line and desc_b to the synclog hook before the instruction
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %119, 0;\n"  // p = (scale_D != 0); operand 119 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n224k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // operands 0..111: d000..d111
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      "{%112, %113, %114, %115},"  // A: a000..a003
+      " %116,"  // B: desc_b
+      " %117, %118,"  // e, then the spsel immediate
+      " p,    %120, %121;\n"  // predicate from scale_D, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // output list: every accumulator is read-write (+f)
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // operands 112..115
+         "l"(desc_b),  // operand 116
+         "r"(e), "n"(int32_t(spsel)),  // operands 117, 118
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands 119, 120, 121
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x232x64 TN F16+=E4M3*E5M2 -- SS form: A and B both arrive as 64-bit operands (desc_a, desc_b); wraps PTX wgmma.mma_async.sp m64n232k64.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate (n constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate (n constraint)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate emitted right after e in the operand list
+>
+struct GMMA_64x232x64_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means D aliases the C accumulators (they are bound read-write below) -- confirm against the CuTe MMA traits
+  using ARegisters = uint64_t[1];  // same count as the single desc_a argument of fma()
+  using ERegisters = uint32_t[1];  // same count as the single e argument of fma()
+  using BRegisters = uint64_t[1];  // same count as the single desc_b argument of fma()
+  using CRegisters = uint32_t[58];  // same count as d00..d57; NOTE(review): presumably two f16 values packed per 32-bit register -- confirm
+
+  CUTE_HOST_DEVICE static void  // emits a single inline-asm wgmma; the d references are bound as read-write operands
+  fma(uint64_t const& desc_a,  // A operand, 64-bit (l constraint)
+      uint64_t const& desc_b,  // B operand, 64-bit (l constraint)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d57: accumulators, read and written by the instruction
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57,
+      uint32_t const& e,  // 32-bit operand placed between B and spsel (NOTE(review): presumably the sparsity metadata -- confirm against the PTX ISA)
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the call-site line and both 64-bit operands to the synclog hook before the instruction
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %62, 0;\n"  // p = (scale_D != 0); operand 62 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n232k64.f16.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // operands 0..57: d00..d57
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57},"
+      " %58,"  // A: desc_a
+      " %59,"  // B: desc_b
+      " %60, %61,"  // e, then the spsel immediate
+      " p,    %63,  %64;\n"  // predicate from scale_D, then the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // output list: every accumulator is read-write (+r)
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57)
+      :  "l"(desc_a),  // operand 58
+         "l"(desc_b),  // operand 59
+         "r"(e), "n"(int32_t(spsel)),  // operands 60, 61
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands 62, 63, 64
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x232x64 TN F16+=E4M3*E5M2 -- RS form: A arrives in four 32-bit registers, B as the 64-bit desc_b; wraps PTX wgmma.mma_async.sp m64n232k64.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate (n constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate (n constraint)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate emitted right after e in the operand list
+>
+struct GMMA_64x232x64_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means D aliases the C accumulators (they are bound read-write below) -- confirm against the CuTe MMA traits
+  using ARegisters = uint32_t[4];  // same count as the a00..a03 arguments of fma()
+  using ERegisters = uint32_t[1];  // same count as the single e argument of fma()
+  using BRegisters = uint64_t[1];  // same count as the single desc_b argument of fma()
+  using CRegisters = uint32_t[58];  // same count as d00..d57; NOTE(review): presumably two f16 values packed per 32-bit register -- confirm
+
+  CUTE_HOST_DEVICE static void  // emits a single inline-asm wgmma; the d references are bound as read-write operands
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand, four 32-bit registers
+      uint64_t const& desc_b,  // B operand, 64-bit (l constraint)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d57: accumulators, read and written by the instruction
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57,
+      uint32_t const& e,  // 32-bit operand placed between B and spsel (NOTE(review): presumably the sparsity metadata -- confirm against the PTX ISA)
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the call-site line and desc_b to the synclog hook before the instruction
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %65, 0;\n"  // p = (scale_D != 0); operand 65 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n232k64.f16.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // operands 0..57: d00..d57
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57},"
+      "{%58,  %59,  %60,  %61},"  // A: a00..a03
+      " %62,"  // B: desc_b
+      " %63, %64,"  // e, then the spsel immediate
+      " p,    %66,  %67;\n"  // predicate from scale_D, then the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // output list: every accumulator is read-write (+r)
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // operands 58..61
+         "l"(desc_b),  // operand 62
+         "r"(e), "n"(int32_t(spsel)),  // operands 63, 64
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands 65, 66, 67
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x232x64 TN F32+=E4M3*E5M2 -- SS form: A and B both arrive as 64-bit operands (desc_a, desc_b); wraps PTX wgmma.mma_async.sp m64n232k64.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate (n constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate (n constraint)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate emitted right after e in the operand list
+>
+struct GMMA_64x232x64_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means D aliases the C accumulators (they are bound read-write below) -- confirm against the CuTe MMA traits
+  using ARegisters = uint64_t[1];  // same count as the single desc_a argument of fma()
+  using ERegisters = uint32_t[1];  // same count as the single e argument of fma()
+  using BRegisters = uint64_t[1];  // same count as the single desc_b argument of fma()
+  using CRegisters = float[116];  // same count as the d000..d115 arguments of fma()
+
+  CUTE_HOST_DEVICE static void  // emits a single inline-asm wgmma; the d references are bound as read-write operands
+  fma(uint64_t const& desc_a,  // A operand, 64-bit (l constraint)
+      uint64_t const& desc_b,  // B operand, 64-bit (l constraint)
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d115: accumulators, read and written by the instruction
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      uint32_t const& e,  // 32-bit operand placed between B and spsel (NOTE(review): presumably the sparsity metadata -- confirm against the PTX ISA)
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the call-site line and both 64-bit operands to the synclog hook before the instruction
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %120, 0;\n"  // p = (scale_D != 0); operand 120 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n232k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // operands 0..115: d000..d115
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115},"
+      " %116,"  // A: desc_a
+      " %117,"  // B: desc_b
+      " %118, %119,"  // e, then the spsel immediate
+      " p,    %121, %122;\n"  // predicate from scale_D, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // output list: every accumulator is read-write (+f)
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
+      :  "l"(desc_a),  // operand 116
+         "l"(desc_b),  // operand 117
+         "r"(e), "n"(int32_t(spsel)),  // operands 118, 119
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands 120, 121, 122
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x232x64 TN F32+=E4M3*E5M2 -- RS form: A arrives in four 32-bit registers, B as the 64-bit desc_b; wraps PTX wgmma.mma_async.sp m64n232k64.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate (n constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate (n constraint)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate emitted right after e in the operand list
+>
+struct GMMA_64x232x64_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means D aliases the C accumulators (they are bound read-write below) -- confirm against the CuTe MMA traits
+  using ARegisters = uint32_t[4];  // same count as the a000..a003 arguments of fma()
+  using ERegisters = uint32_t[1];  // same count as the single e argument of fma()
+  using BRegisters = uint64_t[1];  // same count as the single desc_b argument of fma()
+  using CRegisters = float[116];  // same count as the d000..d115 arguments of fma()
+
+  CUTE_HOST_DEVICE static void  // emits a single inline-asm wgmma; the d references are bound as read-write operands
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A operand, four 32-bit registers
+      uint64_t const& desc_b,  // B operand, 64-bit (l constraint)
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d115: accumulators, read and written by the instruction
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      uint32_t const& e,  // 32-bit operand placed between B and spsel (NOTE(review): presumably the sparsity metadata -- confirm against the PTX ISA)
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the call-site line and desc_b to the synclog hook before the instruction
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %123, 0;\n"  // p = (scale_D != 0); operand 123 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n232k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // operands 0..115: d000..d115
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115},"
+      "{%116, %117, %118, %119},"  // A: a000..a003
+      " %120,"  // B: desc_b
+      " %121, %122,"  // e, then the spsel immediate
+      " p,    %124, %125;\n"  // predicate from scale_D, then the scaleA and scaleB immediates
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // output list: every accumulator is read-write (+f)
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // operands 116..119
+         "l"(desc_b),  // operand 120
+         "r"(e), "n"(int32_t(spsel)),  // operands 121, 122
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands 123, 124, 125
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x64 TN F16+=E4M3*E5M2 -- SS form: A and B both arrive as 64-bit operands (desc_a, desc_b); wraps PTX wgmma.mma_async.sp m64n240k64.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate (n constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as a compile-time immediate (n constraint)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate emitted right after e in the operand list
+>
+struct GMMA_64x240x64_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means D aliases the C accumulators (they are bound read-write below) -- confirm against the CuTe MMA traits
+  using ARegisters = uint64_t[1];  // same count as the single desc_a argument of fma()
+  using ERegisters = uint32_t[1];  // same count as the single e argument of fma()
+  using BRegisters = uint64_t[1];  // same count as the single desc_b argument of fma()
+  using CRegisters = uint32_t[60];  // same count as d00..d59; NOTE(review): presumably two f16 values packed per 32-bit register -- confirm
+
+  CUTE_HOST_DEVICE static void  // emits a single inline-asm wgmma; the d references are bound as read-write operands
+  fma(uint64_t const& desc_a,  // A operand, 64-bit (l constraint)
+      uint64_t const& desc_b,  // B operand, 64-bit (l constraint)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d59: accumulators, read and written by the instruction
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t const& e,  // 32-bit operand placed between B and spsel (NOTE(review): presumably the sparsity metadata -- confirm against the PTX ISA)
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 in the asm to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the call-site line and both 64-bit operands to the synclog hook before the instruction
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %64, 0;\n"  // p = (scale_D != 0); operand 64 is int32_t(scale_D)
+      "wgmma.mma_async.sp.sync.aligned.m64n240k64.f16.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // operands 0..59: d00..d59
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"
+      " %60,"  // A: desc_a
+      " %61,"  // B: desc_b
+      " %62, %63,"  // e, then the spsel immediate
+      " p,    %65,  %66;\n"  // predicate from scale_D, then the scaleA and scaleB immediates
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // output list: every accumulator is read-write (+r)
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
+      :  "l"(desc_a),  // operand 60
+         "l"(desc_b),  // operand 61
+         "r"(e), "n"(int32_t(spsel)),  // operands 62, 63
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // operands 64, 65, 66
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x64 TN F16+=E4M3*E5M2 (RS form: A passed as four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate, bound as asm operand %68
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate, bound as asm operand %69
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, bound as asm operand %66 (follows e)
+>
+struct GMMA_64x240x64_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D registers: the d* arguments are read-write and updated in place
+  using ARegisters = uint32_t[4];  // matches a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // matches e of fma(); presumably the sparsity metadata word -- confirm against the PTX ISA
+  using BRegisters = uint64_t[1];  // matches desc_b of fma()
+  using CRegisters = uint32_t[60];  // matches d00..d59 of fma()
+
+  CUTE_HOST_DEVICE static void  // issues a single wgmma.mma_async.sp m64n240k64 instruction through inline PTX
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t const& e,  // 32-bit input passed to the instruction together with spsel
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %67, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n240k64.f16.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%59: d00..d59
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"
+      "{%60,  %61,  %62,  %63},"  // a00..a03
+      " %64,"  // desc_b
+      " %65, %66,"  // e, spsel
+      " p,    %68,  %69;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %60-%63
+         "l"(desc_b),  // %64, 64-bit operand
+         "r"(e), "n"(int32_t(spsel)),  // %65 (register), %66 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %67 (register), %68, %69 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report the invalid path instead of emitting PTX
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x64 TN F32+=E4M3*E5M2 (SS form: A and B each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate, bound as asm operand %125
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate, bound as asm operand %126
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, bound as asm operand %123 (follows e)
+>
+struct GMMA_64x240x64_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D registers: the d* arguments are read-write and updated in place
+  using ARegisters = uint64_t[1];  // matches desc_a of fma()
+  using ERegisters = uint32_t[1];  // matches e of fma(); presumably the sparsity metadata word -- confirm against the PTX ISA
+  using BRegisters = uint64_t[1];  // matches desc_b of fma()
+  using CRegisters = float[120];  // matches d000..d119 of fma()
+
+  CUTE_HOST_DEVICE static void  // issues a single wgmma.mma_async.sp m64n240k64 instruction through inline PTX
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      uint32_t const& e,  // 32-bit input passed to the instruction together with spsel
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %124, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n240k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%119: d000..d119
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      " %120,"  // desc_a
+      " %121,"  // desc_b
+      " %122, %123,"  // e, spsel
+      " p,    %125, %126;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // "+f": each float accumulator is both read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
+      :  "l"(desc_a),  // %120, 64-bit operand
+         "l"(desc_b),  // %121, 64-bit operand
+         "r"(e), "n"(int32_t(spsel)),  // %122 (register), %123 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %124 (register), %125, %126 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report the invalid path instead of emitting PTX
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x64 TN F32+=E4M3*E5M2 (RS form: A passed as four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate, bound as asm operand %128
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate, bound as asm operand %129
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, bound as asm operand %126 (follows e)
+>
+struct GMMA_64x240x64_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D registers: the d* arguments are read-write and updated in place
+  using ARegisters = uint32_t[4];  // matches a000..a003 of fma()
+  using ERegisters = uint32_t[1];  // matches e of fma(); presumably the sparsity metadata word -- confirm against the PTX ISA
+  using BRegisters = uint64_t[1];  // matches desc_b of fma()
+  using CRegisters = float[120];  // matches d000..d119 of fma()
+
+  CUTE_HOST_DEVICE static void  // issues a single wgmma.mma_async.sp m64n240k64 instruction through inline PTX
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      uint32_t const& e,  // 32-bit input passed to the instruction together with spsel
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %127, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n240k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%119: d000..d119
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      "{%120, %121, %122, %123},"  // a000..a003
+      " %124,"  // desc_b
+      " %125, %126,"  // e, spsel
+      " p,    %128, %129;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // "+f": each float accumulator is both read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %120-%123
+         "l"(desc_b),  // %124, 64-bit operand
+         "r"(e), "n"(int32_t(spsel)),  // %125 (register), %126 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %127 (register), %128, %129 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report the invalid path instead of emitting PTX
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x248x64 TN F16+=E4M3*E5M2 (SS form: A and B each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate, bound as asm operand %67
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate, bound as asm operand %68
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, bound as asm operand %65 (follows e)
+>
+struct GMMA_64x248x64_F16E4M3E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D registers: the d* arguments are read-write and updated in place
+  using ARegisters = uint64_t[1];  // matches desc_a of fma()
+  using ERegisters = uint32_t[1];  // matches e of fma(); presumably the sparsity metadata word -- confirm against the PTX ISA
+  using BRegisters = uint64_t[1];  // matches desc_b of fma()
+  using CRegisters = uint32_t[62];  // matches d00..d61 of fma()
+
+  CUTE_HOST_DEVICE static void  // issues a single wgmma.mma_async.sp m64n248k64 instruction through inline PTX
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61,
+      uint32_t const& e,  // 32-bit input passed to the instruction together with spsel
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %66, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n248k64.f16.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%61: d00..d61
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61},"
+      " %62,"  // desc_a
+      " %63,"  // desc_b
+      " %64, %65,"  // e, spsel
+      " p,    %67,  %68;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61)
+      :  "l"(desc_a),  // %62, 64-bit operand
+         "l"(desc_b),  // %63, 64-bit operand
+         "r"(e), "n"(int32_t(spsel)),  // %64 (register), %65 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %66 (register), %67, %68 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report the invalid path instead of emitting PTX
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x248x64 TN F16+=E4M3*E5M2 (RS form: A passed as four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate, bound as asm operand %70
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate, bound as asm operand %71
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, bound as asm operand %68 (follows e)
+>
+struct GMMA_64x248x64_F16E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D registers: the d* arguments are read-write and updated in place
+  using ARegisters = uint32_t[4];  // matches a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // matches e of fma(); presumably the sparsity metadata word -- confirm against the PTX ISA
+  using BRegisters = uint64_t[1];  // matches desc_b of fma()
+  using CRegisters = uint32_t[62];  // matches d00..d61 of fma()
+
+  CUTE_HOST_DEVICE static void  // issues a single wgmma.mma_async.sp m64n248k64 instruction through inline PTX
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61,
+      uint32_t const& e,  // 32-bit input passed to the instruction together with spsel
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the PTX path is only compiled when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %69, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n248k64.f16.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%61: d00..d61
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61},"
+      "{%62,  %63,  %64,  %65},"  // a00..a03
+      " %66,"  // desc_b
+      " %67, %68,"  // e, spsel
+      " p,    %70,  %71;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %62-%65
+         "l"(desc_b),  // %66, 64-bit operand
+         "r"(e), "n"(int32_t(spsel)),  // %67 (register), %68 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %69 (register), %70, %71 (immediates)
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report the invalid path instead of emitting PTX
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x248x64 TN F32+=E4M3*E5M2
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x248x64_F32E4M3E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[124];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %128, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n248k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123},"
+      " %124,"
+      " %125,"
+      " %126, %127,"
+      " p,    %129, %130;\n"
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x248x64 TN F32+=E4M3*E5M2 (RS variant: A is four 32-bit register operands, B is one 64-bit descriptor operand)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate ("n") operand passed right after e
+>
+struct GMMA_64x248x64_F32E4M3E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate destination list: the d* accumulators are read-modify-write
+  using ARegisters = uint32_t[4];  // a000..a003
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata operand -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[124];  // d000..d123, bound with "+f" constraints
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp instruction through inline PTX
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; drives predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook, defined elsewhere in cutlass::arch
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %131, 0;\n"  // %131 = int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n248k64.f32.e4m3.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123},"  // %0..%123 = d000..d123
+      "{%124, %125, %126, %127},"  // a000..a003
+      " %128,"  // desc_b
+      " %129, %130,"  // e, spsel
+      " p,    %132, %133;\n"  // predicate p, then scaleA and scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A MMA path is compiled out
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x24x64 TN F16+=E5M2*E4M3 (SS variant: A and B are each one 64-bit descriptor operand)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate ("n") operand passed right after e
+>
+struct GMMA_64x24x64_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate destination list: the d* accumulators are read-modify-write
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata operand -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[6];  // d0..d5 as 32-bit registers (presumably packed f16 pairs -- confirm)
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp instruction through inline PTX
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; drives predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, defined elsewhere in cutlass::arch
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %10, 0;\n"  // %10 = int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n24k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5},"  // d0..d5
+      " %6,"  // desc_a
+      " %7,"  // desc_b
+      " %8, %9,"  // e, spsel
+      " p,   %11, %12;\n"  // predicate p, then scaleA and scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A MMA path is compiled out
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x24x64 TN F16+=E5M2*E4M3 (RS variant: A is four 32-bit register operands, B is one 64-bit descriptor operand)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate ("n") operand passed right after e
+>
+struct GMMA_64x24x64_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate destination list: the d* accumulators are read-modify-write
+  using ARegisters = uint32_t[4];  // a0..a3
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata operand -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[6];  // d0..d5 as 32-bit registers (presumably packed f16 pairs -- confirm)
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp instruction through inline PTX
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
+      uint64_t const& desc_b,
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; drives predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook, defined elsewhere in cutlass::arch
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %13, 0;\n"  // %13 = int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n24k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5},"  // d0..d5
+      "{%6,  %7,  %8,  %9},"  // a0..a3
+      " %10,"  // desc_b
+      " %11, %12,"  // e, spsel
+      " p,   %14, %15;\n"  // predicate p, then scaleA and scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
+        "+r"(d4), "+r"(d5)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A MMA path is compiled out
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x24x64 TN F32+=E5M2*E4M3 (SS variant: A and B are each one 64-bit descriptor operand)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate ("n") operand passed right after e
+>
+struct GMMA_64x24x64_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate destination list: the d* accumulators are read-modify-write
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata operand -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[12];  // d00..d11, bound with "+f" constraints
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp instruction through inline PTX
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; drives predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, defined elsewhere in cutlass::arch
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %16, 0;\n"  // %16 = int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n24k64.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"  // %0..%11 = d00..d11
+      " %12,"  // desc_a
+      " %13,"  // desc_b
+      " %14, %15,"  // e, spsel
+      " p,   %17, %18;\n"  // predicate p, then scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A MMA path is compiled out
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x24x64 TN F32+=E5M2*E4M3 (RS variant: A is four 32-bit register operands, B is one 64-bit descriptor operand)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate ("n") operand passed right after e
+>
+struct GMMA_64x24x64_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate destination list: the d* accumulators are read-modify-write
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata operand -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[12];  // d00..d11, bound with "+f" constraints
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp instruction through inline PTX
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; drives predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook, defined elsewhere in cutlass::arch
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %19, 0;\n"  // %19 = int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n24k64.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"  // %0..%11 = d00..d11
+      "{%12, %13, %14, %15},"  // a00..a03
+      " %16,"  // desc_b
+      " %17, %18,"  // e, spsel
+      " p,   %20, %21;\n"  // predicate p, then scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A MMA path is compiled out
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x40x64 TN F16+=E5M2*E4M3 (SS variant: A and B are each one 64-bit descriptor operand)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate ("n") operand passed right after e
+>
+struct GMMA_64x40x64_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate destination list: the d* accumulators are read-modify-write
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata operand -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[10];  // d00..d09 as 32-bit registers (presumably packed f16 pairs -- confirm)
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp instruction through inline PTX
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; drives predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, defined elsewhere in cutlass::arch
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %14, 0;\n"  // %14 = int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n40k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9},"  // %0..%9 = d00..d09
+      " %10,"  // desc_a
+      " %11,"  // desc_b
+      " %12, %13,"  // e, spsel
+      " p,   %15, %16;\n"  // predicate p, then scaleA and scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A MMA path is compiled out
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x40x64 TN F16+=E5M2*E4M3 (RS variant: A is four 32-bit register operands, B is one 64-bit descriptor operand)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate ("n") operand passed right after e
+>
+struct GMMA_64x40x64_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate destination list: the d* accumulators are read-modify-write
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata operand -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[10];  // d00..d09 as 32-bit registers (presumably packed f16 pairs -- confirm)
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp instruction through inline PTX
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; drives predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook, defined elsewhere in cutlass::arch
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %17, 0;\n"  // %17 = int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n40k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9},"  // %0..%9 = d00..d09
+      "{%10, %11, %12, %13},"  // a00..a03
+      " %14,"  // desc_b
+      " %15, %16,"  // e, spsel
+      " p,   %18, %19;\n"  // predicate p, then scaleA and scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A MMA path is compiled out
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x40x64 TN F32+=E5M2*E4M3 (SS variant: A and B are each one 64-bit descriptor operand)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate ("n") operand passed right after e
+>
+struct GMMA_64x40x64_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate destination list: the d* accumulators are read-modify-write
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata operand -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[20];  // d00..d19, bound with "+f" constraints
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp instruction through inline PTX
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; drives predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, defined elsewhere in cutlass::arch
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %24, 0;\n"  // %24 = int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n40k64.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"  // %0..%19 = d00..d19
+      " %20,"  // desc_a
+      " %21,"  // desc_b
+      " %22, %23,"  // e, spsel
+      " p,   %25, %26;\n"  // predicate p, then scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A MMA path is compiled out
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x40x64 TN F32+=E5M2*E4M3 (RS variant: A is four 32-bit register operands, B is one 64-bit descriptor operand)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate ("n") operand passed right after e
+>
+struct GMMA_64x40x64_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate destination list: the d* accumulators are read-modify-write
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata operand -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[20];  // d00..d19, bound with "+f" constraints
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp instruction through inline PTX
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; drives predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook, defined elsewhere in cutlass::arch
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %27, 0;\n"  // %27 = int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n40k64.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"  // %0..%19 = d00..d19
+      "{%20, %21, %22, %23},"  // a00..a03
+      " %24,"  // desc_b
+      " %25, %26,"  // e, spsel
+      " p,   %28, %29;\n"  // predicate p, then scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A MMA path is compiled out
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x48x64 TN F16+=E5M2*E4M3 (SS variant: A and B are each one 64-bit descriptor operand)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate ("n") operand passed right after e
+>
+struct GMMA_64x48x64_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate destination list: the d* accumulators are read-modify-write
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata operand -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[12];  // d00..d11 as 32-bit registers (presumably packed f16 pairs -- confirm)
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp instruction through inline PTX
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; drives predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, defined elsewhere in cutlass::arch
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %16, 0;\n"  // %16 = int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n48k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"  // %0..%11 = d00..d11
+      " %12,"  // desc_a
+      " %13,"  // desc_b
+      " %14, %15,"  // e, spsel
+      " p,   %17, %18;\n"  // predicate p, then scaleA and scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A MMA path is compiled out
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x48x64 TN F16+=E5M2*E4M3 (RS variant: A is four 32-bit register operands, B is one 64-bit descriptor operand)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate ("n") operand passed right after e
+>
+struct GMMA_64x48x64_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate destination list: the d* accumulators are read-modify-write
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata operand -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[12];  // d00..d11 as 32-bit registers (presumably packed f16 pairs -- confirm)
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp instruction through inline PTX
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; drives predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook, defined elsewhere in cutlass::arch
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %19, 0;\n"  // %19 = int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n48k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"  // %0..%11 = d00..d11
+      "{%12, %13, %14, %15},"  // a00..a03
+      " %16,"  // desc_b
+      " %17, %18,"  // e, spsel
+      " p,   %20, %21;\n"  // predicate p, then scaleA and scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A MMA path is compiled out
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x48x64 TN F32+=E5M2*E4M3 (SS variant: A and B are each one 64-bit descriptor operand)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate ("n") operand passed right after e
+>
+struct GMMA_64x48x64_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate destination list: the d* accumulators are read-modify-write
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e (presumably the sparsity metadata operand -- confirm against the PTX ISA)
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[24];  // d00..d23, bound with "+f" constraints
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp instruction through inline PTX
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; drives predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, defined elsewhere in cutlass::arch
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %28, 0;\n"  // %28 = int32_t(scale_D); p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n48k64.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"  // %0..%23 = d00..d23
+      " %24,"  // desc_a
+      " %25,"  // desc_b
+      " %26, %27,"  // e, spsel
+      " p,   %29, %30;\n"  // predicate p, then scaleA and scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when the SM90A MMA path is compiled out
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x48x64 TN F32+=E5M2*E4M3 (RS form: A in four 32-bit registers, B via one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted into the instruction as an immediate (n constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted into the instruction as an immediate (n constraint)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted as an immediate, placed right after e
+>
+struct GMMA_64x48x64_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no distinct D register type; fma() updates the C accumulators in place
+  using ARegisters = uint32_t[4];  // A operand: a00..a03
+  using ERegisters = uint32_t[1];  // the e operand of fma(); presumably sparsity metadata - TODO confirm against the PTX wgmma.sp docs
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[24];  // accumulators d00..d23
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp m64n48k64 instruction
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared with 0 to form predicate p in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this asm block
+      "setp.ne.b32 p, %31, 0;\n"  // p = (scale_D != 0); %31 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n48k64.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"  // accumulators d00..d23
+      "{%24, %25, %26, %27},"  // a00..a03
+      " %28,"  // desc_b
+      " %29, %30,"  // e, spsel
+      " p,   %32, %33;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // +f constraint: float registers, read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x56x64 TN F16+=E5M2*E4M3 (SS form: A and B each via one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted into the instruction as an immediate (n constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted into the instruction as an immediate (n constraint)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted as an immediate, placed right after e
+>
+struct GMMA_64x56x64_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no distinct D register type; fma() updates the C accumulators in place
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using ERegisters = uint32_t[1];  // the e operand of fma(); presumably sparsity metadata - TODO confirm against the PTX wgmma.sp docs
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = uint32_t[14];  // accumulators d00..d13 as raw 32-bit registers; presumably two packed f16 each - TODO confirm
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp m64n56k64 instruction
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared with 0 to form predicate p in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this asm block
+      "setp.ne.b32 p, %18, 0;\n"  // p = (scale_D != 0); %18 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n56k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13},"  // accumulators d00..d13
+      " %14,"  // desc_a
+      " %15,"  // desc_b
+      " %16, %17,"  // e, spsel
+      " p,   %19, %20;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // +r constraint: 32-bit registers, read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x56x64 TN F16+=E5M2*E4M3 (RS form: A in four 32-bit registers, B via one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted into the instruction as an immediate (n constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted into the instruction as an immediate (n constraint)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted as an immediate, placed right after e
+>
+struct GMMA_64x56x64_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no distinct D register type; fma() updates the C accumulators in place
+  using ARegisters = uint32_t[4];  // A operand: a00..a03
+  using ERegisters = uint32_t[1];  // the e operand of fma(); presumably sparsity metadata - TODO confirm against the PTX wgmma.sp docs
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = uint32_t[14];  // accumulators d00..d13 as raw 32-bit registers; presumably two packed f16 each - TODO confirm
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp m64n56k64 instruction
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared with 0 to form predicate p in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this asm block
+      "setp.ne.b32 p, %21, 0;\n"  // p = (scale_D != 0); %21 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n56k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13},"  // accumulators d00..d13
+      "{%14, %15, %16, %17},"  // a00..a03
+      " %18,"  // desc_b
+      " %19, %20,"  // e, spsel
+      " p,   %22, %23;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // +r constraint: 32-bit registers, read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x56x64 TN F32+=E5M2*E4M3 (SS form: A and B each via one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted into the instruction as an immediate (n constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted into the instruction as an immediate (n constraint)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted as an immediate, placed right after e
+>
+struct GMMA_64x56x64_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no distinct D register type; fma() updates the C accumulators in place
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using ERegisters = uint32_t[1];  // the e operand of fma(); presumably sparsity metadata - TODO confirm against the PTX wgmma.sp docs
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[28];  // accumulators d00..d27
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp m64n56k64 instruction
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared with 0 to form predicate p in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this asm block
+      "setp.ne.b32 p, %32, 0;\n"  // p = (scale_D != 0); %32 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n56k64.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"  // accumulators d00..d27
+      " %28,"  // desc_a
+      " %29,"  // desc_b
+      " %30, %31,"  // e, spsel
+      " p,   %33, %34;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // +f constraint: float registers, read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x56x64 TN F32+=E5M2*E4M3 (RS form: A in four 32-bit registers, B via one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted into the instruction as an immediate (n constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted into the instruction as an immediate (n constraint)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted as an immediate, placed right after e
+>
+struct GMMA_64x56x64_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no distinct D register type; fma() updates the C accumulators in place
+  using ARegisters = uint32_t[4];  // A operand: a00..a03
+  using ERegisters = uint32_t[1];  // the e operand of fma(); presumably sparsity metadata - TODO confirm against the PTX wgmma.sp docs
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[28];  // accumulators d00..d27
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp m64n56k64 instruction
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared with 0 to form predicate p in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this asm block
+      "setp.ne.b32 p, %35, 0;\n"  // p = (scale_D != 0); %35 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n56k64.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"  // accumulators d00..d27
+      "{%28, %29, %30, %31},"  // a00..a03
+      " %32,"  // desc_b
+      " %33, %34,"  // e, spsel
+      " p,   %36, %37;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // +f constraint: float registers, read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x72x64 TN F16+=E5M2*E4M3 (SS form: A and B each via one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted into the instruction as an immediate (n constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted into the instruction as an immediate (n constraint)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted as an immediate, placed right after e
+>
+struct GMMA_64x72x64_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no distinct D register type; fma() updates the C accumulators in place
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using ERegisters = uint32_t[1];  // the e operand of fma(); presumably sparsity metadata - TODO confirm against the PTX wgmma.sp docs
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = uint32_t[18];  // accumulators d00..d17 as raw 32-bit registers; presumably two packed f16 each - TODO confirm
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp m64n72k64 instruction
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared with 0 to form predicate p in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this asm block
+      "setp.ne.b32 p, %22, 0;\n"  // p = (scale_D != 0); %22 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n72k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17},"  // accumulators d00..d17
+      " %18,"  // desc_a
+      " %19,"  // desc_b
+      " %20, %21,"  // e, spsel
+      " p,   %23, %24;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // +r constraint: 32-bit registers, read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x72x64 TN F16+=E5M2*E4M3 (RS form: A in four 32-bit registers, B via one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted into the instruction as an immediate (n constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted into the instruction as an immediate (n constraint)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted as an immediate, placed right after e
+>
+struct GMMA_64x72x64_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no distinct D register type; fma() updates the C accumulators in place
+  using ARegisters = uint32_t[4];  // A operand: a00..a03
+  using ERegisters = uint32_t[1];  // the e operand of fma(); presumably sparsity metadata - TODO confirm against the PTX wgmma.sp docs
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = uint32_t[18];  // accumulators d00..d17 as raw 32-bit registers; presumably two packed f16 each - TODO confirm
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp m64n72k64 instruction
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared with 0 to form predicate p in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this asm block
+      "setp.ne.b32 p, %25, 0;\n"  // p = (scale_D != 0); %25 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n72k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17},"  // accumulators d00..d17
+      "{%18, %19, %20, %21},"  // a00..a03
+      " %22,"  // desc_b
+      " %23, %24,"  // e, spsel
+      " p,   %26, %27;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // +r constraint: 32-bit registers, read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x72x64 TN F32+=E5M2*E4M3 (SS form: A and B each via one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted into the instruction as an immediate (n constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted into the instruction as an immediate (n constraint)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted as an immediate, placed right after e
+>
+struct GMMA_64x72x64_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no distinct D register type; fma() updates the C accumulators in place
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using ERegisters = uint32_t[1];  // the e operand of fma(); presumably sparsity metadata - TODO confirm against the PTX wgmma.sp docs
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[36];  // accumulators d00..d35
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp m64n72k64 instruction
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared with 0 to form predicate p in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this asm block
+      "setp.ne.b32 p, %40, 0;\n"  // p = (scale_D != 0); %40 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n72k64.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"  // accumulators d00..d35
+      " %36,"  // desc_a
+      " %37,"  // desc_b
+      " %38, %39,"  // e, spsel
+      " p,   %41, %42;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // +f constraint: float registers, read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x72x64 TN F32+=E5M2*E4M3 (RS form: A in four 32-bit registers, B via one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted into the instruction as an immediate (n constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted into the instruction as an immediate (n constraint)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted as an immediate, placed right after e
+>
+struct GMMA_64x72x64_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no distinct D register type; fma() updates the C accumulators in place
+  using ARegisters = uint32_t[4];  // A operand: a00..a03
+  using ERegisters = uint32_t[1];  // the e operand of fma(); presumably sparsity metadata - TODO confirm against the PTX wgmma.sp docs
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[36];  // accumulators d00..d35
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp m64n72k64 instruction
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared with 0 to form predicate p in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this asm block
+      "setp.ne.b32 p, %43, 0;\n"  // p = (scale_D != 0); %43 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n72k64.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"  // accumulators d00..d35
+      "{%36, %37, %38, %39},"  // a00..a03
+      " %40,"  // desc_b
+      " %41, %42,"  // e, spsel
+      " p,   %44, %45;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // +f constraint: float registers, read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x64 TN F16+=E5M2*E4M3 (SS form: A and B each via one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted into the instruction as an immediate (n constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted into the instruction as an immediate (n constraint)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted as an immediate, placed right after e
+>
+struct GMMA_64x80x64_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no distinct D register type; fma() updates the C accumulators in place
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using ERegisters = uint32_t[1];  // the e operand of fma(); presumably sparsity metadata - TODO confirm against the PTX wgmma.sp docs
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = uint32_t[20];  // accumulators d00..d19 as raw 32-bit registers; presumably two packed f16 each - TODO confirm
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp m64n80k64 instruction
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared with 0 to form predicate p in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this asm block
+      "setp.ne.b32 p, %24, 0;\n"  // p = (scale_D != 0); %24 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n80k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"  // accumulators d00..d19
+      " %20,"  // desc_a
+      " %21,"  // desc_b
+      " %22, %23,"  // e, spsel
+      " p,   %25, %26;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // +r constraint: 32-bit registers, read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x64 TN F16+=E5M2*E4M3 (RS form: A in four 32-bit registers, B via one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted into the instruction as an immediate (n constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted into the instruction as an immediate (n constraint)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted as an immediate, placed right after e
+>
+struct GMMA_64x80x64_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no distinct D register type; fma() updates the C accumulators in place
+  using ARegisters = uint32_t[4];  // A operand: a00..a03
+  using ERegisters = uint32_t[1];  // the e operand of fma(); presumably sparsity metadata - TODO confirm against the PTX wgmma.sp docs
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = uint32_t[20];  // accumulators d00..d19 as raw 32-bit registers; presumably two packed f16 each - TODO confirm
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp m64n80k64 instruction
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared with 0 to form predicate p in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this asm block
+      "setp.ne.b32 p, %27, 0;\n"  // p = (scale_D != 0); %27 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n80k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"  // accumulators d00..d19
+      "{%20, %21, %22, %23},"  // a00..a03
+      " %24,"  // desc_b
+      " %25, %26,"  // e, spsel
+      " p,   %28, %29;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // +r constraint: 32-bit registers, read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x64 TN F32+=E5M2*E4M3 (SS form: A and B each via one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted into the instruction as an immediate (n constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted into the instruction as an immediate (n constraint)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted as an immediate, placed right after e
+>
+struct GMMA_64x80x64_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no distinct D register type; fma() updates the C accumulators in place
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using ERegisters = uint32_t[1];  // the e operand of fma(); presumably sparsity metadata - TODO confirm against the PTX wgmma.sp docs
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[40];  // accumulators d00..d39
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp m64n80k64 instruction
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared with 0 to form predicate p in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this asm block
+      "setp.ne.b32 p, %44, 0;\n"  // p = (scale_D != 0); %44 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n80k64.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"  // accumulators d00..d39
+      " %40,"  // desc_a
+      " %41,"  // desc_b
+      " %42, %43,"  // e, spsel
+      " p,   %45, %46;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // +f constraint: float registers, read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x64 TN F32+=E5M2*E4M3 (RS form: A in four 32-bit registers, B via one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted into the instruction as an immediate (n constraint)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted into the instruction as an immediate (n constraint)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted as an immediate, placed right after e
+>
+struct GMMA_64x80x64_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no distinct D register type; fma() updates the C accumulators in place
+  using ARegisters = uint32_t[4];  // A operand: a00..a03
+  using ERegisters = uint32_t[1];  // the e operand of fma(); presumably sparsity metadata - TODO confirm against the PTX wgmma.sp docs
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[40];  // accumulators d00..d39
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp m64n80k64 instruction
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared with 0 to form predicate p in the asm
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this asm block
+      "setp.ne.b32 p, %47, 0;\n"  // p = (scale_D != 0); %47 is the scale_D input
+      "wgmma.mma_async.sp.sync.aligned.m64n80k64.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"  // accumulators d00..d39
+      "{%40, %41, %42, %43},"  // a00..a03
+      " %44,"  // desc_b
+      " %45, %46,"  // e, spsel
+      " p,   %48, %49;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // +f constraint: float registers, read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED: no instruction is emitted
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x88x64 TN F16+=E5M2*E4M3
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as a compile-time immediate ("n")
+>
+struct GMMA_64x88x64_F16E5M2E4M3_SS_TN  // Wraps one wgmma.mma_async.sp m64n88k64 .f16.e5m2.e4m3 instruction; A and B are 64-bit descriptors.
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // corresponds to desc_a of fma()
+  using ERegisters = uint32_t[1];  // corresponds to e of fma(); presumably sparsity metadata -- TODO confirm
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = uint32_t[22];  // corresponds to d00..d21 of fma(); presumably packed f16 values -- TODO confirm
+
+  CUTE_HOST_DEVICE static void  // fma(): emits the instruction; d00..d21 are read-write ("+r") asm operands
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %26, 0;\n"  // p = (scale_D != 0); %26 is the runtime scale_D operand
+      "wgmma.mma_async.sp.sync.aligned.m64n88k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21},"
+      " %22,"  // desc_a
+      " %23,"  // desc_b
+      " %24, %25,"  // e, spsel
+      " p,   %27, %28;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x88x64 TN F16+=E5M2*E4M3
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as a compile-time immediate ("n")
+>
+struct GMMA_64x88x64_F16E5M2E4M3_RS_TN  // Wraps one wgmma.mma_async.sp m64n88k64 .f16.e5m2.e4m3 instruction; A is passed in registers.
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // corresponds to a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // corresponds to e of fma(); presumably sparsity metadata -- TODO confirm
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = uint32_t[22];  // corresponds to d00..d21 of fma(); presumably packed f16 values -- TODO confirm
+
+  CUTE_HOST_DEVICE static void  // fma(): emits the instruction; d00..d21 are read-write ("+r") asm operands
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %29, 0;\n"  // p = (scale_D != 0); %29 is the runtime scale_D operand
+      "wgmma.mma_async.sp.sync.aligned.m64n88k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21},"
+      "{%22, %23, %24, %25},"  // a00..a03
+      " %26,"  // desc_b
+      " %27, %28,"  // e, spsel
+      " p,   %30, %31;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x88x64 TN F32+=E5M2*E4M3
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as a compile-time immediate ("n")
+>
+struct GMMA_64x88x64_F32E5M2E4M3_SS_TN  // Wraps one wgmma.mma_async.sp m64n88k64 .f32.e5m2.e4m3 instruction; A and B are 64-bit descriptors.
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // corresponds to desc_a of fma()
+  using ERegisters = uint32_t[1];  // corresponds to e of fma(); presumably sparsity metadata -- TODO confirm
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = float[44];  // corresponds to d00..d43 of fma()
+
+  CUTE_HOST_DEVICE static void  // fma(): emits the instruction; d00..d43 are read-write ("+f") asm operands
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %48, 0;\n"  // p = (scale_D != 0); %48 is the runtime scale_D operand
+      "wgmma.mma_async.sp.sync.aligned.m64n88k64.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"
+      " %44,"  // desc_a
+      " %45,"  // desc_b
+      " %46, %47,"  // e, spsel
+      " p,   %49, %50;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x88x64 TN F32+=E5M2*E4M3
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as a compile-time immediate ("n")
+>
+struct GMMA_64x88x64_F32E5M2E4M3_RS_TN  // Wraps one wgmma.mma_async.sp m64n88k64 .f32.e5m2.e4m3 instruction; A is passed in registers.
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // corresponds to a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // corresponds to e of fma(); presumably sparsity metadata -- TODO confirm
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = float[44];  // corresponds to d00..d43 of fma()
+
+  CUTE_HOST_DEVICE static void  // fma(): emits the instruction; d00..d43 are read-write ("+f") asm operands
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %51, 0;\n"  // p = (scale_D != 0); %51 is the runtime scale_D operand
+      "wgmma.mma_async.sp.sync.aligned.m64n88k64.f32.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"
+      "{%44, %45, %46, %47},"  // a00..a03
+      " %48,"  // desc_b
+      " %49, %50,"  // e, spsel
+      " p,   %52, %53;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x104x64 TN F16+=E5M2*E4M3
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as a compile-time immediate ("n")
+>
+struct GMMA_64x104x64_F16E5M2E4M3_SS_TN  // Wraps one wgmma.mma_async.sp m64n104k64 .f16.e5m2.e4m3 instruction; A and B are 64-bit descriptors.
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // corresponds to desc_a of fma()
+  using ERegisters = uint32_t[1];  // corresponds to e of fma(); presumably sparsity metadata -- TODO confirm
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = uint32_t[26];  // corresponds to d00..d25 of fma(); presumably packed f16 values -- TODO confirm
+
+  CUTE_HOST_DEVICE static void  // fma(): emits the instruction; d00..d25 are read-write ("+r") asm operands
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %30, 0;\n"  // p = (scale_D != 0); %30 is the runtime scale_D operand
+      "wgmma.mma_async.sp.sync.aligned.m64n104k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25},"
+      " %26,"  // desc_a
+      " %27,"  // desc_b
+      " %28, %29,"  // e, spsel
+      " p,   %31, %32;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x104x64 TN F16+=E5M2*E4M3
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as a compile-time immediate ("n")
+>
+struct GMMA_64x104x64_F16E5M2E4M3_RS_TN  // Wraps one wgmma.mma_async.sp m64n104k64 .f16.e5m2.e4m3 instruction; A is passed in registers.
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // corresponds to a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // corresponds to e of fma(); presumably sparsity metadata -- TODO confirm
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = uint32_t[26];  // corresponds to d00..d25 of fma(); presumably packed f16 values -- TODO confirm
+
+  CUTE_HOST_DEVICE static void  // fma(): emits the instruction; d00..d25 are read-write ("+r") asm operands
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %33, 0;\n"  // p = (scale_D != 0); %33 is the runtime scale_D operand
+      "wgmma.mma_async.sp.sync.aligned.m64n104k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25},"
+      "{%26, %27, %28, %29},"  // a00..a03
+      " %30,"  // desc_b
+      " %31, %32,"  // e, spsel
+      " p,   %34, %35;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x104x64 TN F32+=E5M2*E4M3
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as a compile-time immediate ("n")
+>
+struct GMMA_64x104x64_F32E5M2E4M3_SS_TN  // Wraps one wgmma.mma_async.sp m64n104k64 .f32.e5m2.e4m3 instruction; A and B are 64-bit descriptors.
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // corresponds to desc_a of fma()
+  using ERegisters = uint32_t[1];  // corresponds to e of fma(); presumably sparsity metadata -- TODO confirm
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = float[52];  // corresponds to d00..d51 of fma()
+
+  CUTE_HOST_DEVICE static void  // fma(): emits the instruction; d00..d51 are read-write ("+f") asm operands
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %56, 0;\n"  // p = (scale_D != 0); %56 is the runtime scale_D operand
+      "wgmma.mma_async.sp.sync.aligned.m64n104k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"
+      " %52,"  // desc_a
+      " %53,"  // desc_b
+      " %54, %55,"  // e, spsel
+      " p,    %57,  %58;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x104x64 TN F32+=E5M2*E4M3
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as a compile-time immediate ("n")
+>
+struct GMMA_64x104x64_F32E5M2E4M3_RS_TN  // Wraps one wgmma.mma_async.sp m64n104k64 .f32.e5m2.e4m3 instruction; A is passed in registers.
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // corresponds to a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // corresponds to e of fma(); presumably sparsity metadata -- TODO confirm
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = float[52];  // corresponds to d00..d51 of fma()
+
+  CUTE_HOST_DEVICE static void  // fma(): emits the instruction; d00..d51 are read-write ("+f") asm operands
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %59, 0;\n"  // p = (scale_D != 0); %59 is the runtime scale_D operand
+      "wgmma.mma_async.sp.sync.aligned.m64n104k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"
+      "{%52,  %53,  %54,  %55},"  // a00..a03
+      " %56,"  // desc_b
+      " %57, %58,"  // e, spsel
+      " p,    %60,  %61;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x64 TN F16+=E5M2*E4M3
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as a compile-time immediate ("n")
+>
+struct GMMA_64x112x64_F16E5M2E4M3_SS_TN  // Wraps one wgmma.mma_async.sp m64n112k64 .f16.e5m2.e4m3 instruction; A and B are 64-bit descriptors.
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // corresponds to desc_a of fma()
+  using ERegisters = uint32_t[1];  // corresponds to e of fma(); presumably sparsity metadata -- TODO confirm
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = uint32_t[28];  // corresponds to d00..d27 of fma(); presumably packed f16 values -- TODO confirm
+
+  CUTE_HOST_DEVICE static void  // fma(): emits the instruction; d00..d27 are read-write ("+r") asm operands
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %32, 0;\n"  // p = (scale_D != 0); %32 is the runtime scale_D operand
+      "wgmma.mma_async.sp.sync.aligned.m64n112k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"
+      " %28,"  // desc_a
+      " %29,"  // desc_b
+      " %30, %31,"  // e, spsel
+      " p,   %33, %34;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x64 TN F16+=E5M2*E4M3
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as a compile-time immediate ("n")
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded to the asm as a compile-time immediate ("n")
+>
+struct GMMA_64x112x64_F16E5M2E4M3_RS_TN  // Wraps one wgmma.mma_async.sp m64n112k64 .f16.e5m2.e4m3 instruction; A is passed in registers.
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // corresponds to a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // corresponds to e of fma(); presumably sparsity metadata -- TODO confirm
+  using BRegisters = uint64_t[1];  // corresponds to desc_b of fma()
+  using CRegisters = uint32_t[28];  // corresponds to d00..d27 of fma(); presumably packed f16 values -- TODO confirm
+
+  CUTE_HOST_DEVICE static void  // fma(): emits the instruction; d00..d27 are read-write ("+r") asm operands
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %35, 0;\n"  // p = (scale_D != 0); %35 is the runtime scale_D operand
+      "wgmma.mma_async.sp.sync.aligned.m64n112k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"
+      "{%28, %29, %30, %31},"  // a00..a03
+      " %32,"  // desc_b
+      " %33, %34,"  // e, spsel
+      " p,   %36, %37;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x64 TN F32+=E5M2*E4M3 -- SS form: A and B are each passed as one 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; forwarded as immediate asm operand %61
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; forwarded as immediate asm operand %62
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; forwarded as immediate asm operand %59
+>
+struct GMMA_64x112x64_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;         // no separate D operand: fma() updates its accumulator arguments in place
+  using ARegisters = uint64_t[1];  // matches desc_a
+  using ERegisters = uint32_t[1];  // matches e
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = float[56];    // matches d00..d55
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // asm operand %56
+      uint64_t const& desc_b,  // asm operand %57
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      uint32_t const& e,  // asm operand %58; presumably the sparsity metadata word -- TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %60, 0;\n"  // %60 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n112k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%55 = accumulators d00..d55
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      " %56,"  // desc_a
+      " %57,"  // desc_b
+      " %58, %59,"  // e, spsel
+      " p,    %61,  %62;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
+      :  "l"(desc_a),  // input operands are numbered after the 56 outputs, starting at %56
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x64 TN F32+=E5M2*E4M3 -- RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; forwarded as immediate asm operand %64
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; forwarded as immediate asm operand %65
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; forwarded as immediate asm operand %62
+>
+struct GMMA_64x112x64_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;         // no separate D operand: fma() updates its accumulator arguments in place
+  using ARegisters = uint32_t[4];  // matches a00..a03
+  using ERegisters = uint32_t[1];  // matches e
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = float[56];    // matches d00..d55
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // asm operands %56..%59
+      uint64_t const& desc_b,  // asm operand %60
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      uint32_t const& e,  // asm operand %61; presumably the sparsity metadata word -- TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %63, 0;\n"  // %63 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n112k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%55 = accumulators d00..d55
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      "{%56,  %57,  %58,  %59},"  // a00..a03
+      " %60,"  // desc_b
+      " %61, %62,"  // e, spsel
+      " p,    %64,  %65;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands are numbered after the 56 outputs, starting at %56
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x120x64 TN F16+=E5M2*E4M3 -- SS form: A and B are each passed as one 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; forwarded as immediate asm operand %35
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; forwarded as immediate asm operand %36
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; forwarded as immediate asm operand %33
+>
+struct GMMA_64x120x64_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;         // no separate D operand: fma() updates its accumulator arguments in place
+  using ARegisters = uint64_t[1];  // matches desc_a
+  using ERegisters = uint32_t[1];  // matches e
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = uint32_t[30]; // matches d00..d29; presumably two packed f16 values per register -- TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // asm operand %30
+      uint64_t const& desc_b,  // asm operand %31
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29,
+      uint32_t const& e,  // asm operand %32; presumably the sparsity metadata word -- TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %34, 0;\n"  // %34 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n120k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%29 = accumulators d00..d29
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29},"
+      " %30,"  // desc_a
+      " %31,"  // desc_b
+      " %32, %33,"  // e, spsel
+      " p,   %35, %36;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": every accumulator register is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29)
+      :  "l"(desc_a),  // input operands are numbered after the 30 outputs, starting at %30
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x120x64 TN F16+=E5M2*E4M3 -- RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; forwarded as immediate asm operand %38
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; forwarded as immediate asm operand %39
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; forwarded as immediate asm operand %36
+>
+struct GMMA_64x120x64_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;         // no separate D operand: fma() updates its accumulator arguments in place
+  using ARegisters = uint32_t[4];  // matches a00..a03
+  using ERegisters = uint32_t[1];  // matches e
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = uint32_t[30]; // matches d00..d29; presumably two packed f16 values per register -- TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // asm operands %30..%33
+      uint64_t const& desc_b,  // asm operand %34
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29,
+      uint32_t const& e,  // asm operand %35; presumably the sparsity metadata word -- TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %37, 0;\n"  // %37 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n120k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%29 = accumulators d00..d29
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29},"
+      "{%30, %31, %32, %33},"  // a00..a03
+      " %34,"  // desc_b
+      " %35, %36,"  // e, spsel
+      " p,   %38, %39;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": every accumulator register is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands are numbered after the 30 outputs, starting at %30
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x120x64 TN F32+=E5M2*E4M3 -- SS form: A and B are each passed as one 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; forwarded as immediate asm operand %65
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; forwarded as immediate asm operand %66
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; forwarded as immediate asm operand %63
+>
+struct GMMA_64x120x64_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;         // no separate D operand: fma() updates its accumulator arguments in place
+  using ARegisters = uint64_t[1];  // matches desc_a
+  using ERegisters = uint32_t[1];  // matches e
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = float[60];    // matches d00..d59
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // asm operand %60
+      uint64_t const& desc_b,  // asm operand %61
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      uint32_t const& e,  // asm operand %62; presumably the sparsity metadata word -- TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %64, 0;\n"  // %64 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n120k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%59 = accumulators d00..d59
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"
+      " %60,"  // desc_a
+      " %61,"  // desc_b
+      " %62, %63,"  // e, spsel
+      " p,    %65,  %66;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
+      :  "l"(desc_a),  // input operands are numbered after the 60 outputs, starting at %60
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x120x64 TN F32+=E5M2*E4M3 -- RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; forwarded as immediate asm operand %68
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; forwarded as immediate asm operand %69
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; forwarded as immediate asm operand %66
+>
+struct GMMA_64x120x64_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;         // no separate D operand: fma() updates its accumulator arguments in place
+  using ARegisters = uint32_t[4];  // matches a00..a03
+  using ERegisters = uint32_t[1];  // matches e
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = float[60];    // matches d00..d59
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // asm operands %60..%63
+      uint64_t const& desc_b,  // asm operand %64
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      uint32_t const& e,  // asm operand %65; presumably the sparsity metadata word -- TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %67, 0;\n"  // %67 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n120k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%59 = accumulators d00..d59
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"
+      "{%60,  %61,  %62,  %63},"  // a00..a03
+      " %64,"  // desc_b
+      " %65, %66,"  // e, spsel
+      " p,    %68,  %69;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands are numbered after the 60 outputs, starting at %60
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x136x64 TN F16+=E5M2*E4M3 -- SS form: A and B are each passed as one 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; forwarded as immediate asm operand %39
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; forwarded as immediate asm operand %40
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; forwarded as immediate asm operand %37
+>
+struct GMMA_64x136x64_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;         // no separate D operand: fma() updates its accumulator arguments in place
+  using ARegisters = uint64_t[1];  // matches desc_a
+  using ERegisters = uint32_t[1];  // matches e
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = uint32_t[34]; // matches d00..d33; presumably two packed f16 values per register -- TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // asm operand %34
+      uint64_t const& desc_b,  // asm operand %35
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33,
+      uint32_t const& e,  // asm operand %36; presumably the sparsity metadata word -- TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %38, 0;\n"  // %38 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n136k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%33 = accumulators d00..d33
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33},"
+      " %34,"  // desc_a
+      " %35,"  // desc_b
+      " %36, %37,"  // e, spsel
+      " p,   %39, %40;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": every accumulator register is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33)
+      :  "l"(desc_a),  // input operands are numbered after the 34 outputs, starting at %34
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x136x64 TN F16+=E5M2*E4M3 -- RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; forwarded as immediate asm operand %42
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; forwarded as immediate asm operand %43
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; forwarded as immediate asm operand %40
+>
+struct GMMA_64x136x64_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;         // no separate D operand: fma() updates its accumulator arguments in place
+  using ARegisters = uint32_t[4];  // matches a00..a03
+  using ERegisters = uint32_t[1];  // matches e
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = uint32_t[34]; // matches d00..d33; presumably two packed f16 values per register -- TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // asm operands %34..%37
+      uint64_t const& desc_b,  // asm operand %38
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33,
+      uint32_t const& e,  // asm operand %39; presumably the sparsity metadata word -- TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %41, 0;\n"  // %41 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n136k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%33 = accumulators d00..d33
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33},"
+      "{%34, %35, %36, %37},"  // a00..a03
+      " %38,"  // desc_b
+      " %39, %40,"  // e, spsel
+      " p,   %42, %43;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": every accumulator register is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands are numbered after the 34 outputs, starting at %34
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x136x64 TN F32+=E5M2*E4M3 -- SS form: A and B are each passed as one 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; forwarded as immediate asm operand %73
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; forwarded as immediate asm operand %74
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; forwarded as immediate asm operand %71
+>
+struct GMMA_64x136x64_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;         // no separate D operand: fma() updates its accumulator arguments in place
+  using ARegisters = uint64_t[1];  // matches desc_a
+  using ERegisters = uint32_t[1];  // matches e
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = float[68];    // matches d00..d67
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // asm operand %68
+      uint64_t const& desc_b,  // asm operand %69
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      uint32_t const& e,  // asm operand %70; presumably the sparsity metadata word -- TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %72, 0;\n"  // %72 = scale_D; p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n136k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%67 = accumulators d00..d67
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67},"
+      " %68,"  // desc_a
+      " %69,"  // desc_b
+      " %70, %71,"  // e, spsel
+      " p,    %73,  %74;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": every accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
+      :  "l"(desc_a),  // input operands are numbered after the 68 outputs, starting at %68
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x136x64 TN F32+=E5M2*E4M3
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x136x64_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[68];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %75, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n136k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67},"
+      "{%68,  %69,  %70,  %71},"
+      " %72,"
+      " %73, %74,"
+      " p,    %76,  %77;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x64 TN F16+=E5M2*E4M3 -- SS form: A and B are each passed as one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate, second-to-last wgmma operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate, last wgmma operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, placed right after e
+>
+struct GMMA_64x144x64_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D list: the d arguments of fma are read-write
+  using ARegisters = uint64_t[1];  // same type and count as fma argument desc_a
+  using ERegisters = uint32_t[1];  // same type and count as fma argument e
+  using BRegisters = uint64_t[1];  // same type and count as fma argument desc_b
+  using CRegisters = uint32_t[36];  // same type and count as fma arguments d00..d35
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A descriptor (asm operand %36)
+      uint64_t const& desc_b,  // B descriptor (asm operand %37)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t const& e,  // asm operand %38; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // asm operand %40; only tested for non-zero
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: source line plus both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %40, 0;\n"  // %40 is int32_t(scale_D): p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n144k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"  // %0..%35 are d00..d35
+      " %36,"  // desc_a
+      " %37,"  // desc_b
+      " %38, %39,"  // e, spsel
+      " p,   %41, %42;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%35: read-write 32-bit registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
+      :  "l"(desc_a),  // %36
+         "l"(desc_b),  // %37
+         "r"(e), "n"(int32_t(spsel)),  // %38 (register), %39 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %40 (register), %41 and %42 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x64 TN F16+=E5M2*E4M3 -- RS form: A is passed as four 32-bit registers, B as one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate, second-to-last wgmma operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate, last wgmma operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, placed right after e
+>
+struct GMMA_64x144x64_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D list: the d arguments of fma are read-write
+  using ARegisters = uint32_t[4];  // same type and count as fma arguments a00..a03
+  using ERegisters = uint32_t[1];  // same type and count as fma argument e
+  using BRegisters = uint64_t[1];  // same type and count as fma argument desc_b
+  using CRegisters = uint32_t[36];  // same type and count as fma arguments d00..d35
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A values (asm operands %36..%39)
+      uint64_t const& desc_b,  // B descriptor (asm operand %40)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t const& e,  // asm operand %41; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // asm operand %43; only tested for non-zero
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: source line plus the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %43, 0;\n"  // %43 is int32_t(scale_D): p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n144k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"  // %0..%35 are d00..d35
+      "{%36, %37, %38, %39},"  // a00..a03
+      " %40,"  // desc_b
+      " %41, %42,"  // e, spsel
+      " p,   %44, %45;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%35: read-write 32-bit registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %36..%39
+         "l"(desc_b),  // %40
+         "r"(e), "n"(int32_t(spsel)),  // %41 (register), %42 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %43 (register), %44 and %45 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x64 TN F32+=E5M2*E4M3 -- SS form: A and B are each passed as one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate, second-to-last wgmma operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate, last wgmma operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, placed right after e
+>
+struct GMMA_64x144x64_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D list: the d arguments of fma are read-write
+  using ARegisters = uint64_t[1];  // same type and count as fma argument desc_a
+  using ERegisters = uint32_t[1];  // same type and count as fma argument e
+  using BRegisters = uint64_t[1];  // same type and count as fma argument desc_b
+  using CRegisters = float[72];  // same type and count as fma arguments d00..d71
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A descriptor (asm operand %72)
+      uint64_t const& desc_b,  // B descriptor (asm operand %73)
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      uint32_t const& e,  // asm operand %74; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // asm operand %76; only tested for non-zero
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: source line plus both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %76, 0;\n"  // %76 is int32_t(scale_D): p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n144k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"  // %0..%71 are d00..d71
+      " %72,"  // desc_a
+      " %73,"  // desc_b
+      " %74, %75,"  // e, spsel
+      " p,    %77,  %78;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0..%71: read-write float registers
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
+      :  "l"(desc_a),  // %72
+         "l"(desc_b),  // %73
+         "r"(e), "n"(int32_t(spsel)),  // %74 (register), %75 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %76 (register), %77 and %78 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x64 TN F32+=E5M2*E4M3 -- RS form: A is passed as four 32-bit registers, B as one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate, second-to-last wgmma operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate, last wgmma operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, placed right after e
+>
+struct GMMA_64x144x64_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D list: the d arguments of fma are read-write
+  using ARegisters = uint32_t[4];  // same type and count as fma arguments a00..a03
+  using ERegisters = uint32_t[1];  // same type and count as fma argument e
+  using BRegisters = uint64_t[1];  // same type and count as fma argument desc_b
+  using CRegisters = float[72];  // same type and count as fma arguments d00..d71
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A values (asm operands %72..%75)
+      uint64_t const& desc_b,  // B descriptor (asm operand %76)
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      uint32_t const& e,  // asm operand %77; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // asm operand %79; only tested for non-zero
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: source line plus the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %79, 0;\n"  // %79 is int32_t(scale_D): p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n144k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"  // %0..%71 are d00..d71
+      "{%72,  %73,  %74,  %75},"  // a00..a03
+      " %76,"  // desc_b
+      " %77, %78,"  // e, spsel
+      " p,    %80,  %81;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0..%71: read-write float registers
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %72..%75
+         "l"(desc_b),  // %76
+         "r"(e), "n"(int32_t(spsel)),  // %77 (register), %78 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %79 (register), %80 and %81 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x152x64 TN F16+=E5M2*E4M3 -- SS form: A and B are each passed as one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate, second-to-last wgmma operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate, last wgmma operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, placed right after e
+>
+struct GMMA_64x152x64_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D list: the d arguments of fma are read-write
+  using ARegisters = uint64_t[1];  // same type and count as fma argument desc_a
+  using ERegisters = uint32_t[1];  // same type and count as fma argument e
+  using BRegisters = uint64_t[1];  // same type and count as fma argument desc_b
+  using CRegisters = uint32_t[38];  // same type and count as fma arguments d00..d37
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A descriptor (asm operand %38)
+      uint64_t const& desc_b,  // B descriptor (asm operand %39)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37,
+      uint32_t const& e,  // asm operand %40; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // asm operand %42; only tested for non-zero
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: source line plus both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %42, 0;\n"  // %42 is int32_t(scale_D): p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n152k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37},"  // %0..%37 are d00..d37
+      " %38,"  // desc_a
+      " %39,"  // desc_b
+      " %40, %41,"  // e, spsel
+      " p,   %43, %44;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%37: read-write 32-bit registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37)
+      :  "l"(desc_a),  // %38
+         "l"(desc_b),  // %39
+         "r"(e), "n"(int32_t(spsel)),  // %40 (register), %41 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %42 (register), %43 and %44 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x152x64 TN F16+=E5M2*E4M3 -- RS form: A is passed as four 32-bit registers, B as one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate, second-to-last wgmma operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate, last wgmma operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, placed right after e
+>
+struct GMMA_64x152x64_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D list: the d arguments of fma are read-write
+  using ARegisters = uint32_t[4];  // same type and count as fma arguments a00..a03
+  using ERegisters = uint32_t[1];  // same type and count as fma argument e
+  using BRegisters = uint64_t[1];  // same type and count as fma argument desc_b
+  using CRegisters = uint32_t[38];  // same type and count as fma arguments d00..d37
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A values (asm operands %38..%41)
+      uint64_t const& desc_b,  // B descriptor (asm operand %42)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37,
+      uint32_t const& e,  // asm operand %43; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // asm operand %45; only tested for non-zero
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: source line plus the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %45, 0;\n"  // %45 is int32_t(scale_D): p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n152k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37},"  // %0..%37 are d00..d37
+      "{%38, %39, %40, %41},"  // a00..a03
+      " %42,"  // desc_b
+      " %43, %44,"  // e, spsel
+      " p,   %46, %47;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%37: read-write 32-bit registers
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %38..%41
+         "l"(desc_b),  // %42
+         "r"(e), "n"(int32_t(spsel)),  // %43 (register), %44 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %45 (register), %46 and %47 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x152x64 TN F32+=E5M2*E4M3 -- SS form: A and B are each passed as one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate, second-to-last wgmma operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate, last wgmma operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, placed right after e
+>
+struct GMMA_64x152x64_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D list: the d arguments of fma are read-write
+  using ARegisters = uint64_t[1];  // same type and count as fma argument desc_a
+  using ERegisters = uint32_t[1];  // same type and count as fma argument e
+  using BRegisters = uint64_t[1];  // same type and count as fma argument desc_b
+  using CRegisters = float[76];  // same type and count as fma arguments d00..d75
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A descriptor (asm operand %76)
+      uint64_t const& desc_b,  // B descriptor (asm operand %77)
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      uint32_t const& e,  // asm operand %78; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // asm operand %80; only tested for non-zero
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: source line plus both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %80, 0;\n"  // %80 is int32_t(scale_D): p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n152k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75},"  // %0..%75 are d00..d75
+      " %76,"  // desc_a
+      " %77,"  // desc_b
+      " %78, %79,"  // e, spsel
+      " p,    %81,  %82;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0..%75: read-write float registers
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
+      :  "l"(desc_a),  // %76
+         "l"(desc_b),  // %77
+         "r"(e), "n"(int32_t(spsel)),  // %78 (register), %79 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %80 (register), %81 and %82 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x152x64 TN F32+=E5M2*E4M3 -- RS form: A is passed as four 32-bit registers, B as one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate, second-to-last wgmma operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate, last wgmma operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate, placed right after e
+>
+struct GMMA_64x152x64_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D list: the d arguments of fma are read-write
+  using ARegisters = uint32_t[4];  // same type and count as fma arguments a00..a03
+  using ERegisters = uint32_t[1];  // same type and count as fma argument e
+  using BRegisters = uint64_t[1];  // same type and count as fma argument desc_b
+  using CRegisters = float[76];  // same type and count as fma arguments d00..d75
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A values (asm operands %76..%79)
+      uint64_t const& desc_b,  // B descriptor (asm operand %80)
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      uint32_t const& e,  // asm operand %81; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // asm operand %83; only tested for non-zero
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: source line plus the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %83, 0;\n"  // %83 is int32_t(scale_D): p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n152k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75},"  // %0..%75 are d00..d75
+      "{%76,  %77,  %78,  %79},"  // a00..a03
+      " %80,"  // desc_b
+      " %81, %82,"  // e, spsel
+      " p,    %84,  %85;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0..%75: read-write float registers
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %76..%79
+         "l"(desc_b),  // %80
+         "r"(e), "n"(int32_t(spsel)),  // %81 (register), %82 (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %83 (register), %84 and %85 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x64 TN F16+=E5M2*E4M3 -- SS variant: A and B are both passed as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x160x64_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: d00..d39 are read-write ("+r") and updated in place
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[40];  // 40 accumulator registers, matching fma's d00..d39
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata operand of the .sp instruction -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %44, 0;\n"  // %44 is scale_D: predicate p is set when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n160k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      " %40,"
+      " %41,"
+      " %42, %43,"
+      " p,   %45, %46;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "l"(desc_a),  // %40
+         "l"(desc_b),  // %41
+         "r"(e), "n"(int32_t(spsel)),  // %42 = e (register), %43 = spsel (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %44 = scale_D, %45 = scaleA, %46 = scaleB
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x64 TN F16+=E5M2*E4M3 -- RS variant: A is passed in four 32-bit registers (a00..a03), B as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x160x64_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: d00..d39 are read-write ("+r") and updated in place
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[40];  // 40 accumulator registers, matching fma's d00..d39
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata operand of the .sp instruction -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %47, 0;\n"  // %47 is scale_D: predicate p is set when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n160k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      "{%40, %41, %42, %43},"
+      " %44,"
+      " %45, %46,"
+      " p,   %48, %49;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %40..%43
+         "l"(desc_b),  // %44
+         "r"(e), "n"(int32_t(spsel)),  // %45 = e (register), %46 = spsel (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %47 = scale_D, %48 = scaleA, %49 = scaleB
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x64 TN F32+=E5M2*E4M3 -- SS variant: A and B are both passed as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x160x64_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: d00..d79 are read-write ("+f") and updated in place
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[80];  // 80 float accumulators, matching fma's d00..d79
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata operand of the .sp instruction -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %84, 0;\n"  // %84 is scale_D: predicate p is set when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n160k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      " %80,"
+      " %81,"
+      " %82, %83,"
+      " p,    %85,  %86;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
+      :  "l"(desc_a),  // %80
+         "l"(desc_b),  // %81
+         "r"(e), "n"(int32_t(spsel)),  // %82 = e (register), %83 = spsel (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %84 = scale_D, %85 = scaleA, %86 = scaleB
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x64 TN F32+=E5M2*E4M3 -- RS variant: A is passed in four 32-bit registers (a00..a03), B as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x160x64_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: d00..d79 are read-write ("+f") and updated in place
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[80];  // 80 float accumulators, matching fma's d00..d79
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata operand of the .sp instruction -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %87, 0;\n"  // %87 is scale_D: predicate p is set when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n160k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      "{%80,  %81,  %82,  %83},"
+      " %84,"
+      " %85, %86,"
+      " p,    %88,  %89;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %80..%83
+         "l"(desc_b),  // %84
+         "r"(e), "n"(int32_t(spsel)),  // %85 = e (register), %86 = spsel (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %87 = scale_D, %88 = scaleA, %89 = scaleB
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x168x64 TN F16+=E5M2*E4M3 -- SS variant: A and B are both passed as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x168x64_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: d00..d41 are read-write ("+r") and updated in place
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[42];  // 42 accumulator registers, matching fma's d00..d41
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata operand of the .sp instruction -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %46, 0;\n"  // %46 is scale_D: predicate p is set when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n168k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41},"
+      " %42,"
+      " %43,"
+      " %44, %45,"
+      " p,   %47, %48;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41)
+      :  "l"(desc_a),  // %42
+         "l"(desc_b),  // %43
+         "r"(e), "n"(int32_t(spsel)),  // %44 = e (register), %45 = spsel (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %46 = scale_D, %47 = scaleA, %48 = scaleB
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x168x64 TN F16+=E5M2*E4M3 -- RS variant: A is passed in four 32-bit registers (a00..a03), B as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x168x64_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: d00..d41 are read-write ("+r") and updated in place
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = uint32_t[42];  // 42 accumulator registers, matching fma's d00..d41
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata operand of the .sp instruction -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %49, 0;\n"  // %49 is scale_D: predicate p is set when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n168k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41},"
+      "{%42, %43, %44, %45},"
+      " %46,"
+      " %47, %48,"
+      " p,   %50, %51;\n"
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %42..%45
+         "l"(desc_b),  // %46
+         "r"(e), "n"(int32_t(spsel)),  // %47 = e (register), %48 = spsel (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %49 = scale_D, %50 = scaleA, %51 = scaleB
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x168x64 TN F32+=E5M2*E4M3 -- SS variant: A and B are both passed as 64-bit descriptors (desc_a, desc_b).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x168x64_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: d00..d83 are read-write ("+f") and updated in place
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[84];  // 84 float accumulators, matching fma's d00..d83
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata operand of the .sp instruction -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %88, 0;\n"  // %88 is scale_D: predicate p is set when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n168k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83},"
+      " %84,"
+      " %85,"
+      " %86, %87,"
+      " p,    %89,  %90;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
+      :  "l"(desc_a),  // %84
+         "l"(desc_b),  // %85
+         "r"(e), "n"(int32_t(spsel)),  // %86 = e (register), %87 = spsel (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %88 = scale_D, %89 = scaleA, %90 = scaleB
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x168x64 TN F32+=E5M2*E4M3 -- RS variant: A is passed in four 32-bit registers (a00..a03), B as a 64-bit descriptor.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x168x64_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: d00..d83 are read-write ("+f") and updated in place
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[84];  // 84 float accumulators, matching fma's d00..d83
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata operand of the .sp instruction -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %91, 0;\n"  // %91 is scale_D: predicate p is set when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n168k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83},"
+      "{%84,  %85,  %86,  %87},"
+      " %88,"
+      " %89, %90,"
+      " p,    %92,  %93;\n"
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %84..%87
+         "l"(desc_b),  // %88
+         "r"(e), "n"(int32_t(spsel)),  // %89 = e (register), %90 = spsel (immediate)
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %91 = scale_D, %92 = scaleA, %93 = scaleB
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: report an invalid control path instead of emitting the instruction
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x64 TN F16+=E5M2*E4M3 (SS form: A and B are both passed to fma as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; becomes immediate asm operand %49
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; becomes immediate asm operand %50
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; becomes immediate asm operand %47
+>
+struct GMMA_64x176x64_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means the result lands in the C registers (they are "+r" below) - confirm against CuTe MMA traits
+  using ARegisters = uint64_t[1];  // same type/count as fma's desc_a parameter
+  using ERegisters = uint32_t[1];  // same type/count as fma's e parameter
+  using BRegisters = uint64_t[1];  // same type/count as fma's desc_b parameter
+  using CRegisters = uint32_t[44];  // same type/count as fma's accumulators d00..d43
+
+  CUTE_HOST_DEVICE static void  // wraps a single wgmma.mma_async.sp m64n176k64 f16.e5m2.e4m3 instruction in inline PTX
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d43: accumulators, read and written by the asm
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata operand of wgmma.sp - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // called before the asm with the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %48, 0;\n"  // %48 is int32_t(scale_D): p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n176k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%43: d00..d43
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"
+      " %44,"  // desc_a
+      " %45,"  // desc_b
+      " %46, %47,"  // e, spsel
+      " p,   %49, %50;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both an input and an output
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
+      :  "l"(desc_a),  // input operands start here at %44; their order fixes the %N indices used above
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x64 TN F16+=E5M2*E4M3 (RS form: A is passed to fma as four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; becomes immediate asm operand %52
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; becomes immediate asm operand %53
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; becomes immediate asm operand %50
+>
+struct GMMA_64x176x64_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means the result lands in the C registers (they are "+r" below) - confirm against CuTe MMA traits
+  using ARegisters = uint32_t[4];  // same type/count as fma's a00..a03 parameters
+  using ERegisters = uint32_t[1];  // same type/count as fma's e parameter
+  using BRegisters = uint64_t[1];  // same type/count as fma's desc_b parameter
+  using CRegisters = uint32_t[44];  // same type/count as fma's accumulators d00..d43
+
+  CUTE_HOST_DEVICE static void  // wraps a single wgmma.mma_async.sp m64n176k64 f16.e5m2.e4m3 instruction in inline PTX
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d43: accumulators, read and written by the asm
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata operand of wgmma.sp - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // called before the asm with the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %51, 0;\n"  // %51 is int32_t(scale_D): p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n176k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%43: d00..d43
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"
+      "{%44, %45, %46, %47},"  // a00..a03
+      " %48,"  // desc_b
+      " %49, %50,"  // e, spsel
+      " p,   %52, %53;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both an input and an output
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands start here at %44; their order fixes the %N indices used above
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x64 TN F32+=E5M2*E4M3 (SS form: A and B are both passed to fma as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; becomes immediate asm operand %93
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; becomes immediate asm operand %94
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; becomes immediate asm operand %91
+>
+struct GMMA_64x176x64_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means the result lands in the C registers (they are "+f" below) - confirm against CuTe MMA traits
+  using ARegisters = uint64_t[1];  // same type/count as fma's desc_a parameter
+  using ERegisters = uint32_t[1];  // same type/count as fma's e parameter
+  using BRegisters = uint64_t[1];  // same type/count as fma's desc_b parameter
+  using CRegisters = float[88];  // same type/count as fma's accumulators d00..d87
+
+  CUTE_HOST_DEVICE static void  // wraps a single wgmma.mma_async.sp m64n176k64 f32.e5m2.e4m3 instruction in inline PTX
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d87: accumulators, read and written by the asm
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata operand of wgmma.sp - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // called before the asm with the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %92, 0;\n"  // %92 is int32_t(scale_D): p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n176k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%87: d00..d87
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      " %88,"  // desc_a
+      " %89,"  // desc_b
+      " %90, %91,"  // e, spsel
+      " p,    %93,  %94;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": each accumulator is both an input and an output
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
+      :  "l"(desc_a),  // input operands start here at %88; their order fixes the %N indices used above
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x64 TN F32+=E5M2*E4M3 (RS form: A is passed to fma as four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; becomes immediate asm operand %96
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; becomes immediate asm operand %97
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; becomes immediate asm operand %94
+>
+struct GMMA_64x176x64_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means the result lands in the C registers (they are "+f" below) - confirm against CuTe MMA traits
+  using ARegisters = uint32_t[4];  // same type/count as fma's a00..a03 parameters
+  using ERegisters = uint32_t[1];  // same type/count as fma's e parameter
+  using BRegisters = uint64_t[1];  // same type/count as fma's desc_b parameter
+  using CRegisters = float[88];  // same type/count as fma's accumulators d00..d87
+
+  CUTE_HOST_DEVICE static void  // wraps a single wgmma.mma_async.sp m64n176k64 f32.e5m2.e4m3 instruction in inline PTX
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d87: accumulators, read and written by the asm
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata operand of wgmma.sp - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // called before the asm with the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %95, 0;\n"  // %95 is int32_t(scale_D): p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n176k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%87: d00..d87
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
+      "{%88,  %89,  %90,  %91},"  // a00..a03
+      " %92,"  // desc_b
+      " %93, %94,"  // e, spsel
+      " p,    %96,  %97;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": each accumulator is both an input and an output
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands start here at %88; their order fixes the %N indices used above
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x184x64 TN F16+=E5M2*E4M3 (SS form: A and B are both passed to fma as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; becomes immediate asm operand %51
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; becomes immediate asm operand %52
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; becomes immediate asm operand %49
+>
+struct GMMA_64x184x64_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means the result lands in the C registers (they are "+r" below) - confirm against CuTe MMA traits
+  using ARegisters = uint64_t[1];  // same type/count as fma's desc_a parameter
+  using ERegisters = uint32_t[1];  // same type/count as fma's e parameter
+  using BRegisters = uint64_t[1];  // same type/count as fma's desc_b parameter
+  using CRegisters = uint32_t[46];  // same type/count as fma's accumulators d00..d45
+
+  CUTE_HOST_DEVICE static void  // wraps a single wgmma.mma_async.sp m64n184k64 f16.e5m2.e4m3 instruction in inline PTX
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d45: accumulators, read and written by the asm
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata operand of wgmma.sp - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // called before the asm with the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %50, 0;\n"  // %50 is int32_t(scale_D): p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n184k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%45: d00..d45
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45},"
+      " %46,"  // desc_a
+      " %47,"  // desc_b
+      " %48, %49,"  // e, spsel
+      " p,   %51, %52;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both an input and an output
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45)
+      :  "l"(desc_a),  // input operands start here at %46; their order fixes the %N indices used above
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x184x64 TN F16+=E5M2*E4M3 (RS form: A is passed to fma as four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; becomes immediate asm operand %54
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; becomes immediate asm operand %55
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; becomes immediate asm operand %52
+>
+struct GMMA_64x184x64_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means the result lands in the C registers (they are "+r" below) - confirm against CuTe MMA traits
+  using ARegisters = uint32_t[4];  // same type/count as fma's a00..a03 parameters
+  using ERegisters = uint32_t[1];  // same type/count as fma's e parameter
+  using BRegisters = uint64_t[1];  // same type/count as fma's desc_b parameter
+  using CRegisters = uint32_t[46];  // same type/count as fma's accumulators d00..d45
+
+  CUTE_HOST_DEVICE static void  // wraps a single wgmma.mma_async.sp m64n184k64 f16.e5m2.e4m3 instruction in inline PTX
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d45: accumulators, read and written by the asm
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata operand of wgmma.sp - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // called before the asm with the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %53, 0;\n"  // %53 is int32_t(scale_D): p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n184k64.f16.e5m2.e4m3 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%45: d00..d45
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45},"
+      "{%46, %47, %48, %49},"  // a00..a03
+      " %50,"  // desc_b
+      " %51, %52,"  // e, spsel
+      " p,   %54, %55;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both an input and an output
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands start here at %46; their order fixes the %N indices used above
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x184x64 TN F32+=E5M2*E4M3 (SS form: A and B are both passed to fma as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; becomes immediate asm operand %97
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; becomes immediate asm operand %98
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; becomes immediate asm operand %95
+>
+struct GMMA_64x184x64_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means the result lands in the C registers (they are "+f" below) - confirm against CuTe MMA traits
+  using ARegisters = uint64_t[1];  // same type/count as fma's desc_a parameter
+  using ERegisters = uint32_t[1];  // same type/count as fma's e parameter
+  using BRegisters = uint64_t[1];  // same type/count as fma's desc_b parameter
+  using CRegisters = float[92];  // same type/count as fma's accumulators d00..d91
+
+  CUTE_HOST_DEVICE static void  // wraps a single wgmma.mma_async.sp m64n184k64 f32.e5m2.e4m3 instruction in inline PTX
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,  // d00..d91: accumulators, read and written by the asm
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata operand of wgmma.sp - confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // called before the asm with the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %96, 0;\n"  // %96 is int32_t(scale_D): p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n184k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%91: d00..d91
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91},"
+      " %92,"  // desc_a
+      " %93,"  // desc_b
+      " %94, %95,"  // e, spsel
+      " p,    %97,  %98;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": each accumulator is both an input and an output
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
+      :  "l"(desc_a),  // input operands start here at %92; their order fixes the %N indices used above
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x184x64 TN F32+=E5M2*E4M3 -- RS form: A is passed as four 32-bit register operands, B as one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; bound as immediate asm operand %100
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; bound as immediate asm operand %101
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; bound as immediate asm operand %98
+>
+struct GMMA_64x184x64_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate destination; the d operands are bound read-write in the asm
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[92];  // d00..d91
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,  // 64-bit descriptor for B
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      uint32_t const& e,  // extra 32-bit operand of the .sp form; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and tested != 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook defined elsewhere; receives this line number and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this braced asm block
+      "setp.ne.b32 p, %99, 0;\n"  // p = (scale_D != 0); %99 is the scale_D operand
+      "wgmma.mma_async.sp.sync.aligned.m64n184k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%91 = d00..d91
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91},"
+      "{%92,  %93,  %94,  %95},"  // a00..a03
+      " %96,"  // desc_b
+      " %97, %98,"  // e, spsel
+      " p,    %100, %101;\n"  // p (from scale_D), scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // read-write operands %0..%91
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands %92..%95
+         "l"(desc_b),  // %96
+         "r"(e), "n"(int32_t(spsel)),  // %97, %98
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %99, %100, %101
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x200x64 TN F16+=E5M2*E4M3 -- SS form: A and B are each passed as one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; bound as immediate asm operand %55
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; bound as immediate asm operand %56
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; bound as immediate asm operand %53
+>
+struct GMMA_64x200x64_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate destination; the d operands are bound read-write in the asm
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[50];  // d00..d49; 32-bit registers (presumably two packed F16 values each -- confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for A
+      uint64_t const& desc_b,  // 64-bit descriptor for B
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49,
+      uint32_t const& e,  // extra 32-bit operand of the .sp form; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and tested != 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook defined elsewhere; receives this line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this braced asm block
+      "setp.ne.b32 p, %54, 0;\n"  // p = (scale_D != 0); %54 is the scale_D operand
+      "wgmma.mma_async.sp.sync.aligned.m64n200k64.f16.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%49 = d00..d49
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49},"
+      " %50,"  // desc_a
+      " %51,"  // desc_b
+      " %52, %53,"  // e, spsel
+      " p,    %55,  %56;\n"  // p (from scale_D), scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write operands %0..%49
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49)
+      :  "l"(desc_a),  // input operand %50
+         "l"(desc_b),  // %51
+         "r"(e), "n"(int32_t(spsel)),  // %52, %53
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %54, %55, %56
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x200x64 TN F16+=E5M2*E4M3 -- RS form: A is passed as four 32-bit register operands, B as one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; bound as immediate asm operand %58
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; bound as immediate asm operand %59
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; bound as immediate asm operand %56
+>
+struct GMMA_64x200x64_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate destination; the d operands are bound read-write in the asm
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[50];  // d00..d49; 32-bit registers (presumably two packed F16 values each -- confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,  // 64-bit descriptor for B
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49,
+      uint32_t const& e,  // extra 32-bit operand of the .sp form; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and tested != 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook defined elsewhere; receives this line number and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this braced asm block
+      "setp.ne.b32 p, %57, 0;\n"  // p = (scale_D != 0); %57 is the scale_D operand
+      "wgmma.mma_async.sp.sync.aligned.m64n200k64.f16.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%49 = d00..d49
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49},"
+      "{%50,  %51,  %52,  %53},"  // a00..a03
+      " %54,"  // desc_b
+      " %55, %56,"  // e, spsel
+      " p,    %58,  %59;\n"  // p (from scale_D), scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write operands %0..%49
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands %50..%53
+         "l"(desc_b),  // %54
+         "r"(e), "n"(int32_t(spsel)),  // %55, %56
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %57, %58, %59
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x200x64 TN F32+=E5M2*E4M3 -- SS form: A and B are each passed as one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; bound as immediate asm operand %105
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; bound as immediate asm operand %106
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; bound as immediate asm operand %103
+>
+struct GMMA_64x200x64_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate destination; the d operands are bound read-write in the asm
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[100];  // d000..d099
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for A
+      uint64_t const& desc_b,  // 64-bit descriptor for B
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      uint32_t const& e,  // extra 32-bit operand of the .sp form; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and tested != 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook defined elsewhere; receives this line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this braced asm block
+      "setp.ne.b32 p, %104, 0;\n"  // p = (scale_D != 0); %104 is the scale_D operand
+      "wgmma.mma_async.sp.sync.aligned.m64n200k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%99 = d000..d099
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99},"
+      " %100,"  // desc_a
+      " %101,"  // desc_b
+      " %102, %103,"  // e, spsel
+      " p,    %105, %106;\n"  // p (from scale_D), scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // read-write operands %0..%99
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
+      :  "l"(desc_a),  // input operand %100
+         "l"(desc_b),  // %101
+         "r"(e), "n"(int32_t(spsel)),  // %102, %103
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %104, %105, %106
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x200x64 TN F32+=E5M2*E4M3 -- RS form: A is passed as four 32-bit register operands, B as one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; bound as immediate asm operand %108
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; bound as immediate asm operand %109
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; bound as immediate asm operand %106
+>
+struct GMMA_64x200x64_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate destination; the d operands are bound read-write in the asm
+  using ARegisters = uint32_t[4];  // a000..a003
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[100];  // d000..d099
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,  // 64-bit descriptor for B
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      uint32_t const& e,  // extra 32-bit operand of the .sp form; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and tested != 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook defined elsewhere; receives this line number and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this braced asm block
+      "setp.ne.b32 p, %107, 0;\n"  // p = (scale_D != 0); %107 is the scale_D operand
+      "wgmma.mma_async.sp.sync.aligned.m64n200k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%99 = d000..d099
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99},"
+      "{%100, %101, %102, %103},"  // a000..a003
+      " %104,"  // desc_b
+      " %105, %106,"  // e, spsel
+      " p,    %108, %109;\n"  // p (from scale_D), scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // read-write operands %0..%99
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // input operands %100..%103
+         "l"(desc_b),  // %104
+         "r"(e), "n"(int32_t(spsel)),  // %105, %106
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %107, %108, %109
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x64 TN F16+=E5M2*E4M3 -- SS form: A and B are each passed as one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; bound as immediate asm operand %57
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; bound as immediate asm operand %58
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; bound as immediate asm operand %55
+>
+struct GMMA_64x208x64_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate destination; the d operands are bound read-write in the asm
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[52];  // d00..d51; 32-bit registers (presumably two packed F16 values each -- confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // 64-bit descriptor for A
+      uint64_t const& desc_b,  // 64-bit descriptor for B
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t const& e,  // extra 32-bit operand of the .sp form; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and tested != 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook defined elsewhere; receives this line number and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this braced asm block
+      "setp.ne.b32 p, %56, 0;\n"  // p = (scale_D != 0); %56 is the scale_D operand
+      "wgmma.mma_async.sp.sync.aligned.m64n208k64.f16.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%51 = d00..d51
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"
+      " %52,"  // desc_a
+      " %53,"  // desc_b
+      " %54, %55,"  // e, spsel
+      " p,    %57,  %58;\n"  // p (from scale_D), scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write operands %0..%51
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
+      :  "l"(desc_a),  // input operand %52
+         "l"(desc_b),  // %53
+         "r"(e), "n"(int32_t(spsel)),  // %54, %55
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %56, %57, %58
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x64 TN F16+=E5M2*E4M3 -- RS form: A is passed as four 32-bit register operands, B as one 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; bound as immediate asm operand %60
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; bound as immediate asm operand %61
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; bound as immediate asm operand %58
+>
+struct GMMA_64x208x64_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate destination; the d operands are bound read-write in the asm
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[52];  // d00..d51; 32-bit registers (presumably two packed F16 values each -- confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,  // 64-bit descriptor for B
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t const& e,  // extra 32-bit operand of the .sp form; presumably the sparsity metadata -- confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32 and tested != 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook defined elsewhere; receives this line number and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // predicate register scoped to this braced asm block
+      "setp.ne.b32 p, %59, 0;\n"  // p = (scale_D != 0); %59 is the scale_D operand
+      "wgmma.mma_async.sp.sync.aligned.m64n208k64.f16.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%51 = d00..d51
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"
+      "{%52,  %53,  %54,  %55},"  // a00..a03
+      " %56,"  // desc_b
+      " %57, %58,"  // e, spsel
+      " p,    %60,  %61;\n"  // p (from scale_D), scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write operands %0..%51
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands %52..%55
+         "l"(desc_b),  // %56
+         "r"(e), "n"(int32_t(spsel)),  // %57, %58
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %59, %60, %61
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x64 TN F32+=E5M2*E4M3 -- SS form: wraps one wgmma.mma_async.sp m64n208k64 PTX instruction that takes A and B as 64-bit descriptors
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand, %109
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand, %110
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand %107, placed right after the e operand
+>
+struct GMMA_64x208x64_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate destination; the CRegisters accumulators are updated in place
+  using ARegisters = uint64_t[1];  // A: one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];  // e: one 32-bit register; presumably the sparsity metadata -- confirm against the PTX docs
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = float[104];  // 104 float accumulators (d000..d103)
+
+  CUTE_HOST_DEVICE static void  // issues the instruction; without CUTE_ARCH_MMA_SM90A_ENABLED it only reports an invalid control path
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; only tested for != 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, given this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %108, 0;\n"  // %108 is scale_D: p is true iff scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n208k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      " %104,"  // desc_a
+      " %105,"  // desc_b
+      " %106, %107,"  // e, spsel
+      " p,    %109, %110;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%103: accumulators, read-write ("+f")
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
+      :  "l"(desc_a),  // %104
+         "l"(desc_b),  // %105
+         "r"(e), "n"(int32_t(spsel)),  // %106, %107
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %108, %109, %110
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x64 TN F32+=E5M2*E4M3 -- RS form: wraps one wgmma.mma_async.sp m64n208k64 PTX instruction that takes A as four 32-bit registers and B as a 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand, %112
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand, %113
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand %110, placed right after the e operand
+>
+struct GMMA_64x208x64_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate destination; the CRegisters accumulators are updated in place
+  using ARegisters = uint32_t[4];  // A: four 32-bit registers (a000..a003)
+  using ERegisters = uint32_t[1];  // e: one 32-bit register; presumably the sparsity metadata -- confirm against the PTX docs
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = float[104];  // 104 float accumulators (d000..d103)
+
+  CUTE_HOST_DEVICE static void  // issues the instruction; without CUTE_ARCH_MMA_SM90A_ENABLED it only reports an invalid control path
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; only tested for != 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook, given this source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %111, 0;\n"  // %111 is scale_D: p is true iff scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n208k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
+      "{%104, %105, %106, %107},"  // a000..a003
+      " %108,"  // desc_b
+      " %109, %110,"  // e, spsel
+      " p,    %112, %113;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%103: accumulators, read-write ("+f")
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %104..%107
+         "l"(desc_b),  // %108
+         "r"(e), "n"(int32_t(spsel)),  // %109, %110
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %111, %112, %113
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x216x64 TN F16+=E5M2*E4M3 -- SS form: wraps one wgmma.mma_async.sp m64n216k64 PTX instruction that takes A and B as 64-bit descriptors
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand, %59
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand, %60
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand %57, placed right after the e operand
+>
+struct GMMA_64x216x64_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate destination; the CRegisters accumulators are updated in place
+  using ARegisters = uint64_t[1];  // A: one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];  // e: one 32-bit register; presumably the sparsity metadata -- confirm against the PTX docs
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[54];  // 54 32-bit accumulator registers (d00..d53); presumably two packed f16 values each -- confirm
+
+  CUTE_HOST_DEVICE static void  // issues the instruction; without CUTE_ARCH_MMA_SM90A_ENABLED it only reports an invalid control path
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; only tested for != 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, given this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %58, 0;\n"  // %58 is scale_D: p is true iff scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n216k64.f16.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53},"
+      " %54,"  // desc_a
+      " %55,"  // desc_b
+      " %56, %57,"  // e, spsel
+      " p,    %59,  %60;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%53: accumulators, read-write ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53)
+      :  "l"(desc_a),  // %54
+         "l"(desc_b),  // %55
+         "r"(e), "n"(int32_t(spsel)),  // %56, %57
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %58, %59, %60
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x216x64 TN F16+=E5M2*E4M3 -- RS form: wraps one wgmma.mma_async.sp m64n216k64 PTX instruction that takes A as four 32-bit registers and B as a 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand, %62
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand, %63
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand %60, placed right after the e operand
+>
+struct GMMA_64x216x64_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate destination; the CRegisters accumulators are updated in place
+  using ARegisters = uint32_t[4];  // A: four 32-bit registers (a00..a03)
+  using ERegisters = uint32_t[1];  // e: one 32-bit register; presumably the sparsity metadata -- confirm against the PTX docs
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[54];  // 54 32-bit accumulator registers (d00..d53); presumably two packed f16 values each -- confirm
+
+  CUTE_HOST_DEVICE static void  // issues the instruction; without CUTE_ARCH_MMA_SM90A_ENABLED it only reports an invalid control path
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; only tested for != 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook, given this source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %61, 0;\n"  // %61 is scale_D: p is true iff scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n216k64.f16.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53},"
+      "{%54,  %55,  %56,  %57},"  // a00..a03
+      " %58,"  // desc_b
+      " %59, %60,"  // e, spsel
+      " p,    %62,  %63;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%53: accumulators, read-write ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %54..%57
+         "l"(desc_b),  // %58
+         "r"(e), "n"(int32_t(spsel)),  // %59, %60
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %61, %62, %63
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x216x64 TN F32+=E5M2*E4M3 -- SS form: wraps one wgmma.mma_async.sp m64n216k64 PTX instruction that takes A and B as 64-bit descriptors
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand, %113
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand, %114
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand %111, placed right after the e operand
+>
+struct GMMA_64x216x64_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate destination; the CRegisters accumulators are updated in place
+  using ARegisters = uint64_t[1];  // A: one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];  // e: one 32-bit register; presumably the sparsity metadata -- confirm against the PTX docs
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = float[108];  // 108 float accumulators (d000..d107)
+
+  CUTE_HOST_DEVICE static void  // issues the instruction; without CUTE_ARCH_MMA_SM90A_ENABLED it only reports an invalid control path
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; only tested for != 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook, given this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %112, 0;\n"  // %112 is scale_D: p is true iff scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n216k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107},"
+      " %108,"  // desc_a
+      " %109,"  // desc_b
+      " %110, %111,"  // e, spsel
+      " p,    %113, %114;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%107: accumulators, read-write ("+f")
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
+      :  "l"(desc_a),  // %108
+         "l"(desc_b),  // %109
+         "r"(e), "n"(int32_t(spsel)),  // %110, %111
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %112, %113, %114
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x216x64 TN F32+=E5M2*E4M3 -- RS form: wraps one wgmma.mma_async.sp m64n216k64 PTX instruction that takes A as four 32-bit registers and B as a 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand, %116
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand, %117
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand %114, placed right after the e operand
+>
+struct GMMA_64x216x64_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate destination; the CRegisters accumulators are updated in place
+  using ARegisters = uint32_t[4];  // A: four 32-bit registers (a000..a003)
+  using ERegisters = uint32_t[1];  // e: one 32-bit register; presumably the sparsity metadata -- confirm against the PTX docs
+  using BRegisters = uint64_t[1];  // B: one 64-bit descriptor (desc_b)
+  using CRegisters = float[108];  // 108 float accumulators (d000..d107)
+
+  CUTE_HOST_DEVICE static void  // issues the instruction; without CUTE_ARCH_MMA_SM90A_ENABLED it only reports an invalid control path
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; only tested for != 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook, given this source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %115, 0;\n"  // %115 is scale_D: p is true iff scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n216k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107},"
+      "{%108, %109, %110, %111},"  // a000..a003
+      " %112,"  // desc_b
+      " %113, %114,"  // e, spsel
+      " p,    %116, %117;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%107: accumulators, read-write ("+f")
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %108..%111
+         "l"(desc_b),  // %112
+         "r"(e), "n"(int32_t(spsel)),  // %113, %114
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %115, %116, %117
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x64 TN F16+=E5M2*E4M3 (SS form: A and B are each passed as one 64-bit descriptor; issues one wgmma.mma_async.sp m64n224k64 instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate emitted right after e
+>
+struct GMMA_64x224x64_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D list; d00..d55 are read-write
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[56];  // d00..d55; presumably two packed f16 values per register (TODO confirm)
+
+  CUTE_HOST_DEVICE static void  // updates d00..d55 in place through the inline PTX below
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word; confirm against callers
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %60, 0;\n"  // %60 is int32_t(scale_D): p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n224k64.f16.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"  // %0..%55: d00..d55
+      " %56,"  // desc_a
+      " %57,"  // desc_b
+      " %58, %59,"  // e, spsel
+      " p,    %61,  %62;\n"  // p (from scale_D), scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "l"(desc_a),  // %56
+         "l"(desc_b),  // %57
+         "r"(e), "n"(int32_t(spsel)),  // %58, %59
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %60, %61, %62
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // reached only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x64 TN F16+=E5M2*E4M3 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor; issues one wgmma.mma_async.sp m64n224k64 instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate emitted right after e
+>
+struct GMMA_64x224x64_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D list; d00..d55 are read-write
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[56];  // d00..d55; presumably two packed f16 values per register (TODO confirm)
+
+  CUTE_HOST_DEVICE static void  // updates d00..d55 in place through the inline PTX below
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word; confirm against callers
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %63, 0;\n"  // %63 is int32_t(scale_D): p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n224k64.f16.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"  // %0..%55: d00..d55
+      "{%56,  %57,  %58,  %59},"  // a00..a03
+      " %60,"  // desc_b
+      " %61, %62,"  // e, spsel
+      " p,    %64,  %65;\n"  // p (from scale_D), scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %56..%59
+         "l"(desc_b),  // %60
+         "r"(e), "n"(int32_t(spsel)),  // %61, %62
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %63, %64, %65
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // reached only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x64 TN F32+=E5M2*E4M3 (SS form: A and B are each passed as one 64-bit descriptor; issues one wgmma.mma_async.sp m64n224k64 instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate emitted right after e
+>
+struct GMMA_64x224x64_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D list; d000..d111 are read-write
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[112];  // d000..d111
+
+  CUTE_HOST_DEVICE static void  // updates d000..d111 in place through the inline PTX below
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word; confirm against callers
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %116, 0;\n"  // %116 is int32_t(scale_D): p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n224k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"  // %0..%111: d000..d111
+      " %112,"  // desc_a
+      " %113,"  // desc_b
+      " %114, %115,"  // e, spsel
+      " p,    %117, %118;\n"  // p (from scale_D), scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
+      :  "l"(desc_a),  // %112
+         "l"(desc_b),  // %113
+         "r"(e), "n"(int32_t(spsel)),  // %114, %115
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %116, %117, %118
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // reached only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x64 TN F32+=E5M2*E4M3 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor; issues one wgmma.mma_async.sp m64n224k64 instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate emitted right after e
+>
+struct GMMA_64x224x64_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D list; d000..d111 are read-write
+  using ARegisters = uint32_t[4];  // a000..a003
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[112];  // d000..d111
+
+  CUTE_HOST_DEVICE static void  // updates d000..d111 in place through the inline PTX below
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word; confirm against callers
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %119, 0;\n"  // %119 is int32_t(scale_D): p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n224k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"  // %0..%111: d000..d111
+      "{%112, %113, %114, %115},"  // a000..a003
+      " %116,"  // desc_b
+      " %117, %118,"  // e, spsel
+      " p,    %120, %121;\n"  // p (from scale_D), scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %112..%115
+         "l"(desc_b),  // %116
+         "r"(e), "n"(int32_t(spsel)),  // %117, %118
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %119, %120, %121
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // reached only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x232x64 TN F16+=E5M2*E4M3 (SS form: A and B are each passed as one 64-bit descriptor; issues one wgmma.mma_async.sp m64n232k64 instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate emitted right after e
+>
+struct GMMA_64x232x64_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D list; d00..d57 are read-write
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[58];  // d00..d57; presumably two packed f16 values per register (TODO confirm)
+
+  CUTE_HOST_DEVICE static void  // updates d00..d57 in place through the inline PTX below
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word; confirm against callers
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %62, 0;\n"  // %62 is int32_t(scale_D): p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n232k64.f16.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57},"  // %0..%57: d00..d57
+      " %58,"  // desc_a
+      " %59,"  // desc_b
+      " %60, %61,"  // e, spsel
+      " p,    %63,  %64;\n"  // p (from scale_D), scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57)
+      :  "l"(desc_a),  // %58
+         "l"(desc_b),  // %59
+         "r"(e), "n"(int32_t(spsel)),  // %60, %61
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %62, %63, %64
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // reached only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x232x64 TN F16+=E5M2*E4M3 (RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor; issues one wgmma.mma_async.sp m64n232k64 instruction)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate emitted right after e
+>
+struct GMMA_64x232x64_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // fma() takes no separate D list; d00..d57 are read-write
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[58];  // d00..d57; presumably two packed f16 values per register (TODO confirm)
+
+  CUTE_HOST_DEVICE static void  // updates d00..d57 in place through the inline PTX below
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57,
+      uint32_t const& e,  // NOTE(review): presumably the sparsity metadata word; confirm against callers
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %65, 0;\n"  // %65 is int32_t(scale_D): p is true when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n232k64.f16.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57},"  // %0..%57: d00..d57
+      "{%58,  %59,  %60,  %61},"  // a00..a03
+      " %62,"  // desc_b
+      " %63, %64,"  // e, spsel
+      " p,    %66,  %67;\n"  // p (from scale_D), scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %58..%61
+         "l"(desc_b),  // %62
+         "r"(e), "n"(int32_t(spsel)),  // %63, %64
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %65, %66, %67
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // reached only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x232x64 TN F32+=E5M2*E4M3
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x232x64_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[116];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %120, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n232k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115},"
+      " %116,"
+      " %117,"
+      " %118, %119,"
+      " p,    %121, %122;\n"
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x232x64 TN F32+=E5M2*E4M3, RS form: A in four 32-bit registers, B as a 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted as an immediate ("n") asm operand right after e
+>
+struct GMMA_64x232x64_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are read-write ("+f")
+  using ARegisters = uint32_t[4];  // corresponds to fma's a000..a003
+  using ERegisters = uint32_t[1];  // corresponds to fma's e
+  using BRegisters = uint64_t[1];  // corresponds to fma's desc_b
+  using CRegisters = float[116];  // corresponds to fma's d000..d115
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      uint32_t const& e,  // presumably the sparsity-metadata word of the .sp instruction; confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value, compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %123, 0;\n"  // %123 = scale_D; p is true when it is nonzero
+      "wgmma.mma_async.sp.sync.aligned.m64n232k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115},"  // %0..%115 = d000..d115
+      "{%116, %117, %118, %119},"  // a000..a003
+      " %120,"  // desc_b
+      " %121, %122,"  // e, spsel
+      " p,    %124, %125;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted, the call is reported instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x64 TN F16+=E5M2*E4M3, SS form: A and B are both passed as 64-bit descriptors
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted as an immediate ("n") asm operand right after e
+>
+struct GMMA_64x240x64_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are read-write ("+r")
+  using ARegisters = uint64_t[1];  // corresponds to fma's desc_a
+  using ERegisters = uint32_t[1];  // corresponds to fma's e
+  using BRegisters = uint64_t[1];  // corresponds to fma's desc_b
+  using CRegisters = uint32_t[60];  // corresponds to fma's d00..d59 (presumably two packed f16 values each; confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t const& e,  // presumably the sparsity-metadata word of the .sp instruction; confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value, compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %64, 0;\n"  // %64 = scale_D; p is true when it is nonzero
+      "wgmma.mma_async.sp.sync.aligned.m64n240k64.f16.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"  // %0..%59 = d00..d59
+      " %60,"  // desc_a
+      " %61,"  // desc_b
+      " %62, %63,"  // e, spsel
+      " p,    %65,  %66;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted, the call is reported instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x64 TN F16+=E5M2*E4M3, RS form: A in four 32-bit registers, B as a 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted as an immediate ("n") asm operand right after e
+>
+struct GMMA_64x240x64_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are read-write ("+r")
+  using ARegisters = uint32_t[4];  // corresponds to fma's a00..a03
+  using ERegisters = uint32_t[1];  // corresponds to fma's e
+  using BRegisters = uint64_t[1];  // corresponds to fma's desc_b
+  using CRegisters = uint32_t[60];  // corresponds to fma's d00..d59 (presumably two packed f16 values each; confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t const& e,  // presumably the sparsity-metadata word of the .sp instruction; confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value, compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %67, 0;\n"  // %67 = scale_D; p is true when it is nonzero
+      "wgmma.mma_async.sp.sync.aligned.m64n240k64.f16.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"  // %0..%59 = d00..d59
+      "{%60,  %61,  %62,  %63},"  // a00..a03
+      " %64,"  // desc_b
+      " %65, %66,"  // e, spsel
+      " p,    %68,  %69;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted, the call is reported instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x64 TN F32+=E5M2*E4M3, SS form: A and B are both passed as 64-bit descriptors
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted as an immediate ("n") asm operand right after e
+>
+struct GMMA_64x240x64_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are read-write ("+f")
+  using ARegisters = uint64_t[1];  // corresponds to fma's desc_a
+  using ERegisters = uint32_t[1];  // corresponds to fma's e
+  using BRegisters = uint64_t[1];  // corresponds to fma's desc_b
+  using CRegisters = float[120];  // corresponds to fma's d000..d119
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      uint32_t const& e,  // presumably the sparsity-metadata word of the .sp instruction; confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value, compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %124, 0;\n"  // %124 = scale_D; p is true when it is nonzero
+      "wgmma.mma_async.sp.sync.aligned.m64n240k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"  // %0..%119 = d000..d119
+      " %120,"  // desc_a
+      " %121,"  // desc_b
+      " %122, %123,"  // e, spsel
+      " p,    %125, %126;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted, the call is reported instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x64 TN F32+=E5M2*E4M3, RS form: A in four 32-bit registers, B as a 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // emitted as an immediate ("n") asm operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // emitted as an immediate ("n") asm operand right after e
+>
+struct GMMA_64x240x64_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: the accumulators are read-write ("+f")
+  using ARegisters = uint32_t[4];  // corresponds to fma's a000..a003
+  using ERegisters = uint32_t[1];  // corresponds to fma's e
+  using BRegisters = uint64_t[1];  // corresponds to fma's desc_b
+  using CRegisters = float[120];  // corresponds to fma's d000..d119
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      uint32_t const& e,  // presumably the sparsity-metadata word of the .sp instruction; confirm against the PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value, compared with 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %127, 0;\n"  // %127 = scale_D; p is true when it is nonzero
+      "wgmma.mma_async.sp.sync.aligned.m64n240k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"  // %0..%119 = d000..d119
+      "{%120, %121, %122, %123},"  // a000..a003
+      " %124,"  // desc_b
+      " %125, %126,"  // e, spsel
+      " p,    %128, %129;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted, the call is reported instead
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x248x64 TN F16+=E5M2*E4M3 -- wraps one `wgmma.mma_async.sp` PTX instruction; SS form: A and B are each passed as a 64-bit descriptor operand
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand (%67)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand (%68)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; emitted as an immediate ("n") asm operand right after e (%65)
+>
+struct GMMA_64x248x64_F16E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably void because the result is accumulated in place in the C registers ("+r" below) -- confirm
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit value (desc_a)
+  using ERegisters = uint32_t[1];  // one 32-bit value (e); presumably the sparsity metadata -- confirm
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit value (desc_b)
+  using CRegisters = uint32_t[62];  // 62 32-bit accumulator registers (d00..d61)
+
+  CUTE_HOST_DEVICE static void  // fma: issues the instruction once; results come back through the d* references
+  fma(uint64_t const& desc_a,  // bound with the "l" (64-bit) constraint
+      uint64_t const& desc_b,  // bound with the "l" (64-bit) constraint
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61,
+      uint32_t const& e,  // bound with the "r" (32-bit register) constraint
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; the asm only tests it against 0
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declares predicate register p inside the braces
+      "setp.ne.b32 p, %66, 0;\n"  // p = (scale_D != 0); %66 is the int32_t(scale_D) input
+      "wgmma.mma_async.sp.sync.aligned.m64n248k64.f16.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61},"  // %0..%61 = d00..d61
+      " %62,"  // desc_a
+      " %63,"  // desc_b
+      " %64, %65,"  // e, spsel
+      " p,    %67,  %68;\n"  // predicate p, then immediates scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61)
+      :  "l"(desc_a),  // inputs are numbered from %62, after the 62 read-write operands
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, the call is reported as invalid
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x248x64 TN F16+=E5M2*E4M3 -- wraps one `wgmma.mma_async.sp` PTX instruction; RS form: A is passed as four 32-bit registers, B as a 64-bit descriptor operand
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand (%70)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand (%71)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; emitted as an immediate ("n") asm operand right after e (%68)
+>
+struct GMMA_64x248x64_F16E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably void because the result is accumulated in place in the C registers ("+r" below) -- confirm
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a00..a03)
+  using ERegisters = uint32_t[1];  // one 32-bit value (e); presumably the sparsity metadata -- confirm
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit value (desc_b)
+  using CRegisters = uint32_t[62];  // 62 32-bit accumulator registers (d00..d61)
+
+  CUTE_HOST_DEVICE static void  // fma: issues the instruction once; results come back through the d* references
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // each bound with the "r" constraint
+      uint64_t const& desc_b,  // bound with the "l" (64-bit) constraint
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61,
+      uint32_t const& e,  // bound with the "r" (32-bit register) constraint
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; the asm only tests it against 0
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declares predicate register p inside the braces
+      "setp.ne.b32 p, %69, 0;\n"  // p = (scale_D != 0); %69 is the int32_t(scale_D) input
+      "wgmma.mma_async.sp.sync.aligned.m64n248k64.f16.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61},"  // %0..%61 = d00..d61
+      "{%62,  %63,  %64,  %65},"  // a00..a03
+      " %66,"  // desc_b
+      " %67, %68,"  // e, spsel
+      " p,    %70,  %71;\n"  // predicate p, then immediates scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs are numbered from %62, after the 62 read-write operands
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, the call is reported as invalid
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x248x64 TN F32+=E5M2*E4M3 -- wraps one `wgmma.mma_async.sp` PTX instruction; SS form: A and B are each passed as a 64-bit descriptor operand
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand (%129)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand (%130)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; emitted as an immediate ("n") asm operand right after e (%127)
+>
+struct GMMA_64x248x64_F32E5M2E4M3_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably void because the result is accumulated in place in the C registers ("+f" below) -- confirm
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit value (desc_a)
+  using ERegisters = uint32_t[1];  // one 32-bit value (e); presumably the sparsity metadata -- confirm
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit value (desc_b)
+  using CRegisters = float[124];  // 124 float accumulator registers (d000..d123)
+
+  CUTE_HOST_DEVICE static void  // fma: issues the instruction once; results come back through the d* references
+  fma(uint64_t const& desc_a,  // bound with the "l" (64-bit) constraint
+      uint64_t const& desc_b,  // bound with the "l" (64-bit) constraint
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      uint32_t const& e,  // bound with the "r" (32-bit register) constraint
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; the asm only tests it against 0
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declares predicate register p inside the braces
+      "setp.ne.b32 p, %128, 0;\n"  // p = (scale_D != 0); %128 is the int32_t(scale_D) input
+      "wgmma.mma_async.sp.sync.aligned.m64n248k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123},"  // %0..%123 = d000..d123
+      " %124,"  // desc_a
+      " %125,"  // desc_b
+      " %126, %127,"  // e, spsel
+      " p,    %129, %130;\n"  // predicate p, then immediates scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // "+f": each float accumulator is both read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
+      :  "l"(desc_a),  // inputs are numbered from %124, after the 124 read-write operands
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, the call is reported as invalid
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x248x64 TN F32+=E5M2*E4M3 -- wraps one `wgmma.mma_async.sp` PTX instruction; RS form: A is passed as four 32-bit registers, B as a 64-bit descriptor operand
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand (%132)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand (%133)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; emitted as an immediate ("n") asm operand right after e (%130)
+>
+struct GMMA_64x248x64_F32E5M2E4M3_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably void because the result is accumulated in place in the C registers ("+f" below) -- confirm
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a000..a003)
+  using ERegisters = uint32_t[1];  // one 32-bit value (e); presumably the sparsity metadata -- confirm
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit value (desc_b)
+  using CRegisters = float[124];  // 124 float accumulator registers (d000..d123)
+
+  CUTE_HOST_DEVICE static void  // fma: issues the instruction once; results come back through the d* references
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // each bound with the "r" constraint
+      uint64_t const& desc_b,  // bound with the "l" (64-bit) constraint
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      uint32_t const& e,  // bound with the "r" (32-bit register) constraint
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; the asm only tests it against 0
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declares predicate register p inside the braces
+      "setp.ne.b32 p, %131, 0;\n"  // p = (scale_D != 0); %131 is the int32_t(scale_D) input
+      "wgmma.mma_async.sp.sync.aligned.m64n248k64.f32.e5m2.e4m3 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123},"  // %0..%123 = d000..d123
+      "{%124, %125, %126, %127},"  // a000..a003
+      " %128,"  // desc_b
+      " %129, %130,"  // e, spsel
+      " p,    %132, %133;\n"  // predicate p, then immediates scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // "+f": each float accumulator is both read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // inputs are numbered from %124, after the 124 read-write operands
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, the call is reported as invalid
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x24x64 TN F16+=E5M2*E5M2 -- wraps one `wgmma.mma_async.sp` PTX instruction; SS form: A and B are each passed as a 64-bit descriptor operand
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand (%11)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand (%12)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; emitted as an immediate ("n") asm operand right after e (%9)
+>
+struct GMMA_64x24x64_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably void because the result is accumulated in place in the C registers ("+r" below) -- confirm
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit value (desc_a)
+  using ERegisters = uint32_t[1];  // one 32-bit value (e); presumably the sparsity metadata -- confirm
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit value (desc_b)
+  using CRegisters = uint32_t[6];  // 6 32-bit accumulator registers (d0..d5)
+
+  CUTE_HOST_DEVICE static void  // fma: issues the instruction once; results come back through the d* references
+  fma(uint64_t const& desc_a,  // bound with the "l" (64-bit) constraint
+      uint64_t const& desc_b,  // bound with the "l" (64-bit) constraint
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5,
+      uint32_t const& e,  // bound with the "r" (32-bit register) constraint
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; the asm only tests it against 0
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declares predicate register p inside the braces
+      "setp.ne.b32 p, %10, 0;\n"  // p = (scale_D != 0); %10 is the int32_t(scale_D) input
+      "wgmma.mma_async.sp.sync.aligned.m64n24k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5},"  // %0..%5 = d0..d5
+      " %6,"  // desc_a
+      " %7,"  // desc_b
+      " %8, %9,"  // e, spsel
+      " p,   %11, %12;\n"  // predicate p, then immediates scaleA, scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),  // "+r": each accumulator is both read and written
+        "+r"(d4), "+r"(d5)
+      :  "l"(desc_a),  // inputs are numbered from %6, after the 6 read-write operands
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, the call is reported as invalid
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x24x64 TN F16+=E5M2*E5M2 -- wraps one `wgmma.mma_async.sp` PTX instruction; RS form: A is passed as four 32-bit registers, B as a 64-bit descriptor operand
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand (%14)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand (%15)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; emitted as an immediate ("n") asm operand right after e (%12)
+>
+struct GMMA_64x24x64_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably void because the result is accumulated in place in the C registers ("+r" below) -- confirm
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a0..a3)
+  using ERegisters = uint32_t[1];  // one 32-bit value (e); presumably the sparsity metadata -- confirm
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit value (desc_b)
+  using CRegisters = uint32_t[6];  // 6 32-bit accumulator registers (d0..d5)
+
+  CUTE_HOST_DEVICE static void  // fma: issues the instruction once; results come back through the d* references
+  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,  // each bound with the "r" constraint
+      uint64_t const& desc_b,  // bound with the "l" (64-bit) constraint
+      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
+      uint32_t      & d4, uint32_t      & d5,
+      uint32_t const& e,  // bound with the "r" (32-bit register) constraint
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; the asm only tests it against 0
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declares predicate register p inside the braces
+      "setp.ne.b32 p, %13, 0;\n"  // p = (scale_D != 0); %13 is the int32_t(scale_D) input
+      "wgmma.mma_async.sp.sync.aligned.m64n24k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5},"  // %0..%5 = d0..d5
+      "{%6,  %7,  %8,  %9},"  // a0..a3
+      " %10,"  // desc_b
+      " %11, %12,"  // e, spsel
+      " p,   %14, %15;\n"  // predicate p, then immediates scaleA, scaleB
+    "}\n"
+      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),  // "+r": each accumulator is both read and written
+        "+r"(d4), "+r"(d5)
+      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),  // inputs are numbered from %6, after the 6 read-write operands
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, the call is reported as invalid
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x24x64 TN F32+=E5M2*E5M2 -- wraps one `wgmma.mma_async.sp` PTX instruction; SS form: A and B are each passed as a 64-bit descriptor operand
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand (%17)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand (%18)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; emitted as an immediate ("n") asm operand right after e (%15)
+>
+struct GMMA_64x24x64_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably void because the result is accumulated in place in the C registers ("+f" below) -- confirm
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit value (desc_a)
+  using ERegisters = uint32_t[1];  // one 32-bit value (e); presumably the sparsity metadata -- confirm
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit value (desc_b)
+  using CRegisters = float[12];  // 12 float accumulator registers (d00..d11)
+
+  CUTE_HOST_DEVICE static void  // fma: issues the instruction once; results come back through the d* references
+  fma(uint64_t const& desc_a,  // bound with the "l" (64-bit) constraint
+      uint64_t const& desc_b,  // bound with the "l" (64-bit) constraint
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      uint32_t const& e,  // bound with the "r" (32-bit register) constraint
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; the asm only tests it against 0
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declares predicate register p inside the braces
+      "setp.ne.b32 p, %16, 0;\n"  // p = (scale_D != 0); %16 is the int32_t(scale_D) input
+      "wgmma.mma_async.sp.sync.aligned.m64n24k64.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"  // %0..%11 = d00..d11
+      " %12,"  // desc_a
+      " %13,"  // desc_b
+      " %14, %15,"  // e, spsel
+      " p,   %17, %18;\n"  // predicate p, then immediates scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": each float accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
+      :  "l"(desc_a),  // inputs are numbered from %12, after the 12 read-write operands
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, the call is reported as invalid
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x24x64 TN F32+=E5M2*E5M2 -- wraps one `wgmma.mma_async.sp` PTX instruction; RS form: A is passed as four 32-bit registers, B as a 64-bit descriptor operand
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand (%20)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand (%21)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; emitted as an immediate ("n") asm operand right after e (%18)
+>
+struct GMMA_64x24x64_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably void because the result is accumulated in place in the C registers ("+f" below) -- confirm
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a00..a03)
+  using ERegisters = uint32_t[1];  // one 32-bit value (e); presumably the sparsity metadata -- confirm
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit value (desc_b)
+  using CRegisters = float[12];  // 12 float accumulator registers (d00..d11)
+
+  CUTE_HOST_DEVICE static void  // fma: issues the instruction once; results come back through the d* references
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // each bound with the "r" constraint
+      uint64_t const& desc_b,  // bound with the "l" (64-bit) constraint
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      uint32_t const& e,  // bound with the "r" (32-bit register) constraint
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; the asm only tests it against 0
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declares predicate register p inside the braces
+      "setp.ne.b32 p, %19, 0;\n"  // p = (scale_D != 0); %19 is the int32_t(scale_D) input
+      "wgmma.mma_async.sp.sync.aligned.m64n24k64.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11},"  // %0..%11 = d00..d11
+      "{%12, %13, %14, %15},"  // a00..a03
+      " %16,"  // desc_b
+      " %17, %18,"  // e, spsel
+      " p,   %20, %21;\n"  // predicate p, then immediates scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+f": each float accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs are numbered from %12, after the 12 read-write operands
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, the call is reported as invalid
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x40x64 TN F16+=E5M2*E5M2 -- wraps one `wgmma.mma_async.sp` PTX instruction; SS form: A and B are each passed as a 64-bit descriptor operand
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand (%15)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand (%16)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; emitted as an immediate ("n") asm operand right after e (%13)
+>
+struct GMMA_64x40x64_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably void because the result is accumulated in place in the C registers ("+r" below) -- confirm
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit value (desc_a)
+  using ERegisters = uint32_t[1];  // one 32-bit value (e); presumably the sparsity metadata -- confirm
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit value (desc_b)
+  using CRegisters = uint32_t[10];  // 10 32-bit accumulator registers (d00..d09)
+
+  CUTE_HOST_DEVICE static void  // fma: issues the instruction once; results come back through the d* references
+  fma(uint64_t const& desc_a,  // bound with the "l" (64-bit) constraint
+      uint64_t const& desc_b,  // bound with the "l" (64-bit) constraint
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09,
+      uint32_t const& e,  // bound with the "r" (32-bit register) constraint
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; the asm only tests it against 0
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook; receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declares predicate register p inside the braces
+      "setp.ne.b32 p, %14, 0;\n"  // p = (scale_D != 0); %14 is the int32_t(scale_D) input
+      "wgmma.mma_async.sp.sync.aligned.m64n40k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9},"  // %0..%9 = d00..d09
+      " %10,"  // desc_a
+      " %11,"  // desc_b
+      " %12, %13,"  // e, spsel
+      " p,   %15, %16;\n"  // predicate p, then immediates scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09)
+      :  "l"(desc_a),  // inputs are numbered from %10, after the 10 read-write operands
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, the call is reported as invalid
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x40x64 TN F16+=E5M2*E5M2 -- wraps one `wgmma.mma_async.sp` PTX instruction; RS form: A is passed as four 32-bit registers, B as a 64-bit descriptor operand
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand (%18)
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand (%19)
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; emitted as an immediate ("n") asm operand right after e (%16)
+>
+struct GMMA_64x40x64_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably void because the result is accumulated in place in the C registers ("+r" below) -- confirm
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a00..a03)
+  using ERegisters = uint32_t[1];  // one 32-bit value (e); presumably the sparsity metadata -- confirm
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit value (desc_b)
+  using CRegisters = uint32_t[10];  // 10 32-bit accumulator registers (d00..d09)
+
+  CUTE_HOST_DEVICE static void  // fma: issues the instruction once; results come back through the d* references
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // each bound with the "r" constraint
+      uint64_t const& desc_b,  // bound with the "l" (64-bit) constraint
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09,
+      uint32_t const& e,  // bound with the "r" (32-bit register) constraint
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; the asm only tests it against 0
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path is compiled only when this macro is defined
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook; receives the source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // declares predicate register p inside the braces
+      "setp.ne.b32 p, %17, 0;\n"  // p = (scale_D != 0); %17 is the int32_t(scale_D) input
+      "wgmma.mma_async.sp.sync.aligned.m64n40k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9},"  // %0..%9 = d00..d09
+      "{%10, %11, %12, %13},"  // a00..a03
+      " %14,"  // desc_b
+      " %15, %16,"  // e, spsel
+      " p,   %18, %19;\n"  // predicate p, then immediates scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs are numbered from %10, after the 10 read-write operands
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no asm is emitted, the call is reported as invalid
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x40x64 TN F32+=E5M2*E5M2 (SS form: A and B both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x40x64_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[20];    // d00..d19, read-modify-write (+f constraints)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %24, 0;\n"  // %24 = scale_D, so p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n40k64.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%19 = d00..d19
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"
+      " %20,"  // desc_a
+      " %21,"  // desc_b
+      " %22, %23,"  // e, spsel
+      " p,   %25, %26;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; spsel/scaleA/scaleB are immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x40x64 TN F32+=E5M2*E5M2 (RS form: A in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x40x64_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[20];    // d00..d19, read-modify-write (+f constraints)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %27, 0;\n"  // %27 = scale_D, so p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n40k64.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%19 = d00..d19
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"
+      "{%20, %21, %22, %23},"  // a00..a03
+      " %24,"  // desc_b
+      " %25, %26,"  // e, spsel
+      " p,   %28, %29;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; spsel/scaleA/scaleB are immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x48x64 TN F16+=E5M2*E5M2 (SS form: A and B both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x48x64_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];   // desc_a
+  using ERegisters = uint32_t[1];   // e
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = uint32_t[12];  // d00..d11, read-modify-write (+r constraints)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %16, 0;\n"  // %16 = scale_D, so p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n48k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%11 = d00..d11
+      " %8,  %9,  %10, %11},"
+      " %12,"  // desc_a
+      " %13,"  // desc_b
+      " %14, %15,"  // e, spsel
+      " p,   %17, %18;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; spsel/scaleA/scaleB are immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x48x64 TN F16+=E5M2*E5M2 (RS form: A in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x48x64_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];   // a00..a03
+  using ERegisters = uint32_t[1];   // e
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = uint32_t[12];  // d00..d11, read-modify-write (+r constraints)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %19, 0;\n"  // %19 = scale_D, so p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n48k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%11 = d00..d11
+      " %8,  %9,  %10, %11},"
+      "{%12, %13, %14, %15},"  // a00..a03
+      " %16,"  // desc_b
+      " %17, %18,"  // e, spsel
+      " p,   %20, %21;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; spsel/scaleA/scaleB are immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x48x64 TN F32+=E5M2*E5M2 (SS form: A and B both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x48x64_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[24];    // d00..d23, read-modify-write (+f constraints)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %28, 0;\n"  // %28 = scale_D, so p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n48k64.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%23 = d00..d23
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      " %24,"  // desc_a
+      " %25,"  // desc_b
+      " %26, %27,"  // e, spsel
+      " p,   %29, %30;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; spsel/scaleA/scaleB are immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x48x64 TN F32+=E5M2*E5M2 (RS form: A in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x48x64_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[24];    // d00..d23, read-modify-write (+f constraints)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %31, 0;\n"  // %31 = scale_D, so p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n48k64.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%23 = d00..d23
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23},"
+      "{%24, %25, %26, %27},"  // a00..a03
+      " %28,"  // desc_b
+      " %29, %30,"  // e, spsel
+      " p,   %32, %33;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; spsel/scaleA/scaleB are immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x56x64 TN F16+=E5M2*E5M2 (SS form: A and B both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x56x64_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];   // desc_a
+  using ERegisters = uint32_t[1];   // e
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = uint32_t[14];  // d00..d13, read-modify-write (+r constraints)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %18, 0;\n"  // %18 = scale_D, so p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n56k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%13 = d00..d13
+      " %8,  %9,  %10, %11, %12, %13},"
+      " %14,"  // desc_a
+      " %15,"  // desc_b
+      " %16, %17,"  // e, spsel
+      " p,   %19, %20;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; spsel/scaleA/scaleB are immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x56x64 TN F16+=E5M2*E5M2 (RS form: A in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x56x64_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];   // a00..a03
+  using ERegisters = uint32_t[1];   // e
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = uint32_t[14];  // d00..d13, read-modify-write (+r constraints)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %21, 0;\n"  // %21 = scale_D, so p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n56k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%13 = d00..d13
+      " %8,  %9,  %10, %11, %12, %13},"
+      "{%14, %15, %16, %17},"  // a00..a03
+      " %18,"  // desc_b
+      " %19, %20,"  // e, spsel
+      " p,   %22, %23;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; spsel/scaleA/scaleB are immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x56x64 TN F32+=E5M2*E5M2 (SS form: A and B both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x56x64_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[28];    // d00..d27, read-modify-write (+f constraints)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %32, 0;\n"  // %32 = scale_D, so p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n56k64.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%27 = d00..d27
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"
+      " %28,"  // desc_a
+      " %29,"  // desc_b
+      " %30, %31,"  // e, spsel
+      " p,   %33, %34;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; spsel/scaleA/scaleB are immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x56x64 TN F32+=E5M2*E5M2 (RS form: A in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x56x64_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[28];    // d00..d27, read-modify-write (+f constraints)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %35, 0;\n"  // %35 = scale_D, so p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n56k64.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%27 = d00..d27
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"
+      "{%28, %29, %30, %31},"  // a00..a03
+      " %32,"  // desc_b
+      " %33, %34,"  // e, spsel
+      " p,   %36, %37;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; spsel/scaleA/scaleB are immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x72x64 TN F16+=E5M2*E5M2 (SS form: A and B both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x72x64_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];   // desc_a
+  using ERegisters = uint32_t[1];   // e
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = uint32_t[18];  // d00..d17, read-modify-write (+r constraints)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %22, 0;\n"  // %22 = scale_D, so p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n72k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%17 = d00..d17
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17},"
+      " %18,"  // desc_a
+      " %19,"  // desc_b
+      " %20, %21,"  // e, spsel
+      " p,   %23, %24;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; spsel/scaleA/scaleB are immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x72x64 TN F16+=E5M2*E5M2 (RS form: A in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x72x64_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];   // a00..a03
+  using ERegisters = uint32_t[1];   // e
+  using BRegisters = uint64_t[1];   // desc_b
+  using CRegisters = uint32_t[18];  // d00..d17, read-modify-write (+r constraints)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %25, 0;\n"  // %25 = scale_D, so p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n72k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%17 = d00..d17
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17},"
+      "{%18, %19, %20, %21},"  // a00..a03
+      " %22,"  // desc_b
+      " %23, %24,"  // e, spsel
+      " p,   %26, %27;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; spsel/scaleA/scaleB are immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x72x64 TN F32+=E5M2*E5M2 (SS form: A and B both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x72x64_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[36];    // d00..d35, read-modify-write (+f constraints)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %40, 0;\n"  // %40 = scale_D, so p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n72k64.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%35 = d00..d35
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"
+      " %36,"  // desc_a
+      " %37,"  // desc_b
+      " %38, %39,"  // e, spsel
+      " p,   %41, %42;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D in a register; spsel/scaleA/scaleB are immediates
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x72x64 TN F32+=E5M2*E5M2 -- RS form: A in four 32-bit registers, B as a 64-bit descriptor; issues one wgmma.mma_async.sp m64n72k64 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x72x64_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;         // no separate D operand: the accumulators below are updated in place
+  using ARegisters = uint32_t[4];  // A operand: a00..a03
+  using ERegisters = uint32_t[1];  // `e`; presumably the sparsity metadata word -- TODO confirm against PTX wgmma.sp
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[36];    // accumulators d00..d35
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      uint32_t const& e,  // bound to asm operand %41
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; tested as p = (scale_D != 0) in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %43, 0;\n"  // %43 is scale_D; p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n72k64.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"  // %0..%35: accumulators d00..d35
+      "{%36, %37, %38, %39},"  // a00..a03
+      " %40,"  // desc_b
+      " %41, %42,"  // e, spsel
+      " p,   %44, %45;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+": every accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": template argument passed as an immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x64 TN F16+=E5M2*E5M2 -- SS form: A and B are both passed as 64-bit descriptors; issues one wgmma.mma_async.sp m64n80k64 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x80x64_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;          // no separate D operand: the accumulators below are updated in place
+  using ARegisters = uint64_t[1];   // A operand: desc_a
+  using ERegisters = uint32_t[1];   // `e`; presumably the sparsity metadata word -- TODO confirm against PTX wgmma.sp
+  using BRegisters = uint64_t[1];   // B operand: desc_b
+  using CRegisters = uint32_t[20];  // accumulators d00..d19 as raw 32-bit words (presumably two packed f16 each -- confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t const& e,  // bound to asm operand %22
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; tested as p = (scale_D != 0) in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %24, 0;\n"  // %24 is scale_D; p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n80k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"  // %0..%19: accumulators d00..d19
+      " %20,"  // desc_a
+      " %21,"  // desc_b
+      " %22, %23,"  // e, spsel
+      " p,   %25, %26;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+": every accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": template argument passed as an immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x64 TN F16+=E5M2*E5M2 -- RS form: A in four 32-bit registers, B as a 64-bit descriptor; issues one wgmma.mma_async.sp m64n80k64 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x80x64_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;          // no separate D operand: the accumulators below are updated in place
+  using ARegisters = uint32_t[4];   // A operand: a00..a03
+  using ERegisters = uint32_t[1];   // `e`; presumably the sparsity metadata word -- TODO confirm against PTX wgmma.sp
+  using BRegisters = uint64_t[1];   // B operand: desc_b
+  using CRegisters = uint32_t[20];  // accumulators d00..d19 as raw 32-bit words (presumably two packed f16 each -- confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t const& e,  // bound to asm operand %25
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; tested as p = (scale_D != 0) in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %27, 0;\n"  // %27 is scale_D; p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n80k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19},"  // %0..%19: accumulators d00..d19
+      "{%20, %21, %22, %23},"  // a00..a03
+      " %24,"  // desc_b
+      " %25, %26,"  // e, spsel
+      " p,   %28, %29;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+": every accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": template argument passed as an immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x64 TN F32+=E5M2*E5M2 -- SS form: A and B are both passed as 64-bit descriptors; issues one wgmma.mma_async.sp m64n80k64 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x80x64_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;         // no separate D operand: the accumulators below are updated in place
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using ERegisters = uint32_t[1];  // `e`; presumably the sparsity metadata word -- TODO confirm against PTX wgmma.sp
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[40];    // accumulators d00..d39
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      uint32_t const& e,  // bound to asm operand %42
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; tested as p = (scale_D != 0) in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %44, 0;\n"  // %44 is scale_D; p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n80k64.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"  // %0..%39: accumulators d00..d39
+      " %40,"  // desc_a
+      " %41,"  // desc_b
+      " %42, %43,"  // e, spsel
+      " p,   %45, %46;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+": every accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": template argument passed as an immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x80x64 TN F32+=E5M2*E5M2 -- RS form: A in four 32-bit registers, B as a 64-bit descriptor; issues one wgmma.mma_async.sp m64n80k64 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x80x64_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;         // no separate D operand: the accumulators below are updated in place
+  using ARegisters = uint32_t[4];  // A operand: a00..a03
+  using ERegisters = uint32_t[1];  // `e`; presumably the sparsity metadata word -- TODO confirm against PTX wgmma.sp
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[40];    // accumulators d00..d39
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      uint32_t const& e,  // bound to asm operand %45
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; tested as p = (scale_D != 0) in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %47, 0;\n"  // %47 is scale_D; p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n80k64.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"  // %0..%39: accumulators d00..d39
+      "{%40, %41, %42, %43},"  // a00..a03
+      " %44,"  // desc_b
+      " %45, %46,"  // e, spsel
+      " p,   %48, %49;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+": every accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": template argument passed as an immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x88x64 TN F16+=E5M2*E5M2 -- SS form: A and B are both passed as 64-bit descriptors; issues one wgmma.mma_async.sp m64n88k64 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x88x64_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;          // no separate D operand: the accumulators below are updated in place
+  using ARegisters = uint64_t[1];   // A operand: desc_a
+  using ERegisters = uint32_t[1];   // `e`; presumably the sparsity metadata word -- TODO confirm against PTX wgmma.sp
+  using BRegisters = uint64_t[1];   // B operand: desc_b
+  using CRegisters = uint32_t[22];  // accumulators d00..d21 as raw 32-bit words (presumably two packed f16 each -- confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21,
+      uint32_t const& e,  // bound to asm operand %24
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; tested as p = (scale_D != 0) in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %26, 0;\n"  // %26 is scale_D; p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n88k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21},"  // %0..%21: accumulators d00..d21
+      " %22,"  // desc_a
+      " %23,"  // desc_b
+      " %24, %25,"  // e, spsel
+      " p,   %27, %28;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+": every accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": template argument passed as an immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x88x64 TN F16+=E5M2*E5M2 -- RS form: A in four 32-bit registers, B as a 64-bit descriptor; issues one wgmma.mma_async.sp m64n88k64 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x88x64_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;          // no separate D operand: the accumulators below are updated in place
+  using ARegisters = uint32_t[4];   // A operand: a00..a03
+  using ERegisters = uint32_t[1];   // `e`; presumably the sparsity metadata word -- TODO confirm against PTX wgmma.sp
+  using BRegisters = uint64_t[1];   // B operand: desc_b
+  using CRegisters = uint32_t[22];  // accumulators d00..d21 as raw 32-bit words (presumably two packed f16 each -- confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21,
+      uint32_t const& e,  // bound to asm operand %27
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; tested as p = (scale_D != 0) in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %29, 0;\n"  // %29 is scale_D; p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n88k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21},"  // %0..%21: accumulators d00..d21
+      "{%22, %23, %24, %25},"  // a00..a03
+      " %26,"  // desc_b
+      " %27, %28,"  // e, spsel
+      " p,   %30, %31;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+": every accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": template argument passed as an immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x88x64 TN F32+=E5M2*E5M2 -- SS form: A and B are both passed as 64-bit descriptors; issues one wgmma.mma_async.sp m64n88k64 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x88x64_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;         // no separate D operand: the accumulators below are updated in place
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using ERegisters = uint32_t[1];  // `e`; presumably the sparsity metadata word -- TODO confirm against PTX wgmma.sp
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[44];    // accumulators d00..d43
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      uint32_t const& e,  // bound to asm operand %46
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; tested as p = (scale_D != 0) in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %48, 0;\n"  // %48 is scale_D; p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n88k64.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"  // %0..%43: accumulators d00..d43
+      " %44,"  // desc_a
+      " %45,"  // desc_b
+      " %46, %47,"  // e, spsel
+      " p,   %49, %50;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+": every accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": template argument passed as an immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x88x64 TN F32+=E5M2*E5M2 -- RS form: A in four 32-bit registers, B as a 64-bit descriptor; issues one wgmma.mma_async.sp m64n88k64 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x88x64_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;         // no separate D operand: the accumulators below are updated in place
+  using ARegisters = uint32_t[4];  // A operand: a00..a03
+  using ERegisters = uint32_t[1];  // `e`; presumably the sparsity metadata word -- TODO confirm against PTX wgmma.sp
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[44];    // accumulators d00..d43
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      uint32_t const& e,  // bound to asm operand %49
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; tested as p = (scale_D != 0) in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %51, 0;\n"  // %51 is scale_D; p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n88k64.f32.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"  // %0..%43: accumulators d00..d43
+      "{%44, %45, %46, %47},"  // a00..a03
+      " %48,"  // desc_b
+      " %49, %50,"  // e, spsel
+      " p,   %52, %53;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+": every accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": template argument passed as an immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x104x64 TN F16+=E5M2*E5M2 -- SS form: A and B are both passed as 64-bit descriptors; issues one wgmma.mma_async.sp m64n104k64 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x104x64_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;          // no separate D operand: the accumulators below are updated in place
+  using ARegisters = uint64_t[1];   // A operand: desc_a
+  using ERegisters = uint32_t[1];   // `e`; presumably the sparsity metadata word -- TODO confirm against PTX wgmma.sp
+  using BRegisters = uint64_t[1];   // B operand: desc_b
+  using CRegisters = uint32_t[26];  // accumulators d00..d25 as raw 32-bit words (presumably two packed f16 each -- confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25,
+      uint32_t const& e,  // bound to asm operand %28
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; tested as p = (scale_D != 0) in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %30, 0;\n"  // %30 is scale_D; p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n104k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25},"  // %0..%25: accumulators d00..d25
+      " %26,"  // desc_a
+      " %27,"  // desc_b
+      " %28, %29,"  // e, spsel
+      " p,   %31, %32;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+": every accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": template argument passed as an immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x104x64 TN F16+=E5M2*E5M2 -- RS form: A in four 32-bit registers, B as a 64-bit descriptor; issues one wgmma.mma_async.sp m64n104k64 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x104x64_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;          // no separate D operand: the accumulators below are updated in place
+  using ARegisters = uint32_t[4];   // A operand: a00..a03
+  using ERegisters = uint32_t[1];   // `e`; presumably the sparsity metadata word -- TODO confirm against PTX wgmma.sp
+  using BRegisters = uint64_t[1];   // B operand: desc_b
+  using CRegisters = uint32_t[26];  // accumulators d00..d25 as raw 32-bit words (presumably two packed f16 each -- confirm)
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25,
+      uint32_t const& e,  // bound to asm operand %31
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; tested as p = (scale_D != 0) in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives this source line and the B descriptor
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %33, 0;\n"  // %33 is scale_D; p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n104k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25},"  // %0..%25: accumulators d00..d25
+      "{%26, %27, %28, %29},"  // a00..a03
+      " %30,"  // desc_b
+      " %31, %32,"  // e, spsel
+      " p,   %34, %35;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+": every accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": template argument passed as an immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x104x64 TN F32+=E5M2*E5M2 -- SS form: A and B are both passed as 64-bit descriptors; issues one wgmma.mma_async.sp m64n104k64 instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x104x64_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;         // no separate D operand: the accumulators below are updated in place
+  using ARegisters = uint64_t[1];  // A operand: desc_a
+  using ERegisters = uint32_t[1];  // `e`; presumably the sparsity metadata word -- TODO confirm against PTX wgmma.sp
+  using BRegisters = uint64_t[1];  // B operand: desc_b
+  using CRegisters = float[52];    // accumulators d00..d51
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      uint32_t const& e,  // bound to asm operand %54
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; tested as p = (scale_D != 0) in the asm below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives this source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %56, 0;\n"  // %56 is scale_D; p is true when it is non-zero
+      "wgmma.mma_async.sp.sync.aligned.m64n104k64.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"  // %0..%51: accumulators d00..d51
+      " %52,"  // desc_a
+      " %53,"  // desc_b
+      " %54, %55,"  // e, spsel
+      " p,    %57,  %58;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // "+": every accumulator is both read and written
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),  // "n": template argument passed as an immediate
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x104x64 TN F32+=E5M2*E5M2 (RS variant: A is passed as four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate (n constraint) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate (n constraint) operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand placed right after e in the instruction
+>
+struct GMMA_64x104x64_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // fma takes no separate D arguments; d00..d51 are read-write (+f constraint)
+  using ARegisters = uint32_t[4];  // same count/type as a00..a03 in fma
+  using ERegisters = uint32_t[1];  // same count/type as e in fma
+  using BRegisters = uint64_t[1];  // same count/type as desc_b in fma
+  using CRegisters = float[52];  // same count/type as d00..d51 in fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      uint32_t const& e,  // NOTE(review): presumably the sp-meta operand of wgmma.mma_async.sp - confirm in PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // helper defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %59, 0;\n"  // %59 is scale_D: p is set when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n104k64.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"  // %0..%51: d00..d51
+      "{%52,  %53,  %54,  %55},"  // a00..a03
+      " %56,"  // desc_b
+      " %57, %58,"  // e, spsel
+      " p,    %60,  %61;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0..%51, in/out accumulators
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %52..%55
+         "l"(desc_b),  // %56
+         "r"(e), "n"(int32_t(spsel)),  // %57, %58
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %59, %60, %61
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x64 TN F16+=E5M2*E5M2 (SS variant: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate (n constraint) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate (n constraint) operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand placed right after e in the instruction
+>
+struct GMMA_64x112x64_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // fma takes no separate D arguments; d00..d27 are read-write (+r constraint)
+  using ARegisters = uint64_t[1];  // same count/type as desc_a in fma
+  using ERegisters = uint32_t[1];  // same count/type as e in fma
+  using BRegisters = uint64_t[1];  // same count/type as desc_b in fma
+  using CRegisters = uint32_t[28];  // same count/type as d00..d27 in fma; NOTE(review): presumably two f16 per register - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t const& e,  // NOTE(review): presumably the sp-meta operand of wgmma.mma_async.sp - confirm in PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // helper defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %32, 0;\n"  // %32 is scale_D: p is set when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n112k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"  // %0..%27: d00..d27
+      " %28,"  // desc_a
+      " %29,"  // desc_b
+      " %30, %31,"  // e, spsel
+      " p,   %33, %34;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%27, in/out accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
+      :  "l"(desc_a),  // %28
+         "l"(desc_b),  // %29
+         "r"(e), "n"(int32_t(spsel)),  // %30, %31
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %32, %33, %34
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x64 TN F16+=E5M2*E5M2 (RS variant: A is passed as four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate (n constraint) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate (n constraint) operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand placed right after e in the instruction
+>
+struct GMMA_64x112x64_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // fma takes no separate D arguments; d00..d27 are read-write (+r constraint)
+  using ARegisters = uint32_t[4];  // same count/type as a00..a03 in fma
+  using ERegisters = uint32_t[1];  // same count/type as e in fma
+  using BRegisters = uint64_t[1];  // same count/type as desc_b in fma
+  using CRegisters = uint32_t[28];  // same count/type as d00..d27 in fma; NOTE(review): presumably two f16 per register - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t const& e,  // NOTE(review): presumably the sp-meta operand of wgmma.mma_async.sp - confirm in PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // helper defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %35, 0;\n"  // %35 is scale_D: p is set when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n112k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27},"  // %0..%27: d00..d27
+      "{%28, %29, %30, %31},"  // a00..a03
+      " %32,"  // desc_b
+      " %33, %34,"  // e, spsel
+      " p,   %36, %37;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%27, in/out accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %28..%31
+         "l"(desc_b),  // %32
+         "r"(e), "n"(int32_t(spsel)),  // %33, %34
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %35, %36, %37
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x64 TN F32+=E5M2*E5M2 (SS variant: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate (n constraint) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate (n constraint) operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand placed right after e in the instruction
+>
+struct GMMA_64x112x64_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // fma takes no separate D arguments; d00..d55 are read-write (+f constraint)
+  using ARegisters = uint64_t[1];  // same count/type as desc_a in fma
+  using ERegisters = uint32_t[1];  // same count/type as e in fma
+  using BRegisters = uint64_t[1];  // same count/type as desc_b in fma
+  using CRegisters = float[56];  // same count/type as d00..d55 in fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      uint32_t const& e,  // NOTE(review): presumably the sp-meta operand of wgmma.mma_async.sp - confirm in PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // helper defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %60, 0;\n"  // %60 is scale_D: p is set when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n112k64.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"  // %0..%55: d00..d55
+      " %56,"  // desc_a
+      " %57,"  // desc_b
+      " %58, %59,"  // e, spsel
+      " p,    %61,  %62;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0..%55, in/out accumulators
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
+      :  "l"(desc_a),  // %56
+         "l"(desc_b),  // %57
+         "r"(e), "n"(int32_t(spsel)),  // %58, %59
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %60, %61, %62
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x112x64 TN F32+=E5M2*E5M2 (RS variant: A is passed as four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate (n constraint) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate (n constraint) operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand placed right after e in the instruction
+>
+struct GMMA_64x112x64_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // fma takes no separate D arguments; d00..d55 are read-write (+f constraint)
+  using ARegisters = uint32_t[4];  // same count/type as a00..a03 in fma
+  using ERegisters = uint32_t[1];  // same count/type as e in fma
+  using BRegisters = uint64_t[1];  // same count/type as desc_b in fma
+  using CRegisters = float[56];  // same count/type as d00..d55 in fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      uint32_t const& e,  // NOTE(review): presumably the sp-meta operand of wgmma.mma_async.sp - confirm in PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // helper defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %63, 0;\n"  // %63 is scale_D: p is set when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n112k64.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"  // %0..%55: d00..d55
+      "{%56,  %57,  %58,  %59},"  // a00..a03
+      " %60,"  // desc_b
+      " %61, %62,"  // e, spsel
+      " p,    %64,  %65;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0..%55, in/out accumulators
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %56..%59
+         "l"(desc_b),  // %60
+         "r"(e), "n"(int32_t(spsel)),  // %61, %62
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %63, %64, %65
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x120x64 TN F16+=E5M2*E5M2 (SS variant: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate (n constraint) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate (n constraint) operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand placed right after e in the instruction
+>
+struct GMMA_64x120x64_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // fma takes no separate D arguments; d00..d29 are read-write (+r constraint)
+  using ARegisters = uint64_t[1];  // same count/type as desc_a in fma
+  using ERegisters = uint32_t[1];  // same count/type as e in fma
+  using BRegisters = uint64_t[1];  // same count/type as desc_b in fma
+  using CRegisters = uint32_t[30];  // same count/type as d00..d29 in fma; NOTE(review): presumably two f16 per register - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29,
+      uint32_t const& e,  // NOTE(review): presumably the sp-meta operand of wgmma.mma_async.sp - confirm in PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // helper defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %34, 0;\n"  // %34 is scale_D: p is set when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n120k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29},"  // %0..%29: d00..d29
+      " %30,"  // desc_a
+      " %31,"  // desc_b
+      " %32, %33,"  // e, spsel
+      " p,   %35, %36;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%29, in/out accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29)
+      :  "l"(desc_a),  // %30
+         "l"(desc_b),  // %31
+         "r"(e), "n"(int32_t(spsel)),  // %32, %33
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %34, %35, %36
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x120x64 TN F16+=E5M2*E5M2 (RS variant: A is passed as four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate (n constraint) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate (n constraint) operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand placed right after e in the instruction
+>
+struct GMMA_64x120x64_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // fma takes no separate D arguments; d00..d29 are read-write (+r constraint)
+  using ARegisters = uint32_t[4];  // same count/type as a00..a03 in fma
+  using ERegisters = uint32_t[1];  // same count/type as e in fma
+  using BRegisters = uint64_t[1];  // same count/type as desc_b in fma
+  using CRegisters = uint32_t[30];  // same count/type as d00..d29 in fma; NOTE(review): presumably two f16 per register - confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29,
+      uint32_t const& e,  // NOTE(review): presumably the sp-meta operand of wgmma.mma_async.sp - confirm in PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // helper defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %37, 0;\n"  // %37 is scale_D: p is set when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n120k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29},"  // %0..%29: d00..d29
+      "{%30, %31, %32, %33},"  // a00..a03
+      " %34,"  // desc_b
+      " %35, %36,"  // e, spsel
+      " p,   %38, %39;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%29, in/out accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %30..%33
+         "l"(desc_b),  // %34
+         "r"(e), "n"(int32_t(spsel)),  // %35, %36
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %37, %38, %39
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x120x64 TN F32+=E5M2*E5M2 (SS variant: A and B are each passed as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate (n constraint) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate (n constraint) operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand placed right after e in the instruction
+>
+struct GMMA_64x120x64_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // fma takes no separate D arguments; d00..d59 are read-write (+f constraint)
+  using ARegisters = uint64_t[1];  // same count/type as desc_a in fma
+  using ERegisters = uint32_t[1];  // same count/type as e in fma
+  using BRegisters = uint64_t[1];  // same count/type as desc_b in fma
+  using CRegisters = float[60];  // same count/type as d00..d59 in fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      uint32_t const& e,  // NOTE(review): presumably the sp-meta operand of wgmma.mma_async.sp - confirm in PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // helper defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %64, 0;\n"  // %64 is scale_D: p is set when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n120k64.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"  // %0..%59: d00..d59
+      " %60,"  // desc_a
+      " %61,"  // desc_b
+      " %62, %63,"  // e, spsel
+      " p,    %65,  %66;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0..%59, in/out accumulators
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
+      :  "l"(desc_a),  // %60
+         "l"(desc_b),  // %61
+         "r"(e), "n"(int32_t(spsel)),  // %62, %63
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %64, %65, %66
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x120x64 TN F32+=E5M2*E5M2 (RS variant: A is passed as four 32-bit registers, B as one 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate (n constraint) operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // forwarded to the asm as an immediate (n constraint) operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // immediate operand placed right after e in the instruction
+>
+struct GMMA_64x120x64_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // fma takes no separate D arguments; d00..d59 are read-write (+f constraint)
+  using ARegisters = uint32_t[4];  // same count/type as a00..a03 in fma
+  using ERegisters = uint32_t[1];  // same count/type as e in fma
+  using BRegisters = uint64_t[1];  // same count/type as desc_b in fma
+  using CRegisters = float[60];  // same count/type as d00..d59 in fma
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      uint32_t const& e,  // NOTE(review): presumably the sp-meta operand of wgmma.mma_async.sp - confirm in PTX ISA
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // helper defined outside this chunk
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %67, 0;\n"  // %67 is scale_D: p is set when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n120k64.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"  // %0..%59: d00..d59
+      "{%60,  %61,  %62,  %63},"  // a00..a03
+      " %64,"  // desc_b
+      " %65, %66,"  // e, spsel
+      " p,    %68,  %69;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // outputs %0..%59, in/out accumulators
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %60..%63
+         "l"(desc_b),  // %64
+         "r"(e), "n"(int32_t(spsel)),  // %65, %66
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %67, %68, %69
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x136x64 TN F16+=E5M2*E5M2 (SS: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x136x64_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches the single desc_a argument of fma()
+  using ERegisters = uint32_t[1];  // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[34];  // matches the 34 accumulator arguments d00..d33 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %38, 0;\n"  // p = (scale_D != 0); %38 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n136k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33},"  // %0..%33: accumulators d00..d33 (read-write operands)
+      " %34,"  // desc_a
+      " %35,"  // desc_b
+      " %36, %37,"  // e, spsel
+      " p,   %39, %40;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x136x64 TN F16+=E5M2*E5M2 (RS: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x136x64_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches the four arguments a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[34];  // matches the 34 accumulator arguments d00..d33 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %41, 0;\n"  // p = (scale_D != 0); %41 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n136k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33},"  // %0..%33: accumulators d00..d33 (read-write operands)
+      "{%34, %35, %36, %37},"  // a00..a03
+      " %38,"  // desc_b
+      " %39, %40,"  // e, spsel
+      " p,   %42, %43;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x136x64 TN F32+=E5M2*E5M2 (SS: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x136x64_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches the single desc_a argument of fma()
+  using ERegisters = uint32_t[1];  // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = float[68];  // matches the 68 accumulator arguments d00..d67 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %72, 0;\n"  // p = (scale_D != 0); %72 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n136k64.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67},"  // %0..%67: accumulators d00..d67 (read-write operands)
+      " %68,"  // desc_a
+      " %69,"  // desc_b
+      " %70, %71,"  // e, spsel
+      " p,    %73,  %74;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x136x64 TN F32+=E5M2*E5M2 (RS: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x136x64_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches the four arguments a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = float[68];  // matches the 68 accumulator arguments d00..d67 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %75, 0;\n"  // p = (scale_D != 0); %75 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n136k64.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67},"  // %0..%67: accumulators d00..d67 (read-write operands)
+      "{%68,  %69,  %70,  %71},"  // a00..a03
+      " %72,"  // desc_b
+      " %73, %74,"  // e, spsel
+      " p,    %76,  %77;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x64 TN F16+=E5M2*E5M2 (SS: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x144x64_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches the single desc_a argument of fma()
+  using ERegisters = uint32_t[1];  // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[36];  // matches the 36 accumulator arguments d00..d35 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %40, 0;\n"  // p = (scale_D != 0); %40 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n144k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"  // %0..%35: accumulators d00..d35 (read-write operands)
+      " %36,"  // desc_a
+      " %37,"  // desc_b
+      " %38, %39,"  // e, spsel
+      " p,   %41, %42;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x64 TN F16+=E5M2*E5M2 (RS: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x144x64_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches the four arguments a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[36];  // matches the 36 accumulator arguments d00..d35 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %43, 0;\n"  // p = (scale_D != 0); %43 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n144k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35},"  // %0..%35: accumulators d00..d35 (read-write operands)
+      "{%36, %37, %38, %39},"  // a00..a03
+      " %40,"  // desc_b
+      " %41, %42,"  // e, spsel
+      " p,   %44, %45;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x64 TN F32+=E5M2*E5M2 (SS: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x144x64_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches the single desc_a argument of fma()
+  using ERegisters = uint32_t[1];  // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = float[72];  // matches the 72 accumulator arguments d00..d71 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %76, 0;\n"  // p = (scale_D != 0); %76 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n144k64.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"  // %0..%71: accumulators d00..d71 (read-write operands)
+      " %72,"  // desc_a
+      " %73,"  // desc_b
+      " %74, %75,"  // e, spsel
+      " p,    %77,  %78;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x144x64 TN F32+=E5M2*E5M2 (RS: A is passed in four 32-bit registers, B as a 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x144x64_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // matches the four arguments a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = float[72];  // matches the 72 accumulator arguments d00..d71 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %79, 0;\n"  // p = (scale_D != 0); %79 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n144k64.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"  // %0..%71: accumulators d00..d71 (read-write operands)
+      "{%72,  %73,  %74,  %75},"  // a00..a03
+      " %76,"  // desc_b
+      " %77, %78,"  // e, spsel
+      " p,    %80,  %81;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x152x64 TN F16+=E5M2*E5M2 (SS: A and B are both passed as 64-bit descriptors)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x152x64_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // matches the single desc_a argument of fma()
+  using ERegisters = uint32_t[1];  // matches the single e argument of fma()
+  using BRegisters = uint64_t[1];  // matches the single desc_b argument of fma()
+  using CRegisters = uint32_t[38];  // matches the 38 accumulator arguments d00..d37 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %42, 0;\n"  // p = (scale_D != 0); %42 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n152k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37},"  // %0..%37: accumulators d00..d37 (read-write operands)
+      " %38,"  // desc_a
+      " %39,"  // desc_b
+      " %40, %41,"  // e, spsel
+      " p,   %43, %44;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x152x64 TN F16+=E5M2*E5M2 (RS: A in four 32-bit registers a00..a03, B via 64-bit desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the instruction as an immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the instruction as an immediate ("n")
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the instruction as an immediate ("n")
+>
+struct GMMA_64x152x64_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D type; d00..d37 are read-write ("+r") in fma
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e; presumably the sparsity metadata operand -- confirm against the PTX ISA
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[38];  // d00..d37
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %45, 0;\n"  // %45 = scale_D; p is set when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n152k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%37 = d00..d37
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37},"
+      "{%38, %39, %40, %41},"  // a00..a03
+      " %42,"  // desc_b
+      " %43, %44,"  // e, spsel
+      " p,   %46, %47;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D is the only register-passed scale
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x152x64 TN F32+=E5M2*E5M2 (SS: A via 64-bit desc_a, B via 64-bit desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the instruction as an immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the instruction as an immediate ("n")
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the instruction as an immediate ("n")
+>
+struct GMMA_64x152x64_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D type; d00..d75 are read-write ("+f") in fma
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e; presumably the sparsity metadata operand -- confirm against the PTX ISA
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[76];  // d00..d75
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %80, 0;\n"  // %80 = scale_D; p is set when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n152k64.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%75 = d00..d75
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75},"
+      " %76,"  // desc_a
+      " %77,"  // desc_b
+      " %78, %79,"  // e, spsel
+      " p,    %81,  %82;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D is the only register-passed scale
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x152x64 TN F32+=E5M2*E5M2 (RS: A in four 32-bit registers a00..a03, B via 64-bit desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the instruction as an immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the instruction as an immediate ("n")
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the instruction as an immediate ("n")
+>
+struct GMMA_64x152x64_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D type; d00..d75 are read-write ("+f") in fma
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e; presumably the sparsity metadata operand -- confirm against the PTX ISA
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[76];  // d00..d75
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %83, 0;\n"  // %83 = scale_D; p is set when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n152k64.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%75 = d00..d75
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75},"
+      "{%76,  %77,  %78,  %79},"  // a00..a03
+      " %80,"  // desc_b
+      " %81, %82,"  // e, spsel
+      " p,    %84,  %85;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D is the only register-passed scale
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x64 TN F16+=E5M2*E5M2 (SS: A via 64-bit desc_a, B via 64-bit desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the instruction as an immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the instruction as an immediate ("n")
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the instruction as an immediate ("n")
+>
+struct GMMA_64x160x64_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D type; d00..d39 are read-write ("+r") in fma
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e; presumably the sparsity metadata operand -- confirm against the PTX ISA
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[40];  // d00..d39
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %44, 0;\n"  // %44 = scale_D; p is set when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n160k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%39 = d00..d39
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      " %40,"  // desc_a
+      " %41,"  // desc_b
+      " %42, %43,"  // e, spsel
+      " p,   %45, %46;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D is the only register-passed scale
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x64 TN F16+=E5M2*E5M2 (RS: A in four 32-bit registers a00..a03, B via 64-bit desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the instruction as an immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the instruction as an immediate ("n")
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the instruction as an immediate ("n")
+>
+struct GMMA_64x160x64_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D type; d00..d39 are read-write ("+r") in fma
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e; presumably the sparsity metadata operand -- confirm against the PTX ISA
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[40];  // d00..d39
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %47, 0;\n"  // %47 = scale_D; p is set when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n160k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%39 = d00..d39
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39},"
+      "{%40, %41, %42, %43},"  // a00..a03
+      " %44,"  // desc_b
+      " %45, %46,"  // e, spsel
+      " p,   %48, %49;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D is the only register-passed scale
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x64 TN F32+=E5M2*E5M2 (SS: A via 64-bit desc_a, B via 64-bit desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the instruction as an immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the instruction as an immediate ("n")
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the instruction as an immediate ("n")
+>
+struct GMMA_64x160x64_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D type; d00..d79 are read-write ("+f") in fma
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e; presumably the sparsity metadata operand -- confirm against the PTX ISA
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[80];  // d00..d79
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %84, 0;\n"  // %84 = scale_D; p is set when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n160k64.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%79 = d00..d79
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      " %80,"  // desc_a
+      " %81,"  // desc_b
+      " %82, %83,"  // e, spsel
+      " p,    %85,  %86;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D is the only register-passed scale
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x160x64 TN F32+=E5M2*E5M2 (RS: A in four 32-bit registers a00..a03, B via 64-bit desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the instruction as an immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the instruction as an immediate ("n")
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the instruction as an immediate ("n")
+>
+struct GMMA_64x160x64_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D type; d00..d79 are read-write ("+f") in fma
+  using ARegisters = uint32_t[4];  // a00..a03
+  using ERegisters = uint32_t[1];  // e; presumably the sparsity metadata operand -- confirm against the PTX ISA
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = float[80];  // d00..d79
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and desc_b
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %87, 0;\n"  // %87 = scale_D; p is set when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n160k64.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%79 = d00..d79
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
+      "{%80,  %81,  %82,  %83},"  // a00..a03
+      " %84,"  // desc_b
+      " %85, %86,"  // e, spsel
+      " p,    %88,  %89;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D is the only register-passed scale
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x168x64 TN F16+=E5M2*E5M2 (SS: A via 64-bit desc_a, B via 64-bit desc_b)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the instruction as an immediate ("n")
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the instruction as an immediate ("n")
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the instruction as an immediate ("n")
+>
+struct GMMA_64x168x64_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D type; d00..d41 are read-write ("+r") in fma
+  using ARegisters = uint64_t[1];  // desc_a
+  using ERegisters = uint32_t[1];  // e; presumably the sparsity metadata operand -- confirm against the PTX ISA
+  using BRegisters = uint64_t[1];  // desc_b
+  using CRegisters = uint32_t[42];  // d00..d41
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %46, 0;\n"  // %46 = scale_D; p is set when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n168k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%41 = d00..d41
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41},"
+      " %42,"  // desc_a
+      " %43,"  // desc_b
+      " %44, %45,"  // e, spsel
+      " p,   %47, %48;\n"  // scale_D predicate, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // scale_D is the only register-passed scale
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x168x64 TN F16+=E5M2*E5M2 -- RS form: A is passed as four 32-bit register values, B as one 64-bit descriptor value
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,    // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,    // forwarded to the instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded as an immediate ("n") operand, placed right after e
+>
+struct GMMA_64x168x64_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;          // no separate destination list: fma() updates d00..d41 in place
+  using ARegisters = uint32_t[4];   // matches a00..a03
+  using ERegisters = uint32_t[1];   // matches e -- presumably the sparsity metadata word; TODO confirm
+  using BRegisters = uint64_t[1];   // matches desc_b
+  using CRegisters = uint32_t[42];  // matches d00..d41 -- NOTE(review): presumably two f16 values packed per word; confirm
+
+  CUTE_HOST_DEVICE static void  // wraps a single wgmma.mma_async.sp m64n168k64 f16.e5m2.e5m2 instruction
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,  // 64-bit B operand ("l" constraint) -- presumably a shared-memory matrix descriptor; TODO confirm
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41,
+      uint32_t const& e,  // 32-bit register input placed between desc_b and spsel in the instruction
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; a nonzero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): logging hook defined elsewhere; its behavior is not visible here
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %49, 0;\n"  // p = (scale_D != 0); %49 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n168k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41},"  // %0..%41 = d00..d41
+      "{%42, %43, %44, %45},"  // a00..a03
+      " %46,"  // desc_b
+      " %47, %48,"  // e, spsel
+      " p,   %50, %51;\n"  // p, scaleA, scaleB -- NOTE(review): p presumably selects D = A*B + D vs. D = A*B; confirm in the PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%41: read-write ("+r") accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %42..%45
+         "l"(desc_b),  // %46
+         "r"(e), "n"(int32_t(spsel)),  // %47, %48
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %49 (runtime register), %50 and %51 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x168x64 TN F32+=E5M2*E5M2 -- SS form: A and B are each passed as one 64-bit descriptor value
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,    // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,    // forwarded to the instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded as an immediate ("n") operand, placed right after e
+>
+struct GMMA_64x168x64_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;         // no separate destination list: fma() updates d00..d83 in place
+  using ARegisters = uint64_t[1];  // matches desc_a
+  using ERegisters = uint32_t[1];  // matches e -- presumably the sparsity metadata word; TODO confirm
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = float[84];    // matches d00..d83
+
+  CUTE_HOST_DEVICE static void  // wraps a single wgmma.mma_async.sp m64n168k64 f32.e5m2.e5m2 instruction
+  fma(uint64_t const& desc_a,  // 64-bit A operand ("l" constraint) -- presumably a shared-memory matrix descriptor; TODO confirm
+      uint64_t const& desc_b,  // 64-bit B operand ("l" constraint) -- presumably a shared-memory matrix descriptor; TODO confirm
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      uint32_t const& e,  // 32-bit register input placed between desc_b and spsel in the instruction
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; a nonzero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): logging hook defined elsewhere; its behavior is not visible here
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %88, 0;\n"  // p = (scale_D != 0); %88 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n168k64.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83},"  // %0..%83 = d00..d83
+      " %84,"  // desc_a
+      " %85,"  // desc_b
+      " %86, %87,"  // e, spsel
+      " p,    %89,  %90;\n"  // p, scaleA, scaleB -- NOTE(review): p presumably selects D = A*B + D vs. D = A*B; confirm in the PTX ISA
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands %0..%83: read-write ("+f") accumulators
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
+      :  "l"(desc_a),  // %84
+         "l"(desc_b),  // %85
+         "r"(e), "n"(int32_t(spsel)),  // %86, %87
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %88 (runtime register), %89 and %90 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x168x64 TN F32+=E5M2*E5M2 -- RS form: A is passed as four 32-bit register values, B as one 64-bit descriptor value
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,    // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,    // forwarded to the instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded as an immediate ("n") operand, placed right after e
+>
+struct GMMA_64x168x64_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;         // no separate destination list: fma() updates d00..d83 in place
+  using ARegisters = uint32_t[4];  // matches a00..a03
+  using ERegisters = uint32_t[1];  // matches e -- presumably the sparsity metadata word; TODO confirm
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = float[84];    // matches d00..d83
+
+  CUTE_HOST_DEVICE static void  // wraps a single wgmma.mma_async.sp m64n168k64 f32.e5m2.e5m2 instruction
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,  // 64-bit B operand ("l" constraint) -- presumably a shared-memory matrix descriptor; TODO confirm
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      uint32_t const& e,  // 32-bit register input placed between desc_b and spsel in the instruction
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; a nonzero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): logging hook defined elsewhere; its behavior is not visible here
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %91, 0;\n"  // p = (scale_D != 0); %91 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n168k64.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83},"  // %0..%83 = d00..d83
+      "{%84,  %85,  %86,  %87},"  // a00..a03
+      " %88,"  // desc_b
+      " %89, %90,"  // e, spsel
+      " p,    %92,  %93;\n"  // p, scaleA, scaleB -- NOTE(review): p presumably selects D = A*B + D vs. D = A*B; confirm in the PTX ISA
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands %0..%83: read-write ("+f") accumulators
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %84..%87
+         "l"(desc_b),  // %88
+         "r"(e), "n"(int32_t(spsel)),  // %89, %90
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %91 (runtime register), %92 and %93 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x64 TN F16+=E5M2*E5M2 -- SS form: A and B are each passed as one 64-bit descriptor value
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,    // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,    // forwarded to the instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded as an immediate ("n") operand, placed right after e
+>
+struct GMMA_64x176x64_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;          // no separate destination list: fma() updates d00..d43 in place
+  using ARegisters = uint64_t[1];   // matches desc_a
+  using ERegisters = uint32_t[1];   // matches e -- presumably the sparsity metadata word; TODO confirm
+  using BRegisters = uint64_t[1];   // matches desc_b
+  using CRegisters = uint32_t[44];  // matches d00..d43 -- NOTE(review): presumably two f16 values packed per word; confirm
+
+  CUTE_HOST_DEVICE static void  // wraps a single wgmma.mma_async.sp m64n176k64 f16.e5m2.e5m2 instruction
+  fma(uint64_t const& desc_a,  // 64-bit A operand ("l" constraint) -- presumably a shared-memory matrix descriptor; TODO confirm
+      uint64_t const& desc_b,  // 64-bit B operand ("l" constraint) -- presumably a shared-memory matrix descriptor; TODO confirm
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t const& e,  // 32-bit register input placed between desc_b and spsel in the instruction
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; a nonzero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): logging hook defined elsewhere; its behavior is not visible here
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %48, 0;\n"  // p = (scale_D != 0); %48 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n176k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"  // %0..%43 = d00..d43
+      " %44,"  // desc_a
+      " %45,"  // desc_b
+      " %46, %47,"  // e, spsel
+      " p,   %49, %50;\n"  // p, scaleA, scaleB -- NOTE(review): p presumably selects D = A*B + D vs. D = A*B; confirm in the PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%43: read-write ("+r") accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
+      :  "l"(desc_a),  // %44
+         "l"(desc_b),  // %45
+         "r"(e), "n"(int32_t(spsel)),  // %46, %47
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %48 (runtime register), %49 and %50 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x64 TN F16+=E5M2*E5M2 -- RS form: A is passed as four 32-bit register values, B as one 64-bit descriptor value
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,    // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,    // forwarded to the instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded as an immediate ("n") operand, placed right after e
+>
+struct GMMA_64x176x64_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;          // no separate destination list: fma() updates d00..d43 in place
+  using ARegisters = uint32_t[4];   // matches a00..a03
+  using ERegisters = uint32_t[1];   // matches e -- presumably the sparsity metadata word; TODO confirm
+  using BRegisters = uint64_t[1];   // matches desc_b
+  using CRegisters = uint32_t[44];  // matches d00..d43 -- NOTE(review): presumably two f16 values packed per word; confirm
+
+  CUTE_HOST_DEVICE static void  // wraps a single wgmma.mma_async.sp m64n176k64 f16.e5m2.e5m2 instruction
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,  // 64-bit B operand ("l" constraint) -- presumably a shared-memory matrix descriptor; TODO confirm
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t const& e,  // 32-bit register input placed between desc_b and spsel in the instruction
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; a nonzero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): logging hook defined elsewhere; its behavior is not visible here
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %51, 0;\n"  // p = (scale_D != 0); %51 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n176k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43},"  // %0..%43 = d00..d43
+      "{%44, %45, %46, %47},"  // a00..a03
+      " %48,"  // desc_b
+      " %49, %50,"  // e, spsel
+      " p,   %52, %53;\n"  // p, scaleA, scaleB -- NOTE(review): p presumably selects D = A*B + D vs. D = A*B; confirm in the PTX ISA
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%43: read-write ("+r") accumulators
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %44..%47
+         "l"(desc_b),  // %48
+         "r"(e), "n"(int32_t(spsel)),  // %49, %50
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %51 (runtime register), %52 and %53 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x64 TN F32+=E5M2*E5M2 -- SS form: A and B are each passed as one 64-bit descriptor value
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,    // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,    // forwarded to the instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded as an immediate ("n") operand, placed right after e
+>
+struct GMMA_64x176x64_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;         // no separate destination list: fma() updates d00..d87 in place
+  using ARegisters = uint64_t[1];  // matches desc_a
+  using ERegisters = uint32_t[1];  // matches e -- presumably the sparsity metadata word; TODO confirm
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = float[88];    // matches d00..d87
+
+  CUTE_HOST_DEVICE static void  // wraps a single wgmma.mma_async.sp m64n176k64 f32.e5m2.e5m2 instruction
+  fma(uint64_t const& desc_a,  // 64-bit A operand ("l" constraint) -- presumably a shared-memory matrix descriptor; TODO confirm
+      uint64_t const& desc_b,  // 64-bit B operand ("l" constraint) -- presumably a shared-memory matrix descriptor; TODO confirm
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      uint32_t const& e,  // 32-bit register input placed between desc_b and spsel in the instruction
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; a nonzero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): logging hook defined elsewhere; its behavior is not visible here
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %92, 0;\n"  // p = (scale_D != 0); %92 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n176k64.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"  // %0..%87 = d00..d87
+      " %88,"  // desc_a
+      " %89,"  // desc_b
+      " %90, %91,"  // e, spsel
+      " p,    %93,  %94;\n"  // p, scaleA, scaleB -- NOTE(review): p presumably selects D = A*B + D vs. D = A*B; confirm in the PTX ISA
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands %0..%87: read-write ("+f") accumulators
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
+      :  "l"(desc_a),  // %88
+         "l"(desc_b),  // %89
+         "r"(e), "n"(int32_t(spsel)),  // %90, %91
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %92 (runtime register), %93 and %94 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x176x64 TN F32+=E5M2*E5M2 -- RS form: A is passed as four 32-bit register values, B as one 64-bit descriptor value
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,    // forwarded to the instruction as an immediate ("n") operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,    // forwarded to the instruction as an immediate ("n") operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // forwarded as an immediate ("n") operand, placed right after e
+>
+struct GMMA_64x176x64_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;         // no separate destination list: fma() updates d00..d87 in place
+  using ARegisters = uint32_t[4];  // matches a00..a03
+  using ERegisters = uint32_t[1];  // matches e -- presumably the sparsity metadata word; TODO confirm
+  using BRegisters = uint64_t[1];  // matches desc_b
+  using CRegisters = float[88];    // matches d00..d87
+
+  CUTE_HOST_DEVICE static void  // wraps a single wgmma.mma_async.sp m64n176k64 f32.e5m2.e5m2 instruction
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,  // 64-bit B operand ("l" constraint) -- presumably a shared-memory matrix descriptor; TODO confirm
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      uint32_t const& e,  // 32-bit register input placed between desc_b and spsel in the instruction
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; a nonzero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): logging hook defined elsewhere; its behavior is not visible here
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %95, 0;\n"  // p = (scale_D != 0); %95 is the scale_D input operand
+      "wgmma.mma_async.sp.sync.aligned.m64n176k64.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"  // %0..%87 = d00..d87
+      "{%88,  %89,  %90,  %91},"  // a00..a03
+      " %92,"  // desc_b
+      " %93, %94,"  // e, spsel
+      " p,    %96,  %97;\n"  // p, scaleA, scaleB -- NOTE(review): p presumably selects D = A*B + D vs. D = A*B; confirm in the PTX ISA
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // operands %0..%87: read-write ("+f") accumulators
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %88..%91
+         "l"(desc_b),  // %92
+         "r"(e), "n"(int32_t(spsel)),  // %93, %94
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %95 (runtime register), %96 and %97 (immediates)
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x184x64 TN F16+=E5M2*E5M2 (SS form: A and B are each passed as a single 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate ("n"), asm operand %51
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate ("n"), asm operand %52
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate ("n"), asm operand %49
+>
+struct GMMA_64x184x64_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D list: fma's d00..d45 are read-write ("+r") operands
+  using ARegisters = uint64_t[1];  // same type/count as fma's desc_a
+  using ERegisters = uint32_t[1];  // same type/count as fma's e
+  using BRegisters = uint64_t[1];  // same type/count as fma's desc_b
+  using CRegisters = uint32_t[46];  // same type/count as fma's d00..d45
+
+  CUTE_HOST_DEVICE static void  // wraps one wgmma.mma_async.sp m64n184k64 instruction; d00..d45 are updated in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45,
+      uint32_t const& e,  // passed right after desc_b; presumably the sparsity-metadata register (sp-meta in PTX) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; a nonzero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the call-site line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %50, 0;\n"  // %50 is scale_D: p is true iff scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n184k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%45: accumulators d00..d45
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45},"
+      " %46,"  // desc_a
+      " %47,"  // desc_b
+      " %48, %49,"  // e, spsel
+      " p,   %51, %52;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted, only the invalid-control-path macro below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x184x64 TN F16+=E5M2*E5M2 (RS form: A is passed in four 32-bit registers, B as a single 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate ("n"), asm operand %54
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate ("n"), asm operand %55
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate ("n"), asm operand %52
+>
+struct GMMA_64x184x64_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D list: fma's d00..d45 are read-write ("+r") operands
+  using ARegisters = uint32_t[4];  // same type/count as fma's a00..a03
+  using ERegisters = uint32_t[1];  // same type/count as fma's e
+  using BRegisters = uint64_t[1];  // same type/count as fma's desc_b
+  using CRegisters = uint32_t[46];  // same type/count as fma's d00..d45
+
+  CUTE_HOST_DEVICE static void  // wraps one wgmma.mma_async.sp m64n184k64 instruction; d00..d45 are updated in place
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45,
+      uint32_t const& e,  // passed right after desc_b; presumably the sparsity-metadata register (sp-meta in PTX) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; a nonzero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the call-site line and the B descriptor to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %53, 0;\n"  // %53 is scale_D: p is true iff scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n184k64.f16.e5m2.e5m2 "
+      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0..%45: accumulators d00..d45
+      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
+      " %16, %17, %18, %19, %20, %21, %22, %23, "
+      " %24, %25, %26, %27, %28, %29, %30, %31, "
+      " %32, %33, %34, %35, %36, %37, %38, %39, "
+      " %40, %41, %42, %43, %44, %45},"
+      "{%46, %47, %48, %49},"  // a00..a03
+      " %50,"  // desc_b
+      " %51, %52,"  // e, spsel
+      " p,   %54, %55;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted, only the invalid-control-path macro below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x184x64 TN F32+=E5M2*E5M2 (SS form: A and B are each passed as a single 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate ("n"), asm operand %97
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate ("n"), asm operand %98
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate ("n"), asm operand %95
+>
+struct GMMA_64x184x64_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D list: fma's d00..d91 are read-write ("+f") operands
+  using ARegisters = uint64_t[1];  // same type/count as fma's desc_a
+  using ERegisters = uint32_t[1];  // same type/count as fma's e
+  using BRegisters = uint64_t[1];  // same type/count as fma's desc_b
+  using CRegisters = float[92];  // same type/count as fma's d00..d91
+
+  CUTE_HOST_DEVICE static void  // wraps one wgmma.mma_async.sp m64n184k64 instruction; d00..d91 are updated in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      uint32_t const& e,  // passed right after desc_b; presumably the sparsity-metadata register (sp-meta in PTX) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; a nonzero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the call-site line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %96, 0;\n"  // %96 is scale_D: p is true iff scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n184k64.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%91: accumulators d00..d91
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91},"
+      " %92,"  // desc_a
+      " %93,"  // desc_b
+      " %94, %95,"  // e, spsel
+      " p,    %97,  %98;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted, only the invalid-control-path macro below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x184x64 TN F32+=E5M2*E5M2 (RS form: A is passed in four 32-bit registers, B as a single 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate ("n"), asm operand %100
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate ("n"), asm operand %101
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate ("n"), asm operand %98
+>
+struct GMMA_64x184x64_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D list: fma's d00..d91 are read-write ("+f") operands
+  using ARegisters = uint32_t[4];  // same type/count as fma's a00..a03
+  using ERegisters = uint32_t[1];  // same type/count as fma's e
+  using BRegisters = uint64_t[1];  // same type/count as fma's desc_b
+  using CRegisters = float[92];  // same type/count as fma's d00..d91
+
+  CUTE_HOST_DEVICE static void  // wraps one wgmma.mma_async.sp m64n184k64 instruction; d00..d91 are updated in place
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      float         & d00, float         & d01, float         & d02, float         & d03,
+      float         & d04, float         & d05, float         & d06, float         & d07,
+      float         & d08, float         & d09, float         & d10, float         & d11,
+      float         & d12, float         & d13, float         & d14, float         & d15,
+      float         & d16, float         & d17, float         & d18, float         & d19,
+      float         & d20, float         & d21, float         & d22, float         & d23,
+      float         & d24, float         & d25, float         & d26, float         & d27,
+      float         & d28, float         & d29, float         & d30, float         & d31,
+      float         & d32, float         & d33, float         & d34, float         & d35,
+      float         & d36, float         & d37, float         & d38, float         & d39,
+      float         & d40, float         & d41, float         & d42, float         & d43,
+      float         & d44, float         & d45, float         & d46, float         & d47,
+      float         & d48, float         & d49, float         & d50, float         & d51,
+      float         & d52, float         & d53, float         & d54, float         & d55,
+      float         & d56, float         & d57, float         & d58, float         & d59,
+      float         & d60, float         & d61, float         & d62, float         & d63,
+      float         & d64, float         & d65, float         & d66, float         & d67,
+      float         & d68, float         & d69, float         & d70, float         & d71,
+      float         & d72, float         & d73, float         & d74, float         & d75,
+      float         & d76, float         & d77, float         & d78, float         & d79,
+      float         & d80, float         & d81, float         & d82, float         & d83,
+      float         & d84, float         & d85, float         & d86, float         & d87,
+      float         & d88, float         & d89, float         & d90, float         & d91,
+      uint32_t const& e,  // passed right after desc_b; presumably the sparsity-metadata register (sp-meta in PTX) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; a nonzero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the call-site line and the B descriptor to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %99, 0;\n"  // %99 is scale_D: p is true iff scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n184k64.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%91: accumulators d00..d91
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91},"
+      "{%92,  %93,  %94,  %95},"  // a00..a03
+      " %96,"  // desc_b
+      " %97, %98,"  // e, spsel
+      " p,    %100, %101;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
+        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
+        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
+        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
+        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
+        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
+        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
+        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
+        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
+        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
+        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
+        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
+        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
+        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
+        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
+        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
+        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
+        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
+        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
+        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
+        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
+        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
+        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted, only the invalid-control-path macro below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x200x64 TN F16+=E5M2*E5M2 (SS form: A and B are each passed as a single 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate ("n"), asm operand %55
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate ("n"), asm operand %56
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate ("n"), asm operand %53
+>
+struct GMMA_64x200x64_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D list: fma's d00..d49 are read-write ("+r") operands
+  using ARegisters = uint64_t[1];  // same type/count as fma's desc_a
+  using ERegisters = uint32_t[1];  // same type/count as fma's e
+  using BRegisters = uint64_t[1];  // same type/count as fma's desc_b
+  using CRegisters = uint32_t[50];  // same type/count as fma's d00..d49
+
+  CUTE_HOST_DEVICE static void  // wraps one wgmma.mma_async.sp m64n200k64 instruction; d00..d49 are updated in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49,
+      uint32_t const& e,  // passed right after desc_b; presumably the sparsity-metadata register (sp-meta in PTX) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; a nonzero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the call-site line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %54, 0;\n"  // %54 is scale_D: p is true iff scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n200k64.f16.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%49: accumulators d00..d49
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49},"
+      " %50,"  // desc_a
+      " %51,"  // desc_b
+      " %52, %53,"  // e, spsel
+      " p,    %55,  %56;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted, only the invalid-control-path macro below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x200x64 TN F16+=E5M2*E5M2 (RS form: A is passed in four 32-bit registers, B as a single 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate ("n"), asm operand %58
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate ("n"), asm operand %59
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate ("n"), asm operand %56
+>
+struct GMMA_64x200x64_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D list: fma's d00..d49 are read-write ("+r") operands
+  using ARegisters = uint32_t[4];  // same type/count as fma's a00..a03
+  using ERegisters = uint32_t[1];  // same type/count as fma's e
+  using BRegisters = uint64_t[1];  // same type/count as fma's desc_b
+  using CRegisters = uint32_t[50];  // same type/count as fma's d00..d49
+
+  CUTE_HOST_DEVICE static void  // wraps one wgmma.mma_async.sp m64n200k64 instruction; d00..d49 are updated in place
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49,
+      uint32_t const& e,  // passed right after desc_b; presumably the sparsity-metadata register (sp-meta in PTX) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; a nonzero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the call-site line and the B descriptor to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %57, 0;\n"  // %57 is scale_D: p is true iff scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n200k64.f16.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%49: accumulators d00..d49
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49},"
+      "{%50,  %51,  %52,  %53},"  // a00..a03
+      " %54,"  // desc_b
+      " %55, %56,"  // e, spsel
+      " p,    %58,  %59;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted, only the invalid-control-path macro below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x200x64 TN F32+=E5M2*E5M2 (SS form: A and B are each passed as a single 64-bit descriptor)
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate ("n"), asm operand %105
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate ("n"), asm operand %106
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate ("n"), asm operand %103
+>
+struct GMMA_64x200x64_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D list: fma's d000..d099 are read-write ("+f") operands
+  using ARegisters = uint64_t[1];  // same type/count as fma's desc_a
+  using ERegisters = uint32_t[1];  // same type/count as fma's e
+  using BRegisters = uint64_t[1];  // same type/count as fma's desc_b
+  using CRegisters = float[100];  // same type/count as fma's d000..d099
+
+  CUTE_HOST_DEVICE static void  // wraps one wgmma.mma_async.sp m64n200k64 instruction; d000..d099 are updated in place
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      uint32_t const& e,  // passed right after desc_b; presumably the sparsity-metadata register (sp-meta in PTX) -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; a nonzero value sets predicate p below
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the call-site line and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %104, 0;\n"  // %104 is scale_D: p is true iff scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n200k64.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0..%99: accumulators d000..d099
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99},"
+      " %100,"  // desc_a
+      " %101,"  // desc_b
+      " %102, %103,"  // e, spsel
+      " p,    %105, %106;\n"  // predicate derived from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
+      :  "l"(desc_a),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: no instruction is emitted, only the invalid-control-path macro below
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x200x64 TN F32+=E5M2*E5M2
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
+>
+struct GMMA_64x200x64_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];
+  using ERegisters = uint32_t[1];
+  using BRegisters = uint64_t[1];
+  using CRegisters = float[100];
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %107, 0;\n"
+      "wgmma.mma_async.sp.sync.aligned.m64n200k64.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99},"
+      "{%100, %101, %102, %103},"
+      " %104,"
+      " %105, %106,"
+      " p,    %108, %109;\n"
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x64 TN F16+=E5M2*E5M2
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value, passed to the asm as immediate ("n") operand %57
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time value, passed to the asm as immediate ("n") operand %58
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time value, passed to the asm as immediate ("n") operand %55
+>
+struct GMMA_64x208x64_F16E5M2E5M2_SS_TN  // wraps one wgmma.mma_async.sp m64n208k64 f16.e5m2.e5m2 instruction; A and B are both 64-bit descriptor operands
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value: the desc_a argument of fma()
+  using ERegisters = uint32_t[1];  // one 32-bit value: the e argument of fma()
+  using BRegisters = uint64_t[1];  // one 64-bit value: the desc_b argument of fma()
+  using CRegisters = uint32_t[52];  // 52 values: d00..d51 of fma(); NOTE(review): presumably two packed f16 per register -- confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // bound to "l" input operand %52
+      uint64_t const& desc_b,  // bound to "l" input operand %53
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d51 are read-write ("+r") operands %0..%51
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t const& e,  // bound to "r" input operand %54; NOTE(review): presumably the sparsity metadata word -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value (operand %56); compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // helper defined outside this chunk; NOTE(review): presumably sync-log instrumentation -- confirm
+    asm volatile(  // operand numbering: outputs first (%0..%51), then inputs in the order they are listed below
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the { } scope of this asm block
+      "setp.ne.b32 p, %56, 0;\n"  // %56 = scale_D; p is set when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n208k64.f16.e5m2.e5m2 "  // the single MMA instruction this struct wraps
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // accumulator list starts: %0..%51 = d00..d51
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"  // accumulator list ends
+      " %52,"  // %52 = desc_a
+      " %53,"  // %53 = desc_b
+      " %54, %55,"  // %54 = e, %55 = spsel
+      " p,    %57,  %58;\n"  // p from setp above, %57 = scaleA, %58 = scaleB; NOTE(review): per PTX wgmma docs p gates accumulation into D -- confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%51, each both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
+      :  "l"(desc_a),  // %52
+         "l"(desc_b),  // %53
+         "r"(e), "n"(int32_t(spsel)),  // %54, %55
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %56, %57, %58
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // branch compiled when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x64 TN F16+=E5M2*E5M2
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value, passed to the asm as immediate ("n") operand %60
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time value, passed to the asm as immediate ("n") operand %61
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time value, passed to the asm as immediate ("n") operand %58
+>
+struct GMMA_64x208x64_F16E5M2E5M2_RS_TN  // wraps one wgmma.mma_async.sp m64n208k64 f16.e5m2.e5m2 instruction; A is four 32-bit register operands, B is a 64-bit descriptor
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // four 32-bit values: a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // one 32-bit value: the e argument of fma()
+  using BRegisters = uint64_t[1];  // one 64-bit value: the desc_b argument of fma()
+  using CRegisters = uint32_t[52];  // 52 values: d00..d51 of fma(); NOTE(review): presumably two packed f16 per register -- confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // bound to "r" input operands %52..%55
+      uint64_t const& desc_b,  // bound to "l" input operand %56
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d51 are read-write ("+r") operands %0..%51
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t const& e,  // bound to "r" input operand %57; NOTE(review): presumably the sparsity metadata word -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value (operand %59); compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // helper defined outside this chunk; NOTE(review): presumably sync-log instrumentation -- confirm
+    asm volatile(  // operand numbering: outputs first (%0..%51), then inputs in the order they are listed below
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the { } scope of this asm block
+      "setp.ne.b32 p, %59, 0;\n"  // %59 = scale_D; p is set when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n208k64.f16.e5m2.e5m2 "  // the single MMA instruction this struct wraps
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // accumulator list starts: %0..%51 = d00..d51
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51},"  // accumulator list ends
+      "{%52,  %53,  %54,  %55},"  // %52..%55 = a00..a03
+      " %56,"  // %56 = desc_b
+      " %57, %58,"  // %57 = e, %58 = spsel
+      " p,    %60,  %61;\n"  // p from setp above, %60 = scaleA, %61 = scaleB; NOTE(review): per PTX wgmma docs p gates accumulation into D -- confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%51, each both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %52..%55
+         "l"(desc_b),  // %56
+         "r"(e), "n"(int32_t(spsel)),  // %57, %58
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %59, %60, %61
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // branch compiled when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x64 TN F32+=E5M2*E5M2
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value, passed to the asm as immediate ("n") operand %109
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time value, passed to the asm as immediate ("n") operand %110
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time value, passed to the asm as immediate ("n") operand %107
+>
+struct GMMA_64x208x64_F32E5M2E5M2_SS_TN  // wraps one wgmma.mma_async.sp m64n208k64 f32.e5m2.e5m2 instruction; A and B are both 64-bit descriptor operands
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value: the desc_a argument of fma()
+  using ERegisters = uint32_t[1];  // one 32-bit value: the e argument of fma()
+  using BRegisters = uint64_t[1];  // one 64-bit value: the desc_b argument of fma()
+  using CRegisters = float[104];  // 104 values: d000..d103 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // bound to "l" input operand %104
+      uint64_t const& desc_b,  // bound to "l" input operand %105
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d103 are read-write ("+f") operands %0..%103
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      uint32_t const& e,  // bound to "r" input operand %106; NOTE(review): presumably the sparsity metadata word -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value (operand %108); compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // helper defined outside this chunk; NOTE(review): presumably sync-log instrumentation -- confirm
+    asm volatile(  // operand numbering: outputs first (%0..%103), then inputs in the order they are listed below
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the { } scope of this asm block
+      "setp.ne.b32 p, %108, 0;\n"  // %108 = scale_D; p is set when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n208k64.f32.e5m2.e5m2 "  // the single MMA instruction this struct wraps
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // accumulator list starts: %0..%103 = d000..d103
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"  // accumulator list ends
+      " %104,"  // %104 = desc_a
+      " %105,"  // %105 = desc_b
+      " %106, %107,"  // %106 = e, %107 = spsel
+      " p,    %109, %110;\n"  // p from setp above, %109 = scaleA, %110 = scaleB; NOTE(review): per PTX wgmma docs p gates accumulation into D -- confirm
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // outputs %0..%103, each both read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
+      :  "l"(desc_a),  // %104
+         "l"(desc_b),  // %105
+         "r"(e), "n"(int32_t(spsel)),  // %106, %107
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %108, %109, %110
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // branch compiled when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x208x64 TN F32+=E5M2*E5M2
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value, passed to the asm as immediate ("n") operand %112
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time value, passed to the asm as immediate ("n") operand %113
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time value, passed to the asm as immediate ("n") operand %110
+>
+struct GMMA_64x208x64_F32E5M2E5M2_RS_TN  // wraps one wgmma.mma_async.sp m64n208k64 f32.e5m2.e5m2 instruction; A is four 32-bit register operands, B is a 64-bit descriptor
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // four 32-bit values: a000..a003 of fma()
+  using ERegisters = uint32_t[1];  // one 32-bit value: the e argument of fma()
+  using BRegisters = uint64_t[1];  // one 64-bit value: the desc_b argument of fma()
+  using CRegisters = float[104];  // 104 values: d000..d103 of fma()
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // bound to "r" input operands %104..%107
+      uint64_t const& desc_b,  // bound to "l" input operand %108
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d103 are read-write ("+f") operands %0..%103
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      uint32_t const& e,  // bound to "r" input operand %109; NOTE(review): presumably the sparsity metadata word -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value (operand %111); compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // helper defined outside this chunk; NOTE(review): presumably sync-log instrumentation -- confirm
+    asm volatile(  // operand numbering: outputs first (%0..%103), then inputs in the order they are listed below
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the { } scope of this asm block
+      "setp.ne.b32 p, %111, 0;\n"  // %111 = scale_D; p is set when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n208k64.f32.e5m2.e5m2 "  // the single MMA instruction this struct wraps
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // accumulator list starts: %0..%103 = d000..d103
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"  // accumulator list ends
+      "{%104, %105, %106, %107},"  // %104..%107 = a000..a003
+      " %108,"  // %108 = desc_b
+      " %109, %110,"  // %109 = e, %110 = spsel
+      " p,    %112, %113;\n"  // p from setp above, %112 = scaleA, %113 = scaleB; NOTE(review): per PTX wgmma docs p gates accumulation into D -- confirm
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // outputs %0..%103, each both read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %104..%107
+         "l"(desc_b),  // %108
+         "r"(e), "n"(int32_t(spsel)),  // %109, %110
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %111, %112, %113
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // branch compiled when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x216x64 TN F16+=E5M2*E5M2
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value, passed to the asm as immediate ("n") operand %59
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time value, passed to the asm as immediate ("n") operand %60
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time value, passed to the asm as immediate ("n") operand %57
+>
+struct GMMA_64x216x64_F16E5M2E5M2_SS_TN  // wraps one wgmma.mma_async.sp m64n216k64 f16.e5m2.e5m2 instruction; A and B are both 64-bit descriptor operands
+{
+  using DRegisters = void;
+  using ARegisters = uint64_t[1];  // one 64-bit value: the desc_a argument of fma()
+  using ERegisters = uint32_t[1];  // one 32-bit value: the e argument of fma()
+  using BRegisters = uint64_t[1];  // one 64-bit value: the desc_b argument of fma()
+  using CRegisters = uint32_t[54];  // 54 values: d00..d53 of fma(); NOTE(review): presumably two packed f16 per register -- confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // bound to "l" input operand %54
+      uint64_t const& desc_b,  // bound to "l" input operand %55
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d53 are read-write ("+r") operands %0..%53
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53,
+      uint32_t const& e,  // bound to "r" input operand %56; NOTE(review): presumably the sparsity metadata word -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value (operand %58); compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // helper defined outside this chunk; NOTE(review): presumably sync-log instrumentation -- confirm
+    asm volatile(  // operand numbering: outputs first (%0..%53), then inputs in the order they are listed below
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the { } scope of this asm block
+      "setp.ne.b32 p, %58, 0;\n"  // %58 = scale_D; p is set when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n216k64.f16.e5m2.e5m2 "  // the single MMA instruction this struct wraps
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // accumulator list starts: %0..%53 = d00..d53
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53},"  // accumulator list ends
+      " %54,"  // %54 = desc_a
+      " %55,"  // %55 = desc_b
+      " %56, %57,"  // %56 = e, %57 = spsel
+      " p,    %59,  %60;\n"  // p from setp above, %59 = scaleA, %60 = scaleB; NOTE(review): per PTX wgmma docs p gates accumulation into D -- confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%53, each both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53)
+      :  "l"(desc_a),  // %54
+         "l"(desc_b),  // %55
+         "r"(e), "n"(int32_t(spsel)),  // %56, %57
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %58, %59, %60
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // branch compiled when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x216x64 TN F16+=E5M2*E5M2
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value, passed to the asm as immediate ("n") operand %62
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time value, passed to the asm as immediate ("n") operand %63
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time value, passed to the asm as immediate ("n") operand %60
+>
+struct GMMA_64x216x64_F16E5M2E5M2_RS_TN  // wraps one wgmma.mma_async.sp m64n216k64 f16.e5m2.e5m2 instruction; A is four 32-bit register operands, B is a 64-bit descriptor
+{
+  using DRegisters = void;
+  using ARegisters = uint32_t[4];  // four 32-bit values: a00..a03 of fma()
+  using ERegisters = uint32_t[1];  // one 32-bit value: the e argument of fma()
+  using BRegisters = uint64_t[1];  // one 64-bit value: the desc_b argument of fma()
+  using CRegisters = uint32_t[54];  // 54 values: d00..d53 of fma(); NOTE(review): presumably two packed f16 per register -- confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // bound to "r" input operands %54..%57
+      uint64_t const& desc_b,  // bound to "l" input operand %58
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d53 are read-write ("+r") operands %0..%53
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53,
+      uint32_t const& e,  // bound to "r" input operand %59; NOTE(review): presumably the sparsity metadata word -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value (operand %61); compared against 0 to set predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // helper defined outside this chunk; NOTE(review): presumably sync-log instrumentation -- confirm
+    asm volatile(  // operand numbering: outputs first (%0..%53), then inputs in the order they are listed below
+    "{\n"
+      ".reg .pred p;\n"  // predicate register declared inside the { } scope of this asm block
+      "setp.ne.b32 p, %61, 0;\n"  // %61 = scale_D; p is set when scale_D != 0
+      "wgmma.mma_async.sp.sync.aligned.m64n216k64.f16.e5m2.e5m2 "  // the single MMA instruction this struct wraps
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // accumulator list starts: %0..%53 = d00..d53
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53},"  // accumulator list ends
+      "{%54,  %55,  %56,  %57},"  // %54..%57 = a00..a03
+      " %58,"  // %58 = desc_b
+      " %59, %60,"  // %59 = e, %60 = spsel
+      " p,    %62,  %63;\n"  // p from setp above, %62 = scaleA, %63 = scaleB; NOTE(review): per PTX wgmma docs p gates accumulation into D -- confirm
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs %0..%53, each both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %54..%57
+         "l"(desc_b),  // %58
+         "r"(e), "n"(int32_t(spsel)),  // %59, %60
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %61, %62, %63
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // branch compiled when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x216x64 TN F32+=E5M2*E5M2 -- wraps PTX wgmma.mma_async.sp.sync.aligned.m64n216k64.f32.e5m2.e5m2 with A and B both given as 64-bit descriptors
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate ("n" constraint), asm operand %113
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate ("n" constraint), asm operand %114
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate ("n" constraint), asm operand %111
+>
+struct GMMA_64x216x64_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: the C registers are updated in place ("+f")
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];  // one 32-bit word e; presumably the sparsity metadata -- TODO confirm
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = float[108];  // 108 float accumulators, d000..d107
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): looks like a logging/tracing hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate local to this asm block
+      "setp.ne.b32 p, %112, 0;\n"  // %112 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n216k64.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107},"
+      " %108,"  // desc_a
+      " %109,"  // desc_b
+      " %110, %111,"  // e, spsel
+      " p,    %113, %114;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%107: accumulators, read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
+      :  "l"(desc_a),  // %108
+         "l"(desc_b),  // %109
+         "r"(e), "n"(int32_t(spsel)),  // %110, %111
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %112, %113, %114
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined for this build
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x216x64 TN F32+=E5M2*E5M2 -- wraps PTX wgmma.mma_async.sp.sync.aligned.m64n216k64.f32.e5m2.e5m2 with A given as four 32-bit registers and B as a 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate ("n" constraint), asm operand %116
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate ("n" constraint), asm operand %117
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate ("n" constraint), asm operand %114
+>
+struct GMMA_64x216x64_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: the C registers are updated in place ("+f")
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a000..a003)
+  using ERegisters = uint32_t[1];  // one 32-bit word e; presumably the sparsity metadata -- TODO confirm
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = float[108];  // 108 float accumulators, d000..d107
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): looks like a logging/tracing hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate local to this asm block
+      "setp.ne.b32 p, %115, 0;\n"  // %115 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n216k64.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107},"
+      "{%108, %109, %110, %111},"  // a000..a003
+      " %112,"  // desc_b
+      " %113, %114,"  // e, spsel
+      " p,    %116, %117;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%107: accumulators, read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %108..%111
+         "l"(desc_b),  // %112
+         "r"(e), "n"(int32_t(spsel)),  // %113, %114
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %115, %116, %117
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined for this build
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x64 TN F16+=E5M2*E5M2 -- wraps PTX wgmma.mma_async.sp.sync.aligned.m64n224k64.f16.e5m2.e5m2 with A and B both given as 64-bit descriptors
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate ("n" constraint), asm operand %61
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate ("n" constraint), asm operand %62
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate ("n" constraint), asm operand %59
+>
+struct GMMA_64x224x64_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: the C registers are updated in place ("+r")
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];  // one 32-bit word e; presumably the sparsity metadata -- TODO confirm
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[56];  // 56 32-bit accumulator registers, d00..d55; presumably two packed f16 values each -- TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): looks like a logging/tracing hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate local to this asm block
+      "setp.ne.b32 p, %60, 0;\n"  // %60 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n224k64.f16.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      " %56,"  // desc_a
+      " %57,"  // desc_b
+      " %58, %59,"  // e, spsel
+      " p,    %61,  %62;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%55: accumulators, read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "l"(desc_a),  // %56
+         "l"(desc_b),  // %57
+         "r"(e), "n"(int32_t(spsel)),  // %58, %59
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %60, %61, %62
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined for this build
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x64 TN F16+=E5M2*E5M2 -- wraps PTX wgmma.mma_async.sp.sync.aligned.m64n224k64.f16.e5m2.e5m2 with A given as four 32-bit registers and B as a 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate ("n" constraint), asm operand %64
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate ("n" constraint), asm operand %65
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate ("n" constraint), asm operand %62
+>
+struct GMMA_64x224x64_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: the C registers are updated in place ("+r")
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a00..a03)
+  using ERegisters = uint32_t[1];  // one 32-bit word e; presumably the sparsity metadata -- TODO confirm
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = uint32_t[56];  // 56 32-bit accumulator registers, d00..d55; presumably two packed f16 values each -- TODO confirm
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
+      uint64_t const& desc_b,
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): looks like a logging/tracing hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate local to this asm block
+      "setp.ne.b32 p, %63, 0;\n"  // %63 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n224k64.f16.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
+      "{%56,  %57,  %58,  %59},"  // a00..a03
+      " %60,"  // desc_b
+      " %61, %62,"  // e, spsel
+      " p,    %64,  %65;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // operands %0..%55: accumulators, read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %56..%59
+         "l"(desc_b),  // %60
+         "r"(e), "n"(int32_t(spsel)),  // %61, %62
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %63, %64, %65
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined for this build
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x64 TN F32+=E5M2*E5M2 -- wraps PTX wgmma.mma_async.sp.sync.aligned.m64n224k64.f32.e5m2.e5m2 with A and B both given as 64-bit descriptors
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate ("n" constraint), asm operand %117
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate ("n" constraint), asm operand %118
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate ("n" constraint), asm operand %115
+>
+struct GMMA_64x224x64_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // no separate D operand: the C registers are updated in place ("+f")
+  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor (desc_a)
+  using ERegisters = uint32_t[1];  // one 32-bit word e; presumably the sparsity metadata -- TODO confirm
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = float[112];  // 112 float accumulators, d000..d111
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): looks like a logging/tracing hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate local to this asm block
+      "setp.ne.b32 p, %116, 0;\n"  // %116 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n224k64.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      " %112,"  // desc_a
+      " %113,"  // desc_b
+      " %114, %115,"  // e, spsel
+      " p,    %117, %118;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%111: accumulators, read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
+      :  "l"(desc_a),  // %112
+         "l"(desc_b),  // %113
+         "r"(e), "n"(int32_t(spsel)),  // %114, %115
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %116, %117, %118
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined for this build
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x224x64 TN F32+=E5M2*E5M2 -- wraps PTX wgmma.mma_async.sp.sync.aligned.m64n224k64.f32.e5m2.e5m2 with A given as four 32-bit registers and B as a 64-bit descriptor
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate ("n" constraint), asm operand %120
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate ("n" constraint), asm operand %121
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate ("n" constraint), asm operand %118
+>
+struct GMMA_64x224x64_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // no separate D operand: the C registers are updated in place ("+f")
+  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a000..a003)
+  using ERegisters = uint32_t[1];  // one 32-bit word e; presumably the sparsity metadata -- TODO confirm
+  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
+  using CRegisters = float[112];  // 112 float accumulators, d000..d111
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
+      uint64_t const& desc_b,
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      uint32_t const& e,
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 below to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): looks like a logging/tracing hook -- confirm
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"  // scratch predicate local to this asm block
+      "setp.ne.b32 p, %119, 0;\n"  // %119 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n224k64.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111},"
+      "{%112, %113, %114, %115},"  // a000..a003
+      " %116,"  // desc_b
+      " %117, %118,"  // e, spsel
+      " p,    %120, %121;\n"  // p, scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // operands %0..%111: accumulators, read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %112..%115
+         "l"(desc_b),  // %116
+         "r"(e), "n"(int32_t(spsel)),  // %117, %118
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %119, %120, %121
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined for this build
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x232x64 TN F16+=E5M2*E5M2 (SS form: A and B are each passed as one 64-bit operand). Wraps a single wgmma.mma_async.sp PTX instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate; second-to-last operand of the wgmma instruction
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate; last operand of the wgmma instruction
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate; the operand that directly follows `e`
+>
+struct GMMA_64x232x64_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;          // no separate destination: fma updates the accumulators in place
+  using ARegisters = uint64_t[1];   // matches fma's single 64-bit desc_a argument
+  using ERegisters = uint32_t[1];   // matches fma's single 32-bit e argument
+  using BRegisters = uint64_t[1];   // matches fma's single 64-bit desc_b argument
+  using CRegisters = uint32_t[58];  // matches fma's 58 accumulator arguments d00..d57
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand, bound with the 64-bit "l" constraint; presumably a matrix descriptor -- confirm
+      uint64_t const& desc_b,  // B operand, bound with the 64-bit "l" constraint; presumably a matrix descriptor -- confirm
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d57: accumulators, read and overwritten ("+r")
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57,
+      uint32_t const& e,  // 32-bit operand placed right after desc_b; presumably the sparsity metadata -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes this line number and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %62, 0;\n"  // %62 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n232k64.f16.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57},"  // %0..%57 = d00..d57
+      " %58,"  // desc_a
+      " %59,"  // desc_b
+      " %60, %61,"  // e, spsel
+      " p,    %63,  %64;\n"  // predicate p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs: every accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57)
+      :  "l"(desc_a),  // inputs start at %58
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "n" operands are immediates, supplied here by the template parameters
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x232x64 TN F16+=E5M2*E5M2 (RS form: A is passed in four 32-bit registers, B as one 64-bit operand). Wraps a single wgmma.mma_async.sp PTX instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate; second-to-last operand of the wgmma instruction
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate; last operand of the wgmma instruction
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate; the operand that directly follows `e`
+>
+struct GMMA_64x232x64_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;          // no separate destination: fma updates the accumulators in place
+  using ARegisters = uint32_t[4];   // matches fma's four 32-bit arguments a00..a03
+  using ERegisters = uint32_t[1];   // matches fma's single 32-bit e argument
+  using BRegisters = uint64_t[1];   // matches fma's single 64-bit desc_b argument
+  using CRegisters = uint32_t[58];  // matches fma's 58 accumulator arguments d00..d57
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand: four 32-bit register inputs ("r")
+      uint64_t const& desc_b,  // B operand, bound with the 64-bit "l" constraint; presumably a matrix descriptor -- confirm
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d57: accumulators, read and overwritten ("+r")
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57,
+      uint32_t const& e,  // 32-bit operand placed right after desc_b; presumably the sparsity metadata -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // passes this line number and the B descriptor to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %65, 0;\n"  // %65 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n232k64.f16.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57},"  // %0..%57 = d00..d57
+      "{%58,  %59,  %60,  %61},"  // a00..a03
+      " %62,"  // desc_b
+      " %63, %64,"  // e, spsel
+      " p,    %66,  %67;\n"  // predicate p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs: every accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs start at %58
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "n" operands are immediates, supplied here by the template parameters
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x232x64 TN F32+=E5M2*E5M2 (SS form: A and B are each passed as one 64-bit operand). Wraps a single wgmma.mma_async.sp PTX instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate; second-to-last operand of the wgmma instruction
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate; last operand of the wgmma instruction
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate; the operand that directly follows `e`
+>
+struct GMMA_64x232x64_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;          // no separate destination: fma updates the accumulators in place
+  using ARegisters = uint64_t[1];   // matches fma's single 64-bit desc_a argument
+  using ERegisters = uint32_t[1];   // matches fma's single 32-bit e argument
+  using BRegisters = uint64_t[1];   // matches fma's single 64-bit desc_b argument
+  using CRegisters = float[116];    // matches fma's 116 float accumulator arguments d000..d115
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand, bound with the 64-bit "l" constraint; presumably a matrix descriptor -- confirm
+      uint64_t const& desc_b,  // B operand, bound with the 64-bit "l" constraint; presumably a matrix descriptor -- confirm
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d115: accumulators, read and overwritten ("+f")
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      uint32_t const& e,  // 32-bit operand placed right after desc_b; presumably the sparsity metadata -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes this line number and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %120, 0;\n"  // %120 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n232k64.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115},"  // %0..%115 = d000..d115
+      " %116,"  // desc_a
+      " %117,"  // desc_b
+      " %118, %119,"  // e, spsel
+      " p,    %121, %122;\n"  // predicate p, scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // outputs: every accumulator is both read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
+      :  "l"(desc_a),  // inputs start at %116
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "n" operands are immediates, supplied here by the template parameters
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x232x64 TN F32+=E5M2*E5M2 (RS form: A is passed in four 32-bit registers, B as one 64-bit operand). Wraps a single wgmma.mma_async.sp PTX instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate; second-to-last operand of the wgmma instruction
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate; last operand of the wgmma instruction
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate; the operand that directly follows `e`
+>
+struct GMMA_64x232x64_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;          // no separate destination: fma updates the accumulators in place
+  using ARegisters = uint32_t[4];   // matches fma's four 32-bit arguments a000..a003
+  using ERegisters = uint32_t[1];   // matches fma's single 32-bit e argument
+  using BRegisters = uint64_t[1];   // matches fma's single 64-bit desc_b argument
+  using CRegisters = float[116];    // matches fma's 116 float accumulator arguments d000..d115
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A operand: four 32-bit register inputs ("r")
+      uint64_t const& desc_b,  // B operand, bound with the 64-bit "l" constraint; presumably a matrix descriptor -- confirm
+      float         & d000, float         & d001, float         & d002, float         & d003,  // d000..d115: accumulators, read and overwritten ("+f")
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      uint32_t const& e,  // 32-bit operand placed right after desc_b; presumably the sparsity metadata -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // passes this line number and the B descriptor to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %123, 0;\n"  // %123 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n232k64.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115},"  // %0..%115 = d000..d115
+      "{%116, %117, %118, %119},"  // a000..a003
+      " %120,"  // desc_b
+      " %121, %122,"  // e, spsel
+      " p,    %124, %125;\n"  // predicate p, scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // outputs: every accumulator is both read and written
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // inputs start at %116
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "n" operands are immediates, supplied here by the template parameters
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x64 TN F16+=E5M2*E5M2 (SS form: A and B are each passed as one 64-bit operand). Wraps a single wgmma.mma_async.sp PTX instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate; second-to-last operand of the wgmma instruction
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate; last operand of the wgmma instruction
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate; the operand that directly follows `e`
+>
+struct GMMA_64x240x64_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;          // no separate destination: fma updates the accumulators in place
+  using ARegisters = uint64_t[1];   // matches fma's single 64-bit desc_a argument
+  using ERegisters = uint32_t[1];   // matches fma's single 32-bit e argument
+  using BRegisters = uint64_t[1];   // matches fma's single 64-bit desc_b argument
+  using CRegisters = uint32_t[60];  // matches fma's 60 accumulator arguments d00..d59
+
+  CUTE_HOST_DEVICE static void
+  fma(uint64_t const& desc_a,  // A operand, bound with the 64-bit "l" constraint; presumably a matrix descriptor -- confirm
+      uint64_t const& desc_b,  // B operand, bound with the 64-bit "l" constraint; presumably a matrix descriptor -- confirm
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d59: accumulators, read and overwritten ("+r")
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t const& e,  // 32-bit operand placed right after desc_b; presumably the sparsity metadata -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // passes this line number and both descriptors to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %64, 0;\n"  // %64 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n240k64.f16.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"  // %0..%59 = d00..d59
+      " %60,"  // desc_a
+      " %61,"  // desc_b
+      " %62, %63,"  // e, spsel
+      " p,    %65,  %66;\n"  // predicate p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs: every accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
+      :  "l"(desc_a),  // inputs start at %60
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "n" operands are immediates, supplied here by the template parameters
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x64 TN F16+=E5M2*E5M2 (RS form: A is passed in four 32-bit registers, B as one 64-bit operand). Wraps a single wgmma.mma_async.sp PTX instruction.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time immediate; second-to-last operand of the wgmma instruction
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time immediate; last operand of the wgmma instruction
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time immediate; the operand that directly follows `e`
+>
+struct GMMA_64x240x64_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;          // no separate destination: fma updates the accumulators in place
+  using ARegisters = uint32_t[4];   // matches fma's four 32-bit arguments a00..a03
+  using ERegisters = uint32_t[1];   // matches fma's single 32-bit e argument
+  using BRegisters = uint64_t[1];   // matches fma's single 64-bit desc_b argument
+  using CRegisters = uint32_t[60];  // matches fma's 60 accumulator arguments d00..d59
+
+  CUTE_HOST_DEVICE static void
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand: four 32-bit register inputs ("r")
+      uint64_t const& desc_b,  // B operand, bound with the 64-bit "l" constraint; presumably a matrix descriptor -- confirm
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,  // d00..d59: accumulators, read and overwritten ("+r")
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t const& e,  // 32-bit operand placed right after desc_b; presumably the sparsity metadata -- confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime flag; tested against 0 to form predicate p
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // passes this line number and the B descriptor to the synclog hook
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %67, 0;\n"  // %67 is scale_D: p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n240k64.f16.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59},"  // %0..%59 = d00..d59
+      "{%60,  %61,  %62,  %63},"  // a00..a03
+      " %64,"  // desc_b
+      " %65, %66,"  // e, spsel
+      " p,    %68,  %69;\n"  // predicate p, scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs: every accumulator is both read and written
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // inputs start at %60
+         "l"(desc_b),
+         "r"(e), "n"(int32_t(spsel)),
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // "n" operands are immediates, supplied here by the template parameters
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x64 TN F32+=E5M2*E5M2 -- SS form: A and B are each passed as one 64-bit descriptor operand
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; immediate operand that follows e in the instruction
+>
+struct GMMA_64x240x64_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means D aliases C, since fma() updates d* in place -- confirm
+  using ARegisters = uint64_t[1];  // mirrors fma()'s single uint64_t desc_a argument
+  using ERegisters = uint32_t[1];  // mirrors fma()'s single uint32_t e argument
+  using BRegisters = uint64_t[1];  // mirrors fma()'s single uint64_t desc_b argument
+  using CRegisters = float[120];  // mirrors fma()'s 120 float accumulators d000..d119
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp m64n240k64 f32.e5m2.e5m2 instruction
+  fma(uint64_t const& desc_a,  // 64-bit A operand (asm %120)
+      uint64_t const& desc_b,  // 64-bit B operand (asm %121)
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      uint32_t const& e,  // 32-bit register operand (asm %122); presumably the sparsity metadata -- TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p (asm %124)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %124, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n240k64.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      " %120,"  // desc_a
+      " %121,"  // desc_b
+      " %122, %123,"  // e, spsel
+      " p,    %125, %126;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // %0-%119: accumulators, read-write ("+f")
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
+      :  "l"(desc_a),  // %120
+         "l"(desc_b),  // %121
+         "r"(e), "n"(int32_t(spsel)),  // %122, %123
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %124, %125, %126
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x240x64 TN F32+=E5M2*E5M2 -- RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor operand
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; immediate operand that follows e in the instruction
+>
+struct GMMA_64x240x64_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means D aliases C, since fma() updates d* in place -- confirm
+  using ARegisters = uint32_t[4];  // mirrors fma()'s four uint32_t arguments a000..a003
+  using ERegisters = uint32_t[1];  // mirrors fma()'s single uint32_t e argument
+  using BRegisters = uint64_t[1];  // mirrors fma()'s single uint64_t desc_b argument
+  using CRegisters = float[120];  // mirrors fma()'s 120 float accumulators d000..d119
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp m64n240k64 f32.e5m2.e5m2 instruction
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A registers (asm %120-%123)
+      uint64_t const& desc_b,  // 64-bit B operand (asm %124)
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      uint32_t const& e,  // 32-bit register operand (asm %125); presumably the sparsity metadata -- TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p (asm %127)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %127, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n240k64.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119},"
+      "{%120, %121, %122, %123},"  // a000..a003
+      " %124,"  // desc_b
+      " %125, %126,"  // e, spsel
+      " p,    %128, %129;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // %0-%119: accumulators, read-write ("+f")
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %120-%123
+         "l"(desc_b),  // %124
+         "r"(e), "n"(int32_t(spsel)),  // %125, %126
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %127, %128, %129
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x248x64 TN F16+=E5M2*E5M2 -- SS form: A and B are each passed as one 64-bit descriptor operand
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; immediate operand that follows e in the instruction
+>
+struct GMMA_64x248x64_F16E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means D aliases C, since fma() updates d* in place -- confirm
+  using ARegisters = uint64_t[1];  // mirrors fma()'s single uint64_t desc_a argument
+  using ERegisters = uint32_t[1];  // mirrors fma()'s single uint32_t e argument
+  using BRegisters = uint64_t[1];  // mirrors fma()'s single uint64_t desc_b argument
+  using CRegisters = uint32_t[62];  // mirrors fma()'s 62 accumulators d00..d61; presumably two f16 values per register -- TODO confirm
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp m64n248k64 f16.e5m2.e5m2 instruction
+  fma(uint64_t const& desc_a,  // 64-bit A operand (asm %62)
+      uint64_t const& desc_b,  // 64-bit B operand (asm %63)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61,
+      uint32_t const& e,  // 32-bit register operand (asm %64); presumably the sparsity metadata -- TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p (asm %66)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %66, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n248k64.f16.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61},"
+      " %62,"  // desc_a
+      " %63,"  // desc_b
+      " %64, %65,"  // e, spsel
+      " p,    %67,  %68;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0-%61: accumulators, read-write ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61)
+      :  "l"(desc_a),  // %62
+         "l"(desc_b),  // %63
+         "r"(e), "n"(int32_t(spsel)),  // %64, %65
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %66, %67, %68
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x248x64 TN F16+=E5M2*E5M2 -- RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor operand
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; immediate operand that follows e in the instruction
+>
+struct GMMA_64x248x64_F16E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means D aliases C, since fma() updates d* in place -- confirm
+  using ARegisters = uint32_t[4];  // mirrors fma()'s four uint32_t arguments a00..a03
+  using ERegisters = uint32_t[1];  // mirrors fma()'s single uint32_t e argument
+  using BRegisters = uint64_t[1];  // mirrors fma()'s single uint64_t desc_b argument
+  using CRegisters = uint32_t[62];  // mirrors fma()'s 62 accumulators d00..d61; presumably two f16 values per register -- TODO confirm
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp m64n248k64 f16.e5m2.e5m2 instruction
+  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A registers (asm %62-%65)
+      uint64_t const& desc_b,  // 64-bit B operand (asm %66)
+      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
+      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
+      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
+      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
+      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
+      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
+      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
+      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
+      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
+      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
+      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
+      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
+      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
+      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
+      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
+      uint32_t      & d60, uint32_t      & d61,
+      uint32_t const& e,  // 32-bit register operand (asm %67); presumably the sparsity metadata -- TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p (asm %69)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %69, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n248k64.f16.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61},"
+      "{%62,  %63,  %64,  %65},"  // a00..a03
+      " %66,"  // desc_b
+      " %67, %68,"  // e, spsel
+      " p,    %70,  %71;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0-%61: accumulators, read-write ("+r")
+        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
+        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
+        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
+        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
+        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
+        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
+        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
+        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
+        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
+        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
+        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
+        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
+        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
+        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
+        "+r"(d60), "+r"(d61)
+      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %62-%65
+         "l"(desc_b),  // %66
+         "r"(e), "n"(int32_t(spsel)),  // %67, %68
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %69, %70, %71
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x248x64 TN F32+=E5M2*E5M2 -- SS form: A and B are each passed as one 64-bit descriptor operand
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; immediate operand that follows e in the instruction
+>
+struct GMMA_64x248x64_F32E5M2E5M2_SS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means D aliases C, since fma() updates d* in place -- confirm
+  using ARegisters = uint64_t[1];  // mirrors fma()'s single uint64_t desc_a argument
+  using ERegisters = uint32_t[1];  // mirrors fma()'s single uint32_t e argument
+  using BRegisters = uint64_t[1];  // mirrors fma()'s single uint64_t desc_b argument
+  using CRegisters = float[124];  // mirrors fma()'s 124 float accumulators d000..d123
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp m64n248k64 f32.e5m2.e5m2 instruction
+  fma(uint64_t const& desc_a,  // 64-bit A operand (asm %124)
+      uint64_t const& desc_b,  // 64-bit B operand (asm %125)
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      uint32_t const& e,  // 32-bit register operand (asm %126); presumably the sparsity metadata -- TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p (asm %128)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %128, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n248k64.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123},"
+      " %124,"  // desc_a
+      " %125,"  // desc_b
+      " %126, %127,"  // e, spsel
+      " p,    %129, %130;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // %0-%123: accumulators, read-write ("+f")
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
+      :  "l"(desc_a),  // %124
+         "l"(desc_b),  // %125
+         "r"(e), "n"(int32_t(spsel)),  // %126, %127
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %128, %129, %130
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SPARSE GMMA 64x248x64 TN F32+=E5M2*E5M2 -- RS form: A is passed in four 32-bit registers, B as one 64-bit descriptor operand
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; emitted as an immediate ("n") asm operand
+  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; immediate operand that follows e in the instruction
+>
+struct GMMA_64x248x64_F32E5M2E5M2_RS_TN
+{
+  using DRegisters = void;  // NOTE(review): presumably means D aliases C, since fma() updates d* in place -- confirm
+  using ARegisters = uint32_t[4];  // mirrors fma()'s four uint32_t arguments a000..a003
+  using ERegisters = uint32_t[1];  // mirrors fma()'s single uint32_t e argument
+  using BRegisters = uint64_t[1];  // mirrors fma()'s single uint64_t desc_b argument
+  using CRegisters = float[124];  // mirrors fma()'s 124 float accumulators d000..d123
+
+  CUTE_HOST_DEVICE static void  // issues one wgmma.mma_async.sp m64n248k64 f32.e5m2.e5m2 instruction
+  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A registers (asm %124-%127)
+      uint64_t const& desc_b,  // 64-bit B operand (asm %128)
+      float         & d000, float         & d001, float         & d002, float         & d003,
+      float         & d004, float         & d005, float         & d006, float         & d007,
+      float         & d008, float         & d009, float         & d010, float         & d011,
+      float         & d012, float         & d013, float         & d014, float         & d015,
+      float         & d016, float         & d017, float         & d018, float         & d019,
+      float         & d020, float         & d021, float         & d022, float         & d023,
+      float         & d024, float         & d025, float         & d026, float         & d027,
+      float         & d028, float         & d029, float         & d030, float         & d031,
+      float         & d032, float         & d033, float         & d034, float         & d035,
+      float         & d036, float         & d037, float         & d038, float         & d039,
+      float         & d040, float         & d041, float         & d042, float         & d043,
+      float         & d044, float         & d045, float         & d046, float         & d047,
+      float         & d048, float         & d049, float         & d050, float         & d051,
+      float         & d052, float         & d053, float         & d054, float         & d055,
+      float         & d056, float         & d057, float         & d058, float         & d059,
+      float         & d060, float         & d061, float         & d062, float         & d063,
+      float         & d064, float         & d065, float         & d066, float         & d067,
+      float         & d068, float         & d069, float         & d070, float         & d071,
+      float         & d072, float         & d073, float         & d074, float         & d075,
+      float         & d076, float         & d077, float         & d078, float         & d079,
+      float         & d080, float         & d081, float         & d082, float         & d083,
+      float         & d084, float         & d085, float         & d086, float         & d087,
+      float         & d088, float         & d089, float         & d090, float         & d091,
+      float         & d092, float         & d093, float         & d094, float         & d095,
+      float         & d096, float         & d097, float         & d098, float         & d099,
+      float         & d100, float         & d101, float         & d102, float         & d103,
+      float         & d104, float         & d105, float         & d106, float         & d107,
+      float         & d108, float         & d109, float         & d110, float         & d111,
+      float         & d112, float         & d113, float         & d114, float         & d115,
+      float         & d116, float         & d117, float         & d118, float         & d119,
+      float         & d120, float         & d121, float         & d122, float         & d123,
+      uint32_t const& e,  // 32-bit register operand (asm %129); presumably the sparsity metadata -- TODO confirm
+      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // compared against 0 to form predicate p (asm %131)
+  {
+#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
+    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
+    asm volatile(
+    "{\n"
+      ".reg .pred p;\n"
+      "setp.ne.b32 p, %131, 0;\n"  // p = (scale_D != 0)
+      "wgmma.mma_async.sp.sync.aligned.m64n248k64.f32.e5m2.e5m2 "
+      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
+      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
+      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
+      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
+      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
+      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
+      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
+      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
+      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
+      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
+      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
+      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
+      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
+      " %104, %105, %106, %107, %108, %109, %110, %111, "
+      " %112, %113, %114, %115, %116, %117, %118, %119, "
+      " %120, %121, %122, %123},"
+      "{%124, %125, %126, %127},"  // a000..a003
+      " %128,"  // desc_b
+      " %129, %130,"  // e, spsel
+      " p,    %132, %133;\n"  // predicate from scale_D, then scaleA, scaleB
+    "}\n"
+      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // %0-%123: accumulators, read-write ("+f")
+        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
+        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
+        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
+        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
+        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
+        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
+        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
+        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
+        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
+        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
+        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
+        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
+        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
+        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
+        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
+        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
+        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
+        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
+        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
+        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
+        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
+        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
+        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
+        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
+        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
+        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
+        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
+        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
+        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
+        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
+      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %124-%127
+         "l"(desc_b),  // %128
+         "r"(e), "n"(int32_t(spsel)),  // %129, %130
+         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %131, %132, %133
+#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
+    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
+#endif  // CUTE_ARCH_MMA_SM90A_ENABLED
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace SM90::GMMA::SPARSE
+
+} // namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/arch/util.hpp b/lightllm-kernel/cutlass/include/cute/arch/util.hpp
new file mode 100755
index 000000000..3749a9c25
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/arch/util.hpp
@@ -0,0 +1,320 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>
+#include <cute/numeric/integer_sequence.hpp>
+
+#if defined(__clang__) && defined(__CUDA__)
+  // Clang compiling CUDA: __cvta_generic_to_shared was added in Clang 14: https://reviews.llvm.org/D111665
+  #if __clang_major__ >= 14
+    #define CUTE_CLANG_SUPPORTS_CVTA_GENERIC_TO_SHARED 1
+  #endif
+
+  // __nvvm_get_smem_pointer was added in Clang 14: https://reviews.llvm.org/D111665
+  // ... but will not work on Windows until Clang 15: https://reviews.llvm.org/D122897
+  #if (!defined(_WIN32) && __clang_major__ >= 14) || __clang_major__ >= 15
+    #define CUTE_CLANG_SUPPORTS_NVVM_GET_SMEM_POINTER 1
+  #endif
+#endif
+
+#if defined(__NVCC__) || defined(__CUDACC_RTC__)
+  // NVCC / NVRTC: __cvta_generic_to_shared was added in CUDA 11+
+  #if __CUDACC_VER_MAJOR__ >= 11
+    #define CUTE_NVCC_SUPPORTS_CVTA_GENERIC_TO_SHARED 1
+  #endif
+
+  // __nvvm_get_smem_pointer was added in CUDA 10.2; the test below matches 10.x with minor >= 2 only
+  #if __CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2
+    #define CUTE_NVCC_SUPPORTS_NVVM_GET_SMEM_POINTER 1
+  #endif
+#endif
+// SUPPORTED: at least one of the compiler probes above reported the CVTA intrinsic as available.
+#if CUTE_NVCC_SUPPORTS_CVTA_GENERIC_TO_SHARED || CUTE_CLANG_SUPPORTS_CVTA_GENERIC_TO_SHARED
+  #define CUTE_CVTA_GENERIC_TO_SHARED_SUPPORTED 1
+#endif
+// ACTIVATED: set to 1 only when supported, __CUDA_ARCH__ is defined, and the macro was not already defined.
+#if !defined(CUTE_CVTA_GENERIC_TO_SHARED_ACTIVATED) && CUTE_CVTA_GENERIC_TO_SHARED_SUPPORTED && defined(__CUDA_ARCH__)
+  #define CUTE_CVTA_GENERIC_TO_SHARED_ACTIVATED 1
+#endif
+// The same SUPPORTED / ACTIVATED pair for the __nvvm_get_smem_pointer intrinsic.
+#if CUTE_NVCC_SUPPORTS_NVVM_GET_SMEM_POINTER || CUTE_CLANG_SUPPORTS_NVVM_GET_SMEM_POINTER
+  #define CUTE_NVVM_GET_SMEM_POINTER_SUPPORTED 1
+#endif
+
+#if !defined(CUTE_NVVM_GET_SMEM_POINTER_ACTIVATED) && CUTE_NVVM_GET_SMEM_POINTER_SUPPORTED && defined(__CUDA_ARCH__)
+  #define CUTE_NVVM_GET_SMEM_POINTER_ACTIVATED 1
+#endif
+
+// Clang 14+ provides a declaration of __nvvm_get_smem_pointer, so we only need
+// to provide one for NVCC
+#if CUTE_NVCC_SUPPORTS_NVVM_GET_SMEM_POINTER
+  extern "C" {
+  // This NVVM intrinsic is subject to change in future versions of CUDA.
+  // Clients should not call it directly; note that it takes a non-const void*.
+  CUTE_DEVICE uint32_t __nvvm_get_smem_pointer(void*);
+  }
+#endif
+
+namespace cute
+{
+
+/// CUTE helper to cast SMEM pointer to unsigned: returns `ptr` converted to a 32-bit unsigned value.
+CUTE_DEVICE
+uint32_t
+cast_smem_ptr_to_uint(void const* const ptr)
+{
+// We prefer to use the new CVTA intrinsics if they are available, otherwise we will fall back to
+// the previous internal intrinsics if they are available, then to inline PTX, then to an error stub.
+#if CUTE_CVTA_GENERIC_TO_SHARED_ACTIVATED
+  //
+  // This NVVM intrinsic converts an address in shared memory to a plain
+  // unsigned integer. This is necessary to pass to shared memory instructions
+  // in inline PTX.
+  //
+  // In CUDA 11 and beyond, this replaces __nvvm_get_smem_pointer()  [only available in 10.2].
+  //
+  // Prototype: __device__ size_t __cvta_generic_to_shared(void* ptr);  the result is narrowed to 32 bits.
+
+  /// CUTE helper to get SMEM pointer
+  return static_cast<uint32_t>(__cvta_generic_to_shared(ptr));
+
+#elif CUTE_NVVM_GET_SMEM_POINTER_ACTIVATED
+  // The intrinsic is declared with a non-const void* parameter, so the const qualifier is cast away explicitly.
+  return __nvvm_get_smem_pointer(const_cast<void*>(ptr));
+
+#elif defined(__CUDA_ARCH__)
+  // No intrinsic available: convert with inline PTX (cvta.to.shared.u64, then cvt.u32.u64).
+  uint32_t smem_ptr;
+
+  asm(
+  "{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %1; cvt.u32.u64 %0, smem_ptr; }\n"
+    : "=r"(smem_ptr) : "l"(ptr));
+
+  return smem_ptr;
+
+#else
+  // Reached only when __CUDA_ARCH__ is undefined: print an error and return 0 as a dummy value.
+
+  (void) ptr;
+  printf("ERROR: cast_smem_ptr_to_uint not supported but used.\n");
+  return 0;
+
+#endif
+}
+
+namespace detail {
+
+//
+// Wrapper for MMAOp::fma: a function object whose call operator forwards all of its
+// arguments to MmaOp::fma, so the operation can be passed around as a callable.
+//
+template <class MmaOp>
+struct CallFMA {
+  template <class... Args>
+  CUTE_HOST_DEVICE constexpr void
+  operator()(Args&&... args) const {
+    return MmaOp::fma(static_cast<Args&&>(args)...);   // static_cast<Args&&> forwards each argument's value category
+  }
+};
+
+//
+// Wrapper for CopyOp::copy: a function object whose call operator forwards all of its
+// arguments to CopyOp::copy, so the operation can be passed around as a callable.
+//
+template <class CopyOp>
+struct CallCOPY {
+  template <class... Args>
+  CUTE_HOST_DEVICE constexpr void
+  operator()(Args&&... args) const {
+    return CopyOp::copy(static_cast<Args&&>(args)...);   // static_cast<Args&&> forwards each argument's value category
+  }
+};
+
+//
+// Utility for exploding pointers/arrays/tensors into functions:
+//   each overload calls fn with the elements operand[index] selected by that operand's int_sequence.
+// One operand: fn(a[I]...).
+template <class Fn,
+          class PtrA, int... I>
+CUTE_HOST_DEVICE constexpr
+void
+explode(Fn fn,
+        PtrA&& a, int_sequence<I...>)   // the int_sequence argument only carries the index pack I...
+{
+  return fn(a[I]...);
+}
+
+template <class Fn,                      // callable that receives the expanded elements
+          class PtrS, int... Is,
+          class PtrD, int... Id>
+CUTE_HOST_DEVICE constexpr
+void
+explode(Fn fn,
+        PtrS&& s, int_sequence<Is...>,   // first operand and the indices taken from it
+        PtrD&& d, int_sequence<Id...>)   // second operand and the indices taken from it
+{
+  return fn(s[Is]..., d[Id]...);         // two operands: all selected s elements, then all selected d elements
+}
+
+template <class Fn,                      // callable that receives the expanded elements
+          class PtrA, int... Ia,
+          class PtrB, int... Ib,
+          class PtrC, int... Ic>
+CUTE_HOST_DEVICE constexpr
+void
+explode(Fn fn,
+        PtrA&& a, int_sequence<Ia...>,
+        PtrB&& b, int_sequence<Ib...>,
+        PtrC&& c, int_sequence<Ic...>)
+{
+  return fn(a[Ia]..., b[Ib]..., c[Ic]...);   // three operands, expanded in the order a, b, c
+}
+
+template <class Fn,                      // callable that receives the expanded elements
+          class PtrD, int... Id,
+          class PtrA, int... Ia,
+          class PtrB, int... Ib,
+          class PtrC, int... Ic>
+CUTE_HOST_DEVICE constexpr
+void
+explode(Fn fn,
+        PtrD&& d, int_sequence<Id...>,
+        PtrA&& a, int_sequence<Ia...>,
+        PtrB&& b, int_sequence<Ib...>,
+        PtrC&& c, int_sequence<Ic...>)
+{
+  return fn(d[Id]..., a[Ia]..., b[Ib]..., c[Ic]...);   // four operands, expanded in the order d, a, b, c
+}
+
+template <class Fn,                      // callable that receives the expanded elements
+          class PtrD, int... Id,
+          class PtrA, int... Ia,
+          class PtrB, int... Ib,
+          class PtrC, int... Ic,
+          class PtrE, int... Ie>
+CUTE_HOST_DEVICE constexpr
+void
+explode(Fn fn,
+        PtrD&& d, int_sequence<Id...>,
+        PtrA&& a, int_sequence<Ia...>,
+        PtrB&& b, int_sequence<Ib...>,
+        PtrC&& c, int_sequence<Ic...>,
+        PtrE&& e, int_sequence<Ie...>)
+{
+  return fn(d[Id]..., a[Ia]..., b[Ib]..., c[Ic]..., e[Ie]...);   // five operands, expanded in the order d, a, b, c, e
+}
+
+template <class Fn,                      // callable that receives the expanded elements
+          class PtrD, int... Id,
+          class PtrA, int... Ia,
+          class PtrB, int... Ib,
+          class PtrC, int... Ic,
+          class PtrE, int... Ie,
+          class PtrF, int... If>
+CUTE_HOST_DEVICE constexpr
+void
+explode(Fn fn,
+        PtrD&& d, int_sequence<Id...>,
+        PtrA&& a, int_sequence<Ia...>,
+        PtrB&& b, int_sequence<Ib...>,
+        PtrC&& c, int_sequence<Ic...>,
+        PtrE&& e, int_sequence<Ie...>,
+        PtrF&& f, int_sequence<If...>)
+{
+  return fn(d[Id]..., a[Ia]..., b[Ib]..., c[Ic]..., e[Ie]..., f[If]...);   // six operands, in the order d, a, b, c, e, f
+}
+
+template <class Fn,                      // callable that receives the expanded elements
+          class PtrD, int... Id,
+          class PtrA, int... Ia,
+          class PtrB, int... Ib,
+          class PtrC, int... Ic,
+          class PtrE, int... Ie,
+          class PtrF, int... If,
+          class PtrG, int... Ig>
+CUTE_HOST_DEVICE constexpr
+void
+explode(Fn fn,
+        PtrD&& d, int_sequence<Id...>,
+        PtrA&& a, int_sequence<Ia...>,
+        PtrB&& b, int_sequence<Ib...>,
+        PtrC&& c, int_sequence<Ic...>,
+        PtrE&& e, int_sequence<Ie...>,
+        PtrF&& f, int_sequence<If...>,
+        PtrG&& g, int_sequence<Ig...>)
+{
+  return fn(d[Id]..., a[Ia]..., b[Ib]..., c[Ic]..., e[Ie]..., f[If]..., g[Ig]...);   // seven operands, in the order d, a, b, c, e, f, g
+}
+
+//
+// Utility for exploding tuples into functions:
+//   each overload calls fn with get<index>(tuple) for every index in that tuple's int_sequence.
+// One tuple: fn(get<I>(a)...).
+template <class Fn,
+          class TupleA, int... I>
+CUTE_HOST_DEVICE constexpr
+void
+explode_tuple(Fn fn,
+              TupleA&& a, int_sequence<I...>)   // the int_sequence argument only carries the index pack I...
+{
+  return fn(get<I>(a)...);
+}
+
+template <class Fn,                            // callable that receives the expanded elements
+          class TupleA, int... Ia,
+          class TupleB, int... Ib>
+CUTE_HOST_DEVICE constexpr
+void
+explode_tuple(Fn fn,
+              TupleA&& a, int_sequence<Ia...>,
+              TupleB&& b, int_sequence<Ib...>)
+{
+  return fn(get<Ia>(a)..., get<Ib>(b)...);     // two tuples: selected elements of a, then of b
+}
+
+template <class Fn,                            // callable that receives the expanded elements
+          class TupleA, int... Ia,
+          class TupleB, int... Ib,
+          class TupleC, int... Ic>
+CUTE_HOST_DEVICE constexpr
+void
+explode_tuple(Fn fn,
+              TupleA&& a, int_sequence<Ia...>,
+              TupleB&& b, int_sequence<Ib...>,
+              TupleC&& c, int_sequence<Ic...>)
+{
+  return fn(get<Ia>(a)..., get<Ib>(b)..., get<Ic>(c)...);   // three tuples: selected elements of a, then b, then c
+}
+
+} // end namespace detail
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/atom/copy_atom.hpp b/lightllm-kernel/cutlass/include/cute/atom/copy_atom.hpp
new file mode 100755
index 000000000..dd6b4e52a
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/atom/copy_atom.hpp
@@ -0,0 +1,764 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>                     // CUTE_HOST_DEVICE
+#include <cute/tensor_impl.hpp>                // cute::Tensor
+#include <cute/util/type_traits.hpp>           // cute::__CUTE_REQUIRES
+#include <cute/container/tuple.hpp>            // cute::is_tuple
+#include <cute/numeric/integral_constant.hpp>  // cute::is_constant, cute::is_integral
+#include <cute/atom/copy_traits.hpp>           // cute::Copy_Traits
+#include <cute/atom/mma_atom.hpp>              // cute::TiledMMA
+
+namespace cute
+{
+
+template <class... Args>
+struct Copy_Atom;   // primary template: declared only; usable forms are the partial specializations
+// Given a raw CopyOperation: wrap it in Copy_Traits and inherit from the Copy_Traits-based specialization.
+template <class CopyOperation, class CopyInternalType>
+struct Copy_Atom<CopyOperation, CopyInternalType> : Copy_Atom<Copy_Traits<CopyOperation>, CopyInternalType>
+{};
+
+template <class... Args, class CopyInternalType>
+struct Copy_Atom<Copy_Traits<Args...>, CopyInternalType>
+  : Copy_Traits<Args...>
+{
+  using Traits = Copy_Traits<Args...>;
+  // This specialization pairs a Copy_Traits with the value type its bit-level layouts are re-expressed in.
+  // Bit and Thr layouts from the Copy_Traits
+  using ThrID        = typename Traits::ThrID;
+  using BitLayoutSrc = typename Traits::SrcLayout;
+  using BitLayoutDst = typename Traits::DstLayout;
+  using BitLayoutRef = typename Traits::RefLayout;
+
+  using ValType = CopyInternalType;
+  // Recast the bit layouts (uint1_t elements) into layouts over ValType elements.
+  using ValLayoutSrc = decltype(recast_layout<uint1_t, ValType>(BitLayoutSrc{}));
+  using ValLayoutDst = decltype(recast_layout<uint1_t, ValType>(BitLayoutDst{}));
+  using ValLayoutRef = decltype(recast_layout<uint1_t, ValType>(BitLayoutRef{}));
+  // Mode 0 of every value layout must have exactly as many entries as ThrID.
+  CUTE_STATIC_ASSERT_V(size<0>(ValLayoutSrc{}) == size(ThrID{}), "CopyOperation is not valid for Src of ValType.");
+  CUTE_STATIC_ASSERT_V(size<0>(ValLayoutDst{}) == size(ThrID{}), "CopyOperation is not valid for Dst of ValType.");
+  CUTE_STATIC_ASSERT_V(size<0>(ValLayoutRef{}) == size(ThrID{}), "CopyOperation is not valid for Ref of ValType.");
+  // Mode-1 sizes of the Src / Dst value layouts; call() compares tensor sizes against these.
+  static constexpr int NumValSrc = size<1>(ValLayoutSrc{});
+  static constexpr int NumValDst = size<1>(ValLayoutDst{});
+
+  // Additional Trait parameters/transformations: forwards args to Traits::with and rewraps the result as a Copy_Atom
+  template <class... TraitsArgs>
+  CUTE_HOST_DEVICE
+  auto
+  with(TraitsArgs&&... args) const {
+    auto traits = Traits::with(static_cast<TraitsArgs&&>(args)...);
+    return Copy_Atom<decltype(traits), CopyInternalType>{traits};
+  }
+
+  //
+  // Tensor call interfaces
+  //
+
+  // Check and call instruction, or recurse. Both src and dst must be rank-1 tensors.
+  template <class SEngine, class SLayout,
+            class DEngine, class DLayout>
+  CUTE_HOST_DEVICE
+  void
+  call(Tensor<SEngine,SLayout> const& src,
+       Tensor<DEngine,DLayout>      & dst) const
+  {
+    static_assert(SLayout::rank == 1, "Expected rank-1 src tensor");
+    static_assert(DLayout::rank == 1, "Expected rank-1 dst tensor");
+
+    if constexpr (is_constant<NumValSrc, decltype(size(src))>::value ||
+                  is_constant<NumValDst, decltype(size(dst))>::value) {
+      // Either size is the compile-time constant NumValSrc/NumValDst: dispatch to unpack to execute instruction
+      return copy_unpack(*this, src, dst);
+    } else
+    if constexpr (is_tuple<decltype(shape(src))>::value &&
+                  is_tuple<decltype(shape(dst))>::value) {
+      // If the size of the src/dst doesn't match the instruction,
+      //   recurse this rank-1 layout by peeling off the mode
+      //   ((A,B,C,...)) -> (A,B,C,...)
+      return copy(*this, tensor<0>(src), tensor<0>(dst));
+    } else {
+      static_assert(dependent_false<SEngine>, "No instruction match and no recursion possible.");
+    }
+  }
+
+  // Accept mutable temporaries: dst binds as an rvalue and is passed on as an lvalue to the overload above
+  template <class SEngine, class SLayout,
+            class DEngine, class DLayout>
+  CUTE_HOST_DEVICE
+  void
+  call(Tensor<SEngine,SLayout> const& src,
+       Tensor<DEngine,DLayout>     && dst) const
+  {
+    return call(src, dst);
+  }
+};
+
+//
+// A tiling of copy atoms: TiledCopy derives from a Copy_Atom and adds a thread/value layout
+//   (LayoutCopy_TV) together with the tiler (ShapeTiler_MN) that tensors are divided by.
+
+template <class TiledCopy, class ThrIdx>
+struct ThrCopy;   // forward declaration; constructed by TiledCopy::get_slice
+
+template <class Copy_Atom,
+          class LayoutCopy_TV,  // (tid,vid) -> coord   [Need not be 2D...]
+          class ShapeTiler_MN>  // coord space
+struct TiledCopy : Copy_Atom
+{
+  // Layout information from the CopyAtom
+  using AtomThrID     = typename Copy_Atom::ThrID;        // thrid -> thr_idx
+  using AtomLayoutSrc = typename Copy_Atom::ValLayoutSrc; // (thr,val) -> offset
+  using AtomLayoutDst = typename Copy_Atom::ValLayoutDst; // (thr,val) -> offset
+  using AtomLayoutRef = typename Copy_Atom::ValLayoutRef; // (thr,val) -> offset
+
+  using AtomNumThr = decltype(size<0>(AtomLayoutRef{}));
+  using AtomNumVal = decltype(size<1>(AtomLayoutRef{}));
+
+  // Layout information for the TiledCopy
+  using Tiler_MN       = ShapeTiler_MN;
+  using TiledLayout_TV = LayoutCopy_TV;
+  using TiledNumThr    = decltype(size<0>(TiledLayout_TV{}));
+  using TiledNumVal    = decltype(size<1>(TiledLayout_TV{}));
+
+  CUTE_STATIC_ASSERT_V(TiledNumThr{} % AtomNumThr{} == Int<0>{}, "TiledCopy uses too few thrs for selected CopyAtom");
+  CUTE_STATIC_ASSERT_V(TiledNumVal{} % AtomNumVal{} == Int<0>{}, "TiledCopy uses too few vals for selected CopyAtom");
+
+  // Tile a tensor or a layout from shape
+  //   (M,N,...)
+  // to shape
+  //   ((ThrV,ThrX),FrgV,(RestM,RestN,...))
+  // where
+  //   ThrV:  The threads local to a COPY_ATOM Src.
+  //   ThrX:  The threads tiled across COPY_ATOMs Src.
+  //   FrgV:  The values local to a COPY_ATOM Src.
+  //   RestM: The values tiled in M.
+  //   RestN: The values tiled in N.
+  template <class STensor>
+  CUTE_HOST_DEVICE constexpr static
+  auto
+  tidfrg_S(STensor&& stensor)
+  {
+    CUTE_STATIC_ASSERT_V(rank(stensor) >= rank(Tiler_MN{}), "Rank of tensor to be partitioned too small.");
+
+    // Tile the stensor and compute the (src-thr, src-val) -> (ref-thr, ref-val) layout
+    return tile2thrfrg(zipped_divide(stensor,Tiler_MN{}), right_inverse(AtomLayoutRef{}).compose(AtomLayoutSrc{}));
+  }
+
+  // Tile a tensor or a layout from shape
+  //   (M,N,...)
+  // to shape
+  //   ((ThrV,ThrX),FrgV,(RestM,RestN,...))
+  // where
+  //   ThrV:  The threads local to a COPY_ATOM Dst.
+  //   ThrX:  The threads tiled across COPY_ATOMs Dst.
+  //   FrgV:  The values local to a COPY_ATOM Dst.
+  //   RestM: The values tiled in M.
+  //   RestN: The values tiled in N.
+  template <class DTensor>
+  CUTE_HOST_DEVICE constexpr static
+  auto
+  tidfrg_D(DTensor&& dtensor)
+  {
+    CUTE_STATIC_ASSERT_V(rank(dtensor) >= rank(Tiler_MN{}), "Rank of tensor to be partitioned too small.");
+
+    // Tile the dtensor and compute the (dst-thr, dst-val) -> (ref-thr, ref-val) layout
+    return tile2thrfrg(zipped_divide(dtensor,Tiler_MN{}), right_inverse(AtomLayoutRef{}).compose(AtomLayoutDst{}));
+  }
+
+  // Tile a tensor or a layout from shape
+  //   ((TileM,TileN,...), (RestM,RestN,...))
+  // to shape
+  //   ((ThrV,ThrX),FrgV,(RestM,RestN,...))
+  template <class Tensor, class Ref2TrgLayout>
+  CUTE_HOST_DEVICE constexpr static
+  auto
+  tile2thrfrg(Tensor&& tensor, Ref2TrgLayout const& ref2trg)
+  {
+    // Take the thrs/vals that the atom is interested in
+    // NOTE: Assumes the AtomNumThr are contiguous and identity within TiledThrID
+    auto atom_layout_TV = zipped_divide(TiledLayout_TV{}, make_shape(AtomNumThr{}, AtomNumVal{}));
+    // ((atom_tid,atom_val),(rest_tid,rest_val)) -> (m,n)
+
+    // Transform to the trg layout
+    auto trg_layout_TV = atom_layout_TV.compose(ref2trg, _);
+    // ((trg_tid,trg_val),(rest_tid,rest_val)) -> (m,n)
+
+    // Transform the thrs mode from thrid to thr_idx
+    // NOTE: Assumes the AtomNumThr are contiguous and identity within TiledThrID
+    auto thrval2mn = coalesce(zip(trg_layout_TV), Shape<_1,Shape<_1,_1>>{});
+    // ((trg_tid,rest_tid),(trg_val,rest_val)) -> (m,n)
+
+    /// ==================
+
+    // Transform the tile mode
+    auto tv_tensor = tensor.compose(thrval2mn, _);
+    // ((thrid,val),(RestM,RestN,...))
+
+    // Unfold and return
+    return tv_tensor(make_coord(_,_), _);
+  }
+
+  // retile_S and retile_D assume they are working with the reference layout -- they are the same
+  template <class Tensor>
+  CUTE_HOST_DEVICE constexpr static
+  auto
+  retile(Tensor&& tensor)
+  {
+    constexpr int R = remove_cvref_t<Tensor>::rank;
+    // NOTE: nothing is asserted here; this assumes AtomLayoutSrc|Dst is identity so the Ref transformation can be skipped
+
+    // Assume the first size<0>(tensor) elements are the first val_ids in TiledLayout_TV.
+    // Then, we only need the shape+layout of those size<0>(tensor) elements in TiledLayout_TV
+    //   and that shape is what we gather from the other modes of tensor
+
+    auto V = size<0>(tensor);
+
+    auto frg_layout_mn = upcast<TiledNumThr{} * V>(right_inverse(TiledLayout_TV{}).with_shape(shape(Tiler_MN{})));
+    // (m,n) -> v_idx -- The shape and order of the V inside of TiledLayout_TV
+
+    auto frg_layout_v = zipped_divide(logical_product(make_layout(V), right_inverse(frg_layout_mn)), make_layout(AtomNumVal{}));
+    // (atom_vals,rest_vals) -> (v,m,n)
+
+    /// =======
+
+    // Tile the tensor for TileFrg
+    auto t_tensor = zipped_divide(tensor, prepend(product_each(shape(frg_layout_mn)), V));
+    // ((TileV,TileM,TileN,...),(1,RestM,RestN,...))
+
+    // Transform the tile mode
+    auto v_tensor = t_tensor.compose(frg_layout_v, _);
+    // ((atom_vals,rest_vals),(1,RM,RN,...))
+
+    // Unfold and return
+    return v_tensor(_, append<R>(Int<0>{},_));
+  }
+
+  CUTE_HOST_DEVICE constexpr static
+  auto
+  get_layoutS_TV()
+  {
+    // (M,N) -> (M,N)
+    auto ref_S = make_layout(make_shape(shape(Tiler_MN{}), Int<1>{}));
+    // (thr_idx,val_idx) -> (M,N)
+    return tile2thrfrg(ref_S, right_inverse(AtomLayoutRef{}).compose(AtomLayoutSrc{}))(_,_,Int<0>{});
+  }
+
+  CUTE_HOST_DEVICE constexpr static
+  auto
+  get_layoutS_MN()
+  {
+    // (thr_idx,val_idx) -> (M,N)
+    auto layoutS_TV = get_layoutS_TV();
+    // (M,N) -> (thr_idx,val_idx)   [shaped by Tiler_MN; the variable keeps its _MK suffix]
+    auto layoutS_MK = right_inverse(layoutS_TV).with_shape(shape(Tiler_MN{}));
+
+    // athrid = (v,m,k) -> thr_idx
+    auto thrID_S = make_layout(size<0>(TiledLayout_TV{}));
+
+    return cute::make_tuple(layoutS_MK, thrID_S);
+  }
+
+  CUTE_HOST_DEVICE constexpr static
+  auto
+  get_layoutD_TV()
+  {
+    // (M,N) -> (M,N)
+    auto ref_D = make_layout(make_shape(shape(Tiler_MN{}), Int<1>{}));
+    // (thr_idx,val_idx) -> (M,N)
+    return tile2thrfrg(ref_D, right_inverse(AtomLayoutRef{}).compose(AtomLayoutDst{}))(_,_,Int<0>{});
+  }
+
+  CUTE_HOST_DEVICE constexpr static
+  auto
+  get_layoutD_MN()
+  {
+    // (thr_idx,val_idx) -> (M,N)
+    auto layoutD_TV = get_layoutD_TV();
+    // (M,N) -> (thr_idx,val_idx)   [shaped by Tiler_MN; the variable keeps its _MK suffix]
+    auto layoutD_MK = right_inverse(layoutD_TV).with_shape(shape(Tiler_MN{}));
+
+    // athrid = (v,m,k) -> thr_idx
+    auto thrID_D = make_layout(size<0>(TiledLayout_TV{}));
+
+    return cute::make_tuple(layoutD_MK, thrID_D);
+  }
+  // Build the ThrCopy for thread index thr_idx; enabled only when ThrIdx satisfies is_integral.
+  template <class ThrIdx,
+            __CUTE_REQUIRES(is_integral<ThrIdx>::value)>
+  CUTE_HOST_DEVICE static
+  auto
+  get_slice(ThrIdx const& thr_idx)
+  {
+    return ThrCopy<TiledCopy, ThrIdx>(thr_idx);
+  }
+  // Same result as get_slice, under a second name.
+  template <class ThrIdx,
+            __CUTE_REQUIRES(is_integral<ThrIdx>::value)>
+  CUTE_HOST_DEVICE  static
+  auto
+  get_thread_slice(ThrIdx const& thr_idx)
+  {
+    return get_slice(thr_idx);
+  }
+};
+
+template <class TiledCopy, class ThrIdx>
+struct ThrCopy
+{
+  ThrIdx thr_idx_;   // thread index used to slice mode 0 of the partitioned tensors
+  // Stores a copy of thr_idx; no other state is kept.
+  CUTE_HOST_DEVICE
+  ThrCopy(ThrIdx const& thr_idx) : thr_idx_(thr_idx) {}
+  // Lay stensor out with TiledCopy::tidfrg_S and keep only the slice at thr_idx_.
+  template <class STensor>
+  CUTE_HOST_DEVICE
+  auto
+  partition_S(STensor&& stensor) const {
+    // Disabled check: static_assert(sizeof(typename remove_cvref_t<STensor>::value_type) == sizeof(typename TiledCopy::ValType),
+    //              "Expected ValType for tiling SrcTensor.");
+    auto thr_tensor = make_tensor(static_cast<STensor&&>(stensor).data(), TiledCopy::tidfrg_S(stensor.layout()));
+    return thr_tensor(thr_idx_, _, repeat<rank_v<STensor>>(_));
+  }
+  // Lay dtensor out with TiledCopy::tidfrg_D and keep only the slice at thr_idx_.
+  template <class DTensor>
+  CUTE_HOST_DEVICE
+  auto
+  partition_D(DTensor&& dtensor) const {
+    // Disabled check: static_assert(sizeof(typename remove_cvref_t<DTensor>::value_type) == sizeof(typename TiledCopy::ValType),
+    //              "Expected ValType for tiling DstTensor.");
+    auto thr_tensor = make_tensor(static_cast<DTensor&&>(dtensor).data(), TiledCopy::tidfrg_D(dtensor.layout()));
+    return thr_tensor(thr_idx_, _, repeat<rank_v<DTensor>>(_));
+  }
+  // Re-view stensor's data through TiledCopy::retile of its layout; static, so thr_idx_ is not used.
+  template <class STensor>
+  CUTE_HOST_DEVICE static
+  auto
+  retile_S(STensor&& stensor) {
+    // Disabled check: static_assert(sizeof(typename remove_cvref_t<STensor>::value_type) == sizeof(typename TiledCopy::ValType),
+    //               "Expected ValType for tiling SrcTensor.");
+    return make_tensor(static_cast<STensor&&>(stensor).data(), TiledCopy::retile(stensor.layout()));
+  }
+  // Re-view dtensor's data through TiledCopy::retile of its layout; same transformation as retile_S.
+  template <class DTensor>
+  CUTE_HOST_DEVICE static
+  auto
+  retile_D(DTensor&& dtensor) {
+    // Disabled check: static_assert(sizeof(typename remove_cvref_t<DTensor>::value_type) == sizeof(typename TiledCopy::ValType),
+    //               "Expected ValType for tiling DstTensor.");
+    return make_tensor(static_cast<DTensor&&>(dtensor).data(), TiledCopy::retile(dtensor.layout()));
+  }
+};
+
+
+template <class... Args,
+          class LayoutCopy_TV,
+          class Tiler>
+CUTE_HOST_DEVICE
+auto
+make_tiled_copy_impl(Copy_Atom<Args...> const& atom,   // atom used to initialize the TiledCopy's Copy_Atom base
+                     LayoutCopy_TV      const&,        // unnamed: only its type is used
+                     Tiler              const&)        // unnamed: only its type is used
+{
+  return TiledCopy<Copy_Atom<Args...>, LayoutCopy_TV, Tiler>{atom};
+}
+
+//
+// These tile the Copy_Atom as a whole
+//
+// Uses mma.get_layoutA_TV() as the TV layout and a tiler made of tile_size<0> and tile_size<2> of the MMA.
+template <class... CArgs, class... MArgs>
+CUTE_HOST_DEVICE
+auto
+make_tiled_copy_A(Copy_Atom<CArgs...> const& copy_atom,
+                  TiledMMA<MArgs...>  const& mma)
+{
+  return make_tiled_copy_impl(copy_atom, mma.get_layoutA_TV(), make_shape(tile_size<0>(mma),tile_size<2>(mma)));
+}
+
+template <class... CArgs, class... MArgs>
+CUTE_HOST_DEVICE
+auto
+make_tiled_copy_B(Copy_Atom<CArgs...> const& copy_atom,
+                  TiledMMA<MArgs...>  const& mma)   // supplies get_layoutB_TV() and tile sizes 1 and 2 for the tiler
+{
+  return make_tiled_copy_impl(copy_atom, mma.get_layoutB_TV(), make_shape(tile_size<1>(mma),tile_size<2>(mma)));
+}
+
+template <class... CArgs, class... MArgs>
+CUTE_HOST_DEVICE
+auto
+make_tiled_copy_C(Copy_Atom<CArgs...> const& copy_atom,
+                  TiledMMA<MArgs...>  const& mma)   // supplies get_layoutC_TV() and tile sizes 0 and 1 for the tiler
+{
+  return make_tiled_copy_impl(copy_atom, mma.get_layoutC_TV(), make_shape(tile_size<0>(mma),tile_size<1>(mma)));
+}
+
+// Returns the smallest tiled copy that can retile LayoutC_TV,
+// for use with pipelined epilogues with subtiled stores
+template <class... CArgs, class... MArgs>
+CUTE_HOST_DEVICE
+auto
+make_tiled_copy_C_atom(Copy_Atom<CArgs...> const& copy_atom,
+                       TiledMMA<MArgs...>  const& mma)
+{
+  // Truncate the V-layout to just the Copy_Atom, keep the V-order
+  auto layoutC_TV = mma.get_layoutC_TV();
+  auto copy_V     = Int<Copy_Atom<CArgs...>::NumValSrc>{};
+  CUTE_STATIC_ASSERT_V(copy_V <= size<1>(layoutC_TV));   // the atom's NumValSrc must fit in the value mode of the C layout
+  auto layout_TV  = composition(layoutC_TV, make_layout(make_shape(size<0>(layoutC_TV), copy_V)));
+
+  // Recompute tiler and restride the TV layout for the new tiler
+
+  // Tiler -- Find the active elements in the MMA tensor and generate a tiler to extract them
+  // Convert to the awkward by-mode tiler to preserve the modes of the tiled MMA
+  auto mma_tiler = make_shape(tile_size<0>(mma),tile_size<1>(mma));
+  auto mma_zeros = repeat_like(mma_tiler, Int<0>{});
+  // One tiler entry per mode i of mma_tiler: a stride of 1 in mode i and 0 elsewhere, composed with layout_TV and filtered.
+  auto tiler = transform(make_seq<rank(mma_tiler)>{}, [&](auto i) {
+    return filter(composition(make_layout(mma_tiler, replace<i>(mma_zeros, Int<1>{})), layout_TV));
+  });
+
+  // Layout_TV -- Find the (tid,vid) -> tile coord transformation
+  // Apply the tiler to a reference and transform the codomain
+  // tile_coord -> mma_coord
+  auto tile2mma = composition(make_layout(mma_tiler), tiler);
+
+  // (tid,vid) -> tile_coord
+  auto layout_tv = composition(left_inverse(tile2mma), layout_TV);
+
+  return make_tiled_copy_impl(copy_atom, layout_tv, tiler);
+}
+
+/** Produce a TiledCopy from logical thread and values layouts.
+ * The thread and value layouts map coordinates to thr_idx and val_idx.
+ *    The raked product of these layouts is taken to produce the TV layout and the Tiler.
+ * Useful when threads and values need very specific mappings onto coordinates
+ *    in the target tensors. Both layout arguments default to value-initialized objects.
+ */
+template <class... Args,
+          class ThrLayout,
+          class ValLayout = Layout<_1>>
+CUTE_HOST_DEVICE
+auto
+make_tiled_copy(Copy_Atom<Args...> const& copy_atom,
+                ThrLayout          const& thr_layout = {},     // (m,n) -> thr_idx
+                ValLayout          const& val_layout = {})     // (m,n) -> val_idx
+{
+  // Take the raked_products to compute the Layout_MN
+  // (M,N) -> (thr_idx, val_idx)
+  auto layout_mn = raked_product(thr_layout, val_layout);
+  // (thr_idx, val_idx) -> (M,N)
+  auto layout_tv = right_inverse(layout_mn).with_shape(make_shape(size(thr_layout), size(val_layout)));
+  // Tiler for extracting relevant elements
+  // (M,N) -> tensor coord
+  auto tiler = product_each(shape(layout_mn));
+
+#if 0   // debug printing, compiled out
+  print("thr_layout: "); print(thr_layout); print("\n");
+  print("val_layout: "); print(val_layout); print("\n");
+  print("layout_mn : "); print(layout_mn);  print("\n");
+  print("layout_tv : "); print(layout_tv);  print("\n");
+  print("tiler     : "); print(tiler);      print("\n");
+#endif
+
+  return make_tiled_copy_impl(copy_atom, layout_tv, tiler);
+}
+
+/** Produce a TiledCopy from thread and value offset maps.
+ * The TV Layout maps threads and values to the codomain of the data_layout.
+ * It is verified at compile time that the intended codomain is valid within data_layout.
+ * Useful when threads and values don't care about owning specific coordinates, but
+ *   care more about the vector-width and offsets between them.
+ */
+template <class... Args, class AtomTVLayout, class DataLayout>
+CUTE_HOST_DEVICE constexpr
+auto
+make_cotiled_copy(Copy_Atom<Args...> const& copy_atom,
+                  AtomTVLayout const& atom_tv_layout,   // atom (thr,val) -> data addr
+                  DataLayout   const& data_layout)      // coord          -> data addr    The target layout
+{
+  static_assert(is_static<AtomTVLayout>::value);   // both layouts must be fully static
+  static_assert(is_static<DataLayout>::value);
+
+  // data addr -> data coord    Append 1:0 so off-the-ends get the stride-0
+  auto inv_data_layout = make_layout(left_inverse(data_layout), Layout<_1,_0>{});
+
+  // (tid,vid) -> data_coord
+  auto layout_tv_data = composition(inv_data_layout, atom_tv_layout);
+
+  // Check validity: mapping the value mode back through data_layout must reproduce the atom's value layout
+  CUTE_STATIC_ASSERT_V(coalesce(composition(data_layout, layout<1>(layout_tv_data))) == coalesce(layout<1>(atom_tv_layout)),
+                       "The memory pointed to by AtomTVLayout does not exist in the DataLayout.");
+
+#if 0   // debug printing, compiled out
+  if (thread0()) {
+    print("data_layout        : "); print(data_layout); print("\n");
+    print("atom_tv_layout     : "); print(atom_tv_layout); print("\n");
+    print("layout_tv_data     : "); print(layout_tv_data); print("\n");
+  }
+#endif
+
+  //
+  // Tiler -- Find the active elements in the DATA tensor and generate a tiler to extract them
+  //
+
+  // Convert to the awkward by-mode tiler to preserve the modes of the tiled DATA
+  auto flat_data_shape = product_each(shape(data_layout));
+  auto flat_data_zeros = repeat<rank(flat_data_shape)>(Int<0>{});
+  // One tiler entry per mode i: a stride of 1 in mode i and 0 elsewhere, composed with layout_tv_data and filtered.
+  auto tiler = transform(make_seq<rank(flat_data_shape)>{}, [&](auto i) {
+    return filter(composition(make_layout(flat_data_shape, replace<i>(flat_data_zeros, Int<1>{})), layout_tv_data));
+  });
+
+  //
+  // Layout_TV -- Find the (tid,vid) -> tile coord transformation
+  //
+
+  // Apply the tiler to a reference and transform the codomain
+  // tile_coord -> data_coord
+  auto tile2data = composition(make_layout(flat_data_shape), tiler);
+
+  // (tid,vid) -> tile_coord
+  auto layout_tv = composition(left_inverse(tile2data), layout_tv_data);
+
+#if 0   // debug printing, compiled out
+  if (thread0()) {
+    print("tiler              : "); print(tiler); print("\n");
+    print("tile2data          : "); print(tile2data); print("\n");
+    print("layout_tv          : "); print(layout_tv); print("\n");
+  }
+#endif
+
+  return make_tiled_copy_impl(copy_atom, layout_tv, tiler);
+}
+
+// Build a TiledCopy from copy_atom that reuses the source-side TV layout and the tiler of tiled_copy
+template <class... Args, class TiledCopy>
+CUTE_HOST_DEVICE
+auto
+make_tiled_copy_S(Copy_Atom<Args...> const& copy_atom,
+                  TiledCopy          const& tiled_copy)
+{
+  using Tiler = typename TiledCopy::Tiler_MN;   // same tiler type as the reference tiled_copy
+  return make_tiled_copy_impl(copy_atom, tiled_copy.get_layoutS_TV(), Tiler{});
+}
+
+// Build a TiledCopy from copy_atom that reuses the destination-side TV layout and the tiler of tiled_copy
+template <class... Args, class TiledCopy>
+CUTE_HOST_DEVICE
+auto
+make_tiled_copy_D(Copy_Atom<Args...> const& copy_atom,
+                  TiledCopy          const& tiled_copy)
+{
+  using Tiler = typename TiledCopy::Tiler_MN;   // same tiler type as the reference tiled_copy
+  return make_tiled_copy_impl(copy_atom, tiled_copy.get_layoutD_TV(), Tiler{});
+}
+
+//
+// Size
+//
+
+// The logical size of a TiledCopy's Tiler_MN; the optional indices I... are forwarded to size<I...>
+template <int... I, class... Args>
+CUTE_HOST_DEVICE constexpr auto
+tile_size(TiledCopy<Args...> const&)
+{
+  using Tiler = typename TiledCopy<Args...>::Tiler_MN;
+  return size<I...>(Tiler{});
+}
+
+// The number of threads involved in a TiledCopy, returned as an instance of its TiledNumThr type
+template <class... Args>
+CUTE_HOST_DEVICE constexpr auto
+size(TiledCopy<Args...> const&)
+{
+  using NumThr = typename TiledCopy<Args...>::TiledNumThr;
+  return NumThr{};
+}
+
+//
+// Display utilities
+//
+
+template <class... Args, class T>
+CUTE_HOST_DEVICE
+void
+print(Copy_Atom<Copy_Traits<Args...>, T> const&)   // argument is unnamed: only type-level information is printed
+{
+  using Atom = Copy_Atom<Copy_Traits<Args...>, T>;
+  print("Copy_Atom\n");
+  print("  ThrID:        "); print(typename Atom::ThrID{});        print("\n");
+  print("  ValLayoutSrc: "); print(typename Atom::ValLayoutSrc{}); print("\n");
+  print("  ValLayoutDst: "); print(typename Atom::ValLayoutDst{}); print("\n");
+  print("  ValLayoutRef: "); print(typename Atom::ValLayoutRef{}); print("\n");
+  print("  ValueType:    "); print(sizeof_bits<typename Atom::ValType>::value); print("b\n");   // width of ValType in bits
+}
+
+template <class Atom, class... Args>
+CUTE_HOST_DEVICE
+void
+print(TiledCopy<Atom, Args...> const& copy, char const* pad = "")   // NOTE: pad is accepted but not used in the body
+{
+  using Copy = TiledCopy<Atom, Args...>;
+  print("TiledCopy\n");
+  print("  Tiler_MN:       "); print(typename Copy::Tiler_MN{});       print("\n");
+  print("  TiledLayout_TV: "); print(typename Copy::TiledLayout_TV{}); print("\n");
+  print(static_cast<Atom const&>(copy));   // finish by printing copy viewed as its Atom type
+}
+
+template <class TiledCopy, class ThrIdx>
+CUTE_HOST_DEVICE
+void
+print(ThrCopy<TiledCopy, ThrIdx> const& thr_copy)
+{
+  print("ThrCopy\n");
+  print("  ThrIdx: "); print(thr_copy.thr_idx_); print("\n");   // the thread index stored in this ThrCopy
+  print(TiledCopy{});   // prints the TiledCopy template argument through a default-constructed instance
+}
+
+// TiledCopy to LaTeX TikZ: renders the source and destination layouts returned by copy
+template <class... Args, class TikzColorFn = TikzColor_TV>
+CUTE_HOST_DEVICE
+auto
+print_latex(TiledCopy<Args...> const& copy,
+            TikzColorFn color = {})              // lambda(thr_idx,val_idx) -> tikz color string
+{
+  auto [layoutS_MN, thrID_S] = copy.get_layoutS_MN();
+  auto [layoutD_MN, thrID_D] = copy.get_layoutD_MN();
+  // Forward the caller's color functor; without it print_latex_copy always fell back to its default
+  print_latex_copy(layoutS_MN, thrID_S,
+                   layoutD_MN, thrID_D, color);
+}
+
+// Copy layouts to LaTeX TikZ: draws S as a grid at column 0 and D as a second grid shifted by size<1>(S)+3 columns
+template <class LayoutS, class ThrIDS,
+          class LayoutD, class ThrIDD,
+          class TikzColorFn = TikzColor_TV>
+CUTE_HOST_DEVICE
+void
+print_latex_copy(LayoutS const& S, ThrIDS const& TS,  // (m,n) -> (tid,vid)  and  tid -> thr_idx
+                 LayoutD const& D, ThrIDD const& TD,  // (m,n) -> (tid,vid)  and  tid -> thr_idx
+                 TikzColorFn color = {})              // lambda(thr_idx,val_idx) -> tikz color string
+{
+  CUTE_STATIC_ASSERT_V(rank(S) == Int<2>{});
+  CUTE_STATIC_ASSERT_V(rank(D) == Int<2>{});
+
+  assert(size<0>(S) == size<0>(D));
+  assert(size<1>(S) == size<1>(D));
+
+  // Echo the inputs as LaTeX comment lines (each starts with a literal percent sign)
+  printf("%% LayoutS: "); print(S);  printf("\n");
+  printf("%% ThrIDS : "); print(TS); printf("\n");
+  printf("%% LayoutD: "); print(D);  printf("\n");
+  printf("%% ThrIDD : "); print(TD); printf("\n\n");
+
+  // Header
+  printf("\\documentclass[convert]{standalone}\n"
+         "\\usepackage{tikz}\n\n"
+         "\\begin{document}\n"
+         "\\begin{tikzpicture}[x={(0cm,-1cm)},y={(1cm,0cm)},every node/.style={minimum size=1cm, outer sep=0pt}]\n\n");
+
+  // S starting at 0,0: S(i,j) is split into a thread id (mod size(TS)) and a value index (div size(TS))
+  for (int i = 0; i < size<0>(S); ++i) {
+    for (int j = 0; j < size<1>(S); ++j) {
+      int thrid   = S(i,j) % size(TS);
+      int val_idx = S(i,j) / size(TS);
+      int thr_idx = TS(thrid);
+
+      printf("\\node[fill=%s] at (%d,%d) {\\shortstack{T%d \\\\ V%d}};\n",
+             color(thr_idx, val_idx),
+             i, j,
+             thr_idx, val_idx);
+    }
+  }
+  // Grid
+  printf("\\draw[color=black,thick,shift={(-0.5,-0.5)}] (%d,%d) grid (%d,%d);\n\n",
+         0, 0, int(size<0>(S)), int(size<1>(S)));
+  // S Labels: row indices in column -1, column indices in row -1
+  for (int i =  0, j = -1; i < size<0>(S); ++i) {
+    printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", i, j, i);
+  }
+  for (int i = -1, j =  0; j < size<1>(S); ++j) {
+    printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", i, j, j);
+  }
+
+  // D starting at 0,size<1>(S)+3: same decomposition as S, using D and TD
+  for (int i = 0; i < size<0>(D); ++i) {
+    for (int j = 0; j < size<1>(D); ++j) {
+      int thrid   = D(i,j) % size(TD);
+      int val_idx = D(i,j) / size(TD);
+      int thr_idx = TD(thrid);
+
+      printf("\\node[fill=%s] at (%d,%d) {\\shortstack{T%d \\\\ V%d}};\n",
+             color(thr_idx, val_idx),
+             i, int(j + size<1>(S) + 3),   // cast so the variadic argument always matches %d
+             thr_idx, val_idx);
+    }
+  }
+  // Grid
+  printf("\\draw[color=black,thick,shift={(-0.5,-0.5)}] (%d,%d) grid (%d,%d);\n\n",
+         0, int(size<1>(S)+3), int(size<0>(D)), int(size<1>(D)+size<1>(S)+3));
+  // D Labels: row indices go one column past the right edge of the D grid, column indices in row -1
+  for (int i = 0, j = size<1>(D); i < size<0>(D); ++i) {
+    printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", i, int(j + size<1>(S) + 3), i);
+  }
+  for (int i = -1, j =         0; j < size<1>(D); ++j) {
+    printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", i, int(j + size<1>(S) + 3), j);
+  }
+
+  // Footer
+  printf("\\end{tikzpicture}\n"
+         "\\end{document}\n");
+}
+
+} // end namespace cute
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <cute/atom/copy_traits_sm50.hpp>
+#include <cute/atom/copy_traits_sm75.hpp>
+#include <cute/atom/copy_traits_sm80.hpp>
+#include <cute/atom/copy_traits_sm90.hpp>
+
+// Config
+#if (__CUDACC_VER_MAJOR__ >= 12)
+#  define CUTE_COPY_ATOM_TMA_SM90_ENABLED
+#endif
+
+#if defined(CUTE_COPY_ATOM_TMA_SM90_ENABLED)
+#include <cute/atom/copy_traits_sm90_tma.hpp>
+#endif
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cute/atom/copy_traits.hpp b/lightllm-kernel/cutlass/include/cute/atom/copy_traits.hpp
new file mode 100755
index 000000000..bfbeb4ea5
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/atom/copy_traits.hpp
@@ -0,0 +1,159 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/arch/copy.hpp>
+
+#include <cute/tensor_impl.hpp>
+
+namespace cute
+{
+
+/**
+ * concept Copy_Traits
+ * {
+ *   using ThrID     =    // Logical thread id (tid) -> tidx
+ *
+ *   using SrcLayout =    // (Logical src thread id (tid), Logical src value id (vid)) -> bit
+ *   using DstLayout =    // (Logical dst thread id (tid), Logical dst value id (vid)) -> bit
+ *   using RefLayout =    // (Logical ref thread id (tid), Logical ref value id (vid)) -> bit
+ * };
+ *
+ * The abstract bit ordering of the Copy_Traits (the codomain of SrcLayout, DstLayout, and RefLayout)
+ * is arbitrary and only used to construct maps
+ *   (ref-tid,ref-vid) -> (src-tid,src-vid)
+ *   (ref-tid,ref-vid) -> (dst-tid,dst-vid)
+ * in TiledCopy. The Layout_TV in TiledCopy is in accordance with the RefLayout of a Traits, then mapped to
+ * the Src or Dst (tid,vid) representation on demand.
+ *
+ */
+
+template <class CopyOperation, class... CopyOpArgs>
+struct Copy_Traits   // primary template: only reached when no specialization exists for CopyOperation
+{
+  static_assert(dependent_false<CopyOperation>, "Copy_Traits not implemented for this CopyOperation.");
+};
+
+template <class S, class D>
+struct Copy_Traits<UniversalCopy<S,D>>
+{
+  // Logical thread id to thread idx (one-thread)
+  using ThrID = Layout<_1>;
+
+  // Map from (src-thr,src-val) to bit: one thread, sizeof_bits<S>::value values
+  using SrcLayout = Layout<Shape<_1,Int<sizeof_bits<S>::value>>>;
+  // Map from (dst-thr,dst-val) to bit: one thread, sizeof_bits<D>::value values
+  using DstLayout = Layout<Shape<_1,Int<sizeof_bits<D>::value>>>;
+
+  // Reference map from (thr,val) to bit (the source layout is the reference)
+  using RefLayout = SrcLayout;
+};
+
+template <int MaxVecBits>
+struct Copy_Traits<AutoVectorizingCopyWithAssumedAlignment<MaxVecBits>>
+{
+  // Logical thread id to thread idx (one-thread)
+  using ThrID = Layout<_1>;
+
+  // Map from (src-thr,src-val) to bit: 1x1 shape with stride 0; MaxVecBits is not used here
+  using SrcLayout = Layout<Shape<_1,_1>, Stride<_0,_0>>;
+  // Map from (dst-thr,dst-val) to bit: 1x1 shape with stride 0; MaxVecBits is not used here
+  using DstLayout = Layout<Shape<_1,_1>, Stride<_0,_0>>;
+
+  // Reference map from (thr,val) to bit (the source layout is the reference)
+  using RefLayout = SrcLayout;
+};
+
+//
+// Generic copy_unpack for common argument-based Copy_Traits
+//
+
+template <class CopyOp, class... Args,
+          class SEngine, class SLayout,
+          class DEngine, class DLayout>
+CUTE_HOST_DEVICE constexpr
+void
+copy_unpack(Copy_Traits<CopyOp,Args...> const&,
+            Tensor<SEngine,SLayout>     const& src,
+            Tensor<DEngine,DLayout>          & dst)
+{
+  // Specializations can generalize on these checks (disabled here; TS/TD are not names in this template)
+  //static_assert(is_smem<TS>::value, "Expected smem for this Copy_Traits<CopyOp>");
+  //static_assert(is_rmem<TD>::value, "Expected rmem for this Copy_Traits<CopyOp>");
+
+  using RegistersSrc = typename CopyOp::SRegisters;   // array types declared by the CopyOp
+  using RegistersDst = typename CopyOp::DRegisters;
+  using RegTypeSrc   = typename remove_extent<RegistersSrc>::type;   // element type of each array
+  using RegTypeDst   = typename remove_extent<RegistersDst>::type;
+  constexpr int RegNumSrc = extent<RegistersSrc>::value;   // number of elements in each array
+  constexpr int RegNumDst = extent<RegistersDst>::value;
+
+  Tensor rS = recast<RegTypeSrc>(src);   // view src/dst as tensors of the CopyOp's register element types
+  Tensor rD = recast<RegTypeDst>(dst);
+
+  CUTE_STATIC_ASSERT_V(size(rS) == Int<RegNumSrc>{},
+    "Copy_Traits: src failed to vectorize into registers. Layout is incompatible with this CopyOp.");
+  CUTE_STATIC_ASSERT_V(size(rD) == Int<RegNumDst>{},
+    "Copy_Traits: dst failed to vectorize into registers. Layout is incompatible with this CopyOp.");
+
+  detail::explode(detail::CallCOPY<CopyOp>{},   // presumably invokes CopyOp::copy on the indexed elements -- confirm in detail::explode
+                  rS, make_int_sequence<RegNumSrc>{},
+                  rD, make_int_sequence<RegNumDst>{});
+}
+
+//
+// Accept mutable temporaries
+//
+
+template <class CopyOp, class... Args,
+          class SEngine, class SLayout,
+          class DEngine, class DLayout>
+CUTE_HOST_DEVICE constexpr
+void
+copy_unpack(Copy_Traits<CopyOp,Args...> const& traits,
+            Tensor<SEngine,SLayout>     const& src,
+            Tensor<DEngine,DLayout>         && dst)
+{
+  copy_unpack(traits, src, dst);   // dst is an lvalue inside this body, so an overload taking Tensor& is selected
+}
+
+namespace detail {
+// is_prefetch<CopyOp>: false unless the partial specialization below applies
+template <class CopyOp, class = void>
+constexpr bool is_prefetch = false;
+// Chosen when CopyOp declares a nested PREFETCH type: true only if that type is CopyOp itself
+template <class CopyOp>
+constexpr bool is_prefetch<CopyOp, void_t<typename CopyOp::PREFETCH>> = is_same_v<CopyOp, typename CopyOp::PREFETCH>;
+
+} // end namespace detail
+
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm50.hpp b/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm50.hpp
new file mode 100755
index 000000000..7a693805e
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm50.hpp
@@ -0,0 +1,75 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/arch/copy_sm50.hpp>
+#include <cute/atom/copy_traits.hpp>
+
+#include <cute/layout.hpp>
+
+namespace cute
+{
+
+template <>
+struct Copy_Traits<SM50_Shuffle_U32_2x2Trans_XOR1>
+{
+  // Logical thread id to thread idx (32 threads, i.e. a warp -- not a single thread)
+  using ThrID = Layout<_32>;
+
+  // Map from (src-thr,src-val) to bit: thread t, value v -> bit 64*t + v
+  using SrcLayout = Layout<Shape <_32,_64>,
+                           Stride<_64, _1>>;
+  // Map from (dst-thr,dst-val) to bit
+  using DstLayout = Layout<Shape <Shape < _2,  _16>,Shape <_32,  _2>>,
+                           Stride<Stride<_32, _128>,Stride< _1, _64>>>;
+
+  // Reference map from (thr,val) to bit (the source layout is the reference)
+  using RefLayout = SrcLayout;
+};
+
+template <>
+struct Copy_Traits<SM50_Shuffle_U32_2x2Trans_XOR4>
+{
+  // Logical thread id to thread idx (32 threads, i.e. a warp -- not a single thread)
+  using ThrID = Layout<_32>;
+ 
+  // Map from (src-thr,src-val) to bit: thread t, value v -> bit 64*t + v
+  using SrcLayout = Layout<Shape <_32,_64>,
+                           Stride<_64, _1>>;
+  // Map from (dst-thr,dst-val) to bit
+  using DstLayout = Layout<Shape <Shape < _4,  _2,   _4>, Shape<_32,   _2>>,
+                           Stride<Stride<_64, _32, _512>,Stride< _1, _256>>>;
+
+  // Reference map from (thr,val) to bit (the source layout is the reference)
+  using RefLayout = SrcLayout;
+};
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm75.hpp b/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm75.hpp
new file mode 100755
index 000000000..9ad82c617
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm75.hpp
@@ -0,0 +1,143 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/arch/copy_sm75.hpp>
+#include <cute/atom/copy_traits.hpp>
+
+#include <cute/layout.hpp>
+
+namespace cute
+{
+
+template <>
+struct Copy_Traits<SM75_U32x1_LDSM_N>
+{
+  // Logical thread id to thread idx (warp)
+  using ThrID = Layout<_32>;
+
+  // Map from (src-thr,src-val) to bit; the extent-4 thread sub-mode has stride 0, leaving 8 distinct 128-bit offsets
+  using SrcLayout = Layout<Shape <Shape <  _8,_4>,_128>,
+                           Stride<Stride<_128,_0>,  _1>>;
+  // Map from (dst-thr,dst-val) to bit; 32 values per thread
+  using DstLayout = Layout<Shape <_32,_32>,
+                           Stride<_32, _1>>;
+
+  // Reference map from (thr,val) to bit (the destination layout is the reference)
+  using RefLayout = DstLayout;
+};
+
+template <>
+struct Copy_Traits<SM75_U32x2_LDSM_N>
+{
+  // Logical thread id to thread idx (warp)
+  using ThrID = Layout<_32>;
+
+  // Map from (src-thr,src-val) to bit; the extent-2 thread sub-mode has stride 0, leaving 16 distinct 128-bit offsets
+  using SrcLayout = Layout<Shape <Shape < _16,_2>,_128>,
+                           Stride<Stride<_128,_0>,  _1>>;
+  // Map from (dst-thr,dst-val) to bit; 32x2 values per thread
+  using DstLayout = Layout<Shape <_32,Shape <_32,   _2>>,
+                           Stride<_32,Stride< _1,_1024>>>;
+
+  // Reference map from (thr,val) to bit (the destination layout is the reference)
+  using RefLayout = DstLayout;
+};
+
+template <>
+struct Copy_Traits<SM75_U32x4_LDSM_N>
+{
+  // Logical thread id to thread idx (warp)
+  using ThrID = Layout<_32>;
+
+  // Map from (src-thr,src-val) to bit; all 32 threads map to distinct 128-bit offsets
+  using SrcLayout = Layout<Shape < _32,_128>,
+                           Stride<_128,  _1>>;
+  // Map from (dst-thr,dst-val) to bit; 32x4 values per thread
+  using DstLayout = Layout<Shape <_32,Shape <_32,   _4>>,
+                           Stride<_32,Stride< _1,_1024>>>;
+
+  // Reference map from (thr,val) to bit (the destination layout is the reference)
+  using RefLayout = DstLayout;
+};
+
+template <>
+struct Copy_Traits<SM75_U16x2_LDSM_T>
+{
+  // Logical thread id to thread idx (warp)
+  using ThrID = Layout<_32>;
+
+  // Map from (src-thr,src-val) to bit; the extent-4 thread sub-mode has stride 0, leaving 8 distinct 128-bit offsets
+  using SrcLayout = Layout<Shape <Shape <  _8,_4>,_128>,
+                           Stride<Stride<_128,_0>,  _1>>;
+  // Map from (dst-thr,dst-val) to bit; 16x2 values per thread
+  using DstLayout = Layout<Shape <Shape <  _4, _8>,Shape <_16,  _2>>,
+                           Stride<Stride<_256,_16>,Stride< _1,_128>>>;
+
+  // Reference map from (thr,val) to bit (the destination layout is the reference)
+  using RefLayout = DstLayout;
+};
+
+template <>
+struct Copy_Traits<SM75_U16x4_LDSM_T>
+{
+  // Logical thread id to thread idx (warp)
+  using ThrID = Layout<_32>;
+
+  // Map from (src-thr,src-val) to bit; the extent-2 thread sub-mode has stride 0, leaving 16 distinct 128-bit offsets
+  using SrcLayout = Layout<Shape <Shape < _16,_2>,_128>,
+                           Stride<Stride<_128,_0>,  _1>>;
+  // Map from (dst-thr,dst-val) to bit; 16x2x2 values per thread
+  using DstLayout = Layout<Shape <Shape <  _4, _8>,Shape <_16,  _2,   _2>>,
+                           Stride<Stride<_256,_16>,Stride< _1,_128,_1024>>>;
+
+  // Reference map from (thr,val) to bit (the destination layout is the reference)
+  using RefLayout = DstLayout;
+};
+
+template <>
+struct Copy_Traits<SM75_U16x8_LDSM_T>
+{
+  // Logical thread id to thread idx (warp)
+  using ThrID = Layout<_32>;
+
+  // Map from (src-thr,src-val) to bit; all 32 threads map to distinct 128-bit offsets
+  using SrcLayout = Layout<Shape < _32,_128>,
+                           Stride<_128,  _1>>;
+  // Map from (dst-thr,dst-val) to bit; 16x2x4 values per thread
+  using DstLayout = Layout<Shape <Shape <  _4, _8>,Shape <_16,  _2,   _4>>,
+                           Stride<Stride<_256,_16>,Stride< _1,_128,_1024>>>;
+
+  // Reference map from (thr,val) to bit (the destination layout is the reference)
+  using RefLayout = DstLayout;
+};
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm80.hpp b/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm80.hpp
new file mode 100755
index 000000000..e5ff0b7b3
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm80.hpp
@@ -0,0 +1,194 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/arch/copy_sm80.hpp>
+#include <cute/atom/copy_traits.hpp>
+
+#include <cute/layout.hpp>
+
+namespace cute
+{
+
+template <class S, class D>
+struct Copy_Traits<SM80_CP_ASYNC_CACHEALWAYS<S,D>>
+{
+  // Logical thread id to thread idx (one-thread)
+  using ThrID = Layout<_1>;
+
+  // Map from (src-thr,src-val) to bit: one thread, sizeof_bits<S>::value values
+  using SrcLayout = Layout<Shape<_1,Int<sizeof_bits<S>::value>>>;
+  // Map from (dst-thr,dst-val) to bit: one thread, sizeof_bits<D>::value values
+  using DstLayout = Layout<Shape<_1,Int<sizeof_bits<D>::value>>>;
+
+  // Reference map from (thr,val) to bit (the source layout is the reference)
+  using RefLayout = SrcLayout;
+
+  // Construct a zfill variant with a given predicate value (brace-initializes the ZFILL traits from pred)
+  CUTE_HOST_DEVICE constexpr
+  Copy_Traits<SM80_CP_ASYNC_CACHEALWAYS_ZFILL<S,D>>
+  with(bool pred) const {
+    return {pred};
+  }
+};
+
+template <class S, class D>
+struct Copy_Traits<SM80_CP_ASYNC_CACHEGLOBAL<S,D>>
+{
+  // Logical thread id to thread idx (one-thread)
+  using ThrID = Layout<_1>;
+
+  // Map from (src-thr,src-val) to bit: one thread, sizeof_bits<S>::value values
+  using SrcLayout = Layout<Shape<_1,Int<sizeof_bits<S>::value>>>;
+  // Map from (dst-thr,dst-val) to bit: one thread, sizeof_bits<D>::value values
+  using DstLayout = Layout<Shape<_1,Int<sizeof_bits<D>::value>>>;
+
+  // Reference map from (thr,val) to bit (the source layout is the reference)
+  using RefLayout = SrcLayout;
+
+  // Construct a zfill variant with a given predicate value (brace-initializes the ZFILL traits from pred)
+  CUTE_HOST_DEVICE constexpr
+  Copy_Traits<SM80_CP_ASYNC_CACHEGLOBAL_ZFILL<S,D>>
+  with(bool pred) const {
+    return {pred};
+  }
+};
+
+template <class S, class D>
+struct Copy_Traits<SM80_CP_ASYNC_CACHEALWAYS_ZFILL<S,D>>
+{
+  // Logical thread id to thread idx (one-thread)
+  using ThrID = Layout<_1>;
+
+  // Map from (src-thr,src-val) to bit: one thread, sizeof_bits<S>::value values
+  using SrcLayout = Layout<Shape<_1,Int<sizeof_bits<S>::value>>>;
+  // Map from (dst-thr,dst-val) to bit: one thread, sizeof_bits<D>::value values
+  using DstLayout = Layout<Shape<_1,Int<sizeof_bits<D>::value>>>;
+
+  // Reference map from (thr,val) to bit (the source layout is the reference)
+  using RefLayout = SrcLayout;
+
+  // Predicate value that determines whether to load or zfill (false unless set by the creator)
+  bool pred = false;
+
+  // Overload copy_unpack for zfill variant to pass the predicate into the op
+  template <class TS, class SLayout,
+            class TD, class DLayout>
+  CUTE_HOST_DEVICE friend constexpr
+  void
+  copy_unpack(Copy_Traits        const& traits,
+              Tensor<TS,SLayout> const& src,
+              Tensor<TD,DLayout>      & dst)
+  {
+    static_assert(is_gmem<TS>::value, "Expected gmem source for cp.async.");
+    static_assert(is_smem<TD>::value, "Expected smem destination for cp.async.");
+
+    Tensor rS = recast<S>(src);   // view src as elements of type S and dst as elements of type D
+    Tensor rD = recast<D>(dst);
+
+    CUTE_STATIC_ASSERT_V(size(rS) == Int<1>{},
+      "In CopyAtom, src layout doesn't vectorize into registers. This src layout is incompatible with this tiled copy.");
+    CUTE_STATIC_ASSERT_V(size(rD) == Int<1>{},
+      "In CopyAtom, dst layout doesn't vectorize into registers. This dst layout is incompatible with this tiled copy.");
+
+    SM80_CP_ASYNC_CACHEALWAYS_ZFILL<S,D>::copy(rS[0], rD[0], traits.pred);   // exactly one element per side, as asserted above
+  }
+};
+
+template <class S, class D>
+struct Copy_Traits<SM80_CP_ASYNC_CACHEGLOBAL_ZFILL<S,D>>
+{
+  // Logical thread id to thread idx (one-thread)
+  using ThrID = Layout<_1>;
+
+  // Map from (src-thr,src-val) to bit: one thread, sizeof_bits<S>::value values
+  using SrcLayout = Layout<Shape<_1,Int<sizeof_bits<S>::value>>>;
+  // Map from (dst-thr,dst-val) to bit: one thread, sizeof_bits<D>::value values
+  using DstLayout = Layout<Shape<_1,Int<sizeof_bits<D>::value>>>;
+
+  // Reference map from (thr,val) to bit (the source layout is the reference)
+  using RefLayout = SrcLayout;
+
+  // Predicate value that determines whether to load or zfill (false unless set by the creator)
+  bool pred = false;
+
+  // Overload copy_unpack for zfill variant to pass the predicate into the op
+  template <class TS, class SLayout,
+            class TD, class DLayout>
+  CUTE_HOST_DEVICE friend constexpr
+  void
+  copy_unpack(Copy_Traits        const& traits,
+              Tensor<TS,SLayout> const& src,
+              Tensor<TD,DLayout>      & dst)
+  {
+    static_assert(is_gmem<TS>::value, "Expected gmem source for cp.async.");
+    static_assert(is_smem<TD>::value, "Expected smem destination for cp.async.");
+
+    Tensor rS = recast<S>(src);   // view src as elements of type S and dst as elements of type D
+    Tensor rD = recast<D>(dst);
+
+    CUTE_STATIC_ASSERT_V(size(rS) == Int<1>{},
+      "In CopyAtom, src layout doesn't vectorize into registers. This src layout is incompatible with this tiled copy.");
+    CUTE_STATIC_ASSERT_V(size(rD) == Int<1>{},
+      "In CopyAtom, dst layout doesn't vectorize into registers. This dst layout is incompatible with this tiled copy.");
+
+    SM80_CP_ASYNC_CACHEGLOBAL_ZFILL<S,D>::copy(rS[0], rD[0], traits.pred);   // exactly one element per side, as asserted above
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Element copy selector: returns a copy-operation object chosen from the tensors' memory kinds and value_type sizes
+template <class SrcTensor, class DstTensor>
+CUTE_HOST_DEVICE constexpr
+auto
+select_elementwise_copy(SrcTensor const&, DstTensor const&)
+{
+  using SrcType = typename SrcTensor::value_type;
+  using DstType = typename DstTensor::value_type;
+
+#if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED)
+  if constexpr (is_gmem<SrcTensor>::value && is_smem<DstTensor>::value &&   // gmem -> smem only
+                sizeof(SrcType) == sizeof(DstType) &&                       // equally sized elements
+               (sizeof(SrcType) == 4 || sizeof(SrcType) == 8 || sizeof(SrcType) == 16))   // 4-, 8- or 16-byte elements
+  {
+    return SM80_CP_ASYNC_CACHEALWAYS<SrcType,DstType>{};
+  } else {
+    return UniversalCopy<SrcType,DstType>{};   // every other combination uses the plain copy
+  }
+
+  CUTE_GCC_UNREACHABLE;
+#else   // cp.async not enabled: always the plain copy
+  return UniversalCopy<SrcType,DstType>{};
+#endif
+}
+
+}
diff --git a/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm90.hpp b/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm90.hpp
new file mode 100755
index 000000000..f9590848a
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm90.hpp
@@ -0,0 +1,132 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/arch/copy_sm90.hpp>
+#include <cute/atom/copy_traits.hpp>
+#include <cute/atom/copy_traits_sm75.hpp>
+
+#include <cute/layout.hpp>
+
+namespace cute
+{
+
+template <>
+struct Copy_Traits<SM90_U32x1_STSM_N>
+{
+  // Reuses the SM75_U32x1_LDSM_N bit maps with source and destination swapped.
+
+  // (dst-thr,dst-val) -> bit: the LDSM source map
+  using DstLayout = Copy_Traits<SM75_U32x1_LDSM_N>::SrcLayout;
+  // (src-thr,src-val) -> bit: the LDSM destination map
+  using SrcLayout = Copy_Traits<SM75_U32x1_LDSM_N>::DstLayout;
+  // (thr,val) -> bit reference map; identical to the source map
+  using RefLayout = SrcLayout;
+  // Logical thread id -> thread idx (warp)
+  using ThrID     = Layout<_32>;
+};
+
+template <>
+struct Copy_Traits<SM90_U32x2_STSM_N>
+{
+  // Reuses the SM75_U32x2_LDSM_N bit maps with source and destination swapped.
+
+  // (dst-thr,dst-val) -> bit: the LDSM source map
+  using DstLayout = Copy_Traits<SM75_U32x2_LDSM_N>::SrcLayout;
+  // (src-thr,src-val) -> bit: the LDSM destination map
+  using SrcLayout = Copy_Traits<SM75_U32x2_LDSM_N>::DstLayout;
+  // (thr,val) -> bit reference map; identical to the source map
+  using RefLayout = SrcLayout;
+  // Logical thread id -> thread idx (warp)
+  using ThrID     = Layout<_32>;
+};
+
+template <>
+struct Copy_Traits<SM90_U32x4_STSM_N>
+{
+  // Reuses the SM75_U32x4_LDSM_N bit maps with source and destination swapped.
+
+  // (dst-thr,dst-val) -> bit: the LDSM source map
+  using DstLayout = Copy_Traits<SM75_U32x4_LDSM_N>::SrcLayout;
+  // (src-thr,src-val) -> bit: the LDSM destination map
+  using SrcLayout = Copy_Traits<SM75_U32x4_LDSM_N>::DstLayout;
+  // (thr,val) -> bit reference map; identical to the source map
+  using RefLayout = SrcLayout;
+  // Logical thread id -> thread idx (warp)
+  using ThrID     = Layout<_32>;
+};
+
+template <>
+struct Copy_Traits<SM90_U16x2_STSM_T>
+{
+  // Reuses the SM75_U16x2_LDSM_T bit maps with source and destination swapped.
+
+  // (dst-thr,dst-val) -> bit: the LDSM source map
+  using DstLayout = Copy_Traits<SM75_U16x2_LDSM_T>::SrcLayout;
+  // (src-thr,src-val) -> bit: the LDSM destination map
+  using SrcLayout = Copy_Traits<SM75_U16x2_LDSM_T>::DstLayout;
+  // (thr,val) -> bit reference map; identical to the source map
+  using RefLayout = SrcLayout;
+  // Logical thread id -> thread idx (warp)
+  using ThrID     = Layout<_32>;
+};
+
+template <>
+struct Copy_Traits<SM90_U16x4_STSM_T>
+{
+  // Reuses the SM75_U16x4_LDSM_T bit maps with source and destination swapped.
+
+  // (dst-thr,dst-val) -> bit: the LDSM source map
+  using DstLayout = Copy_Traits<SM75_U16x4_LDSM_T>::SrcLayout;
+  // (src-thr,src-val) -> bit: the LDSM destination map
+  using SrcLayout = Copy_Traits<SM75_U16x4_LDSM_T>::DstLayout;
+  // (thr,val) -> bit reference map; identical to the source map
+  using RefLayout = SrcLayout;
+  // Logical thread id -> thread idx (warp)
+  using ThrID     = Layout<_32>;
+};
+
+template <>
+struct Copy_Traits<SM90_U16x8_STSM_T>
+{
+  // Reuses the SM75_U16x8_LDSM_T bit maps with source and destination swapped.
+
+  // (dst-thr,dst-val) -> bit: the LDSM source map
+  using DstLayout = Copy_Traits<SM75_U16x8_LDSM_T>::SrcLayout;
+  // (src-thr,src-val) -> bit: the LDSM destination map
+  using SrcLayout = Copy_Traits<SM75_U16x8_LDSM_T>::DstLayout;
+  // (thr,val) -> bit reference map; identical to the source map
+  using RefLayout = SrcLayout;
+  // Logical thread id -> thread idx (warp)
+  using ThrID     = Layout<_32>;
+};
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm90_im2col.hpp b/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm90_im2col.hpp
new file mode 100755
index 000000000..54f76073b
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm90_im2col.hpp
@@ -0,0 +1,940 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+/*! \file
+  \brief im2col make_tma_copy
+*/
+
+#include "cute/arch/copy_sm90.hpp"
+#include "cute/arch/copy_sm90_desc.hpp"
+#include "cute/tensor.hpp"
+
+#include "cute/algorithm/prefetch.hpp"
+#include "cutlass/fast_math.h"
+#include "cutlass/cuda_host_adapter.hpp"
+
+namespace cute
+{
+
+// Mixin providing copy_unpack to the im2col TMA load and prefetch Copy_Traits
+template <class CopyOp>
+struct TMA_LOAD_IM2COL_Unpack
+{
+  /// Invoke CopyOp for one tile.
+  ///
+  /// @param traits Copy traits whose opargs_ tuple supplies the leading
+  ///   CopyOp arguments; its TMA descriptor must match the input
+  ///   tensor and the other convolution parameters.
+  ///
+  /// @param src Tile of the im2col-transformed coordinate tensor
+  ///   (result of get_tma_tensor); its first element gives the
+  ///   coordinate in global memory from which to load.
+  ///
+  /// @param dst Shared memory tile to load into (unused by prefetch ops).
+  template <class... Args,
+            class TS, class SLayout,
+            class TD, class DLayout>
+  CUTE_HOST_DEVICE friend constexpr void
+  copy_unpack(Copy_Traits<CopyOp, Args...> const& traits,
+              Tensor<TS,SLayout>           const& src, // tile of the transformed global activation (A) tensor
+              Tensor<TD,DLayout>                & dst) // shared memory tile
+  {
+    // The tile's first coordinate is read as (c, ([w,h,d]), n, ([s,r,t]))
+    auto tile_coord = src(Int<0>{});
+    CUTE_STATIC_ASSERT_V(rank(tile_coord) == _4{});
+    CUTE_STATIC_ASSERT_V(rank<1>(tile_coord) == rank<3>(tile_coord));
+    auto flat_coord = flatten(tile_coord);
+
+    if constexpr (!detail::is_prefetch<CopyOp>) {
+      static_assert(is_smem<TD>::value, "SM90_TMA_LOAD_IM2COL requires the destination be shared memory.");
+      void* smem_ptr = cute::raw_pointer_cast(dst.data());
+      return detail::explode_tuple(detail::CallCOPY<CopyOp>{},
+                                   traits.opargs_, tuple_seq<decltype(traits.opargs_)>{},
+                                   make_tuple(smem_ptr), seq<0>{},
+                                   flat_coord, tuple_seq<decltype(flat_coord)>{});
+    } else {
+      return detail::explode_tuple(detail::CallCOPY<CopyOp>{},
+                                   traits.opargs_, tuple_seq<decltype(traits.opargs_)>{},
+                                   flat_coord, tuple_seq<decltype(flat_coord)>{});
+    }
+  }
+};
+
+// Copy_Traits for SM90 im2col TMA load comes in two layers.
+//
+// 1. Copy_Traits<SM90_TMA_LOAD_IM2COL>
+// 2. Copy_Traits<SM90_TMA_LOAD_IM2COL_OP>
+//
+// Copy_Traits<SM90_TMA_LOAD_IM2COL>
+// is the "outer" layer.  It has a TMA descriptor,
+// but no barrier ("tma_mbar"), so it's "nonexecutable":
+// its copy_unpack is deleted.  One calls its "with"
+// member function with a barrier to get an executable
+// "inner"-layer Copy_Traits<SM90_TMA_LOAD_IM2COL_OP> object.
+// That object's "copy_unpack" friend function (provided by its
+// TMA_LOAD_IM2COL_Unpack base) actually invokes im2col TMA load.
+
+struct SM90_TMA_LOAD_IM2COL_OP : SM90_TMA_LOAD_IM2COL {};
+
+/// @brief Non-executable specialization of Copy_Traits for SM90
+///   im2col TMA load, with TMA descriptor but no barrier.
+///
+/// Use `.with(memory_barrier)` to construct an executable version.
+template <class NumBitsPerTMA, class TMATensor>
+struct Copy_Traits<SM90_TMA_LOAD_IM2COL, NumBitsPerTMA, TMATensor>
+{
+  using ThrID = Layout<_1>;
+  // Map from (src-thr,src-val) to bit
+  using SrcLayout = Layout<Shape<_1, NumBitsPerTMA>>;
+  // Map from (dst-thr,dst-val) to bit
+  using DstLayout = Layout<Shape<_1, NumBitsPerTMA>>;
+  // Reference map from (thr,val) to bit
+  using RefLayout = SrcLayout;
+
+  Im2ColTmaDescriptor tma_desc_;  // descriptor stored by value; with() hands out its address
+  TMATensor tma_tensor_;          // tensor returned by get_tma_tensor()
+
+  CUTE_HOST_DEVICE constexpr
+  Im2ColTmaDescriptor const*
+  get_tma_descriptor() const
+  {
+    return &tma_desc_;
+  }
+
+  template <class GShape>
+  CUTE_HOST_DEVICE constexpr
+  TMATensor const
+  get_tma_tensor(GShape const&) const  // the GShape argument is not used
+  {
+    return tma_tensor_;
+  }
+
+  /// @brief Get an executable specialization.
+  ///
+  /// Copy_Traits specializations with SM90_TMA_LOAD_IM2COL are not
+  /// directly executable.  Instead, call this "with" member function
+  /// to get an executable specialization.  The result stores the
+  /// addresses of this object's descriptor and of @c tma_mbar.
+  ///
+  /// @param tma_mbar Memory barrier for synchronization
+  ///
+  /// @param multicast_mask Multicast mask (unused; only exists
+  ///   for interface compatibility with the actual multicast Copy_Traits)
+  ///
+  /// @return Executable specialization of @c Copy_Traits
+  CUTE_HOST_DEVICE constexpr
+  Copy_Traits<SM90_TMA_LOAD_IM2COL_OP, NumBitsPerTMA>
+  with(uint64_t& tma_mbar, [[maybe_unused]] uint16_t const& multicast_mask = 0) const
+  {
+    return {{}, {&tma_desc_, &tma_mbar}};
+  }
+
+  // Copy_Traits specializations with SM90_TMA_LOAD_IM2COL
+  // are not directly executable, so copy_unpack is deleted here.
+  // Call .with to get an executable specialization.
+  template <class TS, class SLayout,
+            class TD, class DLayout>
+  CUTE_HOST_DEVICE friend constexpr void
+  copy_unpack(Copy_Traits        const& traits,
+              Tensor<TS,SLayout> const& src,
+              Tensor<TD,DLayout>      & dst) = delete;
+};
+
+/// @brief Executable specialization of Copy_Traits for SM90 im2col
+///   TMA load, with TMA descriptor and barrier.
+template <class NumBitsPerTMA>
+struct Copy_Traits<SM90_TMA_LOAD_IM2COL_OP, NumBitsPerTMA>
+     : TMA_LOAD_IM2COL_Unpack<SM90_TMA_LOAD_IM2COL_OP>  // supplies copy_unpack
+{
+  using ThrID = Layout<_1>;
+  // Map from (src-thr,src-val) to bit
+  using SrcLayout = Layout<Shape<_1, NumBitsPerTMA>>;
+  // Map from (dst-thr,dst-val) to bit
+  using DstLayout = Layout<Shape<_1, NumBitsPerTMA>>;
+  // Reference map from (thr,val) to bit
+  using RefLayout = SrcLayout;
+
+  // SM90_TMA_LOAD_IM2COL arguments; copy_unpack passes them ahead of the smem pointer and coordinates
+  tuple<
+  Im2ColTmaDescriptor const*, // address of a descriptor stored elsewhere (not a copy)
+  uint64_t* // smem mbarrier
+  > const opargs_;
+};
+
+template <class NumBitsPerTMA, class... Args>
+struct Copy_Traits<SM90_TMA_LOAD_IM2COL::PREFETCH, NumBitsPerTMA, Args...>
+     : TMA_LOAD_IM2COL_Unpack<SM90_TMA_LOAD_IM2COL::PREFETCH>  // supplies copy_unpack
+{
+  using ThrID = Layout<_1>;
+  // Map from (src-thr,src-val) to bit
+  using SrcLayout = Layout<Shape<_1, NumBitsPerTMA>>;
+  // Map from (dst-thr,dst-val) to bit
+  using DstLayout = Layout<Shape<_1, NumBitsPerTMA>>;
+  // Reference map from (thr,val) to bit
+  using RefLayout = SrcLayout;
+
+  // SM90_TMA_LOAD_IM2COL::PREFETCH arguments: only the descriptor address, no barrier
+  tuple<Im2ColTmaDescriptor const*> const opargs_;
+
+  CUTE_HOST_DEVICE  // builds the prefetch traits from the non-executable load traits
+  Copy_Traits(Copy_Traits<SM90_TMA_LOAD_IM2COL, NumBitsPerTMA, Args...> const& traits)
+    : opargs_({&traits.tma_desc_}) {}  // stores a pointer into traits, which must stay alive
+};
+
+//////////////////////////////////////////////////////////////////////////////
+///////////////////////////// TMA_LOAD_MULTICAST /////////////////////////////
+//////////////////////////////////////////////////////////////////////////////
+
+struct SM90_TMA_LOAD_IM2COL_MULTICAST_OP : SM90_TMA_LOAD_IM2COL_MULTICAST {}; // tag for the executable multicast Copy_Traits
+
+/// @brief Non-executable specialization of Copy_Traits for SM90
+///   im2col TMA load, with TMA descriptor but no barrier or multicast
+///   mask.
+///
+/// Use `.with(memory_barrier, multicast_mask)` to construct an executable version.
+template <class NumBitsPerTMA, class TMATensor>
+struct Copy_Traits<SM90_TMA_LOAD_IM2COL_MULTICAST, NumBitsPerTMA, TMATensor>
+{
+  using ThrID = Layout<_1>;
+  // Map from (src-thr,src-val) to bit
+  using SrcLayout = Layout<Shape<_1, NumBitsPerTMA>>;
+  // Map from (dst-thr,dst-val) to bit
+  using DstLayout = Layout<Shape<_1, NumBitsPerTMA>>;
+  // Reference map from (thr,val) to bit
+  using RefLayout = SrcLayout;
+
+  Im2ColTmaDescriptor tma_desc_;  // descriptor stored by value; with() hands out its address
+  TMATensor tma_tensor_;          // tensor returned by get_tma_tensor()
+
+  CUTE_HOST_DEVICE constexpr
+  Im2ColTmaDescriptor const*
+  get_tma_descriptor() const {
+    return &tma_desc_;
+  }
+
+  template <class GShape>
+  CUTE_HOST_DEVICE constexpr
+  TMATensor const
+  get_tma_tensor(GShape const&) const  // the GShape argument is not used
+  {
+    return tma_tensor_;
+  }
+
+  /// @brief Get an executable specialization.
+  ///
+  /// Copy_Traits specializations with SM90_TMA_LOAD_IM2COL_MULTICAST
+  /// are not directly executable.  Instead, call this "with" member
+  /// function to get an executable specialization.  The result stores
+  /// the addresses of this object's descriptor and of @c tma_mbar.
+  ///
+  /// @param tma_mbar Memory barrier for synchronization
+  ///
+  /// @param multicast_mask Multicast mask (required; there is no default)
+  ///
+  /// @return Executable specialization of @c Copy_Traits
+  CUTE_HOST_DEVICE constexpr
+  Copy_Traits<SM90_TMA_LOAD_IM2COL_MULTICAST_OP, NumBitsPerTMA>
+  with(uint64_t& tma_mbar, uint16_t const& multicast_mask) const {
+    return {{}, {&tma_desc_, &tma_mbar, multicast_mask}};
+  }
+
+  // Copy_Traits specializations with SM90_TMA_LOAD_IM2COL_MULTICAST
+  // are not directly executable, so copy_unpack is deleted here.
+  // Call .with to get an executable specialization.
+  template <class TS, class SLayout,
+            class TD, class DLayout>
+  CUTE_HOST_DEVICE friend constexpr void
+  copy_unpack(Copy_Traits        const& traits,
+              Tensor<TS,SLayout> const& src,
+              Tensor<TD,DLayout>      & dst) = delete;
+};
+
+/// @brief Executable specialization of Copy_Traits for SM90 multicast
+///   im2col TMA load, with TMA descriptor, barrier, and multicast mask.
+template <class NumBitsPerTMA>
+struct Copy_Traits<SM90_TMA_LOAD_IM2COL_MULTICAST_OP, NumBitsPerTMA>
+     : TMA_LOAD_IM2COL_Unpack<SM90_TMA_LOAD_IM2COL_MULTICAST_OP>  // supplies copy_unpack
+{
+  using ThrID = Layout<_1>;
+  // Map from (src-thr,src-val) to bit.
+  using SrcLayout = Layout<Shape<_1, NumBitsPerTMA>>;
+  // Map from (dst-thr,dst-val) to bit
+  using DstLayout = Layout<Shape<_1, NumBitsPerTMA>>;
+  // Reference map from (thr,val) to bit
+  using RefLayout = SrcLayout;
+
+  // SM90_TMA_LOAD_IM2COL_MULTICAST arguments; copy_unpack passes them ahead of the smem pointer and coordinates
+  tuple<
+  Im2ColTmaDescriptor const*, // address of a descriptor stored elsewhere (not a copy)
+  uint64_t*, // smem mbarrier
+  uint16_t   // multicast mask
+  > const opargs_;
+};
+
+//////////////////////////////////////////////////////////////////////////////
+///////////////////////////// TMA_STORE IM2COL////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////
+
+// Executable Copy_Traits for SM90_TMA_STORE_IM2COL; carries the TMA descriptor by value
+template <class NumBitsPerTMA, class TMATensor>
+struct Copy_Traits<SM90_TMA_STORE_IM2COL, NumBitsPerTMA, TMATensor>
+{
+  using ThrID = Layout<_1>;
+
+  // (src-thr,src-val) -> bit
+  using SrcLayout = Layout<Shape<_1,NumBitsPerTMA>>;
+  // (dst-thr,dst-val) -> bit
+  using DstLayout = Layout<Shape<_1,NumBitsPerTMA>>;
+
+  // (thr,val) -> bit reference map
+  using RefLayout = SrcLayout;
+
+  // State consumed by SM90_TMA_STORE_IM2COL
+  Im2ColTmaDescriptor tma_desc_;
+  TMATensor tma_tensor_;
+
+  // Address of the stored TmaDescriptor/TensorMap
+  CUTE_HOST_DEVICE constexpr
+  Im2ColTmaDescriptor const*
+  get_tma_descriptor() const {
+    return &tma_desc_;
+  }
+
+  // Copy of the stored TMA tensor; the shape argument is ignored
+  template <class GShape>
+  CUTE_HOST_DEVICE constexpr
+  TMATensor const
+  get_tma_tensor(GShape const&) const
+  {
+    return tma_tensor_;
+  }
+
+  // copy_unpack dispatch for this Copy_Traits:
+  //   src must be a smem tensor,
+  //   dst must be a gmem tensor with TmaCoordIterator .data()
+  template <class TS, class SLayout,
+            class TD, class DLayout>
+  CUTE_HOST_DEVICE friend constexpr void
+  copy_unpack(Copy_Traits        const& traits,
+              Tensor<TS,SLayout> const& src,
+              Tensor<TD,DLayout>      & dst)
+  {
+    static_assert(is_smem<TS>::value, "Expected smem src for SM90_TMA_STORE_IM2COL");
+    // Only the first three modes of the tile's starting coordinate are handed to the store
+    auto store_coord = flatten(take<0,3>(dst(Int<0>{})));
+    void const* const smem_src = cute::raw_pointer_cast(src.data());
+    void const* const tma_desc = traits.get_tma_descriptor();
+    return detail::explode_tuple(detail::CallCOPY<SM90_TMA_STORE_IM2COL>{},
+                                 make_tuple(tma_desc, smem_src), seq<0,1>{},
+                                 store_coord, tuple_seq<decltype(store_coord)>{});
+  }
+};
+
+namespace detail {
+
+/// @brief Creates a TMA descriptor and coordinate tensor for im2col TMA load.
+///
+/// @param tensor_cwhdn Global activation tensor (A matrix of Fprop).
+///   This is the original (not im2col-transformed) tensor in global
+///   memory.  Its rank must be 3, 4, or 5.
+///
+/// @param range_c,range_whdn Tile sizes (TILE_C, TILE_WHDN) forwarded to
+///   cuTensorMapEncodeIm2col.
+/// @param smem_swizzle Shared memory swizzle used to pick the TMA swizzle mode.
+/// @param tma_layout_vt TMA layout whose basis strides order the two GEMM modes.
+/// @param stride_whd Traversal strides convolution parameter.
+/// @param stride_srt Dilation strides convolution parameter.
+/// The *_whd and *_srt arguments are tuples with one entry per spatial mode
+/// (e.g., 3 for a 5-D convolution).
+///
+/// @return Tuple of (TMA descriptor, TMA coordinate tensor).  The descriptor
+///   is only encoded when __CUDACC_VER_MAJOR__ >= 12; otherwise it is zeroed.
+template <class EngineA, class LayoutA,
+          class SmemSwizzle, class TMALayout,
+          class LowerCornerStride,
+          class UpperCornerStride,
+          class LowerPaddingStride,
+          class UpperPaddingStride,
+          class TraversalStride,
+          class LowerSRTStride,
+          class DilationStride>
+CUTE_HOST
+auto
+make_im2col_tma_copy_desc(
+    Tensor<EngineA, LayoutA>    const& tensor_cwhdn,       // (C,W,H,D,N)
+    uint32_t                           range_c,            // TILE_C
+    uint32_t                           range_whdn,         // TILE_WHDN
+    SmemSwizzle                 const& smem_swizzle,       // Swizzle
+    TMALayout                   const& tma_layout_vt,      // TMA layout
+    LowerCornerStride           const& lower_corner_whd,   // WHD offset of the "base pointer"
+    UpperCornerStride           const& upper_corner_whd,   // WHD upper corner
+    LowerPaddingStride          const& lower_padding_whd,  // WHD lower padding
+    UpperPaddingStride          const& upper_padding_whd,  // WHD upper padding
+    TraversalStride             const& stride_whd,         // WHD traversal stride
+    LowerSRTStride              const& lower_srt,          // SRT offset of the "base pointer"
+    DilationStride              const& stride_srt,          // SRT stride - dilation
+    TMA::DescriptorAuxParams    const& aux_params = {})
+{
+  static_assert(is_gmem<EngineA>::value, "Tensor must point to GPU global memory.");
+  using value_type = typename EngineA::value_type;
+
+  constexpr uint32_t num_total_modes   = LayoutA::rank;
+  constexpr int      num_spatial_modes = num_total_modes - 2;
+  static_assert(num_total_modes >= 3 && num_total_modes <= 5, "im2col TMA requires a rank-3, -4, or -5 tensor (the shape/stride arrays below hold 5 entries).");
+
+  // Gmem starting address
+  void* gmem_address = (void*) raw_pointer_cast(tensor_cwhdn.data());
+
+  // Gmem extents are just the tensor shape
+  cute::array<uint64_t, 5> gmem_prob_shape = {1,1,1,1,1};
+  for_each(make_seq<num_total_modes>{}, [&](auto i) {
+    gmem_prob_shape[i] = static_cast<uint64_t>(shape<i>(tensor_cwhdn));
+  });
+
+  // Gmem strides are byte strides of the activation tensor in CWHDN order
+  cute::array<uint64_t, 5> gmem_prob_stride = {0,0,0,0,0};
+  for_each(make_seq<num_total_modes>{}, [&](auto i) {
+    gmem_prob_stride[i] = sizeof(value_type) * stride<i>(tensor_cwhdn);
+  });
+
+  // Traversal strides come from stride_whd and fill the spatial (WHD) entries, which start at index 1.
+  cute::array<uint32_t, 5> tma_traversal_strides = {1,1,1,1,1};
+  for_each(make_seq<num_spatial_modes>{}, [&](auto i) {
+    tma_traversal_strides[i+1] = static_cast<uint32_t>(get<i>(stride_whd));
+  });
+
+  cute::array<int32_t, num_spatial_modes> tma_lower_corner{};
+  for_each(make_seq<num_spatial_modes>{}, [&](auto i) {
+    tma_lower_corner[i] = static_cast<int32_t>(get<i>(lower_corner_whd));
+  });
+
+  cute::array<int32_t, num_spatial_modes> tma_upper_corner{};
+  for_each(make_seq<num_spatial_modes>{}, [&](auto i) {
+    tma_upper_corner[i] = static_cast<int32_t>(get<i>(upper_corner_whd));
+  });
+
+  Im2ColTmaDescriptor tma_desc{};  // zeroed so the result is well-defined when encoding is compiled out
+
+#if (__CUDACC_VER_MAJOR__ >= 12)
+
+  CUtensorMapDataType     tma_format      = TMA::to_CUtensorMapDataType<value_type>();
+  CUtensorMapInterleave   tma_interleave  = CU_TENSOR_MAP_INTERLEAVE_NONE;
+  CUtensorMapL2promotion  tma_l2Promotion = to_CUtensorMapL2promotion(aux_params.l2promo_);
+  CUtensorMapFloatOOBfill tma_oob_fill    = to_CUtensorMapFloatOOBfill(aux_params.oobfill_);
+  TMA::SmemSwizzleBits    swizzle_bits    = detail::get_tma_swizzle_bits(smem_swizzle);
+  TMA::SmemSwizzleBase    swizzle_base    = detail::get_tma_swizzle_base(smem_swizzle);
+  CUtensorMapSwizzle      tma_swizzle     = TMA::to_CUtensorMapSwizzle(swizzle_bits, swizzle_base);
+
+  [[maybe_unused]] CUresult encode_result = CUTLASS_CUDA_DRIVER_WRAPPER_CALL(cuTensorMapEncodeIm2col)(
+      &tma_desc,
+      tma_format,
+      num_total_modes,
+      gmem_address,
+      gmem_prob_shape.data(),
+      gmem_prob_stride.data() + 1, // gmem_prob_stride[0] implicitly sizeof(value_type)
+      tma_lower_corner.data(),
+      tma_upper_corner.data(),
+      range_c,
+      range_whdn,
+      tma_traversal_strides.data(),
+      tma_interleave,
+      tma_swizzle,
+      tma_l2Promotion,
+      tma_oob_fill);
+
+  // The extra asserts help indicate the error's cause (all are compiled out under NDEBUG).
+  assert(encode_result != CUDA_ERROR_DEINITIALIZED);
+  assert(encode_result != CUDA_ERROR_NOT_INITIALIZED);
+  assert(encode_result != CUDA_ERROR_INVALID_CONTEXT);
+  assert(encode_result != CUDA_ERROR_INVALID_VALUE);
+  assert(encode_result == CUDA_SUCCESS);
+
+#endif // (__CUDACC_VER_MAJOR__ >= 12)
+  //
+  // Calculate gemm shapes and linearized shapes based on tma layout tiling.
+  //
+
+  // Compute [w, h, d, n]
+  // q/p/z = (w/h/d + (upper_corner_whd - lower_corner_whd - 1)) / stride_whd + 1
+  auto gemm_mn_ = cute::transform(cute::make_seq<num_spatial_modes>{}, [&](auto i) {
+    return (shape<i+1>(tensor_cwhdn) + get<i>(upper_corner_whd) - get<i>(lower_corner_whd) - Int<1>{}) / get<i>(stride_whd) + Int<1>{};
+  });
+  auto gemm_mn = append(gemm_mn_, shape<num_spatial_modes+1>(tensor_cwhdn));
+
+  // Compute [c, s, r, t]
+  // stride_srt > 0: s/r/t = 1 + (upper_padding_whd - upper_corner_whd) / stride_srt
+  // otherwise:      s/r/t = 1 + (lower_corner_whd  - lower_padding_whd) / stride_srt
+  auto gemm_k_ = cute::transform(cute::make_seq<num_spatial_modes>{}, [&](auto i) {
+    auto padding_size = conditional_return(get<i>(stride_srt) > Int<0>{},
+                                           get<i>(upper_padding_whd) - get<i>(upper_corner_whd),
+                                           get<i>(lower_corner_whd)  - get<i>(lower_padding_whd));
+    return Int<1>{} + padding_size / get<i>(stride_srt);
+  });
+  auto gemm_k = prepend(gemm_k_, shape<0>(tensor_cwhdn));
+
+  // For fprop/dgrad kernel, gemm_shapes is ((q, p, z, n), (c, s, r, t))
+  // For wgrad kernel, gemm_shapes is ((c, s, r, t), (q, p, z, n))
+  auto gemm_shapes_common = make_shape(
+      transform_leaf(gemm_mn, [](auto s) {
+        return conditional_return(cute::is_static<decltype(s)>{}, s, cutlass::FastDivmod(s));
+      }),
+      gemm_k);
+  auto gemm_shapes = make_shape(
+      basis_get(stride<0,1>(tma_layout_vt), gemm_shapes_common),
+      basis_get(stride<0,0>(tma_layout_vt), gemm_shapes_common));
+
+  // For fprop/dgrad kernel, linearized shapes is (whdn, (c, s, r, t))
+  // For wgrad kernel linearized shapes is ((c, s, r, t), whdn)
+  auto linear_shapes_common = make_shape(size(gemm_mn), gemm_k);
+  auto linear_shapes = make_shape(
+      basis_get(stride<0,1>(tma_layout_vt), linear_shapes_common),
+      basis_get(stride<0,0>(tma_layout_vt), linear_shapes_common));
+
+  //
+  // Calculate gmem basis stride based on tma layout tiling.
+  //
+
+  auto tma_basis_scale = make_shape(Int<1>{}, stride_whd, Int<1>{}, stride_srt);
+  auto tma_basis = elem_scale(tma_basis_scale, make_basis_like(tma_basis_scale));
+
+  auto gbasis_strides_common = make_stride(
+      append(get<1>(tma_basis), get<2>(tma_basis)),
+      prepend(get<3>(tma_basis), get<0>(tma_basis)));    // ((w,h,d,n),(c,s,r,t))
+  auto gbasis_strides = make_stride(
+      basis_get(stride<0,1>(tma_layout_vt), gbasis_strides_common),
+      basis_get(stride<0,0>(tma_layout_vt), gbasis_strides_common));
+
+  //
+  // Create tma tensor
+  //
+
+  auto lower_corner = make_arithmetic_tuple(Int<0>{}, lower_corner_whd, Int<0>{}, lower_srt);
+
+  auto tensor_multimode = make_tensor(ArithmeticTupleIterator(lower_corner), gemm_shapes, gbasis_strides);
+  auto tensor_linear = make_identity_tensor(linear_shapes);
+  auto tma_tensor = make_tensor(tensor_multimode.data(), composition(
+      tensor_multimode.layout(),
+      tensor_linear(Int<0>{}),
+      tensor_linear.layout()));
+
+  return cute::make_tuple(tma_desc, tma_tensor);
+}
+
+/// @brief Builds the Copy_Atom for an im2col TMA copy of type CopyOp.
+///
+/// Derives from slayout and cta_v_map the (at most two) smem modes that vectorize
+/// into gmem, divides that layout among num_multicast CTAs, encodes the TMA
+/// descriptor through make_im2col_tma_copy_desc, and wraps the descriptor and its
+/// coordinate tensor in Copy_Traits<CopyOp, C<num_bits_per_tma>, TMATensor>.
+///
+/// @return Copy_Atom<Traits, GEngine::value_type> holding the descriptor and tensor.
+template <class CopyOp,
+          class GEngine, class GLayout,
+          class SLayout,
+          class VShape, class VStride,
+          class LowerCornerStride,
+          class UpperCornerStride,
+          class LowerPaddingStride,
+          class UpperPaddingStride,
+          class TraversalStride,
+          class LowerSRTStride,
+          class DilationStride>
+CUTE_HOST_RTC
+auto
+make_tma_atom_im2col(CopyOp,
+                     Tensor<GEngine,GLayout>      const& gtensor,           // Full GMEM Tensor: ((w, h, d, n), c)
+                     SLayout                      const& slayout,           // CTA Tile of SMEM, potentially swizzled
+                     int32_t                      const& num_multicast,     // The number of CTAs involved in multicasting
+                     Layout<VShape,VStride>       const& cta_v_map,         // V: CTA val idx -> gmem mode
+                     LowerCornerStride            const& lower_corner_whd,
+                     UpperCornerStride            const& upper_corner_whd,
+                     LowerPaddingStride           const& lower_padding_whd,
+                     UpperPaddingStride           const& upper_padding_whd,
+                     TraversalStride              const& stride_whd,        // traversal stride
+                     LowerSRTStride               const& lower_srt,
+                     DilationStride               const& stride_srt,        // dilation
+                     TMA::DescriptorAuxParams     const& aux_params = {})
+{
+  // TMA parameter checking
+
+  CUTE_STATIC_ASSERT_V(product_each(shape(slayout)) == product_each(shape(cta_v_map)),
+    "TMA requires CTA_Tile and SLayout top-level shape equivalence.");
+
+  // TMA slayout manipulation
+
+  // Invert the smem to get the largest contiguous vector in the smem layout
+  auto inv_smem_layout = right_inverse(get_nonswizzle_portion(slayout));
+  // trunc_smem_idx -> trunc_smem_coord
+
+  // Map from smem idx to a gmem mode
+  auto sidx_to_gmode = coalesce(composition(cta_v_map, inv_smem_layout));
+
+#if 0
+  print("g_layout         : "); print(gtensor.layout()); print("\n");
+  print("s_layout         : "); print(slayout); print("\n");
+  print("num_multicast    : "); print(num_multicast); print("\n");
+  print("cta_v_map        : "); print(cta_v_map); print("\n");
+  print("inv_smem         : "); print(inv_smem_layout); print("\n");
+  print("sidx_to_gmode    : "); print(sidx_to_gmode); print("\n");
+#endif
+
+  // TMA gtensor manipulation
+
+  // Generate a TupleBasis for the gtensor
+  auto glayout_basis = make_identity_layout(product_each(shape(gtensor)));
+
+  // Tile the modes of gtensor with the truncated cta_v_map o inv_smem_layout_trunc
+  auto tma_layout_full = flatten(composition(glayout_basis, sidx_to_gmode));
+
+  // Truncate any incompatibilities -- no starting in the middle of gmodes
+  auto smem_rank = find_if(stride(tma_layout_full), [](auto e) {
+    [[maybe_unused]] auto v = basis_value(e);
+    return not is_constant<1,decltype(v)>{};
+  });
+  static_assert(smem_rank >= 2, "IM2COL expects at least 2 modes of the smem to vectorize with gmem.");
+  // IM2COL uses a maximum of 2 modes
+  constexpr int smem_tma_rank = cute::min(int(smem_rank), 2);
+
+  // Keep only the static-1 basis modes into gmem
+  auto tma_layout_trunc = take<0,smem_tma_rank>(tma_layout_full);
+
+  // Split according to the portion each multicast CTA will be responsible for
+  auto tma_layout_vt = logical_divide(tma_layout_trunc, shape_div(size(tma_layout_trunc), num_multicast));
+
+#if 0
+  print("glayout_basis   : "); print(glayout_basis); print("\n");
+  print("tma_layout_full : "); print(tma_layout_full); print("\n");
+
+  print("tma_layout_trunc: "); print(tma_layout_trunc); print("\n");
+  print("tma_layout_vt   : "); print(tma_layout_vt); print("\n");
+#endif
+
+  auto range_c    = size<0,0>(tma_layout_vt);
+  auto range_whdn = size<0,1>(tma_layout_vt);
+  Tensor gtensor_cwhdn = make_tensor(gtensor.data(),
+                                     flatten(make_layout(make_layout(basis_get(stride<0,0>(tma_layout_vt), gtensor.shape()),
+                                                                     basis_get(stride<0,0>(tma_layout_vt), gtensor.stride())),
+                                                         make_layout(basis_get(stride<0,1>(tma_layout_vt), gtensor.shape()),
+                                                                     basis_get(stride<0,1>(tma_layout_vt), gtensor.stride())))));
+  auto [tma_desc, tma_tensor] = make_im2col_tma_copy_desc(
+      gtensor_cwhdn,
+      range_c,
+      range_whdn,
+      detail::get_swizzle_portion(slayout),
+      tma_layout_vt,
+      lower_corner_whd,
+      upper_corner_whd,
+      lower_padding_whd,
+      upper_padding_whd,
+      stride_whd,
+      lower_srt,
+      stride_srt,
+      aux_params);
+
+  // Construct the Copy_Traits
+
+  using T = typename GEngine::value_type;
+  constexpr int num_bits_per_tma = decltype(size(tma_layout_trunc))::value * sizeof(T) * 8;
+
+  using Traits = Copy_Traits<CopyOp, cute::C<num_bits_per_tma>, decltype(tma_tensor)>;
+  using Atom = Copy_Atom<Traits, T>;
+
+#if 0
+  print("num_bits      :  "); print(num_bits_per_tma); print("\n");
+#endif
+
+  Traits tma_traits{tma_desc, tma_tensor};
+
+  // Return the Copy_Atom
+  return Atom{tma_traits};
+}
+
+/// Make a TiledCopy for im2col TMA load.
+///
+/// @param copy_op The copy implementation: either
+///   SM90_TMA_LOAD_IM2COL or SM90_TMA_LOAD_IM2COL_MULTICAST.
+///
+/// @param gtensor The global tensor to use for im2col TMA loads.
+///   For Fprop convolutions, this is the activation tensor.  This is
+///   the "original" tensor that points to global memory, not the
+///   coordinate (im2col-transformed) tensor.
+///
+/// @param slayout Layout of shared memory tile.
+/// @param cta_t_map Maps CTA tid -> logical TMA tid; cosize(cta_t_map) must divide size(slayout).
+/// @param cta_v_map Maps CTA vid -> gmem coord.
+/// @param stride_whd The convolution traversal stride (stride_srt is the dilation).
+///
+/// @return TiledCopy specialization for im2col TMA loads.
+template <class CopyOp,
+          class GEngine, class GLayout,
+          class SLayout,
+          class TShape, class TStride,
+          class VShape, class VStride,
+          class LowerCornerStride,
+          class UpperCornerStride,
+          class LowerPaddingStride,
+          class UpperPaddingStride,
+          class TraversalStride,
+          class LowerSRTStride,
+          class DilationStride>
+CUTE_HOST_RTC
+auto
+make_tma_copy_im2col(CopyOp                       const& copy_op,
+                     Tensor<GEngine,GLayout>      const& gtensor,
+                     SLayout                      const& slayout,
+                     Layout<TShape,TStride>       const& cta_t_map,          // CTA tid -> logical TMA tid
+                     Layout<VShape,VStride>       const& cta_v_map,          // CTA vid -> gmem coord
+                     LowerCornerStride            const& lower_corner_whd,
+                     UpperCornerStride            const& upper_corner_whd,
+                     LowerPaddingStride           const& lower_padding_whd,
+                     UpperPaddingStride           const& upper_padding_whd,
+                     TraversalStride              const& stride_whd,         // traversal stride
+                     LowerSRTStride               const& lower_srt,
+                     DilationStride               const& stride_srt,         // dilation
+                     TMA::DescriptorAuxParams     const& aux_params = {})
+{
+  //
+  // TMA parameter checking
+  //
+
+  CUTE_STATIC_ASSERT_V(size(slayout) % cosize(cta_t_map) == Int<0>{},
+    "Number of active CTAs in TMA must divide domain size of slayout.");
+
+  Copy_Atom atom = make_tma_atom_im2col(copy_op, gtensor, slayout, cosize(cta_t_map), cta_v_map,
+                                        lower_corner_whd, upper_corner_whd, lower_padding_whd,
+                                        upper_padding_whd, stride_whd, lower_srt, stride_srt, aux_params);
+
+  //
+  // Construct the TiledCopy
+  //
+
+  auto cta_tiler = product_each(shape(cta_v_map));   // per-mode sizes of cta_v_map's shape
+  // Elements per TMA instruction: size<1> of the atom's RefLayout divided by the bit width of the gmem value type
+  auto num_elems_per_tma = size<1>(typename decltype(atom)::RefLayout{}) / static_value<sizeof_bits<typename GEngine::value_type>>();
+
+  // smem idx -> smem coord
+  auto inv_smem_layout = right_inverse(get_nonswizzle_portion(slayout));
+  // CTA V -> smem_coord
+  auto layout_v = composition(inv_smem_layout, num_elems_per_tma);
+  // Scale that up to cover all of the smem_coords
+  auto layout_V = tile_to_shape(make_layout(layout_v), size(cta_v_map));
+  // CTA T -> smem idx
+  auto layout_t = make_layout(cosize(cta_t_map), shape_div(num_elems_per_tma, cosize(cta_t_map)));
+  // CTA TID -> smem coord
+  auto layout_T = composition(inv_smem_layout, composition(layout_t, cta_t_map));
+  // Combine with the T mapping
+  [[maybe_unused]] auto layout_TV = make_layout(layout_T, layout_V);   // only its type is used by the returned TiledCopy
+
+#if 0
+  print("cta_tiler : "); print(cta_tiler); print("\n");
+  print("layout_v : "); print(layout_v); print("\n");
+  print("layout_V : "); print(layout_V); print("\n");
+  print("layout_t : "); print(layout_t); print("\n");
+  print("layout_T : "); print(layout_T); print("\n");
+  print("layout_TV : "); print(layout_TV); print("\n");
+#endif
+
+  return TiledCopy<decltype(atom), decltype(layout_TV), decltype(cta_tiler)>{atom};
+}
+
+/// Offset-free overload: make a TiledCopy for im2col TMA with zero corners/paddings and unit strides.
+/// Used e.g. for the im2col TMA load of C and the im2col TMA store of D.
+template <class CopyOp,
+          class GEngine, class GLayout,
+          class SLayout,
+          class TShape, class TStride,
+          class VShape, class VStride>
+CUTE_HOST_RTC
+auto
+make_tma_copy_im2col(CopyOp                  const& copy_op,
+                     Tensor<GEngine,GLayout> const& gtensor,
+                     SLayout                 const& slayout,
+                     Layout<TShape,TStride>  const& cta_t_map,          // CTA tid -> logical TMA tid
+                     Layout<VShape,VStride>  const& cta_v_map)          // CTA vid -> gmem coord
+{
+  constexpr int num_spatial_modes = rank<0>(GLayout{}) - 1;
+  // One entry per spatial mode: all-zero for offsets/paddings, all-one for strides/dilations
+  auto const zero_whd = append<num_spatial_modes>(Stride<_0>{}, Int<0>{});
+  auto const unit_whd = append<num_spatial_modes>(Stride<_1>{}, Int<1>{});
+  return make_tma_copy_im2col(copy_op, gtensor, slayout, cta_t_map, cta_v_map,
+                              zero_whd, zero_whd,   // lower_corner_whd,  upper_corner_whd
+                              zero_whd, zero_whd,   // lower_padding_whd, upper_padding_whd
+                              unit_whd,             // stride_whd
+                              zero_whd, unit_whd);  // lower_srt, stride_srt
+}
+
+} // namespace detail
+
+
+
+/// Make a TiledCopy for im2col TMA: composes the tensor's identity layout with cta_tiler to get the
+/// CTA value map, wraps multicast_size in a layout as the CTA thread map, and forwards both with
+/// the convolution offset/stride arguments to detail::make_tma_copy_im2col.
+template <class CopyOp,
+          class Engine0, class Layout0,
+          class SLayout,
+          class CTATiler,
+          class MulticastSize,
+          class LowerCornerStride, class UpperCornerStride,
+          class LowerPaddingStride, class UpperPaddingStride,
+          class TraversalStride,
+          class LowerSRTStride, class DilationStride>
+CUTE_HOST_RTC
+auto
+make_im2col_tma_copy(CopyOp                   const& copy_op,
+                     Tensor<Engine0, Layout0> const& tensor_cwhdn,
+                     SLayout                  const& slayout,
+                     CTATiler                 const& cta_tiler,
+                     MulticastSize            const& multicast_size,
+                     LowerCornerStride        const& lower_corner_whd,
+                     UpperCornerStride        const& upper_corner_whd,
+                     LowerPaddingStride       const& lower_padding_whd,
+                     UpperPaddingStride       const& upper_padding_whd,
+                     TraversalStride          const& stride_whd,
+                     LowerSRTStride           const& lower_srt,
+                     DilationStride           const& stride_srt)
+{
+  auto gmem_coords = make_identity_layout(product_each(shape(tensor_cwhdn)));
+  auto cta_v_map   = gmem_coords.compose(cta_tiler);   // CTA vid -> gmem coord
+  auto cta_t_map   = make_layout(multicast_size);      // CTA tid -> logical TMA tid
+  return detail::make_tma_copy_im2col(copy_op, tensor_cwhdn, slayout, cta_t_map, cta_v_map,
+                                      lower_corner_whd, upper_corner_whd, lower_padding_whd, upper_padding_whd,
+                                      stride_whd, lower_srt, stride_srt);
+}
+
+// Overload without multicast_size: forwards to the multicast_size-taking
+// overload with a multicast size of Int<1>{}.
+template <class CopyOp,
+          class Engine0, class Layout0,
+          class SLayout,
+          class CTATiler,
+          class LowerCornerStride, class UpperCornerStride,
+          class LowerPaddingStride, class UpperPaddingStride,
+          class TraversalStride,
+          class LowerSRTStride, class DilationStride>
+CUTE_HOST_RTC
+auto
+make_im2col_tma_copy(CopyOp                   const& copy_op,
+                     Tensor<Engine0, Layout0> const& tensor_cwhdn,
+                     SLayout                  const& slayout,
+                     CTATiler                 const& cta_tiler,
+                     LowerCornerStride        const& lower_corner_whd,
+                     UpperCornerStride        const& upper_corner_whd,
+                     LowerPaddingStride       const& lower_padding_whd,
+                     UpperPaddingStride       const& upper_padding_whd,
+                     TraversalStride          const& stride_whd,
+                     LowerSRTStride           const& lower_srt,
+                     DilationStride           const& stride_srt)
+{
+  auto const single_cta = Int<1>{};   // default multicast_size
+  return make_im2col_tma_copy(copy_op, tensor_cwhdn, slayout, cta_tiler, single_cta,
+                              lower_corner_whd, upper_corner_whd, lower_padding_whd, upper_padding_whd,
+                              stride_whd, lower_srt, stride_srt);
+}
+
+// Overload without cta_tiler and multicast_size: tiles by product_each(shape(slayout))
+// and uses a multicast size of Int<1>{}.
+template <class CopyOp,
+          class Engine0, class Layout0,
+          class SLayout,
+          class LowerCornerStride, class UpperCornerStride,
+          class LowerPaddingStride, class UpperPaddingStride,
+          class TraversalStride,
+          class LowerSRTStride, class DilationStride>
+CUTE_HOST_RTC
+auto
+make_im2col_tma_copy(CopyOp                   const& copy_op,
+                     Tensor<Engine0, Layout0> const& tensor_cwhdn,
+                     SLayout                  const& slayout,
+                     LowerCornerStride        const& lower_corner_whd,
+                     UpperCornerStride        const& upper_corner_whd,
+                     LowerPaddingStride       const& lower_padding_whd,
+                     UpperPaddingStride       const& upper_padding_whd,
+                     TraversalStride          const& stride_whd,
+                     LowerSRTStride           const& lower_srt,
+                     DilationStride           const& stride_srt)
+{
+  auto const smem_tile = product_each(shape(slayout));   // default cta_tiler
+  return make_im2col_tma_copy(copy_op, tensor_cwhdn, slayout, smem_tile, Int<1>{},
+                              lower_corner_whd, upper_corner_whd, lower_padding_whd, upper_padding_whd,
+                              stride_whd, lower_srt, stride_srt);
+}
+
+// Offset-free copy: takes no corner, padding, stride, or dilation arguments.
+template <class CopyOp,
+          class Engine0, class Layout0,
+          class SLayout,
+          class CTATiler,
+          class MulticastSize>
+CUTE_HOST_RTC
+auto
+make_im2col_tma_copy(CopyOp                   const& copy_op,
+                     Tensor<Engine0, Layout0> const& tensor_cwhdn,
+                     SLayout                  const& slayout,
+                     CTATiler                 const& cta_tiler,
+                     MulticastSize            const& multicast_size)
+{
+  auto gmem_coords = make_identity_layout(product_each(shape(tensor_cwhdn)));
+  auto cta_v_map   = gmem_coords.compose(cta_tiler);   // CTA vid -> gmem coord
+  auto cta_t_map   = make_layout(multicast_size);      // CTA tid -> logical TMA tid
+  return detail::make_tma_copy_im2col(copy_op, tensor_cwhdn, slayout, cta_t_map, cta_v_map);
+}
+
+// Offset-free overload without multicast_size: uses a multicast size of Int<1>{}.
+template <class CopyOp,
+          class Engine0, class Layout0,
+          class SLayout, class CTATiler>
+CUTE_HOST_RTC
+auto
+make_im2col_tma_copy(CopyOp                   const& copy_op,
+                     Tensor<Engine0, Layout0> const& tensor_cwhdn,
+                     SLayout                  const& slayout,
+                     CTATiler                 const& cta_tiler)
+{
+  auto const single_cta = Int<1>{};   // default multicast_size
+  return make_im2col_tma_copy(copy_op, tensor_cwhdn, slayout, cta_tiler, single_cta);
+}
+
+// Offset-free overload without cta_tiler and multicast_size.
+template <class CopyOp,
+          class Engine0, class Layout0, class SLayout>
+CUTE_HOST_RTC
+auto
+make_im2col_tma_copy(CopyOp                   const& copy_op,
+                     Tensor<Engine0, Layout0> const& tensor_cwhdn,
+                     SLayout                  const& slayout)
+{
+  auto const smem_tile = product_each(shape(slayout));   // default cta_tiler
+  return make_im2col_tma_copy(copy_op, tensor_cwhdn, slayout, smem_tile, Int<1>{});
+}
+
+} // namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm90_tma.hpp b/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm90_tma.hpp
new file mode 100755
index 000000000..3738cc396
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm90_tma.hpp
@@ -0,0 +1,1525 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#if !defined(__CUDACC_RTC__)
+#include <cuda.h>
+#endif
+
+#include <cute/atom/copy_traits_sm90_tma_swizzle.hpp>
+#include <cute/atom/copy_traits.hpp>
+#include <cute/atom/copy_atom.hpp>
+
+#include <cute/algorithm/prefetch.hpp>
+
+#include <cute/numeric/integral_ratio.hpp>
+
+#include <cutlass/cuda_host_adapter.hpp>
+
+namespace cute
+{
+
+template <class GmemTmaBasisStrides_, class TmaGmemBasis_, class TmaSwizzle_>
+struct AuxTmaParams {
+  using GmemStrides  = GmemTmaBasisStrides_;  // Strides mapping Gmem mode -> Tma coord mode (may be dynamic)
+  using TmaGmemBasis = TmaGmemBasis_;         // Layout mapping Tma box shape -> Gmem mode(s) (always static)
+  using TmaSwizzle   = TmaSwizzle_;           // Tma swizzle, always Swizzle<B,M,S>
+  static_assert(is_static<TmaGmemBasis>::value);
+  static_assert(is_static<TmaSwizzle>::value);
+  GmemStrides g_stride_;                      // the only data member
+};
+
+// Utility for unpacking TMA_LOAD arguments into a CopyOp (supplies copy_unpack for Copy_Traits<CopyOp, ...> that derive from it)
+template <class CopyOp>
+struct TMA_LOAD_Unpack
+{
+  template <class... Args,
+            class TS, class SLayout,
+            class TD, class DLayout>
+  CUTE_HOST_DEVICE friend constexpr void
+  copy_unpack(Copy_Traits<CopyOp, Args...> const& traits,
+              Tensor<TS,SLayout>           const& src,
+              Tensor<TD,DLayout>                & dst)
+  {
+    auto src_coord = src.data().coord_;   // TMA coordinate carried by the source tensor's iterator
+    if constexpr (detail::is_prefetch<CopyOp>) {   // prefetch ops: dst is not used
+      return detail::explode_tuple(detail::CallCOPY<CopyOp>{},
+                                   traits.opargs_, tuple_seq<decltype(traits.opargs_)>{},
+                                   src_coord, tuple_seq<decltype(src_coord)>{});
+    } else {   // load ops: the raw smem destination pointer is passed between opargs_ and the coordinates
+      static_assert(is_smem<TD>::value, "SM90_TMA_LOAD requires the destination be shared memory.");
+      void* dst_ptr = cute::raw_pointer_cast(dst.data());
+#if 0
+      auto [c0,c1,c2,c3,c4] = append<5>(src_coord, 0);
+      printf("THR (%d,%d,%d) BLK (%d,%d,%d) TMACRD (%d,%d,%d,%d,%d) SMEMADDR (%p)\n",
+            threadIdx.x, threadIdx.y, threadIdx.z,
+            blockIdx.x, blockIdx.y, blockIdx.z,
+            int32_t(c0), int32_t(c1), int32_t(c2), int32_t(c3), int32_t(c4), dst_ptr);
+#endif
+      return detail::explode_tuple(detail::CallCOPY<CopyOp>{},   // presumably ends up as CopyOp::copy(opargs_..., dst_ptr, src_coord...) -- confirm in detail::
+                                   traits.opargs_, tuple_seq<decltype(traits.opargs_)>{},
+                                   make_tuple(dst_ptr), seq<0>{},
+                                   src_coord, tuple_seq<decltype(src_coord)>{});
+    }
+  }
+};
+
+//////////////////////////////////////////////////////////////////////////////
+///////////////////////////// TMA_LOAD ///////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////
+
+struct SM90_TMA_LOAD_OP : SM90_TMA_LOAD {};   // distinct op type that keys the executable Copy_Traits specialization
+
+// The non-executable SM90_TMA_LOAD with tma_desc and no tma_mbar
+// Use .with(tma_mbar) to construct an executable version
+template <class NumBitsPerTMA, class AuxParams_>
+struct Copy_Traits<SM90_TMA_LOAD, NumBitsPerTMA, AuxParams_>
+{
+  using ThrID     = Layout<_1>;
+  // Map from (src-thr,src-val) to bit
+  using SrcLayout = Layout<Shape<_1,NumBitsPerTMA>>;
+  // Map from (dst-thr,dst-val) to bit
+  using DstLayout = Layout<Shape<_1,NumBitsPerTMA>>;
+  // Reference map from (thr,val) to bit
+  using RefLayout = SrcLayout;
+
+  // SM90_TMA_LOAD arguments
+  TmaDescriptor tma_desc_;       // held by value; with() hands out its address
+  using AuxParams = AuxParams_;
+  AuxParams aux_params_;         // supplies g_stride_ to get_tma_tensor()
+
+  // Return TmaDescriptor/TensorMap (pointer to the member held by this object)
+  CUTE_HOST_DEVICE constexpr
+  TmaDescriptor const*
+  get_tma_descriptor() const {
+    return &tma_desc_;
+  }
+
+  // Construct an executable SM90_TMA_LOAD with tma_mbar; the result stores &tma_desc_, so it must not outlive *this
+  CUTE_HOST_DEVICE constexpr
+  Copy_Traits<SM90_TMA_LOAD_OP, NumBitsPerTMA>
+  with(
+    uint64_t& tma_mbar,
+    [[maybe_unused]] uint16_t const& multicast_mask = 0,
+    TMA::CacheHintSm90 const& cache_hint = TMA::CacheHintSm90::EVICT_NORMAL) const {
+    // We accept multicast_mask here to keep the API for both atoms consistent
+    return {{}, {&tma_desc_, &tma_mbar, static_cast<uint64_t>(cache_hint)}};   // {empty TMA_LOAD_Unpack base, opargs_}
+  }
+
+  // Construct an executable SM90_TMA_LOAD with tma_mbar (temp. overloaded for grouped gemm/ptr array gemm); uses new_tma_desc instead of tma_desc_
+  CUTE_HOST_DEVICE constexpr
+  Copy_Traits<SM90_TMA_LOAD_OP, NumBitsPerTMA>
+  with(
+    TmaDescriptor const* new_tma_desc,
+    uint64_t& tma_mbar,
+    [[maybe_unused]] uint16_t const& multicast_mask = 0,
+    TMA::CacheHintSm90 const& cache_hint = TMA::CacheHintSm90::EVICT_NORMAL) const {
+    // We accept multicast_mask here to keep the API for both atoms consistent
+    return {{}, {new_tma_desc, &tma_mbar, static_cast<uint64_t>(cache_hint)}};   // {empty TMA_LOAD_Unpack base, opargs_}
+  }
+
+  // Generate the TMA coord tensor: a counting tensor over the layout (g_shape, aux_params_.g_stride_); g_shape must be congruent with the strides
+  template <class GShape>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  get_tma_tensor(GShape const& g_shape) const {
+    static_assert(is_congruent<decltype(g_shape), decltype(aux_params_.g_stride_)>::value);
+    return make_counting_tensor(make_layout(g_shape, aux_params_.g_stride_));
+  }
+
+  // Don't try to execute a copy with SM90_TMA_LOAD before calling .with() (deleted so misuse fails at compile time)
+  template <class TS, class SLayout,
+            class TD, class DLayout>
+  CUTE_HOST_DEVICE friend constexpr void
+  copy_unpack(Copy_Traits        const& traits,
+              Tensor<TS,SLayout> const& src,
+              Tensor<TD,DLayout>      & dst) = delete;
+};
+
+// The executable SM90_TMA_LOAD with tma_desc and tma_mbar; copy_unpack comes from the TMA_LOAD_Unpack base
+template <class NumBitsPerTMA>
+struct Copy_Traits<SM90_TMA_LOAD_OP, NumBitsPerTMA>
+     : TMA_LOAD_Unpack<SM90_TMA_LOAD_OP>
+{
+  using ThrID     = Layout<_1>;
+  // Map from (src-thr,src-val) to bit
+  using SrcLayout = Layout<Shape<_1,NumBitsPerTMA>>;
+  // Map from (dst-thr,dst-val) to bit
+  using DstLayout = Layout<Shape<_1,NumBitsPerTMA>>;
+  // Reference map from (thr,val) to bit
+  using RefLayout = SrcLayout;
+
+  // SM90_TMA_LOAD arguments, in the order TMA_LOAD_Unpack passes them on
+  tuple<
+  TmaDescriptor const*,   // TMA descriptor (non-owning pointer)
+  uint64_t*, // smem mbarrier
+  uint64_t   // cache hint
+  > const opargs_;
+};
+
+// The prefetch for SM90_TMA_LOAD with tma_desc; copy_unpack comes from the TMA_LOAD_Unpack base
+template <class NumBitsPerTMA, class... Args>
+struct Copy_Traits<SM90_TMA_LOAD::PREFETCH, NumBitsPerTMA, Args...>
+     : TMA_LOAD_Unpack<SM90_TMA_LOAD::PREFETCH>
+{
+  using ThrID     = Layout<_1>;
+  // Map from (src-thr,src-val) to bit
+  using SrcLayout = Layout<Shape<_1,NumBitsPerTMA>>;
+  // Map from (dst-thr,dst-val) to bit
+  using DstLayout = Layout<Shape<_1,NumBitsPerTMA>>;
+  // Reference map from (thr,val) to bit
+  using RefLayout = SrcLayout;
+
+  // SM90_TMA_LOAD::PREFETCH arguments: only the descriptor pointer (no mbarrier or cache hint)
+  tuple<TmaDescriptor const*> const opargs_;
+
+  // Construct with any other Traits' TMA Desc; stores the address of traits.tma_desc_, so `traits` must outlive this object
+  template <class... CopyArgs>
+  CUTE_HOST_DEVICE
+  Copy_Traits(Copy_Traits<CopyArgs...> const& traits)
+    : opargs_({&traits.tma_desc_}) {}
+};
+
+//////////////////////////////////////////////////////////////////////////////
+///////////////////////////// TMA_LOAD_MULTICAST /////////////////////////////
+//////////////////////////////////////////////////////////////////////////////
+
+struct SM90_TMA_LOAD_MULTICAST_OP : SM90_TMA_LOAD_MULTICAST {};   // distinct op type that keys the executable Copy_Traits specialization
+
+// The non-executable SM90_TMA_LOAD_MULTICAST with tma_desc and no tma_mbar
+// Use .with(tma_mbar, multicast_mask) to construct an executable version
+template <class NumBitsPerTMA, class AuxParams_>
+struct Copy_Traits<SM90_TMA_LOAD_MULTICAST, NumBitsPerTMA, AuxParams_>
+{
+  using ThrID     = Layout<_1>;
+  // Map from (src-thr,src-val) to bit
+  using SrcLayout = Layout<Shape<_1,NumBitsPerTMA>>;
+  // Map from (dst-thr,dst-val) to bit
+  using DstLayout = Layout<Shape<_1,NumBitsPerTMA>>;
+  // Reference map from (thr,val) to bit
+  using RefLayout = SrcLayout;
+
+  // SM90_TMA_LOAD_MULTICAST arguments
+  TmaDescriptor tma_desc_;       // held by value; with() hands out its address
+  using AuxParams = AuxParams_;
+  AuxParams aux_params_;         // supplies g_stride_ to get_tma_tensor()
+
+  // Return TmaDescriptor/TensorMap (pointer to the member held by this object)
+  CUTE_HOST_DEVICE constexpr
+  TmaDescriptor const*
+  get_tma_descriptor() const {
+    return &tma_desc_;
+  }
+
+  // Construct an executable SM90_TMA_LOAD_MULTICAST with tma_mbar; the result stores &tma_desc_, so it must not outlive *this
+  CUTE_HOST_DEVICE constexpr
+  Copy_Traits<SM90_TMA_LOAD_MULTICAST_OP, NumBitsPerTMA>
+  with(
+    uint64_t& tma_load_mbar,
+    uint16_t const& multicast_mask,
+    TMA::CacheHintSm90 const& cache_hint = TMA::CacheHintSm90::EVICT_NORMAL) const {
+    return {{}, {&tma_desc_, &tma_load_mbar, multicast_mask, static_cast<uint64_t>(cache_hint)}};   // {empty TMA_LOAD_Unpack base, opargs_}
+  }
+
+  // Construct an executable SM90_TMA_LOAD_MULTICAST_OP with tma_mbar (temp. overloaded for grouped gemm/ptr array gemm); uses new_tma_desc instead of tma_desc_
+  CUTE_HOST_DEVICE constexpr
+  Copy_Traits<SM90_TMA_LOAD_MULTICAST_OP, NumBitsPerTMA>
+  with(
+    TmaDescriptor const* new_tma_desc,
+    uint64_t& tma_load_mbar,
+    uint16_t const& multicast_mask,
+    TMA::CacheHintSm90 const& cache_hint = TMA::CacheHintSm90::EVICT_NORMAL) const {
+    return {{}, {new_tma_desc, &tma_load_mbar, multicast_mask, static_cast<uint64_t>(cache_hint)}};   // {empty TMA_LOAD_Unpack base, opargs_}
+  }
+
+  // Generate the TMA coord tensor: a counting tensor over the layout (g_shape, aux_params_.g_stride_); g_shape must be congruent with the strides
+  template <class GShape>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  get_tma_tensor(GShape const& g_shape) const {
+    static_assert(is_congruent<decltype(g_shape), decltype(aux_params_.g_stride_)>::value);
+    return make_counting_tensor(make_layout(g_shape, aux_params_.g_stride_));
+  }
+
+  // Don't try to execute a copy with SM90_TMA_LOAD_MULTICAST before calling .with() (deleted so misuse fails at compile time)
+  template <class TS, class SLayout,
+            class TD, class DLayout>
+  CUTE_HOST_DEVICE friend constexpr void
+  copy_unpack(Copy_Traits        const& traits,
+              Tensor<TS,SLayout> const& src,
+              Tensor<TD,DLayout>      & dst) = delete;
+};
+
+// The executable SM90_TMA_LOAD_MULTICAST with tma_desc and tma_mbar and multicast_mask; copy_unpack comes from the TMA_LOAD_Unpack base
+template <class NumBitsPerTMA>
+struct Copy_Traits<SM90_TMA_LOAD_MULTICAST_OP, NumBitsPerTMA>
+     : TMA_LOAD_Unpack<SM90_TMA_LOAD_MULTICAST_OP>
+{
+  using ThrID     = Layout<_1>;
+  // Map from (src-thr,src-val) to bit
+  using SrcLayout = Layout<Shape<_1,NumBitsPerTMA>>;
+  // Map from (dst-thr,dst-val) to bit
+  using DstLayout = Layout<Shape<_1,NumBitsPerTMA>>;
+  // Reference map from (thr,val) to bit
+  using RefLayout = SrcLayout;
+
+  // SM90_TMA_LOAD_MULTICAST arguments, in the order TMA_LOAD_Unpack passes them on
+  tuple<
+  TmaDescriptor const*,   // TMA descriptor (non-owning pointer)
+  uint64_t*, // smem mbarrier
+  uint16_t,  // multicast mask
+  uint64_t   // cache hint
+  > const opargs_;
+};
+
+//////////////////////////////////////////////////////////////////////////////
+///////////////////////////// TMA_STORE //////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////
+
+// Utility for unpacking TMA_STORE arguments into a CopyOp (supplies copy_unpack for Copy_Traits<CopyOp, ...> that derive from it)
+template <class CopyOp>
+struct TMA_STORE_Unpack
+{
+  template <class... Args,
+            class TS, class SLayout,
+            class TD, class DLayout>
+  CUTE_HOST_DEVICE friend constexpr void
+  copy_unpack(Copy_Traits<CopyOp, Args...> const& traits,
+              Tensor<TS,SLayout>           const& src,
+              Tensor<TD,DLayout>                & dst)
+  {
+    static_assert(is_smem<TS>::value, "Expected smem src for SM90_TMA_STORE");
+
+    void const* const desc_ptr = traits.tma_desc_;   // tma_desc_ is used as a pointer here (it is one in Copy_Traits<SM90_TMA_STORE_OP>)
+    void const* const src_ptr  = cute::raw_pointer_cast(src.data());
+    auto dst_coord = dst.data().coord_;   // TMA coordinate carried by the destination tensor's iterator
+#if 0
+    auto [c0,c1,c2,c3,c4] = append<5>(dst_coord, 0);
+    printf("THR (%d,%d,%d) BLK (%d,%d,%d) TMACRD (%d,%d,%d,%d,%d) SMEMADDR (%p)\n",
+           threadIdx.x, threadIdx.y, threadIdx.z,
+           blockIdx.x, blockIdx.y, blockIdx.z,
+           int32_t(c0), int32_t(c1), int32_t(c2), int32_t(c3), int32_t(c4), src_ptr);
+#endif
+    return detail::explode_tuple(detail::CallCOPY<SM90_TMA_STORE>{},   // note: names SM90_TMA_STORE directly rather than CopyOp
+                                 make_tuple(desc_ptr, src_ptr), seq<0,1>{},
+                                 dst_coord, tuple_seq<decltype(dst_coord)>{});
+  }
+};
+
+struct SM90_TMA_STORE_OP : SM90_TMA_STORE {};   // distinct op type that keys the descriptor-pointer Copy_Traits specialization
+
+// The executable SM90_TMA_STORE with tma_desc held by value; .with(new_tma_desc) yields a variant that points at another descriptor
+template <class NumBitsPerTMA, class AuxParams_>
+struct Copy_Traits<SM90_TMA_STORE, NumBitsPerTMA, AuxParams_>
+{
+  using ThrID     = Layout<_1>;
+  // Map from (src-thr,src-val) to bit
+  using SrcLayout = Layout<Shape<_1,NumBitsPerTMA>>;
+  // Map from (dst-thr,dst-val) to bit
+  using DstLayout = Layout<Shape<_1,NumBitsPerTMA>>;
+  // Reference map from (thr,val) to bit
+  using RefLayout = SrcLayout;
+
+  // SM90_TMA_STORE arguments
+  TmaDescriptor tma_desc_;       // held by value; copy_unpack passes its address
+  using AuxParams = AuxParams_;
+  AuxParams aux_params_;         // supplies g_stride_ to get_tma_tensor()
+
+  // Return TmaDescriptor/TensorMap (pointer to the member held by this object)
+  CUTE_HOST_DEVICE constexpr
+  TmaDescriptor const*
+  get_tma_descriptor() const {
+    return &tma_desc_;
+  }
+
+  // Generate the TMA coord tensor: a counting tensor over the layout (g_shape, aux_params_.g_stride_); g_shape must be congruent with the strides
+  template <class GShape>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  get_tma_tensor(GShape const& g_shape) const {
+    static_assert(is_congruent<decltype(g_shape), decltype(aux_params_.g_stride_)>::value);
+    return make_counting_tensor(make_layout(g_shape, aux_params_.g_stride_));
+  }
+  // copy_unpack dispatch: src must be an smem tensor; dst supplies the TMA coordinate through dst.data().coord_
+  template <class TS, class SLayout,
+            class TD, class DLayout>
+  CUTE_HOST_DEVICE friend constexpr void
+  copy_unpack(Copy_Traits        const& traits,
+              Tensor<TS,SLayout> const& src,
+              Tensor<TD,DLayout>      & dst)
+  {
+    static_assert(is_smem<TS>::value, "Expected smem src for SM90_TMA_STORE");
+    //static_assert(is_gmem<TD>::value, "Expected gmem dst for SM90_TMA_STORE");  // disabled: dst is a TMA coordinate tensor (see coord_ below)
+
+    void const* const desc_ptr = &(traits.tma_desc_);
+    void const* const src_ptr  = cute::raw_pointer_cast(src.data());
+    auto dst_coord = dst.data().coord_;
+#if 0
+    auto [c0,c1,c2,c3,c4] = append<5>(dst_coord, 0);
+    printf("THR (%d,%d,%d) BLK (%d,%d,%d) TMACRD (%d,%d,%d,%d,%d) SMEMADDR (%p)\n",
+           threadIdx.x, threadIdx.y, threadIdx.z,
+           blockIdx.x, blockIdx.y, blockIdx.z,
+           int32_t(c0), int32_t(c1), int32_t(c2), int32_t(c3), int32_t(c4), src_ptr);
+#endif
+    return detail::explode_tuple(detail::CallCOPY<SM90_TMA_STORE>{},
+                                 make_tuple(desc_ptr, src_ptr), seq<0,1>{},
+                                 dst_coord, tuple_seq<decltype(dst_coord)>{});
+  }
+
+  // Construct Copy_Traits executable (w/ swapped out TMA descriptor) for SM90_TMA_STORE (for grouped gemm/ptr array gemm)
+  CUTE_HOST_DEVICE constexpr
+  Copy_Traits<SM90_TMA_STORE_OP, NumBitsPerTMA>
+  with(TmaDescriptor const* new_tma_desc) const {
+    return {{}, new_tma_desc};   // {empty TMA_STORE_Unpack base, tma_desc_}
+  }
+};
+
+// The executable SM90_TMA_STORE with a pointer to an external tma_desc; copy_unpack comes from the TMA_STORE_Unpack base
+template <class NumBitsPerTMA>
+struct Copy_Traits<SM90_TMA_STORE_OP, NumBitsPerTMA>
+     : TMA_STORE_Unpack<SM90_TMA_STORE_OP>
+{
+  using ThrID     = Layout<_1>;
+  // Map from (src-thr,src-val) to bit
+  using SrcLayout = Layout<Shape<_1,NumBitsPerTMA>>;
+  // Map from (dst-thr,dst-val) to bit
+  using DstLayout = Layout<Shape<_1,NumBitsPerTMA>>;
+  // Reference map from (thr,val) to bit
+  using RefLayout = SrcLayout;
+
+  // SM90_TMA_STORE arguments (non-owning descriptor pointer, read by TMA_STORE_Unpack::copy_unpack)
+  TmaDescriptor const* tma_desc_;
+};
+
+//////////////////////////////////////////////////////////////////////////////
+///////////////////////////// TMA_REDUCE_ADD /////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////
+
+// The executable SM90_TMA_REDUCE_ADD with tma_desc held by value
+template <class NumBitsPerTMA, class AuxParams_>
+struct Copy_Traits<SM90_TMA_REDUCE_ADD, NumBitsPerTMA, AuxParams_>
+{
+  using ThrID   = Layout<_1>;
+
+  // Map from (src-thr,src-val) to bit
+  using SrcLayout = Layout<Shape<_1,NumBitsPerTMA>>;
+  // Map from (dst-thr,dst-val) to bit
+  using DstLayout = Layout<Shape<_1,NumBitsPerTMA>>;
+
+  // Reference map from (thr,val) to bit
+  using RefLayout = SrcLayout;
+
+  // SM90_TMA_REDUCE_ADD arguments
+  TmaDescriptor tma_desc_;       // held by value; copy_unpack_ passes its address
+  using AuxParams = AuxParams_;
+  AuxParams aux_params_;         // supplies g_stride_ to get_tma_tensor()
+
+  // Return TmaDescriptor/TensorMap (pointer to the member held by this object)
+  CUTE_HOST_DEVICE constexpr
+  TmaDescriptor const*
+  get_tma_descriptor() const {
+    return &tma_desc_;
+  }
+
+  // Generate the TMA coord tensor: a counting tensor over the layout (g_shape, aux_params_.g_stride_); g_shape must be congruent with the strides
+  template <class GShape>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  get_tma_tensor(GShape const& g_shape) const {
+    static_assert(is_congruent<decltype(g_shape), decltype(aux_params_.g_stride_)>::value);
+    return make_counting_tensor(make_layout(g_shape, aux_params_.g_stride_));
+  }
+  // Helper: calls SM90_TMA_REDUCE_ADD::copy with this traits' descriptor, the source pointer, and the unpacked destination coordinates
+  template <class Coord, int... Is>
+  CUTE_HOST_DEVICE constexpr
+  void
+  copy_unpack_(void const* const src_ptr,
+               Coord const& dst_coord, seq<Is...>) const
+  {
+#if 0
+    auto [c0,c1,c2,c3,c4] = append<5>(dst_coord, 0);
+    printf("THR (%d,%d,%d) BLK (%d,%d,%d) TMACRD (%d,%d,%d,%d,%d) SMEMADDR (%p)\n",
+           threadIdx.x, threadIdx.y, threadIdx.z,
+           blockIdx.x, blockIdx.y, blockIdx.z,
+           int32_t(c0), int32_t(c1), int32_t(c2), int32_t(c3), int32_t(c4), src_ptr);
+#endif
+
+    SM90_TMA_REDUCE_ADD::copy(&tma_desc_,
+                         src_ptr, get<Is>(dst_coord)...);
+  }
+
+  // This is the copy_unpack dispatch for this Copy_Traits
+  // Src needs to be a smem tensor
+  // Dst needs to be a gmem tensor with TmaCoordIterator .data()
+  template <class TS, class SLayout,
+            class TD, class DLayout>
+  CUTE_HOST_DEVICE friend constexpr
+  void
+  copy_unpack(Copy_Traits        const& traits,
+              Tensor<TS,SLayout> const& src,
+              Tensor<TD,DLayout>      & dst)
+  {
+    static_assert(is_smem<TS>::value, "Expected smem src for SM90_TMA_REDUCE_ADD");
+    //static_assert(is_gmem<TD>::value, "Expected gmem dst for SM90_TMA_REDUCE_ADD");  // disabled: dst is a TMA coordinate tensor (its coord_ is read below)
+
+    traits.copy_unpack_(cute::raw_pointer_cast(src.data()), dst.data().coord_, tuple_seq<decltype(dst.data().coord_)>{});
+  }
+};
+
+//////////////////////////////////////////////////////////////////////////////
+///////////////////////////// BULK COPY //////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////
+
+template <class NumBitsPerTMA, class... OpArgs>
+struct Copy_Traits<SM90_BULK_COPY_G2S, NumBitsPerTMA, OpArgs...>
+{
+  static_assert(int32_t(NumBitsPerTMA::value / 8) % 16 == 0,
+                "Bulk Copy requires copy vector size align to 16B.");
+  // Traits for SM90_BULK_COPY_G2S: copy() is called with a size of NumBitsPerTMA/8 bytes (a multiple of 16, checked above)
+  using ThrID = Layout<_1>;
+  // Map from (src-thr,src-val) to bit
+  using SrcLayout = Layout<Shape<_1,NumBitsPerTMA>>;
+  // Map from (dst-thr,dst-val) to bit
+  using DstLayout = Layout<Shape<_1,NumBitsPerTMA>>;
+  // Reference map from (thr,val) to bit
+  using RefLayout = SrcLayout;
+
+  // SM90_BULK_COPY_G2S arguments
+  // 0: uint64_t* bulk_load_memory_barrier (only present once with() has produced the uint64_t* specialization)
+  cute::tuple<OpArgs...> bulk_load_mbar_;
+
+  // Record the memory barrier for the instruction: returns traits whose OpArgs is uint64_t*, holding the address of bulk_mbar
+  CUTE_HOST_DEVICE constexpr
+  Copy_Traits<SM90_BULK_COPY_G2S, NumBitsPerTMA, uint64_t*>
+  with(uint64_t& bulk_mbar) const {
+    return {{&bulk_mbar}};
+  }
+  // copy_unpack dispatch: statically requires OpArgs == uint64_t*, a gmem src, and a smem dst; passes raw pointers to copy()
+  template <class TS, class SLayout,
+            class TD, class DLayout>
+  CUTE_HOST_DEVICE friend constexpr
+  void
+  copy_unpack(Copy_Traits        const& traits,
+              Tensor<TS,SLayout> const& src,
+              Tensor<TD,DLayout>      & dst)
+  {
+    static_assert(is_same<cute::tuple<OpArgs...>, cute::tuple<uint64_t*>>::value,
+                  "Extra arguments not set. Set .with() before use.");
+    static_assert(is_gmem<TS>::value, "Expected gmem src for SM90_BULK_COPY_G2S");
+    static_assert(is_smem<TD>::value, "Expected smem dst for SM90_BULK_COPY_G2S");
+    SM90_BULK_COPY_G2S::copy(raw_pointer_cast(src.data()), get<0>(traits.bulk_load_mbar_),
+                             raw_pointer_cast(dst.data()), int32_t(NumBitsPerTMA::value / 8));
+  }
+};
+
+template <class NumBitsPerTMA, class... Args>
+struct Copy_Traits<SM90_BULK_COPY_G2S::PREFETCH, NumBitsPerTMA, Args...>
+     : Copy_Traits<SM90_BULK_COPY_G2S, NumBitsPerTMA>  // reuses the layouts of the G2S traits (instantiated with no OpArgs)
+{
+  template <class... CopyArgs>
+  CUTE_HOST_DEVICE
+  Copy_Traits(Copy_Traits<CopyArgs...> const& traits) {}  // converting ctor: accepts any Copy_Traits; nothing is copied from it
+  // Prefetch dispatch: requires a gmem src; dst is not used. Passes the raw src pointer and NumBitsPerTMA/8 bytes to PREFETCH::copy()
+  template <class TS, class SLayout,
+            class TD, class DLayout>
+  CUTE_HOST_DEVICE friend constexpr
+  void
+  copy_unpack(Copy_Traits        const& traits,
+              Tensor<TS,SLayout> const& src,
+              Tensor<TD,DLayout>      & dst)
+  {
+    static_assert(is_gmem<TS>::value, "Expected gmem src for SM90_BULK_PREFETCH");
+    SM90_BULK_COPY_G2S::PREFETCH::copy(raw_pointer_cast(src.data()), int32_t(NumBitsPerTMA::value / 8));
+  }
+};
+
+template <class NumBitsPerTMA>
+struct Copy_Traits<SM90_BULK_COPY_S2G, NumBitsPerTMA>
+{
+  static_assert(int32_t(NumBitsPerTMA::value / 8) % 16 == 0,
+                "Bulk Copy requires copy vector size align to 16B.");
+  // Traits for SM90_BULK_COPY_S2G: copy() is called with a size of NumBitsPerTMA/8 bytes (a multiple of 16, checked above)
+  using ThrID = Layout<_1>;
+  // Map from (src-thr,src-val) to bit
+  using SrcLayout = Layout<Shape<_1,NumBitsPerTMA>>;
+  // Map from (dst-thr,dst-val) to bit
+  using DstLayout = Layout<Shape<_1,NumBitsPerTMA>>;
+  // Reference map from (thr,val) to bit
+  using RefLayout = SrcLayout;
+  // copy_unpack dispatch: statically requires a smem src and a gmem dst; takes no extra arguments (no with() step)
+  template <class TS, class SLayout,
+            class TD, class DLayout>
+  CUTE_HOST_DEVICE friend constexpr
+  void
+  copy_unpack(Copy_Traits        const& traits,
+              Tensor<TS,SLayout> const& src,
+              Tensor<TD,DLayout>      & dst)
+  {
+    static_assert(is_smem<TS>::value, "Expected smem src for SM90_BULK_COPY_S2G");
+    static_assert(is_gmem<TD>::value, "Expected gmem dst for SM90_BULK_COPY_S2G");
+    SM90_BULK_COPY_S2G::copy(raw_pointer_cast(src.data()), raw_pointer_cast(dst.data()), int32_t(NumBitsPerTMA::value / 8));
+  }
+};
+
+// Placeholder for the bulk copy algorithm's default, auto-vectorizing behavior.
+// This specialization carries only layouts and the optional barrier argument; it defines no copy_unpack of its own.
+//
+
+template <class... OpArgs>
+struct Copy_Traits<SM90_BULK_COPY_AUTO, OpArgs...>
+{
+  // Logical thread id to thread idx (one-thread)
+  using ThrID = Layout<_1>;
+  // Map from (src-thr,src-val) to bit
+  using SrcLayout = Layout<Shape<_1,_1>, Stride<_0,_0>>;
+  // Map from (dst-thr,dst-val) to bit
+  using DstLayout = Layout<Shape<_1,_1>, Stride<_0,_0>>;
+  // Reference map from (thr,val) to bit
+  using RefLayout = SrcLayout;
+
+  // SM90_BULK_COPY_AUTO arguments
+  // 0: uint64_t* bulk_load_memory_barrier [if this is a BULK_LOAD_G2S]
+  cute::tuple<OpArgs...> opargs_;
+
+  // Record the memory barrier for the instruction: returns traits whose OpArgs is uint64_t*, holding the address of bulk_mbar
+  CUTE_HOST_DEVICE constexpr
+  Copy_Traits<SM90_BULK_COPY_AUTO, uint64_t*>
+  with(uint64_t& bulk_mbar) const {
+    return {{&bulk_mbar}};
+  }
+};
+
+//
+// MAKE_TMA_COPY and related
+//
+
+namespace detail {
+
+// Custom version of coalesce that greedily combines modes only up to size-256
+// Look at each element and the back of the stack (in order of priority)
+// back(NewLayout)  get<I>(OldLayout)
+//      s0:d0           _1:d1     =>  continue
+//      _1:d0           s1:d1     =>  replace_back     s1:d1
+//      s0:d0           s1:s0*d0  =>  replace_back  s0*s1:d0   if s0*s1 <= 256
+//      s0:d0           s1:d1     =>  append           s1:d1
+// I is the index of the old mode being examined; new_shape/new_stride hold the result built so far.
+// @pre OldShape and OldStride are flat
+template <int I, class OldShape, class OldStride, class NewShape, class NewStride>
+CUTE_HOST_DEVICE constexpr
+auto
+coalesce_256_impl(OldShape const& old_shape, OldStride const& old_stride,
+                  NewShape const& new_shape, NewStride const& new_stride)
+{
+  if constexpr (I == rank_v<OldShape>) {
+    // Base case, every old mode has been consumed: a result that is still a static 1 becomes the layout _1:_0
+    if constexpr (is_constant<1, NewShape>::value) {
+      return Layout<_1,_0>{};
+    } else {
+      return Layout<NewShape,NewStride>{new_shape,new_stride};
+    }
+  } else if constexpr (is_constant<1, decltype(get<I>(old_shape))>::value) {
+    // shape<I>(layout) == _1, skip it and continue
+    return coalesce_256_impl<I+1>(old_shape, old_stride, new_shape, new_stride);
+  } else if constexpr (is_constant<1, NewShape>::value) {
+    // Replace our shape-1 with anything (Can only happen on input new_shape/new_stride)
+    return coalesce_256_impl<I+1>(old_shape, old_stride, get<I>(old_shape), get<I>(old_stride));
+  } else if constexpr (is_constant<true, decltype(back(new_shape) * back(new_stride) == get<I>(old_stride) &&
+                                                  get<I>(old_shape) * back(new_shape) <= Int<256>{})>::value) {
+    // Merge modes because the shapes and strides match and the merge is 256 or less (taken only when that is a static true)
+    return coalesce_256_impl<I+1>(old_shape, old_stride,
+                                  replace_back(new_shape, get<I>(old_shape) * back(new_shape)),
+                                  new_stride);
+  } else {
+    // Can't replace or merge, so append a new mode
+    return coalesce_256_impl<I+1>(old_shape, old_stride,
+                                  append(new_shape,  get<I>(old_shape)),
+                                  append(new_stride, get<I>(old_stride)));
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+// Flatten the layout, then merge every pair of adjacent modes that coalesce_256_impl allows (merged extents stay <= 256).
+// The hierarchical profile of the input is not kept, but the total size is preserved.
+template <class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+coalesce_256(Layout<Shape,Stride> const& layout)
+{
+  auto shp = flatten(layout.shape());    // flat shape tuple
+  auto str = flatten(layout.stride());   // flat stride tuple
+  return coalesce_256_impl<1>(shp, str, get<0>(shp), get<0>(str));  // seed the result with mode 0, scan from mode 1
+}
+
+template <class TmaInternalType,
+          class GEngine, class GLayout,
+          class SShape, class SStride,
+          class VShape, class VStride>
+CUTE_HOST_DEVICE constexpr
+auto
+construct_tma_gbasis(Tensor<GEngine,GLayout> const& gtensor,       // The original GMEM Tensor
+                     Layout<SShape,SStride>  const& slayout,       // The layout of SMEM
+                     Layout<VShape,VStride>  const& cta_v_map)     // smem_idx to hier gmode
+{
+  // Derives the "TMA basis" for this tile: a layout whose shape is the TMA box and whose strides name gtensor modes.
+  // The returned layout has its trailing modes grouped (see the end of this function) to respect the TMA rank limit.
+  // First: TMA parameter checking
+
+  // CUTE_STATIC_ASSERT_V(product_each(shape(slayout)) == product_each(shape(cta_v_map)),
+  //                      "TMA requires CTA_Tile and SLayout top-level shape equivalence.");
+  CUTE_STATIC_ASSERT_V(size(slayout) == size(cta_v_map),
+                       "TMA requires CTA_Tile and SLayout top-level size equivalence.");
+
+#if 0
+  print("gtensor         : "); print(gtensor); print("\n");
+  print("slayout         : "); print(slayout); print("\n");
+  print("cta_v_map       : "); print(cta_v_map); print("\n");
+#endif
+
+  //
+  // TMA slayout manipulation
+  //
+
+  // Invert the smem to get the largest contiguous vector in the smem layout
+  // smem idx -> smem coord
+  auto inv_smem_layout = right_inverse(get_nonswizzle_portion(slayout));
+
+  // Compose with the V-Map to convert smem coord (CTA val idx) to gmem mode
+  // smem idx -> gmem mode
+  auto sidx2gmode_full = coalesce(composition(cta_v_map, inv_smem_layout));
+
+#if 0
+  print("inv_smem_layout : "); print(inv_smem_layout); print("\n");
+  print("sidx2gmode_full : "); print(sidx2gmode_full); print("\n");
+#endif
+
+  //
+  // TMA gtensor truncation
+  //
+  // smem_rank is the position of the first stride of sidx2gmode_full whose basis value is not a static 1
+  // Truncate any incompatibilities -- no starting in the middle of gmodes
+  auto smem_rank = find_if(stride(sidx2gmode_full), [](auto e) {
+    [[maybe_unused]] auto v = basis_value(e);
+    return not is_constant<1,decltype(v)>{};
+  });
+  static_assert(smem_rank > 0, "Could not find a common tile-gmem vectorization. Does the Tile select out major GMEM modes?");
+
+  // Keep only the static-1 basis modes into gmem
+  auto sidx2gmode = take<0,smem_rank>(sidx2gmode_full);
+
+#if 0
+  print("smem_rank  : "); print(smem_rank); print("\n");
+  print("sidx2gmode : "); print(sidx2gmode); print("\n");
+#endif
+
+  //
+  // TMA gtensor manipulation
+  //
+
+  // The smem vector is the same units as gtensor, so compose first and then recast
+  // tma_val_idx:gmem_strides
+  auto tile_gstride = recast<TmaInternalType>(gtensor.compose(sidx2gmode)).layout();
+  // Coalesce modes up to size-256 (the maximum TMA box extent in units of TmaInternalType)
+  // tma_box_shape:gmem_strides
+  auto tma_gstride  = coalesce_256(tile_gstride);
+
+  // Perform the tiling, recast, and coalesce to the gmem vector again, but with indirections to the gtensor modes
+  auto gbasis = make_identity_layout(shape(gtensor));
+  auto tile_gbasis_tmp = gbasis.compose(sidx2gmode);
+
+  // Instead of the recast (gbasis doesn't have type info), replace the shape with the already-recasted shape
+  // tma_box_shape:gmem_mode
+  auto tile_gbasis = make_layout(shape(tile_gstride), stride(tile_gbasis_tmp));
+
+  // "Coalesce" the tile basis into a compatible shape with the tma_gstride
+  auto tma_gbasis_tile = tile_gbasis.compose(make_layout(wrap(shape(tma_gstride))));
+
+  // Recast the original tensor for shape/stride inspections
+  Tensor gtensor_T = recast<TmaInternalType>(gtensor);
+
+  // Find missing bases that don't appear in tile_gbasis
+  auto tile_gbasis_remaining_stride = filter_tuple(flatten(shape (gtensor_T)), flatten(stride(gtensor_T)),
+                                                   flatten(stride(gbasis)),
+                                                   [&](auto s, auto d, auto e)
+  {
+    if constexpr (is_constant<1, decltype(s)>::value || is_constant<0, decltype(d)>::value) {
+      return cute::tuple<>{};          // If size-1 or stride-0, then don't append
+    } else {
+      using E = decltype(e);
+      auto has_e = any_of(flatten(stride(tma_gbasis_tile)), [] (auto tb) { return tb == E{}; });
+      if constexpr (decltype(has_e)::value) {
+        return cute::tuple<>{};        // If basis e was found in the tile basis, then don't append
+      } else {
+        return cute::tuple<E>(e);      // Else, this is missing so append
+      }
+    }
+  });
+
+  // Append the remaining basis modes that contribute to the TMA with size-1
+  auto tile_gbasis_remaining_shape = repeat<rank(tile_gbasis_remaining_stride)>(Int<1>{});
+  auto tma_gbasis_full = make_layout(tuple_cat(wrap( shape(tma_gbasis_tile)), wrap(tile_gbasis_remaining_shape )),
+                                     tuple_cat(wrap(stride(tma_gbasis_tile)), wrap(tile_gbasis_remaining_stride)));
+
+  // Group the trailing modes to make this max rank-5 -- TMA rank limitation
+  // tma_box_shape:gmem_mode
+  auto tma_gbasis = group<cute::min(rank(tma_gbasis_full),4),-1>(tma_gbasis_full);
+
+#if 0
+  print("tile_gstride : "); print(tile_gstride); print("\n");
+  print("tma_gstride  : "); print(tma_gstride); print("\n");
+  print("gbasis       : "); print(gbasis); print("\n");
+  print("tile_gbasis  : "); print(tma_gbasis_tile); print("\n");
+  print("tma_gbasis   : "); print(tma_gbasis); print("\n");
+#endif
+
+  return tma_gbasis;
+}
+
+template <class GEngine, class GLayout,
+          class TmaGmemBasisStride,
+          class ShapeT, size_t TmaRank>
+CUTE_HOST_DEVICE constexpr
+void
+fill_tma_gmem_shape_stride(Tensor<GEngine,GLayout>   const& gtensor,           // Gmem Shapes and Strides, in units of TmaInternalType
+                           TmaGmemBasisStride        const& tma_gbasis_stride, // Map Tma mode idx -> Gmem mode(s)
+                           cute::array<ShapeT,   TmaRank> & gmem_prob_shape,   // Tma Shapes, uint32_t or uint64_t (in/out)
+                           cute::array<uint64_t, TmaRank> & gmem_prob_stride)  // Tma Strides (in/out)
+{
+  static_assert(is_tuple<TmaGmemBasisStride>::value);
+  static_assert(is_same<uint32_t, ShapeT>::value || is_same<uint64_t, ShapeT>::value);
+  // Writes the first rank(tma_gbasis_stride) entries of both arrays; strides stay in gtensor's element units (no byte scaling here)
+  using TmaInternalType = typename GEngine::value_type;
+  constexpr int tma_rank = decltype(rank(tma_gbasis_stride))::value;
+  static_assert(TmaRank >= tma_rank);
+
+  auto gmem_shape  =  shape(gtensor);
+  auto gmem_stride = stride(gtensor);
+  // Use the indirections in tma_gbasis_stride into gtensor to construct the tma gmem shapes/strides
+  for_each(make_seq<tma_rank>{}, [&](auto i) {
+    constexpr int tma_i_rank = decltype(rank<i>(tma_gbasis_stride))::value;
+    if constexpr (tma_i_rank == 1) {
+      // Trivial contribution of this gmem mode to this tma mode
+      auto ej = unwrap(get<i>(tma_gbasis_stride));
+      gmem_prob_shape[i]  = basis_get(ej, gmem_shape);
+      gmem_prob_stride[i] = basis_get(ej, gmem_stride);
+    } else {
+      // Apply a recurrence to each gmem mode that contributes to this tma mode. The recurrence reads the incoming
+      for_each(get<i>(tma_gbasis_stride), [&](auto ej) {  // array values (make_tma_copy_desc passes shape 1, stride 0)
+        // Problem shape
+        uint64_t shape_j  = basis_get(ej, gmem_shape);
+        // Problem stride (in elements of gtensor's value_type; make_tma_copy_desc converts to bytes afterwards)
+        uint64_t stride_j = basis_get(ej, gmem_stride);
+        uint64_t old_stride = gmem_prob_stride[i];
+        gmem_prob_stride[i] = gcd(gmem_prob_stride[i], stride_j);
+
+        if (gmem_prob_stride[i] != 0) {
+          // Recurrence: g_shape = (s_i - 1) * (d_i / gcd_j d_j) + 1
+          gmem_prob_shape[i] = (gmem_prob_shape[i]-1) * (old_stride / gmem_prob_stride[i])
+                             +            (shape_j-1) * (stride_j   / gmem_prob_stride[i])
+                             + 1;
+        } else {
+          gmem_prob_shape[i] = shape_j;
+        }
+      });
+    }
+  });
+}
+
+// Convenience overload: takes the TMA basis from an existing Copy_Traits' AuxParams type (tma_traits itself is not read)
+template <class GEngine, class GLayout,
+          class Op, class Bits, class Aux,
+          class ShapeT, size_t TmaRank>
+CUTE_HOST_DEVICE constexpr
+void
+fill_tma_gmem_shape_stride(Copy_Traits<Op,Bits,Aux>  const& tma_traits,
+                           Tensor<GEngine,GLayout>   const& gtensor,           // Gmem Shapes and Strides, value_type = TmaInternalType
+                           cute::array<ShapeT,   TmaRank> & gmem_prob_shape,   // Tma Shapes, uint32_t or uint64_t
+                           cute::array<uint64_t, TmaRank> & gmem_prob_stride)  // Tma Strides
+{
+  using BasisLayout = typename Aux::TmaGmemBasis;  // default-constructed below; only its stride is forwarded
+  fill_tma_gmem_shape_stride(gtensor, stride(BasisLayout{}), gmem_prob_shape, gmem_prob_stride);
+}
+
+// Build the TMA descriptor for gtensor as seen through tma_gbasis (TMA mode -> GMEM mode), together with the
+//   basis stride of the TMA coordinate tensor that the instruction consumes.
+// Validates alignment, shape, stride, and box limits with assert(); the driver encode call is compiled only for
+//   CUDA 12+ non-RTC builds, otherwise tma_desc is left value-initialized.
+// Returns cute::make_tuple(tma_desc, AuxTmaParams{gmem_tma_basis_stride}).
+template <class TmaInternalType,
+          class GEngine, class GLayout,
+          class TShape, class TStride,
+          int B, int M, int S>
+CUTE_HOST_RTC
+auto
+make_tma_copy_desc(Tensor<GEngine,GLayout> const& gtensor,         // The original GMEM Tensor
+                   Layout<TShape,TStride>  const& tma_gbasis,      // TMA mode -> GMEM mode mapping
+                   Swizzle<B,M,S>          const& swizzle,         // Swizzle fn on smem_idx
+                   uint32_t                       num_multicast)   // The number of CTAs in multicasting
+{
+  //
+  // TMA desc creation
+  //
+
+  constexpr int tma_dim = decltype(rank(tma_gbasis))::value;
+
+  //
+  // TMA gmem desc info
+  //
+
+  // Recast the original tensor for shape/stride inspections
+  Tensor gtensor_T = recast<TmaInternalType>(gtensor);
+
+  void* gmem_address = (void*) raw_pointer_cast(gtensor_T.data());
+  auto  gmem_layout  = gtensor_T.layout();
+
+  // Neutral defaults (shape 1, stride 0): unused TMA modes keep them, and the fill recurrence starts from them
+  cute::array<uint64_t, 5> gmem_prob_shape  = {1,1,1,1,1};
+  cute::array<uint64_t, 5> gmem_prob_stride = {0,0,0,0,0};
+
+  fill_tma_gmem_shape_stride(gtensor_T, stride(tma_gbasis), gmem_prob_shape, gmem_prob_stride);
+  assert((reinterpret_cast<uint64_t>(gmem_address) & 0b1111) == 0);  // Address must be 16B-aligned
+
+  assert(gmem_prob_shape[0] >= (uint64_t(1)));               // Size must be min 1
+  assert(gmem_prob_shape[0] <= (uint64_t(1) << 32));         // Size must be max 2^32
+  assert(gmem_prob_shape[1] >= (uint64_t(1)));               // Size must be min 1
+  assert(gmem_prob_shape[1] <= (uint64_t(1) << 32));         // Size must be max 2^32
+  assert(gmem_prob_shape[2] >= (uint64_t(1)));               // Size must be min 1
+  assert(gmem_prob_shape[2] <= (uint64_t(1) << 32));         // Size must be max 2^32
+  assert(gmem_prob_shape[3] >= (uint64_t(1)));               // Size must be min 1
+  assert(gmem_prob_shape[3] <= (uint64_t(1) << 32));         // Size must be max 2^32
+  assert(gmem_prob_shape[4] >= (uint64_t(1)));               // Size must be min 1
+  assert(gmem_prob_shape[4] <= (uint64_t(1) << 32));         // Size must be max 2^32
+
+  // TMA descriptor does not store the zeroth stride and assumes it is 1 (TmaInternalType element).
+  assert(gmem_prob_stride[0] == 1 && "Majorness of smem doesn't match majorness of gmem");
+
+  // convert strides to byte strides
+  for(uint64_t& stride : gmem_prob_stride) {
+    stride = (stride * sizeof_bits_v<TmaInternalType>) / 8;
+  }
+
+  // Assert the byte strides. Tma Descriptor uses byte strides
+  assert((gmem_prob_stride[1]) < (uint64_t(1) << 40));       // Stride must be max 2^40
+  assert((gmem_prob_stride[1] & 0b1111) == 0);               // Stride must be multiple of 16B (128b)
+  assert((gmem_prob_stride[2]) < (uint64_t(1) << 40));       // Stride must be max 2^40
+  assert((gmem_prob_stride[2] & 0b1111) == 0);               // Stride must be multiple of 16B (128b)
+  assert((gmem_prob_stride[3]) < (uint64_t(1) << 40));       // Stride must be max 2^40
+  assert((gmem_prob_stride[3] & 0b1111) == 0);               // Stride must be multiple of 16B (128b)
+  assert((gmem_prob_stride[4]) < (uint64_t(1) << 40));       // Stride must be max 2^40
+  assert((gmem_prob_stride[4] & 0b1111) == 0);               // Stride must be multiple of 16B (128b)
+
+  //
+  // TMA smem desc info
+  //
+
+  cute::array<uint32_t, 5> smem_box_shape  = {1,1,1,1,1};
+  cute::array<uint32_t, 5> smem_box_stride = {1,1,1,1,1};
+  // The smem box is simply given by the sizes of the modes in tma_gbasis
+  for_each(make_seq<tma_dim>{}, [&](auto i) {
+    smem_box_shape[i] *= size<i>(tma_gbasis);
+  });
+  // Finally, truncate the tma box by the num_multicast, starting from the last TMA mode and moving inwards
+  for (uint32_t i = tma_dim-1, multicast = num_multicast; multicast > 1; --i) {
+    // i is unsigned: if num_multicast exceeds the whole box, --i wraps past 0 and would index out of bounds
+    assert(i < uint32_t(tma_dim) && "num_multicast exceeds the total TMA box size");
+    assert(smem_box_shape[i] % multicast == 0 || multicast % smem_box_shape[i] == 0);
+    uint32_t new_mult = ceil_div(multicast, smem_box_shape[i]);
+    smem_box_shape[i] = ceil_div(smem_box_shape[i], multicast);
+    multicast = new_mult;
+  }
+  assert(smem_box_shape[0] >= (uint32_t(1)));                // Size must be min 1
+  assert(smem_box_shape[0] <= (uint32_t(1) << 8));           // Size must be max 2^8 = 256
+  assert(smem_box_shape[1] >= (uint32_t(1)));                // Size must be min 1
+  assert(smem_box_shape[1] <= (uint32_t(1) << 8));           // Size must be max 2^8 = 256
+  assert(smem_box_shape[2] >= (uint32_t(1)));                // Size must be min 1
+  assert(smem_box_shape[2] <= (uint32_t(1) << 8));           // Size must be max 2^8 = 256
+  assert(smem_box_shape[3] >= (uint32_t(1)));                // Size must be min 1
+  assert(smem_box_shape[3] <= (uint32_t(1) << 8));           // Size must be max 2^8 = 256
+  assert(smem_box_shape[4] >= (uint32_t(1)));                // Size must be min 1
+  assert(smem_box_shape[4] <= (uint32_t(1) << 8));           // Size must be max 2^8 = 256
+
+  assert(smem_box_stride[0] >= (uint32_t(1)));               // Stride must be min 1
+  assert(smem_box_stride[0] <= (uint32_t(8)));               // Stride must be max 2^3 = 8
+  assert(smem_box_stride[1] >= (uint32_t(1)));               // Stride must be min 1
+  assert(smem_box_stride[1] <= (uint32_t(8)));               // Stride must be max 2^3 = 8
+  assert(smem_box_stride[2] >= (uint32_t(1)));               // Stride must be min 1
+  assert(smem_box_stride[2] <= (uint32_t(8)));               // Stride must be max 2^3 = 8
+  assert(smem_box_stride[3] >= (uint32_t(1)));               // Stride must be min 1
+  assert(smem_box_stride[3] <= (uint32_t(8)));               // Stride must be max 2^3 = 8
+  assert(smem_box_stride[4] >= (uint32_t(1)));               // Stride must be min 1
+  assert(smem_box_stride[4] <= (uint32_t(8)));               // Stride must be max 2^3 = 8
+
+    //
+    // Construct the descriptor
+    //
+
+    TmaDescriptor tma_desc{};
+
+    //
+    // TMA general info
+    //
+
+  #if (__CUDACC_VER_MAJOR__ >= 12) && !defined(__CUDACC_RTC__)
+
+    CUtensorMapDataType     tma_format      = TMA::to_CUtensorMapDataType<TmaInternalType>();
+    CUtensorMapInterleave   tma_interleave  = CU_TENSOR_MAP_INTERLEAVE_NONE;
+    CUtensorMapL2promotion  tma_l2Promotion = CU_TENSOR_MAP_L2_PROMOTION_L2_128B;
+    CUtensorMapFloatOOBfill tma_oobFill     = CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE;
+
+    // TMA smem swizzle type
+    TMA::SmemSwizzleBits swizzle_bits = get_tma_swizzle_bits(swizzle);
+    TMA::SmemSwizzleBase swizzle_base = get_tma_swizzle_base(swizzle);
+    CUtensorMapSwizzle smem_swizzle = TMA::to_CUtensorMapSwizzle(swizzle_bits, swizzle_base);
+    CUresult result = CUTLASS_CUDA_DRIVER_WRAPPER_CALL(cuTensorMapEncodeTiled)(
+        &tma_desc,
+        tma_format,
+        tma_dim,
+        gmem_address,
+        gmem_prob_shape.data(),
+        gmem_prob_stride.data() + 1,  // gmem_prob_stride[0] implicitly 1
+        smem_box_shape.data(),
+        smem_box_stride.data(),
+        tma_interleave,
+        smem_swizzle,
+        tma_l2Promotion,
+        tma_oobFill);
+
+    if (result != CUDA_SUCCESS) {
+      std::cerr << "TMA Desc Addr:   " << &tma_desc
+                << "\nformat         " << tma_format
+                << "\ndim            " << tma_dim
+                << "\ngmem_address   " << gmem_address
+                << "\nglobalDim      " << gmem_prob_shape
+                << "\nglobalStrides  " << gmem_prob_stride
+                << "\nboxDim         " << smem_box_shape
+                << "\nelementStrides " << smem_box_stride
+                << "\ninterleave     " << tma_interleave
+                << "\nswizzle        " << smem_swizzle
+                << "\nl2Promotion    " << tma_l2Promotion
+                << "\noobFill        " << tma_oobFill << std::endl;
+      std::cerr << "Error: Failed to initialize the TMA descriptor " << result << std::endl;
+      assert(false);
+    }
+
+  #endif // (__CUDACC_VER_MAJOR__ >= 12) && !defined(__CUDACC_RTC__)
+  auto recast_ratio = cute::trait_ratio(sizeof_bits<typename GEngine::value_type>{},
+                                        sizeof_bits<             TmaInternalType>{});
+
+  auto gbasis = make_basis_like(shape(gtensor));
+
+  // Finally, get the inverse permutation of the E<i> bases for the mocked gmem stride
+  auto gmem_tma_basis_stride = transform_leaf(gbasis, [&](auto ei) {
+    auto si = basis_get(ei,  shape(gmem_layout));
+    auto di = basis_get(ei, stride(gmem_layout));
+    if constexpr (is_constant<1, decltype(si)>::value || is_constant<0, decltype(di)>::value) {
+      return Int<0>{};                  // If size-1 or stride-0, return arithmetic identity -- no contribution to the TMA
+    } else {
+      auto tma_gmem_basis_stride = stride(tma_gbasis);
+      // Find j such that E<i> is in stride<j>(tma_gbasis)
+      using EI = decltype(ei);
+      [[maybe_unused]] auto j = find_if(tma_gmem_basis_stride, [&](auto tma_stride_j) { return any_of(tma_stride_j, [&](auto dj) { return dj == EI{}; }); });
+      if constexpr (decltype(j == rank(tma_gmem_basis_stride))::value) {
+        return Int<0>{};               // If not-found, return arithmetic identity -- no contribution to the TMA
+      } else
+      if constexpr (decltype(j == Int<0>{})::value) {
+        auto scale = recast_ratio * basis_get(ei, stride(gtensor));
+        return E<j>{} * scale;         // Return TMA Coord basis -- with a recast scale factor
+      } else
+      if constexpr (decltype(rank<j>(tma_gmem_basis_stride) == Int<1>{})::value) {
+        return E<j>{};                 // Return TMA Coord basis -- known scale of Int<1>{}
+      } else {
+        int32_t scale = ceil_div(int32_t(di * sizeof_bits_v<TmaInternalType> / cute::max(gmem_prob_stride[j], uint64_t{16})), 8);
+        return E<j>{} * scale;         // Return TMA Coord basis -- with a dynamic scale factor
+      }
+    }
+  });
+
+#if 0
+    print("gmem_tma_basis_stride : "); print(gmem_tma_basis_stride); print("\n");
+#endif
+
+  using AuxParams = AuxTmaParams<decltype(gmem_tma_basis_stride),
+                                 decltype(tma_gbasis),
+                                 decltype(swizzle)>;
+  return cute::make_tuple(tma_desc, AuxParams{gmem_tma_basis_stride});
+}
+
+template <class TmaInternalType,
+          class CopyOp,
+          class GEngine, class GLayout,
+          class SLayout,
+          class VShape, class VStride>
+CUTE_HOST_RTC
+auto
+make_tma_copy_atom(CopyOp,
+                   Tensor<GEngine,GLayout> const& gtensor,       // Full GMEM Tensor
+                   SLayout                 const& slayout,       // CTA Tile of SMEM, potentially swizzled
+                   uint32_t                const& num_multicast, // The number of CTAs involved in multicasting
+                   Layout<VShape,VStride>  const& cta_v_map)     // V: CTA val idx -> gmem mode
+{
+  // Creates the Copy_Atom for CopyOp in three steps:
+  //   1. split slayout into its swizzle and its plain layout, and derive the TMA box basis
+  //   2. build the TMA descriptor and the strides of the TMA tensor
+  //   3. wrap both in Copy_Traits and return them as a Copy_Atom
+
+  //
+  // Step 1: TMA truncated layout
+  //
+  auto swizzle_fn   = get_swizzle_portion(slayout);
+  auto plain_layout = get_nonswizzle_portion(slayout);
+  auto box_basis    = detail::construct_tma_gbasis<TmaInternalType>(gtensor, plain_layout, cta_v_map);
+
+  //
+  // Step 2: the TMA Desc and the strides of the TMA Tensor
+  //
+  auto [desc, aux] = detail::make_tma_copy_desc<TmaInternalType>(gtensor, box_basis, swizzle_fn, num_multicast);
+
+  //
+  // Step 3: the Copy_Traits
+  //
+
+  // NumBitsPerTMA is size(box_basis) elements of TmaInternalType, expressed in bits
+  using ElemT = typename GEngine::value_type;
+  constexpr int bits_per_tma = size(box_basis) * sizeof_bits_v<TmaInternalType>;
+  using Traits = Copy_Traits<CopyOp, cute::C<bits_per_tma>, decltype(aux)>;
+  using Atom   = Copy_Atom<Traits, ElemT>;
+
+  Traits traits{desc, aux};
+
+#if 0
+  print("num_bits_per_tma :  "); print(bits_per_tma); print("\n");
+  print("g_stride_bases   :  "); print(traits.aux_params_.g_stride_); print("\n");
+#endif
+
+  return Atom{traits};
+}
+
+// Builds the TiledCopy for a TMA op: creates the Copy_Atom (multicast size = cosize(cta_t_map)) and its thread/value layout.
+// The "logical TMA tid" is a map from the CTA rank to its logical id within the instruction.  It works like a
+// mask or ordering on the CTAs.  For non-multicast TMA, all CTAs should map to 0.  For multicast TMA of size 4,
+// CTAs will be mapped to {0,1,2,3}.
+template <class TmaInternalType,
+          class CopyOp,
+          class GEngine, class GLayout,
+          class SLayout,
+          class TShape, class TStride,
+          class VShape, class VStride>
+CUTE_HOST_RTC
+auto
+make_tma_copy_tiled(CopyOp                  const& copy_op,
+                    Tensor<GEngine,GLayout> const& gtensor,     // Full GMEM Tensor
+                    SLayout                 const& slayout,     // CTA Tile of SMEM
+                    Layout<TShape,TStride>  const& cta_t_map,   // T: CTA thr idx -> logical TMA tid
+                    Layout<VShape,VStride>  const& cta_v_map)   // V: CTA val idx -> gmem mode
+{
+  Copy_Atom atom = make_tma_copy_atom<TmaInternalType>(copy_op, gtensor, slayout,
+                                                       cosize(cta_t_map), cta_v_map);
+
+  //
+  // Construct the TiledCopy: only the types of layout_TV and cta_tiler are used (as TiledCopy template arguments)
+  //
+
+  [[maybe_unused]] auto cta_tiler = product_each(shape(cta_v_map));
+  // Elements of gtensor's value_type per TMA instruction: bits in the value mode of RefLayout / bits per element
+  auto num_elems_per_tma = size<1>(typename decltype(atom)::RefLayout{}) / static_value<sizeof_bits<typename GEngine::value_type>>();
+
+  // smem idx -> smem coord
+  auto inv_smem_layout = right_inverse(get_nonswizzle_portion(slayout));
+  // CTA V -> smem_coord
+  auto layout_v = composition(inv_smem_layout, num_elems_per_tma);
+  // Scale that up to cover all of the smem_coords
+  auto layout_V = tile_to_shape(make_layout(layout_v), size(cta_v_map));
+  // CTA T -> smem idx
+  auto layout_t = make_layout(cosize(cta_t_map), shape_div(num_elems_per_tma, cosize(cta_t_map)));
+  // CTA TID -> smem coord
+  auto layout_T = composition(inv_smem_layout, composition(layout_t, cta_t_map));
+  // Combine with the T mapping
+  [[maybe_unused]] auto layout_TV = make_layout(layout_T, layout_V);
+
+#if 0
+  print("cta_tiler : "); print(cta_tiler); print("\n");
+  print("layout_v : "); print(layout_v); print("\n");
+  print("layout_V : "); print(layout_V); print("\n");
+  print("layout_t : "); print(layout_t); print("\n");
+  print("layout_T : "); print(layout_T); print("\n");
+  print("layout_TV : "); print(layout_TV); print("\n");
+#endif
+
+  return TiledCopy<decltype(atom), decltype(layout_TV), decltype(cta_tiler)>{atom};
+}
+
+} // end namespace detail
+
+/** Make a CuTe CTA-collective TiledCopy for a TMA operation.
+ *
+ * @param CopyOp The target copy operation: SM90_TMA_LOAD, SM90_TMA_LOAD_MULTICAST, SM90_TMA_STORE
+ * @param gtensor The GMEM Tensor to be involved in the TMA.
+ * @param slayout The SMEM Layout to be involved in the TMA.
+ * @param cta_tile The CTA-local tile that each CTA will be tiling GMEM with.
+ *                 This is often the blk_shape that is used to tile the GMEM for CTAs:
+ *                   local_tile(gtensor, blk_shape, blk_coord) -> CTA-local tile of gtensor
+ * @param cluster_size When using SM90_TMA_LOAD_MULTICAST, this can be a (static) power-of-2 <= 16
+ *                   defining the multicast size (used to further partition the SMEM)
+ *                 Else, static-1
+ *
+ * This code attempts to maximize the TMA box size. It does this by tracing
+ * the SMEM "vector" -- the inverse of the smem layout -- to find the largest
+ * contiguous array of smem that can be written to/from global memory given
+ * the constraints that the TMA instruction imposes.
+ *
+ * This is accomplished by assigning "basis" strides to the GMEM to track which
+ * modes of SMEM map to which modes of GMEM, then reorder the modes of GMEM according
+ * to the SMEM vector, and then using those GMEM/SMEM modes to fill in the desc.
+ *
+ * Examples:
+     using T = float;
+     T* gptr = nullptr;
+
+    {
+    // Simple 2D
+    Tensor gtensor = make_tensor(gptr, make_shape(1024, 256), GenRowMajor{}); // K-Major GMEM
+    auto slayout   = make_layout(make_shape(_64{}, _32{}), GenRowMajor{});    // K-Major SMEM
+    auto tma = make_tma_copy(SM90_TMA_LOAD{}, gtensor, slayout);
+    }
+
+    {
+    // GMMA 2D
+    Tensor gtensor = make_tensor(gptr, make_shape(1024, 256));                                 // MN-Major GMEM
+    auto slayout   = tile_to_shape(GMMA::Layout_MN_SW128_Atom<T>{}, make_shape(_128{},_64{})); // MN-Major Swizzled+Tiled 128x64 SMEM
+    auto tma = make_tma_copy(SM90_TMA_LOAD{}, gtensor, slayout);
+    }
+
+    {
+    // 3D
+    Tensor gtensor = make_tensor(gptr, make_shape(1024, 32, 512), make_stride(64, Int<1>{}, 65536)); // GMEM
+    auto slayout   = make_layout(make_shape(_16{}, _8{}, _2{}), make_stride(_16{}, _1{}, _8{}));     // SMEM w/ same major-mode
+    auto tma = make_tma_copy(SM90_TMA_LOAD{}, gtensor, slayout);
+    }
+
+    {
+    // cuTENSOR 4D
+    Tensor gtensor = make_tensor(gptr, make_shape(make_shape(32,40),make_shape(make_shape(8,8),656))); // GMEM
+    auto cta_tile    = make_shape(_128{},make_shape(_32{},_2{}));                // GMEM Tiling:
+                                                                                 //   Take 128-elem from m: m0 must divide 128,
+                                                                                 //                         m-last may be predicated
+                                                                                 //   Take 32-elem from k0, 2-elem from k1
+    auto slayout = make_layout(cta_tile);                                        // Col-Major SMEM
+    auto tma = make_tma_copy(SM90_TMA_LOAD{}, gtensor, slayout, cta_tile, Int<1>{});
+    }
+ *
+ * Check the TMA box size and desc:
+    print("TMA Box size:  "); print(typename decltype(tma)::Tiler_MN{}); print("\n");
+    print("TMA desc     : "); print(tma.tma_desc_); print("\n");
+ *
+ * Usage:
+     Tensor mA = tma.get_tma_tensor(make_shape(M,N));          // (M,N) TMA coord tensor
+     Tensor gA = local_tile(mA, cta_tile, cta_coord);          // (BLK_M,BLK_N) TMA coord tensor for this CTA
+     Tensor sA = make_tensor(make_smem_ptr<T>(sptr), slayout); // (BLK_M,BLK_N) SMEM tensor
+
+     auto cta_tma = tma.get_slice(cta_idx_in_cluster);         // Slice for multicast partitioning
+     Tensor tAgA = cta_tma.partition_S(gA);                    // Partition for src
+     Tensor tAsA = cta_tma.partition_D(sA);                    // Partition for dst
+
+     copy(tma.with(barrier, mcast_mask), tAgA, tAsA);          // copy with supporting TMA params
+ */
+template <class TmaInternalType = void,  // void => fall back to GEngine::value_type as the TMA element type
+          class CopyOp,
+          class GEngine, class GLayout,
+          class SLayout,
+          class CTA_Tiler,
+          class Cluster_Size>
+CUTE_HOST_RTC
+auto
+make_tma_copy(CopyOp                  const& copy_op,
+              Tensor<GEngine,GLayout> const& gtensor,
+              SLayout                 const& slayout,
+              CTA_Tiler               const& cta_tiler,
+              Cluster_Size            const& cluster_size)
+{ // Compile-time dispatch on CopyOp: im2col ops go to make_im2col_tma_copy, all others to detail::make_tma_copy_tiled
+  if constexpr (cute::is_same_v<CopyOp, SM90_TMA_LOAD_IM2COL> ||
+                cute::is_same_v<CopyOp, SM90_TMA_STORE_IM2COL>) {
+    return make_im2col_tma_copy(copy_op,
+                                gtensor,
+                                slayout,
+                                cta_tiler,
+                                cluster_size);
+  } else {
+    auto cta_v_tile = make_identity_layout(shape(gtensor)).compose(cta_tiler);  // identity layout over gtensor's shape, composed with the CTA tiler
+    auto cta_t_tile = make_layout(cluster_size);                                // layout built from cluster_size; passed below as the CTA "T" map
+    // Prefer TmaInternalType if specified. Fallback to GEngine::value_type
+    using TmaType = conditional_t<is_same<void, TmaInternalType>::value, typename GEngine::value_type, TmaInternalType>;
+    return detail::make_tma_copy_tiled<TmaType>(copy_op,
+                                                gtensor, slayout,
+                                                cta_t_tile, cta_v_tile);
+  }
+}
+
+// Explicit defaulting: CTA tiler = product_each(shape(slayout)), cluster size = Int<1>
+template <class CopyOp,
+          class GEngine, class GLayout,
+          class SLayout>
+CUTE_HOST_RTC
+auto
+make_tma_copy(CopyOp                  const& copy_op,
+              Tensor<GEngine,GLayout> const& gtensor,
+              SLayout                 const& slayout)
+{
+  return make_tma_copy(copy_op, gtensor, slayout, product_each(shape(slayout)), Int<1>{});  // forwards to the 5-argument overload
+}
+
+// Explicit defaulting: CTA tiler = product_each(shape(slayout)), cluster size supplied by the caller
+template <class CopyOp,
+          class GEngine, class GLayout,
+          class SLayout,
+          class Cluster_Size>
+CUTE_HOST_RTC
+auto
+make_tma_copy(CopyOp                  const& copy_op,
+              Tensor<GEngine,GLayout> const& gtensor,
+              SLayout                 const& slayout,
+              Cluster_Size            const& cluster_size)
+{
+  return make_tma_copy(copy_op, gtensor, slayout, product_each(shape(slayout)), cluster_size);  // forwards to the 5-argument overload
+}
+
+////////////////////////////////////
+// Experimental Make TMA Atom and Partitioner
+///////////////////////////////////
+
+template <class TmaInternalType = void,  // void => fall back to GEngine::value_type as the TMA element type
+          class CopyOp,
+          class GEngine, class GLayout,
+          class SLayout,
+          class CTA_Tiler,
+          class Cluster_Size = Int<1>>
+CUTE_HOST_RTC
+auto
+make_tma_atom(CopyOp                  const& copy_op,
+              Tensor<GEngine,GLayout> const& gtensor,
+              SLayout                 const& slayout,
+              CTA_Tiler               const& cta_tiler,
+              Cluster_Size            const& cluster_size = {})
+{ // Returns the result of detail::make_tma_copy_atom; unlike make_tma_copy there is no im2col dispatch here
+  auto cta_v_tile = make_identity_layout(shape(gtensor)).compose(cta_tiler);  // identity layout over gtensor's shape, composed with the CTA tiler
+  // Prefer TmaInternalType if specified. Fallback to GEngine::value_type
+  using TmaType = conditional_t<is_same<void, TmaInternalType>::value, typename GEngine::value_type, TmaInternalType>;
+  return detail::make_tma_copy_atom<TmaType>(copy_op,
+                                             gtensor, slayout,
+                                             size(cluster_size), cta_v_tile);  // only the size of cluster_size is forwarded
+}
+
+// The "VectorCopy Partitioner" for TMA: returns a tuple (GMEM partition, SMEM partition)
+template <class... Args,
+          class CtaCoord,
+          class TShape, class TStride,
+          class SEngine, class SLayout,
+          class GEngine, class GLayout>
+CUTE_DEVICE
+auto
+tma_partition(Copy_Atom<Args...>      const& copy_atom,   // only its type is used below (Copy_Atom<Args...>::NumValSrc)
+              CtaCoord                const& cta_coord,
+              Layout<TShape,TStride>  const& cta_layout,  // T: CTA coord -> logical multicast id
+              Tensor<SEngine,SLayout> const& stensor,     // SMEM Tensor (TMATile, Rest...)
+              Tensor<GEngine,GLayout> const& gtensor)     // GMEM Tensor (TMATile, Rest...)
+{
+  CUTE_STATIC_ASSERT_V(size<0>(stensor) == size<0>(gtensor));  // mode-0 (the TMA tile) must have equal size in SMEM and GMEM
+
+  // Invert the smem to get the largest contiguous vector in the smem layout
+  Layout inv_smem_layout = right_inverse(get_nonswizzle_portion(layout<0>(stensor)));
+  // Scale that up to cover all of the smem_coords
+  Layout layout_v = tile_to_shape(make_layout(inv_smem_layout), size<0>(stensor));
+
+  // Factor out the single-instruction portion (NumValSrc values per TMA instruction)
+  Layout tma_layout_v = make_layout(Int<Copy_Atom<Args...>::NumValSrc>{});
+  auto layout_V = make_tile(logical_divide(layout_v, tma_layout_v));
+
+  // Append with _ until we cover all Rest... modes
+  auto glayout_V = append<GLayout::rank>(layout_V, _);
+  auto slayout_V = append<SLayout::rank>(layout_V, _);
+  // Transform tile mode and coalesce
+  Tensor gtensor_v = coalesce(gtensor.compose(glayout_V), Shape<Shape<_1,_1>>{});    // ((TMA,TMA_Iter), Rest...)
+  Tensor stensor_v = coalesce(stensor.compose(slayout_V), Shape<Shape<_1,_1>>{});    // ((TMA,TMA_Iter), Rest...)
+
+#if 0
+  if (thread0()) {
+    print("cta_coord  : "); print(cta_coord); print("\n");
+    print("cta_layout : "); print(cta_layout); print("\n");
+    print("gtensor   : "); print(gtensor); print("\n");
+    print("stensor   : "); print(stensor); print("\n");
+    print("layout_V  : "); print(layout_V); print("\n");
+    print("gtensor_v : "); print(gtensor_v); print("\n");
+    print("stensor_v : "); print(stensor_v); print("\n");
+  }
+#endif
+
+  // Offset inside the TMA-mode for the multicast: multicast id * (TMA vector size / cosize of cta_layout)
+  auto multicast_offset = cta_layout(cta_coord) * (size(tma_layout_v) / cosize(cta_layout));
+  auto multicast_coord  = make_coord(make_coord(multicast_offset, Int<0>{}));
+  auto gcoord = append<GLayout::rank>(multicast_coord, Int<0>{});  // zero offset in every Rest... mode
+  auto scoord = append<SLayout::rank>(multicast_coord, Int<0>{});  // zero offset in every Rest... mode
+
+  Tensor gresult = domain_offset(gcoord, gtensor_v);
+  Tensor sresult = domain_offset(scoord, stensor_v);
+
+  return cute::make_tuple(gresult, sresult);
+}
+
+// TMA Multicast Masks Calculation: one bit per CTA along mode `Mode`, returned as a uint16_t
+template <int Mode, class CtaLayout, class CtaCoord>
+CUTE_HOST_DEVICE constexpr
+auto
+create_tma_multicast_mask(CtaLayout const& cta_layout_vmnk,
+                          CtaCoord  const& cta_coord_vmnk)
+{
+  auto cta_coord_slicer = replace<Mode>(cta_coord_vmnk, _);  // keep this CTA's coord, but leave mode `Mode` free
+  auto [cta_layout, elected_cta] = slice_and_offset(cta_coord_slicer, cta_layout_vmnk);
+  // Get the instruction code: OR together one bit for every value of the sliced layout
+  uint16_t mcast_mask = 0;
+  for (int i = 0; i < size(cta_layout); ++i) {
+    mcast_mask |= uint16_t(1) << cta_layout(i);
+  }
+  // Shift by the instruction's elected block rank (dynamic)
+  mcast_mask <<= elected_cta;
+  return mcast_mask;
+}
+
+////////////////////////////////////
+// Make TMA copy A/B/C
+///////////////////////////////////
+
+template <class TmaInternalType = void,  // void => fall back to GEngine::value_type as the TMA element type
+          class CopyOp,
+          class GEngine, class GLayout,
+          class SLayout,
+          class CTA_Tiler,
+          class Cluster_Size>
+CUTE_HOST_RTC
+auto
+make_tma_copy_A_sm90(CopyOp                  const& copy_op,
+                     Tensor<GEngine,GLayout> const& gtensor,
+                     SLayout                 const& slayout,
+                     CTA_Tiler               const& cta_tiler,
+                     Cluster_Size            const& cluster_size)
+{
+  // Keep only MK modes from MNK (drop mode 1 of the tiler)
+  auto cta_tiler_mk = remove<1>(cta_tiler);
+
+  // mcast along N mode for this M load, if any (mode-1 size of cluster_size)
+  auto cluster_size_n = size<1>(cluster_size);
+
+  if constexpr (cute::is_same_v<CopyOp, SM90_TMA_LOAD_IM2COL>) {  // only the im2col LOAD op is special-cased here
+    return make_im2col_tma_copy(copy_op,
+                                gtensor,
+                                slayout,
+                                cta_tiler_mk,
+                                cluster_size_n);
+  } else {
+    auto cta_v_tile = make_identity_layout(shape(gtensor)).compose(cta_tiler_mk);
+    auto cta_t_tile = make_layout(cluster_size_n);
+
+    // Prefer TmaInternalType if specified. Fallback to GEngine::value_type
+    using TmaType = conditional_t<is_same<void, TmaInternalType>::value, typename GEngine::value_type, TmaInternalType>;
+    auto tma_copy = detail::make_tma_copy_tiled<TmaType>(copy_op, gtensor, slayout, cta_t_tile, cta_v_tile);
+    return tma_copy;
+  }
+}
+
+template <class TmaInternalType = void,  // void => fall back to GEngine::value_type as the TMA element type
+          class CopyOp,
+          class GEngine, class GLayout,
+          class SLayout,
+          class CTA_Tiler,
+          class Cluster_Size>
+CUTE_HOST_RTC
+auto
+make_tma_copy_B_sm90(CopyOp                  const& copy_op,
+                     Tensor<GEngine,GLayout> const& gtensor,
+                     SLayout                 const& slayout,
+                     CTA_Tiler               const& cta_tiler,
+                     Cluster_Size            const& cluster_size)
+{
+  // Keep only NK modes from MNK (drop mode 0 of the tiler)
+  auto cta_tiler_nk = remove<0>(cta_tiler);
+
+  // mcast along M mode for this N load, if any (mode-0 size of cluster_size)
+  auto cluster_size_m = size<0>(cluster_size);
+
+  if constexpr (cute::is_same_v<CopyOp, SM90_TMA_LOAD_IM2COL>) {  // only the im2col LOAD op is special-cased here
+    return make_im2col_tma_copy(copy_op,
+                                gtensor,
+                                slayout,
+                                cta_tiler_nk,
+                                cluster_size_m);
+  } else {
+    auto cta_v_tile = make_identity_layout(shape(gtensor)).compose(cta_tiler_nk);
+    auto cta_t_tile = make_layout(cluster_size_m);
+
+    // Prefer TmaInternalType if specified. Fallback to GEngine::value_type
+    using TmaType = conditional_t<is_same<void, TmaInternalType>::value, typename GEngine::value_type, TmaInternalType>;
+    auto tma_copy = detail::make_tma_copy_tiled<TmaType>(copy_op, gtensor, slayout, cta_t_tile, cta_v_tile);
+    return tma_copy;
+  }
+}
+
+template <class TmaInternalType = void,  // void => fall back to GEngine::value_type as the TMA element type
+          class CopyOp,
+          class GEngine, class GLayout,
+          class SLayout,
+          class CTA_Tiler>
+CUTE_HOST_RTC
+auto
+make_tma_copy_C_sm90(CopyOp                  const& copy_op,
+                     Tensor<GEngine,GLayout> const& gtensor,
+                     SLayout                 const& slayout,
+                     CTA_Tiler               const& cta_tiler)
+{
+  // Keep only MN modes from MNK (drop mode 2 of the tiler)
+  auto cta_tiler_mn = remove<2>(cta_tiler);
+
+  if constexpr (cute::is_same_v<CopyOp, SM90_TMA_LOAD_IM2COL> ||
+      cute::is_same_v<CopyOp, SM90_TMA_STORE_IM2COL>) {
+    return make_im2col_tma_copy(copy_op,
+                                gtensor,
+                                slayout,
+                                cta_tiler_mn,
+                                _1{});  // cluster size fixed to 1 for C
+  } else {
+    auto cta_v_tile = make_identity_layout(shape(gtensor)).compose(cta_tiler_mn);
+
+    // No multicast, so only 1 CTA involved
+    auto cta_t_map = Layout<_1,_0>{};
+
+    // Prefer TmaInternalType if specified. Fallback to GEngine::value_type
+    using TmaType = conditional_t<is_same<void, TmaInternalType>::value, typename GEngine::value_type, TmaInternalType>;
+    auto tma_copy = detail::make_tma_copy_tiled<TmaType>(copy_op, gtensor, slayout, cta_t_map, cta_v_tile);
+    return tma_copy;
+  }
+}
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm90_tma_swizzle.hpp b/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm90_tma_swizzle.hpp
new file mode 100755
index 000000000..3286e72b3
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm90_tma_swizzle.hpp
@@ -0,0 +1,93 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+/// @file copy_traits_sm90_tma_swizzle.hpp
+/// @brief Functions for converting swizzle layout to TMA descriptor
+
+#if !defined(__CUDACC_RTC__)
+#include <cuda.h>
+#endif
+
+#include <cute/arch/copy_sm90_desc.hpp>
+#include <cute/swizzle_layout.hpp>
+
+namespace cute::detail {
+
+template <int B, int M, int S>  // Maps Swizzle<B,M,S> to a TMA::SmemSwizzleBits value; only M == 4 is accepted
+CUTE_HOST_DEVICE constexpr
+TMA::SmemSwizzleBits
+get_tma_swizzle_bits(Swizzle<B,M,S>)
+{
+  if constexpr (M == 4) {
+    switch (B) {  // B is a template constant; the static_assert under 'default' is checked at compile time
+      default:  static_assert(0 <= B && B <= 3, "Expected B = 0,1,2, or 3 when M == 4. Unsupported layout swizzle.");
+      case 3:   return TMA::SmemSwizzleBits::B128;
+      case 2:   return TMA::SmemSwizzleBits::B64;
+      case 1:   return TMA::SmemSwizzleBits::B32;
+      case 0:   return TMA::SmemSwizzleBits::DISABLE;
+    }
+  } else
+  {
+    static_assert(M < 0, "Unsupported layout swizzle.");  // reached only when M != 4; NOTE(review): presumably relies on M >= 0 so this always fails -- confirm
+  }
+}
+
+template <class Layout>  // Layout overload: extracts the swizzle with get_swizzle_portion and defers to the Swizzle<B,M,S> overload
+TMA::SmemSwizzleBits     // NOTE(review): unlike the Swizzle overload, this one carries no CUTE_HOST_DEVICE / constexpr
+get_tma_swizzle_bits(Layout const& layout)
+{
+  return get_tma_swizzle_bits(get_swizzle_portion(layout));
+}
+
+template <int B, int M, int S>  // Maps Swizzle<B,M,S> to a TMA::SmemSwizzleBase value; only M == 4 with S == 3 is accepted
+CUTE_HOST_DEVICE constexpr
+TMA::SmemSwizzleBase
+get_tma_swizzle_base(Swizzle<B,M,S>)
+{
+  if constexpr (M == 4) {
+    static_assert(0 <= B && B <= 3, "Expected B = 0,1,2, or 3 when M == 4. Unsupported layout swizzle.");
+    static_assert(S == 3, "Expected S = 3 when M == 4. Unsupported layout swizzle.");
+    return TMA::SmemSwizzleBase::SWIZZLE_BASE_16B;
+  } 
+  else {
+    static_assert(M == 4, "Expected 128b=16B=(2^4)B base swizzle.");  // always fails in this branch, since M != 4 here
+  }
+}
+
+template <class Layout>  // Layout overload: extracts the swizzle with get_swizzle_portion and defers to the Swizzle<B,M,S> overload
+TMA::SmemSwizzleBase     // NOTE(review): unlike the Swizzle overload, this one carries no CUTE_HOST_DEVICE / constexpr
+get_tma_swizzle_base(Layout const& layout)
+{
+  return get_tma_swizzle_base(get_swizzle_portion(layout));
+}
+
+} // namespace cute::detail
diff --git a/lightllm-kernel/cutlass/include/cute/atom/mma_atom.hpp b/lightllm-kernel/cutlass/include/cute/atom/mma_atom.hpp
new file mode 100755
index 000000000..bf4082743
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/atom/mma_atom.hpp
@@ -0,0 +1,1117 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>
+#include <cute/arch/mma.hpp>
+#include <cute/atom/mma_traits.hpp>
+#include <cute/tensor_impl.hpp>
+#include <cute/util/type_traits.hpp>
+
+namespace cute {
+
+template <class... Args>
+struct MMA_Atom;  // primary template: declared only, the usable forms are the specializations below
+
+template <class MMAOperation>
+struct MMA_Atom<MMAOperation> : MMA_Atom<MMA_Traits<MMAOperation>>  // a bare operation type forwards to the MMA_Traits-based specialization
+{};
+
+template <class MMAOperation, class... Args>  // MMA_Atom over an MMA_Traits: inherits the traits and adds call/fragment helpers
+struct MMA_Atom<MMA_Traits<MMAOperation, Args...>>
+  : MMA_Traits<MMAOperation, Args...>
+{
+  using MMA_Op = MMAOperation;
+  using Traits = MMA_Traits<MMAOperation, Args...>;
+
+  // Element value types from the MMA_Traits
+  using ValTypeD = typename Traits::ValTypeD;
+  using ValTypeA = typename Traits::ValTypeA;
+  using ValTypeB = typename Traits::ValTypeB;
+  using ValTypeC = typename Traits::ValTypeC;
+
+  // Thr-Val layouts from the MMA_Traits
+  using Shape_MNK  = typename Traits::Shape_MNK;
+  using ThrID      = typename Traits::ThrID;
+  using LayoutC_TV = typename Traits::CLayout;
+  using LayoutA_TV = typename Traits::ALayout;
+  using LayoutB_TV = typename Traits::BLayout;
+
+  // Fragment value types from the MMA_Traits (optional, defaults to Val type)
+  using FrgTypeD = typename detail::FrgTypeC_or_Default<Traits>::type;  // note: D reuses the C fragment type
+  using FrgTypeA = typename detail::FrgTypeA_or_Default<Traits>::type;
+  using FrgTypeB = typename detail::FrgTypeB_or_Default<Traits>::type;
+  using FrgTypeC = typename detail::FrgTypeC_or_Default<Traits>::type;
+
+  // Additional Trait parameters/transformations: returns a new MMA_Atom built from Traits::with(args...)
+  template <class... TraitsArgs>
+  CUTE_HOST_DEVICE
+  auto
+  with(TraitsArgs&&... args) const {
+    auto traits = Traits::with(static_cast<TraitsArgs&&>(args)...);
+    return MMA_Atom<decltype(traits)>{traits};
+  }
+
+  //
+  // Tensor call interfaces
+  //
+
+  // Check that all four tensors are rank-1, then hand them to mma_unpack with this atom's Traits
+  template <class TD, class DLayout,
+            class TA, class ALayout,
+            class TB, class BLayout,
+            class TC, class CLayout>
+  CUTE_HOST_DEVICE constexpr
+  void
+  call(Tensor<TD, DLayout>      & D,
+       Tensor<TA, ALayout> const& A,
+       Tensor<TB, BLayout> const& B,
+       Tensor<TC, CLayout> const& C) const
+  {
+    static_assert(DLayout::rank == 1, "Expected rank-1 D tensor");
+    static_assert(ALayout::rank == 1, "Expected rank-1 A tensor");
+    static_assert(BLayout::rank == 1, "Expected rank-1 B tensor");
+    static_assert(CLayout::rank == 1, "Expected rank-1 C tensor");
+
+    return mma_unpack(static_cast<Traits const&>(*this), D, A, B, C);
+  }
+
+  // Three-argument form: C is passed as both the output D and the input C
+  template <class TA, class ALayout,
+            class TB, class BLayout,
+            class TC, class CLayout>
+  CUTE_HOST_DEVICE constexpr
+  void
+  call(Tensor<TA, ALayout> const& A,
+       Tensor<TB, BLayout> const& B,
+       Tensor<TC, CLayout>      & C) const
+  {
+    return call(C, A, B, C);
+  }
+
+  //
+  // make_fragment_A|B|C
+  //   These functions are awkward as they expect already-partitioned tensors
+  //     resulting from a previous call to partition_A|B|C
+  //   The reasoning is that we can inspect the layout of the partitioned data
+  //     and attempt to match it in generated fragment to promote vectorization
+  //     when copying from partition to fragment.
+  //
+
+  template <class CTensor>
+  CUTE_HOST_DEVICE static constexpr
+  auto
+  make_fragment_C(CTensor&& ctensor)
+  {
+    // Check that this tensor is likely already partitioned
+    CUTE_STATIC_ASSERT_V(rank(ctensor) >= Int<3>{});  // VMN
+    CUTE_STATIC_ASSERT_V(size<0>(ctensor) == size<1>(LayoutC_TV{}));
+    // C is a bit special because we are after accumulators here
+    // The input/output type doesn't have to match the accumulator type
+    //static_assert(std::is_same<ValTypeC, typename remove_cvref_t<CTensor>::value_type>::value, "Expecting ValTypeC type");
+
+    // We'll never base the accumulator layout on the input tensor layout, so just return a FrgTypeC tensor
+    return make_tensor<FrgTypeC>(shape(ctensor));
+  }
+
+  template <class ATensor>
+  CUTE_HOST_DEVICE static constexpr
+  auto
+  make_fragment_A(ATensor&& atensor)
+  {
+    // Check that this tensor is likely already partitioned
+    CUTE_STATIC_ASSERT_V(rank(atensor) >= Int<3>{});  // VMK
+    CUTE_STATIC_ASSERT_V(size<0>(atensor) == size<1>(LayoutA_TV{}));
+
+    if constexpr (has_dereference<FrgTypeA>::value) {
+      // If the intended FrgTypeA is a view (of the current tensor), forward the whole
+      static_assert(is_same<ValTypeA, typename remove_cvref_t<ATensor>::value_type>::value
+                      , "Expecting ValTypeA type");
+      return make_tensor<FrgTypeA>(static_cast<ATensor&&>(atensor));
+    } else {
+      // Else, the intended FrgTypeA is a value type, construct a new tensor with a fragment layout
+      return make_fragment_like<FrgTypeA>(atensor);
+    }
+
+    CUTE_GCC_UNREACHABLE;
+  }
+
+  template <class BTensor>
+  CUTE_HOST_DEVICE static constexpr
+  auto
+  make_fragment_B(BTensor&& btensor)
+  {
+    // Check that this tensor is likely already partitioned
+    CUTE_STATIC_ASSERT_V(rank(btensor) >= Int<3>{});  // VNK
+    CUTE_STATIC_ASSERT_V(size<0>(btensor) == size<1>(LayoutB_TV{}));
+
+    if constexpr (has_dereference<FrgTypeB>::value) {
+      // If the intended FrgTypeB is a view (of the current tensor), forward the whole
+      static_assert(is_same<ValTypeB, typename remove_cvref_t<BTensor>::value_type>::value
+                      , "Expecting ValTypeB type");
+      return make_tensor<FrgTypeB>(static_cast<BTensor&&>(btensor));
+    } else {
+      // Else, the intended FrgTypeB is a value type, construct a new tensor with a fragment layout
+      return make_fragment_like<FrgTypeB>(btensor);
+    }
+
+    CUTE_GCC_UNREACHABLE;
+  }
+};
+
+//
+// A tiling of mma atoms
+//
+
+template <class TiledMMA, class ThrCoord>
+struct ThrMMA;
+
+// @tparam MMA_Atom The MMA_Atom to use in the TiledMMA
+// @tparam AtomLayoutMNK The MNK-tiling of the Atom to be performed.
+// @tparam PermutationMNK Permutations to apply to each MNK-mode before tiling for the Atom.
+template <class MMA_Atom,
+          class AtomLayoutMNK,
+          class PermutationMNK = Tile<Underscore,Underscore,Underscore>>
+struct TiledMMA : MMA_Atom
+{
+  using Atom           = MMA_Atom;
+  using AtomShape_MNK  = typename MMA_Atom::Shape_MNK;
+  using AtomThrID      = typename MMA_Atom::ThrID;
+  using AtomLayoutC_TV = typename MMA_Atom::LayoutC_TV;
+  using AtomLayoutA_TV = typename MMA_Atom::LayoutA_TV;
+  using AtomLayoutB_TV = typename MMA_Atom::LayoutB_TV;
+
+  static_assert(   rank_v<AtomLayoutMNK>  == 3,   "TiledMMA requires rank-3 AtomLayoutMNK");
+  static_assert(   rank_v<PermutationMNK> == 3,   "TiledMMA requires rank-3 PermutationMNK");
+  static_assert( is_tuple<PermutationMNK>::value, "TiledMMA requires independent permutations of MNK.");
+  static_assert(is_static<PermutationMNK>::value, "TiledMMA requires static permutations of MNK.");
+
+  using ThrLayoutVMNK = decltype(tiled_product(AtomThrID{}, AtomLayoutMNK{}));
+  ThrLayoutVMNK thr_layout_vmnk_;
+
+  CUTE_HOST_DEVICE constexpr
+  TiledMMA(MMA_Atom const& mma_atom = {}, AtomLayoutMNK const& thr_layout_mnk = {})
+    : MMA_Atom(mma_atom),
+      thr_layout_vmnk_(tiled_product(AtomThrID{}, thr_layout_mnk)) {}
+
+  CUTE_HOST_DEVICE constexpr auto
+  get_thr_layout_vmnk() const {
+    return thr_layout_vmnk_;
+  }
+
+  // Tile a tensor or a layout from shape
+  //   (M,N,...)
+  // to shape
+  //   ((ThrV,(ThrM,ThrN)),(FrgV,(RestM,RestN,...)))
+  // where
+  //   ThrV:  The threads local to an MMA. layout<0>(ThrLayoutVMNK): ThrV -> thread_idx
+  //   ThrM:  The threads tiled in M.      layout<1>(ThrLayoutVMNK): ThrM -> thread_idx
+  //   ThrN:  The threads tiled in N.      layout<2>(ThrLayoutVMNK): ThrN -> thread_idx
+  //   FrgV:  The values local to an MMA.
+  //   RestM: The values tiled in M.
+  //   RestN: The values tiled in N.
+  template <class CTensor>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  thrfrg_C(CTensor&& ctensor) const
+  {
+    CUTE_STATIC_ASSERT_V(rank(ctensor) >= Int<2>{});
+    // Reorder the tensor for the TiledAtom
+    auto t_tile = make_tile(permutation_mnk<0>(),
+                            permutation_mnk<1>());
+    auto t_tensor = logical_divide(ctensor, t_tile);                 // (PermM,PermN)
+
+    // Tile the tensor for the Atom
+    auto a_tile = make_tile(make_layout(size<0>(AtomShape_MNK{})),
+                            make_layout(size<1>(AtomShape_MNK{})));
+    auto a_tensor = zipped_divide(t_tensor, a_tile);                 // ((AtomM,AtomN),(RestM,RestN))
+
+    // Transform the Atom mode from (M,N) to (Thr,Val)
+    auto tv_tensor = a_tensor.compose(AtomLayoutC_TV{},_);           // ((ThrV,FrgV),(RestM,RestN))
+
+    // Tile the tensor for the C-threads
+    auto thr_tile = make_tile(_,
+                              make_tile(make_layout(size<1>(thr_layout_vmnk_)),
+                                        make_layout(size<2>(thr_layout_vmnk_))));
+    auto thr_tensor = zipped_divide(tv_tensor, thr_tile);            // ((ThrV,(ThrM,ThrN)),(FrgV,(RestM,RestN)))
+
+    return thr_tensor;
+  }
+
+  // Tile a tensor or a layout from shape
+  //   (M,K,...)
+  // to shape
+  //   ((ThrV,(ThrM,ThrK)),(FrgV,(RestM,RestK,...)))
+  // where
+  //   ThrV: The threads local to an MMA. layout<0>(ThrLayoutVMNK): ThrV -> thread_idx
+  //   ThrM: The threads tiled in M.      layout<1>(ThrLayoutVMNK): ThrM -> thread_idx
+  //   ThrK: The threads tiled in K.      layout<3>(ThrLayoutVMNK): ThrK -> thread_idx
+  //   FrgV:  The values local to an MMA.
+  //   RestM: The values tiled in M.
+  //   RestK: The values tiled in K.
+  template <class ATensor>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  thrfrg_A(ATensor&& atensor) const
+  {
+    CUTE_STATIC_ASSERT_V(rank(atensor) >= Int<2>{});
+    // Reorder the tensor for the TiledAtom
+    auto t_tile = make_tile(permutation_mnk<0>(),
+                            permutation_mnk<2>());
+    auto t_tensor = logical_divide(atensor, t_tile);                 // (PermM,PermK)
+
+    // Tile the tensor for the Atom
+    auto a_tile = make_tile(make_layout(size<0>(AtomShape_MNK{})),
+                            make_layout(size<2>(AtomShape_MNK{})));
+    auto a_tensor = zipped_divide(t_tensor, a_tile);                 // ((AtomM,AtomK),(RestM,RestK))
+
+    // Transform the Atom mode from (M,K) to (Thr,Val)
+    auto tv_tensor = a_tensor.compose(AtomLayoutA_TV{},_);           // ((ThrV,FrgV),(RestM,RestK))
+
+    // Tile the tensor for the Thread
+    auto thr_tile = make_tile(_,
+                              make_tile(make_layout(size<1>(thr_layout_vmnk_)),
+                                        make_layout(size<3>(thr_layout_vmnk_))));
+    auto thr_tensor = zipped_divide(tv_tensor, thr_tile);            // ((ThrV,(ThrM,ThrK)),(FrgV,(RestM,RestK)))
+
+    return thr_tensor;
+  }
+
+  // Tile a tensor or a layout from shape
+  //   (N,K,...)
+  // to shape
+  //   ((ThrV,(ThrN,ThrK)),(FrgV,(RestN,RestK,...)))
+  // where
+  //   ThrV: The threads local to an MMA. layout<0>(ThrLayoutVMNK): ThrV -> thread_idx
+  //   ThrN: The threads tiled in N.      layout<2>(ThrLayoutVMNK): ThrN -> thread_idx
+  //   ThrK: The threads tiled in K.      layout<3>(ThrLayoutVMNK): ThrK -> thread_idx
+  //   FrgV:  The values local to an MMA.
+  //   RestN: The values tiled in N.
+  //   RestK: The values tiled in K.
+  template <class BTensor>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  thrfrg_B(BTensor&& btensor) const
+  {
+    CUTE_STATIC_ASSERT_V(rank(btensor) >= Int<2>{});
+    // Reorder the N and K modes of the tensor by the tile permutations (modes 1 and 2 of the MNK permutation)
+    auto t_tile = make_tile(permutation_mnk<1>(),
+                            permutation_mnk<2>());
+    auto t_tensor = logical_divide(btensor, t_tile);                 // (PermN,PermK)
+
+    // Tile the tensor for the Atom: split off one atom-sized (N,K) block
+    auto b_tile = make_tile(make_layout(size<1>(AtomShape_MNK{})),
+                            make_layout(size<2>(AtomShape_MNK{})));
+    auto b_tensor = zipped_divide(t_tensor, b_tile);                 // ((AtomN,AtomK),(RestN,RestK))
+
+    // Transform the Atom mode from (N,K) to (Thr,Val)
+    auto tv_tensor = b_tensor.compose(AtomLayoutB_TV{},_);           // ((ThrV,FrgV),(RestN,RestK))
+
+    // Tile the tensor for the Thread: divide the rest modes by the N and K extents of thr_layout_vmnk_
+    auto thr_tile = make_tile(_,
+                              make_tile(make_layout(size<2>(thr_layout_vmnk_)),
+                                        make_layout(size<3>(thr_layout_vmnk_))));
+    auto thr_tensor = zipped_divide(tv_tensor, thr_tile);            // ((ThrV,(ThrN,ThrK)),(FrgV,(RestN,RestK)))
+
+    return thr_tensor;
+  }
+
+  template <class ThrIdx,
+            __CUTE_REQUIRES(is_integral<ThrIdx>::value)>               // only enabled for integral thread indices
+  CUTE_HOST_DEVICE constexpr
+  auto
+  get_slice(ThrIdx const& thr_idx) const                               // returns the ThrMMA view for thread thr_idx
+  {
+    auto thr_vmnk = thr_layout_vmnk_.get_flat_coord(thr_idx);          // thr_idx -> flat coordinate in thr_layout_vmnk_
+    return ThrMMA<TiledMMA, decltype(thr_vmnk)>{*this, thr_vmnk};      // aggregate-init: TiledMMA base, then thr_vmnk_
+  }
+
+  template <class ThrIdx,
+            __CUTE_REQUIRES(is_integral<ThrIdx>::value)>               // only enabled for integral thread indices
+  CUTE_HOST_DEVICE constexpr
+  auto
+  get_thread_slice(ThrIdx const& thr_idx) const                        // alternate name for get_slice(thr_idx)
+  {
+    return get_slice(thr_idx);                                         // pure forwarding, no extra work
+  }
+
+  //
+  // Utility for printing and visualization
+  //
+
+  // The permutation applied to the MNK-mode data
+  template <int I>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  permutation_mnk() const {
+    static_assert(0 <= I && I < 3);
+    auto perm = get<I>(PermutationMNK{});
+    return conditional_return(is_underscore<decltype(perm)>{}, size<I>(AtomShape_MNK{}) * size<I+1>(get_thr_layout_vmnk()), perm);
+  }
+
+  // The size of the MNK-mode
+  template <int I>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  tile_size_mnk() const {
+    static_assert(0 <= I && I < 3);
+    return size(permutation_mnk<I>());
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  auto
+  get_layoutC_MN() const
+  {
+    // (M,N) -> (M,N)
+    auto ref_C = make_layout(make_shape(tile_size_mnk<0>(), tile_size_mnk<1>()));
+    // (cthrid,val) -> (M,N)
+    auto layoutC_TV = thrfrg_C(ref_C);
+    // (M,N) -> (cthrid,frg)
+    auto layoutC_MN = right_inverse(layoutC_TV).with_shape(shape(ref_C));
+
+    // cthrid = (v,m,n) -> thr_idx
+    auto thrID_C = thr_layout_vmnk_(_,_,_,Int<0>{});
+
+    return cute::make_tuple(layoutC_MN, thrID_C);
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  auto
+  get_layoutC_TV() const
+  {
+    // (M,N) -> (M,N)
+    auto ref_C = make_layout(make_shape(tile_size_mnk<0>(), tile_size_mnk<1>()));
+    // (cthrid,val) -> (M,N)
+    auto layoutC_TV = thrfrg_C(ref_C);
+
+    // thr_idx -> (ThrV,ThrM,ThrN,ThrK)
+    auto thridx_2_thrid = right_inverse(thr_layout_vmnk_);
+
+    // (thr_idx,val) -> (M,N)
+    return layoutC_TV.compose(thridx_2_thrid, _);
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  auto
+  get_layoutA_MK() const
+  {
+    // (M,K) -> (M,K)
+    auto ref_A = make_layout(make_shape(tile_size_mnk<0>(), tile_size_mnk<2>()));
+    // (athrid,val) -> (M,K)
+    auto layoutA_TV = thrfrg_A(ref_A);
+    // (M,K) -> (athrid,frg)
+    auto layoutA_MK = right_inverse(layoutA_TV).with_shape(shape(ref_A));
+
+    // athrid = (v,m,k) -> thr_idx
+    auto thrID_A = thr_layout_vmnk_(_,_,Int<0>{},_);
+
+    return cute::make_tuple(layoutA_MK, thrID_A);
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  auto
+  get_layoutA_TV() const                                             // returns the layout (thr_idx,val) -> (M,K)
+  {
+    // Reference layout over one tile of A: (M,K) -> (M,K)
+    auto ref_A = make_layout(make_shape(tile_size_mnk<0>(), tile_size_mnk<2>()));
+    // (athrid,val) -> (M,K)
+    auto layoutA_TV = thrfrg_A(ref_A);
+
+    // (ThrV,(ThrM,ThrK)) -> (ThrV,(ThrM,ThrN,ThrK))   [the ThrN mode has stride 0]
+    auto atile = make_tile(_,
+                           make_tile(make_layout(make_shape (size<1>(thr_layout_vmnk_), size<2>(thr_layout_vmnk_)),
+                                                 make_stride(               Int<1>{} ,                Int<0>{} )),
+                                     _));
+
+    // thr_idx -> (ThrV,ThrM,ThrN,ThrK)
+    auto thridx_2_thrid = right_inverse(thr_layout_vmnk_);
+
+    // (thr_idx,val) -> (M,K); reuse layoutA_TV rather than calling thrfrg_A(ref_A) a second time
+    return layoutA_TV.compose(atile, _).compose(thridx_2_thrid, _);
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  auto
+  get_layoutB_NK() const
+  {
+    // (N,K) -> (N,K)
+    auto ref_B = make_layout(make_shape(tile_size_mnk<1>(), tile_size_mnk<2>()));
+    // (bthrid,val) -> (N,K)
+    auto layoutB_TV = thrfrg_B(ref_B);
+    // (N,K) -> (bthrid,frg)
+    auto layoutB_NK = right_inverse(layoutB_TV).with_shape(shape(ref_B));
+
+    // bthrid = (v,n,k) -> thr_idx
+    auto thrID_B = thr_layout_vmnk_(_,Int<0>{},_,_);
+
+    return cute::make_tuple(layoutB_NK, thrID_B);
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  auto
+  get_layoutB_TV() const                                             // returns the layout (thr_idx,val) -> (N,K)
+  {
+    // Reference layout over one tile of B: (N,K) -> (N,K)
+    auto ref_B = make_layout(make_shape(tile_size_mnk<1>(), tile_size_mnk<2>()));
+    // (bthrid,val) -> (N,K)
+    auto layoutB_TV = thrfrg_B(ref_B);
+
+    // (ThrV,(ThrN,ThrK)) -> (ThrV,(ThrM,ThrN,ThrK))   [the ThrM mode has stride 0]
+    auto btile = make_tile(_,
+                           make_tile(make_layout(make_shape (size<1>(thr_layout_vmnk_), size<2>(thr_layout_vmnk_)),
+                                                 make_stride(               Int<0>{} ,                Int<1>{} )),
+                                     _));
+
+    // thr_idx -> (ThrV,ThrM,ThrN,ThrK)
+    auto thridx_2_thrid = right_inverse(thr_layout_vmnk_);
+
+    // (thr_idx,val) -> (N,K); reuse layoutB_TV rather than calling thrfrg_B(ref_B) a second time
+    return layoutB_TV.compose(btile, _).compose(thridx_2_thrid, _);
+  }
+};
+
+template <class TiledMMA, class ThrVMNK>
+struct ThrMMA : TiledMMA
+{
+  ThrVMNK thr_vmnk_;
+
+  template <class CTensor>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  partition_C(CTensor&& ctensor) const
+  {
+    auto thr_tensor = make_tensor(static_cast<CTensor&&>(ctensor).data(), this->thrfrg_C(ctensor.layout()));
+
+    auto thr_vmn = make_coord(get<0>(thr_vmnk_), make_coord(get<1>(thr_vmnk_), get<2>(thr_vmnk_)));
+    return thr_tensor(thr_vmn, make_coord(_, repeat<rank<1,1>(thr_tensor)>(_)));
+  }
+
+  template <class ATensor>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  partition_A(ATensor&& atensor) const
+  {
+    auto thr_tensor = make_tensor(static_cast<ATensor&&>(atensor).data(), this->thrfrg_A(atensor.layout()));
+
+    auto thr_vmk = make_coord(get<0>(thr_vmnk_), make_coord(get<1>(thr_vmnk_), get<3>(thr_vmnk_)));
+    return thr_tensor(thr_vmk, make_coord(_, repeat<rank<1,1>(thr_tensor)>(_)));
+  }
+
+  template <class BTensor>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  partition_B(BTensor&& btensor) const
+  {
+    auto thr_tensor = make_tensor(static_cast<BTensor&&>(btensor).data(), this->thrfrg_B(btensor.layout()));
+
+    auto thr_vnk = make_coord(get<0>(thr_vmnk_), make_coord(get<2>(thr_vmnk_), get<3>(thr_vmnk_)));
+    return thr_tensor(thr_vnk, make_coord(_, repeat<rank<1,1>(thr_tensor)>(_)));
+  }
+
+  template <class CTensor>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  partition_fragment_C(CTensor&& ctensor) const
+  {
+    return TiledMMA::make_fragment_C(partition_C(ctensor));
+  }
+
+  template <class ATensor>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  partition_fragment_A(ATensor&& atensor) const
+  {
+    return TiledMMA::make_fragment_A(partition_A(atensor));
+  }
+
+  template <class BTensor>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  partition_fragment_B(BTensor&& btensor) const
+  {
+    return TiledMMA::make_fragment_B(partition_B(btensor));
+  }
+};
+
+//
+// These tile the MMA_Atom as a whole
+//
+
+template <class MMA_Op,
+          class MMAThrLayout = Layout<Shape<_1,_1,_1>>,
+          class Permutations = Tile<Underscore,Underscore,Underscore>>
+CUTE_HOST_DEVICE constexpr
+auto
+make_tiled_mma(MMA_Atom<MMA_Op> const& mma_atom,
+               MMAThrLayout     const& thr_layout   = {},
+               Permutations     const& permutations = {})
+{
+  auto thr_layout_mnk  = append<3>(thr_layout, Layout<_1,_0>{});
+  auto permutation_mnk = append<3>(permutations, _);
+
+  return TiledMMA<MMA_Atom<MMA_Op>,
+                  decltype(thr_layout_mnk),
+                  decltype(permutation_mnk)>{mma_atom, thr_layout_mnk};
+}
+
+template <class MMA_Op,
+          class MMAThrLayout = Layout<Shape<_1,_1,_1>>,
+          class Permutations = Tile<Underscore,Underscore,Underscore>>
+CUTE_HOST_DEVICE constexpr
+auto
+make_tiled_mma(MMA_Op       const&,
+               MMAThrLayout const& thr_layout   = {},
+               Permutations const& permutations = {})
+{
+  // Attempt to wrap in an MMA_Atom<> and forward
+  return make_tiled_mma(MMA_Atom<MMA_Op>{}, thr_layout, permutations);
+}
+
+//
+// partition_fragment_C -- static context
+//
+
+template <class... Args, class Shape_MN>
+CUTE_HOST_DEVICE constexpr
+auto
+partition_shape_C(TiledMMA<Args...> const& mma, Shape_MN const& shape_MN)
+{
+  constexpr int R = rank_v<Shape_MN>;
+  static_assert(R >= 2, "Must have at least rank-2");
+  auto atomMNK = typename TiledMMA<Args...>::AtomShape_MNK{};
+  auto thrVMNK = typename TiledMMA<Args...>::ThrLayoutVMNK{};
+  auto V = shape<1>(typename TiledMMA<Args...>::AtomLayoutC_TV{});
+  auto M = shape_div(size<0>(shape_MN), size<0>(atomMNK) * size<1>(thrVMNK));
+  auto N = shape_div(size<1>(shape_MN), size<1>(atomMNK) * size<2>(thrVMNK));
+  return cute::tuple_cat(make_shape(V,M,N), take<2,R>(shape_MN));
+}
+
+template <class... Args, class Shape_MN>
+CUTE_HOST_DEVICE constexpr
+auto
+partition_fragment_C(TiledMMA<Args...> const& mma, Shape_MN const& shapeMN)
+{
+  return make_tensor<typename TiledMMA<Args...>::FrgTypeC>(partition_shape_C(mma, shapeMN));
+}
+
+// partition_fragment_A and partition_fragment_B often depend on the
+//   layout of A and B and/or the thread_idx that is requesting the partition.
+// For these reasons, they should not be used in a static context.
+// See TiledMMA::get_slice(thr_idx).partition_fragment_A(tensorA) instead.
+
+template <class... Args, class Shape_MK>
+CUTE_HOST_DEVICE constexpr
+auto
+partition_shape_A(TiledMMA<Args...> const& mma, Shape_MK const& shape_MK)
+{
+  constexpr int R = rank_v<Shape_MK>;
+  static_assert(R >= 2, "Must have at least rank-2");
+  auto atomMNK = typename TiledMMA<Args...>::AtomShape_MNK{};
+  auto thrVMNK = typename TiledMMA<Args...>::ThrLayoutVMNK{};
+  auto V = shape<1>(typename TiledMMA<Args...>::AtomLayoutA_TV{});
+  auto M = shape_div(size<0>(shape_MK), size<0>(atomMNK) * size<1>(thrVMNK));
+  auto K = shape_div(size<1>(shape_MK), size<2>(atomMNK) * size<3>(thrVMNK));
+  return cute::tuple_cat(make_shape(V,M,K), take<2,R>(shape_MK));
+}
+
+template <class... Args, class Shape_NK>
+CUTE_HOST_DEVICE constexpr
+auto
+partition_shape_B(TiledMMA<Args...> const& mma, Shape_NK const& shape_NK)
+{
+  constexpr int R = rank_v<Shape_NK>;
+  static_assert(R >= 2, "Must have at least rank-2");
+  auto atomMNK = typename TiledMMA<Args...>::AtomShape_MNK{};
+  auto thrVMNK = typename TiledMMA<Args...>::ThrLayoutVMNK{};
+  auto V = shape<1>(typename TiledMMA<Args...>::AtomLayoutB_TV{});
+  auto N = shape_div(size<0>(shape_NK), size<1>(atomMNK) * size<2>(thrVMNK));
+  auto K = shape_div(size<1>(shape_NK), size<2>(atomMNK) * size<3>(thrVMNK));
+  return cute::tuple_cat(make_shape(V,N,K), take<2,R>(shape_NK));
+}
+
+//
+// Size
+//
+
+template <int I, class... Args>
+CUTE_HOST_DEVICE constexpr
+auto
+tile_size(TiledMMA<Args...> const& mma)
+{
+  return mma.template tile_size_mnk<I>();
+}
+
+template <class... Args>
+CUTE_HOST_DEVICE constexpr
+auto
+tile_shape(TiledMMA<Args...> const& mma)
+{
+  return make_shape(tile_size<0>(mma), tile_size<1>(mma), tile_size<2>(mma));
+}
+
+// Deprecate?
+template <int... I, class... Args>
+CUTE_HOST_DEVICE constexpr
+auto
+size(TiledMMA<Args...> const& mma)
+{
+  return size<I...>(mma.get_thr_layout_vmnk());
+}
+
+// Alias
+template <int... I, class... Args>
+CUTE_HOST_DEVICE constexpr
+auto
+thr_size(TiledMMA<Args...> const& mma)
+{
+  return size<I...>(mma.get_thr_layout_vmnk());
+}
+
+//
+// Display utilities
+//
+
+template <class... Args>
+CUTE_HOST_DEVICE
+void
+print(MMA_Atom<MMA_Traits<Args...>> const&)
+{
+  using Atom = MMA_Atom<MMA_Traits<Args...>>;
+  print("MMA_Atom\n");
+  print("  ThrID:      "); print(typename Atom::ThrID{});      print("\n");
+  print("  Shape_MNK:  "); print(typename Atom::Shape_MNK{});  print("\n");
+  print("  LayoutA_TV: "); print(typename Atom::LayoutA_TV{}); print("\n");
+  print("  LayoutB_TV: "); print(typename Atom::LayoutB_TV{}); print("\n");
+  print("  LayoutC_TV: "); print(typename Atom::LayoutC_TV{}); print("\n");
+}
+
+template <class Atom, class TiledThr, class TiledPerm>
+CUTE_HOST_DEVICE
+void
+print(TiledMMA<Atom, TiledThr, TiledPerm> const& mma)
+{
+  print("TiledMMA\n");
+  print("  ThrLayoutVMNK:  "); print(mma.get_thr_layout_vmnk());  print("\n");
+  print("  PermutationMNK: "); print(TiledPerm{}); print("\n");
+  print(static_cast<Atom const&>(mma));
+}
+
+template <class TiledMMA, class ThrVMNK>
+CUTE_HOST_DEVICE
+void
+print(ThrMMA<TiledMMA, ThrVMNK> const& thr_mma)                // prints the thread coordinate, then the TiledMMA base
+{
+  print("ThrMMA\n");
+  print("  Thr VMNK: "); print(thr_mma.thr_vmnk_); print("\n");
+  print(static_cast<TiledMMA const&>(thr_mma));                 // view the base by reference; do not slice-copy it
+}
+
+// MMA Atom to LaTeX TikZ: wraps the atom in a default TiledMMA and forwards `color` to the TiledMMA overload
+template <class... Args, class TikzColorFn = TikzColor_TV>
+CUTE_HOST_DEVICE
+void
+print_latex(MMA_Atom<Args...> const& mma_atom,
+            TikzColorFn color = {})             // lambda(thr_idx,val_idx) -> tikz color string
+{
+  print_latex(make_tiled_mma(mma_atom), color);  // pass the caller's color functor through instead of dropping it
+}
+
+// TiledMMA to LaTeX TikZ: emits the C, A and B layouts of one MMA tile via print_latex_mma, colored by `color`
+template <class... Args, class TikzColorFn = TikzColor_TV>
+CUTE_HOST_DEVICE
+void
+print_latex(TiledMMA<Args...> const& mma,
+            TikzColorFn color = {})             // lambda(thr_idx,val_idx) -> tikz color string
+{
+  auto layout_and_thrid_C = mma.get_layoutC_MN();   // tuple: ((M,N) -> (cthrid,frg) layout, cthrid -> thr_idx layout)
+  auto layoutC_MN = get<0>(layout_and_thrid_C);
+  auto thrID_C    = get<1>(layout_and_thrid_C);
+
+  auto layout_and_thrid_A = mma.get_layoutA_MK();   // tuple: ((M,K) -> (athrid,frg) layout, athrid -> thr_idx layout)
+  auto layoutA_MK = get<0>(layout_and_thrid_A);
+  auto thrID_A    = get<1>(layout_and_thrid_A);
+
+  auto layout_and_thrid_B = mma.get_layoutB_NK();   // tuple: ((N,K) -> (bthrid,frg) layout, bthrid -> thr_idx layout)
+  auto layoutB_NK = get<0>(layout_and_thrid_B);
+  auto thrID_B    = get<1>(layout_and_thrid_B);
+
+  // Forward `color` so a caller-supplied functor is honored rather than replaced by the default
+  print_latex_mma(layoutC_MN, thrID_C,
+                  layoutA_MK, thrID_A,
+                  layoutB_NK, thrID_B, color);
+}
+
+// MNK MMA Layout to LaTeX TikZ
+template <class LayoutC, class ThrIDC,
+          class LayoutA, class ThrIDA,
+          class LayoutB, class ThrIDB,
+          class TikzColorFn = TikzColor_TV>
+CUTE_HOST_DEVICE
+void
+print_latex_mma(LayoutC const& C, ThrIDC const& TC,  // (m,n) -> (tid,vid)  and  tid -> thr_idx
+                LayoutA const& A, ThrIDA const& TA,  // (m,k) -> (tid,vid)  and  tid -> thr_idx
+                LayoutB const& B, ThrIDB const& TB,  // (n,k) -> (tid,vid)  and  tid -> thr_idx
+                TikzColorFn color = {})              // lambda(thr_idx,val_idx) -> tikz color string
+{
+  CUTE_STATIC_ASSERT_V(rank(C) == Int<2>{});
+  CUTE_STATIC_ASSERT_V(rank(A) == Int<2>{});
+  CUTE_STATIC_ASSERT_V(rank(B) == Int<2>{});
+
+  assert(size<0>(A) == size<0>(C));
+  assert(size<0>(B) == size<1>(C));
+  assert(size<1>(A) == size<1>(B));
+
+  // Commented prints
+  printf("%% LayoutC: "); print(C);  printf("\n");
+  printf("%% ThrIDC : "); print(TC); printf("\n");
+  printf("%% LayoutA: "); print(A);  printf("\n");
+  printf("%% ThrIDA : "); print(TA); printf("\n");
+  printf("%% LayoutB: "); print(B);  printf("\n");
+  printf("%% ThrIDB : "); print(TB); printf("\n\n");
+  // Header
+  printf("\\documentclass[convert]{standalone}\n"
+         "\\usepackage{tikz}\n\n"
+         "\\begin{document}\n"
+         "\\begin{tikzpicture}[x={(0cm,-1cm)},y={(1cm,0cm)},every node/.style={minimum size=1cm, outer sep=0pt}]\n\n");
+
+  // C starting at 0,0
+  for (int m = 0; m < size<0>(C); ++m) {
+    for (int n = 0; n < size<1>(C); ++n) {
+      int thrid   = C(m,n) % size(TC);
+      int val_idx = C(m,n) / size(TC);
+      int thr_idx = TC(thrid);
+
+      printf("\\node[fill=%s] at (%d,%d) {\\shortstack{T%d \\\\ V%d}};\n",
+             color(thr_idx, val_idx),
+             m, n,
+             thr_idx, val_idx);
+    }
+  }
+  // Grid
+  printf("\\draw[color=black,thick,shift={(-0.5,-0.5)}] (%d,%d) grid (%d,%d);\n\n",
+         0, 0, int(size<0>(C)), int(size<1>(C)));
+
+  // A starting at 0,-size<1>(A)-1
+  for (int m = 0; m < size<0>(A); ++m) {
+    for (int k = 0; k < size<1>(A); ++k) {
+      int thrid   = A(m,k) % size(TA);
+      int val_idx = A(m,k) / size(TA);
+      int thr_idx = TA(thrid);
+
+      printf("\\node[fill=%s] at (%d,%d) {\\shortstack{T%d \\\\ V%d}};\n",
+             color(thr_idx, val_idx),
+             m, k-1-size<1>(A),
+             thr_idx, val_idx);
+    }
+  }
+  // Grid
+  printf("\\draw[color=black,thick,shift={(-0.5,-0.5)}] (%d,%d) grid (%d,%d);\n\n",
+         0, int(-size<1>(A)-1), int(size<0>(A)), -1);
+  // A labels
+  for (int m =  0, k = -1; m < size<0>(A); ++m) {
+    printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", m, k-1-size<1>(A), m);
+  }
+  for (int m = -1, k =  0; k < size<1>(A); ++k) {
+    printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", m, k-1-size<1>(A), k);
+  }
+
+  // B starting at -size<1>(B)-1,0
+  for (int n = 0; n < size<0>(B); ++n) {
+    for (int k = 0; k < size<1>(B); ++k) {
+      int thrid   = B(n,k) % size(TB);
+      int val_idx = B(n,k) / size(TB);
+      int thr_idx = TB(thrid);
+
+      printf("\\node[fill=%s] at (%d,%d) {\\shortstack{T%d \\\\ V%d}};\n",
+             color(thr_idx, val_idx),
+             k-1-size<1>(B), n,
+             thr_idx, val_idx);
+    }
+  }
+  // Grid
+  printf("\\draw[color=black,thick,shift={(-0.5,-0.5)}] (%d,%d) grid (%d,%d);\n\n",
+         int(-size<1>(B)-1), 0, -1, int(size<0>(B)));
+  // B labels
+  for (int n =  0, k = -1; n < size<0>(B); ++n) {
+    printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", k-1-size<1>(B), n, n);
+  }
+  for (int n = -1, k =  0; k < size<1>(B); ++k) {
+    printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", k-1-size<1>(B), n, k);
+  }
+
+  // Footer
+  printf("\\end{tikzpicture}\n"
+         "\\end{document}\n");
+}
+
+// MNK MMA Layout to console printer
+template <class LayoutC, class ThrIDC,
+          class LayoutA, class ThrIDA,
+          class LayoutB, class ThrIDB>
+CUTE_HOST_DEVICE
+void
+print_layout_mma(LayoutC const& C, ThrIDC const& TC,  // (m,n) -> (tid,vid)  and  tid -> thr_idx
+                 LayoutA const& A, ThrIDA const& TA,  // (m,k) -> (tid,vid)  and  tid -> thr_idx
+                 LayoutB const& B, ThrIDB const& TB)  // (n,k) -> (tid,vid)  and  tid -> thr_idx
+{
+  CUTE_STATIC_ASSERT_V(rank(C) == Int<2>{});
+  CUTE_STATIC_ASSERT_V(rank(A) == Int<2>{});
+  CUTE_STATIC_ASSERT_V(rank(B) == Int<2>{});
+
+  assert(size<0>(A) == size<0>(C));
+  assert(size<0>(B) == size<1>(C));
+  assert(size<1>(A) == size<1>(B));
+
+  int a_width = size<1>(A) * 6 + 4;
+
+  // Print out B (white-shifted) k-by-n
+  for (int k = 0; k < size<1>(B); ++k) {
+    // Header
+    printf("%*s", a_width, "");
+    for (int n = 0; n < size<0>(B); ++n) printf("+-----");
+    printf("+\n");
+    // Values
+    printf("%*s", a_width, "");
+    for (int n = 0; n < size<0>(B); ++n) printf("|T%02dV%1d", int(TB(B(n,k) % size(TB))), int(B(n,k) / size(TB)));
+    printf("|\n");
+  }
+  // Footer
+  printf("%*s", a_width, "");
+  for (int n = 0; n < size<0>(B); ++n) printf("+-----");
+  printf("+\n\n");
+
+  // Print out A m-by-k and C m-by-n
+  for (int m = 0; m < size<0>(A); ++m) {
+    // Header
+    for (int k = 0; k < size<1>(A); ++k) printf("+-----");
+    printf("+   ");
+    for (int n = 0; n < size<1>(C); ++n) printf("+-----");
+    printf("+\n");
+    // Values
+    for (int k = 0; k < size<1>(A); ++k) printf("|T%02dV%1d", int(TA(A(m,k) % size(TA))), int(A(m,k) / size(TA)));
+    printf("|   ");
+    for (int n = 0; n < size<1>(C); ++n) printf("|T%02dV%1d", int(TC(C(m,n) % size(TC))), int(C(m,n) / size(TC)));
+    printf("|\n");
+  }
+  // Footer
+  for (int k = 0; k < size<1>(A); ++k) printf("+-----");
+  printf("+   ");
+  for (int n = 0; n < size<1>(C); ++n) printf("+-----");
+  printf("+\n");
+}
+
+// MNK MMA Layout to SVG -- 8-value color coded by thread
+template <class LayoutC, class ThrIDC,
+          class LayoutA, class ThrIDA,
+          class LayoutB, class ThrIDB>
+CUTE_HOST_DEVICE
+void
+print_svg_mma(LayoutC const& C, ThrIDC const& TC,  // (m,n) -> (tid,vid)  and  tid -> thr_idx
+              LayoutA const& A, ThrIDA const& TA,  // (m,k) -> (tid,vid)  and  tid -> thr_idx
+              LayoutB const& B, ThrIDB const& TB)  // (n,k) -> (tid,vid)  and  tid -> thr_idx
+{
+  char const *color_map[8] = {"175,175,255", "175,255,175", "255,255,175",
+                              "255,175,175", "210,210,255", "210,255,210",
+                              "255,255,210", "255,210,210"};
+
+  const int cell_width = 20;
+  const int cell_height = 20;
+
+  const int page_width = (size<1>(A) + size<0>(B) + 2) * cell_width;
+  const int page_height = (size<1>(B) + size<0>(A) + 2) * cell_height;
+
+  // header
+  printf("<svg width=\"100%%\" height=\"100%%\" viewBox=\"0 0 %d %d\" "
+         "preserveAspectRatio=\"xMidYMid meet\" "
+         "xmlns=\"http://www.w3.org/2000/svg\">\n",
+         page_width, page_height);
+
+  // C
+  int c_base_x = (size<1>(A) + 2) * cell_width;
+  int c_base_y = (size<1>(B) + 2) * cell_height;
+  for (int m = 0; m < cute::size<0>(C); ++m) {
+    for (int n = 0; n < cute::size<1>(C); ++n) {
+
+      int thrid = C(m, n) % size(TC);
+      int val_idx = C(m, n) / size(TC);
+      int thr_idx = TC(thrid);
+
+      int x = n * cell_width + c_base_x;
+      int y = m * cell_height + c_base_y;
+
+      int thr_x = x + cell_width / 2;
+      int thr_y = y + cell_height / 4;
+      int val_x = x + cell_width / 2;
+      int val_y = y + cell_height * 3 / 4;
+
+      printf("<rect x=\"%d\" y=\"%d\" width=\"%d\" height=\"%d\" "
+             "fill=\"rgb(%s)\" stroke=\"black\"/>\n",
+             x, y, cell_width, cell_height, color_map[thr_idx % 8]);
+
+      printf("<text x=\"%d\" y=\"%d\" text-anchor=\"middle\" "
+             "alignment-baseline=\"central\" font-size=\"8\">T%d</text>\n",
+             thr_x, thr_y, thr_idx);
+      printf("<text x=\"%d\" y=\"%d\" text-anchor=\"middle\" "
+             "alignment-baseline=\"central\" font-size=\"8\">V%d</text>\n",
+             val_x, val_y, val_idx);
+    }
+  }
+
+  // A
+  int a_base_x = cell_width;
+  int a_base_y = (size<1>(B) + 2) * cell_height;
+  for (int m = 0; m < size<0>(A); ++m) {
+    for (int k = 0; k < size<1>(A); ++k) {
+      int thrid = A(m, k) % size(TA);
+      int val_idx = A(m, k) / size(TA);
+      int thr_idx = TA(thrid);
+
+      int x = k * cell_width + a_base_x;
+      int y = m * cell_height + a_base_y;
+
+      int thr_x = x + cell_width / 2;
+      int thr_y = y + cell_height / 4;
+      int val_x = x + cell_width / 2;
+      int val_y = y + cell_height * 3 / 4;
+
+      printf("<rect x=\"%d\" y=\"%d\" width=\"%d\" height=\"%d\" "
+             "fill=\"rgb(%s)\" stroke=\"black\" />\n",
+             x, y, cell_width, cell_height, color_map[thr_idx % 8]);
+      printf("<text x=\"%d\" y=\"%d\" text-anchor=\"middle\" "
+             "alignment-baseline=\"central\" font-size=\"8\">T%d</text>\n",
+             thr_x, thr_y, thr_idx);
+      printf("<text x=\"%d\" y=\"%d\" text-anchor=\"middle\" "
+             "alignment-baseline=\"central\" font-size=\"8\">V%d</text>\n",
+             val_x, val_y, val_idx);
+    }
+  }
+
+  // B
+  int b_base_x = (size<1>(A) + 2) * cell_width;
+  int b_base_y = cell_height;
+  for (int n = 0; n < size<0>(B); ++n) {
+    for (int k = 0; k < size<1>(B); ++k) {
+      int thrid = B(n, k) % size(TB);
+      int val_idx = B(n, k) / size(TB);
+      int thr_idx = TB(thrid);
+
+      int x = n * cell_width + b_base_x;
+      int y = k * cell_height + b_base_y;
+
+      int thr_x = x + cell_width / 2;
+      int thr_y = y + cell_height / 4;
+      int val_x = x + cell_width / 2;
+      int val_y = y + cell_height * 3 / 4;
+
+      printf("<rect x=\"%d\" y=\"%d\" width=\"%d\" height=\"%d\" "
+             "fill=\"rgb(%s)\" stroke=\"black\" />\n",
+             x, y, cell_width, cell_height, color_map[thr_idx % 8]);
+      printf("<text x=\"%d\" y=\"%d\" text-anchor=\"middle\" "
+             "alignment-baseline=\"central\" font-size=\"8\">T%d</text>\n",
+             thr_x, thr_y, thr_idx);
+      printf("<text x=\"%d\" y=\"%d\" text-anchor=\"middle\" "
+             "alignment-baseline=\"central\" font-size=\"8\">V%d</text>\n",
+             val_x, val_y, val_idx);
+    }
+  }
+
+  // A labels
+  for (int m = 0; m < size<0>(A); ++m) {
+    int x = cell_width / 2;
+    int y = m * cell_height + cell_height / 2 + a_base_y;
+    printf("<text x=\"%d\" y=\"%d\" text-anchor=\"middle\" "
+           "alignment-baseline=\"central\" font-size=\"12\">%d</text>\n",
+           x, y, m);
+  }
+  for (int k = 0; k < size<1>(A); ++k) {
+    int x = cell_width + k * cell_width + cell_width / 2;
+    int y = -cell_height / 2 + a_base_y;
+    printf("<text x=\"%d\" y=\"%d\" text-anchor=\"middle\" "
+           "alignment-baseline=\"central\" font-size=\"12\">%d</text>\n",
+           x, y, k);
+  }
+
+  // B labels
+  for (int n = 0; n < size<0>(B); ++n) {
+    int x = b_base_x + cell_width * n + cell_width / 2;
+    int y = cell_height / 2;
+    printf("<text x=\"%d\" y=\"%d\" text-anchor=\"middle\" "
+           "alignment-baseline=\"central\" font-size=\"12\">%d</text>\n",
+           x, y, n);
+  }
+  for (int k = 0; k < size<1>(B); ++k) {
+    int x = b_base_x - cell_width / 2;
+    int y = cell_height * (k + 1) + cell_height / 2;
+    printf("<text x=\"%d\" y=\"%d\" text-anchor=\"middle\" "
+           "alignment-baseline=\"central\" font-size=\"12\">%d</text>\n",
+           x, y, k);
+  }
+
+  // footer
+  printf("</svg>");
+}
+
+template <class... Args>
+CUTE_HOST_DEVICE
+void
+print_svg(MMA_Atom<Args...> const &mma_atom) {
+  print_svg(make_tiled_mma(mma_atom));
+}
+
+template <class... Args>
+CUTE_HOST_DEVICE
+void
+print_svg(TiledMMA<Args...> const &mma) {
+  auto layout_and_thrid_C = mma.get_layoutC_MN();
+  auto layoutC_MN = get<0>(layout_and_thrid_C);
+  auto thrID_C = get<1>(layout_and_thrid_C);
+
+  auto layout_and_thrid_A = mma.get_layoutA_MK();
+  auto layoutA_MK = get<0>(layout_and_thrid_A);
+  auto thrID_A = get<1>(layout_and_thrid_A);
+
+  auto layout_and_thrid_B = mma.get_layoutB_NK();
+  auto layoutB_NK = get<0>(layout_and_thrid_B);
+  auto thrID_B = get<1>(layout_and_thrid_B);
+
+  print_svg_mma(layoutC_MN, thrID_C, layoutA_MK, thrID_A, layoutB_NK, thrID_B);
+}
+
+} // namespace cute
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <cute/atom/mma_traits_sm61.hpp>
+#include <cute/atom/mma_traits_sm70.hpp>
+#include <cute/atom/mma_traits_sm75.hpp>
+#include <cute/atom/mma_traits_sm80.hpp>
+#include <cute/atom/mma_traits_sm90.hpp>
+#include <cute/atom/mma_traits_sm90_gmma.hpp>
+////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cute/atom/mma_traits.hpp b/lightllm-kernel/cutlass/include/cute/atom/mma_traits.hpp
new file mode 100755
index 000000000..0994698a8
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/atom/mma_traits.hpp
@@ -0,0 +1,189 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/tensor_impl.hpp>  // cute::Tensor
+#include <cute/pointer.hpp>      // cute::is_rmem
+#include <cute/arch/mma.hpp>     // cute::UniversalFMA
+#include <cute/arch/util.hpp>    // cute::detail::explode
+
+namespace cute
+{
+
+/**
+ * concept MMA_Traits
+ * {
+ *   using ValTypeD =  // Logical D-value type
+ *   using ValTypeA =  // Logical A-value type
+ *   using ValTypeB =  // Logical B-value type
+ *   using ValTypeC =  // Logical C-value type    (NOTE: Not used? Assumed == ValTypeD)
+ *
+ *   using FrgTypeA =  // A-type consumed by MMA  (if omitted, same as ValTypeA)
+ *   using FrgTypeB =  // B-type consumed by MMA  (if omitted, same as ValTypeB)
+ *   using FrgTypeC =  // C-type consumed by MMA  (if omitted, same as ValTypeC)
+ *
+ *   using Shape_MNK =    // Logical MxNxK shape of the MMA
+ *
+ *   using ThrID     =    // Logical thread id (tid) -> tidx
+ *
+ *   using ALayout =      // (Logical thread id (tid), Logical value id (vid)) -> Flat MK-coord
+ *   using BLayout =      // (Logical thread id (tid), Logical value id (vid)) -> Flat NK-coord
+ *   using CLayout =      // (Logical thread id (tid), Logical value id (vid)) -> Flat MN-coord
+ * };
+ */
+
+template <class MMAOperation, class... MMAOpArgs>
+struct MMA_Traits
+{
+  static_assert(sizeof(MMAOperation) == 0, "MMA_Traits not implemented for this MMA_Operation.");
+};
+
+template <class D, class A, class B, class C>
+struct MMA_Traits<UniversalFMA<D,A,B,C>>
+{
+  using ValTypeD = D;
+  using ValTypeA = A;
+  using ValTypeB = B;
+  using ValTypeC = C;
+
+  // Logical shape of the MMA
+  using Shape_MNK = Shape<_1,_1,_1>;
+
+  // Logical thread id (tid) -> tidx
+  using ThrID   = Layout<_1>;
+
+  // (Logical thread id (tid), Logical value id (vid)) -> coord
+
+  // (tid,vid) -> (m,k)
+  using ALayout = Layout<Shape<_1,_1>>;
+  // (tid,vid) -> (n,k)
+  using BLayout = Layout<Shape<_1,_1>>;
+  // (tid,vid) -> (m,n)
+  using CLayout = Layout<Shape<_1,_1>>;
+};
+
+// Extract an MMA_Op from an MMA_Traits
+template <class MMA_Traits>
+struct MMA_Op {};
+
+template <class MMA_Op_Arg, class... Args>
+struct MMA_Op<MMA_Traits<MMA_Op_Arg, Args...>> {
+  using type = MMA_Op_Arg;
+};
+
+//
+// Generic mma_unpack for any MMA_Traits
+//
+
+template <class AnyMMATraits,
+          class TD, class DLayout,
+          class TA, class ALayout,
+          class TB, class BLayout,
+          class TC, class CLayout>
+CUTE_HOST_DEVICE constexpr
+void
+mma_unpack(AnyMMATraits        const& traits,
+           Tensor<TD, DLayout>      & D,
+           Tensor<TA, ALayout> const& A,
+           Tensor<TB, BLayout> const& B,
+           Tensor<TC, CLayout> const& C)
+{
+  static_assert(is_rmem<TD>::value, "Expected registers in MMA_Atom::call");
+  static_assert(is_rmem<TA>::value, "Expected registers in MMA_Atom::call");
+  static_assert(is_rmem<TB>::value, "Expected registers in MMA_Atom::call");
+  static_assert(is_rmem<TC>::value, "Expected registers in MMA_Atom::call");
+
+  // Register value types from the MMA_Operation register arrays
+  using MMA_Op   = typename MMA_Op<AnyMMATraits>::type;
+  using RegTypeD = typename remove_extent<typename MMA_Op::DRegisters>::type;
+  using RegTypeA = typename remove_extent<typename MMA_Op::ARegisters>::type;
+  using RegTypeB = typename remove_extent<typename MMA_Op::BRegisters>::type;
+  using RegTypeC = typename remove_extent<typename MMA_Op::CRegisters>::type;
+
+  Tensor rA = recast<RegTypeA>(A);
+  Tensor rB = recast<RegTypeB>(B);
+  Tensor rD = recast<RegTypeD>(D);
+  Tensor rC = recast<RegTypeC>(C);
+
+  constexpr int RegNumD = extent<typename MMA_Op::DRegisters>::value;
+  constexpr int RegNumA = extent<typename MMA_Op::ARegisters>::value;
+  constexpr int RegNumB = extent<typename MMA_Op::BRegisters>::value;
+  constexpr int RegNumC = extent<typename MMA_Op::CRegisters>::value;
+
+  CUTE_STATIC_ASSERT_V(size(rA) == Int<RegNumA>{});
+  CUTE_STATIC_ASSERT_V(size(rB) == Int<RegNumB>{});
+  CUTE_STATIC_ASSERT_V(size(rD) == Int<RegNumD>{});
+  CUTE_STATIC_ASSERT_V(size(rC) == Int<RegNumC>{});
+
+  detail::explode(MMA_Op::fma,
+                  rD, make_int_sequence<RegNumD>{},
+                  rA, make_int_sequence<RegNumA>{},
+                  rB, make_int_sequence<RegNumB>{},
+                  rC, make_int_sequence<RegNumC>{});
+}
+
+// Accept mutable temporaries
+template <class AnyMMATraits,
+          class TD, class DLayout,
+          class TA, class ALayout,
+          class TB, class BLayout,
+          class TC, class CLayout>
+CUTE_HOST_DEVICE constexpr
+void
+mma_unpack(AnyMMATraits        const& traits,
+           Tensor<TD, DLayout>     && D,
+           Tensor<TA, ALayout> const& A,
+           Tensor<TB, BLayout> const& B,
+           Tensor<TC, CLayout> const& C)
+{
+  mma_unpack(traits, D, A, B, C);
+}
+
+namespace detail {
+
+template <class X, class = void>
+struct FrgTypeA_or_Default { using type = typename X::ValTypeA; };
+template <class X>
+struct FrgTypeA_or_Default<X,void_t<typename X::FrgTypeA>> { using type = typename X::FrgTypeA; };
+
+template <class X, class = void>
+struct FrgTypeB_or_Default { using type = typename X::ValTypeB; };
+template <class X>
+struct FrgTypeB_or_Default<X,void_t<typename X::FrgTypeB>> { using type = typename X::FrgTypeB; };
+
+template <class X, class = void>
+struct FrgTypeC_or_Default { using type = typename X::ValTypeC; };
+template <class X>
+struct FrgTypeC_or_Default<X,void_t<typename X::FrgTypeC>> { using type = typename X::FrgTypeC; };
+
+} // end namespace detail
+
+} // namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm61.hpp b/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm61.hpp
new file mode 100755
index 000000000..f72a63940
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm61.hpp
@@ -0,0 +1,73 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/arch/mma_sm61.hpp>
+
+#include <cute/atom/mma_traits.hpp>
+#include <cute/layout.hpp>
+
+namespace cute
+{
+// SM61_DP4A traits: one thread (ThrID = Layout<_1>) owns the whole 1x1x4 tile -- 4 int8 values each of A and B, one int32 of C/D.
+template <>
+struct MMA_Traits<SM61_DP4A>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using Shape_MNK = Shape<_1,_1,_4>;
+  using ThrID   = Layout<_1>;
+  using ALayout = Layout<Shape<_1,_4>>;
+  using BLayout = Layout<Shape<_1,_4>>;
+  using CLayout = Layout<Shape<_1,_1>>;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SM61_DP2A traits: same single-thread structure with a 1x1x2 tile -- 2 int16 values each of A and B, one int32 of C/D.
+template <>
+struct MMA_Traits<SM61_DP2A>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int16_t;
+  using ValTypeB = int16_t;
+  using ValTypeC = int32_t;
+
+  using Shape_MNK = Shape<_1,_1,_2>;
+  using ThrID   = Layout<_1>;
+  using ALayout = Layout<Shape<_1,_2>>;
+  using BLayout = Layout<Shape<_1,_2>>;
+  using CLayout = Layout<Shape<_1,_1>>;
+};
+
+} // namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm70.hpp b/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm70.hpp
new file mode 100755
index 000000000..f0702a961
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm70.hpp
@@ -0,0 +1,198 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/arch/mma_sm70.hpp>
+
+#include <cute/atom/mma_traits.hpp>
+#include <cute/layout.hpp>
+
+namespace cute
+{
+
+namespace {
+// Shared (thread,value) -> tile-offset layouts referenced by the SM70 MMA_Traits specializations in this header.
+// Logical thread id to thread idx (quadpair): logical (t0,t1) -> t0 + 16*t1
+using SM70_QuadPair = Layout<Shape <_4, _2>,
+                             Stride<_1,_16>>;
+// (T8,V4) -> (M8,K4): thread t, value v -> offset t + 8*v
+using SM70_8x4_Row  = Layout<Shape <_8,_4>,
+                             Stride<_1,_8>>;
+// (T8,V4) -> (M8,K4): thread (t0,t1), value v -> offset 8*t0 + 4*t1 + v
+using SM70_8x4_Col  = Layout<Shape <Shape <_4,_2>,_4>,
+                             Stride<Stride<_8,_4>,_1>>;
+// (T8,V8) -> (M8,N8): used as CLayout by the half_t-accumulator traits
+using SM70_8x8_16b  = Layout<Shape <_8,_8>,
+                             Stride<_1,_8>>;
+// (T8,V8) -> (M8,N8): used as CLayout by the float-accumulator traits
+using SM70_8x8_32b  = Layout<Shape <Shape <_2, _2,_2>,Shape <_2,_2, _2>>,
+                             Stride<Stride<_1,_16,_4>,Stride<_8,_2,_32>>>;
+
+} // end anonymous namespace
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <>
+struct MMA_Traits<SM70_8x8x4_F16F16F16F16_TN>  // half_t inputs, half_t accumulators
+{
+  using ValTypeD = half_t;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  // Pattern across the four variants: 1st suffix letter T/N picks Row/Col for A; 2nd letter N/T picks Row/Col for B.
+  using Shape_MNK = Shape<_8,_8,_4>;
+  using ThrID   = SM70_QuadPair;
+  using ALayout = SM70_8x4_Row;
+  using BLayout = SM70_8x4_Row;
+  using CLayout = SM70_8x8_16b;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <>
+struct MMA_Traits<SM70_8x8x4_F16F16F16F16_NT>  // as TN, but A and B both use the Col layout
+{
+  using ValTypeD = half_t;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+
+  using Shape_MNK = Shape<_8,_8,_4>;
+  using ThrID   = SM70_QuadPair;
+  using ALayout = SM70_8x4_Col;
+  using BLayout = SM70_8x4_Col;
+  using CLayout = SM70_8x8_16b;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <>
+struct MMA_Traits<SM70_8x8x4_F16F16F16F16_NN>  // as TN, but A uses the Col layout
+{
+  using ValTypeD = half_t;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+
+  using Shape_MNK = Shape<_8,_8,_4>;
+  using ThrID   = SM70_QuadPair;
+  using ALayout = SM70_8x4_Col;
+  using BLayout = SM70_8x4_Row;
+  using CLayout = SM70_8x8_16b;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <>
+struct MMA_Traits<SM70_8x8x4_F16F16F16F16_TT>  // as TN, but B uses the Col layout
+{
+  using ValTypeD = half_t;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+
+  using Shape_MNK = Shape<_8,_8,_4>;
+  using ThrID   = SM70_QuadPair;
+  using ALayout = SM70_8x4_Row;
+  using BLayout = SM70_8x4_Col;
+  using CLayout = SM70_8x8_16b;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <>
+struct MMA_Traits<SM70_8x8x4_F32F16F16F32_TN>  // half_t inputs, float accumulators
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  // Pattern across the four variants: 1st suffix letter T/N picks Row/Col for A; 2nd letter N/T picks Row/Col for B.
+  using Shape_MNK = Shape<_8,_8,_4>;
+  using ThrID   = SM70_QuadPair;
+  using ALayout = SM70_8x4_Row;
+  using BLayout = SM70_8x4_Row;
+  using CLayout = SM70_8x8_32b;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <>
+struct MMA_Traits<SM70_8x8x4_F32F16F16F32_NT>  // as TN, but A and B both use the Col layout
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using Shape_MNK = Shape<_8,_8,_4>;
+  using ThrID   = SM70_QuadPair;
+  using ALayout = SM70_8x4_Col;
+  using BLayout = SM70_8x4_Col;
+  using CLayout = SM70_8x8_32b;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <>
+struct MMA_Traits<SM70_8x8x4_F32F16F16F32_NN>  // as TN, but A uses the Col layout
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using Shape_MNK = Shape<_8,_8,_4>;
+  using ThrID   = SM70_QuadPair;
+  using ALayout = SM70_8x4_Col;
+  using BLayout = SM70_8x4_Row;
+  using CLayout = SM70_8x8_32b;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <>
+struct MMA_Traits<SM70_8x8x4_F32F16F16F32_TT>  // as TN, but B uses the Col layout
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using Shape_MNK = Shape<_8,_8,_4>;
+  using ThrID   = SM70_QuadPair;
+  using ALayout = SM70_8x4_Row;
+  using BLayout = SM70_8x4_Col;
+  using CLayout = SM70_8x8_32b;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+} // namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm75.hpp b/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm75.hpp
new file mode 100755
index 000000000..1d3f51961
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm75.hpp
@@ -0,0 +1,81 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/arch/mma_sm75.hpp>
+
+#include <cute/atom/mma_traits.hpp>
+#include <cute/layout.hpp>
+
+namespace cute
+{
+// SM75 16x8x8 atom: half_t A/B, float C/D, 32 threads; ALayout and CLayout are written as identical layout expressions.
+template <>
+struct MMA_Traits<SM75_16x8x8_F32F16F16F32_TN>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using Shape_MNK = Shape<_16,_8,_8>;
+  using ThrID   = Layout<_32>;
+  using ALayout = Layout<Shape <Shape < _4,_8>,Shape < _2,_2>>,
+                         Stride<Stride<_32,_1>,Stride<_16,_8>>>;
+  using BLayout = Layout<Shape <Shape < _4,_8>,_2>,
+                         Stride<Stride<_16,_1>,_8>>;
+  using CLayout = Layout<Shape <Shape < _4,_8>,Shape < _2,_2>>,
+                         Stride<Stride<_32,_1>,Stride<_16,_8>>>;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// SM75 8x8x16 atom: int8_t A/B, int32_t C/D, 32 threads; A and B share one layout expression (4 values per thread).
+template <>
+struct MMA_Traits<SM75_8x8x16_S32S8S8S32_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using Shape_MNK = Shape<_8,_8,_16>;
+  using ThrID   = Layout<_32>;
+  using ALayout = Layout<Shape <Shape < _4,_8>,_4>,
+                         Stride<Stride<_32,_1>,_8>>;
+  using BLayout = Layout<Shape <Shape < _4,_8>,_4>,
+                         Stride<Stride<_32,_1>,_8>>;
+  using CLayout = Layout<Shape <Shape < _4,_8>,_2>,
+                         Stride<Stride<_16,_1>,_8>>;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm80.hpp b/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm80.hpp
new file mode 100755
index 000000000..706b10d88
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm80.hpp
@@ -0,0 +1,489 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/arch/mma_sm80.hpp>
+#include <cute/atom/mma_traits.hpp>
+#include <cute/layout.hpp>
+#include <cute/numeric/numeric_types.hpp>
+
+namespace cute
+{
+
+namespace {
+// Shared (thread,value) -> tile-offset layouts referenced by the SM80 MMA_Traits specializations in this header.
+// (T32,V1) -> (M8,N4): thread (t0,t1) -> offset 8*t0 + t1, i.e. 32 distinct offsets
+using SM80_8x4      = Layout<Shape <Shape < _4,_8>,_1>,
+                             Stride<Stride< _8,_1>,_0>>;
+// (T32,V2) -> (M8,N8)
+using SM80_8x8_Row  = Layout<Shape <Shape < _4,_8>,_2>,
+                             Stride<Stride<_16,_1>,_8>>;
+// (T32,V4) -> (M8,N16)
+using SM80_8x16_Row = Layout<Shape <Shape < _4,_8>,_4>,
+                             Stride<Stride<_32,_1>,_8>>;
+// (T32,V4) -> (M16,N8)
+using SM80_16x8_Row = Layout<Shape <Shape < _4,_8>,Shape < _2,_2>>,
+                             Stride<Stride<_32,_1>,Stride<_16,_8>>>;
+
+} // end anonymous namespace
+
+///////////////////////////////////////////////////////////////////////////////
+//////////////////////// fp16 = fp16 * fp16 + fp16 ////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
+
+template <>
+struct MMA_Traits<SM80_16x8x8_F16F16F16F16_TN>  // also the base of the F32F16 and F32BF16 16x8x8 traits in this header
+{
+  using ValTypeD = half_t;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+
+  using Shape_MNK = Shape<_16,_8,_8>;
+  using ThrID   = Layout<_32>;
+  using ALayout = SM80_16x8_Row;
+  using BLayout = SM80_8x8_Row;
+  using CLayout = SM80_16x8_Row;
+};
+
+template <>
+struct MMA_Traits<SM80_16x8x16_F16F16F16F16_TN>  // also the base of the F32F16 and F32BF16 16x8x16 traits in this header
+{
+  using ValTypeD = half_t;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+
+  using Shape_MNK = Shape<_16,_8,_16>;
+  using ThrID   = Layout<_32>;
+  using ALayout = Layout<Shape <Shape < _4,_8>,Shape < _2,_2,  _2>>,
+                         Stride<Stride<_32,_1>,Stride<_16,_8,_128>>>;  // 8 A values per thread
+  using BLayout = Layout<Shape <Shape < _4,_8>,Shape <_2, _2>>,
+                         Stride<Stride<_16,_1>,Stride<_8,_64>>>;  // 4 B values per thread
+  using CLayout = SM80_16x8_Row;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+//////////////////////// fp32 = fp16 * fp16 + fp32 ////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
+
+template <>
+struct MMA_Traits<SM80_16x8x8_F32F16F16F32_TN>
+     : MMA_Traits<SM80_16x8x8_F16F16F16F16_TN>  // inherits Shape_MNK, ThrID and layouts; value types are redeclared below
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+};
+
+template <>
+struct MMA_Traits<SM80_16x8x16_F32F16F16F32_TN>
+     : MMA_Traits<SM80_16x8x16_F16F16F16F16_TN>  // inherits Shape_MNK, ThrID and layouts; value types are redeclared below
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+//////////////////////// fp32 = bf16 * bf16 + fp32 ////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
+
+template <>
+struct MMA_Traits<SM80_16x8x8_F32BF16BF16F32_TN>
+     : MMA_Traits<SM80_16x8x8_F16F16F16F16_TN>  // reuses the fp16 atom's Shape_MNK, ThrID and layouts
+{
+  using ValTypeD = float;
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+};
+
+template <>
+struct MMA_Traits<SM80_16x8x16_F32BF16BF16F32_TN>
+     : MMA_Traits<SM80_16x8x16_F16F16F16F16_TN>  // reuses the fp16 atom's Shape_MNK, ThrID and layouts
+{
+  using ValTypeD = float;
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+//////////////////////// fp32 = tf32 * tf32 + fp32 ////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
+
+template <>
+struct MMA_Traits<SM80_16x8x4_F32TF32TF32F32_TN>  // tfloat32_t inputs, float accumulators
+{
+  using ValTypeD = float;
+  using ValTypeA = cutlass::tfloat32_t;
+  using ValTypeB = cutlass::tfloat32_t;
+  using ValTypeC = float;
+
+  using Shape_MNK = Shape<_16,_8,_4>;
+  using ThrID   = Layout<_32>;
+  using ALayout = Layout<Shape <Shape < _4,_8>,_2>,
+                         Stride<Stride<_16,_1>,_8>>;  // 2 A values per thread
+  using BLayout = SM80_8x4;
+  using CLayout = SM80_16x8_Row;
+};
+
+template <>
+struct MMA_Traits<SM80_16x8x8_F32TF32TF32F32_TN>  // tfloat32_t inputs, float accumulators
+{
+  using ValTypeD = float;
+  using ValTypeA = cutlass::tfloat32_t;
+  using ValTypeB = cutlass::tfloat32_t;
+  using ValTypeC = float;
+
+  using Shape_MNK = Shape<_16,_8,_8>;
+  using ThrID   = Layout<_32>;
+  using ALayout = Layout<Shape <Shape < _4,_8>,Shape <_2, _2>>,
+                         Stride<Stride<_16,_1>,Stride<_8,_64>>>;  // 4 A values per thread
+  using BLayout = Layout<Shape <Shape <_4,_8>, _2>,
+                         Stride<Stride<_8,_1>,_32>>;  // 2 B values per thread
+  using CLayout = SM80_16x8_Row;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+//////////////////////// fp64 = fp64 * fp64 + fp64 ////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
+
+template <>
+struct MMA_Traits<SM80_8x8x4_F64F64F64F64_TN>  // also the base of both complex-double traits in this section
+{
+  using ValTypeD = double;
+  using ValTypeA = double;
+  using ValTypeB = double;
+  using ValTypeC = double;
+
+  using Shape_MNK = Shape<_8,_8,_4>;
+  using ThrID   = Layout<_32>;
+  using ALayout = SM80_8x4;
+  using BLayout = SM80_8x4;
+  using CLayout = SM80_8x8_Row;
+};
+
+// Custom complex fp64 MMA composed of 4 fp64 MMAs -- same layouts
+template <>
+struct MMA_Traits<SM80_8x8x4_C64C64C64C64_TN>
+     : MMA_Traits<SM80_8x8x4_F64F64F64F64_TN>  // layouts inherited; only the value types change
+{
+  using ValTypeD = complex<double>;
+  using ValTypeA = complex<double>;
+  using ValTypeB = complex<double>;
+  using ValTypeC = complex<double>;
+};
+
+// Custom complex fp64 MMA composed of 3 fp64 MMAs -- same layouts
+template <>
+struct MMA_Traits<SM80_8x8x4_GC64C64C64GC64_TN>
+     : MMA_Traits<SM80_8x8x4_F64F64F64F64_TN>  // layouts inherited; only the value types change
+{
+  using ValTypeD = typename SM80_8x8x4_GC64C64C64GC64_TN::GaussComplex;  // accumulator type is declared by the op struct
+  using ValTypeA = complex<double>;
+  using ValTypeB = complex<double>;
+  using ValTypeC = typename SM80_8x8x4_GC64C64C64GC64_TN::GaussComplex;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+/////////////////////////// s32 = s8 * s8 + s32 ///////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
+
+template <>
+struct MMA_Traits<SM80_8x8x16_S32S8S8S32_TN>  // base of every 8x8x16 8-bit integer trait in this header
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using Shape_MNK = Shape<_8,_8,_16>;
+  using ThrID   = Layout<_32>;
+  using ALayout = SM80_8x16_Row;
+  using BLayout = SM80_8x16_Row;
+  using CLayout = SM80_8x8_Row;
+};
+
+template <>
+struct MMA_Traits<SM80_8x8x16_S32S8S8S32_TN_SATURATE>
+     : MMA_Traits<SM80_8x8x16_S32S8S8S32_TN> {};  // identical traits; only the MMA op type differs
+
+template <>
+struct MMA_Traits<SM80_16x8x16_S32S8S8S32_TN>  // base of every 16x8x16 8-bit integer trait in this header
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using Shape_MNK = Shape<_16,_8,_16>;
+  using ThrID   = Layout<_32>;
+  using ALayout = Layout<Shape <Shape < _4,_8>,Shape < _4,_2>>,
+                         Stride<Stride<_64,_1>,Stride<_16,_8>>>;  // 8 A values per thread
+  using BLayout = SM80_8x16_Row;
+  using CLayout = SM80_16x8_Row;
+};
+
+template <>
+struct MMA_Traits<SM80_16x8x16_S32S8S8S32_TN_SATURATE>
+     : MMA_Traits<SM80_16x8x16_S32S8S8S32_TN> {};  // identical traits; only the MMA op type differs
+
+template <>
+struct MMA_Traits<SM80_16x8x32_S32S8S8S32_TN>  // base of every 16x8x32 8-bit integer trait in this header
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using Shape_MNK = Shape<_16,_8,_32>;
+  using ThrID   = Layout<_32>;
+  using ALayout = Layout<Shape <Shape < _4,_8>,Shape < _4,_2,  _2>>,
+                         Stride<Stride<_64,_1>,Stride<_16,_8,_256>>>;  // 16 A values per thread
+  using BLayout = Layout<Shape <Shape < _4,_8>, Shape <_4,  _2>>,
+                         Stride<Stride<_32,_1>, Stride<_8,_128>>>;  // 8 B values per thread
+  using CLayout = SM80_16x8_Row;
+};
+
+template <>
+struct MMA_Traits<SM80_16x8x32_S32S8S8S32_TN_SATURATE>
+     : MMA_Traits<SM80_16x8x32_S32S8S8S32_TN> {};  // identical traits; only the MMA op type differs
+
+///////////////////////////////////////////////////////////////////////////////
+/////////////////////////// s32 = s8 * u8 + s32 ///////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
+
+template <>
+struct MMA_Traits<SM80_8x8x16_S32S8U8S32_TN>
+     : MMA_Traits<SM80_8x8x16_S32S8S8S32_TN>  // inherits Shape_MNK, ThrID and layouts; B becomes uint8_t
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+};
+
+template <>
+struct MMA_Traits<SM80_8x8x16_S32S8U8S32_TN_SATURATE>
+     : MMA_Traits<SM80_8x8x16_S32S8U8S32_TN> {};  // identical traits; only the MMA op type differs
+
+template <>
+struct MMA_Traits<SM80_16x8x16_S32S8U8S32_TN>
+     : MMA_Traits<SM80_16x8x16_S32S8S8S32_TN>  // inherits Shape_MNK, ThrID and layouts; B becomes uint8_t
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+};
+
+template <>
+struct MMA_Traits<SM80_16x8x16_S32S8U8S32_TN_SATURATE>
+     : MMA_Traits<SM80_16x8x16_S32S8U8S32_TN> {};  // identical traits; only the MMA op type differs
+
+template <>
+struct MMA_Traits<SM80_16x8x32_S32S8U8S32_TN>
+     : MMA_Traits<SM80_16x8x32_S32S8S8S32_TN>  // inherits Shape_MNK, ThrID and layouts; B becomes uint8_t
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+};
+
+template <>
+struct MMA_Traits<SM80_16x8x32_S32S8U8S32_TN_SATURATE>
+     : MMA_Traits<SM80_16x8x32_S32S8U8S32_TN> {};  // identical traits; only the MMA op type differs
+
+///////////////////////////////////////////////////////////////////////////////
+/////////////////////////// s32 = u8 * s8 + s32 ///////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
+
+template <>
+struct MMA_Traits<SM80_8x8x16_S32U8S8S32_TN>
+     : MMA_Traits<SM80_8x8x16_S32S8S8S32_TN>  // inherits Shape_MNK, ThrID and layouts; A becomes uint8_t
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+};
+
+template <>
+struct MMA_Traits<SM80_8x8x16_S32U8S8S32_TN_SATURATE>
+     : MMA_Traits<SM80_8x8x16_S32U8S8S32_TN> {};  // identical traits; only the MMA op type differs
+
+template <>
+struct MMA_Traits<SM80_16x8x16_S32U8S8S32_TN>
+     : MMA_Traits<SM80_16x8x16_S32S8S8S32_TN>  // inherits Shape_MNK, ThrID and layouts; A becomes uint8_t
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+};
+
+template <>
+struct MMA_Traits<SM80_16x8x16_S32U8S8S32_TN_SATURATE>
+     : MMA_Traits<SM80_16x8x16_S32U8S8S32_TN> {};  // identical traits; only the MMA op type differs
+
+template <>
+struct MMA_Traits<SM80_16x8x32_S32U8S8S32_TN>
+     : MMA_Traits<SM80_16x8x32_S32S8S8S32_TN>  // inherits Shape_MNK, ThrID and layouts; A becomes uint8_t
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+};
+
+template <>
+struct MMA_Traits<SM80_16x8x32_S32U8S8S32_TN_SATURATE>
+     : MMA_Traits<SM80_16x8x32_S32U8S8S32_TN> {};  // identical traits; only the MMA op type differs
+
+///////////////////////////////////////////////////////////////////////////////
+/////////////////////////// s32 = u8 * u8 + s32 ///////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
+
+template <>
+struct MMA_Traits<SM80_8x8x16_S32U8U8S32_TN>
+     : MMA_Traits<SM80_8x8x16_S32S8S8S32_TN>  // inherits Shape_MNK, ThrID and layouts; A and B become uint8_t
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+};
+
+template <>
+struct MMA_Traits<SM80_8x8x16_S32U8U8S32_TN_SATURATE>
+     : MMA_Traits<SM80_8x8x16_S32U8U8S32_TN> {};  // identical traits; only the MMA op type differs
+
+template <>
+struct MMA_Traits<SM80_16x8x16_S32U8U8S32_TN>
+     : MMA_Traits<SM80_16x8x16_S32S8S8S32_TN>  // inherits Shape_MNK, ThrID and layouts; A and B become uint8_t
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+};
+
+template <>
+struct MMA_Traits<SM80_16x8x16_S32U8U8S32_TN_SATURATE>
+     : MMA_Traits<SM80_16x8x16_S32U8U8S32_TN> {};  // identical traits; only the MMA op type differs
+
+template <>
+struct MMA_Traits<SM80_16x8x32_S32U8U8S32_TN>
+     : MMA_Traits<SM80_16x8x32_S32S8S8S32_TN>  // inherits Shape_MNK, ThrID and layouts; A and B become uint8_t
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+};
+
+template <>
+struct MMA_Traits<SM80_16x8x32_S32U8U8S32_TN_SATURATE>
+     : MMA_Traits<SM80_16x8x32_S32U8U8S32_TN> {};  // identical traits; only the MMA op type differs
+
+///////////////////////////////////////////////////////////////////////////////
+/////////////////////////// s32 = b1 ^ b1 + s32 ///////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
+
+template <>
+struct MMA_Traits<SM80_16x8x256_S32U1U1S32_TN_XORPOPC>  // 1-bit inputs, int32_t accumulators
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = cute::uint1b_t;
+  using ValTypeB = cute::uint1b_t;
+  using ValTypeC = int32_t;
+
+  using Shape_MNK = Shape<_16,_8,_256>;
+  using ThrID   = Layout<_32>;
+  using ALayout = Layout<Shape<Shape<_4,_8>,Shape<_32,_2,_2>>,
+                       Stride<Stride<_512,_1>,Stride<_16,_8,_2048>>>;  // 128 one-bit A values per thread
+  using BLayout = Layout<Shape<Shape <_4,_8>,Shape<_32,_2>>,
+                         Stride<Stride<_256,_1>,Stride< _8,_1024>>>;  // 64 one-bit B values per thread
+  using CLayout = SM80_16x8_Row;
+};
+
+template <>
+struct MMA_Traits<SM80_16x8x256_S32U1U1S32_TN_ANDPOPC>
+      :MMA_Traits<SM80_16x8x256_S32U1U1S32_TN_XORPOPC> {};  // identical traits; only the MMA op type differs
+
+template<>
+struct MMA_Traits<SM80_8x8x128_S32U1U1S32_TN_XORPOPC>  // 1-bit inputs, int32_t accumulators
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = cute::uint1b_t;
+  using ValTypeB = cute::uint1b_t;
+  using ValTypeC = int32_t;
+
+  using Shape_MNK = Shape<_8,_8,_128>;
+  using ThrID   = Layout<_32>;
+  using ALayout = Layout<Shape<Shape<_4,_8>,_32>,
+                       Stride<Stride<_256,_1>,_8>>;  // 32 one-bit A values per thread
+  using BLayout = Layout<Shape<Shape<_4,_8>,_32>,
+                         Stride<Stride<_256,_1>,_8>>;  // 32 one-bit B values per thread
+  using CLayout = SM80_8x8_Row;
+};
+
+template <>
+struct MMA_Traits<SM80_8x8x128_S32U1U1S32_TN_ANDPOPC>
+      :MMA_Traits<SM80_8x8x128_S32U1U1S32_TN_XORPOPC> {};  // identical traits; only the MMA op type differs
+
+template<>
+struct MMA_Traits<SM80_16x8x128_S32U1U1S32_TN_XORPOPC>  // 1-bit inputs, int32_t accumulators
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = cute::uint1b_t;
+  using ValTypeB = cute::uint1b_t;
+  using ValTypeC = int32_t;
+  // Per thread: 32x2 one-bit A values and 32 one-bit B values; each stride tuple mirrors the rank of its shape.
+  using Shape_MNK = Shape<_16,_8,_128>;
+  using ThrID   = Layout<_32>;
+  using ALayout = Layout<Shape <Shape <  _4,_8>,Shape <_32,_2>>,
+                         Stride<Stride<_512,_1>,Stride<_16,_8>>>;  // flat (_16,_8) value stride, congruent with Shape<_32,_2>
+  using BLayout = Layout<Shape <Shape<_4,_8>,_32>,
+                         Stride<Stride<_256,_1>,_8>>;
+  using CLayout = SM80_16x8_Row;
+};
+
+template <>
+struct MMA_Traits<SM80_16x8x128_S32U1U1S32_TN_ANDPOPC>
+      :MMA_Traits<SM80_16x8x128_S32U1U1S32_TN_XORPOPC> {};  // identical traits; only the MMA op type differs
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90.hpp b/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90.hpp
new file mode 100755
index 000000000..b2ced3f87
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90.hpp
@@ -0,0 +1,144 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/arch/mma_sm90.hpp>
+#include <cute/atom/mma_traits.hpp>
+
+#include <cute/layout.hpp>
+
+namespace cute {
+
+///////////////////////////////////////////////////////////////////////////////
+//////////////////////// fp64 = fp64 * fp64 + fp64 ////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
+
+// Short alias for SM90::MMA_16x8x4_F64F64F64F64_TN.
+using SM90_16x8x4_F64F64F64F64_TN = SM90::MMA_16x8x4_F64F64F64F64_TN;
+
+// Traits of the 16x8x4 atom; A, B, C and D all hold double values.
+template <>
+struct MMA_Traits<SM90_16x8x4_F64F64F64F64_TN>
+{
+  using ValTypeA = double;
+  using ValTypeB = double;
+  using ValTypeC = double;
+  using ValTypeD = double;
+
+  using Shape_MNK = Shape<_16,_8,_4>;
+  using ThrID     = Layout<_32>;
+  // Operand layouts, each spelled as Layout<Shape, Stride>.
+  using ALayout = Layout<Shape<Shape<_4,_8>,_2>, Stride<Stride<_16,_1>,_8>>;
+  using BLayout = Layout<Shape<Shape<_4,_8>,_1>, Stride<Stride<_8,_1>,_0>>;
+  using CLayout = Layout<Shape<Shape<_4,_8>,Shape<_2,_2>>, Stride<Stride<_32,_1>,Stride<_16,_8>>>;
+};
+
+// Short alias for SM90::MMA_16x8x8_F64F64F64F64_TN.
+using SM90_16x8x8_F64F64F64F64_TN = SM90::MMA_16x8x8_F64F64F64F64_TN;
+
+// Traits of the 16x8x8 atom; A, B, C and D all hold double values.
+template <>
+struct MMA_Traits<SM90_16x8x8_F64F64F64F64_TN>
+{
+  using ValTypeA = double;
+  using ValTypeB = double;
+  using ValTypeC = double;
+  using ValTypeD = double;
+
+  using Shape_MNK = Shape<_16,_8,_8>;
+  using ThrID     = Layout<_32>;
+  // Operand layouts, each spelled as Layout<Shape, Stride>.
+  using ALayout = Layout<Shape<Shape<_4,_8>,Shape<_2,_2>>, Stride<Stride<_16,_1>,Stride<_8,_64>>>;
+  using BLayout = Layout<Shape<Shape<_4,_8>,_2>, Stride<Stride<_8,_1>,_32>>;
+  using CLayout = Layout<Shape<Shape<_4,_8>,Shape<_2,_2>>, Stride<Stride<_32,_1>,Stride<_16,_8>>>;
+};
+
+// Short alias for SM90::MMA_16x8x16_F64F64F64F64_TN.
+using SM90_16x8x16_F64F64F64F64_TN = SM90::MMA_16x8x16_F64F64F64F64_TN;
+
+// Traits of the 16x8x16 atom; A, B, C and D all hold double values.
+template <>
+struct MMA_Traits<SM90_16x8x16_F64F64F64F64_TN>
+{
+  using ValTypeA = double;
+  using ValTypeB = double;
+  using ValTypeC = double;
+  using ValTypeD = double;
+
+  using Shape_MNK = Shape<_16,_8,_16>;
+  using ThrID     = Layout<_32>;
+  // Operand layouts, each spelled as Layout<Shape, Stride>.
+  using ALayout = Layout<Shape<Shape<_4,_8>,Shape<_2,_4>>, Stride<Stride<_16,_1>,Stride<_8,_64>>>;
+  using BLayout = Layout<Shape<Shape<_4,_8>,_4>, Stride<Stride<_8,_1>,_32>>;
+  using CLayout = Layout<Shape<Shape<_4,_8>,Shape<_2,_2>>, Stride<Stride<_32,_1>,Stride<_16,_8>>>;
+};
+
+///////////////////////////////////////////////////////////////////////////////////
+//////////////////////// cfp64 = cfp64 * cfp64 + cfp64 ////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////
+
+using SM90_16x8x4_C64C64C64C64_TN = SM90::MMA_16x8x4_C64C64C64C64_TN;
+
+// Complex-double traits: shape and layouts come from the F64 traits, only the value types change.
+template <>
+struct MMA_Traits<SM90_16x8x4_C64C64C64C64_TN> : MMA_Traits<SM90_16x8x4_F64F64F64F64_TN>
+{
+  using ValTypeA = complex<double>;
+  using ValTypeB = complex<double>;
+  using ValTypeC = complex<double>;
+  using ValTypeD = complex<double>;
+};
+
+using SM90_16x8x8_C64C64C64C64_TN = SM90::MMA_16x8x8_C64C64C64C64_TN;
+
+// Complex-double traits: shape and layouts come from the F64 traits, only the value types change.
+template <>
+struct MMA_Traits<SM90_16x8x8_C64C64C64C64_TN> : MMA_Traits<SM90_16x8x8_F64F64F64F64_TN>
+{
+  using ValTypeA = complex<double>;
+  using ValTypeB = complex<double>;
+  using ValTypeC = complex<double>;
+  using ValTypeD = complex<double>;
+};
+
+using SM90_16x8x16_C64C64C64C64_TN = SM90::MMA_16x8x16_C64C64C64C64_TN;
+
+// Complex-double traits: shape and layouts come from the F64 traits, only the value types change.
+template <>
+struct MMA_Traits<SM90_16x8x16_C64C64C64C64_TN> : MMA_Traits<SM90_16x8x16_F64F64F64F64_TN>
+{
+  using ValTypeA = complex<double>;
+  using ValTypeB = complex<double>;
+  using ValTypeC = complex<double>;
+  using ValTypeD = complex<double>;
+};
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90_gmma.hpp b/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90_gmma.hpp
new file mode 100755
index 000000000..b02f5b3af
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90_gmma.hpp
@@ -0,0 +1,8999 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/pointer_flagged.hpp>            // cute::smem_ptr_flag
+#include <cute/pointer_sparse.hpp>             // cute::smem_sparse_ptr_flag
+#include <cute/swizzle.hpp>                    // cute::Swizzle
+#include <cute/tensor_impl.hpp>                // cute::Tensor
+#include <cute/arch/mma_sm90_desc.hpp>         // cute::LayoutType
+#include <cute/arch/mma_sm90_gmma.hpp>         // cute::SM90_64x8x16_F16F16F16_SS, etc
+#include <cute/atom/mma_traits.hpp>            // cute::MMA_Traits
+#include <cute/layout_composed.hpp>            // cute::ComposedLayout
+#include <cute/numeric/integral_constant.hpp>  // cute::is_static
+
+namespace cute {
+
+// Fence between the async destination accumulators of GMMA and the code that later depends
+// on them, applied to every register of the fragment. The fragment layout must be static.
+// float fragments are fenced as float registers; any other value type must be held in rmem
+// and is fenced as uint32_t registers.
+template <class Engine, class Layout>
+CUTE_HOST_DEVICE
+void
+warpgroup_fence_operand(Tensor<Engine, Layout>& frg) {
+  CUTE_STATIC_ASSERT(is_static<Layout>::value);
+
+  constexpr bool kIsF32 = is_same_v<typename Engine::value_type, float>;
+  // Only the non-float path insists on a register-backed engine.
+  CUTE_STATIC_ASSERT(kIsF32 || is_rmem<Engine>::value);
+
+  // Register type each element is viewed as before it is fenced.
+  using RegType = typename conditional<kIsF32, float, uint32_t>::type;
+  auto regs = recast<RegType>(frg);
+  CUTE_UNROLL
+  for (int r = 0; r < size(regs); ++r) {
+    warpgroup_fence_operand(regs(r));
+  }
+}
+
+namespace SM90::GMMA {
+
+///////////////////////////////////////////
+// Common layouts for GMMA Shared Memory //
+///////////////////////////////////////////
+
+// M|N-major GMMA layouts in units of bits: Swizzle<B,4,3> over shape (128<<B, 8), unit stride along mode 0
+using Layout_MN_INTER_Atom_Bits = ComposedLayout<Swizzle<0,4,3>, smem_ptr_flag, Layout<Shape< _128,_8>,Stride<_1, _128>>>;
+using Layout_MN_SW32_Atom_Bits  = ComposedLayout<Swizzle<1,4,3>, smem_ptr_flag, Layout<Shape< _256,_8>,Stride<_1, _256>>>;
+using Layout_MN_SW64_Atom_Bits  = ComposedLayout<Swizzle<2,4,3>, smem_ptr_flag, Layout<Shape< _512,_8>,Stride<_1, _512>>>;
+using Layout_MN_SW128_Atom_Bits = ComposedLayout<Swizzle<3,4,3>, smem_ptr_flag, Layout<Shape<_1024,_8>,Stride<_1,_1024>>>;
+
+// K-major GMMA layouts in units of bits: Swizzle<B,4,3> over shape (8, 128<<B), unit stride along mode 1
+using Layout_K_INTER_Atom_Bits  = ComposedLayout<Swizzle<0,4,3>, smem_ptr_flag, Layout<Shape<_8, _128>,Stride< _128,_1>>>;
+using Layout_K_SW32_Atom_Bits   = ComposedLayout<Swizzle<1,4,3>, smem_ptr_flag, Layout<Shape<_8, _256>,Stride< _256,_1>>>;
+using Layout_K_SW64_Atom_Bits   = ComposedLayout<Swizzle<2,4,3>, smem_ptr_flag, Layout<Shape<_8, _512>,Stride< _512,_1>>>;
+using Layout_K_SW128_Atom_Bits  = ComposedLayout<Swizzle<3,4,3>, smem_ptr_flag, Layout<Shape<_8,_1024>,Stride<_1024,_1>>>;
+
+// M|N-major layouts in units of Type: each *_Atom_Bits layout upcast by sizeof_bits<Type>::value
+template <class Type>
+using Layout_MN_INTER_Atom = decltype(upcast<sizeof_bits<Type>::value>(Layout_MN_INTER_Atom_Bits{}));
+template <class Type>
+using Layout_MN_SW32_Atom  = decltype(upcast<sizeof_bits<Type>::value>(Layout_MN_SW32_Atom_Bits{}));
+template <class Type>
+using Layout_MN_SW64_Atom  = decltype(upcast<sizeof_bits<Type>::value>(Layout_MN_SW64_Atom_Bits{}));
+template <class Type>
+using Layout_MN_SW128_Atom = decltype(upcast<sizeof_bits<Type>::value>(Layout_MN_SW128_Atom_Bits{}));
+
+// K-major layouts in units of Type: each *_Atom_Bits layout upcast by sizeof_bits<Type>::value
+template <class Type>
+using Layout_K_INTER_Atom = decltype(upcast<sizeof_bits<Type>::value>(Layout_K_INTER_Atom_Bits{}));
+template <class Type>
+using Layout_K_SW32_Atom  = decltype(upcast<sizeof_bits<Type>::value>(Layout_K_SW32_Atom_Bits{}));
+template <class Type>
+using Layout_K_SW64_Atom  = decltype(upcast<sizeof_bits<Type>::value>(Layout_K_SW64_Atom_Bits{}));
+template <class Type>
+using Layout_K_SW128_Atom = decltype(upcast<sizeof_bits<Type>::value>(Layout_K_SW128_Atom_Bits{}));
+
+// With GMMA::Major param: Major::MN selects the MN-major atom, any other value the K-major atom
+template <class Type, Major tnsp>
+using Layout_INTER_Atom = typename conditional<tnsp == Major::MN,
+                                               Layout_MN_INTER_Atom<Type>,
+                                               Layout_K_INTER_Atom<Type>>::type;
+template <class Type, Major tnsp>
+using Layout_SW32_Atom = typename conditional<tnsp == Major::MN,
+                                              Layout_MN_SW32_Atom<Type>,
+                                              Layout_K_SW32_Atom<Type>>::type;
+template <class Type, Major tnsp>
+using Layout_SW64_Atom = typename conditional<tnsp == Major::MN,
+                                              Layout_MN_SW64_Atom<Type>,
+                                              Layout_K_SW64_Atom<Type>>::type;
+template <class Type, Major tnsp>
+using Layout_SW128_Atom = typename conditional<tnsp == Major::MN,
+                                               Layout_MN_SW128_Atom<Type>,
+                                               Layout_K_SW128_Atom<Type>>::type;
+
+//
+// Tensor (position-dependent swizzle) to LayoutType utility
+//
+
+// Maps the swizzle carried by a uint128_t tensor's engine to the matching GMMA LayoutType.
+// Only Swizzle<B,4,3> with B in [0,3] is accepted; anything else trips a static_assert.
+template <class Engine, class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+LayoutType
+layout_type(Tensor<Engine, Layout<Shape,Stride>> const&)
+{
+  static_assert(is_same<uint128_t, typename Engine::value_type>::value,
+                "Expected uint128_t type in LayoutType conversion.");
+
+  using EngineSwizzle = get_swizzle_t<Engine>;
+  constexpr int kBits  = EngineSwizzle::num_bits;
+  constexpr int kBase  = EngineSwizzle::num_base;
+  constexpr int kShift = EngineSwizzle::num_shft;
+
+  static_assert(kBase == 4,               "Unsupported layout swizzle");
+  static_assert(0 <= kBits && kBits <= 3, "Unsupported layout swizzle");
+  static_assert(kShift == 3,              "Unsupported layout swizzle");
+
+  // kBits == 0 and the fallback (excluded by the asserts above) both yield INTERLEAVE.
+  return kBits == 1 ? LayoutType::B32
+       : kBits == 2 ? LayoutType::B64
+       : kBits == 3 ? LayoutType::B128
+       :              LayoutType::INTERLEAVE;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Construction method for GMMA Descriptors
+///////////////////////////////////////////////////////////////////////////////
+
+/**
+* ///////////////////////////////
+* // make_gmma_desc<Major::MN> //
+* ///////////////////////////////
+* Each GmmaDescriptor Major-MN describes a canonical layout of the form
+*
+* LayoutType::INTERLEAVE   : Swizzle<0,4,3> o smem_ptr o ((T,1,m),(8,k)):((1,T,SBO),(1T,LBO))
+* LayoutType::B32          : Swizzle<1,4,3> o smem_ptr o ((T,2,m),(8,k)):((1,T,LBO),(2T,SBO))
+* LayoutType::B64          : Swizzle<2,4,3> o smem_ptr o ((T,4,m),(8,k)):((1,T,LBO),(4T,SBO))
+* LayoutType::B128         : Swizzle<3,4,3> o smem_ptr o ((T,8,m),(8,k)):((1,T,LBO),(8T,SBO))
+*
+* where
+*   T  : sizeof(uint128_t) / sizeof(value_type)
+*   m  : integer in [1,16] corresponding to GMMA shape
+*   k  : integer in [1,32] corresponding to GMMA shape
+*   SBO: stride byte offset
+*   LBO: leading byte offset
+*
+* See GMMA::Layout_MN_XXX_Atom<value_type> for building canonical GmmaDescriptor Major-MN layouts.
+* For example,
+*   auto smem_layout = tile_to_shape(Layout_MN_SW128_Atom<value_type>{}, Shape<_128,_64>{});
+* is guaranteed to be accepted by make_gmma_desc<Major::MN> for appropriate value_type.
+*
+* //////////////////////////////
+* // make_gmma_desc<Major::K> //
+* //////////////////////////////
+* Each GmmaDescriptor Major-K describes a canonical layout of the form
+*
+* LayoutType::INTERLEAVE : Swizzle<0,4,3> o smem_ptr o ((8,m),(T,2)):((1T,SBO),(1,LBO))
+* LayoutType::B32        : Swizzle<1,4,3> o smem_ptr o ((8,m),(T,2)):((2T,SBO),(1, T ))
+* LayoutType::B64        : Swizzle<2,4,3> o smem_ptr o ((8,m),(T,2)):((4T,SBO),(1, T ))
+* LayoutType::B128       : Swizzle<3,4,3> o smem_ptr o ((8,m),(T,2)):((8T,SBO),(1, T ))
+*
+* See GMMA::Layout_K_XXX_Atom<value_type> for building canonical GmmaDescriptor Major-K layouts.
+* For example,
+*   auto smem_layout = tile_to_shape(Layout_K_SW128_Atom<value_type>{}, Shape<_128,_64>{});
+* is guaranteed to be accepted by make_gmma_desc<Major::K> for appropriate value_type.
+*/
+template <Major MajorMode, class TEngine, class TLayout>
+CUTE_HOST_DEVICE constexpr
+GmmaDescriptor
+make_gmma_desc(Tensor<TEngine,TLayout> const& tensor)
+{
+  static_assert(is_smem<TEngine>::value, "GMMA Descriptors can only be constructed on smem.");
+  static_assert(TLayout::rank == 2, "GMMA Descriptors can only be constructed on rank-2 tensors.");
+  using value_type = typename TEngine::value_type;
+
+  Tensor u128_tensor = recast<uint128_t const>(tensor);
+
+  GmmaDescriptor desc;  // Result
+
+  // Layout type
+  constexpr LayoutType LAYOUT_TYPE = layout_type(u128_tensor);
+  desc.bitfield.layout_type_ = uint8_t(LAYOUT_TYPE);
+
+  // Start address (4LSB not included)
+  uint32_t start_address = cast_smem_ptr_to_uint(raw_pointer_cast(u128_tensor.data()));
+  desc.bitfield.start_address_ = static_cast<uint16_t>(start_address >> 4);
+
+  constexpr uint8_t base_offset = 0;
+  desc.bitfield.base_offset_ = base_offset;
+
+  // LayoutType meta
+  constexpr int W = LAYOUT_TYPE == LayoutType::INTERLEAVE ? 1 :
+                    LAYOUT_TYPE == LayoutType::B32        ? 2 :
+                    LAYOUT_TYPE == LayoutType::B64        ? 4 :
+                    LAYOUT_TYPE == LayoutType::B128       ? 8 : -1;
+
+  if constexpr (MajorMode == Major::MN)
+  {
+    /* In units of uint128_t, each GmmaDescriptor Major-MN describes a canonical layout of the form
+     *
+     * LayoutType::INTERLEAVE         : Swizzle<0,4,3> o smem_ptr o ((1,n),(8,k)):((X,SBO),(1,LBO))
+     * LayoutType::B32                : Swizzle<1,4,3> o smem_ptr o ((2,n),(8,k)):((1,LBO),(2,SBO))
+     * LayoutType::B64                : Swizzle<2,4,3> o smem_ptr o ((4,n),(8,k)):((1,LBO),(4,SBO))
+     * LayoutType::B128               : Swizzle<3,4,3> o smem_ptr o ((8,n),(8,k)):((1,LBO),(8,SBO))
+     */
+    static_assert(size<1>(u128_tensor) == Int<(256 / cute::sizeof_bits<value_type>::value)>{} || // A and B in dense MMA
+                  size<1>(u128_tensor) == Int<(128 / cute::sizeof_bits<value_type>::value)>{} || // A in sparse MMA
+                  size<1>(u128_tensor) == Int<(512 / cute::sizeof_bits<value_type>::value)>{},   // B in sparse MMA
+                         "Not a canonical GMMA_MN Layout: Expected K-size 256/sizeof_bits<T> for dense or (128|512)/sizeof_bits<T> for sparse.");
+
+    // Construct the canonical GMMA T Layout with shape ((W,n),(8,2))
+    Layout canonical_layout = logical_divide(layout(u128_tensor), make_tile(Layout<Int<W>,_1>{}, Layout<Int<8>,_1>{}));
+
+    // Check ranks of canonical
+    CUTE_STATIC_ASSERT_V(rank<0>(canonical_layout) == Int<2>{}, "Not a canonical GMMA_MN Layout: No flat offset mode");
+    CUTE_STATIC_ASSERT_V(rank<1>(canonical_layout) == Int<2>{}, "Not a canonical GMMA_MN Layout: No flat offset mode");
+    // Check canonical mode strides
+    constexpr uint32_t stride_00 = stride<0,0>(canonical_layout);
+    constexpr uint32_t expected_stride_00 = LAYOUT_TYPE == LayoutType::INTERLEAVE ? stride<0,0>(canonical_layout) : 1;
+    static_assert(stride_00 == expected_stride_00, "Not a canonical GMMA_MN Layout: Expected stride failure.");
+    constexpr uint32_t stride_10 = stride<1,0>(canonical_layout);
+    constexpr uint32_t expected_stride_10 = W;
+    static_assert(stride_10 == expected_stride_10, "Not a canonical GMMA_MN Layout: Expected stride failure.");
+
+    // stride dimension byte offset and leading dimension byte offset (4LSB not included == uint128_t units)
+    constexpr uint32_t stride_01 = stride<0,1>(canonical_layout);
+    constexpr uint32_t stride_11 = stride<1,1>(canonical_layout);
+
+    desc.bitfield.stride_byte_offset_  = (LAYOUT_TYPE == LayoutType::INTERLEAVE) ? stride_01 : stride_11;
+    desc.bitfield.leading_byte_offset_ = (LAYOUT_TYPE == LayoutType::INTERLEAVE) ? stride_11 : stride_01;
+  }
+  else if constexpr (MajorMode == Major::K)
+  {
+    /* In units of uint128_t, each GmmaDescriptor Major-K describes a canonical layout of the form
+     *
+     * LayoutType::INTERLEAVE    : Swizzle<0,4,3> o smem_ptr o ((8,n),2):((1,SBO),LBO)
+     * LayoutType::B32           : Swizzle<1,4,3> o smem_ptr o ((8,n),2):((2,SBO),1)
+     * LayoutType::B64           : Swizzle<2,4,3> o smem_ptr o ((8,n),2):((4,SBO),1)
+     * LayoutType::B128          : Swizzle<3,4,3> o smem_ptr o ((8,n),2):((8,SBO),1)
+     */
+    CUTE_STATIC_ASSERT_V(size<0>(u128_tensor) % Int<8>{} == Int<0>{},          // N|M size
+                         "Not a canonical GMMA_K Layout: Expected MN-size multiple of 8.");
+    CUTE_STATIC_ASSERT_V(size<1>(u128_tensor) == Int<2>{} || size<1>(u128_tensor) == Int<4>{},      // K   size
+                         "Not a canonical GMMA_K Layout: Expected K-size 2 for dense or 4 for sparse (in units of uint128_t).");
+
+    // Construct the canonical GMMA N Layout with shape ((8,n),(2,1))
+    Layout canonical_layout = logical_divide(layout(u128_tensor), make_tile(Layout<_8,_1>{}, Layout<_2,_1>{}));
+
+    // Check ranks of canonical
+    CUTE_STATIC_ASSERT_V(rank<0>(canonical_layout) == Int<2>{}, "Not a canonical GMMA_K Layout: No flat offset mode");
+    CUTE_STATIC_ASSERT_V(rank<1>(canonical_layout) == Int<2>{}, "Not a canonical GMMA_K Layout: No flat offset mode");
+    // Check canonical mode strides
+    constexpr uint32_t stride_00 = stride<0,0>(canonical_layout);
+    constexpr uint32_t expected_stride_00 = W;
+    static_assert(stride_00 == expected_stride_00, "Not a canonical GMMA_K Layout: Expected stride failure.");
+    constexpr uint32_t stride_10 = stride<1,0>(canonical_layout);
+    constexpr uint32_t expected_stride_10 = (LAYOUT_TYPE == LayoutType::INTERLEAVE) ? stride<1,0>(canonical_layout) : 1;
+    static_assert(stride_10 == expected_stride_10, "Not a canonical GMMA_K Layout: Expected stride failure.");
+
+    // stride dimension byte offset and leading dimension byte offset (4LSB not included == uint128_t units)
+    constexpr uint32_t stride_01 = stride<0,1>(canonical_layout);
+
+    desc.bitfield.stride_byte_offset_  = stride_01;
+    desc.bitfield.leading_byte_offset_ = stride_10;
+  } else {
+    // Instantiated only for a Major value that is neither MN nor K, so this assertion must fail here.
+    static_assert(MajorMode == Major::MN || MajorMode == Major::K, "Unrecognized MajorMode!");
+  }
+
+#if 0
+  // DEBUG and SANITY
+  assert((start_address & 0b0000001111) == 0); // Must be 16B aligned (4LSB are 0) no negotiation
+  assert((start_address & 0b1110000000) == 0); // Assert base_offset is 0, generalize later
+  if (thread0()) {
+    print("smem_desc input     tensor: "); print(tensor.data()); print(" o "); print(tensor.layout()); print("\n");
+    print("smem_desc uint128_t tensor: "); print(u128_tensor.data()); print(" o "); print(u128_tensor.layout()); print("\n");
+    //print("     desc canonical layout: "); print(canonical_layout); print("\n");
+    print(desc);
+  }
+#endif
+
+  return desc;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Higher level GMMA Descriptor utilities
+///////////////////////////////////////////////////////////////////////////////
+
+// Iterator-like wrapper around one GmmaDescriptor: dereference yields it, operator+ offsets it.
+struct DescriptorIterator
+{
+  using value_type   = GmmaDescriptor;
+  using element_type = GmmaDescriptor;
+  using reference    = GmmaDescriptor;
+
+  GmmaDescriptor desc_;
+
+  // Returns a copy of the stored descriptor.
+  CUTE_HOST_DEVICE constexpr
+  reference operator*() const { return desc_; }
+
+  // Returns the descriptor held by the iterator advanced by i.
+  template <class Index>
+  CUTE_HOST_DEVICE constexpr
+  reference operator[](Index const& i) const { return (*this + i).desc_; }
+
+  // Returns a new iterator whose descriptor is built from desc_ + uint64_t(offset).
+  template <class Index>
+  CUTE_HOST_DEVICE constexpr
+  DescriptorIterator operator+(Index const& offset) const {
+    return DescriptorIterator{ GmmaDescriptor{desc_ + uint64_t(offset)} };
+  }
+};
+
+// Unwraps a DescriptorIterator to the GmmaDescriptor it stores.
+// T cannot be deduced from the argument, so it defaults to void: a plain
+// raw_pointer_cast(iter) call now resolves, and explicit raw_pointer_cast<T>(iter) still works.
+template <class T = void>
+CUTE_HOST_DEVICE constexpr GmmaDescriptor
+raw_pointer_cast(DescriptorIterator const& ptr) { return ptr.desc_; }
+
+// Recasting a DescriptorIterator is only allowed to uint64_t (its RegType in mma_unpack).
+// It is a no-op: the result still dereferences to a GmmaDescriptor, which decays to uint64_t.
+template <class NewT>
+CUTE_HOST_DEVICE constexpr DescriptorIterator
+recast_ptr(DescriptorIterator const& iter) {
+  static_assert(is_same<NewT, uint64_t>::value, "Can only cast GmmaDescriptorIterator to uint64_t.");
+  return DescriptorIterator{iter.desc_};
+}
+
+// Prints a fixed tag naming the iterator type; the stored descriptor is not printed.
+CUTE_HOST_DEVICE void print(DescriptorIterator) {
+  printf("GMMA::DescriptorIterator");
+}
+
+// Fragment type flag used as FrgTypeA/FrgTypeB by the GMMA traits below for their smem desc tensors.
+// MakeTensor is specialized on this flag so that the desired descriptor fragment gets built.
+template <Major>
+struct smem_desc : DescriptorIterator {};
+
+} // end namespace SM90::GMMA
+
+// Customization point that builds a GMMA descriptor tensor from an smem tensor.
+template <SM90::GMMA::Major MajorMode>
+struct MakeTensor<SM90::GMMA::smem_desc<MajorMode>>
+{
+  template <class TEngine, class TLayout>
+  CUTE_HOST_DEVICE constexpr auto
+  operator()(Tensor<TEngine,TLayout> const& smem_tensor) {
+    static_assert(is_smem<TEngine>::value, "Expected SMEM Tensor to construct a GMMA Desc Tensor");
+    // Descriptor comes from tensor<0>(smem_tensor); mode 0 of the uint128_t layout becomes Layout<_1,_0>.
+    auto desc_iter = SM90::GMMA::DescriptorIterator{SM90::GMMA::make_gmma_desc<MajorMode>(tensor<0>(smem_tensor))};
+    return make_tensor(desc_iter, replace<0>(recast<uint128_t const>(smem_tensor).layout(), Layout<_1,_0>{}));
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+//////////////////////////// MMA_TRAITS ///////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
+
+namespace SM90::GMMA {
+
+//
+// Specialized mma_unpack implementation for SM90 GMMA instructions
+//
+
+// GMMA flavour of mma_unpack: views A, B and the accumulator as MMA_Op register arrays and
+// forwards them to MMA_Op::fma together with &traits.accumulate_. C is never read; D serves
+// as both input and output accumulator, so C and D must agree in value type and layout.
+template <class MMA_Op, class... MMA_Args,
+          class TD, class DLayout, class TA, class ALayout,
+          class TB, class BLayout, class TC, class CLayout>
+CUTE_HOST_DEVICE constexpr
+void
+mma_unpack(MMA_Traits<MMA_Op, MMA_Args...> const& traits,
+           Tensor<TD, DLayout>      & D,
+           Tensor<TA, ALayout> const& A,
+           Tensor<TB, BLayout> const& B,
+           Tensor<TC, CLayout> const& C)
+{
+  // Every operand tensor must be register-backed.
+  static_assert(is_rmem<TD>::value, "Expected registers in MMA_Atom::call");
+  static_assert(is_rmem<TA>::value, "Expected registers in MMA_Atom::call");
+  static_assert(is_rmem<TB>::value, "Expected registers in MMA_Atom::call");
+  static_assert(is_rmem<TC>::value, "Expected registers in MMA_Atom::call");
+
+  // SM90 GMMA takes three arguments rather than four: D doubles as C, so their types must match.
+  static_assert(is_same<typename TD::value_type, typename TC::value_type>::value, "GMMA C and D value_type must match.");
+  static_assert(is_same<DLayout, CLayout>::value, "GMMA C and D layouts must match.");
+
+  // Register arrays declared by MMA_Op and the number of registers in each.
+  using ARegs = typename MMA_Op::ARegisters;
+  using BRegs = typename MMA_Op::BRegisters;
+  using CRegs = typename MMA_Op::CRegisters;
+  constexpr int kNumA = extent<ARegs>::value;
+  constexpr int kNumB = extent<BRegs>::value;
+  constexpr int kNumC = extent<CRegs>::value;
+
+  // View each fragment in units of its register type (the accumulator through mutable D).
+  Tensor regA = recast<typename remove_extent<ARegs>::type>(A);
+  Tensor regB = recast<typename remove_extent<BRegs>::type>(B);
+  Tensor regC = recast<typename remove_extent<CRegs>::type>(D);
+  CUTE_STATIC_ASSERT_V(size(regA) == Int<kNumA>{});
+  CUTE_STATIC_ASSERT_V(size(regB) == Int<kNumB>{});
+  CUTE_STATIC_ASSERT_V(size(regC) == Int<kNumC>{});
+
+  detail::explode(MMA_Op::fma,
+                  regA, make_int_sequence<kNumA>{},
+                  regB, make_int_sequence<kNumB>{},
+                  regC, make_int_sequence<kNumC>{},
+                  &(traits.accumulate_), seq<0>{});
+}
+
+// Accumulator layouts: threads (4,8,4) x values (2,2,N/8); each of the N/8 steps advances by 512 = 64*8
+template<int N>
+using CLayout_64xN   = Layout<Shape <Shape <  _4,_8, _4>,Shape < _2,_2,Int<N/8>>>,
+                              Stride<Stride<_128,_1,_16>,Stride<_64,_8,   _512>>>;
+
+using CLayout_64x8   = CLayout_64xN<  8>;
+using CLayout_64x16  = CLayout_64xN< 16>;
+using CLayout_64x32  = CLayout_64xN< 32>;
+using CLayout_64x64  = CLayout_64xN< 64>;
+using CLayout_64x96  = CLayout_64xN< 96>;
+using CLayout_64x128 = CLayout_64xN<128>;
+using CLayout_64x192 = CLayout_64xN<192>;
+using CLayout_64x256 = CLayout_64xN<256>;
+
+// Register source layout for 32-bit value types
+using ALayout_64x8   = Layout<Shape <Shape <  _4,_8, _4>,Shape <    _2,  _2>>,
+                              Stride<Stride< _64,_1,_16>,Stride<    _8,_256>>>;
+
+// Register source layout for 16-bit (sparse 32-bit) value types
+using ALayout_64x16  = Layout<Shape <Shape <  _4,_8, _4>,Shape < _2,_2,  _2>>,
+                              Stride<Stride<_128,_1,_16>,Stride<_64,_8,_512>>>;
+
+// Register source layout for 8-bit (sparse 16-bit) value types
+using ALayout_64x32  = Layout<Shape <Shape <  _4,_8, _4>,Shape < _4,_2,   _2>>,
+                              Stride<Stride<_256,_1,_16>,Stride<_64,_8,_1024>>>;
+
+// Register source layout for sparse 8-bit value types
+using ALayout_64x64  = Layout<Shape <Shape <  _4,_8, _4>,Shape < _8,_2,   _2>>,
+                              Stride<Stride<_512,_1,_16>,Stride<_64,_8,_2048>>>;
+
+// Shared memory source layouts for any value type: thread mode 128 has stride 0, values span (M,K) with strides (1,M)
+template <int M, int K>
+using ABLayout       = Layout<Shape <_128,Shape <Int<M>,Int<K>>>,
+                              Stride<  _0,Stride<    _1,Int<M>>>>;
+
+} // end namespace SM90::GMMA
+
+using namespace SM90;
+
+// Alias for SM90::GMMA::MMA_64x8x16_F16F16F16_SS; scaleA and scaleB default to ScaleIn::One.
+template <GMMA::Major tnspA, GMMA::Major tnspB,
+          GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x8x16_F16F16F16_SS = SM90::GMMA::MMA_64x8x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+
+// Traits of the 64x8x16 half_t atom whose A and B fragments are both smem descriptors.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x8x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  // Descriptor fragments tagged with each operand's major mode.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_8,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 16>;
+  using BLayout   = GMMA::ABLayout<  8, 16>;
+  using CLayout   = GMMA::CLayout_64x8;
+
+  // Accumulate mode; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for SM90::GMMA::MMA_64x8x16_F16F16F16_RS; scaleA and scaleB default to ScaleIn::One.
+template <GMMA::Major tnspA, GMMA::Major tnspB,
+          GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x8x16_F16F16F16_RS = SM90::GMMA::MMA_64x8x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
+
+// Traits of the 64x8x16 half_t atom: A uses the register layout ALayout_64x16, B an smem descriptor.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x8x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  // Only B has a descriptor fragment type here.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_8,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<  8, 16>;
+  using CLayout   = GMMA::CLayout_64x8;
+
+  // Accumulate mode; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for SM90::GMMA::MMA_64x16x16_F16F16F16_SS; scaleA and scaleB default to ScaleIn::One.
+template <GMMA::Major tnspA, GMMA::Major tnspB,
+          GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x16x16_F16F16F16_SS = SM90::GMMA::MMA_64x16x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+
+// Traits of the 64x16x16 half_t atom whose A and B fragments are both smem descriptors.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x16x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  // Descriptor fragments tagged with each operand's major mode.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_16,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 16>;
+  using BLayout   = GMMA::ABLayout< 16, 16>;
+  using CLayout   = GMMA::CLayout_64x16;
+
+  // Accumulate mode; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SM90_64x16x16_F16F16F16_RS forwards to the SM90::GMMA atom of the same name; both ScaleIn default to One.
+template <GMMA::Major majorA, GMMA::Major majorB,
+          GMMA::ScaleIn scaleInA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleInB = GMMA::ScaleIn::One>
+using SM90_64x16x16_F16F16F16_RS =
+    SM90::GMMA::MMA_64x16x16_F16F16F16_RS<majorA, majorB, scaleInA, scaleInB>;
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB>
+struct MMA_Traits<SM90_64x16x16_F16F16F16_RS<majorA, majorB, scaleInA, scaleInB>>
+{
+  // Shape and layout aliases for this atom.
+  using Shape_MNK = Shape<_64, _16, _16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<16, 16>;
+  using CLayout   = GMMA::CLayout_64x16;
+
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<majorB>;  // no FrgTypeA alias in this RS variant
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SM90_64x32x16_F16F16F16_SS forwards to the SM90::GMMA atom of the same name; both ScaleIn default to One.
+template <GMMA::Major majorA, GMMA::Major majorB,
+          GMMA::ScaleIn scaleInA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleInB = GMMA::ScaleIn::One>
+using SM90_64x32x16_F16F16F16_SS =
+    SM90::GMMA::MMA_64x32x16_F16F16F16_SS<majorA, majorB, scaleInA, scaleInB>;
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB>
+struct MMA_Traits<SM90_64x32x16_F16F16F16_SS<majorA, majorB, scaleInA, scaleInB>>
+{
+  // Shape and layout aliases for this atom.
+  using Shape_MNK = Shape<_64, _32, _16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 16>;
+  using BLayout   = GMMA::ABLayout<32, 16>;
+  using CLayout   = GMMA::CLayout_64x32;
+
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<majorA>;
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SM90_64x32x16_F16F16F16_RS forwards to the SM90::GMMA atom of the same name; both ScaleIn default to One.
+template <GMMA::Major majorA, GMMA::Major majorB,
+          GMMA::ScaleIn scaleInA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleInB = GMMA::ScaleIn::One>
+using SM90_64x32x16_F16F16F16_RS =
+    SM90::GMMA::MMA_64x32x16_F16F16F16_RS<majorA, majorB, scaleInA, scaleInB>;
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB>
+struct MMA_Traits<SM90_64x32x16_F16F16F16_RS<majorA, majorB, scaleInA, scaleInB>>
+{
+  // Shape and layout aliases for this atom.
+  using Shape_MNK = Shape<_64, _32, _16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<32, 16>;
+  using CLayout   = GMMA::CLayout_64x32;
+
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<majorB>;  // no FrgTypeA alias in this RS variant
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SM90_64x64x16_F16F16F16_SS forwards to the SM90::GMMA atom of the same name; both ScaleIn default to One.
+template <GMMA::Major majorA, GMMA::Major majorB,
+          GMMA::ScaleIn scaleInA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleInB = GMMA::ScaleIn::One>
+using SM90_64x64x16_F16F16F16_SS =
+    SM90::GMMA::MMA_64x64x16_F16F16F16_SS<majorA, majorB, scaleInA, scaleInB>;
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB>
+struct MMA_Traits<SM90_64x64x16_F16F16F16_SS<majorA, majorB, scaleInA, scaleInB>>
+{
+  // Shape and layout aliases for this atom.
+  using Shape_MNK = Shape<_64, _64, _16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 16>;
+  using BLayout   = GMMA::ABLayout<64, 16>;
+  using CLayout   = GMMA::CLayout_64x64;
+
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<majorA>;
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SM90_64x64x16_F16F16F16_RS forwards to the SM90::GMMA atom of the same name; both ScaleIn default to One.
+template <GMMA::Major majorA, GMMA::Major majorB,
+          GMMA::ScaleIn scaleInA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleInB = GMMA::ScaleIn::One>
+using SM90_64x64x16_F16F16F16_RS =
+    SM90::GMMA::MMA_64x64x16_F16F16F16_RS<majorA, majorB, scaleInA, scaleInB>;
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB>
+struct MMA_Traits<SM90_64x64x16_F16F16F16_RS<majorA, majorB, scaleInA, scaleInB>>
+{
+  // Shape and layout aliases for this atom.
+  using Shape_MNK = Shape<_64, _64, _16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<64, 16>;
+  using CLayout   = GMMA::CLayout_64x64;
+
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<majorB>;  // no FrgTypeA alias in this RS variant
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SM90_64x96x16_F16F16F16_SS forwards to the SM90::GMMA atom of the same name; both ScaleIn default to One.
+template <GMMA::Major majorA, GMMA::Major majorB,
+          GMMA::ScaleIn scaleInA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleInB = GMMA::ScaleIn::One>
+using SM90_64x96x16_F16F16F16_SS =
+    SM90::GMMA::MMA_64x96x16_F16F16F16_SS<majorA, majorB, scaleInA, scaleInB>;
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB>
+struct MMA_Traits<SM90_64x96x16_F16F16F16_SS<majorA, majorB, scaleInA, scaleInB>>
+{
+  // Shape and layout aliases for this atom.
+  using Shape_MNK = Shape<_64, _96, _16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 16>;
+  using BLayout   = GMMA::ABLayout<96, 16>;
+  using CLayout   = GMMA::CLayout_64x96;
+
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<majorA>;
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SM90_64x96x16_F16F16F16_RS forwards to the SM90::GMMA atom of the same name; both ScaleIn default to One.
+template <GMMA::Major majorA, GMMA::Major majorB,
+          GMMA::ScaleIn scaleInA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleInB = GMMA::ScaleIn::One>
+using SM90_64x96x16_F16F16F16_RS =
+    SM90::GMMA::MMA_64x96x16_F16F16F16_RS<majorA, majorB, scaleInA, scaleInB>;
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB>
+struct MMA_Traits<SM90_64x96x16_F16F16F16_RS<majorA, majorB, scaleInA, scaleInB>>
+{
+  // Shape and layout aliases for this atom.
+  using Shape_MNK = Shape<_64, _96, _16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<96, 16>;
+  using CLayout   = GMMA::CLayout_64x96;
+
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<majorB>;  // no FrgTypeA alias in this RS variant
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SM90_64x128x16_F16F16F16_SS forwards to the SM90::GMMA atom of the same name; both ScaleIn default to One.
+template <GMMA::Major majorA, GMMA::Major majorB,
+          GMMA::ScaleIn scaleInA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleInB = GMMA::ScaleIn::One>
+using SM90_64x128x16_F16F16F16_SS =
+    SM90::GMMA::MMA_64x128x16_F16F16F16_SS<majorA, majorB, scaleInA, scaleInB>;
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB>
+struct MMA_Traits<SM90_64x128x16_F16F16F16_SS<majorA, majorB, scaleInA, scaleInB>>
+{
+  // Shape and layout aliases for this atom.
+  using Shape_MNK = Shape<_64, _128, _16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 16>;
+  using BLayout   = GMMA::ABLayout<128, 16>;
+  using CLayout   = GMMA::CLayout_64x128;
+
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<majorA>;
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SM90_64x128x16_F16F16F16_RS forwards to the SM90::GMMA atom of the same name; both ScaleIn default to One.
+template <GMMA::Major majorA, GMMA::Major majorB,
+          GMMA::ScaleIn scaleInA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleInB = GMMA::ScaleIn::One>
+using SM90_64x128x16_F16F16F16_RS =
+    SM90::GMMA::MMA_64x128x16_F16F16F16_RS<majorA, majorB, scaleInA, scaleInB>;
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB>
+struct MMA_Traits<SM90_64x128x16_F16F16F16_RS<majorA, majorB, scaleInA, scaleInB>>
+{
+  // Shape and layout aliases for this atom.
+  using Shape_MNK = Shape<_64, _128, _16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<128, 16>;
+  using CLayout   = GMMA::CLayout_64x128;
+
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<majorB>;  // no FrgTypeA alias in this RS variant
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SM90_64x192x16_F16F16F16_SS forwards to the SM90::GMMA atom of the same name; both ScaleIn default to One.
+template <GMMA::Major majorA, GMMA::Major majorB,
+          GMMA::ScaleIn scaleInA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleInB = GMMA::ScaleIn::One>
+using SM90_64x192x16_F16F16F16_SS =
+    SM90::GMMA::MMA_64x192x16_F16F16F16_SS<majorA, majorB, scaleInA, scaleInB>;
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB>
+struct MMA_Traits<SM90_64x192x16_F16F16F16_SS<majorA, majorB, scaleInA, scaleInB>>
+{
+  // Shape and layout aliases for this atom.
+  using Shape_MNK = Shape<_64, _192, _16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 16>;
+  using BLayout   = GMMA::ABLayout<192, 16>;
+  using CLayout   = GMMA::CLayout_64x192;
+
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<majorA>;
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SM90_64x192x16_F16F16F16_RS forwards to the SM90::GMMA atom of the same name; both ScaleIn default to One.
+template <GMMA::Major majorA, GMMA::Major majorB,
+          GMMA::ScaleIn scaleInA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleInB = GMMA::ScaleIn::One>
+using SM90_64x192x16_F16F16F16_RS =
+    SM90::GMMA::MMA_64x192x16_F16F16F16_RS<majorA, majorB, scaleInA, scaleInB>;
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB>
+struct MMA_Traits<SM90_64x192x16_F16F16F16_RS<majorA, majorB, scaleInA, scaleInB>>
+{
+  // Shape and layout aliases for this atom.
+  using Shape_MNK = Shape<_64, _192, _16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<192, 16>;
+  using CLayout   = GMMA::CLayout_64x192;
+
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<majorB>;  // no FrgTypeA alias in this RS variant
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SM90_64x256x16_F16F16F16_SS forwards to the SM90::GMMA atom of the same name; both ScaleIn default to One.
+template <GMMA::Major majorA, GMMA::Major majorB,
+          GMMA::ScaleIn scaleInA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleInB = GMMA::ScaleIn::One>
+using SM90_64x256x16_F16F16F16_SS =
+    SM90::GMMA::MMA_64x256x16_F16F16F16_SS<majorA, majorB, scaleInA, scaleInB>;
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB>
+struct MMA_Traits<SM90_64x256x16_F16F16F16_SS<majorA, majorB, scaleInA, scaleInB>>
+{
+  // Shape and layout aliases for this atom.
+  using Shape_MNK = Shape<_64, _256, _16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 16>;
+  using BLayout   = GMMA::ABLayout<256, 16>;
+  using CLayout   = GMMA::CLayout_64x256;
+
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<majorA>;
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SM90_64x256x16_F16F16F16_RS forwards to the SM90::GMMA atom of the same name; both ScaleIn default to One.
+template <GMMA::Major majorA, GMMA::Major majorB,
+          GMMA::ScaleIn scaleInA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleInB = GMMA::ScaleIn::One>
+using SM90_64x256x16_F16F16F16_RS =
+    SM90::GMMA::MMA_64x256x16_F16F16F16_RS<majorA, majorB, scaleInA, scaleInB>;
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB>
+struct MMA_Traits<SM90_64x256x16_F16F16F16_RS<majorA, majorB, scaleInA, scaleInB>>
+{
+  // Shape and layout aliases for this atom.
+  using Shape_MNK = Shape<_64, _256, _16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<256, 16>;
+  using CLayout   = GMMA::CLayout_64x256;
+
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<majorB>;  // no FrgTypeA alias in this RS variant
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SM90_64x8x16_F32F16F16_SS forwards to the SM90::GMMA atom of the same name; both ScaleIn default to One.
+template <GMMA::Major majorA, GMMA::Major majorB,
+          GMMA::ScaleIn scaleInA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleInB = GMMA::ScaleIn::One>
+using SM90_64x8x16_F32F16F16_SS =
+    SM90::GMMA::MMA_64x8x16_F32F16F16_SS<majorA, majorB, scaleInA, scaleInB>;
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB>
+struct MMA_Traits<SM90_64x8x16_F32F16F16_SS<majorA, majorB, scaleInA, scaleInB>>
+{
+  // Shape and layout aliases for this atom.
+  using Shape_MNK = Shape<_64, _8, _16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 16>;
+  using BLayout   = GMMA::ABLayout<8, 16>;
+  using CLayout   = GMMA::CLayout_64x8;
+
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+
+  using FrgTypeA = GMMA::smem_desc<majorA>;
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SM90_64x8x16_F32F16F16_RS forwards to the SM90::GMMA atom of the same name; both ScaleIn default to One.
+template <GMMA::Major majorA, GMMA::Major majorB,
+          GMMA::ScaleIn scaleInA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleInB = GMMA::ScaleIn::One>
+using SM90_64x8x16_F32F16F16_RS =
+    SM90::GMMA::MMA_64x8x16_F32F16F16_RS<majorA, majorB, scaleInA, scaleInB>;
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB>
+struct MMA_Traits<SM90_64x8x16_F32F16F16_RS<majorA, majorB, scaleInA, scaleInB>>
+{
+  // Shape and layout aliases for this atom.
+  using Shape_MNK = Shape<_64, _8, _16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<8, 16>;
+  using CLayout   = GMMA::CLayout_64x8;
+
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+
+  using FrgTypeB = GMMA::smem_desc<majorB>;  // no FrgTypeA alias in this RS variant
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SM90_64x16x16_F32F16F16_SS forwards to the SM90::GMMA atom of the same name; both ScaleIn default to One.
+template <GMMA::Major majorA, GMMA::Major majorB,
+          GMMA::ScaleIn scaleInA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleInB = GMMA::ScaleIn::One>
+using SM90_64x16x16_F32F16F16_SS =
+    SM90::GMMA::MMA_64x16x16_F32F16F16_SS<majorA, majorB, scaleInA, scaleInB>;
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB>
+struct MMA_Traits<SM90_64x16x16_F32F16F16_SS<majorA, majorB, scaleInA, scaleInB>>
+{
+  // Shape and layout aliases for this atom.
+  using Shape_MNK = Shape<_64, _16, _16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 16>;
+  using BLayout   = GMMA::ABLayout<16, 16>;
+  using CLayout   = GMMA::CLayout_64x16;
+
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+
+  using FrgTypeA = GMMA::smem_desc<majorA>;
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SM90_64x16x16_F32F16F16_RS forwards to the SM90::GMMA atom of the same name; both ScaleIn default to One.
+template <GMMA::Major majorA, GMMA::Major majorB,
+          GMMA::ScaleIn scaleInA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleInB = GMMA::ScaleIn::One>
+using SM90_64x16x16_F32F16F16_RS =
+    SM90::GMMA::MMA_64x16x16_F32F16F16_RS<majorA, majorB, scaleInA, scaleInB>;
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB>
+struct MMA_Traits<SM90_64x16x16_F32F16F16_RS<majorA, majorB, scaleInA, scaleInB>>
+{
+  // Shape and layout aliases for this atom.
+  using Shape_MNK = Shape<_64, _16, _16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<16, 16>;
+  using CLayout   = GMMA::CLayout_64x16;
+
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+
+  using FrgTypeB = GMMA::smem_desc<majorB>;  // no FrgTypeA alias in this RS variant
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SM90_64x32x16_F32F16F16_SS forwards to the SM90::GMMA atom of the same name; both ScaleIn default to One.
+template <GMMA::Major majorA, GMMA::Major majorB,
+          GMMA::ScaleIn scaleInA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleInB = GMMA::ScaleIn::One>
+using SM90_64x32x16_F32F16F16_SS =
+    SM90::GMMA::MMA_64x32x16_F32F16F16_SS<majorA, majorB, scaleInA, scaleInB>;
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB>
+struct MMA_Traits<SM90_64x32x16_F32F16F16_SS<majorA, majorB, scaleInA, scaleInB>>
+{
+  // Shape and layout aliases for this atom.
+  using Shape_MNK = Shape<_64, _32, _16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 16>;
+  using BLayout   = GMMA::ABLayout<32, 16>;
+  using CLayout   = GMMA::CLayout_64x32;
+
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+
+  using FrgTypeA = GMMA::smem_desc<majorA>;
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SM90_64x32x16_F32F16F16_RS forwards to the SM90::GMMA atom of the same name; both ScaleIn default to One.
+template <GMMA::Major majorA, GMMA::Major majorB,
+          GMMA::ScaleIn scaleInA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleInB = GMMA::ScaleIn::One>
+using SM90_64x32x16_F32F16F16_RS =
+    SM90::GMMA::MMA_64x32x16_F32F16F16_RS<majorA, majorB, scaleInA, scaleInB>;
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB>
+struct MMA_Traits<SM90_64x32x16_F32F16F16_RS<majorA, majorB, scaleInA, scaleInB>>
+{
+  // Shape and layout aliases for this atom.
+  using Shape_MNK = Shape<_64, _32, _16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<32, 16>;
+  using CLayout   = GMMA::CLayout_64x32;
+
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+
+  using FrgTypeB = GMMA::smem_desc<majorB>;  // no FrgTypeA alias in this RS variant
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// SM90_64x64x16_F32F16F16_SS forwards to the SM90::GMMA atom of the same name; both ScaleIn default to One.
+template <GMMA::Major majorA, GMMA::Major majorB,
+          GMMA::ScaleIn scaleInA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleInB = GMMA::ScaleIn::One>
+using SM90_64x64x16_F32F16F16_SS =
+    SM90::GMMA::MMA_64x64x16_F32F16F16_SS<majorA, majorB, scaleInA, scaleInB>;
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB>
+struct MMA_Traits<SM90_64x64x16_F32F16F16_SS<majorA, majorB, scaleInA, scaleInB>>
+{
+  // Shape and layout aliases for this atom.
+  using Shape_MNK = Shape<_64, _64, _16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 16>;
+  using BLayout   = GMMA::ABLayout<64, 16>;
+  using CLayout   = GMMA::CLayout_64x64;
+
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+
+  using FrgTypeA = GMMA::smem_desc<majorA>;
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SM90_64x64x16_F32F16F16_RS forwards to the SM90::GMMA atom of the same name; both ScaleIn default to One.
+template <GMMA::Major majorA, GMMA::Major majorB,
+          GMMA::ScaleIn scaleInA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleInB = GMMA::ScaleIn::One>
+using SM90_64x64x16_F32F16F16_RS =
+    SM90::GMMA::MMA_64x64x16_F32F16F16_RS<majorA, majorB, scaleInA, scaleInB>;
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB>
+struct MMA_Traits<SM90_64x64x16_F32F16F16_RS<majorA, majorB, scaleInA, scaleInB>>
+{
+  // Shape and layout aliases for this atom.
+  using Shape_MNK = Shape<_64, _64, _16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<64, 16>;
+  using CLayout   = GMMA::CLayout_64x64;
+
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+
+  using FrgTypeB = GMMA::smem_desc<majorB>;  // no FrgTypeA alias in this RS variant
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SM90_64x96x16_F32F16F16_SS forwards to the SM90::GMMA atom of the same name; both ScaleIn default to One.
+template <GMMA::Major majorA, GMMA::Major majorB,
+          GMMA::ScaleIn scaleInA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleInB = GMMA::ScaleIn::One>
+using SM90_64x96x16_F32F16F16_SS =
+    SM90::GMMA::MMA_64x96x16_F32F16F16_SS<majorA, majorB, scaleInA, scaleInB>;
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB>
+struct MMA_Traits<SM90_64x96x16_F32F16F16_SS<majorA, majorB, scaleInA, scaleInB>>
+{
+  // Shape and layout aliases for this atom.
+  using Shape_MNK = Shape<_64, _96, _16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 16>;
+  using BLayout   = GMMA::ABLayout<96, 16>;
+  using CLayout   = GMMA::CLayout_64x96;
+
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+
+  using FrgTypeA = GMMA::smem_desc<majorA>;
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SM90_64x96x16_F32F16F16_RS forwards to the SM90::GMMA atom of the same name; both ScaleIn default to One.
+template <GMMA::Major majorA, GMMA::Major majorB,
+          GMMA::ScaleIn scaleInA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleInB = GMMA::ScaleIn::One>
+using SM90_64x96x16_F32F16F16_RS =
+    SM90::GMMA::MMA_64x96x16_F32F16F16_RS<majorA, majorB, scaleInA, scaleInB>;
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB>
+struct MMA_Traits<SM90_64x96x16_F32F16F16_RS<majorA, majorB, scaleInA, scaleInB>>
+{
+  // Shape and layout aliases for this atom.
+  using Shape_MNK = Shape<_64, _96, _16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<96, 16>;
+  using CLayout   = GMMA::CLayout_64x96;
+
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+
+  using FrgTypeB = GMMA::smem_desc<majorB>;  // no FrgTypeA alias in this RS variant
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SM90_64x128x16_F32F16F16_SS forwards to the SM90::GMMA atom of the same name; both ScaleIn default to One.
+template <GMMA::Major majorA, GMMA::Major majorB,
+          GMMA::ScaleIn scaleInA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleInB = GMMA::ScaleIn::One>
+using SM90_64x128x16_F32F16F16_SS =
+    SM90::GMMA::MMA_64x128x16_F32F16F16_SS<majorA, majorB, scaleInA, scaleInB>;
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB>
+struct MMA_Traits<SM90_64x128x16_F32F16F16_SS<majorA, majorB, scaleInA, scaleInB>>
+{
+  // Shape and layout aliases for this atom.
+  using Shape_MNK = Shape<_64, _128, _16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 16>;
+  using BLayout   = GMMA::ABLayout<128, 16>;
+  using CLayout   = GMMA::CLayout_64x128;
+
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+
+  using FrgTypeA = GMMA::smem_desc<majorA>;
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SM90_64x128x16_F32F16F16_RS forwards to the SM90::GMMA atom of the same name; both ScaleIn default to One.
+template <GMMA::Major majorA, GMMA::Major majorB,
+          GMMA::ScaleIn scaleInA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleInB = GMMA::ScaleIn::One>
+using SM90_64x128x16_F32F16F16_RS =
+    SM90::GMMA::MMA_64x128x16_F32F16F16_RS<majorA, majorB, scaleInA, scaleInB>;
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB>
+struct MMA_Traits<SM90_64x128x16_F32F16F16_RS<majorA, majorB, scaleInA, scaleInB>>
+{
+  // Shape and layout aliases for this atom.
+  using Shape_MNK = Shape<_64, _128, _16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<128, 16>;
+  using CLayout   = GMMA::CLayout_64x128;
+
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+
+  using FrgTypeB = GMMA::smem_desc<majorB>;  // no FrgTypeA alias in this RS variant
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SM90_64x192x16_F32F16F16_SS forwards to the SM90::GMMA atom of the same name; both ScaleIn default to One.
+template <GMMA::Major majorA, GMMA::Major majorB,
+          GMMA::ScaleIn scaleInA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleInB = GMMA::ScaleIn::One>
+using SM90_64x192x16_F32F16F16_SS =
+    SM90::GMMA::MMA_64x192x16_F32F16F16_SS<majorA, majorB, scaleInA, scaleInB>;
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB>
+struct MMA_Traits<SM90_64x192x16_F32F16F16_SS<majorA, majorB, scaleInA, scaleInB>>
+{
+  // Shape and layout aliases for this atom.
+  using Shape_MNK = Shape<_64, _192, _16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 16>;
+  using BLayout   = GMMA::ABLayout<192, 16>;
+  using CLayout   = GMMA::CLayout_64x192;
+
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+
+  using FrgTypeA = GMMA::smem_desc<majorA>;
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SM90_64x192x16_F32F16F16_RS forwards to the SM90::GMMA atom of the same name; both ScaleIn default to One.
+template <GMMA::Major majorA, GMMA::Major majorB,
+          GMMA::ScaleIn scaleInA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleInB = GMMA::ScaleIn::One>
+using SM90_64x192x16_F32F16F16_RS =
+    SM90::GMMA::MMA_64x192x16_F32F16F16_RS<majorA, majorB, scaleInA, scaleInB>;
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB>
+struct MMA_Traits<SM90_64x192x16_F32F16F16_RS<majorA, majorB, scaleInA, scaleInB>>
+{
+  // Shape and layout aliases for this atom.
+  using Shape_MNK = Shape<_64, _192, _16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<192, 16>;
+  using CLayout   = GMMA::CLayout_64x192;
+
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+
+  using FrgTypeB = GMMA::smem_desc<majorB>;  // no FrgTypeA alias in this RS variant
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SM90_64x256x16_F32F16F16_SS forwards to the SM90::GMMA atom of the same name; both ScaleIn default to One.
+template <GMMA::Major majorA, GMMA::Major majorB,
+          GMMA::ScaleIn scaleInA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleInB = GMMA::ScaleIn::One>
+using SM90_64x256x16_F32F16F16_SS =
+    SM90::GMMA::MMA_64x256x16_F32F16F16_SS<majorA, majorB, scaleInA, scaleInB>;
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB>
+struct MMA_Traits<SM90_64x256x16_F32F16F16_SS<majorA, majorB, scaleInA, scaleInB>>
+{
+  // Shape and layout aliases for this atom.
+  using Shape_MNK = Shape<_64, _256, _16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 16>;
+  using BLayout   = GMMA::ABLayout<256, 16>;
+  using CLayout   = GMMA::CLayout_64x256;
+
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+
+  using FrgTypeA = GMMA::smem_desc<majorA>;
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SM90_64x256x16_F32F16F16_RS forwards to the SM90::GMMA atom of the same name; both ScaleIn default to One.
+template <GMMA::Major majorA, GMMA::Major majorB,
+          GMMA::ScaleIn scaleInA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleInB = GMMA::ScaleIn::One>
+using SM90_64x256x16_F32F16F16_RS =
+    SM90::GMMA::MMA_64x256x16_F32F16F16_RS<majorA, majorB, scaleInA, scaleInB>;
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB>
+struct MMA_Traits<SM90_64x256x16_F32F16F16_RS<majorA, majorB, scaleInA, scaleInB>>
+{
+  // Shape and layout aliases for this atom.
+  using Shape_MNK = Shape<_64, _256, _16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<256, 16>;
+  using CLayout   = GMMA::CLayout_64x256;
+
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+
+  using FrgTypeB = GMMA::smem_desc<majorB>;  // no FrgTypeA alias in this RS variant
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SM90_64x8x16_F32BF16BF16_SS forwards to the SM90::GMMA atom of the same name; both ScaleIn default to One.
+template <GMMA::Major majorA, GMMA::Major majorB,
+          GMMA::ScaleIn scaleInA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleInB = GMMA::ScaleIn::One>
+using SM90_64x8x16_F32BF16BF16_SS =
+    SM90::GMMA::MMA_64x8x16_F32BF16BF16_SS<majorA, majorB, scaleInA, scaleInB>;
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB>
+struct MMA_Traits<SM90_64x8x16_F32BF16BF16_SS<majorA, majorB, scaleInA, scaleInB>>
+{
+  // Shape and layout aliases for this atom.
+  using Shape_MNK = Shape<_64, _8, _16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 16>;
+  using BLayout   = GMMA::ABLayout<8, 16>;
+  using CLayout   = GMMA::CLayout_64x8;
+
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+
+  using FrgTypeA = GMMA::smem_desc<majorA>;
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SM90_64x8x16_F32BF16BF16_RS forwards to the SM90::GMMA atom of the same name; both ScaleIn default to One.
+template <GMMA::Major majorA, GMMA::Major majorB,
+          GMMA::ScaleIn scaleInA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleInB = GMMA::ScaleIn::One>
+using SM90_64x8x16_F32BF16BF16_RS =
+    SM90::GMMA::MMA_64x8x16_F32BF16BF16_RS<majorA, majorB, scaleInA, scaleInB>;
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB>
+struct MMA_Traits<SM90_64x8x16_F32BF16BF16_RS<majorA, majorB, scaleInA, scaleInB>>
+{
+  // Shape and layout aliases for this atom.
+  using Shape_MNK = Shape<_64, _8, _16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<8, 16>;
+  using CLayout   = GMMA::CLayout_64x8;
+
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+
+  using FrgTypeB = GMMA::smem_desc<majorB>;  // no FrgTypeA alias in this RS variant
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SM90_64x16x16_F32BF16BF16_SS forwards to the SM90::GMMA atom of the same name; both ScaleIn default to One.
+template <GMMA::Major majorA, GMMA::Major majorB,
+          GMMA::ScaleIn scaleInA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleInB = GMMA::ScaleIn::One>
+using SM90_64x16x16_F32BF16BF16_SS =
+    SM90::GMMA::MMA_64x16x16_F32BF16BF16_SS<majorA, majorB, scaleInA, scaleInB>;
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB>
+struct MMA_Traits<SM90_64x16x16_F32BF16BF16_SS<majorA, majorB, scaleInA, scaleInB>>
+{
+  // Shape and layout aliases for this atom.
+  using Shape_MNK = Shape<_64, _16, _16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 16>;
+  using BLayout   = GMMA::ABLayout<16, 16>;
+  using CLayout   = GMMA::CLayout_64x16;
+
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+
+  using FrgTypeA = GMMA::smem_desc<majorA>;
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SM90_64x16x16_F32BF16BF16_RS forwards to the SM90::GMMA atom of the same name; both ScaleIn default to One.
+template <GMMA::Major majorA, GMMA::Major majorB,
+          GMMA::ScaleIn scaleInA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleInB = GMMA::ScaleIn::One>
+using SM90_64x16x16_F32BF16BF16_RS =
+    SM90::GMMA::MMA_64x16x16_F32BF16BF16_RS<majorA, majorB, scaleInA, scaleInB>;
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB>
+struct MMA_Traits<SM90_64x16x16_F32BF16BF16_RS<majorA, majorB, scaleInA, scaleInB>>
+{
+  // Shape and layout aliases for this atom.
+  using Shape_MNK = Shape<_64, _16, _16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<16, 16>;
+  using CLayout   = GMMA::CLayout_64x16;
+
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+
+  using FrgTypeB = GMMA::smem_desc<majorB>;  // no FrgTypeA alias in this RS variant
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Flat SM90_* alias for the GMMA op SM90::GMMA::MMA_64x32x16_F32BF16BF16_SS; all four parameters are forwarded as-is
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x32x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x32x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x32x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>  // partial specialization covering every instantiation of the alias above
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = bfloat16_t;  // A element type
+  using ValTypeB = bfloat16_t;  // B element type
+  using ValTypeC = float;  // C element type
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;  // A fragment: GMMA smem descriptor selected by tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // B fragment: GMMA smem descriptor selected by tnspB
+
+  using Shape_MNK = Shape<_64,_32,_16>;  // M=64, N=32, K=16
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64, 16>;  // extents equal M x K of Shape_MNK
+  using BLayout = GMMA::ABLayout< 32, 16>;  // extents equal N x K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x32;  // name matches M x N of Shape_MNK
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static per-object field, default ScaleOut::One; presumably read when the op is issued -- confirm at the call site
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Flat SM90_* alias for the GMMA op SM90::GMMA::MMA_64x32x16_F32BF16BF16_RS; all four parameters are forwarded as-is
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x32x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x32x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>;
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x32x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>  // partial specialization covering every instantiation of the alias above
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = bfloat16_t;  // A element type
+  using ValTypeB = bfloat16_t;  // B element type
+  using ValTypeC = float;  // C element type
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // B fragment: GMMA smem descriptor selected by tnspB; no FrgTypeA is declared in this specialization
+
+  using Shape_MNK = Shape<_64,_32,_16>;  // M=64, N=32, K=16
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ALayout_64x16;  // dedicated A layout instead of ABLayout; presumably A is register-sourced in the _RS form -- TODO confirm against the op
+  using BLayout = GMMA::ABLayout< 32, 16>;  // extents equal N x K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x32;  // name matches M x N of Shape_MNK
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static per-object field, default ScaleOut::One; presumably read when the op is issued -- confirm at the call site
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Flat SM90_* alias for the GMMA op SM90::GMMA::MMA_64x64x16_F32BF16BF16_SS; all four parameters are forwarded as-is
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x64x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x64x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x64x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>  // partial specialization covering every instantiation of the alias above
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = bfloat16_t;  // A element type
+  using ValTypeB = bfloat16_t;  // B element type
+  using ValTypeC = float;  // C element type
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;  // A fragment: GMMA smem descriptor selected by tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // B fragment: GMMA smem descriptor selected by tnspB
+
+  using Shape_MNK = Shape<_64,_64,_16>;  // M=64, N=64, K=16
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64, 16>;  // extents equal M x K of Shape_MNK
+  using BLayout = GMMA::ABLayout< 64, 16>;  // extents equal N x K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x64;  // name matches M x N of Shape_MNK
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static per-object field, default ScaleOut::One; presumably read when the op is issued -- confirm at the call site
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Flat SM90_* alias for the GMMA op SM90::GMMA::MMA_64x64x16_F32BF16BF16_RS; all four parameters are forwarded as-is
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x64x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x64x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>;
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x64x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>  // partial specialization covering every instantiation of the alias above
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = bfloat16_t;  // A element type
+  using ValTypeB = bfloat16_t;  // B element type
+  using ValTypeC = float;  // C element type
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // B fragment: GMMA smem descriptor selected by tnspB; no FrgTypeA is declared in this specialization
+
+  using Shape_MNK = Shape<_64,_64,_16>;  // M=64, N=64, K=16
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ALayout_64x16;  // dedicated A layout instead of ABLayout; presumably A is register-sourced in the _RS form -- TODO confirm against the op
+  using BLayout = GMMA::ABLayout< 64, 16>;  // extents equal N x K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x64;  // name matches M x N of Shape_MNK
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static per-object field, default ScaleOut::One; presumably read when the op is issued -- confirm at the call site
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Flat SM90_* alias for the GMMA op SM90::GMMA::MMA_64x96x16_F32BF16BF16_SS; all four parameters are forwarded as-is
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x96x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x96x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x96x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>  // partial specialization covering every instantiation of the alias above
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = bfloat16_t;  // A element type
+  using ValTypeB = bfloat16_t;  // B element type
+  using ValTypeC = float;  // C element type
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;  // A fragment: GMMA smem descriptor selected by tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // B fragment: GMMA smem descriptor selected by tnspB
+
+  using Shape_MNK = Shape<_64,_96,_16>;  // M=64, N=96, K=16
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64, 16>;  // extents equal M x K of Shape_MNK
+  using BLayout = GMMA::ABLayout< 96, 16>;  // extents equal N x K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x96;  // name matches M x N of Shape_MNK
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static per-object field, default ScaleOut::One; presumably read when the op is issued -- confirm at the call site
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Flat SM90_* alias for the GMMA op SM90::GMMA::MMA_64x96x16_F32BF16BF16_RS; all four parameters are forwarded as-is
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x96x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x96x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>;
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x96x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>  // partial specialization covering every instantiation of the alias above
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = bfloat16_t;  // A element type
+  using ValTypeB = bfloat16_t;  // B element type
+  using ValTypeC = float;  // C element type
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // B fragment: GMMA smem descriptor selected by tnspB; no FrgTypeA is declared in this specialization
+
+  using Shape_MNK = Shape<_64,_96,_16>;  // M=64, N=96, K=16
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ALayout_64x16;  // dedicated A layout instead of ABLayout; presumably A is register-sourced in the _RS form -- TODO confirm against the op
+  using BLayout = GMMA::ABLayout< 96, 16>;  // extents equal N x K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x96;  // name matches M x N of Shape_MNK
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static per-object field, default ScaleOut::One; presumably read when the op is issued -- confirm at the call site
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Flat SM90_* alias for the GMMA op SM90::GMMA::MMA_64x128x16_F32BF16BF16_SS; all four parameters are forwarded as-is
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x128x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x128x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x128x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>  // partial specialization covering every instantiation of the alias above
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = bfloat16_t;  // A element type
+  using ValTypeB = bfloat16_t;  // B element type
+  using ValTypeC = float;  // C element type
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;  // A fragment: GMMA smem descriptor selected by tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // B fragment: GMMA smem descriptor selected by tnspB
+
+  using Shape_MNK = Shape<_64,_128,_16>;  // M=64, N=128, K=16
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64, 16>;  // extents equal M x K of Shape_MNK
+  using BLayout = GMMA::ABLayout<128, 16>;  // extents equal N x K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x128;  // name matches M x N of Shape_MNK
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static per-object field, default ScaleOut::One; presumably read when the op is issued -- confirm at the call site
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Flat SM90_* alias for the GMMA op SM90::GMMA::MMA_64x128x16_F32BF16BF16_RS; all four parameters are forwarded as-is
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x128x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x128x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>;
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x128x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>  // partial specialization covering every instantiation of the alias above
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = bfloat16_t;  // A element type
+  using ValTypeB = bfloat16_t;  // B element type
+  using ValTypeC = float;  // C element type
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // B fragment: GMMA smem descriptor selected by tnspB; no FrgTypeA is declared in this specialization
+
+  using Shape_MNK = Shape<_64,_128,_16>;  // M=64, N=128, K=16
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ALayout_64x16;  // dedicated A layout instead of ABLayout; presumably A is register-sourced in the _RS form -- TODO confirm against the op
+  using BLayout = GMMA::ABLayout<128, 16>;  // extents equal N x K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x128;  // name matches M x N of Shape_MNK
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static per-object field, default ScaleOut::One; presumably read when the op is issued -- confirm at the call site
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Flat SM90_* alias for the GMMA op SM90::GMMA::MMA_64x192x16_F32BF16BF16_SS; all four parameters are forwarded as-is
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x192x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x192x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x192x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>  // partial specialization covering every instantiation of the alias above
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = bfloat16_t;  // A element type
+  using ValTypeB = bfloat16_t;  // B element type
+  using ValTypeC = float;  // C element type
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;  // A fragment: GMMA smem descriptor selected by tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // B fragment: GMMA smem descriptor selected by tnspB
+
+  using Shape_MNK = Shape<_64,_192,_16>;  // M=64, N=192, K=16
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64, 16>;  // extents equal M x K of Shape_MNK
+  using BLayout = GMMA::ABLayout<192, 16>;  // extents equal N x K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x192;  // name matches M x N of Shape_MNK
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static per-object field, default ScaleOut::One; presumably read when the op is issued -- confirm at the call site
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Flat SM90_* alias for the GMMA op SM90::GMMA::MMA_64x192x16_F32BF16BF16_RS; all four parameters are forwarded as-is
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x192x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x192x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>;
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x192x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>  // partial specialization covering every instantiation of the alias above
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = bfloat16_t;  // A element type
+  using ValTypeB = bfloat16_t;  // B element type
+  using ValTypeC = float;  // C element type
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // B fragment: GMMA smem descriptor selected by tnspB; no FrgTypeA is declared in this specialization
+
+  using Shape_MNK = Shape<_64,_192,_16>;  // M=64, N=192, K=16
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ALayout_64x16;  // dedicated A layout instead of ABLayout; presumably A is register-sourced in the _RS form -- TODO confirm against the op
+  using BLayout = GMMA::ABLayout<192, 16>;  // extents equal N x K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x192;  // name matches M x N of Shape_MNK
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static per-object field, default ScaleOut::One; presumably read when the op is issued -- confirm at the call site
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Flat SM90_* alias for the GMMA op SM90::GMMA::MMA_64x256x16_F32BF16BF16_SS; all four parameters are forwarded as-is
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x256x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x256x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x256x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>  // partial specialization covering every instantiation of the alias above
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = bfloat16_t;  // A element type
+  using ValTypeB = bfloat16_t;  // B element type
+  using ValTypeC = float;  // C element type
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;  // A fragment: GMMA smem descriptor selected by tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // B fragment: GMMA smem descriptor selected by tnspB
+
+  using Shape_MNK = Shape<_64,_256,_16>;  // M=64, N=256, K=16
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64, 16>;  // extents equal M x K of Shape_MNK
+  using BLayout = GMMA::ABLayout<256, 16>;  // extents equal N x K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x256;  // name matches M x N of Shape_MNK
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static per-object field, default ScaleOut::One; presumably read when the op is issued -- confirm at the call site
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Flat SM90_* alias for the GMMA op SM90::GMMA::MMA_64x256x16_F32BF16BF16_RS; all four parameters are forwarded as-is
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x256x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x256x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>;
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x256x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>  // partial specialization covering every instantiation of the alias above
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = bfloat16_t;  // A element type
+  using ValTypeB = bfloat16_t;  // B element type
+  using ValTypeC = float;  // C element type
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // B fragment: GMMA smem descriptor selected by tnspB; no FrgTypeA is declared in this specialization
+
+  using Shape_MNK = Shape<_64,_256,_16>;  // M=64, N=256, K=16
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ALayout_64x16;  // dedicated A layout instead of ABLayout; presumably A is register-sourced in the _RS form -- TODO confirm against the op
+  using BLayout = GMMA::ABLayout<256, 16>;  // extents equal N x K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x256;  // name matches M x N of Shape_MNK
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static per-object field, default ScaleOut::One; presumably read when the op is issued -- confirm at the call site
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Flat SM90_* alias for the GMMA op SM90::GMMA::MMA_64x8x8_F32TF32TF32_SS_TN; both scale parameters are forwarded as-is
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x8x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x8x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x8x8_F32TF32TF32_SS_TN<scaleA, scaleB>>  // partial specialization covering every instantiation of the alias above
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = tfloat32_t;  // A element type
+  using ValTypeB = tfloat32_t;  // B element type
+  using ValTypeC = float;  // C element type
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: GMMA smem descriptor, majorness fixed to K (no tnspA parameter)
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA smem descriptor, majorness fixed to K (no tnspB parameter)
+
+  using Shape_MNK = Shape<_64,_8,_8>;  // M=64, N=8, K=8
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64,  8>;  // extents equal M x K of Shape_MNK
+  using BLayout = GMMA::ABLayout<  8,  8>;  // extents equal N x K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x8;  // name matches M x N of Shape_MNK
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static per-object field, default ScaleOut::One; presumably read when the op is issued -- confirm at the call site
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Flat SM90_* alias for the GMMA op SM90::GMMA::MMA_64x8x8_F32TF32TF32_RS_TN; both scale parameters are forwarded as-is
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x8x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x8x8_F32TF32TF32_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x8x8_F32TF32TF32_RS_TN<scaleA, scaleB>>  // partial specialization covering every instantiation of the alias above
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = tfloat32_t;  // A element type
+  using ValTypeB = tfloat32_t;  // B element type
+  using ValTypeC = float;  // C element type
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA smem descriptor, majorness fixed to K; no FrgTypeA is declared in this specialization
+
+  using Shape_MNK = Shape<_64,_8,_8>;  // M=64, N=8, K=8
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ALayout_64x8;  // dedicated A layout instead of ABLayout; presumably A is register-sourced in the _RS form -- TODO confirm against the op
+  using BLayout = GMMA::ABLayout<  8,  8>;  // extents equal N x K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x8;  // name matches M x N of Shape_MNK
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static per-object field, default ScaleOut::One; presumably read when the op is issued -- confirm at the call site
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Flat SM90_* alias for the GMMA op SM90::GMMA::MMA_64x16x8_F32TF32TF32_SS_TN; both scale parameters are forwarded as-is
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x16x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x16x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x16x8_F32TF32TF32_SS_TN<scaleA, scaleB>>  // partial specialization covering every instantiation of the alias above
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = tfloat32_t;  // A element type
+  using ValTypeB = tfloat32_t;  // B element type
+  using ValTypeC = float;  // C element type
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: GMMA smem descriptor, majorness fixed to K (no tnspA parameter)
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA smem descriptor, majorness fixed to K (no tnspB parameter)
+
+  using Shape_MNK = Shape<_64,_16,_8>;  // M=64, N=16, K=8
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64,  8>;  // extents equal M x K of Shape_MNK
+  using BLayout = GMMA::ABLayout< 16,  8>;  // extents equal N x K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x16;  // name matches M x N of Shape_MNK
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static per-object field, default ScaleOut::One; presumably read when the op is issued -- confirm at the call site
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Flat SM90_* alias for the GMMA op SM90::GMMA::MMA_64x16x8_F32TF32TF32_RS_TN; both scale parameters are forwarded as-is
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x16x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x16x8_F32TF32TF32_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x16x8_F32TF32TF32_RS_TN<scaleA, scaleB>>  // partial specialization covering every instantiation of the alias above
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = tfloat32_t;  // A element type
+  using ValTypeB = tfloat32_t;  // B element type
+  using ValTypeC = float;  // C element type
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA smem descriptor, majorness fixed to K; no FrgTypeA is declared in this specialization
+
+  using Shape_MNK = Shape<_64,_16,_8>;  // M=64, N=16, K=8
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ALayout_64x8;  // dedicated A layout instead of ABLayout; presumably A is register-sourced in the _RS form -- TODO confirm against the op
+  using BLayout = GMMA::ABLayout< 16,  8>;  // extents equal N x K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x16;  // name matches M x N of Shape_MNK
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static per-object field, default ScaleOut::One; presumably read when the op is issued -- confirm at the call site
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Flat SM90_* alias for the GMMA op SM90::GMMA::MMA_64x32x8_F32TF32TF32_SS_TN; both scale parameters are forwarded as-is
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x32x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x32x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x32x8_F32TF32TF32_SS_TN<scaleA, scaleB>>  // partial specialization covering every instantiation of the alias above
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = tfloat32_t;  // A element type
+  using ValTypeB = tfloat32_t;  // B element type
+  using ValTypeC = float;  // C element type
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: GMMA smem descriptor, majorness fixed to K (no tnspA parameter)
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA smem descriptor, majorness fixed to K (no tnspB parameter)
+
+  using Shape_MNK = Shape<_64,_32,_8>;  // M=64, N=32, K=8
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64,  8>;  // extents equal M x K of Shape_MNK
+  using BLayout = GMMA::ABLayout< 32,  8>;  // extents equal N x K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x32;  // name matches M x N of Shape_MNK
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static per-object field, default ScaleOut::One; presumably read when the op is issued -- confirm at the call site
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Flat SM90_* alias for the GMMA op SM90::GMMA::MMA_64x32x8_F32TF32TF32_RS_TN; both scale parameters are forwarded as-is
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x32x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x32x8_F32TF32TF32_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x32x8_F32TF32TF32_RS_TN<scaleA, scaleB>>  // partial specialization covering every instantiation of the alias above
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = tfloat32_t;  // A element type
+  using ValTypeB = tfloat32_t;  // B element type
+  using ValTypeC = float;  // C element type
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA smem descriptor, majorness fixed to K; no FrgTypeA is declared in this specialization
+
+  using Shape_MNK = Shape<_64,_32,_8>;  // M=64, N=32, K=8
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ALayout_64x8;  // dedicated A layout instead of ABLayout; presumably A is register-sourced in the _RS form -- TODO confirm against the op
+  using BLayout = GMMA::ABLayout< 32,  8>;  // extents equal N x K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x32;  // name matches M x N of Shape_MNK
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static per-object field, default ScaleOut::One; presumably read when the op is issued -- confirm at the call site
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Flat SM90_* alias for the GMMA op SM90::GMMA::MMA_64x64x8_F32TF32TF32_SS_TN; both scale parameters are forwarded as-is
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x64x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x64x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x64x8_F32TF32TF32_SS_TN<scaleA, scaleB>>  // partial specialization covering every instantiation of the alias above
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = tfloat32_t;  // A element type
+  using ValTypeB = tfloat32_t;  // B element type
+  using ValTypeC = float;  // C element type
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: GMMA smem descriptor, majorness fixed to K (no tnspA parameter)
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA smem descriptor, majorness fixed to K (no tnspB parameter)
+
+  using Shape_MNK = Shape<_64,_64,_8>;  // M=64, N=64, K=8
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64,  8>;  // extents equal M x K of Shape_MNK
+  using BLayout = GMMA::ABLayout< 64,  8>;  // extents equal N x K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x64;  // name matches M x N of Shape_MNK
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static per-object field, default ScaleOut::One; presumably read when the op is issued -- confirm at the call site
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Flat SM90_* alias for the GMMA op SM90::GMMA::MMA_64x64x8_F32TF32TF32_RS_TN; both scale parameters are forwarded as-is
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x64x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x64x8_F32TF32TF32_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x64x8_F32TF32TF32_RS_TN<scaleA, scaleB>>  // partial specialization covering every instantiation of the alias above
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = tfloat32_t;  // A element type
+  using ValTypeB = tfloat32_t;  // B element type
+  using ValTypeC = float;  // C element type
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA smem descriptor, majorness fixed to K; no FrgTypeA is declared in this specialization
+
+  using Shape_MNK = Shape<_64,_64,_8>;  // M=64, N=64, K=8
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ALayout_64x8;  // dedicated A layout instead of ABLayout; presumably A is register-sourced in the _RS form -- TODO confirm against the op
+  using BLayout = GMMA::ABLayout< 64,  8>;  // extents equal N x K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x64;  // name matches M x N of Shape_MNK
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static per-object field, default ScaleOut::One; presumably read when the op is issued -- confirm at the call site
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Flat SM90_* alias for the GMMA op SM90::GMMA::MMA_64x96x8_F32TF32TF32_SS_TN; both scale parameters are forwarded as-is
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x96x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x96x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x96x8_F32TF32TF32_SS_TN<scaleA, scaleB>>  // partial specialization covering every instantiation of the alias above
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = tfloat32_t;  // A element type
+  using ValTypeB = tfloat32_t;  // B element type
+  using ValTypeC = float;  // C element type
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: GMMA smem descriptor, majorness fixed to K (no tnspA parameter)
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA smem descriptor, majorness fixed to K (no tnspB parameter)
+
+  using Shape_MNK = Shape<_64,_96,_8>;  // M=64, N=96, K=8
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64,  8>;  // extents equal M x K of Shape_MNK
+  using BLayout = GMMA::ABLayout< 96,  8>;  // extents equal N x K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x96;  // name matches M x N of Shape_MNK
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static per-object field, default ScaleOut::One; presumably read when the op is issued -- confirm at the call site
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Flat SM90_* alias for the GMMA op SM90::GMMA::MMA_64x96x8_F32TF32TF32_RS_TN; both scale parameters are forwarded as-is
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x96x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x96x8_F32TF32TF32_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x96x8_F32TF32TF32_RS_TN<scaleA, scaleB>>  // partial specialization covering every instantiation of the alias above
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = tfloat32_t;  // A element type
+  using ValTypeB = tfloat32_t;  // B element type
+  using ValTypeC = float;  // C element type
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA smem descriptor, majorness fixed to K; no FrgTypeA is declared in this specialization
+
+  using Shape_MNK = Shape<_64,_96,_8>;  // M=64, N=96, K=8
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ALayout_64x8;  // dedicated A layout instead of ABLayout; presumably A is register-sourced in the _RS form -- TODO confirm against the op
+  using BLayout = GMMA::ABLayout< 96,  8>;  // extents equal N x K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x96;  // name matches M x N of Shape_MNK
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static per-object field, default ScaleOut::One; presumably read when the op is issued -- confirm at the call site
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Flat SM90_* alias for the GMMA op SM90::GMMA::MMA_64x128x8_F32TF32TF32_SS_TN; both scale parameters are forwarded as-is
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x128x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x128x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x128x8_F32TF32TF32_SS_TN<scaleA, scaleB>>  // partial specialization covering every instantiation of the alias above
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = tfloat32_t;  // A element type
+  using ValTypeB = tfloat32_t;  // B element type
+  using ValTypeC = float;  // C element type
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: GMMA smem descriptor, majorness fixed to K (no tnspA parameter)
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA smem descriptor, majorness fixed to K (no tnspB parameter)
+
+  using Shape_MNK = Shape<_64,_128,_8>;  // M=64, N=128, K=8
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64,  8>;  // extents equal M x K of Shape_MNK
+  using BLayout = GMMA::ABLayout<128,  8>;  // extents equal N x K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x128;  // name matches M x N of Shape_MNK
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static per-object field, default ScaleOut::One; presumably read when the op is issued -- confirm at the call site
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Flat SM90_* alias for the GMMA op SM90::GMMA::MMA_64x128x8_F32TF32TF32_RS_TN; both scale parameters are forwarded as-is
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x128x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x128x8_F32TF32TF32_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x128x8_F32TF32TF32_RS_TN<scaleA, scaleB>>  // partial specialization covering every instantiation of the alias above
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = tfloat32_t;  // A element type
+  using ValTypeB = tfloat32_t;  // B element type
+  using ValTypeC = float;  // C element type
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA smem descriptor, majorness fixed to K; no FrgTypeA is declared in this specialization
+
+  using Shape_MNK = Shape<_64,_128,_8>;  // M=64, N=128, K=8
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ALayout_64x8;  // dedicated A layout instead of ABLayout; presumably A is register-sourced in the _RS form -- TODO confirm against the op
+  using BLayout = GMMA::ABLayout<128,  8>;  // extents equal N x K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x128;  // name matches M x N of Shape_MNK
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static per-object field, default ScaleOut::One; presumably read when the op is issued -- confirm at the call site
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Flat SM90_* alias for the GMMA op SM90::GMMA::MMA_64x192x8_F32TF32TF32_SS_TN; both scale parameters are forwarded as-is
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x192x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x192x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x192x8_F32TF32TF32_SS_TN<scaleA, scaleB>>  // partial specialization covering every instantiation of the alias above
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = tfloat32_t;  // A element type
+  using ValTypeB = tfloat32_t;  // B element type
+  using ValTypeC = float;  // C element type
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: GMMA smem descriptor, majorness fixed to K (no tnspA parameter)
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA smem descriptor, majorness fixed to K (no tnspB parameter)
+
+  using Shape_MNK = Shape<_64,_192,_8>;  // M=64, N=192, K=8
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64,  8>;  // extents equal M x K of Shape_MNK
+  using BLayout = GMMA::ABLayout<192,  8>;  // extents equal N x K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x192;  // name matches M x N of Shape_MNK
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static per-object field, default ScaleOut::One; presumably read when the op is issued -- confirm at the call site
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Flat SM90_* alias for the GMMA op SM90::GMMA::MMA_64x192x8_F32TF32TF32_RS_TN; both scale parameters are forwarded as-is
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x192x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x192x8_F32TF32TF32_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x192x8_F32TF32TF32_RS_TN<scaleA, scaleB>>  // partial specialization covering every instantiation of the alias above
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = tfloat32_t;  // A element type
+  using ValTypeB = tfloat32_t;  // B element type
+  using ValTypeC = float;  // C element type
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA smem descriptor, majorness fixed to K; no FrgTypeA is declared in this specialization
+
+  using Shape_MNK = Shape<_64,_192,_8>;  // M=64, N=192, K=8
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ALayout_64x8;  // dedicated A layout instead of ABLayout; presumably A is register-sourced in the _RS form -- TODO confirm against the op
+  using BLayout = GMMA::ABLayout<192,  8>;  // extents equal N x K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x192;  // name matches M x N of Shape_MNK
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static per-object field, default ScaleOut::One; presumably read when the op is issued -- confirm at the call site
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Flat SM90_* alias for the GMMA op SM90::GMMA::MMA_64x256x8_F32TF32TF32_SS_TN; both scale parameters are forwarded as-is
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x256x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x256x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x256x8_F32TF32TF32_SS_TN<scaleA, scaleB>>  // partial specialization covering every instantiation of the alias above
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = tfloat32_t;  // A element type
+  using ValTypeB = tfloat32_t;  // B element type
+  using ValTypeC = float;  // C element type
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: GMMA smem descriptor, majorness fixed to K (no tnspA parameter)
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA smem descriptor, majorness fixed to K (no tnspB parameter)
+
+  using Shape_MNK = Shape<_64,_256,_8>;  // M=64, N=256, K=8
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64,  8>;  // extents equal M x K of Shape_MNK
+  using BLayout = GMMA::ABLayout<256,  8>;  // extents equal N x K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x256;  // name matches M x N of Shape_MNK
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static per-object field, default ScaleOut::One; presumably read when the op is issued -- confirm at the call site
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Flat SM90_* alias for the GMMA op SM90::GMMA::MMA_64x256x8_F32TF32TF32_RS_TN; both scale parameters are forwarded as-is
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x256x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x256x8_F32TF32TF32_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x256x8_F32TF32TF32_RS_TN<scaleA, scaleB>>  // partial specialization covering every instantiation of the alias above
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = tfloat32_t;  // A element type
+  using ValTypeB = tfloat32_t;  // B element type
+  using ValTypeC = float;  // C element type
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA smem descriptor, majorness fixed to K; no FrgTypeA is declared in this specialization
+
+  using Shape_MNK = Shape<_64,_256,_8>;  // M=64, N=256, K=8
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ALayout_64x8;  // dedicated A layout instead of ABLayout; presumably A is register-sourced in the _RS form -- TODO confirm against the op
+  using BLayout = GMMA::ABLayout<256,  8>;  // extents equal N x K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x256;  // name matches M x N of Shape_MNK
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static per-object field, default ScaleOut::One; presumably read when the op is issued -- confirm at the call site
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x8x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x8x32_S32S8S8_SS_TN;  // flat SM90_* alias for the (non-template) GMMA op
+
+template <>
+struct MMA_Traits<SM90_64x8x32_S32S8S8_SS_TN>  // full specialization: int8 A/B operands, int32 C/D
+{
+  using ValTypeD = int32_t;  // D element type
+  using ValTypeA = int8_t;  // A element type
+  using ValTypeB = int8_t;  // B element type
+  using ValTypeC = int32_t;  // C element type
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: GMMA smem descriptor, majorness fixed to K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA smem descriptor, majorness fixed to K
+
+  using Shape_MNK = Shape<_64,_8,_32>;  // M=64, N=8, K=32
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64, 32>;  // extents equal M x K of Shape_MNK
+  using BLayout = GMMA::ABLayout<  8, 32>;  // extents equal N x K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x8;  // name matches M x N of Shape_MNK
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static per-object field, default ScaleOut::One; presumably read when the op is issued -- confirm at the call site
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x8x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x8x32_S32S8S8_SS_TN_SATURATE;  // flat SM90_* alias for the (non-template) GMMA op
+
+template <>
+struct MMA_Traits<SM90_64x8x32_S32S8S8_SS_TN_SATURATE>  // full specialization; members equal those of the non-SATURATE 64x8x32 traits, so any saturation lives in the op (not visible here)
+{
+  using ValTypeD = int32_t;  // D element type
+  using ValTypeA = int8_t;  // A element type
+  using ValTypeB = int8_t;  // B element type
+  using ValTypeC = int32_t;  // C element type
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: GMMA smem descriptor, majorness fixed to K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA smem descriptor, majorness fixed to K
+
+  using Shape_MNK = Shape<_64,_8,_32>;  // M=64, N=8, K=32
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64, 32>;  // extents equal M x K of Shape_MNK
+  using BLayout = GMMA::ABLayout<  8, 32>;  // extents equal N x K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x8;  // name matches M x N of Shape_MNK
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static per-object field, default ScaleOut::One; presumably read when the op is issued -- confirm at the call site
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x16x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x16x32_S32S8S8_SS_TN;  // flat SM90_* alias for the (non-template) GMMA op
+
+template <>
+struct MMA_Traits<SM90_64x16x32_S32S8S8_SS_TN>  // full specialization: int8 A/B operands, int32 C/D
+{
+  using ValTypeD = int32_t;  // D element type
+  using ValTypeA = int8_t;  // A element type
+  using ValTypeB = int8_t;  // B element type
+  using ValTypeC = int32_t;  // C element type
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: GMMA smem descriptor, majorness fixed to K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA smem descriptor, majorness fixed to K
+
+  using Shape_MNK = Shape<_64,_16,_32>;  // M=64, N=16, K=32
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64, 32>;  // extents equal M x K of Shape_MNK
+  using BLayout = GMMA::ABLayout< 16, 32>;  // extents equal N x K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x16;  // name matches M x N of Shape_MNK
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static per-object field, default ScaleOut::One; presumably read when the op is issued -- confirm at the call site
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x16x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x16x32_S32S8S8_SS_TN_SATURATE;  // flat SM90_* alias for the (non-template) GMMA op
+
+template <>
+struct MMA_Traits<SM90_64x16x32_S32S8S8_SS_TN_SATURATE>  // full specialization; members equal those of the non-SATURATE 64x16x32 traits, so any saturation lives in the op (not visible here)
+{
+  using ValTypeD = int32_t;  // D element type
+  using ValTypeA = int8_t;  // A element type
+  using ValTypeB = int8_t;  // B element type
+  using ValTypeC = int32_t;  // C element type
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: GMMA smem descriptor, majorness fixed to K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA smem descriptor, majorness fixed to K
+
+  using Shape_MNK = Shape<_64,_16,_32>;  // M=64, N=16, K=32
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64, 32>;  // extents equal M x K of Shape_MNK
+  using BLayout = GMMA::ABLayout< 16, 32>;  // extents equal N x K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x16;  // name matches M x N of Shape_MNK
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static per-object field, default ScaleOut::One; presumably read when the op is issued -- confirm at the call site
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x32x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x32x32_S32S8S8_SS_TN;  // flat SM90_* alias for the (non-template) GMMA op
+
+template <>
+struct MMA_Traits<SM90_64x32x32_S32S8S8_SS_TN>  // full specialization: int8 A/B operands, int32 C/D
+{
+  using ValTypeD = int32_t;  // D element type
+  using ValTypeA = int8_t;  // A element type
+  using ValTypeB = int8_t;  // B element type
+  using ValTypeC = int32_t;  // C element type
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: GMMA smem descriptor, majorness fixed to K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA smem descriptor, majorness fixed to K
+
+  using Shape_MNK = Shape<_64,_32,_32>;  // M=64, N=32, K=32
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64, 32>;  // extents equal M x K of Shape_MNK
+  using BLayout = GMMA::ABLayout< 32, 32>;  // extents equal N x K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x32;  // name matches M x N of Shape_MNK
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static per-object field, default ScaleOut::One; presumably read when the op is issued -- confirm at the call site
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x32x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x32x32_S32S8S8_SS_TN_SATURATE;  // flat SM90_* alias for the (non-template) GMMA op
+
+template <>
+struct MMA_Traits<SM90_64x32x32_S32S8S8_SS_TN_SATURATE>  // full specialization; members equal those of the non-SATURATE 64x32x32 traits, so any saturation lives in the op (not visible here)
+{
+  using ValTypeD = int32_t;  // D element type
+  using ValTypeA = int8_t;  // A element type
+  using ValTypeB = int8_t;  // B element type
+  using ValTypeC = int32_t;  // C element type
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: GMMA smem descriptor, majorness fixed to K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA smem descriptor, majorness fixed to K
+
+  using Shape_MNK = Shape<_64,_32,_32>;  // M=64, N=32, K=32
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64, 32>;  // extents equal M x K of Shape_MNK
+  using BLayout = GMMA::ABLayout< 32, 32>;  // extents equal N x K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x32;  // name matches M x N of Shape_MNK
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static per-object field, default ScaleOut::One; presumably read when the op is issued -- confirm at the call site
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x64x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x64x32_S32S8S8_SS_TN;  // flat SM90_* alias for the (non-template) GMMA op
+
+template <>
+struct MMA_Traits<SM90_64x64x32_S32S8S8_SS_TN>  // full specialization: int8 A/B operands, int32 C/D
+{
+  using ValTypeD = int32_t;  // D element type
+  using ValTypeA = int8_t;  // A element type
+  using ValTypeB = int8_t;  // B element type
+  using ValTypeC = int32_t;  // C element type
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: GMMA smem descriptor, majorness fixed to K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA smem descriptor, majorness fixed to K
+
+  using Shape_MNK = Shape<_64,_64,_32>;  // M=64, N=64, K=32
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64, 32>;  // extents equal M x K of Shape_MNK
+  using BLayout = GMMA::ABLayout< 64, 32>;  // extents equal N x K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x64;  // name matches M x N of Shape_MNK
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static per-object field, default ScaleOut::One; presumably read when the op is issued -- confirm at the call site
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x64x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x64x32_S32S8S8_SS_TN_SATURATE;
+// MMA_Traits for the 64x64x32 S32S8S8 SS_TN SATURATE op: int32_t D/C, int8_t A, int8_t B.
+template <>
+struct MMA_Traits<SM90_64x64x32_S32S8S8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // Both A and B use K-major GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_64,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 64, 32>;
+  using CLayout = GMMA::CLayout_64x64;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x96x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x96x32_S32S8S8_SS_TN;
+// MMA_Traits for the 64x96x32 S32S8S8 SS_TN op: int32_t D/C, int8_t A, int8_t B.
+template <>
+struct MMA_Traits<SM90_64x96x32_S32S8S8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // Both A and B use K-major GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_96,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 96, 32>;
+  using CLayout = GMMA::CLayout_64x96;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x96x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x96x32_S32S8S8_SS_TN_SATURATE;
+// MMA_Traits for the 64x96x32 S32S8S8 SS_TN SATURATE op: int32_t D/C, int8_t A, int8_t B.
+template <>
+struct MMA_Traits<SM90_64x96x32_S32S8S8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // Both A and B use K-major GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_96,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 96, 32>;
+  using CLayout = GMMA::CLayout_64x96;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x128x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x128x32_S32S8S8_SS_TN;
+// MMA_Traits for the 64x128x32 S32S8S8 SS_TN op: int32_t D/C, int8_t A, int8_t B.
+template <>
+struct MMA_Traits<SM90_64x128x32_S32S8S8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // Both A and B use K-major GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_128,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<128, 32>;
+  using CLayout = GMMA::CLayout_64x128;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x128x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x128x32_S32S8S8_SS_TN_SATURATE;
+// MMA_Traits for the 64x128x32 S32S8S8 SS_TN SATURATE op: int32_t D/C, int8_t A, int8_t B.
+template <>
+struct MMA_Traits<SM90_64x128x32_S32S8S8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // Both A and B use K-major GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_128,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<128, 32>;
+  using CLayout = GMMA::CLayout_64x128;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x192x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x192x32_S32S8S8_SS_TN;
+// MMA_Traits for the 64x192x32 S32S8S8 SS_TN op: int32_t D/C, int8_t A, int8_t B.
+template <>
+struct MMA_Traits<SM90_64x192x32_S32S8S8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // Both A and B use K-major GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_192,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<192, 32>;
+  using CLayout = GMMA::CLayout_64x192;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x192x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x192x32_S32S8S8_SS_TN_SATURATE;
+// MMA_Traits for the 64x192x32 S32S8S8 SS_TN SATURATE op: int32_t D/C, int8_t A, int8_t B.
+template <>
+struct MMA_Traits<SM90_64x192x32_S32S8S8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // Both A and B use K-major GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_192,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<192, 32>;
+  using CLayout = GMMA::CLayout_64x192;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x256x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x256x32_S32S8S8_SS_TN;
+// MMA_Traits for the 64x256x32 S32S8S8 SS_TN op: int32_t D/C, int8_t A, int8_t B.
+template <>
+struct MMA_Traits<SM90_64x256x32_S32S8S8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // Both A and B use K-major GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_256,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<256, 32>;
+  using CLayout = GMMA::CLayout_64x256;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x256x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x256x32_S32S8S8_SS_TN_SATURATE;
+// MMA_Traits for the 64x256x32 S32S8S8 SS_TN SATURATE op: int32_t D/C, int8_t A, int8_t B.
+template <>
+struct MMA_Traits<SM90_64x256x32_S32S8S8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // Both A and B use K-major GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_256,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<256, 32>;
+  using CLayout = GMMA::CLayout_64x256;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x8x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x8x32_S32S8S8_RS_TN;
+// MMA_Traits for the 64x8x32 S32S8S8 RS_TN op: int32_t D/C, int8_t A, int8_t B.
+template <>
+struct MMA_Traits<SM90_64x8x32_S32S8S8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // Only B has a K-major smem_desc fragment; A declares no FrgTypeA and uses ALayout_64x32.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_8,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<  8, 32>;
+  using CLayout = GMMA::CLayout_64x8;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x8x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x8x32_S32S8S8_RS_TN_SATURATE;
+// MMA_Traits for the 64x8x32 S32S8S8 RS_TN SATURATE op: int32_t D/C, int8_t A, int8_t B.
+template <>
+struct MMA_Traits<SM90_64x8x32_S32S8S8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // Only B has a K-major smem_desc fragment; A declares no FrgTypeA and uses ALayout_64x32.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_8,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<  8, 32>;
+  using CLayout = GMMA::CLayout_64x8;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x16x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x16x32_S32S8S8_RS_TN;
+// MMA_Traits for the 64x16x32 S32S8S8 RS_TN op: int32_t D/C, int8_t A, int8_t B.
+template <>
+struct MMA_Traits<SM90_64x16x32_S32S8S8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // Only B has a K-major smem_desc fragment; A declares no FrgTypeA and uses ALayout_64x32.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_16,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 16, 32>;
+  using CLayout = GMMA::CLayout_64x16;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x16x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x16x32_S32S8S8_RS_TN_SATURATE;
+// MMA_Traits for the 64x16x32 S32S8S8 RS_TN SATURATE op: int32_t D/C, int8_t A, int8_t B.
+template <>
+struct MMA_Traits<SM90_64x16x32_S32S8S8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // Only B has a K-major smem_desc fragment; A declares no FrgTypeA and uses ALayout_64x32.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_16,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 16, 32>;
+  using CLayout = GMMA::CLayout_64x16;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x32x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x32x32_S32S8S8_RS_TN;
+// MMA_Traits for the 64x32x32 S32S8S8 RS_TN op: int32_t D/C, int8_t A, int8_t B.
+template <>
+struct MMA_Traits<SM90_64x32x32_S32S8S8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // Only B has a K-major smem_desc fragment; A declares no FrgTypeA and uses ALayout_64x32.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_32,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 32, 32>;
+  using CLayout = GMMA::CLayout_64x32;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x32x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x32x32_S32S8S8_RS_TN_SATURATE;
+// MMA_Traits for the 64x32x32 S32S8S8 RS_TN SATURATE op: int32_t D/C, int8_t A, int8_t B.
+template <>
+struct MMA_Traits<SM90_64x32x32_S32S8S8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // Only B has a K-major smem_desc fragment; A declares no FrgTypeA and uses ALayout_64x32.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_32,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 32, 32>;
+  using CLayout = GMMA::CLayout_64x32;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x64x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x64x32_S32S8S8_RS_TN;
+// MMA_Traits for the 64x64x32 S32S8S8 RS_TN op: int32_t D/C, int8_t A, int8_t B.
+template <>
+struct MMA_Traits<SM90_64x64x32_S32S8S8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // Only B has a K-major smem_desc fragment; A declares no FrgTypeA and uses ALayout_64x32.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_64,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 64, 32>;
+  using CLayout = GMMA::CLayout_64x64;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x64x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x64x32_S32S8S8_RS_TN_SATURATE;
+// MMA_Traits for the 64x64x32 S32S8S8 RS_TN SATURATE op: int32_t D/C, int8_t A, int8_t B.
+template <>
+struct MMA_Traits<SM90_64x64x32_S32S8S8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // Only B has a K-major smem_desc fragment; A declares no FrgTypeA and uses ALayout_64x32.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_64,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 64, 32>;
+  using CLayout = GMMA::CLayout_64x64;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x96x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x96x32_S32S8S8_RS_TN;
+// MMA_Traits for the 64x96x32 S32S8S8 RS_TN op: int32_t D/C, int8_t A, int8_t B.
+template <>
+struct MMA_Traits<SM90_64x96x32_S32S8S8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // Only B has a K-major smem_desc fragment; A declares no FrgTypeA and uses ALayout_64x32.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_96,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 96, 32>;
+  using CLayout = GMMA::CLayout_64x96;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x96x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x96x32_S32S8S8_RS_TN_SATURATE;
+// MMA_Traits for the 64x96x32 S32S8S8 RS_TN SATURATE op: int32_t D/C, int8_t A, int8_t B.
+template <>
+struct MMA_Traits<SM90_64x96x32_S32S8S8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // Only B has a K-major smem_desc fragment; A declares no FrgTypeA and uses ALayout_64x32.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_96,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 96, 32>;
+  using CLayout = GMMA::CLayout_64x96;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x128x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x128x32_S32S8S8_RS_TN;
+// MMA_Traits for the 64x128x32 S32S8S8 RS_TN op: int32_t D/C, int8_t A, int8_t B.
+template <>
+struct MMA_Traits<SM90_64x128x32_S32S8S8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // Only B has a K-major smem_desc fragment; A declares no FrgTypeA and uses ALayout_64x32.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_128,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<128, 32>;
+  using CLayout = GMMA::CLayout_64x128;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x128x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x128x32_S32S8S8_RS_TN_SATURATE;
+// MMA_Traits for the 64x128x32 S32S8S8 RS_TN SATURATE op: int32_t D/C, int8_t A, int8_t B.
+template <>
+struct MMA_Traits<SM90_64x128x32_S32S8S8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // Only B has a K-major smem_desc fragment; A declares no FrgTypeA and uses ALayout_64x32.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_128,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<128, 32>;
+  using CLayout = GMMA::CLayout_64x128;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x192x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x192x32_S32S8S8_RS_TN;
+// MMA_Traits for the 64x192x32 S32S8S8 RS_TN op: int32_t D/C, int8_t A, int8_t B.
+template <>
+struct MMA_Traits<SM90_64x192x32_S32S8S8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // Only B has a K-major smem_desc fragment; A declares no FrgTypeA and uses ALayout_64x32.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_192,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<192, 32>;
+  using CLayout = GMMA::CLayout_64x192;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x192x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x192x32_S32S8S8_RS_TN_SATURATE;
+// MMA_Traits for the 64x192x32 S32S8S8 RS_TN SATURATE op: int32_t D/C, int8_t A, int8_t B.
+template <>
+struct MMA_Traits<SM90_64x192x32_S32S8S8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // Only B has a K-major smem_desc fragment; A declares no FrgTypeA and uses ALayout_64x32.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_192,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<192, 32>;
+  using CLayout = GMMA::CLayout_64x192;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x256x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x256x32_S32S8S8_RS_TN;
+// MMA_Traits for the 64x256x32 S32S8S8 RS_TN op: int32_t D/C, int8_t A, int8_t B.
+template <>
+struct MMA_Traits<SM90_64x256x32_S32S8S8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // Only B has a K-major smem_desc fragment; A declares no FrgTypeA and uses ALayout_64x32.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_256,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<256, 32>;
+  using CLayout = GMMA::CLayout_64x256;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x256x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x256x32_S32S8S8_RS_TN_SATURATE;
+// MMA_Traits for the 64x256x32 S32S8S8 RS_TN SATURATE op: int32_t D/C, int8_t A, int8_t B.
+template <>
+struct MMA_Traits<SM90_64x256x32_S32S8S8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // Only B has a K-major smem_desc fragment; A declares no FrgTypeA and uses ALayout_64x32.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_256,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<256, 32>;
+  using CLayout = GMMA::CLayout_64x256;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x8x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x8x32_S32S8U8_SS_TN;
+// MMA_Traits for the 64x8x32 S32S8U8 SS_TN op: int32_t D/C, int8_t A, uint8_t B.
+template <>
+struct MMA_Traits<SM90_64x8x32_S32S8U8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Both A and B use K-major GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_8,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<  8, 32>;
+  using CLayout = GMMA::CLayout_64x8;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x8x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x8x32_S32S8U8_SS_TN_SATURATE;
+// MMA_Traits for the 64x8x32 S32S8U8 SS_TN SATURATE op: int32_t D/C, int8_t A, uint8_t B.
+template <>
+struct MMA_Traits<SM90_64x8x32_S32S8U8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Both A and B use K-major GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_8,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<  8, 32>;
+  using CLayout = GMMA::CLayout_64x8;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x16x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x16x32_S32S8U8_SS_TN;
+// MMA_Traits for the 64x16x32 S32S8U8 SS_TN op: int32_t D/C, int8_t A, uint8_t B.
+template <>
+struct MMA_Traits<SM90_64x16x32_S32S8U8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Both A and B use K-major GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_16,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 16, 32>;
+  using CLayout = GMMA::CLayout_64x16;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x16x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x16x32_S32S8U8_SS_TN_SATURATE;
+// MMA_Traits for the 64x16x32 S32S8U8 SS_TN SATURATE op: int32_t D/C, int8_t A, uint8_t B.
+template <>
+struct MMA_Traits<SM90_64x16x32_S32S8U8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Both A and B use K-major GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_16,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 16, 32>;
+  using CLayout = GMMA::CLayout_64x16;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x32x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x32x32_S32S8U8_SS_TN;
+// MMA_Traits for the 64x32x32 S32S8U8 SS_TN op: int32_t D/C, int8_t A, uint8_t B.
+template <>
+struct MMA_Traits<SM90_64x32x32_S32S8U8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Both A and B use K-major GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_32,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 32, 32>;
+  using CLayout = GMMA::CLayout_64x32;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x32x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x32x32_S32S8U8_SS_TN_SATURATE;
+// MMA_Traits for the 64x32x32 S32S8U8 SS_TN SATURATE op: int32_t D/C, int8_t A, uint8_t B.
+template <>
+struct MMA_Traits<SM90_64x32x32_S32S8U8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Both A and B use K-major GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_32,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 32, 32>;
+  using CLayout = GMMA::CLayout_64x32;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x64x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x64x32_S32S8U8_SS_TN;
+// MMA_Traits for the 64x64x32 S32S8U8 SS_TN op: int32_t D/C, int8_t A, uint8_t B.
+template <>
+struct MMA_Traits<SM90_64x64x32_S32S8U8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Both A and B use K-major GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_64,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 64, 32>;
+  using CLayout = GMMA::CLayout_64x64;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x64x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x64x32_S32S8U8_SS_TN_SATURATE;
+// MMA_Traits for the 64x64x32 S32S8U8 SS_TN SATURATE op: int32_t D/C, int8_t A, uint8_t B.
+template <>
+struct MMA_Traits<SM90_64x64x32_S32S8U8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Both A and B use K-major GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_64,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 64, 32>;
+  using CLayout = GMMA::CLayout_64x64;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x96x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x96x32_S32S8U8_SS_TN;
+// MMA_Traits for the 64x96x32 S32S8U8 SS_TN op: int32_t D/C, int8_t A, uint8_t B.
+template <>
+struct MMA_Traits<SM90_64x96x32_S32S8U8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Both A and B use K-major GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_96,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 96, 32>;
+  using CLayout = GMMA::CLayout_64x96;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x96x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x96x32_S32S8U8_SS_TN_SATURATE;
+// MMA_Traits for the 64x96x32 S32S8U8 SS_TN SATURATE op: int32_t D/C, int8_t A, uint8_t B.
+template <>
+struct MMA_Traits<SM90_64x96x32_S32S8U8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Both A and B use K-major GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_96,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 96, 32>;
+  using CLayout = GMMA::CLayout_64x96;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x128x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x128x32_S32S8U8_SS_TN;
+// MMA_Traits for the 64x128x32 S32S8U8 SS_TN op: int32_t D/C, int8_t A, uint8_t B.
+template <>
+struct MMA_Traits<SM90_64x128x32_S32S8U8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Both A and B use K-major GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_128,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<128, 32>;
+  using CLayout = GMMA::CLayout_64x128;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x128x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x128x32_S32S8U8_SS_TN_SATURATE;
+// MMA_Traits for the 64x128x32 S32S8U8 SS_TN SATURATE op: int32_t D/C, int8_t A, uint8_t B.
+template <>
+struct MMA_Traits<SM90_64x128x32_S32S8U8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Both A and B use K-major GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_128,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<128, 32>;
+  using CLayout = GMMA::CLayout_64x128;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x192x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x192x32_S32S8U8_SS_TN;
+// MMA_Traits for the 64x192x32 S32S8U8 SS_TN op: int32_t D/C, int8_t A, uint8_t B.
+template <>
+struct MMA_Traits<SM90_64x192x32_S32S8U8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Both A and B use K-major GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_192,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<192, 32>;
+  using CLayout = GMMA::CLayout_64x192;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x192x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x192x32_S32S8U8_SS_TN_SATURATE;
+// MMA_Traits for the 64x192x32 S32S8U8 SS_TN SATURATE op: int32_t D/C, int8_t A, uint8_t B.
+template <>
+struct MMA_Traits<SM90_64x192x32_S32S8U8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Both A and B use K-major GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_192,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<192, 32>;
+  using CLayout = GMMA::CLayout_64x192;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x256x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x256x32_S32S8U8_SS_TN;
+// MMA_Traits for the 64x256x32 S32S8U8 SS_TN op: int32_t D/C, int8_t A, uint8_t B.
+template <>
+struct MMA_Traits<SM90_64x256x32_S32S8U8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Both A and B use K-major GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_256,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<256, 32>;
+  using CLayout = GMMA::CLayout_64x256;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x256x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x256x32_S32S8U8_SS_TN_SATURATE;
+// MMA_Traits for the 64x256x32 S32S8U8 SS_TN SATURATE op: int32_t D/C, int8_t A, uint8_t B.
+template <>
+struct MMA_Traits<SM90_64x256x32_S32S8U8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Both A and B use K-major GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_256,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<256, 32>;
+  using CLayout = GMMA::CLayout_64x256;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x8x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x8x32_S32S8U8_RS_TN;
+// MMA_Traits for the 64x8x32 S32S8U8 RS_TN op: int32_t D/C, int8_t A, uint8_t B.
+template <>
+struct MMA_Traits<SM90_64x8x32_S32S8U8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Only B has a K-major smem_desc fragment; A declares no FrgTypeA and uses ALayout_64x32.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_8,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<  8, 32>;
+  using CLayout = GMMA::CLayout_64x8;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x8x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x8x32_S32S8U8_RS_TN_SATURATE;
+// MMA_Traits for the 64x8x32 S32S8U8 RS_TN SATURATE op: int32_t D/C, int8_t A, uint8_t B.
+template <>
+struct MMA_Traits<SM90_64x8x32_S32S8U8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Only B has a K-major smem_desc fragment; A declares no FrgTypeA and uses ALayout_64x32.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_8,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<  8, 32>;
+  using CLayout = GMMA::CLayout_64x8;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x16x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x16x32_S32S8U8_RS_TN;
+// MMA_Traits for the 64x16x32 S32S8U8 RS_TN op: int32_t D/C, int8_t A, uint8_t B.
+template <>
+struct MMA_Traits<SM90_64x16x32_S32S8U8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Only B has a K-major smem_desc fragment; A declares no FrgTypeA and uses ALayout_64x32.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_16,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 16, 32>;
+  using CLayout = GMMA::CLayout_64x16;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x16x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x16x32_S32S8U8_RS_TN_SATURATE;
+// MMA_Traits for the 64x16x32 S32S8U8 RS_TN SATURATE op: int32_t D/C, int8_t A, uint8_t B.
+template <>
+struct MMA_Traits<SM90_64x16x32_S32S8U8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Only B has a K-major smem_desc fragment; A declares no FrgTypeA and uses ALayout_64x32.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_16,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 16, 32>;
+  using CLayout = GMMA::CLayout_64x16;
+  // Non-static member, default ScaleOut::One. NOTE(review): presumably the accumulate switch - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x32x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x32x32_S32S8U8_RS_TN;
+// MMA_Traits for the 64x32x32 S32S8U8 RS_TN GMMA op: A is int8_t, B is uint8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x32x32_S32S8U8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_32,_32>;  // M=64, N=32, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 32, 32>;
+  using CLayout = GMMA::CLayout_64x32;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x32x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x32x32_S32S8U8_RS_TN_SATURATE;
+// MMA_Traits for the 64x32x32 S32S8U8 RS_TN_SATURATE GMMA op: A is int8_t, B is uint8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x32x32_S32S8U8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_32,_32>;  // M=64, N=32, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 32, 32>;
+  using CLayout = GMMA::CLayout_64x32;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x64x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x64x32_S32S8U8_RS_TN;
+// MMA_Traits for the 64x64x32 S32S8U8 RS_TN GMMA op: A is int8_t, B is uint8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x64x32_S32S8U8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_64,_32>;  // M=64, N=64, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 64, 32>;
+  using CLayout = GMMA::CLayout_64x64;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x64x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x64x32_S32S8U8_RS_TN_SATURATE;
+// MMA_Traits for the 64x64x32 S32S8U8 RS_TN_SATURATE GMMA op: A is int8_t, B is uint8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x64x32_S32S8U8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_64,_32>;  // M=64, N=64, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 64, 32>;
+  using CLayout = GMMA::CLayout_64x64;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x96x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x96x32_S32S8U8_RS_TN;
+// MMA_Traits for the 64x96x32 S32S8U8 RS_TN GMMA op: A is int8_t, B is uint8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x96x32_S32S8U8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_96,_32>;  // M=64, N=96, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 96, 32>;
+  using CLayout = GMMA::CLayout_64x96;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x96x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x96x32_S32S8U8_RS_TN_SATURATE;
+// MMA_Traits for the 64x96x32 S32S8U8 RS_TN_SATURATE GMMA op: A is int8_t, B is uint8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x96x32_S32S8U8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_96,_32>;  // M=64, N=96, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 96, 32>;
+  using CLayout = GMMA::CLayout_64x96;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x128x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x128x32_S32S8U8_RS_TN;
+// MMA_Traits for the 64x128x32 S32S8U8 RS_TN GMMA op: A is int8_t, B is uint8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x128x32_S32S8U8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_128,_32>;  // M=64, N=128, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<128, 32>;
+  using CLayout = GMMA::CLayout_64x128;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x128x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x128x32_S32S8U8_RS_TN_SATURATE;
+// MMA_Traits for the 64x128x32 S32S8U8 RS_TN_SATURATE GMMA op: A is int8_t, B is uint8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x128x32_S32S8U8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_128,_32>;  // M=64, N=128, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<128, 32>;
+  using CLayout = GMMA::CLayout_64x128;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x192x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x192x32_S32S8U8_RS_TN;
+// MMA_Traits for the 64x192x32 S32S8U8 RS_TN GMMA op: A is int8_t, B is uint8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x192x32_S32S8U8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_192,_32>;  // M=64, N=192, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<192, 32>;
+  using CLayout = GMMA::CLayout_64x192;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x192x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x192x32_S32S8U8_RS_TN_SATURATE;
+// MMA_Traits for the 64x192x32 S32S8U8 RS_TN_SATURATE GMMA op: A is int8_t, B is uint8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x192x32_S32S8U8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_192,_32>;  // M=64, N=192, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<192, 32>;
+  using CLayout = GMMA::CLayout_64x192;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x256x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x256x32_S32S8U8_RS_TN;
+// MMA_Traits for the 64x256x32 S32S8U8 RS_TN GMMA op: A is int8_t, B is uint8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x256x32_S32S8U8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_256,_32>;  // M=64, N=256, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<256, 32>;
+  using CLayout = GMMA::CLayout_64x256;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x256x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x256x32_S32S8U8_RS_TN_SATURATE;
+// MMA_Traits for the 64x256x32 S32S8U8 RS_TN_SATURATE GMMA op: A is int8_t, B is uint8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x256x32_S32S8U8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_256,_32>;  // M=64, N=256, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<256, 32>;
+  using CLayout = GMMA::CLayout_64x256;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x8x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x8x32_S32U8S8_SS_TN;
+// MMA_Traits for the 64x8x32 S32U8S8 SS_TN GMMA op: A is uint8_t, B is int8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x8x32_S32U8S8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc types
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_8,_32>;  // M=64, N=8, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<  8, 32>;
+  using CLayout = GMMA::CLayout_64x8;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x8x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x8x32_S32U8S8_SS_TN_SATURATE;
+// MMA_Traits for the 64x8x32 S32U8S8 SS_TN_SATURATE GMMA op: A is uint8_t, B is int8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x8x32_S32U8S8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc types
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_8,_32>;  // M=64, N=8, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<  8, 32>;
+  using CLayout = GMMA::CLayout_64x8;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x16x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x16x32_S32U8S8_SS_TN;
+// MMA_Traits for the 64x16x32 S32U8S8 SS_TN GMMA op: A is uint8_t, B is int8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x16x32_S32U8S8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc types
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_16,_32>;  // M=64, N=16, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 16, 32>;
+  using CLayout = GMMA::CLayout_64x16;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x16x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x16x32_S32U8S8_SS_TN_SATURATE;
+// MMA_Traits for the 64x16x32 S32U8S8 SS_TN_SATURATE GMMA op: A is uint8_t, B is int8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x16x32_S32U8S8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc types
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_16,_32>;  // M=64, N=16, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 16, 32>;
+  using CLayout = GMMA::CLayout_64x16;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x32x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x32x32_S32U8S8_SS_TN;
+// MMA_Traits for the 64x32x32 S32U8S8 SS_TN GMMA op: A is uint8_t, B is int8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x32x32_S32U8S8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc types
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_32,_32>;  // M=64, N=32, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 32, 32>;
+  using CLayout = GMMA::CLayout_64x32;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x32x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x32x32_S32U8S8_SS_TN_SATURATE;
+// MMA_Traits for the 64x32x32 S32U8S8 SS_TN_SATURATE GMMA op: A is uint8_t, B is int8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x32x32_S32U8S8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc types
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_32,_32>;  // M=64, N=32, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 32, 32>;
+  using CLayout = GMMA::CLayout_64x32;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x64x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x64x32_S32U8S8_SS_TN;
+// MMA_Traits for the 64x64x32 S32U8S8 SS_TN GMMA op: A is uint8_t, B is int8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x64x32_S32U8S8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc types
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_64,_32>;  // M=64, N=64, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 64, 32>;
+  using CLayout = GMMA::CLayout_64x64;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x64x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x64x32_S32U8S8_SS_TN_SATURATE;
+// MMA_Traits for the 64x64x32 S32U8S8 SS_TN_SATURATE GMMA op: A is uint8_t, B is int8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x64x32_S32U8S8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc types
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_64,_32>;  // M=64, N=64, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 64, 32>;
+  using CLayout = GMMA::CLayout_64x64;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x96x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x96x32_S32U8S8_SS_TN;
+// MMA_Traits for the 64x96x32 S32U8S8 SS_TN GMMA op: A is uint8_t, B is int8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x96x32_S32U8S8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc types
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_96,_32>;  // M=64, N=96, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 96, 32>;
+  using CLayout = GMMA::CLayout_64x96;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x96x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x96x32_S32U8S8_SS_TN_SATURATE;
+// MMA_Traits for the 64x96x32 S32U8S8 SS_TN_SATURATE GMMA op: A is uint8_t, B is int8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x96x32_S32U8S8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc types
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_96,_32>;  // M=64, N=96, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 96, 32>;
+  using CLayout = GMMA::CLayout_64x96;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x128x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x128x32_S32U8S8_SS_TN;
+// MMA_Traits for the 64x128x32 S32U8S8 SS_TN GMMA op: A is uint8_t, B is int8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x128x32_S32U8S8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc types
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_128,_32>;  // M=64, N=128, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<128, 32>;
+  using CLayout = GMMA::CLayout_64x128;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x128x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x128x32_S32U8S8_SS_TN_SATURATE;
+// MMA_Traits for the 64x128x32 S32U8S8 SS_TN_SATURATE GMMA op: A is uint8_t, B is int8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x128x32_S32U8S8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc types
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_128,_32>;  // M=64, N=128, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<128, 32>;
+  using CLayout = GMMA::CLayout_64x128;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x192x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x192x32_S32U8S8_SS_TN;
+// MMA_Traits for the 64x192x32 S32U8S8 SS_TN GMMA op: A is uint8_t, B is int8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x192x32_S32U8S8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc types
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_192,_32>;  // M=64, N=192, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<192, 32>;
+  using CLayout = GMMA::CLayout_64x192;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x192x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x192x32_S32U8S8_SS_TN_SATURATE;
+// MMA_Traits for the 64x192x32 S32U8S8 SS_TN_SATURATE GMMA op: A is uint8_t, B is int8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x192x32_S32U8S8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc types
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_192,_32>;  // M=64, N=192, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<192, 32>;
+  using CLayout = GMMA::CLayout_64x192;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x256x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x256x32_S32U8S8_SS_TN;
+// MMA_Traits for the 64x256x32 S32U8S8 SS_TN GMMA op: A is uint8_t, B is int8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x256x32_S32U8S8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc types
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_256,_32>;  // M=64, N=256, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<256, 32>;
+  using CLayout = GMMA::CLayout_64x256;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x256x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x256x32_S32U8S8_SS_TN_SATURATE;
+// MMA_Traits for the 64x256x32 S32U8S8 SS_TN_SATURATE GMMA op: A is uint8_t, B is int8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x256x32_S32U8S8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc types
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_256,_32>;  // M=64, N=256, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<256, 32>;
+  using CLayout = GMMA::CLayout_64x256;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x8x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x8x32_S32U8S8_RS_TN;
+// MMA_Traits for the 64x8x32 S32U8S8 RS_TN GMMA op: A is uint8_t, B is int8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x8x32_S32U8S8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_8,_32>;  // M=64, N=8, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<  8, 32>;
+  using CLayout = GMMA::CLayout_64x8;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x8x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x8x32_S32U8S8_RS_TN_SATURATE;
+// MMA_Traits for the 64x8x32 S32U8S8 RS_TN_SATURATE GMMA op: A is uint8_t, B is int8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x8x32_S32U8S8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_8,_32>;  // M=64, N=8, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<  8, 32>;
+  using CLayout = GMMA::CLayout_64x8;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x16x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x16x32_S32U8S8_RS_TN;
+// MMA_Traits for the 64x16x32 S32U8S8 RS_TN GMMA op: A is uint8_t, B is int8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x16x32_S32U8S8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_16,_32>;  // M=64, N=16, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 16, 32>;
+  using CLayout = GMMA::CLayout_64x16;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x16x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x16x32_S32U8S8_RS_TN_SATURATE;
+// MMA_Traits for the 64x16x32 S32U8S8 RS_TN_SATURATE GMMA op: A is uint8_t, B is int8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x16x32_S32U8S8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_16,_32>;  // M=64, N=16, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 16, 32>;
+  using CLayout = GMMA::CLayout_64x16;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x32x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x32x32_S32U8S8_RS_TN;
+// MMA_Traits for the 64x32x32 S32U8S8 RS_TN GMMA op: A is uint8_t, B is int8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x32x32_S32U8S8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_32,_32>;  // M=64, N=32, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 32, 32>;
+  using CLayout = GMMA::CLayout_64x32;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x32x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x32x32_S32U8S8_RS_TN_SATURATE;
+// MMA_Traits for the 64x32x32 S32U8S8 RS_TN_SATURATE GMMA op: A is uint8_t, B is int8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x32x32_S32U8S8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_32,_32>;  // M=64, N=32, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 32, 32>;
+  using CLayout = GMMA::CLayout_64x32;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x64x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x64x32_S32U8S8_RS_TN;
+// MMA_Traits for the 64x64x32 S32U8S8 RS_TN GMMA op: A is uint8_t, B is int8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x64x32_S32U8S8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_64,_32>;  // M=64, N=64, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 64, 32>;
+  using CLayout = GMMA::CLayout_64x64;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x64x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x64x32_S32U8S8_RS_TN_SATURATE;
+// MMA_Traits for the 64x64x32 S32U8S8 RS_TN_SATURATE GMMA op: A is uint8_t, B is int8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x64x32_S32U8S8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_64,_32>;  // M=64, N=64, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 64, 32>;
+  using CLayout = GMMA::CLayout_64x64;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x96x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x96x32_S32U8S8_RS_TN;
+// MMA_Traits for the 64x96x32 S32U8S8 RS_TN GMMA op: A is uint8_t, B is int8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x96x32_S32U8S8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_96,_32>;  // M=64, N=96, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 96, 32>;
+  using CLayout = GMMA::CLayout_64x96;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x96x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x96x32_S32U8S8_RS_TN_SATURATE;
+// MMA_Traits for the 64x96x32 S32U8S8 RS_TN_SATURATE GMMA op: A is uint8_t, B is int8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x96x32_S32U8S8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_96,_32>;  // M=64, N=96, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 96, 32>;
+  using CLayout = GMMA::CLayout_64x96;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x128x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x128x32_S32U8S8_RS_TN;
+// MMA_Traits for the 64x128x32 S32U8S8 RS_TN GMMA op: A is uint8_t, B is int8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x128x32_S32U8S8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_128,_32>;  // M=64, N=128, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<128, 32>;
+  using CLayout = GMMA::CLayout_64x128;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x128x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x128x32_S32U8S8_RS_TN_SATURATE;
+// MMA_Traits for the 64x128x32 S32U8S8 RS_TN_SATURATE GMMA op: A is uint8_t, B is int8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x128x32_S32U8S8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_128,_32>;  // M=64, N=128, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<128, 32>;
+  using CLayout = GMMA::CLayout_64x128;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x192x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x192x32_S32U8S8_RS_TN;
+// MMA_Traits for the 64x192x32 S32U8S8 RS_TN GMMA op: A is uint8_t, B is int8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x192x32_S32U8S8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_192,_32>;  // M=64, N=192, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<192, 32>;
+  using CLayout = GMMA::CLayout_64x192;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x192x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x192x32_S32U8S8_RS_TN_SATURATE;
+// MMA_Traits for the 64x192x32 S32U8S8 RS_TN_SATURATE GMMA op: A is uint8_t, B is int8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x192x32_S32U8S8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_192,_32>;  // M=64, N=192, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<192, 32>;
+  using CLayout = GMMA::CLayout_64x192;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x256x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x256x32_S32U8S8_RS_TN;
+// MMA_Traits for the 64x256x32 S32U8S8 RS_TN GMMA op: A is uint8_t, B is int8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x256x32_S32U8S8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_256,_32>;  // M=64, N=256, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<256, 32>;
+  using CLayout = GMMA::CLayout_64x256;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x256x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x256x32_S32U8S8_RS_TN_SATURATE;
+// MMA_Traits for the 64x256x32 S32U8S8 RS_TN_SATURATE GMMA op: A is uint8_t, B is int8_t, C and D are int32_t.
+template <>
+struct MMA_Traits<SM90_64x256x32_S32U8S8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_256,_32>;  // M=64, N=256, K=32
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<256, 32>;
+  using CLayout = GMMA::CLayout_64x256;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial accumulate mode is ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x8x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x8x32_S32U8U8_SS_TN;  // alias of the SM90 GMMA op type
+
+template <>
+struct MMA_Traits<SM90_64x8x32_S32U8U8_SS_TN>  // traits for the 64x8x32 op: uint8_t A, uint8_t B, int32_t C/D
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for A: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_8,_32>;  // M=64, N=8, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout<  8, 32>;  // 8x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x8;  // 64x8 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x8x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x8x32_S32U8U8_SS_TN_SATURATE;  // alias of the SM90 GMMA op type
+
+template <>
+struct MMA_Traits<SM90_64x8x32_S32U8U8_SS_TN_SATURATE>  // traits for the 64x8x32 op: uint8_t A, uint8_t B, int32_t C/D
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for A: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_8,_32>;  // M=64, N=8, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout<  8, 32>;  // 8x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x8;  // 64x8 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x16x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x16x32_S32U8U8_SS_TN;  // alias of the SM90 GMMA op type
+
+template <>
+struct MMA_Traits<SM90_64x16x32_S32U8U8_SS_TN>  // traits for the 64x16x32 op: uint8_t A, uint8_t B, int32_t C/D
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for A: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_16,_32>;  // M=64, N=16, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout< 16, 32>;  // 16x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x16;  // 64x16 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x16x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x16x32_S32U8U8_SS_TN_SATURATE;  // alias of the SM90 GMMA op type
+
+template <>
+struct MMA_Traits<SM90_64x16x32_S32U8U8_SS_TN_SATURATE>  // traits for the 64x16x32 op: uint8_t A, uint8_t B, int32_t C/D
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for A: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_16,_32>;  // M=64, N=16, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout< 16, 32>;  // 16x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x16;  // 64x16 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x32x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x32x32_S32U8U8_SS_TN;  // alias of the SM90 GMMA op type
+
+template <>
+struct MMA_Traits<SM90_64x32x32_S32U8U8_SS_TN>  // traits for the 64x32x32 op: uint8_t A, uint8_t B, int32_t C/D
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for A: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_32,_32>;  // M=64, N=32, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout< 32, 32>;  // 32x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x32;  // 64x32 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x32x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x32x32_S32U8U8_SS_TN_SATURATE;  // alias of the SM90 GMMA op type
+
+template <>
+struct MMA_Traits<SM90_64x32x32_S32U8U8_SS_TN_SATURATE>  // traits for the 64x32x32 op: uint8_t A, uint8_t B, int32_t C/D
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for A: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_32,_32>;  // M=64, N=32, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout< 32, 32>;  // 32x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x32;  // 64x32 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x64x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x64x32_S32U8U8_SS_TN;  // alias of the SM90 GMMA op type
+
+template <>
+struct MMA_Traits<SM90_64x64x32_S32U8U8_SS_TN>  // traits for the 64x64x32 op: uint8_t A, uint8_t B, int32_t C/D
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for A: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_64,_32>;  // M=64, N=64, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout< 64, 32>;  // 64x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x64;  // 64x64 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x64x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x64x32_S32U8U8_SS_TN_SATURATE;  // alias of the SM90 GMMA op type
+
+template <>
+struct MMA_Traits<SM90_64x64x32_S32U8U8_SS_TN_SATURATE>  // traits for the 64x64x32 op: uint8_t A, uint8_t B, int32_t C/D
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for A: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_64,_32>;  // M=64, N=64, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout< 64, 32>;  // 64x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x64;  // 64x64 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x96x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x96x32_S32U8U8_SS_TN;  // alias of the SM90 GMMA op type
+
+template <>
+struct MMA_Traits<SM90_64x96x32_S32U8U8_SS_TN>  // traits for the 64x96x32 op: uint8_t A, uint8_t B, int32_t C/D
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for A: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_96,_32>;  // M=64, N=96, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout< 96, 32>;  // 96x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x96;  // 64x96 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x96x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x96x32_S32U8U8_SS_TN_SATURATE;  // alias of the SM90 GMMA op type
+
+template <>
+struct MMA_Traits<SM90_64x96x32_S32U8U8_SS_TN_SATURATE>  // traits for the 64x96x32 op: uint8_t A, uint8_t B, int32_t C/D
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for A: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_96,_32>;  // M=64, N=96, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout< 96, 32>;  // 96x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x96;  // 64x96 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x128x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x128x32_S32U8U8_SS_TN;  // alias of the SM90 GMMA op type
+
+template <>
+struct MMA_Traits<SM90_64x128x32_S32U8U8_SS_TN>  // traits for the 64x128x32 op: uint8_t A, uint8_t B, int32_t C/D
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for A: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_128,_32>;  // M=64, N=128, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout<128, 32>;  // 128x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x128;  // 64x128 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x128x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x128x32_S32U8U8_SS_TN_SATURATE;  // alias of the SM90 GMMA op type
+
+template <>
+struct MMA_Traits<SM90_64x128x32_S32U8U8_SS_TN_SATURATE>  // traits for the 64x128x32 op: uint8_t A, uint8_t B, int32_t C/D
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for A: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_128,_32>;  // M=64, N=128, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout<128, 32>;  // 128x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x128;  // 64x128 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x192x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x192x32_S32U8U8_SS_TN;  // alias of the SM90 GMMA op type
+
+template <>
+struct MMA_Traits<SM90_64x192x32_S32U8U8_SS_TN>  // traits for the 64x192x32 op: uint8_t A, uint8_t B, int32_t C/D
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for A: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_192,_32>;  // M=64, N=192, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout<192, 32>;  // 192x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x192;  // 64x192 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x192x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x192x32_S32U8U8_SS_TN_SATURATE;  // alias of the SM90 GMMA op type
+
+template <>
+struct MMA_Traits<SM90_64x192x32_S32U8U8_SS_TN_SATURATE>  // traits for the 64x192x32 op: uint8_t A, uint8_t B, int32_t C/D
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for A: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_192,_32>;  // M=64, N=192, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout<192, 32>;  // 192x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x192;  // 64x192 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x256x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x256x32_S32U8U8_SS_TN;  // alias of the SM90 GMMA op type
+
+template <>
+struct MMA_Traits<SM90_64x256x32_S32U8U8_SS_TN>  // traits for the 64x256x32 op: uint8_t A, uint8_t B, int32_t C/D
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for A: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_256,_32>;  // M=64, N=256, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout<256, 32>;  // 256x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x256;  // 64x256 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x256x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x256x32_S32U8U8_SS_TN_SATURATE;  // alias of the SM90 GMMA op type
+
+template <>
+struct MMA_Traits<SM90_64x256x32_S32U8U8_SS_TN_SATURATE>  // traits for the 64x256x32 op: uint8_t A, uint8_t B, int32_t C/D
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for A: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_256,_32>;  // M=64, N=256, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout<256, 32>;  // 256x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x256;  // 64x256 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x8x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x8x32_S32U8U8_RS_TN;  // alias of the SM90 GMMA op type
+
+template <>
+struct MMA_Traits<SM90_64x8x32_S32U8U8_RS_TN>  // traits for the 64x8x32 op: uint8_t A, uint8_t B, int32_t C/D
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor (no FrgTypeA is declared here)
+
+  using Shape_MNK = Shape<_64,_8,_32>;  // M=64, N=8, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout<  8, 32>;  // 8x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x8;  // 64x8 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x8x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x8x32_S32U8U8_RS_TN_SATURATE;  // alias of the SM90 GMMA op type
+
+template <>
+struct MMA_Traits<SM90_64x8x32_S32U8U8_RS_TN_SATURATE>  // traits for the 64x8x32 op: uint8_t A, uint8_t B, int32_t C/D
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor (no FrgTypeA is declared here)
+
+  using Shape_MNK = Shape<_64,_8,_32>;  // M=64, N=8, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout<  8, 32>;  // 8x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x8;  // 64x8 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x16x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x16x32_S32U8U8_RS_TN;  // alias of the SM90 GMMA op type
+
+template <>
+struct MMA_Traits<SM90_64x16x32_S32U8U8_RS_TN>  // traits for the 64x16x32 op: uint8_t A, uint8_t B, int32_t C/D
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor (no FrgTypeA is declared here)
+
+  using Shape_MNK = Shape<_64,_16,_32>;  // M=64, N=16, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout< 16, 32>;  // 16x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x16;  // 64x16 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x16x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x16x32_S32U8U8_RS_TN_SATURATE;  // alias of the SM90 GMMA op type
+
+template <>
+struct MMA_Traits<SM90_64x16x32_S32U8U8_RS_TN_SATURATE>  // traits for the 64x16x32 op: uint8_t A, uint8_t B, int32_t C/D
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor (no FrgTypeA is declared here)
+
+  using Shape_MNK = Shape<_64,_16,_32>;  // M=64, N=16, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout< 16, 32>;  // 16x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x16;  // 64x16 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x32x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x32x32_S32U8U8_RS_TN;  // alias of the SM90 GMMA op type
+
+template <>
+struct MMA_Traits<SM90_64x32x32_S32U8U8_RS_TN>  // traits for the 64x32x32 op: uint8_t A, uint8_t B, int32_t C/D
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor (no FrgTypeA is declared here)
+
+  using Shape_MNK = Shape<_64,_32,_32>;  // M=64, N=32, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout< 32, 32>;  // 32x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x32;  // 64x32 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x32x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x32x32_S32U8U8_RS_TN_SATURATE;  // alias of the SM90 GMMA op type
+
+template <>
+struct MMA_Traits<SM90_64x32x32_S32U8U8_RS_TN_SATURATE>  // traits for the 64x32x32 op: uint8_t A, uint8_t B, int32_t C/D
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor (no FrgTypeA is declared here)
+
+  using Shape_MNK = Shape<_64,_32,_32>;  // M=64, N=32, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout< 32, 32>;  // 32x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x32;  // 64x32 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x64x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x64x32_S32U8U8_RS_TN;  // alias of the SM90 GMMA op type
+
+template <>
+struct MMA_Traits<SM90_64x64x32_S32U8U8_RS_TN>  // traits for the 64x64x32 op: uint8_t A, uint8_t B, int32_t C/D
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor (no FrgTypeA is declared here)
+
+  using Shape_MNK = Shape<_64,_64,_32>;  // M=64, N=64, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout< 64, 32>;  // 64x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x64;  // 64x64 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x64x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x64x32_S32U8U8_RS_TN_SATURATE;  // alias of the SM90 GMMA op type
+
+template <>
+struct MMA_Traits<SM90_64x64x32_S32U8U8_RS_TN_SATURATE>  // traits for the 64x64x32 op: uint8_t A, uint8_t B, int32_t C/D
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor (no FrgTypeA is declared here)
+
+  using Shape_MNK = Shape<_64,_64,_32>;  // M=64, N=64, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout< 64, 32>;  // 64x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x64;  // 64x64 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x96x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x96x32_S32U8U8_RS_TN;  // alias of the SM90 GMMA op type
+
+template <>
+struct MMA_Traits<SM90_64x96x32_S32U8U8_RS_TN>  // traits for the 64x96x32 op: uint8_t A, uint8_t B, int32_t C/D
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor (no FrgTypeA is declared here)
+
+  using Shape_MNK = Shape<_64,_96,_32>;  // M=64, N=96, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout< 96, 32>;  // 96x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x96;  // 64x96 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x96x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x96x32_S32U8U8_RS_TN_SATURATE;  // alias of the SM90 GMMA op type
+
+template <>
+struct MMA_Traits<SM90_64x96x32_S32U8U8_RS_TN_SATURATE>  // traits for the 64x96x32 op: uint8_t A, uint8_t B, int32_t C/D
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor (no FrgTypeA is declared here)
+
+  using Shape_MNK = Shape<_64,_96,_32>;  // M=64, N=96, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout< 96, 32>;  // 96x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x96;  // 64x96 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x128x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x128x32_S32U8U8_RS_TN;  // alias of the SM90 GMMA op type
+
+template <>
+struct MMA_Traits<SM90_64x128x32_S32U8U8_RS_TN>  // traits for the 64x128x32 op: uint8_t A, uint8_t B, int32_t C/D
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor (no FrgTypeA is declared here)
+
+  using Shape_MNK = Shape<_64,_128,_32>;  // M=64, N=128, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout<128, 32>;  // 128x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x128;  // 64x128 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x128x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x128x32_S32U8U8_RS_TN_SATURATE;  // alias of the SM90 GMMA op type
+
+template <>
+struct MMA_Traits<SM90_64x128x32_S32U8U8_RS_TN_SATURATE>  // traits for the 64x128x32 op: uint8_t A, uint8_t B, int32_t C/D
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor (no FrgTypeA is declared here)
+
+  using Shape_MNK = Shape<_64,_128,_32>;  // M=64, N=128, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout<128, 32>;  // 128x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x128;  // 64x128 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x192x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x192x32_S32U8U8_RS_TN;  // alias of the SM90 GMMA op type
+
+template <>
+struct MMA_Traits<SM90_64x192x32_S32U8U8_RS_TN>  // traits for the 64x192x32 op: uint8_t A, uint8_t B, int32_t C/D
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor (no FrgTypeA is declared here)
+
+  using Shape_MNK = Shape<_64,_192,_32>;  // M=64, N=192, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout<192, 32>;  // 192x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x192;  // 64x192 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x192x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x192x32_S32U8U8_RS_TN_SATURATE;  // alias of the SM90 GMMA op type
+
+template <>
+struct MMA_Traits<SM90_64x192x32_S32U8U8_RS_TN_SATURATE>  // traits for the 64x192x32 op: uint8_t A, uint8_t B, int32_t C/D
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor (no FrgTypeA is declared here)
+
+  using Shape_MNK = Shape<_64,_192,_32>;  // M=64, N=192, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout<192, 32>;  // 192x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x192;  // 64x192 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x256x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x256x32_S32U8U8_RS_TN;  // alias of the SM90 GMMA op type
+
+template <>
+struct MMA_Traits<SM90_64x256x32_S32U8U8_RS_TN>  // traits for the 64x256x32 op: uint8_t A, uint8_t B, int32_t C/D
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor (no FrgTypeA is declared here)
+
+  using Shape_MNK = Shape<_64,_256,_32>;  // M=64, N=256, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout<256, 32>;  // 256x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x256;  // 64x256 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x256x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x256x32_S32U8U8_RS_TN_SATURATE;  // alias of the SM90 GMMA op type
+
+template <>
+struct MMA_Traits<SM90_64x256x32_S32U8U8_RS_TN_SATURATE>  // traits for the 64x256x32 op: uint8_t A, uint8_t B, int32_t C/D
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor (no FrgTypeA is declared here)
+
+  using Shape_MNK = Shape<_64,_256,_32>;  // M=64, N=256, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout<256, 32>;  // 256x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x256;  // 64x256 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template for the SM90 GMMA op; both ScaleIn parameters default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x8x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x8x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>  // partial specialization: matches every scaleA/scaleB combination
+struct MMA_Traits<SM90_64x8x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>  // traits for the 64x8x32 op: float_e4m3_t A/B, half_t C/D
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for A: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_8,_32>;  // M=64, N=8, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout<  8, 32>;  // 8x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x8;  // 64x8 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template for the SM90 GMMA op; both ScaleIn parameters default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x8x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x8x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>  // partial specialization: matches every scaleA/scaleB combination
+struct MMA_Traits<SM90_64x8x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>  // traits for the 64x8x32 op: float_e4m3_t A/B, half_t C/D
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor (no FrgTypeA is declared here)
+
+  using Shape_MNK = Shape<_64,_8,_32>;  // M=64, N=8, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout<  8, 32>;  // 8x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x8;  // 64x8 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template for the SM90 GMMA op; both ScaleIn parameters default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x8x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x8x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>  // partial specialization: matches every scaleA/scaleB combination
+struct MMA_Traits<SM90_64x8x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>  // traits for the 64x8x32 op: float_e4m3_t A/B, float C/D
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for A: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_8,_32>;  // M=64, N=8, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout<  8, 32>;  // 8x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x8;  // 64x8 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template for the SM90 GMMA op; both ScaleIn parameters default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x8x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x8x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>  // partial specialization: matches every scaleA/scaleB combination
+struct MMA_Traits<SM90_64x8x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>  // traits for the 64x8x32 op: float_e4m3_t A/B, float C/D
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor (no FrgTypeA is declared here)
+
+  using Shape_MNK = Shape<_64,_8,_32>;  // M=64, N=8, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout<  8, 32>;  // 8x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x8;  // 64x8 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template for the SM90 GMMA op; both ScaleIn parameters default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x16x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x16x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>  // partial specialization: matches every scaleA/scaleB combination
+struct MMA_Traits<SM90_64x16x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>  // traits for the 64x16x32 op: float_e4m3_t A/B, half_t C/D
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for A: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_16,_32>;  // M=64, N=16, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout< 16, 32>;  // 16x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x16;  // 64x16 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template for the SM90 GMMA op; both ScaleIn parameters default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x16x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x16x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>  // partial specialization: matches every scaleA/scaleB combination
+struct MMA_Traits<SM90_64x16x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>  // traits for the 64x16x32 op: float_e4m3_t A/B, half_t C/D
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor (no FrgTypeA is declared here)
+
+  using Shape_MNK = Shape<_64,_16,_32>;  // M=64, N=16, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout< 16, 32>;  // 16x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x16;  // 64x16 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template for the SM90 GMMA op; both ScaleIn parameters default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x16x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x16x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>  // partial specialization: matches every scaleA/scaleB combination
+struct MMA_Traits<SM90_64x16x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>  // traits for the 64x16x32 op: float_e4m3_t A/B, float C/D
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for A: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_16,_32>;  // M=64, N=16, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout< 16, 32>;  // 16x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x16;  // 64x16 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template for the SM90 GMMA op; both ScaleIn parameters default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x16x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x16x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>  // partial specialization: matches every scaleA/scaleB combination
+struct MMA_Traits<SM90_64x16x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>  // traits for the 64x16x32 op: float_e4m3_t A/B, float C/D
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor (no FrgTypeA is declared here)
+
+  using Shape_MNK = Shape<_64,_16,_32>;  // M=64, N=16, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout< 16, 32>;  // 16x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x16;  // 64x16 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template for the SM90 GMMA op; both ScaleIn parameters default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x32x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x32x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>  // partial specialization: matches every scaleA/scaleB combination
+struct MMA_Traits<SM90_64x32x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>  // traits for the 64x32x32 op: float_e4m3_t A/B, half_t C/D
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for A: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_32,_32>;  // M=64, N=32, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout< 32, 32>;  // 32x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x32;  // 64x32 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template for the SM90 GMMA op; both ScaleIn parameters default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x32x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x32x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>  // partial specialization: matches every scaleA/scaleB combination
+struct MMA_Traits<SM90_64x32x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>  // traits for the 64x32x32 op: float_e4m3_t A/B, half_t C/D
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // fragment type for B: K-major smem descriptor (no FrgTypeA is declared here)
+
+  using Shape_MNK = Shape<_64,_32,_32>;  // M=64, N=32, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;  // 64x32 (M x K) A layout
+  using BLayout = GMMA::ABLayout< 32, 32>;  // 32x32 (N x K) B layout
+  using CLayout = GMMA::CLayout_64x32;  // 64x32 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x32x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x32x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x32x32 atom aliased above (scaleA/scaleB default to ScaleIn::One): float C/D, float_e4m3_t A and B.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x32x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // Both A and B fragments are K-major GMMA::smem_desc (NOTE(review): presumably the "SS" in the name - confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK and the A (64x32), B (32x32) and C (64x32) layout extents match the 64x32x32 tile in the atom name.
+  using Shape_MNK = Shape<_64,_32,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 32, 32>;
+  using CLayout = GMMA::CLayout_64x32;
+  // Non-static member, initialized to ScaleOut::One (NOTE(review): presumably toggles accumulation into C - confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x32x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x32x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x32x32 atom aliased above (scaleA/scaleB default to ScaleIn::One): float C/D, float_e4m3_t A and B.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x32x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // Only B has a smem_desc fragment type; no FrgTypeA is declared (NOTE(review): presumably the "RS" in the name - confirm).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK is 64x32x32 as in the atom name; A uses GMMA::ALayout_64x32 rather than the ABLayout used for B.
+  using Shape_MNK = Shape<_64,_32,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 32, 32>;
+  using CLayout = GMMA::CLayout_64x32;
+  // Non-static member, initialized to ScaleOut::One (NOTE(review): presumably toggles accumulation into C - confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x64x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x64x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x64x32 atom aliased above (scaleA/scaleB default to ScaleIn::One): half_t C/D, float_e4m3_t A and B.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x64x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // Both A and B fragments are K-major GMMA::smem_desc (NOTE(review): presumably the "SS" in the name - confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK and the A (64x32), B (64x32) and C (64x64) layout extents match the 64x64x32 tile in the atom name.
+  using Shape_MNK = Shape<_64,_64,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 64, 32>;
+  using CLayout = GMMA::CLayout_64x64;
+  // Non-static member, initialized to ScaleOut::One (NOTE(review): presumably toggles accumulation into C - confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x64x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x64x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x64x32 atom aliased above (scaleA/scaleB default to ScaleIn::One): half_t C/D, float_e4m3_t A and B.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x64x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // Only B has a smem_desc fragment type; no FrgTypeA is declared (NOTE(review): presumably the "RS" in the name - confirm).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK is 64x64x32 as in the atom name; A uses GMMA::ALayout_64x32 rather than the ABLayout used for B.
+  using Shape_MNK = Shape<_64,_64,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 64, 32>;
+  using CLayout = GMMA::CLayout_64x64;
+  // Non-static member, initialized to ScaleOut::One (NOTE(review): presumably toggles accumulation into C - confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x64x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x64x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x64x32 atom aliased above (scaleA/scaleB default to ScaleIn::One): float C/D, float_e4m3_t A and B.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x64x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // Both A and B fragments are K-major GMMA::smem_desc (NOTE(review): presumably the "SS" in the name - confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK and the A (64x32), B (64x32) and C (64x64) layout extents match the 64x64x32 tile in the atom name.
+  using Shape_MNK = Shape<_64,_64,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 64, 32>;
+  using CLayout = GMMA::CLayout_64x64;
+  // Non-static member, initialized to ScaleOut::One (NOTE(review): presumably toggles accumulation into C - confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x64x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x64x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x64x32 atom aliased above (scaleA/scaleB default to ScaleIn::One): float C/D, float_e4m3_t A and B.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x64x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // Only B has a smem_desc fragment type; no FrgTypeA is declared (NOTE(review): presumably the "RS" in the name - confirm).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK is 64x64x32 as in the atom name; A uses GMMA::ALayout_64x32 rather than the ABLayout used for B.
+  using Shape_MNK = Shape<_64,_64,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 64, 32>;
+  using CLayout = GMMA::CLayout_64x64;
+  // Non-static member, initialized to ScaleOut::One (NOTE(review): presumably toggles accumulation into C - confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x96x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x96x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x96x32 atom aliased above (scaleA/scaleB default to ScaleIn::One): half_t C/D, float_e4m3_t A and B.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x96x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // Both A and B fragments are K-major GMMA::smem_desc (NOTE(review): presumably the "SS" in the name - confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK and the A (64x32), B (96x32) and C (64x96) layout extents match the 64x96x32 tile in the atom name.
+  using Shape_MNK = Shape<_64,_96,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 96, 32>;
+  using CLayout = GMMA::CLayout_64x96;
+  // Non-static member, initialized to ScaleOut::One (NOTE(review): presumably toggles accumulation into C - confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x96x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x96x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x96x32 atom aliased above (scaleA/scaleB default to ScaleIn::One): half_t C/D, float_e4m3_t A and B.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x96x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // Only B has a smem_desc fragment type; no FrgTypeA is declared (NOTE(review): presumably the "RS" in the name - confirm).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK is 64x96x32 as in the atom name; A uses GMMA::ALayout_64x32 rather than the ABLayout used for B.
+  using Shape_MNK = Shape<_64,_96,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 96, 32>;
+  using CLayout = GMMA::CLayout_64x96;
+  // Non-static member, initialized to ScaleOut::One (NOTE(review): presumably toggles accumulation into C - confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x96x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x96x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x96x32 atom aliased above (scaleA/scaleB default to ScaleIn::One): float C/D, float_e4m3_t A and B.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x96x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // Both A and B fragments are K-major GMMA::smem_desc (NOTE(review): presumably the "SS" in the name - confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK and the A (64x32), B (96x32) and C (64x96) layout extents match the 64x96x32 tile in the atom name.
+  using Shape_MNK = Shape<_64,_96,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 96, 32>;
+  using CLayout = GMMA::CLayout_64x96;
+  // Non-static member, initialized to ScaleOut::One (NOTE(review): presumably toggles accumulation into C - confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x96x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x96x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x96x32 atom aliased above (scaleA/scaleB default to ScaleIn::One): float C/D, float_e4m3_t A and B.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x96x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // Only B has a smem_desc fragment type; no FrgTypeA is declared (NOTE(review): presumably the "RS" in the name - confirm).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK is 64x96x32 as in the atom name; A uses GMMA::ALayout_64x32 rather than the ABLayout used for B.
+  using Shape_MNK = Shape<_64,_96,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 96, 32>;
+  using CLayout = GMMA::CLayout_64x96;
+  // Non-static member, initialized to ScaleOut::One (NOTE(review): presumably toggles accumulation into C - confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x128x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x128x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x128x32 atom aliased above (scaleA/scaleB default to ScaleIn::One): half_t C/D, float_e4m3_t A and B.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x128x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // Both A and B fragments are K-major GMMA::smem_desc (NOTE(review): presumably the "SS" in the name - confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK and the A (64x32), B (128x32) and C (64x128) layout extents match the 64x128x32 tile in the atom name.
+  using Shape_MNK = Shape<_64,_128,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<128, 32>;
+  using CLayout = GMMA::CLayout_64x128;
+  // Non-static member, initialized to ScaleOut::One (NOTE(review): presumably toggles accumulation into C - confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x128x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x128x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x128x32 atom aliased above (scaleA/scaleB default to ScaleIn::One): half_t C/D, float_e4m3_t A and B.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x128x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // Only B has a smem_desc fragment type; no FrgTypeA is declared (NOTE(review): presumably the "RS" in the name - confirm).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK is 64x128x32 as in the atom name; A uses GMMA::ALayout_64x32 rather than the ABLayout used for B.
+  using Shape_MNK = Shape<_64,_128,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<128, 32>;
+  using CLayout = GMMA::CLayout_64x128;
+  // Non-static member, initialized to ScaleOut::One (NOTE(review): presumably toggles accumulation into C - confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x128x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x128x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x128x32 atom aliased above (scaleA/scaleB default to ScaleIn::One): float C/D, float_e4m3_t A and B.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x128x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // Both A and B fragments are K-major GMMA::smem_desc (NOTE(review): presumably the "SS" in the name - confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK and the A (64x32), B (128x32) and C (64x128) layout extents match the 64x128x32 tile in the atom name.
+  using Shape_MNK = Shape<_64,_128,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<128, 32>;
+  using CLayout = GMMA::CLayout_64x128;
+  // Non-static member, initialized to ScaleOut::One (NOTE(review): presumably toggles accumulation into C - confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x128x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x128x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x128x32 atom aliased above (scaleA/scaleB default to ScaleIn::One): float C/D, float_e4m3_t A and B.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x128x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // Only B has a smem_desc fragment type; no FrgTypeA is declared (NOTE(review): presumably the "RS" in the name - confirm).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK is 64x128x32 as in the atom name; A uses GMMA::ALayout_64x32 rather than the ABLayout used for B.
+  using Shape_MNK = Shape<_64,_128,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<128, 32>;
+  using CLayout = GMMA::CLayout_64x128;
+  // Non-static member, initialized to ScaleOut::One (NOTE(review): presumably toggles accumulation into C - confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x192x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x192x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x192x32 atom aliased above (scaleA/scaleB default to ScaleIn::One): half_t C/D, float_e4m3_t A and B.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x192x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // Both A and B fragments are K-major GMMA::smem_desc (NOTE(review): presumably the "SS" in the name - confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK and the A (64x32), B (192x32) and C (64x192) layout extents match the 64x192x32 tile in the atom name.
+  using Shape_MNK = Shape<_64,_192,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<192, 32>;
+  using CLayout = GMMA::CLayout_64x192;
+  // Non-static member, initialized to ScaleOut::One (NOTE(review): presumably toggles accumulation into C - confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x192x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x192x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x192x32 atom aliased above (scaleA/scaleB default to ScaleIn::One): half_t C/D, float_e4m3_t A and B.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x192x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // Only B has a smem_desc fragment type; no FrgTypeA is declared (NOTE(review): presumably the "RS" in the name - confirm).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK is 64x192x32 as in the atom name; A uses GMMA::ALayout_64x32 rather than the ABLayout used for B.
+  using Shape_MNK = Shape<_64,_192,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<192, 32>;
+  using CLayout = GMMA::CLayout_64x192;
+  // Non-static member, initialized to ScaleOut::One (NOTE(review): presumably toggles accumulation into C - confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x192x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x192x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x192x32 atom aliased above (scaleA/scaleB default to ScaleIn::One): float C/D, float_e4m3_t A and B.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x192x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // Both A and B fragments are K-major GMMA::smem_desc (NOTE(review): presumably the "SS" in the name - confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK and the A (64x32), B (192x32) and C (64x192) layout extents match the 64x192x32 tile in the atom name.
+  using Shape_MNK = Shape<_64,_192,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<192, 32>;
+  using CLayout = GMMA::CLayout_64x192;
+  // Non-static member, initialized to ScaleOut::One (NOTE(review): presumably toggles accumulation into C - confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x192x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x192x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x192x32 atom aliased above (scaleA/scaleB default to ScaleIn::One): float C/D, float_e4m3_t A and B.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x192x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // Only B has a smem_desc fragment type; no FrgTypeA is declared (NOTE(review): presumably the "RS" in the name - confirm).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK is 64x192x32 as in the atom name; A uses GMMA::ALayout_64x32 rather than the ABLayout used for B.
+  using Shape_MNK = Shape<_64,_192,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<192, 32>;
+  using CLayout = GMMA::CLayout_64x192;
+  // Non-static member, initialized to ScaleOut::One (NOTE(review): presumably toggles accumulation into C - confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x256x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x256x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x256x32 atom aliased above (scaleA/scaleB default to ScaleIn::One): half_t C/D, float_e4m3_t A and B.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x256x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // Both A and B fragments are K-major GMMA::smem_desc (NOTE(review): presumably the "SS" in the name - confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK and the A (64x32), B (256x32) and C (64x256) layout extents match the 64x256x32 tile in the atom name.
+  using Shape_MNK = Shape<_64,_256,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<256, 32>;
+  using CLayout = GMMA::CLayout_64x256;
+  // Non-static member, initialized to ScaleOut::One (NOTE(review): presumably toggles accumulation into C - confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x256x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x256x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x256x32 atom aliased above (scaleA/scaleB default to ScaleIn::One): half_t C/D, float_e4m3_t A and B.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x256x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // Only B has a smem_desc fragment type; no FrgTypeA is declared (NOTE(review): presumably the "RS" in the name - confirm).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK is 64x256x32 as in the atom name; A uses GMMA::ALayout_64x32 rather than the ABLayout used for B.
+  using Shape_MNK = Shape<_64,_256,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<256, 32>;
+  using CLayout = GMMA::CLayout_64x256;
+  // Non-static member, initialized to ScaleOut::One (NOTE(review): presumably toggles accumulation into C - confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x256x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x256x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x256x32 atom aliased above (scaleA/scaleB default to ScaleIn::One): float C/D, float_e4m3_t A and B.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x256x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // Both A and B fragments are K-major GMMA::smem_desc (NOTE(review): presumably the "SS" in the name - confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK and the A (64x32), B (256x32) and C (64x256) layout extents match the 64x256x32 tile in the atom name.
+  using Shape_MNK = Shape<_64,_256,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<256, 32>;
+  using CLayout = GMMA::CLayout_64x256;
+  // Non-static member, initialized to ScaleOut::One (NOTE(review): presumably toggles accumulation into C - confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x256x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x256x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x256x32 atom aliased above (scaleA/scaleB default to ScaleIn::One): float C/D, float_e4m3_t A and B.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x256x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // Only B has a smem_desc fragment type; no FrgTypeA is declared (NOTE(review): presumably the "RS" in the name - confirm).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK is 64x256x32 as in the atom name; A uses GMMA::ALayout_64x32 rather than the ABLayout used for B.
+  using Shape_MNK = Shape<_64,_256,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<256, 32>;
+  using CLayout = GMMA::CLayout_64x256;
+  // Non-static member, initialized to ScaleOut::One (NOTE(review): presumably toggles accumulation into C - confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x8x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x8x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x8x32 atom aliased above (scaleA/scaleB default to ScaleIn::One): half_t C/D, float_e4m3_t A, float_e5m2_t B.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x8x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Both A and B fragments are K-major GMMA::smem_desc (NOTE(review): presumably the "SS" in the name - confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK and the A (64x32), B (8x32) and C (64x8) layout extents match the 64x8x32 tile in the atom name.
+  using Shape_MNK = Shape<_64,_8,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<  8, 32>;
+  using CLayout = GMMA::CLayout_64x8;
+  // Non-static member, initialized to ScaleOut::One (NOTE(review): presumably toggles accumulation into C - confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x8x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x8x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x8x32 atom aliased above (scaleA/scaleB default to ScaleIn::One): half_t C/D, float_e4m3_t A, float_e5m2_t B.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x8x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Only B has a smem_desc fragment type; no FrgTypeA is declared (NOTE(review): presumably the "RS" in the name - confirm).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK is 64x8x32 as in the atom name; A uses GMMA::ALayout_64x32 rather than the ABLayout used for B.
+  using Shape_MNK = Shape<_64,_8,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<  8, 32>;
+  using CLayout = GMMA::CLayout_64x8;
+  // Non-static member, initialized to ScaleOut::One (NOTE(review): presumably toggles accumulation into C - confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x8x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x8x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x8x32 atom aliased above (scaleA/scaleB default to ScaleIn::One): float C/D, float_e4m3_t A, float_e5m2_t B.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x8x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Both A and B fragments are K-major GMMA::smem_desc (NOTE(review): presumably the "SS" in the name - confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK and the A (64x32), B (8x32) and C (64x8) layout extents match the 64x8x32 tile in the atom name.
+  using Shape_MNK = Shape<_64,_8,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<  8, 32>;
+  using CLayout = GMMA::CLayout_64x8;
+  // Non-static member, initialized to ScaleOut::One (NOTE(review): presumably toggles accumulation into C - confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x8x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x8x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x8x32 atom aliased above (scaleA/scaleB default to ScaleIn::One): float C/D, float_e4m3_t A, float_e5m2_t B.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x8x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Only B has a smem_desc fragment type; no FrgTypeA is declared (NOTE(review): presumably the "RS" in the name - confirm).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK is 64x8x32 as in the atom name; A uses GMMA::ALayout_64x32 rather than the ABLayout used for B.
+  using Shape_MNK = Shape<_64,_8,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<  8, 32>;
+  using CLayout = GMMA::CLayout_64x8;
+  // Non-static member, initialized to ScaleOut::One (NOTE(review): presumably toggles accumulation into C - confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x16x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x16x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x16x32 atom aliased above (scaleA/scaleB default to ScaleIn::One): half_t C/D, float_e4m3_t A, float_e5m2_t B.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x16x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Both A and B fragments are K-major GMMA::smem_desc (NOTE(review): presumably the "SS" in the name - confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK and the A (64x32), B (16x32) and C (64x16) layout extents match the 64x16x32 tile in the atom name.
+  using Shape_MNK = Shape<_64,_16,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 16, 32>;
+  using CLayout = GMMA::CLayout_64x16;
+  // Non-static member, initialized to ScaleOut::One (NOTE(review): presumably toggles accumulation into C - confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x16x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x16x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x16x32 atom aliased above (scaleA/scaleB default to ScaleIn::One): half_t C/D, float_e4m3_t A, float_e5m2_t B.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x16x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Only B has a smem_desc fragment type; no FrgTypeA is declared (NOTE(review): presumably the "RS" in the name - confirm).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK is 64x16x32 as in the atom name; A uses GMMA::ALayout_64x32 rather than the ABLayout used for B.
+  using Shape_MNK = Shape<_64,_16,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 16, 32>;
+  using CLayout = GMMA::CLayout_64x16;
+  // Non-static member, initialized to ScaleOut::One (NOTE(review): presumably toggles accumulation into C - confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x16x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x16x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x16x32 atom aliased above (scaleA/scaleB default to ScaleIn::One): float C/D, float_e4m3_t A, float_e5m2_t B.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x16x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Both A and B fragments are K-major GMMA::smem_desc (NOTE(review): presumably the "SS" in the name - confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK and the A (64x32), B (16x32) and C (64x16) layout extents match the 64x16x32 tile in the atom name.
+  using Shape_MNK = Shape<_64,_16,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 16, 32>;
+  using CLayout = GMMA::CLayout_64x16;
+  // Non-static member, initialized to ScaleOut::One (NOTE(review): presumably toggles accumulation into C - confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x16x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x16x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x16x32 atom aliased above (scaleA/scaleB default to ScaleIn::One): float C/D, float_e4m3_t A, float_e5m2_t B.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x16x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Only B has a smem_desc fragment type; no FrgTypeA is declared (NOTE(review): presumably the "RS" in the name - confirm).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK is 64x16x32 as in the atom name; A uses GMMA::ALayout_64x32 rather than the ABLayout used for B.
+  using Shape_MNK = Shape<_64,_16,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 16, 32>;
+  using CLayout = GMMA::CLayout_64x16;
+  // Non-static member, initialized to ScaleOut::One (NOTE(review): presumably toggles accumulation into C - confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x32x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x32x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x32x32 atom aliased above (scaleA/scaleB default to ScaleIn::One): half_t C/D, float_e4m3_t A, float_e5m2_t B.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x32x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Both A and B fragments are K-major GMMA::smem_desc (NOTE(review): presumably the "SS" in the name - confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK and the A (64x32), B (32x32) and C (64x32) layout extents match the 64x32x32 tile in the atom name.
+  using Shape_MNK = Shape<_64,_32,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 32, 32>;
+  using CLayout = GMMA::CLayout_64x32;
+  // Non-static member, initialized to ScaleOut::One (NOTE(review): presumably toggles accumulation into C - confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x32x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x32x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x32x32 atom aliased above (scaleA/scaleB default to ScaleIn::One): half_t C/D, float_e4m3_t A, float_e5m2_t B.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x32x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Only B has a smem_desc fragment type; no FrgTypeA is declared (NOTE(review): presumably the "RS" in the name - confirm).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK is 64x32x32 as in the atom name; A uses GMMA::ALayout_64x32 rather than the ABLayout used for B.
+  using Shape_MNK = Shape<_64,_32,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 32, 32>;
+  using CLayout = GMMA::CLayout_64x32;
+  // Non-static member, initialized to ScaleOut::One (NOTE(review): presumably toggles accumulation into C - confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x32x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x32x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x32x32 atom aliased above (scaleA/scaleB default to ScaleIn::One): float C/D, float_e4m3_t A, float_e5m2_t B.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x32x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Both A and B fragments are K-major GMMA::smem_desc (NOTE(review): presumably the "SS" in the name - confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK and the A (64x32), B (32x32) and C (64x32) layout extents match the 64x32x32 tile in the atom name.
+  using Shape_MNK = Shape<_64,_32,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 32, 32>;
+  using CLayout = GMMA::CLayout_64x32;
+  // Non-static member, initialized to ScaleOut::One (NOTE(review): presumably toggles accumulation into C - confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x32x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x32x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x32x32 atom aliased above (scaleA/scaleB default to ScaleIn::One): float C/D, float_e4m3_t A, float_e5m2_t B.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x32x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Only B has a smem_desc fragment type; no FrgTypeA is declared (NOTE(review): presumably the "RS" in the name - confirm).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK is 64x32x32 as in the atom name; A uses GMMA::ALayout_64x32 rather than the ABLayout used for B.
+  using Shape_MNK = Shape<_64,_32,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 32, 32>;
+  using CLayout = GMMA::CLayout_64x32;
+  // Non-static member, initialized to ScaleOut::One (NOTE(review): presumably toggles accumulation into C - confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x64x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x64x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x64x32 atom aliased above (scaleA/scaleB default to ScaleIn::One): half_t C/D, float_e4m3_t A, float_e5m2_t B.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x64x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Both A and B fragments are K-major GMMA::smem_desc (NOTE(review): presumably the "SS" in the name - confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK and the A (64x32), B (64x32) and C (64x64) layout extents match the 64x64x32 tile in the atom name.
+  using Shape_MNK = Shape<_64,_64,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 64, 32>;
+  using CLayout = GMMA::CLayout_64x64;
+  // Non-static member, initialized to ScaleOut::One (NOTE(review): presumably toggles accumulation into C - confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x64x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x64x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>;  // op alias; scaleA/scaleB default to ScaleIn::One
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x64x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
+{  // element types: A float_e4m3_t, B float_e5m2_t, C/D half_t
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS variant: only B gets a descriptor fragment type; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_64,_32>;  // (M, N, K) = (64, 64, 32)
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;  // the SS variants use GMMA::ABLayout< 64, 32> for A instead
+  using BLayout = GMMA::ABLayout< 64, 32>;
+  using CLayout = GMMA::CLayout_64x64;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member: each traits object carries its own ScaleOut, defaulting to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x64x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x64x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;  // op alias; scaleA/scaleB default to ScaleIn::One
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x64x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
+{  // element types: A float_e4m3_t, B float_e5m2_t, C/D float
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS variant: A fragment is a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc as well
+
+  using Shape_MNK = Shape<_64,_64,_32>;  // (M, N, K) = (64, 64, 32)
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 64, 32>;
+  using CLayout = GMMA::CLayout_64x64;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member: each traits object carries its own ScaleOut, defaulting to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x64x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x64x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>;  // op alias; scaleA/scaleB default to ScaleIn::One
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x64x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
+{  // element types: A float_e4m3_t, B float_e5m2_t, C/D float
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS variant: only B gets a descriptor fragment type; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_64,_32>;  // (M, N, K) = (64, 64, 32)
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;  // the SS variants use GMMA::ABLayout< 64, 32> for A instead
+  using BLayout = GMMA::ABLayout< 64, 32>;
+  using CLayout = GMMA::CLayout_64x64;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member: each traits object carries its own ScaleOut, defaulting to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x96x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x96x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;  // op alias; scaleA/scaleB default to ScaleIn::One
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x96x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
+{  // element types: A float_e4m3_t, B float_e5m2_t, C/D half_t
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS variant: A fragment is a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc as well
+
+  using Shape_MNK = Shape<_64,_96,_32>;  // (M, N, K) = (64, 96, 32)
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 96, 32>;
+  using CLayout = GMMA::CLayout_64x96;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member: each traits object carries its own ScaleOut, defaulting to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x96x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x96x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>;  // op alias; scaleA/scaleB default to ScaleIn::One
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x96x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
+{  // element types: A float_e4m3_t, B float_e5m2_t, C/D half_t
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS variant: only B gets a descriptor fragment type; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_96,_32>;  // (M, N, K) = (64, 96, 32)
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;  // the SS variants use GMMA::ABLayout< 64, 32> for A instead
+  using BLayout = GMMA::ABLayout< 96, 32>;
+  using CLayout = GMMA::CLayout_64x96;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member: each traits object carries its own ScaleOut, defaulting to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x96x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x96x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;  // op alias; scaleA/scaleB default to ScaleIn::One
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x96x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
+{  // element types: A float_e4m3_t, B float_e5m2_t, C/D float
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS variant: A fragment is a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc as well
+
+  using Shape_MNK = Shape<_64,_96,_32>;  // (M, N, K) = (64, 96, 32)
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 96, 32>;
+  using CLayout = GMMA::CLayout_64x96;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member: each traits object carries its own ScaleOut, defaulting to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x96x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x96x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>;  // op alias; scaleA/scaleB default to ScaleIn::One
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x96x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
+{  // element types: A float_e4m3_t, B float_e5m2_t, C/D float
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS variant: only B gets a descriptor fragment type; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_96,_32>;  // (M, N, K) = (64, 96, 32)
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;  // the SS variants use GMMA::ABLayout< 64, 32> for A instead
+  using BLayout = GMMA::ABLayout< 96, 32>;
+  using CLayout = GMMA::CLayout_64x96;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member: each traits object carries its own ScaleOut, defaulting to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x128x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x128x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;  // op alias; scaleA/scaleB default to ScaleIn::One
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x128x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
+{  // element types: A float_e4m3_t, B float_e5m2_t, C/D half_t
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS variant: A fragment is a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc as well
+
+  using Shape_MNK = Shape<_64,_128,_32>;  // (M, N, K) = (64, 128, 32)
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<128, 32>;
+  using CLayout = GMMA::CLayout_64x128;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member: each traits object carries its own ScaleOut, defaulting to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x128x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x128x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>;  // op alias; scaleA/scaleB default to ScaleIn::One
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x128x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
+{  // element types: A float_e4m3_t, B float_e5m2_t, C/D half_t
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS variant: only B gets a descriptor fragment type; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_128,_32>;  // (M, N, K) = (64, 128, 32)
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;  // the SS variants use GMMA::ABLayout< 64, 32> for A instead
+  using BLayout = GMMA::ABLayout<128, 32>;
+  using CLayout = GMMA::CLayout_64x128;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member: each traits object carries its own ScaleOut, defaulting to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x128x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x128x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;  // op alias; scaleA/scaleB default to ScaleIn::One
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x128x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
+{  // element types: A float_e4m3_t, B float_e5m2_t, C/D float
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS variant: A fragment is a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc as well
+
+  using Shape_MNK = Shape<_64,_128,_32>;  // (M, N, K) = (64, 128, 32)
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<128, 32>;
+  using CLayout = GMMA::CLayout_64x128;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member: each traits object carries its own ScaleOut, defaulting to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x128x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x128x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>;  // op alias; scaleA/scaleB default to ScaleIn::One
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x128x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
+{  // element types: A float_e4m3_t, B float_e5m2_t, C/D float
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS variant: only B gets a descriptor fragment type; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_128,_32>;  // (M, N, K) = (64, 128, 32)
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;  // the SS variants use GMMA::ABLayout< 64, 32> for A instead
+  using BLayout = GMMA::ABLayout<128, 32>;
+  using CLayout = GMMA::CLayout_64x128;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member: each traits object carries its own ScaleOut, defaulting to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x192x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x192x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;  // op alias; scaleA/scaleB default to ScaleIn::One
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x192x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
+{  // element types: A float_e4m3_t, B float_e5m2_t, C/D half_t
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS variant: A fragment is a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc as well
+
+  using Shape_MNK = Shape<_64,_192,_32>;  // (M, N, K) = (64, 192, 32)
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<192, 32>;
+  using CLayout = GMMA::CLayout_64x192;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member: each traits object carries its own ScaleOut, defaulting to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x192x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x192x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>;  // op alias; scaleA/scaleB default to ScaleIn::One
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x192x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
+{  // element types: A float_e4m3_t, B float_e5m2_t, C/D half_t
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS variant: only B gets a descriptor fragment type; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_192,_32>;  // (M, N, K) = (64, 192, 32)
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;  // the SS variants use GMMA::ABLayout< 64, 32> for A instead
+  using BLayout = GMMA::ABLayout<192, 32>;
+  using CLayout = GMMA::CLayout_64x192;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member: each traits object carries its own ScaleOut, defaulting to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x192x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x192x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;  // op alias; scaleA/scaleB default to ScaleIn::One
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x192x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
+{  // element types: A float_e4m3_t, B float_e5m2_t, C/D float
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS variant: A fragment is a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc as well
+
+  using Shape_MNK = Shape<_64,_192,_32>;  // (M, N, K) = (64, 192, 32)
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<192, 32>;
+  using CLayout = GMMA::CLayout_64x192;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member: each traits object carries its own ScaleOut, defaulting to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x192x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x192x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>;  // op alias; scaleA/scaleB default to ScaleIn::One
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x192x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
+{  // element types: A float_e4m3_t, B float_e5m2_t, C/D float
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS variant: only B gets a descriptor fragment type; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_192,_32>;  // (M, N, K) = (64, 192, 32)
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;  // the SS variants use GMMA::ABLayout< 64, 32> for A instead
+  using BLayout = GMMA::ABLayout<192, 32>;
+  using CLayout = GMMA::CLayout_64x192;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member: each traits object carries its own ScaleOut, defaulting to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x256x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x256x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;  // op alias; scaleA/scaleB default to ScaleIn::One
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x256x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
+{  // element types: A float_e4m3_t, B float_e5m2_t, C/D half_t
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS variant: A fragment is a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc as well
+
+  using Shape_MNK = Shape<_64,_256,_32>;  // (M, N, K) = (64, 256, 32)
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<256, 32>;
+  using CLayout = GMMA::CLayout_64x256;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member: each traits object carries its own ScaleOut, defaulting to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x256x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x256x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>;  // op alias; scaleA/scaleB default to ScaleIn::One
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x256x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
+{  // element types: A float_e4m3_t, B float_e5m2_t, C/D half_t
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS variant: only B gets a descriptor fragment type; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_256,_32>;  // (M, N, K) = (64, 256, 32)
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;  // the SS variants use GMMA::ABLayout< 64, 32> for A instead
+  using BLayout = GMMA::ABLayout<256, 32>;
+  using CLayout = GMMA::CLayout_64x256;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member: each traits object carries its own ScaleOut, defaulting to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x256x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x256x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;  // op alias; scaleA/scaleB default to ScaleIn::One
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x256x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
+{  // element types: A float_e4m3_t, B float_e5m2_t, C/D float
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS variant: A fragment is a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc as well
+
+  using Shape_MNK = Shape<_64,_256,_32>;  // (M, N, K) = (64, 256, 32)
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<256, 32>;
+  using CLayout = GMMA::CLayout_64x256;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member: each traits object carries its own ScaleOut, defaulting to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x256x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x256x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>;  // op alias; scaleA/scaleB default to ScaleIn::One
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x256x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
+{  // element types: A float_e4m3_t, B float_e5m2_t, C/D float
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS variant: only B gets a descriptor fragment type; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_256,_32>;  // (M, N, K) = (64, 256, 32)
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;  // the SS variants use GMMA::ABLayout< 64, 32> for A instead
+  using BLayout = GMMA::ABLayout<256, 32>;
+  using CLayout = GMMA::CLayout_64x256;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member: each traits object carries its own ScaleOut, defaulting to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x8x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x8x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;  // op alias; scaleA/scaleB default to ScaleIn::One
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x8x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
+{  // element types: A float_e5m2_t, B float_e4m3_t, C/D half_t
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS variant: A fragment is a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc as well
+
+  using Shape_MNK = Shape<_64,_8,_32>;  // (M, N, K) = (64, 8, 32)
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<  8, 32>;
+  using CLayout = GMMA::CLayout_64x8;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member: each traits object carries its own ScaleOut, defaulting to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x8x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x8x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>;  // op alias; scaleA/scaleB default to ScaleIn::One
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x8x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
+{  // element types: A float_e5m2_t, B float_e4m3_t, C/D half_t
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS variant: only B gets a descriptor fragment type; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_8,_32>;  // (M, N, K) = (64, 8, 32)
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;  // the SS variants use GMMA::ABLayout< 64, 32> for A instead
+  using BLayout = GMMA::ABLayout<  8, 32>;
+  using CLayout = GMMA::CLayout_64x8;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member: each traits object carries its own ScaleOut, defaulting to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x8x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x8x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;  // op alias; scaleA/scaleB default to ScaleIn::One
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x8x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
+{  // element types: A float_e5m2_t, B float_e4m3_t, C/D float
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS variant: A fragment is a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc as well
+
+  using Shape_MNK = Shape<_64,_8,_32>;  // (M, N, K) = (64, 8, 32)
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<  8, 32>;
+  using CLayout = GMMA::CLayout_64x8;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member: each traits object carries its own ScaleOut, defaulting to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x8x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x8x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>;  // op alias; scaleA/scaleB default to ScaleIn::One
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x8x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
+{  // element types: A float_e5m2_t, B float_e4m3_t, C/D float
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS variant: only B gets a descriptor fragment type; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_8,_32>;  // (M, N, K) = (64, 8, 32)
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;  // the SS variants use GMMA::ABLayout< 64, 32> for A instead
+  using BLayout = GMMA::ABLayout<  8, 32>;
+  using CLayout = GMMA::CLayout_64x8;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member: each traits object carries its own ScaleOut, defaulting to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x16x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x16x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;  // op alias; scaleA/scaleB default to ScaleIn::One
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x16x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
+{  // element types: A float_e5m2_t, B float_e4m3_t, C/D half_t
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS variant: A fragment is a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc as well
+
+  using Shape_MNK = Shape<_64,_16,_32>;  // (M, N, K) = (64, 16, 32)
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 16, 32>;
+  using CLayout = GMMA::CLayout_64x16;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member: each traits object carries its own ScaleOut, defaulting to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x16x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x16x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>;  // op alias; scaleA/scaleB default to ScaleIn::One
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x16x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
+{  // element types: A float_e5m2_t, B float_e4m3_t, C/D half_t
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS variant: only B gets a descriptor fragment type; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_16,_32>;  // (M, N, K) = (64, 16, 32)
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;  // the SS variants use GMMA::ABLayout< 64, 32> for A instead
+  using BLayout = GMMA::ABLayout< 16, 32>;
+  using CLayout = GMMA::CLayout_64x16;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member: each traits object carries its own ScaleOut, defaulting to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x16x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x16x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;  // op alias; scaleA/scaleB default to ScaleIn::One
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x16x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
+{  // element types: A float_e5m2_t, B float_e4m3_t, C/D float
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS variant: A fragment is a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc as well
+
+  using Shape_MNK = Shape<_64,_16,_32>;  // (M, N, K) = (64, 16, 32)
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 16, 32>;
+  using CLayout = GMMA::CLayout_64x16;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member: each traits object carries its own ScaleOut, defaulting to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x16x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x16x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>;  // op alias; scaleA/scaleB default to ScaleIn::One
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x16x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
+{  // element types: A float_e5m2_t, B float_e4m3_t, C/D float
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS variant: only B gets a descriptor fragment type; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_16,_32>;  // (M, N, K) = (64, 16, 32)
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;  // the SS variants use GMMA::ABLayout< 64, 32> for A instead
+  using BLayout = GMMA::ABLayout< 16, 32>;
+  using CLayout = GMMA::CLayout_64x16;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member: each traits object carries its own ScaleOut, defaulting to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x32x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x32x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;  // op alias; scaleA/scaleB default to ScaleIn::One
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x32x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
+{  // element types: A float_e5m2_t, B float_e4m3_t, C/D half_t
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS variant: A fragment is a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc as well
+
+  using Shape_MNK = Shape<_64,_32,_32>;  // (M, N, K) = (64, 32, 32)
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 32, 32>;
+  using CLayout = GMMA::CLayout_64x32;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member: each traits object carries its own ScaleOut, defaulting to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x32x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x32x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>;  // op alias; scaleA/scaleB default to ScaleIn::One
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x32x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
+{  // element types: A float_e5m2_t, B float_e4m3_t, C/D half_t
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS variant: only B gets a descriptor fragment type; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_32,_32>;  // (M, N, K) = (64, 32, 32)
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;  // the SS variants use GMMA::ABLayout< 64, 32> for A instead
+  using BLayout = GMMA::ABLayout< 32, 32>;
+  using CLayout = GMMA::CLayout_64x32;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member: each traits object carries its own ScaleOut, defaulting to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x32x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x32x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;  // op alias; scaleA/scaleB default to ScaleIn::One
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x32x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
+{  // element types: A float_e5m2_t, B float_e4m3_t, C/D float
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS variant: A fragment is a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc as well
+
+  using Shape_MNK = Shape<_64,_32,_32>;  // (M, N, K) = (64, 32, 32)
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 32, 32>;
+  using CLayout = GMMA::CLayout_64x32;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member: each traits object carries its own ScaleOut, defaulting to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x32x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x32x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>;  // op alias; scaleA/scaleB default to ScaleIn::One
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x32x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
+{  // element types: A float_e5m2_t, B float_e4m3_t, C/D float
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS variant: only B gets a descriptor fragment type; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_32,_32>;  // (M, N, K) = (64, 32, 32)
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;  // the SS variants use GMMA::ABLayout< 64, 32> for A instead
+  using BLayout = GMMA::ABLayout< 32, 32>;
+  using CLayout = GMMA::CLayout_64x32;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member: each traits object carries its own ScaleOut, defaulting to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x64x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x64x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;  // op alias; scaleA/scaleB default to ScaleIn::One
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x64x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
+{  // element types: A float_e5m2_t, B float_e4m3_t, C/D half_t
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS variant: A fragment is a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc as well
+
+  using Shape_MNK = Shape<_64,_64,_32>;  // (M, N, K) = (64, 64, 32)
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 64, 32>;
+  using CLayout = GMMA::CLayout_64x64;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member: each traits object carries its own ScaleOut, defaulting to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x64x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x64x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>;  // op alias; scaleA/scaleB default to ScaleIn::One
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x64x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
+{  // element types: A float_e5m2_t, B float_e4m3_t, C/D half_t
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS variant: only B gets a descriptor fragment type; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_64,_32>;  // (M, N, K) = (64, 64, 32)
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;  // the SS variants use GMMA::ABLayout< 64, 32> for A instead
+  using BLayout = GMMA::ABLayout< 64, 32>;
+  using CLayout = GMMA::CLayout_64x64;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member: each traits object carries its own ScaleOut, defaulting to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x64x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x64x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;  // op alias; scaleA/scaleB default to ScaleIn::One
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x64x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
+{  // element types: A float_e5m2_t, B float_e4m3_t, C/D float
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS variant: A fragment is a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc as well
+
+  using Shape_MNK = Shape<_64,_64,_32>;  // (M, N, K) = (64, 64, 32)
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 64, 32>;
+  using CLayout = GMMA::CLayout_64x64;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member: each traits object carries its own ScaleOut, defaulting to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x64x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x64x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>;  // op alias; scaleA/scaleB default to ScaleIn::One
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x64x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
+{  // element types: A float_e5m2_t, B float_e4m3_t, C/D float
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS variant: only B gets a descriptor fragment type; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_64,_32>;  // (M, N, K) = (64, 64, 32)
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;  // the SS variants use GMMA::ABLayout< 64, 32> for A instead
+  using BLayout = GMMA::ABLayout< 64, 32>;
+  using CLayout = GMMA::CLayout_64x64;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member: each traits object carries its own ScaleOut, defaulting to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x96x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x96x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;  // op alias; scaleA/scaleB default to ScaleIn::One
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x96x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
+{  // element types: A float_e5m2_t, B float_e4m3_t, C/D half_t
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS variant: A fragment is a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc as well
+
+  using Shape_MNK = Shape<_64,_96,_32>;  // (M, N, K) = (64, 96, 32)
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 96, 32>;
+  using CLayout = GMMA::CLayout_64x96;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member: each traits object carries its own ScaleOut, defaulting to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for SM90::GMMA::MMA_64x96x32_F16E5M2E4M3_RS_TN; both GMMA::ScaleIn arguments default to One.
+template <GMMA::ScaleIn scale_a = GMMA::ScaleIn::One, GMMA::ScaleIn scale_b = GMMA::ScaleIn::One>
+using SM90_64x96x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x96x32_F16E5M2E4M3_RS_TN<scale_a, scale_b>;
+
+// MMA_Traits specialization for SM90_64x96x32_F16E5M2E4M3_RS_TN.
+template <GMMA::ScaleIn scale_a, GMMA::ScaleIn scale_b>
+struct MMA_Traits<SM90_64x96x32_F16E5M2E4M3_RS_TN<scale_a, scale_b>>
+{
+  // Atom extent (M, N, K) = (64, 96, 32); ThrID is a Layout over 128 entries.
+  using Shape_MNK = Shape<_64, _96, _32>;
+  using ThrID     = Layout<_128>;
+  // Value types: A = float_e5m2_t, B = float_e4m3_t, C and D = half_t.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Only FrgTypeB (GMMA::smem_desc<GMMA::Major::K>) is declared; A uses GMMA::ALayout_64x32.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<96, 32>;
+  using CLayout = GMMA::CLayout_64x96;
+  // Per-instance GMMA::ScaleOut setting; starts as GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for SM90::GMMA::MMA_64x96x32_F32E5M2E4M3_SS_TN; both GMMA::ScaleIn arguments default to One.
+template <GMMA::ScaleIn scale_a = GMMA::ScaleIn::One, GMMA::ScaleIn scale_b = GMMA::ScaleIn::One>
+using SM90_64x96x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x96x32_F32E5M2E4M3_SS_TN<scale_a, scale_b>;
+
+// MMA_Traits specialization for SM90_64x96x32_F32E5M2E4M3_SS_TN.
+template <GMMA::ScaleIn scale_a, GMMA::ScaleIn scale_b>
+struct MMA_Traits<SM90_64x96x32_F32E5M2E4M3_SS_TN<scale_a, scale_b>>
+{
+  // Atom extent (M, N, K) = (64, 96, 32); ThrID is a Layout over 128 entries.
+  using Shape_MNK = Shape<_64, _96, _32>;
+  using ThrID     = Layout<_128>;
+  // Value types: A = float_e5m2_t, B = float_e4m3_t, C and D = float.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Fragment types (both GMMA::smem_desc<GMMA::Major::K>) followed by the A/B/C layouts.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<96, 32>;
+  using CLayout = GMMA::CLayout_64x96;
+  // Per-instance GMMA::ScaleOut setting; starts as GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for SM90::GMMA::MMA_64x96x32_F32E5M2E4M3_RS_TN; both GMMA::ScaleIn arguments default to One.
+template <GMMA::ScaleIn scale_a = GMMA::ScaleIn::One, GMMA::ScaleIn scale_b = GMMA::ScaleIn::One>
+using SM90_64x96x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x96x32_F32E5M2E4M3_RS_TN<scale_a, scale_b>;
+
+// MMA_Traits specialization for SM90_64x96x32_F32E5M2E4M3_RS_TN.
+template <GMMA::ScaleIn scale_a, GMMA::ScaleIn scale_b>
+struct MMA_Traits<SM90_64x96x32_F32E5M2E4M3_RS_TN<scale_a, scale_b>>
+{
+  // Atom extent (M, N, K) = (64, 96, 32); ThrID is a Layout over 128 entries.
+  using Shape_MNK = Shape<_64, _96, _32>;
+  using ThrID     = Layout<_128>;
+  // Value types: A = float_e5m2_t, B = float_e4m3_t, C and D = float.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only FrgTypeB (GMMA::smem_desc<GMMA::Major::K>) is declared; A uses GMMA::ALayout_64x32.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<96, 32>;
+  using CLayout = GMMA::CLayout_64x96;
+  // Per-instance GMMA::ScaleOut setting; starts as GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for SM90::GMMA::MMA_64x128x32_F16E5M2E4M3_SS_TN; both GMMA::ScaleIn arguments default to One.
+template <GMMA::ScaleIn scale_a = GMMA::ScaleIn::One, GMMA::ScaleIn scale_b = GMMA::ScaleIn::One>
+using SM90_64x128x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x128x32_F16E5M2E4M3_SS_TN<scale_a, scale_b>;
+
+// MMA_Traits specialization for SM90_64x128x32_F16E5M2E4M3_SS_TN.
+template <GMMA::ScaleIn scale_a, GMMA::ScaleIn scale_b>
+struct MMA_Traits<SM90_64x128x32_F16E5M2E4M3_SS_TN<scale_a, scale_b>>
+{
+  // Atom extent (M, N, K) = (64, 128, 32); ThrID is a Layout over 128 entries.
+  using Shape_MNK = Shape<_64, _128, _32>;
+  using ThrID     = Layout<_128>;
+  // Value types: A = float_e5m2_t, B = float_e4m3_t, C and D = half_t.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Fragment types (both GMMA::smem_desc<GMMA::Major::K>) followed by the A/B/C layouts.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<128, 32>;
+  using CLayout = GMMA::CLayout_64x128;
+  // Per-instance GMMA::ScaleOut setting; starts as GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for SM90::GMMA::MMA_64x128x32_F16E5M2E4M3_RS_TN; both GMMA::ScaleIn arguments default to One.
+template <GMMA::ScaleIn scale_a = GMMA::ScaleIn::One, GMMA::ScaleIn scale_b = GMMA::ScaleIn::One>
+using SM90_64x128x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x128x32_F16E5M2E4M3_RS_TN<scale_a, scale_b>;
+
+// MMA_Traits specialization for SM90_64x128x32_F16E5M2E4M3_RS_TN.
+template <GMMA::ScaleIn scale_a, GMMA::ScaleIn scale_b>
+struct MMA_Traits<SM90_64x128x32_F16E5M2E4M3_RS_TN<scale_a, scale_b>>
+{
+  // Atom extent (M, N, K) = (64, 128, 32); ThrID is a Layout over 128 entries.
+  using Shape_MNK = Shape<_64, _128, _32>;
+  using ThrID     = Layout<_128>;
+  // Value types: A = float_e5m2_t, B = float_e4m3_t, C and D = half_t.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Only FrgTypeB (GMMA::smem_desc<GMMA::Major::K>) is declared; A uses GMMA::ALayout_64x32.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<128, 32>;
+  using CLayout = GMMA::CLayout_64x128;
+  // Per-instance GMMA::ScaleOut setting; starts as GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for SM90::GMMA::MMA_64x128x32_F32E5M2E4M3_SS_TN; both GMMA::ScaleIn arguments default to One.
+template <GMMA::ScaleIn scale_a = GMMA::ScaleIn::One, GMMA::ScaleIn scale_b = GMMA::ScaleIn::One>
+using SM90_64x128x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x128x32_F32E5M2E4M3_SS_TN<scale_a, scale_b>;
+
+// MMA_Traits specialization for SM90_64x128x32_F32E5M2E4M3_SS_TN.
+template <GMMA::ScaleIn scale_a, GMMA::ScaleIn scale_b>
+struct MMA_Traits<SM90_64x128x32_F32E5M2E4M3_SS_TN<scale_a, scale_b>>
+{
+  // Atom extent (M, N, K) = (64, 128, 32); ThrID is a Layout over 128 entries.
+  using Shape_MNK = Shape<_64, _128, _32>;
+  using ThrID     = Layout<_128>;
+  // Value types: A = float_e5m2_t, B = float_e4m3_t, C and D = float.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Fragment types (both GMMA::smem_desc<GMMA::Major::K>) followed by the A/B/C layouts.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<128, 32>;
+  using CLayout = GMMA::CLayout_64x128;
+  // Per-instance GMMA::ScaleOut setting; starts as GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for SM90::GMMA::MMA_64x128x32_F32E5M2E4M3_RS_TN; both GMMA::ScaleIn arguments default to One.
+template <GMMA::ScaleIn scale_a = GMMA::ScaleIn::One, GMMA::ScaleIn scale_b = GMMA::ScaleIn::One>
+using SM90_64x128x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x128x32_F32E5M2E4M3_RS_TN<scale_a, scale_b>;
+
+// MMA_Traits specialization for SM90_64x128x32_F32E5M2E4M3_RS_TN.
+template <GMMA::ScaleIn scale_a, GMMA::ScaleIn scale_b>
+struct MMA_Traits<SM90_64x128x32_F32E5M2E4M3_RS_TN<scale_a, scale_b>>
+{
+  // Atom extent (M, N, K) = (64, 128, 32); ThrID is a Layout over 128 entries.
+  using Shape_MNK = Shape<_64, _128, _32>;
+  using ThrID     = Layout<_128>;
+  // Value types: A = float_e5m2_t, B = float_e4m3_t, C and D = float.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only FrgTypeB (GMMA::smem_desc<GMMA::Major::K>) is declared; A uses GMMA::ALayout_64x32.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<128, 32>;
+  using CLayout = GMMA::CLayout_64x128;
+  // Per-instance GMMA::ScaleOut setting; starts as GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for SM90::GMMA::MMA_64x192x32_F16E5M2E4M3_SS_TN; both GMMA::ScaleIn arguments default to One.
+template <GMMA::ScaleIn scale_a = GMMA::ScaleIn::One, GMMA::ScaleIn scale_b = GMMA::ScaleIn::One>
+using SM90_64x192x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x192x32_F16E5M2E4M3_SS_TN<scale_a, scale_b>;
+
+// MMA_Traits specialization for SM90_64x192x32_F16E5M2E4M3_SS_TN.
+template <GMMA::ScaleIn scale_a, GMMA::ScaleIn scale_b>
+struct MMA_Traits<SM90_64x192x32_F16E5M2E4M3_SS_TN<scale_a, scale_b>>
+{
+  // Atom extent (M, N, K) = (64, 192, 32); ThrID is a Layout over 128 entries.
+  using Shape_MNK = Shape<_64, _192, _32>;
+  using ThrID     = Layout<_128>;
+  // Value types: A = float_e5m2_t, B = float_e4m3_t, C and D = half_t.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Fragment types (both GMMA::smem_desc<GMMA::Major::K>) followed by the A/B/C layouts.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<192, 32>;
+  using CLayout = GMMA::CLayout_64x192;
+  // Per-instance GMMA::ScaleOut setting; starts as GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for SM90::GMMA::MMA_64x192x32_F16E5M2E4M3_RS_TN; both GMMA::ScaleIn arguments default to One.
+template <GMMA::ScaleIn scale_a = GMMA::ScaleIn::One, GMMA::ScaleIn scale_b = GMMA::ScaleIn::One>
+using SM90_64x192x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x192x32_F16E5M2E4M3_RS_TN<scale_a, scale_b>;
+
+// MMA_Traits specialization for SM90_64x192x32_F16E5M2E4M3_RS_TN.
+template <GMMA::ScaleIn scale_a, GMMA::ScaleIn scale_b>
+struct MMA_Traits<SM90_64x192x32_F16E5M2E4M3_RS_TN<scale_a, scale_b>>
+{
+  // Atom extent (M, N, K) = (64, 192, 32); ThrID is a Layout over 128 entries.
+  using Shape_MNK = Shape<_64, _192, _32>;
+  using ThrID     = Layout<_128>;
+  // Value types: A = float_e5m2_t, B = float_e4m3_t, C and D = half_t.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Only FrgTypeB (GMMA::smem_desc<GMMA::Major::K>) is declared; A uses GMMA::ALayout_64x32.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<192, 32>;
+  using CLayout = GMMA::CLayout_64x192;
+  // Per-instance GMMA::ScaleOut setting; starts as GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for SM90::GMMA::MMA_64x192x32_F32E5M2E4M3_SS_TN; both GMMA::ScaleIn arguments default to One.
+template <GMMA::ScaleIn scale_a = GMMA::ScaleIn::One, GMMA::ScaleIn scale_b = GMMA::ScaleIn::One>
+using SM90_64x192x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x192x32_F32E5M2E4M3_SS_TN<scale_a, scale_b>;
+
+// MMA_Traits specialization for SM90_64x192x32_F32E5M2E4M3_SS_TN.
+template <GMMA::ScaleIn scale_a, GMMA::ScaleIn scale_b>
+struct MMA_Traits<SM90_64x192x32_F32E5M2E4M3_SS_TN<scale_a, scale_b>>
+{
+  // Atom extent (M, N, K) = (64, 192, 32); ThrID is a Layout over 128 entries.
+  using Shape_MNK = Shape<_64, _192, _32>;
+  using ThrID     = Layout<_128>;
+  // Value types: A = float_e5m2_t, B = float_e4m3_t, C and D = float.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Fragment types (both GMMA::smem_desc<GMMA::Major::K>) followed by the A/B/C layouts.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<192, 32>;
+  using CLayout = GMMA::CLayout_64x192;
+  // Per-instance GMMA::ScaleOut setting; starts as GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for SM90::GMMA::MMA_64x192x32_F32E5M2E4M3_RS_TN; both GMMA::ScaleIn arguments default to One.
+template <GMMA::ScaleIn scale_a = GMMA::ScaleIn::One, GMMA::ScaleIn scale_b = GMMA::ScaleIn::One>
+using SM90_64x192x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x192x32_F32E5M2E4M3_RS_TN<scale_a, scale_b>;
+
+// MMA_Traits specialization for SM90_64x192x32_F32E5M2E4M3_RS_TN.
+template <GMMA::ScaleIn scale_a, GMMA::ScaleIn scale_b>
+struct MMA_Traits<SM90_64x192x32_F32E5M2E4M3_RS_TN<scale_a, scale_b>>
+{
+  // Atom extent (M, N, K) = (64, 192, 32); ThrID is a Layout over 128 entries.
+  using Shape_MNK = Shape<_64, _192, _32>;
+  using ThrID     = Layout<_128>;
+  // Value types: A = float_e5m2_t, B = float_e4m3_t, C and D = float.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only FrgTypeB (GMMA::smem_desc<GMMA::Major::K>) is declared; A uses GMMA::ALayout_64x32.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<192, 32>;
+  using CLayout = GMMA::CLayout_64x192;
+  // Per-instance GMMA::ScaleOut setting; starts as GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for SM90::GMMA::MMA_64x256x32_F16E5M2E4M3_SS_TN; both GMMA::ScaleIn arguments default to One.
+template <GMMA::ScaleIn scale_a = GMMA::ScaleIn::One, GMMA::ScaleIn scale_b = GMMA::ScaleIn::One>
+using SM90_64x256x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x256x32_F16E5M2E4M3_SS_TN<scale_a, scale_b>;
+
+// MMA_Traits specialization for SM90_64x256x32_F16E5M2E4M3_SS_TN.
+template <GMMA::ScaleIn scale_a, GMMA::ScaleIn scale_b>
+struct MMA_Traits<SM90_64x256x32_F16E5M2E4M3_SS_TN<scale_a, scale_b>>
+{
+  // Atom extent (M, N, K) = (64, 256, 32); ThrID is a Layout over 128 entries.
+  using Shape_MNK = Shape<_64, _256, _32>;
+  using ThrID     = Layout<_128>;
+  // Value types: A = float_e5m2_t, B = float_e4m3_t, C and D = half_t.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Fragment types (both GMMA::smem_desc<GMMA::Major::K>) followed by the A/B/C layouts.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<256, 32>;
+  using CLayout = GMMA::CLayout_64x256;
+  // Per-instance GMMA::ScaleOut setting; starts as GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for SM90::GMMA::MMA_64x256x32_F16E5M2E4M3_RS_TN; both GMMA::ScaleIn arguments default to One.
+template <GMMA::ScaleIn scale_a = GMMA::ScaleIn::One, GMMA::ScaleIn scale_b = GMMA::ScaleIn::One>
+using SM90_64x256x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x256x32_F16E5M2E4M3_RS_TN<scale_a, scale_b>;
+
+// MMA_Traits specialization for SM90_64x256x32_F16E5M2E4M3_RS_TN.
+template <GMMA::ScaleIn scale_a, GMMA::ScaleIn scale_b>
+struct MMA_Traits<SM90_64x256x32_F16E5M2E4M3_RS_TN<scale_a, scale_b>>
+{
+  // Atom extent (M, N, K) = (64, 256, 32); ThrID is a Layout over 128 entries.
+  using Shape_MNK = Shape<_64, _256, _32>;
+  using ThrID     = Layout<_128>;
+  // Value types: A = float_e5m2_t, B = float_e4m3_t, C and D = half_t.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Only FrgTypeB (GMMA::smem_desc<GMMA::Major::K>) is declared; A uses GMMA::ALayout_64x32.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<256, 32>;
+  using CLayout = GMMA::CLayout_64x256;
+  // Per-instance GMMA::ScaleOut setting; starts as GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for SM90::GMMA::MMA_64x256x32_F32E5M2E4M3_SS_TN; both GMMA::ScaleIn arguments default to One.
+template <GMMA::ScaleIn scale_a = GMMA::ScaleIn::One, GMMA::ScaleIn scale_b = GMMA::ScaleIn::One>
+using SM90_64x256x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x256x32_F32E5M2E4M3_SS_TN<scale_a, scale_b>;
+
+// MMA_Traits specialization for SM90_64x256x32_F32E5M2E4M3_SS_TN.
+template <GMMA::ScaleIn scale_a, GMMA::ScaleIn scale_b>
+struct MMA_Traits<SM90_64x256x32_F32E5M2E4M3_SS_TN<scale_a, scale_b>>
+{
+  // Atom extent (M, N, K) = (64, 256, 32); ThrID is a Layout over 128 entries.
+  using Shape_MNK = Shape<_64, _256, _32>;
+  using ThrID     = Layout<_128>;
+  // Value types: A = float_e5m2_t, B = float_e4m3_t, C and D = float.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Fragment types (both GMMA::smem_desc<GMMA::Major::K>) followed by the A/B/C layouts.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<256, 32>;
+  using CLayout = GMMA::CLayout_64x256;
+  // Per-instance GMMA::ScaleOut setting; starts as GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for SM90::GMMA::MMA_64x256x32_F32E5M2E4M3_RS_TN; both GMMA::ScaleIn arguments default to One.
+template <GMMA::ScaleIn scale_a = GMMA::ScaleIn::One, GMMA::ScaleIn scale_b = GMMA::ScaleIn::One>
+using SM90_64x256x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x256x32_F32E5M2E4M3_RS_TN<scale_a, scale_b>;
+
+// MMA_Traits specialization for SM90_64x256x32_F32E5M2E4M3_RS_TN.
+template <GMMA::ScaleIn scale_a, GMMA::ScaleIn scale_b>
+struct MMA_Traits<SM90_64x256x32_F32E5M2E4M3_RS_TN<scale_a, scale_b>>
+{
+  // Atom extent (M, N, K) = (64, 256, 32); ThrID is a Layout over 128 entries.
+  using Shape_MNK = Shape<_64, _256, _32>;
+  using ThrID     = Layout<_128>;
+  // Value types: A = float_e5m2_t, B = float_e4m3_t, C and D = float.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only FrgTypeB (GMMA::smem_desc<GMMA::Major::K>) is declared; A uses GMMA::ALayout_64x32.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<256, 32>;
+  using CLayout = GMMA::CLayout_64x256;
+  // Per-instance GMMA::ScaleOut setting; starts as GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for SM90::GMMA::MMA_64x8x32_F16E5M2E5M2_SS_TN; both GMMA::ScaleIn arguments default to One.
+template <GMMA::ScaleIn scale_a = GMMA::ScaleIn::One, GMMA::ScaleIn scale_b = GMMA::ScaleIn::One>
+using SM90_64x8x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x8x32_F16E5M2E5M2_SS_TN<scale_a, scale_b>;
+
+// MMA_Traits specialization for SM90_64x8x32_F16E5M2E5M2_SS_TN.
+template <GMMA::ScaleIn scale_a, GMMA::ScaleIn scale_b>
+struct MMA_Traits<SM90_64x8x32_F16E5M2E5M2_SS_TN<scale_a, scale_b>>
+{
+  // Atom extent (M, N, K) = (64, 8, 32); ThrID is a Layout over 128 entries.
+  using Shape_MNK = Shape<_64, _8, _32>;
+  using ThrID     = Layout<_128>;
+  // Value types: A = float_e5m2_t, B = float_e5m2_t, C and D = half_t.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Fragment types (both GMMA::smem_desc<GMMA::Major::K>) followed by the A/B/C layouts.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<8, 32>;
+  using CLayout = GMMA::CLayout_64x8;
+  // Per-instance GMMA::ScaleOut setting; starts as GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for SM90::GMMA::MMA_64x8x32_F16E5M2E5M2_RS_TN; both GMMA::ScaleIn arguments default to One.
+template <GMMA::ScaleIn scale_a = GMMA::ScaleIn::One, GMMA::ScaleIn scale_b = GMMA::ScaleIn::One>
+using SM90_64x8x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x8x32_F16E5M2E5M2_RS_TN<scale_a, scale_b>;
+
+// MMA_Traits specialization for SM90_64x8x32_F16E5M2E5M2_RS_TN.
+template <GMMA::ScaleIn scale_a, GMMA::ScaleIn scale_b>
+struct MMA_Traits<SM90_64x8x32_F16E5M2E5M2_RS_TN<scale_a, scale_b>>
+{
+  // Atom extent (M, N, K) = (64, 8, 32); ThrID is a Layout over 128 entries.
+  using Shape_MNK = Shape<_64, _8, _32>;
+  using ThrID     = Layout<_128>;
+  // Value types: A = float_e5m2_t, B = float_e5m2_t, C and D = half_t.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Only FrgTypeB (GMMA::smem_desc<GMMA::Major::K>) is declared; A uses GMMA::ALayout_64x32.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<8, 32>;
+  using CLayout = GMMA::CLayout_64x8;
+  // Per-instance GMMA::ScaleOut setting; starts as GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for SM90::GMMA::MMA_64x8x32_F32E5M2E5M2_SS_TN; both GMMA::ScaleIn arguments default to One.
+template <GMMA::ScaleIn scale_a = GMMA::ScaleIn::One, GMMA::ScaleIn scale_b = GMMA::ScaleIn::One>
+using SM90_64x8x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x8x32_F32E5M2E5M2_SS_TN<scale_a, scale_b>;
+
+// MMA_Traits specialization for SM90_64x8x32_F32E5M2E5M2_SS_TN.
+template <GMMA::ScaleIn scale_a, GMMA::ScaleIn scale_b>
+struct MMA_Traits<SM90_64x8x32_F32E5M2E5M2_SS_TN<scale_a, scale_b>>
+{
+  // Atom extent (M, N, K) = (64, 8, 32); ThrID is a Layout over 128 entries.
+  using Shape_MNK = Shape<_64, _8, _32>;
+  using ThrID     = Layout<_128>;
+  // Value types: A = float_e5m2_t, B = float_e5m2_t, C and D = float.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Fragment types (both GMMA::smem_desc<GMMA::Major::K>) followed by the A/B/C layouts.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<8, 32>;
+  using CLayout = GMMA::CLayout_64x8;
+  // Per-instance GMMA::ScaleOut setting; starts as GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for SM90::GMMA::MMA_64x8x32_F32E5M2E5M2_RS_TN; both GMMA::ScaleIn arguments default to One.
+template <GMMA::ScaleIn scale_a = GMMA::ScaleIn::One, GMMA::ScaleIn scale_b = GMMA::ScaleIn::One>
+using SM90_64x8x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x8x32_F32E5M2E5M2_RS_TN<scale_a, scale_b>;
+
+// MMA_Traits specialization for SM90_64x8x32_F32E5M2E5M2_RS_TN.
+template <GMMA::ScaleIn scale_a, GMMA::ScaleIn scale_b>
+struct MMA_Traits<SM90_64x8x32_F32E5M2E5M2_RS_TN<scale_a, scale_b>>
+{
+  // Atom extent (M, N, K) = (64, 8, 32); ThrID is a Layout over 128 entries.
+  using Shape_MNK = Shape<_64, _8, _32>;
+  using ThrID     = Layout<_128>;
+  // Value types: A = float_e5m2_t, B = float_e5m2_t, C and D = float.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only FrgTypeB (GMMA::smem_desc<GMMA::Major::K>) is declared; A uses GMMA::ALayout_64x32.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<8, 32>;
+  using CLayout = GMMA::CLayout_64x8;
+  // Per-instance GMMA::ScaleOut setting; starts as GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for SM90::GMMA::MMA_64x16x32_F16E5M2E5M2_SS_TN; both GMMA::ScaleIn arguments default to One.
+template <GMMA::ScaleIn scale_a = GMMA::ScaleIn::One, GMMA::ScaleIn scale_b = GMMA::ScaleIn::One>
+using SM90_64x16x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x16x32_F16E5M2E5M2_SS_TN<scale_a, scale_b>;
+
+// MMA_Traits specialization for SM90_64x16x32_F16E5M2E5M2_SS_TN.
+template <GMMA::ScaleIn scale_a, GMMA::ScaleIn scale_b>
+struct MMA_Traits<SM90_64x16x32_F16E5M2E5M2_SS_TN<scale_a, scale_b>>
+{
+  // Atom extent (M, N, K) = (64, 16, 32); ThrID is a Layout over 128 entries.
+  using Shape_MNK = Shape<_64, _16, _32>;
+  using ThrID     = Layout<_128>;
+  // Value types: A = float_e5m2_t, B = float_e5m2_t, C and D = half_t.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Fragment types (both GMMA::smem_desc<GMMA::Major::K>) followed by the A/B/C layouts.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<16, 32>;
+  using CLayout = GMMA::CLayout_64x16;
+  // Per-instance GMMA::ScaleOut setting; starts as GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for SM90::GMMA::MMA_64x16x32_F16E5M2E5M2_RS_TN; both GMMA::ScaleIn arguments default to One.
+template <GMMA::ScaleIn scale_a = GMMA::ScaleIn::One, GMMA::ScaleIn scale_b = GMMA::ScaleIn::One>
+using SM90_64x16x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x16x32_F16E5M2E5M2_RS_TN<scale_a, scale_b>;
+
+// MMA_Traits specialization for SM90_64x16x32_F16E5M2E5M2_RS_TN.
+template <GMMA::ScaleIn scale_a, GMMA::ScaleIn scale_b>
+struct MMA_Traits<SM90_64x16x32_F16E5M2E5M2_RS_TN<scale_a, scale_b>>
+{
+  // Atom extent (M, N, K) = (64, 16, 32); ThrID is a Layout over 128 entries.
+  using Shape_MNK = Shape<_64, _16, _32>;
+  using ThrID     = Layout<_128>;
+  // Value types: A = float_e5m2_t, B = float_e5m2_t, C and D = half_t.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Only FrgTypeB (GMMA::smem_desc<GMMA::Major::K>) is declared; A uses GMMA::ALayout_64x32.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<16, 32>;
+  using CLayout = GMMA::CLayout_64x16;
+  // Per-instance GMMA::ScaleOut setting; starts as GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for SM90::GMMA::MMA_64x16x32_F32E5M2E5M2_SS_TN; both GMMA::ScaleIn arguments default to One.
+template <GMMA::ScaleIn scale_a = GMMA::ScaleIn::One, GMMA::ScaleIn scale_b = GMMA::ScaleIn::One>
+using SM90_64x16x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x16x32_F32E5M2E5M2_SS_TN<scale_a, scale_b>;
+
+// MMA_Traits specialization for SM90_64x16x32_F32E5M2E5M2_SS_TN.
+template <GMMA::ScaleIn scale_a, GMMA::ScaleIn scale_b>
+struct MMA_Traits<SM90_64x16x32_F32E5M2E5M2_SS_TN<scale_a, scale_b>>
+{
+  // Atom extent (M, N, K) = (64, 16, 32); ThrID is a Layout over 128 entries.
+  using Shape_MNK = Shape<_64, _16, _32>;
+  using ThrID     = Layout<_128>;
+  // Value types: A = float_e5m2_t, B = float_e5m2_t, C and D = float.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Fragment types (both GMMA::smem_desc<GMMA::Major::K>) followed by the A/B/C layouts.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<16, 32>;
+  using CLayout = GMMA::CLayout_64x16;
+  // Per-instance GMMA::ScaleOut setting; starts as GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for SM90::GMMA::MMA_64x16x32_F32E5M2E5M2_RS_TN; both GMMA::ScaleIn arguments default to One.
+template <GMMA::ScaleIn scale_a = GMMA::ScaleIn::One, GMMA::ScaleIn scale_b = GMMA::ScaleIn::One>
+using SM90_64x16x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x16x32_F32E5M2E5M2_RS_TN<scale_a, scale_b>;
+
+// MMA_Traits specialization for SM90_64x16x32_F32E5M2E5M2_RS_TN.
+template <GMMA::ScaleIn scale_a, GMMA::ScaleIn scale_b>
+struct MMA_Traits<SM90_64x16x32_F32E5M2E5M2_RS_TN<scale_a, scale_b>>
+{
+  // Atom extent (M, N, K) = (64, 16, 32); ThrID is a Layout over 128 entries.
+  using Shape_MNK = Shape<_64, _16, _32>;
+  using ThrID     = Layout<_128>;
+  // Value types: A = float_e5m2_t, B = float_e5m2_t, C and D = float.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only FrgTypeB (GMMA::smem_desc<GMMA::Major::K>) is declared; A uses GMMA::ALayout_64x32.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<16, 32>;
+  using CLayout = GMMA::CLayout_64x16;
+  // Per-instance GMMA::ScaleOut setting; starts as GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for SM90::GMMA::MMA_64x32x32_F16E5M2E5M2_SS_TN; both GMMA::ScaleIn arguments default to One.
+template <GMMA::ScaleIn scale_a = GMMA::ScaleIn::One, GMMA::ScaleIn scale_b = GMMA::ScaleIn::One>
+using SM90_64x32x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x32x32_F16E5M2E5M2_SS_TN<scale_a, scale_b>;
+
+// MMA_Traits specialization for SM90_64x32x32_F16E5M2E5M2_SS_TN.
+template <GMMA::ScaleIn scale_a, GMMA::ScaleIn scale_b>
+struct MMA_Traits<SM90_64x32x32_F16E5M2E5M2_SS_TN<scale_a, scale_b>>
+{
+  // Atom extent (M, N, K) = (64, 32, 32); ThrID is a Layout over 128 entries.
+  using Shape_MNK = Shape<_64, _32, _32>;
+  using ThrID     = Layout<_128>;
+  // Value types: A = float_e5m2_t, B = float_e5m2_t, C and D = half_t.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Fragment types (both GMMA::smem_desc<GMMA::Major::K>) followed by the A/B/C layouts.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<32, 32>;
+  using CLayout = GMMA::CLayout_64x32;
+  // Per-instance GMMA::ScaleOut setting; starts as GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for SM90::GMMA::MMA_64x32x32_F16E5M2E5M2_RS_TN; both GMMA::ScaleIn arguments default to One.
+template <GMMA::ScaleIn scale_a = GMMA::ScaleIn::One, GMMA::ScaleIn scale_b = GMMA::ScaleIn::One>
+using SM90_64x32x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x32x32_F16E5M2E5M2_RS_TN<scale_a, scale_b>;
+
+// MMA_Traits specialization for SM90_64x32x32_F16E5M2E5M2_RS_TN.
+template <GMMA::ScaleIn scale_a, GMMA::ScaleIn scale_b>
+struct MMA_Traits<SM90_64x32x32_F16E5M2E5M2_RS_TN<scale_a, scale_b>>
+{
+  // Atom extent (M, N, K) = (64, 32, 32); ThrID is a Layout over 128 entries.
+  using Shape_MNK = Shape<_64, _32, _32>;
+  using ThrID     = Layout<_128>;
+  // Value types: A = float_e5m2_t, B = float_e5m2_t, C and D = half_t.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Only FrgTypeB (GMMA::smem_desc<GMMA::Major::K>) is declared; A uses GMMA::ALayout_64x32.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<32, 32>;
+  using CLayout = GMMA::CLayout_64x32;
+  // Per-instance GMMA::ScaleOut setting; starts as GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for SM90::GMMA::MMA_64x32x32_F32E5M2E5M2_SS_TN; both GMMA::ScaleIn arguments default to One.
+template <GMMA::ScaleIn scale_a = GMMA::ScaleIn::One, GMMA::ScaleIn scale_b = GMMA::ScaleIn::One>
+using SM90_64x32x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x32x32_F32E5M2E5M2_SS_TN<scale_a, scale_b>;
+
+// MMA_Traits specialization for SM90_64x32x32_F32E5M2E5M2_SS_TN.
+template <GMMA::ScaleIn scale_a, GMMA::ScaleIn scale_b>
+struct MMA_Traits<SM90_64x32x32_F32E5M2E5M2_SS_TN<scale_a, scale_b>>
+{
+  // Atom extent (M, N, K) = (64, 32, 32); ThrID is a Layout over 128 entries.
+  using Shape_MNK = Shape<_64, _32, _32>;
+  using ThrID     = Layout<_128>;
+  // Value types: A = float_e5m2_t, B = float_e5m2_t, C and D = float.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Fragment types (both GMMA::smem_desc<GMMA::Major::K>) followed by the A/B/C layouts.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<32, 32>;
+  using CLayout = GMMA::CLayout_64x32;
+  // Per-instance GMMA::ScaleOut setting; starts as GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for SM90::GMMA::MMA_64x32x32_F32E5M2E5M2_RS_TN; both GMMA::ScaleIn arguments default to One.
+template <GMMA::ScaleIn scale_a = GMMA::ScaleIn::One, GMMA::ScaleIn scale_b = GMMA::ScaleIn::One>
+using SM90_64x32x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x32x32_F32E5M2E5M2_RS_TN<scale_a, scale_b>;
+
+// MMA_Traits specialization for SM90_64x32x32_F32E5M2E5M2_RS_TN.
+template <GMMA::ScaleIn scale_a, GMMA::ScaleIn scale_b>
+struct MMA_Traits<SM90_64x32x32_F32E5M2E5M2_RS_TN<scale_a, scale_b>>
+{
+  // Atom extent (M, N, K) = (64, 32, 32); ThrID is a Layout over 128 entries.
+  using Shape_MNK = Shape<_64, _32, _32>;
+  using ThrID     = Layout<_128>;
+  // Value types: A = float_e5m2_t, B = float_e5m2_t, C and D = float.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only FrgTypeB (GMMA::smem_desc<GMMA::Major::K>) is declared; A uses GMMA::ALayout_64x32.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<32, 32>;
+  using CLayout = GMMA::CLayout_64x32;
+  // Per-instance GMMA::ScaleOut setting; starts as GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for SM90::GMMA::MMA_64x64x32_F16E5M2E5M2_SS_TN; both GMMA::ScaleIn arguments default to One.
+template <GMMA::ScaleIn scale_a = GMMA::ScaleIn::One, GMMA::ScaleIn scale_b = GMMA::ScaleIn::One>
+using SM90_64x64x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x64x32_F16E5M2E5M2_SS_TN<scale_a, scale_b>;
+
+// MMA_Traits specialization for SM90_64x64x32_F16E5M2E5M2_SS_TN.
+template <GMMA::ScaleIn scale_a, GMMA::ScaleIn scale_b>
+struct MMA_Traits<SM90_64x64x32_F16E5M2E5M2_SS_TN<scale_a, scale_b>>
+{
+  // Atom extent (M, N, K) = (64, 64, 32); ThrID is a Layout over 128 entries.
+  using Shape_MNK = Shape<_64, _64, _32>;
+  using ThrID     = Layout<_128>;
+  // Value types: A = float_e5m2_t, B = float_e5m2_t, C and D = half_t.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Fragment types (both GMMA::smem_desc<GMMA::Major::K>) followed by the A/B/C layouts.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<64, 32>;
+  using CLayout = GMMA::CLayout_64x64;
+  // Per-instance GMMA::ScaleOut setting; starts as GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for SM90::GMMA::MMA_64x64x32_F16E5M2E5M2_RS_TN; both GMMA::ScaleIn arguments default to One.
+template <GMMA::ScaleIn scale_a = GMMA::ScaleIn::One, GMMA::ScaleIn scale_b = GMMA::ScaleIn::One>
+using SM90_64x64x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x64x32_F16E5M2E5M2_RS_TN<scale_a, scale_b>;
+
+// MMA_Traits specialization for SM90_64x64x32_F16E5M2E5M2_RS_TN.
+template <GMMA::ScaleIn scale_a, GMMA::ScaleIn scale_b>
+struct MMA_Traits<SM90_64x64x32_F16E5M2E5M2_RS_TN<scale_a, scale_b>>
+{
+  // Atom extent (M, N, K) = (64, 64, 32); ThrID is a Layout over 128 entries.
+  using Shape_MNK = Shape<_64, _64, _32>;
+  using ThrID     = Layout<_128>;
+  // Value types: A = float_e5m2_t, B = float_e5m2_t, C and D = half_t.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Only FrgTypeB (GMMA::smem_desc<GMMA::Major::K>) is declared; A uses GMMA::ALayout_64x32.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<64, 32>;
+  using CLayout = GMMA::CLayout_64x64;
+  // Per-instance GMMA::ScaleOut setting; starts as GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for SM90::GMMA::MMA_64x64x32_F32E5M2E5M2_SS_TN; both GMMA::ScaleIn arguments default to One.
+template <GMMA::ScaleIn scale_a = GMMA::ScaleIn::One, GMMA::ScaleIn scale_b = GMMA::ScaleIn::One>
+using SM90_64x64x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x64x32_F32E5M2E5M2_SS_TN<scale_a, scale_b>;
+
+// MMA_Traits specialization for SM90_64x64x32_F32E5M2E5M2_SS_TN.
+template <GMMA::ScaleIn scale_a, GMMA::ScaleIn scale_b>
+struct MMA_Traits<SM90_64x64x32_F32E5M2E5M2_SS_TN<scale_a, scale_b>>
+{
+  // Atom extent (M, N, K) = (64, 64, 32); ThrID is a Layout over 128 entries.
+  using Shape_MNK = Shape<_64, _64, _32>;
+  using ThrID     = Layout<_128>;
+  // Value types: A = float_e5m2_t, B = float_e5m2_t, C and D = float.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Fragment types (both GMMA::smem_desc<GMMA::Major::K>) followed by the A/B/C layouts.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<64, 32>;
+  using CLayout = GMMA::CLayout_64x64;
+  // Per-instance GMMA::ScaleOut setting; starts as GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for SM90::GMMA::MMA_64x64x32_F32E5M2E5M2_RS_TN; both GMMA::ScaleIn arguments default to One.
+template <GMMA::ScaleIn scale_a = GMMA::ScaleIn::One, GMMA::ScaleIn scale_b = GMMA::ScaleIn::One>
+using SM90_64x64x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x64x32_F32E5M2E5M2_RS_TN<scale_a, scale_b>;
+
+// MMA_Traits specialization for SM90_64x64x32_F32E5M2E5M2_RS_TN.
+template <GMMA::ScaleIn scale_a, GMMA::ScaleIn scale_b>
+struct MMA_Traits<SM90_64x64x32_F32E5M2E5M2_RS_TN<scale_a, scale_b>>
+{
+  // Atom extent (M, N, K) = (64, 64, 32); ThrID is a Layout over 128 entries.
+  using Shape_MNK = Shape<_64, _64, _32>;
+  using ThrID     = Layout<_128>;
+  // Value types: A = float_e5m2_t, B = float_e5m2_t, C and D = float.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only FrgTypeB (GMMA::smem_desc<GMMA::Major::K>) is declared; A uses GMMA::ALayout_64x32.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<64, 32>;
+  using CLayout = GMMA::CLayout_64x64;
+  // Per-instance GMMA::ScaleOut setting; starts as GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for SM90::GMMA::MMA_64x96x32_F16E5M2E5M2_SS_TN; both GMMA::ScaleIn arguments default to One.
+template <GMMA::ScaleIn scale_a = GMMA::ScaleIn::One, GMMA::ScaleIn scale_b = GMMA::ScaleIn::One>
+using SM90_64x96x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x96x32_F16E5M2E5M2_SS_TN<scale_a, scale_b>;
+
+// MMA_Traits specialization for SM90_64x96x32_F16E5M2E5M2_SS_TN.
+template <GMMA::ScaleIn scale_a, GMMA::ScaleIn scale_b>
+struct MMA_Traits<SM90_64x96x32_F16E5M2E5M2_SS_TN<scale_a, scale_b>>
+{
+  // Atom extent (M, N, K) = (64, 96, 32); ThrID is a Layout over 128 entries.
+  using Shape_MNK = Shape<_64, _96, _32>;
+  using ThrID     = Layout<_128>;
+  // Value types: A = float_e5m2_t, B = float_e5m2_t, C and D = half_t.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Fragment types (both GMMA::smem_desc<GMMA::Major::K>) followed by the A/B/C layouts.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<96, 32>;
+  using CLayout = GMMA::CLayout_64x96;
+  // Per-instance GMMA::ScaleOut setting; starts as GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for SM90::GMMA::MMA_64x96x32_F16E5M2E5M2_RS_TN; both GMMA::ScaleIn arguments default to One.
+template <GMMA::ScaleIn scale_a = GMMA::ScaleIn::One, GMMA::ScaleIn scale_b = GMMA::ScaleIn::One>
+using SM90_64x96x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x96x32_F16E5M2E5M2_RS_TN<scale_a, scale_b>;
+
+// MMA_Traits specialization for SM90_64x96x32_F16E5M2E5M2_RS_TN.
+template <GMMA::ScaleIn scale_a, GMMA::ScaleIn scale_b>
+struct MMA_Traits<SM90_64x96x32_F16E5M2E5M2_RS_TN<scale_a, scale_b>>
+{
+  // Atom extent (M, N, K) = (64, 96, 32); ThrID is a Layout over 128 entries.
+  using Shape_MNK = Shape<_64, _96, _32>;
+  using ThrID     = Layout<_128>;
+  // Value types: A = float_e5m2_t, B = float_e5m2_t, C and D = half_t.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Only FrgTypeB (GMMA::smem_desc<GMMA::Major::K>) is declared; A uses GMMA::ALayout_64x32.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<96, 32>;
+  using CLayout = GMMA::CLayout_64x96;
+  // Per-instance GMMA::ScaleOut setting; starts as GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for SM90::GMMA::MMA_64x96x32_F32E5M2E5M2_SS_TN; both GMMA::ScaleIn arguments default to One.
+template <GMMA::ScaleIn scale_a = GMMA::ScaleIn::One, GMMA::ScaleIn scale_b = GMMA::ScaleIn::One>
+using SM90_64x96x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x96x32_F32E5M2E5M2_SS_TN<scale_a, scale_b>;
+
+// MMA_Traits specialization for SM90_64x96x32_F32E5M2E5M2_SS_TN.
+template <GMMA::ScaleIn scale_a, GMMA::ScaleIn scale_b>
+struct MMA_Traits<SM90_64x96x32_F32E5M2E5M2_SS_TN<scale_a, scale_b>>
+{
+  // Atom extent (M, N, K) = (64, 96, 32); ThrID is a Layout over 128 entries.
+  using Shape_MNK = Shape<_64, _96, _32>;
+  using ThrID     = Layout<_128>;
+  // Value types: A = float_e5m2_t, B = float_e5m2_t, C and D = float.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Fragment types (both GMMA::smem_desc<GMMA::Major::K>) followed by the A/B/C layouts.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<96, 32>;
+  using CLayout = GMMA::CLayout_64x96;
+  // Per-instance GMMA::ScaleOut setting; starts as GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for SM90::GMMA::MMA_64x96x32_F32E5M2E5M2_RS_TN; both GMMA::ScaleIn arguments default to One.
+template <GMMA::ScaleIn scale_a = GMMA::ScaleIn::One, GMMA::ScaleIn scale_b = GMMA::ScaleIn::One>
+using SM90_64x96x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x96x32_F32E5M2E5M2_RS_TN<scale_a, scale_b>;
+
+// MMA_Traits specialization for SM90_64x96x32_F32E5M2E5M2_RS_TN.
+template <GMMA::ScaleIn scale_a, GMMA::ScaleIn scale_b>
+struct MMA_Traits<SM90_64x96x32_F32E5M2E5M2_RS_TN<scale_a, scale_b>>
+{
+  // Atom extent (M, N, K) = (64, 96, 32); ThrID is a Layout over 128 entries.
+  using Shape_MNK = Shape<_64, _96, _32>;
+  using ThrID     = Layout<_128>;
+  // Value types: A = float_e5m2_t, B = float_e5m2_t, C and D = float.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only FrgTypeB (GMMA::smem_desc<GMMA::Major::K>) is declared; A uses GMMA::ALayout_64x32.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<96, 32>;
+  using CLayout = GMMA::CLayout_64x96;
+  // Per-instance GMMA::ScaleOut setting; starts as GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias forwarding to the matching SM90::GMMA::MMA_* atom.
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // both input scales default to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x128x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x128x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
+// Traits for the 64x128x32 atom above: float_e5m2_t A/B inputs, half_t C/D values.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x128x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Both A and B fragments are K-major smem_desc descriptors.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K), thread-id layout, and A/B/C layouts.
+  using Shape_MNK = Shape<_64,_128,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<128, 32>;
+  using CLayout = GMMA::CLayout_64x128;
+  // Per-instance accumulate setting; default-initialised to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias forwarding to the matching SM90::GMMA::MMA_* atom.
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // both input scales default to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x128x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x128x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>;
+// Traits for the 64x128x32 atom above: float_e5m2_t A/B inputs, half_t C/D values.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x128x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Only B gets a K-major smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K), thread-id layout, and A/B/C layouts.
+  using Shape_MNK = Shape<_64,_128,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<128, 32>;
+  using CLayout = GMMA::CLayout_64x128;
+  // Per-instance accumulate setting; default-initialised to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias forwarding to the matching SM90::GMMA::MMA_* atom.
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // both input scales default to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x128x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x128x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
+// Traits for the 64x128x32 atom above: float_e5m2_t A/B inputs, float C/D values.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x128x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Both A and B fragments are K-major smem_desc descriptors.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K), thread-id layout, and A/B/C layouts.
+  using Shape_MNK = Shape<_64,_128,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<128, 32>;
+  using CLayout = GMMA::CLayout_64x128;
+  // Per-instance accumulate setting; default-initialised to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias forwarding to the matching SM90::GMMA::MMA_* atom.
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // both input scales default to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x128x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x128x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>;
+// Traits for the 64x128x32 atom above: float_e5m2_t A/B inputs, float C/D values.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x128x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Only B gets a K-major smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K), thread-id layout, and A/B/C layouts.
+  using Shape_MNK = Shape<_64,_128,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<128, 32>;
+  using CLayout = GMMA::CLayout_64x128;
+  // Per-instance accumulate setting; default-initialised to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias forwarding to the matching SM90::GMMA::MMA_* atom.
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // both input scales default to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x192x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x192x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
+// Traits for the 64x192x32 atom above: float_e5m2_t A/B inputs, half_t C/D values.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x192x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Both A and B fragments are K-major smem_desc descriptors.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K), thread-id layout, and A/B/C layouts.
+  using Shape_MNK = Shape<_64,_192,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<192, 32>;
+  using CLayout = GMMA::CLayout_64x192;
+  // Per-instance accumulate setting; default-initialised to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias forwarding to the matching SM90::GMMA::MMA_* atom.
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // both input scales default to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x192x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x192x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>;
+// Traits for the 64x192x32 atom above: float_e5m2_t A/B inputs, half_t C/D values.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x192x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Only B gets a K-major smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K), thread-id layout, and A/B/C layouts.
+  using Shape_MNK = Shape<_64,_192,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<192, 32>;
+  using CLayout = GMMA::CLayout_64x192;
+  // Per-instance accumulate setting; default-initialised to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias forwarding to the matching SM90::GMMA::MMA_* atom.
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // both input scales default to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x192x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x192x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
+// Traits for the 64x192x32 atom above: float_e5m2_t A/B inputs, float C/D values.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x192x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Both A and B fragments are K-major smem_desc descriptors.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K), thread-id layout, and A/B/C layouts.
+  using Shape_MNK = Shape<_64,_192,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<192, 32>;
+  using CLayout = GMMA::CLayout_64x192;
+  // Per-instance accumulate setting; default-initialised to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias forwarding to the matching SM90::GMMA::MMA_* atom.
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // both input scales default to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x192x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x192x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>;
+// Traits for the 64x192x32 atom above: float_e5m2_t A/B inputs, float C/D values.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x192x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Only B gets a K-major smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K), thread-id layout, and A/B/C layouts.
+  using Shape_MNK = Shape<_64,_192,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<192, 32>;
+  using CLayout = GMMA::CLayout_64x192;
+  // Per-instance accumulate setting; default-initialised to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias forwarding to the matching SM90::GMMA::MMA_* atom.
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // both input scales default to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x256x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x256x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
+// Traits for the 64x256x32 atom above: float_e5m2_t A/B inputs, half_t C/D values.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x256x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Both A and B fragments are K-major smem_desc descriptors.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K), thread-id layout, and A/B/C layouts.
+  using Shape_MNK = Shape<_64,_256,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<256, 32>;
+  using CLayout = GMMA::CLayout_64x256;
+  // Per-instance accumulate setting; default-initialised to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias forwarding to the matching SM90::GMMA::MMA_* atom.
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // both input scales default to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x256x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x256x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>;
+// Traits for the 64x256x32 atom above: float_e5m2_t A/B inputs, half_t C/D values.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x256x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Only B gets a K-major smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K), thread-id layout, and A/B/C layouts.
+  using Shape_MNK = Shape<_64,_256,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<256, 32>;
+  using CLayout = GMMA::CLayout_64x256;
+  // Per-instance accumulate setting; default-initialised to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias forwarding to the matching SM90::GMMA::MMA_* atom.
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // both input scales default to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x256x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x256x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
+// Traits for the 64x256x32 atom above: float_e5m2_t A/B inputs, float C/D values.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x256x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Both A and B fragments are K-major smem_desc descriptors.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K), thread-id layout, and A/B/C layouts.
+  using Shape_MNK = Shape<_64,_256,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<256, 32>;
+  using CLayout = GMMA::CLayout_64x256;
+  // Per-instance accumulate setting; default-initialised to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias forwarding to the matching SM90::GMMA::MMA_* atom.
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // both input scales default to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x256x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x256x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>;
+// Traits for the 64x256x32 atom above: float_e5m2_t A/B inputs, float C/D values.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x256x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Only B gets a K-major smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K), thread-id layout, and A/B/C layouts.
+  using Shape_MNK = Shape<_64,_256,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<256, 32>;
+  using CLayout = GMMA::CLayout_64x256;
+  // Per-instance accumulate setting; default-initialised to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // end namespace cute
+
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+#include "mma_traits_sm90_gmma_ext.hpp"
+#endif
diff --git a/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90_gmma_ext.hpp b/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90_gmma_ext.hpp
new file mode 100755
index 000000000..15e2412c8
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90_gmma_ext.hpp
@@ -0,0 +1,20116 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+ 
+#pragma once
+  
+#include <cute/arch/mma_sm90.hpp>
+#include <cute/atom/mma_traits.hpp>
+
+namespace cute {
+
+namespace SM90::GMMA {
+// Named C-layout aliases; each one is CLayout_64xN instantiated at the N in its name.
+using CLayout_64x24  = CLayout_64xN< 24>;
+using CLayout_64x40  = CLayout_64xN< 40>;
+using CLayout_64x48  = CLayout_64xN< 48>;
+using CLayout_64x56  = CLayout_64xN< 56>;
+using CLayout_64x72  = CLayout_64xN< 72>;
+using CLayout_64x80  = CLayout_64xN< 80>;
+using CLayout_64x88  = CLayout_64xN< 88>;
+using CLayout_64x104 = CLayout_64xN<104>;
+using CLayout_64x112 = CLayout_64xN<112>;
+using CLayout_64x120 = CLayout_64xN<120>;
+using CLayout_64x136 = CLayout_64xN<136>;
+using CLayout_64x144 = CLayout_64xN<144>;
+using CLayout_64x152 = CLayout_64xN<152>;
+using CLayout_64x160 = CLayout_64xN<160>;
+using CLayout_64x168 = CLayout_64xN<168>;
+using CLayout_64x176 = CLayout_64xN<176>;
+using CLayout_64x184 = CLayout_64xN<184>;
+using CLayout_64x200 = CLayout_64xN<200>;
+using CLayout_64x208 = CLayout_64xN<208>;
+using CLayout_64x216 = CLayout_64xN<216>;
+using CLayout_64x224 = CLayout_64xN<224>;
+using CLayout_64x232 = CLayout_64xN<232>;
+using CLayout_64x240 = CLayout_64xN<240>;
+using CLayout_64x248 = CLayout_64xN<248>;
+// These aliases are referenced as GMMA::CLayout_64x<N> by the MMA_Traits specializations in this header.
+}  // namespace SM90::GMMA
+
+template <  // Alias forwarding to the matching SM90::GMMA::MMA_* atom.
+  GMMA::Major tnspA,  // major modes of A and B; no defaults, callers must supply them
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // both input scales default to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x24x16_F16F16F16_SS = SM90::GMMA::MMA_64x24x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+// Traits for the 64x24x16 atom above: half_t for all of A, B, C and D.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x24x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  // A and B fragments are smem_desc descriptors parameterised by tnspA / tnspB.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Tile shape (M, N, K), thread-id layout, and A/B/C layouts.
+  using Shape_MNK = Shape<_64,_24,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  using BLayout = GMMA::ABLayout< 24, 16>;
+  using CLayout = GMMA::CLayout_64x24;
+  // Per-instance accumulate setting; default-initialised to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias forwarding to the matching SM90::GMMA::MMA_* atom.
+  GMMA::Major tnspA,  // major modes of A and B; no defaults, callers must supply them
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // both input scales default to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x24x16_F16F16F16_RS = SM90::GMMA::MMA_64x24x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>; // forwards all four template arguments unchanged
+// Traits for the 64x24x16 atom above: half_t for all of A, B, C and D.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x24x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  // Only B gets a smem_desc fragment type (keyed on tnspB); no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Tile shape (M, N, K), thread-id layout, and A/B/C layouts.
+  using Shape_MNK = Shape<_64,_24,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;
+  using BLayout = GMMA::ABLayout< 24, 16>;
+  using CLayout = GMMA::CLayout_64x24;
+  // Per-instance accumulate setting; default-initialised to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias forwarding to the matching SM90::GMMA::MMA_* atom.
+  GMMA::Major tnspA,  // major modes of A and B; no defaults, callers must supply them
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // both input scales default to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x40x16_F16F16F16_SS = SM90::GMMA::MMA_64x40x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+// Traits for the 64x40x16 atom above: half_t for all of A, B, C and D.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x40x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  // A and B fragments are smem_desc descriptors parameterised by tnspA / tnspB.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Tile shape (M, N, K), thread-id layout, and A/B/C layouts.
+  using Shape_MNK = Shape<_64,_40,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  using BLayout = GMMA::ABLayout< 40, 16>;
+  using CLayout = GMMA::CLayout_64x40;
+  // Per-instance accumulate setting; default-initialised to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias forwarding to the matching SM90::GMMA::MMA_* atom.
+  GMMA::Major tnspA,  // major modes of A and B; no defaults, callers must supply them
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // both input scales default to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x40x16_F16F16F16_RS = SM90::GMMA::MMA_64x40x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>; // forwards all four template arguments unchanged
+// Traits for the 64x40x16 atom above: half_t for all of A, B, C and D.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x40x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  // Only B gets a smem_desc fragment type (keyed on tnspB); no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Tile shape (M, N, K), thread-id layout, and A/B/C layouts.
+  using Shape_MNK = Shape<_64,_40,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;
+  using BLayout = GMMA::ABLayout< 40, 16>;
+  using CLayout = GMMA::CLayout_64x40;
+  // Per-instance accumulate setting; default-initialised to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias forwarding to the matching SM90::GMMA::MMA_* atom.
+  GMMA::Major tnspA,  // major modes of A and B; no defaults, callers must supply them
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // both input scales default to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x48x16_F16F16F16_SS = SM90::GMMA::MMA_64x48x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+// Traits for the 64x48x16 atom above: half_t for all of A, B, C and D.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x48x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  // A and B fragments are smem_desc descriptors parameterised by tnspA / tnspB.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Tile shape (M, N, K), thread-id layout, and A/B/C layouts.
+  using Shape_MNK = Shape<_64,_48,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  using BLayout = GMMA::ABLayout< 48, 16>;
+  using CLayout = GMMA::CLayout_64x48;
+  // Per-instance accumulate setting; default-initialised to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias forwarding to the matching SM90::GMMA::MMA_* atom.
+  GMMA::Major tnspA,  // major modes of A and B; no defaults, callers must supply them
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // both input scales default to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x48x16_F16F16F16_RS = SM90::GMMA::MMA_64x48x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>; // forwards all four template arguments unchanged
+// Traits for the 64x48x16 atom above: half_t for all of A, B, C and D.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x48x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  // Only B gets a smem_desc fragment type (keyed on tnspB); no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Tile shape (M, N, K), thread-id layout, and A/B/C layouts.
+  using Shape_MNK = Shape<_64,_48,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;
+  using BLayout = GMMA::ABLayout< 48, 16>;
+  using CLayout = GMMA::CLayout_64x48;
+  // Per-instance accumulate setting; default-initialised to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias forwarding to the matching SM90::GMMA::MMA_* atom.
+  GMMA::Major tnspA,  // major modes of A and B; no defaults, callers must supply them
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // both input scales default to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x56x16_F16F16F16_SS = SM90::GMMA::MMA_64x56x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+// Traits for the 64x56x16 atom above: half_t for all of A, B, C and D.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x56x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  // A and B fragments are smem_desc descriptors parameterised by tnspA / tnspB.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Tile shape (M, N, K), thread-id layout, and A/B/C layouts.
+  using Shape_MNK = Shape<_64,_56,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  using BLayout = GMMA::ABLayout< 56, 16>;
+  using CLayout = GMMA::CLayout_64x56;
+  // Per-instance accumulate setting; default-initialised to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias forwarding to the matching SM90::GMMA::MMA_* atom.
+  GMMA::Major tnspA,  // major modes of A and B; no defaults, callers must supply them
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // both input scales default to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x56x16_F16F16F16_RS = SM90::GMMA::MMA_64x56x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>; // forwards all four template arguments unchanged
+// Traits for the 64x56x16 atom above: half_t for all of A, B, C and D.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x56x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  // Only B gets a smem_desc fragment type (keyed on tnspB); no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Tile shape (M, N, K), thread-id layout, and A/B/C layouts.
+  using Shape_MNK = Shape<_64,_56,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;
+  using BLayout = GMMA::ABLayout< 56, 16>;
+  using CLayout = GMMA::CLayout_64x56;
+  // Per-instance accumulate setting; default-initialised to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias forwarding to the matching SM90::GMMA::MMA_* atom.
+  GMMA::Major tnspA,  // major modes of A and B; no defaults, callers must supply them
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // both input scales default to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x72x16_F16F16F16_SS = SM90::GMMA::MMA_64x72x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+// Traits for the 64x72x16 atom above: half_t for all of A, B, C and D.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x72x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  // A and B fragments are smem_desc descriptors parameterised by tnspA / tnspB.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Tile shape (M, N, K), thread-id layout, and A/B/C layouts.
+  using Shape_MNK = Shape<_64,_72,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  using BLayout = GMMA::ABLayout< 72, 16>;
+  using CLayout = GMMA::CLayout_64x72;
+  // Per-instance accumulate setting; default-initialised to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias forwarding to the matching SM90::GMMA::MMA_* atom.
+  GMMA::Major tnspA,  // major modes of A and B; no defaults, callers must supply them
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // both input scales default to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x72x16_F16F16F16_RS = SM90::GMMA::MMA_64x72x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>; // forwards all four template arguments unchanged
+// Traits for the 64x72x16 atom above: half_t for all of A, B, C and D.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x72x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  // Only B gets a smem_desc fragment type (keyed on tnspB); no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Tile shape (M, N, K), thread-id layout, and A/B/C layouts.
+  using Shape_MNK = Shape<_64,_72,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;
+  using BLayout = GMMA::ABLayout< 72, 16>;
+  using CLayout = GMMA::CLayout_64x72;
+  // Per-instance accumulate setting; default-initialised to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias forwarding to the matching SM90::GMMA::MMA_* atom.
+  GMMA::Major tnspA,  // major modes of A and B; no defaults, callers must supply them
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // both input scales default to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x80x16_F16F16F16_SS = SM90::GMMA::MMA_64x80x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+// Traits for the 64x80x16 atom above: half_t for all of A, B, C and D.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x80x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  // A and B fragments are smem_desc descriptors parameterised by tnspA / tnspB.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Tile shape (M, N, K), thread-id layout, and A/B/C layouts.
+  using Shape_MNK = Shape<_64,_80,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  using BLayout = GMMA::ABLayout< 80, 16>;
+  using CLayout = GMMA::CLayout_64x80;
+  // Per-instance accumulate setting; default-initialised to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias forwarding to the matching SM90::GMMA::MMA_* atom.
+  GMMA::Major tnspA,  // major modes of A and B; no defaults, callers must supply them
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // both input scales default to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x80x16_F16F16F16_RS = SM90::GMMA::MMA_64x80x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>; // forwards all four template arguments unchanged
+// Traits for the 64x80x16 atom above: half_t for all of A, B, C and D.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x80x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  // Only B gets a smem_desc fragment type (keyed on tnspB); no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Tile shape (M, N, K), thread-id layout, and A/B/C layouts.
+  using Shape_MNK = Shape<_64,_80,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;
+  using BLayout = GMMA::ABLayout< 80, 16>;
+  using CLayout = GMMA::CLayout_64x80;
+  // Per-instance accumulate setting; default-initialised to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias forwarding to the matching SM90::GMMA::MMA_* atom.
+  GMMA::Major tnspA,  // major modes of A and B; no defaults, callers must supply them
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // both input scales default to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x88x16_F16F16F16_SS = SM90::GMMA::MMA_64x88x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+// Traits for the 64x88x16 atom above: half_t for all of A, B, C and D.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x88x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  // A and B fragments are smem_desc descriptors parameterised by tnspA / tnspB.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Tile shape (M, N, K), thread-id layout, and A/B/C layouts.
+  using Shape_MNK = Shape<_64,_88,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  using BLayout = GMMA::ABLayout< 88, 16>;
+  using CLayout = GMMA::CLayout_64x88;
+  // Per-instance accumulate setting; default-initialised to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias forwarding to the matching SM90::GMMA::MMA_* atom.
+  GMMA::Major tnspA,  // major modes of A and B; no defaults, callers must supply them
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // both input scales default to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x88x16_F16F16F16_RS = SM90::GMMA::MMA_64x88x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>; // forwards all four template arguments unchanged
+// Traits for the 64x88x16 atom above: half_t for all of A, B, C and D.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x88x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  // Only B gets a smem_desc fragment type (keyed on tnspB); no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Tile shape (M, N, K), thread-id layout, and A/B/C layouts.
+  using Shape_MNK = Shape<_64,_88,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;
+  using BLayout = GMMA::ABLayout< 88, 16>;
+  using CLayout = GMMA::CLayout_64x88;
+  // Per-instance accumulate setting; default-initialised to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias forwarding to the matching SM90::GMMA::MMA_* atom.
+  GMMA::Major tnspA,  // major modes of A and B; no defaults, callers must supply them
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // both input scales default to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x104x16_F16F16F16_SS = SM90::GMMA::MMA_64x104x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+// Traits for the 64x104x16 atom above: half_t for all of A, B, C and D.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x104x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  // A and B fragments are smem_desc descriptors parameterised by tnspA / tnspB.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Tile shape (M, N, K), thread-id layout, and A/B/C layouts.
+  using Shape_MNK = Shape<_64,_104,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  using BLayout = GMMA::ABLayout<104, 16>;
+  using CLayout = GMMA::CLayout_64x104;
+  // Per-instance accumulate setting; default-initialised to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias forwarding to the matching SM90::GMMA::MMA_* atom.
+  GMMA::Major tnspA,  // major modes of A and B; no defaults, callers must supply them
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // both input scales default to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x104x16_F16F16F16_RS = SM90::GMMA::MMA_64x104x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>; // forwards all four template arguments unchanged
+// Traits for the 64x104x16 atom above: half_t for all of A, B, C and D.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x104x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  // Only B gets a smem_desc fragment type (keyed on tnspB); no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Tile shape (M, N, K), thread-id layout, and A/B/C layouts.
+  using Shape_MNK = Shape<_64,_104,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;
+  using BLayout = GMMA::ABLayout<104, 16>;
+  using CLayout = GMMA::CLayout_64x104;
+  // Per-instance accumulate setting; default-initialised to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias forwarding to the matching SM90::GMMA::MMA_* atom.
+  GMMA::Major tnspA,  // major modes of A and B; no defaults, callers must supply them
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // both input scales default to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x112x16_F16F16F16_SS = SM90::GMMA::MMA_64x112x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+// Traits for the 64x112x16 atom above: half_t for all of A, B, C and D.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x112x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  // A and B fragments are smem_desc descriptors parameterised by tnspA / tnspB.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Tile shape (M, N, K), thread-id layout, and A/B/C layouts.
+  using Shape_MNK = Shape<_64,_112,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  using BLayout = GMMA::ABLayout<112, 16>;
+  using CLayout = GMMA::CLayout_64x112;
+  // Per-instance accumulate setting; default-initialised to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias forwarding to the matching SM90::GMMA::MMA_* atom.
+  GMMA::Major tnspA,  // major modes of A and B; no defaults, callers must supply them
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // both input scales default to One
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x112x16_F16F16F16_RS = SM90::GMMA::MMA_64x112x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>; // forwards all four template arguments unchanged
+// Traits for the 64x112x16 atom above: half_t for all of A, B, C and D.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x112x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  // Only B gets a smem_desc fragment type (keyed on tnspB); no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Tile shape (M, N, K), thread-id layout, and A/B/C layouts.
+  using Shape_MNK = Shape<_64,_112,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;
+  using BLayout = GMMA::ABLayout<112, 16>;
+  using CLayout = GMMA::CLayout_64x112;
+  // Per-instance accumulate setting; default-initialised to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for the SM90 GMMA atom MMA_64x120x16_F16F16F16_SS; scaleA/scaleB default to ScaleIn::One.
+template <GMMA::Major tnspA, GMMA::Major tnspB,
+          GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x120x16_F16F16F16_SS = SM90::GMMA::MMA_64x120x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+
+// MMA_Traits for that atom: 64x120x16 tile, half_t values, A and B fragments are GMMA smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x120x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  // Tile extents (M, N, K), thread-id layout, and the A/B/C operand layouts.
+  using Shape_MNK = Shape<_64,_120,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 16>;
+  using BLayout   = GMMA::ABLayout<120, 16>;
+  using CLayout   = GMMA::CLayout_64x120;
+
+  // Value types of the A, B, C and D operands.
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;  // parameterized on tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // parameterized on tnspB
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for the SM90 GMMA atom MMA_64x120x16_F16F16F16_RS; scaleA/scaleB default to ScaleIn::One.
+template <GMMA::Major tnspA, GMMA::Major tnspB,
+          GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x120x16_F16F16F16_RS = SM90::GMMA::MMA_64x120x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
+
+// MMA_Traits for that atom: 64x120x16 tile, half_t values; only B gets a GMMA smem_desc fragment.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x120x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  // Tile extents (M, N, K), thread-id layout, and the A/B/C operand layouts.
+  using Shape_MNK = Shape<_64,_120,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<120, 16>;
+  using CLayout   = GMMA::CLayout_64x120;
+
+  // Value types of the A, B, C and D operands.
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // no FrgTypeA is declared for this variant
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for the SM90 GMMA atom MMA_64x136x16_F16F16F16_SS; scaleA/scaleB default to ScaleIn::One.
+template <GMMA::Major tnspA, GMMA::Major tnspB,
+          GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x136x16_F16F16F16_SS = SM90::GMMA::MMA_64x136x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+
+// MMA_Traits for that atom: 64x136x16 tile, half_t values, A and B fragments are GMMA smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x136x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  // Tile extents (M, N, K), thread-id layout, and the A/B/C operand layouts.
+  using Shape_MNK = Shape<_64,_136,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 16>;
+  using BLayout   = GMMA::ABLayout<136, 16>;
+  using CLayout   = GMMA::CLayout_64x136;
+
+  // Value types of the A, B, C and D operands.
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;  // parameterized on tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // parameterized on tnspB
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for the SM90 GMMA atom MMA_64x136x16_F16F16F16_RS; scaleA/scaleB default to ScaleIn::One.
+template <GMMA::Major tnspA, GMMA::Major tnspB,
+          GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x136x16_F16F16F16_RS = SM90::GMMA::MMA_64x136x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
+
+// MMA_Traits for that atom: 64x136x16 tile, half_t values; only B gets a GMMA smem_desc fragment.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x136x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  // Tile extents (M, N, K), thread-id layout, and the A/B/C operand layouts.
+  using Shape_MNK = Shape<_64,_136,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<136, 16>;
+  using CLayout   = GMMA::CLayout_64x136;
+
+  // Value types of the A, B, C and D operands.
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // no FrgTypeA is declared for this variant
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for the SM90 GMMA atom MMA_64x144x16_F16F16F16_SS; scaleA/scaleB default to ScaleIn::One.
+template <GMMA::Major tnspA, GMMA::Major tnspB,
+          GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x144x16_F16F16F16_SS = SM90::GMMA::MMA_64x144x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+
+// MMA_Traits for that atom: 64x144x16 tile, half_t values, A and B fragments are GMMA smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x144x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  // Tile extents (M, N, K), thread-id layout, and the A/B/C operand layouts.
+  using Shape_MNK = Shape<_64,_144,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 16>;
+  using BLayout   = GMMA::ABLayout<144, 16>;
+  using CLayout   = GMMA::CLayout_64x144;
+
+  // Value types of the A, B, C and D operands.
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;  // parameterized on tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // parameterized on tnspB
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for the SM90 GMMA atom MMA_64x144x16_F16F16F16_RS; scaleA/scaleB default to ScaleIn::One.
+template <GMMA::Major tnspA, GMMA::Major tnspB,
+          GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x144x16_F16F16F16_RS = SM90::GMMA::MMA_64x144x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
+
+// MMA_Traits for that atom: 64x144x16 tile, half_t values; only B gets a GMMA smem_desc fragment.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x144x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  // Tile extents (M, N, K), thread-id layout, and the A/B/C operand layouts.
+  using Shape_MNK = Shape<_64,_144,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<144, 16>;
+  using CLayout   = GMMA::CLayout_64x144;
+
+  // Value types of the A, B, C and D operands.
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // no FrgTypeA is declared for this variant
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for the SM90 GMMA atom MMA_64x152x16_F16F16F16_SS; scaleA/scaleB default to ScaleIn::One.
+template <GMMA::Major tnspA, GMMA::Major tnspB,
+          GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x152x16_F16F16F16_SS = SM90::GMMA::MMA_64x152x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+
+// MMA_Traits for that atom: 64x152x16 tile, half_t values, A and B fragments are GMMA smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x152x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  // Tile extents (M, N, K), thread-id layout, and the A/B/C operand layouts.
+  using Shape_MNK = Shape<_64,_152,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 16>;
+  using BLayout   = GMMA::ABLayout<152, 16>;
+  using CLayout   = GMMA::CLayout_64x152;
+
+  // Value types of the A, B, C and D operands.
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;  // parameterized on tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // parameterized on tnspB
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for the SM90 GMMA atom MMA_64x152x16_F16F16F16_RS; scaleA/scaleB default to ScaleIn::One.
+template <GMMA::Major tnspA, GMMA::Major tnspB,
+          GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x152x16_F16F16F16_RS = SM90::GMMA::MMA_64x152x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
+
+// MMA_Traits for that atom: 64x152x16 tile, half_t values; only B gets a GMMA smem_desc fragment.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x152x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  // Tile extents (M, N, K), thread-id layout, and the A/B/C operand layouts.
+  using Shape_MNK = Shape<_64,_152,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<152, 16>;
+  using CLayout   = GMMA::CLayout_64x152;
+
+  // Value types of the A, B, C and D operands.
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // no FrgTypeA is declared for this variant
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for the SM90 GMMA atom MMA_64x160x16_F16F16F16_SS; scaleA/scaleB default to ScaleIn::One.
+template <GMMA::Major tnspA, GMMA::Major tnspB,
+          GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x160x16_F16F16F16_SS = SM90::GMMA::MMA_64x160x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+
+// MMA_Traits for that atom: 64x160x16 tile, half_t values, A and B fragments are GMMA smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x160x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  // Tile extents (M, N, K), thread-id layout, and the A/B/C operand layouts.
+  using Shape_MNK = Shape<_64,_160,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 16>;
+  using BLayout   = GMMA::ABLayout<160, 16>;
+  using CLayout   = GMMA::CLayout_64x160;
+
+  // Value types of the A, B, C and D operands.
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;  // parameterized on tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // parameterized on tnspB
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for the SM90 GMMA atom MMA_64x160x16_F16F16F16_RS; scaleA/scaleB default to ScaleIn::One.
+template <GMMA::Major tnspA, GMMA::Major tnspB,
+          GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x160x16_F16F16F16_RS = SM90::GMMA::MMA_64x160x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
+
+// MMA_Traits for that atom: 64x160x16 tile, half_t values; only B gets a GMMA smem_desc fragment.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x160x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  // Tile extents (M, N, K), thread-id layout, and the A/B/C operand layouts.
+  using Shape_MNK = Shape<_64,_160,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<160, 16>;
+  using CLayout   = GMMA::CLayout_64x160;
+
+  // Value types of the A, B, C and D operands.
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // no FrgTypeA is declared for this variant
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for the SM90 GMMA atom MMA_64x168x16_F16F16F16_SS; scaleA/scaleB default to ScaleIn::One.
+template <GMMA::Major tnspA, GMMA::Major tnspB,
+          GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x168x16_F16F16F16_SS = SM90::GMMA::MMA_64x168x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+
+// MMA_Traits for that atom: 64x168x16 tile, half_t values, A and B fragments are GMMA smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x168x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  // Tile extents (M, N, K), thread-id layout, and the A/B/C operand layouts.
+  using Shape_MNK = Shape<_64,_168,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 16>;
+  using BLayout   = GMMA::ABLayout<168, 16>;
+  using CLayout   = GMMA::CLayout_64x168;
+
+  // Value types of the A, B, C and D operands.
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;  // parameterized on tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // parameterized on tnspB
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for the SM90 GMMA atom MMA_64x168x16_F16F16F16_RS; scaleA/scaleB default to ScaleIn::One.
+template <GMMA::Major tnspA, GMMA::Major tnspB,
+          GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x168x16_F16F16F16_RS = SM90::GMMA::MMA_64x168x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
+
+// MMA_Traits for that atom: 64x168x16 tile, half_t values; only B gets a GMMA smem_desc fragment.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x168x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  // Tile extents (M, N, K), thread-id layout, and the A/B/C operand layouts.
+  using Shape_MNK = Shape<_64,_168,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<168, 16>;
+  using CLayout   = GMMA::CLayout_64x168;
+
+  // Value types of the A, B, C and D operands.
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // no FrgTypeA is declared for this variant
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for the SM90 GMMA atom MMA_64x176x16_F16F16F16_SS; scaleA/scaleB default to ScaleIn::One.
+template <GMMA::Major tnspA, GMMA::Major tnspB,
+          GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x176x16_F16F16F16_SS = SM90::GMMA::MMA_64x176x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+
+// MMA_Traits for that atom: 64x176x16 tile, half_t values, A and B fragments are GMMA smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x176x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  // Tile extents (M, N, K), thread-id layout, and the A/B/C operand layouts.
+  using Shape_MNK = Shape<_64,_176,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 16>;
+  using BLayout   = GMMA::ABLayout<176, 16>;
+  using CLayout   = GMMA::CLayout_64x176;
+
+  // Value types of the A, B, C and D operands.
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;  // parameterized on tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // parameterized on tnspB
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for the SM90 GMMA atom MMA_64x176x16_F16F16F16_RS; scaleA/scaleB default to ScaleIn::One.
+template <GMMA::Major tnspA, GMMA::Major tnspB,
+          GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x176x16_F16F16F16_RS = SM90::GMMA::MMA_64x176x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
+
+// MMA_Traits for that atom: 64x176x16 tile, half_t values; only B gets a GMMA smem_desc fragment.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x176x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  // Tile extents (M, N, K), thread-id layout, and the A/B/C operand layouts.
+  using Shape_MNK = Shape<_64,_176,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<176, 16>;
+  using CLayout   = GMMA::CLayout_64x176;
+
+  // Value types of the A, B, C and D operands.
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // no FrgTypeA is declared for this variant
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for the SM90 GMMA atom MMA_64x184x16_F16F16F16_SS; scaleA/scaleB default to ScaleIn::One.
+template <GMMA::Major tnspA, GMMA::Major tnspB,
+          GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x184x16_F16F16F16_SS = SM90::GMMA::MMA_64x184x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+
+// MMA_Traits for that atom: 64x184x16 tile, half_t values, A and B fragments are GMMA smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x184x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  // Tile extents (M, N, K), thread-id layout, and the A/B/C operand layouts.
+  using Shape_MNK = Shape<_64,_184,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 16>;
+  using BLayout   = GMMA::ABLayout<184, 16>;
+  using CLayout   = GMMA::CLayout_64x184;
+
+  // Value types of the A, B, C and D operands.
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;  // parameterized on tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // parameterized on tnspB
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for the SM90 GMMA atom MMA_64x184x16_F16F16F16_RS; scaleA/scaleB default to ScaleIn::One.
+template <GMMA::Major tnspA, GMMA::Major tnspB,
+          GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x184x16_F16F16F16_RS = SM90::GMMA::MMA_64x184x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
+
+// MMA_Traits for that atom: 64x184x16 tile, half_t values; only B gets a GMMA smem_desc fragment.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x184x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  // Tile extents (M, N, K), thread-id layout, and the A/B/C operand layouts.
+  using Shape_MNK = Shape<_64,_184,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<184, 16>;
+  using CLayout   = GMMA::CLayout_64x184;
+
+  // Value types of the A, B, C and D operands.
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // no FrgTypeA is declared for this variant
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for the SM90 GMMA atom MMA_64x200x16_F16F16F16_SS; scaleA/scaleB default to ScaleIn::One.
+template <GMMA::Major tnspA, GMMA::Major tnspB,
+          GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x200x16_F16F16F16_SS = SM90::GMMA::MMA_64x200x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+
+// MMA_Traits for that atom: 64x200x16 tile, half_t values, A and B fragments are GMMA smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x200x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  // Tile extents (M, N, K), thread-id layout, and the A/B/C operand layouts.
+  using Shape_MNK = Shape<_64,_200,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 16>;
+  using BLayout   = GMMA::ABLayout<200, 16>;
+  using CLayout   = GMMA::CLayout_64x200;
+
+  // Value types of the A, B, C and D operands.
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;  // parameterized on tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // parameterized on tnspB
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for the SM90 GMMA atom MMA_64x200x16_F16F16F16_RS; scaleA/scaleB default to ScaleIn::One.
+template <GMMA::Major tnspA, GMMA::Major tnspB,
+          GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x200x16_F16F16F16_RS = SM90::GMMA::MMA_64x200x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
+
+// MMA_Traits for that atom: 64x200x16 tile, half_t values; only B gets a GMMA smem_desc fragment.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x200x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  // Tile extents (M, N, K), thread-id layout, and the A/B/C operand layouts.
+  using Shape_MNK = Shape<_64,_200,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<200, 16>;
+  using CLayout   = GMMA::CLayout_64x200;
+
+  // Value types of the A, B, C and D operands.
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // no FrgTypeA is declared for this variant
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for the SM90 GMMA atom MMA_64x208x16_F16F16F16_SS; scaleA/scaleB default to ScaleIn::One.
+template <GMMA::Major tnspA, GMMA::Major tnspB,
+          GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x208x16_F16F16F16_SS = SM90::GMMA::MMA_64x208x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+
+// MMA_Traits for that atom: 64x208x16 tile, half_t values, A and B fragments are GMMA smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x208x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  // Tile extents (M, N, K), thread-id layout, and the A/B/C operand layouts.
+  using Shape_MNK = Shape<_64,_208,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 16>;
+  using BLayout   = GMMA::ABLayout<208, 16>;
+  using CLayout   = GMMA::CLayout_64x208;
+
+  // Value types of the A, B, C and D operands.
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;  // parameterized on tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // parameterized on tnspB
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for the SM90 GMMA atom MMA_64x208x16_F16F16F16_RS; scaleA/scaleB default to ScaleIn::One.
+template <GMMA::Major tnspA, GMMA::Major tnspB,
+          GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x208x16_F16F16F16_RS = SM90::GMMA::MMA_64x208x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
+
+// MMA_Traits for that atom: 64x208x16 tile, half_t values; only B gets a GMMA smem_desc fragment.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x208x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  // Tile extents (M, N, K), thread-id layout, and the A/B/C operand layouts.
+  using Shape_MNK = Shape<_64,_208,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<208, 16>;
+  using CLayout   = GMMA::CLayout_64x208;
+
+  // Value types of the A, B, C and D operands.
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // no FrgTypeA is declared for this variant
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for the SM90 GMMA atom MMA_64x216x16_F16F16F16_SS; scaleA/scaleB default to ScaleIn::One.
+template <GMMA::Major tnspA, GMMA::Major tnspB,
+          GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x216x16_F16F16F16_SS = SM90::GMMA::MMA_64x216x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+
+// MMA_Traits for that atom: 64x216x16 tile, half_t values, A and B fragments are GMMA smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x216x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  // Tile extents (M, N, K), thread-id layout, and the A/B/C operand layouts.
+  using Shape_MNK = Shape<_64,_216,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 16>;
+  using BLayout   = GMMA::ABLayout<216, 16>;
+  using CLayout   = GMMA::CLayout_64x216;
+
+  // Value types of the A, B, C and D operands.
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;  // parameterized on tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // parameterized on tnspB
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for the SM90 GMMA atom MMA_64x216x16_F16F16F16_RS; scaleA/scaleB default to ScaleIn::One.
+template <GMMA::Major tnspA, GMMA::Major tnspB,
+          GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x216x16_F16F16F16_RS = SM90::GMMA::MMA_64x216x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
+
+// MMA_Traits for that atom: 64x216x16 tile, half_t values; only B gets a GMMA smem_desc fragment.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x216x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  // Tile extents (M, N, K), thread-id layout, and the A/B/C operand layouts.
+  using Shape_MNK = Shape<_64,_216,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<216, 16>;
+  using CLayout   = GMMA::CLayout_64x216;
+
+  // Value types of the A, B, C and D operands.
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // no FrgTypeA is declared for this variant
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for the SM90 GMMA atom MMA_64x224x16_F16F16F16_SS; scaleA/scaleB default to ScaleIn::One.
+template <GMMA::Major tnspA, GMMA::Major tnspB,
+          GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x224x16_F16F16F16_SS = SM90::GMMA::MMA_64x224x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+
+// MMA_Traits for that atom: 64x224x16 tile, half_t values, A and B fragments are GMMA smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x224x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  // Tile extents (M, N, K), thread-id layout, and the A/B/C operand layouts.
+  using Shape_MNK = Shape<_64,_224,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 16>;
+  using BLayout   = GMMA::ABLayout<224, 16>;
+  using CLayout   = GMMA::CLayout_64x224;
+
+  // Value types of the A, B, C and D operands.
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;  // parameterized on tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // parameterized on tnspB
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for the SM90 GMMA atom MMA_64x224x16_F16F16F16_RS; scaleA/scaleB default to ScaleIn::One.
+template <GMMA::Major tnspA, GMMA::Major tnspB,
+          GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x224x16_F16F16F16_RS = SM90::GMMA::MMA_64x224x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
+
+// MMA_Traits for that atom: 64x224x16 tile, half_t values; only B gets a GMMA smem_desc fragment.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x224x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  // Tile extents (M, N, K), thread-id layout, and the A/B/C operand layouts.
+  using Shape_MNK = Shape<_64,_224,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<224, 16>;
+  using CLayout   = GMMA::CLayout_64x224;
+
+  // Value types of the A, B, C and D operands.
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // no FrgTypeA is declared for this variant
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for the SM90 GMMA atom MMA_64x232x16_F16F16F16_SS; scaleA/scaleB default to ScaleIn::One.
+template <GMMA::Major tnspA, GMMA::Major tnspB,
+          GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x232x16_F16F16F16_SS = SM90::GMMA::MMA_64x232x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+
+// MMA_Traits for that atom: 64x232x16 tile, half_t values, A and B fragments are GMMA smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x232x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  // Tile extents (M, N, K), thread-id layout, and the A/B/C operand layouts.
+  using Shape_MNK = Shape<_64,_232,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 16>;
+  using BLayout   = GMMA::ABLayout<232, 16>;
+  using CLayout   = GMMA::CLayout_64x232;
+
+  // Value types of the A, B, C and D operands.
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;  // parameterized on tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // parameterized on tnspB
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for the SM90 GMMA atom MMA_64x232x16_F16F16F16_RS; scaleA/scaleB default to ScaleIn::One.
+template <GMMA::Major tnspA, GMMA::Major tnspB,
+          GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x232x16_F16F16F16_RS = SM90::GMMA::MMA_64x232x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
+
+// MMA_Traits for that atom: 64x232x16 tile, half_t values; only B gets a GMMA smem_desc fragment.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x232x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  // Tile extents (M, N, K), thread-id layout, and the A/B/C operand layouts.
+  using Shape_MNK = Shape<_64,_232,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<232, 16>;
+  using CLayout   = GMMA::CLayout_64x232;
+
+  // Value types of the A, B, C and D operands.
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // no FrgTypeA is declared for this variant
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for the SM90 GMMA atom MMA_64x240x16_F16F16F16_SS; scaleA/scaleB default to ScaleIn::One.
+template <GMMA::Major tnspA, GMMA::Major tnspB,
+          GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x240x16_F16F16F16_SS = SM90::GMMA::MMA_64x240x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+
+// MMA_Traits for that atom: 64x240x16 tile, half_t values, A and B fragments are GMMA smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x240x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  // Tile extents (M, N, K), thread-id layout, and the A/B/C operand layouts.
+  using Shape_MNK = Shape<_64,_240,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 16>;
+  using BLayout   = GMMA::ABLayout<240, 16>;
+  using CLayout   = GMMA::CLayout_64x240;
+
+  // Value types of the A, B, C and D operands.
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;  // parameterized on tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // parameterized on tnspB
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for the SM90 GMMA atom MMA_64x240x16_F16F16F16_RS; scaleA/scaleB default to ScaleIn::One.
+template <GMMA::Major tnspA, GMMA::Major tnspB,
+          GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x240x16_F16F16F16_RS = SM90::GMMA::MMA_64x240x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
+
+// MMA_Traits for that atom: 64x240x16 tile, half_t values; only B gets a GMMA smem_desc fragment.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x240x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  // Tile extents (M, N, K), thread-id layout, and the A/B/C operand layouts.
+  using Shape_MNK = Shape<_64,_240,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<240, 16>;
+  using CLayout   = GMMA::CLayout_64x240;
+
+  // Value types of the A, B, C and D operands.
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // no FrgTypeA is declared for this variant
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for the SM90 GMMA atom MMA_64x248x16_F16F16F16_SS; scaleA/scaleB default to ScaleIn::One.
+template <GMMA::Major tnspA, GMMA::Major tnspB,
+          GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x248x16_F16F16F16_SS = SM90::GMMA::MMA_64x248x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+
+// MMA_Traits for that atom: 64x248x16 tile, half_t values, A and B fragments are GMMA smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x248x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  // Tile extents (M, N, K), thread-id layout, and the A/B/C operand layouts.
+  using Shape_MNK = Shape<_64,_248,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 16>;
+  using BLayout   = GMMA::ABLayout<248, 16>;
+  using CLayout   = GMMA::CLayout_64x248;
+
+  // Value types of the A, B, C and D operands.
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;  // parameterized on tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // parameterized on tnspB
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for the SM90 GMMA atom MMA_64x248x16_F16F16F16_RS; scaleA/scaleB default to ScaleIn::One.
+template <GMMA::Major tnspA, GMMA::Major tnspB,
+          GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x248x16_F16F16F16_RS = SM90::GMMA::MMA_64x248x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
+
+// MMA_Traits for that atom: 64x248x16 tile, half_t values; only B gets a GMMA smem_desc fragment.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x248x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  // Tile extents (M, N, K), thread-id layout, and the A/B/C operand layouts.
+  using Shape_MNK = Shape<_64,_248,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<248, 16>;
+  using CLayout   = GMMA::CLayout_64x248;
+
+  // Value types of the A, B, C and D operands.
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // no FrgTypeA is declared for this variant
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for the SM90 GMMA atom MMA_64x24x16_F32F16F16_SS; scaleA/scaleB default to ScaleIn::One.
+template <GMMA::Major tnspA, GMMA::Major tnspB,
+          GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x24x16_F32F16F16_SS = SM90::GMMA::MMA_64x24x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+
+// MMA_Traits for that atom: 64x24x16 tile, float C/D, half_t A/B, A and B fragments are smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x24x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  // Tile extents (M, N, K), thread-id layout, and the A/B/C operand layouts.
+  using Shape_MNK = Shape<_64,_24,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 16>;
+  using BLayout   = GMMA::ABLayout< 24, 16>;
+  using CLayout   = GMMA::CLayout_64x24;
+
+  // Value types: half_t inputs (A, B), float for C and D.
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;  // parameterized on tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // parameterized on tnspB
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias for the SM90 GMMA atom MMA_64x24x16_F32F16F16_RS; scaleA/scaleB default to ScaleIn::One.
+template <GMMA::Major tnspA, GMMA::Major tnspB,
+          GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x24x16_F32F16F16_RS = SM90::GMMA::MMA_64x24x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
+
+// MMA_Traits for that atom: 64x24x16 tile, float C/D, half_t A/B; only B gets a smem_desc fragment.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x24x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  // Tile extents (M, N, K), thread-id layout, and the A/B/C operand layouts.
+  using Shape_MNK = Shape<_64,_24,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout< 24, 16>;
+  using CLayout   = GMMA::CLayout_64x24;
+
+  // Value types: half_t inputs (A, B), float for C and D.
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // no FrgTypeA is declared for this variant
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias for the SM90::GMMA op; all four parameters are forwarded unchanged
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x40x16_F32F16F16_SS = SM90::GMMA::MMA_64x40x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+// MMA_Traits for SM90_64x40x16_F32F16F16_SS: float C/D, half_t A/B; A and B fragments are both GMMA::smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x40x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_40,_16>;  // M=64, N=40, K=16
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 16>;  // A: M x K
+  using BLayout = GMMA::ABLayout< 40, 16>;  // B: N x K
+  using CLayout = GMMA::CLayout_64x40;  // C: M x N
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias for the SM90::GMMA op; all four parameters are forwarded unchanged
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x40x16_F32F16F16_RS = SM90::GMMA::MMA_64x40x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
+// MMA_Traits for SM90_64x40x16_F32F16F16_RS: float C/D, half_t A/B; only the B fragment is a GMMA::smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x40x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // no FrgTypeA here; NOTE(review): presumably A is register-sourced, confirm
+
+  using Shape_MNK = Shape<_64,_40,_16>;  // M=64, N=40, K=16
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x16;  // A: dedicated 64x16 layout instead of ABLayout
+  using BLayout = GMMA::ABLayout< 40, 16>;  // B: N x K
+  using CLayout = GMMA::CLayout_64x40;  // C: M x N
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias for the SM90::GMMA op; all four parameters are forwarded unchanged
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x48x16_F32F16F16_SS = SM90::GMMA::MMA_64x48x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+// MMA_Traits for SM90_64x48x16_F32F16F16_SS: float C/D, half_t A/B; A and B fragments are both GMMA::smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x48x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_48,_16>;  // M=64, N=48, K=16
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 16>;  // A: M x K
+  using BLayout = GMMA::ABLayout< 48, 16>;  // B: N x K
+  using CLayout = GMMA::CLayout_64x48;  // C: M x N
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias for the SM90::GMMA op; all four parameters are forwarded unchanged
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x48x16_F32F16F16_RS = SM90::GMMA::MMA_64x48x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
+// MMA_Traits for SM90_64x48x16_F32F16F16_RS: float C/D, half_t A/B; only the B fragment is a GMMA::smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x48x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // no FrgTypeA here; NOTE(review): presumably A is register-sourced, confirm
+
+  using Shape_MNK = Shape<_64,_48,_16>;  // M=64, N=48, K=16
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x16;  // A: dedicated 64x16 layout instead of ABLayout
+  using BLayout = GMMA::ABLayout< 48, 16>;  // B: N x K
+  using CLayout = GMMA::CLayout_64x48;  // C: M x N
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias for the SM90::GMMA op; all four parameters are forwarded unchanged
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x56x16_F32F16F16_SS = SM90::GMMA::MMA_64x56x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+// MMA_Traits for SM90_64x56x16_F32F16F16_SS: float C/D, half_t A/B; A and B fragments are both GMMA::smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x56x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_56,_16>;  // M=64, N=56, K=16
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 16>;  // A: M x K
+  using BLayout = GMMA::ABLayout< 56, 16>;  // B: N x K
+  using CLayout = GMMA::CLayout_64x56;  // C: M x N
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias for the SM90::GMMA op; all four parameters are forwarded unchanged
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x56x16_F32F16F16_RS = SM90::GMMA::MMA_64x56x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
+// MMA_Traits for SM90_64x56x16_F32F16F16_RS: float C/D, half_t A/B; only the B fragment is a GMMA::smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x56x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // no FrgTypeA here; NOTE(review): presumably A is register-sourced, confirm
+
+  using Shape_MNK = Shape<_64,_56,_16>;  // M=64, N=56, K=16
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x16;  // A: dedicated 64x16 layout instead of ABLayout
+  using BLayout = GMMA::ABLayout< 56, 16>;  // B: N x K
+  using CLayout = GMMA::CLayout_64x56;  // C: M x N
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias for the SM90::GMMA op; all four parameters are forwarded unchanged
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x72x16_F32F16F16_SS = SM90::GMMA::MMA_64x72x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+// MMA_Traits for SM90_64x72x16_F32F16F16_SS: float C/D, half_t A/B; A and B fragments are both GMMA::smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x72x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_72,_16>;  // M=64, N=72, K=16
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 16>;  // A: M x K
+  using BLayout = GMMA::ABLayout< 72, 16>;  // B: N x K
+  using CLayout = GMMA::CLayout_64x72;  // C: M x N
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias for the SM90::GMMA op; all four parameters are forwarded unchanged
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x72x16_F32F16F16_RS = SM90::GMMA::MMA_64x72x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
+// MMA_Traits for SM90_64x72x16_F32F16F16_RS: float C/D, half_t A/B; only the B fragment is a GMMA::smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x72x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // no FrgTypeA here; NOTE(review): presumably A is register-sourced, confirm
+
+  using Shape_MNK = Shape<_64,_72,_16>;  // M=64, N=72, K=16
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x16;  // A: dedicated 64x16 layout instead of ABLayout
+  using BLayout = GMMA::ABLayout< 72, 16>;  // B: N x K
+  using CLayout = GMMA::CLayout_64x72;  // C: M x N
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias for the SM90::GMMA op; all four parameters are forwarded unchanged
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x80x16_F32F16F16_SS = SM90::GMMA::MMA_64x80x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+// MMA_Traits for SM90_64x80x16_F32F16F16_SS: float C/D, half_t A/B; A and B fragments are both GMMA::smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x80x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_80,_16>;  // M=64, N=80, K=16
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 16>;  // A: M x K
+  using BLayout = GMMA::ABLayout< 80, 16>;  // B: N x K
+  using CLayout = GMMA::CLayout_64x80;  // C: M x N
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias for the SM90::GMMA op; all four parameters are forwarded unchanged
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x80x16_F32F16F16_RS = SM90::GMMA::MMA_64x80x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
+// MMA_Traits for SM90_64x80x16_F32F16F16_RS: float C/D, half_t A/B; only the B fragment is a GMMA::smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x80x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // no FrgTypeA here; NOTE(review): presumably A is register-sourced, confirm
+
+  using Shape_MNK = Shape<_64,_80,_16>;  // M=64, N=80, K=16
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x16;  // A: dedicated 64x16 layout instead of ABLayout
+  using BLayout = GMMA::ABLayout< 80, 16>;  // B: N x K
+  using CLayout = GMMA::CLayout_64x80;  // C: M x N
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias for the SM90::GMMA op; all four parameters are forwarded unchanged
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x88x16_F32F16F16_SS = SM90::GMMA::MMA_64x88x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+// MMA_Traits for SM90_64x88x16_F32F16F16_SS: float C/D, half_t A/B; A and B fragments are both GMMA::smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x88x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_88,_16>;  // M=64, N=88, K=16
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 16>;  // A: M x K
+  using BLayout = GMMA::ABLayout< 88, 16>;  // B: N x K
+  using CLayout = GMMA::CLayout_64x88;  // C: M x N
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias for the SM90::GMMA op; all four parameters are forwarded unchanged
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x88x16_F32F16F16_RS = SM90::GMMA::MMA_64x88x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
+// MMA_Traits for SM90_64x88x16_F32F16F16_RS: float C/D, half_t A/B; only the B fragment is a GMMA::smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x88x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // no FrgTypeA here; NOTE(review): presumably A is register-sourced, confirm
+
+  using Shape_MNK = Shape<_64,_88,_16>;  // M=64, N=88, K=16
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x16;  // A: dedicated 64x16 layout instead of ABLayout
+  using BLayout = GMMA::ABLayout< 88, 16>;  // B: N x K
+  using CLayout = GMMA::CLayout_64x88;  // C: M x N
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias for the SM90::GMMA op; all four parameters are forwarded unchanged
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x104x16_F32F16F16_SS = SM90::GMMA::MMA_64x104x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+// MMA_Traits for SM90_64x104x16_F32F16F16_SS: float C/D, half_t A/B; A and B fragments are both GMMA::smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x104x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_104,_16>;  // M=64, N=104, K=16
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 16>;  // A: M x K
+  using BLayout = GMMA::ABLayout<104, 16>;  // B: N x K
+  using CLayout = GMMA::CLayout_64x104;  // C: M x N
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias for the SM90::GMMA op; all four parameters are forwarded unchanged
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x104x16_F32F16F16_RS = SM90::GMMA::MMA_64x104x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
+// MMA_Traits for SM90_64x104x16_F32F16F16_RS: float C/D, half_t A/B; only the B fragment is a GMMA::smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x104x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // no FrgTypeA here; NOTE(review): presumably A is register-sourced, confirm
+
+  using Shape_MNK = Shape<_64,_104,_16>;  // M=64, N=104, K=16
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x16;  // A: dedicated 64x16 layout instead of ABLayout
+  using BLayout = GMMA::ABLayout<104, 16>;  // B: N x K
+  using CLayout = GMMA::CLayout_64x104;  // C: M x N
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias for the SM90::GMMA op; all four parameters are forwarded unchanged
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x112x16_F32F16F16_SS = SM90::GMMA::MMA_64x112x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+// MMA_Traits for SM90_64x112x16_F32F16F16_SS: float C/D, half_t A/B; A and B fragments are both GMMA::smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x112x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_112,_16>;  // M=64, N=112, K=16
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 16>;  // A: M x K
+  using BLayout = GMMA::ABLayout<112, 16>;  // B: N x K
+  using CLayout = GMMA::CLayout_64x112;  // C: M x N
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias for the SM90::GMMA op; all four parameters are forwarded unchanged
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x112x16_F32F16F16_RS = SM90::GMMA::MMA_64x112x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
+// MMA_Traits for SM90_64x112x16_F32F16F16_RS: float C/D, half_t A/B; only the B fragment is a GMMA::smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x112x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // no FrgTypeA here; NOTE(review): presumably A is register-sourced, confirm
+
+  using Shape_MNK = Shape<_64,_112,_16>;  // M=64, N=112, K=16
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x16;  // A: dedicated 64x16 layout instead of ABLayout
+  using BLayout = GMMA::ABLayout<112, 16>;  // B: N x K
+  using CLayout = GMMA::CLayout_64x112;  // C: M x N
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias for the SM90::GMMA op; all four parameters are forwarded unchanged
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x120x16_F32F16F16_SS = SM90::GMMA::MMA_64x120x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+// MMA_Traits for SM90_64x120x16_F32F16F16_SS: float C/D, half_t A/B; A and B fragments are both GMMA::smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x120x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_120,_16>;  // M=64, N=120, K=16
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 16>;  // A: M x K
+  using BLayout = GMMA::ABLayout<120, 16>;  // B: N x K
+  using CLayout = GMMA::CLayout_64x120;  // C: M x N
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias for the SM90::GMMA op; all four parameters are forwarded unchanged
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x120x16_F32F16F16_RS = SM90::GMMA::MMA_64x120x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
+// MMA_Traits for SM90_64x120x16_F32F16F16_RS: float C/D, half_t A/B; only the B fragment is a GMMA::smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x120x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // no FrgTypeA here; NOTE(review): presumably A is register-sourced, confirm
+
+  using Shape_MNK = Shape<_64,_120,_16>;  // M=64, N=120, K=16
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x16;  // A: dedicated 64x16 layout instead of ABLayout
+  using BLayout = GMMA::ABLayout<120, 16>;  // B: N x K
+  using CLayout = GMMA::CLayout_64x120;  // C: M x N
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias for the SM90::GMMA op; all four parameters are forwarded unchanged
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x136x16_F32F16F16_SS = SM90::GMMA::MMA_64x136x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+// MMA_Traits for SM90_64x136x16_F32F16F16_SS: float C/D, half_t A/B; A and B fragments are both GMMA::smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x136x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_136,_16>;  // M=64, N=136, K=16
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 16>;  // A: M x K
+  using BLayout = GMMA::ABLayout<136, 16>;  // B: N x K
+  using CLayout = GMMA::CLayout_64x136;  // C: M x N
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias for the SM90::GMMA op; all four parameters are forwarded unchanged
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x136x16_F32F16F16_RS = SM90::GMMA::MMA_64x136x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
+// MMA_Traits for SM90_64x136x16_F32F16F16_RS: float C/D, half_t A/B; only the B fragment is a GMMA::smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x136x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // no FrgTypeA here; NOTE(review): presumably A is register-sourced, confirm
+
+  using Shape_MNK = Shape<_64,_136,_16>;  // M=64, N=136, K=16
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x16;  // A: dedicated 64x16 layout instead of ABLayout
+  using BLayout = GMMA::ABLayout<136, 16>;  // B: N x K
+  using CLayout = GMMA::CLayout_64x136;  // C: M x N
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias for the SM90::GMMA op; all four parameters are forwarded unchanged
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x144x16_F32F16F16_SS = SM90::GMMA::MMA_64x144x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+// MMA_Traits for SM90_64x144x16_F32F16F16_SS: float C/D, half_t A/B; A and B fragments are both GMMA::smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x144x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_144,_16>;  // M=64, N=144, K=16
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 16>;  // A: M x K
+  using BLayout = GMMA::ABLayout<144, 16>;  // B: N x K
+  using CLayout = GMMA::CLayout_64x144;  // C: M x N
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias for the SM90::GMMA op; all four parameters are forwarded unchanged
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x144x16_F32F16F16_RS = SM90::GMMA::MMA_64x144x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
+// MMA_Traits for SM90_64x144x16_F32F16F16_RS: float C/D, half_t A/B; only the B fragment is a GMMA::smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x144x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // no FrgTypeA here; NOTE(review): presumably A is register-sourced, confirm
+
+  using Shape_MNK = Shape<_64,_144,_16>;  // M=64, N=144, K=16
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x16;  // A: dedicated 64x16 layout instead of ABLayout
+  using BLayout = GMMA::ABLayout<144, 16>;  // B: N x K
+  using CLayout = GMMA::CLayout_64x144;  // C: M x N
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias for the SM90::GMMA op; all four parameters are forwarded unchanged
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x152x16_F32F16F16_SS = SM90::GMMA::MMA_64x152x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+// MMA_Traits for SM90_64x152x16_F32F16F16_SS: float C/D, half_t A/B; A and B fragments are both GMMA::smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x152x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_152,_16>;  // M=64, N=152, K=16
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 16>;  // A: M x K
+  using BLayout = GMMA::ABLayout<152, 16>;  // B: N x K
+  using CLayout = GMMA::CLayout_64x152;  // C: M x N
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias for the SM90::GMMA op; all four parameters are forwarded unchanged
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x152x16_F32F16F16_RS = SM90::GMMA::MMA_64x152x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
+// MMA_Traits for SM90_64x152x16_F32F16F16_RS: float C/D, half_t A/B; only the B fragment is a GMMA::smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x152x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // no FrgTypeA here; NOTE(review): presumably A is register-sourced, confirm
+
+  using Shape_MNK = Shape<_64,_152,_16>;  // M=64, N=152, K=16
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x16;  // A: dedicated 64x16 layout instead of ABLayout
+  using BLayout = GMMA::ABLayout<152, 16>;  // B: N x K
+  using CLayout = GMMA::CLayout_64x152;  // C: M x N
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias for the SM90::GMMA op; all four parameters are forwarded unchanged
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x160x16_F32F16F16_SS = SM90::GMMA::MMA_64x160x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+// MMA_Traits for SM90_64x160x16_F32F16F16_SS: float C/D, half_t A/B; A and B fragments are both GMMA::smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x160x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_160,_16>;  // M=64, N=160, K=16
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 16>;  // A: M x K
+  using BLayout = GMMA::ABLayout<160, 16>;  // B: N x K
+  using CLayout = GMMA::CLayout_64x160;  // C: M x N
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias for the SM90::GMMA op; all four parameters are forwarded unchanged
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x160x16_F32F16F16_RS = SM90::GMMA::MMA_64x160x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
+// MMA_Traits for SM90_64x160x16_F32F16F16_RS: float C/D, half_t A/B; only the B fragment is a GMMA::smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x160x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // no FrgTypeA here; NOTE(review): presumably A is register-sourced, confirm
+
+  using Shape_MNK = Shape<_64,_160,_16>;  // M=64, N=160, K=16
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x16;  // A: dedicated 64x16 layout instead of ABLayout
+  using BLayout = GMMA::ABLayout<160, 16>;  // B: N x K
+  using CLayout = GMMA::CLayout_64x160;  // C: M x N
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias for the SM90::GMMA op; all four parameters are forwarded unchanged
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x168x16_F32F16F16_SS = SM90::GMMA::MMA_64x168x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+// MMA_Traits for SM90_64x168x16_F32F16F16_SS: float C/D, half_t A/B; A and B fragments are both GMMA::smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x168x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_168,_16>;  // M=64, N=168, K=16
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 16>;  // A: M x K
+  using BLayout = GMMA::ABLayout<168, 16>;  // B: N x K
+  using CLayout = GMMA::CLayout_64x168;  // C: M x N
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias for the SM90::GMMA op; all four parameters are forwarded unchanged
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x168x16_F32F16F16_RS = SM90::GMMA::MMA_64x168x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
+// MMA_Traits for SM90_64x168x16_F32F16F16_RS: float C/D, half_t A/B; only the B fragment is a GMMA::smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x168x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // no FrgTypeA here; NOTE(review): presumably A is register-sourced, confirm
+
+  using Shape_MNK = Shape<_64,_168,_16>;  // M=64, N=168, K=16
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x16;  // A: dedicated 64x16 layout instead of ABLayout
+  using BLayout = GMMA::ABLayout<168, 16>;  // B: N x K
+  using CLayout = GMMA::CLayout_64x168;  // C: M x N
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias for the SM90::GMMA op; all four parameters are forwarded unchanged
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x176x16_F32F16F16_SS = SM90::GMMA::MMA_64x176x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+// MMA_Traits for SM90_64x176x16_F32F16F16_SS: float C/D, half_t A/B; A and B fragments are both GMMA::smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x176x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_176,_16>;  // M=64, N=176, K=16
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 16>;  // A: M x K
+  using BLayout = GMMA::ABLayout<176, 16>;  // B: N x K
+  using CLayout = GMMA::CLayout_64x176;  // C: M x N
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias for the SM90::GMMA op; all four parameters are forwarded unchanged
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x176x16_F32F16F16_RS = SM90::GMMA::MMA_64x176x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
+// MMA_Traits for SM90_64x176x16_F32F16F16_RS: float C/D, half_t A/B; only the B fragment is a GMMA::smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x176x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // no FrgTypeA here; NOTE(review): presumably A is register-sourced, confirm
+
+  using Shape_MNK = Shape<_64,_176,_16>;  // M=64, N=176, K=16
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x16;  // A: dedicated 64x16 layout instead of ABLayout
+  using BLayout = GMMA::ABLayout<176, 16>;  // B: N x K
+  using CLayout = GMMA::CLayout_64x176;  // C: M x N
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias for the SM90::GMMA op; all four parameters are forwarded unchanged
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x184x16_F32F16F16_SS = SM90::GMMA::MMA_64x184x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+// MMA_Traits for SM90_64x184x16_F32F16F16_SS: float C/D, half_t A/B; A and B fragments are both GMMA::smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x184x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_184,_16>;  // M=64, N=184, K=16
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 16>;  // A: M x K
+  using BLayout = GMMA::ABLayout<184, 16>;  // B: N x K
+  using CLayout = GMMA::CLayout_64x184;  // C: M x N
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias for the SM90::GMMA op; all four parameters are forwarded unchanged
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x184x16_F32F16F16_RS = SM90::GMMA::MMA_64x184x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
+// MMA_Traits for SM90_64x184x16_F32F16F16_RS: float C/D, half_t A/B; only the B fragment is a GMMA::smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x184x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;  // no FrgTypeA here; NOTE(review): presumably A is register-sourced, confirm
+
+  using Shape_MNK = Shape<_64,_184,_16>;  // M=64, N=184, K=16
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x16;  // A: dedicated 64x16 layout instead of ABLayout
+  using BLayout = GMMA::ABLayout<184, 16>;  // B: N x K
+  using CLayout = GMMA::CLayout_64x184;  // C: M x N
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias for the SM90::GMMA op; all four parameters are forwarded unchanged
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x200x16_F32F16F16_SS = SM90::GMMA::MMA_64x200x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
+// MMA_Traits for SM90_64x200x16_F32F16F16_SS: float C/D, half_t A/B; A and B fragments are both GMMA::smem_desc.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x200x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_200,_16>;  // M=64, N=200, K=16
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 16>;  // A: M x K
+  using BLayout = GMMA::ABLayout<200, 16>;  // B: N x K
+  using CLayout = GMMA::CLayout_64x200;  // C: M x N
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x200x16_F32F16F16_RS.
+// ScaleA and ScaleB fall back to GMMA::ScaleIn::One when not supplied.
+template <GMMA::Major MajorA, GMMA::Major MajorB,
+          GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x200x16_F32F16F16_RS =
+    SM90::GMMA::MMA_64x200x16_F32F16F16_RS<MajorA, MajorB, ScaleA, ScaleB>;
+
+// MMA_Traits for the 64x200x16 RS atom: half_t A/B values, float C/D values.
+template <GMMA::Major MajorA, GMMA::Major MajorB, GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x200x16_F32F16F16_RS<MajorA, MajorB, ScaleA, ScaleB>>
+{
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only B gets a GMMA::smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<MajorB>;
+
+  using Shape_MNK = Shape<_64,_200,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<200, 16>;
+  using CLayout   = GMMA::CLayout_64x200;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x208x16_F32F16F16_SS.
+// ScaleA and ScaleB fall back to GMMA::ScaleIn::One when not supplied.
+template <GMMA::Major MajorA, GMMA::Major MajorB,
+          GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x208x16_F32F16F16_SS =
+    SM90::GMMA::MMA_64x208x16_F32F16F16_SS<MajorA, MajorB, ScaleA, ScaleB>;
+
+// MMA_Traits for the 64x208x16 SS atom: half_t A/B values, float C/D values.
+template <GMMA::Major MajorA, GMMA::Major MajorB, GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x208x16_F32F16F16_SS<MajorA, MajorB, ScaleA, ScaleB>>
+{
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Both A and B use GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<MajorA>;
+  using FrgTypeB = GMMA::smem_desc<MajorB>;
+
+  using Shape_MNK = Shape<_64,_208,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 16>;
+  using BLayout   = GMMA::ABLayout<208, 16>;
+  using CLayout   = GMMA::CLayout_64x208;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x208x16_F32F16F16_RS.
+// ScaleA and ScaleB fall back to GMMA::ScaleIn::One when not supplied.
+template <GMMA::Major MajorA, GMMA::Major MajorB,
+          GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x208x16_F32F16F16_RS =
+    SM90::GMMA::MMA_64x208x16_F32F16F16_RS<MajorA, MajorB, ScaleA, ScaleB>;
+
+// MMA_Traits for the 64x208x16 RS atom: half_t A/B values, float C/D values.
+template <GMMA::Major MajorA, GMMA::Major MajorB, GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x208x16_F32F16F16_RS<MajorA, MajorB, ScaleA, ScaleB>>
+{
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only B gets a GMMA::smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<MajorB>;
+
+  using Shape_MNK = Shape<_64,_208,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<208, 16>;
+  using CLayout   = GMMA::CLayout_64x208;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x216x16_F32F16F16_SS.
+// ScaleA and ScaleB fall back to GMMA::ScaleIn::One when not supplied.
+template <GMMA::Major MajorA, GMMA::Major MajorB,
+          GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x216x16_F32F16F16_SS =
+    SM90::GMMA::MMA_64x216x16_F32F16F16_SS<MajorA, MajorB, ScaleA, ScaleB>;
+
+// MMA_Traits for the 64x216x16 SS atom: half_t A/B values, float C/D values.
+template <GMMA::Major MajorA, GMMA::Major MajorB, GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x216x16_F32F16F16_SS<MajorA, MajorB, ScaleA, ScaleB>>
+{
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Both A and B use GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<MajorA>;
+  using FrgTypeB = GMMA::smem_desc<MajorB>;
+
+  using Shape_MNK = Shape<_64,_216,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 16>;
+  using BLayout   = GMMA::ABLayout<216, 16>;
+  using CLayout   = GMMA::CLayout_64x216;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x216x16_F32F16F16_RS.
+// ScaleA and ScaleB fall back to GMMA::ScaleIn::One when not supplied.
+template <GMMA::Major MajorA, GMMA::Major MajorB,
+          GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x216x16_F32F16F16_RS =
+    SM90::GMMA::MMA_64x216x16_F32F16F16_RS<MajorA, MajorB, ScaleA, ScaleB>;
+
+// MMA_Traits for the 64x216x16 RS atom: half_t A/B values, float C/D values.
+template <GMMA::Major MajorA, GMMA::Major MajorB, GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x216x16_F32F16F16_RS<MajorA, MajorB, ScaleA, ScaleB>>
+{
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only B gets a GMMA::smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<MajorB>;
+
+  using Shape_MNK = Shape<_64,_216,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<216, 16>;
+  using CLayout   = GMMA::CLayout_64x216;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x224x16_F32F16F16_SS.
+// ScaleA and ScaleB fall back to GMMA::ScaleIn::One when not supplied.
+template <GMMA::Major MajorA, GMMA::Major MajorB,
+          GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x224x16_F32F16F16_SS =
+    SM90::GMMA::MMA_64x224x16_F32F16F16_SS<MajorA, MajorB, ScaleA, ScaleB>;
+
+// MMA_Traits for the 64x224x16 SS atom: half_t A/B values, float C/D values.
+template <GMMA::Major MajorA, GMMA::Major MajorB, GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x224x16_F32F16F16_SS<MajorA, MajorB, ScaleA, ScaleB>>
+{
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Both A and B use GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<MajorA>;
+  using FrgTypeB = GMMA::smem_desc<MajorB>;
+
+  using Shape_MNK = Shape<_64,_224,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 16>;
+  using BLayout   = GMMA::ABLayout<224, 16>;
+  using CLayout   = GMMA::CLayout_64x224;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x224x16_F32F16F16_RS.
+// ScaleA and ScaleB fall back to GMMA::ScaleIn::One when not supplied.
+template <GMMA::Major MajorA, GMMA::Major MajorB,
+          GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x224x16_F32F16F16_RS =
+    SM90::GMMA::MMA_64x224x16_F32F16F16_RS<MajorA, MajorB, ScaleA, ScaleB>;
+
+// MMA_Traits for the 64x224x16 RS atom: half_t A/B values, float C/D values.
+template <GMMA::Major MajorA, GMMA::Major MajorB, GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x224x16_F32F16F16_RS<MajorA, MajorB, ScaleA, ScaleB>>
+{
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only B gets a GMMA::smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<MajorB>;
+
+  using Shape_MNK = Shape<_64,_224,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<224, 16>;
+  using CLayout   = GMMA::CLayout_64x224;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x232x16_F32F16F16_SS.
+// ScaleA and ScaleB fall back to GMMA::ScaleIn::One when not supplied.
+template <GMMA::Major MajorA, GMMA::Major MajorB,
+          GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x232x16_F32F16F16_SS =
+    SM90::GMMA::MMA_64x232x16_F32F16F16_SS<MajorA, MajorB, ScaleA, ScaleB>;
+
+// MMA_Traits for the 64x232x16 SS atom: half_t A/B values, float C/D values.
+template <GMMA::Major MajorA, GMMA::Major MajorB, GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x232x16_F32F16F16_SS<MajorA, MajorB, ScaleA, ScaleB>>
+{
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Both A and B use GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<MajorA>;
+  using FrgTypeB = GMMA::smem_desc<MajorB>;
+
+  using Shape_MNK = Shape<_64,_232,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 16>;
+  using BLayout   = GMMA::ABLayout<232, 16>;
+  using CLayout   = GMMA::CLayout_64x232;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x232x16_F32F16F16_RS.
+// ScaleA and ScaleB fall back to GMMA::ScaleIn::One when not supplied.
+template <GMMA::Major MajorA, GMMA::Major MajorB,
+          GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x232x16_F32F16F16_RS =
+    SM90::GMMA::MMA_64x232x16_F32F16F16_RS<MajorA, MajorB, ScaleA, ScaleB>;
+
+// MMA_Traits for the 64x232x16 RS atom: half_t A/B values, float C/D values.
+template <GMMA::Major MajorA, GMMA::Major MajorB, GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x232x16_F32F16F16_RS<MajorA, MajorB, ScaleA, ScaleB>>
+{
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only B gets a GMMA::smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<MajorB>;
+
+  using Shape_MNK = Shape<_64,_232,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<232, 16>;
+  using CLayout   = GMMA::CLayout_64x232;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x240x16_F32F16F16_SS.
+// ScaleA and ScaleB fall back to GMMA::ScaleIn::One when not supplied.
+template <GMMA::Major MajorA, GMMA::Major MajorB,
+          GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x240x16_F32F16F16_SS =
+    SM90::GMMA::MMA_64x240x16_F32F16F16_SS<MajorA, MajorB, ScaleA, ScaleB>;
+
+// MMA_Traits for the 64x240x16 SS atom: half_t A/B values, float C/D values.
+template <GMMA::Major MajorA, GMMA::Major MajorB, GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x240x16_F32F16F16_SS<MajorA, MajorB, ScaleA, ScaleB>>
+{
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Both A and B use GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<MajorA>;
+  using FrgTypeB = GMMA::smem_desc<MajorB>;
+
+  using Shape_MNK = Shape<_64,_240,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 16>;
+  using BLayout   = GMMA::ABLayout<240, 16>;
+  using CLayout   = GMMA::CLayout_64x240;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x240x16_F32F16F16_RS.
+// ScaleA and ScaleB fall back to GMMA::ScaleIn::One when not supplied.
+template <GMMA::Major MajorA, GMMA::Major MajorB,
+          GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x240x16_F32F16F16_RS =
+    SM90::GMMA::MMA_64x240x16_F32F16F16_RS<MajorA, MajorB, ScaleA, ScaleB>;
+
+// MMA_Traits for the 64x240x16 RS atom: half_t A/B values, float C/D values.
+template <GMMA::Major MajorA, GMMA::Major MajorB, GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x240x16_F32F16F16_RS<MajorA, MajorB, ScaleA, ScaleB>>
+{
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only B gets a GMMA::smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<MajorB>;
+
+  using Shape_MNK = Shape<_64,_240,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<240, 16>;
+  using CLayout   = GMMA::CLayout_64x240;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x248x16_F32F16F16_SS.
+// ScaleA and ScaleB fall back to GMMA::ScaleIn::One when not supplied.
+template <GMMA::Major MajorA, GMMA::Major MajorB,
+          GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x248x16_F32F16F16_SS =
+    SM90::GMMA::MMA_64x248x16_F32F16F16_SS<MajorA, MajorB, ScaleA, ScaleB>;
+
+// MMA_Traits for the 64x248x16 SS atom: half_t A/B values, float C/D values.
+template <GMMA::Major MajorA, GMMA::Major MajorB, GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x248x16_F32F16F16_SS<MajorA, MajorB, ScaleA, ScaleB>>
+{
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Both A and B use GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<MajorA>;
+  using FrgTypeB = GMMA::smem_desc<MajorB>;
+
+  using Shape_MNK = Shape<_64,_248,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 16>;
+  using BLayout   = GMMA::ABLayout<248, 16>;
+  using CLayout   = GMMA::CLayout_64x248;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x248x16_F32F16F16_RS.
+// ScaleA and ScaleB fall back to GMMA::ScaleIn::One when not supplied.
+template <GMMA::Major MajorA, GMMA::Major MajorB,
+          GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x248x16_F32F16F16_RS =
+    SM90::GMMA::MMA_64x248x16_F32F16F16_RS<MajorA, MajorB, ScaleA, ScaleB>;
+
+// MMA_Traits for the 64x248x16 RS atom: half_t A/B values, float C/D values.
+template <GMMA::Major MajorA, GMMA::Major MajorB, GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x248x16_F32F16F16_RS<MajorA, MajorB, ScaleA, ScaleB>>
+{
+  using ValTypeA = half_t;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only B gets a GMMA::smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<MajorB>;
+
+  using Shape_MNK = Shape<_64,_248,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<248, 16>;
+  using CLayout   = GMMA::CLayout_64x248;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x24x16_F32BF16BF16_SS.
+// ScaleA and ScaleB fall back to GMMA::ScaleIn::One when not supplied.
+template <GMMA::Major MajorA, GMMA::Major MajorB,
+          GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x24x16_F32BF16BF16_SS =
+    SM90::GMMA::MMA_64x24x16_F32BF16BF16_SS<MajorA, MajorB, ScaleA, ScaleB>;
+
+// MMA_Traits for the 64x24x16 SS atom: bfloat16_t A/B values, float C/D values.
+template <GMMA::Major MajorA, GMMA::Major MajorB, GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x24x16_F32BF16BF16_SS<MajorA, MajorB, ScaleA, ScaleB>>
+{
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Both A and B use GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<MajorA>;
+  using FrgTypeB = GMMA::smem_desc<MajorB>;
+
+  using Shape_MNK = Shape<_64,_24,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 16>;
+  using BLayout   = GMMA::ABLayout<24, 16>;
+  using CLayout   = GMMA::CLayout_64x24;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x24x16_F32BF16BF16_RS.
+// ScaleA and ScaleB fall back to GMMA::ScaleIn::One when not supplied.
+template <GMMA::Major MajorA, GMMA::Major MajorB,
+          GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x24x16_F32BF16BF16_RS =
+    SM90::GMMA::MMA_64x24x16_F32BF16BF16_RS<MajorA, MajorB, ScaleA, ScaleB>;
+
+// MMA_Traits for the 64x24x16 RS atom: bfloat16_t A/B values, float C/D values.
+template <GMMA::Major MajorA, GMMA::Major MajorB, GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x24x16_F32BF16BF16_RS<MajorA, MajorB, ScaleA, ScaleB>>
+{
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only B gets a GMMA::smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<MajorB>;
+
+  using Shape_MNK = Shape<_64,_24,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<24, 16>;
+  using CLayout   = GMMA::CLayout_64x24;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x40x16_F32BF16BF16_SS.
+// ScaleA and ScaleB fall back to GMMA::ScaleIn::One when not supplied.
+template <GMMA::Major MajorA, GMMA::Major MajorB,
+          GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x40x16_F32BF16BF16_SS =
+    SM90::GMMA::MMA_64x40x16_F32BF16BF16_SS<MajorA, MajorB, ScaleA, ScaleB>;
+
+// MMA_Traits for the 64x40x16 SS atom: bfloat16_t A/B values, float C/D values.
+template <GMMA::Major MajorA, GMMA::Major MajorB, GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x40x16_F32BF16BF16_SS<MajorA, MajorB, ScaleA, ScaleB>>
+{
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Both A and B use GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<MajorA>;
+  using FrgTypeB = GMMA::smem_desc<MajorB>;
+
+  using Shape_MNK = Shape<_64,_40,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 16>;
+  using BLayout   = GMMA::ABLayout<40, 16>;
+  using CLayout   = GMMA::CLayout_64x40;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x40x16_F32BF16BF16_RS.
+// ScaleA and ScaleB fall back to GMMA::ScaleIn::One when not supplied.
+template <GMMA::Major MajorA, GMMA::Major MajorB,
+          GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x40x16_F32BF16BF16_RS =
+    SM90::GMMA::MMA_64x40x16_F32BF16BF16_RS<MajorA, MajorB, ScaleA, ScaleB>;
+
+// MMA_Traits for the 64x40x16 RS atom: bfloat16_t A/B values, float C/D values.
+template <GMMA::Major MajorA, GMMA::Major MajorB, GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x40x16_F32BF16BF16_RS<MajorA, MajorB, ScaleA, ScaleB>>
+{
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only B gets a GMMA::smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<MajorB>;
+
+  using Shape_MNK = Shape<_64,_40,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<40, 16>;
+  using CLayout   = GMMA::CLayout_64x40;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x48x16_F32BF16BF16_SS.
+// ScaleA and ScaleB fall back to GMMA::ScaleIn::One when not supplied.
+template <GMMA::Major MajorA, GMMA::Major MajorB,
+          GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x48x16_F32BF16BF16_SS =
+    SM90::GMMA::MMA_64x48x16_F32BF16BF16_SS<MajorA, MajorB, ScaleA, ScaleB>;
+
+// MMA_Traits for the 64x48x16 SS atom: bfloat16_t A/B values, float C/D values.
+template <GMMA::Major MajorA, GMMA::Major MajorB, GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x48x16_F32BF16BF16_SS<MajorA, MajorB, ScaleA, ScaleB>>
+{
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Both A and B use GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<MajorA>;
+  using FrgTypeB = GMMA::smem_desc<MajorB>;
+
+  using Shape_MNK = Shape<_64,_48,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 16>;
+  using BLayout   = GMMA::ABLayout<48, 16>;
+  using CLayout   = GMMA::CLayout_64x48;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x48x16_F32BF16BF16_RS.
+// ScaleA and ScaleB fall back to GMMA::ScaleIn::One when not supplied.
+template <GMMA::Major MajorA, GMMA::Major MajorB,
+          GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x48x16_F32BF16BF16_RS =
+    SM90::GMMA::MMA_64x48x16_F32BF16BF16_RS<MajorA, MajorB, ScaleA, ScaleB>;
+
+// MMA_Traits for the 64x48x16 RS atom: bfloat16_t A/B values, float C/D values.
+template <GMMA::Major MajorA, GMMA::Major MajorB, GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x48x16_F32BF16BF16_RS<MajorA, MajorB, ScaleA, ScaleB>>
+{
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only B gets a GMMA::smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<MajorB>;
+
+  using Shape_MNK = Shape<_64,_48,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<48, 16>;
+  using CLayout   = GMMA::CLayout_64x48;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x56x16_F32BF16BF16_SS.
+// ScaleA and ScaleB fall back to GMMA::ScaleIn::One when not supplied.
+template <GMMA::Major MajorA, GMMA::Major MajorB,
+          GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x56x16_F32BF16BF16_SS =
+    SM90::GMMA::MMA_64x56x16_F32BF16BF16_SS<MajorA, MajorB, ScaleA, ScaleB>;
+
+// MMA_Traits for the 64x56x16 SS atom: bfloat16_t A/B values, float C/D values.
+template <GMMA::Major MajorA, GMMA::Major MajorB, GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x56x16_F32BF16BF16_SS<MajorA, MajorB, ScaleA, ScaleB>>
+{
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Both A and B use GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<MajorA>;
+  using FrgTypeB = GMMA::smem_desc<MajorB>;
+
+  using Shape_MNK = Shape<_64,_56,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 16>;
+  using BLayout   = GMMA::ABLayout<56, 16>;
+  using CLayout   = GMMA::CLayout_64x56;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x56x16_F32BF16BF16_RS.
+// ScaleA and ScaleB fall back to GMMA::ScaleIn::One when not supplied.
+template <GMMA::Major MajorA, GMMA::Major MajorB,
+          GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x56x16_F32BF16BF16_RS =
+    SM90::GMMA::MMA_64x56x16_F32BF16BF16_RS<MajorA, MajorB, ScaleA, ScaleB>;
+
+// MMA_Traits for the 64x56x16 RS atom: bfloat16_t A/B values, float C/D values.
+template <GMMA::Major MajorA, GMMA::Major MajorB, GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x56x16_F32BF16BF16_RS<MajorA, MajorB, ScaleA, ScaleB>>
+{
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only B gets a GMMA::smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<MajorB>;
+
+  using Shape_MNK = Shape<_64,_56,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<56, 16>;
+  using CLayout   = GMMA::CLayout_64x56;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x72x16_F32BF16BF16_SS.
+// ScaleA and ScaleB fall back to GMMA::ScaleIn::One when not supplied.
+template <GMMA::Major MajorA, GMMA::Major MajorB,
+          GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x72x16_F32BF16BF16_SS =
+    SM90::GMMA::MMA_64x72x16_F32BF16BF16_SS<MajorA, MajorB, ScaleA, ScaleB>;
+
+// MMA_Traits for the 64x72x16 SS atom: bfloat16_t A/B values, float C/D values.
+template <GMMA::Major MajorA, GMMA::Major MajorB, GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x72x16_F32BF16BF16_SS<MajorA, MajorB, ScaleA, ScaleB>>
+{
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Both A and B use GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<MajorA>;
+  using FrgTypeB = GMMA::smem_desc<MajorB>;
+
+  using Shape_MNK = Shape<_64,_72,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 16>;
+  using BLayout   = GMMA::ABLayout<72, 16>;
+  using CLayout   = GMMA::CLayout_64x72;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x72x16_F32BF16BF16_RS.
+// ScaleA and ScaleB fall back to GMMA::ScaleIn::One when not supplied.
+template <GMMA::Major MajorA, GMMA::Major MajorB,
+          GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x72x16_F32BF16BF16_RS =
+    SM90::GMMA::MMA_64x72x16_F32BF16BF16_RS<MajorA, MajorB, ScaleA, ScaleB>;
+
+// MMA_Traits for the 64x72x16 RS atom: bfloat16_t A/B values, float C/D values.
+template <GMMA::Major MajorA, GMMA::Major MajorB, GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x72x16_F32BF16BF16_RS<MajorA, MajorB, ScaleA, ScaleB>>
+{
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only B gets a GMMA::smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<MajorB>;
+
+  using Shape_MNK = Shape<_64,_72,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<72, 16>;
+  using CLayout   = GMMA::CLayout_64x72;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x80x16_F32BF16BF16_SS.
+// ScaleA and ScaleB fall back to GMMA::ScaleIn::One when not supplied.
+template <GMMA::Major MajorA, GMMA::Major MajorB,
+          GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x80x16_F32BF16BF16_SS =
+    SM90::GMMA::MMA_64x80x16_F32BF16BF16_SS<MajorA, MajorB, ScaleA, ScaleB>;
+
+// MMA_Traits for the 64x80x16 SS atom: bfloat16_t A/B values, float C/D values.
+template <GMMA::Major MajorA, GMMA::Major MajorB, GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x80x16_F32BF16BF16_SS<MajorA, MajorB, ScaleA, ScaleB>>
+{
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Both A and B use GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<MajorA>;
+  using FrgTypeB = GMMA::smem_desc<MajorB>;
+
+  using Shape_MNK = Shape<_64,_80,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 16>;
+  using BLayout   = GMMA::ABLayout<80, 16>;
+  using CLayout   = GMMA::CLayout_64x80;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x80x16_F32BF16BF16_RS.
+// ScaleA and ScaleB fall back to GMMA::ScaleIn::One when not supplied.
+template <GMMA::Major MajorA, GMMA::Major MajorB,
+          GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x80x16_F32BF16BF16_RS =
+    SM90::GMMA::MMA_64x80x16_F32BF16BF16_RS<MajorA, MajorB, ScaleA, ScaleB>;
+
+// MMA_Traits for the 64x80x16 RS atom: bfloat16_t A/B values, float C/D values.
+template <GMMA::Major MajorA, GMMA::Major MajorB, GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x80x16_F32BF16BF16_RS<MajorA, MajorB, ScaleA, ScaleB>>
+{
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only B gets a GMMA::smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<MajorB>;
+
+  using Shape_MNK = Shape<_64,_80,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<80, 16>;
+  using CLayout   = GMMA::CLayout_64x80;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x88x16_F32BF16BF16_SS.
+// ScaleA and ScaleB fall back to GMMA::ScaleIn::One when not supplied.
+template <GMMA::Major MajorA, GMMA::Major MajorB,
+          GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x88x16_F32BF16BF16_SS =
+    SM90::GMMA::MMA_64x88x16_F32BF16BF16_SS<MajorA, MajorB, ScaleA, ScaleB>;
+
+// MMA_Traits for the 64x88x16 SS atom: bfloat16_t A/B values, float C/D values.
+template <GMMA::Major MajorA, GMMA::Major MajorB, GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x88x16_F32BF16BF16_SS<MajorA, MajorB, ScaleA, ScaleB>>
+{
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Both A and B use GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<MajorA>;
+  using FrgTypeB = GMMA::smem_desc<MajorB>;
+
+  using Shape_MNK = Shape<_64,_88,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 16>;
+  using BLayout   = GMMA::ABLayout<88, 16>;
+  using CLayout   = GMMA::CLayout_64x88;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x88x16_F32BF16BF16_RS.
+// ScaleA and ScaleB fall back to GMMA::ScaleIn::One when not supplied.
+template <GMMA::Major MajorA, GMMA::Major MajorB,
+          GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x88x16_F32BF16BF16_RS =
+    SM90::GMMA::MMA_64x88x16_F32BF16BF16_RS<MajorA, MajorB, ScaleA, ScaleB>;
+
+// MMA_Traits for the 64x88x16 RS atom: bfloat16_t A/B values, float C/D values.
+template <GMMA::Major MajorA, GMMA::Major MajorB, GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x88x16_F32BF16BF16_RS<MajorA, MajorB, ScaleA, ScaleB>>
+{
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only B gets a GMMA::smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<MajorB>;
+
+  using Shape_MNK = Shape<_64,_88,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<88, 16>;
+  using CLayout   = GMMA::CLayout_64x88;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x104x16_F32BF16BF16_SS.
+// ScaleA and ScaleB fall back to GMMA::ScaleIn::One when not supplied.
+template <GMMA::Major MajorA, GMMA::Major MajorB,
+          GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x104x16_F32BF16BF16_SS =
+    SM90::GMMA::MMA_64x104x16_F32BF16BF16_SS<MajorA, MajorB, ScaleA, ScaleB>;
+
+// MMA_Traits for the 64x104x16 SS atom: bfloat16_t A/B values, float C/D values.
+template <GMMA::Major MajorA, GMMA::Major MajorB, GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x104x16_F32BF16BF16_SS<MajorA, MajorB, ScaleA, ScaleB>>
+{
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Both A and B use GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<MajorA>;
+  using FrgTypeB = GMMA::smem_desc<MajorB>;
+
+  using Shape_MNK = Shape<_64,_104,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 16>;
+  using BLayout   = GMMA::ABLayout<104, 16>;
+  using CLayout   = GMMA::CLayout_64x104;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x104x16_F32BF16BF16_RS.
+// ScaleA and ScaleB fall back to GMMA::ScaleIn::One when not supplied.
+template <GMMA::Major MajorA, GMMA::Major MajorB,
+          GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x104x16_F32BF16BF16_RS =
+    SM90::GMMA::MMA_64x104x16_F32BF16BF16_RS<MajorA, MajorB, ScaleA, ScaleB>;
+
+// MMA_Traits for the 64x104x16 RS atom: bfloat16_t A/B values, float C/D values.
+template <GMMA::Major MajorA, GMMA::Major MajorB, GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x104x16_F32BF16BF16_RS<MajorA, MajorB, ScaleA, ScaleB>>
+{
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only B gets a GMMA::smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<MajorB>;
+
+  using Shape_MNK = Shape<_64,_104,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<104, 16>;
+  using CLayout   = GMMA::CLayout_64x104;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x112x16_F32BF16BF16_SS.
+// ScaleA and ScaleB fall back to GMMA::ScaleIn::One when not supplied.
+template <GMMA::Major MajorA, GMMA::Major MajorB,
+          GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x112x16_F32BF16BF16_SS =
+    SM90::GMMA::MMA_64x112x16_F32BF16BF16_SS<MajorA, MajorB, ScaleA, ScaleB>;
+
+// MMA_Traits for the 64x112x16 SS atom: bfloat16_t A/B values, float C/D values.
+template <GMMA::Major MajorA, GMMA::Major MajorB, GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x112x16_F32BF16BF16_SS<MajorA, MajorB, ScaleA, ScaleB>>
+{
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Both A and B use GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<MajorA>;
+  using FrgTypeB = GMMA::smem_desc<MajorB>;
+
+  using Shape_MNK = Shape<_64,_112,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 16>;
+  using BLayout   = GMMA::ABLayout<112, 16>;
+  using CLayout   = GMMA::CLayout_64x112;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x112x16_F32BF16BF16_RS.
+// ScaleA and ScaleB fall back to GMMA::ScaleIn::One when not supplied.
+template <GMMA::Major MajorA, GMMA::Major MajorB,
+          GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x112x16_F32BF16BF16_RS =
+    SM90::GMMA::MMA_64x112x16_F32BF16BF16_RS<MajorA, MajorB, ScaleA, ScaleB>;
+
+// MMA_Traits for the 64x112x16 RS atom: bfloat16_t A/B values, float C/D values.
+template <GMMA::Major MajorA, GMMA::Major MajorB, GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x112x16_F32BF16BF16_RS<MajorA, MajorB, ScaleA, ScaleB>>
+{
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only B gets a GMMA::smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<MajorB>;
+
+  using Shape_MNK = Shape<_64,_112,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x16;
+  using BLayout   = GMMA::ABLayout<112, 16>;
+  using CLayout   = GMMA::CLayout_64x112;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x120x16_F32BF16BF16_SS.
+// ScaleA and ScaleB fall back to GMMA::ScaleIn::One when not supplied.
+template <GMMA::Major MajorA, GMMA::Major MajorB,
+          GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x120x16_F32BF16BF16_SS =
+    SM90::GMMA::MMA_64x120x16_F32BF16BF16_SS<MajorA, MajorB, ScaleA, ScaleB>;
+
+// MMA_Traits for the 64x120x16 SS atom: bfloat16_t A/B values, float C/D values.
+template <GMMA::Major MajorA, GMMA::Major MajorB, GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x120x16_F32BF16BF16_SS<MajorA, MajorB, ScaleA, ScaleB>>
+{
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Both A and B use GMMA::smem_desc fragment types.
+  using FrgTypeA = GMMA::smem_desc<MajorA>;
+  using FrgTypeB = GMMA::smem_desc<MajorB>;
+
+  using Shape_MNK = Shape<_64,_120,_16>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 16>;
+  using BLayout   = GMMA::ABLayout<120, 16>;
+  using CLayout   = GMMA::CLayout_64x120;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias: forwards all four parameters unchanged to SM90::GMMA::MMA_64x120x16_F32BF16BF16_RS.
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x120x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x120x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
+// MMA_Traits specialization for the alias above: float D/C, bfloat16_t A/B, fragment type, tile shape and layouts.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x120x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // Only the B fragment is a GMMA::smem_desc here; no FrgTypeA is declared (the _SS specializations in this file declare both).
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape is (M, N, K) = (64, 120, 16); ThrID is a 128-element layout; A uses ALayout_64x16 rather than ABLayout.
+  using Shape_MNK = Shape<_64,_120,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;
+  using BLayout = GMMA::ABLayout<120, 16>;
+  using CLayout = GMMA::CLayout_64x120;
+  // Per-instance ScaleOut member, initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias: forwards all four parameters unchanged to SM90::GMMA::MMA_64x136x16_F32BF16BF16_SS.
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x136x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x136x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
+// MMA_Traits specialization for the alias above: float D/C, bfloat16_t A/B, fragment types, tile shape and layouts.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x136x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc types, parameterized by tnspA and tnspB respectively.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape is (M, N, K) = (64, 136, 16); ThrID is a 128-element layout; A/B use ABLayout, C uses CLayout_64x136.
+  using Shape_MNK = Shape<_64,_136,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  using BLayout = GMMA::ABLayout<136, 16>;
+  using CLayout = GMMA::CLayout_64x136;
+  // Per-instance ScaleOut member, initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias: forwards all four parameters unchanged to SM90::GMMA::MMA_64x136x16_F32BF16BF16_RS.
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x136x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x136x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
+// MMA_Traits specialization for the alias above: float D/C, bfloat16_t A/B, fragment type, tile shape and layouts.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x136x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // Only the B fragment is a GMMA::smem_desc here; no FrgTypeA is declared (the _SS specializations in this file declare both).
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape is (M, N, K) = (64, 136, 16); ThrID is a 128-element layout; A uses ALayout_64x16 rather than ABLayout.
+  using Shape_MNK = Shape<_64,_136,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;
+  using BLayout = GMMA::ABLayout<136, 16>;
+  using CLayout = GMMA::CLayout_64x136;
+  // Per-instance ScaleOut member, initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias: forwards all four parameters unchanged to SM90::GMMA::MMA_64x144x16_F32BF16BF16_SS.
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x144x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x144x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
+// MMA_Traits specialization for the alias above: float D/C, bfloat16_t A/B, fragment types, tile shape and layouts.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x144x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc types, parameterized by tnspA and tnspB respectively.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape is (M, N, K) = (64, 144, 16); ThrID is a 128-element layout; A/B use ABLayout, C uses CLayout_64x144.
+  using Shape_MNK = Shape<_64,_144,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  using BLayout = GMMA::ABLayout<144, 16>;
+  using CLayout = GMMA::CLayout_64x144;
+  // Per-instance ScaleOut member, initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias: forwards all four parameters unchanged to SM90::GMMA::MMA_64x144x16_F32BF16BF16_RS.
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x144x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x144x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
+// MMA_Traits specialization for the alias above: float D/C, bfloat16_t A/B, fragment type, tile shape and layouts.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x144x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // Only the B fragment is a GMMA::smem_desc here; no FrgTypeA is declared (the _SS specializations in this file declare both).
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape is (M, N, K) = (64, 144, 16); ThrID is a 128-element layout; A uses ALayout_64x16 rather than ABLayout.
+  using Shape_MNK = Shape<_64,_144,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;
+  using BLayout = GMMA::ABLayout<144, 16>;
+  using CLayout = GMMA::CLayout_64x144;
+  // Per-instance ScaleOut member, initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias: forwards all four parameters unchanged to SM90::GMMA::MMA_64x152x16_F32BF16BF16_SS.
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x152x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x152x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
+// MMA_Traits specialization for the alias above: float D/C, bfloat16_t A/B, fragment types, tile shape and layouts.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x152x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc types, parameterized by tnspA and tnspB respectively.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape is (M, N, K) = (64, 152, 16); ThrID is a 128-element layout; A/B use ABLayout, C uses CLayout_64x152.
+  using Shape_MNK = Shape<_64,_152,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  using BLayout = GMMA::ABLayout<152, 16>;
+  using CLayout = GMMA::CLayout_64x152;
+  // Per-instance ScaleOut member, initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias: forwards all four parameters unchanged to SM90::GMMA::MMA_64x152x16_F32BF16BF16_RS.
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x152x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x152x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
+// MMA_Traits specialization for the alias above: float D/C, bfloat16_t A/B, fragment type, tile shape and layouts.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x152x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // Only the B fragment is a GMMA::smem_desc here; no FrgTypeA is declared (the _SS specializations in this file declare both).
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape is (M, N, K) = (64, 152, 16); ThrID is a 128-element layout; A uses ALayout_64x16 rather than ABLayout.
+  using Shape_MNK = Shape<_64,_152,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;
+  using BLayout = GMMA::ABLayout<152, 16>;
+  using CLayout = GMMA::CLayout_64x152;
+  // Per-instance ScaleOut member, initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias: forwards all four parameters unchanged to SM90::GMMA::MMA_64x160x16_F32BF16BF16_SS.
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x160x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x160x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
+// MMA_Traits specialization for the alias above: float D/C, bfloat16_t A/B, fragment types, tile shape and layouts.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x160x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc types, parameterized by tnspA and tnspB respectively.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape is (M, N, K) = (64, 160, 16); ThrID is a 128-element layout; A/B use ABLayout, C uses CLayout_64x160.
+  using Shape_MNK = Shape<_64,_160,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  using BLayout = GMMA::ABLayout<160, 16>;
+  using CLayout = GMMA::CLayout_64x160;
+  // Per-instance ScaleOut member, initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias: forwards all four parameters unchanged to SM90::GMMA::MMA_64x160x16_F32BF16BF16_RS.
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x160x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x160x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
+// MMA_Traits specialization for the alias above: float D/C, bfloat16_t A/B, fragment type, tile shape and layouts.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x160x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // Only the B fragment is a GMMA::smem_desc here; no FrgTypeA is declared (the _SS specializations in this file declare both).
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape is (M, N, K) = (64, 160, 16); ThrID is a 128-element layout; A uses ALayout_64x16 rather than ABLayout.
+  using Shape_MNK = Shape<_64,_160,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;
+  using BLayout = GMMA::ABLayout<160, 16>;
+  using CLayout = GMMA::CLayout_64x160;
+  // Per-instance ScaleOut member, initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias: forwards all four parameters unchanged to SM90::GMMA::MMA_64x168x16_F32BF16BF16_SS.
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x168x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x168x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
+// MMA_Traits specialization for the alias above: float D/C, bfloat16_t A/B, fragment types, tile shape and layouts.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x168x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc types, parameterized by tnspA and tnspB respectively.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape is (M, N, K) = (64, 168, 16); ThrID is a 128-element layout; A/B use ABLayout, C uses CLayout_64x168.
+  using Shape_MNK = Shape<_64,_168,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  using BLayout = GMMA::ABLayout<168, 16>;
+  using CLayout = GMMA::CLayout_64x168;
+  // Per-instance ScaleOut member, initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias: forwards all four parameters unchanged to SM90::GMMA::MMA_64x168x16_F32BF16BF16_RS.
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x168x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x168x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
+// MMA_Traits specialization for the alias above: float D/C, bfloat16_t A/B, fragment type, tile shape and layouts.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x168x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // Only the B fragment is a GMMA::smem_desc here; no FrgTypeA is declared (the _SS specializations in this file declare both).
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape is (M, N, K) = (64, 168, 16); ThrID is a 128-element layout; A uses ALayout_64x16 rather than ABLayout.
+  using Shape_MNK = Shape<_64,_168,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;
+  using BLayout = GMMA::ABLayout<168, 16>;
+  using CLayout = GMMA::CLayout_64x168;
+  // Per-instance ScaleOut member, initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias: forwards all four parameters unchanged to SM90::GMMA::MMA_64x176x16_F32BF16BF16_SS.
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x176x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x176x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
+// MMA_Traits specialization for the alias above: float D/C, bfloat16_t A/B, fragment types, tile shape and layouts.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x176x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc types, parameterized by tnspA and tnspB respectively.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape is (M, N, K) = (64, 176, 16); ThrID is a 128-element layout; A/B use ABLayout, C uses CLayout_64x176.
+  using Shape_MNK = Shape<_64,_176,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  using BLayout = GMMA::ABLayout<176, 16>;
+  using CLayout = GMMA::CLayout_64x176;
+  // Per-instance ScaleOut member, initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias: forwards all four parameters unchanged to SM90::GMMA::MMA_64x176x16_F32BF16BF16_RS.
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x176x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x176x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
+// MMA_Traits specialization for the alias above: float D/C, bfloat16_t A/B, fragment type, tile shape and layouts.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x176x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // Only the B fragment is a GMMA::smem_desc here; no FrgTypeA is declared (the _SS specializations in this file declare both).
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape is (M, N, K) = (64, 176, 16); ThrID is a 128-element layout; A uses ALayout_64x16 rather than ABLayout.
+  using Shape_MNK = Shape<_64,_176,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;
+  using BLayout = GMMA::ABLayout<176, 16>;
+  using CLayout = GMMA::CLayout_64x176;
+  // Per-instance ScaleOut member, initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias: forwards all four parameters unchanged to SM90::GMMA::MMA_64x184x16_F32BF16BF16_SS.
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x184x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x184x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
+// MMA_Traits specialization for the alias above: float D/C, bfloat16_t A/B, fragment types, tile shape and layouts.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x184x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc types, parameterized by tnspA and tnspB respectively.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape is (M, N, K) = (64, 184, 16); ThrID is a 128-element layout; A/B use ABLayout, C uses CLayout_64x184.
+  using Shape_MNK = Shape<_64,_184,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  using BLayout = GMMA::ABLayout<184, 16>;
+  using CLayout = GMMA::CLayout_64x184;
+  // Per-instance ScaleOut member, initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias: forwards all four parameters unchanged to SM90::GMMA::MMA_64x184x16_F32BF16BF16_RS.
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x184x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x184x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
+// MMA_Traits specialization for the alias above: float D/C, bfloat16_t A/B, fragment type, tile shape and layouts.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x184x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // Only the B fragment is a GMMA::smem_desc here; no FrgTypeA is declared (the _SS specializations in this file declare both).
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape is (M, N, K) = (64, 184, 16); ThrID is a 128-element layout; A uses ALayout_64x16 rather than ABLayout.
+  using Shape_MNK = Shape<_64,_184,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;
+  using BLayout = GMMA::ABLayout<184, 16>;
+  using CLayout = GMMA::CLayout_64x184;
+  // Per-instance ScaleOut member, initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias: forwards all four parameters unchanged to SM90::GMMA::MMA_64x200x16_F32BF16BF16_SS.
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x200x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x200x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
+// MMA_Traits specialization for the alias above: float D/C, bfloat16_t A/B, fragment types, tile shape and layouts.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x200x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc types, parameterized by tnspA and tnspB respectively.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape is (M, N, K) = (64, 200, 16); ThrID is a 128-element layout; A/B use ABLayout, C uses CLayout_64x200.
+  using Shape_MNK = Shape<_64,_200,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  using BLayout = GMMA::ABLayout<200, 16>;
+  using CLayout = GMMA::CLayout_64x200;
+  // Per-instance ScaleOut member, initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias: forwards all four parameters unchanged to SM90::GMMA::MMA_64x200x16_F32BF16BF16_RS.
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x200x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x200x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
+// MMA_Traits specialization for the alias above: float D/C, bfloat16_t A/B, fragment type, tile shape and layouts.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x200x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // Only the B fragment is a GMMA::smem_desc here; no FrgTypeA is declared (the _SS specializations in this file declare both).
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape is (M, N, K) = (64, 200, 16); ThrID is a 128-element layout; A uses ALayout_64x16 rather than ABLayout.
+  using Shape_MNK = Shape<_64,_200,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;
+  using BLayout = GMMA::ABLayout<200, 16>;
+  using CLayout = GMMA::CLayout_64x200;
+  // Per-instance ScaleOut member, initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias: forwards all four parameters unchanged to SM90::GMMA::MMA_64x208x16_F32BF16BF16_SS.
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x208x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x208x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
+// MMA_Traits specialization for the alias above: float D/C, bfloat16_t A/B, fragment types, tile shape and layouts.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x208x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc types, parameterized by tnspA and tnspB respectively.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape is (M, N, K) = (64, 208, 16); ThrID is a 128-element layout; A/B use ABLayout, C uses CLayout_64x208.
+  using Shape_MNK = Shape<_64,_208,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  using BLayout = GMMA::ABLayout<208, 16>;
+  using CLayout = GMMA::CLayout_64x208;
+  // Per-instance ScaleOut member, initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias: forwards all four parameters unchanged to SM90::GMMA::MMA_64x208x16_F32BF16BF16_RS.
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x208x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x208x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
+// MMA_Traits specialization for the alias above: float D/C, bfloat16_t A/B, fragment type, tile shape and layouts.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x208x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // Only the B fragment is a GMMA::smem_desc here; no FrgTypeA is declared (the _SS specializations in this file declare both).
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape is (M, N, K) = (64, 208, 16); ThrID is a 128-element layout; A uses ALayout_64x16 rather than ABLayout.
+  using Shape_MNK = Shape<_64,_208,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;
+  using BLayout = GMMA::ABLayout<208, 16>;
+  using CLayout = GMMA::CLayout_64x208;
+  // Per-instance ScaleOut member, initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias: forwards all four parameters unchanged to SM90::GMMA::MMA_64x216x16_F32BF16BF16_SS.
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x216x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x216x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
+// MMA_Traits specialization for the alias above: float D/C, bfloat16_t A/B, fragment types, tile shape and layouts.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x216x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc types, parameterized by tnspA and tnspB respectively.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape is (M, N, K) = (64, 216, 16); ThrID is a 128-element layout; A/B use ABLayout, C uses CLayout_64x216.
+  using Shape_MNK = Shape<_64,_216,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  using BLayout = GMMA::ABLayout<216, 16>;
+  using CLayout = GMMA::CLayout_64x216;
+  // Per-instance ScaleOut member, initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias: forwards all four parameters unchanged to SM90::GMMA::MMA_64x216x16_F32BF16BF16_RS.
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x216x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x216x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
+// MMA_Traits specialization for the alias above: float D/C, bfloat16_t A/B, fragment type, tile shape and layouts.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x216x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // Only the B fragment is a GMMA::smem_desc here; no FrgTypeA is declared (the _SS specializations in this file declare both).
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape is (M, N, K) = (64, 216, 16); ThrID is a 128-element layout; A uses ALayout_64x16 rather than ABLayout.
+  using Shape_MNK = Shape<_64,_216,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;
+  using BLayout = GMMA::ABLayout<216, 16>;
+  using CLayout = GMMA::CLayout_64x216;
+  // Per-instance ScaleOut member, initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias: forwards all four parameters unchanged to SM90::GMMA::MMA_64x224x16_F32BF16BF16_SS.
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x224x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x224x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
+// MMA_Traits specialization for the alias above: float D/C, bfloat16_t A/B, fragment types, tile shape and layouts.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x224x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc types, parameterized by tnspA and tnspB respectively.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape is (M, N, K) = (64, 224, 16); ThrID is a 128-element layout; A/B use ABLayout, C uses CLayout_64x224.
+  using Shape_MNK = Shape<_64,_224,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  using BLayout = GMMA::ABLayout<224, 16>;
+  using CLayout = GMMA::CLayout_64x224;
+  // Per-instance ScaleOut member, initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias: forwards all four parameters unchanged to SM90::GMMA::MMA_64x224x16_F32BF16BF16_RS.
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x224x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x224x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
+// MMA_Traits specialization for the alias above: float D/C, bfloat16_t A/B, fragment type, tile shape and layouts.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x224x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // Only the B fragment is a GMMA::smem_desc here; no FrgTypeA is declared (the _SS specializations in this file declare both).
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape is (M, N, K) = (64, 224, 16); ThrID is a 128-element layout; A uses ALayout_64x16 rather than ABLayout.
+  using Shape_MNK = Shape<_64,_224,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;
+  using BLayout = GMMA::ABLayout<224, 16>;
+  using CLayout = GMMA::CLayout_64x224;
+  // Per-instance ScaleOut member, initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias: forwards all four parameters unchanged to SM90::GMMA::MMA_64x232x16_F32BF16BF16_SS.
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x232x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x232x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
+// MMA_Traits specialization for the alias above: float D/C, bfloat16_t A/B, fragment types, tile shape and layouts.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x232x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc types, parameterized by tnspA and tnspB respectively.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape is (M, N, K) = (64, 232, 16); ThrID is a 128-element layout; A/B use ABLayout, C uses CLayout_64x232.
+  using Shape_MNK = Shape<_64,_232,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  using BLayout = GMMA::ABLayout<232, 16>;
+  using CLayout = GMMA::CLayout_64x232;
+  // Per-instance ScaleOut member, initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias: forwards all four parameters unchanged to SM90::GMMA::MMA_64x232x16_F32BF16BF16_RS.
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x232x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x232x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
+// MMA_Traits specialization for the alias above: float D/C, bfloat16_t A/B, fragment type, tile shape and layouts.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x232x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // Only the B fragment is a GMMA::smem_desc here; no FrgTypeA is declared (the _SS specializations in this file declare both).
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape is (M, N, K) = (64, 232, 16); ThrID is a 128-element layout; A uses ALayout_64x16 rather than ABLayout.
+  using Shape_MNK = Shape<_64,_232,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;
+  using BLayout = GMMA::ABLayout<232, 16>;
+  using CLayout = GMMA::CLayout_64x232;
+  // Per-instance ScaleOut member, initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias: forwards all four parameters unchanged to SM90::GMMA::MMA_64x240x16_F32BF16BF16_SS.
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x240x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x240x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
+// MMA_Traits specialization for the alias above: float D/C, bfloat16_t A/B, fragment types, tile shape and layouts.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x240x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc types, parameterized by tnspA and tnspB respectively.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape is (M, N, K) = (64, 240, 16); ThrID is a 128-element layout; A/B use ABLayout, C uses CLayout_64x240.
+  using Shape_MNK = Shape<_64,_240,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  using BLayout = GMMA::ABLayout<240, 16>;
+  using CLayout = GMMA::CLayout_64x240;
+  // Per-instance ScaleOut member, initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias: forwards all four parameters unchanged to SM90::GMMA::MMA_64x240x16_F32BF16BF16_RS.
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x240x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x240x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
+// MMA_Traits specialization for the alias above: float D/C, bfloat16_t A/B, fragment type, tile shape and layouts.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x240x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // Only the B fragment is a GMMA::smem_desc here; no FrgTypeA is declared (the _SS specializations in this file declare both).
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape is (M, N, K) = (64, 240, 16); ThrID is a 128-element layout; A uses ALayout_64x16 rather than ABLayout.
+  using Shape_MNK = Shape<_64,_240,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;
+  using BLayout = GMMA::ABLayout<240, 16>;
+  using CLayout = GMMA::CLayout_64x240;
+  // Per-instance ScaleOut member, initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias: forwards all four parameters unchanged to SM90::GMMA::MMA_64x248x16_F32BF16BF16_SS.
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x248x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x248x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
+// MMA_Traits specialization for the alias above: float D/C, bfloat16_t A/B, fragment types, tile shape and layouts.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x248x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc types, parameterized by tnspA and tnspB respectively.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape is (M, N, K) = (64, 248, 16); ThrID is a 128-element layout; A/B use ABLayout, C uses CLayout_64x248.
+  using Shape_MNK = Shape<_64,_248,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  using BLayout = GMMA::ABLayout<248, 16>;
+  using CLayout = GMMA::CLayout_64x248;
+  // Per-instance ScaleOut member, initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias: forwards all four parameters unchanged to SM90::GMMA::MMA_64x248x16_F32BF16BF16_RS.
+  GMMA::Major tnspA,
+  GMMA::Major tnspB,
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x248x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x248x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
+// MMA_Traits specialization for the alias above: float D/C, bfloat16_t A/B, fragment type, tile shape and layouts.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x248x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // Only the B fragment is a GMMA::smem_desc here; no FrgTypeA is declared (the _SS specializations in this file declare both).
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape is (M, N, K) = (64, 248, 16); ThrID is a 128-element layout; A uses ALayout_64x16 rather than ABLayout.
+  using Shape_MNK = Shape<_64,_248,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;
+  using BLayout = GMMA::ABLayout<248, 16>;
+  using CLayout = GMMA::CLayout_64x248;
+  // Per-instance ScaleOut member, initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias: forwards scaleA/scaleB unchanged to SM90::GMMA::MMA_64x24x8_F32TF32TF32_SS_TN.
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x24x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x24x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
+// MMA_Traits specialization for the alias above: float D/C, tfloat32_t A/B, fragment types, tile shape and layouts.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x24x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc fixed to GMMA::Major::K (this variant has no tnsp parameters).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape is (M, N, K) = (64, 24, 8); ThrID is a 128-element layout; A/B use ABLayout, C uses CLayout_64x24.
+  using Shape_MNK = Shape<_64,_24,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64,  8>;
+  using BLayout = GMMA::ABLayout< 24,  8>;
+  using CLayout = GMMA::CLayout_64x24;
+  // Per-instance ScaleOut member, initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias: forwards scaleA/scaleB unchanged to SM90::GMMA::MMA_64x24x8_F32TF32TF32_RS_TN.
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x24x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x24x8_F32TF32TF32_RS_TN<scaleA, scaleB>; 
+// MMA_Traits specialization for the alias above: float D/C, tfloat32_t A/B, fragment type, tile shape and layouts.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x24x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // Only the B fragment is a GMMA::smem_desc (fixed to GMMA::Major::K); no FrgTypeA is declared (the _SS_TN specializations in this file declare both).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape is (M, N, K) = (64, 24, 8); ThrID is a 128-element layout; A uses ALayout_64x8 rather than ABLayout.
+  using Shape_MNK = Shape<_64,_24,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x8;
+  using BLayout = GMMA::ABLayout< 24,  8>;
+  using CLayout = GMMA::CLayout_64x24;
+  // Per-instance ScaleOut member, initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias: forwards scaleA/scaleB unchanged to SM90::GMMA::MMA_64x40x8_F32TF32TF32_SS_TN.
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x40x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x40x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
+// MMA_Traits specialization for the alias above: float D/C, tfloat32_t A/B, fragment types, tile shape and layouts.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x40x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc fixed to GMMA::Major::K (this variant has no tnsp parameters).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape is (M, N, K) = (64, 40, 8); ThrID is a 128-element layout; A/B use ABLayout, C uses CLayout_64x40.
+  using Shape_MNK = Shape<_64,_40,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64,  8>;
+  using BLayout = GMMA::ABLayout< 40,  8>;
+  using CLayout = GMMA::CLayout_64x40;
+  // Per-instance ScaleOut member, initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // Alias: forwards scaleA/scaleB unchanged to SM90::GMMA::MMA_64x40x8_F32TF32TF32_RS_TN.
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x40x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x40x8_F32TF32TF32_RS_TN<scaleA, scaleB>; 
+// MMA_Traits specialization for the alias above: float D/C, tfloat32_t A/B, fragment type, tile shape and layouts.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x40x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // Only the B fragment is a GMMA::smem_desc (fixed to GMMA::Major::K); no FrgTypeA is declared (the _SS_TN specializations in this file declare both).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape is (M, N, K) = (64, 40, 8); ThrID is a 128-element layout; A uses ALayout_64x8 rather than ABLayout.
+  using Shape_MNK = Shape<_64,_40,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x8;
+  using BLayout = GMMA::ABLayout< 40,  8>;
+  using CLayout = GMMA::CLayout_64x40;
+  // Per-instance ScaleOut member, initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x48x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x48x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x48x8 TF32 "SS" atom: float C/D, tfloat32_t A/B, both operands as K-major smem descriptors.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x48x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc<GMMA::Major::K>.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_48,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64,  8>;
+  using BLayout = GMMA::ABLayout< 48,  8>;
+  using CLayout = GMMA::CLayout_64x48;
+  // Per-instance ScaleOut flag, initialized to One. NOTE(review): presumably selects accumulate vs. overwrite - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x48x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x48x8_F32TF32TF32_RS_TN<scaleA, scaleB>; // shorthand for the SM90::GMMA op
+// MMA_Traits for the 64x48x8 TF32 "RS" atom: float C/D, tfloat32_t A/B; only B gets a smem descriptor fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x48x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared here. NOTE(review): A is presumably register-resident per the "RS" suffix - confirm.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_48,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x8;
+  using BLayout = GMMA::ABLayout< 48,  8>;
+  using CLayout = GMMA::CLayout_64x48;
+  // Per-instance ScaleOut flag, initialized to One. NOTE(review): presumably selects accumulate vs. overwrite - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x56x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x56x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x56x8 TF32 "SS" atom: float C/D, tfloat32_t A/B, both operands as K-major smem descriptors.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x56x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc<GMMA::Major::K>.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_56,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64,  8>;
+  using BLayout = GMMA::ABLayout< 56,  8>;
+  using CLayout = GMMA::CLayout_64x56;
+  // Per-instance ScaleOut flag, initialized to One. NOTE(review): presumably selects accumulate vs. overwrite - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x56x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x56x8_F32TF32TF32_RS_TN<scaleA, scaleB>; // shorthand for the SM90::GMMA op
+// MMA_Traits for the 64x56x8 TF32 "RS" atom: float C/D, tfloat32_t A/B; only B gets a smem descriptor fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x56x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared here. NOTE(review): A is presumably register-resident per the "RS" suffix - confirm.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_56,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x8;
+  using BLayout = GMMA::ABLayout< 56,  8>;
+  using CLayout = GMMA::CLayout_64x56;
+  // Per-instance ScaleOut flag, initialized to One. NOTE(review): presumably selects accumulate vs. overwrite - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x72x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x72x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x72x8 TF32 "SS" atom: float C/D, tfloat32_t A/B, both operands as K-major smem descriptors.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x72x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc<GMMA::Major::K>.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_72,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64,  8>;
+  using BLayout = GMMA::ABLayout< 72,  8>;
+  using CLayout = GMMA::CLayout_64x72;
+  // Per-instance ScaleOut flag, initialized to One. NOTE(review): presumably selects accumulate vs. overwrite - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x72x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x72x8_F32TF32TF32_RS_TN<scaleA, scaleB>; // shorthand for the SM90::GMMA op
+// MMA_Traits for the 64x72x8 TF32 "RS" atom: float C/D, tfloat32_t A/B; only B gets a smem descriptor fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x72x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared here. NOTE(review): A is presumably register-resident per the "RS" suffix - confirm.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_72,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x8;
+  using BLayout = GMMA::ABLayout< 72,  8>;
+  using CLayout = GMMA::CLayout_64x72;
+  // Per-instance ScaleOut flag, initialized to One. NOTE(review): presumably selects accumulate vs. overwrite - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x80x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x80x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x80x8 TF32 "SS" atom: float C/D, tfloat32_t A/B, both operands as K-major smem descriptors.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x80x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc<GMMA::Major::K>.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_80,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64,  8>;
+  using BLayout = GMMA::ABLayout< 80,  8>;
+  using CLayout = GMMA::CLayout_64x80;
+  // Per-instance ScaleOut flag, initialized to One. NOTE(review): presumably selects accumulate vs. overwrite - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x80x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x80x8_F32TF32TF32_RS_TN<scaleA, scaleB>; // shorthand for the SM90::GMMA op
+// MMA_Traits for the 64x80x8 TF32 "RS" atom: float C/D, tfloat32_t A/B; only B gets a smem descriptor fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x80x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared here. NOTE(review): A is presumably register-resident per the "RS" suffix - confirm.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_80,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x8;
+  using BLayout = GMMA::ABLayout< 80,  8>;
+  using CLayout = GMMA::CLayout_64x80;
+  // Per-instance ScaleOut flag, initialized to One. NOTE(review): presumably selects accumulate vs. overwrite - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x88x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x88x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x88x8 TF32 "SS" atom: float C/D, tfloat32_t A/B, both operands as K-major smem descriptors.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x88x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc<GMMA::Major::K>.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_88,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64,  8>;
+  using BLayout = GMMA::ABLayout< 88,  8>;
+  using CLayout = GMMA::CLayout_64x88;
+  // Per-instance ScaleOut flag, initialized to One. NOTE(review): presumably selects accumulate vs. overwrite - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x88x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x88x8_F32TF32TF32_RS_TN<scaleA, scaleB>; // shorthand for the SM90::GMMA op
+// MMA_Traits for the 64x88x8 TF32 "RS" atom: float C/D, tfloat32_t A/B; only B gets a smem descriptor fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x88x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared here. NOTE(review): A is presumably register-resident per the "RS" suffix - confirm.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_88,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x8;
+  using BLayout = GMMA::ABLayout< 88,  8>;
+  using CLayout = GMMA::CLayout_64x88;
+  // Per-instance ScaleOut flag, initialized to One. NOTE(review): presumably selects accumulate vs. overwrite - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x104x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x104x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x104x8 TF32 "SS" atom: float C/D, tfloat32_t A/B, both operands as K-major smem descriptors.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x104x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc<GMMA::Major::K>.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_104,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64,  8>;
+  using BLayout = GMMA::ABLayout<104,  8>;
+  using CLayout = GMMA::CLayout_64x104;
+  // Per-instance ScaleOut flag, initialized to One. NOTE(review): presumably selects accumulate vs. overwrite - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x104x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x104x8_F32TF32TF32_RS_TN<scaleA, scaleB>; // shorthand for the SM90::GMMA op
+// MMA_Traits for the 64x104x8 TF32 "RS" atom: float C/D, tfloat32_t A/B; only B gets a smem descriptor fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x104x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared here. NOTE(review): A is presumably register-resident per the "RS" suffix - confirm.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_104,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x8;
+  using BLayout = GMMA::ABLayout<104,  8>;
+  using CLayout = GMMA::CLayout_64x104;
+  // Per-instance ScaleOut flag, initialized to One. NOTE(review): presumably selects accumulate vs. overwrite - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x112x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x112x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x112x8 TF32 "SS" atom: float C/D, tfloat32_t A/B, both operands as K-major smem descriptors.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x112x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc<GMMA::Major::K>.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_112,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64,  8>;
+  using BLayout = GMMA::ABLayout<112,  8>;
+  using CLayout = GMMA::CLayout_64x112;
+  // Per-instance ScaleOut flag, initialized to One. NOTE(review): presumably selects accumulate vs. overwrite - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x112x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x112x8_F32TF32TF32_RS_TN<scaleA, scaleB>; // shorthand for the SM90::GMMA op
+// MMA_Traits for the 64x112x8 TF32 "RS" atom: float C/D, tfloat32_t A/B; only B gets a smem descriptor fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x112x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared here. NOTE(review): A is presumably register-resident per the "RS" suffix - confirm.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_112,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x8;
+  using BLayout = GMMA::ABLayout<112,  8>;
+  using CLayout = GMMA::CLayout_64x112;
+  // Per-instance ScaleOut flag, initialized to One. NOTE(review): presumably selects accumulate vs. overwrite - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x120x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x120x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x120x8 TF32 "SS" atom: float C/D, tfloat32_t A/B, both operands as K-major smem descriptors.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x120x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc<GMMA::Major::K>.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_120,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64,  8>;
+  using BLayout = GMMA::ABLayout<120,  8>;
+  using CLayout = GMMA::CLayout_64x120;
+  // Per-instance ScaleOut flag, initialized to One. NOTE(review): presumably selects accumulate vs. overwrite - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x120x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x120x8_F32TF32TF32_RS_TN<scaleA, scaleB>; // shorthand for the SM90::GMMA op
+// MMA_Traits for the 64x120x8 TF32 "RS" atom: float C/D, tfloat32_t A/B; only B gets a smem descriptor fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x120x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared here. NOTE(review): A is presumably register-resident per the "RS" suffix - confirm.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_120,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x8;
+  using BLayout = GMMA::ABLayout<120,  8>;
+  using CLayout = GMMA::CLayout_64x120;
+  // Per-instance ScaleOut flag, initialized to One. NOTE(review): presumably selects accumulate vs. overwrite - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x136x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x136x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x136x8 TF32 "SS" atom: float C/D, tfloat32_t A/B, both operands as K-major smem descriptors.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x136x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc<GMMA::Major::K>.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_136,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64,  8>;
+  using BLayout = GMMA::ABLayout<136,  8>;
+  using CLayout = GMMA::CLayout_64x136;
+  // Per-instance ScaleOut flag, initialized to One. NOTE(review): presumably selects accumulate vs. overwrite - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x136x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x136x8_F32TF32TF32_RS_TN<scaleA, scaleB>; // shorthand for the SM90::GMMA op
+// MMA_Traits for the 64x136x8 TF32 "RS" atom: float C/D, tfloat32_t A/B; only B gets a smem descriptor fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x136x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared here. NOTE(review): A is presumably register-resident per the "RS" suffix - confirm.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_136,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x8;
+  using BLayout = GMMA::ABLayout<136,  8>;
+  using CLayout = GMMA::CLayout_64x136;
+  // Per-instance ScaleOut flag, initialized to One. NOTE(review): presumably selects accumulate vs. overwrite - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x144x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x144x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x144x8 TF32 "SS" atom: float C/D, tfloat32_t A/B, both operands as K-major smem descriptors.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x144x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc<GMMA::Major::K>.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_144,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64,  8>;
+  using BLayout = GMMA::ABLayout<144,  8>;
+  using CLayout = GMMA::CLayout_64x144;
+  // Per-instance ScaleOut flag, initialized to One. NOTE(review): presumably selects accumulate vs. overwrite - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x144x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x144x8_F32TF32TF32_RS_TN<scaleA, scaleB>; // shorthand for the SM90::GMMA op
+// MMA_Traits for the 64x144x8 TF32 "RS" atom: float C/D, tfloat32_t A/B; only B gets a smem descriptor fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x144x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared here. NOTE(review): A is presumably register-resident per the "RS" suffix - confirm.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_144,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x8;
+  using BLayout = GMMA::ABLayout<144,  8>;
+  using CLayout = GMMA::CLayout_64x144;
+  // Per-instance ScaleOut flag, initialized to One. NOTE(review): presumably selects accumulate vs. overwrite - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x152x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x152x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x152x8 TF32 "SS" atom: float C/D, tfloat32_t A/B, both operands as K-major smem descriptors.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x152x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc<GMMA::Major::K>.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_152,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64,  8>;
+  using BLayout = GMMA::ABLayout<152,  8>;
+  using CLayout = GMMA::CLayout_64x152;
+  // Per-instance ScaleOut flag, initialized to One. NOTE(review): presumably selects accumulate vs. overwrite - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x152x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x152x8_F32TF32TF32_RS_TN<scaleA, scaleB>; // shorthand for the SM90::GMMA op
+// MMA_Traits for the 64x152x8 TF32 "RS" atom: float C/D, tfloat32_t A/B; only B gets a smem descriptor fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x152x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared here. NOTE(review): A is presumably register-resident per the "RS" suffix - confirm.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_152,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x8;
+  using BLayout = GMMA::ABLayout<152,  8>;
+  using CLayout = GMMA::CLayout_64x152;
+  // Per-instance ScaleOut flag, initialized to One. NOTE(review): presumably selects accumulate vs. overwrite - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x160x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x160x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x160x8 TF32 "SS" atom: float C/D, tfloat32_t A/B, both operands as K-major smem descriptors.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x160x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc<GMMA::Major::K>.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_160,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64,  8>;
+  using BLayout = GMMA::ABLayout<160,  8>;
+  using CLayout = GMMA::CLayout_64x160;
+  // Per-instance ScaleOut flag, initialized to One. NOTE(review): presumably selects accumulate vs. overwrite - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x160x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x160x8_F32TF32TF32_RS_TN<scaleA, scaleB>; // shorthand for the SM90::GMMA op
+// MMA_Traits for the 64x160x8 TF32 "RS" atom: float C/D, tfloat32_t A/B; only B gets a smem descriptor fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x160x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared here. NOTE(review): A is presumably register-resident per the "RS" suffix - confirm.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_160,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x8;
+  using BLayout = GMMA::ABLayout<160,  8>;
+  using CLayout = GMMA::CLayout_64x160;
+  // Per-instance ScaleOut flag, initialized to One. NOTE(review): presumably selects accumulate vs. overwrite - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x168x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x168x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x168x8 TF32 "SS" atom: float C/D, tfloat32_t A/B, both operands as K-major smem descriptors.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x168x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc<GMMA::Major::K>.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_168,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64,  8>;
+  using BLayout = GMMA::ABLayout<168,  8>;
+  using CLayout = GMMA::CLayout_64x168;
+  // Per-instance ScaleOut flag, initialized to One. NOTE(review): presumably selects accumulate vs. overwrite - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x168x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x168x8_F32TF32TF32_RS_TN<scaleA, scaleB>; // shorthand for the SM90::GMMA op
+// MMA_Traits for the 64x168x8 TF32 "RS" atom: float C/D, tfloat32_t A/B; only B gets a smem descriptor fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x168x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared here. NOTE(review): A is presumably register-resident per the "RS" suffix - confirm.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_168,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x8;
+  using BLayout = GMMA::ABLayout<168,  8>;
+  using CLayout = GMMA::CLayout_64x168;
+  // Per-instance ScaleOut flag, initialized to One. NOTE(review): presumably selects accumulate vs. overwrite - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x176x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x176x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x176x8 TF32 "SS" atom: float C/D, tfloat32_t A/B, both operands as K-major smem descriptors.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x176x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc<GMMA::Major::K>.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_176,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64,  8>;
+  using BLayout = GMMA::ABLayout<176,  8>;
+  using CLayout = GMMA::CLayout_64x176;
+  // Per-instance ScaleOut flag, initialized to One. NOTE(review): presumably selects accumulate vs. overwrite - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x176x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x176x8_F32TF32TF32_RS_TN<scaleA, scaleB>; // shorthand for the SM90::GMMA op
+// MMA_Traits for the 64x176x8 TF32 "RS" atom: float C/D, tfloat32_t A/B; only B gets a smem descriptor fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x176x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared here. NOTE(review): A is presumably register-resident per the "RS" suffix - confirm.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_176,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x8;
+  using BLayout = GMMA::ABLayout<176,  8>;
+  using CLayout = GMMA::CLayout_64x176;
+  // Per-instance ScaleOut flag, initialized to One. NOTE(review): presumably selects accumulate vs. overwrite - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x184x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x184x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x184x8 TF32 "SS" atom: float C/D, tfloat32_t A/B, both operands as K-major smem descriptors.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x184x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc<GMMA::Major::K>.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_184,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64,  8>;
+  using BLayout = GMMA::ABLayout<184,  8>;
+  using CLayout = GMMA::CLayout_64x184;
+  // Per-instance ScaleOut flag, initialized to One. NOTE(review): presumably selects accumulate vs. overwrite - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x184x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x184x8_F32TF32TF32_RS_TN<scaleA, scaleB>; // shorthand for the SM90::GMMA op
+// MMA_Traits for the 64x184x8 TF32 "RS" atom: float C/D, tfloat32_t A/B; only B gets a smem descriptor fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x184x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared here. NOTE(review): A is presumably register-resident per the "RS" suffix - confirm.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_184,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x8;
+  using BLayout = GMMA::ABLayout<184,  8>;
+  using CLayout = GMMA::CLayout_64x184;
+  // Per-instance ScaleOut flag, initialized to One. NOTE(review): presumably selects accumulate vs. overwrite - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x200x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x200x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x200x8 TF32 "SS" atom: float C/D, tfloat32_t A/B, both operands as K-major smem descriptors.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x200x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc<GMMA::Major::K>.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_200,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64,  8>;
+  using BLayout = GMMA::ABLayout<200,  8>;
+  using CLayout = GMMA::CLayout_64x200;
+  // Per-instance ScaleOut flag, initialized to One. NOTE(review): presumably selects accumulate vs. overwrite - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x200x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x200x8_F32TF32TF32_RS_TN<scaleA, scaleB>; // shorthand for the SM90::GMMA op
+// MMA_Traits for the 64x200x8 TF32 "RS" atom: float C/D, tfloat32_t A/B; only B gets a smem descriptor fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x200x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared here. NOTE(review): A is presumably register-resident per the "RS" suffix - confirm.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_200,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x8;
+  using BLayout = GMMA::ABLayout<200,  8>;
+  using CLayout = GMMA::CLayout_64x200;
+  // Per-instance ScaleOut flag, initialized to One. NOTE(review): presumably selects accumulate vs. overwrite - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x208x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x208x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x208x8 TF32 "SS" atom: float C/D, tfloat32_t A/B, both operands as K-major smem descriptors.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x208x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc<GMMA::Major::K>.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_208,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64,  8>;
+  using BLayout = GMMA::ABLayout<208,  8>;
+  using CLayout = GMMA::CLayout_64x208;
+  // Per-instance ScaleOut flag, initialized to One. NOTE(review): presumably selects accumulate vs. overwrite - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x208x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x208x8_F32TF32TF32_RS_TN<scaleA, scaleB>; // shorthand for the SM90::GMMA op
+// MMA_Traits for the 64x208x8 TF32 "RS" atom: float C/D, tfloat32_t A/B; only B gets a smem descriptor fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x208x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared here. NOTE(review): A is presumably register-resident per the "RS" suffix - confirm.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_208,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x8;
+  using BLayout = GMMA::ABLayout<208,  8>;
+  using CLayout = GMMA::CLayout_64x208;
+  // Per-instance ScaleOut flag, initialized to One. NOTE(review): presumably selects accumulate vs. overwrite - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x216x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x216x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the 64x216x8 TF32 "SS" atom: float C/D, tfloat32_t A/B, both operands as K-major smem descriptors.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x216x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc<GMMA::Major::K>.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_216,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64,  8>;
+  using BLayout = GMMA::ABLayout<216,  8>;
+  using CLayout = GMMA::CLayout_64x216;
+  // Per-instance ScaleOut flag, initialized to One. NOTE(review): presumably selects accumulate vs. overwrite - confirm.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x216x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x216x8_F32TF32TF32_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above (scaleA/scaleB default to ScaleIn::One): tfloat32_t A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x216x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // RS variant: only B declares a K-major GMMA::smem_desc fragment; no FrgTypeA is given here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,216,8); A uses GMMA::ALayout_64x8 rather than an ABLayout; ThrID has 128 elements.
+  using Shape_MNK = Shape<_64,_216,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x8;
+  using BLayout = GMMA::ABLayout<216,  8>;
+  using CLayout = GMMA::CLayout_64x216;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x224x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x224x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above (scaleA/scaleB default to ScaleIn::One): tfloat32_t A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x224x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // SS variant: A and B fragments are both K-major GMMA::smem_desc types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,224,8); ThrID is a 128-element layout (presumably one warpgroup - confirm).
+  using Shape_MNK = Shape<_64,_224,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64,  8>;
+  using BLayout = GMMA::ABLayout<224,  8>;
+  using CLayout = GMMA::CLayout_64x224;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x224x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x224x8_F32TF32TF32_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above (scaleA/scaleB default to ScaleIn::One): tfloat32_t A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x224x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // RS variant: only B declares a K-major GMMA::smem_desc fragment; no FrgTypeA is given here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,224,8); A uses GMMA::ALayout_64x8 rather than an ABLayout; ThrID has 128 elements.
+  using Shape_MNK = Shape<_64,_224,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x8;
+  using BLayout = GMMA::ABLayout<224,  8>;
+  using CLayout = GMMA::CLayout_64x224;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x232x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x232x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above (scaleA/scaleB default to ScaleIn::One): tfloat32_t A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x232x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // SS variant: A and B fragments are both K-major GMMA::smem_desc types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,232,8); ThrID is a 128-element layout (presumably one warpgroup - confirm).
+  using Shape_MNK = Shape<_64,_232,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64,  8>;
+  using BLayout = GMMA::ABLayout<232,  8>;
+  using CLayout = GMMA::CLayout_64x232;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x232x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x232x8_F32TF32TF32_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above (scaleA/scaleB default to ScaleIn::One): tfloat32_t A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x232x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // RS variant: only B declares a K-major GMMA::smem_desc fragment; no FrgTypeA is given here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,232,8); A uses GMMA::ALayout_64x8 rather than an ABLayout; ThrID has 128 elements.
+  using Shape_MNK = Shape<_64,_232,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x8;
+  using BLayout = GMMA::ABLayout<232,  8>;
+  using CLayout = GMMA::CLayout_64x232;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x240x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x240x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above (scaleA/scaleB default to ScaleIn::One): tfloat32_t A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x240x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // SS variant: A and B fragments are both K-major GMMA::smem_desc types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,240,8); ThrID is a 128-element layout (presumably one warpgroup - confirm).
+  using Shape_MNK = Shape<_64,_240,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64,  8>;
+  using BLayout = GMMA::ABLayout<240,  8>;
+  using CLayout = GMMA::CLayout_64x240;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x240x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x240x8_F32TF32TF32_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above (scaleA/scaleB default to ScaleIn::One): tfloat32_t A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x240x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // RS variant: only B declares a K-major GMMA::smem_desc fragment; no FrgTypeA is given here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,240,8); A uses GMMA::ALayout_64x8 rather than an ABLayout; ThrID has 128 elements.
+  using Shape_MNK = Shape<_64,_240,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x8;
+  using BLayout = GMMA::ABLayout<240,  8>;
+  using CLayout = GMMA::CLayout_64x240;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x248x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x248x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above (scaleA/scaleB default to ScaleIn::One): tfloat32_t A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x248x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // SS variant: A and B fragments are both K-major GMMA::smem_desc types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,248,8); ThrID is a 128-element layout (presumably one warpgroup - confirm).
+  using Shape_MNK = Shape<_64,_248,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64,  8>;
+  using BLayout = GMMA::ABLayout<248,  8>;
+  using CLayout = GMMA::CLayout_64x248;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x248x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x248x8_F32TF32TF32_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above (scaleA/scaleB default to ScaleIn::One): tfloat32_t A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x248x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = tfloat32_t;
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // RS variant: only B declares a K-major GMMA::smem_desc fragment; no FrgTypeA is given here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,248,8); A uses GMMA::ALayout_64x8 rather than an ABLayout; ThrID has 128 elements.
+  using Shape_MNK = Shape<_64,_248,_8>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x8;
+  using BLayout = GMMA::ABLayout<248,  8>;
+  using CLayout = GMMA::CLayout_64x248;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x24x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x24x32_S32S8S8_SS_TN;
+// MMA_Traits for the alias above: int8_t A/B values, int32_t C/D values, 64x24x32 tile.
+template <>
+struct MMA_Traits<SM90_64x24x32_S32S8S8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // SS variant: A and B fragments are both K-major GMMA::smem_desc types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,24,32); ThrID is a 128-element layout (presumably one warpgroup - confirm).
+  using Shape_MNK = Shape<_64,_24,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 24, 32>;
+  using CLayout = GMMA::CLayout_64x24;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x24x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x24x32_S32S8S8_SS_TN_SATURATE;
+// MMA_Traits for the _SATURATE op alias above: int8_t A/B values, int32_t C/D values, 64x24x32 tile.
+template <>
+struct MMA_Traits<SM90_64x24x32_S32S8S8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // SS variant: A and B fragments are both K-major GMMA::smem_desc types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,24,32); ThrID is a 128-element layout (presumably one warpgroup - confirm).
+  using Shape_MNK = Shape<_64,_24,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 24, 32>;
+  using CLayout = GMMA::CLayout_64x24;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x48x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x48x32_S32S8S8_SS_TN;
+// MMA_Traits for the alias above: int8_t A/B values, int32_t C/D values, 64x48x32 tile.
+template <>
+struct MMA_Traits<SM90_64x48x32_S32S8S8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // SS variant: A and B fragments are both K-major GMMA::smem_desc types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,48,32); ThrID is a 128-element layout (presumably one warpgroup - confirm).
+  using Shape_MNK = Shape<_64,_48,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 48, 32>;
+  using CLayout = GMMA::CLayout_64x48;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x48x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x48x32_S32S8S8_SS_TN_SATURATE;
+// MMA_Traits for the _SATURATE op alias above: int8_t A/B values, int32_t C/D values, 64x48x32 tile.
+template <>
+struct MMA_Traits<SM90_64x48x32_S32S8S8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // SS variant: A and B fragments are both K-major GMMA::smem_desc types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,48,32); ThrID is a 128-element layout (presumably one warpgroup - confirm).
+  using Shape_MNK = Shape<_64,_48,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 48, 32>;
+  using CLayout = GMMA::CLayout_64x48;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x80x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x80x32_S32S8S8_SS_TN;
+// MMA_Traits for the alias above: int8_t A/B values, int32_t C/D values, 64x80x32 tile.
+template <>
+struct MMA_Traits<SM90_64x80x32_S32S8S8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // SS variant: A and B fragments are both K-major GMMA::smem_desc types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,80,32); ThrID is a 128-element layout (presumably one warpgroup - confirm).
+  using Shape_MNK = Shape<_64,_80,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 80, 32>;
+  using CLayout = GMMA::CLayout_64x80;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x80x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x80x32_S32S8S8_SS_TN_SATURATE;
+// MMA_Traits for the _SATURATE op alias above: int8_t A/B values, int32_t C/D values, 64x80x32 tile.
+template <>
+struct MMA_Traits<SM90_64x80x32_S32S8S8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // SS variant: A and B fragments are both K-major GMMA::smem_desc types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,80,32); ThrID is a 128-element layout (presumably one warpgroup - confirm).
+  using Shape_MNK = Shape<_64,_80,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 80, 32>;
+  using CLayout = GMMA::CLayout_64x80;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x112x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x112x32_S32S8S8_SS_TN;
+// MMA_Traits for the alias above: int8_t A/B values, int32_t C/D values, 64x112x32 tile.
+template <>
+struct MMA_Traits<SM90_64x112x32_S32S8S8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // SS variant: A and B fragments are both K-major GMMA::smem_desc types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,112,32); ThrID is a 128-element layout (presumably one warpgroup - confirm).
+  using Shape_MNK = Shape<_64,_112,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<112, 32>;
+  using CLayout = GMMA::CLayout_64x112;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x112x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x112x32_S32S8S8_SS_TN_SATURATE;
+// MMA_Traits for the _SATURATE op alias above: int8_t A/B values, int32_t C/D values, 64x112x32 tile.
+template <>
+struct MMA_Traits<SM90_64x112x32_S32S8S8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // SS variant: A and B fragments are both K-major GMMA::smem_desc types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,112,32); ThrID is a 128-element layout (presumably one warpgroup - confirm).
+  using Shape_MNK = Shape<_64,_112,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<112, 32>;
+  using CLayout = GMMA::CLayout_64x112;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x144x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x144x32_S32S8S8_SS_TN;
+// MMA_Traits for the alias above: int8_t A/B values, int32_t C/D values, 64x144x32 tile.
+template <>
+struct MMA_Traits<SM90_64x144x32_S32S8S8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // SS variant: A and B fragments are both K-major GMMA::smem_desc types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,144,32); ThrID is a 128-element layout (presumably one warpgroup - confirm).
+  using Shape_MNK = Shape<_64,_144,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<144, 32>;
+  using CLayout = GMMA::CLayout_64x144;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x144x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x144x32_S32S8S8_SS_TN_SATURATE;
+// MMA_Traits for the _SATURATE op alias above: int8_t A/B values, int32_t C/D values, 64x144x32 tile.
+template <>
+struct MMA_Traits<SM90_64x144x32_S32S8S8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // SS variant: A and B fragments are both K-major GMMA::smem_desc types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,144,32); ThrID is a 128-element layout (presumably one warpgroup - confirm).
+  using Shape_MNK = Shape<_64,_144,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<144, 32>;
+  using CLayout = GMMA::CLayout_64x144;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x160x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x160x32_S32S8S8_SS_TN;
+// MMA_Traits for the alias above: int8_t A/B values, int32_t C/D values, 64x160x32 tile.
+template <>
+struct MMA_Traits<SM90_64x160x32_S32S8S8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // SS variant: A and B fragments are both K-major GMMA::smem_desc types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,160,32); ThrID is a 128-element layout (presumably one warpgroup - confirm).
+  using Shape_MNK = Shape<_64,_160,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<160, 32>;
+  using CLayout = GMMA::CLayout_64x160;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x160x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x160x32_S32S8S8_SS_TN_SATURATE;
+// MMA_Traits for the _SATURATE op alias above: int8_t A/B values, int32_t C/D values, 64x160x32 tile.
+template <>
+struct MMA_Traits<SM90_64x160x32_S32S8S8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // SS variant: A and B fragments are both K-major GMMA::smem_desc types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,160,32); ThrID is a 128-element layout (presumably one warpgroup - confirm).
+  using Shape_MNK = Shape<_64,_160,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<160, 32>;
+  using CLayout = GMMA::CLayout_64x160;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x176x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x176x32_S32S8S8_SS_TN;
+// MMA_Traits for the alias above: int8_t A/B values, int32_t C/D values, 64x176x32 tile.
+template <>
+struct MMA_Traits<SM90_64x176x32_S32S8S8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // SS variant: A and B fragments are both K-major GMMA::smem_desc types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,176,32); ThrID is a 128-element layout (presumably one warpgroup - confirm).
+  using Shape_MNK = Shape<_64,_176,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<176, 32>;
+  using CLayout = GMMA::CLayout_64x176;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x176x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x176x32_S32S8S8_SS_TN_SATURATE;
+// MMA_Traits for the _SATURATE op alias above: int8_t A/B values, int32_t C/D values, 64x176x32 tile.
+template <>
+struct MMA_Traits<SM90_64x176x32_S32S8S8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // SS variant: A and B fragments are both K-major GMMA::smem_desc types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,176,32); ThrID is a 128-element layout (presumably one warpgroup - confirm).
+  using Shape_MNK = Shape<_64,_176,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<176, 32>;
+  using CLayout = GMMA::CLayout_64x176;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x208x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x208x32_S32S8S8_SS_TN;
+// MMA_Traits for the alias above: int8_t A/B values, int32_t C/D values, 64x208x32 tile.
+template <>
+struct MMA_Traits<SM90_64x208x32_S32S8S8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // SS variant: A and B fragments are both K-major GMMA::smem_desc types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,208,32); ThrID is a 128-element layout (presumably one warpgroup - confirm).
+  using Shape_MNK = Shape<_64,_208,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<208, 32>;
+  using CLayout = GMMA::CLayout_64x208;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x208x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x208x32_S32S8S8_SS_TN_SATURATE;
+// MMA_Traits for the _SATURATE op alias above: int8_t A/B values, int32_t C/D values, 64x208x32 tile.
+template <>
+struct MMA_Traits<SM90_64x208x32_S32S8S8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // SS variant: A and B fragments are both K-major GMMA::smem_desc types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,208,32); ThrID is a 128-element layout (presumably one warpgroup - confirm).
+  using Shape_MNK = Shape<_64,_208,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<208, 32>;
+  using CLayout = GMMA::CLayout_64x208;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x224x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x224x32_S32S8S8_SS_TN;
+// MMA_Traits for the alias above: int8_t A/B values, int32_t C/D values, 64x224x32 tile.
+template <>
+struct MMA_Traits<SM90_64x224x32_S32S8S8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // SS variant: A and B fragments are both K-major GMMA::smem_desc types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,224,32); ThrID is a 128-element layout (presumably one warpgroup - confirm).
+  using Shape_MNK = Shape<_64,_224,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<224, 32>;
+  using CLayout = GMMA::CLayout_64x224;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x224x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x224x32_S32S8S8_SS_TN_SATURATE;
+// MMA_Traits for the _SATURATE op alias above: int8_t A/B values, int32_t C/D values, 64x224x32 tile.
+template <>
+struct MMA_Traits<SM90_64x224x32_S32S8S8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // SS variant: A and B fragments are both K-major GMMA::smem_desc types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,224,32); ThrID is a 128-element layout (presumably one warpgroup - confirm).
+  using Shape_MNK = Shape<_64,_224,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<224, 32>;
+  using CLayout = GMMA::CLayout_64x224;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x240x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x240x32_S32S8S8_SS_TN;
+// MMA_Traits for the alias above: int8_t A/B values, int32_t C/D values, 64x240x32 tile.
+template <>
+struct MMA_Traits<SM90_64x240x32_S32S8S8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // SS variant: A and B fragments are both K-major GMMA::smem_desc types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,240,32); ThrID is a 128-element layout (presumably one warpgroup - confirm).
+  using Shape_MNK = Shape<_64,_240,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<240, 32>;
+  using CLayout = GMMA::CLayout_64x240;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x240x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x240x32_S32S8S8_SS_TN_SATURATE;
+// MMA_Traits for the _SATURATE op alias above: int8_t A/B values, int32_t C/D values, 64x240x32 tile.
+template <>
+struct MMA_Traits<SM90_64x240x32_S32S8S8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // SS variant: A and B fragments are both K-major GMMA::smem_desc types.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,240,32); ThrID is a 128-element layout (presumably one warpgroup - confirm).
+  using Shape_MNK = Shape<_64,_240,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<240, 32>;
+  using CLayout = GMMA::CLayout_64x240;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x24x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x24x32_S32S8S8_RS_TN; 
+// MMA_Traits for the alias above: int8_t A/B values, int32_t C/D values, 64x24x32 tile.
+template <>
+struct MMA_Traits<SM90_64x24x32_S32S8S8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // RS variant: only B declares a K-major GMMA::smem_desc fragment; no FrgTypeA is given here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,24,32); A uses GMMA::ALayout_64x32 rather than an ABLayout; ThrID has 128 elements.
+  using Shape_MNK = Shape<_64,_24,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 24, 32>;
+  using CLayout = GMMA::CLayout_64x24;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x24x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x24x32_S32S8S8_RS_TN_SATURATE; 
+// MMA_Traits for the _SATURATE op alias above: int8_t A/B values, int32_t C/D values, 64x24x32 tile.
+template <>
+struct MMA_Traits<SM90_64x24x32_S32S8S8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // RS variant: only B declares a K-major GMMA::smem_desc fragment; no FrgTypeA is given here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,24,32); A uses GMMA::ALayout_64x32 rather than an ABLayout; ThrID has 128 elements.
+  using Shape_MNK = Shape<_64,_24,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 24, 32>;
+  using CLayout = GMMA::CLayout_64x24;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x48x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x48x32_S32S8S8_RS_TN; 
+// MMA_Traits for the alias above: int8_t A/B values, int32_t C/D values, 64x48x32 tile.
+template <>
+struct MMA_Traits<SM90_64x48x32_S32S8S8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // RS variant: only B declares a K-major GMMA::smem_desc fragment; no FrgTypeA is given here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,48,32); A uses GMMA::ALayout_64x32 rather than an ABLayout; ThrID has 128 elements.
+  using Shape_MNK = Shape<_64,_48,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 48, 32>;
+  using CLayout = GMMA::CLayout_64x48;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x48x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x48x32_S32S8S8_RS_TN_SATURATE; 
+// MMA_Traits for the _SATURATE op alias above: int8_t A/B values, int32_t C/D values, 64x48x32 tile.
+template <>
+struct MMA_Traits<SM90_64x48x32_S32S8S8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // RS variant: only B declares a K-major GMMA::smem_desc fragment; no FrgTypeA is given here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,48,32); A uses GMMA::ALayout_64x32 rather than an ABLayout; ThrID has 128 elements.
+  using Shape_MNK = Shape<_64,_48,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 48, 32>;
+  using CLayout = GMMA::CLayout_64x48;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x80x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x80x32_S32S8S8_RS_TN; 
+// MMA_Traits for the alias above: int8_t A/B values, int32_t C/D values, 64x80x32 tile.
+template <>
+struct MMA_Traits<SM90_64x80x32_S32S8S8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // RS variant: only B declares a K-major GMMA::smem_desc fragment; no FrgTypeA is given here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,80,32); A uses GMMA::ALayout_64x32 rather than an ABLayout; ThrID has 128 elements.
+  using Shape_MNK = Shape<_64,_80,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 80, 32>;
+  using CLayout = GMMA::CLayout_64x80;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x80x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x80x32_S32S8S8_RS_TN_SATURATE; 
+// MMA_Traits for the _SATURATE op alias above: int8_t A/B values, int32_t C/D values, 64x80x32 tile.
+template <>
+struct MMA_Traits<SM90_64x80x32_S32S8S8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // RS variant: only B declares a K-major GMMA::smem_desc fragment; no FrgTypeA is given here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,80,32); A uses GMMA::ALayout_64x32 rather than an ABLayout; ThrID has 128 elements.
+  using Shape_MNK = Shape<_64,_80,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 80, 32>;
+  using CLayout = GMMA::CLayout_64x80;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x112x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x112x32_S32S8S8_RS_TN; 
+// MMA_Traits for the alias above: int8_t A/B values, int32_t C/D values, 64x112x32 tile.
+template <>
+struct MMA_Traits<SM90_64x112x32_S32S8S8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // RS variant: only B declares a K-major GMMA::smem_desc fragment; no FrgTypeA is given here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,112,32); A uses GMMA::ALayout_64x32 rather than an ABLayout; ThrID has 128 elements.
+  using Shape_MNK = Shape<_64,_112,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<112, 32>;
+  using CLayout = GMMA::CLayout_64x112;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x112x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x112x32_S32S8S8_RS_TN_SATURATE; 
+// MMA_Traits for the _SATURATE op alias above: int8_t A/B values, int32_t C/D values, 64x112x32 tile.
+template <>
+struct MMA_Traits<SM90_64x112x32_S32S8S8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // RS variant: only B declares a K-major GMMA::smem_desc fragment; no FrgTypeA is given here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,112,32); A uses GMMA::ALayout_64x32 rather than an ABLayout; ThrID has 128 elements.
+  using Shape_MNK = Shape<_64,_112,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<112, 32>;
+  using CLayout = GMMA::CLayout_64x112;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x144x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x144x32_S32S8S8_RS_TN; 
+// MMA_Traits for the alias above: int8_t A/B values, int32_t C/D values, 64x144x32 tile.
+template <>
+struct MMA_Traits<SM90_64x144x32_S32S8S8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // RS variant: only B declares a K-major GMMA::smem_desc fragment; no FrgTypeA is given here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,144,32); A uses GMMA::ALayout_64x32 rather than an ABLayout; ThrID has 128 elements.
+  using Shape_MNK = Shape<_64,_144,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<144, 32>;
+  using CLayout = GMMA::CLayout_64x144;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x144x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x144x32_S32S8S8_RS_TN_SATURATE; 
+// MMA_Traits for the _SATURATE op alias above: int8_t A/B values, int32_t C/D values, 64x144x32 tile.
+template <>
+struct MMA_Traits<SM90_64x144x32_S32S8S8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // RS variant: only B declares a K-major GMMA::smem_desc fragment; no FrgTypeA is given here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,144,32); A uses GMMA::ALayout_64x32 rather than an ABLayout; ThrID has 128 elements.
+  using Shape_MNK = Shape<_64,_144,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<144, 32>;
+  using CLayout = GMMA::CLayout_64x144;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x160x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x160x32_S32S8S8_RS_TN; 
+// MMA_Traits for the alias above: int8_t A/B values, int32_t C/D values, 64x160x32 tile.
+template <>
+struct MMA_Traits<SM90_64x160x32_S32S8S8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // RS variant: only B declares a K-major GMMA::smem_desc fragment; no FrgTypeA is given here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,160,32); A uses GMMA::ALayout_64x32 rather than an ABLayout; ThrID has 128 elements.
+  using Shape_MNK = Shape<_64,_160,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<160, 32>;
+  using CLayout = GMMA::CLayout_64x160;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x160x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x160x32_S32S8S8_RS_TN_SATURATE; 
+// MMA_Traits for the _SATURATE op alias above: int8_t A/B values, int32_t C/D values, 64x160x32 tile.
+template <>
+struct MMA_Traits<SM90_64x160x32_S32S8S8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // RS variant: only B declares a K-major GMMA::smem_desc fragment; no FrgTypeA is given here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,160,32); A uses GMMA::ALayout_64x32 rather than an ABLayout; ThrID has 128 elements.
+  using Shape_MNK = Shape<_64,_160,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<160, 32>;
+  using CLayout = GMMA::CLayout_64x160;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x176x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x176x32_S32S8S8_RS_TN; 
+// MMA_Traits for the alias above: int8_t A/B values, int32_t C/D values, 64x176x32 tile.
+template <>
+struct MMA_Traits<SM90_64x176x32_S32S8S8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = int8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  // RS variant: only B declares a K-major GMMA::smem_desc fragment; no FrgTypeA is given here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) = (64,176,32); A uses GMMA::ALayout_64x32 rather than an ABLayout; ThrID has 128 elements.
+  using Shape_MNK = Shape<_64,_176,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<176, 32>;
+  using CLayout = GMMA::CLayout_64x176;
+  // Defaults to ScaleOut::One; presumably read by the op to choose accumulate vs. overwrite (confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x176x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x176x32_S32S8S8_RS_TN_SATURATE; 
+// MMA_Traits specialization for the 64x176x32 GMMA op with int8 A, int8 B and int32 C/D (RS_TN_SATURATE).
+template <>
+struct MMA_Traits<SM90_64x176x32_S32S8S8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = int8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_176,_32>;  // M=64, N=176, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<176, 32>;
+  using CLayout = GMMA::CLayout_64x176;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x208x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x208x32_S32S8S8_RS_TN; 
+// MMA_Traits specialization for the 64x208x32 GMMA op with int8 A, int8 B and int32 C/D (RS_TN).
+template <>
+struct MMA_Traits<SM90_64x208x32_S32S8S8_RS_TN>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = int8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_208,_32>;  // M=64, N=208, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<208, 32>;
+  using CLayout = GMMA::CLayout_64x208;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x208x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x208x32_S32S8S8_RS_TN_SATURATE; 
+// MMA_Traits specialization for the 64x208x32 GMMA op with int8 A, int8 B and int32 C/D (RS_TN_SATURATE).
+template <>
+struct MMA_Traits<SM90_64x208x32_S32S8S8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = int8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_208,_32>;  // M=64, N=208, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<208, 32>;
+  using CLayout = GMMA::CLayout_64x208;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x224x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x224x32_S32S8S8_RS_TN; 
+// MMA_Traits specialization for the 64x224x32 GMMA op with int8 A, int8 B and int32 C/D (RS_TN).
+template <>
+struct MMA_Traits<SM90_64x224x32_S32S8S8_RS_TN>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = int8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_224,_32>;  // M=64, N=224, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<224, 32>;
+  using CLayout = GMMA::CLayout_64x224;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x224x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x224x32_S32S8S8_RS_TN_SATURATE; 
+// MMA_Traits specialization for the 64x224x32 GMMA op with int8 A, int8 B and int32 C/D (RS_TN_SATURATE).
+template <>
+struct MMA_Traits<SM90_64x224x32_S32S8S8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = int8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_224,_32>;  // M=64, N=224, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<224, 32>;
+  using CLayout = GMMA::CLayout_64x224;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x240x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x240x32_S32S8S8_RS_TN; 
+// MMA_Traits specialization for the 64x240x32 GMMA op with int8 A, int8 B and int32 C/D (RS_TN).
+template <>
+struct MMA_Traits<SM90_64x240x32_S32S8S8_RS_TN>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = int8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_240,_32>;  // M=64, N=240, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<240, 32>;
+  using CLayout = GMMA::CLayout_64x240;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x240x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x240x32_S32S8S8_RS_TN_SATURATE; 
+// MMA_Traits specialization for the 64x240x32 GMMA op with int8 A, int8 B and int32 C/D (RS_TN_SATURATE).
+template <>
+struct MMA_Traits<SM90_64x240x32_S32S8S8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = int8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_240,_32>;  // M=64, N=240, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<240, 32>;
+  using CLayout = GMMA::CLayout_64x240;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x24x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x24x32_S32S8U8_SS_TN;
+// MMA_Traits specialization for the 64x24x32 GMMA op with int8 A, uint8 B and int32 C/D (SS_TN).
+template <>
+struct MMA_Traits<SM90_64x24x32_S32S8U8_SS_TN>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: GMMA smem_desc with Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_24,_32>;  // M=64, N=24, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 24, 32>;
+  using CLayout = GMMA::CLayout_64x24;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x24x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x24x32_S32S8U8_SS_TN_SATURATE;
+// MMA_Traits specialization for the 64x24x32 GMMA op with int8 A, uint8 B and int32 C/D (SS_TN_SATURATE).
+template <>
+struct MMA_Traits<SM90_64x24x32_S32S8U8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: GMMA smem_desc with Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_24,_32>;  // M=64, N=24, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 24, 32>;
+  using CLayout = GMMA::CLayout_64x24;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x48x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x48x32_S32S8U8_SS_TN;
+// MMA_Traits specialization for the 64x48x32 GMMA op with int8 A, uint8 B and int32 C/D (SS_TN).
+template <>
+struct MMA_Traits<SM90_64x48x32_S32S8U8_SS_TN>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: GMMA smem_desc with Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_48,_32>;  // M=64, N=48, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 48, 32>;
+  using CLayout = GMMA::CLayout_64x48;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x48x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x48x32_S32S8U8_SS_TN_SATURATE;
+// MMA_Traits specialization for the 64x48x32 GMMA op with int8 A, uint8 B and int32 C/D (SS_TN_SATURATE).
+template <>
+struct MMA_Traits<SM90_64x48x32_S32S8U8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: GMMA smem_desc with Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_48,_32>;  // M=64, N=48, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 48, 32>;
+  using CLayout = GMMA::CLayout_64x48;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x80x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x80x32_S32S8U8_SS_TN;
+// MMA_Traits specialization for the 64x80x32 GMMA op with int8 A, uint8 B and int32 C/D (SS_TN).
+template <>
+struct MMA_Traits<SM90_64x80x32_S32S8U8_SS_TN>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: GMMA smem_desc with Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_80,_32>;  // M=64, N=80, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 80, 32>;
+  using CLayout = GMMA::CLayout_64x80;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x80x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x80x32_S32S8U8_SS_TN_SATURATE;
+// MMA_Traits specialization for the 64x80x32 GMMA op with int8 A, uint8 B and int32 C/D (SS_TN_SATURATE).
+template <>
+struct MMA_Traits<SM90_64x80x32_S32S8U8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: GMMA smem_desc with Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_80,_32>;  // M=64, N=80, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 80, 32>;
+  using CLayout = GMMA::CLayout_64x80;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x112x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x112x32_S32S8U8_SS_TN;
+// MMA_Traits specialization for the 64x112x32 GMMA op with int8 A, uint8 B and int32 C/D (SS_TN).
+template <>
+struct MMA_Traits<SM90_64x112x32_S32S8U8_SS_TN>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: GMMA smem_desc with Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_112,_32>;  // M=64, N=112, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<112, 32>;
+  using CLayout = GMMA::CLayout_64x112;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x112x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x112x32_S32S8U8_SS_TN_SATURATE;
+// MMA_Traits specialization for the 64x112x32 GMMA op with int8 A, uint8 B and int32 C/D (SS_TN_SATURATE).
+template <>
+struct MMA_Traits<SM90_64x112x32_S32S8U8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: GMMA smem_desc with Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_112,_32>;  // M=64, N=112, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<112, 32>;
+  using CLayout = GMMA::CLayout_64x112;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x144x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x144x32_S32S8U8_SS_TN;
+// MMA_Traits specialization for the 64x144x32 GMMA op with int8 A, uint8 B and int32 C/D (SS_TN).
+template <>
+struct MMA_Traits<SM90_64x144x32_S32S8U8_SS_TN>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: GMMA smem_desc with Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_144,_32>;  // M=64, N=144, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<144, 32>;
+  using CLayout = GMMA::CLayout_64x144;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x144x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x144x32_S32S8U8_SS_TN_SATURATE;
+// MMA_Traits specialization for the 64x144x32 GMMA op with int8 A, uint8 B and int32 C/D (SS_TN_SATURATE).
+template <>
+struct MMA_Traits<SM90_64x144x32_S32S8U8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: GMMA smem_desc with Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_144,_32>;  // M=64, N=144, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<144, 32>;
+  using CLayout = GMMA::CLayout_64x144;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x160x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x160x32_S32S8U8_SS_TN;
+// MMA_Traits specialization for the 64x160x32 GMMA op with int8 A, uint8 B and int32 C/D (SS_TN).
+template <>
+struct MMA_Traits<SM90_64x160x32_S32S8U8_SS_TN>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: GMMA smem_desc with Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_160,_32>;  // M=64, N=160, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<160, 32>;
+  using CLayout = GMMA::CLayout_64x160;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x160x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x160x32_S32S8U8_SS_TN_SATURATE;
+// MMA_Traits specialization for the 64x160x32 GMMA op with int8 A, uint8 B and int32 C/D (SS_TN_SATURATE).
+template <>
+struct MMA_Traits<SM90_64x160x32_S32S8U8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: GMMA smem_desc with Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_160,_32>;  // M=64, N=160, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<160, 32>;
+  using CLayout = GMMA::CLayout_64x160;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x176x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x176x32_S32S8U8_SS_TN;
+// MMA_Traits specialization for the 64x176x32 GMMA op with int8 A, uint8 B and int32 C/D (SS_TN).
+template <>
+struct MMA_Traits<SM90_64x176x32_S32S8U8_SS_TN>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: GMMA smem_desc with Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_176,_32>;  // M=64, N=176, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<176, 32>;
+  using CLayout = GMMA::CLayout_64x176;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x176x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x176x32_S32S8U8_SS_TN_SATURATE;
+// MMA_Traits specialization for the 64x176x32 GMMA op with int8 A, uint8 B and int32 C/D (SS_TN_SATURATE).
+template <>
+struct MMA_Traits<SM90_64x176x32_S32S8U8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: GMMA smem_desc with Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_176,_32>;  // M=64, N=176, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<176, 32>;
+  using CLayout = GMMA::CLayout_64x176;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x208x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x208x32_S32S8U8_SS_TN;
+// MMA_Traits specialization for the 64x208x32 GMMA op with int8 A, uint8 B and int32 C/D (SS_TN).
+template <>
+struct MMA_Traits<SM90_64x208x32_S32S8U8_SS_TN>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: GMMA smem_desc with Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_208,_32>;  // M=64, N=208, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<208, 32>;
+  using CLayout = GMMA::CLayout_64x208;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x208x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x208x32_S32S8U8_SS_TN_SATURATE;
+// MMA_Traits specialization for the 64x208x32 GMMA op with int8 A, uint8 B and int32 C/D (SS_TN_SATURATE).
+template <>
+struct MMA_Traits<SM90_64x208x32_S32S8U8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: GMMA smem_desc with Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_208,_32>;  // M=64, N=208, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<208, 32>;
+  using CLayout = GMMA::CLayout_64x208;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x224x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x224x32_S32S8U8_SS_TN;
+// MMA_Traits specialization for the 64x224x32 GMMA op with int8 A, uint8 B and int32 C/D (SS_TN).
+template <>
+struct MMA_Traits<SM90_64x224x32_S32S8U8_SS_TN>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: GMMA smem_desc with Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_224,_32>;  // M=64, N=224, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<224, 32>;
+  using CLayout = GMMA::CLayout_64x224;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x224x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x224x32_S32S8U8_SS_TN_SATURATE;
+// MMA_Traits specialization for the 64x224x32 GMMA op with int8 A, uint8 B and int32 C/D (SS_TN_SATURATE).
+template <>
+struct MMA_Traits<SM90_64x224x32_S32S8U8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: GMMA smem_desc with Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_224,_32>;  // M=64, N=224, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<224, 32>;
+  using CLayout = GMMA::CLayout_64x224;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x240x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x240x32_S32S8U8_SS_TN;
+// MMA_Traits specialization for the 64x240x32 GMMA op with int8 A, uint8 B and int32 C/D (SS_TN).
+template <>
+struct MMA_Traits<SM90_64x240x32_S32S8U8_SS_TN>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: GMMA smem_desc with Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_240,_32>;  // M=64, N=240, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<240, 32>;
+  using CLayout = GMMA::CLayout_64x240;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x240x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x240x32_S32S8U8_SS_TN_SATURATE;
+// MMA_Traits specialization for the 64x240x32 GMMA op with int8 A, uint8 B and int32 C/D (SS_TN_SATURATE).
+template <>
+struct MMA_Traits<SM90_64x240x32_S32S8U8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: GMMA smem_desc with Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_240,_32>;  // M=64, N=240, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<240, 32>;
+  using CLayout = GMMA::CLayout_64x240;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x24x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x24x32_S32S8U8_RS_TN; 
+// MMA_Traits specialization for the 64x24x32 GMMA op with int8 A, uint8 B and int32 C/D (RS_TN).
+template <>
+struct MMA_Traits<SM90_64x24x32_S32S8U8_RS_TN>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_24,_32>;  // M=64, N=24, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 24, 32>;
+  using CLayout = GMMA::CLayout_64x24;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x24x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x24x32_S32S8U8_RS_TN_SATURATE; 
+// MMA_Traits specialization for the 64x24x32 GMMA op with int8 A, uint8 B and int32 C/D (RS_TN_SATURATE).
+template <>
+struct MMA_Traits<SM90_64x24x32_S32S8U8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_24,_32>;  // M=64, N=24, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 24, 32>;
+  using CLayout = GMMA::CLayout_64x24;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x48x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x48x32_S32S8U8_RS_TN; 
+// MMA_Traits specialization for the 64x48x32 GMMA op with int8 A, uint8 B and int32 C/D (RS_TN).
+template <>
+struct MMA_Traits<SM90_64x48x32_S32S8U8_RS_TN>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_48,_32>;  // M=64, N=48, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 48, 32>;
+  using CLayout = GMMA::CLayout_64x48;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x48x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x48x32_S32S8U8_RS_TN_SATURATE; 
+// MMA_Traits specialization for the 64x48x32 GMMA op with int8 A, uint8 B and int32 C/D (RS_TN_SATURATE).
+template <>
+struct MMA_Traits<SM90_64x48x32_S32S8U8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_48,_32>;  // M=64, N=48, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 48, 32>;
+  using CLayout = GMMA::CLayout_64x48;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x80x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x80x32_S32S8U8_RS_TN; 
+// MMA_Traits specialization for the 64x80x32 GMMA op with int8 A, uint8 B and int32 C/D (RS_TN).
+template <>
+struct MMA_Traits<SM90_64x80x32_S32S8U8_RS_TN>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_80,_32>;  // M=64, N=80, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 80, 32>;
+  using CLayout = GMMA::CLayout_64x80;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x80x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x80x32_S32S8U8_RS_TN_SATURATE; 
+// MMA_Traits specialization for the 64x80x32 GMMA op with int8 A, uint8 B and int32 C/D (RS_TN_SATURATE).
+template <>
+struct MMA_Traits<SM90_64x80x32_S32S8U8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_80,_32>;  // M=64, N=80, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 80, 32>;
+  using CLayout = GMMA::CLayout_64x80;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x112x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x112x32_S32S8U8_RS_TN; 
+// MMA_Traits specialization for the 64x112x32 GMMA op with int8 A, uint8 B and int32 C/D (RS_TN).
+template <>
+struct MMA_Traits<SM90_64x112x32_S32S8U8_RS_TN>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_112,_32>;  // M=64, N=112, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<112, 32>;
+  using CLayout = GMMA::CLayout_64x112;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x112x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x112x32_S32S8U8_RS_TN_SATURATE; 
+// MMA_Traits specialization for the 64x112x32 GMMA op with int8 A, uint8 B and int32 C/D (RS_TN_SATURATE).
+template <>
+struct MMA_Traits<SM90_64x112x32_S32S8U8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_112,_32>;  // M=64, N=112, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<112, 32>;
+  using CLayout = GMMA::CLayout_64x112;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x144x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x144x32_S32S8U8_RS_TN; 
+// MMA_Traits specialization for the 64x144x32 GMMA op with int8 A, uint8 B and int32 C/D (RS_TN).
+template <>
+struct MMA_Traits<SM90_64x144x32_S32S8U8_RS_TN>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_144,_32>;  // M=64, N=144, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<144, 32>;
+  using CLayout = GMMA::CLayout_64x144;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x144x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x144x32_S32S8U8_RS_TN_SATURATE; 
+// MMA_Traits specialization for the 64x144x32 GMMA op with int8 A, uint8 B and int32 C/D (RS_TN_SATURATE).
+template <>
+struct MMA_Traits<SM90_64x144x32_S32S8U8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_144,_32>;  // M=64, N=144, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<144, 32>;
+  using CLayout = GMMA::CLayout_64x144;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x160x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x160x32_S32S8U8_RS_TN; 
+// MMA_Traits specialization for the 64x160x32 GMMA op with int8 A, uint8 B and int32 C/D (RS_TN).
+template <>
+struct MMA_Traits<SM90_64x160x32_S32S8U8_RS_TN>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_160,_32>;  // M=64, N=160, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<160, 32>;
+  using CLayout = GMMA::CLayout_64x160;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x160x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x160x32_S32S8U8_RS_TN_SATURATE; 
+// MMA_Traits specialization for the 64x160x32 GMMA op with int8 A, uint8 B and int32 C/D (RS_TN_SATURATE).
+template <>
+struct MMA_Traits<SM90_64x160x32_S32S8U8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_160,_32>;  // M=64, N=160, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<160, 32>;
+  using CLayout = GMMA::CLayout_64x160;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x176x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x176x32_S32S8U8_RS_TN; 
+// MMA_Traits specialization for the 64x176x32 GMMA op with int8 A, uint8 B and int32 C/D (RS_TN).
+template <>
+struct MMA_Traits<SM90_64x176x32_S32S8U8_RS_TN>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_176,_32>;  // M=64, N=176, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<176, 32>;
+  using CLayout = GMMA::CLayout_64x176;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x176x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x176x32_S32S8U8_RS_TN_SATURATE; 
+// MMA_Traits specialization for the 64x176x32 GMMA op with int8 A, uint8 B and int32 C/D (RS_TN_SATURATE).
+template <>
+struct MMA_Traits<SM90_64x176x32_S32S8U8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_176,_32>;  // M=64, N=176, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<176, 32>;
+  using CLayout = GMMA::CLayout_64x176;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x208x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x208x32_S32S8U8_RS_TN; 
+// MMA_Traits specialization for the 64x208x32 GMMA op with int8 A, uint8 B and int32 C/D (RS_TN).
+template <>
+struct MMA_Traits<SM90_64x208x32_S32S8U8_RS_TN>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_208,_32>;  // M=64, N=208, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<208, 32>;
+  using CLayout = GMMA::CLayout_64x208;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x208x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x208x32_S32S8U8_RS_TN_SATURATE; 
+// MMA_Traits specialization for the 64x208x32 GMMA op with int8 A, uint8 B and int32 C/D (RS_TN_SATURATE).
+template <>
+struct MMA_Traits<SM90_64x208x32_S32S8U8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_208,_32>;  // M=64, N=208, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<208, 32>;
+  using CLayout = GMMA::CLayout_64x208;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x224x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x224x32_S32S8U8_RS_TN; 
+// MMA_Traits specialization for the 64x224x32 GMMA op with int8 A, uint8 B and int32 C/D (RS_TN).
+template <>
+struct MMA_Traits<SM90_64x224x32_S32S8U8_RS_TN>
+{
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = int8_t;  // element type of A
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: GMMA smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_224,_32>;  // M=64, N=224, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<224, 32>;
+  using CLayout = GMMA::CLayout_64x224;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-object ScaleOut setting, initialized to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x224x32_S32S8U8_RS_TN_SATURATE.
+using SM90_64x224x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x224x32_S32S8U8_RS_TN_SATURATE;
+template <>
+struct MMA_Traits<SM90_64x224x32_S32S8U8_RS_TN_SATURATE> {
+  // Value types: A = int8_t, B = uint8_t, C = D = int32_t.
+  using ValTypeA = int8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // Only FrgTypeB is declared in this specialization: GMMA::smem_desc with Major::K.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x224x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _224, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<224, 32>;
+  using CLayout = GMMA::CLayout_64x224;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x240x32_S32S8U8_RS_TN.
+using SM90_64x240x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x240x32_S32S8U8_RS_TN;
+template <>
+struct MMA_Traits<SM90_64x240x32_S32S8U8_RS_TN> {
+  // Value types: A = int8_t, B = uint8_t, C = D = int32_t.
+  using ValTypeA = int8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // Only FrgTypeB is declared in this specialization: GMMA::smem_desc with Major::K.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x240x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _240, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<240, 32>;
+  using CLayout = GMMA::CLayout_64x240;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x240x32_S32S8U8_RS_TN_SATURATE.
+using SM90_64x240x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x240x32_S32S8U8_RS_TN_SATURATE;
+template <>
+struct MMA_Traits<SM90_64x240x32_S32S8U8_RS_TN_SATURATE> {
+  // Value types: A = int8_t, B = uint8_t, C = D = int32_t.
+  using ValTypeA = int8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // Only FrgTypeB is declared in this specialization: GMMA::smem_desc with Major::K.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x240x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _240, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<240, 32>;
+  using CLayout = GMMA::CLayout_64x240;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x24x32_S32U8S8_SS_TN.
+using SM90_64x24x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x24x32_S32U8S8_SS_TN;
+template <>
+struct MMA_Traits<SM90_64x24x32_S32U8S8_SS_TN> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // FrgTypeA and FrgTypeB are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x24x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _24, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<24, 32>;
+  using CLayout = GMMA::CLayout_64x24;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x24x32_S32U8S8_SS_TN_SATURATE.
+using SM90_64x24x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x24x32_S32U8S8_SS_TN_SATURATE;
+template <>
+struct MMA_Traits<SM90_64x24x32_S32U8S8_SS_TN_SATURATE> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // FrgTypeA and FrgTypeB are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x24x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _24, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<24, 32>;
+  using CLayout = GMMA::CLayout_64x24;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x48x32_S32U8S8_SS_TN.
+using SM90_64x48x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x48x32_S32U8S8_SS_TN;
+template <>
+struct MMA_Traits<SM90_64x48x32_S32U8S8_SS_TN> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // FrgTypeA and FrgTypeB are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x48x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _48, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<48, 32>;
+  using CLayout = GMMA::CLayout_64x48;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x48x32_S32U8S8_SS_TN_SATURATE.
+using SM90_64x48x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x48x32_S32U8S8_SS_TN_SATURATE;
+template <>
+struct MMA_Traits<SM90_64x48x32_S32U8S8_SS_TN_SATURATE> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // FrgTypeA and FrgTypeB are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x48x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _48, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<48, 32>;
+  using CLayout = GMMA::CLayout_64x48;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x80x32_S32U8S8_SS_TN.
+using SM90_64x80x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x80x32_S32U8S8_SS_TN;
+template <>
+struct MMA_Traits<SM90_64x80x32_S32U8S8_SS_TN> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // FrgTypeA and FrgTypeB are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x80x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _80, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<80, 32>;
+  using CLayout = GMMA::CLayout_64x80;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x80x32_S32U8S8_SS_TN_SATURATE.
+using SM90_64x80x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x80x32_S32U8S8_SS_TN_SATURATE;
+template <>
+struct MMA_Traits<SM90_64x80x32_S32U8S8_SS_TN_SATURATE> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // FrgTypeA and FrgTypeB are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x80x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _80, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<80, 32>;
+  using CLayout = GMMA::CLayout_64x80;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x112x32_S32U8S8_SS_TN.
+using SM90_64x112x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x112x32_S32U8S8_SS_TN;
+template <>
+struct MMA_Traits<SM90_64x112x32_S32U8S8_SS_TN> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // FrgTypeA and FrgTypeB are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x112x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _112, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<112, 32>;
+  using CLayout = GMMA::CLayout_64x112;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x112x32_S32U8S8_SS_TN_SATURATE.
+using SM90_64x112x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x112x32_S32U8S8_SS_TN_SATURATE;
+template <>
+struct MMA_Traits<SM90_64x112x32_S32U8S8_SS_TN_SATURATE> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // FrgTypeA and FrgTypeB are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x112x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _112, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<112, 32>;
+  using CLayout = GMMA::CLayout_64x112;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x144x32_S32U8S8_SS_TN.
+using SM90_64x144x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x144x32_S32U8S8_SS_TN;
+template <>
+struct MMA_Traits<SM90_64x144x32_S32U8S8_SS_TN> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // FrgTypeA and FrgTypeB are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x144x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _144, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<144, 32>;
+  using CLayout = GMMA::CLayout_64x144;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x144x32_S32U8S8_SS_TN_SATURATE.
+using SM90_64x144x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x144x32_S32U8S8_SS_TN_SATURATE;
+template <>
+struct MMA_Traits<SM90_64x144x32_S32U8S8_SS_TN_SATURATE> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // FrgTypeA and FrgTypeB are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x144x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _144, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<144, 32>;
+  using CLayout = GMMA::CLayout_64x144;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x160x32_S32U8S8_SS_TN.
+using SM90_64x160x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x160x32_S32U8S8_SS_TN;
+template <>
+struct MMA_Traits<SM90_64x160x32_S32U8S8_SS_TN> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // FrgTypeA and FrgTypeB are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x160x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _160, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<160, 32>;
+  using CLayout = GMMA::CLayout_64x160;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x160x32_S32U8S8_SS_TN_SATURATE.
+using SM90_64x160x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x160x32_S32U8S8_SS_TN_SATURATE;
+template <>
+struct MMA_Traits<SM90_64x160x32_S32U8S8_SS_TN_SATURATE> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // FrgTypeA and FrgTypeB are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x160x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _160, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<160, 32>;
+  using CLayout = GMMA::CLayout_64x160;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x176x32_S32U8S8_SS_TN.
+using SM90_64x176x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x176x32_S32U8S8_SS_TN;
+template <>
+struct MMA_Traits<SM90_64x176x32_S32U8S8_SS_TN> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // FrgTypeA and FrgTypeB are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x176x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _176, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<176, 32>;
+  using CLayout = GMMA::CLayout_64x176;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x176x32_S32U8S8_SS_TN_SATURATE.
+using SM90_64x176x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x176x32_S32U8S8_SS_TN_SATURATE;
+template <>
+struct MMA_Traits<SM90_64x176x32_S32U8S8_SS_TN_SATURATE> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // FrgTypeA and FrgTypeB are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x176x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _176, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<176, 32>;
+  using CLayout = GMMA::CLayout_64x176;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x208x32_S32U8S8_SS_TN.
+using SM90_64x208x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x208x32_S32U8S8_SS_TN;
+template <>
+struct MMA_Traits<SM90_64x208x32_S32U8S8_SS_TN> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // FrgTypeA and FrgTypeB are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x208x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _208, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<208, 32>;
+  using CLayout = GMMA::CLayout_64x208;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x208x32_S32U8S8_SS_TN_SATURATE.
+using SM90_64x208x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x208x32_S32U8S8_SS_TN_SATURATE;
+template <>
+struct MMA_Traits<SM90_64x208x32_S32U8S8_SS_TN_SATURATE> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // FrgTypeA and FrgTypeB are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x208x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _208, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<208, 32>;
+  using CLayout = GMMA::CLayout_64x208;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x224x32_S32U8S8_SS_TN.
+using SM90_64x224x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x224x32_S32U8S8_SS_TN;
+template <>
+struct MMA_Traits<SM90_64x224x32_S32U8S8_SS_TN> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // FrgTypeA and FrgTypeB are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x224x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _224, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<224, 32>;
+  using CLayout = GMMA::CLayout_64x224;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x224x32_S32U8S8_SS_TN_SATURATE.
+using SM90_64x224x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x224x32_S32U8S8_SS_TN_SATURATE;
+template <>
+struct MMA_Traits<SM90_64x224x32_S32U8S8_SS_TN_SATURATE> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // FrgTypeA and FrgTypeB are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x224x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _224, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<224, 32>;
+  using CLayout = GMMA::CLayout_64x224;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x240x32_S32U8S8_SS_TN.
+using SM90_64x240x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x240x32_S32U8S8_SS_TN;
+template <>
+struct MMA_Traits<SM90_64x240x32_S32U8S8_SS_TN> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // FrgTypeA and FrgTypeB are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x240x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _240, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<240, 32>;
+  using CLayout = GMMA::CLayout_64x240;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x240x32_S32U8S8_SS_TN_SATURATE.
+using SM90_64x240x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x240x32_S32U8S8_SS_TN_SATURATE;
+template <>
+struct MMA_Traits<SM90_64x240x32_S32U8S8_SS_TN_SATURATE> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // FrgTypeA and FrgTypeB are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x240x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _240, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<240, 32>;
+  using CLayout = GMMA::CLayout_64x240;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x24x32_S32U8S8_RS_TN.
+using SM90_64x24x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x24x32_S32U8S8_RS_TN;
+template <>
+struct MMA_Traits<SM90_64x24x32_S32U8S8_RS_TN> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // Only FrgTypeB is declared in this specialization: GMMA::smem_desc with Major::K.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x24x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _24, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<24, 32>;
+  using CLayout = GMMA::CLayout_64x24;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x24x32_S32U8S8_RS_TN_SATURATE.
+using SM90_64x24x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x24x32_S32U8S8_RS_TN_SATURATE;
+template <>
+struct MMA_Traits<SM90_64x24x32_S32U8S8_RS_TN_SATURATE> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // Only FrgTypeB is declared in this specialization: GMMA::smem_desc with Major::K.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x24x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _24, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<24, 32>;
+  using CLayout = GMMA::CLayout_64x24;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x48x32_S32U8S8_RS_TN.
+using SM90_64x48x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x48x32_S32U8S8_RS_TN;
+template <>
+struct MMA_Traits<SM90_64x48x32_S32U8S8_RS_TN> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // Only FrgTypeB is declared in this specialization: GMMA::smem_desc with Major::K.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x48x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _48, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<48, 32>;
+  using CLayout = GMMA::CLayout_64x48;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x48x32_S32U8S8_RS_TN_SATURATE.
+using SM90_64x48x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x48x32_S32U8S8_RS_TN_SATURATE;
+template <>
+struct MMA_Traits<SM90_64x48x32_S32U8S8_RS_TN_SATURATE> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // Only FrgTypeB is declared in this specialization: GMMA::smem_desc with Major::K.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x48x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _48, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<48, 32>;
+  using CLayout = GMMA::CLayout_64x48;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x80x32_S32U8S8_RS_TN.
+using SM90_64x80x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x80x32_S32U8S8_RS_TN;
+template <>
+struct MMA_Traits<SM90_64x80x32_S32U8S8_RS_TN> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // Only FrgTypeB is declared in this specialization: GMMA::smem_desc with Major::K.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x80x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _80, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<80, 32>;
+  using CLayout = GMMA::CLayout_64x80;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x80x32_S32U8S8_RS_TN_SATURATE.
+using SM90_64x80x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x80x32_S32U8S8_RS_TN_SATURATE;
+template <>
+struct MMA_Traits<SM90_64x80x32_S32U8S8_RS_TN_SATURATE> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // Only FrgTypeB is declared in this specialization: GMMA::smem_desc with Major::K.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x80x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _80, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<80, 32>;
+  using CLayout = GMMA::CLayout_64x80;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x112x32_S32U8S8_RS_TN.
+using SM90_64x112x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x112x32_S32U8S8_RS_TN;
+template <>
+struct MMA_Traits<SM90_64x112x32_S32U8S8_RS_TN> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // Only FrgTypeB is declared in this specialization: GMMA::smem_desc with Major::K.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x112x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _112, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<112, 32>;
+  using CLayout = GMMA::CLayout_64x112;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x112x32_S32U8S8_RS_TN_SATURATE.
+using SM90_64x112x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x112x32_S32U8S8_RS_TN_SATURATE;
+template <>
+struct MMA_Traits<SM90_64x112x32_S32U8S8_RS_TN_SATURATE> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // Only FrgTypeB is declared in this specialization: GMMA::smem_desc with Major::K.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x112x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _112, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<112, 32>;
+  using CLayout = GMMA::CLayout_64x112;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x144x32_S32U8S8_RS_TN.
+using SM90_64x144x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x144x32_S32U8S8_RS_TN;
+template <>
+struct MMA_Traits<SM90_64x144x32_S32U8S8_RS_TN> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // Only FrgTypeB is declared in this specialization: GMMA::smem_desc with Major::K.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x144x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _144, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<144, 32>;
+  using CLayout = GMMA::CLayout_64x144;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x144x32_S32U8S8_RS_TN_SATURATE.
+using SM90_64x144x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x144x32_S32U8S8_RS_TN_SATURATE;
+template <>
+struct MMA_Traits<SM90_64x144x32_S32U8S8_RS_TN_SATURATE> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // Only FrgTypeB is declared in this specialization: GMMA::smem_desc with Major::K.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x144x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _144, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<144, 32>;
+  using CLayout = GMMA::CLayout_64x144;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x160x32_S32U8S8_RS_TN.
+using SM90_64x160x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x160x32_S32U8S8_RS_TN;
+template <>
+struct MMA_Traits<SM90_64x160x32_S32U8S8_RS_TN> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // Only FrgTypeB is declared in this specialization: GMMA::smem_desc with Major::K.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x160x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _160, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<160, 32>;
+  using CLayout = GMMA::CLayout_64x160;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x160x32_S32U8S8_RS_TN_SATURATE.
+using SM90_64x160x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x160x32_S32U8S8_RS_TN_SATURATE;
+template <>
+struct MMA_Traits<SM90_64x160x32_S32U8S8_RS_TN_SATURATE> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // Only FrgTypeB is declared in this specialization: GMMA::smem_desc with Major::K.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x160x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _160, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<160, 32>;
+  using CLayout = GMMA::CLayout_64x160;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x176x32_S32U8S8_RS_TN.
+using SM90_64x176x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x176x32_S32U8S8_RS_TN;
+template <>
+struct MMA_Traits<SM90_64x176x32_S32U8S8_RS_TN> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // Only FrgTypeB is declared in this specialization: GMMA::smem_desc with Major::K.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x176x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _176, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<176, 32>;
+  using CLayout = GMMA::CLayout_64x176;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x176x32_S32U8S8_RS_TN_SATURATE.
+using SM90_64x176x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x176x32_S32U8S8_RS_TN_SATURATE;
+template <>
+struct MMA_Traits<SM90_64x176x32_S32U8S8_RS_TN_SATURATE> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // Only FrgTypeB is declared in this specialization: GMMA::smem_desc with Major::K.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x176x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _176, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<176, 32>;
+  using CLayout = GMMA::CLayout_64x176;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x208x32_S32U8S8_RS_TN.
+using SM90_64x208x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x208x32_S32U8S8_RS_TN;
+template <>
+struct MMA_Traits<SM90_64x208x32_S32U8S8_RS_TN> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // Only FrgTypeB is declared in this specialization: GMMA::smem_desc with Major::K.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x208x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _208, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<208, 32>;
+  using CLayout = GMMA::CLayout_64x208;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x208x32_S32U8S8_RS_TN_SATURATE.
+using SM90_64x208x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x208x32_S32U8S8_RS_TN_SATURATE;
+template <>
+struct MMA_Traits<SM90_64x208x32_S32U8S8_RS_TN_SATURATE> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // Only FrgTypeB is declared in this specialization: GMMA::smem_desc with Major::K.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x208x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _208, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<208, 32>;
+  using CLayout = GMMA::CLayout_64x208;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x224x32_S32U8S8_RS_TN.
+using SM90_64x224x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x224x32_S32U8S8_RS_TN;
+template <>
+struct MMA_Traits<SM90_64x224x32_S32U8S8_RS_TN> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // Only FrgTypeB is declared in this specialization: GMMA::smem_desc with Major::K.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x224x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _224, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<224, 32>;
+  using CLayout = GMMA::CLayout_64x224;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x224x32_S32U8S8_RS_TN_SATURATE.
+using SM90_64x224x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x224x32_S32U8S8_RS_TN_SATURATE;
+template <>
+struct MMA_Traits<SM90_64x224x32_S32U8S8_RS_TN_SATURATE> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // Only FrgTypeB is declared in this specialization: GMMA::smem_desc with Major::K.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x224x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _224, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<224, 32>;
+  using CLayout = GMMA::CLayout_64x224;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x240x32_S32U8S8_RS_TN.
+using SM90_64x240x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x240x32_S32U8S8_RS_TN;
+template <>
+struct MMA_Traits<SM90_64x240x32_S32U8S8_RS_TN> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // Only FrgTypeB is declared in this specialization: GMMA::smem_desc with Major::K.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x240x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _240, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<240, 32>;
+  using CLayout = GMMA::CLayout_64x240;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x240x32_S32U8S8_RS_TN_SATURATE.
+using SM90_64x240x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x240x32_S32U8S8_RS_TN_SATURATE;
+template <>
+struct MMA_Traits<SM90_64x240x32_S32U8S8_RS_TN_SATURATE> {
+  // Value types: A = uint8_t, B = int8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // Only FrgTypeB is declared in this specialization: GMMA::smem_desc with Major::K.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x240x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _240, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<240, 32>;
+  using CLayout = GMMA::CLayout_64x240;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Traits for the SM90 GMMA op MMA_64x24x32_S32U8U8_SS_TN.
+using SM90_64x24x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x24x32_S32U8U8_SS_TN;
+template <>
+struct MMA_Traits<SM90_64x24x32_S32U8U8_SS_TN> {
+  // Value types: A = B = uint8_t, C = D = int32_t.
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  // FrgTypeA and FrgTypeB are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x24x32, thread-ID layout, and the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _24, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<24, 32>;
+  using CLayout = GMMA::CLayout_64x24;
+  // Non-static member; its default value is GMMA::ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x24x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x24x32_S32U8U8_SS_TN_SATURATE;
+// Traits for the _SATURATE 64x24x32 S32U8U8 SS atom; members match the non-_SATURATE traits.
+template <>
+struct MMA_Traits<SM90_64x24x32_S32U8U8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // A and B fragments are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=24, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_24,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 24, 32>;
+  using CLayout = GMMA::CLayout_64x24;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x48x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x48x32_S32U8U8_SS_TN;
+// Traits for the 64x48x32 S32U8U8 SS atom: uint8_t A/B values, int32_t C/D values.
+template <>
+struct MMA_Traits<SM90_64x48x32_S32U8U8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // A and B fragments are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=48, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_48,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 48, 32>;
+  using CLayout = GMMA::CLayout_64x48;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x48x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x48x32_S32U8U8_SS_TN_SATURATE;
+// Traits for the _SATURATE 64x48x32 S32U8U8 SS atom; members match the non-_SATURATE traits.
+template <>
+struct MMA_Traits<SM90_64x48x32_S32U8U8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // A and B fragments are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=48, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_48,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 48, 32>;
+  using CLayout = GMMA::CLayout_64x48;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x80x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x80x32_S32U8U8_SS_TN;
+// Traits for the 64x80x32 S32U8U8 SS atom: uint8_t A/B values, int32_t C/D values.
+template <>
+struct MMA_Traits<SM90_64x80x32_S32U8U8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // A and B fragments are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=80, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_80,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 80, 32>;
+  using CLayout = GMMA::CLayout_64x80;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x80x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x80x32_S32U8U8_SS_TN_SATURATE;
+// Traits for the _SATURATE 64x80x32 S32U8U8 SS atom; members match the non-_SATURATE traits.
+template <>
+struct MMA_Traits<SM90_64x80x32_S32U8U8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // A and B fragments are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=80, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_80,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 80, 32>;
+  using CLayout = GMMA::CLayout_64x80;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x112x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x112x32_S32U8U8_SS_TN;
+// Traits for the 64x112x32 S32U8U8 SS atom: uint8_t A/B values, int32_t C/D values.
+template <>
+struct MMA_Traits<SM90_64x112x32_S32U8U8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // A and B fragments are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=112, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_112,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<112, 32>;
+  using CLayout = GMMA::CLayout_64x112;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x112x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x112x32_S32U8U8_SS_TN_SATURATE;
+// Traits for the _SATURATE 64x112x32 S32U8U8 SS atom; members match the non-_SATURATE traits.
+template <>
+struct MMA_Traits<SM90_64x112x32_S32U8U8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // A and B fragments are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=112, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_112,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<112, 32>;
+  using CLayout = GMMA::CLayout_64x112;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x144x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x144x32_S32U8U8_SS_TN;
+// Traits for the 64x144x32 S32U8U8 SS atom: uint8_t A/B values, int32_t C/D values.
+template <>
+struct MMA_Traits<SM90_64x144x32_S32U8U8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // A and B fragments are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=144, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_144,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<144, 32>;
+  using CLayout = GMMA::CLayout_64x144;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x144x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x144x32_S32U8U8_SS_TN_SATURATE;
+// Traits for the _SATURATE 64x144x32 S32U8U8 SS atom; members match the non-_SATURATE traits.
+template <>
+struct MMA_Traits<SM90_64x144x32_S32U8U8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // A and B fragments are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=144, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_144,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<144, 32>;
+  using CLayout = GMMA::CLayout_64x144;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x160x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x160x32_S32U8U8_SS_TN;
+// Traits for the 64x160x32 S32U8U8 SS atom: uint8_t A/B values, int32_t C/D values.
+template <>
+struct MMA_Traits<SM90_64x160x32_S32U8U8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // A and B fragments are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=160, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_160,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<160, 32>;
+  using CLayout = GMMA::CLayout_64x160;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x160x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x160x32_S32U8U8_SS_TN_SATURATE;
+// Traits for the _SATURATE 64x160x32 S32U8U8 SS atom; members match the non-_SATURATE traits.
+template <>
+struct MMA_Traits<SM90_64x160x32_S32U8U8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // A and B fragments are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=160, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_160,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<160, 32>;
+  using CLayout = GMMA::CLayout_64x160;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x176x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x176x32_S32U8U8_SS_TN;
+// Traits for the 64x176x32 S32U8U8 SS atom: uint8_t A/B values, int32_t C/D values.
+template <>
+struct MMA_Traits<SM90_64x176x32_S32U8U8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // A and B fragments are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=176, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_176,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<176, 32>;
+  using CLayout = GMMA::CLayout_64x176;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x176x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x176x32_S32U8U8_SS_TN_SATURATE;
+// Traits for the _SATURATE 64x176x32 S32U8U8 SS atom; members match the non-_SATURATE traits.
+template <>
+struct MMA_Traits<SM90_64x176x32_S32U8U8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // A and B fragments are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=176, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_176,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<176, 32>;
+  using CLayout = GMMA::CLayout_64x176;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x208x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x208x32_S32U8U8_SS_TN;
+// Traits for the 64x208x32 S32U8U8 SS atom: uint8_t A/B values, int32_t C/D values.
+template <>
+struct MMA_Traits<SM90_64x208x32_S32U8U8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // A and B fragments are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=208, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_208,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<208, 32>;
+  using CLayout = GMMA::CLayout_64x208;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x208x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x208x32_S32U8U8_SS_TN_SATURATE;
+// Traits for the _SATURATE 64x208x32 S32U8U8 SS atom; members match the non-_SATURATE traits.
+template <>
+struct MMA_Traits<SM90_64x208x32_S32U8U8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // A and B fragments are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=208, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_208,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<208, 32>;
+  using CLayout = GMMA::CLayout_64x208;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x224x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x224x32_S32U8U8_SS_TN;
+// Traits for the 64x224x32 S32U8U8 SS atom: uint8_t A/B values, int32_t C/D values.
+template <>
+struct MMA_Traits<SM90_64x224x32_S32U8U8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // A and B fragments are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=224, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_224,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<224, 32>;
+  using CLayout = GMMA::CLayout_64x224;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x224x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x224x32_S32U8U8_SS_TN_SATURATE;
+// Traits for the _SATURATE 64x224x32 S32U8U8 SS atom; members match the non-_SATURATE traits.
+template <>
+struct MMA_Traits<SM90_64x224x32_S32U8U8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // A and B fragments are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=224, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_224,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<224, 32>;
+  using CLayout = GMMA::CLayout_64x224;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x240x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x240x32_S32U8U8_SS_TN;
+// Traits for the 64x240x32 S32U8U8 SS atom: uint8_t A/B values, int32_t C/D values.
+template <>
+struct MMA_Traits<SM90_64x240x32_S32U8U8_SS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // A and B fragments are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=240, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_240,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<240, 32>;
+  using CLayout = GMMA::CLayout_64x240;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x240x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x240x32_S32U8U8_SS_TN_SATURATE;
+// Traits for the _SATURATE 64x240x32 S32U8U8 SS atom; members match the non-_SATURATE traits.
+template <>
+struct MMA_Traits<SM90_64x240x32_S32U8U8_SS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // A and B fragments are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=240, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_240,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<240, 32>;
+  using CLayout = GMMA::CLayout_64x240;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x24x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x24x32_S32U8U8_RS_TN; 
+// Traits for the 64x24x32 S32U8U8 RS atom: uint8_t A/B values, int32_t C/D values.
+template <>
+struct MMA_Traits<SM90_64x24x32_S32U8U8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Only B gets a smem_desc fragment type (Major::K); no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=24, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_24,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 24, 32>;
+  using CLayout = GMMA::CLayout_64x24;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x24x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x24x32_S32U8U8_RS_TN_SATURATE; 
+// Traits for the _SATURATE 64x24x32 S32U8U8 RS atom; members match the non-_SATURATE traits.
+template <>
+struct MMA_Traits<SM90_64x24x32_S32U8U8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Only B gets a smem_desc fragment type (Major::K); no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=24, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_24,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 24, 32>;
+  using CLayout = GMMA::CLayout_64x24;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x48x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x48x32_S32U8U8_RS_TN; 
+// Traits for the 64x48x32 S32U8U8 RS atom: uint8_t A/B values, int32_t C/D values.
+template <>
+struct MMA_Traits<SM90_64x48x32_S32U8U8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Only B gets a smem_desc fragment type (Major::K); no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=48, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_48,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 48, 32>;
+  using CLayout = GMMA::CLayout_64x48;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x48x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x48x32_S32U8U8_RS_TN_SATURATE; 
+// Traits for the _SATURATE 64x48x32 S32U8U8 RS atom; members match the non-_SATURATE traits.
+template <>
+struct MMA_Traits<SM90_64x48x32_S32U8U8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Only B gets a smem_desc fragment type (Major::K); no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=48, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_48,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 48, 32>;
+  using CLayout = GMMA::CLayout_64x48;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x80x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x80x32_S32U8U8_RS_TN; 
+// Traits for the 64x80x32 S32U8U8 RS atom: uint8_t A/B values, int32_t C/D values.
+template <>
+struct MMA_Traits<SM90_64x80x32_S32U8U8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Only B gets a smem_desc fragment type (Major::K); no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=80, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_80,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 80, 32>;
+  using CLayout = GMMA::CLayout_64x80;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x80x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x80x32_S32U8U8_RS_TN_SATURATE; 
+// Traits for the _SATURATE 64x80x32 S32U8U8 RS atom; members match the non-_SATURATE traits.
+template <>
+struct MMA_Traits<SM90_64x80x32_S32U8U8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Only B gets a smem_desc fragment type (Major::K); no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=80, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_80,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 80, 32>;
+  using CLayout = GMMA::CLayout_64x80;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x112x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x112x32_S32U8U8_RS_TN; 
+// Traits for the 64x112x32 S32U8U8 RS atom: uint8_t A/B values, int32_t C/D values.
+template <>
+struct MMA_Traits<SM90_64x112x32_S32U8U8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Only B gets a smem_desc fragment type (Major::K); no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=112, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_112,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<112, 32>;
+  using CLayout = GMMA::CLayout_64x112;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x112x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x112x32_S32U8U8_RS_TN_SATURATE; 
+// Traits for the _SATURATE 64x112x32 S32U8U8 RS atom; members match the non-_SATURATE traits.
+template <>
+struct MMA_Traits<SM90_64x112x32_S32U8U8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Only B gets a smem_desc fragment type (Major::K); no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=112, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_112,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<112, 32>;
+  using CLayout = GMMA::CLayout_64x112;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x144x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x144x32_S32U8U8_RS_TN; 
+// Traits for the 64x144x32 S32U8U8 RS atom: uint8_t A/B values, int32_t C/D values.
+template <>
+struct MMA_Traits<SM90_64x144x32_S32U8U8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Only B gets a smem_desc fragment type (Major::K); no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=144, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_144,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<144, 32>;
+  using CLayout = GMMA::CLayout_64x144;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x144x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x144x32_S32U8U8_RS_TN_SATURATE; 
+// Traits for the _SATURATE 64x144x32 S32U8U8 RS atom; members match the non-_SATURATE traits.
+template <>
+struct MMA_Traits<SM90_64x144x32_S32U8U8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Only B gets a smem_desc fragment type (Major::K); no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=144, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_144,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<144, 32>;
+  using CLayout = GMMA::CLayout_64x144;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x160x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x160x32_S32U8U8_RS_TN; 
+// Traits for the 64x160x32 S32U8U8 RS atom: uint8_t A/B values, int32_t C/D values.
+template <>
+struct MMA_Traits<SM90_64x160x32_S32U8U8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Only B gets a smem_desc fragment type (Major::K); no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=160, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_160,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<160, 32>;
+  using CLayout = GMMA::CLayout_64x160;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x160x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x160x32_S32U8U8_RS_TN_SATURATE; 
+// Traits for the _SATURATE 64x160x32 S32U8U8 RS atom; members match the non-_SATURATE traits.
+template <>
+struct MMA_Traits<SM90_64x160x32_S32U8U8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Only B gets a smem_desc fragment type (Major::K); no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=160, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_160,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<160, 32>;
+  using CLayout = GMMA::CLayout_64x160;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x176x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x176x32_S32U8U8_RS_TN; 
+// Traits for the 64x176x32 S32U8U8 RS atom: uint8_t A/B values, int32_t C/D values.
+template <>
+struct MMA_Traits<SM90_64x176x32_S32U8U8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Only B gets a smem_desc fragment type (Major::K); no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=176, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_176,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<176, 32>;
+  using CLayout = GMMA::CLayout_64x176;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x176x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x176x32_S32U8U8_RS_TN_SATURATE; 
+// Traits for the _SATURATE 64x176x32 S32U8U8 RS atom; members match the non-_SATURATE traits.
+template <>
+struct MMA_Traits<SM90_64x176x32_S32U8U8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Only B gets a smem_desc fragment type (Major::K); no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=176, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_176,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<176, 32>;
+  using CLayout = GMMA::CLayout_64x176;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x208x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x208x32_S32U8U8_RS_TN; 
+// Traits for the 64x208x32 S32U8U8 RS atom: uint8_t A/B values, int32_t C/D values.
+template <>
+struct MMA_Traits<SM90_64x208x32_S32U8U8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Only B gets a smem_desc fragment type (Major::K); no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=208, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_208,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<208, 32>;
+  using CLayout = GMMA::CLayout_64x208;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x208x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x208x32_S32U8U8_RS_TN_SATURATE; 
+// Traits for the _SATURATE 64x208x32 S32U8U8 RS atom; members match the non-_SATURATE traits.
+template <>
+struct MMA_Traits<SM90_64x208x32_S32U8U8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Only B gets a smem_desc fragment type (Major::K); no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=208, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_208,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<208, 32>;
+  using CLayout = GMMA::CLayout_64x208;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x224x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x224x32_S32U8U8_RS_TN; 
+// Traits for the 64x224x32 S32U8U8 RS atom: uint8_t A/B values, int32_t C/D values.
+template <>
+struct MMA_Traits<SM90_64x224x32_S32U8U8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Only B gets a smem_desc fragment type (Major::K); no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=224, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_224,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<224, 32>;
+  using CLayout = GMMA::CLayout_64x224;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x224x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x224x32_S32U8U8_RS_TN_SATURATE; 
+// Traits for the _SATURATE 64x224x32 S32U8U8 RS atom; members match the non-_SATURATE traits.
+template <>
+struct MMA_Traits<SM90_64x224x32_S32U8U8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Only B gets a smem_desc fragment type (Major::K); no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=224, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_224,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<224, 32>;
+  using CLayout = GMMA::CLayout_64x224;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x240x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x240x32_S32U8U8_RS_TN; 
+// Traits for the 64x240x32 S32U8U8 RS atom: uint8_t A/B values, int32_t C/D values.
+template <>
+struct MMA_Traits<SM90_64x240x32_S32U8U8_RS_TN>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Only B gets a smem_desc fragment type (Major::K); no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=240, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_240,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<240, 32>;
+  using CLayout = GMMA::CLayout_64x240;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+using SM90_64x240x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x240x32_S32U8U8_RS_TN_SATURATE; 
+// Traits for the _SATURATE 64x240x32 S32U8U8 RS atom; members match the non-_SATURATE traits.
+template <>
+struct MMA_Traits<SM90_64x240x32_S32U8U8_RS_TN_SATURATE>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = uint8_t;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  // Only B gets a smem_desc fragment type (Major::K); no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=240, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_240,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<240, 32>;
+  using CLayout = GMMA::CLayout_64x240;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x24x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x24x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
+// Traits for every scaleA/scaleB instantiation of the alias above: float_e4m3_t A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x24x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // A and B fragments are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=24, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_24,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 24, 32>;
+  using CLayout = GMMA::CLayout_64x24;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x24x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x24x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
+// Traits for every scaleA/scaleB instantiation of the alias above: float_e4m3_t A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x24x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // Only B gets a smem_desc fragment type (Major::K); no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=24, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_24,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 24, 32>;
+  using CLayout = GMMA::CLayout_64x24;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x24x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x24x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
+// Traits for every scaleA/scaleB instantiation of the alias above: float_e4m3_t A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x24x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=24, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_24,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 24, 32>;
+  using CLayout = GMMA::CLayout_64x24;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x24x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x24x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
+// Traits for every scaleA/scaleB instantiation of the alias above: float_e4m3_t A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x24x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // Only B gets a smem_desc fragment type (Major::K); no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=24, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_24,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 24, 32>;
+  using CLayout = GMMA::CLayout_64x24;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x40x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x40x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
+// Traits for every scaleA/scaleB instantiation of the alias above: float_e4m3_t A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x40x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // A and B fragments are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile extents M=64, N=40, K=32; the A/B/C layouts below carry the same extents.
+  using Shape_MNK = Shape<_64,_40,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 40, 32>;
+  using CLayout = GMMA::CLayout_64x40;
+  // Per-instance ScaleOut value; defaults to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SM90_64x40x32_F16E4M3E4M3_RS_TN: alias of the SM90::GMMA op and its MMA_Traits (M=64, N=40, K=32).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x40x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x40x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x40x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;  // D and C share one element type
+  using ValTypeA = float_e4m3_t;  // A and B share one element type
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_40,_32>;
+  using ThrID   = Layout<_128>;  // 128 thread ids; presumably one warpgroup -- TODO confirm
+  using ALayout = GMMA::ALayout_64x32;  // NOTE(review): presumably the register-side A layout -- confirm
+  using BLayout = GMMA::ABLayout< 40, 32>;
+  using CLayout = GMMA::CLayout_64x40;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // mutable per-instance ScaleOut, initially One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SM90_64x40x32_F32E4M3E4M3_SS_TN: alias of the SM90::GMMA op and its MMA_Traits (M=64, N=40, K=32).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x40x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x40x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x40x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;  // D and C share one element type
+  using ValTypeA = float_e4m3_t;  // A and B share one element type
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B both declare K-major smem_desc fragments
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_40,_32>;
+  using ThrID   = Layout<_128>;  // 128 thread ids; presumably one warpgroup -- TODO confirm
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 40, 32>;
+  using CLayout = GMMA::CLayout_64x40;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // mutable per-instance ScaleOut, initially One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SM90_64x40x32_F32E4M3E4M3_RS_TN: alias of the SM90::GMMA op and its MMA_Traits (M=64, N=40, K=32).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x40x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x40x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x40x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;  // D and C share one element type
+  using ValTypeA = float_e4m3_t;  // A and B share one element type
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_40,_32>;
+  using ThrID   = Layout<_128>;  // 128 thread ids; presumably one warpgroup -- TODO confirm
+  using ALayout = GMMA::ALayout_64x32;  // NOTE(review): presumably the register-side A layout -- confirm
+  using BLayout = GMMA::ABLayout< 40, 32>;
+  using CLayout = GMMA::CLayout_64x40;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // mutable per-instance ScaleOut, initially One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SM90_64x48x32_F16E4M3E4M3_SS_TN: alias of the SM90::GMMA op and its MMA_Traits (M=64, N=48, K=32).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x48x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x48x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x48x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;  // D and C share one element type
+  using ValTypeA = float_e4m3_t;  // A and B share one element type
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B both declare K-major smem_desc fragments
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_48,_32>;
+  using ThrID   = Layout<_128>;  // 128 thread ids; presumably one warpgroup -- TODO confirm
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 48, 32>;
+  using CLayout = GMMA::CLayout_64x48;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // mutable per-instance ScaleOut, initially One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SM90_64x48x32_F16E4M3E4M3_RS_TN: alias of the SM90::GMMA op and its MMA_Traits (M=64, N=48, K=32).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x48x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x48x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x48x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;  // D and C share one element type
+  using ValTypeA = float_e4m3_t;  // A and B share one element type
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_48,_32>;
+  using ThrID   = Layout<_128>;  // 128 thread ids; presumably one warpgroup -- TODO confirm
+  using ALayout = GMMA::ALayout_64x32;  // NOTE(review): presumably the register-side A layout -- confirm
+  using BLayout = GMMA::ABLayout< 48, 32>;
+  using CLayout = GMMA::CLayout_64x48;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // mutable per-instance ScaleOut, initially One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SM90_64x48x32_F32E4M3E4M3_SS_TN: alias of the SM90::GMMA op and its MMA_Traits (M=64, N=48, K=32).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x48x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x48x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x48x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;  // D and C share one element type
+  using ValTypeA = float_e4m3_t;  // A and B share one element type
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B both declare K-major smem_desc fragments
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_48,_32>;
+  using ThrID   = Layout<_128>;  // 128 thread ids; presumably one warpgroup -- TODO confirm
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 48, 32>;
+  using CLayout = GMMA::CLayout_64x48;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // mutable per-instance ScaleOut, initially One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SM90_64x48x32_F32E4M3E4M3_RS_TN: alias of the SM90::GMMA op and its MMA_Traits (M=64, N=48, K=32).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x48x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x48x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x48x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;  // D and C share one element type
+  using ValTypeA = float_e4m3_t;  // A and B share one element type
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_48,_32>;
+  using ThrID   = Layout<_128>;  // 128 thread ids; presumably one warpgroup -- TODO confirm
+  using ALayout = GMMA::ALayout_64x32;  // NOTE(review): presumably the register-side A layout -- confirm
+  using BLayout = GMMA::ABLayout< 48, 32>;
+  using CLayout = GMMA::CLayout_64x48;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // mutable per-instance ScaleOut, initially One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SM90_64x56x32_F16E4M3E4M3_SS_TN: alias of the SM90::GMMA op and its MMA_Traits (M=64, N=56, K=32).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x56x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x56x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x56x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;  // D and C share one element type
+  using ValTypeA = float_e4m3_t;  // A and B share one element type
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B both declare K-major smem_desc fragments
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_56,_32>;
+  using ThrID   = Layout<_128>;  // 128 thread ids; presumably one warpgroup -- TODO confirm
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 56, 32>;
+  using CLayout = GMMA::CLayout_64x56;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // mutable per-instance ScaleOut, initially One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SM90_64x56x32_F16E4M3E4M3_RS_TN: alias of the SM90::GMMA op and its MMA_Traits (M=64, N=56, K=32).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x56x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x56x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x56x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;  // D and C share one element type
+  using ValTypeA = float_e4m3_t;  // A and B share one element type
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_56,_32>;
+  using ThrID   = Layout<_128>;  // 128 thread ids; presumably one warpgroup -- TODO confirm
+  using ALayout = GMMA::ALayout_64x32;  // NOTE(review): presumably the register-side A layout -- confirm
+  using BLayout = GMMA::ABLayout< 56, 32>;
+  using CLayout = GMMA::CLayout_64x56;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // mutable per-instance ScaleOut, initially One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SM90_64x56x32_F32E4M3E4M3_SS_TN: alias of the SM90::GMMA op and its MMA_Traits (M=64, N=56, K=32).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x56x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x56x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x56x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;  // D and C share one element type
+  using ValTypeA = float_e4m3_t;  // A and B share one element type
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B both declare K-major smem_desc fragments
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_56,_32>;
+  using ThrID   = Layout<_128>;  // 128 thread ids; presumably one warpgroup -- TODO confirm
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 56, 32>;
+  using CLayout = GMMA::CLayout_64x56;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // mutable per-instance ScaleOut, initially One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SM90_64x56x32_F32E4M3E4M3_RS_TN: alias of the SM90::GMMA op and its MMA_Traits (M=64, N=56, K=32).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x56x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x56x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x56x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;  // D and C share one element type
+  using ValTypeA = float_e4m3_t;  // A and B share one element type
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_56,_32>;
+  using ThrID   = Layout<_128>;  // 128 thread ids; presumably one warpgroup -- TODO confirm
+  using ALayout = GMMA::ALayout_64x32;  // NOTE(review): presumably the register-side A layout -- confirm
+  using BLayout = GMMA::ABLayout< 56, 32>;
+  using CLayout = GMMA::CLayout_64x56;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // mutable per-instance ScaleOut, initially One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SM90_64x72x32_F16E4M3E4M3_SS_TN: alias of the SM90::GMMA op and its MMA_Traits (M=64, N=72, K=32).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x72x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x72x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x72x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;  // D and C share one element type
+  using ValTypeA = float_e4m3_t;  // A and B share one element type
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B both declare K-major smem_desc fragments
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_72,_32>;
+  using ThrID   = Layout<_128>;  // 128 thread ids; presumably one warpgroup -- TODO confirm
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 72, 32>;
+  using CLayout = GMMA::CLayout_64x72;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // mutable per-instance ScaleOut, initially One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SM90_64x72x32_F16E4M3E4M3_RS_TN: alias of the SM90::GMMA op and its MMA_Traits (M=64, N=72, K=32).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x72x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x72x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x72x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;  // D and C share one element type
+  using ValTypeA = float_e4m3_t;  // A and B share one element type
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_72,_32>;
+  using ThrID   = Layout<_128>;  // 128 thread ids; presumably one warpgroup -- TODO confirm
+  using ALayout = GMMA::ALayout_64x32;  // NOTE(review): presumably the register-side A layout -- confirm
+  using BLayout = GMMA::ABLayout< 72, 32>;
+  using CLayout = GMMA::CLayout_64x72;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // mutable per-instance ScaleOut, initially One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SM90_64x72x32_F32E4M3E4M3_SS_TN: alias of the SM90::GMMA op and its MMA_Traits (M=64, N=72, K=32).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x72x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x72x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x72x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;  // D and C share one element type
+  using ValTypeA = float_e4m3_t;  // A and B share one element type
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B both declare K-major smem_desc fragments
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_72,_32>;
+  using ThrID   = Layout<_128>;  // 128 thread ids; presumably one warpgroup -- TODO confirm
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 72, 32>;
+  using CLayout = GMMA::CLayout_64x72;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // mutable per-instance ScaleOut, initially One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SM90_64x72x32_F32E4M3E4M3_RS_TN: alias of the SM90::GMMA op and its MMA_Traits (M=64, N=72, K=32).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x72x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x72x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x72x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;  // D and C share one element type
+  using ValTypeA = float_e4m3_t;  // A and B share one element type
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_72,_32>;
+  using ThrID   = Layout<_128>;  // 128 thread ids; presumably one warpgroup -- TODO confirm
+  using ALayout = GMMA::ALayout_64x32;  // NOTE(review): presumably the register-side A layout -- confirm
+  using BLayout = GMMA::ABLayout< 72, 32>;
+  using CLayout = GMMA::CLayout_64x72;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // mutable per-instance ScaleOut, initially One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SM90_64x80x32_F16E4M3E4M3_SS_TN: alias of the SM90::GMMA op and its MMA_Traits (M=64, N=80, K=32).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x80x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x80x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x80x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;  // D and C share one element type
+  using ValTypeA = float_e4m3_t;  // A and B share one element type
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B both declare K-major smem_desc fragments
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_80,_32>;
+  using ThrID   = Layout<_128>;  // 128 thread ids; presumably one warpgroup -- TODO confirm
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 80, 32>;
+  using CLayout = GMMA::CLayout_64x80;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // mutable per-instance ScaleOut, initially One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SM90_64x80x32_F16E4M3E4M3_RS_TN: alias of the SM90::GMMA op and its MMA_Traits (M=64, N=80, K=32).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x80x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x80x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x80x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;  // D and C share one element type
+  using ValTypeA = float_e4m3_t;  // A and B share one element type
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_80,_32>;
+  using ThrID   = Layout<_128>;  // 128 thread ids; presumably one warpgroup -- TODO confirm
+  using ALayout = GMMA::ALayout_64x32;  // NOTE(review): presumably the register-side A layout -- confirm
+  using BLayout = GMMA::ABLayout< 80, 32>;
+  using CLayout = GMMA::CLayout_64x80;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // mutable per-instance ScaleOut, initially One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SM90_64x80x32_F32E4M3E4M3_SS_TN: alias of the SM90::GMMA op and its MMA_Traits (M=64, N=80, K=32).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x80x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x80x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x80x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;  // D and C share one element type
+  using ValTypeA = float_e4m3_t;  // A and B share one element type
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B both declare K-major smem_desc fragments
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_80,_32>;
+  using ThrID   = Layout<_128>;  // 128 thread ids; presumably one warpgroup -- TODO confirm
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 80, 32>;
+  using CLayout = GMMA::CLayout_64x80;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // mutable per-instance ScaleOut, initially One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SM90_64x80x32_F32E4M3E4M3_RS_TN: alias of the SM90::GMMA op and its MMA_Traits (M=64, N=80, K=32).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x80x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x80x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x80x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;  // D and C share one element type
+  using ValTypeA = float_e4m3_t;  // A and B share one element type
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_80,_32>;
+  using ThrID   = Layout<_128>;  // 128 thread ids; presumably one warpgroup -- TODO confirm
+  using ALayout = GMMA::ALayout_64x32;  // NOTE(review): presumably the register-side A layout -- confirm
+  using BLayout = GMMA::ABLayout< 80, 32>;
+  using CLayout = GMMA::CLayout_64x80;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // mutable per-instance ScaleOut, initially One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SM90_64x88x32_F16E4M3E4M3_SS_TN: alias of the SM90::GMMA op and its MMA_Traits (M=64, N=88, K=32).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x88x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x88x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x88x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;  // D and C share one element type
+  using ValTypeA = float_e4m3_t;  // A and B share one element type
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B both declare K-major smem_desc fragments
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_88,_32>;
+  using ThrID   = Layout<_128>;  // 128 thread ids; presumably one warpgroup -- TODO confirm
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 88, 32>;
+  using CLayout = GMMA::CLayout_64x88;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // mutable per-instance ScaleOut, initially One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SM90_64x88x32_F16E4M3E4M3_RS_TN: alias of the SM90::GMMA op and its MMA_Traits (M=64, N=88, K=32).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x88x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x88x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x88x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;  // D and C share one element type
+  using ValTypeA = float_e4m3_t;  // A and B share one element type
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_88,_32>;
+  using ThrID   = Layout<_128>;  // 128 thread ids; presumably one warpgroup -- TODO confirm
+  using ALayout = GMMA::ALayout_64x32;  // NOTE(review): presumably the register-side A layout -- confirm
+  using BLayout = GMMA::ABLayout< 88, 32>;
+  using CLayout = GMMA::CLayout_64x88;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // mutable per-instance ScaleOut, initially One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SM90_64x88x32_F32E4M3E4M3_SS_TN: alias of the SM90::GMMA op and its MMA_Traits (M=64, N=88, K=32).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x88x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x88x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x88x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;  // D and C share one element type
+  using ValTypeA = float_e4m3_t;  // A and B share one element type
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B both declare K-major smem_desc fragments
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_88,_32>;
+  using ThrID   = Layout<_128>;  // 128 thread ids; presumably one warpgroup -- TODO confirm
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 88, 32>;
+  using CLayout = GMMA::CLayout_64x88;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // mutable per-instance ScaleOut, initially One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SM90_64x88x32_F32E4M3E4M3_RS_TN: alias of the SM90::GMMA op and its MMA_Traits (M=64, N=88, K=32).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x88x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x88x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x88x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;  // D and C share one element type
+  using ValTypeA = float_e4m3_t;  // A and B share one element type
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_88,_32>;
+  using ThrID   = Layout<_128>;  // 128 thread ids; presumably one warpgroup -- TODO confirm
+  using ALayout = GMMA::ALayout_64x32;  // NOTE(review): presumably the register-side A layout -- confirm
+  using BLayout = GMMA::ABLayout< 88, 32>;
+  using CLayout = GMMA::CLayout_64x88;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // mutable per-instance ScaleOut, initially One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SM90_64x104x32_F16E4M3E4M3_SS_TN: alias of the SM90::GMMA op and its MMA_Traits (M=64, N=104, K=32).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x104x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x104x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x104x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;  // D and C share one element type
+  using ValTypeA = float_e4m3_t;  // A and B share one element type
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B both declare K-major smem_desc fragments
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_104,_32>;
+  using ThrID   = Layout<_128>;  // 128 thread ids; presumably one warpgroup -- TODO confirm
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<104, 32>;
+  using CLayout = GMMA::CLayout_64x104;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // mutable per-instance ScaleOut, initially One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SM90_64x104x32_F16E4M3E4M3_RS_TN: alias of the SM90::GMMA op and its MMA_Traits (M=64, N=104, K=32).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x104x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x104x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x104x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;  // D and C share one element type
+  using ValTypeA = float_e4m3_t;  // A and B share one element type
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_104,_32>;
+  using ThrID   = Layout<_128>;  // 128 thread ids; presumably one warpgroup -- TODO confirm
+  using ALayout = GMMA::ALayout_64x32;  // NOTE(review): presumably the register-side A layout -- confirm
+  using BLayout = GMMA::ABLayout<104, 32>;
+  using CLayout = GMMA::CLayout_64x104;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // mutable per-instance ScaleOut, initially One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SM90_64x104x32_F32E4M3E4M3_SS_TN: alias of the SM90::GMMA op and its MMA_Traits (M=64, N=104, K=32).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x104x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x104x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x104x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;  // D and C share one element type
+  using ValTypeA = float_e4m3_t;  // A and B share one element type
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B both declare K-major smem_desc fragments
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_104,_32>;
+  using ThrID   = Layout<_128>;  // 128 thread ids; presumably one warpgroup -- TODO confirm
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<104, 32>;
+  using CLayout = GMMA::CLayout_64x104;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // mutable per-instance ScaleOut, initially One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SM90_64x104x32_F32E4M3E4M3_RS_TN: alias of the SM90::GMMA op and its MMA_Traits (M=64, N=104, K=32).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x104x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x104x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x104x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;  // D and C share one element type
+  using ValTypeA = float_e4m3_t;  // A and B share one element type
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_104,_32>;
+  using ThrID   = Layout<_128>;  // 128 thread ids; presumably one warpgroup -- TODO confirm
+  using ALayout = GMMA::ALayout_64x32;  // NOTE(review): presumably the register-side A layout -- confirm
+  using BLayout = GMMA::ABLayout<104, 32>;
+  using CLayout = GMMA::CLayout_64x104;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // mutable per-instance ScaleOut, initially One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SM90_64x112x32_F16E4M3E4M3_SS_TN: alias of the SM90::GMMA op and its MMA_Traits (M=64, N=112, K=32).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x112x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x112x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x112x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;  // D and C share one element type
+  using ValTypeA = float_e4m3_t;  // A and B share one element type
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B both declare K-major smem_desc fragments
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_112,_32>;
+  using ThrID   = Layout<_128>;  // 128 thread ids; presumably one warpgroup -- TODO confirm
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<112, 32>;
+  using CLayout = GMMA::CLayout_64x112;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // mutable per-instance ScaleOut, initially One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SM90_64x112x32_F16E4M3E4M3_RS_TN: alias of the SM90::GMMA op and its MMA_Traits (M=64, N=112, K=32).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x112x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x112x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x112x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;  // D and C share one element type
+  using ValTypeA = float_e4m3_t;  // A and B share one element type
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_112,_32>;
+  using ThrID   = Layout<_128>;  // 128 thread ids; presumably one warpgroup -- TODO confirm
+  using ALayout = GMMA::ALayout_64x32;  // NOTE(review): presumably the register-side A layout -- confirm
+  using BLayout = GMMA::ABLayout<112, 32>;
+  using CLayout = GMMA::CLayout_64x112;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // mutable per-instance ScaleOut, initially One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SM90_64x112x32_F32E4M3E4M3_SS_TN: alias of the SM90::GMMA op and its MMA_Traits (M=64, N=112, K=32).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x112x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x112x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x112x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;  // D and C share one element type
+  using ValTypeA = float_e4m3_t;  // A and B share one element type
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B both declare K-major smem_desc fragments
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_112,_32>;
+  using ThrID   = Layout<_128>;  // 128 thread ids; presumably one warpgroup -- TODO confirm
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<112, 32>;
+  using CLayout = GMMA::CLayout_64x112;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // mutable per-instance ScaleOut, initially One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SM90_64x112x32_F32E4M3E4M3_RS_TN: alias of the SM90::GMMA op and its MMA_Traits (M=64, N=112, K=32).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x112x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x112x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x112x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;  // D and C share one element type
+  using ValTypeA = float_e4m3_t;  // A and B share one element type
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_112,_32>;
+  using ThrID   = Layout<_128>;  // 128 thread ids; presumably one warpgroup -- TODO confirm
+  using ALayout = GMMA::ALayout_64x32;  // NOTE(review): presumably the register-side A layout -- confirm
+  using BLayout = GMMA::ABLayout<112, 32>;
+  using CLayout = GMMA::CLayout_64x112;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // mutable per-instance ScaleOut, initially One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SM90_64x120x32_F16E4M3E4M3_SS_TN: alias of the SM90::GMMA op and its MMA_Traits (M=64, N=120, K=32).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x120x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x120x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x120x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;  // D and C share one element type
+  using ValTypeA = float_e4m3_t;  // A and B share one element type
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B both declare K-major smem_desc fragments
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_120,_32>;
+  using ThrID   = Layout<_128>;  // 128 thread ids; presumably one warpgroup -- TODO confirm
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<120, 32>;
+  using CLayout = GMMA::CLayout_64x120;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // mutable per-instance ScaleOut, initially One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SM90_64x120x32_F16E4M3E4M3_RS_TN: alias of the SM90::GMMA op and its MMA_Traits (M=64, N=120, K=32).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x120x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x120x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x120x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;  // D and C share one element type
+  using ValTypeA = float_e4m3_t;  // A and B share one element type
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_120,_32>;
+  using ThrID   = Layout<_128>;  // 128 thread ids; presumably one warpgroup -- TODO confirm
+  using ALayout = GMMA::ALayout_64x32;  // NOTE(review): presumably the register-side A layout -- confirm
+  using BLayout = GMMA::ABLayout<120, 32>;
+  using CLayout = GMMA::CLayout_64x120;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // mutable per-instance ScaleOut, initially One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SM90_64x120x32_F32E4M3E4M3_SS_TN: alias of the SM90::GMMA op and its MMA_Traits (M=64, N=120, K=32).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x120x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x120x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x120x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;  // D and C share one element type
+  using ValTypeA = float_e4m3_t;  // A and B share one element type
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B both declare K-major smem_desc fragments
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_120,_32>;
+  using ThrID   = Layout<_128>;  // 128 thread ids; presumably one warpgroup -- TODO confirm
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<120, 32>;
+  using CLayout = GMMA::CLayout_64x120;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // mutable per-instance ScaleOut, initially One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// SM90_64x120x32_F32E4M3E4M3_RS_TN: alias of the SM90::GMMA op and its MMA_Traits (M=64, N=120, K=32).
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x120x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x120x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x120x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;  // D and C share one element type
+  using ValTypeA = float_e4m3_t;  // A and B share one element type
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_120,_32>;
+  using ThrID   = Layout<_128>;  // 128 thread ids; presumably one warpgroup -- TODO confirm
+  using ALayout = GMMA::ALayout_64x32;  // NOTE(review): presumably the register-side A layout -- confirm
+  using BLayout = GMMA::ABLayout<120, 32>;
+  using CLayout = GMMA::CLayout_64x120;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // mutable per-instance ScaleOut, initially One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x136x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x136x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
+// Traits for the 64x136x32 SS atom (alias above defaults scaleA/scaleB to One): float_e4m3_t A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x136x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_136,_32>;  // M=64, N=136, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<136, 32>;
+  using CLayout = GMMA::CLayout_64x136;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x136x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x136x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
+// Traits for the 64x136x32 RS atom (alias above defaults scaleA/scaleB to One): float_e4m3_t A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x136x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a smem_desc fragment type; no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_136,_32>;  // M=64, N=136, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;  // NOTE(review): presumably the register-resident A layout for RS; confirm
+  using BLayout = GMMA::ABLayout<136, 32>;
+  using CLayout = GMMA::CLayout_64x136;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x136x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x136x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
+// Traits for the 64x136x32 SS atom (alias above defaults scaleA/scaleB to One): float_e4m3_t A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x136x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_136,_32>;  // M=64, N=136, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<136, 32>;
+  using CLayout = GMMA::CLayout_64x136;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x136x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x136x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
+// Traits for the 64x136x32 RS atom (alias above defaults scaleA/scaleB to One): float_e4m3_t A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x136x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a smem_desc fragment type; no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_136,_32>;  // M=64, N=136, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;  // NOTE(review): presumably the register-resident A layout for RS; confirm
+  using BLayout = GMMA::ABLayout<136, 32>;
+  using CLayout = GMMA::CLayout_64x136;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x144x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x144x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
+// Traits for the 64x144x32 SS atom (alias above defaults scaleA/scaleB to One): float_e4m3_t A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x144x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_144,_32>;  // M=64, N=144, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<144, 32>;
+  using CLayout = GMMA::CLayout_64x144;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x144x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x144x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
+// Traits for the 64x144x32 RS atom (alias above defaults scaleA/scaleB to One): float_e4m3_t A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x144x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a smem_desc fragment type; no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_144,_32>;  // M=64, N=144, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;  // NOTE(review): presumably the register-resident A layout for RS; confirm
+  using BLayout = GMMA::ABLayout<144, 32>;
+  using CLayout = GMMA::CLayout_64x144;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x144x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x144x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
+// Traits for the 64x144x32 SS atom (alias above defaults scaleA/scaleB to One): float_e4m3_t A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x144x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_144,_32>;  // M=64, N=144, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<144, 32>;
+  using CLayout = GMMA::CLayout_64x144;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x144x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x144x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
+// Traits for the 64x144x32 RS atom (alias above defaults scaleA/scaleB to One): float_e4m3_t A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x144x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a smem_desc fragment type; no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_144,_32>;  // M=64, N=144, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;  // NOTE(review): presumably the register-resident A layout for RS; confirm
+  using BLayout = GMMA::ABLayout<144, 32>;
+  using CLayout = GMMA::CLayout_64x144;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x152x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x152x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
+// Traits for the 64x152x32 SS atom (alias above defaults scaleA/scaleB to One): float_e4m3_t A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x152x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_152,_32>;  // M=64, N=152, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<152, 32>;
+  using CLayout = GMMA::CLayout_64x152;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x152x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x152x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
+// Traits for the 64x152x32 RS atom (alias above defaults scaleA/scaleB to One): float_e4m3_t A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x152x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a smem_desc fragment type; no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_152,_32>;  // M=64, N=152, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;  // NOTE(review): presumably the register-resident A layout for RS; confirm
+  using BLayout = GMMA::ABLayout<152, 32>;
+  using CLayout = GMMA::CLayout_64x152;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x152x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x152x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
+// Traits for the 64x152x32 SS atom (alias above defaults scaleA/scaleB to One): float_e4m3_t A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x152x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_152,_32>;  // M=64, N=152, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<152, 32>;
+  using CLayout = GMMA::CLayout_64x152;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x152x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x152x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
+// Traits for the 64x152x32 RS atom (alias above defaults scaleA/scaleB to One): float_e4m3_t A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x152x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a smem_desc fragment type; no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_152,_32>;  // M=64, N=152, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;  // NOTE(review): presumably the register-resident A layout for RS; confirm
+  using BLayout = GMMA::ABLayout<152, 32>;
+  using CLayout = GMMA::CLayout_64x152;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x160x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x160x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
+// Traits for the 64x160x32 SS atom (alias above defaults scaleA/scaleB to One): float_e4m3_t A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x160x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_160,_32>;  // M=64, N=160, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<160, 32>;
+  using CLayout = GMMA::CLayout_64x160;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x160x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x160x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
+// Traits for the 64x160x32 RS atom (alias above defaults scaleA/scaleB to One): float_e4m3_t A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x160x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a smem_desc fragment type; no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_160,_32>;  // M=64, N=160, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;  // NOTE(review): presumably the register-resident A layout for RS; confirm
+  using BLayout = GMMA::ABLayout<160, 32>;
+  using CLayout = GMMA::CLayout_64x160;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x160x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x160x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
+// Traits for the 64x160x32 SS atom (alias above defaults scaleA/scaleB to One): float_e4m3_t A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x160x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_160,_32>;  // M=64, N=160, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<160, 32>;
+  using CLayout = GMMA::CLayout_64x160;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x160x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x160x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
+// Traits for the 64x160x32 RS atom (alias above defaults scaleA/scaleB to One): float_e4m3_t A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x160x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a smem_desc fragment type; no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_160,_32>;  // M=64, N=160, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;  // NOTE(review): presumably the register-resident A layout for RS; confirm
+  using BLayout = GMMA::ABLayout<160, 32>;
+  using CLayout = GMMA::CLayout_64x160;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x168x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x168x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
+// Traits for the 64x168x32 SS atom (alias above defaults scaleA/scaleB to One): float_e4m3_t A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x168x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_168,_32>;  // M=64, N=168, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<168, 32>;
+  using CLayout = GMMA::CLayout_64x168;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x168x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x168x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
+// Traits for the 64x168x32 RS atom (alias above defaults scaleA/scaleB to One): float_e4m3_t A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x168x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a smem_desc fragment type; no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_168,_32>;  // M=64, N=168, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;  // NOTE(review): presumably the register-resident A layout for RS; confirm
+  using BLayout = GMMA::ABLayout<168, 32>;
+  using CLayout = GMMA::CLayout_64x168;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x168x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x168x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
+// Traits for the 64x168x32 SS atom (alias above defaults scaleA/scaleB to One): float_e4m3_t A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x168x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_168,_32>;  // M=64, N=168, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<168, 32>;
+  using CLayout = GMMA::CLayout_64x168;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x168x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x168x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
+// Traits for the 64x168x32 RS atom (alias above defaults scaleA/scaleB to One): float_e4m3_t A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x168x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a smem_desc fragment type; no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_168,_32>;  // M=64, N=168, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;  // NOTE(review): presumably the register-resident A layout for RS; confirm
+  using BLayout = GMMA::ABLayout<168, 32>;
+  using CLayout = GMMA::CLayout_64x168;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x176x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x176x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
+// Traits for the 64x176x32 SS atom (alias above defaults scaleA/scaleB to One): float_e4m3_t A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x176x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_176,_32>;  // M=64, N=176, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<176, 32>;
+  using CLayout = GMMA::CLayout_64x176;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x176x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x176x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
+// Traits for the 64x176x32 RS atom (alias above defaults scaleA/scaleB to One): float_e4m3_t A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x176x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a smem_desc fragment type; no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_176,_32>;  // M=64, N=176, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;  // NOTE(review): presumably the register-resident A layout for RS; confirm
+  using BLayout = GMMA::ABLayout<176, 32>;
+  using CLayout = GMMA::CLayout_64x176;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x176x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x176x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
+// Traits for the 64x176x32 SS atom (alias above defaults scaleA/scaleB to One): float_e4m3_t A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x176x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_176,_32>;  // M=64, N=176, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<176, 32>;
+  using CLayout = GMMA::CLayout_64x176;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x176x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x176x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
+// Traits for the 64x176x32 RS atom (alias above defaults scaleA/scaleB to One): float_e4m3_t A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x176x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a smem_desc fragment type; no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_176,_32>;  // M=64, N=176, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;  // NOTE(review): presumably the register-resident A layout for RS; confirm
+  using BLayout = GMMA::ABLayout<176, 32>;
+  using CLayout = GMMA::CLayout_64x176;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x184x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x184x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
+// Traits for the 64x184x32 SS atom (alias above defaults scaleA/scaleB to One): float_e4m3_t A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x184x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_184,_32>;  // M=64, N=184, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<184, 32>;
+  using CLayout = GMMA::CLayout_64x184;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x184x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x184x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
+// Traits for the 64x184x32 RS atom (alias above defaults scaleA/scaleB to One): float_e4m3_t A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x184x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a smem_desc fragment type; no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_184,_32>;  // M=64, N=184, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;  // NOTE(review): presumably the register-resident A layout for RS; confirm
+  using BLayout = GMMA::ABLayout<184, 32>;
+  using CLayout = GMMA::CLayout_64x184;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x184x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x184x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
+// Traits for the 64x184x32 SS atom (alias above defaults scaleA/scaleB to One): float_e4m3_t A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x184x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_184,_32>;  // M=64, N=184, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<184, 32>;
+  using CLayout = GMMA::CLayout_64x184;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x184x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x184x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
+// Traits for the 64x184x32 RS atom (alias above defaults scaleA/scaleB to One): float_e4m3_t A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x184x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a smem_desc fragment type; no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_184,_32>;  // M=64, N=184, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;  // NOTE(review): presumably the register-resident A layout for RS; confirm
+  using BLayout = GMMA::ABLayout<184, 32>;
+  using CLayout = GMMA::CLayout_64x184;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x200x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x200x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
+// Traits for the 64x200x32 SS atom (alias above defaults scaleA/scaleB to One): float_e4m3_t A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x200x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_200,_32>;  // M=64, N=200, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<200, 32>;
+  using CLayout = GMMA::CLayout_64x200;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x200x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x200x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
+// Traits for the 64x200x32 RS atom (alias above defaults scaleA/scaleB to One): float_e4m3_t A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x200x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a smem_desc fragment type; no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_200,_32>;  // M=64, N=200, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;  // NOTE(review): presumably the register-resident A layout for RS; confirm
+  using BLayout = GMMA::ABLayout<200, 32>;
+  using CLayout = GMMA::CLayout_64x200;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x200x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x200x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
+// Traits for the 64x200x32 SS atom (alias above defaults scaleA/scaleB to One): float_e4m3_t A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x200x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_200,_32>;  // M=64, N=200, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<200, 32>;
+  using CLayout = GMMA::CLayout_64x200;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x200x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x200x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
+// Traits for the 64x200x32 RS atom (alias above defaults scaleA/scaleB to One): float_e4m3_t A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x200x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a smem_desc fragment type; no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_200,_32>;  // M=64, N=200, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;  // NOTE(review): presumably the register-resident A layout for RS; confirm
+  using BLayout = GMMA::ABLayout<200, 32>;
+  using CLayout = GMMA::CLayout_64x200;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x208x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x208x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
+// Traits for the 64x208x32 SS atom (alias above defaults scaleA/scaleB to One): float_e4m3_t A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x208x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_208,_32>;  // M=64, N=208, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<208, 32>;
+  using CLayout = GMMA::CLayout_64x208;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x208x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x208x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
+// Traits for the 64x208x32 RS atom (alias above defaults scaleA/scaleB to One): float_e4m3_t A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x208x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a smem_desc fragment type; no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_208,_32>;  // M=64, N=208, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;  // NOTE(review): presumably the register-resident A layout for RS; confirm
+  using BLayout = GMMA::ABLayout<208, 32>;
+  using CLayout = GMMA::CLayout_64x208;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x208x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x208x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
+// Traits for the 64x208x32 SS atom (alias above defaults scaleA/scaleB to One): float_e4m3_t A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x208x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_208,_32>;  // M=64, N=208, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<208, 32>;
+  using CLayout = GMMA::CLayout_64x208;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias + MMA_Traits for the 64x208x32 F32E4M3E4M3 RS_TN op; both scales default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x208x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x208x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; // alias of the SM90::GMMA op struct
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x208x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>  // traits for any scaleA/scaleB instantiation
+{
+  using ValTypeD = float;  // D value type
+  using ValTypeA = float_e4m3_t;  // A value type
+  using ValTypeB = float_e4m3_t;  // B value type
+  using ValTypeC = float;  // C value type (same as D)
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a smem_desc fragment; no FrgTypeA here (presumably A comes from registers - confirm)
+
+  using Shape_MNK = Shape<_64,_208,_32>;  // M=64, N=208, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ALayout_64x32;  // dedicated 64x32 A layout instead of ABLayout
+  using BLayout = GMMA::ABLayout<208, 32>;  // ABLayout instantiated with N=208, K=32
+  using CLayout = GMMA::CLayout_64x208;  // named 64x208 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // data member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias + MMA_Traits for the 64x216x32 F16E4M3E4M3 SS_TN op; both scales default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x216x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x216x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;  // alias of the SM90::GMMA op struct
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x216x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>  // traits for any scaleA/scaleB instantiation
+{
+  using ValTypeD = half_t;  // D value type
+  using ValTypeA = float_e4m3_t;  // A value type
+  using ValTypeB = float_e4m3_t;  // B value type
+  using ValTypeC = half_t;  // C value type (same as D)
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_216,_32>;  // M=64, N=216, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 32>;  // ABLayout instantiated with M=64, K=32
+  using BLayout = GMMA::ABLayout<216, 32>;  // ABLayout instantiated with N=216, K=32
+  using CLayout = GMMA::CLayout_64x216;  // named 64x216 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // data member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias + MMA_Traits for the 64x216x32 F16E4M3E4M3 RS_TN op; both scales default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x216x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x216x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; // alias of the SM90::GMMA op struct
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x216x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>  // traits for any scaleA/scaleB instantiation
+{
+  using ValTypeD = half_t;  // D value type
+  using ValTypeA = float_e4m3_t;  // A value type
+  using ValTypeB = float_e4m3_t;  // B value type
+  using ValTypeC = half_t;  // C value type (same as D)
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a smem_desc fragment; no FrgTypeA here (presumably A comes from registers - confirm)
+
+  using Shape_MNK = Shape<_64,_216,_32>;  // M=64, N=216, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ALayout_64x32;  // dedicated 64x32 A layout instead of ABLayout
+  using BLayout = GMMA::ABLayout<216, 32>;  // ABLayout instantiated with N=216, K=32
+  using CLayout = GMMA::CLayout_64x216;  // named 64x216 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // data member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias + MMA_Traits for the 64x216x32 F32E4M3E4M3 SS_TN op; both scales default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x216x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x216x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;  // alias of the SM90::GMMA op struct
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x216x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>  // traits for any scaleA/scaleB instantiation
+{
+  using ValTypeD = float;  // D value type
+  using ValTypeA = float_e4m3_t;  // A value type
+  using ValTypeB = float_e4m3_t;  // B value type
+  using ValTypeC = float;  // C value type (same as D)
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_216,_32>;  // M=64, N=216, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 32>;  // ABLayout instantiated with M=64, K=32
+  using BLayout = GMMA::ABLayout<216, 32>;  // ABLayout instantiated with N=216, K=32
+  using CLayout = GMMA::CLayout_64x216;  // named 64x216 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // data member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias + MMA_Traits for the 64x216x32 F32E4M3E4M3 RS_TN op; both scales default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x216x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x216x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; // alias of the SM90::GMMA op struct
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x216x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>  // traits for any scaleA/scaleB instantiation
+{
+  using ValTypeD = float;  // D value type
+  using ValTypeA = float_e4m3_t;  // A value type
+  using ValTypeB = float_e4m3_t;  // B value type
+  using ValTypeC = float;  // C value type (same as D)
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a smem_desc fragment; no FrgTypeA here (presumably A comes from registers - confirm)
+
+  using Shape_MNK = Shape<_64,_216,_32>;  // M=64, N=216, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ALayout_64x32;  // dedicated 64x32 A layout instead of ABLayout
+  using BLayout = GMMA::ABLayout<216, 32>;  // ABLayout instantiated with N=216, K=32
+  using CLayout = GMMA::CLayout_64x216;  // named 64x216 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // data member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias + MMA_Traits for the 64x224x32 F16E4M3E4M3 SS_TN op; both scales default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x224x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x224x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;  // alias of the SM90::GMMA op struct
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x224x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>  // traits for any scaleA/scaleB instantiation
+{
+  using ValTypeD = half_t;  // D value type
+  using ValTypeA = float_e4m3_t;  // A value type
+  using ValTypeB = float_e4m3_t;  // B value type
+  using ValTypeC = half_t;  // C value type (same as D)
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_224,_32>;  // M=64, N=224, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 32>;  // ABLayout instantiated with M=64, K=32
+  using BLayout = GMMA::ABLayout<224, 32>;  // ABLayout instantiated with N=224, K=32
+  using CLayout = GMMA::CLayout_64x224;  // named 64x224 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // data member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias + MMA_Traits for the 64x224x32 F16E4M3E4M3 RS_TN op; both scales default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x224x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x224x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; // alias of the SM90::GMMA op struct
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x224x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>  // traits for any scaleA/scaleB instantiation
+{
+  using ValTypeD = half_t;  // D value type
+  using ValTypeA = float_e4m3_t;  // A value type
+  using ValTypeB = float_e4m3_t;  // B value type
+  using ValTypeC = half_t;  // C value type (same as D)
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a smem_desc fragment; no FrgTypeA here (presumably A comes from registers - confirm)
+
+  using Shape_MNK = Shape<_64,_224,_32>;  // M=64, N=224, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ALayout_64x32;  // dedicated 64x32 A layout instead of ABLayout
+  using BLayout = GMMA::ABLayout<224, 32>;  // ABLayout instantiated with N=224, K=32
+  using CLayout = GMMA::CLayout_64x224;  // named 64x224 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // data member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias + MMA_Traits for the 64x224x32 F32E4M3E4M3 SS_TN op; both scales default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x224x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x224x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;  // alias of the SM90::GMMA op struct
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x224x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>  // traits for any scaleA/scaleB instantiation
+{
+  using ValTypeD = float;  // D value type
+  using ValTypeA = float_e4m3_t;  // A value type
+  using ValTypeB = float_e4m3_t;  // B value type
+  using ValTypeC = float;  // C value type (same as D)
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_224,_32>;  // M=64, N=224, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 32>;  // ABLayout instantiated with M=64, K=32
+  using BLayout = GMMA::ABLayout<224, 32>;  // ABLayout instantiated with N=224, K=32
+  using CLayout = GMMA::CLayout_64x224;  // named 64x224 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // data member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias + MMA_Traits for the 64x224x32 F32E4M3E4M3 RS_TN op; both scales default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x224x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x224x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; // alias of the SM90::GMMA op struct
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x224x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>  // traits for any scaleA/scaleB instantiation
+{
+  using ValTypeD = float;  // D value type
+  using ValTypeA = float_e4m3_t;  // A value type
+  using ValTypeB = float_e4m3_t;  // B value type
+  using ValTypeC = float;  // C value type (same as D)
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a smem_desc fragment; no FrgTypeA here (presumably A comes from registers - confirm)
+
+  using Shape_MNK = Shape<_64,_224,_32>;  // M=64, N=224, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ALayout_64x32;  // dedicated 64x32 A layout instead of ABLayout
+  using BLayout = GMMA::ABLayout<224, 32>;  // ABLayout instantiated with N=224, K=32
+  using CLayout = GMMA::CLayout_64x224;  // named 64x224 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // data member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias + MMA_Traits for the 64x232x32 F16E4M3E4M3 SS_TN op; both scales default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x232x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x232x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;  // alias of the SM90::GMMA op struct
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x232x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>  // traits for any scaleA/scaleB instantiation
+{
+  using ValTypeD = half_t;  // D value type
+  using ValTypeA = float_e4m3_t;  // A value type
+  using ValTypeB = float_e4m3_t;  // B value type
+  using ValTypeC = half_t;  // C value type (same as D)
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_232,_32>;  // M=64, N=232, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 32>;  // ABLayout instantiated with M=64, K=32
+  using BLayout = GMMA::ABLayout<232, 32>;  // ABLayout instantiated with N=232, K=32
+  using CLayout = GMMA::CLayout_64x232;  // named 64x232 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // data member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias + MMA_Traits for the 64x232x32 F16E4M3E4M3 RS_TN op; both scales default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x232x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x232x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; // alias of the SM90::GMMA op struct
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x232x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>  // traits for any scaleA/scaleB instantiation
+{
+  using ValTypeD = half_t;  // D value type
+  using ValTypeA = float_e4m3_t;  // A value type
+  using ValTypeB = float_e4m3_t;  // B value type
+  using ValTypeC = half_t;  // C value type (same as D)
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a smem_desc fragment; no FrgTypeA here (presumably A comes from registers - confirm)
+
+  using Shape_MNK = Shape<_64,_232,_32>;  // M=64, N=232, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ALayout_64x32;  // dedicated 64x32 A layout instead of ABLayout
+  using BLayout = GMMA::ABLayout<232, 32>;  // ABLayout instantiated with N=232, K=32
+  using CLayout = GMMA::CLayout_64x232;  // named 64x232 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // data member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias + MMA_Traits for the 64x232x32 F32E4M3E4M3 SS_TN op; both scales default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x232x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x232x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;  // alias of the SM90::GMMA op struct
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x232x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>  // traits for any scaleA/scaleB instantiation
+{
+  using ValTypeD = float;  // D value type
+  using ValTypeA = float_e4m3_t;  // A value type
+  using ValTypeB = float_e4m3_t;  // B value type
+  using ValTypeC = float;  // C value type (same as D)
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_232,_32>;  // M=64, N=232, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 32>;  // ABLayout instantiated with M=64, K=32
+  using BLayout = GMMA::ABLayout<232, 32>;  // ABLayout instantiated with N=232, K=32
+  using CLayout = GMMA::CLayout_64x232;  // named 64x232 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // data member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias + MMA_Traits for the 64x232x32 F32E4M3E4M3 RS_TN op; both scales default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x232x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x232x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; // alias of the SM90::GMMA op struct
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x232x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>  // traits for any scaleA/scaleB instantiation
+{
+  using ValTypeD = float;  // D value type
+  using ValTypeA = float_e4m3_t;  // A value type
+  using ValTypeB = float_e4m3_t;  // B value type
+  using ValTypeC = float;  // C value type (same as D)
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a smem_desc fragment; no FrgTypeA here (presumably A comes from registers - confirm)
+
+  using Shape_MNK = Shape<_64,_232,_32>;  // M=64, N=232, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ALayout_64x32;  // dedicated 64x32 A layout instead of ABLayout
+  using BLayout = GMMA::ABLayout<232, 32>;  // ABLayout instantiated with N=232, K=32
+  using CLayout = GMMA::CLayout_64x232;  // named 64x232 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // data member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias + MMA_Traits for the 64x240x32 F16E4M3E4M3 SS_TN op; both scales default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x240x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x240x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;  // alias of the SM90::GMMA op struct
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x240x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>  // traits for any scaleA/scaleB instantiation
+{
+  using ValTypeD = half_t;  // D value type
+  using ValTypeA = float_e4m3_t;  // A value type
+  using ValTypeB = float_e4m3_t;  // B value type
+  using ValTypeC = half_t;  // C value type (same as D)
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_240,_32>;  // M=64, N=240, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 32>;  // ABLayout instantiated with M=64, K=32
+  using BLayout = GMMA::ABLayout<240, 32>;  // ABLayout instantiated with N=240, K=32
+  using CLayout = GMMA::CLayout_64x240;  // named 64x240 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // data member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias + MMA_Traits for the 64x240x32 F16E4M3E4M3 RS_TN op; both scales default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x240x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x240x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; // alias of the SM90::GMMA op struct
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x240x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>  // traits for any scaleA/scaleB instantiation
+{
+  using ValTypeD = half_t;  // D value type
+  using ValTypeA = float_e4m3_t;  // A value type
+  using ValTypeB = float_e4m3_t;  // B value type
+  using ValTypeC = half_t;  // C value type (same as D)
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a smem_desc fragment; no FrgTypeA here (presumably A comes from registers - confirm)
+
+  using Shape_MNK = Shape<_64,_240,_32>;  // M=64, N=240, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ALayout_64x32;  // dedicated 64x32 A layout instead of ABLayout
+  using BLayout = GMMA::ABLayout<240, 32>;  // ABLayout instantiated with N=240, K=32
+  using CLayout = GMMA::CLayout_64x240;  // named 64x240 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // data member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias + MMA_Traits for the 64x240x32 F32E4M3E4M3 SS_TN op; both scales default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x240x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x240x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;  // alias of the SM90::GMMA op struct
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x240x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>  // traits for any scaleA/scaleB instantiation
+{
+  using ValTypeD = float;  // D value type
+  using ValTypeA = float_e4m3_t;  // A value type
+  using ValTypeB = float_e4m3_t;  // B value type
+  using ValTypeC = float;  // C value type (same as D)
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_240,_32>;  // M=64, N=240, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 32>;  // ABLayout instantiated with M=64, K=32
+  using BLayout = GMMA::ABLayout<240, 32>;  // ABLayout instantiated with N=240, K=32
+  using CLayout = GMMA::CLayout_64x240;  // named 64x240 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // data member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias + MMA_Traits for the 64x240x32 F32E4M3E4M3 RS_TN op; both scales default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x240x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x240x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; // alias of the SM90::GMMA op struct
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x240x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>  // traits for any scaleA/scaleB instantiation
+{
+  using ValTypeD = float;  // D value type
+  using ValTypeA = float_e4m3_t;  // A value type
+  using ValTypeB = float_e4m3_t;  // B value type
+  using ValTypeC = float;  // C value type (same as D)
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a smem_desc fragment; no FrgTypeA here (presumably A comes from registers - confirm)
+
+  using Shape_MNK = Shape<_64,_240,_32>;  // M=64, N=240, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ALayout_64x32;  // dedicated 64x32 A layout instead of ABLayout
+  using BLayout = GMMA::ABLayout<240, 32>;  // ABLayout instantiated with N=240, K=32
+  using CLayout = GMMA::CLayout_64x240;  // named 64x240 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // data member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias + MMA_Traits for the 64x248x32 F16E4M3E4M3 SS_TN op; both scales default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x248x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x248x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;  // alias of the SM90::GMMA op struct
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x248x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>  // traits for any scaleA/scaleB instantiation
+{
+  using ValTypeD = half_t;  // D value type
+  using ValTypeA = float_e4m3_t;  // A value type
+  using ValTypeB = float_e4m3_t;  // B value type
+  using ValTypeC = half_t;  // C value type (same as D)
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_248,_32>;  // M=64, N=248, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 32>;  // ABLayout instantiated with M=64, K=32
+  using BLayout = GMMA::ABLayout<248, 32>;  // ABLayout instantiated with N=248, K=32
+  using CLayout = GMMA::CLayout_64x248;  // named 64x248 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // data member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias + MMA_Traits for the 64x248x32 F16E4M3E4M3 RS_TN op; both scales default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x248x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x248x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; // alias of the SM90::GMMA op struct
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x248x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>  // traits for any scaleA/scaleB instantiation
+{
+  using ValTypeD = half_t;  // D value type
+  using ValTypeA = float_e4m3_t;  // A value type
+  using ValTypeB = float_e4m3_t;  // B value type
+  using ValTypeC = half_t;  // C value type (same as D)
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a smem_desc fragment; no FrgTypeA here (presumably A comes from registers - confirm)
+
+  using Shape_MNK = Shape<_64,_248,_32>;  // M=64, N=248, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ALayout_64x32;  // dedicated 64x32 A layout instead of ABLayout
+  using BLayout = GMMA::ABLayout<248, 32>;  // ABLayout instantiated with N=248, K=32
+  using CLayout = GMMA::CLayout_64x248;  // named 64x248 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // data member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias + MMA_Traits for the 64x248x32 F32E4M3E4M3 SS_TN op; both scales default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x248x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x248x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;  // alias of the SM90::GMMA op struct
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x248x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>  // traits for any scaleA/scaleB instantiation
+{
+  using ValTypeD = float;  // D value type
+  using ValTypeA = float_e4m3_t;  // A value type
+  using ValTypeB = float_e4m3_t;  // B value type
+  using ValTypeC = float;  // C value type (same as D)
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_248,_32>;  // M=64, N=248, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 32>;  // ABLayout instantiated with M=64, K=32
+  using BLayout = GMMA::ABLayout<248, 32>;  // ABLayout instantiated with N=248, K=32
+  using CLayout = GMMA::CLayout_64x248;  // named 64x248 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // data member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias + MMA_Traits for the 64x248x32 F32E4M3E4M3 RS_TN op; both scales default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x248x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x248x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; // alias of the SM90::GMMA op struct
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x248x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>  // traits for any scaleA/scaleB instantiation
+{
+  using ValTypeD = float;  // D value type
+  using ValTypeA = float_e4m3_t;  // A value type
+  using ValTypeB = float_e4m3_t;  // B value type
+  using ValTypeC = float;  // C value type (same as D)
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a smem_desc fragment; no FrgTypeA here (presumably A comes from registers - confirm)
+
+  using Shape_MNK = Shape<_64,_248,_32>;  // M=64, N=248, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ALayout_64x32;  // dedicated 64x32 A layout instead of ABLayout
+  using BLayout = GMMA::ABLayout<248, 32>;  // ABLayout instantiated with N=248, K=32
+  using CLayout = GMMA::CLayout_64x248;  // named 64x248 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // data member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias + MMA_Traits for the 64x24x32 F16E4M3E5M2 SS_TN op; both scales default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x24x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x24x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;  // alias of the SM90::GMMA op struct
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x24x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>  // traits for any scaleA/scaleB instantiation
+{
+  using ValTypeD = half_t;  // D value type
+  using ValTypeA = float_e4m3_t;  // A value type
+  using ValTypeB = float_e5m2_t;  // B value type (e5m2, unlike A's e4m3)
+  using ValTypeC = half_t;  // C value type (same as D)
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_24,_32>;  // M=64, N=24, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 32>;  // ABLayout instantiated with M=64, K=32
+  using BLayout = GMMA::ABLayout< 24, 32>;  // ABLayout instantiated with N=24, K=32
+  using CLayout = GMMA::CLayout_64x24;  // named 64x24 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // data member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias + MMA_Traits for the 64x24x32 F16E4M3E5M2 RS_TN op; both scales default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x24x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x24x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; // alias of the SM90::GMMA op struct
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x24x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>  // traits for any scaleA/scaleB instantiation
+{
+  using ValTypeD = half_t;  // D value type
+  using ValTypeA = float_e4m3_t;  // A value type
+  using ValTypeB = float_e5m2_t;  // B value type (e5m2, unlike A's e4m3)
+  using ValTypeC = half_t;  // C value type (same as D)
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a smem_desc fragment; no FrgTypeA here (presumably A comes from registers - confirm)
+
+  using Shape_MNK = Shape<_64,_24,_32>;  // M=64, N=24, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ALayout_64x32;  // dedicated 64x32 A layout instead of ABLayout
+  using BLayout = GMMA::ABLayout< 24, 32>;  // ABLayout instantiated with N=24, K=32
+  using CLayout = GMMA::CLayout_64x24;  // named 64x24 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // data member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias + MMA_Traits for the 64x24x32 F32E4M3E5M2 SS_TN op; both scales default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x24x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x24x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;  // alias of the SM90::GMMA op struct
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x24x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>  // traits for any scaleA/scaleB instantiation
+{
+  using ValTypeD = float;  // D value type
+  using ValTypeA = float_e4m3_t;  // A value type
+  using ValTypeB = float_e5m2_t;  // B value type (e5m2, unlike A's e4m3)
+  using ValTypeC = float;  // C value type (same as D)
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_24,_32>;  // M=64, N=24, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 32>;  // ABLayout instantiated with M=64, K=32
+  using BLayout = GMMA::ABLayout< 24, 32>;  // ABLayout instantiated with N=24, K=32
+  using CLayout = GMMA::CLayout_64x24;  // named 64x24 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // data member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias + MMA_Traits for the 64x24x32 F32E4M3E5M2 RS_TN op; both scales default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x24x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x24x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; // alias of the SM90::GMMA op struct
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x24x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>  // traits for any scaleA/scaleB instantiation
+{
+  using ValTypeD = float;  // D value type
+  using ValTypeA = float_e4m3_t;  // A value type
+  using ValTypeB = float_e5m2_t;  // B value type (e5m2, unlike A's e4m3)
+  using ValTypeC = float;  // C value type (same as D)
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a smem_desc fragment; no FrgTypeA here (presumably A comes from registers - confirm)
+
+  using Shape_MNK = Shape<_64,_24,_32>;  // M=64, N=24, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ALayout_64x32;  // dedicated 64x32 A layout instead of ABLayout
+  using BLayout = GMMA::ABLayout< 24, 32>;  // ABLayout instantiated with N=24, K=32
+  using CLayout = GMMA::CLayout_64x24;  // named 64x24 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // data member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias + MMA_Traits for the 64x40x32 F16E4M3E5M2 SS_TN op; both scales default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x40x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x40x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;  // alias of the SM90::GMMA op struct
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x40x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>  // traits for any scaleA/scaleB instantiation
+{
+  using ValTypeD = half_t;  // D value type
+  using ValTypeA = float_e4m3_t;  // A value type
+  using ValTypeB = float_e5m2_t;  // B value type (e5m2, unlike A's e4m3)
+  using ValTypeC = half_t;  // C value type (same as D)
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_40,_32>;  // M=64, N=40, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 32>;  // ABLayout instantiated with M=64, K=32
+  using BLayout = GMMA::ABLayout< 40, 32>;  // ABLayout instantiated with N=40, K=32
+  using CLayout = GMMA::CLayout_64x40;  // named 64x40 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // data member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias + MMA_Traits for the 64x40x32 F16E4M3E5M2 RS_TN op; both scales default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x40x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x40x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; // alias of the SM90::GMMA op struct
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x40x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>  // traits for any scaleA/scaleB instantiation
+{
+  using ValTypeD = half_t;  // D value type
+  using ValTypeA = float_e4m3_t;  // A value type
+  using ValTypeB = float_e5m2_t;  // B value type (e5m2, unlike A's e4m3)
+  using ValTypeC = half_t;  // C value type (same as D)
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a smem_desc fragment; no FrgTypeA here (presumably A comes from registers - confirm)
+
+  using Shape_MNK = Shape<_64,_40,_32>;  // M=64, N=40, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ALayout_64x32;  // dedicated 64x32 A layout instead of ABLayout
+  using BLayout = GMMA::ABLayout< 40, 32>;  // ABLayout instantiated with N=40, K=32
+  using CLayout = GMMA::CLayout_64x40;  // named 64x40 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // data member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias + MMA_Traits for the 64x40x32 F32E4M3E5M2 SS_TN op; both scales default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x40x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x40x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;  // alias of the SM90::GMMA op struct
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x40x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>  // traits for any scaleA/scaleB instantiation
+{
+  using ValTypeD = float;  // D value type
+  using ValTypeA = float_e4m3_t;  // A value type
+  using ValTypeB = float_e5m2_t;  // B value type (e5m2, unlike A's e4m3)
+  using ValTypeC = float;  // C value type (same as D)
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_40,_32>;  // M=64, N=40, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 32>;  // ABLayout instantiated with M=64, K=32
+  using BLayout = GMMA::ABLayout< 40, 32>;  // ABLayout instantiated with N=40, K=32
+  using CLayout = GMMA::CLayout_64x40;  // named 64x40 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // data member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias + MMA_Traits for the 64x40x32 F32E4M3E5M2 RS_TN op; both scales default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x40x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x40x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; // alias of the SM90::GMMA op struct
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x40x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>  // traits for any scaleA/scaleB instantiation
+{
+  using ValTypeD = float;  // D value type
+  using ValTypeA = float_e4m3_t;  // A value type
+  using ValTypeB = float_e5m2_t;  // B value type (e5m2, unlike A's e4m3)
+  using ValTypeC = float;  // C value type (same as D)
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a smem_desc fragment; no FrgTypeA here (presumably A comes from registers - confirm)
+
+  using Shape_MNK = Shape<_64,_40,_32>;  // M=64, N=40, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ALayout_64x32;  // dedicated 64x32 A layout instead of ABLayout
+  using BLayout = GMMA::ABLayout< 40, 32>;  // ABLayout instantiated with N=40, K=32
+  using CLayout = GMMA::CLayout_64x40;  // named 64x40 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // data member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias + MMA_Traits for the 64x48x32 F16E4M3E5M2 SS_TN op; both scales default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x48x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x48x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;  // alias of the SM90::GMMA op struct
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x48x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>  // traits for any scaleA/scaleB instantiation
+{
+  using ValTypeD = half_t;  // D value type
+  using ValTypeA = float_e4m3_t;  // A value type
+  using ValTypeB = float_e5m2_t;  // B value type (e5m2, unlike A's e4m3)
+  using ValTypeC = half_t;  // C value type (same as D)
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_48,_32>;  // M=64, N=48, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 32>;  // ABLayout instantiated with M=64, K=32
+  using BLayout = GMMA::ABLayout< 48, 32>;  // ABLayout instantiated with N=48, K=32
+  using CLayout = GMMA::CLayout_64x48;  // named 64x48 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // data member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias + MMA_Traits for the 64x48x32 F16E4M3E5M2 RS_TN op; both scales default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x48x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x48x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; // alias of the SM90::GMMA op struct
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x48x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>  // traits for any scaleA/scaleB instantiation
+{
+  using ValTypeD = half_t;  // D value type
+  using ValTypeA = float_e4m3_t;  // A value type
+  using ValTypeB = float_e5m2_t;  // B value type (e5m2, unlike A's e4m3)
+  using ValTypeC = half_t;  // C value type (same as D)
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a smem_desc fragment; no FrgTypeA here (presumably A comes from registers - confirm)
+
+  using Shape_MNK = Shape<_64,_48,_32>;  // M=64, N=48, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ALayout_64x32;  // dedicated 64x32 A layout instead of ABLayout
+  using BLayout = GMMA::ABLayout< 48, 32>;  // ABLayout instantiated with N=48, K=32
+  using CLayout = GMMA::CLayout_64x48;  // named 64x48 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // data member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias + MMA_Traits for the 64x48x32 F32E4M3E5M2 SS_TN op; both scales default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x48x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x48x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;  // alias of the SM90::GMMA op struct
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x48x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>  // traits for any scaleA/scaleB instantiation
+{
+  using ValTypeD = float;  // D value type
+  using ValTypeA = float_e4m3_t;  // A value type
+  using ValTypeB = float_e5m2_t;  // B value type (e5m2, unlike A's e4m3)
+  using ValTypeC = float;  // C value type (same as D)
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_48,_32>;  // M=64, N=48, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 32>;  // ABLayout instantiated with M=64, K=32
+  using BLayout = GMMA::ABLayout< 48, 32>;  // ABLayout instantiated with N=48, K=32
+  using CLayout = GMMA::CLayout_64x48;  // named 64x48 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // data member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias + MMA_Traits for the 64x48x32 F32E4M3E5M2 RS_TN op; both scales default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x48x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x48x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; // alias of the SM90::GMMA op struct
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x48x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>  // traits for any scaleA/scaleB instantiation
+{
+  using ValTypeD = float;  // D value type
+  using ValTypeA = float_e4m3_t;  // A value type
+  using ValTypeB = float_e5m2_t;  // B value type (e5m2, unlike A's e4m3)
+  using ValTypeC = float;  // C value type (same as D)
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a smem_desc fragment; no FrgTypeA here (presumably A comes from registers - confirm)
+
+  using Shape_MNK = Shape<_64,_48,_32>;  // M=64, N=48, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ALayout_64x32;  // dedicated 64x32 A layout instead of ABLayout
+  using BLayout = GMMA::ABLayout< 48, 32>;  // ABLayout instantiated with N=48, K=32
+  using CLayout = GMMA::CLayout_64x48;  // named 64x48 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // data member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias + MMA_Traits for the 64x56x32 F16E4M3E5M2 SS_TN op; both scales default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x56x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x56x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;  // alias of the SM90::GMMA op struct
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x56x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>  // traits for any scaleA/scaleB instantiation
+{
+  using ValTypeD = half_t;  // D value type
+  using ValTypeA = float_e4m3_t;  // A value type
+  using ValTypeB = float_e5m2_t;  // B value type (e5m2, unlike A's e4m3)
+  using ValTypeC = half_t;  // C value type (same as D)
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_56,_32>;  // M=64, N=56, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 32>;  // ABLayout instantiated with M=64, K=32
+  using BLayout = GMMA::ABLayout< 56, 32>;  // ABLayout instantiated with N=56, K=32
+  using CLayout = GMMA::CLayout_64x56;  // named 64x56 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // data member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias + MMA_Traits for the 64x56x32 F16E4M3E5M2 RS_TN op; both scales default to ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x56x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x56x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; // alias of the SM90::GMMA op struct
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x56x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>  // traits for any scaleA/scaleB instantiation
+{
+  using ValTypeD = half_t;  // D value type
+  using ValTypeA = float_e4m3_t;  // A value type
+  using ValTypeB = float_e5m2_t;  // B value type (e5m2, unlike A's e4m3)
+  using ValTypeC = half_t;  // C value type (same as D)
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a smem_desc fragment; no FrgTypeA here (presumably A comes from registers - confirm)
+
+  using Shape_MNK = Shape<_64,_56,_32>;  // M=64, N=56, K=32
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ALayout_64x32;  // dedicated 64x32 A layout instead of ABLayout
+  using BLayout = GMMA::ABLayout< 56, 32>;  // ABLayout instantiated with N=56, K=32
+  using CLayout = GMMA::CLayout_64x56;  // named 64x56 (M x N) C layout
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // data member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the op's first template argument; One if omitted
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as the op's second template argument; One if omitted
+>
+using SM90_64x56x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x56x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above: (M,N,K) = (64,56,32), e4m3 A, e5m2 B, float C/D; A and B both use smem_desc fragments.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x56x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Fragment types: A and B are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape, 128-entry ThrID layout, and A/B/C layouts sized for 64x56x32.
+  using Shape_MNK = Shape<_64,_56,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 56, 32>;
+  using CLayout = GMMA::CLayout_64x56;
+  // Mutable per-instance accumulate setting; starts as ScaleOut::One (its effect is defined by the op, outside this view).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the op's first template argument; One if omitted
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as the op's second template argument; One if omitted
+>
+using SM90_64x56x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x56x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above: (M,N,K) = (64,56,32), e4m3 A, e5m2 B, float C/D; only B has an smem_desc fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x56x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared (smem_desc, Major::K); no FrgTypeA -- presumably A is register-sourced ("RS"), TODO confirm.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape, 128-entry ThrID layout; A uses GMMA::ALayout_64x32, B/C are sized for N = 56.
+  using Shape_MNK = Shape<_64,_56,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 56, 32>;
+  using CLayout = GMMA::CLayout_64x56;
+  // Mutable per-instance accumulate setting; starts as ScaleOut::One (its effect is defined by the op, outside this view).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the op's first template argument; One if omitted
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as the op's second template argument; One if omitted
+>
+using SM90_64x72x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x72x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above: (M,N,K) = (64,72,32), e4m3 A, e5m2 B, half_t C/D; A and B both use smem_desc fragments.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x72x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Fragment types: A and B are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape, 128-entry ThrID layout, and A/B/C layouts sized for 64x72x32.
+  using Shape_MNK = Shape<_64,_72,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 72, 32>;
+  using CLayout = GMMA::CLayout_64x72;
+  // Mutable per-instance accumulate setting; starts as ScaleOut::One (its effect is defined by the op, outside this view).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the op's first template argument; One if omitted
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as the op's second template argument; One if omitted
+>
+using SM90_64x72x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x72x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above: (M,N,K) = (64,72,32), e4m3 A, e5m2 B, half_t C/D; only B has an smem_desc fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x72x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Only FrgTypeB is declared (smem_desc, Major::K); no FrgTypeA -- presumably A is register-sourced ("RS"), TODO confirm.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape, 128-entry ThrID layout; A uses GMMA::ALayout_64x32, B/C are sized for N = 72.
+  using Shape_MNK = Shape<_64,_72,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 72, 32>;
+  using CLayout = GMMA::CLayout_64x72;
+  // Mutable per-instance accumulate setting; starts as ScaleOut::One (its effect is defined by the op, outside this view).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the op's first template argument; One if omitted
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as the op's second template argument; One if omitted
+>
+using SM90_64x72x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x72x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above: (M,N,K) = (64,72,32), e4m3 A, e5m2 B, float C/D; A and B both use smem_desc fragments.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x72x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Fragment types: A and B are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape, 128-entry ThrID layout, and A/B/C layouts sized for 64x72x32.
+  using Shape_MNK = Shape<_64,_72,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 72, 32>;
+  using CLayout = GMMA::CLayout_64x72;
+  // Mutable per-instance accumulate setting; starts as ScaleOut::One (its effect is defined by the op, outside this view).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the op's first template argument; One if omitted
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as the op's second template argument; One if omitted
+>
+using SM90_64x72x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x72x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above: (M,N,K) = (64,72,32), e4m3 A, e5m2 B, float C/D; only B has an smem_desc fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x72x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared (smem_desc, Major::K); no FrgTypeA -- presumably A is register-sourced ("RS"), TODO confirm.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape, 128-entry ThrID layout; A uses GMMA::ALayout_64x32, B/C are sized for N = 72.
+  using Shape_MNK = Shape<_64,_72,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 72, 32>;
+  using CLayout = GMMA::CLayout_64x72;
+  // Mutable per-instance accumulate setting; starts as ScaleOut::One (its effect is defined by the op, outside this view).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the op's first template argument; One if omitted
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as the op's second template argument; One if omitted
+>
+using SM90_64x80x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x80x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above: (M,N,K) = (64,80,32), e4m3 A, e5m2 B, half_t C/D; A and B both use smem_desc fragments.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x80x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Fragment types: A and B are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape, 128-entry ThrID layout, and A/B/C layouts sized for 64x80x32.
+  using Shape_MNK = Shape<_64,_80,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 80, 32>;
+  using CLayout = GMMA::CLayout_64x80;
+  // Mutable per-instance accumulate setting; starts as ScaleOut::One (its effect is defined by the op, outside this view).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the op's first template argument; One if omitted
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as the op's second template argument; One if omitted
+>
+using SM90_64x80x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x80x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above: (M,N,K) = (64,80,32), e4m3 A, e5m2 B, half_t C/D; only B has an smem_desc fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x80x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Only FrgTypeB is declared (smem_desc, Major::K); no FrgTypeA -- presumably A is register-sourced ("RS"), TODO confirm.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape, 128-entry ThrID layout; A uses GMMA::ALayout_64x32, B/C are sized for N = 80.
+  using Shape_MNK = Shape<_64,_80,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 80, 32>;
+  using CLayout = GMMA::CLayout_64x80;
+  // Mutable per-instance accumulate setting; starts as ScaleOut::One (its effect is defined by the op, outside this view).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the op's first template argument; One if omitted
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as the op's second template argument; One if omitted
+>
+using SM90_64x80x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x80x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above: (M,N,K) = (64,80,32), e4m3 A, e5m2 B, float C/D; A and B both use smem_desc fragments.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x80x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Fragment types: A and B are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape, 128-entry ThrID layout, and A/B/C layouts sized for 64x80x32.
+  using Shape_MNK = Shape<_64,_80,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 80, 32>;
+  using CLayout = GMMA::CLayout_64x80;
+  // Mutable per-instance accumulate setting; starts as ScaleOut::One (its effect is defined by the op, outside this view).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the op's first template argument; One if omitted
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as the op's second template argument; One if omitted
+>
+using SM90_64x80x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x80x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above: (M,N,K) = (64,80,32), e4m3 A, e5m2 B, float C/D; only B has an smem_desc fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x80x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared (smem_desc, Major::K); no FrgTypeA -- presumably A is register-sourced ("RS"), TODO confirm.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape, 128-entry ThrID layout; A uses GMMA::ALayout_64x32, B/C are sized for N = 80.
+  using Shape_MNK = Shape<_64,_80,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 80, 32>;
+  using CLayout = GMMA::CLayout_64x80;
+  // Mutable per-instance accumulate setting; starts as ScaleOut::One (its effect is defined by the op, outside this view).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the op's first template argument; One if omitted
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as the op's second template argument; One if omitted
+>
+using SM90_64x88x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x88x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above: (M,N,K) = (64,88,32), e4m3 A, e5m2 B, half_t C/D; A and B both use smem_desc fragments.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x88x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Fragment types: A and B are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape, 128-entry ThrID layout, and A/B/C layouts sized for 64x88x32.
+  using Shape_MNK = Shape<_64,_88,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 88, 32>;
+  using CLayout = GMMA::CLayout_64x88;
+  // Mutable per-instance accumulate setting; starts as ScaleOut::One (its effect is defined by the op, outside this view).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the op's first template argument; One if omitted
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as the op's second template argument; One if omitted
+>
+using SM90_64x88x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x88x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above: (M,N,K) = (64,88,32), e4m3 A, e5m2 B, half_t C/D; only B has an smem_desc fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x88x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Only FrgTypeB is declared (smem_desc, Major::K); no FrgTypeA -- presumably A is register-sourced ("RS"), TODO confirm.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape, 128-entry ThrID layout; A uses GMMA::ALayout_64x32, B/C are sized for N = 88.
+  using Shape_MNK = Shape<_64,_88,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 88, 32>;
+  using CLayout = GMMA::CLayout_64x88;
+  // Mutable per-instance accumulate setting; starts as ScaleOut::One (its effect is defined by the op, outside this view).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the op's first template argument; One if omitted
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as the op's second template argument; One if omitted
+>
+using SM90_64x88x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x88x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above: (M,N,K) = (64,88,32), e4m3 A, e5m2 B, float C/D; A and B both use smem_desc fragments.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x88x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Fragment types: A and B are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape, 128-entry ThrID layout, and A/B/C layouts sized for 64x88x32.
+  using Shape_MNK = Shape<_64,_88,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 88, 32>;
+  using CLayout = GMMA::CLayout_64x88;
+  // Mutable per-instance accumulate setting; starts as ScaleOut::One (its effect is defined by the op, outside this view).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the op's first template argument; One if omitted
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as the op's second template argument; One if omitted
+>
+using SM90_64x88x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x88x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above: (M,N,K) = (64,88,32), e4m3 A, e5m2 B, float C/D; only B has an smem_desc fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x88x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared (smem_desc, Major::K); no FrgTypeA -- presumably A is register-sourced ("RS"), TODO confirm.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape, 128-entry ThrID layout; A uses GMMA::ALayout_64x32, B/C are sized for N = 88.
+  using Shape_MNK = Shape<_64,_88,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 88, 32>;
+  using CLayout = GMMA::CLayout_64x88;
+  // Mutable per-instance accumulate setting; starts as ScaleOut::One (its effect is defined by the op, outside this view).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the op's first template argument; One if omitted
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as the op's second template argument; One if omitted
+>
+using SM90_64x104x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x104x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above: (M,N,K) = (64,104,32), e4m3 A, e5m2 B, half_t C/D; A and B both use smem_desc fragments.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x104x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Fragment types: A and B are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape, 128-entry ThrID layout, and A/B/C layouts sized for 64x104x32.
+  using Shape_MNK = Shape<_64,_104,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<104, 32>;
+  using CLayout = GMMA::CLayout_64x104;
+  // Mutable per-instance accumulate setting; starts as ScaleOut::One (its effect is defined by the op, outside this view).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the op's first template argument; One if omitted
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as the op's second template argument; One if omitted
+>
+using SM90_64x104x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x104x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above: (M,N,K) = (64,104,32), e4m3 A, e5m2 B, half_t C/D; only B has an smem_desc fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x104x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Only FrgTypeB is declared (smem_desc, Major::K); no FrgTypeA -- presumably A is register-sourced ("RS"), TODO confirm.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape, 128-entry ThrID layout; A uses GMMA::ALayout_64x32, B/C are sized for N = 104.
+  using Shape_MNK = Shape<_64,_104,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<104, 32>;
+  using CLayout = GMMA::CLayout_64x104;
+  // Mutable per-instance accumulate setting; starts as ScaleOut::One (its effect is defined by the op, outside this view).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the op's first template argument; One if omitted
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as the op's second template argument; One if omitted
+>
+using SM90_64x104x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x104x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above: (M,N,K) = (64,104,32), e4m3 A, e5m2 B, float C/D; A and B both use smem_desc fragments.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x104x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Fragment types: A and B are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape, 128-entry ThrID layout, and A/B/C layouts sized for 64x104x32.
+  using Shape_MNK = Shape<_64,_104,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<104, 32>;
+  using CLayout = GMMA::CLayout_64x104;
+  // Mutable per-instance accumulate setting; starts as ScaleOut::One (its effect is defined by the op, outside this view).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the op's first template argument; One if omitted
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as the op's second template argument; One if omitted
+>
+using SM90_64x104x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x104x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above: (M,N,K) = (64,104,32), e4m3 A, e5m2 B, float C/D; only B has an smem_desc fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x104x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared (smem_desc, Major::K); no FrgTypeA -- presumably A is register-sourced ("RS"), TODO confirm.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape, 128-entry ThrID layout; A uses GMMA::ALayout_64x32, B/C are sized for N = 104.
+  using Shape_MNK = Shape<_64,_104,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<104, 32>;
+  using CLayout = GMMA::CLayout_64x104;
+  // Mutable per-instance accumulate setting; starts as ScaleOut::One (its effect is defined by the op, outside this view).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the op's first template argument; One if omitted
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as the op's second template argument; One if omitted
+>
+using SM90_64x112x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x112x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above: (M,N,K) = (64,112,32), e4m3 A, e5m2 B, half_t C/D; A and B both use smem_desc fragments.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x112x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Fragment types: A and B are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape, 128-entry ThrID layout, and A/B/C layouts sized for 64x112x32.
+  using Shape_MNK = Shape<_64,_112,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<112, 32>;
+  using CLayout = GMMA::CLayout_64x112;
+  // Mutable per-instance accumulate setting; starts as ScaleOut::One (its effect is defined by the op, outside this view).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the op's first template argument; One if omitted
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as the op's second template argument; One if omitted
+>
+using SM90_64x112x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x112x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above: (M,N,K) = (64,112,32), e4m3 A, e5m2 B, half_t C/D; only B has an smem_desc fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x112x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Only FrgTypeB is declared (smem_desc, Major::K); no FrgTypeA -- presumably A is register-sourced ("RS"), TODO confirm.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape, 128-entry ThrID layout; A uses GMMA::ALayout_64x32, B/C are sized for N = 112.
+  using Shape_MNK = Shape<_64,_112,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<112, 32>;
+  using CLayout = GMMA::CLayout_64x112;
+  // Mutable per-instance accumulate setting; starts as ScaleOut::One (its effect is defined by the op, outside this view).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the op's first template argument; One if omitted
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as the op's second template argument; One if omitted
+>
+using SM90_64x112x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x112x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above: (M,N,K) = (64,112,32), e4m3 A, e5m2 B, float C/D; A and B both use smem_desc fragments.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x112x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Fragment types: A and B are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape, 128-entry ThrID layout, and A/B/C layouts sized for 64x112x32.
+  using Shape_MNK = Shape<_64,_112,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<112, 32>;
+  using CLayout = GMMA::CLayout_64x112;
+  // Mutable per-instance accumulate setting; starts as ScaleOut::One (its effect is defined by the op, outside this view).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the op's first template argument; One if omitted
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as the op's second template argument; One if omitted
+>
+using SM90_64x112x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x112x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above: (M,N,K) = (64,112,32), e4m3 A, e5m2 B, float C/D; only B has an smem_desc fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x112x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared (smem_desc, Major::K); no FrgTypeA -- presumably A is register-sourced ("RS"), TODO confirm.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape, 128-entry ThrID layout; A uses GMMA::ALayout_64x32, B/C are sized for N = 112.
+  using Shape_MNK = Shape<_64,_112,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<112, 32>;
+  using CLayout = GMMA::CLayout_64x112;
+  // Mutable per-instance accumulate setting; starts as ScaleOut::One (its effect is defined by the op, outside this view).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the op's first template argument; One if omitted
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as the op's second template argument; One if omitted
+>
+using SM90_64x120x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x120x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above: (M,N,K) = (64,120,32), e4m3 A, e5m2 B, half_t C/D; A and B both use smem_desc fragments.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x120x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Fragment types: A and B are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape, 128-entry ThrID layout, and A/B/C layouts sized for 64x120x32.
+  using Shape_MNK = Shape<_64,_120,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<120, 32>;
+  using CLayout = GMMA::CLayout_64x120;
+  // Mutable per-instance accumulate setting; starts as ScaleOut::One (its effect is defined by the op, outside this view).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the op's first template argument; One if omitted
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as the op's second template argument; One if omitted
+>
+using SM90_64x120x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x120x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above: (M,N,K) = (64,120,32), e4m3 A, e5m2 B, half_t C/D; only B has an smem_desc fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x120x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Only FrgTypeB is declared (smem_desc, Major::K); no FrgTypeA -- presumably A is register-sourced ("RS"), TODO confirm.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape, 128-entry ThrID layout; A uses GMMA::ALayout_64x32, B/C are sized for N = 120.
+  using Shape_MNK = Shape<_64,_120,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<120, 32>;
+  using CLayout = GMMA::CLayout_64x120;
+  // Mutable per-instance accumulate setting; starts as ScaleOut::One (its effect is defined by the op, outside this view).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the op's first template argument; One if omitted
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as the op's second template argument; One if omitted
+>
+using SM90_64x120x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x120x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above: (M,N,K) = (64,120,32), e4m3 A, e5m2 B, float C/D; A and B both use smem_desc fragments.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x120x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Fragment types: A and B are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape, 128-entry ThrID layout, and A/B/C layouts sized for 64x120x32.
+  using Shape_MNK = Shape<_64,_120,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<120, 32>;
+  using CLayout = GMMA::CLayout_64x120;
+  // Mutable per-instance accumulate setting; starts as ScaleOut::One (its effect is defined by the op, outside this view).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the op's first template argument; One if omitted
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as the op's second template argument; One if omitted
+>
+using SM90_64x120x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x120x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above: (M,N,K) = (64,120,32), e4m3 A, e5m2 B, float C/D; only B has an smem_desc fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x120x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared (smem_desc, Major::K); no FrgTypeA -- presumably A is register-sourced ("RS"), TODO confirm.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape, 128-entry ThrID layout; A uses GMMA::ALayout_64x32, B/C are sized for N = 120.
+  using Shape_MNK = Shape<_64,_120,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<120, 32>;
+  using CLayout = GMMA::CLayout_64x120;
+  // Mutable per-instance accumulate setting; starts as ScaleOut::One (its effect is defined by the op, outside this view).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the op's first template argument; One if omitted
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as the op's second template argument; One if omitted
+>
+using SM90_64x136x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x136x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above: (M,N,K) = (64,136,32), e4m3 A, e5m2 B, half_t C/D; A and B both use smem_desc fragments.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x136x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Fragment types: A and B are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape, 128-entry ThrID layout, and A/B/C layouts sized for 64x136x32.
+  using Shape_MNK = Shape<_64,_136,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<136, 32>;
+  using CLayout = GMMA::CLayout_64x136;
+  // Mutable per-instance accumulate setting; starts as ScaleOut::One (its effect is defined by the op, outside this view).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the op's first template argument; One if omitted
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as the op's second template argument; One if omitted
+>
+using SM90_64x136x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x136x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above: (M,N,K) = (64,136,32), e4m3 A, e5m2 B, half_t C/D; only B has an smem_desc fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x136x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Only FrgTypeB is declared (smem_desc, Major::K); no FrgTypeA -- presumably A is register-sourced ("RS"), TODO confirm.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape, 128-entry ThrID layout; A uses GMMA::ALayout_64x32, B/C are sized for N = 136.
+  using Shape_MNK = Shape<_64,_136,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<136, 32>;
+  using CLayout = GMMA::CLayout_64x136;
+  // Mutable per-instance accumulate setting; starts as ScaleOut::One (its effect is defined by the op, outside this view).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the op's first template argument; One if omitted
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as the op's second template argument; One if omitted
+>
+using SM90_64x136x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x136x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above: (M,N,K) = (64,136,32), e4m3 A, e5m2 B, float C/D; A and B both use smem_desc fragments.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x136x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Fragment types: A and B are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape, 128-entry ThrID layout, and A/B/C layouts sized for 64x136x32.
+  using Shape_MNK = Shape<_64,_136,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<136, 32>;
+  using CLayout = GMMA::CLayout_64x136;
+  // Mutable per-instance accumulate setting; starts as ScaleOut::One (its effect is defined by the op, outside this view).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the op's first template argument; One if omitted
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as the op's second template argument; One if omitted
+>
+using SM90_64x136x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x136x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above: (M,N,K) = (64,136,32), e4m3 A, e5m2 B, float C/D; only B has an smem_desc fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x136x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared (smem_desc, Major::K); no FrgTypeA -- presumably A is register-sourced ("RS"), TODO confirm.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape, 128-entry ThrID layout; A uses GMMA::ALayout_64x32, B/C are sized for N = 136.
+  using Shape_MNK = Shape<_64,_136,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<136, 32>;
+  using CLayout = GMMA::CLayout_64x136;
+  // Mutable per-instance accumulate setting; starts as ScaleOut::One (its effect is defined by the op, outside this view).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the op's first template argument; One if omitted
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as the op's second template argument; One if omitted
+>
+using SM90_64x144x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x144x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above: (M,N,K) = (64,144,32), e4m3 A, e5m2 B, half_t C/D; A and B both use smem_desc fragments.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x144x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Fragment types: A and B are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape, 128-entry ThrID layout, and A/B/C layouts sized for 64x144x32.
+  using Shape_MNK = Shape<_64,_144,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<144, 32>;
+  using CLayout = GMMA::CLayout_64x144;
+  // Mutable per-instance accumulate setting; starts as ScaleOut::One (its effect is defined by the op, outside this view).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the op's first template argument; One if omitted
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as the op's second template argument; One if omitted
+>
+using SM90_64x144x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x144x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above: (M,N,K) = (64,144,32), e4m3 A, e5m2 B, half_t C/D; only B has an smem_desc fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x144x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Only FrgTypeB is declared (smem_desc, Major::K); no FrgTypeA -- presumably A is register-sourced ("RS"), TODO confirm.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape, 128-entry ThrID layout; A uses GMMA::ALayout_64x32, B/C are sized for N = 144.
+  using Shape_MNK = Shape<_64,_144,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<144, 32>;
+  using CLayout = GMMA::CLayout_64x144;
+  // Mutable per-instance accumulate setting; starts as ScaleOut::One (its effect is defined by the op, outside this view).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the op's first template argument; One if omitted
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as the op's second template argument; One if omitted
+>
+using SM90_64x144x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x144x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above: (M,N,K) = (64,144,32), e4m3 A, e5m2 B, float C/D; A and B both use smem_desc fragments.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x144x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Fragment types: A and B are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape, 128-entry ThrID layout, and A/B/C layouts sized for 64x144x32.
+  using Shape_MNK = Shape<_64,_144,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<144, 32>;
+  using CLayout = GMMA::CLayout_64x144;
+  // Mutable per-instance accumulate setting; starts as ScaleOut::One (its effect is defined by the op, outside this view).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the op's first template argument; One if omitted
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as the op's second template argument; One if omitted
+>
+using SM90_64x144x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x144x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above: (M,N,K) = (64,144,32), e4m3 A, e5m2 B, float C/D; only B has an smem_desc fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x144x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared (smem_desc, Major::K); no FrgTypeA -- presumably A is register-sourced ("RS"), TODO confirm.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape, 128-entry ThrID layout; A uses GMMA::ALayout_64x32, B/C are sized for N = 144.
+  using Shape_MNK = Shape<_64,_144,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<144, 32>;
+  using CLayout = GMMA::CLayout_64x144;
+  // Mutable per-instance accumulate setting; starts as ScaleOut::One (its effect is defined by the op, outside this view).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // forwarded as the op's first template argument; One if omitted
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // forwarded as the op's second template argument; One if omitted
+>
+using SM90_64x152x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x152x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above: (M,N,K) = (64,152,32), e4m3 A, e5m2 B, half_t C/D; A and B both use smem_desc fragments.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x152x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Fragment types: A and B are both GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape, 128-entry ThrID layout, and A/B/C layouts sized for 64x152x32.
+  using Shape_MNK = Shape<_64,_152,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<152, 32>;
+  using CLayout = GMMA::CLayout_64x152;
+  // Mutable per-instance accumulate setting; starts as ScaleOut::One (its effect is defined by the op, outside this view).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// 64x152x32 RS_TN op with half_t C/D, float_e4m3_t A, float_e5m2_t B; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x152x32_F16E4M3E5M2_RS_TN =
+    SM90::GMMA::MMA_64x152x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x152x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Only B declares a fragment type (K-major GMMA smem descriptor); there is no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) tile shape, 128-entry thread-ID layout; A uses ALayout_64x32, B an ABLayout.
+  using Shape_MNK = Shape<_64, _152, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<152, 32>;
+  using CLayout = GMMA::CLayout_64x152;
+  // Per-instance accumulate setting; starts at ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// 64x152x32 SS_TN op with float C/D, float_e4m3_t A, float_e5m2_t B; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x152x32_F32E4M3E5M2_SS_TN =
+    SM90::GMMA::MMA_64x152x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x152x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // A and B fragments are both K-major GMMA smem descriptors.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) tile shape, 128-entry thread-ID layout, then the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _152, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<152, 32>;
+  using CLayout = GMMA::CLayout_64x152;
+  // Per-instance accumulate setting; starts at ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// 64x152x32 RS_TN op with float C/D, float_e4m3_t A, float_e5m2_t B; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x152x32_F32E4M3E5M2_RS_TN =
+    SM90::GMMA::MMA_64x152x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x152x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only B declares a fragment type (K-major GMMA smem descriptor); there is no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) tile shape, 128-entry thread-ID layout; A uses ALayout_64x32, B an ABLayout.
+  using Shape_MNK = Shape<_64, _152, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<152, 32>;
+  using CLayout = GMMA::CLayout_64x152;
+  // Per-instance accumulate setting; starts at ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// 64x160x32 SS_TN op with half_t C/D, float_e4m3_t A, float_e5m2_t B; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x160x32_F16E4M3E5M2_SS_TN =
+    SM90::GMMA::MMA_64x160x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x160x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // A and B fragments are both K-major GMMA smem descriptors.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) tile shape, 128-entry thread-ID layout, then the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _160, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<160, 32>;
+  using CLayout = GMMA::CLayout_64x160;
+  // Per-instance accumulate setting; starts at ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// 64x160x32 RS_TN op with half_t C/D, float_e4m3_t A, float_e5m2_t B; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x160x32_F16E4M3E5M2_RS_TN =
+    SM90::GMMA::MMA_64x160x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x160x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Only B declares a fragment type (K-major GMMA smem descriptor); there is no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) tile shape, 128-entry thread-ID layout; A uses ALayout_64x32, B an ABLayout.
+  using Shape_MNK = Shape<_64, _160, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<160, 32>;
+  using CLayout = GMMA::CLayout_64x160;
+  // Per-instance accumulate setting; starts at ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// 64x160x32 SS_TN op with float C/D, float_e4m3_t A, float_e5m2_t B; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x160x32_F32E4M3E5M2_SS_TN =
+    SM90::GMMA::MMA_64x160x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x160x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // A and B fragments are both K-major GMMA smem descriptors.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) tile shape, 128-entry thread-ID layout, then the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _160, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<160, 32>;
+  using CLayout = GMMA::CLayout_64x160;
+  // Per-instance accumulate setting; starts at ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// 64x160x32 RS_TN op with float C/D, float_e4m3_t A, float_e5m2_t B; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x160x32_F32E4M3E5M2_RS_TN =
+    SM90::GMMA::MMA_64x160x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x160x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only B declares a fragment type (K-major GMMA smem descriptor); there is no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) tile shape, 128-entry thread-ID layout; A uses ALayout_64x32, B an ABLayout.
+  using Shape_MNK = Shape<_64, _160, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<160, 32>;
+  using CLayout = GMMA::CLayout_64x160;
+  // Per-instance accumulate setting; starts at ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// 64x168x32 SS_TN op with half_t C/D, float_e4m3_t A, float_e5m2_t B; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x168x32_F16E4M3E5M2_SS_TN =
+    SM90::GMMA::MMA_64x168x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x168x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // A and B fragments are both K-major GMMA smem descriptors.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) tile shape, 128-entry thread-ID layout, then the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _168, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<168, 32>;
+  using CLayout = GMMA::CLayout_64x168;
+  // Per-instance accumulate setting; starts at ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// 64x168x32 RS_TN op with half_t C/D, float_e4m3_t A, float_e5m2_t B; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x168x32_F16E4M3E5M2_RS_TN =
+    SM90::GMMA::MMA_64x168x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x168x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Only B declares a fragment type (K-major GMMA smem descriptor); there is no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) tile shape, 128-entry thread-ID layout; A uses ALayout_64x32, B an ABLayout.
+  using Shape_MNK = Shape<_64, _168, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<168, 32>;
+  using CLayout = GMMA::CLayout_64x168;
+  // Per-instance accumulate setting; starts at ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// 64x168x32 SS_TN op with float C/D, float_e4m3_t A, float_e5m2_t B; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x168x32_F32E4M3E5M2_SS_TN =
+    SM90::GMMA::MMA_64x168x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x168x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // A and B fragments are both K-major GMMA smem descriptors.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) tile shape, 128-entry thread-ID layout, then the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _168, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<168, 32>;
+  using CLayout = GMMA::CLayout_64x168;
+  // Per-instance accumulate setting; starts at ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// 64x168x32 RS_TN op with float C/D, float_e4m3_t A, float_e5m2_t B; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x168x32_F32E4M3E5M2_RS_TN =
+    SM90::GMMA::MMA_64x168x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x168x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only B declares a fragment type (K-major GMMA smem descriptor); there is no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) tile shape, 128-entry thread-ID layout; A uses ALayout_64x32, B an ABLayout.
+  using Shape_MNK = Shape<_64, _168, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<168, 32>;
+  using CLayout = GMMA::CLayout_64x168;
+  // Per-instance accumulate setting; starts at ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// 64x176x32 SS_TN op with half_t C/D, float_e4m3_t A, float_e5m2_t B; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x176x32_F16E4M3E5M2_SS_TN =
+    SM90::GMMA::MMA_64x176x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x176x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // A and B fragments are both K-major GMMA smem descriptors.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) tile shape, 128-entry thread-ID layout, then the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _176, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<176, 32>;
+  using CLayout = GMMA::CLayout_64x176;
+  // Per-instance accumulate setting; starts at ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// 64x176x32 RS_TN op with half_t C/D, float_e4m3_t A, float_e5m2_t B; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x176x32_F16E4M3E5M2_RS_TN =
+    SM90::GMMA::MMA_64x176x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x176x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Only B declares a fragment type (K-major GMMA smem descriptor); there is no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) tile shape, 128-entry thread-ID layout; A uses ALayout_64x32, B an ABLayout.
+  using Shape_MNK = Shape<_64, _176, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<176, 32>;
+  using CLayout = GMMA::CLayout_64x176;
+  // Per-instance accumulate setting; starts at ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// 64x176x32 SS_TN op with float C/D, float_e4m3_t A, float_e5m2_t B; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x176x32_F32E4M3E5M2_SS_TN =
+    SM90::GMMA::MMA_64x176x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x176x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // A and B fragments are both K-major GMMA smem descriptors.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) tile shape, 128-entry thread-ID layout, then the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _176, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<176, 32>;
+  using CLayout = GMMA::CLayout_64x176;
+  // Per-instance accumulate setting; starts at ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// 64x176x32 RS_TN op with float C/D, float_e4m3_t A, float_e5m2_t B; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x176x32_F32E4M3E5M2_RS_TN =
+    SM90::GMMA::MMA_64x176x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x176x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only B declares a fragment type (K-major GMMA smem descriptor); there is no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) tile shape, 128-entry thread-ID layout; A uses ALayout_64x32, B an ABLayout.
+  using Shape_MNK = Shape<_64, _176, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<176, 32>;
+  using CLayout = GMMA::CLayout_64x176;
+  // Per-instance accumulate setting; starts at ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// 64x184x32 SS_TN op with half_t C/D, float_e4m3_t A, float_e5m2_t B; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x184x32_F16E4M3E5M2_SS_TN =
+    SM90::GMMA::MMA_64x184x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x184x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // A and B fragments are both K-major GMMA smem descriptors.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) tile shape, 128-entry thread-ID layout, then the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _184, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<184, 32>;
+  using CLayout = GMMA::CLayout_64x184;
+  // Per-instance accumulate setting; starts at ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// 64x184x32 RS_TN op with half_t C/D, float_e4m3_t A, float_e5m2_t B; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x184x32_F16E4M3E5M2_RS_TN =
+    SM90::GMMA::MMA_64x184x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x184x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Only B declares a fragment type (K-major GMMA smem descriptor); there is no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) tile shape, 128-entry thread-ID layout; A uses ALayout_64x32, B an ABLayout.
+  using Shape_MNK = Shape<_64, _184, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<184, 32>;
+  using CLayout = GMMA::CLayout_64x184;
+  // Per-instance accumulate setting; starts at ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// 64x184x32 SS_TN op with float C/D, float_e4m3_t A, float_e5m2_t B; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x184x32_F32E4M3E5M2_SS_TN =
+    SM90::GMMA::MMA_64x184x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x184x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // A and B fragments are both K-major GMMA smem descriptors.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) tile shape, 128-entry thread-ID layout, then the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _184, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<184, 32>;
+  using CLayout = GMMA::CLayout_64x184;
+  // Per-instance accumulate setting; starts at ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// 64x184x32 RS_TN op with float C/D, float_e4m3_t A, float_e5m2_t B; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x184x32_F32E4M3E5M2_RS_TN =
+    SM90::GMMA::MMA_64x184x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x184x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only B declares a fragment type (K-major GMMA smem descriptor); there is no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) tile shape, 128-entry thread-ID layout; A uses ALayout_64x32, B an ABLayout.
+  using Shape_MNK = Shape<_64, _184, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<184, 32>;
+  using CLayout = GMMA::CLayout_64x184;
+  // Per-instance accumulate setting; starts at ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// 64x200x32 SS_TN op with half_t C/D, float_e4m3_t A, float_e5m2_t B; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x200x32_F16E4M3E5M2_SS_TN =
+    SM90::GMMA::MMA_64x200x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x200x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // A and B fragments are both K-major GMMA smem descriptors.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) tile shape, 128-entry thread-ID layout, then the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _200, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<200, 32>;
+  using CLayout = GMMA::CLayout_64x200;
+  // Per-instance accumulate setting; starts at ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// 64x200x32 RS_TN op with half_t C/D, float_e4m3_t A, float_e5m2_t B; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x200x32_F16E4M3E5M2_RS_TN =
+    SM90::GMMA::MMA_64x200x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x200x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Only B declares a fragment type (K-major GMMA smem descriptor); there is no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) tile shape, 128-entry thread-ID layout; A uses ALayout_64x32, B an ABLayout.
+  using Shape_MNK = Shape<_64, _200, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<200, 32>;
+  using CLayout = GMMA::CLayout_64x200;
+  // Per-instance accumulate setting; starts at ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// 64x200x32 SS_TN op with float C/D, float_e4m3_t A, float_e5m2_t B; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x200x32_F32E4M3E5M2_SS_TN =
+    SM90::GMMA::MMA_64x200x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x200x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // A and B fragments are both K-major GMMA smem descriptors.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) tile shape, 128-entry thread-ID layout, then the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _200, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<200, 32>;
+  using CLayout = GMMA::CLayout_64x200;
+  // Per-instance accumulate setting; starts at ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// 64x200x32 RS_TN op with float C/D, float_e4m3_t A, float_e5m2_t B; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x200x32_F32E4M3E5M2_RS_TN =
+    SM90::GMMA::MMA_64x200x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x200x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only B declares a fragment type (K-major GMMA smem descriptor); there is no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) tile shape, 128-entry thread-ID layout; A uses ALayout_64x32, B an ABLayout.
+  using Shape_MNK = Shape<_64, _200, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<200, 32>;
+  using CLayout = GMMA::CLayout_64x200;
+  // Per-instance accumulate setting; starts at ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// 64x208x32 SS_TN op with half_t C/D, float_e4m3_t A, float_e5m2_t B; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x208x32_F16E4M3E5M2_SS_TN =
+    SM90::GMMA::MMA_64x208x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x208x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // A and B fragments are both K-major GMMA smem descriptors.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) tile shape, 128-entry thread-ID layout, then the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _208, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<208, 32>;
+  using CLayout = GMMA::CLayout_64x208;
+  // Per-instance accumulate setting; starts at ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// 64x208x32 RS_TN op with half_t C/D, float_e4m3_t A, float_e5m2_t B; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x208x32_F16E4M3E5M2_RS_TN =
+    SM90::GMMA::MMA_64x208x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x208x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Only B declares a fragment type (K-major GMMA smem descriptor); there is no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) tile shape, 128-entry thread-ID layout; A uses ALayout_64x32, B an ABLayout.
+  using Shape_MNK = Shape<_64, _208, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<208, 32>;
+  using CLayout = GMMA::CLayout_64x208;
+  // Per-instance accumulate setting; starts at ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// 64x208x32 SS_TN op with float C/D, float_e4m3_t A, float_e5m2_t B; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x208x32_F32E4M3E5M2_SS_TN =
+    SM90::GMMA::MMA_64x208x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x208x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // A and B fragments are both K-major GMMA smem descriptors.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) tile shape, 128-entry thread-ID layout, then the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _208, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<208, 32>;
+  using CLayout = GMMA::CLayout_64x208;
+  // Per-instance accumulate setting; starts at ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// 64x208x32 RS_TN op with float C/D, float_e4m3_t A, float_e5m2_t B; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x208x32_F32E4M3E5M2_RS_TN =
+    SM90::GMMA::MMA_64x208x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x208x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only B declares a fragment type (K-major GMMA smem descriptor); there is no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) tile shape, 128-entry thread-ID layout; A uses ALayout_64x32, B an ABLayout.
+  using Shape_MNK = Shape<_64, _208, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<208, 32>;
+  using CLayout = GMMA::CLayout_64x208;
+  // Per-instance accumulate setting; starts at ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// 64x216x32 SS_TN op with half_t C/D, float_e4m3_t A, float_e5m2_t B; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x216x32_F16E4M3E5M2_SS_TN =
+    SM90::GMMA::MMA_64x216x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x216x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // A and B fragments are both K-major GMMA smem descriptors.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) tile shape, 128-entry thread-ID layout, then the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _216, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<216, 32>;
+  using CLayout = GMMA::CLayout_64x216;
+  // Per-instance accumulate setting; starts at ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// 64x216x32 RS_TN op with half_t C/D, float_e4m3_t A, float_e5m2_t B; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x216x32_F16E4M3E5M2_RS_TN =
+    SM90::GMMA::MMA_64x216x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x216x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Only B declares a fragment type (K-major GMMA smem descriptor); there is no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) tile shape, 128-entry thread-ID layout; A uses ALayout_64x32, B an ABLayout.
+  using Shape_MNK = Shape<_64, _216, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<216, 32>;
+  using CLayout = GMMA::CLayout_64x216;
+  // Per-instance accumulate setting; starts at ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// 64x216x32 SS_TN op with float C/D, float_e4m3_t A, float_e5m2_t B; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x216x32_F32E4M3E5M2_SS_TN =
+    SM90::GMMA::MMA_64x216x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x216x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // A and B fragments are both K-major GMMA smem descriptors.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) tile shape, 128-entry thread-ID layout, then the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _216, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<216, 32>;
+  using CLayout = GMMA::CLayout_64x216;
+  // Per-instance accumulate setting; starts at ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// 64x216x32 RS_TN op with float C/D, float_e4m3_t A, float_e5m2_t B; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x216x32_F32E4M3E5M2_RS_TN =
+    SM90::GMMA::MMA_64x216x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x216x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only B declares a fragment type (K-major GMMA smem descriptor); there is no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) tile shape, 128-entry thread-ID layout; A uses ALayout_64x32, B an ABLayout.
+  using Shape_MNK = Shape<_64, _216, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<216, 32>;
+  using CLayout = GMMA::CLayout_64x216;
+  // Per-instance accumulate setting; starts at ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// 64x224x32 SS_TN op with half_t C/D, float_e4m3_t A, float_e5m2_t B; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x224x32_F16E4M3E5M2_SS_TN =
+    SM90::GMMA::MMA_64x224x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x224x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // A and B fragments are both K-major GMMA smem descriptors.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) tile shape, 128-entry thread-ID layout, then the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _224, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<224, 32>;
+  using CLayout = GMMA::CLayout_64x224;
+  // Per-instance accumulate setting; starts at ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// 64x224x32 RS_TN op with half_t C/D, float_e4m3_t A, float_e5m2_t B; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x224x32_F16E4M3E5M2_RS_TN =
+    SM90::GMMA::MMA_64x224x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x224x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Only B declares a fragment type (K-major GMMA smem descriptor); there is no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) tile shape, 128-entry thread-ID layout; A uses ALayout_64x32, B an ABLayout.
+  using Shape_MNK = Shape<_64, _224, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<224, 32>;
+  using CLayout = GMMA::CLayout_64x224;
+  // Per-instance accumulate setting; starts at ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// 64x224x32 SS_TN op with float C/D, float_e4m3_t A, float_e5m2_t B; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x224x32_F32E4M3E5M2_SS_TN =
+    SM90::GMMA::MMA_64x224x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x224x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // A and B fragments are both K-major GMMA smem descriptors.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) tile shape, 128-entry thread-ID layout, then the A/B/C layouts.
+  using Shape_MNK = Shape<_64, _224, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ABLayout<64, 32>;
+  using BLayout = GMMA::ABLayout<224, 32>;
+  using CLayout = GMMA::CLayout_64x224;
+  // Per-instance accumulate setting; starts at ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// 64x224x32 RS_TN op with float C/D, float_e4m3_t A, float_e5m2_t B; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x224x32_F32E4M3E5M2_RS_TN =
+    SM90::GMMA::MMA_64x224x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x224x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only B declares a fragment type (K-major GMMA smem descriptor); there is no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) tile shape, 128-entry thread-ID layout; A uses ALayout_64x32, B an ABLayout.
+  using Shape_MNK = Shape<_64, _224, _32>;
+  using ThrID = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<224, 32>;
+  using CLayout = GMMA::CLayout_64x224;
+  // Per-instance accumulate setting; starts at ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template; both scale parameters default to GMMA::ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x232x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x232x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits specialization for the alias above: 64x232x32 tile, float_e4m3_t A, float_e5m2_t B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x232x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // A and B fragments are both GMMA::smem_desc with Major::K ("SS" presumably = both operands in shared memory; confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_232,_32>;  // M=64, N=232, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<232, 32>;
+  using CLayout = GMMA::CLayout_64x232;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template; both scale parameters default to GMMA::ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x232x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x232x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits specialization for the alias above: 64x232x32 tile, float_e4m3_t A, float_e5m2_t B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x232x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Only FrgTypeB is declared (smem_desc, Major::K); no FrgTypeA in this RS variant (presumably A comes from registers; confirm).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_232,_32>;  // M=64, N=232, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<232, 32>;
+  using CLayout = GMMA::CLayout_64x232;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template; both scale parameters default to GMMA::ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x232x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x232x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits specialization for the alias above: 64x232x32 tile, float_e4m3_t A, float_e5m2_t B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x232x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc with Major::K ("SS" presumably = both operands in shared memory; confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_232,_32>;  // M=64, N=232, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<232, 32>;
+  using CLayout = GMMA::CLayout_64x232;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template; both scale parameters default to GMMA::ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x232x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x232x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits specialization for the alias above: 64x232x32 tile, float_e4m3_t A, float_e5m2_t B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x232x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared (smem_desc, Major::K); no FrgTypeA in this RS variant (presumably A comes from registers; confirm).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_232,_32>;  // M=64, N=232, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<232, 32>;
+  using CLayout = GMMA::CLayout_64x232;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template; both scale parameters default to GMMA::ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x240x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x240x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits specialization for the alias above: 64x240x32 tile, float_e4m3_t A, float_e5m2_t B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x240x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // A and B fragments are both GMMA::smem_desc with Major::K ("SS" presumably = both operands in shared memory; confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_240,_32>;  // M=64, N=240, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<240, 32>;
+  using CLayout = GMMA::CLayout_64x240;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template; both scale parameters default to GMMA::ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x240x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x240x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits specialization for the alias above: 64x240x32 tile, float_e4m3_t A, float_e5m2_t B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x240x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Only FrgTypeB is declared (smem_desc, Major::K); no FrgTypeA in this RS variant (presumably A comes from registers; confirm).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_240,_32>;  // M=64, N=240, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<240, 32>;
+  using CLayout = GMMA::CLayout_64x240;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template; both scale parameters default to GMMA::ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x240x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x240x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits specialization for the alias above: 64x240x32 tile, float_e4m3_t A, float_e5m2_t B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x240x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc with Major::K ("SS" presumably = both operands in shared memory; confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_240,_32>;  // M=64, N=240, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<240, 32>;
+  using CLayout = GMMA::CLayout_64x240;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template; both scale parameters default to GMMA::ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x240x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x240x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits specialization for the alias above: 64x240x32 tile, float_e4m3_t A, float_e5m2_t B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x240x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared (smem_desc, Major::K); no FrgTypeA in this RS variant (presumably A comes from registers; confirm).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_240,_32>;  // M=64, N=240, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<240, 32>;
+  using CLayout = GMMA::CLayout_64x240;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template; both scale parameters default to GMMA::ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x248x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x248x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits specialization for the alias above: 64x248x32 tile, float_e4m3_t A, float_e5m2_t B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x248x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // A and B fragments are both GMMA::smem_desc with Major::K ("SS" presumably = both operands in shared memory; confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_248,_32>;  // M=64, N=248, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<248, 32>;
+  using CLayout = GMMA::CLayout_64x248;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template; both scale parameters default to GMMA::ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x248x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x248x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits specialization for the alias above: 64x248x32 tile, float_e4m3_t A, float_e5m2_t B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x248x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Only FrgTypeB is declared (smem_desc, Major::K); no FrgTypeA in this RS variant (presumably A comes from registers; confirm).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_248,_32>;  // M=64, N=248, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<248, 32>;
+  using CLayout = GMMA::CLayout_64x248;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template; both scale parameters default to GMMA::ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x248x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x248x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits specialization for the alias above: 64x248x32 tile, float_e4m3_t A, float_e5m2_t B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x248x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc with Major::K ("SS" presumably = both operands in shared memory; confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_248,_32>;  // M=64, N=248, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<248, 32>;
+  using CLayout = GMMA::CLayout_64x248;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template; both scale parameters default to GMMA::ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x248x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x248x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits specialization for the alias above: 64x248x32 tile, float_e4m3_t A, float_e5m2_t B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x248x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e4m3_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared (smem_desc, Major::K); no FrgTypeA in this RS variant (presumably A comes from registers; confirm).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_248,_32>;  // M=64, N=248, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<248, 32>;
+  using CLayout = GMMA::CLayout_64x248;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template; both scale parameters default to GMMA::ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x24x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x24x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
+// MMA_Traits specialization for the alias above: 64x24x32 tile, float_e5m2_t A, float_e4m3_t B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x24x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // A and B fragments are both GMMA::smem_desc with Major::K ("SS" presumably = both operands in shared memory; confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_24,_32>;  // M=64, N=24, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 24, 32>;
+  using CLayout = GMMA::CLayout_64x24;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template; both scale parameters default to GMMA::ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x24x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x24x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>; 
+// MMA_Traits specialization for the alias above: 64x24x32 tile, float_e5m2_t A, float_e4m3_t B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x24x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // Only FrgTypeB is declared (smem_desc, Major::K); no FrgTypeA in this RS variant (presumably A comes from registers; confirm).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_24,_32>;  // M=64, N=24, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 24, 32>;
+  using CLayout = GMMA::CLayout_64x24;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template; both scale parameters default to GMMA::ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x24x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x24x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
+// MMA_Traits specialization for the alias above: 64x24x32 tile, float_e5m2_t A, float_e4m3_t B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x24x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc with Major::K ("SS" presumably = both operands in shared memory; confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_24,_32>;  // M=64, N=24, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 24, 32>;
+  using CLayout = GMMA::CLayout_64x24;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template; both scale parameters default to GMMA::ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x24x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x24x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>; 
+// MMA_Traits specialization for the alias above: 64x24x32 tile, float_e5m2_t A, float_e4m3_t B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x24x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared (smem_desc, Major::K); no FrgTypeA in this RS variant (presumably A comes from registers; confirm).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_24,_32>;  // M=64, N=24, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 24, 32>;
+  using CLayout = GMMA::CLayout_64x24;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template; both scale parameters default to GMMA::ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x40x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x40x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
+// MMA_Traits specialization for the alias above: 64x40x32 tile, float_e5m2_t A, float_e4m3_t B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x40x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // A and B fragments are both GMMA::smem_desc with Major::K ("SS" presumably = both operands in shared memory; confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_40,_32>;  // M=64, N=40, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 40, 32>;
+  using CLayout = GMMA::CLayout_64x40;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template; both scale parameters default to GMMA::ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x40x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x40x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>; 
+// MMA_Traits specialization for the alias above: 64x40x32 tile, float_e5m2_t A, float_e4m3_t B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x40x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // Only FrgTypeB is declared (smem_desc, Major::K); no FrgTypeA in this RS variant (presumably A comes from registers; confirm).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_40,_32>;  // M=64, N=40, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 40, 32>;
+  using CLayout = GMMA::CLayout_64x40;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template; both scale parameters default to GMMA::ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x40x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x40x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
+// MMA_Traits specialization for the alias above: 64x40x32 tile, float_e5m2_t A, float_e4m3_t B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x40x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc with Major::K ("SS" presumably = both operands in shared memory; confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_40,_32>;  // M=64, N=40, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 40, 32>;
+  using CLayout = GMMA::CLayout_64x40;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template; both scale parameters default to GMMA::ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x40x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x40x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>; 
+// MMA_Traits specialization for the alias above: 64x40x32 tile, float_e5m2_t A, float_e4m3_t B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x40x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared (smem_desc, Major::K); no FrgTypeA in this RS variant (presumably A comes from registers; confirm).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_40,_32>;  // M=64, N=40, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 40, 32>;
+  using CLayout = GMMA::CLayout_64x40;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template; both scale parameters default to GMMA::ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x48x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x48x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
+// MMA_Traits specialization for the alias above: 64x48x32 tile, float_e5m2_t A, float_e4m3_t B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x48x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // A and B fragments are both GMMA::smem_desc with Major::K ("SS" presumably = both operands in shared memory; confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_48,_32>;  // M=64, N=48, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 48, 32>;
+  using CLayout = GMMA::CLayout_64x48;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template; both scale parameters default to GMMA::ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x48x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x48x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>; 
+// MMA_Traits specialization for the alias above: 64x48x32 tile, float_e5m2_t A, float_e4m3_t B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x48x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // Only FrgTypeB is declared (smem_desc, Major::K); no FrgTypeA in this RS variant (presumably A comes from registers; confirm).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_48,_32>;  // M=64, N=48, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 48, 32>;
+  using CLayout = GMMA::CLayout_64x48;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template; both scale parameters default to GMMA::ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x48x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x48x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
+// MMA_Traits specialization for the alias above: 64x48x32 tile, float_e5m2_t A, float_e4m3_t B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x48x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc with Major::K ("SS" presumably = both operands in shared memory; confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_48,_32>;  // M=64, N=48, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 48, 32>;
+  using CLayout = GMMA::CLayout_64x48;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template; both scale parameters default to GMMA::ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x48x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x48x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>; 
+// MMA_Traits specialization for the alias above: 64x48x32 tile, float_e5m2_t A, float_e4m3_t B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x48x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared (smem_desc, Major::K); no FrgTypeA in this RS variant (presumably A comes from registers; confirm).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_48,_32>;  // M=64, N=48, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 48, 32>;
+  using CLayout = GMMA::CLayout_64x48;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template; both scale parameters default to GMMA::ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x56x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x56x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
+// MMA_Traits specialization for the alias above: 64x56x32 tile, float_e5m2_t A, float_e4m3_t B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x56x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // A and B fragments are both GMMA::smem_desc with Major::K ("SS" presumably = both operands in shared memory; confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_56,_32>;  // M=64, N=56, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 56, 32>;
+  using CLayout = GMMA::CLayout_64x56;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template; both scale parameters default to GMMA::ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x56x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x56x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>; 
+// MMA_Traits specialization for the alias above: 64x56x32 tile, float_e5m2_t A, float_e4m3_t B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x56x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // Only FrgTypeB is declared (smem_desc, Major::K); no FrgTypeA in this RS variant (presumably A comes from registers; confirm).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_56,_32>;  // M=64, N=56, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 56, 32>;
+  using CLayout = GMMA::CLayout_64x56;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template; both scale parameters default to GMMA::ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x56x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x56x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
+// MMA_Traits specialization for the alias above: 64x56x32 tile, float_e5m2_t A, float_e4m3_t B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x56x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc with Major::K ("SS" presumably = both operands in shared memory; confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_56,_32>;  // M=64, N=56, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 56, 32>;
+  using CLayout = GMMA::CLayout_64x56;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template; both scale parameters default to GMMA::ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x56x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x56x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>; 
+// MMA_Traits specialization for the alias above: 64x56x32 tile, float_e5m2_t A, float_e4m3_t B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x56x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared (smem_desc, Major::K); no FrgTypeA in this RS variant (presumably A comes from registers; confirm).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_56,_32>;  // M=64, N=56, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 56, 32>;
+  using CLayout = GMMA::CLayout_64x56;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template; both scale parameters default to GMMA::ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x72x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x72x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
+// MMA_Traits specialization for the alias above: 64x72x32 tile, float_e5m2_t A, float_e4m3_t B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x72x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // A and B fragments are both GMMA::smem_desc with Major::K ("SS" presumably = both operands in shared memory; confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_72,_32>;  // M=64, N=72, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 72, 32>;
+  using CLayout = GMMA::CLayout_64x72;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template; both scale parameters default to GMMA::ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x72x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x72x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>; 
+// MMA_Traits specialization for the alias above: 64x72x32 tile, float_e5m2_t A, float_e4m3_t B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x72x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // Only FrgTypeB is declared (smem_desc, Major::K); no FrgTypeA in this RS variant (presumably A comes from registers; confirm).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_72,_32>;  // M=64, N=72, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 72, 32>;
+  using CLayout = GMMA::CLayout_64x72;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template; both scale parameters default to GMMA::ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x72x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x72x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
+// MMA_Traits specialization for the alias above: 64x72x32 tile, float_e5m2_t A, float_e4m3_t B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x72x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc with Major::K ("SS" presumably = both operands in shared memory; confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_72,_32>;  // M=64, N=72, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 72, 32>;
+  using CLayout = GMMA::CLayout_64x72;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template; both scale parameters default to GMMA::ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x72x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x72x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>; 
+// MMA_Traits specialization for the alias above: 64x72x32 tile, float_e5m2_t A, float_e4m3_t B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x72x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared (smem_desc, Major::K); no FrgTypeA in this RS variant (presumably A comes from registers; confirm).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_72,_32>;  // M=64, N=72, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 72, 32>;
+  using CLayout = GMMA::CLayout_64x72;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template; both scale parameters default to GMMA::ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x80x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x80x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
+// MMA_Traits specialization for the alias above: 64x80x32 tile, float_e5m2_t A, float_e4m3_t B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x80x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // A and B fragments are both GMMA::smem_desc with Major::K ("SS" presumably = both operands in shared memory; confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_80,_32>;  // M=64, N=80, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 80, 32>;
+  using CLayout = GMMA::CLayout_64x80;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template; both scale parameters default to GMMA::ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x80x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x80x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>; 
+// MMA_Traits specialization for the alias above: 64x80x32 tile, float_e5m2_t A, float_e4m3_t B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x80x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // Only FrgTypeB is declared (smem_desc, Major::K); no FrgTypeA in this RS variant (presumably A comes from registers; confirm).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_80,_32>;  // M=64, N=80, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout< 80, 32>;
+  using CLayout = GMMA::CLayout_64x80;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // alias template; both scale parameters default to GMMA::ScaleIn::One
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x80x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x80x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
+// MMA_Traits specialization for the alias above: 64x80x32 tile, float_e5m2_t A, float_e4m3_t B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x80x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // A and B fragments are both GMMA::smem_desc with Major::K ("SS" presumably = both operands in shared memory; confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_80,_32>;  // M=64, N=80, K=32
+  using ThrID   = Layout<_128>;  // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout< 80, 32>;
+  using CLayout = GMMA::CLayout_64x80;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias into SM90::GMMA for the matching MMA op; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One, GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x80x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x80x32_F32E5M2E4M3_RS_TN<ScaleA, ScaleB>;
+
+// MMA_Traits specialization for the alias above: element types, fragment types, shape and layouts.
+template <GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x80x32_F32E5M2E4M3_RS_TN<ScaleA, ScaleB>>
+{
+  // Element types: A is float_e5m2_t, B is float_e4m3_t, C and D are float.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only B declares a fragment type here (GMMA::smem_desc with Major::K); A has none.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x80x32 (M,N,K) shape, 128-element thread-ID layout, and per-operand layouts.
+  using Shape_MNK = Shape<_64, _80, _32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<80, 32>;
+  using CLayout   = GMMA::CLayout_64x80;
+  // Per-instance ScaleOut value; starts as One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias into SM90::GMMA for the matching MMA op; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One, GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x88x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x88x32_F16E5M2E4M3_SS_TN<ScaleA, ScaleB>;
+
+// MMA_Traits specialization for the alias above: element types, fragment types, shape and layouts.
+template <GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x88x32_F16E5M2E4M3_SS_TN<ScaleA, ScaleB>>
+{
+  // Element types: A is float_e5m2_t, B is float_e4m3_t, C and D are half_t.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Both A and B fragments are GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x88x32 (M,N,K) shape, 128-element thread-ID layout, and per-operand layouts.
+  using Shape_MNK = Shape<_64, _88, _32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  using BLayout   = GMMA::ABLayout<88, 32>;
+  using CLayout   = GMMA::CLayout_64x88;
+  // Per-instance ScaleOut value; starts as One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias into SM90::GMMA for the matching MMA op; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One, GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x88x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x88x32_F16E5M2E4M3_RS_TN<ScaleA, ScaleB>;
+
+// MMA_Traits specialization for the alias above: element types, fragment types, shape and layouts.
+template <GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x88x32_F16E5M2E4M3_RS_TN<ScaleA, ScaleB>>
+{
+  // Element types: A is float_e5m2_t, B is float_e4m3_t, C and D are half_t.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Only B declares a fragment type here (GMMA::smem_desc with Major::K); A has none.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x88x32 (M,N,K) shape, 128-element thread-ID layout, and per-operand layouts.
+  using Shape_MNK = Shape<_64, _88, _32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<88, 32>;
+  using CLayout   = GMMA::CLayout_64x88;
+  // Per-instance ScaleOut value; starts as One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias into SM90::GMMA for the matching MMA op; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One, GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x88x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x88x32_F32E5M2E4M3_SS_TN<ScaleA, ScaleB>;
+
+// MMA_Traits specialization for the alias above: element types, fragment types, shape and layouts.
+template <GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x88x32_F32E5M2E4M3_SS_TN<ScaleA, ScaleB>>
+{
+  // Element types: A is float_e5m2_t, B is float_e4m3_t, C and D are float.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Both A and B fragments are GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x88x32 (M,N,K) shape, 128-element thread-ID layout, and per-operand layouts.
+  using Shape_MNK = Shape<_64, _88, _32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  using BLayout   = GMMA::ABLayout<88, 32>;
+  using CLayout   = GMMA::CLayout_64x88;
+  // Per-instance ScaleOut value; starts as One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias into SM90::GMMA for the matching MMA op; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One, GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x88x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x88x32_F32E5M2E4M3_RS_TN<ScaleA, ScaleB>;
+
+// MMA_Traits specialization for the alias above: element types, fragment types, shape and layouts.
+template <GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x88x32_F32E5M2E4M3_RS_TN<ScaleA, ScaleB>>
+{
+  // Element types: A is float_e5m2_t, B is float_e4m3_t, C and D are float.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only B declares a fragment type here (GMMA::smem_desc with Major::K); A has none.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x88x32 (M,N,K) shape, 128-element thread-ID layout, and per-operand layouts.
+  using Shape_MNK = Shape<_64, _88, _32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<88, 32>;
+  using CLayout   = GMMA::CLayout_64x88;
+  // Per-instance ScaleOut value; starts as One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias into SM90::GMMA for the matching MMA op; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One, GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x104x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x104x32_F16E5M2E4M3_SS_TN<ScaleA, ScaleB>;
+
+// MMA_Traits specialization for the alias above: element types, fragment types, shape and layouts.
+template <GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x104x32_F16E5M2E4M3_SS_TN<ScaleA, ScaleB>>
+{
+  // Element types: A is float_e5m2_t, B is float_e4m3_t, C and D are half_t.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Both A and B fragments are GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x104x32 (M,N,K) shape, 128-element thread-ID layout, and per-operand layouts.
+  using Shape_MNK = Shape<_64, _104, _32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  using BLayout   = GMMA::ABLayout<104, 32>;
+  using CLayout   = GMMA::CLayout_64x104;
+  // Per-instance ScaleOut value; starts as One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias into SM90::GMMA for the matching MMA op; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One, GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x104x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x104x32_F16E5M2E4M3_RS_TN<ScaleA, ScaleB>;
+
+// MMA_Traits specialization for the alias above: element types, fragment types, shape and layouts.
+template <GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x104x32_F16E5M2E4M3_RS_TN<ScaleA, ScaleB>>
+{
+  // Element types: A is float_e5m2_t, B is float_e4m3_t, C and D are half_t.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Only B declares a fragment type here (GMMA::smem_desc with Major::K); A has none.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x104x32 (M,N,K) shape, 128-element thread-ID layout, and per-operand layouts.
+  using Shape_MNK = Shape<_64, _104, _32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<104, 32>;
+  using CLayout   = GMMA::CLayout_64x104;
+  // Per-instance ScaleOut value; starts as One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias into SM90::GMMA for the matching MMA op; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One, GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x104x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x104x32_F32E5M2E4M3_SS_TN<ScaleA, ScaleB>;
+
+// MMA_Traits specialization for the alias above: element types, fragment types, shape and layouts.
+template <GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x104x32_F32E5M2E4M3_SS_TN<ScaleA, ScaleB>>
+{
+  // Element types: A is float_e5m2_t, B is float_e4m3_t, C and D are float.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Both A and B fragments are GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x104x32 (M,N,K) shape, 128-element thread-ID layout, and per-operand layouts.
+  using Shape_MNK = Shape<_64, _104, _32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  using BLayout   = GMMA::ABLayout<104, 32>;
+  using CLayout   = GMMA::CLayout_64x104;
+  // Per-instance ScaleOut value; starts as One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias into SM90::GMMA for the matching MMA op; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One, GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x104x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x104x32_F32E5M2E4M3_RS_TN<ScaleA, ScaleB>;
+
+// MMA_Traits specialization for the alias above: element types, fragment types, shape and layouts.
+template <GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x104x32_F32E5M2E4M3_RS_TN<ScaleA, ScaleB>>
+{
+  // Element types: A is float_e5m2_t, B is float_e4m3_t, C and D are float.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only B declares a fragment type here (GMMA::smem_desc with Major::K); A has none.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x104x32 (M,N,K) shape, 128-element thread-ID layout, and per-operand layouts.
+  using Shape_MNK = Shape<_64, _104, _32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<104, 32>;
+  using CLayout   = GMMA::CLayout_64x104;
+  // Per-instance ScaleOut value; starts as One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias into SM90::GMMA for the matching MMA op; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One, GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x112x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x112x32_F16E5M2E4M3_SS_TN<ScaleA, ScaleB>;
+
+// MMA_Traits specialization for the alias above: element types, fragment types, shape and layouts.
+template <GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x112x32_F16E5M2E4M3_SS_TN<ScaleA, ScaleB>>
+{
+  // Element types: A is float_e5m2_t, B is float_e4m3_t, C and D are half_t.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Both A and B fragments are GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x112x32 (M,N,K) shape, 128-element thread-ID layout, and per-operand layouts.
+  using Shape_MNK = Shape<_64, _112, _32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  using BLayout   = GMMA::ABLayout<112, 32>;
+  using CLayout   = GMMA::CLayout_64x112;
+  // Per-instance ScaleOut value; starts as One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias into SM90::GMMA for the matching MMA op; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One, GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x112x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x112x32_F16E5M2E4M3_RS_TN<ScaleA, ScaleB>;
+
+// MMA_Traits specialization for the alias above: element types, fragment types, shape and layouts.
+template <GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x112x32_F16E5M2E4M3_RS_TN<ScaleA, ScaleB>>
+{
+  // Element types: A is float_e5m2_t, B is float_e4m3_t, C and D are half_t.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Only B declares a fragment type here (GMMA::smem_desc with Major::K); A has none.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x112x32 (M,N,K) shape, 128-element thread-ID layout, and per-operand layouts.
+  using Shape_MNK = Shape<_64, _112, _32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<112, 32>;
+  using CLayout   = GMMA::CLayout_64x112;
+  // Per-instance ScaleOut value; starts as One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias into SM90::GMMA for the matching MMA op; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One, GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x112x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x112x32_F32E5M2E4M3_SS_TN<ScaleA, ScaleB>;
+
+// MMA_Traits specialization for the alias above: element types, fragment types, shape and layouts.
+template <GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x112x32_F32E5M2E4M3_SS_TN<ScaleA, ScaleB>>
+{
+  // Element types: A is float_e5m2_t, B is float_e4m3_t, C and D are float.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Both A and B fragments are GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x112x32 (M,N,K) shape, 128-element thread-ID layout, and per-operand layouts.
+  using Shape_MNK = Shape<_64, _112, _32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  using BLayout   = GMMA::ABLayout<112, 32>;
+  using CLayout   = GMMA::CLayout_64x112;
+  // Per-instance ScaleOut value; starts as One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias into SM90::GMMA for the matching MMA op; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One, GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x112x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x112x32_F32E5M2E4M3_RS_TN<ScaleA, ScaleB>;
+
+// MMA_Traits specialization for the alias above: element types, fragment types, shape and layouts.
+template <GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x112x32_F32E5M2E4M3_RS_TN<ScaleA, ScaleB>>
+{
+  // Element types: A is float_e5m2_t, B is float_e4m3_t, C and D are float.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only B declares a fragment type here (GMMA::smem_desc with Major::K); A has none.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x112x32 (M,N,K) shape, 128-element thread-ID layout, and per-operand layouts.
+  using Shape_MNK = Shape<_64, _112, _32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<112, 32>;
+  using CLayout   = GMMA::CLayout_64x112;
+  // Per-instance ScaleOut value; starts as One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias into SM90::GMMA for the matching MMA op; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One, GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x120x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x120x32_F16E5M2E4M3_SS_TN<ScaleA, ScaleB>;
+
+// MMA_Traits specialization for the alias above: element types, fragment types, shape and layouts.
+template <GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x120x32_F16E5M2E4M3_SS_TN<ScaleA, ScaleB>>
+{
+  // Element types: A is float_e5m2_t, B is float_e4m3_t, C and D are half_t.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Both A and B fragments are GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x120x32 (M,N,K) shape, 128-element thread-ID layout, and per-operand layouts.
+  using Shape_MNK = Shape<_64, _120, _32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  using BLayout   = GMMA::ABLayout<120, 32>;
+  using CLayout   = GMMA::CLayout_64x120;
+  // Per-instance ScaleOut value; starts as One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias into SM90::GMMA for the matching MMA op; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One, GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x120x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x120x32_F16E5M2E4M3_RS_TN<ScaleA, ScaleB>;
+
+// MMA_Traits specialization for the alias above: element types, fragment types, shape and layouts.
+template <GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x120x32_F16E5M2E4M3_RS_TN<ScaleA, ScaleB>>
+{
+  // Element types: A is float_e5m2_t, B is float_e4m3_t, C and D are half_t.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Only B declares a fragment type here (GMMA::smem_desc with Major::K); A has none.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x120x32 (M,N,K) shape, 128-element thread-ID layout, and per-operand layouts.
+  using Shape_MNK = Shape<_64, _120, _32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<120, 32>;
+  using CLayout   = GMMA::CLayout_64x120;
+  // Per-instance ScaleOut value; starts as One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias into SM90::GMMA for the matching MMA op; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One, GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x120x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x120x32_F32E5M2E4M3_SS_TN<ScaleA, ScaleB>;
+
+// MMA_Traits specialization for the alias above: element types, fragment types, shape and layouts.
+template <GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x120x32_F32E5M2E4M3_SS_TN<ScaleA, ScaleB>>
+{
+  // Element types: A is float_e5m2_t, B is float_e4m3_t, C and D are float.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Both A and B fragments are GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x120x32 (M,N,K) shape, 128-element thread-ID layout, and per-operand layouts.
+  using Shape_MNK = Shape<_64, _120, _32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  using BLayout   = GMMA::ABLayout<120, 32>;
+  using CLayout   = GMMA::CLayout_64x120;
+  // Per-instance ScaleOut value; starts as One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias into SM90::GMMA for the matching MMA op; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One, GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x120x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x120x32_F32E5M2E4M3_RS_TN<ScaleA, ScaleB>;
+
+// MMA_Traits specialization for the alias above: element types, fragment types, shape and layouts.
+template <GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x120x32_F32E5M2E4M3_RS_TN<ScaleA, ScaleB>>
+{
+  // Element types: A is float_e5m2_t, B is float_e4m3_t, C and D are float.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only B declares a fragment type here (GMMA::smem_desc with Major::K); A has none.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x120x32 (M,N,K) shape, 128-element thread-ID layout, and per-operand layouts.
+  using Shape_MNK = Shape<_64, _120, _32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<120, 32>;
+  using CLayout   = GMMA::CLayout_64x120;
+  // Per-instance ScaleOut value; starts as One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias into SM90::GMMA for the matching MMA op; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One, GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x136x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x136x32_F16E5M2E4M3_SS_TN<ScaleA, ScaleB>;
+
+// MMA_Traits specialization for the alias above: element types, fragment types, shape and layouts.
+template <GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x136x32_F16E5M2E4M3_SS_TN<ScaleA, ScaleB>>
+{
+  // Element types: A is float_e5m2_t, B is float_e4m3_t, C and D are half_t.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Both A and B fragments are GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x136x32 (M,N,K) shape, 128-element thread-ID layout, and per-operand layouts.
+  using Shape_MNK = Shape<_64, _136, _32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  using BLayout   = GMMA::ABLayout<136, 32>;
+  using CLayout   = GMMA::CLayout_64x136;
+  // Per-instance ScaleOut value; starts as One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias into SM90::GMMA for the matching MMA op; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One, GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x136x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x136x32_F16E5M2E4M3_RS_TN<ScaleA, ScaleB>;
+
+// MMA_Traits specialization for the alias above: element types, fragment types, shape and layouts.
+template <GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x136x32_F16E5M2E4M3_RS_TN<ScaleA, ScaleB>>
+{
+  // Element types: A is float_e5m2_t, B is float_e4m3_t, C and D are half_t.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Only B declares a fragment type here (GMMA::smem_desc with Major::K); A has none.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x136x32 (M,N,K) shape, 128-element thread-ID layout, and per-operand layouts.
+  using Shape_MNK = Shape<_64, _136, _32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<136, 32>;
+  using CLayout   = GMMA::CLayout_64x136;
+  // Per-instance ScaleOut value; starts as One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias into SM90::GMMA for the matching MMA op; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One, GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x136x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x136x32_F32E5M2E4M3_SS_TN<ScaleA, ScaleB>;
+
+// MMA_Traits specialization for the alias above: element types, fragment types, shape and layouts.
+template <GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x136x32_F32E5M2E4M3_SS_TN<ScaleA, ScaleB>>
+{
+  // Element types: A is float_e5m2_t, B is float_e4m3_t, C and D are float.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Both A and B fragments are GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x136x32 (M,N,K) shape, 128-element thread-ID layout, and per-operand layouts.
+  using Shape_MNK = Shape<_64, _136, _32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  using BLayout   = GMMA::ABLayout<136, 32>;
+  using CLayout   = GMMA::CLayout_64x136;
+  // Per-instance ScaleOut value; starts as One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias into SM90::GMMA for the matching MMA op; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One, GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x136x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x136x32_F32E5M2E4M3_RS_TN<ScaleA, ScaleB>;
+
+// MMA_Traits specialization for the alias above: element types, fragment types, shape and layouts.
+template <GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x136x32_F32E5M2E4M3_RS_TN<ScaleA, ScaleB>>
+{
+  // Element types: A is float_e5m2_t, B is float_e4m3_t, C and D are float.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only B declares a fragment type here (GMMA::smem_desc with Major::K); A has none.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x136x32 (M,N,K) shape, 128-element thread-ID layout, and per-operand layouts.
+  using Shape_MNK = Shape<_64, _136, _32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<136, 32>;
+  using CLayout   = GMMA::CLayout_64x136;
+  // Per-instance ScaleOut value; starts as One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias into SM90::GMMA for the matching MMA op; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One, GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x144x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x144x32_F16E5M2E4M3_SS_TN<ScaleA, ScaleB>;
+
+// MMA_Traits specialization for the alias above: element types, fragment types, shape and layouts.
+template <GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x144x32_F16E5M2E4M3_SS_TN<ScaleA, ScaleB>>
+{
+  // Element types: A is float_e5m2_t, B is float_e4m3_t, C and D are half_t.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Both A and B fragments are GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x144x32 (M,N,K) shape, 128-element thread-ID layout, and per-operand layouts.
+  using Shape_MNK = Shape<_64, _144, _32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  using BLayout   = GMMA::ABLayout<144, 32>;
+  using CLayout   = GMMA::CLayout_64x144;
+  // Per-instance ScaleOut value; starts as One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias into SM90::GMMA for the matching MMA op; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One, GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x144x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x144x32_F16E5M2E4M3_RS_TN<ScaleA, ScaleB>;
+
+// MMA_Traits specialization for the alias above: element types, fragment types, shape and layouts.
+template <GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x144x32_F16E5M2E4M3_RS_TN<ScaleA, ScaleB>>
+{
+  // Element types: A is float_e5m2_t, B is float_e4m3_t, C and D are half_t.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Only B declares a fragment type here (GMMA::smem_desc with Major::K); A has none.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x144x32 (M,N,K) shape, 128-element thread-ID layout, and per-operand layouts.
+  using Shape_MNK = Shape<_64, _144, _32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<144, 32>;
+  using CLayout   = GMMA::CLayout_64x144;
+  // Per-instance ScaleOut value; starts as One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias into SM90::GMMA for the matching MMA op; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One, GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x144x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x144x32_F32E5M2E4M3_SS_TN<ScaleA, ScaleB>;
+
+// MMA_Traits specialization for the alias above: element types, fragment types, shape and layouts.
+template <GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x144x32_F32E5M2E4M3_SS_TN<ScaleA, ScaleB>>
+{
+  // Element types: A is float_e5m2_t, B is float_e4m3_t, C and D are float.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Both A and B fragments are GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x144x32 (M,N,K) shape, 128-element thread-ID layout, and per-operand layouts.
+  using Shape_MNK = Shape<_64, _144, _32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  using BLayout   = GMMA::ABLayout<144, 32>;
+  using CLayout   = GMMA::CLayout_64x144;
+  // Per-instance ScaleOut value; starts as One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias into SM90::GMMA for the matching MMA op; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One, GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x144x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x144x32_F32E5M2E4M3_RS_TN<ScaleA, ScaleB>;
+
+// MMA_Traits specialization for the alias above: element types, fragment types, shape and layouts.
+template <GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x144x32_F32E5M2E4M3_RS_TN<ScaleA, ScaleB>>
+{
+  // Element types: A is float_e5m2_t, B is float_e4m3_t, C and D are float.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only B declares a fragment type here (GMMA::smem_desc with Major::K); A has none.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x144x32 (M,N,K) shape, 128-element thread-ID layout, and per-operand layouts.
+  using Shape_MNK = Shape<_64, _144, _32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<144, 32>;
+  using CLayout   = GMMA::CLayout_64x144;
+  // Per-instance ScaleOut value; starts as One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias into SM90::GMMA for the matching MMA op; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One, GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x152x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x152x32_F16E5M2E4M3_SS_TN<ScaleA, ScaleB>;
+
+// MMA_Traits specialization for the alias above: element types, fragment types, shape and layouts.
+template <GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x152x32_F16E5M2E4M3_SS_TN<ScaleA, ScaleB>>
+{
+  // Element types: A is float_e5m2_t, B is float_e4m3_t, C and D are half_t.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Both A and B fragments are GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x152x32 (M,N,K) shape, 128-element thread-ID layout, and per-operand layouts.
+  using Shape_MNK = Shape<_64, _152, _32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  using BLayout   = GMMA::ABLayout<152, 32>;
+  using CLayout   = GMMA::CLayout_64x152;
+  // Per-instance ScaleOut value; starts as One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias into SM90::GMMA for the matching MMA op; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One, GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x152x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x152x32_F16E5M2E4M3_RS_TN<ScaleA, ScaleB>;
+
+// MMA_Traits specialization for the alias above: element types, fragment types, shape and layouts.
+template <GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x152x32_F16E5M2E4M3_RS_TN<ScaleA, ScaleB>>
+{
+  // Element types: A is float_e5m2_t, B is float_e4m3_t, C and D are half_t.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Only B declares a fragment type here (GMMA::smem_desc with Major::K); A has none.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x152x32 (M,N,K) shape, 128-element thread-ID layout, and per-operand layouts.
+  using Shape_MNK = Shape<_64, _152, _32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<152, 32>;
+  using CLayout   = GMMA::CLayout_64x152;
+  // Per-instance ScaleOut value; starts as One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias into SM90::GMMA for the matching MMA op; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One, GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x152x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x152x32_F32E5M2E4M3_SS_TN<ScaleA, ScaleB>;
+
+// MMA_Traits specialization for the alias above: element types, fragment types, shape and layouts.
+template <GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x152x32_F32E5M2E4M3_SS_TN<ScaleA, ScaleB>>
+{
+  // Element types: A is float_e5m2_t, B is float_e4m3_t, C and D are float.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Both A and B fragments are GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x152x32 (M,N,K) shape, 128-element thread-ID layout, and per-operand layouts.
+  using Shape_MNK = Shape<_64, _152, _32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  using BLayout   = GMMA::ABLayout<152, 32>;
+  using CLayout   = GMMA::CLayout_64x152;
+  // Per-instance ScaleOut value; starts as One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias into SM90::GMMA for the matching MMA op; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One, GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x152x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x152x32_F32E5M2E4M3_RS_TN<ScaleA, ScaleB>;
+
+// MMA_Traits specialization for the alias above: element types, fragment types, shape and layouts.
+template <GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x152x32_F32E5M2E4M3_RS_TN<ScaleA, ScaleB>>
+{
+  // Element types: A is float_e5m2_t, B is float_e4m3_t, C and D are float.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only B declares a fragment type here (GMMA::smem_desc with Major::K); A has none.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x152x32 (M,N,K) shape, 128-element thread-ID layout, and per-operand layouts.
+  using Shape_MNK = Shape<_64, _152, _32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<152, 32>;
+  using CLayout   = GMMA::CLayout_64x152;
+  // Per-instance ScaleOut value; starts as One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias into SM90::GMMA for the matching MMA op; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One, GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x160x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x160x32_F16E5M2E4M3_SS_TN<ScaleA, ScaleB>;
+
+// MMA_Traits specialization for the alias above: element types, fragment types, shape and layouts.
+template <GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x160x32_F16E5M2E4M3_SS_TN<ScaleA, ScaleB>>
+{
+  // Element types: A is float_e5m2_t, B is float_e4m3_t, C and D are half_t.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Both A and B fragments are GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x160x32 (M,N,K) shape, 128-element thread-ID layout, and per-operand layouts.
+  using Shape_MNK = Shape<_64, _160, _32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  using BLayout   = GMMA::ABLayout<160, 32>;
+  using CLayout   = GMMA::CLayout_64x160;
+  // Per-instance ScaleOut value; starts as One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias into SM90::GMMA for the matching MMA op; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One, GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x160x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x160x32_F16E5M2E4M3_RS_TN<ScaleA, ScaleB>;
+
+// MMA_Traits specialization for the alias above: element types, fragment types, shape and layouts.
+template <GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x160x32_F16E5M2E4M3_RS_TN<ScaleA, ScaleB>>
+{
+  // Element types: A is float_e5m2_t, B is float_e4m3_t, C and D are half_t.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Only B declares a fragment type here (GMMA::smem_desc with Major::K); A has none.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x160x32 (M,N,K) shape, 128-element thread-ID layout, and per-operand layouts.
+  using Shape_MNK = Shape<_64, _160, _32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<160, 32>;
+  using CLayout   = GMMA::CLayout_64x160;
+  // Per-instance ScaleOut value; starts as One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias into SM90::GMMA for the matching MMA op; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One, GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x160x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x160x32_F32E5M2E4M3_SS_TN<ScaleA, ScaleB>;
+
+// MMA_Traits specialization for the alias above: element types, fragment types, shape and layouts.
+template <GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x160x32_F32E5M2E4M3_SS_TN<ScaleA, ScaleB>>
+{
+  // Element types: A is float_e5m2_t, B is float_e4m3_t, C and D are float.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Both A and B fragments are GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x160x32 (M,N,K) shape, 128-element thread-ID layout, and per-operand layouts.
+  using Shape_MNK = Shape<_64, _160, _32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  using BLayout   = GMMA::ABLayout<160, 32>;
+  using CLayout   = GMMA::CLayout_64x160;
+  // Per-instance ScaleOut value; starts as One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias into SM90::GMMA for the matching MMA op; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One, GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x160x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x160x32_F32E5M2E4M3_RS_TN<ScaleA, ScaleB>;
+
+// MMA_Traits specialization for the alias above: element types, fragment types, shape and layouts.
+template <GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x160x32_F32E5M2E4M3_RS_TN<ScaleA, ScaleB>>
+{
+  // Element types: A is float_e5m2_t, B is float_e4m3_t, C and D are float.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  // Only B declares a fragment type here (GMMA::smem_desc with Major::K); A has none.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x160x32 (M,N,K) shape, 128-element thread-ID layout, and per-operand layouts.
+  using Shape_MNK = Shape<_64, _160, _32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<160, 32>;
+  using CLayout   = GMMA::CLayout_64x160;
+  // Per-instance ScaleOut value; starts as One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias into SM90::GMMA for the matching MMA op; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One, GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x168x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x168x32_F16E5M2E4M3_SS_TN<ScaleA, ScaleB>;
+
+// MMA_Traits specialization for the alias above: element types, fragment types, shape and layouts.
+template <GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x168x32_F16E5M2E4M3_SS_TN<ScaleA, ScaleB>>
+{
+  // Element types: A is float_e5m2_t, B is float_e4m3_t, C and D are half_t.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Both A and B fragments are GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x168x32 (M,N,K) shape, 128-element thread-ID layout, and per-operand layouts.
+  using Shape_MNK = Shape<_64, _168, _32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  using BLayout   = GMMA::ABLayout<168, 32>;
+  using CLayout   = GMMA::CLayout_64x168;
+  // Per-instance ScaleOut value; starts as One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias into SM90::GMMA for the matching MMA op; both ScaleIn parameters default to One.
+template <GMMA::ScaleIn ScaleA = GMMA::ScaleIn::One, GMMA::ScaleIn ScaleB = GMMA::ScaleIn::One>
+using SM90_64x168x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x168x32_F16E5M2E4M3_RS_TN<ScaleA, ScaleB>;
+
+// MMA_Traits specialization for the alias above: element types, fragment types, shape and layouts.
+template <GMMA::ScaleIn ScaleA, GMMA::ScaleIn ScaleB>
+struct MMA_Traits<SM90_64x168x32_F16E5M2E4M3_RS_TN<ScaleA, ScaleB>>
+{
+  // Element types: A is float_e5m2_t, B is float_e4m3_t, C and D are half_t.
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  // Only B declares a fragment type here (GMMA::smem_desc with Major::K); A has none.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x168x32 (M,N,K) shape, 128-element thread-ID layout, and per-operand layouts.
+  using Shape_MNK = Shape<_64, _168, _32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<168, 32>;
+  using CLayout   = GMMA::CLayout_64x168;
+  // Per-instance ScaleOut value; starts as One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x168x32_F32E5M2E4M3_SS_TN (scaleA / scaleB
+// default to GMMA::ScaleIn::One), followed by the MMA_Traits specialization for it.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x168x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x168x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB>
+struct MMA_Traits<SM90_64x168x32_F32E5M2E4M3_SS_TN<kScaleA, kScaleB>>
+{
+  // Atom extent (M, N, K), thread-id layout, and the A / B / C layouts.
+  using Shape_MNK = Shape<_64,_168,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 32>;
+  using BLayout   = GMMA::ABLayout<168, 32>;
+  using CLayout   = GMMA::CLayout_64x168;
+  // Operand element types; FrgTypeA and FrgTypeB are both GMMA::smem_desc (Major::K).
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Non-static member, initialised to ScaleOut::One for every new traits object.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x168x32_F32E5M2E4M3_RS_TN (scaleA / scaleB
+// default to GMMA::ScaleIn::One), followed by the MMA_Traits specialization for it.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x168x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x168x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB>
+struct MMA_Traits<SM90_64x168x32_F32E5M2E4M3_RS_TN<kScaleA, kScaleB>>
+{
+  // Atom extent (M, N, K), thread-id layout, and the A / B / C layouts.
+  using Shape_MNK = Shape<_64,_168,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<168, 32>;
+  using CLayout   = GMMA::CLayout_64x168;
+  // Operand element types; only FrgTypeB is declared here (there is no FrgTypeA).
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Non-static member, initialised to ScaleOut::One for every new traits object.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x176x32_F16E5M2E4M3_SS_TN (scaleA / scaleB
+// default to GMMA::ScaleIn::One), followed by the MMA_Traits specialization for it.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x176x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x176x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB>
+struct MMA_Traits<SM90_64x176x32_F16E5M2E4M3_SS_TN<kScaleA, kScaleB>>
+{
+  // Atom extent (M, N, K), thread-id layout, and the A / B / C layouts.
+  using Shape_MNK = Shape<_64,_176,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 32>;
+  using BLayout   = GMMA::ABLayout<176, 32>;
+  using CLayout   = GMMA::CLayout_64x176;
+  // Operand element types; FrgTypeA and FrgTypeB are both GMMA::smem_desc (Major::K).
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Non-static member, initialised to ScaleOut::One for every new traits object.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x176x32_F16E5M2E4M3_RS_TN (scaleA / scaleB
+// default to GMMA::ScaleIn::One), followed by the MMA_Traits specialization for it.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x176x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x176x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB>
+struct MMA_Traits<SM90_64x176x32_F16E5M2E4M3_RS_TN<kScaleA, kScaleB>>
+{
+  // Atom extent (M, N, K), thread-id layout, and the A / B / C layouts.
+  using Shape_MNK = Shape<_64,_176,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<176, 32>;
+  using CLayout   = GMMA::CLayout_64x176;
+  // Operand element types; only FrgTypeB is declared here (there is no FrgTypeA).
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Non-static member, initialised to ScaleOut::One for every new traits object.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x176x32_F32E5M2E4M3_SS_TN (scaleA / scaleB
+// default to GMMA::ScaleIn::One), followed by the MMA_Traits specialization for it.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x176x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x176x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB>
+struct MMA_Traits<SM90_64x176x32_F32E5M2E4M3_SS_TN<kScaleA, kScaleB>>
+{
+  // Atom extent (M, N, K), thread-id layout, and the A / B / C layouts.
+  using Shape_MNK = Shape<_64,_176,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 32>;
+  using BLayout   = GMMA::ABLayout<176, 32>;
+  using CLayout   = GMMA::CLayout_64x176;
+  // Operand element types; FrgTypeA and FrgTypeB are both GMMA::smem_desc (Major::K).
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Non-static member, initialised to ScaleOut::One for every new traits object.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x176x32_F32E5M2E4M3_RS_TN (scaleA / scaleB
+// default to GMMA::ScaleIn::One), followed by the MMA_Traits specialization for it.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x176x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x176x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB>
+struct MMA_Traits<SM90_64x176x32_F32E5M2E4M3_RS_TN<kScaleA, kScaleB>>
+{
+  // Atom extent (M, N, K), thread-id layout, and the A / B / C layouts.
+  using Shape_MNK = Shape<_64,_176,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<176, 32>;
+  using CLayout   = GMMA::CLayout_64x176;
+  // Operand element types; only FrgTypeB is declared here (there is no FrgTypeA).
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Non-static member, initialised to ScaleOut::One for every new traits object.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x184x32_F16E5M2E4M3_SS_TN (scaleA / scaleB
+// default to GMMA::ScaleIn::One), followed by the MMA_Traits specialization for it.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x184x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x184x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB>
+struct MMA_Traits<SM90_64x184x32_F16E5M2E4M3_SS_TN<kScaleA, kScaleB>>
+{
+  // Atom extent (M, N, K), thread-id layout, and the A / B / C layouts.
+  using Shape_MNK = Shape<_64,_184,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 32>;
+  using BLayout   = GMMA::ABLayout<184, 32>;
+  using CLayout   = GMMA::CLayout_64x184;
+  // Operand element types; FrgTypeA and FrgTypeB are both GMMA::smem_desc (Major::K).
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Non-static member, initialised to ScaleOut::One for every new traits object.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x184x32_F16E5M2E4M3_RS_TN (scaleA / scaleB
+// default to GMMA::ScaleIn::One), followed by the MMA_Traits specialization for it.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x184x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x184x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB>
+struct MMA_Traits<SM90_64x184x32_F16E5M2E4M3_RS_TN<kScaleA, kScaleB>>
+{
+  // Atom extent (M, N, K), thread-id layout, and the A / B / C layouts.
+  using Shape_MNK = Shape<_64,_184,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<184, 32>;
+  using CLayout   = GMMA::CLayout_64x184;
+  // Operand element types; only FrgTypeB is declared here (there is no FrgTypeA).
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Non-static member, initialised to ScaleOut::One for every new traits object.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x184x32_F32E5M2E4M3_SS_TN (scaleA / scaleB
+// default to GMMA::ScaleIn::One), followed by the MMA_Traits specialization for it.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x184x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x184x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB>
+struct MMA_Traits<SM90_64x184x32_F32E5M2E4M3_SS_TN<kScaleA, kScaleB>>
+{
+  // Atom extent (M, N, K), thread-id layout, and the A / B / C layouts.
+  using Shape_MNK = Shape<_64,_184,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 32>;
+  using BLayout   = GMMA::ABLayout<184, 32>;
+  using CLayout   = GMMA::CLayout_64x184;
+  // Operand element types; FrgTypeA and FrgTypeB are both GMMA::smem_desc (Major::K).
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Non-static member, initialised to ScaleOut::One for every new traits object.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x184x32_F32E5M2E4M3_RS_TN (scaleA / scaleB
+// default to GMMA::ScaleIn::One), followed by the MMA_Traits specialization for it.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x184x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x184x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB>
+struct MMA_Traits<SM90_64x184x32_F32E5M2E4M3_RS_TN<kScaleA, kScaleB>>
+{
+  // Atom extent (M, N, K), thread-id layout, and the A / B / C layouts.
+  using Shape_MNK = Shape<_64,_184,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<184, 32>;
+  using CLayout   = GMMA::CLayout_64x184;
+  // Operand element types; only FrgTypeB is declared here (there is no FrgTypeA).
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Non-static member, initialised to ScaleOut::One for every new traits object.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x200x32_F16E5M2E4M3_SS_TN (scaleA / scaleB
+// default to GMMA::ScaleIn::One), followed by the MMA_Traits specialization for it.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x200x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x200x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB>
+struct MMA_Traits<SM90_64x200x32_F16E5M2E4M3_SS_TN<kScaleA, kScaleB>>
+{
+  // Atom extent (M, N, K), thread-id layout, and the A / B / C layouts.
+  using Shape_MNK = Shape<_64,_200,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 32>;
+  using BLayout   = GMMA::ABLayout<200, 32>;
+  using CLayout   = GMMA::CLayout_64x200;
+  // Operand element types; FrgTypeA and FrgTypeB are both GMMA::smem_desc (Major::K).
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Non-static member, initialised to ScaleOut::One for every new traits object.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x200x32_F16E5M2E4M3_RS_TN (scaleA / scaleB
+// default to GMMA::ScaleIn::One), followed by the MMA_Traits specialization for it.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x200x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x200x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB>
+struct MMA_Traits<SM90_64x200x32_F16E5M2E4M3_RS_TN<kScaleA, kScaleB>>
+{
+  // Atom extent (M, N, K), thread-id layout, and the A / B / C layouts.
+  using Shape_MNK = Shape<_64,_200,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<200, 32>;
+  using CLayout   = GMMA::CLayout_64x200;
+  // Operand element types; only FrgTypeB is declared here (there is no FrgTypeA).
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Non-static member, initialised to ScaleOut::One for every new traits object.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x200x32_F32E5M2E4M3_SS_TN (scaleA / scaleB
+// default to GMMA::ScaleIn::One), followed by the MMA_Traits specialization for it.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x200x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x200x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB>
+struct MMA_Traits<SM90_64x200x32_F32E5M2E4M3_SS_TN<kScaleA, kScaleB>>
+{
+  // Atom extent (M, N, K), thread-id layout, and the A / B / C layouts.
+  using Shape_MNK = Shape<_64,_200,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 32>;
+  using BLayout   = GMMA::ABLayout<200, 32>;
+  using CLayout   = GMMA::CLayout_64x200;
+  // Operand element types; FrgTypeA and FrgTypeB are both GMMA::smem_desc (Major::K).
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Non-static member, initialised to ScaleOut::One for every new traits object.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x200x32_F32E5M2E4M3_RS_TN (scaleA / scaleB
+// default to GMMA::ScaleIn::One), followed by the MMA_Traits specialization for it.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x200x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x200x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB>
+struct MMA_Traits<SM90_64x200x32_F32E5M2E4M3_RS_TN<kScaleA, kScaleB>>
+{
+  // Atom extent (M, N, K), thread-id layout, and the A / B / C layouts.
+  using Shape_MNK = Shape<_64,_200,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<200, 32>;
+  using CLayout   = GMMA::CLayout_64x200;
+  // Operand element types; only FrgTypeB is declared here (there is no FrgTypeA).
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Non-static member, initialised to ScaleOut::One for every new traits object.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x208x32_F16E5M2E4M3_SS_TN (scaleA / scaleB
+// default to GMMA::ScaleIn::One), followed by the MMA_Traits specialization for it.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x208x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x208x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB>
+struct MMA_Traits<SM90_64x208x32_F16E5M2E4M3_SS_TN<kScaleA, kScaleB>>
+{
+  // Atom extent (M, N, K), thread-id layout, and the A / B / C layouts.
+  using Shape_MNK = Shape<_64,_208,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 32>;
+  using BLayout   = GMMA::ABLayout<208, 32>;
+  using CLayout   = GMMA::CLayout_64x208;
+  // Operand element types; FrgTypeA and FrgTypeB are both GMMA::smem_desc (Major::K).
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Non-static member, initialised to ScaleOut::One for every new traits object.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x208x32_F16E5M2E4M3_RS_TN (scaleA / scaleB
+// default to GMMA::ScaleIn::One), followed by the MMA_Traits specialization for it.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x208x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x208x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB>
+struct MMA_Traits<SM90_64x208x32_F16E5M2E4M3_RS_TN<kScaleA, kScaleB>>
+{
+  // Atom extent (M, N, K), thread-id layout, and the A / B / C layouts.
+  using Shape_MNK = Shape<_64,_208,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<208, 32>;
+  using CLayout   = GMMA::CLayout_64x208;
+  // Operand element types; only FrgTypeB is declared here (there is no FrgTypeA).
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Non-static member, initialised to ScaleOut::One for every new traits object.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x208x32_F32E5M2E4M3_SS_TN (scaleA / scaleB
+// default to GMMA::ScaleIn::One), followed by the MMA_Traits specialization for it.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x208x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x208x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB>
+struct MMA_Traits<SM90_64x208x32_F32E5M2E4M3_SS_TN<kScaleA, kScaleB>>
+{
+  // Atom extent (M, N, K), thread-id layout, and the A / B / C layouts.
+  using Shape_MNK = Shape<_64,_208,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 32>;
+  using BLayout   = GMMA::ABLayout<208, 32>;
+  using CLayout   = GMMA::CLayout_64x208;
+  // Operand element types; FrgTypeA and FrgTypeB are both GMMA::smem_desc (Major::K).
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Non-static member, initialised to ScaleOut::One for every new traits object.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x208x32_F32E5M2E4M3_RS_TN (scaleA / scaleB
+// default to GMMA::ScaleIn::One), followed by the MMA_Traits specialization for it.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x208x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x208x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB>
+struct MMA_Traits<SM90_64x208x32_F32E5M2E4M3_RS_TN<kScaleA, kScaleB>>
+{
+  // Atom extent (M, N, K), thread-id layout, and the A / B / C layouts.
+  using Shape_MNK = Shape<_64,_208,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<208, 32>;
+  using CLayout   = GMMA::CLayout_64x208;
+  // Operand element types; only FrgTypeB is declared here (there is no FrgTypeA).
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Non-static member, initialised to ScaleOut::One for every new traits object.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x216x32_F16E5M2E4M3_SS_TN (scaleA / scaleB
+// default to GMMA::ScaleIn::One), followed by the MMA_Traits specialization for it.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x216x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x216x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB>
+struct MMA_Traits<SM90_64x216x32_F16E5M2E4M3_SS_TN<kScaleA, kScaleB>>
+{
+  // Atom extent (M, N, K), thread-id layout, and the A / B / C layouts.
+  using Shape_MNK = Shape<_64,_216,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 32>;
+  using BLayout   = GMMA::ABLayout<216, 32>;
+  using CLayout   = GMMA::CLayout_64x216;
+  // Operand element types; FrgTypeA and FrgTypeB are both GMMA::smem_desc (Major::K).
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Non-static member, initialised to ScaleOut::One for every new traits object.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x216x32_F16E5M2E4M3_RS_TN (scaleA / scaleB
+// default to GMMA::ScaleIn::One), followed by the MMA_Traits specialization for it.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x216x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x216x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB>
+struct MMA_Traits<SM90_64x216x32_F16E5M2E4M3_RS_TN<kScaleA, kScaleB>>
+{
+  // Atom extent (M, N, K), thread-id layout, and the A / B / C layouts.
+  using Shape_MNK = Shape<_64,_216,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<216, 32>;
+  using CLayout   = GMMA::CLayout_64x216;
+  // Operand element types; only FrgTypeB is declared here (there is no FrgTypeA).
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Non-static member, initialised to ScaleOut::One for every new traits object.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x216x32_F32E5M2E4M3_SS_TN (scaleA / scaleB
+// default to GMMA::ScaleIn::One), followed by the MMA_Traits specialization for it.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x216x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x216x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB>
+struct MMA_Traits<SM90_64x216x32_F32E5M2E4M3_SS_TN<kScaleA, kScaleB>>
+{
+  // Atom extent (M, N, K), thread-id layout, and the A / B / C layouts.
+  using Shape_MNK = Shape<_64,_216,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 32>;
+  using BLayout   = GMMA::ABLayout<216, 32>;
+  using CLayout   = GMMA::CLayout_64x216;
+  // Operand element types; FrgTypeA and FrgTypeB are both GMMA::smem_desc (Major::K).
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Non-static member, initialised to ScaleOut::One for every new traits object.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x216x32_F32E5M2E4M3_RS_TN (scaleA / scaleB
+// default to GMMA::ScaleIn::One), followed by the MMA_Traits specialization for it.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x216x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x216x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB>
+struct MMA_Traits<SM90_64x216x32_F32E5M2E4M3_RS_TN<kScaleA, kScaleB>>
+{
+  // Atom extent (M, N, K), thread-id layout, and the A / B / C layouts.
+  using Shape_MNK = Shape<_64,_216,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<216, 32>;
+  using CLayout   = GMMA::CLayout_64x216;
+  // Operand element types; only FrgTypeB is declared here (there is no FrgTypeA).
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Non-static member, initialised to ScaleOut::One for every new traits object.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x224x32_F16E5M2E4M3_SS_TN (scaleA / scaleB
+// default to GMMA::ScaleIn::One), followed by the MMA_Traits specialization for it.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x224x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x224x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB>
+struct MMA_Traits<SM90_64x224x32_F16E5M2E4M3_SS_TN<kScaleA, kScaleB>>
+{
+  // Atom extent (M, N, K), thread-id layout, and the A / B / C layouts.
+  using Shape_MNK = Shape<_64,_224,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 32>;
+  using BLayout   = GMMA::ABLayout<224, 32>;
+  using CLayout   = GMMA::CLayout_64x224;
+  // Operand element types; FrgTypeA and FrgTypeB are both GMMA::smem_desc (Major::K).
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Non-static member, initialised to ScaleOut::One for every new traits object.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x224x32_F16E5M2E4M3_RS_TN (scaleA / scaleB
+// default to GMMA::ScaleIn::One), followed by the MMA_Traits specialization for it.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x224x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x224x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB>
+struct MMA_Traits<SM90_64x224x32_F16E5M2E4M3_RS_TN<kScaleA, kScaleB>>
+{
+  // Atom extent (M, N, K), thread-id layout, and the A / B / C layouts.
+  using Shape_MNK = Shape<_64,_224,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<224, 32>;
+  using CLayout   = GMMA::CLayout_64x224;
+  // Operand element types; only FrgTypeB is declared here (there is no FrgTypeA).
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Non-static member, initialised to ScaleOut::One for every new traits object.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x224x32_F32E5M2E4M3_SS_TN (scaleA / scaleB
+// default to GMMA::ScaleIn::One), followed by the MMA_Traits specialization for it.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x224x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x224x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB>
+struct MMA_Traits<SM90_64x224x32_F32E5M2E4M3_SS_TN<kScaleA, kScaleB>>
+{
+  // Atom extent (M, N, K), thread-id layout, and the A / B / C layouts.
+  using Shape_MNK = Shape<_64,_224,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 32>;
+  using BLayout   = GMMA::ABLayout<224, 32>;
+  using CLayout   = GMMA::CLayout_64x224;
+  // Operand element types; FrgTypeA and FrgTypeB are both GMMA::smem_desc (Major::K).
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Non-static member, initialised to ScaleOut::One for every new traits object.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x224x32_F32E5M2E4M3_RS_TN (scaleA / scaleB
+// default to GMMA::ScaleIn::One), followed by the MMA_Traits specialization for it.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x224x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x224x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB>
+struct MMA_Traits<SM90_64x224x32_F32E5M2E4M3_RS_TN<kScaleA, kScaleB>>
+{
+  // Atom extent (M, N, K), thread-id layout, and the A / B / C layouts.
+  using Shape_MNK = Shape<_64,_224,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<224, 32>;
+  using CLayout   = GMMA::CLayout_64x224;
+  // Operand element types; only FrgTypeB is declared here (there is no FrgTypeA).
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Non-static member, initialised to ScaleOut::One for every new traits object.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x232x32_F16E5M2E4M3_SS_TN (scaleA / scaleB
+// default to GMMA::ScaleIn::One), followed by the MMA_Traits specialization for it.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x232x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x232x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB>
+struct MMA_Traits<SM90_64x232x32_F16E5M2E4M3_SS_TN<kScaleA, kScaleB>>
+{
+  // Atom extent (M, N, K), thread-id layout, and the A / B / C layouts.
+  using Shape_MNK = Shape<_64,_232,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 32>;
+  using BLayout   = GMMA::ABLayout<232, 32>;
+  using CLayout   = GMMA::CLayout_64x232;
+  // Operand element types; FrgTypeA and FrgTypeB are both GMMA::smem_desc (Major::K).
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Non-static member, initialised to ScaleOut::One for every new traits object.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x232x32_F16E5M2E4M3_RS_TN (scaleA / scaleB
+// default to GMMA::ScaleIn::One), followed by the MMA_Traits specialization for it.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x232x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x232x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB>
+struct MMA_Traits<SM90_64x232x32_F16E5M2E4M3_RS_TN<kScaleA, kScaleB>>
+{
+  // Atom extent (M, N, K), thread-id layout, and the A / B / C layouts.
+  using Shape_MNK = Shape<_64,_232,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<232, 32>;
+  using CLayout   = GMMA::CLayout_64x232;
+  // Operand element types; only FrgTypeB is declared here (there is no FrgTypeA).
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Non-static member, initialised to ScaleOut::One for every new traits object.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x232x32_F32E5M2E4M3_SS_TN (scaleA / scaleB
+// default to GMMA::ScaleIn::One), followed by the MMA_Traits specialization for it.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x232x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x232x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB>
+struct MMA_Traits<SM90_64x232x32_F32E5M2E4M3_SS_TN<kScaleA, kScaleB>>
+{
+  // Atom extent (M, N, K), thread-id layout, and the A / B / C layouts.
+  using Shape_MNK = Shape<_64,_232,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 32>;
+  using BLayout   = GMMA::ABLayout<232, 32>;
+  using CLayout   = GMMA::CLayout_64x232;
+  // Operand element types; FrgTypeA and FrgTypeB are both GMMA::smem_desc (Major::K).
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Non-static member, initialised to ScaleOut::One for every new traits object.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x232x32_F32E5M2E4M3_RS_TN (scaleA / scaleB
+// default to GMMA::ScaleIn::One), followed by the MMA_Traits specialization for it.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x232x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x232x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB>
+struct MMA_Traits<SM90_64x232x32_F32E5M2E4M3_RS_TN<kScaleA, kScaleB>>
+{
+  // Atom extent (M, N, K), thread-id layout, and the A / B / C layouts.
+  using Shape_MNK = Shape<_64,_232,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<232, 32>;
+  using CLayout   = GMMA::CLayout_64x232;
+  // Operand element types; only FrgTypeB is declared here (there is no FrgTypeA).
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Non-static member, initialised to ScaleOut::One for every new traits object.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x240x32_F16E5M2E4M3_SS_TN (scaleA / scaleB
+// default to GMMA::ScaleIn::One), followed by the MMA_Traits specialization for it.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x240x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x240x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB>
+struct MMA_Traits<SM90_64x240x32_F16E5M2E4M3_SS_TN<kScaleA, kScaleB>>
+{
+  // Atom extent (M, N, K), thread-id layout, and the A / B / C layouts.
+  using Shape_MNK = Shape<_64,_240,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 32>;
+  using BLayout   = GMMA::ABLayout<240, 32>;
+  using CLayout   = GMMA::CLayout_64x240;
+  // Operand element types; FrgTypeA and FrgTypeB are both GMMA::smem_desc (Major::K).
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Non-static member, initialised to ScaleOut::One for every new traits object.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x240x32_F16E5M2E4M3_RS_TN (scaleA / scaleB
+// default to GMMA::ScaleIn::One), followed by the MMA_Traits specialization for it.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x240x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x240x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB>
+struct MMA_Traits<SM90_64x240x32_F16E5M2E4M3_RS_TN<kScaleA, kScaleB>>
+{
+  // Atom extent (M, N, K), thread-id layout, and the A / B / C layouts.
+  using Shape_MNK = Shape<_64,_240,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<240, 32>;
+  using CLayout   = GMMA::CLayout_64x240;
+  // Operand element types; only FrgTypeB is declared here (there is no FrgTypeA).
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Non-static member, initialised to ScaleOut::One for every new traits object.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x240x32_F32E5M2E4M3_SS_TN (scaleA / scaleB
+// default to GMMA::ScaleIn::One), followed by the MMA_Traits specialization for it.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x240x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x240x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB>
+struct MMA_Traits<SM90_64x240x32_F32E5M2E4M3_SS_TN<kScaleA, kScaleB>>
+{
+  // Atom extent (M, N, K), thread-id layout, and the A / B / C layouts.
+  using Shape_MNK = Shape<_64,_240,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 32>;
+  using BLayout   = GMMA::ABLayout<240, 32>;
+  using CLayout   = GMMA::CLayout_64x240;
+  // Operand element types; FrgTypeA and FrgTypeB are both GMMA::smem_desc (Major::K).
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Non-static member, initialised to ScaleOut::One for every new traits object.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x240x32_F32E5M2E4M3_RS_TN (scaleA / scaleB
+// default to GMMA::ScaleIn::One), followed by the MMA_Traits specialization for it.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x240x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x240x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB>
+struct MMA_Traits<SM90_64x240x32_F32E5M2E4M3_RS_TN<kScaleA, kScaleB>>
+{
+  // Atom extent (M, N, K), thread-id layout, and the A / B / C layouts.
+  using Shape_MNK = Shape<_64,_240,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<240, 32>;
+  using CLayout   = GMMA::CLayout_64x240;
+  // Operand element types; only FrgTypeB is declared here (there is no FrgTypeA).
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Non-static member, initialised to ScaleOut::One for every new traits object.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Shorthand alias for SM90::GMMA::MMA_64x248x32_F16E5M2E4M3_SS_TN (scaleA / scaleB
+// default to GMMA::ScaleIn::One), followed by the MMA_Traits specialization for it.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One,
+          GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x248x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x248x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB>
+struct MMA_Traits<SM90_64x248x32_F16E5M2E4M3_SS_TN<kScaleA, kScaleB>>
+{
+  // Atom extent (M, N, K), thread-id layout, and the A / B / C layouts.
+  using Shape_MNK = Shape<_64,_248,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 32>;
+  using BLayout   = GMMA::ABLayout<248, 32>;
+  using CLayout   = GMMA::CLayout_64x248;
+  // Operand element types; FrgTypeA and FrgTypeB are both GMMA::smem_desc (Major::K).
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Non-static member, initialised to ScaleOut::One for every new traits object.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias + MMA_Traits specialization: SM90 GMMA 64x248x32 RS_TN, A=e5m2, B=e4m3, C/D=half_t.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x248x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x248x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x248x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
+{
+  // Atom shape (M, N, K) and the 128-element ThrID layout.
+  using Shape_MNK = Shape<_64, _248, _32>;
+  using ThrID     = Layout<_128>;
+  // Element (value) types for A, B, C and D.
+  using ValTypeA  = float_e5m2_t;
+  using ValTypeB  = float_e4m3_t;
+  using ValTypeC  = half_t;
+  using ValTypeD  = half_t;
+  // Only FrgTypeB is declared here: a K-major smem_desc.
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  // Operand layouts provided by the GMMA helpers for this tile size.
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<248, 32>;
+  using CLayout   = GMMA::CLayout_64x248;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias + MMA_Traits specialization: SM90 GMMA 64x248x32 SS_TN, A=e5m2, B=e4m3, C/D=float.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x248x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x248x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x248x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
+{
+  // Atom shape (M, N, K) and the 128-element ThrID layout.
+  using Shape_MNK = Shape<_64, _248, _32>;
+  using ThrID     = Layout<_128>;
+  // Element (value) types for A, B, C and D.
+  using ValTypeA  = float_e5m2_t;
+  using ValTypeB  = float_e4m3_t;
+  using ValTypeC  = float;
+  using ValTypeD  = float;
+  // Both A and B fragments are K-major smem_desc types.
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  // Operand layouts provided by the GMMA helpers for this tile size.
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  using BLayout   = GMMA::ABLayout<248, 32>;
+  using CLayout   = GMMA::CLayout_64x248;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias + MMA_Traits specialization: SM90 GMMA 64x248x32 RS_TN, A=e5m2, B=e4m3, C/D=float.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x248x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x248x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x248x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
+{
+  // Atom shape (M, N, K) and the 128-element ThrID layout.
+  using Shape_MNK = Shape<_64, _248, _32>;
+  using ThrID     = Layout<_128>;
+  // Element (value) types for A, B, C and D.
+  using ValTypeA  = float_e5m2_t;
+  using ValTypeB  = float_e4m3_t;
+  using ValTypeC  = float;
+  using ValTypeD  = float;
+  // Only FrgTypeB is declared here: a K-major smem_desc.
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  // Operand layouts provided by the GMMA helpers for this tile size.
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<248, 32>;
+  using CLayout   = GMMA::CLayout_64x248;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias + MMA_Traits specialization: SM90 GMMA 64x24x32 SS_TN, A=e5m2, B=e5m2, C/D=half_t.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x24x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x24x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x24x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  // Atom shape (M, N, K) and the 128-element ThrID layout.
+  using Shape_MNK = Shape<_64, _24, _32>;
+  using ThrID     = Layout<_128>;
+  // Element (value) types for A, B, C and D.
+  using ValTypeA  = float_e5m2_t;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = half_t;
+  using ValTypeD  = half_t;
+  // Both A and B fragments are K-major smem_desc types.
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  // Operand layouts provided by the GMMA helpers for this tile size.
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  using BLayout   = GMMA::ABLayout<24, 32>;
+  using CLayout   = GMMA::CLayout_64x24;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias + MMA_Traits specialization: SM90 GMMA 64x24x32 RS_TN, A=e5m2, B=e5m2, C/D=half_t.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x24x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x24x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x24x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  // Atom shape (M, N, K) and the 128-element ThrID layout.
+  using Shape_MNK = Shape<_64, _24, _32>;
+  using ThrID     = Layout<_128>;
+  // Element (value) types for A, B, C and D.
+  using ValTypeA  = float_e5m2_t;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = half_t;
+  using ValTypeD  = half_t;
+  // Only FrgTypeB is declared here: a K-major smem_desc.
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  // Operand layouts provided by the GMMA helpers for this tile size.
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<24, 32>;
+  using CLayout   = GMMA::CLayout_64x24;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias + MMA_Traits specialization: SM90 GMMA 64x24x32 SS_TN, A=e5m2, B=e5m2, C/D=float.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x24x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x24x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x24x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  // Atom shape (M, N, K) and the 128-element ThrID layout.
+  using Shape_MNK = Shape<_64, _24, _32>;
+  using ThrID     = Layout<_128>;
+  // Element (value) types for A, B, C and D.
+  using ValTypeA  = float_e5m2_t;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = float;
+  using ValTypeD  = float;
+  // Both A and B fragments are K-major smem_desc types.
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  // Operand layouts provided by the GMMA helpers for this tile size.
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  using BLayout   = GMMA::ABLayout<24, 32>;
+  using CLayout   = GMMA::CLayout_64x24;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias + MMA_Traits specialization: SM90 GMMA 64x24x32 RS_TN, A=e5m2, B=e5m2, C/D=float.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x24x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x24x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x24x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  // Atom shape (M, N, K) and the 128-element ThrID layout.
+  using Shape_MNK = Shape<_64, _24, _32>;
+  using ThrID     = Layout<_128>;
+  // Element (value) types for A, B, C and D.
+  using ValTypeA  = float_e5m2_t;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = float;
+  using ValTypeD  = float;
+  // Only FrgTypeB is declared here: a K-major smem_desc.
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  // Operand layouts provided by the GMMA helpers for this tile size.
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<24, 32>;
+  using CLayout   = GMMA::CLayout_64x24;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias + MMA_Traits specialization: SM90 GMMA 64x40x32 SS_TN, A=e5m2, B=e5m2, C/D=half_t.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x40x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x40x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x40x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  // Atom shape (M, N, K) and the 128-element ThrID layout.
+  using Shape_MNK = Shape<_64, _40, _32>;
+  using ThrID     = Layout<_128>;
+  // Element (value) types for A, B, C and D.
+  using ValTypeA  = float_e5m2_t;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = half_t;
+  using ValTypeD  = half_t;
+  // Both A and B fragments are K-major smem_desc types.
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  // Operand layouts provided by the GMMA helpers for this tile size.
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  using BLayout   = GMMA::ABLayout<40, 32>;
+  using CLayout   = GMMA::CLayout_64x40;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias + MMA_Traits specialization: SM90 GMMA 64x40x32 RS_TN, A=e5m2, B=e5m2, C/D=half_t.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x40x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x40x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x40x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  // Atom shape (M, N, K) and the 128-element ThrID layout.
+  using Shape_MNK = Shape<_64, _40, _32>;
+  using ThrID     = Layout<_128>;
+  // Element (value) types for A, B, C and D.
+  using ValTypeA  = float_e5m2_t;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = half_t;
+  using ValTypeD  = half_t;
+  // Only FrgTypeB is declared here: a K-major smem_desc.
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  // Operand layouts provided by the GMMA helpers for this tile size.
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<40, 32>;
+  using CLayout   = GMMA::CLayout_64x40;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias + MMA_Traits specialization: SM90 GMMA 64x40x32 SS_TN, A=e5m2, B=e5m2, C/D=float.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x40x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x40x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x40x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  // Atom shape (M, N, K) and the 128-element ThrID layout.
+  using Shape_MNK = Shape<_64, _40, _32>;
+  using ThrID     = Layout<_128>;
+  // Element (value) types for A, B, C and D.
+  using ValTypeA  = float_e5m2_t;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = float;
+  using ValTypeD  = float;
+  // Both A and B fragments are K-major smem_desc types.
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  // Operand layouts provided by the GMMA helpers for this tile size.
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  using BLayout   = GMMA::ABLayout<40, 32>;
+  using CLayout   = GMMA::CLayout_64x40;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias + MMA_Traits specialization: SM90 GMMA 64x40x32 RS_TN, A=e5m2, B=e5m2, C/D=float.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x40x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x40x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x40x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  // Atom shape (M, N, K) and the 128-element ThrID layout.
+  using Shape_MNK = Shape<_64, _40, _32>;
+  using ThrID     = Layout<_128>;
+  // Element (value) types for A, B, C and D.
+  using ValTypeA  = float_e5m2_t;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = float;
+  using ValTypeD  = float;
+  // Only FrgTypeB is declared here: a K-major smem_desc.
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  // Operand layouts provided by the GMMA helpers for this tile size.
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<40, 32>;
+  using CLayout   = GMMA::CLayout_64x40;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias + MMA_Traits specialization: SM90 GMMA 64x48x32 SS_TN, A=e5m2, B=e5m2, C/D=half_t.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x48x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x48x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x48x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  // Atom shape (M, N, K) and the 128-element ThrID layout.
+  using Shape_MNK = Shape<_64, _48, _32>;
+  using ThrID     = Layout<_128>;
+  // Element (value) types for A, B, C and D.
+  using ValTypeA  = float_e5m2_t;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = half_t;
+  using ValTypeD  = half_t;
+  // Both A and B fragments are K-major smem_desc types.
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  // Operand layouts provided by the GMMA helpers for this tile size.
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  using BLayout   = GMMA::ABLayout<48, 32>;
+  using CLayout   = GMMA::CLayout_64x48;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias + MMA_Traits specialization: SM90 GMMA 64x48x32 RS_TN, A=e5m2, B=e5m2, C/D=half_t.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x48x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x48x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x48x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  // Atom shape (M, N, K) and the 128-element ThrID layout.
+  using Shape_MNK = Shape<_64, _48, _32>;
+  using ThrID     = Layout<_128>;
+  // Element (value) types for A, B, C and D.
+  using ValTypeA  = float_e5m2_t;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = half_t;
+  using ValTypeD  = half_t;
+  // Only FrgTypeB is declared here: a K-major smem_desc.
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  // Operand layouts provided by the GMMA helpers for this tile size.
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<48, 32>;
+  using CLayout   = GMMA::CLayout_64x48;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias + MMA_Traits specialization: SM90 GMMA 64x48x32 SS_TN, A=e5m2, B=e5m2, C/D=float.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x48x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x48x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x48x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  // Atom shape (M, N, K) and the 128-element ThrID layout.
+  using Shape_MNK = Shape<_64, _48, _32>;
+  using ThrID     = Layout<_128>;
+  // Element (value) types for A, B, C and D.
+  using ValTypeA  = float_e5m2_t;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = float;
+  using ValTypeD  = float;
+  // Both A and B fragments are K-major smem_desc types.
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  // Operand layouts provided by the GMMA helpers for this tile size.
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  using BLayout   = GMMA::ABLayout<48, 32>;
+  using CLayout   = GMMA::CLayout_64x48;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias + MMA_Traits specialization: SM90 GMMA 64x48x32 RS_TN, A=e5m2, B=e5m2, C/D=float.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x48x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x48x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x48x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  // Atom shape (M, N, K) and the 128-element ThrID layout.
+  using Shape_MNK = Shape<_64, _48, _32>;
+  using ThrID     = Layout<_128>;
+  // Element (value) types for A, B, C and D.
+  using ValTypeA  = float_e5m2_t;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = float;
+  using ValTypeD  = float;
+  // Only FrgTypeB is declared here: a K-major smem_desc.
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  // Operand layouts provided by the GMMA helpers for this tile size.
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<48, 32>;
+  using CLayout   = GMMA::CLayout_64x48;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias + MMA_Traits specialization: SM90 GMMA 64x56x32 SS_TN, A=e5m2, B=e5m2, C/D=half_t.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x56x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x56x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x56x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  // Atom shape (M, N, K) and the 128-element ThrID layout.
+  using Shape_MNK = Shape<_64, _56, _32>;
+  using ThrID     = Layout<_128>;
+  // Element (value) types for A, B, C and D.
+  using ValTypeA  = float_e5m2_t;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = half_t;
+  using ValTypeD  = half_t;
+  // Both A and B fragments are K-major smem_desc types.
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  // Operand layouts provided by the GMMA helpers for this tile size.
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  using BLayout   = GMMA::ABLayout<56, 32>;
+  using CLayout   = GMMA::CLayout_64x56;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias + MMA_Traits specialization: SM90 GMMA 64x56x32 RS_TN, A=e5m2, B=e5m2, C/D=half_t.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x56x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x56x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x56x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  // Atom shape (M, N, K) and the 128-element ThrID layout.
+  using Shape_MNK = Shape<_64, _56, _32>;
+  using ThrID     = Layout<_128>;
+  // Element (value) types for A, B, C and D.
+  using ValTypeA  = float_e5m2_t;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = half_t;
+  using ValTypeD  = half_t;
+  // Only FrgTypeB is declared here: a K-major smem_desc.
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  // Operand layouts provided by the GMMA helpers for this tile size.
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<56, 32>;
+  using CLayout   = GMMA::CLayout_64x56;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias + MMA_Traits specialization: SM90 GMMA 64x56x32 SS_TN, A=e5m2, B=e5m2, C/D=float.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x56x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x56x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x56x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  // Atom shape (M, N, K) and the 128-element ThrID layout.
+  using Shape_MNK = Shape<_64, _56, _32>;
+  using ThrID     = Layout<_128>;
+  // Element (value) types for A, B, C and D.
+  using ValTypeA  = float_e5m2_t;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = float;
+  using ValTypeD  = float;
+  // Both A and B fragments are K-major smem_desc types.
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  // Operand layouts provided by the GMMA helpers for this tile size.
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  using BLayout   = GMMA::ABLayout<56, 32>;
+  using CLayout   = GMMA::CLayout_64x56;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias + MMA_Traits specialization: SM90 GMMA 64x56x32 RS_TN, A=e5m2, B=e5m2, C/D=float.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x56x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x56x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x56x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  // Atom shape (M, N, K) and the 128-element ThrID layout.
+  using Shape_MNK = Shape<_64, _56, _32>;
+  using ThrID     = Layout<_128>;
+  // Element (value) types for A, B, C and D.
+  using ValTypeA  = float_e5m2_t;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = float;
+  using ValTypeD  = float;
+  // Only FrgTypeB is declared here: a K-major smem_desc.
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  // Operand layouts provided by the GMMA helpers for this tile size.
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<56, 32>;
+  using CLayout   = GMMA::CLayout_64x56;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias + MMA_Traits specialization: SM90 GMMA 64x72x32 SS_TN, A=e5m2, B=e5m2, C/D=half_t.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x72x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x72x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x72x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  // Atom shape (M, N, K) and the 128-element ThrID layout.
+  using Shape_MNK = Shape<_64, _72, _32>;
+  using ThrID     = Layout<_128>;
+  // Element (value) types for A, B, C and D.
+  using ValTypeA  = float_e5m2_t;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = half_t;
+  using ValTypeD  = half_t;
+  // Both A and B fragments are K-major smem_desc types.
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  // Operand layouts provided by the GMMA helpers for this tile size.
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  using BLayout   = GMMA::ABLayout<72, 32>;
+  using CLayout   = GMMA::CLayout_64x72;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias + MMA_Traits specialization: SM90 GMMA 64x72x32 RS_TN, A=e5m2, B=e5m2, C/D=half_t.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x72x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x72x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x72x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  // Atom shape (M, N, K) and the 128-element ThrID layout.
+  using Shape_MNK = Shape<_64, _72, _32>;
+  using ThrID     = Layout<_128>;
+  // Element (value) types for A, B, C and D.
+  using ValTypeA  = float_e5m2_t;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = half_t;
+  using ValTypeD  = half_t;
+  // Only FrgTypeB is declared here: a K-major smem_desc.
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  // Operand layouts provided by the GMMA helpers for this tile size.
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<72, 32>;
+  using CLayout   = GMMA::CLayout_64x72;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias + MMA_Traits specialization: SM90 GMMA 64x72x32 SS_TN, A=e5m2, B=e5m2, C/D=float.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x72x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x72x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x72x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  // Atom shape (M, N, K) and the 128-element ThrID layout.
+  using Shape_MNK = Shape<_64, _72, _32>;
+  using ThrID     = Layout<_128>;
+  // Element (value) types for A, B, C and D.
+  using ValTypeA  = float_e5m2_t;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = float;
+  using ValTypeD  = float;
+  // Both A and B fragments are K-major smem_desc types.
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  // Operand layouts provided by the GMMA helpers for this tile size.
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  using BLayout   = GMMA::ABLayout<72, 32>;
+  using CLayout   = GMMA::CLayout_64x72;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias + MMA_Traits specialization: SM90 GMMA 64x72x32 RS_TN, A=e5m2, B=e5m2, C/D=float.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x72x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x72x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x72x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  // Atom shape (M, N, K) and the 128-element ThrID layout.
+  using Shape_MNK = Shape<_64, _72, _32>;
+  using ThrID     = Layout<_128>;
+  // Element (value) types for A, B, C and D.
+  using ValTypeA  = float_e5m2_t;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = float;
+  using ValTypeD  = float;
+  // Only FrgTypeB is declared here: a K-major smem_desc.
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  // Operand layouts provided by the GMMA helpers for this tile size.
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<72, 32>;
+  using CLayout   = GMMA::CLayout_64x72;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias + MMA_Traits specialization: SM90 GMMA 64x80x32 SS_TN, A=e5m2, B=e5m2, C/D=half_t.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x80x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x80x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x80x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  // Atom shape (M, N, K) and the 128-element ThrID layout.
+  using Shape_MNK = Shape<_64, _80, _32>;
+  using ThrID     = Layout<_128>;
+  // Element (value) types for A, B, C and D.
+  using ValTypeA  = float_e5m2_t;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = half_t;
+  using ValTypeD  = half_t;
+  // Both A and B fragments are K-major smem_desc types.
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  // Operand layouts provided by the GMMA helpers for this tile size.
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  using BLayout   = GMMA::ABLayout<80, 32>;
+  using CLayout   = GMMA::CLayout_64x80;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias + MMA_Traits specialization: SM90 GMMA 64x80x32 RS_TN, A=e5m2, B=e5m2, C/D=half_t.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x80x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x80x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x80x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  // Atom shape (M, N, K) and the 128-element ThrID layout.
+  using Shape_MNK = Shape<_64, _80, _32>;
+  using ThrID     = Layout<_128>;
+  // Element (value) types for A, B, C and D.
+  using ValTypeA  = float_e5m2_t;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = half_t;
+  using ValTypeD  = half_t;
+  // Only FrgTypeB is declared here: a K-major smem_desc.
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  // Operand layouts provided by the GMMA helpers for this tile size.
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<80, 32>;
+  using CLayout   = GMMA::CLayout_64x80;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias + MMA_Traits specialization: SM90 GMMA 64x80x32 SS_TN, A=e5m2, B=e5m2, C/D=float.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x80x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x80x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x80x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  // Atom shape (M, N, K) and the 128-element ThrID layout.
+  using Shape_MNK = Shape<_64, _80, _32>;
+  using ThrID     = Layout<_128>;
+  // Element (value) types for A, B, C and D.
+  using ValTypeA  = float_e5m2_t;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = float;
+  using ValTypeD  = float;
+  // Both A and B fragments are K-major smem_desc types.
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  // Operand layouts provided by the GMMA helpers for this tile size.
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  using BLayout   = GMMA::ABLayout<80, 32>;
+  using CLayout   = GMMA::CLayout_64x80;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias + MMA_Traits specialization: SM90 GMMA 64x80x32 RS_TN, A=e5m2, B=e5m2, C/D=float.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x80x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x80x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x80x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  // Atom shape (M, N, K) and the 128-element ThrID layout.
+  using Shape_MNK = Shape<_64, _80, _32>;
+  using ThrID     = Layout<_128>;
+  // Element (value) types for A, B, C and D.
+  using ValTypeA  = float_e5m2_t;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = float;
+  using ValTypeD  = float;
+  // Only FrgTypeB is declared here: a K-major smem_desc.
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  // Operand layouts provided by the GMMA helpers for this tile size.
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<80, 32>;
+  using CLayout   = GMMA::CLayout_64x80;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias + MMA_Traits specialization: SM90 GMMA 64x88x32 SS_TN, A=e5m2, B=e5m2, C/D=half_t.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x88x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x88x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x88x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  // Atom shape (M, N, K) and the 128-element ThrID layout.
+  using Shape_MNK = Shape<_64, _88, _32>;
+  using ThrID     = Layout<_128>;
+  // Element (value) types for A, B, C and D.
+  using ValTypeA  = float_e5m2_t;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = half_t;
+  using ValTypeD  = half_t;
+  // Both A and B fragments are K-major smem_desc types.
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  // Operand layouts provided by the GMMA helpers for this tile size.
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  using BLayout   = GMMA::ABLayout<88, 32>;
+  using CLayout   = GMMA::CLayout_64x88;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias + MMA_Traits specialization: SM90 GMMA 64x88x32 RS_TN, A=e5m2, B=e5m2, C/D=half_t.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x88x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x88x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x88x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  // Atom shape (M, N, K) and the 128-element ThrID layout.
+  using Shape_MNK = Shape<_64, _88, _32>;
+  using ThrID     = Layout<_128>;
+  // Element (value) types for A, B, C and D.
+  using ValTypeA  = float_e5m2_t;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = half_t;
+  using ValTypeD  = half_t;
+  // Only FrgTypeB is declared here: a K-major smem_desc.
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  // Operand layouts provided by the GMMA helpers for this tile size.
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<88, 32>;
+  using CLayout   = GMMA::CLayout_64x88;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias + MMA_Traits specialization: SM90 GMMA 64x88x32 SS_TN, A=e5m2, B=e5m2, C/D=float.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x88x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x88x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x88x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  // Atom shape (M, N, K) and the 128-element ThrID layout.
+  using Shape_MNK = Shape<_64, _88, _32>;
+  using ThrID     = Layout<_128>;
+  // Element (value) types for A, B, C and D.
+  using ValTypeA  = float_e5m2_t;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = float;
+  using ValTypeD  = float;
+  // Both A and B fragments are K-major smem_desc types.
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  // Operand layouts provided by the GMMA helpers for this tile size.
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  using BLayout   = GMMA::ABLayout<88, 32>;
+  using CLayout   = GMMA::CLayout_64x88;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias + MMA_Traits specialization: SM90 GMMA 64x88x32 RS_TN, A=e5m2, B=e5m2, C/D=float.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x88x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x88x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x88x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  // Atom shape (M, N, K) and the 128-element ThrID layout.
+  using Shape_MNK = Shape<_64, _88, _32>;
+  using ThrID     = Layout<_128>;
+  // Element (value) types for A, B, C and D.
+  using ValTypeA  = float_e5m2_t;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = float;
+  using ValTypeD  = float;
+  // Only FrgTypeB is declared here: a K-major smem_desc.
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  // Operand layouts provided by the GMMA helpers for this tile size.
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<88, 32>;
+  using CLayout   = GMMA::CLayout_64x88;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias + MMA_Traits specialization: SM90 GMMA 64x104x32 SS_TN, A=e5m2, B=e5m2, C/D=half_t.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x104x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x104x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x104x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  // Atom shape (M, N, K) and the 128-element ThrID layout.
+  using Shape_MNK = Shape<_64, _104, _32>;
+  using ThrID     = Layout<_128>;
+  // Element (value) types for A, B, C and D.
+  using ValTypeA  = float_e5m2_t;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = half_t;
+  using ValTypeD  = half_t;
+  // Both A and B fragments are K-major smem_desc types.
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  // Operand layouts provided by the GMMA helpers for this tile size.
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  using BLayout   = GMMA::ABLayout<104, 32>;
+  using CLayout   = GMMA::CLayout_64x104;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias + MMA_Traits specialization: SM90 GMMA 64x104x32 RS_TN, A=e5m2, B=e5m2, C/D=half_t.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x104x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x104x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x104x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  // Atom shape (M, N, K) and the 128-element ThrID layout.
+  using Shape_MNK = Shape<_64, _104, _32>;
+  using ThrID     = Layout<_128>;
+  // Element (value) types for A, B, C and D.
+  using ValTypeA  = float_e5m2_t;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = half_t;
+  using ValTypeD  = half_t;
+  // Only FrgTypeB is declared here: a K-major smem_desc.
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  // Operand layouts provided by the GMMA helpers for this tile size.
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<104, 32>;
+  using CLayout   = GMMA::CLayout_64x104;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias + MMA_Traits specialization: SM90 GMMA 64x104x32 SS_TN, A=e5m2, B=e5m2, C/D=float.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x104x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x104x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x104x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  // Atom shape (M, N, K) and the 128-element ThrID layout.
+  using Shape_MNK = Shape<_64, _104, _32>;
+  using ThrID     = Layout<_128>;
+  // Element (value) types for A, B, C and D.
+  using ValTypeA  = float_e5m2_t;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = float;
+  using ValTypeD  = float;
+  // Both A and B fragments are K-major smem_desc types.
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  // Operand layouts provided by the GMMA helpers for this tile size.
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  using BLayout   = GMMA::ABLayout<104, 32>;
+  using CLayout   = GMMA::CLayout_64x104;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Alias + MMA_Traits specialization: SM90 GMMA 64x104x32 RS_TN, A=e5m2, B=e5m2, C/D=float.
+template <GMMA::ScaleIn scaleA = GMMA::ScaleIn::One, GMMA::ScaleIn scaleB = GMMA::ScaleIn::One>
+using SM90_64x104x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x104x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>;
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x104x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  // Atom shape (M, N, K) and the 128-element ThrID layout.
+  using Shape_MNK = Shape<_64, _104, _32>;
+  using ThrID     = Layout<_128>;
+  // Element (value) types for A, B, C and D.
+  using ValTypeA  = float_e5m2_t;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = float;
+  using ValTypeD  = float;
+  // Only FrgTypeB is declared here: a K-major smem_desc.
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  // Operand layouts provided by the GMMA helpers for this tile size.
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<104, 32>;
+  using CLayout   = GMMA::CLayout_64x104;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Flat SM90_* name for SM90::GMMA::MMA_64x112x32_F16E5M2E5M2_SS_TN; scaleA/scaleB default to ScaleIn::One.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x112x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x112x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above: half_t D/C, float_e5m2_t A/B, Shape_MNK 64x112x32.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x112x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Both A and B fragments are GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) extents, a size-128 ThrID layout, and the A/B/C layouts for this tile.
+  using Shape_MNK = Shape<_64,_112,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<112, 32>;
+  using CLayout = GMMA::CLayout_64x112;
+  // Non-static member initialised to ScaleOut::One, so each traits instance carries its own value.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Flat SM90_* name for SM90::GMMA::MMA_64x112x32_F16E5M2E5M2_RS_TN; scaleA/scaleB default to ScaleIn::One.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x112x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x112x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above: half_t D/C, float_e5m2_t A/B, Shape_MNK 64x112x32.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x112x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Only FrgTypeB is declared (GMMA::smem_desc, Major::K); this RS form has no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) extents, a size-128 ThrID layout, and the A/B/C layouts; A uses GMMA::ALayout_64x32.
+  using Shape_MNK = Shape<_64,_112,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<112, 32>;
+  using CLayout = GMMA::CLayout_64x112;
+  // Non-static member initialised to ScaleOut::One, so each traits instance carries its own value.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Flat SM90_* name for SM90::GMMA::MMA_64x112x32_F32E5M2E5M2_SS_TN; scaleA/scaleB default to ScaleIn::One.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x112x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x112x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above: float D/C, float_e5m2_t A/B, Shape_MNK 64x112x32.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x112x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Both A and B fragments are GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) extents, a size-128 ThrID layout, and the A/B/C layouts for this tile.
+  using Shape_MNK = Shape<_64,_112,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<112, 32>;
+  using CLayout = GMMA::CLayout_64x112;
+  // Non-static member initialised to ScaleOut::One, so each traits instance carries its own value.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Flat SM90_* name for SM90::GMMA::MMA_64x112x32_F32E5M2E5M2_RS_TN; scaleA/scaleB default to ScaleIn::One.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x112x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x112x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above: float D/C, float_e5m2_t A/B, Shape_MNK 64x112x32.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x112x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared (GMMA::smem_desc, Major::K); this RS form has no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) extents, a size-128 ThrID layout, and the A/B/C layouts; A uses GMMA::ALayout_64x32.
+  using Shape_MNK = Shape<_64,_112,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<112, 32>;
+  using CLayout = GMMA::CLayout_64x112;
+  // Non-static member initialised to ScaleOut::One, so each traits instance carries its own value.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Flat SM90_* name for SM90::GMMA::MMA_64x120x32_F16E5M2E5M2_SS_TN; scaleA/scaleB default to ScaleIn::One.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x120x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x120x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above: half_t D/C, float_e5m2_t A/B, Shape_MNK 64x120x32.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x120x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Both A and B fragments are GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) extents, a size-128 ThrID layout, and the A/B/C layouts for this tile.
+  using Shape_MNK = Shape<_64,_120,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<120, 32>;
+  using CLayout = GMMA::CLayout_64x120;
+  // Non-static member initialised to ScaleOut::One, so each traits instance carries its own value.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Flat SM90_* name for SM90::GMMA::MMA_64x120x32_F16E5M2E5M2_RS_TN; scaleA/scaleB default to ScaleIn::One.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x120x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x120x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above: half_t D/C, float_e5m2_t A/B, Shape_MNK 64x120x32.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x120x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Only FrgTypeB is declared (GMMA::smem_desc, Major::K); this RS form has no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) extents, a size-128 ThrID layout, and the A/B/C layouts; A uses GMMA::ALayout_64x32.
+  using Shape_MNK = Shape<_64,_120,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<120, 32>;
+  using CLayout = GMMA::CLayout_64x120;
+  // Non-static member initialised to ScaleOut::One, so each traits instance carries its own value.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Flat SM90_* name for SM90::GMMA::MMA_64x120x32_F32E5M2E5M2_SS_TN; scaleA/scaleB default to ScaleIn::One.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x120x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x120x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above: float D/C, float_e5m2_t A/B, Shape_MNK 64x120x32.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x120x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Both A and B fragments are GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) extents, a size-128 ThrID layout, and the A/B/C layouts for this tile.
+  using Shape_MNK = Shape<_64,_120,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<120, 32>;
+  using CLayout = GMMA::CLayout_64x120;
+  // Non-static member initialised to ScaleOut::One, so each traits instance carries its own value.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Flat SM90_* name for SM90::GMMA::MMA_64x120x32_F32E5M2E5M2_RS_TN; scaleA/scaleB default to ScaleIn::One.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x120x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x120x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above: float D/C, float_e5m2_t A/B, Shape_MNK 64x120x32.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x120x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared (GMMA::smem_desc, Major::K); this RS form has no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) extents, a size-128 ThrID layout, and the A/B/C layouts; A uses GMMA::ALayout_64x32.
+  using Shape_MNK = Shape<_64,_120,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<120, 32>;
+  using CLayout = GMMA::CLayout_64x120;
+  // Non-static member initialised to ScaleOut::One, so each traits instance carries its own value.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Flat SM90_* name for SM90::GMMA::MMA_64x136x32_F16E5M2E5M2_SS_TN; scaleA/scaleB default to ScaleIn::One.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x136x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x136x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above: half_t D/C, float_e5m2_t A/B, Shape_MNK 64x136x32.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x136x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Both A and B fragments are GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) extents, a size-128 ThrID layout, and the A/B/C layouts for this tile.
+  using Shape_MNK = Shape<_64,_136,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<136, 32>;
+  using CLayout = GMMA::CLayout_64x136;
+  // Non-static member initialised to ScaleOut::One, so each traits instance carries its own value.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Flat SM90_* name for SM90::GMMA::MMA_64x136x32_F16E5M2E5M2_RS_TN; scaleA/scaleB default to ScaleIn::One.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x136x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x136x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above: half_t D/C, float_e5m2_t A/B, Shape_MNK 64x136x32.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x136x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Only FrgTypeB is declared (GMMA::smem_desc, Major::K); this RS form has no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) extents, a size-128 ThrID layout, and the A/B/C layouts; A uses GMMA::ALayout_64x32.
+  using Shape_MNK = Shape<_64,_136,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<136, 32>;
+  using CLayout = GMMA::CLayout_64x136;
+  // Non-static member initialised to ScaleOut::One, so each traits instance carries its own value.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Flat SM90_* name for SM90::GMMA::MMA_64x136x32_F32E5M2E5M2_SS_TN; scaleA/scaleB default to ScaleIn::One.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x136x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x136x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above: float D/C, float_e5m2_t A/B, Shape_MNK 64x136x32.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x136x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Both A and B fragments are GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) extents, a size-128 ThrID layout, and the A/B/C layouts for this tile.
+  using Shape_MNK = Shape<_64,_136,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<136, 32>;
+  using CLayout = GMMA::CLayout_64x136;
+  // Non-static member initialised to ScaleOut::One, so each traits instance carries its own value.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Flat SM90_* name for SM90::GMMA::MMA_64x136x32_F32E5M2E5M2_RS_TN; scaleA/scaleB default to ScaleIn::One.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x136x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x136x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above: float D/C, float_e5m2_t A/B, Shape_MNK 64x136x32.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x136x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared (GMMA::smem_desc, Major::K); this RS form has no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) extents, a size-128 ThrID layout, and the A/B/C layouts; A uses GMMA::ALayout_64x32.
+  using Shape_MNK = Shape<_64,_136,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<136, 32>;
+  using CLayout = GMMA::CLayout_64x136;
+  // Non-static member initialised to ScaleOut::One, so each traits instance carries its own value.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Flat SM90_* name for SM90::GMMA::MMA_64x144x32_F16E5M2E5M2_SS_TN; scaleA/scaleB default to ScaleIn::One.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x144x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x144x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above: half_t D/C, float_e5m2_t A/B, Shape_MNK 64x144x32.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x144x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Both A and B fragments are GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) extents, a size-128 ThrID layout, and the A/B/C layouts for this tile.
+  using Shape_MNK = Shape<_64,_144,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<144, 32>;
+  using CLayout = GMMA::CLayout_64x144;
+  // Non-static member initialised to ScaleOut::One, so each traits instance carries its own value.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Flat SM90_* name for SM90::GMMA::MMA_64x144x32_F16E5M2E5M2_RS_TN; scaleA/scaleB default to ScaleIn::One.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x144x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x144x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above: half_t D/C, float_e5m2_t A/B, Shape_MNK 64x144x32.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x144x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Only FrgTypeB is declared (GMMA::smem_desc, Major::K); this RS form has no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) extents, a size-128 ThrID layout, and the A/B/C layouts; A uses GMMA::ALayout_64x32.
+  using Shape_MNK = Shape<_64,_144,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<144, 32>;
+  using CLayout = GMMA::CLayout_64x144;
+  // Non-static member initialised to ScaleOut::One, so each traits instance carries its own value.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Flat SM90_* name for SM90::GMMA::MMA_64x144x32_F32E5M2E5M2_SS_TN; scaleA/scaleB default to ScaleIn::One.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x144x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x144x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above: float D/C, float_e5m2_t A/B, Shape_MNK 64x144x32.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x144x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Both A and B fragments are GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) extents, a size-128 ThrID layout, and the A/B/C layouts for this tile.
+  using Shape_MNK = Shape<_64,_144,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<144, 32>;
+  using CLayout = GMMA::CLayout_64x144;
+  // Non-static member initialised to ScaleOut::One, so each traits instance carries its own value.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Flat SM90_* name for SM90::GMMA::MMA_64x144x32_F32E5M2E5M2_RS_TN; scaleA/scaleB default to ScaleIn::One.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x144x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x144x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above: float D/C, float_e5m2_t A/B, Shape_MNK 64x144x32.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x144x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared (GMMA::smem_desc, Major::K); this RS form has no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) extents, a size-128 ThrID layout, and the A/B/C layouts; A uses GMMA::ALayout_64x32.
+  using Shape_MNK = Shape<_64,_144,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<144, 32>;
+  using CLayout = GMMA::CLayout_64x144;
+  // Non-static member initialised to ScaleOut::One, so each traits instance carries its own value.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Flat SM90_* name for SM90::GMMA::MMA_64x152x32_F16E5M2E5M2_SS_TN; scaleA/scaleB default to ScaleIn::One.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x152x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x152x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above: half_t D/C, float_e5m2_t A/B, Shape_MNK 64x152x32.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x152x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Both A and B fragments are GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) extents, a size-128 ThrID layout, and the A/B/C layouts for this tile.
+  using Shape_MNK = Shape<_64,_152,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<152, 32>;
+  using CLayout = GMMA::CLayout_64x152;
+  // Non-static member initialised to ScaleOut::One, so each traits instance carries its own value.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Flat SM90_* name for SM90::GMMA::MMA_64x152x32_F16E5M2E5M2_RS_TN; scaleA/scaleB default to ScaleIn::One.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x152x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x152x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above: half_t D/C, float_e5m2_t A/B, Shape_MNK 64x152x32.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x152x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Only FrgTypeB is declared (GMMA::smem_desc, Major::K); this RS form has no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) extents, a size-128 ThrID layout, and the A/B/C layouts; A uses GMMA::ALayout_64x32.
+  using Shape_MNK = Shape<_64,_152,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<152, 32>;
+  using CLayout = GMMA::CLayout_64x152;
+  // Non-static member initialised to ScaleOut::One, so each traits instance carries its own value.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Flat SM90_* name for SM90::GMMA::MMA_64x152x32_F32E5M2E5M2_SS_TN; scaleA/scaleB default to ScaleIn::One.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x152x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x152x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above: float D/C, float_e5m2_t A/B, Shape_MNK 64x152x32.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x152x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Both A and B fragments are GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) extents, a size-128 ThrID layout, and the A/B/C layouts for this tile.
+  using Shape_MNK = Shape<_64,_152,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<152, 32>;
+  using CLayout = GMMA::CLayout_64x152;
+  // Non-static member initialised to ScaleOut::One, so each traits instance carries its own value.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Flat SM90_* name for SM90::GMMA::MMA_64x152x32_F32E5M2E5M2_RS_TN; scaleA/scaleB default to ScaleIn::One.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x152x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x152x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above: float D/C, float_e5m2_t A/B, Shape_MNK 64x152x32.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x152x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared (GMMA::smem_desc, Major::K); this RS form has no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) extents, a size-128 ThrID layout, and the A/B/C layouts; A uses GMMA::ALayout_64x32.
+  using Shape_MNK = Shape<_64,_152,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<152, 32>;
+  using CLayout = GMMA::CLayout_64x152;
+  // Non-static member initialised to ScaleOut::One, so each traits instance carries its own value.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Flat SM90_* name for SM90::GMMA::MMA_64x160x32_F16E5M2E5M2_SS_TN; scaleA/scaleB default to ScaleIn::One.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x160x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x160x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above: half_t D/C, float_e5m2_t A/B, Shape_MNK 64x160x32.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x160x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Both A and B fragments are GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) extents, a size-128 ThrID layout, and the A/B/C layouts for this tile.
+  using Shape_MNK = Shape<_64,_160,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<160, 32>;
+  using CLayout = GMMA::CLayout_64x160;
+  // Non-static member initialised to ScaleOut::One, so each traits instance carries its own value.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Flat SM90_* name for SM90::GMMA::MMA_64x160x32_F16E5M2E5M2_RS_TN; scaleA/scaleB default to ScaleIn::One.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x160x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x160x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above: half_t D/C, float_e5m2_t A/B, Shape_MNK 64x160x32.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x160x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Only FrgTypeB is declared (GMMA::smem_desc, Major::K); this RS form has no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) extents, a size-128 ThrID layout, and the A/B/C layouts; A uses GMMA::ALayout_64x32.
+  using Shape_MNK = Shape<_64,_160,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<160, 32>;
+  using CLayout = GMMA::CLayout_64x160;
+  // Non-static member initialised to ScaleOut::One, so each traits instance carries its own value.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Flat SM90_* name for SM90::GMMA::MMA_64x160x32_F32E5M2E5M2_SS_TN; scaleA/scaleB default to ScaleIn::One.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x160x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x160x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above: float D/C, float_e5m2_t A/B, Shape_MNK 64x160x32.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x160x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Both A and B fragments are GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) extents, a size-128 ThrID layout, and the A/B/C layouts for this tile.
+  using Shape_MNK = Shape<_64,_160,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<160, 32>;
+  using CLayout = GMMA::CLayout_64x160;
+  // Non-static member initialised to ScaleOut::One, so each traits instance carries its own value.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Flat SM90_* name for SM90::GMMA::MMA_64x160x32_F32E5M2E5M2_RS_TN; scaleA/scaleB default to ScaleIn::One.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x160x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x160x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above: float D/C, float_e5m2_t A/B, Shape_MNK 64x160x32.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x160x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared (GMMA::smem_desc, Major::K); this RS form has no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) extents, a size-128 ThrID layout, and the A/B/C layouts; A uses GMMA::ALayout_64x32.
+  using Shape_MNK = Shape<_64,_160,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<160, 32>;
+  using CLayout = GMMA::CLayout_64x160;
+  // Non-static member initialised to ScaleOut::One, so each traits instance carries its own value.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Flat SM90_* name for SM90::GMMA::MMA_64x168x32_F16E5M2E5M2_SS_TN; scaleA/scaleB default to ScaleIn::One.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x168x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x168x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above: half_t D/C, float_e5m2_t A/B, Shape_MNK 64x168x32.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x168x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Both A and B fragments are GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) extents, a size-128 ThrID layout, and the A/B/C layouts for this tile.
+  using Shape_MNK = Shape<_64,_168,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<168, 32>;
+  using CLayout = GMMA::CLayout_64x168;
+  // Non-static member initialised to ScaleOut::One, so each traits instance carries its own value.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Flat SM90_* name for SM90::GMMA::MMA_64x168x32_F16E5M2E5M2_RS_TN; scaleA/scaleB default to ScaleIn::One.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x168x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x168x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above: half_t D/C, float_e5m2_t A/B, Shape_MNK 64x168x32.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x168x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Only FrgTypeB is declared (GMMA::smem_desc, Major::K); this RS form has no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) extents, a size-128 ThrID layout, and the A/B/C layouts; A uses GMMA::ALayout_64x32.
+  using Shape_MNK = Shape<_64,_168,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<168, 32>;
+  using CLayout = GMMA::CLayout_64x168;
+  // Non-static member initialised to ScaleOut::One, so each traits instance carries its own value.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Flat SM90_* name for SM90::GMMA::MMA_64x168x32_F32E5M2E5M2_SS_TN; scaleA/scaleB default to ScaleIn::One.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x168x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x168x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above: float D/C, float_e5m2_t A/B, Shape_MNK 64x168x32.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x168x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Both A and B fragments are GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) extents, a size-128 ThrID layout, and the A/B/C layouts for this tile.
+  using Shape_MNK = Shape<_64,_168,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<168, 32>;
+  using CLayout = GMMA::CLayout_64x168;
+  // Non-static member initialised to ScaleOut::One, so each traits instance carries its own value.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Flat SM90_* name for SM90::GMMA::MMA_64x168x32_F32E5M2E5M2_RS_TN; scaleA/scaleB default to ScaleIn::One.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x168x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x168x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above: float D/C, float_e5m2_t A/B, Shape_MNK 64x168x32.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x168x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared (GMMA::smem_desc, Major::K); this RS form has no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) extents, a size-128 ThrID layout, and the A/B/C layouts; A uses GMMA::ALayout_64x32.
+  using Shape_MNK = Shape<_64,_168,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<168, 32>;
+  using CLayout = GMMA::CLayout_64x168;
+  // Non-static member initialised to ScaleOut::One, so each traits instance carries its own value.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Flat SM90_* name for SM90::GMMA::MMA_64x176x32_F16E5M2E5M2_SS_TN; scaleA/scaleB default to ScaleIn::One.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x176x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x176x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above: half_t D/C, float_e5m2_t A/B, Shape_MNK 64x176x32.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x176x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Both A and B fragments are GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) extents, a size-128 ThrID layout, and the A/B/C layouts for this tile.
+  using Shape_MNK = Shape<_64,_176,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<176, 32>;
+  using CLayout = GMMA::CLayout_64x176;
+  // Non-static member initialised to ScaleOut::One, so each traits instance carries its own value.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Flat SM90_* name for SM90::GMMA::MMA_64x176x32_F16E5M2E5M2_RS_TN; scaleA/scaleB default to ScaleIn::One.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x176x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x176x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above: half_t D/C, float_e5m2_t A/B, Shape_MNK 64x176x32.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x176x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Only FrgTypeB is declared (GMMA::smem_desc, Major::K); this RS form has no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) extents, a size-128 ThrID layout, and the A/B/C layouts; A uses GMMA::ALayout_64x32.
+  using Shape_MNK = Shape<_64,_176,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<176, 32>;
+  using CLayout = GMMA::CLayout_64x176;
+  // Non-static member initialised to ScaleOut::One, so each traits instance carries its own value.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Flat SM90_* name for SM90::GMMA::MMA_64x176x32_F32E5M2E5M2_SS_TN; scaleA/scaleB default to ScaleIn::One.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x176x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x176x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above: float D/C, float_e5m2_t A/B, Shape_MNK 64x176x32.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x176x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Both A and B fragments are GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) extents, a size-128 ThrID layout, and the A/B/C layouts for this tile.
+  using Shape_MNK = Shape<_64,_176,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<176, 32>;
+  using CLayout = GMMA::CLayout_64x176;
+  // Non-static member initialised to ScaleOut::One, so each traits instance carries its own value.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Flat SM90_* name for SM90::GMMA::MMA_64x176x32_F32E5M2E5M2_RS_TN; scaleA/scaleB default to ScaleIn::One.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x176x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x176x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above: float D/C, float_e5m2_t A/B, Shape_MNK 64x176x32.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x176x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Only FrgTypeB is declared (GMMA::smem_desc, Major::K); this RS form has no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) extents, a size-128 ThrID layout, and the A/B/C layouts; A uses GMMA::ALayout_64x32.
+  using Shape_MNK = Shape<_64,_176,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<176, 32>;
+  using CLayout = GMMA::CLayout_64x176;
+  // Non-static member initialised to ScaleOut::One, so each traits instance carries its own value.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Flat SM90_* name for SM90::GMMA::MMA_64x184x32_F16E5M2E5M2_SS_TN; scaleA/scaleB default to ScaleIn::One.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x184x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x184x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above: half_t D/C, float_e5m2_t A/B, Shape_MNK 64x184x32.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x184x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Both A and B fragments are GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) extents, a size-128 ThrID layout, and the A/B/C layouts for this tile.
+  using Shape_MNK = Shape<_64,_184,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<184, 32>;
+  using CLayout = GMMA::CLayout_64x184;
+  // Non-static member initialised to ScaleOut::One, so each traits instance carries its own value.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Flat SM90_* name for SM90::GMMA::MMA_64x184x32_F16E5M2E5M2_RS_TN; scaleA/scaleB default to ScaleIn::One.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x184x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x184x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; 
+// MMA_Traits for the alias above: half_t D/C, float_e5m2_t A/B, Shape_MNK 64x184x32.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x184x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  // Only FrgTypeB is declared (GMMA::smem_desc, Major::K); this RS form has no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) extents, a size-128 ThrID layout, and the A/B/C layouts; A uses GMMA::ALayout_64x32.
+  using Shape_MNK = Shape<_64,_184,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using BLayout = GMMA::ABLayout<184, 32>;
+  using CLayout = GMMA::CLayout_64x184;
+  // Non-static member initialised to ScaleOut::One, so each traits instance carries its own value.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Flat SM90_* name for SM90::GMMA::MMA_64x184x32_F32E5M2E5M2_SS_TN; scaleA/scaleB default to ScaleIn::One.
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
+>
+using SM90_64x184x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x184x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
+// MMA_Traits for the alias above: float D/C, float_e5m2_t A/B, Shape_MNK 64x184x32.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x184x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  // Both A and B fragments are GMMA::smem_desc with Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // (M,N,K) extents, a size-128 ThrID layout, and the A/B/C layouts for this tile.
+  using Shape_MNK = Shape<_64,_184,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using BLayout = GMMA::ABLayout<184, 32>;
+  using CLayout = GMMA::CLayout_64x184;
+  // Non-static member initialised to ScaleOut::One, so each traits instance carries its own value.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn selector for A, forwarded to the op
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // ScaleIn selector for B, forwarded to the op
+>
+using SM90_64x184x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x184x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; // cute-level short name for the SM90::GMMA op
+// Traits of the 64x184x32 RS atom: e5m2 A/B, float C/D; only B has a smem-descriptor fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x184x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor (no FrgTypeA is declared here)
+
+  using Shape_MNK = Shape<_64,_184,_32>;  // M=64, N=184, K=32
+  using ThrID   = Layout<_128>;  // layout of 128 thread IDs
+  using ALayout = GMMA::ALayout_64x32;  // dedicated 64x32 A layout -- NOTE(review): presumably A comes from registers; confirm
+  using BLayout = GMMA::ABLayout<184, 32>;  // B layout over the 184x32 (N x K) tile
+  using CLayout = GMMA::CLayout_64x184;  // C layout for the 64x184 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn selector for A, forwarded to the op
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // ScaleIn selector for B, forwarded to the op
+>
+using SM90_64x200x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x200x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;  // cute-level short name for the SM90::GMMA op
+// Traits of the 64x200x32 SS atom: e5m2 A/B, half_t C/D; A and B are K-major smem descriptors.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x200x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_200,_32>;  // M=64, N=200, K=32
+  using ThrID   = Layout<_128>;  // layout of 128 thread IDs
+  using ALayout = GMMA::ABLayout< 64, 32>;  // A layout over the 64x32 (M x K) tile
+  using BLayout = GMMA::ABLayout<200, 32>;  // B layout over the 200x32 (N x K) tile
+  using CLayout = GMMA::CLayout_64x200;  // C layout for the 64x200 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn selector for A, forwarded to the op
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // ScaleIn selector for B, forwarded to the op
+>
+using SM90_64x200x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x200x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; // cute-level short name for the SM90::GMMA op
+// Traits of the 64x200x32 RS atom: e5m2 A/B, half_t C/D; only B has a smem-descriptor fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x200x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor (no FrgTypeA is declared here)
+
+  using Shape_MNK = Shape<_64,_200,_32>;  // M=64, N=200, K=32
+  using ThrID   = Layout<_128>;  // layout of 128 thread IDs
+  using ALayout = GMMA::ALayout_64x32;  // dedicated 64x32 A layout -- NOTE(review): presumably A comes from registers; confirm
+  using BLayout = GMMA::ABLayout<200, 32>;  // B layout over the 200x32 (N x K) tile
+  using CLayout = GMMA::CLayout_64x200;  // C layout for the 64x200 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn selector for A, forwarded to the op
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // ScaleIn selector for B, forwarded to the op
+>
+using SM90_64x200x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x200x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;  // cute-level short name for the SM90::GMMA op
+// Traits of the 64x200x32 SS atom: e5m2 A/B, float C/D; A and B are K-major smem descriptors.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x200x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_200,_32>;  // M=64, N=200, K=32
+  using ThrID   = Layout<_128>;  // layout of 128 thread IDs
+  using ALayout = GMMA::ABLayout< 64, 32>;  // A layout over the 64x32 (M x K) tile
+  using BLayout = GMMA::ABLayout<200, 32>;  // B layout over the 200x32 (N x K) tile
+  using CLayout = GMMA::CLayout_64x200;  // C layout for the 64x200 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn selector for A, forwarded to the op
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // ScaleIn selector for B, forwarded to the op
+>
+using SM90_64x200x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x200x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; // cute-level short name for the SM90::GMMA op
+// Traits of the 64x200x32 RS atom: e5m2 A/B, float C/D; only B has a smem-descriptor fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x200x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor (no FrgTypeA is declared here)
+
+  using Shape_MNK = Shape<_64,_200,_32>;  // M=64, N=200, K=32
+  using ThrID   = Layout<_128>;  // layout of 128 thread IDs
+  using ALayout = GMMA::ALayout_64x32;  // dedicated 64x32 A layout -- NOTE(review): presumably A comes from registers; confirm
+  using BLayout = GMMA::ABLayout<200, 32>;  // B layout over the 200x32 (N x K) tile
+  using CLayout = GMMA::CLayout_64x200;  // C layout for the 64x200 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn selector for A, forwarded to the op
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // ScaleIn selector for B, forwarded to the op
+>
+using SM90_64x208x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x208x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;  // cute-level short name for the SM90::GMMA op
+// Traits of the 64x208x32 SS atom: e5m2 A/B, half_t C/D; A and B are K-major smem descriptors.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x208x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_208,_32>;  // M=64, N=208, K=32
+  using ThrID   = Layout<_128>;  // layout of 128 thread IDs
+  using ALayout = GMMA::ABLayout< 64, 32>;  // A layout over the 64x32 (M x K) tile
+  using BLayout = GMMA::ABLayout<208, 32>;  // B layout over the 208x32 (N x K) tile
+  using CLayout = GMMA::CLayout_64x208;  // C layout for the 64x208 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn selector for A, forwarded to the op
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // ScaleIn selector for B, forwarded to the op
+>
+using SM90_64x208x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x208x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; // cute-level short name for the SM90::GMMA op
+// Traits of the 64x208x32 RS atom: e5m2 A/B, half_t C/D; only B has a smem-descriptor fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x208x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor (no FrgTypeA is declared here)
+
+  using Shape_MNK = Shape<_64,_208,_32>;  // M=64, N=208, K=32
+  using ThrID   = Layout<_128>;  // layout of 128 thread IDs
+  using ALayout = GMMA::ALayout_64x32;  // dedicated 64x32 A layout -- NOTE(review): presumably A comes from registers; confirm
+  using BLayout = GMMA::ABLayout<208, 32>;  // B layout over the 208x32 (N x K) tile
+  using CLayout = GMMA::CLayout_64x208;  // C layout for the 64x208 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn selector for A, forwarded to the op
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // ScaleIn selector for B, forwarded to the op
+>
+using SM90_64x208x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x208x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;  // cute-level short name for the SM90::GMMA op
+// Traits of the 64x208x32 SS atom: e5m2 A/B, float C/D; A and B are K-major smem descriptors.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x208x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_208,_32>;  // M=64, N=208, K=32
+  using ThrID   = Layout<_128>;  // layout of 128 thread IDs
+  using ALayout = GMMA::ABLayout< 64, 32>;  // A layout over the 64x32 (M x K) tile
+  using BLayout = GMMA::ABLayout<208, 32>;  // B layout over the 208x32 (N x K) tile
+  using CLayout = GMMA::CLayout_64x208;  // C layout for the 64x208 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn selector for A, forwarded to the op
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // ScaleIn selector for B, forwarded to the op
+>
+using SM90_64x208x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x208x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; // cute-level short name for the SM90::GMMA op
+// Traits of the 64x208x32 RS atom: e5m2 A/B, float C/D; only B has a smem-descriptor fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x208x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor (no FrgTypeA is declared here)
+
+  using Shape_MNK = Shape<_64,_208,_32>;  // M=64, N=208, K=32
+  using ThrID   = Layout<_128>;  // layout of 128 thread IDs
+  using ALayout = GMMA::ALayout_64x32;  // dedicated 64x32 A layout -- NOTE(review): presumably A comes from registers; confirm
+  using BLayout = GMMA::ABLayout<208, 32>;  // B layout over the 208x32 (N x K) tile
+  using CLayout = GMMA::CLayout_64x208;  // C layout for the 64x208 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn selector for A, forwarded to the op
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // ScaleIn selector for B, forwarded to the op
+>
+using SM90_64x216x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x216x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;  // cute-level short name for the SM90::GMMA op
+// Traits of the 64x216x32 SS atom: e5m2 A/B, half_t C/D; A and B are K-major smem descriptors.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x216x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_216,_32>;  // M=64, N=216, K=32
+  using ThrID   = Layout<_128>;  // layout of 128 thread IDs
+  using ALayout = GMMA::ABLayout< 64, 32>;  // A layout over the 64x32 (M x K) tile
+  using BLayout = GMMA::ABLayout<216, 32>;  // B layout over the 216x32 (N x K) tile
+  using CLayout = GMMA::CLayout_64x216;  // C layout for the 64x216 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn selector for A, forwarded to the op
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // ScaleIn selector for B, forwarded to the op
+>
+using SM90_64x216x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x216x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; // cute-level short name for the SM90::GMMA op
+// Traits of the 64x216x32 RS atom: e5m2 A/B, half_t C/D; only B has a smem-descriptor fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x216x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor (no FrgTypeA is declared here)
+
+  using Shape_MNK = Shape<_64,_216,_32>;  // M=64, N=216, K=32
+  using ThrID   = Layout<_128>;  // layout of 128 thread IDs
+  using ALayout = GMMA::ALayout_64x32;  // dedicated 64x32 A layout -- NOTE(review): presumably A comes from registers; confirm
+  using BLayout = GMMA::ABLayout<216, 32>;  // B layout over the 216x32 (N x K) tile
+  using CLayout = GMMA::CLayout_64x216;  // C layout for the 64x216 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn selector for A, forwarded to the op
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // ScaleIn selector for B, forwarded to the op
+>
+using SM90_64x216x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x216x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;  // cute-level short name for the SM90::GMMA op
+// Traits of the 64x216x32 SS atom: e5m2 A/B, float C/D; A and B are K-major smem descriptors.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x216x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_216,_32>;  // M=64, N=216, K=32
+  using ThrID   = Layout<_128>;  // layout of 128 thread IDs
+  using ALayout = GMMA::ABLayout< 64, 32>;  // A layout over the 64x32 (M x K) tile
+  using BLayout = GMMA::ABLayout<216, 32>;  // B layout over the 216x32 (N x K) tile
+  using CLayout = GMMA::CLayout_64x216;  // C layout for the 64x216 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn selector for A, forwarded to the op
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // ScaleIn selector for B, forwarded to the op
+>
+using SM90_64x216x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x216x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; // cute-level short name for the SM90::GMMA op
+// Traits of the 64x216x32 RS atom: e5m2 A/B, float C/D; only B has a smem-descriptor fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x216x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor (no FrgTypeA is declared here)
+
+  using Shape_MNK = Shape<_64,_216,_32>;  // M=64, N=216, K=32
+  using ThrID   = Layout<_128>;  // layout of 128 thread IDs
+  using ALayout = GMMA::ALayout_64x32;  // dedicated 64x32 A layout -- NOTE(review): presumably A comes from registers; confirm
+  using BLayout = GMMA::ABLayout<216, 32>;  // B layout over the 216x32 (N x K) tile
+  using CLayout = GMMA::CLayout_64x216;  // C layout for the 64x216 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn selector for A, forwarded to the op
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // ScaleIn selector for B, forwarded to the op
+>
+using SM90_64x224x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x224x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;  // cute-level short name for the SM90::GMMA op
+// Traits of the 64x224x32 SS atom: e5m2 A/B, half_t C/D; A and B are K-major smem descriptors.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x224x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_224,_32>;  // M=64, N=224, K=32
+  using ThrID   = Layout<_128>;  // layout of 128 thread IDs
+  using ALayout = GMMA::ABLayout< 64, 32>;  // A layout over the 64x32 (M x K) tile
+  using BLayout = GMMA::ABLayout<224, 32>;  // B layout over the 224x32 (N x K) tile
+  using CLayout = GMMA::CLayout_64x224;  // C layout for the 64x224 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn selector for A, forwarded to the op
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // ScaleIn selector for B, forwarded to the op
+>
+using SM90_64x224x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x224x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; // cute-level short name for the SM90::GMMA op
+// Traits of the 64x224x32 RS atom: e5m2 A/B, half_t C/D; only B has a smem-descriptor fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x224x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor (no FrgTypeA is declared here)
+
+  using Shape_MNK = Shape<_64,_224,_32>;  // M=64, N=224, K=32
+  using ThrID   = Layout<_128>;  // layout of 128 thread IDs
+  using ALayout = GMMA::ALayout_64x32;  // dedicated 64x32 A layout -- NOTE(review): presumably A comes from registers; confirm
+  using BLayout = GMMA::ABLayout<224, 32>;  // B layout over the 224x32 (N x K) tile
+  using CLayout = GMMA::CLayout_64x224;  // C layout for the 64x224 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn selector for A, forwarded to the op
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // ScaleIn selector for B, forwarded to the op
+>
+using SM90_64x224x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x224x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;  // cute-level short name for the SM90::GMMA op
+// Traits of the 64x224x32 SS atom: e5m2 A/B, float C/D; A and B are K-major smem descriptors.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x224x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_224,_32>;  // M=64, N=224, K=32
+  using ThrID   = Layout<_128>;  // layout of 128 thread IDs
+  using ALayout = GMMA::ABLayout< 64, 32>;  // A layout over the 64x32 (M x K) tile
+  using BLayout = GMMA::ABLayout<224, 32>;  // B layout over the 224x32 (N x K) tile
+  using CLayout = GMMA::CLayout_64x224;  // C layout for the 64x224 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn selector for A, forwarded to the op
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // ScaleIn selector for B, forwarded to the op
+>
+using SM90_64x224x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x224x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; // cute-level short name for the SM90::GMMA op
+// Traits of the 64x224x32 RS atom: e5m2 A/B, float C/D; only B has a smem-descriptor fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x224x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor (no FrgTypeA is declared here)
+
+  using Shape_MNK = Shape<_64,_224,_32>;  // M=64, N=224, K=32
+  using ThrID   = Layout<_128>;  // layout of 128 thread IDs
+  using ALayout = GMMA::ALayout_64x32;  // dedicated 64x32 A layout -- NOTE(review): presumably A comes from registers; confirm
+  using BLayout = GMMA::ABLayout<224, 32>;  // B layout over the 224x32 (N x K) tile
+  using CLayout = GMMA::CLayout_64x224;  // C layout for the 64x224 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn selector for A, forwarded to the op
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // ScaleIn selector for B, forwarded to the op
+>
+using SM90_64x232x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x232x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;  // cute-level short name for the SM90::GMMA op
+// Traits of the 64x232x32 SS atom: e5m2 A/B, half_t C/D; A and B are K-major smem descriptors.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x232x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_232,_32>;  // M=64, N=232, K=32
+  using ThrID   = Layout<_128>;  // layout of 128 thread IDs
+  using ALayout = GMMA::ABLayout< 64, 32>;  // A layout over the 64x32 (M x K) tile
+  using BLayout = GMMA::ABLayout<232, 32>;  // B layout over the 232x32 (N x K) tile
+  using CLayout = GMMA::CLayout_64x232;  // C layout for the 64x232 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn selector for A, forwarded to the op
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // ScaleIn selector for B, forwarded to the op
+>
+using SM90_64x232x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x232x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; // cute-level short name for the SM90::GMMA op
+// Traits of the 64x232x32 RS atom: e5m2 A/B, half_t C/D; only B has a smem-descriptor fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x232x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor (no FrgTypeA is declared here)
+
+  using Shape_MNK = Shape<_64,_232,_32>;  // M=64, N=232, K=32
+  using ThrID   = Layout<_128>;  // layout of 128 thread IDs
+  using ALayout = GMMA::ALayout_64x32;  // dedicated 64x32 A layout -- NOTE(review): presumably A comes from registers; confirm
+  using BLayout = GMMA::ABLayout<232, 32>;  // B layout over the 232x32 (N x K) tile
+  using CLayout = GMMA::CLayout_64x232;  // C layout for the 64x232 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn selector for A, forwarded to the op
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // ScaleIn selector for B, forwarded to the op
+>
+using SM90_64x232x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x232x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;  // cute-level short name for the SM90::GMMA op
+// Traits of the 64x232x32 SS atom: e5m2 A/B, float C/D; A and B are K-major smem descriptors.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x232x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_232,_32>;  // M=64, N=232, K=32
+  using ThrID   = Layout<_128>;  // layout of 128 thread IDs
+  using ALayout = GMMA::ABLayout< 64, 32>;  // A layout over the 64x32 (M x K) tile
+  using BLayout = GMMA::ABLayout<232, 32>;  // B layout over the 232x32 (N x K) tile
+  using CLayout = GMMA::CLayout_64x232;  // C layout for the 64x232 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn selector for A, forwarded to the op
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // ScaleIn selector for B, forwarded to the op
+>
+using SM90_64x232x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x232x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; // cute-level short name for the SM90::GMMA op
+// Traits of the 64x232x32 RS atom: e5m2 A/B, float C/D; only B has a smem-descriptor fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x232x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor (no FrgTypeA is declared here)
+
+  using Shape_MNK = Shape<_64,_232,_32>;  // M=64, N=232, K=32
+  using ThrID   = Layout<_128>;  // layout of 128 thread IDs
+  using ALayout = GMMA::ALayout_64x32;  // dedicated 64x32 A layout -- NOTE(review): presumably A comes from registers; confirm
+  using BLayout = GMMA::ABLayout<232, 32>;  // B layout over the 232x32 (N x K) tile
+  using CLayout = GMMA::CLayout_64x232;  // C layout for the 64x232 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn selector for A, forwarded to the op
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // ScaleIn selector for B, forwarded to the op
+>
+using SM90_64x240x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x240x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;  // cute-level short name for the SM90::GMMA op
+// Traits of the 64x240x32 SS atom: e5m2 A/B, half_t C/D; A and B are K-major smem descriptors.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x240x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_240,_32>;  // M=64, N=240, K=32
+  using ThrID   = Layout<_128>;  // layout of 128 thread IDs
+  using ALayout = GMMA::ABLayout< 64, 32>;  // A layout over the 64x32 (M x K) tile
+  using BLayout = GMMA::ABLayout<240, 32>;  // B layout over the 240x32 (N x K) tile
+  using CLayout = GMMA::CLayout_64x240;  // C layout for the 64x240 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn selector for A, forwarded to the op
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // ScaleIn selector for B, forwarded to the op
+>
+using SM90_64x240x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x240x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; // cute-level short name for the SM90::GMMA op
+// Traits of the 64x240x32 RS atom: e5m2 A/B, half_t C/D; only B has a smem-descriptor fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x240x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor (no FrgTypeA is declared here)
+
+  using Shape_MNK = Shape<_64,_240,_32>;  // M=64, N=240, K=32
+  using ThrID   = Layout<_128>;  // layout of 128 thread IDs
+  using ALayout = GMMA::ALayout_64x32;  // dedicated 64x32 A layout -- NOTE(review): presumably A comes from registers; confirm
+  using BLayout = GMMA::ABLayout<240, 32>;  // B layout over the 240x32 (N x K) tile
+  using CLayout = GMMA::CLayout_64x240;  // C layout for the 64x240 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn selector for A, forwarded to the op
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // ScaleIn selector for B, forwarded to the op
+>
+using SM90_64x240x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x240x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;  // cute-level short name for the SM90::GMMA op
+// Traits of the 64x240x32 SS atom: e5m2 A/B, float C/D; A and B are K-major smem descriptors.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x240x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_240,_32>;  // M=64, N=240, K=32
+  using ThrID   = Layout<_128>;  // layout of 128 thread IDs
+  using ALayout = GMMA::ABLayout< 64, 32>;  // A layout over the 64x32 (M x K) tile
+  using BLayout = GMMA::ABLayout<240, 32>;  // B layout over the 240x32 (N x K) tile
+  using CLayout = GMMA::CLayout_64x240;  // C layout for the 64x240 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn selector for A, forwarded to the op
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // ScaleIn selector for B, forwarded to the op
+>
+using SM90_64x240x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x240x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; // cute-level short name for the SM90::GMMA op
+// Traits of the 64x240x32 RS atom: e5m2 A/B, float C/D; only B has a smem-descriptor fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x240x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor (no FrgTypeA is declared here)
+
+  using Shape_MNK = Shape<_64,_240,_32>;  // M=64, N=240, K=32
+  using ThrID   = Layout<_128>;  // layout of 128 thread IDs
+  using ALayout = GMMA::ALayout_64x32;  // dedicated 64x32 A layout -- NOTE(review): presumably A comes from registers; confirm
+  using BLayout = GMMA::ABLayout<240, 32>;  // B layout over the 240x32 (N x K) tile
+  using CLayout = GMMA::CLayout_64x240;  // C layout for the 64x240 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn selector for A, forwarded to the op
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // ScaleIn selector for B, forwarded to the op
+>
+using SM90_64x248x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x248x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;  // cute-level short name for the SM90::GMMA op
+// Traits of the 64x248x32 SS atom: e5m2 A/B, half_t C/D; A and B are K-major smem descriptors.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x248x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_248,_32>;  // M=64, N=248, K=32
+  using ThrID   = Layout<_128>;  // layout of 128 thread IDs
+  using ALayout = GMMA::ABLayout< 64, 32>;  // A layout over the 64x32 (M x K) tile
+  using BLayout = GMMA::ABLayout<248, 32>;  // B layout over the 248x32 (N x K) tile
+  using CLayout = GMMA::CLayout_64x248;  // C layout for the 64x248 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn selector for A, forwarded to the op
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // ScaleIn selector for B, forwarded to the op
+>
+using SM90_64x248x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x248x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; // cute-level short name for the SM90::GMMA op
+// Traits of the 64x248x32 RS atom: e5m2 A/B, half_t C/D; only B has a smem-descriptor fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x248x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor (no FrgTypeA is declared here)
+
+  using Shape_MNK = Shape<_64,_248,_32>;  // M=64, N=248, K=32
+  using ThrID   = Layout<_128>;  // layout of 128 thread IDs
+  using ALayout = GMMA::ALayout_64x32;  // dedicated 64x32 A layout -- NOTE(review): presumably A comes from registers; confirm
+  using BLayout = GMMA::ABLayout<248, 32>;  // B layout over the 248x32 (N x K) tile
+  using CLayout = GMMA::CLayout_64x248;  // C layout for the 64x248 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn selector for A, forwarded to the op
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // ScaleIn selector for B, forwarded to the op
+>
+using SM90_64x248x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x248x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;  // cute-level short name for the SM90::GMMA op
+// Traits of the 64x248x32 SS atom: e5m2 A/B, float C/D; A and B are K-major smem descriptors.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x248x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_248,_32>;  // M=64, N=248, K=32
+  using ThrID   = Layout<_128>;  // layout of 128 thread IDs
+  using ALayout = GMMA::ABLayout< 64, 32>;  // A layout over the 64x32 (M x K) tile
+  using BLayout = GMMA::ABLayout<248, 32>;  // B layout over the 248x32 (N x K) tile
+  using CLayout = GMMA::CLayout_64x248;  // C layout for the 64x248 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // ScaleIn selector for A, forwarded to the op
+  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // ScaleIn selector for B, forwarded to the op
+>
+using SM90_64x248x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x248x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; // cute-level short name for the SM90::GMMA op
+// Traits of the 64x248x32 RS atom: e5m2 A/B, float C/D; only B has a smem-descriptor fragment.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
+struct MMA_Traits<SM90_64x248x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
+{
+  using ValTypeD = float;
+  using ValTypeA = float_e5m2_t;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor (no FrgTypeA is declared here)
+
+  using Shape_MNK = Shape<_64,_248,_32>;  // M=64, N=248, K=32
+  using ThrID   = Layout<_128>;  // layout of 128 thread IDs
+  using ALayout = GMMA::ALayout_64x32;  // dedicated 64x32 A layout -- NOTE(review): presumably A comes from registers; confirm
+  using BLayout = GMMA::ABLayout<248, 32>;  // B layout over the 248x32 (N x K) tile
+  using CLayout = GMMA::CLayout_64x248;  // C layout for the 64x248 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90_gmma_sparse.hpp b/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90_gmma_sparse.hpp
new file mode 100755
index 000000000..27c41ad33
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90_gmma_sparse.hpp
@@ -0,0 +1,7738 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include <cute/pointer_sparse.hpp>             // cute::smem_sparse_ptr_flag
+#include <cute/swizzle.hpp>                    // cute::Swizzle
+#include <cute/tensor_impl.hpp>                // cute::Tensor
+#include <cute/arch/mma_sm90_desc.hpp>         // cute::LayoutType
+#include <cute/arch/mma_sm90_gmma_sparse.hpp>  // cute::SM90::SPARSE::GMMA_64x8x32_F16F16F16_SS, etc
+#include <cute/atom/mma_traits_sm90_gmma.hpp>  // cute::GMMA::Layout_*
+#include <cute/atom/mma_traits.hpp>            // cute::MMA_Traits
+#include <cute/layout_composed.hpp>            // cute::ComposedLayout
+#include <cute/numeric/integral_constant.hpp>  // cute::is_static
+
+namespace cute {
+
+namespace SM90::GMMA {
+
+///////////////////////////////////////////
+// Common layouts for GMMA Shared Memory //
+///////////////////////////////////////////
+
+// M|N-major sparse layout atoms in units of Type and sparsity factor S. Each one is a ComposedLayout of Swizzle<B,4,3> (B = 0/1/2/3 for INTER/SW32/SW64/SW128), a smem_sparse_ptr_flag_bits<S, bits-of-Type> tag, and blocked_product(Layout<Shape<_1,Int<S>>>, layout_b() of the matching dense Layout_MN_*_Atom<Type>).
+template <class Type, int S>
+using Layout_MN_INTER_SpAtom = ComposedLayout<Swizzle<0,4,3>, smem_sparse_ptr_flag_bits<S,sizeof_bits_v<Type>>,
+                                              decltype(blocked_product(Layout<Shape<_1,Int<S>>>{}, Layout_MN_INTER_Atom<Type>{}.layout_b()))>;
+template <class Type, int S>
+using Layout_MN_SW32_SpAtom  = ComposedLayout<Swizzle<1,4,3>, smem_sparse_ptr_flag_bits<S,sizeof_bits_v<Type>>,
+                                              decltype(blocked_product(Layout<Shape<_1,Int<S>>>{}, Layout_MN_SW32_Atom<Type>{}.layout_b()))>;
+template <class Type, int S>
+using Layout_MN_SW64_SpAtom  = ComposedLayout<Swizzle<2,4,3>, smem_sparse_ptr_flag_bits<S,sizeof_bits_v<Type>>,
+                                              decltype(blocked_product(Layout<Shape<_1,Int<S>>>{}, Layout_MN_SW64_Atom<Type>{}.layout_b()))>;
+template <class Type, int S>
+using Layout_MN_SW128_SpAtom = ComposedLayout<Swizzle<3,4,3>, smem_sparse_ptr_flag_bits<S,sizeof_bits_v<Type>>,
+                                              decltype(blocked_product(Layout<Shape<_1,Int<S>>>{}, Layout_MN_SW128_Atom<Type>{}.layout_b()))>;
+
+// K-major sparse layout atoms in units of Type and sparsity factor S. Same construction as the M|N-major atoms, but built from layout_b() of the dense Layout_K_*_Atom<Type>; the Swizzle<B,4,3> first parameter is again 0/1/2/3 for INTER/SW32/SW64/SW128.
+template <class Type, int S>
+using Layout_K_INTER_SpAtom = ComposedLayout<Swizzle<0,4,3>, smem_sparse_ptr_flag_bits<S,sizeof_bits_v<Type>>,
+                                              decltype(blocked_product(Layout<Shape<_1,Int<S>>>{}, Layout_K_INTER_Atom<Type>{}.layout_b()))>;
+template <class Type, int S>
+using Layout_K_SW32_SpAtom  = ComposedLayout<Swizzle<1,4,3>, smem_sparse_ptr_flag_bits<S,sizeof_bits_v<Type>>,
+                                              decltype(blocked_product(Layout<Shape<_1,Int<S>>>{}, Layout_K_SW32_Atom<Type>{}.layout_b()))>;
+template <class Type, int S>
+using Layout_K_SW64_SpAtom  = ComposedLayout<Swizzle<2,4,3>, smem_sparse_ptr_flag_bits<S,sizeof_bits_v<Type>>,
+                                              decltype(blocked_product(Layout<Shape<_1,Int<S>>>{}, Layout_K_SW64_Atom<Type>{}.layout_b()))>;
+template <class Type, int S>
+using Layout_K_SW128_SpAtom = ComposedLayout<Swizzle<3,4,3>, smem_sparse_ptr_flag_bits<S,sizeof_bits_v<Type>>,
+                                              decltype(blocked_product(Layout<Shape<_1,Int<S>>>{}, Layout_K_SW128_Atom<Type>{}.layout_b()))>;
+
+// Major-selecting aliases: for each swizzle flavour, pick the M|N-major sparse atom when tnsp == GMMA::Major::MN and the K-major sparse atom for any other tnsp value.
+template <class Type, int S, GMMA::Major tnsp>
+using Layout_INTER_SpAtom = typename conditional<tnsp == GMMA::Major::MN,
+                                                 Layout_MN_INTER_SpAtom<Type,S>,
+                                                 Layout_K_INTER_SpAtom<Type,S>>::type;
+template <class Type, int S, GMMA::Major tnsp>
+using Layout_SW32_SpAtom = typename conditional<tnsp == GMMA::Major::MN,
+                                                Layout_MN_SW32_SpAtom<Type,S>,
+                                                Layout_K_SW32_SpAtom<Type,S>>::type;
+template <class Type, int S, GMMA::Major tnsp>
+using Layout_SW64_SpAtom = typename conditional<tnsp == GMMA::Major::MN,
+                                                Layout_MN_SW64_SpAtom<Type,S>,
+                                                Layout_K_SW64_SpAtom<Type,S>>::type;
+template <class Type, int S, GMMA::Major tnsp>
+using Layout_SW128_SpAtom = typename conditional<tnsp == GMMA::Major::MN,
+                                                 Layout_MN_SW128_SpAtom<Type,S>,
+                                                 Layout_K_SW128_SpAtom<Type,S>>::type;
+
+///////////////////////////////////////////////////////////////////////////////
+// Higher level GMMA Descriptor utilities
+///////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major>
+struct sparse_smem_desc : DescriptorIterator {}; // Adds no members to DescriptorIterator; the (unnamed) Major parameter only distinguishes the type, e.g. as the key of the MakeTensor specialization in this file.
+
+} // end namespace SM90::GMMA
+
+// Customization point for creating a Tensor of GMMA descriptors for SM90::GMMA::sparse_smem_desc<MajorMode> from a sparse shared-memory tensor.
+template <SM90::GMMA::Major MajorMode>
+struct MakeTensor<SM90::GMMA::sparse_smem_desc<MajorMode>>
+{
+  // Builds one descriptor via make_gmma_desc<MajorMode>(tensor<0>(smem_tensor)), wraps it in a DescriptorIterator, and pairs it with the uint128_t-recast layout whose mode 0 is replaced by Layout<_1,_0>. Statically requires an smem engine, a sparse value_type and a sparse pointer. NOTE(review): described upstream as identical to the dense GMMA::smem_desc customization plus the extra checks; that dense version is not in this file (presumably in the included mma_traits_sm90_gmma.hpp).
+  template <class TEngine, class TLayout>
+  CUTE_HOST_DEVICE constexpr auto
+  operator()(Tensor<TEngine,TLayout> const& smem_tensor)
+  {
+    static_assert(is_smem<TEngine>::value, "Expected SMEM Tensor to construct a GMMA Desc Tensor");
+    static_assert(is_sparse<typename TEngine::value_type>::value, "Expected sparse value_type.");
+    static_assert(is_sparse_ptr<TEngine>::value, "Expected sparse iter.");
+    return make_tensor(SM90::GMMA::DescriptorIterator{SM90::GMMA::make_gmma_desc<MajorMode>(tensor<0>(smem_tensor))},
+                       replace<0>(recast<uint128_t const>(smem_tensor).layout(), Layout<_1,_0>{}));
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+//////////////////////////// MMA_TRAITS ///////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
+
+namespace SM90::GMMA {
+
+// Metadata layouts. Every variant has a first mode of shape (2,2,8,4) = 128 entries; ELayout_64x64 has a second mode of 32 with stride 64 and no zero strides.
+using ELayout_64x64  = Layout<Shape <Shape <_2,   _2,_8, _4>, Shape <_32>>, 
+                              Stride<Stride<_8,_2048,_1,_16>, Stride<_64>>>;
+// ELayout_64x32: second mode is (16,2) with strides (64,8); the second size-2 sub-mode of the first mode has stride 0, so it does not change the mapped index.
+using ELayout_64x32  = Layout<Shape <Shape <   _2,_2,_8, _4>, Shape <_16,_2>>, 
+                              Stride<Stride<_1024,_0,_1,_16>, Stride<_64,_8>>>;
+// ELayout_64x16: second mode is (8,2) with strides (64,8); as in ELayout_64x32, the second size-2 sub-mode of the first mode has stride 0.
+using ELayout_64x16  = Layout<Shape <Shape <  _2,_2,_8, _4>, Shape < _8,_2>>, 
+                              Stride<Stride<_512,_0,_1,_16>, Stride<_64,_8>>>;
+
+} // namespace SM90::GMMA
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace SM90::GMMA::SPARSE {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <class MMAOp,
+          class TD, class DLayout,
+          class TA, class ALayout,
+          class TB, class BLayout,
+          class TC, class CLayout>
+CUTE_HOST_DEVICE constexpr void
+mma_unpack(MMA_Traits<MMAOp>   const& traits,
+           Tensor<TD, DLayout>      & D,
+           Tensor<TA, ALayout> const& A_zipped,
+           Tensor<TB, BLayout> const& B,
+           Tensor<TC, CLayout> const& C)
+{ // Views D, the zipped (A,E) operand and B as MMAOp's register arrays and hands them, with MMAOp::fma, to detail::explode.
+  static_assert(is_rmem_v<TD>, "Expected registers in MMA_Atom::call");
+  static_assert(is_rmem_v<TA>, "Expected registers in MMA_Atom::call");
+  static_assert(is_rmem_v<TB>, "Expected registers in MMA_Atom::call");
+  static_assert(is_rmem_v<TC>, "Expected registers in MMA_Atom::call");
+  // Register array types declared by the MMA operation, one per operand (E is the second tensor unzipped from A_zipped below).
+  using DRegisters = typename MMAOp::DRegisters;
+  using ARegisters = typename MMAOp::ARegisters;
+  using ERegisters = typename MMAOp::ERegisters;
+  using BRegisters = typename MMAOp::BRegisters;
+  using CRegisters = typename MMAOp::CRegisters;
+
+  // Register value types from the MMAOp register arrays
+  using RegTypeD   = typename remove_extent<DRegisters>::type;
+  using RegTypeA   = typename remove_extent<ARegisters>::type;
+  using RegTypeE   = typename remove_extent<ERegisters>::type;
+  using RegTypeB   = typename remove_extent<BRegisters>::type;
+  using RegTypeC   = typename remove_extent<CRegisters>::type;
+  // Number of registers per operand, taken from the array extents (no count is computed for D).
+  constexpr int RegNumA = extent<ARegisters>::value;
+  constexpr int RegNumE = extent<ERegisters>::value;
+  constexpr int RegNumB = extent<BRegisters>::value;
+  constexpr int RegNumC = extent<CRegisters>::value;
+  // Split the zipped operand into its A and E tensors, then reinterpret A, E and B in units of the op's register types.
+  auto [A, E] = unzip_tensor(A_zipped);
+  Tensor rA   = recast<RegTypeA>(A);
+  Tensor rE   = recast<RegTypeE>(E);
+  Tensor rB   = recast<RegTypeB>(B);
+  // Each recast view must contain exactly the number of registers MMAOp declares.
+  CUTE_STATIC_ASSERT_V(size(rA) == Int<RegNumA>{});
+  CUTE_STATIC_ASSERT_V(size(rE) == Int<RegNumE>{});
+  CUTE_STATIC_ASSERT_V(size(rB) == Int<RegNumB>{});
+  // D doubles as the accumulator: RegTypeD must be void, and D must match C in value_type and layout. The C argument itself is never read in this body.
+  static_assert(is_same<RegTypeD, void>::value, "GMMA DRegisters must have void type.");
+  static_assert(is_same<typename TD::value_type, typename TC::value_type>::value, "GMMA C and D value_type must match.");
+  static_assert(is_same<DLayout, CLayout>::value, "GMMA C and D layouts must match.");
+
+  Tensor rC = recast<RegTypeC>(D);  // NOTE: D and C are same, so use mutable D
+
+  CUTE_STATIC_ASSERT_V(size(rC) == Int<RegNumC>{});
+  // Argument order is A, B, C, E (each with an index sequence of its register count), then &traits.accumulate_ with seq<0>. NOTE(review): detail::explode is defined elsewhere; presumably it expands each view per index into MMAOp::fma's parameters.
+  detail::explode(MMAOp::fma,
+                  rA, make_int_sequence<RegNumA>{},
+                  rB, make_int_sequence<RegNumB>{},
+                  rC, make_int_sequence<RegNumC>{},
+                  rE, make_int_sequence<RegNumE>{},
+                  &(traits.accumulate_), seq<0>{});
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace SM90::GMMA::SPARSE
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Sparse GMMA 64x8x32 traits, SS form: half_t A/B values, half_t C/D; A and B both have smem-descriptor fragment types.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  // Fragment types: GMMA smem descriptors parameterised by the op's tnspA / tnspB major modes.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // (M,N,K) = (64,8,32); ThrID is a 128-element layout; ELayout is the 64x32 metadata layout.
+  using Shape_MNK = Shape<_64,_8,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout<  8, 32>;
+  using CLayout = GMMA::CLayout_64x8;
+  // Sole data member; starts as ScaleOut::One. SPARSE::mma_unpack forwards its address alongside MMAOp::fma.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Sparse GMMA 64x8x32 traits, RS form: half_t A/B values, half_t C/D; only B has a smem-descriptor fragment type.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  // No FrgTypeA is declared in this RS specialization; tnspA appears only in the op type being specialized on.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // (M,N,K) = (64,8,32); A uses the named GMMA::ALayout_64x32 rather than ABLayout< 64, 32>; ELayout is the 64x32 metadata layout.
+  using Shape_MNK = Shape<_64,_8,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout<  8, 32>;
+  using CLayout = GMMA::CLayout_64x8;
+  // Sole data member; starts as ScaleOut::One. SPARSE::mma_unpack forwards its address alongside MMAOp::fma.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Sparse GMMA 64x16x32 traits, SS form: half_t A/B values, half_t C/D; A and B both have smem-descriptor fragment types.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  // Fragment types: GMMA smem descriptors parameterised by the op's tnspA / tnspB major modes.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // (M,N,K) = (64,16,32); ThrID is a 128-element layout; ELayout is the 64x32 metadata layout.
+  using Shape_MNK = Shape<_64,_16,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout< 16, 32>;
+  using CLayout = GMMA::CLayout_64x16;
+  // Sole data member; starts as ScaleOut::One. SPARSE::mma_unpack forwards its address alongside MMAOp::fma.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Sparse GMMA 64x16x32 traits, RS form: half_t A/B values, half_t C/D; only B has a smem-descriptor fragment type.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  // No FrgTypeA is declared in this RS specialization; tnspA appears only in the op type being specialized on.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // (M,N,K) = (64,16,32); A uses the named GMMA::ALayout_64x32 rather than ABLayout< 64, 32>; ELayout is the 64x32 metadata layout.
+  using Shape_MNK = Shape<_64,_16,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout< 16, 32>;
+  using CLayout = GMMA::CLayout_64x16;
+  // Sole data member; starts as ScaleOut::One. SPARSE::mma_unpack forwards its address alongside MMAOp::fma.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Sparse GMMA 64x32x32 traits, SS form: half_t A/B values, half_t C/D; A and B both have smem-descriptor fragment types.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  // Fragment types: GMMA smem descriptors parameterised by the op's tnspA / tnspB major modes.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // (M,N,K) = (64,32,32); ThrID is a 128-element layout; ELayout is the 64x32 metadata layout.
+  using Shape_MNK = Shape<_64,_32,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout< 32, 32>;
+  using CLayout = GMMA::CLayout_64x32;
+  // Sole data member; starts as ScaleOut::One. SPARSE::mma_unpack forwards its address alongside MMAOp::fma.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Sparse GMMA 64x32x32 traits, RS form: half_t A/B values, half_t C/D; only B has a smem-descriptor fragment type.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  // No FrgTypeA is declared in this RS specialization; tnspA appears only in the op type being specialized on.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // (M,N,K) = (64,32,32); A uses the named GMMA::ALayout_64x32 rather than ABLayout< 64, 32>; ELayout is the 64x32 metadata layout.
+  using Shape_MNK = Shape<_64,_32,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout< 32, 32>;
+  using CLayout = GMMA::CLayout_64x32;
+  // Sole data member; starts as ScaleOut::One. SPARSE::mma_unpack forwards its address alongside MMAOp::fma.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Sparse GMMA 64x64x32 traits, SS form: half_t A/B values, half_t C/D; A and B both have smem-descriptor fragment types.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  // Fragment types: GMMA smem descriptors parameterised by the op's tnspA / tnspB major modes.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // (M,N,K) = (64,64,32); ThrID is a 128-element layout; ELayout is the 64x32 metadata layout.
+  using Shape_MNK = Shape<_64,_64,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout< 64, 32>;
+  using CLayout = GMMA::CLayout_64x64;
+  // Sole data member; starts as ScaleOut::One. SPARSE::mma_unpack forwards its address alongside MMAOp::fma.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Sparse GMMA 64x64x32 traits, RS form: half_t A/B values, half_t C/D; only B has a smem-descriptor fragment type.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  // No FrgTypeA is declared in this RS specialization; tnspA appears only in the op type being specialized on.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // (M,N,K) = (64,64,32); A uses the named GMMA::ALayout_64x32 rather than ABLayout< 64, 32>; ELayout is the 64x32 metadata layout.
+  using Shape_MNK = Shape<_64,_64,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout< 64, 32>;
+  using CLayout = GMMA::CLayout_64x64;
+  // Sole data member; starts as ScaleOut::One. SPARSE::mma_unpack forwards its address alongside MMAOp::fma.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Sparse GMMA 64x96x32 traits, SS form: half_t A/B values, half_t C/D; A and B both have smem-descriptor fragment types.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  // Fragment types: GMMA smem descriptors parameterised by the op's tnspA / tnspB major modes.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // (M,N,K) = (64,96,32); ThrID is a 128-element layout; ELayout is the 64x32 metadata layout.
+  using Shape_MNK = Shape<_64,_96,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout< 96, 32>;
+  using CLayout = GMMA::CLayout_64x96;
+  // Sole data member; starts as ScaleOut::One. SPARSE::mma_unpack forwards its address alongside MMAOp::fma.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Sparse GMMA 64x96x32 traits, RS form: half_t A/B values, half_t C/D; only B has a smem-descriptor fragment type.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  // No FrgTypeA is declared in this RS specialization; tnspA appears only in the op type being specialized on.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // (M,N,K) = (64,96,32); A uses the named GMMA::ALayout_64x32 rather than ABLayout< 64, 32>; ELayout is the 64x32 metadata layout.
+  using Shape_MNK = Shape<_64,_96,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout< 96, 32>;
+  using CLayout = GMMA::CLayout_64x96;
+  // Sole data member; starts as ScaleOut::One. SPARSE::mma_unpack forwards its address alongside MMAOp::fma.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Sparse GMMA 64x128x32 traits, SS form: half_t A/B values, half_t C/D; A and B both have smem-descriptor fragment types.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  // Fragment types: GMMA smem descriptors parameterised by the op's tnspA / tnspB major modes.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // (M,N,K) = (64,128,32); ThrID is a 128-element layout; ELayout is the 64x32 metadata layout.
+  using Shape_MNK = Shape<_64,_128,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout<128, 32>;
+  using CLayout = GMMA::CLayout_64x128;
+  // Sole data member; starts as ScaleOut::One. SPARSE::mma_unpack forwards its address alongside MMAOp::fma.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Sparse GMMA 64x128x32 traits, RS form: half_t A/B values, half_t C/D; only B has a smem-descriptor fragment type.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  // No FrgTypeA is declared in this RS specialization; tnspA appears only in the op type being specialized on.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // (M,N,K) = (64,128,32); A uses the named GMMA::ALayout_64x32 rather than ABLayout< 64, 32>; ELayout is the 64x32 metadata layout.
+  using Shape_MNK = Shape<_64,_128,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout<128, 32>;
+  using CLayout = GMMA::CLayout_64x128;
+  // Sole data member; starts as ScaleOut::One. SPARSE::mma_unpack forwards its address alongside MMAOp::fma.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Sparse GMMA 64x192x32 traits, SS form: half_t A/B values, half_t C/D; A and B both have smem-descriptor fragment types.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  // Fragment types: GMMA smem descriptors parameterised by the op's tnspA / tnspB major modes.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // (M,N,K) = (64,192,32); ThrID is a 128-element layout; ELayout is the 64x32 metadata layout.
+  using Shape_MNK = Shape<_64,_192,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout<192, 32>;
+  using CLayout = GMMA::CLayout_64x192;
+  // Sole data member; starts as ScaleOut::One. SPARSE::mma_unpack forwards its address alongside MMAOp::fma.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Sparse GMMA 64x192x32 traits, RS form: half_t A/B values, half_t C/D; only B has a smem-descriptor fragment type.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  // No FrgTypeA is declared in this RS specialization; tnspA appears only in the op type being specialized on.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // (M,N,K) = (64,192,32); A uses the named GMMA::ALayout_64x32 rather than ABLayout< 64, 32>; ELayout is the 64x32 metadata layout.
+  using Shape_MNK = Shape<_64,_192,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout<192, 32>;
+  using CLayout = GMMA::CLayout_64x192;
+  // Sole data member; starts as ScaleOut::One. SPARSE::mma_unpack forwards its address alongside MMAOp::fma.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Sparse GMMA 64x256x32 traits, SS form: half_t A/B values, half_t C/D; A and B both have smem-descriptor fragment types.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  // Fragment types: GMMA smem descriptors parameterised by the op's tnspA / tnspB major modes.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // (M,N,K) = (64,256,32); ThrID is a 128-element layout; ELayout is the 64x32 metadata layout.
+  using Shape_MNK = Shape<_64,_256,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout<256, 32>;
+  using CLayout = GMMA::CLayout_64x256;
+  // Sole data member; starts as ScaleOut::One. SPARSE::mma_unpack forwards its address alongside MMAOp::fma.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Sparse GMMA 64x256x32 traits, RS form: half_t A/B values, half_t C/D; only B has a smem-descriptor fragment type.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+  // No FrgTypeA is declared in this RS specialization; tnspA appears only in the op type being specialized on.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // (M,N,K) = (64,256,32); A uses the named GMMA::ALayout_64x32 rather than ABLayout< 64, 32>; ELayout is the 64x32 metadata layout.
+  using Shape_MNK = Shape<_64,_256,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout<256, 32>;
+  using CLayout = GMMA::CLayout_64x256;
+  // Sole data member; starts as ScaleOut::One. SPARSE::mma_unpack forwards its address alongside MMAOp::fma.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Sparse GMMA 64x8x32 traits, SS form: half_t A/B values, float C/D; A and B both have smem-descriptor fragment types.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  // Fragment types: GMMA smem descriptors parameterised by the op's tnspA / tnspB major modes.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // (M,N,K) = (64,8,32); ThrID is a 128-element layout; ELayout is the 64x32 metadata layout.
+  using Shape_MNK = Shape<_64,_8,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout<  8, 32>;
+  using CLayout = GMMA::CLayout_64x8;
+  // Sole data member; starts as ScaleOut::One. SPARSE::mma_unpack forwards its address alongside MMAOp::fma.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Sparse GMMA 64x8x32 traits, RS form: half_t A/B values, float C/D; only B has a smem-descriptor fragment type.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  // No FrgTypeA is declared in this RS specialization; tnspA appears only in the op type being specialized on.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // (M,N,K) = (64,8,32); A uses the named GMMA::ALayout_64x32 rather than ABLayout< 64, 32>; ELayout is the 64x32 metadata layout.
+  using Shape_MNK = Shape<_64,_8,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout<  8, 32>;
+  using CLayout = GMMA::CLayout_64x8;
+  // Sole data member; starts as ScaleOut::One. SPARSE::mma_unpack forwards its address alongside MMAOp::fma.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Sparse GMMA 64x16x32 traits, SS form: half_t A/B values, float C/D; A and B both have smem-descriptor fragment types.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  // Fragment types: GMMA smem descriptors parameterised by the op's tnspA / tnspB major modes.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // (M,N,K) = (64,16,32); ThrID is a 128-element layout; ELayout is the 64x32 metadata layout.
+  using Shape_MNK = Shape<_64,_16,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout< 16, 32>;
+  using CLayout = GMMA::CLayout_64x16;
+  // Sole data member; starts as ScaleOut::One. SPARSE::mma_unpack forwards its address alongside MMAOp::fma.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Sparse GMMA 64x16x32 traits, RS form: half_t A/B values, float C/D; only B has a smem-descriptor fragment type.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  // No FrgTypeA is declared in this RS specialization; tnspA appears only in the op type being specialized on.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // (M,N,K) = (64,16,32); A uses the named GMMA::ALayout_64x32 rather than ABLayout< 64, 32>; ELayout is the 64x32 metadata layout.
+  using Shape_MNK = Shape<_64,_16,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout< 16, 32>;
+  using CLayout = GMMA::CLayout_64x16;
+  // Sole data member; starts as ScaleOut::One. SPARSE::mma_unpack forwards its address alongside MMAOp::fma.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Sparse GMMA 64x32x32 traits, SS form: half_t A/B values, float C/D; A and B both have smem-descriptor fragment types.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  // Fragment types: GMMA smem descriptors parameterised by the op's tnspA / tnspB major modes.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // (M,N,K) = (64,32,32); ThrID is a 128-element layout; ELayout is the 64x32 metadata layout.
+  using Shape_MNK = Shape<_64,_32,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout< 32, 32>;
+  using CLayout = GMMA::CLayout_64x32;
+  // Sole data member; starts as ScaleOut::One. SPARSE::mma_unpack forwards its address alongside MMAOp::fma.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Sparse GMMA 64x32x32 traits, RS form: half_t A/B values, float C/D; only B has a smem-descriptor fragment type.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  // No FrgTypeA is declared in this RS specialization; tnspA appears only in the op type being specialized on.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // (M,N,K) = (64,32,32); A uses the named GMMA::ALayout_64x32 rather than ABLayout< 64, 32>; ELayout is the 64x32 metadata layout.
+  using Shape_MNK = Shape<_64,_32,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout< 32, 32>;
+  using CLayout = GMMA::CLayout_64x32;
+  // Sole data member; starts as ScaleOut::One. SPARSE::mma_unpack forwards its address alongside MMAOp::fma.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Sparse GMMA 64x64x32 traits, SS form: half_t A/B values, float C/D; A and B both have smem-descriptor fragment types.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  // Fragment types: GMMA smem descriptors parameterised by the op's tnspA / tnspB major modes.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // (M,N,K) = (64,64,32); ThrID is a 128-element layout; ELayout is the 64x32 metadata layout.
+  using Shape_MNK = Shape<_64,_64,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout< 64, 32>;
+  using CLayout = GMMA::CLayout_64x64;
+  // Sole data member; starts as ScaleOut::One. SPARSE::mma_unpack forwards its address alongside MMAOp::fma.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Sparse GMMA 64x64x32 traits, RS form: half_t A/B values, float C/D; only B has a smem-descriptor fragment type.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  // No FrgTypeA is declared in this RS specialization; tnspA appears only in the op type being specialized on.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // (M,N,K) = (64,64,32); A uses the named GMMA::ALayout_64x32 rather than ABLayout< 64, 32>; ELayout is the 64x32 metadata layout.
+  using Shape_MNK = Shape<_64,_64,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout< 64, 32>;
+  using CLayout = GMMA::CLayout_64x64;
+  // Sole data member; starts as ScaleOut::One. SPARSE::mma_unpack forwards its address alongside MMAOp::fma.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for the sparse SM90 GMMA atom GMMA_64x96x32_F32F16F16_SS: float C/D, half_t B, A as sparse_elem<2, half_t>.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand type; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  // _SS form: FrgTypeA and FrgTypeB are both GMMA::smem_desc, parameterized by tnspA and tnspB.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape_MNK matches the 64x96x32 in the atom name; ThrID is a 128-entry layout; A and B both use GMMA::ABLayout.
+  using Shape_MNK = Shape<_64,_96,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout< 96, 32>;
+  using CLayout = GMMA::CLayout_64x96;
+  // Per-instance accumulate setting, initialized to ScaleOut::One (how consumers use it is not visible here).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for the sparse SM90 GMMA atom GMMA_64x96x32_F32F16F16_RS: float C/D, half_t B, A as sparse_elem<2, half_t>.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand type; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  // _RS form: only FrgTypeB is declared (GMMA::smem_desc<tnspB>); no FrgTypeA appears in this specialization.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape_MNK matches the 64x96x32 in the atom name; ThrID is a 128-entry layout; A uses GMMA::ALayout_64x32, B uses GMMA::ABLayout.
+  using Shape_MNK = Shape<_64,_96,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout< 96, 32>;
+  using CLayout = GMMA::CLayout_64x96;
+  // Per-instance accumulate setting, initialized to ScaleOut::One (how consumers use it is not visible here).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for the sparse SM90 GMMA atom GMMA_64x128x32_F32F16F16_SS: float C/D, half_t B, A as sparse_elem<2, half_t>.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand type; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  // _SS form: FrgTypeA and FrgTypeB are both GMMA::smem_desc, parameterized by tnspA and tnspB.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape_MNK matches the 64x128x32 in the atom name; ThrID is a 128-entry layout; A and B both use GMMA::ABLayout.
+  using Shape_MNK = Shape<_64,_128,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout<128, 32>;
+  using CLayout = GMMA::CLayout_64x128;
+  // Per-instance accumulate setting, initialized to ScaleOut::One (how consumers use it is not visible here).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for the sparse SM90 GMMA atom GMMA_64x128x32_F32F16F16_RS: float C/D, half_t B, A as sparse_elem<2, half_t>.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand type; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  // _RS form: only FrgTypeB is declared (GMMA::smem_desc<tnspB>); no FrgTypeA appears in this specialization.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape_MNK matches the 64x128x32 in the atom name; ThrID is a 128-entry layout; A uses GMMA::ALayout_64x32, B uses GMMA::ABLayout.
+  using Shape_MNK = Shape<_64,_128,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout<128, 32>;
+  using CLayout = GMMA::CLayout_64x128;
+  // Per-instance accumulate setting, initialized to ScaleOut::One (how consumers use it is not visible here).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for the sparse SM90 GMMA atom GMMA_64x192x32_F32F16F16_SS: float C/D, half_t B, A as sparse_elem<2, half_t>.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand type; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  // _SS form: FrgTypeA and FrgTypeB are both GMMA::smem_desc, parameterized by tnspA and tnspB.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape_MNK matches the 64x192x32 in the atom name; ThrID is a 128-entry layout; A and B both use GMMA::ABLayout.
+  using Shape_MNK = Shape<_64,_192,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout<192, 32>;
+  using CLayout = GMMA::CLayout_64x192;
+  // Per-instance accumulate setting, initialized to ScaleOut::One (how consumers use it is not visible here).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for the sparse SM90 GMMA atom GMMA_64x192x32_F32F16F16_RS: float C/D, half_t B, A as sparse_elem<2, half_t>.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand type; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  // _RS form: only FrgTypeB is declared (GMMA::smem_desc<tnspB>); no FrgTypeA appears in this specialization.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape_MNK matches the 64x192x32 in the atom name; ThrID is a 128-entry layout; A uses GMMA::ALayout_64x32, B uses GMMA::ABLayout.
+  using Shape_MNK = Shape<_64,_192,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout<192, 32>;
+  using CLayout = GMMA::CLayout_64x192;
+  // Per-instance accumulate setting, initialized to ScaleOut::One (how consumers use it is not visible here).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for the sparse SM90 GMMA atom GMMA_64x256x32_F32F16F16_SS: float C/D, half_t B, A as sparse_elem<2, half_t>.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand type; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  // _SS form: FrgTypeA and FrgTypeB are both GMMA::smem_desc, parameterized by tnspA and tnspB.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape_MNK matches the 64x256x32 in the atom name; ThrID is a 128-entry layout; A and B both use GMMA::ABLayout.
+  using Shape_MNK = Shape<_64,_256,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout<256, 32>;
+  using CLayout = GMMA::CLayout_64x256;
+  // Per-instance accumulate setting, initialized to ScaleOut::One (how consumers use it is not visible here).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for the sparse SM90 GMMA atom GMMA_64x256x32_F32F16F16_RS: float C/D, half_t B, A as sparse_elem<2, half_t>.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand type; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  // _RS form: only FrgTypeB is declared (GMMA::smem_desc<tnspB>); no FrgTypeA appears in this specialization.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape_MNK matches the 64x256x32 in the atom name; ThrID is a 128-entry layout; A uses GMMA::ALayout_64x32, B uses GMMA::ABLayout.
+  using Shape_MNK = Shape<_64,_256,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout<256, 32>;
+  using CLayout = GMMA::CLayout_64x256;
+  // Per-instance accumulate setting, initialized to ScaleOut::One (how consumers use it is not visible here).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for the sparse SM90 GMMA atom GMMA_64x8x32_F32BF16BF16_SS: float C/D, bfloat16_t B, A as sparse_elem<2, bfloat16_t>.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, bfloat16_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand type; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // _SS form: FrgTypeA and FrgTypeB are both GMMA::smem_desc, parameterized by tnspA and tnspB.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape_MNK matches the 64x8x32 in the atom name; ThrID is a 128-entry layout; A and B both use GMMA::ABLayout.
+  using Shape_MNK = Shape<_64,_8,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout<  8, 32>;
+  using CLayout = GMMA::CLayout_64x8;
+  // Per-instance accumulate setting, initialized to ScaleOut::One (how consumers use it is not visible here).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for the sparse SM90 GMMA atom GMMA_64x8x32_F32BF16BF16_RS: float C/D, bfloat16_t B, A as sparse_elem<2, bfloat16_t>.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, bfloat16_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand type; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // _RS form: only FrgTypeB is declared (GMMA::smem_desc<tnspB>); no FrgTypeA appears in this specialization.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape_MNK matches the 64x8x32 in the atom name; ThrID is a 128-entry layout; A uses GMMA::ALayout_64x32, B uses GMMA::ABLayout.
+  using Shape_MNK = Shape<_64,_8,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout<  8, 32>;
+  using CLayout = GMMA::CLayout_64x8;
+  // Per-instance accumulate setting, initialized to ScaleOut::One (how consumers use it is not visible here).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for the sparse SM90 GMMA atom GMMA_64x16x32_F32BF16BF16_SS: float C/D, bfloat16_t B, A as sparse_elem<2, bfloat16_t>.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, bfloat16_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand type; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // _SS form: FrgTypeA and FrgTypeB are both GMMA::smem_desc, parameterized by tnspA and tnspB.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape_MNK matches the 64x16x32 in the atom name; ThrID is a 128-entry layout; A and B both use GMMA::ABLayout.
+  using Shape_MNK = Shape<_64,_16,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout< 16, 32>;
+  using CLayout = GMMA::CLayout_64x16;
+  // Per-instance accumulate setting, initialized to ScaleOut::One (how consumers use it is not visible here).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for the sparse SM90 GMMA atom GMMA_64x16x32_F32BF16BF16_RS: float C/D, bfloat16_t B, A as sparse_elem<2, bfloat16_t>.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, bfloat16_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand type; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // _RS form: only FrgTypeB is declared (GMMA::smem_desc<tnspB>); no FrgTypeA appears in this specialization.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape_MNK matches the 64x16x32 in the atom name; ThrID is a 128-entry layout; A uses GMMA::ALayout_64x32, B uses GMMA::ABLayout.
+  using Shape_MNK = Shape<_64,_16,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout< 16, 32>;
+  using CLayout = GMMA::CLayout_64x16;
+  // Per-instance accumulate setting, initialized to ScaleOut::One (how consumers use it is not visible here).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for the sparse SM90 GMMA atom GMMA_64x32x32_F32BF16BF16_SS: float C/D, bfloat16_t B, A as sparse_elem<2, bfloat16_t>.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, bfloat16_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand type; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // _SS form: FrgTypeA and FrgTypeB are both GMMA::smem_desc, parameterized by tnspA and tnspB.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape_MNK matches the 64x32x32 in the atom name; ThrID is a 128-entry layout; A and B both use GMMA::ABLayout.
+  using Shape_MNK = Shape<_64,_32,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout< 32, 32>;
+  using CLayout = GMMA::CLayout_64x32;
+  // Per-instance accumulate setting, initialized to ScaleOut::One (how consumers use it is not visible here).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for the sparse SM90 GMMA atom GMMA_64x32x32_F32BF16BF16_RS: float C/D, bfloat16_t B, A as sparse_elem<2, bfloat16_t>.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, bfloat16_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand type; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // _RS form: only FrgTypeB is declared (GMMA::smem_desc<tnspB>); no FrgTypeA appears in this specialization.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape_MNK matches the 64x32x32 in the atom name; ThrID is a 128-entry layout; A uses GMMA::ALayout_64x32, B uses GMMA::ABLayout.
+  using Shape_MNK = Shape<_64,_32,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout< 32, 32>;
+  using CLayout = GMMA::CLayout_64x32;
+  // Per-instance accumulate setting, initialized to ScaleOut::One (how consumers use it is not visible here).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for the sparse SM90 GMMA atom GMMA_64x64x32_F32BF16BF16_SS: float C/D, bfloat16_t B, A as sparse_elem<2, bfloat16_t>.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, bfloat16_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand type; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // _SS form: FrgTypeA and FrgTypeB are both GMMA::smem_desc, parameterized by tnspA and tnspB.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape_MNK matches the 64x64x32 in the atom name; ThrID is a 128-entry layout; A and B both use GMMA::ABLayout.
+  using Shape_MNK = Shape<_64,_64,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout< 64, 32>;
+  using CLayout = GMMA::CLayout_64x64;
+  // Per-instance accumulate setting, initialized to ScaleOut::One (how consumers use it is not visible here).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for the sparse SM90 GMMA atom GMMA_64x64x32_F32BF16BF16_RS: float C/D, bfloat16_t B, A as sparse_elem<2, bfloat16_t>.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, bfloat16_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand type; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // _RS form: only FrgTypeB is declared (GMMA::smem_desc<tnspB>); no FrgTypeA appears in this specialization.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape_MNK matches the 64x64x32 in the atom name; ThrID is a 128-entry layout; A uses GMMA::ALayout_64x32, B uses GMMA::ABLayout.
+  using Shape_MNK = Shape<_64,_64,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout< 64, 32>;
+  using CLayout = GMMA::CLayout_64x64;
+  // Per-instance accumulate setting, initialized to ScaleOut::One (how consumers use it is not visible here).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for the sparse SM90 GMMA atom GMMA_64x96x32_F32BF16BF16_SS: float C/D, bfloat16_t B, A as sparse_elem<2, bfloat16_t>.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, bfloat16_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand type; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // _SS form: FrgTypeA and FrgTypeB are both GMMA::smem_desc, parameterized by tnspA and tnspB.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape_MNK matches the 64x96x32 in the atom name; ThrID is a 128-entry layout; A and B both use GMMA::ABLayout.
+  using Shape_MNK = Shape<_64,_96,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout< 96, 32>;
+  using CLayout = GMMA::CLayout_64x96;
+  // Per-instance accumulate setting, initialized to ScaleOut::One (how consumers use it is not visible here).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for the sparse SM90 GMMA atom GMMA_64x96x32_F32BF16BF16_RS: float C/D, bfloat16_t B, A as sparse_elem<2, bfloat16_t>.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, bfloat16_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand type; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // _RS form: only FrgTypeB is declared (GMMA::smem_desc<tnspB>); no FrgTypeA appears in this specialization.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape_MNK matches the 64x96x32 in the atom name; ThrID is a 128-entry layout; A uses GMMA::ALayout_64x32, B uses GMMA::ABLayout.
+  using Shape_MNK = Shape<_64,_96,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout< 96, 32>;
+  using CLayout = GMMA::CLayout_64x96;
+  // Per-instance accumulate setting, initialized to ScaleOut::One (how consumers use it is not visible here).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for the sparse SM90 GMMA atom GMMA_64x128x32_F32BF16BF16_SS: float C/D, bfloat16_t B, A as sparse_elem<2, bfloat16_t>.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, bfloat16_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand type; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // _SS form: FrgTypeA and FrgTypeB are both GMMA::smem_desc, parameterized by tnspA and tnspB.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape_MNK matches the 64x128x32 in the atom name; ThrID is a 128-entry layout; A and B both use GMMA::ABLayout.
+  using Shape_MNK = Shape<_64,_128,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout<128, 32>;
+  using CLayout = GMMA::CLayout_64x128;
+  // Per-instance accumulate setting, initialized to ScaleOut::One (how consumers use it is not visible here).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for the sparse SM90 GMMA atom GMMA_64x128x32_F32BF16BF16_RS: float C/D, bfloat16_t B, A as sparse_elem<2, bfloat16_t>.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, bfloat16_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand type; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // _RS form: only FrgTypeB is declared (GMMA::smem_desc<tnspB>); no FrgTypeA appears in this specialization.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape_MNK matches the 64x128x32 in the atom name; ThrID is a 128-entry layout; A uses GMMA::ALayout_64x32, B uses GMMA::ABLayout.
+  using Shape_MNK = Shape<_64,_128,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout<128, 32>;
+  using CLayout = GMMA::CLayout_64x128;
+  // Per-instance accumulate setting, initialized to ScaleOut::One (how consumers use it is not visible here).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for the sparse SM90 GMMA atom GMMA_64x192x32_F32BF16BF16_SS: float C/D, bfloat16_t B, A as sparse_elem<2, bfloat16_t>.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, bfloat16_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand type; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // _SS form: FrgTypeA and FrgTypeB are both GMMA::smem_desc, parameterized by tnspA and tnspB.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape_MNK matches the 64x192x32 in the atom name; ThrID is a 128-entry layout; A and B both use GMMA::ABLayout.
+  using Shape_MNK = Shape<_64,_192,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout<192, 32>;
+  using CLayout = GMMA::CLayout_64x192;
+  // Per-instance accumulate setting, initialized to ScaleOut::One (how consumers use it is not visible here).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for the sparse SM90 GMMA atom GMMA_64x192x32_F32BF16BF16_RS: float C/D, bfloat16_t B, A as sparse_elem<2, bfloat16_t>.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, bfloat16_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand type; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // _RS form: only FrgTypeB is declared (GMMA::smem_desc<tnspB>); no FrgTypeA appears in this specialization.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape_MNK matches the 64x192x32 in the atom name; ThrID is a 128-entry layout; A uses GMMA::ALayout_64x32, B uses GMMA::ABLayout.
+  using Shape_MNK = Shape<_64,_192,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout<192, 32>;
+  using CLayout = GMMA::CLayout_64x192;
+  // Per-instance accumulate setting, initialized to ScaleOut::One (how consumers use it is not visible here).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for the sparse SM90 GMMA atom GMMA_64x256x32_F32BF16BF16_SS: float C/D, bfloat16_t B, A as sparse_elem<2, bfloat16_t>.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, bfloat16_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand type; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // _SS form: FrgTypeA and FrgTypeB are both GMMA::smem_desc, parameterized by tnspA and tnspB.
+  using FrgTypeA = GMMA::smem_desc<tnspA>;
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape_MNK matches the 64x256x32 in the atom name; ThrID is a 128-entry layout; A and B both use GMMA::ABLayout.
+  using Shape_MNK = Shape<_64,_256,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout<256, 32>;
+  using CLayout = GMMA::CLayout_64x256;
+  // Per-instance accumulate setting, initialized to ScaleOut::One (how consumers use it is not visible here).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for the sparse SM90 GMMA atom GMMA_64x256x32_F32BF16BF16_RS: float C/D, bfloat16_t B, A as sparse_elem<2, bfloat16_t>.
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, bfloat16_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand type; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = float;
+  // _RS form: only FrgTypeB is declared (GMMA::smem_desc<tnspB>); no FrgTypeA appears in this specialization.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+  // Shape_MNK matches the 64x256x32 in the atom name; ThrID is a 128-entry layout; A uses GMMA::ALayout_64x32, B uses GMMA::ABLayout.
+  using Shape_MNK = Shape<_64,_256,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32;
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout<256, 32>;
+  using CLayout = GMMA::CLayout_64x256;
+  // Per-instance accumulate setting, initialized to ScaleOut::One (how consumers use it is not visible here).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for the sparse SM90 GMMA atom GMMA_64x8x16_F32TF32TF32_SS_TN: float C/D, tfloat32_t B, A as sparse_elem<2, tfloat32_t>.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;  // E operand type; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // _SS_TN form: no Major template parameters; FrgTypeA and FrgTypeB are both fixed to GMMA::smem_desc<GMMA::Major::K>.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK matches the 64x8x16 in the atom name; ThrID is a 128-entry layout; A and B both use GMMA::ABLayout.
+  using Shape_MNK = Shape<_64,_8,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using BLayout = GMMA::ABLayout<  8, 16>;
+  using CLayout = GMMA::CLayout_64x8;
+  // Per-instance accumulate setting, initialized to ScaleOut::One (how consumers use it is not visible here).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for the sparse SM90 GMMA atom GMMA_64x8x16_F32TF32TF32_RS_TN: float C/D, tfloat32_t B, A as sparse_elem<2, tfloat32_t>.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;  // E operand type; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // _RS_TN form: only FrgTypeB is declared, fixed to GMMA::smem_desc<GMMA::Major::K>; no FrgTypeA appears here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK matches the 64x8x16 in the atom name; ThrID is a 128-entry layout; A uses GMMA::ALayout_64x16, B uses GMMA::ABLayout.
+  using Shape_MNK = Shape<_64,_8,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;
+  using ELayout = GMMA::ELayout_64x16;
+  using BLayout = GMMA::ABLayout<  8, 16>;
+  using CLayout = GMMA::CLayout_64x8;
+  // Per-instance accumulate setting, initialized to ScaleOut::One (how consumers use it is not visible here).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for the sparse SM90 GMMA atom GMMA_64x16x16_F32TF32TF32_SS_TN: float C/D, tfloat32_t B, A as sparse_elem<2, tfloat32_t>.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;  // E operand type; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // _SS_TN form: no Major template parameters; FrgTypeA and FrgTypeB are both fixed to GMMA::smem_desc<GMMA::Major::K>.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK matches the 64x16x16 in the atom name; ThrID is a 128-entry layout; A and B both use GMMA::ABLayout.
+  using Shape_MNK = Shape<_64,_16,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using BLayout = GMMA::ABLayout< 16, 16>;
+  using CLayout = GMMA::CLayout_64x16;
+  // Per-instance accumulate setting, initialized to ScaleOut::One (how consumers use it is not visible here).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for the sparse SM90 GMMA atom GMMA_64x16x16_F32TF32TF32_RS_TN: float C/D, tfloat32_t B, A as sparse_elem<2, tfloat32_t>.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;  // E operand type; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // _RS_TN form: only FrgTypeB is declared, fixed to GMMA::smem_desc<GMMA::Major::K>; no FrgTypeA appears here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK matches the 64x16x16 in the atom name; ThrID is a 128-entry layout; A uses GMMA::ALayout_64x16, B uses GMMA::ABLayout.
+  using Shape_MNK = Shape<_64,_16,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;
+  using ELayout = GMMA::ELayout_64x16;
+  using BLayout = GMMA::ABLayout< 16, 16>;
+  using CLayout = GMMA::CLayout_64x16;
+  // Per-instance accumulate setting, initialized to ScaleOut::One (how consumers use it is not visible here).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for the sparse SM90 GMMA atom GMMA_64x32x16_F32TF32TF32_SS_TN: float C/D, tfloat32_t B, A as sparse_elem<2, tfloat32_t>.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;  // E operand type; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // _SS_TN form: no Major template parameters; FrgTypeA and FrgTypeB are both fixed to GMMA::smem_desc<GMMA::Major::K>.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK matches the 64x32x16 in the atom name; ThrID is a 128-entry layout; A and B both use GMMA::ABLayout.
+  using Shape_MNK = Shape<_64,_32,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using BLayout = GMMA::ABLayout< 32, 16>;
+  using CLayout = GMMA::CLayout_64x32;
+  // Per-instance accumulate setting, initialized to ScaleOut::One (how consumers use it is not visible here).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for the sparse SM90 GMMA atom GMMA_64x32x16_F32TF32TF32_RS_TN: float C/D, tfloat32_t B, A as sparse_elem<2, tfloat32_t>.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;  // E operand type; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // _RS_TN form: only FrgTypeB is declared, fixed to GMMA::smem_desc<GMMA::Major::K>; no FrgTypeA appears here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK matches the 64x32x16 in the atom name; ThrID is a 128-entry layout; A uses GMMA::ALayout_64x16, B uses GMMA::ABLayout.
+  using Shape_MNK = Shape<_64,_32,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;
+  using ELayout = GMMA::ELayout_64x16;
+  using BLayout = GMMA::ABLayout< 32, 16>;
+  using CLayout = GMMA::CLayout_64x32;
+  // Per-instance accumulate setting, initialized to ScaleOut::One (how consumers use it is not visible here).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for the sparse SM90 GMMA atom GMMA_64x64x16_F32TF32TF32_SS_TN: float C/D, tfloat32_t B, A as sparse_elem<2, tfloat32_t>.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;  // E operand type; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // _SS_TN form: no Major template parameters; FrgTypeA and FrgTypeB are both fixed to GMMA::smem_desc<GMMA::Major::K>.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK matches the 64x64x16 in the atom name; ThrID is a 128-entry layout; A and B both use GMMA::ABLayout.
+  using Shape_MNK = Shape<_64,_64,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using BLayout = GMMA::ABLayout< 64, 16>;
+  using CLayout = GMMA::CLayout_64x64;
+  // Per-instance accumulate setting, initialized to ScaleOut::One (how consumers use it is not visible here).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for the sparse SM90 GMMA atom GMMA_64x64x16_F32TF32TF32_RS_TN: float C/D, tfloat32_t B, A as sparse_elem<2, tfloat32_t>.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;  // E operand type; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // _RS_TN form: only FrgTypeB is declared, fixed to GMMA::smem_desc<GMMA::Major::K>; no FrgTypeA appears here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK matches the 64x64x16 in the atom name; ThrID is a 128-entry layout; A uses GMMA::ALayout_64x16, B uses GMMA::ABLayout.
+  using Shape_MNK = Shape<_64,_64,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;
+  using ELayout = GMMA::ELayout_64x16;
+  using BLayout = GMMA::ABLayout< 64, 16>;
+  using CLayout = GMMA::CLayout_64x64;
+  // Per-instance accumulate setting, initialized to ScaleOut::One (how consumers use it is not visible here).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for the sparse SM90 GMMA atom GMMA_64x96x16_F32TF32TF32_SS_TN: float C/D, tfloat32_t B, A as sparse_elem<2, tfloat32_t>.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;  // E operand type; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // _SS_TN form: no Major template parameters; FrgTypeA and FrgTypeB are both fixed to GMMA::smem_desc<GMMA::Major::K>.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK matches the 64x96x16 in the atom name; ThrID is a 128-entry layout; A and B both use GMMA::ABLayout.
+  using Shape_MNK = Shape<_64,_96,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using BLayout = GMMA::ABLayout< 96, 16>;
+  using CLayout = GMMA::CLayout_64x96;
+  // Per-instance accumulate setting, initialized to ScaleOut::One (how consumers use it is not visible here).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for the sparse SM90 GMMA atom GMMA_64x96x16_F32TF32TF32_RS_TN: float C/D, tfloat32_t B, A as sparse_elem<2, tfloat32_t>.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;  // E operand type; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // _RS_TN form: only FrgTypeB is declared, fixed to GMMA::smem_desc<GMMA::Major::K>; no FrgTypeA appears here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK matches the 64x96x16 in the atom name; ThrID is a 128-entry layout; A uses GMMA::ALayout_64x16, B uses GMMA::ABLayout.
+  using Shape_MNK = Shape<_64,_96,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;
+  using ELayout = GMMA::ELayout_64x16;
+  using BLayout = GMMA::ABLayout< 96, 16>;
+  using CLayout = GMMA::CLayout_64x96;
+  // Per-instance accumulate setting, initialized to ScaleOut::One (how consumers use it is not visible here).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for the sparse SM90 GMMA atom GMMA_64x128x16_F32TF32TF32_SS_TN: float C/D, tfloat32_t B, A as sparse_elem<2, tfloat32_t>.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;  // E operand type; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // _SS_TN form: no Major template parameters; FrgTypeA and FrgTypeB are both fixed to GMMA::smem_desc<GMMA::Major::K>.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK matches the 64x128x16 in the atom name; ThrID is a 128-entry layout; A and B both use GMMA::ABLayout.
+  using Shape_MNK = Shape<_64,_128,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using BLayout = GMMA::ABLayout<128, 16>;
+  using CLayout = GMMA::CLayout_64x128;
+  // Per-instance accumulate setting, initialized to ScaleOut::One (how consumers use it is not visible here).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for the sparse SM90 GMMA atom GMMA_64x128x16_F32TF32TF32_RS_TN: float C/D, tfloat32_t B, A as sparse_elem<2, tfloat32_t>.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;  // E operand type; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // _RS_TN form: only FrgTypeB is declared, fixed to GMMA::smem_desc<GMMA::Major::K>; no FrgTypeA appears here.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK matches the 64x128x16 in the atom name; ThrID is a 128-entry layout; A uses GMMA::ALayout_64x16, B uses GMMA::ABLayout.
+  using Shape_MNK = Shape<_64,_128,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;
+  using ELayout = GMMA::ELayout_64x16;
+  using BLayout = GMMA::ABLayout<128, 16>;
+  using CLayout = GMMA::CLayout_64x128;
+  // Per-instance accumulate setting, initialized to ScaleOut::One (how consumers use it is not visible here).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for the sparse SM90 GMMA atom GMMA_64x192x16_F32TF32TF32_SS_TN: float C/D, tfloat32_t B, A as sparse_elem<2, tfloat32_t>.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;  // E operand type; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+  // _SS_TN form: no Major template parameters; FrgTypeA and FrgTypeB are both fixed to GMMA::smem_desc<GMMA::Major::K>.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Shape_MNK matches the 64x192x16 in the atom name; ThrID is a 128-entry layout; A and B both use GMMA::ABLayout.
+  using Shape_MNK = Shape<_64,_192,_16>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using BLayout = GMMA::ABLayout<192, 16>;
+  using CLayout = GMMA::CLayout_64x192;
+  // Per-instance accumulate setting, initialized to ScaleOut::One (how consumers use it is not visible here).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA_64x192x16_F32TF32TF32_RS_TN atom: float D/C, tfloat32_t B, sparse_elem-wrapped tfloat32_t A.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared here
+
+  using Shape_MNK = Shape<_64,_192,_16>;  // atom tile: M=64, N=192, K=16
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;  // RS form: ALayout_64x16 (the SS traits use ABLayout< 64, 16>)
+  using ELayout = GMMA::ELayout_64x16;
+  using BLayout = GMMA::ABLayout<192, 16>;
+  using CLayout = GMMA::CLayout_64x192;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA_64x256x16_F32TF32TF32_SS_TN atom: float D/C, tfloat32_t B, sparse_elem-wrapped tfloat32_t A.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_256,_16>;  // atom tile: M=64, N=256, K=16
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using BLayout = GMMA::ABLayout<256, 16>;
+  using CLayout = GMMA::CLayout_64x256;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA_64x256x16_F32TF32TF32_RS_TN atom: float D/C, tfloat32_t B, sparse_elem-wrapped tfloat32_t A.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = tfloat32_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared here
+
+  using Shape_MNK = Shape<_64,_256,_16>;  // atom tile: M=64, N=256, K=16
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x16;  // RS form: ALayout_64x16 (the SS traits use ABLayout< 64, 16>)
+  using ELayout = GMMA::ELayout_64x16;
+  using BLayout = GMMA::ABLayout<256, 16>;
+  using CLayout = GMMA::CLayout_64x256;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8S8_SS_TN<spsel>>
+{ // Traits for the sparse GMMA_64x8x64_S32S8S8_SS_TN atom: int32_t D/C, int8_t B, sparse_elem-wrapped int8_t A.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_8,_64>;  // atom tile: M=64, N=8, K=64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<  8, 64>;
+  using CLayout = GMMA::CLayout_64x8;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8S8_SS_TN_SATURATE<spsel>>
+{ // Traits for the sparse GMMA_64x8x64_S32S8S8_SS_TN_SATURATE atom; aliases match the non-_SATURATE traits.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_8,_64>;  // atom tile: M=64, N=8, K=64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<  8, 64>;
+  using CLayout = GMMA::CLayout_64x8;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8S8_SS_TN<spsel>>
+{ // Traits for the sparse GMMA_64x16x64_S32S8S8_SS_TN atom: int32_t D/C, int8_t B, sparse_elem-wrapped int8_t A.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_16,_64>;  // atom tile: M=64, N=16, K=64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 16, 64>;
+  using CLayout = GMMA::CLayout_64x16;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8S8_SS_TN_SATURATE<spsel>>
+{ // Traits for the sparse GMMA_64x16x64_S32S8S8_SS_TN_SATURATE atom; aliases match the non-_SATURATE traits.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_16,_64>;  // atom tile: M=64, N=16, K=64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 16, 64>;
+  using CLayout = GMMA::CLayout_64x16;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8S8_SS_TN<spsel>>
+{ // Traits for the sparse GMMA_64x32x64_S32S8S8_SS_TN atom: int32_t D/C, int8_t B, sparse_elem-wrapped int8_t A.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_32,_64>;  // atom tile: M=64, N=32, K=64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 32, 64>;
+  using CLayout = GMMA::CLayout_64x32;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8S8_SS_TN_SATURATE<spsel>>
+{ // Traits for the sparse GMMA_64x32x64_S32S8S8_SS_TN_SATURATE atom; aliases match the non-_SATURATE traits.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_32,_64>;  // atom tile: M=64, N=32, K=64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 32, 64>;
+  using CLayout = GMMA::CLayout_64x32;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8S8_SS_TN<spsel>>
+{ // Traits for the sparse GMMA_64x64x64_S32S8S8_SS_TN atom: int32_t D/C, int8_t B, sparse_elem-wrapped int8_t A.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_64,_64>;  // atom tile: M=64, N=64, K=64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 64, 64>;
+  using CLayout = GMMA::CLayout_64x64;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8S8_SS_TN_SATURATE<spsel>>
+{ // Traits for the sparse GMMA_64x64x64_S32S8S8_SS_TN_SATURATE atom; aliases match the non-_SATURATE traits.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_64,_64>;  // atom tile: M=64, N=64, K=64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 64, 64>;
+  using CLayout = GMMA::CLayout_64x64;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8S8_SS_TN<spsel>>
+{ // Traits for the sparse GMMA_64x96x64_S32S8S8_SS_TN atom: int32_t D/C, int8_t B, sparse_elem-wrapped int8_t A.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_96,_64>;  // atom tile: M=64, N=96, K=64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 96, 64>;
+  using CLayout = GMMA::CLayout_64x96;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8S8_SS_TN_SATURATE<spsel>>
+{ // Traits for the sparse GMMA_64x96x64_S32S8S8_SS_TN_SATURATE atom; aliases match the non-_SATURATE traits.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_96,_64>;  // atom tile: M=64, N=96, K=64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 96, 64>;
+  using CLayout = GMMA::CLayout_64x96;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8S8_SS_TN<spsel>>
+{ // Traits for the sparse GMMA_64x128x64_S32S8S8_SS_TN atom: int32_t D/C, int8_t B, sparse_elem-wrapped int8_t A.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_128,_64>;  // atom tile: M=64, N=128, K=64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<128, 64>;
+  using CLayout = GMMA::CLayout_64x128;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8S8_SS_TN_SATURATE<spsel>>
+{ // Traits for the sparse GMMA_64x128x64_S32S8S8_SS_TN_SATURATE atom; aliases match the non-_SATURATE traits.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_128,_64>;  // atom tile: M=64, N=128, K=64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<128, 64>;
+  using CLayout = GMMA::CLayout_64x128;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8S8_SS_TN<spsel>>
+{ // Traits for the sparse GMMA_64x192x64_S32S8S8_SS_TN atom: int32_t D/C, int8_t B, sparse_elem-wrapped int8_t A.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_192,_64>;  // atom tile: M=64, N=192, K=64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<192, 64>;
+  using CLayout = GMMA::CLayout_64x192;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8S8_SS_TN_SATURATE<spsel>>
+{ // Traits for the sparse GMMA_64x192x64_S32S8S8_SS_TN_SATURATE atom; aliases match the non-_SATURATE traits.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_192,_64>;  // atom tile: M=64, N=192, K=64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<192, 64>;
+  using CLayout = GMMA::CLayout_64x192;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8S8_SS_TN<spsel>>
+{ // Traits for the sparse GMMA_64x256x64_S32S8S8_SS_TN atom: int32_t D/C, int8_t B, sparse_elem-wrapped int8_t A.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_256,_64>;  // atom tile: M=64, N=256, K=64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<256, 64>;
+  using CLayout = GMMA::CLayout_64x256;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8S8_SS_TN_SATURATE<spsel>>
+{ // Traits for the sparse GMMA_64x256x64_S32S8S8_SS_TN_SATURATE atom; aliases match the non-_SATURATE traits.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_256,_64>;  // atom tile: M=64, N=256, K=64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<256, 64>;
+  using CLayout = GMMA::CLayout_64x256;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8S8_RS_TN<spsel>>
+{ // Traits for the sparse GMMA_64x8x64_S32S8S8_RS_TN atom: int32_t D/C, int8_t B, sparse_elem-wrapped int8_t A.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared here
+
+  using Shape_MNK = Shape<_64,_8,_64>;  // atom tile: M=64, N=8, K=64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // RS form: ALayout_64x64 (the SS traits use ABLayout< 64, 64>)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<  8, 64>;
+  using CLayout = GMMA::CLayout_64x8;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8S8_RS_TN_SATURATE<spsel>>
+{ // Traits for the sparse GMMA_64x8x64_S32S8S8_RS_TN_SATURATE atom; aliases match the non-_SATURATE traits.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared here
+
+  using Shape_MNK = Shape<_64,_8,_64>;  // atom tile: M=64, N=8, K=64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // RS form: ALayout_64x64 (the SS traits use ABLayout< 64, 64>)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<  8, 64>;
+  using CLayout = GMMA::CLayout_64x8;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8S8_RS_TN<spsel>>
+{ // Traits for the sparse GMMA_64x16x64_S32S8S8_RS_TN atom: int32_t D/C, int8_t B, sparse_elem-wrapped int8_t A.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared here
+
+  using Shape_MNK = Shape<_64,_16,_64>;  // atom tile: M=64, N=16, K=64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // RS form: ALayout_64x64 (the SS traits use ABLayout< 64, 64>)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 16, 64>;
+  using CLayout = GMMA::CLayout_64x16;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8S8_RS_TN_SATURATE<spsel>>
+{ // Traits for the sparse GMMA_64x16x64_S32S8S8_RS_TN_SATURATE atom; aliases match the non-_SATURATE traits.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared here
+
+  using Shape_MNK = Shape<_64,_16,_64>;  // atom tile: M=64, N=16, K=64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // RS form: ALayout_64x64 (the SS traits use ABLayout< 64, 64>)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 16, 64>;
+  using CLayout = GMMA::CLayout_64x16;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8S8_RS_TN<spsel>>
+{ // Traits for the sparse GMMA_64x32x64_S32S8S8_RS_TN atom: int32_t D/C, int8_t B, sparse_elem-wrapped int8_t A.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared here
+
+  using Shape_MNK = Shape<_64,_32,_64>;  // atom tile: M=64, N=32, K=64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // RS form: ALayout_64x64 (the SS traits use ABLayout< 64, 64>)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 32, 64>;
+  using CLayout = GMMA::CLayout_64x32;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8S8_RS_TN_SATURATE<spsel>>
+{ // Traits for the sparse GMMA_64x32x64_S32S8S8_RS_TN_SATURATE atom; aliases match the non-_SATURATE traits.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared here
+
+  using Shape_MNK = Shape<_64,_32,_64>;  // atom tile: M=64, N=32, K=64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // RS form: ALayout_64x64 (the SS traits use ABLayout< 64, 64>)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 32, 64>;
+  using CLayout = GMMA::CLayout_64x32;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8S8_RS_TN<spsel>>
+{ // Traits for the sparse GMMA_64x64x64_S32S8S8_RS_TN atom: int32_t D/C, int8_t B, sparse_elem-wrapped int8_t A.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared here
+
+  using Shape_MNK = Shape<_64,_64,_64>;  // atom tile: M=64, N=64, K=64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // RS form: ALayout_64x64 (the SS traits use ABLayout< 64, 64>)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 64, 64>;
+  using CLayout = GMMA::CLayout_64x64;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8S8_RS_TN_SATURATE<spsel>>
+{ // Traits for the sparse GMMA_64x64x64_S32S8S8_RS_TN_SATURATE atom; aliases match the non-_SATURATE traits.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared here
+
+  using Shape_MNK = Shape<_64,_64,_64>;  // atom tile: M=64, N=64, K=64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // RS form: ALayout_64x64 (the SS traits use ABLayout< 64, 64>)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 64, 64>;
+  using CLayout = GMMA::CLayout_64x64;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8S8_RS_TN<spsel>>
+{ // Traits for the sparse GMMA_64x96x64_S32S8S8_RS_TN atom: int32_t D/C, int8_t B, sparse_elem-wrapped int8_t A.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared here
+
+  using Shape_MNK = Shape<_64,_96,_64>;  // atom tile: M=64, N=96, K=64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // RS form: ALayout_64x64 (the SS traits use ABLayout< 64, 64>)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 96, 64>;
+  using CLayout = GMMA::CLayout_64x96;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8S8_RS_TN_SATURATE<spsel>>
+{ // Traits for the sparse GMMA_64x96x64_S32S8S8_RS_TN_SATURATE atom; aliases match the non-_SATURATE traits.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared here
+
+  using Shape_MNK = Shape<_64,_96,_64>;  // atom tile: M=64, N=96, K=64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // RS form: ALayout_64x64 (the SS traits use ABLayout< 64, 64>)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 96, 64>;
+  using CLayout = GMMA::CLayout_64x96;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8S8_RS_TN<spsel>>
+{ // Traits for the sparse GMMA_64x128x64_S32S8S8_RS_TN atom: int32_t D/C, int8_t B, sparse_elem-wrapped int8_t A.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared here
+
+  using Shape_MNK = Shape<_64,_128,_64>;  // atom tile: M=64, N=128, K=64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // RS form: ALayout_64x64 (the SS traits use ABLayout< 64, 64>)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<128, 64>;
+  using CLayout = GMMA::CLayout_64x128;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8S8_RS_TN_SATURATE<spsel>>
+{ // Traits for the sparse GMMA_64x128x64_S32S8S8_RS_TN_SATURATE atom; aliases match the non-_SATURATE traits.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared here
+
+  using Shape_MNK = Shape<_64,_128,_64>;  // atom tile: M=64, N=128, K=64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // RS form: ALayout_64x64 (the SS traits use ABLayout< 64, 64>)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<128, 64>;
+  using CLayout = GMMA::CLayout_64x128;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8S8_RS_TN<spsel>>
+{ // Traits for the sparse GMMA_64x192x64_S32S8S8_RS_TN atom: int32_t D/C, int8_t B, sparse_elem-wrapped int8_t A.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared here
+
+  using Shape_MNK = Shape<_64,_192,_64>;  // atom tile: M=64, N=192, K=64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // RS form: ALayout_64x64 (the SS traits use ABLayout< 64, 64>)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<192, 64>;
+  using CLayout = GMMA::CLayout_64x192;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8S8_RS_TN_SATURATE<spsel>>
+{ // Traits for the sparse GMMA_64x192x64_S32S8S8_RS_TN_SATURATE atom; aliases match the non-_SATURATE traits.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared here
+
+  using Shape_MNK = Shape<_64,_192,_64>;  // atom tile: M=64, N=192, K=64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // RS form: ALayout_64x64 (the SS traits use ABLayout< 64, 64>)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<192, 64>;
+  using CLayout = GMMA::CLayout_64x192;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8S8_RS_TN<spsel>>
+{ // Traits for the sparse GMMA_64x256x64_S32S8S8_RS_TN atom: int32_t D/C, int8_t B, sparse_elem-wrapped int8_t A.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared here
+
+  using Shape_MNK = Shape<_64,_256,_64>;  // atom tile: M=64, N=256, K=64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // RS form: ALayout_64x64 (the SS traits use ABLayout< 64, 64>)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<256, 64>;
+  using CLayout = GMMA::CLayout_64x256;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8S8_RS_TN_SATURATE<spsel>>
+{ // Traits for the sparse GMMA_64x256x64_S32S8S8_RS_TN_SATURATE atom; aliases match the non-_SATURATE traits.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc fragment; no FrgTypeA is declared here
+
+  using Shape_MNK = Shape<_64,_256,_64>;  // atom tile: M=64, N=256, K=64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // RS form: ALayout_64x64 (the SS traits use ABLayout< 64, 64>)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<256, 64>;
+  using CLayout = GMMA::CLayout_64x256;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8U8_SS_TN<spsel>>
+{ // Traits for the sparse GMMA_64x8x64_S32S8U8_SS_TN atom: int32_t D/C, uint8_t B, sparse_elem-wrapped int8_t A.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_8,_64>;  // atom tile: M=64, N=8, K=64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<  8, 64>;
+  using CLayout = GMMA::CLayout_64x8;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8U8_SS_TN_SATURATE<spsel>>
+{ // Traits for the sparse GMMA_64x8x64_S32S8U8_SS_TN_SATURATE atom; aliases match the non-_SATURATE traits.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_8,_64>;  // atom tile: M=64, N=8, K=64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<  8, 64>;
+  using CLayout = GMMA::CLayout_64x8;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8U8_SS_TN<spsel>>
+{ // Traits for the sparse GMMA_64x16x64_S32S8U8_SS_TN atom: int32_t D/C, uint8_t B, sparse_elem-wrapped int8_t A.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_16,_64>;  // atom tile: M=64, N=16, K=64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 16, 64>;
+  using CLayout = GMMA::CLayout_64x16;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8U8_SS_TN_SATURATE<spsel>>
+{ // Traits for the sparse GMMA_64x16x64_S32S8U8_SS_TN_SATURATE atom; aliases match the non-_SATURATE traits.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_16,_64>;  // atom tile: M=64, N=16, K=64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 16, 64>;
+  using CLayout = GMMA::CLayout_64x16;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8U8_SS_TN<spsel>>
+{ // Traits for the sparse GMMA_64x32x64_S32S8U8_SS_TN atom: int32_t D/C, uint8_t B, sparse_elem-wrapped int8_t A.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_32,_64>;  // atom tile: M=64, N=32, K=64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 32, 64>;
+  using CLayout = GMMA::CLayout_64x32;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8U8_SS_TN_SATURATE<spsel>>
+{ // Traits for the sparse GMMA_64x32x64_S32S8U8_SS_TN_SATURATE atom; aliases match the non-_SATURATE traits.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E element type; presumably the sparsity metadata -- confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_32,_64>;  // atom tile: M=64, N=32, K=64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 32, 64>;
+  using CLayout = GMMA::CLayout_64x32;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, default ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x64x64 S32S8U8 SS_TN sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8U8_SS_TN<spsel>>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;  // A values: int8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B are both described by a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_64,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 64, 64>;
+  using CLayout = GMMA::CLayout_64x64;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x64x64 S32S8U8 SS_TN_SATURATE sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8U8_SS_TN_SATURATE<spsel>>
+{  // nothing in this body refers to saturation; _SATURATE appears only in the atom type name
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;  // A values: int8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B are both described by a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_64,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 64, 64>;
+  using CLayout = GMMA::CLayout_64x64;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x96x64 S32S8U8 SS_TN sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8U8_SS_TN<spsel>>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;  // A values: int8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B are both described by a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_96,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 96, 64>;
+  using CLayout = GMMA::CLayout_64x96;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x96x64 S32S8U8 SS_TN_SATURATE sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8U8_SS_TN_SATURATE<spsel>>
+{  // nothing in this body refers to saturation; _SATURATE appears only in the atom type name
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;  // A values: int8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B are both described by a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_96,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 96, 64>;
+  using CLayout = GMMA::CLayout_64x96;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x128x64 S32S8U8 SS_TN sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8U8_SS_TN<spsel>>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;  // A values: int8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B are both described by a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_128,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<128, 64>;
+  using CLayout = GMMA::CLayout_64x128;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x128x64 S32S8U8 SS_TN_SATURATE sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8U8_SS_TN_SATURATE<spsel>>
+{  // nothing in this body refers to saturation; _SATURATE appears only in the atom type name
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;  // A values: int8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B are both described by a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_128,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<128, 64>;
+  using CLayout = GMMA::CLayout_64x128;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x192x64 S32S8U8 SS_TN sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8U8_SS_TN<spsel>>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;  // A values: int8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B are both described by a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_192,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<192, 64>;
+  using CLayout = GMMA::CLayout_64x192;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x192x64 S32S8U8 SS_TN_SATURATE sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8U8_SS_TN_SATURATE<spsel>>
+{  // nothing in this body refers to saturation; _SATURATE appears only in the atom type name
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;  // A values: int8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B are both described by a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_192,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<192, 64>;
+  using CLayout = GMMA::CLayout_64x192;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x256x64 S32S8U8 SS_TN sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8U8_SS_TN<spsel>>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;  // A values: int8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B are both described by a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_256,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<256, 64>;
+  using CLayout = GMMA::CLayout_64x256;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x256x64 S32S8U8 SS_TN_SATURATE sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8U8_SS_TN_SATURATE<spsel>>
+{  // nothing in this body refers to saturation; _SATURATE appears only in the atom type name
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;  // A values: int8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B are both described by a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_256,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<256, 64>;
+  using CLayout = GMMA::CLayout_64x256;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x8x64 S32S8U8 RS_TN sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8U8_RS_TN<spsel>>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;  // A values: int8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS: only B gets a smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_8,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout (the SS traits use ABLayout< 64, 64> here)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<  8, 64>;
+  using CLayout = GMMA::CLayout_64x8;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x8x64 S32S8U8 RS_TN_SATURATE sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8U8_RS_TN_SATURATE<spsel>>
+{  // nothing in this body refers to saturation; _SATURATE appears only in the atom type name
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;  // A values: int8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS: only B gets a smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_8,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout (the SS traits use ABLayout< 64, 64> here)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<  8, 64>;
+  using CLayout = GMMA::CLayout_64x8;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x16x64 S32S8U8 RS_TN sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8U8_RS_TN<spsel>>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;  // A values: int8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS: only B gets a smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_16,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout (the SS traits use ABLayout< 64, 64> here)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 16, 64>;
+  using CLayout = GMMA::CLayout_64x16;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x16x64 S32S8U8 RS_TN_SATURATE sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8U8_RS_TN_SATURATE<spsel>>
+{  // nothing in this body refers to saturation; _SATURATE appears only in the atom type name
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;  // A values: int8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS: only B gets a smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_16,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout (the SS traits use ABLayout< 64, 64> here)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 16, 64>;
+  using CLayout = GMMA::CLayout_64x16;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x32x64 S32S8U8 RS_TN sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8U8_RS_TN<spsel>>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;  // A values: int8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS: only B gets a smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_32,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout (the SS traits use ABLayout< 64, 64> here)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 32, 64>;
+  using CLayout = GMMA::CLayout_64x32;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x32x64 S32S8U8 RS_TN_SATURATE sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8U8_RS_TN_SATURATE<spsel>>
+{  // nothing in this body refers to saturation; _SATURATE appears only in the atom type name
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;  // A values: int8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS: only B gets a smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_32,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout (the SS traits use ABLayout< 64, 64> here)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 32, 64>;
+  using CLayout = GMMA::CLayout_64x32;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x64x64 S32S8U8 RS_TN sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8U8_RS_TN<spsel>>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;  // A values: int8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS: only B gets a smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_64,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout (the SS traits use ABLayout< 64, 64> here)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 64, 64>;
+  using CLayout = GMMA::CLayout_64x64;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x64x64 S32S8U8 RS_TN_SATURATE sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8U8_RS_TN_SATURATE<spsel>>
+{  // nothing in this body refers to saturation; _SATURATE appears only in the atom type name
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;  // A values: int8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS: only B gets a smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_64,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout (the SS traits use ABLayout< 64, 64> here)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 64, 64>;
+  using CLayout = GMMA::CLayout_64x64;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x96x64 S32S8U8 RS_TN sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8U8_RS_TN<spsel>>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;  // A values: int8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS: only B gets a smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_96,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout (the SS traits use ABLayout< 64, 64> here)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 96, 64>;
+  using CLayout = GMMA::CLayout_64x96;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x96x64 S32S8U8 RS_TN_SATURATE sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8U8_RS_TN_SATURATE<spsel>>
+{  // nothing in this body refers to saturation; _SATURATE appears only in the atom type name
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;  // A values: int8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS: only B gets a smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_96,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout (the SS traits use ABLayout< 64, 64> here)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 96, 64>;
+  using CLayout = GMMA::CLayout_64x96;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x128x64 S32S8U8 RS_TN sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8U8_RS_TN<spsel>>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;  // A values: int8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS: only B gets a smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_128,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout (the SS traits use ABLayout< 64, 64> here)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<128, 64>;
+  using CLayout = GMMA::CLayout_64x128;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x128x64 S32S8U8 RS_TN_SATURATE sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8U8_RS_TN_SATURATE<spsel>>
+{  // nothing in this body refers to saturation; _SATURATE appears only in the atom type name
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;  // A values: int8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS: only B gets a smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_128,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout (the SS traits use ABLayout< 64, 64> here)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<128, 64>;
+  using CLayout = GMMA::CLayout_64x128;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x192x64 S32S8U8 RS_TN sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8U8_RS_TN<spsel>>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;  // A values: int8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS: only B gets a smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_192,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout (the SS traits use ABLayout< 64, 64> here)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<192, 64>;
+  using CLayout = GMMA::CLayout_64x192;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x192x64 S32S8U8 RS_TN_SATURATE sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8U8_RS_TN_SATURATE<spsel>>
+{  // nothing in this body refers to saturation; _SATURATE appears only in the atom type name
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;  // A values: int8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS: only B gets a smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_192,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout (the SS traits use ABLayout< 64, 64> here)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<192, 64>;
+  using CLayout = GMMA::CLayout_64x192;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x256x64 S32S8U8 RS_TN sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8U8_RS_TN<spsel>>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;  // A values: int8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS: only B gets a smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_256,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout (the SS traits use ABLayout< 64, 64> here)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<256, 64>;
+  using CLayout = GMMA::CLayout_64x256;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x256x64 S32S8U8 RS_TN_SATURATE sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8U8_RS_TN_SATURATE<spsel>>
+{  // nothing in this body refers to saturation; _SATURATE appears only in the atom type name
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, int8_t>;  // A values: int8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS: only B gets a smem_desc fragment; no FrgTypeA is declared
+
+  using Shape_MNK = Shape<_64,_256,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout (the SS traits use ABLayout< 64, 64> here)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<256, 64>;
+  using CLayout = GMMA::CLayout_64x256;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x8x64 S32U8S8 SS_TN sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8S8_SS_TN<spsel>>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B are both described by a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_8,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<  8, 64>;
+  using CLayout = GMMA::CLayout_64x8;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x8x64 S32U8S8 SS_TN_SATURATE sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8S8_SS_TN_SATURATE<spsel>>
+{  // nothing in this body refers to saturation; _SATURATE appears only in the atom type name
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B are both described by a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_8,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<  8, 64>;
+  using CLayout = GMMA::CLayout_64x8;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x16x64 S32U8S8 SS_TN sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8S8_SS_TN<spsel>>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B are both described by a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_16,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 16, 64>;
+  using CLayout = GMMA::CLayout_64x16;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x16x64 S32U8S8 SS_TN_SATURATE sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8S8_SS_TN_SATURATE<spsel>>
+{  // nothing in this body refers to saturation; _SATURATE appears only in the atom type name
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B are both described by a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_16,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 16, 64>;
+  using CLayout = GMMA::CLayout_64x16;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x32x64 S32U8S8 SS_TN sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8S8_SS_TN<spsel>>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B are both described by a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_32,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 32, 64>;
+  using CLayout = GMMA::CLayout_64x32;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x32x64 S32U8S8 SS_TN_SATURATE sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8S8_SS_TN_SATURATE<spsel>>
+{  // nothing in this body refers to saturation; _SATURATE appears only in the atom type name
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B are both described by a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_32,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 32, 64>;
+  using CLayout = GMMA::CLayout_64x32;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x64x64 S32U8S8 SS_TN sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8S8_SS_TN<spsel>>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B are both described by a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_64,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 64, 64>;
+  using CLayout = GMMA::CLayout_64x64;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x64x64 S32U8S8 SS_TN_SATURATE sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8S8_SS_TN_SATURATE<spsel>>
+{  // nothing in this body refers to saturation; _SATURATE appears only in the atom type name
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B are both described by a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_64,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 64, 64>;
+  using CLayout = GMMA::CLayout_64x64;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x96x64 S32U8S8 SS_TN sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8S8_SS_TN<spsel>>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B are both described by a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_96,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 96, 64>;
+  using CLayout = GMMA::CLayout_64x96;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x96x64 S32U8S8 SS_TN_SATURATE sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8S8_SS_TN_SATURATE<spsel>>
+{  // nothing in this body refers to saturation; _SATURATE appears only in the atom type name
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B are both described by a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_96,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 96, 64>;
+  using CLayout = GMMA::CLayout_64x96;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x128x64 S32U8S8 SS_TN sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8S8_SS_TN<spsel>>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B are both described by a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_128,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<128, 64>;
+  using CLayout = GMMA::CLayout_64x128;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x128x64 S32U8S8 SS_TN_SATURATE sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8S8_SS_TN_SATURATE<spsel>>
+{  // nothing in this body refers to saturation; _SATURATE appears only in the atom type name
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B are both described by a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_128,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<128, 64>;
+  using CLayout = GMMA::CLayout_64x128;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x192x64 S32U8S8 SS_TN sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8S8_SS_TN<spsel>>
+{
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B are both described by a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_192,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<192, 64>;
+  using CLayout = GMMA::CLayout_64x192;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // Traits of the 64x192x64 S32U8S8 SS_TN_SATURATE sparse GMMA atom, for every spsel
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8S8_SS_TN_SATURATE<spsel>>
+{  // nothing in this body refers to saturation; _SATURATE appears only in the atom type name
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E operand; presumably the sparsity metadata -- TODO confirm
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS: A and B are both described by a K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_192,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<192, 64>;
+  using CLayout = GMMA::CLayout_64x192;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member; initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8S8_SS_TN<spsel>>
+{  // Traits (value types, fragment types, shape, layouts) for the sparse GMMA atom 64x256x64 named above.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: GMMA::smem_desc, K-major
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_256,_64>;  // M = 64, N = 256, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout, 64 x 64
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout<256, 64>;  // B layout, 256 x 64
+  using CLayout = GMMA::CLayout_64x256;  // C layout, 64 x 256
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8S8_SS_TN_SATURATE<spsel>>
+{  // Traits for the SATURATE-named sparse GMMA atom 64x256x64; NOTE(review): no member here encodes saturation -- presumably the atom does it.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: GMMA::smem_desc, K-major
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_256,_64>;  // M = 64, N = 256, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout, 64 x 64
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout<256, 64>;  // B layout, 256 x 64
+  using CLayout = GMMA::CLayout_64x256;  // C layout, 64 x 256
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8S8_RS_TN<spsel>>
+{  // Traits (value types, fragment type, shape, layouts) for the sparse GMMA atom 64x8x64 named above.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_8,_64>;  // M = 64, N = 8, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // dedicated 64 x 64 A layout; no FrgTypeA here (presumably A is register-sourced -- confirm)
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout<  8, 64>;  // B layout, 8 x 64
+  using CLayout = GMMA::CLayout_64x8;  // C layout, 64 x 8
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8S8_RS_TN_SATURATE<spsel>>
+{  // Traits for the SATURATE-named sparse GMMA atom 64x8x64; NOTE(review): no member here encodes saturation -- presumably the atom does it.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_8,_64>;  // M = 64, N = 8, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // dedicated 64 x 64 A layout; no FrgTypeA here (presumably A is register-sourced -- confirm)
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout<  8, 64>;  // B layout, 8 x 64
+  using CLayout = GMMA::CLayout_64x8;  // C layout, 64 x 8
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8S8_RS_TN<spsel>>
+{  // Traits (value types, fragment type, shape, layouts) for the sparse GMMA atom 64x16x64 named above.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_16,_64>;  // M = 64, N = 16, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // dedicated 64 x 64 A layout; no FrgTypeA here (presumably A is register-sourced -- confirm)
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout< 16, 64>;  // B layout, 16 x 64
+  using CLayout = GMMA::CLayout_64x16;  // C layout, 64 x 16
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8S8_RS_TN_SATURATE<spsel>>
+{  // Traits for the SATURATE-named sparse GMMA atom 64x16x64; NOTE(review): no member here encodes saturation -- presumably the atom does it.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_16,_64>;  // M = 64, N = 16, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // dedicated 64 x 64 A layout; no FrgTypeA here (presumably A is register-sourced -- confirm)
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout< 16, 64>;  // B layout, 16 x 64
+  using CLayout = GMMA::CLayout_64x16;  // C layout, 64 x 16
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8S8_RS_TN<spsel>>
+{  // Traits (value types, fragment type, shape, layouts) for the sparse GMMA atom 64x32x64 named above.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_32,_64>;  // M = 64, N = 32, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // dedicated 64 x 64 A layout; no FrgTypeA here (presumably A is register-sourced -- confirm)
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout< 32, 64>;  // B layout, 32 x 64
+  using CLayout = GMMA::CLayout_64x32;  // C layout, 64 x 32
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8S8_RS_TN_SATURATE<spsel>>
+{  // Traits for the SATURATE-named sparse GMMA atom 64x32x64; NOTE(review): no member here encodes saturation -- presumably the atom does it.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_32,_64>;  // M = 64, N = 32, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // dedicated 64 x 64 A layout; no FrgTypeA here (presumably A is register-sourced -- confirm)
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout< 32, 64>;  // B layout, 32 x 64
+  using CLayout = GMMA::CLayout_64x32;  // C layout, 64 x 32
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8S8_RS_TN<spsel>>
+{  // Traits (value types, fragment type, shape, layouts) for the sparse GMMA atom 64x64x64 named above.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_64,_64>;  // M = 64, N = 64, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // dedicated 64 x 64 A layout; no FrgTypeA here (presumably A is register-sourced -- confirm)
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout< 64, 64>;  // B layout, 64 x 64
+  using CLayout = GMMA::CLayout_64x64;  // C layout, 64 x 64
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8S8_RS_TN_SATURATE<spsel>>
+{  // Traits for the SATURATE-named sparse GMMA atom 64x64x64; NOTE(review): no member here encodes saturation -- presumably the atom does it.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_64,_64>;  // M = 64, N = 64, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // dedicated 64 x 64 A layout; no FrgTypeA here (presumably A is register-sourced -- confirm)
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout< 64, 64>;  // B layout, 64 x 64
+  using CLayout = GMMA::CLayout_64x64;  // C layout, 64 x 64
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8S8_RS_TN<spsel>>
+{  // Traits (value types, fragment type, shape, layouts) for the sparse GMMA atom 64x96x64 named above.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_96,_64>;  // M = 64, N = 96, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // dedicated 64 x 64 A layout; no FrgTypeA here (presumably A is register-sourced -- confirm)
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout< 96, 64>;  // B layout, 96 x 64
+  using CLayout = GMMA::CLayout_64x96;  // C layout, 64 x 96
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8S8_RS_TN_SATURATE<spsel>>
+{  // Traits for the SATURATE-named sparse GMMA atom 64x96x64; NOTE(review): no member here encodes saturation -- presumably the atom does it.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_96,_64>;  // M = 64, N = 96, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // dedicated 64 x 64 A layout; no FrgTypeA here (presumably A is register-sourced -- confirm)
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout< 96, 64>;  // B layout, 96 x 64
+  using CLayout = GMMA::CLayout_64x96;  // C layout, 64 x 96
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8S8_RS_TN<spsel>>
+{  // Traits (value types, fragment type, shape, layouts) for the sparse GMMA atom 64x128x64 named above.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_128,_64>;  // M = 64, N = 128, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // dedicated 64 x 64 A layout; no FrgTypeA here (presumably A is register-sourced -- confirm)
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout<128, 64>;  // B layout, 128 x 64
+  using CLayout = GMMA::CLayout_64x128;  // C layout, 64 x 128
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8S8_RS_TN_SATURATE<spsel>>
+{  // Traits for the SATURATE-named sparse GMMA atom 64x128x64; NOTE(review): no member here encodes saturation -- presumably the atom does it.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_128,_64>;  // M = 64, N = 128, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // dedicated 64 x 64 A layout; no FrgTypeA here (presumably A is register-sourced -- confirm)
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout<128, 64>;  // B layout, 128 x 64
+  using CLayout = GMMA::CLayout_64x128;  // C layout, 64 x 128
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8S8_RS_TN<spsel>>
+{  // Traits (value types, fragment type, shape, layouts) for the sparse GMMA atom 64x192x64 named above.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_192,_64>;  // M = 64, N = 192, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // dedicated 64 x 64 A layout; no FrgTypeA here (presumably A is register-sourced -- confirm)
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout<192, 64>;  // B layout, 192 x 64
+  using CLayout = GMMA::CLayout_64x192;  // C layout, 64 x 192
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8S8_RS_TN_SATURATE<spsel>>
+{  // Traits for the SATURATE-named sparse GMMA atom 64x192x64; NOTE(review): no member here encodes saturation -- presumably the atom does it.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_192,_64>;  // M = 64, N = 192, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // dedicated 64 x 64 A layout; no FrgTypeA here (presumably A is register-sourced -- confirm)
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout<192, 64>;  // B layout, 192 x 64
+  using CLayout = GMMA::CLayout_64x192;  // C layout, 64 x 192
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8S8_RS_TN<spsel>>
+{  // Traits (value types, fragment type, shape, layouts) for the sparse GMMA atom 64x256x64 named above.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_256,_64>;  // M = 64, N = 256, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // dedicated 64 x 64 A layout; no FrgTypeA here (presumably A is register-sourced -- confirm)
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout<256, 64>;  // B layout, 256 x 64
+  using CLayout = GMMA::CLayout_64x256;  // C layout, 64 x 256
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8S8_RS_TN_SATURATE<spsel>>
+{  // Traits for the SATURATE-named sparse GMMA atom 64x256x64; NOTE(review): no member here encodes saturation -- presumably the atom does it.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_256,_64>;  // M = 64, N = 256, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // dedicated 64 x 64 A layout; no FrgTypeA here (presumably A is register-sourced -- confirm)
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout<256, 64>;  // B layout, 256 x 64
+  using CLayout = GMMA::CLayout_64x256;  // C layout, 64 x 256
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8U8_SS_TN<spsel>>
+{  // Traits (value types, fragment types, shape, layouts) for the sparse GMMA atom 64x8x64 named above.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: GMMA::smem_desc, K-major
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_8,_64>;  // M = 64, N = 8, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout, 64 x 64
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout<  8, 64>;  // B layout, 8 x 64
+  using CLayout = GMMA::CLayout_64x8;  // C layout, 64 x 8
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8U8_SS_TN_SATURATE<spsel>>
+{  // Traits for the SATURATE-named sparse GMMA atom 64x8x64; NOTE(review): no member here encodes saturation -- presumably the atom does it.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: GMMA::smem_desc, K-major
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_8,_64>;  // M = 64, N = 8, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout, 64 x 64
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout<  8, 64>;  // B layout, 8 x 64
+  using CLayout = GMMA::CLayout_64x8;  // C layout, 64 x 8
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8U8_SS_TN<spsel>>
+{  // Traits (value types, fragment types, shape, layouts) for the sparse GMMA atom 64x16x64 named above.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: GMMA::smem_desc, K-major
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_16,_64>;  // M = 64, N = 16, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout, 64 x 64
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout< 16, 64>;  // B layout, 16 x 64
+  using CLayout = GMMA::CLayout_64x16;  // C layout, 64 x 16
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8U8_SS_TN_SATURATE<spsel>>
+{  // Traits for the SATURATE-named sparse GMMA atom 64x16x64; NOTE(review): no member here encodes saturation -- presumably the atom does it.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: GMMA::smem_desc, K-major
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_16,_64>;  // M = 64, N = 16, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout, 64 x 64
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout< 16, 64>;  // B layout, 16 x 64
+  using CLayout = GMMA::CLayout_64x16;  // C layout, 64 x 16
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8U8_SS_TN<spsel>>
+{  // Traits (value types, fragment types, shape, layouts) for the sparse GMMA atom 64x32x64 named above.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: GMMA::smem_desc, K-major
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_32,_64>;  // M = 64, N = 32, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout, 64 x 64
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout< 32, 64>;  // B layout, 32 x 64
+  using CLayout = GMMA::CLayout_64x32;  // C layout, 64 x 32
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8U8_SS_TN_SATURATE<spsel>>
+{  // Traits for the SATURATE-named sparse GMMA atom 64x32x64; NOTE(review): no member here encodes saturation -- presumably the atom does it.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: GMMA::smem_desc, K-major
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_32,_64>;  // M = 64, N = 32, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout, 64 x 64
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout< 32, 64>;  // B layout, 32 x 64
+  using CLayout = GMMA::CLayout_64x32;  // C layout, 64 x 32
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8U8_SS_TN<spsel>>
+{  // Traits (value types, fragment types, shape, layouts) for the sparse GMMA atom 64x64x64 named above.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: GMMA::smem_desc, K-major
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_64,_64>;  // M = 64, N = 64, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout, 64 x 64
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout< 64, 64>;  // B layout, 64 x 64
+  using CLayout = GMMA::CLayout_64x64;  // C layout, 64 x 64
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8U8_SS_TN_SATURATE<spsel>>
+{  // Traits for the SATURATE-named sparse GMMA atom 64x64x64; NOTE(review): no member here encodes saturation -- presumably the atom does it.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: GMMA::smem_desc, K-major
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_64,_64>;  // M = 64, N = 64, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout, 64 x 64
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout< 64, 64>;  // B layout, 64 x 64
+  using CLayout = GMMA::CLayout_64x64;  // C layout, 64 x 64
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8U8_SS_TN<spsel>>
+{  // Traits (value types, fragment types, shape, layouts) for the sparse GMMA atom 64x96x64 named above.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: GMMA::smem_desc, K-major
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_96,_64>;  // M = 64, N = 96, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout, 64 x 64
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout< 96, 64>;  // B layout, 96 x 64
+  using CLayout = GMMA::CLayout_64x96;  // C layout, 64 x 96
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8U8_SS_TN_SATURATE<spsel>>
+{  // Traits for the SATURATE-named sparse GMMA atom 64x96x64; NOTE(review): no member here encodes saturation -- presumably the atom does it.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: GMMA::smem_desc, K-major
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_96,_64>;  // M = 64, N = 96, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout, 64 x 64
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout< 96, 64>;  // B layout, 96 x 64
+  using CLayout = GMMA::CLayout_64x96;  // C layout, 64 x 96
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8U8_SS_TN<spsel>>
+{  // Traits (value types, fragment types, shape, layouts) for the sparse GMMA atom 64x128x64 named above.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: GMMA::smem_desc, K-major
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_128,_64>;  // M = 64, N = 128, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout, 64 x 64
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout<128, 64>;  // B layout, 128 x 64
+  using CLayout = GMMA::CLayout_64x128;  // C layout, 64 x 128
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8U8_SS_TN_SATURATE<spsel>>
+{  // Traits for the SATURATE-named sparse GMMA atom 64x128x64; NOTE(review): no member here encodes saturation -- presumably the atom does it.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: GMMA::smem_desc, K-major
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_128,_64>;  // M = 64, N = 128, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout, 64 x 64
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout<128, 64>;  // B layout, 128 x 64
+  using CLayout = GMMA::CLayout_64x128;  // C layout, 64 x 128
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8U8_SS_TN<spsel>>
+{  // Traits (value types, fragment types, shape, layouts) for the sparse GMMA atom 64x192x64 named above.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: GMMA::smem_desc, K-major
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_192,_64>;  // M = 64, N = 192, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout, 64 x 64
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout<192, 64>;  // B layout, 192 x 64
+  using CLayout = GMMA::CLayout_64x192;  // C layout, 64 x 192
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8U8_SS_TN_SATURATE<spsel>>
+{  // Traits for the SATURATE-named sparse GMMA atom 64x192x64; NOTE(review): no member here encodes saturation -- presumably the atom does it.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: GMMA::smem_desc, K-major
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_192,_64>;  // M = 64, N = 192, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout, 64 x 64
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout<192, 64>;  // B layout, 192 x 64
+  using CLayout = GMMA::CLayout_64x192;  // C layout, 64 x 192
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8U8_SS_TN<spsel>>
+{  // Traits (value types, fragment types, shape, layouts) for the sparse GMMA atom 64x256x64 named above.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: GMMA::smem_desc, K-major
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_256,_64>;  // M = 64, N = 256, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout, 64 x 64
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout<256, 64>;  // B layout, 256 x 64
+  using CLayout = GMMA::CLayout_64x256;  // C layout, 64 x 256
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8U8_SS_TN_SATURATE<spsel>>
+{  // Traits for the SATURATE-named sparse GMMA atom 64x256x64; NOTE(review): no member here encodes saturation -- presumably the atom does it.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: GMMA::smem_desc, K-major
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_256,_64>;  // M = 64, N = 256, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout, 64 x 64
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout<256, 64>;  // B layout, 256 x 64
+  using CLayout = GMMA::CLayout_64x256;  // C layout, 64 x 256
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8U8_RS_TN<spsel>>
+{  // Traits (value types, fragment type, shape, layouts) for the sparse GMMA atom 64x8x64 named above.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_8,_64>;  // M = 64, N = 8, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // dedicated 64 x 64 A layout; no FrgTypeA here (presumably A is register-sourced -- confirm)
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout<  8, 64>;  // B layout, 8 x 64
+  using CLayout = GMMA::CLayout_64x8;  // C layout, 64 x 8
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8U8_RS_TN_SATURATE<spsel>>
+{  // Traits for the SATURATE-named sparse GMMA atom 64x8x64; NOTE(review): no member here encodes saturation -- presumably the atom does it.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_8,_64>;  // M = 64, N = 8, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // dedicated 64 x 64 A layout; no FrgTypeA here (presumably A is register-sourced -- confirm)
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout<  8, 64>;  // B layout, 8 x 64
+  using CLayout = GMMA::CLayout_64x8;  // C layout, 64 x 8
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8U8_RS_TN<spsel>>
+{  // Traits (value types, fragment type, shape, layouts) for the sparse GMMA atom 64x16x64 named above.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_16,_64>;  // M = 64, N = 16, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // dedicated 64 x 64 A layout; no FrgTypeA here (presumably A is register-sourced -- confirm)
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout< 16, 64>;  // B layout, 16 x 64
+  using CLayout = GMMA::CLayout_64x16;  // C layout, 64 x 16
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8U8_RS_TN_SATURATE<spsel>>
+{  // Traits for the SATURATE-named sparse GMMA atom 64x16x64; NOTE(review): no member here encodes saturation -- presumably the atom does it.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_16,_64>;  // M = 64, N = 16, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // dedicated 64 x 64 A layout; no FrgTypeA here (presumably A is register-sourced -- confirm)
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout< 16, 64>;  // B layout, 16 x 64
+  using CLayout = GMMA::CLayout_64x16;  // C layout, 64 x 16
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8U8_RS_TN<spsel>>
+{  // Traits (value types, fragment type, shape, layouts) for the sparse GMMA atom 64x32x64 named above.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_32,_64>;  // M = 64, N = 32, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // dedicated 64 x 64 A layout; no FrgTypeA here (presumably A is register-sourced -- confirm)
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout< 32, 64>;  // B layout, 32 x 64
+  using CLayout = GMMA::CLayout_64x32;  // C layout, 64 x 32
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8U8_RS_TN_SATURATE<spsel>>
+{  // Traits for the SATURATE-named sparse GMMA atom 64x32x64; NOTE(review): no member here encodes saturation -- presumably the atom does it.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_32,_64>;  // M = 64, N = 32, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // dedicated 64 x 64 A layout; no FrgTypeA here (presumably A is register-sourced -- confirm)
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout< 32, 64>;  // B layout, 32 x 64
+  using CLayout = GMMA::CLayout_64x32;  // C layout, 64 x 32
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8U8_RS_TN<spsel>>
+{  // Traits (value types, fragment type, shape, layouts) for the sparse GMMA atom 64x64x64 named above.
+  using ValTypeD = int32_t;  // element type of D
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A elements: uint8_t wrapped in sparse_elem<2> (presumably the sparsity factor -- confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E elements: sparse_elem<8, uint8_t> (presumably sparsity metadata -- confirm)
+  using ValTypeB = uint8_t;  // element type of B
+  using ValTypeC = int32_t;  // element type of C
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: GMMA::smem_desc, K-major
+
+  using Shape_MNK = Shape<_64,_64,_64>;  // M = 64, N = 64, K = 64
+  using ThrID   = Layout<_128>;  // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // dedicated 64 x 64 A layout; no FrgTypeA here (presumably A is register-sourced -- confirm)
+  using ELayout = GMMA::ELayout_64x64;  // E layout, 64 x 64
+  using BLayout = GMMA::ABLayout< 64, 64>;  // B layout, 64 x 64
+  using CLayout = GMMA::CLayout_64x64;  // C layout, 64 x 64
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel only parameterizes the op type; no member below uses it.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8U8_RS_TN_SATURATE<spsel>>
+{  // Traits for the 64x64x64 op: uint8_t A/B values, int32_t C/D values. Members equal those of the non-SATURATE variant.
+  using ValTypeD = int32_t;  // D value type.
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = uint8_t;  // B value type.
+  using ValTypeC = int32_t;  // C value type; same as D.
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc; this RS variant declares no FrgTypeA.
+
+  using Shape_MNK = Shape<_64,_64,_64>;  // M=64, N=64, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ALayout_64x64;  // RS variants use ALayout_64x64 for A.
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout< 64, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x64;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel only parameterizes the op type; no member below uses it.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8U8_RS_TN<spsel>>
+{  // Traits for the 64x96x64 op: uint8_t A/B values, int32_t C/D values. Members equal those of the _SATURATE variant.
+  using ValTypeD = int32_t;  // D value type.
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = uint8_t;  // B value type.
+  using ValTypeC = int32_t;  // C value type; same as D.
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc; this RS variant declares no FrgTypeA.
+
+  using Shape_MNK = Shape<_64,_96,_64>;  // M=64, N=96, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ALayout_64x64;  // RS variants use ALayout_64x64 for A.
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout< 96, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x96;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel only parameterizes the op type; no member below uses it.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8U8_RS_TN_SATURATE<spsel>>
+{  // Traits for the 64x96x64 op: uint8_t A/B values, int32_t C/D values. Members equal those of the non-SATURATE variant.
+  using ValTypeD = int32_t;  // D value type.
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = uint8_t;  // B value type.
+  using ValTypeC = int32_t;  // C value type; same as D.
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc; this RS variant declares no FrgTypeA.
+
+  using Shape_MNK = Shape<_64,_96,_64>;  // M=64, N=96, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ALayout_64x64;  // RS variants use ALayout_64x64 for A.
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout< 96, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x96;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel only parameterizes the op type; no member below uses it.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8U8_RS_TN<spsel>>
+{  // Traits for the 64x128x64 op: uint8_t A/B values, int32_t C/D values. Members equal those of the _SATURATE variant.
+  using ValTypeD = int32_t;  // D value type.
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = uint8_t;  // B value type.
+  using ValTypeC = int32_t;  // C value type; same as D.
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc; this RS variant declares no FrgTypeA.
+
+  using Shape_MNK = Shape<_64,_128,_64>;  // M=64, N=128, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ALayout_64x64;  // RS variants use ALayout_64x64 for A.
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout<128, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x128;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel only parameterizes the op type; no member below uses it.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8U8_RS_TN_SATURATE<spsel>>
+{  // Traits for the 64x128x64 op: uint8_t A/B values, int32_t C/D values. Members equal those of the non-SATURATE variant.
+  using ValTypeD = int32_t;  // D value type.
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = uint8_t;  // B value type.
+  using ValTypeC = int32_t;  // C value type; same as D.
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc; this RS variant declares no FrgTypeA.
+
+  using Shape_MNK = Shape<_64,_128,_64>;  // M=64, N=128, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ALayout_64x64;  // RS variants use ALayout_64x64 for A.
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout<128, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x128;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel only parameterizes the op type; no member below uses it.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8U8_RS_TN<spsel>>
+{  // Traits for the 64x192x64 op: uint8_t A/B values, int32_t C/D values. Members equal those of the _SATURATE variant.
+  using ValTypeD = int32_t;  // D value type.
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = uint8_t;  // B value type.
+  using ValTypeC = int32_t;  // C value type; same as D.
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc; this RS variant declares no FrgTypeA.
+
+  using Shape_MNK = Shape<_64,_192,_64>;  // M=64, N=192, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ALayout_64x64;  // RS variants use ALayout_64x64 for A.
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout<192, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x192;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel only parameterizes the op type; no member below uses it.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8U8_RS_TN_SATURATE<spsel>>
+{  // Traits for the 64x192x64 op: uint8_t A/B values, int32_t C/D values. Members equal those of the non-SATURATE variant.
+  using ValTypeD = int32_t;  // D value type.
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = uint8_t;  // B value type.
+  using ValTypeC = int32_t;  // C value type; same as D.
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc; this RS variant declares no FrgTypeA.
+
+  using Shape_MNK = Shape<_64,_192,_64>;  // M=64, N=192, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ALayout_64x64;  // RS variants use ALayout_64x64 for A.
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout<192, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x192;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel only parameterizes the op type; no member below uses it.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8U8_RS_TN<spsel>>
+{  // Traits for the 64x256x64 op: uint8_t A/B values, int32_t C/D values. Members equal those of the _SATURATE variant.
+  using ValTypeD = int32_t;  // D value type.
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = uint8_t;  // B value type.
+  using ValTypeC = int32_t;  // C value type; same as D.
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc; this RS variant declares no FrgTypeA.
+
+  using Shape_MNK = Shape<_64,_256,_64>;  // M=64, N=256, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ALayout_64x64;  // RS variants use ALayout_64x64 for A.
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout<256, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x256;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel only parameterizes the op type; no member below uses it.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8U8_RS_TN_SATURATE<spsel>>
+{  // Traits for the 64x256x64 op: uint8_t A/B values, int32_t C/D values. Members equal those of the non-SATURATE variant.
+  using ValTypeD = int32_t;  // D value type.
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = uint8_t;  // B value type.
+  using ValTypeC = int32_t;  // C value type; same as D.
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc; this RS variant declares no FrgTypeA.
+
+  using Shape_MNK = Shape<_64,_256,_64>;  // M=64, N=256, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ALayout_64x64;  // RS variants use ALayout_64x64 for A.
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout<256, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x256;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // All three only parameterize the op type; no member below uses them.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the 64x8x64 op: float_e4m3_t A/B values, half_t C/D values.
+  using ValTypeD = half_t;  // D value type.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = float_e4m3_t;  // B value type.
+  using ValTypeC = half_t;  // C value type; same as D.
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment is a K-major smem_desc (the RS variants omit FrgTypeA).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc.
+
+  using Shape_MNK = Shape<_64,_8,_64>;  // M=64, N=8, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ABLayout< 64, 64>;  // SS variants use ABLayout for A; arguments match M and K of Shape_MNK.
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout<  8, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x8;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // All three only parameterize the op type; no member below uses them.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the 64x8x64 op: float_e4m3_t A/B values, half_t C/D values.
+  using ValTypeD = half_t;  // D value type.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = float_e4m3_t;  // B value type.
+  using ValTypeC = half_t;  // C value type; same as D.
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc; this RS variant declares no FrgTypeA.
+
+  using Shape_MNK = Shape<_64,_8,_64>;  // M=64, N=8, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ALayout_64x64;  // RS variants use ALayout_64x64 for A (the SS variants use ABLayout< 64, 64>).
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout<  8, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x8;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // All three only parameterize the op type; no member below uses them.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the 64x8x64 op: float_e4m3_t A/B values, float C/D values.
+  using ValTypeD = float;  // D value type.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = float_e4m3_t;  // B value type.
+  using ValTypeC = float;  // C value type; same as D.
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment is a K-major smem_desc (the RS variants omit FrgTypeA).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc.
+
+  using Shape_MNK = Shape<_64,_8,_64>;  // M=64, N=8, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ABLayout< 64, 64>;  // SS variants use ABLayout for A; arguments match M and K of Shape_MNK.
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout<  8, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x8;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // All three only parameterize the op type; no member below uses them.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the 64x8x64 op: float_e4m3_t A/B values, float C/D values.
+  using ValTypeD = float;  // D value type.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = float_e4m3_t;  // B value type.
+  using ValTypeC = float;  // C value type; same as D.
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc; this RS variant declares no FrgTypeA.
+
+  using Shape_MNK = Shape<_64,_8,_64>;  // M=64, N=8, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ALayout_64x64;  // RS variants use ALayout_64x64 for A (the SS variants use ABLayout< 64, 64>).
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout<  8, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x8;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // All three only parameterize the op type; no member below uses them.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the 64x16x64 op: float_e4m3_t A/B values, half_t C/D values.
+  using ValTypeD = half_t;  // D value type.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = float_e4m3_t;  // B value type.
+  using ValTypeC = half_t;  // C value type; same as D.
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment is a K-major smem_desc (the RS variants omit FrgTypeA).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc.
+
+  using Shape_MNK = Shape<_64,_16,_64>;  // M=64, N=16, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ABLayout< 64, 64>;  // SS variants use ABLayout for A; arguments match M and K of Shape_MNK.
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout< 16, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x16;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // All three only parameterize the op type; no member below uses them.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the 64x16x64 op: float_e4m3_t A/B values, half_t C/D values.
+  using ValTypeD = half_t;  // D value type.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = float_e4m3_t;  // B value type.
+  using ValTypeC = half_t;  // C value type; same as D.
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc; this RS variant declares no FrgTypeA.
+
+  using Shape_MNK = Shape<_64,_16,_64>;  // M=64, N=16, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ALayout_64x64;  // RS variants use ALayout_64x64 for A (the SS variants use ABLayout< 64, 64>).
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout< 16, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x16;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // All three only parameterize the op type; no member below uses them.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the 64x16x64 op: float_e4m3_t A/B values, float C/D values.
+  using ValTypeD = float;  // D value type.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = float_e4m3_t;  // B value type.
+  using ValTypeC = float;  // C value type; same as D.
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment is a K-major smem_desc (the RS variants omit FrgTypeA).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc.
+
+  using Shape_MNK = Shape<_64,_16,_64>;  // M=64, N=16, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ABLayout< 64, 64>;  // SS variants use ABLayout for A; arguments match M and K of Shape_MNK.
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout< 16, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x16;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // All three only parameterize the op type; no member below uses them.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the 64x16x64 op: float_e4m3_t A/B values, float C/D values.
+  using ValTypeD = float;  // D value type.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = float_e4m3_t;  // B value type.
+  using ValTypeC = float;  // C value type; same as D.
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc; this RS variant declares no FrgTypeA.
+
+  using Shape_MNK = Shape<_64,_16,_64>;  // M=64, N=16, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ALayout_64x64;  // RS variants use ALayout_64x64 for A (the SS variants use ABLayout< 64, 64>).
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout< 16, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x16;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // All three only parameterize the op type; no member below uses them.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the 64x32x64 op: float_e4m3_t A/B values, half_t C/D values.
+  using ValTypeD = half_t;  // D value type.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = float_e4m3_t;  // B value type.
+  using ValTypeC = half_t;  // C value type; same as D.
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment is a K-major smem_desc (the RS variants omit FrgTypeA).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc.
+
+  using Shape_MNK = Shape<_64,_32,_64>;  // M=64, N=32, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ABLayout< 64, 64>;  // SS variants use ABLayout for A; arguments match M and K of Shape_MNK.
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout< 32, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x32;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // All three only parameterize the op type; no member below uses them.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the 64x32x64 op: float_e4m3_t A/B values, half_t C/D values.
+  using ValTypeD = half_t;  // D value type.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = float_e4m3_t;  // B value type.
+  using ValTypeC = half_t;  // C value type; same as D.
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc; this RS variant declares no FrgTypeA.
+
+  using Shape_MNK = Shape<_64,_32,_64>;  // M=64, N=32, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ALayout_64x64;  // RS variants use ALayout_64x64 for A (the SS variants use ABLayout< 64, 64>).
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout< 32, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x32;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // All three only parameterize the op type; no member below uses them.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the 64x32x64 op: float_e4m3_t A/B values, float C/D values.
+  using ValTypeD = float;  // D value type.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = float_e4m3_t;  // B value type.
+  using ValTypeC = float;  // C value type; same as D.
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment is a K-major smem_desc (the RS variants omit FrgTypeA).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc.
+
+  using Shape_MNK = Shape<_64,_32,_64>;  // M=64, N=32, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ABLayout< 64, 64>;  // SS variants use ABLayout for A; arguments match M and K of Shape_MNK.
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout< 32, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x32;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // All three only parameterize the op type; no member below uses them.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the 64x32x64 op: float_e4m3_t A/B values, float C/D values.
+  using ValTypeD = float;  // D value type.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = float_e4m3_t;  // B value type.
+  using ValTypeC = float;  // C value type; same as D.
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc; this RS variant declares no FrgTypeA.
+
+  using Shape_MNK = Shape<_64,_32,_64>;  // M=64, N=32, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ALayout_64x64;  // RS variants use ALayout_64x64 for A (the SS variants use ABLayout< 64, 64>).
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout< 32, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x32;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // All three only parameterize the op type; no member below uses them.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the 64x64x64 op: float_e4m3_t A/B values, half_t C/D values.
+  using ValTypeD = half_t;  // D value type.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = float_e4m3_t;  // B value type.
+  using ValTypeC = half_t;  // C value type; same as D.
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment is a K-major smem_desc (the RS variants omit FrgTypeA).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc.
+
+  using Shape_MNK = Shape<_64,_64,_64>;  // M=64, N=64, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ABLayout< 64, 64>;  // SS variants use ABLayout for A; arguments match M and K of Shape_MNK.
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout< 64, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x64;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // All three only parameterize the op type; no member below uses them.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the 64x64x64 op: float_e4m3_t A/B values, half_t C/D values.
+  using ValTypeD = half_t;  // D value type.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = float_e4m3_t;  // B value type.
+  using ValTypeC = half_t;  // C value type; same as D.
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc; this RS variant declares no FrgTypeA.
+
+  using Shape_MNK = Shape<_64,_64,_64>;  // M=64, N=64, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ALayout_64x64;  // RS variants use ALayout_64x64 for A (the SS variants use ABLayout< 64, 64>).
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout< 64, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x64;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // All three only parameterize the op type; no member below uses them.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the 64x64x64 op: float_e4m3_t A/B values, float C/D values.
+  using ValTypeD = float;  // D value type.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = float_e4m3_t;  // B value type.
+  using ValTypeC = float;  // C value type; same as D.
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment is a K-major smem_desc (the RS variants omit FrgTypeA).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc.
+
+  using Shape_MNK = Shape<_64,_64,_64>;  // M=64, N=64, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ABLayout< 64, 64>;  // SS variants use ABLayout for A; arguments match M and K of Shape_MNK.
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout< 64, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x64;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // All three only parameterize the op type; no member below uses them.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the 64x64x64 op: float_e4m3_t A/B values, float C/D values.
+  using ValTypeD = float;  // D value type.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = float_e4m3_t;  // B value type.
+  using ValTypeC = float;  // C value type; same as D.
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc; this RS variant declares no FrgTypeA.
+
+  using Shape_MNK = Shape<_64,_64,_64>;  // M=64, N=64, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ALayout_64x64;  // RS variants use ALayout_64x64 for A (the SS variants use ABLayout< 64, 64>).
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout< 64, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x64;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // All three only parameterize the op type; no member below uses them.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the 64x96x64 op: float_e4m3_t A/B values, half_t C/D values.
+  using ValTypeD = half_t;  // D value type.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = float_e4m3_t;  // B value type.
+  using ValTypeC = half_t;  // C value type; same as D.
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment is a K-major smem_desc (the RS variants omit FrgTypeA).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc.
+
+  using Shape_MNK = Shape<_64,_96,_64>;  // M=64, N=96, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ABLayout< 64, 64>;  // SS variants use ABLayout for A; arguments match M and K of Shape_MNK.
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout< 96, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x96;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // All three only parameterize the op type; no member below uses them.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the 64x96x64 op: float_e4m3_t A/B values, half_t C/D values.
+  using ValTypeD = half_t;  // D value type.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = float_e4m3_t;  // B value type.
+  using ValTypeC = half_t;  // C value type; same as D.
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc; this RS variant declares no FrgTypeA.
+
+  using Shape_MNK = Shape<_64,_96,_64>;  // M=64, N=96, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ALayout_64x64;  // RS variants use ALayout_64x64 for A (the SS variants use ABLayout< 64, 64>).
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout< 96, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x96;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // All three only parameterize the op type; no member below uses them.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the 64x96x64 op: float_e4m3_t A/B values, float C/D values.
+  using ValTypeD = float;  // D value type.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = float_e4m3_t;  // B value type.
+  using ValTypeC = float;  // C value type; same as D.
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment is a K-major smem_desc (the RS variants omit FrgTypeA).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc.
+
+  using Shape_MNK = Shape<_64,_96,_64>;  // M=64, N=96, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ABLayout< 64, 64>;  // SS variants use ABLayout for A; arguments match M and K of Shape_MNK.
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout< 96, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x96;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // All three only parameterize the op type; no member below uses them.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the 64x96x64 op: float_e4m3_t A/B values, float C/D values.
+  using ValTypeD = float;  // D value type.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = float_e4m3_t;  // B value type.
+  using ValTypeC = float;  // C value type; same as D.
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc; this RS variant declares no FrgTypeA.
+
+  using Shape_MNK = Shape<_64,_96,_64>;  // M=64, N=96, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ALayout_64x64;  // RS variants use ALayout_64x64 for A (the SS variants use ABLayout< 64, 64>).
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout< 96, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x96;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // All three only parameterize the op type; no member below uses them.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the 64x128x64 op: float_e4m3_t A/B values, half_t C/D values.
+  using ValTypeD = half_t;  // D value type.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = float_e4m3_t;  // B value type.
+  using ValTypeC = half_t;  // C value type; same as D.
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment is a K-major smem_desc (the RS variants omit FrgTypeA).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc.
+
+  using Shape_MNK = Shape<_64,_128,_64>;  // M=64, N=128, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ABLayout< 64, 64>;  // SS variants use ABLayout for A; arguments match M and K of Shape_MNK.
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout<128, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x128;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // All three only parameterize the op type; no member below uses them.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the 64x128x64 op: float_e4m3_t A/B values, half_t C/D values.
+  using ValTypeD = half_t;  // D value type.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = float_e4m3_t;  // B value type.
+  using ValTypeC = half_t;  // C value type; same as D.
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc; this RS variant declares no FrgTypeA.
+
+  using Shape_MNK = Shape<_64,_128,_64>;  // M=64, N=128, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ALayout_64x64;  // RS variants use ALayout_64x64 for A (the SS variants use ABLayout< 64, 64>).
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout<128, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x128;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // All three only parameterize the op type; no member below uses them.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the 64x128x64 op: float_e4m3_t A/B values, float C/D values.
+  using ValTypeD = float;  // D value type.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = float_e4m3_t;  // B value type.
+  using ValTypeC = float;  // C value type; same as D.
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment is a K-major smem_desc (the RS variants omit FrgTypeA).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc.
+
+  using Shape_MNK = Shape<_64,_128,_64>;  // M=64, N=128, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ABLayout< 64, 64>;  // SS variants use ABLayout for A; arguments match M and K of Shape_MNK.
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout<128, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x128;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // All three only parameterize the op type; no member below uses them.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the 64x128x64 op: float_e4m3_t A/B values, float C/D values.
+  using ValTypeD = float;  // D value type.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = float_e4m3_t;  // B value type.
+  using ValTypeC = float;  // C value type; same as D.
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc; this RS variant declares no FrgTypeA.
+
+  using Shape_MNK = Shape<_64,_128,_64>;  // M=64, N=128, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ALayout_64x64;  // RS variants use ALayout_64x64 for A (the SS variants use ABLayout< 64, 64>).
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout<128, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x128;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // All three only parameterize the op type; no member below uses them.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the 64x192x64 op: float_e4m3_t A/B values, half_t C/D values.
+  using ValTypeD = half_t;  // D value type.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = float_e4m3_t;  // B value type.
+  using ValTypeC = half_t;  // C value type; same as D.
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment is a K-major smem_desc (the RS variants omit FrgTypeA).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc.
+
+  using Shape_MNK = Shape<_64,_192,_64>;  // M=64, N=192, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ABLayout< 64, 64>;  // SS variants use ABLayout for A; arguments match M and K of Shape_MNK.
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout<192, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x192;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // All three only parameterize the op type; no member below uses them.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the 64x192x64 op: float_e4m3_t A/B values, half_t C/D values.
+  using ValTypeD = half_t;  // D value type.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = float_e4m3_t;  // B value type.
+  using ValTypeC = half_t;  // C value type; same as D.
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc; this RS variant declares no FrgTypeA.
+
+  using Shape_MNK = Shape<_64,_192,_64>;  // M=64, N=192, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ALayout_64x64;  // RS variants use ALayout_64x64 for A (the SS variants use ABLayout< 64, 64>).
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout<192, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x192;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // All three only parameterize the op type; no member below uses them.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the 64x192x64 op: float_e4m3_t A/B values, float C/D values.
+  using ValTypeD = float;  // D value type.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = float_e4m3_t;  // B value type.
+  using ValTypeC = float;  // C value type; same as D.
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment is a K-major smem_desc (the RS variants omit FrgTypeA).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc.
+
+  using Shape_MNK = Shape<_64,_192,_64>;  // M=64, N=192, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ABLayout< 64, 64>;  // SS variants use ABLayout for A; arguments match M and K of Shape_MNK.
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout<192, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x192;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // All three only parameterize the op type; no member below uses them.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the 64x192x64 op: float_e4m3_t A/B values, float C/D values.
+  using ValTypeD = float;  // D value type.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = float_e4m3_t;  // B value type.
+  using ValTypeC = float;  // C value type; same as D.
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc; this RS variant declares no FrgTypeA.
+
+  using Shape_MNK = Shape<_64,_192,_64>;  // M=64, N=192, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ALayout_64x64;  // RS variants use ALayout_64x64 for A (the SS variants use ABLayout< 64, 64>).
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout<192, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x192;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // All three only parameterize the op type; no member below uses them.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the 64x256x64 op: float_e4m3_t A/B values, half_t C/D values.
+  using ValTypeD = half_t;  // D value type.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = float_e4m3_t;  // B value type.
+  using ValTypeC = half_t;  // C value type; same as D.
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment is a K-major smem_desc (the RS variants omit FrgTypeA).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc.
+
+  using Shape_MNK = Shape<_64,_256,_64>;  // M=64, N=256, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ABLayout< 64, 64>;  // SS variants use ABLayout for A; arguments match M and K of Shape_MNK.
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout<256, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x256;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // All three only parameterize the op type; no member below uses them.
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the 64x256x64 op: float_e4m3_t A/B values, half_t C/D values.
+  using ValTypeD = half_t;  // D value type.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A value type, wrapped in sparse_elem<2, ...>.
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E value type (presumably sparsity metadata; confirm).
+  using ValTypeB = float_e4m3_t;  // B value type.
+  using ValTypeC = half_t;  // C value type; same as D.
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment is a K-major smem_desc; this RS variant declares no FrgTypeA.
+
+  using Shape_MNK = Shape<_64,_256,_64>;  // M=64, N=256, K=64.
+  using ThrID   = Layout<_128>;  // 128 thread ids.
+  using ALayout = GMMA::ALayout_64x64;  // RS variants use ALayout_64x64 for A (the SS variants use ABLayout< 64, 64>).
+  using ELayout = GMMA::ELayout_64x64;  // E layout.
+  using BLayout = GMMA::ABLayout<256, 64>;  // Arguments match N and K of Shape_MNK.
+  using CLayout = GMMA::CLayout_64x256;  // Suffix matches M x N of Shape_MNK.
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // Sole data member; defaults to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB, GMMA::SparseSel kSpSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_F32E4M3E4M3_SS_TN<kScaleA, kScaleB, kSpSel>>
+{
+  // Value types: float C/D, sparse_elem<2, float_e4m3_t> A, float_e4m3_t B, sparse_elem<8, uint8_t> E.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // SS variant: the A and B fragment types are both GMMA::Major::K smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x256x64 Shape_MNK plus the ThrID and A/B/C/E layout aliases.
+  using Shape_MNK = Shape<_64, _256, _64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<256, 64>;
+  using CLayout   = GMMA::CLayout_64x256;
+  using ELayout   = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB, GMMA::SparseSel kSpSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_F32E4M3E4M3_RS_TN<kScaleA, kScaleB, kSpSel>>
+{
+  // Value types: float C/D, sparse_elem<2, float_e4m3_t> A, float_e4m3_t B, sparse_elem<8, uint8_t> E.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // RS variant: only FrgTypeB is declared here, as a GMMA::Major::K smem_desc.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x256x64 Shape_MNK plus the ThrID and A/B/C/E layout aliases.
+  using Shape_MNK = Shape<_64, _256, _64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<256, 64>;
+  using CLayout   = GMMA::CLayout_64x256;
+  using ELayout   = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB, GMMA::SparseSel kSpSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_F16E4M3E5M2_SS_TN<kScaleA, kScaleB, kSpSel>>
+{
+  // Value types: half_t C/D, sparse_elem<2, float_e4m3_t> A, float_e5m2_t B, sparse_elem<8, uint8_t> E.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // SS variant: the A and B fragment types are both GMMA::Major::K smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x8x64 Shape_MNK plus the ThrID and A/B/C/E layout aliases.
+  using Shape_MNK = Shape<_64, _8, _64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<8, 64>;
+  using CLayout   = GMMA::CLayout_64x8;
+  using ELayout   = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB, GMMA::SparseSel kSpSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_F16E4M3E5M2_RS_TN<kScaleA, kScaleB, kSpSel>>
+{
+  // Value types: half_t C/D, sparse_elem<2, float_e4m3_t> A, float_e5m2_t B, sparse_elem<8, uint8_t> E.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // RS variant: only FrgTypeB is declared here, as a GMMA::Major::K smem_desc.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x8x64 Shape_MNK plus the ThrID and A/B/C/E layout aliases.
+  using Shape_MNK = Shape<_64, _8, _64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<8, 64>;
+  using CLayout   = GMMA::CLayout_64x8;
+  using ELayout   = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB, GMMA::SparseSel kSpSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_F32E4M3E5M2_SS_TN<kScaleA, kScaleB, kSpSel>>
+{
+  // Value types: float C/D, sparse_elem<2, float_e4m3_t> A, float_e5m2_t B, sparse_elem<8, uint8_t> E.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // SS variant: the A and B fragment types are both GMMA::Major::K smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x8x64 Shape_MNK plus the ThrID and A/B/C/E layout aliases.
+  using Shape_MNK = Shape<_64, _8, _64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<8, 64>;
+  using CLayout   = GMMA::CLayout_64x8;
+  using ELayout   = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB, GMMA::SparseSel kSpSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_F32E4M3E5M2_RS_TN<kScaleA, kScaleB, kSpSel>>
+{
+  // Value types: float C/D, sparse_elem<2, float_e4m3_t> A, float_e5m2_t B, sparse_elem<8, uint8_t> E.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // RS variant: only FrgTypeB is declared here, as a GMMA::Major::K smem_desc.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x8x64 Shape_MNK plus the ThrID and A/B/C/E layout aliases.
+  using Shape_MNK = Shape<_64, _8, _64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<8, 64>;
+  using CLayout   = GMMA::CLayout_64x8;
+  using ELayout   = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB, GMMA::SparseSel kSpSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_F16E4M3E5M2_SS_TN<kScaleA, kScaleB, kSpSel>>
+{
+  // Value types: half_t C/D, sparse_elem<2, float_e4m3_t> A, float_e5m2_t B, sparse_elem<8, uint8_t> E.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // SS variant: the A and B fragment types are both GMMA::Major::K smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x16x64 Shape_MNK plus the ThrID and A/B/C/E layout aliases.
+  using Shape_MNK = Shape<_64, _16, _64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<16, 64>;
+  using CLayout   = GMMA::CLayout_64x16;
+  using ELayout   = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB, GMMA::SparseSel kSpSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_F16E4M3E5M2_RS_TN<kScaleA, kScaleB, kSpSel>>
+{
+  // Value types: half_t C/D, sparse_elem<2, float_e4m3_t> A, float_e5m2_t B, sparse_elem<8, uint8_t> E.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // RS variant: only FrgTypeB is declared here, as a GMMA::Major::K smem_desc.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x16x64 Shape_MNK plus the ThrID and A/B/C/E layout aliases.
+  using Shape_MNK = Shape<_64, _16, _64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<16, 64>;
+  using CLayout   = GMMA::CLayout_64x16;
+  using ELayout   = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB, GMMA::SparseSel kSpSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_F32E4M3E5M2_SS_TN<kScaleA, kScaleB, kSpSel>>
+{
+  // Value types: float C/D, sparse_elem<2, float_e4m3_t> A, float_e5m2_t B, sparse_elem<8, uint8_t> E.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // SS variant: the A and B fragment types are both GMMA::Major::K smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x16x64 Shape_MNK plus the ThrID and A/B/C/E layout aliases.
+  using Shape_MNK = Shape<_64, _16, _64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<16, 64>;
+  using CLayout   = GMMA::CLayout_64x16;
+  using ELayout   = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB, GMMA::SparseSel kSpSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_F32E4M3E5M2_RS_TN<kScaleA, kScaleB, kSpSel>>
+{
+  // Value types: float C/D, sparse_elem<2, float_e4m3_t> A, float_e5m2_t B, sparse_elem<8, uint8_t> E.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // RS variant: only FrgTypeB is declared here, as a GMMA::Major::K smem_desc.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x16x64 Shape_MNK plus the ThrID and A/B/C/E layout aliases.
+  using Shape_MNK = Shape<_64, _16, _64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<16, 64>;
+  using CLayout   = GMMA::CLayout_64x16;
+  using ELayout   = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB, GMMA::SparseSel kSpSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_F16E4M3E5M2_SS_TN<kScaleA, kScaleB, kSpSel>>
+{
+  // Value types: half_t C/D, sparse_elem<2, float_e4m3_t> A, float_e5m2_t B, sparse_elem<8, uint8_t> E.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // SS variant: the A and B fragment types are both GMMA::Major::K smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x32x64 Shape_MNK plus the ThrID and A/B/C/E layout aliases.
+  using Shape_MNK = Shape<_64, _32, _64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<32, 64>;
+  using CLayout   = GMMA::CLayout_64x32;
+  using ELayout   = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB, GMMA::SparseSel kSpSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_F16E4M3E5M2_RS_TN<kScaleA, kScaleB, kSpSel>>
+{
+  // Value types: half_t C/D, sparse_elem<2, float_e4m3_t> A, float_e5m2_t B, sparse_elem<8, uint8_t> E.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // RS variant: only FrgTypeB is declared here, as a GMMA::Major::K smem_desc.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x32x64 Shape_MNK plus the ThrID and A/B/C/E layout aliases.
+  using Shape_MNK = Shape<_64, _32, _64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<32, 64>;
+  using CLayout   = GMMA::CLayout_64x32;
+  using ELayout   = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB, GMMA::SparseSel kSpSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_F32E4M3E5M2_SS_TN<kScaleA, kScaleB, kSpSel>>
+{
+  // Value types: float C/D, sparse_elem<2, float_e4m3_t> A, float_e5m2_t B, sparse_elem<8, uint8_t> E.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // SS variant: the A and B fragment types are both GMMA::Major::K smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x32x64 Shape_MNK plus the ThrID and A/B/C/E layout aliases.
+  using Shape_MNK = Shape<_64, _32, _64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<32, 64>;
+  using CLayout   = GMMA::CLayout_64x32;
+  using ELayout   = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB, GMMA::SparseSel kSpSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_F32E4M3E5M2_RS_TN<kScaleA, kScaleB, kSpSel>>
+{
+  // Value types: float C/D, sparse_elem<2, float_e4m3_t> A, float_e5m2_t B, sparse_elem<8, uint8_t> E.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // RS variant: only FrgTypeB is declared here, as a GMMA::Major::K smem_desc.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x32x64 Shape_MNK plus the ThrID and A/B/C/E layout aliases.
+  using Shape_MNK = Shape<_64, _32, _64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<32, 64>;
+  using CLayout   = GMMA::CLayout_64x32;
+  using ELayout   = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB, GMMA::SparseSel kSpSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_F16E4M3E5M2_SS_TN<kScaleA, kScaleB, kSpSel>>
+{
+  // Value types: half_t C/D, sparse_elem<2, float_e4m3_t> A, float_e5m2_t B, sparse_elem<8, uint8_t> E.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // SS variant: the A and B fragment types are both GMMA::Major::K smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x64x64 Shape_MNK plus the ThrID and A/B/C/E layout aliases.
+  using Shape_MNK = Shape<_64, _64, _64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<64, 64>;
+  using CLayout   = GMMA::CLayout_64x64;
+  using ELayout   = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB, GMMA::SparseSel kSpSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_F16E4M3E5M2_RS_TN<kScaleA, kScaleB, kSpSel>>
+{
+  // Value types: half_t C/D, sparse_elem<2, float_e4m3_t> A, float_e5m2_t B, sparse_elem<8, uint8_t> E.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // RS variant: only FrgTypeB is declared here, as a GMMA::Major::K smem_desc.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x64x64 Shape_MNK plus the ThrID and A/B/C/E layout aliases.
+  using Shape_MNK = Shape<_64, _64, _64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<64, 64>;
+  using CLayout   = GMMA::CLayout_64x64;
+  using ELayout   = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB, GMMA::SparseSel kSpSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_F32E4M3E5M2_SS_TN<kScaleA, kScaleB, kSpSel>>
+{
+  // Value types: float C/D, sparse_elem<2, float_e4m3_t> A, float_e5m2_t B, sparse_elem<8, uint8_t> E.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // SS variant: the A and B fragment types are both GMMA::Major::K smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x64x64 Shape_MNK plus the ThrID and A/B/C/E layout aliases.
+  using Shape_MNK = Shape<_64, _64, _64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<64, 64>;
+  using CLayout   = GMMA::CLayout_64x64;
+  using ELayout   = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB, GMMA::SparseSel kSpSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_F32E4M3E5M2_RS_TN<kScaleA, kScaleB, kSpSel>>
+{
+  // Value types: float C/D, sparse_elem<2, float_e4m3_t> A, float_e5m2_t B, sparse_elem<8, uint8_t> E.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // RS variant: only FrgTypeB is declared here, as a GMMA::Major::K smem_desc.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x64x64 Shape_MNK plus the ThrID and A/B/C/E layout aliases.
+  using Shape_MNK = Shape<_64, _64, _64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<64, 64>;
+  using CLayout   = GMMA::CLayout_64x64;
+  using ELayout   = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB, GMMA::SparseSel kSpSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_F16E4M3E5M2_SS_TN<kScaleA, kScaleB, kSpSel>>
+{
+  // Value types: half_t C/D, sparse_elem<2, float_e4m3_t> A, float_e5m2_t B, sparse_elem<8, uint8_t> E.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // SS variant: the A and B fragment types are both GMMA::Major::K smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x96x64 Shape_MNK plus the ThrID and A/B/C/E layout aliases.
+  using Shape_MNK = Shape<_64, _96, _64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<96, 64>;
+  using CLayout   = GMMA::CLayout_64x96;
+  using ELayout   = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB, GMMA::SparseSel kSpSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_F16E4M3E5M2_RS_TN<kScaleA, kScaleB, kSpSel>>
+{
+  // Value types: half_t C/D, sparse_elem<2, float_e4m3_t> A, float_e5m2_t B, sparse_elem<8, uint8_t> E.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // RS variant: only FrgTypeB is declared here, as a GMMA::Major::K smem_desc.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x96x64 Shape_MNK plus the ThrID and A/B/C/E layout aliases.
+  using Shape_MNK = Shape<_64, _96, _64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<96, 64>;
+  using CLayout   = GMMA::CLayout_64x96;
+  using ELayout   = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB, GMMA::SparseSel kSpSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_F32E4M3E5M2_SS_TN<kScaleA, kScaleB, kSpSel>>
+{
+  // Value types: float C/D, sparse_elem<2, float_e4m3_t> A, float_e5m2_t B, sparse_elem<8, uint8_t> E.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // SS variant: the A and B fragment types are both GMMA::Major::K smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x96x64 Shape_MNK plus the ThrID and A/B/C/E layout aliases.
+  using Shape_MNK = Shape<_64, _96, _64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<96, 64>;
+  using CLayout   = GMMA::CLayout_64x96;
+  using ELayout   = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB, GMMA::SparseSel kSpSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_F32E4M3E5M2_RS_TN<kScaleA, kScaleB, kSpSel>>
+{
+  // Value types: float C/D, sparse_elem<2, float_e4m3_t> A, float_e5m2_t B, sparse_elem<8, uint8_t> E.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // RS variant: only FrgTypeB is declared here, as a GMMA::Major::K smem_desc.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x96x64 Shape_MNK plus the ThrID and A/B/C/E layout aliases.
+  using Shape_MNK = Shape<_64, _96, _64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<96, 64>;
+  using CLayout   = GMMA::CLayout_64x96;
+  using ELayout   = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB, GMMA::SparseSel kSpSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_F16E4M3E5M2_SS_TN<kScaleA, kScaleB, kSpSel>>
+{
+  // Value types: half_t C/D, sparse_elem<2, float_e4m3_t> A, float_e5m2_t B, sparse_elem<8, uint8_t> E.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // SS variant: the A and B fragment types are both GMMA::Major::K smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x128x64 Shape_MNK plus the ThrID and A/B/C/E layout aliases.
+  using Shape_MNK = Shape<_64, _128, _64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<128, 64>;
+  using CLayout   = GMMA::CLayout_64x128;
+  using ELayout   = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB, GMMA::SparseSel kSpSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_F16E4M3E5M2_RS_TN<kScaleA, kScaleB, kSpSel>>
+{
+  // Value types: half_t C/D, sparse_elem<2, float_e4m3_t> A, float_e5m2_t B, sparse_elem<8, uint8_t> E.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // RS variant: only FrgTypeB is declared here, as a GMMA::Major::K smem_desc.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x128x64 Shape_MNK plus the ThrID and A/B/C/E layout aliases.
+  using Shape_MNK = Shape<_64, _128, _64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<128, 64>;
+  using CLayout   = GMMA::CLayout_64x128;
+  using ELayout   = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB, GMMA::SparseSel kSpSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_F32E4M3E5M2_SS_TN<kScaleA, kScaleB, kSpSel>>
+{
+  // Value types: float C/D, sparse_elem<2, float_e4m3_t> A, float_e5m2_t B, sparse_elem<8, uint8_t> E.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // SS variant: the A and B fragment types are both GMMA::Major::K smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x128x64 Shape_MNK plus the ThrID and A/B/C/E layout aliases.
+  using Shape_MNK = Shape<_64, _128, _64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<128, 64>;
+  using CLayout   = GMMA::CLayout_64x128;
+  using ELayout   = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB, GMMA::SparseSel kSpSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_F32E4M3E5M2_RS_TN<kScaleA, kScaleB, kSpSel>>
+{
+  // Value types: float C/D, sparse_elem<2, float_e4m3_t> A, float_e5m2_t B, sparse_elem<8, uint8_t> E.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // RS variant: only FrgTypeB is declared here, as a GMMA::Major::K smem_desc.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x128x64 Shape_MNK plus the ThrID and A/B/C/E layout aliases.
+  using Shape_MNK = Shape<_64, _128, _64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<128, 64>;
+  using CLayout   = GMMA::CLayout_64x128;
+  using ELayout   = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB, GMMA::SparseSel kSpSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_F16E4M3E5M2_SS_TN<kScaleA, kScaleB, kSpSel>>
+{
+  // Value types: half_t C/D, sparse_elem<2, float_e4m3_t> A, float_e5m2_t B, sparse_elem<8, uint8_t> E.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // SS variant: the A and B fragment types are both GMMA::Major::K smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x192x64 Shape_MNK plus the ThrID and A/B/C/E layout aliases.
+  using Shape_MNK = Shape<_64, _192, _64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<192, 64>;
+  using CLayout   = GMMA::CLayout_64x192;
+  using ELayout   = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB, GMMA::SparseSel kSpSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_F16E4M3E5M2_RS_TN<kScaleA, kScaleB, kSpSel>>
+{
+  // Value types: half_t C/D, sparse_elem<2, float_e4m3_t> A, float_e5m2_t B, sparse_elem<8, uint8_t> E.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // RS variant: only FrgTypeB is declared here, as a GMMA::Major::K smem_desc.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x192x64 Shape_MNK plus the ThrID and A/B/C/E layout aliases.
+  using Shape_MNK = Shape<_64, _192, _64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<192, 64>;
+  using CLayout   = GMMA::CLayout_64x192;
+  using ELayout   = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB, GMMA::SparseSel kSpSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_F32E4M3E5M2_SS_TN<kScaleA, kScaleB, kSpSel>>
+{
+  // Value types: float C/D, sparse_elem<2, float_e4m3_t> A, float_e5m2_t B, sparse_elem<8, uint8_t> E.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // SS variant: the A and B fragment types are both GMMA::Major::K smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x192x64 Shape_MNK plus the ThrID and A/B/C/E layout aliases.
+  using Shape_MNK = Shape<_64, _192, _64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<192, 64>;
+  using CLayout   = GMMA::CLayout_64x192;
+  using ELayout   = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB, GMMA::SparseSel kSpSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_F32E4M3E5M2_RS_TN<kScaleA, kScaleB, kSpSel>>
+{
+  // Value types: float C/D, sparse_elem<2, float_e4m3_t> A, float_e5m2_t B, sparse_elem<8, uint8_t> E.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // RS variant: only FrgTypeB is declared here, as a GMMA::Major::K smem_desc.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x192x64 Shape_MNK plus the ThrID and A/B/C/E layout aliases.
+  using Shape_MNK = Shape<_64, _192, _64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<192, 64>;
+  using CLayout   = GMMA::CLayout_64x192;
+  using ELayout   = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB, GMMA::SparseSel kSpSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_F16E4M3E5M2_SS_TN<kScaleA, kScaleB, kSpSel>>
+{
+  // Value types: half_t C/D, sparse_elem<2, float_e4m3_t> A, float_e5m2_t B, sparse_elem<8, uint8_t> E.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // SS variant: the A and B fragment types are both GMMA::Major::K smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x256x64 Shape_MNK plus the ThrID and A/B/C/E layout aliases.
+  using Shape_MNK = Shape<_64, _256, _64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<256, 64>;
+  using CLayout   = GMMA::CLayout_64x256;
+  using ELayout   = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB, GMMA::SparseSel kSpSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_F16E4M3E5M2_RS_TN<kScaleA, kScaleB, kSpSel>>
+{
+  // Value types: half_t C/D, sparse_elem<2, float_e4m3_t> A, float_e5m2_t B, sparse_elem<8, uint8_t> E.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // RS variant: only FrgTypeB is declared here, as a GMMA::Major::K smem_desc.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x256x64 Shape_MNK plus the ThrID and A/B/C/E layout aliases.
+  using Shape_MNK = Shape<_64, _256, _64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<256, 64>;
+  using CLayout   = GMMA::CLayout_64x256;
+  using ELayout   = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB, GMMA::SparseSel kSpSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_F32E4M3E5M2_SS_TN<kScaleA, kScaleB, kSpSel>>
+{
+  // Value types: float C/D, sparse_elem<2, float_e4m3_t> A, float_e5m2_t B, sparse_elem<8, uint8_t> E.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // SS variant: the A and B fragment types are both GMMA::Major::K smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x256x64 Shape_MNK plus the ThrID and A/B/C/E layout aliases.
+  using Shape_MNK = Shape<_64, _256, _64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<256, 64>;
+  using CLayout   = GMMA::CLayout_64x256;
+  using ELayout   = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB, GMMA::SparseSel kSpSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_F32E4M3E5M2_RS_TN<kScaleA, kScaleB, kSpSel>>
+{
+  // Value types: float C/D, sparse_elem<2, float_e4m3_t> A, float_e5m2_t B, sparse_elem<8, uint8_t> E.
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // RS variant: only FrgTypeB is declared here, as a GMMA::Major::K smem_desc.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x256x64 Shape_MNK plus the ThrID and A/B/C/E layout aliases.
+  using Shape_MNK = Shape<_64, _256, _64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<256, 64>;
+  using CLayout   = GMMA::CLayout_64x256;
+  using ELayout   = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB, GMMA::SparseSel kSpSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_F16E5M2E4M3_SS_TN<kScaleA, kScaleB, kSpSel>>
+{
+  // Value types: half_t C/D, sparse_elem<2, float_e5m2_t> A, float_e4m3_t B, sparse_elem<8, uint8_t> E.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // SS variant: the A and B fragment types are both GMMA::Major::K smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x8x64 Shape_MNK plus the ThrID and A/B/C/E layout aliases.
+  using Shape_MNK = Shape<_64, _8, _64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<8, 64>;
+  using CLayout   = GMMA::CLayout_64x8;
+  using ELayout   = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB, GMMA::SparseSel kSpSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_F16E5M2E4M3_RS_TN<kScaleA, kScaleB, kSpSel>>
+{
+  // Value types: half_t C/D, sparse_elem<2, float_e5m2_t> A, float_e4m3_t B, sparse_elem<8, uint8_t> E.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // RS variant: only FrgTypeB is declared here, as a GMMA::Major::K smem_desc.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x8x64 Shape_MNK plus the ThrID and A/B/C/E layout aliases.
+  using Shape_MNK = Shape<_64, _8, _64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<8, 64>;
+  using CLayout   = GMMA::CLayout_64x8;
+  using ELayout   = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB, GMMA::SparseSel kSpSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_F32E5M2E4M3_SS_TN<kScaleA, kScaleB, kSpSel>>
+{
+  // Value types: float C/D, sparse_elem<2, float_e5m2_t> A, float_e4m3_t B, sparse_elem<8, uint8_t> E.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // SS variant: the A and B fragment types are both GMMA::Major::K smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x8x64 Shape_MNK plus the ThrID and A/B/C/E layout aliases.
+  using Shape_MNK = Shape<_64, _8, _64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<8, 64>;
+  using CLayout   = GMMA::CLayout_64x8;
+  using ELayout   = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn kScaleA, GMMA::ScaleIn kScaleB, GMMA::SparseSel kSpSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_F32E5M2E4M3_RS_TN<kScaleA, kScaleB, kSpSel>>
+{
+  // Value types: float C/D, sparse_elem<2, float_e5m2_t> A, float_e4m3_t B, sparse_elem<8, uint8_t> E.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // RS variant: only FrgTypeB is declared here, as a GMMA::Major::K smem_desc.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // 64x8x64 Shape_MNK plus the ThrID and A/B/C/E layout aliases.
+  using Shape_MNK = Shape<_64, _8, _64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<8, 64>;
+  using CLayout   = GMMA::CLayout_64x8;
+  using ELayout   = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, starts as ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // Sparse 64x16x64 SS_TN traits (any scaleA/scaleB/spsel): A = sparse e5m2, B = e4m3, C/D = half_t.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values tagged sparse_elem<2, ...> (presumably 2:1 compressed A; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // uint8_t values tagged sparse_elem<8, ...> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS form: A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_16,_64>;  // M = 64, N = 16, K = 64
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64, 64>;  // 64 x 64, matching M and K
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 16, 64>;  // 16 x 64, matching N and K
+  using CLayout = GMMA::CLayout_64x16;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // Sparse 64x16x64 RS_TN traits (any scaleA/scaleB/spsel): A = sparse e5m2, B = e4m3, C/D = half_t.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values tagged sparse_elem<2, ...> (presumably 2:1 compressed A; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // uint8_t values tagged sparse_elem<8, ...> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc; no FrgTypeA alias in this RS form
+
+  using Shape_MNK = Shape<_64,_16,_64>;  // M = 64, N = 16, K = 64
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ALayout_64x64;  // RS form uses ALayout_64x64, not ABLayout<64, 64>
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 16, 64>;  // 16 x 64, matching N and K
+  using CLayout = GMMA::CLayout_64x16;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // Sparse 64x16x64 SS_TN traits (any scaleA/scaleB/spsel): A = sparse e5m2, B = e4m3, C/D = float.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values tagged sparse_elem<2, ...> (presumably 2:1 compressed A; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // uint8_t values tagged sparse_elem<8, ...> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS form: A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_16,_64>;  // M = 64, N = 16, K = 64
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64, 64>;  // 64 x 64, matching M and K
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 16, 64>;  // 16 x 64, matching N and K
+  using CLayout = GMMA::CLayout_64x16;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // Sparse 64x16x64 RS_TN traits (any scaleA/scaleB/spsel): A = sparse e5m2, B = e4m3, C/D = float.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values tagged sparse_elem<2, ...> (presumably 2:1 compressed A; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // uint8_t values tagged sparse_elem<8, ...> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc; no FrgTypeA alias in this RS form
+
+  using Shape_MNK = Shape<_64,_16,_64>;  // M = 64, N = 16, K = 64
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ALayout_64x64;  // RS form uses ALayout_64x64, not ABLayout<64, 64>
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 16, 64>;  // 16 x 64, matching N and K
+  using CLayout = GMMA::CLayout_64x16;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // Sparse 64x32x64 SS_TN traits (any scaleA/scaleB/spsel): A = sparse e5m2, B = e4m3, C/D = half_t.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values tagged sparse_elem<2, ...> (presumably 2:1 compressed A; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // uint8_t values tagged sparse_elem<8, ...> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS form: A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_32,_64>;  // M = 64, N = 32, K = 64
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64, 64>;  // 64 x 64, matching M and K
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 32, 64>;  // 32 x 64, matching N and K
+  using CLayout = GMMA::CLayout_64x32;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // Sparse 64x32x64 RS_TN traits (any scaleA/scaleB/spsel): A = sparse e5m2, B = e4m3, C/D = half_t.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values tagged sparse_elem<2, ...> (presumably 2:1 compressed A; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // uint8_t values tagged sparse_elem<8, ...> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc; no FrgTypeA alias in this RS form
+
+  using Shape_MNK = Shape<_64,_32,_64>;  // M = 64, N = 32, K = 64
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ALayout_64x64;  // RS form uses ALayout_64x64, not ABLayout<64, 64>
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 32, 64>;  // 32 x 64, matching N and K
+  using CLayout = GMMA::CLayout_64x32;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // Sparse 64x32x64 SS_TN traits (any scaleA/scaleB/spsel): A = sparse e5m2, B = e4m3, C/D = float.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values tagged sparse_elem<2, ...> (presumably 2:1 compressed A; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // uint8_t values tagged sparse_elem<8, ...> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS form: A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_32,_64>;  // M = 64, N = 32, K = 64
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64, 64>;  // 64 x 64, matching M and K
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 32, 64>;  // 32 x 64, matching N and K
+  using CLayout = GMMA::CLayout_64x32;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // Sparse 64x32x64 RS_TN traits (any scaleA/scaleB/spsel): A = sparse e5m2, B = e4m3, C/D = float.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values tagged sparse_elem<2, ...> (presumably 2:1 compressed A; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // uint8_t values tagged sparse_elem<8, ...> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc; no FrgTypeA alias in this RS form
+
+  using Shape_MNK = Shape<_64,_32,_64>;  // M = 64, N = 32, K = 64
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ALayout_64x64;  // RS form uses ALayout_64x64, not ABLayout<64, 64>
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 32, 64>;  // 32 x 64, matching N and K
+  using CLayout = GMMA::CLayout_64x32;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // Sparse 64x64x64 SS_TN traits (any scaleA/scaleB/spsel): A = sparse e5m2, B = e4m3, C/D = half_t.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values tagged sparse_elem<2, ...> (presumably 2:1 compressed A; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // uint8_t values tagged sparse_elem<8, ...> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS form: A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_64,_64>;  // M = 64, N = 64, K = 64
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64, 64>;  // 64 x 64, matching M and K
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 64, 64>;  // 64 x 64, matching N and K
+  using CLayout = GMMA::CLayout_64x64;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // Sparse 64x64x64 RS_TN traits (any scaleA/scaleB/spsel): A = sparse e5m2, B = e4m3, C/D = half_t.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values tagged sparse_elem<2, ...> (presumably 2:1 compressed A; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // uint8_t values tagged sparse_elem<8, ...> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc; no FrgTypeA alias in this RS form
+
+  using Shape_MNK = Shape<_64,_64,_64>;  // M = 64, N = 64, K = 64
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ALayout_64x64;  // RS form uses ALayout_64x64, not ABLayout<64, 64>
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 64, 64>;  // 64 x 64, matching N and K
+  using CLayout = GMMA::CLayout_64x64;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // Sparse 64x64x64 SS_TN traits (any scaleA/scaleB/spsel): A = sparse e5m2, B = e4m3, C/D = float.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values tagged sparse_elem<2, ...> (presumably 2:1 compressed A; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // uint8_t values tagged sparse_elem<8, ...> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS form: A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_64,_64>;  // M = 64, N = 64, K = 64
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64, 64>;  // 64 x 64, matching M and K
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 64, 64>;  // 64 x 64, matching N and K
+  using CLayout = GMMA::CLayout_64x64;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // Sparse 64x64x64 RS_TN traits (any scaleA/scaleB/spsel): A = sparse e5m2, B = e4m3, C/D = float.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values tagged sparse_elem<2, ...> (presumably 2:1 compressed A; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // uint8_t values tagged sparse_elem<8, ...> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc; no FrgTypeA alias in this RS form
+
+  using Shape_MNK = Shape<_64,_64,_64>;  // M = 64, N = 64, K = 64
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ALayout_64x64;  // RS form uses ALayout_64x64, not ABLayout<64, 64>
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 64, 64>;  // 64 x 64, matching N and K
+  using CLayout = GMMA::CLayout_64x64;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // Sparse 64x96x64 SS_TN traits (any scaleA/scaleB/spsel): A = sparse e5m2, B = e4m3, C/D = half_t.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values tagged sparse_elem<2, ...> (presumably 2:1 compressed A; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // uint8_t values tagged sparse_elem<8, ...> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS form: A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_96,_64>;  // M = 64, N = 96, K = 64
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64, 64>;  // 64 x 64, matching M and K
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 96, 64>;  // 96 x 64, matching N and K
+  using CLayout = GMMA::CLayout_64x96;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // Sparse 64x96x64 RS_TN traits (any scaleA/scaleB/spsel): A = sparse e5m2, B = e4m3, C/D = half_t.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values tagged sparse_elem<2, ...> (presumably 2:1 compressed A; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // uint8_t values tagged sparse_elem<8, ...> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc; no FrgTypeA alias in this RS form
+
+  using Shape_MNK = Shape<_64,_96,_64>;  // M = 64, N = 96, K = 64
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ALayout_64x64;  // RS form uses ALayout_64x64, not ABLayout<64, 64>
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 96, 64>;  // 96 x 64, matching N and K
+  using CLayout = GMMA::CLayout_64x96;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // Sparse 64x96x64 SS_TN traits (any scaleA/scaleB/spsel): A = sparse e5m2, B = e4m3, C/D = float.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values tagged sparse_elem<2, ...> (presumably 2:1 compressed A; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // uint8_t values tagged sparse_elem<8, ...> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS form: A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_96,_64>;  // M = 64, N = 96, K = 64
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64, 64>;  // 64 x 64, matching M and K
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 96, 64>;  // 96 x 64, matching N and K
+  using CLayout = GMMA::CLayout_64x96;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // Sparse 64x96x64 RS_TN traits (any scaleA/scaleB/spsel): A = sparse e5m2, B = e4m3, C/D = float.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values tagged sparse_elem<2, ...> (presumably 2:1 compressed A; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // uint8_t values tagged sparse_elem<8, ...> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc; no FrgTypeA alias in this RS form
+
+  using Shape_MNK = Shape<_64,_96,_64>;  // M = 64, N = 96, K = 64
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ALayout_64x64;  // RS form uses ALayout_64x64, not ABLayout<64, 64>
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 96, 64>;  // 96 x 64, matching N and K
+  using CLayout = GMMA::CLayout_64x96;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // Sparse 64x128x64 SS_TN traits (any scaleA/scaleB/spsel): A = sparse e5m2, B = e4m3, C/D = half_t.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values tagged sparse_elem<2, ...> (presumably 2:1 compressed A; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // uint8_t values tagged sparse_elem<8, ...> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS form: A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_128,_64>;  // M = 64, N = 128, K = 64
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64, 64>;  // 64 x 64, matching M and K
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<128, 64>;  // 128 x 64, matching N and K
+  using CLayout = GMMA::CLayout_64x128;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // Sparse 64x128x64 RS_TN traits (any scaleA/scaleB/spsel): A = sparse e5m2, B = e4m3, C/D = half_t.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values tagged sparse_elem<2, ...> (presumably 2:1 compressed A; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // uint8_t values tagged sparse_elem<8, ...> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc; no FrgTypeA alias in this RS form
+
+  using Shape_MNK = Shape<_64,_128,_64>;  // M = 64, N = 128, K = 64
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ALayout_64x64;  // RS form uses ALayout_64x64, not ABLayout<64, 64>
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<128, 64>;  // 128 x 64, matching N and K
+  using CLayout = GMMA::CLayout_64x128;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // Sparse 64x128x64 SS_TN traits (any scaleA/scaleB/spsel): A = sparse e5m2, B = e4m3, C/D = float.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values tagged sparse_elem<2, ...> (presumably 2:1 compressed A; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // uint8_t values tagged sparse_elem<8, ...> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS form: A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_128,_64>;  // M = 64, N = 128, K = 64
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64, 64>;  // 64 x 64, matching M and K
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<128, 64>;  // 128 x 64, matching N and K
+  using CLayout = GMMA::CLayout_64x128;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // Sparse 64x128x64 RS_TN traits (any scaleA/scaleB/spsel): A = sparse e5m2, B = e4m3, C/D = float.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values tagged sparse_elem<2, ...> (presumably 2:1 compressed A; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // uint8_t values tagged sparse_elem<8, ...> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc; no FrgTypeA alias in this RS form
+
+  using Shape_MNK = Shape<_64,_128,_64>;  // M = 64, N = 128, K = 64
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ALayout_64x64;  // RS form uses ALayout_64x64, not ABLayout<64, 64>
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<128, 64>;  // 128 x 64, matching N and K
+  using CLayout = GMMA::CLayout_64x128;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // Sparse 64x192x64 SS_TN traits (any scaleA/scaleB/spsel): A = sparse e5m2, B = e4m3, C/D = half_t.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values tagged sparse_elem<2, ...> (presumably 2:1 compressed A; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // uint8_t values tagged sparse_elem<8, ...> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS form: A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_192,_64>;  // M = 64, N = 192, K = 64
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64, 64>;  // 64 x 64, matching M and K
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<192, 64>;  // 192 x 64, matching N and K
+  using CLayout = GMMA::CLayout_64x192;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // Sparse 64x192x64 RS_TN traits (any scaleA/scaleB/spsel): A = sparse e5m2, B = e4m3, C/D = half_t.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values tagged sparse_elem<2, ...> (presumably 2:1 compressed A; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // uint8_t values tagged sparse_elem<8, ...> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc; no FrgTypeA alias in this RS form
+
+  using Shape_MNK = Shape<_64,_192,_64>;  // M = 64, N = 192, K = 64
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ALayout_64x64;  // RS form uses ALayout_64x64, not ABLayout<64, 64>
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<192, 64>;  // 192 x 64, matching N and K
+  using CLayout = GMMA::CLayout_64x192;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // Sparse 64x192x64 SS_TN traits (any scaleA/scaleB/spsel): A = sparse e5m2, B = e4m3, C/D = float.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values tagged sparse_elem<2, ...> (presumably 2:1 compressed A; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // uint8_t values tagged sparse_elem<8, ...> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS form: A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_192,_64>;  // M = 64, N = 192, K = 64
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64, 64>;  // 64 x 64, matching M and K
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<192, 64>;  // 192 x 64, matching N and K
+  using CLayout = GMMA::CLayout_64x192;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // Sparse 64x192x64 RS_TN traits (any scaleA/scaleB/spsel): A = sparse e5m2, B = e4m3, C/D = float.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values tagged sparse_elem<2, ...> (presumably 2:1 compressed A; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // uint8_t values tagged sparse_elem<8, ...> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc; no FrgTypeA alias in this RS form
+
+  using Shape_MNK = Shape<_64,_192,_64>;  // M = 64, N = 192, K = 64
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ALayout_64x64;  // RS form uses ALayout_64x64, not ABLayout<64, 64>
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<192, 64>;  // 192 x 64, matching N and K
+  using CLayout = GMMA::CLayout_64x192;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // Sparse 64x256x64 SS_TN traits (any scaleA/scaleB/spsel): A = sparse e5m2, B = e4m3, C/D = half_t.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values tagged sparse_elem<2, ...> (presumably 2:1 compressed A; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // uint8_t values tagged sparse_elem<8, ...> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS form: A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_256,_64>;  // M = 64, N = 256, K = 64
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64, 64>;  // 64 x 64, matching M and K
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<256, 64>;  // 256 x 64, matching N and K
+  using CLayout = GMMA::CLayout_64x256;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // Sparse 64x256x64 RS_TN traits (any scaleA/scaleB/spsel): A = sparse e5m2, B = e4m3, C/D = half_t.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values tagged sparse_elem<2, ...> (presumably 2:1 compressed A; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // uint8_t values tagged sparse_elem<8, ...> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc; no FrgTypeA alias in this RS form
+
+  using Shape_MNK = Shape<_64,_256,_64>;  // M = 64, N = 256, K = 64
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ALayout_64x64;  // RS form uses ALayout_64x64, not ABLayout<64, 64>
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<256, 64>;  // 256 x 64, matching N and K
+  using CLayout = GMMA::CLayout_64x256;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // Sparse 64x256x64 SS_TN traits (any scaleA/scaleB/spsel): A = sparse e5m2, B = e4m3, C/D = float.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values tagged sparse_elem<2, ...> (presumably 2:1 compressed A; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // uint8_t values tagged sparse_elem<8, ...> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS form: A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_256,_64>;  // M = 64, N = 256, K = 64
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64, 64>;  // 64 x 64, matching M and K
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<256, 64>;  // 256 x 64, matching N and K
+  using CLayout = GMMA::CLayout_64x256;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // Sparse 64x256x64 RS_TN traits (any scaleA/scaleB/spsel): A = sparse e5m2, B = e4m3, C/D = float.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values tagged sparse_elem<2, ...> (presumably 2:1 compressed A; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // uint8_t values tagged sparse_elem<8, ...> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc; no FrgTypeA alias in this RS form
+
+  using Shape_MNK = Shape<_64,_256,_64>;  // M = 64, N = 256, K = 64
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ALayout_64x64;  // RS form uses ALayout_64x64, not ABLayout<64, 64>
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<256, 64>;  // 256 x 64, matching N and K
+  using CLayout = GMMA::CLayout_64x256;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{  // Sparse 64x8x64 SS_TN traits (any scaleA/scaleB/spsel): A = sparse e5m2, B = e5m2, C/D = half_t.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values tagged sparse_elem<2, ...> (presumably 2:1 compressed A; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // uint8_t values tagged sparse_elem<8, ...> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS form: A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_8,_64>;  // M = 64, N = 8, K = 64
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64, 64>;  // 64 x 64, matching M and K
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<  8, 64>;  // 8 x 64, matching N and K
+  using CLayout = GMMA::CLayout_64x8;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{  // Sparse 64x8x64 RS_TN traits (any scaleA/scaleB/spsel): A = sparse e5m2, B = e5m2, C/D = half_t.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values tagged sparse_elem<2, ...> (presumably 2:1 compressed A; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // uint8_t values tagged sparse_elem<8, ...> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc; no FrgTypeA alias in this RS form
+
+  using Shape_MNK = Shape<_64,_8,_64>;  // M = 64, N = 8, K = 64
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ALayout_64x64;  // RS form uses ALayout_64x64, not ABLayout<64, 64>
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<  8, 64>;  // 8 x 64, matching N and K
+  using CLayout = GMMA::CLayout_64x8;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{  // Sparse 64x8x64 SS_TN traits (any scaleA/scaleB/spsel): A = sparse e5m2, B = e5m2, C/D = float.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values tagged sparse_elem<2, ...> (presumably 2:1 compressed A; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // uint8_t values tagged sparse_elem<8, ...> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS form: A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_8,_64>;  // M = 64, N = 8, K = 64
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64, 64>;  // 64 x 64, matching M and K
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<  8, 64>;  // 8 x 64, matching N and K
+  using CLayout = GMMA::CLayout_64x8;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{  // Sparse 64x8x64 RS_TN traits (any scaleA/scaleB/spsel): A = sparse e5m2, B = e5m2, C/D = float.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values tagged sparse_elem<2, ...> (presumably 2:1 compressed A; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // uint8_t values tagged sparse_elem<8, ...> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc; no FrgTypeA alias in this RS form
+
+  using Shape_MNK = Shape<_64,_8,_64>;  // M = 64, N = 8, K = 64
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ALayout_64x64;  // RS form uses ALayout_64x64, not ABLayout<64, 64>
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<  8, 64>;  // 8 x 64, matching N and K
+  using CLayout = GMMA::CLayout_64x8;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{  // Sparse 64x16x64 SS_TN traits (any scaleA/scaleB/spsel): A = sparse e5m2, B = e5m2, C/D = half_t.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values tagged sparse_elem<2, ...> (presumably 2:1 compressed A; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // uint8_t values tagged sparse_elem<8, ...> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS form: A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_16,_64>;  // M = 64, N = 16, K = 64
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64, 64>;  // 64 x 64, matching M and K
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 16, 64>;  // 16 x 64, matching N and K
+  using CLayout = GMMA::CLayout_64x16;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{  // Sparse 64x16x64 RS_TN traits (any scaleA/scaleB/spsel): A = sparse e5m2, B = e5m2, C/D = half_t.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values tagged sparse_elem<2, ...> (presumably 2:1 compressed A; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // uint8_t values tagged sparse_elem<8, ...> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc; no FrgTypeA alias in this RS form
+
+  using Shape_MNK = Shape<_64,_16,_64>;  // M = 64, N = 16, K = 64
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ALayout_64x64;  // RS form uses ALayout_64x64, not ABLayout<64, 64>
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 16, 64>;  // 16 x 64, matching N and K
+  using CLayout = GMMA::CLayout_64x16;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{  // Sparse 64x16x64 SS_TN traits (any scaleA/scaleB/spsel): A = sparse e5m2, B = e5m2, C/D = float.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values tagged sparse_elem<2, ...> (presumably 2:1 compressed A; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // uint8_t values tagged sparse_elem<8, ...> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS form: A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_16,_64>;  // M = 64, N = 16, K = 64
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64, 64>;  // 64 x 64, matching M and K
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 16, 64>;  // 16 x 64, matching N and K
+  using CLayout = GMMA::CLayout_64x16;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{  // Sparse 64x16x64 RS_TN traits (any scaleA/scaleB/spsel): A = sparse e5m2, B = e5m2, C/D = float.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values tagged sparse_elem<2, ...> (presumably 2:1 compressed A; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // uint8_t values tagged sparse_elem<8, ...> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc; no FrgTypeA alias in this RS form
+
+  using Shape_MNK = Shape<_64,_16,_64>;  // M = 64, N = 16, K = 64
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ALayout_64x64;  // RS form uses ALayout_64x64, not ABLayout<64, 64>
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 16, 64>;  // 16 x 64, matching N and K
+  using CLayout = GMMA::CLayout_64x16;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{  // Sparse 64x32x64 SS_TN traits (any scaleA/scaleB/spsel): A = sparse e5m2, B = e5m2, C/D = half_t.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values tagged sparse_elem<2, ...> (presumably 2:1 compressed A; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // uint8_t values tagged sparse_elem<8, ...> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS form: A and B fragments are both K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_32,_64>;  // M = 64, N = 32, K = 64
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ABLayout< 64, 64>;  // 64 x 64, matching M and K
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 32, 64>;  // 32 x 64, matching N and K
+  using CLayout = GMMA::CLayout_64x32;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{  // Sparse 64x32x64 RS_TN traits (any scaleA/scaleB/spsel): A = sparse e5m2, B = e5m2, C/D = half_t.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values tagged sparse_elem<2, ...> (presumably 2:1 compressed A; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // uint8_t values tagged sparse_elem<8, ...> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a K-major smem_desc; no FrgTypeA alias in this RS form
+
+  using Shape_MNK = Shape<_64,_32,_64>;  // M = 64, N = 32, K = 64
+  using ThrID   = Layout<_128>;  // 128 thread indices
+  using ALayout = GMMA::ALayout_64x64;  // RS form uses ALayout_64x64, not ABLayout<64, 64>
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 32, 64>;  // 32 x 64, matching N and K
+  using CLayout = GMMA::CLayout_64x32;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // sole data member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>> // Traits specialization for the SM90 sparse GMMA op with M=64, N=32, K=64 (per Shape_MNK); A/B values are float_e5m2_t, C/D values are float.
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>; // A is wrapped in sparse_elem<2, ...>; B below is a plain float_e5m2_t.
+  using ValTypeE = sparse_elem<8, uint8_t>; // NOTE(review): presumably the sparsity-metadata operand -- confirm against the op definition.
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>; // Both A and B fragment types are smem_desc with Major::K hard-coded.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_32,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>; // A and B both use ABLayout instantiations: <M, K> for A and <N, K> for B.
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 32, 64>;
+  using CLayout = GMMA::CLayout_64x32;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // Non-static member, default-initialized to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>> // Traits specialization for the SM90 sparse GMMA op with M=64, N=32, K=64 (per Shape_MNK); A/B values are float_e5m2_t, C/D values are float.
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>; // A is wrapped in sparse_elem<2, ...>; B below is a plain float_e5m2_t.
+  using ValTypeE = sparse_elem<8, uint8_t>; // NOTE(review): presumably the sparsity-metadata operand -- confirm against the op definition.
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>; // Only FrgTypeB is declared (smem_desc with Major::K); this RS specialization declares no FrgTypeA.
+
+  using Shape_MNK = Shape<_64,_32,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64; // A uses the dedicated GMMA::ALayout_64x64 type, whereas B uses an ABLayout<N, K> instantiation.
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 32, 64>;
+  using CLayout = GMMA::CLayout_64x32;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // Non-static member, default-initialized to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>> // Traits specialization for the SM90 sparse GMMA op with M=64, N=64, K=64 (per Shape_MNK); A/B values are float_e5m2_t, C/D values are half_t.
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>; // A is wrapped in sparse_elem<2, ...>; B below is a plain float_e5m2_t.
+  using ValTypeE = sparse_elem<8, uint8_t>; // NOTE(review): presumably the sparsity-metadata operand -- confirm against the op definition.
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>; // Both A and B fragment types are smem_desc with Major::K hard-coded.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_64,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>; // A and B both use ABLayout instantiations: <M, K> for A and <N, K> for B.
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 64, 64>;
+  using CLayout = GMMA::CLayout_64x64;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // Non-static member, default-initialized to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>> // Traits specialization for the SM90 sparse GMMA op with M=64, N=64, K=64 (per Shape_MNK); A/B values are float_e5m2_t, C/D values are half_t.
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>; // A is wrapped in sparse_elem<2, ...>; B below is a plain float_e5m2_t.
+  using ValTypeE = sparse_elem<8, uint8_t>; // NOTE(review): presumably the sparsity-metadata operand -- confirm against the op definition.
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>; // Only FrgTypeB is declared (smem_desc with Major::K); this RS specialization declares no FrgTypeA.
+
+  using Shape_MNK = Shape<_64,_64,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64; // A uses the dedicated GMMA::ALayout_64x64 type, whereas B uses an ABLayout<N, K> instantiation.
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 64, 64>;
+  using CLayout = GMMA::CLayout_64x64;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // Non-static member, default-initialized to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>> // Traits specialization for the SM90 sparse GMMA op with M=64, N=64, K=64 (per Shape_MNK); A/B values are float_e5m2_t, C/D values are float.
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>; // A is wrapped in sparse_elem<2, ...>; B below is a plain float_e5m2_t.
+  using ValTypeE = sparse_elem<8, uint8_t>; // NOTE(review): presumably the sparsity-metadata operand -- confirm against the op definition.
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>; // Both A and B fragment types are smem_desc with Major::K hard-coded.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_64,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>; // A and B both use ABLayout instantiations: <M, K> for A and <N, K> for B.
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 64, 64>;
+  using CLayout = GMMA::CLayout_64x64;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // Non-static member, default-initialized to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>> // Traits specialization for the SM90 sparse GMMA op with M=64, N=64, K=64 (per Shape_MNK); A/B values are float_e5m2_t, C/D values are float.
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>; // A is wrapped in sparse_elem<2, ...>; B below is a plain float_e5m2_t.
+  using ValTypeE = sparse_elem<8, uint8_t>; // NOTE(review): presumably the sparsity-metadata operand -- confirm against the op definition.
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>; // Only FrgTypeB is declared (smem_desc with Major::K); this RS specialization declares no FrgTypeA.
+
+  using Shape_MNK = Shape<_64,_64,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64; // A uses the dedicated GMMA::ALayout_64x64 type, whereas B uses an ABLayout<N, K> instantiation.
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 64, 64>;
+  using CLayout = GMMA::CLayout_64x64;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // Non-static member, default-initialized to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>> // Traits specialization for the SM90 sparse GMMA op with M=64, N=96, K=64 (per Shape_MNK); A/B values are float_e5m2_t, C/D values are half_t.
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>; // A is wrapped in sparse_elem<2, ...>; B below is a plain float_e5m2_t.
+  using ValTypeE = sparse_elem<8, uint8_t>; // NOTE(review): presumably the sparsity-metadata operand -- confirm against the op definition.
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>; // Both A and B fragment types are smem_desc with Major::K hard-coded.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_96,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>; // A and B both use ABLayout instantiations: <M, K> for A and <N, K> for B.
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 96, 64>;
+  using CLayout = GMMA::CLayout_64x96;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // Non-static member, default-initialized to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>> // Traits specialization for the SM90 sparse GMMA op with M=64, N=96, K=64 (per Shape_MNK); A/B values are float_e5m2_t, C/D values are half_t.
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>; // A is wrapped in sparse_elem<2, ...>; B below is a plain float_e5m2_t.
+  using ValTypeE = sparse_elem<8, uint8_t>; // NOTE(review): presumably the sparsity-metadata operand -- confirm against the op definition.
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>; // Only FrgTypeB is declared (smem_desc with Major::K); this RS specialization declares no FrgTypeA.
+
+  using Shape_MNK = Shape<_64,_96,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64; // A uses the dedicated GMMA::ALayout_64x64 type, whereas B uses an ABLayout<N, K> instantiation.
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 96, 64>;
+  using CLayout = GMMA::CLayout_64x96;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // Non-static member, default-initialized to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>> // Traits specialization for the SM90 sparse GMMA op with M=64, N=96, K=64 (per Shape_MNK); A/B values are float_e5m2_t, C/D values are float.
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>; // A is wrapped in sparse_elem<2, ...>; B below is a plain float_e5m2_t.
+  using ValTypeE = sparse_elem<8, uint8_t>; // NOTE(review): presumably the sparsity-metadata operand -- confirm against the op definition.
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>; // Both A and B fragment types are smem_desc with Major::K hard-coded.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_96,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>; // A and B both use ABLayout instantiations: <M, K> for A and <N, K> for B.
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 96, 64>;
+  using CLayout = GMMA::CLayout_64x96;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // Non-static member, default-initialized to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>> // Traits specialization for the SM90 sparse GMMA op with M=64, N=96, K=64 (per Shape_MNK); A/B values are float_e5m2_t, C/D values are float.
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>; // A is wrapped in sparse_elem<2, ...>; B below is a plain float_e5m2_t.
+  using ValTypeE = sparse_elem<8, uint8_t>; // NOTE(review): presumably the sparsity-metadata operand -- confirm against the op definition.
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>; // Only FrgTypeB is declared (smem_desc with Major::K); this RS specialization declares no FrgTypeA.
+
+  using Shape_MNK = Shape<_64,_96,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64; // A uses the dedicated GMMA::ALayout_64x64 type, whereas B uses an ABLayout<N, K> instantiation.
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 96, 64>;
+  using CLayout = GMMA::CLayout_64x96;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // Non-static member, default-initialized to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>> // Traits specialization for the SM90 sparse GMMA op with M=64, N=128, K=64 (per Shape_MNK); A/B values are float_e5m2_t, C/D values are half_t.
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>; // A is wrapped in sparse_elem<2, ...>; B below is a plain float_e5m2_t.
+  using ValTypeE = sparse_elem<8, uint8_t>; // NOTE(review): presumably the sparsity-metadata operand -- confirm against the op definition.
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>; // Both A and B fragment types are smem_desc with Major::K hard-coded.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_128,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>; // A and B both use ABLayout instantiations: <M, K> for A and <N, K> for B.
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<128, 64>;
+  using CLayout = GMMA::CLayout_64x128;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // Non-static member, default-initialized to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>> // Traits specialization for the SM90 sparse GMMA op with M=64, N=128, K=64 (per Shape_MNK); A/B values are float_e5m2_t, C/D values are half_t.
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>; // A is wrapped in sparse_elem<2, ...>; B below is a plain float_e5m2_t.
+  using ValTypeE = sparse_elem<8, uint8_t>; // NOTE(review): presumably the sparsity-metadata operand -- confirm against the op definition.
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>; // Only FrgTypeB is declared (smem_desc with Major::K); this RS specialization declares no FrgTypeA.
+
+  using Shape_MNK = Shape<_64,_128,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64; // A uses the dedicated GMMA::ALayout_64x64 type, whereas B uses an ABLayout<N, K> instantiation.
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<128, 64>;
+  using CLayout = GMMA::CLayout_64x128;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // Non-static member, default-initialized to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>> // Traits specialization for the SM90 sparse GMMA op with M=64, N=128, K=64 (per Shape_MNK); A/B values are float_e5m2_t, C/D values are float.
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>; // A is wrapped in sparse_elem<2, ...>; B below is a plain float_e5m2_t.
+  using ValTypeE = sparse_elem<8, uint8_t>; // NOTE(review): presumably the sparsity-metadata operand -- confirm against the op definition.
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>; // Both A and B fragment types are smem_desc with Major::K hard-coded.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_128,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>; // A and B both use ABLayout instantiations: <M, K> for A and <N, K> for B.
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<128, 64>;
+  using CLayout = GMMA::CLayout_64x128;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // Non-static member, default-initialized to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>> // Traits specialization for the SM90 sparse GMMA op with M=64, N=128, K=64 (per Shape_MNK); A/B values are float_e5m2_t, C/D values are float.
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>; // A is wrapped in sparse_elem<2, ...>; B below is a plain float_e5m2_t.
+  using ValTypeE = sparse_elem<8, uint8_t>; // NOTE(review): presumably the sparsity-metadata operand -- confirm against the op definition.
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>; // Only FrgTypeB is declared (smem_desc with Major::K); this RS specialization declares no FrgTypeA.
+
+  using Shape_MNK = Shape<_64,_128,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64; // A uses the dedicated GMMA::ALayout_64x64 type, whereas B uses an ABLayout<N, K> instantiation.
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<128, 64>;
+  using CLayout = GMMA::CLayout_64x128;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // Non-static member, default-initialized to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>> // Traits specialization for the SM90 sparse GMMA op with M=64, N=192, K=64 (per Shape_MNK); A/B values are float_e5m2_t, C/D values are half_t.
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>; // A is wrapped in sparse_elem<2, ...>; B below is a plain float_e5m2_t.
+  using ValTypeE = sparse_elem<8, uint8_t>; // NOTE(review): presumably the sparsity-metadata operand -- confirm against the op definition.
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>; // Both A and B fragment types are smem_desc with Major::K hard-coded.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_192,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>; // A and B both use ABLayout instantiations: <M, K> for A and <N, K> for B.
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<192, 64>;
+  using CLayout = GMMA::CLayout_64x192;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // Non-static member, default-initialized to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>> // Traits specialization for the SM90 sparse GMMA op with M=64, N=192, K=64 (per Shape_MNK); A/B values are float_e5m2_t, C/D values are half_t.
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>; // A is wrapped in sparse_elem<2, ...>; B below is a plain float_e5m2_t.
+  using ValTypeE = sparse_elem<8, uint8_t>; // NOTE(review): presumably the sparsity-metadata operand -- confirm against the op definition.
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>; // Only FrgTypeB is declared (smem_desc with Major::K); this RS specialization declares no FrgTypeA.
+
+  using Shape_MNK = Shape<_64,_192,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64; // A uses the dedicated GMMA::ALayout_64x64 type, whereas B uses an ABLayout<N, K> instantiation.
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<192, 64>;
+  using CLayout = GMMA::CLayout_64x192;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // Non-static member, default-initialized to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>> // Traits specialization for the SM90 sparse GMMA op with M=64, N=192, K=64 (per Shape_MNK); A/B values are float_e5m2_t, C/D values are float.
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>; // A is wrapped in sparse_elem<2, ...>; B below is a plain float_e5m2_t.
+  using ValTypeE = sparse_elem<8, uint8_t>; // NOTE(review): presumably the sparsity-metadata operand -- confirm against the op definition.
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>; // Both A and B fragment types are smem_desc with Major::K hard-coded.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_192,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>; // A and B both use ABLayout instantiations: <M, K> for A and <N, K> for B.
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<192, 64>;
+  using CLayout = GMMA::CLayout_64x192;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // Non-static member, default-initialized to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>> // Traits specialization for the SM90 sparse GMMA op with M=64, N=192, K=64 (per Shape_MNK); A/B values are float_e5m2_t, C/D values are float.
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>; // A is wrapped in sparse_elem<2, ...>; B below is a plain float_e5m2_t.
+  using ValTypeE = sparse_elem<8, uint8_t>; // NOTE(review): presumably the sparsity-metadata operand -- confirm against the op definition.
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>; // Only FrgTypeB is declared (smem_desc with Major::K); this RS specialization declares no FrgTypeA.
+
+  using Shape_MNK = Shape<_64,_192,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64; // A uses the dedicated GMMA::ALayout_64x64 type, whereas B uses an ABLayout<N, K> instantiation.
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<192, 64>;
+  using CLayout = GMMA::CLayout_64x192;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // Non-static member, default-initialized to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>> // Traits specialization for the SM90 sparse GMMA op with M=64, N=256, K=64 (per Shape_MNK); A/B values are float_e5m2_t, C/D values are half_t.
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>; // A is wrapped in sparse_elem<2, ...>; B below is a plain float_e5m2_t.
+  using ValTypeE = sparse_elem<8, uint8_t>; // NOTE(review): presumably the sparsity-metadata operand -- confirm against the op definition.
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>; // Both A and B fragment types are smem_desc with Major::K hard-coded.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_256,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>; // A and B both use ABLayout instantiations: <M, K> for A and <N, K> for B.
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<256, 64>;
+  using CLayout = GMMA::CLayout_64x256;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // Non-static member, default-initialized to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>> // Traits specialization for the SM90 sparse GMMA op with M=64, N=256, K=64 (per Shape_MNK); A/B values are float_e5m2_t, C/D values are half_t.
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>; // A is wrapped in sparse_elem<2, ...>; B below is a plain float_e5m2_t.
+  using ValTypeE = sparse_elem<8, uint8_t>; // NOTE(review): presumably the sparsity-metadata operand -- confirm against the op definition.
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>; // Only FrgTypeB is declared (smem_desc with Major::K); this RS specialization declares no FrgTypeA.
+
+  using Shape_MNK = Shape<_64,_256,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64; // A uses the dedicated GMMA::ALayout_64x64 type, whereas B uses an ABLayout<N, K> instantiation.
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<256, 64>;
+  using CLayout = GMMA::CLayout_64x256;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // Non-static member, default-initialized to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>> // Traits specialization for the SM90 sparse GMMA op with M=64, N=256, K=64 (per Shape_MNK); A/B values are float_e5m2_t, C/D values are float.
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>; // A is wrapped in sparse_elem<2, ...>; B below is a plain float_e5m2_t.
+  using ValTypeE = sparse_elem<8, uint8_t>; // NOTE(review): presumably the sparsity-metadata operand -- confirm against the op definition.
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>; // Both A and B fragment types are smem_desc with Major::K hard-coded.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_256,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>; // A and B both use ABLayout instantiations: <M, K> for A and <N, K> for B.
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<256, 64>;
+  using CLayout = GMMA::CLayout_64x256;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // Non-static member, default-initialized to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>> // Traits specialization for the SM90 sparse GMMA op with M=64, N=256, K=64 (per Shape_MNK); A/B values are float_e5m2_t, C/D values are float.
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>; // A is wrapped in sparse_elem<2, ...>; B below is a plain float_e5m2_t.
+  using ValTypeE = sparse_elem<8, uint8_t>; // NOTE(review): presumably the sparsity-metadata operand -- confirm against the op definition.
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>; // Only FrgTypeB is declared (smem_desc with Major::K); this RS specialization declares no FrgTypeA.
+
+  using Shape_MNK = Shape<_64,_256,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64; // A uses the dedicated GMMA::ALayout_64x64 type, whereas B uses an ABLayout<N, K> instantiation.
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<256, 64>;
+  using CLayout = GMMA::CLayout_64x256;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // Non-static member, default-initialized to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // end namespace cute
+
+#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
+#include "mma_traits_sm90_gmma_sparse_ext.hpp"
+#endif
\ No newline at end of file
diff --git a/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90_gmma_sparse_ext.hpp b/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90_gmma_sparse_ext.hpp
new file mode 100755
index 000000000..3680b7e13
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90_gmma_sparse_ext.hpp
@@ -0,0 +1,17335 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+ 
+#pragma once
+  
+#include <cute/arch/mma_sm90.hpp>
+#include <cute/atom/mma_traits.hpp>
+
+namespace cute {
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>> // Traits specialization for the SM90 sparse GMMA op with M=64, N=24, K=32 (per Shape_MNK); A, B, C and D values are all half_t.
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, half_t>; // A is wrapped in sparse_elem<2, ...>; B below is a plain half_t.
+  using ValTypeE = sparse_elem<8, uint8_t>; // NOTE(review): presumably the sparsity-metadata operand -- confirm against the op definition.
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>; // The smem_desc major for A and B is forwarded from the tnspA / tnspB template parameters.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_24,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>; // A and B both use ABLayout instantiations: <M, K> for A and <N, K> for B.
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout< 24, 32>;
+  using CLayout = GMMA::CLayout_64x24;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // Non-static member, default-initialized to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>> // Traits specialization for the SM90 sparse GMMA op with M=64, N=24, K=32 (per Shape_MNK); A, B, C and D values are all half_t.
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, half_t>; // A is wrapped in sparse_elem<2, ...>; B below is a plain half_t.
+  using ValTypeE = sparse_elem<8, uint8_t>; // NOTE(review): presumably the sparsity-metadata operand -- confirm against the op definition.
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // Only FrgTypeB is declared; inside this specialization tnspA is used solely to name the op type.
+
+  using Shape_MNK = Shape<_64,_24,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32; // A uses the dedicated GMMA::ALayout_64x32 type, whereas B uses an ABLayout<N, K> instantiation.
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout< 24, 32>;
+  using CLayout = GMMA::CLayout_64x24;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // Non-static member, default-initialized to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>> // Traits specialization for the SM90 sparse GMMA op with M=64, N=40, K=32 (per Shape_MNK); A, B, C and D values are all half_t.
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, half_t>; // A is wrapped in sparse_elem<2, ...>; B below is a plain half_t.
+  using ValTypeE = sparse_elem<8, uint8_t>; // NOTE(review): presumably the sparsity-metadata operand -- confirm against the op definition.
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>; // The smem_desc major for A and B is forwarded from the tnspA / tnspB template parameters.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_40,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>; // A and B both use ABLayout instantiations: <M, K> for A and <N, K> for B.
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout< 40, 32>;
+  using CLayout = GMMA::CLayout_64x40;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // Non-static member, default-initialized to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>> // Traits specialization for the SM90 sparse GMMA op with M=64, N=40, K=32 (per Shape_MNK); A, B, C and D values are all half_t.
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, half_t>; // A is wrapped in sparse_elem<2, ...>; B below is a plain half_t.
+  using ValTypeE = sparse_elem<8, uint8_t>; // NOTE(review): presumably the sparsity-metadata operand -- confirm against the op definition.
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // Only FrgTypeB is declared; inside this specialization tnspA is used solely to name the op type.
+
+  using Shape_MNK = Shape<_64,_40,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32; // A uses the dedicated GMMA::ALayout_64x32 type, whereas B uses an ABLayout<N, K> instantiation.
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout< 40, 32>;
+  using CLayout = GMMA::CLayout_64x40;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // Non-static member, default-initialized to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>> // Traits specialization for the SM90 sparse GMMA op with M=64, N=48, K=32 (per Shape_MNK); A, B, C and D values are all half_t.
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, half_t>; // A is wrapped in sparse_elem<2, ...>; B below is a plain half_t.
+  using ValTypeE = sparse_elem<8, uint8_t>; // NOTE(review): presumably the sparsity-metadata operand -- confirm against the op definition.
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>; // The smem_desc major for A and B is forwarded from the tnspA / tnspB template parameters.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_48,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>; // A and B both use ABLayout instantiations: <M, K> for A and <N, K> for B.
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout< 48, 32>;
+  using CLayout = GMMA::CLayout_64x48;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // Non-static member, default-initialized to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>> // Traits specialization for the SM90 sparse GMMA op with M=64, N=48, K=32 (per Shape_MNK); A, B, C and D values are all half_t.
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, half_t>; // A is wrapped in sparse_elem<2, ...>; B below is a plain half_t.
+  using ValTypeE = sparse_elem<8, uint8_t>; // NOTE(review): presumably the sparsity-metadata operand -- confirm against the op definition.
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // Only FrgTypeB is declared; inside this specialization tnspA is used solely to name the op type.
+
+  using Shape_MNK = Shape<_64,_48,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32; // A uses the dedicated GMMA::ALayout_64x32 type, whereas B uses an ABLayout<N, K> instantiation.
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout< 48, 32>;
+  using CLayout = GMMA::CLayout_64x48;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // Non-static member, default-initialized to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>> // Traits specialization for the SM90 sparse GMMA op with M=64, N=56, K=32 (per Shape_MNK); A, B, C and D values are all half_t.
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, half_t>; // A is wrapped in sparse_elem<2, ...>; B below is a plain half_t.
+  using ValTypeE = sparse_elem<8, uint8_t>; // NOTE(review): presumably the sparsity-metadata operand -- confirm against the op definition.
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>; // The smem_desc major for A and B is forwarded from the tnspA / tnspB template parameters.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_56,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>; // A and B both use ABLayout instantiations: <M, K> for A and <N, K> for B.
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout< 56, 32>;
+  using CLayout = GMMA::CLayout_64x56;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // Non-static member, default-initialized to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>> // Traits specialization for the SM90 sparse GMMA op with M=64, N=56, K=32 (per Shape_MNK); A, B, C and D values are all half_t.
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, half_t>; // A is wrapped in sparse_elem<2, ...>; B below is a plain half_t.
+  using ValTypeE = sparse_elem<8, uint8_t>; // NOTE(review): presumably the sparsity-metadata operand -- confirm against the op definition.
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // Only FrgTypeB is declared; inside this specialization tnspA is used solely to name the op type.
+
+  using Shape_MNK = Shape<_64,_56,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32; // A uses the dedicated GMMA::ALayout_64x32 type, whereas B uses an ABLayout<N, K> instantiation.
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout< 56, 32>;
+  using CLayout = GMMA::CLayout_64x56;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // Non-static member, default-initialized to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>> // Traits specialization for the SM90 sparse GMMA op with M=64, N=72, K=32 (per Shape_MNK); A, B, C and D values are all half_t.
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, half_t>; // A is wrapped in sparse_elem<2, ...>; B below is a plain half_t.
+  using ValTypeE = sparse_elem<8, uint8_t>; // NOTE(review): presumably the sparsity-metadata operand -- confirm against the op definition.
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>; // The smem_desc major for A and B is forwarded from the tnspA / tnspB template parameters.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_72,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>; // A and B both use ABLayout instantiations: <M, K> for A and <N, K> for B.
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout< 72, 32>;
+  using CLayout = GMMA::CLayout_64x72;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // Non-static member, default-initialized to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>> // Traits specialization for the SM90 sparse GMMA op with M=64, N=72, K=32 (per Shape_MNK); A, B, C and D values are all half_t.
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, half_t>; // A is wrapped in sparse_elem<2, ...>; B below is a plain half_t.
+  using ValTypeE = sparse_elem<8, uint8_t>; // NOTE(review): presumably the sparsity-metadata operand -- confirm against the op definition.
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // Only FrgTypeB is declared; inside this specialization tnspA is used solely to name the op type.
+
+  using Shape_MNK = Shape<_64,_72,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32; // A uses the dedicated GMMA::ALayout_64x32 type, whereas B uses an ABLayout<N, K> instantiation.
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout< 72, 32>;
+  using CLayout = GMMA::CLayout_64x72;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // Non-static member, default-initialized to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>> // Traits specialization for the SM90 sparse GMMA op with M=64, N=80, K=32 (per Shape_MNK); A, B, C and D values are all half_t.
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, half_t>; // A is wrapped in sparse_elem<2, ...>; B below is a plain half_t.
+  using ValTypeE = sparse_elem<8, uint8_t>; // NOTE(review): presumably the sparsity-metadata operand -- confirm against the op definition.
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>; // The smem_desc major for A and B is forwarded from the tnspA / tnspB template parameters.
+  using FrgTypeB = GMMA::smem_desc<tnspB>;
+
+  using Shape_MNK = Shape<_64,_80,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 32>; // A and B both use ABLayout instantiations: <M, K> for A and <N, K> for B.
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout< 80, 32>;
+  using CLayout = GMMA::CLayout_64x80;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // Non-static member, default-initialized to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>> // Traits specialization for the SM90 sparse GMMA op with M=64, N=80, K=32 (per Shape_MNK); A, B, C and D values are all half_t.
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, half_t>; // A is wrapped in sparse_elem<2, ...>; B below is a plain half_t.
+  using ValTypeE = sparse_elem<8, uint8_t>; // NOTE(review): presumably the sparsity-metadata operand -- confirm against the op definition.
+  using ValTypeB = half_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // Only FrgTypeB is declared; inside this specialization tnspA is used solely to name the op type.
+
+  using Shape_MNK = Shape<_64,_80,_32>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x32; // A uses the dedicated GMMA::ALayout_64x32 type, whereas B uses an ABLayout<N, K> instantiation.
+  using ELayout = GMMA::ELayout_64x32;
+  using BLayout = GMMA::ABLayout< 80, 32>;
+  using CLayout = GMMA::CLayout_64x80;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // Non-static member, default-initialized to ScaleOut::One.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the SM90 sparse GMMA 64x88x32 F16F16F16 "SS" op: value types, fragment types, shape, and layouts.
+  using ValTypeD = half_t; // value type of D
+  using ValTypeA = sparse_elem<2, half_t>; // A: half_t tagged with sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>; // E: uint8_t tagged with sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = half_t; // value type of B
+  using ValTypeC = half_t; // value type of C (same as D)
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>; // A fragment: GMMA::smem_desc parameterized by tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // B fragment: GMMA::smem_desc parameterized by tnspB
+
+  using Shape_MNK = Shape<_64,_88,_32>; // (M, N, K) = (64, 88, 32), as in the op name
+  using ThrID   = Layout<_128>; // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>; // A layout over the 64 x 32 (M x K) extents
+  using ELayout = GMMA::ELayout_64x32; // E layout for 64 x 32
+  using BLayout = GMMA::ABLayout< 88, 32>; // B layout over the 88 x 32 (N x K) extents
+  using CLayout = GMMA::CLayout_64x88; // C layout for the 64 x 88 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the SM90 sparse GMMA 64x88x32 F16F16F16 "RS" op; unlike the matching "SS" traits, no FrgTypeA is declared.
+  using ValTypeD = half_t; // value type of D
+  using ValTypeA = sparse_elem<2, half_t>; // A: half_t tagged with sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>; // E: uint8_t tagged with sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = half_t; // value type of B
+  using ValTypeC = half_t; // value type of C (same as D)
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // B fragment: GMMA::smem_desc parameterized by tnspB
+
+  using Shape_MNK = Shape<_64,_88,_32>; // (M, N, K) = (64, 88, 32), as in the op name
+  using ThrID   = Layout<_128>; // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32; // A layout: GMMA::ALayout_64x32 (not ABLayout as in the "SS" traits)
+  using ELayout = GMMA::ELayout_64x32; // E layout for 64 x 32
+  using BLayout = GMMA::ABLayout< 88, 32>; // B layout over the 88 x 32 (N x K) extents
+  using CLayout = GMMA::CLayout_64x88; // C layout for the 64 x 88 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the SM90 sparse GMMA 64x104x32 F16F16F16 "SS" op: value types, fragment types, shape, and layouts.
+  using ValTypeD = half_t; // value type of D
+  using ValTypeA = sparse_elem<2, half_t>; // A: half_t tagged with sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>; // E: uint8_t tagged with sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = half_t; // value type of B
+  using ValTypeC = half_t; // value type of C (same as D)
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>; // A fragment: GMMA::smem_desc parameterized by tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // B fragment: GMMA::smem_desc parameterized by tnspB
+
+  using Shape_MNK = Shape<_64,_104,_32>; // (M, N, K) = (64, 104, 32), as in the op name
+  using ThrID   = Layout<_128>; // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>; // A layout over the 64 x 32 (M x K) extents
+  using ELayout = GMMA::ELayout_64x32; // E layout for 64 x 32
+  using BLayout = GMMA::ABLayout<104, 32>; // B layout over the 104 x 32 (N x K) extents
+  using CLayout = GMMA::CLayout_64x104; // C layout for the 64 x 104 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the SM90 sparse GMMA 64x104x32 F16F16F16 "RS" op; unlike the matching "SS" traits, no FrgTypeA is declared.
+  using ValTypeD = half_t; // value type of D
+  using ValTypeA = sparse_elem<2, half_t>; // A: half_t tagged with sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>; // E: uint8_t tagged with sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = half_t; // value type of B
+  using ValTypeC = half_t; // value type of C (same as D)
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // B fragment: GMMA::smem_desc parameterized by tnspB
+
+  using Shape_MNK = Shape<_64,_104,_32>; // (M, N, K) = (64, 104, 32), as in the op name
+  using ThrID   = Layout<_128>; // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32; // A layout: GMMA::ALayout_64x32 (not ABLayout as in the "SS" traits)
+  using ELayout = GMMA::ELayout_64x32; // E layout for 64 x 32
+  using BLayout = GMMA::ABLayout<104, 32>; // B layout over the 104 x 32 (N x K) extents
+  using CLayout = GMMA::CLayout_64x104; // C layout for the 64 x 104 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the SM90 sparse GMMA 64x112x32 F16F16F16 "SS" op: value types, fragment types, shape, and layouts.
+  using ValTypeD = half_t; // value type of D
+  using ValTypeA = sparse_elem<2, half_t>; // A: half_t tagged with sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>; // E: uint8_t tagged with sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = half_t; // value type of B
+  using ValTypeC = half_t; // value type of C (same as D)
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>; // A fragment: GMMA::smem_desc parameterized by tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // B fragment: GMMA::smem_desc parameterized by tnspB
+
+  using Shape_MNK = Shape<_64,_112,_32>; // (M, N, K) = (64, 112, 32), as in the op name
+  using ThrID   = Layout<_128>; // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>; // A layout over the 64 x 32 (M x K) extents
+  using ELayout = GMMA::ELayout_64x32; // E layout for 64 x 32
+  using BLayout = GMMA::ABLayout<112, 32>; // B layout over the 112 x 32 (N x K) extents
+  using CLayout = GMMA::CLayout_64x112; // C layout for the 64 x 112 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the SM90 sparse GMMA 64x112x32 F16F16F16 "RS" op; unlike the matching "SS" traits, no FrgTypeA is declared.
+  using ValTypeD = half_t; // value type of D
+  using ValTypeA = sparse_elem<2, half_t>; // A: half_t tagged with sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>; // E: uint8_t tagged with sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = half_t; // value type of B
+  using ValTypeC = half_t; // value type of C (same as D)
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // B fragment: GMMA::smem_desc parameterized by tnspB
+
+  using Shape_MNK = Shape<_64,_112,_32>; // (M, N, K) = (64, 112, 32), as in the op name
+  using ThrID   = Layout<_128>; // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32; // A layout: GMMA::ALayout_64x32 (not ABLayout as in the "SS" traits)
+  using ELayout = GMMA::ELayout_64x32; // E layout for 64 x 32
+  using BLayout = GMMA::ABLayout<112, 32>; // B layout over the 112 x 32 (N x K) extents
+  using CLayout = GMMA::CLayout_64x112; // C layout for the 64 x 112 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the SM90 sparse GMMA 64x120x32 F16F16F16 "SS" op: value types, fragment types, shape, and layouts.
+  using ValTypeD = half_t; // value type of D
+  using ValTypeA = sparse_elem<2, half_t>; // A: half_t tagged with sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>; // E: uint8_t tagged with sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = half_t; // value type of B
+  using ValTypeC = half_t; // value type of C (same as D)
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>; // A fragment: GMMA::smem_desc parameterized by tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // B fragment: GMMA::smem_desc parameterized by tnspB
+
+  using Shape_MNK = Shape<_64,_120,_32>; // (M, N, K) = (64, 120, 32), as in the op name
+  using ThrID   = Layout<_128>; // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>; // A layout over the 64 x 32 (M x K) extents
+  using ELayout = GMMA::ELayout_64x32; // E layout for 64 x 32
+  using BLayout = GMMA::ABLayout<120, 32>; // B layout over the 120 x 32 (N x K) extents
+  using CLayout = GMMA::CLayout_64x120; // C layout for the 64 x 120 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the SM90 sparse GMMA 64x120x32 F16F16F16 "RS" op; unlike the matching "SS" traits, no FrgTypeA is declared.
+  using ValTypeD = half_t; // value type of D
+  using ValTypeA = sparse_elem<2, half_t>; // A: half_t tagged with sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>; // E: uint8_t tagged with sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = half_t; // value type of B
+  using ValTypeC = half_t; // value type of C (same as D)
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // B fragment: GMMA::smem_desc parameterized by tnspB
+
+  using Shape_MNK = Shape<_64,_120,_32>; // (M, N, K) = (64, 120, 32), as in the op name
+  using ThrID   = Layout<_128>; // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32; // A layout: GMMA::ALayout_64x32 (not ABLayout as in the "SS" traits)
+  using ELayout = GMMA::ELayout_64x32; // E layout for 64 x 32
+  using BLayout = GMMA::ABLayout<120, 32>; // B layout over the 120 x 32 (N x K) extents
+  using CLayout = GMMA::CLayout_64x120; // C layout for the 64 x 120 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the SM90 sparse GMMA 64x136x32 F16F16F16 "SS" op: value types, fragment types, shape, and layouts.
+  using ValTypeD = half_t; // value type of D
+  using ValTypeA = sparse_elem<2, half_t>; // A: half_t tagged with sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>; // E: uint8_t tagged with sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = half_t; // value type of B
+  using ValTypeC = half_t; // value type of C (same as D)
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>; // A fragment: GMMA::smem_desc parameterized by tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // B fragment: GMMA::smem_desc parameterized by tnspB
+
+  using Shape_MNK = Shape<_64,_136,_32>; // (M, N, K) = (64, 136, 32), as in the op name
+  using ThrID   = Layout<_128>; // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>; // A layout over the 64 x 32 (M x K) extents
+  using ELayout = GMMA::ELayout_64x32; // E layout for 64 x 32
+  using BLayout = GMMA::ABLayout<136, 32>; // B layout over the 136 x 32 (N x K) extents
+  using CLayout = GMMA::CLayout_64x136; // C layout for the 64 x 136 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the SM90 sparse GMMA 64x136x32 F16F16F16 "RS" op; unlike the matching "SS" traits, no FrgTypeA is declared.
+  using ValTypeD = half_t; // value type of D
+  using ValTypeA = sparse_elem<2, half_t>; // A: half_t tagged with sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>; // E: uint8_t tagged with sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = half_t; // value type of B
+  using ValTypeC = half_t; // value type of C (same as D)
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // B fragment: GMMA::smem_desc parameterized by tnspB
+
+  using Shape_MNK = Shape<_64,_136,_32>; // (M, N, K) = (64, 136, 32), as in the op name
+  using ThrID   = Layout<_128>; // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32; // A layout: GMMA::ALayout_64x32 (not ABLayout as in the "SS" traits)
+  using ELayout = GMMA::ELayout_64x32; // E layout for 64 x 32
+  using BLayout = GMMA::ABLayout<136, 32>; // B layout over the 136 x 32 (N x K) extents
+  using CLayout = GMMA::CLayout_64x136; // C layout for the 64 x 136 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the SM90 sparse GMMA 64x144x32 F16F16F16 "SS" op: value types, fragment types, shape, and layouts.
+  using ValTypeD = half_t; // value type of D
+  using ValTypeA = sparse_elem<2, half_t>; // A: half_t tagged with sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>; // E: uint8_t tagged with sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = half_t; // value type of B
+  using ValTypeC = half_t; // value type of C (same as D)
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>; // A fragment: GMMA::smem_desc parameterized by tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // B fragment: GMMA::smem_desc parameterized by tnspB
+
+  using Shape_MNK = Shape<_64,_144,_32>; // (M, N, K) = (64, 144, 32), as in the op name
+  using ThrID   = Layout<_128>; // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>; // A layout over the 64 x 32 (M x K) extents
+  using ELayout = GMMA::ELayout_64x32; // E layout for 64 x 32
+  using BLayout = GMMA::ABLayout<144, 32>; // B layout over the 144 x 32 (N x K) extents
+  using CLayout = GMMA::CLayout_64x144; // C layout for the 64 x 144 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the SM90 sparse GMMA 64x144x32 F16F16F16 "RS" op; unlike the matching "SS" traits, no FrgTypeA is declared.
+  using ValTypeD = half_t; // value type of D
+  using ValTypeA = sparse_elem<2, half_t>; // A: half_t tagged with sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>; // E: uint8_t tagged with sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = half_t; // value type of B
+  using ValTypeC = half_t; // value type of C (same as D)
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // B fragment: GMMA::smem_desc parameterized by tnspB
+
+  using Shape_MNK = Shape<_64,_144,_32>; // (M, N, K) = (64, 144, 32), as in the op name
+  using ThrID   = Layout<_128>; // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32; // A layout: GMMA::ALayout_64x32 (not ABLayout as in the "SS" traits)
+  using ELayout = GMMA::ELayout_64x32; // E layout for 64 x 32
+  using BLayout = GMMA::ABLayout<144, 32>; // B layout over the 144 x 32 (N x K) extents
+  using CLayout = GMMA::CLayout_64x144; // C layout for the 64 x 144 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the SM90 sparse GMMA 64x152x32 F16F16F16 "SS" op: value types, fragment types, shape, and layouts.
+  using ValTypeD = half_t; // value type of D
+  using ValTypeA = sparse_elem<2, half_t>; // A: half_t tagged with sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>; // E: uint8_t tagged with sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = half_t; // value type of B
+  using ValTypeC = half_t; // value type of C (same as D)
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>; // A fragment: GMMA::smem_desc parameterized by tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // B fragment: GMMA::smem_desc parameterized by tnspB
+
+  using Shape_MNK = Shape<_64,_152,_32>; // (M, N, K) = (64, 152, 32), as in the op name
+  using ThrID   = Layout<_128>; // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>; // A layout over the 64 x 32 (M x K) extents
+  using ELayout = GMMA::ELayout_64x32; // E layout for 64 x 32
+  using BLayout = GMMA::ABLayout<152, 32>; // B layout over the 152 x 32 (N x K) extents
+  using CLayout = GMMA::CLayout_64x152; // C layout for the 64 x 152 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the SM90 sparse GMMA 64x152x32 F16F16F16 "RS" op; unlike the matching "SS" traits, no FrgTypeA is declared.
+  using ValTypeD = half_t; // value type of D
+  using ValTypeA = sparse_elem<2, half_t>; // A: half_t tagged with sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>; // E: uint8_t tagged with sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = half_t; // value type of B
+  using ValTypeC = half_t; // value type of C (same as D)
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // B fragment: GMMA::smem_desc parameterized by tnspB
+
+  using Shape_MNK = Shape<_64,_152,_32>; // (M, N, K) = (64, 152, 32), as in the op name
+  using ThrID   = Layout<_128>; // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32; // A layout: GMMA::ALayout_64x32 (not ABLayout as in the "SS" traits)
+  using ELayout = GMMA::ELayout_64x32; // E layout for 64 x 32
+  using BLayout = GMMA::ABLayout<152, 32>; // B layout over the 152 x 32 (N x K) extents
+  using CLayout = GMMA::CLayout_64x152; // C layout for the 64 x 152 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the SM90 sparse GMMA 64x160x32 F16F16F16 "SS" op: value types, fragment types, shape, and layouts.
+  using ValTypeD = half_t; // value type of D
+  using ValTypeA = sparse_elem<2, half_t>; // A: half_t tagged with sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>; // E: uint8_t tagged with sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = half_t; // value type of B
+  using ValTypeC = half_t; // value type of C (same as D)
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>; // A fragment: GMMA::smem_desc parameterized by tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // B fragment: GMMA::smem_desc parameterized by tnspB
+
+  using Shape_MNK = Shape<_64,_160,_32>; // (M, N, K) = (64, 160, 32), as in the op name
+  using ThrID   = Layout<_128>; // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>; // A layout over the 64 x 32 (M x K) extents
+  using ELayout = GMMA::ELayout_64x32; // E layout for 64 x 32
+  using BLayout = GMMA::ABLayout<160, 32>; // B layout over the 160 x 32 (N x K) extents
+  using CLayout = GMMA::CLayout_64x160; // C layout for the 64 x 160 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the SM90 sparse GMMA 64x160x32 F16F16F16 "RS" op; unlike the matching "SS" traits, no FrgTypeA is declared.
+  using ValTypeD = half_t; // value type of D
+  using ValTypeA = sparse_elem<2, half_t>; // A: half_t tagged with sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>; // E: uint8_t tagged with sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = half_t; // value type of B
+  using ValTypeC = half_t; // value type of C (same as D)
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // B fragment: GMMA::smem_desc parameterized by tnspB
+
+  using Shape_MNK = Shape<_64,_160,_32>; // (M, N, K) = (64, 160, 32), as in the op name
+  using ThrID   = Layout<_128>; // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32; // A layout: GMMA::ALayout_64x32 (not ABLayout as in the "SS" traits)
+  using ELayout = GMMA::ELayout_64x32; // E layout for 64 x 32
+  using BLayout = GMMA::ABLayout<160, 32>; // B layout over the 160 x 32 (N x K) extents
+  using CLayout = GMMA::CLayout_64x160; // C layout for the 64 x 160 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the SM90 sparse GMMA 64x168x32 F16F16F16 "SS" op: value types, fragment types, shape, and layouts.
+  using ValTypeD = half_t; // value type of D
+  using ValTypeA = sparse_elem<2, half_t>; // A: half_t tagged with sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>; // E: uint8_t tagged with sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = half_t; // value type of B
+  using ValTypeC = half_t; // value type of C (same as D)
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>; // A fragment: GMMA::smem_desc parameterized by tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // B fragment: GMMA::smem_desc parameterized by tnspB
+
+  using Shape_MNK = Shape<_64,_168,_32>; // (M, N, K) = (64, 168, 32), as in the op name
+  using ThrID   = Layout<_128>; // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>; // A layout over the 64 x 32 (M x K) extents
+  using ELayout = GMMA::ELayout_64x32; // E layout for 64 x 32
+  using BLayout = GMMA::ABLayout<168, 32>; // B layout over the 168 x 32 (N x K) extents
+  using CLayout = GMMA::CLayout_64x168; // C layout for the 64 x 168 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the SM90 sparse GMMA 64x168x32 F16F16F16 "RS" op; unlike the matching "SS" traits, no FrgTypeA is declared.
+  using ValTypeD = half_t; // value type of D
+  using ValTypeA = sparse_elem<2, half_t>; // A: half_t tagged with sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>; // E: uint8_t tagged with sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = half_t; // value type of B
+  using ValTypeC = half_t; // value type of C (same as D)
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // B fragment: GMMA::smem_desc parameterized by tnspB
+
+  using Shape_MNK = Shape<_64,_168,_32>; // (M, N, K) = (64, 168, 32), as in the op name
+  using ThrID   = Layout<_128>; // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32; // A layout: GMMA::ALayout_64x32 (not ABLayout as in the "SS" traits)
+  using ELayout = GMMA::ELayout_64x32; // E layout for 64 x 32
+  using BLayout = GMMA::ABLayout<168, 32>; // B layout over the 168 x 32 (N x K) extents
+  using CLayout = GMMA::CLayout_64x168; // C layout for the 64 x 168 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the SM90 sparse GMMA 64x176x32 F16F16F16 "SS" op: value types, fragment types, shape, and layouts.
+  using ValTypeD = half_t; // value type of D
+  using ValTypeA = sparse_elem<2, half_t>; // A: half_t tagged with sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>; // E: uint8_t tagged with sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = half_t; // value type of B
+  using ValTypeC = half_t; // value type of C (same as D)
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>; // A fragment: GMMA::smem_desc parameterized by tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // B fragment: GMMA::smem_desc parameterized by tnspB
+
+  using Shape_MNK = Shape<_64,_176,_32>; // (M, N, K) = (64, 176, 32), as in the op name
+  using ThrID   = Layout<_128>; // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>; // A layout over the 64 x 32 (M x K) extents
+  using ELayout = GMMA::ELayout_64x32; // E layout for 64 x 32
+  using BLayout = GMMA::ABLayout<176, 32>; // B layout over the 176 x 32 (N x K) extents
+  using CLayout = GMMA::CLayout_64x176; // C layout for the 64 x 176 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the SM90 sparse GMMA 64x176x32 F16F16F16 "RS" op; unlike the matching "SS" traits, no FrgTypeA is declared.
+  using ValTypeD = half_t; // value type of D
+  using ValTypeA = sparse_elem<2, half_t>; // A: half_t tagged with sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>; // E: uint8_t tagged with sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = half_t; // value type of B
+  using ValTypeC = half_t; // value type of C (same as D)
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // B fragment: GMMA::smem_desc parameterized by tnspB
+
+  using Shape_MNK = Shape<_64,_176,_32>; // (M, N, K) = (64, 176, 32), as in the op name
+  using ThrID   = Layout<_128>; // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32; // A layout: GMMA::ALayout_64x32 (not ABLayout as in the "SS" traits)
+  using ELayout = GMMA::ELayout_64x32; // E layout for 64 x 32
+  using BLayout = GMMA::ABLayout<176, 32>; // B layout over the 176 x 32 (N x K) extents
+  using CLayout = GMMA::CLayout_64x176; // C layout for the 64 x 176 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the SM90 sparse GMMA 64x184x32 F16F16F16 "SS" op: value types, fragment types, shape, and layouts.
+  using ValTypeD = half_t; // value type of D
+  using ValTypeA = sparse_elem<2, half_t>; // A: half_t tagged with sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>; // E: uint8_t tagged with sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = half_t; // value type of B
+  using ValTypeC = half_t; // value type of C (same as D)
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>; // A fragment: GMMA::smem_desc parameterized by tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // B fragment: GMMA::smem_desc parameterized by tnspB
+
+  using Shape_MNK = Shape<_64,_184,_32>; // (M, N, K) = (64, 184, 32), as in the op name
+  using ThrID   = Layout<_128>; // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>; // A layout over the 64 x 32 (M x K) extents
+  using ELayout = GMMA::ELayout_64x32; // E layout for 64 x 32
+  using BLayout = GMMA::ABLayout<184, 32>; // B layout over the 184 x 32 (N x K) extents
+  using CLayout = GMMA::CLayout_64x184; // C layout for the 64 x 184 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the SM90 sparse GMMA 64x184x32 F16F16F16 "RS" op; unlike the matching "SS" traits, no FrgTypeA is declared.
+  using ValTypeD = half_t; // value type of D
+  using ValTypeA = sparse_elem<2, half_t>; // A: half_t tagged with sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>; // E: uint8_t tagged with sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = half_t; // value type of B
+  using ValTypeC = half_t; // value type of C (same as D)
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // B fragment: GMMA::smem_desc parameterized by tnspB
+
+  using Shape_MNK = Shape<_64,_184,_32>; // (M, N, K) = (64, 184, 32), as in the op name
+  using ThrID   = Layout<_128>; // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32; // A layout: GMMA::ALayout_64x32 (not ABLayout as in the "SS" traits)
+  using ELayout = GMMA::ELayout_64x32; // E layout for 64 x 32
+  using BLayout = GMMA::ABLayout<184, 32>; // B layout over the 184 x 32 (N x K) extents
+  using CLayout = GMMA::CLayout_64x184; // C layout for the 64 x 184 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the SM90 sparse GMMA 64x200x32 F16F16F16 "SS" op: value types, fragment types, shape, and layouts.
+  using ValTypeD = half_t; // value type of D
+  using ValTypeA = sparse_elem<2, half_t>; // A: half_t tagged with sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>; // E: uint8_t tagged with sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = half_t; // value type of B
+  using ValTypeC = half_t; // value type of C (same as D)
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>; // A fragment: GMMA::smem_desc parameterized by tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // B fragment: GMMA::smem_desc parameterized by tnspB
+
+  using Shape_MNK = Shape<_64,_200,_32>; // (M, N, K) = (64, 200, 32), as in the op name
+  using ThrID   = Layout<_128>; // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>; // A layout over the 64 x 32 (M x K) extents
+  using ELayout = GMMA::ELayout_64x32; // E layout for 64 x 32
+  using BLayout = GMMA::ABLayout<200, 32>; // B layout over the 200 x 32 (N x K) extents
+  using CLayout = GMMA::CLayout_64x200; // C layout for the 64 x 200 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the SM90 sparse GMMA 64x200x32 F16F16F16 "RS" op; unlike the matching "SS" traits, no FrgTypeA is declared.
+  using ValTypeD = half_t; // value type of D
+  using ValTypeA = sparse_elem<2, half_t>; // A: half_t tagged with sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>; // E: uint8_t tagged with sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = half_t; // value type of B
+  using ValTypeC = half_t; // value type of C (same as D)
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // B fragment: GMMA::smem_desc parameterized by tnspB
+
+  using Shape_MNK = Shape<_64,_200,_32>; // (M, N, K) = (64, 200, 32), as in the op name
+  using ThrID   = Layout<_128>; // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32; // A layout: GMMA::ALayout_64x32 (not ABLayout as in the "SS" traits)
+  using ELayout = GMMA::ELayout_64x32; // E layout for 64 x 32
+  using BLayout = GMMA::ABLayout<200, 32>; // B layout over the 200 x 32 (N x K) extents
+  using CLayout = GMMA::CLayout_64x200; // C layout for the 64 x 200 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the SM90 sparse GMMA 64x208x32 F16F16F16 "SS" op: value types, fragment types, shape, and layouts.
+  using ValTypeD = half_t; // value type of D
+  using ValTypeA = sparse_elem<2, half_t>; // A: half_t tagged with sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>; // E: uint8_t tagged with sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = half_t; // value type of B
+  using ValTypeC = half_t; // value type of C (same as D)
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>; // A fragment: GMMA::smem_desc parameterized by tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // B fragment: GMMA::smem_desc parameterized by tnspB
+
+  using Shape_MNK = Shape<_64,_208,_32>; // (M, N, K) = (64, 208, 32), as in the op name
+  using ThrID   = Layout<_128>; // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>; // A layout over the 64 x 32 (M x K) extents
+  using ELayout = GMMA::ELayout_64x32; // E layout for 64 x 32
+  using BLayout = GMMA::ABLayout<208, 32>; // B layout over the 208 x 32 (N x K) extents
+  using CLayout = GMMA::CLayout_64x208; // C layout for the 64 x 208 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the SM90 sparse GMMA 64x208x32 F16F16F16 "RS" op; unlike the matching "SS" traits, no FrgTypeA is declared.
+  using ValTypeD = half_t; // value type of D
+  using ValTypeA = sparse_elem<2, half_t>; // A: half_t tagged with sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>; // E: uint8_t tagged with sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = half_t; // value type of B
+  using ValTypeC = half_t; // value type of C (same as D)
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // B fragment: GMMA::smem_desc parameterized by tnspB
+
+  using Shape_MNK = Shape<_64,_208,_32>; // (M, N, K) = (64, 208, 32), as in the op name
+  using ThrID   = Layout<_128>; // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32; // A layout: GMMA::ALayout_64x32 (not ABLayout as in the "SS" traits)
+  using ELayout = GMMA::ELayout_64x32; // E layout for 64 x 32
+  using BLayout = GMMA::ABLayout<208, 32>; // B layout over the 208 x 32 (N x K) extents
+  using CLayout = GMMA::CLayout_64x208; // C layout for the 64 x 208 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the SM90 sparse GMMA 64x216x32 F16F16F16 "SS" op: value types, fragment types, shape, and layouts.
+  using ValTypeD = half_t; // value type of D
+  using ValTypeA = sparse_elem<2, half_t>; // A: half_t tagged with sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>; // E: uint8_t tagged with sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = half_t; // value type of B
+  using ValTypeC = half_t; // value type of C (same as D)
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>; // A fragment: GMMA::smem_desc parameterized by tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // B fragment: GMMA::smem_desc parameterized by tnspB
+
+  using Shape_MNK = Shape<_64,_216,_32>; // (M, N, K) = (64, 216, 32), as in the op name
+  using ThrID   = Layout<_128>; // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>; // A layout over the 64 x 32 (M x K) extents
+  using ELayout = GMMA::ELayout_64x32; // E layout for 64 x 32
+  using BLayout = GMMA::ABLayout<216, 32>; // B layout over the 216 x 32 (N x K) extents
+  using CLayout = GMMA::CLayout_64x216; // C layout for the 64 x 216 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the SM90 sparse GMMA 64x216x32 F16F16F16 "RS" op; unlike the matching "SS" traits, no FrgTypeA is declared.
+  using ValTypeD = half_t; // value type of D
+  using ValTypeA = sparse_elem<2, half_t>; // A: half_t tagged with sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>; // E: uint8_t tagged with sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = half_t; // value type of B
+  using ValTypeC = half_t; // value type of C (same as D)
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // B fragment: GMMA::smem_desc parameterized by tnspB
+
+  using Shape_MNK = Shape<_64,_216,_32>; // (M, N, K) = (64, 216, 32), as in the op name
+  using ThrID   = Layout<_128>; // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32; // A layout: GMMA::ALayout_64x32 (not ABLayout as in the "SS" traits)
+  using ELayout = GMMA::ELayout_64x32; // E layout for 64 x 32
+  using BLayout = GMMA::ABLayout<216, 32>; // B layout over the 216 x 32 (N x K) extents
+  using CLayout = GMMA::CLayout_64x216; // C layout for the 64 x 216 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the SM90 sparse GMMA 64x224x32 F16F16F16 "SS" op: value types, fragment types, shape, and layouts.
+  using ValTypeD = half_t; // value type of D
+  using ValTypeA = sparse_elem<2, half_t>; // A: half_t tagged with sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>; // E: uint8_t tagged with sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = half_t; // value type of B
+  using ValTypeC = half_t; // value type of C (same as D)
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>; // A fragment: GMMA::smem_desc parameterized by tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // B fragment: GMMA::smem_desc parameterized by tnspB
+
+  using Shape_MNK = Shape<_64,_224,_32>; // (M, N, K) = (64, 224, 32), as in the op name
+  using ThrID   = Layout<_128>; // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>; // A layout over the 64 x 32 (M x K) extents
+  using ELayout = GMMA::ELayout_64x32; // E layout for 64 x 32
+  using BLayout = GMMA::ABLayout<224, 32>; // B layout over the 224 x 32 (N x K) extents
+  using CLayout = GMMA::CLayout_64x224; // C layout for the 64 x 224 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the SM90 sparse GMMA 64x224x32 F16F16F16 "RS" op; unlike the matching "SS" traits, no FrgTypeA is declared.
+  using ValTypeD = half_t; // value type of D
+  using ValTypeA = sparse_elem<2, half_t>; // A: half_t tagged with sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>; // E: uint8_t tagged with sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = half_t; // value type of B
+  using ValTypeC = half_t; // value type of C (same as D)
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // B fragment: GMMA::smem_desc parameterized by tnspB
+
+  using Shape_MNK = Shape<_64,_224,_32>; // (M, N, K) = (64, 224, 32), as in the op name
+  using ThrID   = Layout<_128>; // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32; // A layout: GMMA::ALayout_64x32 (not ABLayout as in the "SS" traits)
+  using ELayout = GMMA::ELayout_64x32; // E layout for 64 x 32
+  using BLayout = GMMA::ABLayout<224, 32>; // B layout over the 224 x 32 (N x K) extents
+  using CLayout = GMMA::CLayout_64x224; // C layout for the 64 x 224 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the SM90 sparse GMMA 64x232x32 F16F16F16 "SS" op: value types, fragment types, shape, and layouts.
+  using ValTypeD = half_t; // value type of D
+  using ValTypeA = sparse_elem<2, half_t>; // A: half_t tagged with sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>; // E: uint8_t tagged with sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = half_t; // value type of B
+  using ValTypeC = half_t; // value type of C (same as D)
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>; // A fragment: GMMA::smem_desc parameterized by tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // B fragment: GMMA::smem_desc parameterized by tnspB
+
+  using Shape_MNK = Shape<_64,_232,_32>; // (M, N, K) = (64, 232, 32), as in the op name
+  using ThrID   = Layout<_128>; // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>; // A layout over the 64 x 32 (M x K) extents
+  using ELayout = GMMA::ELayout_64x32; // E layout for 64 x 32
+  using BLayout = GMMA::ABLayout<232, 32>; // B layout over the 232 x 32 (N x K) extents
+  using CLayout = GMMA::CLayout_64x232; // C layout for the 64 x 232 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the SM90 sparse GMMA 64x232x32 F16F16F16 "RS" op; unlike the matching "SS" traits, no FrgTypeA is declared.
+  using ValTypeD = half_t; // value type of D
+  using ValTypeA = sparse_elem<2, half_t>; // A: half_t tagged with sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>; // E: uint8_t tagged with sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = half_t; // value type of B
+  using ValTypeC = half_t; // value type of C (same as D)
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // B fragment: GMMA::smem_desc parameterized by tnspB
+
+  using Shape_MNK = Shape<_64,_232,_32>; // (M, N, K) = (64, 232, 32), as in the op name
+  using ThrID   = Layout<_128>; // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32; // A layout: GMMA::ALayout_64x32 (not ABLayout as in the "SS" traits)
+  using ELayout = GMMA::ELayout_64x32; // E layout for 64 x 32
+  using BLayout = GMMA::ABLayout<232, 32>; // B layout over the 232 x 32 (N x K) extents
+  using CLayout = GMMA::CLayout_64x232; // C layout for the 64 x 232 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the SM90 sparse GMMA 64x240x32 F16F16F16 "SS" op: value types, fragment types, shape, and layouts.
+  using ValTypeD = half_t; // value type of D
+  using ValTypeA = sparse_elem<2, half_t>; // A: half_t tagged with sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>; // E: uint8_t tagged with sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = half_t; // value type of B
+  using ValTypeC = half_t; // value type of C (same as D)
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>; // A fragment: GMMA::smem_desc parameterized by tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // B fragment: GMMA::smem_desc parameterized by tnspB
+
+  using Shape_MNK = Shape<_64,_240,_32>; // (M, N, K) = (64, 240, 32), as in the op name
+  using ThrID   = Layout<_128>; // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>; // A layout over the 64 x 32 (M x K) extents
+  using ELayout = GMMA::ELayout_64x32; // E layout for 64 x 32
+  using BLayout = GMMA::ABLayout<240, 32>; // B layout over the 240 x 32 (N x K) extents
+  using CLayout = GMMA::CLayout_64x240; // C layout for the 64 x 240 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the SM90 sparse GMMA 64x240x32 F16F16F16 "RS" op; unlike the matching "SS" traits, no FrgTypeA is declared.
+  using ValTypeD = half_t; // value type of D
+  using ValTypeA = sparse_elem<2, half_t>; // A: half_t tagged with sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>; // E: uint8_t tagged with sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = half_t; // value type of B
+  using ValTypeC = half_t; // value type of C (same as D)
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // B fragment: GMMA::smem_desc parameterized by tnspB
+
+  using Shape_MNK = Shape<_64,_240,_32>; // (M, N, K) = (64, 240, 32), as in the op name
+  using ThrID   = Layout<_128>; // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32; // A layout: GMMA::ALayout_64x32 (not ABLayout as in the "SS" traits)
+  using ELayout = GMMA::ELayout_64x32; // E layout for 64 x 32
+  using BLayout = GMMA::ABLayout<240, 32>; // B layout over the 240 x 32 (N x K) extents
+  using CLayout = GMMA::CLayout_64x240; // C layout for the 64 x 240 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the SM90 sparse GMMA 64x248x32 F16F16F16 "SS" op: value types, fragment types, shape, and layouts.
+  using ValTypeD = half_t; // value type of D
+  using ValTypeA = sparse_elem<2, half_t>; // A: half_t tagged with sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>; // E: uint8_t tagged with sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = half_t; // value type of B
+  using ValTypeC = half_t; // value type of C (same as D)
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>; // A fragment: GMMA::smem_desc parameterized by tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // B fragment: GMMA::smem_desc parameterized by tnspB
+
+  using Shape_MNK = Shape<_64,_248,_32>; // (M, N, K) = (64, 248, 32), as in the op name
+  using ThrID   = Layout<_128>; // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>; // A layout over the 64 x 32 (M x K) extents
+  using ELayout = GMMA::ELayout_64x32; // E layout for 64 x 32
+  using BLayout = GMMA::ABLayout<248, 32>; // B layout over the 248 x 32 (N x K) extents
+  using CLayout = GMMA::CLayout_64x248; // C layout for the 64 x 248 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the SM90 sparse GMMA 64x248x32 F16F16F16 "RS" op; unlike the matching "SS" traits, no FrgTypeA is declared.
+  using ValTypeD = half_t; // value type of D
+  using ValTypeA = sparse_elem<2, half_t>; // A: half_t tagged with sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>; // E: uint8_t tagged with sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = half_t; // value type of B
+  using ValTypeC = half_t; // value type of C (same as D)
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // B fragment: GMMA::smem_desc parameterized by tnspB
+
+  using Shape_MNK = Shape<_64,_248,_32>; // (M, N, K) = (64, 248, 32), as in the op name
+  using ThrID   = Layout<_128>; // thread-id layout of size 128
+  using ALayout = GMMA::ALayout_64x32; // A layout: GMMA::ALayout_64x32 (not ABLayout as in the "SS" traits)
+  using ELayout = GMMA::ELayout_64x32; // E layout for 64 x 32
+  using BLayout = GMMA::ABLayout<248, 32>; // B layout over the 248 x 32 (N x K) extents
+  using CLayout = GMMA::CLayout_64x248; // C layout for the 64 x 248 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the SM90 sparse GMMA 64x24x32 F32F16F16 "SS" op: value types, fragment types, shape, and layouts.
+  using ValTypeD = float; // value type of D
+  using ValTypeA = sparse_elem<2, half_t>; // A: half_t tagged with sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>; // E: uint8_t tagged with sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = half_t; // value type of B
+  using ValTypeC = float; // value type of C (same as D)
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>; // A fragment: GMMA::smem_desc parameterized by tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>; // B fragment: GMMA::smem_desc parameterized by tnspB
+
+  using Shape_MNK = Shape<_64,_24,_32>; // (M, N, K) = (64, 24, 32), as in the op name
+  using ThrID   = Layout<_128>; // thread-id layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 32>; // A layout over the 64 x 32 (M x K) extents
+  using ELayout = GMMA::ELayout_64x32; // E layout for 64 x 32
+  using BLayout = GMMA::ABLayout< 24, 32>; // B layout over the 24 x 32 (N x K) extents
+  using CLayout = GMMA::CLayout_64x24; // C layout for the 64 x 24 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One; // non-static member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x32_F32F16F16_RS<majorA, majorB, scaleInA, scaleInB, sparseSel>>
+{ // Traits for the SPARSE GMMA_64x24x32_F32F16F16_RS atom.
+  // Operand element types (A and E are wrapped in sparse_elem; C and D are float).
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Only B has a GMMA::smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+  // MNK shape, thread-ID layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_24,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout< 24, 32>;
+  using CLayout   = GMMA::CLayout_64x24;
+  using ELayout   = GMMA::ELayout_64x32;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x32_F32F16F16_SS<majorA, majorB, scaleInA, scaleInB, sparseSel>>
+{ // Traits for the SPARSE GMMA_64x40x32_F32F16F16_SS atom.
+  // Operand element types (A and E are wrapped in sparse_elem; C and D are float).
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // A and B fragments are both GMMA::smem_desc, keyed on majorA / majorB.
+  using FrgTypeA = GMMA::smem_desc<majorA>;
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+  // MNK shape, thread-ID layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_40,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 32>;
+  using BLayout   = GMMA::ABLayout< 40, 32>;
+  using CLayout   = GMMA::CLayout_64x40;
+  using ELayout   = GMMA::ELayout_64x32;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x32_F32F16F16_RS<majorA, majorB, scaleInA, scaleInB, sparseSel>>
+{ // Traits for the SPARSE GMMA_64x40x32_F32F16F16_RS atom.
+  // Operand element types (A and E are wrapped in sparse_elem; C and D are float).
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Only B has a GMMA::smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+  // MNK shape, thread-ID layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_40,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout< 40, 32>;
+  using CLayout   = GMMA::CLayout_64x40;
+  using ELayout   = GMMA::ELayout_64x32;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x32_F32F16F16_SS<majorA, majorB, scaleInA, scaleInB, sparseSel>>
+{ // Traits for the SPARSE GMMA_64x48x32_F32F16F16_SS atom.
+  // Operand element types (A and E are wrapped in sparse_elem; C and D are float).
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // A and B fragments are both GMMA::smem_desc, keyed on majorA / majorB.
+  using FrgTypeA = GMMA::smem_desc<majorA>;
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+  // MNK shape, thread-ID layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_48,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 32>;
+  using BLayout   = GMMA::ABLayout< 48, 32>;
+  using CLayout   = GMMA::CLayout_64x48;
+  using ELayout   = GMMA::ELayout_64x32;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x32_F32F16F16_RS<majorA, majorB, scaleInA, scaleInB, sparseSel>>
+{ // Traits for the SPARSE GMMA_64x48x32_F32F16F16_RS atom.
+  // Operand element types (A and E are wrapped in sparse_elem; C and D are float).
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Only B has a GMMA::smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+  // MNK shape, thread-ID layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_48,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout< 48, 32>;
+  using CLayout   = GMMA::CLayout_64x48;
+  using ELayout   = GMMA::ELayout_64x32;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x32_F32F16F16_SS<majorA, majorB, scaleInA, scaleInB, sparseSel>>
+{ // Traits for the SPARSE GMMA_64x56x32_F32F16F16_SS atom.
+  // Operand element types (A and E are wrapped in sparse_elem; C and D are float).
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // A and B fragments are both GMMA::smem_desc, keyed on majorA / majorB.
+  using FrgTypeA = GMMA::smem_desc<majorA>;
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+  // MNK shape, thread-ID layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_56,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 32>;
+  using BLayout   = GMMA::ABLayout< 56, 32>;
+  using CLayout   = GMMA::CLayout_64x56;
+  using ELayout   = GMMA::ELayout_64x32;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x32_F32F16F16_RS<majorA, majorB, scaleInA, scaleInB, sparseSel>>
+{ // Traits for the SPARSE GMMA_64x56x32_F32F16F16_RS atom.
+  // Operand element types (A and E are wrapped in sparse_elem; C and D are float).
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Only B has a GMMA::smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+  // MNK shape, thread-ID layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_56,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout< 56, 32>;
+  using CLayout   = GMMA::CLayout_64x56;
+  using ELayout   = GMMA::ELayout_64x32;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x32_F32F16F16_SS<majorA, majorB, scaleInA, scaleInB, sparseSel>>
+{ // Traits for the SPARSE GMMA_64x72x32_F32F16F16_SS atom.
+  // Operand element types (A and E are wrapped in sparse_elem; C and D are float).
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // A and B fragments are both GMMA::smem_desc, keyed on majorA / majorB.
+  using FrgTypeA = GMMA::smem_desc<majorA>;
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+  // MNK shape, thread-ID layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_72,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 32>;
+  using BLayout   = GMMA::ABLayout< 72, 32>;
+  using CLayout   = GMMA::CLayout_64x72;
+  using ELayout   = GMMA::ELayout_64x32;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x32_F32F16F16_RS<majorA, majorB, scaleInA, scaleInB, sparseSel>>
+{ // Traits for the SPARSE GMMA_64x72x32_F32F16F16_RS atom.
+  // Operand element types (A and E are wrapped in sparse_elem; C and D are float).
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Only B has a GMMA::smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+  // MNK shape, thread-ID layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_72,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout< 72, 32>;
+  using CLayout   = GMMA::CLayout_64x72;
+  using ELayout   = GMMA::ELayout_64x32;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x32_F32F16F16_SS<majorA, majorB, scaleInA, scaleInB, sparseSel>>
+{ // Traits for the SPARSE GMMA_64x80x32_F32F16F16_SS atom.
+  // Operand element types (A and E are wrapped in sparse_elem; C and D are float).
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // A and B fragments are both GMMA::smem_desc, keyed on majorA / majorB.
+  using FrgTypeA = GMMA::smem_desc<majorA>;
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+  // MNK shape, thread-ID layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_80,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 32>;
+  using BLayout   = GMMA::ABLayout< 80, 32>;
+  using CLayout   = GMMA::CLayout_64x80;
+  using ELayout   = GMMA::ELayout_64x32;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x32_F32F16F16_RS<majorA, majorB, scaleInA, scaleInB, sparseSel>>
+{ // Traits for the SPARSE GMMA_64x80x32_F32F16F16_RS atom.
+  // Operand element types (A and E are wrapped in sparse_elem; C and D are float).
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Only B has a GMMA::smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+  // MNK shape, thread-ID layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_80,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout< 80, 32>;
+  using CLayout   = GMMA::CLayout_64x80;
+  using ELayout   = GMMA::ELayout_64x32;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x32_F32F16F16_SS<majorA, majorB, scaleInA, scaleInB, sparseSel>>
+{ // Traits for the SPARSE GMMA_64x88x32_F32F16F16_SS atom.
+  // Operand element types (A and E are wrapped in sparse_elem; C and D are float).
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // A and B fragments are both GMMA::smem_desc, keyed on majorA / majorB.
+  using FrgTypeA = GMMA::smem_desc<majorA>;
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+  // MNK shape, thread-ID layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_88,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 32>;
+  using BLayout   = GMMA::ABLayout< 88, 32>;
+  using CLayout   = GMMA::CLayout_64x88;
+  using ELayout   = GMMA::ELayout_64x32;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x32_F32F16F16_RS<majorA, majorB, scaleInA, scaleInB, sparseSel>>
+{ // Traits for the SPARSE GMMA_64x88x32_F32F16F16_RS atom.
+  // Operand element types (A and E are wrapped in sparse_elem; C and D are float).
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Only B has a GMMA::smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+  // MNK shape, thread-ID layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_88,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout< 88, 32>;
+  using CLayout   = GMMA::CLayout_64x88;
+  using ELayout   = GMMA::ELayout_64x32;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x32_F32F16F16_SS<majorA, majorB, scaleInA, scaleInB, sparseSel>>
+{ // Traits for the SPARSE GMMA_64x104x32_F32F16F16_SS atom.
+  // Operand element types (A and E are wrapped in sparse_elem; C and D are float).
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // A and B fragments are both GMMA::smem_desc, keyed on majorA / majorB.
+  using FrgTypeA = GMMA::smem_desc<majorA>;
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+  // MNK shape, thread-ID layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_104,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 32>;
+  using BLayout   = GMMA::ABLayout<104, 32>;
+  using CLayout   = GMMA::CLayout_64x104;
+  using ELayout   = GMMA::ELayout_64x32;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x32_F32F16F16_RS<majorA, majorB, scaleInA, scaleInB, sparseSel>>
+{ // Traits for the SPARSE GMMA_64x104x32_F32F16F16_RS atom.
+  // Operand element types (A and E are wrapped in sparse_elem; C and D are float).
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Only B has a GMMA::smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+  // MNK shape, thread-ID layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_104,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<104, 32>;
+  using CLayout   = GMMA::CLayout_64x104;
+  using ELayout   = GMMA::ELayout_64x32;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x32_F32F16F16_SS<majorA, majorB, scaleInA, scaleInB, sparseSel>>
+{ // Traits for the SPARSE GMMA_64x112x32_F32F16F16_SS atom.
+  // Operand element types (A and E are wrapped in sparse_elem; C and D are float).
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // A and B fragments are both GMMA::smem_desc, keyed on majorA / majorB.
+  using FrgTypeA = GMMA::smem_desc<majorA>;
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+  // MNK shape, thread-ID layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_112,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 32>;
+  using BLayout   = GMMA::ABLayout<112, 32>;
+  using CLayout   = GMMA::CLayout_64x112;
+  using ELayout   = GMMA::ELayout_64x32;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x32_F32F16F16_RS<majorA, majorB, scaleInA, scaleInB, sparseSel>>
+{ // Traits for the SPARSE GMMA_64x112x32_F32F16F16_RS atom.
+  // Operand element types (A and E are wrapped in sparse_elem; C and D are float).
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Only B has a GMMA::smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+  // MNK shape, thread-ID layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_112,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<112, 32>;
+  using CLayout   = GMMA::CLayout_64x112;
+  using ELayout   = GMMA::ELayout_64x32;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x32_F32F16F16_SS<majorA, majorB, scaleInA, scaleInB, sparseSel>>
+{ // Traits for the SPARSE GMMA_64x120x32_F32F16F16_SS atom.
+  // Operand element types (A and E are wrapped in sparse_elem; C and D are float).
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // A and B fragments are both GMMA::smem_desc, keyed on majorA / majorB.
+  using FrgTypeA = GMMA::smem_desc<majorA>;
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+  // MNK shape, thread-ID layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_120,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 32>;
+  using BLayout   = GMMA::ABLayout<120, 32>;
+  using CLayout   = GMMA::CLayout_64x120;
+  using ELayout   = GMMA::ELayout_64x32;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x32_F32F16F16_RS<majorA, majorB, scaleInA, scaleInB, sparseSel>>
+{ // Traits for the SPARSE GMMA_64x120x32_F32F16F16_RS atom.
+  // Operand element types (A and E are wrapped in sparse_elem; C and D are float).
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Only B has a GMMA::smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+  // MNK shape, thread-ID layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_120,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<120, 32>;
+  using CLayout   = GMMA::CLayout_64x120;
+  using ELayout   = GMMA::ELayout_64x32;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x32_F32F16F16_SS<majorA, majorB, scaleInA, scaleInB, sparseSel>>
+{ // Traits for the SPARSE GMMA_64x136x32_F32F16F16_SS atom.
+  // Operand element types (A and E are wrapped in sparse_elem; C and D are float).
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // A and B fragments are both GMMA::smem_desc, keyed on majorA / majorB.
+  using FrgTypeA = GMMA::smem_desc<majorA>;
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+  // MNK shape, thread-ID layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_136,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 32>;
+  using BLayout   = GMMA::ABLayout<136, 32>;
+  using CLayout   = GMMA::CLayout_64x136;
+  using ELayout   = GMMA::ELayout_64x32;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x32_F32F16F16_RS<majorA, majorB, scaleInA, scaleInB, sparseSel>>
+{ // Traits for the SPARSE GMMA_64x136x32_F32F16F16_RS atom.
+  // Operand element types (A and E are wrapped in sparse_elem; C and D are float).
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Only B has a GMMA::smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+  // MNK shape, thread-ID layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_136,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<136, 32>;
+  using CLayout   = GMMA::CLayout_64x136;
+  using ELayout   = GMMA::ELayout_64x32;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x32_F32F16F16_SS<majorA, majorB, scaleInA, scaleInB, sparseSel>>
+{ // Traits for the SPARSE GMMA_64x144x32_F32F16F16_SS atom.
+  // Operand element types (A and E are wrapped in sparse_elem; C and D are float).
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // A and B fragments are both GMMA::smem_desc, keyed on majorA / majorB.
+  using FrgTypeA = GMMA::smem_desc<majorA>;
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+  // MNK shape, thread-ID layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_144,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 32>;
+  using BLayout   = GMMA::ABLayout<144, 32>;
+  using CLayout   = GMMA::CLayout_64x144;
+  using ELayout   = GMMA::ELayout_64x32;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x32_F32F16F16_RS<majorA, majorB, scaleInA, scaleInB, sparseSel>>
+{ // Traits for the SPARSE GMMA_64x144x32_F32F16F16_RS atom.
+  // Operand element types (A and E are wrapped in sparse_elem; C and D are float).
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Only B has a GMMA::smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+  // MNK shape, thread-ID layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_144,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<144, 32>;
+  using CLayout   = GMMA::CLayout_64x144;
+  using ELayout   = GMMA::ELayout_64x32;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x32_F32F16F16_SS<majorA, majorB, scaleInA, scaleInB, sparseSel>>
+{ // Traits for the SPARSE GMMA_64x152x32_F32F16F16_SS atom.
+  // Operand element types (A and E are wrapped in sparse_elem; C and D are float).
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // A and B fragments are both GMMA::smem_desc, keyed on majorA / majorB.
+  using FrgTypeA = GMMA::smem_desc<majorA>;
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+  // MNK shape, thread-ID layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_152,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 32>;
+  using BLayout   = GMMA::ABLayout<152, 32>;
+  using CLayout   = GMMA::CLayout_64x152;
+  using ELayout   = GMMA::ELayout_64x32;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x32_F32F16F16_RS<majorA, majorB, scaleInA, scaleInB, sparseSel>>
+{ // Traits for the SPARSE GMMA_64x152x32_F32F16F16_RS atom.
+  // Operand element types (A and E are wrapped in sparse_elem; C and D are float).
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Only B has a GMMA::smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+  // MNK shape, thread-ID layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_152,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<152, 32>;
+  using CLayout   = GMMA::CLayout_64x152;
+  using ELayout   = GMMA::ELayout_64x32;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x32_F32F16F16_SS<majorA, majorB, scaleInA, scaleInB, sparseSel>>
+{ // Traits for the SPARSE GMMA_64x160x32_F32F16F16_SS atom.
+  // Operand element types (A and E are wrapped in sparse_elem; C and D are float).
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // A and B fragments are both GMMA::smem_desc, keyed on majorA / majorB.
+  using FrgTypeA = GMMA::smem_desc<majorA>;
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+  // MNK shape, thread-ID layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_160,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 32>;
+  using BLayout   = GMMA::ABLayout<160, 32>;
+  using CLayout   = GMMA::CLayout_64x160;
+  using ELayout   = GMMA::ELayout_64x32;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x32_F32F16F16_RS<majorA, majorB, scaleInA, scaleInB, sparseSel>>
+{ // Traits for the SPARSE GMMA_64x160x32_F32F16F16_RS atom.
+  // Operand element types (A and E are wrapped in sparse_elem; C and D are float).
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Only B has a GMMA::smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+  // MNK shape, thread-ID layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_160,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<160, 32>;
+  using CLayout   = GMMA::CLayout_64x160;
+  using ELayout   = GMMA::ELayout_64x32;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x32_F32F16F16_SS<majorA, majorB, scaleInA, scaleInB, sparseSel>>
+{ // Traits for the SPARSE GMMA_64x168x32_F32F16F16_SS atom.
+  // Operand element types (A and E are wrapped in sparse_elem; C and D are float).
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // A and B fragments are both GMMA::smem_desc, keyed on majorA / majorB.
+  using FrgTypeA = GMMA::smem_desc<majorA>;
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+  // MNK shape, thread-ID layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_168,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 32>;
+  using BLayout   = GMMA::ABLayout<168, 32>;
+  using CLayout   = GMMA::CLayout_64x168;
+  using ELayout   = GMMA::ELayout_64x32;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x32_F32F16F16_RS<majorA, majorB, scaleInA, scaleInB, sparseSel>>
+{ // Traits for the SPARSE GMMA_64x168x32_F32F16F16_RS atom.
+  // Operand element types (A and E are wrapped in sparse_elem; C and D are float).
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Only B has a GMMA::smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+  // MNK shape, thread-ID layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_168,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<168, 32>;
+  using CLayout   = GMMA::CLayout_64x168;
+  using ELayout   = GMMA::ELayout_64x32;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x32_F32F16F16_SS<majorA, majorB, scaleInA, scaleInB, sparseSel>>
+{ // Traits for the SPARSE GMMA_64x176x32_F32F16F16_SS atom.
+  // Operand element types (A and E are wrapped in sparse_elem; C and D are float).
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // A and B fragments are both GMMA::smem_desc, keyed on majorA / majorB.
+  using FrgTypeA = GMMA::smem_desc<majorA>;
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+  // MNK shape, thread-ID layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_176,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 32>;
+  using BLayout   = GMMA::ABLayout<176, 32>;
+  using CLayout   = GMMA::CLayout_64x176;
+  using ELayout   = GMMA::ELayout_64x32;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x32_F32F16F16_RS<majorA, majorB, scaleInA, scaleInB, sparseSel>>
+{ // Traits for the SPARSE GMMA_64x176x32_F32F16F16_RS atom.
+  // Operand element types (A and E are wrapped in sparse_elem; C and D are float).
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Only B has a GMMA::smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+  // MNK shape, thread-ID layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_176,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<176, 32>;
+  using CLayout   = GMMA::CLayout_64x176;
+  using ELayout   = GMMA::ELayout_64x32;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x32_F32F16F16_SS<majorA, majorB, scaleInA, scaleInB, sparseSel>>
+{ // Traits for the SPARSE GMMA_64x184x32_F32F16F16_SS atom.
+  // Operand element types (A and E are wrapped in sparse_elem; C and D are float).
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // A and B fragments are both GMMA::smem_desc, keyed on majorA / majorB.
+  using FrgTypeA = GMMA::smem_desc<majorA>;
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+  // MNK shape, thread-ID layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_184,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 32>;
+  using BLayout   = GMMA::ABLayout<184, 32>;
+  using CLayout   = GMMA::CLayout_64x184;
+  using ELayout   = GMMA::ELayout_64x32;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x32_F32F16F16_RS<majorA, majorB, scaleInA, scaleInB, sparseSel>>
+{ // Traits for the SPARSE GMMA_64x184x32_F32F16F16_RS atom.
+  // Operand element types (A and E are wrapped in sparse_elem; C and D are float).
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Only B has a GMMA::smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+  // MNK shape, thread-ID layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_184,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<184, 32>;
+  using CLayout   = GMMA::CLayout_64x184;
+  using ELayout   = GMMA::ELayout_64x32;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x32_F32F16F16_SS<majorA, majorB, scaleInA, scaleInB, sparseSel>>
+{ // Traits for the SPARSE GMMA_64x200x32_F32F16F16_SS atom.
+  // Operand element types (A and E are wrapped in sparse_elem; C and D are float).
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // A and B fragments are both GMMA::smem_desc, keyed on majorA / majorB.
+  using FrgTypeA = GMMA::smem_desc<majorA>;
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+  // MNK shape, thread-ID layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_200,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 32>;
+  using BLayout   = GMMA::ABLayout<200, 32>;
+  using CLayout   = GMMA::CLayout_64x200;
+  using ELayout   = GMMA::ELayout_64x32;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x32_F32F16F16_RS<majorA, majorB, scaleInA, scaleInB, sparseSel>>
+{ // Traits for the SPARSE GMMA_64x200x32_F32F16F16_RS atom.
+  // Operand element types (A and E are wrapped in sparse_elem; C and D are float).
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Only B has a GMMA::smem_desc fragment type; no FrgTypeA is declared here.
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+  // MNK shape, thread-ID layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_200,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x32;
+  using BLayout   = GMMA::ABLayout<200, 32>;
+  using CLayout   = GMMA::CLayout_64x200;
+  using ELayout   = GMMA::ELayout_64x32;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major majorA, GMMA::Major majorB, GMMA::ScaleIn scaleInA, GMMA::ScaleIn scaleInB, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x32_F32F16F16_SS<majorA, majorB, scaleInA, scaleInB, sparseSel>>
+{ // Traits for the SPARSE GMMA_64x208x32_F32F16F16_SS atom.
+  // Operand element types (A and E are wrapped in sparse_elem; C and D are float).
+  using ValTypeA = sparse_elem<2, half_t>;
+  using ValTypeB = half_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // A and B fragments are both GMMA::smem_desc, keyed on majorA / majorB.
+  using FrgTypeA = GMMA::smem_desc<majorA>;
+  using FrgTypeB = GMMA::smem_desc<majorB>;
+  // MNK shape, thread-ID layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_208,_32>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout< 64, 32>;
+  using BLayout   = GMMA::ABLayout<208, 32>;
+  using CLayout   = GMMA::CLayout_64x208;
+  using ELayout   = GMMA::ELayout_64x32;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  // Per-instance ScaleOut member; initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+  // 64x208x32 (M,N,K) tile, Layout<_128> thread IDs, and the C/B/E/A layouts.
+  using Shape_MNK = Shape<_64, _208, _32>;
+  using ThrID     = Layout<_128>;
+  using CLayout   = GMMA::CLayout_64x208;
+  using BLayout   = GMMA::ABLayout<208, 32>;
+  using ELayout   = GMMA::ELayout_64x32;
+  using ALayout   = GMMA::ALayout_64x32;
+  // Only B gets a fragment alias (GMMA::smem_desc); A/E values use sparse_elem.
+  using FrgTypeB  = GMMA::smem_desc<tnspB>;
+  using ValTypeC  = float;
+  using ValTypeB  = half_t;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeA  = sparse_elem<2, half_t>;
+  using ValTypeD  = float;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  // Per-instance ScaleOut member; initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+  // 64x216x32 (M,N,K) tile, Layout<_128> thread IDs, and the C/B/E/A layouts.
+  using Shape_MNK = Shape<_64, _216, _32>;
+  using ThrID     = Layout<_128>;
+  using CLayout   = GMMA::CLayout_64x216;
+  using BLayout   = GMMA::ABLayout<216, 32>;
+  using ELayout   = GMMA::ELayout_64x32;
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  // A and B fragments are GMMA::smem_desc; A/E values use sparse_elem wrappers.
+  using FrgTypeB  = GMMA::smem_desc<tnspB>;
+  using FrgTypeA  = GMMA::smem_desc<tnspA>;
+  using ValTypeC  = float;
+  using ValTypeB  = half_t;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeA  = sparse_elem<2, half_t>;
+  using ValTypeD  = float;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  // Per-instance ScaleOut member; initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+  // 64x216x32 (M,N,K) tile, Layout<_128> thread IDs, and the C/B/E/A layouts.
+  using Shape_MNK = Shape<_64, _216, _32>;
+  using ThrID     = Layout<_128>;
+  using CLayout   = GMMA::CLayout_64x216;
+  using BLayout   = GMMA::ABLayout<216, 32>;
+  using ELayout   = GMMA::ELayout_64x32;
+  using ALayout   = GMMA::ALayout_64x32;
+  // Only B gets a fragment alias (GMMA::smem_desc); A/E values use sparse_elem.
+  using FrgTypeB  = GMMA::smem_desc<tnspB>;
+  using ValTypeC  = float;
+  using ValTypeB  = half_t;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeA  = sparse_elem<2, half_t>;
+  using ValTypeD  = float;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  // Per-instance ScaleOut member; initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+  // 64x224x32 (M,N,K) tile, Layout<_128> thread IDs, and the C/B/E/A layouts.
+  using Shape_MNK = Shape<_64, _224, _32>;
+  using ThrID     = Layout<_128>;
+  using CLayout   = GMMA::CLayout_64x224;
+  using BLayout   = GMMA::ABLayout<224, 32>;
+  using ELayout   = GMMA::ELayout_64x32;
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  // A and B fragments are GMMA::smem_desc; A/E values use sparse_elem wrappers.
+  using FrgTypeB  = GMMA::smem_desc<tnspB>;
+  using FrgTypeA  = GMMA::smem_desc<tnspA>;
+  using ValTypeC  = float;
+  using ValTypeB  = half_t;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeA  = sparse_elem<2, half_t>;
+  using ValTypeD  = float;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  // Per-instance ScaleOut member; initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+  // 64x224x32 (M,N,K) tile, Layout<_128> thread IDs, and the C/B/E/A layouts.
+  using Shape_MNK = Shape<_64, _224, _32>;
+  using ThrID     = Layout<_128>;
+  using CLayout   = GMMA::CLayout_64x224;
+  using BLayout   = GMMA::ABLayout<224, 32>;
+  using ELayout   = GMMA::ELayout_64x32;
+  using ALayout   = GMMA::ALayout_64x32;
+  // Only B gets a fragment alias (GMMA::smem_desc); A/E values use sparse_elem.
+  using FrgTypeB  = GMMA::smem_desc<tnspB>;
+  using ValTypeC  = float;
+  using ValTypeB  = half_t;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeA  = sparse_elem<2, half_t>;
+  using ValTypeD  = float;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  // Per-instance ScaleOut member; initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+  // 64x232x32 (M,N,K) tile, Layout<_128> thread IDs, and the C/B/E/A layouts.
+  using Shape_MNK = Shape<_64, _232, _32>;
+  using ThrID     = Layout<_128>;
+  using CLayout   = GMMA::CLayout_64x232;
+  using BLayout   = GMMA::ABLayout<232, 32>;
+  using ELayout   = GMMA::ELayout_64x32;
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  // A and B fragments are GMMA::smem_desc; A/E values use sparse_elem wrappers.
+  using FrgTypeB  = GMMA::smem_desc<tnspB>;
+  using FrgTypeA  = GMMA::smem_desc<tnspA>;
+  using ValTypeC  = float;
+  using ValTypeB  = half_t;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeA  = sparse_elem<2, half_t>;
+  using ValTypeD  = float;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  // Per-instance ScaleOut member; initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+  // 64x232x32 (M,N,K) tile, Layout<_128> thread IDs, and the C/B/E/A layouts.
+  using Shape_MNK = Shape<_64, _232, _32>;
+  using ThrID     = Layout<_128>;
+  using CLayout   = GMMA::CLayout_64x232;
+  using BLayout   = GMMA::ABLayout<232, 32>;
+  using ELayout   = GMMA::ELayout_64x32;
+  using ALayout   = GMMA::ALayout_64x32;
+  // Only B gets a fragment alias (GMMA::smem_desc); A/E values use sparse_elem.
+  using FrgTypeB  = GMMA::smem_desc<tnspB>;
+  using ValTypeC  = float;
+  using ValTypeB  = half_t;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeA  = sparse_elem<2, half_t>;
+  using ValTypeD  = float;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  // Per-instance ScaleOut member; initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+  // 64x240x32 (M,N,K) tile, Layout<_128> thread IDs, and the C/B/E/A layouts.
+  using Shape_MNK = Shape<_64, _240, _32>;
+  using ThrID     = Layout<_128>;
+  using CLayout   = GMMA::CLayout_64x240;
+  using BLayout   = GMMA::ABLayout<240, 32>;
+  using ELayout   = GMMA::ELayout_64x32;
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  // A and B fragments are GMMA::smem_desc; A/E values use sparse_elem wrappers.
+  using FrgTypeB  = GMMA::smem_desc<tnspB>;
+  using FrgTypeA  = GMMA::smem_desc<tnspA>;
+  using ValTypeC  = float;
+  using ValTypeB  = half_t;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeA  = sparse_elem<2, half_t>;
+  using ValTypeD  = float;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  // Per-instance ScaleOut member; initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+  // 64x240x32 (M,N,K) tile, Layout<_128> thread IDs, and the C/B/E/A layouts.
+  using Shape_MNK = Shape<_64, _240, _32>;
+  using ThrID     = Layout<_128>;
+  using CLayout   = GMMA::CLayout_64x240;
+  using BLayout   = GMMA::ABLayout<240, 32>;
+  using ELayout   = GMMA::ELayout_64x32;
+  using ALayout   = GMMA::ALayout_64x32;
+  // Only B gets a fragment alias (GMMA::smem_desc); A/E values use sparse_elem.
+  using FrgTypeB  = GMMA::smem_desc<tnspB>;
+  using ValTypeC  = float;
+  using ValTypeB  = half_t;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeA  = sparse_elem<2, half_t>;
+  using ValTypeD  = float;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  // Per-instance ScaleOut member; initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+  // 64x248x32 (M,N,K) tile, Layout<_128> thread IDs, and the C/B/E/A layouts.
+  using Shape_MNK = Shape<_64, _248, _32>;
+  using ThrID     = Layout<_128>;
+  using CLayout   = GMMA::CLayout_64x248;
+  using BLayout   = GMMA::ABLayout<248, 32>;
+  using ELayout   = GMMA::ELayout_64x32;
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  // A and B fragments are GMMA::smem_desc; A/E values use sparse_elem wrappers.
+  using FrgTypeB  = GMMA::smem_desc<tnspB>;
+  using FrgTypeA  = GMMA::smem_desc<tnspA>;
+  using ValTypeC  = float;
+  using ValTypeB  = half_t;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeA  = sparse_elem<2, half_t>;
+  using ValTypeD  = float;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  // Per-instance ScaleOut member; initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+  // 64x248x32 (M,N,K) tile, Layout<_128> thread IDs, and the C/B/E/A layouts.
+  using Shape_MNK = Shape<_64, _248, _32>;
+  using ThrID     = Layout<_128>;
+  using CLayout   = GMMA::CLayout_64x248;
+  using BLayout   = GMMA::ABLayout<248, 32>;
+  using ELayout   = GMMA::ELayout_64x32;
+  using ALayout   = GMMA::ALayout_64x32;
+  // Only B gets a fragment alias (GMMA::smem_desc); A/E values use sparse_elem.
+  using FrgTypeB  = GMMA::smem_desc<tnspB>;
+  using ValTypeC  = float;
+  using ValTypeB  = half_t;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeA  = sparse_elem<2, half_t>;
+  using ValTypeD  = float;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  // Per-instance ScaleOut member; initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+  // 64x24x32 (M,N,K) tile, Layout<_128> thread IDs, and the C/B/E/A layouts.
+  using Shape_MNK = Shape<_64, _24, _32>;
+  using ThrID     = Layout<_128>;
+  using CLayout   = GMMA::CLayout_64x24;
+  using BLayout   = GMMA::ABLayout<24, 32>;
+  using ELayout   = GMMA::ELayout_64x32;
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  // A and B fragments are GMMA::smem_desc; A/E values use sparse_elem wrappers.
+  using FrgTypeB  = GMMA::smem_desc<tnspB>;
+  using FrgTypeA  = GMMA::smem_desc<tnspA>;
+  using ValTypeC  = float;
+  using ValTypeB  = bfloat16_t;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeA  = sparse_elem<2, bfloat16_t>;
+  using ValTypeD  = float;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  // Per-instance ScaleOut member; initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+  // 64x24x32 (M,N,K) tile, Layout<_128> thread IDs, and the C/B/E/A layouts.
+  using Shape_MNK = Shape<_64, _24, _32>;
+  using ThrID     = Layout<_128>;
+  using CLayout   = GMMA::CLayout_64x24;
+  using BLayout   = GMMA::ABLayout<24, 32>;
+  using ELayout   = GMMA::ELayout_64x32;
+  using ALayout   = GMMA::ALayout_64x32;
+  // Only B gets a fragment alias (GMMA::smem_desc); A/E values use sparse_elem.
+  using FrgTypeB  = GMMA::smem_desc<tnspB>;
+  using ValTypeC  = float;
+  using ValTypeB  = bfloat16_t;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeA  = sparse_elem<2, bfloat16_t>;
+  using ValTypeD  = float;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  // Per-instance ScaleOut member; initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+  // 64x40x32 (M,N,K) tile, Layout<_128> thread IDs, and the C/B/E/A layouts.
+  using Shape_MNK = Shape<_64, _40, _32>;
+  using ThrID     = Layout<_128>;
+  using CLayout   = GMMA::CLayout_64x40;
+  using BLayout   = GMMA::ABLayout<40, 32>;
+  using ELayout   = GMMA::ELayout_64x32;
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  // A and B fragments are GMMA::smem_desc; A/E values use sparse_elem wrappers.
+  using FrgTypeB  = GMMA::smem_desc<tnspB>;
+  using FrgTypeA  = GMMA::smem_desc<tnspA>;
+  using ValTypeC  = float;
+  using ValTypeB  = bfloat16_t;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeA  = sparse_elem<2, bfloat16_t>;
+  using ValTypeD  = float;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  // Per-instance ScaleOut member; initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+  // 64x40x32 (M,N,K) tile, Layout<_128> thread IDs, and the C/B/E/A layouts.
+  using Shape_MNK = Shape<_64, _40, _32>;
+  using ThrID     = Layout<_128>;
+  using CLayout   = GMMA::CLayout_64x40;
+  using BLayout   = GMMA::ABLayout<40, 32>;
+  using ELayout   = GMMA::ELayout_64x32;
+  using ALayout   = GMMA::ALayout_64x32;
+  // Only B gets a fragment alias (GMMA::smem_desc); A/E values use sparse_elem.
+  using FrgTypeB  = GMMA::smem_desc<tnspB>;
+  using ValTypeC  = float;
+  using ValTypeB  = bfloat16_t;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeA  = sparse_elem<2, bfloat16_t>;
+  using ValTypeD  = float;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  // Per-instance ScaleOut member; initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+  // 64x48x32 (M,N,K) tile, Layout<_128> thread IDs, and the C/B/E/A layouts.
+  using Shape_MNK = Shape<_64, _48, _32>;
+  using ThrID     = Layout<_128>;
+  using CLayout   = GMMA::CLayout_64x48;
+  using BLayout   = GMMA::ABLayout<48, 32>;
+  using ELayout   = GMMA::ELayout_64x32;
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  // A and B fragments are GMMA::smem_desc; A/E values use sparse_elem wrappers.
+  using FrgTypeB  = GMMA::smem_desc<tnspB>;
+  using FrgTypeA  = GMMA::smem_desc<tnspA>;
+  using ValTypeC  = float;
+  using ValTypeB  = bfloat16_t;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeA  = sparse_elem<2, bfloat16_t>;
+  using ValTypeD  = float;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  // Per-instance ScaleOut member; initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+  // 64x48x32 (M,N,K) tile, Layout<_128> thread IDs, and the C/B/E/A layouts.
+  using Shape_MNK = Shape<_64, _48, _32>;
+  using ThrID     = Layout<_128>;
+  using CLayout   = GMMA::CLayout_64x48;
+  using BLayout   = GMMA::ABLayout<48, 32>;
+  using ELayout   = GMMA::ELayout_64x32;
+  using ALayout   = GMMA::ALayout_64x32;
+  // Only B gets a fragment alias (GMMA::smem_desc); A/E values use sparse_elem.
+  using FrgTypeB  = GMMA::smem_desc<tnspB>;
+  using ValTypeC  = float;
+  using ValTypeB  = bfloat16_t;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeA  = sparse_elem<2, bfloat16_t>;
+  using ValTypeD  = float;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  // Per-instance ScaleOut member; initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+  // 64x56x32 (M,N,K) tile, Layout<_128> thread IDs, and the C/B/E/A layouts.
+  using Shape_MNK = Shape<_64, _56, _32>;
+  using ThrID     = Layout<_128>;
+  using CLayout   = GMMA::CLayout_64x56;
+  using BLayout   = GMMA::ABLayout<56, 32>;
+  using ELayout   = GMMA::ELayout_64x32;
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  // A and B fragments are GMMA::smem_desc; A/E values use sparse_elem wrappers.
+  using FrgTypeB  = GMMA::smem_desc<tnspB>;
+  using FrgTypeA  = GMMA::smem_desc<tnspA>;
+  using ValTypeC  = float;
+  using ValTypeB  = bfloat16_t;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeA  = sparse_elem<2, bfloat16_t>;
+  using ValTypeD  = float;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  // Per-instance ScaleOut member; initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+  // 64x56x32 (M,N,K) tile, Layout<_128> thread IDs, and the C/B/E/A layouts.
+  using Shape_MNK = Shape<_64, _56, _32>;
+  using ThrID     = Layout<_128>;
+  using CLayout   = GMMA::CLayout_64x56;
+  using BLayout   = GMMA::ABLayout<56, 32>;
+  using ELayout   = GMMA::ELayout_64x32;
+  using ALayout   = GMMA::ALayout_64x32;
+  // Only B gets a fragment alias (GMMA::smem_desc); A/E values use sparse_elem.
+  using FrgTypeB  = GMMA::smem_desc<tnspB>;
+  using ValTypeC  = float;
+  using ValTypeB  = bfloat16_t;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeA  = sparse_elem<2, bfloat16_t>;
+  using ValTypeD  = float;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  // Per-instance ScaleOut member; initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+  // 64x72x32 (M,N,K) tile, Layout<_128> thread IDs, and the C/B/E/A layouts.
+  using Shape_MNK = Shape<_64, _72, _32>;
+  using ThrID     = Layout<_128>;
+  using CLayout   = GMMA::CLayout_64x72;
+  using BLayout   = GMMA::ABLayout<72, 32>;
+  using ELayout   = GMMA::ELayout_64x32;
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  // A and B fragments are GMMA::smem_desc; A/E values use sparse_elem wrappers.
+  using FrgTypeB  = GMMA::smem_desc<tnspB>;
+  using FrgTypeA  = GMMA::smem_desc<tnspA>;
+  using ValTypeC  = float;
+  using ValTypeB  = bfloat16_t;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeA  = sparse_elem<2, bfloat16_t>;
+  using ValTypeD  = float;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  // Per-instance ScaleOut member; initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+  // 64x72x32 (M,N,K) tile, Layout<_128> thread IDs, and the C/B/E/A layouts.
+  using Shape_MNK = Shape<_64, _72, _32>;
+  using ThrID     = Layout<_128>;
+  using CLayout   = GMMA::CLayout_64x72;
+  using BLayout   = GMMA::ABLayout<72, 32>;
+  using ELayout   = GMMA::ELayout_64x32;
+  using ALayout   = GMMA::ALayout_64x32;
+  // Only B gets a fragment alias (GMMA::smem_desc); A/E values use sparse_elem.
+  using FrgTypeB  = GMMA::smem_desc<tnspB>;
+  using ValTypeC  = float;
+  using ValTypeB  = bfloat16_t;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeA  = sparse_elem<2, bfloat16_t>;
+  using ValTypeD  = float;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  // Per-instance ScaleOut member; initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+  // 64x80x32 (M,N,K) tile, Layout<_128> thread IDs, and the C/B/E/A layouts.
+  using Shape_MNK = Shape<_64, _80, _32>;
+  using ThrID     = Layout<_128>;
+  using CLayout   = GMMA::CLayout_64x80;
+  using BLayout   = GMMA::ABLayout<80, 32>;
+  using ELayout   = GMMA::ELayout_64x32;
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  // A and B fragments are GMMA::smem_desc; A/E values use sparse_elem wrappers.
+  using FrgTypeB  = GMMA::smem_desc<tnspB>;
+  using FrgTypeA  = GMMA::smem_desc<tnspA>;
+  using ValTypeC  = float;
+  using ValTypeB  = bfloat16_t;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeA  = sparse_elem<2, bfloat16_t>;
+  using ValTypeD  = float;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  // Per-instance ScaleOut member; initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+  // 64x80x32 (M,N,K) tile, Layout<_128> thread IDs, and the C/B/E/A layouts.
+  using Shape_MNK = Shape<_64, _80, _32>;
+  using ThrID     = Layout<_128>;
+  using CLayout   = GMMA::CLayout_64x80;
+  using BLayout   = GMMA::ABLayout<80, 32>;
+  using ELayout   = GMMA::ELayout_64x32;
+  using ALayout   = GMMA::ALayout_64x32;
+  // Only B gets a fragment alias (GMMA::smem_desc); A/E values use sparse_elem.
+  using FrgTypeB  = GMMA::smem_desc<tnspB>;
+  using ValTypeC  = float;
+  using ValTypeB  = bfloat16_t;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeA  = sparse_elem<2, bfloat16_t>;
+  using ValTypeD  = float;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  // Per-instance ScaleOut member; initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+  // 64x88x32 (M,N,K) tile, Layout<_128> thread IDs, and the C/B/E/A layouts.
+  using Shape_MNK = Shape<_64, _88, _32>;
+  using ThrID     = Layout<_128>;
+  using CLayout   = GMMA::CLayout_64x88;
+  using BLayout   = GMMA::ABLayout<88, 32>;
+  using ELayout   = GMMA::ELayout_64x32;
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  // A and B fragments are GMMA::smem_desc; A/E values use sparse_elem wrappers.
+  using FrgTypeB  = GMMA::smem_desc<tnspB>;
+  using FrgTypeA  = GMMA::smem_desc<tnspA>;
+  using ValTypeC  = float;
+  using ValTypeB  = bfloat16_t;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeA  = sparse_elem<2, bfloat16_t>;
+  using ValTypeD  = float;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  // Per-instance ScaleOut member; initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+  // 64x88x32 (M,N,K) tile, Layout<_128> thread IDs, and the C/B/E/A layouts.
+  using Shape_MNK = Shape<_64, _88, _32>;
+  using ThrID     = Layout<_128>;
+  using CLayout   = GMMA::CLayout_64x88;
+  using BLayout   = GMMA::ABLayout<88, 32>;
+  using ELayout   = GMMA::ELayout_64x32;
+  using ALayout   = GMMA::ALayout_64x32;
+  // Only B gets a fragment alias (GMMA::smem_desc); A/E values use sparse_elem.
+  using FrgTypeB  = GMMA::smem_desc<tnspB>;
+  using ValTypeC  = float;
+  using ValTypeB  = bfloat16_t;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeA  = sparse_elem<2, bfloat16_t>;
+  using ValTypeD  = float;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  // Per-instance ScaleOut member; initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+  // 64x104x32 (M,N,K) tile, Layout<_128> thread IDs, and the C/B/E/A layouts.
+  using Shape_MNK = Shape<_64, _104, _32>;
+  using ThrID     = Layout<_128>;
+  using CLayout   = GMMA::CLayout_64x104;
+  using BLayout   = GMMA::ABLayout<104, 32>;
+  using ELayout   = GMMA::ELayout_64x32;
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  // A and B fragments are GMMA::smem_desc; A/E values use sparse_elem wrappers.
+  using FrgTypeB  = GMMA::smem_desc<tnspB>;
+  using FrgTypeA  = GMMA::smem_desc<tnspA>;
+  using ValTypeC  = float;
+  using ValTypeB  = bfloat16_t;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeA  = sparse_elem<2, bfloat16_t>;
+  using ValTypeD  = float;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  // Per-instance ScaleOut member; initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+  // 64x104x32 (M,N,K) tile, Layout<_128> thread IDs, and the C/B/E/A layouts.
+  using Shape_MNK = Shape<_64, _104, _32>;
+  using ThrID     = Layout<_128>;
+  using CLayout   = GMMA::CLayout_64x104;
+  using BLayout   = GMMA::ABLayout<104, 32>;
+  using ELayout   = GMMA::ELayout_64x32;
+  using ALayout   = GMMA::ALayout_64x32;
+  // Only B gets a fragment alias (GMMA::smem_desc); A/E values use sparse_elem.
+  using FrgTypeB  = GMMA::smem_desc<tnspB>;
+  using ValTypeC  = float;
+  using ValTypeB  = bfloat16_t;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeA  = sparse_elem<2, bfloat16_t>;
+  using ValTypeD  = float;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  // Per-instance ScaleOut member; initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+  // 64x112x32 (M,N,K) tile, Layout<_128> thread IDs, and the C/B/E/A layouts.
+  using Shape_MNK = Shape<_64, _112, _32>;
+  using ThrID     = Layout<_128>;
+  using CLayout   = GMMA::CLayout_64x112;
+  using BLayout   = GMMA::ABLayout<112, 32>;
+  using ELayout   = GMMA::ELayout_64x32;
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  // A and B fragments are GMMA::smem_desc; A/E values use sparse_elem wrappers.
+  using FrgTypeB  = GMMA::smem_desc<tnspB>;
+  using FrgTypeA  = GMMA::smem_desc<tnspA>;
+  using ValTypeC  = float;
+  using ValTypeB  = bfloat16_t;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeA  = sparse_elem<2, bfloat16_t>;
+  using ValTypeD  = float;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  // Per-instance ScaleOut member; initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+  // 64x112x32 (M,N,K) tile, Layout<_128> thread IDs, and the C/B/E/A layouts.
+  using Shape_MNK = Shape<_64, _112, _32>;
+  using ThrID     = Layout<_128>;
+  using CLayout   = GMMA::CLayout_64x112;
+  using BLayout   = GMMA::ABLayout<112, 32>;
+  using ELayout   = GMMA::ELayout_64x32;
+  using ALayout   = GMMA::ALayout_64x32;
+  // Only B gets a fragment alias (GMMA::smem_desc); A/E values use sparse_elem.
+  using FrgTypeB  = GMMA::smem_desc<tnspB>;
+  using ValTypeC  = float;
+  using ValTypeB  = bfloat16_t;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeA  = sparse_elem<2, bfloat16_t>;
+  using ValTypeD  = float;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  // Per-instance ScaleOut member; initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+  // 64x120x32 (M,N,K) tile, Layout<_128> thread IDs, and the C/B/E/A layouts.
+  using Shape_MNK = Shape<_64, _120, _32>;
+  using ThrID     = Layout<_128>;
+  using CLayout   = GMMA::CLayout_64x120;
+  using BLayout   = GMMA::ABLayout<120, 32>;
+  using ELayout   = GMMA::ELayout_64x32;
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  // A and B fragments are GMMA::smem_desc; A/E values use sparse_elem wrappers.
+  using FrgTypeB  = GMMA::smem_desc<tnspB>;
+  using FrgTypeA  = GMMA::smem_desc<tnspA>;
+  using ValTypeC  = float;
+  using ValTypeB  = bfloat16_t;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeA  = sparse_elem<2, bfloat16_t>;
+  using ValTypeD  = float;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  // Per-instance ScaleOut member; initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+  // 64x120x32 (M,N,K) tile, Layout<_128> thread IDs, and the C/B/E/A layouts.
+  using Shape_MNK = Shape<_64, _120, _32>;
+  using ThrID     = Layout<_128>;
+  using CLayout   = GMMA::CLayout_64x120;
+  using BLayout   = GMMA::ABLayout<120, 32>;
+  using ELayout   = GMMA::ELayout_64x32;
+  using ALayout   = GMMA::ALayout_64x32;
+  // Only B gets a fragment alias (GMMA::smem_desc); A/E values use sparse_elem.
+  using FrgTypeB  = GMMA::smem_desc<tnspB>;
+  using ValTypeC  = float;
+  using ValTypeB  = bfloat16_t;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeA  = sparse_elem<2, bfloat16_t>;
+  using ValTypeD  = float;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  // Per-instance ScaleOut member; initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+  // 64x136x32 (M,N,K) tile, Layout<_128> thread IDs, and the C/B/E/A layouts.
+  using Shape_MNK = Shape<_64, _136, _32>;
+  using ThrID     = Layout<_128>;
+  using CLayout   = GMMA::CLayout_64x136;
+  using BLayout   = GMMA::ABLayout<136, 32>;
+  using ELayout   = GMMA::ELayout_64x32;
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  // A and B fragments are GMMA::smem_desc; A/E values use sparse_elem wrappers.
+  using FrgTypeB  = GMMA::smem_desc<tnspB>;
+  using FrgTypeA  = GMMA::smem_desc<tnspA>;
+  using ValTypeC  = float;
+  using ValTypeB  = bfloat16_t;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeA  = sparse_elem<2, bfloat16_t>;
+  using ValTypeD  = float;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  // Per-instance ScaleOut member; initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+  // 64x136x32 (M,N,K) tile, Layout<_128> thread IDs, and the C/B/E/A layouts.
+  using Shape_MNK = Shape<_64, _136, _32>;
+  using ThrID     = Layout<_128>;
+  using CLayout   = GMMA::CLayout_64x136;
+  using BLayout   = GMMA::ABLayout<136, 32>;
+  using ELayout   = GMMA::ELayout_64x32;
+  using ALayout   = GMMA::ALayout_64x32;
+  // Only B gets a fragment alias (GMMA::smem_desc); A/E values use sparse_elem.
+  using FrgTypeB  = GMMA::smem_desc<tnspB>;
+  using ValTypeC  = float;
+  using ValTypeB  = bfloat16_t;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeA  = sparse_elem<2, bfloat16_t>;
+  using ValTypeD  = float;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  // Per-instance ScaleOut member; initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+  // 64x144x32 (M,N,K) tile, Layout<_128> thread IDs, and the C/B/E/A layouts.
+  using Shape_MNK = Shape<_64, _144, _32>;
+  using ThrID     = Layout<_128>;
+  using CLayout   = GMMA::CLayout_64x144;
+  using BLayout   = GMMA::ABLayout<144, 32>;
+  using ELayout   = GMMA::ELayout_64x32;
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  // A and B fragments are GMMA::smem_desc; A/E values use sparse_elem wrappers.
+  using FrgTypeB  = GMMA::smem_desc<tnspB>;
+  using FrgTypeA  = GMMA::smem_desc<tnspA>;
+  using ValTypeC  = float;
+  using ValTypeB  = bfloat16_t;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeA  = sparse_elem<2, bfloat16_t>;
+  using ValTypeD  = float;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  // Per-instance ScaleOut member; initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+  // 64x144x32 (M,N,K) tile, Layout<_128> thread IDs, and the C/B/E/A layouts.
+  using Shape_MNK = Shape<_64, _144, _32>;
+  using ThrID     = Layout<_128>;
+  using CLayout   = GMMA::CLayout_64x144;
+  using BLayout   = GMMA::ABLayout<144, 32>;
+  using ELayout   = GMMA::ELayout_64x32;
+  using ALayout   = GMMA::ALayout_64x32;
+  // Only B gets a fragment alias (GMMA::smem_desc); A/E values use sparse_elem.
+  using FrgTypeB  = GMMA::smem_desc<tnspB>;
+  using ValTypeC  = float;
+  using ValTypeB  = bfloat16_t;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeA  = sparse_elem<2, bfloat16_t>;
+  using ValTypeD  = float;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  // Per-instance ScaleOut member; initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+  // 64x152x32 (M,N,K) tile, Layout<_128> thread IDs, and the C/B/E/A layouts.
+  using Shape_MNK = Shape<_64, _152, _32>;
+  using ThrID     = Layout<_128>;
+  using CLayout   = GMMA::CLayout_64x152;
+  using BLayout   = GMMA::ABLayout<152, 32>;
+  using ELayout   = GMMA::ELayout_64x32;
+  using ALayout   = GMMA::ABLayout<64, 32>;
+  // A and B fragments are GMMA::smem_desc; A/E values use sparse_elem wrappers.
+  using FrgTypeB  = GMMA::smem_desc<tnspB>;
+  using FrgTypeA  = GMMA::smem_desc<tnspA>;
+  using ValTypeC  = float;
+  using ValTypeB  = bfloat16_t;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeA  = sparse_elem<2, bfloat16_t>;
+  using ValTypeD  = float;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{
+  // Per-instance ScaleOut member; initialized to ScaleOut::One.
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+  // 64x152x32 (M,N,K) tile, Layout<_128> thread IDs, and the C/B/E/A layouts.
+  using Shape_MNK = Shape<_64, _152, _32>;
+  using ThrID     = Layout<_128>;
+  using CLayout   = GMMA::CLayout_64x152;
+  using BLayout   = GMMA::ABLayout<152, 32>;
+  using ELayout   = GMMA::ELayout_64x32;
+  using ALayout   = GMMA::ALayout_64x32;
+  // Only B gets a fragment alias (GMMA::smem_desc); A/E values use sparse_elem.
+  using FrgTypeB  = GMMA::smem_desc<tnspB>;
+  using ValTypeC  = float;
+  using ValTypeB  = bfloat16_t;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeA  = sparse_elem<2, bfloat16_t>;
+  using ValTypeD  = float;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits (value types, fragment types, layouts) for the sparse 64x160x32 F32BF16BF16 SS op.
+  using ValTypeD = float;                       // D values: float
+  using ValTypeA = sparse_elem<2, bfloat16_t>;  // A values: bfloat16_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;     // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = bfloat16_t;                  // B values: plain bfloat16_t, no sparse_elem wrapper
+  using ValTypeC = float;                       // C values: float
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;      // A fragment type: smem_desc parameterized on tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>;      // B fragment type: smem_desc parameterized on tnspB
+
+  using Shape_MNK = Shape<_64,_160,_32>;        // M=64, N=160, K=32
+  using ThrID   = Layout<_128>;                 // thread-id layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 32>;      // A layout: ABLayout<64, 32>, i.e. the M and K of Shape_MNK
+  using ELayout = GMMA::ELayout_64x32;          // E layout: named for the same 64 (M) x 32 (K) extents
+  using BLayout = GMMA::ABLayout<160, 32>;      // B layout: ABLayout<160, 32>, i.e. the N and K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x160;         // C layout: named for the 64 (M) x 160 (N) extents
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x160x32 F32BF16BF16 RS op; only B has a fragment type (no FrgTypeA is declared).
+  using ValTypeD = float;                       // D values: float
+  using ValTypeA = sparse_elem<2, bfloat16_t>;  // A values: bfloat16_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;     // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = bfloat16_t;                  // B values: plain bfloat16_t, no sparse_elem wrapper
+  using ValTypeC = float;                       // C values: float
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;      // B fragment type: smem_desc parameterized on tnspB
+
+  using Shape_MNK = Shape<_64,_160,_32>;        // M=64, N=160, K=32
+  using ThrID   = Layout<_128>;                 // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x32;          // A layout: dedicated ALayout_64x32 (not ABLayout); presumably a register layout -- confirm
+  using ELayout = GMMA::ELayout_64x32;          // E layout: named for the same 64 (M) x 32 (K) extents
+  using BLayout = GMMA::ABLayout<160, 32>;      // B layout: ABLayout<160, 32>, i.e. the N and K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x160;         // C layout: named for the 64 (M) x 160 (N) extents
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits (value types, fragment types, layouts) for the sparse 64x168x32 F32BF16BF16 SS op.
+  using ValTypeD = float;                       // D values: float
+  using ValTypeA = sparse_elem<2, bfloat16_t>;  // A values: bfloat16_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;     // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = bfloat16_t;                  // B values: plain bfloat16_t, no sparse_elem wrapper
+  using ValTypeC = float;                       // C values: float
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;      // A fragment type: smem_desc parameterized on tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>;      // B fragment type: smem_desc parameterized on tnspB
+
+  using Shape_MNK = Shape<_64,_168,_32>;        // M=64, N=168, K=32
+  using ThrID   = Layout<_128>;                 // thread-id layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 32>;      // A layout: ABLayout<64, 32>, i.e. the M and K of Shape_MNK
+  using ELayout = GMMA::ELayout_64x32;          // E layout: named for the same 64 (M) x 32 (K) extents
+  using BLayout = GMMA::ABLayout<168, 32>;      // B layout: ABLayout<168, 32>, i.e. the N and K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x168;         // C layout: named for the 64 (M) x 168 (N) extents
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x168x32 F32BF16BF16 RS op; only B has a fragment type (no FrgTypeA is declared).
+  using ValTypeD = float;                       // D values: float
+  using ValTypeA = sparse_elem<2, bfloat16_t>;  // A values: bfloat16_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;     // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = bfloat16_t;                  // B values: plain bfloat16_t, no sparse_elem wrapper
+  using ValTypeC = float;                       // C values: float
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;      // B fragment type: smem_desc parameterized on tnspB
+
+  using Shape_MNK = Shape<_64,_168,_32>;        // M=64, N=168, K=32
+  using ThrID   = Layout<_128>;                 // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x32;          // A layout: dedicated ALayout_64x32 (not ABLayout); presumably a register layout -- confirm
+  using ELayout = GMMA::ELayout_64x32;          // E layout: named for the same 64 (M) x 32 (K) extents
+  using BLayout = GMMA::ABLayout<168, 32>;      // B layout: ABLayout<168, 32>, i.e. the N and K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x168;         // C layout: named for the 64 (M) x 168 (N) extents
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits (value types, fragment types, layouts) for the sparse 64x176x32 F32BF16BF16 SS op.
+  using ValTypeD = float;                       // D values: float
+  using ValTypeA = sparse_elem<2, bfloat16_t>;  // A values: bfloat16_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;     // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = bfloat16_t;                  // B values: plain bfloat16_t, no sparse_elem wrapper
+  using ValTypeC = float;                       // C values: float
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;      // A fragment type: smem_desc parameterized on tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>;      // B fragment type: smem_desc parameterized on tnspB
+
+  using Shape_MNK = Shape<_64,_176,_32>;        // M=64, N=176, K=32
+  using ThrID   = Layout<_128>;                 // thread-id layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 32>;      // A layout: ABLayout<64, 32>, i.e. the M and K of Shape_MNK
+  using ELayout = GMMA::ELayout_64x32;          // E layout: named for the same 64 (M) x 32 (K) extents
+  using BLayout = GMMA::ABLayout<176, 32>;      // B layout: ABLayout<176, 32>, i.e. the N and K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x176;         // C layout: named for the 64 (M) x 176 (N) extents
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x176x32 F32BF16BF16 RS op; only B has a fragment type (no FrgTypeA is declared).
+  using ValTypeD = float;                       // D values: float
+  using ValTypeA = sparse_elem<2, bfloat16_t>;  // A values: bfloat16_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;     // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = bfloat16_t;                  // B values: plain bfloat16_t, no sparse_elem wrapper
+  using ValTypeC = float;                       // C values: float
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;      // B fragment type: smem_desc parameterized on tnspB
+
+  using Shape_MNK = Shape<_64,_176,_32>;        // M=64, N=176, K=32
+  using ThrID   = Layout<_128>;                 // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x32;          // A layout: dedicated ALayout_64x32 (not ABLayout); presumably a register layout -- confirm
+  using ELayout = GMMA::ELayout_64x32;          // E layout: named for the same 64 (M) x 32 (K) extents
+  using BLayout = GMMA::ABLayout<176, 32>;      // B layout: ABLayout<176, 32>, i.e. the N and K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x176;         // C layout: named for the 64 (M) x 176 (N) extents
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits (value types, fragment types, layouts) for the sparse 64x184x32 F32BF16BF16 SS op.
+  using ValTypeD = float;                       // D values: float
+  using ValTypeA = sparse_elem<2, bfloat16_t>;  // A values: bfloat16_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;     // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = bfloat16_t;                  // B values: plain bfloat16_t, no sparse_elem wrapper
+  using ValTypeC = float;                       // C values: float
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;      // A fragment type: smem_desc parameterized on tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>;      // B fragment type: smem_desc parameterized on tnspB
+
+  using Shape_MNK = Shape<_64,_184,_32>;        // M=64, N=184, K=32
+  using ThrID   = Layout<_128>;                 // thread-id layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 32>;      // A layout: ABLayout<64, 32>, i.e. the M and K of Shape_MNK
+  using ELayout = GMMA::ELayout_64x32;          // E layout: named for the same 64 (M) x 32 (K) extents
+  using BLayout = GMMA::ABLayout<184, 32>;      // B layout: ABLayout<184, 32>, i.e. the N and K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x184;         // C layout: named for the 64 (M) x 184 (N) extents
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x184x32 F32BF16BF16 RS op; only B has a fragment type (no FrgTypeA is declared).
+  using ValTypeD = float;                       // D values: float
+  using ValTypeA = sparse_elem<2, bfloat16_t>;  // A values: bfloat16_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;     // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = bfloat16_t;                  // B values: plain bfloat16_t, no sparse_elem wrapper
+  using ValTypeC = float;                       // C values: float
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;      // B fragment type: smem_desc parameterized on tnspB
+
+  using Shape_MNK = Shape<_64,_184,_32>;        // M=64, N=184, K=32
+  using ThrID   = Layout<_128>;                 // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x32;          // A layout: dedicated ALayout_64x32 (not ABLayout); presumably a register layout -- confirm
+  using ELayout = GMMA::ELayout_64x32;          // E layout: named for the same 64 (M) x 32 (K) extents
+  using BLayout = GMMA::ABLayout<184, 32>;      // B layout: ABLayout<184, 32>, i.e. the N and K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x184;         // C layout: named for the 64 (M) x 184 (N) extents
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits (value types, fragment types, layouts) for the sparse 64x200x32 F32BF16BF16 SS op.
+  using ValTypeD = float;                       // D values: float
+  using ValTypeA = sparse_elem<2, bfloat16_t>;  // A values: bfloat16_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;     // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = bfloat16_t;                  // B values: plain bfloat16_t, no sparse_elem wrapper
+  using ValTypeC = float;                       // C values: float
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;      // A fragment type: smem_desc parameterized on tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>;      // B fragment type: smem_desc parameterized on tnspB
+
+  using Shape_MNK = Shape<_64,_200,_32>;        // M=64, N=200, K=32
+  using ThrID   = Layout<_128>;                 // thread-id layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 32>;      // A layout: ABLayout<64, 32>, i.e. the M and K of Shape_MNK
+  using ELayout = GMMA::ELayout_64x32;          // E layout: named for the same 64 (M) x 32 (K) extents
+  using BLayout = GMMA::ABLayout<200, 32>;      // B layout: ABLayout<200, 32>, i.e. the N and K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x200;         // C layout: named for the 64 (M) x 200 (N) extents
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x200x32 F32BF16BF16 RS op; only B has a fragment type (no FrgTypeA is declared).
+  using ValTypeD = float;                       // D values: float
+  using ValTypeA = sparse_elem<2, bfloat16_t>;  // A values: bfloat16_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;     // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = bfloat16_t;                  // B values: plain bfloat16_t, no sparse_elem wrapper
+  using ValTypeC = float;                       // C values: float
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;      // B fragment type: smem_desc parameterized on tnspB
+
+  using Shape_MNK = Shape<_64,_200,_32>;        // M=64, N=200, K=32
+  using ThrID   = Layout<_128>;                 // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x32;          // A layout: dedicated ALayout_64x32 (not ABLayout); presumably a register layout -- confirm
+  using ELayout = GMMA::ELayout_64x32;          // E layout: named for the same 64 (M) x 32 (K) extents
+  using BLayout = GMMA::ABLayout<200, 32>;      // B layout: ABLayout<200, 32>, i.e. the N and K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x200;         // C layout: named for the 64 (M) x 200 (N) extents
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits (value types, fragment types, layouts) for the sparse 64x208x32 F32BF16BF16 SS op.
+  using ValTypeD = float;                       // D values: float
+  using ValTypeA = sparse_elem<2, bfloat16_t>;  // A values: bfloat16_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;     // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = bfloat16_t;                  // B values: plain bfloat16_t, no sparse_elem wrapper
+  using ValTypeC = float;                       // C values: float
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;      // A fragment type: smem_desc parameterized on tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>;      // B fragment type: smem_desc parameterized on tnspB
+
+  using Shape_MNK = Shape<_64,_208,_32>;        // M=64, N=208, K=32
+  using ThrID   = Layout<_128>;                 // thread-id layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 32>;      // A layout: ABLayout<64, 32>, i.e. the M and K of Shape_MNK
+  using ELayout = GMMA::ELayout_64x32;          // E layout: named for the same 64 (M) x 32 (K) extents
+  using BLayout = GMMA::ABLayout<208, 32>;      // B layout: ABLayout<208, 32>, i.e. the N and K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x208;         // C layout: named for the 64 (M) x 208 (N) extents
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x208x32 F32BF16BF16 RS op; only B has a fragment type (no FrgTypeA is declared).
+  using ValTypeD = float;                       // D values: float
+  using ValTypeA = sparse_elem<2, bfloat16_t>;  // A values: bfloat16_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;     // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = bfloat16_t;                  // B values: plain bfloat16_t, no sparse_elem wrapper
+  using ValTypeC = float;                       // C values: float
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;      // B fragment type: smem_desc parameterized on tnspB
+
+  using Shape_MNK = Shape<_64,_208,_32>;        // M=64, N=208, K=32
+  using ThrID   = Layout<_128>;                 // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x32;          // A layout: dedicated ALayout_64x32 (not ABLayout); presumably a register layout -- confirm
+  using ELayout = GMMA::ELayout_64x32;          // E layout: named for the same 64 (M) x 32 (K) extents
+  using BLayout = GMMA::ABLayout<208, 32>;      // B layout: ABLayout<208, 32>, i.e. the N and K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x208;         // C layout: named for the 64 (M) x 208 (N) extents
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits (value types, fragment types, layouts) for the sparse 64x216x32 F32BF16BF16 SS op.
+  using ValTypeD = float;                       // D values: float
+  using ValTypeA = sparse_elem<2, bfloat16_t>;  // A values: bfloat16_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;     // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = bfloat16_t;                  // B values: plain bfloat16_t, no sparse_elem wrapper
+  using ValTypeC = float;                       // C values: float
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;      // A fragment type: smem_desc parameterized on tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>;      // B fragment type: smem_desc parameterized on tnspB
+
+  using Shape_MNK = Shape<_64,_216,_32>;        // M=64, N=216, K=32
+  using ThrID   = Layout<_128>;                 // thread-id layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 32>;      // A layout: ABLayout<64, 32>, i.e. the M and K of Shape_MNK
+  using ELayout = GMMA::ELayout_64x32;          // E layout: named for the same 64 (M) x 32 (K) extents
+  using BLayout = GMMA::ABLayout<216, 32>;      // B layout: ABLayout<216, 32>, i.e. the N and K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x216;         // C layout: named for the 64 (M) x 216 (N) extents
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x216x32 F32BF16BF16 RS op; only B has a fragment type (no FrgTypeA is declared).
+  using ValTypeD = float;                       // D values: float
+  using ValTypeA = sparse_elem<2, bfloat16_t>;  // A values: bfloat16_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;     // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = bfloat16_t;                  // B values: plain bfloat16_t, no sparse_elem wrapper
+  using ValTypeC = float;                       // C values: float
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;      // B fragment type: smem_desc parameterized on tnspB
+
+  using Shape_MNK = Shape<_64,_216,_32>;        // M=64, N=216, K=32
+  using ThrID   = Layout<_128>;                 // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x32;          // A layout: dedicated ALayout_64x32 (not ABLayout); presumably a register layout -- confirm
+  using ELayout = GMMA::ELayout_64x32;          // E layout: named for the same 64 (M) x 32 (K) extents
+  using BLayout = GMMA::ABLayout<216, 32>;      // B layout: ABLayout<216, 32>, i.e. the N and K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x216;         // C layout: named for the 64 (M) x 216 (N) extents
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits (value types, fragment types, layouts) for the sparse 64x224x32 F32BF16BF16 SS op.
+  using ValTypeD = float;                       // D values: float
+  using ValTypeA = sparse_elem<2, bfloat16_t>;  // A values: bfloat16_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;     // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = bfloat16_t;                  // B values: plain bfloat16_t, no sparse_elem wrapper
+  using ValTypeC = float;                       // C values: float
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;      // A fragment type: smem_desc parameterized on tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>;      // B fragment type: smem_desc parameterized on tnspB
+
+  using Shape_MNK = Shape<_64,_224,_32>;        // M=64, N=224, K=32
+  using ThrID   = Layout<_128>;                 // thread-id layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 32>;      // A layout: ABLayout<64, 32>, i.e. the M and K of Shape_MNK
+  using ELayout = GMMA::ELayout_64x32;          // E layout: named for the same 64 (M) x 32 (K) extents
+  using BLayout = GMMA::ABLayout<224, 32>;      // B layout: ABLayout<224, 32>, i.e. the N and K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x224;         // C layout: named for the 64 (M) x 224 (N) extents
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x224x32 F32BF16BF16 RS op; only B has a fragment type (no FrgTypeA is declared).
+  using ValTypeD = float;                       // D values: float
+  using ValTypeA = sparse_elem<2, bfloat16_t>;  // A values: bfloat16_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;     // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = bfloat16_t;                  // B values: plain bfloat16_t, no sparse_elem wrapper
+  using ValTypeC = float;                       // C values: float
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;      // B fragment type: smem_desc parameterized on tnspB
+
+  using Shape_MNK = Shape<_64,_224,_32>;        // M=64, N=224, K=32
+  using ThrID   = Layout<_128>;                 // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x32;          // A layout: dedicated ALayout_64x32 (not ABLayout); presumably a register layout -- confirm
+  using ELayout = GMMA::ELayout_64x32;          // E layout: named for the same 64 (M) x 32 (K) extents
+  using BLayout = GMMA::ABLayout<224, 32>;      // B layout: ABLayout<224, 32>, i.e. the N and K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x224;         // C layout: named for the 64 (M) x 224 (N) extents
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits (value types, fragment types, layouts) for the sparse 64x232x32 F32BF16BF16 SS op.
+  using ValTypeD = float;                       // D values: float
+  using ValTypeA = sparse_elem<2, bfloat16_t>;  // A values: bfloat16_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;     // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = bfloat16_t;                  // B values: plain bfloat16_t, no sparse_elem wrapper
+  using ValTypeC = float;                       // C values: float
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;      // A fragment type: smem_desc parameterized on tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>;      // B fragment type: smem_desc parameterized on tnspB
+
+  using Shape_MNK = Shape<_64,_232,_32>;        // M=64, N=232, K=32
+  using ThrID   = Layout<_128>;                 // thread-id layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 32>;      // A layout: ABLayout<64, 32>, i.e. the M and K of Shape_MNK
+  using ELayout = GMMA::ELayout_64x32;          // E layout: named for the same 64 (M) x 32 (K) extents
+  using BLayout = GMMA::ABLayout<232, 32>;      // B layout: ABLayout<232, 32>, i.e. the N and K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x232;         // C layout: named for the 64 (M) x 232 (N) extents
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x232x32 F32BF16BF16 RS op; only B has a fragment type (no FrgTypeA is declared).
+  using ValTypeD = float;                       // D values: float
+  using ValTypeA = sparse_elem<2, bfloat16_t>;  // A values: bfloat16_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;     // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = bfloat16_t;                  // B values: plain bfloat16_t, no sparse_elem wrapper
+  using ValTypeC = float;                       // C values: float
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;      // B fragment type: smem_desc parameterized on tnspB
+
+  using Shape_MNK = Shape<_64,_232,_32>;        // M=64, N=232, K=32
+  using ThrID   = Layout<_128>;                 // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x32;          // A layout: dedicated ALayout_64x32 (not ABLayout); presumably a register layout -- confirm
+  using ELayout = GMMA::ELayout_64x32;          // E layout: named for the same 64 (M) x 32 (K) extents
+  using BLayout = GMMA::ABLayout<232, 32>;      // B layout: ABLayout<232, 32>, i.e. the N and K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x232;         // C layout: named for the 64 (M) x 232 (N) extents
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits (value types, fragment types, layouts) for the sparse 64x240x32 F32BF16BF16 SS op.
+  using ValTypeD = float;                       // D values: float
+  using ValTypeA = sparse_elem<2, bfloat16_t>;  // A values: bfloat16_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;     // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = bfloat16_t;                  // B values: plain bfloat16_t, no sparse_elem wrapper
+  using ValTypeC = float;                       // C values: float
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;      // A fragment type: smem_desc parameterized on tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>;      // B fragment type: smem_desc parameterized on tnspB
+
+  using Shape_MNK = Shape<_64,_240,_32>;        // M=64, N=240, K=32
+  using ThrID   = Layout<_128>;                 // thread-id layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 32>;      // A layout: ABLayout<64, 32>, i.e. the M and K of Shape_MNK
+  using ELayout = GMMA::ELayout_64x32;          // E layout: named for the same 64 (M) x 32 (K) extents
+  using BLayout = GMMA::ABLayout<240, 32>;      // B layout: ABLayout<240, 32>, i.e. the N and K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x240;         // C layout: named for the 64 (M) x 240 (N) extents
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x240x32 F32BF16BF16 RS op; only B has a fragment type (no FrgTypeA is declared).
+  using ValTypeD = float;                       // D values: float
+  using ValTypeA = sparse_elem<2, bfloat16_t>;  // A values: bfloat16_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;     // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = bfloat16_t;                  // B values: plain bfloat16_t, no sparse_elem wrapper
+  using ValTypeC = float;                       // C values: float
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;      // B fragment type: smem_desc parameterized on tnspB
+
+  using Shape_MNK = Shape<_64,_240,_32>;        // M=64, N=240, K=32
+  using ThrID   = Layout<_128>;                 // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x32;          // A layout: dedicated ALayout_64x32 (not ABLayout); presumably a register layout -- confirm
+  using ELayout = GMMA::ELayout_64x32;          // E layout: named for the same 64 (M) x 32 (K) extents
+  using BLayout = GMMA::ABLayout<240, 32>;      // B layout: ABLayout<240, 32>, i.e. the N and K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x240;         // C layout: named for the 64 (M) x 240 (N) extents
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits (value types, fragment types, layouts) for the sparse 64x248x32 F32BF16BF16 SS op.
+  using ValTypeD = float;                       // D values: float
+  using ValTypeA = sparse_elem<2, bfloat16_t>;  // A values: bfloat16_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;     // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = bfloat16_t;                  // B values: plain bfloat16_t, no sparse_elem wrapper
+  using ValTypeC = float;                       // C values: float
+
+  using FrgTypeA = GMMA::smem_desc<tnspA>;      // A fragment type: smem_desc parameterized on tnspA
+  using FrgTypeB = GMMA::smem_desc<tnspB>;      // B fragment type: smem_desc parameterized on tnspB
+
+  using Shape_MNK = Shape<_64,_248,_32>;        // M=64, N=248, K=32
+  using ThrID   = Layout<_128>;                 // thread-id layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 32>;      // A layout: ABLayout<64, 32>, i.e. the M and K of Shape_MNK
+  using ELayout = GMMA::ELayout_64x32;          // E layout: named for the same 64 (M) x 32 (K) extents
+  using BLayout = GMMA::ABLayout<248, 32>;      // B layout: ABLayout<248, 32>, i.e. the N and K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x248;         // C layout: named for the 64 (M) x 248 (N) extents
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x248x32 F32BF16BF16 RS op; only B has a fragment type (no FrgTypeA is declared).
+  using ValTypeD = float;                       // D values: float
+  using ValTypeA = sparse_elem<2, bfloat16_t>;  // A values: bfloat16_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;     // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = bfloat16_t;                  // B values: plain bfloat16_t, no sparse_elem wrapper
+  using ValTypeC = float;                       // C values: float
+
+  using FrgTypeB = GMMA::smem_desc<tnspB>;      // B fragment type: smem_desc parameterized on tnspB
+
+  using Shape_MNK = Shape<_64,_248,_32>;        // M=64, N=248, K=32
+  using ThrID   = Layout<_128>;                 // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x32;          // A layout: dedicated ALayout_64x32 (not ABLayout); presumably a register layout -- confirm
+  using ELayout = GMMA::ELayout_64x32;          // E layout: named for the same 64 (M) x 32 (K) extents
+  using BLayout = GMMA::ABLayout<248, 32>;      // B layout: ABLayout<248, 32>, i.e. the N and K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x248;         // C layout: named for the 64 (M) x 248 (N) extents
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x24x16 F32TF32TF32 SS_TN op; both fragment types are pinned to Major::K.
+  using ValTypeD = float;                       // D values: float
+  using ValTypeA = sparse_elem<2, tfloat32_t>;  // A values: tfloat32_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<4, uint8_t>;     // E values: uint8_t wrapped in sparse_elem<4, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = tfloat32_t;                  // B values: plain tfloat32_t, no sparse_elem wrapper
+  using ValTypeC = float;                       // C values: float
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: smem_desc fixed to Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: smem_desc fixed to Major::K
+
+  using Shape_MNK = Shape<_64,_24,_16>;         // M=64, N=24, K=16
+  using ThrID   = Layout<_128>;                 // thread-id layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 16>;      // A layout: ABLayout<64, 16>, i.e. the M and K of Shape_MNK
+  using ELayout = GMMA::ELayout_64x16;          // E layout: named for the same 64 (M) x 16 (K) extents
+  using BLayout = GMMA::ABLayout< 24, 16>;      // B layout: ABLayout<24, 16>, i.e. the N and K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x24;          // C layout: named for the 64 (M) x 24 (N) extents
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x24x16 F32TF32TF32 RS_TN op; only B has a fragment type (no FrgTypeA is declared).
+  using ValTypeD = float;                       // D values: float
+  using ValTypeA = sparse_elem<2, tfloat32_t>;  // A values: tfloat32_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<4, uint8_t>;     // E values: uint8_t wrapped in sparse_elem<4, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = tfloat32_t;                  // B values: plain tfloat32_t, no sparse_elem wrapper
+  using ValTypeC = float;                       // C values: float
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: smem_desc fixed to Major::K
+
+  using Shape_MNK = Shape<_64,_24,_16>;         // M=64, N=24, K=16
+  using ThrID   = Layout<_128>;                 // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x16;          // A layout: dedicated ALayout_64x16 (not ABLayout); presumably a register layout -- confirm
+  using ELayout = GMMA::ELayout_64x16;          // E layout: named for the same 64 (M) x 16 (K) extents
+  using BLayout = GMMA::ABLayout< 24, 16>;      // B layout: ABLayout<24, 16>, i.e. the N and K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x24;          // C layout: named for the 64 (M) x 24 (N) extents
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x40x16 F32TF32TF32 SS_TN op; both fragment types are pinned to Major::K.
+  using ValTypeD = float;                       // D values: float
+  using ValTypeA = sparse_elem<2, tfloat32_t>;  // A values: tfloat32_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<4, uint8_t>;     // E values: uint8_t wrapped in sparse_elem<4, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = tfloat32_t;                  // B values: plain tfloat32_t, no sparse_elem wrapper
+  using ValTypeC = float;                       // C values: float
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: smem_desc fixed to Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: smem_desc fixed to Major::K
+
+  using Shape_MNK = Shape<_64,_40,_16>;         // M=64, N=40, K=16
+  using ThrID   = Layout<_128>;                 // thread-id layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 16>;      // A layout: ABLayout<64, 16>, i.e. the M and K of Shape_MNK
+  using ELayout = GMMA::ELayout_64x16;          // E layout: named for the same 64 (M) x 16 (K) extents
+  using BLayout = GMMA::ABLayout< 40, 16>;      // B layout: ABLayout<40, 16>, i.e. the N and K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x40;          // C layout: named for the 64 (M) x 40 (N) extents
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x40x16 F32TF32TF32 RS_TN op; only B has a fragment type (no FrgTypeA is declared).
+  using ValTypeD = float;                       // D values: float
+  using ValTypeA = sparse_elem<2, tfloat32_t>;  // A values: tfloat32_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<4, uint8_t>;     // E values: uint8_t wrapped in sparse_elem<4, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = tfloat32_t;                  // B values: plain tfloat32_t, no sparse_elem wrapper
+  using ValTypeC = float;                       // C values: float
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: smem_desc fixed to Major::K
+
+  using Shape_MNK = Shape<_64,_40,_16>;         // M=64, N=40, K=16
+  using ThrID   = Layout<_128>;                 // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x16;          // A layout: dedicated ALayout_64x16 (not ABLayout); presumably a register layout -- confirm
+  using ELayout = GMMA::ELayout_64x16;          // E layout: named for the same 64 (M) x 16 (K) extents
+  using BLayout = GMMA::ABLayout< 40, 16>;      // B layout: ABLayout<40, 16>, i.e. the N and K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x40;          // C layout: named for the 64 (M) x 40 (N) extents
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x48x16 F32TF32TF32 SS_TN op; both fragment types are pinned to Major::K.
+  using ValTypeD = float;                       // D values: float
+  using ValTypeA = sparse_elem<2, tfloat32_t>;  // A values: tfloat32_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<4, uint8_t>;     // E values: uint8_t wrapped in sparse_elem<4, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = tfloat32_t;                  // B values: plain tfloat32_t, no sparse_elem wrapper
+  using ValTypeC = float;                       // C values: float
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: smem_desc fixed to Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: smem_desc fixed to Major::K
+
+  using Shape_MNK = Shape<_64,_48,_16>;         // M=64, N=48, K=16
+  using ThrID   = Layout<_128>;                 // thread-id layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 16>;      // A layout: ABLayout<64, 16>, i.e. the M and K of Shape_MNK
+  using ELayout = GMMA::ELayout_64x16;          // E layout: named for the same 64 (M) x 16 (K) extents
+  using BLayout = GMMA::ABLayout< 48, 16>;      // B layout: ABLayout<48, 16>, i.e. the N and K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x48;          // C layout: named for the 64 (M) x 48 (N) extents
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x48x16 F32TF32TF32 RS_TN op; only B has a fragment type (no FrgTypeA is declared).
+  using ValTypeD = float;                       // D values: float
+  using ValTypeA = sparse_elem<2, tfloat32_t>;  // A values: tfloat32_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<4, uint8_t>;     // E values: uint8_t wrapped in sparse_elem<4, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = tfloat32_t;                  // B values: plain tfloat32_t, no sparse_elem wrapper
+  using ValTypeC = float;                       // C values: float
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: smem_desc fixed to Major::K
+
+  using Shape_MNK = Shape<_64,_48,_16>;         // M=64, N=48, K=16
+  using ThrID   = Layout<_128>;                 // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x16;          // A layout: dedicated ALayout_64x16 (not ABLayout); presumably a register layout -- confirm
+  using ELayout = GMMA::ELayout_64x16;          // E layout: named for the same 64 (M) x 16 (K) extents
+  using BLayout = GMMA::ABLayout< 48, 16>;      // B layout: ABLayout<48, 16>, i.e. the N and K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x48;          // C layout: named for the 64 (M) x 48 (N) extents
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x56x16 F32TF32TF32 SS_TN op; both fragment types are pinned to Major::K.
+  using ValTypeD = float;                       // D values: float
+  using ValTypeA = sparse_elem<2, tfloat32_t>;  // A values: tfloat32_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<4, uint8_t>;     // E values: uint8_t wrapped in sparse_elem<4, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = tfloat32_t;                  // B values: plain tfloat32_t, no sparse_elem wrapper
+  using ValTypeC = float;                       // C values: float
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: smem_desc fixed to Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: smem_desc fixed to Major::K
+
+  using Shape_MNK = Shape<_64,_56,_16>;         // M=64, N=56, K=16
+  using ThrID   = Layout<_128>;                 // thread-id layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 16>;      // A layout: ABLayout<64, 16>, i.e. the M and K of Shape_MNK
+  using ELayout = GMMA::ELayout_64x16;          // E layout: named for the same 64 (M) x 16 (K) extents
+  using BLayout = GMMA::ABLayout< 56, 16>;      // B layout: ABLayout<56, 16>, i.e. the N and K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x56;          // C layout: named for the 64 (M) x 56 (N) extents
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x56x16 F32TF32TF32 RS_TN op; only B has a fragment type (no FrgTypeA is declared).
+  using ValTypeD = float;                       // D values: float
+  using ValTypeA = sparse_elem<2, tfloat32_t>;  // A values: tfloat32_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<4, uint8_t>;     // E values: uint8_t wrapped in sparse_elem<4, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = tfloat32_t;                  // B values: plain tfloat32_t, no sparse_elem wrapper
+  using ValTypeC = float;                       // C values: float
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: smem_desc fixed to Major::K
+
+  using Shape_MNK = Shape<_64,_56,_16>;         // M=64, N=56, K=16
+  using ThrID   = Layout<_128>;                 // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x16;          // A layout: dedicated ALayout_64x16 (not ABLayout); presumably a register layout -- confirm
+  using ELayout = GMMA::ELayout_64x16;          // E layout: named for the same 64 (M) x 16 (K) extents
+  using BLayout = GMMA::ABLayout< 56, 16>;      // B layout: ABLayout<56, 16>, i.e. the N and K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x56;          // C layout: named for the 64 (M) x 56 (N) extents
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x72x16 F32TF32TF32 SS_TN op; both fragment types are pinned to Major::K.
+  using ValTypeD = float;                       // D values: float
+  using ValTypeA = sparse_elem<2, tfloat32_t>;  // A values: tfloat32_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<4, uint8_t>;     // E values: uint8_t wrapped in sparse_elem<4, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = tfloat32_t;                  // B values: plain tfloat32_t, no sparse_elem wrapper
+  using ValTypeC = float;                       // C values: float
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: smem_desc fixed to Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: smem_desc fixed to Major::K
+
+  using Shape_MNK = Shape<_64,_72,_16>;         // M=64, N=72, K=16
+  using ThrID   = Layout<_128>;                 // thread-id layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 16>;      // A layout: ABLayout<64, 16>, i.e. the M and K of Shape_MNK
+  using ELayout = GMMA::ELayout_64x16;          // E layout: named for the same 64 (M) x 16 (K) extents
+  using BLayout = GMMA::ABLayout< 72, 16>;      // B layout: ABLayout<72, 16>, i.e. the N and K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x72;          // C layout: named for the 64 (M) x 72 (N) extents
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x72x16 F32TF32TF32 RS_TN op; only B has a fragment type (no FrgTypeA is declared).
+  using ValTypeD = float;                       // D values: float
+  using ValTypeA = sparse_elem<2, tfloat32_t>;  // A values: tfloat32_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<4, uint8_t>;     // E values: uint8_t wrapped in sparse_elem<4, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = tfloat32_t;                  // B values: plain tfloat32_t, no sparse_elem wrapper
+  using ValTypeC = float;                       // C values: float
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: smem_desc fixed to Major::K
+
+  using Shape_MNK = Shape<_64,_72,_16>;         // M=64, N=72, K=16
+  using ThrID   = Layout<_128>;                 // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x16;          // A layout: dedicated ALayout_64x16 (not ABLayout); presumably a register layout -- confirm
+  using ELayout = GMMA::ELayout_64x16;          // E layout: named for the same 64 (M) x 16 (K) extents
+  using BLayout = GMMA::ABLayout< 72, 16>;      // B layout: ABLayout<72, 16>, i.e. the N and K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x72;          // C layout: named for the 64 (M) x 72 (N) extents
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x80x16 F32TF32TF32 SS_TN op; both fragment types are pinned to Major::K.
+  using ValTypeD = float;                       // D values: float
+  using ValTypeA = sparse_elem<2, tfloat32_t>;  // A values: tfloat32_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<4, uint8_t>;     // E values: uint8_t wrapped in sparse_elem<4, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = tfloat32_t;                  // B values: plain tfloat32_t, no sparse_elem wrapper
+  using ValTypeC = float;                       // C values: float
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: smem_desc fixed to Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: smem_desc fixed to Major::K
+
+  using Shape_MNK = Shape<_64,_80,_16>;         // M=64, N=80, K=16
+  using ThrID   = Layout<_128>;                 // thread-id layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 16>;      // A layout: ABLayout<64, 16>, i.e. the M and K of Shape_MNK
+  using ELayout = GMMA::ELayout_64x16;          // E layout: named for the same 64 (M) x 16 (K) extents
+  using BLayout = GMMA::ABLayout< 80, 16>;      // B layout: ABLayout<80, 16>, i.e. the N and K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x80;          // C layout: named for the 64 (M) x 80 (N) extents
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x80x16 F32TF32TF32 RS_TN op; only B has a fragment type (no FrgTypeA is declared).
+  using ValTypeD = float;                       // D values: float
+  using ValTypeA = sparse_elem<2, tfloat32_t>;  // A values: tfloat32_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<4, uint8_t>;     // E values: uint8_t wrapped in sparse_elem<4, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = tfloat32_t;                  // B values: plain tfloat32_t, no sparse_elem wrapper
+  using ValTypeC = float;                       // C values: float
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: smem_desc fixed to Major::K
+
+  using Shape_MNK = Shape<_64,_80,_16>;         // M=64, N=80, K=16
+  using ThrID   = Layout<_128>;                 // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x16;          // A layout: dedicated ALayout_64x16 (not ABLayout); presumably a register layout -- confirm
+  using ELayout = GMMA::ELayout_64x16;          // E layout: named for the same 64 (M) x 16 (K) extents
+  using BLayout = GMMA::ABLayout< 80, 16>;      // B layout: ABLayout<80, 16>, i.e. the N and K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x80;          // C layout: named for the 64 (M) x 80 (N) extents
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x88x16 F32TF32TF32 SS_TN op; both fragment types are pinned to Major::K.
+  using ValTypeD = float;                       // D values: float
+  using ValTypeA = sparse_elem<2, tfloat32_t>;  // A values: tfloat32_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<4, uint8_t>;     // E values: uint8_t wrapped in sparse_elem<4, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = tfloat32_t;                  // B values: plain tfloat32_t, no sparse_elem wrapper
+  using ValTypeC = float;                       // C values: float
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: smem_desc fixed to Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: smem_desc fixed to Major::K
+
+  using Shape_MNK = Shape<_64,_88,_16>;         // M=64, N=88, K=16
+  using ThrID   = Layout<_128>;                 // thread-id layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 16>;      // A layout: ABLayout<64, 16>, i.e. the M and K of Shape_MNK
+  using ELayout = GMMA::ELayout_64x16;          // E layout: named for the same 64 (M) x 16 (K) extents
+  using BLayout = GMMA::ABLayout< 88, 16>;      // B layout: ABLayout<88, 16>, i.e. the N and K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x88;          // C layout: named for the 64 (M) x 88 (N) extents
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x88x16 F32TF32TF32 RS_TN op; only B has a fragment type (no FrgTypeA is declared).
+  using ValTypeD = float;                       // D values: float
+  using ValTypeA = sparse_elem<2, tfloat32_t>;  // A values: tfloat32_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<4, uint8_t>;     // E values: uint8_t wrapped in sparse_elem<4, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = tfloat32_t;                  // B values: plain tfloat32_t, no sparse_elem wrapper
+  using ValTypeC = float;                       // C values: float
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: smem_desc fixed to Major::K
+
+  using Shape_MNK = Shape<_64,_88,_16>;         // M=64, N=88, K=16
+  using ThrID   = Layout<_128>;                 // thread-id layout of extent 128
+  using ALayout = GMMA::ALayout_64x16;          // A layout: dedicated ALayout_64x16 (not ABLayout); presumably a register layout -- confirm
+  using ELayout = GMMA::ELayout_64x16;          // E layout: named for the same 64 (M) x 16 (K) extents
+  using BLayout = GMMA::ABLayout< 88, 16>;      // B layout: ABLayout<88, 16>, i.e. the N and K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x88;          // C layout: named for the 64 (M) x 88 (N) extents
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x104x16 F32TF32TF32 SS_TN op; both fragment types are pinned to Major::K.
+  using ValTypeD = float;                       // D values: float
+  using ValTypeA = sparse_elem<2, tfloat32_t>;  // A values: tfloat32_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<4, uint8_t>;     // E values: uint8_t wrapped in sparse_elem<4, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = tfloat32_t;                  // B values: plain tfloat32_t, no sparse_elem wrapper
+  using ValTypeC = float;                       // C values: float
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: smem_desc fixed to Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: smem_desc fixed to Major::K
+
+  using Shape_MNK = Shape<_64,_104,_16>;        // M=64, N=104, K=16
+  using ThrID   = Layout<_128>;                 // thread-id layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 16>;      // A layout: ABLayout<64, 16>, i.e. the M and K of Shape_MNK
+  using ELayout = GMMA::ELayout_64x16;          // E layout: named for the same 64 (M) x 16 (K) extents
+  using BLayout = GMMA::ABLayout<104, 16>;      // B layout: ABLayout<104, 16>, i.e. the N and K of Shape_MNK
+  using CLayout = GMMA::CLayout_64x104;         // C layout: named for the 64 (M) x 104 (N) extents
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn aScale, GMMA::ScaleIn bScale, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x16_F32TF32TF32_RS_TN<aScale, bScale, sparseSel>>
+{
+  // Value types of the C/D, B, A and E operands.
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeB = tfloat32_t;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;
+  // RS variant: only B declares a fragment type (K-major smem_desc).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x104x16, thread-id layout, and per-operand layouts.
+  using ThrID   = Layout<_128>;
+  using Shape_MNK = Shape<_64,_104,_16>;
+  using CLayout = GMMA::CLayout_64x104;
+  using BLayout = GMMA::ABLayout<104, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using ALayout = GMMA::ALayout_64x16;
+  GMMA::ScaleOut accumulate_{GMMA::ScaleOut::One};  // per-instance member, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn aScale, GMMA::ScaleIn bScale, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x16_F32TF32TF32_SS_TN<aScale, bScale, sparseSel>>
+{
+  // Value types of the C/D, B, A and E operands.
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeB = tfloat32_t;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;
+  // SS variant: both A and B declare a K-major smem_desc fragment type.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x112x16, thread-id layout, and per-operand layouts.
+  using ThrID   = Layout<_128>;
+  using Shape_MNK = Shape<_64,_112,_16>;
+  using CLayout = GMMA::CLayout_64x112;
+  using BLayout = GMMA::ABLayout<112, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  GMMA::ScaleOut accumulate_{GMMA::ScaleOut::One};  // per-instance member, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn aScale, GMMA::ScaleIn bScale, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x16_F32TF32TF32_RS_TN<aScale, bScale, sparseSel>>
+{
+  // Value types of the C/D, B, A and E operands.
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeB = tfloat32_t;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;
+  // RS variant: only B declares a fragment type (K-major smem_desc).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x112x16, thread-id layout, and per-operand layouts.
+  using ThrID   = Layout<_128>;
+  using Shape_MNK = Shape<_64,_112,_16>;
+  using CLayout = GMMA::CLayout_64x112;
+  using BLayout = GMMA::ABLayout<112, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using ALayout = GMMA::ALayout_64x16;
+  GMMA::ScaleOut accumulate_{GMMA::ScaleOut::One};  // per-instance member, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn aScale, GMMA::ScaleIn bScale, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x16_F32TF32TF32_SS_TN<aScale, bScale, sparseSel>>
+{
+  // Value types of the C/D, B, A and E operands.
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeB = tfloat32_t;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;
+  // SS variant: both A and B declare a K-major smem_desc fragment type.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x120x16, thread-id layout, and per-operand layouts.
+  using ThrID   = Layout<_128>;
+  using Shape_MNK = Shape<_64,_120,_16>;
+  using CLayout = GMMA::CLayout_64x120;
+  using BLayout = GMMA::ABLayout<120, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  GMMA::ScaleOut accumulate_{GMMA::ScaleOut::One};  // per-instance member, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn aScale, GMMA::ScaleIn bScale, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x16_F32TF32TF32_RS_TN<aScale, bScale, sparseSel>>
+{
+  // Value types of the C/D, B, A and E operands.
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeB = tfloat32_t;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;
+  // RS variant: only B declares a fragment type (K-major smem_desc).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x120x16, thread-id layout, and per-operand layouts.
+  using ThrID   = Layout<_128>;
+  using Shape_MNK = Shape<_64,_120,_16>;
+  using CLayout = GMMA::CLayout_64x120;
+  using BLayout = GMMA::ABLayout<120, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using ALayout = GMMA::ALayout_64x16;
+  GMMA::ScaleOut accumulate_{GMMA::ScaleOut::One};  // per-instance member, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn aScale, GMMA::ScaleIn bScale, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x16_F32TF32TF32_SS_TN<aScale, bScale, sparseSel>>
+{
+  // Value types of the C/D, B, A and E operands.
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeB = tfloat32_t;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;
+  // SS variant: both A and B declare a K-major smem_desc fragment type.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x136x16, thread-id layout, and per-operand layouts.
+  using ThrID   = Layout<_128>;
+  using Shape_MNK = Shape<_64,_136,_16>;
+  using CLayout = GMMA::CLayout_64x136;
+  using BLayout = GMMA::ABLayout<136, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  GMMA::ScaleOut accumulate_{GMMA::ScaleOut::One};  // per-instance member, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn aScale, GMMA::ScaleIn bScale, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x16_F32TF32TF32_RS_TN<aScale, bScale, sparseSel>>
+{
+  // Value types of the C/D, B, A and E operands.
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeB = tfloat32_t;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;
+  // RS variant: only B declares a fragment type (K-major smem_desc).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x136x16, thread-id layout, and per-operand layouts.
+  using ThrID   = Layout<_128>;
+  using Shape_MNK = Shape<_64,_136,_16>;
+  using CLayout = GMMA::CLayout_64x136;
+  using BLayout = GMMA::ABLayout<136, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using ALayout = GMMA::ALayout_64x16;
+  GMMA::ScaleOut accumulate_{GMMA::ScaleOut::One};  // per-instance member, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn aScale, GMMA::ScaleIn bScale, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x16_F32TF32TF32_SS_TN<aScale, bScale, sparseSel>>
+{
+  // Value types of the C/D, B, A and E operands.
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeB = tfloat32_t;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;
+  // SS variant: both A and B declare a K-major smem_desc fragment type.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x144x16, thread-id layout, and per-operand layouts.
+  using ThrID   = Layout<_128>;
+  using Shape_MNK = Shape<_64,_144,_16>;
+  using CLayout = GMMA::CLayout_64x144;
+  using BLayout = GMMA::ABLayout<144, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  GMMA::ScaleOut accumulate_{GMMA::ScaleOut::One};  // per-instance member, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn aScale, GMMA::ScaleIn bScale, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x16_F32TF32TF32_RS_TN<aScale, bScale, sparseSel>>
+{
+  // Value types of the C/D, B, A and E operands.
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeB = tfloat32_t;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;
+  // RS variant: only B declares a fragment type (K-major smem_desc).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x144x16, thread-id layout, and per-operand layouts.
+  using ThrID   = Layout<_128>;
+  using Shape_MNK = Shape<_64,_144,_16>;
+  using CLayout = GMMA::CLayout_64x144;
+  using BLayout = GMMA::ABLayout<144, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using ALayout = GMMA::ALayout_64x16;
+  GMMA::ScaleOut accumulate_{GMMA::ScaleOut::One};  // per-instance member, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn aScale, GMMA::ScaleIn bScale, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x16_F32TF32TF32_SS_TN<aScale, bScale, sparseSel>>
+{
+  // Value types of the C/D, B, A and E operands.
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeB = tfloat32_t;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;
+  // SS variant: both A and B declare a K-major smem_desc fragment type.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x152x16, thread-id layout, and per-operand layouts.
+  using ThrID   = Layout<_128>;
+  using Shape_MNK = Shape<_64,_152,_16>;
+  using CLayout = GMMA::CLayout_64x152;
+  using BLayout = GMMA::ABLayout<152, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  GMMA::ScaleOut accumulate_{GMMA::ScaleOut::One};  // per-instance member, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn aScale, GMMA::ScaleIn bScale, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x16_F32TF32TF32_RS_TN<aScale, bScale, sparseSel>>
+{
+  // Value types of the C/D, B, A and E operands.
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeB = tfloat32_t;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;
+  // RS variant: only B declares a fragment type (K-major smem_desc).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x152x16, thread-id layout, and per-operand layouts.
+  using ThrID   = Layout<_128>;
+  using Shape_MNK = Shape<_64,_152,_16>;
+  using CLayout = GMMA::CLayout_64x152;
+  using BLayout = GMMA::ABLayout<152, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using ALayout = GMMA::ALayout_64x16;
+  GMMA::ScaleOut accumulate_{GMMA::ScaleOut::One};  // per-instance member, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn aScale, GMMA::ScaleIn bScale, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x16_F32TF32TF32_SS_TN<aScale, bScale, sparseSel>>
+{
+  // Value types of the C/D, B, A and E operands.
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeB = tfloat32_t;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;
+  // SS variant: both A and B declare a K-major smem_desc fragment type.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x160x16, thread-id layout, and per-operand layouts.
+  using ThrID   = Layout<_128>;
+  using Shape_MNK = Shape<_64,_160,_16>;
+  using CLayout = GMMA::CLayout_64x160;
+  using BLayout = GMMA::ABLayout<160, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  GMMA::ScaleOut accumulate_{GMMA::ScaleOut::One};  // per-instance member, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn aScale, GMMA::ScaleIn bScale, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x16_F32TF32TF32_RS_TN<aScale, bScale, sparseSel>>
+{
+  // Value types of the C/D, B, A and E operands.
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeB = tfloat32_t;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;
+  // RS variant: only B declares a fragment type (K-major smem_desc).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x160x16, thread-id layout, and per-operand layouts.
+  using ThrID   = Layout<_128>;
+  using Shape_MNK = Shape<_64,_160,_16>;
+  using CLayout = GMMA::CLayout_64x160;
+  using BLayout = GMMA::ABLayout<160, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using ALayout = GMMA::ALayout_64x16;
+  GMMA::ScaleOut accumulate_{GMMA::ScaleOut::One};  // per-instance member, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn aScale, GMMA::ScaleIn bScale, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x16_F32TF32TF32_SS_TN<aScale, bScale, sparseSel>>
+{
+  // Value types of the C/D, B, A and E operands.
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeB = tfloat32_t;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;
+  // SS variant: both A and B declare a K-major smem_desc fragment type.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x168x16, thread-id layout, and per-operand layouts.
+  using ThrID   = Layout<_128>;
+  using Shape_MNK = Shape<_64,_168,_16>;
+  using CLayout = GMMA::CLayout_64x168;
+  using BLayout = GMMA::ABLayout<168, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  GMMA::ScaleOut accumulate_{GMMA::ScaleOut::One};  // per-instance member, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn aScale, GMMA::ScaleIn bScale, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x16_F32TF32TF32_RS_TN<aScale, bScale, sparseSel>>
+{
+  // Value types of the C/D, B, A and E operands.
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeB = tfloat32_t;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;
+  // RS variant: only B declares a fragment type (K-major smem_desc).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x168x16, thread-id layout, and per-operand layouts.
+  using ThrID   = Layout<_128>;
+  using Shape_MNK = Shape<_64,_168,_16>;
+  using CLayout = GMMA::CLayout_64x168;
+  using BLayout = GMMA::ABLayout<168, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using ALayout = GMMA::ALayout_64x16;
+  GMMA::ScaleOut accumulate_{GMMA::ScaleOut::One};  // per-instance member, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn aScale, GMMA::ScaleIn bScale, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x16_F32TF32TF32_SS_TN<aScale, bScale, sparseSel>>
+{
+  // Value types of the C/D, B, A and E operands.
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeB = tfloat32_t;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;
+  // SS variant: both A and B declare a K-major smem_desc fragment type.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x176x16, thread-id layout, and per-operand layouts.
+  using ThrID   = Layout<_128>;
+  using Shape_MNK = Shape<_64,_176,_16>;
+  using CLayout = GMMA::CLayout_64x176;
+  using BLayout = GMMA::ABLayout<176, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  GMMA::ScaleOut accumulate_{GMMA::ScaleOut::One};  // per-instance member, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn aScale, GMMA::ScaleIn bScale, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x16_F32TF32TF32_RS_TN<aScale, bScale, sparseSel>>
+{
+  // Value types of the C/D, B, A and E operands.
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeB = tfloat32_t;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;
+  // RS variant: only B declares a fragment type (K-major smem_desc).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x176x16, thread-id layout, and per-operand layouts.
+  using ThrID   = Layout<_128>;
+  using Shape_MNK = Shape<_64,_176,_16>;
+  using CLayout = GMMA::CLayout_64x176;
+  using BLayout = GMMA::ABLayout<176, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using ALayout = GMMA::ALayout_64x16;
+  GMMA::ScaleOut accumulate_{GMMA::ScaleOut::One};  // per-instance member, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn aScale, GMMA::ScaleIn bScale, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x16_F32TF32TF32_SS_TN<aScale, bScale, sparseSel>>
+{
+  // Value types of the C/D, B, A and E operands.
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeB = tfloat32_t;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;
+  // SS variant: both A and B declare a K-major smem_desc fragment type.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x184x16, thread-id layout, and per-operand layouts.
+  using ThrID   = Layout<_128>;
+  using Shape_MNK = Shape<_64,_184,_16>;
+  using CLayout = GMMA::CLayout_64x184;
+  using BLayout = GMMA::ABLayout<184, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  GMMA::ScaleOut accumulate_{GMMA::ScaleOut::One};  // per-instance member, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn aScale, GMMA::ScaleIn bScale, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x16_F32TF32TF32_RS_TN<aScale, bScale, sparseSel>>
+{
+  // Value types of the C/D, B, A and E operands.
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeB = tfloat32_t;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;
+  // RS variant: only B declares a fragment type (K-major smem_desc).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x184x16, thread-id layout, and per-operand layouts.
+  using ThrID   = Layout<_128>;
+  using Shape_MNK = Shape<_64,_184,_16>;
+  using CLayout = GMMA::CLayout_64x184;
+  using BLayout = GMMA::ABLayout<184, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using ALayout = GMMA::ALayout_64x16;
+  GMMA::ScaleOut accumulate_{GMMA::ScaleOut::One};  // per-instance member, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn aScale, GMMA::ScaleIn bScale, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x16_F32TF32TF32_SS_TN<aScale, bScale, sparseSel>>
+{
+  // Value types of the C/D, B, A and E operands.
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeB = tfloat32_t;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;
+  // SS variant: both A and B declare a K-major smem_desc fragment type.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x200x16, thread-id layout, and per-operand layouts.
+  using ThrID   = Layout<_128>;
+  using Shape_MNK = Shape<_64,_200,_16>;
+  using CLayout = GMMA::CLayout_64x200;
+  using BLayout = GMMA::ABLayout<200, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  GMMA::ScaleOut accumulate_{GMMA::ScaleOut::One};  // per-instance member, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn aScale, GMMA::ScaleIn bScale, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x16_F32TF32TF32_RS_TN<aScale, bScale, sparseSel>>
+{
+  // Value types of the C/D, B, A and E operands.
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeB = tfloat32_t;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;
+  // RS variant: only B declares a fragment type (K-major smem_desc).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x200x16, thread-id layout, and per-operand layouts.
+  using ThrID   = Layout<_128>;
+  using Shape_MNK = Shape<_64,_200,_16>;
+  using CLayout = GMMA::CLayout_64x200;
+  using BLayout = GMMA::ABLayout<200, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using ALayout = GMMA::ALayout_64x16;
+  GMMA::ScaleOut accumulate_{GMMA::ScaleOut::One};  // per-instance member, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn aScale, GMMA::ScaleIn bScale, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x16_F32TF32TF32_SS_TN<aScale, bScale, sparseSel>>
+{
+  // Value types of the C/D, B, A and E operands.
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeB = tfloat32_t;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;
+  // SS variant: both A and B declare a K-major smem_desc fragment type.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x208x16, thread-id layout, and per-operand layouts.
+  using ThrID   = Layout<_128>;
+  using Shape_MNK = Shape<_64,_208,_16>;
+  using CLayout = GMMA::CLayout_64x208;
+  using BLayout = GMMA::ABLayout<208, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  GMMA::ScaleOut accumulate_{GMMA::ScaleOut::One};  // per-instance member, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn aScale, GMMA::ScaleIn bScale, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x16_F32TF32TF32_RS_TN<aScale, bScale, sparseSel>>
+{
+  // Value types of the C/D, B, A and E operands.
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeB = tfloat32_t;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;
+  // RS variant: only B declares a fragment type (K-major smem_desc).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x208x16, thread-id layout, and per-operand layouts.
+  using ThrID   = Layout<_128>;
+  using Shape_MNK = Shape<_64,_208,_16>;
+  using CLayout = GMMA::CLayout_64x208;
+  using BLayout = GMMA::ABLayout<208, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using ALayout = GMMA::ALayout_64x16;
+  GMMA::ScaleOut accumulate_{GMMA::ScaleOut::One};  // per-instance member, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn aScale, GMMA::ScaleIn bScale, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x16_F32TF32TF32_SS_TN<aScale, bScale, sparseSel>>
+{
+  // Value types of the C/D, B, A and E operands.
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeB = tfloat32_t;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;
+  // SS variant: both A and B declare a K-major smem_desc fragment type.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x216x16, thread-id layout, and per-operand layouts.
+  using ThrID   = Layout<_128>;
+  using Shape_MNK = Shape<_64,_216,_16>;
+  using CLayout = GMMA::CLayout_64x216;
+  using BLayout = GMMA::ABLayout<216, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  GMMA::ScaleOut accumulate_{GMMA::ScaleOut::One};  // per-instance member, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn aScale, GMMA::ScaleIn bScale, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x16_F32TF32TF32_RS_TN<aScale, bScale, sparseSel>>
+{
+  // Value types of the C/D, B, A and E operands.
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeB = tfloat32_t;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;
+  // RS variant: only B declares a fragment type (K-major smem_desc).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x216x16, thread-id layout, and per-operand layouts.
+  using ThrID   = Layout<_128>;
+  using Shape_MNK = Shape<_64,_216,_16>;
+  using CLayout = GMMA::CLayout_64x216;
+  using BLayout = GMMA::ABLayout<216, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using ALayout = GMMA::ALayout_64x16;
+  GMMA::ScaleOut accumulate_{GMMA::ScaleOut::One};  // per-instance member, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn aScale, GMMA::ScaleIn bScale, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x16_F32TF32TF32_SS_TN<aScale, bScale, sparseSel>>
+{
+  // Value types of the C/D, B, A and E operands.
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeB = tfloat32_t;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;
+  // SS variant: both A and B declare a K-major smem_desc fragment type.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x224x16, thread-id layout, and per-operand layouts.
+  using ThrID   = Layout<_128>;
+  using Shape_MNK = Shape<_64,_224,_16>;
+  using CLayout = GMMA::CLayout_64x224;
+  using BLayout = GMMA::ABLayout<224, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  GMMA::ScaleOut accumulate_{GMMA::ScaleOut::One};  // per-instance member, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn aScale, GMMA::ScaleIn bScale, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x16_F32TF32TF32_RS_TN<aScale, bScale, sparseSel>>
+{
+  // Value types of the C/D, B, A and E operands.
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeB = tfloat32_t;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;
+  // RS variant: only B declares a fragment type (K-major smem_desc).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x224x16, thread-id layout, and per-operand layouts.
+  using ThrID   = Layout<_128>;
+  using Shape_MNK = Shape<_64,_224,_16>;
+  using CLayout = GMMA::CLayout_64x224;
+  using BLayout = GMMA::ABLayout<224, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using ALayout = GMMA::ALayout_64x16;
+  GMMA::ScaleOut accumulate_{GMMA::ScaleOut::One};  // per-instance member, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn aScale, GMMA::ScaleIn bScale, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x16_F32TF32TF32_SS_TN<aScale, bScale, sparseSel>>
+{
+  // Value types of the C/D, B, A and E operands.
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeB = tfloat32_t;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;
+  // SS variant: both A and B declare a K-major smem_desc fragment type.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x232x16, thread-id layout, and per-operand layouts.
+  using ThrID   = Layout<_128>;
+  using Shape_MNK = Shape<_64,_232,_16>;
+  using CLayout = GMMA::CLayout_64x232;
+  using BLayout = GMMA::ABLayout<232, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  GMMA::ScaleOut accumulate_{GMMA::ScaleOut::One};  // per-instance member, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn aScale, GMMA::ScaleIn bScale, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x16_F32TF32TF32_RS_TN<aScale, bScale, sparseSel>>
+{
+  // Value types of the C/D, B, A and E operands.
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeB = tfloat32_t;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;
+  // RS variant: only B declares a fragment type (K-major smem_desc).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x232x16, thread-id layout, and per-operand layouts.
+  using ThrID   = Layout<_128>;
+  using Shape_MNK = Shape<_64,_232,_16>;
+  using CLayout = GMMA::CLayout_64x232;
+  using BLayout = GMMA::ABLayout<232, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using ALayout = GMMA::ALayout_64x16;
+  GMMA::ScaleOut accumulate_{GMMA::ScaleOut::One};  // per-instance member, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn aScale, GMMA::ScaleIn bScale, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x16_F32TF32TF32_SS_TN<aScale, bScale, sparseSel>>
+{
+  // Value types of the C/D, B, A and E operands.
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeB = tfloat32_t;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;
+  // SS variant: both A and B declare a K-major smem_desc fragment type.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x240x16, thread-id layout, and per-operand layouts.
+  using ThrID   = Layout<_128>;
+  using Shape_MNK = Shape<_64,_240,_16>;
+  using CLayout = GMMA::CLayout_64x240;
+  using BLayout = GMMA::ABLayout<240, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  GMMA::ScaleOut accumulate_{GMMA::ScaleOut::One};  // per-instance member, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn aScale, GMMA::ScaleIn bScale, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x16_F32TF32TF32_RS_TN<aScale, bScale, sparseSel>>
+{
+  // Value types of the C/D, B, A and E operands.
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeB = tfloat32_t;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;
+  // RS variant: only B declares a fragment type (K-major smem_desc).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x240x16, thread-id layout, and per-operand layouts.
+  using ThrID   = Layout<_128>;
+  using Shape_MNK = Shape<_64,_240,_16>;
+  using CLayout = GMMA::CLayout_64x240;
+  using BLayout = GMMA::ABLayout<240, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using ALayout = GMMA::ALayout_64x16;
+  GMMA::ScaleOut accumulate_{GMMA::ScaleOut::One};  // per-instance member, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn aScale, GMMA::ScaleIn bScale, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x16_F32TF32TF32_SS_TN<aScale, bScale, sparseSel>>
+{
+  // Value types of the C/D, B, A and E operands.
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeB = tfloat32_t;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;
+  // SS variant: both A and B declare a K-major smem_desc fragment type.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x248x16, thread-id layout, and per-operand layouts.
+  using ThrID   = Layout<_128>;
+  using Shape_MNK = Shape<_64,_248,_16>;
+  using CLayout = GMMA::CLayout_64x248;
+  using BLayout = GMMA::ABLayout<248, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using ALayout = GMMA::ABLayout< 64, 16>;
+  GMMA::ScaleOut accumulate_{GMMA::ScaleOut::One};  // per-instance member, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn aScale, GMMA::ScaleIn bScale, GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x16_F32TF32TF32_RS_TN<aScale, bScale, sparseSel>>
+{
+  // Value types of the C/D, B, A and E operands.
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeB = tfloat32_t;
+  using ValTypeA = sparse_elem<2, tfloat32_t>;
+  using ValTypeE = sparse_elem<4, uint8_t>;
+  // RS variant: only B declares a fragment type (K-major smem_desc).
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x248x16, thread-id layout, and per-operand layouts.
+  using ThrID   = Layout<_128>;
+  using Shape_MNK = Shape<_64,_248,_16>;
+  using CLayout = GMMA::CLayout_64x248;
+  using BLayout = GMMA::ABLayout<248, 16>;
+  using ELayout = GMMA::ELayout_64x16;
+  using ALayout = GMMA::ALayout_64x16;
+  GMMA::ScaleOut accumulate_{GMMA::ScaleOut::One};  // per-instance member, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8S8_SS_TN<sparseSel>>
+{
+  // Value types of the C/D, B, A and E operands.
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeB = int8_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // SS variant: both A and B declare a K-major smem_desc fragment type.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x24x64, thread-id layout, and per-operand layouts.
+  using ThrID   = Layout<_128>;
+  using Shape_MNK = Shape<_64,_24,_64>;
+  using CLayout = GMMA::CLayout_64x24;
+  using BLayout = GMMA::ABLayout< 24, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  GMMA::ScaleOut accumulate_{GMMA::ScaleOut::One};  // per-instance member, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8S8_SS_TN_SATURATE<sparseSel>>
+{
+  // Value types of the C/D, B, A and E operands.
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeB = int8_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // SS variant: both A and B declare a K-major smem_desc fragment type.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x24x64, thread-id layout, and per-operand layouts.
+  using ThrID   = Layout<_128>;
+  using Shape_MNK = Shape<_64,_24,_64>;
+  using CLayout = GMMA::CLayout_64x24;
+  using BLayout = GMMA::ABLayout< 24, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  GMMA::ScaleOut accumulate_{GMMA::ScaleOut::One};  // per-instance member, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8S8_SS_TN<sparseSel>>
+{
+  // Value types of the C/D, B, A and E operands.
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeB = int8_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // SS variant: both A and B declare a K-major smem_desc fragment type.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x48x64, thread-id layout, and per-operand layouts.
+  using ThrID   = Layout<_128>;
+  using Shape_MNK = Shape<_64,_48,_64>;
+  using CLayout = GMMA::CLayout_64x48;
+  using BLayout = GMMA::ABLayout< 48, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  GMMA::ScaleOut accumulate_{GMMA::ScaleOut::One};  // per-instance member, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8S8_SS_TN_SATURATE<sparseSel>>
+{
+  // Value types of the C/D, B, A and E operands.
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeB = int8_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // SS variant: both A and B declare a K-major smem_desc fragment type.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x48x64, thread-id layout, and per-operand layouts.
+  using ThrID   = Layout<_128>;
+  using Shape_MNK = Shape<_64,_48,_64>;
+  using CLayout = GMMA::CLayout_64x48;
+  using BLayout = GMMA::ABLayout< 48, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  GMMA::ScaleOut accumulate_{GMMA::ScaleOut::One};  // per-instance member, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel sparseSel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8S8_SS_TN<sparseSel>>
+{
+  // Value types of the C/D, B, A and E operands.
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeB = int8_t;
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // SS variant: both A and B declare a K-major smem_desc fragment type.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  // MNK shape 64x80x64, thread-id layout, and per-operand layouts.
+  using ThrID   = Layout<_128>;
+  using Shape_MNK = Shape<_64,_80,_64>;
+  using CLayout = GMMA::CLayout_64x80;
+  using BLayout = GMMA::ABLayout< 80, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  GMMA::ScaleOut accumulate_{GMMA::ScaleOut::One};  // per-instance member, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8S8_SS_TN_SATURATE<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_80,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<80, 64>;
+  using CLayout   = GMMA::CLayout_64x80;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Fragment types for A and B: both are K-major smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, int8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8S8_SS_TN<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_112,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<112, 64>;
+  using CLayout   = GMMA::CLayout_64x112;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Fragment types for A and B: both are K-major smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, int8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8S8_SS_TN_SATURATE<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_112,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<112, 64>;
+  using CLayout   = GMMA::CLayout_64x112;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Fragment types for A and B: both are K-major smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, int8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8S8_SS_TN<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_144,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<144, 64>;
+  using CLayout   = GMMA::CLayout_64x144;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Fragment types for A and B: both are K-major smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, int8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8S8_SS_TN_SATURATE<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_144,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<144, 64>;
+  using CLayout   = GMMA::CLayout_64x144;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Fragment types for A and B: both are K-major smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, int8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8S8_SS_TN<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_160,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<160, 64>;
+  using CLayout   = GMMA::CLayout_64x160;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Fragment types for A and B: both are K-major smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, int8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8S8_SS_TN_SATURATE<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_160,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<160, 64>;
+  using CLayout   = GMMA::CLayout_64x160;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Fragment types for A and B: both are K-major smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, int8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8S8_SS_TN<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_176,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<176, 64>;
+  using CLayout   = GMMA::CLayout_64x176;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Fragment types for A and B: both are K-major smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, int8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8S8_SS_TN_SATURATE<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_176,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<176, 64>;
+  using CLayout   = GMMA::CLayout_64x176;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Fragment types for A and B: both are K-major smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, int8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8S8_SS_TN<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_208,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<208, 64>;
+  using CLayout   = GMMA::CLayout_64x208;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Fragment types for A and B: both are K-major smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, int8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8S8_SS_TN_SATURATE<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_208,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<208, 64>;
+  using CLayout   = GMMA::CLayout_64x208;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Fragment types for A and B: both are K-major smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, int8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8S8_SS_TN<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_224,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<224, 64>;
+  using CLayout   = GMMA::CLayout_64x224;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Fragment types for A and B: both are K-major smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, int8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8S8_SS_TN_SATURATE<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_224,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<224, 64>;
+  using CLayout   = GMMA::CLayout_64x224;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Fragment types for A and B: both are K-major smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, int8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8S8_SS_TN<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_240,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<240, 64>;
+  using CLayout   = GMMA::CLayout_64x240;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Fragment types for A and B: both are K-major smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, int8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8S8_SS_TN_SATURATE<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_240,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<240, 64>;
+  using CLayout   = GMMA::CLayout_64x240;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Fragment types for A and B: both are K-major smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, int8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8S8_RS_TN<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_24,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<24, 64>;
+  using CLayout   = GMMA::CLayout_64x24;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Only B has a fragment type here (K-major smem_desc); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, int8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8S8_RS_TN_SATURATE<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_24,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<24, 64>;
+  using CLayout   = GMMA::CLayout_64x24;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Only B has a fragment type here (K-major smem_desc); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, int8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8S8_RS_TN<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_48,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<48, 64>;
+  using CLayout   = GMMA::CLayout_64x48;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Only B has a fragment type here (K-major smem_desc); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, int8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8S8_RS_TN_SATURATE<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_48,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<48, 64>;
+  using CLayout   = GMMA::CLayout_64x48;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Only B has a fragment type here (K-major smem_desc); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, int8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8S8_RS_TN<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_80,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<80, 64>;
+  using CLayout   = GMMA::CLayout_64x80;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Only B has a fragment type here (K-major smem_desc); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, int8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8S8_RS_TN_SATURATE<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_80,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<80, 64>;
+  using CLayout   = GMMA::CLayout_64x80;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Only B has a fragment type here (K-major smem_desc); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, int8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8S8_RS_TN<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_112,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<112, 64>;
+  using CLayout   = GMMA::CLayout_64x112;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Only B has a fragment type here (K-major smem_desc); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, int8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8S8_RS_TN_SATURATE<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_112,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<112, 64>;
+  using CLayout   = GMMA::CLayout_64x112;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Only B has a fragment type here (K-major smem_desc); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, int8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8S8_RS_TN<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_144,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<144, 64>;
+  using CLayout   = GMMA::CLayout_64x144;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Only B has a fragment type here (K-major smem_desc); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, int8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8S8_RS_TN_SATURATE<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_144,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<144, 64>;
+  using CLayout   = GMMA::CLayout_64x144;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Only B has a fragment type here (K-major smem_desc); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, int8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8S8_RS_TN<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_160,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<160, 64>;
+  using CLayout   = GMMA::CLayout_64x160;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Only B has a fragment type here (K-major smem_desc); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, int8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8S8_RS_TN_SATURATE<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_160,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<160, 64>;
+  using CLayout   = GMMA::CLayout_64x160;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Only B has a fragment type here (K-major smem_desc); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, int8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8S8_RS_TN<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_176,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<176, 64>;
+  using CLayout   = GMMA::CLayout_64x176;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Only B has a fragment type here (K-major smem_desc); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, int8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8S8_RS_TN_SATURATE<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_176,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<176, 64>;
+  using CLayout   = GMMA::CLayout_64x176;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Only B has a fragment type here (K-major smem_desc); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, int8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8S8_RS_TN<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_208,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<208, 64>;
+  using CLayout   = GMMA::CLayout_64x208;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Only B has a fragment type here (K-major smem_desc); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, int8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8S8_RS_TN_SATURATE<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_208,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<208, 64>;
+  using CLayout   = GMMA::CLayout_64x208;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Only B has a fragment type here (K-major smem_desc); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, int8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8S8_RS_TN<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_224,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<224, 64>;
+  using CLayout   = GMMA::CLayout_64x224;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Only B has a fragment type here (K-major smem_desc); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, int8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8S8_RS_TN_SATURATE<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_224,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<224, 64>;
+  using CLayout   = GMMA::CLayout_64x224;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Only B has a fragment type here (K-major smem_desc); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, int8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8S8_RS_TN<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_240,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<240, 64>;
+  using CLayout   = GMMA::CLayout_64x240;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Only B has a fragment type here (K-major smem_desc); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, int8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8S8_RS_TN_SATURATE<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_240,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using BLayout   = GMMA::ABLayout<240, 64>;
+  using CLayout   = GMMA::CLayout_64x240;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Only B has a fragment type here (K-major smem_desc); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, int8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = int8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8U8_SS_TN<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_24,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<24, 64>;
+  using CLayout   = GMMA::CLayout_64x24;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Fragment types for A and B: both are K-major smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, uint8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8U8_SS_TN_SATURATE<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_24,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<24, 64>;
+  using CLayout   = GMMA::CLayout_64x24;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Fragment types for A and B: both are K-major smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, uint8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8U8_SS_TN<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_48,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<48, 64>;
+  using CLayout   = GMMA::CLayout_64x48;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Fragment types for A and B: both are K-major smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, uint8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8U8_SS_TN_SATURATE<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_48,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<48, 64>;
+  using CLayout   = GMMA::CLayout_64x48;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Fragment types for A and B: both are K-major smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, uint8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8U8_SS_TN<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_80,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<80, 64>;
+  using CLayout   = GMMA::CLayout_64x80;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Fragment types for A and B: both are K-major smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, uint8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8U8_SS_TN_SATURATE<spsel>>
+{
+  // Atom tile shape, thread-id layout, and the A/B/C/E operand layouts.
+  using Shape_MNK = Shape<_64,_80,_64>;
+  using ThrID     = Layout<_128>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using BLayout   = GMMA::ABLayout<80, 64>;
+  using CLayout   = GMMA::CLayout_64x80;
+  using ELayout   = GMMA::ELayout_64x64;
+  // Fragment types for A and B: both are K-major smem_desc.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Element types: int32_t for C/D, uint8_t for B, sparse_elem wrappers for A and E.
+  using ValTypeA = sparse_elem<2, int8_t>;
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+  using ValTypeD = int32_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // default-initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8U8_SS_TN<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_112,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, int8_t>, K-major smem descriptor) and its E companion.
+  using ValTypeA  = sparse_elem<2, int8_t>;
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout< 64, 64>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (uint8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = uint8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout<112, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x112;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8U8_SS_TN_SATURATE<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_112,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, int8_t>, K-major smem descriptor) and its E companion.
+  using ValTypeA  = sparse_elem<2, int8_t>;
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout< 64, 64>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (uint8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = uint8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout<112, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x112;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8U8_SS_TN<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_144,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, int8_t>, K-major smem descriptor) and its E companion.
+  using ValTypeA  = sparse_elem<2, int8_t>;
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout< 64, 64>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (uint8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = uint8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout<144, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x144;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8U8_SS_TN_SATURATE<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_144,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, int8_t>, K-major smem descriptor) and its E companion.
+  using ValTypeA  = sparse_elem<2, int8_t>;
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout< 64, 64>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (uint8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = uint8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout<144, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x144;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8U8_SS_TN<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_160,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, int8_t>, K-major smem descriptor) and its E companion.
+  using ValTypeA  = sparse_elem<2, int8_t>;
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout< 64, 64>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (uint8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = uint8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout<160, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x160;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8U8_SS_TN_SATURATE<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_160,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, int8_t>, K-major smem descriptor) and its E companion.
+  using ValTypeA  = sparse_elem<2, int8_t>;
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout< 64, 64>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (uint8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = uint8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout<160, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x160;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8U8_SS_TN<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_176,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, int8_t>, K-major smem descriptor) and its E companion.
+  using ValTypeA  = sparse_elem<2, int8_t>;
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout< 64, 64>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (uint8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = uint8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout<176, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x176;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8U8_SS_TN_SATURATE<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_176,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, int8_t>, K-major smem descriptor) and its E companion.
+  using ValTypeA  = sparse_elem<2, int8_t>;
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout< 64, 64>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (uint8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = uint8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout<176, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x176;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8U8_SS_TN<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_208,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, int8_t>, K-major smem descriptor) and its E companion.
+  using ValTypeA  = sparse_elem<2, int8_t>;
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout< 64, 64>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (uint8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = uint8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout<208, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x208;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8U8_SS_TN_SATURATE<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_208,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, int8_t>, K-major smem descriptor) and its E companion.
+  using ValTypeA  = sparse_elem<2, int8_t>;
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout< 64, 64>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (uint8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = uint8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout<208, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x208;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8U8_SS_TN<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_224,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, int8_t>, K-major smem descriptor) and its E companion.
+  using ValTypeA  = sparse_elem<2, int8_t>;
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout< 64, 64>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (uint8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = uint8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout<224, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x224;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8U8_SS_TN_SATURATE<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_224,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, int8_t>, K-major smem descriptor) and its E companion.
+  using ValTypeA  = sparse_elem<2, int8_t>;
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout< 64, 64>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (uint8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = uint8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout<224, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x224;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8U8_SS_TN<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_240,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, int8_t>, K-major smem descriptor) and its E companion.
+  using ValTypeA  = sparse_elem<2, int8_t>;
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout< 64, 64>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (uint8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = uint8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout<240, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x240;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8U8_SS_TN_SATURATE<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_240,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, int8_t>, K-major smem descriptor) and its E companion.
+  using ValTypeA  = sparse_elem<2, int8_t>;
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout< 64, 64>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (uint8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = uint8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout<240, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x240;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8U8_RS_TN<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_24,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, int8_t>; no FrgTypeA alias in this RS form) and its E companion.
+  using ValTypeA  = sparse_elem<2, int8_t>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (uint8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = uint8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout< 24, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x24;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8U8_RS_TN_SATURATE<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_24,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, int8_t>; no FrgTypeA alias in this RS form) and its E companion.
+  using ValTypeA  = sparse_elem<2, int8_t>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (uint8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = uint8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout< 24, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x24;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8U8_RS_TN<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_48,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, int8_t>; no FrgTypeA alias in this RS form) and its E companion.
+  using ValTypeA  = sparse_elem<2, int8_t>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (uint8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = uint8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout< 48, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x48;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8U8_RS_TN_SATURATE<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_48,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, int8_t>; no FrgTypeA alias in this RS form) and its E companion.
+  using ValTypeA  = sparse_elem<2, int8_t>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (uint8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = uint8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout< 48, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x48;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8U8_RS_TN<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_80,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, int8_t>; no FrgTypeA alias in this RS form) and its E companion.
+  using ValTypeA  = sparse_elem<2, int8_t>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (uint8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = uint8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout< 80, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x80;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8U8_RS_TN_SATURATE<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_80,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, int8_t>; no FrgTypeA alias in this RS form) and its E companion.
+  using ValTypeA  = sparse_elem<2, int8_t>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (uint8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = uint8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout< 80, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x80;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8U8_RS_TN<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_112,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, int8_t>; no FrgTypeA alias in this RS form) and its E companion.
+  using ValTypeA  = sparse_elem<2, int8_t>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (uint8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = uint8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout<112, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x112;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8U8_RS_TN_SATURATE<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_112,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, int8_t>; no FrgTypeA alias in this RS form) and its E companion.
+  using ValTypeA  = sparse_elem<2, int8_t>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (uint8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = uint8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout<112, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x112;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8U8_RS_TN<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_144,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, int8_t>; no FrgTypeA alias in this RS form) and its E companion.
+  using ValTypeA  = sparse_elem<2, int8_t>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (uint8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = uint8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout<144, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x144;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8U8_RS_TN_SATURATE<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_144,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, int8_t>; no FrgTypeA alias in this RS form) and its E companion.
+  using ValTypeA  = sparse_elem<2, int8_t>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (uint8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = uint8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout<144, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x144;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8U8_RS_TN<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_160,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, int8_t>; no FrgTypeA alias in this RS form) and its E companion.
+  using ValTypeA  = sparse_elem<2, int8_t>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (uint8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = uint8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout<160, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x160;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8U8_RS_TN_SATURATE<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_160,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, int8_t>; no FrgTypeA alias in this RS form) and its E companion.
+  using ValTypeA  = sparse_elem<2, int8_t>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (uint8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = uint8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout<160, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x160;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8U8_RS_TN<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_176,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, int8_t>; no FrgTypeA alias in this RS form) and its E companion.
+  using ValTypeA  = sparse_elem<2, int8_t>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (uint8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = uint8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout<176, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x176;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8U8_RS_TN_SATURATE<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_176,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, int8_t>; no FrgTypeA alias in this RS form) and its E companion.
+  using ValTypeA  = sparse_elem<2, int8_t>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (uint8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = uint8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout<176, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x176;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8U8_RS_TN<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_208,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, int8_t>; no FrgTypeA alias in this RS form) and its E companion.
+  using ValTypeA  = sparse_elem<2, int8_t>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (uint8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = uint8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout<208, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x208;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8U8_RS_TN_SATURATE<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_208,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, int8_t>; no FrgTypeA alias in this RS form) and its E companion.
+  using ValTypeA  = sparse_elem<2, int8_t>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (uint8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = uint8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout<208, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x208;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8U8_RS_TN<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_224,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, int8_t>; no FrgTypeA alias in this RS form) and its E companion.
+  using ValTypeA  = sparse_elem<2, int8_t>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (uint8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = uint8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout<224, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x224;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8U8_RS_TN_SATURATE<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_224,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, int8_t>; no FrgTypeA alias in this RS form) and its E companion.
+  using ValTypeA  = sparse_elem<2, int8_t>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (uint8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = uint8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout<224, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x224;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8U8_RS_TN<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_240,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, int8_t>; no FrgTypeA alias in this RS form) and its E companion.
+  using ValTypeA  = sparse_elem<2, int8_t>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (uint8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = uint8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout<240, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x240;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8U8_RS_TN_SATURATE<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_240,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, int8_t>; no FrgTypeA alias in this RS form) and its E companion.
+  using ValTypeA  = sparse_elem<2, int8_t>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (uint8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = uint8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout<240, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x240;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8S8_SS_TN<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_24,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, uint8_t>, K-major smem descriptor) and its E companion.
+  using ValTypeA  = sparse_elem<2, uint8_t>;
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout< 64, 64>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (int8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = int8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout< 24, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x24;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8S8_SS_TN_SATURATE<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_24,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, uint8_t>, K-major smem descriptor) and its E companion.
+  using ValTypeA  = sparse_elem<2, uint8_t>;
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout< 64, 64>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (int8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = int8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout< 24, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x24;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8S8_SS_TN<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_48,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, uint8_t>, K-major smem descriptor) and its E companion.
+  using ValTypeA  = sparse_elem<2, uint8_t>;
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout< 64, 64>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (int8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = int8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout< 48, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x48;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8S8_SS_TN_SATURATE<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_48,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, uint8_t>, K-major smem descriptor) and its E companion.
+  using ValTypeA  = sparse_elem<2, uint8_t>;
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout< 64, 64>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (int8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = int8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout< 48, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x48;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8S8_SS_TN<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_80,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, uint8_t>, K-major smem descriptor) and its E companion.
+  using ValTypeA  = sparse_elem<2, uint8_t>;
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout< 64, 64>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (int8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = int8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout< 80, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x80;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8S8_SS_TN_SATURATE<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_80,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, uint8_t>, K-major smem descriptor) and its E companion.
+  using ValTypeA  = sparse_elem<2, uint8_t>;
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout< 64, 64>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (int8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = int8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout< 80, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x80;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8S8_SS_TN<spsel>>
+{
+  // Tile extents (M, N, K) and the 128-entry thread-ID layout.
+  using Shape_MNK = Shape<_64,_112,_64>;
+  using ThrID     = Layout<_128>;
+  // A operand (sparse_elem<2, uint8_t>, K-major smem descriptor) and its E companion.
+  using ValTypeA  = sparse_elem<2, uint8_t>;
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout< 64, 64>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ELayout   = GMMA::ELayout_64x64;
+  // B operand (int8_t, K-major smem descriptor), then the int32_t C/D pair.
+  using ValTypeB  = int8_t;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using BLayout   = GMMA::ABLayout<112, 64>;
+  using ValTypeC  = int32_t;
+  using ValTypeD  = int32_t;
+  using CLayout   = GMMA::CLayout_64x112;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8S8_SS_TN_SATURATE<spsel>>  // traits for the _SATURATE form of the sparse 64x112x64 S32U8S8 SS_TN op
+{  // members below match the non-saturating 64x112x64 S32U8S8 SS_TN specialization
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // B values
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_112,_64>;  // M=64, N=112, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // extents follow (M, K)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<112, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x112;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8S8_SS_TN<spsel>>  // traits for the sparse 64x144x64 S32U8S8 SS_TN op
+{
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // B values
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_144,_64>;  // M=64, N=144, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // extents follow (M, K)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<144, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x144;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8S8_SS_TN_SATURATE<spsel>>  // traits for the _SATURATE form of the sparse 64x144x64 S32U8S8 SS_TN op
+{  // members below match the non-saturating 64x144x64 S32U8S8 SS_TN specialization
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // B values
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_144,_64>;  // M=64, N=144, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // extents follow (M, K)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<144, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x144;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8S8_SS_TN<spsel>>  // traits for the sparse 64x160x64 S32U8S8 SS_TN op
+{
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // B values
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_160,_64>;  // M=64, N=160, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // extents follow (M, K)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<160, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x160;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8S8_SS_TN_SATURATE<spsel>>  // traits for the _SATURATE form of the sparse 64x160x64 S32U8S8 SS_TN op
+{  // members below match the non-saturating 64x160x64 S32U8S8 SS_TN specialization
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // B values
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_160,_64>;  // M=64, N=160, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // extents follow (M, K)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<160, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x160;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8S8_SS_TN<spsel>>  // traits for the sparse 64x176x64 S32U8S8 SS_TN op
+{
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // B values
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_176,_64>;  // M=64, N=176, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // extents follow (M, K)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<176, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x176;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8S8_SS_TN_SATURATE<spsel>>  // traits for the _SATURATE form of the sparse 64x176x64 S32U8S8 SS_TN op
+{  // members below match the non-saturating 64x176x64 S32U8S8 SS_TN specialization
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // B values
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_176,_64>;  // M=64, N=176, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // extents follow (M, K)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<176, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x176;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8S8_SS_TN<spsel>>  // traits for the sparse 64x208x64 S32U8S8 SS_TN op
+{
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // B values
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_208,_64>;  // M=64, N=208, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // extents follow (M, K)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<208, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x208;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8S8_SS_TN_SATURATE<spsel>>  // traits for the _SATURATE form of the sparse 64x208x64 S32U8S8 SS_TN op
+{  // members below match the non-saturating 64x208x64 S32U8S8 SS_TN specialization
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // B values
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_208,_64>;  // M=64, N=208, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // extents follow (M, K)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<208, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x208;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8S8_SS_TN<spsel>>  // traits for the sparse 64x224x64 S32U8S8 SS_TN op
+{
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // B values
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_224,_64>;  // M=64, N=224, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // extents follow (M, K)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<224, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x224;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8S8_SS_TN_SATURATE<spsel>>  // traits for the _SATURATE form of the sparse 64x224x64 S32U8S8 SS_TN op
+{  // members below match the non-saturating 64x224x64 S32U8S8 SS_TN specialization
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // B values
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_224,_64>;  // M=64, N=224, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // extents follow (M, K)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<224, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x224;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8S8_SS_TN<spsel>>  // traits for the sparse 64x240x64 S32U8S8 SS_TN op
+{
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // B values
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_240,_64>;  // M=64, N=240, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // extents follow (M, K)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<240, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x240;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8S8_SS_TN_SATURATE<spsel>>  // traits for the _SATURATE form of the sparse 64x240x64 S32U8S8 SS_TN op
+{  // members below match the non-saturating 64x240x64 S32U8S8 SS_TN specialization
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // B values
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_240,_64>;  // M=64, N=240, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // extents follow (M, K)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<240, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x240;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8S8_RS_TN<spsel>>  // traits for the sparse 64x24x64 S32U8S8 RS_TN op
+{
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // B values
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a fragment type (K-major smem_desc); no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_24,_64>;  // M=64, N=24, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // A uses the dedicated ALayout_64x64 (presumably a register-resident A operand -- confirm)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 24, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x24;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8S8_RS_TN_SATURATE<spsel>>  // traits for the _SATURATE form of the sparse 64x24x64 S32U8S8 RS_TN op
+{  // members below match the non-saturating 64x24x64 S32U8S8 RS_TN specialization
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // B values
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a fragment type (K-major smem_desc); no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_24,_64>;  // M=64, N=24, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // A uses the dedicated ALayout_64x64 (presumably a register-resident A operand -- confirm)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 24, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x24;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8S8_RS_TN<spsel>>  // traits for the sparse 64x48x64 S32U8S8 RS_TN op
+{
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // B values
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a fragment type (K-major smem_desc); no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_48,_64>;  // M=64, N=48, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // A uses the dedicated ALayout_64x64 (presumably a register-resident A operand -- confirm)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 48, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x48;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8S8_RS_TN_SATURATE<spsel>>  // traits for the _SATURATE form of the sparse 64x48x64 S32U8S8 RS_TN op
+{  // members below match the non-saturating 64x48x64 S32U8S8 RS_TN specialization
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // B values
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a fragment type (K-major smem_desc); no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_48,_64>;  // M=64, N=48, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // A uses the dedicated ALayout_64x64 (presumably a register-resident A operand -- confirm)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 48, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x48;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8S8_RS_TN<spsel>>  // traits for the sparse 64x80x64 S32U8S8 RS_TN op
+{
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // B values
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a fragment type (K-major smem_desc); no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_80,_64>;  // M=64, N=80, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // A uses the dedicated ALayout_64x64 (presumably a register-resident A operand -- confirm)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 80, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x80;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8S8_RS_TN_SATURATE<spsel>>  // traits for the _SATURATE form of the sparse 64x80x64 S32U8S8 RS_TN op
+{  // members below match the non-saturating 64x80x64 S32U8S8 RS_TN specialization
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // B values
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a fragment type (K-major smem_desc); no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_80,_64>;  // M=64, N=80, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // A uses the dedicated ALayout_64x64 (presumably a register-resident A operand -- confirm)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 80, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x80;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8S8_RS_TN<spsel>>  // traits for the sparse 64x112x64 S32U8S8 RS_TN op
+{
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // B values
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a fragment type (K-major smem_desc); no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_112,_64>;  // M=64, N=112, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // A uses the dedicated ALayout_64x64 (presumably a register-resident A operand -- confirm)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<112, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x112;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8S8_RS_TN_SATURATE<spsel>>  // traits for the _SATURATE form of the sparse 64x112x64 S32U8S8 RS_TN op
+{  // members below match the non-saturating 64x112x64 S32U8S8 RS_TN specialization
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // B values
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a fragment type (K-major smem_desc); no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_112,_64>;  // M=64, N=112, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // A uses the dedicated ALayout_64x64 (presumably a register-resident A operand -- confirm)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<112, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x112;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8S8_RS_TN<spsel>>  // traits for the sparse 64x144x64 S32U8S8 RS_TN op
+{
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // B values
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a fragment type (K-major smem_desc); no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_144,_64>;  // M=64, N=144, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // A uses the dedicated ALayout_64x64 (presumably a register-resident A operand -- confirm)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<144, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x144;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8S8_RS_TN_SATURATE<spsel>>  // traits for the _SATURATE form of the sparse 64x144x64 S32U8S8 RS_TN op
+{  // members below match the non-saturating 64x144x64 S32U8S8 RS_TN specialization
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // B values
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a fragment type (K-major smem_desc); no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_144,_64>;  // M=64, N=144, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // A uses the dedicated ALayout_64x64 (presumably a register-resident A operand -- confirm)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<144, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x144;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8S8_RS_TN<spsel>>  // traits for the sparse 64x160x64 S32U8S8 RS_TN op
+{
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // B values
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a fragment type (K-major smem_desc); no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_160,_64>;  // M=64, N=160, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // A uses the dedicated ALayout_64x64 (presumably a register-resident A operand -- confirm)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<160, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x160;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8S8_RS_TN_SATURATE<spsel>>  // traits for the _SATURATE form of the sparse 64x160x64 S32U8S8 RS_TN op
+{  // members below match the non-saturating 64x160x64 S32U8S8 RS_TN specialization
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // B values
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a fragment type (K-major smem_desc); no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_160,_64>;  // M=64, N=160, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // A uses the dedicated ALayout_64x64 (presumably a register-resident A operand -- confirm)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<160, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x160;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8S8_RS_TN<spsel>>  // traits for the sparse 64x176x64 S32U8S8 RS_TN op
+{
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // B values
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a fragment type (K-major smem_desc); no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_176,_64>;  // M=64, N=176, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // A uses the dedicated ALayout_64x64 (presumably a register-resident A operand -- confirm)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<176, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x176;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8S8_RS_TN_SATURATE<spsel>>  // traits for the _SATURATE form of the sparse 64x176x64 S32U8S8 RS_TN op
+{  // members below match the non-saturating 64x176x64 S32U8S8 RS_TN specialization
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // B values
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a fragment type (K-major smem_desc); no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_176,_64>;  // M=64, N=176, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // A uses the dedicated ALayout_64x64 (presumably a register-resident A operand -- confirm)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<176, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x176;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8S8_RS_TN<spsel>>  // traits for the sparse 64x208x64 S32U8S8 RS_TN op
+{
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // B values
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a fragment type (K-major smem_desc); no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_208,_64>;  // M=64, N=208, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // A uses the dedicated ALayout_64x64 (presumably a register-resident A operand -- confirm)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<208, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x208;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8S8_RS_TN_SATURATE<spsel>>  // traits for the _SATURATE form of the sparse 64x208x64 S32U8S8 RS_TN op
+{  // members below match the non-saturating 64x208x64 S32U8S8 RS_TN specialization
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // B values
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a fragment type (K-major smem_desc); no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_208,_64>;  // M=64, N=208, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // A uses the dedicated ALayout_64x64 (presumably a register-resident A operand -- confirm)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<208, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x208;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8S8_RS_TN<spsel>>  // traits for the sparse 64x224x64 S32U8S8 RS_TN op
+{
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // B values
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a fragment type (K-major smem_desc); no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_224,_64>;  // M=64, N=224, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // A uses the dedicated ALayout_64x64 (presumably a register-resident A operand -- confirm)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<224, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x224;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8S8_RS_TN_SATURATE<spsel>>  // traits for the _SATURATE form of the sparse 64x224x64 S32U8S8 RS_TN op
+{  // members below match the non-saturating 64x224x64 S32U8S8 RS_TN specialization
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // B values
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a fragment type (K-major smem_desc); no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_224,_64>;  // M=64, N=224, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // A uses the dedicated ALayout_64x64 (presumably a register-resident A operand -- confirm)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<224, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x224;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8S8_RS_TN<spsel>>  // traits for the sparse 64x240x64 S32U8S8 RS_TN op
+{
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // B values
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a fragment type (K-major smem_desc); no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_240,_64>;  // M=64, N=240, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // A uses the dedicated ALayout_64x64 (presumably a register-resident A operand -- confirm)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<240, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x240;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8S8_RS_TN_SATURATE<spsel>>  // traits for the _SATURATE form of the sparse 64x240x64 S32U8S8 RS_TN op
+{  // members below match the non-saturating 64x240x64 S32U8S8 RS_TN specialization
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = int8_t;  // B values
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a fragment type (K-major smem_desc); no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_240,_64>;  // M=64, N=240, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ALayout_64x64;  // A uses the dedicated ALayout_64x64 (presumably a register-resident A operand -- confirm)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<240, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x240;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8U8_SS_TN<spsel>>  // traits for the sparse 64x24x64 S32U8U8 SS_TN op
+{
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = uint8_t;  // B values (unsigned in the U8U8 family)
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_24,_64>;  // M=64, N=24, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // extents follow (M, K)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 24, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x24;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8U8_SS_TN_SATURATE<spsel>>  // traits for the _SATURATE form of the sparse 64x24x64 S32U8U8 SS_TN op
+{  // members below match the non-saturating 64x24x64 S32U8U8 SS_TN specialization
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = uint8_t;  // B values (unsigned in the U8U8 family)
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_24,_64>;  // M=64, N=24, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // extents follow (M, K)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 24, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x24;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8U8_SS_TN<spsel>>  // traits for the sparse 64x48x64 S32U8U8 SS_TN op
+{
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = uint8_t;  // B values (unsigned in the U8U8 family)
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_48,_64>;  // M=64, N=48, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // extents follow (M, K)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 48, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x48;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8U8_SS_TN_SATURATE<spsel>>  // traits for the _SATURATE form of the sparse 64x48x64 S32U8U8 SS_TN op
+{  // members below match the non-saturating 64x48x64 S32U8U8 SS_TN specialization
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = uint8_t;  // B values (unsigned in the U8U8 family)
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_48,_64>;  // M=64, N=48, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // extents follow (M, K)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 48, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x48;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8U8_SS_TN<spsel>>  // traits for the sparse 64x80x64 S32U8U8 SS_TN op
+{
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = uint8_t;  // B values (unsigned in the U8U8 family)
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_80,_64>;  // M=64, N=80, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // extents follow (M, K)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 80, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x80;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8U8_SS_TN_SATURATE<spsel>>  // traits for the _SATURATE form of the sparse 64x80x64 S32U8U8 SS_TN op
+{  // members below match the non-saturating 64x80x64 S32U8U8 SS_TN specialization
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = uint8_t;  // B values (unsigned in the U8U8 family)
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_80,_64>;  // M=64, N=80, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // extents follow (M, K)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 80, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x80;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8U8_SS_TN<spsel>>  // traits for the sparse 64x112x64 S32U8U8 SS_TN op
+{
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = uint8_t;  // B values (unsigned in the U8U8 family)
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_112,_64>;  // M=64, N=112, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // extents follow (M, K)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<112, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x112;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>  // spsel is forwarded unchanged to the wrapped op type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8U8_SS_TN_SATURATE<spsel>>  // traits for the _SATURATE form of the sparse 64x112x64 S32U8U8 SS_TN op
+{  // members below match the non-saturating 64x112x64 S32U8U8 SS_TN specialization
+  using ValTypeD = int32_t;  // D values
+  using ValTypeA = sparse_elem<2, uint8_t>;  // A values: uint8_t wrapped as sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped as sparse_elem<8, ...> (presumably sparsity metadata -- confirm)
+  using ValTypeB = uint8_t;  // B values (unsigned in the U8U8 family)
+  using ValTypeC = int32_t;  // C values
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_112,_64>;  // M=64, N=112, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of extent 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // extents follow (M, K)
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<112, 64>;  // extents follow (N, K)
+  using CLayout = GMMA::CLayout_64x112;  // extents follow (M, N)
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, default-initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8U8_SS_TN<spsel>>
+{ // Traits for the sparse GMMA op 64x144x64 S32U8U8 SS_TN; spsel is forwarded to the op type.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // uint8_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS form: A and B both declare a Major::K smem_desc fragment
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_144,_64>;  // M=64, N=144, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<144, 64>;
+  using CLayout = GMMA::CLayout_64x144;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8U8_SS_TN_SATURATE<spsel>>
+{ // Traits for the sparse GMMA op 64x144x64 S32U8U8 SS_TN_SATURATE; spsel is forwarded to the op type.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // uint8_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS form: A and B both declare a Major::K smem_desc fragment
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_144,_64>;  // M=64, N=144, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<144, 64>;
+  using CLayout = GMMA::CLayout_64x144;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8U8_SS_TN<spsel>>
+{ // Traits for the sparse GMMA op 64x160x64 S32U8U8 SS_TN; spsel is forwarded to the op type.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // uint8_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS form: A and B both declare a Major::K smem_desc fragment
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_160,_64>;  // M=64, N=160, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<160, 64>;
+  using CLayout = GMMA::CLayout_64x160;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8U8_SS_TN_SATURATE<spsel>>
+{ // Traits for the sparse GMMA op 64x160x64 S32U8U8 SS_TN_SATURATE; spsel is forwarded to the op type.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // uint8_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS form: A and B both declare a Major::K smem_desc fragment
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_160,_64>;  // M=64, N=160, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<160, 64>;
+  using CLayout = GMMA::CLayout_64x160;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8U8_SS_TN<spsel>>
+{ // Traits for the sparse GMMA op 64x176x64 S32U8U8 SS_TN; spsel is forwarded to the op type.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // uint8_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS form: A and B both declare a Major::K smem_desc fragment
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_176,_64>;  // M=64, N=176, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<176, 64>;
+  using CLayout = GMMA::CLayout_64x176;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8U8_SS_TN_SATURATE<spsel>>
+{ // Traits for the sparse GMMA op 64x176x64 S32U8U8 SS_TN_SATURATE; spsel is forwarded to the op type.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // uint8_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS form: A and B both declare a Major::K smem_desc fragment
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_176,_64>;  // M=64, N=176, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<176, 64>;
+  using CLayout = GMMA::CLayout_64x176;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8U8_SS_TN<spsel>>
+{ // Traits for the sparse GMMA op 64x208x64 S32U8U8 SS_TN; spsel is forwarded to the op type.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // uint8_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS form: A and B both declare a Major::K smem_desc fragment
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_208,_64>;  // M=64, N=208, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<208, 64>;
+  using CLayout = GMMA::CLayout_64x208;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8U8_SS_TN_SATURATE<spsel>>
+{ // Traits for the sparse GMMA op 64x208x64 S32U8U8 SS_TN_SATURATE; spsel is forwarded to the op type.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // uint8_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS form: A and B both declare a Major::K smem_desc fragment
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_208,_64>;  // M=64, N=208, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<208, 64>;
+  using CLayout = GMMA::CLayout_64x208;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8U8_SS_TN<spsel>>
+{ // Traits for the sparse GMMA op 64x224x64 S32U8U8 SS_TN; spsel is forwarded to the op type.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // uint8_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS form: A and B both declare a Major::K smem_desc fragment
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_224,_64>;  // M=64, N=224, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<224, 64>;
+  using CLayout = GMMA::CLayout_64x224;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8U8_SS_TN_SATURATE<spsel>>
+{ // Traits for the sparse GMMA op 64x224x64 S32U8U8 SS_TN_SATURATE; spsel is forwarded to the op type.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // uint8_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS form: A and B both declare a Major::K smem_desc fragment
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_224,_64>;  // M=64, N=224, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<224, 64>;
+  using CLayout = GMMA::CLayout_64x224;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8U8_SS_TN<spsel>>
+{ // Traits for the sparse GMMA op 64x240x64 S32U8U8 SS_TN; spsel is forwarded to the op type.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // uint8_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS form: A and B both declare a Major::K smem_desc fragment
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_240,_64>;  // M=64, N=240, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<240, 64>;
+  using CLayout = GMMA::CLayout_64x240;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8U8_SS_TN_SATURATE<spsel>>
+{ // Traits for the sparse GMMA op 64x240x64 S32U8U8 SS_TN_SATURATE; spsel is forwarded to the op type.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // uint8_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS form: A and B both declare a Major::K smem_desc fragment
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_240,_64>;  // M=64, N=240, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<240, 64>;
+  using CLayout = GMMA::CLayout_64x240;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8U8_RS_TN<spsel>>
+{ // Traits for the sparse GMMA op 64x24x64 S32U8U8 RS_TN; spsel is forwarded to the op type.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // uint8_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS form: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_24,_64>;  // M=64, N=24, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 24, 64>;
+  using CLayout = GMMA::CLayout_64x24;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8U8_RS_TN_SATURATE<spsel>>
+{ // Traits for the sparse GMMA op 64x24x64 S32U8U8 RS_TN_SATURATE; spsel is forwarded to the op type.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // uint8_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS form: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_24,_64>;  // M=64, N=24, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 24, 64>;
+  using CLayout = GMMA::CLayout_64x24;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8U8_RS_TN<spsel>>
+{ // Traits for the sparse GMMA op 64x48x64 S32U8U8 RS_TN; spsel is forwarded to the op type.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // uint8_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS form: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_48,_64>;  // M=64, N=48, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 48, 64>;
+  using CLayout = GMMA::CLayout_64x48;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8U8_RS_TN_SATURATE<spsel>>
+{ // Traits for the sparse GMMA op 64x48x64 S32U8U8 RS_TN_SATURATE; spsel is forwarded to the op type.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // uint8_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS form: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_48,_64>;  // M=64, N=48, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 48, 64>;
+  using CLayout = GMMA::CLayout_64x48;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8U8_RS_TN<spsel>>
+{ // Traits for the sparse GMMA op 64x80x64 S32U8U8 RS_TN; spsel is forwarded to the op type.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // uint8_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS form: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_80,_64>;  // M=64, N=80, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 80, 64>;
+  using CLayout = GMMA::CLayout_64x80;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8U8_RS_TN_SATURATE<spsel>>
+{ // Traits for the sparse GMMA op 64x80x64 S32U8U8 RS_TN_SATURATE; spsel is forwarded to the op type.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // uint8_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS form: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_80,_64>;  // M=64, N=80, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 80, 64>;
+  using CLayout = GMMA::CLayout_64x80;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8U8_RS_TN<spsel>>
+{ // Traits for the sparse GMMA op 64x112x64 S32U8U8 RS_TN; spsel is forwarded to the op type.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // uint8_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS form: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_112,_64>;  // M=64, N=112, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<112, 64>;
+  using CLayout = GMMA::CLayout_64x112;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8U8_RS_TN_SATURATE<spsel>>
+{ // Traits for the sparse GMMA op 64x112x64 S32U8U8 RS_TN_SATURATE; spsel is forwarded to the op type.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // uint8_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS form: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_112,_64>;  // M=64, N=112, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<112, 64>;
+  using CLayout = GMMA::CLayout_64x112;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8U8_RS_TN<spsel>>
+{ // Traits for the sparse GMMA op 64x144x64 S32U8U8 RS_TN; spsel is forwarded to the op type.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // uint8_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS form: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_144,_64>;  // M=64, N=144, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<144, 64>;
+  using CLayout = GMMA::CLayout_64x144;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8U8_RS_TN_SATURATE<spsel>>
+{ // Traits for the sparse GMMA op 64x144x64 S32U8U8 RS_TN_SATURATE; spsel is forwarded to the op type.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // uint8_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS form: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_144,_64>;  // M=64, N=144, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<144, 64>;
+  using CLayout = GMMA::CLayout_64x144;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8U8_RS_TN<spsel>>
+{ // Traits for the sparse GMMA op 64x160x64 S32U8U8 RS_TN; spsel is forwarded to the op type.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // uint8_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS form: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_160,_64>;  // M=64, N=160, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<160, 64>;
+  using CLayout = GMMA::CLayout_64x160;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8U8_RS_TN_SATURATE<spsel>>
+{ // Traits for the sparse GMMA op 64x160x64 S32U8U8 RS_TN_SATURATE; spsel is forwarded to the op type.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // uint8_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS form: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_160,_64>;  // M=64, N=160, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<160, 64>;
+  using CLayout = GMMA::CLayout_64x160;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8U8_RS_TN<spsel>>
+{ // Traits for the sparse GMMA op 64x176x64 S32U8U8 RS_TN; spsel is forwarded to the op type.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // uint8_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS form: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_176,_64>;  // M=64, N=176, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<176, 64>;
+  using CLayout = GMMA::CLayout_64x176;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8U8_RS_TN_SATURATE<spsel>>
+{ // Traits for the sparse GMMA op 64x176x64 S32U8U8 RS_TN_SATURATE; spsel is forwarded to the op type.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // uint8_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS form: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_176,_64>;  // M=64, N=176, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<176, 64>;
+  using CLayout = GMMA::CLayout_64x176;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8U8_RS_TN<spsel>>
+{ // Traits for the sparse GMMA op 64x208x64 S32U8U8 RS_TN; spsel is forwarded to the op type.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // uint8_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS form: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_208,_64>;  // M=64, N=208, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<208, 64>;
+  using CLayout = GMMA::CLayout_64x208;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8U8_RS_TN_SATURATE<spsel>>
+{ // Traits for the sparse GMMA op 64x208x64 S32U8U8 RS_TN_SATURATE; spsel is forwarded to the op type.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // uint8_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS form: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_208,_64>;  // M=64, N=208, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<208, 64>;
+  using CLayout = GMMA::CLayout_64x208;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8U8_RS_TN<spsel>>
+{ // Traits for the sparse GMMA op 64x224x64 S32U8U8 RS_TN; spsel is forwarded to the op type.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // uint8_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS form: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_224,_64>;  // M=64, N=224, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<224, 64>;
+  using CLayout = GMMA::CLayout_64x224;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8U8_RS_TN_SATURATE<spsel>>
+{ // Traits for the sparse GMMA op 64x224x64 S32U8U8 RS_TN_SATURATE; spsel is forwarded to the op type.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // uint8_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS form: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_224,_64>;  // M=64, N=224, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<224, 64>;
+  using CLayout = GMMA::CLayout_64x224;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8U8_RS_TN<spsel>>
+{ // Traits for the sparse GMMA op 64x240x64 S32U8U8 RS_TN; spsel is forwarded to the op type.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // uint8_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS form: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_240,_64>;  // M=64, N=240, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<240, 64>;
+  using CLayout = GMMA::CLayout_64x240;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8U8_RS_TN_SATURATE<spsel>>
+{ // Traits for the sparse GMMA op 64x240x64 S32U8U8 RS_TN_SATURATE; spsel is forwarded to the op type.
+  using ValTypeD = int32_t;
+  using ValTypeA = sparse_elem<2, uint8_t>;  // uint8_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = uint8_t;
+  using ValTypeC = int32_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS form: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_240,_64>;  // M=64, N=240, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<240, 64>;
+  using CLayout = GMMA::CLayout_64x240;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op 64x24x64 F16E4M3E4M3 SS_TN; scaleA, scaleB and spsel are forwarded to the op type.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // float_e4m3_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS form: A and B both declare a Major::K smem_desc fragment
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_24,_64>;  // M=64, N=24, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 24, 64>;
+  using CLayout = GMMA::CLayout_64x24;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op 64x24x64 F16E4M3E4M3 RS_TN; scaleA, scaleB and spsel are forwarded to the op type.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // float_e4m3_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS form: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_24,_64>;  // M=64, N=24, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 24, 64>;
+  using CLayout = GMMA::CLayout_64x24;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op 64x24x64 F32E4M3E4M3 SS_TN; scaleA, scaleB and spsel are forwarded to the op type.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // float_e4m3_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS form: A and B both declare a Major::K smem_desc fragment
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_24,_64>;  // M=64, N=24, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 24, 64>;
+  using CLayout = GMMA::CLayout_64x24;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op 64x24x64 F32E4M3E4M3 RS_TN; scaleA, scaleB and spsel are forwarded to the op type.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // float_e4m3_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS form: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_24,_64>;  // M=64, N=24, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 24, 64>;
+  using CLayout = GMMA::CLayout_64x24;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op 64x40x64 F16E4M3E4M3 SS_TN; scaleA, scaleB and spsel are forwarded to the op type.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // float_e4m3_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS form: A and B both declare a Major::K smem_desc fragment
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_40,_64>;  // M=64, N=40, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 40, 64>;
+  using CLayout = GMMA::CLayout_64x40;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op 64x40x64 F16E4M3E4M3 RS_TN; scaleA, scaleB and spsel are forwarded to the op type.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // float_e4m3_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS form: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_40,_64>;  // M=64, N=40, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 40, 64>;
+  using CLayout = GMMA::CLayout_64x40;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op 64x40x64 F32E4M3E4M3 SS_TN; scaleA, scaleB and spsel are forwarded to the op type.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // float_e4m3_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // SS form: A and B both declare a Major::K smem_desc fragment
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_40,_64>;  // M=64, N=40, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 40, 64>;
+  using CLayout = GMMA::CLayout_64x40;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op 64x40x64 F32E4M3E4M3 RS_TN; scaleA, scaleB and spsel are forwarded to the op type.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // float_e4m3_t wrapped in sparse_elem<2, ...>; presumably 2:4-sparse A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;  // presumably the sparsity metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // RS form: only B declares a smem_desc fragment (no FrgTypeA)
+
+  using Shape_MNK = Shape<_64,_40,_64>;  // M=64, N=40, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 40, 64>;
+  using CLayout = GMMA::CLayout_64x40;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member, initialised to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for SM90 sparse GMMA atom 64x48x64 F16E4M3E4M3 SS_TN: float_e4m3_t-based A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // NOTE(review): presumably compressed sparse A values -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // Fragment types: A and B are both declared as GMMA::smem_desc with GMMA::Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Atom shape (M,N,K) = (64,48,64), the ThrID layout (Layout<_128>), and the A/E/B/C layouts.
+  using Shape_MNK = Shape<_64,_48,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 48, 64>;
+  using CLayout = GMMA::CLayout_64x48;
+  // Per-instance ScaleOut state, initialized to One (presumably "accumulate into C" -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for SM90 sparse GMMA atom 64x48x64 F16E4M3E4M3 RS_TN: float_e4m3_t-based A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // NOTE(review): presumably compressed sparse A values -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // Only B has a fragment-type alias here (GMMA::smem_desc, GMMA::Major::K); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Atom shape (M,N,K) = (64,48,64), the ThrID layout (Layout<_128>), and the A/E/B/C layouts.
+  using Shape_MNK = Shape<_64,_48,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 48, 64>;
+  using CLayout = GMMA::CLayout_64x48;
+  // Per-instance ScaleOut state, initialized to One (presumably "accumulate into C" -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for SM90 sparse GMMA atom 64x48x64 F32E4M3E4M3 SS_TN: float_e4m3_t-based A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // NOTE(review): presumably compressed sparse A values -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // Fragment types: A and B are both declared as GMMA::smem_desc with GMMA::Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Atom shape (M,N,K) = (64,48,64), the ThrID layout (Layout<_128>), and the A/E/B/C layouts.
+  using Shape_MNK = Shape<_64,_48,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 48, 64>;
+  using CLayout = GMMA::CLayout_64x48;
+  // Per-instance ScaleOut state, initialized to One (presumably "accumulate into C" -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for SM90 sparse GMMA atom 64x48x64 F32E4M3E4M3 RS_TN: float_e4m3_t-based A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // NOTE(review): presumably compressed sparse A values -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // Only B has a fragment-type alias here (GMMA::smem_desc, GMMA::Major::K); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Atom shape (M,N,K) = (64,48,64), the ThrID layout (Layout<_128>), and the A/E/B/C layouts.
+  using Shape_MNK = Shape<_64,_48,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 48, 64>;
+  using CLayout = GMMA::CLayout_64x48;
+  // Per-instance ScaleOut state, initialized to One (presumably "accumulate into C" -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for SM90 sparse GMMA atom 64x56x64 F16E4M3E4M3 SS_TN: float_e4m3_t-based A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // NOTE(review): presumably compressed sparse A values -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // Fragment types: A and B are both declared as GMMA::smem_desc with GMMA::Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Atom shape (M,N,K) = (64,56,64), the ThrID layout (Layout<_128>), and the A/E/B/C layouts.
+  using Shape_MNK = Shape<_64,_56,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 56, 64>;
+  using CLayout = GMMA::CLayout_64x56;
+  // Per-instance ScaleOut state, initialized to One (presumably "accumulate into C" -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for SM90 sparse GMMA atom 64x56x64 F16E4M3E4M3 RS_TN: float_e4m3_t-based A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // NOTE(review): presumably compressed sparse A values -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // Only B has a fragment-type alias here (GMMA::smem_desc, GMMA::Major::K); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Atom shape (M,N,K) = (64,56,64), the ThrID layout (Layout<_128>), and the A/E/B/C layouts.
+  using Shape_MNK = Shape<_64,_56,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 56, 64>;
+  using CLayout = GMMA::CLayout_64x56;
+  // Per-instance ScaleOut state, initialized to One (presumably "accumulate into C" -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for SM90 sparse GMMA atom 64x56x64 F32E4M3E4M3 SS_TN: float_e4m3_t-based A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // NOTE(review): presumably compressed sparse A values -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // Fragment types: A and B are both declared as GMMA::smem_desc with GMMA::Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Atom shape (M,N,K) = (64,56,64), the ThrID layout (Layout<_128>), and the A/E/B/C layouts.
+  using Shape_MNK = Shape<_64,_56,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 56, 64>;
+  using CLayout = GMMA::CLayout_64x56;
+  // Per-instance ScaleOut state, initialized to One (presumably "accumulate into C" -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for SM90 sparse GMMA atom 64x56x64 F32E4M3E4M3 RS_TN: float_e4m3_t-based A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // NOTE(review): presumably compressed sparse A values -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // Only B has a fragment-type alias here (GMMA::smem_desc, GMMA::Major::K); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Atom shape (M,N,K) = (64,56,64), the ThrID layout (Layout<_128>), and the A/E/B/C layouts.
+  using Shape_MNK = Shape<_64,_56,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 56, 64>;
+  using CLayout = GMMA::CLayout_64x56;
+  // Per-instance ScaleOut state, initialized to One (presumably "accumulate into C" -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for SM90 sparse GMMA atom 64x72x64 F16E4M3E4M3 SS_TN: float_e4m3_t-based A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // NOTE(review): presumably compressed sparse A values -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // Fragment types: A and B are both declared as GMMA::smem_desc with GMMA::Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Atom shape (M,N,K) = (64,72,64), the ThrID layout (Layout<_128>), and the A/E/B/C layouts.
+  using Shape_MNK = Shape<_64,_72,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 72, 64>;
+  using CLayout = GMMA::CLayout_64x72;
+  // Per-instance ScaleOut state, initialized to One (presumably "accumulate into C" -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for SM90 sparse GMMA atom 64x72x64 F16E4M3E4M3 RS_TN: float_e4m3_t-based A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // NOTE(review): presumably compressed sparse A values -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // Only B has a fragment-type alias here (GMMA::smem_desc, GMMA::Major::K); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Atom shape (M,N,K) = (64,72,64), the ThrID layout (Layout<_128>), and the A/E/B/C layouts.
+  using Shape_MNK = Shape<_64,_72,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 72, 64>;
+  using CLayout = GMMA::CLayout_64x72;
+  // Per-instance ScaleOut state, initialized to One (presumably "accumulate into C" -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for SM90 sparse GMMA atom 64x72x64 F32E4M3E4M3 SS_TN: float_e4m3_t-based A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // NOTE(review): presumably compressed sparse A values -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // Fragment types: A and B are both declared as GMMA::smem_desc with GMMA::Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Atom shape (M,N,K) = (64,72,64), the ThrID layout (Layout<_128>), and the A/E/B/C layouts.
+  using Shape_MNK = Shape<_64,_72,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 72, 64>;
+  using CLayout = GMMA::CLayout_64x72;
+  // Per-instance ScaleOut state, initialized to One (presumably "accumulate into C" -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for SM90 sparse GMMA atom 64x72x64 F32E4M3E4M3 RS_TN: float_e4m3_t-based A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // NOTE(review): presumably compressed sparse A values -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // Only B has a fragment-type alias here (GMMA::smem_desc, GMMA::Major::K); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Atom shape (M,N,K) = (64,72,64), the ThrID layout (Layout<_128>), and the A/E/B/C layouts.
+  using Shape_MNK = Shape<_64,_72,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 72, 64>;
+  using CLayout = GMMA::CLayout_64x72;
+  // Per-instance ScaleOut state, initialized to One (presumably "accumulate into C" -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for SM90 sparse GMMA atom 64x80x64 F16E4M3E4M3 SS_TN: float_e4m3_t-based A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // NOTE(review): presumably compressed sparse A values -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // Fragment types: A and B are both declared as GMMA::smem_desc with GMMA::Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Atom shape (M,N,K) = (64,80,64), the ThrID layout (Layout<_128>), and the A/E/B/C layouts.
+  using Shape_MNK = Shape<_64,_80,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 80, 64>;
+  using CLayout = GMMA::CLayout_64x80;
+  // Per-instance ScaleOut state, initialized to One (presumably "accumulate into C" -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for SM90 sparse GMMA atom 64x80x64 F16E4M3E4M3 RS_TN: float_e4m3_t-based A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // NOTE(review): presumably compressed sparse A values -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // Only B has a fragment-type alias here (GMMA::smem_desc, GMMA::Major::K); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Atom shape (M,N,K) = (64,80,64), the ThrID layout (Layout<_128>), and the A/E/B/C layouts.
+  using Shape_MNK = Shape<_64,_80,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 80, 64>;
+  using CLayout = GMMA::CLayout_64x80;
+  // Per-instance ScaleOut state, initialized to One (presumably "accumulate into C" -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for SM90 sparse GMMA atom 64x80x64 F32E4M3E4M3 SS_TN: float_e4m3_t-based A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // NOTE(review): presumably compressed sparse A values -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // Fragment types: A and B are both declared as GMMA::smem_desc with GMMA::Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Atom shape (M,N,K) = (64,80,64), the ThrID layout (Layout<_128>), and the A/E/B/C layouts.
+  using Shape_MNK = Shape<_64,_80,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 80, 64>;
+  using CLayout = GMMA::CLayout_64x80;
+  // Per-instance ScaleOut state, initialized to One (presumably "accumulate into C" -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for SM90 sparse GMMA atom 64x80x64 F32E4M3E4M3 RS_TN: float_e4m3_t-based A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // NOTE(review): presumably compressed sparse A values -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // Only B has a fragment-type alias here (GMMA::smem_desc, GMMA::Major::K); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Atom shape (M,N,K) = (64,80,64), the ThrID layout (Layout<_128>), and the A/E/B/C layouts.
+  using Shape_MNK = Shape<_64,_80,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 80, 64>;
+  using CLayout = GMMA::CLayout_64x80;
+  // Per-instance ScaleOut state, initialized to One (presumably "accumulate into C" -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for SM90 sparse GMMA atom 64x88x64 F16E4M3E4M3 SS_TN: float_e4m3_t-based A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // NOTE(review): presumably compressed sparse A values -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // Fragment types: A and B are both declared as GMMA::smem_desc with GMMA::Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Atom shape (M,N,K) = (64,88,64), the ThrID layout (Layout<_128>), and the A/E/B/C layouts.
+  using Shape_MNK = Shape<_64,_88,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 88, 64>;
+  using CLayout = GMMA::CLayout_64x88;
+  // Per-instance ScaleOut state, initialized to One (presumably "accumulate into C" -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for SM90 sparse GMMA atom 64x88x64 F16E4M3E4M3 RS_TN: float_e4m3_t-based A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // NOTE(review): presumably compressed sparse A values -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // Only B has a fragment-type alias here (GMMA::smem_desc, GMMA::Major::K); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Atom shape (M,N,K) = (64,88,64), the ThrID layout (Layout<_128>), and the A/E/B/C layouts.
+  using Shape_MNK = Shape<_64,_88,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 88, 64>;
+  using CLayout = GMMA::CLayout_64x88;
+  // Per-instance ScaleOut state, initialized to One (presumably "accumulate into C" -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for SM90 sparse GMMA atom 64x88x64 F32E4M3E4M3 SS_TN: float_e4m3_t-based A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // NOTE(review): presumably compressed sparse A values -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // Fragment types: A and B are both declared as GMMA::smem_desc with GMMA::Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Atom shape (M,N,K) = (64,88,64), the ThrID layout (Layout<_128>), and the A/E/B/C layouts.
+  using Shape_MNK = Shape<_64,_88,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 88, 64>;
+  using CLayout = GMMA::CLayout_64x88;
+  // Per-instance ScaleOut state, initialized to One (presumably "accumulate into C" -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for SM90 sparse GMMA atom 64x88x64 F32E4M3E4M3 RS_TN: float_e4m3_t-based A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // NOTE(review): presumably compressed sparse A values -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // Only B has a fragment-type alias here (GMMA::smem_desc, GMMA::Major::K); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Atom shape (M,N,K) = (64,88,64), the ThrID layout (Layout<_128>), and the A/E/B/C layouts.
+  using Shape_MNK = Shape<_64,_88,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 88, 64>;
+  using CLayout = GMMA::CLayout_64x88;
+  // Per-instance ScaleOut state, initialized to One (presumably "accumulate into C" -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for SM90 sparse GMMA atom 64x104x64 F16E4M3E4M3 SS_TN: float_e4m3_t-based A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // NOTE(review): presumably compressed sparse A values -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // Fragment types: A and B are both declared as GMMA::smem_desc with GMMA::Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Atom shape (M,N,K) = (64,104,64), the ThrID layout (Layout<_128>), and the A/E/B/C layouts.
+  using Shape_MNK = Shape<_64,_104,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<104, 64>;
+  using CLayout = GMMA::CLayout_64x104;
+  // Per-instance ScaleOut state, initialized to One (presumably "accumulate into C" -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for SM90 sparse GMMA atom 64x104x64 F16E4M3E4M3 RS_TN: float_e4m3_t-based A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // NOTE(review): presumably compressed sparse A values -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // Only B has a fragment-type alias here (GMMA::smem_desc, GMMA::Major::K); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Atom shape (M,N,K) = (64,104,64), the ThrID layout (Layout<_128>), and the A/E/B/C layouts.
+  using Shape_MNK = Shape<_64,_104,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<104, 64>;
+  using CLayout = GMMA::CLayout_64x104;
+  // Per-instance ScaleOut state, initialized to One (presumably "accumulate into C" -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for SM90 sparse GMMA atom 64x104x64 F32E4M3E4M3 SS_TN: float_e4m3_t-based A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // NOTE(review): presumably compressed sparse A values -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // Fragment types: A and B are both declared as GMMA::smem_desc with GMMA::Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Atom shape (M,N,K) = (64,104,64), the ThrID layout (Layout<_128>), and the A/E/B/C layouts.
+  using Shape_MNK = Shape<_64,_104,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<104, 64>;
+  using CLayout = GMMA::CLayout_64x104;
+  // Per-instance ScaleOut state, initialized to One (presumably "accumulate into C" -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for SM90 sparse GMMA atom 64x104x64 F32E4M3E4M3 RS_TN: float_e4m3_t-based A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // NOTE(review): presumably compressed sparse A values -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // Only B has a fragment-type alias here (GMMA::smem_desc, GMMA::Major::K); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Atom shape (M,N,K) = (64,104,64), the ThrID layout (Layout<_128>), and the A/E/B/C layouts.
+  using Shape_MNK = Shape<_64,_104,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<104, 64>;
+  using CLayout = GMMA::CLayout_64x104;
+  // Per-instance ScaleOut state, initialized to One (presumably "accumulate into C" -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for SM90 sparse GMMA atom 64x112x64 F16E4M3E4M3 SS_TN: float_e4m3_t-based A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // NOTE(review): presumably compressed sparse A values -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // Fragment types: A and B are both declared as GMMA::smem_desc with GMMA::Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Atom shape (M,N,K) = (64,112,64), the ThrID layout (Layout<_128>), and the A/E/B/C layouts.
+  using Shape_MNK = Shape<_64,_112,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<112, 64>;
+  using CLayout = GMMA::CLayout_64x112;
+  // Per-instance ScaleOut state, initialized to One (presumably "accumulate into C" -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for SM90 sparse GMMA atom 64x112x64 F16E4M3E4M3 RS_TN: float_e4m3_t-based A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // NOTE(review): presumably compressed sparse A values -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // Only B has a fragment-type alias here (GMMA::smem_desc, GMMA::Major::K); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Atom shape (M,N,K) = (64,112,64), the ThrID layout (Layout<_128>), and the A/E/B/C layouts.
+  using Shape_MNK = Shape<_64,_112,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<112, 64>;
+  using CLayout = GMMA::CLayout_64x112;
+  // Per-instance ScaleOut state, initialized to One (presumably "accumulate into C" -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for SM90 sparse GMMA atom 64x112x64 F32E4M3E4M3 SS_TN: float_e4m3_t-based A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // NOTE(review): presumably compressed sparse A values -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // Fragment types: A and B are both declared as GMMA::smem_desc with GMMA::Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Atom shape (M,N,K) = (64,112,64), the ThrID layout (Layout<_128>), and the A/E/B/C layouts.
+  using Shape_MNK = Shape<_64,_112,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<112, 64>;
+  using CLayout = GMMA::CLayout_64x112;
+  // Per-instance ScaleOut state, initialized to One (presumably "accumulate into C" -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for SM90 sparse GMMA atom 64x112x64 F32E4M3E4M3 RS_TN: float_e4m3_t-based A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // NOTE(review): presumably compressed sparse A values -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // Only B has a fragment-type alias here (GMMA::smem_desc, GMMA::Major::K); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Atom shape (M,N,K) = (64,112,64), the ThrID layout (Layout<_128>), and the A/E/B/C layouts.
+  using Shape_MNK = Shape<_64,_112,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<112, 64>;
+  using CLayout = GMMA::CLayout_64x112;
+  // Per-instance ScaleOut state, initialized to One (presumably "accumulate into C" -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for SM90 sparse GMMA atom 64x120x64 F16E4M3E4M3 SS_TN: float_e4m3_t-based A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // NOTE(review): presumably compressed sparse A values -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // Fragment types: A and B are both declared as GMMA::smem_desc with GMMA::Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Atom shape (M,N,K) = (64,120,64), the ThrID layout (Layout<_128>), and the A/E/B/C layouts.
+  using Shape_MNK = Shape<_64,_120,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<120, 64>;
+  using CLayout = GMMA::CLayout_64x120;
+  // Per-instance ScaleOut state, initialized to One (presumably "accumulate into C" -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for SM90 sparse GMMA atom 64x120x64 F16E4M3E4M3 RS_TN: float_e4m3_t-based A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // NOTE(review): presumably compressed sparse A values -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // Only B has a fragment-type alias here (GMMA::smem_desc, GMMA::Major::K); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Atom shape (M,N,K) = (64,120,64), the ThrID layout (Layout<_128>), and the A/E/B/C layouts.
+  using Shape_MNK = Shape<_64,_120,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<120, 64>;
+  using CLayout = GMMA::CLayout_64x120;
+  // Per-instance ScaleOut state, initialized to One (presumably "accumulate into C" -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for SM90 sparse GMMA atom 64x120x64 F32E4M3E4M3 SS_TN: float_e4m3_t-based A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // NOTE(review): presumably compressed sparse A values -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // Fragment types: A and B are both declared as GMMA::smem_desc with GMMA::Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Atom shape (M,N,K) = (64,120,64), the ThrID layout (Layout<_128>), and the A/E/B/C layouts.
+  using Shape_MNK = Shape<_64,_120,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<120, 64>;
+  using CLayout = GMMA::CLayout_64x120;
+  // Per-instance ScaleOut state, initialized to One (presumably "accumulate into C" -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for SM90 sparse GMMA atom 64x120x64 F32E4M3E4M3 RS_TN: float_e4m3_t-based A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // NOTE(review): presumably compressed sparse A values -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // Only B has a fragment-type alias here (GMMA::smem_desc, GMMA::Major::K); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Atom shape (M,N,K) = (64,120,64), the ThrID layout (Layout<_128>), and the A/E/B/C layouts.
+  using Shape_MNK = Shape<_64,_120,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<120, 64>;
+  using CLayout = GMMA::CLayout_64x120;
+  // Per-instance ScaleOut state, initialized to One (presumably "accumulate into C" -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for SM90 sparse GMMA atom 64x136x64 F16E4M3E4M3 SS_TN: float_e4m3_t-based A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // NOTE(review): presumably compressed sparse A values -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // Fragment types: A and B are both declared as GMMA::smem_desc with GMMA::Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Atom shape (M,N,K) = (64,136,64), the ThrID layout (Layout<_128>), and the A/E/B/C layouts.
+  using Shape_MNK = Shape<_64,_136,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<136, 64>;
+  using CLayout = GMMA::CLayout_64x136;
+  // Per-instance ScaleOut state, initialized to One (presumably "accumulate into C" -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for SM90 sparse GMMA atom 64x136x64 F16E4M3E4M3 RS_TN: float_e4m3_t-based A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // NOTE(review): presumably compressed sparse A values -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // Only B has a fragment-type alias here (GMMA::smem_desc, GMMA::Major::K); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Atom shape (M,N,K) = (64,136,64), the ThrID layout (Layout<_128>), and the A/E/B/C layouts.
+  using Shape_MNK = Shape<_64,_136,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<136, 64>;
+  using CLayout = GMMA::CLayout_64x136;
+  // Per-instance ScaleOut state, initialized to One (presumably "accumulate into C" -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for SM90 sparse GMMA atom 64x136x64 F32E4M3E4M3 SS_TN: float_e4m3_t-based A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // NOTE(review): presumably compressed sparse A values -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // Fragment types: A and B are both declared as GMMA::smem_desc with GMMA::Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Atom shape (M,N,K) = (64,136,64), the ThrID layout (Layout<_128>), and the A/E/B/C layouts.
+  using Shape_MNK = Shape<_64,_136,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<136, 64>;
+  using CLayout = GMMA::CLayout_64x136;
+  // Per-instance ScaleOut state, initialized to One (presumably "accumulate into C" -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for SM90 sparse GMMA atom 64x136x64 F32E4M3E4M3 RS_TN: float_e4m3_t-based A/B, float C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // NOTE(review): presumably compressed sparse A values -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+  // Only B has a fragment-type alias here (GMMA::smem_desc, GMMA::Major::K); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Atom shape (M,N,K) = (64,136,64), the ThrID layout (Layout<_128>), and the A/E/B/C layouts.
+  using Shape_MNK = Shape<_64,_136,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<136, 64>;
+  using CLayout = GMMA::CLayout_64x136;
+  // Per-instance ScaleOut state, initialized to One (presumably "accumulate into C" -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for SM90 sparse GMMA atom 64x144x64 F16E4M3E4M3 SS_TN: float_e4m3_t-based A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // NOTE(review): presumably compressed sparse A values -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // Fragment types: A and B are both declared as GMMA::smem_desc with GMMA::Major::K.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Atom shape (M,N,K) = (64,144,64), the ThrID layout (Layout<_128>), and the A/E/B/C layouts.
+  using Shape_MNK = Shape<_64,_144,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<144, 64>;
+  using CLayout = GMMA::CLayout_64x144;
+  // Per-instance ScaleOut state, initialized to One (presumably "accumulate into C" -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// MMA_Traits for SM90 sparse GMMA atom 64x144x64 F16E4M3E4M3 RS_TN: float_e4m3_t-based A/B, half_t C/D.
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // NOTE(review): presumably compressed sparse A values -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+  // Only B has a fragment-type alias here (GMMA::smem_desc, GMMA::Major::K); no FrgTypeA is declared.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Atom shape (M,N,K) = (64,144,64), the ThrID layout (Layout<_128>), and the A/E/B/C layouts.
+  using Shape_MNK = Shape<_64,_144,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<144, 64>;
+  using CLayout = GMMA::CLayout_64x144;
+  // Per-instance ScaleOut state, initialized to One (presumably "accumulate into C" -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // MMA traits for the sparse 64x144x64 GMMA op: float C/D values, e4m3 A/B values, SS variant.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem descriptors
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_144,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<144, 64>;
+  using CLayout = GMMA::CLayout_64x144;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // MMA traits for the sparse 64x144x64 GMMA op: float C/D values, e4m3 A/B values, RS variant.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a smem descriptor; no FrgTypeA is declared here
+
+  using Shape_MNK = Shape<_64,_144,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<144, 64>;
+  using CLayout = GMMA::CLayout_64x144;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // MMA traits for the sparse 64x152x64 GMMA op: half_t C/D values, e4m3 A/B values, SS variant.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem descriptors
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_152,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<152, 64>;
+  using CLayout = GMMA::CLayout_64x152;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // MMA traits for the sparse 64x152x64 GMMA op: half_t C/D values, e4m3 A/B values, RS variant.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a smem descriptor; no FrgTypeA is declared here
+
+  using Shape_MNK = Shape<_64,_152,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<152, 64>;
+  using CLayout = GMMA::CLayout_64x152;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // MMA traits for the sparse 64x152x64 GMMA op: float C/D values, e4m3 A/B values, SS variant.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem descriptors
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_152,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<152, 64>;
+  using CLayout = GMMA::CLayout_64x152;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // MMA traits for the sparse 64x152x64 GMMA op: float C/D values, e4m3 A/B values, RS variant.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a smem descriptor; no FrgTypeA is declared here
+
+  using Shape_MNK = Shape<_64,_152,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<152, 64>;
+  using CLayout = GMMA::CLayout_64x152;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // MMA traits for the sparse 64x160x64 GMMA op: half_t C/D values, e4m3 A/B values, SS variant.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem descriptors
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_160,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<160, 64>;
+  using CLayout = GMMA::CLayout_64x160;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // MMA traits for the sparse 64x160x64 GMMA op: half_t C/D values, e4m3 A/B values, RS variant.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a smem descriptor; no FrgTypeA is declared here
+
+  using Shape_MNK = Shape<_64,_160,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<160, 64>;
+  using CLayout = GMMA::CLayout_64x160;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // MMA traits for the sparse 64x160x64 GMMA op: float C/D values, e4m3 A/B values, SS variant.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem descriptors
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_160,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<160, 64>;
+  using CLayout = GMMA::CLayout_64x160;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // MMA traits for the sparse 64x160x64 GMMA op: float C/D values, e4m3 A/B values, RS variant.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a smem descriptor; no FrgTypeA is declared here
+
+  using Shape_MNK = Shape<_64,_160,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<160, 64>;
+  using CLayout = GMMA::CLayout_64x160;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // MMA traits for the sparse 64x168x64 GMMA op: half_t C/D values, e4m3 A/B values, SS variant.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem descriptors
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_168,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<168, 64>;
+  using CLayout = GMMA::CLayout_64x168;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // MMA traits for the sparse 64x168x64 GMMA op: half_t C/D values, e4m3 A/B values, RS variant.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a smem descriptor; no FrgTypeA is declared here
+
+  using Shape_MNK = Shape<_64,_168,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<168, 64>;
+  using CLayout = GMMA::CLayout_64x168;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // MMA traits for the sparse 64x168x64 GMMA op: float C/D values, e4m3 A/B values, SS variant.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem descriptors
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_168,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<168, 64>;
+  using CLayout = GMMA::CLayout_64x168;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // MMA traits for the sparse 64x168x64 GMMA op: float C/D values, e4m3 A/B values, RS variant.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a smem descriptor; no FrgTypeA is declared here
+
+  using Shape_MNK = Shape<_64,_168,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<168, 64>;
+  using CLayout = GMMA::CLayout_64x168;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // MMA traits for the sparse 64x176x64 GMMA op: half_t C/D values, e4m3 A/B values, SS variant.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem descriptors
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_176,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<176, 64>;
+  using CLayout = GMMA::CLayout_64x176;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // MMA traits for the sparse 64x176x64 GMMA op: half_t C/D values, e4m3 A/B values, RS variant.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a smem descriptor; no FrgTypeA is declared here
+
+  using Shape_MNK = Shape<_64,_176,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<176, 64>;
+  using CLayout = GMMA::CLayout_64x176;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // MMA traits for the sparse 64x176x64 GMMA op: float C/D values, e4m3 A/B values, SS variant.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem descriptors
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_176,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<176, 64>;
+  using CLayout = GMMA::CLayout_64x176;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // MMA traits for the sparse 64x176x64 GMMA op: float C/D values, e4m3 A/B values, RS variant.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a smem descriptor; no FrgTypeA is declared here
+
+  using Shape_MNK = Shape<_64,_176,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<176, 64>;
+  using CLayout = GMMA::CLayout_64x176;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // MMA traits for the sparse 64x184x64 GMMA op: half_t C/D values, e4m3 A/B values, SS variant.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem descriptors
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_184,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<184, 64>;
+  using CLayout = GMMA::CLayout_64x184;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // MMA traits for the sparse 64x184x64 GMMA op: half_t C/D values, e4m3 A/B values, RS variant.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a smem descriptor; no FrgTypeA is declared here
+
+  using Shape_MNK = Shape<_64,_184,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<184, 64>;
+  using CLayout = GMMA::CLayout_64x184;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // MMA traits for the sparse 64x184x64 GMMA op: float C/D values, e4m3 A/B values, SS variant.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem descriptors
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_184,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<184, 64>;
+  using CLayout = GMMA::CLayout_64x184;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // MMA traits for the sparse 64x184x64 GMMA op: float C/D values, e4m3 A/B values, RS variant.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a smem descriptor; no FrgTypeA is declared here
+
+  using Shape_MNK = Shape<_64,_184,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<184, 64>;
+  using CLayout = GMMA::CLayout_64x184;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // MMA traits for the sparse 64x200x64 GMMA op: half_t C/D values, e4m3 A/B values, SS variant.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem descriptors
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_200,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<200, 64>;
+  using CLayout = GMMA::CLayout_64x200;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // MMA traits for the sparse 64x200x64 GMMA op: half_t C/D values, e4m3 A/B values, RS variant.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a smem descriptor; no FrgTypeA is declared here
+
+  using Shape_MNK = Shape<_64,_200,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<200, 64>;
+  using CLayout = GMMA::CLayout_64x200;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // MMA traits for the sparse 64x200x64 GMMA op: float C/D values, e4m3 A/B values, SS variant.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem descriptors
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_200,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<200, 64>;
+  using CLayout = GMMA::CLayout_64x200;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // MMA traits for the sparse 64x200x64 GMMA op: float C/D values, e4m3 A/B values, RS variant.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a smem descriptor; no FrgTypeA is declared here
+
+  using Shape_MNK = Shape<_64,_200,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<200, 64>;
+  using CLayout = GMMA::CLayout_64x200;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // MMA traits for the sparse 64x208x64 GMMA op: half_t C/D values, e4m3 A/B values, SS variant.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem descriptors
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_208,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<208, 64>;
+  using CLayout = GMMA::CLayout_64x208;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // MMA traits for the sparse 64x208x64 GMMA op: half_t C/D values, e4m3 A/B values, RS variant.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a smem descriptor; no FrgTypeA is declared here
+
+  using Shape_MNK = Shape<_64,_208,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<208, 64>;
+  using CLayout = GMMA::CLayout_64x208;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // MMA traits for the sparse 64x208x64 GMMA op: float C/D values, e4m3 A/B values, SS variant.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem descriptors
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_208,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<208, 64>;
+  using CLayout = GMMA::CLayout_64x208;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // MMA traits for the sparse 64x208x64 GMMA op: float C/D values, e4m3 A/B values, RS variant.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a smem descriptor; no FrgTypeA is declared here
+
+  using Shape_MNK = Shape<_64,_208,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<208, 64>;
+  using CLayout = GMMA::CLayout_64x208;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // MMA traits for the sparse 64x216x64 GMMA op: half_t C/D values, e4m3 A/B values, SS variant.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem descriptors
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_216,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<216, 64>;
+  using CLayout = GMMA::CLayout_64x216;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // MMA traits for the sparse 64x216x64 GMMA op: half_t C/D values, e4m3 A/B values, RS variant.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a smem descriptor; no FrgTypeA is declared here
+
+  using Shape_MNK = Shape<_64,_216,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<216, 64>;
+  using CLayout = GMMA::CLayout_64x216;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // MMA traits for the sparse 64x216x64 GMMA op: float C/D values, e4m3 A/B values, SS variant.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem descriptors
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_216,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<216, 64>;
+  using CLayout = GMMA::CLayout_64x216;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // MMA traits for the sparse 64x216x64 GMMA op: float C/D values, e4m3 A/B values, RS variant.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a smem descriptor; no FrgTypeA is declared here
+
+  using Shape_MNK = Shape<_64,_216,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<216, 64>;
+  using CLayout = GMMA::CLayout_64x216;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // MMA traits for the sparse 64x224x64 GMMA op: half_t C/D values, e4m3 A/B values, SS variant.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem descriptors
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_224,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<224, 64>;
+  using CLayout = GMMA::CLayout_64x224;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // MMA traits for the sparse 64x224x64 GMMA op: half_t C/D values, e4m3 A/B values, RS variant.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B gets a smem descriptor; no FrgTypeA is declared here
+
+  using Shape_MNK = Shape<_64,_224,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<224, 64>;
+  using CLayout = GMMA::CLayout_64x224;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // MMA traits for the sparse 64x224x64 GMMA op: float C/D values, e4m3 A/B values, SS variant.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;
+  using ValTypeE = sparse_elem<8, uint8_t>;  // NOTE(review): presumably the sparsity-metadata operand -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A and B fragments are both K-major smem descriptors
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+
+  using Shape_MNK = Shape<_64,_224,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<224, 64>;
+  using CLayout = GMMA::CLayout_64x224;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse atom GMMA_64x224x64_F32E4M3E4M3_RS_TN (all scaleA/scaleB/spsel values).
+  using ValTypeD = float;  // D value type; ValTypeC below matches it
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc; no FrgTypeA alias is declared in this struct
+
+  using Shape_MNK = Shape<_64,_224,_64>;  // M = 64, N = 224, K = 64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // A has its own layout type here; B uses GMMA::ABLayout
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<224, 64>;
+  using CLayout = GMMA::CLayout_64x224;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static data member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse atom GMMA_64x232x64_F16E4M3E4M3_SS_TN (all scaleA/scaleB/spsel values).
+  using ValTypeD = half_t;  // D value type; ValTypeC below matches it
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_232,_64>;  // M = 64, N = 232, K = 64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<232, 64>;
+  using CLayout = GMMA::CLayout_64x232;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static data member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse atom GMMA_64x232x64_F16E4M3E4M3_RS_TN (all scaleA/scaleB/spsel values).
+  using ValTypeD = half_t;  // D value type; ValTypeC below matches it
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc; no FrgTypeA alias is declared in this struct
+
+  using Shape_MNK = Shape<_64,_232,_64>;  // M = 64, N = 232, K = 64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // A has its own layout type here; B uses GMMA::ABLayout
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<232, 64>;
+  using CLayout = GMMA::CLayout_64x232;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static data member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse atom GMMA_64x232x64_F32E4M3E4M3_SS_TN (all scaleA/scaleB/spsel values).
+  using ValTypeD = float;  // D value type; ValTypeC below matches it
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_232,_64>;  // M = 64, N = 232, K = 64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<232, 64>;
+  using CLayout = GMMA::CLayout_64x232;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static data member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse atom GMMA_64x232x64_F32E4M3E4M3_RS_TN (all scaleA/scaleB/spsel values).
+  using ValTypeD = float;  // D value type; ValTypeC below matches it
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc; no FrgTypeA alias is declared in this struct
+
+  using Shape_MNK = Shape<_64,_232,_64>;  // M = 64, N = 232, K = 64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // A has its own layout type here; B uses GMMA::ABLayout
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<232, 64>;
+  using CLayout = GMMA::CLayout_64x232;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static data member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse atom GMMA_64x240x64_F16E4M3E4M3_SS_TN (all scaleA/scaleB/spsel values).
+  using ValTypeD = half_t;  // D value type; ValTypeC below matches it
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_240,_64>;  // M = 64, N = 240, K = 64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<240, 64>;
+  using CLayout = GMMA::CLayout_64x240;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static data member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse atom GMMA_64x240x64_F16E4M3E4M3_RS_TN (all scaleA/scaleB/spsel values).
+  using ValTypeD = half_t;  // D value type; ValTypeC below matches it
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc; no FrgTypeA alias is declared in this struct
+
+  using Shape_MNK = Shape<_64,_240,_64>;  // M = 64, N = 240, K = 64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // A has its own layout type here; B uses GMMA::ABLayout
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<240, 64>;
+  using CLayout = GMMA::CLayout_64x240;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static data member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse atom GMMA_64x240x64_F32E4M3E4M3_SS_TN (all scaleA/scaleB/spsel values).
+  using ValTypeD = float;  // D value type; ValTypeC below matches it
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_240,_64>;  // M = 64, N = 240, K = 64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<240, 64>;
+  using CLayout = GMMA::CLayout_64x240;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static data member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse atom GMMA_64x240x64_F32E4M3E4M3_RS_TN (all scaleA/scaleB/spsel values).
+  using ValTypeD = float;  // D value type; ValTypeC below matches it
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc; no FrgTypeA alias is declared in this struct
+
+  using Shape_MNK = Shape<_64,_240,_64>;  // M = 64, N = 240, K = 64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // A has its own layout type here; B uses GMMA::ABLayout
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<240, 64>;
+  using CLayout = GMMA::CLayout_64x240;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static data member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse atom GMMA_64x248x64_F16E4M3E4M3_SS_TN (all scaleA/scaleB/spsel values).
+  using ValTypeD = half_t;  // D value type; ValTypeC below matches it
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_248,_64>;  // M = 64, N = 248, K = 64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<248, 64>;
+  using CLayout = GMMA::CLayout_64x248;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static data member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse atom GMMA_64x248x64_F16E4M3E4M3_RS_TN (all scaleA/scaleB/spsel values).
+  using ValTypeD = half_t;  // D value type; ValTypeC below matches it
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc; no FrgTypeA alias is declared in this struct
+
+  using Shape_MNK = Shape<_64,_248,_64>;  // M = 64, N = 248, K = 64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // A has its own layout type here; B uses GMMA::ABLayout
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<248, 64>;
+  using CLayout = GMMA::CLayout_64x248;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static data member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse atom GMMA_64x248x64_F32E4M3E4M3_SS_TN (all scaleA/scaleB/spsel values).
+  using ValTypeD = float;  // D value type; ValTypeC below matches it
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_248,_64>;  // M = 64, N = 248, K = 64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<248, 64>;
+  using CLayout = GMMA::CLayout_64x248;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static data member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse atom GMMA_64x248x64_F32E4M3E4M3_RS_TN (all scaleA/scaleB/spsel values).
+  using ValTypeD = float;  // D value type; ValTypeC below matches it
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc; no FrgTypeA alias is declared in this struct
+
+  using Shape_MNK = Shape<_64,_248,_64>;  // M = 64, N = 248, K = 64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // A has its own layout type here; B uses GMMA::ABLayout
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<248, 64>;
+  using CLayout = GMMA::CLayout_64x248;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static data member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse atom GMMA_64x24x64_F16E4M3E5M2_SS_TN (all scaleA/scaleB/spsel values).
+  using ValTypeD = half_t;  // D value type; ValTypeC below matches it
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = float_e5m2_t;  // B is e5m2 while A is e4m3
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_24,_64>;  // M = 64, N = 24, K = 64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 24, 64>;
+  using CLayout = GMMA::CLayout_64x24;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static data member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse atom GMMA_64x24x64_F16E4M3E5M2_RS_TN (all scaleA/scaleB/spsel values).
+  using ValTypeD = half_t;  // D value type; ValTypeC below matches it
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = float_e5m2_t;  // B is e5m2 while A is e4m3
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc; no FrgTypeA alias is declared in this struct
+
+  using Shape_MNK = Shape<_64,_24,_64>;  // M = 64, N = 24, K = 64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // A has its own layout type here; B uses GMMA::ABLayout
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 24, 64>;
+  using CLayout = GMMA::CLayout_64x24;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static data member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse atom GMMA_64x24x64_F32E4M3E5M2_SS_TN (all scaleA/scaleB/spsel values).
+  using ValTypeD = float;  // D value type; ValTypeC below matches it
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = float_e5m2_t;  // B is e5m2 while A is e4m3
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_24,_64>;  // M = 64, N = 24, K = 64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 24, 64>;
+  using CLayout = GMMA::CLayout_64x24;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static data member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse atom GMMA_64x24x64_F32E4M3E5M2_RS_TN (all scaleA/scaleB/spsel values).
+  using ValTypeD = float;  // D value type; ValTypeC below matches it
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = float_e5m2_t;  // B is e5m2 while A is e4m3
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc; no FrgTypeA alias is declared in this struct
+
+  using Shape_MNK = Shape<_64,_24,_64>;  // M = 64, N = 24, K = 64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // A has its own layout type here; B uses GMMA::ABLayout
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 24, 64>;
+  using CLayout = GMMA::CLayout_64x24;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static data member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse atom GMMA_64x40x64_F16E4M3E5M2_SS_TN (all scaleA/scaleB/spsel values).
+  using ValTypeD = half_t;  // D value type; ValTypeC below matches it
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = float_e5m2_t;  // B is e5m2 while A is e4m3
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_40,_64>;  // M = 64, N = 40, K = 64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 40, 64>;
+  using CLayout = GMMA::CLayout_64x40;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static data member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse atom GMMA_64x40x64_F16E4M3E5M2_RS_TN (all scaleA/scaleB/spsel values).
+  using ValTypeD = half_t;  // D value type; ValTypeC below matches it
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = float_e5m2_t;  // B is e5m2 while A is e4m3
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc; no FrgTypeA alias is declared in this struct
+
+  using Shape_MNK = Shape<_64,_40,_64>;  // M = 64, N = 40, K = 64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // A has its own layout type here; B uses GMMA::ABLayout
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 40, 64>;
+  using CLayout = GMMA::CLayout_64x40;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static data member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse atom GMMA_64x40x64_F32E4M3E5M2_SS_TN (all scaleA/scaleB/spsel values).
+  using ValTypeD = float;  // D value type; ValTypeC below matches it
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = float_e5m2_t;  // B is e5m2 while A is e4m3
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_40,_64>;  // M = 64, N = 40, K = 64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 40, 64>;
+  using CLayout = GMMA::CLayout_64x40;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static data member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse atom GMMA_64x40x64_F32E4M3E5M2_RS_TN (all scaleA/scaleB/spsel values).
+  using ValTypeD = float;  // D value type; ValTypeC below matches it
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = float_e5m2_t;  // B is e5m2 while A is e4m3
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc; no FrgTypeA alias is declared in this struct
+
+  using Shape_MNK = Shape<_64,_40,_64>;  // M = 64, N = 40, K = 64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // A has its own layout type here; B uses GMMA::ABLayout
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 40, 64>;
+  using CLayout = GMMA::CLayout_64x40;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static data member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse atom GMMA_64x48x64_F16E4M3E5M2_SS_TN (all scaleA/scaleB/spsel values).
+  using ValTypeD = half_t;  // D value type; ValTypeC below matches it
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = float_e5m2_t;  // B is e5m2 while A is e4m3
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_48,_64>;  // M = 64, N = 48, K = 64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 48, 64>;
+  using CLayout = GMMA::CLayout_64x48;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static data member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse atom GMMA_64x48x64_F16E4M3E5M2_RS_TN (all scaleA/scaleB/spsel values).
+  using ValTypeD = half_t;  // D value type; ValTypeC below matches it
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = float_e5m2_t;  // B is e5m2 while A is e4m3
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc; no FrgTypeA alias is declared in this struct
+
+  using Shape_MNK = Shape<_64,_48,_64>;  // M = 64, N = 48, K = 64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // A has its own layout type here; B uses GMMA::ABLayout
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 48, 64>;
+  using CLayout = GMMA::CLayout_64x48;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static data member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse atom GMMA_64x48x64_F32E4M3E5M2_SS_TN (all scaleA/scaleB/spsel values).
+  using ValTypeD = float;  // D value type; ValTypeC below matches it
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = float_e5m2_t;  // B is e5m2 while A is e4m3
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_48,_64>;  // M = 64, N = 48, K = 64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 48, 64>;
+  using CLayout = GMMA::CLayout_64x48;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static data member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse atom GMMA_64x48x64_F32E4M3E5M2_RS_TN (all scaleA/scaleB/spsel values).
+  using ValTypeD = float;  // D value type; ValTypeC below matches it
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = float_e5m2_t;  // B is e5m2 while A is e4m3
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc; no FrgTypeA alias is declared in this struct
+
+  using Shape_MNK = Shape<_64,_48,_64>;  // M = 64, N = 48, K = 64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // A has its own layout type here; B uses GMMA::ABLayout
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 48, 64>;
+  using CLayout = GMMA::CLayout_64x48;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static data member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse atom GMMA_64x56x64_F16E4M3E5M2_SS_TN (all scaleA/scaleB/spsel values).
+  using ValTypeD = half_t;  // D value type; ValTypeC below matches it
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = float_e5m2_t;  // B is e5m2 while A is e4m3
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_56,_64>;  // M = 64, N = 56, K = 64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 56, 64>;
+  using CLayout = GMMA::CLayout_64x56;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static data member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse atom GMMA_64x56x64_F16E4M3E5M2_RS_TN (all scaleA/scaleB/spsel values).
+  using ValTypeD = half_t;  // D value type; ValTypeC below matches it
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = float_e5m2_t;  // B is e5m2 while A is e4m3
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc; no FrgTypeA alias is declared in this struct
+
+  using Shape_MNK = Shape<_64,_56,_64>;  // M = 64, N = 56, K = 64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // A has its own layout type here; B uses GMMA::ABLayout
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 56, 64>;
+  using CLayout = GMMA::CLayout_64x56;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static data member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse atom GMMA_64x56x64_F32E4M3E5M2_SS_TN (all scaleA/scaleB/spsel values).
+  using ValTypeD = float;  // D value type; ValTypeC below matches it
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = float_e5m2_t;  // B is e5m2 while A is e4m3
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_56,_64>;  // M = 64, N = 56, K = 64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 56, 64>;
+  using CLayout = GMMA::CLayout_64x56;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static data member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse atom GMMA_64x56x64_F32E4M3E5M2_RS_TN (all scaleA/scaleB/spsel values).
+  using ValTypeD = float;  // D value type; ValTypeC below matches it
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = float_e5m2_t;  // B is e5m2 while A is e4m3
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc; no FrgTypeA alias is declared in this struct
+
+  using Shape_MNK = Shape<_64,_56,_64>;  // M = 64, N = 56, K = 64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // A has its own layout type here; B uses GMMA::ABLayout
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 56, 64>;
+  using CLayout = GMMA::CLayout_64x56;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static data member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse atom GMMA_64x72x64_F16E4M3E5M2_SS_TN (all scaleA/scaleB/spsel values).
+  using ValTypeD = half_t;  // D value type; ValTypeC below matches it
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = float_e5m2_t;  // B is e5m2 while A is e4m3
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_72,_64>;  // M = 64, N = 72, K = 64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 72, 64>;
+  using CLayout = GMMA::CLayout_64x72;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static data member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse atom GMMA_64x72x64_F16E4M3E5M2_RS_TN (all scaleA/scaleB/spsel values).
+  using ValTypeD = half_t;  // D value type; ValTypeC below matches it
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = float_e5m2_t;  // B is e5m2 while A is e4m3
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc; no FrgTypeA alias is declared in this struct
+
+  using Shape_MNK = Shape<_64,_72,_64>;  // M = 64, N = 72, K = 64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // A has its own layout type here; B uses GMMA::ABLayout
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 72, 64>;
+  using CLayout = GMMA::CLayout_64x72;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static data member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse atom GMMA_64x72x64_F32E4M3E5M2_SS_TN (all scaleA/scaleB/spsel values).
+  using ValTypeD = float;  // D value type; ValTypeC below matches it
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = float_e5m2_t;  // B is e5m2 while A is e4m3
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_72,_64>;  // M = 64, N = 72, K = 64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 72, 64>;
+  using CLayout = GMMA::CLayout_64x72;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static data member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse atom GMMA_64x72x64_F32E4M3E5M2_RS_TN (all scaleA/scaleB/spsel values).
+  using ValTypeD = float;  // D value type; ValTypeC below matches it
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = float_e5m2_t;  // B is e5m2 while A is e4m3
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc; no FrgTypeA alias is declared in this struct
+
+  using Shape_MNK = Shape<_64,_72,_64>;  // M = 64, N = 72, K = 64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // A has its own layout type here; B uses GMMA::ABLayout
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 72, 64>;
+  using CLayout = GMMA::CLayout_64x72;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static data member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse atom GMMA_64x80x64_F16E4M3E5M2_SS_TN (all scaleA/scaleB/spsel values).
+  using ValTypeD = half_t;  // D value type; ValTypeC below matches it
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = float_e5m2_t;  // B is e5m2 while A is e4m3
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_80,_64>;  // M = 64, N = 80, K = 64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 80, 64>;
+  using CLayout = GMMA::CLayout_64x80;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static data member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse atom GMMA_64x80x64_F16E4M3E5M2_RS_TN (all scaleA/scaleB/spsel values).
+  using ValTypeD = half_t;  // D value type; ValTypeC below matches it
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = float_e5m2_t;  // B is e5m2 while A is e4m3
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc; no FrgTypeA alias is declared in this struct
+
+  using Shape_MNK = Shape<_64,_80,_64>;  // M = 64, N = 80, K = 64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // A has its own layout type here; B uses GMMA::ABLayout
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 80, 64>;
+  using CLayout = GMMA::CLayout_64x80;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static data member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse atom GMMA_64x80x64_F32E4M3E5M2_SS_TN (all scaleA/scaleB/spsel values).
+  using ValTypeD = float;  // D value type; ValTypeC below matches it
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = float_e5m2_t;  // B is e5m2 while A is e4m3
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_80,_64>;  // M = 64, N = 80, K = 64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 80, 64>;
+  using CLayout = GMMA::CLayout_64x80;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static data member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse atom GMMA_64x80x64_F32E4M3E5M2_RS_TN (all scaleA/scaleB/spsel values).
+  using ValTypeD = float;  // D value type; ValTypeC below matches it
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = float_e5m2_t;  // B is e5m2 while A is e4m3
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc; no FrgTypeA alias is declared in this struct
+
+  using Shape_MNK = Shape<_64,_80,_64>;  // M = 64, N = 80, K = 64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // A has its own layout type here; B uses GMMA::ABLayout
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 80, 64>;
+  using CLayout = GMMA::CLayout_64x80;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static data member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse atom GMMA_64x88x64_F16E4M3E5M2_SS_TN (all scaleA/scaleB/spsel values).
+  using ValTypeD = half_t;  // D value type; ValTypeC below matches it
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata (TODO confirm)
+  using ValTypeB = float_e5m2_t;  // B is e5m2 while A is e4m3
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment type: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment type: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_88,_64>;  // M = 64, N = 88, K = 64
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 88, 64>;
+  using CLayout = GMMA::CLayout_64x88;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static data member, initialized to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{ // MMA_Traits for the SM90 sparse GMMA 64x88x64 RS atom: e4m3 A, e5m2 B, half_t C/D.
+  using ValTypeD = half_t;  // element type of the D output
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A: e4m3 in sparse_elem<2> (presumably 2:1 compressed; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t in sparse_elem<8> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;  // B: plain e5m2, not wrapped in sparse_elem
+  using ValTypeC = half_t;  // element type of the C input
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor; no FrgTypeA is declared (presumably A comes from registers; confirm)
+
+  using Shape_MNK = Shape<_64,_88,_64>;  // atom extents M=64, N=88, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout for the 64x64 tile (not ABLayout)
+  using ELayout = GMMA::ELayout_64x64;  // E layout for the 64x64 tile
+  using BLayout = GMMA::ABLayout< 88, 64>;  // B layout from ABLayout with N=88, K=64
+  using CLayout = GMMA::CLayout_64x88;  // C layout for the 64x88 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{ // MMA_Traits for the SM90 sparse GMMA 64x88x64 SS atom: e4m3 A, e5m2 B, float C/D.
+  using ValTypeD = float;  // element type of the D output
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A: e4m3 in sparse_elem<2> (presumably 2:1 compressed; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t in sparse_elem<8> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;  // B: plain e5m2, not wrapped in sparse_elem
+  using ValTypeC = float;  // element type of the C input
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_88,_64>;  // atom extents M=64, N=88, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout from ABLayout with M=64, K=64
+  using ELayout = GMMA::ELayout_64x64;  // E layout for the 64x64 tile
+  using BLayout = GMMA::ABLayout< 88, 64>;  // B layout from ABLayout with N=88, K=64
+  using CLayout = GMMA::CLayout_64x88;  // C layout for the 64x88 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{ // MMA_Traits for the SM90 sparse GMMA 64x88x64 RS atom: e4m3 A, e5m2 B, float C/D.
+  using ValTypeD = float;  // element type of the D output
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A: e4m3 in sparse_elem<2> (presumably 2:1 compressed; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t in sparse_elem<8> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;  // B: plain e5m2, not wrapped in sparse_elem
+  using ValTypeC = float;  // element type of the C input
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor; no FrgTypeA is declared (presumably A comes from registers; confirm)
+
+  using Shape_MNK = Shape<_64,_88,_64>;  // atom extents M=64, N=88, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout for the 64x64 tile (not ABLayout)
+  using ELayout = GMMA::ELayout_64x64;  // E layout for the 64x64 tile
+  using BLayout = GMMA::ABLayout< 88, 64>;  // B layout from ABLayout with N=88, K=64
+  using CLayout = GMMA::CLayout_64x88;  // C layout for the 64x88 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{ // MMA_Traits for the SM90 sparse GMMA 64x104x64 SS atom: e4m3 A, e5m2 B, half_t C/D.
+  using ValTypeD = half_t;  // element type of the D output
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A: e4m3 in sparse_elem<2> (presumably 2:1 compressed; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t in sparse_elem<8> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;  // B: plain e5m2, not wrapped in sparse_elem
+  using ValTypeC = half_t;  // element type of the C input
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_104,_64>;  // atom extents M=64, N=104, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout from ABLayout with M=64, K=64
+  using ELayout = GMMA::ELayout_64x64;  // E layout for the 64x64 tile
+  using BLayout = GMMA::ABLayout<104, 64>;  // B layout from ABLayout with N=104, K=64
+  using CLayout = GMMA::CLayout_64x104;  // C layout for the 64x104 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{ // MMA_Traits for the SM90 sparse GMMA 64x104x64 RS atom: e4m3 A, e5m2 B, half_t C/D.
+  using ValTypeD = half_t;  // element type of the D output
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A: e4m3 in sparse_elem<2> (presumably 2:1 compressed; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t in sparse_elem<8> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;  // B: plain e5m2, not wrapped in sparse_elem
+  using ValTypeC = half_t;  // element type of the C input
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor; no FrgTypeA is declared (presumably A comes from registers; confirm)
+
+  using Shape_MNK = Shape<_64,_104,_64>;  // atom extents M=64, N=104, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout for the 64x64 tile (not ABLayout)
+  using ELayout = GMMA::ELayout_64x64;  // E layout for the 64x64 tile
+  using BLayout = GMMA::ABLayout<104, 64>;  // B layout from ABLayout with N=104, K=64
+  using CLayout = GMMA::CLayout_64x104;  // C layout for the 64x104 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{ // MMA_Traits for the SM90 sparse GMMA 64x104x64 SS atom: e4m3 A, e5m2 B, float C/D.
+  using ValTypeD = float;  // element type of the D output
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A: e4m3 in sparse_elem<2> (presumably 2:1 compressed; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t in sparse_elem<8> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;  // B: plain e5m2, not wrapped in sparse_elem
+  using ValTypeC = float;  // element type of the C input
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_104,_64>;  // atom extents M=64, N=104, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout from ABLayout with M=64, K=64
+  using ELayout = GMMA::ELayout_64x64;  // E layout for the 64x64 tile
+  using BLayout = GMMA::ABLayout<104, 64>;  // B layout from ABLayout with N=104, K=64
+  using CLayout = GMMA::CLayout_64x104;  // C layout for the 64x104 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{ // MMA_Traits for the SM90 sparse GMMA 64x104x64 RS atom: e4m3 A, e5m2 B, float C/D.
+  using ValTypeD = float;  // element type of the D output
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A: e4m3 in sparse_elem<2> (presumably 2:1 compressed; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t in sparse_elem<8> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;  // B: plain e5m2, not wrapped in sparse_elem
+  using ValTypeC = float;  // element type of the C input
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor; no FrgTypeA is declared (presumably A comes from registers; confirm)
+
+  using Shape_MNK = Shape<_64,_104,_64>;  // atom extents M=64, N=104, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout for the 64x64 tile (not ABLayout)
+  using ELayout = GMMA::ELayout_64x64;  // E layout for the 64x64 tile
+  using BLayout = GMMA::ABLayout<104, 64>;  // B layout from ABLayout with N=104, K=64
+  using CLayout = GMMA::CLayout_64x104;  // C layout for the 64x104 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{ // MMA_Traits for the SM90 sparse GMMA 64x112x64 SS atom: e4m3 A, e5m2 B, half_t C/D.
+  using ValTypeD = half_t;  // element type of the D output
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A: e4m3 in sparse_elem<2> (presumably 2:1 compressed; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t in sparse_elem<8> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;  // B: plain e5m2, not wrapped in sparse_elem
+  using ValTypeC = half_t;  // element type of the C input
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_112,_64>;  // atom extents M=64, N=112, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout from ABLayout with M=64, K=64
+  using ELayout = GMMA::ELayout_64x64;  // E layout for the 64x64 tile
+  using BLayout = GMMA::ABLayout<112, 64>;  // B layout from ABLayout with N=112, K=64
+  using CLayout = GMMA::CLayout_64x112;  // C layout for the 64x112 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{ // MMA_Traits for the SM90 sparse GMMA 64x112x64 RS atom: e4m3 A, e5m2 B, half_t C/D.
+  using ValTypeD = half_t;  // element type of the D output
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A: e4m3 in sparse_elem<2> (presumably 2:1 compressed; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t in sparse_elem<8> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;  // B: plain e5m2, not wrapped in sparse_elem
+  using ValTypeC = half_t;  // element type of the C input
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor; no FrgTypeA is declared (presumably A comes from registers; confirm)
+
+  using Shape_MNK = Shape<_64,_112,_64>;  // atom extents M=64, N=112, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout for the 64x64 tile (not ABLayout)
+  using ELayout = GMMA::ELayout_64x64;  // E layout for the 64x64 tile
+  using BLayout = GMMA::ABLayout<112, 64>;  // B layout from ABLayout with N=112, K=64
+  using CLayout = GMMA::CLayout_64x112;  // C layout for the 64x112 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{ // MMA_Traits for the SM90 sparse GMMA 64x112x64 SS atom: e4m3 A, e5m2 B, float C/D.
+  using ValTypeD = float;  // element type of the D output
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A: e4m3 in sparse_elem<2> (presumably 2:1 compressed; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t in sparse_elem<8> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;  // B: plain e5m2, not wrapped in sparse_elem
+  using ValTypeC = float;  // element type of the C input
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_112,_64>;  // atom extents M=64, N=112, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout from ABLayout with M=64, K=64
+  using ELayout = GMMA::ELayout_64x64;  // E layout for the 64x64 tile
+  using BLayout = GMMA::ABLayout<112, 64>;  // B layout from ABLayout with N=112, K=64
+  using CLayout = GMMA::CLayout_64x112;  // C layout for the 64x112 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{ // MMA_Traits for the SM90 sparse GMMA 64x112x64 RS atom: e4m3 A, e5m2 B, float C/D.
+  using ValTypeD = float;  // element type of the D output
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A: e4m3 in sparse_elem<2> (presumably 2:1 compressed; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t in sparse_elem<8> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;  // B: plain e5m2, not wrapped in sparse_elem
+  using ValTypeC = float;  // element type of the C input
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor; no FrgTypeA is declared (presumably A comes from registers; confirm)
+
+  using Shape_MNK = Shape<_64,_112,_64>;  // atom extents M=64, N=112, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout for the 64x64 tile (not ABLayout)
+  using ELayout = GMMA::ELayout_64x64;  // E layout for the 64x64 tile
+  using BLayout = GMMA::ABLayout<112, 64>;  // B layout from ABLayout with N=112, K=64
+  using CLayout = GMMA::CLayout_64x112;  // C layout for the 64x112 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{ // MMA_Traits for the SM90 sparse GMMA 64x120x64 SS atom: e4m3 A, e5m2 B, half_t C/D.
+  using ValTypeD = half_t;  // element type of the D output
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A: e4m3 in sparse_elem<2> (presumably 2:1 compressed; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t in sparse_elem<8> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;  // B: plain e5m2, not wrapped in sparse_elem
+  using ValTypeC = half_t;  // element type of the C input
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_120,_64>;  // atom extents M=64, N=120, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout from ABLayout with M=64, K=64
+  using ELayout = GMMA::ELayout_64x64;  // E layout for the 64x64 tile
+  using BLayout = GMMA::ABLayout<120, 64>;  // B layout from ABLayout with N=120, K=64
+  using CLayout = GMMA::CLayout_64x120;  // C layout for the 64x120 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{ // MMA_Traits for the SM90 sparse GMMA 64x120x64 RS atom: e4m3 A, e5m2 B, half_t C/D.
+  using ValTypeD = half_t;  // element type of the D output
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A: e4m3 in sparse_elem<2> (presumably 2:1 compressed; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t in sparse_elem<8> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;  // B: plain e5m2, not wrapped in sparse_elem
+  using ValTypeC = half_t;  // element type of the C input
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor; no FrgTypeA is declared (presumably A comes from registers; confirm)
+
+  using Shape_MNK = Shape<_64,_120,_64>;  // atom extents M=64, N=120, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout for the 64x64 tile (not ABLayout)
+  using ELayout = GMMA::ELayout_64x64;  // E layout for the 64x64 tile
+  using BLayout = GMMA::ABLayout<120, 64>;  // B layout from ABLayout with N=120, K=64
+  using CLayout = GMMA::CLayout_64x120;  // C layout for the 64x120 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{ // MMA_Traits for the SM90 sparse GMMA 64x120x64 SS atom: e4m3 A, e5m2 B, float C/D.
+  using ValTypeD = float;  // element type of the D output
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A: e4m3 in sparse_elem<2> (presumably 2:1 compressed; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t in sparse_elem<8> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;  // B: plain e5m2, not wrapped in sparse_elem
+  using ValTypeC = float;  // element type of the C input
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_120,_64>;  // atom extents M=64, N=120, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout from ABLayout with M=64, K=64
+  using ELayout = GMMA::ELayout_64x64;  // E layout for the 64x64 tile
+  using BLayout = GMMA::ABLayout<120, 64>;  // B layout from ABLayout with N=120, K=64
+  using CLayout = GMMA::CLayout_64x120;  // C layout for the 64x120 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{ // MMA_Traits for the SM90 sparse GMMA 64x120x64 RS atom: e4m3 A, e5m2 B, float C/D.
+  using ValTypeD = float;  // element type of the D output
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A: e4m3 in sparse_elem<2> (presumably 2:1 compressed; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t in sparse_elem<8> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;  // B: plain e5m2, not wrapped in sparse_elem
+  using ValTypeC = float;  // element type of the C input
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor; no FrgTypeA is declared (presumably A comes from registers; confirm)
+
+  using Shape_MNK = Shape<_64,_120,_64>;  // atom extents M=64, N=120, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout for the 64x64 tile (not ABLayout)
+  using ELayout = GMMA::ELayout_64x64;  // E layout for the 64x64 tile
+  using BLayout = GMMA::ABLayout<120, 64>;  // B layout from ABLayout with N=120, K=64
+  using CLayout = GMMA::CLayout_64x120;  // C layout for the 64x120 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{ // MMA_Traits for the SM90 sparse GMMA 64x136x64 SS atom: e4m3 A, e5m2 B, half_t C/D.
+  using ValTypeD = half_t;  // element type of the D output
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A: e4m3 in sparse_elem<2> (presumably 2:1 compressed; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t in sparse_elem<8> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;  // B: plain e5m2, not wrapped in sparse_elem
+  using ValTypeC = half_t;  // element type of the C input
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_136,_64>;  // atom extents M=64, N=136, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout from ABLayout with M=64, K=64
+  using ELayout = GMMA::ELayout_64x64;  // E layout for the 64x64 tile
+  using BLayout = GMMA::ABLayout<136, 64>;  // B layout from ABLayout with N=136, K=64
+  using CLayout = GMMA::CLayout_64x136;  // C layout for the 64x136 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{ // MMA_Traits for the SM90 sparse GMMA 64x136x64 RS atom: e4m3 A, e5m2 B, half_t C/D.
+  using ValTypeD = half_t;  // element type of the D output
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A: e4m3 in sparse_elem<2> (presumably 2:1 compressed; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t in sparse_elem<8> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;  // B: plain e5m2, not wrapped in sparse_elem
+  using ValTypeC = half_t;  // element type of the C input
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor; no FrgTypeA is declared (presumably A comes from registers; confirm)
+
+  using Shape_MNK = Shape<_64,_136,_64>;  // atom extents M=64, N=136, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout for the 64x64 tile (not ABLayout)
+  using ELayout = GMMA::ELayout_64x64;  // E layout for the 64x64 tile
+  using BLayout = GMMA::ABLayout<136, 64>;  // B layout from ABLayout with N=136, K=64
+  using CLayout = GMMA::CLayout_64x136;  // C layout for the 64x136 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{ // MMA_Traits for the SM90 sparse GMMA 64x136x64 SS atom: e4m3 A, e5m2 B, float C/D.
+  using ValTypeD = float;  // element type of the D output
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A: e4m3 in sparse_elem<2> (presumably 2:1 compressed; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t in sparse_elem<8> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;  // B: plain e5m2, not wrapped in sparse_elem
+  using ValTypeC = float;  // element type of the C input
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_136,_64>;  // atom extents M=64, N=136, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout from ABLayout with M=64, K=64
+  using ELayout = GMMA::ELayout_64x64;  // E layout for the 64x64 tile
+  using BLayout = GMMA::ABLayout<136, 64>;  // B layout from ABLayout with N=136, K=64
+  using CLayout = GMMA::CLayout_64x136;  // C layout for the 64x136 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{ // MMA_Traits for the SM90 sparse GMMA 64x136x64 RS atom: e4m3 A, e5m2 B, float C/D.
+  using ValTypeD = float;  // element type of the D output
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A: e4m3 in sparse_elem<2> (presumably 2:1 compressed; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t in sparse_elem<8> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;  // B: plain e5m2, not wrapped in sparse_elem
+  using ValTypeC = float;  // element type of the C input
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor; no FrgTypeA is declared (presumably A comes from registers; confirm)
+
+  using Shape_MNK = Shape<_64,_136,_64>;  // atom extents M=64, N=136, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout for the 64x64 tile (not ABLayout)
+  using ELayout = GMMA::ELayout_64x64;  // E layout for the 64x64 tile
+  using BLayout = GMMA::ABLayout<136, 64>;  // B layout from ABLayout with N=136, K=64
+  using CLayout = GMMA::CLayout_64x136;  // C layout for the 64x136 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{ // MMA_Traits for the SM90 sparse GMMA 64x144x64 SS atom: e4m3 A, e5m2 B, half_t C/D.
+  using ValTypeD = half_t;  // element type of the D output
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A: e4m3 in sparse_elem<2> (presumably 2:1 compressed; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t in sparse_elem<8> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;  // B: plain e5m2, not wrapped in sparse_elem
+  using ValTypeC = half_t;  // element type of the C input
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_144,_64>;  // atom extents M=64, N=144, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout from ABLayout with M=64, K=64
+  using ELayout = GMMA::ELayout_64x64;  // E layout for the 64x64 tile
+  using BLayout = GMMA::ABLayout<144, 64>;  // B layout from ABLayout with N=144, K=64
+  using CLayout = GMMA::CLayout_64x144;  // C layout for the 64x144 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{ // MMA_Traits for the SM90 sparse GMMA 64x144x64 RS atom: e4m3 A, e5m2 B, half_t C/D.
+  using ValTypeD = half_t;  // element type of the D output
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A: e4m3 in sparse_elem<2> (presumably 2:1 compressed; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t in sparse_elem<8> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;  // B: plain e5m2, not wrapped in sparse_elem
+  using ValTypeC = half_t;  // element type of the C input
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor; no FrgTypeA is declared (presumably A comes from registers; confirm)
+
+  using Shape_MNK = Shape<_64,_144,_64>;  // atom extents M=64, N=144, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout for the 64x64 tile (not ABLayout)
+  using ELayout = GMMA::ELayout_64x64;  // E layout for the 64x64 tile
+  using BLayout = GMMA::ABLayout<144, 64>;  // B layout from ABLayout with N=144, K=64
+  using CLayout = GMMA::CLayout_64x144;  // C layout for the 64x144 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{ // MMA_Traits for the SM90 sparse GMMA 64x144x64 SS atom: e4m3 A, e5m2 B, float C/D.
+  using ValTypeD = float;  // element type of the D output
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A: e4m3 in sparse_elem<2> (presumably 2:1 compressed; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t in sparse_elem<8> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;  // B: plain e5m2, not wrapped in sparse_elem
+  using ValTypeC = float;  // element type of the C input
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_144,_64>;  // atom extents M=64, N=144, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout from ABLayout with M=64, K=64
+  using ELayout = GMMA::ELayout_64x64;  // E layout for the 64x64 tile
+  using BLayout = GMMA::ABLayout<144, 64>;  // B layout from ABLayout with N=144, K=64
+  using CLayout = GMMA::CLayout_64x144;  // C layout for the 64x144 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{ // MMA_Traits for the SM90 sparse GMMA 64x144x64 RS atom: e4m3 A, e5m2 B, float C/D.
+  using ValTypeD = float;  // element type of the D output
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A: e4m3 in sparse_elem<2> (presumably 2:1 compressed; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t in sparse_elem<8> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;  // B: plain e5m2, not wrapped in sparse_elem
+  using ValTypeC = float;  // element type of the C input
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor; no FrgTypeA is declared (presumably A comes from registers; confirm)
+
+  using Shape_MNK = Shape<_64,_144,_64>;  // atom extents M=64, N=144, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout for the 64x64 tile (not ABLayout)
+  using ELayout = GMMA::ELayout_64x64;  // E layout for the 64x64 tile
+  using BLayout = GMMA::ABLayout<144, 64>;  // B layout from ABLayout with N=144, K=64
+  using CLayout = GMMA::CLayout_64x144;  // C layout for the 64x144 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{ // MMA_Traits for the SM90 sparse GMMA 64x152x64 SS atom: e4m3 A, e5m2 B, half_t C/D.
+  using ValTypeD = half_t;  // element type of the D output
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A: e4m3 in sparse_elem<2> (presumably 2:1 compressed; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t in sparse_elem<8> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;  // B: plain e5m2, not wrapped in sparse_elem
+  using ValTypeC = half_t;  // element type of the C input
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_152,_64>;  // atom extents M=64, N=152, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout from ABLayout with M=64, K=64
+  using ELayout = GMMA::ELayout_64x64;  // E layout for the 64x64 tile
+  using BLayout = GMMA::ABLayout<152, 64>;  // B layout from ABLayout with N=152, K=64
+  using CLayout = GMMA::CLayout_64x152;  // C layout for the 64x152 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{ // MMA_Traits for the SM90 sparse GMMA 64x152x64 RS atom: e4m3 A, e5m2 B, half_t C/D.
+  using ValTypeD = half_t;  // element type of the D output
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A: e4m3 in sparse_elem<2> (presumably 2:1 compressed; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t in sparse_elem<8> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;  // B: plain e5m2, not wrapped in sparse_elem
+  using ValTypeC = half_t;  // element type of the C input
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor; no FrgTypeA is declared (presumably A comes from registers; confirm)
+
+  using Shape_MNK = Shape<_64,_152,_64>;  // atom extents M=64, N=152, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout for the 64x64 tile (not ABLayout)
+  using ELayout = GMMA::ELayout_64x64;  // E layout for the 64x64 tile
+  using BLayout = GMMA::ABLayout<152, 64>;  // B layout from ABLayout with N=152, K=64
+  using CLayout = GMMA::CLayout_64x152;  // C layout for the 64x152 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{ // MMA_Traits for the SM90 sparse GMMA 64x152x64 SS atom: e4m3 A, e5m2 B, float C/D.
+  using ValTypeD = float;  // element type of the D output
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A: e4m3 in sparse_elem<2> (presumably 2:1 compressed; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t in sparse_elem<8> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;  // B: plain e5m2, not wrapped in sparse_elem
+  using ValTypeC = float;  // element type of the C input
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_152,_64>;  // atom extents M=64, N=152, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout from ABLayout with M=64, K=64
+  using ELayout = GMMA::ELayout_64x64;  // E layout for the 64x64 tile
+  using BLayout = GMMA::ABLayout<152, 64>;  // B layout from ABLayout with N=152, K=64
+  using CLayout = GMMA::CLayout_64x152;  // C layout for the 64x152 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{ // MMA_Traits for the SM90 sparse GMMA 64x152x64 RS atom: e4m3 A, e5m2 B, float C/D.
+  using ValTypeD = float;  // element type of the D output
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A: e4m3 in sparse_elem<2> (presumably 2:1 compressed; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t in sparse_elem<8> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;  // B: plain e5m2, not wrapped in sparse_elem
+  using ValTypeC = float;  // element type of the C input
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor; no FrgTypeA is declared (presumably A comes from registers; confirm)
+
+  using Shape_MNK = Shape<_64,_152,_64>;  // atom extents M=64, N=152, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout for the 64x64 tile (not ABLayout)
+  using ELayout = GMMA::ELayout_64x64;  // E layout for the 64x64 tile
+  using BLayout = GMMA::ABLayout<152, 64>;  // B layout from ABLayout with N=152, K=64
+  using CLayout = GMMA::CLayout_64x152;  // C layout for the 64x152 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{ // MMA_Traits for the SM90 sparse GMMA 64x160x64 SS atom: e4m3 A, e5m2 B, half_t C/D.
+  using ValTypeD = half_t;  // element type of the D output
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A: e4m3 in sparse_elem<2> (presumably 2:1 compressed; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t in sparse_elem<8> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;  // B: plain e5m2, not wrapped in sparse_elem
+  using ValTypeC = half_t;  // element type of the C input
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_160,_64>;  // atom extents M=64, N=160, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout from ABLayout with M=64, K=64
+  using ELayout = GMMA::ELayout_64x64;  // E layout for the 64x64 tile
+  using BLayout = GMMA::ABLayout<160, 64>;  // B layout from ABLayout with N=160, K=64
+  using CLayout = GMMA::CLayout_64x160;  // C layout for the 64x160 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{ // MMA_Traits for the SM90 sparse GMMA 64x160x64 RS atom: e4m3 A, e5m2 B, half_t C/D.
+  using ValTypeD = half_t;  // element type of the D output
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A: e4m3 in sparse_elem<2> (presumably 2:1 compressed; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t in sparse_elem<8> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;  // B: plain e5m2, not wrapped in sparse_elem
+  using ValTypeC = half_t;  // element type of the C input
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor; no FrgTypeA is declared (presumably A comes from registers; confirm)
+
+  using Shape_MNK = Shape<_64,_160,_64>;  // atom extents M=64, N=160, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout for the 64x64 tile (not ABLayout)
+  using ELayout = GMMA::ELayout_64x64;  // E layout for the 64x64 tile
+  using BLayout = GMMA::ABLayout<160, 64>;  // B layout from ABLayout with N=160, K=64
+  using CLayout = GMMA::CLayout_64x160;  // C layout for the 64x160 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{ // MMA_Traits for the SM90 sparse GMMA 64x160x64 SS atom: e4m3 A, e5m2 B, float C/D.
+  using ValTypeD = float;  // element type of the D output
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A: e4m3 in sparse_elem<2> (presumably 2:1 compressed; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t in sparse_elem<8> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;  // B: plain e5m2, not wrapped in sparse_elem
+  using ValTypeC = float;  // element type of the C input
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_160,_64>;  // atom extents M=64, N=160, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout from ABLayout with M=64, K=64
+  using ELayout = GMMA::ELayout_64x64;  // E layout for the 64x64 tile
+  using BLayout = GMMA::ABLayout<160, 64>;  // B layout from ABLayout with N=160, K=64
+  using CLayout = GMMA::CLayout_64x160;  // C layout for the 64x160 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{ // MMA_Traits for the SM90 sparse GMMA 64x160x64 RS atom: e4m3 A, e5m2 B, float C/D.
+  using ValTypeD = float;  // element type of the D output
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A: e4m3 in sparse_elem<2> (presumably 2:1 compressed; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t in sparse_elem<8> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;  // B: plain e5m2, not wrapped in sparse_elem
+  using ValTypeC = float;  // element type of the C input
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor; no FrgTypeA is declared (presumably A comes from registers; confirm)
+
+  using Shape_MNK = Shape<_64,_160,_64>;  // atom extents M=64, N=160, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout for the 64x64 tile (not ABLayout)
+  using ELayout = GMMA::ELayout_64x64;  // E layout for the 64x64 tile
+  using BLayout = GMMA::ABLayout<160, 64>;  // B layout from ABLayout with N=160, K=64
+  using CLayout = GMMA::CLayout_64x160;  // C layout for the 64x160 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{ // MMA_Traits for the SM90 sparse GMMA 64x168x64 SS atom: e4m3 A, e5m2 B, half_t C/D.
+  using ValTypeD = half_t;  // element type of the D output
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A: e4m3 in sparse_elem<2> (presumably 2:1 compressed; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t in sparse_elem<8> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;  // B: plain e5m2, not wrapped in sparse_elem
+  using ValTypeC = half_t;  // element type of the C input
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_168,_64>;  // atom extents M=64, N=168, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout from ABLayout with M=64, K=64
+  using ELayout = GMMA::ELayout_64x64;  // E layout for the 64x64 tile
+  using BLayout = GMMA::ABLayout<168, 64>;  // B layout from ABLayout with N=168, K=64
+  using CLayout = GMMA::CLayout_64x168;  // C layout for the 64x168 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{ // MMA_Traits for the SM90 sparse GMMA 64x168x64 RS atom: e4m3 A, e5m2 B, half_t C/D.
+  using ValTypeD = half_t;  // element type of the D output
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A: e4m3 in sparse_elem<2> (presumably 2:1 compressed; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t in sparse_elem<8> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;  // B: plain e5m2, not wrapped in sparse_elem
+  using ValTypeC = half_t;  // element type of the C input
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor; no FrgTypeA is declared (presumably A comes from registers; confirm)
+
+  using Shape_MNK = Shape<_64,_168,_64>;  // atom extents M=64, N=168, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout for the 64x64 tile (not ABLayout)
+  using ELayout = GMMA::ELayout_64x64;  // E layout for the 64x64 tile
+  using BLayout = GMMA::ABLayout<168, 64>;  // B layout from ABLayout with N=168, K=64
+  using CLayout = GMMA::CLayout_64x168;  // C layout for the 64x168 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{ // MMA_Traits for the SM90 sparse GMMA 64x168x64 SS atom: e4m3 A, e5m2 B, float C/D.
+  using ValTypeD = float;  // element type of the D output
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A: e4m3 in sparse_elem<2> (presumably 2:1 compressed; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t in sparse_elem<8> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;  // B: plain e5m2, not wrapped in sparse_elem
+  using ValTypeC = float;  // element type of the C input
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_168,_64>;  // atom extents M=64, N=168, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout from ABLayout with M=64, K=64
+  using ELayout = GMMA::ELayout_64x64;  // E layout for the 64x64 tile
+  using BLayout = GMMA::ABLayout<168, 64>;  // B layout from ABLayout with N=168, K=64
+  using CLayout = GMMA::CLayout_64x168;  // C layout for the 64x168 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{ // MMA_Traits for the SM90 sparse GMMA 64x168x64 RS atom: e4m3 A, e5m2 B, float C/D.
+  using ValTypeD = float;  // element type of the D output
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A: e4m3 in sparse_elem<2> (presumably 2:1 compressed; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t in sparse_elem<8> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;  // B: plain e5m2, not wrapped in sparse_elem
+  using ValTypeC = float;  // element type of the C input
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor; no FrgTypeA is declared (presumably A comes from registers; confirm)
+
+  using Shape_MNK = Shape<_64,_168,_64>;  // atom extents M=64, N=168, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout for the 64x64 tile (not ABLayout)
+  using ELayout = GMMA::ELayout_64x64;  // E layout for the 64x64 tile
+  using BLayout = GMMA::ABLayout<168, 64>;  // B layout from ABLayout with N=168, K=64
+  using CLayout = GMMA::CLayout_64x168;  // C layout for the 64x168 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{ // MMA_Traits for the SM90 sparse GMMA 64x176x64 SS atom: e4m3 A, e5m2 B, half_t C/D.
+  using ValTypeD = half_t;  // element type of the D output
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A: e4m3 in sparse_elem<2> (presumably 2:1 compressed; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t in sparse_elem<8> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;  // B: plain e5m2, not wrapped in sparse_elem
+  using ValTypeC = half_t;  // element type of the C input
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_176,_64>;  // atom extents M=64, N=176, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout from ABLayout with M=64, K=64
+  using ELayout = GMMA::ELayout_64x64;  // E layout for the 64x64 tile
+  using BLayout = GMMA::ABLayout<176, 64>;  // B layout from ABLayout with N=176, K=64
+  using CLayout = GMMA::CLayout_64x176;  // C layout for the 64x176 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{ // MMA_Traits for the SM90 sparse GMMA 64x176x64 RS atom: e4m3 A, e5m2 B, half_t C/D.
+  using ValTypeD = half_t;  // element type of the D output
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A: e4m3 in sparse_elem<2> (presumably 2:1 compressed; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t in sparse_elem<8> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;  // B: plain e5m2, not wrapped in sparse_elem
+  using ValTypeC = half_t;  // element type of the C input
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor; no FrgTypeA is declared (presumably A comes from registers; confirm)
+
+  using Shape_MNK = Shape<_64,_176,_64>;  // atom extents M=64, N=176, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout for the 64x64 tile (not ABLayout)
+  using ELayout = GMMA::ELayout_64x64;  // E layout for the 64x64 tile
+  using BLayout = GMMA::ABLayout<176, 64>;  // B layout from ABLayout with N=176, K=64
+  using CLayout = GMMA::CLayout_64x176;  // C layout for the 64x176 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{ // MMA_Traits for the SM90 sparse GMMA 64x176x64 SS atom: e4m3 A, e5m2 B, float C/D.
+  using ValTypeD = float;  // element type of the D output
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A: e4m3 in sparse_elem<2> (presumably 2:1 compressed; confirm)
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t in sparse_elem<8> (presumably sparsity metadata; confirm)
+  using ValTypeB = float_e5m2_t;  // B: plain e5m2, not wrapped in sparse_elem
+  using ValTypeC = float;  // element type of the C input
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_176,_64>;  // atom extents M=64, N=176, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids (presumably one warpgroup; confirm)
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout from ABLayout with M=64, K=64
+  using ELayout = GMMA::ELayout_64x64;  // E layout for the 64x64 tile
+  using BLayout = GMMA::ABLayout<176, 64>;  // B layout from ABLayout with N=176, K=64
+  using CLayout = GMMA::CLayout_64x176;  // C layout for the 64x176 (M x N) tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse 64x176x64 RS op: only B declares a smem_desc fragment type; no FrgTypeA is declared here.
+  using ValTypeD = float;  // value type of D
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...> (presumably sparsity metadata -- TODO confirm)
+  using ValTypeB = float_e5m2_t;  // value type of B
+  using ValTypeC = float;  // value type of C, same as ValTypeD
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_176,_64>;  // M=64, N=176, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;  // A uses ALayout_64x64 rather than an ABLayout
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<176, 64>;
+  using CLayout = GMMA::CLayout_64x176;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance ScaleOut setting, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse 64x184x64 SS op: both A and B declare smem_desc fragment types.
+  using ValTypeD = half_t;  // value type of D
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...> (presumably sparsity metadata -- TODO confirm)
+  using ValTypeB = float_e5m2_t;  // value type of B
+  using ValTypeC = half_t;  // value type of C, same as ValTypeD
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: smem_desc with Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_184,_64>;  // M=64, N=184, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<184, 64>;
+  using CLayout = GMMA::CLayout_64x184;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance ScaleOut setting, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse 64x184x64 RS op: only B declares a smem_desc fragment type; no FrgTypeA is declared here.
+  using ValTypeD = half_t;  // value type of D
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...> (presumably sparsity metadata -- TODO confirm)
+  using ValTypeB = float_e5m2_t;  // value type of B
+  using ValTypeC = half_t;  // value type of C, same as ValTypeD
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_184,_64>;  // M=64, N=184, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;  // A uses ALayout_64x64 rather than an ABLayout
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<184, 64>;
+  using CLayout = GMMA::CLayout_64x184;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance ScaleOut setting, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse 64x184x64 SS op: both A and B declare smem_desc fragment types.
+  using ValTypeD = float;  // value type of D
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...> (presumably sparsity metadata -- TODO confirm)
+  using ValTypeB = float_e5m2_t;  // value type of B
+  using ValTypeC = float;  // value type of C, same as ValTypeD
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: smem_desc with Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_184,_64>;  // M=64, N=184, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<184, 64>;
+  using CLayout = GMMA::CLayout_64x184;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance ScaleOut setting, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse 64x184x64 RS op: only B declares a smem_desc fragment type; no FrgTypeA is declared here.
+  using ValTypeD = float;  // value type of D
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...> (presumably sparsity metadata -- TODO confirm)
+  using ValTypeB = float_e5m2_t;  // value type of B
+  using ValTypeC = float;  // value type of C, same as ValTypeD
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_184,_64>;  // M=64, N=184, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;  // A uses ALayout_64x64 rather than an ABLayout
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<184, 64>;
+  using CLayout = GMMA::CLayout_64x184;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance ScaleOut setting, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse 64x200x64 SS op: both A and B declare smem_desc fragment types.
+  using ValTypeD = half_t;  // value type of D
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...> (presumably sparsity metadata -- TODO confirm)
+  using ValTypeB = float_e5m2_t;  // value type of B
+  using ValTypeC = half_t;  // value type of C, same as ValTypeD
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: smem_desc with Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_200,_64>;  // M=64, N=200, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<200, 64>;
+  using CLayout = GMMA::CLayout_64x200;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance ScaleOut setting, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse 64x200x64 RS op: only B declares a smem_desc fragment type; no FrgTypeA is declared here.
+  using ValTypeD = half_t;  // value type of D
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...> (presumably sparsity metadata -- TODO confirm)
+  using ValTypeB = float_e5m2_t;  // value type of B
+  using ValTypeC = half_t;  // value type of C, same as ValTypeD
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_200,_64>;  // M=64, N=200, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;  // A uses ALayout_64x64 rather than an ABLayout
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<200, 64>;
+  using CLayout = GMMA::CLayout_64x200;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance ScaleOut setting, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse 64x200x64 SS op: both A and B declare smem_desc fragment types.
+  using ValTypeD = float;  // value type of D
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...> (presumably sparsity metadata -- TODO confirm)
+  using ValTypeB = float_e5m2_t;  // value type of B
+  using ValTypeC = float;  // value type of C, same as ValTypeD
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: smem_desc with Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_200,_64>;  // M=64, N=200, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<200, 64>;
+  using CLayout = GMMA::CLayout_64x200;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance ScaleOut setting, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse 64x200x64 RS op: only B declares a smem_desc fragment type; no FrgTypeA is declared here.
+  using ValTypeD = float;  // value type of D
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...> (presumably sparsity metadata -- TODO confirm)
+  using ValTypeB = float_e5m2_t;  // value type of B
+  using ValTypeC = float;  // value type of C, same as ValTypeD
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_200,_64>;  // M=64, N=200, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;  // A uses ALayout_64x64 rather than an ABLayout
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<200, 64>;
+  using CLayout = GMMA::CLayout_64x200;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance ScaleOut setting, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse 64x208x64 SS op: both A and B declare smem_desc fragment types.
+  using ValTypeD = half_t;  // value type of D
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...> (presumably sparsity metadata -- TODO confirm)
+  using ValTypeB = float_e5m2_t;  // value type of B
+  using ValTypeC = half_t;  // value type of C, same as ValTypeD
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: smem_desc with Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_208,_64>;  // M=64, N=208, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<208, 64>;
+  using CLayout = GMMA::CLayout_64x208;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance ScaleOut setting, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse 64x208x64 RS op: only B declares a smem_desc fragment type; no FrgTypeA is declared here.
+  using ValTypeD = half_t;  // value type of D
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...> (presumably sparsity metadata -- TODO confirm)
+  using ValTypeB = float_e5m2_t;  // value type of B
+  using ValTypeC = half_t;  // value type of C, same as ValTypeD
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_208,_64>;  // M=64, N=208, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;  // A uses ALayout_64x64 rather than an ABLayout
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<208, 64>;
+  using CLayout = GMMA::CLayout_64x208;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance ScaleOut setting, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse 64x208x64 SS op: both A and B declare smem_desc fragment types.
+  using ValTypeD = float;  // value type of D
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...> (presumably sparsity metadata -- TODO confirm)
+  using ValTypeB = float_e5m2_t;  // value type of B
+  using ValTypeC = float;  // value type of C, same as ValTypeD
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: smem_desc with Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_208,_64>;  // M=64, N=208, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<208, 64>;
+  using CLayout = GMMA::CLayout_64x208;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance ScaleOut setting, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse 64x208x64 RS op: only B declares a smem_desc fragment type; no FrgTypeA is declared here.
+  using ValTypeD = float;  // value type of D
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...> (presumably sparsity metadata -- TODO confirm)
+  using ValTypeB = float_e5m2_t;  // value type of B
+  using ValTypeC = float;  // value type of C, same as ValTypeD
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_208,_64>;  // M=64, N=208, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;  // A uses ALayout_64x64 rather than an ABLayout
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<208, 64>;
+  using CLayout = GMMA::CLayout_64x208;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance ScaleOut setting, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse 64x216x64 SS op: both A and B declare smem_desc fragment types.
+  using ValTypeD = half_t;  // value type of D
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...> (presumably sparsity metadata -- TODO confirm)
+  using ValTypeB = float_e5m2_t;  // value type of B
+  using ValTypeC = half_t;  // value type of C, same as ValTypeD
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: smem_desc with Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_216,_64>;  // M=64, N=216, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<216, 64>;
+  using CLayout = GMMA::CLayout_64x216;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance ScaleOut setting, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse 64x216x64 RS op: only B declares a smem_desc fragment type; no FrgTypeA is declared here.
+  using ValTypeD = half_t;  // value type of D
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...> (presumably sparsity metadata -- TODO confirm)
+  using ValTypeB = float_e5m2_t;  // value type of B
+  using ValTypeC = half_t;  // value type of C, same as ValTypeD
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_216,_64>;  // M=64, N=216, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;  // A uses ALayout_64x64 rather than an ABLayout
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<216, 64>;
+  using CLayout = GMMA::CLayout_64x216;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance ScaleOut setting, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse 64x216x64 SS op: both A and B declare smem_desc fragment types.
+  using ValTypeD = float;  // value type of D
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...> (presumably sparsity metadata -- TODO confirm)
+  using ValTypeB = float_e5m2_t;  // value type of B
+  using ValTypeC = float;  // value type of C, same as ValTypeD
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: smem_desc with Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_216,_64>;  // M=64, N=216, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<216, 64>;
+  using CLayout = GMMA::CLayout_64x216;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance ScaleOut setting, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse 64x216x64 RS op: only B declares a smem_desc fragment type; no FrgTypeA is declared here.
+  using ValTypeD = float;  // value type of D
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...> (presumably sparsity metadata -- TODO confirm)
+  using ValTypeB = float_e5m2_t;  // value type of B
+  using ValTypeC = float;  // value type of C, same as ValTypeD
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_216,_64>;  // M=64, N=216, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;  // A uses ALayout_64x64 rather than an ABLayout
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<216, 64>;
+  using CLayout = GMMA::CLayout_64x216;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance ScaleOut setting, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse 64x224x64 SS op: both A and B declare smem_desc fragment types.
+  using ValTypeD = half_t;  // value type of D
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...> (presumably sparsity metadata -- TODO confirm)
+  using ValTypeB = float_e5m2_t;  // value type of B
+  using ValTypeC = half_t;  // value type of C, same as ValTypeD
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: smem_desc with Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_224,_64>;  // M=64, N=224, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<224, 64>;
+  using CLayout = GMMA::CLayout_64x224;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance ScaleOut setting, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse 64x224x64 RS op: only B declares a smem_desc fragment type; no FrgTypeA is declared here.
+  using ValTypeD = half_t;  // value type of D
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...> (presumably sparsity metadata -- TODO confirm)
+  using ValTypeB = float_e5m2_t;  // value type of B
+  using ValTypeC = half_t;  // value type of C, same as ValTypeD
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_224,_64>;  // M=64, N=224, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;  // A uses ALayout_64x64 rather than an ABLayout
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<224, 64>;
+  using CLayout = GMMA::CLayout_64x224;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance ScaleOut setting, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse 64x224x64 SS op: both A and B declare smem_desc fragment types.
+  using ValTypeD = float;  // value type of D
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...> (presumably sparsity metadata -- TODO confirm)
+  using ValTypeB = float_e5m2_t;  // value type of B
+  using ValTypeC = float;  // value type of C, same as ValTypeD
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: smem_desc with Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_224,_64>;  // M=64, N=224, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<224, 64>;
+  using CLayout = GMMA::CLayout_64x224;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance ScaleOut setting, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse 64x224x64 RS op: only B declares a smem_desc fragment type; no FrgTypeA is declared here.
+  using ValTypeD = float;  // value type of D
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...> (presumably sparsity metadata -- TODO confirm)
+  using ValTypeB = float_e5m2_t;  // value type of B
+  using ValTypeC = float;  // value type of C, same as ValTypeD
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_224,_64>;  // M=64, N=224, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;  // A uses ALayout_64x64 rather than an ABLayout
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<224, 64>;
+  using CLayout = GMMA::CLayout_64x224;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance ScaleOut setting, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse 64x232x64 SS op: both A and B declare smem_desc fragment types.
+  using ValTypeD = half_t;  // value type of D
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...> (presumably sparsity metadata -- TODO confirm)
+  using ValTypeB = float_e5m2_t;  // value type of B
+  using ValTypeC = half_t;  // value type of C, same as ValTypeD
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: smem_desc with Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_232,_64>;  // M=64, N=232, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<232, 64>;
+  using CLayout = GMMA::CLayout_64x232;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance ScaleOut setting, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse 64x232x64 RS op: only B declares a smem_desc fragment type; no FrgTypeA is declared here.
+  using ValTypeD = half_t;  // value type of D
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...> (presumably sparsity metadata -- TODO confirm)
+  using ValTypeB = float_e5m2_t;  // value type of B
+  using ValTypeC = half_t;  // value type of C, same as ValTypeD
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_232,_64>;  // M=64, N=232, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;  // A uses ALayout_64x64 rather than an ABLayout
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<232, 64>;
+  using CLayout = GMMA::CLayout_64x232;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance ScaleOut setting, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse 64x232x64 SS op: both A and B declare smem_desc fragment types.
+  using ValTypeD = float;  // value type of D
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...> (presumably sparsity metadata -- TODO confirm)
+  using ValTypeB = float_e5m2_t;  // value type of B
+  using ValTypeC = float;  // value type of C, same as ValTypeD
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: smem_desc with Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_232,_64>;  // M=64, N=232, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<232, 64>;
+  using CLayout = GMMA::CLayout_64x232;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance ScaleOut setting, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse 64x232x64 RS op: only B declares a smem_desc fragment type; no FrgTypeA is declared here.
+  using ValTypeD = float;  // value type of D
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...> (presumably sparsity metadata -- TODO confirm)
+  using ValTypeB = float_e5m2_t;  // value type of B
+  using ValTypeC = float;  // value type of C, same as ValTypeD
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_232,_64>;  // M=64, N=232, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;  // A uses ALayout_64x64 rather than an ABLayout
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<232, 64>;
+  using CLayout = GMMA::CLayout_64x232;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance ScaleOut setting, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse 64x240x64 SS op: both A and B declare smem_desc fragment types.
+  using ValTypeD = half_t;  // value type of D
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...> (presumably sparsity metadata -- TODO confirm)
+  using ValTypeB = float_e5m2_t;  // value type of B
+  using ValTypeC = half_t;  // value type of C, same as ValTypeD
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: smem_desc with Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_240,_64>;  // M=64, N=240, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<240, 64>;
+  using CLayout = GMMA::CLayout_64x240;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance ScaleOut setting, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse 64x240x64 RS op: only B declares a smem_desc fragment type; no FrgTypeA is declared here.
+  using ValTypeD = half_t;  // value type of D
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...> (presumably sparsity metadata -- TODO confirm)
+  using ValTypeB = float_e5m2_t;  // value type of B
+  using ValTypeC = half_t;  // value type of C, same as ValTypeD
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_240,_64>;  // M=64, N=240, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;  // A uses ALayout_64x64 rather than an ABLayout
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<240, 64>;
+  using CLayout = GMMA::CLayout_64x240;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance ScaleOut setting, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse 64x240x64 SS op: both A and B declare smem_desc fragment types.
+  using ValTypeD = float;  // value type of D
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...> (presumably sparsity metadata -- TODO confirm)
+  using ValTypeB = float_e5m2_t;  // value type of B
+  using ValTypeC = float;  // value type of C, same as ValTypeD
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: smem_desc with Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_240,_64>;  // M=64, N=240, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<240, 64>;
+  using CLayout = GMMA::CLayout_64x240;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance ScaleOut setting, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse 64x240x64 RS op: only B declares a smem_desc fragment type; no FrgTypeA is declared here.
+  using ValTypeD = float;  // value type of D
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...> (presumably sparsity metadata -- TODO confirm)
+  using ValTypeB = float_e5m2_t;  // value type of B
+  using ValTypeC = float;  // value type of C, same as ValTypeD
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_240,_64>;  // M=64, N=240, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;  // A uses ALayout_64x64 rather than an ABLayout
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<240, 64>;
+  using CLayout = GMMA::CLayout_64x240;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance ScaleOut setting, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse 64x248x64 SS op: both A and B declare smem_desc fragment types.
+  using ValTypeD = half_t;  // value type of D
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...> (presumably sparsity metadata -- TODO confirm)
+  using ValTypeB = float_e5m2_t;  // value type of B
+  using ValTypeC = half_t;  // value type of C, same as ValTypeD
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: smem_desc with Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_248,_64>;  // M=64, N=248, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<248, 64>;
+  using CLayout = GMMA::CLayout_64x248;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance ScaleOut setting, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse 64x248x64 RS op: only B declares a smem_desc fragment type; no FrgTypeA is declared here.
+  using ValTypeD = half_t;  // value type of D
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...> (presumably sparsity metadata -- TODO confirm)
+  using ValTypeB = float_e5m2_t;  // value type of B
+  using ValTypeC = half_t;  // value type of C, same as ValTypeD
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_248,_64>;  // M=64, N=248, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;  // A uses ALayout_64x64 rather than an ABLayout
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<248, 64>;
+  using CLayout = GMMA::CLayout_64x248;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance ScaleOut setting, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse 64x248x64 SS op: both A and B declare smem_desc fragment types.
+  using ValTypeD = float;  // value type of D
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...> (presumably sparsity metadata -- TODO confirm)
+  using ValTypeB = float_e5m2_t;  // value type of B
+  using ValTypeC = float;  // value type of C, same as ValTypeD
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: smem_desc with Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_248,_64>;  // M=64, N=248, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<248, 64>;
+  using CLayout = GMMA::CLayout_64x248;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance ScaleOut setting, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse 64x248x64 RS op: only B declares a smem_desc fragment type; no FrgTypeA is declared here.
+  using ValTypeD = float;  // value type of D
+  using ValTypeA = sparse_elem<2, float_e4m3_t>;  // A values: float_e4m3_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...> (presumably sparsity metadata -- TODO confirm)
+  using ValTypeB = float_e5m2_t;  // value type of B
+  using ValTypeC = float;  // value type of C, same as ValTypeD
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_248,_64>;  // M=64, N=248, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;  // A uses ALayout_64x64 rather than an ABLayout
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<248, 64>;
+  using CLayout = GMMA::CLayout_64x248;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance ScaleOut setting, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse 64x24x64 SS op: both A and B declare smem_desc fragment types.
+  using ValTypeD = half_t;  // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A values: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...> (presumably sparsity metadata -- TODO confirm)
+  using ValTypeB = float_e4m3_t;  // value type of B
+  using ValTypeC = half_t;  // value type of C, same as ValTypeD
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: smem_desc with Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_24,_64>;  // M=64, N=24, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 24, 64>;
+  using CLayout = GMMA::CLayout_64x24;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance ScaleOut setting, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse 64x24x64 RS op: only B declares a smem_desc fragment type; no FrgTypeA is declared here.
+  using ValTypeD = half_t;  // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A values: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...> (presumably sparsity metadata -- TODO confirm)
+  using ValTypeB = float_e4m3_t;  // value type of B
+  using ValTypeC = half_t;  // value type of C, same as ValTypeD
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_24,_64>;  // M=64, N=24, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;  // A uses ALayout_64x64 rather than an ABLayout
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 24, 64>;
+  using CLayout = GMMA::CLayout_64x24;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance ScaleOut setting, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse 64x24x64 SS op: both A and B declare smem_desc fragment types.
+  using ValTypeD = float;  // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A values: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...> (presumably sparsity metadata -- TODO confirm)
+  using ValTypeB = float_e4m3_t;  // value type of B
+  using ValTypeC = float;  // value type of C, same as ValTypeD
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: smem_desc with Major::K
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_24,_64>;  // M=64, N=24, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 24, 64>;
+  using CLayout = GMMA::CLayout_64x24;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance ScaleOut setting, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{  // Traits for the sparse 64x24x64 RS op: only B declares a smem_desc fragment type; no FrgTypeA is declared here.
+  using ValTypeD = float;  // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A values: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E values: uint8_t wrapped in sparse_elem<8, ...> (presumably sparsity metadata -- TODO confirm)
+  using ValTypeB = float_e4m3_t;  // value type of B
+  using ValTypeC = float;  // value type of C, same as ValTypeD
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: smem_desc with Major::K
+
+  using Shape_MNK = Shape<_64,_24,_64>;  // M=64, N=24, K=64
+  using ThrID   = Layout<_128>;  // 128 thread ids
+  using ALayout = GMMA::ALayout_64x64;  // A uses ALayout_64x64 rather than an ABLayout
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 24, 64>;
+  using CLayout = GMMA::CLayout_64x24;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance ScaleOut setting, defaults to One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x40x64 SS atom (A: e5m2, B: e4m3, C/D: half_t); scaleA/scaleB/spsel pass through to the op type.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 inside sparse_elem<2,...>; presumably 2:1 compressed A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t inside sparse_elem<8,...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_40,_64>;  // M=64, N=40, K=64, matching the atom name
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;  // same ABLayout template as B, with extents 64x64
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 40, 64>;
+  using CLayout = GMMA::CLayout_64x40;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x40x64 RS atom (A: e5m2, B: e4m3, C/D: half_t); scaleA/scaleB/spsel pass through to the op type.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 inside sparse_elem<2,...>; presumably 2:1 compressed A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t inside sparse_elem<8,...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a descriptor fragment; no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_40,_64>;  // M=64, N=40, K=64, matching the atom name
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout rather than ABLayout< 64, 64>
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 40, 64>;
+  using CLayout = GMMA::CLayout_64x40;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x40x64 SS atom (A: e5m2, B: e4m3, C/D: float); scaleA/scaleB/spsel pass through to the op type.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 inside sparse_elem<2,...>; presumably 2:1 compressed A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t inside sparse_elem<8,...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_40,_64>;  // M=64, N=40, K=64, matching the atom name
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;  // same ABLayout template as B, with extents 64x64
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 40, 64>;
+  using CLayout = GMMA::CLayout_64x40;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x40x64 RS atom (A: e5m2, B: e4m3, C/D: float); scaleA/scaleB/spsel pass through to the op type.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 inside sparse_elem<2,...>; presumably 2:1 compressed A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t inside sparse_elem<8,...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a descriptor fragment; no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_40,_64>;  // M=64, N=40, K=64, matching the atom name
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout rather than ABLayout< 64, 64>
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 40, 64>;
+  using CLayout = GMMA::CLayout_64x40;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x48x64 SS atom (A: e5m2, B: e4m3, C/D: half_t); scaleA/scaleB/spsel pass through to the op type.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 inside sparse_elem<2,...>; presumably 2:1 compressed A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t inside sparse_elem<8,...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_48,_64>;  // M=64, N=48, K=64, matching the atom name
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;  // same ABLayout template as B, with extents 64x64
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 48, 64>;
+  using CLayout = GMMA::CLayout_64x48;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x48x64 RS atom (A: e5m2, B: e4m3, C/D: half_t); scaleA/scaleB/spsel pass through to the op type.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 inside sparse_elem<2,...>; presumably 2:1 compressed A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t inside sparse_elem<8,...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a descriptor fragment; no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_48,_64>;  // M=64, N=48, K=64, matching the atom name
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout rather than ABLayout< 64, 64>
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 48, 64>;
+  using CLayout = GMMA::CLayout_64x48;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x48x64 SS atom (A: e5m2, B: e4m3, C/D: float); scaleA/scaleB/spsel pass through to the op type.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 inside sparse_elem<2,...>; presumably 2:1 compressed A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t inside sparse_elem<8,...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_48,_64>;  // M=64, N=48, K=64, matching the atom name
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;  // same ABLayout template as B, with extents 64x64
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 48, 64>;
+  using CLayout = GMMA::CLayout_64x48;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x48x64 RS atom (A: e5m2, B: e4m3, C/D: float); scaleA/scaleB/spsel pass through to the op type.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 inside sparse_elem<2,...>; presumably 2:1 compressed A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t inside sparse_elem<8,...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a descriptor fragment; no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_48,_64>;  // M=64, N=48, K=64, matching the atom name
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout rather than ABLayout< 64, 64>
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 48, 64>;
+  using CLayout = GMMA::CLayout_64x48;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x56x64 SS atom (A: e5m2, B: e4m3, C/D: half_t); scaleA/scaleB/spsel pass through to the op type.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 inside sparse_elem<2,...>; presumably 2:1 compressed A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t inside sparse_elem<8,...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_56,_64>;  // M=64, N=56, K=64, matching the atom name
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;  // same ABLayout template as B, with extents 64x64
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 56, 64>;
+  using CLayout = GMMA::CLayout_64x56;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x56x64 RS atom (A: e5m2, B: e4m3, C/D: half_t); scaleA/scaleB/spsel pass through to the op type.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 inside sparse_elem<2,...>; presumably 2:1 compressed A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t inside sparse_elem<8,...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a descriptor fragment; no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_56,_64>;  // M=64, N=56, K=64, matching the atom name
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout rather than ABLayout< 64, 64>
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 56, 64>;
+  using CLayout = GMMA::CLayout_64x56;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x56x64 SS atom (A: e5m2, B: e4m3, C/D: float); scaleA/scaleB/spsel pass through to the op type.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 inside sparse_elem<2,...>; presumably 2:1 compressed A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t inside sparse_elem<8,...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_56,_64>;  // M=64, N=56, K=64, matching the atom name
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;  // same ABLayout template as B, with extents 64x64
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 56, 64>;
+  using CLayout = GMMA::CLayout_64x56;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x56x64 RS atom (A: e5m2, B: e4m3, C/D: float); scaleA/scaleB/spsel pass through to the op type.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 inside sparse_elem<2,...>; presumably 2:1 compressed A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t inside sparse_elem<8,...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a descriptor fragment; no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_56,_64>;  // M=64, N=56, K=64, matching the atom name
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout rather than ABLayout< 64, 64>
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 56, 64>;
+  using CLayout = GMMA::CLayout_64x56;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x72x64 SS atom (A: e5m2, B: e4m3, C/D: half_t); scaleA/scaleB/spsel pass through to the op type.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 inside sparse_elem<2,...>; presumably 2:1 compressed A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t inside sparse_elem<8,...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_72,_64>;  // M=64, N=72, K=64, matching the atom name
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;  // same ABLayout template as B, with extents 64x64
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 72, 64>;
+  using CLayout = GMMA::CLayout_64x72;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x72x64 RS atom (A: e5m2, B: e4m3, C/D: half_t); scaleA/scaleB/spsel pass through to the op type.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 inside sparse_elem<2,...>; presumably 2:1 compressed A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t inside sparse_elem<8,...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a descriptor fragment; no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_72,_64>;  // M=64, N=72, K=64, matching the atom name
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout rather than ABLayout< 64, 64>
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 72, 64>;
+  using CLayout = GMMA::CLayout_64x72;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x72x64 SS atom (A: e5m2, B: e4m3, C/D: float); scaleA/scaleB/spsel pass through to the op type.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 inside sparse_elem<2,...>; presumably 2:1 compressed A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t inside sparse_elem<8,...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_72,_64>;  // M=64, N=72, K=64, matching the atom name
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;  // same ABLayout template as B, with extents 64x64
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 72, 64>;
+  using CLayout = GMMA::CLayout_64x72;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x72x64 RS atom (A: e5m2, B: e4m3, C/D: float); scaleA/scaleB/spsel pass through to the op type.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 inside sparse_elem<2,...>; presumably 2:1 compressed A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t inside sparse_elem<8,...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a descriptor fragment; no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_72,_64>;  // M=64, N=72, K=64, matching the atom name
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout rather than ABLayout< 64, 64>
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 72, 64>;
+  using CLayout = GMMA::CLayout_64x72;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x80x64 SS atom (A: e5m2, B: e4m3, C/D: half_t); scaleA/scaleB/spsel pass through to the op type.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 inside sparse_elem<2,...>; presumably 2:1 compressed A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t inside sparse_elem<8,...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_80,_64>;  // M=64, N=80, K=64, matching the atom name
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;  // same ABLayout template as B, with extents 64x64
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 80, 64>;
+  using CLayout = GMMA::CLayout_64x80;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x80x64 RS atom (A: e5m2, B: e4m3, C/D: half_t); scaleA/scaleB/spsel pass through to the op type.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 inside sparse_elem<2,...>; presumably 2:1 compressed A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t inside sparse_elem<8,...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a descriptor fragment; no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_80,_64>;  // M=64, N=80, K=64, matching the atom name
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout rather than ABLayout< 64, 64>
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 80, 64>;
+  using CLayout = GMMA::CLayout_64x80;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x80x64 SS atom (A: e5m2, B: e4m3, C/D: float); scaleA/scaleB/spsel pass through to the op type.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 inside sparse_elem<2,...>; presumably 2:1 compressed A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t inside sparse_elem<8,...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_80,_64>;  // M=64, N=80, K=64, matching the atom name
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;  // same ABLayout template as B, with extents 64x64
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 80, 64>;
+  using CLayout = GMMA::CLayout_64x80;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x80x64 RS atom (A: e5m2, B: e4m3, C/D: float); scaleA/scaleB/spsel pass through to the op type.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 inside sparse_elem<2,...>; presumably 2:1 compressed A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t inside sparse_elem<8,...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a descriptor fragment; no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_80,_64>;  // M=64, N=80, K=64, matching the atom name
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout rather than ABLayout< 64, 64>
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 80, 64>;
+  using CLayout = GMMA::CLayout_64x80;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x88x64 SS atom (A: e5m2, B: e4m3, C/D: half_t); scaleA/scaleB/spsel pass through to the op type.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 inside sparse_elem<2,...>; presumably 2:1 compressed A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t inside sparse_elem<8,...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_88,_64>;  // M=64, N=88, K=64, matching the atom name
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;  // same ABLayout template as B, with extents 64x64
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 88, 64>;
+  using CLayout = GMMA::CLayout_64x88;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x88x64 RS atom (A: e5m2, B: e4m3, C/D: half_t); scaleA/scaleB/spsel pass through to the op type.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 inside sparse_elem<2,...>; presumably 2:1 compressed A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t inside sparse_elem<8,...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a descriptor fragment; no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_88,_64>;  // M=64, N=88, K=64, matching the atom name
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout rather than ABLayout< 64, 64>
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 88, 64>;
+  using CLayout = GMMA::CLayout_64x88;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x88x64 SS atom (A: e5m2, B: e4m3, C/D: float); scaleA/scaleB/spsel pass through to the op type.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 inside sparse_elem<2,...>; presumably 2:1 compressed A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t inside sparse_elem<8,...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_88,_64>;  // M=64, N=88, K=64, matching the atom name
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;  // same ABLayout template as B, with extents 64x64
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 88, 64>;
+  using CLayout = GMMA::CLayout_64x88;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x88x64 RS atom (A: e5m2, B: e4m3, C/D: float); scaleA/scaleB/spsel pass through to the op type.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 inside sparse_elem<2,...>; presumably 2:1 compressed A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t inside sparse_elem<8,...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a descriptor fragment; no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_88,_64>;  // M=64, N=88, K=64, matching the atom name
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout rather than ABLayout< 64, 64>
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout< 88, 64>;
+  using CLayout = GMMA::CLayout_64x88;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x104x64 SS atom (A: e5m2, B: e4m3, C/D: half_t); scaleA/scaleB/spsel pass through to the op type.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 inside sparse_elem<2,...>; presumably 2:1 compressed A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t inside sparse_elem<8,...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_104,_64>;  // M=64, N=104, K=64, matching the atom name
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;  // same ABLayout template as B, with extents 64x64
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<104, 64>;
+  using CLayout = GMMA::CLayout_64x104;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x104x64 RS atom (A: e5m2, B: e4m3, C/D: half_t); scaleA/scaleB/spsel pass through to the op type.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 inside sparse_elem<2,...>; presumably 2:1 compressed A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t inside sparse_elem<8,...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a descriptor fragment; no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_104,_64>;  // M=64, N=104, K=64, matching the atom name
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout rather than ABLayout< 64, 64>
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<104, 64>;
+  using CLayout = GMMA::CLayout_64x104;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x104x64 SS atom (A: e5m2, B: e4m3, C/D: float); scaleA/scaleB/spsel pass through to the op type.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 inside sparse_elem<2,...>; presumably 2:1 compressed A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t inside sparse_elem<8,...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_104,_64>;  // M=64, N=104, K=64, matching the atom name
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;  // same ABLayout template as B, with extents 64x64
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<104, 64>;
+  using CLayout = GMMA::CLayout_64x104;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x104x64 RS atom (A: e5m2, B: e4m3, C/D: float); scaleA/scaleB/spsel pass through to the op type.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 inside sparse_elem<2,...>; presumably 2:1 compressed A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t inside sparse_elem<8,...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a descriptor fragment; no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_104,_64>;  // M=64, N=104, K=64, matching the atom name
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout rather than ABLayout< 64, 64>
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<104, 64>;
+  using CLayout = GMMA::CLayout_64x104;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x112x64 SS atom (A: e5m2, B: e4m3, C/D: half_t); scaleA/scaleB/spsel pass through to the op type.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 inside sparse_elem<2,...>; presumably 2:1 compressed A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t inside sparse_elem<8,...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_112,_64>;  // M=64, N=112, K=64, matching the atom name
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;  // same ABLayout template as B, with extents 64x64
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<112, 64>;
+  using CLayout = GMMA::CLayout_64x112;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x112x64 RS atom (A: e5m2, B: e4m3, C/D: half_t); scaleA/scaleB/spsel pass through to the op type.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 inside sparse_elem<2,...>; presumably 2:1 compressed A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t inside sparse_elem<8,...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a descriptor fragment; no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_112,_64>;  // M=64, N=112, K=64, matching the atom name
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout rather than ABLayout< 64, 64>
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<112, 64>;
+  using CLayout = GMMA::CLayout_64x112;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x112x64 SS atom (A: e5m2, B: e4m3, C/D: float); scaleA/scaleB/spsel pass through to the op type.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 inside sparse_elem<2,...>; presumably 2:1 compressed A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t inside sparse_elem<8,...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_112,_64>;  // M=64, N=112, K=64, matching the atom name
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;  // same ABLayout template as B, with extents 64x64
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<112, 64>;
+  using CLayout = GMMA::CLayout_64x112;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x112x64 RS atom (A: e5m2, B: e4m3, C/D: float); scaleA/scaleB/spsel pass through to the op type.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 inside sparse_elem<2,...>; presumably 2:1 compressed A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t inside sparse_elem<8,...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a descriptor fragment; no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_112,_64>;  // M=64, N=112, K=64, matching the atom name
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout rather than ABLayout< 64, 64>
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<112, 64>;
+  using CLayout = GMMA::CLayout_64x112;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x120x64 SS atom (A: e5m2, B: e4m3, C/D: half_t); scaleA/scaleB/spsel pass through to the op type.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 inside sparse_elem<2,...>; presumably 2:1 compressed A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t inside sparse_elem<8,...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_120,_64>;  // M=64, N=120, K=64, matching the atom name
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;  // same ABLayout template as B, with extents 64x64
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<120, 64>;
+  using CLayout = GMMA::CLayout_64x120;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x120x64 RS atom (A: e5m2, B: e4m3, C/D: half_t); scaleA/scaleB/spsel pass through to the op type.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 inside sparse_elem<2,...>; presumably 2:1 compressed A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t inside sparse_elem<8,...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a descriptor fragment; no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_120,_64>;  // M=64, N=120, K=64, matching the atom name
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout rather than ABLayout< 64, 64>
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<120, 64>;
+  using CLayout = GMMA::CLayout_64x120;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x120x64 SS atom (A: e5m2, B: e4m3, C/D: float); scaleA/scaleB/spsel pass through to the op type.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 inside sparse_elem<2,...>; presumably 2:1 compressed A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t inside sparse_elem<8,...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_120,_64>;  // M=64, N=120, K=64, matching the atom name
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;  // same ABLayout template as B, with extents 64x64
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<120, 64>;
+  using CLayout = GMMA::CLayout_64x120;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x120x64 RS atom (A: e5m2, B: e4m3, C/D: float); scaleA/scaleB/spsel pass through to the op type.
+  using ValTypeD = float;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 inside sparse_elem<2,...>; presumably 2:1 compressed A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t inside sparse_elem<8,...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = float;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a descriptor fragment; no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_120,_64>;  // M=64, N=120, K=64, matching the atom name
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout rather than ABLayout< 64, 64>
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<120, 64>;
+  using CLayout = GMMA::CLayout_64x120;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x136x64 SS atom (A: e5m2, B: e4m3, C/D: half_t); scaleA/scaleB/spsel pass through to the op type.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 inside sparse_elem<2,...>; presumably 2:1 compressed A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t inside sparse_elem<8,...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem descriptor
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem descriptor
+
+  using Shape_MNK = Shape<_64,_136,_64>;  // M=64, N=136, K=64, matching the atom name
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;  // same ABLayout template as B, with extents 64x64
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<136, 64>;
+  using CLayout = GMMA::CLayout_64x136;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse 64x136x64 RS atom (A: e5m2, B: e4m3, C/D: half_t); scaleA/scaleB/spsel pass through to the op type.
+  using ValTypeD = half_t;
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 inside sparse_elem<2,...>; presumably 2:1 compressed A -- confirm
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t inside sparse_elem<8,...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;
+  using ValTypeC = half_t;
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // only B declares a descriptor fragment; no FrgTypeA here
+
+  using Shape_MNK = Shape<_64,_136,_64>;  // M=64, N=136, K=64, matching the atom name
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;  // dedicated A layout rather than ABLayout< 64, 64>
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<136, 64>;
+  using CLayout = GMMA::CLayout_64x136;
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // per-instance member, defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // scaleA, scaleB, spsel are forwarded to the atom type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>  // traits for the sparse 64x136x64 SS_TN atom: e5m2 A, e4m3 B, float C/D
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;  // B: plain float_e4m3_t, no sparse_elem wrapper
+  using ValTypeC = float;  // C element type, same as ValTypeD
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_136,_64>;  // M=64, N=136, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout: ABLayout with extents 64, 64
+  using ELayout = GMMA::ELayout_64x64;  // layout for the E operand
+  using BLayout = GMMA::ABLayout<136, 64>;  // B layout: ABLayout with extents 136, 64
+  using CLayout = GMMA::CLayout_64x136;  // C layout for the 64x136 tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // scaleA, scaleB, spsel are forwarded to the atom type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>  // traits for the sparse 64x136x64 RS_TN atom: e5m2 A, e4m3 B, float C/D
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;  // B: plain float_e4m3_t, no sparse_elem wrapper
+  using ValTypeC = float;  // C element type, same as ValTypeD
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc; no FrgTypeA is declared in this specialization
+
+  using Shape_MNK = Shape<_64,_136,_64>;  // M=64, N=136, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x64;  // A layout: dedicated ALayout_64x64 rather than ABLayout
+  using ELayout = GMMA::ELayout_64x64;  // layout for the E operand
+  using BLayout = GMMA::ABLayout<136, 64>;  // B layout: ABLayout with extents 136, 64
+  using CLayout = GMMA::CLayout_64x136;  // C layout for the 64x136 tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // scaleA, scaleB, spsel are forwarded to the atom type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>  // traits for the sparse 64x144x64 SS_TN atom: e5m2 A, e4m3 B, half_t C/D
+{
+  using ValTypeD = half_t;  // D element type
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;  // B: plain float_e4m3_t, no sparse_elem wrapper
+  using ValTypeC = half_t;  // C element type, same as ValTypeD
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_144,_64>;  // M=64, N=144, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout: ABLayout with extents 64, 64
+  using ELayout = GMMA::ELayout_64x64;  // layout for the E operand
+  using BLayout = GMMA::ABLayout<144, 64>;  // B layout: ABLayout with extents 144, 64
+  using CLayout = GMMA::CLayout_64x144;  // C layout for the 64x144 tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // scaleA, scaleB, spsel are forwarded to the atom type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>  // traits for the sparse 64x144x64 RS_TN atom: e5m2 A, e4m3 B, half_t C/D
+{
+  using ValTypeD = half_t;  // D element type
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;  // B: plain float_e4m3_t, no sparse_elem wrapper
+  using ValTypeC = half_t;  // C element type, same as ValTypeD
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc; no FrgTypeA is declared in this specialization
+
+  using Shape_MNK = Shape<_64,_144,_64>;  // M=64, N=144, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x64;  // A layout: dedicated ALayout_64x64 rather than ABLayout
+  using ELayout = GMMA::ELayout_64x64;  // layout for the E operand
+  using BLayout = GMMA::ABLayout<144, 64>;  // B layout: ABLayout with extents 144, 64
+  using CLayout = GMMA::CLayout_64x144;  // C layout for the 64x144 tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // scaleA, scaleB, spsel are forwarded to the atom type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>  // traits for the sparse 64x144x64 SS_TN atom: e5m2 A, e4m3 B, float C/D
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;  // B: plain float_e4m3_t, no sparse_elem wrapper
+  using ValTypeC = float;  // C element type, same as ValTypeD
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_144,_64>;  // M=64, N=144, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout: ABLayout with extents 64, 64
+  using ELayout = GMMA::ELayout_64x64;  // layout for the E operand
+  using BLayout = GMMA::ABLayout<144, 64>;  // B layout: ABLayout with extents 144, 64
+  using CLayout = GMMA::CLayout_64x144;  // C layout for the 64x144 tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // scaleA, scaleB, spsel are forwarded to the atom type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>  // traits for the sparse 64x144x64 RS_TN atom: e5m2 A, e4m3 B, float C/D
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;  // B: plain float_e4m3_t, no sparse_elem wrapper
+  using ValTypeC = float;  // C element type, same as ValTypeD
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc; no FrgTypeA is declared in this specialization
+
+  using Shape_MNK = Shape<_64,_144,_64>;  // M=64, N=144, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x64;  // A layout: dedicated ALayout_64x64 rather than ABLayout
+  using ELayout = GMMA::ELayout_64x64;  // layout for the E operand
+  using BLayout = GMMA::ABLayout<144, 64>;  // B layout: ABLayout with extents 144, 64
+  using CLayout = GMMA::CLayout_64x144;  // C layout for the 64x144 tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // scaleA, scaleB, spsel are forwarded to the atom type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>  // traits for the sparse 64x152x64 SS_TN atom: e5m2 A, e4m3 B, half_t C/D
+{
+  using ValTypeD = half_t;  // D element type
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;  // B: plain float_e4m3_t, no sparse_elem wrapper
+  using ValTypeC = half_t;  // C element type, same as ValTypeD
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_152,_64>;  // M=64, N=152, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout: ABLayout with extents 64, 64
+  using ELayout = GMMA::ELayout_64x64;  // layout for the E operand
+  using BLayout = GMMA::ABLayout<152, 64>;  // B layout: ABLayout with extents 152, 64
+  using CLayout = GMMA::CLayout_64x152;  // C layout for the 64x152 tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // scaleA, scaleB, spsel are forwarded to the atom type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>  // traits for the sparse 64x152x64 RS_TN atom: e5m2 A, e4m3 B, half_t C/D
+{
+  using ValTypeD = half_t;  // D element type
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;  // B: plain float_e4m3_t, no sparse_elem wrapper
+  using ValTypeC = half_t;  // C element type, same as ValTypeD
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc; no FrgTypeA is declared in this specialization
+
+  using Shape_MNK = Shape<_64,_152,_64>;  // M=64, N=152, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x64;  // A layout: dedicated ALayout_64x64 rather than ABLayout
+  using ELayout = GMMA::ELayout_64x64;  // layout for the E operand
+  using BLayout = GMMA::ABLayout<152, 64>;  // B layout: ABLayout with extents 152, 64
+  using CLayout = GMMA::CLayout_64x152;  // C layout for the 64x152 tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // scaleA, scaleB, spsel are forwarded to the atom type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>  // traits for the sparse 64x152x64 SS_TN atom: e5m2 A, e4m3 B, float C/D
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;  // B: plain float_e4m3_t, no sparse_elem wrapper
+  using ValTypeC = float;  // C element type, same as ValTypeD
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_152,_64>;  // M=64, N=152, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout: ABLayout with extents 64, 64
+  using ELayout = GMMA::ELayout_64x64;  // layout for the E operand
+  using BLayout = GMMA::ABLayout<152, 64>;  // B layout: ABLayout with extents 152, 64
+  using CLayout = GMMA::CLayout_64x152;  // C layout for the 64x152 tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // scaleA, scaleB, spsel are forwarded to the atom type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>  // traits for the sparse 64x152x64 RS_TN atom: e5m2 A, e4m3 B, float C/D
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;  // B: plain float_e4m3_t, no sparse_elem wrapper
+  using ValTypeC = float;  // C element type, same as ValTypeD
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc; no FrgTypeA is declared in this specialization
+
+  using Shape_MNK = Shape<_64,_152,_64>;  // M=64, N=152, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x64;  // A layout: dedicated ALayout_64x64 rather than ABLayout
+  using ELayout = GMMA::ELayout_64x64;  // layout for the E operand
+  using BLayout = GMMA::ABLayout<152, 64>;  // B layout: ABLayout with extents 152, 64
+  using CLayout = GMMA::CLayout_64x152;  // C layout for the 64x152 tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // scaleA, scaleB, spsel are forwarded to the atom type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>  // traits for the sparse 64x160x64 SS_TN atom: e5m2 A, e4m3 B, half_t C/D
+{
+  using ValTypeD = half_t;  // D element type
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;  // B: plain float_e4m3_t, no sparse_elem wrapper
+  using ValTypeC = half_t;  // C element type, same as ValTypeD
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_160,_64>;  // M=64, N=160, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout: ABLayout with extents 64, 64
+  using ELayout = GMMA::ELayout_64x64;  // layout for the E operand
+  using BLayout = GMMA::ABLayout<160, 64>;  // B layout: ABLayout with extents 160, 64
+  using CLayout = GMMA::CLayout_64x160;  // C layout for the 64x160 tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // scaleA, scaleB, spsel are forwarded to the atom type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>  // traits for the sparse 64x160x64 RS_TN atom: e5m2 A, e4m3 B, half_t C/D
+{
+  using ValTypeD = half_t;  // D element type
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;  // B: plain float_e4m3_t, no sparse_elem wrapper
+  using ValTypeC = half_t;  // C element type, same as ValTypeD
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc; no FrgTypeA is declared in this specialization
+
+  using Shape_MNK = Shape<_64,_160,_64>;  // M=64, N=160, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x64;  // A layout: dedicated ALayout_64x64 rather than ABLayout
+  using ELayout = GMMA::ELayout_64x64;  // layout for the E operand
+  using BLayout = GMMA::ABLayout<160, 64>;  // B layout: ABLayout with extents 160, 64
+  using CLayout = GMMA::CLayout_64x160;  // C layout for the 64x160 tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // scaleA, scaleB, spsel are forwarded to the atom type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>  // traits for the sparse 64x160x64 SS_TN atom: e5m2 A, e4m3 B, float C/D
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;  // B: plain float_e4m3_t, no sparse_elem wrapper
+  using ValTypeC = float;  // C element type, same as ValTypeD
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_160,_64>;  // M=64, N=160, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout: ABLayout with extents 64, 64
+  using ELayout = GMMA::ELayout_64x64;  // layout for the E operand
+  using BLayout = GMMA::ABLayout<160, 64>;  // B layout: ABLayout with extents 160, 64
+  using CLayout = GMMA::CLayout_64x160;  // C layout for the 64x160 tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // scaleA, scaleB, spsel are forwarded to the atom type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>  // traits for the sparse 64x160x64 RS_TN atom: e5m2 A, e4m3 B, float C/D
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;  // B: plain float_e4m3_t, no sparse_elem wrapper
+  using ValTypeC = float;  // C element type, same as ValTypeD
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc; no FrgTypeA is declared in this specialization
+
+  using Shape_MNK = Shape<_64,_160,_64>;  // M=64, N=160, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x64;  // A layout: dedicated ALayout_64x64 rather than ABLayout
+  using ELayout = GMMA::ELayout_64x64;  // layout for the E operand
+  using BLayout = GMMA::ABLayout<160, 64>;  // B layout: ABLayout with extents 160, 64
+  using CLayout = GMMA::CLayout_64x160;  // C layout for the 64x160 tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // scaleA, scaleB, spsel are forwarded to the atom type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>  // traits for the sparse 64x168x64 SS_TN atom: e5m2 A, e4m3 B, half_t C/D
+{
+  using ValTypeD = half_t;  // D element type
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;  // B: plain float_e4m3_t, no sparse_elem wrapper
+  using ValTypeC = half_t;  // C element type, same as ValTypeD
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_168,_64>;  // M=64, N=168, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout: ABLayout with extents 64, 64
+  using ELayout = GMMA::ELayout_64x64;  // layout for the E operand
+  using BLayout = GMMA::ABLayout<168, 64>;  // B layout: ABLayout with extents 168, 64
+  using CLayout = GMMA::CLayout_64x168;  // C layout for the 64x168 tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // scaleA, scaleB, spsel are forwarded to the atom type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>  // traits for the sparse 64x168x64 RS_TN atom: e5m2 A, e4m3 B, half_t C/D
+{
+  using ValTypeD = half_t;  // D element type
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;  // B: plain float_e4m3_t, no sparse_elem wrapper
+  using ValTypeC = half_t;  // C element type, same as ValTypeD
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc; no FrgTypeA is declared in this specialization
+
+  using Shape_MNK = Shape<_64,_168,_64>;  // M=64, N=168, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x64;  // A layout: dedicated ALayout_64x64 rather than ABLayout
+  using ELayout = GMMA::ELayout_64x64;  // layout for the E operand
+  using BLayout = GMMA::ABLayout<168, 64>;  // B layout: ABLayout with extents 168, 64
+  using CLayout = GMMA::CLayout_64x168;  // C layout for the 64x168 tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // scaleA, scaleB, spsel are forwarded to the atom type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>  // traits for the sparse 64x168x64 SS_TN atom: e5m2 A, e4m3 B, float C/D
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;  // B: plain float_e4m3_t, no sparse_elem wrapper
+  using ValTypeC = float;  // C element type, same as ValTypeD
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_168,_64>;  // M=64, N=168, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout: ABLayout with extents 64, 64
+  using ELayout = GMMA::ELayout_64x64;  // layout for the E operand
+  using BLayout = GMMA::ABLayout<168, 64>;  // B layout: ABLayout with extents 168, 64
+  using CLayout = GMMA::CLayout_64x168;  // C layout for the 64x168 tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // scaleA, scaleB, spsel are forwarded to the atom type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>  // traits for the sparse 64x168x64 RS_TN atom: e5m2 A, e4m3 B, float C/D
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;  // B: plain float_e4m3_t, no sparse_elem wrapper
+  using ValTypeC = float;  // C element type, same as ValTypeD
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc; no FrgTypeA is declared in this specialization
+
+  using Shape_MNK = Shape<_64,_168,_64>;  // M=64, N=168, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x64;  // A layout: dedicated ALayout_64x64 rather than ABLayout
+  using ELayout = GMMA::ELayout_64x64;  // layout for the E operand
+  using BLayout = GMMA::ABLayout<168, 64>;  // B layout: ABLayout with extents 168, 64
+  using CLayout = GMMA::CLayout_64x168;  // C layout for the 64x168 tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // scaleA, scaleB, spsel are forwarded to the atom type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>  // traits for the sparse 64x176x64 SS_TN atom: e5m2 A, e4m3 B, half_t C/D
+{
+  using ValTypeD = half_t;  // D element type
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;  // B: plain float_e4m3_t, no sparse_elem wrapper
+  using ValTypeC = half_t;  // C element type, same as ValTypeD
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_176,_64>;  // M=64, N=176, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout: ABLayout with extents 64, 64
+  using ELayout = GMMA::ELayout_64x64;  // layout for the E operand
+  using BLayout = GMMA::ABLayout<176, 64>;  // B layout: ABLayout with extents 176, 64
+  using CLayout = GMMA::CLayout_64x176;  // C layout for the 64x176 tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // scaleA, scaleB, spsel are forwarded to the atom type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>  // traits for the sparse 64x176x64 RS_TN atom: e5m2 A, e4m3 B, half_t C/D
+{
+  using ValTypeD = half_t;  // D element type
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;  // B: plain float_e4m3_t, no sparse_elem wrapper
+  using ValTypeC = half_t;  // C element type, same as ValTypeD
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc; no FrgTypeA is declared in this specialization
+
+  using Shape_MNK = Shape<_64,_176,_64>;  // M=64, N=176, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x64;  // A layout: dedicated ALayout_64x64 rather than ABLayout
+  using ELayout = GMMA::ELayout_64x64;  // layout for the E operand
+  using BLayout = GMMA::ABLayout<176, 64>;  // B layout: ABLayout with extents 176, 64
+  using CLayout = GMMA::CLayout_64x176;  // C layout for the 64x176 tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // scaleA, scaleB, spsel are forwarded to the atom type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>  // traits for the sparse 64x176x64 SS_TN atom: e5m2 A, e4m3 B, float C/D
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;  // B: plain float_e4m3_t, no sparse_elem wrapper
+  using ValTypeC = float;  // C element type, same as ValTypeD
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_176,_64>;  // M=64, N=176, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout: ABLayout with extents 64, 64
+  using ELayout = GMMA::ELayout_64x64;  // layout for the E operand
+  using BLayout = GMMA::ABLayout<176, 64>;  // B layout: ABLayout with extents 176, 64
+  using CLayout = GMMA::CLayout_64x176;  // C layout for the 64x176 tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // scaleA, scaleB, spsel are forwarded to the atom type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>  // traits for the sparse 64x176x64 RS_TN atom: e5m2 A, e4m3 B, float C/D
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;  // B: plain float_e4m3_t, no sparse_elem wrapper
+  using ValTypeC = float;  // C element type, same as ValTypeD
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc; no FrgTypeA is declared in this specialization
+
+  using Shape_MNK = Shape<_64,_176,_64>;  // M=64, N=176, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x64;  // A layout: dedicated ALayout_64x64 rather than ABLayout
+  using ELayout = GMMA::ELayout_64x64;  // layout for the E operand
+  using BLayout = GMMA::ABLayout<176, 64>;  // B layout: ABLayout with extents 176, 64
+  using CLayout = GMMA::CLayout_64x176;  // C layout for the 64x176 tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // scaleA, scaleB, spsel are forwarded to the atom type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>  // traits for the sparse 64x184x64 SS_TN atom: e5m2 A, e4m3 B, half_t C/D
+{
+  using ValTypeD = half_t;  // D element type
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;  // B: plain float_e4m3_t, no sparse_elem wrapper
+  using ValTypeC = half_t;  // C element type, same as ValTypeD
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_184,_64>;  // M=64, N=184, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout: ABLayout with extents 64, 64
+  using ELayout = GMMA::ELayout_64x64;  // layout for the E operand
+  using BLayout = GMMA::ABLayout<184, 64>;  // B layout: ABLayout with extents 184, 64
+  using CLayout = GMMA::CLayout_64x184;  // C layout for the 64x184 tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // scaleA, scaleB, spsel are forwarded to the atom type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>  // traits for the sparse 64x184x64 RS_TN atom: e5m2 A, e4m3 B, half_t C/D
+{
+  using ValTypeD = half_t;  // D element type
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;  // B: plain float_e4m3_t, no sparse_elem wrapper
+  using ValTypeC = half_t;  // C element type, same as ValTypeD
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc; no FrgTypeA is declared in this specialization
+
+  using Shape_MNK = Shape<_64,_184,_64>;  // M=64, N=184, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x64;  // A layout: dedicated ALayout_64x64 rather than ABLayout
+  using ELayout = GMMA::ELayout_64x64;  // layout for the E operand
+  using BLayout = GMMA::ABLayout<184, 64>;  // B layout: ABLayout with extents 184, 64
+  using CLayout = GMMA::CLayout_64x184;  // C layout for the 64x184 tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // scaleA, scaleB, spsel are forwarded to the atom type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>  // traits for the sparse 64x184x64 SS_TN atom: e5m2 A, e4m3 B, float C/D
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;  // B: plain float_e4m3_t, no sparse_elem wrapper
+  using ValTypeC = float;  // C element type, same as ValTypeD
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_184,_64>;  // M=64, N=184, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout: ABLayout with extents 64, 64
+  using ELayout = GMMA::ELayout_64x64;  // layout for the E operand
+  using BLayout = GMMA::ABLayout<184, 64>;  // B layout: ABLayout with extents 184, 64
+  using CLayout = GMMA::CLayout_64x184;  // C layout for the 64x184 tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // scaleA, scaleB, spsel are forwarded to the atom type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>  // traits for the sparse 64x184x64 RS_TN atom: e5m2 A, e4m3 B, float C/D
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;  // B: plain float_e4m3_t, no sparse_elem wrapper
+  using ValTypeC = float;  // C element type, same as ValTypeD
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc; no FrgTypeA is declared in this specialization
+
+  using Shape_MNK = Shape<_64,_184,_64>;  // M=64, N=184, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x64;  // A layout: dedicated ALayout_64x64 rather than ABLayout
+  using ELayout = GMMA::ELayout_64x64;  // layout for the E operand
+  using BLayout = GMMA::ABLayout<184, 64>;  // B layout: ABLayout with extents 184, 64
+  using CLayout = GMMA::CLayout_64x184;  // C layout for the 64x184 tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // scaleA, scaleB, spsel are forwarded to the atom type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>  // traits for the sparse 64x200x64 SS_TN atom: e5m2 A, e4m3 B, half_t C/D
+{
+  using ValTypeD = half_t;  // D element type
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;  // B: plain float_e4m3_t, no sparse_elem wrapper
+  using ValTypeC = half_t;  // C element type, same as ValTypeD
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_200,_64>;  // M=64, N=200, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout: ABLayout with extents 64, 64
+  using ELayout = GMMA::ELayout_64x64;  // layout for the E operand
+  using BLayout = GMMA::ABLayout<200, 64>;  // B layout: ABLayout with extents 200, 64
+  using CLayout = GMMA::CLayout_64x200;  // C layout for the 64x200 tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // scaleA, scaleB, spsel are forwarded to the atom type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>  // traits for the sparse 64x200x64 RS_TN atom: e5m2 A, e4m3 B, half_t C/D
+{
+  using ValTypeD = half_t;  // D element type
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;  // B: plain float_e4m3_t, no sparse_elem wrapper
+  using ValTypeC = half_t;  // C element type, same as ValTypeD
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc; no FrgTypeA is declared in this specialization
+
+  using Shape_MNK = Shape<_64,_200,_64>;  // M=64, N=200, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x64;  // A layout: dedicated ALayout_64x64 rather than ABLayout
+  using ELayout = GMMA::ELayout_64x64;  // layout for the E operand
+  using BLayout = GMMA::ABLayout<200, 64>;  // B layout: ABLayout with extents 200, 64
+  using CLayout = GMMA::CLayout_64x200;  // C layout for the 64x200 tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // scaleA, scaleB, spsel are forwarded to the atom type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>  // traits for the sparse 64x200x64 SS_TN atom: e5m2 A, e4m3 B, float C/D
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;  // B: plain float_e4m3_t, no sparse_elem wrapper
+  using ValTypeC = float;  // C element type, same as ValTypeD
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_200,_64>;  // M=64, N=200, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout: ABLayout with extents 64, 64
+  using ELayout = GMMA::ELayout_64x64;  // layout for the E operand
+  using BLayout = GMMA::ABLayout<200, 64>;  // B layout: ABLayout with extents 200, 64
+  using CLayout = GMMA::CLayout_64x200;  // C layout for the 64x200 tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // scaleA, scaleB, spsel are forwarded to the atom type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>  // traits for the sparse 64x200x64 RS_TN atom: e5m2 A, e4m3 B, float C/D
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;  // B: plain float_e4m3_t, no sparse_elem wrapper
+  using ValTypeC = float;  // C element type, same as ValTypeD
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc; no FrgTypeA is declared in this specialization
+
+  using Shape_MNK = Shape<_64,_200,_64>;  // M=64, N=200, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x64;  // A layout: dedicated ALayout_64x64 rather than ABLayout
+  using ELayout = GMMA::ELayout_64x64;  // layout for the E operand
+  using BLayout = GMMA::ABLayout<200, 64>;  // B layout: ABLayout with extents 200, 64
+  using CLayout = GMMA::CLayout_64x200;  // C layout for the 64x200 tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // scaleA, scaleB, spsel are forwarded to the atom type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>  // traits for the sparse 64x208x64 SS_TN atom: e5m2 A, e4m3 B, half_t C/D
+{
+  using ValTypeD = half_t;  // D element type
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;  // B: plain float_e4m3_t, no sparse_elem wrapper
+  using ValTypeC = half_t;  // C element type, same as ValTypeD
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_208,_64>;  // M=64, N=208, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout: ABLayout with extents 64, 64
+  using ELayout = GMMA::ELayout_64x64;  // layout for the E operand
+  using BLayout = GMMA::ABLayout<208, 64>;  // B layout: ABLayout with extents 208, 64
+  using CLayout = GMMA::CLayout_64x208;  // C layout for the 64x208 tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // scaleA, scaleB, spsel are forwarded to the atom type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>  // traits for the sparse 64x208x64 RS_TN atom: e5m2 A, e4m3 B, half_t C/D
+{
+  using ValTypeD = half_t;  // D element type
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;  // B: plain float_e4m3_t, no sparse_elem wrapper
+  using ValTypeC = half_t;  // C element type, same as ValTypeD
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc; no FrgTypeA is declared in this specialization
+
+  using Shape_MNK = Shape<_64,_208,_64>;  // M=64, N=208, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x64;  // A layout: dedicated ALayout_64x64 rather than ABLayout
+  using ELayout = GMMA::ELayout_64x64;  // layout for the E operand
+  using BLayout = GMMA::ABLayout<208, 64>;  // B layout: ABLayout with extents 208, 64
+  using CLayout = GMMA::CLayout_64x208;  // C layout for the 64x208 tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // scaleA, scaleB, spsel are forwarded to the atom type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>  // traits for the sparse 64x208x64 SS_TN atom: e5m2 A, e4m3 B, float C/D
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;  // B: plain float_e4m3_t, no sparse_elem wrapper
+  using ValTypeC = float;  // C element type, same as ValTypeD
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_208,_64>;  // M=64, N=208, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout: ABLayout with extents 64, 64
+  using ELayout = GMMA::ELayout_64x64;  // layout for the E operand
+  using BLayout = GMMA::ABLayout<208, 64>;  // B layout: ABLayout with extents 208, 64
+  using CLayout = GMMA::CLayout_64x208;  // C layout for the 64x208 tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // scaleA, scaleB, spsel are forwarded to the atom type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>  // traits for the sparse 64x208x64 RS_TN atom: e5m2 A, e4m3 B, float C/D
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;  // B: plain float_e4m3_t, no sparse_elem wrapper
+  using ValTypeC = float;  // C element type, same as ValTypeD
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc; no FrgTypeA is declared in this specialization
+
+  using Shape_MNK = Shape<_64,_208,_64>;  // M=64, N=208, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x64;  // A layout: dedicated ALayout_64x64 rather than ABLayout
+  using ELayout = GMMA::ELayout_64x64;  // layout for the E operand
+  using BLayout = GMMA::ABLayout<208, 64>;  // B layout: ABLayout with extents 208, 64
+  using CLayout = GMMA::CLayout_64x208;  // C layout for the 64x208 tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // scaleA, scaleB, spsel are forwarded to the atom type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>  // traits for the sparse 64x216x64 SS_TN atom: e5m2 A, e4m3 B, half_t C/D
+{
+  using ValTypeD = half_t;  // D element type
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;  // B: plain float_e4m3_t, no sparse_elem wrapper
+  using ValTypeC = half_t;  // C element type, same as ValTypeD
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_216,_64>;  // M=64, N=216, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout: ABLayout with extents 64, 64
+  using ELayout = GMMA::ELayout_64x64;  // layout for the E operand
+  using BLayout = GMMA::ABLayout<216, 64>;  // B layout: ABLayout with extents 216, 64
+  using CLayout = GMMA::CLayout_64x216;  // C layout for the 64x216 tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // scaleA, scaleB, spsel are forwarded to the atom type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>  // traits for the sparse 64x216x64 RS_TN atom: e5m2 A, e4m3 B, half_t C/D
+{
+  using ValTypeD = half_t;  // D element type
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;  // B: plain float_e4m3_t, no sparse_elem wrapper
+  using ValTypeC = half_t;  // C element type, same as ValTypeD
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc; no FrgTypeA is declared in this specialization
+
+  using Shape_MNK = Shape<_64,_216,_64>;  // M=64, N=216, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x64;  // A layout: dedicated ALayout_64x64 rather than ABLayout
+  using ELayout = GMMA::ELayout_64x64;  // layout for the E operand
+  using BLayout = GMMA::ABLayout<216, 64>;  // B layout: ABLayout with extents 216, 64
+  using CLayout = GMMA::CLayout_64x216;  // C layout for the 64x216 tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // scaleA, scaleB, spsel are forwarded to the atom type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>  // traits for the sparse 64x216x64 SS_TN atom: e5m2 A, e4m3 B, float C/D
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;  // B: plain float_e4m3_t, no sparse_elem wrapper
+  using ValTypeC = float;  // C element type, same as ValTypeD
+
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;  // A fragment: K-major smem_desc
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc
+
+  using Shape_MNK = Shape<_64,_216,_64>;  // M=64, N=216, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ABLayout< 64, 64>;  // A layout: ABLayout with extents 64, 64
+  using ELayout = GMMA::ELayout_64x64;  // layout for the E operand
+  using BLayout = GMMA::ABLayout<216, 64>;  // B layout: ABLayout with extents 216, 64
+  using CLayout = GMMA::CLayout_64x216;  // C layout for the 64x216 tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>  // scaleA, scaleB, spsel are forwarded to the atom type
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>  // traits for the sparse 64x216x64 RS_TN atom: e5m2 A, e4m3 B, float C/D
+{
+  using ValTypeD = float;  // D element type
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // A: float_e5m2_t wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;  // E: uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- confirm
+  using ValTypeB = float_e4m3_t;  // B: plain float_e4m3_t, no sparse_elem wrapper
+  using ValTypeC = float;  // C element type, same as ValTypeD
+
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;  // B fragment: K-major smem_desc; no FrgTypeA is declared in this specialization
+
+  using Shape_MNK = Shape<_64,_216,_64>;  // M=64, N=216, K=64
+  using ThrID   = Layout<_128>;  // thread-ID layout of size 128
+  using ALayout = GMMA::ALayout_64x64;  // A layout: dedicated ALayout_64x64 rather than ABLayout
+  using ELayout = GMMA::ELayout_64x64;  // layout for the E operand
+  using BLayout = GMMA::ABLayout<216, 64>;  // B layout: ABLayout with extents 216, 64
+  using CLayout = GMMA::CLayout_64x216;  // C layout for the 64x216 tile
+
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // non-static member; defaults to ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{
+  // Sparse GMMA traits: Shape_MNK extents and the ThrID layout.
+  using Shape_MNK = Shape<_64,_224,_64>;
+  using ThrID     = Layout<_128>;
+  // Value types bound to the A, E, B, C and D aliases.
+  using ValTypeA  = sparse_elem<2, float_e5m2_t>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeB  = float_e4m3_t;
+  using ValTypeC  = half_t;
+  using ValTypeD  = half_t;
+  // FrgTypeA and FrgTypeB are both K-major smem_desc; A/E/B/C layouts follow.
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using ELayout   = GMMA::ELayout_64x64;
+  using BLayout   = GMMA::ABLayout<224, 64>;
+  using CLayout   = GMMA::CLayout_64x224;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{
+  // Sparse GMMA traits: Shape_MNK extents and the ThrID layout.
+  using Shape_MNK = Shape<_64,_224,_64>;
+  using ThrID     = Layout<_128>;
+  // Value types bound to the A, E, B, C and D aliases.
+  using ValTypeA  = sparse_elem<2, float_e5m2_t>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeB  = float_e4m3_t;
+  using ValTypeC  = half_t;
+  using ValTypeD  = half_t;
+  // Only FrgTypeB is declared (K-major smem_desc); A/E/B/C layouts follow.
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using ELayout   = GMMA::ELayout_64x64;
+  using BLayout   = GMMA::ABLayout<224, 64>;
+  using CLayout   = GMMA::CLayout_64x224;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{
+  // Sparse GMMA traits: Shape_MNK extents and the ThrID layout.
+  using Shape_MNK = Shape<_64,_224,_64>;
+  using ThrID     = Layout<_128>;
+  // Value types bound to the A, E, B, C and D aliases.
+  using ValTypeA  = sparse_elem<2, float_e5m2_t>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeB  = float_e4m3_t;
+  using ValTypeC  = float;
+  using ValTypeD  = float;
+  // FrgTypeA and FrgTypeB are both K-major smem_desc; A/E/B/C layouts follow.
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using ELayout   = GMMA::ELayout_64x64;
+  using BLayout   = GMMA::ABLayout<224, 64>;
+  using CLayout   = GMMA::CLayout_64x224;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{
+  // Sparse GMMA traits: Shape_MNK extents and the ThrID layout.
+  using Shape_MNK = Shape<_64,_224,_64>;
+  using ThrID     = Layout<_128>;
+  // Value types bound to the A, E, B, C and D aliases.
+  using ValTypeA  = sparse_elem<2, float_e5m2_t>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeB  = float_e4m3_t;
+  using ValTypeC  = float;
+  using ValTypeD  = float;
+  // Only FrgTypeB is declared (K-major smem_desc); A/E/B/C layouts follow.
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using ELayout   = GMMA::ELayout_64x64;
+  using BLayout   = GMMA::ABLayout<224, 64>;
+  using CLayout   = GMMA::CLayout_64x224;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{
+  // Sparse GMMA traits: Shape_MNK extents and the ThrID layout.
+  using Shape_MNK = Shape<_64,_232,_64>;
+  using ThrID     = Layout<_128>;
+  // Value types bound to the A, E, B, C and D aliases.
+  using ValTypeA  = sparse_elem<2, float_e5m2_t>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeB  = float_e4m3_t;
+  using ValTypeC  = half_t;
+  using ValTypeD  = half_t;
+  // FrgTypeA and FrgTypeB are both K-major smem_desc; A/E/B/C layouts follow.
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using ELayout   = GMMA::ELayout_64x64;
+  using BLayout   = GMMA::ABLayout<232, 64>;
+  using CLayout   = GMMA::CLayout_64x232;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{
+  // Sparse GMMA traits: Shape_MNK extents and the ThrID layout.
+  using Shape_MNK = Shape<_64,_232,_64>;
+  using ThrID     = Layout<_128>;
+  // Value types bound to the A, E, B, C and D aliases.
+  using ValTypeA  = sparse_elem<2, float_e5m2_t>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeB  = float_e4m3_t;
+  using ValTypeC  = half_t;
+  using ValTypeD  = half_t;
+  // Only FrgTypeB is declared (K-major smem_desc); A/E/B/C layouts follow.
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using ELayout   = GMMA::ELayout_64x64;
+  using BLayout   = GMMA::ABLayout<232, 64>;
+  using CLayout   = GMMA::CLayout_64x232;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{
+  // Sparse GMMA traits: Shape_MNK extents and the ThrID layout.
+  using Shape_MNK = Shape<_64,_232,_64>;
+  using ThrID     = Layout<_128>;
+  // Value types bound to the A, E, B, C and D aliases.
+  using ValTypeA  = sparse_elem<2, float_e5m2_t>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeB  = float_e4m3_t;
+  using ValTypeC  = float;
+  using ValTypeD  = float;
+  // FrgTypeA and FrgTypeB are both K-major smem_desc; A/E/B/C layouts follow.
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using ELayout   = GMMA::ELayout_64x64;
+  using BLayout   = GMMA::ABLayout<232, 64>;
+  using CLayout   = GMMA::CLayout_64x232;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{
+  // Sparse GMMA traits: Shape_MNK extents and the ThrID layout.
+  using Shape_MNK = Shape<_64,_232,_64>;
+  using ThrID     = Layout<_128>;
+  // Value types bound to the A, E, B, C and D aliases.
+  using ValTypeA  = sparse_elem<2, float_e5m2_t>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeB  = float_e4m3_t;
+  using ValTypeC  = float;
+  using ValTypeD  = float;
+  // Only FrgTypeB is declared (K-major smem_desc); A/E/B/C layouts follow.
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using ELayout   = GMMA::ELayout_64x64;
+  using BLayout   = GMMA::ABLayout<232, 64>;
+  using CLayout   = GMMA::CLayout_64x232;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{
+  // Sparse GMMA traits: Shape_MNK extents and the ThrID layout.
+  using Shape_MNK = Shape<_64,_240,_64>;
+  using ThrID     = Layout<_128>;
+  // Value types bound to the A, E, B, C and D aliases.
+  using ValTypeA  = sparse_elem<2, float_e5m2_t>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeB  = float_e4m3_t;
+  using ValTypeC  = half_t;
+  using ValTypeD  = half_t;
+  // FrgTypeA and FrgTypeB are both K-major smem_desc; A/E/B/C layouts follow.
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using ELayout   = GMMA::ELayout_64x64;
+  using BLayout   = GMMA::ABLayout<240, 64>;
+  using CLayout   = GMMA::CLayout_64x240;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{
+  // Sparse GMMA traits: Shape_MNK extents and the ThrID layout.
+  using Shape_MNK = Shape<_64,_240,_64>;
+  using ThrID     = Layout<_128>;
+  // Value types bound to the A, E, B, C and D aliases.
+  using ValTypeA  = sparse_elem<2, float_e5m2_t>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeB  = float_e4m3_t;
+  using ValTypeC  = half_t;
+  using ValTypeD  = half_t;
+  // Only FrgTypeB is declared (K-major smem_desc); A/E/B/C layouts follow.
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using ELayout   = GMMA::ELayout_64x64;
+  using BLayout   = GMMA::ABLayout<240, 64>;
+  using CLayout   = GMMA::CLayout_64x240;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{
+  // Sparse GMMA traits: Shape_MNK extents and the ThrID layout.
+  using Shape_MNK = Shape<_64,_240,_64>;
+  using ThrID     = Layout<_128>;
+  // Value types bound to the A, E, B, C and D aliases.
+  using ValTypeA  = sparse_elem<2, float_e5m2_t>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeB  = float_e4m3_t;
+  using ValTypeC  = float;
+  using ValTypeD  = float;
+  // FrgTypeA and FrgTypeB are both K-major smem_desc; A/E/B/C layouts follow.
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using ELayout   = GMMA::ELayout_64x64;
+  using BLayout   = GMMA::ABLayout<240, 64>;
+  using CLayout   = GMMA::CLayout_64x240;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{
+  // Sparse GMMA traits: Shape_MNK extents and the ThrID layout.
+  using Shape_MNK = Shape<_64,_240,_64>;
+  using ThrID     = Layout<_128>;
+  // Value types bound to the A, E, B, C and D aliases.
+  using ValTypeA  = sparse_elem<2, float_e5m2_t>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeB  = float_e4m3_t;
+  using ValTypeC  = float;
+  using ValTypeD  = float;
+  // Only FrgTypeB is declared (K-major smem_desc); A/E/B/C layouts follow.
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using ELayout   = GMMA::ELayout_64x64;
+  using BLayout   = GMMA::ABLayout<240, 64>;
+  using CLayout   = GMMA::CLayout_64x240;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{
+  // Sparse GMMA traits: Shape_MNK extents and the ThrID layout.
+  using Shape_MNK = Shape<_64,_248,_64>;
+  using ThrID     = Layout<_128>;
+  // Value types bound to the A, E, B, C and D aliases.
+  using ValTypeA  = sparse_elem<2, float_e5m2_t>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeB  = float_e4m3_t;
+  using ValTypeC  = half_t;
+  using ValTypeD  = half_t;
+  // FrgTypeA and FrgTypeB are both K-major smem_desc; A/E/B/C layouts follow.
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using ELayout   = GMMA::ELayout_64x64;
+  using BLayout   = GMMA::ABLayout<248, 64>;
+  using CLayout   = GMMA::CLayout_64x248;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{
+  // Sparse GMMA traits: Shape_MNK extents and the ThrID layout.
+  using Shape_MNK = Shape<_64,_248,_64>;
+  using ThrID     = Layout<_128>;
+  // Value types bound to the A, E, B, C and D aliases.
+  using ValTypeA  = sparse_elem<2, float_e5m2_t>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeB  = float_e4m3_t;
+  using ValTypeC  = half_t;
+  using ValTypeD  = half_t;
+  // Only FrgTypeB is declared (K-major smem_desc); A/E/B/C layouts follow.
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using ELayout   = GMMA::ELayout_64x64;
+  using BLayout   = GMMA::ABLayout<248, 64>;
+  using CLayout   = GMMA::CLayout_64x248;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
+{
+  // Sparse GMMA traits: Shape_MNK extents and the ThrID layout.
+  using Shape_MNK = Shape<_64,_248,_64>;
+  using ThrID     = Layout<_128>;
+  // Value types bound to the A, E, B, C and D aliases.
+  using ValTypeA  = sparse_elem<2, float_e5m2_t>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeB  = float_e4m3_t;
+  using ValTypeC  = float;
+  using ValTypeD  = float;
+  // FrgTypeA and FrgTypeB are both K-major smem_desc; A/E/B/C layouts follow.
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using ELayout   = GMMA::ELayout_64x64;
+  using BLayout   = GMMA::ABLayout<248, 64>;
+  using CLayout   = GMMA::CLayout_64x248;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
+{
+  // Sparse GMMA traits: Shape_MNK extents and the ThrID layout.
+  using Shape_MNK = Shape<_64,_248,_64>;
+  using ThrID     = Layout<_128>;
+  // Value types bound to the A, E, B, C and D aliases.
+  using ValTypeA  = sparse_elem<2, float_e5m2_t>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeB  = float_e4m3_t;
+  using ValTypeC  = float;
+  using ValTypeD  = float;
+  // Only FrgTypeB is declared (K-major smem_desc); A/E/B/C layouts follow.
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using ELayout   = GMMA::ELayout_64x64;
+  using BLayout   = GMMA::ABLayout<248, 64>;
+  using CLayout   = GMMA::CLayout_64x248;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{
+  // Sparse GMMA traits: Shape_MNK extents and the ThrID layout.
+  using Shape_MNK = Shape<_64,_24,_64>;
+  using ThrID     = Layout<_128>;
+  // Value types bound to the A, E, B, C and D aliases.
+  using ValTypeA  = sparse_elem<2, float_e5m2_t>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = half_t;
+  using ValTypeD  = half_t;
+  // FrgTypeA and FrgTypeB are both K-major smem_desc; A/E/B/C layouts follow.
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using ELayout   = GMMA::ELayout_64x64;
+  using BLayout   = GMMA::ABLayout<24, 64>;
+  using CLayout   = GMMA::CLayout_64x24;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{
+  // Sparse GMMA traits: Shape_MNK extents and the ThrID layout.
+  using Shape_MNK = Shape<_64,_24,_64>;
+  using ThrID     = Layout<_128>;
+  // Value types bound to the A, E, B, C and D aliases.
+  using ValTypeA  = sparse_elem<2, float_e5m2_t>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = half_t;
+  using ValTypeD  = half_t;
+  // Only FrgTypeB is declared (K-major smem_desc); A/E/B/C layouts follow.
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using ELayout   = GMMA::ELayout_64x64;
+  using BLayout   = GMMA::ABLayout<24, 64>;
+  using CLayout   = GMMA::CLayout_64x24;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{
+  // Sparse GMMA traits: Shape_MNK extents and the ThrID layout.
+  using Shape_MNK = Shape<_64,_24,_64>;
+  using ThrID     = Layout<_128>;
+  // Value types bound to the A, E, B, C and D aliases.
+  using ValTypeA  = sparse_elem<2, float_e5m2_t>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = float;
+  using ValTypeD  = float;
+  // FrgTypeA and FrgTypeB are both K-major smem_desc; A/E/B/C layouts follow.
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using ELayout   = GMMA::ELayout_64x64;
+  using BLayout   = GMMA::ABLayout<24, 64>;
+  using CLayout   = GMMA::CLayout_64x24;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{
+  // Sparse GMMA traits: Shape_MNK extents and the ThrID layout.
+  using Shape_MNK = Shape<_64,_24,_64>;
+  using ThrID     = Layout<_128>;
+  // Value types bound to the A, E, B, C and D aliases.
+  using ValTypeA  = sparse_elem<2, float_e5m2_t>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = float;
+  using ValTypeD  = float;
+  // Only FrgTypeB is declared (K-major smem_desc); A/E/B/C layouts follow.
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using ELayout   = GMMA::ELayout_64x64;
+  using BLayout   = GMMA::ABLayout<24, 64>;
+  using CLayout   = GMMA::CLayout_64x24;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{
+  // Sparse GMMA traits: Shape_MNK extents and the ThrID layout.
+  using Shape_MNK = Shape<_64,_40,_64>;
+  using ThrID     = Layout<_128>;
+  // Value types bound to the A, E, B, C and D aliases.
+  using ValTypeA  = sparse_elem<2, float_e5m2_t>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = half_t;
+  using ValTypeD  = half_t;
+  // FrgTypeA and FrgTypeB are both K-major smem_desc; A/E/B/C layouts follow.
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using ELayout   = GMMA::ELayout_64x64;
+  using BLayout   = GMMA::ABLayout<40, 64>;
+  using CLayout   = GMMA::CLayout_64x40;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{
+  // Sparse GMMA traits: Shape_MNK extents and the ThrID layout.
+  using Shape_MNK = Shape<_64,_40,_64>;
+  using ThrID     = Layout<_128>;
+  // Value types bound to the A, E, B, C and D aliases.
+  using ValTypeA  = sparse_elem<2, float_e5m2_t>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = half_t;
+  using ValTypeD  = half_t;
+  // Only FrgTypeB is declared (K-major smem_desc); A/E/B/C layouts follow.
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using ELayout   = GMMA::ELayout_64x64;
+  using BLayout   = GMMA::ABLayout<40, 64>;
+  using CLayout   = GMMA::CLayout_64x40;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{
+  // Sparse GMMA traits: Shape_MNK extents and the ThrID layout.
+  using Shape_MNK = Shape<_64,_40,_64>;
+  using ThrID     = Layout<_128>;
+  // Value types bound to the A, E, B, C and D aliases.
+  using ValTypeA  = sparse_elem<2, float_e5m2_t>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = float;
+  using ValTypeD  = float;
+  // FrgTypeA and FrgTypeB are both K-major smem_desc; A/E/B/C layouts follow.
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using ELayout   = GMMA::ELayout_64x64;
+  using BLayout   = GMMA::ABLayout<40, 64>;
+  using CLayout   = GMMA::CLayout_64x40;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{
+  // Sparse GMMA traits: Shape_MNK extents and the ThrID layout.
+  using Shape_MNK = Shape<_64,_40,_64>;
+  using ThrID     = Layout<_128>;
+  // Value types bound to the A, E, B, C and D aliases.
+  using ValTypeA  = sparse_elem<2, float_e5m2_t>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = float;
+  using ValTypeD  = float;
+  // Only FrgTypeB is declared (K-major smem_desc); A/E/B/C layouts follow.
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using ELayout   = GMMA::ELayout_64x64;
+  using BLayout   = GMMA::ABLayout<40, 64>;
+  using CLayout   = GMMA::CLayout_64x40;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{
+  // Sparse GMMA traits: Shape_MNK extents and the ThrID layout.
+  using Shape_MNK = Shape<_64,_48,_64>;
+  using ThrID     = Layout<_128>;
+  // Value types bound to the A, E, B, C and D aliases.
+  using ValTypeA  = sparse_elem<2, float_e5m2_t>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = half_t;
+  using ValTypeD  = half_t;
+  // FrgTypeA and FrgTypeB are both K-major smem_desc; A/E/B/C layouts follow.
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using ELayout   = GMMA::ELayout_64x64;
+  using BLayout   = GMMA::ABLayout<48, 64>;
+  using CLayout   = GMMA::CLayout_64x48;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{
+  // Sparse GMMA traits: Shape_MNK extents and the ThrID layout.
+  using Shape_MNK = Shape<_64,_48,_64>;
+  using ThrID     = Layout<_128>;
+  // Value types bound to the A, E, B, C and D aliases.
+  using ValTypeA  = sparse_elem<2, float_e5m2_t>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = half_t;
+  using ValTypeD  = half_t;
+  // Only FrgTypeB is declared (K-major smem_desc); A/E/B/C layouts follow.
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using ELayout   = GMMA::ELayout_64x64;
+  using BLayout   = GMMA::ABLayout<48, 64>;
+  using CLayout   = GMMA::CLayout_64x48;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{
+  // Sparse GMMA traits: Shape_MNK extents and the ThrID layout.
+  using Shape_MNK = Shape<_64,_48,_64>;
+  using ThrID     = Layout<_128>;
+  // Value types bound to the A, E, B, C and D aliases.
+  using ValTypeA  = sparse_elem<2, float_e5m2_t>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = float;
+  using ValTypeD  = float;
+  // FrgTypeA and FrgTypeB are both K-major smem_desc; A/E/B/C layouts follow.
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using ELayout   = GMMA::ELayout_64x64;
+  using BLayout   = GMMA::ABLayout<48, 64>;
+  using CLayout   = GMMA::CLayout_64x48;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{
+  // Sparse GMMA traits: Shape_MNK extents and the ThrID layout.
+  using Shape_MNK = Shape<_64,_48,_64>;
+  using ThrID     = Layout<_128>;
+  // Value types bound to the A, E, B, C and D aliases.
+  using ValTypeA  = sparse_elem<2, float_e5m2_t>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = float;
+  using ValTypeD  = float;
+  // Only FrgTypeB is declared (K-major smem_desc); A/E/B/C layouts follow.
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using ELayout   = GMMA::ELayout_64x64;
+  using BLayout   = GMMA::ABLayout<48, 64>;
+  using CLayout   = GMMA::CLayout_64x48;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{
+  // Sparse GMMA traits: Shape_MNK extents and the ThrID layout.
+  using Shape_MNK = Shape<_64,_56,_64>;
+  using ThrID     = Layout<_128>;
+  // Value types bound to the A, E, B, C and D aliases.
+  using ValTypeA  = sparse_elem<2, float_e5m2_t>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = half_t;
+  using ValTypeD  = half_t;
+  // FrgTypeA and FrgTypeB are both K-major smem_desc; A/E/B/C layouts follow.
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using ELayout   = GMMA::ELayout_64x64;
+  using BLayout   = GMMA::ABLayout<56, 64>;
+  using CLayout   = GMMA::CLayout_64x56;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{
+  // Sparse GMMA traits: Shape_MNK extents and the ThrID layout.
+  using Shape_MNK = Shape<_64,_56,_64>;
+  using ThrID     = Layout<_128>;
+  // Value types bound to the A, E, B, C and D aliases.
+  using ValTypeA  = sparse_elem<2, float_e5m2_t>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = half_t;
+  using ValTypeD  = half_t;
+  // Only FrgTypeB is declared (K-major smem_desc); A/E/B/C layouts follow.
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using ELayout   = GMMA::ELayout_64x64;
+  using BLayout   = GMMA::ABLayout<56, 64>;
+  using CLayout   = GMMA::CLayout_64x56;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{
+  // Sparse GMMA traits: Shape_MNK extents and the ThrID layout.
+  using Shape_MNK = Shape<_64,_56,_64>;
+  using ThrID     = Layout<_128>;
+  // Value types bound to the A, E, B, C and D aliases.
+  using ValTypeA  = sparse_elem<2, float_e5m2_t>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = float;
+  using ValTypeD  = float;
+  // FrgTypeA and FrgTypeB are both K-major smem_desc; A/E/B/C layouts follow.
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using ELayout   = GMMA::ELayout_64x64;
+  using BLayout   = GMMA::ABLayout<56, 64>;
+  using CLayout   = GMMA::CLayout_64x56;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{
+  // Sparse GMMA traits: Shape_MNK extents and the ThrID layout.
+  using Shape_MNK = Shape<_64,_56,_64>;
+  using ThrID     = Layout<_128>;
+  // Value types bound to the A, E, B, C and D aliases.
+  using ValTypeA  = sparse_elem<2, float_e5m2_t>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = float;
+  using ValTypeD  = float;
+  // Only FrgTypeB is declared (K-major smem_desc); A/E/B/C layouts follow.
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using ELayout   = GMMA::ELayout_64x64;
+  using BLayout   = GMMA::ABLayout<56, 64>;
+  using CLayout   = GMMA::CLayout_64x56;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{
+  // Sparse GMMA traits: Shape_MNK extents and the ThrID layout.
+  using Shape_MNK = Shape<_64,_72,_64>;
+  using ThrID     = Layout<_128>;
+  // Value types bound to the A, E, B, C and D aliases.
+  using ValTypeA  = sparse_elem<2, float_e5m2_t>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = half_t;
+  using ValTypeD  = half_t;
+  // FrgTypeA and FrgTypeB are both K-major smem_desc; A/E/B/C layouts follow.
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using ELayout   = GMMA::ELayout_64x64;
+  using BLayout   = GMMA::ABLayout<72, 64>;
+  using CLayout   = GMMA::CLayout_64x72;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{
+  // Sparse GMMA traits: Shape_MNK extents and the ThrID layout.
+  using Shape_MNK = Shape<_64,_72,_64>;
+  using ThrID     = Layout<_128>;
+  // Value types bound to the A, E, B, C and D aliases.
+  using ValTypeA  = sparse_elem<2, float_e5m2_t>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = half_t;
+  using ValTypeD  = half_t;
+  // Only FrgTypeB is declared (K-major smem_desc); A/E/B/C layouts follow.
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using ELayout   = GMMA::ELayout_64x64;
+  using BLayout   = GMMA::ABLayout<72, 64>;
+  using CLayout   = GMMA::CLayout_64x72;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{
+  // Sparse GMMA traits: Shape_MNK extents and the ThrID layout.
+  using Shape_MNK = Shape<_64,_72,_64>;
+  using ThrID     = Layout<_128>;
+  // Value types bound to the A, E, B, C and D aliases.
+  using ValTypeA  = sparse_elem<2, float_e5m2_t>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = float;
+  using ValTypeD  = float;
+  // FrgTypeA and FrgTypeB are both K-major smem_desc; A/E/B/C layouts follow.
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using ELayout   = GMMA::ELayout_64x64;
+  using BLayout   = GMMA::ABLayout<72, 64>;
+  using CLayout   = GMMA::CLayout_64x72;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{
+  // Sparse GMMA traits: Shape_MNK extents and the ThrID layout.
+  using Shape_MNK = Shape<_64,_72,_64>;
+  using ThrID     = Layout<_128>;
+  // Value types bound to the A, E, B, C and D aliases.
+  using ValTypeA  = sparse_elem<2, float_e5m2_t>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = float;
+  using ValTypeD  = float;
+  // Only FrgTypeB is declared (K-major smem_desc); A/E/B/C layouts follow.
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ALayout_64x64;
+  using ELayout   = GMMA::ELayout_64x64;
+  using BLayout   = GMMA::ABLayout<72, 64>;
+  using CLayout   = GMMA::CLayout_64x72;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{
+  // Sparse GMMA traits: Shape_MNK extents and the ThrID layout.
+  using Shape_MNK = Shape<_64,_80,_64>;
+  using ThrID     = Layout<_128>;
+  // Value types bound to the A, E, B, C and D aliases.
+  using ValTypeA  = sparse_elem<2, float_e5m2_t>;
+  using ValTypeE  = sparse_elem<8, uint8_t>;
+  using ValTypeB  = float_e5m2_t;
+  using ValTypeC  = half_t;
+  using ValTypeD  = half_t;
+  // FrgTypeA and FrgTypeB are both K-major smem_desc; A/E/B/C layouts follow.
+  using FrgTypeA  = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB  = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout   = GMMA::ABLayout<64, 64>;
+  using ELayout   = GMMA::ELayout_64x64;
+  using BLayout   = GMMA::ABLayout<80, 64>;
+  using CLayout   = GMMA::CLayout_64x80;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{
+  // SPARSE 64x80x64 op, RS form: tile extents as (M,N,K) plus the thread-ID layout.
+  using Shape_MNK = Shape<_64, _80, _64>;
+  using ThrID     = Layout<_128>;
+  // Element types per operand; C and D are both half_t.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Only B has a (K-major) smem_desc fragment type; this RS form declares no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ALayout_64x64;
+  using BLayout = GMMA::ABLayout<80, 64>;
+  using CLayout = GMMA::CLayout_64x80;
+  using ELayout = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial value: ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{
+  // SPARSE 64x80x64 op, SS form: tile extents as (M,N,K) plus the thread-ID layout.
+  using Shape_MNK = Shape<_64, _80, _64>;
+  using ThrID     = Layout<_128>;
+  // Element types per operand; C and D are both float.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Both A and B are described by K-major smem_desc fragments; operand layouts follow.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ABLayout<64, 64>;
+  using BLayout = GMMA::ABLayout<80, 64>;
+  using CLayout = GMMA::CLayout_64x80;
+  using ELayout = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial value: ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{
+  // SPARSE 64x80x64 op, RS form: tile extents as (M,N,K) plus the thread-ID layout.
+  using Shape_MNK = Shape<_64, _80, _64>;
+  using ThrID     = Layout<_128>;
+  // Element types per operand; C and D are both float.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Only B has a (K-major) smem_desc fragment type; this RS form declares no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ALayout_64x64;
+  using BLayout = GMMA::ABLayout<80, 64>;
+  using CLayout = GMMA::CLayout_64x80;
+  using ELayout = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial value: ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{
+  // SPARSE 64x88x64 op, SS form: tile extents as (M,N,K) plus the thread-ID layout.
+  using Shape_MNK = Shape<_64, _88, _64>;
+  using ThrID     = Layout<_128>;
+  // Element types per operand; C and D are both half_t.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Both A and B are described by K-major smem_desc fragments; operand layouts follow.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ABLayout<64, 64>;
+  using BLayout = GMMA::ABLayout<88, 64>;
+  using CLayout = GMMA::CLayout_64x88;
+  using ELayout = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial value: ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{
+  // SPARSE 64x88x64 op, RS form: tile extents as (M,N,K) plus the thread-ID layout.
+  using Shape_MNK = Shape<_64, _88, _64>;
+  using ThrID     = Layout<_128>;
+  // Element types per operand; C and D are both half_t.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Only B has a (K-major) smem_desc fragment type; this RS form declares no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ALayout_64x64;
+  using BLayout = GMMA::ABLayout<88, 64>;
+  using CLayout = GMMA::CLayout_64x88;
+  using ELayout = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial value: ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{
+  // SPARSE 64x88x64 op, SS form: tile extents as (M,N,K) plus the thread-ID layout.
+  using Shape_MNK = Shape<_64, _88, _64>;
+  using ThrID     = Layout<_128>;
+  // Element types per operand; C and D are both float.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Both A and B are described by K-major smem_desc fragments; operand layouts follow.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ABLayout<64, 64>;
+  using BLayout = GMMA::ABLayout<88, 64>;
+  using CLayout = GMMA::CLayout_64x88;
+  using ELayout = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial value: ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{
+  // SPARSE 64x88x64 op, RS form: tile extents as (M,N,K) plus the thread-ID layout.
+  using Shape_MNK = Shape<_64, _88, _64>;
+  using ThrID     = Layout<_128>;
+  // Element types per operand; C and D are both float.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Only B has a (K-major) smem_desc fragment type; this RS form declares no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ALayout_64x64;
+  using BLayout = GMMA::ABLayout<88, 64>;
+  using CLayout = GMMA::CLayout_64x88;
+  using ELayout = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial value: ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{
+  // SPARSE 64x104x64 op, SS form: tile extents as (M,N,K) plus the thread-ID layout.
+  using Shape_MNK = Shape<_64, _104, _64>;
+  using ThrID     = Layout<_128>;
+  // Element types per operand; C and D are both half_t.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Both A and B are described by K-major smem_desc fragments; operand layouts follow.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ABLayout<64, 64>;
+  using BLayout = GMMA::ABLayout<104, 64>;
+  using CLayout = GMMA::CLayout_64x104;
+  using ELayout = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial value: ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{
+  // SPARSE 64x104x64 op, RS form: tile extents as (M,N,K) plus the thread-ID layout.
+  using Shape_MNK = Shape<_64, _104, _64>;
+  using ThrID     = Layout<_128>;
+  // Element types per operand; C and D are both half_t.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Only B has a (K-major) smem_desc fragment type; this RS form declares no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ALayout_64x64;
+  using BLayout = GMMA::ABLayout<104, 64>;
+  using CLayout = GMMA::CLayout_64x104;
+  using ELayout = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial value: ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{
+  // SPARSE 64x104x64 op, SS form: tile extents as (M,N,K) plus the thread-ID layout.
+  using Shape_MNK = Shape<_64, _104, _64>;
+  using ThrID     = Layout<_128>;
+  // Element types per operand; C and D are both float.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Both A and B are described by K-major smem_desc fragments; operand layouts follow.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ABLayout<64, 64>;
+  using BLayout = GMMA::ABLayout<104, 64>;
+  using CLayout = GMMA::CLayout_64x104;
+  using ELayout = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial value: ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{
+  // SPARSE 64x104x64 op, RS form: tile extents as (M,N,K) plus the thread-ID layout.
+  using Shape_MNK = Shape<_64, _104, _64>;
+  using ThrID     = Layout<_128>;
+  // Element types per operand; C and D are both float.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Only B has a (K-major) smem_desc fragment type; this RS form declares no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ALayout_64x64;
+  using BLayout = GMMA::ABLayout<104, 64>;
+  using CLayout = GMMA::CLayout_64x104;
+  using ELayout = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial value: ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{
+  // SPARSE 64x112x64 op, SS form: tile extents as (M,N,K) plus the thread-ID layout.
+  using Shape_MNK = Shape<_64, _112, _64>;
+  using ThrID     = Layout<_128>;
+  // Element types per operand; C and D are both half_t.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Both A and B are described by K-major smem_desc fragments; operand layouts follow.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ABLayout<64, 64>;
+  using BLayout = GMMA::ABLayout<112, 64>;
+  using CLayout = GMMA::CLayout_64x112;
+  using ELayout = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial value: ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{
+  // SPARSE 64x112x64 op, RS form: tile extents as (M,N,K) plus the thread-ID layout.
+  using Shape_MNK = Shape<_64, _112, _64>;
+  using ThrID     = Layout<_128>;
+  // Element types per operand; C and D are both half_t.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Only B has a (K-major) smem_desc fragment type; this RS form declares no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ALayout_64x64;
+  using BLayout = GMMA::ABLayout<112, 64>;
+  using CLayout = GMMA::CLayout_64x112;
+  using ELayout = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial value: ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{
+  // SPARSE 64x112x64 op, SS form: tile extents as (M,N,K) plus the thread-ID layout.
+  using Shape_MNK = Shape<_64, _112, _64>;
+  using ThrID     = Layout<_128>;
+  // Element types per operand; C and D are both float.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Both A and B are described by K-major smem_desc fragments; operand layouts follow.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ABLayout<64, 64>;
+  using BLayout = GMMA::ABLayout<112, 64>;
+  using CLayout = GMMA::CLayout_64x112;
+  using ELayout = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial value: ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{
+  // SPARSE 64x112x64 op, RS form: tile extents as (M,N,K) plus the thread-ID layout.
+  using Shape_MNK = Shape<_64, _112, _64>;
+  using ThrID     = Layout<_128>;
+  // Element types per operand; C and D are both float.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Only B has a (K-major) smem_desc fragment type; this RS form declares no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ALayout_64x64;
+  using BLayout = GMMA::ABLayout<112, 64>;
+  using CLayout = GMMA::CLayout_64x112;
+  using ELayout = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial value: ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{
+  // SPARSE 64x120x64 op, SS form: tile extents as (M,N,K) plus the thread-ID layout.
+  using Shape_MNK = Shape<_64, _120, _64>;
+  using ThrID     = Layout<_128>;
+  // Element types per operand; C and D are both half_t.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Both A and B are described by K-major smem_desc fragments; operand layouts follow.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ABLayout<64, 64>;
+  using BLayout = GMMA::ABLayout<120, 64>;
+  using CLayout = GMMA::CLayout_64x120;
+  using ELayout = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial value: ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{
+  // SPARSE 64x120x64 op, RS form: tile extents as (M,N,K) plus the thread-ID layout.
+  using Shape_MNK = Shape<_64, _120, _64>;
+  using ThrID     = Layout<_128>;
+  // Element types per operand; C and D are both half_t.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Only B has a (K-major) smem_desc fragment type; this RS form declares no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ALayout_64x64;
+  using BLayout = GMMA::ABLayout<120, 64>;
+  using CLayout = GMMA::CLayout_64x120;
+  using ELayout = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial value: ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{
+  // SPARSE 64x120x64 op, SS form: tile extents as (M,N,K) plus the thread-ID layout.
+  using Shape_MNK = Shape<_64, _120, _64>;
+  using ThrID     = Layout<_128>;
+  // Element types per operand; C and D are both float.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Both A and B are described by K-major smem_desc fragments; operand layouts follow.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ABLayout<64, 64>;
+  using BLayout = GMMA::ABLayout<120, 64>;
+  using CLayout = GMMA::CLayout_64x120;
+  using ELayout = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial value: ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{
+  // SPARSE 64x120x64 op, RS form: tile extents as (M,N,K) plus the thread-ID layout.
+  using Shape_MNK = Shape<_64, _120, _64>;
+  using ThrID     = Layout<_128>;
+  // Element types per operand; C and D are both float.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Only B has a (K-major) smem_desc fragment type; this RS form declares no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ALayout_64x64;
+  using BLayout = GMMA::ABLayout<120, 64>;
+  using CLayout = GMMA::CLayout_64x120;
+  using ELayout = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial value: ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{
+  // SPARSE 64x136x64 op, SS form: tile extents as (M,N,K) plus the thread-ID layout.
+  using Shape_MNK = Shape<_64, _136, _64>;
+  using ThrID     = Layout<_128>;
+  // Element types per operand; C and D are both half_t.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Both A and B are described by K-major smem_desc fragments; operand layouts follow.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ABLayout<64, 64>;
+  using BLayout = GMMA::ABLayout<136, 64>;
+  using CLayout = GMMA::CLayout_64x136;
+  using ELayout = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial value: ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{
+  // SPARSE 64x136x64 op, RS form: tile extents as (M,N,K) plus the thread-ID layout.
+  using Shape_MNK = Shape<_64, _136, _64>;
+  using ThrID     = Layout<_128>;
+  // Element types per operand; C and D are both half_t.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Only B has a (K-major) smem_desc fragment type; this RS form declares no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ALayout_64x64;
+  using BLayout = GMMA::ABLayout<136, 64>;
+  using CLayout = GMMA::CLayout_64x136;
+  using ELayout = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial value: ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{
+  // SPARSE 64x136x64 op, SS form: tile extents as (M,N,K) plus the thread-ID layout.
+  using Shape_MNK = Shape<_64, _136, _64>;
+  using ThrID     = Layout<_128>;
+  // Element types per operand; C and D are both float.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Both A and B are described by K-major smem_desc fragments; operand layouts follow.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ABLayout<64, 64>;
+  using BLayout = GMMA::ABLayout<136, 64>;
+  using CLayout = GMMA::CLayout_64x136;
+  using ELayout = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial value: ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{
+  // SPARSE 64x136x64 op, RS form: tile extents as (M,N,K) plus the thread-ID layout.
+  using Shape_MNK = Shape<_64, _136, _64>;
+  using ThrID     = Layout<_128>;
+  // Element types per operand; C and D are both float.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Only B has a (K-major) smem_desc fragment type; this RS form declares no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ALayout_64x64;
+  using BLayout = GMMA::ABLayout<136, 64>;
+  using CLayout = GMMA::CLayout_64x136;
+  using ELayout = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial value: ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{
+  // SPARSE 64x144x64 op, SS form: tile extents as (M,N,K) plus the thread-ID layout.
+  using Shape_MNK = Shape<_64, _144, _64>;
+  using ThrID     = Layout<_128>;
+  // Element types per operand; C and D are both half_t.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Both A and B are described by K-major smem_desc fragments; operand layouts follow.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ABLayout<64, 64>;
+  using BLayout = GMMA::ABLayout<144, 64>;
+  using CLayout = GMMA::CLayout_64x144;
+  using ELayout = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial value: ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{
+  // SPARSE 64x144x64 op, RS form: tile extents as (M,N,K) plus the thread-ID layout.
+  using Shape_MNK = Shape<_64, _144, _64>;
+  using ThrID     = Layout<_128>;
+  // Element types per operand; C and D are both half_t.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Only B has a (K-major) smem_desc fragment type; this RS form declares no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ALayout_64x64;
+  using BLayout = GMMA::ABLayout<144, 64>;
+  using CLayout = GMMA::CLayout_64x144;
+  using ELayout = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial value: ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{
+  // SPARSE 64x144x64 op, SS form: tile extents as (M,N,K) plus the thread-ID layout.
+  using Shape_MNK = Shape<_64, _144, _64>;
+  using ThrID     = Layout<_128>;
+  // Element types per operand; C and D are both float.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Both A and B are described by K-major smem_desc fragments; operand layouts follow.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ABLayout<64, 64>;
+  using BLayout = GMMA::ABLayout<144, 64>;
+  using CLayout = GMMA::CLayout_64x144;
+  using ELayout = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial value: ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{
+  // SPARSE 64x144x64 op, RS form: tile extents as (M,N,K) plus the thread-ID layout.
+  using Shape_MNK = Shape<_64, _144, _64>;
+  using ThrID     = Layout<_128>;
+  // Element types per operand; C and D are both float.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Only B has a (K-major) smem_desc fragment type; this RS form declares no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ALayout_64x64;
+  using BLayout = GMMA::ABLayout<144, 64>;
+  using CLayout = GMMA::CLayout_64x144;
+  using ELayout = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial value: ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{
+  // SPARSE 64x152x64 op, SS form: tile extents as (M,N,K) plus the thread-ID layout.
+  using Shape_MNK = Shape<_64, _152, _64>;
+  using ThrID     = Layout<_128>;
+  // Element types per operand; C and D are both half_t.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Both A and B are described by K-major smem_desc fragments; operand layouts follow.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ABLayout<64, 64>;
+  using BLayout = GMMA::ABLayout<152, 64>;
+  using CLayout = GMMA::CLayout_64x152;
+  using ELayout = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial value: ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{
+  // SPARSE 64x152x64 op, RS form: tile extents as (M,N,K) plus the thread-ID layout.
+  using Shape_MNK = Shape<_64, _152, _64>;
+  using ThrID     = Layout<_128>;
+  // Element types per operand; C and D are both half_t.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Only B has a (K-major) smem_desc fragment type; this RS form declares no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ALayout_64x64;
+  using BLayout = GMMA::ABLayout<152, 64>;
+  using CLayout = GMMA::CLayout_64x152;
+  using ELayout = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial value: ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{
+  // SPARSE 64x152x64 op, SS form: tile extents as (M,N,K) plus the thread-ID layout.
+  using Shape_MNK = Shape<_64, _152, _64>;
+  using ThrID     = Layout<_128>;
+  // Element types per operand; C and D are both float.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Both A and B are described by K-major smem_desc fragments; operand layouts follow.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ABLayout<64, 64>;
+  using BLayout = GMMA::ABLayout<152, 64>;
+  using CLayout = GMMA::CLayout_64x152;
+  using ELayout = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial value: ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{
+  // SPARSE 64x152x64 op, RS form: tile extents as (M,N,K) plus the thread-ID layout.
+  using Shape_MNK = Shape<_64, _152, _64>;
+  using ThrID     = Layout<_128>;
+  // Element types per operand; C and D are both float.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Only B has a (K-major) smem_desc fragment type; this RS form declares no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ALayout_64x64;
+  using BLayout = GMMA::ABLayout<152, 64>;
+  using CLayout = GMMA::CLayout_64x152;
+  using ELayout = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial value: ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{
+  // SPARSE 64x160x64 op, SS form: tile extents as (M,N,K) plus the thread-ID layout.
+  using Shape_MNK = Shape<_64, _160, _64>;
+  using ThrID     = Layout<_128>;
+  // Element types per operand; C and D are both half_t.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Both A and B are described by K-major smem_desc fragments; operand layouts follow.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ABLayout<64, 64>;
+  using BLayout = GMMA::ABLayout<160, 64>;
+  using CLayout = GMMA::CLayout_64x160;
+  using ELayout = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial value: ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{
+  // SPARSE 64x160x64 op, RS form: tile extents as (M,N,K) plus the thread-ID layout.
+  using Shape_MNK = Shape<_64, _160, _64>;
+  using ThrID     = Layout<_128>;
+  // Element types per operand; C and D are both half_t.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Only B has a (K-major) smem_desc fragment type; this RS form declares no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ALayout_64x64;
+  using BLayout = GMMA::ABLayout<160, 64>;
+  using CLayout = GMMA::CLayout_64x160;
+  using ELayout = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial value: ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{
+  // SPARSE 64x160x64 op, SS form: tile extents as (M,N,K) plus the thread-ID layout.
+  using Shape_MNK = Shape<_64, _160, _64>;
+  using ThrID     = Layout<_128>;
+  // Element types per operand; C and D are both float.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Both A and B are described by K-major smem_desc fragments; operand layouts follow.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ABLayout<64, 64>;
+  using BLayout = GMMA::ABLayout<160, 64>;
+  using CLayout = GMMA::CLayout_64x160;
+  using ELayout = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial value: ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{
+  // SPARSE 64x160x64 op, RS form: tile extents as (M,N,K) plus the thread-ID layout.
+  using Shape_MNK = Shape<_64, _160, _64>;
+  using ThrID     = Layout<_128>;
+  // Element types per operand; C and D are both float.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Only B has a (K-major) smem_desc fragment type; this RS form declares no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ALayout_64x64;
+  using BLayout = GMMA::ABLayout<160, 64>;
+  using CLayout = GMMA::CLayout_64x160;
+  using ELayout = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial value: ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{
+  // SPARSE 64x168x64 op, SS form: tile extents as (M,N,K) plus the thread-ID layout.
+  using Shape_MNK = Shape<_64, _168, _64>;
+  using ThrID     = Layout<_128>;
+  // Element types per operand; C and D are both half_t.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Both A and B are described by K-major smem_desc fragments; operand layouts follow.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ABLayout<64, 64>;
+  using BLayout = GMMA::ABLayout<168, 64>;
+  using CLayout = GMMA::CLayout_64x168;
+  using ELayout = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial value: ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{
+  // SPARSE 64x168x64 op, RS form: tile extents as (M,N,K) plus the thread-ID layout.
+  using Shape_MNK = Shape<_64, _168, _64>;
+  using ThrID     = Layout<_128>;
+  // Element types per operand; C and D are both half_t.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = half_t;
+  using ValTypeD = half_t;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Only B has a (K-major) smem_desc fragment type; this RS form declares no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ALayout_64x64;
+  using BLayout = GMMA::ABLayout<168, 64>;
+  using CLayout = GMMA::CLayout_64x168;
+  using ELayout = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial value: ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{
+  // SPARSE 64x168x64 op, SS form: tile extents as (M,N,K) plus the thread-ID layout.
+  using Shape_MNK = Shape<_64, _168, _64>;
+  using ThrID     = Layout<_128>;
+  // Element types per operand; C and D are both float.
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;
+  using ValTypeB = float_e5m2_t;
+  using ValTypeC = float;
+  using ValTypeD = float;
+  using ValTypeE = sparse_elem<8, uint8_t>;
+  // Both A and B are described by K-major smem_desc fragments; operand layouts follow.
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  using ALayout = GMMA::ABLayout<64, 64>;
+  using BLayout = GMMA::ABLayout<168, 64>;
+  using CLayout = GMMA::CLayout_64x168;
+  using ELayout = GMMA::ELayout_64x64;
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;  // initial value: ScaleOut::One
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op GMMA_64x168x64_F32E5M2E5M2_RS_TN: e5m2 A/B values, float C/D values.
+  using ValTypeD = float;                         // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = float_e5m2_t;                  // value type of B
+  using ValTypeC = float;                         // value type of C, same as D
+  // Fragment types: only B is declared (GMMA::smem_desc, Major::K); this struct declares no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K) = (64, 168, 64), a size-128 thread-id layout, and the per-operand layouts.
+  using Shape_MNK = Shape<_64,_168,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<168, 64>;
+  using CLayout = GMMA::CLayout_64x168;
+  // Non-static member defaulting to ScaleOut::One (presumably selects accumulate vs. overwrite -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op GMMA_64x176x64_F16E5M2E5M2_SS_TN: e5m2 A/B values, half_t C/D values.
+  using ValTypeD = half_t;                        // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = float_e5m2_t;                  // value type of B
+  using ValTypeC = half_t;                        // value type of C, same as D
+  // Fragment types: A and B are both GMMA::smem_desc with Major::K (presumably shared-memory descriptors -- confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K) = (64, 176, 64), a size-128 thread-id layout, and the per-operand layouts.
+  using Shape_MNK = Shape<_64,_176,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<176, 64>;
+  using CLayout = GMMA::CLayout_64x176;
+  // Non-static member defaulting to ScaleOut::One (presumably selects accumulate vs. overwrite -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op GMMA_64x176x64_F16E5M2E5M2_RS_TN: e5m2 A/B values, half_t C/D values.
+  using ValTypeD = half_t;                        // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = float_e5m2_t;                  // value type of B
+  using ValTypeC = half_t;                        // value type of C, same as D
+  // Fragment types: only B is declared (GMMA::smem_desc, Major::K); this struct declares no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K) = (64, 176, 64), a size-128 thread-id layout, and the per-operand layouts.
+  using Shape_MNK = Shape<_64,_176,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<176, 64>;
+  using CLayout = GMMA::CLayout_64x176;
+  // Non-static member defaulting to ScaleOut::One (presumably selects accumulate vs. overwrite -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op GMMA_64x176x64_F32E5M2E5M2_SS_TN: e5m2 A/B values, float C/D values.
+  using ValTypeD = float;                         // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = float_e5m2_t;                  // value type of B
+  using ValTypeC = float;                         // value type of C, same as D
+  // Fragment types: A and B are both GMMA::smem_desc with Major::K (presumably shared-memory descriptors -- confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K) = (64, 176, 64), a size-128 thread-id layout, and the per-operand layouts.
+  using Shape_MNK = Shape<_64,_176,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<176, 64>;
+  using CLayout = GMMA::CLayout_64x176;
+  // Non-static member defaulting to ScaleOut::One (presumably selects accumulate vs. overwrite -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op GMMA_64x176x64_F32E5M2E5M2_RS_TN: e5m2 A/B values, float C/D values.
+  using ValTypeD = float;                         // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = float_e5m2_t;                  // value type of B
+  using ValTypeC = float;                         // value type of C, same as D
+  // Fragment types: only B is declared (GMMA::smem_desc, Major::K); this struct declares no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K) = (64, 176, 64), a size-128 thread-id layout, and the per-operand layouts.
+  using Shape_MNK = Shape<_64,_176,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<176, 64>;
+  using CLayout = GMMA::CLayout_64x176;
+  // Non-static member defaulting to ScaleOut::One (presumably selects accumulate vs. overwrite -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op GMMA_64x184x64_F16E5M2E5M2_SS_TN: e5m2 A/B values, half_t C/D values.
+  using ValTypeD = half_t;                        // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = float_e5m2_t;                  // value type of B
+  using ValTypeC = half_t;                        // value type of C, same as D
+  // Fragment types: A and B are both GMMA::smem_desc with Major::K (presumably shared-memory descriptors -- confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K) = (64, 184, 64), a size-128 thread-id layout, and the per-operand layouts.
+  using Shape_MNK = Shape<_64,_184,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<184, 64>;
+  using CLayout = GMMA::CLayout_64x184;
+  // Non-static member defaulting to ScaleOut::One (presumably selects accumulate vs. overwrite -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op GMMA_64x184x64_F16E5M2E5M2_RS_TN: e5m2 A/B values, half_t C/D values.
+  using ValTypeD = half_t;                        // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = float_e5m2_t;                  // value type of B
+  using ValTypeC = half_t;                        // value type of C, same as D
+  // Fragment types: only B is declared (GMMA::smem_desc, Major::K); this struct declares no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K) = (64, 184, 64), a size-128 thread-id layout, and the per-operand layouts.
+  using Shape_MNK = Shape<_64,_184,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<184, 64>;
+  using CLayout = GMMA::CLayout_64x184;
+  // Non-static member defaulting to ScaleOut::One (presumably selects accumulate vs. overwrite -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op GMMA_64x184x64_F32E5M2E5M2_SS_TN: e5m2 A/B values, float C/D values.
+  using ValTypeD = float;                         // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = float_e5m2_t;                  // value type of B
+  using ValTypeC = float;                         // value type of C, same as D
+  // Fragment types: A and B are both GMMA::smem_desc with Major::K (presumably shared-memory descriptors -- confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K) = (64, 184, 64), a size-128 thread-id layout, and the per-operand layouts.
+  using Shape_MNK = Shape<_64,_184,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<184, 64>;
+  using CLayout = GMMA::CLayout_64x184;
+  // Non-static member defaulting to ScaleOut::One (presumably selects accumulate vs. overwrite -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op GMMA_64x184x64_F32E5M2E5M2_RS_TN: e5m2 A/B values, float C/D values.
+  using ValTypeD = float;                         // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = float_e5m2_t;                  // value type of B
+  using ValTypeC = float;                         // value type of C, same as D
+  // Fragment types: only B is declared (GMMA::smem_desc, Major::K); this struct declares no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K) = (64, 184, 64), a size-128 thread-id layout, and the per-operand layouts.
+  using Shape_MNK = Shape<_64,_184,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<184, 64>;
+  using CLayout = GMMA::CLayout_64x184;
+  // Non-static member defaulting to ScaleOut::One (presumably selects accumulate vs. overwrite -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op GMMA_64x200x64_F16E5M2E5M2_SS_TN: e5m2 A/B values, half_t C/D values.
+  using ValTypeD = half_t;                        // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = float_e5m2_t;                  // value type of B
+  using ValTypeC = half_t;                        // value type of C, same as D
+  // Fragment types: A and B are both GMMA::smem_desc with Major::K (presumably shared-memory descriptors -- confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K) = (64, 200, 64), a size-128 thread-id layout, and the per-operand layouts.
+  using Shape_MNK = Shape<_64,_200,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<200, 64>;
+  using CLayout = GMMA::CLayout_64x200;
+  // Non-static member defaulting to ScaleOut::One (presumably selects accumulate vs. overwrite -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op GMMA_64x200x64_F16E5M2E5M2_RS_TN: e5m2 A/B values, half_t C/D values.
+  using ValTypeD = half_t;                        // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = float_e5m2_t;                  // value type of B
+  using ValTypeC = half_t;                        // value type of C, same as D
+  // Fragment types: only B is declared (GMMA::smem_desc, Major::K); this struct declares no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K) = (64, 200, 64), a size-128 thread-id layout, and the per-operand layouts.
+  using Shape_MNK = Shape<_64,_200,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<200, 64>;
+  using CLayout = GMMA::CLayout_64x200;
+  // Non-static member defaulting to ScaleOut::One (presumably selects accumulate vs. overwrite -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op GMMA_64x200x64_F32E5M2E5M2_SS_TN: e5m2 A/B values, float C/D values.
+  using ValTypeD = float;                         // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = float_e5m2_t;                  // value type of B
+  using ValTypeC = float;                         // value type of C, same as D
+  // Fragment types: A and B are both GMMA::smem_desc with Major::K (presumably shared-memory descriptors -- confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K) = (64, 200, 64), a size-128 thread-id layout, and the per-operand layouts.
+  using Shape_MNK = Shape<_64,_200,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<200, 64>;
+  using CLayout = GMMA::CLayout_64x200;
+  // Non-static member defaulting to ScaleOut::One (presumably selects accumulate vs. overwrite -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op GMMA_64x200x64_F32E5M2E5M2_RS_TN: e5m2 A/B values, float C/D values.
+  using ValTypeD = float;                         // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = float_e5m2_t;                  // value type of B
+  using ValTypeC = float;                         // value type of C, same as D
+  // Fragment types: only B is declared (GMMA::smem_desc, Major::K); this struct declares no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K) = (64, 200, 64), a size-128 thread-id layout, and the per-operand layouts.
+  using Shape_MNK = Shape<_64,_200,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<200, 64>;
+  using CLayout = GMMA::CLayout_64x200;
+  // Non-static member defaulting to ScaleOut::One (presumably selects accumulate vs. overwrite -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op GMMA_64x208x64_F16E5M2E5M2_SS_TN: e5m2 A/B values, half_t C/D values.
+  using ValTypeD = half_t;                        // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = float_e5m2_t;                  // value type of B
+  using ValTypeC = half_t;                        // value type of C, same as D
+  // Fragment types: A and B are both GMMA::smem_desc with Major::K (presumably shared-memory descriptors -- confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K) = (64, 208, 64), a size-128 thread-id layout, and the per-operand layouts.
+  using Shape_MNK = Shape<_64,_208,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<208, 64>;
+  using CLayout = GMMA::CLayout_64x208;
+  // Non-static member defaulting to ScaleOut::One (presumably selects accumulate vs. overwrite -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op GMMA_64x208x64_F16E5M2E5M2_RS_TN: e5m2 A/B values, half_t C/D values.
+  using ValTypeD = half_t;                        // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = float_e5m2_t;                  // value type of B
+  using ValTypeC = half_t;                        // value type of C, same as D
+  // Fragment types: only B is declared (GMMA::smem_desc, Major::K); this struct declares no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K) = (64, 208, 64), a size-128 thread-id layout, and the per-operand layouts.
+  using Shape_MNK = Shape<_64,_208,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<208, 64>;
+  using CLayout = GMMA::CLayout_64x208;
+  // Non-static member defaulting to ScaleOut::One (presumably selects accumulate vs. overwrite -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op GMMA_64x208x64_F32E5M2E5M2_SS_TN: e5m2 A/B values, float C/D values.
+  using ValTypeD = float;                         // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = float_e5m2_t;                  // value type of B
+  using ValTypeC = float;                         // value type of C, same as D
+  // Fragment types: A and B are both GMMA::smem_desc with Major::K (presumably shared-memory descriptors -- confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K) = (64, 208, 64), a size-128 thread-id layout, and the per-operand layouts.
+  using Shape_MNK = Shape<_64,_208,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<208, 64>;
+  using CLayout = GMMA::CLayout_64x208;
+  // Non-static member defaulting to ScaleOut::One (presumably selects accumulate vs. overwrite -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op GMMA_64x208x64_F32E5M2E5M2_RS_TN: e5m2 A/B values, float C/D values.
+  using ValTypeD = float;                         // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = float_e5m2_t;                  // value type of B
+  using ValTypeC = float;                         // value type of C, same as D
+  // Fragment types: only B is declared (GMMA::smem_desc, Major::K); this struct declares no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K) = (64, 208, 64), a size-128 thread-id layout, and the per-operand layouts.
+  using Shape_MNK = Shape<_64,_208,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<208, 64>;
+  using CLayout = GMMA::CLayout_64x208;
+  // Non-static member defaulting to ScaleOut::One (presumably selects accumulate vs. overwrite -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op GMMA_64x216x64_F16E5M2E5M2_SS_TN: e5m2 A/B values, half_t C/D values.
+  using ValTypeD = half_t;                        // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = float_e5m2_t;                  // value type of B
+  using ValTypeC = half_t;                        // value type of C, same as D
+  // Fragment types: A and B are both GMMA::smem_desc with Major::K (presumably shared-memory descriptors -- confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K) = (64, 216, 64), a size-128 thread-id layout, and the per-operand layouts.
+  using Shape_MNK = Shape<_64,_216,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<216, 64>;
+  using CLayout = GMMA::CLayout_64x216;
+  // Non-static member defaulting to ScaleOut::One (presumably selects accumulate vs. overwrite -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op GMMA_64x216x64_F16E5M2E5M2_RS_TN: e5m2 A/B values, half_t C/D values.
+  using ValTypeD = half_t;                        // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = float_e5m2_t;                  // value type of B
+  using ValTypeC = half_t;                        // value type of C, same as D
+  // Fragment types: only B is declared (GMMA::smem_desc, Major::K); this struct declares no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K) = (64, 216, 64), a size-128 thread-id layout, and the per-operand layouts.
+  using Shape_MNK = Shape<_64,_216,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<216, 64>;
+  using CLayout = GMMA::CLayout_64x216;
+  // Non-static member defaulting to ScaleOut::One (presumably selects accumulate vs. overwrite -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op GMMA_64x216x64_F32E5M2E5M2_SS_TN: e5m2 A/B values, float C/D values.
+  using ValTypeD = float;                         // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = float_e5m2_t;                  // value type of B
+  using ValTypeC = float;                         // value type of C, same as D
+  // Fragment types: A and B are both GMMA::smem_desc with Major::K (presumably shared-memory descriptors -- confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K) = (64, 216, 64), a size-128 thread-id layout, and the per-operand layouts.
+  using Shape_MNK = Shape<_64,_216,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<216, 64>;
+  using CLayout = GMMA::CLayout_64x216;
+  // Non-static member defaulting to ScaleOut::One (presumably selects accumulate vs. overwrite -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op GMMA_64x216x64_F32E5M2E5M2_RS_TN: e5m2 A/B values, float C/D values.
+  using ValTypeD = float;                         // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = float_e5m2_t;                  // value type of B
+  using ValTypeC = float;                         // value type of C, same as D
+  // Fragment types: only B is declared (GMMA::smem_desc, Major::K); this struct declares no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K) = (64, 216, 64), a size-128 thread-id layout, and the per-operand layouts.
+  using Shape_MNK = Shape<_64,_216,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<216, 64>;
+  using CLayout = GMMA::CLayout_64x216;
+  // Non-static member defaulting to ScaleOut::One (presumably selects accumulate vs. overwrite -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op GMMA_64x224x64_F16E5M2E5M2_SS_TN: e5m2 A/B values, half_t C/D values.
+  using ValTypeD = half_t;                        // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = float_e5m2_t;                  // value type of B
+  using ValTypeC = half_t;                        // value type of C, same as D
+  // Fragment types: A and B are both GMMA::smem_desc with Major::K (presumably shared-memory descriptors -- confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K) = (64, 224, 64), a size-128 thread-id layout, and the per-operand layouts.
+  using Shape_MNK = Shape<_64,_224,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<224, 64>;
+  using CLayout = GMMA::CLayout_64x224;
+  // Non-static member defaulting to ScaleOut::One (presumably selects accumulate vs. overwrite -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op GMMA_64x224x64_F16E5M2E5M2_RS_TN: e5m2 A/B values, half_t C/D values.
+  using ValTypeD = half_t;                        // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = float_e5m2_t;                  // value type of B
+  using ValTypeC = half_t;                        // value type of C, same as D
+  // Fragment types: only B is declared (GMMA::smem_desc, Major::K); this struct declares no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K) = (64, 224, 64), a size-128 thread-id layout, and the per-operand layouts.
+  using Shape_MNK = Shape<_64,_224,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<224, 64>;
+  using CLayout = GMMA::CLayout_64x224;
+  // Non-static member defaulting to ScaleOut::One (presumably selects accumulate vs. overwrite -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op GMMA_64x224x64_F32E5M2E5M2_SS_TN: e5m2 A/B values, float C/D values.
+  using ValTypeD = float;                         // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = float_e5m2_t;                  // value type of B
+  using ValTypeC = float;                         // value type of C, same as D
+  // Fragment types: A and B are both GMMA::smem_desc with Major::K (presumably shared-memory descriptors -- confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K) = (64, 224, 64), a size-128 thread-id layout, and the per-operand layouts.
+  using Shape_MNK = Shape<_64,_224,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<224, 64>;
+  using CLayout = GMMA::CLayout_64x224;
+  // Non-static member defaulting to ScaleOut::One (presumably selects accumulate vs. overwrite -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op GMMA_64x224x64_F32E5M2E5M2_RS_TN: e5m2 A/B values, float C/D values.
+  using ValTypeD = float;                         // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = float_e5m2_t;                  // value type of B
+  using ValTypeC = float;                         // value type of C, same as D
+  // Fragment types: only B is declared (GMMA::smem_desc, Major::K); this struct declares no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K) = (64, 224, 64), a size-128 thread-id layout, and the per-operand layouts.
+  using Shape_MNK = Shape<_64,_224,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<224, 64>;
+  using CLayout = GMMA::CLayout_64x224;
+  // Non-static member defaulting to ScaleOut::One (presumably selects accumulate vs. overwrite -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op GMMA_64x232x64_F16E5M2E5M2_SS_TN: e5m2 A/B values, half_t C/D values.
+  using ValTypeD = half_t;                        // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = float_e5m2_t;                  // value type of B
+  using ValTypeC = half_t;                        // value type of C, same as D
+  // Fragment types: A and B are both GMMA::smem_desc with Major::K (presumably shared-memory descriptors -- confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K) = (64, 232, 64), a size-128 thread-id layout, and the per-operand layouts.
+  using Shape_MNK = Shape<_64,_232,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<232, 64>;
+  using CLayout = GMMA::CLayout_64x232;
+  // Non-static member defaulting to ScaleOut::One (presumably selects accumulate vs. overwrite -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op GMMA_64x232x64_F16E5M2E5M2_RS_TN: e5m2 A/B values, half_t C/D values.
+  using ValTypeD = half_t;                        // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = float_e5m2_t;                  // value type of B
+  using ValTypeC = half_t;                        // value type of C, same as D
+  // Fragment types: only B is declared (GMMA::smem_desc, Major::K); this struct declares no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K) = (64, 232, 64), a size-128 thread-id layout, and the per-operand layouts.
+  using Shape_MNK = Shape<_64,_232,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<232, 64>;
+  using CLayout = GMMA::CLayout_64x232;
+  // Non-static member defaulting to ScaleOut::One (presumably selects accumulate vs. overwrite -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op GMMA_64x232x64_F32E5M2E5M2_SS_TN: e5m2 A/B values, float C/D values.
+  using ValTypeD = float;                         // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = float_e5m2_t;                  // value type of B
+  using ValTypeC = float;                         // value type of C, same as D
+  // Fragment types: A and B are both GMMA::smem_desc with Major::K (presumably shared-memory descriptors -- confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K) = (64, 232, 64), a size-128 thread-id layout, and the per-operand layouts.
+  using Shape_MNK = Shape<_64,_232,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<232, 64>;
+  using CLayout = GMMA::CLayout_64x232;
+  // Non-static member defaulting to ScaleOut::One (presumably selects accumulate vs. overwrite -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op GMMA_64x232x64_F32E5M2E5M2_RS_TN: e5m2 A/B values, float C/D values.
+  using ValTypeD = float;                         // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = float_e5m2_t;                  // value type of B
+  using ValTypeC = float;                         // value type of C, same as D
+  // Fragment types: only B is declared (GMMA::smem_desc, Major::K); this struct declares no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K) = (64, 232, 64), a size-128 thread-id layout, and the per-operand layouts.
+  using Shape_MNK = Shape<_64,_232,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<232, 64>;
+  using CLayout = GMMA::CLayout_64x232;
+  // Non-static member defaulting to ScaleOut::One (presumably selects accumulate vs. overwrite -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op GMMA_64x240x64_F16E5M2E5M2_SS_TN: e5m2 A/B values, half_t C/D values.
+  using ValTypeD = half_t;                        // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = float_e5m2_t;                  // value type of B
+  using ValTypeC = half_t;                        // value type of C, same as D
+  // Fragment types: A and B are both GMMA::smem_desc with Major::K (presumably shared-memory descriptors -- confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K) = (64, 240, 64), a size-128 thread-id layout, and the per-operand layouts.
+  using Shape_MNK = Shape<_64,_240,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<240, 64>;
+  using CLayout = GMMA::CLayout_64x240;
+  // Non-static member defaulting to ScaleOut::One (presumably selects accumulate vs. overwrite -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op GMMA_64x240x64_F16E5M2E5M2_RS_TN: e5m2 A/B values, half_t C/D values.
+  using ValTypeD = half_t;                        // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = float_e5m2_t;                  // value type of B
+  using ValTypeC = half_t;                        // value type of C, same as D
+  // Fragment types: only B is declared (GMMA::smem_desc, Major::K); this struct declares no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K) = (64, 240, 64), a size-128 thread-id layout, and the per-operand layouts.
+  using Shape_MNK = Shape<_64,_240,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<240, 64>;
+  using CLayout = GMMA::CLayout_64x240;
+  // Non-static member defaulting to ScaleOut::One (presumably selects accumulate vs. overwrite -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op GMMA_64x240x64_F32E5M2E5M2_SS_TN: e5m2 A/B values, float C/D values.
+  using ValTypeD = float;                         // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = float_e5m2_t;                  // value type of B
+  using ValTypeC = float;                         // value type of C, same as D
+  // Fragment types: A and B are both GMMA::smem_desc with Major::K (presumably shared-memory descriptors -- confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K) = (64, 240, 64), a size-128 thread-id layout, and the per-operand layouts.
+  using Shape_MNK = Shape<_64,_240,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<240, 64>;
+  using CLayout = GMMA::CLayout_64x240;
+  // Non-static member defaulting to ScaleOut::One (presumably selects accumulate vs. overwrite -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op GMMA_64x240x64_F32E5M2E5M2_RS_TN: e5m2 A/B values, float C/D values.
+  using ValTypeD = float;                         // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = float_e5m2_t;                  // value type of B
+  using ValTypeC = float;                         // value type of C, same as D
+  // Fragment types: only B is declared (GMMA::smem_desc, Major::K); this struct declares no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K) = (64, 240, 64), a size-128 thread-id layout, and the per-operand layouts.
+  using Shape_MNK = Shape<_64,_240,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<240, 64>;
+  using CLayout = GMMA::CLayout_64x240;
+  // Non-static member defaulting to ScaleOut::One (presumably selects accumulate vs. overwrite -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op GMMA_64x248x64_F16E5M2E5M2_SS_TN: e5m2 A/B values, half_t C/D values.
+  using ValTypeD = half_t;                        // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = float_e5m2_t;                  // value type of B
+  using ValTypeC = half_t;                        // value type of C, same as D
+  // Fragment types: A and B are both GMMA::smem_desc with Major::K (presumably shared-memory descriptors -- confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K) = (64, 248, 64), a size-128 thread-id layout, and the per-operand layouts.
+  using Shape_MNK = Shape<_64,_248,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<248, 64>;
+  using CLayout = GMMA::CLayout_64x248;
+  // Non-static member defaulting to ScaleOut::One (presumably selects accumulate vs. overwrite -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op GMMA_64x248x64_F16E5M2E5M2_RS_TN: e5m2 A/B values, half_t C/D values.
+  using ValTypeD = half_t;                        // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = float_e5m2_t;                  // value type of B
+  using ValTypeC = half_t;                        // value type of C, same as D
+  // Fragment types: only B is declared (GMMA::smem_desc, Major::K); this struct declares no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K) = (64, 248, 64), a size-128 thread-id layout, and the per-operand layouts.
+  using Shape_MNK = Shape<_64,_248,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<248, 64>;
+  using CLayout = GMMA::CLayout_64x248;
+  // Non-static member defaulting to ScaleOut::One (presumably selects accumulate vs. overwrite -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op GMMA_64x248x64_F32E5M2E5M2_SS_TN: e5m2 A/B values, float C/D values.
+  using ValTypeD = float;                         // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = float_e5m2_t;                  // value type of B
+  using ValTypeC = float;                         // value type of C, same as D
+  // Fragment types: A and B are both GMMA::smem_desc with Major::K (presumably shared-memory descriptors -- confirm).
+  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K) = (64, 248, 64), a size-128 thread-id layout, and the per-operand layouts.
+  using Shape_MNK = Shape<_64,_248,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ABLayout< 64, 64>;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<248, 64>;
+  using CLayout = GMMA::CLayout_64x248;
+  // Non-static member defaulting to ScaleOut::One (presumably selects accumulate vs. overwrite -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
+struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
+{ // Traits for the sparse GMMA op GMMA_64x248x64_F32E5M2E5M2_RS_TN: e5m2 A/B values, float C/D values.
+  using ValTypeD = float;                         // value type of D
+  using ValTypeA = sparse_elem<2, float_e5m2_t>;  // e5m2 values wrapped in sparse_elem<2, ...>
+  using ValTypeE = sparse_elem<8, uint8_t>;       // uint8_t wrapped in sparse_elem<8, ...>; presumably sparsity metadata -- TODO confirm
+  using ValTypeB = float_e5m2_t;                  // value type of B
+  using ValTypeC = float;                         // value type of C, same as D
+  // Fragment types: only B is declared (GMMA::smem_desc, Major::K); this struct declares no FrgTypeA.
+  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
+  // Tile shape (M, N, K) = (64, 248, 64), a size-128 thread-id layout, and the per-operand layouts.
+  using Shape_MNK = Shape<_64,_248,_64>;
+  using ThrID   = Layout<_128>;
+  using ALayout = GMMA::ALayout_64x64;
+  using ELayout = GMMA::ELayout_64x64;
+  using BLayout = GMMA::ABLayout<248, 64>;
+  using CLayout = GMMA::CLayout_64x248;
+  // Non-static member defaulting to ScaleOut::One (presumably selects accumulate vs. overwrite -- confirm).
+  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/config.hpp b/lightllm-kernel/cutlass/include/cute/config.hpp
new file mode 100755
index 000000000..b5cfcf47d
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/config.hpp
@@ -0,0 +1,149 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+// Function qualifiers: force-inlined __host__/__device__ forms when __CUDACC__ or _NVHPC_CUDA is defined, plain inline otherwise.
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+#  define CUTE_HOST_DEVICE __forceinline__ __host__ __device__
+#  define CUTE_DEVICE      __forceinline__          __device__
+#  define CUTE_HOST        __forceinline__ __host__
+#else
+#  define CUTE_HOST_DEVICE inline
+#  define CUTE_DEVICE      inline
+#  define CUTE_HOST        inline
+#endif // CUTE_HOST_DEVICE, CUTE_DEVICE
+// CUTE_HOST_RTC: same as CUTE_HOST_DEVICE when __CUDACC_RTC__ is defined, same as CUTE_HOST otherwise.
+#if defined(__CUDACC_RTC__)
+#  define CUTE_HOST_RTC CUTE_HOST_DEVICE
+#else
+#  define CUTE_HOST_RTC CUTE_HOST
+#endif
+// Loop-unrolling hints; both macros expand to nothing when neither of the first two branches applies.
+#if !defined(__CUDACC_RTC__) && !defined(__clang__) && \
+  (defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA))
+#  define CUTE_UNROLL    #pragma unroll
+#  define CUTE_NO_UNROLL #pragma unroll 1
+#elif defined(__CUDACC_RTC__) || defined(__clang__)
+#  define CUTE_UNROLL    _Pragma("unroll")
+#  define CUTE_NO_UNROLL _Pragma("unroll 1")
+#else
+#  define CUTE_UNROLL
+#  define CUTE_NO_UNROLL
+#endif // CUTE_UNROLL
+// Specifier for inline constants: static const __device__ when __CUDA_ARCH__ or _NVHPC_CUDA is defined, static constexpr otherwise.
+#if defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA)
+#  define CUTE_INLINE_CONSTANT                 static const __device__
+#else
+#  define CUTE_INLINE_CONSTANT                 static constexpr
+#endif
+
+// __grid_constant__ was introduced in CUDA 11.7.
+#if ((__CUDACC_VER_MAJOR__ >= 12) || ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 7)))
+#  define CUTE_GRID_CONSTANT_SUPPORTED
+#endif
+
+// __grid_constant__ can be enabled only on SM70+.
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700))
+#  define CUTE_GRID_CONSTANT_ENABLED
+#endif
+
+#if ! defined(CUTE_GRID_CONSTANT)
+#  if defined(CUTE_GRID_CONSTANT_SUPPORTED) && defined(CUTE_GRID_CONSTANT_ENABLED)
+#    define CUTE_GRID_CONSTANT __grid_constant__
+#  else
+#    define CUTE_GRID_CONSTANT
+#  endif
+#endif
+
+// Some versions of GCC < 11 have trouble deducing that a
+// function with "auto" return type and all of its returns in an "if
+// constexpr ... else" statement must actually return.  Thus, GCC
+// emits spurious "missing return statement" build warnings.
+// Developers can suppress these warnings by using the
+// CUTE_GCC_UNREACHABLE macro, which must be followed by a semicolon.
+// It's harmless to use the macro for other GCC versions or other
+// compilers, but it has no effect.
+#if ! defined(CUTE_GCC_UNREACHABLE)
+#  if defined(__GNUC__)
+#    define CUTE_GCC_UNREACHABLE __builtin_unreachable()
+#  else
+#    define CUTE_GCC_UNREACHABLE
+#  endif
+#endif
+
+#if defined(_MSC_VER)
+// Provides support for alternative operators 'and', 'or', and 'not'
+#  include <iso646.h>
+#endif // _MSC_VER
+// Standard-library namespace selected for CuTe: cuda::std when __CUDACC_RTC__ is defined, std otherwise.
+#if defined(__CUDACC_RTC__)
+#  define CUTE_STL_NAMESPACE cuda::std
+#  define CUTE_STL_NAMESPACE_IS_CUDA_STD
+#else
+#  define CUTE_STL_NAMESPACE std
+#endif
+
+//
+// Assertion helpers
+//
+
+#if defined(__CUDACC_RTC__)
+#  include <cuda/std/cassert>
+#else
+#  include <cassert>
+#endif
+// CUTE_STATIC_V(x): the static member ::value of the type of expression x.
+#define CUTE_STATIC_V(x)            decltype(x)::value
+// static_assert wrappers; the _V form asserts on decltype(x)::value and forwards an optional message.
+#define CUTE_STATIC_ASSERT          static_assert
+#define CUTE_STATIC_ASSERT_V(x,...) static_assert(decltype(x)::value, ##__VA_ARGS__)
+
+// Fail and print a message. Typically used for notification of a compiler misconfiguration.
+#if defined(__CUDA_ARCH__)
+#  define CUTE_INVALID_CONTROL_PATH(x) assert(0 && x); printf(x); __brkpt()
+#else
+#  define CUTE_INVALID_CONTROL_PATH(x) assert(0 && x); printf(x)
+#endif
+
+//
+// IO
+//
+
+#if !defined(__CUDACC_RTC__)
+#  include <cstdio>
+#  include <iostream>
+#  include <iomanip>
+#endif
+
+//
+// Debugging utilities
+//
+
+#include <cute/util/debug.hpp>
diff --git a/lightllm-kernel/cutlass/include/cute/container/alignment.hpp b/lightllm-kernel/cutlass/include/cute/container/alignment.hpp
new file mode 100755
index 000000000..52e4cbadd
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/container/alignment.hpp
@@ -0,0 +1,70 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>
+
+#include <cute/numeric/numeric_types.hpp>
+#include <cute/numeric/math.hpp>
+
+namespace cute
+{
+
+// Returns true when the address held by ptr has none of its low (N-1) bits set, i.e. is a multiple of N (N a power of 2).
+template <int N>
+CUTE_HOST_DEVICE constexpr
+bool
+is_byte_aligned(void const* const ptr)
+{
+  static_assert(has_single_bit(N), "N must be a power of 2 in alignment check");
+  return !(reinterpret_cast<uintptr_t>(ptr) & (N-1));
+}
+
+#if defined(__CUDACC__)  // alignment specifier: __align__ when __CUDACC__ is defined, standard alignas otherwise
+#  define CUTE_ALIGNAS(n) __align__(n)
+#else
+#  define CUTE_ALIGNAS(n) alignas(n)
+#endif
+// Empty struct template: the specializations for 1, 2, 4, ..., 256 carry CUTE_ALIGNAS(Alignment); the primary template carries none.
+template <size_t Alignment, class Child = void>
+struct aligned_struct {};
+
+template <class Child> struct CUTE_ALIGNAS(  1) aligned_struct<  1, Child> {};
+template <class Child> struct CUTE_ALIGNAS(  2) aligned_struct<  2, Child> {};
+template <class Child> struct CUTE_ALIGNAS(  4) aligned_struct<  4, Child> {};
+template <class Child> struct CUTE_ALIGNAS(  8) aligned_struct<  8, Child> {};
+template <class Child> struct CUTE_ALIGNAS( 16) aligned_struct< 16, Child> {};
+template <class Child> struct CUTE_ALIGNAS( 32) aligned_struct< 32, Child> {};
+template <class Child> struct CUTE_ALIGNAS( 64) aligned_struct< 64, Child> {};
+template <class Child> struct CUTE_ALIGNAS(128) aligned_struct<128, Child> {};
+template <class Child> struct CUTE_ALIGNAS(256) aligned_struct<256, Child> {};
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/container/array.hpp b/lightllm-kernel/cutlass/include/cute/container/array.hpp
new file mode 100755
index 000000000..9cdcf5f4c
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/container/array.hpp
@@ -0,0 +1,492 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>
+
+#include <cute/numeric/integral_constant.hpp>
+#include <cute/util/type_traits.hpp>
+
+namespace cute
+{
+
+template <class T, size_t N>
+struct array
+{
+  using element_type = T;
+  using value_type = remove_cv_t<T>;
+  using size_type = size_t;
+  using difference_type = ptrdiff_t;
+  using reference = element_type&;
+  using const_reference = const element_type&;
+  using pointer = element_type*;
+  using const_pointer = const element_type*;
+  using iterator = pointer;
+  using const_iterator = const_pointer;
+
+  CUTE_HOST_DEVICE constexpr
+  reference operator[](size_type pos)
+  {
+    return __elems_[pos];   // no bounds check
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  const_reference operator[](size_type pos) const
+  {
+    return __elems_[pos];   // no bounds check
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  reference front()
+  {
+    return __elems_[0];
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  const_reference front() const
+  {
+    return __elems_[0];
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  reference back()
+  {
+    // Last stored element (index N-1).
+    return __elems_[N-1];
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  const_reference back() const
+  {
+    // Last stored element (index N-1).
+    return __elems_[N-1];
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  T* data()
+  {
+    return __elems_;        // pointer to the first element of the member array
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  T const* data() const
+  {
+    return __elems_;        // pointer to the first element of the member array
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  iterator begin()
+  {
+    return __elems_;
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  const_iterator begin() const
+  {
+    return __elems_;
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  const_iterator cbegin()
+  {
+    return __elems_;
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  const_iterator cbegin() const
+  {
+    return __elems_;
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  iterator end()
+  {
+    return __elems_ + N;    // one past the last element
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  const_iterator end() const
+  {
+    return __elems_ + N;    // one past the last element
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  const_iterator cend()
+  {
+    return __elems_ + N;
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  const_iterator cend() const
+  {
+    return __elems_ + N;
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  bool empty() const
+  {
+    return N == 0;
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  size_type size() const
+  {
+    return N;
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  size_type max_size() const
+  {
+    return N;               // fixed capacity: identical to size()
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  void fill(const T& value)
+  {
+    for (size_type i = 0; i != N; ++i) {
+      __elems_[i] = value;  // assigned front to back
+    }
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  void clear()
+  {
+    this->fill(T(0));       // "clearing" means assigning T(0) to every element
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  void swap(array& other)
+  {
+    using CUTE_STL_NAMESPACE::swap;   // keep ADL available for element types with their own swap
+    for (size_type i = 0; i != N; ++i) {
+      swap(__elems_[i], other.__elems_[i]);
+    }
+  }
+
+  element_type __elems_[N];
+};
+
+
+template <class T>
+struct array<T, 0>
+{
+  using element_type = T;
+  using value_type = remove_cv_t<T>;
+  using size_type = size_t;
+  using difference_type = ptrdiff_t;
+  using reference = element_type&;
+  using const_reference = const element_type&;
+  using pointer = element_type*;
+  using const_pointer = const element_type*;
+  using const_iterator = const_pointer;
+  using iterator = pointer;
+
+  CUTE_HOST_DEVICE constexpr
+  reference operator[](size_type pos)
+  {
+    return data()[pos];     // data() is nullptr: there is no element to refer to
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  const_reference operator[](size_type pos) const
+  {
+    return data()[pos];     // data() is nullptr: there is no element to refer to
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  reference front()
+  {
+    return *data();
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  const_reference front() const
+  {
+    return *data();
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  reference back()
+  {
+    return *data();
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  const_reference back() const
+  {
+    return *data();
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  T* data()
+  {
+    return nullptr;         // no storage is held by the empty array
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  T const* data() const
+  {
+    return nullptr;         // no storage is held by the empty array
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  iterator begin()
+  {
+    return data();
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  const_iterator begin() const
+  {
+    return data();
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  const_iterator cbegin()
+  {
+    return data();
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  const_iterator cbegin() const
+  {
+    return data();
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  iterator end()
+  {
+    return data();          // begin() == end(): the range is empty
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  const_iterator end() const
+  {
+    return data();          // begin() == end(): the range is empty
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  const_iterator cend()
+  {
+    return data();
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  const_iterator cend() const
+  {
+    return data();
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  bool empty() const
+  {
+    return true;
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  size_type size() const
+  {
+    return 0;
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  size_type max_size() const
+  {
+    return size();
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  void fill(const T& /*value*/)
+  {}
+
+  CUTE_HOST_DEVICE constexpr
+  void clear()
+  {}
+
+  CUTE_HOST_DEVICE constexpr
+  void swap(array& /*other*/)
+  {}
+};
+
+template <class T, size_t N>
+CUTE_HOST_DEVICE constexpr
+bool operator==(array<T,N> const& lhs, array<T,N> const& rhs)
+{
+  // Walk both arrays in lockstep and stop at the first pair for which != holds.
+  size_t idx = 0;
+  while (idx < N && !(lhs[idx] != rhs[idx])) {
+    ++idx;
+  }
+  return idx == N;   // reaching the end means no pair differed
+}
+// Free-function form of array::fill with T(0): assigns T(0) to every element of a.
+template <class T, size_t N>
+CUTE_HOST_DEVICE constexpr
+void clear(array<T,N>& a)
+{
+  a.fill(T(0));
+}
+// Free-function form of array::fill: assigns value to every element of a.
+template <class T, size_t N>
+CUTE_HOST_DEVICE constexpr
+void fill(array<T,N>& a, T const& value)
+{
+  a.fill(value);
+}
+// Free-function form of array::swap: exchanges the contents of a and b.
+template <class T, size_t N>
+CUTE_HOST_DEVICE constexpr
+void swap(array<T,N>& a, array<T,N>& b)
+{
+  a.swap(b);
+}
+
+/// @return A copy of @c t with its elements in the opposite order (t[N-1] first, t[0] last).
+template <class T, size_t N>
+CUTE_HOST_DEVICE constexpr
+cute::array<T,N> reverse(cute::array<T,N> const& t)
+{
+  if constexpr (N != 0u) {
+    cute::array<T,N> flipped{};
+    for (size_t src = N; src > 0; --src) {
+      flipped[N - src] = t[src - 1];   // destination index runs 0..N-1 while the source runs N-1..0
+    }
+    return flipped;
+  } else {
+    return t;                          // nothing to reorder in an empty array
+  }
+}
+
+} // end cute
+
+
+//
+// Specialize tuple-related functionality for cute::array
+//
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/tuple>
+#else
+#include <tuple>
+#endif
+
+namespace cute
+{
+// get<I>(array): element I of a cute::array, with I checked against N at compile time.
+template <size_t I, class T, size_t N>
+CUTE_HOST_DEVICE constexpr
+T& get(array<T,N>& a)
+{
+  static_assert(I < N, "Index out of range");
+  return *(a.begin() + I);
+}
+
+template <size_t I, class T, size_t N>
+CUTE_HOST_DEVICE constexpr
+T const& get(array<T,N> const& a)
+{
+  static_assert(I < N, "Index out of range");
+  return *(a.begin() + I);
+}
+
+template <size_t I, class T, size_t N>
+CUTE_HOST_DEVICE constexpr
+T&& get(array<T,N>&& a)
+{
+  static_assert(I < N, "Index out of range");
+  return static_cast<T&&>(a[I]);   // hand the element back as an rvalue
+}
+
+} // end namespace cute
+
+namespace CUTE_STL_NAMESPACE
+{
+// tuple_size / tuple_element for cute::array<T,N> and its const form: size N, element type T in both cases.
+template <class T, size_t N>
+struct tuple_size<cute::array<T,N>>
+    : CUTE_STL_NAMESPACE::integral_constant<size_t, N>
+{};
+
+template <size_t I, class T, size_t N>
+struct tuple_element<I, cute::array<T,N>>
+{
+  using type = T;
+};
+
+template <class T, size_t N>
+struct tuple_size<cute::array<T,N> const>
+    : CUTE_STL_NAMESPACE::integral_constant<size_t, N>
+{};
+
+template <size_t I, class T, size_t N>
+struct tuple_element<I, cute::array<T,N> const>
+{
+  using type = T;
+};
+
+} // end namespace CUTE_STL_NAMESPACE
+
+#ifdef CUTE_STL_NAMESPACE_IS_CUDA_STD
+namespace std
+{
+// The same four specializations again, in namespace std; compiled only when CUTE_STL_NAMESPACE_IS_CUDA_STD is defined.
+#if defined(__CUDACC_RTC__)
+template <class... _Tp>
+struct tuple_size;
+
+template <size_t _Ip, class... _Tp>
+struct tuple_element;
+#endif
+
+template <class T, size_t N>
+struct tuple_size<cute::array<T,N>>
+    : CUTE_STL_NAMESPACE::integral_constant<size_t, N>
+{};
+
+template <size_t I, class T, size_t N>
+struct tuple_element<I, cute::array<T,N>>
+{
+  using type = T;
+};
+
+template <class T, size_t N>
+struct tuple_size<cute::array<T,N> const>
+    : CUTE_STL_NAMESPACE::integral_constant<size_t, N>
+{};
+
+template <size_t I, class T, size_t N>
+struct tuple_element<I, cute::array<T,N> const>
+{
+  using type = T;
+};
+
+} // end namespace std
+#endif // CUTE_STL_NAMESPACE_IS_CUDA_STD
diff --git a/lightllm-kernel/cutlass/include/cute/container/array_aligned.hpp b/lightllm-kernel/cutlass/include/cute/container/array_aligned.hpp
new file mode 100755
index 000000000..a9d14a1a2
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/container/array_aligned.hpp
@@ -0,0 +1,42 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/container/alignment.hpp>  // CUTE_ALIGNAS
+#include <cute/container/array.hpp>      // cute::array
+
+namespace cute
+{
+// cute::array<T,N> with a CUTE_ALIGNAS(Alignment) attribute; Alignment defaults to 16. Adds no members of its own.
+template <class T, size_t N, size_t Alignment = 16>
+struct CUTE_ALIGNAS(Alignment) array_aligned : cute::array<T,N> {};
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/container/array_subbyte.hpp b/lightllm-kernel/cutlass/include/cute/container/array_subbyte.hpp
new file mode 100755
index 000000000..57db56aba
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/container/array_subbyte.hpp
@@ -0,0 +1,643 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Statically sized array of elements that accommodates subbyte trivial types
+           in a packed storage.
+*/
+
+#pragma once
+
+#include <cute/config.hpp>
+
+#include <cute/numeric/numeric_types.hpp>
+#include <cute/numeric/integral_constant.hpp>
+
+namespace cute
+{
+// Underlying subbyte storage type:
+//   uint8_t if sizeof_bits_v<T> <= 8, else uint16_t if <= 16, else uint32_t if <= 32,
+//   else uint64_t if <= 64, else uint128_t if <= 128, otherwise T itself.
+template <class T>
+using subbyte_storage_type_t = conditional_t<(cute::sizeof_bits_v<T> <=   8), uint8_t,
+                               conditional_t<(cute::sizeof_bits_v<T> <=  16), uint16_t,
+                               conditional_t<(cute::sizeof_bits_v<T> <=  32), uint32_t,
+                               conditional_t<(cute::sizeof_bits_v<T> <=  64), uint64_t,
+                               conditional_t<(cute::sizeof_bits_v<T> <= 128), uint128_t,
+                               T>>>>>;
+
+template <class T> struct subbyte_iterator;
+template <class, class> struct swizzle_ptr;
+
+//
+// subbyte_reference
+//   Proxy object for sub-byte element references
+//
+template <class T>
+struct subbyte_reference
+{
+  // Iterator Element type (const or non-const)
+  using element_type = T;
+  // Iterator Value type without type qualifier.
+  using value_type   = remove_cv_t<T>;
+  // Storage type (const or non-const)
+  using storage_type = conditional_t<(is_const_v<T>), subbyte_storage_type_t<T> const, subbyte_storage_type_t<T>>;
+
+  static_assert(sizeof_bits_v<storage_type> % 8 == 0, "Storage type is not supported");
+
+  static_assert(sizeof_bits_v<element_type> <= sizeof_bits_v<storage_type>,
+                "Size of Element must not be greater than Storage.");
+
+private:
+
+  // Bitmask for covering one item
+  static constexpr storage_type BitMask = storage_type(storage_type(-1) >> (sizeof_bits_v<storage_type> - sizeof_bits_v<element_type>));
+  // Flag for fast branching on straddled elements
+  static constexpr bool is_storage_unaligned = ((sizeof_bits_v<storage_type> % sizeof_bits_v<element_type>) != 0);
+
+  friend struct subbyte_iterator<T>;
+
+  // Pointer to storage element
+  storage_type* ptr_ = nullptr;
+
+  // Bit index of value_type starting position within storage_type element.
+  // RI: 0 <= idx_ < sizeof_bit<storage_type>
+  uint8_t idx_ = 0;
+
+  // Ctor
+  template <class PointerType>
+  CUTE_HOST_DEVICE constexpr
+  subbyte_reference(PointerType* ptr, uint8_t idx = 0) : ptr_(reinterpret_cast<storage_type*>(ptr)), idx_(idx) {}
+
+public:
+
+  // Copy Ctor: the new proxy refers to the same storage position as other (nothing is written through it).
+  CUTE_HOST_DEVICE constexpr
+  subbyte_reference(subbyte_reference const& other)
+    : ptr_(other.ptr_), idx_(other.idx_)
+  {}
+
+  // Copy Assignment: stores the value referenced by other into the element this proxy refers to.
+  CUTE_HOST_DEVICE constexpr
+  subbyte_reference& operator=(subbyte_reference const& other) {
+    return *this = element_type(other);
+  }
+
+  // Assignment
+  template <class T_ = element_type>
+  CUTE_HOST_DEVICE constexpr
+  enable_if_t<!is_const_v<T_>, subbyte_reference&> operator=(element_type x)
+  {
+    static_assert(is_same_v<T_, element_type>, "Do not specify template arguments!");
+    storage_type item = (reinterpret_cast<storage_type const&>(x) & BitMask);
+
+    // Update the current storage element
+    storage_type bit_mask_0 = storage_type(BitMask << idx_);
+    ptr_[0] = storage_type((ptr_[0] & ~bit_mask_0) | (item << idx_));
+
+    // If value_type is unaligned with storage_type (static) and this is a straddled value (dynamic)
+    if (is_storage_unaligned && idx_ + sizeof_bits_v<value_type> > sizeof_bits_v<storage_type>) {
+      uint8_t straddle_bits = uint8_t(sizeof_bits_v<storage_type> - idx_);
+      storage_type bit_mask_1 = storage_type(BitMask >> straddle_bits);
+      // Update the next storage element
+      ptr_[1] = storage_type((ptr_[1] & ~bit_mask_1) | (item >> straddle_bits));
+    }
+
+    return *this;
+  }
+
+  // Comparison of referenced values
+  CUTE_HOST_DEVICE constexpr friend
+  bool operator==(subbyte_reference const& x, subbyte_reference const& y) { return x.get() == y.get(); }
+  CUTE_HOST_DEVICE constexpr friend
+  bool operator!=(subbyte_reference const& x, subbyte_reference const& y) { return x.get() != y.get(); }
+  CUTE_HOST_DEVICE constexpr friend
+  bool operator< (subbyte_reference const& x, subbyte_reference const& y) { return x.get() <  y.get(); }
+  CUTE_HOST_DEVICE constexpr friend
+  bool operator> (subbyte_reference const& x, subbyte_reference const& y) { return x.get() >  y.get(); }
+  CUTE_HOST_DEVICE constexpr friend
+  bool operator<=(subbyte_reference const& x, subbyte_reference const& y) { return x.get() <= y.get(); }
+  CUTE_HOST_DEVICE constexpr friend
+  bool operator>=(subbyte_reference const& x, subbyte_reference const& y) { return x.get() >= y.get(); }
+
+  // Value
+  CUTE_HOST_DEVICE
+  element_type get() const
+  {
+    if constexpr (is_same_v<bool, value_type>) {      // Extract to bool -- potentially faster impl
+      return bool((*ptr_) & (BitMask << idx_));
+    } else {                                          // Extract to element_type
+      // Extract from the current storage element
+      auto item = storage_type((ptr_[0] >> idx_) & BitMask);
+
+      // If value_type is unaligned with storage_type (static) and this is a straddled value (dynamic)
+      if (is_storage_unaligned && idx_ + sizeof_bits_v<value_type> > sizeof_bits_v<storage_type>) {
+        uint8_t straddle_bits = uint8_t(sizeof_bits_v<storage_type> - idx_);
+        storage_type bit_mask_1 = storage_type(BitMask >> straddle_bits);
+        // Extract from the next storage element
+        item |= storage_type((ptr_[1] & bit_mask_1) << straddle_bits);
+      }
+
+      return reinterpret_cast<element_type&>(item);
+    }
+  }
+
+  // Extract to type element_type
+  CUTE_HOST_DEVICE constexpr
+  operator element_type() const {
+    return get();
+  }
+
+  // Address
+  CUTE_HOST_DEVICE
+  subbyte_iterator<T> operator&() const {
+    return {ptr_, idx_};
+  }
+};
+// Prints the value currently referenced by ref by forwarding ref.get() to cute::print.
+template <class T>
+CUTE_HOST_DEVICE
+void
+print(subbyte_reference<T> ref) {
+  cute::print(ref.get());
+}
+// Same as print above, but forwards ref.get() to cute::pretty_print.
+template <class T>
+CUTE_HOST_DEVICE
+void
+pretty_print(subbyte_reference<T> ref) {
+  cute::pretty_print(ref.get());
+}
+
+//
+// subbyte_iterator
+//   Iterator over packed sub-byte elements; dereferencing yields a subbyte_reference proxy
+//
+template <class T>
+struct subbyte_iterator
+{
+  // Element type as seen through the iterator (keeps const)
+  using element_type = T;
+  // Element type with cv-qualifiers stripped
+  using value_type   = remove_cv_t<T>;
+  // Integer word the elements are packed into (const when T is const)
+  using storage_type = conditional_t<(is_const_v<T>), subbyte_storage_type_t<T> const, subbyte_storage_type_t<T>>;
+  // Proxy returned by operator* and operator[]
+  using reference = subbyte_reference<element_type>;
+
+  static_assert(sizeof_bits_v<storage_type> % 8 == 0, "Storage type is not supported");
+
+  static_assert(sizeof_bits_v<element_type> <= sizeof_bits_v<storage_type>,
+                "Size of Element must not be greater than Storage.");
+
+private:
+
+  template <class, class> friend struct swizzle_ptr;
+  template <class U> friend CUTE_HOST_DEVICE constexpr U* raw_pointer_cast(subbyte_iterator<U> const&);
+  template <class N, class U> friend CUTE_HOST_DEVICE constexpr auto recast_ptr(subbyte_iterator<U> const&);
+  template <class U> friend CUTE_HOST_DEVICE void print(subbyte_iterator<U> const&);
+
+  // Storage word that holds the first bit of the current element
+  storage_type* ptr_;
+
+  // Bit position of the current element inside *ptr_.
+  // RI: 0 <= idx_ < sizeof_bit<storage_type>
+  uint8_t idx_;
+
+public:
+
+  // Default Ctor: null storage pointer, bit index 0
+  CUTE_HOST_DEVICE constexpr
+  subbyte_iterator() : ptr_(nullptr), idx_(0) {}
+
+  // Ctor from any pointer type (reinterpreted as storage_type*) plus an optional starting bit index
+  template <class PointerType>
+  CUTE_HOST_DEVICE constexpr
+  subbyte_iterator(PointerType* ptr, uint8_t idx = 0) : ptr_(reinterpret_cast<storage_type*>(ptr)), idx_(idx) {}
+
+  CUTE_HOST_DEVICE constexpr
+  reference operator*() const {
+    return reference(ptr_, idx_);   // proxy bound to the current word and bit index
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  subbyte_iterator& operator+=(uint64_t k) {
+    k = idx_ + sizeof_bits_v<value_type> * k;    // k now counts bits measured from bit 0 of *ptr_
+    idx_  = k % sizeof_bits_v<storage_type>;
+    ptr_ += k / sizeof_bits_v<storage_type>;
+    return *this;
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  subbyte_iterator operator+(uint64_t k) const {
+    return subbyte_iterator(*this) += k;
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  reference operator[](uint64_t k) const {
+    return *(operator+(k));
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  subbyte_iterator& operator++() {
+    idx_ += sizeof_bits_v<value_type>;
+    if (idx_ >= sizeof_bits_v<storage_type>) {   // stepped past the current word: carry into the next one
+      ++ptr_;
+      idx_ -= sizeof_bits_v<storage_type>;
+    }
+    return *this;
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  subbyte_iterator operator++(int) {
+    subbyte_iterator previous = *this;
+    operator++();
+    return previous;
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  subbyte_iterator& operator--() {
+    if (idx_ < sizeof_bits_v<value_type>) {      // not enough bits left in this word: borrow from the previous one
+      --ptr_;
+      idx_ += sizeof_bits_v<storage_type> - sizeof_bits_v<value_type>;
+    } else {
+      idx_ -= sizeof_bits_v<value_type>;
+    }
+    return *this;
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  subbyte_iterator operator--(int) {
+    subbyte_iterator previous = *this;
+    operator--();
+    return previous;
+  }
+
+  CUTE_HOST_DEVICE constexpr friend
+  bool operator==(subbyte_iterator const& x, subbyte_iterator const& y) {
+    return x.idx_ == y.idx_ && x.ptr_ == y.ptr_;
+  }
+  CUTE_HOST_DEVICE constexpr friend
+  bool operator!=(subbyte_iterator const& x, subbyte_iterator const& y) { return x.ptr_ != y.ptr_ || x.idx_ != y.idx_; }
+  CUTE_HOST_DEVICE constexpr friend
+  bool operator< (subbyte_iterator const& x, subbyte_iterator const& y) {
+    return (x.ptr_ != y.ptr_) ? (x.ptr_ < y.ptr_) : (x.idx_ < y.idx_);
+  }
+  CUTE_HOST_DEVICE constexpr friend
+  bool operator<=(subbyte_iterator const& x, subbyte_iterator const& y) { return !(y <  x); }
+  CUTE_HOST_DEVICE constexpr friend
+  bool operator> (subbyte_iterator const& x, subbyte_iterator const& y) { return  (y <  x); }
+  CUTE_HOST_DEVICE constexpr friend
+  bool operator>=(subbyte_iterator const& x, subbyte_iterator const& y) { return !(x <  y); }
+};
+
+// Convert to a raw T* at the iterator's storage address; the subbyte bit index is dropped.
+template <class T>
+CUTE_HOST_DEVICE constexpr
+T*
+raw_pointer_cast(subbyte_iterator<T> const& x) {
+  assert(x.idx_ == 0);                   // a non-zero bit offset cannot be expressed by a plain pointer
+  return reinterpret_cast<T*>(x.ptr_);   // reinterpret the storage_type* as T*
+}
+
+// Reinterpret a subbyte_iterator<T> as pointing to NewT_; const-ness of T carries over to the result.
+template <class NewT_, class T>
+CUTE_HOST_DEVICE constexpr
+auto
+recast_ptr(subbyte_iterator<T> const& x) {
+  using TargetT = conditional_t<(is_const_v<T>), NewT_ const, NewT_>;
+  if constexpr (!cute::is_subbyte_v<TargetT>) {   // Byte-addressable target: plain pointer, raw_pointer_cast asserts idx_ == 0
+    return reinterpret_cast<TargetT*>(raw_pointer_cast(x));
+  } else {                                        // Subbyte target: stay a subbyte_iterator and keep the bit index
+    return subbyte_iterator<TargetT>(x.ptr_, x.idx_);
+  }
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <class T>
+CUTE_HOST_DEVICE void
+print(subbyte_iterator<T> const& x) {  // Output format: subptr[<element bits>b](<storage address>.<bit index>)
+  printf("subptr[%db](%p.%u)", int(sizeof_bits_v<T>), static_cast<void const*>(x.ptr_), static_cast<unsigned>(x.idx_));  // %p expects void*, %u expects unsigned
+}
+
+template <class T>
+CUTE_HOST_DEVICE void
+print(subbyte_reference<T> const& x) {  // Print overload for the subbyte proxy reference
+  print(x.get());                       // forwards whatever x.get() returns to the matching print overload
+}
+//
+// array_subbyte
+//   Statically sized array for non-byte-aligned data types
+//
+template <class T, size_t N>
+struct array_subbyte
+{
+  using element_type    = T;                      // element type as supplied; may be const-qualified
+  using value_type      = remove_cv_t<T>;         // element type without cv-qualifiers
+  using pointer         = element_type*;          // only used by the deleted data() members below
+  using const_pointer   = element_type const*;
+
+  using size_type       = size_t;
+  using difference_type = ptrdiff_t;
+
+  //
+  // References (proxy objects, not real C++ references)
+  //
+  using reference       = subbyte_reference<element_type>;
+  using const_reference = subbyte_reference<element_type const>;
+
+  //
+  // Iterators (address a storage word plus a bit offset)
+  //
+  using iterator        = subbyte_iterator<element_type>;
+  using const_iterator  = subbyte_iterator<element_type const>;
+
+  // Backing storage word type; const-qualified exactly when T is const.
+  using storage_type = conditional_t<(is_const_v<T>), subbyte_storage_type_t<T> const, subbyte_storage_type_t<T>>;
+
+  static_assert(sizeof_bits_v<storage_type> % 8 == 0, "Storage type is not supported");
+
+private:
+
+  // Number of storage words needed to hold N elements: ceil_div(N * element bits, storage bits)
+  static constexpr size_type StorageElements = (N * sizeof_bits_v<value_type> + sizeof_bits_v<storage_type> - 1) / sizeof_bits_v<storage_type>;
+
+  // Packed element bits, StorageElements words long
+  storage_type storage[StorageElements];
+
+public:
+
+  CUTE_HOST_DEVICE constexpr
+  size_type size() const {       // Number of logical elements (not storage words).
+    return N;
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  size_type max_size() const {   // Fixed-size container: same as size().
+    return N;
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  bool empty() const {           // True only for N == 0.
+    return !N;
+  }
+
+  // Zero the array one storage word at a time instead of one element at a time.
+  CUTE_HOST_DEVICE constexpr
+  void clear() {
+    CUTE_UNROLL
+    for (size_type i = 0; i < StorageElements; ++i) {
+      storage[i] = storage_type(0);
+    }
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  void fill(T const& value) {    // Assign value to each of the N elements through the proxy reference.
+    CUTE_UNROLL
+    for (size_type i = 0; i < N; ++i) {
+      at(i) = value;
+    }
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  reference at(size_type pos) {               // Proxy for element pos; no bounds check is performed here.
+    return iterator(storage)[pos];
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  const_reference at(size_type pos) const {   // Read-only proxy for element pos; no bounds check.
+    return const_iterator(storage)[pos];
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  reference operator[](size_type pos) {       // Same as at(pos).
+    return at(pos);
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  const_reference operator[](size_type pos) const {   // Same as at(pos) const.
+    return at(pos);
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  reference front() {                         // Proxy for the first element.
+    return at(0);
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  const_reference front() const {
+    return at(0);
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  reference back() {                          // Proxy for the last element, at(N-1).
+    return at(N-1);
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  const_reference back() const {
+    return at(N-1);
+  }
+
+  // In analogy to std::vector<bool>::data(), these functions are deleted to prevent bugs.
+  // Instead, prefer
+  //   auto* data = raw_pointer_cast(my_subbyte_array.begin());
+  // where the type of auto* is implementation-defined and
+  // with the knowledge that [data, data + my_subbyte_array.size()) may not be a valid range.
+  CUTE_HOST_DEVICE constexpr
+  pointer data() = delete;
+
+  CUTE_HOST_DEVICE constexpr
+  const_pointer data() const = delete;
+
+  CUTE_HOST_DEVICE constexpr
+  iterator begin() {                  // Iterator at the first storage word, bit offset 0.
+    return iterator(storage);
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  const_iterator begin() const {
+    return const_iterator(storage);
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  const_iterator cbegin() const {
+    return begin();
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  iterator end() {                    // begin() advanced by N elements; may point into the middle of a storage word.
+    return iterator(storage) + N;
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  const_iterator end() const {
+    return const_iterator(storage) + N;
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  const_iterator cend() const {
+    return end();
+  }
+
+  //
+  // Comparison operators (none are defined in this class)
+  //
+
+};
+
+//
+// Operators
+//
+
+template <class T, size_t N>
+CUTE_HOST_DEVICE constexpr
+void clear(array_subbyte<T,N>& a)  // Free-function form of array_subbyte::clear()
+{
+  a.clear();
+}
+
+template <class T, size_t N>
+CUTE_HOST_DEVICE constexpr
+void fill(array_subbyte<T,N>& a, T const& value)  // Free-function form of array_subbyte::fill(value)
+{
+  a.fill(value);
+}
+
+} // namespace cute
+
+//
+// Specialize tuple-related functionality for cute::array_subbyte
+//
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/tuple>
+#else
+#include <tuple>
+#endif
+
+namespace cute
+{
+
+template <size_t I, class T, size_t N>
+CUTE_HOST_DEVICE constexpr
+T& get(array_subbyte<T,N>& a)  // Tuple-style access to element I; the index is checked at compile time
+{
+  static_assert(I < N, "Index out of range");
+  return a[I];  // NOTE(review): a[I] is a subbyte_reference proxy returned by value, not a T lvalue — confirm it can bind to T&
+}
+
+template <size_t I, class T, size_t N>
+CUTE_HOST_DEVICE constexpr
+T const& get(array_subbyte<T,N> const& a)  // Tuple-style read access to element I; the index is checked at compile time
+{
+  static_assert(I < N, "Index out of range");
+  return a[I];  // NOTE(review): a[I] is a proxy returned by value; if it converts to a temporary T the returned reference may dangle — confirm
+}
+
+template <size_t I, class T, size_t N>
+CUTE_HOST_DEVICE constexpr
+T&& get(array_subbyte<T,N>&& a)  // Tuple-style access to element I of an rvalue array; the index is checked at compile time
+{
+  static_assert(I < N, "Index out of range");
+  return cute::move(a[I]);  // NOTE(review): moves the proxy returned by a[I], not a stored T — confirm the T&& result cannot dangle
+}
+
+} // end namespace cute
+
+namespace CUTE_STL_NAMESPACE  // tuple-protocol and trait specializations for the cute subbyte types
+{
+
+template <class T>
+struct is_reference<cute::subbyte_reference<T>>  // report the proxy type as a reference
+    : CUTE_STL_NAMESPACE::true_type              // NOTE(review): the C++ standard does not sanction specializing standard type traits — confirm intent
+{};
+
+
+template <class T, size_t N>
+struct tuple_size<cute::array_subbyte<T,N>>      // an array_subbyte<T,N> acts as an N-element tuple
+    : CUTE_STL_NAMESPACE::integral_constant<size_t, N>
+{};
+
+template <size_t I, class T, size_t N>
+struct tuple_element<I, cute::array_subbyte<T,N>>   // every element has type T, whatever the index I
+{
+  using type = T;
+};
+
+template <class T, size_t N>
+struct tuple_size<const cute::array_subbyte<T,N>>   // same size for the const-qualified array
+    : CUTE_STL_NAMESPACE::integral_constant<size_t, N>
+{};
+
+template <size_t I, class T, size_t N>
+struct tuple_element<I, const cute::array_subbyte<T,N>>   // element type is reported as T (no const added)
+{
+  using type = T;
+};
+
+} // end namespace CUTE_STL_NAMESPACE
+
+#ifdef CUTE_STL_NAMESPACE_IS_CUDA_STD
+namespace std  // same traits again in std, compiled only when CUTE_STL_NAMESPACE_IS_CUDA_STD is defined
+{
+
+#if defined(__CUDACC_RTC__)  // under NVRTC, declare the primary templates here so they can be specialized below
+template <class... _Tp>
+struct tuple_size;
+
+template <size_t _Ip, class... _Tp>
+struct tuple_element;
+#endif
+
+template <class T, size_t N>
+struct tuple_size<cute::array_subbyte<T,N>>      // an array_subbyte<T,N> acts as an N-element tuple
+    : CUTE_STL_NAMESPACE::integral_constant<size_t, N>
+{};
+
+template <size_t I, class T, size_t N>
+struct tuple_element<I, cute::array_subbyte<T,N>>   // every element has type T, whatever the index I
+{
+  using type = T;
+};
+
+template <class T, size_t N>
+struct tuple_size<const cute::array_subbyte<T,N>>   // same size for the const-qualified array
+    : CUTE_STL_NAMESPACE::integral_constant<size_t, N>
+{};
+
+template <size_t I, class T, size_t N>
+struct tuple_element<I, const cute::array_subbyte<T,N>>   // element type is reported as T (no const added)
+{
+  using type = T;
+};
+
+} // end namespace std
+#endif // CUTE_STL_NAMESPACE_IS_CUDA_STD
diff --git a/lightllm-kernel/cutlass/include/cute/container/bit_field.hpp b/lightllm-kernel/cutlass/include/cute/container/bit_field.hpp
new file mode 100755
index 000000000..d7fac42a5
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/container/bit_field.hpp
@@ -0,0 +1,133 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Portable bit field that supports byte and word straddling that can
+           be used in unions to bit-wise define parameters.
+*/
+
+#pragma once
+
+#include <cute/config.hpp>                  // CUTE_HOST_DEVICE
+#include <cute/numeric/numeric_types.hpp>   // uint_bit_t
+#include <cute/util/type_traits.hpp>        // cute::is_same
+
+namespace cute
+{
+
+class dummy_type {};  // Default for bit_field's OtherValueType; bit_field skips its size check when this type is used
+
+template <uint32_t BitStart, uint32_t NumBits, class OtherValueType = dummy_type>
+struct bit_field
+{
+  static_assert(0 < NumBits && NumBits <= 64, "bit_field requires 0 < NumBits <= 64.");
+
+  // value_type: the smallest unsigned type (8/16/32/64 bits) that can hold NumBits
+  static constexpr uint32_t value_type_bits = (NumBits <=  8) ?  8 :
+                                              (NumBits <= 16) ? 16 :
+                                              (NumBits <= 32) ? 32 : 64;
+  using value_type   = cute::uint_bit_t<value_type_bits>;
+  // storage_type: the smallest word size for which the field lies inside a single word, else 64
+  static constexpr uint32_t storage_type_bits = (BitStart /  8 == (BitStart + NumBits - 1) /  8) ?  8 :
+                                                (BitStart / 16 == (BitStart + NumBits - 1) / 16) ? 16 :
+                                                (BitStart / 32 == (BitStart + NumBits - 1) / 32) ? 32 : 64;
+  using storage_type = cute::uint_bit_t<storage_type_bits>;
+
+  static_assert(sizeof(OtherValueType) == sizeof(value_type) || is_same<OtherValueType,dummy_type>::value,
+                "sizeof(OtherValueType) must be same as sizeof(value_type).");
+
+  // Number of storage words spanned from bit 0 to the end of the field: ceil_div(BitStart + NumBits, storage_type_bits)
+  static constexpr uint32_t N      = (BitStart + NumBits + storage_type_bits - 1) / storage_type_bits;
+  // Index of the storage word that contains BitStart
+  static constexpr uint32_t idx    = BitStart / storage_type_bits;
+  // Position of BitStart inside data_[idx]
+  static constexpr uint32_t bit_lo = BitStart % storage_type_bits;
+  // Number of field bits that live in data_[idx] when the field straddles into data_[idx+1], else 0
+  static constexpr uint32_t bit_hi = (idx + 1 < N) ? (storage_type_bits - bit_lo) : 0;
+
+public:
+
+  // Mask with the low NumBits set (the static_assert above keeps the shift amount below 64)
+  static constexpr value_type   mask    = value_type(uint64_t(-1) >> (64u - NumBits));
+  // The field's mask positioned at bit_lo inside data_[idx]
+  static constexpr storage_type mask_lo = storage_type(mask) << bit_lo;
+  // Mask for the leftover high bits stored in data_[idx+1] if straddling, else 0
+  static constexpr storage_type mask_hi = (idx + 1 < N) ? (storage_type(mask) >> bit_hi) : 0;
+
+  storage_type data_[N];  // raw words; the field occupies bits [BitStart, BitStart + NumBits)
+
+  // Get value: extract the field and return it right-aligned
+  CUTE_HOST_DEVICE constexpr
+  value_type get() const {
+    storage_type result = (data_[idx] & mask_lo) >> bit_lo;   // low part from the first word
+    if constexpr (bit_hi != 0) {
+      result |= (data_[idx+1] & mask_hi) << bit_hi;           // high part from the next word, placed above the low part
+    }
+    return static_cast<value_type>(result);
+  }
+
+  // Set value: write the low NumBits of x into the field, leaving all other bits of data_ untouched
+  CUTE_HOST_DEVICE constexpr
+  void set(value_type x) {
+    storage_type item = static_cast<storage_type>(x & mask);  // bits of x above NumBits are discarded
+    data_[idx] = static_cast<storage_type>((data_[idx] & ~mask_lo) | (item << bit_lo));
+    if constexpr (bit_hi != 0) {
+      data_[idx+1] = static_cast<storage_type>((data_[idx+1] & ~mask_hi) | (item >> bit_hi));
+    }
+  }
+
+  // Assign value: same as set(x)
+  CUTE_HOST_DEVICE constexpr
+  bit_field& operator=(value_type x) {
+    set(x);
+    return *this;
+  }
+
+  // Cast to value: same as get()
+  CUTE_HOST_DEVICE constexpr
+  operator value_type () const {
+    return get();
+  }
+
+  // Assign OtherValueType: store the object representation of x as if it were a value_type
+  CUTE_HOST_DEVICE constexpr
+  bit_field& operator=(OtherValueType x) {
+    return *this = *reinterpret_cast<value_type*>(&x);  // NOTE(review): type punning through reinterpret_cast — confirm this is acceptable on all target compilers
+  }
+
+  // Cast to OtherValueType: reinterpret the extracted bits as an OtherValueType
+  CUTE_HOST_DEVICE constexpr
+  operator OtherValueType () const {
+    value_type x = get();
+    return *reinterpret_cast<OtherValueType*>(&x);
+  }
+};
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/container/cuda_types.hpp b/lightllm-kernel/cutlass/include/cute/container/cuda_types.hpp
new file mode 100755
index 000000000..fbc314e54
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/container/cuda_types.hpp
@@ -0,0 +1,183 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>                     // CUTE_HOST_DEVICE, CUTE_GCC_UNREACHABLE
+#include <cute/numeric/integral_constant.hpp>  // cute::integral_constant
+
+namespace cute
+{
+
+//
+// dim3
+//
+
+using dim3 = ::dim3;
+
+// MSVC does not report its real C++ language level through
+// the standard version macro, so dim3 is not usable in constexpr
+// functions there; constexpr is therefore omitted under MSVC.
+template <size_t I>
+CUTE_HOST_DEVICE
+#if ! defined(_MSC_VER)
+constexpr
+#endif
+uint32_t& get(dim3& a)
+{
+  static_assert(I < 3, "Index out of range");
+  // Component selection happens at compile time: 0 -> x, 1 -> y, 2 -> z.
+  if constexpr (I == 2) {
+    return a.z;
+  } else if constexpr (I == 1) {
+    return a.y;
+  } else if constexpr (I == 0) {
+    return a.x;
+  }
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <size_t I>
+CUTE_HOST_DEVICE
+#if ! defined(_MSC_VER)
+constexpr
+#endif
+uint32_t const& get(dim3 const& a)
+{
+  static_assert(I < 3, "Index out of range");
+  // Read-only component selection at compile time: 0 -> x, 1 -> y, 2 -> z.
+  if constexpr (I == 2) {
+    return a.z;
+  } else if constexpr (I == 1) {
+    return a.y;
+  } else if constexpr (I == 0) {
+    return a.x;
+  }
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <size_t I>
+CUTE_HOST_DEVICE
+#if ! defined(_MSC_VER)
+constexpr
+#endif
+uint32_t&& get(dim3&& a)
+{
+  static_assert(I < 3, "Index out of range");
+  // Rvalue overload: the selected component is returned through cute::move.
+  if constexpr (I == 2) {
+    return cute::move(a.z);
+  } else if constexpr (I == 1) {
+    return cute::move(a.y);
+  } else if constexpr (I == 0) {
+    return cute::move(a.x);
+  }
+  CUTE_GCC_UNREACHABLE;
+}
+
+// Specialize the cute tuple traits so that dim3 is described as a 3-element tuple of uint32_t
+template <>
+struct tuple_size<dim3>
+    : integral_constant<size_t, 3>
+{};
+
+template <size_t I>
+struct tuple_element<I, dim3>  // every component (x, y, z) is reported as uint32_t
+{
+  using type = uint32_t;
+};
+
+//
+// uint3
+//
+
+using uint3 = ::uint3;
+
+template <size_t I>
+CUTE_HOST_DEVICE constexpr
+uint32_t& get(uint3& a)
+{
+  static_assert(I < 3, "Index out of range");
+  // Component selection happens at compile time: 0 -> x, 1 -> y, 2 -> z.
+  if constexpr (I == 2) {
+    return a.z;
+  } else if constexpr (I == 1) {
+    return a.y;
+  } else if constexpr (I == 0) {
+    return a.x;
+  }
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <size_t I>
+CUTE_HOST_DEVICE constexpr
+uint32_t const& get(uint3 const& a)
+{
+  static_assert(I < 3, "Index out of range");
+  // Read-only component selection at compile time: 0 -> x, 1 -> y, 2 -> z.
+  if constexpr (I == 2) {
+    return a.z;
+  } else if constexpr (I == 1) {
+    return a.y;
+  } else if constexpr (I == 0) {
+    return a.x;
+  }
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <size_t I>
+CUTE_HOST_DEVICE constexpr
+uint32_t&& get(uint3&& a)
+{
+  static_assert(I < 3, "Index out of range");
+  // Rvalue overload: the selected component is returned through cute::move.
+  if constexpr (I == 2) {
+    return cute::move(a.z);
+  } else if constexpr (I == 1) {
+    return cute::move(a.y);
+  } else if constexpr (I == 0) {
+    return cute::move(a.x);
+  }
+  CUTE_GCC_UNREACHABLE;
+}
+
+// Specialize the cute tuple traits so that uint3 is described as a 3-element tuple of uint32_t
+template <>
+struct tuple_size<uint3>
+    : integral_constant<size_t, 3>
+{};
+
+template <size_t I>
+struct tuple_element<I, uint3>  // every component (x, y, z) is reported as uint32_t
+{
+  using type = uint32_t;
+};
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/container/packed_tuple.hpp b/lightllm-kernel/cutlass/include/cute/container/packed_tuple.hpp
new file mode 100755
index 000000000..c20df2c23
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/container/packed_tuple.hpp
@@ -0,0 +1,254 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>
+#include <cute/util/type_traits.hpp>
+#include <cute/numeric/integral_constant.hpp>
+#include <cute/container/type_list.hpp>
+
+namespace cute {
+
+namespace detail {
+
+// Empty Structure Optimization: recursive storage that declares members only for non-empty types
+template <bool IsFirstEmpty, bool IsRestEmpty, class... T>
+struct ESO;
+
+template <class First, class... Rest>
+static constexpr bool is_first_empty_v = cute::is_empty<First>::value;           // true when the first type is an empty class
+template <class First, class... Rest>
+static constexpr bool is_rest_empty_v  = (cute::is_empty<Rest>::value && ...);   // true when all later types are empty (also when there are none)
+
+template <class... T>
+using ESO_t = ESO<is_first_empty_v<T...>, is_rest_empty_v<T...>, T...>;          // selects the matching ESO specialization for T...
+
+// Empty First and Empty Rest...: nothing is stored at all
+template <class First, class... Rest>
+struct ESO<true, true, First, Rest...> {
+  CUTE_HOST_DEVICE constexpr
+  ESO() {}
+
+  CUTE_HOST_DEVICE constexpr
+  ESO(First const&, Rest const&...) {}  // arguments are accepted and ignored
+};
+
+// NonEmpty First and Empty Rest...: only the first value is stored
+template <class First, class... Rest>
+struct ESO<false, true, First, Rest...> {
+  CUTE_HOST_DEVICE constexpr
+  ESO() : first_{} {}
+
+  CUTE_HOST_DEVICE constexpr
+  ESO(First const& first, Rest const&...) : first_{first} {}  // the rest arguments are accepted and ignored
+
+  First first_;
+};
+
+// Empty First and NonEmpty Rest...: only the nested storage for Rest... is kept
+template <class First, class... Rest>
+struct ESO<true, false, First, Rest...> {
+  CUTE_HOST_DEVICE constexpr
+  ESO() : rest_{} {}
+
+  CUTE_HOST_DEVICE constexpr
+  ESO(First const&, Rest const&... rest) : rest_{rest...} {}  // the first argument is accepted and ignored
+
+  ESO_t<Rest...> rest_;
+};
+
+// NonEmpty First and NonEmpty Rest...: stores the first value and the nested storage for Rest...
+template <class First, class... Rest>
+struct ESO<false, false, First, Rest...> {
+  CUTE_HOST_DEVICE constexpr
+  ESO() : first_{}, rest_{} {}
+
+  CUTE_HOST_DEVICE constexpr
+  ESO(First const& first, Rest const&... rest) : first_{first}, rest_{rest...} {}
+
+  First first_;
+  ESO_t<Rest...> rest_;
+};
+
+// Fetch element N of an ESO (const lvalue): stored elements by reference, empty ones as a fresh value
+template <size_t N, class T, class... Rest, bool F, bool R>
+CUTE_HOST_DEVICE constexpr decltype(auto) getv(ESO<F, R, T, Rest...> const& s) {
+  if constexpr (N != 0) {
+    if constexpr (!R) { return getv<N-1>(s.rest_); }
+    else              { return cute::tuple_element_t<N-1, cute::type_list<Rest...>>{}; }
+  } else {
+    if constexpr (!F) { return static_cast<T const&>(s.first_); }
+    else              { return T{}; }
+  }
+}
+
+template <size_t N, class T, class... Rest, bool F, bool R>
+CUTE_HOST_DEVICE constexpr decltype(auto) getv(ESO<F, R, T, Rest...>& s) {
+  if constexpr (N != 0) {   // element lives in the tail: recurse, or value-construct it if the tail is all empty
+    if constexpr (!R) { return getv<N-1>(s.rest_); }
+    else              { return cute::tuple_element_t<N-1, cute::type_list<Rest...>>{}; }
+  } else {                  // element is the head: mutable reference if stored, fresh value if empty
+    if constexpr (!F) { return static_cast<T&>(s.first_); }
+    else              { return T{}; }
+  }
+}
+
+template <size_t N, class T, class... Rest, bool F, bool R>
+CUTE_HOST_DEVICE constexpr decltype(auto) getv(ESO<F, R, T, Rest...>&& s) {
+  if constexpr (N != 0) {   // element lives in the tail: recurse on the tail as an rvalue, or value-construct it
+    if constexpr (!R) { return getv<N-1>(static_cast<ESO_t<Rest...>&&>(s.rest_)); }
+    else              { return cute::tuple_element_t<N-1, cute::type_list<Rest...>>{}; }
+  } else {                  // element is the head: rvalue reference if stored, fresh value if empty
+    if constexpr (!F) { return static_cast<T&&>(s.first_); }
+    else              { return T{}; }
+  }
+}
+
+// findt: recursive worker behind cute::find.
+// Yields C<N> when X is the same type as First; otherwise continues with Rest... at position N+1.
+
+template <class X, size_t N,
+  bool IsFirstEmpty, bool IsRestEmpty, class First, class... Rest>
+CUTE_HOST_DEVICE constexpr
+auto
+findt(ESO<IsFirstEmpty, IsRestEmpty, First, Rest...> const& t) noexcept
+{
+  if constexpr (!cute::is_same_v<X, First>) {
+    static_assert(sizeof...(Rest) != 0,
+      "The type does not appear in the argument list of the tuple.");
+    if constexpr (!IsRestEmpty) {
+      return cute::detail::findt<X, N+1>(t.rest_);
+    }
+    else {
+      // All remaining types are empty, so this ESO has no rest_ member; a fresh instance is cheap.
+      return cute::detail::findt<X, N+1>(ESO_t<Rest...>{});
+    }
+  }
+  else {
+    return C<N>{};
+  }
+}
+
+} // end namespace detail
+
+// packed_tuple<T...> is a tuple type that is a standard-layout type
+// whenever all of its template arguments are standard layout types:
+//   (cute::is_standard_layout_v<T> && ...) implies (cute::is_standard_layout_v<packed_tuple<T...>>)
+
+template <class... T>
+struct packed_tuple : detail::ESO_t<T...>  // all storage comes from the ESO base; no members are added here
+{
+  CUTE_HOST_DEVICE constexpr
+  packed_tuple() {}                        // default-constructs the ESO base
+
+  CUTE_HOST_DEVICE constexpr
+  packed_tuple(T const&... ts)             // element-wise construction, forwarded to the ESO base
+    : detail::ESO_t<T...>(ts...)
+  {}
+};
+
+template <>
+struct packed_tuple<> {};  // Zero-element tuple: has no ESO base
+
+template <size_t I, class... T>
+CUTE_HOST_DEVICE constexpr
+decltype(auto)
+get(packed_tuple<T...> const& t) {  // Element I of a const tuple; the index is checked at compile time
+  static_assert(I < sizeof...(T), "Index out of range");
+  return detail::getv<I>(t);        // t is passed as its ESO base
+}
+
+template <size_t I, class... T>
+CUTE_HOST_DEVICE constexpr
+decltype(auto)
+get(packed_tuple<T...>& t) {        // Element I of a mutable tuple; the index is checked at compile time
+  static_assert(I < sizeof...(T), "Index out of range");
+  return detail::getv<I>(t);        // t is passed as its ESO base
+}
+
+template <size_t I, class... T>
+CUTE_HOST_DEVICE constexpr
+decltype(auto)
+get(packed_tuple<T...>&& t) {       // Element I of an rvalue tuple; the index is checked at compile time
+  static_assert(I < sizeof...(T), "Index out of range");
+  return detail::getv<I>(static_cast<detail::ESO_t<T...>&&>(t));  // cast to the ESO base as an rvalue to select the && overload of getv
+}
+
+template <class... T>
+CUTE_HOST_DEVICE constexpr
+packed_tuple<T...>
+make_packed_tuple(T const&... t)  // Build a packed_tuple<T...> from the given values; T... is deduced from the arguments
+{
+  return {t...};
+}
+
+// Returns the position of type X (as a static integer) in the tuple
+// type's argument list.  X must be unique in the argument list.
+template <class X, class... T>
+CUTE_HOST_DEVICE constexpr
+auto
+find(packed_tuple<T...> const& t) noexcept
+{
+  return detail::findt<X, 0>(t);  // start the search at position 0; a missing X fails a static_assert inside findt
+}
+
+} // end namespace cute
+
+namespace CUTE_STL_NAMESPACE  // tuple-protocol traits for cute::packed_tuple
+{
+
+template <class... T>
+struct tuple_size<cute::packed_tuple<T...>>      // number of template arguments
+    : CUTE_STL_NAMESPACE::integral_constant<size_t, sizeof...(T)>
+{};
+
+template <size_t I, class... T>
+struct tuple_element<I, cute::packed_tuple<T...>>   // element type is taken from the equivalent CUTE_STL_NAMESPACE::tuple<T...>
+    : CUTE_STL_NAMESPACE::tuple_element<I, CUTE_STL_NAMESPACE::tuple<T...>>
+{};
+
+} // end namespace CUTE_STL_NAMESPACE
+
+#ifdef CUTE_STL_NAMESPACE_IS_CUDA_STD
+namespace std {  // same traits again in std, compiled only when CUTE_STL_NAMESPACE_IS_CUDA_STD is defined
+
+template <class ... T>
+struct tuple_size<cute::packed_tuple<T...>>      // number of template arguments
+  : CUTE_STL_NAMESPACE::integral_constant<size_t, sizeof...(T)>
+{};
+
+template <size_t I, class ... T>
+struct tuple_element<I, cute::packed_tuple<T...>>   // delegates to the CUTE_STL_NAMESPACE specialization for packed_tuple
+  : CUTE_STL_NAMESPACE::tuple_element<I, cute::packed_tuple<T...>>
+{};
+
+} // end namespace std
+#endif // CUTE_STL_NAMESPACE_IS_CUDA_STD
diff --git a/lightllm-kernel/cutlass/include/cute/container/tuple.hpp b/lightllm-kernel/cutlass/include/cute/container/tuple.hpp
new file mode 100755
index 000000000..3123a68d8
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/container/tuple.hpp
@@ -0,0 +1,744 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>
+#include <cute/util/type_traits.hpp>
+#include <cute/numeric/integral_constant.hpp>  // cute::true_type, cute::false_type
+#include <cute/numeric/integer_sequence.hpp>
+
+#include <cute/container/cuda_types.hpp>
+#include <cute/container/type_list.hpp>
+#if defined(CUTLASS_USE_PACKED_TUPLE)
+#  include <cute/container/packed_tuple.hpp>
+#endif
+
+//#include <cute/container/array.hpp>            // Advanced optimizations
+
+// cute::tuple is like std::tuple, with two differences.
+//
+// 1. It works on both host and device.
+// 2. Its template arguments must be semiregular types.
+//
+// Semiregular types are default constructible and copyable.
+// They include "value types" like int or float,
+// but do _not_ include references like int& or float&.
+// (See std::tie for an example of a tuple of references.)
+//
+// If the template arguments of cute::tuple are all empty types (in
+// the sense of std::is_empty_v), then the cute::tuple is also an
+// empty type.  Furthermore, if CUTLASS_USE_PACKED_TUPLE is defined,
+// cute::tuple is always a standard-layout type if all of its template
+// arguments are standard-layout types.
+
+namespace cute
+{
+
+#if defined(CUTLASS_USE_PACKED_TUPLE)
+
+template<class... T>
+using tuple = packed_tuple<T...>;  // when CUTLASS_USE_PACKED_TUPLE is defined, cute::tuple is just an alias of packed_tuple
+
+#else
+
+namespace detail
+{
+
+// This is simplified over the implementations in std::, cuda::std::, and thrust:: by ignoring much of
+// the conversion SFINAE, special overloading, and avoiding cvref template types.
+//
+// Over standard-conforming tuple implementations, this appears to accelerate compilation times by over 3x.
+
+// EBO stands for "empty base optimization."
+// We use this technique to ensure that cute::tuple
+// doesn't need to waste space storing any template arguments
+// of cute::tuple that have no data (like integral_constant).
+// Otherwise, cute::tuple would need to spend at least 1 byte
+// for each of its template arguments.
+//
+// This is one way in which cute::tuple differs from std::tuple.
+// Empty types in the template argument list are not even constructed,
+// and do not have unique element addresses.  In fact, they are not
+// even members of the tuple or stored in any way.  Calling `get`
+// constructs and returns an instance of an empty type on demand.
+//
+// EBO always "holds" a single value of type T.
+// N is like an array index that TupleBase uses
+// to access the desired tuple element.
+template <size_t N, class T, bool IsEmpty = is_empty<T>::value>
+struct EBO;  // only declared here; the IsEmpty == true / false specializations follow
+// findt deduces N from an EBO<N, T, B> argument and returns it as the static integer C<N>.
+template <class T, size_t N, bool B>
+CUTE_HOST_DEVICE constexpr C<N> findt(EBO<N, T, B> const&)
+{ return {}; }
+
+// Specialization for types T that have no data;
+// the "static tuple leaf."  Valid T here include
+// integral_constant<U, Value>, Int<Value>,
+// and any other semiregular type
+// for which std::is_empty_v<T> is true.
+template <size_t N, class T>
+struct EBO<N, T, true>
+{
+  CUTE_HOST_DEVICE constexpr
+  EBO() {}  // nothing to initialize: this leaf has no data members
+
+  CUTE_HOST_DEVICE constexpr
+  EBO(T const&) {}  // the argument is ignored; no copy of T is kept
+};
+
+template <size_t N, class T>
+CUTE_HOST_DEVICE constexpr T getv(EBO<N, T, true> const&)
+{ return T{}; }  // empty leaf stores nothing, so a fresh T is created for each call
+
+// Specialization for types T that are not empty;
+// the "dynamic tuple leaf."  Valid T here include int,
+// any other integral or floating-point type,
+// or any semiregular type for which std::is_empty_v<T> is false.
+template <size_t N, class T>
+struct EBO<N, T, false>
+{
+  CUTE_HOST_DEVICE constexpr
+  EBO() : t_{} {}  // value-initializes the stored element
+
+  CUTE_HOST_DEVICE constexpr
+  EBO(T const& t) : t_{t} {}  // stores a copy of t
+
+  T t_;  // the single value held by this leaf
+};
+
+template <size_t N, class T>
+CUTE_HOST_DEVICE constexpr T const& getv(EBO<N, T, false> const& x)
+{ return x.t_; }  // const leaf: read-only reference to the stored value
+
+template <size_t N, class T>
+CUTE_HOST_DEVICE constexpr T& getv(EBO<N, T, false>& x)
+{ return x.t_; }  // mutable leaf: writable reference to the stored value
+
+template <size_t N, class T>
+CUTE_HOST_DEVICE constexpr T&& getv(EBO<N, T, false>&& x)
+{ return cute::move(x.t_); }  // rvalue leaf: the stored value is handed out as an rvalue
+
+template <class IdxSeq, class... T>
+struct TupleBase;
+
+// Base class of cute::tuple binds each element to an index
+// by inheriting from EBO<i, t> for each (i, t) in (I..., T...).
+// The storage (for nonempty t) lives in the base classes.
+template <size_t... I, class... T>
+struct TupleBase<index_sequence<I...>, T...>
+    : EBO<I,T>...
+{
+  CUTE_HOST_DEVICE constexpr
+  TupleBase() {}  // every EBO base is default-constructed
+
+  CUTE_HOST_DEVICE constexpr
+  TupleBase(T const&... t) : EBO<I,T>(t)... {}  // the k-th argument initializes the k-th EBO base
+};
+
+} // end namespace detail
+
+// Attempting to use the following commented-out alias
+// in the declaration of `struct tuple` causes MSVC 2022 build errors.
+//
+//template <class... T>
+//using TupleBase = detail::TupleBase<make_index_sequence<sizeof...(T)>, T...>;
+
+// This is the actual cute::tuple class.
+// The storage (if any) lives in TupleBase's EBO base classes.
+//
+// Inheriting from the above alias TupleBase
+// causes MSVC 2022 build errors when assigning one tuple to another:
+// In summary: this is verbose as a work-around for MSVC build errors.
+template <class... T>
+struct tuple : detail::TupleBase<make_index_sequence<sizeof...(T)>, T...>
+{
+  CUTE_HOST_DEVICE constexpr
+  tuple() {}  // default-constructs the TupleBase and, through it, every element
+
+  CUTE_HOST_DEVICE constexpr
+  tuple(T const&... t) : detail::TupleBase<make_index_sequence<sizeof...(T)>, T...>(t...) {}  // element-wise copy of the arguments
+};
+
+template <>
+struct tuple<>  // explicit specialization: the empty tuple has no base class and no members
+{};
+
+//
+// get for cute::tuple (just like std::get for std::tuple)
+//
+
+template <size_t I, class... T>
+CUTE_HOST_DEVICE constexpr
+decltype(auto)
+get(tuple<T...> const& t) noexcept
+{
+  static_assert(I < sizeof...(T), "Index out of range");  // I is validated at compile time
+  return detail::getv<I>(t);  // const access: resolves to a getv overload taking EBO<I, ...> const&
+}
+
+template <size_t I, class... T>
+CUTE_HOST_DEVICE constexpr
+decltype(auto)
+get(tuple<T...>& t) noexcept
+{
+  static_assert(I < sizeof...(T), "Index out of range");  // I is validated at compile time
+  return detail::getv<I>(t);  // mutable access: a non-empty element comes back as T&
+}
+
+template <size_t I, class... T>
+CUTE_HOST_DEVICE constexpr
+decltype(auto)
+get(tuple<T...>&& t) noexcept
+{
+  static_assert(I < sizeof...(T), "Index out of range");  // I is validated at compile time
+  return detail::getv<I>(static_cast<tuple<T...>&&>(t));  // cast restores the rvalue category so the && getv overload can be chosen
+}
+
+//
+// find a type X within a cute::tuple
+//   Requires X to be unique in tuple
+//   Returns a static integer
+//
+
+template <class X, class... T>
+CUTE_HOST_DEVICE constexpr
+auto
+find(tuple<T...> const& t) noexcept
+{
+  return detail::findt<X>(t);  // findt deduces the index N of the EBO<N, X, B> base and returns C<N>
+}
+
+#endif // CUTLASS_USE_PACKED_TUPLE
+
+//
+// Custom is_tuple trait simply checks the existence of tuple_size
+//      and assumes std::get<I>(.), std::tuple_element<I,.>
+//
+namespace detail {
+// Detection idiom: the T* overload only participates when tuple_size<T>::value is well-formed.
+template <class T>
+auto has_tuple_size( T*) -> bool_constant<(0 <= tuple_size<T>::value)>;
+auto has_tuple_size(...) -> false_type;
+
+} // end namespace detail
+
+template <class T>
+struct is_tuple : decltype(detail::has_tuple_size(static_cast<T*>(nullptr))) {};
+
+template<typename T>
+constexpr bool is_tuple_v = cute::is_tuple<T>::value;
+
+//
+// make_tuple (value-based implementation)
+//
+
+template <class... T>
+CUTE_HOST_DEVICE constexpr
+tuple<T...>
+make_tuple(T const&... t)
+{
+  return tuple<T...>{t...};  // value-based: every argument is copied into the new tuple
+}
+
+//
+// tuple_cat concatenates multiple cute::tuple into a single cute::tuple,
+// just like std::tuple_cat for std::tuple.
+//
+
+#if 0
+// Original implementation (disabled by the enclosing #if 0; kept for reference only)
+
+namespace detail {
+
+template <class T0, class T1,
+          size_t... I0, size_t... I1>
+CUTE_HOST_DEVICE constexpr
+auto
+tuple_cat(T0 const& t0, T1 const& t1,
+          index_sequence<I0...>, index_sequence<I1...>)
+{
+  return cute::make_tuple(get<I0>(t0)..., get<I1>(t1)...);
+}
+
+} // end namespace detail
+
+CUTE_HOST_DEVICE constexpr
+tuple<>
+tuple_cat()
+{
+  return {};
+}
+
+template <class Tuple,
+          __CUTE_REQUIRES(is_tuple<Tuple>::value)>
+CUTE_HOST_DEVICE constexpr
+Tuple const&
+tuple_cat(Tuple const& t)
+{
+  return t;
+}
+
+template <class T0, class T1>
+CUTE_HOST_DEVICE constexpr
+auto
+tuple_cat(T0 const& t0, T1 const& t1)
+{
+  return detail::tuple_cat(t0, t1,
+                           make_index_sequence<tuple_size<T0>::value>{},
+                           make_index_sequence<tuple_size<T1>::value>{});
+}
+
+template <class T0, class T1, class T2, class... Ts>
+CUTE_HOST_DEVICE constexpr
+auto
+tuple_cat(T0 const& t0, T1 const& t1, T2 const& t2, Ts const&... ts)
+{
+  return cute::tuple_cat(cute::tuple_cat(t0,t1),t2,ts...);
+}
+#endif
+
+#if 1
+// Extended implementation
+
+namespace detail {
+// Two-tuple worker: expands both index packs and rebuilds one tuple from the selected elements.
+template <class T0, class T1,
+          size_t... I0, size_t... I1>
+CUTE_HOST_DEVICE constexpr
+auto
+tuple_cat(T0 const& t0, T1 const& t1,
+          index_sequence<I0...>, index_sequence<I1...>)
+{
+  return cute::make_tuple(get<I0>(t0)..., get<I1>(t1)...);
+}
+// Three-tuple worker: same expansion with one more index pack.
+template <class T0, class T1, class T2,
+          size_t... I0, size_t... I1, size_t... I2>
+CUTE_HOST_DEVICE constexpr
+auto
+tuple_cat(T0 const& t0, T1 const& t1, T2 const& t2,
+          index_sequence<I0...>, index_sequence<I1...>, index_sequence<I2...>)
+{
+  return cute::make_tuple(get<I0>(t0)..., get<I1>(t1)..., get<I2>(t2)...);
+}
+// Four-tuple worker.
+template <class T0, class T1, class T2, class T3,
+          size_t... I0, size_t... I1, size_t... I2, size_t... I3>
+CUTE_HOST_DEVICE constexpr
+auto
+tuple_cat(T0 const& t0, T1 const& t1, T2 const& t2, T3 const& t3,
+          index_sequence<I0...>, index_sequence<I1...>, index_sequence<I2...>, index_sequence<I3...>)
+{
+  return cute::make_tuple(get<I0>(t0)..., get<I1>(t1)..., get<I2>(t2)..., get<I3>(t3)...);
+}
+// Five-tuple worker.
+template <class T0, class T1, class T2, class T3, class T4,
+          size_t... I0, size_t... I1, size_t... I2, size_t... I3, size_t... I4>
+CUTE_HOST_DEVICE constexpr
+auto
+tuple_cat(T0 const& t0, T1 const& t1, T2 const& t2, T3 const& t3, T4 const& t4,
+          index_sequence<I0...>, index_sequence<I1...>, index_sequence<I2...>, index_sequence<I3...>, index_sequence<I4...>)
+{
+  return cute::make_tuple(get<I0>(t0)..., get<I1>(t1)..., get<I2>(t2)..., get<I3>(t3)..., get<I4>(t4)...);
+}
+// Type-level concatenation; only the tuple<...>, tuple<...> pair is specialized below.
+template <class T0, class T1>
+struct tuple_cat_static;
+// ::type is the tuple whose argument list is T0s... followed by T1s...
+template <class... T0s, class... T1s>
+struct tuple_cat_static<tuple<T0s...>, tuple<T1s...>> {
+  using type = tuple<T0s..., T1s...>;
+};
+
+} // end namespace detail
+
+CUTE_HOST_DEVICE constexpr
+tuple<>
+tuple_cat()
+{
+  return tuple<>{};  // concatenating nothing yields the empty tuple
+}
+
+template <class Tuple,
+          __CUTE_REQUIRES(is_tuple<Tuple>::value)>
+CUTE_HOST_DEVICE constexpr
+Tuple const&
+tuple_cat(Tuple const& t)
+{
+  return t;  // single tuple: returned by reference, no copy is made (the reference is only valid while the argument lives)
+}
+
+template <class T0, class T1>
+CUTE_HOST_DEVICE constexpr
+auto
+tuple_cat(T0 const& t0, T1 const& t1)
+{
+  // When both inputs are tuples and is_static holds for both, the result type is default-constructed directly.
+  constexpr bool both_tuples = is_tuple<T0>::value && is_tuple<T1>::value;
+  if constexpr (is_static<T0>::value && is_static<T1>::value && both_tuples) {
+    return typename detail::tuple_cat_static<T0, T1>::type{};
+  } else {
+    using Seq0 = make_index_sequence<tuple_size<T0>::value>;
+    using Seq1 = make_index_sequence<tuple_size<T1>::value>;
+    return detail::tuple_cat(t0, t1, Seq0{}, Seq1{});
+  }
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <class T0, class T1, class T2>
+CUTE_HOST_DEVICE constexpr
+auto
+tuple_cat(T0 const& t0, T1 const& t1, T2 const& t2)
+{
+  using Seq0 = make_index_sequence<tuple_size<T0>::value>;  // element indices of t0
+  using Seq1 = make_index_sequence<tuple_size<T1>::value>;  // element indices of t1
+  using Seq2 = make_index_sequence<tuple_size<T2>::value>;  // element indices of t2
+  return detail::tuple_cat(t0, t1, t2, Seq0{}, Seq1{}, Seq2{});
+}
+
+template <class T0, class T1, class T2, class T3>
+CUTE_HOST_DEVICE constexpr
+auto
+tuple_cat(T0 const& t0, T1 const& t1, T2 const& t2, T3 const& t3)
+{
+  using Seq0 = make_index_sequence<tuple_size<T0>::value>;  // element indices of t0
+  using Seq1 = make_index_sequence<tuple_size<T1>::value>;  // element indices of t1
+  using Seq2 = make_index_sequence<tuple_size<T2>::value>;  // element indices of t2
+  using Seq3 = make_index_sequence<tuple_size<T3>::value>;  // element indices of t3
+  return detail::tuple_cat(t0, t1, t2, t3, Seq0{}, Seq1{}, Seq2{}, Seq3{});
+}
+
+template <class T0, class T1, class T2, class T3, class T4>
+CUTE_HOST_DEVICE constexpr
+auto
+tuple_cat(T0 const& t0, T1 const& t1, T2 const& t2, T3 const& t3, T4 const& t4)
+{
+  using Seq0 = make_index_sequence<tuple_size<T0>::value>;  // element indices of t0
+  using Seq1 = make_index_sequence<tuple_size<T1>::value>;  // element indices of t1
+  using Seq2 = make_index_sequence<tuple_size<T2>::value>;  // element indices of t2
+  using Seq3 = make_index_sequence<tuple_size<T3>::value>;  // element indices of t3
+  using Seq4 = make_index_sequence<tuple_size<T4>::value>;  // element indices of t4
+  return detail::tuple_cat(t0, t1, t2, t3, t4, Seq0{}, Seq1{}, Seq2{}, Seq3{}, Seq4{});
+}
+
+template <class T0, class T1, class T2, class T3, class T4, class T5, class... Ts>
+CUTE_HOST_DEVICE constexpr
+auto
+tuple_cat(T0 const& t0, T1 const& t1, T2 const& t2, T3 const& t3, T4 const& t4, T5 const& t5, Ts const&... ts)
+{
+  return cute::tuple_cat(cute::tuple_cat(t0,t1,t2,t3,t4), cute::tuple_cat(t5, ts...));  // six or more inputs: join the first five, recurse on the rest, then join the two results
+}
+#endif
+
+#if 0
+// Outer-Inner indexing trick to concat all tuples at once (disabled by the enclosing #if 0)
+
+namespace detail {
+
+template <size_t... Ns>
+struct tuple_cat_helper
+{
+  static constexpr cute::array<size_t,sizeof...(Ns)> ns = {Ns...};
+
+  static constexpr size_t total_size() {
+    size_t sum = 0;
+    for (size_t n : ns) sum += n;
+    return sum;
+  }
+  static constexpr size_t total_size_ = total_size();
+
+  static constexpr auto values() {
+    cute::array<size_t[2],total_size_> outer_inner = {};
+
+    size_t idx = 0;
+    for (size_t i = 0; i < ns.size(); ++i) {
+      for (size_t j = 0; j < ns[i]; ++j, ++idx) {
+        outer_inner[idx][0] = i;
+        outer_inner[idx][1] = j;
+      }
+    }
+    return outer_inner;
+  }
+  static constexpr auto outer_inner_ = values();
+
+  using total_sequence = make_index_sequence<total_size_>;
+};
+
+template <class Helper, class Tuple, size_t... I>
+CUTE_HOST_DEVICE constexpr
+auto
+tuple_cat(Tuple const& t, index_sequence<I...>)
+{
+  return cute::make_tuple(get<Helper::outer_inner_[I][1]>(get<Helper::outer_inner_[I][0]>(t))...);
+}
+
+template <class T0, class T1,
+          size_t... I0, size_t... I1>
+CUTE_HOST_DEVICE constexpr
+auto
+tuple_cat(T0 const& t0, T1 const& t1,
+          index_sequence<I0...>, index_sequence<I1...>)
+{
+  return cute::make_tuple(get<I0>(t0)..., get<I1>(t1)...);
+}
+
+} // end namespace detail
+
+CUTE_HOST_DEVICE constexpr
+tuple<>
+tuple_cat()
+{
+  return {};
+}
+
+template <class Tuple,
+          __CUTE_REQUIRES(is_tuple<Tuple>::value)>
+CUTE_HOST_DEVICE constexpr
+Tuple const&
+tuple_cat(Tuple const& t)
+{
+  return t;
+}
+
+template <class T0, class T1>
+CUTE_HOST_DEVICE constexpr
+auto
+tuple_cat(T0 const& t0, T1 const& t1)
+{
+  return detail::tuple_cat(t0, t1,
+                           make_index_sequence<tuple_size<T0>::value>{},
+                           make_index_sequence<tuple_size<T1>::value>{});
+}
+
+template <class... Tuples>
+CUTE_HOST_DEVICE constexpr
+auto
+tuple_cat(Tuples const&... ts)
+{
+  using Helper = detail::tuple_cat_helper<tuple_size<Tuples>::value...>;
+  return detail::tuple_cat<Helper>(cute::make_tuple(ts...), typename Helper::total_sequence{});
+}
+#endif
+
+//
+// Equality operators
+//
+
+namespace detail {
+// Compares a and b index by index: a fold over && that starts from true_type.
+template <class TupleA, class TupleB, size_t... I>
+CUTE_HOST_DEVICE constexpr
+auto
+equal_impl(TupleA const& a, TupleB const& b, index_sequence<I...>)
+{
+  return (cute::true_type{} && ... && (get<I>(a) == get<I>(b)));
+}
+
+} // end namespace detail
+
+template <class TupleT, class TupleU,
+          __CUTE_REQUIRES(is_tuple<TupleT>::value && is_tuple<TupleU>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+operator==(TupleT const& t, TupleU const& u)
+{
+  if constexpr (tuple_size<TupleT>::value != tuple_size<TupleU>::value) {
+    return cute::false_type{};  // tuples with different element counts are statically unequal
+  } else {
+    return detail::equal_impl(t, u, make_index_sequence<tuple_size<TupleT>::value>{});
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <class TupleT, class TupleU,
+          __CUTE_REQUIRES(is_tuple<TupleT>::value ^ is_tuple<TupleU>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+operator==(TupleT const& t, TupleU const& u)
+{
+  return cute::false_type{};  // exactly one operand is a tuple (the ^ above), so the result is statically false
+}
+
+template <class TupleT, class TupleU,
+          __CUTE_REQUIRES(is_tuple<TupleT>::value && is_tuple<TupleU>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+operator!=(TupleT const& t, TupleU const& u)
+{
+  return !(t == u);  // both operands are tuples: negate the operator== result
+}
+
+template <class TupleT, class TupleU,
+          __CUTE_REQUIRES(is_tuple<TupleT>::value ^ is_tuple<TupleU>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+operator!=(TupleT const& t, TupleU const& u)
+{
+  return cute::true_type{};  // exactly one operand is a tuple (the ^ above), so the result is statically true
+}
+
+//
+// Comparison operators
+//
+
+//
+// There are many ways to compare tuple of elements and because CuTe is built
+//   on parameterizing layouts of coordinates, some comparisons are appropriate
+//   only in certain cases.
+//  -- lexicographical comparison [reverse, reflected, revref]
+//  -- colexicographical comparison [reverse, reflected, revref]
+//  -- element-wise comparison [any,all]
+// This can be very confusing. To avoid errors in selecting the appropriate
+//   comparison, op<|op<=|op>|op>= are *not* implemented for cute::tuple.
+//
+// That said, see int_tuple for more explicitly named common comparison ops.
+//
+
+//
+// Display utilities
+//
+
+namespace detail {
+// Prints t as s, elements separated by ',', then e; an empty tuple prints just s followed by e.
+template <class Tuple, size_t... Is>
+CUTE_HOST_DEVICE void print_tuple(Tuple const& t, index_sequence<Is...>, char s = '(', char e = ')')
+{
+  using cute::print;
+  if (sizeof...(Is) == 0) {
+    print(s);  // no elements: still emit the opening character
+  } else {
+    ((void(print(Is == 0 ? s : ',')), void(print(get<Is>(t)))), ...);  // opener before element 0, comma before every other element
+  }
+  print(e);
+}
+
+#if !defined(__CUDACC_RTC__)
+template <class Tuple, std::size_t... Is>
+CUTE_HOST std::ostream& print_tuple_os(std::ostream& os, Tuple const& t, index_sequence<Is...>, char s = '(', char e = ')')
+{
+  if (sizeof...(Is) == 0) {
+    os << s;  // no elements: still emit the opening character
+  } else {
+    (void(os << (Is == 0 ? s : ',') << get<Is>(t)), ...);  // same layout as print_tuple, written to os
+  }
+  return os << e;  // the stream is returned so calls can be chained
+}
+#endif // !defined(__CUDACC_RTC__)
+
+} // end namespace detail
+
+template <class Tuple,
+          __CUTE_REQUIRES(is_tuple<Tuple>::value)>
+CUTE_HOST_DEVICE void print(Tuple const& t)
+{
+  return detail::print_tuple(t, make_index_sequence<tuple_size<Tuple>::value>{});  // default '(' and ')' delimiters are used
+}
+
+#if !defined(__CUDACC_RTC__)
+template <class Tuple,
+          __CUTE_REQUIRES(is_tuple<Tuple>::value)>
+CUTE_HOST std::ostream& operator<<(std::ostream& os, Tuple const& t)
+{
+  return detail::print_tuple_os(os, t, make_index_sequence<tuple_size<Tuple>::value>{});  // host-only stream output with the default '(' and ')' delimiters
+}
+#endif // !defined(__CUDACC_RTC__)
+
+} // end namespace cute
+
+#if ! defined(CUTLASS_USE_PACKED_TUPLE)
+
+namespace CUTE_STL_NAMESPACE
+{
+// tuple_size of a cute::tuple is the number of its template arguments.
+template <class... T>
+struct tuple_size<cute::tuple<T...>>
+    : CUTE_STL_NAMESPACE::integral_constant<size_t, sizeof...(T)>
+{};
+// tuple_element I is taken from the matching CUTE_STL_NAMESPACE::tuple<T...>.
+template <size_t I, class... T>
+struct tuple_element<I, cute::tuple<T...>>
+    : CUTE_STL_NAMESPACE::tuple_element<I, CUTE_STL_NAMESPACE::tuple<T...>>
+{};
+// const-qualified tuple: same size.
+template <class... T>
+struct tuple_size<const cute::tuple<T...>>
+    : CUTE_STL_NAMESPACE::integral_constant<size_t, sizeof...(T)>
+{};
+// const-qualified tuple: element type taken from the const CUTE_STL_NAMESPACE::tuple<T...>.
+template <size_t I, class... T>
+struct tuple_element<I, const cute::tuple<T...>>
+    : CUTE_STL_NAMESPACE::tuple_element<I, const CUTE_STL_NAMESPACE::tuple<T...>>
+{};
+
+} // end namespace CUTE_STL_NAMESPACE
+
+//
+// std compatibility
+//
+
+#ifdef CUTE_STL_NAMESPACE_IS_CUDA_STD
+namespace std
+{
+// Under __CUDACC_RTC__ the primary templates are declared here before being specialized.
+#if defined(__CUDACC_RTC__)
+template <class... _Tp>
+struct tuple_size;
+
+template <size_t _Ip, class... _Tp>
+struct tuple_element;
+#endif
+// The four specializations below repeat the CUTE_STL_NAMESPACE ones in namespace std.
+template <class... T>
+struct tuple_size<cute::tuple<T...>>
+    : CUTE_STL_NAMESPACE::integral_constant<size_t, sizeof...(T)>
+{};
+
+template <size_t I, class... T>
+struct tuple_element<I, cute::tuple<T...>>
+    : CUTE_STL_NAMESPACE::tuple_element<I, CUTE_STL_NAMESPACE::tuple<T...>>
+{};
+
+template <class... T>
+struct tuple_size<const cute::tuple<T...>>
+    : CUTE_STL_NAMESPACE::integral_constant<size_t, sizeof...(T)>
+{};
+
+template <size_t I, class... T>
+struct tuple_element<I, const cute::tuple<T...>>
+    : CUTE_STL_NAMESPACE::tuple_element<I, const CUTE_STL_NAMESPACE::tuple<T...>>
+{};
+
+} // end namespace std
+#endif // CUTE_STL_NAMESPACE_IS_CUDA_STD
+
+#endif // !defined(CUTLASS_USE_PACKED_TUPLE)
diff --git a/lightllm-kernel/cutlass/include/cute/container/type_list.hpp b/lightllm-kernel/cutlass/include/cute/container/type_list.hpp
new file mode 100755
index 000000000..a15f2c1c1
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/container/type_list.hpp
@@ -0,0 +1,124 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>            // CUTE_HOST_DEVICE, CUTE_STL_NAMESPACE
+
+namespace cute
+{
+
+template <class... T>
+struct type_list {};  // carries only its template arguments; it has no members
+
+// get<I> for type_list<T...>
+//   requires tuple_element_t<I,type_list<T...>> to have std::is_default_constructible
+template <size_t I, class... T>
+CUTE_HOST_DEVICE constexpr
+CUTE_STL_NAMESPACE::tuple_element_t<I, type_list<T...>>
+get(type_list<T...> const& t) noexcept {
+  return {};  // t is not read: a fresh instance of the I-th type is created and returned
+}
+
+} // end namespace cute
+
+//
+// Specialize tuple-related functionality for cute::type_list
+//
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/tuple>
+#else
+#include <tuple>
+#endif
+
+namespace CUTE_STL_NAMESPACE
+{
+// tuple_size of a type_list is the number of its template arguments.
+template <class... T>
+struct tuple_size<cute::type_list<T...>>
+    : CUTE_STL_NAMESPACE::integral_constant<size_t, sizeof...(T)>
+{};
+// tuple_element I is the I-th element type of CUTE_STL_NAMESPACE::tuple<T...>.
+template <size_t I, class... T>
+struct tuple_element<I, cute::type_list<T...>>
+{
+  using type = typename CUTE_STL_NAMESPACE::tuple_element<I, CUTE_STL_NAMESPACE::tuple<T...>>::type;
+};
+// const-qualified type_list: same size.
+template <class... T>
+struct tuple_size<const cute::type_list<T...>>
+    : CUTE_STL_NAMESPACE::integral_constant<size_t, sizeof...(T)>
+{};
+// const-qualified type_list: the element type comes from the non-const tuple<T...>, so no const is added.
+template <size_t I, class... T>
+struct tuple_element<I, const cute::type_list<T...>>
+{
+  using type = typename CUTE_STL_NAMESPACE::tuple_element<I, CUTE_STL_NAMESPACE::tuple<T...>>::type;
+};
+
+} // end namespace CUTE_STL_NAMESPACE
+
+#ifdef CUTE_STL_NAMESPACE_IS_CUDA_STD
+namespace std
+{
+// Under __CUDACC_RTC__ the primary templates are declared here before being specialized.
+#if defined(__CUDACC_RTC__)
+template <class... _Tp>
+struct tuple_size;
+
+template <size_t _Ip, class... _Tp>
+struct tuple_element;
+#endif
+// The four specializations below repeat the CUTE_STL_NAMESPACE ones in namespace std.
+template <class... T>
+struct tuple_size<cute::type_list<T...>>
+    : CUTE_STL_NAMESPACE::integral_constant<size_t, sizeof...(T)>
+{};
+
+template <size_t I, class... T>
+struct tuple_element<I, cute::type_list<T...>>
+{
+  using type = typename CUTE_STL_NAMESPACE::tuple_element<I, CUTE_STL_NAMESPACE::tuple<T...>>::type;
+};
+
+template <class... T>
+struct tuple_size<const cute::type_list<T...>>
+    : CUTE_STL_NAMESPACE::integral_constant<size_t, sizeof...(T)>
+{};
+
+template <size_t I, class... T>
+struct tuple_element<I, const cute::type_list<T...>>
+{
+  using type = typename CUTE_STL_NAMESPACE::tuple_element<I, CUTE_STL_NAMESPACE::tuple<T...>>::type;
+};
+
+} // end namespace std
+#endif // CUTE_STL_NAMESPACE_IS_CUDA_STD
diff --git a/lightllm-kernel/cutlass/include/cute/int_tuple.hpp b/lightllm-kernel/cutlass/include/cute/int_tuple.hpp
new file mode 100755
index 000000000..95d06bbdd
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/int_tuple.hpp
@@ -0,0 +1,864 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>                      // CUTE_HOST_DEVICE
+#include <cute/container/array.hpp>             // cute::array
+#include <cute/container/tuple.hpp>             // cute::is_tuple
+#include <cute/numeric/integral_constant.hpp>   // cute::Int
+#include <cute/algorithm/tuple_algorithms.hpp>  // cute::transform
+
+/** IntTuple is an integer or a tuple of IntTuples.
+ * This file holds utilities for working with IntTuples,
+ * but does not hold a concrete concept or class of IntTuple.
+ */
+
+namespace cute
+{
+
+// Implementation of get<0>(Integral).
+//   Even though is_tuple<Integral> is false and tuple_size<Integral> doesn't compile,
+//   CuTe defines rank(Integral) as 1, so it's useful for get<0>(Integral) to return its input
+template <size_t I, class T, __CUTE_REQUIRES(cute::is_integral<cute::remove_cvref_t<T>>::value)>
+CUTE_HOST_DEVICE constexpr
+decltype(auto)
+get(T&& t) noexcept
+{
+  static_assert(I == 0, "Index out of range");  // an integral has exactly one "element", at index 0
+  return static_cast<T&&>(t);  // the argument is forwarded back with its value category intact
+}
+
+// Custom recursive get for anything that implements get<I>(.) (for a single integer I).
+template <size_t I0, size_t I1, size_t... Is, class T>
+CUTE_HOST_DEVICE constexpr
+decltype(auto)
+get(T&& t) noexcept
+{
+  return get<I1, Is...>(get<I0>(static_cast<T&&>(t)));  // peel off I0 first, then apply the remaining indices to that element
+}
+
+//
+// rank
+//
+
+template <int... Is, class IntTuple>
+CUTE_HOST_DEVICE constexpr
+auto
+rank(IntTuple const& t)
+{
+  // With indices: select the sub-element first, then take its rank.
+  // Without indices: a tuple's rank is its tuple_size, anything else has rank 1.
+  if constexpr (sizeof...(Is) != 0) {
+    return rank(get<Is...>(t));
+  } else if constexpr (is_tuple<IntTuple>::value) {
+    return Int<tuple_size<IntTuple>::value>{};
+  } else {
+    return Int<1>{};
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <class IntTuple>
+using rank_t = decltype(rank(declval<IntTuple>()));  // the type rank() returns for an IntTuple (unevaluated)
+
+template <class IntTuple>
+static constexpr auto rank_v = rank_t<IntTuple>::value;  // the ::value member of rank_t
+
+//
+// shape
+//
+
+template <class IntTuple>
+CUTE_HOST_DEVICE constexpr
+auto
+shape(IntTuple const& s)
+{
+  if constexpr (is_tuple<IntTuple>::value) {
+    return transform(s, [](auto const& a) { return shape(a); });  // tuple: shape is applied to the elements through transform (defined elsewhere)
+  } else {
+    return s;  // non-tuple: returned unchanged
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <int I, int... Is, class IntTuple>
+CUTE_HOST_DEVICE constexpr
+auto
+shape(IntTuple const& s)
+{
+  if constexpr (is_tuple<IntTuple>::value) {
+    return shape<Is...>(get<I>(s));  // tuple: pick element I, then continue with the remaining indices
+  } else {
+    return get<I,Is...>(shape(s));  // non-tuple: take its shape first, then index into that
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+//
+// max
+//
+
+template <class T0, class... Ts>
+CUTE_HOST_DEVICE constexpr
+auto
+max(T0 const& t0, Ts const&... ts)
+{
+  if constexpr (is_tuple<T0>::value) {
+    return cute::max(cute::apply(t0, [](auto const&... a){ return cute::max(a...); }), ts...);  // collapse the leading tuple to one value, then continue with the rest
+  } else if constexpr (sizeof...(Ts) == 0) {
+    return t0;  // a single non-tuple argument is returned as-is
+  } else {
+    return cute::max(t0, cute::max(ts...));  // reduce the tail first; the final two-argument call presumably resolves to a scalar cute::max declared elsewhere -- confirm
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+//
+// min
+//
+
+template <class T0, class... Ts>
+CUTE_HOST_DEVICE constexpr
+auto
+min(T0 const& t0, Ts const&... ts)
+{
+  if constexpr (is_tuple<T0>::value) {
+    return cute::min(cute::apply(t0, [](auto const&... a){ return cute::min(a...); }), ts...);  // collapse the leading tuple to one value, then continue with the rest
+  } else if constexpr (sizeof...(Ts) == 0) {
+    return t0;  // a single non-tuple argument is returned as-is
+  } else {
+    return cute::min(t0, cute::min(ts...));  // reduce the tail first; the final two-argument call presumably resolves to a scalar cute::min declared elsewhere -- confirm
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+//
+// gcd
+//
+
+template <class T0, class... Ts>
+CUTE_HOST_DEVICE constexpr
+auto
+gcd(T0 const& t0, Ts const&... ts)
+{
+  if constexpr (is_tuple<T0>::value) {
+    return cute::gcd(cute::apply(t0, [](auto const&... a){ return cute::gcd(a...); }), ts...);  // collapse the leading tuple to one value, then continue with the rest
+  } else if constexpr (sizeof...(Ts) == 0) {
+    return t0;  // a single non-tuple argument is returned as-is
+  } else {
+    return cute::gcd(t0, cute::gcd(ts...));  // reduce the tail first; the final two-argument call presumably resolves to a scalar cute::gcd declared elsewhere -- confirm
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+//
+// depth
+//
+
+template <int... Is, class IntTuple>
+CUTE_HOST_DEVICE constexpr
+auto
+depth(IntTuple const& t)
+{
+  if constexpr (sizeof...(Is) == 0) {
+    if constexpr (is_tuple<IntTuple>::value) {
+      return Int<1>{} + cute::apply(t, [](auto const&... v){ return cute::max(depth(v)...); });
+    } else {
+      return Int<0>{};
+    }
+  } else {
+    return depth(get<Is...>(t));
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <class Tuple>
+using depth_t = decltype(depth(declval<Tuple>()));
+
+template <class Tuple>
+static constexpr auto depth_v = depth_t<Tuple>::value;
+
+//
+// product
+//
+
+// Implementation of product as a function object
+struct Product
+{
+  template <class IntTuple>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  operator()(IntTuple const& a) const
+  {
+    if constexpr (is_tuple<IntTuple>::value) {
+      if constexpr (tuple_size<IntTuple>::value == 0) {
+        return Int<1>{};
+      } else {
+        return cute::transform_apply(a, Product{}, multiplies_unary_lfold{});
+      }
+    } else if constexpr (cute::is_integral<IntTuple>::value) {
+      return a;
+    }
+
+    CUTE_GCC_UNREACHABLE;
+  }
+};
+// Callable product function object
+CUTE_INLINE_CONSTANT Product product;
+
+// Return a rank(t) tuple @a result such that get<i>(@a result) = product(get<i>(@a t))
+template <class Tuple>
+CUTE_HOST_DEVICE constexpr
+auto
+product_each(Tuple const& t)
+{
+  return transform(wrap(t), product);
+}
+
+// Take the product of Tuple at the leaves of TupleG
+template <class Tuple, class TupleG>
+CUTE_HOST_DEVICE constexpr
+auto
+product_like(Tuple const& tuple, TupleG const& guide)
+{
+  return transform_leaf(guide, tuple, [](auto const& g, auto const& t) { return product(t); });
+}
+
+// Return the product of elements in a mode
+template <int... Is, class IntTuple>
+CUTE_HOST_DEVICE constexpr
+auto
+size(IntTuple const& a)
+{
+  if constexpr (sizeof...(Is) == 0) {
+    return product(a);
+  } else {
+    return size(get<Is...>(a));
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <class IntTuple>
+static constexpr auto size_v = decltype(size(declval<IntTuple>()))::value;
+
+//
+// sum
+//
+
+template <class IntTuple>
+CUTE_HOST_DEVICE constexpr
+auto
+sum(IntTuple const& a)
+{
+  if constexpr (is_tuple<IntTuple>::value) {
+    return cute::apply(a, [](auto const&... v){ return (Int<0>{} + ... + sum(v)); });
+  } else {
+    return a;
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+//
+// inner_product
+//
+
+template <class IntTupleA, class IntTupleB>
+CUTE_HOST_DEVICE constexpr
+auto
+inner_product(IntTupleA const& a, IntTupleB const& b)
+{
+  if constexpr (is_tuple<IntTupleA>::value && is_tuple<IntTupleB>::value) {
+    static_assert(tuple_size<IntTupleA>::value == tuple_size<IntTupleB>::value, "Mismatched ranks");
+    return transform_apply(a, b, [](auto const& x, auto const& y) { return inner_product(x,y); },
+                                 [](auto const&... v) { return (Int<0>{} + ... + v); });
+  } else {
+    return a * b;
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+//
+// ceil_div
+//
+
+template <class IntTupleA, class IntTupleB>
+CUTE_HOST_DEVICE constexpr
+auto
+ceil_div(IntTupleA const& a, IntTupleB const& b)
+{
+  if constexpr (is_tuple<IntTupleA>::value) {
+    if constexpr (is_tuple<IntTupleB>::value) {  // tuple tuple
+      static_assert(tuple_size<IntTupleA>::value >= tuple_size<IntTupleB>::value, "Mismatched ranks");
+      constexpr int R = tuple_size<IntTupleA>::value;        // Missing ranks in TupleB are implicitly 1
+      return transform(a, append<R>(b,Int<1>{}), [](auto const& x, auto const& y) { return ceil_div(x,y); });
+    } else {                                     // tuple int
+      auto const [result, rest] = fold(a, cute::make_tuple(cute::make_tuple(), b),
+        [] (auto const& init, auto const& ai) {
+          return cute::make_tuple(append(get<0>(init), ceil_div(ai, get<1>(init))), ceil_div(get<1>(init), ai));
+        });
+      return result;
+    }
+  } else
+  if constexpr (is_tuple<IntTupleB>::value) {    // int tuple
+    return ceil_div(a, product(b));
+  } else {
+    return (a + b - Int<1>{}) / b;
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+//
+// round_up
+//   Round @a a up to the nearest multiple of @a b.
+//   Uses truncating integer division, so this is only a true round-up for a >= 0 and b > 0.
+//
+
+template <class IntTupleA, class IntTupleB>
+CUTE_HOST_DEVICE constexpr
+auto
+round_up(IntTupleA const& a, IntTupleB const& b)
+{
+  if constexpr (is_tuple<IntTupleA>::value && is_tuple<IntTupleB>::value) {
+    static_assert(tuple_size<IntTupleA>::value >= tuple_size<IntTupleB>::value, "Mismatched ranks");
+    constexpr int R = tuple_size<IntTupleA>::value;        // Missing ranks in TupleB are implicitly 1
+    return transform(a, append<R>(b,Int<1>{}), [](auto const& x, auto const& y) { return round_up(x,y); });
+  } else {
+    return ((a + b - Int<1>{}) / b) * b;
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+/** Division for Shapes
+ * Case Tuple Tuple:
+ *   Perform shape_div element-wise
+ * Case Tuple Int:
+ *   Fold the division of b across each element of a
+ *   Example: shape_div((4,5,6),40) -> shape_div((1,5,6),10) -> shape_div((1,1,6),2) -> (1,1,3)
+ * Case Int Tuple:
+ *   Return shape_div(a, product(b))
+ * Case Int Int:
+ *   Enforce the divisibility condition a % b == 0 || b % a == 0 when possible
+ *   Return a / b with rounding away from 0 (that is, 1 or -1 when a < b)
+ */
+template <class IntTupleA, class IntTupleB>
+CUTE_HOST_DEVICE constexpr
+auto
+shape_div(IntTupleA const& a, IntTupleB const& b)
+{
+  if constexpr (is_tuple<IntTupleA>::value) {
+    if constexpr (is_tuple<IntTupleB>::value) {  // tuple tuple
+      static_assert(tuple_size<IntTupleA>::value == tuple_size<IntTupleB>::value, "Mismatched ranks");
+      return transform(a, b, [](auto const& x, auto const& y) { return shape_div(x,y); });
+    } else {                                     // tuple int
+      auto const [result, rest] = fold(a, cute::make_tuple(cute::make_tuple(), b),
+        [] (auto const& init, auto const& ai) {
+          return cute::make_tuple(append(get<0>(init), shape_div(ai, get<1>(init))), shape_div(get<1>(init), ai));
+        });
+      return result;
+    }
+  } else
+  if constexpr (is_tuple<IntTupleB>::value) {    // int tuple
+    return shape_div(a, product(b));
+  } else
+  if constexpr (is_static<IntTupleA>::value && is_static<IntTupleB>::value) {
+    static_assert(IntTupleA::value % IntTupleB::value == 0 || IntTupleB::value % IntTupleA::value == 0, "Static shape_div failure");
+    return C<shape_div(IntTupleA::value, IntTupleB::value)>{};
+  } else {                                       // int int
+    //assert(a % b == 0 || b % a == 0);          // Waive dynamic assertion
+    return a / b != 0 ? a / b : signum(a) * signum(b);  // Division with rounding away from zero
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+/** Minimum for Shapes
+ */
+template <class IntTupleA, class IntTupleB>
+CUTE_HOST_DEVICE constexpr
+auto
+shape_min(IntTupleA const& a, IntTupleB const& b)
+{
+  if constexpr (is_tuple<IntTupleA>::value || is_tuple<IntTupleB>::value) {
+    static_assert(dependent_false<IntTupleA>, "Not implemented.");
+  } else
+  if constexpr (is_constant<1, IntTupleA>::value || is_constant<1, IntTupleB>::value) {
+    return Int<1>{};            // _1 is less than all other shapes, preserve static
+  } else {
+    return cute::min(a, b);
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+/** Return a tuple the same profile as A scaled by corresponding elements in B
+ */
+template <class A, class B>
+CUTE_HOST_DEVICE constexpr
+auto
+elem_scale(A const& a, B const& b)
+{
+  if constexpr (is_tuple<A>::value) {
+    return transform(a, b, [](auto const& x, auto const& y) { return elem_scale(x,y); });
+  } else {
+    return a * product(b);
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+/** Test if two IntTuple have the same profile (hierarchical rank division)
+ */
+template <class IntTupleA, class IntTupleB>
+CUTE_HOST_DEVICE constexpr
+auto
+congruent(IntTupleA const& a, IntTupleB const& b)
+{
+  return bool_constant<is_same<decltype(repeat_like(shape(a),_0{})),
+                               decltype(repeat_like(shape(b),_0{}))>::value>{};
+}
+
+template <class A, class B>
+using is_congruent = decltype(congruent(declval<A>(), declval<B>()));
+
+/** Test if two IntTuple have similar profiles up to Shape A (hierarchical rank division)
+ * weakly_congruent is a partial order on A and B: A <= B
+ */
+template <class IntTupleA, class IntTupleB>
+CUTE_HOST_DEVICE constexpr
+auto
+weakly_congruent(IntTupleA const& a, IntTupleB const& b)
+{
+  if constexpr (is_tuple<IntTupleA>::value && is_tuple<IntTupleB>::value) {
+    if constexpr (tuple_size<IntTupleA>::value != tuple_size<IntTupleB>::value) {
+      return false_type{};
+    } else {
+      return transform_apply(a, b, [](auto const& x, auto const& y) { return weakly_congruent(x,y); },
+                                   [](auto const&... z) { return (true_type{} && ... && z); });
+    }
+  } else if constexpr (is_integral<IntTupleA>::value) {
+    return true_type{};
+  } else if constexpr (is_integral<IntTupleB>::value) {
+    return false_type{};
+  } else {
+    return weakly_congruent(shape(a), shape(b));
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <class A, class B>
+using is_weakly_congruent = decltype(weakly_congruent(declval<A>(), declval<B>()));
+
+/** Test if Shape A is compatible with Shape B:
+ *    the sizes of A and B are the same, and
+ *    any coordinate into A can also be used as a coordinate into B
+ * Equivalently, the size of Shape B is the same as Shape A at each terminal of Shape A.
+ * compatible is a partial order on A and B: A <= B
+ */
+template <class IntTupleA, class IntTupleB>
+CUTE_HOST_DEVICE constexpr
+auto
+compatible(IntTupleA const& a, IntTupleB const& b)
+{
+  if constexpr (is_tuple<IntTupleA>::value && is_tuple<IntTupleB>::value) {
+    if constexpr (tuple_size<IntTupleA>::value != tuple_size<IntTupleB>::value) {
+      return false_type{};
+    } else {
+      return transform_apply(a, b, [](auto const& x, auto const& y) { return compatible(x,y); },
+                                   [](auto const&... z) { return (true_type{} && ... && z); });
+    }
+  } else if constexpr (is_integral<IntTupleA>::value) {
+    return a == size(b);
+  } else if constexpr (is_integral<IntTupleB>::value) {
+    return false_type{};
+  } else {
+    return compatible(shape(a), shape(b));
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <class A, class B>
+using is_compatible = decltype(compatible(declval<A>(), declval<B>()));
+
+/** Test if Shape A is evenly divided by Tiler B
+ * @returns Static or dynamic boolean
+ * @post if result is true_type, then
+ *       size(a) == logical_divide(make_layout(shape(a)),b) will always compile
+ *       and result in true_type.
+ */
+template <class Shape, class Tiler>
+CUTE_HOST_DEVICE constexpr
+auto
+evenly_divides(Shape const& a, Tiler const& b)
+{
+  if constexpr (is_tuple<Tiler>::value) {
+    if constexpr (rank_v<Tiler> > rank_v<Shape>) {
+      return false_type{};
+    } else {
+      return transform_apply(b, a, [](auto const& x, auto const& y) { return evenly_divides(y,x); },
+                                   [](auto const&... z) { return (true_type{} && ... && z); });
+    }
+  } else {
+    return size(a) == size(b) * size(ceil_div(shape(a), b));
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+/** Replace the elements of Tuple B that are paired with an Int<0> with an Int<1>
+ */
+template <class IntTupleA, class IntTupleB>
+CUTE_HOST_DEVICE constexpr
+auto
+filter_zeros(IntTupleA const& a, IntTupleB const& b)
+{
+  if constexpr (is_tuple<IntTupleA>::value) {
+    return transform(a, b, [](auto const& x, auto const& y) { return filter_zeros(x,y); });
+  } else if constexpr (is_constant<0, IntTupleA>::value) {
+    return repeat_like(b, Int<1>{});
+  } else {
+    return b;
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <class Tuple>
+CUTE_HOST_DEVICE constexpr
+auto
+filter_zeros(Tuple const& t)
+{
+  return filter_zeros(t, t);
+}
+
+//
+// Converters and constructors with arrays and params
+//
+
+/** Make an IntTuple of rank N from an Indexable array.
+ * Access elements up to a dynamic index n, then use init (requires compatible types)
+ * Consider cute::take<B,E> if all indexing is known to be valid
+ * \code
+ *   std::vector<int> a = {6,3,4};
+ *   auto tup = make_int_tuple<5>(a, a.size(), 0);           // (6,3,4,0,0)
+ * \endcode
+ */
+template <int N, class Indexable, class T>
+CUTE_HOST_DEVICE constexpr
+auto
+make_int_tuple(Indexable const& t, int n, T const& init)
+{
+  static_assert(N > 0);
+  if constexpr (N == 1) {
+    return 0 < n ? t[0] : init;
+  } else {
+    return transform(make_seq<N>{}, [&](auto i) { return i < n ? t[i] : init; });
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+/** Fill the dynamic values of a Tuple with values from another Tuple
+ * \code
+ *   auto params = make_tuple(6,3,4);
+ *   cute::tuple<Int<1>, cute::tuple<int, int, Int<3>>, int, Int<2>> result;
+ *   fill_int_tuple_from(result, params);                    // (_1,(6,3,_3),4,_2)
+ * \endcode
+ */
+template <class Tuple, class TupleV>
+CUTE_HOST_DEVICE constexpr
+auto
+fill_int_tuple_from(Tuple& result, TupleV const& vals)
+{
+  return fold(result, vals, [](auto const& init, auto&& r) {
+    if constexpr (is_static<remove_cvref_t<decltype(r)>>::value) {       // Skip static elements of result
+      return init;
+    } else if constexpr (is_tuple<remove_cvref_t<decltype(r)>>::value) { // Recurse into tuples
+      return fill_int_tuple_from(r, init);
+    } else {                                                             // Assign and consume arg
+      static_assert(tuple_size<remove_cvref_t<decltype(init)>>::value > 0, "Not enough values to fill with!");
+      r = get<0>(init);
+      return remove<0>(init);
+    }
+
+    CUTE_GCC_UNREACHABLE;
+  });
+}
+
+/** Make a "Tuple" by filling in the dynamic values in order from the arguments
+ * \code
+ *   using result_t = cute::tuple<Int<1>, cute::tuple<int, int, Int<3>>, int, Int<2>>;
+ *   auto result = make_int_tuple_from<result_t>(6,3,4);     // (_1,(6,3,_3),4,_2)
+ * \endcode
+ */
+template <class Tuple, class... Ts>
+CUTE_HOST_DEVICE constexpr
+Tuple
+make_int_tuple_from(Ts const&... ts)
+{
+  Tuple result = Tuple{};
+  fill_int_tuple_from(result, cute::make_tuple(ts...));
+  return result;
+}
+
+/** Convert a tuple to a flat homogeneous array of type T
+ * \code
+ *   auto tup = cute::make_tuple(Int<1>{}, cute::make_tuple(6,3,Int<3>{}),4,Int<2>{});
+ *   cute::array<uint64_t,6> result = to_array<uint64_t>(tup);   // [1,6,3,3,4,2]
+ * \endcode
+ */
+template <class T = int64_t, class IntTuple>
+CUTE_HOST_DEVICE constexpr
+auto
+to_array(IntTuple const& t)
+{
+  auto flat_t = flatten_to_tuple(t);
+  constexpr int N = tuple_size<decltype(flat_t)>::value;
+  cute::array<T,N> result;
+  for_each(make_seq<N>{}, [&] (auto i) { result[i] = get<i>(flat_t); });
+  return result;
+}
+
+//
+// Comparison operators
+//
+
+//
+// There are many ways to compare tuple of elements and because CuTe is built
+//   on parameterizing layouts of coordinates, some comparisons are appropriate
+//   only in certain cases.
+//  -- lexicographical comparison [reverse, reflected, revref]   : Correct for coords in RowMajor Layout
+//  -- colexicographical comparison [reverse, reflected, revref] : Correct for coords in ColMajor Layout
+//  -- element-wise comparison [any,all]                         :
+// This can be very confusing. To avoid errors in selecting the appropriate
+//   comparison, op<|op<=|op>|op>= are *not* implemented for cute::tuple.
+//
+// When actually desiring to order coordinates, the user should map them to
+//   their indices within the Layout they came from:
+//      e.g.  layoutX(coordA) < layoutX(coordB)
+// That said, we implement the three most common ways to compare tuples below.
+//   These are implemented with slightly more explicit names than op<.
+//
+
+template <class IntTupleA, class IntTupleB>
+CUTE_HOST_DEVICE constexpr
+auto
+lex_less(IntTupleA const& a, IntTupleB const& b);
+
+template <class IntTupleA, class IntTupleB>
+CUTE_HOST_DEVICE constexpr
+auto
+colex_less(IntTupleA const& a, IntTupleB const& b);
+
+template <class IntTupleA, class IntTupleB>
+CUTE_HOST_DEVICE constexpr
+auto
+elem_less(IntTupleA const& a, IntTupleB const& b);
+
+namespace detail {
+
+template <size_t I, class TupleA, class TupleB>
+CUTE_HOST_DEVICE constexpr
+auto
+lex_less_impl(TupleA const& a, TupleB const& b)
+{
+  if constexpr (I == tuple_size<TupleB>::value) {
+    return cute::false_type{};    // Terminal: TupleB is exhausted
+  } else if constexpr (I == tuple_size<TupleA>::value) {
+    return cute::true_type{};     // Terminal: TupleA is exhausted, TupleB is not exhausted
+  } else {
+    return lex_less(get<I>(a), get<I>(b)) || (get<I>(a) == get<I>(b) && lex_less_impl<I+1>(a,b));
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <size_t I, class TupleA, class TupleB>
+CUTE_HOST_DEVICE constexpr
+auto
+colex_less_impl(TupleA const& a, TupleB const& b)
+{
+  if constexpr (I == tuple_size<TupleB>::value) {
+    return cute::false_type{};    // Terminal: TupleB is exhausted
+  } else if constexpr (I == tuple_size<TupleA>::value) {
+    return cute::true_type{};     // Terminal: TupleA is exhausted, TupleB is not exhausted
+  } else {
+    constexpr size_t A = tuple_size<TupleA>::value - 1 - I;
+    constexpr size_t B = tuple_size<TupleB>::value - 1 - I;
+    return colex_less(get<A>(a), get<B>(b)) || (get<A>(a) == get<B>(b) && colex_less_impl<I+1>(a,b));
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <size_t I, class TupleA, class TupleB>
+CUTE_HOST_DEVICE constexpr
+auto
+elem_less_impl(TupleA const& a, TupleB const& b)
+{
+  if constexpr (I == tuple_size<TupleA>::value) {
+    return cute::true_type{};     // Terminal: TupleA is exhausted
+  } else if constexpr (I == tuple_size<TupleB>::value) {
+    return cute::false_type{};    // Terminal: TupleA is not exhausted, TupleB is exhausted
+  } else {
+    return elem_less(get<I>(a), get<I>(b)) && elem_less_impl<I+1>(a,b);
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+} // end namespace detail
+
+// Lexicographical comparison
+
+template <class IntTupleA, class IntTupleB>
+CUTE_HOST_DEVICE constexpr
+auto
+lex_less(IntTupleA const& a, IntTupleB const& b)
+{
+  if constexpr (is_tuple<IntTupleA>::value && is_tuple<IntTupleB>::value) {
+    return detail::lex_less_impl<0>(a, b);
+  } else {
+    return a < b;
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <class T, class U>
+CUTE_HOST_DEVICE constexpr
+auto
+lex_leq(T const& t, U const& u) {
+  return !lex_less(u, t);
+}
+
+template <class T, class U>
+CUTE_HOST_DEVICE constexpr
+auto
+lex_gtr(T const& t, U const& u) {
+  return lex_less(u, t);
+}
+
+template <class T, class U>
+CUTE_HOST_DEVICE constexpr
+auto
+lex_geq(T const& t, U const& u) {
+  return !lex_less(t, u);
+}
+
+// Colexicographical comparison
+
+template <class IntTupleA, class IntTupleB>
+CUTE_HOST_DEVICE constexpr
+auto
+colex_less(IntTupleA const& a, IntTupleB const& b)
+{
+  if constexpr (is_tuple<IntTupleA>::value && is_tuple<IntTupleB>::value) {
+    return detail::colex_less_impl<0>(a, b);
+  } else {
+    return a < b;
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <class T, class U>
+CUTE_HOST_DEVICE constexpr
+auto
+colex_leq(T const& t, U const& u) {
+  return !colex_less(u, t);
+}
+
+template <class T, class U>
+CUTE_HOST_DEVICE constexpr
+auto
+colex_gtr(T const& t, U const& u) {
+  return colex_less(u, t);
+}
+
+template <class T, class U>
+CUTE_HOST_DEVICE constexpr
+auto
+colex_geq(T const& t, U const& u) {
+  return !colex_less(t, u);
+}
+
+// Elementwise [all] comparison
+
+template <class IntTupleA, class IntTupleB>
+CUTE_HOST_DEVICE constexpr
+auto
+elem_less(IntTupleA const& a, IntTupleB const& b)
+{
+  if constexpr (is_tuple<IntTupleA>::value && is_tuple<IntTupleB>::value) {
+    return detail::elem_less_impl<0>(a, b);
+  } else {
+    return a < b;
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <class T, class U>
+CUTE_HOST_DEVICE constexpr
+auto
+elem_leq(T const& t, U const& u) {
+  return !elem_less(u, t);
+}
+
+template <class T, class U>
+CUTE_HOST_DEVICE constexpr
+auto
+elem_gtr(T const& t, U const& u) {
+  return elem_less(u, t);
+}
+
+template <class T, class U>
+CUTE_HOST_DEVICE constexpr
+auto
+elem_geq(T const& t, U const& u) {
+  return !elem_less(t, u);
+}
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/layout.hpp b/lightllm-kernel/cutlass/include/cute/layout.hpp
new file mode 100755
index 000000000..bc1b54efb
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/layout.hpp
@@ -0,0 +1,2058 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>
+#include <cute/int_tuple.hpp>
+#include <cute/stride.hpp>
+#include <cute/underscore.hpp>
+#include <cute/numeric/arithmetic_tuple.hpp>
+#include <cute/numeric/integral_constant.hpp>
+#include <cute/numeric/integral_ratio.hpp>
+#include <cute/numeric/numeric_types.hpp>  // cute::sizeof_bits
+
+namespace cute
+{
+
+// Aliases
+
+template <class... Shapes>
+using Shape = cute::tuple<Shapes...>;
+
+template <class... Strides>
+using Stride = cute::tuple<Strides...>;
+
+template <class... Strides>
+using Step = cute::tuple<Strides...>;
+
+template <class... Coords>
+using Coord = cute::tuple<Coords...>;
+
+template <class... Layouts>
+using Tile = cute::tuple<Layouts...>;
+
+template <class... Ts>
+CUTE_HOST_DEVICE constexpr
+Shape<Ts...>
+make_shape(Ts const&... t) {
+  return {t...};
+}
+template <class... Ts>
+CUTE_HOST_DEVICE constexpr
+Stride<Ts...>
+make_stride(Ts const&... t) {
+  return {t...};
+}
+template <class... Ts>
+CUTE_HOST_DEVICE constexpr
+Step<Ts...>
+make_step(Ts const&... t) {
+  return {t...};
+}
+template <class... Ts>
+CUTE_HOST_DEVICE constexpr
+Coord<Ts...>
+make_coord(Ts const&... t) {
+  return {t...};
+}
+template <class... Ts>
+CUTE_HOST_DEVICE constexpr
+Tile<Ts...>
+make_tile(Ts const&... t)
+{
+  return {t...};
+}
+
+//
+// Layout
+//
+
+template <class Shape, class Stride = LayoutLeft::Apply<Shape> >
+struct Layout
+    : private cute::tuple<Shape, Stride>   // EBO for static layouts
+{
+  // Expensive in compilation time...
+  //static_assert(is_congruent<Shape, Stride>::value, "Shape and Stride must be congruent");
+
+  // NOTE: This defaults static Shapes/Strides correctly, but not dynamic
+  CUTE_HOST_DEVICE constexpr
+  Layout(Shape  const& shape  = {}, Stride const& stride = {})
+      : cute::tuple<Shape, Stride>(shape, stride)
+  {}
+
+  //
+  // Accessors
+  //
+
+  static constexpr int rank  = rank_v<Shape>;
+
+  CUTE_HOST_DEVICE constexpr
+  decltype(auto)
+  layout() {
+    return *this;
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  decltype(auto)
+  layout() const {
+    return *this;
+  }
+
+  template <int... I>
+  CUTE_HOST_DEVICE constexpr
+  decltype(auto)
+  shape() {
+    return get<0,I...>(static_cast<cute::tuple<Shape, Stride>&>(*this));
+  }
+
+  template <int... I>
+  CUTE_HOST_DEVICE constexpr
+  decltype(auto)
+  shape() const {
+    return get<0,I...>(static_cast<cute::tuple<Shape, Stride> const&>(*this));
+  }
+
+  template <int... I>
+  CUTE_HOST_DEVICE constexpr
+  decltype(auto)
+  stride() {
+    return get<1,I...>(static_cast<cute::tuple<Shape, Stride>&>(*this));
+  }
+
+  template <int... I>
+  CUTE_HOST_DEVICE constexpr
+  decltype(auto)
+  stride() const {
+    return get<1,I...>(static_cast<cute::tuple<Shape, Stride> const&>(*this));
+  }
+
+  //
+  // Mappings
+  //
+
+  // Map a logical coordinate to a linear index (Coord has no Underscore slice operators)
+  // OR
+  // Slice the layout and return the sublayout (Coord has an Underscore slice op)
+  template <class Coord>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  operator()(Coord const& coord) const {
+    if constexpr (has_underscore<Coord>::value) {
+      return slice(coord, *this);
+    } else {
+      return crd2idx(coord, shape(), stride());
+    }
+
+    CUTE_GCC_UNREACHABLE;
+  }
+
+  // Convenience function for multi-dimensional coordinates
+  template <class Coord0, class Coord1, class... Coords>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  operator()(Coord0 const& c0, Coord1 const& c1, Coords const&... cs) const {
+    return operator()(make_coord(c0,c1,cs...));
+  }
+
+  //
+  // Compose
+  //
+
+  template <class OtherLayout>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  compose(OtherLayout const& other) const {
+    return composition(*this, other);
+  }
+
+  template <class... Layouts>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  compose(Layouts const&... layouts) const {
+    return composition(*this, make_tile(layouts...));
+  }
+
+  template <class OtherShape>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  with_shape(OtherShape const& shape) const {
+    return composition(*this, make_layout(shape));
+  }
+
+  template <class... Shapes>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  with_shape(Shapes const&... shapes) const {
+    return composition(*this, make_layout(make_shape(shapes...)));
+  }
+
+  //
+  // Tile
+  //
+
+  template <class OtherLayout>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  tile(OtherLayout const& other) const {
+    return tiled_divide(*this, other);
+  }
+
+  template <class... Layouts>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  tile(Layouts const&... layouts) const {
+    return tiled_divide(*this, make_tile(layouts...));
+  }
+
+  //
+  // Utility
+  //
+
+  //
+  // Index to Coordinate
+  //
+
+  // NOTE: Only valid for compact layouts
+
+  // Return the (hierarchical) ND logical coordinate corresponding to the linear index
+  // @post crd2idx(@a result, shape(), stride()) == idx
+  // @post congruent(@a result, shape())
+  template <class IInt,
+            __CUTE_REQUIRES(is_integral<IInt>::value)>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  get_hier_coord(IInt const& idx) const {
+    return cute::idx2crd(idx, shape(), stride());
+  }
+
+  // Return the (flat) ND logical coordinate corresponding to the linear index
+  // @post crd2idx(@a result, shape(), stride()) == idx
+  // @post rank(@a result) == rank(shape()) && depth(@a result) == 1
+  template <class IInt,
+            __CUTE_REQUIRES(is_integral<IInt>::value)>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  get_flat_coord(IInt const& idx) const {
+    return cute::crd2crd(this->get_hier_coord(idx), shape(), repeat<rank>(Int<1>{}));
+  }
+
+  // Return the generalized column-major 1D logical coordinate corresponding to the linear index
+  // @post crd2idx(@a result, shape(), stride()) == idx
+  // @post is_integral<decltype(@a result)>::value
+  template <class IInt,
+            __CUTE_REQUIRES(is_integral<IInt>::value)>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  get_1d_coord(IInt const& idx) const {
+    return cute::crd2idx(this->get_hier_coord(idx), shape());
+  }
+
+  //
+  // Coordinate to Coordinate
+  //
+
+#if 0
+  // Return the (hierarchical) ND logical coordinate corresponding to the coordinate @a crd
+  // @post congruent(@a result, shape())
+  template <class Coord>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  crd_2_hier_coord(Coord const& crd) const {
+    return cute::crd2crd(crd, shape(), shape());
+  }
+
+  // Return the (flat) ND logical coordinate corresponding to the coordinate @a crd
+  // @post rank(@a result) == rank(shape()) && depth(@a result) == 1
+  template <class Coord>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  crd_2_flat_coord(Coord const& crd) const {
+    return cute::crd2crd(crd, shape(), product_each(shape()));
+  }
+
+  // Return the generalized column-major 1D logical coordinate corresponding to the coordinate @a crd
+  // @post is_integral<decltype(@a result)>::value
+  template <class Coord>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  crd_2_1d_coord(Coord const& crd) const {
+    //return cute::crd2crd(crd, shape(), product(shape()));
+    return cute::crd2idx(crd, shape());
+  }
+#endif
+};
+
+// Equality, return a static or dynamic boolean
+template <class ShapeA, class StrideA,
+          class ShapeB, class StrideB>
+CUTE_HOST_DEVICE constexpr
+auto
+operator==(Layout<ShapeA,StrideA> const& layoutA, Layout<ShapeB,StrideB> const& layoutB)
+{
+  return layoutA.shape() == layoutB.shape() && layoutA.stride() == layoutB.stride();
+}
+
+template <class Layout>
+struct is_layout : false_type {};
+template <class Shape, class Stride>
+struct is_layout<Layout<Shape,Stride>> : true_type {};
+
+//
+// Layout construction
+//
+
+template <class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+make_layout(Shape const& shape, Stride const& stride)
+{
+  static_assert(is_tuple<Shape >::value || is_integral<Shape >::value);
+  static_assert(is_tuple<Stride>::value || is_integral<Stride>::value);
+  return Layout<Shape,Stride>(shape, stride);
+}
+
+template <class Shape>
+CUTE_HOST_DEVICE constexpr
+auto
+make_layout(Shape const& shape)
+{
+  static_assert(is_tuple<Shape >::value || is_integral<Shape >::value);
+  return make_layout(shape, compact_major<LayoutLeft>(shape));
+}
+
+//
+// Convenience tags for common layouts
+//
+
+template <class Shape>
+CUTE_HOST_DEVICE constexpr
+auto
+make_layout(Shape const& shape, LayoutLeft)
+{
+  return make_layout(shape, compact_major<LayoutLeft>(shape));
+}
+
+template <class Shape>
+CUTE_HOST_DEVICE constexpr
+auto
+make_layout(Shape const& shape, LayoutRight)
+{
+  return make_layout(shape, compact_major<LayoutRight>(shape));
+}
+
+//
+// Construct a layout from multiple layouts by concatenation
+//
+
+// One argument overload
+template <class Shape0, class Stride0>
+CUTE_HOST_DEVICE constexpr
+auto
+make_layout(Layout<Shape0,Stride0> const& layout0)
+{
+  return make_layout(make_shape (layout0.shape() ),
+                     make_stride(layout0.stride()));
+}
+
+// Two argument overload
+// Concatenates two layouts: their shapes are paired with make_shape and their
+// strides with make_stride, in argument order.
+template <class Shape0, class Stride0,
+          class Shape1, class Stride1>
+CUTE_HOST_DEVICE constexpr
+auto
+make_layout(Layout<Shape0,Stride0> const& layout0,
+            Layout<Shape1,Stride1> const& layout1)
+{
+  auto const joint_shape  = make_shape (layout0.shape() , layout1.shape() );
+  auto const joint_stride = make_stride(layout0.stride(), layout1.stride());
+  return make_layout(joint_shape, joint_stride);
+}
+
+// Var argument overload
+// Concatenates three or more layouts: all shapes are collected with make_shape
+// and all strides with make_stride, in argument order.
+template <class Shape0, class Stride0,
+          class Shape1, class Stride1,
+          class... Shapes, class... Strides>
+CUTE_HOST_DEVICE constexpr
+auto
+make_layout(Layout<Shape0,Stride0> const& layout0,
+            Layout<Shape1,Stride1> const& layout1,
+            Layout<Shapes,Strides> const&... layouts)
+{
+  auto const all_shapes  = make_shape (layout0.shape() , layout1.shape() , layouts.shape()... );
+  auto const all_strides = make_stride(layout0.stride(), layout1.stride(), layouts.stride()...);
+  return make_layout(all_shapes, all_strides);
+}
+
+//
+// Advanced Layout constructions
+//
+
+// Build a compact layout over @a shape whose strides follow the order induced by @a order
+// (the strides are computed by compact_order(shape, order)).
+// Dynamic values in @a order are ignored, considered large, and considered ordered from left to right.
+// Example:
+//   make_ordered_layout(Shape<_2,_2,_2,_2>{}, Step<_0,_2,_3,_1>{})
+//     ->  (_2,_2,_2,_2):(_1,_4,_8,_2)
+//   make_ordered_layout(make_shape(2,3,4,5), make_step(Int<2>{}, 67, 42, Int<50>{}))
+//     -> (2,3,4,5):(_1,10,30,2)
+template <class Shape, class Order>
+CUTE_HOST_DEVICE constexpr
+auto
+make_ordered_layout(Shape const& shape, Order const& order)
+{
+  auto const ordered_stride = compact_order(shape, order);
+  return make_layout(shape, ordered_stride);
+}
+
+// Build a compact layout that has the same shape as @a layout and whose strides
+//   follow the order induced by @a layout.stride().
+// Static-0 strides in the input @a layout are preserved in the output.
+// Example:
+//   make_layout_like(Layout<Shape<_2,_2,_2,_2>, Stride<_0,_2,_4,_1>>{})
+//     ->  (_2,_2,_2,_2):(_0,_2,_4,_1)
+//   make_layout_like(make_layout(make_shape(2,3,4,5), make_stride(Int<0>{},42,Int<1>{},Int<0>{})))
+//     -> (2,3,4,5):(_0,4,_1,_0)
+template <class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+make_layout_like(Layout<Shape,Stride> const& layout)
+{
+  auto const& ref_stride = layout.stride();
+  // Shape passed through filter_zeros against the reference strides
+  auto const filtered_shape = filter_zeros(ref_stride, layout.shape());
+  auto const like_stride    = compact_order(filtered_shape, ref_stride);
+  return make_layout(layout.shape(), like_stride);
+}
+
+// Build a compact layout that has the same shape as @a layout and whose strides
+//   follow the order induced by @a layout.stride(),
+//   except mode-0 is always stride-1 and generated column-major.
+// The 0th mode is commonly used for MMA_Atoms or Copy_Atoms so this
+//   generates the 0th mode with LayoutLeft (preserving stride-0s) regardless of the reference layout
+// Layouts of rank <= 1, or with a non-static Shape, fall back to make_layout(layout.shape()).
+template <class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+make_fragment_like(Layout<Shape,Stride> const& layout)
+{
+  constexpr int R = Layout<Shape,Stride>::rank;
+  if constexpr (R <= 1 || !is_static<Shape>::value) {
+    // Fallback: default layout over the same shape
+    return make_layout(layout.shape());
+  } else {
+    // Mode 0: LayoutLeft strides over the zero-filtered mode-0 shape
+    auto const mode0_stride = compact_major<LayoutLeft>(filter_zeros(get<0>(layout.stride()), get<0>(layout.shape())));
+    auto const mode0_layout = make_layout(get<0>(layout.shape()), mode0_stride);
+    // Remaining modes: ordered by the reference strides
+    auto const rest_layout  = make_ordered_layout(take<1,R>(layout.shape()), take<1,R>(layout.stride()));
+    return tiled_product(mode0_layout, rest_layout);
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+// Shape overload of make_fragment_like: enabled only when Shape is a tuple or an
+// integral, and simply forwards to make_layout(shape).
+template <class Shape,
+          __CUTE_REQUIRES(is_tuple<Shape>::value || is_integral<Shape>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+make_fragment_like(Shape const& shape)
+{
+  return make_layout(shape);
+}
+
+//
+// Make an identity layout that maps a coordinate to itself
+//
+
+// Build a layout over @a shape whose strides are make_basis_like(shape).
+template <class Shape>
+CUTE_HOST_DEVICE constexpr
+auto
+make_identity_layout(Shape const& shape)
+{
+  auto const basis_stride = make_basis_like(shape);
+  return make_layout(shape, basis_stride);
+}
+
+//
+// Operations to manipulate Layouts like a tuple of pairs
+//
+
+// Return the Is...th sublayout.
+// For Is... = <I0,I1,...,IN>, equivalent to get<IN>(...get<I1>(get<I0>(layout)))
+// The same index path is applied to both the shape and the stride.
+template <size_t... Is, class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+get(Layout<Shape,Stride> const& layout)
+{
+  auto const sub_shape  = get<Is...>(layout.shape());
+  auto const sub_stride = get<Is...>(layout.stride());
+  return make_layout(sub_shape, sub_stride);
+}
+
+// Return a new layout with only the modes in the range [B,E)
+// The range must be non-empty and lie within [0, rank]; both are checked at compile time.
+template <int B, int E, class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+take(Layout<Shape,Stride> const& layout)
+{
+  static_assert(B < E, "take: empty range error");
+  static_assert(0 <= B && E <= Layout<Shape,Stride>::rank, "take: range out of bounds");
+
+  auto const taken_shape  = take<B,E>(layout.shape());
+  auto const taken_stride = take<B,E>(layout.stride());
+  return make_layout(taken_shape, taken_stride);
+}
+// Return a new layout with only the modes Is... = <I0,I1,...,IN>
+// The same selection is applied to both the shape and the stride.
+template <int... Is, class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+select(Layout<Shape,Stride> const& layout)
+{
+  auto const picked_shape  = select<Is...>(layout.shape());
+  auto const picked_stride = select<Is...>(layout.stride());
+  return make_layout(picked_shape, picked_stride);
+}
+
+// Return a layout with depth at most 1
+// Built by flattening the shape and the stride independently.
+template <class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+flatten(Layout<Shape,Stride> const& layout)
+{
+  auto const flat_shape  = flatten(layout.shape());
+  auto const flat_stride = flatten(layout.stride());
+  return make_layout(flat_shape, flat_stride);
+}
+
+// Return a layout whose profile is congruent to TargetProfile
+// @pre Input layout is flat, flatten(@a layout) == @a layout
+// @pre Input layout can be folded to profile, rank(@a layout) == rank(flatten(@a target_profile))
+// @post congruent(@a result, @a target_profile)
+// The shape and the stride are each unflattened against the same @a target_profile.
+template <class Shape, class Stride, class TargetProfile>
+CUTE_HOST_DEVICE constexpr
+auto
+unflatten(Layout<Shape,Stride> const& layout, TargetProfile const& target_profile)
+{
+  auto const folded_shape  = unflatten(layout.shape(),  target_profile);
+  auto const folded_stride = unflatten(layout.stride(), target_profile);
+  return make_layout(folded_shape, folded_stride);
+}
+
+//
+// Utilities
+//
+
+// Return the sublayout of mode I...
+// With an empty index pack the input layout itself is returned (by reference, via
+// decltype(auto)); otherwise the result of get<Is...>(layout) is returned.
+template <int... Is, class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+decltype(auto)
+layout(Layout<Shape,Stride> const& layout)
+{
+  if constexpr (sizeof...(Is) != 0) {
+    return get<Is...>(layout);
+  } else {
+    return layout;
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+// Return the shape of a mode
+// Overload for a non-const Layout: forwards to the member layout.template shape<Is...>()
+// and returns exactly what that member returns (decltype(auto)).
+template <int... Is, class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+decltype(auto)
+shape(Layout<Shape,Stride>& layout)
+{
+  return layout.template shape<Is...>();
+}
+
+// Return the shape of a mode (overload for a const Layout).
+// Forwards to the member layout.template shape<Is...>() and returns exactly what
+// that member returns (decltype(auto)).
+template <int... Is, class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+decltype(auto)
+shape(Layout<Shape,Stride> const& layout)
+{
+  return layout.template shape<Is...>();
+}
+
+// Return the stride of a mode
+// Overload for a non-const Layout: forwards to the member layout.template stride<Is...>()
+// and returns exactly what that member returns (decltype(auto)).
+template <int... Is, class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+decltype(auto)
+stride(Layout<Shape,Stride>& layout)
+{
+  return layout.template stride<Is...>();
+}
+
+// Return the stride of a mode (overload for a const Layout).
+// Forwards to the member layout.template stride<Is...>() and returns exactly what
+// that member returns (decltype(auto)).
+template <int... Is, class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+decltype(auto)
+stride(Layout<Shape,Stride> const& layout)
+{
+  return layout.template stride<Is...>();
+}
+
+// Return the number of elements in a mode
+// Computed as size(shape<Is...>(layout)), so the strides do not take part in the result.
+template <int... Is, class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+size(Layout<Shape,Stride> const& layout)
+{
+  return size(shape<Is...>(layout));
+}
+
+// Return the number of modes
+// Computed as rank(shape<Is...>(layout)); with Is... given, the query applies to that sub-shape.
+template <int... Is, class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+rank(Layout<Shape,Stride> const& layout)
+{
+  return rank(shape<Is...>(layout));
+}
+
+// Return the depth of the layout
+// Computed as depth(shape<Is...>(layout)); with Is... given, the query applies to that sub-shape.
+template <int... Is, class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+depth(Layout<Shape,Stride> const& layout)
+{
+  return depth(shape<Is...>(layout));
+}
+
+// Return the codomain shape of a mode
+// @post size(coshape(@a layout)) == cosize(@a layout)
+// @return C Coordinate with smallest elements such that
+//           @a elem_less(sub_layout(c), C) for all c < size(@a sub_layout)
+//           where sub_layout = get<Is...>(layout).
+template <int... Is, class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+coshape(Layout<Shape,Stride> const& layout)
+{
+  // Protect against negative strides: work on a copy of the mode with abs() applied to every stride
+  auto abs_strides = transform_leaf(stride<Is...>(layout), abs_fn{});
+  auto abs_layout  = make_layout(shape<Is...>(layout), abs_strides);
+
+  // Image of the last index of the mode, then one past it in every component
+  auto last_index = size(abs_layout) - Int<1>{};
+  auto max_coord  = as_arithmetic_tuple(abs_layout(last_index));
+  return max_coord + repeat_like(max_coord, Int<1>{});
+}
+
+// Return the codomain size of a mode
+// @return M smallest integer such that
+//           @a sub_layout(c) < M for all c < size(@a sub_layout)
+//           where sub_layout = get<Is...>(layout).
+// Computed as size(coshape<Is...>(layout)).
+template <int... Is, class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+cosize(Layout<Shape,Stride> const& layout)
+{
+  return size(coshape<Is...>(layout));
+}
+
+// Type returned by cosize() when called on a value of type Layout.
+template <class Layout>
+using cosize_t = decltype(cosize(declval<Layout>()));
+
+// The ::value member of cosize_t<Layout>; only usable when that type provides one.
+template <class Layout>
+static constexpr auto cosize_v = cosize_t<Layout>::value;
+
+// With crd2idx(coord, shape), makes sense to have crd2idx(coord, Layout) as well
+// Forwards to crd2idx(c, layout.shape(), layout.stride()).
+template <class Coord, class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+crd2idx(Coord const& c, Layout<Shape,Stride> const& layout)
+{
+  return crd2idx(c, layout.shape(), layout.stride());
+}
+
+//
+// Slice and Dice a layout
+//
+
+// Slice a layout with coordinate @a c: the shape and the stride are each sliced
+// with the same coordinate and recombined into a layout.
+template <class Coord, class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+slice(Coord const& c, Layout<Shape,Stride> const& layout)
+{
+  auto const sliced_shape  = slice(c, layout.shape());
+  auto const sliced_stride = slice(c, layout.stride());
+  return make_layout(sliced_shape, sliced_stride);
+}
+
+// Return the tuple (slice(c, layout), crd2idx(c, layout)): the sliced layout
+// together with the index that @a c maps to.
+template <class Coord, class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+slice_and_offset(Coord const& c, Layout<Shape,Stride> const& layout)
+{
+  auto const sliced = slice(c, layout);
+  auto const offset = crd2idx(c, layout);
+  return cute::make_tuple(sliced, offset);
+}
+
+// Dice a layout with coordinate @a c: the shape and the stride are each diced
+// with the same coordinate and recombined into a layout.
+template <class Coord, class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+dice(Coord const& c, Layout<Shape,Stride> const& layout)
+{
+  auto const diced_shape  = dice(c, layout.shape());
+  auto const diced_stride = dice(c, layout.stride());
+  return make_layout(diced_shape, diced_stride);
+}
+
+// Compute a pointer offset and (potentially modified) layout from a coordinate
+// This exists so it can be overloaded for ComposedLayout
+// For a plain Layout the layout is returned unchanged, paired with layout(coord).
+template <class Coord, class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+domain_offset(Coord const& coord, Layout<Shape,Stride> const& layout)
+{
+  auto const offset = layout(coord);
+  return cute::make_tuple(layout, offset);
+}
+
+//
+// Transform the modes of a layout
+//
+
+namespace detail {
+
+// Unary helper: apply @a f to get<I>(t) for every I in the index sequence and
+// pass all results to make_layout.
+template <class Tuple, class F, int... I>
+CUTE_HOST_DEVICE constexpr
+auto
+transform_layout(Tuple const& t, F&& f, seq<I...>)
+{
+  return make_layout(f(get<I>(t))...);
+}
+
+// Binary helper: apply @a f to the pairs (get<I>(t0), get<I>(t1)) for every I in the
+// first sequence, then pass through the elements of @a t0 selected by I0... and the
+// elements of @a t1 selected by I1... untouched. Everything is handed to make_layout
+// in that order.
+template <class Tuple0, class Tuple1, class F, int... I, int... I0, int... I1>
+CUTE_HOST_DEVICE constexpr
+auto
+transform_layout(Tuple0 const& t0, Tuple1 const& t1, F&& f, seq<I...>, seq<I0...>, seq<I1...>)
+{
+  return make_layout(f(get<I>(t0),get<I>(t1))..., get<I0>(t0)..., get<I1>(t1)...);
+}
+
+} // end namespace detail
+
+// Apply @a f to every mode of @a t (indices 0 .. rank(t)-1) and build a layout
+// from the results.
+template <class Tuple, class F>
+CUTE_HOST_DEVICE constexpr
+auto
+transform_layout(Tuple const& t, F&& f)
+{
+  constexpr int R = decltype(rank(t))::value;
+  using Indices = make_seq<R>;
+  return detail::transform_layout(t, f, Indices{});
+}
+
+// Apply @a f pairwise to the leading min(rank(t0), rank(t1)) modes of @a t0 and @a t1;
+// the trailing modes of the longer argument are passed through unchanged.
+template <class Tuple0, class Tuple1, class F>
+CUTE_HOST_DEVICE constexpr
+auto
+transform_layout(Tuple0 const& t0, Tuple1 const& t1, F&& f)
+{
+  constexpr int Rank0  = decltype(rank(t0))::value;
+  constexpr int Rank1  = decltype(rank(t1))::value;
+  constexpr int Common = (Rank1 < Rank0) ? Rank1 : Rank0;   // number of paired modes
+
+  using Paired = make_seq<Common>;
+  using Tail0  = make_range<Common,Rank0>;                  // leftover modes of t0
+  using Tail1  = make_range<Common,Rank1>;                  // leftover modes of t1
+  return detail::transform_layout(t0, t1, f, Paired{}, Tail0{}, Tail1{});
+}
+
+//
+// Coalesce and Filter
+//
+
+namespace detail {
+
+// Look at each element and the front of the stack (in order of priority)
+// front(NewLayout)  get<I>(Layout)
+//      s0:d0           _1:d1     =>  continue
+//      _1:d0           s1:d1     =>  replace_front    s1:d1
+//      s0:s1*d1        s1:d1     =>  replace_front s0*s1:d1
+//      s0:d0           s1:d1     =>  prepend          s1:d1
+//
+// Recurses over the old modes from index I down to 0 (I == -1 is the base case),
+// accumulating the coalesced result in (new_shape, new_stride).
+//
+// @pre OldShape and OldStride are flat
+template <int I, class OldShape, class OldStride, class NewShape, class NewStride>
+CUTE_HOST_DEVICE constexpr
+auto
+bw_coalesce(OldShape const& old_shape, OldStride const& old_stride,
+            NewShape const& new_shape, NewStride const& new_stride)
+{
+  if constexpr (I == -1) {
+    // Base case, we're done
+    if constexpr (is_constant<1, NewShape>::value) {
+      return Layout<_1,_0>{};
+    } else {
+      return Layout<NewShape,NewStride>{new_shape,new_stride};
+    }
+  } else if constexpr (is_constant<1, decltype(get<I>(old_shape))>::value) {
+    // shape<I>(layout) == _1, skip it and continue
+    return bw_coalesce<I-1>(old_shape, old_stride, new_shape, new_stride);
+  } else if constexpr (is_constant<1, NewShape>::value) {
+    // Replace our shape-1 with anything (Can only happen on input new_shape/new_stride)
+    return bw_coalesce<I-1>(old_shape, old_stride, get<I>(old_shape), get<I>(old_stride));
+  } else if constexpr (is_static<decltype(get<0>(new_shape))>::value &&
+                       is_constant<true, decltype(get<I>(old_shape) * get<I>(old_stride) == get<0>(new_stride))>::value) {
+    // Merge modes because the shapes and strides match
+    return bw_coalesce<I-1>(old_shape, old_stride,
+                            replace_front(new_shape,  get<I>(old_shape) * get<0>(new_shape)),
+                            replace_front(new_stride, get<I>(old_stride)));
+  } else {
+    // Can't replace or merge, so prepend a new mode
+    return bw_coalesce<I-1>(old_shape, old_stride,
+                            prepend(new_shape,  get<I>(old_shape)),
+                            prepend(new_stride, get<I>(old_stride)));
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+// cute::coalesce promises to not change the Layout as a function from integers to codomain.
+// It accomplishes this inside of the Layout's domain, but not always outside of the domain.
+//   Example: (_4,_1):(_1,_0) coalesces to _4:_1.
+// detail::coalesce_x preserves the Layout function inside its domain and outside.
+//
+// When the last flattened mode has static shape 1, the accumulator is seeded with
+// Int<2>{} in place of that shape (keeping the mode's stride), so bw_coalesce does not
+// take its "replace our shape-1" branch and drop that stride.
+//
+// @post depth(@a result) <= 1
+// @post for all i, 0 <= i, @a layout(i) == @a result(i)
+template <class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+coalesce_x(Layout<Shape,Stride> const& layout)
+{
+  auto flat_shape  = flatten(layout.shape());
+  auto flat_stride = flatten(layout.stride());
+
+  constexpr int R = decltype(rank(flat_shape))::value;
+  if constexpr (is_constant<1, decltype(get<R-1>(flat_shape))>::value) {
+    return detail::bw_coalesce<R-2>(flat_shape, flat_stride,             Int<2>{}, get<R-1>(flat_stride));
+  } else {
+    return detail::bw_coalesce<R-2>(flat_shape, flat_stride, get<R-1>(flat_shape), get<R-1>(flat_stride));
+  }
+
+  // Same guard every other if-constexpr function in this namespace ends with.
+  CUTE_GCC_UNREACHABLE;
+}
+
+// Apply coalesce_x at the terminals of trg_profile
+// A tuple profile recurses mode-by-mode (its size must not exceed the layout's rank);
+// any non-tuple profile coalesces the whole layout with coalesce_x(layout).
+template <class Shape, class Stride, class IntTuple>
+CUTE_HOST_DEVICE constexpr
+auto
+coalesce_x(Layout<Shape,Stride> const& layout, IntTuple const& trg_profile)
+{
+  if constexpr (is_tuple<IntTuple>::value) {
+    static_assert(tuple_size<IntTuple>::value <= Layout<Shape,Stride>::rank);
+    return cute::transform_layout(layout, trg_profile, [](auto const& l, auto const& t) { return coalesce_x(l,t); });
+  } else {
+    return coalesce_x(layout);
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+} // end namespace detail
+
+// "Simplify" the layout by combining modes that are possible to combine
+// Does not respect the shape of the layout, but does preserve total size
+// @post size(@a result) == size(@a layout)
+// @post depth(@a result) <= 1
+// @post for all i, 0 <= i < size(@a layout), @a layout(i) == @a result(i)
+template <class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+coalesce(Layout<Shape,Stride> const& layout)
+{
+  // Work on the flattened shape/stride
+  auto shapes  = flatten(layout.shape());
+  auto strides = flatten(layout.stride());
+
+  // Seed the accumulator with the last mode, then walk the remaining modes backwards
+  constexpr int Last = decltype(rank(shapes))::value - 1;
+  return detail::bw_coalesce<Last-1>(shapes, strides, get<Last>(shapes), get<Last>(strides));
+}
+
+// Apply coalesce at the terminals of trg_profile
+// A non-tuple profile coalesces the whole layout; a tuple profile recurses
+// mode-by-mode and must not have more elements than the layout has modes.
+template <class Shape, class Stride, class IntTuple>
+CUTE_HOST_DEVICE constexpr
+auto
+coalesce(Layout<Shape,Stride> const& layout, IntTuple const& trg_profile)
+{
+  if constexpr (!is_tuple<IntTuple>::value) {
+    return coalesce(layout);
+  } else {
+    static_assert(tuple_size<IntTuple>::value <= Layout<Shape,Stride>::rank);
+    return transform_layout(layout, trg_profile, [](auto const& l, auto const& t) { return coalesce(l,t); });
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+// Combine static and dynamic modes of a shape.
+// Adjacent entries of the flattened shape are multiplied together when both are
+// static or both are dynamic; otherwise a new entry is started.
+// @post size(@a result) == size(@a shape)
+// @post depth(@a result) <= 1
+template <class Shape>
+CUTE_HOST_DEVICE constexpr
+auto
+coalesce(Shape const& shape)
+{
+  static_assert(is_integral<Shape>::value || is_tuple<Shape>::value);
+
+  return cute::fold_first(flatten(shape), [](auto const& acc, auto const& next) {
+    if constexpr (is_static<decltype(back(acc))>::value != is_static<decltype(next)>::value) {
+      return append(acc, next);                        // Mixed static/dynamic, can't coalesce
+    } else {
+      return replace_back(acc, back(acc) * next);      // Same kind, fold into the last entry
+    }
+  });
+}
+
+// Replace the modes in layout that have a 0-stride with a 1-size
+// The strides themselves are kept as they are.
+template <class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+filter_zeros(Layout<Shape,Stride> const& layout)
+{
+  auto const& strides = layout.stride();
+  auto const filtered_shape = filter_zeros(strides, layout.shape());
+  return make_layout(filtered_shape, strides);
+}
+
+// Replace the modes in layout that correspond to a 0 at the terminals of trg_profile with a 1-size
+// The strides themselves are kept as they are.
+template <class Shape, class Stride, class IntTuple>
+CUTE_HOST_DEVICE constexpr
+auto
+filter_zeros(Layout<Shape,Stride> const& layout, IntTuple const& trg_profile)
+{
+  auto const filtered_shape = filter_zeros(trg_profile, layout.shape());
+  return make_layout(filtered_shape, layout.stride());
+}
+
+// Remove all of the 0-strides and 1-sizes
+// Return 1-shape if empty
+// Implemented as coalesce(filter_zeros(layout)): 0-stride modes are first turned
+// into size-1 modes, which coalesce then skips.
+template <class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+filter(Layout<Shape,Stride> const& layout)
+{
+  return coalesce(filter_zeros(layout));
+}
+
+// Apply filter at the terminals of trg_profile
+// A non-tuple profile filters the whole layout; a tuple profile recurses
+// mode-by-mode and must not have more elements than the layout has modes.
+template <class Shape, class Stride, class IntTuple>
+CUTE_HOST_DEVICE constexpr
+auto
+filter(Layout<Shape,Stride> const& layout, IntTuple const& trg_profile)
+{
+  if constexpr (!is_tuple<IntTuple>::value) {
+    return filter(layout);
+  } else {
+    static_assert(tuple_size<IntTuple>::value <= Layout<Shape,Stride>::rank);
+    return transform_layout(layout, trg_profile, [](auto const& l, auto const& t) { return filter(l,t); });
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+//
+// Append, Prepend, Replace
+//
+
+// Rank-parameterized append: forwards to append<N> on the shape and on the stride,
+// using the shape/stride of @a x (default Layout<_1,_0>) as the appended element.
+template <int N, class ShapeA, class StrideA, class ShapeX = _1, class StrideX = _0>
+CUTE_HOST_DEVICE constexpr
+auto
+append(Layout<ShapeA,StrideA> const& layout,
+       Layout<ShapeX,StrideX> const& x = {})
+{
+  auto const out_shape  = append<N>(layout.shape(),  x.shape());
+  auto const out_stride = append<N>(layout.stride(), x.stride());
+  return make_layout(out_shape, out_stride);
+}
+
+// Append the mode @a x (default Layout<_1,_0>) to @a layout by appending its
+// shape and stride to the layout's shape and stride.
+template <class ShapeA, class StrideA, class ShapeX = _1, class StrideX = _0>
+CUTE_HOST_DEVICE constexpr
+auto
+append(Layout<ShapeA,StrideA> const& layout,
+       Layout<ShapeX,StrideX> const& x = {})
+{
+  auto const out_shape  = append(layout.shape(),  x.shape());
+  auto const out_stride = append(layout.stride(), x.stride());
+  return make_layout(out_shape, out_stride);
+}
+
+// Rank-parameterized prepend: forwards to prepend<N> on the shape and on the stride,
+// using the shape/stride of @a x (default Layout<_1,_0>) as the prepended element.
+template <int N, class ShapeA, class StrideA, class ShapeX = _1, class StrideX = _0>
+CUTE_HOST_DEVICE constexpr
+auto
+prepend(Layout<ShapeA,StrideA> const& layout,
+        Layout<ShapeX,StrideX> const& x = {})
+{
+  auto const out_shape  = prepend<N>(layout.shape(),  x.shape());
+  auto const out_stride = prepend<N>(layout.stride(), x.stride());
+  return make_layout(out_shape, out_stride);
+}
+
+// Prepend the mode @a x (default Layout<_1,_0>) to @a layout by prepending its
+// shape and stride to the layout's shape and stride.
+template <class ShapeA, class StrideA, class ShapeX = _1, class StrideX = _0>
+CUTE_HOST_DEVICE constexpr
+auto
+prepend(Layout<ShapeA,StrideA> const& layout,
+        Layout<ShapeX,StrideX> const& x = {})
+{
+  auto const out_shape  = prepend(layout.shape(),  x.shape());
+  auto const out_stride = prepend(layout.stride(), x.stride());
+  return make_layout(out_shape, out_stride);
+}
+
+// Replace mode N of @a layout with the mode @a x: replace<N> is applied to the
+// shape and to the stride with the shape and stride of @a x.
+template <int N, class ShapeA, class StrideA, class ShapeX, class StrideX>
+CUTE_HOST_DEVICE constexpr
+auto
+replace(Layout<ShapeA,StrideA> const& layout,
+        Layout<ShapeX,StrideX> const& x)
+{
+  auto const out_shape  = replace<N>(layout.shape(),  x.shape());
+  auto const out_stride = replace<N>(layout.stride(), x.stride());
+  return make_layout(out_shape, out_stride);
+}
+
+// Apply group<B,E> to both the shape and the stride of @a layout and rebuild
+// a layout from the grouped pair.
+template <int B, int E, class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+group(Layout<Shape,Stride> const& layout)
+{
+  auto const grouped_shape  = group<B,E>(layout.shape());
+  auto const grouped_stride = group<B,E>(layout.stride());
+  return make_layout(grouped_shape, grouped_stride);
+}
+
+//
+// Composition of two layouts: lhs o rhs
+// @post compatible(rhs, result)
+// @post result(c) = lhs(rhs(c))
+//         for all c in the domain of rhs
+//
+
+namespace detail {
+
+// Compose the layout (lhs_shape:lhs_stride) with the layout (rhs_shape:rhs_stride).
+// The branch is chosen at compile time from the form of the arguments, in this order:
+//   1. tuple RShape            -> compose with every (shape, stride) pair of rhs separately
+//   2. ScaledBasis RStride     -> pick the lhs mode with basis_get, recurse with basis_value
+//   3. static stride-0 RStride -> the result is rhs itself
+//   4. integral LShape         -> rhs shape with stride rhs_stride * lhs_stride
+//   5. static stride-1 RStride -> only "mod out" rhs_shape from lhs_shape
+//   6. otherwise               -> "divide out" rhs_stride, then "mod out" rhs_shape
+// The composition() entry points in this file pass a lhs that went through detail::coalesce_x.
+template <class LShape, class LStride,
+          class RShape, class RStride>
+CUTE_HOST_DEVICE constexpr
+auto
+composition_impl(LShape const& lhs_shape, LStride const& lhs_stride,
+                 RShape const& rhs_shape, RStride const& rhs_stride)
+{
+  if constexpr (is_tuple<RShape>::value) {
+    // Apply the right-distributivity of Layout composition
+    return transform_layout(rhs_shape, rhs_stride, [&](auto const& s, auto const& d) {
+      return composition_impl(lhs_shape, lhs_stride, s, d);
+    });
+  } else
+  if constexpr (is_scaled_basis<RStride>::value) {
+    // Special case for a ScaledBasis stride
+    return composition_impl(basis_get(rhs_stride, lhs_shape), basis_get(rhs_stride, lhs_stride),
+                            rhs_shape, basis_value(rhs_stride));
+  } else
+  if constexpr (is_constant<0, RStride>::value) {
+    // Special case shortcut for any static stride-0
+    return Layout<RShape, RStride>{rhs_shape, rhs_stride};
+  } else
+  if constexpr (is_integral<decltype(lhs_shape)>::value) {
+    // Special case shortcut for any integral LShape
+    return Layout{rhs_shape, rhs_stride * lhs_stride};
+  } else
+  if constexpr (is_constant<1, RStride>::value) {
+    // Special case shortcut for any static stride-1
+    constexpr int R  = rank_v<LShape>;
+    auto result_shape_0  = take<0,R-1>(lhs_shape);
+
+    // Mod out the rhs_shape from the lhs_shape
+    // The fold state is (shapes built so far, remaining rhs shape).
+    auto const [result_shape_1, rest_shape]  = fold(result_shape_0, cute::make_tuple(cute::make_tuple(), rhs_shape),
+      [] (auto const& init, auto const& si) {
+        return cute::make_tuple(append(get<0>(init), shape_min(abs(si), get<1>(init))), shape_div(get<1>(init), abs(si)));
+      });
+
+    // Jump into coalesce and append (rest_shape, get<R-1>(lhs_stride))
+    return detail::bw_coalesce<R-2>(result_shape_1, lhs_stride, rest_shape, get<R-1>(lhs_stride));
+  } else {
+    // General case: integral RShape and RStride, tuple LShape and LStride
+    constexpr int R  = rank_v<LShape>;
+    auto result_shape_0  = take<0,R-1>(lhs_shape);
+    auto result_stride_0 = take<0,R-1>(lhs_stride);
+
+    // Divide out the rhs_stride from the lhs_shape
+    // The fold state is (shapes built so far, remaining rhs stride).
+    auto const [result_shape_1, rest_stride] = fold(result_shape_0, cute::make_tuple(cute::make_tuple(), rhs_stride),
+      [] (auto const& init, auto const& di) {
+        return cute::make_tuple(append(get<0>(init), shape_div(di, get<1>(init))), shape_div(get<1>(init), di));
+      });
+
+    // Apply any lhs_shape changes to the stride
+    auto result_stride_1 = elem_scale(result_stride_0, shape_div(result_shape_0, result_shape_1));
+
+    // Mod out the rhs_shape from the lhs_shape
+    // The fold state is (shapes built so far, remaining rhs shape).
+    auto const [result_shape_2, rest_shape] = fold(result_shape_1, cute::make_tuple(cute::make_tuple(), rhs_shape),
+      [] (auto const& init, auto const& si) {
+        return cute::make_tuple(append(get<0>(init), shape_min(abs(si), get<1>(init))), shape_div(get<1>(init), abs(si)));
+      });
+
+    // Jump into coalesce and append (rest_shape, rest_stride * get<R-1>(lhs_stride))
+    return detail::bw_coalesce<R-2>(result_shape_2, result_stride_1, rest_shape, rest_stride * get<R-1>(lhs_stride));
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+} // end namespace detail
+
+// Layout o Layout: coalesce @a lhs against a profile shaped like coshape(rhs)
+// (filled with Int<0>), then compose the coalesced lhs with the shape/stride of @a rhs.
+template <class LShape, class LStride,
+          class RShape, class RStride>
+CUTE_HOST_DEVICE constexpr
+auto
+composition(Layout<LShape,LStride> const& lhs,
+            Layout<RShape,RStride> const& rhs)
+{
+  using RhsCoShape = decltype(coshape(rhs));
+  auto zero_profile  = repeat_like(RhsCoShape{}, Int<0>{});
+  auto coalesced_lhs = detail::coalesce_x(lhs, zero_profile);
+  return detail::composition_impl(coalesced_lhs.shape(), coalesced_lhs.stride(),
+                                  rhs.shape(), rhs.stride());
+}
+
+// Layout o Tiler, dispatched at compile time on the kind of @a rhs:
+//   - tuple:      compose mode-by-mode with the first tuple_size<Tiler> modes of @a lhs
+//                 (the tiler must not have more elements than lhs has modes)
+//   - Underscore: return @a lhs unchanged
+//   - integral:   compose the coalesced lhs with the layout rhs:_1
+// No branch handles any other Tiler type.
+template <class LShape, class LStride, class Tiler>
+CUTE_HOST_DEVICE constexpr
+auto
+composition(Layout<LShape,LStride> const& lhs,
+            Tiler                  const& rhs)
+{
+  if constexpr (is_tuple<Tiler>::value) {
+    static_assert(tuple_size<Tiler>::value <= Layout<LShape,LStride>::rank);
+    // Drop any modes of lhs that aren't hit by rhs
+    // (the two empty seq<> arguments mean no trailing modes are passed through)
+    return detail::transform_layout(lhs, rhs, [](auto const& l, auto const& r) { return composition(l,r); }, make_seq<tuple_size<Tiler>::value>{}, seq<>{}, seq<>{});
+  } else if constexpr (is_underscore<Tiler>::value) {
+    return lhs;
+  } else if constexpr (is_integral<Tiler>::value) {
+    auto flat_lhs = detail::coalesce_x(lhs);
+    return detail::composition_impl(flat_lhs.shape(), flat_lhs.stride(), rhs, Int<1>{});
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+//
+// Complement
+//
+// Build the complement of a layout with respect to a codomain target @a cotarget.
+// @post size(@a result) >= size(@a cotarget) / size(filter(@a layout))
+// @post For all i in [1,size(@a result)),
+//           @a result(i-1) < @a result(i)
+//           For all j in [0, size(@a layout)),
+//               @a result(i) != @a layout(j)
+//
+
+namespace detail {
+
+// Build the complement of the layout (shape:stride) up to @a cotarget.
+// A static stride-0 input returns a default layout over coalesce(cotarget). Otherwise the
+// modes are consumed in order of increasing stride: each step emits a result mode of
+// shape (min_stride / previous result stride) and records (min_stride * its shape) as the
+// next result stride. A final "rest" mode of shape ceil_div(cotarget, last stride) is appended.
+// A static_assert fires if any emitted shape is a static 0 (non-injective input).
+//
+// @pre @a layout has been filtered (flattened and no stride-0 or size-1 modes).
+template <class Shape, class Stride, class CoTarget>
+CUTE_HOST_DEVICE constexpr
+auto
+complement(Shape const& shape, Stride const& stride, CoTarget const& cotarget)
+{
+  if constexpr (is_constant<0, Stride>::value) {
+    // Special case for irreducible rank-1 stride-0 layout
+    return make_layout(coalesce(cotarget));
+  } else {
+    // General case
+    constexpr int R = rank_v<Shape>;
+    static_assert(R == 1 || is_static<Stride>::value,
+                  "Dynamic-stride complement only for rank-1 layouts");
+
+    // Should just be a sort and a fold...
+    // Then we could even handle dynamic strides (but they would destroy all static strides)
+    // Fold state: (remaining shape, remaining stride, result shape, result stride);
+    // the result stride starts as (_1) and runs one entry ahead of the result shape.
+    auto [shape_, stride_, result_shape_, result_stride] =
+      fold(make_seq<R-1>{},
+           cute::make_tuple(shape, stride, cute::make_tuple(), cute::make_tuple(Int<1>{})),
+           [](auto const& init, auto i)
+           {
+              auto [shape, stride, result_shape, result_stride] = init;
+              auto min_stride = cute::min(stride);
+              auto min_idx    = cute::find(stride, min_stride);
+              auto new_shape  = min_stride / get<i>(result_stride);
+              auto new_stride = min_stride * get<min_idx>(shape);
+              static_assert(not is_constant<0, decltype(new_shape)>::value, "Non-injective Layout detected in complement.");
+
+              return cute::make_tuple(remove<min_idx>(shape),              // Remove the min_idx from shape
+                                      remove<min_idx>(stride),             // Remove the min_idx from stride
+                                      append(result_shape , new_shape ),   // new shape  = min_stride / last_stride
+                                      append(result_stride, new_stride));  // new stride = min_stride * curr_shape
+            });
+
+    // Append the last shape mode
+    // (after R-1 fold steps exactly one input mode is left in shape_/stride_)
+    auto new_shape    = get<0>(stride_) / get<R-1>(result_stride);         // new shape  = min_stride / last_stride
+    static_assert(not is_constant<0, decltype(new_shape)>::value, "Non-injective Layout detected in complement.");
+    auto result_shape = append(result_shape_, new_shape);
+
+    // Compute the rest_shape and rest_stride
+    auto new_stride  = get<0>(stride_) * get<0>(shape_);                   // new stride = min_stride * curr_shape
+    auto rest_shape  = coalesce(ceil_div(cotarget, new_stride));
+    auto rest_stride = compact_major<LayoutLeft>(rest_shape, new_stride);
+
+    // Coalesce and append (rest_shape, rest_stride)
+    return coalesce(make_layout(make_shape (result_shape , rest_shape ),
+                                make_stride(result_stride, rest_stride)));
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+} // end namespace detail
+
+// Complement of @a layout with respect to @a cotarget: the layout is filtered
+// first, and shape(cotarget) is used as the codomain target.
+template <class Shape, class Stride, class CoTarget>
+CUTE_HOST_DEVICE constexpr
+auto
+complement(Layout<Shape,Stride> const& layout, CoTarget const& cotarget)
+{
+  auto filtered = filter(layout);
+  return detail::complement(filtered.shape(),
+                            filtered.stride(),
+                            shape(cotarget));
+}
+
+// Complement of @a layout with respect to its own codomain: the layout is
+// filtered first, and cosize of the filtered layout is used as the codomain target.
+template <class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+complement(Layout<Shape,Stride> const& layout)
+{
+  auto filtered = filter(layout);
+  return detail::complement(filtered.shape(),
+                            filtered.stride(),
+                            cosize(filtered));
+}
+
+//
+// Right-Inverse and Left-Inverse
+//
+
+namespace detail {
+
+// Build the sequence of mode indices visited when following strides, starting from the
+// static stride NextStride. Each step looks for a mode whose stride is the static
+// constant NextStride, appends its index to seq<Is...>, and continues with
+// (shape * stride) of that mode as the next stride to search for.
+// The walk stops when no mode has the wanted stride, or when the next stride is
+// dynamic or equal to the current one.
+template <int NextStride, class Shape, class Stride, int... Is>
+CUTE_HOST_DEVICE constexpr
+auto
+inverse_seq(Shape const& shape, Stride const& stride, seq<Is...>)
+{
+  auto next_I = cute::find_if(stride, [](auto a) { return is_constant<NextStride, decltype(a)>{}; });
+
+  // The "not found" test below compares next_I against rank(stride).
+  if constexpr (next_I == decltype(rank(stride))::value) {
+    // If not found, return current seq
+    return seq<Is...>{};
+  } else {
+    // auto next_stride = get<next_I>(shape) * get<next_I>(stride);
+    // NOTE: Needed for g++-7
+    using next_stride = decltype(get<next_I>(shape) * get<next_I>(stride));
+
+    if constexpr (is_static<next_stride>::value && !is_constant<NextStride, next_stride>::value) {
+      // If next_stride is static and unique, then continue
+      return inverse_seq<next_stride::value>(shape, stride, seq<Is..., next_I>{});
+    } else {
+      // Else return current seq + next_I
+      return seq<Is..., next_I>{};
+    }
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+} // end namespace detail
+
+//
+// Build the right-inverse of a layout
+// @pre is_static<Layout>
+// @result A layout @a result such that
+//    @a layout(@a result(i)) == i for all i < size(@a result)
+// @result A layout @a result such that
+//    composition(@a layout, @a result) is identical to make_layout(shape(result))
+//
+
+template <class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+right_inverse(Layout<Shape,Stride> const& layout)
+{
+  auto coalesced  = coalesce(layout);
+  auto abs_stride = transform_leaf(coalesced.stride(), abs_fn{});
+
+  // Starting at stride Int<1>{}, follow the (absolute) strides to get the order of modes
+  [[maybe_unused]] auto mode_order = detail::inverse_seq<1>(coalesced.shape(), abs_stride, seq<>{});
+
+  if constexpr (mode_order.size() != 0) {
+    // Strides of a LayoutLeft-compact layout over the coalesced shape
+    auto compact_stride = compact_major<LayoutLeft>(coalesced.shape());
+
+    // Visit the modes in mode_order; each keeps its shape and takes the compact stride,
+    // multiplied by the sign of its original stride
+    auto inv_shape  = unwrap(transform(mode_order, [&](auto i) { return shape<i>(coalesced); }));
+    auto inv_stride = unwrap(transform(mode_order, [&](auto i) { return signum(stride<i>(coalesced)) * get<i>(compact_stride); }));
+    return make_layout(inv_shape, inv_stride);
+  } else {
+    return Layout<_1,_0>{};     // Empty case, nothing found
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+// Underscore overload of right_inverse: returns its argument unchanged.
+CUTE_HOST_DEVICE constexpr
+auto
+right_inverse(Underscore const& _)
+{
+  return _;
+}
+
+//
+// Build the left-inverse of a layout
+// @pre is_static<Layout>
+// @pre @a layout is an injective function
+// @result A layout @a result such that
+//    @a result(@a layout(i)) == i for all i < size(@a layout)
+// @result A layout @a result such that
+//    composition(@a result, @a layout) is identical to make_layout(shape(layout))
+//
+
+// Implemented as the right-inverse of @a layout concatenated with its complement.
+template <class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+left_inverse(Layout<Shape,Stride> const& layout)
+{
+  auto const with_complement = make_layout(layout, complement(layout));
+  return right_inverse(with_complement);
+}
+
+// Underscore overload of left_inverse: returns its argument unchanged.
+CUTE_HOST_DEVICE constexpr
+auto
+left_inverse(Underscore const& _)
+{
+  return _;
+}
+
+//
+// Max Common Layout
+//
+
+/* Return a layout that points to the maximum number of contiguous elements
+ * that logically correspond in the layouts of @a a and @a b.
+ *
+ * Computed by composing @a a with the right-inverse of @a b and coalescing. If mode 0 of
+ * that result has a static shape and a static stride of 1, the right-inverse of @a b is
+ * truncated to that mode; otherwise the trivial layout _1:_0 is returned.
+ *
+ * @returns Layout R
+ * @post For all 0 <= i < size(R), a(R(i)) == i and b(R(i)) == i
+ */
+template <class ShapeA, class StrideA,
+          class ShapeB, class StrideB>
+CUTE_HOST_DEVICE constexpr
+auto
+max_common_layout(Layout<ShapeA,StrideA> const& a,
+                  Layout<ShapeB,StrideB> const& b)
+{
+  Layout inv_b  = right_inverse(b);
+  Layout common = coalesce(composition(a, inv_b));
+
+  // Keep only the static identity component of the common layout
+  if constexpr (is_static<decltype(shape<0>(common))>::value &&
+                is_constant<1, decltype(stride<0>(common))>::value) {
+    // Truncate to the size of the contiguous vector (static stride-1 mode)
+    return composition(inv_b, layout<0>(common));
+  } else {
+    return Layout<_1,_0>{};
+  }
+
+  // Same guard max_common_vector and the other if-constexpr functions in this file end with.
+  CUTE_GCC_UNREACHABLE;
+}
+
+/* Return Int<N> such that N is the maximum number of contiguous elements
+ * that logically correspond in the layouts of @a a and @a b.
+ *
+ * @returns Int<N> with N >= 1
+ * @post For all 0 <= n < N, a(b.get_1d_coord(n)) == n
+ *       (NOTE: Problems with negative strides/coords in this post-condition)
+ */
+template <class ShapeA, class StrideA,
+          class ShapeB, class StrideB>
+CUTE_HOST_DEVICE constexpr
+auto
+max_common_vector(Layout<ShapeA,StrideA> const& a,
+                  Layout<ShapeB,StrideB> const& b)
+{
+  auto shared = coalesce(composition(a, right_inverse(b)));
+
+  // A common vector needs mode-0 of the shared layout to have a static extent and a static stride of 1
+  if constexpr (!is_static<decltype(shape<0>(shared))>::value ||
+                !is_constant<1, decltype(stride<0>(shared))>::value) {
+    // No such mode: fall back to a vector of a single element
+    return Int<1>{};
+  } else {
+    return shape<0>(shared);
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+/* Build a layout that distributes the static integer @a b over the modes of @a a.
+ * @pre @a b is a static integral and the flattened shape of @a a is static (both are static_assert-ed)
+ * @returns Layout result
+ * @post evenly_divides(@a b, size(@a result))
+ * @post evenly_divides(@a a, @a result)
+ * @post For all i,j in [0,size(@a result)) with i < j, @a result(i) < @a result(j). Surjective and Ordered.
+ * @post composition(make_layout(shape(@a a)), @a result) is admissible
+ * \code
+ *   // Note that 6 does not divide this shape
+ *   Layout layoutA = Layout<Shape<Int<15>,Int<14>>>{};
+ *
+ *   // Want to tile any 6 elements and don't care where they come from
+ *   Layout dist = domain_distribute(layoutA, Int<6>{});   // (_3,_2):(_1,_15)
+ *
+ *   // Not guaranteed to find all 6 though...
+ *   CUTE_STATIC_ASSERT_V(Int<6>{} == size(dist));
+ *
+ *   Layout result = zipped_divide(layoutA, dist);         // (_6,Rest)
+ * \endcode
+ */
+template <class ShapeA, class ShapeB>
+CUTE_HOST_DEVICE constexpr
+auto
+domain_distribute(ShapeA const& a, ShapeB const& b)
+{
+  static_assert(is_integral<ShapeB>::value);
+  static_assert(is_static<ShapeB>::value);
+
+  auto flat_a = flatten(shape(a));
+
+  static_assert(is_static<decltype(flat_a)>::value);
+
+  // Fold over the modes of a: each mode contributes gcd(extent, what is left of b), which is then divided out of b
+  auto [dist_shape, b_left] = cute::fold(flat_a, cute::make_tuple(cute::tuple<>{}, size(b)), [](auto state, auto extent) {
+    auto [picked, remaining] = state;
+    auto common = gcd(extent, remaining);
+    return cute::make_tuple(append(picked, common), remaining / common);
+  });
+
+  // The strides are the compact LayoutLeft strides of the flattened shape of a
+  auto dist_stride = compact_major<LayoutLeft>(flat_a);
+
+  return coalesce(make_layout(dist_shape, dist_stride));
+}
+
+//
+// Kernel (Nullspace) of a Layout
+//
+
+namespace detail {
+// Scan modes I, I+1, ... of @a stride and append to seq<Zs...> every index whose stride is the static constant 0
+template <int I, class Stride, int... Zs>
+CUTE_HOST_DEVICE constexpr
+auto
+nullspace_seq(Stride const& stride, seq<Zs...>)
+{
+  if constexpr (I == rank_v<Stride>) {                                  // past the last mode: done
+    return seq<Zs...>{};
+  } else
+  if constexpr (is_constant<0, decltype(get<I>(stride))>::value) {      // static-0 stride: record I
+    return detail::nullspace_seq<I+1>(stride, seq<Zs..., I>{});
+  } else {                                                              // any other stride: skip I
+    return detail::nullspace_seq<I+1>(stride, seq<Zs...>{});
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+} // end namespace detail
+
+//
+// nullspace -- Build the nullspace of a layout
+// @result A layout @a result such that
+//    size(@a result) == size(@a layout) / size(filter(@a layout))
+//    @a layout(@a result(i)) == 0 for all i < size(@a result)
+//
+
+template <class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+nullspace(Layout<Shape,Stride> const& layout)
+{
+  auto flat = flatten(layout);
+
+  auto zero_modes = detail::nullspace_seq<0>(flat.stride(), seq<>{});
+
+  if constexpr (zero_modes.size() != 0) {
+    // Keep the extents of the selected modes and give them the compact LayoutLeft strides of the flat shape
+    auto compact = compact_major<LayoutLeft>(flat.shape());
+    return make_layout(unwrap(transform(zero_modes, [&](auto m) { return shape<m>(flat); })),
+                       unwrap(transform(zero_modes, [&](auto m) { return get<m>(compact); })));
+  } else {
+    return Layout<_1,_0>{};     // No static-0 stride was found
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+//
+// Zip: apply zip to a layout's shape and to its stride, then rebuild the layout
+//
+template <class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+zip(Layout<Shape,Stride> const& layout)
+{
+  auto const zipped_shape  = zip(layout.shape());
+  auto const zipped_stride = zip(layout.stride());
+  return make_layout(zipped_shape, zipped_stride);
+}
+
+template <class TShape, class TStride,
+          class UShape, class UStride>
+CUTE_HOST_DEVICE constexpr
+auto
+zip(Layout<TShape,TStride> const& layoutA,
+    Layout<UShape,UStride> const& layoutB)
+{
+  auto const zipped_shape = zip(layoutA.shape(), layoutB.shape());   // zip the two shapes together
+  return make_layout(zipped_shape, zip(layoutA.stride(), layoutB.stride()));
+}
+
+//
+// Tile unzip
+//   Logical product and logical divide (on layouts) produce rank-2 results by design.
+//   Follow the profile of @a tiler and zip the rank-2 modes located at the terminals into
+//   their own mode. Shape and stride are both passed through zip2_by with the same tiler.
+//
+template <class LShape, class LStride, class Tiler>
+CUTE_HOST_DEVICE constexpr
+auto
+tile_unzip(Layout<LShape,LStride> const& layout,
+           Tiler                  const& tiler)
+{
+  auto const unzipped_shape  = zip2_by(layout.shape(),  tiler);
+  auto const unzipped_stride = zip2_by(layout.stride(), tiler);
+  return make_layout(unzipped_shape, unzipped_stride);
+}
+
+//
+// Logical divide
+//   Compose @a layout with the layout (tiler, complement of tiler over shape(layout)).
+template <class LShape, class LStride,
+          class TShape, class TStride>
+CUTE_HOST_DEVICE constexpr
+auto
+logical_divide(Layout<LShape,LStride> const& layout,
+               Layout<TShape,TStride> const& tiler)
+{
+  auto const tile_and_rest = make_layout(tiler, complement(tiler, shape(layout)));
+  return composition(layout, tile_and_rest);
+}
+
+template <class LShape, class LStride, class Tiler>
+CUTE_HOST_DEVICE constexpr
+auto
+logical_divide(Layout<LShape,LStride> const& layout,
+               Tiler                  const& tiler)   // a tuple of tilers, an Underscore, or an integer
+{
+  if constexpr (is_tuple<Tiler>::value) {                // tuple: divide mode-by-mode
+    static_assert(tuple_size<Tiler>::value <= Layout<LShape,LStride>::rank, "logical_divide: Too many modes in tiler.");
+    return transform_layout(layout, tiler, [](auto const& l, auto const& t) { return logical_divide(l,t); });
+  } else if constexpr (is_underscore<Tiler>::value) {    // Underscore: the layout is returned as it is
+    return layout;
+  } else if constexpr (is_integral<Tiler>::value) {      // integer: divide by make_layout(tiler)
+    return logical_divide(layout, make_layout(tiler));
+  } else { static_assert(dependent_false<Tiler>, "logical_divide: Unsupported tiler type."); }  // was a silent void return
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+// Generalization of ceil_div to a Layout tiler:
+//   the result is effectively the "rest mode" of logical_divide.
+// Occurs, for example, in the calculation of gridDim for generalized tilers:
+//   dim3 gridDim(size(ceil_div(problem_shape_M, cta_tiler_M)),
+//                size(ceil_div(problem_shape_N, cta_tiler_N)));
+// This does not consider compositional acceptance, so it may be the case that
+//   ceil_div produces a result while logical_divide (and friends) do not.
+template <class Target, class TShape, class TStride>
+CUTE_HOST_DEVICE constexpr
+auto
+ceil_div(Target                 const& target,
+         Layout<TShape,TStride> const& tiler)
+{
+  auto const rest = complement(tiler, shape(target));
+  return shape(rest);
+}
+
+//
+// Convenience operator
+//   that produces layouts like ((BLK_A,BLK_B,...),(a,b,...,x,y))
+//   by gathering the tile modes and residuals into a rank-2 result:
+//   logical_divide followed by tile_unzip with the same tiler.
+template <class LShape, class LStride,
+          class Tiler>
+CUTE_HOST_DEVICE constexpr
+auto
+zipped_divide(Layout<LShape,LStride> const& layout,
+              Tiler                  const& tiler)
+{
+  auto const divided = logical_divide(layout, tiler);
+  return tile_unzip(divided, tiler);
+}
+
+// zipped_divide with the second mode unpacked: ((BLK_A,BLK_B,...),a,b,...,x,y)
+template <class LShape, class LStride,
+          class Tiler>
+CUTE_HOST_DEVICE constexpr
+auto
+tiled_divide(Layout<LShape,LStride> const& layout,
+             Tiler                  const& tiler)
+{
+  auto zipped = zipped_divide(layout, tiler);
+  auto rest_rank = rank<1>(zipped);
+  // Keep mode-0 whole and slice mode-1 with one Underscore per sub-mode
+  return zipped(_, repeat<rest_rank>(_));
+}
+
+// zipped_divide with both modes unpacked: (BLK_A,BLK_B,...,a,b,...,x,y)
+template <class LShape, class LStride,
+          class Tiler>
+CUTE_HOST_DEVICE constexpr
+auto
+flat_divide(Layout<LShape,LStride> const& layout,
+            Tiler                  const& tiler)
+{
+  auto zipped = zipped_divide(layout, tiler);
+  // Ranks of the tile mode (0) and of the rest mode (1)
+  auto tile_rank = rank<0>(zipped);
+  auto rest_rank = rank<1>(zipped);
+  return zipped(repeat<tile_rank>(_), repeat<rest_rank>(_));
+}
+
+//
+// Logical product
+//   Result is (block, rest o tiler) with rest = complement(block, size(block)*cosize(tiler)).
+template <class LShape, class LStride,
+          class TShape, class TStride>
+CUTE_HOST_DEVICE constexpr
+auto
+logical_product(Layout<LShape,LStride> const& block,
+                Layout<TShape,TStride> const& tiler)
+{
+  auto const rest = complement(block, size(block)*cosize(tiler));
+  return make_layout(block, composition(rest, tiler));
+}
+
+template <class LShape, class LStride, class Tiler>
+CUTE_HOST_DEVICE constexpr
+auto
+logical_product(Layout<LShape,LStride> const& block,
+                Tiler                  const& tiler)   // a tuple of tilers, an Underscore, or an integer
+{
+  if constexpr (is_tuple<Tiler>::value) {                // tuple: take the product mode-by-mode
+    static_assert(tuple_size<Tiler>::value <= Layout<LShape,LStride>::rank, "logical_product: Too many modes in tiler.");
+    return transform_layout(block, tiler, [](auto const& l, auto const& t) { return logical_product(l,t); });
+  } else if constexpr (is_underscore<Tiler>::value) {    // Underscore: the block is returned as it is
+    return block;
+  } else if constexpr (is_integral<Tiler>::value) {      // integer: product with make_layout(tiler)
+    return logical_product(block, make_layout(tiler));
+  } else { static_assert(dependent_false<Tiler>, "logical_product: Unsupported tiler type."); }  // was a silent void return
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+//
+// Convenience operator
+//   that produces layouts like ((BLK_A,BLK_B,...),(a,b,...,x,y))
+//   by gathering the block modes and products into a rank-2 result:
+//   logical_product followed by tile_unzip with the same tiler.
+template <class LShape, class LStride,
+          class Tiler>
+CUTE_HOST_DEVICE constexpr
+auto
+zipped_product(Layout<LShape,LStride> const& block,
+               Tiler                  const& tiler)
+{
+  auto const product = logical_product(block, tiler);
+  return tile_unzip(product, tiler);
+}
+
+// zipped_product with the second mode unpacked: ((BLK_A,BLK_B,...),a,b,...,x,y)
+template <class LShape, class LStride,
+          class Tiler>
+CUTE_HOST_DEVICE constexpr
+auto
+tiled_product(Layout<LShape,LStride> const& block,
+              Tiler                  const& tiler)
+{
+  auto zipped = zipped_product(block, tiler);
+  auto rest_rank = rank<1>(zipped);
+  // Keep mode-0 whole and slice mode-1 with one Underscore per sub-mode
+  return zipped(_, repeat<rest_rank>(_));
+}
+
+// zipped_product with both modes unpacked: (BLK_A,BLK_B,...,a,b,...,x,y)
+template <class LShape, class LStride,
+          class Tiler>
+CUTE_HOST_DEVICE constexpr
+auto
+flat_product(Layout<LShape,LStride> const& block,
+             Tiler                  const& tiler)
+{
+  auto zipped = zipped_product(block, tiler);
+  // Ranks of the block mode (0) and of the product mode (1)
+  auto block_rank = rank<0>(zipped);
+  auto rest_rank  = rank<1>(zipped);
+  return zipped(repeat<block_rank>(_), repeat<rest_rank>(_));
+}
+
+//
+// Rank-sensitive products
+//
+
+// blocked_product -- Reproduce a block over a tiler.
+// Every element of "tiler" stands for one copy of "block";
+//   the result is the layout of the resulting structure.
+// @post rank(@a result) == cute::max(rank(@a block), rank(@a tiler))
+template <class TShape, class TStride,
+          class UShape, class UStride>
+CUTE_HOST_DEVICE constexpr
+auto
+blocked_product(Layout<TShape,TStride> const& block,
+                Layout<UShape,UStride> const& tiler)
+{
+  constexpr int kRank = cute::max(rank_v<TShape>, rank_v<UShape>);
+  // Both operands are padded to the common rank before the product is taken
+  auto padded_product = logical_product(append<kRank>(block), append<kRank>(tiler));
+  // Zip the block part (get<0>) ahead of the tiler part (get<1>)
+  return coalesce(zip(get<0>(padded_product), get<1>(padded_product)), tuple_repeat<kRank>(Int<1>{}));
+}
+
+// raked_product -- Reproduce a block over a tiler with block-interleaving.
+// Every element of "tiler" stands for one copy of "block"; the copies are interleaved
+//   and the layout of the resulting structure is returned.
+// @post rank(@a result) == cute::max(rank(@a block), rank(@a tiler))
+template <class TShape, class TStride,
+          class UShape, class UStride>
+CUTE_HOST_DEVICE constexpr
+auto
+raked_product(Layout<TShape,TStride> const& block,
+              Layout<UShape,UStride> const& tiler)
+{
+  constexpr int kRank = cute::max(rank_v<TShape>, rank_v<UShape>);
+  // Both operands are padded to the common rank before the product is taken
+  auto padded_product = logical_product(append<kRank>(block), append<kRank>(tiler));
+  // Zip the tiler part (get<1>) ahead of the block part (get<0>): the reverse of blocked_product
+  return coalesce(zip(get<1>(padded_product), get<0>(padded_product)), tuple_repeat<kRank>(Int<1>{}));
+}
+
+// tile_to_shape -- Perform a product of a layout so that the result matches a target shape.
+// This is similar to blocked_product, but specifies the result shape instead of the
+//   product shape, which is more convenient in certain circumstances.
+// @param block The layout to repeat
+// @param trg_shape The target shape of the result
+// @param ord_shape The order of the modes of @a trg_shape to tile @a block with.
+//                  Defaults to LayoutLeft (GenColMajor), so @a block will repeat
+//                    across the first mode first, the second mode second, etc
+//                  E.g. Step<_2,_1,_3> will cause @a block to repeat
+//                    across the second mode first, the first mode second, and the third mode last.
+// @pre rank(@a block) <= rank(@a trg_shape)
+// @post compatible(@a trg_shape, shape(@a result))
+template <class Shape, class Stride,
+          class TrgShape, class ModeOrder = LayoutLeft>
+CUTE_HOST_DEVICE constexpr
+auto
+tile_to_shape(Layout<Shape,Stride> const& block,
+              TrgShape             const& trg_shape,
+              ModeOrder            const& ord_shape = {})
+{
+  CUTE_STATIC_ASSERT_V(rank(block) <= rank(trg_shape), "Rank of layout must be <= rank of target shape.");
+  constexpr int kRank = rank_v<TrgShape>;
+
+  auto padded = append<kRank>(block);
+
+  auto block_extents  = product_each(shape(padded));
+  auto target_extents = product_each(shape(trg_shape));
+
+  // With a static target, refuse targets that the block does not divide
+  if constexpr (is_static<decltype(target_extents)>::value) {
+    CUTE_STATIC_ASSERT_V(evenly_divides(target_extents, block_extents),
+                         "tile_to_shape: block shape does not divide the target shape.");
+  }
+
+  auto tiles_per_mode = ceil_div(target_extents, block_extents);
+
+  return coalesce(blocked_product(padded, make_ordered_layout(tiles_per_mode, ord_shape)), tiles_per_mode);
+}
+
+//
+// Upcast
+//   For stride-1 mode, divide size by N. Divide all other strides by N.
+//
+
+template <int N, class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+upcast(Shape const& shape, Stride const& stride)
+{
+  if constexpr (is_tuple<Shape>::value) {                  // tuple shape: recurse into every (shape,stride) pair
+    return transform_layout(shape, stride, [](auto const& s, auto const& d) { return upcast<N>(s,d); });
+  } else if constexpr (is_constant<0, Stride>::value) {    // static-0 stride: shape and stride are kept as they are
+    return Layout<Shape,Stride>{shape,stride};
+  } else if constexpr (is_static<Stride>::value) {         // static stride: both shape and stride go through shape_div
+    return make_layout(shape_div(shape,  shape_div(Int<N>{}, abs(stride))),
+                       shape_div(stride, Int<N>{}));
+  } else {                                                 // dynamic stride: shape is kept, stride is divided by N
+    // assume dynamic strides are larger than N and divisible
+    // assert(stride % N == 0);
+    return make_layout(shape, safe_div(stride, Int<N>{}));
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <int N, class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+upcast(Layout<Shape,Stride> const& layout)   // Layout overload: forwards to upcast<N>(shape, stride)
+{
+  return upcast<N>(layout.shape(), layout.stride());
+}
+
+//
+// Downcast
+//   For a static stride of 1 or -1, multiply size by N. Multiply all other strides by N.
+//
+
+template <int N, class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+downcast(Shape const& shape, Stride const& stride)
+{
+  if constexpr (is_tuple<Shape>::value) {   // tuple shape: recurse into every (shape,stride) pair
+    return transform_layout(shape, stride, [](auto const& s, auto const& d) { return downcast<N>(s,d); });
+  } else if constexpr (is_constant<1, Stride>::value || is_constant<-1, Stride>::value) {   // unit stride: scale the extent
+    return make_layout(shape * Int<N>{}, stride);
+  } else {                                  // any other stride (static or dynamic): scale the stride
+    return make_layout(shape, stride * Int<N>{});
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <int N, class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+downcast(Layout<Shape,Stride> const& layout)   // Layout overload: forwards to downcast<N>(shape, stride)
+{
+  CUTE_STATIC_ASSERT(has_int1<Stride>::value, "Downcast requires adjacent elements");   // checked on the Stride type at compile time
+  return downcast<N>(layout.shape(), layout.stride());
+}
+
+//
+// Recast
+//   Convert a layout of OldType elements into a layout of NewType elements.
+//   Supported only when the bit-width ratio reduces to 1/1, 1/N or N/1.
+//
+template <class OldType, class NewType,
+          class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+recast_layout(Layout<Shape,Stride> const& layout)
+{
+  // Ratio of sizeof_bits<NewType> to sizeof_bits<OldType> as produced by trait_ratio
+  using bit_ratio = decltype(trait_ratio(sizeof_bits<NewType>{}, sizeof_bits<OldType>{}));
+
+  if constexpr (bit_ratio::num == 1 && bit_ratio::den == 1) {   // ratio 1/1: nothing to do
+    return layout;
+  } else if constexpr (bit_ratio::num == 1) {                   // ratio 1/den: downcast by den
+    return downcast<bit_ratio::den>(layout);
+  } else if constexpr (bit_ratio::den == 1) {                   // ratio num/1: upcast by num
+    return upcast<bit_ratio::num>(layout);
+  } else {
+    static_assert(dependent_false<bit_ratio>, "Recast not supported.");
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+// Determine the maximum alignment of a Layout.
+// The maximum alignment is the largest N for which upcast<N>(layout) will compile.
+//   upcast<N>(layout) compiles when the static shapes and strides pass divisibility checks.
+//   Therefore, upcast<M>(layout) will also compile for all divisors M of N.
+// Note that this only considers the static shapes and strides of the Layout
+//   in symmetry with upcast<N> only checking against static shapes and strides and assuming all
+//   dynamic shapes and strides are large and multiples of N.
+template <class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+max_alignment(Layout<Shape,Stride> const& layout)
+{
+  auto flat_layout   = coalesce(layout);
+  auto static_shape  = transform( shape(flat_layout), [](auto s){ return conditional_return<is_static<decltype(s)>::value>(s, Int<1>{}); });  // presumably: non-static extent -> Int<1>
+  auto static_stride = transform(stride(flat_layout), [](auto d){ return conditional_return<is_static<decltype(d)>::value>(d, Int<0>{}); });  // presumably: non-static stride -> Int<0>
+  auto filter_layout = make_layout(static_shape, static_stride);   // static-only stand-in for the input layout
+  auto permuted = logical_divide(filter_layout, right_inverse(filter_layout));
+  return gcd(size<0>(permuted), stride<1>(permuted));   // gcd of the mode-0 size and the mode-1 stride of the divided layout
+}
+
+//
+// Display utilities
+//
+
+template <class Shape, class Stride>
+CUTE_HOST_DEVICE void print(Layout<Shape,Stride> const& layout)
+{
+  print(layout.shape()); print(":"); print(layout.stride());   // printed as shape:stride
+}
+
+#if !defined(__CUDACC_RTC__)   // the stream overload is compiled out when __CUDACC_RTC__ is defined
+template <class Shape, class Stride>
+CUTE_HOST std::ostream& operator<<(std::ostream& os, Layout<Shape,Stride> const& layout)
+{
+  return os << shape(layout) << ":" << stride(layout);   // same shape:stride form as print(layout)
+}
+#endif
+
+// Render a rank-2 layout as an ASCII table of its indices on the console
+template <class Layout>
+CUTE_HOST_DEVICE
+void
+print_layout(Layout const& layout)  // (m,n) -> idx
+{
+  CUTE_STATIC_ASSERT_V(rank(layout) == Int<2>{});
+
+  int const digits = num_digits(cosize(layout));   // digit count of cosize(layout): the width of a cell value
+  char const* const rule = "+-----------------------";
+
+  print(layout); print("\n");
+
+  // First line: the column numbers
+  print("    ");
+  for (int col = 0; col < size<1>(layout); ++col) { printf("  %*d ", digits, col); }
+  printf("\n");
+
+  // For every row: a horizontal rule, then the row number and the row's values
+  for (int row = 0; row < size<0>(layout); ++row) {
+    // Rule: one truncated copy of the rule string per column
+    print("    ");
+    for (int col = 0; col < size<1>(layout); ++col) { printf("%.*s", digits+3, rule); }
+    printf("+\n");
+    // Row number, then the values of the row
+    printf("%2d  ", row);
+    for (int col = 0; col < size<1>(layout); ++col) { printf("| %*d ", digits, int(layout(row,col))); }
+    printf("|\n");
+  }
+  // Closing rule
+  print("    ");
+  for (int col = 0; col < size<1>(layout); ++col) { printf("%.*s", digits+3, rule); }
+  printf("+\n");
+}
+
+// Render a rank-2 thread/value layout as an ASCII table; each cell shows "thread-value"
+template <class Layout, class ThrID>
+CUTE_HOST_DEVICE
+void
+print_layout(Layout const& layout, ThrID const& thrid)  // (m,n) -> (tid,vid)  and  tid -> thr_idx
+{
+  CUTE_STATIC_ASSERT_V(rank(layout) == Int<2>{});
+
+  print(layout); print("\n");
+  print(thrid);  print("\n");
+
+  // One rule line and one value line per row
+  for (int row = 0; row < size<0>(layout); ++row) {
+    // Rule
+    for (int col = 0; col < size<1>(layout); ++col) { printf("+------"); }
+    printf("+\n");
+    // Cell: thrid(index % size(thrid)) and index / size(thrid)
+    for (int col = 0; col < size<1>(layout); ++col) { printf("|%03d-%02d", int(thrid(layout(row,col) % size(thrid))), int(layout(row,col) / size(thrid))); }
+    printf("|\n");
+  }
+  // Closing rule
+  for (int col = 0; col < size<1>(layout); ++col) { printf("+------"); }
+  printf("+\n");
+}
+
+struct TikzColor_White {                 // color functor: every index maps to "white"
+  CUTE_HOST_DEVICE char const*
+  operator()(int idx) const {            // idx is not used
+    return "white";
+  }
+};
+
+struct TikzColor_BWx8 {                  // color functor: cycles through 8 gray levels by index
+  CUTE_HOST_DEVICE char const*
+  operator()(int idx) const {            // idx may be negative (e.g. a layout with negative strides)
+    static char const* color_map[8] = {"black!00", "black!40", "black!20", "black!60",
+                                       "black!10", "black!50", "black!30", "black!70"};
+    return color_map[((idx % 8) + 8) % 8];   // wrap into [0,8): a bare idx % 8 is negative for negative idx
+  }
+};
+
+struct TikzColor_TV {                    // color functor: cycles through 8 colors by thread index
+  CUTE_HOST_DEVICE char const*
+  operator()(int tid, int vid) const {   // vid does not take part in the color choice
+    static char const* color_map[8] = {"{rgb,255:red,175;green,175;blue,255}",
+                                       "{rgb,255:red,175;green,255;blue,175}",
+                                       "{rgb,255:red,255;green,255;blue,175}",
+                                       "{rgb,255:red,255;green,175;blue,175}",
+                                       "{rgb,255:red,210;green,210;blue,255}",
+                                       "{rgb,255:red,210;green,255;blue,210}",
+                                       "{rgb,255:red,255;green,255;blue,210}",
+                                       "{rgb,255:red,255;green,210;blue,210}"};
+    return color_map[((tid % 8) + 8) % 8];   // wrap into [0,8): a bare tid % 8 is negative for negative tid
+  }
+};
+
+// Emit a standalone LaTeX/TikZ document that draws a layout of rank <= 2 as a colored grid
+template <class LayoutA, class TikzColorFn = TikzColor_BWx8>
+CUTE_HOST_DEVICE
+void
+print_latex(LayoutA const& layout_a,   // (m,n) -> idx
+            TikzColorFn color = {})    // lambda(idx) -> tikz color string
+{
+  CUTE_STATIC_ASSERT_V(rank(layout_a) <= Int<2>{});
+  auto grid = append<2>(layout_a, Layout<_1,_0>{});   // pad to rank 2 with a size-1 mode
+
+  // The layout itself, as a LaTeX comment
+  printf("%% Layout: "); print(grid); printf("\n");
+  // Document preamble
+  printf("\\documentclass[convert]{standalone}\n"
+         "\\usepackage{tikz}\n\n"
+         "\\begin{document}\n"
+         "\\begin{tikzpicture}[x={(0cm,-1cm)},y={(1cm,0cm)},every node/.style={minimum size=1cm, outer sep=0pt}]\n\n");
+
+  // One filled node per coordinate, labelled with its index
+  for (int row = 0; row < size<0>(grid); ++row) {
+    for (int col = 0; col < size<1>(grid); ++col) {
+      int value = grid(row,col);
+      printf("\\node[fill=%s] at (%d,%d) {%d};\n",
+             color(value), row, col, value);
+    }
+  }
+  // Grid lines
+  printf("\\draw[color=black,thick,shift={(-0.5,-0.5)}] (0,0) grid (%d,%d);\n\n",
+         int(size<0>(grid)), int(size<1>(grid)));
+  // Row labels at column -1, then column labels at row -1
+  for (int row = 0; row < size<0>(grid); ++row) {
+    printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", row, -1, row);
+  }
+  for (int col = 0; col < size<1>(grid); ++col) {
+    printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", -1, col, col);
+  }
+
+  // Close the picture and the document
+  printf("\\end{tikzpicture}\n"
+         "\\end{document}\n");
+}
+
+// Emit a standalone LaTeX/TikZ document that draws a rank-2 thread/value layout as a colored grid
+template <class Layout, class ThrID, class TikzColorFn = TikzColor_TV>
+CUTE_HOST_DEVICE
+void
+print_latex(Layout const& layout,    // (m,n) -> (tid,vid)
+            ThrID  const& thr,       // tid -> thr_idx
+            TikzColorFn color = {})  // lambda(thr_idx,val_idx) -> tikz color string
+{
+  CUTE_STATIC_ASSERT_V(rank(layout) == Int<2>{});
+
+  // Both inputs, as LaTeX comments
+  printf("%% Layout: "); print(layout); printf("\n");
+  printf("%% ThrID : "); print(thr);  printf("\n");
+  // Document preamble
+  printf("\\documentclass[convert]{standalone}\n"
+         "\\usepackage{tikz}\n\n"
+         "\\begin{document}\n"
+         "\\begin{tikzpicture}[x={(0cm,-1cm)},y={(1cm,0cm)},every node/.style={minimum size=1cm, outer sep=0pt}]\n\n");
+
+  // One filled node per coordinate, labelled with its thread and value index
+  for (int row = 0; row < size<0>(layout); ++row) {
+    for (int col = 0; col < size<1>(layout); ++col) {
+      int tid_in_thr = layout(row,col) % size(thr);
+      int value_idx  = layout(row,col) / size(thr);
+      int thread_idx = thr(tid_in_thr);
+
+      printf("\\node[fill=%s] at (%d,%d) {\\shortstack{T%d \\\\ V%d}};\n",
+             color(thread_idx, value_idx),
+             row, col,
+             thread_idx, value_idx);
+    }
+  }
+  // Grid lines
+  printf("\\draw[color=black,thick,shift={(-0.5,-0.5)}] (0,0) grid (%d,%d);\n\n",
+         int(size<0>(layout)), int(size<1>(layout)));
+  // Row labels at column -1, then column labels at row -1
+  for (int row = 0; row < size<0>(layout); ++row) {
+    printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", row, -1, row);
+  }
+  for (int col = 0; col < size<1>(layout); ++col) {
+    printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", -1, col, col);
+  }
+
+  // Close the picture and the document
+  printf("\\end{tikzpicture}\n"
+         "\\end{document}\n");
+}
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/layout_composed.hpp b/lightllm-kernel/cutlass/include/cute/layout_composed.hpp
new file mode 100755
index 000000000..3e5f83627
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/layout_composed.hpp
@@ -0,0 +1,652 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>                     // CUTE_HOST_DEVICE, CUTE_GCC_UNREACHABLE
+#include <cute/layout.hpp>                     // cute::tuple
+#include <cute/numeric/integral_constant.hpp>  // cute::true_type, cute::false_type, cute::Int
+
+/* This implements a ComposedLayout of the form
+ *   LayoutA o Offset o LayoutB
+ * and is useful in cases where composition() does not or cannot apply to LayoutA and LayoutB.
+ * For example, when the "divisibility condition" in shape_div is violated in composition(LayoutA, LayoutB).
+ *
+ * This ComposedLayout provides similar functionality to Layout including tiling, partitioning,
+ * coordinate-to-index mapping and layout manipulations, but is not considered a "normal" layout.
+ * For example, this layout provides shape() and size() functions, but does not provide stride() functions.
+ * Mostly, the similar functionality is accomplished by applying each operation to LayoutB only
+ * as LayoutB defines the domain.
+ */
+
+namespace cute
+{
+
+// A Layout of non-trivially composable functions, evaluated as LayoutA o Offset o LayoutB
+template <class LayoutA, class Offset, class LayoutB>
+struct ComposedLayout : private cute::tuple<LayoutA, Offset, LayoutB>  // EBO for static layouts
+{
+  CUTE_HOST_DEVICE constexpr
+  ComposedLayout(LayoutA const& layoutA = {},
+                 Offset  const& offset  = {},
+                 LayoutB const& layoutB = {})
+      : cute::tuple<LayoutA, Offset, LayoutB>(layoutA, offset, layoutB)
+  {}
+
+  //
+  // Accessors
+  //
+
+  static constexpr int rank  = LayoutB::rank;   // the rank is that of LayoutB
+
+  CUTE_HOST_DEVICE constexpr
+  decltype(auto)
+  layout_a() const {   // element 0 of the tuple base
+    return get<0>(static_cast<cute::tuple<LayoutA, Offset, LayoutB> const&>(*this));
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  decltype(auto)
+  offset() const {     // element 1 of the tuple base
+    return get<1>(static_cast<cute::tuple<LayoutA, Offset, LayoutB> const&>(*this));
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  decltype(auto)
+  layout_b() const {   // element 2 of the tuple base
+    return get<2>(static_cast<cute::tuple<LayoutA, Offset, LayoutB> const&>(*this));
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  decltype(auto)
+  layout() const {     // the composed layout itself
+    return *this;
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  decltype(auto)
+  shape() const {      // the shape is that of layout_b
+    return layout_b().shape();
+  }
+
+  // Doesn't really make sense to ask for the strides of this "layout"
+  CUTE_HOST_DEVICE constexpr
+  decltype(auto)
+  stride() const = delete;
+
+  //
+  // Mappings
+  //
+
+  // Map a logical coordinate to a linear index (Coord has no Underscore slice operators)
+  // OR
+  // Slice the layout and return the sublayout (Coord has an Underscore slice op)
+  template <class Coord>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  operator()(Coord const& coord) const {
+    if constexpr (has_underscore<Coord>::value) {
+      return slice(coord, *this);
+    } else {
+      return layout_a()(offset() + layout_b()(coord));    // (A o O o B)(c)
+    }
+
+    CUTE_GCC_UNREACHABLE;
+  }
+
+  // Convenience function for multi-dimensional coordinates: packs them with make_coord
+  template <class Coord0, class Coord1, class... Coords>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  operator()(Coord0 const& c0, Coord1 const& c1, Coords const&... cs) const {
+    return operator()(make_coord(c0,c1,cs...));
+  }
+
+  //
+  // Compose
+  //
+
+  template <class OtherLayout>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  compose(OtherLayout const& other) const {      // composition(*this, other)
+    return composition(*this, other);
+  }
+
+  template <class... Layouts>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  compose(Layouts const&... layouts) const {     // several layouts are first gathered with make_tile
+    return composition(*this, make_tile(layouts...));
+  }
+
+  template <class OtherShape>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  with_shape(OtherShape const& shape) const {    // compose with make_layout(shape)
+    return composition(*this, make_layout(shape));
+  }
+
+  template <class... Shapes>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  with_shape(Shapes const&... shapes) const {    // several shapes are first gathered with make_shape
+    return composition(*this, make_layout(make_shape(shapes...)));
+  }
+
+  //
+  // Tile
+  //
+
+  template <class OtherLayout>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  tile(OtherLayout const& other) const {         // tiled_divide(*this, other)
+    return tiled_divide(*this, other);
+  }
+
+  template <class... Layouts>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  tile(Layouts const&... layouts) const {        // several layouts are first gathered with make_tile
+    return tiled_divide(*this, make_tile(layouts...));
+  }
+
+  // Equality, return a static or dynamic boolean; all three components are compared
+  template <class... Args>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  operator==(ComposedLayout<Args...> const& other) const {
+    return this->layout_a() == other.layout_a() &&
+           this->layout_b() == other.layout_b() &&
+           this->offset()   == other.offset();
+  }
+};
+
+template <class A, class O, class B>
+struct is_layout<ComposedLayout<A,O,B>> : true_type {};          // a ComposedLayout satisfies is_layout
+
+template <class T>
+struct is_composed_layout : false_type {};                       // primary template: T is not a ComposedLayout
+template <class A, class O, class B>
+struct is_composed_layout<ComposedLayout<A,O,B>> : true_type {}; // specialization: T is a ComposedLayout
+
+//
+// Constructors
+//
+
+// Factory: stores the three pieces, as given, in a ComposedLayout<LayoutA,Offset,LayoutB>.
+template <class LayoutA, class Offset, class LayoutB>
+CUTE_HOST_DEVICE constexpr auto
+make_composed_layout(LayoutA const& outer,
+                     Offset  const& shift,
+                     LayoutB const& inner)
+{
+  return ComposedLayout<LayoutA, Offset, LayoutB>{outer, shift, inner};
+}
+
+//
+// Utilities
+//
+
+// Mode selection: layout<Is...> is taken on layout_b only
+template <int... Is, class A, class O, class B>
+CUTE_HOST_DEVICE constexpr decltype(auto)
+layout(ComposedLayout<A,O,B> const& c)
+{
+  // layout_a and offset are carried over unchanged around the selected sub-layout.
+  return composition(c.layout_a(), c.offset(), layout<Is...>(c.layout_b()));
+}
+
+// Shape of a mode: read from layout_b
+template <int... Is, class A, class O, class B>
+CUTE_HOST_DEVICE constexpr decltype(auto)
+shape(ComposedLayout<A,O,B> const& c)
+{
+  // layout_a and offset are not consulted.
+  return shape<Is...>(c.layout_b());
+}
+
+// Doesn't make sense to directly ask for the strides of this "layout"
+template <int... Is, class Fn, class O, class Layout>
+CUTE_HOST_DEVICE constexpr
+decltype(auto)
+stride(ComposedLayout<Fn,O,Layout> const& layout) = delete;  // explicitly deleted: stride() on a ComposedLayout is a compile-time error
+
+// Number of elements in a mode: read from layout_b
+template <int... Is, class A, class O, class B>
+CUTE_HOST_DEVICE constexpr decltype(auto)
+size(ComposedLayout<A,O,B> const& c)
+{
+  // layout_a and offset are not consulted.
+  return size<Is...>(c.layout_b());
+}
+
+// Number of modes: read from layout_b
+template <int... Is, class A, class O, class B>
+CUTE_HOST_DEVICE constexpr auto
+rank(ComposedLayout<A,O,B> const& c)
+{
+  // layout_a and offset are not consulted.
+  return rank<Is...>(c.layout_b());
+}
+
+// Depth of the layout: read from layout_b
+template <int... Is, class A, class O, class B>
+CUTE_HOST_DEVICE constexpr auto
+depth(ComposedLayout<A,O,B> const& c)
+{
+  // layout_a and offset are not consulted.
+  return depth<Is...>(c.layout_b());
+}
+
+// Codomain size of a mode, as reported by layout_b
+template <int... Is, class A, class O, class B>
+CUTE_HOST_DEVICE constexpr auto
+cosize(ComposedLayout<A,O,B> const& c)
+{
+  // NOTE: this is cosize of layout_b alone; layout_a and offset do not take part.
+  return cosize<Is...>(c.layout_b());
+}
+
+//
+// Operations to manipulate Layouts like a tuple of pairs
+//
+
+// I-th mode: get<I> on layout_b, re-wrapped with the same layout_a and offset.
+template <size_t I, class A, class O, class B>
+CUTE_HOST_DEVICE constexpr auto
+get(ComposedLayout<A,O,B> const& c)
+{
+  return composition(c.layout_a(), c.offset(), get<I>(c.layout_b()));
+}
+
+// take<Begin,End> on layout_b, re-wrapped with the same layout_a and offset.
+template <int Begin, int End, class A, class O, class B>
+CUTE_HOST_DEVICE constexpr auto
+take(ComposedLayout<A,O,B> const& c)
+{
+  return composition(c.layout_a(), c.offset(), take<Begin,End>(c.layout_b()));
+}
+
+// flatten applied to layout_b; layout_a and offset are carried over unchanged.
+template <class A, class O, class B>
+CUTE_HOST_DEVICE constexpr auto
+flatten(ComposedLayout<A,O,B> const& c)
+{
+  return composition(c.layout_a(), c.offset(), flatten(c.layout_b()));
+}
+
+// append<N>(layout_b, extra); layout_a and offset are carried over unchanged.
+template <int N, class A, class O, class B, class X>
+CUTE_HOST_DEVICE constexpr auto
+append(ComposedLayout<A,O,B> const& c, X const& extra)
+{
+  return composition(c.layout_a(), c.offset(), append<N>(c.layout_b(), extra));
+}
+
+// group<Begin,End> on layout_b; layout_a and offset are carried over unchanged.
+template <int Begin, int End, class A, class O, class B>
+CUTE_HOST_DEVICE constexpr auto
+group(ComposedLayout<A,O,B> const& c)
+{
+  return composition(c.layout_a(), c.offset(), group<Begin,End>(c.layout_b()));
+}
+
+//
+// Slice a ComposedLayout
+//
+
+template <class Coord, class A, class O, class B>
+CUTE_HOST_DEVICE constexpr
+auto
+slice_and_offset(Coord const& coord, ComposedLayout<A,O,B> const& layout)  // returns a tuple (sliced ComposedLayout, Int<0>)
+{
+  auto [slice, offset] = slice_and_offset(coord, layout.layout_b());  // slice layout_b only; `offset` here is the value that call returned
+  return cute::make_tuple(ComposedLayout{layout.layout_a(), layout.offset() + offset, slice}, Int<0>{});  // the offset is folded into the new ComposedLayout, so the second tuple element is a static 0
+}
+
+// Layout part (element 0) of slice_and_offset(coord, c); the second element is dropped.
+template <class Coord, class A, class O, class B>
+CUTE_HOST_DEVICE constexpr auto
+slice(Coord const& coord, ComposedLayout<A,O,B> const& c)
+{
+  return get<0>(slice_and_offset(coord, c));
+}
+
+// Compute a pointer offset and (potentially modified) layout from a coordinate
+// For composed layout tensors the offset is accumulated in the layout itself while pointer is not updated
+template <class Coord, class A, class O, class B>
+CUTE_HOST_DEVICE constexpr
+auto
+domain_offset(Coord const& coord, ComposedLayout<A,O,B> const& layout)  // returns a tuple (new ComposedLayout, Int<0>)
+{
+  return cute::make_tuple(ComposedLayout{layout.layout_a(), layout.offset() + layout.layout_b()(coord), layout.layout_b()}, Int<0>{});  // new offset = old offset + layout_b(coord); layout_a and layout_b are kept; returned pointer offset is a static 0
+}
+
+//
+// composition
+//
+
+// Three-argument composition: stores layoutA, offset and layoutB in a
+// ComposedLayout as given; nothing is evaluated or simplified here.
+template <class LayoutA, class Offset, class LayoutB>
+CUTE_HOST_DEVICE constexpr auto
+composition(LayoutA const& layoutA,
+            Offset  const& offset,
+            LayoutB const& layoutB)
+{
+  using Result = ComposedLayout<LayoutA, Offset, LayoutB>;
+  return Result{layoutA, offset, layoutB};
+}
+
+// ComposedLayout o tiler: the tiler is composed into layout_b; layout_a and offset are kept.
+template <class A, class O, class B, class Tiler>
+CUTE_HOST_DEVICE constexpr auto
+composition(ComposedLayout<A,O,B> const& lhs,
+            Tiler                 const& tiler)
+{
+  return composition(lhs.layout_a(), lhs.offset(), composition(lhs.layout_b(), tiler));
+}
+
+template <class ShapeA, class StrideA,
+          class A, class O, class B>
+CUTE_HOST_DEVICE constexpr
+auto
+composition(Layout<ShapeA,StrideA> const& a,
+            ComposedLayout<A,O,B>  const& b)  // plain Layout on the left, ComposedLayout on the right
+{
+  CUTE_STATIC_ASSERT_V(b.offset() == Int<0>{}, "Require offset == 0.");  // only supported when b's offset is a static zero
+
+  return composition(composition(a, b.layout_a()), b.layout_b());  // re-associate: (a o b.layout_a) o b.layout_b
+}
+
+//
+// complement
+//
+
+// Complement with an explicit target: computed from layout_b only.
+template <class A, class O, class B, class CoTarget>
+CUTE_HOST_DEVICE constexpr auto
+complement(ComposedLayout<A,O,B> const& c, CoTarget const& cotarget)
+{
+  return complement(c.layout_b(), cotarget);
+}
+
+// Complement with the default target cosize(c).
+template <class A, class O, class B>
+CUTE_HOST_DEVICE constexpr auto
+complement(ComposedLayout<A,O,B> const& c)
+{
+  return complement(c, cosize(c));
+}
+
+//
+// inverse
+//
+
+// Right inverse: each component is right-inverted and the order is reversed (b, offset, a).
+template <class A, class O, class B>
+CUTE_HOST_DEVICE constexpr auto
+right_inverse(ComposedLayout<A,O,B> const& c)
+{
+  return composition(right_inverse(c.layout_b()), right_inverse(c.offset()), right_inverse(c.layout_a()));
+}
+
+// Left inverse: each component is left-inverted and the order is reversed (b, offset, a).
+template <class A, class O, class B>
+CUTE_HOST_DEVICE constexpr auto
+left_inverse(ComposedLayout<A,O,B> const& c)
+{
+  return composition(left_inverse(c.layout_b()), left_inverse(c.offset()), left_inverse(c.layout_a()));
+}
+
+//
+// Other operations
+//
+
+// zip applied to layout_b; layout_a and offset are carried over unchanged.
+template <class A, class O, class B>
+CUTE_HOST_DEVICE constexpr auto
+zip(ComposedLayout<A,O,B> const& c)
+{
+  return composition(c.layout_a(), c.offset(), zip(c.layout_b()));
+}
+
+// Partitions
+
+// logical_divide(layout_b, tiler); layout_a and offset are carried over unchanged.
+template <class A, class O, class B, class Tiler>
+CUTE_HOST_DEVICE constexpr auto
+logical_divide(ComposedLayout<A,O,B> const& c,
+               Tiler                 const& tiler)
+{
+  return composition(c.layout_a(), c.offset(), logical_divide(c.layout_b(), tiler));
+}
+
+// tile_unzip(layout_b, tiler); layout_a and offset are carried over unchanged.
+template <class A, class O, class B, class Tiler>
+CUTE_HOST_DEVICE constexpr auto
+tile_unzip(ComposedLayout<A,O,B> const& c,
+           Tiler                 const& tiler)
+{
+  return composition(c.layout_a(), c.offset(), tile_unzip(c.layout_b(), tiler));
+}
+
+// tiled_divide(layout_b, tiler); layout_a and offset are carried over unchanged.
+template <class A, class O, class B, class Tiler>
+CUTE_HOST_DEVICE constexpr auto
+tiled_divide(ComposedLayout<A,O,B> const& c,
+             Tiler                 const& tiler)
+{
+  return composition(c.layout_a(), c.offset(), tiled_divide(c.layout_b(), tiler));
+}
+
+// zipped_divide(layout_b, tiler); layout_a and offset are carried over unchanged.
+template <class A, class O, class B, class Tiler>
+CUTE_HOST_DEVICE constexpr auto
+zipped_divide(ComposedLayout<A,O,B> const& c,
+              Tiler                 const& tiler)
+{
+  return composition(c.layout_a(), c.offset(), zipped_divide(c.layout_b(), tiler));
+}
+
+// flat_divide(layout_b, tiler); layout_a and offset are carried over unchanged.
+template <class A, class O, class B, class Tiler>
+CUTE_HOST_DEVICE constexpr auto
+flat_divide(ComposedLayout<A,O,B> const& c,
+            Tiler                 const& tiler)
+{
+  return composition(c.layout_a(), c.offset(), flat_divide(c.layout_b(), tiler));
+}
+
+// logical_product(layout_b, tiler); layout_a and offset are carried over unchanged.
+template <class A, class O, class B, class Tiler>
+CUTE_HOST_DEVICE constexpr auto
+logical_product(ComposedLayout<A,O,B> const& c,
+                Tiler                 const& tiler)
+{
+  return composition(c.layout_a(), c.offset(), logical_product(c.layout_b(), tiler));
+}
+
+// zipped_product(layout_b, tiler); layout_a and offset are carried over unchanged.
+template <class A, class O, class B, class Tiler>
+CUTE_HOST_DEVICE constexpr auto
+zipped_product(ComposedLayout<A,O,B> const& c,
+               Tiler                 const& tiler)
+{
+  return composition(c.layout_a(), c.offset(), zipped_product(c.layout_b(), tiler));
+}
+
+// tiled_product(layout_b, tiler); layout_a and offset are carried over unchanged.
+template <class A, class O, class B, class Tiler>
+CUTE_HOST_DEVICE constexpr auto
+tiled_product(ComposedLayout<A,O,B> const& c,
+              Tiler                 const& tiler)
+{
+  return composition(c.layout_a(), c.offset(), tiled_product(c.layout_b(), tiler));
+}
+
+// flat_product(layout_b, tiler); layout_a and offset are carried over unchanged.
+template <class A, class O, class B, class Tiler>
+CUTE_HOST_DEVICE constexpr auto
+flat_product(ComposedLayout<A,O,B> const& c,
+             Tiler                 const& tiler)
+{
+  return composition(c.layout_a(), c.offset(), flat_product(c.layout_b(), tiler));
+}
+
+// blocked_product(layout_b, tiler); layout_a and offset are carried over unchanged.
+template <class A, class O, class B, class Tiler>
+CUTE_HOST_DEVICE constexpr auto
+blocked_product(ComposedLayout<A,O,B> const& c,
+                Tiler                 const& tiler)
+{
+  return composition(c.layout_a(), c.offset(), blocked_product(c.layout_b(), tiler));
+}
+
+// raked_product(layout_b, tiler); layout_a and offset are carried over unchanged.
+template <class A, class O, class B, class Tiler>
+CUTE_HOST_DEVICE constexpr auto
+raked_product(ComposedLayout<A,O,B> const& c,
+              Tiler                 const& tiler)
+{
+  return composition(c.layout_a(), c.offset(), raked_product(c.layout_b(), tiler));
+}
+
+// tile_to_shape on layout_b (mode order defaults to GenColMajor); layout_a and offset are kept.
+template <class A, class O, class B,
+          class Shape, class ModeOrder = GenColMajor>
+CUTE_HOST_DEVICE constexpr auto
+tile_to_shape(ComposedLayout<A,O,B> const& c,
+              Shape                 const& trg_shape,
+              ModeOrder             const& ord_shape = {})
+{
+  return composition(c.layout_a(), c.offset(), tile_to_shape(c.layout_b(), trg_shape, ord_shape));
+}
+
+// filter(layout_b, trg_profile); layout_a and offset are carried over unchanged.
+template <class A, class O, class B,
+          class Shape>
+CUTE_HOST_DEVICE constexpr auto
+filter(ComposedLayout<A,O,B> const& c, Shape const& trg_profile)
+{
+  return composition(c.layout_a(), c.offset(), filter(c.layout_b(), trg_profile));
+}
+
+// coalesce applied to layout_b; layout_a and offset are carried over unchanged.
+template <class A, class O, class B>
+CUTE_HOST_DEVICE constexpr auto
+coalesce(ComposedLayout<A,O,B> const& c)
+{
+  return composition(c.layout_a(), c.offset(), coalesce(c.layout_b()));
+}
+
+// coalesce(layout_b, trg_profile); layout_a and offset are carried over unchanged.
+template <class A, class O, class B, class Shape>
+CUTE_HOST_DEVICE constexpr auto
+coalesce(ComposedLayout<A,O,B> const& c, Shape const& trg_profile)
+{
+  return composition(c.layout_a(), c.offset(), coalesce(c.layout_b(), trg_profile));
+}
+
+
+//
+// Upcast and Downcast
+//
+
+// upcast<N> is applied to all three components: layout_a, offset and layout_b.
+template <int N, class A, class O, class B>
+CUTE_HOST_DEVICE constexpr auto
+upcast(ComposedLayout<A,O,B> const& c)
+{
+  return composition(upcast<N>(c.layout_a()), upcast<N>(c.offset()), upcast<N>(c.layout_b()));
+}
+
+// downcast<N> is applied to all three components: layout_a, offset and layout_b.
+template <int N, class A, class O, class B>
+CUTE_HOST_DEVICE constexpr auto
+downcast(ComposedLayout<A,O,B> const& c)
+{
+  return composition(downcast<N>(c.layout_a()), downcast<N>(c.offset()), downcast<N>(c.layout_b()));
+}
+
+
+template <class OldType, class NewType,
+          class A, class O, class B>
+CUTE_HOST_DEVICE constexpr
+auto
+recast_layout(ComposedLayout<A,O,B> const& layout)  // re-express the layout for elements of NewType instead of OldType
+{
+  using scale = decltype(trait_ratio(sizeof_bits<NewType>{}, sizeof_bits<OldType>{}));  // presumably the reduced ratio bits(NewType)/bits(OldType) -- TODO confirm trait_ratio
+  if constexpr (scale::num == 1 && scale::den == 1) {  // ratio 1/1: nothing to do
+    return layout;
+  }
+  else if constexpr (scale::num == 1) {  // ratio 1/den: downcast by den
+    return downcast<scale::den>(layout);
+  }
+  else if constexpr (scale::den == 1) {  // ratio num/1: upcast by num
+    return upcast<scale::num>(layout);
+  }
+  else {  // any other ratio is rejected at compile time
+    static_assert(dependent_false<scale>, "Recast not supported.");
+  }
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <class A, class O, class B>
+CUTE_HOST_DEVICE constexpr
+auto
+max_alignment(ComposedLayout<A,O,B> const& layout)  // `layout` is intentionally unused
+{
+  // Do not attempt for general ComposedLayouts
+  //return gcd(max_alignment(layout.layout_a()), max_alignment(layout.offset()), max_alignment(layout.layout_b()));
+  return Int<1>{};  // always a static 1: no alignment is claimed for a ComposedLayout
+}
+
+//
+// Display utilities
+//
+
+template <class A, class O, class B>
+CUTE_HOST_DEVICE void print(ComposedLayout<A,O,B> const& layout)  // prints "<layout_a> o <offset> o <layout_b>"
+{
+  print(layout.layout_a()); print(" o "); print(layout.offset()); print(" o "); print(layout.layout_b());
+}
+
+#if !defined(__CUDACC_RTC__)
+template <class A, class O, class B>
+CUTE_HOST std::ostream& operator<<(std::ostream& os, ComposedLayout<A,O,B> const& layout)  // host-only; compiled out under __CUDACC_RTC__
+{
+  return os << layout.layout_a() << " o " << layout.offset() << " o " << layout.layout_b();  // same "a o offset o b" format as print()
+}
+#endif
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/numeric/arithmetic_tuple.hpp b/lightllm-kernel/cutlass/include/cute/numeric/arithmetic_tuple.hpp
new file mode 100755
index 000000000..2e4690571
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/numeric/arithmetic_tuple.hpp
@@ -0,0 +1,556 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>
+
+#include <cute/container/tuple.hpp>
+#include <cute/numeric/integral_constant.hpp>
+#include <cute/algorithm/functional.hpp>
+#include <cute/algorithm/tuple_algorithms.hpp>
+#include <cute/util/type_traits.hpp>
+
+namespace cute
+{
+
+template <class... T>
+struct ArithmeticTuple : tuple<T...>  // publicly derives from tuple<T...>; the arithmetic operators in this header are overloaded on this type
+{
+  template <class... U>
+  CUTE_HOST_DEVICE constexpr
+  ArithmeticTuple(ArithmeticTuple<U...> const& u)  // converting constructor from another ArithmeticTuple
+    : tuple<T...>(static_cast<tuple<U...> const&>(u)) {}  // cast to the tuple<U...> base before forwarding
+
+  template <class... U>
+  CUTE_HOST_DEVICE constexpr
+  ArithmeticTuple(tuple<U...> const& u)  // implicit conversion from a plain tuple<U...>
+    : tuple<T...>(u) {}
+
+  template <class... U>
+  CUTE_HOST_DEVICE constexpr
+  ArithmeticTuple(U const&... u)  // element-wise construction; the pack may be empty
+    : tuple<T...>(u...) {}
+};
+
+template <class... T>
+struct is_tuple<ArithmeticTuple<T...>> : true_type {};  // an ArithmeticTuple is reported as a tuple by is_tuple
+
+template <class... Ts>
+struct is_flat<ArithmeticTuple<Ts...>> : is_flat<tuple<Ts...>> {};  // flatness is inherited from the equivalent plain tuple<Ts...>
+
+// Factory: builds an ArithmeticTuple<T...> holding copies of the given elements.
+template <class... T>
+CUTE_HOST_DEVICE constexpr auto
+make_arithmetic_tuple(T const&... elems) {
+  return ArithmeticTuple<T...>(elems...);
+}
+
+template <class T>
+CUTE_HOST_DEVICE constexpr
+auto
+as_arithmetic_tuple(T const& t) {  // recursively convert a (possibly nested) tuple into ArithmeticTuples
+  if constexpr (is_tuple<T>::value) {
+    return detail::tapply(t, [](auto const& x){ return as_arithmetic_tuple(x); },  // first lambda: convert each element recursively
+                          [](auto const&... a){ return make_arithmetic_tuple(a...); },  // second lambda: rebuild the results as an ArithmeticTuple
+                          tuple_seq<T>{});
+  } else {
+    return t;  // non-tuple values are returned unchanged
+  }
+}
+
+//
+// Numeric operators
+//
+
+// Addition
+template <class... T, class... U>
+CUTE_HOST_DEVICE constexpr
+auto
+operator+(ArithmeticTuple<T...> const& t, ArithmeticTuple<U...> const& u) {
+  constexpr int R = cute::max(int(sizeof...(T)), int(sizeof...(U)));  // R = the larger of the two ranks
+  return transform_apply(append<R>(t,Int<0>{}), append<R>(u,Int<0>{}), plus{}, [](auto const&... a){ return make_arithmetic_tuple(a...); });  // both operands go through append<R>(., Int<0>) (presumably zero-padding to rank R -- TODO confirm), then element-wise plus
+}
+
+// ArithmeticTuple + plain tuple: the right operand is converted to an ArithmeticTuple first.
+template <class... T, class... U>
+CUTE_HOST_DEVICE constexpr auto
+operator+(ArithmeticTuple<T...> const& lhs, tuple<U...> const& rhs) {
+  return lhs + ArithmeticTuple<U...>(rhs);
+}
+
+// Plain tuple + ArithmeticTuple: the left operand is converted to an ArithmeticTuple first.
+template <class... T, class... U>
+CUTE_HOST_DEVICE constexpr auto
+operator+(tuple<T...> const& lhs, ArithmeticTuple<U...> const& rhs) {
+  return ArithmeticTuple<T...>(lhs) + rhs;
+}
+
+// Subtraction
+template <class... T, class... U>
+CUTE_HOST_DEVICE constexpr
+auto
+operator-(ArithmeticTuple<T...> const& t, ArithmeticTuple<U...> const& u) {
+  constexpr int R = cute::max(int(sizeof...(T)), int(sizeof...(U)));  // R = the larger of the two ranks
+  return transform_apply(append<R>(t,Int<0>{}), append<R>(u,Int<0>{}), minus{}, [](auto const&... a){ return make_arithmetic_tuple(a...); });  // both operands go through append<R>(., Int<0>) (presumably zero-padding to rank R -- TODO confirm), then element-wise minus
+}
+
+// ArithmeticTuple - plain tuple: the right operand is converted to an ArithmeticTuple first.
+template <class... T, class... U>
+CUTE_HOST_DEVICE constexpr auto
+operator-(ArithmeticTuple<T...> const& lhs, tuple<U...> const& rhs) {
+  return lhs - ArithmeticTuple<U...>(rhs);
+}
+
+// Plain tuple - ArithmeticTuple: the left operand is converted to an ArithmeticTuple first.
+template <class... T, class... U>
+CUTE_HOST_DEVICE constexpr auto
+operator-(tuple<T...> const& lhs, ArithmeticTuple<U...> const& rhs) {
+  return ArithmeticTuple<T...>(lhs) - rhs;
+}
+
+// Negation
+template <class... T>
+CUTE_HOST_DEVICE constexpr
+auto
+operator-(ArithmeticTuple<T...> const& t) {  // unary minus; returns a new ArithmeticTuple by value
+  return transform_apply(t, negate{}, [](auto const&... a){ return make_arithmetic_tuple(a...); });  // negate{} on every element, then repack
+}
+
+//
+// Special cases
+//
+
+template <auto t, class... U>
+CUTE_HOST_DEVICE constexpr
+ArithmeticTuple<U...> const&
+operator+(C<t>, ArithmeticTuple<U...> const& u) {  // static constant + tuple
+  static_assert(t == 0, "Arithmetic tuple op+ error!");  // only a static zero is accepted as the scalar operand
+  return u;  // 0 + u == u; returns a reference to the argument itself
+}
+
+template <class... T, auto u>
+CUTE_HOST_DEVICE constexpr
+ArithmeticTuple<T...> const&
+operator+(ArithmeticTuple<T...> const& t, C<u>) {  // tuple + static constant
+  static_assert(u == 0, "Arithmetic tuple op+ error!");  // only a static zero is accepted as the scalar operand
+  return t;  // t + 0 == t; returns a reference to the argument itself
+}
+
+template <auto t, class... U>
+CUTE_HOST_DEVICE constexpr
+auto  // by value: -u is a freshly built temporary, so a `const&` return type would dangle
+operator-(C<t>, ArithmeticTuple<U...> const& u) {  // static constant - tuple
+  static_assert(t == 0, "Arithmetic tuple op- error!");  // only a static zero is accepted on the left
+  return -u;  // 0 - u == -u, via the unary operator- for ArithmeticTuple
+}
+
+template <class... T, auto u>
+CUTE_HOST_DEVICE constexpr
+ArithmeticTuple<T...> const&
+operator-(ArithmeticTuple<T...> const& t, C<u>) {  // tuple - static constant
+  static_assert(u == 0, "Arithmetic tuple op- error!");  // only a static zero is accepted as the scalar operand
+  return t;  // t - 0 == t; returns a reference to the argument itself
+}
+
+//
+// ArithmeticTupleIterator
+//
+
+template <class ArithTuple>
+struct ArithmeticTupleIterator  // iterator-like wrapper: "dereferencing" yields the stored coordinate itself
+{
+  using value_type   = ArithTuple;
+  using element_type = ArithTuple;
+  using reference    = ArithTuple;  // a value type, not a C++ reference: operator* returns coord_ by value
+
+  ArithTuple coord_;  // the current coordinate
+
+  CUTE_HOST_DEVICE constexpr
+  ArithmeticTupleIterator(ArithTuple const& coord = {}) : coord_(coord) {}  // defaults to a value-initialized coordinate
+
+  CUTE_HOST_DEVICE constexpr
+  ArithTuple operator*() const { return coord_; }
+
+  template <class Coord>
+  CUTE_HOST_DEVICE constexpr
+  auto operator[](Coord const& c) const { return *(*this + c); }  // coordinate at offset c, i.e. coord_ + c
+
+  template <class Coord>
+  CUTE_HOST_DEVICE constexpr
+  auto operator+(Coord const& c) const {  // returns a new iterator; its tuple type follows the type of coord_ + c
+    return ArithmeticTupleIterator<remove_cvref_t<decltype(coord_ + c)>>(coord_ + c);
+  }
+};
+
+// Wraps as_arithmetic_tuple(coord) in an ArithmeticTupleIterator (type deduced via CTAD).
+template <class Tuple>
+CUTE_HOST_DEVICE constexpr auto
+make_inttuple_iter(Tuple const& coord) {
+  return ArithmeticTupleIterator(as_arithmetic_tuple(coord));
+}
+
+// Variadic form (two or more values): packs them with cute::make_tuple, then delegates.
+template <class T0, class T1, class... Ts>
+CUTE_HOST_DEVICE constexpr auto
+make_inttuple_iter(T0 const& v0, T1 const& v1, Ts const&... vs) {
+  return make_inttuple_iter(cute::make_tuple(v0, v1, vs...));
+}
+
+//
+// ArithmeticTuple "basis" elements
+//   A ScaledBasis<T,N> is a (at least) rank-N+1 ArithmeticTuple:
+//      (_0,_0,...,T,_0,...)
+//   with value T in the Nth mode
+
+template <class T, int N>
+struct ScaledBasis : private tuple<T>  // the value is stored in a privately inherited one-element tuple<T>
+{
+  CUTE_HOST_DEVICE constexpr
+  ScaledBasis(T const& t = {}) : tuple<T>(t) {}  // defaults to a value-initialized T
+
+  CUTE_HOST_DEVICE constexpr
+  decltype(auto) value()       { return get<0>(static_cast<tuple<T>      &>(*this)); }  // access to the stored value (non-const overload)
+  CUTE_HOST_DEVICE constexpr
+  decltype(auto) value() const { return get<0>(static_cast<tuple<T> const&>(*this)); }  // access to the stored value (const overload)
+
+  CUTE_HOST_DEVICE static constexpr
+  auto mode() { return Int<N>{}; }  // the mode index N as a static integer
+};
+
+template <class T>
+struct is_scaled_basis : false_type {};  // primary template: T is not a ScaledBasis
+template <class T, int N>
+struct is_scaled_basis<ScaledBasis<T,N>> : true_type {};  // matches any ScaledBasis instantiation
+
+template <class T, int N>
+struct is_integral<ScaledBasis<T,N>> : true_type {};  // a ScaledBasis is reported as integral by cute's is_integral trait
+
+// Get the scalar T out of a ScaledBasis
+template <class SB>
+CUTE_HOST_DEVICE constexpr auto
+basis_value(SB const& e)
+{
+  if constexpr (is_scaled_basis<SB>::value) {
+    return basis_value(e.value());  // recurse: the stored value may itself be a ScaledBasis
+  } else {
+    return e;  // not a ScaledBasis: e is already the scalar
+  }
+  CUTE_GCC_UNREACHABLE;
+}
+
+// Apply the N... pack to another Tuple: select the sub-element of t addressed by the mode(s) of e
+template <class SB, class Tuple>
+CUTE_HOST_DEVICE constexpr decltype(auto)
+basis_get(SB const& e, Tuple&& t)
+{
+  if constexpr (is_scaled_basis<SB>::value) {
+    return basis_get(e.value(), get<SB::mode()>(static_cast<Tuple&&>(t)));  // take element SB::mode() of t, then recurse with the wrapped value
+  } else {
+    return static_cast<Tuple&&>(t);  // e is not a ScaledBasis: forward t unchanged
+  }
+  CUTE_GCC_UNREACHABLE;
+}
+
+namespace detail {
+
+template <class T, int... I>
+CUTE_HOST_DEVICE constexpr
+auto
+to_atuple_i(T const& t, seq<I...>) {  // builds (_0, ..., _0, t) with one leading static zero per index in I...
+  return make_arithmetic_tuple((void(I),Int<0>{})..., t);  // (void(I), Int<0>{}) discards the index and yields a static zero for each I
+}
+
+} // end namespace detail
+
+// Turn a ScaledBasis<T,N> into a rank-N+1 ArithmeticTuple
+//    with N prefix 0s:  (_0,_0,...N...,_0,T)
+template <class T, int N>
+CUTE_HOST_DEVICE constexpr
+auto
+as_arithmetic_tuple(ScaledBasis<T,N> const& t) {
+  return detail::to_atuple_i(as_arithmetic_tuple(t.value()), make_seq<N>{});  // the stored value is converted first, so nested bases expand too
+}
+
+namespace detail {
+
+template <int... Ns>
+struct Basis;  // maps an index pack to a nested ScaledBasis type
+
+template <>
+struct Basis<> {
+  using type = Int<1>;  // base case: an empty pack is the static constant 1
+};
+
+template <int N, int... Ns>
+struct Basis<N,Ns...> {
+  using type = ScaledBasis<typename Basis<Ns...>::type, N>;  // the first index N becomes the outermost mode
+};
+
+} // end namespace detail
+
+// Shortcut for writing ScaledBasis<ScaledBasis<ScaledBasis<Int<1>, N0>, N1>, ...>
+// E<>    := _1
+// E<0>   := (_1,_0,_0,...)
+// E<1>   := (_0,_1,_0,...)
+// E<0,0> := ((_1,_0,_0,...),_0,_0,...)
+// E<0,1> := ((_0,_1,_0,...),_0,_0,...)
+// E<1,0> := (_0,(_1,_0,_0,...),_0,...)
+// E<1,1> := (_0,(_0,_1,_0,...),_0,...)
+template <int... N>
+using E = typename detail::Basis<N...>::type;  // alias for the type computed by detail::Basis<N...>
+
+template <class Shape>
+CUTE_HOST_DEVICE constexpr
+auto
+make_basis_like(Shape const& shape)  // builds a structure of basis elements mirroring the nesting of `shape`
+{
+  if constexpr (is_integral<Shape>::value) {
+    return Int<1>{};  // a single integral extent maps to the static constant 1
+  } else {
+    // Generate bases for each rank of shape
+    return transform(tuple_seq<Shape>{}, shape, [](auto I, auto si) {
+      // Generate bases for each rank of si and add an i on front
+      using I_type = decltype(I);
+      return transform_leaf(make_basis_like(si), [](auto e) {
+        // MSVC has trouble capturing variables as constexpr,
+        // so that they can be used as template arguments.
+        // This is exactly what the code needs to do with i, unfortunately.
+        // The work-around is to define i inside the inner lambda,
+        // by using just the type from the enclosing scope.
+        constexpr int i = I_type::value;
+        return ScaledBasis<decltype(e), i>{};  // wrap leaf e in mode i; default-constructed, so only the types matter
+      });
+    });
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+//
+// Arithmetic
+//
+
+// safe_div on a ScaledBasis: divide the stored value, keep the mode M.
+template <class T, int M, class U>
+CUTE_HOST_DEVICE constexpr auto
+safe_div(ScaledBasis<T,M> const& b, U const& u)
+{
+  auto quotient = safe_div(b.value(), u);
+  return ScaledBasis<decltype(quotient),M>{quotient};
+}
+
+// shape_div on a ScaledBasis: divide the stored value, keep the mode M.
+template <class T, int M, class U>
+CUTE_HOST_DEVICE constexpr auto
+shape_div(ScaledBasis<T,M> const& b, U const& u)
+{
+  auto quotient = shape_div(b.value(), u);
+  return ScaledBasis<decltype(quotient),M>{quotient};
+}
+
+// Equality of two ScaledBasis: the modes must match (static check) and the values must compare equal
+template <class T, int N, class U, int M>
+CUTE_HOST_DEVICE constexpr auto
+operator==(ScaledBasis<T,N> const& lhs, ScaledBasis<U,M> const& rhs)
+{
+  return bool_constant<M == N>{} && lhs.value() == rhs.value();
+}
+
+// Not equal to anything else
+template <class T, int N, class U>
+CUTE_HOST_DEVICE constexpr
+false_type
+operator==(ScaledBasis<T,N> const&, U const&) {  // ScaledBasis on the left, any other type on the right
+  return {};  // statically false
+}
+
+template <class T, class U, int M>
+CUTE_HOST_DEVICE constexpr
+false_type
+operator==(T const&, ScaledBasis<U,M> const&) {  // mirror case: any other type on the left, ScaledBasis on the right
+  return {};  // statically false
+}
+
+// Abs: abs of the stored value, re-wrapped in the same mode N
+template <class T, int N>
+CUTE_HOST_DEVICE constexpr auto
+abs(ScaledBasis<T,N> const& sb)
+{
+  return ScaledBasis<decltype(abs(sb.value())),N>{abs(sb.value())};
+}
+
+// Multiplication (scalar on the left): scales the stored value, keeps the mode N
+template <class A, class T, int N>
+CUTE_HOST_DEVICE constexpr auto
+operator*(A const& a, ScaledBasis<T,N> const& e)
+{
+  auto scaled = a * e.value();
+  return ScaledBasis<decltype(scaled),N>{scaled};
+}
+
+// Multiplication (scalar on the right): scales the stored value, keeps the mode N
+template <class T, int N, class B>
+CUTE_HOST_DEVICE constexpr auto
+operator*(ScaledBasis<T,N> const& e, B const& b) {
+  auto scaled = e.value() * b;
+  return ScaledBasis<decltype(scaled),N>{scaled};
+}
+
+// Addition of two ScaledBasis: both are expanded to ArithmeticTuples and summed
+template <class T, int N, class U, int M>
+CUTE_HOST_DEVICE constexpr auto
+operator+(ScaledBasis<T,N> const& lhs, ScaledBasis<U,M> const& rhs)
+{
+  return as_arithmetic_tuple(lhs) + as_arithmetic_tuple(rhs);
+}
+
+// ScaledBasis + ArithmeticTuple: the basis is expanded to an ArithmeticTuple first.
+template <class T, int N, class... U>
+CUTE_HOST_DEVICE constexpr auto
+operator+(ScaledBasis<T,N> const& lhs, ArithmeticTuple<U...> const& rhs) {
+  return as_arithmetic_tuple(lhs) + rhs;
+}
+
+// ArithmeticTuple + ScaledBasis: the basis is expanded to an ArithmeticTuple first.
+template <class... T, class U, int M>
+CUTE_HOST_DEVICE constexpr auto
+operator+(ArithmeticTuple<T...> const& lhs, ScaledBasis<U,M> const& rhs) {
+  return lhs + as_arithmetic_tuple(rhs);
+}
+
+template <auto t, class U, int M>
+CUTE_HOST_DEVICE constexpr
+auto
+operator+(C<t>, ScaledBasis<U,M> const& u) {  // static constant + ScaledBasis
+  static_assert(t == 0, "ScaledBasis op+ error!");  // only a static zero is accepted as the scalar operand
+  return u;  // 0 + u == u, returned by value
+}
+
+template <class T, int N, auto u>
+CUTE_HOST_DEVICE constexpr
+auto
+operator+(ScaledBasis<T,N> const& t, C<u>) {  // ScaledBasis + static constant
+  static_assert(u == 0, "ScaledBasis op+ error!");  // only a static zero is accepted as the scalar operand
+  return t;  // t + 0 == t, returned by value
+}
+
+//
+// Display utilities
+//
+
+template <class ArithTuple>
+CUTE_HOST_DEVICE void print(ArithmeticTupleIterator<ArithTuple> const& iter)  // prints "ArithTuple" followed by the stored coordinate
+{
+  printf("ArithTuple"); print(iter.coord_);
+}
+
+template <class T, int N>
+CUTE_HOST_DEVICE void print(ScaledBasis<T,N> const& e)  // prints "<value>@<N>"
+{
+  print(e.value()); printf("@%d", N);
+}
+
+#if !defined(__CUDACC_RTC__)
+template <class ArithTuple>
+CUTE_HOST std::ostream& operator<<(std::ostream& os, ArithmeticTupleIterator<ArithTuple> const& iter)  // host-only; compiled out under __CUDACC_RTC__
+{
+  return os << "ArithTuple" << iter.coord_;  // same text as the print() overload
+}
+
+template <class T, int N>
+CUTE_HOST std::ostream& operator<<(std::ostream& os, ScaledBasis<T,N> const& e)  // host-only; compiled out under __CUDACC_RTC__
+{
+  return os << e.value() << "@" << N;  // "<value>@<N>", same text as the print() overload
+}
+#endif
+
+} // end namespace cute
+
+
+namespace CUTE_STL_NAMESPACE
+{
+
+template <class... T>
+struct tuple_size<cute::ArithmeticTuple<T...>>  // tuple_size of an ArithmeticTuple is its element count
+  : CUTE_STL_NAMESPACE::integral_constant<size_t, sizeof...(T)>
+{};
+
+template <size_t I, class... T>
+struct tuple_element<I, cute::ArithmeticTuple<T...>>  // element types are taken from the equivalent STL tuple<T...>
+  : CUTE_STL_NAMESPACE::tuple_element<I, CUTE_STL_NAMESPACE::tuple<T...>>
+{};
+
+template <class... T>
+struct tuple_size<const cute::ArithmeticTuple<T...>>  // const-qualified variant
+  : CUTE_STL_NAMESPACE::integral_constant<size_t, sizeof...(T)>
+{};
+
+template <size_t I, class... T>
+struct tuple_element<I, const cute::ArithmeticTuple<T...>>  // const-qualified variant
+  : CUTE_STL_NAMESPACE::tuple_element<I, const CUTE_STL_NAMESPACE::tuple<T...>>
+{};
+
+} // end namespace CUTE_STL_NAMESPACE
+
+#ifdef CUTE_STL_NAMESPACE_IS_CUDA_STD
+namespace std
+{
+
+#if defined(__CUDACC_RTC__)
+template <class... _Tp>
+struct tuple_size;  // forward declaration, supplied only under __CUDACC_RTC__
+
+template <size_t _Ip, class... _Tp>
+struct tuple_element;  // forward declaration, supplied only under __CUDACC_RTC__
+#endif
+
+template <class... T>
+struct tuple_size<cute::ArithmeticTuple<T...>>  // same specializations as in CUTE_STL_NAMESPACE, repeated for namespace std
+  : CUTE_STL_NAMESPACE::integral_constant<size_t, sizeof...(T)>
+{};
+
+template <size_t I, class... T>
+struct tuple_element<I, cute::ArithmeticTuple<T...>>  // element types are taken from the equivalent STL tuple<T...>
+  : CUTE_STL_NAMESPACE::tuple_element<I, CUTE_STL_NAMESPACE::tuple<T...>>
+{};
+
+template <class... T>
+struct tuple_size<const cute::ArithmeticTuple<T...>>  // const-qualified variant
+  : CUTE_STL_NAMESPACE::integral_constant<size_t, sizeof...(T)>
+{};
+
+template <size_t I, class... T>
+struct tuple_element<I, const cute::ArithmeticTuple<T...>>  // const-qualified variant
+  : CUTE_STL_NAMESPACE::tuple_element<I, const CUTE_STL_NAMESPACE::tuple<T...>>
+{};
+
+} // end namespace std
+#endif // CUTE_STL_NAMESPACE_IS_CUDA_STD
diff --git a/lightllm-kernel/cutlass/include/cute/numeric/complex.hpp b/lightllm-kernel/cutlass/include/cute/numeric/complex.hpp
new file mode 100755
index 000000000..7dd9ea5bf
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/numeric/complex.hpp
@@ -0,0 +1,76 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>    // CUTE_HOST_DEVICE
+
+#include <cutlass/complex.h>  // cutlass::complex, cutlass::real, cutlass::imag, cutlass::is_complex
+
+namespace cute
+{
+
+using cutlass::complex;
+using cutlass::is_complex;
+using cutlass::RealType;
+using cutlass::real;
+using cutlass::imag;
+using cutlass::conj;
+
+template <class T>
+static constexpr auto is_complex_v = is_complex<T>::value;  // variable-template shorthand for is_complex<T>::value
+
+/// Fused multiply-add for complex numbers
+template <class D, class A, class B, class C>
+CUTE_HOST_DEVICE constexpr
+void
+fma(complex<D>      & d,
+    complex<A> const& a,
+    complex<B> const& b,
+    complex<C> const& c)  // assumes the scalar fma(d, a, b, c) called below stores a*b + c into d -- TODO confirm against its definition
+{
+  fma(d.real(),  a.real(), b.real(), c.real());  // under that assumption: d.re  = a.re*b.re + c.re
+  fma(d.imag(),  a.real(), b.imag(), c.imag());  // d.im  = a.re*b.im + c.im
+  fma(d.real(), -a.imag(), b.imag(), d.real());  // d.re -= a.im*b.im  (accumulates into the value just written)
+  fma(d.imag(),  a.imag(), b.real(), d.imag());  // d.im += a.im*b.re
+}
+
+/// Fused multiply-add for triplets
+template <class A, class B, class C>
+CUTE_HOST_DEVICE constexpr
+void
+fma(complex<A> const& a,
+    complex<B> const& b,
+    complex<C>      & c)  // c is both the addend and the destination
+{
+  return fma(c, a, b, c);  // delegates to the four-argument overload with d aliased to c
+}
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/numeric/int.hpp b/lightllm-kernel/cutlass/include/cute/numeric/int.hpp
new file mode 100755
index 000000000..571b3e3ed
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/numeric/int.hpp
@@ -0,0 +1,106 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/cstdint>
+#else
+#include <cstdint>
+#endif
+
+#include <cute/config.hpp>          // CUTE_STL_NAMESPACE
+
+#include <cutlass/numeric_types.h>  // cutlass::int2b_t, cutlass::int4b_t
+
+namespace cute
+{
+
+//
+// Signed integers
+//
+
+using int2_t  = cutlass::int2b_t;
+using int4_t  = cutlass::int4b_t;
+using CUTE_STL_NAMESPACE::int8_t;
+using CUTE_STL_NAMESPACE::int16_t;
+using CUTE_STL_NAMESPACE::int32_t;
+using CUTE_STL_NAMESPACE::int64_t;
+
+template <int N> struct int_bit;
+template <> struct int_bit<  2>  { using type = int2_t; };
+template <> struct int_bit<  4>  { using type = int4_t; };
+template <> struct int_bit<  8>  { using type = int8_t;  };
+template <> struct int_bit< 16>  { using type = int16_t; };
+template <> struct int_bit< 32>  { using type = int32_t; };
+template <> struct int_bit< 64>  { using type = int64_t; };
+
+template <int N>
+using int_bit_t = typename int_bit<N>::type;
+
+template <int N>
+using int_byte = int_bit<8*N>;
+
+template <int N>
+using int_byte_t = typename int_byte<N>::type;
+
+//
+// Unsigned integers
+//
+
+using uint1_t   = cutlass::uint1b_t;
+using uint2_t   = cutlass::uint2b_t;
+using uint4_t   = cutlass::uint4b_t;
+using CUTE_STL_NAMESPACE::uint8_t;
+using CUTE_STL_NAMESPACE::uint16_t;
+using CUTE_STL_NAMESPACE::uint32_t;
+using CUTE_STL_NAMESPACE::uint64_t;
+using cutlass::uint128_t;
+
+template <int N> struct uint_bit;
+template <> struct uint_bit<  1> { using type = uint1_t; };
+template <> struct uint_bit<  2> { using type = uint2_t; };
+template <> struct uint_bit<  4> { using type = uint4_t; };
+template <> struct uint_bit<  8> { using type = uint8_t;  };
+template <> struct uint_bit< 16> { using type = uint16_t; };
+template <> struct uint_bit< 32> { using type = uint32_t; };
+template <> struct uint_bit< 64> { using type = uint64_t; };
+template <> struct uint_bit<128> { using type = cutlass::uint128_t; };
+
+template <int N>
+using uint_bit_t = typename uint_bit<N>::type;
+
+template <int N>
+using uint_byte = uint_bit<8*N>;
+
+template <int N>
+using uint_byte_t = typename uint_byte<N>::type;
+
+} // namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/numeric/integer_sequence.hpp b/lightllm-kernel/cutlass/include/cute/numeric/integer_sequence.hpp
new file mode 100755
index 000000000..608017958
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/numeric/integer_sequence.hpp
@@ -0,0 +1,151 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>
+#include <cute/util/type_traits.hpp>
+#include <cute/numeric/integral_constant.hpp>
+
+namespace cute
+{
+
+using CUTE_STL_NAMESPACE::integer_sequence;
+using CUTE_STL_NAMESPACE::make_integer_sequence;
+
+namespace detail {
+
+template <class T, class S, T Begin>
+struct range_impl;
+
+template <class T, T... N, T Begin>
+struct range_impl<T, integer_sequence<T, N...>, Begin> {
+  using type = integer_sequence<T, N+Begin...>;
+};
+
+template <class S>
+struct reverse_impl;
+
+template <class T, T... N>
+struct reverse_impl<integer_sequence<T, N...>> {
+  using type = integer_sequence<T, sizeof...(N)-1-N...>;
+};
+
+} // end namespace detail
+
+template <class T, T Begin, T End>  // integer_sequence<T, Begin, ..., End-1>; empty when End <= Begin
+using make_integer_range = typename detail::range_impl<
+    T,
+    make_integer_sequence<T, (End > Begin) ? (End-Begin) : 0>,  // compare before subtracting: End-Begin wraps for unsigned T
+    Begin>::type;
+
+template <class T, T N>
+using make_integer_sequence_reverse = typename detail::reverse_impl<
+    make_integer_sequence<T, N>>::type;
+
+//
+// Common aliases
+//
+
+// int_sequence
+
+template <int... Ints>
+using int_sequence = integer_sequence<int, Ints...>;
+
+template <int N>
+using make_int_sequence = make_integer_sequence<int, N>;
+
+template <int N>
+using make_int_rsequence = make_integer_sequence_reverse<int, N>;
+
+template <int Begin, int End>
+using make_int_range = make_integer_range<int, Begin, End>;
+
+// index_sequence
+
+template <size_t... Ints>
+using index_sequence = integer_sequence<size_t, Ints...>;
+
+template <size_t N>
+using make_index_sequence = make_integer_sequence<size_t, N>;
+
+template <size_t N>
+using make_index_rsequence = make_integer_sequence_reverse<size_t, N>;
+
+template <size_t Begin, size_t End>
+using make_index_range = make_integer_range<size_t, Begin, End>;
+
+//
+// Shortcuts
+//
+
+template <int... Ints>
+using seq = int_sequence<Ints...>;
+
+template <int N>
+using make_seq = make_int_sequence<N>;
+
+template <int N>
+using make_rseq = make_int_rsequence<N>;
+
+template <int Min, int Max>
+using make_range = make_int_range<Min, Max>;
+
+template <class Tuple>
+using tuple_seq = make_seq<tuple_size<remove_cvref_t<Tuple>>::value>;
+
+template <class Tuple>
+using tuple_rseq = make_rseq<tuple_size<remove_cvref_t<Tuple>>::value>;
+
+//
+// Specialize cute::tuple-traits for std::integer_sequence
+//
+
+template <class T, T... Ints>
+struct tuple_size<integer_sequence<T, Ints...>>
+    : cute::integral_constant<size_t, sizeof...(Ints)>
+{};
+
+template <size_t I, class T, T... Is>
+struct tuple_element<I, integer_sequence<T, Is...>>
+{
+  constexpr static T idx[sizeof...(Is)] = {Is...};
+  using type = cute::integral_constant<T, idx[I]>;
+};
+
+template <size_t I, class T, T... Ints>
+CUTE_HOST_DEVICE constexpr
+tuple_element_t<I, integer_sequence<T, Ints...>>
+get(integer_sequence<T, Ints...>) {
+  static_assert(I < sizeof...(Ints), "Index out of range");
+  return {};
+}
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/numeric/integral_constant.hpp b/lightllm-kernel/cutlass/include/cute/numeric/integral_constant.hpp
new file mode 100755
index 000000000..3a8d036ee
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/numeric/integral_constant.hpp
@@ -0,0 +1,517 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/numeric/math.hpp>      // cute::max, etc
+#include <cute/util/print.hpp>        // cute::print
+#include <cute/util/type_traits.hpp>  // __CUTE_REQUIRES, cute::is_std_integral
+
+namespace cute
+{
+
+// A constant value: short name and type-deduction for fast compilation
+template <auto v>
+struct C {
+  using type = C<v>;
+  static constexpr auto value = v;
+  using value_type = decltype(v);
+  CUTE_HOST_DEVICE constexpr operator   value_type() const noexcept { return value; }
+  CUTE_HOST_DEVICE constexpr value_type operator()() const noexcept { return value; }
+};
+
+// Deprecate
+template <class T, T v>
+using constant = C<v>;
+
+template <bool b>
+using bool_constant = C<b>;
+
+using true_type  = bool_constant<true>;
+using false_type = bool_constant<false>;
+
+// A more std:: conforming integral_constant that enforces type but interops with C<v>
+template <class T, T v>
+struct integral_constant : C<v> {
+  using type = integral_constant<T,v>;
+  static constexpr T value = v;
+  using value_type = T;
+  // Disambiguate C<v>::operator value_type()
+  //CUTE_HOST_DEVICE constexpr operator   value_type() const noexcept { return value; }
+  CUTE_HOST_DEVICE constexpr value_type operator()() const noexcept { return value; }
+};
+
+//
+// Traits
+//
+
+// Use cute::is_std_integral<T> to match built-in integral types (int, int64_t, unsigned, etc)
+// Use cute::is_integral<T> to match both built-in integral types AND static integral types.
+
+template <class T>
+struct is_integral : bool_constant<is_std_integral<T>::value> {};
+template <auto v>
+struct is_integral<C<v>                  > : true_type {};
+template <class T, T v>
+struct is_integral<integral_constant<T,v>> : true_type {};
+
+// Register FastDivmod as the integral type
+template<>
+struct is_integral<cutlass::FastDivmod> : true_type {};
+
+// is_static detects if an (abstract) value is defined completely by its type (no members)
+template <class T>
+struct is_static : bool_constant<is_empty<remove_cvref_t<T>>::value> {};
+
+template <class T>
+constexpr bool is_static_v = is_static<T>::value;
+
+// is_constant detects if a type is a static integral type and if v is equal to a value
+
+template <auto n, class T>
+struct is_constant : false_type {};
+template <auto n, class T>
+struct is_constant<n, T const > : is_constant<n,T> {};
+template <auto n, class T>
+struct is_constant<n, T const&> : is_constant<n,T> {};
+template <auto n, class T>
+struct is_constant<n, T      &> : is_constant<n,T> {};
+template <auto n, class T>
+struct is_constant<n, T     &&> : is_constant<n,T> {};
+template <auto n, auto v>
+struct is_constant<n, C<v>                  > : bool_constant<v == n> {};
+template <auto n, class T, T v>
+struct is_constant<n, integral_constant<T,v>> : bool_constant<v == n> {};
+
+//
+// Specializations
+//
+
+template <int v>
+using Int = C<v>;
+
+using _m32    = Int<-32>;
+using _m24    = Int<-24>;
+using _m16    = Int<-16>;
+using _m12    = Int<-12>;
+using _m10    = Int<-10>;
+using _m9     = Int<-9>;
+using _m8     = Int<-8>;
+using _m7     = Int<-7>;
+using _m6     = Int<-6>;
+using _m5     = Int<-5>;
+using _m4     = Int<-4>;
+using _m3     = Int<-3>;
+using _m2     = Int<-2>;
+using _m1     = Int<-1>;
+using _0      = Int<0>;
+using _1      = Int<1>;
+using _2      = Int<2>;
+using _3      = Int<3>;
+using _4      = Int<4>;
+using _5      = Int<5>;
+using _6      = Int<6>;
+using _7      = Int<7>;
+using _8      = Int<8>;
+using _9      = Int<9>;
+using _10     = Int<10>;
+using _12     = Int<12>;
+using _16     = Int<16>;
+using _24     = Int<24>;
+using _32     = Int<32>;
+using _40     = Int<40>;
+using _48     = Int<48>;
+using _56     = Int<56>;
+using _64     = Int<64>;
+using _72     = Int<72>;
+using _80     = Int<80>;
+using _88     = Int<88>;
+using _96     = Int<96>;
+using _104    = Int<104>;
+using _112    = Int<112>;
+using _120    = Int<120>;
+using _128    = Int<128>;
+using _136    = Int<136>;
+using _144    = Int<144>;
+using _152    = Int<152>;
+using _160    = Int<160>;
+using _168    = Int<168>;
+using _176    = Int<176>;
+using _184    = Int<184>;
+using _192    = Int<192>;
+using _200    = Int<200>;
+using _208    = Int<208>;
+using _216    = Int<216>;
+using _224    = Int<224>;
+using _232    = Int<232>;
+using _240    = Int<240>;
+using _248    = Int<248>;
+using _256    = Int<256>;
+using _384    = Int<384>;
+using _512    = Int<512>;
+using _768    = Int<768>;
+using _1024   = Int<1024>;
+using _2048   = Int<2048>;
+using _4096   = Int<4096>;
+using _8192   = Int<8192>;
+using _16384  = Int<16384>;
+using _32768  = Int<32768>;
+using _65536  = Int<65536>;
+using _131072 = Int<131072>;
+using _262144 = Int<262144>;
+using _524288 = Int<524288>;
+
+/***************/
+/** Operators **/
+/***************/
+
+#define CUTE_LEFT_UNARY_OP(OP)                                       \
+  template <auto t>                                                  \
+  CUTE_HOST_DEVICE constexpr                                         \
+  C<(OP t)> operator OP (C<t>) {                                     \
+    return {};                                                       \
+  }
+#define CUTE_RIGHT_UNARY_OP(OP)                                      \
+  template <auto t>                                                  \
+  CUTE_HOST_DEVICE constexpr                                         \
+  C<(t OP)> operator OP (C<t>) {                                     \
+    return {};                                                       \
+  }
+#define CUTE_BINARY_OP(OP)                                           \
+  template <auto t, auto u>                                          \
+  CUTE_HOST_DEVICE constexpr                                         \
+  C<(t OP u)> operator OP (C<t>, C<u>) {                             \
+    return {};                                                       \
+  }
+
+CUTE_LEFT_UNARY_OP(+);
+CUTE_LEFT_UNARY_OP(-);
+CUTE_LEFT_UNARY_OP(~);
+CUTE_LEFT_UNARY_OP(!);
+CUTE_LEFT_UNARY_OP(*);
+
+CUTE_BINARY_OP( +);
+CUTE_BINARY_OP( -);
+CUTE_BINARY_OP( *);
+CUTE_BINARY_OP( /);
+CUTE_BINARY_OP( %);
+CUTE_BINARY_OP( &);
+CUTE_BINARY_OP( |);
+CUTE_BINARY_OP( ^);
+CUTE_BINARY_OP(<<);
+CUTE_BINARY_OP(>>);
+
+CUTE_BINARY_OP(&&);
+CUTE_BINARY_OP(||);
+
+CUTE_BINARY_OP(==);
+CUTE_BINARY_OP(!=);
+CUTE_BINARY_OP( >);
+CUTE_BINARY_OP( <);
+CUTE_BINARY_OP(>=);
+CUTE_BINARY_OP(<=);
+
+#undef CUTE_BINARY_OP
+#undef CUTE_LEFT_UNARY_OP
+#undef CUTE_RIGHT_UNARY_OP
+
+//
+// Mixed static-dynamic special cases
+//
+
+template <auto t, class U,
+          __CUTE_REQUIRES(is_std_integral<U>::value && t == 0)>
+CUTE_HOST_DEVICE constexpr
+C<0>
+operator*(C<t>, U) {
+  return {};
+}
+
+template <class U, auto t,
+          __CUTE_REQUIRES(is_std_integral<U>::value && t == 0)>
+CUTE_HOST_DEVICE constexpr
+C<0>
+operator*(U, C<t>) {
+  return {};
+}
+
+template <auto t, class U,
+          __CUTE_REQUIRES(is_std_integral<U>::value && t == 0)>
+CUTE_HOST_DEVICE constexpr
+C<0>
+operator/(C<t>, U) {
+  return {};
+}
+
+template <class U, auto t,
+          __CUTE_REQUIRES(is_std_integral<U>::value && (t == 1 || t == -1))>
+CUTE_HOST_DEVICE constexpr
+C<0>
+operator%(U, C<t>) {
+  return {};
+}
+
+template <auto t, class U,
+          __CUTE_REQUIRES(is_std_integral<U>::value && t == 0)>
+CUTE_HOST_DEVICE constexpr
+C<0>
+operator%(C<t>, U) {
+  return {};
+}
+
+template <auto t, class U,
+          __CUTE_REQUIRES(is_std_integral<U>::value && t == 0)>
+CUTE_HOST_DEVICE constexpr
+C<0>
+operator&(C<t>, U) {
+  return {};
+}
+
+template <class U, auto t,
+          __CUTE_REQUIRES(is_std_integral<U>::value && t == 0)>
+CUTE_HOST_DEVICE constexpr
+C<0>
+operator&(U, C<t>) {
+  return {};
+}
+
+template <auto t, class U,
+          __CUTE_REQUIRES(is_std_integral<U>::value && !bool(t))>
+CUTE_HOST_DEVICE constexpr
+C<false>
+operator&&(C<t>, U) {
+  return {};
+}
+
+template <auto t, class U,
+          __CUTE_REQUIRES(is_std_integral<U>::value && !bool(t))>
+CUTE_HOST_DEVICE constexpr
+C<false>
+operator&&(U, C<t>) {
+  return {};
+}
+
+template <class U, auto t,
+          __CUTE_REQUIRES(is_std_integral<U>::value && bool(t))>
+CUTE_HOST_DEVICE constexpr
+C<true>
+operator||(C<t>, U) {
+  return {};
+}
+
+template <class U, auto t,
+          __CUTE_REQUIRES(is_std_integral<U>::value && bool(t))>
+CUTE_HOST_DEVICE constexpr
+C<true>
+operator||(U, C<t>) {
+  return {};
+}
+
+//
+// Named functions from math.hpp
+//
+
+#define CUTE_NAMED_UNARY_FN(OP)                                      \
+  template <auto t>                                                  \
+  CUTE_HOST_DEVICE constexpr                                         \
+  C<OP(t)> OP (C<t>) {                                               \
+    return {};                                                       \
+  }
+#define CUTE_NAMED_BINARY_FN(OP)                                     \
+  template <auto t, auto u>                                          \
+  CUTE_HOST_DEVICE constexpr                                         \
+  C<OP(t,u)> OP (C<t>, C<u>) {                                       \
+    return {};                                                       \
+  }                                                                  \
+  template <auto t, class U,                                         \
+            __CUTE_REQUIRES(is_std_integral<U>::value)>              \
+  CUTE_HOST_DEVICE constexpr                                         \
+  auto OP (C<t>, U u) {                                              \
+    return OP(t,u);                                                  \
+  }                                                                  \
+  template <class T, auto u,                                         \
+            __CUTE_REQUIRES(is_std_integral<T>::value)>              \
+  CUTE_HOST_DEVICE constexpr                                         \
+  auto OP (T t, C<u>) {                                              \
+    return OP(t,u);                                                  \
+  }
+
+CUTE_NAMED_UNARY_FN(abs);
+CUTE_NAMED_UNARY_FN(signum);
+CUTE_NAMED_UNARY_FN(has_single_bit);
+
+CUTE_NAMED_BINARY_FN(max);
+CUTE_NAMED_BINARY_FN(min);
+CUTE_NAMED_BINARY_FN(shiftl);
+CUTE_NAMED_BINARY_FN(shiftr);
+CUTE_NAMED_BINARY_FN(gcd);
+CUTE_NAMED_BINARY_FN(lcm);
+
+#undef CUTE_NAMED_UNARY_FN
+#undef CUTE_NAMED_BINARY_FN
+
+//
+// Other functions
+//
+
+template <auto t, auto u>
+CUTE_HOST_DEVICE constexpr
+C<t / u>
+safe_div(C<t>, C<u>) {
+  static_assert(t % u == 0, "Static safe_div requires t % u == 0");
+  return {};
+}
+
+template <auto t, class U,
+          __CUTE_REQUIRES(is_std_integral<U>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+safe_div(C<t>, U u) {
+  return t / u;
+}
+
+template <class T, auto u,
+          __CUTE_REQUIRES(is_std_integral<T>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+safe_div(T t, C<u>) {
+  return t / u;
+}
+
+template <class TrueType, class FalseType>
+CUTE_HOST_DEVICE constexpr
+decltype(auto)
+conditional_return(true_type, TrueType&& t, FalseType&&) {
+  return static_cast<TrueType&&>(t);
+}
+
+template <class TrueType, class FalseType>
+CUTE_HOST_DEVICE constexpr
+decltype(auto)
+conditional_return(false_type, TrueType&&, FalseType&& f) {
+  return static_cast<FalseType&&>(f);
+}
+
+template <auto v>  // overload for two identical static values
+CUTE_HOST_DEVICE constexpr
+auto
+conditional_return(bool /*b*/, C<v> const&, C<v> const&) {  // flag left unnamed: both choices are the same C<v>
+  return C<v>{};
+}
+
+template <auto v, auto u>
+CUTE_HOST_DEVICE constexpr
+auto
+conditional_return(bool b, C<v> const&, C<u> const&) {
+  return b ? v : u;
+}
+
+// TrueType and FalseType must have a common type
+template <class TrueType, class FalseType>
+CUTE_HOST_DEVICE constexpr
+auto
+conditional_return(bool b, TrueType const& t, FalseType const& f) {
+  return b ? t : f;
+}
+
+// TrueType and FalseType don't require a common type
+template <bool b, class TrueType, class FalseType>
+CUTE_HOST_DEVICE constexpr
+auto
+conditional_return(TrueType const& t, FalseType const& f) {
+  if constexpr (b) {
+    return t;
+  } else {
+    return f;
+  }
+}
+
+template <class Trait>
+CUTE_HOST_DEVICE constexpr
+auto
+static_value()
+{
+  if constexpr (is_std_integral<decltype(Trait::value)>::value) {
+    return Int<Trait::value>{};
+  } else {
+    return Trait::value;
+  }
+  CUTE_GCC_UNREACHABLE;
+}
+
+//
+// Display utilities
+//
+
+template <auto Value>
+CUTE_HOST_DEVICE void print(C<Value>) {
+  printf("_");
+  ::cute::print(Value);
+}
+
+#if !defined(__CUDACC_RTC__)
+template <auto t>
+CUTE_HOST std::ostream& operator<<(std::ostream& os, C<t> const&) {
+  return os << "_" << t;
+}
+#endif
+
+
+namespace detail {
+
+// parse_int_digits takes a variadic number of digits and converts them into an int
+template <class... Ts>
+constexpr uint64_t parse_int_digits(uint64_t result, int digit, Ts... digits)
+{
+  if constexpr (sizeof...(Ts) == 0) {
+    return 10 * result + digit;
+  } else {
+    return parse_int_digits(10 * result + digit, digits...);
+  }
+}
+
+} // end namespace detail
+
+
+// This user-defined literal operator allows cute::constant written as literals. For example,
+//
+//    auto var = 32_c;
+//
+//  var has type cute::constant<int,32>.
+//
+template <char... digits>  // literal operator template: receives the literal's characters one by one
+constexpr cute::constant<int,detail::parse_int_digits(0, (digits - '0')...)> operator ""_c()  // no space before the suffix: the `"" _c` spelling is deprecated
+{
+  static_assert((('0' <= digits && digits <= '9') && ...),
+                "Expected 0 <= digit <= 9 for each digit of the integer.");
+  return {};
+}
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/numeric/integral_ratio.hpp b/lightllm-kernel/cutlass/include/cute/numeric/integral_ratio.hpp
new file mode 100755
index 000000000..1b1432533
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/numeric/integral_ratio.hpp
@@ -0,0 +1,264 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>                     // CUTE_HOST_DEVICE
+#include <cute/numeric/integral_constant.hpp>  // cute::false_type, cute::true_type
+#include <cute/numeric/math.hpp>               // cute::signum
+#include <cute/util/type_traits.hpp>           // __CUTE_REQUIRES
+
+namespace cute
+{
+
+/** Compile-time rational arithmetic type.
+ * Like cute::C for std::integral_constant, cute::R for std::ratio has a short name
+ *   for error messages and compile times.
+ * The static data members @a num and @a den represent the reduced numerator and denominator
+ *   of the rational value. Thus, two cute::R types with different @a n or @a d are distinct types
+ *   even if they represent the same rational value.
+ * A cute::R exposes the reduced canonical type via its ::type member.
+ *   That is, cute::R<3,6>::type is cute::R<1,2> and cute::R<6,3>::type is cute::C<2>.
+ * A cute::R<n,d>::value can be used much like any other trait::value. It can be involved in
+ *   arithmetic expressions (according to the operator-overloads for cute::C and cute::R,
+ *   though these may be incomplete) but with a potential rational value rather than an integral value.
+ */
+template <auto n, auto d>
+class R {
+  static_assert(d != 0);
+  static constexpr auto an  = abs(n);
+  static constexpr auto ad  = abs(d);
+  static constexpr auto g   = gcd(an, ad);
+
+ public:
+  static constexpr auto num = signum(n) * signum(d) * an / g;
+  static constexpr auto den =                         ad / g;
+  // RI: den >= 1 && gcd(abs(num),den) == 1
+  using type = typename conditional<num == 0 || den == 1, C<num>, R<num,den>>::type;
+};
+
+template <class T>
+struct is_ratio : false_type {};
+template <auto n, auto d>
+struct is_ratio<R<n,d>> : true_type {};
+
+template <auto a, auto b>
+CUTE_HOST_DEVICE constexpr
+typename R<a,b>::type
+ratio(C<a>, C<b>) {
+  return {};
+}
+
+template <auto a, auto b, auto c>
+CUTE_HOST_DEVICE constexpr
+typename R<a*c,b>::type
+ratio(C<a>, R<b,c>) {
+  return {};
+}
+
+template <auto a, auto b, auto c>
+CUTE_HOST_DEVICE constexpr
+typename R<b,a*c>::type
+ratio(R<b,c>, C<a>) {
+  return {};
+}
+
+template <auto a, auto b, auto c, auto d>
+CUTE_HOST_DEVICE constexpr
+typename R<a*d,b*c>::type
+ratio(R<a,b>, R<c,d>) {
+  return {};
+}
+
+//
+// Non-reduced ratio implementations
+//
+
+template <auto a, auto b>
+CUTE_HOST_DEVICE constexpr
+R<a,b>
+nratio(C<a>, C<b>) {
+  return {};
+}
+
+template <auto a, auto b, auto c>
+CUTE_HOST_DEVICE constexpr
+R<a*c,b>
+nratio(C<a>, R<b,c>) {
+  return {};
+}
+
+template <auto a, auto b, auto c>
+CUTE_HOST_DEVICE constexpr
+R<b,a*c>
+nratio(R<b,c>, C<a>) {
+  return {};
+}
+
+template <auto a, auto b, auto c, auto d>
+CUTE_HOST_DEVICE constexpr
+R<a*d,b*c>
+nratio(R<a,b>, R<c,d>) {
+  return {};
+}
+
+//
+// Operators
+//
+
+template <auto a, auto b, auto x, auto y>
+CUTE_HOST_DEVICE constexpr
+typename R<a*x,b*y>::type
+operator*(R<a,b>, R<x,y>) {
+  return {};
+}
+
+template <auto a, auto b, auto c>
+CUTE_HOST_DEVICE constexpr
+typename R<a*c,b>::type
+operator*(R<a,b>, C<c>) {
+  return {};
+}
+
+template <auto c, auto a, auto b>
+CUTE_HOST_DEVICE constexpr
+typename R<a*c,b>::type
+operator*(C<c>, R<a,b>) {
+  return {};
+}
+
+template <auto c, auto a, auto b>
+CUTE_HOST_DEVICE constexpr
+typename R<c*b,a>::type
+operator/(C<c>, R<a,b>) {
+  return {};
+}
+
+// Product with dynamic type needs to produce an integer...
+template <class C, auto a, auto b,
+          __CUTE_REQUIRES(cute::is_std_integral<C>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+operator*(C const& c, R<a,b>) {
+  return c * R<a,b>::num / R<a,b>::den;
+}
+
+// Product with dynamic type needs to produce an integer...
+template <auto a, auto b, class C,
+          __CUTE_REQUIRES(cute::is_std_integral<C>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+operator*(R<a,b>, C const& c) {
+  return c * R<a,b>::num / R<a,b>::den;
+}
+
+template <auto a, auto b, auto x, auto y>
+CUTE_HOST_DEVICE constexpr
+typename R<a*y+b*x, b*y>::type
+operator+(R<a,b>, R<x,y>) {
+  return {};
+}
+
+template <auto a, auto b, auto c>
+CUTE_HOST_DEVICE constexpr
+typename R<a+c*b,b>::type
+operator+(R<a,b>, C<c>) {
+  return {};
+}
+
+template <auto c, auto a, auto b>
+CUTE_HOST_DEVICE constexpr
+typename R<a+c*b,b>::type
+operator+(C<c>, R<a,b>) {
+  return {};
+}
+
+template <auto a, auto b, auto x, auto y>
+CUTE_HOST_DEVICE constexpr
+bool_constant<R<a,b>::num == R<x,y>::num && R<a,b>::den == R<x,y>::den>
+operator==(R<a,b>, R<x,y>) {
+  return {};
+}
+
+template <auto a, auto b, auto c>
+CUTE_HOST_DEVICE constexpr
+bool_constant<R<a,b>::num == c && R<a,b>::den == 1>
+operator==(R<a,b>, C<c>) {
+  return {};
+}
+
+template <auto c, auto a, auto b>
+CUTE_HOST_DEVICE constexpr
+bool_constant<R<a,b>::num == c && R<a,b>::den == 1>
+operator==(C<c>, R<a,b>) {
+  return {};
+}
+
+template <auto a, auto b>
+CUTE_HOST_DEVICE constexpr
+typename R<abs(a),abs(b)>::type
+abs(R<a,b>) {
+  return {};
+}
+
+template <auto a, auto b>
+CUTE_HOST_DEVICE constexpr
+int32_t
+log_2(R<a,b>) {
+  static_assert(R<a,b>::num > 0);
+  static_assert(R<a,b>::den > 0);
+  return log_2(static_cast<uint32_t>(R<a,b>::num)) - log_2(static_cast<uint32_t>(R<a,b>::den));
+}
+
+// @return A non-reduced ratio cute::R of the Trait0::value / Trait1::value
+template <class Trait0, class Trait1>
+CUTE_HOST_DEVICE constexpr
+auto
+trait_ratio(Trait0, Trait1) {
+  return nratio(static_value<Trait0>(), static_value<Trait1>());
+}
+
+//
+// Display utilities
+//
+
+template <auto a, auto b>
+CUTE_HOST_DEVICE void print(R<a,b>) {
+  print(C<a>{}); print("/"); print(C<b>{});
+}
+
+#if !defined(__CUDACC_RTC__)
+template <auto a, auto b>  // stream a static ratio as "_a/_b", the same text print(R<a,b>) emits
+CUTE_HOST std::ostream& operator<<(std::ostream& os, R<a,b>) {
+  return os << C<a>{} << "/" << C<b>{};  // C<v> streams its own "_" prefix, so none is added here
+}
+#endif
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/numeric/math.hpp b/lightllm-kernel/cutlass/include/cute/numeric/math.hpp
new file mode 100755
index 000000000..e493a3a95
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/numeric/math.hpp
@@ -0,0 +1,356 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>            // CUTE_HOST_DEVICE
+#include <cute/util/type_traits.hpp>  // __CUTE_REQUIRES
+
+#include <cutlass/fast_math.h>
+
+namespace cute
+{
+
+//
+// Common Operations
+//
+
+template <class T, class U,
+          __CUTE_REQUIRES(is_arithmetic<T>::value &&
+                          is_arithmetic<U>::value)>
+CUTE_HOST_DEVICE constexpr
+auto  // deduced from the conditional expression, i.e. the common type of T and U
+max(T const& t, U const& u) {
+  return t < u ? u : t;  // when t < u is false (ties included) the first argument is returned
+}
+
+template <class T, class U,
+          __CUTE_REQUIRES(is_arithmetic<T>::value &&
+                          is_arithmetic<U>::value)>
+CUTE_HOST_DEVICE constexpr
+auto  // deduced from the conditional expression, i.e. the common type of T and U
+min(T const& t, U const& u) {
+  return t < u ? t : u;  // when t < u is false (ties included) the second argument is returned
+}
+
+template <class T,
+          __CUTE_REQUIRES(is_arithmetic<T>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+abs(T const& t) {
+  if constexpr (is_signed<T>::value) {
+    return t < T(0) ? -t : t;  // for a signed integer T at least as wide as int, -t overflows at the minimum value
+  } else {
+    return t;                  // unsigned values are returned unchanged
+  }
+  // Both constexpr branches return, so control never reaches the marker below.
+  CUTE_GCC_UNREACHABLE;
+}
+
+// Returns 1 if x > 0, -1 if x < 0, and 0 if x is zero (unsigned types can only yield 0 or 1).
+template <class T,
+          __CUTE_REQUIRES(is_arithmetic<T>::value)>
+CUTE_HOST_DEVICE constexpr
+int
+signum(T const& x) {
+  if constexpr (is_signed<T>::value) {
+    return (T(0) < x) - (x < T(0));  // each comparison yields 0 or 1; their difference is -1, 0, or +1
+  } else {
+    return T(0) < x;                 // the bool result converts to 0 or 1
+  }
+  // Both constexpr branches return, so control never reaches the marker below.
+  CUTE_GCC_UNREACHABLE;
+}
+
+//
+// C++17 <numeric> operations
+//
+
+// Greatest common divisor of two non-negative integers (Euclidean algorithm); gcd(0, u) == u and gcd(t, 0) == t
+template <class T, class U,
+          __CUTE_REQUIRES(is_std_integral<T>::value &&
+                          is_std_integral<U>::value)>
+CUTE_HOST_DEVICE constexpr
+cute::common_type_t<T, U>
+gcd(T t, U u) {
+  while (true) {  // t and u take turns being reduced modulo the other until one of them reaches zero
+    if (t == 0) { return u; }
+    u %= t;
+    if (u == 0) { return t; }
+    t %= u;
+  }
+}
+
+// Least common multiple of two non-negative integers; returns 0 if either argument is 0 (as std::lcm does)
+template <class T, class U,
+          __CUTE_REQUIRES(is_std_integral<T>::value &&
+                          is_std_integral<U>::value)>
+CUTE_HOST_DEVICE constexpr
+cute::common_type_t<T, U>
+lcm(T const& t, U const& u) {
+  return t == 0 ? cute::common_type_t<T, U>(0) : (t / gcd(t,u)) * u;  // guard: lcm(0,0) would otherwise divide by gcd(0,0) == 0
+}
+
+//
+// C++20 <bit> operations
+//
+
+// True exactly when x has one bit set, i.e. x is an integral power of two
+template <class T>
+CUTE_HOST_DEVICE constexpr
+bool
+has_single_bit(T x) {
+  return x == 0 ? false : (x & (x - 1)) == 0;  // x & (x - 1) clears the lowest set bit
+}
+
+// Smallest number of bits needed to represent the given value
+//   For x == 0, this is 0
+//   For x != 0, this is 1 + floor(log2(x))
+// bit_width( 0b0000 ) = 0
+// bit_width( 0b0001 ) = 1
+// bit_width( 0b0010 ) = 2
+// bit_width( 0b0011 ) = 2
+// bit_width( 0b0100 ) = 3
+// bit_width( 0b0101 ) = 3
+// bit_width( 0b0110 ) = 3
+// bit_width( 0b0111 ) = 3
+template <class T>
+CUTE_HOST_DEVICE constexpr
+int
+bit_width(T x) {
+  static_assert(is_unsigned<T>::value, "Only to be used for unsigned types.");
+  constexpr int N = (numeric_limits<T>::digits == 64 ? 6 :  // N = log2(number of value bits in T)
+                    (numeric_limits<T>::digits == 32 ? 5 :
+                    (numeric_limits<T>::digits == 16 ? 4 :
+                    (numeric_limits<T>::digits ==  8 ? 3 : (assert(false),0)))));  // other widths are unsupported
+  T r = 0;  // accumulates floor(log2(x)) one binary digit at a time
+  for (int i = N - 1; i >= 0; --i) {  // binary search over shift amounts 2^(N-1), ..., 2, 1
+    T shift = (x > ((T(1) << (T(1) << i))-1)) << i;  // 2^i if x does not fit in 2^i bits, otherwise 0
+    x >>= shift;  // drop the low bits that have just been accounted for
+    r  |= shift;
+  }
+  return r + (x != 0);  // x is now 0 or 1; a nonzero input contributes one final bit
+}
+
+// Smallest integral power of two not less than the given value
+// bit_ceil( 0b00000000 ) = 0b00000001
+// bit_ceil( 0b00000001 ) = 0b00000001
+// bit_ceil( 0b00000010 ) = 0b00000010
+// bit_ceil( 0b00000011 ) = 0b00000100
+// bit_ceil( 0b00000100 ) = 0b00000100
+// bit_ceil( 0b00000101 ) = 0b00001000
+// bit_ceil( 0b00000110 ) = 0b00001000
+// bit_ceil( 0b00000111 ) = 0b00001000
+// bit_ceil( 0b00001000 ) = 0b00001000
+// bit_ceil( 0b00001001 ) = 0b00010000
+template <class T>
+CUTE_HOST_DEVICE constexpr
+T
+bit_ceil(T x) {
+  return x == 0 ? T(1) : (T(1) << bit_width(T(x - 1)));  // cast back to T: x - 1 promotes to int for sub-int T, which bit_width rejects
+}
+
+// Largest integral power of two not greater than the given value
+// bit_floor( 0b00000000 ) = 0b00000000
+// bit_floor( 0b00000001 ) = 0b00000001
+// bit_floor( 0b00000010 ) = 0b00000010
+// bit_floor( 0b00000011 ) = 0b00000010
+// bit_floor( 0b00000100 ) = 0b00000100
+// bit_floor( 0b00000101 ) = 0b00000100
+// bit_floor( 0b00000110 ) = 0b00000100
+// bit_floor( 0b00000111 ) = 0b00000100
+// bit_floor( 0b00001000 ) = 0b00001000
+// bit_floor( 0b00001001 ) = 0b00001000
+template <class T>
+CUTE_HOST_DEVICE constexpr
+T
+bit_floor(T x) {
+  return x != 0 ? (T(1) << (bit_width(x) - 1)) : 0;  // keep only the most significant set bit; zero maps to zero
+}
+
+template <class T>
+CUTE_HOST_DEVICE constexpr T rotl(T x, int s);
+template <class T>
+CUTE_HOST_DEVICE constexpr T rotr(T x, int s);
+
+// Computes the result of circular bitwise left-rotation; s is reduced modulo the bit width (as in std::rotl)
+template <class T>
+CUTE_HOST_DEVICE constexpr
+T rotl(T x, int s) {
+  constexpr int N = numeric_limits<T>::digits;
+  int const r = s % N;  // |r| < N, so no shift below reaches the full width of T (which would be undefined)
+  return static_cast<T>(r == 0 ? x : r > 0 ? (x << r) | (x >> (N - r)) : rotr(x, -r));
+}
+
+// Computes the result of circular bitwise right-rotation; s is reduced modulo the bit width (as in std::rotr)
+template <class T>
+CUTE_HOST_DEVICE constexpr
+T rotr(T x, int s) {
+  constexpr int N = numeric_limits<T>::digits;
+  int const r = s % N;  // |r| < N, so no shift below reaches the full width of T (which would be undefined)
+  return static_cast<T>(r == 0 ? x : r > 0 ? (x >> r) | (x << (N - r)) : rotl(x, -r));
+}
+
+// Counts the number of consecutive 0 bits, starting from the most significant bit
+// countl_zero( 0b00000000 ) = 8
+// countl_zero( 0b11111111 ) = 0
+// countl_zero( 0b00011100 ) = 3
+template <class T>
+CUTE_HOST_DEVICE constexpr
+int
+countl_zero(T x) {
+  return numeric_limits<T>::digits - bit_width(x);  // every bit above the bit_width(x) significant bits is a leading zero
+}
+
+// Counts the number of consecutive 1 bits, starting from the most significant bit
+// countl_one( 0b00000000 ) = 0
+// countl_one( 0b11111111 ) = 8
+// countl_one( 0b11100011 ) = 3
+template <class T>
+CUTE_HOST_DEVICE constexpr
+int
+countl_one(T x) {
+  return countl_zero(T(~x));  // cast back to T: ~x promotes to int for sub-int T, which would pick the wrong width and sign
+}
+
+// Counts the number of consecutive 0 bits, starting from the least significant bit
+// countr_zero( 0b00000000 ) = 8
+// countr_zero( 0b11111111 ) = 0
+// countr_zero( 0b00011100 ) = 2
+template <class T>
+CUTE_HOST_DEVICE constexpr
+int
+countr_zero(T x) {
+  return x == 0 ? numeric_limits<T>::digits : bit_width(T(x & T(-x))) - 1;  // x & -x isolates the lowest set bit; its bit_width minus one is that bit's index
+}
+
+// Counts the number of consecutive 1 bits, starting from the least significant bit
+// countr_one( 0b00000000 ) = 0
+// countr_one( 0b11111111 ) = 8
+// countr_one( 0b11100011 ) = 2
+template <class T>
+CUTE_HOST_DEVICE constexpr
+int
+countr_one(T x) {
+  return countr_zero(T(~x));  // cast back to T: ~x promotes to int for sub-int T, which would pick the wrong width and sign
+}
+
+// Counts the number of 1 bits in an unsigned integer
+// popcount( 0b00000000 ) = 0
+// popcount( 0b11111111 ) = 8
+// popcount( 0b00011101 ) = 4
+template <class T>
+CUTE_HOST_DEVICE constexpr
+int
+popcount(T x) {
+  int bits_set = 0;
+  // Each pass clears the least significant set bit, so the loop runs once per 1 bit.
+  for (; x; x &= x - 1) {
+    ++bits_set;
+  }
+  return bits_set;
+}
+
+//
+// Custom operations
+//
+
+// Left-shifts x by s bits; a negative s shifts right by -s instead
+template <class T>
+CUTE_HOST_DEVICE constexpr
+auto
+shiftl(T x, int s) {
+  return s < 0 ? (x >> -s) : (x << s);
+}
+
+// Right-shifts x by s bits; a negative s shifts left by -s instead
+template <class T>
+CUTE_HOST_DEVICE constexpr
+auto
+shiftr(T x, int s) {
+  return s < 0 ? (x << -s) : (x >> s);
+}
+
+// Integer division for the case where u evenly divides t
+// @pre t % u == 0  (NOT verified here: the assert in the body is commented out)
+// @result t / u
+template <class T, class U,
+          __CUTE_REQUIRES(is_std_integral<T>::value &&
+                          is_std_integral<U>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+safe_div(T const& t, U const& u) {
+  //assert(t % u == 0);
+  return t / u;  // plain truncating division when the precondition is violated
+}
+
+/**
+ * floor(log2(x)) for an unsigned integer x > 0, computed as bit_width(x) - 1
+ */
+
+template <class T>
+CUTE_HOST_DEVICE constexpr
+int32_t
+log_2(T x) {
+  assert(x > 0);  // run-time check only; if it is compiled out, x == 0 yields -1
+  static_assert(is_unsigned<T>::value, "Only to be used for unsigned integral types.");
+  return static_cast<int32_t>(bit_width(x)) - 1;
+}
+
+template <class IntDiv, class IntMod>
+struct DivModReturnType {  // quotient/remainder pair returned by the divmod overloads
+  IntDiv div_;  // quotient
+  IntMod mod_;  // remainder
+  CUTE_HOST_DEVICE constexpr
+  DivModReturnType(IntDiv const& div, IntMod const& mod) : div_(div), mod_(mod) {}
+};
+
+// General divmod: returns {a / b, a % b} using the operand types' own operators
+template <class CInt0, class CInt1>
+CUTE_HOST_DEVICE constexpr
+auto
+divmod(CInt0 const& a, CInt1 const& b) {
+  return DivModReturnType{a / b, a % b};  // member types are deduced from the two expressions
+}
+
+// Specialized function with fastDivmod input: the divisor is a cutlass::FastDivmod object
+template <class CInt>
+CUTE_HOST_DEVICE constexpr
+auto
+divmod(CInt const& a, cutlass::FastDivmod const& b) {
+  using val_div_type = typename cutlass::FastDivmod::value_div_type;
+  using val_mod_type = typename cutlass::FastDivmod::value_mod_type;
+  val_div_type div = 0;
+  val_mod_type mod = 0;
+  b(div, mod, a);  // presumably writes the quotient and remainder of a into div and mod -- confirm against FastDivmod
+  return DivModReturnType{div, mod};
+}
+
+} // namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/numeric/numeric_types.hpp b/lightllm-kernel/cutlass/include/cute/numeric/numeric_types.hpp
new file mode 100755
index 000000000..07444331f
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/numeric/numeric_types.hpp
@@ -0,0 +1,135 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>          // CUTE_HOST_DEVICE
+#include <cute/numeric/int.hpp>     // cute::int2_t, cute::int4_t, etc
+
+#include <cutlass/numeric_size.h>   // cutlass::sizeof_bits
+#include <cutlass/numeric_types.h>  // cutlass::float_e4m3_t, cutlass::float_e5m2_t, etc
+
+namespace cute {
+
+template <typename T>
+struct sizeof_bits : public cutlass::sizeof_bits<T> {};
+
+// DO NOT change auto to int: sizeof_bits<sparse_elem> uses integral_ratio instead of int.
+template <class T>
+static constexpr auto sizeof_bits_v = sizeof_bits<T>::value;
+
+using cutlass::bits_to_bytes;
+
+using cutlass::is_subbyte;
+
+template <class T>
+static constexpr auto is_subbyte_v = is_subbyte<T>::value;
+
+using cutlass::half_t;
+using cutlass::bfloat16_t;
+
+using cutlass::tfloat32_t;
+
+// Umbrella floating-point 8-bit data type : type_erased_dynamic_float8_t
+// This umbrella datatype can be enabled when a user provides a specific
+// datatype in runtime argument list.
+using cutlass::type_erased_dynamic_float8_t;
+using cutlass::float_e4m3_t;
+using cutlass::float_e5m2_t;
+
+using cutlass::uint1b_t;
+using cutlass::int2b_t;
+using cutlass::uint2b_t;
+using cutlass::int4b_t;
+using cutlass::uint4b_t;
+using cutlass::bin1_t;
+
+
+//
+// Print utility
+//
+
+CUTE_HOST_DEVICE
+void
+print(half_t a) {
+  printf("%f", static_cast<float>(a));
+}
+
+CUTE_HOST_DEVICE
+void
+print(bfloat16_t a) {
+  printf("%f", static_cast<float>(a));
+}
+
+
+CUTE_HOST_DEVICE
+void
+print(tfloat32_t a) {
+  printf("%f", static_cast<float>(a));
+}
+
+CUTE_HOST_DEVICE
+void
+print(float_e4m3_t a) {
+  printf("%f", static_cast<float>(a));
+}
+
+CUTE_HOST_DEVICE
+void
+print(float_e5m2_t a) {
+  printf("%f", static_cast<float>(a));
+}
+
+CUTE_HOST_DEVICE void
+pretty_print(bfloat16_t v) {
+  printf("%*.2f", 8, float(v));
+}
+
+CUTE_HOST_DEVICE void
+pretty_print(half_t v) {
+  printf("%*.2f", 8, float(v));
+}
+
+CUTE_HOST_DEVICE void
+pretty_print(tfloat32_t v) {
+  printf("%*.2e", 10, static_cast<float>(v));
+}
+
+CUTE_HOST_DEVICE void
+pretty_print(float_e4m3_t t) {
+  printf("%*.2f", 8, static_cast<float>(t));
+}
+
+CUTE_HOST_DEVICE void
+pretty_print(float_e5m2_t t) {
+  printf("%*.2f", 8, static_cast<float>(t));
+}
+
+} // namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/numeric/real.hpp b/lightllm-kernel/cutlass/include/cute/numeric/real.hpp
new file mode 100755
index 000000000..4ce58dfa1
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/numeric/real.hpp
@@ -0,0 +1,74 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>
+
+namespace cute
+{
+
+/// Generic add: c = a + b, using whatever operator+ and assignment the operand types provide
+template <class C, class A, class B>
+CUTE_HOST_DEVICE constexpr
+void
+add(C& c, A const& a, B const& b)
+{
+  c = a + b;
+}
+
+/// Generic multiply: c = a * b, using whatever operator* and assignment the operand types provide
+template <class C, class A, class B>
+CUTE_HOST_DEVICE constexpr
+void
+mul(C& c, A const& a, B const& b)
+{
+  c = a * b;
+}
+
+/// Generic fused multiply-add: d = a * b + c, written as separate operator* and operator+ calls
+template <class D, class A, class B, class C>
+CUTE_HOST_DEVICE constexpr
+void
+fma(D& d, A const& a, B const& b, C const& c)
+{
+  d = a * b + c;
+}
+
+/// In-place multiply-accumulate for triplets: c = a * b + c
+template <class A, class B, class C>
+CUTE_HOST_DEVICE constexpr
+void
+fma(A const& a, B const& b, C& c)
+{
+  fma(c, a, b, c);  // four-argument call, with c as both destination and addend
+}
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/pointer.hpp b/lightllm-kernel/cutlass/include/cute/pointer.hpp
new file mode 100755
index 000000000..4cfa129cc
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/pointer.hpp
@@ -0,0 +1,322 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>                     // CUTE_HOST_DEVICE
+#include <cute/pointer_base.hpp>               // cute::iter_adaptor
+#include <cute/pointer_sparse.hpp>
+#include <cute/container/array_subbyte.hpp>    // cute::subbyte_iterator
+#include <cute/numeric/integral_constant.hpp>  // cute::true_type, cute::false_type
+#include <cute/numeric/numeric_types.hpp>      // sizeof_bits
+
+namespace cute
+{
+
+//
+// recast_ptr<T> -- Create an iterator over values of type T.
+// For most types this will simply be T*, but certain types require more care.
+// Subbyte Types: uint2_t, uint4_t, etc
+//   Requires construction of a subbyte_iterator<T> in order to properly
+//   resolve each element in byte-addressed memory.
+// Sparse Types: sparse_elem<int S, class T>
+//   A type that holds one physical element meant to represent S number of logical elements.
+//   Requires construction of a sparse_ptr that emulates access to the S logical elements.
+//
+
+template <class NewT>
+CUTE_HOST_DEVICE constexpr
+auto
+recast_ptr(void* ptr)
+{
+  if constexpr (is_sparse<NewT>::value) {  // sparse element types are checked first
+    constexpr int sparsity = NewT::sparsity;
+    NewT* p = reinterpret_cast<NewT*>(ptr);
+    return make_sparse_ptr<sparsity>(p);
+  } else
+  if constexpr (cute::is_subbyte_v<NewT>) {  // sub-byte types get a subbyte_iterator instead of a raw pointer
+    return subbyte_iterator<NewT>(ptr);
+  } else {
+    return reinterpret_cast<NewT*>(ptr);  // every other type: a plain NewT*
+  }
+  CUTE_GCC_UNREACHABLE;  // every constexpr branch above returns
+}
+
+template <class NewT>
+CUTE_HOST_DEVICE constexpr
+auto
+recast_ptr(void const* ptr)
+{
+  if constexpr (is_sparse<NewT>::value) {  // sparse element types are checked first
+    constexpr int sparsity = NewT::sparsity;
+    NewT const* p = reinterpret_cast<NewT const*>(ptr);
+    return make_sparse_ptr<sparsity>(p);
+  } else
+  if constexpr (cute::is_subbyte_v<NewT>) {  // sub-byte types get a subbyte_iterator over const elements
+    return subbyte_iterator<NewT const>(ptr);
+  } else {
+    return reinterpret_cast<NewT const*>(ptr);  // every other type: a plain NewT const*
+  }
+  CUTE_GCC_UNREACHABLE;  // every constexpr branch above returns
+}
+
+// Disambiguate nullptr
+template <class NewT>
+CUTE_HOST_DEVICE constexpr
+auto
+recast_ptr(decltype(nullptr)) {   // nullptr_t
+  return recast_ptr<NewT>(static_cast<NewT*>(nullptr));
+}
+
+//
+// gmem_ptr
+//
+
+template <class P>
+struct gmem_ptr : iter_adaptor<P, gmem_ptr<P>> {
+  using iter_adaptor<P, gmem_ptr<P>>::iter_adaptor;
+};
+
+template <class T, class = void>
+struct is_gmem : false_type {};
+template <class P>                     // Found the gmem
+struct is_gmem<gmem_ptr<P>> : true_type {};
+template <class P>                     // Recurse on ::iterator, if possible
+struct is_gmem<P, void_t<typename P::iterator>> : is_gmem<typename P::iterator> {};
+template <class P>
+constexpr bool is_gmem_v = is_gmem<P>::value;
+
+// Tags an iterator as gmem; an iterator that is already gmem-tagged is returned unchanged
+template <class Iterator>
+CUTE_HOST_DEVICE constexpr
+auto
+make_gmem_ptr(Iterator iter) {
+  if constexpr (!is_gmem<Iterator>::value) {
+    return gmem_ptr<Iterator>{iter};
+  } else {
+    return iter;
+  }
+  CUTE_GCC_UNREACHABLE;
+}
+
+// Explicitly typed construction from a raw pointer
+template <class T>
+CUTE_HOST_DEVICE constexpr
+auto
+make_gmem_ptr(void* ptr) {
+  return make_gmem_ptr(recast_ptr<T>(ptr));
+}
+
+// Explicitly typed construction from a raw pointer
+template <class T>
+CUTE_HOST_DEVICE constexpr
+auto
+make_gmem_ptr(void const* ptr) {
+  return make_gmem_ptr(recast_ptr<T const>(ptr));
+}
+
+// nullptr_t overload for make_gmem_ptr<float>(nullptr) disambiguation
+template <class T>
+CUTE_HOST_DEVICE constexpr
+auto
+make_gmem_ptr(decltype(nullptr)) { // nullptr_t
+  return make_gmem_ptr(recast_ptr<T>(nullptr));
+}
+
+// The gmem tag is invariant over type-recast
+template <class NewT, class P>
+CUTE_HOST_DEVICE constexpr
+auto
+recast_ptr(gmem_ptr<P> const& ptr) {
+  return make_gmem_ptr(recast_ptr<NewT>(ptr.get()));
+}
+
+//
+// smem_ptr
+//
+
+template <class P>
+struct smem_ptr : iter_adaptor<P, smem_ptr<P>> {
+  using iter_adaptor<P, smem_ptr<P>>::iter_adaptor;
+};
+
+template <class T, class = void>
+struct is_smem : false_type {};
+template <class P>                     // Found the smem
+struct is_smem<smem_ptr<P>> : true_type {};
+template <class P>                     // Recurse on ::iterator, if possible
+struct is_smem<P, void_t<typename P::iterator>> : is_smem<typename P::iterator> {};
+template <class P>
+constexpr bool is_smem_v = is_smem<P>::value;
+
+// Tags an iterator as smem; an iterator that is already smem-tagged is returned unchanged
+template <class Iterator>
+CUTE_HOST_DEVICE constexpr
+auto
+make_smem_ptr(Iterator iter) {
+  if constexpr (!is_smem<Iterator>::value) {
+    return smem_ptr<Iterator>{iter};
+  } else {
+    return iter;
+  }
+  CUTE_GCC_UNREACHABLE;
+}
+
+// Make a smem swizzle pointer, common operation
+template <class Iterator, class Swizzle>
+CUTE_HOST_DEVICE constexpr
+auto
+make_smem_ptr(Iterator ptr, Swizzle sw)
+{
+  return make_swizzle_ptr(make_smem_ptr(ptr), sw);
+}
+
+// Explicitly typed construction from a raw pointer
+template <class T>
+CUTE_HOST_DEVICE constexpr
+auto
+make_smem_ptr(void* ptr) {
+  return make_smem_ptr(recast_ptr<T>(ptr));
+}
+
+// Explicitly typed construction from a raw pointer
+template <class T>
+CUTE_HOST_DEVICE constexpr
+auto
+make_smem_ptr(void const* ptr) {
+  return make_smem_ptr(recast_ptr<T const>(ptr));
+}
+
+// The smem tag is invariant over type-recast
+template <class NewT, class P>
+CUTE_HOST_DEVICE constexpr
+auto
+recast_ptr(smem_ptr<P> const& ptr) {
+  return make_smem_ptr(recast_ptr<NewT>(ptr.get()));
+}
+
+//
+// rmem_ptr
+//
+
+template <class P>
+struct rmem_ptr : iter_adaptor<P, rmem_ptr<P>> {
+  using iter_adaptor<P, rmem_ptr<P>>::iter_adaptor;
+};
+
+// Anything that is not gmem or smem is rmem
+template <class T, class = void>
+struct is_rmem : bool_constant<not (is_gmem<T>::value || is_smem<T>::value)> {};
+template <class P>
+struct is_rmem<rmem_ptr<P>> : true_type {};
+template <class P>
+constexpr bool is_rmem_v = is_rmem<P>::value;
+
+// Tags an iterator as rmem; anything is_rmem already accepts (not gmem and not smem) is returned unchanged
+template <class Iterator>
+CUTE_HOST_DEVICE constexpr
+auto
+make_rmem_ptr(Iterator iter) {
+  if constexpr (!is_rmem<Iterator>::value) {
+    return rmem_ptr<Iterator>{iter};
+  } else {
+    return iter;
+  }
+  CUTE_GCC_UNREACHABLE;
+}
+
+// Explicitly typed construction from a raw pointer
+template <class T>
+CUTE_HOST_DEVICE constexpr
+auto
+make_rmem_ptr(void* ptr) {
+  return make_rmem_ptr(recast_ptr<T>(ptr));
+}
+
+// Explicitly typed construction from a raw pointer
+template <class T>
+CUTE_HOST_DEVICE constexpr
+auto
+make_rmem_ptr(void const* ptr) {
+  return make_rmem_ptr(recast_ptr<T const>(ptr));
+}
+
+// The rmem tag is invariant over type-recast
+template <class NewT, class P>
+CUTE_HOST_DEVICE constexpr
+auto
+recast_ptr(rmem_ptr<P> const& ptr) {
+  return make_rmem_ptr(recast_ptr<NewT>(ptr.get()));
+}
+
+//
+// Display utilities
+//
+
+template <class T>
+CUTE_HOST_DEVICE void print(gmem_ptr<T> ptr)
+{
+  printf("gmem_"); print(ptr.get());
+}
+
+template <class T>
+CUTE_HOST_DEVICE void print(smem_ptr<T> ptr)
+{
+  printf("smem_"); print(ptr.get());
+}
+
+template <class T>
+CUTE_HOST_DEVICE void print(rmem_ptr<T> ptr)
+{
+  printf("rmem_"); print(ptr.get());
+}
+
+#if !defined(__CUDACC_RTC__)
+template <class T>
+CUTE_HOST std::ostream& operator<<(std::ostream& os, gmem_ptr<T> ptr)
+{
+  return os << "gmem_[" << int(sizeof_bits<iter_value_t<T>>::value) << "b]";
+}
+
+template <class T>
+CUTE_HOST std::ostream& operator<<(std::ostream& os, smem_ptr<T> ptr)
+{
+  return os << "smem_[" << int(sizeof_bits<iter_value_t<T>>::value) << "b]";
+}
+
+template <class T>
+CUTE_HOST std::ostream& operator<<(std::ostream& os, rmem_ptr<T> ptr)
+{
+  return os << "rmem_[" << int(sizeof_bits<iter_value_t<T>>::value) << "b]";
+}
+
+#endif // !defined(__CUDACC_RTC__)
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/pointer_base.hpp b/lightllm-kernel/cutlass/include/cute/pointer_base.hpp
new file mode 100755
index 000000000..90ca0ceb6
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/pointer_base.hpp
@@ -0,0 +1,246 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>                 // CUTE_HOST_DEVICE
+#include <cute/numeric/numeric_types.hpp>  // cute::sizeof_bits
+#include <cute/util/type_traits.hpp>       // cute::declval, cute::void_t, etc
+
+namespace cute
+{
+
+//
+// C++20 <iterator> iterator_traits
+//
+
+namespace detail {
+// Default reference type of an iterator: whatever dereferencing an lvalue of type T yields
+template <class T, class = void>
+struct iter_ref { using type = decltype(*declval<T&>()); };
+// Prefer to propagate ::reference when T declares that member type
+template <class T>
+struct iter_ref<T,void_t<typename T::reference>> { using type = typename T::reference; };
+} // end namespace detail
+
+template <class T>
+using iter_reference = detail::iter_ref<T>;  // trait struct; the result lives in ::type
+template <class T>
+using iter_reference_t = typename iter_reference<T>::type;  // shorthand for the resulting type
+
+namespace detail {
+// Default element_type of an iterator: the iter_ref type with its reference removed
+template <class T, class = void>
+struct iter_e { using type = remove_reference_t<typename iter_ref<T>::type>; };
+// Prefer to propagate ::element_type when T declares that member type
+template <class T>
+struct iter_e<T,void_t<typename T::element_type>> { using type = typename T::element_type; };
+} // end namespace detail
+
+template <class T>
+using iter_element = detail::iter_e<T>;  // trait struct; the result lives in ::type
+template <class T>
+using iter_element_t = typename iter_element<T>::type;  // shorthand for the resulting type
+
+namespace detail {
+// Default value_type of an iterator: the iter_e type with cv-qualifiers removed
+template <class T, class = void>
+struct iter_v { using type = remove_cv_t<typename iter_e<T>::type>; };
+// Prefer to propagate ::value_type when T declares that member type
+template <class T>
+struct iter_v<T,void_t<typename T::value_type>> { using type = typename T::value_type; };
+} // end namespace detail
+
+template <class T>
+using iter_value = detail::iter_v<T>;  // trait struct; the result lives in ::type
+template <class T>
+using iter_value_t = typename iter_value<T>::type;  // shorthand for the resulting type
+
+template <class Iterator>
+struct iterator_traits {  // bundles the three iter_*_t traits for one Iterator type
+  using reference    = iter_reference_t<Iterator>;  // T::reference if declared, else decltype(*it)
+  using element_type = iter_element_t<Iterator>;    // T::element_type if declared, else reference minus the reference
+  using value_type   = iter_value_t<Iterator>;      // T::value_type if declared, else element_type minus cv
+};
+
+//
+// has_dereference: detects whether *t is well-formed for an lvalue t of type T (used as the iterator-concept test)
+//
+
+namespace detail {
+template <class T, class = void>
+struct has_dereference : CUTE_STL_NAMESPACE::false_type {};  // fallback: unary * is not usable on T&
+template <class T>
+struct has_dereference<T, void_t<decltype(*declval<T&>())>> : CUTE_STL_NAMESPACE::true_type {};  // selected when *t compiles
+} // end namespace detail
+
+template <class T>
+using has_dereference = detail::has_dereference<T>;  // public alias of the detail trait
+
+//
+// raw_pointer_cast: identity overload for raw pointers; the wrapper overloads recurse until they reach it
+//
+
+template <class T>
+CUTE_HOST_DEVICE constexpr
+T*
+raw_pointer_cast(T* ptr) {
+  return ptr;
+}
+
+//
+// A very simplified iterator adaptor.
+// Derived classes may override methods, but be careful to reproduce interfaces exactly.
+// Clients should never have an instance of this class. Do not write methods that take this as a param.
+//
+
+template <class Iterator, class DerivedType>
+struct iter_adaptor
+{
+  using iterator     = Iterator;
+  using reference    = typename iterator_traits<iterator>::reference;
+  using element_type = typename iterator_traits<iterator>::element_type;
+  using value_type   = typename iterator_traits<iterator>::value_type;
+
+  iterator ptr_;  // the wrapped iterator
+
+  CUTE_HOST_DEVICE constexpr
+  iter_adaptor(iterator ptr = {}) : ptr_(ptr) {}  // defaults to a value-initialized wrapped iterator
+
+  CUTE_HOST_DEVICE constexpr
+  reference operator*() const { return *ptr_; }  // forwards dereference to the wrapped iterator
+
+  template <class Index>
+  CUTE_HOST_DEVICE constexpr
+  reference operator[](Index const& i) const { return ptr_[i]; }  // forwards indexing to the wrapped iterator
+
+  template <class Index>
+  CUTE_HOST_DEVICE constexpr
+  DerivedType operator+(Index const& i) const { return {ptr_ + i}; }  // offsetting yields DerivedType, not iter_adaptor
+
+  CUTE_HOST_DEVICE constexpr
+  iterator get() const { return ptr_; }  // returns a copy of the wrapped iterator
+
+  CUTE_HOST_DEVICE constexpr  // comparisons below take DerivedType and compare the wrapped iterators
+  friend bool operator==(DerivedType const& x, DerivedType const& y) { return x.ptr_ == y.ptr_; }
+  CUTE_HOST_DEVICE constexpr
+  friend bool operator!=(DerivedType const& x, DerivedType const& y) { return x.ptr_ != y.ptr_; }
+  CUTE_HOST_DEVICE constexpr
+  friend bool operator< (DerivedType const& x, DerivedType const& y) { return x.ptr_ <  y.ptr_; }
+  CUTE_HOST_DEVICE constexpr
+  friend bool operator<=(DerivedType const& x, DerivedType const& y) { return x.ptr_ <= y.ptr_; }
+  CUTE_HOST_DEVICE constexpr
+  friend bool operator> (DerivedType const& x, DerivedType const& y) { return x.ptr_ >  y.ptr_; }
+  CUTE_HOST_DEVICE constexpr
+  friend bool operator>=(DerivedType const& x, DerivedType const& y) { return x.ptr_ >= y.ptr_; }
+};
+
+template <class I, class D>
+CUTE_HOST_DEVICE constexpr
+auto
+raw_pointer_cast(iter_adaptor<I,D> const& x) {  // unwraps one adaptor level, then recurses on the wrapped iterator
+  return raw_pointer_cast(x.ptr_);
+}
+
+//
+// counting iterator -- quick and dirty: an iterator over integers whose dereferenced value is its own position n_
+//
+
+template <class T = int>
+struct counting_iterator
+{
+  using index_type = T;
+  using value_type = T;
+  using reference  = T;  // dereference returns by value; there is no backing storage
+
+  index_type n_;  // current position, which is also the value produced by operator*
+
+  CUTE_HOST_DEVICE constexpr
+  counting_iterator(index_type n = 0) : n_(n) {}
+
+  CUTE_HOST_DEVICE constexpr
+  index_type operator*() const { return n_; }
+
+  CUTE_HOST_DEVICE constexpr
+  index_type operator[](index_type i) const { return n_ + i; }  // value at offset i from the current position
+
+  CUTE_HOST_DEVICE constexpr
+  counting_iterator operator+(index_type i) const { return {n_ + i}; }
+  CUTE_HOST_DEVICE constexpr
+  counting_iterator& operator++() { ++n_; return *this; }
+  CUTE_HOST_DEVICE constexpr
+  counting_iterator operator++(int) { counting_iterator ret = *this; ++n_; return ret; }  // post-increment returns the old position
+
+  CUTE_HOST_DEVICE constexpr  // comparisons below order iterators by their n_ values
+  friend bool operator==(counting_iterator const& x, counting_iterator const& y) { return x.n_ == y.n_; }
+  CUTE_HOST_DEVICE constexpr
+  friend bool operator!=(counting_iterator const& x, counting_iterator const& y) { return x.n_ != y.n_; }
+  CUTE_HOST_DEVICE constexpr
+  friend bool operator< (counting_iterator const& x, counting_iterator const& y) { return x.n_ <  y.n_; }
+  CUTE_HOST_DEVICE constexpr
+  friend bool operator<=(counting_iterator const& x, counting_iterator const& y) { return x.n_ <= y.n_; }
+  CUTE_HOST_DEVICE constexpr
+  friend bool operator> (counting_iterator const& x, counting_iterator const& y) { return x.n_ >  y.n_; }
+  CUTE_HOST_DEVICE constexpr
+  friend bool operator>=(counting_iterator const& x, counting_iterator const& y) { return x.n_ >= y.n_; }
+};
+
+template <class T>
+CUTE_HOST_DEVICE constexpr
+T
+raw_pointer_cast(counting_iterator<T> const& x) {  // a counting_iterator "decays" to its integer position
+  return x.n_;
+}
+
+//
+// Display utilities
+//
+
+template <class T>
+CUTE_HOST_DEVICE void print(T const* const ptr)
+{  // prints "ptr[<bits>b](<address>)"; %p requires a pointer to void, so cast explicitly (cv-preserving)
+  printf("ptr["); print(sizeof_bits<T>::value); printf("b](%p)", static_cast<void const volatile*>(ptr));
+}
+
+template <class T>
+CUTE_HOST_DEVICE void print(counting_iterator<T> ptr)
+{  // prints "counting_iter(<n_>)", delegating the position to print()
+  printf("counting_iter("); print(ptr.n_); printf(")");
+}
+
+#if !defined(__CUDACC_RTC__)  // the std::ostream overload is compiled out when __CUDACC_RTC__ is defined
+template <class T>
+CUTE_HOST std::ostream& operator<<(std::ostream& os, counting_iterator<T> ptr)
+{  // streams "counting_iter(<n_>)"
+  return os << "counting_iter(" << ptr.n_ << ")";
+}
+#endif // !defined(__CUDACC_RTC__)
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/pointer_flagged.hpp b/lightllm-kernel/cutlass/include/cute/pointer_flagged.hpp
new file mode 100755
index 000000000..eb8d7e452
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/pointer_flagged.hpp
@@ -0,0 +1,199 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>                     // CUTE_HOST_DEVICE
+#include <cute/layout_composed.hpp>            // cute::ComposedLayout
+#include <cute/pointer.hpp>                    // cute::make_smem_ptr
+#include <cute/pointer_sparse.hpp>             // cute::is_sparse
+#include <cute/pointer_swizzle.hpp>            // cute::make_swizzle_ptr
+#include <cute/arch/util.hpp>                  // cute::cast_smem_ptr_to_uint
+#include <cute/numeric/integral_constant.hpp>  // cute::Int
+
+namespace cute
+{
+
+//
+// Stand-in Swizzle Layout
+//   A model of a nullptr smem_ptr<T> with B == sizeof_bits<T>::value
+//   that represents an unset pointer. This is a placeholder type that is waiting for an smem_ptr
+//
+
+template <int Bits>
+struct smem_ptr_flag_bits : Int<0> {};  // derives from Int<0>; Bits is carried only in the type
+
+using smem_ptr_flag = smem_ptr_flag_bits<1>;  // the flag with Bits == 1
+
+// A flagged construction method to transform ComposedLayout
+// Make a swizzle pointer tensor and check that the intended type size matches
+template <class Iterator, class SwizzleFn, int B, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+make_tensor(Iterator const& ptr,
+            ComposedLayout<SwizzleFn,smem_ptr_flag_bits<B>,Layout> const& layout)
+{
+  static_assert(is_smem<Iterator>::value, "Expected smem.");
+  static_assert(B == sizeof_bits<iter_value_t<Iterator>>::value, "Expected a B-bit pointer type.");  // flag width must match the element width
+  return make_tensor(make_smem_ptr(ptr.get(), layout.layout_a()),  // layout_a() (the SwizzleFn slot) moves onto the pointer
+                     layout.layout_b());                           // layout_b() becomes the tensor's layout
+}
+
+// NOTE: To preserve smem_ptr_flag_bits under recast ops -- the flag's bit-width is scaled to B*N alongside layout_b
+template <int N, class SwizzleFn, int B, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+upcast(ComposedLayout<SwizzleFn,smem_ptr_flag_bits<B>,Layout> const& layout)
+{
+  return composition(layout.layout_a(), smem_ptr_flag_bits<B*N>{}, upcast<N>(layout.layout_b()));
+}
+
+template <int N, class SwizzleFn, int B, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+downcast(ComposedLayout<SwizzleFn,smem_ptr_flag_bits<B>,Layout> const& layout)
+{  // counterpart of upcast: the flag's bit-width becomes B/N (integer division) alongside downcast<N>(layout_b)
+  return composition(layout.layout_a(), smem_ptr_flag_bits<B/N>{}, downcast<N>(layout.layout_b()));
+}
+
+//
+// Conversion with swizzle_layout: replace the smem_ptr_flag_bits<B> slot with a plain Int<0> offset
+//
+
+template <class SwizzleFn, int B, class Layout>
+CUTE_HOST_DEVICE
+auto
+as_position_independent_swizzle_layout(ComposedLayout<SwizzleFn,smem_ptr_flag_bits<B>,Layout> const& layout)
+{  // layout_a() is recast from uint8_t units to uint_bit_t<B> units before being recomposed with layout_b()
+  return composition(recast_layout<uint8_t,uint_bit_t<B>>(layout.layout_a()), Int<0>{}, layout.layout_b());
+}
+
+template <class Tensor>
+CUTE_HOST_DEVICE
+auto
+as_position_independent_swizzle_tensor(Tensor&& tensor)
+{
+  static_assert(is_smem<remove_cvref_t<Tensor>>::value, "Expected smem tensor.");
+  using SwizzleFn = get_swizzle_t<remove_cvref_t<Tensor>>;
+  if constexpr (SwizzleFn::num_bits == 0) {
+    return tensor;  // the swizzle has zero bits: hand the tensor back unchanged
+  } else {
+#if !defined(NDEBUG)
+    {
+    uint32_t address = cast_smem_ptr_to_uint(raw_pointer_cast(static_cast<Tensor&&>(tensor).data()));
+    uint32_t mask    = ((uint32_t(1) << SwizzleFn::num_base) - 1) | SwizzleFn::swizzle_code;
+    assert((address & mask) == 0);  // Alignment to the Base, Z, and Y of Swizzle (debug builds only)
+    }
+#endif
+    using T = typename remove_cvref_t<Tensor>::value_type;
+    // Recast swizzle from acting on byte-addressed pointers to acting on elements of type T
+    auto new_swizzle = recast_layout<uint8_t, T>(SwizzleFn{});
+    // Strip off every pointer wrapper via raw_pointer_cast and create a new smem_ptr for type T
+    auto new_ptr = make_smem_ptr<T>(raw_pointer_cast(static_cast<Tensor&&>(tensor).data()));
+    return make_tensor(new_ptr, composition(new_swizzle, Int<0>{}, tensor.layout()));
+  }
+  CUTE_GCC_UNREACHABLE;
+}
+
+// A model of a nullptr sparse_ptr<S, smem_ptr<T>> with B == sizeof_bits<T>::value
+// that represents an unset pointer. This is a placeholder type that is waiting for an smem_ptr
+template <int Sparsity, int Bits>
+struct smem_sparse_ptr_flag_bits : Int<0> {};  // derives from Int<0>; Sparsity and Bits are carried only in the type
+
+template <int Sparsity>
+using smem_sparse_ptr_flag = smem_sparse_ptr_flag_bits<Sparsity, 1>;  // the flag with Bits == 1
+
+// A flagged construction method to transform ComposedLayout (sparse variant)
+// Make a swizzle pointer tensor and check that the iterator's sparsity and raw element size match the flag
+template <class Iterator, class SwizzleFn, int S, int B, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+make_tensor(Iterator const& ptr,
+            ComposedLayout<SwizzleFn,smem_sparse_ptr_flag_bits<S,B>,Layout> const& layout)
+{
+  static_assert(is_smem<Iterator>::value, "Expected smem.");
+  static_assert(is_sparse_ptr<Iterator>::value, "Expected sparse iter");
+  static_assert(is_sparse<iter_value_t<Iterator>>::value, "Expected sparse elem");
+  static_assert(S == iter_value_t<Iterator>::sparsity, "Expected sparsity S");
+  static_assert(B == sizeof_bits<typename iter_value_t<Iterator>::raw_type>::value, "Expected B-bit pointer type");  // B is checked against raw_type, not the sparse element
+  return make_tensor(make_swizzle_ptr(ptr, layout.layout_a()), layout.layout_b());  // ptr is wrapped as-is (no .get() unwrapping here)
+}
+
+// NOTE: Recasting a smem_sparse_ptr_flag_bits layout is deliberately unsupported; instantiating this overload fails to compile
+template <int N, class SwizzleFn, int S, int B, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+upcast(ComposedLayout<SwizzleFn,smem_sparse_ptr_flag_bits<S,B>,Layout> const& layout)
+{
+  static_assert(dependent_false<SwizzleFn>, "Not implemented for safety");
+}
+
+template <int N, class SwizzleFn, int S, int B, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+downcast(ComposedLayout<SwizzleFn,smem_sparse_ptr_flag_bits<S,B>,Layout> const& layout)
+{  // deliberately unsupported for sparse flagged layouts: any instantiation trips the static_assert below
+  static_assert(dependent_false<SwizzleFn>, "Not implemented for safety");
+}
+
+//
+// Display utilities
+//
+
+// Capture smem_ptr_flag layouts and convert them to offset-0 (position-independent) layouts before printing
+template <class SwizzleFn, int B, class Layout>
+CUTE_HOST_DEVICE
+void
+print_layout(ComposedLayout<SwizzleFn,smem_ptr_flag_bits<B>,Layout> const& layout)
+{
+  print_layout(as_position_independent_swizzle_layout(layout));
+}
+
+template <class SwizzleFn, int B, class Layout>
+CUTE_HOST_DEVICE
+void
+print_latex(ComposedLayout<SwizzleFn,smem_ptr_flag_bits<B>,Layout> const& layout)
+{  // same conversion as print_layout above, forwarded to print_latex
+  print_latex(as_position_independent_swizzle_layout(layout));
+}
+
+template <int B>
+CUTE_HOST_DEVICE void print(smem_ptr_flag_bits<B>)
+{  // prints "smem_ptr[<B>b](unset)"; the argument's value is never read, so the parameter is left unnamed
+  printf("smem_ptr[%db](unset)", B);
+}
+
+template <int S, int B>
+CUTE_HOST_DEVICE void print(smem_sparse_ptr_flag_bits<S,B>)
+{  // prints "smem_sparse<S>_ptr[<B>b](unset)" from the template arguments alone
+  printf("smem_sparse<%d>_ptr[%db](unset)", S, B);
+}
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/pointer_sparse.hpp b/lightllm-kernel/cutlass/include/cute/pointer_sparse.hpp
new file mode 100755
index 000000000..ccae45865
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/pointer_sparse.hpp
@@ -0,0 +1,172 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include <cute/config.hpp>                     // CUTE_HOST_DEVICE
+#include <cute/pointer_base.hpp>               // cute::iter_adaptor
+#include <cute/numeric/integral_constant.hpp>  // cute::false_type, cute::true_type
+#include <cute/numeric/integral_ratio.hpp>     // cute::ratio
+
+namespace cute
+{
+
+// A data type that holds one physical element meant to represent Sparsity number of logical elements
+// This class is purposely not compatible with anything -- know what you're doing if you attempt to use it
+template <int Sparsity, class T>
+struct sparse_elem
+{
+  static constexpr int sparsity = Sparsity;
+  using raw_type = T;  // the physical storage type
+  T elem_;             // the single stored physical element
+
+  CUTE_HOST_DEVICE constexpr
+  explicit sparse_elem(T const& elem = {}) : elem_(elem) {}  // explicit: no implicit conversion from T
+
+  CUTE_HOST_DEVICE constexpr friend bool operator==(sparse_elem const& a, sparse_elem const& b) { return a.elem_ == b.elem_; }  // comparisons forward to elem_
+  CUTE_HOST_DEVICE constexpr friend bool operator!=(sparse_elem const& a, sparse_elem const& b) { return a.elem_ != b.elem_; }
+  CUTE_HOST_DEVICE constexpr friend bool operator< (sparse_elem const& a, sparse_elem const& b) { return a.elem_ <  b.elem_; }
+  CUTE_HOST_DEVICE constexpr friend bool operator<=(sparse_elem const& a, sparse_elem const& b) { return a.elem_ <= b.elem_; }
+  CUTE_HOST_DEVICE constexpr friend bool operator> (sparse_elem const& a, sparse_elem const& b) { return a.elem_ >  b.elem_; }
+  CUTE_HOST_DEVICE constexpr friend bool operator>=(sparse_elem const& a, sparse_elem const& b) { return a.elem_ >= b.elem_; }
+};
+
+template <class T>
+struct is_sparse : false_type {};                   // default: T is not a sparse element
+template <class T>
+struct is_sparse<T const> : is_sparse<T> {};        // const-qualification does not change the answer
+template <int S, class T>
+struct is_sparse<sparse_elem<S,T>> : true_type {};  // only sparse_elem instantiations are sparse
+template<class T>
+static constexpr auto is_sparse_v = is_sparse<T>::value;
+
+// Specialize sizeof_bits for sparse_elem.
+//   Much like subbyte element types, this is the effective number of bits in a sparse_elem
+//   rather than actual physical bits that may be used in storing one. Also like subbyte element
+//   types, modified iterators are required to properly index and access sparse_elems.
+//
+//   Defining sizeof_bits like this makes reasonable expressions like N * sizeof_bits_v<E> meaningful
+//   even when E is subbyte or sparse. However, this also means that sparse_elem can rather easily be
+//   confused with subbyte elements and special care should be taken with each.
+template <int S, class T>
+struct sizeof_bits<sparse_elem<S,T>> {
+  // Simple implementation that conforms to sizeof_bits (kept for reference, not compiled)
+  //static constexpr auto value = sizeof_bits<T>::value / S;
+  //static_assert(value != 0, "sizeof_bits=0 detected. Sparsity is larger than width.");
+  //static_assert((sizeof_bits<T>::value % S) == 0, "Width needs to be a multiple of sparsity.");
+
+  // Interesting experiment that allows any sparsity level to be used by potentially presenting
+  // an integral_ratio rather than size_t. This is valid in most integer expressions as well.
+  static constexpr auto value = cute::ratio(cute::Int<cute::sizeof_bits_v<T>>{}, cute::Int<S>{});
+};
+
+//
+// sparse_ptr: is_sparse_ptr trait first, then the iterator itself
+//
+
+template <class T, class = void>
+struct is_sparse_ptr : false_type {};  // default: T is not a sparse pointer
+template <class T>
+struct is_sparse_ptr<T, void_t<typename T::iterator>> : is_sparse_ptr<typename T::iterator> {};  // recurse into anything with a ::iterator
+
+template <int Sparsity, class Iterator>
+struct sparse_ptr : iter_adaptor<Iterator, sparse_ptr<Sparsity, Iterator>>
+{
+  using reference    = typename iterator_traits<Iterator>::reference;
+  using element_type = typename iterator_traits<Iterator>::element_type;
+  using value_type   = typename iterator_traits<Iterator>::value_type;
+
+  // Sanity, for now
+  static_assert(is_sparse<value_type>::value, "Enforce sparse value-type");
+  static_assert(Sparsity == iter_value_t<Iterator>::sparsity, "Enforce sparsity S");
+  static_assert(not is_sparse_ptr<Iterator>::value, "Enforce sparse singleton");  // no sparse_ptr wrapping another sparse_ptr
+
+  template <class Index>
+  CUTE_HOST_DEVICE constexpr
+  sparse_ptr operator+(Index const& i) const {
+    // Only allow offset by multiples of the sparsity factor,
+    // else the misalignments become a bug. E.g. (sparse_ptr<8,I>{} + 7) + 7
+    // Motivation for subsparse_iterator or generalization of subbyte_iterator?
+    assert(i % Sparsity == 0);
+    return {this->get() + i / Sparsity};  // i is a logical offset; the wrapped iterator advances by i / Sparsity
+  }
+
+  template <class Index>
+  CUTE_HOST_DEVICE constexpr
+  reference operator[](Index const& i) const {
+    // Allow offset by any value and dereference (i / Sparsity truncates, so no alignment assert here).
+    // Not implemented in terms of sparse_ptr::op+()
+    return *(this->get() + i / Sparsity);
+  }
+};
+
+template <int S, class I>
+struct is_sparse_ptr<sparse_ptr<S,I>> : true_type {};  // found a sparse_ptr
+
+template <int Sparsity, class Iter>
+CUTE_HOST_DEVICE constexpr
+auto
+make_sparse_ptr(Iter const& iter) {
+  if constexpr (Sparsity == 1) {
+    return iter;  // sparsity 1 needs no wrapper: the iterator is returned as-is
+  } else {
+    return sparse_ptr<Sparsity, Iter>{iter};
+  }
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <class NewT, int S, class Iter>
+CUTE_HOST_DEVICE constexpr
+auto
+recast_ptr(sparse_ptr<S,Iter> const& ptr) {  // recasting drops the sparse wrapper: the result is recast_ptr on the wrapped iterator
+  static_assert(not is_sparse<NewT>::value);  // the target element type must itself be non-sparse
+  return recast_ptr<NewT>(ptr.get());
+}
+
+//
+// Display utilities: emit a "sparse<S>_" prefix, then the wrapped iterator
+//
+
+template <int S, class Iter>
+CUTE_HOST_DEVICE void print(sparse_ptr<S,Iter> ptr)
+{
+  printf("sparse<%d>_", S); print(ptr.get());
+}
+
+#if !defined(__CUDACC_RTC__)  // the std::ostream overload is compiled out when __CUDACC_RTC__ is defined
+template <int S, class Iter>
+CUTE_HOST std::ostream& operator<<(std::ostream& os, sparse_ptr<S,Iter> ptr)
+{
+  return os << "sparse<" << S << ">_" << ptr.get();
+}
+#endif
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/pointer_swizzle.hpp b/lightllm-kernel/cutlass/include/cute/pointer_swizzle.hpp
new file mode 100755
index 000000000..720b9b124
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/pointer_swizzle.hpp
@@ -0,0 +1,168 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>                   // CUTE_HOST_DEVICE
+#include <cute/pointer_base.hpp>             // cute::iter_adaptor
+#include <cute/swizzle.hpp>                  // cute::Swizzle, cute::get_swizzle primary template
+#include <cute/util/type_traits.hpp>         // cute::iterator_traits
+#include <cute/container/array_subbyte.hpp>  // cute::subbyte_iterator
+
+/* This implements a swizzle pointer of the form
+ *   InvolutionFn o PtrAdd
+ * where the InvolutionFn need not be linear.
+ *
+ * This differs subtly from swizzle_layout because the smem pointer is used
+ * as the offset. That means that swizzle_layout will implement position-independent
+ * swizzle layouts, while swizzle_ptr implements position-dependent swizzle tensors.
+ * Arch chose to design hardware with position-dependent swizzles.
+ *
+ * For clarity:
+ *   NormalLayout  : DeRef <- PtrAdd <- [Layout]
+ *   ComposedLayout: DeRef <- PtrAdd <- [Swizzle <- OffsetAdd <- Layout]
+ *   SwizzlePtr    : [DeRef <- Swizzle <- PtrAdd] <- Layout
+ *
+ * Furthermore, for known swizzles, this pointer attempts to decay itself
+ *    to a normal-pointer with a new layout containing dynamic or static strides.
+ * This is possible by determining the subdomain of the InvolutionFn
+ *    that is identity and testing if the Layout's codomain is contained
+ *    within it.
+ */
+
+namespace cute
+{
+
+// concept SwizzleFn {
+//   CUTE_HOST_DEVICE constexpr static uint apply(uint);
+// }
+// See Swizzle<B,M,S> in swizzle.hpp for common swizzle-functions.
+
+template <class SwizzleFn, class Iterator>
+struct swizzle_ptr : iter_adaptor<Iterator,swizzle_ptr<SwizzleFn,Iterator>>
+{
+  using iterator     = Iterator;
+  using reference    = typename iterator_traits<iterator>::reference;
+  using element_type = typename iterator_traits<iterator>::element_type;
+  using value_type   = typename iterator_traits<iterator>::value_type;
+
+  using iter_adaptor<Iterator,swizzle_ptr<SwizzleFn,Iterator>>::iter_adaptor;  // inherit the base constructors
+
+  template <class Iter>
+  CUTE_HOST_DEVICE constexpr static
+  Iter apply_swizzle(Iter ptr) {  // generic wrapper case: swizzle what ptr.get() returns and rebuild an Iter from it
+    return {apply_swizzle(ptr.get())};
+  }
+
+  template <class T>
+  CUTE_HOST_DEVICE constexpr static
+  T* apply_swizzle(T* ptr) {  // raw pointer case: SwizzleFn::apply is run on the address value itself
+    return reinterpret_cast<T*>(SwizzleFn::apply(reinterpret_cast<uintptr_t>(ptr)));
+  }
+
+  template <class T>
+  CUTE_HOST_DEVICE constexpr static
+  subbyte_iterator<T> apply_swizzle(subbyte_iterator<T> ptr) {  // subbyte case: swizzle ptr_ and keep idx_ unchanged
+    return {apply_swizzle(ptr.ptr_), ptr.idx_};
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  reference operator*() const {  // the swizzle is applied at dereference time, not when the pointer is stored
+    return *apply_swizzle(this->get());
+  }
+
+  template <class Int>
+  CUTE_HOST_DEVICE constexpr
+  reference operator[](Int const& i) const {  // offset first, then swizzle the resulting position
+    return *apply_swizzle(this->get() + i);
+  }
+};
+
+//
+// Helper Function: get_swizzle specializations (the primary template is in swizzle.hpp)
+//
+template <class SwizzleFn, class P>                   // Found the SwizzleFn
+struct get_swizzle<swizzle_ptr<SwizzleFn,P>> { using type = SwizzleFn; };
+template <class T>                                    // Recurse into anything with a ::iterator
+struct get_swizzle<T, void_t<typename T::iterator>> : get_swizzle<typename T::iterator> {};
+
+template <class Iterator, class SwizzleFn>
+CUTE_HOST_DEVICE constexpr
+swizzle_ptr<SwizzleFn,Iterator>
+make_swizzle_ptr(Iterator ptr, SwizzleFn) {  // general case: the SwizzleFn argument is used only for its type
+  return {ptr};
+}
+
+// Swizzle-0 specialization for immediate decay: Swizzle<0,M,S> returns the plain iterator, unwrapped
+template <class Iterator, int M, int S>
+CUTE_HOST_DEVICE constexpr
+Iterator
+make_swizzle_ptr(Iterator ptr, Swizzle<0,M,S>) {
+  return ptr;
+}
+
+//
+// Recast: helpers that look through or rebuild the swizzle wrapper
+//
+
+template <class SwizzleFn, class P>
+CUTE_HOST_DEVICE constexpr
+auto
+raw_pointer_cast(swizzle_ptr<SwizzleFn,P> const& ptr) {  // unwraps to the stored iterator; SwizzleFn is not applied here
+  return raw_pointer_cast(ptr.get());
+}
+
+// SwizzleFn operates on the pointer address, so it doesn't care about the type: recast inside, re-wrap with the same SwizzleFn
+template <class NewT, class SwizzleFn, class P>
+CUTE_HOST_DEVICE constexpr
+auto
+recast_ptr(swizzle_ptr<SwizzleFn,P> const& ptr) {
+  return make_swizzle_ptr(recast_ptr<NewT>(ptr.get()), SwizzleFn{});
+}
+
+//
+// Display utilities: emit the SwizzleFn, an underscore, then the wrapped iterator
+//
+
+template <class SwizzleFn, class P>
+CUTE_HOST_DEVICE void print(swizzle_ptr<SwizzleFn,P> ptr)
+{
+  print(SwizzleFn{}); printf("_"); print(ptr.get());
+}
+
+#if !defined(__CUDACC_RTC__)  // the std::ostream overload is compiled out when __CUDACC_RTC__ is defined
+template <class SwizzleFn, class P>
+CUTE_HOST std::ostream& operator<<(std::ostream& os, swizzle_ptr<SwizzleFn,P> ptr)
+{
+  return os << SwizzleFn{} << "_" << ptr.get();
+}
+#endif
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/stride.hpp b/lightllm-kernel/cutlass/include/cute/stride.hpp
new file mode 100755
index 000000000..f2d31f4e3
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/stride.hpp
@@ -0,0 +1,598 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>                     // CUTE_HOST_DEVICE
+#include <cute/util/type_traits.hpp>           // cute::__CUTE_REQUIRES
+#include <cute/container/tuple.hpp>            // cute::is_tuple
+#include <cute/numeric/integral_constant.hpp>  // cute::is_integral
+#include <cute/numeric/integer_sequence.hpp>   // cute::seq
+#include <cute/numeric/math.hpp>               // cute::divmod
+#include <cute/numeric/arithmetic_tuple.hpp>   // cute::basis_get
+#include <cute/algorithm/functional.hpp>       // cute::identity
+#include <cute/algorithm/tuple_algorithms.hpp> // cute::fold
+#include <cute/int_tuple.hpp>                  // cute::is_congruent
+
+namespace cute
+{
+
+/** crd2idx(c,s,d) maps a coordinate within <Shape,Stride> to an index
+ *
+ * This is computed as follows:
+ *  [coord, shape, and stride are all integers => step forward by stride]
+ * op(c, s, d)             => c * d
+ *  [coord is integer, shape and stride are tuple => divmod coord for each mode]
+ * op(c, (s,S), (d,D))     => op(c % prod(s), s, d) + op(c / prod(s), (S), (D))
+ *  [coord, shape, and stride are all tuples => consider each mode independently]
+ * op((c,C), (s,S), (d,D)) => op(c, s, d) + op((C), (S), (D))
+ */
+template <class Coord, class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+crd2idx(Coord  const& coord,
+        Shape  const& shape,
+        Stride const& stride);  // Declared ahead of its definition so the detail helpers below can call back into it
+
+namespace detail {
+
+template <class Coord, class Shape, class Stride, int... Is>
+CUTE_HOST_DEVICE constexpr
+auto
+crd2idx_ttt(Coord  const& coord,
+            Shape  const& shape,
+            Stride const& stride, seq<Is...>)
+{
+  return (... + crd2idx(get<Is>(coord), get<Is>(shape), get<Is>(stride)));  // Fold: sum crd2idx over every mode listed in Is
+}
+
+template <class CInt, class STuple, class DTuple, int I0, int... Is>
+CUTE_HOST_DEVICE constexpr
+auto
+crd2idx_itt(CInt   const& coord,
+            STuple const& shape,
+            DTuple const& stride, seq<I0,Is...>)
+{
+  if constexpr (sizeof...(Is) == 0) {  // Avoid recursion and mod on single/last iter
+    return crd2idx(coord, get<I0>(shape), get<I0>(stride));
+  } else if constexpr (is_constant<0, CInt>::value) {  // Static-zero coord: pass _0 to every mode, no divmod needed
+    return crd2idx(_0{}, get<I0>(shape), get<I0>(stride))
+         + (_0{} + ... + crd2idx(_0{}, get<Is>(shape), get<Is>(stride)));
+  } else {                             // General case
+    auto [div, mod] = divmod(coord, product(get<I0>(shape)));  // Split coord by the product of mode I0's shape
+    return crd2idx(mod, get<I0>(shape), get<I0>(stride))       // mod is the coordinate within mode I0
+         + crd2idx_itt(div, shape, stride, seq<Is...>{});      // div is carried on to the remaining modes
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+} // end namespace detail
+
+template <class Coord, class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+crd2idx(Coord  const& coord,
+        Shape  const& shape,
+        Stride const& stride)  // Dispatches at compile time on whether coord and shape are tuples
+{
+  if constexpr (is_tuple<Coord>::value) {
+    if constexpr (is_tuple<Shape>::value) {      // tuple tuple tuple
+      static_assert(tuple_size<Coord>::value == tuple_size< Shape>::value, "Mismatched Ranks");
+      static_assert(tuple_size<Coord>::value == tuple_size<Stride>::value, "Mismatched Ranks");
+      return detail::crd2idx_ttt(coord, shape, stride, tuple_seq<Coord>{});
+    } else {                                     // tuple "int" "int"
+      static_assert(sizeof(Coord) == 0, "Invalid parameters");  // Type-dependent false: fires only if this branch is instantiated
+    }
+  } else {
+    if constexpr (is_tuple<Shape>::value) {      // "int" tuple tuple
+      static_assert(tuple_size<Shape>::value == tuple_size<Stride>::value, "Mismatched Ranks");
+      return detail::crd2idx_itt(coord, shape, stride, tuple_seq<Shape>{});
+    } else {                                     // "int" "int" "int"
+      return coord * stride;                     // Leaf case: shape is not consulted
+    }
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+namespace detail {
+
+template <class CTuple, class STuple, int I0, int... Is>
+CUTE_HOST_DEVICE constexpr
+auto
+crd2idx_horner(CTuple const& coord,
+               STuple const& shape, seq<I0,Is...>)
+{
+  if constexpr (sizeof...(Is) != 0) {  // More modes remain: c_I0 + s_I0 * (index of the remaining modes)
+    return get<I0>(coord) + get<I0>(shape) * crd2idx_horner(coord, shape, seq<Is...>{});
+  } else {                             // Last (or only) mode terminates the recursion
+    return get<I0>(coord);
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+} // end namespace detail
+
+/** crd2idx(c,s) maps a coordinate within Shape to an index
+ * via a colexicographical enumeration of coordinates in Shape.
+ * i = c0 + s0 * (c1 + s1 * (c2 + s2 * ...))
+ */
+template <class Coord, class Shape>
+CUTE_HOST_DEVICE constexpr
+auto
+crd2idx(Coord const& coord,
+        Shape const& shape)
+{
+  if constexpr (is_integral<Coord>::value) {  // Coord is already an index
+    return coord;
+  } else if constexpr (is_integral<Shape>::value) {  // Non-integral coord with an integral shape is rejected
+    static_assert(dependent_false<Shape>, "Invalid parameters");
+  } else {                                    // Make congruent, flatten, and apply Horner's method
+    static_assert(tuple_size<Coord>::value == tuple_size<Shape>::value, "Mismatched Ranks");
+    auto flat_coord = flatten(coord);
+    auto flat_shape = flatten(product_like(shape, coord));  // NOTE(review): presumably collapses shape to coord's profile first -- confirm against product_like
+    return detail::crd2idx_horner(flat_coord, flat_shape, tuple_seq<decltype(flat_shape)>{});
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+/** idx2crd(i,s,d) splits an index into a coordinate within <Shape,Stride>.
+ *
+ * This is computed as follows:
+ *  [index, shape, and stride are all integers => determine 1D coord]
+ * op(i, s, d)             => (i / d) % s
+ *  [index is integer, shape and stride are tuple => determine component for each mode]
+ * op(i, (s,S), (d,D))     => (op(i, s, d), op(i, S, D)...)
+ *  [index, shape, and stride are all tuples => consider each mode independently]
+ * op((i,I), (s,S), (d,D)) => (op(i, s, d), op((I), (S), (D)))
+ *  [index and stride are integers, shape is tuple => strides come from compact_col_major(shape, stride)]
+ * NOTE: This only works for compact shape+stride layouts. A more general version would
+ *       apply to all surjective layouts
+ */
+template <class Index, class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+idx2crd(Index  const& idx,
+        Shape  const& shape,
+        Stride const& stride)
+{
+  if constexpr (is_tuple<Index>::value) {
+    if constexpr (is_tuple<Shape>::value) {      // tuple tuple tuple
+      static_assert(tuple_size<Index>::value == tuple_size< Shape>::value, "Mismatched Ranks");
+      static_assert(tuple_size<Index>::value == tuple_size<Stride>::value, "Mismatched Ranks");
+      return transform(idx, shape, stride, [](auto const& i, auto const& s, auto const& d){ return idx2crd(i,s,d); });
+    } else {                                     // tuple "int" "int"
+      static_assert(sizeof(Index) == 0, "Invalid parameters");  // Type-dependent false: fires only if this branch is instantiated
+    }
+  } else {
+    if constexpr (is_tuple<Shape>::value) {
+      if constexpr (is_tuple<Stride>::value) {   // "int" tuple tuple
+        static_assert(tuple_size<Shape>::value == tuple_size<Stride>::value, "Mismatched Ranks");
+        return transform(shape, stride, [&](auto const& s, auto const& d){ return idx2crd(idx,s,d); });  // The same idx is applied to every mode
+      } else {                                   // "int" tuple "int"
+        return transform(shape, compact_col_major(shape, stride), [&](auto const& s, auto const& d){ return idx2crd(idx,s,d); });
+      }
+    } else {                                     // "int" "int" "int"
+      if constexpr (is_constant<1, Shape>::value) {
+        // Skip potential stride-0 division
+        return Int<0>{};
+      } else {
+        return (idx / stride) % shape;
+      }
+    }
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+/** idx2crd(i,s) splits an index into a coordinate within Shape
+ * via a colexicographical enumeration of coordinates in Shape.
+ * c0 = (idx / 1) % s0
+ * c1 = (idx / s0) % s1
+ * c2 = (idx / (s0 * s1)) % s2
+ * ...
+ */
+template <class Index, class Shape>
+CUTE_HOST_DEVICE constexpr
+auto
+idx2crd(Index const& idx,
+        Shape const& shape)
+{
+  if constexpr (is_tuple<Index>::value) {
+    if constexpr (is_tuple<Shape>::value) {      // tuple tuple
+      static_assert(tuple_size<Index>::value == tuple_size<Shape>::value, "Mismatched Ranks");
+      return transform(idx, shape, [](auto const& i, auto const& s) { return idx2crd(i,s); });
+    } else {                                     // tuple "int"
+      static_assert(sizeof(Index) == 0, "Invalid parameters");  // Type-dependent false: fires only if this branch is instantiated
+    }
+  } else {
+    if constexpr (is_tuple<Shape>::value) {      // "int" tuple: crd2idx is called with make_basis_like(shape) as the stride
+      return transform_leaf(as_arithmetic_tuple(crd2idx(idx, shape, make_basis_like(shape))), identity{});
+    } else {                                     // "int" "int"
+      return idx;                                // Returned unchanged: no modulo by shape is applied here
+    }
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+//
+// crd2crd
+//
+
+template <class Coord, class SShape, class DShape>
+CUTE_HOST_DEVICE constexpr
+auto
+crd2crd(Coord  const& coord,
+        SShape const& src_shape,
+        DShape const& dst_shape)
+{
+  if constexpr (is_tuple<Coord>::value && is_tuple<SShape>::value && is_tuple<DShape>::value) {  // All tuples: convert mode by mode
+    static_assert(tuple_size<Coord>::value == tuple_size<SShape>::value, "Mismatched Ranks");
+    static_assert(tuple_size<Coord>::value == tuple_size<DShape>::value, "Mismatched Ranks");
+    return transform(coord, src_shape, dst_shape, [](auto const& c, auto const& s, auto const& d) { return crd2crd(c,s,d); });
+  } else {
+    // assert(size(src_shape) == size(dst_shape))
+    return idx2crd(crd2idx(coord, src_shape), dst_shape);  // Otherwise: flatten to an index in src_shape, then split it in dst_shape
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+//
+// Compact Major
+//
+
+// Tags for common layouts and dispatching (only declared here; the struct bodies appear further down in this file)
+struct LayoutLeft;               // Col-major layout mapping; leftmost extent has stride 1
+using GenColMajor = LayoutLeft;  // Alias
+
+struct LayoutRight;              // Row-major layout mapping; rightmost extent has stride 1
+using GenRowMajor = LayoutRight; // Alias
+
+namespace detail {
+
+// For GCC8.5 -- avoid lambdas in unevaluated contexts; use function objects specialized per Major instead.
+template <class Major>
+struct CompactLambda;
+
+// @pre is_integral<Current>
+// Return (result, current * product(shape)) to enable recurrence
+template <class Major, class Shape, class Current>
+CUTE_HOST_DEVICE constexpr
+auto
+compact(Shape   const& shape,
+        Current const& current)
+{
+  if constexpr (is_tuple<Shape>::value) { // Shape::tuple Current::int
+    using Lambda = CompactLambda<Major>;                  // Append or Prepend
+    using Seq    = typename Lambda::template seq<Shape>;  // Seq or RSeq
+    return cute::detail::fold(shape, cute::make_tuple(cute::make_tuple(), current), Lambda{}, Seq{});  // Seed: (empty result, starting current)
+  } else {                                // Shape::int Current::int
+    if constexpr (is_constant<1, Shape>::value) {  // Static extent 1: result is Int<0>, current is passed on unchanged
+      return cute::make_tuple(Int<0>{}, current); // If current is dynamic, this could save a reg
+    } else {
+      return cute::make_tuple(current, current * shape);  // (result for this extent, current for the next one)
+    }
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+// For GCC8.5 -- Specialization LayoutLeft
+template <>
+struct CompactLambda<LayoutLeft>
+{
+  template <class Init, class Shape>
+  CUTE_HOST_DEVICE constexpr auto
+  operator()(Init const& init, Shape const& si) {  // init = (results so far, current); si = the next mode of the shape
+    auto result = detail::compact<LayoutLeft>(si, get<1>(init));
+    return cute::make_tuple(append(get<0>(init), get<0>(result)), get<1>(result));  // Append
+  }
+
+  template <class Shape>
+  using seq = tuple_seq<Shape>;                                                     // Seq
+};
+
+// For GCC8.5 -- Specialization LayoutRight
+template <>
+struct CompactLambda<LayoutRight>
+{
+  template <class Init, class Shape>
+  CUTE_HOST_DEVICE constexpr auto
+  operator()(Init const& init, Shape const& si) {  // init = (results so far, current); si = the next mode of the shape
+    auto result = detail::compact<LayoutRight>(si, get<1>(init));
+    return cute::make_tuple(prepend(get<0>(init), get<0>(result)), get<1>(result));  // Prepend
+  }
+
+  template <class Shape>
+  using seq = tuple_rseq<Shape>;                                                     // RSeq
+};
+
+} // end namespace detail
+
+template <class Major, class Shape, class Current = Int<1>,
+          __CUTE_REQUIRES(is_tuple<Shape>::value || is_integral<Shape>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+compact_major(Shape   const& shape,
+              Current const& current = {})
+{
+  if constexpr (is_tuple<Current>::value) {    // Shape::tuple Current::tuple
+    static_assert(is_tuple<Shape>::value, "Invalid parameters");
+    static_assert(tuple_size<Shape>::value == tuple_size<Current>::value, "Mismatched Ranks");
+    // Recurse to apply to the terminals of current
+    return transform(shape, current, [&](auto const& s, auto const& c){ return compact_major<Major>(s,c); });
+  } else {                                     // Non-tuple current: keep only the result half of compact's (result, next current) pair
+    return get<0>(detail::compact<Major>(shape, current));
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+//
+// Compact Col Major
+//
+
+struct LayoutLeft {
+  template <class Shape>
+  using Apply = decltype(compact_major<LayoutLeft>(declval<Shape>()));  // Type returned by compact_major<LayoutLeft> for a Shape
+};
+
+template <class Shape, class Current = Int<1>>
+CUTE_HOST_DEVICE constexpr
+auto
+compact_col_major(Shape   const& shape,
+                  Current const& current = {})  // current defaults to a value-initialized Int<1>
+{
+  return compact_major<LayoutLeft>(shape, current);  // Thin wrapper that fixes Major = LayoutLeft
+}
+
+//
+// Compact Row Major
+//
+
+struct LayoutRight {
+  template <class Shape>
+  using Apply = decltype(compact_major<LayoutRight>(declval<Shape>()));  // Type returned by compact_major<LayoutRight> for a Shape
+};
+
+template <class Shape, class Current = Int<1>>
+CUTE_HOST_DEVICE constexpr
+auto
+compact_row_major(Shape   const& shape,
+                  Current const& current = {})  // current defaults to a value-initialized Int<1>
+{
+  return compact_major<LayoutRight>(shape, current);  // Thin wrapper that fixes Major = LayoutRight
+}
+
+//
+// Compact Order -- compute a compact stride based on an ordering of the modes
+//
+
+namespace detail {
+
+// @pre weakly_congruent(order, shape)
+// @pre is_congruent<RefShape, RefOrder>
+// @pre is_static<Order>
+// @pre is_static<RefOrder>
+template <class Shape, class Order, class RefShape, class RefOrder>
+CUTE_HOST_DEVICE constexpr
+auto
+compact_order(Shape const& shape, Order const& order,
+              RefShape const& ref_shape, RefOrder const& ref_order)
+{
+  if constexpr (is_tuple<Order>::value) {  // Tuple order: recurse per mode, passing the same reference shape/order down
+    static_assert(tuple_size<Shape>::value == tuple_size<Order>::value, "Need equal rank of shape and order");
+    return transform(shape, order, [&](auto const& s, auto const& o) { return compact_order(s, o, ref_shape, ref_order); });
+  } else {
+    // Compute the starting stride for this shape by accumulating all shapes corresponding to lesser orders
+    auto stride_start = product(transform(ref_shape, ref_order,
+                                          [&](auto const& s, auto const& o) {
+                                            return conditional_return(o < order, s, Int<1>{});  // Extents with a smaller order count; others contribute 1
+                                          }));
+    return compact_col_major(shape, stride_start);
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+} // end namespace detail
+
+template <class Shape, class Order>
+CUTE_HOST_DEVICE constexpr
+auto
+compact_order(Shape const& shape, Order const& order)
+{
+  auto ref_shape = flatten_to_tuple(product_like(shape, order));
+
+  auto flat_order = flatten_to_tuple(order);
+  // Find the largest static element of order
+  auto max_order = cute::fold(flat_order, Int<0>{}, [](auto v, auto order) {
+    if constexpr (is_constant<true, decltype(v < order)>::value) {  // Taken only when v < order is a static true
+      return order;
+    } else {
+      return v;
+    }
+
+    CUTE_GCC_UNREACHABLE;
+  });
+  // Replace any dynamic elements within order with large-static elements
+  auto max_seq = make_range<max_order+1, max_order+1+rank(flat_order)>{};  // Static values starting just above max_order, one per flat_order entry
+  auto ref_order = cute::transform(max_seq, flat_order, [](auto seq_v, auto order) {
+    if constexpr (is_static<decltype(order)>::value) {
+      return order;
+    } else {
+      return seq_v;
+    }
+
+    CUTE_GCC_UNREACHABLE;
+  });
+
+  auto new_order = unflatten(ref_order, order);  // Give the all-static order the same nesting as the original order
+
+  return detail::compact_order(shape, new_order, ref_shape, ref_order);
+}
+
+template <class Shape>
+CUTE_HOST_DEVICE constexpr
+auto
+compact_order(Shape const& shape, GenColMajor const& major)  // major is a dispatch tag only; its value is unused
+{
+  return compact_major<LayoutLeft>(shape);
+}
+
+template <class Shape>
+CUTE_HOST_DEVICE constexpr
+auto
+compact_order(Shape const& shape, GenRowMajor const& major)  // major is a dispatch tag only; its value is unused
+{
+  return compact_major<LayoutRight>(shape);
+}
+
+//
+// Coordinate iterator
+//
+
+namespace detail {
+
+template <class Coord, class Shape, class Order>
+CUTE_HOST_DEVICE constexpr
+void
+increment(Coord& coord, Shape const& shape, Order const& order)
+{
+  ++basis_get(get<0>(order), coord);  // Advance the element selected by the first entry of order
+  cute::for_each(make_range<1, tuple_size<Order>::value>{}, [&](auto i){
+    if (basis_get(get<i-1>(order), coord) == basis_get(get<i-1>(order), shape)) {  // Previous element reached its extent
+      basis_get(get<i-1>(order), coord) = 0;  // Reset it ...
+      ++basis_get(get<i>(order), coord);      // ... and carry into the next element; the last element is never reset
+    }
+  });
+}
+
+/** Increment a (dynamic) coord colexicographically within a shape
+ * @pre is_congruent<Coord,Shape>::value
+ * \code
+ *   auto shape = make_shape(1,2,make_shape(2,3),3);
+ *   auto coord = repeat_like(shape, 0);
+ *
+ *   for (int i = 0; i < size(shape); ++i) {
+ *     std::cout << i << ": " << coord << std::endl;
+ *     increment(coord, shape);
+ *   }
+ * \endcode
+ */
+template <class Coord, class Shape>
+CUTE_HOST_DEVICE constexpr
+void
+increment(Coord& coord, Shape const& shape)
+{
+  increment(coord, shape, flatten_to_tuple(make_basis_like(shape)));  // Delegate to the ordered overload with the flattened basis of shape as the order
+}
+
+} // end namespace detail
+
+struct ForwardCoordIteratorSentinel  // Empty tag type; ForwardCoordIterator compares against it to detect the end of its range
+{};
+
+// A forward iterator for a starting coordinate in a shape's domain, and a shape.
+// The starting coordinate may be zero but need not necessarily be.
+template <class Coord, class Shape, class Order>
+struct ForwardCoordIterator
+{
+  static_assert(is_congruent<Coord, Shape>::value);
+
+  CUTE_HOST_DEVICE constexpr
+  Coord const& operator*() const { return coord; }
+  CUTE_HOST_DEVICE constexpr
+  ForwardCoordIterator& operator++() { detail::increment(coord, shape, Order{}); return *this; }
+  // End test: compares the element selected by the last entry of Order in coord against the same element of shape
+  CUTE_HOST_DEVICE constexpr
+  bool operator==(ForwardCoordIteratorSentinel const&) const { return basis_get(back(Order{}), coord) == basis_get(back(Order{}), shape); }
+  CUTE_HOST_DEVICE constexpr
+  bool operator!=(ForwardCoordIteratorSentinel const&) const { return basis_get(back(Order{}), coord) != basis_get(back(Order{}), shape); }
+  // NOTE: These are expensive, avoid use
+  CUTE_HOST_DEVICE constexpr
+  bool operator==(ForwardCoordIterator const& other) const { return coord == other.coord; }
+  CUTE_HOST_DEVICE constexpr
+  bool operator!=(ForwardCoordIterator const& other) const { return coord != other.coord; }
+
+  Coord coord;         // Current coordinate, advanced in place by operator++
+  Shape const& shape;  // Held by reference: the referenced shape must outlive this iterator
+};
+
+// A forward iterator for a coordinate that starts from a provided coordinate and increments in a prescribed order
+template <class Order, class Shape, class Coord>
+CUTE_HOST_DEVICE constexpr
+auto
+make_coord_iterator(Coord const& coord, Shape const& shape)
+{
+  static_assert(is_congruent<Coord, Shape>::value);
+  static_assert(is_congruent<Order, Coord>::value);
+  static_assert(is_congruent<Order, Shape>::value);
+  auto flat_order  = flatten_to_tuple(Order{});
+  auto inv_order   = transform(make_seq<rank(flat_order)>{}, [&](auto i){ return find(flat_order, i); });  // For each i, look i up in flat_order (presumably its position -- confirm against find)
+  auto basis_order = transform_leaf(inv_order, [&](auto i) { return get<i>(flatten_to_tuple(make_basis_like(shape))); });  // Map each entry to the i-th flattened basis element of shape
+  return ForwardCoordIterator<Coord,Shape,decltype(basis_order)>{coord,shape};  // The iterator stores a reference to shape
+}
+
+// A forward iterator for a coordinate that starts from a provided coordinate and increments colex
+template <class Shape, class Coord>
+CUTE_HOST_DEVICE constexpr
+auto
+make_coord_iterator(Coord const& coord, Shape const& shape)
+{
+  static_assert(is_congruent<Coord, Shape>::value);
+  auto basis_order = flatten_to_tuple(make_basis_like(shape));  // Only the type of basis_order is used (as the iterator's Order)
+  return ForwardCoordIterator<Coord,Shape,decltype(basis_order)>{coord,shape};  // The iterator stores a reference to shape
+}
+
+// A forward iterator for a coordinate that starts from zero and increments in a prescribed order
+template <class Order, class Shape>
+CUTE_HOST_DEVICE constexpr
+auto
+make_coord_iterator(Shape const& shape)
+{
+  return make_coord_iterator<Order>(repeat_like(shape, int(0)), shape);  // Starting coord is repeat_like(shape, int(0))
+}
+
+// A forward iterator for a coordinate that starts from zero and increments colex
+template <class Shape>
+CUTE_HOST_DEVICE constexpr
+auto
+make_coord_iterator(Shape const& shape)
+{
+  return make_coord_iterator(repeat_like(shape, int(0)), shape);  // Starting coord is repeat_like(shape, int(0))
+}
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/swizzle.hpp b/lightllm-kernel/cutlass/include/cute/swizzle.hpp
new file mode 100755
index 000000000..52abf856d
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/swizzle.hpp
@@ -0,0 +1,498 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>                      // CUTE_HOST_DEVICE
+#include <cute/container/tuple.hpp>             // cute::is_tuple
+#include <cute/numeric/integral_constant.hpp>   // cute::constant
+#include <cute/numeric/math.hpp>                // cute::max, cute::min
+#include <cute/algorithm/tuple_algorithms.hpp>  // cute::transform_apply
+
+namespace cute
+{
+
+// A generic Swizzle functor: apply() XORs the bits selected by the YYY mask into the ZZZ bit positions
+/* 0bxxxxxxxxxxxxxxxYYYxxxxxxxZZZxxxx
+ *                               ^--^ MBase is the number of least-sig bits to keep constant
+ *                  ^-^       ^-^     BBits is the number of bits in the mask
+ *                    ^---------^     SShift is the distance to shift the YYY mask
+ *                                       (pos shifts YYY to the right, neg shifts YYY to the left)
+ *
+ * e.g. Given
+ * 0bxxxxxxxxxxxxxxxxYYxxxxxxxxxZZxxx
+ * the result is
+ * 0bxxxxxxxxxxxxxxxxYYxxxxxxxxxAAxxx where AA = ZZ xor YY
+ */
+template <int BBits, int MBase, int SShift = BBits>
+struct Swizzle
+{
+  static constexpr int num_bits = BBits;
+  static constexpr int num_base = MBase;
+  static constexpr int num_shft = SShift;
+
+  static_assert(num_base >= 0,             "MBase must be non-negative.");
+  static_assert(num_bits >= 0,             "BBits must be non-negative.");
+  static_assert(abs(num_shft) >= num_bits, "abs(SShift) must be at least BBits.");
+
+  // using 'int' type here to avoid unintentionally casting to unsigned... unsure.
+  using bit_msk = cute::constant<int, (1 << num_bits) - 1>;                         // num_bits low bits set
+  using yyy_msk = cute::constant<int, bit_msk{} << (num_base + max(0,num_shft))>;   // moved up by num_shft only when it is positive
+  using zzz_msk = cute::constant<int, bit_msk{} << (num_base - min(0,num_shft))>;   // moved up by -num_shft only when it is negative
+  using msk_sft = cute::constant<int, num_shft>;
+
+  static constexpr uint32_t swizzle_code = uint32_t(yyy_msk{} | zzz_msk{});         // Union of both masks
+
+  template <class Offset>
+  CUTE_HOST_DEVICE constexpr static
+  auto
+  apply(Offset const& offset)
+  {
+    return offset ^ shiftr(offset & yyy_msk{}, msk_sft{});   // ZZZ ^= YYY
+  }
+
+  template <class Offset>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  operator()(Offset const& offset) const
+  {
+    return apply(offset);                                    // Functor form of apply()
+  }
+
+  template <int B, int M, int S>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  operator==(Swizzle<B,M,S> const&) const
+  {
+    return B == BBits && M == MBase && S == SShift;          // Equal only when all three template parameters match
+  }
+};
+
+// Build the Swizzle<B,M,S> whose yyy mask is Y and whose zzz mask is Z (checked by the final static_assert), e.g.
+// make_swizzle<0b1000, 0b0100>()         ->  Swizzle<1,2,1>
+// make_swizzle<0b11000000, 0b00000110>() ->  Swizzle<2,1,5>
+//
+
+template <uint32_t Y, uint32_t Z>
+CUTE_HOST_DEVICE constexpr
+auto
+make_swizzle()
+{
+  constexpr uint32_t BY = popcount(Y);                    // Number of swizzle bits in Y
+  constexpr uint32_t BZ = popcount(Z);                    // Number of swizzle bits in Z
+  static_assert(BY == BZ, "Number of bits in Y and Z don't match");
+  constexpr uint32_t TZ_Y = countr_zero(Y);               // Number of trailing zeros in Y
+  constexpr uint32_t TZ_Z = countr_zero(Z);               // Number of trailing zeros in Z
+  constexpr uint32_t M = cute::min(TZ_Y, TZ_Z) % 32;      // Base = lowest set bit of either mask, reduced mod 32
+  constexpr  int32_t S = int32_t(TZ_Y) - int32_t(TZ_Z);   // Difference in trailing zeros
+  static_assert((Y | Z) == Swizzle<BY,M,S>::swizzle_code, "Something went wrong.");  // The rebuilt masks must reproduce Y|Z exactly
+  return Swizzle<BY,M,S>{};
+}
+
+template <int B0, int M0, int S0,
+          int B1, int M1, int S1>
+CUTE_HOST_DEVICE constexpr
+auto
+composition(Swizzle<B0,M0,S0>, Swizzle<B1,M1,S1>)  // Merge two same-shift swizzles into a single Swizzle
+{
+  static_assert(S0 == S1, "Can only merge swizzles of the same shift.");
+  using Lhs = Swizzle<B0,M0,S0>;
+  using Rhs = Swizzle<B1,M1,S1>;
+  constexpr uint32_t Y = Lhs::yyy_msk::value ^ Rhs::yyy_msk::value;  // XOR of the two YYY masks
+  constexpr uint32_t Z = Lhs::zzz_msk::value ^ Rhs::zzz_msk::value;  // XOR of the two ZZZ masks
+  return make_swizzle<Y,Z>();
+}
+
+//
+// Utility for slicing and swizzle "offsets"
+//
+
+// For swizzle functions, it is often needed to keep track of which bits are
+//   consumed and which bits are free. Furthermore, it is useful to know whether
+// each of these bits is known statically or dynamically.
+
+// MixedBits is a 32-bit unsigned integer class where some bits are known statically
+//   and some bits are known dynamically. These sets of bits are disjoint and it is
+//   known statically which bits are known dynamically.
+
+// MixedBits can only be manipulated through bitwise operations
+
+// Abstract value:  StaticInt | (dynamic_int_ & StaticFlags)
+template <uint32_t StaticInt,      // Bits whose value is known at compile time
+          uint32_t StaticFlags>    // 0: static, 1: dynamic
+struct MixedBits
+{
+  // Representation invariants
+  static_assert(StaticFlags != 0, "Should be at least one dynamic bit in MixedBits.");
+  static_assert((StaticInt & StaticFlags) == 0, "No static/dynamic overlap allowed in MixedBits.");
+
+  uint32_t dynamic_int_;  // Run-time bits; expected to be zero outside StaticFlags (see the commented-out assert below)
+  // assert((dynamic_int_ & ~StaticFlags) == 0);
+
+  CUTE_HOST_DEVICE constexpr operator uint32_t() const noexcept { return StaticInt | dynamic_int_; }  // Implicit conversion to the combined value
+};
+
+// Return a value representing (C<s>{} | (d & C<f>)) potentially using MixedBits to track s and f.
+// This maker does allow ((s & f) != 0) and enforces the MixedBits invariant before creation.
+template <auto s, class DynamicType, auto f>
+CUTE_HOST_DEVICE constexpr
+auto
+make_mixed_bits(C<s>, DynamicType const& d, C<f>)
+{
+  static_assert(is_integral<DynamicType>::value);
+  constexpr uint32_t new_f = uint32_t(f) & ~uint32_t(s);        // StaticBits take precedence, M<0,f>{d} | C<s>{}
+  if constexpr (new_f == 0 || is_static<DynamicType>::value) {  // No dynamic bits remain, or d itself is static
+    return C<s>{} | (d & C<new_f>{});                           // Just return a static int
+  } else {
+    return MixedBits<s, new_f>{uint32_t(d) & new_f};            // MixedBits
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+//
+// Operators
+//
+
+// Equality: the static bits and the dynamic bits must each match the corresponding bits of S1
+template <uint32_t S0, uint32_t F0, auto S1>
+CUTE_HOST_DEVICE constexpr
+auto
+operator==(MixedBits<S0,F0> const& m, C<S1>)
+{
+  return (S0 == (uint32_t(S1) & ~F0)) && (m.dynamic_int_ == (uint32_t(S1) & F0));
+}
+
+template <uint32_t S0, uint32_t F0, auto S1>
+CUTE_HOST_DEVICE constexpr
+auto
+operator==(C<S1> s, MixedBits<S0,F0> const& m)
+{
+  return m == s;  // Swap the operands and reuse the MixedBits == C overload
+}
+
+// Bitwise AND
+template <uint32_t S0, uint32_t F0,
+          uint32_t S1, uint32_t F1>
+CUTE_HOST_DEVICE constexpr
+auto
+operator&(MixedBits<S0,F0> const& m0, MixedBits<S1,F1> const& m1)
+{
+  // Truth table for (S0,D0,F0) & (S1,D1,F1) -> (S,D,F)
+  //   S0D0F0  | 0X0 | 001 | 011 | 1X0 |
+  // S1D1F1
+  //  0X0      | 0X0 | 0X0 | 0X0 | 0X0 |
+  //  001      | 0X0 | 001 | 001 | 001 |
+  //  011      | 0X0 | 001 | 011 | 011 |
+  //  1X0      | 0X0 | 001 | 011 | 1X0 |
+
+  return make_mixed_bits(C<S0 & S1>{},  // A result bit is statically 1 only where both operands are statically 1
+                         //(S0 | m0.dynamic_int_) & (S1 | m1.dynamic_int_),
+                         ((S1 & F0) & m0.dynamic_int_) | ((S0 & F1) & m1.dynamic_int_) | (m0.dynamic_int_ & m1.dynamic_int_),
+                         C<(S1 & F0) | (S0 & F1) | (F0 & F1)>{});  // Dynamic where one side is dynamic and the other is static-1 or dynamic
+}
+
+template <uint32_t S0, uint32_t F0, auto S1>
+CUTE_HOST_DEVICE constexpr
+auto
+operator&(MixedBits<S0,F0> const& m, C<S1>)
+{
+  return make_mixed_bits(C<S0 & uint32_t(S1)>{},   // Static bits kept by the constant mask
+                         m.dynamic_int_,
+                         C<F0 & uint32_t(S1)>{});  // Dynamic bits kept by the constant mask
+}
+
+template <uint32_t S0, uint32_t F0, auto S1>
+CUTE_HOST_DEVICE constexpr
+auto
+operator&(C<S1> s, MixedBits<S0,F0> const& m)
+{
+  return m & s;  // Swap the operands and reuse the MixedBits & C overload
+}
+
+// Bitwise OR
+template <uint32_t S0, uint32_t F0,
+          uint32_t S1, uint32_t F1>
+CUTE_HOST_DEVICE constexpr
+auto
+operator|(MixedBits<S0,F0> const& m0, MixedBits<S1,F1> const& m1)
+{
+  // Truth table for (S0,D0,F0) | (S1,D1,F1) -> (S,D,F)
+  //   S0D0F0 | 0X0 | 001 | 011 | 1X0 |
+  // S1D1F1
+  //  0X0     | 0X0 | 001 | 011 | 1X0 |
+  //  001     | 001 | 001 | 011 | 1X0 |
+  //  011     | 011 | 011 | 011 | 1X0 |
+  //  1X0     | 1X0 | 1X0 | 1X0 | 1X0 |
+
+  return make_mixed_bits(C<S0 | S1>{},  // A static 1 on either side forces the result bit to 1
+                         ((~S1 & F0) & m0.dynamic_int_) | ((~S0 & F1) & m1.dynamic_int_),
+                         C<(~S0 & F1) | (~S1 & F0)>{});  // Dynamic only where the other side is not a static 1
+}
+
+template <uint32_t S0, uint32_t F0, auto S1>
+CUTE_HOST_DEVICE constexpr
+auto
+operator|(MixedBits<S0,F0> const& m, C<S1>)
+{
+  return make_mixed_bits(C<S0 |  uint32_t(S1)>{},   // constant's 1 bits join the static part
+                         m.dynamic_int_,
+                         C<F0 & ~uint32_t(S1)>{});  // dynamic flags are cleared wherever the constant has a 1
+}
+
+template <uint32_t S0, uint32_t F0, auto S1>
+CUTE_HOST_DEVICE constexpr
+auto
+operator|(C<S1> s, MixedBits<S0,F0> const& m)
+{
+  return m | s;  // OR commutes: forward to the (MixedBits, C) overload
+}
+
+// Bitwise XOR of two MixedBits: any bit that is dynamic in either operand is dynamic in the result
+template <uint32_t S0, uint32_t F0,
+          uint32_t S1, uint32_t F1>
+CUTE_HOST_DEVICE constexpr
+auto
+operator^(MixedBits<S0,F0> const& m0, MixedBits<S1,F1> const& m1)
+{
+  // Truth table for (S0,D0,F0) ^ (S1,D1,F1) -> (S,D,F)
+  //   S0D0F0 | 0X0 | 001 | 011 | 1X0 |
+  // S1D1F1
+  //  0X0     | 0X0 | 001 | 011 | 1X0 |
+  //  001     | 001 | 001 | 011 | 011 |
+  //  011     | 011 | 011 | 001 | 001 |
+  //  1X0     | 1X0 | 011 | 001 | 0X0 |
+
+  return make_mixed_bits(C<(~S0 & S1 & ~F0) | (S0 & ~S1 & ~F1)>{},   // static: statically set in exactly one operand and not dynamic in the other
+                         (S0 | m0.dynamic_int_) ^ (S1 | m1.dynamic_int_),
+                         C<F0 | F1>{});                              // flags: union of both operands' dynamic flags
+}
+
+template <uint32_t S0, uint32_t F0, auto S1>
+CUTE_HOST_DEVICE constexpr
+auto
+operator^(MixedBits<S0,F0> const& m, C<S1>)
+{
+  return make_mixed_bits(C<(~S0 & uint32_t(S1) & ~F0) | (S0 & ~uint32_t(S1))>{},   // static: set in exactly one side, excluding m's dynamic bits
+                         (S0 | m.dynamic_int_) ^ uint32_t(S1),
+                         C<F0>{});                                                 // the constant contributes no dynamic flags
+}
+
+template <uint32_t S0, uint32_t F0, auto S1>
+CUTE_HOST_DEVICE constexpr
+auto
+operator^(C<S1> s, MixedBits<S0,F0> const& m)
+{
+  return m ^ s;  // XOR commutes: forward to the (MixedBits, C) overload
+}
+
+template <uint32_t S0, uint32_t F0, auto S1>
+CUTE_HOST_DEVICE constexpr
+auto
+operator<<(MixedBits<S0,F0> const& m, C<S1>)
+{
+  return make_mixed_bits(C<(S0 << S1)>{},       // left-shift by the static amount S1 is applied uniformly to:
+                         m.dynamic_int_ << S1,  //   the static bits, the dynamic value,
+                         C<(F0 << S1)>{});      //   and the dynamic-flag mask
+}
+
+template <uint32_t S0, uint32_t F0, auto S1>
+CUTE_HOST_DEVICE constexpr
+auto
+operator>>(MixedBits<S0,F0> const& m, C<S1>)
+{
+  return make_mixed_bits(C<(S0 >> S1)>{},       // right-shift by the static amount S1 is applied uniformly to:
+                         m.dynamic_int_ >> S1,  //   the static bits, the dynamic value,
+                         C<(F0 >> S1)>{});      //   and the dynamic-flag mask
+}
+
+template <uint32_t S0, uint32_t F0, auto S1>
+CUTE_HOST_DEVICE constexpr
+auto
+shiftl(MixedBits<S0,F0> const& m, C<S1> s)
+{
+  if constexpr (S1 >= 0) {  // signed shift: non-negative amounts shift left
+    return m << s;
+  } else {                  // negative amounts shift right by the magnitude
+    return m >> -s;
+  }
+}
+
+template <uint32_t S0, uint32_t F0, auto S1>
+CUTE_HOST_DEVICE constexpr
+auto
+shiftr(MixedBits<S0,F0> const& m, C<S1> s)
+{
+  if constexpr (S1 >= 0) {  // signed shift: non-negative amounts shift right
+    return m >> s;
+  } else {                  // negative amounts shift left by the magnitude
+    return m << -s;
+  }
+}
+
+//
+// Upcast and Downcast
+//
+
+template <uint32_t S0, uint32_t F0, auto S1>
+CUTE_HOST_DEVICE constexpr
+auto
+safe_div(MixedBits<S0,F0> const& m, C<S1> s)
+{
+  static_assert(has_single_bit(uint32_t(S1)), "Only divide MixedBits by powers of two.");
+  return make_mixed_bits(safe_div(C<S0>{}, s),         // each component is divided separately through safe_div:
+                         safe_div(m.dynamic_int_, s),  //   static bits, dynamic value,
+                         safe_div(C<F0>{}, s));        //   and dynamic-flag mask
+}
+
+template <uint32_t N, uint32_t S0, uint32_t F0>
+CUTE_HOST_DEVICE constexpr
+auto
+upcast(MixedBits<S0,F0> const& m)
+{
+  static_assert(has_single_bit(N), "Only divide MixedBits by powers of two.");
+  return safe_div(m, C<N>{});  // upcast by N == safe division of the MixedBits by the static constant N
+}
+
+template <uint32_t N, class T, __CUTE_REQUIRES(cute::is_integral<T>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+upcast(T const& m)
+{
+  return safe_div(m, C<N>{});  // integral overload (constrained on cute::is_integral<T>): safe division by the static constant N
+}
+
+template <uint32_t N, uint32_t S0, uint32_t F0>
+CUTE_HOST_DEVICE constexpr
+auto
+downcast(MixedBits<S0,F0> const& m)
+{
+  static_assert(has_single_bit(N), "Only scale MixedBits by powers of two.");
+  return make_mixed_bits(C<S0 * N>{},          // downcast by N scales every component by N:
+                         m.dynamic_int_ * N,   //   static bits, dynamic value,
+                         C<F0 * N>{});         //   and dynamic-flag mask
+}
+
+template <uint32_t N, class T, __CUTE_REQUIRES(cute::is_integral<T>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+downcast(T const& m)
+{
+  return m * C<N>{};  // integral overload (constrained on cute::is_integral<T>): multiply by the static constant N
+}
+
+template <uint32_t S0, uint32_t F0>
+CUTE_HOST_DEVICE constexpr
+auto
+max_alignment(MixedBits<S0,F0> const&)
+{
+  return C<uint32_t(1) << countr_zero(S0 | F0)>{};  // value of the lowest bit that is either statically set or flagged dynamic
+}
+
+template <auto v>
+CUTE_HOST_DEVICE constexpr
+C<v>
+max_alignment(C<v> const& c)
+{
+  return c;  // a static constant is returned unchanged as its own alignment
+}
+
+//
+// Convert a Pow2Layout+Coord to a MixedBits
+//
+
+template <class Shape, class Stride, class Coord>
+CUTE_HOST_DEVICE constexpr
+auto
+to_mixed_bits(Shape const& shape, Stride const& stride, Coord const& coord)
+{
+  if constexpr (is_tuple<Shape>::value && is_tuple<Stride>::value && is_tuple<Coord>::value) {  // tuple case: recurse per mode, then XOR-fold the per-mode results
+    static_assert(tuple_size<Shape>::value == tuple_size<Stride>::value, "Mismatched ranks");
+    static_assert(tuple_size<Shape>::value == tuple_size<Coord >::value, "Mismatched ranks");
+    return transform_apply(shape, stride, coord, [](auto const& s, auto const& d, auto const& c) { return to_mixed_bits(s,d,c); },
+                                                 [](auto const&... a) { return (a ^ ...); });
+  } else if constexpr (is_integral<Shape>::value && is_integral<Stride>::value && is_integral<Coord>::value) {  // leaf case: one (shape, stride, coord) triple
+    static_assert(decltype(shape*stride)::value == 0 || has_single_bit(decltype(shape*stride)::value), "Requires pow2 shape*stride.");
+    return make_mixed_bits(Int<0>{}, coord * stride, (shape - Int<1>{}) * stride);  // static = 0, dynamic = coord*stride, flags = (shape-1)*stride
+  } else {
+    static_assert(is_integral<Shape>::value && is_integral<Stride>::value && is_integral<Coord>::value, "Either Shape, Stride, and Coord must be all tuples, or they must be all integral (in the sense of cute::is_integral).");
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <class Layout, class Coord>
+CUTE_HOST_DEVICE constexpr
+auto
+to_mixed_bits(Layout const& layout, Coord const& coord)
+{
+  return to_mixed_bits(layout.shape(), layout.stride(), idx2crd(coord, layout.shape()));  // map coord through idx2crd against the layout's shape, then use the (shape, stride, coord) overload
+}
+
+//
+// Display utilities
+//
+
+template <int B, int M, int S>
+CUTE_HOST_DEVICE void print(Swizzle<B,M,S> const&)
+{
+  printf("Sw<%d,%d,%d>", B, M, S);  // prints only the template parameters; the argument itself is not read
+}
+
+template <uint32_t S, uint32_t F>
+CUTE_HOST_DEVICE void print(MixedBits<S,F> const& m)
+{
+  printf("M_%u|(%u&%u)=%u", S, m.dynamic_int_, F, uint32_t(m));  // static bits | (dynamic value & flag mask) = value after conversion to uint32_t
+}
+
+#if !defined(__CUDACC_RTC__)
+template <int B, int M, int S>
+CUTE_HOST std::ostream& operator<<(std::ostream& os, Swizzle<B,M,S> const&)
+{
+  return os << "Sw<" << B << "," << M << "," << S << ">";  // same text as print(Swizzle), written to a host stream
+}
+
+template <uint32_t S, uint32_t F>  // both parameters are deduced from the MixedBits argument
+CUTE_HOST std::ostream& operator<<(std::ostream& os, MixedBits<S,F> const& m)
+{
+  return os << "M_" << S << "|(" << m.dynamic_int_ << "&" << F << ")=" << uint32_t(m);  // same text as print(MixedBits), written to a host stream
+}
+#endif // !defined(__CUDACC_RTC__)
+
+//
+// Helper Function
+//
+template <class T, class = void>                      // Primary template: any T without a specialization maps to the default (no-swizzle) type
+struct get_swizzle { using type = Swizzle<0,4,3>; };   // B == 0; the second template parameter presumably exists for SFINAE-style specializations
+
+template <class T>                                     // Convenience alias for the nested ::type
+using get_swizzle_t = typename get_swizzle<T>::type;
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/swizzle_layout.hpp b/lightllm-kernel/cutlass/include/cute/swizzle_layout.hpp
new file mode 100755
index 000000000..1324360eb
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/swizzle_layout.hpp
@@ -0,0 +1,584 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>           // CUTE_HOST_DEVICE
+#include <cute/layout.hpp>           // cute::Layout
+#include <cute/layout_composed.hpp>  // cute::ComposedLayout
+#include <cute/swizzle.hpp>          // cute::Swizzle, cute::get_swizzle primary template
+
+/* Specialized functionality for a ComposedLayout of the form
+ *   InvolutionFn o Offset o LayoutB
+ * where the InvolutionFn is a Swizzle<B,M,S> and is not linear (hence the need for the Offset).
+ *
+ * Because these are specializations for core functions of ComposedLayout, these Swizzle Layouts
+ * provide similar functionality to Layout including tiling, partitioning,
+ * coordinate-to-index mapping and layout manipulations, but are not considered "normal" layouts.
+ * For example, these provide shape() and size() functions, but do not provide stride() functions.
+ *
+ * Furthermore, each of these specializations uses Swizzle<>-specific knowledge in its implementation and
+ * attempts to decay itself to a normal-layout with dynamic or static strides when certain slicing conditions
+ * are met. This is possible by determining the subdomain of the Swizzle<> function that is identity and
+ * testing if LayoutB's codomain is contained within it. In general, MixedBits is used as the Offset to track
+ * statically-vs-dynamically known bits in the Offset to improve the decay to static or dynamic normal layouts.
+ */
+
+namespace cute
+{
+
+//
+// Helper Function
+//
+template <int B, int M, int S, class Offset, class LayoutB>  // Specialization: a swizzled ComposedLayout reports its own Swizzle<B,M,S>
+struct get_swizzle<ComposedLayout<Swizzle<B,M,S>,Offset,LayoutB>> { using type = Swizzle<B,M,S>; };
+
+//
+// Constructors
+//
+
+template <int B, int M, int S>
+CUTE_HOST_DEVICE constexpr
+auto
+make_layout(Swizzle<B,M,S> const& sxor)
+{
+  return composition(sxor, Layout<Int<M+B+abs(S)>,Int<1>>{});  // swizzle composed with a 1-D unit-stride layout of extent M+B+abs(S); NOTE(review): extent is a bit count, not 1 << (M+B+abs(S)) -- confirm intended
+}
+
+namespace detail {
+
+template <int B, int M, int S, class OldShape, class OldStride, class NewShape, class NewStride>  // B, M, S appear in no function parameter, so callers must supply them explicitly
+CUTE_HOST_DEVICE constexpr
+auto
+transfer_swizzle(Layout<OldShape,OldStride> const& old_layout,
+                 Layout<NewShape,NewStride> const& new_layout)
+{
+  // Goal: given Swizzle<B,M,S> over old_layout, determine a new swizzle for the strides in new_layout for consistent vectorizations
+
+  // This is accomplished by identifying
+  //  S o L  :=:  S? o L*
+  // We identify the "active" portion of S by computing (P o L)(c*) where P is a projection generated by S
+  // Then that active identifier is transformed through the layouts:
+  //  L*(L[(P o L)(c*)])
+  // which is a new swizzle identifier for S?, the new swizzle
+
+  // Projections of the swizzle layout for composition, P
+  auto swizzle_only_zy = make_layout(make_shape (Int<(1 << M)>{}, Int<(1 << B)>{}, Int<(1 << (abs(S)-B))>{}, Int<(1 <<  B        )>{}, Int<1>{}),
+                                     make_stride(       Int<0>{}, Int<(1 << M)>{},                 Int<0>{}, Int<(1 << (M+abs(S)))>{}, Int<0>{}));
+
+  // Compose with the tile to get the swizzle projection, P o L  [The Z and Y contributing portions of L]
+  auto layout_only_zy       = composition(swizzle_only_zy, old_layout);
+  // Transform the end coordinate to get the active bits of the swizzle, (P o L)(c*)
+  auto swizzle_active_bits  = layout_only_zy(size(layout_only_zy)-Int<1>{});
+
+  // Get the Z bit and the Y bits -- keep only those that are active in Z *and* Y
+  auto zzz_msk = typename Swizzle<B,M,S>::zzz_msk{};
+  auto yyy_msk = typename Swizzle<B,M,S>::yyy_msk{};
+  auto msk_sft = typename Swizzle<B,M,S>::msk_sft{};
+  auto active_Z = swizzle_active_bits & shiftr(swizzle_active_bits,  msk_sft) & zzz_msk;  // active bits ANDed with themselves shifted by +msk_sft, limited to the Z mask
+  auto active_Y = swizzle_active_bits & shiftr(swizzle_active_bits, -msk_sft) & yyy_msk;  // same with the opposite shift direction, limited to the Y mask
+
+  // Pass the identifiers through the old layout and new layout to make a new swizzle identifier, L*(L[(P o L)(c*)])
+  auto new_active_Z = new_layout(old_layout.get_1d_coord(active_Z));
+  auto new_active_Y = new_layout(old_layout.get_1d_coord(active_Y));
+
+  // Use this new swizzle identifier to construct the new swizzle for new_layout
+  //   (this also makes sure it's a "valid" swizzle that Swizzle can represent)
+  return composition(make_swizzle<new_active_Y,new_active_Z>(), new_layout);
+}
+
+} // end namespace detail
+
+template <int B, int M, int S, class Offset, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+make_fragment_like(ComposedLayout<Swizzle<B,M,S>,Offset,Layout> const& layout)
+{
+  return make_fragment_like(layout.layout_b());  // only layout_b() is forwarded; the swizzle and offset are not used
+}
+
+//
+// Utilities
+//
+
+namespace detail {
+
+// Get just the Swizzle part of a composed layout (a default-constructed Swizzle<B,M,S>; the argument is unnamed and unused).
+template <int B, int M, int S, class Offset, class LayoutB>
+CUTE_HOST_DEVICE constexpr
+auto
+get_swizzle_portion(ComposedLayout<Swizzle<B,M,S>,Offset,LayoutB>)
+{
+  return Swizzle<B,M,S>{};
+}
+
+// A non-swizzled layout's "Swizzle part" is the identity swizzle (Swizzle<0,4,3>, the same type as the get_swizzle default).
+template <class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+get_swizzle_portion(Layout<Shape,Stride>)
+{
+  return Swizzle<0,4,3>{};
+}
+
+// Get the "non-swizzle" part of a composed layout,
+// which is the underlying (non-composed) Layout, i.e. layout_b(); the offset is not included.
+template <int B, int M, int S, class Offset, class LayoutB>
+CUTE_HOST_DEVICE constexpr
+auto
+get_nonswizzle_portion(ComposedLayout<Swizzle<B,M,S>,Offset,LayoutB> const& slayout)
+{
+  return slayout.layout_b();
+}
+
+// The non-swizzle part of a non-swizzled layout is just the Layout itself, returned unchanged.
+template <class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+get_nonswizzle_portion(Layout<Shape,Stride> const& slayout)
+{
+  return slayout;
+}
+
+} // namespace detail
+
+//
+// Slice a Swizzled ComposedLayout
+//
+
+namespace detail {
+
+template <class IntZ, class IntY, class Offset, int... I>
+CUTE_HOST_DEVICE constexpr
+auto
+make_swizzle_strides(true_type,                    // tag overload: chosen when the caller's comparison tag (Z < Y or Z > Y in slice_and_offset) is true
+                     IntZ   const& Z,
+                     IntY   const& Y,
+                     Offset const& offset,
+                     int_sequence<I...>)
+{
+  // Below is an optimized/compressed version of:
+  //return cute::make_tuple((swizzle(offset + Z*Int<(1 << I)>{}) - swizzle(offset))...);
+  // with knowledge of Swizzle, I... ranges for each B bits,
+  //    and the layout won't slice along z-bits that are already set
+
+  // y\z  0   1
+  //   0  Z  DC
+  //   1 -Z  DC
+
+  return cute::make_tuple(conditional_return((offset & (Y << Int<I>{})) == Int<0>{}, Z * Int<(1 << I)>{}, -Z * Int<(1 << I)>{})...);  // one stride per I: +Z<<I when the matching y-bit of offset is clear, else -Z<<I
+}
+
+template <class IntZ, class IntY, class Offset, int... I>
+CUTE_HOST_DEVICE constexpr
+auto
+make_swizzle_strides(false_type,                   // tag overload: chosen when the caller's comparison tag (Z < Y or Z > Y in slice_and_offset) is false
+                     IntZ   const& Z,
+                     IntY   const& Y,
+                     Offset const& offset,
+                     int_sequence<I...>)
+{
+  // Below is an optimized/compressed version of:
+  //return cute::make_tuple((swizzle(offset + Y*Int<(1 << I)>{}) - swizzle(offset))...);
+  // with knowledge of Swizzle, I... ranges for each B bits,
+  //    and the layout won't slice along y-bits that are already set
+
+  // y\z  0   1
+  //   0 Y+Z Y-Z
+  //   1 DC  DC
+
+  return cute::make_tuple(conditional_return((offset & (Z << Int<I>{})) == Int<0>{}, (Y+Z) * Int<(1 << I)>{}, (Y-Z) * Int<(1 << I)>{})...);  // one stride per I: (Y+Z)<<I when the matching z-bit of offset is clear, else (Y-Z)<<I
+}
+
+} // end namespace detail
+
+template <class Coord, int B, int M, int S, class Offset, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+slice_and_offset(Coord const& coord, ComposedLayout<Swizzle<B,M,S>,Offset,Layout> const& layout)  // returns a (sliced layout, offset) tuple in every branch
+{
+  if constexpr (all_underscore<Coord>::value) {
+    // Skip the expensive/complicated attempt to decay to a normal layout and just reshape
+    return cute::make_tuple(composition(layout.layout_a(), layout.offset(), slice(coord, layout.layout_b())), Int<0>{});
+  } else {
+
+    // Projections of the swizzle layout for composition
+    auto sw = make_layout(make_shape(Int<(1 << M)>{}, Int<(1 << B)>{}, Int<(1 << (abs(S)-B))>{}, Int<(1 << B)>{}, Int<1>{}));
+
+    auto swizzle_anti_zy = make_layout(shape(sw),                                                                            // keeps modes 0, 2 and 4 of sw; modes 1 and 3 get stride 0
+                                       make_stride(stride<0>(sw),      Int<0>{}, stride<2>(sw),      Int<0>{}, size(sw)));
+    auto swizzle_only_zy = make_layout(shape(sw),                                                                            // keeps modes 1 and 3 of sw; every other mode gets stride 0
+                                       make_stride(     Int<0>{}, stride<1>(sw),      Int<0>{}, stride<3>(sw), Int<0>{}));
+
+    // The portion of the layout that is not yet consumed
+    auto sliced_layout = slice(coord, layout.layout_b());
+
+    // The portion of the layout that we are consuming now
+    auto diced_layout = dice(coord, layout.layout_b());
+    auto diced_coord  = dice(coord, coord);
+
+    auto diced_layout_anti_zy = composition(swizzle_anti_zy, diced_layout);
+    auto diced_layout_only_zy = composition(swizzle_only_zy, diced_layout);
+
+    // New swizzle and offset
+    auto swizzle = layout.layout_a();
+    // offset_only_zy interacts with swizzle and gets accumulated with layout.offset()
+    //   being careful about the static/dynamic contributions from diced_layout and diced_coord
+    auto offset_only_zy = layout.offset() ^ to_mixed_bits(diced_layout_only_zy, diced_coord);
+    // offset_anti_zy always gets passed through, no interaction with swizzle
+    auto offset_anti_zy = diced_layout_anti_zy(diced_coord);
+
+    // If Layout's codomain hits on         Y AND Z, then it's not reducible
+    // If Layout's codomain hits on         Y XOR Z, then it's dynamic-normal
+    // If Layout's codomain hits on neither Y NOR Z, then it's static-normal
+
+    // If the sliced_layout hits two bits that are swizzled together, then don't attempt to decay
+
+    // Compose with the layout to get the swizzle projection, P o L  [The Z and Y contributing portions of L]
+    //   (this also tests that shape/stride of layout compose with swizzle)
+    auto sliced_layout_only_zy = composition(swizzle_only_zy, sliced_layout);
+    // Transform the end coordinate to get the active bits of the swizzle, (P o L)(c*)
+    [[maybe_unused]] auto swizzle_active_bits = sliced_layout_only_zy(size(sliced_layout_only_zy)-Int<1>{});
+
+    // Determine if any active bits collide under the swizzle for potential decay
+    if constexpr (is_constant<0, decltype(not (swizzle_active_bits & ~swizzle(swizzle_active_bits)))>::value)
+    { // Hits on Y AND Z, so it's not reducible
+      return cute::make_tuple(composition(swizzle, offset_only_zy, sliced_layout), offset_anti_zy);
+    } else
+    { // Misses on Y or Z, so it's static-normal or dynamic-normal
+
+      // Lowest bit of the Z and Y masks
+      auto Z = typename Swizzle<B,M,S>::zzz_msk{} & -typename Swizzle<B,M,S>::zzz_msk{};
+      auto Y = typename Swizzle<B,M,S>::yyy_msk{} & -typename Swizzle<B,M,S>::yyy_msk{};
+      auto stride_lo = detail::make_swizzle_strides(Z < Y, Z, Y, offset_only_zy, make_int_sequence<B>{});  // the two tags are opposite comparisons,
+      auto stride_hi = detail::make_swizzle_strides(Z > Y, Z, Y, offset_only_zy, make_int_sequence<B>{});  //   so lo and hi use different overloads unless Z == Y
+
+      // Construct a (dynamic) layout that we can perform the composition with
+      auto swizzle_layout = make_layout(make_shape (Int<(1 << M)>{}, repeat<B>(Int<2>{}), Int<(1 << (abs(S)-B))>{}, repeat<B>(Int<2>{}), Int<                  1>{}),
+                                        make_stride(Int<       1>{},           stride_lo, Int<(1 <<      (M+B))>{},          stride_hi , Int<(1 << (M+B+abs(S)))>{}));
+
+      // Decay to a normal layout with offset
+      return cute::make_tuple(composition(swizzle_layout, sliced_layout),
+                              swizzle(offset_only_zy) + offset_anti_zy);
+    }
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+//
+// composition
+//
+
+// Ignore identity case: a Swizzle with B == 0 and a static zero offset leaves the layout unchanged
+template <int M, int S,
+          class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+composition(Swizzle<0,M,S> const&,
+            Int<0> const&,
+            Layout<Shape,Stride> const& layout)
+{
+  return layout;
+}
+
+template <int B, int M, int S,
+          class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+composition(Swizzle<B,M,S> const& sxor,
+            Layout<Shape,Stride> const& layout)
+{
+  return composition(sxor, Int<0>{}, layout);  // two-argument form: supply a static zero offset and use the three-argument composition
+}
+
+template <class ShapeA, class StrideA,
+          int B, int M, int S>
+CUTE_HOST_DEVICE constexpr
+auto
+composition(Layout<ShapeA,StrideA> const& a,
+            Swizzle<B,M,S>         const& b)
+{
+  // Get the Z bits and the Y bits
+  auto active_Y = a(typename Swizzle<B,M,S>::yyy_msk{});  // image of the swizzle's Y mask under layout a
+  auto active_Z = a(typename Swizzle<B,M,S>::zzz_msk{});  // image of the swizzle's Z mask under layout a
+
+  // Works in simple cases... but could be greatly generalized
+
+  return composition(make_swizzle<active_Y,active_Z>(), a);  // the swizzle is rebuilt from the mapped masks and applied on the left of a
+}
+
+//
+// inverse
+//
+
+// Specialization to attempt to pass-through the Swizzle back to the left -- Needed?
+template <int B, int M, int S, class Offset, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+right_inverse(ComposedLayout<Swizzle<B,M,S>,Offset,Layout> const& layout)
+{
+  if constexpr (is_constant<0, Offset>::value) {  // static zero offset: invert layout_b and reuse the swizzle as-is
+    return composition(right_inverse(layout.layout_b()), layout.layout_a());
+  } else {                                        // otherwise each of the three components is inverted individually
+    return composition(right_inverse(layout.layout_b()), right_inverse(layout.offset()), right_inverse(layout.layout_a()));
+  }
+}
+
+// Specialization to attempt to pass-through the Swizzle back to the left -- Needed?
+template <int B, int M, int S, class Offset, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+left_inverse(ComposedLayout<Swizzle<B,M,S>,Offset,Layout> const& layout)
+{
+  if constexpr (is_constant<0, Offset>::value) {  // static zero offset: invert layout_b and reuse the swizzle as-is
+    return composition(left_inverse(layout.layout_b()), layout.layout_a());
+  } else {                                        // otherwise each of the three components is inverted individually
+    return composition(left_inverse(layout.layout_b()), left_inverse(layout.offset()), left_inverse(layout.layout_a()));
+  }
+}
+
+template <int B, int M, int S>
+CUTE_HOST_DEVICE constexpr
+Swizzle<B,M,S>
+right_inverse(Swizzle<B,M,S> const& sw)
+{
+  return sw;  // the swizzle is returned as its own right inverse (the file header calls it an involution)
+}
+
+template <int B, int M, int S>
+CUTE_HOST_DEVICE constexpr
+Swizzle<B,M,S>
+left_inverse(Swizzle<B,M,S> const& sw)
+{
+  return sw;  // the swizzle is returned as its own left inverse (the file header calls it an involution)
+}
+
+// Kludge -- Probably want an OffsetFn<T> here instead. Treats an integral offset as "add t", whose inverse is "add -t".
+template <class T, __CUTE_REQUIRES(is_integral<T>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+right_inverse(T const& t)
+{
+  return -t;
+}
+
+// Kludge -- Probably want an OffsetFn<T> here instead. Treats an integral offset as "add t", whose inverse is "add -t".
+template <class T, __CUTE_REQUIRES(is_integral<T>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+left_inverse(T const& t)
+{
+  return -t;
+}
+
+//
+// Upcast and Downcast
+//
+
+template <int N, int B, int M, int S>
+CUTE_HOST_DEVICE constexpr
+auto
+upcast(Swizzle<B,M,S> const& swizzle)
+{
+  static_assert(has_single_bit(N), "N must be a power of two");
+  constexpr int log2_n = bit_width(uint32_t(N)) - 1;  // exact log2, since N is a power of two
+  constexpr int NewM   = M - log2_n;                  // upcasting by N lowers the base M by log2(N)
+  if constexpr (NewM >= 0) {
+    return Swizzle<B,NewM,S>{};
+  } else {
+    return Swizzle<cute::max(B+NewM,0), 0, S>{};      // base clamps at 0; the shortfall is taken out of B (never below 0)
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <int N, int B, int M, int S>
+CUTE_HOST_DEVICE constexpr
+auto
+downcast(Swizzle<B,M,S> const& swizzle)
+{
+  static_assert(has_single_bit(N), "N must be a power of two");
+  constexpr int log2_n = bit_width(uint32_t(N)) - 1;  // exact log2, since N is a power of two
+  return Swizzle<B,(M + log2_n),S>{};                 // downcasting by N raises the base M by log2(N); B and S are unchanged
+}
+
+template <class OldType, class NewType,
+          int B, int M, int S>
+CUTE_HOST_DEVICE constexpr
+auto
+recast_layout(Swizzle<B,M,S> const& swizzle)
+{
+  using scale = decltype(trait_ratio(sizeof_bits<NewType>{}, sizeof_bits<OldType>{}));  // ratio of NewType's bit width to OldType's
+  if constexpr (scale::num == 1 && scale::den == 1) {  // same width: nothing to do
+    return swizzle;
+  }
+  else if constexpr (scale::num == 1) {                // numerator 1: downcast by the denominator
+    return downcast<scale::den>(swizzle);
+  }
+  else if constexpr (scale::den == 1) {                // denominator 1: upcast by the numerator
+    return upcast<scale::num>(swizzle);
+  }
+  else {                                               // any other ratio is rejected at compile time
+    static_assert(dependent_false<scale>, "Recast not supported.");
+  }
+  CUTE_GCC_UNREACHABLE;
+}
+
+template <int B, int M, int S>
+CUTE_HOST_DEVICE constexpr
+auto
+max_alignment(Swizzle<B,M,S> const&)
+{
+  return Int<1 << M>{};  // depends only on the base M; B and S do not enter the result
+}
+
+template <int B, int M, int S, class Offset, class LayoutB>
+CUTE_HOST_DEVICE constexpr
+auto
+max_alignment(ComposedLayout<Swizzle<B,M,S>,Offset,LayoutB> const& layout)
+{
+  return gcd(max_alignment(layout.layout_a()),   // gcd of the alignments of all three components:
+             max_alignment(layout.offset()),     //   swizzle, offset,
+             max_alignment(layout.layout_b()));  //   and the underlying layout
+}
+
+//
+// Other operations
+//
+
+template <int B, int M, int S, class Offset, class LayoutB, class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+max_common_layout(ComposedLayout<Swizzle<B,M,S>,Offset,LayoutB> const& a,
+                  Layout<Shape,Stride>                          const& b)
+{
+  auto common = max_common_layout(a.layout_b(), b);  // computed on the underlying layout only; a's swizzle and offset are not consulted here
+  auto base = Int<(1 << M)>{};                       // 1 << M, the same value max_alignment(Swizzle<B,M,S>) reports
+  if constexpr (base < size(common)) {
+    return common.compose(base);       // Truncate common to size base
+  } else {
+    return common;
+  }
+}
+
+template <class Shape, class Stride, int B, int M, int S, class Offset, class LayoutB>
+CUTE_HOST_DEVICE constexpr
+auto
+max_common_layout(Layout<Shape,Stride>                          const& a,
+                  ComposedLayout<Swizzle<B,M,S>,Offset,LayoutB> const& b)
+{
+  return max_common_layout(b, a);  // swap the operands and reuse the (ComposedLayout, Layout) overload
+}
+
+template <int B, int M, int S, class Offset, class LayoutB, class Shape, class Stride>
+CUTE_HOST_DEVICE constexpr
+auto
+max_common_vector(ComposedLayout<Swizzle<B,M,S>,Offset,LayoutB> const& a,
+                  Layout<Shape,Stride>                          const& b)
+{
+  // This assumes that Offset is in the YZ domain of the Swizzle...
+  return cute::min(max_common_vector(a.layout_b(), b), Int<(1 << M)>{});  // result of the underlying layouts, capped at 1 << M
+}
+
+template <class Shape, class Stride, int B, int M, int S, class Offset, class LayoutB>
+CUTE_HOST_DEVICE constexpr
+auto
+max_common_vector(Layout<Shape,Stride>                          const& a,
+                  ComposedLayout<Swizzle<B,M,S>,Offset,LayoutB> const& b)
+{
+  return max_common_vector(b, a);  // swap the operands and reuse the (ComposedLayout, Layout) overload
+}
+
+template <int B0, int M0, int S0, class Offset0, class LayoutB0,
+          int B1, int M1, int S1, class Offset1, class LayoutB1>
+CUTE_HOST_DEVICE constexpr
+auto
+max_common_vector(ComposedLayout<Swizzle<B0,M0,S0>,Offset0,LayoutB0> const& a,
+                  ComposedLayout<Swizzle<B1,M1,S1>,Offset1,LayoutB1> const& b)
+{
+  // Typical impl is composition(a, right_inverse(b))
+  // so this is  Sw0 o B0 o rinv(Sw1 o B1) = Sw0 o B0 o rinv(B1) o Sw1
+  auto vec = max_common_vector(a.layout_b(), b.layout_b());  // start from the underlying layouts, ignoring both swizzles
+
+  // This assumes that Offset is in the YZ domain of the Swizzle...
+  if constexpr (Swizzle<B0,M0,S0>{} == Swizzle<B1,M1,S1>{}) {  // identical swizzles: no extra cap is applied
+    return vec;
+  } else {                                                     // different swizzles: cap at the smaller of the two 1 << M values
+    return cute::min(vec, Int<(1 << M0)>{}, Int<(1 << M1)>{});
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// ComposedLayout as second argument is often more difficult...
+
+template <class Shape, class Stride,
+          int B, int M, int S, class Offset, class LayoutT>
+CUTE_HOST_DEVICE constexpr
+auto
+logical_product(Layout<Shape,Stride>                          const& layout,
+                ComposedLayout<Swizzle<B,M,S>,Offset,LayoutT> const& tiler)
+{
+  CUTE_STATIC_ASSERT_V(tiler.offset() == Int<0>{}, "Require Swizzle offset == 0.");  // only tilers with a static zero offset are accepted
+  // The new layout -- if swizzle wasn't an issue, this is the result
+  //   our goal is to determine a new swizzle for these strides
+  auto new_layout = logical_product(layout, tiler.layout_b());
+
+  // This is accomplished by identifying
+  //  S o L  :=:  S? o L*
+  // We identify the "active" portion of S by computing (P o L)(c*) where P is a projection generated by S
+  // Then that active identifier is transformed through the layouts:
+  //  L*(L[(P o L)(c*)])
+  // which is a new swizzle identifier for S?, the new swizzle
+
+  // Projections of the swizzle layout for composition, P
+  auto swizzle_only_zy = make_layout(make_shape (Int<(1 << M)>{}, Int<(1 << B)>{}, Int<(1 << (abs(S)-B))>{}, Int<(1 <<  B        )>{}, Int<1>{}),
+                                     make_stride(       Int<0>{}, Int<(1 << M)>{},                 Int<0>{}, Int<(1 << (M+abs(S)))>{}, Int<0>{}));
+
+  // Compose with the tiler to get the swizzle projection, P o L  [The Z and Y contributing portions of L]
+  auto layout_only_zy       = composition(swizzle_only_zy, tiler.layout_b());
+  // Transform the end coordinate to get the active bits of the swizzle, (P o L)(c*)
+  auto swizzle_active_bits  = layout_only_zy(size(layout_only_zy)-Int<1>{});
+  // Get the Z bit and the Y bits
+  auto active_Z = swizzle_active_bits & typename Swizzle<B,M,S>::zzz_msk{};
+  auto active_Y = swizzle_active_bits & typename Swizzle<B,M,S>::yyy_msk{};
+
+  // Pass the identifiers through the old layout and new layout to make a new swizzle identifier, L*(L[(P o L)(c*)])
+  auto new_active_Z = new_layout(Int<0>{}, tiler.layout_b()[active_Z]);
+  auto new_active_Y = new_layout(Int<0>{}, tiler.layout_b()[active_Y]);
+
+  // Use this new swizzle identifier to construct the new swizzle for new_layout
+  //   (this also makes sure it's a "valid" swizzle that Swizzle can represent)
+  return composition(make_swizzle<new_active_Y,new_active_Z>(), new_layout);
+}
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/tensor.hpp b/lightllm-kernel/cutlass/include/cute/tensor.hpp
new file mode 100755
index 000000000..3f3335b63
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/tensor.hpp
@@ -0,0 +1,58 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/tensor_impl.hpp>
+
+//
+// Extended Engines
+//
+
+#include <cute/pointer_swizzle.hpp>
+#include <cute/pointer_sparse.hpp>
+#include <cute/pointer_flagged.hpp>
+#include <cute/tensor_zip.hpp>
+
+//
+// Tensor Algorithms
+//
+
+#include <cute/algorithm/tensor_algorithms.hpp>
+#include <cute/algorithm/fill.hpp>
+#include <cute/algorithm/clear.hpp>
+#include <cute/algorithm/copy.hpp>
+#include <cute/algorithm/prefetch.hpp>
+#include <cute/algorithm/axpby.hpp>
+#include <cute/algorithm/gemm.hpp>
+
+#include <cute/algorithm/cooperative_copy.hpp>
+#include <cute/algorithm/cooperative_gemm.hpp>
+
diff --git a/lightllm-kernel/cutlass/include/cute/tensor_impl.hpp b/lightllm-kernel/cutlass/include/cute/tensor_impl.hpp
new file mode 100755
index 000000000..61eefc506
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/tensor_impl.hpp
@@ -0,0 +1,1193 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief This file contains the definition of Tensor as well as classes/functions most closely associated with it.
+
+    For backwards-compatibility, "tensor.hpp" is the "entrypoint" header for a collection of classes and utilities
+    that are adjacent to Tensor, e.g. fill(). Whereas this file contains the actual definition of Tensor and
+    a small set of functions central to its usage.
+
+    Within the CUTLASS codebase, favor not including "tensor.hpp" wherever possible; instead include "tensor_impl.hpp"
+    along with other specific headers that you need. This helps to avoid circular includes and to reduce build time.
+*/
+
+#pragma once
+
+#include <cute/config.hpp>                     // CUTE_HOST_DEVICE
+#include <cute/layout.hpp>                     // cute::Shape
+#include <cute/layout_composed.hpp>            // cute::is_composed_layout
+#include <cute/pointer.hpp>                    // cute::recast_ptr
+#include <cute/pointer_base.hpp>               // cute::iterator_traits
+#include <cute/container/array_aligned.hpp>    // cute::array_aligned
+#include <cute/container/array_subbyte.hpp>    // cute::array_subbyte
+#include <cute/container/tuple.hpp>            // cute::tuple
+#include <cute/numeric/integral_constant.hpp>  // cute::is_integral
+#include <cute/util/type_traits.hpp>           // __CUTE_REQUIRES
+
+namespace cute
+{
+
+//
+// Engine -- owning or non-owning data store
+//
+
+// concept Engine {
+//   using iterator     = ;
+//   using value_type   = ;
+//   using element_type = ;
+//   using reference    = ;
+//   iterator begin();
+// };
+
+template <class T, size_t N>
+struct ArrayEngine
+{
+  using Storage = typename conditional<(sizeof_bits<T>::value % 8 == 0),
+                                       array_aligned<T,N>,
+                                       array_subbyte<T,N>>::type;
+  using iterator     = typename Storage::iterator;
+  using reference    = typename iterator_traits<iterator>::reference;
+  using element_type = typename iterator_traits<iterator>::element_type;
+  using value_type   = typename iterator_traits<iterator>::value_type;
+  Storage storage_;
+
+  CUTE_HOST_DEVICE constexpr auto begin() const { return storage_.begin(); }
+  CUTE_HOST_DEVICE constexpr auto begin()       { return storage_.begin(); }
+};
+
+// Specialization for sparse_elem<S,T> tensor allocation/iteration
+template <int S, class T, size_t N>
+struct ArrayEngine<sparse_elem<S,T>, N>
+{
+  static_assert(N % S == 0, "Expected a multiple of the sparsity.");
+  using value_type   = sparse_elem<S,T>;
+  using Storage      = typename conditional<(sizeof_bits<T>::value % 8 == 0),
+                                            array_aligned<T,N/S>,
+                                            array_subbyte<T,N/S>>::type;
+  using iterator     = sparse_ptr<S,sparse_elem<S,T>*>;
+  using reference    = typename iterator_traits<iterator>::reference;
+  using element_type = typename iterator_traits<iterator>::element_type;
+  Storage storage_;
+
+  CUTE_HOST_DEVICE constexpr auto begin() const { return recast_ptr<value_type>(storage_.begin()); }
+  CUTE_HOST_DEVICE constexpr auto begin()       { return recast_ptr<value_type>(storage_.begin()); }
+};
+
+template <class Iterator>
+struct ViewEngine
+{
+  using iterator     = Iterator;
+  using reference    = typename iterator_traits<iterator>::reference;
+  using element_type = typename iterator_traits<iterator>::element_type;
+  using value_type   = typename iterator_traits<iterator>::value_type;
+  iterator storage_;
+
+  CUTE_HOST_DEVICE constexpr iterator const& begin() const { return storage_; }
+  CUTE_HOST_DEVICE constexpr iterator      & begin()       { return storage_; }
+};
+
+template <class Iterator>
+struct ConstViewEngine
+{
+  using iterator     = Iterator;
+  using reference    = typename iterator_traits<iterator>::reference;
+  using element_type = typename iterator_traits<iterator>::element_type;
+  using value_type   = typename iterator_traits<iterator>::value_type;
+  iterator storage_;
+
+  CUTE_HOST_DEVICE constexpr iterator const& begin() const { return storage_; }
+};
+
+//
+// Tensor
+//
+
+template <class Engine, class Layout>
+struct Tensor
+{
+  using iterator     = typename Engine::iterator;
+  using value_type   = typename Engine::value_type;
+  using element_type = typename Engine::element_type;
+  using reference    = typename Engine::reference;
+
+  using engine_type  = Engine;
+  using layout_type  = Layout;
+
+  CUTE_HOST_DEVICE constexpr
+  Tensor() {}
+
+  CUTE_HOST_DEVICE constexpr
+  Tensor(Engine const& engine, Layout const& layout)
+      : rep_(layout, engine) {
+  }
+
+  //
+  // Accessors
+  //
+
+  static constexpr int rank  = Layout::rank;
+
+  CUTE_HOST_DEVICE constexpr
+  decltype(auto)
+  tensor() const {
+    return *this;
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  decltype(auto)
+  engine() const {
+    return get<1>(rep_);
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  decltype(auto)
+  engine() {
+    return get<1>(rep_);
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  decltype(auto)
+  data() const {
+    return engine().begin();
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  decltype(auto)
+  data() {
+    return engine().begin();
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  decltype(auto)
+  layout() const {
+    return get<0>(rep_);
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  decltype(auto)
+  shape() const {
+    return layout().shape();
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  auto
+  size() const {
+    return cute::size(shape());
+  }
+
+  CUTE_HOST_DEVICE constexpr
+  decltype(auto)
+  stride() const {
+    return layout().stride();
+  }
+
+  //
+  // Indexing op() and op[]
+  //
+
+  // Index into this tensor like an array by computing the offset via layout()
+  template <class Coord>
+  CUTE_HOST_DEVICE constexpr
+  decltype(auto)
+  operator[](Coord const& coord) {
+    return data()[layout()(coord)];
+  }
+
+  template <class Coord>
+  CUTE_HOST_DEVICE constexpr
+  decltype(auto)
+  operator[](Coord const& coord) const {
+    return data()[layout()(coord)];
+  }
+
+  template <class Coord>
+  CUTE_HOST_DEVICE constexpr
+  decltype(auto)
+  operator()(Coord const& coord) {
+    if constexpr (has_underscore<Coord>::value) {
+      auto const& [sliced_layout,offset] = slice_and_offset(coord, layout());
+      return make_tensor(data() + offset, sliced_layout);
+    } else {
+      return data()[layout()(coord)];
+    }
+
+    CUTE_GCC_UNREACHABLE;
+  }
+
+  template <class Coord>
+  CUTE_HOST_DEVICE constexpr
+  decltype(auto)
+  operator()(Coord const& coord) const {
+    if constexpr (has_underscore<Coord>::value) {
+      auto const& [sliced_layout,offset] = slice_and_offset(coord, layout());
+      return make_tensor(data() + offset, sliced_layout);
+    } else {
+      return data()[layout()(coord)];
+    }
+
+    CUTE_GCC_UNREACHABLE;
+  }
+
+  // op() convenience function for multi-dimensional coordinates
+  template <class Coord0, class Coord1, class... Coords>
+  CUTE_HOST_DEVICE constexpr
+  decltype(auto)
+  operator()(Coord0 const& c0, Coord1 const& c1, Coords const&... cs) {
+    return operator()(make_coord(c0,c1,cs...));
+  }
+
+  template <class Coord0, class Coord1, class... Coords>
+  CUTE_HOST_DEVICE constexpr
+  decltype(auto)
+  operator()(Coord0 const& c0, Coord1 const& c1, Coords const&... cs) const {
+    return operator()(make_coord(c0,c1,cs...));
+  }
+
+  //
+  // Compose
+  //
+
+  template <class... Layouts>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  compose(Layouts const&... layouts) {
+    return make_tensor(data(), layout().compose(layouts...));
+  }
+
+  template <class... Layouts>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  compose(Layouts const&... layouts) const {
+    return make_tensor(data(), layout().compose(layouts...));
+  }
+
+  //
+  // Tile
+  //
+
+  template <class... Layouts>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  tile(Layouts const&... layouts) {
+    return make_tensor(data(), layout().tile(layouts...));
+  }
+
+  template <class... Layouts>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  tile(Layouts const&... layouts) const {
+    return make_tensor(data(), layout().tile(layouts...));
+  }
+
+  //
+  // Utility
+  //
+
+  template <class Int,
+            __CUTE_REQUIRES(is_integral<Int>::value)>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  get_1d_coord(Int const& linear_idx) const {
+    return layout().get_1d_coord(linear_idx);
+  }
+
+  template <class Int,
+            __CUTE_REQUIRES(is_integral<Int>::value)>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  get_hier_coord(Int const& linear_idx) const {
+    return layout().get_hier_coord(linear_idx);
+  }
+
+  template <class Int,
+            __CUTE_REQUIRES(is_integral<Int>::value)>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  get_flat_coord(Int const& linear_idx) const {
+    return layout().get_flat_coord(linear_idx);
+  }
+
+  cute::tuple<layout_type, engine_type> rep_;
+};
+
+template <class T>
+struct is_tensor : false_type {};
+template <class Engine, class Layout>
+struct is_tensor<Tensor<Engine,Layout>> : true_type {};
+template <class T>
+constexpr bool is_tensor_v = is_tensor<T>::value;
+
+// Customization point for creation of owning and non-owning Tensors
+template <class T>
+struct MakeTensor
+{
+  template <class Arg0, class... Args>
+  CUTE_HOST_DEVICE constexpr auto
+  operator()(Arg0 const& arg0, Args const&... args) const
+  {
+    if constexpr (has_dereference<Arg0>::value) {
+      // Construct a non-owning Tensor
+      using Engine = ViewEngine<Arg0>;
+      if constexpr (sizeof...(Args) == 1 && (is_layout<Args>::value && ...)) {
+        // Forward a Layout
+        return Tensor{Engine{arg0}, args...};
+      } else {
+        // Construct a Layout from Args
+        return Tensor{Engine{arg0}, make_layout(args...)};
+      }
+    } else {
+      // Construct an owning Tensor
+      static_assert((is_static<Arg0>::value && ... && is_static<Args>::value),
+                    "Dynamic owning tensors not supported");
+      if constexpr (sizeof...(Args) == 0 && is_layout<Arg0>::value) {
+        // Forward a Layout
+        using Layout = Arg0;
+        using Engine = ArrayEngine<T, cosize_v<Layout>>;
+        return Tensor<Engine,Layout>();
+      } else {
+        // Construct a Layout from Args
+        using Layout = decltype(make_layout(arg0, args...));
+        using Engine = ArrayEngine<T, cosize_v<Layout>>;
+        return Tensor<Engine,Layout>();
+      }
+    }
+  }
+};
+
+//
+// make_tensor
+//
+
+// Make an owning Tensor that will allocate a static array
+// e.g. make_tensor<float>(Int<12>{})
+template <class T, class... Args>
+CUTE_HOST_DEVICE constexpr
+auto
+make_tensor(Args const&... args)
+{
+  static_assert((not has_dereference<Args>::value && ...), "Expected layout args... in make_tensor<T>(args...)");
+  return MakeTensor<T>{}(args...);
+}
+
+// Make a non-owning Tensor that will use a pointer (view)
+// e.g. make_tensor(vec.data(), 12)
+template <class Iterator, class... Args>
+CUTE_HOST_DEVICE constexpr
+auto
+make_tensor(Iterator const& iter, Args const&... args)
+{
+  static_assert(has_dereference<Iterator>::value, "Expected iterator iter in make_tensor(iter, args...)");
+  static_assert((not has_dereference<Args>::value && ...), "Expected layout args... in make_tensor(iter, args...)");
+  return MakeTensor<Iterator>{}(iter, args...);
+}
+
+//
+// make_tensor_like
+//   Make a register tensor the same type and shape and (if possible) order as another tensor
+//
+
+template <class NewT, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+make_tensor_like(Layout const& layout)
+{
+  return make_tensor<NewT>(make_layout_like(layout));
+}
+
+template <class NewT, class Engine, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+make_tensor_like(Tensor<Engine,Layout> const& tensor)
+{
+  return make_tensor_like<NewT>(tensor.layout());
+}
+
+template <class Engine, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+make_tensor_like(Tensor<Engine,Layout> const& tensor)
+{
+  return make_tensor_like<typename Engine::value_type>(tensor.layout());
+}
+
+//
+// make_fragment_like
+//   Make a tensor the same shape and (if possible) order as another tensor, with special
+//   consideration of the 0th mode. The 0th mode is commonly used for MMA_Atoms or Copy_Atoms
+//   so this allocates the 0th mode with LayoutLeft regardless of the reference layout.
+//
+
+template <class NewT, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+make_fragment_like(Layout const& layout)
+{
+  return make_tensor<NewT>(make_fragment_like(layout));
+}
+
+template <class NewT, class Engine, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+make_fragment_like(Tensor<Engine,Layout> const& tensor)
+{
+  return make_fragment_like<NewT>(tensor.layout());
+}
+
+template <class Engine, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+make_fragment_like(Tensor<Engine,Layout> const& tensor)
+{
+  return make_fragment_like<typename Engine::value_type>(tensor.layout());
+}
+
+//
+// make_counting_tensor
+//   Make a tensor from a layout by binding it to a counting iter with 0-offset of the same profile as the codomain.
+//
+
+template <class Layout, __CUTE_REQUIRES(is_layout<Layout>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+make_counting_tensor(Layout const& layout)
+{
+  return make_tensor(make_inttuple_iter(repeat_like(coshape(layout), Int<0>{})), layout);
+}
+
+//
+// make_identity_tensor
+//   Make a tensor that maps coordinates within a shape to themselves.
+//
+
+template <class Shape>
+CUTE_HOST_DEVICE constexpr
+auto
+make_identity_tensor(Shape const& shape)
+{
+  return make_counting_tensor(make_identity_layout(shape));
+}
+
+//
+// Utilities
+//
+
+// Return the subtensor of a mode
+template <int... Is, class Tensor>
+CUTE_HOST_DEVICE constexpr
+auto
+tensor(Tensor&& tensor)
+{
+  if constexpr (sizeof...(Is) == 0) {
+    return tensor;
+  } else {
+    return make_tensor(tensor.data(), get<Is...>(tensor.layout()));
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+// Return the layout of a mode
+template <int... Is, class Engine, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+layout(Tensor<Engine,Layout> const& tensor)
+{
+  return layout<Is...>(tensor.layout());
+}
+
+// Return the shape of a mode
+template <int... Is, class Engine, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+shape(Tensor<Engine,Layout> const& tensor)
+{
+  return shape<Is...>(tensor.layout());
+}
+
+// Return the stride of a mode
+template <int... Is, class Engine, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+stride(Tensor<Engine,Layout> const& tensor)
+{
+  return stride<Is...>(tensor.layout());
+}
+
+// Return the number of elements in a mode
+template <int... Is, class Engine, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+size(Tensor<Engine,Layout> const& tensor)
+{
+  return size<Is...>(tensor.layout());
+}
+
+// Return the rank of a mode
+template <int... Is, class Engine, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+rank(Tensor<Engine,Layout> const& tensor)
+{
+  return rank<Is...>(tensor.layout());
+}
+
+// Return the depth of a mode
+template <int... Is, class Engine, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+depth(Tensor<Engine, Layout> const& tensor)
+{
+  return depth<Is...>(tensor.layout());
+}
+
+//
+// Operations to manipulate Tensors like a Layout or IntTuple
+//   These are implemented with explicit modifier overloads because these
+//   methods likely also have a general IntTuple overload that can shadow.
+//
+
+template <class Engine, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+flatten(Tensor<Engine,Layout> const& tensor) {
+  return make_tensor(tensor.data(), flatten(tensor.layout()));
+}
+
+template <class Engine, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+flatten(Tensor<Engine,Layout>& tensor) {
+  return make_tensor(tensor.data(), flatten(tensor.layout()));
+}
+
+template <class Engine, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+flatten(Tensor<Engine,Layout>&& tensor) {
+  return make_tensor(tensor.data(), flatten(tensor.layout()));
+}
+
+template <class Engine, class Layout, class Profile = Int<1>>
+CUTE_HOST_DEVICE constexpr
+auto
+coalesce(Tensor<Engine,Layout> const& tensor, Profile const& profile = {}) {
+  return make_tensor(tensor.data(), coalesce(tensor.layout(), profile));
+}
+
+template <class Engine, class Layout, class Profile = Int<1>>
+CUTE_HOST_DEVICE constexpr
+auto
+coalesce(Tensor<Engine,Layout>& tensor, Profile const& profile = {}) {
+  return make_tensor(tensor.data(), coalesce(tensor.layout(), profile));
+}
+
+template <class Engine, class Layout, class Profile = Int<1>>
+CUTE_HOST_DEVICE constexpr
+auto
+coalesce(Tensor<Engine,Layout>&& tensor, Profile const& profile = {}) {
+  return make_tensor(tensor.data(), coalesce(tensor.layout(), profile));
+}
+
+// Replace the modes in layout that have a 0-stride with a 1-size
+template <class Engine, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+filter_zeros(Tensor<Engine,Layout> const& tensor) {
+  return make_tensor(tensor.data(), filter_zeros(tensor.layout()));
+}
+
+template <class Engine, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+filter_zeros(Tensor<Engine,Layout>& tensor) {
+  return make_tensor(tensor.data(), filter_zeros(tensor.layout()));
+}
+
+template <class Engine, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+filter_zeros(Tensor<Engine,Layout>&& tensor) {
+  return make_tensor(tensor.data(), filter_zeros(tensor.layout()));
+}
+
+template <class Engine, class Layout, class Profile>
+CUTE_HOST_DEVICE constexpr
+auto
+filter_zeros(Tensor<Engine,Layout> const& tensor, Profile const& profile)
+{
+  return make_tensor(tensor.data(), filter_zeros(tensor.layout(), profile));
+}
+
+template <class Engine, class Layout, class Profile>
+CUTE_HOST_DEVICE constexpr
+auto
+filter_zeros(Tensor<Engine,Layout>& tensor, Profile const& profile)
+{
+  return make_tensor(tensor.data(), filter_zeros(tensor.layout(), profile));
+}
+
+template <class Engine, class Layout, class Profile>
+CUTE_HOST_DEVICE constexpr
+auto
+filter_zeros(Tensor<Engine,Layout>&& tensor, Profile const& profile)
+{
+  return make_tensor(tensor.data(), filter_zeros(tensor.layout(), profile));
+}
+
+// Remove all of the 0-strides and 1-sizes
+template <class Engine, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+filter(Tensor<Engine,Layout> const& tensor) {
+  return make_tensor(tensor.data(), filter(tensor.layout()));
+}
+
+template <class Engine, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+filter(Tensor<Engine,Layout>& tensor) {
+  return make_tensor(tensor.data(), filter(tensor.layout()));
+}
+
+template <class Engine, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+filter(Tensor<Engine,Layout>&& tensor) {
+  return make_tensor(tensor.data(), filter(tensor.layout()));
+}
+
+// Group the modes [B,E) into a single mode
+// e.g. group_modes<2,4>(make_tensor<int>(Layout<Shape<_1,_2,_3,_4,_5,_6>>{}))
+//      => make_tensor<int>(Layout<Shape<_1,_2,Shape<_3,_4>,_5,_6>>{})
+template <int B, int E, class Engine, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+group_modes(Tensor<Engine,Layout> const& tensor) {
+  return make_tensor(tensor.data(), group<B,E>(tensor.layout()));
+}
+
+template <int B, int E, class Engine, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+group_modes(Tensor<Engine,Layout>& tensor) {
+  return make_tensor(tensor.data(), group<B,E>(tensor.layout()));
+}
+
+template <int B, int E, class Engine, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+group_modes(Tensor<Engine,Layout>&& tensor) {
+  return make_tensor(tensor.data(), group<B,E>(tensor.layout()));
+}
+
+// Return the subtensor of a range of modes
+template <int B, int E, class Engine, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+take(Tensor<Engine,Layout> const& tensor) {
+  return make_tensor(tensor.data(), take<B,E>(tensor.layout()));
+}
+
+template <int B, int E, class Engine, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+take(Tensor<Engine,Layout>& tensor) {
+  return make_tensor(tensor.data(), take<B,E>(tensor.layout()));
+}
+
+template <int B, int E, class Engine, class Layout>
+CUTE_HOST_DEVICE constexpr
+auto
+take(Tensor<Engine,Layout>&& tensor) {
+  return make_tensor(tensor.data(), take<B,E>(tensor.layout()));
+}
+
+// Return a tensor with the same shape as input but offset by a given coordinate
+template <class Coord, class Tensor,
+          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+domain_offset(Coord const& coord, Tensor&& tensor)
+{
+  auto [layout, ptr_offset] = domain_offset(coord, tensor.layout());
+  return make_tensor(static_cast<Tensor&&>(tensor).data() + ptr_offset, layout);
+}
+
+//
+// Recast
+//
+
+// NOTE: This is very dangerous to do
+//   -- doesn't check dynamic integer divisibility
+//   -- doesn't check alignment
+
+template <class NewType, class Tensor>
+CUTE_HOST_DEVICE constexpr
+auto
+recast(Tensor&& tensor)
+{
+  using OldType = typename remove_cvref_t<Tensor>::value_type;
+  auto old_layout = tensor.layout();
+  auto new_layout = recast_layout<OldType,NewType>(old_layout);
+
+  // If this is an upcast of a normal Layout with static negative strides, then offset as well
+  if constexpr (sizeof(OldType) < sizeof(NewType) && not is_composed_layout<decltype(old_layout)>::value) {
+    auto shape_diff = transform(flatten(old_layout.shape()), flatten(new_layout.shape()), minus{});
+    auto extent_diff = transform(shape_diff, flatten(old_layout.stride()), multiplies{});
+    auto offset = fold(extent_diff, Int<0>{}, [](auto const& i, auto const& a) { return i + cute::min(a,Int<0>{}); });
+
+    return make_tensor(recast_ptr<NewType>(static_cast<Tensor&&>(tensor).data() + offset), new_layout);
+  } else {
+    return make_tensor(recast_ptr<NewType>(static_cast<Tensor&&>(tensor).data()         ), new_layout);
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+//
+// max_common_vector
+//
+
+/* Return Int<N> such that N is the maximum number of contiguous elements
+ * that logically correspond in the tensors of @a a and @a b. That is,
+ * the number of elements that could reasonably be vectorized into a single load/store.
+ *
+ * @returns Int<N> with N >= 0
+ *
+ * A return value of Int<0> indicates that no such conclusion can be made and no
+ * vectorization should be attempted.
+ *
+ * Note that the return value does NOT include alignment concerns such as the pointer value and
+ * the divisibility of dynamic strides.
+ */
+template <class SrcEngine, class SrcLayout,
+          class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE constexpr
+auto
+max_common_vector(Tensor<SrcEngine,SrcLayout> const& a,
+                  Tensor<DstEngine,DstLayout> const& b)
+{
+  using SrcType = typename SrcEngine::value_type;
+  using SrcRef  = typename SrcEngine::reference;
+  using DstType = typename DstEngine::value_type;
+  using DstRef  = typename DstEngine::reference;
+
+  // Determine if vectorization candidates at all
+  if constexpr (// Should be the same value_types, else the copy is also performing a cast
+                cute::is_same<SrcType, DstType>::value &&
+                // The types should be trivially copyable so that vectorization is valid
+                is_trivially_copyable<SrcType>::value &&
+                is_trivially_copyable<DstType>::value &&
+                // Should be load/storing real data, rather than implicit iterators or such
+                is_reference<SrcRef>::value &&
+                is_reference<DstRef>::value)
+  {
+    return max_common_vector(a.layout(), b.layout());
+  } else {
+    return Int<0>{};
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+/* Return a layout that points to the maximum number of contiguous elements
+ * that logically correspond in the tensors of @a a and @a b. That is,
+ * the elements that could reasonably be "vectorized" into a single load/store.
+ *
+ * @returns Layout R such that composition(a.layout(), R) and composition(b.layout(), R)
+ *          are both identity Layouts.
+ *
+ * Note that the returned layout does NOT include alignment concerns such as the pointer value and
+ * the divisibility of dynamic strides.
+ */
+template <class SrcEngine, class SrcLayout,
+          class DstEngine, class DstLayout>
+CUTE_HOST_DEVICE constexpr
+auto
+max_common_layout(Tensor<SrcEngine,SrcLayout> const& a,
+                  Tensor<DstEngine,DstLayout> const& b)
+{
+  using SrcType = typename SrcEngine::value_type;
+  using SrcRef  = typename SrcEngine::reference;
+  using DstType = typename DstEngine::value_type;
+  using DstRef  = typename DstEngine::reference;
+
+  // Determine if vectorization candidates at all
+  if constexpr (// Should be the same value_types, else the copy is also performing a cast
+                cute::is_same<SrcType, DstType>::value &&
+                // The types should be trivially copyable so that vectorization is valid
+                is_trivially_copyable<SrcType>::value &&
+                is_trivially_copyable<DstType>::value &&
+                // Should be load/storing real data, rather than implicit iterators or such
+                is_reference<SrcRef>::value &&
+                is_reference<DstRef>::value)
+  {
+    return max_common_layout(a.layout(), b.layout());
+  } else {
+    return Layout<_1,_0>{};
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+//
+// Key algebraic operations -- Composition, Divide, and Product
+//
+
+// Apply a Tiler to the Tensor via composition.
+template <class Tensor, class Tiler,
+          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+composition(Tensor    && tensor,
+            Tiler const& tiler)   // Layout or Tile<Layout...> or Shape
+{
+  return make_tensor(static_cast<Tensor&&>(tensor).data(),
+                     composition(tensor.layout(), tiler));
+}
+
+// Apply a Tiler to the Tensor.
+//
+// Consider a Tensor with shape (A,B,x,y)
+// And a Tiler that is:
+//
+// * A Layout with shape (BLK_A,BLK_B)
+// ** Result Tensor shape ((BLK_A,BLK_B),Rest).
+// ** That is, the Tensor and Tile are treated as 1D for the tiling.
+// ** See logical_divide(Layout,Layout)
+//
+// * A Tile<Layout...> with shape <BLK_A,BLK_B>
+// ** Result Tensor shape ((BLK_A,a),(BLK_B,b),x,y).
+// ** Each mode of the Tile<Layout...> is applied to the corresponding mode of the Tensor.
+// ** See logical_divide(Layout,Tuple)
+//
+// * A Shape (BLK_A,BLK_B)
+// ** Result Tensor shape ((BLK_A,a),(BLK_B,b),x,y).
+// ** Equivalent to applying Tile<BLK_A:_1,BLK_B:_1>.
+// ** See logical_divide(Layout,Tuple) and logical_divide(Layout,Int)
+//
+// Note that the Tile<Layout...>/Shape Tilers must be weakly_congruent to the Tensor
+template <class Tensor, class Tiler,
+          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+logical_divide(Tensor    && tensor,
+               Tiler const& tiler)   // Layout or Tile<Layout...> or Shape
+{
+  return make_tensor(static_cast<Tensor&&>(tensor).data(),
+                     logical_divide(tensor.layout(), tiler));
+}
+
+// zipped_divide is logical_divide with the Tiler modes and the Rest modes gathered together: (Tiler,Rest)
+// When Tiler is a Layout, this changes nothing because logical_divide already produces that form.
+// When Tiler is a Tile<Layout...> or a Shape, this zips the modes into standard form ((BLK_A,BLK_B),(a,b,x,y))
+template <class Tensor, class Tiler,
+          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+zipped_divide(Tensor    && tensor,
+              Tiler const& tiler)    // Layout or Tile<Layout...> or Shape
+{
+  auto zipped_layout = zipped_divide(tensor.layout(), tiler);
+  return make_tensor(static_cast<Tensor&&>(tensor).data(), zipped_layout);
+}
+
+// tiled_divide is zipped_divide with the second (Rest) output mode flattened: ((BLK_A,BLK_B),a,b,x,y)
+template <class Tensor, class Tiler,
+          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+tiled_divide(Tensor    && tensor,
+             Tiler const& tiler)     // Layout or Tile<Layout...> or Shape
+{
+  auto tiled_layout = tiled_divide(tensor.layout(), tiler);
+  return make_tensor(static_cast<Tensor&&>(tensor).data(), tiled_layout);
+}
+
+// flat_divide is zipped_divide with both output modes flattened: (BLK_A,BLK_B,a,b,x,y)
+template <class Tensor, class Tiler,
+          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+flat_divide(Tensor    && tensor,
+            Tiler const& tiler)      // Layout or Tile<Layout...> or Shape
+{
+  auto flat_layout = flat_divide(tensor.layout(), tiler);
+  return make_tensor(static_cast<Tensor&&>(tensor).data(), flat_layout);
+}
+
+// logical_product on a Tensor doesn't make sense since it often increases cosize
+//   though this might make sense for creating Tensors with broadcasted (stride-0) modes
+
+//
+// Tensor partitioning utilities
+//
+
+// Apply a Tiler to the Tensor, then pick one of the resulting tiles by slicing into the "Rest" modes.
+// An inner_partition yields everything that is inside the Tiler, i.e. everything the Tiler points to.
+// First the modes of tensor are split according to the Tiler:
+//   zipped_divide returns something like ((BLK_A,BLK_B,...),(a,b,...,x,y))
+// Then the second mode (the "Rest" mode) is sliced with Coord
+template <class Tensor, class Tiler, class Coord,
+          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+inner_partition(Tensor    && tensor,
+                Tiler const& tiler,
+                Coord const& coord)
+{
+  auto tiled = zipped_divide(static_cast<Tensor&&>(tensor), tiler);
+  constexpr int TileRank = decltype(rank<0>(tiled))::value;
+
+  // Keep every "tile" mode (first mode) and index the "rest" mode (second mode) with coord
+  if constexpr (!is_tuple<Coord>::value) {
+    // A non-tuple coord indexes the rest mode flat
+    return tiled(repeat<TileRank>(_), coord);
+  } else {
+    // A tuple coord gets trailing _ appended via append<RestRank>
+    constexpr int RestRank = decltype(rank<1>(tiled))::value;
+    return tiled(repeat<TileRank>(_), append<RestRank>(coord,_));
+  }
+}
+
+// Apply a Tiler to the Tensor, then pick out the remainder by slicing into the "Tile" modes.
+// An outer_partition yields everything that is outside the Tiler, i.e. the layout of the Tile in the Tensor.
+// First the modes of tensor are split according to the Tiler:
+//   zipped_divide returns something like ((BLK_A,BLK_B,...),(a,b,...,x,y))
+// Then the first mode (the "Tile" mode) is sliced with Coord
+template <class Tensor, class Tiler, class Coord,
+          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+outer_partition(Tensor    && tensor,
+                Tiler const& tiler,
+                Coord const& coord)
+{
+  auto tiled = zipped_divide(static_cast<Tensor&&>(tensor), tiler);
+  constexpr int RestRank = decltype(rank<1>(tiled))::value;
+
+  // Index the "tile" mode (first mode) with coord and keep every "rest" mode (second mode)
+  if constexpr (!is_tuple<Coord>::value) {
+    // A non-tuple coord indexes the tile mode flat
+    return tiled(coord, repeat<RestRank>(_));
+  } else {
+    // A tuple coord gets trailing _ appended via append<TileRank>
+    constexpr int TileRank = decltype(rank<0>(tiled))::value;
+    return tiled(append<TileRank>(coord,_), repeat<RestRank>(_));
+  }
+}
+
+// Tile a tensor with @a tiler and index the remainder with @a coord, so that one whole tile is kept.
+// This is typical at the CTA level where tiles of data are extracted:
+//   Tensor data = ...                                                                         // (  M,  N)
+//   Tensor cta_data = local_tile(data, Shape<_32,_64>{}, make_coord(blockIdx.x,blockIdx.y));  // (_32,_64)
+template <class Tensor, class Tiler, class Coord,
+          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+local_tile(Tensor    && tensor,
+           Tiler const& tiler,   // tiler to apply
+           Coord const& coord)   // coord to slice into "remainder"
+{
+  // Keeping the tile and slicing the remainder is what inner_partition does,
+  // so this overload only forwards its arguments.
+  return inner_partition(static_cast<Tensor&&>(tensor), tiler, coord);
+}
+
+// Same as local_tile(tensor, tiler, coord), but with a projection that is applied (via dice) to both
+//   the tiler and the coord, to strip out unwanted tiling modes when several tensors share one tiler.
+// This is typical at the CTA level where tiles of data are extracted as projections:
+//   Tensor dataA = ...                                                        // (M,K)
+//   Tensor dataB = ...                                                        // (N,K)
+//   Tensor dataC = ...                                                        // (M,N)
+//   auto cta_tiler = Shape<_32, _64, _4>{};
+//   auto cta_coord = make_coord(blockIdx.x, blockIdx.y, _);
+//   Tensor ctaA = local_tile(dataA, cta_tiler, cta_coord, Step<_1, X,_1>{});  // (_32,_4,k)
+//   Tensor ctaB = local_tile(dataB, cta_tiler, cta_coord, Step< X,_1,_1>{});  // (_64,_4,k)
+//   Tensor ctaC = local_tile(dataC, cta_tiler, cta_coord, Step<_1,_1, X>{});  // (_32,_64)
+template <class Tensor, class Tiler, class Coord, class Proj,
+          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+local_tile(Tensor    && tensor,
+           Tiler const& tiler,   // tiler to apply
+           Coord const& coord,   // coord to slice into "remainder"
+           Proj  const& proj)    // projection to apply to tiler and coord
+{
+  return local_tile(static_cast<Tensor&&>(tensor),
+                    dice(proj, tiler),
+                    dice(proj, coord));
+}
+
+// Tile a tensor according to the flat shape of a layout that provides the coordinate of the target index.
+// This is typical at the Thread level where data is partitioned across repeated patterns of threads:
+//   Tensor data = ...                                                            // (_16,_64)
+//   Tensor thr_data = local_partition(data, Layout<Shape<_2,_16>>{}, thr_idx);   // ( _8, _4)
+template <class Tensor, class LShape, class LStride, class Index,
+          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+local_partition(Tensor                     && tensor,
+                Layout<LShape,LStride> const& tile,    // coord -> index
+                Index                  const& index)   // index to slice for
+{
+  static_assert(is_integral<Index>::value, "local_partition requires an integral index");
+  return outer_partition(static_cast<Tensor&&>(tensor),
+                         product_each(shape(tile)),
+                         tile.get_flat_coord(index));
+}
+
+// Same as local_partition(tensor, tile, index), but with a projection that is applied (via dice) to
+//   the tile layout, to strip out unwanted tiling modes when several tensors share one thread layout.
+// This is typical at the Thread level where data is partitioned across projected layouts of threads:
+//   Tensor dataA = ...                                                            // (M,K)
+//   Tensor dataB = ...                                                            // (N,K)
+//   Tensor dataC = ...                                                            // (M,N)
+//   auto thr_layout = Layout<Shape<_2,_16,_1>, Stride<_16,_1,_0>>{};
+//   Tensor thrA = local_partition(dataA, thr_layout, thr_idx, Step<_1, X,_1>{});  // (M/2,K/1)
+//   Tensor thrB = local_partition(dataB, thr_layout, thr_idx, Step< X,_1,_1>{});  // (N/16,K/1)
+//   Tensor thrC = local_partition(dataC, thr_layout, thr_idx, Step<_1,_1, X>{});  // (M/2,N/16)
+template <class Tensor, class LShape, class LStride, class Index, class Projection,
+          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
+CUTE_HOST_DEVICE constexpr
+auto
+local_partition(Tensor                     && tensor,
+                Layout<LShape,LStride> const& tile,   // coord -> index
+                Index                  const& index,  // index to slice for
+                Projection             const& proj)
+{
+  return local_partition(static_cast<Tensor&&>(tensor),
+                         dice(proj, tile),
+                         index);
+}
+
+//
+// Display utilities
+//
+
+// Print a tensor as its data(), the separator " o ", then its layout().
+template <class Engine, class Layout>
+CUTE_HOST_DEVICE void print(Tensor<Engine,Layout> const& tensor) {
+  print(tensor.data()); print(" o "); print(tensor.layout());
+}
+
+// Print the elements of a rank-1 to rank-4 tensor, optionally preceded by a
+// "<tensor>:" header line. For any other rank only that header is printed.
+template <class Engine, class Layout>
+CUTE_HOST_DEVICE void print_tensor(Tensor<Engine,Layout> const& tensor, bool print_type = true)
+{
+  if (print_type) {
+    print(tensor); print(":\n");
+  }
+  if constexpr (Layout::rank == 1) {
+    // One element per line
+    for (int i = 0; i < size(tensor); ++i) {
+      pretty_print(tensor(i));
+      printf("\n");
+    }
+  } else if constexpr (Layout::rank == 2) {
+    // One mode-0 index per line
+    for (int row = 0; row < size<0>(tensor); ++row) {
+      for (int col = 0; col < size<1>(tensor); ++col) {
+        pretty_print(tensor(row,col));
+      }
+      printf("\n");
+    }
+  } else if constexpr (Layout::rank == 3) {
+    // Rank-2 slices along mode 2, separated by a row of '-'
+    print_tensor(tensor(_,_,0), false);
+    for (int slab = 1; slab < size<2>(tensor); ++slab) {
+      for (int c = 0; c < 5*size<1>(tensor); ++c) { print("-"); }
+      print("\n");
+      print_tensor(tensor(_,_,slab), false);
+    }
+  } else if constexpr (Layout::rank == 4) {
+    // Rank-3 slices along mode 3, separated by a row of '='
+    print_tensor(tensor(_,_,_,0), false);
+    for (int p = 1; p < size<3>(tensor); ++p) {
+      for (int c = 0; c < 5*size<1>(tensor); ++c) { print("="); }
+      print("\n");
+      print_tensor(tensor(_,_,_,p), false);
+    }
+  }
+}
+
+#if !defined(__CUDACC_RTC__)
+// Stream the elements of a rank-1 to rank-4 tensor to @a os, each in a field of
+// width 9 (std::setw). For any other rank nothing is written. Returns @a os.
+template <class Engine, class Layout>
+CUTE_HOST std::ostream& print_tensor_os(std::ostream& os, Tensor<Engine,Layout> const& tensor)
+{
+  int digits = 9;   // field width of one element
+
+  if constexpr (Layout::rank == 1) {
+    // One element per line
+    for (int i = 0; i < size(tensor); ++i) {
+      os << std::setw(digits) << tensor(i) << std::endl;
+    }
+  } else if constexpr (Layout::rank == 2) {
+    for (int row = 0; row < size<0>(tensor); ++row) {
+      for (int col = 0; col < size<1>(tensor); ++col) {
+        os << std::setw(digits) << tensor(row,col);
+      }
+      os << std::endl;
+    }
+  } else if constexpr (Layout::rank == 3) {
+    // Rank-2 slices along mode 2, separated by a row of '-'
+    print_tensor_os(os, tensor(_,_,0));
+    for (int slab = 1; slab < size<2>(tensor); ++slab) {
+      for (int c = 0; c < digits*size<1>(tensor); ++c) { os << "-"; }
+      os << std::endl;
+      print_tensor_os(os, tensor(_,_,slab));
+    }
+  } else if constexpr (Layout::rank == 4) {
+    // Rank-3 slices along mode 3, separated by a row of '='
+    print_tensor_os(os, tensor(_,_,_,0));
+    for (int p = 1; p < size<3>(tensor); ++p) {
+      for (int c = 0; c < digits*size<1>(tensor); ++c) { os << "="; }
+      os << std::endl;
+      print_tensor_os(os, tensor(_,_,_,p));
+    }
+  }
+
+  return os;
+}
+
+// Stream the tensor's layout on a line of its own, then its elements via print_tensor_os.
+template <class Engine, class Layout>
+CUTE_HOST std::ostream& operator<<(std::ostream& os, Tensor<Engine,Layout> const& tensor) {
+  os << tensor.layout() << std::endl;
+  return print_tensor_os(os, tensor);
+}
+#endif // !defined(__CUDACC_RTC__)
+
+} // end namespace cute
+
diff --git a/lightllm-kernel/cutlass/include/cute/tensor_predicate.hpp b/lightllm-kernel/cutlass/include/cute/tensor_predicate.hpp
new file mode 100755
index 000000000..9c8a2ba61
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/tensor_predicate.hpp
@@ -0,0 +1,78 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>                    // CUTE_HOST_DEVICE
+#include <cute/numeric/integral_constant.hpp> // cute::true_type
+
+namespace cute
+{
+
+// Tensor-like functor that ignores the coordinates it is called with and
+// always returns a const reference to the stored value.
+template <class T>
+struct ConstantTensor
+{
+  T val_;  // value returned for every coordinate
+
+  template <class... Coords>
+  CUTE_HOST_DEVICE constexpr
+  T const&
+  operator()(Coords const&...) const { return val_; }
+};
+
+// Predicate functor that ignores the coordinates it is called with and
+// returns a value-initialized true_type for all of them.
+struct TrivialPredTensor
+{
+  template <class... Coords>
+  CUTE_HOST_DEVICE constexpr
+  true_type
+  operator()(Coords const&...) const { return true_type{}; }
+};
+
+// Predicate functor that forwards each coordinate pack to a user-supplied callable.
+// NOTE: fn_ is a reference; the callable passed to the constructor must outlive this object.
+template <class Fn>
+struct FunctionPredTensor
+{
+  CUTE_HOST_DEVICE constexpr
+  FunctionPredTensor(Fn const& fn) : fn_(fn) {}
+
+  template <class... Coords>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  operator()(Coords const&... coords) const { return fn_(coords...); }
+
+  Fn const& fn_;
+};
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/tensor_zip.hpp b/lightllm-kernel/cutlass/include/cute/tensor_zip.hpp
new file mode 100755
index 000000000..6d70ffc84
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/tensor_zip.hpp
@@ -0,0 +1,243 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include <cute/config.hpp>           // CUTE_HOST_DEVICE
+#include <cute/tensor_impl.hpp>      // cute::Tensor
+#include <cute/container/tuple.hpp>  // cute::tuple
+
+namespace cute
+{
+
+// A tuple of iterators in which each iterator is offset by its own index.
+// Only op+(tuple<Index...>) and op[tuple<Index...>] are provided; the i-th index
+//   offsets the i-th iterator. op* and op[] return a tuple of values (see `reference`),
+//   so this iterator is READ-ONLY until cute::tuple can be constructed with references.
+template <class... Iters>
+struct ZipIterator
+{
+  using value_type   = cute::tuple<iter_value_t<Iters>...>;
+  using element_type = cute::tuple<iter_element_t<Iters>...>;
+  // NOTE: cute::tuple does not support construction with references at the moment.
+  //       Consider fixes and/or an implementation of std::forward_as_tuple.
+  //       Until then `reference` is a cute::tuple of value_types, which makes this Iterator READ-ONLY.
+  //using reference    = cute::tuple<iter_reference_t<Iters>...>;
+  using reference  = value_type;
+
+  ZipIterator() = delete;
+
+  CUTE_HOST_DEVICE constexpr
+  ZipIterator(Iters... iters)
+    : iters_(iters...)
+  {}
+
+  CUTE_HOST_DEVICE constexpr
+  ZipIterator(cute::tuple<Iters...> const& iters)
+    : iters_(iters)
+  {}
+
+  CUTE_HOST_DEVICE constexpr
+  reference operator*() const {
+    return cute::apply(iters_, [](auto&&... args) { return reference(*args...); });
+  }
+
+  template <class... Index>
+  CUTE_HOST_DEVICE constexpr
+  ZipIterator operator+(cute::tuple<Index...> const& idxs) const {
+    static_assert(sizeof...(Index) == sizeof...(Iters), "Expect same number of offsets as iterators.");
+    return cute::transform(iters_, idxs, [](auto&& iter, auto&& idx) { return iter + idx; });
+  }
+
+  template <class... Index>
+  CUTE_HOST_DEVICE constexpr
+  reference operator[](cute::tuple<Index...> const& idxs) const {
+    return *(*this + idxs);
+  }
+
+  cute::tuple<Iters...> iters_;
+};
+
+//------------------------------------------------------------------------------
+// type traits: a ZipIterator is rmem/smem/gmem only if every component iterator is
+
+template <class... Iters>
+struct is_rmem<ZipIterator<Iters...>> : conjunction<is_rmem<Iters>...> {};
+template <class... Iters>
+struct is_smem<ZipIterator<Iters...>> : conjunction<is_smem<Iters>...> {};
+template <class... Iters>
+struct is_gmem<ZipIterator<Iters...>> : conjunction<is_gmem<Iters>...> {};
+// A tuple of Layouts that applies the same coordinate to every component Layout.
+// The Layouts need to have compatible shapes; equal rank is enforced by a static_assert.
+// The ZipLayout presents the intersection of the domain of its component Layouts.
+//   E.g. all Layouts accept 1D coords and ZipLayout does as well.
+// The ZipLayout returns the union of the codomain of its component Layouts: a tuple of the
+//   component results, or a ZipLayout of the component results when the coord has an Underscore.
+template <class... Layouts>
+struct ZipLayout
+{
+  static constexpr int rank = (int(0) | ... | Layouts::rank);
+
+  static_assert((is_layout<Layouts>::value && ...), "All template parameters must be layouts");
+  static_assert(((Layouts::rank == rank) && ...),   "All layouts must have the same rank");
+
+  CUTE_HOST_DEVICE constexpr
+  ZipLayout(Layouts const&... layouts)
+    : layouts_(layouts...)
+  {}
+
+  CUTE_HOST_DEVICE constexpr
+  ZipLayout(cute::tuple<Layouts...> const& layouts)
+    : layouts_(layouts)
+  {}
+
+  template <class Coord>
+  CUTE_HOST_DEVICE constexpr
+  auto
+  operator()(Coord const& coord) const {
+    if constexpr (has_underscore<Coord>::value) {
+      return ZipLayout(cute::transform(layouts_, [&] (auto layout) { return layout(coord); }));
+    } else {
+      return cute::transform(layouts_, [&] (auto layout) { return layout(coord); });
+    }
+
+    CUTE_GCC_UNREACHABLE;
+  }
+
+  // op() convenience overload: packs two or more coordinates with make_coord and forwards to op()(Coord)
+  template <class Coord0, class Coord1, class... Coords>
+  CUTE_HOST_DEVICE constexpr
+  decltype(auto)
+  operator()(Coord0 const& c0, Coord1 const& c1, Coords const&... cs) const {
+    return operator()(make_coord(c0,c1,cs...));
+  }
+
+  cute::tuple<Layouts...> layouts_;
+};
+
+// Register ZipLayout with the is_layout trait.
+template <class... Layouts> struct is_layout<ZipLayout<Layouts...>> : true_type {};
+
+//
+// make_zip_tensor and unzip_tensor
+//
+
+// Zip several tensors into one tensor over a ZipIterator of their data() and a ZipLayout of their layout().
+template <class... Engines, class... Layouts>
+CUTE_HOST_DEVICE constexpr
+auto
+make_zip_tensor(Tensor<Engines,Layouts> const&... tensors)
+{
+  return make_tensor(ZipIterator(tensors.data()...), ZipLayout(tensors.layout()...));
+}
+
+// Split a zipped tensor into a tuple of tensors: the i-th iterator paired with the i-th layout.
+template <class Engine, class Layout>
+CUTE_HOST_DEVICE constexpr auto
+unzip_tensor(Tensor<Engine,Layout> const& tensor)
+{
+  auto rebuild = [](auto iter, auto layout) { return make_tensor(iter, layout); };
+  return cute::transform(tensor.data().iters_, tensor.layout().layouts_, rebuild);
+}
+
+//
+// Utilities
+//
+
+// Rank (optionally of mode <Is...>) of a ZipLayout, read from its first component layout.
+template <int... Is, class... Layouts>
+CUTE_HOST_DEVICE constexpr auto
+rank(ZipLayout<Layouts...> const& layouts) {
+  auto const& first_layout = get<0>(layouts.layouts_);
+  return rank<Is...>(first_layout);
+}
+
+// Size (optionally of mode <Is...>) of a ZipLayout, read from its first component layout.
+template <int... Is, class... Layouts>
+CUTE_HOST_DEVICE constexpr auto
+size(ZipLayout<Layouts...> const& layouts) {
+  auto const& first_layout = get<0>(layouts.layouts_);
+  return size<Is...>(first_layout);
+}
+
+//
+// Manipulation
+//
+
+// Extend every component layout to rank N by appending Layout @a x (default: the layout _1:_0).
+template <int N, class... Layouts, class ShapeX = _1, class StrideX = _0>
+CUTE_HOST_DEVICE constexpr auto
+append(ZipLayout<Layouts...>  const& layouts,
+       Layout<ShapeX,StrideX> const& x = {})
+{
+  auto extend_one = [&](auto component){ return append<N>(component, x); };
+  return ZipLayout(cute::transform(layouts.layouts_, extend_one));
+}
+
+// Extend every component layout to rank N by prepending Layout @a x (default: the layout _1:_0).
+template <int N, class... Layouts, class ShapeX = _1, class StrideX = _0>
+CUTE_HOST_DEVICE constexpr auto
+prepend(ZipLayout<Layouts...>  const& layouts,
+        Layout<ShapeX,StrideX> const& x = {})
+{
+  auto extend_one = [&](auto component){ return prepend<N>(component, x); };
+  return ZipLayout(cute::transform(layouts.layouts_, extend_one));
+}
+
+// Apply logical_divide with the same @a tiler to every component layout.
+template <class... Layouts, class Tiler>
+CUTE_HOST_DEVICE constexpr auto
+logical_divide(ZipLayout<Layouts...> const& layouts, Tiler const& tiler)
+{
+  auto divide_one = [&](auto component){ return logical_divide(component, tiler); };
+  return ZipLayout(cute::transform(layouts.layouts_, divide_one));
+}
+
+// Apply zipped_divide with the same @a tiler to every component layout.
+template <class... Layouts, class Tiler>
+CUTE_HOST_DEVICE constexpr auto
+zipped_divide(ZipLayout<Layouts...> const& layouts, Tiler const& tiler)
+{
+  auto divide_one = [&](auto component){ return zipped_divide(component, tiler); };
+  return ZipLayout(cute::transform(layouts.layouts_, divide_one));
+}
+
+// Return <SlicedZipLayout, ZipOffsets> by calling slice_and_offset on all component layouts.
+template <class Coord, class... Layouts>
+CUTE_HOST_DEVICE constexpr
+auto
+slice_and_offset(Coord const& c, ZipLayout<Layouts...> const& layouts)
+{
+  auto result = cute::zip(cute::transform(layouts.layouts_, [&c](auto const& layout) { return slice_and_offset(c, layout); }));
+  return cute::make_tuple(ZipLayout(get<0>(result)), get<1>(result));
+}
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/underscore.hpp b/lightllm-kernel/cutlass/include/cute/underscore.hpp
new file mode 100755
index 000000000..e9d80fe5b
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/underscore.hpp
@@ -0,0 +1,194 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>                     // CUTE_INLINE_CONSTANT, CUTE_HOST_DEVICE
+#include <cute/container/tuple.hpp>            // cute::is_tuple
+#include <cute/numeric/integral_constant.hpp>  // cute::false_type, cute::true_type
+
+namespace cute
+{
+
+// Placeholder type for slicing (slice keeps the elements paired with it); derives from Int<0>
+struct Underscore : Int<0> {};
+
+CUTE_INLINE_CONSTANT Underscore _;
+
+// Convenient alias, e.g. for projections such as Step<_1, X, _1>
+using X = Underscore;
+
+// Treat Underscore as an integral like integral_constant
+template <>
+struct is_integral<Underscore> : true_type {};
+
+template <class T>
+struct is_underscore : false_type {};
+template <>
+struct is_underscore<Underscore> : true_type {};
+
+// Tuple trait: true if Tuple is Elem itself or, recursively, ANY element of the tuple Tuple is Elem
+template <class Tuple, class Elem, class Enable = void>
+struct has_elem : false_type {};
+template <class Elem>
+struct has_elem<Elem, Elem> : true_type {};
+template <class Tuple, class Elem>
+struct has_elem<Tuple, Elem, enable_if_t<is_tuple<Tuple>::value> >
+    : has_elem<Tuple, Elem, tuple_seq<Tuple> > {};
+template <class Tuple, class Elem, int... Is>
+struct has_elem<Tuple, Elem, seq<Is...>>
+    : disjunction<has_elem<tuple_element_t<Is, Tuple>, Elem>...> {};
+
+// Tuple trait: true if Tuple is Elem itself or, recursively, EVERY element of the tuple Tuple is Elem
+template <class Tuple, class Elem, class Enable = void>
+struct all_elem : false_type {};
+template <class Elem>
+struct all_elem<Elem, Elem> : true_type {};
+template <class Tuple, class Elem>
+struct all_elem<Tuple, Elem, enable_if_t<is_tuple<Tuple>::value> >
+    : all_elem<Tuple, Elem, tuple_seq<Tuple> > {};
+template <class Tuple, class Elem, int... Is>
+struct all_elem<Tuple, Elem, seq<Is...>>
+    : conjunction<all_elem<tuple_element_t<Is, Tuple>, Elem>...> {};
+
+// Convenience aliases of has_elem/all_elem for the elements Underscore, Int<1> and Int<0>
+template <class Tuple>
+using has_underscore = has_elem<Tuple, Underscore>;
+
+template <class Tuple>
+using all_underscore = all_elem<Tuple, Underscore>;
+
+template <class Tuple>
+using has_int1 = has_elem<Tuple, Int<1>>;
+
+template <class Tuple>
+using has_int0 = has_elem<Tuple, Int<0>>;
+
+// Slice keeps only the elements of Tuple B that are paired with an Underscore in A.
+// detail::lift_slice returns tuple<B>{b} for a leaf paired with an Underscore and an empty tuple
+// for any other leaf -- presumably so that filter_tuple can concatenate the results (TODO confirm).
+
+namespace detail {
+
+template <class A, class B>
+CUTE_HOST_DEVICE constexpr
+auto
+lift_slice(A const& a, B const& b)
+{
+  if constexpr (is_tuple<A>::value) {
+    static_assert(tuple_size<A>::value == tuple_size<B>::value, "Mismatched Ranks");
+    return filter_tuple(a, b, [](auto const& x, auto const& y) { return lift_slice(x,y); });
+  } else if constexpr (is_underscore<A>::value) {
+    return cute::tuple<B>{b};
+  } else {
+    return cute::tuple<>{};
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+} // end namespace detail
+
+// Entry point overrides the lifting so that slice(_,b) == b (not tuple<B>); any other non-tuple a yields an empty tuple
+template <class A, class B>
+CUTE_HOST_DEVICE constexpr
+auto
+slice(A const& a, B const& b)
+{
+  if constexpr (is_tuple<A>::value) {
+    static_assert(tuple_size<A>::value == tuple_size<B>::value, "Mismatched Ranks");
+    return filter_tuple(a, b, [](auto const& x, auto const& y) { return detail::lift_slice(x,y); });
+  } else if constexpr (is_underscore<A>::value) {
+    return b;
+  } else {
+    return cute::tuple<>{};
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+// Dice keeps only the elements of Tuple B that are paired with a non-Underscore leaf (e.g. an Int) in A.
+// detail::lift_dice returns an empty tuple for a leaf paired with an Underscore and tuple<B>{b}
+// for any other leaf -- presumably so that filter_tuple can concatenate the results (TODO confirm).
+
+namespace detail {
+
+template <class A, class B>
+CUTE_HOST_DEVICE constexpr
+auto
+lift_dice(A const& a, B const& b)
+{
+  if constexpr (is_tuple<A>::value) {
+    static_assert(tuple_size<A>::value == tuple_size<B>::value, "Mismatched Ranks");
+    return filter_tuple(a, b, [](auto const& x, auto const& y) { return lift_dice(x,y); });
+  } else if constexpr (is_underscore<A>::value) {
+    return cute::tuple<>{};
+  } else {
+    return cute::tuple<B>{b};
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+} // end namespace detail
+
+// Entry point overrides the lifting so that dice(1,b) == b (not tuple<B>); dice(_,b) yields an empty tuple
+template <class A, class B>
+CUTE_HOST_DEVICE constexpr
+auto
+dice(A const& a, B const& b)
+{
+  if constexpr (is_tuple<A>::value) {
+    static_assert(tuple_size<A>::value == tuple_size<B>::value, "Mismatched Ranks");
+    return filter_tuple(a, b, [](auto const& x, auto const& y) { return detail::lift_dice(x,y); });
+  } else if constexpr (is_underscore<A>::value) {
+    return cute::tuple<>{};
+  } else {
+    return b;
+  }
+
+  CUTE_GCC_UNREACHABLE;
+}
+
+//
+// Display utilities: an Underscore is displayed as "_"
+//
+
+CUTE_HOST_DEVICE void print(Underscore const&) {
+  printf("_");
+}
+
+#if !defined(__CUDACC_RTC__)
+CUTE_HOST std::ostream& operator<<(std::ostream& os, Underscore const&) {
+  return os << "_";
+}
+#endif // !defined(__CUDACC_RTC__)
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/util/debug.hpp b/lightllm-kernel/cutlass/include/cute/util/debug.hpp
new file mode 100755
index 000000000..86da7cae9
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/util/debug.hpp
@@ -0,0 +1,164 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+/**
+ * \file
+ * \brief Debugging and logging functionality
+ */
+
+#include <cuda_runtime_api.h>
+
+#include <cute/config.hpp>
+
+namespace cute
+{
+
+/******************************************************************************
+ * Debug and logging macros
+ ******************************************************************************/
+
+/**
+ * Formats and prints the given message to stdout (device builds prefix the block and thread indices).
+ * Both variants expand to one expression with no trailing ';', so the caller supplies the semicolon.
+ */
+#if !defined(CUTE_LOG)
+#  if !defined(__CUDA_ARCH__)
+#    define CUTE_LOG(format, ...) printf(format, __VA_ARGS__)
+#  else
+#    define CUTE_LOG(format, ...)                                \
+        printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, \
+               blockIdx.x,  blockIdx.y,  blockIdx.z,             \
+               threadIdx.x, threadIdx.y, threadIdx.z, __VA_ARGS__)
+#  endif
+#endif
+
+/**
+ * Formats and prints the given message to stdout only if DEBUG is defined
+ */
+#if !defined(CUTE_LOG_DEBUG)
+#  ifdef DEBUG
+#    define CUTE_LOG_DEBUG(format, ...) CUTE_LOG(format, __VA_ARGS__)
+#  else
+#    define CUTE_LOG_DEBUG(format, ...)
+#  endif
+#endif
+
+/** \brief Checks the result of a CUDA runtime call and terminates the process on failure.
+ * If `e` is not cudaSuccess: prints <file:line>, the expression text, the error name and the error string to stderr, then calls exit(1).
+ */
+#if !defined(CUTE_ERROR_EXIT)
+#  define CUTE_ERROR_EXIT(e)                                         \
+      do {                                                           \
+        cudaError_t code = (e);                                      \
+        if (code != cudaSuccess) {                                   \
+          fprintf(stderr, "<%s:%d> %s:\n    %s: %s\n",               \
+                  __FILE__, __LINE__, #e,                            \
+                  cudaGetErrorName(code), cudaGetErrorString(code)); \
+          fflush(stderr);                                            \
+          exit(1);                                                   \
+        }                                                            \
+      } while (0)
+#endif
+
+#if !defined(CUTE_CHECK_LAST)
+#  define CUTE_CHECK_LAST() CUTE_ERROR_EXIT(cudaPeekAtLastError()); CUTE_ERROR_EXIT(cudaDeviceSynchronize())
+#endif
+
+#if !defined(CUTE_CHECK_ERROR)
+#  define CUTE_CHECK_ERROR(e) CUTE_ERROR_EXIT(e)
+#endif
+
+// A dummy function that uses compilation failure to print a type
+template <class... T>
+CUTE_HOST_DEVICE void
+print_type() {
+  static_assert(sizeof...(T) < 0, "Printing type T.");
+}
+
+template <class... T>
+CUTE_HOST_DEVICE void
+print_type(T&&...) {
+  static_assert(sizeof...(T) < 0, "Printing type T.");
+}
+
+//
+// Device-specific helpers
+//
+// e.g.
+// if (thread0()) print(...);
+// if (block0()) print(...);
+// if (thread(42)) print(...);
+
+CUTE_HOST_DEVICE
+bool
+block([[maybe_unused]] int bid)
+{
+#if defined(__CUDA_ARCH__)  // device pass: true when the linearized block index (x fastest, then y, then z) equals bid
+  return blockIdx.x + blockIdx.y*gridDim.x + blockIdx.z*gridDim.x*gridDim.y == bid;
+#else                       // host pass: bid is ignored and the result is always true
+  return true;
+#endif
+}
+
+CUTE_HOST_DEVICE
+bool
+thread([[maybe_unused]] int tid, [[maybe_unused]] int bid)
+{
+#if defined(__CUDA_ARCH__)  // device pass: true when the linearized thread index (x fastest, then y, then z) equals tid and block(bid) holds
+  return (threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.x*blockDim.y == tid) && block(bid);
+#else                       // host pass: both arguments are ignored and the result is always true
+  return true;
+#endif
+}
+
+CUTE_HOST_DEVICE
+bool
+thread(int tid)
+{
+  return thread(tid,0);  // single-argument form: thread(tid, bid) with bid fixed to 0
+}
+
+CUTE_HOST_DEVICE
+bool
+thread0()
+{
+  return thread(0,0);  // shorthand for thread(tid, bid) with tid = 0 and bid = 0
+}
+
+CUTE_HOST_DEVICE
+bool
+block0()
+{
+  return block(0);  // shorthand for block(bid) with bid = 0
+}
+
+}  // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/util/print.hpp b/lightllm-kernel/cutlass/include/cute/util/print.hpp
new file mode 100755
index 000000000..dbd658169
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/util/print.hpp
@@ -0,0 +1,261 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/config.hpp>           // CUTE_HOST_DEVICE
+#include <cute/util/type_traits.hpp> // cute::is_valid
+#include <cute/numeric/numeric_types.hpp> 
+
+//
+// CUDA compatible print and printf
+//
+
+namespace cute
+{
+
+// Returns the count of decimal digits of x, from 1 up to 10.
+// Every x below 10, negative values included, yields 1.
+CUTE_HOST_DEVICE int
+num_digits(int x)
+{
+  if (x < 10)        { return 1; }
+  if (x < 100)       { return 2; }
+  if (x < 1000)      { return 3; }
+  if (x < 10000)     { return 4; }
+  if (x < 100000)    { return 5; }
+  if (x < 1000000)   { return 6; }
+  if (x < 10000000)  { return 7; }
+  if (x < 100000000) { return 8; }
+  return (x < 1000000000) ? 9 : 10;
+}
+
+//
+// print dispatcher
+//
+
+CUTE_HOST_DEVICE
+void
+print(char c) {
+  printf("%c", c);
+}
+
+CUTE_HOST_DEVICE
+void
+print(signed char a) {
+  printf("%d", static_cast<int>(a));
+}
+
+CUTE_HOST_DEVICE
+void
+print(unsigned char a) {
+  printf("%u", static_cast<unsigned int>(a));
+}
+
+CUTE_HOST_DEVICE
+void
+print(short a) {
+  printf("%hd", a);
+}
+
+CUTE_HOST_DEVICE
+void
+print(unsigned short a) {
+  printf("%hu", a);
+}
+
+CUTE_HOST_DEVICE
+void
+print(int a) {
+  printf("%d", a);
+}
+
+CUTE_HOST_DEVICE
+void
+print(uint1b_t a) {
+  printf("%d", int(a));
+}
+
+CUTE_HOST_DEVICE
+void
+print(int2b_t a) {
+  printf("%d", int(a));
+}
+
+CUTE_HOST_DEVICE
+void
+print(uint2b_t a) {
+  printf("%d", int(a));
+}
+
+CUTE_HOST_DEVICE
+void
+print(int4b_t a) {
+  printf("%d", int(a));
+}
+
+CUTE_HOST_DEVICE
+void
+print(uint4b_t a) {
+  printf("%d", int(a));
+}
+
+CUTE_HOST_DEVICE
+void
+print(bin1_t a) {
+  printf("%d", int(a));
+}
+
+CUTE_HOST_DEVICE
+void
+print(unsigned int a) {
+  printf("%u", a);
+}
+
+CUTE_HOST_DEVICE
+void
+print(long a) {
+  printf("%ld", a);
+}
+
+CUTE_HOST_DEVICE
+void
+print(unsigned long a) {
+  printf("%lu", a);
+}
+
+CUTE_HOST_DEVICE
+void
+print(long long a) {
+  printf("%lld", a);
+}
+
+CUTE_HOST_DEVICE
+void
+print(unsigned long long a) {
+  printf("%llu", a);
+}
+
+CUTE_HOST_DEVICE
+void
+print(float a) {
+  printf("%f", a);
+}
+
+CUTE_HOST_DEVICE
+void
+print(double a) {
+  printf("%f", a);
+}
+
+template <class... T>
+CUTE_HOST_DEVICE
+void
+print(char const* format, T const&... t) {
+  printf(format, t...);
+}
+
+CUTE_HOST_DEVICE
+void
+print(char const* format) {
+  printf("%s", format);
+}
+
+//
+// pretty printing
+//
+
+CUTE_HOST_DEVICE void
+pretty_print(uint1b_t a) {
+  printf("%*d", 3, int(a));
+}
+
+CUTE_HOST_DEVICE void
+pretty_print(int2b_t a) {
+  printf("%*d", 5, int(a));
+}
+
+CUTE_HOST_DEVICE void
+pretty_print(uint2b_t a) {
+  printf("%*d", 5, int(a));
+}
+
+CUTE_HOST_DEVICE void
+pretty_print(int4b_t a) {
+  printf("%*d", 5, int(a));
+}
+
+CUTE_HOST_DEVICE void
+pretty_print(uint4b_t a) {
+  printf("%*d", 5, int(a));
+}
+
+CUTE_HOST_DEVICE void
+pretty_print(bool v) {
+  printf("%*d", 3, int(v));
+}
+
+CUTE_HOST_DEVICE void
+pretty_print(int32_t v) {
+  printf("%*d", 5, v);
+}
+
+CUTE_HOST_DEVICE void  // prints v right-aligned in a field of width 5
+pretty_print(uint32_t v) {
+  printf("%*u", 5, v);  // %u matches the unsigned argument; %d would show values above INT_MAX as negative
+}
+
+CUTE_HOST_DEVICE void
+pretty_print(int64_t v) {
+  printf("%*lld", 5, static_cast<long long>(v));
+}
+
+CUTE_HOST_DEVICE void
+pretty_print(uint64_t v) {
+  printf("%*llu", 5, static_cast<unsigned long long>(v));
+}
+
+CUTE_HOST_DEVICE void
+pretty_print(float v) {
+  printf("%*.2e", 10, v);
+}
+
+CUTE_HOST_DEVICE void
+pretty_print(double v) {
+  printf("%*.3e", 11, v);
+}
+
+template <class T>
+CUTE_HOST_DEVICE void
+pretty_print(T t) {
+  printf("  "); print(t);
+}
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/util/type_traits.hpp b/lightllm-kernel/cutlass/include/cute/util/type_traits.hpp
new file mode 100755
index 000000000..e663b569c
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cute/util/type_traits.hpp
@@ -0,0 +1,292 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/type_traits>
+#include <cuda/std/utility>
+#include <cuda/std/cstddef>
+#include <cuda/std/cstdint>
+#include <cuda/std/limits>
+#else
+#include <type_traits>
+#include <utility>      // tuple_size, tuple_element
+#include <cstddef>      // ptrdiff_t
+#include <cstdint>      // uintptr_t
+#include <limits>       // numeric_limits
+#endif
+
+#include <cute/config.hpp> // CUTE_STL_NAMESPACE
+
+namespace cute
+{
+  using CUTE_STL_NAMESPACE::enable_if;
+  using CUTE_STL_NAMESPACE::enable_if_t;
+}
+
+#define __CUTE_REQUIRES(...)   typename cute::enable_if<(__VA_ARGS__)>::type* = nullptr
+#define __CUTE_REQUIRES_V(...) typename cute::enable_if<decltype((__VA_ARGS__))::value>::type* = nullptr
+
+namespace cute
+{
+
+// <type_traits>
+using CUTE_STL_NAMESPACE::conjunction;
+using CUTE_STL_NAMESPACE::conjunction_v;
+
+using CUTE_STL_NAMESPACE::disjunction;
+using CUTE_STL_NAMESPACE::disjunction_v;
+
+using CUTE_STL_NAMESPACE::negation;
+using CUTE_STL_NAMESPACE::negation_v;
+
+using CUTE_STL_NAMESPACE::void_t;
+using CUTE_STL_NAMESPACE::is_void_v;
+
+using CUTE_STL_NAMESPACE::is_base_of;
+using CUTE_STL_NAMESPACE::is_base_of_v;
+
+using CUTE_STL_NAMESPACE::is_const;
+using CUTE_STL_NAMESPACE::is_const_v;
+using CUTE_STL_NAMESPACE::is_volatile;
+using CUTE_STL_NAMESPACE::is_volatile_v;
+
+// Defined in cute/numeric/integral_constant.hpp
+// using CUTE_STL_NAMESPACE::true_type;
+// using CUTE_STL_NAMESPACE::false_type;
+
+using CUTE_STL_NAMESPACE::conditional;
+using CUTE_STL_NAMESPACE::conditional_t;
+
+using CUTE_STL_NAMESPACE::add_const_t;
+
+using CUTE_STL_NAMESPACE::remove_const_t;
+using CUTE_STL_NAMESPACE::remove_cv_t;
+using CUTE_STL_NAMESPACE::remove_reference_t;
+
+using CUTE_STL_NAMESPACE::extent;
+using CUTE_STL_NAMESPACE::remove_extent;
+
+using CUTE_STL_NAMESPACE::decay;
+using CUTE_STL_NAMESPACE::decay_t;
+
+using CUTE_STL_NAMESPACE::is_lvalue_reference;
+using CUTE_STL_NAMESPACE::is_lvalue_reference_v;
+
+using CUTE_STL_NAMESPACE::is_reference;
+using CUTE_STL_NAMESPACE::is_trivially_copyable;
+
+using CUTE_STL_NAMESPACE::is_convertible;
+using CUTE_STL_NAMESPACE::is_convertible_v;
+
+using CUTE_STL_NAMESPACE::is_same;
+using CUTE_STL_NAMESPACE::is_same_v;
+
+using CUTE_STL_NAMESPACE::is_constructible;
+using CUTE_STL_NAMESPACE::is_constructible_v;
+using CUTE_STL_NAMESPACE::is_default_constructible;
+using CUTE_STL_NAMESPACE::is_default_constructible_v;
+using CUTE_STL_NAMESPACE::is_standard_layout;
+using CUTE_STL_NAMESPACE::is_standard_layout_v;
+
+using CUTE_STL_NAMESPACE::is_arithmetic;
+using CUTE_STL_NAMESPACE::is_unsigned;
+using CUTE_STL_NAMESPACE::is_unsigned_v;
+using CUTE_STL_NAMESPACE::is_signed;
+using CUTE_STL_NAMESPACE::is_signed_v;
+
+using CUTE_STL_NAMESPACE::make_signed;
+using CUTE_STL_NAMESPACE::make_signed_t;
+
+// using CUTE_STL_NAMESPACE::is_integral;
+template <class T>
+using is_std_integral = CUTE_STL_NAMESPACE::is_integral<T>;
+
+using CUTE_STL_NAMESPACE::is_empty;
+using CUTE_STL_NAMESPACE::is_empty_v;
+
+using CUTE_STL_NAMESPACE::invoke_result_t;
+
+using CUTE_STL_NAMESPACE::common_type;
+using CUTE_STL_NAMESPACE::common_type_t;
+
+using CUTE_STL_NAMESPACE::remove_pointer;
+using CUTE_STL_NAMESPACE::remove_pointer_t;
+
+using CUTE_STL_NAMESPACE::alignment_of;
+using CUTE_STL_NAMESPACE::alignment_of_v;
+
+// <utility>
+using CUTE_STL_NAMESPACE::declval;
+
+template <class T>
+constexpr T&& forward(remove_reference_t<T>& t) noexcept
+{
+  return static_cast<T&&>(t);
+}
+
+template <class T>
+constexpr T&& forward(remove_reference_t<T>&& t) noexcept
+{
+  static_assert(! is_lvalue_reference_v<T>, "T cannot be an lvalue reference (e.g., U&).");
+  return static_cast<T&&>(t);
+}
+
+template <class T>
+constexpr remove_reference_t<T>&& move(T&& t) noexcept
+{
+  return static_cast<remove_reference_t<T>&&>(t);
+}
+
+// <limits>
+using CUTE_STL_NAMESPACE::numeric_limits;
+
+// <cstddef>
+using CUTE_STL_NAMESPACE::ptrdiff_t;
+
+// <cstdint>
+using CUTE_STL_NAMESPACE::uintptr_t;
+
+// C++20
+// using std::remove_cvref;
+template <class T>
+struct remove_cvref {
+  using type = remove_cv_t<remove_reference_t<T>>;
+};
+
+// C++20
+// using std::remove_cvref_t;
+template <class T>
+using remove_cvref_t = typename remove_cvref<T>::type;
+
+//
+// dependent_false
+//
+// @brief An always-false value that depends on one or more template parameters.
+// See
+// https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2019/p1830r1.pdf
+// https://github.com/cplusplus/papers/issues/572
+// https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/p2593r0.html
+template <class... Args>
+inline constexpr bool dependent_false = false;
+
+//
+// tuple_size, tuple_element
+//
+// @brief CuTe-local tuple-traits to prevent conflicts with other libraries.
+// For cute:: types, we specialize std::tuple-traits, which is explicitly allowed.
+//   cute::tuple, cute::array, cute::array_subbyte, etc
+// But CuTe wants to treat some external types as tuples as well. For those,
+// we specialize cute::tuple-traits to avoid polluting external traits.
+//   dim3, uint3, etc
+
+template <class T, class = void>
+struct tuple_size;
+
+template <class T>
+struct tuple_size<T,void_t<typename CUTE_STL_NAMESPACE::tuple_size<T>::type>> : CUTE_STL_NAMESPACE::integral_constant<size_t, CUTE_STL_NAMESPACE::tuple_size<T>::value> {};
+
+// i.e. whenever CUTE_STL_NAMESPACE::tuple_size<T>::type exists, cute::tuple_size<T> derives from integral_constant<size_t, CUTE_STL_NAMESPACE::tuple_size<T>::value>.
+
+template <class T>
+constexpr size_t tuple_size_v = tuple_size<T>::value;
+
+template <size_t I, class T, class = void>
+struct tuple_element;
+
+template <size_t I, class T>
+struct tuple_element<I,T,void_t<typename CUTE_STL_NAMESPACE::tuple_element<I,T>::type>> : CUTE_STL_NAMESPACE::tuple_element<I,T> {};
+
+template <size_t I, class T>
+using tuple_element_t = typename tuple_element<I,T>::type;
+
+//
+// is_valid
+//
+
+namespace detail {
+
+template <class F, class... Args, class = decltype(declval<F&&>()(declval<Args&&>()...))>
+CUTE_HOST_DEVICE constexpr auto
+is_valid_impl(int) { return CUTE_STL_NAMESPACE::true_type{}; }
+
+template <class F, class... Args>
+CUTE_HOST_DEVICE constexpr auto
+is_valid_impl(...) { return CUTE_STL_NAMESPACE::false_type{}; }
+
+template <class F>
+struct is_valid_fn {
+  template <class... Args>
+  CUTE_HOST_DEVICE constexpr auto
+  operator()(Args&&...) const { return is_valid_impl<F, Args&&...>(int{}); }
+};
+
+} // end namespace detail
+
+template <class F>
+CUTE_HOST_DEVICE constexpr auto
+is_valid(F&&) {
+  return detail::is_valid_fn<F&&>{};
+}
+
+template <class F, class... Args>
+CUTE_HOST_DEVICE constexpr auto
+is_valid(F&&, Args&&...) {
+  return detail::is_valid_impl<F&&, Args&&...>(int{});
+}
+
+template <bool B, template<class...> class True, template<class...> class False>
+struct conditional_template {
+  template <class... U>
+  using type = True<U...>;
+};
+
+template <template<class...> class True, template<class...> class False>
+struct conditional_template<false, True, False> {
+  template <class... U>
+  using type = False<U...>;
+};
+
+//
+// is_any_of
+//
+
+// Member `value` is true if and only if T is same as (is_same_v) at least one of the types in Us
+template <class T, class... Us>
+struct is_any_of {
+  constexpr static bool value = (... || CUTE_STL_NAMESPACE::is_same_v<T, Us>);
+};
+
+// Is true if and only if T is same as (is_same_v) at least one of the types in Us
+template <class T, class... Us>
+inline constexpr bool is_any_of_v = is_any_of<T, Us...>::value;
+
+} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cutlass/aligned_buffer.h b/lightllm-kernel/cutlass/include/cutlass/aligned_buffer.h
new file mode 100755
index 000000000..0d2bb2904
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/aligned_buffer.h
@@ -0,0 +1,129 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief AlignedBuffer is a container for trivially copyable elements suitable for use in
+      unions and shared memory.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+
+namespace cutlass {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Byte buffer sized for N elements of T and aligned to Align bytes; exposes typed and raw pointers to it.
+template <
+  typename T,
+  int N,
+  int Align = 16
+>
+struct AlignedBuffer {
+
+  /// Internal storage type
+  using Storage = uint8_t;
+
+  /// Number of logical elements held in buffer
+  static int const kCount = N;
+
+  /// Alignment requirement in bytes
+  static int const kAlign = Align;
+
+  /// Number of bytes of storage: N elements of sizeof_bits<T>::value bits each, rounded up to a whole byte
+  static int const kBytes =
+    (sizeof_bits<T>::value * N + 7) / 8;
+
+private:
+
+  /// Internal storage
+  alignas(Align) Storage storage[kBytes];
+
+public:
+
+  //
+  // C++ standard members
+  //
+
+  typedef T value_type;
+  typedef size_t size_type;
+  typedef ptrdiff_t difference_type;
+  typedef value_type *pointer;
+  typedef value_type const * const_pointer;
+
+  using Array = Array<T, N>;
+  using reference = typename Array::reference;
+  using const_reference = typename Array::const_reference;
+
+public:
+  /// Returns the start of the storage reinterpreted as a pointer to T
+  CUTLASS_HOST_DEVICE
+  pointer data() {
+    return reinterpret_cast<pointer>(storage);
+  }
+  /// Read-only overload of data()
+  CUTLASS_HOST_DEVICE
+  const_pointer data() const {
+    return reinterpret_cast<const_pointer>(storage);  // storage is const here; casting to non-const `pointer` would cast away constness
+  }
+  /// Returns the start of the storage as raw bytes
+  CUTLASS_HOST_DEVICE
+  Storage * raw_data() {
+    return storage;
+  }
+  /// Read-only overload of raw_data()
+  CUTLASS_HOST_DEVICE
+  Storage const * raw_data() const {
+    return storage;
+  }
+
+  /// True when kCount is zero
+  CUTLASS_HOST_DEVICE
+  constexpr bool empty() const {
+    return !kCount;
+  }
+  /// Number of logical elements (kCount)
+  CUTLASS_HOST_DEVICE
+  constexpr size_type size() const {
+    return kCount;
+  }
+  /// Same value as size(): the element count is fixed at compile time
+  CUTLASS_HOST_DEVICE
+  constexpr size_type max_size() const {
+    return kCount;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/arch.h b/lightllm-kernel/cutlass/include/cutlass/arch/arch.h
new file mode 100755
index 000000000..36d4676bd
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/arch/arch.h
@@ -0,0 +1,109 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines tags for architecture-specific configurations.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace arch {
+
+#if defined(__NVCC__) || defined(__CUDACC_RTC__) || (defined(__clang__) && defined(__CUDA__))
+
+/// Computes laneId within a warp
+CUTLASS_DEVICE
+int LaneId() {
+  int ret;
+  asm ("mov.u32 %0, %%laneid;" : "=r"(ret) : );  // copies the PTX %laneid special register into ret
+  return ret;
+}
+
+/// Computes SM number the thread is running on
+CUTLASS_DEVICE
+int SmId() {
+  int ret;
+  asm ("mov.u32 %0, %%smid;" : "=r"(ret) : );  // copies the PTX %smid special register into ret
+  return ret;
+}
+
+#endif
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+struct Sm50 {
+  static int const kMinComputeCapability = 50;
+}; 
+struct Sm60 {
+  static int const kMinComputeCapability = 60;
+}; 
+struct Sm61 {
+  static int const kMinComputeCapability = 61;
+};
+struct Sm70 {
+  static int const kMinComputeCapability = 70;
+};
+struct Sm72 {
+  static int const kMinComputeCapability = 72;
+};
+struct Sm75 {
+  static int const kMinComputeCapability = 75;
+};
+struct Sm80 {
+  static int const kMinComputeCapability = 80; 
+};
+struct Sm86 {
+  static int const kMinComputeCapability = 86;
+};
+struct Sm89 {
+  static int const kMinComputeCapability = 89;
+};
+struct Sm90 {
+  static int const kMinComputeCapability = 90; 
+};
+
+/// Triggers a breakpoint on the device
+CUTLASS_DEVICE
+void device_breakpoint() {
+#if defined(__CUDA_ARCH__)  // the instruction is emitted only in device compilation passes; otherwise the body is empty
+  asm volatile ("  brkpt;\n");
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace arch
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/barrier.h b/lightllm-kernel/cutlass/include/cutlass/arch/barrier.h
new file mode 100755
index 000000000..c96897324
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/arch/barrier.h
@@ -0,0 +1,630 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Barrier Operations on SM90+
+*/
+
+#pragma once
+
+#include <cutlass/arch/memory_sm75.h>
+#include <cute/arch/cluster_sm90.hpp>
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && (__CUDACC_VER_MAJOR__ >= 12)
+#define CUDA_BARRIER_ENABLED 1
+#else
+#define CUDA_BARRIER_ENABLED 0
+#endif
+
+namespace cutlass {
+/// @brief
+namespace arch {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Enumerates the reserved named barriers to avoid potential conflicts
+// This enum class specifies the NamedBarriers reserved by CUTLASS.
+enum class ReservedNamedBarriers { 
+  EpilogueBarrier = 1,
+  TransposeBarrier = 2,
+  TransformBarrier = 3,
+  StreamkBarrier0 = 4,
+  StreamkBarrier1 = 5
+  , FirstUserBarrier = StreamkBarrier1 + 1
+};
+
+
+class NamedBarrier {
+
+  // Data Members:
+
+  // Range = [1 , NUM_THREADS_PER_CTA]
+  // Range % warp-size (i.e 32) == 0
+  uint32_t const num_threads_;
+
+  // Range : [0, 15]
+  // Note: this holds the final (effective) barrier ID, i.e. any ReservedNamedBarrierCount offset is already included
+  uint32_t const id_;
+
+ public:
+
+  // Constructor for CUTLASS developers:
+  // effective barrier ID starts from 0
+  CUTLASS_DEVICE
+  NamedBarrier(uint32_t num_threads, ReservedNamedBarriers reserved_named_barriers)
+      : num_threads_(num_threads), id_(static_cast<uint32_t>(reserved_named_barriers)) {}
+
+  // Constructor for CUTLASS users:
+  // `id` is user-relative; the effective barrier ID is id + ReservedNamedBarrierCount and must stay in [0, 15]
+  CUTLASS_DEVICE
+  NamedBarrier(uint32_t num_threads, uint32_t id = 0)
+      : num_threads_(num_threads), id_(id + ReservedNamedBarrierCount) {
+    CUTLASS_ASSERT(id + ReservedNamedBarrierCount < HardwareMaxNumNamedBarriers && "Effective barrier_id must be less than 16.");
+  }
+
+  CUTLASS_DEVICE
+  void arrive_and_wait() const {
+    // Note: The value of id_ is already the final barrier id (set correctly in the constructor).
+    NamedBarrier::arrive_and_wait_internal(num_threads_, id_);
+  }
+
+  CUTLASS_DEVICE
+  void arrive_and_wait_unaligned() const {
+    // Note: The value of id_ is already the final barrier id (set correctly in the constructor).
+    NamedBarrier::arrive_and_wait_internal_unaligned(num_threads_, id_);
+  }
+
+  CUTLASS_DEVICE
+  void arrive() const {
+    // Note: The value of id_ is already the final barrier id (set correctly in the constructor).
+    NamedBarrier::arrive_internal(num_threads_, id_);
+  }
+
+  CUTLASS_DEVICE
+  void arrive_unaligned() const {
+    // Note: The value of id_ is already the final barrier id (set correctly in the constructor).
+    NamedBarrier::arrive_internal_unaligned(num_threads_, id_);
+  }
+
+  CUTLASS_DEVICE
+  void sync() const {
+    NamedBarrier::arrive_and_wait();
+  }
+
+  //  Static variants
+
+  // Calling interface for CUTLASS users: 
+  // effective barrier ID starts from ReservedNamedBarrierCount
+  CUTLASS_DEVICE
+  static void arrive_and_wait(uint32_t num_threads, uint32_t barrier_id) {
+    arrive_and_wait_internal(num_threads, barrier_id + ReservedNamedBarrierCount);
+  }
+
+  // Calling interface for CUTLASS developers: 
+  // effective barrier ID starts from 0
+  CUTLASS_DEVICE
+  static void arrive_and_wait(uint32_t num_threads, ReservedNamedBarriers reserved_named_barriers) {
+    arrive_and_wait_internal(num_threads, static_cast<int>(reserved_named_barriers));
+  }
+
+  // Calling interface for CUTLASS users: 
+  // effective barrier ID starts from ReservedNamedBarrierCount
+  CUTLASS_DEVICE
+  static void arrive(uint32_t num_threads, uint32_t barrier_id) {
+    arrive_internal(num_threads, barrier_id + ReservedNamedBarrierCount);
+  }
+
+  // Calling interface for CUTLASS developers: 
+  // effective barrier ID starts from 0
+  CUTLASS_DEVICE
+  static void arrive(uint32_t num_threads, ReservedNamedBarriers reserved_named_barriers) {
+    arrive_internal(num_threads, static_cast<int>(reserved_named_barriers));
+  }
+
+  // Calling interface for CUTLASS users: 
+  // effective barrier ID starts from ReservedNamedBarrierCount
+  CUTLASS_DEVICE
+  static void sync(uint32_t num_threads, uint32_t barrier_id) {
+    sync_internal(num_threads, barrier_id + ReservedNamedBarrierCount);
+  }
+
+  // Calling interface for CUTLASS developers: 
+  // effective barrier ID starts from 0
+  CUTLASS_DEVICE
+  static void sync(uint32_t num_threads, ReservedNamedBarriers reserved_named_barriers) {
+    sync_internal(num_threads, static_cast<int>(reserved_named_barriers));
+  }
+
+
+ private:
+  CUTLASS_DEVICE
+  static void arrive_and_wait_internal(uint32_t num_threads, uint32_t barrier_id) {
+#if CUDA_BARRIER_ENABLED
+    asm volatile("bar.sync %0, %1;" : : "r"(barrier_id), "r"(num_threads));
+    cutlass::arch::synclog_emit_named_barrier_arrive_and_wait(__LINE__, num_threads, barrier_id);
+#elif defined(__CUDA_ARCH__)
+    asm volatile ("brkpt;\n" ::);
+#endif
+  }
+
+  CUTLASS_DEVICE
+  static void arrive_and_wait_internal_unaligned(uint32_t num_threads, uint32_t barrier_id) {
+#if CUDA_BARRIER_ENABLED
+    asm volatile("barrier.sync %0, %1;" : : "r"(barrier_id), "r"(num_threads));
+    cutlass::arch::synclog_emit_named_barrier_arrive_and_wait(__LINE__, num_threads, barrier_id);
+#elif defined(__CUDA_ARCH__)
+    asm volatile ("brkpt;\n" ::);
+#endif
+  }
+
+  CUTLASS_DEVICE
+  static void arrive_internal(uint32_t num_threads, uint32_t barrier_id) {
+#if CUDA_BARRIER_ENABLED
+    cutlass::arch::synclog_emit_named_barrier_arrive(__LINE__, num_threads, barrier_id);
+    asm volatile("bar.arrive %0, %1;" : : "r"(barrier_id), "r"(num_threads));
+#elif defined(__CUDA_ARCH__)
+    asm volatile ("brkpt;\n" ::);
+#endif
+  }
+
+  CUTLASS_DEVICE
+  static void arrive_internal_unaligned(uint32_t num_threads, uint32_t barrier_id) {
+#if CUDA_BARRIER_ENABLED
+    cutlass::arch::synclog_emit_named_barrier_arrive(__LINE__, num_threads, barrier_id);
+    asm volatile("barrier.arrive %0, %1;" : : "r"(barrier_id), "r"(num_threads));
+#elif defined(__CUDA_ARCH__)
+    asm volatile ("brkpt;\n" ::);
+#endif
+  }
+
+  CUTLASS_DEVICE
+  static void sync_internal(uint32_t num_threads, uint32_t barrier_id) {
+    NamedBarrier::arrive_and_wait_internal(num_threads, barrier_id);
+  }
+
+ public:
+  // Barrier IDs below ReservedNamedBarriers::FirstUserBarrier are reserved for CUTLASS' own use cases,
+  // while leaving the remaining ones for general users.
+  static const uint32_t ReservedNamedBarrierCount = static_cast<uint32_t>(ReservedNamedBarriers::FirstUserBarrier);
+  static const uint32_t HardwareMaxNumNamedBarriers = 16;
+
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Hopper introduces a new cluster-wide barrier which handles cluster-wide arrive-wait behaviour.
+// This is an extension to the Ampere arrive-wait barriers
+// Note : Ampere arrive-wait Barriers have a larger max-arrive count (2^30) than Hopper arrive-wait Barriers (2^20).
+struct ClusterBarrier {
+
+  using ValueType = uint64_t;
+
+protected:
+  // Can never be initialized - can only be aliased to smem
+  ValueType barrier_;
+
+public:
+
+  CUTLASS_DEVICE
+  ClusterBarrier() = delete;
+
+  CUTLASS_DEVICE
+  void init(uint32_t arrive_count) const {
+    ClusterBarrier::init(&this->barrier_, arrive_count);
+  }
+
+  CUTLASS_DEVICE
+  bool test_wait(uint32_t phase, uint32_t pred=true) const {
+    return ClusterBarrier::test_wait(&this->barrier_, phase, pred);
+  }
+
+  CUTLASS_DEVICE
+  bool try_wait(uint32_t phase) const {
+    return ClusterBarrier::try_wait(&this->barrier_, phase);
+  }
+
+  CUTLASS_DEVICE
+  void wait(uint32_t phase) const {
+    ClusterBarrier::wait(&this->barrier_, phase);
+  }
+
+  // Barrier arrive on local smem
+  CUTLASS_DEVICE
+  void arrive() const {
+    ClusterBarrier::arrive(&this->barrier_);
+  }
+
+  // Remote SMEM arrive with a predicate (usually done to pick the thread doing the arrive)
+  CUTLASS_DEVICE
+  void arrive(uint32_t cta_id, uint32_t pred = true ) const {
+    ClusterBarrier::arrive(&this->barrier_, cta_id, pred);
+  }
+
+  //
+  //  Static Versions
+  //
+  CUTLASS_DEVICE
+  static void init(ValueType const* smem_ptr, uint32_t arrive_count) {
+#if CUDA_BARRIER_ENABLED
+    uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr);
+    asm volatile(
+        "{\n\t"
+        "mbarrier.init.shared::cta.b64 [%1], %0; \n"
+        "}"
+        :
+        : "r"(arrive_count), "r"(smem_addr));
+    cutlass::arch::synclog_emit_cluster_barrier_init(__LINE__, smem_addr, arrive_count);
+#elif defined(__CUDA_ARCH__)
+    asm volatile ("brkpt;\n" ::);
+#endif
+  }
+
+  // Static version of wait - in case we don't want to burn a register
+  CUTLASS_DEVICE
+  static void wait(ValueType const* smem_ptr, uint32_t phase) {
+#if CUDA_BARRIER_ENABLED
+    uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr);
+    cutlass::arch::synclog_emit_cluster_barrier_wait(__LINE__, smem_addr, phase);
+    // Arbitrarily large timer value after which try-wait expires and re-tries.
+    uint32_t ticks = 0x989680;
+    asm volatile(
+        "{\n\t"
+        ".reg .pred       P1; \n\t"
+        "LAB_WAIT: \n\t"
+        "mbarrier.try_wait.parity.shared::cta.b64 P1, [%0], %1, %2; \n\t"
+        "@P1 bra DONE; \n\t"
+        "bra     LAB_WAIT; \n\t"
+        "DONE: \n\t"
+        "}"
+        :
+        : "r"(smem_addr), "r"(phase), "r"(ticks));
+
+#elif defined(__CUDA_ARCH__)
+    asm volatile ("brkpt;\n" ::);
+#endif
+  }
+
+  CUTLASS_DEVICE
+  static bool test_wait(ValueType const* smem_ptr, uint32_t phase, uint32_t pred) {
+#if CUDA_BARRIER_ENABLED
+    uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr);
+    cutlass::arch::synclog_emit_cluster_barrier_test_wait(__LINE__, smem_addr, phase, pred);
+    uint32_t waitComplete;
+
+    asm volatile(
+        "{\n\t"
+        ".reg .pred P1; \n\t"
+        ".reg .pred P2; \n\t"
+        "setp.eq.u32 P2, %3, 1;\n\t"
+        "@P2 mbarrier.test_wait.parity.shared::cta.b64 P1, [%1], %2; \n\t"
+        "selp.b32 %0, 1, 0, P1; \n\t"
+        "}"
+        : "=r"(waitComplete)
+        : "r"(smem_addr), "r"(phase), "r"(pred));
+
+    return static_cast<bool>(waitComplete);
+#elif defined(__CUDA_ARCH__)
+    asm volatile ("brkpt;\n" ::);
+#endif
+    return 0;
+  }
+
+  CUTLASS_DEVICE
+  static bool try_wait(ValueType const* smem_ptr, uint32_t phase) {
+#if CUDA_BARRIER_ENABLED
+    uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr);
+    cutlass::arch::synclog_emit_cluster_barrier_try_wait(__LINE__, smem_addr, phase);
+    uint32_t waitComplete;
+
+    asm volatile(
+        "{\n\t"
+        ".reg .pred P1; \n\t"
+        "mbarrier.try_wait.parity.shared::cta.b64 P1, [%1], %2; \n\t"
+        "selp.b32 %0, 1, 0, P1; \n\t"
+        "}"
+        : "=r"(waitComplete)
+        : "r"(smem_addr), "r"(phase));
+
+    return static_cast<bool>(waitComplete);
+#elif defined(__CUDA_ARCH__)
+    asm volatile ("brkpt;\n" ::);
+#endif
+    return 0;
+  }
+
+  // Static Predicated version of the above - in case we know the address.
+  CUTLASS_DEVICE
+  static void arrive(ValueType const* smem_ptr, uint32_t cta_id, uint32_t pred) {
+#if CUDA_BARRIER_ENABLED
+    uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr);
+    if (pred) {
+      asm volatile(
+          "{\n\t"
+          ".reg .b32 remAddr32;\n\t"
+          "mapa.shared::cluster.u32  remAddr32, %0, %1;\n\t"
+          "mbarrier.arrive.shared::cluster.b64  _, [remAddr32];\n\t"
+          "}"
+          :
+          : "r"(smem_addr), "r"(cta_id));
+    }
+
+    cutlass::arch::synclog_emit_cluster_barrier_arrive_cluster(__LINE__, smem_addr, cta_id, pred);
+#elif defined(__CUDA_ARCH__)
+    asm volatile ("brkpt;\n" ::);
+#endif
+  }
+
+  // Barrier arrive on local smem
+  CUTLASS_DEVICE
+  static void arrive(ValueType const* smem_ptr) {
+#if CUDA_BARRIER_ENABLED
+    uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr);
+    asm volatile(
+        "{\n\t"
+        "mbarrier.arrive.shared::cta.b64 _, [%0];\n\t"
+        "}"
+        :
+        : "r"(smem_addr));
+    cutlass::arch::synclog_emit_cluster_barrier_arrive(__LINE__, smem_addr);
+#elif defined(__CUDA_ARCH__)
+    asm volatile ("brkpt;\n" ::);
+#endif
+  }
+
+  CUTLASS_DEVICE
+  static void invalidate(ValueType const* smem_ptr) {
+#if CUDA_BARRIER_ENABLED
+    uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr);
+    asm volatile(
+        "{\n\t"
+        "mbarrier.inval.shared::cta.b64 [%0]; \n\t"
+        "}"
+        :
+        : "r"(smem_addr));
+#elif defined(__CUDA_ARCH__)
+    asm volatile ("brkpt;\n" ::);
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SM90 also introduces a new type of cluster-barrier which supports sync.
+// not just based on Arrive Count, but also transaction count (in bytes)
+struct ClusterTransactionBarrier : public ClusterBarrier {
+
+  CUTLASS_DEVICE
+  ClusterTransactionBarrier() = delete;
+
+  // Performs an arrive operation + expected transaction bytes increment
+  CUTLASS_DEVICE
+  void arrive_and_expect_tx(uint32_t transaction_bytes) const {
+    ClusterTransactionBarrier::arrive_and_expect_tx(&this->barrier_, transaction_bytes);
+  }
+
+  // Performs an arrive operation + expected transaction bytes increment
+  CUTLASS_DEVICE
+  void arrive_and_expect_tx(uint32_t transaction_bytes, uint32_t cta_id, uint32_t pred = 1u) const {
+    ClusterTransactionBarrier::arrive_and_expect_tx(&this->barrier_, transaction_bytes , cta_id, pred);
+  }
+
+  // Performs an expected transaction bytes increment without doing an arrive operation
+  CUTLASS_DEVICE
+  void expect_transaction(uint32_t transaction_bytes) const {
+    ClusterTransactionBarrier::expect_transaction(&this->barrier_, transaction_bytes);
+  }
+
+  // Performs an expected transaction bytes decrement without doing an arrive operation
+  CUTLASS_DEVICE
+  void complete_transaction(uint32_t transaction_bytes, uint32_t pred = 1) const {
+    uint32_t cta_rank = cute::block_rank_in_cluster();
+    ClusterTransactionBarrier::complete_transaction(&this->barrier_, cta_rank, transaction_bytes, pred);
+  }
+
+  // Performs an expected transaction bytes decrement without doing an arrive operation
+  CUTLASS_DEVICE
+  void complete_transaction(uint32_t dst_cta_id, uint32_t transaction_bytes, uint32_t pred) const {
+    ClusterTransactionBarrier::complete_transaction(&this->barrier_, dst_cta_id, transaction_bytes, pred);
+  }
+
+  //
+  //  Static Versions
+  //
+
+  // Performs an arrive operation + expected transaction bytes increment
+  CUTLASS_DEVICE
+  static void arrive_and_expect_tx(ValueType const* smem_ptr, uint32_t transaction_bytes) {
+#if CUDA_BARRIER_ENABLED
+    uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr);
+    asm volatile(
+        "{\n\t"
+        "mbarrier.arrive.expect_tx.shared::cta.b64 _, [%1], %0; \n\t"
+        "}"
+        :
+        : "r"(transaction_bytes), "r"(smem_addr));
+    cutlass::arch::synclog_emit_cluster_transaction_barrier_arrive_and_expect_tx(__LINE__, smem_addr, transaction_bytes);
+#elif defined(__CUDA_ARCH__)
+    asm volatile ("brkpt;\n" ::);
+#endif
+  }
+
+  // Performs an arrive operation + expected transaction bytes increment for a remote cta_id in a Cluster
+  CUTLASS_DEVICE
+  static void arrive_and_expect_tx(
+      ValueType const* smem_ptr, uint32_t transaction_bytes, uint32_t cta_id, uint32_t pred) {
+#if CUDA_BARRIER_ENABLED
+    uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr);
+    asm volatile(
+        "{\n\t"
+        ".reg .pred p;\n\t"
+        ".reg .b32 remAddr32;\n\t"
+        "setp.eq.u32 p, %2, 1;\n\t"
+        "@p mapa.shared::cluster.u32  remAddr32, %0, %1;\n\t"
+        "@p mbarrier.arrive.expect_tx.shared::cluster.b64  _, [remAddr32], %3;\n\t"
+        "}"
+        :
+        : "r"(smem_addr), "r"(cta_id), "r"(pred), "r"(transaction_bytes));
+#elif defined(__CUDA_ARCH__)
+    asm volatile ("brkpt;\n" ::);
+#endif
+  }
+
+  // Performs an expected transaction bytes increment without doing an arrive operation
+  CUTLASS_DEVICE
+  static void expect_transaction(ValueType const* smem_ptr, uint32_t transaction_bytes) {
+#if CUDA_BARRIER_ENABLED
+    uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr);
+    asm volatile(
+        "{\n\t"
+        "mbarrier.expect_tx.shared::cta.b64 [%1], %0; \n\t"
+        "}"
+        :
+        : "r"(transaction_bytes), "r"(smem_addr));
+    cutlass::arch::synclog_emit_cluster_transaction_barrier_expect_transaction(__LINE__, smem_addr, transaction_bytes);
+#elif defined(__CUDA_ARCH__)
+    asm volatile ("brkpt;\n" ::);
+#endif
+  }
+
+  // Performs an expected transaction bytes decrement without doing an arrive operation
+  CUTLASS_DEVICE
+  static void complete_transaction(
+      ValueType const* smem_ptr, uint32_t dst_cta_id, uint32_t transaction_bytes, uint32_t pred = 1) {
+#if CUDA_BARRIER_ENABLED
+    uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr);
+    smem_addr = cute::set_block_rank(smem_addr, dst_cta_id);
+    asm volatile(
+        "{\n\t"
+        ".reg .pred p;\n\t"
+        "setp.eq.u32 p, %2, 1;\n\t"
+        "@p mbarrier.complete_tx.shared::cluster.relaxed.cluster.b64   [%1], %0;"
+        "}"
+        :
+        : "r"(transaction_bytes), "r"(smem_addr), "r"(pred));
+    cutlass::arch::synclog_emit_cluster_transaction_barrier_complete_transaction(__LINE__, smem_addr, dst_cta_id, transaction_bytes, pred);
+#elif defined(__CUDA_ARCH__)
+    asm volatile ("brkpt;\n" ::);
+#endif
+  }
+
+  //
+  // DEPRECATED APIs
+  //
+  [[deprecated("Use arrive_and_expect_tx instead")]] CUTLASS_DEVICE
+  void arrive_and_reset_bytes(uint32_t transaction_bytes) const {
+    arrive_and_expect_tx(transaction_bytes);
+  }
+  [[deprecated("Use arrive_and_expect_tx instead")]] CUTLASS_DEVICE
+  void arrive_and_reset_bytes(uint32_t transaction_bytes, uint32_t cta_id) const {
+    arrive_and_expect_tx(transaction_bytes, cta_id);
+  }
+  [[deprecated("Use expect_transaction instead")]] CUTLASS_DEVICE
+  void reset_bytes(uint32_t transaction_bytes) const {
+    expect_transaction(transaction_bytes);
+  }
+  [[deprecated("Use complete_transaction instead")]] CUTLASS_DEVICE
+  void commit(uint32_t transaction_bytes, uint32_t pred = 1) const {
+    complete_transaction(transaction_bytes, pred);
+  }
+  [[deprecated("Use complete_transaction instead")]] CUTLASS_DEVICE
+  void commit(uint32_t dst_cta_id, uint32_t transaction_bytes, uint32_t pred) const {
+    complete_transaction(dst_cta_id, transaction_bytes, pred);
+  }
+  [[deprecated("Use arrive_and_expect_tx instead")]] CUTLASS_DEVICE
+  static void arrive_and_reset_bytes(ValueType const* smem_ptr, uint32_t transaction_bytes) {
+    arrive_and_expect_tx(smem_ptr, transaction_bytes);
+  }
+  [[deprecated("Use arrive_and_expect_tx instead")]] CUTLASS_DEVICE
+  static void arrive_and_reset_bytes(ValueType const* smem_ptr, uint32_t transaction_bytes, uint32_t cta_id, uint32_t pred) {
+    arrive_and_expect_tx(smem_ptr, transaction_bytes, cta_id, pred);
+  }
+  [[deprecated("Use expect_transaction instead")]] CUTLASS_DEVICE
+  static void reset_bytes(ValueType const* smem_ptr, uint32_t transaction_bytes) {
+    expect_transaction(smem_ptr, transaction_bytes);
+  }
+  [[deprecated("Use complete_transaction instead")]] CUTLASS_DEVICE
+  static void commit(ValueType const* smem_ptr, uint32_t dst_cta_id, uint32_t transaction_bytes, uint32_t pred = 1) {
+    complete_transaction(smem_ptr, dst_cta_id, transaction_bytes, pred);
+  }
+};
+
+// Helps with visibility of barrier init operations across warps / cta / cluster
+// Available as a separate function so as to batch inits across barriers and fence once
+// Note : It must be composed with an appropriate sync instruction with the right scope
+// to ensure visibility eg. __syncthreads() or a cluster_arrive() + cluster_wait()
+CUTLASS_DEVICE
+void fence_barrier_init() {
+#if CUDA_BARRIER_ENABLED
+  cutlass::arch::synclog_emit_fence_barrier_init(__LINE__);
+  asm volatile(
+      "{\n\t"
+      "fence.mbarrier_init.release.cluster; \n"
+      "}"
+      ::);
+#elif defined(__CUDA_ARCH__)
+  asm volatile ("brkpt;\n" ::);
+#endif
+}
+
+// Issue a shared memory fence for async operations
+CUTLASS_DEVICE
+void fence_view_async_shared() {
+#if CUDA_BARRIER_ENABLED
+    cutlass::arch::synclog_emit_fence_view_async_shared(__LINE__);
+    asm volatile (
+        "{\n\t"
+        "fence.proxy.async.shared::cta; \n"
+        "}"
+        ::);
+#elif defined(__CUDA_ARCH__)
+  asm volatile ("brkpt;\n" ::);
+#endif
+}
+
+// Arrive on completion of in-flight cp.async operations issued by the calling thread 
+CUTLASS_DEVICE
+void cpasync_barrier_arrive(uint64_t const* smem_ptr) {
+#if CUDA_BARRIER_ENABLED
+  uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr);
+  asm volatile(
+    "{\n\t"
+    "cp.async.mbarrier.arrive.shared::cta.b64 [%0];\n\t"
+    "}"
+    :
+    : "r"(smem_addr));
+  cutlass::arch::synclog_emit_cpasync_barrier_arrive(__LINE__, smem_addr);
+#elif defined(__CUDA_ARCH__)
+  asm volatile ("brkpt;\n" ::);
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+}  // end namespace arch
+}  // end namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/cache_operation.h b/lightllm-kernel/cutlass/include/cutlass/arch/cache_operation.h
new file mode 100755
index 000000000..9d2344bf3
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/arch/cache_operation.h
@@ -0,0 +1,66 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Directives related to cache operations
+*/
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+namespace cutlass {
+namespace arch {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Controls PTX cache operations
+struct CacheOperation {
+  enum Kind {
+    /// Cache at all levels - accessed again
+    Always,
+    /// Cache at global level
+    Global,
+    /// Streaming - likely to be accessed once
+    Streaming,
+    /// Indicates the line will not be used again
+    LastUse,
+    /// Don't cache, and fetch again
+    Volatile,
+    /// Write back at all coherent levels
+    WriteBack,
+    /// Write through to system memory
+    WriteThrough
+  };
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace arch
+}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/config.h b/lightllm-kernel/cutlass/include/cutlass/arch/config.h
new file mode 100755
index 000000000..b0f750063
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/arch/config.h
@@ -0,0 +1,81 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Definitions for architecture macros
+*/
+
+#pragma once
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SM90
+#if (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 0))
+  #define CUTLASS_ARCH_MMA_SM90_SUPPORTED 1
+  #if (!defined(CUTLASS_ARCH_MMA_SM90_ENABLED) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 900)
+    #define CUTLASS_ARCH_MMA_SM90_ENABLED 1
+
+    #if (!defined(CUTLASS_ARCH_MMA_SM90A_ENABLED) && defined(__CUDA_ARCH_FEAT_SM90_ALL))
+      #define CUTLASS_ARCH_MMA_SM90A_ENABLED 1
+    #endif
+  #endif
+#endif
+
+#if (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 2))
+  #define CUTLASS_ARCH_MMA_SPARSE_SM90_SUPPORTED
+#endif // toolkit version >= 12.2 (any minor of a later major also qualifies)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SM90 Modifiable
+#if (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 3))
+  #define CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90_SUPPORTED 1
+  #if (!defined(CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90_ENABLED) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 900)
+    #define CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90_ENABLED 1
+
+    #if (!defined(CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90A_ENABLED) && defined(__CUDA_ARCH_FEAT_SM90_ALL))
+      #define CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90A_ENABLED 1
+    #endif
+  #endif
+#endif
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// SM90 F64
+#if (__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 8))
+  #define CUTLASS_ARCH_MMA_SM90_F64_MMA_SUPPORTED 1
+  #if (!defined(CUTLASS_ARCH_MMA_SM90_F64_MMA_ENABLED) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900)
+    #define CUTLASS_ARCH_MMA_SM90_F64_MMA_ENABLED 1
+  #endif
+#endif
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/grid_dependency_control.h b/lightllm-kernel/cutlass/include/cutlass/arch/grid_dependency_control.h
new file mode 100755
index 000000000..14ef19749
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/arch/grid_dependency_control.h
@@ -0,0 +1,84 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+ 
+/*! \file
+    \brief Grid dependent control (GDC) helpers for programmatic dependent launches (PDL).
+*/
+
+#pragma once
+
+#include "cute/arch/cluster_sm90.hpp"
+#include "cutlass/arch/barrier.h"
+#include "cutlass/conv/dispatch_policy.hpp"
+#include "cutlass/gemm/dispatch_policy.hpp"
+
+#ifndef CUTLASS_GDC_ENABLED  // an existing definition is left untouched
+  #if (defined(CUTLASS_ENABLE_GDC_FOR_SM90) && \
+     __CUDACC_VER_MAJOR__ >= 12 && \
+     defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL))
+    #define CUTLASS_GDC_ENABLED  // needs the CUTLASS_ENABLE_GDC_FOR_SM90 opt-in, nvcc major >= 12, arch >= 900 and SM90_ALL features
+  #endif
+#endif
+
+namespace cutlass {
+namespace arch {
+
+// Issuing the launch_dependents instruction hints a dependent kernel to launch earlier.
+// launch_dependents doesn't impact the functionality but the performance:
+// launching a dependent kernel too early can compete with current kernels,
+// while launching too late can lead to a long latency.
+CUTLASS_DEVICE
+void launch_dependent_grids() {
+#if (defined(CUTLASS_GDC_ENABLED))
+  asm volatile("griddepcontrol.launch_dependents;");
+#endif  // without CUTLASS_GDC_ENABLED the body is empty
+}
+
+// Issuing the griddepcontrol.wait instruction enforces no global memory access
+// prior to this instruction. This ensures the correctness of global memory access
+// when launching a dependent kernel earlier.
+CUTLASS_DEVICE
+void wait_on_dependent_grids() {
+#if (defined(CUTLASS_GDC_ENABLED))
+  asm volatile("griddepcontrol.wait;");
+#endif  // without CUTLASS_GDC_ENABLED the body is empty
+}
+
+// Compile-time flag that lets kernels query whether the GDC feature was compiled in.
+#if !defined(CUTLASS_GDC_ENABLED)
+static constexpr bool IsGdcGloballyEnabled = false;
+#else
+static constexpr bool IsGdcGloballyEnabled = true;
+#endif
+
+
+} // namespace arch
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/memory.h b/lightllm-kernel/cutlass/include/cutlass/arch/memory.h
new file mode 100755
index 000000000..db9ad7397
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/arch/memory.h
@@ -0,0 +1,602 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Architecture-specific operators on memory
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/arch/cache_operation.h"
+#include "cutlass/platform/platform.h"
+
+namespace cutlass {
+namespace arch {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Fragment type to store loaded data
+    typename AccessType,
+    /// The bytes of loading
+    int LoadBytes,
+    /// Cache operation
+    CacheOperation::Kind cache_op = CacheOperation::Always
+    >
+struct global_load;  // declaration only; behavior comes from the (LoadBytes, cache_op) specializations in this file
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Specializations
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if (((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 4)) || \
+     (__CUDACC_VER_MAJOR__ > 11)) &&                                  \
+    defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750)
+  #define CUTLASS_ENABLE_L2_PREFETCH 1  // nvcc 11.4 or newer and a device pass with __CUDA_ARCH__ >= 750
+#else
+  #define CUTLASS_ENABLE_L2_PREFETCH 0  // always defined, so it can be tested with plain #if
+#endif
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Predicated 32-byte global load (two v4.u32 loads, at ptr and ptr + 16). The redundant mov PTX
+// instructions are used to force the compiler to keep the initializing code before ld.global.
+template <typename AccessType>
+struct global_load<AccessType,
+                   32,
+                   CacheOperation::Always
+                  > {
+  CUTLASS_DEVICE
+  global_load(AccessType &D, void const *ptr, bool pred_guard) {
+  uint4 *data = reinterpret_cast<uint4 *>(&D);
+    // Operands: %0-%7 outputs, %8 ptr, %9 guard, %10-%17 current words of D, %18 ptr + 16.
+    asm volatile(
+        "{\n"
+        "  .reg .pred p;\n"
+        "  setp.ne.b32 p, %9, 0;\n"
+        "  mov.b32 %0, %10;\n"
+        "  mov.b32 %1, %11;\n"
+        "  mov.b32 %2, %12;\n"
+        "  mov.b32 %3, %13;\n"
+        "  mov.b32 %4, %14;\n"
+        "  mov.b32 %5, %15;\n"
+        "  mov.b32 %6, %16;\n"
+        "  mov.b32 %7, %17;\n"
+#if CUTLASS_ENABLE_L2_PREFETCH
+        "  @p ld.global.L2::128B.v4.u32 {%0, %1, %2, %3}, [%8];\n"
+        "  @p ld.global.L2::128B.v4.u32 {%4, %5, %6, %7}, [%18];\n"
+#else
+        "  @p ld.global.v4.u32 {%0, %1, %2, %3}, [%8];\n"
+        "  @p ld.global.v4.u32 {%4, %5, %6, %7}, [%18];\n"
+#endif
+        "}\n"
+        : "=r"(data[0].x), "=r"(data[0].y), "=r"(data[0].z), "=r"(data[0].w),
+          "=r"(data[1].x), "=r"(data[1].y), "=r"(data[1].z), "=r"(data[1].w)
+        : "l"(ptr), "r"((int)pred_guard), "r"(data[0].x), "r"(data[0].y),
+          "r"(data[0].z), "r"(data[0].w), "r"(data[1].x), "r"(data[1].y),
+          "r"(data[1].z), "r"(data[1].w), "l"(((uint8_t *)ptr) + 16));
+  }
+};
+
+template <typename AccessType>
+struct global_load<AccessType,
+                   32,
+                   CacheOperation::LastUse
+                  > {
+  CUTLASS_DEVICE
+  global_load(AccessType &D, void const *ptr, bool pred_guard) {
+  uint4 *data = reinterpret_cast<uint4 *>(&D);
+    // Predicated 32-byte load via two ld.global.lu.v4.u32; %0-%7 outputs, %8 ptr, %9 guard, %10-%17 current D, %18 ptr + 16.
+    asm volatile(
+        "{\n"
+        "  .reg .pred p;\n"
+        "  setp.ne.b32 p, %9, 0;\n"
+        "  mov.b32 %0, %10;\n"
+        "  mov.b32 %1, %11;\n"
+        "  mov.b32 %2, %12;\n"
+        "  mov.b32 %3, %13;\n"
+        "  mov.b32 %4, %14;\n"
+        "  mov.b32 %5, %15;\n"
+        "  mov.b32 %6, %16;\n"
+        "  mov.b32 %7, %17;\n"
+        "  @p ld.global.lu.v4.u32 {%0, %1, %2, %3}, [%8];\n"
+        "  @p ld.global.lu.v4.u32 {%4, %5, %6, %7}, [%18];\n"
+        "}\n"
+        : "=r"(data[0].x), "=r"(data[0].y), "=r"(data[0].z), "=r"(data[0].w),
+          "=r"(data[1].x), "=r"(data[1].y), "=r"(data[1].z), "=r"(data[1].w)
+        : "l"(ptr), "r"((int)pred_guard), "r"(data[0].x), "r"(data[0].y),
+          "r"(data[0].z), "r"(data[0].w), "r"(data[1].x), "r"(data[1].y),
+          "r"(data[1].z), "r"(data[1].w), "l"(((uint8_t *)ptr) + 16));
+  }
+};
+
+template <typename AccessType>
+struct global_load<AccessType,
+                   16,
+                   CacheOperation::Always
+                  > {
+  CUTLASS_DEVICE
+  global_load(AccessType &D, void const *ptr, bool pred_guard) {  // 16-byte load; D keeps its prior words when pred_guard is false
+  uint4 &data = reinterpret_cast<uint4 &>(D);
+    asm volatile(
+        "{\n"
+        "  .reg .pred p;\n"
+        "  setp.ne.b32 p, %5, 0;\n"
+        "  mov.b32 %0, %6;\n"
+        "  mov.b32 %1, %7;\n"
+        "  mov.b32 %2, %8;\n"
+        "  mov.b32 %3, %9;\n"
+#if CUTLASS_ENABLE_L2_PREFETCH
+        "  @p ld.global.L2::128B.v4.u32 {%0, %1, %2, %3}, [%4];\n"
+#else
+        "  @p ld.global.v4.u32 {%0, %1, %2, %3}, [%4];\n"
+#endif
+        "}\n"
+        : "=r"(data.x), "=r"(data.y), "=r"(data.z), "=r"(data.w)
+        : "l"(ptr), "r"((int)pred_guard), "r"(data.x), "r"(data.y), "r"(data.z), "r"(data.w));
+  }
+};
+
+template <typename AccessType>
+struct global_load<AccessType,
+                   16,
+                   CacheOperation::LastUse
+                  > {
+  CUTLASS_DEVICE
+  global_load(AccessType &D, void const *ptr, bool pred_guard) {  // 16-byte ld.global.lu; D keeps its prior words when pred_guard is false
+  uint4 &data = reinterpret_cast<uint4 &>(D);
+    asm volatile(
+        "{\n"
+        "  .reg .pred p;\n"
+        "  setp.ne.b32 p, %5, 0;\n"
+        "  mov.b32 %0, %6;\n"
+        "  mov.b32 %1, %7;\n"
+        "  mov.b32 %2, %8;\n"
+        "  mov.b32 %3, %9;\n"
+        "  @p ld.global.lu.v4.u32 {%0, %1, %2, %3}, [%4];\n"
+        "}\n"
+        : "=r"(data.x), "=r"(data.y), "=r"(data.z), "=r"(data.w)
+        : "l"(ptr), "r"((int)pred_guard), "r"(data.x), "r"(data.y), "r"(data.z), "r"(data.w));
+  }
+};
+
+template <typename AccessType>
+struct global_load<AccessType,
+                   8,
+                   CacheOperation::Always
+                  > {
+  CUTLASS_DEVICE
+  global_load(AccessType &D, void const *ptr, bool pred_guard) {
+  uint2 &data = reinterpret_cast<uint2 &>(D);
+    // Predicated 8-byte load: %0-%1 outputs, %2 ptr, %3 guard, %4-%5 current words of D (kept when the guard is false).
+    asm volatile(
+        "{\n"
+        "  .reg .pred p;\n"
+        "  setp.ne.b32 p, %3, 0;\n"
+        "  mov.b32 %0, %4;\n"
+        "  mov.b32 %1, %5;\n"
+#if CUTLASS_ENABLE_L2_PREFETCH
+        "  @p ld.global.L2::128B.v2.u32 {%0, %1}, [%2];\n"
+#else
+        "  @p ld.global.v2.u32 {%0, %1}, [%2];\n"
+#endif
+        "}\n"
+        : "=r"(data.x), "=r"(data.y)
+        : "l"(ptr), "r"((int)pred_guard), "r"(data.x), "r"(data.y));
+  }
+};
+
+template <typename AccessType>
+struct global_load<AccessType,
+                   8,
+                   CacheOperation::LastUse
+                  > {
+  CUTLASS_DEVICE
+  global_load(AccessType &D, void const *ptr, bool pred_guard) {
+  uint2 &data = reinterpret_cast<uint2 &>(D);
+    // Predicated 8-byte ld.global.lu: %0-%1 outputs, %2 ptr, %3 guard, %4-%5 current words of D (kept when the guard is false).
+    asm volatile(
+        "{\n"
+        "  .reg .pred p;\n"
+        "  setp.ne.b32 p, %3, 0;\n"
+        "  mov.b32 %0, %4;\n"
+        "  mov.b32 %1, %5;\n"
+        "  @p ld.global.lu.v2.u32 {%0, %1}, [%2];\n"
+        "}\n"
+        : "=r"(data.x), "=r"(data.y)
+        : "l"(ptr), "r"((int)pred_guard), "r"(data.x), "r"(data.y));
+  }
+};
+
+template <typename AccessType>
+struct global_load<AccessType,
+                   4,
+                   CacheOperation::Always
+                  > {
+  CUTLASS_DEVICE
+  global_load(AccessType &D, void const *ptr, bool pred_guard) {
+  unsigned &data = reinterpret_cast<unsigned &>(D);
+    // Predicated 4-byte load: %0 output, %1 ptr, %2 guard, %3 current value of D (kept when the guard is false).
+    asm volatile(
+        "{\n"
+        "  .reg .pred p;\n"
+        "  setp.ne.b32 p, %2, 0;\n"
+        "  mov.b32 %0, %3;\n"
+#if CUTLASS_ENABLE_L2_PREFETCH
+        "  @p ld.global.L2::128B.u32 %0, [%1];\n"
+#else
+        "  @p ld.global.u32 %0, [%1];\n"
+#endif
+        "}\n"
+        : "=r"(data)
+        : "l"(ptr), "r"((int)pred_guard), "r"(data));
+  }
+};
+
+template <typename AccessType>
+struct global_load<AccessType,
+                   4,
+                   CacheOperation::LastUse
+                  > {
+  CUTLASS_DEVICE
+  global_load(AccessType &D, void const *ptr, bool pred_guard) {
+  unsigned &data = reinterpret_cast<unsigned &>(D);
+    // Predicated 4-byte ld.global.lu: %0 output, %1 ptr, %2 guard, %3 current value of D (kept when the guard is false).
+    asm volatile(
+        "{\n"
+        "  .reg .pred p;\n"
+        "  setp.ne.b32 p, %2, 0;\n"
+        "  mov.b32 %0, %3;\n"
+        "  @p ld.global.lu.u32 %0, [%1];\n"
+        "}\n"
+        : "=r"(data)
+        : "l"(ptr), "r"((int)pred_guard), "r"(data));
+  }
+};
+
+template <typename AccessType>
+struct global_load<AccessType,
+                   2,
+                   CacheOperation::Always
+                  > {
+  CUTLASS_DEVICE
+  global_load(AccessType &D, void const *ptr, bool pred_guard) {
+  uint16_t &data = reinterpret_cast<uint16_t &>(D);
+    // Predicated 2-byte load using 16-bit ("h") registers: %0 output, %1 ptr, %2 guard, %3 current value of D.
+    asm volatile(
+        "{\n"
+        "  .reg .pred p;\n"
+        "  setp.ne.b32 p, %2, 0;\n"
+        "  mov.b16 %0, %3;\n"
+#if CUTLASS_ENABLE_L2_PREFETCH
+        "  @p ld.global.L2::128B.u16 %0, [%1];\n"
+#else
+        "  @p ld.global.u16 %0, [%1];\n"
+#endif
+        "}\n"
+        : "=h"(data)
+        : "l"(ptr), "r"((int)pred_guard), "h"(data));
+  }
+};
+
+template <typename AccessType>
+struct global_load<AccessType,
+                   2,
+                   CacheOperation::LastUse
+                  > {
+  CUTLASS_DEVICE
+  global_load(AccessType &D, void const *ptr, bool pred_guard) {
+  uint16_t &data = reinterpret_cast<uint16_t &>(D);
+    // Predicated 2-byte ld.global.lu using 16-bit ("h") registers: %0 output, %1 ptr, %2 guard, %3 current value of D.
+    asm volatile(
+        "{\n"
+        "  .reg .pred p;\n"
+        "  setp.ne.b32 p, %2, 0;\n"
+        "  mov.b16 %0, %3;\n"
+        "  @p ld.global.lu.u16 %0, [%1];\n"
+        "}\n"
+        : "=h"(data)
+        : "l"(ptr), "r"((int)pred_guard), "h"(data));
+  }
+};
+
+/// LoadBytes == 1: predicated load done with a plain dereference instead of inline PTX.
+template <typename AccessType>
+struct global_load<AccessType, 1, CacheOperation::Always> {
+  CUTLASS_DEVICE
+  global_load(AccessType &D, void const *ptr, bool pred_guard) {
+    if (!pred_guard) { return; }  // D is left as-is when the guard is false
+    AccessType const *src = reinterpret_cast<AccessType const *>(ptr);
+    D = *src;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Fragment type to store data
+    typename AccessType,
+    /// The bytes of storing
+    int StoreBytes
+    >
+struct global_store;  // declaration only; behavior comes from the per-StoreBytes specializations in this file
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Specializations
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+template <typename AccessType>
+struct global_store<AccessType, 64> {
+  CUTLASS_DEVICE
+  global_store(AccessType const &D, void *ptr, bool pred_guard) {
+  uint4 const *data = reinterpret_cast<uint4 const *>(&D);
+  // Four predicated 16-byte stores at ptr, ptr + 16, ptr + 32 and ptr + 48; operand %5 carries the guard.
+  asm volatile(
+      "{\n"
+      "  .reg .pred p;\n"
+      "  setp.ne.b32 p, %5, 0;\n"
+      "  @p st.global.v4.u32 [%0], {%1, %2, %3, %4};\n"
+      "  @p st.global.v4.u32 [%6], {%7, %8, %9, %10};\n"
+      "  @p st.global.v4.u32 [%11], {%12, %13, %14, %15};\n"
+      "  @p st.global.v4.u32 [%16], {%17, %18, %19, %20};\n"
+      "}\n"
+      :
+      : "l"(ptr), "r"(data[0].x), "r"(data[0].y), "r"(data[0].z),
+        "r"(data[0].w), "r"((int)pred_guard), "l"(((uint8_t *)ptr) + 16),
+        "r"(data[1].x), "r"(data[1].y), "r"(data[1].z), "r"(data[1].w), 
+        "l"(((uint8_t *)ptr) + 32),
+        "r"(data[2].x), "r"(data[2].y), "r"(data[2].z), "r"(data[2].w),
+        "l"(((uint8_t *)ptr) + 48),
+        "r"(data[3].x), "r"(data[3].y), "r"(data[3].z), "r"(data[3].w));
+  }
+};
+
+
+template <typename AccessType>
+struct global_store<AccessType, 32> {
+  CUTLASS_DEVICE
+  global_store(AccessType const &D, void *ptr, bool pred_guard) {
+  uint4 const *data = reinterpret_cast<uint4 const *>(&D);
+  // Two predicated 16-byte stores at ptr and ptr + 16; operand %5 carries the guard.
+  asm volatile(
+      "{\n"
+      "  .reg .pred p;\n"
+      "  setp.ne.b32 p, %5, 0;\n"
+      "  @p st.global.v4.u32 [%0], {%1, %2, %3, %4};\n"
+      "  @p st.global.v4.u32 [%6], {%7, %8, %9, %10};\n"
+      "}\n"
+      :
+      : "l"(ptr), "r"(data[0].x), "r"(data[0].y), "r"(data[0].z),
+        "r"(data[0].w), "r"((int)pred_guard), "l"(((uint8_t *)ptr) + 16),
+        "r"(data[1].x), "r"(data[1].y), "r"(data[1].z), "r"(data[1].w));
+  }
+};
+
+template <typename AccessType>
+struct global_store<AccessType, 16> {
+  CUTLASS_DEVICE
+  global_store(AccessType const &D, void *ptr, bool pred_guard) {  // 16-byte store; nothing is written when pred_guard is false
+  uint4 const &data = reinterpret_cast<uint4 const &>(D);
+  asm volatile(
+      "{\n"
+      "  .reg .pred p;\n"
+      "  setp.ne.b32 p, %5, 0;\n"
+      "  @p st.global.v4.u32 [%0], {%1, %2, %3, %4};\n"
+      "}\n"
+      :
+      : "l"(ptr), "r"(data.x), "r"(data.y), "r"(data.z), "r"(data.w), "r"((int)pred_guard));
+  }
+};
+
+template <typename AccessType>
+struct global_store<AccessType, 8> {
+  CUTLASS_DEVICE
+  global_store(AccessType const &D, void *ptr, bool pred_guard) {  // 8-byte store; nothing is written when pred_guard is false
+  uint2 const &data = reinterpret_cast<uint2 const &>(D);
+  asm volatile(
+      "{\n"
+      "  .reg .pred p;\n"
+      "  setp.ne.b32 p, %3, 0;\n"
+      "  @p st.global.v2.u32 [%0], {%1, %2};\n"
+      "}\n"
+      :
+      : "l"(ptr), "r"(data.x), "r"(data.y), "r"((int)pred_guard));
+  }
+};
+
+template <typename AccessType>
+struct global_store<AccessType, 4> {
+  CUTLASS_DEVICE
+  global_store(AccessType const &D, void *ptr, bool pred_guard) {  // 4-byte store; nothing is written when pred_guard is false
+  uint32_t const &data = reinterpret_cast<uint32_t const &>(D);
+  asm volatile(
+      "{\n"
+      "  .reg .pred p;\n"
+      "  setp.ne.b32 p, %2, 0;\n"
+      "  @p st.global.u32 [%0], %1;\n"
+      "}\n"
+      :
+      : "l"(ptr), "r"(data), "r"((int)pred_guard));
+  }
+};
+
+template <typename AccessType>
+struct global_store<AccessType, 2> {
+  CUTLASS_DEVICE
+  global_store(AccessType const &D, void *ptr, bool pred_guard) {  // 2-byte store; nothing is written when pred_guard is false
+  uint16_t const &data = reinterpret_cast<uint16_t const &>(D);
+  asm volatile(
+      "{\n"
+      "  .reg .pred p;\n"
+      "  setp.ne.b32 p, %2, 0;\n"
+      "  @p st.global.u16 [%0], %1;\n"
+      "}\n"
+      :
+      : "l"(ptr), "h"(data), "r"((int)pred_guard));
+  }
+};
+
+/// StoreBytes == 1: predicated store done with a plain assignment instead of inline PTX.
+template <typename AccessType>
+struct global_store<AccessType, 1> {
+  CUTLASS_DEVICE global_store(AccessType const &D, void *ptr, bool pred_guard) {
+    if (pred_guard) { AccessType *dst = reinterpret_cast<AccessType *>(ptr); *dst = D; }
+  }
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// ld.shared: declaration only; the explicit specializations for 2, 4, 8 and 16 bytes in this file provide the bodies
+template <int Bytes>
+CUTLASS_DEVICE
+void shared_load(void *dst, uint32_t ptr);
+
+/// ld.shared - 16b: reads one u16 from shared address ptr into *dst (dst is accessed as uint16_t)
+template <>
+CUTLASS_DEVICE
+void shared_load<2>(void *dst, uint32_t ptr) {
+  asm volatile("ld.shared.u16 %0, [%1];\n"
+    : "=h"(*reinterpret_cast<uint16_t *>(dst))
+    : "r"(ptr));
+}
+
+/// ld.shared - 32b: reads one u32 from shared address ptr into *dst (dst is accessed as uint32_t)
+template <>
+CUTLASS_DEVICE
+void shared_load<4>(void *dst, uint32_t ptr) {
+  asm volatile("ld.shared.u32 %0, [%1];\n"
+    : "=r"(*reinterpret_cast<uint32_t *>(dst))
+    : "r"(ptr));
+}
+
+/// ld.shared - 64b: one v2.u32 load from shared address ptr into *dst (dst is accessed as uint2)
+template <>
+CUTLASS_DEVICE
+void shared_load<8>(void *dst, uint32_t ptr) {
+  uint2 *dst_u64 = reinterpret_cast<uint2 *>(dst);
+  asm volatile("ld.shared.v2.u32 {%0, %1}, [%2];\n"
+    :
+      "=r"(dst_u64->x),
+      "=r"(dst_u64->y)
+    : "r"(ptr));
+}
+
+/// ld.shared - 128b: one v4.u32 load from shared address ptr into *dst (dst is accessed as uint4)
+template <>
+CUTLASS_DEVICE
+void shared_load<16>(void *dst, uint32_t ptr) {
+  uint4 *dst_u128 = reinterpret_cast<uint4 *>(dst);
+  asm volatile("ld.shared.v4.u32 {%0, %1, %2, %3}, [%4];\n"
+    :
+      "=r"(dst_u128->x),
+      "=r"(dst_u128->y),
+      "=r"(dst_u128->z),
+      "=r"(dst_u128->w)
+    : "r"(ptr));
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// st.shared: declaration only; the explicit specializations for 2, 4, 8 and 16 bytes in this file provide the bodies
+template <int Bytes>
+CUTLASS_DEVICE
+void shared_store(uint32_t ptr, void const *src);
+
+/// st.shared - 16b: writes the u16 read from *src to shared address ptr
+template <>
+CUTLASS_DEVICE
+void shared_store<2>(uint32_t ptr, void const *src) {
+  asm volatile("st.shared.u16 [%0], %1;\n"
+    : :
+    "r"(ptr),
+    "h"(*reinterpret_cast<uint16_t const *>(src))
+  );
+}
+
+/// st.shared - 32b: writes the u32 read from *src to shared address ptr
+template <>
+CUTLASS_DEVICE
+void shared_store<4>(uint32_t ptr, void const *src) {
+  asm volatile("st.shared.u32 [%0], %1;\n"
+    : :
+    "r"(ptr),
+    "r"(*reinterpret_cast<uint32_t const  *>(src))
+  );
+}
+
+/// st.shared - 64b: one v2.u32 store of *src (read as uint2) to shared address ptr
+template <>
+CUTLASS_DEVICE
+void shared_store<8>(uint32_t ptr, void const *src) {
+  uint2 const *dst_u64 = reinterpret_cast<uint2 const *>(src);  // despite the name, this views the source operand
+  asm volatile("st.shared.v2.u32 [%0], {%1, %2};\n"
+    : :
+      "r"(ptr),
+      "r"(dst_u64->x),
+      "r"(dst_u64->y)
+    );
+}
+
+/// st.shared - 128b: one v4.u32 store of *src (read as uint4) to shared address ptr
+template <>
+CUTLASS_DEVICE
+void shared_store<16>(uint32_t ptr, void const *src) {
+  uint4 const *dst_u128 = reinterpret_cast<uint4 const *>(src);  // despite the name, this views the source operand
+  asm volatile("st.shared.v4.u32 [%0], {%1, %2, %3, %4};\n"
+    : :
+      "r"(ptr),
+      "r"(dst_u128->x),
+      "r"(dst_u128->y),
+      "r"(dst_u128->z),
+      "r"(dst_u128->w)
+    );
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace arch
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "cutlass/arch/memory_sm75.h"
+#include "cutlass/arch/memory_sm80.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/memory_sm75.h b/lightllm-kernel/cutlass/include/cutlass/arch/memory_sm75.h
new file mode 100755
index 000000000..6b487a737
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/arch/memory_sm75.h
@@ -0,0 +1,269 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Architecture-specific operators on memory added for SM75
+*/
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/layout/matrix.h"
+#include "cute/arch/copy_sm75.hpp"
+#include "cute/arch/util.hpp"
+
+namespace cutlass {
+namespace arch {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  /// Layout of destination matrix (column-major implies transpose)
+  typename Layout,
+  /// .x1, .x2, or .x4
+  int MatrixCount
+>
+inline __device__ void ldsm(Array<unsigned, MatrixCount> & D, void const* ptr);  // declaration only; see the explicit specializations in this file
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Determine the appropriate way to target PTX's "ldmatrix" instruction.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// CUTLASS helper to get SMEM pointer: returns the result of cute::cast_smem_ptr_to_uint(ptr) as unsigned
+inline __device__ unsigned cutlass_get_smem_pointer(void *ptr) {
+  return cute::cast_smem_ptr_to_uint(ptr);
+}
+
+/// CUTLASS helper to get SMEM pointer (const overload): casts away const and forwards to the void* overload
+inline __device__ unsigned cutlass_get_smem_pointer(void const *ptr) {
+  return cutlass_get_smem_pointer(const_cast<void *>(ptr));
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <>
+inline __device__ void ldsm<layout::RowMajor, 1>(
+    Array<unsigned, 1> & D,
+    void const* ptr) {
+  // Non-transposed ldmatrix .x1 (m8n8, b16): writes one 32-bit result into D from the shared address of ptr.
+  #if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED)
+
+    unsigned addr = cutlass_get_smem_pointer(ptr);
+
+    int x;
+    asm volatile ("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];" : "=r"(x) : "r"(addr));
+    reinterpret_cast<int &>(D) = x;
+
+  #else
+    // ldmatrix path not activated for this build: mark the arguments used and invoke CUTLASS_NOT_IMPLEMENTED().
+    CUTLASS_UNUSED(D);
+    CUTLASS_UNUSED(ptr);
+    CUTLASS_NOT_IMPLEMENTED();
+
+  #endif
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <>
+inline __device__ void ldsm<layout::RowMajor, 2>(
+    Array<unsigned, 2> & D,
+    void const* ptr) {
+  // Non-transposed ldmatrix .x2 (m8n8, b16): writes two 32-bit results into D from the shared address of ptr.
+  #if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED)
+
+    unsigned addr = cutlass_get_smem_pointer(ptr);
+
+    int x, y;
+    asm volatile ("ldmatrix.sync.aligned.x2.m8n8.shared.b16 {%0, %1}, [%2];" : "=r"(x), "=r"(y) : "r"(addr));
+    reinterpret_cast<int2 &>(D) = make_int2(x, y);
+
+  #else
+    // ldmatrix path not activated for this build: mark the arguments used and invoke CUTLASS_NOT_IMPLEMENTED().
+    CUTLASS_UNUSED(D);
+    CUTLASS_UNUSED(ptr);
+    CUTLASS_NOT_IMPLEMENTED();
+
+  #endif
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <>
+inline __device__ void ldsm<layout::RowMajor, 4>(
+    Array<unsigned, 4> & D,
+    void const* ptr) {
+  // Non-transposed ldmatrix .x4 (m8n8, b16): writes four 32-bit results into D from the shared address of ptr.
+  #if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED)
+
+    unsigned addr = cutlass_get_smem_pointer(ptr);
+
+    int x, y, z, w;
+    asm volatile ("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];" : "=r"(x), "=r"(y), "=r"(z), "=r"(w) : "r"(addr));
+    reinterpret_cast<int4 &>(D) = make_int4(x, y, z, w);
+
+  #else
+    // ldmatrix path not activated for this build: mark the arguments used and invoke CUTLASS_NOT_IMPLEMENTED().
+    CUTLASS_UNUSED(D);
+    CUTLASS_UNUSED(ptr);
+    CUTLASS_NOT_IMPLEMENTED();
+
+  #endif
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Transpose on 16b granularity
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <>
+inline __device__ void ldsm<layout::ColumnMajor, 1>(
+    Array<unsigned, 1> & D,
+    void const* ptr) {
+  // Transposing ldmatrix .x1.trans (m8n8, b16): writes one 32-bit result into D from the shared address of ptr.
+  #if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED)
+
+    unsigned addr = cutlass_get_smem_pointer(ptr);
+
+    int x;
+    asm volatile ("ldmatrix.sync.aligned.x1.trans.m8n8.shared.b16 {%0}, [%1];" : "=r"(x) : "r"(addr));
+    reinterpret_cast<int &>(D) = x;
+
+  #else
+    // ldmatrix path not activated for this build: mark the arguments used and invoke CUTLASS_NOT_IMPLEMENTED().
+    CUTLASS_UNUSED(D);
+    CUTLASS_UNUSED(ptr);
+    CUTLASS_NOT_IMPLEMENTED();
+
+  #endif
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <>
+inline __device__ void ldsm<layout::ColumnMajor, 2>(
+    Array<unsigned, 2> & D,
+    void const* ptr) {
+  // Transposing ldmatrix .x2.trans (m8n8, b16): writes two 32-bit results into D from the shared address of ptr.
+  #if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED)
+
+    unsigned addr = cutlass_get_smem_pointer(ptr);
+
+    int x, y;
+    asm volatile ("ldmatrix.sync.aligned.x2.trans.m8n8.shared.b16 {%0, %1}, [%2];" : "=r"(x), "=r"(y) : "r"(addr));
+    reinterpret_cast<int2 &>(D) = make_int2(x, y);
+
+  #else
+    // ldmatrix path not activated for this build: mark the arguments used and invoke CUTLASS_NOT_IMPLEMENTED().
+    CUTLASS_UNUSED(D);
+    CUTLASS_UNUSED(ptr);
+    CUTLASS_NOT_IMPLEMENTED();
+
+  #endif
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <>
+inline __device__ void ldsm<layout::ColumnMajor, 4>(
+    Array<unsigned, 4> & D,
+    void const* ptr) {
+  // Transposing ldmatrix .x4.trans (m8n8, b16): writes four 32-bit results into D from the shared address of ptr.
+  #if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED)
+
+    unsigned addr = cutlass_get_smem_pointer(ptr);
+
+    int x, y, z, w;
+    asm volatile ("ldmatrix.sync.aligned.x4.trans.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];" : "=r"(x), "=r"(y), "=r"(z), "=r"(w) : "r"(addr));
+    reinterpret_cast<int4 &>(D) = make_int4(x, y, z, w);
+
+  #else
+    // ldmatrix path not activated for this build: mark the arguments used and invoke CUTLASS_NOT_IMPLEMENTED().
+    CUTLASS_UNUSED(D);
+    CUTLASS_UNUSED(ptr);
+    CUTLASS_NOT_IMPLEMENTED();
+
+  #endif
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Generic fallback used when no size-specific specialization matches: a plain typed dereference of ptr.
+template <typename AccessType, int Bytes>
+struct shared_load_op {
+  CUTLASS_DEVICE shared_load_op(AccessType &D, void const *ptr) {
+    AccessType const *src = reinterpret_cast<AccessType const *>(ptr); D = *src;
+  }
+};
+
+template <typename AccessType>  // loads D from ptr via the shared_load_op selected by sizeof(AccessType)
+CUTLASS_DEVICE void shared_load(AccessType &D, void const *ptr) {
+  shared_load_op<AccessType, int(sizeof(AccessType))>(D, ptr);  // constructing the temporary performs the load
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename AccessType>
+struct shared_load_op<AccessType, 16> {
+  CUTLASS_DEVICE
+  shared_load_op(AccessType &D, void const *ptr) {
+    unsigned addr = cutlass_get_smem_pointer(ptr);
+    // 16-byte case: a single ld.shared.v4.b32 into a local uint4, then copied into D.
+    uint4 v;
+    asm volatile ("ld.shared.v4.b32 {%0, %1, %2, %3}, [%4];" : 
+      "=r"(v.x), "=r"(v.y), "=r"(v.z), "=r"(v.w) : "r"(addr));
+
+    D = reinterpret_cast<AccessType const &>(v);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename AccessType>
+struct shared_load_op<AccessType, 8> {
+  CUTLASS_DEVICE
+  shared_load_op(AccessType &D, void const *ptr) {
+    unsigned addr = cutlass_get_smem_pointer(ptr);
+    // 8-byte case: a single ld.shared.v2.b32 into a local uint2, then copied into D.
+    uint2 v;
+    asm volatile ("ld.shared.v2.b32 {%0, %1}, [%2];" : 
+      "=r"(v.x), "=r"(v.y) : "r"(addr));
+
+    D = reinterpret_cast<AccessType const &>(v);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace arch
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/memory_sm80.h b/lightllm-kernel/cutlass/include/cutlass/arch/memory_sm80.h
new file mode 100755
index 000000000..cb0ba4b54
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/arch/memory_sm80.h
@@ -0,0 +1,472 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Architecture-specific operators on memory added for SM80
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/complex.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/arch/memory_sm75.h"
+#include "cutlass/arch/cache_operation.h"
+
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  #define CUDA_CP_ASYNC_ACTIVATED 1  // device pass targeting SM80 or newer: the cp.async PTX branches are compiled
+#else
+  #define CUDA_CP_ASYNC_ACTIVATED 0  // any other pass: the #else fallback branches are compiled instead
+#endif
+
+namespace cutlass {
+namespace arch {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Initiates an asynchronous copy from global memory to shared memory.
+/// Primary template: declared only; the implementations are the per-CacheOperation specializations.
+/// cp.async
+///
+template <
+    /// Size of the access in bytes
+    int SizeInBytes,
+    /// Cache operation
+    CacheOperation::Kind cache_op = CacheOperation::Always>
+struct cp_async;
+
+/// Initiates an asynchronous copy from global memory to shared memory. Rather than predicate
+/// the entire transfer, zeros are written to SMEM if the guard predicate is false.
+/// Primary template: declared only.
+/// cp.async
+///
+template <
+    /// Size of the access in bytes
+    int SizeInBytes,
+    /// Cache operation
+    CacheOperation::Kind cache_op = CacheOperation::Always>
+struct cp_async_zfill;
+
+/// Initiates an asynchronous copy from global memory to shared memory. Rather than predicate
+/// the entire transfer, NaNs (f16 bit pattern 0x7eff) are written to SMEM if the guard predicate is false.
+/// Primary template: declared only; this header specializes it for 16-byte accesses.
+/// cp.async
+///
+template <
+    /// Size of the access in bytes
+    int SizeInBytes,
+    /// Cache operation
+    CacheOperation::Kind cache_op = CacheOperation::Always>
+struct cp_async_nan;
+
+/// Writes a constant to SMEM: one (1) when IsHermitianData is false, zero when it is true.
+/// Used for diagonal elements of triangular matrix of BLAS3 functions
+/// Primary template: declared only; the store width is chosen from the Element type.
+/// st.shared
+///
+template <
+   /// Type of Element
+   typename Element,
+   /// If the data is for a Hermitian matrix diagonal
+   bool IsHermitianData = false>
+struct cp_async_diag;
+
+static const uint32_t OOB_NAN_F16 = 0x7eff;  // f16 NaN bit pattern used as the out-of-bounds filler
+static const uint32_t OOB_NAN_F16x2 = ((OOB_NAN_F16 << 16) | OOB_NAN_F16);  // the same pattern in both 16-bit halves
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for CacheOperation::Always (issues cp.async.ca).
+template <
+    /// Size of the access in bytes
+    int SizeInBytes>
+struct cp_async<SizeInBytes, CacheOperation::Always> {
+
+  /// Copies SizeInBytes bytes from global_ptr to smem_ptr; nothing is copied when pred_guard is false.
+  CUTLASS_DEVICE
+  cp_async(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
+    #if CUDA_CP_ASYNC_ACTIVATED
+
+      // Make sure the size is supported.
+      static_assert((SizeInBytes == 4 || SizeInBytes == 8 || SizeInBytes == 16),
+                "Size is not supported");
+
+      unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);
+      cutlass::arch::synclog_emit_cp_async(__LINE__, smem_int_ptr, global_ptr, pred_guard, SizeInBytes);
+      asm volatile(
+          "{\n"
+          "  .reg .pred p;\n"
+          "  setp.ne.b32 p, %0, 0;\n"
+#if CUTLASS_ENABLE_L2_PREFETCH
+          "  @p cp.async.ca.shared.global.L2::128B [%1], [%2], %3;\n"
+#else
+          "  @p cp.async.ca.shared.global [%1], [%2], %3;\n"
+#endif
+          "}\n" ::"r"((int)pred_guard),
+          "r"(smem_int_ptr), "l"(global_ptr), "n"(SizeInBytes));
+
+    #else
+      using AccessType  = Array<uint8_t, SizeInBytes>;
+      // Fallback when cp.async is not compiled in: an ordinary guarded copy of the byte array.
+      if (pred_guard) {
+        *static_cast<AccessType *>(smem_ptr) = *static_cast<AccessType const *>(global_ptr);
+      }
+    #endif
+  }
+};
+
+/// Partial specialization for CacheOperation::Always (issues cp.async.ca).
+template <
+    /// Size of the access in bytes
+    int SizeInBytes>
+struct cp_async_zfill<SizeInBytes, CacheOperation::Always> {
+
+  /// Copy with zero fill: copies SizeInBytes bytes when pred_guard is true, otherwise stores zeros.
+  CUTLASS_DEVICE
+  cp_async_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard) {
+    #if CUDA_CP_ASYNC_ACTIVATED
+
+      // Make sure the size is supported.
+      static_assert((SizeInBytes == 4 || SizeInBytes == 8 || SizeInBytes == 16),
+                "Size is not supported");
+      // A source size of 0 is passed when the guard is false, so the whole destination is zero-filled.
+      unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);
+      int src_in_bytes = (pred_guard ? SizeInBytes : 0);
+      cutlass::arch::synclog_emit_cp_async_zfill(__LINE__, smem_int_ptr, global_ptr, pred_guard, SizeInBytes);
+      asm volatile(
+#if CUTLASS_ENABLE_L2_PREFETCH
+        "cp.async.ca.shared.global.L2::128B [%0], [%1], %2, %3;\n" ::"r"(smem_int_ptr),
+#else
+        "cp.async.ca.shared.global [%0], [%1], %2, %3;\n" ::"r"(smem_int_ptr),
+#endif
+        "l"(global_ptr), "n"(SizeInBytes), "r"(src_in_bytes));
+
+    #else
+      using AccessType  = Array<uint8_t, SizeInBytes>;
+      // Fallback when cp.async is not compiled in: copy the bytes or store an all-zero array.
+      if (pred_guard) {
+        *static_cast<AccessType *>(smem_ptr) = *static_cast<AccessType const *>(global_ptr);
+      }
+      else {
+        AccessType zeros;
+        zeros.clear();
+        *static_cast<AccessType *>(smem_ptr) = zeros;
+      }
+    #endif
+  }
+};
+
+/// Full specialization: 16-byte access with CacheOperation::Always (issues cp.async.ca).
+template <>
+struct cp_async_nan<16, CacheOperation::Always> {
+  static int const kSizeInBytes = 16;
+
+  /// Copy with nan fill: copies 16 bytes when pred_guard is true, otherwise stores eight f16 NaNs.
+  CUTLASS_DEVICE
+  cp_async_nan(void *smem_ptr, void const *global_ptr, bool pred_guard) {
+    #if CUDA_CP_ASYNC_ACTIVATED
+      // Four 32-bit words, each holding two copies of the out-of-bounds f16 NaN pattern.
+      static __constant__ uint4 OOB_NAN_F16x8 = {OOB_NAN_F16x2, OOB_NAN_F16x2,
+                                                 OOB_NAN_F16x2, OOB_NAN_F16x2};
+
+      unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);
+      cutlass::arch::synclog_emit_cp_async_nan(__LINE__, smem_int_ptr, global_ptr, pred_guard);
+      asm volatile(
+          "{\n"
+          "  .reg .pred p;\n"
+          "  setp.ne.b32 p, %0, 0;\n"
+#if CUTLASS_ENABLE_L2_PREFETCH
+          "  @p cp.async.ca.shared.global.L2::128B [%1], [%2], %3;\n"
+#else
+          "  @p cp.async.ca.shared.global [%1], [%2], %3;\n"
+#endif
+          "  @!p st.shared.v4.u32 [%1], {%4, %5, %6, %7};\n"
+          "}\n"
+          :
+          : "r"((int)pred_guard), "r"(smem_int_ptr), "l"(global_ptr),
+            "n"(kSizeInBytes), "r"(OOB_NAN_F16x8.x), "r"(OOB_NAN_F16x8.y), "r"(OOB_NAN_F16x8.z),
+            "r"(OOB_NAN_F16x8.w));
+
+    #else
+      // No fallback exists for this operation when cp.async is not compiled in.
+      CUTLASS_UNUSED(smem_ptr);
+      CUTLASS_UNUSED(global_ptr);
+      CUTLASS_UNUSED(pred_guard);
+      CUTLASS_NOT_IMPLEMENTED();
+
+    #endif
+  }
+};
+
+/// Partial specialization to write one (1)
+template<typename Element_>
+struct cp_async_diag <Element_, false> {
+  using Element = Element_;
+  // Stores the bit pattern of 1 (with zero imaginary words for the complex types) at smem_ptr.
+  CUTLASS_DEVICE
+  cp_async_diag(void *smem_ptr) {
+    #if CUDA_CP_ASYNC_ACTIVATED
+
+      /// Values for the diagonal elements of the triangular input matrix
+      static __constant__ uint2 DIAG_DATA_DOUBLE_ONE = {0x3ff00000, 0x00000000};  // .x = high word of 1.0, .y = low word
+      static __constant__ uint1 DIAG_DATA_FLOAT_ONE = {0x3f800000};  // IEEE-754 bits of 1.0f
+      static __constant__ uint1 DIAG_DATA_ZERO = {0x00000000};
+
+      unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);
+      // Branch on the element type; the four supported types store 16, 8, 8 and 4 bytes respectively.
+      if (platform::is_same<Element, complex<double>>::value) {  // words: low(1.0), high(1.0), 0, 0
+        asm volatile("st.shared.v4.u32 [%0], {%1, %2, %3, %4};\n"
+                      : :
+                      "r"(smem_int_ptr), "r"(DIAG_DATA_DOUBLE_ONE.y), "r"(DIAG_DATA_DOUBLE_ONE.x),
+                      "r"(DIAG_DATA_ZERO.x), "r"(DIAG_DATA_ZERO.x));
+      } else if (platform::is_same<Element, complex<float>>::value) {  // words: 1.0f, 0
+        asm volatile("st.shared.v2.u32 [%0], {%1, %2};\n"
+                      : :
+                      "r"(smem_int_ptr), "r"(DIAG_DATA_FLOAT_ONE.x), "r"(DIAG_DATA_ZERO.x));
+      } else if (platform::is_same<Element, double>::value) {  // words: low(1.0), high(1.0)
+        asm volatile("st.shared.v2.u32 [%0], {%1, %2};\n"
+                      : :
+                      "r"(smem_int_ptr), "r"(DIAG_DATA_DOUBLE_ONE.y),"r"(DIAG_DATA_DOUBLE_ONE.x));
+      } else if (platform::is_same<Element, float>::value) {  // word: 1.0f
+        asm volatile("st.shared.u32 [%0], %1;\n"
+                      : :
+                      "r"(smem_int_ptr), "r"(DIAG_DATA_FLOAT_ONE.x));
+      } else {
+        CUTLASS_UNUSED(smem_int_ptr);
+        CUTLASS_NOT_IMPLEMENTED();
+      }
+
+    #else
+      // No fallback exists for this operation when cp.async support is not compiled in.
+      CUTLASS_UNUSED(smem_ptr);
+      CUTLASS_NOT_IMPLEMENTED();
+
+    #endif
+  }
+};
+
+/// Partial specialization to write zero for the imaginary part of Hermitian data
+template<typename Element_>
+struct cp_async_diag <Element_, true> {
+  using Element = Element_;
+  // NOTE(review): smem_ptr presumably addresses the imaginary component itself - confirm with callers.
+  CUTLASS_DEVICE
+  cp_async_diag(void *smem_ptr) {
+    #if CUDA_CP_ASYNC_ACTIVATED
+
+      /// Values for the diagonal elements of the triangular input matrix
+      static __constant__ uint1 DIAG_DATA_ZERO = {0x00000000};
+
+      unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);
+      // Only the two complex element types are handled: 8 zero bytes or 4 zero bytes are stored.
+      if (platform::is_same<Element, complex<double>>::value) {
+        asm volatile("st.shared.v2.u32 [%0], {%1, %2};\n"
+                      : :
+                      "r"(smem_int_ptr), "r"(DIAG_DATA_ZERO.x), "r"(DIAG_DATA_ZERO.x));
+      } else if (platform::is_same<Element, complex<float>>::value) {
+        asm volatile("st.shared.u32 [%0], %1;\n"
+                      : :
+                      "r"(smem_int_ptr), "r"(DIAG_DATA_ZERO.x));
+      } else {
+        CUTLASS_UNUSED(smem_int_ptr);
+        CUTLASS_NOT_IMPLEMENTED();
+      }
+
+    #else
+      // No fallback exists for this operation when cp.async support is not compiled in.
+      CUTLASS_UNUSED(smem_ptr);
+      CUTLASS_NOT_IMPLEMENTED();
+
+    #endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for CacheOperation::Global (issues cp.async.cg).
+template <
+    /// Size of the access in bytes
+    int SizeInBytes>
+struct cp_async<SizeInBytes, CacheOperation::Global> {
+
+  /// Copies SizeInBytes bytes from global_ptr to smem_ptr; nothing is copied when pred_guard is false.
+  CUTLASS_DEVICE
+  cp_async(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
+    #if CUDA_CP_ASYNC_ACTIVATED
+      // The size check applies only on this path; the #else fallback accepts any SizeInBytes.
+      static_assert(SizeInBytes == 16,
+        "cp.async only supports CacheOperation::Global when access size is 16B.");
+
+      unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);
+      cutlass::arch::synclog_emit_cp_async(__LINE__, smem_int_ptr, global_ptr, pred_guard, SizeInBytes);
+      // Operands: %0 = guard, %1 = shared address, %2 = global pointer, %3 = byte count.
+      asm volatile(
+          "{\n"
+          "  .reg .pred p;\n"
+          "  setp.ne.b32 p, %0, 0;\n"
+#if CUTLASS_ENABLE_L2_PREFETCH
+          "  @p cp.async.cg.shared.global.L2::128B [%1], [%2], %3;\n"
+#else
+          "  @p cp.async.cg.shared.global [%1], [%2], %3;\n"
+#endif
+          "}\n" ::"r"((int)pred_guard),
+          "r"(smem_int_ptr), "l"(global_ptr), "n"(SizeInBytes));
+
+    #else
+      using AccessType  = Array<uint8_t, SizeInBytes>;
+      // Fallback when cp.async is not compiled in: an ordinary guarded copy of the byte array.
+      if (pred_guard) {
+        *static_cast<AccessType *>(smem_ptr) = *static_cast<AccessType const *>(global_ptr);
+      }
+    #endif
+  }
+};
+
+/// Partial specialization for CacheOperation::Global (issues cp.async.cg).
+template <
+    /// Size of the access in bytes
+    int SizeInBytes>
+struct cp_async_zfill<SizeInBytes, CacheOperation::Global> {
+
+  /// Copy with zero fill: copies SizeInBytes bytes when pred_guard is true, otherwise stores zeros.
+  CUTLASS_DEVICE
+  cp_async_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
+    #if CUDA_CP_ASYNC_ACTIVATED
+      // The size check applies only on this path; the #else fallback accepts any SizeInBytes.
+      static_assert(SizeInBytes == 16,
+        "cp.async only supports CacheOperation::Global when access size is 16B.");
+      // A source size of 0 is passed when the guard is false, so the whole destination is zero-filled.
+      unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);
+      int src_in_bytes = (pred_guard ? SizeInBytes : 0);
+      cutlass::arch::synclog_emit_cp_async_zfill(__LINE__, smem_int_ptr, global_ptr, pred_guard, SizeInBytes);
+      // Operands: %0 = shared address, %1 = global pointer, %2 = copy size, %3 = source size.
+      asm volatile(
+#if CUTLASS_ENABLE_L2_PREFETCH
+        "cp.async.cg.shared.global.L2::128B [%0], [%1], %2, %3;\n" ::"r"(smem_int_ptr),
+#else
+        "cp.async.cg.shared.global [%0], [%1], %2, %3;\n" ::"r"(smem_int_ptr),
+#endif
+        "l"(global_ptr), "n"(SizeInBytes), "r"(src_in_bytes));
+
+    #else
+      using AccessType  = Array<uint8_t, SizeInBytes>;
+      // Fallback when cp.async is not compiled in: copy the bytes or store an all-zero array.
+      if (pred_guard) {
+        *static_cast<AccessType *>(smem_ptr) = *static_cast<AccessType const *>(global_ptr);
+      }
+      else {
+        AccessType zeros;
+        zeros.clear();
+        *static_cast<AccessType *>(smem_ptr) = zeros;
+      }
+    #endif
+  }
+};
+
+/// Full specialization: 16-byte access with CacheOperation::Global (issues cp.async.cg).
+template <>
+struct cp_async_nan<16, CacheOperation::Global> {
+  static int const kSizeInBytes = 16;
+
+  /// Copy with nan fill: copies 16 bytes when pred_guard is true, otherwise stores eight f16 NaNs.
+  CUTLASS_DEVICE
+  cp_async_nan(void *smem_ptr, void const *global_ptr, bool pred_guard) {
+    #if CUDA_CP_ASYNC_ACTIVATED
+      // Four 32-bit words, each holding two copies of the out-of-bounds f16 NaN pattern.
+      static __constant__ uint4 OOB_NAN_F16x8 = {OOB_NAN_F16x2, OOB_NAN_F16x2,
+                                                 OOB_NAN_F16x2, OOB_NAN_F16x2};
+
+      unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);
+      cutlass::arch::synclog_emit_cp_async_nan(__LINE__, smem_int_ptr, global_ptr, pred_guard);
+      // Predicate p selects between the async copy (@p) and the NaN store (@!p).
+      asm volatile(
+          "{\n"
+          "  .reg .pred p;\n"
+          "  setp.ne.b32 p, %0, 0;\n"
+#if CUTLASS_ENABLE_L2_PREFETCH
+          "  @p cp.async.cg.shared.global.L2::128B [%1], [%2], %3;\n"
+#else
+          "  @p cp.async.cg.shared.global [%1], [%2], %3;\n"
+#endif
+          "  @!p st.shared.v4.u32 [%1], {%4, %5, %6, %7};\n"
+          "}\n"
+          :
+          : "r"((int)pred_guard), "r"(smem_int_ptr), "l"(global_ptr),
+            "n"(kSizeInBytes), "r"(OOB_NAN_F16x8.x), "r"(OOB_NAN_F16x8.y), "r"(OOB_NAN_F16x8.z),
+            "r"(OOB_NAN_F16x8.w));
+
+    #else
+      // No fallback exists for this operation when cp.async is not compiled in.
+      CUTLASS_UNUSED(smem_ptr);
+      CUTLASS_UNUSED(global_ptr);
+      CUTLASS_UNUSED(pred_guard);
+      CUTLASS_NOT_IMPLEMENTED();
+
+    #endif
+  }
+};
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Establishes an ordering w.r.t previously issued cp.async instructions. Does not block.
+CUTLASS_DEVICE
+void cp_async_fence() {
+  #if CUDA_CP_ASYNC_ACTIVATED  // the function body is empty when cp.async is not compiled in
+  asm volatile("cp.async.commit_group;\n" ::);
+  cutlass::arch::synclog_emit_cp_async_fence(__LINE__);
+  #endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Blocks until all but <N> previous cp.async.commit_group operations have committed.
+template <int N>
+CUTLASS_DEVICE void cp_async_wait() {
+  #if CUDA_CP_ASYNC_ACTIVATED  // the function body is empty when cp.async is not compiled in
+  asm volatile("cp.async.wait_group %0;\n" ::"n"(N));  // N is passed as an immediate operand
+  cutlass::arch::synclog_emit_cp_async_wait(__LINE__, N);
+  #endif
+}
+
+/// Blocks until all previous cp.async.commit_group operations have committed.
+template <>
+CUTLASS_DEVICE void cp_async_wait<0>() {
+  #if CUDA_CP_ASYNC_ACTIVATED  // the function body is empty when cp.async is not compiled in
+  asm volatile("cp.async.wait_all;\n" ::);  // N == 0 uses wait_all instead of wait_group 0
+  cutlass::arch::synclog_emit_cp_async_wait_all(__LINE__);
+  #endif
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace arch
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/mma.h b/lightllm-kernel/cutlass/include/cutlass/arch/mma.h
new file mode 100755
index 000000000..007ba19be
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/arch/mma.h
@@ -0,0 +1,269 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates exposing architecture support for multiply-add operations
+*/
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/functional.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/arch/arch.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace arch {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Tag indicating the operation implied by MMA (plain multiply-add).
+struct OpMultiplyAdd {};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Tag indicating the result is saturated to MAX_FLOAT|MIN_FLOAT or MAX_INT|MIN_INT
+struct OpMultiplyAddSaturate {};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Tag indicating the input is converted to a narrower type (BF16)
+struct OpMultiplyAddFastBF16 {};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Tag indicating the input is converted to a narrower type (F16)
+struct OpMultiplyAddFastF16 {};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Tag indicating the input data types are mixed and the narrower type is
+/// upcast to the wider type
+struct OpMultiplyAddMixedInputUpcast {};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Tag indicating the input is converted to 2 (big and small) TF32 components.
+/// Performs 3xTF32 or 4xTF32 for every F32 output element.
+struct OpMultiplyAddFastF32 {};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Tag indicating the input is converted to 2 (big and small) TF32 components.
+/// Performs 3xTF32 or 4xTF32 for every complex<F32> output element.
+struct OpMultiplyAddComplexFastF32 {};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Tag indicating that staged accumulation is not to be used. This is valid only for SM89
+/// FP8 kernels. Only declared here (no body), unlike the other tags in this header.
+struct OpMultiplyAddFastAccum;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Tag indicating the complex multiply-add operation
+struct OpMultiplyAddComplex {};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Tag indicating the Gaussian complex multiply-add operation
+struct OpMultiplyAddGaussianComplex {};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Tag indicating the inner product is defined by (XOR, POPC)
+struct OpXorPopc {};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Tag indicating the inner product is defined by (AND, POPC)
+struct OpAndPopc {};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Tag classifying math operators as thread-level (SIMT) operations.
+struct OpClassSimt {};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Tag classifying operators as Tensor Core operations.
+struct OpClassTensorOp {};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Tag classifying operators as WMMA Tensor Core operations.
+struct OpClassWmmaTensorOp {};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Tag classifying operators as Tensor Core operations with structured sparsity.
+struct OpClassSparseTensorOp {};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Matrix multiply-add operation. Primary template: declared only; specializations implement it.
+template <
+  /// Size of the matrix product (concept: GemmShape)
+  typename Shape_,
+  /// Number of threads participating
+  int kThreads_,
+  /// Data type of A elements
+  typename ElementA,
+  /// Layout of A matrix (concept: MatrixLayout)
+  typename LayoutA,
+  /// Data type of B elements
+  typename ElementB,
+  /// Layout of B matrix (concept: MatrixLayout)
+  typename LayoutB,
+  /// Element type of C matrix
+  typename ElementC,
+  /// Layout of C matrix (concept: MatrixLayout)
+  typename LayoutC,
+  /// Inner product operator (a tag type such as OpMultiplyAdd)
+  typename Operator
+>
+struct Mma;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Matrix multiply-add operation - specialized for 1x1x1x1 matrix multiply operation.
+///
+/// Scalar path for a single thread: element types, layouts and the operator tag are left
+/// generic, and the arithmetic is delegated to the multiply_add functor.
+template <
+  typename ElementA,   ///< Data type of A elements
+  typename LayoutA,    ///< Layout of A matrix (concept: MatrixLayout)
+  typename ElementB,   ///< Data type of B elements
+  typename LayoutB,    ///< Layout of B matrix (concept: MatrixLayout)
+  typename ElementC_,  ///< Element type of C matrix
+  typename LayoutC,    ///< Layout of C matrix (concept: MatrixLayout)
+  typename Operator_   ///< Inner product operator
+>
+struct Mma<gemm::GemmShape<1, 1, 1>, 1, ElementA, LayoutA, ElementB, LayoutB, ElementC_, LayoutC, Operator_> {
+
+  using Shape = gemm::GemmShape<1, 1, 1>;
+  using Operator = Operator_;
+  using ElementC = ElementC_;
+
+  /// Writes multiply_add(a[0], b[0], c[0]) into d[0].
+  ///
+  /// \param d destination accumulator (one element)
+  /// \param a operand A (one element)
+  /// \param b operand B (one element)
+  /// \param c source accumulator (one element)
+  CUTLASS_HOST_DEVICE
+  void operator()(Array<ElementC, 1> &d,
+                  Array<ElementA, 1> const &a,
+                  Array<ElementB, 1> const &b,
+                  Array<ElementC, 1> const &c) {
+    using MultiplyAdd = multiply_add<ElementA, ElementB, ElementC>;
+
+    MultiplyAdd fused_op;
+    d[0] = fused_op(a[0], b[0], c[0]);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Specifies internal data type for computation; used as the SPFormat parameter of SparseMma.
+struct SPFormatType {
+  enum Kind {
+    Thread  // the only enumerator defined here; it is also SparseMma's default SPFormat
+  };
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Sparse matrix multiply-add operation. Primary template: declared only, no definition in this header.
+template <
+  /// Size of the matrix product (concept: GemmShape)
+  typename Shape_,
+  /// Number of threads participating
+  int kThreads_,
+  /// Data type of A elements
+  typename ElementA,
+  /// Layout of A matrix (concept: MatrixLayout)
+  typename LayoutA,
+  /// Data type of B elements
+  typename ElementB,
+  /// Layout of B matrix (concept: MatrixLayout)
+  typename LayoutB,
+  /// Element type of C matrix
+  typename ElementC,
+  /// Layout of C matrix (concept: MatrixLayout)
+  typename LayoutC,
+  /// Inner product operator (a tag type such as OpMultiplyAdd)
+  typename Operator,
+  /// Specifies meta data format
+  SPFormatType::Kind SPFormat = SPFormatType::Thread
+>
+struct SparseMma;
+
+} // namespace arch
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+//
+// Specializations for each compute capability
+//
+
+#include "cutlass/arch/mma_sm50.h"
+#include "cutlass/arch/mma_sm60.h"
+#include "cutlass/arch/mma_sm61.h"
+#include "cutlass/arch/mma_sm70.h"
+#include "cutlass/arch/mma_sm75.h"
+#include "cutlass/arch/mma_sm80.h"
+#include "cutlass/arch/mma_sparse_sm80.h"
+#include "cutlass/arch/mma_sm89.h"
+#include "cutlass/arch/mma_sparse_sm89.h"
+#include "cutlass/arch/mma_sm90.h"
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace arch {
+namespace detail {
+/// Helper for determining whether staged accumulation should be used for a given operator
+template <typename Operator>  // Operator must expose a nested MathOperator type
+struct UseStagedAccumulation {
+  static bool const value = platform::is_same<typename Operator::MathOperator, OpMultiplyAddFastF32>::value ||  // 3xTF32/4xTF32 real
+                            platform::is_same<typename Operator::MathOperator, OpMultiplyAddComplexFastF32>::value ||  // same, complex
+                            is_sm89_staged_policy_v<Operator>;  // trait not defined in this header; presumably from mma_sm89.h
+};
+} // namespace detail
+} // namespace arch
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm50.h b/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm50.h
new file mode 100755
index 000000000..98ff18bea
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm50.h
@@ -0,0 +1,432 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Matrix multiply
+*/
+
+#pragma once
+
+#include "cutlass/arch/mma.h"
+#include "cutlass/complex.h"
+#include "cutlass/quaternion.h"
+#include "cutlass/functional.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/gemm/gemm.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace arch {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Matrix multiply-add operation: single-thread 1x1x1 specialization for float operands
+template <
+  /// Layout of A matrix
+  typename LayoutA,
+  /// Layout of B matrix
+  typename LayoutB,
+  /// Layout of C matrix
+  typename LayoutC
+>
+struct Mma<gemm::GemmShape<1, 1, 1>, 1, float, LayoutA, float, LayoutB, float, LayoutC, OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<1, 1, 1>;
+  using Operator = OpMultiplyAdd;
+  using ElementC = float;
+  // Computes d[0] = a[0] * b[0] + c[0]; the layout parameters take no part in the arithmetic.
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    Array<float, 1> &d,
+    Array<float, 1> const &a,
+    Array<float, 1> const &b,
+    Array<float, 1> const &c
+  ) {
+    d[0] = a[0] * b[0] + c[0];
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Matrix multiply-add operation: single-thread 1x1x1 specialization for double operands
+template <
+  /// Layout of A matrix
+  typename LayoutA,
+  /// Layout of B matrix
+  typename LayoutB,
+  /// Layout of C matrix
+  typename LayoutC
+>
+struct Mma<gemm::GemmShape<1, 1, 1>, 1, double, LayoutA, double, LayoutB, double, LayoutC, OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<1, 1, 1>;
+  using Operator = OpMultiplyAdd;
+  using ElementC = double;
+  // Computes d[0] = a[0] * b[0] + c[0]; the layout parameters take no part in the arithmetic.
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    Array<double, 1> &d,
+    Array<double, 1> const &a,
+    Array<double, 1> const &b,
+    Array<double, 1> const &c
+  ) {
+
+    d[0] = a[0] * b[0] + c[0];
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Single-thread 1x1x1 matrix multiply-add for int operands.
+template <
+  typename LayoutA,  ///< Layout of A matrix
+  typename LayoutB,  ///< Layout of B matrix
+  typename LayoutC   ///< Layout of C matrix
+>
+struct Mma<gemm::GemmShape<1, 1, 1>, 1, int, LayoutA, int, LayoutB, int, LayoutC, OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<1, 1, 1>;
+  using Operator = OpMultiplyAdd;
+  using ElementC = int;
+
+  /// Computes d[0] = a[0] * b[0] + c[0] in int arithmetic.
+  /// \param d destination accumulator (one element)
+  /// \param a operand A (one element)
+  /// \param b operand B (one element)
+  /// \param c source accumulator (one element)
+  CUTLASS_HOST_DEVICE
+  void operator()(Array<int, 1> &d,
+                  Array<int, 1> const &a,
+                  Array<int, 1> const &b,
+                  Array<int, 1> const &c) {
+    int const product = a[0] * b[0];
+    d[0] = product + c[0];
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Matrix multiply-add operation: single-thread 1x1x1 specialization, complex<float> A and B
+template <
+  /// Layout of A matrix
+  typename LayoutA,
+  /// Layout of B matrix
+  typename LayoutB,
+  /// Layout of C matrix
+  typename LayoutC
+>
+struct Mma<
+  gemm::GemmShape<1, 1, 1>,
+  1,
+  complex<float>,
+  LayoutA,
+  complex<float>,
+  LayoutB,
+  complex<float>,
+  LayoutC,
+  OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<1, 1, 1>;
+  using Operator = OpMultiplyAddComplex;  // selected by the OpMultiplyAdd tag but reports the complex tag
+  using ElementC = complex<float>;
+  // Computes d[0] = a[0] * b[0] + c[0] as four real multiply-adds.
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    Array<complex<float>, 1> &d,
+    Array<complex<float>, 1> const &a,
+    Array<complex<float>, 1> const &b,
+    Array<complex<float>, 1> const &c
+  ) {
+    // d is written before a and b are fully read, so d must not alias a or b.
+    d[0].real() = a[0].real() * b[0].real() + c[0].real();
+    d[0].imag() = a[0].imag() * b[0].real() + c[0].imag();
+    d[0].real() = -a[0].imag() * b[0].imag() + d[0].real();
+    d[0].imag() = a[0].real() * b[0].imag() + d[0].imag();
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Scalar (1x1x1) mixed multiply-add: complex<float> = complex<float> * float + complex<float>
+template <
+  /// Layout of A matrix
+  typename LayoutA,
+  /// Layout of B matrix
+  typename LayoutB,
+  /// Layout of C matrix
+  typename LayoutC
+>
+struct Mma<
+  gemm::GemmShape<1, 1, 1>,
+  1,
+  complex<float>,
+  LayoutA,
+  float,
+  LayoutB,
+  complex<float>,
+  LayoutC,
+  OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<1, 1, 1>;
+  using Operator = OpMultiplyAddComplex;
+  using ElementC = complex<float>;
+
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    Array<complex<float>, 1> &d,
+    Array<complex<float>, 1> const &a,
+    Array<float, 1> const &b,
+    Array<complex<float>, 1> const &c
+  ) {
+
+    d[0].real() = a[0].real() * b[0] + c[0].real();  // real b scales a.re
+    d[0].imag() = a[0].imag() * b[0] + c[0].imag();  // real b scales a.im
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Scalar (1x1x1) mixed multiply-add: complex<float> = float * complex<float> + complex<float>
+template <
+  /// Layout of A matrix
+  typename LayoutA,
+  /// Layout of B matrix
+  typename LayoutB,
+  /// Layout of C matrix
+  typename LayoutC
+>
+struct Mma<
+  gemm::GemmShape<1, 1, 1>,
+  1,
+  float,
+  LayoutA,
+  complex<float>,
+  LayoutB,
+  complex<float>,
+  LayoutC,
+  OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<1, 1, 1>;
+  using Operator = OpMultiplyAddComplex;
+  using ElementC = complex<float>;
+
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    Array<complex<float>, 1> &d,
+    Array<float, 1> const &a,
+    Array<complex<float>, 1> const &b,
+    Array<complex<float>, 1> const &c
+  ) {
+
+    d[0].real() = a[0] * b[0].real() + c[0].real();  // real a scales b.re
+    d[0].imag() = a[0] * b[0].imag() + c[0].imag();  // accumulate from c, not from the not-yet-written d.imag
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Scalar (1x1x1) complex multiply-add: complex<double> = complex<double> * complex<double> + complex<double>
+template <
+  /// Layout of A matrix
+  typename LayoutA,
+  /// Layout of B matrix
+  typename LayoutB,
+  /// Layout of C matrix
+  typename LayoutC
+>
+struct Mma<
+  gemm::GemmShape<1, 1, 1>,
+  1,
+  complex<double>,
+  LayoutA,
+  complex<double>,
+  LayoutB,
+  complex<double>,
+  LayoutC,
+  OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<1, 1, 1>;
+  using Operator = OpMultiplyAddComplex;
+  using ElementC = complex<double>;
+
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    Array<complex<double>, 1> &d,
+    Array<complex<double>, 1> const &a,
+    Array<complex<double>, 1> const &b,
+    Array<complex<double>, 1> const &c
+  ) {
+
+    d[0].real() = a[0].real() * b[0].real() + c[0].real();   // re  = a.re * b.re + c.re
+    d[0].imag() = a[0].imag() * b[0].real() + c[0].imag();   // im  = a.im * b.re + c.im
+    d[0].real() = -a[0].imag() * b[0].imag() + d[0].real();  // re -= a.im * b.im (accumulates into d)
+    d[0].imag() = a[0].real() * b[0].imag() + d[0].imag();   // im += a.re * b.im (accumulates into d)
+  }
+};
+
+/// Scalar (1x1x1) mixed multiply-add: complex<double> = complex<double> * double + complex<double>
+template <
+  /// Layout of A matrix
+  typename LayoutA,
+  /// Layout of B matrix
+  typename LayoutB,
+  /// Layout of C matrix
+  typename LayoutC
+>
+struct Mma<
+  gemm::GemmShape<1, 1, 1>,
+  1,
+  complex<double>,
+  LayoutA,
+  double,
+  LayoutB,
+  complex<double>,
+  LayoutC,
+  OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<1, 1, 1>;
+  using Operator = OpMultiplyAddComplex;
+  using ElementC = complex<double>;
+
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    Array<complex<double>, 1> &d,
+    Array<complex<double>, 1> const &a,
+    Array<double, 1> const &b,
+    Array<complex<double>, 1> const &c
+  ) {
+
+    d[0].real() = a[0].real() * b[0] + c[0].real();  // real b scales a.re
+    d[0].imag() = a[0].imag() * b[0] + c[0].imag();  // real b scales a.im
+  }
+};
+
+/// Scalar (1x1x1) mixed multiply-add: complex<double> = double * complex<double> + complex<double>
+template <
+  /// Layout of A matrix
+  typename LayoutA,
+  /// Layout of B matrix
+  typename LayoutB,
+  /// Layout of C matrix
+  typename LayoutC
+>
+struct Mma<
+  gemm::GemmShape<1, 1, 1>,
+  1,
+  double,
+  LayoutA,
+  complex<double>,
+  LayoutB,
+  complex<double>,
+  LayoutC,
+  OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<1, 1, 1>;
+  using Operator = OpMultiplyAddComplex;
+  using ElementC = complex<double>;
+
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    Array<complex<double>, 1> &d,
+    Array<double, 1> const &a,
+    Array<complex<double>, 1> const &b,
+    Array<complex<double>, 1> const &c
+  ) {
+
+    d[0].real() = a[0] * b[0].real() + c[0].real();  // real a scales b.re
+    d[0].imag() = a[0] * b[0].imag() + c[0].imag();  // accumulate from c, not from the not-yet-written d.imag
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Scalar (1x1x1) multiply-add: float = half_t * half_t + float, with both inputs widened to float first
+template <
+  /// Layout of A matrix
+  typename LayoutA,
+  /// Layout of B matrix
+  typename LayoutB,
+  /// Layout of C matrix
+  typename LayoutC
+>
+struct Mma<gemm::GemmShape<1, 1, 1>, 1, half_t, LayoutA, half_t, LayoutB, float, LayoutC, OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<1, 1, 1>;
+  using Operator = OpMultiplyAdd;
+  using ElementC = float;
+
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    Array<float, 1> &d,
+    Array<half_t, 1> const &a,
+    Array<half_t, 1> const &b,
+    Array<float, 1> const &c
+  ) {
+    d[0] = float(a[0]) * float(b[0]) + c[0];  // product and sum are evaluated in float
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Scalar (1x1x1) multiply-add for Quaternion<float>, delegated to the multiply_add functor
+template <
+  /// Layout of A matrix
+  typename LayoutA,
+  /// Layout of B matrix
+  typename LayoutB,
+  /// Layout of C matrix
+  typename LayoutC
+>
+struct Mma<gemm::GemmShape<1, 1, 1>, 1, Quaternion<float>, LayoutA, Quaternion<float>, LayoutB, Quaternion<float>, LayoutC, OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<1, 1, 1>;
+  using Operator = OpMultiplyAdd;
+  using Element = Quaternion<float>;
+  using ElementC = Element;
+
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    Array<Element, 1> &d,
+    Array<Element, 1> const &a,
+    Array<Element, 1> const &b,
+    Array<Element, 1> const &c
+  ) {
+    multiply_add<Element, Element, Element> op;  // functor is declared outside this header chunk
+    d[0] = op(a[0], b[0], c[0]);
+  }
+
+};
+
+}
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm60.h b/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm60.h
new file mode 100755
index 000000000..3e3c71ef3
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm60.h
@@ -0,0 +1,252 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Matrix multiply
+*/
+
+#pragma once
+
+#include <cuda_fp16.h>
+
+#include "cutlass/arch/mma.h"
+
+#include "cutlass/layout/matrix.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace arch {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Multiply-add on a 2x1x1 half_t tile: d[i] = a[i] * b[0] + c[i]; one packed __hfma2 when __CUDA_ARCH__ >= 600
+template <typename LayoutA, typename LayoutB, typename LayoutC>
+struct Mma<
+  gemm::GemmShape<2,1,1>,
+  1,
+  half_t,
+  LayoutA,
+  half_t,
+  LayoutB,
+  half_t,
+  LayoutC,
+  OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<2, 1, 1>;
+  using Operator = OpMultiplyAdd;
+  using ElementC = half_t;
+
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    Array<half_t, 2> &d,
+    Array<half_t, 2> const &a,
+    Array<half_t, 1> const &b,
+    Array<half_t, 2> const &c
+  ) {
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600))
+
+    __half2 const & A = reinterpret_cast<__half2 const &>(a);
+    __half2 B = __half2half2(reinterpret_cast<__half const &>(b));  // duplicate b[0] into both lanes
+    __half2 const & C = reinterpret_cast<__half2 const &>(c);
+
+    __half2 D = __hfma2(A, B, C);
+
+    d = reinterpret_cast<Array<half_t, 2> &>(D);
+
+#else
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < 2; ++i) {
+      d[i] = a[i] * b[0] + c[i];  // element-wise path for host code and older device architectures
+    }
+#endif
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Multiply-add on a 1x2x1 half_t tile (row-major C): d[i] = a[0] * b[i] + c[i]; packed __hfma2 when __CUDA_ARCH__ >= 600
+template <typename LayoutA, typename LayoutB>
+struct Mma<
+  gemm::GemmShape<1,2,1>,
+  1,
+  half_t,
+  LayoutA,
+  half_t,
+  LayoutB,
+  half_t,
+  layout::RowMajor,
+  OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<1, 2, 1>;
+  using Operator = OpMultiplyAdd;
+  using ElementC = half_t;
+
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    Array<half_t, 2> &d,
+    Array<half_t, 1> const &a,
+    Array<half_t, 2> const &b,
+    Array<half_t, 2> const &c
+  ) {
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600))
+
+    __half2 const & A = __half2half2(reinterpret_cast<__half const &>(a));  // duplicate a[0] into both lanes
+    __half2 B = reinterpret_cast<__half2 const &>(b);
+    __half2 const & C = reinterpret_cast<__half2 const &>(c);
+
+    __half2 D = __hfma2(A, B, C);
+
+    d = reinterpret_cast<Array<half_t, 2> &>(D);
+
+#else
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < 2; ++i) {
+      d[i] = a[0] * b[i] + c[i];  // element-wise path for host code and older device architectures
+    }
+#endif
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Multiply-add on a 2x2x1 half_t tile with column-major C: d[i + 2 * j] = a[i] * b[j] + c[i + 2 * j]
+template <>
+struct Mma <
+  gemm::GemmShape<2, 2, 1>,
+  1,
+  half_t,
+  layout::ColumnMajor,
+  half_t,
+  layout::RowMajor,
+  half_t,
+  layout::ColumnMajor,
+  OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<2, 2, 1>;
+  using Operator = OpMultiplyAdd;
+  using ElementC = half_t;
+
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    Array<half_t, 4> &d,
+    Array<half_t, 2> const &a,
+    Array<half_t, 2> const &b,
+    Array<half_t, 4> const &c
+  ) {
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600))
+
+    __half2 const & A = reinterpret_cast<__half2 const &>(a);
+    __half2 Blo = __low2half2(reinterpret_cast<__half2 const &>(b));   // (b[0], b[0])
+    __half2 Bhi = __high2half2(reinterpret_cast<__half2 const &>(b));  // (b[1], b[1])
+
+    __half2 const *C = reinterpret_cast<__half2 const *>(&c);
+
+    __half2 Dlo = __hfma2(A, Blo, C[0]);  // column j = 0: pairs with c[0], c[1]
+    __half2 Dhi = __hfma2(A, Bhi, C[1]);  // column j = 1: pairs with c[2], c[3]
+
+    Array<half_t, 2> * D = reinterpret_cast<Array<half_t, 2> *>(&d);
+
+    D[0] = reinterpret_cast<Array<half_t, 2> const &>(Dlo);
+    D[1] = reinterpret_cast<Array<half_t, 2> const &>(Dhi);
+
+#else
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < 2; ++j) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < 2; ++i) {
+        d[i + 2 * j] = a[i] * b[j] + c[i + 2 * j];
+      }
+    }
+#endif
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Multiply-add on a 2x2x1 half_t tile with row-major C: d[i * 2 + j] = a[i] * b[j] + c[i * 2 + j]
+template <>
+struct Mma<
+  gemm::GemmShape<2, 2, 1>,
+  1,
+  half_t,
+  layout::ColumnMajor,
+  half_t,
+  layout::RowMajor,
+  half_t,
+  layout::RowMajor,
+  OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<2, 2, 1>;
+  using Operator = OpMultiplyAdd;
+  using ElementC = half_t;
+
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    Array<half_t, 4> &d,
+    Array<half_t, 2> const &a,
+    Array<half_t, 2> const &b,
+    Array<half_t, 4> const &c
+  ) {
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600))
+
+    __half2 Alo = __low2half2(reinterpret_cast<__half2 const &>(a));   // (a[0], a[0])
+    __half2 Ahi = __high2half2(reinterpret_cast<__half2 const &>(a));  // (a[1], a[1])
+    __half2 const & B = reinterpret_cast<__half2 const &>(b);
+
+    __half2 const *C = reinterpret_cast<__half2 const *>(&c);
+
+    __half2 Dlo = __hfma2(Alo, B, C[0]);  // row i = 0: pairs with c[0], c[1]
+    __half2 Dhi = __hfma2(Ahi, B, C[1]);  // row i = 1: pairs with c[2], c[3], matching the scalar fallback
+
+    Array<half_t, 2> * D = reinterpret_cast<Array<half_t, 2> *>(&d);
+
+    D[0] = reinterpret_cast<Array<half_t, 2> &>(Dlo);
+    D[1] = reinterpret_cast<Array<half_t, 2> &>(Dhi);
+#else
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < 2; ++i) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < 2; ++j) {
+        d[i * 2 + j] = a[i] * b[j] + c[i * 2 + j];
+      }
+    }
+#endif
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}
+}
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm61.h b/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm61.h
new file mode 100755
index 000000000..82a5aa728
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm61.h
@@ -0,0 +1,142 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Matrix multiply
+*/
+
+#pragma once
+
+#include "cutlass/layout/matrix.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace arch {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Multiply-add on a 1x1x4 tile: d = c + sum_k a[k] * b[k], int8_t inputs, int accumulator (dp4a when __CUDA_ARCH__ >= 610)
+template <typename LayoutA, typename LayoutB, typename LayoutC>
+struct Mma<
+  gemm::GemmShape<1,1,4>,
+  1,
+  int8_t,
+  LayoutA,
+  int8_t,
+  LayoutB,
+  int,
+  LayoutC,
+  OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<1, 1, 4>;
+  using Operator = OpMultiplyAdd;
+  using ElementC = int;
+
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    Array<int, 1> &d,
+    Array<int8_t, 4> const &a,
+    Array<int8_t, 4> const &b,
+    Array<int, 1> const &c
+  ) {
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 610))
+
+    unsigned const &A = reinterpret_cast<unsigned const &>(a);  // four int8_t values viewed as one unsigned word
+    unsigned const &B = reinterpret_cast<unsigned const &>(b);
+
+    asm volatile("dp4a.s32.s32 %0, %1, %2, %3;"
+                 : "=r"(d[0])
+                 : "r"(A), "r"(B), "r"(c[0]));
+
+#else
+
+    d[0] = c[0];  // start from the accumulator, then add the four products
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int k = 0; k < 4; ++k) {
+      d[0] += a[k] * b[k];
+    }
+
+#endif
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Multiply-add on a 1x1x2 tile: d = c + sum_k a[k] * b[k], int16_t inputs (row-major A, column-major B), int accumulator
+template <typename LayoutC>
+struct Mma<
+  gemm::GemmShape<1, 1, 2>,
+  1,
+  int16_t,
+  layout::RowMajor,
+  int16_t,
+  layout::ColumnMajor,
+  int,
+  LayoutC,
+  OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<1, 1, 2>;
+  using Operator = OpMultiplyAdd;
+  using ElementC = int;
+
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    Array<int, 1> &d,
+    Array<int16_t, 2> const &a,
+    Array<int16_t, 2> const &b,
+    Array<int, 1> const &c
+  ) {
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 610))  // NOTE(review): PTX documents dp2a with a .lo/.hi mode qualifier; confirm the asm below assembles and matches the fallback
+
+    unsigned const &A = reinterpret_cast<unsigned const &>(a);  // two int16_t values viewed as one unsigned word
+    unsigned const &B = reinterpret_cast<unsigned const &>(b);
+
+    asm volatile("dp2a.s32.s32 %0, %1, %2, %3;"
+                 : "=r"(d[0])
+                 : "r"(A), "r"(B), "r"(c[0]));
+#else
+    d[0] = c[0];  // start from the accumulator, then add the two products
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int k = 0; k < 2; ++k) {
+      d[0] += a[k] * b[k];
+    }
+#endif
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}
+}
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm70.h b/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm70.h
new file mode 100755
index 000000000..6471de8a8
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm70.h
@@ -0,0 +1,665 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Matrix multiply
+*/
+#pragma once
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/cassert>
+#else
+#include <assert.h>
+#endif
+
+#include "mma.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/numeric_types.h"
+
+#if ((__CUDACC_VER_MAJOR__ > 10) || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))
+#define CUTLASS_ARCH_MMA_SM70_SUPPORTED
+#endif
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700))
+
+#if ((__CUDACC_VER_MAJOR__ > 10) || (__CUDACC_VER_MAJOR__ == 10 &&__CUDACC_VER_MINOR__ >= 1))
+#define CUTLASS_ARCH_MMA_SM70_ENABLED
+#endif
+
+#endif
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace arch {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Matrix multiply accumulate 884 - FP16 accumulation
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Matrix multiply-add operation: F16 = F16 * F16 + F16 (m8n8k4, A column-major, B column-major, C row-major)
+template <>
+struct Mma<
+  gemm::GemmShape<8,8,4>,
+  8,
+  half_t,
+  layout::ColumnMajor,
+  half_t,
+  layout::ColumnMajor,
+  half_t,
+  layout::RowMajor,
+  OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<8, 8, 4>;
+
+  using ElementA = half_t;
+  using LayoutA = layout::ColumnMajor;
+  using FragmentA = Array<half_t, 4>;
+
+  using ElementB = half_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<half_t, 4>;
+
+  using ElementC = half_t;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<half_t, 8>;
+
+  using Operator = OpMultiplyAdd;
+  using ArchTag = arch::Sm70;
+
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c
+  ) {
+
+#if defined(CUTLASS_ARCH_MMA_SM70_ENABLED)
+
+    unsigned const *A = reinterpret_cast<unsigned const *>(&a);  // fragments are passed to the asm as unsigned words
+    unsigned const *B = reinterpret_cast<unsigned const *>(&b);
+    unsigned const *C = reinterpret_cast<unsigned const *>(&c);
+    unsigned *D = reinterpret_cast<unsigned *>(&d);
+
+    asm volatile("mma.sync.aligned.m8n8k4.col.col.f16.f16.f16.f16 {%0,%1,%2,%3}, {%4,%5}, {%6,%7}, {%8,%9,%10,%11};\n"
+      : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+      : "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(B[1]), "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])
+    );
+
+#else
+    assert(0);  // no fallback: reached only when CUTLASS_ARCH_MMA_SM70_ENABLED is not defined
+    #if defined(__CUDA_ARCH__)
+    asm volatile ("brkpt;\n" ::);
+    #endif
+#endif
+  }
+};
+
+/// Matrix multiply-add operation: F16 = F16 * F16 + F16 (m8n8k4, A column-major, B row-major, C row-major)
+template <>
+struct Mma<
+  gemm::GemmShape<8, 8, 4>,
+  8,
+  half_t,
+  layout::ColumnMajor,
+  half_t,
+  layout::RowMajor,
+  half_t,
+  layout::RowMajor,
+  OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<8, 8, 4>;
+
+  using ElementA = half_t;
+  using LayoutA = layout::ColumnMajor;
+  using FragmentA = Array<half_t, 4>;
+
+  using ElementB = half_t;
+  using LayoutB = layout::RowMajor;
+  using FragmentB = Array<half_t, 4>;
+
+  using ElementC = half_t;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<half_t, 8>;
+
+  using Operator = OpMultiplyAdd;
+  using ArchTag = arch::Sm70;
+
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c
+  ) {
+
+#if defined(CUTLASS_ARCH_MMA_SM70_ENABLED)
+
+    unsigned const *A = reinterpret_cast<unsigned const *>(&a);  // fragments are passed to the asm as unsigned words
+    unsigned const *B = reinterpret_cast<unsigned const *>(&b);
+    unsigned const *C = reinterpret_cast<unsigned const *>(&c);
+    unsigned *D = reinterpret_cast<unsigned *>(&d);
+
+    asm volatile("mma.sync.aligned.m8n8k4.col.row.f16.f16.f16.f16 {%0,%1,%2,%3}, {%4,%5}, {%6,%7}, {%8,%9,%10,%11};\n"
+      : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+      : "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(B[1]), "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])
+    );
+
+#else
+    assert(0);  // no fallback: reached only when CUTLASS_ARCH_MMA_SM70_ENABLED is not defined
+    #if defined(__CUDA_ARCH__)
+    asm volatile ("brkpt;\n" ::);
+    #endif
+#endif
+  }
+};
+
+/// Matrix multiply-add operation: F16 = F16 * F16 + F16 (m8n8k4, A row-major, B column-major, C row-major)
+template <>
+struct Mma<
+  gemm::GemmShape<8, 8, 4>,
+  8,
+  half_t,
+  layout::RowMajor,
+  half_t,
+  layout::ColumnMajor,
+  half_t,
+  layout::RowMajor,
+  OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<8, 8, 4>;
+
+  using ElementA = half_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<half_t, 4>;
+
+  using ElementB = half_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<half_t, 4>;
+
+  using ElementC = half_t;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<half_t, 8>;
+
+  using Operator = OpMultiplyAdd;
+  using ArchTag = arch::Sm70;
+
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c
+  ) {
+
+#if defined(CUTLASS_ARCH_MMA_SM70_ENABLED)
+
+    unsigned const *A = reinterpret_cast<unsigned const *>(&a);  // fragments are passed to the asm as unsigned words
+    unsigned const *B = reinterpret_cast<unsigned const *>(&b);
+    unsigned const *C = reinterpret_cast<unsigned const *>(&c);
+    unsigned *D = reinterpret_cast<unsigned *>(&d);
+
+    asm volatile("mma.sync.aligned.m8n8k4.row.col.f16.f16.f16.f16 {%0,%1,%2,%3}, {%4,%5}, {%6,%7}, {%8,%9,%10,%11};\n"
+      : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+      : "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(B[1]), "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])
+    );
+
+#else
+    assert(0);  // no fallback: reached only when CUTLASS_ARCH_MMA_SM70_ENABLED is not defined
+    #if defined(__CUDA_ARCH__)
+    asm volatile ("brkpt;\n" ::);
+    #endif
+#endif
+  }
+};
+
+/// Matrix multiply-add operation: F16 = F16 * F16 + F16 (m8n8k4, A row-major, B row-major, C row-major)
+template <>
+struct Mma<
+  gemm::GemmShape<8, 8, 4>,
+  8,
+  half_t,
+  layout::RowMajor,
+  half_t,
+  layout::RowMajor,
+  half_t,
+  layout::RowMajor,
+  OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<8, 8, 4>;
+
+  using ElementA = half_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<half_t, 4>;
+
+  using ElementB = half_t;
+  using LayoutB = layout::RowMajor;
+  using FragmentB = Array<half_t, 4>;
+
+  using ElementC = half_t;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<half_t, 8>;
+
+  using Operator = OpMultiplyAdd;
+  using ArchTag = arch::Sm70;
+
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c
+  ) {
+
+#if defined(CUTLASS_ARCH_MMA_SM70_ENABLED)
+
+    unsigned const *A = reinterpret_cast<unsigned const *>(&a);  // fragments are passed to the asm as unsigned words
+    unsigned const *B = reinterpret_cast<unsigned const *>(&b);
+    unsigned const *C = reinterpret_cast<unsigned const *>(&c);
+    unsigned *D = reinterpret_cast<unsigned *>(&d);
+
+    asm volatile("mma.sync.aligned.m8n8k4.row.row.f16.f16.f16.f16 {%0,%1,%2,%3}, {%4,%5}, {%6,%7}, {%8,%9,%10,%11};\n"
+      : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+      : "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(B[1]), "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])
+    );
+
+#else
+    assert(0);  // no fallback: reached only when CUTLASS_ARCH_MMA_SM70_ENABLED is not defined
+    #if defined(__CUDA_ARCH__)
+    asm volatile ("brkpt;\n" ::);
+    #endif
+#endif
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Matrix multiply accumulate 884 - FP32 accumulation
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Matrix multiply-add operation: F32 = F16 * F16 + F32 (m8n8k4, A column-major, B column-major, C row-major)
+template <>
+struct Mma<
+  gemm::GemmShape<8, 8, 4>,
+  8,
+  half_t,
+  layout::ColumnMajor,
+  half_t,
+  layout::ColumnMajor,
+  float,
+  layout::RowMajor,
+  OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<8, 8, 4>;
+
+  using ElementA = half_t;
+  using LayoutA = layout::ColumnMajor;
+  using FragmentA = Array<half_t, 4>;
+
+  using ElementB = half_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<half_t, 4>;
+
+  using ElementC = float;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<float, 8>;
+
+  using Operator = OpMultiplyAdd;
+  using ArchTag = arch::Sm70;
+
+  /// Multiply-add
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c
+  ) {
+
+#if defined(CUTLASS_ARCH_MMA_SM70_ENABLED)
+
+  unsigned const *A = reinterpret_cast<unsigned const *>(&a);  // half_t inputs are passed as unsigned words
+  unsigned const *B = reinterpret_cast<unsigned const *>(&b);
+  float const *C = reinterpret_cast<float const *>(&c);        // accumulators are passed as individual floats
+  float *D = reinterpret_cast<float *>(&d);
+
+  asm volatile("mma.sync.aligned.m8n8k4.col.col.f32.f16.f16.f32 {%0,%1,%2,%3,%4,%5,%6,%7}, {%8,%9}, {%10,%11}, "
+      "{%12,%13,%14,%15,%16,%17,%18,%19};\n"
+      : "=f"(D[0]),
+        "=f"(D[1]),
+        "=f"(D[2]),
+        "=f"(D[3]),
+        "=f"(D[4]),
+        "=f"(D[5]),
+        "=f"(D[6]),
+        "=f"(D[7])
+      : "r"(A[0]),
+        "r"(A[1]),
+        "r"(B[0]),
+        "r"(B[1]),
+        "f"(C[0]),
+        "f"(C[1]),
+        "f"(C[2]),
+        "f"(C[3]),
+        "f"(C[4]),
+        "f"(C[5]),
+        "f"(C[6]),
+        "f"(C[7])
+  );
+
+#else
+    assert(0);  // no fallback: reached only when CUTLASS_ARCH_MMA_SM70_ENABLED is not defined
+    #if defined(__CUDA_ARCH__)
+    asm volatile ("brkpt;\n" ::);
+    #endif
+#endif
+  }
+};
+
+/// Matrix multiply-add operation: F32 = F16 * F16 + F32 (m8n8k4, A column-major, B row-major, C row-major)
+template <>
+struct Mma<
+  gemm::GemmShape<8, 8, 4>,
+  8,
+  half_t,
+  layout::ColumnMajor,
+  half_t,
+  layout::RowMajor,
+  float,
+  layout::RowMajor,
+  OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<8, 8, 4>;
+
+  using ElementA = half_t;
+  using LayoutA = layout::ColumnMajor;
+  using FragmentA = Array<half_t, 4>;
+
+  using ElementB = half_t;
+  using LayoutB = layout::RowMajor;
+  using FragmentB = Array<half_t, 4>;
+
+  using ElementC = float;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<float, 8>;
+
+  using Operator = OpMultiplyAdd;
+  using ArchTag = arch::Sm70;
+
+  /// Multiply-add
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c
+  ) {
+
+#if defined(CUTLASS_ARCH_MMA_SM70_ENABLED)
+
+  unsigned const *A = reinterpret_cast<unsigned const *>(&a);  // half_t inputs are passed as unsigned words
+  unsigned const *B = reinterpret_cast<unsigned const *>(&b);
+  float const *C = reinterpret_cast<float const *>(&c);        // accumulators are passed as individual floats
+  float *D = reinterpret_cast<float *>(&d);
+
+  asm volatile("mma.sync.aligned.m8n8k4.col.row.f32.f16.f16.f32 {%0,%1,%2,%3,%4,%5,%6,%7}, {%8,%9}, {%10,%11}, "
+      "{%12,%13,%14,%15,%16,%17,%18,%19};\n"
+      : "=f"(D[0]),
+        "=f"(D[1]),
+        "=f"(D[2]),
+        "=f"(D[3]),
+        "=f"(D[4]),
+        "=f"(D[5]),
+        "=f"(D[6]),
+        "=f"(D[7])
+      : "r"(A[0]),
+        "r"(A[1]),
+        "r"(B[0]),
+        "r"(B[1]),
+        "f"(C[0]),
+        "f"(C[1]),
+        "f"(C[2]),
+        "f"(C[3]),
+        "f"(C[4]),
+        "f"(C[5]),
+        "f"(C[6]),
+        "f"(C[7])
+  );
+
+#else
+    assert(0);  // no fallback: reached only when CUTLASS_ARCH_MMA_SM70_ENABLED is not defined
+    #if defined(__CUDA_ARCH__)
+    asm volatile ("brkpt;\n" ::);
+    #endif
+#endif
+  }
+};
+
+/// Matrix multiply-add operation: F32 = F16 * F16 + F32 (m8n8k4, A row-major, B column-major, C row-major)
+template <>
+struct Mma<
+  gemm::GemmShape<8, 8, 4>,
+  8,
+  half_t,
+  layout::RowMajor,
+  half_t,
+  layout::ColumnMajor,
+  float,
+  layout::RowMajor,
+  OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<8, 8, 4>;
+
+  using ElementA = half_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<half_t, 4>;
+
+  using ElementB = half_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<half_t, 4>;
+
+  using ElementC = float;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<float, 8>;
+
+  using Operator = OpMultiplyAdd;
+  using ArchTag = arch::Sm70;
+
+  /// Multiply-add
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c
+  ) {
+
+#if defined(CUTLASS_ARCH_MMA_SM70_ENABLED)
+
+  unsigned const *A = reinterpret_cast<unsigned const *>(&a);  // half_t inputs are passed as unsigned words
+  unsigned const *B = reinterpret_cast<unsigned const *>(&b);
+  float const *C = reinterpret_cast<float const *>(&c);        // accumulators are passed as individual floats
+  float *D = reinterpret_cast<float *>(&d);
+
+  asm volatile("mma.sync.aligned.m8n8k4.row.col.f32.f16.f16.f32 {%0,%1,%2,%3,%4,%5,%6,%7}, {%8,%9}, {%10,%11}, "
+      "{%12,%13,%14,%15,%16,%17,%18,%19};\n"
+      : "=f"(D[0]),
+        "=f"(D[1]),
+        "=f"(D[2]),
+        "=f"(D[3]),
+        "=f"(D[4]),
+        "=f"(D[5]),
+        "=f"(D[6]),
+        "=f"(D[7])
+      : "r"(A[0]),
+        "r"(A[1]),
+        "r"(B[0]),
+        "r"(B[1]),
+        "f"(C[0]),
+        "f"(C[1]),
+        "f"(C[2]),
+        "f"(C[3]),
+        "f"(C[4]),
+        "f"(C[5]),
+        "f"(C[6]),
+        "f"(C[7])
+  );
+
+#else
+    assert(0);  // no fallback: reached only when CUTLASS_ARCH_MMA_SM70_ENABLED is not defined
+    #if defined(__CUDA_ARCH__)
+    asm volatile ("brkpt;\n" ::);
+    #endif
+#endif
+  }
+};
+
+/// Matrix multiply-add operation: F32 = F16 * F16 + F32 (SM70 m8n8k4 mma.sync, A row-major, B row-major)
+template <>
+struct Mma<
+  gemm::GemmShape<8, 8, 4>,
+  8,
+  half_t,
+  layout::RowMajor,
+  half_t,
+  layout::RowMajor,
+  float,
+  layout::RowMajor,
+  OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<8, 8, 4>;
+
+  using ElementA = half_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<half_t, 4>;
+
+  using ElementB = half_t;
+  using LayoutB = layout::RowMajor;
+  using FragmentB = Array<half_t, 4>;
+
+  using ElementC = float;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<float, 8>;
+
+  using Operator = OpMultiplyAdd;
+  using ArchTag = arch::Sm70;
+
+  /// Multiply-add: d = a * b + c; without CUTLASS_ARCH_MMA_SM70_ENABLED it hits assert(0) (plus brkpt in device code)
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c
+  ) {
+
+#if defined(CUTLASS_ARCH_MMA_SM70_ENABLED)
+  // View each 4 x half_t fragment as two 32-bit words and the accumulators as eight raw floats.
+  unsigned const *A = reinterpret_cast<unsigned const *>(&a);
+  unsigned const *B = reinterpret_cast<unsigned const *>(&b);
+  float const *C = reinterpret_cast<float const *>(&c);
+  float *D = reinterpret_cast<float *>(&d);
+  // Operand order: %0-%7 = D (outputs), %8-%9 = A, %10-%11 = B, %12-%19 = C.
+  asm volatile("mma.sync.aligned.m8n8k4.row.row.f32.f16.f16.f32 {%0,%1,%2,%3,%4,%5,%6,%7}, {%8,%9}, {%10,%11}, "
+      "{%12,%13,%14,%15,%16,%17,%18,%19};\n"
+      : "=f"(D[0]),
+        "=f"(D[1]),
+        "=f"(D[2]),
+        "=f"(D[3]),
+        "=f"(D[4]),
+        "=f"(D[5]),
+        "=f"(D[6]),
+        "=f"(D[7])
+      : "r"(A[0]),
+        "r"(A[1]),
+        "r"(B[0]),
+        "r"(B[1]),
+        "f"(C[0]),
+        "f"(C[1]),
+        "f"(C[2]),
+        "f"(C[3]),
+        "f"(C[4]),
+        "f"(C[5]),
+        "f"(C[6]),
+        "f"(C[7])
+  );
+
+#else
+    assert(0);
+    #if defined(__CUDA_ARCH__)
+    asm volatile ("brkpt;\n" ::);
+    #endif
+#endif
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Warp-wide 16x16x4 half_t multiply-add: publicly inherits the 8x8x4 specialization with the same layouts, element C type and operator
+template <
+  typename LayoutA,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename Operator
+>
+struct Mma<
+  gemm::GemmShape<16, 16, 4>,
+  32,
+  half_t,
+  LayoutA,
+  half_t,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  Operator
+> : 
+  public Mma<
+    gemm::GemmShape<8, 8, 4>, 
+    8, 
+    half_t, 
+    LayoutA, 
+    half_t, 
+    LayoutB,
+    ElementC, 
+    LayoutC, 
+    Operator> {
+  // Only Shape is redeclared here; fragment types and operator() come from the 8x8x4 base class.
+  using Shape = gemm::GemmShape<16, 16, 4>;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace arch
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm75.h b/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm75.h
new file mode 100755
index 000000000..6cced190e
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm75.h
@@ -0,0 +1,793 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Matrix multiply for SM75
+*/
+
+#pragma once
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/cassert>
+#else
+#include <assert.h>
+#endif
+
+#include "cutlass/arch/wmma.h"
+
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+// CUDA Toolkit includes for nvcuda::wmma needed for binarized matrix multiply.
+#include <mma.h>
+#include "cutlass/wmma_array.h"
+#endif
+
+// CUTLASS includes
+#include "cutlass/arch/mma.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/numeric_types.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+#if ((__CUDACC_VER_MAJOR__ > 10) || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))
+// Toolkit gate: defined whenever the compiler reports CUDA 10.2 or newer.
+#define CUTLASS_ARCH_MMA_SM75_SUPPORTED 1
+// Architecture gate: defined only while compiling device code for __CUDA_ARCH__ >= 750.
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750))
+#define CUTLASS_ARCH_MMA_SM75_ENABLED
+#endif
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace arch {
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Matrix Multiply 1688 - FP16 accumulation
+//
+////////////////////////////////////////////////////////////////////////////////
+
+/// Matrix multiply-add operation - F16 = F16 * F16 + F16 (SM75 m16n8k8 mma.sync, A row-major, B column-major)
+template <>
+struct Mma<
+  gemm::GemmShape<16, 8, 8>,
+  32,
+  half_t,
+  layout::RowMajor,
+  half_t,
+  layout::ColumnMajor,
+  half_t,
+  layout::RowMajor,
+  OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<16, 8, 8>;
+
+  using ElementA = half_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<half_t, 4>;
+  
+  using ElementB = half_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<half_t, 2>;
+
+  using ElementC = half_t;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<half_t, 4>;
+
+  using Operator = OpMultiplyAdd;
+  using ArchTag = arch::Sm75;
+
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c
+  ) const {
+    // Computes d = a * b + c; falls back to CUTLASS_NOT_IMPLEMENTED() when CUTLASS_ARCH_MMA_SM75_ENABLED is undefined.
+#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)
+  // All fragments hold half_t values and are passed to the instruction as packed 32-bit words.
+  unsigned const *A = reinterpret_cast<unsigned const *>(&a);
+  unsigned const *B = reinterpret_cast<unsigned const *>(&b);
+  unsigned const *C = reinterpret_cast<unsigned const *>(&c);
+  unsigned *D = reinterpret_cast<unsigned *>(&d);
+  // Operand order: %0-%1 = D (outputs), %2-%3 = A, %4 = B, %5-%6 = C.
+  asm volatile(
+    "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 {%0,%1}, {%2,%3}, {%4}, {%5,%6};\n"
+      : "=r"(D[0]), "=r"(D[1])
+      : "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(C[0]), "r"(C[1]));
+
+#else
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    CUTLASS_NOT_IMPLEMENTED();
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Matrix Multiply 1688 - FP32 accumulation
+//
+////////////////////////////////////////////////////////////////////////////////
+
+/// Matrix multiply-add operation: F32 = F16 * F16 + F32 (SM75 m16n8k8 mma.sync, A row-major, B column-major)
+template <>
+struct Mma<
+  gemm::GemmShape<16, 8, 8>,
+  32,
+  half_t,
+  layout::RowMajor,
+  half_t,
+  layout::ColumnMajor,
+  float,
+  layout::RowMajor,
+  OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<16, 8, 8>;
+
+  using ElementA = half_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<half_t, 4>;
+
+  using ElementB = half_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<half_t, 2>;
+
+  using ElementC = float;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<float, 4>;
+
+  using Operator = OpMultiplyAdd;
+  using ArchTag = arch::Sm75;
+
+  /// Computes d = a * b + c; falls back to CUTLASS_NOT_IMPLEMENTED() when CUTLASS_ARCH_MMA_SM75_ENABLED is undefined
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+                  FragmentC const &c) const {
+
+#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)
+  // The half_t operand fragments are passed as packed 32-bit words; the accumulators stay as raw floats.
+  unsigned const *A = reinterpret_cast<unsigned const *>(&a);
+  unsigned const *B = reinterpret_cast<unsigned const *>(&b);
+  float const *C = reinterpret_cast<float const *>(&c);
+  float *D = reinterpret_cast<float *>(&d);
+  // Operand order: %0-%3 = D (outputs), %4-%5 = A, %6 = B, %7-%10 = C.
+  asm volatile("mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
+      : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
+      : 
+        "r"(A[0]), "r"(A[1]), 
+        "r"(B[0]), 
+        "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])
+  );
+
+#else
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    CUTLASS_NOT_IMPLEMENTED();
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Integer matrix multiply  (8b) with SATURATE
+//
+////////////////////////////////////////////////////////////////////////////////
+
+/// Matrix multiply-add operation: S32 = S8 * S8 + S32 (SM75 m8n8k16 mma.sync with .satfinite)
+template <>
+struct Mma<
+  gemm::GemmShape<8, 8, 16>,
+  32,
+  int8_t,
+  layout::RowMajor,
+  int8_t,
+  layout::ColumnMajor,
+  int,
+  layout::RowMajor,
+  OpMultiplyAddSaturate> {
+
+  using Shape = gemm::GemmShape<8, 8, 16>;
+
+  using ElementA = int8_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<int8_t, 4>;
+
+  using ElementB = int8_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<int8_t, 4>;
+
+  using ElementC = int;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int, 2>;
+
+  using Operator = OpMultiplyAddSaturate;
+  using ArchTag = arch::Sm75;
+
+  /// Computes d = a * b + c; falls back to CUTLASS_NOT_IMPLEMENTED() when CUTLASS_ARCH_MMA_SM75_ENABLED is undefined
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c
+  ) const {
+
+#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)
+  // The four 8-bit elements of each operand fragment are passed as one 32-bit register operand.
+  unsigned const & A = reinterpret_cast<unsigned const &>(a);
+  unsigned const & B = reinterpret_cast<unsigned const &>(b);
+
+  int const *C = reinterpret_cast<int const *>(&c);
+  int *D = reinterpret_cast<int *>(&d);
+  // Operand order: %0-%1 = D (outputs), %2 = A, %3 = B, %4-%5 = C.
+  asm volatile("mma.sync.aligned.m8n8k16.row.col.satfinite.s32.s8.s8.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
+      : "=r"(D[0]), "=r"(D[1])
+      : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));
+#else
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    CUTLASS_NOT_IMPLEMENTED();
+#endif
+  }
+};
+
+/// Matrix multiply-add operation: S32 = U8 * S8 + S32 (SM75 m8n8k16 mma.sync with .satfinite)
+template <>
+struct Mma<
+  gemm::GemmShape<8, 8, 16>,
+  32,
+  uint8_t,
+  layout::RowMajor,
+  int8_t,
+  layout::ColumnMajor,
+  int,
+  layout::RowMajor,
+  OpMultiplyAddSaturate> {
+
+  using Shape = gemm::GemmShape<8, 8, 16>;
+
+  using ElementA = uint8_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<uint8_t, 4>;
+
+  using ElementB = int8_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<int8_t, 4>;
+
+  using ElementC = int;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int, 2>;
+
+  using Operator = OpMultiplyAddSaturate;
+  using ArchTag = arch::Sm75;
+
+  /// Computes d = a * b + c; falls back to CUTLASS_NOT_IMPLEMENTED() when CUTLASS_ARCH_MMA_SM75_ENABLED is undefined
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c
+  ) const {
+
+#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)
+  // The four 8-bit elements of each operand fragment are passed as one 32-bit register operand.
+  unsigned const & A = reinterpret_cast<unsigned const &>(a);
+  unsigned const & B = reinterpret_cast<unsigned const &>(b);
+
+  int const *C = reinterpret_cast<int const *>(&c);
+  int *D = reinterpret_cast<int *>(&d);
+  // Operand order: %0-%1 = D (outputs), %2 = A, %3 = B, %4-%5 = C.
+  asm volatile("mma.sync.aligned.m8n8k16.row.col.satfinite.s32.u8.s8.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
+      : "=r"(D[0]), "=r"(D[1])
+      : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));
+#else
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    CUTLASS_NOT_IMPLEMENTED();
+#endif
+  }
+};
+
+/// Matrix multiply-add operation: S32 = S8 * U8 + S32 (SM75 m8n8k16 mma.sync with .satfinite)
+template <>
+struct Mma<
+  gemm::GemmShape<8, 8, 16>,
+  32,
+  int8_t,
+  layout::RowMajor,
+  uint8_t,
+  layout::ColumnMajor,
+  int,
+  layout::RowMajor,
+  OpMultiplyAddSaturate> {
+
+  using Shape = gemm::GemmShape<8, 8, 16>;
+
+  using ElementA = int8_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<int8_t, 4>;
+
+  using ElementB = uint8_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<uint8_t, 4>;
+
+  using ElementC = int;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int, 2>;
+
+  using Operator = OpMultiplyAddSaturate;
+  using ArchTag = arch::Sm75;
+
+  /// Computes d = a * b + c; falls back to CUTLASS_NOT_IMPLEMENTED() when CUTLASS_ARCH_MMA_SM75_ENABLED is undefined
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c
+  ) const {
+
+#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)
+  // The four 8-bit elements of each operand fragment are passed as one 32-bit register operand.
+  unsigned const & A = reinterpret_cast<unsigned const &>(a);
+  unsigned const & B = reinterpret_cast<unsigned const &>(b);
+
+  int const *C = reinterpret_cast<int const *>(&c);
+  int *D = reinterpret_cast<int *>(&d);
+  // Operand order: %0-%1 = D (outputs), %2 = A, %3 = B, %4-%5 = C.
+  asm volatile("mma.sync.aligned.m8n8k16.row.col.satfinite.s32.s8.u8.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
+      : "=r"(D[0]), "=r"(D[1])
+      : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));
+#else
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    CUTLASS_NOT_IMPLEMENTED();
+#endif
+  }
+};
+
+/// Matrix multiply-add operation: S32 = U8 * U8 + S32 (SM75 m8n8k16 mma.sync with .satfinite)
+template <>
+struct Mma<
+  gemm::GemmShape<8, 8, 16>,
+  32,
+  uint8_t,
+  layout::RowMajor,
+  uint8_t,
+  layout::ColumnMajor,
+  int,
+  layout::RowMajor,
+  OpMultiplyAddSaturate> {
+
+  using Shape = gemm::GemmShape<8, 8, 16>;
+
+  using ElementA = uint8_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<uint8_t, 4>;
+
+  using ElementB = uint8_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<uint8_t, 4>;
+
+  using ElementC = int;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int, 2>;
+
+  using Operator = OpMultiplyAddSaturate;
+  using ArchTag = arch::Sm75;
+
+  /// Computes d = a * b + c; falls back to CUTLASS_NOT_IMPLEMENTED() when CUTLASS_ARCH_MMA_SM75_ENABLED is undefined
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c
+  ) const {
+
+#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)
+  // The four 8-bit elements of each operand fragment are passed as one 32-bit register operand.
+  unsigned const & A = reinterpret_cast<unsigned const &>(a);
+  unsigned const & B = reinterpret_cast<unsigned const &>(b);
+
+  int const *C = reinterpret_cast<int const *>(&c);
+  int *D = reinterpret_cast<int *>(&d);
+  // Operand order: %0-%1 = D (outputs), %2 = A, %3 = B, %4-%5 = C.
+  asm volatile("mma.sync.aligned.m8n8k16.row.col.satfinite.s32.u8.u8.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
+      : "=r"(D[0]), "=r"(D[1])
+      : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));
+#else
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    CUTLASS_NOT_IMPLEMENTED();
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Integer matrix multiply  (4b) - SATURATE
+//
+////////////////////////////////////////////////////////////////////////////////
+
+/// Matrix multiply-add operation: S32 = S4 * S4 + S32 (SM75 m8n8k32 mma.sync with .satfinite)
+template <>
+struct Mma<
+  gemm::GemmShape<8, 8, 32>,
+  32,
+  int4b_t,
+  layout::RowMajor,
+  int4b_t,
+  layout::ColumnMajor,
+  int,
+  layout::RowMajor,
+  OpMultiplyAddSaturate> {
+
+  using Shape = gemm::GemmShape<8, 8, 32>;
+
+  using ElementA = int4b_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<int4b_t, 8>;
+
+  using ElementB = int4b_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<int4b_t, 8>;
+
+  using ElementC = int;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int, 2>;
+
+  using Operator = OpMultiplyAddSaturate;
+  using ArchTag = arch::Sm75;
+
+  /// Computes d = a * b + c; falls back to CUTLASS_NOT_IMPLEMENTED() when CUTLASS_ARCH_MMA_SM75_ENABLED is undefined
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c
+  ) const {
+
+#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)
+  // The eight 4-bit elements of each operand fragment are passed as one 32-bit register operand.
+  unsigned const & A = reinterpret_cast<unsigned const &>(a);
+  unsigned const & B = reinterpret_cast<unsigned const &>(b);
+
+  int const *C = reinterpret_cast<int const *>(&c);
+  int *D = reinterpret_cast<int *>(&d);
+  // Operand order: %0-%1 = D (outputs), %2 = A, %3 = B, %4-%5 = C.
+  asm volatile("mma.sync.aligned.m8n8k32.row.col.satfinite.s32.s4.s4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
+      : "=r"(D[0]), "=r"(D[1])
+      : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));
+#else
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    CUTLASS_NOT_IMPLEMENTED();
+#endif
+  }
+};
+
+/// Matrix multiply-add operation: S32 = U4 * S4 + S32 (SM75 m8n8k32 mma.sync with .satfinite)
+template <>
+struct Mma<
+  gemm::GemmShape<8, 8, 32>,
+  32,
+  uint4b_t,
+  layout::RowMajor,
+  int4b_t,
+  layout::ColumnMajor,
+  int,
+  layout::RowMajor,
+  OpMultiplyAddSaturate> {
+
+  using Shape = gemm::GemmShape<8, 8, 32>;
+
+  using ElementA = uint4b_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<uint4b_t, 8>;
+
+  using ElementB = int4b_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<int4b_t, 8>;
+
+  using ElementC = int;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int, 2>;
+
+  using Operator = OpMultiplyAddSaturate;
+  using ArchTag = arch::Sm75;
+
+  /// Computes d = a * b + c; falls back to CUTLASS_NOT_IMPLEMENTED() when CUTLASS_ARCH_MMA_SM75_ENABLED is undefined
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c
+  ) const {
+
+#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)
+  // The eight 4-bit elements of each operand fragment are passed as one 32-bit register operand.
+  unsigned const & A = reinterpret_cast<unsigned const &>(a);
+  unsigned const & B = reinterpret_cast<unsigned const &>(b);
+
+  int const *C = reinterpret_cast<int const *>(&c);
+  int *D = reinterpret_cast<int *>(&d);
+  // Operand order: %0-%1 = D (outputs), %2 = A, %3 = B, %4-%5 = C.
+  asm volatile("mma.sync.aligned.m8n8k32.row.col.satfinite.s32.u4.s4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
+      : "=r"(D[0]), "=r"(D[1])
+      : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));
+#else
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    CUTLASS_NOT_IMPLEMENTED();
+#endif
+  }
+};
+
+/// Matrix multiply-add operation: S32 = S4 * U4 + S32 (SM75 m8n8k32 mma.sync with .satfinite)
+template <>
+struct Mma<
+  gemm::GemmShape<8, 8, 32>,
+  32,
+  int4b_t,
+  layout::RowMajor,
+  uint4b_t,
+  layout::ColumnMajor,
+  int,
+  layout::RowMajor,
+  OpMultiplyAddSaturate> {
+
+  using Shape = gemm::GemmShape<8, 8, 32>;
+
+  using ElementA = int4b_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<int4b_t, 8>;
+
+  using ElementB = uint4b_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<uint4b_t, 8>;
+
+  using ElementC = int;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int, 2>;
+
+  using Operator = OpMultiplyAddSaturate;
+  using ArchTag = arch::Sm75;
+
+  /// Computes d = a * b + c; falls back to CUTLASS_NOT_IMPLEMENTED() when CUTLASS_ARCH_MMA_SM75_ENABLED is undefined
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c
+  ) const {
+
+#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)
+  // The eight 4-bit elements of each operand fragment are passed as one 32-bit register operand.
+  unsigned const & A = reinterpret_cast<unsigned const &>(a);
+  unsigned const & B = reinterpret_cast<unsigned const &>(b);
+
+  int const *C = reinterpret_cast<int const *>(&c);
+  int *D = reinterpret_cast<int *>(&d);
+  // Operand order: %0-%1 = D (outputs), %2 = A, %3 = B, %4-%5 = C.
+  asm volatile("mma.sync.aligned.m8n8k32.row.col.satfinite.s32.s4.u4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
+      : "=r"(D[0]), "=r"(D[1])
+      : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));
+#else
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    CUTLASS_NOT_IMPLEMENTED();
+#endif
+  }
+};
+
+/// Matrix multiply-add operation: S32 = U4 * U4 + S32 (SM75 m8n8k32 mma.sync with .satfinite)
+template <>
+struct Mma<
+  gemm::GemmShape<8, 8, 32>,
+  32,
+  uint4b_t,
+  layout::RowMajor,
+  uint4b_t,
+  layout::ColumnMajor,
+  int,
+  layout::RowMajor,
+  OpMultiplyAddSaturate> {
+
+  using Shape = gemm::GemmShape<8, 8, 32>;
+
+  using ElementA = uint4b_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<uint4b_t, 8>;
+
+  using ElementB = uint4b_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<uint4b_t, 8>;
+
+  using ElementC = int;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int, 2>;
+
+  using Operator = OpMultiplyAddSaturate;
+  using ArchTag = arch::Sm75;
+
+  /// Computes d = a * b + c; falls back to CUTLASS_NOT_IMPLEMENTED() when CUTLASS_ARCH_MMA_SM75_ENABLED is undefined
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c
+  ) const {
+
+#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)
+  // The eight 4-bit elements of each operand fragment are passed as one 32-bit register operand.
+  unsigned const & A = reinterpret_cast<unsigned const &>(a);
+  unsigned const & B = reinterpret_cast<unsigned const &>(b);
+
+  int const *C = reinterpret_cast<int const *>(&c);
+  int *D = reinterpret_cast<int *>(&d);
+  // Operand order: %0-%1 = D (outputs), %2 = A, %3 = B, %4-%5 = C.
+  asm volatile("mma.sync.aligned.m8n8k32.row.col.satfinite.s32.u4.u4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
+      : "=r"(D[0]), "=r"(D[1])
+      : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));
+#else
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    CUTLASS_NOT_IMPLEMENTED();
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// b1 ^ b1 + s32 => s32
+//
+////////////////////////////////////////////////////////////////////////////////
+
+/// Matrix multiply-add operation: S32 = popc(B1 ^ B1) + S32 (8x8x128, issued through nvcuda::wmma::bmma_sync)
+template <>
+struct Mma<
+  gemm::GemmShape<8,8,128>,
+  32,
+  uint1b_t,
+  layout::RowMajor,
+  uint1b_t,
+  layout::ColumnMajor,
+  int,
+  layout::RowMajor,
+  OpXorPopc> {
+
+  using Shape = gemm::GemmShape<8,8,128>;
+
+  using ElementA = uint1b_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<uint1b_t, 32>;
+
+  using ElementB = uint1b_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<uint1b_t, 32>;
+
+  using ElementC = int;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int, 2>;
+
+  using Operator = OpXorPopc;
+  using ArchTag = arch::Sm75;
+
+  /// Computes d = popc(a ^ b) + c; hits CUTLASS_NOT_IMPLEMENTED() unless both the SM75 and WMMA gates are defined
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c
+  ) const {
+
+  // Single combined gate: the WMMA binary MMA is used only when both the SM75 and the WMMA
+  // macros are defined; every other configuration takes the explicit fallback below.
+#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED) && defined(CUTLASS_ARCH_WMMA_ENABLED)
+  using WmmaFragmentA = nvcuda::wmma::fragment<
+          nvcuda::wmma::matrix_a,
+          Shape::kM,
+          Shape::kN,
+          Shape::kK,
+          nvcuda::wmma::experimental::precision::b1,
+          nvcuda::wmma::row_major>;
+
+  using WmmaFragmentB = nvcuda::wmma::fragment<
+          nvcuda::wmma::matrix_b,
+          Shape::kM,
+          Shape::kN,
+          Shape::kK,
+          nvcuda::wmma::experimental::precision::b1,
+          nvcuda::wmma::col_major>;
+
+  using WmmaFragmentC = nvcuda::wmma::fragment<
+          nvcuda::wmma::accumulator,
+          Shape::kM,
+          Shape::kN,
+          Shape::kK,
+          int>;
+  // Reinterpret the CUTLASS fragments in place as the corresponding WMMA fragment types.
+  WmmaFragmentA const & A = reinterpret_cast<WmmaFragmentA const &>(a);
+  WmmaFragmentB const & B = reinterpret_cast<WmmaFragmentB const &>(b);
+
+  WmmaFragmentC const & C = reinterpret_cast<WmmaFragmentC const &>(c);
+  WmmaFragmentC & D = reinterpret_cast<WmmaFragmentC &>(d);
+
+  nvcuda::wmma::bmma_sync(D, A, B, C, nvcuda::wmma::experimental::bmmaBitOpXOR, 
+                                          nvcuda::wmma::experimental::bmmaAccumulateOpPOPC);
+
+#else
+
+  // Reached when CUTLASS_ARCH_MMA_SM75_ENABLED is undefined (host pass or a pre-SM75 target) or
+  // when WMMA is unavailable: fail loudly instead of silently leaving d unwritten.
+  CUTLASS_UNUSED(a);
+  CUTLASS_UNUSED(b);
+  CUTLASS_UNUSED(c);
+  CUTLASS_UNUSED(d);
+  CUTLASS_NOT_IMPLEMENTED(); // WMMA must be supported to issue binary matrix multiply-accumulate instructions.
+
+#endif // defined(CUTLASS_ARCH_MMA_SM75_ENABLED) && defined(CUTLASS_ARCH_WMMA_ENABLED)
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace arch
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm80.h b/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm80.h
new file mode 100755
index 000000000..f990c1ac2
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm80.h
@@ -0,0 +1,1505 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Matrix multiply for SM80
+*/
+
+#pragma once
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/cassert>
+#else
+#include <assert.h>
+#endif
+
+#include "cutlass/cutlass.h"
+#include "mma.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/numeric_types.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+#if ((__CUDACC_VER_MAJOR__ > 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))
+// Toolkit gate: defined whenever the compiler reports CUDA 11.0 or newer.
+#define CUTLASS_ARCH_MMA_SM80_SUPPORTED 1
+// Architecture gate: defined only while compiling device code for __CUDA_ARCH__ >= 800.
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
+#define CUTLASS_ARCH_MMA_SM80_ENABLED
+// The binary (b1) paths are additionally capped: AND for __CUDA_ARCH__ <= 900, XOR for __CUDA_ARCH__ <= 890.
+#if (__CUDA_ARCH__ <= 900)
+#define CUTLASS_ARCH_MMA_B1_AND_SM80_ENABLED
+#endif
+#if (__CUDA_ARCH__ <= 890)
+#define CUTLASS_ARCH_MMA_B1_XOR_SM80_ENABLED
+#endif
+
+#endif
+
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace arch {
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Matrix Multiply 1688 - Float BF16, FP32 accumulation
+//
+////////////////////////////////////////////////////////////////////////////////
+
+/// Matrix multiply-add operation - F32 = bf16 * bf16 + F32 (SM80 m16n8k8 mma.sync, A row-major, B column-major)
+template <>
+struct Mma<
+  gemm::GemmShape<16, 8, 8>,
+  32,
+  bfloat16_t,
+  layout::RowMajor,
+  bfloat16_t,
+  layout::ColumnMajor,
+  float,
+  layout::RowMajor,
+  OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<16, 8, 8>;
+
+  using ElementA = bfloat16_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<bfloat16_t, 4>;
+
+  using ElementB = bfloat16_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<bfloat16_t, 2>;
+
+  using ElementC = float;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<float, 4>;
+
+  using Operator = OpMultiplyAdd;
+  using ArchTag = arch::Sm80;
+
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+                  FragmentC const &c) const {
+    // Computes d = a * b + c; falls back to CUTLASS_NOT_IMPLEMENTED() when CUTLASS_ARCH_MMA_SM80_ENABLED is undefined.
+#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
+  // The bfloat16 operand fragments are passed as packed 32-bit words; the accumulators stay as raw floats.
+  uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+  uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+  float const *C = reinterpret_cast<float const *>(&c);
+  float *D = reinterpret_cast<float *>(&d);
+  // volatile keeps the compiler from deleting or moving the mma.sync, as in the other specializations here.
+  asm volatile(
+      "mma.sync.aligned.m16n8k8.row.col.f32.bf16.bf16.f32 "
+      "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
+      : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
+      : 
+        "r"(A[0]), "r"(A[1]), 
+        "r"(B[0]), 
+        "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])
+  );
+
+#else
+
+    CUTLASS_UNUSED(d);
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_NOT_IMPLEMENTED();
+
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Matrix Multiply 1684 - Float TF32
+//
+////////////////////////////////////////////////////////////////////////////////
+
+/// Matrix multiply-add operation: F32 = tf32 * tf32 + F32 (SM80 m16n8k4 mma.sync, A row-major, B column-major)
+template <>
+struct Mma<
+  gemm::GemmShape<16, 8, 4>,
+  32,
+  tfloat32_t,
+  layout::RowMajor,
+  tfloat32_t,
+  layout::ColumnMajor,
+  float,
+  layout::RowMajor,
+  OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<16, 8, 4>;
+
+  using ElementA = tfloat32_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<tfloat32_t, 2>;
+
+  using ElementB = tfloat32_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<tfloat32_t, 1>;
+
+  using ElementC = float;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<float, 4>;
+
+  using Operator = OpMultiplyAdd;
+  using ArchTag = arch::Sm80;
+
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c
+  ) const {
+    // Computes d = a * b + c; falls back to CUTLASS_NOT_IMPLEMENTED() when CUTLASS_ARCH_MMA_SM80_ENABLED is undefined.
+#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
+  // Each tfloat32_t element is passed as one 32-bit register operand; the accumulators stay as raw floats.
+  uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+  uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+  float const *C = reinterpret_cast<float const *>(&c);
+  float *D = reinterpret_cast<float *>(&d);
+  // Operand order: %0-%3 = D (outputs), %4-%5 = A, %6 = B, %7-%10 = C.
+  asm volatile(
+      "mma.sync.aligned.m16n8k4.row.col.f32.tf32.tf32.f32 {%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
+      : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
+      : 
+        "r"(A[0]), "r"(A[1]), 
+        "r"(B[0]), 
+        "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])
+  );
+
+#else
+
+    CUTLASS_UNUSED(d);
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_NOT_IMPLEMENTED();
+
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Matrix Multiply 1688 - Float TF32
+//
+////////////////////////////////////////////////////////////////////////////////
+
+/// Matrix multiply-add operation: F32 = tf32 * tf32 + F32, shape m16n8k8 (GemmShape<16, 8, 8>).
+template <>
+struct Mma<gemm::GemmShape<16, 8, 8>, 32, tfloat32_t, layout::RowMajor,
+           tfloat32_t, layout::ColumnMajor, float, layout::RowMajor,
+           OpMultiplyAdd> {
+  using Shape = gemm::GemmShape<16, 8, 8>;
+
+  using ElementA = tfloat32_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<tfloat32_t, 4>;
+
+  using ElementB = tfloat32_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<tfloat32_t, 2>;
+
+  using ElementC = float;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<float, 4>;
+
+  using Operator = OpMultiplyAdd;
+  using ArchTag = arch::Sm80;
+  /// Computes d = a * b + c; a, b and c are only read, d is written through the asm outputs.
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+                  FragmentC const &c) const {
+
+#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
+    // View the fragments as word/float arrays so single elements can be bound to asm operands.
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+    float const *C = reinterpret_cast<float const *>(&c);
+    float *D = reinterpret_cast<float *>(&d);
+    // Operand map: %0-%3 = D (outputs), %4-%7 = A, %8-%9 = B, %10-%13 = C.
+    asm volatile(
+        "mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 "
+        "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+        : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
+        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
+          "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]));
+
+#else
+    // Fallback when CUTLASS_ARCH_MMA_SM80_ENABLED is not defined: no MMA is issued and d is not written.
+    CUTLASS_UNUSED(d);
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_NOT_IMPLEMENTED();
+
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Matrix Multiply 16816
+//
+////////////////////////////////////////////////////////////////////////////////
+
+/// Matrix multiply-add operation: F16 = F16 * F16 + F16, shape m16n8k16 (GemmShape<16, 8, 16>).
+template <>
+struct Mma<
+  gemm::GemmShape<16, 8, 16>,
+  32,
+  half_t,
+  layout::RowMajor,
+  half_t,
+  layout::ColumnMajor,
+  half_t,
+  layout::RowMajor,
+  OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<16, 8, 16>;
+
+  using ElementA = half_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<half_t, 8>;
+
+  using ElementB = half_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<half_t, 4>;
+
+  using ElementC = half_t;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<half_t, 4>;
+
+  using Operator = OpMultiplyAdd;
+  using ArchTag = arch::Sm80;
+
+  /// Computes d = a * b + c; a, b and c are only read, d is written through the asm outputs.
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+                  FragmentC const &c) const {
+
+#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
+  // All four fragments, including the half_t accumulators, are accessed as 32-bit words.
+  uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+  uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+  uint32_t const *C = reinterpret_cast<uint32_t const *>(&c);
+  uint32_t *D = reinterpret_cast<uint32_t *>(&d);
+  // Operand map: %0-%1 = D (outputs), %2-%5 = A, %6-%7 = B, %8-%9 = C.
+  asm volatile("mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%0,%1}, {%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n"
+      : "=r"(D[0]), "=r"(D[1])
+      : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]),
+        "r"(B[0]), "r"(B[1]),
+        "r"(C[0]), "r"(C[1])
+  );
+
+#else
+    // Fallback when CUTLASS_ARCH_MMA_SM80_ENABLED is not defined: no MMA is issued and d is not written.
+    CUTLASS_UNUSED(d);
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_NOT_IMPLEMENTED();
+
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Matrix multiply-add operation: F32 = bf16 * bf16 + F32, shape m16n8k16 (GemmShape<16, 8, 16>).
+template <>
+struct Mma<
+  gemm::GemmShape<16, 8, 16>,
+  32,
+  bfloat16_t,
+  layout::RowMajor,
+  bfloat16_t,
+  layout::ColumnMajor,
+  float,
+  layout::RowMajor,
+  OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<16, 8, 16>;
+
+  using ElementA = bfloat16_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<bfloat16_t, 8>;
+
+  using ElementB = bfloat16_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<bfloat16_t, 4>;
+
+  using ElementC = float;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<float, 4>;
+
+  using Operator = OpMultiplyAdd;
+  using ArchTag = arch::Sm80;
+
+  /// Computes d = a * b + c; a, b and c are only read, d is written through the asm outputs.
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c
+  ) const {
+
+#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
+    // A and B are accessed as 32-bit words, C and D as floats, to match the asm operand constraints.
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+    float const *C = reinterpret_cast<float const *>(&c);
+    float *D = reinterpret_cast<float *>(&d);
+    // Operand map: %0-%3 = D (outputs), %4-%7 = A, %8-%9 = B, %10-%13 = C.
+    asm volatile(
+        "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
+        "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+        : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
+        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
+          "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]));
+
+#else
+    // Fallback when CUTLASS_ARCH_MMA_SM80_ENABLED is not defined: no MMA is issued and d is not written.
+    CUTLASS_UNUSED(d);
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_NOT_IMPLEMENTED();
+
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Matrix multiply-add operation: F32 = F16 * F16 + F32, shape m16n8k16 (GemmShape<16, 8, 16>).
+template <>
+struct Mma<
+  gemm::GemmShape<16, 8, 16>,
+  32,
+  half_t,
+  layout::RowMajor,
+  half_t,
+  layout::ColumnMajor,
+  float,
+  layout::RowMajor,
+  OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<16, 8, 16>;
+
+  using ElementA = half_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<half_t, 8>;
+
+  using ElementB = half_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<half_t, 4>;
+
+  using ElementC = float;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<float, 4>;
+
+  using Operator = OpMultiplyAdd;
+  using ArchTag = arch::Sm80;
+
+  /// Computes d = a * b + c; a, b and c are only read, d is written through the asm outputs.
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c
+  ) const {
+
+#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
+    // A and B are accessed as 32-bit words, C and D as floats, to match the asm operand constraints.
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+    float const *C = reinterpret_cast<float const *>(&c);
+    float *D = reinterpret_cast<float *>(&d);
+    // Operand map: %0-%3 = D (outputs), %4-%7 = A, %8-%9 = B, %10-%13 = C.
+    asm volatile(
+        "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32  {%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, "
+        "{%10,%11,%12,%13};\n"
+        : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
+        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
+          "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]));
+
+#else
+    // Fallback when CUTLASS_ARCH_MMA_SM80_ENABLED is not defined: no MMA is issued and d is not written.
+    CUTLASS_UNUSED(d);
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_NOT_IMPLEMENTED();
+
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Matrix Multiply 884 - F64
+//
+////////////////////////////////////////////////////////////////////////////////
+
+/// Matrix multiply-add operation: F64 = F64 * F64 + F64, shape m8n8k4 (GemmShape<8,8,4>).
+template <>
+struct Mma<
+  gemm::GemmShape<8,8,4>,
+  32,
+  double,
+  layout::RowMajor,
+  double,
+  layout::ColumnMajor,
+  double,
+  layout::RowMajor,
+  OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<8,8,4>;
+
+  using ElementA = double;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<double, 1>;
+
+  using ElementB = double;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<double, 1>;
+
+  using ElementC = double;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<double, 2>;
+
+  using Operator = OpMultiplyAdd;
+
+  using ArchTag = arch::Sm80;
+  /// Computes d = a * b + c; a, b and c are only read, d is written through the asm outputs.
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+                  FragmentC const &c) const {
+
+#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
+  // The single-element A and B fragments are viewed as one double each; C and D as double arrays.
+  double const & A = reinterpret_cast<double const &>(a);
+  double const & B = reinterpret_cast<double const &>(b);
+
+  double const *C = reinterpret_cast<double const *>(&c);
+  double *D = reinterpret_cast<double *>(&d);
+  // Operand map: %0-%1 = D (outputs), %2 = A, %3 = B, %4-%5 = C.
+  asm volatile("mma.sync.aligned.m8n8k4.row.col.f64.f64.f64.f64 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
+      : "=d"(D[0]), "=d"(D[1])
+      : "d"(A), "d"(B), "d"(C[0]), "d"(C[1]));
+
+#else
+    // Fallback when CUTLASS_ARCH_MMA_SM80_ENABLED is not defined: no MMA is issued and d is not written.
+    CUTLASS_UNUSED(d);
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_NOT_IMPLEMENTED();
+    
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Matrix Multiply 16816 - S8 input, S32 accumulation - SATURATE
+//
+////////////////////////////////////////////////////////////////////////////////
+
+/// Matrix multiply-add operation: S32 = S8 * S8 + S32, saturating (.satfinite), shape m16n8k16.
+template <>
+struct Mma<
+  gemm::GemmShape<16,8,16>,
+  32,
+  int8_t,
+  layout::RowMajor,
+  int8_t,
+  layout::ColumnMajor,
+  int,
+  layout::RowMajor,
+  OpMultiplyAddSaturate> {
+
+  using Shape = gemm::GemmShape<16,8,16>;
+
+  using ElementA = int8_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<int8_t, 8>;
+
+  using ElementB = int8_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<int8_t, 4>;
+
+  using ElementC = int;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int, 4>;
+
+  using Operator = OpMultiplyAddSaturate;
+  using ArchTag = arch::Sm80;
+
+  /// Computes d = a * b + c; a, b and c are only read, d is written through the asm outputs.
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+                  FragmentC const &c) const {
+#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
+    // View A as 32-bit words and B as a single 32-bit word so they can be bound to asm operands.
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const &B = reinterpret_cast<uint32_t const &>(b);
+
+    int const *C = reinterpret_cast<int const *>(&c);
+    int *D = reinterpret_cast<int *>(&d);
+    // Operand map: %0-%3 = D (outputs), %4-%5 = A, %6 = B, %7-%10 = C.
+    asm volatile(
+        "mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite {%0,%1,%2,%3}, {%4,%5}, "
+        "{%6}, {%7,%8,%9,%10};\n"
+        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+        : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]),
+          "r"(C[3]));
+
+#else
+    // Fallback without CUTLASS_ARCH_MMA_SM80_ENABLED: no MMA is issued; mark the parameters as used.
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    assert(0);
+#endif
+  }
+};
+
+/// Matrix multiply-add operation: S32 = U8 * S8 + S32, saturating (.satfinite), shape m16n8k16.
+template <>
+struct Mma<
+  gemm::GemmShape<16,8,16>,
+  32,
+  uint8_t,
+  layout::RowMajor,
+  int8_t,
+  layout::ColumnMajor,
+  int,
+  layout::RowMajor,
+  OpMultiplyAddSaturate> {
+
+  using Shape = gemm::GemmShape<16,8,16>;
+
+  using ElementA = uint8_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<uint8_t, 8>;
+
+  using ElementB = int8_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<int8_t, 4>;
+
+  using ElementC = int;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int, 4>;
+
+  using Operator = OpMultiplyAddSaturate;
+  using ArchTag = arch::Sm80;
+
+  /// Computes d = a * b + c; a, b and c are only read, d is written through the asm outputs.
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+                  FragmentC const &c) const {
+#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
+    // View A as 32-bit words and B as a single 32-bit word so they can be bound to asm operands.
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const &B = reinterpret_cast<uint32_t const &>(b);
+
+    int const *C = reinterpret_cast<int const *>(&c);
+    int *D = reinterpret_cast<int *>(&d);
+    // Operand map: %0-%3 = D (outputs), %4-%5 = A, %6 = B, %7-%10 = C.
+    asm volatile(
+        "mma.sync.aligned.m16n8k16.row.col.s32.u8.s8.s32.satfinite {%0,%1,%2,%3}, {%4,%5}, "
+        "{%6}, {%7,%8,%9,%10};\n"
+        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+        : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]),
+          "r"(C[3]));
+
+#else
+    // Fallback without CUTLASS_ARCH_MMA_SM80_ENABLED: no MMA is issued; mark the parameters as used.
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    assert(0);
+#endif
+  }
+};
+
+/// Matrix multiply-add operation: S32 = S8 * U8 + S32, saturating (.satfinite), shape m16n8k16.
+template <>
+struct Mma<
+  gemm::GemmShape<16,8,16>,
+  32,
+  int8_t,
+  layout::RowMajor,
+  uint8_t,
+  layout::ColumnMajor,
+  int,
+  layout::RowMajor,
+  OpMultiplyAddSaturate> {
+
+  using Shape = gemm::GemmShape<16,8,16>;
+
+  using ElementA = int8_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<int8_t, 8>;
+
+  using ElementB = uint8_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<uint8_t, 4>;
+
+  using ElementC = int;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int, 4>;
+
+  using Operator = OpMultiplyAddSaturate;
+  using ArchTag = arch::Sm80;
+
+  /// Computes d = a * b + c; a, b and c are only read, d is written through the asm outputs.
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+                  FragmentC const &c) const {
+#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
+    // View A as 32-bit words and B as a single 32-bit word so they can be bound to asm operands.
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const &B = reinterpret_cast<uint32_t const &>(b);
+
+    int const *C = reinterpret_cast<int const *>(&c);
+    int *D = reinterpret_cast<int *>(&d);
+    // Operand map: %0-%3 = D (outputs), %4-%5 = A, %6 = B, %7-%10 = C.
+    asm volatile(
+        "mma.sync.aligned.m16n8k16.row.col.s32.s8.u8.s32.satfinite {%0,%1,%2,%3}, {%4,%5}, "
+        "{%6}, {%7,%8,%9,%10};\n"
+        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+        : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]),
+          "r"(C[3]));
+
+#else
+    // Fallback without CUTLASS_ARCH_MMA_SM80_ENABLED: no MMA is issued; mark the parameters as used.
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    assert(0);
+#endif
+  }
+};
+
+/// Matrix multiply-add operation: S32 = U8 * U8 + S32, saturating (.satfinite), shape m16n8k16.
+template <>
+struct Mma<
+  gemm::GemmShape<16,8,16>,
+  32,
+  uint8_t,
+  layout::RowMajor,
+  uint8_t,
+  layout::ColumnMajor,
+  int,
+  layout::RowMajor,
+  OpMultiplyAddSaturate> {
+
+  using Shape = gemm::GemmShape<16,8,16>;
+
+  using ElementA = uint8_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<uint8_t, 8>;
+
+  using ElementB = uint8_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<uint8_t, 4>;
+
+  using ElementC = int;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int, 4>;
+
+  using Operator = OpMultiplyAddSaturate;
+  using ArchTag = arch::Sm80;
+
+  /// Computes d = a * b + c; a, b and c are only read, d is written through the asm outputs.
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+                  FragmentC const &c) const {
+#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
+    // View A as 32-bit words and B as a single 32-bit word so they can be bound to asm operands.
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const &B = reinterpret_cast<uint32_t const &>(b);
+
+    int const *C = reinterpret_cast<int const *>(&c);
+    int *D = reinterpret_cast<int *>(&d);
+    // Operand map: %0-%3 = D (outputs), %4-%5 = A, %6 = B, %7-%10 = C.
+    asm volatile(
+        "mma.sync.aligned.m16n8k16.row.col.s32.u8.u8.s32.satfinite {%0,%1,%2,%3}, {%4,%5}, "
+        "{%6}, {%7,%8,%9,%10};\n"
+        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+        : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]),
+          "r"(C[3]));
+
+#else
+    // Fallback without CUTLASS_ARCH_MMA_SM80_ENABLED: no MMA is issued; mark the parameters as used.
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    assert(0);
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Matrix Multiply 16832 - S8 input, S32 accumulation - SATURATE
+//
+////////////////////////////////////////////////////////////////////////////////
+
+/// Matrix multiply-add operation: S32 = S8 * S8 + S32, saturating (.satfinite), shape m16n8k32.
+template <>
+struct Mma<
+  gemm::GemmShape<16,8,32>,
+  32,
+  int8_t,
+  layout::RowMajor,
+  int8_t,
+  layout::ColumnMajor,
+  int,
+  layout::RowMajor,
+  OpMultiplyAddSaturate> {
+
+  using Shape = gemm::GemmShape<16,8,32>;
+
+  using ElementA = int8_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<int8_t, 16>;
+
+  using ElementB = int8_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<int8_t, 8>;
+
+  using ElementC = int;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int, 4>;
+
+  using Operator = OpMultiplyAddSaturate;
+  using ArchTag = arch::Sm80;
+
+  /// Computes d = a * b + c; a, b and c are only read, d is written through the asm outputs.
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+                  FragmentC const &c) const {
+#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
+  // View A and B as 32-bit words so individual words can be bound to asm operands.
+  uint32_t const * A = reinterpret_cast<uint32_t const *>(&a);
+  uint32_t const * B = reinterpret_cast<uint32_t const *>(&b);
+
+  int const *C = reinterpret_cast<int const *>(&c);
+  int *D = reinterpret_cast<int *>(&d);
+  // Operand map: %0-%3 = D (outputs), %4-%7 = A, %8-%9 = B, %10-%13 = C.
+  asm volatile(
+      "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite {%0,%1,%2,%3}, "
+      "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+      : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+      : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
+        "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
+
+#else
+    // Fallback without CUTLASS_ARCH_MMA_SM80_ENABLED: no MMA is issued; mark the parameters as used.
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    assert(0);
+#endif
+  }
+};
+
+/// Matrix multiply-add operation: S32 = U8 * S8 + S32, saturating (.satfinite), shape m16n8k32.
+template <>
+struct Mma<
+  gemm::GemmShape<16,8,32>,
+  32,
+  uint8_t,
+  layout::RowMajor,
+  int8_t,
+  layout::ColumnMajor,
+  int,
+  layout::RowMajor,
+  OpMultiplyAddSaturate> {
+
+  using Shape = gemm::GemmShape<16,8,32>;
+
+  using ElementA = uint8_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<uint8_t, 16>;
+
+  using ElementB = int8_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<int8_t, 8>;
+
+  using ElementC = int;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int, 4>;
+
+  using Operator = OpMultiplyAddSaturate;
+  using ArchTag = arch::Sm80;
+
+  /// Computes d = a * b + c; a, b and c are only read, d is written through the asm outputs.
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+                  FragmentC const &c) const {
+#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
+    // View A and B as 32-bit words so individual words can be bound to asm operands.
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+
+    int const *C = reinterpret_cast<int const *>(&c);
+    int *D = reinterpret_cast<int *>(&d);
+    // Operand map: %0-%3 = D (outputs), %4-%7 = A, %8-%9 = B, %10-%13 = C.
+    asm volatile(
+        "mma.sync.aligned.m16n8k32.row.col.s32.u8.s8.s32.satfinite {%0,%1,%2,%3}, "
+        "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
+          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
+
+#else
+    // Fallback without CUTLASS_ARCH_MMA_SM80_ENABLED: no MMA is issued; mark the parameters as used.
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    assert(0);
+#endif
+  }
+};
+
+/// Matrix multiply-add operation: S32 = S8 * U8 + S32, saturating (.satfinite), shape m16n8k32.
+template <>
+struct Mma<
+  gemm::GemmShape<16,8,32>,
+  32,
+  int8_t,
+  layout::RowMajor,
+  uint8_t,
+  layout::ColumnMajor,
+  int,
+  layout::RowMajor,
+  OpMultiplyAddSaturate> {
+
+  using Shape = gemm::GemmShape<16,8,32>;
+
+  using ElementA = int8_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<int8_t, 16>;
+
+  using ElementB = uint8_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<uint8_t, 8>;
+
+  using ElementC = int;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int, 4>;
+
+  using Operator = OpMultiplyAddSaturate;
+  using ArchTag = arch::Sm80;
+
+  /// Computes d = a * b + c; a, b and c are only read, d is written through the asm outputs.
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+                  FragmentC const &c) const {
+#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
+    // View A and B as 32-bit words so individual words can be bound to asm operands.
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+
+    int const *C = reinterpret_cast<int const *>(&c);
+    int *D = reinterpret_cast<int *>(&d);
+    // Operand map: %0-%3 = D (outputs), %4-%7 = A, %8-%9 = B, %10-%13 = C.
+    asm volatile(
+        "mma.sync.aligned.m16n8k32.row.col.s32.s8.u8.s32.satfinite {%0,%1,%2,%3}, "
+        "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
+          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
+
+#else
+    // Fallback without CUTLASS_ARCH_MMA_SM80_ENABLED: no MMA is issued; mark the parameters as used.
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    assert(0);
+#endif
+  }
+};
+
+/// Matrix multiply-add operation: S32 = U8 * U8 + S32, saturating (.satfinite), shape m16n8k32.
+template <>
+struct Mma<
+  gemm::GemmShape<16,8,32>,
+  32,
+  uint8_t,
+  layout::RowMajor,
+  uint8_t,
+  layout::ColumnMajor,
+  int,
+  layout::RowMajor,
+  OpMultiplyAddSaturate> {
+
+  using Shape = gemm::GemmShape<16,8,32>;
+
+  using ElementA = uint8_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<uint8_t, 16>;
+
+  using ElementB = uint8_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<uint8_t, 8>;
+
+  using ElementC = int;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int, 4>;
+
+  using Operator = OpMultiplyAddSaturate;
+  using ArchTag = arch::Sm80;
+
+  /// Computes d = a * b + c; a, b and c are only read, d is written through the asm outputs.
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+                  FragmentC const &c) const {
+#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
+    // View A and B as 32-bit words so individual words can be bound to asm operands.
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+
+    int const *C = reinterpret_cast<int const *>(&c);
+    int *D = reinterpret_cast<int *>(&d);
+    // Operand map: %0-%3 = D (outputs), %4-%7 = A, %8-%9 = B, %10-%13 = C.
+    asm volatile(
+        "mma.sync.aligned.m16n8k32.row.col.s32.u8.u8.s32.satfinite {%0,%1,%2,%3}, "
+        "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
+          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
+
+#else
+    // Fallback without CUTLASS_ARCH_MMA_SM80_ENABLED: no MMA is issued; mark the parameters as used.
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    assert(0);
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Matrix Multiply 16864 - S4 input, S32 accumulation - SATURATE
+//
+////////////////////////////////////////////////////////////////////////////////
+
+/// Matrix multiply-add operation: S32 = S4 * S4 + S32, saturating (.satfinite), shape m16n8k64.
+template <>
+struct Mma<
+  gemm::GemmShape<16, 8, 64>,
+  32,
+  cutlass::int4b_t,
+  layout::RowMajor,
+  cutlass::int4b_t,
+  layout::ColumnMajor,
+  int,
+  layout::RowMajor,
+  OpMultiplyAddSaturate> {
+
+  using Shape = gemm::GemmShape<16, 8, 64>;
+
+  using ElementA = cutlass::int4b_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<cutlass::int4b_t, 32>;
+
+  using ElementB = cutlass::int4b_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<cutlass::int4b_t, 16>;
+
+  using ElementC = int;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int, 4>;
+
+  using Operator = OpMultiplyAddSaturate;
+  using ArchTag = arch::Sm80;
+
+  /// Computes d = a * b + c; a, b and c are only read, d is written through the asm outputs.
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c
+  ) const {
+
+#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
+  // View A and B as 32-bit words so individual words can be bound to asm operands.
+  uint32_t const * A = reinterpret_cast<uint32_t const *>(&a);
+  uint32_t const * B = reinterpret_cast<uint32_t const *>(&b);
+
+  int const *C = reinterpret_cast<int const *>(&c);
+  int *D = reinterpret_cast<int *>(&d);
+  // Operand map: %0-%3 = D (outputs), %4-%7 = A, %8-%9 = B, %10-%13 = C.
+  asm volatile(
+      "mma.sync.aligned.m16n8k64.row.col.s32.s4.s4.s32.satfinite {%0,%1,%2,%3}, "
+      "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+      : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+      : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
+        "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
+
+#else  // CUTLASS_ARCH_MMA_SM80_ENABLED not defined: no MMA is issued and d is not written
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    assert(0);
+#endif
+  }
+};
+
+/// Matrix multiply-add operation: S32 = U4 * S4 + S32, saturating (.satfinite), shape m16n8k64.
+template <>
+struct Mma<
+  gemm::GemmShape<16, 8, 64>,
+  32,
+  cutlass::uint4b_t,
+  layout::RowMajor,
+  cutlass::int4b_t,
+  layout::ColumnMajor,
+  int,
+  layout::RowMajor,
+  OpMultiplyAddSaturate> {
+
+  using Shape = gemm::GemmShape<16, 8, 64>;
+
+  using ElementA = cutlass::uint4b_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<cutlass::uint4b_t, 32>;
+
+  using ElementB = cutlass::int4b_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<cutlass::int4b_t, 16>;
+
+  using ElementC = int;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int, 4>;
+
+  using Operator = OpMultiplyAddSaturate;
+  using ArchTag = arch::Sm80;
+
+  /// Computes d = a * b + c; a, b and c are only read, d is written through the asm outputs.
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c
+  ) const {
+
+#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
+    // View A and B as 32-bit words so individual words can be bound to asm operands.
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+
+    int const *C = reinterpret_cast<int const *>(&c);
+    int *D = reinterpret_cast<int *>(&d);
+    // Operand map: %0-%3 = D (outputs), %4-%7 = A, %8-%9 = B, %10-%13 = C.
+    asm volatile(
+        "mma.sync.aligned.m16n8k64.row.col.s32.u4.s4.s32.satfinite {%0,%1,%2,%3}, "
+        "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
+          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
+
+#else  // CUTLASS_ARCH_MMA_SM80_ENABLED not defined: no MMA is issued and d is not written
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    assert(0);
+#endif
+  }
+};
+
+/// Matrix multiply-add operation: S32 = S4 * U4 + S32, saturating (.satfinite), shape m16n8k64.
+template <>
+struct Mma<
+  gemm::GemmShape<16, 8, 64>,
+  32,
+  cutlass::int4b_t,
+  layout::RowMajor,
+  cutlass::uint4b_t,
+  layout::ColumnMajor,
+  int,
+  layout::RowMajor,
+  OpMultiplyAddSaturate> {
+
+  using Shape = gemm::GemmShape<16, 8, 64>;
+
+  using ElementA = cutlass::int4b_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<cutlass::int4b_t, 32>;
+
+  using ElementB = cutlass::uint4b_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<cutlass::uint4b_t, 16>;
+
+  using ElementC = int;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int, 4>;
+
+  using Operator = OpMultiplyAddSaturate;
+  using ArchTag = arch::Sm80;
+
+  /// Computes d = a * b + c; a, b and c are only read, d is written through the asm outputs.
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c
+  ) const {
+
+#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
+    // View A and B as 32-bit words so individual words can be bound to asm operands.
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+
+    int const *C = reinterpret_cast<int const *>(&c);
+    int *D = reinterpret_cast<int *>(&d);
+    // Operand map: %0-%3 = D (outputs), %4-%7 = A, %8-%9 = B, %10-%13 = C.
+    asm volatile(
+        "mma.sync.aligned.m16n8k64.row.col.s32.s4.u4.s32.satfinite {%0,%1,%2,%3}, "
+        "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
+          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
+
+#else  // CUTLASS_ARCH_MMA_SM80_ENABLED not defined: no MMA is issued and d is not written
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    assert(0);
+#endif
+  }
+};
+
+/// Matrix multiply-add operation: S32 = U4 * U4 + S32, saturating (.satfinite), shape m16n8k64.
+template <>
+struct Mma<
+  gemm::GemmShape<16, 8, 64>,
+  32,
+  cutlass::uint4b_t,
+  layout::RowMajor,
+  cutlass::uint4b_t,
+  layout::ColumnMajor,
+  int,
+  layout::RowMajor,
+  OpMultiplyAddSaturate> {
+
+  using Shape = gemm::GemmShape<16, 8, 64>;
+
+  using ElementA = cutlass::uint4b_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<cutlass::uint4b_t, 32>;
+
+  using ElementB = cutlass::uint4b_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<cutlass::uint4b_t, 16>;
+
+  using ElementC = int;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int, 4>;
+
+  using Operator = OpMultiplyAddSaturate;
+  using ArchTag = arch::Sm80;
+
+  /// Computes d = a * b + c; a, b and c are only read, d is written through the asm outputs.
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c
+  ) const {
+
+#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
+    // View A and B as 32-bit words so individual words can be bound to asm operands.
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+
+    int const *C = reinterpret_cast<int const *>(&c);
+    int *D = reinterpret_cast<int *>(&d);
+    // Operand map: %0-%3 = D (outputs), %4-%7 = A, %8-%9 = B, %10-%13 = C.
+    asm volatile(
+        "mma.sync.aligned.m16n8k64.row.col.s32.u4.u4.s32.satfinite {%0,%1,%2,%3}, "
+        "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
+          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
+
+#else  // CUTLASS_ARCH_MMA_SM80_ENABLED not defined: no MMA is issued and d is not written
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    assert(0);
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Matrix Multiply 168256 - B1 input, S32 accumulation - AND,POPC
+//
+////////////////////////////////////////////////////////////////////////////////
+
+/// Matrix multiply-add operation: S32 = popc(B1 & B1) + S32 (and.popc), shape m16n8k256.
+template <>
+struct Mma<
+  gemm::GemmShape<16,8,256>,
+  32,
+  cutlass::uint1b_t,
+  layout::RowMajor,
+  cutlass::uint1b_t,
+  layout::ColumnMajor,
+  int32_t,
+  layout::RowMajor,
+  OpAndPopc> {
+
+  using Shape = gemm::GemmShape<16,8,256>;
+
+  using ElementA = cutlass::uint1b_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<cutlass::uint1b_t, 128>;
+
+  using ElementB = cutlass::uint1b_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<cutlass::uint1b_t, 64>;
+
+  using ElementC = int32_t;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int32_t, 4>;
+
+  using Operator = OpAndPopc;
+  using ArchTag = arch::Sm80;
+
+  /// Computes d from a, b and c with one and.popc MMA; a, b and c are only read, d is written.
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c
+  ) const {
+
+#if defined(CUTLASS_ARCH_MMA_B1_AND_SM80_ENABLED)
+    // View A and B as 32-bit words so individual words can be bound to asm operands.
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+
+    int const *C = reinterpret_cast<int const *>(&c);
+    int *D = reinterpret_cast<int *>(&d);
+    // Operand map: %0-%3 = D (outputs), %4-%7 = A, %8-%9 = B, %10-%13 = C.
+    asm volatile(
+        "mma.sync.aligned.m16n8k256.row.col.s32.b1.b1.s32.and.popc {%0,%1,%2,%3}, "
+        "{%4,%5,%6,%7}, "
+        "{%8,%9}, {%10,%11,%12,%13};\n"
+        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
+          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
+
+#else  // CUTLASS_ARCH_MMA_B1_AND_SM80_ENABLED not defined: no MMA is issued and d is not written
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    assert(0);
+#endif
+  }
+};
+
+/// Matrix multiply-add operation: S32 = popc(B1 & B1) + S32; OpMultiplyAdd tag, and.popc instruction.
+template <>
+struct Mma<
+  gemm::GemmShape<16,8,256>,
+  32,
+  cutlass::uint1b_t,
+  layout::RowMajor,
+  cutlass::uint1b_t,
+  layout::ColumnMajor,
+  int,
+  layout::RowMajor,
+  OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<16,8,256>;
+
+  using ElementA = cutlass::uint1b_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<cutlass::uint1b_t, 128>;
+
+  using ElementB = cutlass::uint1b_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<cutlass::uint1b_t, 64>;
+
+  using ElementC = int32_t;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int32_t, 4>;
+
+  using Operator = OpMultiplyAdd;
+  using ArchTag = arch::Sm80;
+
+  /// Computes d from a, b and c with one and.popc MMA; a, b and c are only read, d is written.
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c
+  ) const {
+
+#if defined(CUTLASS_ARCH_MMA_B1_AND_SM80_ENABLED)
+    // View A and B as 32-bit words so individual words can be bound to asm operands.
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+
+    int const *C = reinterpret_cast<int const *>(&c);
+    int *D = reinterpret_cast<int *>(&d);
+    // Operand map: %0-%3 = D (outputs), %4-%7 = A, %8-%9 = B, %10-%13 = C.
+    asm volatile(
+        "mma.sync.aligned.m16n8k256.row.col.s32.b1.b1.s32.and.popc {%0,%1,%2,%3}, "
+        "{%4,%5,%6,%7}, "
+        "{%8,%9}, {%10,%11,%12,%13};\n"
+        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
+          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
+
+#else  // CUTLASS_ARCH_MMA_B1_AND_SM80_ENABLED not defined: no MMA is issued and d is not written
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    assert(0);
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Matrix Multiply 168256 - B1 input, S32 accumulation - XOR,POPC
+//
+////////////////////////////////////////////////////////////////////////////////
+
+/// Matrix multiply-add operation: S32 = popc(B1 ^ B1) + S32 (xor.popc), shape m16n8k256.
+template <>
+struct Mma<
+  gemm::GemmShape<16,8,256>,
+  32,
+  cutlass::uint1b_t,
+  layout::RowMajor,
+  cutlass::uint1b_t,
+  layout::ColumnMajor,
+  int,
+  layout::RowMajor,
+  OpXorPopc> {
+
+  using Shape = gemm::GemmShape<16,8,256>;
+
+  using ElementA = cutlass::uint1b_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<cutlass::uint1b_t, 128>;
+
+  using ElementB = cutlass::uint1b_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<cutlass::uint1b_t, 64>;
+
+  using ElementC = int;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int, 4>;
+
+  using Operator = OpXorPopc;
+  using ArchTag = arch::Sm80;
+
+  /// Computes d from a, b and c with one xor.popc MMA; a, b and c are only read, d is written.
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c
+  ) const {
+
+#if defined(CUTLASS_ARCH_MMA_B1_XOR_SM80_ENABLED)
+    // View A and B as 32-bit words so individual words can be bound to asm operands.
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+
+    int const *C = reinterpret_cast<int const *>(&c);
+    int *D = reinterpret_cast<int *>(&d);
+    // Operand map: %0-%3 = D (outputs), %4-%7 = A, %8-%9 = B, %10-%13 = C.
+    asm volatile(
+        "mma.sync.aligned.m16n8k256.row.col.s32.b1.b1.s32.xor.popc {%0,%1,%2,%3}, "
+        "{%4,%5,%6,%7}, "
+        "{%8,%9}, {%10,%11,%12,%13};\n"
+        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
+          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
+
+#else
+    // Fallback when CUTLASS_ARCH_MMA_B1_XOR_SM80_ENABLED is not defined: no MMA is issued and d is not written.
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    assert(0);
+
+#endif // defined(CUTLASS_ARCH_MMA_B1_XOR_SM80_ENABLED)
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace arch
+} // namespace cutlass
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm89.h b/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm89.h
new file mode 100755
index 000000000..fe4b7eb7e
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm89.h
@@ -0,0 +1,367 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Matrix multiply-accumulate specialized for SM89
+*/
+
+#pragma once
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/cassert>
+#else
+#include <assert.h>
+#endif
+
+#include "cutlass/cutlass.h"
+#include "mma.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/numeric_types.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+#if (__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 4)
+// Compiler gate: the SM89 MMA specializations are advertised for nvcc 12.4 or newer.
+#  define CUTLASS_ARCH_MMA_SM89_SUPPORTED 1
+#endif
+// Device gate: the PTX paths are compiled only in a pass where __CUDA_ARCH__ == 890.
+#if defined(CUTLASS_ARCH_MMA_SM89_SUPPORTED) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 890)
+#  define CUTLASS_ARCH_MMA_SM89_ENABLED
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace arch {
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+/// Compile-time predicate telling whether the Mma type `Operator` uses the SM89
+/// staged accumulation policy. It is true only when every condition below holds.
+///
+/// `Operator` must expose the nested names ElementA, ElementB, MathOperator and
+/// ArchMmaOperator::Shape, because all of them are named in the expression.
+template <class Operator>
+static constexpr bool is_sm89_staged_policy_v =
+  // (1) Operand A is stored in one of the two FP8 encodings.
+  (platform::is_same<typename Operator::ElementA, cutlass::float_e4m3_t>::value ||
+   platform::is_same<typename Operator::ElementA, cutlass::float_e5m2_t>::value) &&
+
+  // (2) Operand B is stored in one of the two FP8 encodings.
+  (platform::is_same<typename Operator::ElementB, cutlass::float_e4m3_t>::value ||
+   platform::is_same<typename Operator::ElementB, cutlass::float_e5m2_t>::value) &&
+
+  // (3) The underlying architecture instruction tile is exactly 16x8x32.
+  (Operator::ArchMmaOperator::Shape::kM == 16 &&
+   Operator::ArchMmaOperator::Shape::kN == 8 &&
+   Operator::ArchMmaOperator::Shape::kK == 32) &&
+
+  // (4) The math operator is the default OpMultiplyAdd.
+  platform::is_same<typename Operator::MathOperator, OpMultiplyAdd>::value;
+
+} // namespace detail
+
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Matrix Multiply 16832 - Float {E4M3, E5M2}, FP32 accumulation
+//
+////////////////////////////////////////////////////////////////////////////////
+
+/// SM89 tensor-core multiply-add, F32 = fe4m3 * fe4m3 + F32, issued as one
+/// m16n8k32 mma.sync instruction (A row-major, B column-major).
+template <typename Operator_>
+struct Mma<
+  gemm::GemmShape<16, 8, 32>,
+  32,
+  cutlass::float_e4m3_t,
+  layout::RowMajor,
+  cutlass::float_e4m3_t,
+  layout::ColumnMajor,
+  float,
+  layout::RowMajor,
+  Operator_> {
+
+  // Either accepted operator tag maps onto the same PTX instruction below.
+  static_assert(platform::is_same<Operator_, OpMultiplyAddFastAccum>::value ||
+                platform::is_same<Operator_, OpMultiplyAdd>::value,
+                "Invalid operator for SM89 FP8 instruction");
+
+  using Shape = gemm::GemmShape<16, 8, 32>;
+  using Operator = Operator_;
+  using ArchTag = arch::Sm89;
+
+  // A operand: 16 FP8 elements, passed to the instruction as four 32-bit registers.
+  using ElementA = cutlass::float_e4m3_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<ElementA, 16>;
+
+  // B operand: 8 FP8 elements, passed as two 32-bit registers.
+  using ElementB = cutlass::float_e4m3_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<ElementB, 8>;
+
+  // Accumulator (both input c and output d): four floats.
+  using ElementC = float;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<ElementC, 4>;
+
+  /// Computes d = a * b + c; c is only read and d is only written.
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+                  FragmentC const &c) const {
+#if defined(CUTLASS_ARCH_MMA_SM89_ENABLED)
+    // Reinterpret the fragments as the register words the PTX operands expect.
+    uint32_t const *a_regs = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *b_regs = reinterpret_cast<uint32_t const *>(&b);
+    float const *acc_in = reinterpret_cast<float const *>(&c);
+    float *acc_out = reinterpret_cast<float *>(&d);
+    asm(
+      "mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32 "
+      "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+      : "=f"(acc_out[0]), "=f"(acc_out[1]), "=f"(acc_out[2]), "=f"(acc_out[3])
+      : "r"(a_regs[0]), "r"(a_regs[1]), "r"(a_regs[2]), "r"(a_regs[3]),
+        "r"(b_regs[0]), "r"(b_regs[1]),
+        "f"(acc_in[0]), "f"(acc_in[1]), "f"(acc_in[2]), "f"(acc_in[3]));
+#else
+    // Reached when CUTLASS_ARCH_MMA_SM89_ENABLED is not defined for this pass.
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    CUTLASS_NOT_IMPLEMENTED();
+#endif
+  }
+};
+
+/// SM89 tensor-core multiply-add, F32 = fe4m3 * fe5m2 + F32, issued as one
+/// m16n8k32 mma.sync instruction (A row-major, B column-major).
+template <typename Operator_>
+struct Mma<
+  gemm::GemmShape<16, 8, 32>,
+  32,
+  cutlass::float_e4m3_t,
+  layout::RowMajor,
+  cutlass::float_e5m2_t,
+  layout::ColumnMajor,
+  float,
+  layout::RowMajor,
+  Operator_> {
+
+  // Either accepted operator tag maps onto the same PTX instruction below.
+  static_assert(platform::is_same<Operator_, OpMultiplyAddFastAccum>::value ||
+                platform::is_same<Operator_, OpMultiplyAdd>::value,
+                "Invalid operator for SM89 FP8 instruction");
+
+  using Shape = gemm::GemmShape<16, 8, 32>;
+  using Operator = Operator_;
+  using ArchTag = arch::Sm89;
+
+  // A operand: 16 E4M3 elements, passed to the instruction as four 32-bit registers.
+  using ElementA = cutlass::float_e4m3_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<ElementA, 16>;
+
+  // B operand: 8 E5M2 elements, passed as two 32-bit registers.
+  using ElementB = cutlass::float_e5m2_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<ElementB, 8>;
+
+  // Accumulator (both input c and output d): four floats.
+  using ElementC = float;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<ElementC, 4>;
+
+  /// Computes d = a * b + c; c is only read and d is only written.
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+                  FragmentC const &c) const {
+#if defined(CUTLASS_ARCH_MMA_SM89_ENABLED)
+    // Reinterpret the fragments as the register words the PTX operands expect.
+    uint32_t const *a_regs = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *b_regs = reinterpret_cast<uint32_t const *>(&b);
+    float const *acc_in = reinterpret_cast<float const *>(&c);
+    float *acc_out = reinterpret_cast<float *>(&d);
+    asm(
+      "mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e5m2.f32 "
+      "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+      : "=f"(acc_out[0]), "=f"(acc_out[1]), "=f"(acc_out[2]), "=f"(acc_out[3])
+      : "r"(a_regs[0]), "r"(a_regs[1]), "r"(a_regs[2]), "r"(a_regs[3]),
+        "r"(b_regs[0]), "r"(b_regs[1]),
+        "f"(acc_in[0]), "f"(acc_in[1]), "f"(acc_in[2]), "f"(acc_in[3]));
+#else
+    // Reached when CUTLASS_ARCH_MMA_SM89_ENABLED is not defined for this pass.
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    CUTLASS_NOT_IMPLEMENTED();
+#endif
+  }
+};
+
+/// SM89 tensor-core multiply-add, F32 = fe5m2 * fe4m3 + F32, issued as one
+/// m16n8k32 mma.sync instruction (A row-major, B column-major).
+template <typename Operator_>
+struct Mma<
+  gemm::GemmShape<16, 8, 32>,
+  32,
+  cutlass::float_e5m2_t,
+  layout::RowMajor,
+  cutlass::float_e4m3_t,
+  layout::ColumnMajor,
+  float,
+  layout::RowMajor,
+  Operator_> {
+
+  // Either accepted operator tag maps onto the same PTX instruction below.
+  static_assert(platform::is_same<Operator_, OpMultiplyAddFastAccum>::value ||
+                platform::is_same<Operator_, OpMultiplyAdd>::value,
+                "Invalid operator for SM89 FP8 instruction");
+
+  using Shape = gemm::GemmShape<16, 8, 32>;
+  using Operator = Operator_;
+  using ArchTag = arch::Sm89;
+
+  // A operand: 16 E5M2 elements, passed to the instruction as four 32-bit registers.
+  using ElementA = cutlass::float_e5m2_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<ElementA, 16>;
+
+  // B operand: 8 E4M3 elements, passed as two 32-bit registers.
+  using ElementB = cutlass::float_e4m3_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<ElementB, 8>;
+
+  // Accumulator (both input c and output d): four floats.
+  using ElementC = float;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<ElementC, 4>;
+
+  /// Computes d = a * b + c; c is only read and d is only written.
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+                  FragmentC const &c) const {
+#if defined(CUTLASS_ARCH_MMA_SM89_ENABLED)
+    // Reinterpret the fragments as the register words the PTX operands expect.
+    uint32_t const *a_regs = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *b_regs = reinterpret_cast<uint32_t const *>(&b);
+    float const *acc_in = reinterpret_cast<float const *>(&c);
+    float *acc_out = reinterpret_cast<float *>(&d);
+    asm(
+      "mma.sync.aligned.m16n8k32.row.col.f32.e5m2.e4m3.f32 "
+      "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+      : "=f"(acc_out[0]), "=f"(acc_out[1]), "=f"(acc_out[2]), "=f"(acc_out[3])
+      : "r"(a_regs[0]), "r"(a_regs[1]), "r"(a_regs[2]), "r"(a_regs[3]),
+        "r"(b_regs[0]), "r"(b_regs[1]),
+        "f"(acc_in[0]), "f"(acc_in[1]), "f"(acc_in[2]), "f"(acc_in[3]));
+#else
+    // Reached when CUTLASS_ARCH_MMA_SM89_ENABLED is not defined for this pass.
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    CUTLASS_NOT_IMPLEMENTED();
+#endif
+  }
+};
+
+/// SM89 tensor-core multiply-add, F32 = fe5m2 * fe5m2 + F32, issued as one
+/// m16n8k32 mma.sync instruction (A row-major, B column-major).
+template <typename Operator_>
+struct Mma<
+  gemm::GemmShape<16, 8, 32>,
+  32,
+  cutlass::float_e5m2_t,
+  layout::RowMajor,
+  cutlass::float_e5m2_t,
+  layout::ColumnMajor,
+  float,
+  layout::RowMajor,
+  Operator_> {
+
+  // Either accepted operator tag maps onto the same PTX instruction below.
+  static_assert(platform::is_same<Operator_, OpMultiplyAddFastAccum>::value ||
+                platform::is_same<Operator_, OpMultiplyAdd>::value,
+                "Invalid operator for SM89 FP8 instruction");
+
+  using Shape = gemm::GemmShape<16, 8, 32>;
+  using Operator = Operator_;
+  using ArchTag = arch::Sm89;
+
+  // A operand: 16 E5M2 elements, passed to the instruction as four 32-bit registers.
+  using ElementA = cutlass::float_e5m2_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<ElementA, 16>;
+
+  // B operand: 8 E5M2 elements, passed as two 32-bit registers.
+  using ElementB = cutlass::float_e5m2_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<ElementB, 8>;
+
+  // Accumulator (both input c and output d): four floats.
+  using ElementC = float;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<ElementC, 4>;
+
+  /// Computes d = a * b + c; c is only read and d is only written.
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+                  FragmentC const &c) const {
+#if defined(CUTLASS_ARCH_MMA_SM89_ENABLED)
+    // Reinterpret the fragments as the register words the PTX operands expect.
+    uint32_t const *a_regs = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *b_regs = reinterpret_cast<uint32_t const *>(&b);
+    float const *acc_in = reinterpret_cast<float const *>(&c);
+    float *acc_out = reinterpret_cast<float *>(&d);
+    asm(
+      "mma.sync.aligned.m16n8k32.row.col.f32.e5m2.e5m2.f32 "
+      "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+      : "=f"(acc_out[0]), "=f"(acc_out[1]), "=f"(acc_out[2]), "=f"(acc_out[3])
+      : "r"(a_regs[0]), "r"(a_regs[1]), "r"(a_regs[2]), "r"(a_regs[3]),
+        "r"(b_regs[0]), "r"(b_regs[1]),
+        "f"(acc_in[0]), "f"(acc_in[1]), "f"(acc_in[2]), "f"(acc_in[3]));
+#else
+    // Reached when CUTLASS_ARCH_MMA_SM89_ENABLED is not defined for this pass.
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    CUTLASS_NOT_IMPLEMENTED();
+#endif
+  }
+};
+
+} // namespace arch
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm90.h b/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm90.h
new file mode 100755
index 000000000..1183ee5e0
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm90.h
@@ -0,0 +1,245 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Matrix multiply-accumulate specialized for SM90 (FP64 mma.sync shapes)
+*/
+
+#pragma once
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/cassert>
+#else
+#include <assert.h>
+#endif
+
+#include "mma.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/config.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace arch {
+
+////////////////////////////////////////////////////////////////////////////////
+/// Matrix Multiply-Add 16x8x4 fp64
+////////////////////////////////////////////////////////////////////////////////
+
+/// SM90 FP64 multiply-add, F64 = F64 * F64 + F64, issued as one m16n8k4
+/// mma.sync instruction (A row-major, B column-major).
+template <>
+struct Mma<
+  gemm::GemmShape<16,8,4>,
+  32,
+  double,
+  layout::RowMajor,
+  double,
+  layout::ColumnMajor,
+  double,
+  layout::RowMajor,
+  OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<16,8,4>;
+  using Operator = OpMultiplyAdd;
+  using ArchTag = arch::Sm90;
+
+  // A operand: two doubles.
+  using ElementA = double;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<ElementA, 2>;
+
+  // B operand: one double.
+  using ElementB = double;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<ElementB, 1>;
+
+  // Accumulator (both input c and output d): four doubles.
+  using ElementC = double;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<ElementC, 4>;
+
+  /// Computes d = a * b + c; c is only read and d is only written.
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+                  FragmentC const &c) const {
+#if defined(CUTLASS_ARCH_MMA_SM90_F64_MMA_ENABLED)
+    // Expose each fragment as a plain array of doubles for the "d" constraints.
+    double const *a_regs = reinterpret_cast<double const *>(&a);
+    double const *b_regs = reinterpret_cast<double const *>(&b);
+    double const *acc_in = reinterpret_cast<double const *>(&c);
+    double *acc_out = reinterpret_cast<double *>(&d);
+    asm volatile("mma.sync.aligned.m16n8k4.row.col.f64.f64.f64.f64.rn {%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n"
+        : "=d"(acc_out[0]), "=d"(acc_out[1]), "=d"(acc_out[2]), "=d"(acc_out[3])
+        : "d"(a_regs[0]), "d"(a_regs[1]),
+          "d"(b_regs[0]),
+          "d"(acc_in[0]), "d"(acc_in[1]), "d"(acc_in[2]), "d"(acc_in[3]));
+#else
+    // Reached when CUTLASS_ARCH_MMA_SM90_F64_MMA_ENABLED is not defined.
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    CUTLASS_NOT_IMPLEMENTED();
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+/// Matrix Multiply-Add 16x8x8 fp64
+////////////////////////////////////////////////////////////////////////////////
+
+/// SM90 FP64 multiply-add, F64 = F64 * F64 + F64, issued as one m16n8k8
+/// mma.sync instruction (A row-major, B column-major).
+template <>
+struct Mma<
+  gemm::GemmShape<16,8,8>,
+  32,
+  double,
+  layout::RowMajor,
+  double,
+  layout::ColumnMajor,
+  double,
+  layout::RowMajor,
+  OpMultiplyAdd> {
+
+  using Shape = gemm::GemmShape<16,8,8>;
+
+  // A operand: four doubles.
+  using ElementA = double;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<double, 4>;
+
+  // B operand: two doubles.
+  using ElementB = double;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<double, 2>;
+
+  // Accumulator (both input c and output d): four doubles.
+  using ElementC = double;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<double, 4>;
+
+  using Operator = OpMultiplyAdd;
+  using ArchTag = arch::Sm90;
+
+  /// Computes d = a * b + c; c is only read and d is only written.
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+                  FragmentC const &c) const {
+#if defined(CUTLASS_ARCH_MMA_SM90_F64_MMA_ENABLED)
+    // Raw double views of the fragments; every asm operand goes through these.
+    double const *A = reinterpret_cast<double const *>(&a);
+    double const *B = reinterpret_cast<double const *>(&b);
+    double const *C = reinterpret_cast<double const *>(&c);
+    double *D = reinterpret_cast<double *>(&d);
+    asm volatile("mma.sync.aligned.m16n8k8.row.col.f64.f64.f64.f64 {%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n"
+        : "=d"(D[0]), "=d"(D[1]), "=d"(D[2]), "=d"(D[3])
+        : "d"(A[0]), "d"(A[1]), "d"(A[2]), "d"(A[3]),
+          "d"(B[0]), "d"(B[1]),
+          "d"(C[0]), "d"(C[1]), "d"(C[2]), "d"(C[3]));
+#else
+    // Reached when CUTLASS_ARCH_MMA_SM90_F64_MMA_ENABLED is not defined.
+    CUTLASS_UNUSED(d);
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_NOT_IMPLEMENTED();
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+/// Matrix Multiply-Add 16x8x16 fp64
+////////////////////////////////////////////////////////////////////////////////
+
+/// SM90 FP64 multiply-add, F64 = F64 * F64 + F64, issued as one m16n8k16
+/// mma.sync instruction (A row-major, B column-major).
+template <>
+struct Mma<
+  gemm::GemmShape<16,8,16>,
+  32,
+  double,
+  layout::RowMajor,
+  double,
+  layout::ColumnMajor,
+  double,
+  layout::RowMajor,
+  OpMultiplyAdd> {
+  using Shape = gemm::GemmShape<16,8,16>;
+  using Operator = OpMultiplyAdd;
+  using ArchTag = arch::Sm90;
+
+  // Fragment sizes: A holds eight doubles, B four, and the accumulator four.
+  using ElementA = double;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<double, 8>;
+
+  using ElementB = double;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<double, 4>;
+
+  using ElementC = double;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<double, 4>;
+
+  /// Computes d = a * b + c; c is only read and d is only written.
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+                  FragmentC const &c) const {
+#if defined(CUTLASS_ARCH_MMA_SM90_F64_MMA_ENABLED)
+    double const *A = reinterpret_cast<double const *>(&a);
+    double const *B = reinterpret_cast<double const *>(&b);
+    double const *C = reinterpret_cast<double const *>(&c);
+    double *D = reinterpret_cast<double *>(&d);
+    // Operands %4..%11 are A[0]..A[7] in order; each element is passed exactly once.
+    asm volatile("mma.sync.aligned.m16n8k16.row.col.f64.f64.f64.f64 {%0, %1, %2, %3}, {%4, %5, %6, %7, %8, %9, %10, %11}, {%12, %13, %14, %15}, {%16, %17, %18, %19};\n"
+        : "=d"(D[0]), "=d"(D[1]), "=d"(D[2]), "=d"(D[3])
+        : "d"(A[0]), "d"(A[1]), "d"(A[2]), "d"(A[3]), "d"(A[4]), "d"(A[5]), "d"(A[6]), "d"(A[7]),
+          "d"(B[0]), "d"(B[1]), "d"(B[2]), "d"(B[3]),
+          "d"(C[0]), "d"(C[1]), "d"(C[2]), "d"(C[3]));
+#else
+    CUTLASS_UNUSED(d);
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_NOT_IMPLEMENTED();
+#endif
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace arch
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/mma_sparse_sm80.h b/lightllm-kernel/cutlass/include/cutlass/arch/mma_sparse_sm80.h
new file mode 100755
index 000000000..7041d04dd
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/arch/mma_sparse_sm80.h
@@ -0,0 +1,1238 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Sparse matrix multiply accumulate for SM80
+*/
+
+#pragma once
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/cassert>
+#else
+#include <assert.h>
+#endif
+
+#include "mma.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/numeric_types.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if ((__CUDACC_VER_MAJOR__ > 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 1))
+// Compiler gate: sparse MMA support is advertised for nvcc 11.1 or newer.
+#define CUTLASS_ARCH_SPARSE_MMA_SM80_SUPPORTED 1
+// Device gate: the PTX paths are compiled only in a pass where __CUDA_ARCH__ >= 800.
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
+#define CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED
+#endif
+
+#endif
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace arch {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Sparse Matrix Multiply 16832
+//
+////////////////////////////////////////////////////////////////////////////////
+
+/// Sparse tensor-core multiply-add, F16 = F16 * F16 + F16, issued as one
+/// m16n8k32 mma.sp instruction (A row-major, B column-major).
+template <>
+struct SparseMma<
+  gemm::GemmShape<16, 8, 32>,
+  32,
+  half_t,
+  layout::RowMajor,
+  half_t,
+  layout::ColumnMajor,
+  half_t,
+  layout::RowMajor,
+  OpMultiplyAdd,
+  SPFormatType::Thread
+> {
+  using Shape = gemm::GemmShape<16, 8, 32>;
+
+  using ElementA = half_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<half_t, 8>;
+
+  using ElementB = half_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<half_t, 8>;
+
+  using ElementC = half_t;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<half_t, 4>;
+
+  // Type of the operand E that operator() forwards to the instruction.
+  using FragmentE = uint32_t;
+
+  using Operator = OpMultiplyAdd;
+  using ArchTag = arch::Sm80;
+
+  static int const kSparse = 2;
+  static int const kMetaSizeInBits = 2;
+  // Presumably the number of valid id2 values; operator() handles id2 == 0 and 1.
+  static int const kMaxID2 = 2;
+
+  /// Computes d = a * b + c. E is forwarded as the extra register operand and id2
+  /// (0 or 1) selects the trailing immediate; other id2 values only reach assert(0).
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+                  FragmentC const &c, uint32_t const &E, int const id2) const {
+#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED)
+  uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+  uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+  uint32_t const *C = reinterpret_cast<uint32_t const *>(&c);
+  uint32_t *D = reinterpret_cast<uint32_t *>(&d);
+  // nvcc 12.5+ takes the mma.sp::ordered_metadata spelling; older toolkits take mma.sp.
+#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5))
+  if (id2 == 0) {
+    asm volatile(
+        "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f16.f16.f16.f16 {%0,%1}, "
+        "{%2,%3,%4,%5}, {%6,%7,%8,%9}, {%10,%11}, %12, 0x0;\n"
+        : "=r"(D[0]), "=r"(D[1])
+        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
+          "r"(B[2]), "r"(B[3]), "r"(C[0]), "r"(C[1]), "r"(E));
+  }
+  else if (id2 == 1) {
+    asm volatile(
+        "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f16.f16.f16.f16 {%0,%1}, "
+        "{%2,%3,%4,%5}, {%6,%7,%8,%9}, {%10,%11}, %12, 0x1;\n"
+        : "=r"(D[0]), "=r"(D[1])
+        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
+          "r"(B[2]), "r"(B[3]), "r"(C[0]), "r"(C[1]), "r"(E));
+  }
+  else {
+    assert(0);
+  }
+#else
+  if (id2 == 0) {
+    asm volatile(
+        "mma.sp.sync.aligned.m16n8k32.row.col.f16.f16.f16.f16 {%0,%1}, "
+        "{%2,%3,%4,%5}, {%6,%7,%8,%9}, {%10,%11}, %12, 0x0;\n"
+        : "=r"(D[0]), "=r"(D[1])
+        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
+          "r"(B[2]), "r"(B[3]), "r"(C[0]), "r"(C[1]), "r"(E));
+  }
+  else if (id2 == 1) {
+    asm volatile(
+        "mma.sp.sync.aligned.m16n8k32.row.col.f16.f16.f16.f16 {%0,%1}, "
+        "{%2,%3,%4,%5}, {%6,%7,%8,%9}, {%10,%11}, %12, 0x1;\n"
+        : "=r"(D[0]), "=r"(D[1])
+        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
+          "r"(B[2]), "r"(B[3]), "r"(C[0]), "r"(C[1]), "r"(E));
+  }
+  else {
+    assert(0);
+  }
+#endif
+#else
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    CUTLASS_UNUSED(E);
+    CUTLASS_UNUSED(id2);
+    assert(0);
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Sparse tensor-core multiply-add, F32 = F16 * F16 + F32, issued as one
+/// m16n8k32 mma.sp instruction (A row-major, B column-major).
+template <>
+struct SparseMma<
+  gemm::GemmShape<16, 8, 32>,
+  32,
+  half_t,
+  layout::RowMajor,
+  half_t,
+  layout::ColumnMajor,
+  float,
+  layout::RowMajor,
+  OpMultiplyAdd,
+  SPFormatType::Thread
+  > {
+  using Shape = gemm::GemmShape<16, 8, 32>;
+
+  using ElementA = half_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<half_t, 8>;
+
+  using ElementB = half_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<half_t, 8>;
+
+  using ElementC = float;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<float, 4>;
+
+  // Type of the operand E that operator() forwards to the instruction.
+  using FragmentE = uint32_t;
+
+  using Operator = OpMultiplyAdd;
+  using ArchTag = arch::Sm80;
+
+  static int const kSparse = 2;
+  static int const kMetaSizeInBits = 2;
+  // Presumably the number of valid id2 values; operator() handles id2 == 0 and 1.
+  static int const kMaxID2 = 2;
+
+  /// Computes d = a * b + c. E is forwarded as the extra register operand and id2
+  /// (0 or 1) selects the trailing immediate; other id2 values only reach assert(0).
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+                  FragmentC const &c, uint32_t const &E, int const id2) const {
+#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED)
+  uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+  uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+  float const *C = reinterpret_cast<float const *>(&c);
+  float *D = reinterpret_cast<float *>(&d);
+
+  // nvcc 12.5+ takes the mma.sp::ordered_metadata spelling; older toolkits take mma.sp.
+#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5))
+  if (id2 == 0) {
+    asm volatile(
+        "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, "
+        "{%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
+        : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
+        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
+          "r"(B[2]), "r"(B[3]), "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]),
+          "r"(E));
+  }
+  else if (id2 == 1) {
+    asm volatile(
+        "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, "
+        "{%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x1;\n"
+        : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
+        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
+          "r"(B[2]), "r"(B[3]), "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]),
+          "r"(E));
+  }
+  else {
+    assert(0);
+  }
+#else
+  if (id2 == 0) {
+    asm volatile(
+        "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, "
+        "{%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
+        : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
+        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
+          "r"(B[2]), "r"(B[3]), "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]),
+          "r"(E));
+  }
+  else if (id2 == 1) {
+    asm volatile(
+        "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, "
+        "{%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x1;\n"
+        : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
+        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
+          "r"(B[2]), "r"(B[3]), "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]),
+          "r"(E));
+  }
+  else {
+    assert(0);
+  }
+#endif
+#else
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    CUTLASS_UNUSED(E);
+    CUTLASS_UNUSED(id2);
+    assert(0);
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Sparse Matrix Multiply 16832 - Float BF16, FP32 accumulation 
+//
+////////////////////////////////////////////////////////////////////////////////
+
+/// Sparse tensor-core multiply-add, F32 = bf16 * bf16 + F32, issued as one
+/// m16n8k32 mma.sp instruction (A row-major, B column-major).
+template <>
+struct SparseMma<gemm::GemmShape<16, 8, 32>, 32, bfloat16_t, layout::RowMajor,
+           bfloat16_t, layout::ColumnMajor, float, layout::RowMajor,
+           OpMultiplyAdd, SPFormatType::Thread> {
+  using Shape = gemm::GemmShape<16, 8, 32>;
+  using ElementA = bfloat16_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<bfloat16_t, 8>;
+
+  using ElementB = bfloat16_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<bfloat16_t, 8>;
+
+  using ElementC = float;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<float, 4>;
+
+  // Type of the operand E that operator() forwards to the instruction.
+  using FragmentE = uint32_t;
+
+  using Operator = OpMultiplyAdd;
+  using ArchTag = arch::Sm80;
+
+  static int const kSparse = 2;
+  static int const kMetaSizeInBits = 2;
+  // Presumably the number of valid id2 values; operator() handles id2 == 0 and 1.
+  static int const kMaxID2 = 2;
+
+  /// Computes d = a * b + c. E is forwarded as the extra register operand and id2
+  /// (0 or 1) selects the trailing immediate; other id2 values only reach assert(0).
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+                  FragmentC const &c, uint32_t const &E, int const id2) const {
+#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED)
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+    float const *C = reinterpret_cast<float const *>(&c);
+    float *D = reinterpret_cast<float *>(&d);
+    // nvcc 12.5+ takes the mma.sp::ordered_metadata spelling; older toolkits take mma.sp.
+#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5))
+    if (id2 == 0) {
+      asm volatile(
+          "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.bf16.bf16.f32 "
+          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
+          : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
+          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
+            "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E));
+    } else if (id2 == 1) {
+      asm volatile(
+          "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.bf16.bf16.f32 "
+          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x1;\n"
+          : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
+          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
+            "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E));
+    } else {
+      assert(0);
+    }
+#else
+    if (id2 == 0) {
+      asm volatile(
+          "mma.sp.sync.aligned.m16n8k32.row.col.f32.bf16.bf16.f32 "
+          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
+          : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
+          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
+            "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E));
+    } else if (id2 == 1) {
+      asm volatile(
+          "mma.sp.sync.aligned.m16n8k32.row.col.f32.bf16.bf16.f32 "
+          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x1;\n"
+          : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
+          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
+            "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E));
+    } else {
+      assert(0);
+    }
+#endif
+#else
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    CUTLASS_UNUSED(E);
+    CUTLASS_UNUSED(id2);
+    assert(0);
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Sparse Matrix Multiply 16816 - Float TF32
+//
+////////////////////////////////////////////////////////////////////////////////
+
+/// Sparse matrix multiply-add specialization: F32 = tf32 * tf32 + F32, issued as PTX mma.sp m16n8k16.
+template <>
+struct SparseMma<gemm::GemmShape<16, 8, 16>, 32, tfloat32_t, layout::RowMajor,
+           tfloat32_t, layout::ColumnMajor, float, layout::RowMajor,
+           OpMultiplyAdd, SPFormatType::Thread> {
+  using Shape = gemm::GemmShape<16, 8, 16>;
+  // A operand: row-major tfloat32_t, 4 elements per fragment (bound below as four 32-bit registers).
+  using ElementA = tfloat32_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<tfloat32_t, 4>;
+  // B operand: column-major tfloat32_t, 4 elements per fragment (bound below as four 32-bit registers).
+  using ElementB = tfloat32_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<tfloat32_t, 4>;
+  // Accumulator (input c and output d): row-major float, 4 elements per fragment.
+  using ElementC = float;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<float, 4>;
+  // Metadata operand E: a single 32-bit word handed to the instruction in one register.
+  using FragmentE = uint32_t;
+
+  using Operator = OpMultiplyAdd;
+  using ArchTag = arch::Sm80;
+  // NOTE(review): presumably the compression ratio of the sparse A operand - confirm against users.
+  static int const kSparse = 2;
+  // NOTE(review): presumably the width in bits of one metadata entry within E - confirm against users.
+  static int const kMetaSizeInBits = 4;
+  // Matches the id2 values handled by operator() below (0 and 1); any other value hits assert(0).
+  static int const kMaxID2 = 2;
+  /// Computes d = a * b + c using metadata E; id2 picks the instruction's trailing immediate (0x0 or 0x1).
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+                  FragmentC const &c, uint32_t const &E, int const id2) const {
+
+#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED)
+    // View the fragments as raw 32-bit words / floats so they can be bound to asm register operands.
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+    float const *C = reinterpret_cast<float const *>(&c);
+    float *D = reinterpret_cast<float *>(&d);
+    // nvcc 12.5 and newer emit the mma.sp::ordered_metadata spelling; older compilers emit plain mma.sp.
+#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5))
+    if (id2 == 0) {
+      asm volatile(
+          "mma.sp::ordered_metadata.sync.aligned.m16n8k16.row.col.f32.tf32.tf32.f32 "
+          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
+          : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
+          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), 
+            "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E));
+    } else if (id2 == 1) {
+      asm volatile(
+          "mma.sp::ordered_metadata.sync.aligned.m16n8k16.row.col.f32.tf32.tf32.f32 "
+          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x1;\n"
+          : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
+          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), 
+            "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E));
+    } else {
+      assert(0);
+    }
+#else
+    if (id2 == 0) {
+      asm volatile(
+          "mma.sp.sync.aligned.m16n8k16.row.col.f32.tf32.tf32.f32 "
+          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
+          : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
+          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), 
+            "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E));
+    } else if (id2 == 1) {
+      asm volatile(
+          "mma.sp.sync.aligned.m16n8k16.row.col.f32.tf32.tf32.f32 "
+          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x1;\n"
+          : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
+          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), 
+            "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E));
+    } else {
+      assert(0);
+    }
+#endif
+
+#else
+    // Sparse SM80 MMA is not enabled in this compilation: no MMA is issued and the call ends in assert(0).
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    assert(0);
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Sparse Matrix Multiply 16864 - S8/U8 input, S32 accumulation - SATURATE
+//
+////////////////////////////////////////////////////////////////////////////////
+
+/// Sparse saturating multiply-add specialization: S32 = S8 * S8 + S32 (PTX mma.sp m16n8k64 .satfinite).
+template <>
+struct SparseMma<
+  gemm::GemmShape<16,8,64>,
+  32,
+  int8_t,
+  layout::RowMajor,
+  int8_t,
+  layout::ColumnMajor,
+  int,
+  layout::RowMajor,
+  OpMultiplyAddSaturate,
+  SPFormatType::Thread> {
+
+  using Shape = gemm::GemmShape<16,8,64>;
+  // A operand: row-major int8_t, 16 elements per fragment (bound below as four 32-bit registers).
+  using ElementA = int8_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<int8_t, 16>;
+  // B operand: column-major int8_t, 16 elements per fragment (bound below as four 32-bit registers).
+  using ElementB = int8_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<int8_t, 16>;
+  // Accumulator (input c and output d): row-major int, 4 elements per fragment.
+  using ElementC = int;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int, 4>;
+  // Metadata operand E: a single 32-bit word handed to the instruction in one register.
+  using FragmentE = uint32_t;
+
+  using Operator = OpMultiplyAddSaturate;
+  using ArchTag = arch::Sm80;
+  // NOTE(review): presumably the compression ratio of the sparse A operand - confirm against users.
+  static int const kSparse = 2;
+  // NOTE(review): presumably the width in bits of one metadata entry within E - confirm against users.
+  static int const kMetaSizeInBits = 2;
+  // Matches the id2 values handled by operator() below (only 0); any other value hits assert(0).
+  static int const kMaxID2 = 1;
+  /// Computes d = a * b + c using metadata E; id2 must be 0, which selects the trailing immediate 0x0.
+  /// Without CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED no MMA is issued and the call ends in assert(0).
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c,
+    uint32_t const &E,
+    int const id2
+  ) const {
+
+#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED)
+    // View the fragments as raw 32-bit words / ints so they can be bound to asm register operands.
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+
+    int const *C = reinterpret_cast<int const *>(&c);
+    int *D = reinterpret_cast<int *>(&d);
+    // nvcc 12.5 and newer emit the mma.sp::ordered_metadata spelling; older compilers emit plain mma.sp.
+#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5))
+    if (id2 == 0) {
+      asm volatile(
+          "mma.sp::ordered_metadata.sync.aligned.m16n8k64.row.col.s32.s8.s8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
+          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
+          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
+            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
+    } else {
+      assert(0);
+    }
+#else
+    if (id2 == 0) {
+      asm volatile(
+          "mma.sp.sync.aligned.m16n8k64.row.col.s32.s8.s8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
+          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
+          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
+            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
+    } else {
+      assert(0);
+    }
+#endif
+
+#else
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    assert(0);
+#endif
+  }
+};
+
+/// Sparse saturating multiply-add specialization: S32 = S8 * U8 + S32 (PTX mma.sp m16n8k64 .satfinite).
+template <>
+struct SparseMma<
+  gemm::GemmShape<16,8,64>,
+  32,
+  int8_t,
+  layout::RowMajor,
+  uint8_t,
+  layout::ColumnMajor,
+  int,
+  layout::RowMajor,
+  OpMultiplyAddSaturate,
+  SPFormatType::Thread> {
+
+  using Shape = gemm::GemmShape<16,8,64>;
+  // A operand: row-major int8_t, 16 elements per fragment (bound below as four 32-bit registers).
+  using ElementA = int8_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<int8_t, 16>;
+  // B operand: column-major uint8_t, 16 elements per fragment (bound below as four 32-bit registers).
+  using ElementB = uint8_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<uint8_t, 16>;
+  // Accumulator (input c and output d): row-major int, 4 elements per fragment.
+  using ElementC = int;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int, 4>;
+  // Metadata operand E: a single 32-bit word handed to the instruction in one register.
+  using FragmentE = uint32_t;
+
+  using Operator = OpMultiplyAddSaturate;
+  using ArchTag = arch::Sm80;
+  // NOTE(review): presumably the compression ratio of the sparse A operand - confirm against users.
+  static int const kSparse = 2;
+  // NOTE(review): presumably the width in bits of one metadata entry within E - confirm against users.
+  static int const kMetaSizeInBits = 2;
+  // Matches the id2 values handled by operator() below (only 0); any other value hits assert(0).
+  static int const kMaxID2 = 1;
+  /// Computes d = a * b + c using metadata E; id2 must be 0, which selects the trailing immediate 0x0.
+  /// Without CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED no MMA is issued and the call ends in assert(0).
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c,
+    uint32_t const &E,
+    int const id2
+  ) const {
+
+#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED)
+    // View the fragments as raw 32-bit words / ints so they can be bound to asm register operands.
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+
+    int const *C = reinterpret_cast<int const *>(&c);
+    int *D = reinterpret_cast<int *>(&d);
+    // nvcc 12.5 and newer emit the mma.sp::ordered_metadata spelling; older compilers emit plain mma.sp.
+#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5))
+    if (id2 == 0) {
+      asm volatile(
+          "mma.sp::ordered_metadata.sync.aligned.m16n8k64.row.col.s32.s8.u8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
+          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
+          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
+            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
+    } else {
+      assert(0);
+    }
+#else
+    if (id2 == 0) {
+      asm volatile(
+          "mma.sp.sync.aligned.m16n8k64.row.col.s32.s8.u8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
+          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
+          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
+            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
+    } else {
+      assert(0);
+    }
+#endif
+
+#else
+
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    assert(0);
+#endif
+  }
+};
+
+/// Sparse saturating multiply-add specialization: S32 = U8 * S8 + S32 (PTX mma.sp m16n8k64 .satfinite).
+template <>
+struct SparseMma<
+  gemm::GemmShape<16,8,64>,
+  32,
+  uint8_t,
+  layout::RowMajor,
+  int8_t,
+  layout::ColumnMajor,
+  int,
+  layout::RowMajor,
+  OpMultiplyAddSaturate,
+  SPFormatType::Thread> {
+
+  using Shape = gemm::GemmShape<16,8,64>;
+  // A operand: row-major uint8_t, 16 elements per fragment (bound below as four 32-bit registers).
+  using ElementA = uint8_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<uint8_t, 16>;
+  // B operand: column-major int8_t, 16 elements per fragment (bound below as four 32-bit registers).
+  using ElementB = int8_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<int8_t, 16>;
+  // Accumulator (input c and output d): row-major int, 4 elements per fragment.
+  using ElementC = int;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int, 4>;
+  // Metadata operand E: a single 32-bit word handed to the instruction in one register.
+  using FragmentE = uint32_t;
+
+  using Operator = OpMultiplyAddSaturate;
+  using ArchTag = arch::Sm80;
+  // NOTE(review): presumably the compression ratio of the sparse A operand - confirm against users.
+  static int const kSparse = 2;
+  // NOTE(review): presumably the width in bits of one metadata entry within E - confirm against users.
+  static int const kMetaSizeInBits = 2;
+  // Matches the id2 values handled by operator() below (only 0); any other value hits assert(0).
+  static int const kMaxID2 = 1;
+  /// Computes d = a * b + c using metadata E; id2 must be 0, which selects the trailing immediate 0x0.
+  /// Without CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED no MMA is issued and the call ends in assert(0).
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c,
+    uint32_t const &E,
+    int const id2
+  ) const {
+
+#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED)
+    // View the fragments as raw 32-bit words / ints so they can be bound to asm register operands.
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+
+    int const *C = reinterpret_cast<int const *>(&c);
+    int *D = reinterpret_cast<int *>(&d);
+    // nvcc 12.5 and newer emit the mma.sp::ordered_metadata spelling; older compilers emit plain mma.sp.
+#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5))
+    if (id2 == 0) {
+      asm volatile(
+          "mma.sp::ordered_metadata.sync.aligned.m16n8k64.row.col.s32.u8.s8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
+          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
+          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
+            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
+    } else {
+      assert(0);
+    }
+#else
+    if (id2 == 0) {
+      asm volatile(
+          "mma.sp.sync.aligned.m16n8k64.row.col.s32.u8.s8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
+          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
+          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
+            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
+    } else {
+      assert(0);
+    }
+#endif
+
+#else
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    assert(0);
+#endif
+  }
+};
+
+/// Sparse saturating multiply-add specialization: S32 = U8 * U8 + S32 (PTX mma.sp m16n8k64 .satfinite).
+template <>
+struct SparseMma<
+  gemm::GemmShape<16,8,64>,
+  32,
+  uint8_t,
+  layout::RowMajor,
+  uint8_t,
+  layout::ColumnMajor,
+  int,
+  layout::RowMajor,
+  OpMultiplyAddSaturate,
+  SPFormatType::Thread> {
+
+  using Shape = gemm::GemmShape<16,8,64>;
+  // A operand: row-major uint8_t, 16 elements per fragment (bound below as four 32-bit registers).
+  using ElementA = uint8_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<uint8_t, 16>;
+  // B operand: column-major uint8_t, 16 elements per fragment (bound below as four 32-bit registers).
+  using ElementB = uint8_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<uint8_t, 16>;
+  // Accumulator (input c and output d): row-major int, 4 elements per fragment.
+  using ElementC = int;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int, 4>;
+  // Metadata operand E: a single 32-bit word handed to the instruction in one register.
+  using FragmentE = uint32_t;
+
+  using Operator = OpMultiplyAddSaturate;
+  using ArchTag = arch::Sm80;
+  // NOTE(review): presumably the compression ratio of the sparse A operand - confirm against users.
+  static int const kSparse = 2;
+  // NOTE(review): presumably the width in bits of one metadata entry within E - confirm against users.
+  static int const kMetaSizeInBits = 2;
+  // Matches the id2 values handled by operator() below (only 0); any other value hits assert(0).
+  static int const kMaxID2 = 1;
+  /// Computes d = a * b + c using metadata E; id2 must be 0, which selects the trailing immediate 0x0.
+  /// Without CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED no MMA is issued and the call ends in assert(0).
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c,
+    uint32_t const &E,
+    int const id2
+  ) const {
+
+#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED)
+    // View the fragments as raw 32-bit words / ints so they can be bound to asm register operands.
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+
+    int const *C = reinterpret_cast<int const *>(&c);
+    int *D = reinterpret_cast<int *>(&d);
+    // nvcc 12.5 and newer emit the mma.sp::ordered_metadata spelling; older compilers emit plain mma.sp.
+#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5))
+    if (id2 == 0) {
+      asm volatile(
+          "mma.sp::ordered_metadata.sync.aligned.m16n8k64.row.col.s32.u8.u8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
+          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
+          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
+            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
+    } else {
+      assert(0);
+    }
+#else
+    if (id2 == 0) {
+      asm volatile(
+          "mma.sp.sync.aligned.m16n8k64.row.col.s32.u8.u8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
+          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
+          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
+            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
+    } else {
+      assert(0);
+    }
+#endif
+
+#else
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    assert(0);
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Sparse Matrix Multiply 168128 - S4/U4 input, S32 accumulation - SATURATE
+//
+////////////////////////////////////////////////////////////////////////////////
+
+/// Sparse saturating multiply-add specialization: S32 = S4 * S4 + S32 (PTX mma.sp m16n8k128 .satfinite).
+template <>
+struct SparseMma<
+  gemm::GemmShape<16,8,128>,
+  32,
+  cutlass::int4b_t,
+  layout::RowMajor,
+  cutlass::int4b_t,
+  layout::ColumnMajor,
+  int,
+  layout::RowMajor,
+  OpMultiplyAddSaturate,
+  SPFormatType::Thread> {
+
+  using Shape = gemm::GemmShape<16,8,128>;
+  // A operand: row-major int4b_t, 32 elements per fragment (bound below as four 32-bit registers).
+  using ElementA = cutlass::int4b_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<cutlass::int4b_t, 32>;
+  // B operand: column-major int4b_t, 32 elements per fragment (bound below as four 32-bit registers).
+  using ElementB = cutlass::int4b_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<cutlass::int4b_t, 32>;
+  // Accumulator (input c and output d): row-major int, 4 elements per fragment.
+  using ElementC = int;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int, 4>;
+  // Metadata operand E: a single 32-bit word handed to the instruction in one register.
+  using FragmentE = uint32_t;
+
+  using Operator = OpMultiplyAddSaturate;
+  using ArchTag = arch::Sm80;
+  // NOTE(review): presumably the compression ratio of the sparse A operand - confirm against users.
+  static int const kSparse = 2;
+  // NOTE(review): presumably the width in bits of one metadata entry within E - confirm against users.
+  static int const kMetaSizeInBits = 2;
+  // Matches the id2 values handled by operator() below (only 0); any other value hits assert(0).
+  static int const kMaxID2 = 1;
+  /// Computes d = a * b + c using metadata E; id2 must be 0, which selects the trailing immediate 0x0.
+  /// Without CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED no MMA is issued and the call ends in assert(0).
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c,
+    uint32_t const &E,
+    int const id2
+  ) const {
+
+#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED)
+    // View the fragments as raw 32-bit words / ints so they can be bound to asm register operands.
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+
+    int const *C = reinterpret_cast<int const *>(&c);
+    int *D = reinterpret_cast<int *>(&d);
+    // nvcc 12.5 and newer emit the mma.sp::ordered_metadata spelling; older compilers emit plain mma.sp.
+#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5))
+    if (id2 == 0) {
+      asm volatile(
+          "mma.sp::ordered_metadata.sync.aligned.m16n8k128.row.col.s32.s4.s4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
+          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
+          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
+            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
+    } else {
+      assert(0);
+    }
+#else
+    if (id2 == 0) {
+      asm volatile(
+          "mma.sp.sync.aligned.m16n8k128.row.col.s32.s4.s4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
+          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
+          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
+            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
+    } else {
+      assert(0);
+    }
+#endif
+
+#else
+
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    assert(0);
+#endif
+  }
+};
+
+/// Sparse saturating multiply-add specialization: S32 = S4 * U4 + S32 (PTX mma.sp m16n8k128 .satfinite).
+template <>
+struct SparseMma<
+  gemm::GemmShape<16,8,128>,
+  32,
+  cutlass::int4b_t,
+  layout::RowMajor,
+  cutlass::uint4b_t,
+  layout::ColumnMajor,
+  int,
+  layout::RowMajor,
+  OpMultiplyAddSaturate,
+  SPFormatType::Thread> {
+
+  using Shape = gemm::GemmShape<16,8,128>;
+  // A operand: row-major int4b_t, 32 elements per fragment (bound below as four 32-bit registers).
+  using ElementA = cutlass::int4b_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<cutlass::int4b_t, 32>;
+  // B operand: column-major uint4b_t, 32 elements per fragment (bound below as four 32-bit registers).
+  using ElementB = cutlass::uint4b_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<cutlass::uint4b_t, 32>;
+  // Accumulator (input c and output d): row-major int, 4 elements per fragment.
+  using ElementC = int;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int, 4>;
+  // Metadata operand E: a single 32-bit word handed to the instruction in one register.
+  using FragmentE = uint32_t;
+
+  using Operator = OpMultiplyAddSaturate;
+  using ArchTag = arch::Sm80;
+  // NOTE(review): presumably the compression ratio of the sparse A operand - confirm against users.
+  static int const kSparse = 2;
+  // NOTE(review): presumably the width in bits of one metadata entry within E - confirm against users.
+  static int const kMetaSizeInBits = 2;
+  // Matches the id2 values handled by operator() below (only 0); any other value hits assert(0).
+  static int const kMaxID2 = 1;
+  /// Computes d = a * b + c using metadata E; id2 must be 0, which selects the trailing immediate 0x0.
+  /// Without CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED no MMA is issued and the call ends in assert(0).
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c,
+    uint32_t const &E,
+    int const id2
+  ) const {
+
+#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED)
+    // View the fragments as raw 32-bit words / ints so they can be bound to asm register operands.
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+
+    int const *C = reinterpret_cast<int const *>(&c);
+    int *D = reinterpret_cast<int *>(&d);
+    // nvcc 12.5 and newer emit the mma.sp::ordered_metadata spelling; older compilers emit plain mma.sp.
+#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5))
+    if (id2 == 0) {
+      asm volatile(
+          "mma.sp::ordered_metadata.sync.aligned.m16n8k128.row.col.s32.s4.u4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
+          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
+          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
+            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
+    } else {
+      assert(0);
+    }
+#else
+    if (id2 == 0) {
+      asm volatile(
+          "mma.sp.sync.aligned.m16n8k128.row.col.s32.s4.u4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
+          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
+          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
+            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
+    } else {
+      assert(0);
+    }
+#endif
+
+#else
+
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    assert(0);
+#endif
+  }
+};
+
+/// Sparse saturating multiply-add specialization: S32 = U4 * S4 + S32 (PTX mma.sp m16n8k128 .satfinite).
+template <>
+struct SparseMma<
+  gemm::GemmShape<16,8,128>,
+  32,
+  cutlass::uint4b_t,
+  layout::RowMajor,
+  cutlass::int4b_t,
+  layout::ColumnMajor,
+  int,
+  layout::RowMajor,
+  OpMultiplyAddSaturate,
+  SPFormatType::Thread> {
+
+  using Shape = gemm::GemmShape<16,8,128>;
+  // A operand: row-major uint4b_t, 32 elements per fragment (bound below as four 32-bit registers).
+  using ElementA = cutlass::uint4b_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<cutlass::uint4b_t, 32>;
+  // B operand: column-major int4b_t, 32 elements per fragment (bound below as four 32-bit registers).
+  using ElementB = cutlass::int4b_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<cutlass::int4b_t, 32>;
+  // Accumulator (input c and output d): row-major int, 4 elements per fragment.
+  using ElementC = int;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int, 4>;
+  // Metadata operand E: a single 32-bit word handed to the instruction in one register.
+  using FragmentE = uint32_t;
+
+  using Operator = OpMultiplyAddSaturate;
+  using ArchTag = arch::Sm80;
+  // NOTE(review): presumably the compression ratio of the sparse A operand - confirm against users.
+  static int const kSparse = 2;
+  // NOTE(review): presumably the width in bits of one metadata entry within E - confirm against users.
+  static int const kMetaSizeInBits = 2;
+  // Matches the id2 values handled by operator() below (only 0); any other value hits assert(0).
+  static int const kMaxID2 = 1;
+  /// Computes d = a * b + c using metadata E; id2 must be 0, which selects the trailing immediate 0x0.
+  /// Without CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED no MMA is issued and the call ends in assert(0).
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c,
+    uint32_t const &E,
+    int const id2
+  ) const {
+
+#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED)
+    // View the fragments as raw 32-bit words / ints so they can be bound to asm register operands.
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+
+    int const *C = reinterpret_cast<int const *>(&c);
+    int *D = reinterpret_cast<int *>(&d);
+    // nvcc 12.5 and newer emit the mma.sp::ordered_metadata spelling; older compilers emit plain mma.sp.
+#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5))
+    if (id2 == 0) {
+      asm volatile(
+          "mma.sp::ordered_metadata.sync.aligned.m16n8k128.row.col.s32.u4.s4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
+          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
+          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
+            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
+    } else {
+      assert(0);
+    }
+#else
+    if (id2 == 0) {
+      asm volatile(
+          "mma.sp.sync.aligned.m16n8k128.row.col.s32.u4.s4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
+          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
+          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
+            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
+    } else {
+      assert(0);
+    }
+#endif
+
+#else
+
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    assert(0);
+#endif
+  }
+};
+
+/// Sparse saturating multiply-add specialization: S32 = U4 * U4 + S32 (PTX mma.sp m16n8k128 .satfinite).
+template <>
+struct SparseMma<
+  gemm::GemmShape<16,8,128>,
+  32,
+  cutlass::uint4b_t,
+  layout::RowMajor,
+  cutlass::uint4b_t,
+  layout::ColumnMajor,
+  int,
+  layout::RowMajor,
+  OpMultiplyAddSaturate,
+  SPFormatType::Thread> {
+
+  using Shape = gemm::GemmShape<16,8,128>;
+  // A operand: row-major uint4b_t, 32 elements per fragment (bound below as four 32-bit registers).
+  using ElementA = cutlass::uint4b_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<cutlass::uint4b_t, 32>;
+  // B operand: column-major uint4b_t, 32 elements per fragment (bound below as four 32-bit registers).
+  using ElementB = cutlass::uint4b_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<cutlass::uint4b_t, 32>;
+  // Accumulator (input c and output d): row-major int, 4 elements per fragment.
+  using ElementC = int;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int, 4>;
+  // Metadata operand E: a single 32-bit word handed to the instruction in one register.
+  using FragmentE = uint32_t;
+
+  using Operator = OpMultiplyAddSaturate;
+  using ArchTag = arch::Sm80;
+  // NOTE(review): presumably the compression ratio of the sparse A operand - confirm against users.
+  static int const kSparse = 2;
+  // NOTE(review): presumably the width in bits of one metadata entry within E - confirm against users.
+  static int const kMetaSizeInBits = 2;
+  // Matches the id2 values handled by operator() below (only 0); any other value hits assert(0).
+  static int const kMaxID2 = 1;
+  /// Computes d = a * b + c using metadata E; id2 must be 0, which selects the trailing immediate 0x0.
+  /// Without CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED no MMA is issued and the call ends in assert(0).
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c,
+    uint32_t const &E,
+    int const id2
+  ) const {
+
+#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED)
+    // View the fragments as raw 32-bit words / ints so they can be bound to asm register operands.
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+
+    int const *C = reinterpret_cast<int const *>(&c);
+    int *D = reinterpret_cast<int *>(&d);
+    // nvcc 12.5 and newer emit the mma.sp::ordered_metadata spelling; older compilers emit plain mma.sp.
+#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5))
+    if (id2 == 0) {
+      asm volatile(
+          "mma.sp::ordered_metadata.sync.aligned.m16n8k128.row.col.s32.u4.u4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
+          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
+          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
+            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
+    } else {
+      assert(0);
+    }
+#else
+    if (id2 == 0) {
+      asm volatile(
+          "mma.sp.sync.aligned.m16n8k128.row.col.s32.u4.u4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
+          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
+          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
+            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
+    } else {
+      assert(0);
+    }
+#endif
+
+#else
+
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    assert(0);
+#endif
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace arch
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/mma_sparse_sm89.h b/lightllm-kernel/cutlass/include/cutlass/arch/mma_sparse_sm89.h
new file mode 100755
index 000000000..c092df768
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/arch/mma_sparse_sm89.h
@@ -0,0 +1,409 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Sparse matrix multiply accumulate for SM89
+*/
+
+#pragma once
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/cassert>
+#else
+#include <assert.h>
+#endif
+
+#include "mma.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/numeric_types.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if (__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 4)
+// Toolchain gate: set when the CUDA compiler version is 12.4 or newer.
+#  define CUTLASS_ARCH_SPARSE_MMA_SM89_SUPPORTED 1
+#endif
+// Code-generation gate: set only in device passes where __CUDA_ARCH__ is exactly 890.
+#if defined(CUTLASS_ARCH_SPARSE_MMA_SM89_SUPPORTED) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 890)
+#  define CUTLASS_ARCH_SPARSE_MMA_SM89_ENABLED
+#endif
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace arch {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Sparse matrix multiply-add for SM89: F32 = fe4m3 * fe4m3 + F32 (A row-major, B column-major)
+template <typename Operator_>
+struct SparseMma<
+  gemm::GemmShape<16,8,64>,
+  32,
+  cutlass::float_e4m3_t,
+  layout::RowMajor,
+  cutlass::float_e4m3_t,
+  layout::ColumnMajor,
+  float,
+  layout::RowMajor,
+  Operator_,
+  SPFormatType::Thread> {
+  // Only OpMultiplyAdd and OpMultiplyAddFastAccum are accepted for Operator_.
+  static_assert(platform::is_same<Operator_, OpMultiplyAdd>::value ||
+                platform::is_same<Operator_, OpMultiplyAddFastAccum>::value,
+                "Invalid operator for SM89 FP8 instruction");
+  // Tile shape of one instruction; matches the m16n8k64 shape in the PTX below.
+  using Shape = gemm::GemmShape<16,8,64>;
+  // A fragment: 16 e4m3 values, reinterpreted below as four 32-bit words.
+  using ElementA = cutlass::float_e4m3_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<ElementA, 16>;
+  // B fragment: 16 e4m3 values, reinterpreted below as four 32-bit words.
+  using ElementB = cutlass::float_e4m3_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<ElementB, 16>;
+  // Accumulator fragment (c in, d out): four floats.
+  using ElementC = float;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<ElementC, 4>;
+  // Type of the E operand that is forwarded to mma.sp after the accumulators.
+  using FragmentE = uint32_t;
+
+  using Operator = Operator_;
+  using ArchTag = arch::Sm89;
+  // NOTE(review): kSparse and kMetaSizeInBits are not read inside this struct; presumably consumed by callers - confirm.
+  static int const kSparse = 2;
+
+  static int const kMetaSizeInBits = 2;
+  // operator() below accepts only id2 == 0 and asserts for any other value.
+  static int const kMaxID2 = 1;
+
+  /// Computes d = a * b + c with one mma.sp PTX instruction; asserts if id2 != 0 or if SM89 sparse MMA is not enabled
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c,
+    uint32_t const &E,
+    int const id2
+  ) const {
+    // The real implementation exists only when CUTLASS_ARCH_SPARSE_MMA_SM89_ENABLED is defined.
+#if defined(CUTLASS_ARCH_SPARSE_MMA_SM89_ENABLED)
+    // View the fragments as raw 32-bit words / floats so they can be bound to PTX operands.
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+
+    float const *C = reinterpret_cast<float const *>(&c);
+    float *D = reinterpret_cast<float *>(&d);
+      // Only id2 == 0 is implemented; the trailing PTX immediate is fixed at 0x0.
+      if (id2 == 0) {
+        asm volatile(
+            "mma.sp.sync.aligned.m16n8k64.row.col.f32.e4m3.e4m3.f32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
+            "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
+            : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
+            : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
+              "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E));
+      }
+      else {
+        assert(0);
+      }
+#else
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    assert(0);
+#endif
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Sparse matrix multiply-add for SM89: F32 = fe4m3 * fe5m2 + F32 (A row-major, B column-major)
+template <typename Operator_>
+struct SparseMma<
+  gemm::GemmShape<16,8,64>,
+  32,
+  cutlass::float_e4m3_t,
+  layout::RowMajor,
+  cutlass::float_e5m2_t,
+  layout::ColumnMajor,
+  float,
+  layout::RowMajor,
+  Operator_,
+  SPFormatType::Thread> {
+  // Only OpMultiplyAdd and OpMultiplyAddFastAccum are accepted for Operator_.
+  static_assert(platform::is_same<Operator_, OpMultiplyAdd>::value ||
+                platform::is_same<Operator_, OpMultiplyAddFastAccum>::value,
+                "Invalid operator for SM89 FP8 instruction");
+  // Tile shape of one instruction; matches the m16n8k64 shape in the PTX below.
+  using Shape = gemm::GemmShape<16,8,64>;
+  // A fragment: 16 e4m3 values, reinterpreted below as four 32-bit words.
+  using ElementA = cutlass::float_e4m3_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<ElementA, 16>;
+  // B fragment: 16 e5m2 values, reinterpreted below as four 32-bit words.
+  using ElementB = cutlass::float_e5m2_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<ElementB, 16>;
+  // Accumulator fragment (c in, d out): four floats.
+  using ElementC = float;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<ElementC, 4>;
+  // Type of the E operand that is forwarded to mma.sp after the accumulators.
+  using FragmentE = uint32_t;
+
+  using Operator = Operator_;
+  using ArchTag = arch::Sm89;
+  // NOTE(review): kSparse and kMetaSizeInBits are not read inside this struct; presumably consumed by callers - confirm.
+  static int const kSparse = 2;
+
+  static int const kMetaSizeInBits = 2;
+  // operator() below accepts only id2 == 0 and asserts for any other value.
+  static int const kMaxID2 = 1;
+
+  /// Computes d = a * b + c with one mma.sp PTX instruction; asserts if id2 != 0 or if SM89 sparse MMA is not enabled
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c,
+    uint32_t const &E,
+    int const id2
+  ) const {
+    // The real implementation exists only when CUTLASS_ARCH_SPARSE_MMA_SM89_ENABLED is defined.
+#if defined(CUTLASS_ARCH_SPARSE_MMA_SM89_ENABLED)
+    // View the fragments as raw 32-bit words / floats so they can be bound to PTX operands.
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+
+    float const *C = reinterpret_cast<float const *>(&c);
+    float *D = reinterpret_cast<float *>(&d);
+      // Only id2 == 0 is implemented; the trailing PTX immediate is fixed at 0x0.
+      if (id2 == 0) {
+        asm volatile(
+            "mma.sp.sync.aligned.m16n8k64.row.col.f32.e4m3.e5m2.f32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
+            "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
+            : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
+            : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
+              "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E));
+      }
+      else {
+        assert(0);
+      }
+#else
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    assert(0);
+#endif
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Sparse matrix multiply-add for SM89: F32 = fe5m2 * fe4m3 + F32 (A row-major, B column-major)
+template <typename Operator_>
+struct SparseMma<
+  gemm::GemmShape<16,8,64>,
+  32,
+  cutlass::float_e5m2_t,
+  layout::RowMajor,
+  cutlass::float_e4m3_t,
+  layout::ColumnMajor,
+  float,
+  layout::RowMajor,
+  Operator_,
+  SPFormatType::Thread> {
+  // Only OpMultiplyAdd and OpMultiplyAddFastAccum are accepted for Operator_.
+  static_assert(platform::is_same<Operator_, OpMultiplyAdd>::value ||
+                platform::is_same<Operator_, OpMultiplyAddFastAccum>::value,
+                "Invalid operator for SM89 FP8 instruction");
+  // Tile shape of one instruction; matches the m16n8k64 shape in the PTX below.
+  using Shape = gemm::GemmShape<16,8,64>;
+  // A fragment: 16 e5m2 values, reinterpreted below as four 32-bit words.
+  using ElementA = cutlass::float_e5m2_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<ElementA, 16>;
+  // B fragment: 16 e4m3 values, reinterpreted below as four 32-bit words.
+  using ElementB = cutlass::float_e4m3_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<ElementB, 16>;
+  // Accumulator fragment (c in, d out): four floats.
+  using ElementC = float;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<ElementC, 4>;
+  // Type of the E operand that is forwarded to mma.sp after the accumulators.
+  using FragmentE = uint32_t;
+
+  using Operator = Operator_;
+  using ArchTag = arch::Sm89;
+  // NOTE(review): kSparse and kMetaSizeInBits are not read inside this struct; presumably consumed by callers - confirm.
+  static int const kSparse = 2;
+
+  static int const kMetaSizeInBits = 2;
+  // operator() below accepts only id2 == 0 and asserts for any other value.
+  static int const kMaxID2 = 1;
+
+  /// Computes d = a * b + c with one mma.sp PTX instruction; asserts if id2 != 0 or if SM89 sparse MMA is not enabled
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c,
+    uint32_t const &E,
+    int const id2
+  ) const {
+    // The real implementation exists only when CUTLASS_ARCH_SPARSE_MMA_SM89_ENABLED is defined.
+#if defined(CUTLASS_ARCH_SPARSE_MMA_SM89_ENABLED)
+    // View the fragments as raw 32-bit words / floats so they can be bound to PTX operands.
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+
+    float const *C = reinterpret_cast<float const *>(&c);
+    float *D = reinterpret_cast<float *>(&d);
+      // Only id2 == 0 is implemented; the trailing PTX immediate is fixed at 0x0.
+      if (id2 == 0) {
+        asm volatile(
+            "mma.sp.sync.aligned.m16n8k64.row.col.f32.e5m2.e4m3.f32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
+            "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
+            : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
+            : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
+              "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E));
+      }
+      else {
+        assert(0);
+      }
+#else
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    assert(0);
+#endif
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Sparse matrix multiply-add for SM89: F32 = fe5m2 * fe5m2 + F32 (A row-major, B column-major)
+template <typename Operator_>
+struct SparseMma<
+  gemm::GemmShape<16,8,64>,
+  32,
+  cutlass::float_e5m2_t,
+  layout::RowMajor,
+  cutlass::float_e5m2_t,
+  layout::ColumnMajor,
+  float,
+  layout::RowMajor,
+  Operator_,
+  SPFormatType::Thread> {
+  // Only OpMultiplyAdd and OpMultiplyAddFastAccum are accepted for Operator_.
+  static_assert(platform::is_same<Operator_, OpMultiplyAdd>::value ||
+                platform::is_same<Operator_, OpMultiplyAddFastAccum>::value,
+                "Invalid operator for SM89 FP8 instruction");
+  // Tile shape of one instruction; matches the m16n8k64 shape in the PTX below.
+  using Shape = gemm::GemmShape<16,8,64>;
+  // A fragment: 16 e5m2 values, reinterpreted below as four 32-bit words.
+  using ElementA = cutlass::float_e5m2_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<ElementA, 16>;
+  // B fragment: 16 e5m2 values, reinterpreted below as four 32-bit words.
+  using ElementB = cutlass::float_e5m2_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<ElementB, 16>;
+  // Accumulator fragment (c in, d out): four floats.
+  using ElementC = float;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<ElementC, 4>;
+  // Type of the E operand that is forwarded to mma.sp after the accumulators.
+  using FragmentE = uint32_t;
+
+  using Operator = Operator_;
+  using ArchTag = arch::Sm89;
+  // NOTE(review): kSparse and kMetaSizeInBits are not read inside this struct; presumably consumed by callers - confirm.
+  static int const kSparse = 2;
+
+  static int const kMetaSizeInBits = 2;
+  // operator() below accepts only id2 == 0 and asserts for any other value.
+  static int const kMaxID2 = 1;
+
+  /// Computes d = a * b + c with one mma.sp PTX instruction; asserts if id2 != 0 or if SM89 sparse MMA is not enabled
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA const &a,
+    FragmentB const &b,
+    FragmentC const &c,
+    uint32_t const &E,
+    int const id2
+  ) const {
+    // The real implementation exists only when CUTLASS_ARCH_SPARSE_MMA_SM89_ENABLED is defined.
+#if defined(CUTLASS_ARCH_SPARSE_MMA_SM89_ENABLED)
+    // View the fragments as raw 32-bit words / floats so they can be bound to PTX operands.
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+
+    float const *C = reinterpret_cast<float const *>(&c);
+    float *D = reinterpret_cast<float *>(&d);
+      // Only id2 == 0 is implemented; the trailing PTX immediate is fixed at 0x0.
+      if (id2 == 0) {
+        asm volatile(
+            "mma.sp.sync.aligned.m16n8k64.row.col.f32.e5m2.e5m2.f32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
+            "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
+            : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
+            : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
+              "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E));
+      }
+      else {
+        assert(0);
+      }
+#else
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    assert(0);
+#endif
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace arch
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/reg_reconfig.h b/lightllm-kernel/cutlass/include/cutlass/arch/reg_reconfig.h
new file mode 100755
index 000000000..d2b434453
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/arch/reg_reconfig.h
@@ -0,0 +1,67 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+  \brief PTX for CTA Reconfiguration
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+// Unless already defined, set to 1 for CUDA 12+ device passes with __CUDA_ARCH__ >= 900 and __CUDA_ARCH_FEAT_SM90_ALL defined.
+#ifndef CUDA_CTA_RECONFIG_ACTIVATED
+  #if (__CUDACC_VER_MAJOR__ >= 12 && \
+    defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL))
+    #define CUDA_CTA_RECONFIG_ACTIVATED 1
+  #endif
+#endif
+
+namespace cutlass {
+namespace arch {
+/// Emits PTX `setmaxnreg.inc.sync.aligned.u32` with RegCount as an immediate; empty body when CUDA_CTA_RECONFIG_ACTIVATED is unset or 0.
+template<uint32_t RegCount>
+CUTLASS_DEVICE
+void warpgroup_reg_alloc(){
+#if CUDA_CTA_RECONFIG_ACTIVATED
+  asm volatile( "setmaxnreg.inc.sync.aligned.u32 %0;\n" : : "n"(RegCount) );
+#endif
+}
+/// Emits PTX `setmaxnreg.dec.sync.aligned.u32` with RegCount as an immediate; empty body when CUDA_CTA_RECONFIG_ACTIVATED is unset or 0.
+template<uint32_t RegCount>
+CUTLASS_DEVICE
+void warpgroup_reg_dealloc(){
+#if CUDA_CTA_RECONFIG_ACTIVATED
+  asm volatile( "setmaxnreg.dec.sync.aligned.u32 %0;\n" : : "n"(RegCount) );
+#endif
+}
+
+} // namespace arch
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/simd.h b/lightllm-kernel/cutlass/include/cutlass/arch/simd.h
new file mode 100755
index 000000000..3104746e5
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/arch/simd.h
@@ -0,0 +1,125 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates exposing SIMD operators
+*/
+
+#pragma once
+
+#include "../array.h"
+#include "../numeric_types.h"
+
+namespace cutlass {
+namespace arch {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+//
+// Element-wise operators
+//
+/// Element-wise product: returns d with d[i] = a[i] * b[i] for every i in [0, N).
+template <typename T, int N>
+CUTLASS_HOST_DEVICE
+Array<T, N> operator*(Array<T, N> const &a, Array<T, N> const &b) {
+  Array<T, N> d;
+  CUTLASS_PRAGMA_UNROLL
+  for (int i = 0; i < N; ++i) {
+    d[i] = a[i] * b[i];
+  }
+  return d;
+}
+/// Element-wise sum: returns d with d[i] = a[i] + b[i] for every i in [0, N).
+template <typename T, int N>
+CUTLASS_HOST_DEVICE
+Array<T, N> operator+(Array<T, N> const &a, Array<T, N> const &b) {
+  Array<T, N> d;
+  CUTLASS_PRAGMA_UNROLL
+  for (int i = 0; i < N; ++i) {
+    d[i] = a[i] + b[i];
+  }
+  return d;
+}
+/// Element-wise difference: returns d with d[i] = a[i] - b[i] for every i in [0, N).
+template <typename T, int N>
+CUTLASS_HOST_DEVICE
+Array<T, N> operator-(Array<T, N> const &a, Array<T, N> const &b) {
+  Array<T, N> d;
+  CUTLASS_PRAGMA_UNROLL
+  for (int i = 0; i < N; ++i) {
+    d[i] = a[i] - b[i];
+  }
+  return d;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+//
+// Multiply-accumulate operators
+//
+/// Element-wise multiply-accumulate: returns d with d[i] = a[i] * b[i] + c[i] for every i in [0, N).
+template <typename T, int N>
+CUTLASS_HOST_DEVICE
+Array<T, N> mac(Array<T, N> const &a, Array<T, N> const &b, Array<T, N> const &c) {
+  Array<T, N> d;
+  CUTLASS_PRAGMA_UNROLL
+  for (int i = 0; i < N; ++i) {
+    d[i] = a[i] * b[i] + c[i];
+  }
+  return d;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+//
+// Dot product operator
+//
+/// Dot product: returns accum + sum over i in [0, N) of a[i] * b[i]; both operands share the element type Element.
+template <typename Element, typename Accumulator, int N>
+CUTLASS_HOST_DEVICE
+Accumulator dot(Array<Element, N> const &a, Array<Element, N> const &b, Accumulator accum) {
+  CUTLASS_PRAGMA_UNROLL
+  for (int i = 0; i < N; ++i) {
+    accum += a[i] * b[i];
+  }
+  return accum;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace arch
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "simd_sm60.h"
+#include "simd_sm61.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/simd_sm60.h b/lightllm-kernel/cutlass/include/cutlass/arch/simd_sm60.h
new file mode 100755
index 000000000..6e1ef2044
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/arch/simd_sm60.h
@@ -0,0 +1,104 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates exposing SIMD operators for SM60
+*/
+
+#pragma once
+
+#include "simd.h"
+
+namespace cutlass {
+namespace arch {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+//
+// Element-wise operators - specialized for half_t x 2
+//
+/// Element-wise product of two half_t pairs: d[i] = a[i] * b[i] (same arithmetic as the generic operator*).
+template <>
+CUTLASS_HOST_DEVICE
+Array<half_t, 2> operator*(Array<half_t, 2> const &a, Array<half_t, 2> const &b) {
+  Array<half_t, 2> d;
+  for (int i = 0; i < 2; ++i) { d[i] = a[i] * b[i]; }
+  return d;
+}
+/// Element-wise sum of two half_t pairs: d[i] = a[i] + b[i] (same arithmetic as the generic operator+).
+template <>
+CUTLASS_HOST_DEVICE
+Array<half_t, 2> operator+(Array<half_t, 2> const &a, Array<half_t, 2> const &b) {
+  Array<half_t, 2> d;
+  for (int i = 0; i < 2; ++i) { d[i] = a[i] + b[i]; }
+  return d;
+}
+/// Element-wise difference of two half_t pairs: d[i] = a[i] - b[i] (same arithmetic as the generic operator-).
+template <>
+CUTLASS_HOST_DEVICE
+Array<half_t, 2> operator-(Array<half_t, 2> const &a, Array<half_t, 2> const &b) {
+  Array<half_t, 2> d;
+  for (int i = 0; i < 2; ++i) { d[i] = a[i] - b[i]; }
+  return d;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Multiply-accumulate specialized for half_t x 2: d[i] = a[i] * b[i] + c[i] (same arithmetic as the generic mac)
+template <>
+CUTLASS_HOST_DEVICE
+Array<half_t, 2> mac(Array<half_t, 2> const &a, Array<half_t, 2> const &b, Array<half_t, 2> const &c) {
+  Array<half_t, 2> d;
+  for (int i = 0; i < 2; ++i) { d[i] = a[i] * b[i] + c[i]; }
+  return d;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Dot product operator - specialized for half_t <- (half_t * half_t) x 2 + half_t; returns accum + a[0]*b[0] + a[1]*b[1]
+template <>
+CUTLASS_HOST_DEVICE
+half_t dot(Array<half_t, 2> const &a, Array<half_t, 2> const &b, half_t accum) {
+  for (int i = 0; i < 2; ++i) { accum += a[i] * b[i]; }
+  return accum;
+}
+
+/// Dot product operator - specialized for float <- (half_t * half_t) x 2 + float; each half_t product is widened to float before accumulation
+template <>
+CUTLASS_HOST_DEVICE
+float dot(Array<half_t, 2> const &a, Array<half_t, 2> const &b, float accum) {
+  for (int i = 0; i < 2; ++i) { accum += static_cast<float>(a[i] * b[i]); }
+  return accum;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace arch
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/simd_sm61.h b/lightllm-kernel/cutlass/include/cutlass/arch/simd_sm61.h
new file mode 100755
index 000000000..b783c943e
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/arch/simd_sm61.h
@@ -0,0 +1,147 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates exposing SIMD operators for SM61
+*/
+
+#pragma once
+
+#include "simd.h"
+
+namespace cutlass {
+namespace arch {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Dot product operator - specialized for int32_t <- (int8_t * int8_t) x 4 + int32_t; returns accum + sum of a[i] * b[i]
+template <>
+CUTLASS_HOST_DEVICE
+int32_t dot(Array<int8_t, 4> const &a, Array<int8_t, 4> const &b, int32_t accum) {
+  for (int i = 0; i < 4; ++i) { accum += a[i] * b[i]; }
+  return accum;
+}
+
+/// Dot product operator for int32_t <- (uint8_t * int8_t) x 4 + int32_t; returns accum + sum of a[i] * b[i]
+/// Plain overload, not `template <>`: the primary dot template uses one element type for both operands.
+CUTLASS_HOST_DEVICE
+int32_t dot(Array<uint8_t, 4> const &a, Array<int8_t, 4> const &b, int32_t accum) {
+  for (int i = 0; i < 4; ++i) { accum += a[i] * b[i]; }
+  return accum;
+}
+
+/// Dot product operator for int32_t <- (int8_t * uint8_t) x 4 + int32_t; returns accum + sum of a[i] * b[i]
+/// Plain overload, not `template <>`: the primary dot template uses one element type for both operands.
+CUTLASS_HOST_DEVICE
+int32_t dot(Array<int8_t, 4> const &a, Array<uint8_t, 4> const &b, int32_t accum) {
+  for (int i = 0; i < 4; ++i) { accum += a[i] * b[i]; }
+  return accum;
+}
+
+/// Dot product operator - specialized for int32_t <- (uint8_t * uint8_t) x 4 + int32_t; returns accum + sum of a[i] * b[i]
+template <>
+CUTLASS_HOST_DEVICE
+int32_t dot(Array<uint8_t, 4> const &a, Array<uint8_t, 4> const &b, int32_t accum) {
+  for (int i = 0; i < 4; ++i) { accum += a[i] * b[i]; }
+  return accum;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Dot product operator for int32_t <- (int16_t * int8_t) x 2 + int32_t; returns accum + sum of a[i] * b[i]
+/// Plain overload, not `template <>`: the primary dot template uses one element type for both operands.
+CUTLASS_HOST_DEVICE
+int32_t dot(Array<int16_t, 2> const &a, Array<int8_t, 2> const &b, int32_t accum) {
+  for (int i = 0; i < 2; ++i) { accum += a[i] * b[i]; }
+  return accum;
+}
+
+/// Dot product operator for int32_t <- (uint16_t * int8_t) x 2 + int32_t; returns accum + sum of a[i] * b[i]
+/// Plain overload, not `template <>`: the primary dot template uses one element type for both operands.
+CUTLASS_HOST_DEVICE
+int32_t dot(Array<uint16_t, 2> const &a, Array<int8_t, 2> const &b, int32_t accum) {
+  for (int i = 0; i < 2; ++i) { accum += a[i] * b[i]; }
+  return accum;
+}
+
+/// Dot product operator for int32_t <- (int16_t * uint8_t) x 2 + int32_t; returns accum + sum of a[i] * b[i]
+/// Plain overload, not `template <>`: the primary dot template uses one element type for both operands.
+CUTLASS_HOST_DEVICE
+int32_t dot(Array<int16_t, 2> const &a, Array<uint8_t, 2> const &b, int32_t accum) {
+  for (int i = 0; i < 2; ++i) { accum += a[i] * b[i]; }
+  return accum;
+}
+
+/// Dot product operator for int32_t <- (uint16_t * uint8_t) x 2 + int32_t; returns accum + sum of a[i] * b[i]
+/// Plain overload, not `template <>`: the primary dot template uses one element type for both operands.
+CUTLASS_HOST_DEVICE
+int32_t dot(Array<uint16_t, 2> const &a, Array<uint8_t, 2> const &b, int32_t accum) {
+  for (int i = 0; i < 2; ++i) { accum += a[i] * b[i]; }
+  return accum;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Dot product operator - specialized for int32_t <- (int16_t * int16_t) x 2 + int32_t; returns accum + sum of a[i] * b[i]
+template <>
+CUTLASS_HOST_DEVICE
+int32_t dot(Array<int16_t, 2> const &a, Array<int16_t, 2> const &b, int32_t accum) {
+  for (int i = 0; i < 2; ++i) { accum += a[i] * b[i]; }
+  return accum;
+}
+
+/// Dot product operator for int32_t <- (uint16_t * int16_t) x 2 + int32_t; returns accum + sum of a[i] * b[i]
+/// Plain overload, not `template <>`: the primary dot template uses one element type for both operands.
+CUTLASS_HOST_DEVICE
+int32_t dot(Array<uint16_t, 2> const &a, Array<int16_t, 2> const &b, int32_t accum) {
+  for (int i = 0; i < 2; ++i) { accum += a[i] * b[i]; }
+  return accum;
+}
+
+/// Dot product operator for int32_t <- (int16_t * uint16_t) x 2 + int32_t; returns accum + sum of a[i] * b[i]
+/// Plain overload, not `template <>`: the primary dot template uses one element type for both operands.
+CUTLASS_HOST_DEVICE
+int32_t dot(Array<int16_t, 2> const &a, Array<uint16_t, 2> const &b, int32_t accum) {
+  for (int i = 0; i < 2; ++i) { accum += a[i] * b[i]; }
+  return accum;
+}
+
+/// Dot product operator - specialized for int32_t <- (uint16_t * uint16_t) x 2 + int32_t; returns accum + sum of a[i] * b[i]
+template <>
+CUTLASS_HOST_DEVICE
+int32_t dot(Array<uint16_t, 2> const &a, Array<uint16_t, 2> const &b, int32_t accum) {
+  for (int i = 0; i < 2; ++i) { accum += a[i] * b[i]; }
+  return accum;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace arch
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/synclog.hpp b/lightllm-kernel/cutlass/include/cutlass/arch/synclog.hpp
new file mode 100755
index 000000000..ea683859a
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/arch/synclog.hpp
@@ -0,0 +1,1324 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Synchronization event logging for race condition debugging.
+*/
+
+#pragma once
+
+#include "cutlass/detail/helper_macros.hpp"
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/cstdint>
+#else
+#include <cstdint>
+#endif
+
+#if !defined(__CUDACC_RTC__)
+#include <mutex>
+#include <vector>
+#endif
+
+namespace cutlass {
+namespace arch {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if defined(CUTLASS_ENABLE_SYNCLOG)
+
+constexpr uint32_t synclog_cap = 1 << 26;  // capacity of one log buffer, counted in uint32_t words
+
+inline std::mutex synclog_mutex;  // held by synclog_setup() for its whole body
+inline std::vector<uint32_t*> synclog_buf_list;  // one cudaMalloc'd buffer per device, indexed by device ordinal
+#if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
+inline __device__ uint32_t* synclog_buf;  // device-side pointer to the buffer; word 0 is the atomic allocation counter
+#endif
+
+CUTLASS_DEVICE
+uint32_t* synclog_alloc(uint32_t n) {  // reserve n words in the log; nullptr when unavailable or full
+  #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
+  uint32_t* const log = synclog_buf;
+  if (!log) { return nullptr; }  // logging buffer was never installed
+  uint32_t const used = atomicAdd(log, n);  // word 0 is the running word count
+  if (used + n < synclog_cap) { return &log[used + 1]; }  // records start after the counter word
+  if (used >= synclog_cap) { atomicAdd(log, -n); }  // undo the reservation once the counter is already past capacity
+  #endif
+  return nullptr;
+}
+
+CUTLASS_DEVICE
+void synclog_emit_prefix(uint32_t* to, uint32_t header, uint32_t line) {  // writes the 10-word record prefix at `to`
+  #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
+  uint64_t now;  // filled from the %globaltimer special register
+  asm volatile (
+    "mov.u64 %0, %%globaltimer;\n"
+    : "=l"(now) :
+  );
+  to[0] = header;
+  to[1] = line;
+  to[2] = static_cast<uint32_t>(now);        // timestamp, low 32 bits
+  to[3] = static_cast<uint32_t>(now >> 32);  // timestamp, high 32 bits
+  to[4] = threadIdx.x;
+  to[5] = threadIdx.y;
+  to[6] = threadIdx.z;
+  to[7] = blockIdx.x;
+  to[8] = blockIdx.y;
+  to[9] = blockIdx.z;
+  #endif
+}
+
+constexpr uint32_t synclog_header_none = 0;  // header id 0; presumably marks "no event" - confirm against the log reader
+constexpr uint32_t synclog_length_prefix = 1 + 1 + 2 + 3 + 3;  // words written by synclog_emit_prefix: header, line, time(2), threadIdx(3), blockIdx(3)
+// Per-event triples below: compile-time enable flag, header id, record length in words (prefix + payload).
+constexpr bool     synclog_enable_syncthreads = true;
+constexpr uint32_t synclog_header_syncthreads = 1;
+constexpr uint32_t synclog_length_syncthreads = synclog_length_prefix + 0;
+
+constexpr bool     synclog_enable_syncwarp = true;
+constexpr uint32_t synclog_header_syncwarp = 2;
+constexpr uint32_t synclog_length_syncwarp = synclog_length_prefix + 0;
+
+constexpr bool     synclog_enable_named_barrier_arrive_and_wait = true;
+constexpr uint32_t synclog_header_named_barrier_arrive_and_wait = 3;
+constexpr uint32_t synclog_length_named_barrier_arrive_and_wait = synclog_length_prefix + 2;
+
+constexpr bool     synclog_enable_named_barrier_arrive = true;
+constexpr uint32_t synclog_header_named_barrier_arrive = 4;
+constexpr uint32_t synclog_length_named_barrier_arrive = synclog_length_prefix + 2;
+
+constexpr bool     synclog_enable_cluster_barrier_init = true;
+constexpr uint32_t synclog_header_cluster_barrier_init = 5;
+constexpr uint32_t synclog_length_cluster_barrier_init = synclog_length_prefix + 2;
+
+constexpr bool     synclog_enable_cluster_barrier_wait = true;
+constexpr uint32_t synclog_header_cluster_barrier_wait = 6;
+constexpr uint32_t synclog_length_cluster_barrier_wait = synclog_length_prefix + 4;
+
+constexpr bool     synclog_enable_cluster_barrier_test_wait = true;
+constexpr uint32_t synclog_header_cluster_barrier_test_wait = 7;
+constexpr uint32_t synclog_length_cluster_barrier_test_wait = synclog_length_prefix + 5;
+
+constexpr bool     synclog_enable_cluster_barrier_try_wait = true;
+constexpr uint32_t synclog_header_cluster_barrier_try_wait = 8;
+constexpr uint32_t synclog_length_cluster_barrier_try_wait = synclog_length_prefix + 4;
+
+constexpr bool     synclog_enable_cluster_barrier_arrive_cluster = true;
+constexpr uint32_t synclog_header_cluster_barrier_arrive_cluster = 9;
+constexpr uint32_t synclog_length_cluster_barrier_arrive_cluster = synclog_length_prefix + 5;
+
+constexpr bool     synclog_enable_cluster_barrier_arrive = true;
+constexpr uint32_t synclog_header_cluster_barrier_arrive = 10;
+constexpr uint32_t synclog_length_cluster_barrier_arrive = synclog_length_prefix + 3;
+
+constexpr bool     synclog_enable_cluster_barrier_invalidate = true;
+constexpr uint32_t synclog_header_cluster_barrier_invalidate = 11;
+constexpr uint32_t synclog_length_cluster_barrier_invalidate = synclog_length_prefix + 3;
+
+constexpr bool     synclog_enable_cluster_transaction_barrier_arrive_and_expect_tx = true;
+constexpr uint32_t synclog_header_cluster_transaction_barrier_arrive_and_expect_tx = 12;
+constexpr uint32_t synclog_length_cluster_transaction_barrier_arrive_and_expect_tx = synclog_length_prefix + 4;
+
+constexpr bool     synclog_enable_cluster_transaction_barrier_arrive_and_expect_tx_cluster = true;
+constexpr uint32_t synclog_header_cluster_transaction_barrier_arrive_and_expect_tx_cluster = 13;
+constexpr uint32_t synclog_length_cluster_transaction_barrier_arrive_and_expect_tx_cluster = synclog_length_prefix + 6;
+
+constexpr bool     synclog_enable_cluster_transaction_barrier_expect_transaction = true;
+constexpr uint32_t synclog_header_cluster_transaction_barrier_expect_transaction = 14;
+constexpr uint32_t synclog_length_cluster_transaction_barrier_expect_transaction = synclog_length_prefix + 4;
+
+constexpr bool     synclog_enable_cluster_transaction_barrier_complete_transaction = true;
+constexpr uint32_t synclog_header_cluster_transaction_barrier_complete_transaction = 15;
+constexpr uint32_t synclog_length_cluster_transaction_barrier_complete_transaction = synclog_length_prefix + 6;
+
+constexpr bool     synclog_enable_fence_barrier_init = true;
+constexpr uint32_t synclog_header_fence_barrier_init = 16;
+constexpr uint32_t synclog_length_fence_barrier_init = synclog_length_prefix + 0;
+
+constexpr bool     synclog_enable_fence_view_async_shared = true;
+constexpr uint32_t synclog_header_fence_view_async_shared = 17;
+constexpr uint32_t synclog_length_fence_view_async_shared = synclog_length_prefix + 0;
+
+constexpr bool     synclog_enable_cp_async_wait = true;
+constexpr uint32_t synclog_header_cp_async_wait = 18;
+constexpr uint32_t synclog_length_cp_async_wait = synclog_length_prefix + 1;
+
+constexpr bool     synclog_enable_cp_async_wait_all = true;
+constexpr uint32_t synclog_header_cp_async_wait_all = 19;
+constexpr uint32_t synclog_length_cp_async_wait_all = synclog_length_prefix + 0;
+
+constexpr bool     synclog_enable_cp_async_fence = true;
+constexpr uint32_t synclog_header_cp_async_fence = 20;
+constexpr uint32_t synclog_length_cp_async_fence = synclog_length_prefix + 0;
+
+constexpr bool     synclog_enable_cp_async_nan = true;
+constexpr uint32_t synclog_header_cp_async_nan = 21;
+constexpr uint32_t synclog_length_cp_async_nan = synclog_length_prefix + 4;
+
+constexpr bool     synclog_enable_cp_async_zfill = true;
+constexpr uint32_t synclog_header_cp_async_zfill = 22;
+constexpr uint32_t synclog_length_cp_async_zfill = synclog_length_prefix + 5;
+
+constexpr bool     synclog_enable_cp_async = true;
+constexpr uint32_t synclog_header_cp_async = 23;
+constexpr uint32_t synclog_length_cp_async = synclog_length_prefix + 5;
+
+constexpr bool     synclog_enable_tma_load = true;
+constexpr uint32_t synclog_header_tma_load = 24;
+constexpr uint32_t synclog_length_tma_load = synclog_length_prefix + 4;
+
+constexpr bool     synclog_enable_tma_store = true;
+constexpr uint32_t synclog_header_tma_store = 25;
+constexpr uint32_t synclog_length_tma_store = synclog_length_prefix + 3;
+
+constexpr bool     synclog_enable_tma_store_arrive = true;
+constexpr uint32_t synclog_header_tma_store_arrive = 26;
+constexpr uint32_t synclog_length_tma_store_arrive = synclog_length_prefix + 0;
+
+constexpr bool     synclog_enable_tma_store_wait = true;
+constexpr uint32_t synclog_header_tma_store_wait = 27;
+constexpr uint32_t synclog_length_tma_store_wait = synclog_length_prefix + 1;
+
+constexpr bool     synclog_enable_warpgroup_arrive = true;
+constexpr uint32_t synclog_header_warpgroup_arrive = 28;
+constexpr uint32_t synclog_length_warpgroup_arrive = synclog_length_prefix + 0;
+
+constexpr bool     synclog_enable_warpgroup_wait = true;
+constexpr uint32_t synclog_header_warpgroup_wait = 29;
+constexpr uint32_t synclog_length_warpgroup_wait = synclog_length_prefix + 1;
+
+constexpr bool     synclog_enable_warpgroup_commit_batch = true;
+constexpr uint32_t synclog_header_warpgroup_commit_batch = 30;
+constexpr uint32_t synclog_length_warpgroup_commit_batch = synclog_length_prefix + 0;
+
+constexpr bool     synclog_enable_wgmma_reg_smem = true;
+constexpr uint32_t synclog_header_wgmma_reg_smem = 31;
+constexpr uint32_t synclog_length_wgmma_reg_smem = synclog_length_prefix + 2;
+
+constexpr bool     synclog_enable_wgmma_smem_smem = true;
+constexpr uint32_t synclog_header_wgmma_smem_smem = 32;
+constexpr uint32_t synclog_length_wgmma_smem_smem = synclog_length_prefix + 4;
+
+constexpr bool     synclog_enable_cpasync_barrier_arrive = true;
+constexpr uint32_t synclog_header_cpasync_barrier_arrive = 33;
+constexpr uint32_t synclog_length_cpasync_barrier_arrive = synclog_length_prefix + 3;
+
+CUTLASS_DEVICE
+bool synclog_condition_emit() {  // true for lane 0 of each warp along x (y == z == 0) in block (0,0,0)
+  #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
+  bool const first_block = (blockIdx.x | blockIdx.y | blockIdx.z) == 0;
+  return first_block && (threadIdx.y | threadIdx.z) == 0 && threadIdx.x % NumThreadsPerWarp == 0;
+  #else
+  return false;
+  #endif
+}
+
+CUTLASS_DEVICE
+bool synclog_condition_print() {  // true only for thread (0,0,0) of block (0,0,0)
+  #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
+  bool const first_thread = (threadIdx.x | threadIdx.y | threadIdx.z) == 0;
+  return first_thread && (blockIdx.x | blockIdx.y | blockIdx.z) == 0;
+  #else
+  return false;
+  #endif
+}
+
+CUTLASS_DEVICE
+void synclog_print_prefix(char const* header, uint32_t at) {  // prints the common prefix of the record starting at word `at`
+  #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
+  uint32_t line = synclog_buf[at + 1];  // word at+0 is the header id; the caller supplies its name as `header`
+  uint32_t timeLo = synclog_buf[at + 2];
+  uint32_t timeHi = synclog_buf[at + 3];
+  uint32_t threadIdxX = synclog_buf[at + 4];
+  uint32_t threadIdxY = synclog_buf[at + 5];
+  uint32_t threadIdxZ = synclog_buf[at + 6];
+  uint32_t blockIdxX = synclog_buf[at + 7];
+  uint32_t blockIdxY = synclog_buf[at + 8];
+  uint32_t blockIdxZ = synclog_buf[at + 9];
+  printf(
+    "%s line=%u time=%llu thread=%u,%u,%u block=%u,%u,%u ",
+    header, line,
+    (unsigned long long)((uint64_t)timeHi << 32 | timeLo),  // %llu is 64-bit on every host ABI; %lu is only 32-bit on LLP64
+    threadIdxX, threadIdxY, threadIdxZ,
+    blockIdxX, blockIdxY, blockIdxZ
+  );
+  #endif
+}
+
+CUTLASS_DEVICE
+uint64_t synclog_mbarrier_bits(uint32_t smem_addr) {  // returns the 64-bit word loaded from shared-memory address smem_addr
+  uint64_t bits = 0;  // NOTE(review): the asm issues mbarrier.inval on smem_addr before the load - confirm this is safe for a live barrier
+  asm volatile (
+    "mbarrier.inval.shared::cta.b64 [%1];\n"
+    "ld.shared::cta.b64 %0, [%1];\n"
+    : "=l"(bits) : "r"(smem_addr)
+  );
+  return bits;  // unlike its neighbours, this function's body is not wrapped in an __NVCC__/clang-CUDA guard
+}
+
+CUTLASS_DEVICE
+void synclog_print_wgmma_desc(char const* str, uint32_t lo, uint32_t hi, char const* sep) {
+  CUTLASS_UNUSED(hi);  // only the low descriptor word is decoded
+  uint32_t const low14 = lo & 0x3FFFu;  // bits [13:0] of lo
+  printf("%s_smem_int_ptr=%u%s", str, low14 << 4, sep);  // printed value is the 14-bit field shifted left by 4
+}
+
+#endif // defined(CUTLASS_ENABLE_SYNCLOG)
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+inline void synclog_setup() {  // allocates (first call) and zeroes one log buffer per device; no-op without CUTLASS_ENABLE_SYNCLOG
+  #if defined(CUTLASS_ENABLE_SYNCLOG)
+  #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
+  std::scoped_lock guard(synclog_mutex);
+  auto die = [] () {  // any CUDA runtime failure is fatal
+    fprintf(stderr, "synclog_setup() failed\n");
+    std::terminate();
+  };
+  int saved_device = 0;  // restored at the end
+  if (cudaSuccess != cudaGetDevice(&saved_device)) {
+    die();
+  }
+  int num_devices = 0;
+  if (cudaSuccess != cudaGetDeviceCount(&num_devices)) {
+    die();
+  }
+  if (synclog_buf_list.empty()) {  // first call: allocate a buffer on every device
+    for (int dev = 0; dev < num_devices; ++dev) {
+      uint32_t* fresh = nullptr;
+      if (cudaSetDevice(dev) != cudaSuccess ||
+        cudaMalloc(&fresh, synclog_cap * sizeof(uint32_t)) != cudaSuccess) {
+        die();
+      }
+      synclog_buf_list.push_back(fresh);
+    }
+  }
+  for (int dev = 0; dev < num_devices; ++dev) {  // every call: clear the buffer and publish its address to the device symbol
+    uint32_t* dev_buf = synclog_buf_list.at(dev);
+    if (cudaSetDevice(dev) != cudaSuccess ||
+      cudaMemset(dev_buf, 0, synclog_cap * sizeof(uint32_t)) != cudaSuccess ||
+      cudaMemcpyToSymbol(synclog_buf, &dev_buf, sizeof(dev_buf)) != cudaSuccess) {
+      die();
+    }
+  }
+  if (cudaSetDevice(saved_device) != cudaSuccess) {
+    die();
+  }
+  #endif
+  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
+}
+
+CUTLASS_DEVICE
+void synclog_emit_syncthreads(uint32_t line) {  // appends a prefix-only "syncthreads" record
+  #if defined(CUTLASS_ENABLE_SYNCLOG)
+  if constexpr (!synclog_enable_syncthreads) return;
+  if (!synclog_condition_emit()) return;
+  uint32_t* const rec = synclog_alloc(synclog_length_syncthreads);
+  if (!rec) return;  // no buffer, or log full
+  synclog_emit_prefix(rec, synclog_header_syncthreads, line);
+  #else
+  CUTLASS_UNUSED(line);
+  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
+}
+
+CUTLASS_DEVICE
+void synclog_emit_syncwarp(uint32_t line) {  // appends a prefix-only "syncwarp" record
+  #if defined(CUTLASS_ENABLE_SYNCLOG)
+  if constexpr (!synclog_enable_syncwarp) return;
+  if (!synclog_condition_emit()) return;
+  uint32_t* const rec = synclog_alloc(synclog_length_syncwarp);
+  if (!rec) return;  // no buffer, or log full
+  synclog_emit_prefix(rec, synclog_header_syncwarp, line);
+  #else
+  CUTLASS_UNUSED(line);
+  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
+}
+
+CUTLASS_DEVICE
+void synclog_emit_named_barrier_arrive_and_wait(  // record payload: num_threads, barrier_id
+  uint32_t line,
+  uint32_t num_threads,
+  uint32_t barrier_id) {
+  #if defined(CUTLASS_ENABLE_SYNCLOG)
+  if constexpr (!synclog_enable_named_barrier_arrive_and_wait) return;
+  if (!synclog_condition_emit()) return;
+  uint32_t* const rec = synclog_alloc(synclog_length_named_barrier_arrive_and_wait);
+  if (!rec) return;  // no buffer, or log full
+  synclog_emit_prefix(rec, synclog_header_named_barrier_arrive_and_wait, line);
+  rec[synclog_length_prefix + 0] = num_threads;
+  rec[synclog_length_prefix + 1] = barrier_id;
+  #else
+  CUTLASS_UNUSED(line);
+  CUTLASS_UNUSED(num_threads);
+  CUTLASS_UNUSED(barrier_id);
+  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
+}
+
+CUTLASS_DEVICE
+void synclog_emit_named_barrier_arrive(  // record payload: num_threads, barrier_id
+  uint32_t line,
+  uint32_t num_threads,
+  uint32_t barrier_id) {
+  #if defined(CUTLASS_ENABLE_SYNCLOG)
+  if constexpr (!synclog_enable_named_barrier_arrive) return;
+  if (!synclog_condition_emit()) return;
+  uint32_t* const rec = synclog_alloc(synclog_length_named_barrier_arrive);
+  if (!rec) return;  // no buffer, or log full
+  synclog_emit_prefix(rec, synclog_header_named_barrier_arrive, line);
+  rec[synclog_length_prefix + 0] = num_threads;
+  rec[synclog_length_prefix + 1] = barrier_id;
+  #else
+  CUTLASS_UNUSED(line);
+  CUTLASS_UNUSED(num_threads);
+  CUTLASS_UNUSED(barrier_id);
+  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
+}
+
+CUTLASS_DEVICE
+void synclog_emit_cluster_barrier_init(  // record payload: smem_addr, arrive_count
+  uint32_t line,
+  uint32_t smem_addr,
+  uint32_t arrive_count) {
+  #if defined(CUTLASS_ENABLE_SYNCLOG)
+  if constexpr (!synclog_enable_cluster_barrier_init) return;
+  if (!synclog_condition_emit()) return;
+  uint32_t* const rec = synclog_alloc(synclog_length_cluster_barrier_init);
+  if (!rec) return;  // no buffer, or log full
+  synclog_emit_prefix(rec, synclog_header_cluster_barrier_init, line);
+  rec[synclog_length_prefix + 0] = smem_addr;
+  rec[synclog_length_prefix + 1] = arrive_count;
+  #else
+  CUTLASS_UNUSED(line);
+  CUTLASS_UNUSED(smem_addr);
+  CUTLASS_UNUSED(arrive_count);
+  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
+}
+
+CUTLASS_DEVICE
+void synclog_emit_cluster_barrier_wait(  // record payload: smem_addr, phase, barrier word (lo, hi)
+  uint32_t line,
+  uint32_t smem_addr,
+  uint32_t phase) {
+  #if defined(CUTLASS_ENABLE_SYNCLOG)
+  if constexpr (!synclog_enable_cluster_barrier_wait) return;
+  if (!synclog_condition_emit()) return;
+  uint64_t const mbar = synclog_mbarrier_bits(smem_addr);  // read before reserving log space, as in every barrier logger
+  uint32_t* const rec = synclog_alloc(synclog_length_cluster_barrier_wait);
+  if (!rec) return;  // no buffer, or log full
+  synclog_emit_prefix(rec, synclog_header_cluster_barrier_wait, line);
+  rec[synclog_length_prefix + 0] = smem_addr;
+  rec[synclog_length_prefix + 1] = phase;
+  rec[synclog_length_prefix + 2] = static_cast<uint32_t>(mbar);
+  rec[synclog_length_prefix + 3] = static_cast<uint32_t>(mbar >> 32);
+  #else
+  CUTLASS_UNUSED(line);
+  CUTLASS_UNUSED(smem_addr);
+  CUTLASS_UNUSED(phase);
+  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
+}
+
+CUTLASS_DEVICE
+void synclog_emit_cluster_barrier_test_wait(  // record payload: smem_addr, phase, pred, barrier word (lo, hi)
+  uint32_t line,
+  uint32_t smem_addr,
+  uint32_t phase,
+  uint32_t pred) {
+  #if defined(CUTLASS_ENABLE_SYNCLOG)
+  if constexpr (!synclog_enable_cluster_barrier_test_wait) return;
+  if (!synclog_condition_emit()) return;
+  uint64_t const mbar = synclog_mbarrier_bits(smem_addr);
+  uint32_t* const rec = synclog_alloc(synclog_length_cluster_barrier_test_wait);
+  if (!rec) return;  // no buffer, or log full
+  synclog_emit_prefix(rec, synclog_header_cluster_barrier_test_wait, line);
+  rec[synclog_length_prefix + 0] = smem_addr;
+  rec[synclog_length_prefix + 1] = phase;
+  rec[synclog_length_prefix + 2] = pred;
+  rec[synclog_length_prefix + 3] = static_cast<uint32_t>(mbar);
+  rec[synclog_length_prefix + 4] = static_cast<uint32_t>(mbar >> 32);
+  #else
+  CUTLASS_UNUSED(line);
+  CUTLASS_UNUSED(smem_addr);
+  CUTLASS_UNUSED(phase);
+  CUTLASS_UNUSED(pred);
+  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
+}
+
+CUTLASS_DEVICE
+void synclog_emit_cluster_barrier_try_wait(  // record payload: smem_addr, phase, barrier word (lo, hi)
+  uint32_t line,
+  uint32_t smem_addr,
+  uint32_t phase) {
+  #if defined(CUTLASS_ENABLE_SYNCLOG)
+  if constexpr (!synclog_enable_cluster_barrier_try_wait) return;
+  if (!synclog_condition_emit()) return;
+  uint64_t const mbar = synclog_mbarrier_bits(smem_addr);
+  uint32_t* const rec = synclog_alloc(synclog_length_cluster_barrier_try_wait);
+  if (!rec) return;  // no buffer, or log full
+  synclog_emit_prefix(rec, synclog_header_cluster_barrier_try_wait, line);
+  rec[synclog_length_prefix + 0] = smem_addr;
+  rec[synclog_length_prefix + 1] = phase;
+  rec[synclog_length_prefix + 2] = static_cast<uint32_t>(mbar);
+  rec[synclog_length_prefix + 3] = static_cast<uint32_t>(mbar >> 32);
+  #else
+  CUTLASS_UNUSED(line);
+  CUTLASS_UNUSED(smem_addr);
+  CUTLASS_UNUSED(phase);
+  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
+}
+
+CUTLASS_DEVICE
+void synclog_emit_cluster_barrier_arrive_cluster(  // record payload: smem_addr, cta_id, pred, barrier word (lo, hi)
+  uint32_t line,
+  uint32_t smem_addr,
+  uint32_t cta_id,
+  uint32_t pred) {
+  #if defined(CUTLASS_ENABLE_SYNCLOG)
+  if constexpr (!synclog_enable_cluster_barrier_arrive_cluster) return;
+  if (!synclog_condition_emit()) return;
+  uint64_t const mbar = synclog_mbarrier_bits(smem_addr);
+  uint32_t* const rec = synclog_alloc(synclog_length_cluster_barrier_arrive_cluster);
+  if (!rec) return;  // no buffer, or log full
+  synclog_emit_prefix(rec, synclog_header_cluster_barrier_arrive_cluster, line);
+  rec[synclog_length_prefix + 0] = smem_addr;
+  rec[synclog_length_prefix + 1] = cta_id;
+  rec[synclog_length_prefix + 2] = pred;
+  rec[synclog_length_prefix + 3] = static_cast<uint32_t>(mbar);
+  rec[synclog_length_prefix + 4] = static_cast<uint32_t>(mbar >> 32);
+  #else
+  CUTLASS_UNUSED(line);
+  CUTLASS_UNUSED(smem_addr);
+  CUTLASS_UNUSED(cta_id);
+  CUTLASS_UNUSED(pred);
+  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
+}
+
+CUTLASS_DEVICE
+void synclog_emit_cluster_barrier_arrive(  // record payload: smem_addr, barrier word (lo, hi)
+  uint32_t line,
+  uint32_t smem_addr) {
+  #if defined(CUTLASS_ENABLE_SYNCLOG)
+  if constexpr (!synclog_enable_cluster_barrier_arrive) return;
+  if (!synclog_condition_emit()) return;
+  uint64_t const mbar = synclog_mbarrier_bits(smem_addr);
+  uint32_t* const rec = synclog_alloc(synclog_length_cluster_barrier_arrive);
+  if (!rec) return;  // no buffer, or log full
+  synclog_emit_prefix(rec, synclog_header_cluster_barrier_arrive, line);
+  rec[synclog_length_prefix + 0] = smem_addr;
+  rec[synclog_length_prefix + 1] = static_cast<uint32_t>(mbar);
+  rec[synclog_length_prefix + 2] = static_cast<uint32_t>(mbar >> 32);
+  #else
+  CUTLASS_UNUSED(line);
+  CUTLASS_UNUSED(smem_addr);
+  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
+}
+
+CUTLASS_DEVICE
+void synclog_emit_cluster_barrier_invalidate(  // record payload: smem_addr, barrier word (lo, hi)
+  uint32_t line,
+  uint32_t smem_addr) {
+  #if defined(CUTLASS_ENABLE_SYNCLOG)
+  if constexpr (!synclog_enable_cluster_barrier_invalidate) return;
+  if (!synclog_condition_emit()) return;
+  uint64_t const mbar = synclog_mbarrier_bits(smem_addr);
+  uint32_t* const rec = synclog_alloc(synclog_length_cluster_barrier_invalidate);
+  if (!rec) return;  // no buffer, or log full
+  synclog_emit_prefix(rec, synclog_header_cluster_barrier_invalidate, line);
+  rec[synclog_length_prefix + 0] = smem_addr;
+  rec[synclog_length_prefix + 1] = static_cast<uint32_t>(mbar);
+  rec[synclog_length_prefix + 2] = static_cast<uint32_t>(mbar >> 32);
+  #else
+  CUTLASS_UNUSED(line);
+  CUTLASS_UNUSED(smem_addr);
+  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
+}
+
+CUTLASS_DEVICE
+void synclog_emit_cluster_transaction_barrier_arrive_and_expect_tx(  // payload: smem_addr, transaction_bytes, barrier word (lo, hi)
+  uint32_t line,
+  uint32_t smem_addr,
+  uint32_t transaction_bytes) {
+  #if defined(CUTLASS_ENABLE_SYNCLOG)
+  if constexpr (!synclog_enable_cluster_transaction_barrier_arrive_and_expect_tx) return;
+  if (!synclog_condition_emit()) return;
+  uint64_t const mbar = synclog_mbarrier_bits(smem_addr);
+  uint32_t* const rec = synclog_alloc(synclog_length_cluster_transaction_barrier_arrive_and_expect_tx);
+  if (!rec) return;  // no buffer, or log full
+  synclog_emit_prefix(rec, synclog_header_cluster_transaction_barrier_arrive_and_expect_tx, line);
+  rec[synclog_length_prefix + 0] = smem_addr;
+  rec[synclog_length_prefix + 1] = transaction_bytes;
+  rec[synclog_length_prefix + 2] = static_cast<uint32_t>(mbar);
+  rec[synclog_length_prefix + 3] = static_cast<uint32_t>(mbar >> 32);
+  #else
+  CUTLASS_UNUSED(line);
+  CUTLASS_UNUSED(smem_addr);
+  CUTLASS_UNUSED(transaction_bytes);
+  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
+}
+
+CUTLASS_DEVICE
+void synclog_emit_cluster_transaction_barrier_arrive_and_expect_tx_cluster(  // payload: 4 arguments + barrier word (lo, hi)
+  uint32_t line,
+  uint32_t smem_addr,
+  uint32_t transaction_bytes,
+  uint32_t cta_id,
+  uint32_t pred) {
+  #if defined(CUTLASS_ENABLE_SYNCLOG)
+  if constexpr (!synclog_enable_cluster_transaction_barrier_arrive_and_expect_tx_cluster) return;
+  if (!synclog_condition_emit()) return;
+  uint64_t const mbar = synclog_mbarrier_bits(smem_addr);
+  uint32_t* const rec = synclog_alloc(synclog_length_cluster_transaction_barrier_arrive_and_expect_tx_cluster);
+  if (!rec) return;  // no buffer, or log full
+  synclog_emit_prefix(rec, synclog_header_cluster_transaction_barrier_arrive_and_expect_tx_cluster, line);
+  rec[synclog_length_prefix + 0] = smem_addr;
+  rec[synclog_length_prefix + 1] = transaction_bytes;
+  rec[synclog_length_prefix + 2] = cta_id;
+  rec[synclog_length_prefix + 3] = pred;
+  rec[synclog_length_prefix + 4] = static_cast<uint32_t>(mbar);
+  rec[synclog_length_prefix + 5] = static_cast<uint32_t>(mbar >> 32);
+  #else
+  CUTLASS_UNUSED(line);
+  CUTLASS_UNUSED(smem_addr);
+  CUTLASS_UNUSED(transaction_bytes);
+  CUTLASS_UNUSED(cta_id);
+  CUTLASS_UNUSED(pred);
+  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
+}
+
+CUTLASS_DEVICE
+void synclog_emit_cluster_transaction_barrier_expect_transaction(  // payload: smem_addr, transaction_bytes, barrier word (lo, hi)
+  uint32_t line,
+  uint32_t smem_addr,
+  uint32_t transaction_bytes) {
+  #if defined(CUTLASS_ENABLE_SYNCLOG)
+  if constexpr (!synclog_enable_cluster_transaction_barrier_expect_transaction) return;
+  if (!synclog_condition_emit()) return;
+  uint64_t bits = synclog_mbarrier_bits(smem_addr);
+  uint32_t* to = synclog_alloc(synclog_length_cluster_transaction_barrier_expect_transaction);
+  if (to == nullptr) return;  // no buffer, or log full
+  synclog_emit_prefix(to, synclog_header_cluster_transaction_barrier_expect_transaction, line);
+  to[synclog_length_prefix + 0] = smem_addr;
+  to[synclog_length_prefix + 1] = transaction_bytes;
+  to[synclog_length_prefix + 2] = bits;        // low 32 bits of the barrier word
+  to[synclog_length_prefix + 3] = bits >> 32;  // high 32 bits go in slot 3 (was slot 2, which clobbered the low half)
+  #else
+  CUTLASS_UNUSED(line);
+  CUTLASS_UNUSED(smem_addr);
+  CUTLASS_UNUSED(transaction_bytes);
+  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
+}
+
+CUTLASS_DEVICE
+void synclog_emit_cluster_transaction_barrier_complete_transaction(  // payload: 4 arguments + barrier word (lo, hi)
+  uint32_t line,
+  uint32_t smem_addr,
+  uint32_t dst_cta_id,
+  uint32_t transaction_bytes,
+  uint32_t pred) {
+  #if defined(CUTLASS_ENABLE_SYNCLOG)
+  if constexpr (!synclog_enable_cluster_transaction_barrier_complete_transaction) return;
+  if (!synclog_condition_emit()) return;
+  uint64_t const mbar = synclog_mbarrier_bits(smem_addr);
+  uint32_t* const rec = synclog_alloc(synclog_length_cluster_transaction_barrier_complete_transaction);
+  if (!rec) return;  // no buffer, or log full
+  synclog_emit_prefix(rec, synclog_header_cluster_transaction_barrier_complete_transaction, line);
+  rec[synclog_length_prefix + 0] = smem_addr;
+  rec[synclog_length_prefix + 1] = dst_cta_id;
+  rec[synclog_length_prefix + 2] = transaction_bytes;
+  rec[synclog_length_prefix + 3] = pred;
+  rec[synclog_length_prefix + 4] = static_cast<uint32_t>(mbar);
+  rec[synclog_length_prefix + 5] = static_cast<uint32_t>(mbar >> 32);
+  #else
+  CUTLASS_UNUSED(line);
+  CUTLASS_UNUSED(smem_addr);
+  CUTLASS_UNUSED(dst_cta_id);
+  CUTLASS_UNUSED(transaction_bytes);
+  CUTLASS_UNUSED(pred);
+  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
+}
+
+CUTLASS_DEVICE
+void synclog_emit_fence_barrier_init(uint32_t line) {  // appends a prefix-only "fence_barrier_init" record
+  #if defined(CUTLASS_ENABLE_SYNCLOG)
+  if constexpr (!synclog_enable_fence_barrier_init) return;
+  if (!synclog_condition_emit()) return;
+  uint32_t* const rec = synclog_alloc(synclog_length_fence_barrier_init);
+  if (!rec) return;  // no buffer, or log full
+  synclog_emit_prefix(rec, synclog_header_fence_barrier_init, line);
+  #else
+  CUTLASS_UNUSED(line);
+  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
+}
+
+CUTLASS_DEVICE
+void synclog_emit_fence_view_async_shared(uint32_t line) {  // appends a prefix-only "fence_view_async_shared" record
+  #if defined(CUTLASS_ENABLE_SYNCLOG)
+  if constexpr (!synclog_enable_fence_view_async_shared) return;
+  if (!synclog_condition_emit()) return;
+  uint32_t* const rec = synclog_alloc(synclog_length_fence_view_async_shared);
+  if (!rec) return;  // no buffer, or log full
+  synclog_emit_prefix(rec, synclog_header_fence_view_async_shared, line);
+  #else
+  CUTLASS_UNUSED(line);
+  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
+}
+
+CUTLASS_DEVICE
+void synclog_emit_cp_async_wait(  // record payload: n
+  uint32_t line,
+  uint32_t n) {
+  #if defined(CUTLASS_ENABLE_SYNCLOG)
+  if constexpr (!synclog_enable_cp_async_wait) return;
+  if (!synclog_condition_emit()) return;
+  uint32_t* const rec = synclog_alloc(synclog_length_cp_async_wait);
+  if (!rec) return;  // no buffer, or log full
+  synclog_emit_prefix(rec, synclog_header_cp_async_wait, line);
+  rec[synclog_length_prefix + 0] = n;
+  #else
+  CUTLASS_UNUSED(line);
+  CUTLASS_UNUSED(n);
+  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
+}
+
+CUTLASS_DEVICE
+void synclog_emit_cp_async_wait_all(uint32_t line) {  // appends a prefix-only "cp_async_wait_all" record
+  #if defined(CUTLASS_ENABLE_SYNCLOG)
+  if constexpr (!synclog_enable_cp_async_wait_all) return;
+  if (!synclog_condition_emit()) return;
+  uint32_t* const rec = synclog_alloc(synclog_length_cp_async_wait_all);
+  if (!rec) return;  // no buffer, or log full
+  synclog_emit_prefix(rec, synclog_header_cp_async_wait_all, line);
+  #else
+  CUTLASS_UNUSED(line);
+  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
+}
+
+CUTLASS_DEVICE
+void synclog_emit_cp_async_fence(uint32_t line) {  // appends a prefix-only "cp_async_fence" record
+  #if defined(CUTLASS_ENABLE_SYNCLOG)
+  if constexpr (!synclog_enable_cp_async_fence) return;
+  if (!synclog_condition_emit()) return;
+  uint32_t* const rec = synclog_alloc(synclog_length_cp_async_fence);
+  if (!rec) return;  // no buffer, or log full
+  synclog_emit_prefix(rec, synclog_header_cp_async_fence, line);
+  #else
+  CUTLASS_UNUSED(line);
+  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
+}
+
+CUTLASS_DEVICE
+void synclog_emit_cp_async_nan(  // record payload: smem_addr, gmem_ptr (lo, hi), pred
+  uint32_t line,
+  uint32_t smem_addr,
+  const void* gmem_ptr,
+  uint32_t pred) {
+  #if defined(CUTLASS_ENABLE_SYNCLOG)
+  if constexpr (!synclog_enable_cp_async_nan) return;
+  if (!synclog_condition_emit()) return;
+  uint32_t* const rec = synclog_alloc(synclog_length_cp_async_nan);
+  if (!rec) return;  // no buffer, or log full
+  synclog_emit_prefix(rec, synclog_header_cp_async_nan, line);
+  rec[synclog_length_prefix + 0] = smem_addr;
+  rec[synclog_length_prefix + 1] = static_cast<uint32_t>(reinterpret_cast<uint64_t>(gmem_ptr));
+  rec[synclog_length_prefix + 2] = static_cast<uint32_t>(reinterpret_cast<uint64_t>(gmem_ptr) >> 32);
+  rec[synclog_length_prefix + 3] = pred;
+  #else
+  CUTLASS_UNUSED(line);
+  CUTLASS_UNUSED(smem_addr);
+  CUTLASS_UNUSED(gmem_ptr);
+  CUTLASS_UNUSED(pred);
+  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
+}
+
+CUTLASS_DEVICE
+void synclog_emit_cp_async_zfill(  // record payload: smem_addr, gmem_ptr (lo, hi), pred, size
+  uint32_t line,
+  uint32_t smem_addr,
+  const void* gmem_ptr,
+  uint32_t pred,
+  uint32_t size) {
+  #if defined(CUTLASS_ENABLE_SYNCLOG)
+  if constexpr (!synclog_enable_cp_async_zfill) return;
+  if (!synclog_condition_emit()) return;
+  uint32_t* const rec = synclog_alloc(synclog_length_cp_async_zfill);
+  if (!rec) return;  // no buffer, or log full
+  synclog_emit_prefix(rec, synclog_header_cp_async_zfill, line);
+  rec[synclog_length_prefix + 0] = smem_addr;
+  rec[synclog_length_prefix + 1] = static_cast<uint32_t>(reinterpret_cast<uint64_t>(gmem_ptr));
+  rec[synclog_length_prefix + 2] = static_cast<uint32_t>(reinterpret_cast<uint64_t>(gmem_ptr) >> 32);
+  rec[synclog_length_prefix + 3] = pred;
+  rec[synclog_length_prefix + 4] = size;
+  #else
+  CUTLASS_UNUSED(line);
+  CUTLASS_UNUSED(smem_addr);
+  CUTLASS_UNUSED(gmem_ptr);
+  CUTLASS_UNUSED(pred);
+  CUTLASS_UNUSED(size);
+  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
+}
+
+CUTLASS_DEVICE
+void synclog_emit_cp_async(  // record payload: smem_addr, gmem_ptr (lo, hi), pred, size
+  uint32_t line,
+  uint32_t smem_addr,
+  const void* gmem_ptr,
+  uint32_t pred,
+  uint32_t size) {
+  #if defined(CUTLASS_ENABLE_SYNCLOG)
+  if constexpr (!synclog_enable_cp_async) return;
+  if (!synclog_condition_emit()) return;
+  uint32_t* const rec = synclog_alloc(synclog_length_cp_async);
+  if (!rec) return;  // no buffer, or log full
+  synclog_emit_prefix(rec, synclog_header_cp_async, line);
+  rec[synclog_length_prefix + 0] = smem_addr;
+  rec[synclog_length_prefix + 1] = static_cast<uint32_t>(reinterpret_cast<uint64_t>(gmem_ptr));
+  rec[synclog_length_prefix + 2] = static_cast<uint32_t>(reinterpret_cast<uint64_t>(gmem_ptr) >> 32);
+  rec[synclog_length_prefix + 3] = pred;
+  rec[synclog_length_prefix + 4] = size;
+  #else
+  CUTLASS_UNUSED(line);
+  CUTLASS_UNUSED(smem_addr);
+  CUTLASS_UNUSED(gmem_ptr);
+  CUTLASS_UNUSED(pred);
+  CUTLASS_UNUSED(size);
+  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
+}
+
+CUTLASS_DEVICE
+void synclog_emit_tma_load(  // record payload: gmem_int_desc (lo, hi), smem_int_mbar, smem_int_ptr
+  uint32_t line,
+  uint64_t gmem_int_desc,
+  uint32_t smem_int_mbar,
+  uint32_t smem_int_ptr) {
+  #if defined(CUTLASS_ENABLE_SYNCLOG)
+  if constexpr (!synclog_enable_tma_load) return;
+  if (!synclog_condition_emit()) return;
+  uint32_t* const rec = synclog_alloc(synclog_length_tma_load);
+  if (!rec) return;  // no buffer, or log full
+  synclog_emit_prefix(rec, synclog_header_tma_load, line);
+  rec[synclog_length_prefix + 0] = static_cast<uint32_t>(gmem_int_desc);
+  rec[synclog_length_prefix + 1] = static_cast<uint32_t>(gmem_int_desc >> 32);
+  rec[synclog_length_prefix + 2] = smem_int_mbar;
+  rec[synclog_length_prefix + 3] = smem_int_ptr;
+  #else
+  CUTLASS_UNUSED(line);
+  CUTLASS_UNUSED(gmem_int_desc);
+  CUTLASS_UNUSED(smem_int_mbar);
+  CUTLASS_UNUSED(smem_int_ptr);
+  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
+}
+
+CUTLASS_DEVICE
+void synclog_emit_tma_store(  // record payload: gmem_int_desc (lo, hi), smem_int_ptr
+  uint32_t line,
+  uint64_t gmem_int_desc,
+  uint32_t smem_int_ptr) {
+  #if defined(CUTLASS_ENABLE_SYNCLOG)
+  if constexpr (!synclog_enable_tma_store) return;
+  if (!synclog_condition_emit()) return;
+  uint32_t* const rec = synclog_alloc(synclog_length_tma_store);
+  if (!rec) return;  // no buffer, or log full
+  synclog_emit_prefix(rec, synclog_header_tma_store, line);
+  rec[synclog_length_prefix + 0] = static_cast<uint32_t>(gmem_int_desc);
+  rec[synclog_length_prefix + 1] = static_cast<uint32_t>(gmem_int_desc >> 32);
+  rec[synclog_length_prefix + 2] = smem_int_ptr;
+  #else
+  CUTLASS_UNUSED(line);
+  CUTLASS_UNUSED(gmem_int_desc);
+  CUTLASS_UNUSED(smem_int_ptr);
+  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
+}
+
+CUTLASS_DEVICE
+void synclog_emit_tma_store_arrive(uint32_t line) {  // logs a prefix-only tma_store_arrive record
+  #if defined(CUTLASS_ENABLE_SYNCLOG)
+  if constexpr (!synclog_enable_tma_store_arrive) return;
+  if (!synclog_condition_emit()) return;
+  if (uint32_t* record = synclog_alloc(synclog_length_tma_store_arrive)) {
+    synclog_emit_prefix(record, synclog_header_tma_store_arrive, line);
+  }
+  #else
+  CUTLASS_UNUSED(line);
+  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
+}
+
+/// Logs a tma_store_wait record: prefix followed by the wait count.
+CUTLASS_DEVICE
+void synclog_emit_tma_store_wait(uint32_t line, uint32_t count) {
+  #if defined(CUTLASS_ENABLE_SYNCLOG)
+  if constexpr (!synclog_enable_tma_store_wait) return;
+  if (!synclog_condition_emit()) return;
+  uint32_t* const record = synclog_alloc(synclog_length_tma_store_wait);
+  if (record == nullptr) return;
+  synclog_emit_prefix(record, synclog_header_tma_store_wait, line);
+  // Single payload word, stored right after the prefix.
+  record[synclog_length_prefix] = count;
+  #else
+  CUTLASS_UNUSED(line);
+  CUTLASS_UNUSED(count);
+  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
+}
+
+/// Logs a warpgroup_arrive record, which consists of the prefix only.
+CUTLASS_DEVICE
+void synclog_emit_warpgroup_arrive(uint32_t line) {
+  #if defined(CUTLASS_ENABLE_SYNCLOG)
+  if constexpr (!synclog_enable_warpgroup_arrive) return;
+  if (!synclog_condition_emit()) return;
+  if (uint32_t* record = synclog_alloc(synclog_length_warpgroup_arrive)) {
+    synclog_emit_prefix(record, synclog_header_warpgroup_arrive, line);
+  }
+  #else
+  CUTLASS_UNUSED(line);
+  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
+}
+
+/// Logs a warpgroup_wait record: prefix followed by n.
+CUTLASS_DEVICE
+void synclog_emit_warpgroup_wait(uint32_t line, uint32_t n) {
+  #if defined(CUTLASS_ENABLE_SYNCLOG)
+  if constexpr (!synclog_enable_warpgroup_wait) return;
+  if (!synclog_condition_emit()) return;
+  uint32_t* const record = synclog_alloc(synclog_length_warpgroup_wait);
+  if (record == nullptr) return;
+  synclog_emit_prefix(record, synclog_header_warpgroup_wait, line);
+  // Single payload word, stored right after the prefix.
+  record[synclog_length_prefix] = n;
+  #else
+  CUTLASS_UNUSED(line);
+  CUTLASS_UNUSED(n);
+  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
+}
+
+/// Logs a warpgroup_commit_batch record, which consists of the prefix only.
+CUTLASS_DEVICE
+void synclog_emit_warpgroup_commit_batch(uint32_t line) {
+  #if defined(CUTLASS_ENABLE_SYNCLOG)
+  if constexpr (!synclog_enable_warpgroup_commit_batch) return;
+  if (!synclog_condition_emit()) return;
+  if (uint32_t* record = synclog_alloc(synclog_length_warpgroup_commit_batch)) {
+    synclog_emit_prefix(record, synclog_header_warpgroup_commit_batch, line);
+  }
+  #else
+  CUTLASS_UNUSED(line);
+  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
+}
+
+/// Logs a wgmma_reg_smem record: prefix followed by desc_b (low word, then high word).
+CUTLASS_DEVICE
+void synclog_emit_wgmma_reg_smem(uint32_t line, uint64_t desc_b) {
+  #if defined(CUTLASS_ENABLE_SYNCLOG)
+  if constexpr (!synclog_enable_wgmma_reg_smem) return;
+  if (!synclog_condition_emit()) return;
+  uint32_t* const record = synclog_alloc(synclog_length_wgmma_reg_smem);
+  if (record == nullptr) return;
+  synclog_emit_prefix(record, synclog_header_wgmma_reg_smem, line);
+  uint32_t* const payload = record + synclog_length_prefix;
+  payload[0] = static_cast<uint32_t>(desc_b);
+  payload[1] = static_cast<uint32_t>(desc_b >> 32);
+  #else
+  CUTLASS_UNUSED(line);
+  CUTLASS_UNUSED(desc_b);
+  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
+}
+
+/// Logs a wgmma_smem_smem record: prefix, desc_a, desc_b (each stored low word, then high word).
+CUTLASS_DEVICE
+void synclog_emit_wgmma_smem_smem(
+  uint32_t line, uint64_t desc_a, uint64_t desc_b) {
+  #if defined(CUTLASS_ENABLE_SYNCLOG)
+  if constexpr (!synclog_enable_wgmma_smem_smem) return;
+  if (!synclog_condition_emit()) return;
+  uint32_t* const record = synclog_alloc(synclog_length_wgmma_smem_smem);
+  if (record == nullptr) return;
+  synclog_emit_prefix(record, synclog_header_wgmma_smem_smem, line);
+  uint32_t* const payload = record + synclog_length_prefix;
+  payload[0] = static_cast<uint32_t>(desc_a);
+  payload[1] = static_cast<uint32_t>(desc_a >> 32);
+  payload[2] = static_cast<uint32_t>(desc_b);
+  payload[3] = static_cast<uint32_t>(desc_b >> 32);
+  #else
+  CUTLASS_UNUSED(line);
+  CUTLASS_UNUSED(desc_a);
+  CUTLASS_UNUSED(desc_b);
+  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
+}
+
+/// Logs a cpasync_barrier_arrive record: prefix, smem_addr, synclog_mbarrier_bits(smem_addr) (low word first).
+CUTLASS_DEVICE
+void synclog_emit_cpasync_barrier_arrive(uint32_t line, uint32_t smem_addr) {
+  #if defined(CUTLASS_ENABLE_SYNCLOG)
+  if constexpr (!synclog_enable_cpasync_barrier_arrive) return;
+  if (!synclog_condition_emit()) return;
+  uint64_t const mbarrier_bits = synclog_mbarrier_bits(smem_addr);
+  uint32_t* const record = synclog_alloc(synclog_length_cpasync_barrier_arrive);
+  if (record == nullptr) return;
+  synclog_emit_prefix(record, synclog_header_cpasync_barrier_arrive, line);
+  uint32_t* const payload = record + synclog_length_prefix;
+  payload[0] = smem_addr;
+  payload[1] = static_cast<uint32_t>(mbarrier_bits);
+  payload[2] = static_cast<uint32_t>(mbarrier_bits >> 32);
+  #else
+  CUTLASS_UNUSED(line);
+  CUTLASS_UNUSED(smem_addr);
+  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
+}
+
+/// Prints the synclog buffer as text, one newline-terminated line per record, walking from word 1
+/// until the first synclog_header_none word or synclog_cap. The body is compiled only when
+/// CUTLASS_ENABLE_SYNCLOG is defined and the compiler is nvcc or clang in CUDA mode.
+#if !defined(CUTLASS_ENABLE_SYNCLOG)
+CUTLASS_DEVICE
+#elif defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
+static __attribute__((__noinline__)) __device__
+#else
+static __attribute__((__noinline__))
+#endif
+void synclog_print() {
+  #if defined(CUTLASS_ENABLE_SYNCLOG)
+  #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
+  if (synclog_buf == nullptr || !synclog_condition_print()) return;
+  printf("synclog start\n");
+  for (uint32_t at = 1; at < synclog_cap; ) {
+    uint32_t header = synclog_buf[at];
+    if (header == synclog_header_none) {
+      break;
+    }
+    printf("synclog at %u: ", at);
+    if constexpr (synclog_enable_syncthreads) {
+      if (header == synclog_header_syncthreads) {
+        synclog_print_prefix("syncthreads", at);
+        at += synclog_length_syncthreads;
+        printf("\n");
+        continue;
+      }
+    }
+    if constexpr (synclog_enable_syncwarp) {
+      if (header == synclog_header_syncwarp) {
+        synclog_print_prefix("syncwarp", at);
+        at += synclog_length_syncwarp;
+        printf("\n");
+        continue;
+      }
+    }
+    if constexpr (synclog_enable_named_barrier_arrive_and_wait) {
+      if (header == synclog_header_named_barrier_arrive_and_wait) {
+        synclog_print_prefix("named_barrier_arrive_and_wait", at);
+        at += synclog_length_named_barrier_arrive_and_wait;
+        printf("num_threads=%u barrier_id=%u\n", synclog_buf[at-2], synclog_buf[at-1]);
+        continue;
+      }
+    }
+    if constexpr (synclog_enable_named_barrier_arrive) {
+      if (header == synclog_header_named_barrier_arrive) {
+        synclog_print_prefix("named_barrier_arrive", at);
+        at += synclog_length_named_barrier_arrive;
+        printf("num_threads=%u barrier_id=%u\n", synclog_buf[at-2], synclog_buf[at-1]);
+        continue;
+      }
+    }
+    if constexpr (synclog_enable_cluster_barrier_init) {
+      if (header == synclog_header_cluster_barrier_init) {
+        synclog_print_prefix("cluster_barrier_init", at);
+        at += synclog_length_cluster_barrier_init;
+        printf("smem_addr=%u arrive_count=%u\n", synclog_buf[at-2], synclog_buf[at-1]);
+        continue;
+      }
+    }
+    if constexpr (synclog_enable_cluster_barrier_wait) {
+      if (header == synclog_header_cluster_barrier_wait) {
+        synclog_print_prefix("cluster_barrier_wait", at);
+        at += synclog_length_cluster_barrier_wait;
+        printf("smem_addr=%u phase=%u\n", synclog_buf[at-4], synclog_buf[at-3]);
+        continue;
+      }
+    }
+    if constexpr (synclog_enable_cluster_barrier_test_wait) {
+      if (header == synclog_header_cluster_barrier_test_wait) {
+        synclog_print_prefix("cluster_barrier_test_wait", at);
+        at += synclog_length_cluster_barrier_test_wait;
+        printf("smem_addr=%u phase=%u pred=%u\n", synclog_buf[at-5], synclog_buf[at-4], synclog_buf[at-3]);
+        continue;
+      }
+    }
+    if constexpr (synclog_enable_cluster_barrier_try_wait) {
+      if (header == synclog_header_cluster_barrier_try_wait) {
+        synclog_print_prefix("cluster_barrier_try_wait", at);
+        at += synclog_length_cluster_barrier_try_wait;
+        printf("smem_addr=%u phase=%u\n", synclog_buf[at-4], synclog_buf[at-3]);
+        continue;
+      }
+    }
+    if constexpr (synclog_enable_cluster_barrier_arrive_cluster) {
+      if (header == synclog_header_cluster_barrier_arrive_cluster) {
+        synclog_print_prefix("cluster_barrier_arrive_cluster", at);
+        at += synclog_length_cluster_barrier_arrive_cluster;
+        printf("smem_addr=%u cta_id=%u pred=%u\n", synclog_buf[at-5], synclog_buf[at-4], synclog_buf[at-3]);
+        continue;
+      }
+    }
+    if constexpr (synclog_enable_cluster_barrier_arrive) {
+      if (header == synclog_header_cluster_barrier_arrive) {
+        synclog_print_prefix("cluster_barrier_arrive", at);
+        at += synclog_length_cluster_barrier_arrive;
+        printf("smem_addr=%u\n", synclog_buf[at-3]);
+        continue;
+      }
+    }
+    if constexpr (synclog_enable_cluster_barrier_invalidate) {
+      if (header == synclog_header_cluster_barrier_invalidate) {
+        synclog_print_prefix("cluster_barrier_invalidate", at);
+        at += synclog_length_cluster_barrier_invalidate;
+        printf("smem_addr=%u\n", synclog_buf[at-3]);
+        continue;
+      }
+    }
+    if constexpr (synclog_enable_cluster_transaction_barrier_arrive_and_expect_tx) {
+      if (header == synclog_header_cluster_transaction_barrier_arrive_and_expect_tx) {
+        synclog_print_prefix("cluster_transaction_barrier_arrive_and_expect_tx", at);
+        at += synclog_length_cluster_transaction_barrier_arrive_and_expect_tx;
+        printf("smem_addr=%u transaction_bytes=%u\n", synclog_buf[at-4], synclog_buf[at-3]);
+        continue;
+      }
+    }
+    if constexpr (synclog_enable_cluster_transaction_barrier_arrive_and_expect_tx_cluster) {
+      if (header == synclog_header_cluster_transaction_barrier_arrive_and_expect_tx_cluster) {
+        synclog_print_prefix("cluster_transaction_barrier_arrive_and_expect_tx_cluster", at);
+        at += synclog_length_cluster_transaction_barrier_arrive_and_expect_tx_cluster;
+        printf("smem_addr=%u transaction_bytes=%u cta_id=%u pred=%u\n", synclog_buf[at-6], synclog_buf[at-5], synclog_buf[at-4], synclog_buf[at-3]);
+        continue;
+      }
+    }
+    if constexpr (synclog_enable_cluster_transaction_barrier_expect_transaction) {
+      if (header == synclog_header_cluster_transaction_barrier_expect_transaction) {
+        synclog_print_prefix("cluster_transaction_barrier_expect_transaction", at);
+        at += synclog_length_cluster_transaction_barrier_expect_transaction;
+        printf("smem_addr=%u transaction_bytes=%u\n", synclog_buf[at-4], synclog_buf[at-3]);
+        continue;
+      }
+    }
+    if constexpr (synclog_enable_cluster_transaction_barrier_complete_transaction) {
+      if (header == synclog_header_cluster_transaction_barrier_complete_transaction) {
+        synclog_print_prefix("cluster_transaction_barrier_complete_transaction", at);
+        at += synclog_length_cluster_transaction_barrier_complete_transaction;
+        printf("smem_addr=%u dst_cta_id=%u transaction_bytes=%u pred=%u\n", synclog_buf[at-6], synclog_buf[at-5], synclog_buf[at-4], synclog_buf[at-3]);
+        continue;
+      }
+    }
+    if constexpr (synclog_enable_fence_barrier_init) {
+      if (header == synclog_header_fence_barrier_init) {
+        synclog_print_prefix("fence_barrier_init", at);
+        at += synclog_length_fence_barrier_init;
+        printf("\n");
+        continue;
+      }
+    }
+    if constexpr (synclog_enable_fence_view_async_shared) {
+      if (header == synclog_header_fence_view_async_shared) {
+        synclog_print_prefix("fence_view_async_shared", at);
+        at += synclog_length_fence_view_async_shared;
+        printf("\n");
+        continue;
+      }
+    }
+    if constexpr (synclog_enable_cp_async_wait) {
+      if (header == synclog_header_cp_async_wait) {
+        synclog_print_prefix("cp_async_wait", at);
+        at += synclog_length_cp_async_wait;
+        printf("n=%u\n", synclog_buf[at-1]);
+        continue;
+      }
+    }
+    if constexpr (synclog_enable_cp_async_wait_all) {
+      if (header == synclog_header_cp_async_wait_all) {
+        synclog_print_prefix("cp_async_wait_all", at);
+        at += synclog_length_cp_async_wait_all;
+        printf("\n");
+        continue;
+      }
+    }
+    if constexpr (synclog_enable_cp_async_fence) {
+      if (header == synclog_header_cp_async_fence) {
+        synclog_print_prefix("cp_async_fence", at);
+        at += synclog_length_cp_async_fence;
+        printf("\n");
+        continue;
+      }
+    }
+    if constexpr (synclog_enable_cp_async_nan) {
+      if (header == synclog_header_cp_async_nan) {
+        synclog_print_prefix("cp_async_nan", at);
+        at += synclog_length_cp_async_nan;
+        uint64_t gmem_addr = synclog_buf[at-3];
+        gmem_addr += (uint64_t)synclog_buf[at-2] << 32;
+        printf("smem_addr=%u gmem_addr=%llu pred=%u\n", synclog_buf[at-4], gmem_addr, synclog_buf[at-1]);
+        continue;
+      }
+    }
+    if constexpr (synclog_enable_cp_async_zfill) {
+      if (header == synclog_header_cp_async_zfill) {
+        synclog_print_prefix("cp_async_zfill", at);
+        at += synclog_length_cp_async_zfill;
+        uint64_t gmem_addr = synclog_buf[at-4];
+        gmem_addr += (uint64_t)synclog_buf[at-3] << 32;
+        printf("smem_addr=%u gmem_addr=%llu pred=%u size=%u\n", synclog_buf[at-5], gmem_addr, synclog_buf[at-2], synclog_buf[at-1]);
+        continue;
+      }
+    }
+    if constexpr (synclog_enable_cp_async) {
+      if (header == synclog_header_cp_async) {
+        synclog_print_prefix("cp_async", at);
+        at += synclog_length_cp_async;
+        uint64_t gmem_addr = synclog_buf[at-4];
+        gmem_addr += (uint64_t)synclog_buf[at-3] << 32;
+        printf("smem_addr=%u gmem_addr=%llu pred=%u size=%u\n", synclog_buf[at-5], gmem_addr, synclog_buf[at-2], synclog_buf[at-1]);
+        continue;
+      }
+    }
+    if constexpr (synclog_enable_tma_load) {
+      if (header == synclog_header_tma_load) {
+        synclog_print_prefix("tma_load", at);
+        at += synclog_length_tma_load;
+        uint64_t gmem_int_desc = synclog_buf[at-4];
+        gmem_int_desc += (uint64_t)synclog_buf[at-3] << 32;
+        printf("gmem_int_desc=%llu smem_int_mbar=%u smem_int_ptr=%u\n", gmem_int_desc, synclog_buf[at-2], synclog_buf[at-1]);
+        continue;
+      }
+    }
+    if constexpr (synclog_enable_tma_store) {
+      if (header == synclog_header_tma_store) {
+        synclog_print_prefix("tma_store", at);
+        at += synclog_length_tma_store;
+        uint64_t gmem_int_desc = synclog_buf[at-3];
+        gmem_int_desc += (uint64_t)synclog_buf[at-2] << 32;
+        printf("gmem_int_desc=%llu smem_int_ptr=%u\n", gmem_int_desc, synclog_buf[at-1]);
+        continue;
+      }
+    }
+    if constexpr (synclog_enable_tma_store_arrive) {
+      if (header == synclog_header_tma_store_arrive) {
+        synclog_print_prefix("tma_store_arrive", at);
+        at += synclog_length_tma_store_arrive;
+        printf("\n");
+        continue;
+      }
+    }
+    if constexpr (synclog_enable_tma_store_wait) {
+      if (header == synclog_header_tma_store_wait) {
+        synclog_print_prefix("tma_store_wait", at);
+        at += synclog_length_tma_store_wait;
+        printf("count=%u\n", synclog_buf[at-1]);
+        continue;
+      }
+    }
+    if constexpr (synclog_enable_warpgroup_arrive) {
+      if (header == synclog_header_warpgroup_arrive) {
+        synclog_print_prefix("warpgroup_arrive", at);
+        at += synclog_length_warpgroup_arrive;
+        printf("\n");
+        continue;
+      }
+    }
+    if constexpr (synclog_enable_warpgroup_wait) {
+      if (header == synclog_header_warpgroup_wait) {
+        synclog_print_prefix("warpgroup_wait", at);
+        at += synclog_length_warpgroup_wait;
+        printf("n=%u\n", synclog_buf[at-1]);
+        continue;
+      }
+    }
+    if constexpr (synclog_enable_warpgroup_commit_batch) {
+      if (header == synclog_header_warpgroup_commit_batch) {
+        synclog_print_prefix("warpgroup_commit_batch", at);
+        at += synclog_length_warpgroup_commit_batch;
+        printf("\n");
+        continue;
+      }
+    }
+    if constexpr (synclog_enable_wgmma_reg_smem) {
+      if (header == synclog_header_wgmma_reg_smem) {
+        synclog_print_prefix("wgmma_reg_smem", at);
+        at += synclog_length_wgmma_reg_smem;
+        synclog_print_wgmma_desc("desc_b", synclog_buf[at-2], synclog_buf[at-1], "");
+        printf("\n");
+        continue;
+      }
+    }
+    if constexpr (synclog_enable_wgmma_smem_smem) {
+      if (header == synclog_header_wgmma_smem_smem) {
+        synclog_print_prefix("wgmma_smem_smem", at);
+        at += synclog_length_wgmma_smem_smem;
+        synclog_print_wgmma_desc("desc_a", synclog_buf[at-4], synclog_buf[at-3], " ");
+        synclog_print_wgmma_desc("desc_b", synclog_buf[at-2], synclog_buf[at-1], "");
+        printf("\n");
+        continue;
+      }
+    }
+    if constexpr (synclog_enable_cpasync_barrier_arrive) {
+      if (header == synclog_header_cpasync_barrier_arrive) {
+        synclog_print_prefix("cpasync_barrier_arrive", at);
+        at += synclog_length_cpasync_barrier_arrive;
+        printf("smem_addr=%u\n", synclog_buf[at-3]);  // the two trailing words (at-2, at-1) are not printed
+        continue;
+      }
+    }
+    // Unrecognized header: the record length is unknown, so trap rather than keep scanning.
+    asm volatile ("brkpt;\n" ::);
+  }
+  if (synclog_buf[0] >= synclog_cap) {
+    printf("synclog was truncated (exceeded capacity of %lu bytes)\n",
+      (synclog_cap - 1) * sizeof(uint32_t));
+  }
+  printf("synclog end\n");
+  #endif
+  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// With synclog enabled, shadow __syncthreads / __syncwarp with macros that log the call site's
+// __LINE__ first and then invoke the real function (a macro's own name is not re-expanded).
+#if defined(CUTLASS_ENABLE_SYNCLOG)
+#undef __syncthreads
+#define __syncthreads() do {\
+  cutlass::arch::synclog_emit_syncthreads(__LINE__);\
+  __syncthreads();\
+} while (0)
+
+#undef __syncwarp
+#define __syncwarp(...) do {\
+  cutlass::arch::synclog_emit_syncwarp(__LINE__);\
+  __syncwarp(__VA_ARGS__);\
+} while (0)
+#endif // defined(CUTLASS_ENABLE_SYNCLOG)
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace arch
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/wmma.h b/lightllm-kernel/cutlass/include/cutlass/arch/wmma.h
new file mode 100755
index 000000000..720895f38
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/arch/wmma.h
@@ -0,0 +1,223 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates exposing architecture support for warp matrix multiply-add (WMMA) operations
+*/
+
+#pragma once
+
+// CUTLASS WMMA does not support clang at present.
+#if !(defined(__clang__) && defined(__CUDA__))
+
+// SM70 WMMA needs CUDA 9+; the arch test passes when __CUDA_ARCH__ is undefined or is >= 700.
+#if (__CUDACC_VER_MAJOR__ >= 9) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))
+#define CUTLASS_ARCH_WMMA_ENABLED
+#define CUTLASS_ARCH_WMMA_SM70_ENABLED
+#endif
+
+// The SM72 and SM75 feature sets both need CUDA 10+.
+#if (__CUDACC_VER_MAJOR__ >= 10)
+// Integer matrix multiply: __CUDA_ARCH__ undefined or >= 720.
+#if (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 720))
+#define CUTLASS_ARCH_INTEGER_MATRIX_MULTIPLY_ENABLED
+#define CUTLASS_ARCH_WMMA_SM72_ENABLED
+#endif
+
+// Sub-byte integer matrix multiply: __CUDA_ARCH__ undefined or >= 750.
+#if (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 750))
+#define CUTLASS_SUBBYTE_INTEGER_MATRIX_MULTIPLY_ENABLED
+#define CUTLASS_ARCH_WMMA_SM75_ENABLED
+#endif
+#endif // (__CUDACC_VER_MAJOR__ >= 10)
+
+#endif //!(defined(__clang__) && defined(__CUDA__))
+
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+
+#include <mma.h>
+#include "cutlass/arch/mma.h"
+#include "cutlass/array.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/gemm/gemm.h"
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace arch {
+
+////////////////////////////////////////////////////////////////////////////////////////////////
+/// Statically maps cutlass data types => nvcuda::wmma data types (identity unless specialized)
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <typename Type_>
+struct CutlassToWmmaDataType{
+  using Type = Type_;
+};
+
+/// Statically maps cutlass::half_t => __half
+template<>
+struct CutlassToWmmaDataType<cutlass::half_t> {
+  using Type = __half;
+};
+
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) && (__CUDACC_VER_MAJOR__ >= 11)  // cutlass::bfloat16_t => __nv_bfloat16
+template<>
+struct CutlassToWmmaDataType<cutlass::bfloat16_t> {
+  using Type = __nv_bfloat16;
+};
+#endif
+
+/// Statically maps int8_t => signed char
+template<>
+struct CutlassToWmmaDataType<int8_t> {
+  using Type = signed char;
+};
+
+/// Statically maps uint8_t => unsigned char
+template<>
+struct CutlassToWmmaDataType<uint8_t> {
+  using Type = unsigned char;
+};
+
+/// Statically maps int32_t => int
+template<>
+struct CutlassToWmmaDataType<int32_t> {
+  using Type = int;
+};
+
+#if defined(CUTLASS_SUBBYTE_INTEGER_MATRIX_MULTIPLY_ENABLED)
+/// Statically maps cutlass::int4b_t => experimental::precision::s4
+template<>
+struct CutlassToWmmaDataType<cutlass::int4b_t> {
+  using Type = nvcuda::wmma::experimental::precision::s4;
+};
+
+/// Statically maps cutlass::uint4b_t => experimental::precision::u4
+template<>
+struct CutlassToWmmaDataType<cutlass::uint4b_t> {
+  using Type = nvcuda::wmma::experimental::precision::u4;
+};
+
+/// Statically maps cutlass::uint1b_t => experimental::precision::b1
+template<>
+struct CutlassToWmmaDataType<cutlass::uint1b_t> {
+  using Type = nvcuda::wmma::experimental::precision::b1;
+};
+#endif
+
+////////////////////////////////////////////////////////////////////////////////////////////////
+/// Statically maps cutlass::layout => nvcuda::wmma layout tags (primary template has no members)
+////////////////////////////////////////////////////////////////////////////////////////////////
+template <typename Layout_>
+struct CutlassToWmmaLayout {
+};
+
+/// Statically maps cutlass::layout::RowMajor => nvcuda::wmma::row_major layout tags
+template <>
+struct CutlassToWmmaLayout<cutlass::layout::RowMajor> {
+  using Layout = nvcuda::wmma::row_major;
+  static nvcuda::wmma::layout_t const value = nvcuda::wmma::layout_t::mem_row_major;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////
+/// Statically maps cutlass::layout::ColumnMajor => nvcuda::wmma::col_major layout tags
+////////////////////////////////////////////////////////////////////////////////////////////////
+template <>
+struct CutlassToWmmaLayout<cutlass::layout::ColumnMajor> {
+  using Layout = nvcuda::wmma::col_major;
+  static nvcuda::wmma::layout_t const value = nvcuda::wmma::layout_t::mem_col_major;
+};
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////////////////////
+/// Statically maps nvcuda::wmma data types => cutlass data types (identity unless specialized)
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <typename Type_>
+struct WmmaToCutlassDataType{
+  using Type = Type_;
+};
+
+/// Statically maps __half => cutlass::half_t
+template<>
+struct WmmaToCutlassDataType<__half> {
+  using Type = cutlass::half_t;
+};
+
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) && (__CUDACC_VER_MAJOR__ >= 11)  // __nv_bfloat16 => cutlass::bfloat16_t
+template<>
+struct WmmaToCutlassDataType<__nv_bfloat16> {
+  using Type = cutlass::bfloat16_t;
+};
+#endif
+
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// WMMA template structure defines nvcuda::wmma::fragments and static assertion checks
+// for a specific template parameterized data type (Element[A|B|C]), layout (Layout[A|B|C]),
+// and native wmma size (Shape). Only declared here; the definitions are partial specializations.
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <  
+  typename Shape_,                                   ///< Size of the matrix product (concept: GemmShape)
+  typename ElementA_,                                ///< Data type of A elements
+  typename LayoutA_,                                 ///< Layout of A matrix (concept: MatrixLayout)
+  typename ElementB_,                                ///< Data type of B elements
+  typename LayoutB_,                                 ///< Layout of B matrix (concept: MatrixLayout)
+  typename ElementC_,                                ///< Element type of C matrix
+  typename LayoutC_,                                 ///< Layout of C matrix (concept: MatrixLayout)
+  typename Operator_ = cutlass::arch::OpMultiplyAdd   ///< Inner product operator (multiply-add, xor.popc)
+>
+struct Wmma;
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace arch
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+//
+// Specializations for each compute capability
+//
+#ifdef CUTLASS_ARCH_WMMA_SM70_ENABLED
+#include "cutlass/arch/wmma_sm70.h"
+#endif
+
+#ifdef CUTLASS_ARCH_WMMA_SM72_ENABLED
+#include "cutlass/arch/wmma_sm72.h"
+#endif
+
+#ifdef CUTLASS_ARCH_WMMA_SM75_ENABLED
+#include "cutlass/arch/wmma_sm75.h"
+#endif
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif //CUTLASS_ARCH_WMMA_ENABLED
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/wmma_sm70.h b/lightllm-kernel/cutlass/include/cutlass/arch/wmma_sm70.h
new file mode 100755
index 000000000..19fda4f85
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/arch/wmma_sm70.h
@@ -0,0 +1,136 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Matrix multiply
+*/
+
+#pragma once
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/cassert>
+#else
+#include <assert.h>
+#endif
+#include "cutlass/layout/matrix.h"
+
+////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace arch {
+
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Partial specialization of Wmma for half-precision A and B operands combined
+// with the multiply-add operator. It publishes the nvcuda::wmma fragment types
+// for the requested shape/layouts and forwards operator() to
+// nvcuda::wmma::mma_sync.
+//
+////////////////////////////////////////////////////////////////////////////////
+template <
+  typename Shape_,
+  typename LayoutA_,
+  typename LayoutB_,
+  typename ElementC_,
+  typename LayoutC_
+>
+struct Wmma<
+  Shape_,                        ///< Shape: size of the matrix product (concept: GemmShape)
+  cutlass::half_t,               ///< ElementA
+  LayoutA_,                      ///< LayoutA
+  cutlass::half_t,               ///< ElementB
+  LayoutB_,                      ///< LayoutB
+  ElementC_,                     ///< ElementC
+  LayoutC_,                      ///< LayoutC
+  cutlass::arch::OpMultiplyAdd   ///< Operator
+> {
+
+#if defined(CUTLASS_ARCH_WMMA_SM70_ENABLED)
+  // Template arguments re-exported under their conventional member names.
+  using Shape = Shape_;
+  using ElementA = cutlass::half_t;
+  using LayoutA = LayoutA_;
+  using ElementB = cutlass::half_t;
+  using LayoutB = LayoutB_;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using Operator = cutlass::arch::OpMultiplyAdd;
+  using ArchTag = arch::Sm70;
+
+  // Reject any tile shape other than the three listed in the message.
+  static_assert(
+    platform::is_same<Shape, cutlass::gemm::GemmShape<16, 16, 16>>::value ||
+    platform::is_same<Shape, cutlass::gemm::GemmShape< 8, 32, 16>>::value ||
+    platform::is_same<Shape, cutlass::gemm::GemmShape<32,  8, 16>>::value,
+    "Supported list of wmma operator shape for f16 multiplicands are: 16x16x16, 8x32x16, and 32x8x16");
+
+  // Reject any accumulator element type other than half_t or float.
+  static_assert(
+    platform::is_same<ElementC, cutlass::half_t>::value || platform::is_same<ElementC, float>::value,
+    "Supported of wmma output data type for f16 multiplicands are: f16 and f32");
+
+  /// nvcuda::wmma fragment type for the A operand
+  using FragmentA = nvcuda::wmma::fragment<
+    nvcuda::wmma::matrix_a, Shape::kM, Shape::kN, Shape::kK,
+    typename CutlassToWmmaDataType<ElementA>::Type,
+    typename CutlassToWmmaLayout<LayoutA>::Layout>;
+
+  /// nvcuda::wmma fragment type for the B operand
+  using FragmentB = nvcuda::wmma::fragment<
+    nvcuda::wmma::matrix_b, Shape::kM, Shape::kN, Shape::kK,
+    typename CutlassToWmmaDataType<ElementB>::Type,
+    typename CutlassToWmmaLayout<LayoutB>::Layout>;
+
+  /// nvcuda::wmma fragment type for the accumulator; unlike A and B it is
+  /// declared without a layout argument
+  using FragmentC = nvcuda::wmma::fragment<
+    nvcuda::wmma::accumulator, Shape::kM, Shape::kN, Shape::kK,
+    typename CutlassToWmmaDataType<ElementC>::Type>;
+
+  /// Forwards to nvcuda::wmma::mma_sync(D, A, B, C): D is the output fragment,
+  /// A and B are the multiplicand fragments and C is the accumulator input
+  CUTLASS_DEVICE
+  void operator()(
+    FragmentC &D,
+    FragmentA const &A,
+    FragmentB const &B,
+    FragmentC const &C) const {
+    nvcuda::wmma::mma_sync(D, A, B, C);
+  }
+#else
+    // Reached only when CUTLASS_ARCH_WMMA_SM70_ENABLED is not defined.
+    static_assert(false, "wmma.mma.sync for floating point multiplicands is avialable only for SM70 and beyond");
+#endif
+
+};
+
+} // namespace arch
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/wmma_sm72.h b/lightllm-kernel/cutlass/include/cutlass/arch/wmma_sm72.h
new file mode 100755
index 000000000..4a2689058
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/arch/wmma_sm72.h
@@ -0,0 +1,210 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief nvcuda::wmma matrix multiply-accumulate specializations for int8_t and uint8_t multiplicands (SM72 and beyond)
+*/
+
+#pragma once
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/cassert>
+#else
+#include <assert.h>
+#endif
+#include "cutlass/layout/matrix.h"
+
+////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace arch {
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// WMMA template structure defines nvcuda::wmma::fragments and static assert for
+// wmma native instruction sizes supported for int8_t
+//
+////////////////////////////////////////////////////////////////////////////////
+template <
+typename Shape_,
+typename LayoutA_,
+typename LayoutB_,
+typename LayoutC_>
+struct Wmma<
+  Shape_,                                   ///< Size of the matrix product (concept: GemmShape)
+  int8_t,                                   ///< ElementA
+  LayoutA_,                                 ///< LayoutA
+  int8_t,                                   ///< ElementB
+  LayoutB_,                                 ///< LayoutB
+  int32_t,                                  ///< ElementC
+  LayoutC_,                                 ///< LayoutC
+  cutlass::arch::OpMultiplyAdd              ///< Operator (multiply-add)
+> {
+#if defined(CUTLASS_ARCH_WMMA_SM72_ENABLED)
+  using Shape = Shape_;
+  using ElementA = int8_t;
+  using LayoutA = LayoutA_;
+  using ElementB = int8_t;
+  using LayoutB = LayoutB_;
+  using ElementC = int32_t;
+  using LayoutC = LayoutC_;
+  using Operator = cutlass::arch::OpMultiplyAdd;
+  using ArchTag = arch::Sm72;
+
+  // check supported wmma shape for the given multiplicand data types
+  static_assert(
+    platform::is_same<cutlass::gemm::GemmShape<16, 16, 16>, Shape>::value ||
+    platform::is_same<cutlass::gemm::GemmShape< 8, 32, 16>, Shape>::value ||
+    platform::is_same<cutlass::gemm::GemmShape<32,  8, 16>, Shape>::value,
+    "Supported list of wmma operator shape for s8 multiplicands are: 16x16x16, 8x32x16, and 32x8x16");
+
+  // Fragment types for the A and B multiplicands and for the accumulator; each one is
+  // parameterized by the full (kM, kN, kK) operator shape.
+  using FragmentA = nvcuda::wmma::fragment<
+          nvcuda::wmma::matrix_a,
+          Shape::kM,
+          Shape::kN,
+          Shape::kK,
+          typename CutlassToWmmaDataType<ElementA>::Type,
+          typename CutlassToWmmaLayout<LayoutA>::Layout>;
+
+  using FragmentB = nvcuda::wmma::fragment<
+          nvcuda::wmma::matrix_b,
+          Shape::kM,
+          Shape::kN,
+          Shape::kK,
+          typename CutlassToWmmaDataType<ElementB>::Type,
+          typename CutlassToWmmaLayout<LayoutB>::Layout>;
+
+  using FragmentC = nvcuda::wmma::fragment<
+          nvcuda::wmma::accumulator,
+          Shape::kM,
+          Shape::kN,
+          Shape::kK,
+          typename CutlassToWmmaDataType<ElementC>::Type>;
+
+  /// Performs a nvcuda::wmma matrix multiply-accumulate operation
+  CUTLASS_DEVICE
+  void operator()(
+    FragmentC &D,
+    FragmentA const &A,
+    FragmentB const &B,
+    FragmentC const &C) const {
+
+      nvcuda::wmma::mma_sync(D, A, B, C);
+  }
+  // The #else branch uses a dependent condition so the assertion fires only on instantiation.
+#else
+    static_assert(sizeof(Shape_) == 0, "wmma.mma.sync for integer type multiplicands is available only for SM72 and beyond");
+#endif
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// WMMA template structure defines nvcuda::wmma::fragments and static assert for
+// wmma native instruction sizes supported for uint8_t
+//
+////////////////////////////////////////////////////////////////////////////////
+template <
+typename Shape_,
+typename LayoutA_,
+typename LayoutB_,
+typename LayoutC_>
+struct Wmma<
+  Shape_,                                   ///< Size of the matrix product (concept: GemmShape)
+  uint8_t,                                  ///< ElementA
+  LayoutA_,                                 ///< LayoutA
+  uint8_t,                                  ///< ElementB
+  LayoutB_,                                 ///< LayoutB
+  int32_t,                                  ///< ElementC
+  LayoutC_,                                 ///< LayoutC
+  cutlass::arch::OpMultiplyAdd              ///< Operator (multiply-add)
+> {
+#if defined(CUTLASS_ARCH_WMMA_SM72_ENABLED)
+  using Shape = Shape_;
+  using ElementA = uint8_t;
+  using LayoutA = LayoutA_;
+  using ElementB = uint8_t;
+  using LayoutB = LayoutB_;
+  using ElementC = int32_t;
+  using LayoutC = LayoutC_;
+  using Operator = cutlass::arch::OpMultiplyAdd;
+  using ArchTag = arch::Sm72;
+
+  // check supported wmma shape for the given multiplicand data types
+  static_assert(
+    platform::is_same<cutlass::gemm::GemmShape<16, 16, 16>, Shape>::value ||
+    platform::is_same<cutlass::gemm::GemmShape< 8, 32, 16>, Shape>::value ||
+    platform::is_same<cutlass::gemm::GemmShape<32,  8, 16>, Shape>::value,
+    "Supported list of wmma operator shape for u8 multiplicands are: 16x16x16, 8x32x16, and 32x8x16");
+
+  // Fragment types for the A and B multiplicands and for the accumulator
+  using FragmentA = nvcuda::wmma::fragment<
+          nvcuda::wmma::matrix_a,
+          Shape::kM,
+          Shape::kN,
+          Shape::kK,
+          typename CutlassToWmmaDataType<ElementA>::Type,
+          typename CutlassToWmmaLayout<LayoutA>::Layout>;
+
+  using FragmentB = nvcuda::wmma::fragment<
+          nvcuda::wmma::matrix_b,
+          Shape::kM,
+          Shape::kN,
+          Shape::kK,
+          typename CutlassToWmmaDataType<ElementB>::Type,
+          typename CutlassToWmmaLayout<LayoutB>::Layout>;
+
+  using FragmentC = nvcuda::wmma::fragment<
+          nvcuda::wmma::accumulator,
+          Shape::kM,
+          Shape::kN,
+          Shape::kK,
+          typename CutlassToWmmaDataType<ElementC>::Type>;
+
+  /// Performs a nvcuda::wmma matrix multiply-accumulate operation
+  CUTLASS_DEVICE
+  void operator()(
+    FragmentC &D,
+    FragmentA const &A,
+    FragmentB const &B,
+    FragmentC const &C) const {
+
+      nvcuda::wmma::mma_sync(D, A, B, C);
+  }
+  // The #else branch uses a dependent condition so the assertion fires only on instantiation.
+#else
+    static_assert(sizeof(Shape_) == 0, "wmma.mma.sync for integer type multiplicands is available only for SM72 and beyond");
+#endif
+
+};
+
+} // namespace arch
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/wmma_sm75.h b/lightllm-kernel/cutlass/include/cutlass/arch/wmma_sm75.h
new file mode 100755
index 000000000..4663e95c7
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/arch/wmma_sm75.h
@@ -0,0 +1,207 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
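+    \brief nvcuda::wmma matrix multiply-accumulate specializations for cutlass::int4b_t and cutlass::uint1b_t multiplicands (SM75 and beyond)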
+*/
+
+#pragma once
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/cassert>
+#else
+#include <assert.h>
+#endif
+#include "cutlass/layout/matrix.h"
+
+////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace arch {
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// WMMA template structure defines nvcuda::wmma::fragments and static assert for
+// wmma native instruction sizes supported for cutlass::int4b_t (experimental::s4).
+//
+////////////////////////////////////////////////////////////////////////////////
+template <
+typename Shape_,
+typename LayoutA_,
+typename LayoutB_,
+typename LayoutC_>
+struct Wmma<
+  Shape_,                                   ///< Size of the matrix product (concept: GemmShape)
+  cutlass::int4b_t,                         ///< ElementA
+  LayoutA_,                                 ///< LayoutA
+  cutlass::int4b_t,                         ///< ElementB
+  LayoutB_,                                 ///< LayoutB
+  int32_t,                                  ///< ElementC
+  LayoutC_,                                 ///< LayoutC
+  cutlass::arch::OpMultiplyAdd              ///< Operator (multiply-add)
+> {
+#if defined(CUTLASS_ARCH_WMMA_SM75_ENABLED)
+  using Shape = Shape_;
+  using ElementA = cutlass::int4b_t;
+  using LayoutA = LayoutA_;
+  using ElementB = cutlass::int4b_t;
+  using LayoutB = LayoutB_;
+  using ElementC = int32_t;
+  using LayoutC = LayoutC_;
+  using Operator = cutlass::arch::OpMultiplyAdd;
+  using ArchTag = arch::Sm75;
+
+  // check supported wmma shape for the given multiplicand data types
+  static_assert(
+    platform::is_same<cutlass::gemm::GemmShape<8, 8, 32>, Shape>::value,
+    "Supported list of wmma operator shape for s4 multiplicands is: 8x8x32");
+
+  // Fragment types for the A and B multiplicands and for the accumulator; each one is
+  // parameterized by the full (kM, kN, kK) operator shape.
+  using FragmentA = nvcuda::wmma::fragment<
+          nvcuda::wmma::matrix_a,
+          Shape::kM,
+          Shape::kN,
+          Shape::kK,
+          typename CutlassToWmmaDataType<ElementA>::Type,
+          typename CutlassToWmmaLayout<LayoutA>::Layout>;
+
+  using FragmentB = nvcuda::wmma::fragment<
+          nvcuda::wmma::matrix_b,
+          Shape::kM,
+          Shape::kN,
+          Shape::kK,
+          typename CutlassToWmmaDataType<ElementB>::Type,
+          typename CutlassToWmmaLayout<LayoutB>::Layout>;
+
+  using FragmentC = nvcuda::wmma::fragment<
+          nvcuda::wmma::accumulator,
+          Shape::kM,
+          Shape::kN,
+          Shape::kK,
+          typename CutlassToWmmaDataType<ElementC>::Type>;
+
+  /// Performs a nvcuda::wmma matrix multiply-accumulate operation
+  CUTLASS_DEVICE
+  void operator()(
+    FragmentC &D,
+    FragmentA const &A,
+    FragmentB const &B,
+    FragmentC const &C) const {
+      nvcuda::wmma::mma_sync(D, A, B, C);
+
+  }
+  // The #else branch uses a dependent condition so the assertion fires only on instantiation.
+#else
+    static_assert(sizeof(Shape_) == 0, "wmma.mma.sync for integer type multiplicands is available only for SM75 and beyond");
+#endif
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// WMMA template structure defines nvcuda::wmma::fragments and static assert for
+// wmma native instruction sizes supported for cutlass::uint1b_t (experimental::b1).
+//
+////////////////////////////////////////////////////////////////////////////////
+template <
+typename Shape_,
+typename LayoutA_,
+typename LayoutB_,
+typename LayoutC_>
+struct Wmma<
+  Shape_,                                   ///< Size of the matrix product (concept: GemmShape)
+  cutlass::uint1b_t,                        ///< ElementA
+  LayoutA_,                                 ///< LayoutA
+  cutlass::uint1b_t,                        ///< ElementB
+  LayoutB_,                                 ///< LayoutB
+  int32_t,                                  ///< ElementC
+  LayoutC_,                                 ///< LayoutC
+  cutlass::arch::OpXorPopc                  ///< Operator (xor.popc)
+> {
+#if defined(CUTLASS_ARCH_WMMA_SM75_ENABLED)
+  using Shape = Shape_;
+  using ElementA = cutlass::uint1b_t;
+  using LayoutA = LayoutA_;
+  using ElementB = cutlass::uint1b_t;
+  using LayoutB = LayoutB_;
+  using ElementC = int32_t;
+  using LayoutC = LayoutC_;
+  using Operator = cutlass::arch::OpXorPopc;
+  using ArchTag = arch::Sm75;
+
+  // check supported wmma shape for the given multiplicand data types
+  static_assert(
+    platform::is_same<cutlass::gemm::GemmShape<8, 8, 128>, Shape>::value,
+    "Supported list of wmma operator shape for b1 multiplicands is: 8x8x128");
+
+  // Fragment types for the A and B multiplicands and for the accumulator; each one is
+  // parameterized by the full (kM, kN, kK) operator shape.
+  using FragmentA = nvcuda::wmma::fragment<
+          nvcuda::wmma::matrix_a,
+          Shape::kM,
+          Shape::kN,
+          Shape::kK,
+          typename CutlassToWmmaDataType<ElementA>::Type,
+          typename CutlassToWmmaLayout<LayoutA>::Layout>;
+
+  using FragmentB = nvcuda::wmma::fragment<
+          nvcuda::wmma::matrix_b,
+          Shape::kM,
+          Shape::kN,
+          Shape::kK,
+          typename CutlassToWmmaDataType<ElementB>::Type,
+          typename CutlassToWmmaLayout<LayoutB>::Layout>;
+
+  using FragmentC = nvcuda::wmma::fragment<
+          nvcuda::wmma::accumulator,
+          Shape::kM,
+          Shape::kN,
+          Shape::kK,
+          typename CutlassToWmmaDataType<ElementC>::Type>;
+
+  /// Performs a nvcuda::wmma bit matrix multiply-accumulate using the XOR bit op and POPC accumulate op
+  CUTLASS_DEVICE
+  void operator()(
+    FragmentC &D,
+    FragmentA const &A,
+    FragmentB const &B,
+    FragmentC const &C) const {
+      nvcuda::wmma::bmma_sync(D, A, B, C, nvcuda::wmma::experimental::bmmaBitOpXOR,
+                                          nvcuda::wmma::experimental::bmmaAccumulateOpPOPC);
+  }
+  // The #else branch uses a dependent condition so the assertion fires only on instantiation.
+#else
+    static_assert(sizeof(Shape_) == 0, "wmma.mma.sync for integer type multiplicands is available only for SM75 and beyond");
+#endif
+
+};
+
+} // namespace arch
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/array.h b/lightllm-kernel/cutlass/include/cutlass/array.h
new file mode 100755
index 000000000..62e946949
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/array.h
@@ -0,0 +1,2614 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Statically sized array of elements that accommodates all CUTLASS-supported numeric types
+           and is safe to use in a union.
+*/
+
+#pragma once
+#include "cutlass/cutlass.h"
+#include "cutlass/functional.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/platform/platform.h"
+namespace cutlass {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Statically sized array of N elements of type T (declaration; RegisterSized selects the specialization)
+template <
+  typename T,
+  int N,
+  bool RegisterSized = sizeof_bits<T>::value >= 32  // defaults to true when T occupies at least 32 bits
+>
+struct Array;
+
+namespace detail {
+
+/// Trait that detects cutlass::Array.
+/// The primary template reports false for every type.
+template <typename Candidate>
+struct is_Array : platform::false_type {};
+
+/// Partial specialization reporting true for any Array<T, N, RegisterSized>
+template <typename T, int N, bool RegisterSized>
+struct is_Array<Array<T, N, RegisterSized> > : platform::true_type {};
+
+/// Convenience constant equal to is_Array<Candidate>::value
+template <typename Candidate>
+constexpr bool is_Array_v = is_Array<Candidate>::value;
+
+} // namespace detail
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines the size of an Array<> in bits, computed from sizeof of the whole Array object
+template <typename T, int N, bool RegisterSized>
+struct sizeof_bits<Array<T, N, RegisterSized> > {
+  static constexpr int value = sizeof(Array<T, N, RegisterSized>) * 8;  // sizeof counts bytes; 8 bits each
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// True exactly when x is a power of two, i.e. has a single bit set (zero yields false)
+CUTLASS_HOST_DEVICE
+constexpr bool ispow2(unsigned x) {
+  return (x != 0u) && ((x & (x - 1u)) == 0u);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Largest power of two that does not exceed x; zero is returned unchanged.
+CUTLASS_HOST_DEVICE
+constexpr unsigned floor_pow_2(unsigned x) {
+  return (x != 0u && !ispow2(x)) ? (floor_pow_2(x >> 1) << 1) : x;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Statically sized array for any data type
+template <
+  typename T,
+  int N
+>
+struct Array<T, N, true> {
+
+  /// Storage type (an alias of the element type T in this specialization)
+  using Storage = T;
+
+  /// Element type
+  using Element = T;
+
+  /// Number of storage elements
+  // (initialized from N, exactly like kElements below)
+  static constexpr size_t kStorageElements = N;
+
+  /// Number of logical elements
+  static constexpr size_t kElements = N;
+
+  //
+  // Member type names following the C++ standard container conventions
+  //
+
+  typedef T value_type;
+  typedef size_t size_type;
+  typedef ptrdiff_t difference_type;
+  typedef value_type &reference;
+  typedef value_type const & const_reference;
+  typedef value_type *pointer;
+  typedef value_type const * const_pointer;
+
+  //
+  // Iterators
+  //
+
+  /// Mutable bidirectional iterator: a thin wrapper around a raw element pointer
+  class iterator {
+
+    /// Element currently referred to
+    T *ptr_;
+
+  public:
+    /// Creates an iterator holding a null pointer
+    CUTLASS_HOST_DEVICE
+    iterator() : ptr_(nullptr) {}
+    /// Creates an iterator referring to the given element
+    CUTLASS_HOST_DEVICE
+    iterator(T *element) : ptr_(element) {}
+    /// Pre-increment: steps to the next element
+    CUTLASS_HOST_DEVICE
+    iterator &operator++() {
+      ptr_ += 1;
+      return *this;
+    }
+    /// Pre-decrement: steps to the previous element
+    CUTLASS_HOST_DEVICE
+    iterator &operator--() {
+      ptr_ -= 1;
+      return *this;
+    }
+    /// Post-increment: steps forward and returns the position held before the step
+    CUTLASS_HOST_DEVICE
+    iterator operator++(int) {
+      iterator previous = *this;
+      ++ptr_;
+      return previous;
+    }
+    /// Post-decrement: steps backward and returns the position held before the step
+    CUTLASS_HOST_DEVICE
+    iterator operator--(int) {
+      iterator previous = *this;
+      --ptr_;
+      return previous;
+    }
+    /// Dereference: the element currently referred to
+    CUTLASS_HOST_DEVICE
+    T &operator*() const {
+      return ptr_[0];
+    }
+    /// Two iterators are equal when they hold the same pointer
+    CUTLASS_HOST_DEVICE
+    bool operator==(iterator const &other) const {
+      return other.ptr_ == ptr_;
+    }
+    /// Negation of operator==
+    CUTLASS_HOST_DEVICE
+    bool operator!=(iterator const &other) const {
+      return !(ptr_ == other.ptr_);
+    }
+  };
+
+  /// Read-only bidirectional iterator: a thin wrapper around a pointer to a const element
+  class const_iterator {
+
+    /// Element currently referred to
+    T const *ptr_;
+
+  public:
+    /// Creates an iterator holding a null pointer
+    CUTLASS_HOST_DEVICE
+    const_iterator() : ptr_(nullptr) {}
+    /// Creates an iterator referring to the given element
+    CUTLASS_HOST_DEVICE
+    const_iterator(T const *element) : ptr_(element) {}
+    /// Pre-increment: steps to the next element
+    CUTLASS_HOST_DEVICE
+    const_iterator &operator++() {
+      ptr_ += 1;
+      return *this;
+    }
+    /// Pre-decrement: steps to the previous element
+    CUTLASS_HOST_DEVICE
+    const_iterator &operator--() {
+      ptr_ -= 1;
+      return *this;
+    }
+    /// Post-increment: steps forward and returns the position held before the step
+    CUTLASS_HOST_DEVICE
+    const_iterator operator++(int) {
+      const_iterator previous = *this;
+      ++ptr_;
+      return previous;
+    }
+    /// Post-decrement: steps backward and returns the position held before the step
+    CUTLASS_HOST_DEVICE
+    const_iterator operator--(int) {
+      const_iterator previous = *this;
+      --ptr_;
+      return previous;
+    }
+    /// Dereference: read-only access to the element currently referred to
+    CUTLASS_HOST_DEVICE
+    T const &operator*() const {
+      return ptr_[0];
+    }
+    /// Two iterators are equal when they hold the same pointer
+    CUTLASS_HOST_DEVICE
+    bool operator==(const_iterator const &other) const {
+      return other.ptr_ == ptr_;
+    }
+    /// Negation of operator==
+    CUTLASS_HOST_DEVICE
+    bool operator!=(const_iterator const &other) const {
+      return !(ptr_ == other.ptr_);
+    }
+  };
+
+  /// Bidirectional reverse iterator over elements
+  class reverse_iterator {
+
+    /// Pointer one past the element this iterator refers to (operator* reads ptr_ - 1)
+    T *ptr_;
+
+  public:
+
+    CUTLASS_HOST_DEVICE
+    reverse_iterator(): ptr_(nullptr) { }
+
+    CUTLASS_HOST_DEVICE
+    reverse_iterator(T *_ptr): ptr_(_ptr) { }
+    /// Pre-increment: advances in reverse order, i.e. moves the stored pointer backward
+    CUTLASS_HOST_DEVICE
+    reverse_iterator &operator++() {
+      --ptr_;
+      return *this;
+    }
+    /// Pre-decrement: moves the stored pointer forward
+    CUTLASS_HOST_DEVICE
+    reverse_iterator &operator--() {
+      ++ptr_;
+      return *this;
+    }
+    /// Post-increment: returns a copy of this reverse_iterator taken before the step
+    CUTLASS_HOST_DEVICE
+    reverse_iterator operator++(int) {
+      reverse_iterator ret(*this);
+      --ptr_;
+      return ret;
+    }
+    /// Post-decrement: returns a copy of this reverse_iterator taken before the step
+    CUTLASS_HOST_DEVICE
+    reverse_iterator operator--(int) {
+      reverse_iterator ret(*this);
+      ++ptr_;
+      return ret;
+    }
+    /// Dereference: the element immediately before the stored pointer
+    CUTLASS_HOST_DEVICE
+    T &operator*() const {
+      return *(ptr_ - 1);
+    }
+
+    CUTLASS_HOST_DEVICE
+    bool operator==(reverse_iterator const &other) const {
+      return ptr_ == other.ptr_;
+    }
+
+    CUTLASS_HOST_DEVICE
+    bool operator!=(reverse_iterator const &other) const {
+      return ptr_ != other.ptr_;
+    }
+  };
+
+  /// Bidirectional constant reverse iterator over elements
+  class const_reverse_iterator {
+
+    /// Pointer one past the element this iterator refers to (operator* reads ptr_ - 1)
+    T const *ptr_;
+
+  public:
+
+    CUTLASS_HOST_DEVICE
+    const_reverse_iterator(): ptr_(nullptr) { }
+
+    CUTLASS_HOST_DEVICE
+    const_reverse_iterator(T const *_ptr): ptr_(_ptr) { }
+    /// Pre-increment: advances in reverse order, i.e. moves the stored pointer backward
+    CUTLASS_HOST_DEVICE
+    const_reverse_iterator &operator++() {
+      --ptr_;
+      return *this;
+    }
+    /// Pre-decrement: moves the stored pointer forward
+    CUTLASS_HOST_DEVICE
+    const_reverse_iterator &operator--() {
+      ++ptr_;
+      return *this;
+    }
+    /// Post-increment: returns a copy of this iterator taken before the step
+    CUTLASS_HOST_DEVICE
+    const_reverse_iterator operator++(int) {
+      const_reverse_iterator ret(*this);
+      --ptr_;
+      return ret;
+    }
+    /// Post-decrement: returns a copy of this iterator taken before the step
+    CUTLASS_HOST_DEVICE
+    const_reverse_iterator operator--(int) {
+      const_reverse_iterator ret(*this);
+      ++ptr_;
+      return ret;
+    }
+    /// Dereference: read-only access to the element immediately before the stored pointer
+    CUTLASS_HOST_DEVICE
+    T const &operator*() const {
+      return *(ptr_ - 1);
+    }
+    /// Compares against another const_reverse_iterator (same class, so ptr_ is accessible)
+    CUTLASS_HOST_DEVICE
+    bool operator==(const_reverse_iterator const &other) const {
+      return ptr_ == other.ptr_;
+    }
+    /// Compares against another const_reverse_iterator (same class, so ptr_ is accessible)
+    CUTLASS_HOST_DEVICE
+    bool operator!=(const_reverse_iterator const &other) const {
+      return ptr_ != other.ptr_;
+    }
+  };
+
+  /// Internal storage
+  Storage storage[kElements];
+
+  /// Resets every element to T(0) by delegating to fill()
+  CUTLASS_HOST_DEVICE void clear() {
+    T const zero = T(0);
+    fill(zero);
+  }
+
+  /// Mutable reference to the element at index pos; the index is not range-checked
+  CUTLASS_HOST_DEVICE reference at(size_type pos) {
+    return storage[pos];
+  }
+
+  /// Read-only reference to the element at index pos; the index is not range-checked
+  CUTLASS_HOST_DEVICE const_reference at(size_type pos) const {
+    return storage[pos];
+  }
+
+  /// Mutable reference to the element at index pos
+  CUTLASS_HOST_DEVICE reference operator[](size_type pos) {
+    return storage[pos];
+  }
+
+  /// Read-only reference to the element at index pos
+  CUTLASS_HOST_DEVICE const_reference operator[](size_type pos) const {
+    return storage[pos];
+  }
+
+  /// Mutable reference to the first element
+  CUTLASS_HOST_DEVICE reference front() {
+    return storage[0];
+  }
+
+  /// Read-only reference to the first element
+  CUTLASS_HOST_DEVICE const_reference front() const {
+    return storage[0];
+  }
+
+  /// Mutable reference to the last element
+  CUTLASS_HOST_DEVICE reference back() {
+    return storage[kStorageElements - 1];
+  }
+
+  /// Read-only reference to the last element
+  CUTLASS_HOST_DEVICE const_reference back() const {
+    return storage[kStorageElements - 1];
+  }
+
+  /// Mutable pointer to the first element of the internal storage
+  CUTLASS_HOST_DEVICE pointer data() {
+    return storage;
+  }
+
+  /// Read-only pointer to the first element of the internal storage
+  CUTLASS_HOST_DEVICE const_pointer data() const {
+    return storage;
+  }
+  
+  /// Mutable pointer to the internal storage (returns the same address as data() here)
+  CUTLASS_HOST_DEVICE pointer raw_data() {
+    return storage;
+  }
+
+  /// Read-only pointer to the internal storage (returns the same address as data() here)
+  CUTLASS_HOST_DEVICE const_pointer raw_data() const {
+    return storage;
+  }
+
+
+  /// True when the array holds zero elements
+  CUTLASS_HOST_DEVICE constexpr bool empty() const {
+    return kElements == 0;
+  }
+
+  /// Number of elements in the array
+  CUTLASS_HOST_DEVICE constexpr size_type size() const {
+    return static_cast<size_type>(N);
+  }
+
+  /// Capacity of the array; returns the same value as size()
+  CUTLASS_HOST_DEVICE constexpr size_type max_size() const {
+    return static_cast<size_type>(N);
+  }
+
+  /// Assigns value to every element of the array
+  CUTLASS_HOST_DEVICE void fill(T const &value) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int slot = 0; slot < static_cast<int>(kElements); ++slot) {
+      storage[slot] = static_cast<Storage>(value);
+    }
+  }
+
+  /// Mutable iterator positioned at the first element
+  CUTLASS_HOST_DEVICE iterator begin() {
+    return iterator(data());
+  }
+
+  /// Read-only iterator positioned at the first element
+  CUTLASS_HOST_DEVICE const_iterator begin() const {
+    return const_iterator(storage);
+  }
+
+  /// Read-only iterator positioned at the first element
+  CUTLASS_HOST_DEVICE const_iterator cbegin() const {
+    return const_iterator(data());
+  }
+
+  /// Mutable iterator positioned one past the last element
+  CUTLASS_HOST_DEVICE iterator end() {
+    return iterator(storage + kStorageElements);
+  }
+
+  /// Read-only iterator positioned one past the last element
+  CUTLASS_HOST_DEVICE const_iterator end() const {
+    return const_iterator(storage + kStorageElements);
+  }
+
+  /// Read-only iterator positioned one past the last element
+  CUTLASS_HOST_DEVICE const_iterator cend() const {
+    return const_iterator(data() + kStorageElements);
+  }
+
+  /// Mutable reverse iterator constructed from the one-past-the-end pointer
+  CUTLASS_HOST_DEVICE reverse_iterator rbegin() {
+    return reverse_iterator(storage + kStorageElements);
+  }
+
+  /// Read-only reverse iterator constructed from the one-past-the-end pointer
+  CUTLASS_HOST_DEVICE const_reverse_iterator rbegin() const {
+    return const_reverse_iterator(storage + kStorageElements);
+  }
+
+  /// Read-only reverse iterator constructed from the one-past-the-end pointer
+  CUTLASS_HOST_DEVICE const_reverse_iterator crbegin() const {
+    return const_reverse_iterator(data() + kStorageElements);
+  }
+
+  /// Mutable reverse iterator constructed from the pointer to the first element
+  CUTLASS_HOST_DEVICE reverse_iterator rend() {
+    return reverse_iterator(data());
+  }
+
+  /// Read-only reverse iterator constructed from the pointer to the first element
+  CUTLASS_HOST_DEVICE const_reverse_iterator rend() const {
+    return const_reverse_iterator(storage);
+  }
+
+  /// Read-only reverse iterator constructed from the pointer to the first element
+  CUTLASS_HOST_DEVICE const_reverse_iterator crend() const {
+    return const_reverse_iterator(data());
+  }
+
+  //
+  // Comparison operators
+  //
+
+};
+
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Factories
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Builds a one-element Array from x
+template <typename Element>
+CUTLASS_HOST_DEVICE Array<Element, 1> make_Array(Element x) {
+  return Array<Element, 1>{x};
+}
+
+/// Builds a two-element Array from x, y (in that order)
+template <typename Element>
+CUTLASS_HOST_DEVICE Array<Element, 2> make_Array(Element x, Element y) {
+  return Array<Element, 2>{x, y};
+}
+
+/// Builds a three-element Array from x, y, z (in that order)
+template <typename Element>
+CUTLASS_HOST_DEVICE Array<Element, 3> make_Array(Element x, Element y, Element z) {
+  return Array<Element, 3>{x, y, z};
+}
+
+/// Builds a four-element Array from x, y, z, w (in that order)
+template <typename Element>
+CUTLASS_HOST_DEVICE Array<Element, 4> make_Array(Element x, Element y, Element z, Element w) {
+  return Array<Element, 4>{x, y, z, w};
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// functional.h numeric specializations
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Element-wise absolute value of an Array<T, N>
+template <typename T, int N>
+struct absolute_value_op< Array<T, N> > {
+  /// Applies absolute_value_op<T> to each element of lhs and returns the results
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &lhs) const {
+    absolute_value_op<T> magnitude;
+    Array<T, N> out;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      out[idx] = magnitude(lhs[idx]);
+    }
+
+    return out;
+  }
+};
+
+template <typename T, int N>
+struct plus<Array<T, N>> {
+  /// Element-wise sum of two arrays: out[i] = lhs[i] + rhs[i] via plus<T>
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &lhs, Array<T, N> const &rhs) const {
+    plus<T> add;
+    Array<T, N> out;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      out[idx] = add(lhs[idx], rhs[idx]);
+    }
+
+    return out;
+  }
+
+  /// Adds a scalar on the right of every element: out[i] = lhs[i] + scalar
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &lhs, T const &scalar) const {
+    plus<T> add;
+    Array<T, N> out;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      out[idx] = add(lhs[idx], scalar);
+    }
+
+    return out;
+  }
+
+  /// Adds a scalar on the left of every element: out[i] = scalar + rhs[i]
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()( T const &scalar, Array<T, N> const &rhs) const {
+    plus<T> add;
+    Array<T, N> out;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      out[idx] = add(scalar, rhs[idx]);
+    }
+
+    return out;
+  }
+};
+template <typename T, int N>
+struct minus<Array<T, N>> {
+
+  /// Element-wise difference of two arrays: out[i] = lhs[i] - rhs[i] via minus<T>
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &lhs, Array<T, N> const &rhs) const {
+    minus<T> subtract;
+    Array<T, N> out;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      out[idx] = subtract(lhs[idx], rhs[idx]);
+    }
+
+    return out;
+  }
+
+  /// Subtracts a scalar from every element: out[i] = lhs[i] - scalar
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &lhs, T const &scalar) const {
+    minus<T> subtract;
+    Array<T, N> out;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      out[idx] = subtract(lhs[idx], scalar);
+    }
+
+    return out;
+  }
+
+  /// Subtracts every element from a scalar: out[i] = scalar - rhs[i]
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()( T const &scalar, Array<T, N> const &rhs) const {
+    minus<T> subtract;
+    Array<T, N> out;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      out[idx] = subtract(scalar, rhs[idx]);
+    }
+
+    return out;
+  }
+};
+
+template <typename T, int N>
+struct multiplies<Array<T, N>> {
+
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &lhs, Array<T, N> const &rhs) const {
+    // Array * array: each pair of elements goes through the scalar multiplies<T>.
+    multiplies<T> mul_op;
+    Array<T, N> product;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      product[idx] = mul_op(lhs[idx], rhs[idx]);
+    }
+
+    return product;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &lhs, T const &scalar) const {
+    // Array * scalar: the scalar is the right-hand operand for every element.
+    multiplies<T> mul_op;
+    Array<T, N> product;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      product[idx] = mul_op(lhs[idx], scalar);
+    }
+
+    return product;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(T const &scalar, Array<T, N> const &rhs) const {
+    // Scalar * array: the scalar is the left-hand operand for every element.
+    multiplies<T> mul_op;
+    Array<T, N> product;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      product[idx] = mul_op(scalar, rhs[idx]);
+    }
+
+    return product;
+  }
+};
+
+template <typename T, int N, bool PropogateNaN>
+struct maximum_absolute_value_reduction<Array<T, N>, PropogateNaN> {
+
+  CUTLASS_HOST_DEVICE
+  T operator() (T const& scalar, Array<T, N> const& rhs) const {
+    // Fold every element of rhs into a running value seeded with `scalar`.
+    maximum_absolute_value_reduction<T, PropogateNaN> reduce_op;
+    T running = scalar;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      running = reduce_op(running, rhs[idx]);
+    }
+
+    return running;
+  }
+};
+
+template <typename T, int N>
+struct scale<Array<T, N>> {
+  T const scaling_factor_;
+
+  CUTLASS_HOST_DEVICE
+  scale(T scaling_factor)
+      : scaling_factor_(scaling_factor) {}
+
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const & rhs) const {
+    Array<T, N> scaled;
+    // Multiply each element by the factor captured at construction.
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      scaled[idx] = rhs[idx] * scaling_factor_;
+    }
+
+    return scaled;
+  }
+};
+
+template <typename T, int N>
+struct divides<Array<T, N>> {
+
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &lhs, Array<T, N> const &rhs) const {
+    // Array / array: each pair of elements goes through the scalar divides<T>.
+    divides<T> div_op;
+    Array<T, N> quotient;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      quotient[idx] = div_op(lhs[idx], rhs[idx]);
+    }
+
+    return quotient;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &lhs, T const &scalar) const {
+    // Array / scalar: every element is divided by the scalar.
+    divides<T> div_op;
+    Array<T, N> quotient;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      quotient[idx] = div_op(lhs[idx], scalar);
+    }
+
+    return quotient;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(T const &scalar, Array<T, N> const &rhs) const {
+    // Scalar / array: the scalar is divided by every element.
+    divides<T> div_op;
+    Array<T, N> quotient;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      quotient[idx] = div_op(scalar, rhs[idx]);
+    }
+
+    return quotient;
+  }
+};
+
+template <typename T, int N>
+struct reciprocal_approximate<Array<T, N>> {
+
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &lhs) const {
+    // Apply the scalar reciprocal_approximate<T> functor to each element.
+    reciprocal_approximate<T> rcp_op;
+    Array<T, N> out;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      out[idx] = rcp_op(lhs[idx]);
+    }
+
+    return out;
+  }
+};
+
+template <typename T, int N>
+struct reciprocal_approximate_ftz<Array<T, N>> {
+
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &lhs) const {
+    // Apply the scalar reciprocal_approximate_ftz<T> functor to each element.
+    reciprocal_approximate_ftz<T> rcp_op;
+    Array<T, N> out;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      out[idx] = rcp_op(lhs[idx]);
+    }
+
+    return out;
+  }
+};
+
+template <typename T, int N, bool PropagateNaN>
+struct maximum<Array<T, N>, PropagateNaN> {
+
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &lhs, Array<T, N> const &rhs) const {
+    // Array vs array: scalar maximum<T, PropagateNaN> applied per element pair.
+    maximum<T, PropagateNaN> max_op;
+    Array<T, N> out;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      out[idx] = max_op(lhs[idx], rhs[idx]);
+    }
+
+    return out;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &lhs, T const &scalar) const {
+    // Array vs scalar: the scalar is the second argument for every element.
+    maximum<T, PropagateNaN> max_op;
+    Array<T, N> out;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      out[idx] = max_op(lhs[idx], scalar);
+    }
+
+    return out;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(T const &scalar, Array<T, N> const &rhs) const {
+    // Scalar vs array: the scalar is the first argument for every element.
+    maximum<T, PropagateNaN> max_op;
+    Array<T, N> out;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      out[idx] = max_op(scalar, rhs[idx]);
+    }
+
+    return out;
+  }
+};
+
+template <typename T, int N, bool PropagateNaN>
+struct minimum<Array<T, N>, PropagateNaN> {
+  // Plain `<`-based helper; the operator() overloads below use minimum<T, PropagateNaN> instead.
+  CUTLASS_HOST_DEVICE
+  static T scalar_op(T const &lhs, T const &rhs) {
+    return (rhs < lhs) ? rhs : lhs;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &lhs, Array<T, N> const &rhs) const {
+    // Array vs array: scalar minimum<T, PropagateNaN> applied per element pair.
+    minimum<T, PropagateNaN> min_op;
+    Array<T, N> out;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      out[idx] = min_op(lhs[idx], rhs[idx]);
+    }
+
+    return out;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &lhs, T const &scalar) const {
+    // Array vs scalar: the scalar is the second argument for every element.
+    minimum<T, PropagateNaN> min_op;
+    Array<T, N> out;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      out[idx] = min_op(lhs[idx], scalar);
+    }
+
+    return out;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(T const &scalar, Array<T, N> const &rhs) const {
+    // Scalar vs array: the scalar is the first argument for every element.
+    minimum<T, PropagateNaN> min_op;
+    Array<T, N> out;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      out[idx] = min_op(scalar, rhs[idx]);
+    }
+
+    return out;
+  }
+};
+
+/// Array minimum with PropagateNaN fixed to true; all overloads come from the base.
+template <typename T, int N>
+struct minimum_with_nan_propagation<Array<T, N>> : minimum<Array<T, N>, true> {};
+
+template <typename T, int N>
+struct negate<Array<T, N>> {
+
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &lhs) const {
+    // Apply the scalar negate<T> functor to each element.
+    negate<T> neg_op;
+    Array<T, N> out;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      out[idx] = neg_op(lhs[idx]);
+    }
+
+    return out;
+  }
+};
+
+/// Fused multiply-add
+template <typename T, int N>
+struct multiply_add<Array<T, N>, Array<T, N>, Array<T, N>> {
+
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &a, Array<T, N> const &b, Array<T, N> const &c) const {
+    // All-array form: scalar multiply_add<T> on (a[i], b[i], c[i]).
+    multiply_add<T> madd_op;
+    Array<T, N> out;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      out[idx] = madd_op(a[idx], b[idx], c[idx]);
+    }
+
+    return out;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &a, T const &scalar, Array<T, N> const &c) const {
+    // The scalar stands in for the second operand of every element.
+    multiply_add<T> madd_op;
+    Array<T, N> out;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      out[idx] = madd_op(a[idx], scalar, c[idx]);
+    }
+
+    return out;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(T const &scalar, Array<T, N> const &b, Array<T, N> const &c) const {
+    // The scalar stands in for the first operand of every element.
+    multiply_add<T> madd_op;
+    Array<T, N> out;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      out[idx] = madd_op(scalar, b[idx], c[idx]);
+    }
+
+    return out;
+  }
+};
+
+/// Fused square-and-plus
+template <typename T, int N>
+struct square_and_plus<Array<T, N>> {
+
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &lhs, Array<T, N> const &rhs) const {
+    // Delegates to the array multiply_add with operands (rhs, rhs, lhs).
+    return multiply_add<Array<T, N>, Array<T, N>, Array<T, N>>()(rhs, rhs, lhs);
+  }
+
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &lhs, T const &rhs) const {
+    // Square the scalar once, then add that value to every element of lhs.
+    T const rhs_squared = multiplies<T>()(rhs, rhs);
+    return plus<Array<T, N>>()(rhs_squared, lhs);
+  }
+};
+
+/// Inverse-square-root
+template <typename T, int N>
+struct inverse_square_root<Array<T, N>> {
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &a) const {
+    inverse_square_root<T> rsqrt_op;
+    Array<T, N> out;
+    // Apply the scalar functor to each element independently.
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      out[idx] = rsqrt_op(a[idx]);
+    }
+    return out;
+  }
+};
+
+template <int N>
+struct inverse_square_root<Array<half_t, N>> {
+  CUTLASS_HOST_DEVICE
+  Array<half_t, N> operator()(Array<half_t, N> const & a) const {
+    Array<half_t, N> result;
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
+    // Packed path: view both arrays as __half2 pairs, one h2rsqrt call per pair.
+    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
+    __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 2; ++i) {
+      result_ptr[i] = h2rsqrt(a_ptr[i]);
+    }
+    // The pair loop covers N / 2 pairs; for odd N, element N - 1 goes through scalar hrsqrt.
+    if constexpr (N % 2) {
+      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a);
+      __half d_residual = hrsqrt(a_residual_ptr[N - 1]);
+      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
+    }
+
+    #else
+    // Fallback when __CUDA_ARCH__ is undefined or below 530: scalar functor per element.
+    inverse_square_root<half_t> scalar_op;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      result[i] = scalar_op(a[i]);
+    }
+
+    #endif
+
+    return result;
+  }
+};
+
+/// Fused multiply-add-relu0
+template <typename T, int N>
+struct multiply_add_relu0<Array<T, N>, Array<T, N>, Array<T, N>> {
+
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &a, Array<T, N> const &b, Array<T, N> const &c) const {
+    // All-array form: maximum<T> of multiply_add(a[i], b[i], c[i]) and T(0).
+    Array<T, N> out;
+    maximum<T> max_op;
+    multiply_add<T> madd_op;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      out[idx] = max_op(madd_op(a[idx], b[idx], c[idx]), T(0));
+    }
+
+    return out;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &a, T const &scalar, Array<T, N> const &c) const {
+    // The scalar stands in for the second multiply_add operand of every element.
+    Array<T, N> out;
+    maximum<T> max_op;
+    multiply_add<T> madd_op;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      out[idx] = max_op(madd_op(a[idx], scalar, c[idx]), T(0));
+    }
+
+    return out;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(T const &scalar, Array<T, N> const &b, Array<T, N> const &c) const {
+    // The scalar stands in for the first multiply_add operand of every element.
+    Array<T, N> out;
+    maximum<T> max_op;
+    multiply_add<T> madd_op;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      out[idx] = max_op(madd_op(scalar, b[idx], c[idx]), T(0));
+    }
+
+    return out;
+  }
+};
+
+
+template <typename T, int N>
+struct conjugate<Array<T, N>> {
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &a) const {
+    // Apply the scalar conjugate<T> functor to each element.
+    conjugate<T> conj_each;
+    Array<T, N> out;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      out[idx] = conj_each(a[idx]);
+    }
+    return out;
+  }
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// functional.h numeric specializations targeting SIMD instructions in device code.
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <int N>
+struct plus<Array<half_t, N>> {
+  CUTLASS_HOST_DEVICE
+  Array<half_t, N> operator()(Array<half_t, N> const & lhs, Array<half_t, N> const &rhs) const {
+    Array<half_t, N> result;
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
+    // Packed path: view all three arrays as __half2 pairs, one __hadd2 per pair.
+    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
+    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
+    __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 2; ++i) {
+      result_ptr[i] = __hadd2(lhs_ptr[i], rhs_ptr[i]);
+    }
+    // The pair loop covers N / 2 pairs; for odd N, element N - 1 is added with scalar __hadd.
+    if constexpr (N % 2) {
+      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
+      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
+      __half d_residual = __hadd(a_residual_ptr[N - 1], b_residual_ptr[N - 1]);
+
+      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
+    }
+
+    #else
+    // Fallback when __CUDA_ARCH__ is undefined or below 530: half_t operator+ per element.
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      result[i] = lhs[i] + rhs[i];
+    }
+    #endif
+
+    return result;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Array<half_t, N> operator()(half_t const & lhs, Array<half_t, N> const &rhs) const {
+    Array<half_t, N> result;
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
+    // Packed path: the scalar lhs is expanded to a __half2 via __half2half2 and added to each rhs pair.
+    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
+    __half2 lhs_pair = __half2half2(reinterpret_cast<__half const &>(lhs));
+    __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 2; ++i) {
+      result_ptr[i] = __hadd2(lhs_pair, rhs_ptr[i]);
+    }
+    // Odd N: element N - 1 is not covered by the pair loop; add it with scalar __hadd.
+    if constexpr (N % 2) {
+      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
+      __half d_residual = __hadd(reinterpret_cast<__half const &>(lhs), b_residual_ptr[N - 1]);
+
+      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
+    }
+
+    #else
+    // Fallback when __CUDA_ARCH__ is undefined or below 530: half_t operator+ per element.
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      result[i] = lhs + rhs[i];
+    }
+    #endif
+
+    return result;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Array<half_t, N> operator()(Array<half_t, N> const & lhs, half_t const &rhs) const {
+    Array<half_t, N> result;
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
+    // Packed path: the scalar rhs is expanded to a __half2 via __half2half2 and added to each lhs pair.
+    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
+    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
+    __half2 rhs_pair = __half2half2(reinterpret_cast<__half const &>(rhs));
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 2; ++i) {
+      result_ptr[i] = __hadd2(lhs_ptr[i], rhs_pair);
+    }
+    // Odd N: element N - 1 is not covered by the pair loop; add it with scalar __hadd.
+    if constexpr (N % 2) {
+      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
+      __half d_residual = __hadd(a_residual_ptr[N - 1], reinterpret_cast<__half const &>(rhs));
+
+      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
+    }
+
+    #else
+    // Fallback when __CUDA_ARCH__ is undefined or below 530: half_t operator+ per element.
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      result[i] = lhs[i] + rhs;
+    }
+    #endif
+
+    return result;
+  }
+};
+
+template <int N>
+struct minus<Array<half_t, N>> {
+  CUTLASS_HOST_DEVICE
+  Array<half_t, N> operator()(Array<half_t, N> const & lhs, Array<half_t, N> const &rhs) const {
+    Array<half_t, N> result;
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
+    // Packed path: view all three arrays as __half2 pairs, one __hsub2 per pair.
+    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
+    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
+    __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 2; ++i) {
+      result_ptr[i] = __hsub2(lhs_ptr[i], rhs_ptr[i]);
+    }
+    // The pair loop covers N / 2 pairs; for odd N, element N - 1 is handled with scalar __hsub.
+    if constexpr (N % 2) {
+      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
+      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
+      __half d_residual = __hsub(a_residual_ptr[N - 1], b_residual_ptr[N - 1]);
+
+      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
+    }
+
+    #else
+    // Fallback when __CUDA_ARCH__ is undefined or below 530: half_t operator- per element.
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      result[i] = lhs[i] - rhs[i];
+    }
+    #endif
+
+    return result;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Array<half_t, N> operator()(half_t const & lhs, Array<half_t, N> const &rhs) const {
+    Array<half_t, N> result;
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
+    // Packed path: the scalar lhs is expanded to a __half2 via __half2half2; each rhs pair is subtracted from it.
+    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
+    __half2 lhs_pair = __half2half2(reinterpret_cast<__half const &>(lhs));
+    __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 2; ++i) {
+      result_ptr[i] = __hsub2(lhs_pair, rhs_ptr[i]);
+    }
+    // Odd N: element N - 1 is not covered by the pair loop; handle it with scalar __hsub.
+    if constexpr (N % 2) {
+      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
+      __half d_residual = __hsub(reinterpret_cast<__half const &>(lhs), b_residual_ptr[N - 1]);
+
+      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
+    }
+
+    #else
+    // Fallback when __CUDA_ARCH__ is undefined or below 530: half_t operator- per element.
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      result[i] = lhs - rhs[i];
+    }
+    #endif
+
+    return result;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Array<half_t, N> operator()(Array<half_t, N> const & lhs, half_t const &rhs) const {
+    Array<half_t, N> result;
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
+    // Packed path: the scalar rhs is expanded to a __half2 via __half2half2 and subtracted from each lhs pair.
+    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
+    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
+    __half2 rhs_pair = __half2half2(reinterpret_cast<__half const &>(rhs));
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 2; ++i) {
+      result_ptr[i] = __hsub2(lhs_ptr[i], rhs_pair);
+    }
+    // Odd N: element N - 1 is not covered by the pair loop; handle it with scalar __hsub.
+    if constexpr (N % 2) {
+      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
+      __half d_residual = __hsub(a_residual_ptr[N - 1], reinterpret_cast<__half const &>(rhs));
+
+      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
+    }
+
+    #else
+    // Fallback when __CUDA_ARCH__ is undefined or below 530: half_t operator- per element.
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      result[i] = lhs[i] - rhs;
+    }
+    #endif
+
+    return result;
+  }
+};
+
+template <int N>
+struct multiplies<Array<half_t, N>> {
+  CUTLASS_HOST_DEVICE
+  Array<half_t, N> operator()(Array<half_t, N> const & lhs, Array<half_t, N> const &rhs) const {
+    Array<half_t, N> result;
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
+    // Packed path: view all three arrays as __half2 pairs, one __hmul2 per pair.
+    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
+    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
+    __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 2; ++i) {
+      result_ptr[i] = __hmul2(lhs_ptr[i], rhs_ptr[i]);
+    }
+    // The pair loop covers N / 2 pairs; for odd N, element N - 1 is handled with scalar __hmul.
+    if constexpr (N % 2) {
+      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
+      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
+      __half d_residual = __hmul(a_residual_ptr[N - 1], b_residual_ptr[N - 1]);
+
+      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
+    }
+
+    #else
+    // Fallback when __CUDA_ARCH__ is undefined or below 530: half_t operator* per element.
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      result[i] = lhs[i] * rhs[i];
+    }
+    #endif
+
+    return result;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Array<half_t, N> operator()(half_t const & lhs, Array<half_t, N> const &rhs) const {
+    Array<half_t, N> result;
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
+    // Packed path: the scalar lhs is expanded to a __half2 via __half2half2 and multiplied with each rhs pair.
+    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
+    __half2 lhs_pair = __half2half2(reinterpret_cast<__half const &>(lhs));
+    __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 2; ++i) {
+      result_ptr[i] = __hmul2(lhs_pair, rhs_ptr[i]);
+    }
+    // Odd N: element N - 1 is not covered by the pair loop; handle it with scalar __hmul.
+    if constexpr (N % 2) {
+      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
+
+      __half d_residual = __hmul(
+        reinterpret_cast<__half const &>(lhs),
+        b_residual_ptr[N - 1]);
+
+      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
+    }
+
+    #else
+    // Fallback when __CUDA_ARCH__ is undefined or below 530: half_t operator* per element.
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      result[i] = lhs * rhs[i];
+    }
+    #endif
+
+    return result;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Array<half_t, N> operator()(Array<half_t, N> const & lhs, half_t const &rhs) const {
+    Array<half_t, N> result;
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
+    // Packed path: the scalar rhs is expanded to a __half2 via __half2half2 and multiplied with each lhs pair.
+    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
+    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
+    __half2 rhs_pair = __half2half2(reinterpret_cast<__half const &>(rhs));
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 2; ++i) {
+      result_ptr[i] = __hmul2(lhs_ptr[i], rhs_pair);
+    }
+    // Odd N: element N - 1 is not covered by the pair loop; handle it with scalar __hmul.
+    if constexpr (N % 2) {
+      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
+
+      __half d_residual = __hmul(
+        a_residual_ptr[N - 1],
+        reinterpret_cast<__half const &>(rhs));
+
+      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
+    }
+
+    #else
+    // Fallback when __CUDA_ARCH__ is undefined or below 530: half_t operator* per element.
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      result[i] = lhs[i] * rhs;
+    }
+    #endif
+
+    return result;
+  }
+};
+
+template <int N>
+struct divides<Array<half_t, N>> {
+  CUTLASS_HOST_DEVICE
+  Array<half_t, N> operator()(Array<half_t, N> const & lhs, Array<half_t, N> const &rhs) const {
+    Array<half_t, N> result;
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
+    // Packed path: view all three arrays as __half2 pairs, one __h2div per pair.
+    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
+    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
+    __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 2; ++i) {
+      result_ptr[i] = __h2div(lhs_ptr[i], rhs_ptr[i]);
+    }
+    // The pair loop covers N / 2 pairs; for odd N, element N - 1 is handled with scalar __hdiv.
+    if constexpr (N % 2) {
+      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
+      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
+
+      __half d_residual = __hdiv(
+        a_residual_ptr[N - 1],
+        b_residual_ptr[N - 1]);
+
+      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
+    }
+
+    #else
+    // Fallback when __CUDA_ARCH__ is undefined or below 530: half_t operator/ per element.
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      result[i] = lhs[i] / rhs[i];
+    }
+    #endif
+
+    return result;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Array<half_t, N> operator()(half_t const & lhs, Array<half_t, N> const &rhs) const {
+    Array<half_t, N> result;
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
+    // Packed path: the scalar lhs is expanded to a __half2 via __half2half2 and divided by each rhs pair.
+    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
+    __half2 lhs_pair = __half2half2(reinterpret_cast<__half const &>(lhs));
+    __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 2; ++i) {
+      result_ptr[i] = __h2div(lhs_pair, rhs_ptr[i]);
+    }
+    // Odd N: element N - 1 is not covered by the pair loop; handle it with scalar __hdiv.
+    if constexpr (N % 2) {
+      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
+
+      __half d_residual = __hdiv(
+        reinterpret_cast<__half const &>(lhs),
+        b_residual_ptr[N - 1]);
+
+      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
+    }
+
+    #else
+    // Fallback when __CUDA_ARCH__ is undefined or below 530: half_t operator/ per element.
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      result[i] = lhs / rhs[i];
+    }
+    #endif
+
+    return result;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Array<half_t, N> operator()(Array<half_t, N> const & lhs, half_t const &rhs) const {
+    Array<half_t, N> result;
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
+    // Packed path: the scalar rhs is expanded to a __half2 via __half2half2; each lhs pair is divided by it.
+    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
+    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
+    __half2 rhs_pair = __half2half2(reinterpret_cast<__half const &>(rhs));
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 2; ++i) {
+      result_ptr[i] = __h2div(lhs_ptr[i], rhs_pair);
+    }
+    // Odd N: element N - 1 is not covered by the pair loop; handle it with scalar __hdiv.
+    if constexpr (N % 2) {
+      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
+
+      __half d_residual = __hdiv(
+        a_residual_ptr[N - 1],
+        reinterpret_cast<__half const &>(rhs));
+
+      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
+    }
+
+    #else
+    // Fallback when __CUDA_ARCH__ is undefined or below 530: half_t operator/ per element.
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      result[i] = lhs[i] / rhs;
+    }
+    #endif
+
+    return result;
+  }
+};
+
+template <int N>
+struct negate<Array<half_t, N>> {
+  CUTLASS_HOST_DEVICE
+  Array<half_t, N> operator()(Array<half_t, N> const & lhs) const {
+    Array<half_t, N> result;
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
+    // Packed path: view both arrays as __half2 pairs, one __hneg2 per pair.
+    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
+    __half2 const *source_ptr = reinterpret_cast<__half2 const *>(&lhs);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 2; ++i) {
+      result_ptr[i] = __hneg2(source_ptr[i]);
+    }
+    // Odd N: element N - 1 is negated with half_t's unary minus, then copied through a __half.
+    if constexpr (N % 2) {
+      half_t x = -lhs[N - 1];
+      __half lhs_val = reinterpret_cast<__half const &>(x);
+      result[N - 1] = reinterpret_cast<half_t const &>(lhs_val);
+    }
+
+    #else
+    // Fallback when __CUDA_ARCH__ is undefined or below 530: half_t unary minus per element.
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      result[i] = -lhs[i];
+    }
+    #endif
+
+    return result;
+  }
+};
+
+/// Fused multiply-add
+template <int N>
+struct multiply_add<Array<half_t, N>, Array<half_t, N>, Array<half_t, N>> {
+
+  CUTLASS_HOST_DEVICE
+  Array<half_t, N> operator()(
+    Array<half_t, N> const &a,
+    Array<half_t, N> const &b,
+    Array<half_t, N> const &c) const {
+
+    Array<half_t, N> result;
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
+    // Packed path: view all four arrays as __half2 pairs, one __hfma2 per pair.
+    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
+    __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a);
+    __half2 const *b_ptr = reinterpret_cast<__half2 const *>(&b);
+    __half2 const *c_ptr = reinterpret_cast<__half2 const *>(&c);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 2; ++i) {
+      result_ptr[i] = __hfma2(a_ptr[i], b_ptr[i], c_ptr[i]);
+    }
+
+    if constexpr (N % 2) {
+      // Odd N: element N - 1 is not covered by the pair loop; use scalar __hfma.
+      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a);
+      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&b);
+      __half const *c_residual_ptr = reinterpret_cast<__half const *>(&c);
+
+      __half d_residual = __hfma(
+        a_residual_ptr[N - 1],
+        b_residual_ptr[N - 1],
+        c_residual_ptr[N - 1]);
+
+      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
+    }
+
+    #else
+    // Fallback when __CUDA_ARCH__ is undefined or below 530: scalar multiply_add<half_t> per element.
+    multiply_add<half_t> op;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      result[i] = op(a[i], b[i], c[i]);
+    }
+    #endif
+
+    return result;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Array<half_t, N> operator()(
+    half_t const &a,
+    Array<half_t, N> const &b,
+    Array<half_t, N> const &c) const {
+
+    Array<half_t, N> result;
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
+    // Packed path: the scalar a is expanded to a __half2 via __half2half2 and used as the first __hfma2 operand.
+    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
+    __half2 a_pair = __half2half2(reinterpret_cast<__half const &>(a));
+    __half2 const *b_ptr = reinterpret_cast<__half2 const *>(&b);
+    __half2 const *c_ptr = reinterpret_cast<__half2 const *>(&c);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 2; ++i) {
+      result_ptr[i] = __hfma2(a_pair, b_ptr[i], c_ptr[i]);
+    }
+
+    if constexpr (N % 2) {
+      // Odd N: element N - 1 is not covered by the pair loop; use scalar __hfma.
+      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&b);
+      __half const *c_residual_ptr = reinterpret_cast<__half const *>(&c);
+      __half d_residual = __hfma(
+        reinterpret_cast<__half const &>(a),
+        b_residual_ptr[N - 1],
+        c_residual_ptr[N - 1]);
+
+      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
+    }
+
+    #else
+    // Fallback when __CUDA_ARCH__ is undefined or below 530: scalar multiply_add<half_t> per element.
+    multiply_add<half_t> op;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      result[i] = op(a, b[i], c[i]);
+    }
+    #endif
+
+    return result;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Array<half_t, N> operator()(
+    Array<half_t, N> const &a,
+    half_t const &b,
+    Array<half_t, N> const &c) const {
+
+    Array<half_t, N> result;
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
+    // Packed path: the scalar b is expanded to a __half2 via __half2half2 and used as the second __hfma2 operand.
+    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
+    __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a);
+    __half2 b_pair = __half2half2(reinterpret_cast<__half const &>(b));
+    __half2 const *c_ptr = reinterpret_cast<__half2 const *>(&c);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 2; ++i) {
+      result_ptr[i] = __hfma2(a_ptr[i], b_pair, c_ptr[i]);
+    }
+
+    if constexpr (N % 2) {
+      // Odd N: element N - 1 is not covered by the pair loop; use scalar __hfma.
+      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a);
+      __half const *c_residual_ptr = reinterpret_cast<__half const *>(&c);
+
+      __half d_residual = __hfma(
+        a_residual_ptr[N - 1],
+        reinterpret_cast<__half const &>(b),
+        c_residual_ptr[N - 1]);
+
+      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
+    }
+
+    #else
+    // Fallback when __CUDA_ARCH__ is undefined or below 530: scalar multiply_add<half_t> per element.
+    multiply_add<half_t> op;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      result[i] = op(a[i], b, c[i]);
+    }
+    #endif
+
+    return result;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Array<half_t, N> operator()(
+    Array<half_t, N> const &a,
+    Array<half_t, N> const &b,
+    half_t const &c) const {
+
+    Array<half_t, N> result;
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
+    // Packed path: the scalar c is expanded to a __half2 via __half2half2 and used as the third __hfma2 operand.
+    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
+    __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a);
+    __half2 const *b_ptr = reinterpret_cast<__half2 const *>(&b);
+    __half2 c_pair = __half2half2(reinterpret_cast<__half const &>(c));
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 2; ++i) {
+      result_ptr[i] = __hfma2(a_ptr[i], b_ptr[i], c_pair);
+    }
+
+    if constexpr (N % 2) {
+      // Odd N: element N - 1 is not covered by the pair loop; use scalar __hfma.
+      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a);
+      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&b);
+
+      __half d_residual = __hfma(
+        a_residual_ptr[N - 1],
+        b_residual_ptr[N - 1],
+        reinterpret_cast<__half const &>(c));
+
+      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
+    }
+
+    #else
+    // Fallback when __CUDA_ARCH__ is undefined or below 530: scalar multiply_add<half_t> per element.
+    multiply_add<half_t> op;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      result[i] = op(a[i], b[i], c);
+    }
+    #endif
+
+    return result;
+  }
+};
+
+/// Fused multiply-add-relu0
+template <int N>
+struct multiply_add_relu0<Array<half_t, N>, Array<half_t, N>, Array<half_t, N>> {
+
+  CUTLASS_HOST_DEVICE
+  Array<half_t, N> operator()(
+    Array<half_t, N> const &a,
+    Array<half_t, N> const &b,
+    Array<half_t, N> const &c) const {
+
+    Array<half_t, N> result;
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+
+    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
+    __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a);
+    __half2 const *b_ptr = reinterpret_cast<__half2 const *>(&b);
+    __half2 const *c_ptr = reinterpret_cast<__half2 const *>(&c);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 2; ++i) {
+      result_ptr[i] = __hfma2_relu(a_ptr[i], b_ptr[i], c_ptr[i]);
+    }
+
+    if constexpr (N % 2) {
+
+      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a);
+      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&b);
+      __half const *c_residual_ptr = reinterpret_cast<__half const *>(&c);
+
+      __half d_residual = __hfma_relu(
+        a_residual_ptr[N - 1],
+        b_residual_ptr[N - 1],
+        c_residual_ptr[N - 1]);
+
+      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
+    }
+
+    #else
+
+    multiply_add<half_t> op;
+    maximum<half_t> mx;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      result[i] = mx(op(a[i], b[i], c[i]), (half_t)0);
+    }
+    #endif
+
+    return result;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Array<half_t, N> operator()(
+    half_t const &a,
+    Array<half_t, N> const &b,
+    Array<half_t, N> const &c) const {
+    // Scalar `a` overload: result[i] = maximum(multiply_add(a, b[i], c[i]), 0) for every element.
+    Array<half_t, N> result;
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+    // Device code for SM80 and newer: each __hfma2_relu call handles two halves; `a` fills both lanes.
+    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
+    __half2 a_pair = __half2half2(reinterpret_cast<__half const &>(a));
+    __half2 const *b_ptr = reinterpret_cast<__half2 const *>(&b);
+    __half2 const *c_ptr = reinterpret_cast<__half2 const *>(&c);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 2; ++i) {
+      result_ptr[i] = __hfma2_relu(a_pair, b_ptr[i], c_ptr[i]);
+    }
+
+    if constexpr (N % 2) {
+      // Odd N leaves one trailing element that the paired loop does not reach.
+      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&b);
+      __half const *c_residual_ptr = reinterpret_cast<__half const *>(&c);
+      __half d_residual = __hfma_relu(
+        reinterpret_cast<__half const &>(a),
+        b_residual_ptr[N - 1],
+        c_residual_ptr[N - 1]);
+
+      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
+    }
+
+    #else
+    // Host code and older architectures: one element at a time through the scalar functors.
+    multiply_add<half_t> op;
+    maximum<half_t> mx;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      result[i] = mx(op(a, b[i], c[i]), half_t(0));
+    }
+    #endif
+
+    return result;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Array<half_t, N> operator()(
+    Array<half_t, N> const &a,
+    half_t const &b,
+    Array<half_t, N> const &c) const {
+    // Scalar `b` overload: result[i] = maximum(multiply_add(a[i], b, c[i]), 0) for every element.
+    Array<half_t, N> result;
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+    // Device code for SM80 and newer: each __hfma2_relu call handles two halves; `b` fills both lanes.
+    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
+    __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a);
+    __half2 b_pair = __half2half2(reinterpret_cast<__half const &>(b));
+    __half2 const *c_ptr = reinterpret_cast<__half2 const *>(&c);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 2; ++i) {
+      result_ptr[i] = __hfma2_relu(a_ptr[i], b_pair, c_ptr[i]);
+    }
+
+    if constexpr (N % 2) {
+      // Odd N leaves one trailing element that the paired loop does not reach.
+      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a);
+      __half const *c_residual_ptr = reinterpret_cast<__half const *>(&c);
+
+      __half d_residual = __hfma_relu(
+        a_residual_ptr[N - 1],
+        reinterpret_cast<__half const &>(b),
+        c_residual_ptr[N - 1]);
+
+      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
+    }
+
+    #else
+    // Host code and older architectures: one element at a time through the scalar functors.
+    multiply_add<half_t> op;
+    maximum<half_t> mx;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      result[i] = mx(op(a[i], b, c[i]), half_t(0));
+    }
+    #endif
+
+    return result;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Array<half_t, N> operator()(
+    Array<half_t, N> const &a,
+    Array<half_t, N> const &b,
+    half_t const &c) const {
+    // Scalar `c` overload: result[i] = maximum(multiply_add(a[i], b[i], c), 0) for every element.
+    Array<half_t, N> result;
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+    // Device code for SM80 and newer: each __hfma2_relu call handles two halves; `c` fills both lanes.
+    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
+    __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a);
+    __half2 const *b_ptr = reinterpret_cast<__half2 const *>(&b);
+    __half2 c_pair = __half2half2(reinterpret_cast<__half const &>(c));
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 2; ++i) {
+      result_ptr[i] = __hfma2_relu(a_ptr[i], b_ptr[i], c_pair);
+    }
+
+    if constexpr (N % 2) {
+      // Odd N leaves one trailing element that the paired loop does not reach.
+      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a);
+      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&b);
+
+      __half d_residual = __hfma_relu(
+        a_residual_ptr[N - 1],
+        b_residual_ptr[N - 1],
+        reinterpret_cast<__half const &>(c));
+
+      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
+    }
+
+    #else
+    // Host code and older architectures: one element at a time through the scalar functors.
+    multiply_add<half_t> op;
+    maximum<half_t> mx;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      result[i] = mx(op(a[i], b[i], c), half_t(0));
+    }
+    #endif
+
+    return result;
+  }
+};
+
+template <int N, bool PropagateNaN>
+struct minimum<Array<half_t, N>, PropagateNaN> {
+  CUTLASS_HOST_DEVICE
+  Array<half_t, N> operator()(Array<half_t, N> const & lhs, Array<half_t, N> const &rhs) const {
+    Array<half_t, N> result;
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+    // Minimum of two arrays. SM80+ device path: two halves per call; PropagateNaN selects the *_nan intrinsics.
+    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
+    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
+    __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 2; ++i) {
+      result_ptr[i] = PropagateNaN ? __hmin2_nan(lhs_ptr[i], rhs_ptr[i])
+                                   : __hmin2(lhs_ptr[i], rhs_ptr[i]);
+    }
+
+    if constexpr (N % 2) {
+      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
+      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
+
+      __half d_residual = PropagateNaN ? __hmin_nan(a_residual_ptr[N - 1], b_residual_ptr[N - 1])
+                                       : __hmin(a_residual_ptr[N - 1], b_residual_ptr[N - 1]);
+
+      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
+    }
+
+    #else
+    // Host code and older architectures: one element at a time through the scalar functor.
+    minimum<half_t,PropagateNaN> mn;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      result[i] = mn(lhs[i],rhs[i]);
+    }
+    #endif
+
+    return result;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Array<half_t, N> operator()(half_t const & lhs, Array<half_t, N> const &rhs) const {
+    Array<half_t, N> result;
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+    // Scalar `lhs` overload: the scalar is duplicated into both lanes of a __half2.
+    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
+    __half2 lhs_pair = __half2half2(reinterpret_cast<__half const &>(lhs));
+    __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 2; ++i) {
+      result_ptr[i] = PropagateNaN ? __hmin2_nan(lhs_pair, rhs_ptr[i])
+                                   : __hmin2(lhs_pair, rhs_ptr[i]);
+    }
+
+    if constexpr (N % 2) {
+      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
+
+      __half d_residual = PropagateNaN ? __hmin_nan(reinterpret_cast<__half const &>(lhs), b_residual_ptr[N - 1])
+                                       : __hmin(reinterpret_cast<__half const &>(lhs), b_residual_ptr[N - 1]);
+
+      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
+    }
+
+    #else
+
+    minimum<half_t,PropagateNaN> mn;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      result[i] = mn(lhs, rhs[i]);
+    }
+    #endif
+
+    return result;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Array<half_t, N> operator()(Array<half_t, N> const & lhs, half_t const &rhs) const {
+    Array<half_t, N> result;
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+    // Scalar `rhs` overload: the scalar is duplicated into both lanes of a __half2.
+    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
+    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
+    __half2 rhs_pair = __half2half2(reinterpret_cast<__half const &>(rhs));
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 2; ++i) {
+      result_ptr[i] = PropagateNaN ? __hmin2_nan(lhs_ptr[i], rhs_pair)
+                                   : __hmin2(lhs_ptr[i], rhs_pair);
+    }
+
+    if constexpr (N % 2) {
+      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
+
+      __half d_residual = PropagateNaN ? __hmin_nan(a_residual_ptr[N - 1], reinterpret_cast<__half const &>(rhs))
+                                       : __hmin(a_residual_ptr[N - 1], reinterpret_cast<__half const &>(rhs));
+
+      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
+    }
+
+    #else
+
+    minimum<half_t, PropagateNaN> mn;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      result[i] = mn(lhs[i], rhs);
+    }
+    #endif
+
+    return result;
+  }
+};
+
+template <int N, bool PropagateNaN>
+struct maximum<Array<half_t, N>, PropagateNaN> {
+  CUTLASS_HOST_DEVICE
+  Array<half_t, N> operator()(Array<half_t, N> const & lhs, Array<half_t, N> const &rhs) const {
+    Array<half_t, N> result;
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+    // Maximum of two arrays. SM80+ device path: two halves per call; PropagateNaN selects the *_nan intrinsics.
+    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
+    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
+    __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 2; ++i) {
+      result_ptr[i] = PropagateNaN ? __hmax2_nan(lhs_ptr[i], rhs_ptr[i])
+                                   : __hmax2(lhs_ptr[i], rhs_ptr[i]);
+    }
+
+    if constexpr (N % 2) {
+      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
+      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
+      // Same selection as the paired loop: the *_nan intrinsic is used only when PropagateNaN is set.
+      __half d_residual = PropagateNaN ? __hmax_nan(a_residual_ptr[N - 1], b_residual_ptr[N - 1])
+                                       : __hmax(a_residual_ptr[N - 1], b_residual_ptr[N - 1]);
+
+      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
+    }
+
+    #else
+    // Host code and older architectures: one element at a time through the scalar functor.
+    maximum<half_t,PropagateNaN> mx;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      result[i] = mx(lhs[i], rhs[i]);
+    }
+    #endif
+
+    return result;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Array<half_t, N> operator()(half_t const & lhs, Array<half_t, N> const &rhs) const {
+    Array<half_t, N> result;
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+    // Scalar `lhs` overload: the scalar is duplicated into both lanes of a __half2.
+    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
+    __half2 lhs_pair = __half2half2(reinterpret_cast<__half const &>(lhs));
+    __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 2; ++i) {
+      result_ptr[i] = PropagateNaN ? __hmax2_nan(lhs_pair, rhs_ptr[i])
+                                   : __hmax2(lhs_pair, rhs_ptr[i]);
+    }
+
+    if constexpr (N % 2) {
+      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
+
+      __half d_residual = PropagateNaN ? __hmax_nan(reinterpret_cast<__half const &>(lhs), b_residual_ptr[N - 1])
+                                       : __hmax(reinterpret_cast<__half const &>(lhs), b_residual_ptr[N - 1]);
+
+      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
+    }
+
+    #else
+
+    maximum<half_t,PropagateNaN> mx;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      result[i] = mx(lhs, rhs[i]);
+    }
+    #endif
+
+    return result;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Array<half_t, N> operator()(Array<half_t, N> const & lhs, half_t const &rhs) const {
+    Array<half_t, N> result;
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+    // Scalar `rhs` overload: the scalar is duplicated into both lanes of a __half2.
+    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
+    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
+    __half2 rhs_pair = __half2half2(reinterpret_cast<__half const &>(rhs));
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 2; ++i) {
+      result_ptr[i] = PropagateNaN ? __hmax2_nan(lhs_ptr[i], rhs_pair)
+                                   : __hmax2(lhs_ptr[i], rhs_pair);
+    }
+
+    if constexpr (N % 2) {
+      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
+
+      __half d_residual = PropagateNaN ? __hmax_nan(a_residual_ptr[N - 1], reinterpret_cast<__half const &>(rhs))
+                                       : __hmax(a_residual_ptr[N - 1], reinterpret_cast<__half const &>(rhs));
+
+      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
+    }
+
+    #else
+
+    maximum<half_t,PropagateNaN> mx;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      result[i] = mx(lhs[i], rhs);
+    }
+    #endif
+
+    return result;
+  }
+};
+
+/// Fused multiply-add for bfloat16_t arrays, with overloads that take one operand as a scalar
+template <int N>
+struct multiply_add<Array<bfloat16_t, N>, Array<bfloat16_t, N>, Array<bfloat16_t, N>> {
+
+  CUTLASS_HOST_DEVICE
+  Array<bfloat16_t, N> operator()(
+    Array<bfloat16_t, N> const &a,
+    Array<bfloat16_t, N> const &b,
+    Array<bfloat16_t, N> const &c) const {
+
+    Array<bfloat16_t, N> result;
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+    // SM80+ device path: view the arrays as 32-bit words so each fma.rn.bf16x2 handles two elements.
+    unsigned *result_ptr = reinterpret_cast<unsigned *>(&result);
+    unsigned const *a_ptr = reinterpret_cast<unsigned const *>(&a);
+    unsigned const *b_ptr = reinterpret_cast<unsigned const *>(&b);
+    unsigned const *c_ptr = reinterpret_cast<unsigned const *>(&c);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 2; ++i) {
+      asm ("fma.rn.bf16x2 %0, %1, %2, %3;\n"
+        : "=r"(result_ptr[i])
+        : "r"(a_ptr[i]), "r"(b_ptr[i]), "r"(c_ptr[i])
+      );
+    }
+
+    if constexpr (N % 2) {
+      // Odd N: finish the last element with the 16-bit scalar form of the instruction.
+      uint16_t *result_ptr = reinterpret_cast<uint16_t *>(&result);
+      uint16_t const *a_residual_ptr = reinterpret_cast<uint16_t const *>(&a);
+      uint16_t const *b_residual_ptr = reinterpret_cast<uint16_t const *>(&b);
+      uint16_t const *c_residual_ptr = reinterpret_cast<uint16_t const *>(&c);
+
+      asm ("fma.rn.bf16 %0, %1, %2, %3;\n"
+        : "=h"(result_ptr[N - 1])
+        : "h"(a_residual_ptr[N - 1]), "h"(b_residual_ptr[N - 1]), "h"(c_residual_ptr[N - 1])
+      );
+    }
+
+    #else
+    // Host code and older architectures: one element at a time through the scalar functor.
+    multiply_add<bfloat16_t> op;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      result[i] = op(a[i], b[i], c[i]);
+    }
+    #endif
+
+    return result;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Array<bfloat16_t, N> operator()(
+    bfloat16_t const &a,
+    Array<bfloat16_t, N> const &b,
+    Array<bfloat16_t, N> const &c) const {
+
+    Array<bfloat16_t, N> result;
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+
+    unsigned *result_ptr = reinterpret_cast<unsigned *>(&result);
+
+    unsigned const *b_ptr = reinterpret_cast<unsigned const *>(&b);
+    unsigned const *c_ptr = reinterpret_cast<unsigned const *>(&c);
+    // Duplicate the scalar's 16 raw bits into both halves of a 32-bit word.
+    unsigned a_packed = static_cast<unsigned>(a.raw());
+    a_packed = (a_packed | (a_packed << 16));
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 2; ++i) {
+      asm ("fma.rn.bf16x2 %0, %1, %2, %3;\n"
+        : "=r"(result_ptr[i])
+        : "r"(a_packed), "r"(b_ptr[i]), "r"(c_ptr[i])
+      );
+    }
+
+    if constexpr (N % 2) {
+      // `a` is a single value, so its bits are read at index 0 rather than N - 1.
+      uint16_t *result_ptr = reinterpret_cast<uint16_t *>(&result);
+      uint16_t const *a_residual_ptr = reinterpret_cast<uint16_t const *>(&a);
+      uint16_t const *b_residual_ptr = reinterpret_cast<uint16_t const *>(&b);
+      uint16_t const *c_residual_ptr = reinterpret_cast<uint16_t const *>(&c);
+
+      asm ("fma.rn.bf16 %0, %1, %2, %3;\n"
+        : "=h"(result_ptr[N - 1])
+        : "h"(a_residual_ptr[0]), "h"(b_residual_ptr[N - 1]), "h"(c_residual_ptr[N - 1])
+      );
+    }
+
+    #else
+
+    multiply_add<bfloat16_t> op;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      result[i] = op(a, b[i], c[i]);
+    }
+    #endif
+
+    return result;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Array<bfloat16_t, N> operator()(
+    Array<bfloat16_t, N> const &a,
+    bfloat16_t const &b,
+    Array<bfloat16_t, N> const &c) const {
+
+    Array<bfloat16_t, N> result;
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+
+    unsigned *result_ptr = reinterpret_cast<unsigned *>(&result);
+
+    unsigned const *a_ptr = reinterpret_cast<unsigned const *>(&a);
+    unsigned const *c_ptr = reinterpret_cast<unsigned const *>(&c);
+    // Duplicate the scalar's 16 raw bits into both halves of a 32-bit word.
+    unsigned b_packed = static_cast<unsigned>(b.raw());
+    b_packed = (b_packed | (b_packed << 16));
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 2; ++i) {
+      asm ("fma.rn.bf16x2 %0, %1, %2, %3;\n"
+        : "=r"(result_ptr[i])
+        : "r"(a_ptr[i]), "r"(b_packed), "r"(c_ptr[i])
+      );
+    }
+
+    if constexpr (N % 2) {
+      // `b` is a single value, so its bits are read at index 0 rather than N - 1.
+      uint16_t *result_ptr = reinterpret_cast<uint16_t *>(&result);
+      uint16_t const *a_residual_ptr = reinterpret_cast<uint16_t const *>(&a);
+      uint16_t const *b_residual_ptr = reinterpret_cast<uint16_t const *>(&b);
+      uint16_t const *c_residual_ptr = reinterpret_cast<uint16_t const *>(&c);
+
+      asm ("fma.rn.bf16 %0, %1, %2, %3;\n"
+        : "=h"(result_ptr[N - 1])
+        : "h"(a_residual_ptr[N - 1]), "h"(b_residual_ptr[0]), "h"(c_residual_ptr[N - 1])
+      );
+    }
+
+    #else
+
+    multiply_add<bfloat16_t> op;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      result[i] = op(a[i], b, c[i]);
+    }
+    #endif
+
+    return result;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Array<bfloat16_t, N> operator()(
+    Array<bfloat16_t, N> const &a,
+    Array<bfloat16_t, N> const &b,
+    bfloat16_t const &c) const {
+
+    Array<bfloat16_t, N> result;
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+
+    unsigned *result_ptr = reinterpret_cast<unsigned *>(&result);
+
+    unsigned const *a_ptr = reinterpret_cast<unsigned const *>(&a);
+    unsigned const *b_ptr = reinterpret_cast<unsigned const *>(&b);
+    // Duplicate the scalar's 16 raw bits into both halves of a 32-bit word.
+    unsigned c_packed = static_cast<unsigned>(c.raw());
+    c_packed = (c_packed | (c_packed << 16));
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 2; ++i) {
+      asm ("fma.rn.bf16x2 %0, %1, %2, %3;\n"
+        : "=r"(result_ptr[i])
+        : "r"(a_ptr[i]), "r"(b_ptr[i]), "r"(c_packed)
+      );
+    }
+
+    if constexpr (N % 2) {
+      // `c` is a single value, so its bits are read at index 0 rather than N - 1.
+      uint16_t *result_ptr = reinterpret_cast<uint16_t *>(&result);
+      uint16_t const *a_residual_ptr = reinterpret_cast<uint16_t const *>(&a);
+      uint16_t const *b_residual_ptr = reinterpret_cast<uint16_t const *>(&b);
+      uint16_t const *c_residual_ptr = reinterpret_cast<uint16_t const *>(&c);
+
+      asm ("fma.rn.bf16 %0, %1, %2, %3;\n"
+        : "=h"(result_ptr[N - 1])
+        : "h"(a_residual_ptr[N - 1]), "h"(b_residual_ptr[N - 1]), "h"(c_residual_ptr[0])
+      );
+    }
+
+    #else
+
+    multiply_add<bfloat16_t> op;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      result[i] = op(a[i], b[i], c);
+    }
+    #endif
+
+    return result;
+  }
+};
+
+
+/// Bitwise AND of two 1-bit arrays, computed on whole storage words
+template <int N>
+struct bit_and<Array<uint1b_t, N>> {
+  CUTLASS_HOST_DEVICE
+  Array<uint1b_t, N> operator()(Array<uint1b_t, N> const &a, Array<uint1b_t, N> const &b) const {
+    using Bits = Array<uint1b_t, N>;
+    using Word = typename Bits::Storage;
+
+    Word const *lhs_words = a.raw_data();
+    Word const *rhs_words = b.raw_data();
+
+    Bits out;
+    Word *out_words = out.raw_data();
+    CUTLASS_PRAGMA_UNROLL
+    for (int w = 0; w < Bits::kStorageElements; ++w) {
+      out_words[w] = (lhs_words[w] & rhs_words[w]);
+    }
+
+    return out;
+  }
+};
+
+
+/// Bitwise OR of two 1-bit arrays, computed on whole storage words
+template <int N>
+struct bit_or<Array<uint1b_t, N>> {
+  CUTLASS_HOST_DEVICE
+  Array<uint1b_t, N> operator()(Array<uint1b_t, N> const &a, Array<uint1b_t, N> const &b) const {
+    using Bits = Array<uint1b_t, N>;
+    using Word = typename Bits::Storage;
+
+    Word const *lhs_words = a.raw_data();
+    Word const *rhs_words = b.raw_data();
+
+    Bits out;
+    Word *out_words = out.raw_data();
+    CUTLASS_PRAGMA_UNROLL
+    for (int w = 0; w < Bits::kStorageElements; ++w) {
+      out_words[w] = (lhs_words[w] | rhs_words[w]);
+    }
+
+    return out;
+  }
+};
+
+
+/// Bitwise complement of a 1-bit array, computed on whole storage words
+template <int N>
+struct bit_not<Array<uint1b_t, N>> {
+  CUTLASS_HOST_DEVICE
+  Array<uint1b_t, N> operator()(Array<uint1b_t, N> const &a) const {
+    using Bits = Array<uint1b_t, N>;
+    using Word = typename Bits::Storage;
+
+    Word const *src_words = a.raw_data();
+
+    Bits out;
+    Word *out_words = out.raw_data();
+    CUTLASS_PRAGMA_UNROLL
+    for (int w = 0; w < Bits::kStorageElements; ++w) {
+      out_words[w] = (~src_words[w]);
+    }
+
+    return out;
+  }
+};
+
+
+/// Bitwise XOR of two 1-bit arrays, computed on whole storage words
+template <int N>
+struct bit_xor<Array<uint1b_t, N>> {
+  CUTLASS_HOST_DEVICE
+  Array<uint1b_t, N> operator()(Array<uint1b_t, N> const &a, Array<uint1b_t, N> const &b) const {
+    using Bits = Array<uint1b_t, N>;
+    using Word = typename Bits::Storage;
+
+    Word const *lhs_words = a.raw_data();
+    Word const *rhs_words = b.raw_data();
+
+    Bits out;
+    Word *out_words = out.raw_data();
+    CUTLASS_PRAGMA_UNROLL
+    for (int w = 0; w < Bits::kStorageElements; ++w) {
+      out_words[w] = (lhs_words[w] ^ rhs_words[w]);
+    }
+
+    return out;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Operator overloads
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename T, int N>
+CUTLASS_HOST_DEVICE
+Array<T, N> operator+(Array<T, N> const &lhs, Array<T, N> const &rhs) {
+  // Array + array: forwards to the plus<Array<T, N>> functor.
+  return plus<Array<T, N>>()(lhs, rhs);
+}
+
+template <typename T, int N>
+CUTLASS_HOST_DEVICE
+Array<T, N> operator+(T const &lhs, Array<T, N> const &rhs) {
+  // Scalar + array: forwards to the plus<Array<T, N>> functor's scalar-lhs overload.
+  return plus<Array<T, N>>()(lhs, rhs);
+}
+
+template <typename T, int N>
+CUTLASS_HOST_DEVICE
+Array<T, N> operator+(Array<T, N> const &lhs, T const &rhs) {
+  // Array + scalar: forwards to the plus<Array<T, N>> functor's scalar-rhs overload.
+  return plus<Array<T, N>>()(lhs, rhs);
+}
+
+template <typename T, int N>
+CUTLASS_HOST_DEVICE
+Array<T, N> operator-(Array<T, N> const &lhs, Array<T, N> const &rhs) {
+  // Array - array: forwards to the minus<Array<T, N>> functor.
+  return minus<Array<T, N>>()(lhs, rhs);
+}
+
+template <typename T, int N>
+CUTLASS_HOST_DEVICE
+Array<T, N> operator-(Array<T, N> const &lhs) {
+  // Unary minus: forwards to the negate<Array<T, N>> functor.
+  return negate<Array<T, N>>()(lhs);
+}
+
+template <typename T, int N>
+CUTLASS_HOST_DEVICE
+Array<T, N> operator*(Array<T, N> const &lhs, Array<T, N> const &rhs) {
+  // Array * array: forwards to the multiplies<Array<T, N>> functor.
+  return multiplies<Array<T, N>>()(lhs, rhs);
+}
+
+template <typename T, int N>
+CUTLASS_HOST_DEVICE
+Array<T, N> operator*(T lhs, Array<T, N> const &rhs) {
+  // Scalar * array: forwards to the multiplies<Array<T, N>> functor's scalar-lhs overload.
+  return multiplies<Array<T, N>>()(lhs, rhs);
+}
+
+template <typename T, int N>
+CUTLASS_HOST_DEVICE
+Array<T, N> operator*(Array<T, N> const &lhs, T rhs) {
+  // Array * scalar: forwards to the multiplies<Array<T, N>> functor's scalar-rhs overload.
+  return multiplies<Array<T, N>>()(lhs, rhs);
+}
+
+template <typename T, int N>
+CUTLASS_HOST_DEVICE
+Array<T, N> operator/(Array<T, N> const &lhs, Array<T, N> const &rhs) {
+  // Array / array: forwards to the divides<Array<T, N>> functor.
+  return divides<Array<T, N>>()(lhs, rhs);
+}
+
+template <typename T, int N>
+CUTLASS_HOST_DEVICE
+Array<T, N> fma(Array<T, N> const &a, Array<T, N> const &b, Array<T, N> const &c) {
+  // All-array form: forwards to the multiply_add<Array<T, N>> functor.
+  return multiply_add<Array<T, N>>()(a, b, c);
+}
+
+template <typename T, int N>
+CUTLASS_HOST_DEVICE
+Array<T, N> fma(T a, Array<T, N> const &b, Array<T, N> const &c) {
+  // Scalar `a` form: forwards to the matching multiply_add<Array<T, N>> overload.
+  return multiply_add<Array<T, N>>()(a, b, c);
+}
+
+template <typename T, int N>
+CUTLASS_HOST_DEVICE
+Array<T, N> fma(Array<T, N> const &a, T b, Array<T, N> const &c) {
+  // Scalar `b` form: forwards to the matching multiply_add<Array<T, N>> overload.
+  return multiply_add<Array<T, N>>()(a, b, c);
+}
+
+template <typename T, int N>
+CUTLASS_HOST_DEVICE
+Array<T, N> fma(Array<T, N> const &a, Array<T, N> const &b, T c) {
+  // Scalar `c` form: forwards to the matching multiply_add<Array<T, N>> overload.
+  return multiply_add<Array<T, N>>()(a, b, c);
+}
+
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "cutlass/array_subbyte.h"
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// AlignedArray
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Array<T, N> with an explicit alignment requirement applied through alignas
+template <
+  /// Element type
+  typename T,
+  /// Number of elements in the array
+  int N,
+  /// Alignment requirement in bytes (default: N elements' total size in bits, rounded up to whole bytes)
+  int Alignment = ( sizeof_bits<T>::value * N + 7 ) / 8
+>
+class alignas(Alignment) AlignedArray: public Array<T, N> {
+public:
+  // No additional members; the interface is inherited from Array<T, N>.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/array_planar_complex.h b/lightllm-kernel/cutlass/include/cutlass/array_planar_complex.h
new file mode 100755
index 000000000..2dd8aa84e
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/array_planar_complex.h
@@ -0,0 +1,89 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Array type holding planar complex elements (separate real and imaginary fragments).
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Array of N complex values in planar form: one array of real parts and one of imaginary parts
+template <typename Element_, int N>
+struct ArrayPlanarComplex {
+
+  /// Underlying real element
+  using Element = Element_;
+
+  /// Number of logical elements
+  static constexpr size_t kElements = N;
+
+  /// Underlying fragment of real-valued elements
+  using ArrayReal = cutlass::Array<Element, N>;
+
+public:
+  /// Fragment of real-valued elements representing the real part
+  ArrayReal real;
+
+  /// Fragment of real-valued elements representing the imaginary part
+  ArrayReal imag;
+
+public:
+  /// Clears both parts by calling clear() on the real and imaginary fragments
+  CUTLASS_HOST_DEVICE
+  void clear() {
+    real.clear();
+    imag.clear();
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Builds an ArrayPlanarComplex from real and imaginary arrays, deducing Element and N from them
+template <typename Element, int N>
+CUTLASS_HOST_DEVICE
+ArrayPlanarComplex<Element, N> make_ArrayPlanarComplex(
+    Array<Element, N> const &real, Array<Element, N> const &imag) {
+  return {real, imag};
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/array_subbyte.h b/lightllm-kernel/cutlass/include/cutlass/array_subbyte.h
new file mode 100755
index 000000000..eb77a9310
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/array_subbyte.h
@@ -0,0 +1,559 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Statically sized array of elements that accommodates all CUTLASS-supported numeric types
+           and is safe to use in a union.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/platform/platform.h"
+
+namespace cutlass {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Array specialization (third template argument `false`) packing sub-byte elements into Storage words
+template <
+  typename T,
+  int N
+>
+struct Array<T, N, false> {
+  static constexpr int kSizeBits = sizeof_bits<T>::value * N;
+
+  /// Storage type: uint32_t if kSizeBits is a multiple of 32, else uint16_t if a multiple of 16, else uint8_t
+  using Storage = typename platform::conditional<
+    ((kSizeBits % 32) != 0),
+    typename platform::conditional<
+      ((kSizeBits % 16) != 0),
+      uint8_t,
+      uint16_t
+    >::type,
+    uint32_t
+  >::type;
+
+  /// Element type
+  using Element = T;
+
+  /// Number of logical elements per stored object
+  static constexpr int kElementsPerStoredItem = int(sizeof(Storage) * 8) / sizeof_bits<T>::value;
+
+  /// Number of storage elements (rounded up, so the last one may be only partly used)
+  static constexpr size_t kStorageElements = (N + kElementsPerStoredItem - 1) / kElementsPerStoredItem;
+
+  /// Number of logical elements
+  static constexpr size_t kElements = N;
+
+  /// Bitmask for covering one item
+  static constexpr Storage kMask = ((Storage(1) << sizeof_bits<T>::value) - 1);
+
+  //
+  // C++ standard members with pointer types removed
+  //
+
+  typedef T value_type;
+  typedef size_t size_type;
+  typedef ptrdiff_t difference_type;
+  typedef value_type *pointer;
+  typedef value_type const *const_pointer;
+
+  //
+  // References
+  //
+
+  /// Reference object inserts or extracts sub-byte items
+  class reference {
+    /// Pointer to storage element
+    Storage *ptr_{nullptr};
+
+    /// Index into elements packed into Storage object
+    int idx_{0};
+
+  public:
+
+    reference() = default;
+
+    /// Ctor
+    CUTLASS_HOST_DEVICE
+    reference(Storage *ptr, int idx = 0): ptr_(ptr), idx_(idx) { }
+
+    /// Assignment
+    CUTLASS_HOST_DEVICE
+    reference &operator=(T x) {
+    // `*ptr_ & kUpdateMask` reads *ptr_ before writing to it.
+    // This means a code pattern like
+    //
+    // ```cpp
+    // Array<half_t, N> result;
+    // result[0] = xxx;
+    // ```
+    //
+    // leads to a compiler warning about use of an uninitialized member variable, although we know
+    //      this read of the uninitialized member variable is harmless.
+
+#if defined(__clang__)
+#  pragma clang diagnostic push
+#  pragma clang diagnostic ignored "-Wuninitialized"
+#elif defined(__GNUC__)
+#  pragma GCC diagnostic push
+#  pragma GCC diagnostic ignored "-Wuninitialized"
+#  pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+
+      Storage item = (reinterpret_cast<Storage const &>(x) & kMask);
+
+      Storage kUpdateMask = Storage(~(kMask << (idx_ * sizeof_bits<T>::value)));
+
+      *ptr_ = Storage(((*ptr_ & kUpdateMask) | (item << idx_ * sizeof_bits<T>::value)));
+
+#if defined(__clang__)
+#  pragma clang diagnostic pop
+#elif defined(__GNUC__)
+#  pragma GCC diagnostic pop
+#endif
+
+      return *this;
+    }
+
+    CUTLASS_HOST_DEVICE
+    T get() const {
+      Storage item = Storage((*ptr_ >> (idx_ * sizeof_bits<T>::value)) & kMask);
+      return reinterpret_cast<T const &>(item);
+    }
+
+    /// Extract
+    CUTLASS_HOST_DEVICE
+    operator T() const {
+      return get();
+    }
+
+    /// Explicit cast to int
+    CUTLASS_HOST_DEVICE
+    explicit operator int() const {
+      return int(get());
+    }
+
+    /// Explicit cast to float
+    CUTLASS_HOST_DEVICE
+    explicit operator float() const {
+      return float(get());
+    }
+  };
+
+  /// Reference object extracts sub-byte items
+  class const_reference {
+
+    /// Pointer to storage element
+    Storage const *ptr_{nullptr};
+
+    /// Index into elements packed into Storage object
+    int idx_{0};
+
+  public:
+
+    const_reference() = default;
+
+    /// Ctor
+    CUTLASS_HOST_DEVICE
+    const_reference(Storage const *ptr, int idx = 0): ptr_(ptr), idx_(idx) { }
+
+    CUTLASS_HOST_DEVICE
+    const T get() const {
+      Storage item = (*ptr_ >> (idx_ * sizeof_bits<T>::value)) & kMask;
+      return reinterpret_cast<T const &>(item);
+    }
+
+    /// Extract
+    CUTLASS_HOST_DEVICE
+    operator T() const {
+      Storage item = Storage(Storage(*ptr_ >> Storage(idx_ * sizeof_bits<T>::value)) & kMask);
+      return reinterpret_cast<T const &>(item);
+    }
+
+    /// Explicit cast to int
+    CUTLASS_HOST_DEVICE
+    explicit operator int() const {
+      return int(get());
+    }
+
+    /// Explicit cast to float
+    CUTLASS_HOST_DEVICE
+    explicit operator float() const {
+      return float(get());
+    }
+  };
+
+  //
+  // Iterators
+  //
+
+  /// Bidirectional iterator over elements
+  class iterator {
+
+    /// Pointer to storage element
+    Storage *ptr_{nullptr};
+
+    /// Index into elements packed into Storage object
+    int idx_{0};
+
+  public:
+
+    iterator() = default;
+
+    CUTLASS_HOST_DEVICE
+    iterator(Storage *ptr, int idx = 0): ptr_(ptr), idx_(idx) { }
+
+    CUTLASS_HOST_DEVICE
+    iterator &operator++() {
+      ++idx_;
+      if (idx_ == kElementsPerStoredItem) {
+        ++ptr_;
+        idx_ = 0;
+      }
+      return *this;
+    }
+
+    CUTLASS_HOST_DEVICE
+    iterator &operator--() {
+      if (!idx_) {
+        --ptr_;
+        idx_ = kElementsPerStoredItem - 1;
+      }
+      else {
+        --idx_;
+      }
+      return *this;
+    }
+
+    CUTLASS_HOST_DEVICE
+    iterator operator++(int) {
+      iterator ret(*this);
+      ++idx_;
+      if (idx_ == kElementsPerStoredItem) {
+        ++ptr_;
+        idx_ = 0;
+      }
+      return ret;
+    }
+
+    CUTLASS_HOST_DEVICE
+    iterator operator--(int) {
+      iterator ret(*this);
+      if (!idx_) {
+        --ptr_;
+        idx_ = kElementsPerStoredItem - 1;
+      }
+      else {
+        --idx_;
+      }
+      return ret;
+    }
+
+    CUTLASS_HOST_DEVICE
+    reference operator*() const {
+      return reference(ptr_, idx_);
+    }
+
+    CUTLASS_HOST_DEVICE
+    bool operator==(iterator const &other) const {
+      return ptr_ == other.ptr_ && idx_ == other.idx_;
+    }
+
+    CUTLASS_HOST_DEVICE
+    bool operator!=(iterator const &other) const {
+      return !(*this == other);
+    }
+  };
+
+  /// Bidirectional constant iterator over elements (all operators work on const_iterator itself)
+  class const_iterator {
+
+    /// Pointer to storage element
+    Storage const *ptr_{nullptr};
+
+    /// Index into elements packed into Storage object
+    int idx_{0};
+
+  public:
+
+    const_iterator() = default;
+
+    CUTLASS_HOST_DEVICE
+    const_iterator(Storage const *ptr, int idx = 0): ptr_(ptr), idx_(idx) { }
+
+    CUTLASS_HOST_DEVICE
+    const_iterator &operator++() {
+      ++idx_;
+      if (idx_ == kElementsPerStoredItem) {
+        ++ptr_;
+        idx_ = 0;
+      }
+      return *this;
+    }
+
+    CUTLASS_HOST_DEVICE
+    const_iterator &operator--() {
+      if (!idx_) {
+        --ptr_;
+        idx_ = kElementsPerStoredItem - 1;
+      }
+      else {
+        --idx_;
+      }
+      return *this;
+    }
+
+    CUTLASS_HOST_DEVICE
+    const_iterator operator++(int) {
+      const_iterator ret(*this);
+      ++idx_;
+      if (idx_ == kElementsPerStoredItem) {
+        ++ptr_;
+        idx_ = 0;
+      }
+      return ret;
+    }
+
+    CUTLASS_HOST_DEVICE
+    const_iterator operator--(int) {
+      const_iterator ret(*this);
+      if (!idx_) {
+        --ptr_;
+        idx_ = kElementsPerStoredItem - 1;
+      }
+      else {
+        --idx_;
+      }
+      return ret;
+    }
+
+    CUTLASS_HOST_DEVICE
+    const_reference operator*() const {
+      return const_reference(ptr_, idx_);
+    }
+
+    CUTLASS_HOST_DEVICE
+    bool operator==(const_iterator const &other) const {
+      return ptr_ == other.ptr_ && idx_ == other.idx_;
+    }
+
+    CUTLASS_HOST_DEVICE
+    bool operator!=(const_iterator const &other) const {
+      return !(*this == other);
+    }
+  };
+
+  /// Reverse iterator placeholder: stores a position only, no traversal operators are defined
+  class reverse_iterator {
+
+    /// Pointer to storage element
+    Storage *ptr_{nullptr};
+
+    /// Index into elements packed into Storage object
+    int idx_{0};
+
+  public:
+
+    reverse_iterator() = default;
+
+    CUTLASS_HOST_DEVICE
+    reverse_iterator(Storage *ptr, int idx = 0): ptr_(ptr), idx_(idx) { }
+  };
+
+  /// Constant reverse iterator placeholder: stores a position only, no traversal operators are defined
+  class const_reverse_iterator {
+
+    /// Pointer to storage element
+    Storage const *ptr_{nullptr};
+
+    /// Index into elements packed into Storage object
+    int idx_{0};
+
+  public:
+
+    const_reverse_iterator() = default;
+
+    CUTLASS_HOST_DEVICE
+    const_reverse_iterator(Storage const *ptr, int idx = 0): ptr_(ptr), idx_(idx) { }
+  };
+
+  /// Efficient clear method
+  CUTLASS_HOST_DEVICE
+  void clear() {
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < int(kStorageElements); ++i) {
+      storage[i] = Storage(0);
+    }
+  }
+
+  CUTLASS_HOST_DEVICE
+  reference at(size_type pos) {
+    return reference(storage + pos / kElementsPerStoredItem, pos % kElementsPerStoredItem);
+  }
+
+  CUTLASS_HOST_DEVICE
+  const_reference at(size_type pos) const {
+    return const_reference(storage + pos / kElementsPerStoredItem, pos % kElementsPerStoredItem);
+  }
+
+  CUTLASS_HOST_DEVICE
+  reference operator[](size_type pos) {
+    return at(pos);
+  }
+
+  CUTLASS_HOST_DEVICE
+  const_reference operator[](size_type pos) const {
+    return at(pos);
+  }
+
+  CUTLASS_HOST_DEVICE
+  reference front() {
+    return at(0);
+  }
+
+  CUTLASS_HOST_DEVICE
+  const_reference front() const {
+    return at(0);
+  }
+
+  CUTLASS_HOST_DEVICE
+  reference back() {
+    return at(kElements - 1);  // last logical element, even if the final Storage word is partly used
+  }
+
+  CUTLASS_HOST_DEVICE
+  const_reference back() const {
+    return at(kElements - 1);  // last logical element, even if the final Storage word is partly used
+  }
+
+  CUTLASS_HOST_DEVICE
+  pointer data() {
+    return reinterpret_cast<pointer>(storage);
+  }
+
+  CUTLASS_HOST_DEVICE
+  const_pointer data() const {
+    return reinterpret_cast<const_pointer>(storage);
+  }
+
+  CUTLASS_HOST_DEVICE
+  Storage * raw_data() {
+    return storage;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Storage const * raw_data() const {
+    return storage;
+  }
+
+  CUTLASS_HOST_DEVICE
+  constexpr bool empty() const {
+    return !kElements;
+  }
+
+  CUTLASS_HOST_DEVICE
+  constexpr size_type size() const {
+    return kElements;
+  }
+
+  CUTLASS_HOST_DEVICE
+  constexpr size_type max_size() const {
+    return kElements;
+  }
+
+  CUTLASS_HOST_DEVICE
+  void fill(T const &value) {
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kElementsPerStoredItem; ++i) {
+      reference ref(storage, i);
+      ref = value;
+    }
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 1; i < int(kStorageElements); ++i) {
+      storage[i] = storage[0];
+    }
+  }
+
+  CUTLASS_HOST_DEVICE
+  iterator begin() {
+    return iterator(storage);
+  }
+
+  CUTLASS_HOST_DEVICE
+  const_iterator cbegin() const {
+    return const_iterator(storage);
+  }
+
+  CUTLASS_HOST_DEVICE
+  iterator end() {
+    return iterator(storage + N / kElementsPerStoredItem, N % kElementsPerStoredItem);  // one past element N - 1
+  }
+
+  CUTLASS_HOST_DEVICE
+  const_iterator cend() const {
+    return const_iterator(storage + N / kElementsPerStoredItem, N % kElementsPerStoredItem);  // one past element N - 1
+  }
+
+  CUTLASS_HOST_DEVICE
+  reverse_iterator rbegin() {
+    return reverse_iterator(storage + kStorageElements);
+  }
+
+  CUTLASS_HOST_DEVICE
+  const_reverse_iterator crbegin() const {
+    return const_reverse_iterator(storage + kStorageElements);
+  }
+
+  CUTLASS_HOST_DEVICE
+  reverse_iterator rend() {
+    return reverse_iterator(storage);
+  }
+
+  CUTLASS_HOST_DEVICE
+  const_reverse_iterator crend() const {
+    return const_reverse_iterator(storage);
+  }
+
+private:
+  /// Internal storage
+  Storage storage[kStorageElements];
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/barrier.h b/lightllm-kernel/cutlass/include/cutlass/barrier.h
new file mode 100755
index 000000000..6f2373b6d
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/barrier.h
@@ -0,0 +1,377 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Implementation of a CTA-wide barrier for inter-CTA synchronization.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/arch/barrier.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+
+namespace detail {
+
+// Utilities for abstracting synchronization methods for barriers.
+// Each helper exposes a static sync() that performs one kind of thread synchronization.
+
+/// sync() calls __syncthreads()
+struct SyncthreadsSync {
+  CUTLASS_DEVICE
+  static void sync() {
+    __syncthreads();
+  }
+};
+
+struct SyncwarpSync {
+  CUTLASS_DEVICE
+  static void sync() {
+    __syncwarp();
+  }
+};
+
+template <
+  int ThreadCount,
+  int BarrierId
+>
+struct NamedBarrierSync {
+  CUTLASS_DEVICE
+  static void sync() {
+    cutlass::arch::NamedBarrier::sync(ThreadCount, static_cast<arch::ReservedNamedBarriers>(BarrierId));
+  }
+};
+
+} // namespace detail
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Group or CTA-wide semaphore for inter-CTA synchronization; local threads are synchronized via Sync::sync().
+template <class Sync>
+struct GenericBarrier {
+
+public:
+
+  /// Flag type
+  using T = int;
+
+  /// Initial flag value
+  static const T INIT = 0;
+
+
+protected:
+
+  /// Load flag, as a strong acquire operation (int specialization)
+  CUTLASS_DEVICE
+  static int ld_acquire(int *ptr)
+  {
+    int state = 0;
+
+#if (__CUDA_ARCH__ >= 700)
+    /// SM70 and newer use memory consistency qualifiers
+
+    // Acquire pattern using acquire modifier
+    asm volatile ("ld.global.acquire.gpu.b32 %0, [%1];\n" : "=r"(state) : "l"(ptr));
+
+#else
+    asm volatile ("ld.cg.global.b32 %0, [%1];\n" : "=r"(state) : "l"(ptr));
+#endif // (__CUDA_ARCH__ >= 700)
+
+    return state;
+  }
+
+
+  /// Reduce into flag, with release pattern (int specialization)
+  CUTLASS_DEVICE
+  static void red_release(int *ptr, int val)
+  {
+#if (__CUDA_ARCH__ >= 700)
+    /// SM70 and newer use memory consistency qualifiers
+
+    // Release pattern using acq_rel fence + relaxed modifier.  (The fence also releases data
+    // that was weakly-written by other threads prior to the last syncthreads)
+    asm volatile ("fence.acq_rel.gpu;\n");
+    asm volatile ("red.relaxed.gpu.global.add.s32 [%0], %1;\n" : : "l"(ptr), "r"(val));
+
+#else
+    __threadfence();
+    atomicAdd(ptr, val);
+#endif // (__CUDA_ARCH__ >= 700)
+  }
+
+
+public:
+
+  /// Uses thread[0] to spin until flag `flag_idx` is at least `count`, then synchronizes via Sync::sync()
+  CUTLASS_DEVICE
+  static void wait_lt(void *lock_ptr, int thread_idx, int flag_idx, int count)
+  {
+    T *flag_ptr = reinterpret_cast<T*>(lock_ptr) + flag_idx;
+
+    if (thread_idx == 0)
+    {
+        // Spin-loop
+        #pragma unroll 1
+        while(ld_acquire(flag_ptr) < count) {}
+    }
+
+    Sync::sync();
+  }
+
+  /// Uses thread[0] to spin until flag `flag_idx` is exactly equal to `val`, then synchronizes via Sync::sync()
+  CUTLASS_DEVICE
+  static void wait_eq(void *lock_ptr, int thread_idx, int flag_idx, T val = 1)
+  {
+    T *flag_ptr = reinterpret_cast<T*>(lock_ptr) + flag_idx;
+
+    if (thread_idx == 0)
+    {
+        // Spin-loop
+        #pragma unroll 1
+        while(ld_acquire(flag_ptr) != val) {}
+    }
+    Sync::sync();
+  }
+
+  /// Uses thread[0] to spin until flag `flag_idx` equals `val`, atomically resetting it to 0 when it does
+  CUTLASS_DEVICE
+  static void wait_eq_reset(void *lock_ptr, int thread_idx, int flag_idx, T val = 1) {
+    T *flag_ptr = reinterpret_cast<T*>(lock_ptr) + flag_idx;
+
+    if (thread_idx == 0)
+    {
+        // Spin-loop
+        #pragma unroll 1
+        while(atomicCAS(flag_ptr, val, 0) != val) {}
+    }
+
+    Sync::sync();
+  }
+
+  /// Synchronizes via Sync::sync(), then thread[0] adds `val` to flag `flag_idx`
+  CUTLASS_DEVICE
+  static void arrive_inc(void *lock_ptr, int thread_idx, int flag_idx, int val = 1)
+  {
+    T* flag_ptr = reinterpret_cast<T*>(lock_ptr) + flag_idx;
+
+    Sync::sync();
+
+    if (thread_idx == 0)
+    {
+      red_release(flag_ptr, val);
+    }
+  }
+
+
+  /// Each thread with thread_idx < count adds `val` to flag `first_flag_idx + thread_idx`
+  CUTLASS_DEVICE
+  static void arrive_range_inc(void *lock_ptr, int thread_idx, int first_flag_idx, int count = 1, int val = 1)
+  {
+    int flag_idx = first_flag_idx + thread_idx;
+    T* flag_ptr = reinterpret_cast<T*>(lock_ptr) + flag_idx;
+
+    // Barrier to make sure all other threads in group have written their data
+    Sync::sync();
+
+    // Select threads increment their flags
+    if (thread_idx < count) {
+      red_release(flag_ptr, val);
+    }
+  }
+};
+
+using Barrier = GenericBarrier<detail::SyncthreadsSync>;  // its Sync::sync() is __syncthreads()
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/** Manages a family of NamedBarriers used by different warp groups: a runtime index is dispatched
+ * onto the barrier type whose ID is the matching compile-time constant.
+ *
+ * @param ThreadCount_ Number of threads that will wait on a NamedBarrier with a given ID
+ * @param Offset Value added to the user-supplied index to obtain the NamedBarrier ID that is called into
+ * @param MaxNumNamedBarriers Upper bound on the number of distinct barrier indices requested on this type
+**/
+template <
+  uint32_t ThreadCount_,
+  uint32_t Offset = 0,
+  uint32_t MaxNumNamedBarriers = 16
+>
+struct NamedBarrierManager {
+
+  static_assert(MaxNumNamedBarriers <= arch::NamedBarrier::HardwareMaxNumNamedBarriers);
+  static_assert(MaxNumNamedBarriers + Offset <= arch::NamedBarrier::HardwareMaxNumNamedBarriers, "Barrier IDs cannot exceed 15");
+
+  // Size of the thread group that synchronizes on each barrier
+  static constexpr uint32_t ThreadCount = ThreadCount_;
+
+  template <uint32_t BarrierId>
+  using BarrierSync = cutlass::GenericBarrier<cutlass::detail::NamedBarrierSync<ThreadCount, BarrierId>>;
+
+  // Flag type shared by every barrier. It is independent of BarrierId,
+  // so instantiating with ID 0 is enough to name it.
+  using T = typename BarrierSync<0>::T;
+
+  using IntegerSequence = cute::make_integer_sequence<uint32_t, MaxNumNamedBarriers>;
+
+  CUTLASS_DEVICE
+  static void
+  wait_lt(uint32_t idx, void *lock_ptr, int thread_idx, int flag_idx, int count) {
+    dispatch_wait_lt(idx, lock_ptr, thread_idx, flag_idx, count, IntegerSequence{});
+  }
+
+  CUTLASS_DEVICE
+  static void
+  wait_eq(uint32_t idx, void *lock_ptr, int thread_idx, int flag_idx, T val = 1) {
+    dispatch_wait_eq<false>(idx, lock_ptr, thread_idx, flag_idx, val, IntegerSequence{});
+  }
+
+  CUTLASS_DEVICE
+  static void
+  wait_eq_reset(uint32_t idx, void *lock_ptr, int thread_idx, int flag_idx, T val = 1) {
+    dispatch_wait_eq<true>(idx, lock_ptr, thread_idx, flag_idx, val, IntegerSequence{});
+  }
+
+  CUTLASS_DEVICE
+  static void
+  arrive_inc(uint32_t idx, void *lock_ptr, int thread_idx, int flag_idx, int val = 1) {
+    dispatch_arrive_inc(idx, lock_ptr, thread_idx, flag_idx, val, IntegerSequence{});
+  }
+
+  CUTLASS_DEVICE
+  static void
+  arrive_range_inc(uint32_t idx, void *lock_ptr, int thread_idx, int first_flag_idx, int count = 1, int val = 1) {
+    dispatch_arrive_range_inc(idx, lock_ptr, thread_idx, first_flag_idx, count, val, IntegerSequence{});
+  }
+
+private:
+  CUTLASS_DEVICE
+  static void
+  check_barrier_in_range([[maybe_unused]] uint32_t idx) {
+    assert((idx < MaxNumNamedBarriers) && "Index exceeds barrier count");
+  }
+
+  template <uint32_t... Ids>
+  CUTLASS_DEVICE
+  static void
+  dispatch_wait_lt(uint32_t idx, void *lock_ptr, int thread_idx, int flag_idx, int count, cute::integer_sequence<uint32_t, Ids...>) {
+    check_barrier_in_range(idx);
+    (... || (Ids == idx && (BarrierSync<Ids + Offset>::wait_lt(lock_ptr, thread_idx, flag_idx, count), true)));
+  }
+
+  template <bool Reset, uint32_t... Ids>
+  CUTLASS_DEVICE
+  static void
+  dispatch_wait_eq(uint32_t idx, void *lock_ptr, int thread_idx, int flag_idx, T val, cute::integer_sequence<uint32_t, Ids...>) {
+    check_barrier_in_range(idx);
+    if constexpr (!Reset) {
+      (... || (Ids == idx && (BarrierSync<Ids + Offset>::wait_eq(lock_ptr, thread_idx, flag_idx, val), true)));
+    }
+    else {
+      (... || (Ids == idx && (BarrierSync<Ids + Offset>::wait_eq_reset(lock_ptr, thread_idx, flag_idx, val), true)));
+    }
+  }
+
+  template <uint32_t... Ids>
+  CUTLASS_DEVICE
+  static void
+  dispatch_arrive_inc(uint32_t idx, void *lock_ptr, int thread_idx, int flag_idx, int val, cute::integer_sequence<uint32_t, Ids...>) {
+    check_barrier_in_range(idx);
+    (... || (Ids == idx && (BarrierSync<Ids + Offset>::arrive_inc(lock_ptr, thread_idx, flag_idx, val), true)));
+  }
+
+  template <uint32_t... Ids>
+  CUTLASS_DEVICE
+  static void
+  dispatch_arrive_range_inc(uint32_t idx, void *lock_ptr, int thread_idx, int first_flag_idx, int count, int val, cute::integer_sequence<uint32_t, Ids...>) {
+    check_barrier_in_range(idx);
+    (... || (Ids == idx && (BarrierSync<Ids + Offset>::arrive_range_inc(lock_ptr, thread_idx, first_flag_idx, count, val), true)));
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/** Synchronizes via contiguous barriers (e.g., __syncwarp, __syncthreads) through an API mirroring
+ *  NamedBarrierManager. The leading uint32_t barrier-index argument of every method is ignored.
+ *
+ * @param Synchronizer Synchronization helper exposing a `sync()` method to perform synchronization
+**/
+template <
+  class Synchronizer,
+  uint32_t ThreadCount_
+>
+struct SyncManager {
+
+  // Number of threads participating in the barrier
+  static constexpr uint32_t ThreadCount = ThreadCount_;
+
+  using BarrierSync = cutlass::GenericBarrier<Synchronizer>;
+
+  // Underlying type used by all barriers for synchronization.
+  using T = typename BarrierSync::T;
+
+  CUTLASS_DEVICE
+  static
+  void wait_lt(uint32_t, void *lock_ptr, int thread_idx, int flag_idx, int count) {
+    BarrierSync::wait_lt(lock_ptr, thread_idx, flag_idx, count);
+  }
+
+  CUTLASS_DEVICE
+  static void
+  wait_eq(uint32_t, void *lock_ptr, int thread_idx, int flag_idx, T val = 1) {
+    BarrierSync::wait_eq(lock_ptr, thread_idx, flag_idx, val);
+  }
+
+  CUTLASS_DEVICE
+  static void
+  wait_eq_reset(uint32_t, void *lock_ptr, int thread_idx, int flag_idx, T val = 1) {
+    BarrierSync::wait_eq_reset(lock_ptr, thread_idx, flag_idx, val);
+  }
+
+  CUTLASS_DEVICE
+  static void
+  arrive_inc(uint32_t, void *lock_ptr, int thread_idx, int flag_idx, int val = 1) {
+    BarrierSync::arrive_inc(lock_ptr, thread_idx, flag_idx, val);
+  }
+
+  CUTLASS_DEVICE
+  static void
+  arrive_range_inc(uint32_t, void *lock_ptr, int thread_idx, int first_flag_idx, int count = 1, int val = 1) {
+    BarrierSync::arrive_range_inc(lock_ptr, thread_idx, first_flag_idx, count, val);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/bfloat16.h b/lightllm-kernel/cutlass/include/cutlass/bfloat16.h
new file mode 100755
index 000000000..5af6d3ab8
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/bfloat16.h
@@ -0,0 +1,679 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*!
+    \file
+    \brief Defines a proxy class for storing non-standard 16-bit floating point values with
+          8 bits of exponent and 7 bits of mantissa.
+*/
+
+#pragma once
+
+#if defined(__CUDACC_RTC__)
+#include "cutlass/floating_point_nvrtc.h"
+#else
+#include <cmath>
+#include <limits>
+#include <cstdint>
+#include <cstring>
+#endif
+
+#include <cuda_bf16.h>
+#include "cutlass/cutlass.h"
+#include "cutlass/platform/platform.h"
+
+namespace cutlass {
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Floating-point type with 8 bits of exponent and 7 bits of mantissa.
+struct alignas(2) bfloat16_t {
+
+  //
+  // Data members
+  //
+
+  /// Raw 16-bit storage holding the bfloat16 bit pattern
+  uint16_t storage;
+
+  //
+  // Methods
+  //
+
+  /// Constructs from a raw 16-bit pattern; the bits are stored as-is, with no numeric conversion
+  CUTLASS_HOST_DEVICE
+  static bfloat16_t bitcast(uint16_t x) {
+    bfloat16_t h;
+    h.storage = x;
+    return h;
+  }
+
+private:
+  struct from_32_bit_integer_t {};  // tag type selecting the 32-bit integer constructor below
+  static constexpr from_32_bit_integer_t from_32_bit_integer{};
+
+  template<class T>
+  CUTLASS_HOST_DEVICE
+  explicit bfloat16_t(from_32_bit_integer_t, T x) {
+    static_assert(cutlass::platform::is_integral<T>::value && sizeof(T) == 4, "Requires 32-bit integer");
+    // Convert to float, then keep only the upper 16 bits; the discarded low bits are not rounded.
+    float flt = static_cast<float>(x);
+    uint32_t bits;
+
+    #if defined(__CUDA_ARCH__)
+    bits = reinterpret_cast<uint32_t &>(flt);
+    #else
+    std::memcpy(&bits, &flt, sizeof(bits));
+    #endif
+
+    storage = uint16_t(bits >> 16);
+  }
+
+public:
+  /// Default constructor
+  bfloat16_t() = default;
+
+  /// Reinterpret cast from CUDA's __nv_bfloat16 type
+  CUTLASS_HOST_DEVICE
+  explicit bfloat16_t(__nv_bfloat16 const & x) {
+    #if defined(__CUDA_ARCH__)
+    storage = reinterpret_cast<uint16_t const &>(x);
+    #else
+    __nv_bfloat16_raw raw(x);
+    std::memcpy(&storage, &raw.x, sizeof(storage));
+    #endif
+  }
+
+  /// Floating-point conversion - round to nearest (ties to even on the software path below)
+  CUTLASS_HOST_DEVICE
+  explicit bfloat16_t(float x) {
+
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) && (__CUDACC_VER_MAJOR__ >= 11)
+
+    asm("cvt.rn.bf16.f32 %0, %1;\n" : "=h"(storage) : "f"(x));
+
+    #else
+    uint32_t bits;
+
+    #if defined(__CUDA_ARCH__)
+    bits = reinterpret_cast<uint32_t &>(x);
+    #else
+    std::memcpy(&bits, &x, sizeof(bits));
+    #endif
+    // Exponent not all ones (neither infinity nor NaN): round using the 16 bits about to be dropped.
+    if ((bits & 0x7f800000) != 0x7f800000) {
+
+      bool mantissa_bit = ((bits & (1 << 16)) != 0);
+      bool round_bit = ((bits & (1 << 15)) != 0);
+      bool sticky_bit = ((bits & ((1 << 15) - 1)) != 0);
+      // Round up when above the halfway point, or exactly halfway with an odd retained LSB.
+      if ((round_bit && sticky_bit) || (round_bit && mantissa_bit)) {
+        bits += uint32_t(1 << 16);
+      }
+    }
+    else if (bits & ~0xff800000) {  // exponent all ones with a nonzero mantissa: NaN
+      bits = 0x7fffffff;  // becomes 0x7fff after the shift below
+    }
+
+    storage = uint16_t((bits >> 16) & 0xffff);
+    #endif
+  }
+
+  /// Double conversion - narrows to float first, then uses the float constructor
+  CUTLASS_HOST_DEVICE
+  explicit bfloat16_t(double x): bfloat16_t(float(x)) {
+
+  }
+
+  /// Integer conversion - via float, then truncated to the upper 16 bits. NOTE(review): not rounded like the float path
+  CUTLASS_HOST_DEVICE
+  explicit bfloat16_t(int x) : bfloat16_t(from_32_bit_integer, x) {}
+  /// Unsigned 32-bit integer conversion - same tagged constructor as the int overload
+  CUTLASS_HOST_DEVICE
+  explicit bfloat16_t(uint32_t x) : bfloat16_t(from_32_bit_integer, x) {}
+
+  /// Converts to float by placing the 16 stored bits in the upper half of a 32-bit pattern
+  CUTLASS_HOST_DEVICE
+  operator float() const {
+    unsigned bits = (unsigned(storage) << 16);
+    #if defined(__CUDA_ARCH__)
+    return reinterpret_cast<float const &>(bits);
+    #else
+    float flt;
+    std::memcpy(&flt, &bits, sizeof(flt));
+    return flt;
+    #endif
+  }
+
+  /// Converts to double (via float)
+  CUTLASS_HOST_DEVICE
+  explicit operator double() const {
+    return double(float(*this));
+  }
+
+  /// Converts to int (via float)
+  CUTLASS_HOST_DEVICE
+  explicit operator int() const {
+    return int(float(*this));
+  }
+
+  /// Casts to bool: true when the float value compares unequal to zero
+  CUTLASS_HOST_DEVICE
+  explicit operator bool() const {
+    return (float(*this) != 0.0f);
+  }
+
+  /// Bitcasts to CUDA's bf16 type
+  CUTLASS_DEVICE
+  __nv_bfloat16 to_nv_bfloat16() const {
+    return reinterpret_cast<__nv_bfloat16 const &>(storage);
+  }
+
+  /// Obtains raw bits
+  CUTLASS_HOST_DEVICE
+  uint16_t raw() const {
+    return storage;
+  }
+  /// Returns the sign bit (bit 15)
+  CUTLASS_HOST_DEVICE
+  bool signbit() const {
+    return ((raw() & 0x8000) != 0);
+  }
+
+  /// Returns the biased exponent (the 8 bits above the mantissa)
+  CUTLASS_HOST_DEVICE
+  int exponent_biased() const {
+    return int((raw() >> 7) & 0x0ff);
+  }
+
+  /// Returns the unbiased exponent (biased exponent minus 127)
+  CUTLASS_HOST_DEVICE
+  int exponent() const {
+    return exponent_biased() - 127;
+  }
+
+  /// Returns the mantissa (the low 7 bits)
+  CUTLASS_HOST_DEVICE
+  int mantissa() const {
+    return int(raw() & 0x7f);
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Reports whether the sign bit (0x8000) of `h` is set.
+CUTLASS_HOST_DEVICE bool signbit(cutlass::bfloat16_t const& h) {
+  return (h.raw() & 0x8000) != 0;
+}
+
+/// Absolute value: returns `h` with its sign bit (0x8000) cleared.
+CUTLASS_HOST_DEVICE cutlass::bfloat16_t abs(cutlass::bfloat16_t const& h) {
+  return cutlass::bfloat16_t::bitcast(static_cast<uint16_t>(h.raw() & 0x7fff));
+}
+
+/// True when the exponent field is all ones and the mantissa is nonzero.
+CUTLASS_HOST_DEVICE bool isnan(cutlass::bfloat16_t const& h) {
+  return h.exponent_biased() == 0x0ff && h.mantissa() != 0;
+}
+
+/// True unless the exponent field is all ones (the encoding used for infinities and NaNs).
+CUTLASS_HOST_DEVICE bool isfinite(cutlass::bfloat16_t const& h) {
+  return !(h.exponent_biased() == 0x0ff);
+}
+
+/// Produces a NaN; the string argument is accepted but ignored.
+CUTLASS_HOST_DEVICE cutlass::bfloat16_t nan_bf16(const char*) {
+  uint16_t const canonical_nan_bits = 0x7fff;  // NVIDIA canonical NaN
+  return cutlass::bfloat16_t::bitcast(canonical_nan_bits);
+}
+
+/// True when the exponent field is all ones and the mantissa is zero.
+CUTLASS_HOST_DEVICE bool isinf(cutlass::bfloat16_t const& h) {
+  return h.exponent_biased() == 0x0ff && h.mantissa() == 0;
+}
+
+CUTLASS_HOST_DEVICE bool isnormal(cutlass::bfloat16_t const& h) {
+  int const biased = h.exponent_biased();  // 0 => zero/subnormal, 0xff => infinity/NaN
+  return biased != 0 && biased != 0x0ff;
+}
+
+/// Classifies `h` from its biased exponent and mantissa fields.
+///
+/// @param h value to classify
+/// @returns FP_NAN, FP_INFINITE, FP_SUBNORMAL, FP_ZERO or FP_NORMAL
+CUTLASS_HOST_DEVICE
+int fpclassify(cutlass::bfloat16_t const& h) {
+  int const biased_exp = h.exponent_biased();
+  bool const has_fraction = (h.mantissa() != 0);
+
+  // All-ones exponent: NaN when any mantissa bit is set, infinity otherwise.
+  if (biased_exp == 0x0ff) {
+    return has_fraction ? FP_NAN : FP_INFINITE;
+  }
+
+  // All-zeros exponent: subnormal when any mantissa bit is set, zero otherwise.
+  if (biased_exp == 0) {
+    return has_fraction ? FP_SUBNORMAL : FP_ZERO;
+  }
+
+  // Every remaining exponent value is a normal number.
+  return FP_NORMAL;
+}
+
+/// Square root, evaluated in single precision and converted back to bfloat16_t.
+CUTLASS_HOST_DEVICE cutlass::bfloat16_t sqrt(cutlass::bfloat16_t const& h) {
+#if defined(__CUDACC_RTC__)
+  return cutlass::bfloat16_t(sqrtf(static_cast<float>(h)));
+#else
+  return cutlass::bfloat16_t(std::sqrt(static_cast<float>(h)));
+#endif
+}
+
+/// Returns a value carrying the magnitude bits of `a` and the sign bit of `b`.
+CUTLASS_HOST_DEVICE
+bfloat16_t copysign(bfloat16_t const& a, bfloat16_t const& b) {
+
+  uint16_t magnitude_src;
+  uint16_t sign_src;
+
+  // Read the raw 16-bit patterns of both operands.
+  #if defined(__CUDA_ARCH__)
+  magnitude_src = reinterpret_cast<uint16_t const &>(a);
+  sign_src = reinterpret_cast<uint16_t const &>(b);
+  #else
+  std::memcpy(&magnitude_src, &a, sizeof(magnitude_src));
+  std::memcpy(&sign_src, &b, sizeof(sign_src));
+  #endif
+
+  // 0x7fff keeps exponent and mantissa; 0x8000 keeps only the sign.
+  uint16_t const combined = uint16_t((magnitude_src & 0x7fff) | (sign_src & 0x8000));
+  return bfloat16_t::bitcast(combined);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Standard Library operations and definitions
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if !defined(__CUDACC_RTC__)
+namespace std {
+
+/// Numeric limits
+template <>
+struct numeric_limits<cutlass::bfloat16_t> {
+  static bool const is_specialized = true;
+  static bool const is_signed = true;
+  static bool const is_integer = false;
+  static bool const is_exact = false;
+  static bool const has_infinity = true;
+  static bool const has_quiet_NaN = true;
+  static bool const has_signaling_NaN = false;
+  static std::float_denorm_style const has_denorm = std::denorm_present;
+  static bool const has_denorm_loss = true;
+  static std::float_round_style const round_style = std::round_to_nearest;
+  static bool const is_iec559 = false;
+  static bool const is_bounded = true;
+  static bool const is_modulo = false;
+  static int const digits = 7;  // stored mantissa bits; the implicit leading bit is not counted
+
+  /// Least positive value (0x0001, same bits as denorm_min()). NOTE(review): not the smallest normal value (0x0080)
+  CUTLASS_HOST_DEVICE
+  static cutlass::bfloat16_t min() { return cutlass::bfloat16_t::bitcast(0x01); }
+
+  /// Lowest (most negative) finite value
+  CUTLASS_HOST_DEVICE
+  static cutlass::bfloat16_t lowest() { return cutlass::bfloat16_t::bitcast(0xff7f); }
+
+  /// Maximum finite value
+  CUTLASS_HOST_DEVICE
+  static cutlass::bfloat16_t max() { return cutlass::bfloat16_t::bitcast(0x7f7f); }
+
+  /// Returns bit pattern 0x1000. NOTE(review): a 7-bit mantissa implies epsilon 2^-7 (0x3c00) - confirm intent
+  CUTLASS_HOST_DEVICE
+  static cutlass::bfloat16_t epsilon() { return cutlass::bfloat16_t::bitcast(0x1000); }
+
+  /// Returns the maximum rounding error (0.5)
+  CUTLASS_HOST_DEVICE
+  static cutlass::bfloat16_t round_error() { return cutlass::bfloat16_t(0.5f); }
+
+  /// Returns positive infinity
+  CUTLASS_HOST_DEVICE
+  static cutlass::bfloat16_t infinity() { return cutlass::bfloat16_t::bitcast(0x7f80); }
+
+  /// Returns a NaN (bit pattern 0x7fff)
+  CUTLASS_HOST_DEVICE
+  static cutlass::bfloat16_t quiet_NaN() { return cutlass::bfloat16_t::bitcast(0x7fff); }
+
+  /// Returns the same bit pattern as quiet_NaN(); has_signaling_NaN is false
+  CUTLASS_HOST_DEVICE
+  static cutlass::bfloat16_t signaling_NaN() { return cutlass::bfloat16_t::bitcast(0x7fff); }
+
+  /// Returns the smallest positive subnormal value
+  CUTLASS_HOST_DEVICE
+  static cutlass::bfloat16_t denorm_min() { return cutlass::bfloat16_t::bitcast(0x1); }
+};
+
+} // namespace std
+#endif
+
+namespace cutlass {
+namespace platform {
+
+/// Forward Declaration
+template <class T>
+struct numeric_limits;
+
+/// Numeric limits
+template <>
+struct numeric_limits<cutlass::bfloat16_t> {
+  static bool const is_specialized = true;
+  static bool const is_signed = true;
+  static bool const is_integer = false;
+  static bool const is_exact = false;
+  static bool const has_infinity = true;
+  static bool const has_quiet_NaN = true;
+  static bool const has_signaling_NaN = false;
+#if !defined(__CUDACC_RTC__)
+  static std::float_denorm_style const has_denorm = std::denorm_present;
+#endif
+  static bool const has_denorm_loss = true;
+#if !defined(__CUDACC_RTC__)
+  static std::float_round_style const round_style = std::round_to_nearest;
+#endif
+  static bool const is_iec559 = false;
+  static bool const is_bounded = true;
+  static bool const is_modulo = false;
+  static int const digits = 7;  // stored mantissa bits; the implicit leading bit is not counted
+
+  /// Least positive value (0x0001, same bits as denorm_min()). NOTE(review): not the smallest normal value (0x0080)
+  CUTLASS_HOST_DEVICE
+  static cutlass::bfloat16_t min() { return cutlass::bfloat16_t::bitcast(0x01); }
+
+  /// Lowest (most negative) finite value
+  CUTLASS_HOST_DEVICE
+  static cutlass::bfloat16_t lowest() { return cutlass::bfloat16_t::bitcast(0xff7f); }
+
+  /// Maximum finite value
+  CUTLASS_HOST_DEVICE
+  static cutlass::bfloat16_t max() { return cutlass::bfloat16_t::bitcast(0x7f7f); }
+
+  /// Returns bit pattern 0x1000. NOTE(review): a 7-bit mantissa implies epsilon 2^-7 (0x3c00) - confirm intent
+  CUTLASS_HOST_DEVICE
+  static cutlass::bfloat16_t epsilon() { return cutlass::bfloat16_t::bitcast(0x1000); }
+
+  /// Returns the maximum rounding error (0.5)
+  CUTLASS_HOST_DEVICE
+  static cutlass::bfloat16_t round_error() { return cutlass::bfloat16_t(0.5f); }
+
+  /// Returns positive infinity
+  CUTLASS_HOST_DEVICE
+  static cutlass::bfloat16_t infinity() { return cutlass::bfloat16_t::bitcast(0x7f80); }
+
+  /// Returns a NaN (bit pattern 0x7fff)
+  CUTLASS_HOST_DEVICE
+  static cutlass::bfloat16_t quiet_NaN() { return cutlass::bfloat16_t::bitcast(0x7fff); }
+
+  /// Returns the same bit pattern as quiet_NaN(); has_signaling_NaN is false
+  CUTLASS_HOST_DEVICE
+  static cutlass::bfloat16_t signaling_NaN() { return cutlass::bfloat16_t::bitcast(0x7fff); }
+
+  /// Returns the smallest positive subnormal value
+  CUTLASS_HOST_DEVICE
+  static cutlass::bfloat16_t denorm_min() { return cutlass::bfloat16_t::bitcast(0x1); }
+};
+
+} // namespace platform
+} // namespace cutlass
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Arithmetic operators
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Equal-to: __heq when __CUDA_ARCH__ >= 800, otherwise a comparison of the float conversions.
+CUTLASS_HOST_DEVICE bool operator==(bfloat16_t const& lhs, bfloat16_t const& rhs) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return __heq(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16());
+#else
+  return static_cast<float>(lhs) == static_cast<float>(rhs);
+#endif
+}
+
+/// Not-equal-to: __hne when __CUDA_ARCH__ >= 800, otherwise a comparison of the float conversions.
+CUTLASS_HOST_DEVICE bool operator!=(bfloat16_t const& lhs, bfloat16_t const& rhs) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return __hne(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16());
+#else
+  return static_cast<float>(lhs) != static_cast<float>(rhs);
+#endif
+}
+
+/// Less-than: __hlt when __CUDA_ARCH__ >= 800, otherwise a comparison of the float conversions.
+CUTLASS_HOST_DEVICE bool operator<(bfloat16_t const& lhs, bfloat16_t const& rhs) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return __hlt(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16());
+#else
+  return static_cast<float>(lhs) < static_cast<float>(rhs);
+#endif
+}
+
+/// Less-or-equal: __hle when __CUDA_ARCH__ >= 800, otherwise a comparison of the float conversions.
+CUTLASS_HOST_DEVICE bool operator<=(bfloat16_t const& lhs, bfloat16_t const& rhs) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return __hle(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16());
+#else
+  return static_cast<float>(lhs) <= static_cast<float>(rhs);
+#endif
+}
+
+/// Greater-than: __hgt when __CUDA_ARCH__ >= 800, otherwise a comparison of the float conversions.
+CUTLASS_HOST_DEVICE bool operator>(bfloat16_t const& lhs, bfloat16_t const& rhs) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return __hgt(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16());
+#else
+  return static_cast<float>(lhs) > static_cast<float>(rhs);
+#endif
+}
+
+/// Greater-or-equal: __hge when __CUDA_ARCH__ >= 800, otherwise a comparison of the float conversions.
+CUTLASS_HOST_DEVICE bool operator>=(bfloat16_t const& lhs, bfloat16_t const& rhs) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return __hge(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16());
+#else
+  return static_cast<float>(lhs) >= static_cast<float>(rhs);
+#endif
+}
+
+/// Sum: __hadd when __CUDA_ARCH__ >= 800, otherwise added as floats and converted back.
+CUTLASS_HOST_DEVICE bfloat16_t operator+(bfloat16_t const& lhs, bfloat16_t const& rhs) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return bfloat16_t(__hadd(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16()));
+#else
+  return bfloat16_t(static_cast<float>(lhs) + static_cast<float>(rhs));
+#endif
+}
+
+/// Negation: __hneg when __CUDA_ARCH__ >= 800, otherwise negated as a float and converted back.
+CUTLASS_HOST_DEVICE bfloat16_t operator-(bfloat16_t const& lhs) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return bfloat16_t(__hneg(lhs.to_nv_bfloat16()));
+#else
+  return bfloat16_t(-static_cast<float>(lhs));
+#endif
+}
+
+/// Difference: __hsub when __CUDA_ARCH__ >= 800, otherwise subtracted as floats and converted back.
+CUTLASS_HOST_DEVICE bfloat16_t operator-(bfloat16_t const& lhs, bfloat16_t const& rhs) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return bfloat16_t(__hsub(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16()));
+#else
+  return bfloat16_t(static_cast<float>(lhs) - static_cast<float>(rhs));
+#endif
+}
+
+/// Product: __hmul when __CUDA_ARCH__ >= 800, otherwise multiplied as floats and converted back.
+CUTLASS_HOST_DEVICE bfloat16_t operator*(bfloat16_t const& lhs, bfloat16_t const& rhs) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return bfloat16_t(__hmul(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16()));
+#else
+  return bfloat16_t(static_cast<float>(lhs) * static_cast<float>(rhs));
+#endif
+}
+
+/// Quotient: __hdiv when __CUDA_ARCH__ >= 800, otherwise divided as floats and converted back.
+CUTLASS_HOST_DEVICE bfloat16_t operator/(bfloat16_t const& lhs, bfloat16_t const& rhs) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  return bfloat16_t(__hdiv(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16()));
+#else
+  return bfloat16_t(static_cast<float>(lhs) / static_cast<float>(rhs));
+#endif
+}
+
+/// In-place addition: stores the sum in `lhs` and returns a reference to it.
+CUTLASS_HOST_DEVICE bfloat16_t& operator+=(bfloat16_t & lhs, bfloat16_t const& rhs) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  lhs = bfloat16_t(__hadd(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16()));
+#else
+  lhs = bfloat16_t(static_cast<float>(lhs) + static_cast<float>(rhs));
+#endif
+  return lhs;
+}
+
+/// In-place subtraction: stores the difference in `lhs` and returns a reference to it.
+CUTLASS_HOST_DEVICE bfloat16_t& operator-=(bfloat16_t & lhs, bfloat16_t const& rhs) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  lhs = bfloat16_t(__hsub(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16()));
+#else
+  lhs = bfloat16_t(static_cast<float>(lhs) - static_cast<float>(rhs));
+#endif
+  return lhs;
+}
+
+/// In-place multiplication: stores the product in `lhs` and returns a reference to it.
+CUTLASS_HOST_DEVICE bfloat16_t& operator*=(bfloat16_t & lhs, bfloat16_t const& rhs) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  lhs = bfloat16_t(__hmul(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16()));
+#else
+  lhs = bfloat16_t(static_cast<float>(lhs) * static_cast<float>(rhs));
+#endif
+  return lhs;
+}
+
+/// In-place division: stores the quotient in `lhs` and returns a reference to it.
+CUTLASS_HOST_DEVICE bfloat16_t& operator/=(bfloat16_t & lhs, bfloat16_t const& rhs) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  lhs = bfloat16_t(__hdiv(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16()));
+#else
+  lhs = bfloat16_t(static_cast<float>(lhs) / static_cast<float>(rhs));
+#endif
+  return lhs;
+}
+
+/// Pre-increment: adds one to `lhs` in place and returns a reference to it.
+CUTLASS_HOST_DEVICE
+bfloat16_t& operator++(bfloat16_t & lhs) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  __nv_bfloat16 const one = bfloat16_t(1.0f).to_nv_bfloat16();
+  lhs = bfloat16_t(__hadd(lhs.to_nv_bfloat16(), one));
+#else
+  lhs = bfloat16_t(static_cast<float>(lhs) + 1.0f);
+#endif
+  return lhs;
+}
+
+/// Pre-decrement: subtracts one from `lhs` in place and returns a reference to it.
+CUTLASS_HOST_DEVICE
+bfloat16_t& operator--(bfloat16_t & lhs) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  __nv_bfloat16 const one = bfloat16_t(1.0f).to_nv_bfloat16();
+  lhs = bfloat16_t(__hsub(lhs.to_nv_bfloat16(), one));
+#else
+  lhs = bfloat16_t(static_cast<float>(lhs) - 1.0f);
+#endif
+  return lhs;
+}
+
+/// Post-increment: adds one to `lhs` in place and returns the value it held beforehand.
+CUTLASS_HOST_DEVICE
+bfloat16_t operator++(bfloat16_t & lhs, int) {
+  bfloat16_t const previous = lhs;
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  __nv_bfloat16 const one = bfloat16_t(1.0f).to_nv_bfloat16();
+  lhs = bfloat16_t(__hadd(lhs.to_nv_bfloat16(), one));
+#else
+  lhs = bfloat16_t(static_cast<float>(lhs) + 1.0f);
+#endif
+  return previous;
+}
+
+/// Post-decrement: subtracts one from `lhs` in place and returns the value it held beforehand.
+CUTLASS_HOST_DEVICE
+bfloat16_t operator--(bfloat16_t & lhs, int) {
+  bfloat16_t const previous = lhs;
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+  __nv_bfloat16 const one = bfloat16_t(1.0f).to_nv_bfloat16();
+  lhs = bfloat16_t(__hsub(lhs.to_nv_bfloat16(), one));
+#else
+  lhs = bfloat16_t(static_cast<float>(lhs) - 1.0f);
+#endif
+  return previous;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+//
+// User-defined literals
+//
+
+/// Suffix for floating-point literals (e.g. `1.5_bf16`); spelled without the deprecated space after `""`.
+CUTLASS_HOST_DEVICE cutlass::bfloat16_t operator""_bf16(long double x) {
+  return cutlass::bfloat16_t(float(x));
+}
+
+/// Suffix for integer literals (e.g. `2_bf16`); narrowed to int first. Spelled without the deprecated space after `""`.
+CUTLASS_HOST_DEVICE cutlass::bfloat16_t operator""_bf16(unsigned long long int x) {
+  return cutlass::bfloat16_t(int(x));
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/blas3.h b/lightllm-kernel/cutlass/include/cutlass/blas3.h
new file mode 100755
index 000000000..ee5587d1c
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/blas3.h
@@ -0,0 +1,143 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+  \brief Basic include for CUTLASS BLAS3/HPC code.
+    
+  
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/blas3_types.h"
+#include "cutlass/coord.h"
+#include "cutlass/complex.h"
+#include "cutlass/functional.h"
+#include "cutlass/numeric_types.h"
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Maps a FillMode to its opposite via the static member `mode`; only kLower and kUpper are specialized
+template <FillMode kFillMode>
+struct InvertFillMode;
+
+/// kLower inverts to kUpper
+template <>
+struct InvertFillMode<FillMode::kLower> {
+  static FillMode const mode = FillMode::kUpper;
+};
+
+/// kUpper inverts to kLower
+template <>
+struct InvertFillMode<FillMode::kUpper> {
+  static FillMode const mode = FillMode::kLower;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Maps a SideMode to its opposite via the static member `mode`; only kLeft and kRight are specialized
+template <SideMode kSideMode>
+struct InvertSideMode;
+
+/// kLeft inverts to kRight
+template <>
+struct InvertSideMode<SideMode::kLeft> {
+  static SideMode const mode = SideMode::kRight;
+};
+
+/// kRight inverts to kLeft
+template <>
+struct InvertSideMode<SideMode::kRight> {
+  static SideMode const mode = SideMode::kLeft;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Defines the compare functor `Type` used at a triangular matrix boundary, chosen from kFillMode and kDiagType
+template <FillMode kFillMode, DiagType kDiagType = DiagType::kNonUnit>
+struct TrMatrixCompareOp {
+  using Index = int32_t;
+  using Type = typename platform::conditional<
+                        (kFillMode == FillMode::kLower),  // presumably selects like std::conditional - confirm
+                        greater_equal<Index>,  // first alternative: lower fill
+                        less_equal<Index>>::type;
+};
+/// kUnit specialization: declares the same inclusive comparisons as the primary template
+template <FillMode kFillMode>
+struct TrMatrixCompareOp <kFillMode, DiagType::kUnit> {
+   using Index = int32_t;
+   using Type = typename platform::conditional<
+                        (kFillMode == FillMode::kLower), 
+                        greater_equal<Index>, 
+                        less_equal<Index>>::type;
+};
+/// kZero specialization: uses the strict comparisons greater / less instead of the inclusive ones
+template <FillMode kFillMode>
+struct TrMatrixCompareOp <kFillMode, DiagType::kZero> {
+   using Index = int32_t;
+   using Type = typename platform::conditional<
+                        (kFillMode == FillMode::kLower), 
+                        greater<Index>, 
+                        less<Index>>::type;
+};
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Returns precision in terms of bits (based on datatype) to fill tensors with.
+// Every element type without a specialization below gets 5 bits (e.g. TF32 and FP32, with implicit round-offs).
+// Also defines acceptable mantissa result variance/error.
+template <typename Element>
+struct MantissaInBits {
+  static int constexpr bits = 5;
+  static double constexpr error = 1.0e-7;
+};
+
+// FP64 uses a wider setting: 30 bits with a 1.0e-15 error bound
+template <>
+struct MantissaInBits<double> {
+  static int constexpr bits = 30;
+  static double constexpr error = 1.0e-15;
+};
+// complex<double> uses the same values as double
+template <>
+struct MantissaInBits<cutlass::complex<double>> {
+  static int constexpr bits = 30;
+  static double constexpr error = 1.0e-15;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/blas3_types.h b/lightllm-kernel/cutlass/include/cutlass/blas3_types.h
new file mode 100755
index 000000000..653b93b77
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/blas3_types.h
@@ -0,0 +1,78 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Enumerated type describing the kind of BLAS kernel (based on its input or output matrices).
+enum class BlasMode {
+  kGemm,
+  kSymmetric,
+  kHermitian,
+  kTriangular,
+  kInvalid
+};
+
+/// Enumerated type describing the fill mode for matrices for BLAS functions.
+enum class FillMode {
+  kFull,              ///< The entire tensor is covered.
+  kLower,             ///< The 'lower' part of a tensor is covered including diagonal
+  kUpper,             ///< The 'upper' part of a tensor is covered including diagonal
+  kDiagonal,          ///< Only diagonal elements are covered.
+  kNone,              ///< No element is covered.
+  kInvalid
+};
+
+/// Enumerated type describing the diagonal property of matrices for BLAS functions.
+enum class DiagType {
+  kNonUnit,
+  kUnit,
+  kZero, ///< Only used internally for computing SYMM/HEMM
+  kInvalid
+}; 
+
+/// Enumerated type describing which side of the matrix equation the dense matrix is on for BLAS functions.
+enum class SideMode {
+  kLeft,
+  kRight,
+  kInvalid
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/block_striped.h b/lightllm-kernel/cutlass/include/cutlass/block_striped.h
new file mode 100755
index 000000000..09f3fb04f
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/block_striped.h
@@ -0,0 +1,267 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Utilities for performing block-striped access (load, store, reduce) of trivially-copyable,
+    statically-sized array types to global memory.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/wmma_array.h"
+#include "cutlass/functional.h"
+#include "cutlass/complex.h"
+
+namespace cutlass {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// AccessWidth
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Computes the maximal power-of-two that evenly divides the size of T, capped at Limit
+template <
+  typename T,
+  int Limit>
+struct AccessWidth
+{
+  // Inductive case: AlignBytes is an acceptable width, so try the next power of two
+  template <
+      int ObjectBytes,        /// Size of T in bytes
+      int AlignBytes,         /// Candidate width (template induction variable, starts at 1)
+      bool IsAligned  =       /// Whether AlignBytes is within Limit and evenly divides ObjectBytes
+        ((AlignBytes <= Limit) &&  (ObjectBytes % AlignBytes == 0))>
+  struct Detail
+  {
+      static const int value = Detail<ObjectBytes, AlignBytes * 2>::value;
+  };
+
+  // Base case (AlignBytes exceeds Limit or does not evenly divide ObjectBytes): use the previous candidate
+  template <
+      int ObjectBytes,        /// Size of T in bytes
+      int AlignBytes>         /// Candidate width that failed the IsAligned test
+  struct Detail<ObjectBytes, AlignBytes, false>
+  {
+      static const int value = AlignBytes / 2;
+  };
+
+  /// The maximal power-of-two that evenly divides the size of T, capped at Limit
+  static const int value = Detail<
+    (int) sizeof(T),
+    1>::value;
+};
+
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// StripedAccessType
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// ReinterpretCast type for striping a trivially-copyable type in global memory
+/// (Default specialization.  Striping granularity is type T.)
+template <
+    typename T,           /// Data type
+    int TransferBytes =   /// Data access width (16 byte max for global memory access on current architectures)
+      AccessWidth<T, 16>::value>
+struct alignas(TransferBytes) StripedAccessType : public T
+{};
+
+
+/// ReinterpretCast type for striping a trivially-copyable type in global memory
+/// (Specialization for cutlass::Array<T>.  Striping granularity is a multiple of T.)
+template <
+    typename T,           /// Array element type
+    int N,                /// Number of elements in array
+    bool RegisterSized,   /// T is register-sized
+    int TransferBytes>    /// Data access width
+struct StripedAccessType<
+    Array<T, N, RegisterSized>,
+    TransferBytes>
+: public AlignedArray<
+            T,                                                  // Element type of StripedAccessType
+            __NV_STD_MAX(1, TransferBytes / (int) sizeof(T)),   // Number of elements T in StripedAccessType
+            TransferBytes>                                      // Alignment of StripedAccessType
+{};
+
+
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+
+/// ReinterpretCast type for striping a trivially-copyable type in global memory
+/// (Specialization for cutlass::WmmaFragmentArray<T>.  Striping granularity is a multiple of T.)
+template<
+    typename Use,
+    int m,
+    int n,
+    int k,
+    typename ElementT,
+    typename Layout,
+    int kFragments,
+    int TransferBytes>
+struct StripedAccessType<
+    WmmaFragmentArray<nvcuda::wmma::fragment<Use, m, n, k, ElementT, Layout>, kFragments>,
+    TransferBytes>
+: public AlignedArray<
+            ElementT,
+            __NV_STD_MAX(1, TransferBytes / (int) sizeof(ElementT)),
+            TransferBytes>
+{};
+
+#endif // if defined(CUTLASS_ARCH_WMMA_ENABLED)
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// BlockStriped
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Utility for performing block-striped access (load, store) of trivially-copyable,
+/// statically-sized array types to global memory
+template <
+  int BlockThreads,
+  typename ArrayT,
+  typename AccessT = StripedAccessType<ArrayT> >
+struct BlockStriped
+{
+  /// Number of striped accesses
+  static const int kStripes = int(sizeof(ArrayT) / sizeof(AccessT));
+  static_assert(kStripes > 0, "AccessT type must be smaller than or equal to ArrayT type");
+
+  /// Load
+  CUTLASS_DEVICE
+  static void load(ArrayT &data, ArrayT *ptr, int thread_idx)
+  {
+    AccessT *access_input = reinterpret_cast<AccessT*>(ptr);
+    AccessT *access_data = reinterpret_cast<AccessT*>(&data);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kStripes; ++i) {
+      access_data[i] = access_input[(BlockThreads * i) + thread_idx];
+    }
+  }
+
+  /// Load & Add
+  CUTLASS_DEVICE
+  static void load_add(ArrayT &data, ArrayT *ptr, int thread_idx)
+  {
+    AccessT *access_input = reinterpret_cast<AccessT*>(ptr);
+    AccessT *access_data = reinterpret_cast<AccessT*>(&data);
+
+    plus<AccessT> add;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kStripes; ++i)
+    {
+      access_data[i] = add(access_data[i], access_input[(BlockThreads * i) + thread_idx]);
+    }
+  }
+
+  /// Store
+  CUTLASS_DEVICE
+  static void store(ArrayT *ptr, const ArrayT &data, int thread_idx)
+  {
+    AccessT *access_output = reinterpret_cast<AccessT*>(ptr);
+    const AccessT *access_data = reinterpret_cast<const AccessT*>(&data);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kStripes; ++i) {
+      access_output[(BlockThreads * i) + thread_idx] = access_data[i];
+    }
+  }
+
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// BlockStripedReduce
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+/// Utility for performing block-striped access (load, store, reduce) of trivially-copyable,
+/// statically-sized array types to global memory.
+/// (Default specialization)
+template <
+  int BlockThreads,
+  typename ArrayT,
+  typename ElementT = typename StripedAccessType<ArrayT>::Element>
+struct BlockStripedReduce :
+  BlockStriped<
+    BlockThreads,
+    ArrayT,
+    ElementT>
+{
+  /// Reduce
+  CUTLASS_DEVICE
+  static void reduce(ArrayT *ptr, const ArrayT &data, int thread_idx)
+  {
+    cutlass::atomic_add<ElementT> reduce;
+    ElementT *access_output = reinterpret_cast<ElementT*>(ptr);
+    const ElementT *access_data = reinterpret_cast<const ElementT*>(&data);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < BlockStripedReduce::kStripes; ++i) {
+      reduce(access_output + (BlockThreads * i) + thread_idx, access_data[i]);
+    }
+  }
+};
+
+
+/// Utility for performing block-striped access (load, store, reduce) of trivially-copyable,
+/// statically-sized array types to global memory.
+/// (Specialization for half_t.  Uses half2 vectorized-reduction.)
+template <
+  int BlockThreads,
+  typename ArrayT>
+struct BlockStripedReduce<BlockThreads, ArrayT, half_t> :
+  BlockStriped<
+    BlockThreads,
+    ArrayT,
+    half2>
+{
+  static_assert(BlockStripedReduce::kStripes % 2 == 0, "Array of half must be even number in length");
+
+  /// Reduce
+  CUTLASS_DEVICE
+  static void reduce(ArrayT *ptr, const ArrayT &data, int thread_idx)
+  {
+    cutlass::atomic_add<half2> reduce;
+    half2 *access_output = reinterpret_cast<half2*>(ptr);
+    const half2 *access_data = reinterpret_cast<const half2*>(&data);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < BlockStripedReduce::kStripes; ++i)
+    {
+      reduce(access_output + (BlockThreads * i) + thread_idx, access_data[i]);
+    }
+  }
+};
+
+
+} // namespace cutlass
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/cluster_launch.hpp b/lightllm-kernel/cutlass/include/cutlass/cluster_launch.hpp
new file mode 100755
index 000000000..a0fa22b6b
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/cluster_launch.hpp
@@ -0,0 +1,275 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Host-side utilities for launching kernels with thread block clusters (added for SM90)
+*/
+
+#pragma once
+
+#include <cuda_runtime_api.h>
+#include "cutlass/cutlass.h"
+#include "cutlass/trace.h"
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/type_traits>
+#else
+#include <type_traits>
+#include <cstdio>
+#endif
+
+#if ((__CUDACC_VER_MAJOR__ >= 12) || ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 8)))
+#  define CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED
+#endif
+
+namespace cutlass {
+
+#ifndef NDEBUG
+#define Return_Status(cudaError_t_status)            \
+  if (cudaError_t_status != cudaSuccess) {           \
+    fprintf(stderr,                                  \
+            "[ ERROR: CUDA Runtime ] %s:%d: %s\n",   \
+            __FILE__,                                \
+            __LINE__,                                \
+            cudaGetErrorString(cudaError_t_status)); \
+    return Status::kInvalid;                         \
+  } else {                                           \
+    return Status::kSuccess;                         \
+  }
+#else
+#define Return_Status(cudaError_t_status)          \
+  if (cudaError_t_status != cudaSuccess) {         \
+    return Status::kInvalid;                       \
+  } else {                                         \
+    return Status::kSuccess;                       \
+  }
+#endif
+
+struct ClusterLauncher {
+  constexpr static int MaxClusterSize = 32;
+
+  // Check for hardware compatibility: returns kSuccess only if the cluster holds at most MaxClusterSize blocks and evenly tiles the grid.
+  static inline CUTLASS_HOST
+  Status check_cluster_dims(dim3 grid, dim3 cluster) {
+    // A zero cluster extent is never valid; test it first so the modulo below cannot divide by zero.
+    if ((cluster.x != 0) && (cluster.y != 0) && (cluster.z != 0) &&
+        ((cluster.x * cluster.y * cluster.z) <= MaxClusterSize) &&
+        (grid.x % cluster.x == 0) && (grid.y % cluster.y == 0) && (grid.z % cluster.z == 0)) {
+      return Status::kSuccess;
+    }
+    CUTLASS_TRACE_HOST("ClusterLauncher: Invalid cluster configuration -- aborting launch.");
+    return Status::kInvalid;
+  }
+
+  static inline CUTLASS_HOST
+  Status
+#if defined(CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED)
+  init(void const* kernel_function)
+#else
+  init(void const* /* kernel_function */)
+#endif
+  {
+#if defined(CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED)
+#if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1)
+    if (kernel_function == nullptr) {
+      CUTLASS_TRACE_HOST("kernel_function is null");
+      return Status::kInvalid;
+    }
+    CUTLASS_TRACE_HOST("Checking previous error state before calling cudaFuncSetAttribute");
+    cudaError_t prevStatus = cudaGetLastError();
+    if (prevStatus != cudaSuccess) {
+      fprintf(stderr,
+              "[ ERROR: CUDA Runtime ] %s:%d: %s\n",
+              __FILE__,
+              __LINE__,
+              cudaGetErrorString(prevStatus));
+      return Status::kInvalid;
+    }
+    CUTLASS_TRACE_HOST("Calling cudaFuncSetAttribute");
+#endif
+    // This attribute was added in CUDA 11.8.
+    cudaError_t status =
+        cudaFuncSetAttribute(
+          kernel_function, cudaFuncAttributeNonPortableClusterSizeAllowed, 1);
+    Return_Status(status);
+#else
+    return Status::kInvalid;
+#endif
+  }
+
+  // This is the method we expect to use going forward
+  static inline CUTLASS_HOST
+  Status launch(
+      dim3 const grid_dims,
+      dim3 const cluster_dims,
+      dim3 const block_dims,
+      size_t const smem_size,
+      cudaStream_t cuda_stream,
+      void const* kernel,
+      void** kernel_params,
+      bool launch_with_pdl = false) {
+#if defined(CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED)
+    if (check_cluster_dims(grid_dims, cluster_dims) != Status::kSuccess) {
+      CUTLASS_TRACE_HOST("ClusterLauncher: check_cluster_dims() failed. Aborting.");
+      return Status::kInvalid;
+    }
+
+    auto init_status = init(kernel);
+    if (init_status != Status::kSuccess) {
+      CUTLASS_TRACE_HOST("ClusterLauncher: init(kernel) failed with status " << int(init_status) << ". Aborting.");
+      return Status::kInvalid;
+    }
+
+    cudaLaunchConfig_t launch_config;
+    launch_config.gridDim = {grid_dims.x, grid_dims.y, grid_dims.z};
+    launch_config.blockDim = {block_dims.x, block_dims.y, block_dims.z};
+    launch_config.dynamicSmemBytes = smem_size;
+    launch_config.stream = cuda_stream;
+
+    cudaLaunchAttribute launch_attribute[2];
+
+    launch_attribute[0].id = cudaLaunchAttributeClusterDimension;
+    launch_attribute[0].val.clusterDim.x = cluster_dims.x;
+    launch_attribute[0].val.clusterDim.y = cluster_dims.y;
+    launch_attribute[0].val.clusterDim.z = cluster_dims.z;
+
+    launch_attribute[1].id = cudaLaunchAttributeProgrammaticStreamSerialization;
+    launch_attribute[1].val.programmaticStreamSerializationAllowed = 1;
+
+    launch_config.numAttrs = launch_with_pdl ? 2 : 1;
+
+    launch_config.attrs = launch_attribute;
+
+    CUTLASS_TRACE_HOST("ClusterLauncher: Launching GPC_CLUSTER_GRID GridDims = "
+        "(" << grid_dims.x << ", " << grid_dims.y << ", " << grid_dims.z << "), "
+        "And ClusterDims = "
+        "(" << cluster_dims.x << ", " << cluster_dims.y << ", " << cluster_dims.z << ")\n");
+
+    cutlass::arch::synclog_setup();
+    cudaError_t status = cudaLaunchKernelExC(&launch_config, kernel, kernel_params);
+    Return_Status(status);
+#else
+    CUTLASS_TRACE_HOST("ClusterLauncher: CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED not defined! Aborting cluster launch.");
+    return Status::kInvalid;
+#endif
+  }
+
+};
+
+namespace detail {
+
+template<class Arg>
+void* checked_addressof(Arg&& arg) {
+  static_assert(! std::is_rvalue_reference_v<Arg> || ! std::is_const_v<Arg>, "You cannot take the address of a const rvalue reference (const T&&).");
+  // We use std::addressof so we get the true address even if the type overloads operator&.
+  // NOTE(review): the assert above is meant to preclude `const T&&` arguments, but a deduced
+  // Arg is T or T& (never T&&), so it appears to always pass -- confirm before relying on it.
+  return const_cast<void*>(reinterpret_cast<void const*>(std::addressof(arg)));
+}
+
+} // namespace detail
+
+//! Parameters for launch_kernel_on_cluster (see below).
+struct ClusterLaunchParams {
+  //! Grid dimensions
+  dim3 grid_dims{1, 1, 1};
+
+  //! Block dimensions
+  dim3 block_dims{1, 1, 1};
+
+  //! Cluster dimensions
+  dim3 cluster_dims{1, 1, 1};
+
+  //! Number of bytes required for the kernel's shared memory.
+  int smem_size_in_bytes = 0;
+
+  //! CUDA stream on which to launch the kernel.
+  cudaStream_t cuda_stream = nullptr;
+};
+
+/// @brief Launch the kernel on the stream using cluster launch.
+///
+/// @param params Cluster launch parameters (see above).
+/// @param kernel_ptr Pointer to the kernel function (see example).
+/// @param args Zero or more arguments to pass to the kernel.
+///
+/// @tparam Args Types of the arguments passed to the kernel.
+///   Don't specify this/these template argument(s) explicitly.
+///
+/// @return Status::kSuccess on success, else an error code.
+///
+/// @code
+/// template<class SharedMemoryType, class A, class B, class C>
+/// __global__ void kernel(A a, B b, C c);
+///
+/// X x = get_x();
+/// Y y = get_y();
+/// Z z = get_z();
+///
+/// void const* kernel_ptr =
+///   const_cast<void const*>(reinterpret_cast<void*>(
+///     &kernel<SharedMemory, X, Y, Z>));
+/// auto status = launch_kernel_on_cluster(
+///   {grid_dims, block_dims, cluster_dims, sizeof(SharedMemory)},
+///   kernel_ptr, x, y, z);
+/// @endcode
+template<class ... Args>
+CUTLASS_HOST cutlass::Status
+launch_kernel_on_cluster(const ClusterLaunchParams& params,
+  void const* kernel_ptr,
+  Args&& ... args)
+{
+  // Unfortunately, we find ourselves needing to pass in
+  // the parameters as an array of raw pointers.
+  if constexpr (sizeof...(Args) == 0) {
+    return cutlass::ClusterLauncher::launch(
+      params.grid_dims,
+      params.cluster_dims,
+      params.block_dims,
+      params.smem_size_in_bytes,
+      params.cuda_stream,
+      kernel_ptr, nullptr);
+  }
+  else {
+    void* kernel_params[sizeof...(Args)] = {
+      detail::checked_addressof(std::forward<Args>(args))...
+    };
+    return cutlass::ClusterLauncher::launch(
+      params.grid_dims,
+      params.cluster_dims,
+      params.block_dims,
+      params.smem_size_in_bytes,
+      params.cuda_stream,
+      kernel_ptr,
+      kernel_params);
+  }
+}
+
+}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/complex.h b/lightllm-kernel/cutlass/include/cutlass/complex.h
new file mode 100755
index 000000000..6d0bf31df
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/complex.h
@@ -0,0 +1,823 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include <cuComplex.h>
+
+#include <cuda_fp16.h>
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/cstdint>
+#else
+#include <cstdint>
+#endif
+
+#include "cutlass/cutlass.h"
+#include "cutlass/functional.h"
+#include "cutlass/platform/platform.h"
+#include "cutlass/real.h"
+
+#include "cutlass/numeric_types.h"
+
+#include "cutlass/fast_math.h"
+
+#if !defined(__CUDACC_RTC__)
+#include <iosfwd>
+#endif
+
+namespace cutlass {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Enumerated type describing a transformation on a complex value.
+enum class ComplexTransform {
+  kNone,
+  kConjugate
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Defines ComplexTransform inversions
+template <ComplexTransform kTransform>
+struct InvertComplexTransform;
+
+/// Invert ComplexTransform from kNone to kConjugate
+template <>
+struct InvertComplexTransform<ComplexTransform::kNone> {
+  static ComplexTransform const transform = ComplexTransform::kConjugate;
+};
+
+/// Invert ComplexTransform from kConjugate to kNone
+template <>
+struct InvertComplexTransform<ComplexTransform::kConjugate> {
+  static ComplexTransform const transform = ComplexTransform::kNone;
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+//
+// Accessors for CUDA complex types
+//
+
+#if !defined(__CUDACC_RTC__)
+/// Returns the real part of the complex number
+CUTLASS_HOST_DEVICE
+float const &real(cuFloatComplex const &z) { return z.x; }
+
+/// Returns the real part of the complex number
+CUTLASS_HOST_DEVICE
+float &real(cuFloatComplex &z) { return z.x; }
+
+/// Returns the real part of the complex number
+CUTLASS_HOST_DEVICE
+double const &real(cuDoubleComplex const &z) { return z.x; }
+
+/// Returns the real part of the complex number
+CUTLASS_HOST_DEVICE
+double &real(cuDoubleComplex &z) { return z.x; }
+
+/// Returns the imaginary part of the complex number
+CUTLASS_HOST_DEVICE
+float const &imag(cuFloatComplex const &z) { return z.y; }
+
+/// Returns the imaginary part of the complex number
+CUTLASS_HOST_DEVICE
+float &imag(cuFloatComplex &z) { return z.y; }
+
+/// Returns the imaginary part of the complex number
+CUTLASS_HOST_DEVICE
+double const &imag(cuDoubleComplex const &z) { return z.y; }
+
+/// Returns the imaginary part of the complex number
+CUTLASS_HOST_DEVICE
+double &imag(cuDoubleComplex &z) { return z.y; }
+
+// Returns the conjugate of the complex number
+CUTLASS_HOST_DEVICE cuFloatComplex
+conj(cuFloatComplex const& z) {
+  return make_cuFloatComplex(z.x, -z.y);
+}
+
+// Returns the conjugate of the complex number
+CUTLASS_HOST_DEVICE cuDoubleComplex
+conj(cuDoubleComplex const& z) {
+  return make_cuDoubleComplex(z.x, -z.y);
+}
+#endif
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Class for representing and manipulating complex numbers with conversions from built-in CUDA
+/// complex types.
+
+template <typename T>
+class complex
+{
+ public:
+  /// Type alias for scalar type
+  using value_type = T;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Real part
+  T _real;
+
+  /// Imaginary part
+  T _imag;
+
+ public:
+
+//
+// Methods
+//
+
+  /// Default constructor
+  complex() = default;
+
+  /// Constructor
+  CUTLASS_HOST_DEVICE
+  complex(T r) : _real(r), _imag(T(0)) {}
+
+  /// Constructor
+  CUTLASS_HOST_DEVICE
+  complex(T r, T i) : _real(r), _imag(i) {}
+
+  /// Constructor
+  template<typename A>
+  CUTLASS_HOST_DEVICE
+  complex(complex<A> const &z) : _real(static_cast<T>(z.real())), _imag(static_cast<T>(z.imag())) {}
+
+
+  #if !defined(__CUDACC_RTC__)
+  /// Conversion from cuFloatComplex
+  CUTLASS_HOST_DEVICE
+  complex(cuFloatComplex const &z) : _real(static_cast<T>(cuCrealf(z))), _imag(static_cast<T>(cuCimagf(z))) {}
+
+  /// Conversion from cuDoubleComplex
+  CUTLASS_HOST_DEVICE
+  complex(cuDoubleComplex const &z) : _real(static_cast<T>(cuCreal(z))), _imag(static_cast<T>(cuCimag(z))) {}
+  #endif
+
+  /// Equality operator
+  CUTLASS_HOST_DEVICE bool operator==(complex<T> const &rhs) const {
+    return this->real() == rhs.real() && this->imag() == rhs.imag();
+  }
+
+  /// Inequality operator
+  CUTLASS_HOST_DEVICE bool operator!=(complex<T> const &rhs) const {
+    return !(*this == rhs);
+  }
+
+  /// Addition
+    template <typename A>
+  CUTLASS_HOST_DEVICE complex<T> operator+(complex<A> const &rhs) const {
+    return complex<T>(this->real() + rhs.real(), this->imag() + rhs.imag());
+  }
+
+  /// Reduction into memory address.  Components may update out of order.
+  template <typename OtherT>
+  CUTLASS_DEVICE void red(complex<OtherT> *ptr) const {
+    static_assert(platform::is_same<T, OtherT>::value, "Component type must match");
+    cutlass::atomic_add<T> reduce;
+    reduce(&ptr->_real, _real);
+    reduce(&ptr->_imag, _imag);
+  }
+
+  /// Reduction into memory address via one half2 atomic add of (real, imag).  (Half specialization)
+  CUTLASS_DEVICE void red(complex<half_t> *ptr) const {
+    static_assert(platform::is_same<T, half_t>::value, "Component type must match");
+    half2 *h2_ptr = reinterpret_cast<half2*>(ptr);
+    half2 h2_data = reinterpret_cast<half2 const&>(*this);  // const&: red() is a const member, so *this is const
+    cutlass::atomic_add<half2> reduce;
+    reduce(h2_ptr, h2_data);
+  }
+
+  /// Subtraction
+    template <typename A>
+  CUTLASS_HOST_DEVICE complex<T> operator-(complex<A> const &rhs) const {
+    return complex<T>(this->real() - rhs.real(), this->imag() - rhs.imag());
+  }
+
+  /// Multiplication
+    template <typename A>
+  CUTLASS_HOST_DEVICE complex<T> operator*(complex<A> const &rhs) const {
+    return complex<T>(this->real() * rhs.real() - this->imag() * rhs.imag(),
+                      this->real() * rhs.imag() + this->imag() * rhs.real());
+  }
+
+  /// Scalar Multiplication
+    template <typename A>
+  CUTLASS_HOST_DEVICE complex<T> operator*(A const &s) const {
+    return complex<T>(this->real() * s, this->imag() * s);
+  }
+
+  /// Division
+    template <typename A>
+  CUTLASS_HOST_DEVICE complex<T> operator/(complex<A> const &rhs) const {
+    T d = T(rhs.real() * rhs.real() + rhs.imag() * rhs.imag());
+
+    return complex<T>(
+      (real() * rhs.real() + imag() * rhs.imag()) / d,
+      (imag() * rhs.real() - real() * rhs.imag()) / d
+    );
+  }
+
+  /// Scalar Division
+    template <typename A>
+  CUTLASS_HOST_DEVICE complex<T> operator/(A const &s) const {
+    return complex<T>(this->real() / s, this->imag() / s);
+  }
+
+  /// Addition
+    template <typename A>
+  CUTLASS_HOST_DEVICE complex<T> &operator+=(complex<A> const &rhs) {
+      *this = *this + rhs;
+      return *this;
+  }
+
+  /// Subtraction
+  template <typename A>
+  CUTLASS_HOST_DEVICE complex<T> &operator-=(complex<A> const &rhs) {
+      *this = *this - rhs;
+      return *this;
+  }
+
+  /// Multiplication
+  template <typename A>
+  CUTLASS_HOST_DEVICE complex<T> &operator*=(complex<A> const &rhs) {
+      *this = *this * rhs;
+      return *this;
+  }
+
+  /// Scalar multiplication
+  template <typename A>
+  CUTLASS_HOST_DEVICE complex<T> &operator*=(A s) {
+      *this = *this * s;
+      return *this;
+  }
+
+  /// Division
+  template <typename A>
+  CUTLASS_HOST_DEVICE complex<T> &operator/=(complex<A> const &rhs) {
+      *this = *this / rhs;
+      return *this;
+  }
+
+  /// Accesses the real part of the complex number
+  CUTLASS_HOST_DEVICE
+  T const &real() const { return _real; }
+
+  /// Accesses the real part of the complex number
+  CUTLASS_HOST_DEVICE
+  T &real() { return _real; }
+
+  /// Accesses the imaginary part of the complex number
+  CUTLASS_HOST_DEVICE
+  T const &imag() const { return _imag; }
+
+  /// Accesses the imaginary part of the complex number
+  CUTLASS_HOST_DEVICE
+  T &imag() { return _imag; }
+
+  /// Set the real part of the complex number
+  CUTLASS_HOST_DEVICE
+  void real(T real) { _real = real; }
+
+  /// Set the imaginary part of the complex number
+  CUTLASS_HOST_DEVICE
+  void imag(T imag) { _imag = imag; }
+
+  #if !defined(__CUDACC_RTC__)
+  /// Converts to cuFloatComplex
+  CUTLASS_HOST_DEVICE
+  explicit operator cuFloatComplex() const { return make_cuFloatComplex(float(real()), float(imag())); }
+
+  /// Converts to cuDoubleComplex (explicit double casts, mirroring the float casts of the cuFloatComplex conversion)
+  CUTLASS_HOST_DEVICE
+  explicit operator cuDoubleComplex() const { return make_cuDoubleComplex(double(real()), double(imag())); }
+  #endif
+};
+
+// Complex conjugate
+template<class T>
+CUTLASS_HOST_DEVICE complex<T> conj(complex<T> const& z) {
+  return {z.real(), -z.imag()};
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+//
+// Accessors for complex template
+//
+
+// Nonmember real and imag need to work for non-complex numbers too.
+// That means cutlass::complex, std::complex, cuda::std::complex, and
+// any user-defined complex number type that looks like std::complex.
+// It's reasonable to assume that a "complex number type" has
+// zero-argument real() and imag() member functions returning
+// non-void.  While cuFloatComplex and cuDoubleComplex lack those
+// member functions, one-argument nonmember real and imag overloads
+// for those types are defined above.
+
+namespace detail {
+// Primary template: by default T is treated as having no usable zero-argument real() member.
+template <typename T, typename Enable = void>
+struct has_zero_argument_real_member_function :
+  cutlass::platform::false_type
+{};
+// Selected instead when declval<T>().real() is a well-formed expression whose type is not void.
+template <typename T>
+struct has_zero_argument_real_member_function<T,
+  cutlass::platform::enable_if_t<
+    ! cutlass::platform::is_void_v<
+      decltype(cutlass::platform::declval<T>().real())
+    >
+  >
+> : cutlass::platform::true_type
+{};
+// Variable-template shorthand for has_zero_argument_real_member_function<T>::value.
+template <typename T>
+constexpr bool has_zero_argument_real_member_function_v =
+  has_zero_argument_real_member_function<T>::value;
+// Primary template: by default T is treated as having no usable zero-argument imag() member.
+template <typename T, typename Enable = void>
+struct has_zero_argument_imag_member_function :
+  cutlass::platform::false_type
+{};
+// Selected instead when declval<T>().imag() is a well-formed expression whose type is not void.
+template <typename T>
+struct has_zero_argument_imag_member_function<T,
+  cutlass::platform::enable_if_t<
+    ! cutlass::platform::is_void_v<
+      decltype(cutlass::platform::declval<T>().imag())
+    >
+  >
+> : cutlass::platform::true_type
+{};
+// Variable-template shorthand for has_zero_argument_imag_member_function<T>::value.
+template <typename T>
+constexpr bool has_zero_argument_imag_member_function_v =
+  has_zero_argument_imag_member_function<T>::value;
+
+} // namespace detail
+
+template<typename T>  // real part of z; a non-complex z is returned unchanged
+CUTLASS_HOST_DEVICE auto real(T z) {
+  if constexpr (! detail::has_zero_argument_real_member_function_v<T>) {
+    return z;  // no real() member: the value is its own real part
+  } else {
+    return z.real();  // complex-like input: defer to the member accessor
+  }
+}
+  
+template<typename T>  // imaginary part of z; a value-initialized T for non-complex input
+CUTLASS_HOST_DEVICE auto imag(T z) {
+  if constexpr (! detail::has_zero_argument_imag_member_function_v<T>) {
+    // A non-complex input has an imaginary part of the same type as
+    // the input, with value zero.  CUTLASS assumes here that
+    // value-initializing T is well-formed and yields zero.
+    return T{};
+  } else {
+    return z.imag();
+  }
+}
+  
+//
+// Output operators
+//
+
+#if !defined(__CUDACC_RTC__)
+template <typename T>
+std::ostream &operator<<(std::ostream &out, complex<T> const &z) {
+  T re_part = real(z);
+  T im_part = imag(z);
+  // Streams "<re>+i<im>", or only "<re>" when the imaginary part converts to false.
+  if (!bool(im_part)) {
+    return out << re_part;
+  }
+  return out << re_part << "+i" << im_part;
+}
+#endif
+
+//
+// Non-member operators defined for complex types
+//
+
+
+//
+// Non-member functions defined for complex numbers
+//
+
+// abs returns the magnitude of the complex number.
+
+CUTLASS_HOST_DEVICE float abs(complex<float> const &z) {
+  return ::hypot(z.real(), z.imag());  // magnitude sqrt(re^2 + im^2) via the global-namespace hypot
+}
+
+CUTLASS_HOST_DEVICE double abs(complex<double> const &z) {
+  return ::hypot(z.real(), z.imag());  // magnitude sqrt(re^2 + im^2) via the global-namespace hypot
+}
+
+// In theory, it would make sense to add a complex<long double>
+// specialization of abs here, since hypot works for long double too.
+// In practice, long double doesn't have a portable number of bits or
+// behavior, so users who care about higher-precision floating-point
+// computation should probably insist on an actual FP128 type.
+
+template <typename T>
+CUTLASS_HOST_DEVICE T abs(complex<T> const &z) {
+  // Naive magnitude: sqrt(re^2 + im^2).  cutlass::complex accepts
+  // all kinds of T, including types without NaN.  For a generic
+  // floating-point type with Inf and/or NaN, LAPACK's DLAPY2
+  // algorithm would avoid unwarranted overflow when a component is
+  // slightly bigger than the square root of the max finite number;
+  // that is left as a possible future improvement.
+  // Bring cutlass::sqrt into scope and call sqrt unqualified (the
+  // "swap two-step" idiom) so argument-dependent lookup still applies.
+  T const re = z.real();
+  T const im = z.imag();
+  using cutlass::sqrt;
+  return sqrt(re * re + im * im);
+}
+
+/// Returns the phase angle (argument) of the complex number, computed as atan2(imag(z), real(z))
+template <typename T>
+CUTLASS_HOST_DEVICE T arg(complex<T> const &z) {
+  return atan2(imag(z), real(z));
+}
+
+/// Returns the squared magnitude of a real number, i.e. z * z
+template <typename T>
+CUTLASS_HOST_DEVICE T norm(T const &z) {
+    return z * z;
+}
+
+/// Returns the squared magnitude of a real number (explicit specialization for int8_t)
+template <>
+CUTLASS_HOST_DEVICE int8_t norm(int8_t const &z) {
+    return static_cast<int8_t>(z * z);  // z * z is evaluated as int, then narrowed back to int8_t
+}
+
+/// Returns the squared magnitude of a complex number; note that the return type is double for every T
+template <typename T>
+CUTLASS_HOST_DEVICE double norm(complex<T> const &z) {
+  return real(z) * real(z) + imag(z) * imag(z);  // re^2 + im^2, converted to double on return
+}
+
+/// Norm-accumulate calculation: returns accumulator + x^2, with x converted to R before squaring
+template <typename T, typename R>
+CUTLASS_HOST_DEVICE R norm_accumulate(T const &x, R const & accumulator) {
+  return accumulator + static_cast<R>(x) * static_cast<R>(x);
+}
+
+/// Norm accumulate for complex input: returns accumulator + re^2 + im^2, each component converted to R first
+template <typename T, typename R>
+CUTLASS_HOST_DEVICE R norm_accumulate(complex<T> const &z, R const &accumulator) {
+  R const re = static_cast<R>(real(z)), im = static_cast<R>(imag(z));
+  return accumulator + re * re + im * im;
+}
+
+namespace detail {
+// Tag-dispatched helpers called by the generic cutlass::conj(T const&).
+template<class T>
+CUTLASS_HOST_DEVICE T conj_impl(T const& z, cutlass::platform::true_type) {
+  return conj(z);  // unqualified call, so argument-dependent lookup takes part
+}
+// false_type overload: z is returned unchanged.
+template<class T>
+CUTLASS_HOST_DEVICE T conj_impl(T const& z, cutlass::platform::false_type) {
+  return z;
+}
+// Entry point: picks one of the two overloads above at compile time.
+template<class T>
+CUTLASS_HOST_DEVICE T conj_impl(T const& z) {
+  constexpr bool use_unqualified_conj =
+    ! cutlass::platform::is_arithmetic_v<T> &&
+    ! detail::has_cutlass_conj_v<T> &&
+    detail::has_unqualified_conj_v<T>;
+  return conj_impl(z, cutlass::platform::bool_constant<use_unqualified_conj>{});
+}
+  
+} // namespace detail
+
+// Return the complex conjugate of the input.
+//
+// This MUST be a function and not a function object, because it may
+// be common practice for downstream types to define specifically
+// cutlass::conj overloads, instead of overloads in their namespace.
+//
+// As a result of this being a function and not a function object,
+// CUTLASS code needs to declare "using cutlass::conj;" in scope and
+// then call this function unqualified, just like std::swap.
+//
+// If an overload already exists for cutlass::conj(T), that overload
+// will be called instead of this one.  Otherwise:
+//
+// 1. for arithmetic types, return z;
+//
+// 2. for types where (namespace-unqualified) conj(z) is well formed
+//    and cutlass::conj(z) is NOT well formed, return conj(z); and,
+//
+// 3. for everything else, return z.
+//
+// Regarding (1), the C++ Standard Library makes std::conj always
+// return std::complex, even for (noncomplex) arithmetic types.
+// cutlass::conj(T t) needs to return type T.  This follows the
+// convention of linear algebra software like the BLAS, where
+// "conjugate transpose" means the same thing as "transpose" for a
+// matrix of noncomplex numbers.
+//
+// Case (2) covers std::complex, cuda::std::complex, and non-Standard
+// (including user-defined) complex number types (for which "conj(z)"
+// is findable via argument-dependent lookup, but does not live in the
+// cutlass namespace).  It excludes cutlass::conj(z) in order to
+// prevent infinite recursion.
+//
+// Case (3) covers non-Standard non-complex number types.
+template<class T>
+CUTLASS_HOST_DEVICE T conj(T const& z) {
+  return detail::conj_impl(z);  // compile-time choice between returning z and calling unqualified conj(z)
+}
+
+/// Projects the complex number z onto the Riemann sphere: returns (2 re / d, 2 im / d) with d = re^2 + im^2 + 1
+template <typename T>
+CUTLASS_HOST_DEVICE complex<T> proj(complex<T> const &z) {
+  T d = real(z) * real(z) + imag(z) * imag(z) + T(1);  // shared denominator |z|^2 + 1
+  return complex<T>((T(2) * real(z)) / d, (T(2) * imag(z)) / d);
+}
+
+/// Returns a complex number with magnitude r and phase theta (theta defaults to a value-initialized T)
+template <typename T>
+CUTLASS_HOST_DEVICE complex<T> polar(T const &r, T const &theta = T()) {
+  return complex<T>(r * cos(theta), r * sin(theta));  // (r cos(theta), r sin(theta))
+}
+
+/// Computes the complex exponential of z as e^{re} * (cos(im) + i sin(im)), using fast_exp / fast_cos / fast_sin
+template <typename T>
+CUTLASS_HOST_DEVICE complex<T> exp(complex<T> const &z) {
+  return complex<T>(fast_exp(real(z)) * fast_cos(imag(z)), fast_exp(real(z)) * fast_sin(imag(z)));
+}
+
+/// Computes the log of z: the real part is log(abs(z)) and the imaginary part is arg(z)
+template <typename T>
+CUTLASS_HOST_DEVICE complex<T> log(complex<T> const &z) {
+  return complex<T>(log(abs(z)), arg(z));
+}
+
+/// Computes the log base 10 of z as log(z) / log(10)
+template <typename T>
+CUTLASS_HOST_DEVICE complex<T> log10(complex<T> const &z) {
+  return log(z) / T(log(T(10)));  // change of base: divide the complex log by the scalar log(10)
+}
+
+/// Computes the square root of complex number z as (sqrt(2)/2) * (sqrt(|z| + re), s * sqrt(|z| - re)), with s = -1 if im < 0, else +1
+template <typename T>
+CUTLASS_HOST_DEVICE complex<T> sqrt(complex<T> const &z) {
+  return sqrt(T(2)) / T(2) *
+         complex<T>(sqrt(sqrt(norm(z)) + real(z)),  // sqrt(norm(z)) is the magnitude |z|
+                    (imag(z) < 0 ? T(-1) : T(1)) * sqrt(sqrt(norm(z)) - real(z)));
+}
+
+/// Computes the cosine of complex z: cos(z) = (e^{iz} + e^{-iz}) / 2, where iz = (-imag(z), real(z)).
+template <typename T>
+CUTLASS_HOST_DEVICE complex<T> cos(complex<T> const &z) {
+  return (exp(complex<T>(-imag(z), real(z))) + exp(complex<T>(imag(z), -real(z)))) / T(2);  // exp(iz) + exp(-iz); using exp(z) here would yield cosh(z)
+}
+
+/// Computes the sine of complex z: sin(z) = (e^{-iz} - e^{iz}) * (i / 2), where iz = (-imag(z), real(z)).
+template <typename T>
+CUTLASS_HOST_DEVICE complex<T> sin(complex<T> const &z) {
+  return (exp(complex<T>(imag(z), -real(z))) - exp(complex<T>(-imag(z), real(z)))) * complex<T>(T(0), T(1) / T(2));  // exp(-iz) - exp(iz); using exp(z) here would yield -i sinh(z)
+}
+
+/// Comparison stub: returns true for every pair of operands.  NOTE(review): this is not a real ordering -- confirm callers only need it to compile.
+template <typename T>
+CUTLASS_HOST_DEVICE bool operator<(complex<T> const &lhs, complex<T> const &rhs) {
+  return true;
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for complex-valued type.
+template <typename T>
+struct RealType< complex<T> >
+{
+  using Type = T;  // the scalar type of each component
+
+  /// Number of elements
+  static int const kExtent = 2;
+
+  CUTLASS_HOST_DEVICE
+  static complex<T> from_real(double x) {
+    return complex<T>(static_cast<T>(x));  // x is converted to T and passed to the single-argument complex<T> constructor
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <>
+CUTLASS_HOST_DEVICE
+cutlass::complex<half_t> from_real<cutlass::complex<half_t> >(double r) {
+  return cutlass::complex<half_t>(static_cast<half_t>(r));  // convert r to half_t, then build the complex from that single value
+}
+
+template <>
+CUTLASS_HOST_DEVICE
+cutlass::complex<float> from_real<cutlass::complex<float> >(double r) {
+  return cutlass::complex<float>(static_cast<float>(r));  // convert r to float, then build the complex from that single value
+}
+
+template <>
+CUTLASS_HOST_DEVICE
+cutlass::complex<double> from_real<cutlass::complex<double> >(double r) {
+  return cutlass::complex<double>(r);  // r is already a double, so it is passed through without a cast
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename T>  // primary template: value is false for any T not matched below
+struct is_complex {
+  static bool const value = false;
+};
+// Partial specialization: value is true when the type is exactly complex<T>.
+template <typename T>
+struct is_complex<complex<T>> {
+  static bool const value = true;
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// functional.h numeric specializations
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Squared magnitude of a complex value, with both components converted to Output first
+template <typename T, typename Output>
+struct magnitude_squared<complex<T>, Output> {
+  CUTLASS_HOST_DEVICE
+  Output operator()(complex<T> lhs) const {
+    multiplies<Output> product;
+    // Convert each component to Output before squaring.
+    Output re = Output(lhs.real());
+    Output im = Output(lhs.imag());
+    // |lhs|^2 = re^2 + im^2
+    return product(re, re) + product(im, im);
+  }
+};
+
+/// Multiply-add for complex operands: returns a * b + c, written here as separate real-valued multiplies and adds
+template <typename T>
+struct multiply_add<complex<T>, complex<T>, complex<T>> {
+  CUTLASS_HOST_DEVICE
+  complex<T> operator()(
+    complex<T> const &a,
+    complex<T> const &b,
+    complex<T> const &c) const {
+    // Both accumulators start from the addend c.
+    T real = c.real();
+    T imag = c.imag();
+    // a * b = (a.re b.re - a.im b.im) + i (a.re b.im + a.im b.re)
+    real += a.real() * b.real();
+    real += -a.imag() * b.imag();
+    imag += a.real() * b.imag();
+    imag += a.imag () * b.real();
+
+    return complex<T>{
+      real,
+      imag
+    };
+  }
+};
+
+/// Multiply-add for a complex a and a real scalar b: returns a * b + c, written here as separate multiplies and adds
+template <typename T>
+struct multiply_add<complex<T>, T, complex<T>> {
+  CUTLASS_HOST_DEVICE
+  complex<T> operator()(
+    complex<T> const &a,
+    T const &b,
+    complex<T> const &c) const {
+    // Both accumulators start from the addend c.
+    T real = c.real();
+    T imag = c.imag();
+    // The real scalar b scales each component of a independently.
+    real += a.real() * b;
+    imag += a.imag () * b;
+
+    return complex<T>{
+      real,
+      imag
+    };
+  }
+};
+
+/// Multiply-add for a real scalar a and a complex b: returns a * b + c, written here as separate multiplies and adds
+template <typename T>
+struct multiply_add<T, complex<T>, complex<T>> {
+  CUTLASS_HOST_DEVICE
+  complex<T> operator()(
+    T const &a,
+    complex<T> const &b,
+    complex<T> const &c) const {
+    // Both accumulators start from the addend c.
+    T real = c.real();
+    T imag = c.imag();
+    // The real scalar a scales each component of b independently.
+    real += a * b.real();
+    imag += a * b.imag();
+
+    return complex<T>{
+      real,
+      imag
+    };
+  }
+};
+
+/// Conjugate functor for cutlass::complex<T>: forwards to cutlass::conj
+template <typename T>
+struct conjugate<complex<T>>  {
+  CUTLASS_HOST_DEVICE
+  complex<T> operator()(complex<T> const &a) const {
+    // Invoke the complex<T> overload specifically, rather than
+    // wasting the compiler's effort on overload resolution.
+    return cutlass::conj(a);
+  }
+};
+
+#if ! defined(__CUDACC_RTC__)
+template <>  // conjugate functor for cuFloatComplex
+struct conjugate<cuFloatComplex>  {
+  CUTLASS_HOST_DEVICE
+  cuFloatComplex operator()(cuFloatComplex const& z) const {
+    return make_cuFloatComplex(z.x, -z.y);  // keep .x, negate .y
+  }
+};
+
+template <>  // conjugate functor for cuDoubleComplex
+struct conjugate<cuDoubleComplex>  {
+  CUTLASS_HOST_DEVICE
+  cuDoubleComplex operator()(cuDoubleComplex const& z) const {
+    return make_cuDoubleComplex(z.x, -z.y);  // keep .x, negate .y
+  }
+};
+#endif
+  
+/// Squared magnitude of (lhs - rhs), with every component converted to Output before subtracting
+template <typename T, typename Output>
+struct magnitude_squared_difference<complex<T>, Output> {
+  CUTLASS_HOST_DEVICE
+  Output operator()(complex<T> lhs, complex<T> rhs) const {
+    multiplies<Output> product;
+    // Component-wise difference, taken in the Output type.
+    Output delta_re = Output(lhs.real()) - Output(rhs.real());
+    Output delta_im = Output(lhs.imag()) - Output(rhs.imag());
+    // |lhs - rhs|^2 = delta_re^2 + delta_im^2
+    return product(delta_re, delta_re) + product(delta_im, delta_im);
+  }
+};
+
+/// Reduces value into the data pointed to by ptr (complex<T> specialization); device-only (CUTLASS_DEVICE)
+template <typename T>
+struct atomic_add<complex<T>> {
+  CUTLASS_DEVICE
+  void operator()(complex<T> *ptr, const complex<T> &data)
+  {
+    data.red(ptr);  // delegates to complex<T>::red; NOTE(review): red is defined outside this excerpt -- presumably the atomic reduction into *ptr, confirm
+  }
+};
+
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace cutlass
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/constants.h b/lightllm-kernel/cutlass/include/cutlass/constants.h
new file mode 100755
index 000000000..49d96045a
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/constants.h
@@ -0,0 +1,1239 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *                                                                                                  
+ **************************************************************************************************/
+
+/*! \file
+  \brief Boost-style constant definitions for floating-point types.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+
+#include "cutlass/complex.h"
+
+///////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace constants {
+
+///////////////////////////////////////////////////////////////////////////////////
+
+//
+// Primary templates
+//
+
+/// Returns 1, the multiplicative identity element
+template <typename T> CUTLASS_HOST_DEVICE T one();
+
+/// Returns 0, the additive identity element
+template <typename T> CUTLASS_HOST_DEVICE T zero();
+
+/// Returns 2
+template <typename T> CUTLASS_HOST_DEVICE T two();
+
+/// Returns pi, approximately 3.141
+template <typename T> CUTLASS_HOST_DEVICE T pi();
+
+/// Returns 2 * pi
+template <typename T> CUTLASS_HOST_DEVICE T two_pi();
+
+/// Returns pi / 2
+template <typename T> CUTLASS_HOST_DEVICE T half_pi();
+
+/// Returns sqrt(pi)
+template <typename T> CUTLASS_HOST_DEVICE T root_pi();
+
+/// Returns sqrt(pi / 2)
+template <typename T> CUTLASS_HOST_DEVICE T root_half_pi();
+
+/// Returns sqrt(2 * pi)
+template <typename T> CUTLASS_HOST_DEVICE T root_two_pi();
+
+/// Returns sqrt(ln(4))
+template <typename T> CUTLASS_HOST_DEVICE T root_ln_four();
+
+/// Returns e, approximately 2.718...
+template <typename T> CUTLASS_HOST_DEVICE T e();
+
+/// Returns (1/2)
+template <typename T> CUTLASS_HOST_DEVICE T half();
+
+/// Returns sqrt(2), approximately 1.414...
+template <typename T> CUTLASS_HOST_DEVICE T root_two();
+
+/// Returns sqrt(2)/2, approximately 0.707...
+template <typename T> CUTLASS_HOST_DEVICE T half_root_two();
+
+/// Returns ln(2), approximately 0.693...
+template <typename T> CUTLASS_HOST_DEVICE T ln_two();
+
+/// Returns ln(ln(2)), approximately -0.3665...
+template <typename T> CUTLASS_HOST_DEVICE T ln_ln_two();
+
+/// Returns 1/3, approximately 0.333...
+template <typename T> CUTLASS_HOST_DEVICE T third();
+
+/// Returns 2/3, approximately 0.666...
+template <typename T> CUTLASS_HOST_DEVICE T twothirds();
+
+/// Returns pi - 3, approximately 0.1416...
+template <typename T> CUTLASS_HOST_DEVICE T pi_minus_three();
+
+/// Returns 4 - pi, approximately 0.858...
+template <typename T> CUTLASS_HOST_DEVICE T four_minus_pi();
+
+
+/////////////////////////////////////////////////////////////////////////////////////
+
+// Specialization for double
+
+/// 1, the multiplicative identity, as double (built from its IEEE-754 bit pattern)
+template <> CUTLASS_HOST_DEVICE double one<double>() {
+  uint64_t const encoded = 0x3ff0000000000000ull;
+  return reinterpret_cast<double const &>(encoded);
+}
+
+/// 1, the multiplicative identity, as complex<double> (imaginary part zero)
+template <> CUTLASS_HOST_DEVICE complex<double> one< complex<double> >() {
+  return complex<double>(one<double>(), 0.0);
+}
+
+/// 0, the additive identity, as double (built from its IEEE-754 bit pattern)
+template <> CUTLASS_HOST_DEVICE double zero<double>() {
+  uint64_t const encoded = 0x0ull;
+  return reinterpret_cast<double const &>(encoded);
+}
+
+/// 0, the additive identity, as complex<double> (imaginary part zero)
+template <> CUTLASS_HOST_DEVICE complex<double> zero< complex<double> >() {
+  return complex<double>(zero<double>(), 0.0);
+}
+
+/// 2 as double (built from its IEEE-754 bit pattern)
+template <> CUTLASS_HOST_DEVICE double two<double>() {
+  uint64_t const encoded = 0x4000000000000000ull;
+  return reinterpret_cast<double const &>(encoded);
+}
+
+/// 2 as complex<double> (imaginary part zero)
+template <> CUTLASS_HOST_DEVICE complex<double> two< complex<double> >() {
+  return complex<double>(two<double>(), 0.0);
+}
+
+/// pi, approximately 3.141, as double (built from its IEEE-754 bit pattern)
+template <> CUTLASS_HOST_DEVICE double pi<double>() {
+  uint64_t const encoded = 0x400921fb54442d18ull;
+  return reinterpret_cast<double const &>(encoded);
+}
+
+/// pi, approximately 3.141, as complex<double> (imaginary part zero)
+template <> CUTLASS_HOST_DEVICE complex<double> pi< complex<double> >() {
+  return complex<double>(pi<double>(), 0.0);
+}
+
+/// 2 * pi as double (built from its IEEE-754 bit pattern)
+template <> CUTLASS_HOST_DEVICE double two_pi<double>() {
+  uint64_t const encoded = 0x401921fb54442d18ull;
+  return reinterpret_cast<double const &>(encoded);
+}
+
+/// 2 * pi as complex<double> (imaginary part zero)
+template <> CUTLASS_HOST_DEVICE complex<double> two_pi< complex<double> >() {
+  return complex<double>(two_pi<double>(), 0.0);
+}
+
+/// pi / 2 as double (built from its IEEE-754 bit pattern)
+template <> CUTLASS_HOST_DEVICE double half_pi<double>() {
+  uint64_t const encoded = 0x3ff921fb54442d18ull;
+  return reinterpret_cast<double const &>(encoded);
+}
+
+/// pi / 2 as complex<double> (imaginary part zero)
+template <> CUTLASS_HOST_DEVICE complex<double> half_pi< complex<double> >() {
+  return complex<double>(half_pi<double>(), 0.0);
+}
+
+/// sqrt(pi) as double (built from its IEEE-754 bit pattern)
+template <> CUTLASS_HOST_DEVICE double root_pi<double>() {
+  uint64_t const encoded = 0x3ffc5bf891b4ef6aull;
+  return reinterpret_cast<double const &>(encoded);
+}
+
+/// sqrt(pi) as complex<double> (imaginary part zero)
+template <> CUTLASS_HOST_DEVICE complex<double> root_pi< complex<double> >() {
+  return complex<double>(root_pi<double>(), 0.0);
+}
+
+/// sqrt(pi / 2) as double (built from its IEEE-754 bit pattern)
+template <> CUTLASS_HOST_DEVICE double root_half_pi<double>() {
+  uint64_t const encoded = 0x3ff40d931ff62705ull;
+  return reinterpret_cast<double const &>(encoded);
+}
+
+/// sqrt(pi / 2) as complex<double> (imaginary part zero)
+template <> CUTLASS_HOST_DEVICE complex<double> root_half_pi< complex<double> >() {
+  return complex<double>(root_half_pi<double>(), 0.0);
+}
+
+/// sqrt(2 * pi) as double (built from its IEEE-754 bit pattern)
+template <> CUTLASS_HOST_DEVICE double root_two_pi<double>() {
+  uint64_t const encoded = 0x40040d931ff62705ull;
+  return reinterpret_cast<double const &>(encoded);
+}
+
+/// sqrt(2 * pi) as complex<double> (imaginary part zero)
+template <> CUTLASS_HOST_DEVICE complex<double> root_two_pi< complex<double> >() {
+  return complex<double>(root_two_pi<double>(), 0.0);
+}
+
+/// sqrt(ln(4)) as double (built from its IEEE-754 bit pattern)
+template <> CUTLASS_HOST_DEVICE double root_ln_four<double>() {
+  uint64_t const encoded = 0x3ff2d6abe44afc43ull;
+  return reinterpret_cast<double const &>(encoded);
+}
+
+/// sqrt(ln(4)) as complex<double> (imaginary part zero)
+template <> CUTLASS_HOST_DEVICE complex<double> root_ln_four< complex<double> >() {
+  return complex<double>(root_ln_four<double>(), 0.0);
+}
+
+/// e, approximately 2.718..., as double (built from its IEEE-754 bit pattern)
+template <> CUTLASS_HOST_DEVICE double e<double>() {
+  uint64_t const encoded = 0x4005bf0a8b145769ull;
+  return reinterpret_cast<double const &>(encoded);
+}
+
+/// e, approximately 2.718..., as complex<double> (imaginary part zero)
+template <> CUTLASS_HOST_DEVICE complex<double> e< complex<double> >() {
+  return complex<double>(e<double>(), 0.0);
+}
+
+/// 1/2 as double (built from its IEEE-754 bit pattern)
+template <> CUTLASS_HOST_DEVICE double half<double>() {
+  uint64_t const encoded = 0x3fe0000000000000ull;
+  return reinterpret_cast<double const &>(encoded);
+}
+
+/// 1/2 as complex<double> (imaginary part zero)
+template <> CUTLASS_HOST_DEVICE complex<double> half< complex<double> >() {
+  return complex<double>(half<double>(), 0.0);
+}
+
+/// sqrt(2), approximately 1.414..., as double (built from its IEEE-754 bit pattern)
+template <> CUTLASS_HOST_DEVICE double root_two<double>() {
+  uint64_t const encoded = 0x3ff6a09e667f3bcdull;
+  return reinterpret_cast<double const &>(encoded);
+}
+
+/// sqrt(2), approximately 1.414..., as complex<double> (imaginary part zero)
+template <> CUTLASS_HOST_DEVICE complex<double> root_two< complex<double> >() {
+  return complex<double>(root_two<double>(), 0.0);
+}
+
+/// sqrt(2)/2, approximately 0.707..., as double (built from its IEEE-754 bit pattern)
+template <> CUTLASS_HOST_DEVICE double half_root_two<double>() {
+  uint64_t const encoded = 0x3fe6a09e667f3bcdull;
+  return reinterpret_cast<double const &>(encoded);
+}
+
+/// sqrt(2)/2, approximately 0.707..., as complex<double> (imaginary part zero)
+template <> CUTLASS_HOST_DEVICE complex<double> half_root_two< complex<double> >() {
+  return complex<double>(half_root_two<double>(), 0.0);
+}
+
+/// ln(2), approximately 0.693..., as double (built from its IEEE-754 bit pattern)
+template <> CUTLASS_HOST_DEVICE double ln_two<double>() {
+  uint64_t const encoded = 0x3fe62e42fefa39efull;
+  return reinterpret_cast<double const &>(encoded);
+}
+
+/// ln(2), approximately 0.693..., as complex<double> (imaginary part zero)
+template <> CUTLASS_HOST_DEVICE complex<double> ln_two< complex<double> >() {
+  return complex<double>(ln_two<double>(), 0.0);
+}
+
+/// ln(ln(2)), approximately -0.3665..., as double (built from its IEEE-754 bit pattern)
+template <> CUTLASS_HOST_DEVICE double ln_ln_two<double>() {
+  uint64_t const encoded = 0xbfd774f29bdd6b9full;
+  return reinterpret_cast<double const &>(encoded);
+}
+
+/// ln(ln(2)), approximately -0.3665..., as complex<double> (imaginary part zero)
+template <> CUTLASS_HOST_DEVICE complex<double> ln_ln_two< complex<double> >() {
+  return complex<double>(ln_ln_two<double>(), 0.0);
+}
+
+/// 1/3, approximately 0.333..., as double (built from its IEEE-754 bit pattern)
+template <> CUTLASS_HOST_DEVICE double third<double>() {
+  uint64_t const encoded = 0x3fd5555555555555ull;
+  return reinterpret_cast<double const &>(encoded);
+}
+
+/// 1/3, approximately 0.333..., as complex<double> (imaginary part zero)
+template <> CUTLASS_HOST_DEVICE complex<double> third< complex<double> >() {
+  return complex<double>(third<double>(), 0.0);
+}
+
+/// 2/3, approximately 0.666..., as double (built from its IEEE-754 bit pattern)
+template <> CUTLASS_HOST_DEVICE double twothirds<double>() {
+  uint64_t const encoded = 0x3fe5555555555555ull;
+  return reinterpret_cast<double const &>(encoded);
+}
+
+/// 2/3, approximately 0.666..., as complex<double> (imaginary part zero)
+template <> CUTLASS_HOST_DEVICE complex<double> twothirds< complex<double> >() {
+  return complex<double>(twothirds<double>(), 0.0);
+}
+
+/// pi - 3, approximately 0.1416..., as double (built from its IEEE-754 bit pattern)
+template <> CUTLASS_HOST_DEVICE double pi_minus_three<double>() {
+  uint64_t const encoded = 0x3fc21fb54442d180ull;
+  return reinterpret_cast<double const &>(encoded);
+}
+
+/// pi - 3, approximately 0.1416..., as complex<double> (imaginary part zero)
+template <> CUTLASS_HOST_DEVICE complex<double> pi_minus_three< complex<double> >() {
+  return complex<double>(pi_minus_three<double>(), 0.0);
+}
+
+/// 4 - pi, approximately 0.858..., as double (built from its IEEE-754 bit pattern)
+template <> CUTLASS_HOST_DEVICE double four_minus_pi<double>() {
+  uint64_t const encoded = 0x3feb7812aeef4ba0ull;
+  return reinterpret_cast<double const &>(encoded);
+}
+
+/// 4 - pi, approximately 0.858..., as complex<double> (imaginary part zero)
+template <> CUTLASS_HOST_DEVICE complex<double> four_minus_pi< complex<double> >() {
+  return complex<double>(four_minus_pi<double>(), 0.0);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////
+
+// Specialization for float
+
+/// 1, the multiplicative identity, as float (built from its IEEE-754 bit pattern)
+template <> CUTLASS_HOST_DEVICE float one<float>() {
+  uint32_t const encoded = 0x3f800000u;
+  return reinterpret_cast<float const &>(encoded);
+}
+
+/// 1, the multiplicative identity, as complex<float> (imaginary part zero)
+template <> CUTLASS_HOST_DEVICE complex<float> one< complex<float> >() {
+  return complex<float>(one<float>(), 0.0f);
+}
+
+/// 0, the additive identity, as float (built from its IEEE-754 bit pattern)
+template <> CUTLASS_HOST_DEVICE float zero<float>() {
+  uint32_t const encoded = 0x0u;
+  return reinterpret_cast<float const &>(encoded);
+}
+
+/// 0, the additive identity, as complex<float> (imaginary part zero)
+template <> CUTLASS_HOST_DEVICE complex<float> zero< complex<float> >() {
+  return complex<float>(zero<float>(), 0.0f);
+}
+
+/// 2 as float (built from its IEEE-754 bit pattern)
+template <> CUTLASS_HOST_DEVICE float two<float>() {
+  uint32_t const encoded = 0x40000000u;
+  return reinterpret_cast<float const &>(encoded);
+}
+
+/// 2 as complex<float> (imaginary part zero)
+template <> CUTLASS_HOST_DEVICE complex<float> two< complex<float> >() {
+  return complex<float>(two<float>(), 0.0f);
+}
+
+/// pi, approximately 3.141, as float (built from its IEEE-754 bit pattern)
+template <> CUTLASS_HOST_DEVICE float pi<float>() {
+  uint32_t const encoded = 0x40490fdbu;
+  return reinterpret_cast<float const &>(encoded);
+}
+
+/// pi, approximately 3.141, as complex<float> (imaginary part zero)
+template <> CUTLASS_HOST_DEVICE complex<float> pi< complex<float> >() {
+  return complex<float>(pi<float>(), 0.0f);
+}
+
+/// 2 * pi as float (built from its IEEE-754 bit pattern)
+template <> CUTLASS_HOST_DEVICE float two_pi<float>() {
+  uint32_t const encoded = 0x40c90fdbu;
+  return reinterpret_cast<float const &>(encoded);
+}
+
+/// 2 * pi as complex<float> (imaginary part zero)
+template <> CUTLASS_HOST_DEVICE complex<float> two_pi< complex<float> >() {
+  return complex<float>(two_pi<float>(), 0.0f);
+}
+
+/// pi / 2 as float (built from its IEEE-754 bit pattern)
+template <> CUTLASS_HOST_DEVICE float half_pi<float>() {
+  uint32_t const encoded = 0x3fc90fdbu;
+  return reinterpret_cast<float const &>(encoded);
+}
+
+/// pi / 2 as complex<float> (imaginary part zero)
+template <> CUTLASS_HOST_DEVICE complex<float> half_pi< complex<float> >() {
+  return complex<float>(half_pi<float>(), 0.0f);
+}
+
+/// sqrt(pi) as float (built from its IEEE-754 bit pattern)
+template <> CUTLASS_HOST_DEVICE float root_pi<float>() {
+  uint32_t const encoded = 0x3fe2dfc5u;
+  return reinterpret_cast<float const &>(encoded);
+}
+
+/// sqrt(pi) as complex<float> (imaginary part zero)
+template <> CUTLASS_HOST_DEVICE complex<float> root_pi< complex<float> >() {
+  return complex<float>(root_pi<float>(), 0.0f);
+}
+
+/// sqrt(pi / 2) as float (built from its IEEE-754 bit pattern)
+template <> CUTLASS_HOST_DEVICE float root_half_pi<float>() {
+  uint32_t const encoded = 0x3fa06c99u;
+  return reinterpret_cast<float const &>(encoded);
+}
+
+/// sqrt(pi / 2) as complex<float> (imaginary part zero)
+template <> CUTLASS_HOST_DEVICE complex<float> root_half_pi< complex<float> >() {
+  return complex<float>(root_half_pi<float>(), 0.0f);
+}
+
+/// sqrt(2 * pi) as float (built from its IEEE-754 bit pattern)
+template <> CUTLASS_HOST_DEVICE float root_two_pi<float>() {
+  uint32_t const encoded = 0x40206c99u;
+  return reinterpret_cast<float const &>(encoded);
+}
+
+/// sqrt(2 * pi) as complex<float> (imaginary part zero)
+template <> CUTLASS_HOST_DEVICE complex<float> root_two_pi< complex<float> >() {
+  return complex<float>(root_two_pi<float>(), 0.0f);
+}
+
+/// Returns sqrt(ln(4))  (specialization for float)
+template <> CUTLASS_HOST_DEVICE float root_ln_four<float>() {
+  uint32_t bits = 0x3f96b55fu;
+  return reinterpret_cast<float const &>(bits);
+}
+
+/// Returns sqrt(ln(4))  (specialization for complex<float>)
+template <> CUTLASS_HOST_DEVICE complex<float> root_ln_four< complex<float> >() {
+  return complex<float>(root_ln_four<float>(), float());
+}
+
+/// Returns e, approximately 2.718...  (specialization for float)
+template <> CUTLASS_HOST_DEVICE float e<float>() {
+  uint32_t bits = 0x402df854u;
+  return reinterpret_cast<float const &>(bits);
+}
+
+/// Returns e, approximately 2.718...  (specialization for complex<float>)
+template <> CUTLASS_HOST_DEVICE complex<float> e< complex<float> >() {
+  return complex<float>(e<float>(), float());
+}
+
+/// Returns (1/2)  (specialization for float)
+template <> CUTLASS_HOST_DEVICE float half<float>() {
+  uint32_t bits = 0x3f000000u;
+  return reinterpret_cast<float const &>(bits);
+}
+
+/// Returns (1/2)  (specialization for complex<float>)
+template <> CUTLASS_HOST_DEVICE complex<float> half< complex<float> >() {
+  return complex<float>(half<float>(), float());
+}
+
+/// Returns sqrt(2), approximately 1.414...  (specialization for float)
+template <> CUTLASS_HOST_DEVICE float root_two<float>() {
+  uint32_t bits = 0x3fb504f3u;
+  return reinterpret_cast<float const &>(bits);
+}
+
+/// Returns sqrt(2), approximately 1.414...  (specialization for complex<float>)
+template <> CUTLASS_HOST_DEVICE complex<float> root_two< complex<float> >() {
+  return complex<float>(root_two<float>(), float());
+}
+
+/// float specialization: sqrt(2)/2 (approximately 0.707...), obtained by reinterpreting a fixed 32-bit pattern.
+template <> CUTLASS_HOST_DEVICE float half_root_two<float>() {
+  uint32_t const raw = 0x3f3504f3u;
+  return *reinterpret_cast<float const *>(&raw);
+}
+
+/// Returns sqrt(2)/2 (approximately 0.707...) for complex<float>: half_root_two<float>() paired with a value-initialized float().
+template <> CUTLASS_HOST_DEVICE complex<float> half_root_two< complex<float> >() {
+  return complex<float>(half_root_two<float>(), float());
+}
+
+/// float specialization: ln(2) (approximately 0.693...), obtained by reinterpreting a fixed 32-bit pattern.
+template <> CUTLASS_HOST_DEVICE float ln_two<float>() {
+  uint32_t const raw = 0x3f317218u;
+  return *reinterpret_cast<float const *>(&raw);
+}
+
+/// Returns ln(2) (approximately 0.693...) for complex<float>: ln_two<float>() paired with a value-initialized float().
+template <> CUTLASS_HOST_DEVICE complex<float> ln_two< complex<float> >() {
+  return complex<float>(ln_two<float>(), float());
+}
+
+/// float specialization: ln(ln(2)) (approximately -0.3665...), obtained by reinterpreting a fixed 32-bit pattern.
+template <> CUTLASS_HOST_DEVICE float ln_ln_two<float>() {
+  uint32_t const raw = 0xbebba795u;
+  return *reinterpret_cast<float const *>(&raw);
+}
+
+/// Returns ln(ln(2)) (approximately -0.3665...) for complex<float>: ln_ln_two<float>() paired with a value-initialized float().
+template <> CUTLASS_HOST_DEVICE complex<float> ln_ln_two< complex<float> >() {
+  return complex<float>(ln_ln_two<float>(), float());
+}
+
+/// float specialization: 1/3 (approximately 0.333...), obtained by reinterpreting a fixed 32-bit pattern.
+template <> CUTLASS_HOST_DEVICE float third<float>() {
+  uint32_t const raw = 0x3eaaaaabu;
+  return *reinterpret_cast<float const *>(&raw);
+}
+
+/// Returns 1/3 (approximately 0.333...) for complex<float>: third<float>() paired with a value-initialized float().
+template <> CUTLASS_HOST_DEVICE complex<float> third< complex<float> >() {
+  return complex<float>(third<float>(), float());
+}
+
+/// float specialization: 2/3 (approximately 0.666...), obtained by reinterpreting a fixed 32-bit pattern.
+template <> CUTLASS_HOST_DEVICE float twothirds<float>() {
+  uint32_t const raw = 0x3f2aaaabu;
+  return *reinterpret_cast<float const *>(&raw);
+}
+
+/// Returns 2/3 (approximately 0.666...) for complex<float>: twothirds<float>() paired with a value-initialized float().
+template <> CUTLASS_HOST_DEVICE complex<float> twothirds< complex<float> >() {
+  return complex<float>(twothirds<float>(), float());
+}
+
+/// float specialization: pi - 3 (approximately 0.1416...), obtained by reinterpreting a fixed 32-bit pattern.
+template <> CUTLASS_HOST_DEVICE float pi_minus_three<float>() {
+  uint32_t const raw = 0x3e10fdaau;
+  return *reinterpret_cast<float const *>(&raw);
+}
+
+/// Returns pi - 3 (approximately 0.1416...) for complex<float>: pi_minus_three<float>() paired with a value-initialized float().
+template <> CUTLASS_HOST_DEVICE complex<float> pi_minus_three< complex<float> >() {
+  return complex<float>(pi_minus_three<float>(), float());
+}
+
+/// float specialization: 4 - pi (approximately 0.858...), obtained by reinterpreting a fixed 32-bit pattern.
+template <> CUTLASS_HOST_DEVICE float four_minus_pi<float>() {
+  uint32_t const raw = 0x3f5bc095u;
+  return *reinterpret_cast<float const *>(&raw);
+}
+
+/// Returns 4 - pi (approximately 0.858...) for complex<float>: four_minus_pi<float>() paired with a value-initialized float().
+template <> CUTLASS_HOST_DEVICE complex<float> four_minus_pi< complex<float> >() {
+  return complex<float>(four_minus_pi<float>(), float());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////
+
+// Specialization for tfloat32_t
+
+/// tfloat32_t specialization: 1 (the multiplicative identity element), from a fixed 32-bit pattern.
+template <> CUTLASS_HOST_DEVICE tfloat32_t one<tfloat32_t>() {
+  uint32_t const raw = 0x3f801000u;
+  return *reinterpret_cast<tfloat32_t const *>(&raw);
+}
+
+/// Returns 1 (multiplicative identity) for complex<tfloat32_t>: one<tfloat32_t>() paired with a value-initialized tfloat32_t().
+template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> one< complex<tfloat32_t> >() {
+  return complex<tfloat32_t>(one<tfloat32_t>(), tfloat32_t());
+}
+
+/// tfloat32_t specialization: 0 (the additive identity element); note the stored pattern is 0x1000, not all-zero bits.
+template <> CUTLASS_HOST_DEVICE tfloat32_t zero<tfloat32_t>() {
+  uint32_t const raw = 0x1000u;
+  return *reinterpret_cast<tfloat32_t const *>(&raw);
+}
+
+/// Returns 0 (additive identity) for complex<tfloat32_t>: zero<tfloat32_t>() paired with a value-initialized tfloat32_t().
+template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> zero< complex<tfloat32_t> >() {
+  return complex<tfloat32_t>(zero<tfloat32_t>(), tfloat32_t());
+}
+
+/// tfloat32_t specialization: 2, obtained by reinterpreting a fixed 32-bit pattern.
+template <> CUTLASS_HOST_DEVICE tfloat32_t two<tfloat32_t>() {
+  uint32_t const raw = 0x40001000u;
+  return *reinterpret_cast<tfloat32_t const *>(&raw);
+}
+
+/// Returns 2 for complex<tfloat32_t>: two<tfloat32_t>() paired with a value-initialized tfloat32_t().
+template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> two< complex<tfloat32_t> >() {
+  return complex<tfloat32_t>(two<tfloat32_t>(), tfloat32_t());
+}
+
+/// tfloat32_t specialization: pi (approximately 3.141), obtained by reinterpreting a fixed 32-bit pattern.
+template <> CUTLASS_HOST_DEVICE tfloat32_t pi<tfloat32_t>() {
+  uint32_t const raw = 0x40491fdbu;
+  return *reinterpret_cast<tfloat32_t const *>(&raw);
+}
+
+/// Returns pi (approximately 3.141) for complex<tfloat32_t>: pi<tfloat32_t>() paired with a value-initialized tfloat32_t().
+template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> pi< complex<tfloat32_t> >() {
+  return complex<tfloat32_t>(pi<tfloat32_t>(), tfloat32_t());
+}
+
+/// tfloat32_t specialization: 2 * pi, obtained by reinterpreting a fixed 32-bit pattern.
+template <> CUTLASS_HOST_DEVICE tfloat32_t two_pi<tfloat32_t>() {
+  uint32_t const raw = 0x40c91fdbu;
+  return *reinterpret_cast<tfloat32_t const *>(&raw);
+}
+
+/// Returns 2 * pi for complex<tfloat32_t>: two_pi<tfloat32_t>() paired with a value-initialized tfloat32_t().
+template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> two_pi< complex<tfloat32_t> >() {
+  return complex<tfloat32_t>(two_pi<tfloat32_t>(), tfloat32_t());
+}
+
+/// tfloat32_t specialization: pi / 2, obtained by reinterpreting a fixed 32-bit pattern.
+template <> CUTLASS_HOST_DEVICE tfloat32_t half_pi<tfloat32_t>() {
+  uint32_t const raw = 0x3fc91fdbu;
+  return *reinterpret_cast<tfloat32_t const *>(&raw);
+}
+
+/// Returns pi / 2 for complex<tfloat32_t>: half_pi<tfloat32_t>() paired with a value-initialized tfloat32_t().
+template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> half_pi< complex<tfloat32_t> >() {
+  return complex<tfloat32_t>(half_pi<tfloat32_t>(), tfloat32_t());
+}
+
+/// tfloat32_t specialization: sqrt(pi), obtained by reinterpreting a fixed 32-bit pattern.
+template <> CUTLASS_HOST_DEVICE tfloat32_t root_pi<tfloat32_t>() {
+  uint32_t const raw = 0x3fe2efc5u;
+  return *reinterpret_cast<tfloat32_t const *>(&raw);
+}
+
+/// Returns sqrt(pi) for complex<tfloat32_t>: root_pi<tfloat32_t>() paired with a value-initialized tfloat32_t().
+template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> root_pi< complex<tfloat32_t> >() {
+  return complex<tfloat32_t>(root_pi<tfloat32_t>(), tfloat32_t());
+}
+
+/// tfloat32_t specialization: sqrt(pi / 2), obtained by reinterpreting a fixed 32-bit pattern.
+template <> CUTLASS_HOST_DEVICE tfloat32_t root_half_pi<tfloat32_t>() {
+  uint32_t const raw = 0x3fa07c99u;
+  return *reinterpret_cast<tfloat32_t const *>(&raw);
+}
+
+/// Returns sqrt(pi / 2) for complex<tfloat32_t>: root_half_pi<tfloat32_t>() paired with a value-initialized tfloat32_t().
+template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> root_half_pi< complex<tfloat32_t> >() {
+  return complex<tfloat32_t>(root_half_pi<tfloat32_t>(), tfloat32_t());
+}
+
+/// tfloat32_t specialization: sqrt(2 * pi), obtained by reinterpreting a fixed 32-bit pattern.
+template <> CUTLASS_HOST_DEVICE tfloat32_t root_two_pi<tfloat32_t>() {
+  uint32_t const raw = 0x40207c99u;
+  return *reinterpret_cast<tfloat32_t const *>(&raw);
+}
+
+/// Returns sqrt(2 * pi) for complex<tfloat32_t>: root_two_pi<tfloat32_t>() paired with a value-initialized tfloat32_t().
+template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> root_two_pi< complex<tfloat32_t> >() {
+  return complex<tfloat32_t>(root_two_pi<tfloat32_t>(), tfloat32_t());
+}
+
+/// tfloat32_t specialization: sqrt(ln(4)), obtained by reinterpreting a fixed 32-bit pattern.
+template <> CUTLASS_HOST_DEVICE tfloat32_t root_ln_four<tfloat32_t>() {
+  uint32_t const raw = 0x3f96c55fu;
+  return *reinterpret_cast<tfloat32_t const *>(&raw);
+}
+
+/// Returns sqrt(ln(4)) for complex<tfloat32_t>: root_ln_four<tfloat32_t>() paired with a value-initialized tfloat32_t().
+template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> root_ln_four< complex<tfloat32_t> >() {
+  return complex<tfloat32_t>(root_ln_four<tfloat32_t>(), tfloat32_t());
+}
+
+/// tfloat32_t specialization: e (approximately 2.718...), obtained by reinterpreting a fixed 32-bit pattern.
+template <> CUTLASS_HOST_DEVICE tfloat32_t e<tfloat32_t>() {
+  uint32_t const raw = 0x402e0854u;
+  return *reinterpret_cast<tfloat32_t const *>(&raw);
+}
+
+/// Returns e (approximately 2.718...) for complex<tfloat32_t>: e<tfloat32_t>() paired with a value-initialized tfloat32_t().
+template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> e< complex<tfloat32_t> >() {
+  return complex<tfloat32_t>(e<tfloat32_t>(), tfloat32_t());
+}
+
+/// tfloat32_t specialization: (1/2), obtained by reinterpreting a fixed 32-bit pattern.
+template <> CUTLASS_HOST_DEVICE tfloat32_t half<tfloat32_t>() {
+  uint32_t const raw = 0x3f001000u;
+  return *reinterpret_cast<tfloat32_t const *>(&raw);
+}
+
+/// Returns (1/2) for complex<tfloat32_t>: half<tfloat32_t>() paired with a value-initialized tfloat32_t().
+template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> half< complex<tfloat32_t> >() {
+  return complex<tfloat32_t>(half<tfloat32_t>(), tfloat32_t());
+}
+
+/// tfloat32_t specialization: sqrt(2) (approximately 1.414...), obtained by reinterpreting a fixed 32-bit pattern.
+template <> CUTLASS_HOST_DEVICE tfloat32_t root_two<tfloat32_t>() {
+  uint32_t const raw = 0x3fb514f3u;
+  return *reinterpret_cast<tfloat32_t const *>(&raw);
+}
+
+/// Returns sqrt(2) (approximately 1.414...) for complex<tfloat32_t>: root_two<tfloat32_t>() paired with a value-initialized tfloat32_t().
+template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> root_two< complex<tfloat32_t> >() {
+  return complex<tfloat32_t>(root_two<tfloat32_t>(), tfloat32_t());
+}
+
+/// tfloat32_t specialization: sqrt(2)/2 (approximately 0.707...), obtained by reinterpreting a fixed 32-bit pattern.
+template <> CUTLASS_HOST_DEVICE tfloat32_t half_root_two<tfloat32_t>() {
+  uint32_t const raw = 0x3f3514f3u;
+  return *reinterpret_cast<tfloat32_t const *>(&raw);
+}
+
+/// Returns sqrt(2)/2 (approximately 0.707...) for complex<tfloat32_t>: half_root_two<tfloat32_t>() paired with a value-initialized tfloat32_t().
+template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> half_root_two< complex<tfloat32_t> >() {
+  return complex<tfloat32_t>(half_root_two<tfloat32_t>(), tfloat32_t());
+}
+
+/// tfloat32_t specialization: ln(2) (approximately 0.693...), obtained by reinterpreting a fixed 32-bit pattern.
+template <> CUTLASS_HOST_DEVICE tfloat32_t ln_two<tfloat32_t>() {
+  uint32_t const raw = 0x3f318218u;
+  return *reinterpret_cast<tfloat32_t const *>(&raw);
+}
+
+/// Returns ln(2) (approximately 0.693...) for complex<tfloat32_t>: ln_two<tfloat32_t>() paired with a value-initialized tfloat32_t().
+template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> ln_two< complex<tfloat32_t> >() {
+  return complex<tfloat32_t>(ln_two<tfloat32_t>(), tfloat32_t());
+}
+
+/// tfloat32_t specialization: ln(ln(2)) (approximately -0.3665...), obtained by reinterpreting a fixed 32-bit pattern.
+template <> CUTLASS_HOST_DEVICE tfloat32_t ln_ln_two<tfloat32_t>() {
+  uint32_t const raw = 0xbebbb795u;
+  return *reinterpret_cast<tfloat32_t const *>(&raw);
+}
+
+/// Returns ln(ln(2)) (approximately -0.3665...) for complex<tfloat32_t>: ln_ln_two<tfloat32_t>() paired with a value-initialized tfloat32_t().
+template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> ln_ln_two< complex<tfloat32_t> >() {
+  return complex<tfloat32_t>(ln_ln_two<tfloat32_t>(), tfloat32_t());
+}
+
+/// tfloat32_t specialization: 1/3 (approximately 0.333...), obtained by reinterpreting a fixed 32-bit pattern.
+template <> CUTLASS_HOST_DEVICE tfloat32_t third<tfloat32_t>() {
+  uint32_t const raw = 0x3eaabaabu;
+  return *reinterpret_cast<tfloat32_t const *>(&raw);
+}
+
+/// Returns 1/3 (approximately 0.333...) for complex<tfloat32_t>: third<tfloat32_t>() paired with a value-initialized tfloat32_t().
+template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> third< complex<tfloat32_t> >() {
+  return complex<tfloat32_t>(third<tfloat32_t>(), tfloat32_t());
+}
+
+/// tfloat32_t specialization: 2/3 (approximately 0.666...), obtained by reinterpreting a fixed 32-bit pattern.
+template <> CUTLASS_HOST_DEVICE tfloat32_t twothirds<tfloat32_t>() {
+  uint32_t const raw = 0x3f2abaabu;
+  return *reinterpret_cast<tfloat32_t const *>(&raw);
+}
+
+/// Returns 2/3 (approximately 0.666...) for complex<tfloat32_t>: twothirds<tfloat32_t>() paired with a value-initialized tfloat32_t().
+template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> twothirds< complex<tfloat32_t> >() {
+  return complex<tfloat32_t>(twothirds<tfloat32_t>(), tfloat32_t());
+}
+
+/// tfloat32_t specialization: pi - 3 (approximately 0.1416...), obtained by reinterpreting a fixed 32-bit pattern.
+template <> CUTLASS_HOST_DEVICE tfloat32_t pi_minus_three<tfloat32_t>() {
+  uint32_t const raw = 0x3e110daau;
+  return *reinterpret_cast<tfloat32_t const *>(&raw);
+}
+
+/// Returns pi - 3 (approximately 0.1416...) for complex<tfloat32_t>: pi_minus_three<tfloat32_t>() paired with a value-initialized tfloat32_t().
+template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> pi_minus_three< complex<tfloat32_t> >() {
+  return complex<tfloat32_t>(pi_minus_three<tfloat32_t>(), tfloat32_t());
+}
+
+/// tfloat32_t specialization: 4 - pi (approximately 0.858...), obtained by reinterpreting a fixed 32-bit pattern.
+template <> CUTLASS_HOST_DEVICE tfloat32_t four_minus_pi<tfloat32_t>() {
+  uint32_t const raw = 0x3f5bd095u;
+  return *reinterpret_cast<tfloat32_t const *>(&raw);
+}
+
+/// Returns 4 - pi (approximately 0.858...) for complex<tfloat32_t>: four_minus_pi<tfloat32_t>() paired with a value-initialized tfloat32_t().
+template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> four_minus_pi< complex<tfloat32_t> >() {
+  return complex<tfloat32_t>(four_minus_pi<tfloat32_t>(), tfloat32_t());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////
+
+// Specialization for half_t
+
+/// half_t specialization: 1 (the multiplicative identity element), from a fixed 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE half_t one<half_t>() {
+  uint16_t const raw = 0x3c00u;
+  return *reinterpret_cast<half_t const *>(&raw);
+}
+
+/// Returns 1 (multiplicative identity) for complex<half_t>: one<half_t>() paired with a value-initialized half_t().
+template <> CUTLASS_HOST_DEVICE complex<half_t> one< complex<half_t> >() {
+  return complex<half_t>(one<half_t>(), half_t());
+}
+
+/// half_t specialization: 0 (the additive identity element), from the all-zero 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE half_t zero<half_t>() {
+  uint16_t const raw = 0x0u;
+  return *reinterpret_cast<half_t const *>(&raw);
+}
+
+/// Returns 0 (additive identity) for complex<half_t>: zero<half_t>() paired with a value-initialized half_t().
+template <> CUTLASS_HOST_DEVICE complex<half_t> zero< complex<half_t> >() {
+  return complex<half_t>(zero<half_t>(), half_t());
+}
+
+/// half_t specialization: 2, obtained by reinterpreting a fixed 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE half_t two<half_t>() {
+  uint16_t const raw = 0x4000u;
+  return *reinterpret_cast<half_t const *>(&raw);
+}
+
+/// Returns 2 for complex<half_t>: two<half_t>() paired with a value-initialized half_t().
+template <> CUTLASS_HOST_DEVICE complex<half_t> two< complex<half_t> >() {
+  return complex<half_t>(two<half_t>(), half_t());
+}
+
+/// half_t specialization: pi (approximately 3.141), obtained by reinterpreting a fixed 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE half_t pi<half_t>() {
+  uint16_t const raw = 0x4248u;
+  return *reinterpret_cast<half_t const *>(&raw);
+}
+
+/// Returns pi (approximately 3.141) for complex<half_t>: pi<half_t>() paired with a value-initialized half_t().
+template <> CUTLASS_HOST_DEVICE complex<half_t> pi< complex<half_t> >() {
+  return complex<half_t>(pi<half_t>(), half_t());
+}
+
+/// half_t specialization: 2 * pi, obtained by reinterpreting a fixed 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE half_t two_pi<half_t>() {
+  uint16_t const raw = 0x4648u;
+  return *reinterpret_cast<half_t const *>(&raw);
+}
+
+/// Returns 2 * pi for complex<half_t>: two_pi<half_t>() paired with a value-initialized half_t().
+template <> CUTLASS_HOST_DEVICE complex<half_t> two_pi< complex<half_t> >() {
+  return complex<half_t>(two_pi<half_t>(), half_t());
+}
+
+/// half_t specialization: pi / 2, obtained by reinterpreting a fixed 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE half_t half_pi<half_t>() {
+  uint16_t const raw = 0x3e48u;
+  return *reinterpret_cast<half_t const *>(&raw);
+}
+
+/// Returns pi / 2 for complex<half_t>: half_pi<half_t>() paired with a value-initialized half_t().
+template <> CUTLASS_HOST_DEVICE complex<half_t> half_pi< complex<half_t> >() {
+  return complex<half_t>(half_pi<half_t>(), half_t());
+}
+
+/// half_t specialization: sqrt(pi), obtained by reinterpreting a fixed 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE half_t root_pi<half_t>() {
+  uint16_t const raw = 0x3f17u;
+  return *reinterpret_cast<half_t const *>(&raw);
+}
+
+/// Returns sqrt(pi) for complex<half_t>: root_pi<half_t>() paired with a value-initialized half_t().
+template <> CUTLASS_HOST_DEVICE complex<half_t> root_pi< complex<half_t> >() {
+  return complex<half_t>(root_pi<half_t>(), half_t());
+}
+
+/// half_t specialization: sqrt(pi / 2), obtained by reinterpreting a fixed 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE half_t root_half_pi<half_t>() {
+  uint16_t const raw = 0x3d03u;
+  return *reinterpret_cast<half_t const *>(&raw);
+}
+
+/// Returns sqrt(pi / 2) for complex<half_t>: root_half_pi<half_t>() paired with a value-initialized half_t().
+template <> CUTLASS_HOST_DEVICE complex<half_t> root_half_pi< complex<half_t> >() {
+  return complex<half_t>(root_half_pi<half_t>(), half_t());
+}
+
+/// half_t specialization: sqrt(2 * pi), obtained by reinterpreting a fixed 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE half_t root_two_pi<half_t>() {
+  uint16_t const raw = 0x4103u;
+  return *reinterpret_cast<half_t const *>(&raw);
+}
+
+/// Returns sqrt(2 * pi) for complex<half_t>: root_two_pi<half_t>() paired with a value-initialized half_t().
+template <> CUTLASS_HOST_DEVICE complex<half_t> root_two_pi< complex<half_t> >() {
+  return complex<half_t>(root_two_pi<half_t>(), half_t());
+}
+
+/// half_t specialization: sqrt(ln(4)), obtained by reinterpreting a fixed 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE half_t root_ln_four<half_t>() {
+  uint16_t const raw = 0x3cb6u;
+  return *reinterpret_cast<half_t const *>(&raw);
+}
+
+/// Returns sqrt(ln(4)) for complex<half_t>: root_ln_four<half_t>() paired with a value-initialized half_t().
+template <> CUTLASS_HOST_DEVICE complex<half_t> root_ln_four< complex<half_t> >() {
+  return complex<half_t>(root_ln_four<half_t>(), half_t());
+}
+
+/// half_t specialization: e (approximately 2.718...), obtained by reinterpreting a fixed 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE half_t e<half_t>() {
+  uint16_t const raw = 0x4170u;
+  return *reinterpret_cast<half_t const *>(&raw);
+}
+
+/// Returns e (approximately 2.718...) for complex<half_t>: e<half_t>() paired with a value-initialized half_t().
+template <> CUTLASS_HOST_DEVICE complex<half_t> e< complex<half_t> >() {
+  return complex<half_t>(e<half_t>(), half_t());
+}
+
+/// half_t specialization: (1/2), obtained by reinterpreting a fixed 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE half_t half<half_t>() {
+  uint16_t const raw = 0x3800u;
+  return *reinterpret_cast<half_t const *>(&raw);
+}
+
+/// Returns (1/2) for complex<half_t>: half<half_t>() paired with a value-initialized half_t().
+template <> CUTLASS_HOST_DEVICE complex<half_t> half< complex<half_t> >() {
+  return complex<half_t>(half<half_t>(), half_t());
+}
+
+/// half_t specialization: sqrt(2) (approximately 1.414...), obtained by reinterpreting a fixed 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE half_t root_two<half_t>() {
+  uint16_t const raw = 0x3da8u;
+  return *reinterpret_cast<half_t const *>(&raw);
+}
+
+/// Returns sqrt(2) (approximately 1.414...) for complex<half_t>: root_two<half_t>() paired with a value-initialized half_t().
+template <> CUTLASS_HOST_DEVICE complex<half_t> root_two< complex<half_t> >() {
+  return complex<half_t>(root_two<half_t>(), half_t());
+}
+
+/// half_t specialization: sqrt(2)/2 (approximately 0.707...), obtained by reinterpreting a fixed 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE half_t half_root_two<half_t>() {
+  uint16_t const raw = 0x39a8u;
+  return *reinterpret_cast<half_t const *>(&raw);
+}
+
+/// Returns sqrt(2)/2 (approximately 0.707...) for complex<half_t>: half_root_two<half_t>() paired with a value-initialized half_t().
+template <> CUTLASS_HOST_DEVICE complex<half_t> half_root_two< complex<half_t> >() {
+  return complex<half_t>(half_root_two<half_t>(), half_t());
+}
+
+/// half_t specialization: ln(2) (approximately 0.693...), obtained by reinterpreting a fixed 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE half_t ln_two<half_t>() {
+  uint16_t const raw = 0x398cu;
+  return *reinterpret_cast<half_t const *>(&raw);
+}
+
+/// Returns ln(2) (approximately 0.693...) for complex<half_t>: ln_two<half_t>() paired with a value-initialized half_t().
+template <> CUTLASS_HOST_DEVICE complex<half_t> ln_two< complex<half_t> >() {
+  return complex<half_t>(ln_two<half_t>(), half_t());
+}
+
+/// half_t specialization: ln(ln(2)) (approximately -0.3665...), obtained by reinterpreting a fixed 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE half_t ln_ln_two<half_t>() {
+  uint16_t const raw = 0xb5ddu;
+  return *reinterpret_cast<half_t const *>(&raw);
+}
+
+/// Returns ln(ln(2)) (approximately -0.3665...) for complex<half_t>: ln_ln_two<half_t>() paired with a value-initialized half_t().
+template <> CUTLASS_HOST_DEVICE complex<half_t> ln_ln_two< complex<half_t> >() {
+  return complex<half_t>(ln_ln_two<half_t>(), half_t());
+}
+
+/// half_t specialization: 1/3 (approximately 0.333...), obtained by reinterpreting a fixed 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE half_t third<half_t>() {
+  uint16_t const raw = 0x3555u;
+  return *reinterpret_cast<half_t const *>(&raw);
+}
+
+/// Returns 1/3 (approximately 0.333...) for complex<half_t>: third<half_t>() paired with a value-initialized half_t().
+template <> CUTLASS_HOST_DEVICE complex<half_t> third< complex<half_t> >() {
+  return complex<half_t>(third<half_t>(), half_t());
+}
+
+/// half_t specialization: 2/3 (approximately 0.666...), obtained by reinterpreting a fixed 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE half_t twothirds<half_t>() {
+  uint16_t const raw = 0x3955u;
+  return *reinterpret_cast<half_t const *>(&raw);
+}
+
+/// Returns 2/3 (approximately 0.666...) for complex<half_t>: twothirds<half_t>() paired with a value-initialized half_t().
+template <> CUTLASS_HOST_DEVICE complex<half_t> twothirds< complex<half_t> >() {
+  return complex<half_t>(twothirds<half_t>(), half_t());
+}
+
+/// half_t specialization: pi - 3 (approximately 0.1416...), obtained by reinterpreting a fixed 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE half_t pi_minus_three<half_t>() {
+  uint16_t const raw = 0x3088u;
+  return *reinterpret_cast<half_t const *>(&raw);
+}
+
+/// Returns pi - 3 (approximately 0.1416...) for complex<half_t>: pi_minus_three<half_t>() paired with a value-initialized half_t().
+template <> CUTLASS_HOST_DEVICE complex<half_t> pi_minus_three< complex<half_t> >() {
+  return complex<half_t>(pi_minus_three<half_t>(), half_t());
+}
+
+/// half_t specialization: 4 - pi (approximately 0.858...), obtained by reinterpreting a fixed 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE half_t four_minus_pi<half_t>() {
+  uint16_t const raw = 0x3adeu;
+  return *reinterpret_cast<half_t const *>(&raw);
+}
+
+/// Returns 4 - pi (approximately 0.858...) for complex<half_t>: four_minus_pi<half_t>() paired with a value-initialized half_t().
+template <> CUTLASS_HOST_DEVICE complex<half_t> four_minus_pi< complex<half_t> >() {
+  return complex<half_t>(four_minus_pi<half_t>(), half_t());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////
+
+// Specialization for bfloat16_t
+
+/// bfloat16_t specialization: 1 (the multiplicative identity element), from a fixed 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE bfloat16_t one<bfloat16_t>() {
+  uint16_t const raw = 0x3f80u;
+  return *reinterpret_cast<bfloat16_t const *>(&raw);
+}
+
+/// Returns 1 (multiplicative identity) for complex<bfloat16_t>: one<bfloat16_t>() paired with a value-initialized bfloat16_t().
+template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> one< complex<bfloat16_t> >() {
+  return complex<bfloat16_t>(one<bfloat16_t>(), bfloat16_t());
+}
+
+/// bfloat16_t specialization: 0 (the additive identity element), from the all-zero 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE bfloat16_t zero<bfloat16_t>() {
+  uint16_t const raw = 0x0u;
+  return *reinterpret_cast<bfloat16_t const *>(&raw);
+}
+
+/// Returns 0 (additive identity) for complex<bfloat16_t>: zero<bfloat16_t>() paired with a value-initialized bfloat16_t().
+template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> zero< complex<bfloat16_t> >() {
+  return complex<bfloat16_t>(zero<bfloat16_t>(), bfloat16_t());
+}
+
+/// bfloat16_t specialization: 2, obtained by reinterpreting a fixed 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE bfloat16_t two<bfloat16_t>() {
+  uint16_t const raw = 0x4000u;
+  return *reinterpret_cast<bfloat16_t const *>(&raw);
+}
+
+/// Returns 2 for complex<bfloat16_t>: two<bfloat16_t>() paired with a value-initialized bfloat16_t().
+template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> two< complex<bfloat16_t> >() {
+  return complex<bfloat16_t>(two<bfloat16_t>(), bfloat16_t());
+}
+
+/// bfloat16_t specialization: pi (approximately 3.141), obtained by reinterpreting a fixed 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE bfloat16_t pi<bfloat16_t>() {
+  uint16_t const raw = 0x4049u;
+  return *reinterpret_cast<bfloat16_t const *>(&raw);
+}
+
+/// Returns pi (approximately 3.141) for complex<bfloat16_t>: pi<bfloat16_t>() paired with a value-initialized bfloat16_t().
+template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> pi< complex<bfloat16_t> >() {
+  return complex<bfloat16_t>(pi<bfloat16_t>(), bfloat16_t());
+}
+
+/// bfloat16_t specialization: 2 * pi, obtained by reinterpreting a fixed 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE bfloat16_t two_pi<bfloat16_t>() {
+  uint16_t const raw = 0x40c9u;
+  return *reinterpret_cast<bfloat16_t const *>(&raw);
+}
+
+/// Returns 2 * pi for complex<bfloat16_t>: two_pi<bfloat16_t>() paired with a value-initialized bfloat16_t().
+template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> two_pi< complex<bfloat16_t> >() {
+  return complex<bfloat16_t>(two_pi<bfloat16_t>(), bfloat16_t());
+}
+
+/// bfloat16_t specialization: pi / 2, obtained by reinterpreting a fixed 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE bfloat16_t half_pi<bfloat16_t>() {
+  uint16_t const raw = 0x3fc9u;
+  return *reinterpret_cast<bfloat16_t const *>(&raw);
+}
+
+/// Returns pi / 2 for complex<bfloat16_t>: half_pi<bfloat16_t>() paired with a value-initialized bfloat16_t().
+template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> half_pi< complex<bfloat16_t> >() {
+  return complex<bfloat16_t>(half_pi<bfloat16_t>(), bfloat16_t());
+}
+
+/// bfloat16_t specialization: sqrt(pi), obtained by reinterpreting a fixed 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE bfloat16_t root_pi<bfloat16_t>() {
+  uint16_t const raw = 0x3fe3u;
+  return *reinterpret_cast<bfloat16_t const *>(&raw);
+}
+
+/// Returns sqrt(pi) for complex<bfloat16_t>: root_pi<bfloat16_t>() paired with a value-initialized bfloat16_t().
+template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> root_pi< complex<bfloat16_t> >() {
+  return complex<bfloat16_t>(root_pi<bfloat16_t>(), bfloat16_t());
+}
+
+/// bfloat16_t specialization: sqrt(pi / 2), obtained by reinterpreting a fixed 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE bfloat16_t root_half_pi<bfloat16_t>() {
+  uint16_t const raw = 0x3fa0u;
+  return *reinterpret_cast<bfloat16_t const *>(&raw);
+}
+
+/// Returns sqrt(pi / 2) for complex<bfloat16_t>: root_half_pi<bfloat16_t>() paired with a value-initialized bfloat16_t().
+template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> root_half_pi< complex<bfloat16_t> >() {
+  return complex<bfloat16_t>(root_half_pi<bfloat16_t>(), bfloat16_t());
+}
+
+/// bfloat16_t specialization: sqrt(2 * pi), obtained by reinterpreting a fixed 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE bfloat16_t root_two_pi<bfloat16_t>() {
+  uint16_t const raw = 0x4020u;
+  return *reinterpret_cast<bfloat16_t const *>(&raw);
+}
+
+/// Returns sqrt(2 * pi) for complex<bfloat16_t>: root_two_pi<bfloat16_t>() paired with a value-initialized bfloat16_t().
+template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> root_two_pi< complex<bfloat16_t> >() {
+  return complex<bfloat16_t>(root_two_pi<bfloat16_t>(), bfloat16_t());
+}
+
+/// bfloat16_t specialization: sqrt(ln(4)), obtained by reinterpreting a fixed 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE bfloat16_t root_ln_four<bfloat16_t>() {
+  uint16_t const raw = 0x3f97u;
+  return *reinterpret_cast<bfloat16_t const *>(&raw);
+}
+
+/// Returns sqrt(ln(4)) for complex<bfloat16_t>: root_ln_four<bfloat16_t>() paired with a value-initialized bfloat16_t().
+template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> root_ln_four< complex<bfloat16_t> >() {
+  return complex<bfloat16_t>(root_ln_four<bfloat16_t>(), bfloat16_t());
+}
+
+/// bfloat16_t specialization: e (approximately 2.718...), obtained by reinterpreting a fixed 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE bfloat16_t e<bfloat16_t>() {
+  uint16_t const raw = 0x402eu;
+  return *reinterpret_cast<bfloat16_t const *>(&raw);
+}
+
+/// Returns e (approximately 2.718...) for complex<bfloat16_t>: e<bfloat16_t>() paired with a value-initialized bfloat16_t().
+template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> e< complex<bfloat16_t> >() {
+  return complex<bfloat16_t>(e<bfloat16_t>(), bfloat16_t());
+}
+
+/// bfloat16_t specialization: (1/2), obtained by reinterpreting a fixed 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE bfloat16_t half<bfloat16_t>() {
+  uint16_t const raw = 0x3f00u;
+  return *reinterpret_cast<bfloat16_t const *>(&raw);
+}
+
+/// Returns (1/2) for complex<bfloat16_t>: half<bfloat16_t>() paired with a value-initialized bfloat16_t().
+template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> half< complex<bfloat16_t> >() {
+  return complex<bfloat16_t>(half<bfloat16_t>(), bfloat16_t());
+}
+
+/// bfloat16_t specialization: sqrt(2) (approximately 1.414...), obtained by reinterpreting a fixed 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE bfloat16_t root_two<bfloat16_t>() {
+  uint16_t const raw = 0x3fb5u;
+  return *reinterpret_cast<bfloat16_t const *>(&raw);
+}
+
+/// Returns sqrt(2) (approximately 1.414...) for complex<bfloat16_t>: root_two<bfloat16_t>() paired with a value-initialized bfloat16_t().
+template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> root_two< complex<bfloat16_t> >() {
+  return complex<bfloat16_t>(root_two<bfloat16_t>(), bfloat16_t());
+}
+
+/// bfloat16_t specialization: sqrt(2)/2 (approximately 0.707...), obtained by reinterpreting a fixed 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE bfloat16_t half_root_two<bfloat16_t>() {
+  uint16_t const raw = 0x3f35u;
+  return *reinterpret_cast<bfloat16_t const *>(&raw);
+}
+
+/// Returns sqrt(2)/2 (approximately 0.707...) for complex<bfloat16_t>: half_root_two<bfloat16_t>() paired with a value-initialized bfloat16_t().
+template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> half_root_two< complex<bfloat16_t> >() {
+  return complex<bfloat16_t>(half_root_two<bfloat16_t>(), bfloat16_t());
+}
+
+/// bfloat16_t specialization: ln(2) (approximately 0.693...), obtained by reinterpreting a fixed 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE bfloat16_t ln_two<bfloat16_t>() {
+  uint16_t const raw = 0x3f31u;
+  return *reinterpret_cast<bfloat16_t const *>(&raw);
+}
+
+/// Returns ln(2) (approximately 0.693...) for complex<bfloat16_t>: ln_two<bfloat16_t>() paired with a value-initialized bfloat16_t().
+template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> ln_two< complex<bfloat16_t> >() {
+  return complex<bfloat16_t>(ln_two<bfloat16_t>(), bfloat16_t());
+}
+
+/// bfloat16_t specialization: ln(ln(2)) (approximately -0.3665...), obtained by reinterpreting a fixed 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE bfloat16_t ln_ln_two<bfloat16_t>() {
+  uint16_t const raw = 0xbebcu;
+  return *reinterpret_cast<bfloat16_t const *>(&raw);
+}
+
+/// Returns ln(ln(2)) (approximately -0.3665...) for complex<bfloat16_t>: ln_ln_two<bfloat16_t>() paired with a value-initialized bfloat16_t().
+template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> ln_ln_two< complex<bfloat16_t> >() {
+  return complex<bfloat16_t>(ln_ln_two<bfloat16_t>(), bfloat16_t());
+}
+
+/// bfloat16_t specialization: 1/3 (approximately 0.333...), obtained by reinterpreting a fixed 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE bfloat16_t third<bfloat16_t>() {
+  uint16_t const raw = 0x3eabu;
+  return *reinterpret_cast<bfloat16_t const *>(&raw);
+}
+
+/// Returns 1/3 (approximately 0.333...) for complex<bfloat16_t>: third<bfloat16_t>() paired with a value-initialized bfloat16_t().
+template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> third< complex<bfloat16_t> >() {
+  return complex<bfloat16_t>(third<bfloat16_t>(), bfloat16_t());
+}
+
+/// bfloat16_t specialization: 2/3 (approximately 0.666...), obtained by reinterpreting a fixed 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE bfloat16_t twothirds<bfloat16_t>() {
+  uint16_t const raw = 0x3f2bu;
+  return *reinterpret_cast<bfloat16_t const *>(&raw);
+}
+
+/// Returns 2/3 (approximately 0.666...) for complex<bfloat16_t>: twothirds<bfloat16_t>() paired with a value-initialized bfloat16_t().
+template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> twothirds< complex<bfloat16_t> >() {
+  return complex<bfloat16_t>(twothirds<bfloat16_t>(), bfloat16_t());
+}
+
+/// bfloat16_t specialization: pi - 3 (approximately 0.1416...), obtained by reinterpreting a fixed 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE bfloat16_t pi_minus_three<bfloat16_t>() {
+  uint16_t const raw = 0x3e11u;
+  return *reinterpret_cast<bfloat16_t const *>(&raw);
+}
+
+/// Returns pi - 3 (approximately 0.1416...) for complex<bfloat16_t>: pi_minus_three<bfloat16_t>() paired with a value-initialized bfloat16_t().
+template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> pi_minus_three< complex<bfloat16_t> >() {
+  return complex<bfloat16_t>(pi_minus_three<bfloat16_t>(), bfloat16_t());
+}
+
+/// bfloat16_t specialization: 4 - pi (approximately 0.858...), obtained by reinterpreting a fixed 16-bit pattern.
+template <> CUTLASS_HOST_DEVICE bfloat16_t four_minus_pi<bfloat16_t>() {
+  uint16_t const raw = 0x3f5cu;
+  return *reinterpret_cast<bfloat16_t const *>(&raw);
+}
+
+/// Returns 4 - pi (approximately 0.858...) for complex<bfloat16_t>: four_minus_pi<bfloat16_t>() paired with a value-initialized bfloat16_t().
+template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> four_minus_pi< complex<bfloat16_t> >() {
+  return complex<bfloat16_t>(four_minus_pi<bfloat16_t>(), bfloat16_t());
+}
+///////////////////////////////////////////////////////////////////////////////////
+
+} // namespace constants
+} // namespace cutlass
+
+///////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/collective/builders/sm90_common.inl b/lightllm-kernel/cutlass/include/cutlass/conv/collective/builders/sm90_common.inl
new file mode 100755
index 000000000..526db83ed
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/collective/builders/sm90_common.inl
@@ -0,0 +1,96 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/layout/tensor.h"
+#include "cutlass/arch/mma.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/dispatch_policy.hpp"
+#include "cutlass/detail/layout.hpp"
+#include "cutlass/gemm/collective/builders/sm90_common.inl"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::conv::collective::detail {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Maps a rank-1 cute::Shape<> (the cluster extent along a single mode) to the IM2COL TMA load atom that should be used with it
+template <class UnimodalClusterShape>
+constexpr auto
+sm90_cluster_shape_to_im2col_tma_atom(UnimodalClusterShape unimodal_cluster_shape) {
+  static_assert(cute::rank(unimodal_cluster_shape) == 1,
+    "Use this function to figure out TMA for each mode individually.");
+
+  if constexpr (cute::size(unimodal_cluster_shape) == 1) {
+    return cute::SM90_TMA_LOAD_IM2COL{};  // size 1 along this mode: plain im2col load atom
+  }
+  else {
+    return cute::SM90_TMA_LOAD_IM2COL_MULTICAST{};  // any other size: multicast im2col load atom
+  }
+}
+
+// Collective tile traits struct that serves as a type list bundling one tensor's gmem tiled copy, smem layout, and smem copy atom
+template<
+  class GmemTiledCopy_,
+  class SmemLayout_,
+  class SmemCopyAtom_ = void  // optional; void when no smem copy atom is supplied
+>
+struct Sm90ImplicitGemmTileTraits {
+  using GmemTiledCopy = GmemTiledCopy_;
+  using SmemLayout = SmemLayout_;
+  using SmemCopyAtom = SmemCopyAtom_;
+};
+
+// Accepts the layout tags of A and B (which must be the same type) and returns the spatial dimension count: TensorNWC -> 1, TensorNHWC -> 2, TensorNDHWC -> 3
+template <class GmemLayoutTagA, class GmemLayoutTagB>
+constexpr int
+gmem_layout_tags_to_spatial_dims() {
+  static_assert(cute::is_same_v<GmemLayoutTagA, GmemLayoutTagB>);  // A and B must share one layout tag
+  if constexpr      (cute::is_same_v<GmemLayoutTagA, cutlass::layout::TensorNWC>) {
+    return 1;
+  }
+  else if constexpr (cute::is_same_v<GmemLayoutTagA, cutlass::layout::TensorNHWC>) {
+    return 2;
+  }
+  else if constexpr (cute::is_same_v<GmemLayoutTagA, cutlass::layout::TensorNDHWC>) {
+    return 3;
+  }
+  else {
+    static_assert(cutlass::detail::dependent_false<GmemLayoutTagA>);  // any other layout tag is rejected at compile time
+  }
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::conv::collective::detail
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/collective/builders/sm90_gmma_builder.inl b/lightllm-kernel/cutlass/include/cutlass/conv/collective/builders/sm90_gmma_builder.inl
new file mode 100755
index 000000000..a08209efb
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/collective/builders/sm90_gmma_builder.inl
@@ -0,0 +1,257 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/conv/collective/builders/sm90_common.inl"
+
+// SM90 Collective Builders should be used only starting CUDA 12.0
+#if (__CUDACC_VER_MAJOR__ >= 12)
+#define CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
+#endif
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::conv::collective {
+using namespace cute;
+
+namespace detail {
+
+// Manual override: returns the stage count carried by StageCount<stages> unchanged; the capacity, element, and tile parameters are not used.
+template<int CapacityBytes, class ElementA, class ElementB, class TileShapeMNK, int stages>
+constexpr int
+compute_stage_count_or_override(StageCount<stages> stage_count) {
+  return stages;
+}
+
+// Manual override: returns the stage count carried by cute::Int<stages> unchanged; the capacity, element, and tile parameters are not used.
+template<int CapacityBytes, class ElementA, class ElementB, class TileShapeMNK, int stages>
+constexpr int
+compute_stage_count_or_override(cute::Int<stages> stage_count) {
+  return stages;
+}
+
+// Automatic count: returns how many stages fit in CapacityBytes once carveout_bytes are set aside (integer division by the per-stage byte cost).
+template<int CapacityBytes, class ElementA, class ElementB, class TileShapeMNK, int carveout_bytes>
+constexpr int
+compute_stage_count_or_override(StageCountAutoCarveout<carveout_bytes> stage_count) {
+  constexpr auto mainloop_pipeline_bytes = sizeof(typename cutlass::PipelineTmaAsync<1>::SharedStorage);  // shared storage of a 1-stage pipeline, charged once per stage
+  constexpr auto a_bits = cute::sizeof_bits_v<ElementA>;
+  constexpr auto b_bits = cute::sizeof_bits_v<ElementB>;
+  constexpr int stage_bytes =
+    cutlass::bits_to_bytes(a_bits * size<0>(TileShapeMNK{}) * size<2>(TileShapeMNK{})) +  // A tile: mode 0 x mode 2 elements
+    cutlass::bits_to_bytes(b_bits * size<1>(TileShapeMNK{}) * size<2>(TileShapeMNK{})) +  // B tile: mode 1 x mode 2 elements
+    static_cast<int>(mainloop_pipeline_bytes);
+
+  return (CapacityBytes - carveout_bytes) / stage_bytes;
+}
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA_TMA_WS_SS: SM90 tensor-op builder for the three implicit-GEMM TMA warp-specialized schedules; ConvOp selects fprop, wgrad, or dgrad
+template <
+  conv::Operator ConvOp,
+  class ElementA,
+  class GmemLayoutA,
+  int AlignmentA,
+  class ElementB,
+  class GmemLayoutB,
+  int AlignmentB,
+  class ElementAccumulator,
+  class TileShape_MNK,
+  class ClusterShape_MNK,
+  class StageCountType,
+  class KernelScheduleType
+>
+struct CollectiveBuilder<
+    arch::Sm90,
+    arch::OpClassTensorOp,
+    ConvOp,
+    ElementA,
+    GmemLayoutA,
+    AlignmentA,
+    ElementB,
+    GmemLayoutB,
+    AlignmentB,
+    ElementAccumulator,
+    TileShape_MNK,
+    ClusterShape_MNK,
+    StageCountType,
+    KernelScheduleType,
+    cute::enable_if_t<cute::is_same_v<KernelScheduleType, KernelImplicitTmaWarpSpecializedSm90> ||
+                      cute::is_same_v<KernelScheduleType, KernelImplicitTmaWarpSpecializedSm90Cooperative> ||
+                      cute::is_same_v<KernelScheduleType, KernelImplicitTmaWarpSpecializedSm90Pingpong>>
+> {
+  static_assert(is_static<TileShape_MNK>::value);
+  static_assert(is_static<ClusterShape_MNK>::value);
+#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
+  static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n");
+#endif
+  static_assert(cutlass::gemm::collective::detail::is_aligned<ElementA, AlignmentA, ElementB, AlignmentB, cutlass::gemm::collective::detail::tma_alignment_bytes>(),
+                "Should meet TMA alignment requirement\n");
+
+  // For fp32 types, map to tf32 MMA value type
+  using ElementAMma = cute::conditional_t<cute::is_same_v<ElementA, float>, tfloat32_t, ElementA>;
+  using ElementBMma = cute::conditional_t<cute::is_same_v<ElementB, float>, tfloat32_t, ElementB>;
+
+  // For fprop, majorA = K,  major B = K;
+  // For wgrad, majorA = MN, major B = MN;
+  // For dgrad, majorA = K,  major B = MN;
+  static constexpr cute::GMMA::Major GmmaMajorA =
+    (ConvOp == conv::Operator::kWgrad) ? cute::GMMA::Major::MN : cute::GMMA::Major::K;
+  static constexpr cute::GMMA::Major GmmaMajorB =
+    (ConvOp == conv::Operator::kFprop) ? cute::GMMA::Major::K : cute::GMMA::Major::MN;
+
+  using AtomLayoutMNK = cute::conditional_t<cute::is_same_v<KernelScheduleType, KernelImplicitTmaWarpSpecializedSm90Cooperative>,
+      Layout<Shape<_2,_1,_1>>, Layout<Shape<_1,_1,_1>>>;  // cooperative schedule gets a 2x1x1 atom layout, the other schedules 1x1x1
+
+  using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector<
+      ElementAMma, ElementBMma, ElementAccumulator, TileShape_MNK, GmmaMajorA, GmmaMajorB>(), AtomLayoutMNK{}));
+
+  // For wgrad, tensor A uses tma tiled mode and tensor B uses tma im2col mode; for the other operators the roles are swapped.
+  using GmemTiledCopyA = cute::conditional_t<ConvOp == conv::Operator::kWgrad,
+      decltype(cutlass::gemm::collective::detail::sm90_cluster_shape_to_tma_atom(cute::shape<1>(ClusterShape_MNK{}))),
+      decltype(cutlass::conv::collective::detail::sm90_cluster_shape_to_im2col_tma_atom(cute::shape<1>(ClusterShape_MNK{})))>;
+  using GmemTiledCopyB = cute::conditional_t<ConvOp == conv::Operator::kWgrad,
+      decltype(cutlass::conv::collective::detail::sm90_cluster_shape_to_im2col_tma_atom(cute::shape<0>(ClusterShape_MNK{}))),
+      decltype(cutlass::gemm::collective::detail::sm90_cluster_shape_to_tma_atom(cute::shape<0>(ClusterShape_MNK{})))>;
+
+  using SmemLayoutAtomA = decltype(cutlass::gemm::collective::detail::ss_smem_selector<
+      GmmaMajorA, ElementAMma, decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
+  using SmemLayoutAtomB = decltype(cutlass::gemm::collective::detail::ss_smem_selector<
+      GmmaMajorB, ElementBMma, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
+
+  static constexpr int PipelineStages = detail::compute_stage_count_or_override<cutlass::gemm::collective::detail::sm90_smem_capacity_bytes,
+      ElementAMma, ElementBMma, TileShape_MNK>(StageCountType{});  // StageCountType selects the manual or the automatic overload
+
+  using SmemLayoutA = decltype(tile_to_shape(
+      SmemLayoutAtomA{},
+      make_shape(shape<0>(TileShape_MNK{}), shape<2>(TileShape_MNK{}), Int<PipelineStages>{}),
+      Step<_2,_1,_3>{}));
+  using SmemLayoutB = decltype(tile_to_shape(
+      SmemLayoutAtomB{},
+      make_shape(shape<1>(TileShape_MNK{}), shape<2>(TileShape_MNK{}), Int<PipelineStages>{}),
+      Step<_2,_1,_3>{}));
+
+  constexpr static int NumSpatialDimensions = cutlass::conv::collective::detail::gmem_layout_tags_to_spatial_dims<GmemLayoutA, GmemLayoutB>();
+
+  using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedImplicitGemm<
+      ConvOp, PipelineStages, NumSpatialDimensions, ClusterShape_MNK, KernelScheduleType>;
+
+  using CollectiveOp = CollectiveConv<
+      DispatchPolicy,
+      TileShape_MNK,
+      ElementA,
+      ElementB,
+      TiledMma,
+      detail::Sm90ImplicitGemmTileTraits<GmemTiledCopyA, SmemLayoutA>,
+      detail::Sm90ImplicitGemmTileTraits<GmemTiledCopyB, SmemLayoutB>
+    >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA auto kernel schedule: KernelScheduleAuto forwards to the builder for KernelImplicitTmaWarpSpecializedSm90
+template <
+  conv::Operator ConvOp,
+  class ElementA,
+  class GmemLayoutA,
+  int AlignmentA,
+  class ElementB,
+  class GmemLayoutB,
+  int AlignmentB,
+  class ElementAccumulator,
+  class TileShape_MNK,
+  class ClusterShape_MNK,
+  class StageCountType,
+  class KernelScheduleType
+>
+struct CollectiveBuilder<
+    arch::Sm90,
+    arch::OpClassTensorOp,
+    ConvOp,
+    ElementA,
+    GmemLayoutA,
+    AlignmentA,
+    ElementB,
+    GmemLayoutB,
+    AlignmentB,
+    ElementAccumulator,
+    TileShape_MNK,
+    ClusterShape_MNK,
+    StageCountType,
+    KernelScheduleType,
+    cute::enable_if_t<cute::is_same_v<KernelScheduleType, KernelScheduleAuto>>
+> {
+  static_assert(is_static<TileShape_MNK>::value);
+  static_assert(is_static<ClusterShape_MNK>::value);
+#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
+  static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n");
+#endif
+
+/* NOTE: the toolkit-dependent schedule selection below is disabled (commented out); the alias after this comment is used unconditionally.
+#if ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 1)))
+  // Cooperative schedule performs best for CUDA Toolkits with version >= 12.1
+
+  // For TileShape_M == 64, choosing KernelTmaWarpSpecialized as the KernelSchedule
+  // Since KernelTmaWarpSpecializedCooperative requires TileShape_M to be at least 128
+  using KernelWarpSpecializedSchedule = cute::conditional_t<size<0>(TileShape_MNK{}) == Int<64>{},
+      KernelImplicitTmaWarpSpecializedSm90Pingpong, KernelImplicitTmaWarpSpecializedSm90Cooperative>;
+#else
+  using KernelWarpSpecializedSchedule = KernelImplicitTmaWarpSpecializedSm90;
+#endif
+*/
+  using KernelWarpSpecializedSchedule = KernelImplicitTmaWarpSpecializedSm90;  // schedule chosen for every toolkit version and tile shape
+
+  using CollectiveOp = typename CollectiveBuilder<
+      arch::Sm90,
+      arch::OpClassTensorOp,
+      ConvOp,
+      ElementA,
+      GmemLayoutA,
+      AlignmentA,
+      ElementB,
+      GmemLayoutB,
+      AlignmentB,
+      ElementAccumulator,
+      TileShape_MNK,
+      ClusterShape_MNK,
+      StageCountType,
+      KernelWarpSpecializedSchedule
+    >::CollectiveOp;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::conv::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/collective/collective_builder.hpp b/lightllm-kernel/cutlass/include/cutlass/conv/collective/collective_builder.hpp
new file mode 100755
index 000000000..9d6a16c0d
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/collective/collective_builder.hpp
@@ -0,0 +1,93 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/detail/dependent_false.hpp"
+#include "cutlass/conv/collective/collective_conv.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::conv::collective {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Tag type used to request an explicit (manual) pipeline stage count from the builder
+template<int num_stages>
+struct StageCount {
+  static constexpr int value = num_stages;  // the requested stage count
+
+  StageCount() = default;
+  explicit StageCount(cute::Int<num_stages>) {}  // the argument is unused; it only carries num_stages in its type
+};
+
+template<int carveout_bytes>  // Tag type requesting an automatically computed stage count, with carveout_bytes held back
+struct StageCountAutoCarveout {
+  static constexpr int bytes = carveout_bytes;  // the number of bytes to carve out
+
+  StageCountAutoCarveout() = default;
+  explicit StageCountAutoCarveout(cute::Int<carveout_bytes>) {}  // the argument is unused; it only carries carveout_bytes in its type
+};
+
+// Empty tag type that lets the builder pick the kernel schedule automatically.
+// Can be overridden with kernel schedule tags in cutlass/conv/dispatch_policy.hpp
+struct KernelScheduleAuto {};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  class ArchTag,
+  class OpClass,
+  conv::Operator,  // convolution operator; left unnamed in the primary template
+  class ElementA,
+  class GmemLayoutA,
+  int AlignmentA,
+  class ElementB,
+  class GmemLayoutB,
+  int AlignmentB,
+  class ElementAccumulator,
+  class TileShape_MNK,
+  class ClusterShape_MNK,
+  class StageCountType,
+  class KernelScheduleType,
+  class Enable = void  // slot filled with cute::enable_if_t by the partial specializations
+>
+struct CollectiveBuilder {
+  static_assert(cutlass::detail::dependent_false<ElementA>, "Could not build a collective for given parameters.");  // primary template: fallback when no specialization matches
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::conv::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "builders/sm90_gmma_builder.inl"
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/collective/collective_conv.hpp b/lightllm-kernel/cutlass/include/cutlass/conv/collective/collective_conv.hpp
new file mode 100755
index 000000000..d187b5ece
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/collective/collective_conv.hpp
@@ -0,0 +1,62 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/detail/dependent_false.hpp"
+#include "cutlass/conv/collective/detail.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::conv::collective {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  class DispatchPolicy,
+  class TileShape,
+  class ElementA,
+  class ElementB,
+  class TiledMma,
+  class TileTraitsA,
+  class TileTraitsB
+>
+struct CollectiveConv {
+  static_assert(cutlass::detail::dependent_false<ElementA>, "Could not find a mainloop specialization.");  // primary template: fallback when no specialization matches
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::conv::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "sm90_implicit_gemm_gmma_ss_warpspecialized.hpp"
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/collective/detail.hpp b/lightllm-kernel/cutlass/include/cutlass/conv/collective/detail.hpp
new file mode 100755
index 000000000..ac272c8e2
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/collective/detail.hpp
@@ -0,0 +1,254 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/conv/convnd_problem_shape.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::conv::collective::detail {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Construct the stride type for tensor A of conv collectives from DispatchPolicy::ConvOp and ::NumSpatialDimensions; dynamic strides are 64b (int64_t)
+template <class DispatchPolicy>
+constexpr auto
+sm90_dispatch_policy_to_stride_A() {
+  if constexpr (DispatchPolicy::ConvOp == conv::Operator::kFprop) {
+    // Maps to modes ((w,n), C)
+    if constexpr (DispatchPolicy::NumSpatialDimensions == 1) {
+      return cute::Stride<cute::Stride<int64_t, int64_t>,
+                          cute::Int<1>>{};
+    }
+    // Maps to modes ((w,h,n), C)
+    else if constexpr (DispatchPolicy::NumSpatialDimensions == 2) {
+      return cute::Stride<cute::Stride<int64_t, int64_t, int64_t>,
+                          cute::Int<1>>{};
+    }
+    // Maps to modes ((w,h,d,n), C)
+    else if constexpr (DispatchPolicy::NumSpatialDimensions == 3) {
+      return cute::Stride<cute::Stride<int64_t, int64_t, int64_t, int64_t>,
+                          cute::Int<1>>{};
+    }
+    // error dims assert
+    else {
+      static_assert(cutlass::detail::dependent_false<DispatchPolicy>, "Unsupported spatial dim count.");
+    }
+  }
+  else if constexpr (DispatchPolicy::ConvOp == conv::Operator::kWgrad) {
+    // Maps to modes (k, nq/npq/nzpq); the same stride type serves 1, 2, and 3 spatial dimensions
+    if constexpr (DispatchPolicy::NumSpatialDimensions == 1 ||
+                  DispatchPolicy::NumSpatialDimensions == 2 ||
+                  DispatchPolicy::NumSpatialDimensions == 3) {
+      return cute::Stride<cute::Int<1>, int64_t>{};
+    }
+    // error dims assert
+    else {
+      static_assert(cutlass::detail::dependent_false<DispatchPolicy>, "Unsupported spatial dim count.");
+    }
+  }
+  else if constexpr (DispatchPolicy::ConvOp == conv::Operator::kDgrad) {
+    // Maps to modes ((q,n), K)
+    if constexpr (DispatchPolicy::NumSpatialDimensions == 1) {
+      return cute::Stride<cute::Stride<int64_t, int64_t>,
+                          cute::Int<1>>{};
+    }
+    // Maps to modes ((q,p,n), K)
+    else if constexpr (DispatchPolicy::NumSpatialDimensions == 2) {
+      return cute::Stride<cute::Stride<int64_t, int64_t, int64_t>,
+                          cute::Int<1>>{};
+    }
+    // Maps to modes ((q,p,z,n), K)
+    else if constexpr (DispatchPolicy::NumSpatialDimensions == 3) {
+      return cute::Stride<cute::Stride<int64_t, int64_t, int64_t, int64_t>,
+                          cute::Int<1>>{};
+    }
+    // error dims assert
+    else {
+      static_assert(cutlass::detail::dependent_false<DispatchPolicy>, "Unsupported spatial dim count.");
+    }
+  }
+  else {
+    static_assert(cutlass::detail::dependent_false<DispatchPolicy>, "Unsupported ConvOp.");
+  }
+}
+
+// Construct the stride type for tensor B of conv collectives from DispatchPolicy::ConvOp and ::NumSpatialDimensions; dynamic strides are 64b (int64_t)
+template <class DispatchPolicy>
+constexpr auto
+sm90_dispatch_policy_to_stride_B() {
+  if constexpr (DispatchPolicy::ConvOp == conv::Operator::kFprop) {
+    // Maps to modes (k, (C,s))
+    if constexpr      (DispatchPolicy::NumSpatialDimensions == 1) {
+      return cute::Stride<int64_t, cute::Stride<cute::Int<1>, int64_t>>{};
+    }
+    // Maps to modes (k, (C,s,r))
+    else if constexpr (DispatchPolicy::NumSpatialDimensions == 2) {
+      return cute::Stride<int64_t, cute::Stride<cute::Int<1>, int64_t, int64_t>>{};
+    }
+    // Maps to modes (k, (C,s,r,t))
+    else if constexpr (DispatchPolicy::NumSpatialDimensions == 3) {
+      return cute::Stride<int64_t, cute::Stride<cute::Int<1>, int64_t, int64_t, int64_t>>{};
+    }
+    // error dims assert
+    else {
+      static_assert(cutlass::detail::dependent_false<DispatchPolicy>, "Unsupported spatial dim count.");
+    }
+  }
+  else if constexpr (DispatchPolicy::ConvOp == conv::Operator::kWgrad) {
+    // Maps to modes (C, (w,n))
+    if constexpr (DispatchPolicy::NumSpatialDimensions == 1) {
+      return cute::Stride<cute::Int<1>,
+                          cute::Stride<int64_t, int64_t>>{};
+    }
+    // Maps to modes (C, (w,h,n))
+    else if constexpr (DispatchPolicy::NumSpatialDimensions == 2) {
+      return cute::Stride<cute::Int<1>,
+                          cute::Stride<int64_t, int64_t, int64_t>>{};
+    }
+    // Maps to modes (C, (w,h,d,n))
+    else if constexpr (DispatchPolicy::NumSpatialDimensions == 3) {
+      return cute::Stride<cute::Int<1>,
+                          cute::Stride<int64_t, int64_t, int64_t, int64_t>>{};
+    }
+    // error dims assert
+    else {
+      static_assert(cutlass::detail::dependent_false<DispatchPolicy>, "Unsupported spatial dim count.");
+    }
+  }
+  else if constexpr (DispatchPolicy::ConvOp == conv::Operator::kDgrad) {
+    // Maps to modes (C, (k,s))
+    if constexpr      (DispatchPolicy::NumSpatialDimensions == 1) {
+      return cute::Stride<cute::Int<1>, cute::Stride<int64_t, int64_t>>{};
+    }
+    // Maps to modes (C, (k,s,r))
+    else if constexpr (DispatchPolicy::NumSpatialDimensions == 2) {
+      return cute::Stride<cute::Int<1>, cute::Stride<int64_t, int64_t, int64_t>>{};
+    }
+    // Maps to modes (C, (k,s,r,t))
+    else if constexpr (DispatchPolicy::NumSpatialDimensions == 3) {
+      return cute::Stride<cute::Int<1>, cute::Stride<int64_t, int64_t, int64_t, int64_t>>{};
+    }
+    // error dims assert
+    else {
+      static_assert(cutlass::detail::dependent_false<DispatchPolicy>, "Unsupported spatial dim count.");
+    }
+  }
+  else {
+    static_assert(cutlass::detail::dependent_false<DispatchPolicy>, "Unsupported ConvOp.");
+  }
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Compute the lower/near corner from the problem shape's padding (and, for dgrad, shape_B and dilation), returning it as a cute::array in [W,H,D] order
+template <conv::Operator ConvOp, int NumSpatialDimensions>
+CUTLASS_HOST_DEVICE
+constexpr auto
+compute_lower_corner_whd(ConvProblemShape<ConvOp, NumSpatialDimensions> const& problem_shape) {
+  using cute::for_each;
+  using cute::make_seq;
+
+  cute::array<int, NumSpatialDimensions> lower{};  // value-initialized; returned untouched if neither branch below applies
+  if constexpr (ConvOp == conv::Operator::kFprop ||
+                ConvOp == conv::Operator::kWgrad) {
+    for_each(make_seq<NumSpatialDimensions>{}, [&](auto i) {
+      lower[NumSpatialDimensions-1-i] = -1 * problem_shape.lower_padding[i];  // negated lower padding, stored in reversed index order
+    });
+  }
+  else if constexpr (ConvOp == conv::Operator::kDgrad) {
+    for_each(make_seq<NumSpatialDimensions>{}, [&](auto i) {
+      lower[NumSpatialDimensions-1-i] = problem_shape.lower_padding[i] -
+        (problem_shape.shape_B[i+1] - 1) * problem_shape.dilation[i];  // i+1 skips entry 0 of shape_B
+    });
+  }
+  return lower;
+}
+
+// Computes the upper/far corner from the problem shape's padding, dilation, and tensor shapes, returning it as a cute::array in [W,H,D] order
+template <conv::Operator ConvOp, int NumSpatialDimensions>
+CUTLASS_HOST_DEVICE
+constexpr auto
+compute_upper_corner_whd(ConvProblemShape<ConvOp, NumSpatialDimensions> const& problem_shape) {
+  using cute::for_each;
+  using cute::make_seq;
+
+  cute::array<int, NumSpatialDimensions> upper{};  // value-initialized; returned untouched if no branch below applies
+  if constexpr (ConvOp == conv::Operator::kFprop) {
+    for_each(make_seq<NumSpatialDimensions>{}, [&](auto i) {
+      upper[NumSpatialDimensions-1-i] = problem_shape.upper_padding[i] -
+        (problem_shape.shape_B[i+1] - 1) * problem_shape.dilation[i];  // fprop reads the extent from shape_B
+    });
+  }
+  else if constexpr (ConvOp == conv::Operator::kWgrad) {
+    for_each(make_seq<NumSpatialDimensions>{}, [&](auto i) {
+      upper[NumSpatialDimensions-1-i] = problem_shape.upper_padding[i] -
+        (problem_shape.shape_C[i+1] - 1) * problem_shape.dilation[i];  // wgrad reads the extent from shape_C instead
+    });
+  }
+  else if constexpr (ConvOp == conv::Operator::kDgrad) {
+    for_each(make_seq<NumSpatialDimensions>{}, [&](auto i) {
+      upper[NumSpatialDimensions-1-i] = problem_shape.lower_padding[i] -
+        (problem_shape.shape_B[i+1] - 1) * problem_shape.dilation[i] + problem_shape.shape_C[i+1] - problem_shape.shape_A[i+1];
+    });
+  }
+  return upper;
+}
+
+// Compute the lower/near corner of (t,r,s), returning it as a cute::array in [S,R,T] order; all zeros for fprop and wgrad
+template <conv::Operator ConvOp, int NumSpatialDimensions>
+CUTLASS_HOST_DEVICE
+constexpr auto
+compute_lower_srt(ConvProblemShape<ConvOp, NumSpatialDimensions> const& problem_shape) {
+  using cute::for_each;
+  using cute::make_seq;
+
+  cute::array<int, NumSpatialDimensions> lower{};  // value-initialized; returned untouched if neither branch below applies
+  if constexpr (ConvOp == conv::Operator::kFprop ||
+                ConvOp == conv::Operator::kWgrad) {
+    for_each(make_seq<NumSpatialDimensions>{}, [&](auto i) {
+      lower[NumSpatialDimensions-1-i] = 0;
+    });
+  }
+  else if constexpr (ConvOp == conv::Operator::kDgrad) {
+    for_each(make_seq<NumSpatialDimensions>{}, [&](auto i) {
+      lower[NumSpatialDimensions-1-i] = (problem_shape.shape_B[i+1] - 1) * problem_shape.dilation[i];  // stored in reversed index order
+    });
+  }
+  return lower;
+}
+
+template <class CopyOp> struct is_im2col_load { static constexpr bool value = false; };  // trait: false for any copy op not listed below
+template <> struct is_im2col_load<SM90_TMA_LOAD_IM2COL          > { static constexpr bool value = true; };  // plain im2col TMA load
+template <> struct is_im2col_load<SM90_TMA_LOAD_IM2COL_MULTICAST> { static constexpr bool value = true; };  // multicast im2col TMA load
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::conv::collective::detail
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/collective/sm90_implicit_gemm_gmma_ss_warpspecialized.hpp b/lightllm-kernel/cutlass/include/cutlass/conv/collective/sm90_implicit_gemm_gmma_ss_warpspecialized.hpp
new file mode 100755
index 000000000..78862b0a0
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/collective/sm90_implicit_gemm_gmma_ss_warpspecialized.hpp
@@ -0,0 +1,663 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cute/tensor_predicate.hpp"
+#include "cute/arch/cluster_sm90.hpp"
+#include "cute/arch/copy_sm90.hpp"
+#include "cute/atom/mma_atom.hpp"
+#include "cute/atom/copy_traits_sm90_im2col.hpp"
+#include "cute/numeric/arithmetic_tuple.hpp"
+#include "cute/algorithm/functional.hpp"
+#include "cute/algorithm/gemm.hpp"
+
+#include "cutlass/conv/detail.hpp"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/dispatch_policy.hpp"
+#include "cutlass/pipeline/pipeline.hpp"
+#include "cutlass/util/packed_stride.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::conv::collective {
+using namespace cute;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  conv::Operator ConvOp,
+  int Stages,
+  int NumSpatialDims,
+  class ClusterShape,
+  class KernelSchedule,
+  int PipelineAsyncMmaStages,
+  class TileShape_,
+  class ElementA_,
+  class ElementB_,
+  class TiledMma_,
+  class TileTraitsA_,
+  class TileTraitsB_>
+struct CollectiveConv<
+    MainloopSm90TmaGmmaWarpSpecializedImplicitGemm<
+        ConvOp, Stages, NumSpatialDims, ClusterShape, KernelSchedule, PipelineAsyncMmaStages>,
+    TileShape_,
+    ElementA_,
+    ElementB_,
+    TiledMma_,
+    TileTraitsA_,
+    TileTraitsB_>
+{
+  //
+  // Type Aliases
+  //
+  using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedImplicitGemm<
+      ConvOp, Stages, NumSpatialDims, ClusterShape, KernelSchedule, PipelineAsyncMmaStages>;
+  using TileShape = TileShape_;
+  using ElementA = ElementA_;
+  using ElementB = ElementB_;
+  using TiledMma = TiledMma_;
+  using ElementAccumulator = typename TiledMma::ValTypeC;
+  using GmemTiledCopyA = typename TileTraitsA_::GmemTiledCopy;
+  using GmemTiledCopyB = typename TileTraitsB_::GmemTiledCopy;
+  using SmemLayoutA = typename TileTraitsA_::SmemLayout;
+  using SmemLayoutB = typename TileTraitsB_::SmemLayout;
+  using ArchTag = typename DispatchPolicy::ArchTag;
+  static constexpr int NumSpatialDimensions = DispatchPolicy::NumSpatialDimensions;
+  static constexpr int NumTensorDimensions = NumSpatialDimensions + 2;
+  // Deduce the kernel-facing stride tuple types based on the dispatch policy
+  // (which is a function of the number of spatial dimensions, the algorithm, etc.)
+  using StrideA = decltype(detail::sm90_dispatch_policy_to_stride_A<DispatchPolicy>());
+  using StrideB = decltype(detail::sm90_dispatch_policy_to_stride_B<DispatchPolicy>());
+
+  using MainloopPipeline = cutlass::PipelineTmaAsync<DispatchPolicy::Stages>;
+
+  using PipelineParams = typename MainloopPipeline::Params;
+  using PipelineState  = typename cutlass::PipelineState<DispatchPolicy::Stages>;
+  
+  using ProblemShape = ConvProblemShape<ConvOp, NumSpatialDimensions>;
+
+  // TODO: move pipeline mode tiling into the collective setup phase instead
+  static_assert(rank(SmemLayoutA{}) == 3, "SmemLayout must be rank 3 (M/N, K, PIPE)");
+  static_assert((size<0>(TileShape{}) == size<0>(SmemLayoutA{})), "SmemLayout must be compatible with the tile shape.");
+  static_assert((size<2>(TileShape{}) == size<1>(SmemLayoutA{})), "SmemLayout must be compatible with the tile shape.");
+
+  static_assert(rank(SmemLayoutB{}) == 3, "SmemLayout must be rank 3 (M/N, K, PIPE)");
+  static_assert((size<1>(TileShape{}) == size<0>(SmemLayoutB{})), "SmemLayout must be compatible with the tile shape.");
+  static_assert((size<2>(TileShape{}) == size<1>(SmemLayoutB{})), "SmemLayout must be compatible with the tile shape.");
+
+  static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 2 or more.");  // message mirrors the >= 2 bound
+  static_assert(cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeA>::value &&
+                cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeB>::value,
+                "MMA atom must source both A and B operand from smem_desc for this mainloop.");
+
+  // The tma load mode of wgrad is tiled for tensor A and im2col for tensor B while the tma load mode of fprop and dgrad
+  // kernel is im2col for tensor A and tiled for tensor B.
+  static_assert((ConvOp == conv::Operator::kWgrad
+             && (cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>))
+             || (ConvOp != conv::Operator::kWgrad
+             && (cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_IM2COL> || cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_IM2COL_MULTICAST>)),
+      "GmemTiledCopyA - invalid SM90 TMA copy atom specified.");
+  static_assert((ConvOp == conv::Operator::kWgrad
+             && (cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_IM2COL> || cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_IM2COL_MULTICAST>))
+             || (ConvOp != conv::Operator::kWgrad
+             && (cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>)),
+      "GmemTiledCopyB - invalid SM90 TMA copy atom specified.");
+
+  static constexpr bool is_im2col_A = detail::is_im2col_load<GmemTiledCopyA>::value;
+  static constexpr bool is_im2col_B = detail::is_im2col_load<GmemTiledCopyB>::value;
+
+  // TMA converts f32 input to tf32 when copying from GMEM to SMEM
+  // For all other types, cast to size equivalent uint type to avoid any rounding by TMA.
+  static constexpr bool ConvertF32toTF32A = cute::is_same_v<float, ElementA>;
+  static constexpr bool ConvertF32toTF32B = cute::is_same_v<float, ElementB>;
+  using InternalElementA = cute::conditional_t<ConvertF32toTF32A, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementA>>>;
+  using InternalElementB = cute::conditional_t<ConvertF32toTF32B, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementB>>>;
+
+  struct SharedStorage
+  {
+    struct TensorStorage : cute::aligned_struct<128, _0> {
+      cute::array_aligned<typename TiledMma::ValTypeA, cute::cosize_v<SmemLayoutA>> smem_A;
+      cute::array_aligned<typename TiledMma::ValTypeB, cute::cosize_v<SmemLayoutB>> smem_B;
+    } tensors;
+
+    using PipelineStorage = typename MainloopPipeline::SharedStorage;
+    PipelineStorage pipeline;
+  };
+  using TensorStorage = typename SharedStorage::TensorStorage;
+  using PipelineStorage = typename SharedStorage::PipelineStorage;
+
+  static constexpr int K_PIPE_MAX = DispatchPolicy::Stages;
+  static constexpr int K_PIPE_MMAS = DispatchPolicy::PipelineAsyncMmaStages;
+  static constexpr uint32_t TmaTransactionBytes =
+      (size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast<uint32_t>(sizeof(InternalElementA)))+
+      (size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast<uint32_t>(sizeof(InternalElementB)));
+
+  // Host side kernel arguments
+  struct Arguments {
+    ElementA const* ptr_A{nullptr};
+    ElementB const* ptr_B{nullptr};
+  };
+
+private:
+  // Note that for fprop and dgrad kernel, the tma load mode is im2col for tensor A and tiled for
+  // tensor B while for wgrad kernel, the tma load mode is tiled for tensor A and im2col for tensor
+  // B since operand A, B is swapped.
+  // Get tma_load_a instance.
+  template <class TensorA>
+  static constexpr auto
+  get_tma_load_a_instance(TensorA const& tensor_a, ProblemShape const& problem_shape) {
+    if constexpr (is_im2col_A) {
+      // compute the upper and lower corners based on the conv padding
+      auto lower_corner_whd = detail::compute_lower_corner_whd(problem_shape);
+      auto upper_corner_whd = detail::compute_upper_corner_whd(problem_shape);
+      auto lower_srt = detail::compute_lower_srt(problem_shape);
+
+      // The calculation of gbasis strides for dgrad kernel needs perform negate for dilation values.
+      cute::array<int32_t, NumSpatialDimensions> stride_srt{};
+      for (int i = 0; i < NumSpatialDimensions; ++i) {
+        stride_srt[i] = ConvOp == conv::Operator::kDgrad ?
+            -problem_shape.dilation[NumSpatialDimensions-1-i] :
+            problem_shape.dilation[NumSpatialDimensions-1-i];
+      }
+  
+      return make_im2col_tma_copy(
+          GmemTiledCopyA{},
+          tensor_a,
+          SmemLayoutA{}(_,_,_0{}),
+          product_each(shape(SmemLayoutA{}(_,_,_0{}))),
+          size<1>(ClusterShape{}),
+          shape(lower_corner_whd),
+          shape(upper_corner_whd),
+          cute::reverse(shape(problem_shape.lower_padding)),
+          cute::reverse(shape(problem_shape.upper_padding)),
+          cute::reverse(shape(problem_shape.traversal_stride)),
+          shape(lower_srt),
+          shape(stride_srt));
+    }
+    // TMA tiled mode for tensor A in wgrad kernel.
+    else {
+      return make_tma_copy(
+          GmemTiledCopyA{},
+          tensor_a,
+          SmemLayoutA{}(_,_,_0{}),
+          make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})),
+          size<1>(ClusterShape{}));
+    }
+  }
+
+  // Get tma_load_b instance.
+  template <class TensorB>
+  static constexpr auto
+  get_tma_load_b_instance(TensorB const& tensor_b, ProblemShape const& problem_shape) {
+    // TMA im2col mode for tensor B in wgrad kernel.
+    if constexpr (is_im2col_B) {
+      // compute the upper and lower corners based on the conv padding
+      auto lower_corner_whd = detail::compute_lower_corner_whd(problem_shape);
+      auto upper_corner_whd = detail::compute_upper_corner_whd(problem_shape);
+      auto lower_srt = detail::compute_lower_srt(problem_shape);
+  
+      return make_im2col_tma_copy(
+          GmemTiledCopyB{},
+          tensor_b,
+          SmemLayoutB{}(_,_,_0{}),
+          product_each(shape(SmemLayoutB{}(_,_,_0{}))),
+          size<0>(ClusterShape{}),
+          shape(lower_corner_whd),
+          shape(upper_corner_whd),
+          cute::reverse(shape(problem_shape.lower_padding)),
+          cute::reverse(shape(problem_shape.upper_padding)),
+          cute::reverse(shape(problem_shape.traversal_stride)),
+          shape(lower_srt),
+          cute::reverse(shape(problem_shape.dilation)));
+    }
+    else {
+      return make_tma_copy(
+          GmemTiledCopyB{},
+          tensor_b,
+          SmemLayoutB{}(_,_,_0{}),
+          make_shape(shape<1>(TileShape{}), shape<2>(TileShape{})),
+          size<0>(ClusterShape{}));
+    }
+  }
+
+public:
+
+  // Converts a ConvProblemShape into an MNKL problem shape via the cutlass::conv::detail helpers.
+  static constexpr auto
+  get_problem_shape_MNKL(ProblemShape const& problem_shape) {
+    // The copy-atom static_asserts in this struct require an im2col load on A (fprop/dgrad) or on B (wgrad).
+    if constexpr (is_im2col_A || is_im2col_B) {
+      // transformation + im2col linearization
+      return cutlass::conv::detail::get_linearized_problem_shape_MNKL(problem_shape);
+    }
+    else {
+      // transformation only (no im2col load on either operand)
+      return cutlass::conv::detail::get_transformed_problem_shape_MNKL(problem_shape);
+    }
+  }
+
+  // Device side kernel params
+  struct Params {
+    using _Submode = decltype(take<0,NumTensorDimensions-1>(typename ProblemShape::TensorExtent{}));
+
+    // Assumption: StrideA is congruent with Problem_MK
+    // Select TMA load type according to convolution operator.
+    using TensorShapeA = cute::conditional_t<ConvOp == conv::Operator::kWgrad,
+        decltype(repeat_like(StrideA{}, int32_t(0))),
+        decltype(make_shape(_Submode{}, int(0)))>;
+
+    using TensorShapeB = cute::conditional_t<ConvOp == conv::Operator::kWgrad,
+        decltype(make_shape(int(0), _Submode{})),
+        decltype(repeat_like(StrideB{}, int32_t(0)))>;
+
+    using TMA_A = decltype(get_tma_load_a_instance(
+        make_tensor(
+            make_gmem_ptr(static_cast<InternalElementA const*>(nullptr)),
+            make_layout(TensorShapeA{}, StrideA{})),
+        ConvProblemShape<ConvOp, NumSpatialDimensions>{}));
+
+    using TMA_B = decltype(get_tma_load_b_instance(
+        make_tensor(
+            make_gmem_ptr(static_cast<InternalElementB const*>(nullptr)),
+            make_layout(TensorShapeB{}, StrideB{})),
+        ConvProblemShape<ConvOp, NumSpatialDimensions>{}));
+
+    // Members
+    TMA_A tma_load_a;
+    TMA_B tma_load_b;
+    uint32_t tma_transaction_bytes = TmaTransactionBytes;
+  };
+
+  //
+  // Methods
+  //
+
+  // Lowers the host side user facing arguments to the kernel facing launch params
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    (void) workspace;  // unused; the cast only marks the parameter as intentionally ignored
+    // Build full gmem tensors for A and B from the flat shape/stride arrays of ConvProblemShape<ConvOp, N>.
+    // tma desc creation depends on the original untransformed domain.
+
+    // A extents.
+    auto shape_A_orig = problem_shape.get_shape_A();
+    // B extents.
+    auto shape_B_orig = problem_shape.get_shape_B();
+
+    // Fill inferred cute strides from flat stride arrays
+    auto dA = make_cute_packed_stride(StrideA{}, problem_shape.stride_A, ConvOp);
+    auto dB = make_cute_packed_stride(StrideB{}, problem_shape.stride_B, ConvOp);
+
+    auto ptr_A = reinterpret_cast<InternalElementA const*>(args.ptr_A);  // view the user pointer as the TMA-facing element type
+    auto ptr_B = reinterpret_cast<InternalElementB const*>(args.ptr_B);
+
+    Tensor tensor_a = make_tensor(make_gmem_ptr(ptr_A), make_layout(shape_A_orig, dA));
+    Tensor tensor_b = make_tensor(make_gmem_ptr(ptr_B), make_layout(shape_B_orig, dB));
+
+    auto tma_load_a = get_tma_load_a_instance(tensor_a, problem_shape);
+    auto tma_load_b = get_tma_load_b_instance(tensor_b, problem_shape);
+
+    return {  // initializer order follows the Params members: tma_load_a, tma_load_b, tma_transaction_bytes
+      tma_load_a,
+      tma_load_b,
+      TmaTransactionBytes
+    };
+  }
+  
+  template <class ProblemShape>
+  static bool
+  can_implement(
+      ProblemShape const& problem_shape,
+      Arguments const& args) {  // args is not inspected by any check below
+    // Returns true only if the stride, alignment, padding and group-count checks below all pass.
+    bool implementable = true;
+    // channel mode is major: the last stride entry of both A and B must be 1
+    implementable &= problem_shape.stride_A[NumTensorDimensions-1] == 1;
+    implementable &= problem_shape.stride_B[NumTensorDimensions-1] == 1;
+
+    constexpr int tma_alignment_bits = 128;  // converted below into a per-operand element count
+    // A extents.
+    auto shape_A_orig = problem_shape.get_shape_A();
+    // B extents.
+    auto shape_B_orig = problem_shape.get_shape_B();
+    constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
+    implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_A>(shape_A_orig, StrideA{});
+    constexpr int min_tma_aligned_elements_B = tma_alignment_bits / cutlass::sizeof_bits<ElementB>::value;
+    implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_B>(shape_B_orig, StrideB{});
+
+    if (!implementable) {  // reached for a failed unit-stride check as well as a failed alignment check
+      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
+      return false;
+    }
+
+    // Check valid padding values for TMA_LOAD_IM2COL: each padding must lie in [0, padding_limit], chosen by RankS
+    constexpr int padding_limit = (ProblemShape::RankS == 1) ? 65536 : (ProblemShape::RankS == 2 ? 256 : 16);
+    for (int i = 0; i < problem_shape.RankS; ++i) {
+      implementable = implementable && problem_shape.lower_padding[i] <= padding_limit && problem_shape.lower_padding[i] >= 0;
+      implementable = implementable && problem_shape.upper_padding[i] <= padding_limit && problem_shape.upper_padding[i] >= 0;
+    }
+
+    if (!implementable) {
+      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Padding values don't meet requirements for TMA LOAD IM2COL.\n");
+      return false;
+    }
+
+    if (problem_shape.groups > 1) {
+      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: This kernel does not support conv groups > 1.\n");
+      return false;
+    }
+
+    return true;
+  }
+
+  /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance
+  CUTLASS_DEVICE
+  static void prefetch_tma_descriptors(Params const& mainloop_params) {
+    cute::prefetch_tma_descriptor(mainloop_params.tma_load_a.get_tma_descriptor());
+    cute::prefetch_tma_descriptor(mainloop_params.tma_load_b.get_tma_descriptor());
+  }
+
+  /// Set up the data needed by this collective for load and mma.
+  /// Returns a tuple of tensors. The collective and the kernel layer have the contract that the
+  /// returned tuple must contain at least two elements, with the first two elements being:
+  /// gA_mk - The tma tensor, A after a local tile so it has shape  (BLK_M,BLK_K,m,k)
+  /// gB_nk - The tma tensor, B after a local tile so it has shape  (BLK_N,BLK_K,n,k)
+  /// The rest of the tensors can be specified as needed by this collective.
+  /// The dimensions of gA_mk and gB_nk do not contain L to maintain consistency with
+  /// StrideA and StrideB set up for TMA.
+  template <class ProblemShapeMNKL>
+  CUTLASS_DEVICE auto
+  load_init(ProblemShapeMNKL const& problem_shape_MNKL, Params const& mainloop_params){
+    // NOTE: this member is neither static nor const-qualified, so it has to be called on a non-const collective object.
+    using X = Underscore;
+    // Separate out problem shape for convenience (L is unpacked but not used below)
+    auto [M, N, K, L] = problem_shape_MNKL;
+
+    // TMA requires special handling of strides to deal with coord codomain mapping
+    // Represent the full tensors -- get these from TMA
+    Tensor mA_mk = mainloop_params.tma_load_a.get_tma_tensor(make_shape(M,K));                            // (m,k)
+    Tensor mB_nk = mainloop_params.tma_load_b.get_tma_tensor(make_shape(N,K));                            // (n,k)
+
+    // Make tiled views, defer the slice
+    Tensor gA_mk = local_tile(mA_mk, TileShape{}, make_coord(_,_,_), Step<_1, X,_1>{});        // (BLK_M,BLK_K,m,k)
+    Tensor gB_nk = local_tile(mB_nk, TileShape{}, make_coord(_,_,_), Step< X,_1,_1>{});        // (BLK_N,BLK_K,n,k)
+
+    return cute::make_tuple(gA_mk, gB_nk);
+  }
+
+  /// Producer side of the mainloop: copies one A tile and one B tile from gmem to smem per k-tile,
+  /// one pipeline stage at a time. Only the lane chosen by elect_one_sync() issues the copies.
+  template <
+    class TensorA, class TensorB,
+    class KTileIterator, class BlockCoord
+  >
+  CUTLASS_DEVICE void
+  load(
+      Params const& mainloop_params,
+      MainloopPipeline pipeline,
+      PipelineState smem_pipe_producer_state,
+      cute::tuple<TensorA, TensorB> const& load_inputs,
+      BlockCoord const& blk_coord,
+      KTileIterator k_tile_iter, int k_tile_count,
+      int thread_idx,  // not referenced in this function
+      uint32_t block_rank_in_cluster,
+      TensorStorage& shared_tensors) {
+
+    int lane_predicate = cute::elect_one_sync();
+    if (lane_predicate) {
+      Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});        // (BLK_M,BLK_K,PIPE)
+      Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});        // (BLK_N,BLK_K,PIPE)
+
+      //
+      // Prepare the TMA loads for A and B
+      //
+      constexpr uint32_t cluster_shape_x = get<0>(ClusterShape());
+
+      // Split the linear rank into (x, y) using the cluster extent in mode 0
+      uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x};
+      auto block_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y);
+      auto block_tma_b = mainloop_params.tma_load_b.get_slice(cluster_local_block_id.x);
+
+      auto [gA_mk, gB_nk] = load_inputs;
+
+      // Partition the inputs based on the current block coordinates (k_coord and l_coord are not used below).
+      auto [m_coord, n_coord, k_coord, l_coord] = blk_coord;
+
+      Tensor gA = gA_mk(_,_,m_coord,_);                                                     // (BLK_M,BLK_K,k)
+      Tensor gB = gB_nk(_,_,n_coord,_);                                                     // (BLK_N,BLK_K,k)
+
+      // Applies the mapping from block_tma_a
+      Tensor tAgA = block_tma_a.partition_S(gA);                                                 // (TMA,TMA_M,TMA_K,k)
+      Tensor tAsA = block_tma_a.partition_D(sA);                                              // (TMA,TMA_M,TMA_K,PIPE)
+
+      Tensor tBgB = block_tma_b.partition_S(gB);                                                 // (TMA,TMA_N,TMA_K,k)
+      Tensor tBsB = block_tma_b.partition_D(sB);                                              // (TMA,TMA_N,TMA_K,PIPE)
+
+      uint16_t mcast_mask_a = 0;
+      uint16_t mcast_mask_b = 0;
+
+      // Build the multicast masks; they stay 0 unless the corresponding copy atom is a multicast one.
+      // A: set one bit per n for this block's x. B (below): set one bit per m for this block's y.
+      if constexpr (cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_IM2COL_MULTICAST> ||
+                    cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>) {
+        auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{}; // (m,n) -> block_id
+        for (int n = 0; n < size<1>(block_layout); ++n) {
+          mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{}));
+        }
+      }
+
+      if constexpr (cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_IM2COL_MULTICAST> ||
+                    cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>) {
+        auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{}; // (m,n) -> block_id
+        for (int m = 0; m < size<0>(block_layout); ++m) {
+          mcast_mask_b |= (uint16_t(1) << block_layout(m,cluster_local_block_id.y,Int<0>{}));
+        }
+      }
+
+      // Mainloop
+      CUTLASS_PRAGMA_NO_UNROLL
+      for ( ; k_tile_count > 0; --k_tile_count) {
+        // LOCK smem_pipe_producer_state for _writing_
+        pipeline.producer_acquire(smem_pipe_producer_state);
+
+        //
+        // Copy gmem to smem for *k_tile_iter
+        //
+
+        using BarrierType = typename MainloopPipeline::ProducerBarrierType;
+        BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_producer_state);
+
+        int write_stage = smem_pipe_producer_state.index();
+
+        copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage));
+        copy(mainloop_params.tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage));
+        ++k_tile_iter;
+
+        // Advance smem_pipe_producer_state
+        ++smem_pipe_producer_state;
+      }
+    }
+  }
+
+  /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster
+  CUTLASS_DEVICE void
+  load_tail(MainloopPipeline pipeline, PipelineState smem_pipe_producer_state) {
+    int lane_predicate = cute::elect_one_sync();
+
+    // Issue the epilogue waits from the elected lane only
+    if (lane_predicate) {
+      /* This helps avoid early exit of blocks in a Cluster.
+       * Waits for all stages to either be released (all
+       * Consumer UNLOCKs), or, if the stage was never used,
+       * it is simply acquired, since its phase is
+       * still inverted from make_producer_start_state.
+       */
+      pipeline.producer_tail(smem_pipe_producer_state);
+    }
+  }
+
+  /// Perform a collective-scoped matrix multiply-accumulate
+  /// Consumer Perspective: issues up to K_PIPE_MMAS prologue tiles without releasing, then one release per remaining tile.
+  template <class FrgTensorC>
+  CUTLASS_DEVICE void
+  mma(MainloopPipeline pipeline,
+      PipelineState smem_pipe_consumer_state,
+      FrgTensorC& accum,
+      int k_tile_count,
+      int thread_idx,
+      TensorStorage& shared_tensors,
+      Params const& mainloop_params) {  // mainloop_params is not referenced in this function
+    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
+
+    Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});          // (BLK_M,BLK_K,PIPE)
+    Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});          // (BLK_N,BLK_K,PIPE)
+
+    //
+    // Define C accumulators and A/B partitioning
+    //
+
+    TiledMma tiled_mma;
+    auto thread_mma = tiled_mma.get_thread_slice(thread_idx);
+
+    Tensor tCsA = thread_mma.partition_A(sA);                                                 // (MMA,MMA_M,MMA_K,PIPE)
+    Tensor tCsB = thread_mma.partition_B(sB);                                                 // (MMA,MMA_N,MMA_K,PIPE)
+
+    // Allocate "fragments/descriptors"
+    Tensor tCrA = thread_mma.make_fragment_A(tCsA);                                           // (MMA,MMA_M,MMA_K,PIPE)
+    Tensor tCrB = thread_mma.make_fragment_B(tCsB);                                           // (MMA,MMA_N,MMA_K,PIPE)
+
+    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum));                                                         // M
+    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum));                                                         // N
+    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB));                                                          // K
+    CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB));                                                       // PIPE
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));                                         // PIPE
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));                                         // PIPE
+
+    //
+    // PIPELINED MAIN LOOP
+    //
+    static_assert((0 <= K_PIPE_MMAS) && (K_PIPE_MMAS <  K_PIPE_MAX),
+        "ERROR : Incorrect number of MMAs in flight");
+
+    // We release buffers to producer warps(dma load) with some mmas in flight
+    PipelineState smem_pipe_release = smem_pipe_consumer_state;
+
+    // Prologue GMMAs
+    int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
+
+    tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
+
+    warpgroup_fence_operand(accum);
+    CUTLASS_PRAGMA_UNROLL
+    for (int k_tile_prologue = prologue_mma_count; k_tile_prologue > 0; --k_tile_prologue) {
+      // WAIT on smem_pipe_consumer_state until its data are available (phase bit flips from rdPhaseBit value)
+      pipeline.consumer_wait(smem_pipe_consumer_state);
+
+      int read_stage = smem_pipe_consumer_state.index();
+      warpgroup_arrive();
+      // Unroll the K mode manually to set scale D to 1
+      CUTLASS_PRAGMA_UNROLL
+      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
+        // (V,M) x (V,N) => (V,M,N) for the k_block slice of stage read_stage
+        cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accum);
+        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+      }
+
+      warpgroup_commit_batch();
+
+      ++smem_pipe_consumer_state;  // smem_pipe_release is not advanced here: no stage is released in the prologue
+    }
+
+    warpgroup_fence_operand(accum);
+    // Mainloop GMMAs
+    k_tile_count -= prologue_mma_count;
+
+    CUTLASS_PRAGMA_NO_UNROLL
+    for ( ; k_tile_count > 0; --k_tile_count) {
+      // WAIT on smem_pipe_consumer_state until its data are available (phase bit flips from rdPhaseBit value)
+      pipeline.consumer_wait(smem_pipe_consumer_state);
+
+      //
+      // Compute on k_tile
+      //
+
+      int read_stage = smem_pipe_consumer_state.index();
+      warpgroup_fence_operand(accum);
+      warpgroup_arrive();
+      // Unroll the K mode manually to set scale D to 1
+      CUTLASS_PRAGMA_UNROLL
+      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
+        // (V,M) x (V,N) => (V,M,N)
+        cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accum);
+        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+      }
+      warpgroup_commit_batch();
+
+      /// Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure the stage at smem_pipe_release is consumed
+      warpgroup_wait<K_PIPE_MMAS>();
+      warpgroup_fence_operand(accum);
+
+      // UNLOCK smem_pipe_release, done _computing_ on it
+      pipeline.consumer_release(smem_pipe_release);
+
+      // Advance smem_pipe_consumer_state and smem_pipe_release
+      ++smem_pipe_consumer_state;
+      ++smem_pipe_release;
+    }
+
+    warpgroup_fence_operand(accum);
+  }
+
+  /// Perform a Consumer Epilogue to release all buffers
+  CUTLASS_DEVICE void
+  mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) {
+    // Same split as in mma(): the prologue tiles were consumed there without a matching release
+    int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
+    k_tile_count -= prologue_mma_count;
+
+    smem_pipe_release.advance(k_tile_count);  // skip the stages mma() already released, one per mainloop iteration
+    // What remains are the prologue_mma_count stages still held by the consumer.
+    // Wait on all GMMAs to complete
+    warpgroup_wait<0>();
+
+    for (int count = 0; count < prologue_mma_count; ++count) {
+      pipeline.consumer_release(smem_pipe_release);                 // UNLOCK smem_pipe_release, done _computing_ on it
+      ++smem_pipe_release;
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::conv::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/conv2d_problem_size.h b/lightllm-kernel/cutlass/include/cutlass/conv/conv2d_problem_size.h
new file mode 100755
index 000000000..d2e895299
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/conv2d_problem_size.h
@@ -0,0 +1,654 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief This file contains definitions and utility functions for describing convolution problem sizes.
+
+  Conv2dProblem description:
+    activation (NHWC), 
+    filter (KRSC), 
+    output (NPQK), 
+    padding (pad_h, pad_w),
+    stride (stride_h, stride_w),
+    dilation (dilation_h, dilation_w).
+    
+  Free functions to map:
+    Map tensor extents (Conv2d -> ImplicitGemm)      : implicit_gemm_tensor_[a|b|c]_extent(ConvolutionOperator)
+    Map tensor sizes (Conv2d -> ImplicitGemm)        : implicit_gemm_tensor_[a|b|c]_size(ConvolutionOperator)
+    Map tensor problem sizes (Conv2d -> ImplicitGemm): implicit_gemm_problem_size(ConvolutionOperator)
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/tensor_coord.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/gemm/gemm_enumerated_types.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/functional.h"
+
+namespace cutlass {
+namespace conv {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Problem size structure
+struct Conv2dProblemSize {
+
+  // Conv2d strictly problem size parameters
+  int N, H, W, C, P, Q, K, R, S;
+  int pad_h, pad_w;
+  int stride_h, stride_w;
+  int dilation_h, dilation_w;
+  Mode mode;
+
+  // Conv2d implementation-related parameters 
+  int split_k_slices;
+  int groups;
+
+  //
+  // Methods
+  //
+
+public:
+  CUTLASS_HOST_DEVICE
+  Conv2dProblemSize():
+    N(0), H(0), W(0), C(0), P(0), Q(0), K(0), R(0), S(0),
+    pad_h(0), pad_w(0), stride_h(1), stride_w(1), dilation_h(1), dilation_w(1),
+    mode(Mode::kConvolution), split_k_slices(1), groups(1) { }
+ 
+  /// Constructor for default padding, stride, dilation, and split-K
+  CUTLASS_HOST_DEVICE
+  Conv2dProblemSize(
+    int N,
+    int H,
+    int W,
+    int C,
+    int P,
+    int Q,
+    int K,
+    int R,
+    int S,
+    Mode mode
+  ): 
+    N(N), H(H), W(W), C(C), P(P), Q(Q), K(K), R(R), S(S),
+    pad_h(R / 2), pad_w(S / 2), stride_h(1), stride_w(1), dilation_h(1), dilation_w(1),
+    mode(mode), split_k_slices(1), groups (1) { }
+  
+  /// Constructor
+  CUTLASS_HOST_DEVICE
+  Conv2dProblemSize(
+    int N,
+    int H,
+    int W,
+    int C,
+    int K,
+    int R,
+    int S,
+    int P,
+    int Q,
+    int pad_h,
+    int pad_w,
+    int stride_h,
+    int stride_w,
+    int dilation_h,
+    int dilation_w,
+    Mode mode,
+    int split_k_slices = 1,
+    int groups = 1
+  ):
+    N(N), H(H), W(W), C(C), P(P), Q(Q), K(K), R(R), S(S),
+    pad_h(pad_h), pad_w(pad_w), stride_h(stride_h), stride_w(stride_w),
+    dilation_h(dilation_h), dilation_w(dilation_w), 
+    mode(mode), split_k_slices(split_k_slices), groups (groups) { }
+
+  /// Constructs convolution problem size from cutlass Tensor4DCoord and MatrixCoord 
+  // set user-defined output size and sets P and Q (include all data members in ctor)
+  CUTLASS_HOST_DEVICE
+  Conv2dProblemSize(
+    cutlass::Tensor4DCoord input_size,    // NHWC
+    cutlass::Tensor4DCoord filter_size,   // KRSC
+    cutlass::Tensor4DCoord padding,       // pad_h, _, pad_w, _
+    cutlass::MatrixCoord stride,          // stride_h, stride_w
+    cutlass::MatrixCoord dilation,        // dilation_h, dilation_w
+    cutlass::Tensor4DCoord output_size,   // NPQK
+    cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation,
+    int split_k_slices = 1,
+    int groups = 1
+  ):
+    N(input_size.n()), H(input_size.h()), W(input_size.w()), C(input_size.c()),
+    P(output_size.h()), Q(output_size.w()),
+    K(filter_size.n()), R(filter_size.h()), S(filter_size.w()),
+    pad_h(padding[0]), pad_w(padding[2]),
+    stride_h(stride.row()), stride_w(stride.column()),
+    dilation_h(dilation.row()), dilation_w(dilation.column()),
+    mode(mode), split_k_slices(split_k_slices), groups(groups) {}
+
+  /// Constructs convolution problem size from cutlass Tensor4DCoord and MatrixCoord 
+  // computes output size and sets P and Q (skip output from ctor arguments)
+  CUTLASS_HOST_DEVICE  
+  Conv2dProblemSize(
+    cutlass::Tensor4DCoord input_size,   // NHWC
+    cutlass::Tensor4DCoord filter_size,  // KRSC
+    cutlass::Tensor4DCoord padding,      // pad_h, upper_pad_h, pad_w, upper_pad_w
+    cutlass::MatrixCoord stride,         // stride_h, stride_w
+    cutlass::MatrixCoord dilation,       // dilation_h, dilation_w
+    cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation,
+    int split_k_slices = 1,
+    int groups = 1
+  ):
+    N(input_size.n()), H(input_size.h()), W(input_size.w()), C(input_size.c()),
+    K(filter_size.n()), R(filter_size.h()), S(filter_size.w()),
+    pad_h(padding[0]), pad_w(padding[2]),
+    stride_h(stride.row()), stride_w(stride.column()),
+    dilation_h(dilation.row()), dilation_w(dilation.column()),
+    mode(mode), split_k_slices(split_k_slices), groups(groups) {
+      // set output P and Q
+      P = ((H + pad_h + padding[1] - R * dilation_h) / stride_h) + 1;
+      Q = ((W + pad_w + padding[3] - S * dilation_w) / stride_w) + 1;
+    }
+
+  /// Constructs convolution problem size from cutlass Tensor4DCoord and MatrixCoord 
+  // set user-defined output size and sets P and Q (skip padding, striding, and dilation)
+  CUTLASS_HOST_DEVICE
+  Conv2dProblemSize(
+    cutlass::Tensor4DCoord input_size,    // NHWC
+    cutlass::Tensor4DCoord filter_size,   // KRSC
+    cutlass::Tensor4DCoord output_size,   // NPQK
+    cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation,
+    int split_k_slices = 1,
+    int groups = 1
+  ):
+    N(input_size.n()), H(input_size.h()), W(input_size.w()), C(input_size.c()),
+    P(output_size.h()), Q(output_size.w()),
+    K(filter_size.n()), R(filter_size.h()), S(filter_size.w()),
+    pad_h(R / 2), pad_w(S / 2), stride_h(1), stride_w(1),
+    dilation_h(1), dilation_w(1),
+    mode(mode), split_k_slices(split_k_slices), groups(groups) {}
+
+  /// Returns a copy of this problem with the convolution mode set to mode_; *this is left unchanged
+  CUTLASS_HOST_DEVICE
+  Conv2dProblemSize reset_mode(cutlass::conv::Mode mode_) const {
+    Conv2dProblemSize tmp(*this);
+    tmp.mode = mode_;
+    return tmp;
+  }
+
+  /// Returns a copy of this problem with split_k_slices set to split_k_slices_; *this is left unchanged
+  CUTLASS_HOST_DEVICE
+  Conv2dProblemSize reset_split_k_slices(int split_k_slices_) const {
+    Conv2dProblemSize tmp(*this);
+    tmp.split_k_slices = split_k_slices_;
+    return tmp;
+  }
+
+  /// Equality operator: compares extents, padding, stride, and dilation only (ignores mode, split_k_slices, and groups)
+  CUTLASS_HOST_DEVICE
+  bool operator==(Conv2dProblemSize const &conv) const {
+    return (
+      (N == conv.N) && (H == conv.H) && (W == conv.W) && (C == conv.C) &&
+      (K == conv.K) && (R == conv.R) && (S == conv.S) &&
+      (P == conv.P) && (Q == conv.Q) &&
+      (pad_h == conv.pad_h) && (pad_w == conv.pad_w) &&
+      (stride_h == conv.stride_h) && (stride_w == conv.stride_w) &&
+      (dilation_h == conv.dilation_h) && (dilation_w == conv.dilation_w)
+    );  
+  }
+
+  /// Inequality operator
+  CUTLASS_HOST_DEVICE
+  bool operator!=(Conv2dProblemSize const &rhs) const {
+    return !(*this == rhs);
+  }
+
+  /// Returns activation extent as Tensor4DCoord
+  CUTLASS_HOST_DEVICE
+  cutlass::Tensor4DCoord activation_extent() const {
+
+    return cutlass::Tensor4DCoord ({N, H, W, C});
+  }
+
+  /// Returns filter extent as Tensor4DCoord
+  CUTLASS_HOST_DEVICE
+  cutlass::Tensor4DCoord filter_extent(bool is_deconv = false) const {
+
+    return is_deconv ? cutlass::Tensor4DCoord ({C, R, S, K / groups})
+        : cutlass::Tensor4DCoord ({K, R, S, C / groups});
+  }
+
+  /// Returns output extent as Tensor4DCoord
+  CUTLASS_HOST_DEVICE
+  cutlass::Tensor4DCoord output_extent() const {
+
+    return cutlass::Tensor4DCoord ({N, P, Q, K});
+  }
+
+  /// Returns activation size (N * H * W * C) in number of elements, computed in 64-bit arithmetic
+  CUTLASS_HOST_DEVICE
+  int64_t activation_size() const {
+    // Widen the first factor so the whole product is evaluated in int64_t; an int product can overflow first
+    return static_cast<int64_t>(N) * H * W * C;
+  }
+
+  /// Returns filter size (K * R * S * C / groups) in number of elements, computed in 64-bit arithmetic
+  CUTLASS_HOST_DEVICE
+  int64_t filter_size() const {
+    // Widen the first factor so the product is evaluated in int64_t; the division by groups stays last, as before
+    return static_cast<int64_t>(K) * R * S * C / groups;
+  }
+
+  /// Returns output size (N * P * Q * K) in number of elements, computed in 64-bit arithmetic
+  CUTLASS_HOST_DEVICE
+  int64_t output_size() const {
+    // Widen the first factor so the whole product is evaluated in int64_t; an int product can overflow first
+    return static_cast<int64_t>(N) * P * Q * K;
+  }
+  
+  /// Returns padding as Tensor4DCoord {pad_h, pad_h, pad_w, pad_w}
+  CUTLASS_HOST_DEVICE
+  cutlass::Tensor4DCoord padding() const {
+    // Only pad_h / pad_w are stored, so each is repeated; a distinct upper padding passed to a constructor is not reproduced
+    return cutlass::Tensor4DCoord ({pad_h, pad_h, pad_w, pad_w});
+  }
+
+  /// Returns stride as MatrixCoord
+  CUTLASS_HOST_DEVICE
+  cutlass::MatrixCoord stride() const {
+
+    return cutlass::MatrixCoord ({stride_h, stride_w});
+  }
+
+  /// Returns dilation as MatrixCoord
+  CUTLASS_HOST_DEVICE
+  cutlass::MatrixCoord dilation() const {
+
+    return cutlass::MatrixCoord ({dilation_h, dilation_w});
+  }
+
+  /////////////////////////////////////////////////////////////////
+  //        Methods used for strided dgrad implementation
+  /////////////////////////////////////////////////////////////////
+  /// Number of filter r positions to accumulate in gemm-k dim
+  CUTLASS_HOST_DEVICE
+  int num_gemm_k_filter_r(int r) const {
+    return ((R - r + stride_h - 1) / stride_h);
+  }
+
+  /// Number of filter s positions to accumulate in gemm-k dim
+  CUTLASS_HOST_DEVICE
+  int num_gemm_k_filter_s(int s) const {
+    return ((S - s + stride_w - 1) / stride_w);
+  }
+
+  /// Number of filter positions to accumulate in gemm-k dim
+  CUTLASS_HOST_DEVICE
+  int num_gemm_k_filter_positions(int r, int s) const {
+    return num_gemm_k_filter_r(r) * num_gemm_k_filter_s(s);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//                                  ImplicitGemm helper functions                                 //
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Determine the problem size of the implicit GEMM operation
+CUTLASS_HOST_DEVICE
+cutlass::gemm::GemmCoord implicit_gemm_problem_size(
+  Operator conv_operator, 
+  Conv2dProblemSize const &problem_size) {
+  // Compute problem size; the three GemmCoord arguments are presumably the GEMM (M, N, K) extents -- confirm
+  switch (conv_operator) {
+  case Operator::kFprop:
+    return gemm::GemmCoord(
+      problem_size.N * problem_size.P * problem_size.Q,
+      problem_size.K,
+      problem_size.R * problem_size.S * problem_size.C / problem_size.groups  // only this case divides by groups
+    );
+  case Operator::kDeconv:  // deconv shares the dgrad mapping
+  case Operator::kDgrad:
+    return gemm::GemmCoord(
+      problem_size.N * problem_size.H * problem_size.W,
+      problem_size.C,
+      problem_size.R * problem_size.S * problem_size.K
+    );
+  case Operator::kWgrad:
+    return gemm::GemmCoord(
+      problem_size.K,
+      problem_size.R * problem_size.S * problem_size.C,
+      problem_size.N * problem_size.P * problem_size.Q
+    );
+  default:
+    break;
+  }
+  return gemm::GemmCoord();  // any other operator: default-constructed GemmCoord
+}
+
+// Determine the number of gemm_k iterations for conv2d problem using implicit gemm algorithm
+CUTLASS_HOST_DEVICE
+int implicit_gemm_k_iterations(
+  Operator conv_operator, 
+  int threadblock_K, 
+  Conv2dProblemSize const &problem_size,
+  IteratorAlgorithm algorithm = IteratorAlgorithm::kAnalytic,
+  GroupMode group_mode = GroupMode::kNone,
+  int threadblock_N = 0) {
+  // Result stays 0 for any (group_mode, algorithm, conv_operator) combination not handled below
+  int iterations = 0;
+
+  if (group_mode == GroupMode::kNone) {
+
+    if (algorithm == IteratorAlgorithm::kFixedChannels) {
+      // NOTE(review): C == 0, or C > threadblock_K (quotient 0), ends in a division by zero -- confirm callers rule this out
+      int positions_per_iteration = threadblock_K / problem_size.C;
+      switch (conv_operator) {
+      case Operator::kFprop:
+        iterations = (problem_size.R * problem_size.S + positions_per_iteration - 1 ) / positions_per_iteration;
+        break;
+
+      default:
+        break;
+      }
+    }
+    else if (algorithm == IteratorAlgorithm::kFewChannels) {
+
+      switch (conv_operator) {
+      case Operator::kFprop:
+        iterations = (problem_size.R * problem_size.S * problem_size.C + threadblock_K - 1 ) / threadblock_K;
+        break;
+
+      default:
+        break;
+      }
+    }
+    else {
+      int elements_per_split_k_slice = 0;
+      // Each case ceil-divides its reduction extent by split_k_slices, then ceil-divides that slice by threadblock_K
+      switch (conv_operator) {
+      case Operator::kFprop:
+        elements_per_split_k_slice = (problem_size.C + problem_size.split_k_slices - 1) / problem_size.split_k_slices;
+        iterations = problem_size.R * problem_size.S * ((elements_per_split_k_slice + threadblock_K - 1) / threadblock_K);
+        break;
+
+      case Operator::kDeconv:
+      case Operator::kDgrad:
+        elements_per_split_k_slice = (problem_size.K + problem_size.split_k_slices - 1) / problem_size.split_k_slices;
+        iterations = problem_size.R * problem_size.S * ((elements_per_split_k_slice + threadblock_K - 1) / threadblock_K);
+        break;
+
+      case Operator::kWgrad:
+        elements_per_split_k_slice = (problem_size.N * problem_size.P * problem_size.Q + problem_size.split_k_slices - 1) / problem_size.split_k_slices;
+        iterations = (elements_per_split_k_slice + threadblock_K - 1) / threadblock_K;
+        break;
+
+      default:
+        break;
+      }
+    }
+
+  } else if (group_mode == GroupMode::kDepthwise) {
+    int channels_per_cta = threadblock_N;
+    // Depthwise: only the analytic algorithm with fprop is handled here
+    if (algorithm == IteratorAlgorithm::kAnalytic) {
+      switch (conv_operator) {
+        case Operator::kFprop:
+          iterations = problem_size.R * problem_size.S *
+                       ((channels_per_cta + threadblock_K - 1) / threadblock_K);
+          break;
+
+        default:
+          break;
+      }
+    }
+  } else {  // Group conv
+
+    int channels_per_group = problem_size.C / problem_size.groups;
+    int k_per_group = problem_size.K / problem_size.groups;
+
+    if (algorithm == IteratorAlgorithm::kAnalytic) {
+      switch (conv_operator) {
+        case Operator::kFprop:
+          iterations = problem_size.R * problem_size.S * ((channels_per_group + threadblock_K - 1) / threadblock_K);
+          // In group conv, if k_per_group < threadblock_N, one Threadblock will calculate multiple groups
+          if (problem_size.groups != 1) {
+            if (k_per_group < threadblock_N) {
+              iterations *= threadblock_N / k_per_group;  // NOTE(review): k_per_group is 0 when K < groups -- confirm callers rule this out
+            }
+          }
+          break;
+
+        default:
+          break;
+      }
+    } else if (algorithm == IteratorAlgorithm::kOptimized) {
+      // Current optimized iterator only support GroupMode::kSingleGroup
+      if (group_mode == GroupMode::kSingleGroup) {
+        switch (conv_operator) {
+          case Operator::kFprop:
+            iterations = problem_size.R * problem_size.S * ((channels_per_group + threadblock_K - 1) / threadblock_K);
+            break;
+
+          default:
+            break;
+        }
+      }
+    }
+
+  }
+
+  return iterations;
+}
+
+
+template <int N = 1, int Output_P = 1, int Output_Q = 1>
+CUTLASS_HOST_DEVICE
+int depthwise_gemm_k_iterations(
+  Operator conv_operator,
+  int threadblock_K,
+  Conv2dProblemSize const &problem_size,
+  IteratorAlgorithm algorithm = IteratorAlgorithm::kAnalytic,
+  GroupMode group_mode = GroupMode::kNone,
+  int threadblock_N = 0) {
+  // Only problem_size and the Output_P / Output_Q template arguments are used; the other parameters are ignored.
+  // Output rows and columns are counted in tiles of Output_P x Output_Q, rounding up.
+  int const tiles_p = (problem_size.P + Output_P - 1) / Output_P;
+  int const tiles_q = (problem_size.Q + Output_Q - 1) / Output_Q;
+  int const total_tiles = problem_size.N * tiles_p * tiles_q;
+  // Ceil-divide the tile count across the split-K slices.
+  return (total_tiles + problem_size.split_k_slices - 1) / problem_size.split_k_slices;
+}
+
+
+CUTLASS_HOST_DEVICE
+int implicit_gemm_k_iterations_per_channel(
+    Operator conv_operator,
+    Conv2dProblemSize const &problem_size,
+    IteratorAlgorithm algorithm = IteratorAlgorithm::kAnalytic) {
+
+  // Returns the number of filter positions (R * S) iterated per channel, or 0 when not applicable.
+  // Only the analytic and optimized iterator algorithms are handled.
+  bool const supported_algorithm =
+      (algorithm == IteratorAlgorithm::kAnalytic) ||
+      (algorithm == IteratorAlgorithm::kOptimized);
+  if (!supported_algorithm) {
+    return 0;
+  }
+
+  // Fprop, deconv, and dgrad share the same count; any other operator yields 0.
+  bool const supported_operator =
+      (conv_operator == Operator::kFprop) ||
+      (conv_operator == Operator::kDeconv) ||
+      (conv_operator == Operator::kDgrad);
+
+  // R * S is evaluated only for a supported operator.
+  return supported_operator ? (problem_size.R * problem_size.S) : 0;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//  Mapping function (ImplicitGemm A, B, C -> Conv Activation, Filter, Output)
+////////////////////////////////////////////////////////////////////////////////
+/// Returns the extent of ImplicitGemm tensor A: the activation for fprop, the output for deconv/dgrad/wgrad
+CUTLASS_HOST_DEVICE
+cutlass::Tensor4DCoord implicit_gemm_tensor_a_extent(
+  Operator conv_operator,
+  Conv2dProblemSize const &problem_size) {
+  if (conv_operator == cutlass::conv::Operator::kFprop) {
+    return problem_size.activation_extent();
+  }
+  bool const uses_output = conv_operator == cutlass::conv::Operator::kDeconv ||
+                           conv_operator == cutlass::conv::Operator::kDgrad ||
+                           conv_operator == cutlass::conv::Operator::kWgrad;
+  // Any operator not named above falls through to a default-constructed extent.
+  return uses_output ? problem_size.output_extent() : cutlass::Tensor4DCoord();
+}
+
+/// Returns the extent of ImplicitGemm tensor B: the filter for fprop/deconv/dgrad, the activation for wgrad
+CUTLASS_HOST_DEVICE
+cutlass::Tensor4DCoord implicit_gemm_tensor_b_extent(
+  Operator conv_operator,
+  Conv2dProblemSize const &problem_size) {
+  if (conv_operator == cutlass::conv::Operator::kWgrad) {
+    return problem_size.activation_extent();
+  }
+  if (conv_operator == cutlass::conv::Operator::kFprop || conv_operator == cutlass::conv::Operator::kDgrad) {
+    return problem_size.filter_extent();
+  }
+  // Deconv requests filter_extent(true); any other operator gets a default-constructed extent.
+  return (conv_operator == cutlass::conv::Operator::kDeconv) ? problem_size.filter_extent(true) : cutlass::Tensor4DCoord();
+}
+
+/// Returns the extent of ImplicitGemm tensor C: the output for fprop, the activation for deconv/dgrad, the filter for wgrad
+CUTLASS_HOST_DEVICE
+cutlass::Tensor4DCoord implicit_gemm_tensor_c_extent(
+  Operator conv_operator,
+  Conv2dProblemSize const &problem_size) {
+  if (conv_operator == cutlass::conv::Operator::kFprop) {
+    return problem_size.output_extent();
+  }
+  if (conv_operator == cutlass::conv::Operator::kDeconv || conv_operator == cutlass::conv::Operator::kDgrad) {
+    return problem_size.activation_extent();
+  }
+  // Wgrad maps to the filter; any other operator gets a default-constructed extent.
+  return (conv_operator == cutlass::conv::Operator::kWgrad) ? problem_size.filter_extent() : cutlass::Tensor4DCoord();
+}
+
+/// Returns the element count of ImplicitGemm tensor A: the activation for fprop, the output for deconv/dgrad/wgrad
+CUTLASS_HOST_DEVICE
+int64_t implicit_gemm_tensor_a_size(
+  Operator conv_operator,
+  Conv2dProblemSize const &problem_size) {
+  if (conv_operator == cutlass::conv::Operator::kFprop) {
+    return problem_size.activation_size();
+  }
+  bool const uses_output = conv_operator == cutlass::conv::Operator::kDeconv ||
+                           conv_operator == cutlass::conv::Operator::kDgrad ||
+                           conv_operator == cutlass::conv::Operator::kWgrad;
+  // Any operator not named above yields 0.
+  return uses_output ? problem_size.output_size() : 0;
+}
+
+/// Returns the element count of ImplicitGemm tensor B: the filter for fprop/deconv/dgrad, the activation for wgrad
+CUTLASS_HOST_DEVICE
+int64_t implicit_gemm_tensor_b_size(
+  Operator conv_operator,
+  Conv2dProblemSize const &problem_size) {
+  int64_t element_count = 0;  // remains 0 for any operator not handled below
+  if (conv_operator == cutlass::conv::Operator::kWgrad) {
+    element_count = problem_size.activation_size();
+  } else if (conv_operator == cutlass::conv::Operator::kFprop || conv_operator == cutlass::conv::Operator::kDeconv ||
+             conv_operator == cutlass::conv::Operator::kDgrad) {
+    element_count = problem_size.filter_size();
+  }
+  return element_count;
+}
+
+/// Returns the element count of ImplicitGemm tensor C: the output for fprop, the activation for deconv/dgrad, the filter for wgrad
+CUTLASS_HOST_DEVICE
+int64_t implicit_gemm_tensor_c_size(
+  Operator conv_operator,
+  Conv2dProblemSize const &problem_size) {
+  if (conv_operator == cutlass::conv::Operator::kFprop) {
+    return problem_size.output_size();
+  }
+  if (conv_operator == cutlass::conv::Operator::kDeconv || conv_operator == cutlass::conv::Operator::kDgrad) {
+    return problem_size.activation_size();
+  }
+  // Wgrad maps to the filter; any other operator yields 0.
+  return (conv_operator == cutlass::conv::Operator::kWgrad) ? problem_size.filter_size() : 0;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//                                  Strided dgrad helper functions                                 //
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Returns the number of CTA tiles along M needed to cover the valid MMAs for one starting filter position
+CUTLASS_HOST_DEVICE
+int strided_dgrad_tile_m_per_filter(
+  Conv2dProblemSize const &problem_size,
+  int tile_size_m) {
+
+  // Ceil-divide H and W by their strides: rows/columns of Dx that need MMAs per starting filter position
+  int const h_count = (problem_size.H + problem_size.stride_h - 1) / problem_size.stride_h;
+  int const w_count = (problem_size.W + problem_size.stride_w - 1) / problem_size.stride_w;
+
+  // Total N x H x W rows of Dx covered for that position
+  int const nhw_count = problem_size.N * h_count * w_count;
+
+  // Ceil-divide by the tile extent along M to get the CTA tile count
+  return (nhw_count + tile_size_m - 1) / tile_size_m;
+}
+
+// Computes starting Dx coord (h, w) for given starting filter position (r, s); results are written to start_h and start_w
+CUTLASS_HOST_DEVICE
+void strided_dgrad_starting_coords(
+  Conv2dProblemSize const &problem_size,
+  FastDivmod const &stride_h_divmod, FastDivmod const &stride_w_divmod,
+  int r, int s,
+  int &start_h, int &start_w) {
+  // NOTE(review): stride_{h,w}_divmod are presumably built from problem_size.stride_{h,w} -- confirm at call sites
+  // function locals for remainder by fast divmod
+  int pad_h_rem_, pad_w_rem_;
+
+  // start_h  = std::abs(problem_size.stride_h - ((problem_size.pad_h % problem_size.stride_h) - r)) % problem_size.stride_h;
+  stride_h_divmod.divmod(pad_h_rem_, problem_size.pad_h);  // per the formula above: pad_h_rem_ = pad_h % stride_h
+  int r_ = absolute_value(problem_size.stride_h - (pad_h_rem_ - r));
+  stride_h_divmod.divmod(start_h, r_);  // per the formula above: start_h = r_ % stride_h
+
+  //start_w  = std::abs(problem_size.stride_w - ((problem_size.pad_w % problem_size.stride_w) - s)) % problem_size.stride_w;
+  stride_w_divmod.divmod(pad_w_rem_, problem_size.pad_w);  // per the formula above: pad_w_rem_ = pad_w % stride_w
+  int s_ = absolute_value(problem_size.stride_w - (pad_w_rem_ - s));
+  stride_w_divmod.divmod(start_w, s_);  // per the formula above: start_w = s_ % stride_w
+}
+
+} // namespace conv
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/conv3d_problem_size.h b/lightllm-kernel/cutlass/include/cutlass/conv/conv3d_problem_size.h
new file mode 100755
index 000000000..9a9514f2d
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/conv3d_problem_size.h
@@ -0,0 +1,513 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief This file contains definitions and utility functions for describing convolution problem sizes.
+
+  Conv3dProblem description:
+    activation (NDHWC), 
+    filter (KTRSC), 
+    output (NZPQK), 
+    padding (pad_d, pad_h, pad_w),
+    stride (stride_d, stride_h, stride_w), 
+    dilation (dilation_d, dilation_h, dilation_w).
+  
+  Free functions to map:
+    Map tensor extents (Conv3d -> ImplicitGemm)      : implicit_gemm_tensor_[a|b|c]_extent(ConvolutionOperator)
+    Map tensor sizes (Conv3d -> ImplicitGemm)        : implicit_gemm_tensor_[a|b|c]_size(ConvolutionOperator)
+    Map tensor problem sizes (Conv3d -> ImplicitGemm): implicit_gemm_problem_size(ConvolutionOperator)  
+*/
+
+#pragma once
+
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+
+namespace cutlass {
+namespace conv {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Problem size structure
+struct Conv3dProblemSize : public Conv2dProblemSize {
+  //
+  // Type definitions
+  //
+
+  // 3D coordinate for padding, stride, and dilation in (d, h, w) dimensions
+  using Coord3D = Coord<3>;
+
+  //
+  // Data members
+  //
+
+  // Conv3d strictly problem size parameters
+  int D, T, Z;    // input depth, filter depth, output depth
+  int pad_d;      // padding in depth dimension
+  int stride_d;   // stride in depth dimension
+  int dilation_d; // dilation in depth dimension
+
+  //
+  // Methods
+  //
+public:
+  CUTLASS_HOST_DEVICE
+  Conv3dProblemSize(): 
+    Conv2dProblemSize(),
+    D(0), T(0), Z(0), 
+    pad_d(0),
+    stride_d(1), 
+    dilation_d(1) { }
+ 
+  /// Constructor for default padding, stride, dilation, and split-K
+  CUTLASS_HOST_DEVICE
+  Conv3dProblemSize(
+    int N,
+    int D,
+    int H,
+    int W,
+    int C,
+    int Z,
+    int P,
+    int Q,
+    int K,
+    int T,
+    int R,
+    int S,
+    Mode mode
+  ):
+    Conv2dProblemSize(N, H, W, C, P, Q, K, R, S, mode),
+    D(D), T(T), Z(Z), 
+    pad_d(T / 2), stride_d(1), dilation_d(1) { }
+
+  /// Constructor
+  CUTLASS_HOST_DEVICE
+  Conv3dProblemSize(
+    int N,
+    int D,
+    int H,
+    int W,
+    int C,
+    int K,
+    int T,
+    int R,
+    int S,
+    int Z,
+    int P,
+    int Q,
+    int pad_d,
+    int pad_h,
+    int pad_w,
+    int stride_d,
+    int stride_h,
+    int stride_w,
+    int dilation_d,
+    int dilation_h,
+    int dilation_w,
+    Mode mode,
+    int split_k_slices = 1,
+    int groups = 1
+  ):
+    Conv2dProblemSize(
+    N, H, W, C, K, R, S, P, Q, 
+    pad_h, pad_w, 
+    stride_h, stride_w, 
+    dilation_h, dilation_w,
+    mode, split_k_slices, groups),
+    D(D), T(T), Z(Z), 
+    pad_d(pad_d), stride_d(stride_d), dilation_d(dilation_d) { }
+
+  /// Constructs convolution problem size from cutlass Tensor5DCoord and Coord3D 
+  // set *user-defined* output size and sets Z, P, and Q (include all data members in ctor)
+  CUTLASS_HOST_DEVICE
+  Conv3dProblemSize(
+    cutlass::Tensor5DCoord input_size,    // NDHWC
+    cutlass::Tensor5DCoord filter_size,   // KTRSC
+    Coord3D padding,                      // pad_d, pad_h, pad_w
+    Coord3D stride,                       // stride_d, stride_h, stride_w
+    Coord3D dilation,                     // dilation_d, dilation_h, dilation_w
+    cutlass::Tensor5DCoord output_size,   // NZPQK
+    cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation,
+    int split_k_slices = 1,
+    int groups = 1
+  ):
+    Conv2dProblemSize(
+      {input_size.n(), input_size.h(), input_size.w(), input_size.c()},
+      {filter_size.n(), filter_size.h(), filter_size.w(), filter_size.c()},
+      {padding[1], padding[1], padding[2], padding[2]},
+      {stride[1], stride[2]},
+      {dilation[1], dilation[2]},
+      {output_size.n(), output_size.h(), output_size.w(), output_size.c()},
+      mode, split_k_slices, groups),
+    D(input_size.d()), T(filter_size.d()), Z(output_size.d()),
+    pad_d(padding[0]), stride_d(stride[0]), dilation_d(dilation[0]) { }
+
+  /// Constructs convolution problem size from cutlass Tensor5DCoord and Coord3D 
+  // *computes* output size and sets Z, P and Q (include all data members in ctor)
+  CUTLASS_HOST_DEVICE
+  Conv3dProblemSize(
+    cutlass::Tensor5DCoord input_size,    // NDHWC
+    cutlass::Tensor5DCoord filter_size,   // KTRSC
+    Coord3D padding,                      // pad_d, pad_h, pad_w
+    Coord3D stride,                       // stride_d, stride_h, stride_w
+    Coord3D dilation,                     // dilation_d, dilation_h, dilation_w
+    cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation,
+    int split_k_slices = 1,
+    int groups = 1
+  ):
+    Conv2dProblemSize(
+      {input_size.n(), input_size.h(), input_size.w(), input_size.c()},
+      {filter_size.n(), filter_size.h(), filter_size.w(), filter_size.c()},
+      {padding[1], padding[1], padding[2], padding[2]},
+      {stride[1], stride[2]},
+      {dilation[1], dilation[2]},
+      mode, split_k_slices, groups),
+    D(input_size.d()), T(filter_size.d()),
+    pad_d(padding[0]), stride_d(stride[0]), dilation_d(dilation[0])
+    {
+      // set output Z
+      Z = ((D + pad_d * 2 - T * dilation_d) / stride_d) + 1;
+    }
+
+  /// Constructs a convolution problem size from Tensor5DCoord extents and a pair of Coord3D paddings.
+  // Computes output depth Z here; the H/W arguments are forwarded to the Conv2dProblemSize base constructor.
+  CUTLASS_HOST_DEVICE
+  Conv3dProblemSize(
+    cutlass::Tensor5DCoord input_size,    // NDHWC
+    cutlass::Tensor5DCoord filter_size,   // KTRSC
+    CUTLASS_STL_NAMESPACE::tuple<Coord3D, Coord3D> padding, // Coord3D {pad_d, pad_h, pad_w} & Coord3D {far pad_d, pad_h, pad_w} to calculate z/p/q
+    Coord3D stride,                       // stride_d, stride_h, stride_w
+    Coord3D dilation,                     // dilation_d, dilation_h, dilation_w
+    cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation,
+    int split_k_slices = 1,
+    int groups = 1
+  ):
+    Conv2dProblemSize(
+      {input_size.n(), input_size.h(), input_size.w(), input_size.c()},
+      {filter_size.n(), filter_size.h(), filter_size.w(), filter_size.c()},
+      {CUTLASS_STL_NAMESPACE::get<0>(padding)[1], CUTLASS_STL_NAMESPACE::get<1>(padding)[1],
+       CUTLASS_STL_NAMESPACE::get<0>(padding)[2], CUTLASS_STL_NAMESPACE::get<1>(padding)[2]},
+      {stride[1], stride[2]},
+      {dilation[1], dilation[2]},
+      mode, split_k_slices, groups),
+    D(input_size.d()), T(filter_size.d()),
+    pad_d(CUTLASS_STL_NAMESPACE::get<0>(padding)[0]), stride_d(stride[0]), dilation_d(dilation[0])
+    {
+      // set output Z from near + far depth padding. NOTE(review): subtracts T * dilation_d, not (T - 1) * dilation_d + 1; these differ when dilation_d > 1 - confirm intended
+      Z = ((D + pad_d + CUTLASS_STL_NAMESPACE::get<1>(padding)[0] - T * dilation_d) / stride_d) + 1;
+    }
+
+  /// Equality operator (ignores mode and split_k_slice)
+  CUTLASS_HOST_DEVICE
+  bool operator==(Conv3dProblemSize const &conv) const {
+    return (
+      (N == conv.N) && (D == conv.D) && (H == conv.H) && (W == conv.W) && (C == conv.C) &&
+      (K == conv.K) && (T == conv.T) && (R == conv.R) && (S == conv.S) &&
+      (Z == conv.Z) &&(P == conv.P) && (Q == conv.Q) &&
+      (pad_d == conv.pad_d) && (pad_h == conv.pad_h) && (pad_w == conv.pad_w) &&
+      (stride_d == conv.stride_d) && (stride_h == conv.stride_h) && (stride_w == conv.stride_w) &&
+      (dilation_d == conv.dilation_d) && (dilation_h == conv.dilation_h) && (dilation_w == conv.dilation_w)
+    );  
+  }
+
+  /// Inequality operator
+  CUTLASS_HOST_DEVICE
+  bool operator!=(Conv3dProblemSize const &rhs) const {
+    return !(*this == rhs);
+  }
+
+  // Returns a copy of this problem with the convolution mode replaced by mode_; *this is not modified
+  CUTLASS_HOST_DEVICE
+  Conv3dProblemSize reset_mode(cutlass::conv::Mode mode_) {
+    Conv3dProblemSize tmp(*this);
+    tmp.mode = mode_; 
+    return tmp; 
+  }
+
+  // Returns a copy of this problem with split_k_slices replaced by split_k_slices_; *this is not modified
+  CUTLASS_HOST_DEVICE
+  Conv3dProblemSize reset_split_k_slices(int split_k_slices_) {
+    Conv3dProblemSize tmp(*this);
+    tmp.split_k_slices = split_k_slices_; 
+    return tmp; 
+  }
+  
+  /// Returns activation extent as Tensor5DCoord
+  CUTLASS_HOST_DEVICE
+  cutlass::Tensor5DCoord activation_extent() const {
+
+    return cutlass::Tensor5DCoord ({N, D, H, W, C});
+  }
+
+  /// Returns filter extent as Tensor5DCoord
+  CUTLASS_HOST_DEVICE
+  cutlass::Tensor5DCoord filter_extent(bool is_deconv = false) const {
+
+    return is_deconv ? cutlass::Tensor5DCoord ({C, T, R, S, K})
+        : cutlass::Tensor5DCoord ({K, T, R, S, C});
+  }
+
+  /// Returns output extent as Tensor5DCoord
+  CUTLASS_HOST_DEVICE
+  cutlass::Tensor5DCoord output_extent() const {
+
+    return cutlass::Tensor5DCoord ({N, Z, P, Q, K});
+  }
+
+  /// Returns activation size (N * D * H * W * C) in number of elements
+  CUTLASS_HOST_DEVICE
+  int64_t activation_size() const {
+    // Widen the first factor so every multiplication is done in 64 bits instead of overflowing in a narrower type.
+    return static_cast<int64_t>(N) * D * H * W * C;
+  }
+
+  /// Returns filter size (K * T * R * S * C) in number of elements
+  CUTLASS_HOST_DEVICE
+  int64_t filter_size() const {
+    // Widen the first factor so every multiplication is done in 64 bits instead of overflowing in a narrower type.
+    return static_cast<int64_t>(K) * T * R * S * C;
+  }
+
+  /// Returns output size (N * Z * P * Q * K) in number of elements
+  CUTLASS_HOST_DEVICE
+  int64_t output_size() const {
+    // Widen the first factor so every multiplication is done in 64 bits instead of overflowing in a narrower type.
+    return static_cast<int64_t>(N) * Z * P * Q * K;
+  }
+
+  /// Returns padding as Coord3D
+  CUTLASS_HOST_DEVICE
+  Coord3D padding() const {
+
+    return Coord3D ({pad_d, pad_h, pad_w});
+  }
+
+  /// Returns stride as Coord3D {stride_d, stride_h, stride_w}
+  CUTLASS_HOST_DEVICE
+  Coord3D stride() const {
+
+    return Coord3D ({stride_d, stride_h, stride_w});
+  }
+
+  /// Returns dilation as Coord3D {dilation_d, dilation_h, dilation_w}
+  CUTLASS_HOST_DEVICE
+  Coord3D dilation() const {
+
+    return Coord3D ({dilation_d, dilation_h, dilation_w});
+  }
+
+};
+
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//                                  ImplicitGemm helper functions                                 //
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Determine the problem size of the implicit GEMM operation
+CUTLASS_HOST_DEVICE
+cutlass::gemm::GemmCoord implicit_gemm_problem_size(
+  Operator conv_operator, 
+  Conv3dProblemSize const &problem_size) {
+  // Compute problem size
+  switch (conv_operator) {
+  case Operator::kFprop:
+    return gemm::GemmCoord(
+      problem_size.N * problem_size.Z * problem_size.P * problem_size.Q,
+      problem_size.K,
+      problem_size.T * problem_size.R * problem_size.S * problem_size.C
+    );
+  case Operator::kDeconv:
+  case Operator::kDgrad:
+    return gemm::GemmCoord(
+      problem_size.N * problem_size.D * problem_size.H * problem_size.W,
+      problem_size.C,
+      problem_size.T * problem_size.R * problem_size.S * problem_size.K
+    );
+  case Operator::kWgrad:
+    return gemm::GemmCoord(
+      problem_size.K,
+      problem_size.T * problem_size.R * problem_size.S * problem_size.C,
+      problem_size.N * problem_size.Z * problem_size.P * problem_size.Q
+    );
+  default:
+    break;
+  }
+  return gemm::GemmCoord();
+}
+
+// Determine the number of gemm_k iterations for a conv3d problem using the implicit gemm algorithm
+CUTLASS_HOST_DEVICE
+int implicit_gemm_k_iterations(
+  Operator conv_operator, 
+  int threadblock_K, 
+  Conv3dProblemSize const &problem_size,
+  IteratorAlgorithm algorithm = IteratorAlgorithm::kAnalytic,
+  GroupMode group_mode = GroupMode::kNone,
+  int threadblock_N = 0) {
+
+  int iterations = 0;
+  int elements_per_split_k_slice = 0;
+  if (group_mode == GroupMode::kNone) {
+    switch (conv_operator) {
+      case Operator::kFprop:
+        elements_per_split_k_slice = (problem_size.C + problem_size.split_k_slices - 1) / problem_size.split_k_slices;
+        iterations = problem_size.T * problem_size.R * problem_size.S * ((elements_per_split_k_slice + threadblock_K - 1) / threadblock_K);
+        break;
+
+      case Operator::kDeconv:
+      case Operator::kDgrad:
+        elements_per_split_k_slice =  (problem_size.K + problem_size.split_k_slices - 1) / problem_size.split_k_slices;
+        iterations = problem_size.T * problem_size.R * problem_size.S * ((elements_per_split_k_slice + threadblock_K - 1) / threadblock_K);
+        break;
+    
+      case Operator::kWgrad:
+        elements_per_split_k_slice = (problem_size.N * problem_size.Z * problem_size.P * problem_size.Q + problem_size.split_k_slices - 1) / problem_size.split_k_slices;
+        iterations = (elements_per_split_k_slice + threadblock_K - 1) / threadblock_K;
+        break;
+    
+      default:
+        break;
+    }
+  } else if (group_mode == GroupMode::kDepthwise) {
+    int channels_per_cta = threadblock_N;
+
+    if (algorithm == IteratorAlgorithm::kAnalytic) {
+      switch (conv_operator) {
+        case Operator::kFprop:
+          iterations = problem_size.T * problem_size.R * problem_size.S *
+                       ((channels_per_cta + threadblock_K - 1) / threadblock_K);
+          break;
+
+        default:
+          break;
+      }
+    }
+  }
+
+  return iterations;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//  Mapping function (ImplicitGemm A, B, C -> Conv Activation, Filter, Output)
+////////////////////////////////////////////////////////////////////////////////
+/// Returns ImplicitGemm tensor A extent as Tensor5DCoord
+CUTLASS_HOST_DEVICE
+cutlass::Tensor5DCoord implicit_gemm_tensor_a_extent(
+  Operator conv_operator,
+  Conv3dProblemSize const &problem_size) {
+  switch (conv_operator) {
+    case cutlass::conv::Operator::kFprop: return problem_size.activation_extent();
+    case cutlass::conv::Operator::kDeconv:
+    case cutlass::conv::Operator::kDgrad: return problem_size.output_extent();
+    case cutlass::conv::Operator::kWgrad: return problem_size.output_extent();
+    default : break;
+  }
+  return cutlass::Tensor5DCoord();
+}
+
+/// Returns ImplicitGemm tensor B extent as Tensor5DCoord
+CUTLASS_HOST_DEVICE
+cutlass::Tensor5DCoord implicit_gemm_tensor_b_extent(
+  Operator conv_operator,
+  Conv3dProblemSize const &problem_size) {
+  switch (conv_operator) {
+    case cutlass::conv::Operator::kFprop: return problem_size.filter_extent();
+    case cutlass::conv::Operator::kDeconv: return problem_size.filter_extent(true);
+    case cutlass::conv::Operator::kDgrad: return problem_size.filter_extent();
+    case cutlass::conv::Operator::kWgrad: return problem_size.activation_extent();
+    default : break;
+  }
+  return cutlass::Tensor5DCoord();
+}
+
+/// Returns ImplicitGemm tensor C extent as Tensor5DCoord
+CUTLASS_HOST_DEVICE
+cutlass::Tensor5DCoord implicit_gemm_tensor_c_extent(
+  Operator conv_operator,
+  Conv3dProblemSize const &problem_size) {
+  switch (conv_operator) {
+    case cutlass::conv::Operator::kFprop: return problem_size.output_extent();
+    case cutlass::conv::Operator::kDeconv:
+    case cutlass::conv::Operator::kDgrad: return problem_size.activation_extent();
+    case cutlass::conv::Operator::kWgrad: return problem_size.filter_extent();
+    default : break;
+  }
+  return cutlass::Tensor5DCoord();
+}
+
+/// Returns ImplicitGemm tensor A size in number of elements
+CUTLASS_HOST_DEVICE
+int64_t implicit_gemm_tensor_a_size(
+  Operator conv_operator,
+  Conv3dProblemSize const &problem_size) {
+  switch (conv_operator) {
+    case cutlass::conv::Operator::kFprop: return problem_size.activation_size();
+    case cutlass::conv::Operator::kDeconv:
+    case cutlass::conv::Operator::kDgrad: return problem_size.output_size();
+    case cutlass::conv::Operator::kWgrad: return problem_size.output_size();
+    default : break;
+  }
+  return 0;
+}
+
+/// Returns ImplicitGemm tensor B size in number of elements
+CUTLASS_HOST_DEVICE
+int64_t implicit_gemm_tensor_b_size(
+  Operator conv_operator,
+  Conv3dProblemSize const &problem_size) {
+  switch (conv_operator) {
+    case cutlass::conv::Operator::kFprop: return problem_size.filter_size();
+    case cutlass::conv::Operator::kDeconv:
+    case cutlass::conv::Operator::kDgrad: return problem_size.filter_size();
+    case cutlass::conv::Operator::kWgrad: return problem_size.activation_size();
+    default : break;
+  }
+  return 0;
+}
+
+/// Returns ImplicitGemm tensor C size in number of elements
+CUTLASS_HOST_DEVICE
+int64_t implicit_gemm_tensor_c_size(
+  Operator conv_operator,
+  Conv3dProblemSize const &problem_size) {
+  switch (conv_operator) {
+    case cutlass::conv::Operator::kFprop: return problem_size.output_size();
+    case cutlass::conv::Operator::kDeconv:
+    case cutlass::conv::Operator::kDgrad: return problem_size.activation_size();
+    case cutlass::conv::Operator::kWgrad: return problem_size.filter_size();
+    default : break;
+  }
+  return 0;
+}
+
+} // namespace conv
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/convnd_problem_shape.hpp b/lightllm-kernel/cutlass/include/cutlass/conv/convnd_problem_shape.hpp
new file mode 100755
index 000000000..ffcc547fb
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/convnd_problem_shape.hpp
@@ -0,0 +1,561 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief This file contains definitions and utility functions for describing convolution problem shapes.
+*/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/tensor_coord.h"
+#include "cutlass/conv/convolution.h"
+
+#include "cute/container/array.hpp"
+
+#if ! defined(__CUDACC_RTC__)
+#include <initializer_list>
+#endif
+
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::conv {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Implements the user facing argument for all CUTLASS 3.x convolutions in a rank agnostic fashion.
+// All tensors are flat and by default treated as layout right (NDHWC, KTRSC, NZPQK)
+// Supports asymmetric padding, traversal strides, dilations, and all conv algorithm types.
+template <
+  conv::Operator ConvOp_,
+  int NumSpatialDimensions_
+>
+struct ConvProblemShape {
+  //
+  // Alias types for members
+  //
+
+  static constexpr int RankS = NumSpatialDimensions_;
+  static constexpr int RankT = NumSpatialDimensions_ + 2;
+  static constexpr conv::Operator ConvOp = ConvOp_;
+  static constexpr int NumSpatialDimensions = NumSpatialDimensions_;
+  using SpatialExtent = cute::array<int, RankS>;
+  using TensorExtent  = cute::array<int, RankT>;
+  using TensorStride  = cute::array<int64_t, RankT>;
+  using ShapePadding = SpatialExtent;
+  using TraversalStride = SpatialExtent;
+  using ShapeDilation = SpatialExtent;
+  using Corner = SpatialExtent;
+
+  //
+  // Members
+  //
+  cutlass::conv::Mode mode{};
+  TensorExtent shape_A{};
+  TensorStride stride_A{};
+  TensorExtent shape_B{};
+  TensorStride stride_B{};
+  TensorExtent shape_C{};
+  TensorStride stride_C{};
+
+  // asymmetric padding, both upper and lower padding must be >= 0
+  ShapePadding lower_padding{};
+  ShapePadding upper_padding{};
+  TraversalStride traversal_stride{};
+  ShapeDilation dilation{};
+  int groups = 1;
+
+  //
+  // Methods
+  //
+
+  ConvProblemShape() = default;
+
+  // Constructor accepts user facing arguments, derives the transformed activation extent/stride, and stores the A/B/C shapes and strides
+  ConvProblemShape(
+      conv::Mode mode,                                                     // convolution/cross-correlation
+      TensorExtent shape_act,                                              // [n,d,h,w,c]
+      TensorStride stride_act,                                             // [n,d,h,w,c]
+      TensorExtent shape_flt,                                              // [k,t,r,s,c]
+      TensorStride stride_flt,                                             // [k,t,r,s,c]
+      ShapePadding lower_padding,                                          // [pad_d, pad_h, pad_w]
+      ShapePadding upper_padding,                                          // [pad_d, pad_h, pad_w]
+      TraversalStride tstride,                                             // [stride_d, stride_h, stride_w]
+      ShapeDilation dilation,                                              // [dilation_d, dilation_h, dilation_w]
+      int groups)
+      : mode(mode)
+      , lower_padding(lower_padding)
+      , upper_padding(upper_padding)
+      , traversal_stride(tstride)
+      , dilation(dilation)
+      , groups(groups) {
+
+    auto [shape_xformed_act, stride_xformed_act] = calculate_xformed_act(shape_act, shape_flt);
+    set_shape_stride_ABC(shape_act, stride_act, shape_flt, stride_flt, shape_xformed_act, stride_xformed_act);
+  }
+
+  // Allow user input of xformed activation stride to support non-packed strides.
+  ConvProblemShape(
+      conv::Mode mode,                                                     // convolution/cross-correlation
+      TensorExtent shape_act,                                              // [n,d,h,w,c]
+      TensorStride stride_act,                                             // [n,d,h,w,c]
+      TensorExtent shape_flt,                                              // [k,t,r,s,c]
+      TensorStride stride_flt,                                             // [k,t,r,s,c]
+      TensorStride stride_xformed_act,                                     // [n,z,p,q,k]
+      ShapePadding lower_padding,                                          // [pad_d, pad_h, pad_w]
+      ShapePadding upper_padding,                                          // [pad_d, pad_h, pad_w]
+      TraversalStride tstride,                                             // [stride_d, stride_h, stride_w]
+      ShapeDilation dilation,                                              // [dilation_d, dilation_h, dilation_w]
+      int groups)
+      : mode(mode)
+      , lower_padding(lower_padding)
+      , upper_padding(upper_padding)
+      , traversal_stride(tstride)
+      , dilation(dilation)
+      , groups(groups) {
+
+    CUTLASS_ASSERT(stride_act[RankT - 1] == 1);
+    CUTLASS_ASSERT(stride_flt[RankT - 1] == 1);
+    CUTLASS_ASSERT(stride_xformed_act[RankT - 1] == 1);
+
+    auto stride_act_packed = packed_stride_right_major(shape_act);
+    auto stride_flt_packed = packed_stride_right_major(shape_flt);
+    auto [shape_xformed_act, stride_xformed_act_packed] = calculate_xformed_act(shape_act, shape_flt);
+
+    CUTLASS_PRAGMA_UNROLL
+    for(int i = 0; i < RankT - 1; ++i) {
+      CUTLASS_ASSERT(stride_act[i] >= stride_act_packed[i]);
+      CUTLASS_ASSERT(stride_flt[i] >= stride_flt_packed[i]);
+      CUTLASS_ASSERT(stride_xformed_act[i] >= stride_xformed_act_packed[i]);
+    }
+
+    set_shape_stride_ABC(shape_act, stride_act, shape_flt, stride_flt, shape_xformed_act, stride_xformed_act);
+  }
+
+  // Constructor accepts user facing arguments and presumes packed tensor strides in canonical (CWHDN) order.
+  ConvProblemShape(
+      conv::Mode mode,
+      TensorExtent shape_act,
+      TensorExtent shape_flt,
+      ShapePadding lower_padding,
+      ShapePadding upper_padding,
+      TraversalStride tstride,
+      ShapeDilation dilation,
+      int groups)
+      : ConvProblemShape(
+        mode,
+        shape_act,
+        packed_stride_right_major(shape_act),
+        shape_flt,
+        packed_stride_right_major(shape_flt),
+        lower_padding,
+        upper_padding,
+        tstride,
+        dilation,
+        groups) {
+    }
+
+#if ! defined(__CUDACC_RTC__)
+  // Constructor accepts user facing arguments, derives the transformed activation extent/stride, and stores the A/B/C shapes and strides
+  ConvProblemShape(
+      conv::Mode                     mode,
+      std::initializer_list<int>     shape_act_,
+      std::initializer_list<int64_t> stride_act_,
+      std::initializer_list<int>     shape_flt_,
+      std::initializer_list<int64_t> stride_flt_,
+      std::initializer_list<int>     lower_padding_,
+      std::initializer_list<int>     upper_padding_,
+      std::initializer_list<int>     traversal_stride_,
+      std::initializer_list<int>     dilation_,
+      int groups)
+      : mode(mode)
+      , groups(groups) {
+
+    TensorExtent shape_act{};
+    TensorStride stride_act{};
+    TensorExtent shape_flt{};
+    TensorStride stride_flt{};
+
+    assert(shape_act_.size() == shape_act.size());
+    assert(stride_act_.size() == stride_act.size());
+    assert(shape_flt_.size() == shape_flt.size());
+    assert(stride_flt_.size() == stride_flt.size());
+    assert(lower_padding_.size() == lower_padding.size());
+    assert(upper_padding_.size() == upper_padding.size());
+    assert(traversal_stride_.size() == traversal_stride.size());
+    assert(dilation_.size() == dilation.size());
+
+    std::copy(shape_act_.begin(), shape_act_.end(), shape_act.begin());
+    std::copy(stride_act_.begin(), stride_act_.end(), stride_act.begin());
+    std::copy(shape_flt_.begin(), shape_flt_.end(), shape_flt.begin());
+    std::copy(stride_flt_.begin(), stride_flt_.end(), stride_flt.begin());
+    std::copy(lower_padding_.begin(), lower_padding_.end(), lower_padding.begin());
+    std::copy(upper_padding_.begin(), upper_padding_.end(), upper_padding.begin());
+    std::copy(traversal_stride_.begin(), traversal_stride_.end(), traversal_stride.begin());
+    std::copy(dilation_.begin(), dilation_.end(), dilation.begin());
+
+    auto [shape_xformed_act, stride_xformed_act] = calculate_xformed_act(shape_act, shape_flt);
+    set_shape_stride_ABC(shape_act, stride_act, shape_flt, stride_flt, shape_xformed_act, stride_xformed_act);
+  }
+
+  // Allow user input of xformed activation stride to support non-packed strides.
+  ConvProblemShape(
+      conv::Mode                     mode,
+      std::initializer_list<int>     shape_act_,
+      std::initializer_list<int64_t> stride_act_,
+      std::initializer_list<int>     shape_flt_,
+      std::initializer_list<int64_t> stride_flt_,
+      std::initializer_list<int64_t> stride_xformed_act_,
+      std::initializer_list<int>     lower_padding_,
+      std::initializer_list<int>     upper_padding_,
+      std::initializer_list<int>     traversal_stride_,
+      std::initializer_list<int>     dilation_,
+      int groups)
+      : mode(mode)
+      , groups(groups) {
+    TensorExtent shape_act{}, shape_flt{};
+    TensorStride stride_act{}, stride_flt{}, stride_xformed_act{};
+    // Same size checks as the sibling initializer_list constructors: a mis-sized list would make std::copy overrun the fixed-size arrays.
+    assert(shape_act_.size() == shape_act.size() && shape_flt_.size() == shape_flt.size());
+    assert(stride_act_.size() == stride_act.size() && stride_flt_.size() == stride_flt.size());
+    assert(stride_xformed_act_.size() == stride_xformed_act.size());
+    assert(lower_padding_.size() == lower_padding.size() && upper_padding_.size() == upper_padding.size());
+    assert(traversal_stride_.size() == traversal_stride.size() && dilation_.size() == dilation.size());
+
+    std::copy(shape_act_.begin(), shape_act_.end(), shape_act.begin());
+    std::copy(stride_act_.begin(), stride_act_.end(), stride_act.begin());
+    std::copy(shape_flt_.begin(), shape_flt_.end(), shape_flt.begin());
+    std::copy(stride_flt_.begin(), stride_flt_.end(), stride_flt.begin());
+    std::copy(stride_xformed_act_.begin(), stride_xformed_act_.end(), stride_xformed_act.begin());
+    std::copy(lower_padding_.begin(), lower_padding_.end(), lower_padding.begin());
+    std::copy(upper_padding_.begin(), upper_padding_.end(), upper_padding.begin());
+    std::copy(traversal_stride_.begin(), traversal_stride_.end(), traversal_stride.begin());
+    std::copy(dilation_.begin(), dilation_.end(), dilation.begin());
+
+    CUTLASS_ASSERT(stride_act[RankT - 1] == 1);
+    CUTLASS_ASSERT(stride_flt[RankT - 1] == 1);
+    CUTLASS_ASSERT(stride_xformed_act[RankT - 1] == 1);
+    auto stride_act_packed = packed_stride_right_major(shape_act);
+    auto stride_flt_packed = packed_stride_right_major(shape_flt);
+    auto [shape_xformed_act, stride_xformed_act_packed] = calculate_xformed_act(shape_act, shape_flt);
+    CUTLASS_PRAGMA_UNROLL
+    for(int i = 0; i < RankT - 1; ++i) {
+      CUTLASS_ASSERT(stride_act[i] >= stride_act_packed[i]);
+      CUTLASS_ASSERT(stride_flt[i] >= stride_flt_packed[i]);
+      CUTLASS_ASSERT(stride_xformed_act[i] >= stride_xformed_act_packed[i]);
+    }
+    set_shape_stride_ABC(shape_act, stride_act, shape_flt, stride_flt, shape_xformed_act, stride_xformed_act);
+  }
+
+  // Constructor accepts user facing arguments, presumes packed strides, and stores the derived A/B/C shapes and strides
+  ConvProblemShape(
+      conv::Mode                     mode,
+      std::initializer_list<int>     shape_act_,
+      std::initializer_list<int>     shape_flt_,
+      std::initializer_list<int>     lower_padding_,
+      std::initializer_list<int>     upper_padding_,
+      std::initializer_list<int>     traversal_stride_,
+      std::initializer_list<int>     dilation_,
+      int groups)
+      : mode(mode)
+      , groups(groups) {
+    TensorExtent shape_act{};
+    TensorStride stride_act{};
+    TensorExtent shape_flt{};
+    TensorStride stride_flt{};
+
+    assert(shape_act_.size() == shape_act.size());
+    assert(shape_flt_.size() == shape_flt.size());
+    assert(lower_padding_.size() == lower_padding.size());
+    assert(upper_padding_.size() == upper_padding.size());
+    assert(traversal_stride_.size() == traversal_stride.size());
+    assert(dilation_.size() == dilation.size());
+
+    std::copy(shape_act_.begin(), shape_act_.end(), shape_act.begin());
+    std::copy(shape_flt_.begin(), shape_flt_.end(), shape_flt.begin());
+    std::copy(lower_padding_.begin(), lower_padding_.end(), lower_padding.begin());
+    std::copy(upper_padding_.begin(), upper_padding_.end(), upper_padding.begin());
+    std::copy(traversal_stride_.begin(), traversal_stride_.end(), traversal_stride.begin());
+    std::copy(dilation_.begin(), dilation_.end(), dilation.begin());
+    stride_act = packed_stride_right_major(shape_act);
+    stride_flt = packed_stride_right_major(shape_flt);
+
+    auto [shape_xformed_act, stride_xformed_act] = calculate_xformed_act(shape_act, shape_flt);
+    set_shape_stride_ABC(shape_act, stride_act, shape_flt, stride_flt, shape_xformed_act, stride_xformed_act);
+  }
+#endif // not defined(__CUDACC_RTC__)
+
+  // Set shape and stride of tensor A/B/C according to following table:
+  // |              | Fprop  | Dgrad  | Wgrad |
+  // | ------       | ------ | ------ | ------|
+  // |   ShapeA     | NDHWC  | NZPQK  | NZPQK |
+  // |   ShapeB     | KTRSC  | KTRSC  | NDHWC |
+  // |   ShapeC     | NZPQK  | NDHWC  | KTRSC |
+  //
+  CUTLASS_HOST_DEVICE
+  constexpr void
+  set_shape_stride_ABC(
+    TensorExtent shape_act,
+    TensorStride stride_act,
+    TensorExtent shape_flt,
+    TensorStride stride_flt,
+    TensorExtent shape_xformed_act,
+    TensorStride stride_xformed_act) {
+
+    if constexpr (ConvOp == cutlass::conv::Operator::kFprop) {
+      shape_A = shape_act;
+      stride_A = stride_act;
+      shape_B = shape_flt;
+      stride_B = stride_flt;
+      shape_C = shape_xformed_act;
+      stride_C = stride_xformed_act;
+    }
+    else if constexpr (ConvOp == cutlass::conv::Operator::kDgrad) {
+      shape_A = shape_xformed_act;
+      stride_A = stride_xformed_act;
+      shape_B = shape_flt;
+      stride_B = stride_flt;
+      shape_C = shape_act;
+      stride_C = stride_act;
+    }
+    else if constexpr (ConvOp == cutlass::conv::Operator::kWgrad) {
+      shape_A = shape_xformed_act;
+      stride_A = stride_xformed_act;
+      shape_B = shape_act;
+      stride_B = stride_act;
+      shape_C = shape_flt;
+      stride_C = stride_flt;
+    }
+  }
+
+  // Get A extents.
+  // fprop: A extents array contains [N,D,H,W,C]. Turn that into ((W,H,D,N), (C))
+  // dgrad: A extents array contains [N,Z,P,Q,K]. Turn that into ((Q,P,Z,N), (K))
+  // wgrad: A extents array contains [N,Z,P,Q,K]. Turn that into ((K), (Q,P,Z,N))
+  CUTLASS_HOST_DEVICE
+  constexpr auto
+  get_shape_A() const {
+    using cute::make_shape;
+    using cute::take;
+
+    if constexpr (ConvOp == conv::Operator::kFprop ||
+                  ConvOp == conv::Operator::kDgrad) {
+      return make_shape(
+        cute::reverse(take<0, RankT - 1>(shape_A)),
+        shape_A[RankT - 1]);
+    }
+    // For wgrad kernel, we need to linearize NZPQ for tensor A
+    else if constexpr (ConvOp == conv::Operator::kWgrad) {
+      return make_shape(
+        shape_A[RankT - 1],
+        cute::product(take<0, RankT - 1>(shape_A)));
+    }
+  }
+
+  // Get B extents.
+  // fprop: B extents array contains [K,T,R,S,C]. Turn that into ((K), (C,S,R,T))
+  // dgrad: B extents array contains [K,T,R,S,C]. Turn that into ((C), (K,S,R,T))
+  // wgrad: B extents array contains [N,D,H,W,C]. Turn that into ((C), (W,H,D,N))
+  CUTLASS_HOST_DEVICE
+  constexpr auto
+  get_shape_B() const {
+    using cute::make_shape;
+    using cute::reverse;
+    using cute::take;
+
+    if constexpr (ConvOp == conv::Operator::kFprop) {
+      return make_shape(
+        shape_B[0],
+        reverse(take<1, RankT>(shape_B)));
+    }
+    else if constexpr (ConvOp == conv::Operator::kWgrad) {
+      return make_shape(
+        shape_B[RankT - 1],
+        reverse(take<0, RankT - 1>(shape_B)));
+    }
+    else if constexpr (ConvOp == conv::Operator::kDgrad) {
+      // shape_B: [K,T,R,S,C], return: [(C),(K,S,R,T)]
+      return make_shape(
+        shape_B[RankT - 1],
+        cute::insert<0>(
+          reverse(take<1, RankT - 1>(shape_B)),
+          shape_B[0]));
+    }
+  }
+
+  // Get C extents.
+  // fprop: C extents array contains [N,Z,P,Q,K]. Turn that into ((Q,P,Z,N), (K))
+  // dgrad: C extents array contains [N,D,H,W,C]. Turn that into ((W,H,D,N), (C))
+  // wgrad: C extents array contains [K,T,R,S,C]. Turn that into ((K), (C,S,R,T))
+  CUTLASS_HOST_DEVICE
+  constexpr auto
+  get_shape_C() const {
+    using cute::make_shape;
+    using cute::reverse;
+    using cute::take;
+
+    if constexpr (ConvOp == conv::Operator::kFprop ||
+                  ConvOp == conv::Operator::kDgrad) {
+      return make_shape(
+        reverse(take<0, RankT - 1>(shape_C)),
+        shape_C[RankT - 1]);
+    }
+    else if constexpr (ConvOp == conv::Operator::kWgrad) {
+      return make_shape(
+        shape_C[0],
+        reverse(take<1, RankT>(shape_C)));
+    }
+  }
+
+  // Static method that returns the canonical strides of tensors (layouts are right major and compact)
+  CUTLASS_HOST_DEVICE
+  static constexpr TensorStride
+  packed_stride_right_major(TensorExtent const& extents) {
+    TensorStride strides{};
+    strides[RankT-1] = 1;
+    cute::for_each(cute::make_rseq<RankT-1>{}, [&](auto i) {
+      strides[i] = extents[i+1] * strides[i+1];
+    });
+    return strides;
+  }
+
+  // Static method that returns the packed logical size of any TensorExtent
+  CUTLASS_HOST_DEVICE
+  static constexpr size_t
+  size(TensorExtent const& extents) {
+    size_t size = 1;
+    cute::for_each(cute::make_seq<RankT>{}, [&](auto i) {
+      size *= extents[i];
+    });
+    return size;
+  }
+
+  CUTLASS_HOST_DEVICE
+  constexpr size_t
+  size_A() const {
+    return shape_A[0] * stride_A[0];
+  }
+
+  CUTLASS_HOST_DEVICE
+  constexpr size_t
+  size_B() const {
+    return shape_B[0] * stride_B[0];
+  }
+
+  CUTLASS_HOST_DEVICE
+  constexpr size_t
+  size_C() const {
+    return shape_C[0] * stride_C[0];
+  }
+
+  // Equality operator
+  CUTLASS_HOST_DEVICE
+  bool operator==(ConvProblemShape<ConvOp, NumSpatialDimensions> const& rhs) const {
+    using cute::for_each;
+    using cute::make_seq;
+
+    bool is_equal = true;
+
+    // Compare all tensor extents
+    for_each(make_seq<RankT>{}, [&](auto i) {
+      is_equal = is_equal
+          && (shape_A[i] == rhs.shape_A[i])
+          && (shape_B[i] == rhs.shape_B[i]);
+    });
+
+    // Compare all spatial extents
+    for_each(make_seq<RankS>{}, [&](auto i) {
+      is_equal = is_equal
+          && (lower_padding[i] == rhs.lower_padding[i])
+          && (upper_padding[i] == rhs.upper_padding[i])
+          && (traversal_stride[i] == rhs.traversal_stride[i])
+          && (dilation[i] == rhs.dilation[i]);
+    });
+
+    return is_equal;
+  }
+
+  /// Inequality operator
+  CUTLASS_HOST_DEVICE
+  bool operator!=(ConvProblemShape<ConvOp, NumSpatialDimensions> const &rhs) const {
+    return !(*this == rhs);
+  }
+
+private:
+  CUTLASS_HOST_DEVICE
+  constexpr auto
+  calculate_xformed_act(TensorExtent shape_act, TensorExtent shape_flt) {
+    TensorExtent shape_xformed_act{};
+    // calculate n,z,p,q,k.
+    // a helper lambda to compute a single spatial extent of the nzpqk tensor
+    auto nzpqk_extent = [](int act_ext, int filter_ext, int pad_total, int dilation, int tstride) {
+      return 1 + (act_ext + pad_total - ((filter_ext -1) * dilation + 1)) / tstride;
+    };
+
+    shape_xformed_act[0] = shape_act[0]; // Activation N extent
+    cute::for_each(cute::make_seq<RankS>{}, [&](auto i) {
+      shape_xformed_act[i+1] = nzpqk_extent(
+          shape_act[i+1], shape_flt[i+1], upper_padding[i] + lower_padding[i], dilation[i], traversal_stride[i]);
+      });
+    shape_xformed_act[RankT-1] = shape_flt[0]; // Filter K extent
+
+    TensorStride stride_xformed_act = packed_stride_right_major(shape_xformed_act);
+
+    return cute::make_tuple(shape_xformed_act, stride_xformed_act);
+  }
+};
+
+template<
+  conv::Operator ConvOp,
+  int SpatialDim
+>
+void print(ConvProblemShape<ConvOp, SpatialDim> const& problem) {
+  printf("ConvProblemShape with %d spatial dimensions implementing cutlass::conv::Operator::%d\n",
+      SpatialDim, int(ConvOp));
+  printf("\tTensorA: ");
+      cute::print(problem.shape_A); printf(":");
+      cute::print(problem.stride_A); printf("\n");
+  printf("\tTensorB: ");
+      cute::print(problem.shape_B); printf(":");
+      cute::print(problem.stride_B); printf("\n");
+  printf("\tTensorC: ");
+      cute::print(problem.shape_C); printf(":");
+      cute::print(problem.stride_C); printf("\n");
+  printf("\tLower padding:     "); print(problem.lower_padding);       printf("\n");
+  printf("\tUpper padding:     "); print(problem.upper_padding);       printf("\n");
+  printf("\tTraversal strides: "); print(problem.traversal_stride);    printf("\n");
+  printf("\tDilation:          "); print(problem.dilation);            printf("\n");
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::conv
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/convolution.h b/lightllm-kernel/cutlass/include/cutlass/conv/convolution.h
new file mode 100755
index 000000000..243ee269d
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/convolution.h
@@ -0,0 +1,194 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief
+
+This file contains definitions and utility functions for describing convolution problem sizes in terms of
+activation (NHWC), filter (KRSC), output (NPQK), padding (pad_h, pad_w), stride (stride_h, stride_w), and
+dilation (dilation_h, dilation_w).  Furthermore, it defines helper functions to map CUTLASS's implicit gemm
+tensor extents, sizes, and data types to that of the convolution's extents, sizes, and data types.
+
+                        * Mapping convolutions to Gemm computation *
+
+Cutlass implements convolutions with the Implicit Gemm algorithm.  This algorithm performs a gemm
+(general matrix-matrix multiply) on the convolution tensors Activation, Filter, and Output.
+The underlying gemm operation follows the standard gemm definition:
+
+                                     C = A * B + C
+
+                               A and B are input matrices
+                            C is source and output matrix
+
+
+For the three convolutional operators (Fprop, Dgrad, Wgrad), ImplicitGemm matrices A, B, and C are mapped
+to convolution tensors Activation, Filter and Output as described in the table below.
+
+        ___________________________________________________________________________
+         ConvolutionalOperator |        A        |      B         |       C
+        ___________________________________________________________________________
+        |                      |                 |                |               |
+        |       Fprop          |    Activation   |    Filter      |     Output    |
+        |       Dgrad          |     Output      |    Filter      |   Activation  |
+        |       Wgrad          |     Output      |  Activation    |     Filter    |
+        ___________________________________________________________________________
+
+In the convolution codebase, DO NOT mix (A, B, C) naming with (Activation, Filter, Output) naming.
+
+For example, it's confusing and error prone to document a convolution class or function
+as operating on "A, B, Output."  Instead, use the mapping functions below,
+and adhere to using either A, B, C or Activation, Filter, Output.
+
+Map elements' data types (ImplicitGemm -> Conv): GemmToConvElementMap
+Map elements' data types (Conv -> ImplicitGemm): ConvToGemmElementMap
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/tensor_coord.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/gemm/gemm_enumerated_types.h"
+#include "cutlass/matrix_coord.h"
+
+namespace cutlass {
+namespace conv {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Convolutional operator
+enum class Operator {
+  kFprop,
+  kDgrad,
+  kWgrad,
+  kDeconv
+};
+
+/// Distinguishes convolution from cross correlation
+enum class Mode {
+  kCrossCorrelation,
+  kConvolution
+};
+
+/// Selects among several implementation variants trading off performance with simplicity
+enum class IteratorAlgorithm {
+  kAnalytic,      ///< functionally correct in all cases but lower performance
+  kOptimized,     ///< optimized for R <= 32, S <= 32 and unity-stride dgrad
+  kFixedChannels, ///< Analytic algorithm optimized for fixed channel count (C == AccessSize)
+  kFewChannels,   ///< Analytic algorithm optimized for few channels (C divisible by AccessSize)
+  kFixedStrideDilation ///< Optimized for fixed stride and dilation
+};
+
+/// Distinguishes among partial specializations that accelerate certain problems where convolution
+/// stride is unit.
+enum class StrideSupport {
+  kStrided,       ///< arbitrary convolution stride
+  kUnity,         ///< unit convolution stride
+  kFixed          ///< fixed convolution stride
+};
+
+/// Identifies split-K mode
+enum class SplitKMode {
+  kNone,
+  kSerial,
+  kParallel
+};
+
+/// Identifies group mode
+enum class GroupMode {
+  kNone,
+  kSingleGroup,   ///< One CTA calculates one group or less
+  kMultipleGroup, ///< One CTA calculates multiple groups
+  kDepthwise      ///< One CTA calculates cta_n groups (problem_size.C == problem_size.K == problem_size.groups)
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Compile-time shape of a rank-4 tensor with extents N, H, W and C (each defaults to 1)
+template <
+  int N = 1,
+  int H = 1,
+  int W = 1,
+  int C = 1
+>
+struct TensorNHWCShape {
+  static int const kN = N;
+  static int const kH = H;
+  static int const kW = W;
+  static int const kC = C;
+
+  static int const kHW = H * W;            ///< product of the H and W extents
+  static int const kNHW = N * kHW;         ///< product of the N, H and W extents
+  static int const kNHWC = N * H * W * C;  ///< product of all four extents
+
+  static int const kCount = kNHWC;         ///< total element count (same value as kNHWC)
+
+  //
+  // Static member functions
+  //
+
+  /// Returns the extents as a Coord<4> ordered (kN, kH, kW, kC)
+  CUTLASS_HOST_DEVICE
+  static Coord<4> toCoord() {
+    return make_Coord(kN, kH, kW, kC);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Compile-time conv2d stride pair (u, v), which controls how the filter convolves around the input volume
+template <
+  /// Stride in horizontal direction
+  int u = 1,
+  /// Stride in vertical direction
+  int v = 1
+>
+struct Stride2D {
+  static int const kU = u;
+  static int const kV = v;
+
+  //
+  // Static member functions
+  //
+
+  /// Returns the strides as a Coord<2> ordered (kU, kV)
+  CUTLASS_HOST_DEVICE
+  static Coord<2> toCoord() {
+    return make_Coord(kU, kV);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace conv
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/detail.hpp b/lightllm-kernel/cutlass/include/cutlass/conv/detail.hpp
new file mode 100755
index 000000000..3e4173569
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/detail.hpp
@@ -0,0 +1,137 @@
+
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/conv/convnd_problem_shape.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::conv::detail {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+  // Tag-dispatched helpers: the cute::true_type overload forwards to T::get_problem_shape_MNKL
+template <typename T, class ProblemShape>
+auto get_problem_shape_MNKL_helper(ProblemShape const& problem_shape, cute::true_type) {
+  return T::get_problem_shape_MNKL(problem_shape);
+}
+
+template <typename T, class ProblemShape>
+ProblemShape get_problem_shape_MNKL_helper(ProblemShape const& problem_shape, cute::false_type) {
+  return problem_shape;  // cute::false_type overload: the problem shape is returned unchanged
+}
+
+// Get problem shape MNKL according to following table:
+// |               |   Fprop   |   Dgrad         |   Wgrad   |
+// |   ----        | --------- | --------        | --------  |
+// |   Shape_M     | (Q,P,Z,N) | (W/V,H/U,D/O,N) | (K)       |
+// |   Shape_N     | (K)       | (C)             | (C,S,R,T) |
+// |   Shape_K     | (C,S,R,T) | (K,S,R,T)       | (Q,P,Z,N) |
+// |   Shape_L     | _1        | (V,U,O)         | _1        |
+
+template <class ProblemShape>
+CUTLASS_HOST_DEVICE
+constexpr auto
+get_transformed_problem_shape_MNKL(ProblemShape const& problem_shape) {
+  return problem_shape;  // generic overload: the problem shape is returned as-is
+}
+
+
+template <conv::Operator ConvOp, int SpatialDim>
+CUTLASS_HOST_DEVICE
+constexpr auto
+get_transformed_problem_shape_MNKL(ConvProblemShape<ConvOp, SpatialDim> const& problem_shape) {
+  using cute::insert;
+  using cute::make_shape;
+  using cute::reverse;
+  using cute::take;
+
+  constexpr int RankT = SpatialDim + 2;  // tensor rank: the spatial modes plus two more
+
+  if constexpr (ConvOp == conv::Operator::kWgrad) {
+    auto M_xformed = problem_shape.shape_C[0];                            // first mode of shape_C
+    auto N_xformed = reverse(take<1, RankT>(problem_shape.shape_C));      // remaining modes of shape_C, reversed
+    auto K_xformed = reverse(take<0, RankT - 1>(problem_shape.shape_A));  // all but the last mode of shape_A, reversed
+    auto L_xformed = cute::Int<1>{};
+
+    return make_shape(M_xformed, N_xformed, K_xformed, L_xformed);
+  }
+  else if constexpr (ConvOp == conv::Operator::kFprop){
+    auto M_xformed = reverse(take<0, RankT - 1>(problem_shape.shape_C));  // all but the last mode of shape_C, reversed
+    auto N_xformed = problem_shape.shape_C[RankT - 1];                    // last mode of shape_C
+    auto K_xformed = reverse(take<1, RankT>(problem_shape.shape_B));      // all but the first mode of shape_B, reversed
+    auto L_xformed = cute::Int<1>{};
+
+    return make_shape(M_xformed, N_xformed, K_xformed, L_xformed);
+  }
+  else if constexpr (ConvOp == conv::Operator::kDgrad) {
+    auto L_xformed = reverse(problem_shape.traversal_stride); // (V,U,O)
+    auto M_xformed = ceil_div(reverse(take<0,RankT - 1>(problem_shape.shape_C)), L_xformed);
+    auto N_xformed = problem_shape.shape_C[RankT - 1];
+    // shape_B: [K,T,R,S,C], K_xformed: [K,S,R,T]
+    auto K_xformed = insert<0>(
+                (reverse(take<1,RankT - 1>(problem_shape.shape_B))),
+                problem_shape.shape_B[0]);
+
+    return make_shape(M_xformed, N_xformed, K_xformed, L_xformed);
+  }  // NOTE(review): no branch handles conv::Operator::kDeconv - confirm this is intended
+}
+
+// Assuming im2col linearization
+// Get problem shape MNKL according to following table:
+// |               |   Fprop   |   Dgrad               |   Wgrad   |
+// |   ----        | --------- | --------              | --------  |
+// |   Shape_M     | (Q*P*Z*N) | ([W/V]*[H/U]*[D/O]*N) | (K)       |
+// |   Shape_N     | (K)       | (C)                   | (C,S,R,T) |
+// |   Shape_K     | (C,S,R,T) | (K,S,R,T)             | (Q*P*Z*N) |
+// |   Shape_L     | _1        | (V*U*O)               | _1        |
+template <conv::Operator ConvOp, int SpatialDim>
+CUTLASS_HOST_DEVICE
+constexpr auto
+get_linearized_problem_shape_MNKL(ConvProblemShape<ConvOp, SpatialDim> const& problem_shape) {
+
+  auto [M, N, K, L] = get_transformed_problem_shape_MNKL(problem_shape);
+
+  if constexpr (ConvOp == conv::Operator::kFprop || ConvOp == conv::Operator::kDgrad) {
+    return cute::make_shape(cute::product(M), N, K, cute::product(L));  // M and L collapse to their products
+  }
+  else if constexpr (ConvOp == conv::Operator::kWgrad) {
+    return cute::make_shape(M, N, cute::product(K), L);  // only K collapses to its product
+  }
+
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::conv::detail
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/device/conv_universal_adapter.hpp b/lightllm-kernel/cutlass/include/cutlass/conv/device/conv_universal_adapter.hpp
new file mode 100755
index 000000000..193f8d885
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/device/conv_universal_adapter.hpp
@@ -0,0 +1,421 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+// common
+#include "cutlass/arch/mma.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/arch/mma.h"
+#include "cutlass/trace.h"
+#include "cutlass/cluster_launch.hpp"
+#include "cutlass/device_kernel.h"
+
+#include "cutlass/conv/kernel/conv_universal.hpp"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/detail/layout.hpp"
+#include "cutlass/cuda_host_adapter.hpp"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::conv::device {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/*!
+  ConvUniversalAdapter is a stateful, reusable handle built around a kernel
+  of type cutlass::conv::kernel::ConvUniversal.
+
+  It manages the lifetime of the underlying `kernel::Params` struct, and exposes APIs
+  to create it from the host facing arguments. For power users, static methods
+  are exposed that bypass the stateful methods or args->params lowering.
+*/
+template <class ConvKernel_>
+class ConvUniversalAdapter
+{
+public:
+  using ConvKernel = GetUnderlyingKernel_t<ConvKernel_>;
+  using TileShape = typename ConvKernel::TileShape;
+  using ElementA = typename ConvKernel::ElementA;
+  using ElementB = typename ConvKernel::ElementB;
+  using ElementC = typename ConvKernel::ElementC;
+  using ElementD = typename ConvKernel::ElementD;
+  using ElementAccumulator = typename ConvKernel::TiledMma::ValTypeC;
+  using DispatchPolicy = typename ConvKernel::DispatchPolicy;
+  using CollectiveMainloop = typename ConvKernel::CollectiveMainloop;
+  using CollectiveEpilogue = typename ConvKernel::CollectiveEpilogue;
+
+  static bool const kEnableCudaHostAdapter = CUTLASS_ENABLE_CUDA_HOST_ADAPTER;
+
+  // Tease out meta-information about the conv algorithm
+  static constexpr conv::Operator kConvolutionalOperator = DispatchPolicy::ConvOp;
+  static constexpr int NumSpatialDimensions = CollectiveMainloop::NumSpatialDimensions;
+
+  // If our TiledMMA's instruction thread layout size is larger than 1, we know its a tensorop!
+  using OperatorClass = cute::conditional_t<
+      (cute::size(typename ConvKernel::TiledMma::AtomThrID{}) > 1),
+      cutlass::arch::OpClassTensorOp, cutlass::arch::OpClassSimt>;
+
+  using ArchTag = typename ConvKernel::ArchTag;
+
+  // Assume TiledMma's ShapeMNK is the same as 2.x's ThreadblockShape
+  using ThreadblockShape = cutlass::gemm::GemmShape<
+      cute::size<0>(TileShape{}),
+      cute::size<1>(TileShape{}),
+      cute::size<2>(TileShape{})>;
+
+  using ClusterShape = cutlass::gemm::GemmShape<
+      cute::size<0>(typename ConvKernel::DispatchPolicy::ClusterShape{}),
+      cute::size<1>(typename ConvKernel::DispatchPolicy::ClusterShape{}),
+      cute::size<2>(typename ConvKernel::DispatchPolicy::ClusterShape{})>;
+
+  // Instruction shape is easy too, since we get that directly from our TiledMma's atom shape
+  using InstructionShape = cutlass::gemm::GemmShape<
+      cute::size<0>(typename CollectiveMainloop::TiledMma::AtomShape_MNK{}),
+      cute::size<1>(typename CollectiveMainloop::TiledMma::AtomShape_MNK{}),
+      cute::size<2>(typename CollectiveMainloop::TiledMma::AtomShape_MNK{})>;
+
+  // Legacy: provide a correct warp count, but no reliable warp shape
+  static int const kThreadCount = ConvKernel::MaxThreadsPerBlock;
+
+  // Warp shape is not a primary API type in 3.x
+  // But we can best approximate it by inspecting the TiledMma
+  // For this, we make the assumption that we always have 4 warps along M, and rest along N, none along K
+  // We also always round up the warp count to 4 if the tiled mma is smaller than 128 threads
+  static constexpr int WarpsInMma = cute::max(4, CUTE_STATIC_V(cute::size(typename ConvKernel::TiledMma{})) / 32);
+  static constexpr int WarpsInMmaM = 4;
+  static constexpr int WarpsInMmaN = cute::ceil_div(WarpsInMma, WarpsInMmaM);
+  using WarpCount = cutlass::gemm::GemmShape<WarpsInMmaM, WarpsInMmaN, 1>;
+  using WarpShape = cutlass::gemm::GemmShape<
+      CUTE_STATIC_V(cute::tile_size<0>(typename CollectiveMainloop::TiledMma{})) / WarpsInMmaM,
+      CUTE_STATIC_V(cute::tile_size<1>(typename CollectiveMainloop::TiledMma{})) / WarpsInMmaN,
+      CUTE_STATIC_V(cute::tile_size<2>(typename CollectiveMainloop::TiledMma{}))>;
+
+  static int constexpr kStages = CollectiveMainloop::DispatchPolicy::Stages;
+
+  // Inspect TiledCopy for A and B to compute the alignment size
+  static int constexpr kAlignmentA = cutlass::detail::get_alignment_count_from_gmem_tiled_copy<
+      typename CollectiveMainloop::GmemTiledCopyA, ElementA>();
+  static int constexpr kAlignmentB = cutlass::detail::get_alignment_count_from_gmem_tiled_copy<
+      typename CollectiveMainloop::GmemTiledCopyB, ElementB>();
+  static int constexpr kAlignmentC = cutlass::detail::get_alignment_count_from_gmem_tiled_copy<
+      typename CollectiveEpilogue::GmemTiledCopyC, ElementC>();
+  static int constexpr kAlignmentD = cutlass::detail::get_alignment_count_from_gmem_tiled_copy<
+      typename CollectiveEpilogue::GmemTiledCopyD, ElementD>();
+
+  using EpilogueOutputOp = typename CollectiveEpilogue::ThreadEpilogueOp;
+
+  /// Argument structure: User API
+  using Arguments = typename ConvKernel::Arguments;
+  /// Argument structure: Kernel API
+  using Params = typename ConvKernel::Params;
+
+private:
+
+  /// Kernel API parameters object
+  Params params_;
+
+public:
+
+  /// Access the Params structure
+  Params const& params() const {
+    return params_;
+  }
+
+  /// Determines whether the conv can execute the given problem.
+  static Status
+  can_implement(Arguments const& args) {
+    if (ConvKernel::can_implement(args)) {
+      return Status::kSuccess;
+    }
+    else {
+      return Status::kInvalid;
+    }
+  }
+
+  /// Gets the workspace size in bytes, as reported by ConvKernel::get_workspace_size(args)
+  static size_t
+  get_workspace_size(Arguments const& args) {
+    size_t workspace_bytes = 0;
+    workspace_bytes += ConvKernel::get_workspace_size(args);
+
+    CUTLASS_TRACE_HOST("  workspace_bytes: " << workspace_bytes);  // traced after the sum so the log shows the real size
+    return workspace_bytes;
+  }
+
+  /// Computes the grid shape
+  static dim3
+  get_grid_shape(Arguments const& args, void* workspace = nullptr) {
+    auto tmp_params = ConvKernel::to_underlying_arguments(args, workspace);
+    return ConvKernel::get_grid_shape(tmp_params);
+  }
+
+  /// Computes the grid shape
+  static dim3
+  get_grid_shape(Params const& params) {
+    return ConvKernel::get_grid_shape(params);
+  }
+
+  /// Computes the maximum number of active blocks per multiprocessor
+  static int maximum_active_blocks(int /* smem_capacity */ = -1) {
+    CUTLASS_TRACE_HOST("ConvUniversal::maximum_active_blocks()");
+    int max_active_blocks = -1;
+    int smem_size = ConvKernel::SharedStorageSize;
+
+    // first, account for dynamic smem capacity if needed
+    cudaError_t result;
+    if (smem_size >= (48 << 10)) {
+      CUTLASS_TRACE_HOST("  Setting smem size to " << smem_size);
+      result = cudaFuncSetAttribute(
+          device_kernel<ConvKernel>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize,
+          smem_size);
+      if (cudaSuccess != result) {
+        result = cudaGetLastError(); // to clear the error bit
+        CUTLASS_TRACE_HOST(
+          "  cudaFuncSetAttribute() returned error: "
+          << cudaGetErrorString(result));
+        return -1;
+      }
+    }
+
+    // query occupancy after setting smem size
+    result = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &max_active_blocks,
+        device_kernel<ConvKernel>,
+        ConvKernel::MaxThreadsPerBlock,
+        smem_size);
+
+    if (cudaSuccess != result) {
+      result = cudaGetLastError(); // to clear the error bit
+      CUTLASS_TRACE_HOST(
+        "  cudaOccupancyMaxActiveBlocksPerMultiprocessor() returned error: "
+        << cudaGetErrorString(result));
+      return -1;
+    }
+
+    CUTLASS_TRACE_HOST("  max_active_blocks: " << max_active_blocks);
+    return max_active_blocks;
+  }
+
+  /// Initializes conv state from arguments: prepares the workspace, then builds and stores params_.
+  Status
+  initialize(
+    Arguments const& args,
+    void* workspace = nullptr,
+    cudaStream_t stream = nullptr,
+    CudaHostAdapter *cuda_adapter = nullptr) {
+
+    CUTLASS_TRACE_HOST("ConvUniversal::initialize() - workspace "
+      << workspace << ", stream: " << (stream ? "non-null" : "null"));
+
+    // Initialize the workspace; any failure status is returned to the caller unchanged
+    Status status = ConvKernel::initialize_workspace(args, workspace, stream, cuda_adapter);
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    // Initialize the Params structure
+    params_ = ConvKernel::to_underlying_arguments(args, workspace);
+
+    // Don't set the function attributes - require the CudaHostAdapter to set it.
+    if constexpr (kEnableCudaHostAdapter) {
+      CUTLASS_ASSERT(cuda_adapter);
+      return Status::kSuccess;
+    }
+    else {
+      // account for dynamic smem capacity if needed
+      int smem_size = ConvKernel::SharedStorageSize;
+      if (smem_size >= (48 << 10)) {  // 48 << 10 == 48 KiB
+        CUTLASS_TRACE_HOST("  Setting smem size to " << smem_size);
+        cudaError_t result = cudaFuncSetAttribute(
+            device_kernel<ConvKernel>,
+            cudaFuncAttributeMaxDynamicSharedMemorySize,
+            smem_size);
+        if (cudaSuccess != result) {
+          result = cudaGetLastError(); // to clear the error bit
+          CUTLASS_TRACE_HOST("  cudaFuncSetAttribute() returned error: " << cudaGetErrorString(result));
+          return Status::kErrorInternal;
+        }
+      }
+    }
+    return Status::kSuccess;
+  }
+
+  /// Rebuilds params_ from args. Update API is preserved in 3.0, but does not guarantee a lightweight update of params.
+  Status
+  update(Arguments const& args, void* workspace = nullptr) {
+    CUTLASS_TRACE_HOST("ConvUniversal()::update() - workspace: " << workspace);
+
+    size_t workspace_bytes = get_workspace_size(args);
+    if (workspace_bytes > 0 && nullptr == workspace) {
+      return Status::kErrorWorkspaceNull;  // a workspace is needed but none was supplied
+    }
+
+    params_ = ConvKernel::to_underlying_arguments(args, workspace);
+    return Status::kSuccess;
+  }
+
+  /// Primary run() entry point API that is static allowing users to create and manage their own params.
+  /// Supplied params struct must be constructed by calling ConvKernel::to_underlying_arguments()
+  static Status
+  run(Params& params, cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr, int32_t kernel_index = 0) {
+    CUTLASS_TRACE_HOST("ConvUniversal::run()");
+    dim3 const block = ConvKernel::get_block_shape();
+    dim3 const grid = get_grid_shape(params);
+
+    // configure smem size and carveout
+    int smem_size = ConvKernel::SharedStorageSize;
+    // Start from an error status: a configuration that reaches no launch path below must not report success
+    Status launch_result = Status::kErrorInternal;
+    // Use extended launch API only for mainloops that use it
+    if constexpr (ConvKernel::ArchTag::kMinComputeCapability >= 90) {
+      [[maybe_unused]] constexpr bool is_static_1x1x1 =
+        cute::is_static_v<typename ConvKernel::DispatchPolicy::ClusterShape> and
+        cute::size(typename ConvKernel::DispatchPolicy::ClusterShape{}) == 1;
+      dim3 cluster(cute::size<0>(typename ConvKernel::DispatchPolicy::ClusterShape{}),
+                   cute::size<1>(typename ConvKernel::DispatchPolicy::ClusterShape{}),
+                   cute::size<2>(typename ConvKernel::DispatchPolicy::ClusterShape{}));
+      void* kernel_params[] = {&params};
+      if constexpr (kEnableCudaHostAdapter) {
+        //
+        // Use the cuda host adapter
+        //
+        CUTLASS_ASSERT(cuda_adapter);
+        if (cuda_adapter) {
+
+          launch_result = cuda_adapter->launch(grid,
+                                               cluster,
+                                               block,
+                                               smem_size,
+                                               stream,
+                                               kernel_params,
+                                               kernel_index);
+        }
+        else {
+          return Status::kErrorInternal;
+        }
+      }
+      else {
+        CUTLASS_ASSERT(cuda_adapter == nullptr);
+        [[maybe_unused]] void const* kernel = (void const*) device_kernel<ConvKernel>;  // only the cluster launch uses it
+        if constexpr (ConvKernel::ArchTag::kMinComputeCapability == 90) {
+          if constexpr (is_static_1x1x1) {
+            device_kernel<ConvKernel><<<grid, block, smem_size, stream>>>(params);
+            launch_result = Status::kSuccess;
+          }
+          else {
+            launch_result = ClusterLauncher::launch(
+                grid, cluster, block, smem_size, stream, kernel, kernel_params);
+          }
+        }  // any other architecture launches nothing here, so launch_result keeps its error default
+      }
+    }
+    else {
+      launch_result = Status::kSuccess;
+
+      if constexpr (kEnableCudaHostAdapter) {
+        CUTLASS_ASSERT(cuda_adapter);
+        if (cuda_adapter) {
+          void* kernel_params[] = {&params};
+
+          launch_result = cuda_adapter->launch(
+              grid, block, smem_size, stream, kernel_params, 0
+              );
+
+        }
+        else {
+          return Status::kErrorInternal;
+        }
+      }
+      else {
+        CUTLASS_ASSERT(cuda_adapter == nullptr);
+        device_kernel<ConvKernel><<<grid, block, smem_size, stream>>>(params);
+      }
+    }
+
+    cudaError_t result = cudaGetLastError();  // picks up any error raised by the launch above
+    if (cudaSuccess == result && Status::kSuccess == launch_result) {
+      return Status::kSuccess;
+    }
+    else {
+      CUTLASS_TRACE_HOST("  Kernel launch failed. Reason: " << result);
+      return Status::kErrorInternal;
+    }
+  }
+
+  //
+  // Non-static launch overloads that first create and set the internal params struct of this kernel handle.
+  //
+
+  /// Launches the kernel after first constructing Params internal state from supplied arguments.
+  Status
+  run(
+    Arguments const& args,
+    void* workspace = nullptr,
+    cudaStream_t stream = nullptr,
+    CudaHostAdapter *cuda_adapter = nullptr,
+    int32_t kernel_index = 0
+  ) {
+    Status status = initialize(args, workspace, stream, cuda_adapter);
+    if (Status::kSuccess == status) {
+      status = run(params_, stream, cuda_adapter, kernel_index);
+    }
+    return status;
+  }
+
+  /// Launches the kernel after first constructing Params internal state from supplied arguments.
+  Status
+  operator()(
+    Arguments const& args,
+    void* workspace = nullptr,
+    cudaStream_t stream = nullptr,
+    CudaHostAdapter *cuda_adapter = nullptr) {
+    return run(args, workspace, stream, cuda_adapter);
+  }
+
+  /// Overload that allows a user to re-launch the same kernel without updating internal params struct.
+  Status
+  run(cudaStream_t stream = nullptr) {
+    return run(params_, stream);  // static run() with the stored params_; cuda_adapter and kernel_index take their defaults
+  }
+
+  /// Overload that allows a user to re-launch the same kernel without updating internal params struct.
+  Status
+  operator()(cudaStream_t stream = nullptr) {
+    return run(params_, stream);  // same call as run(stream): static run() with the stored params_
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::conv::device
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/device/direct_convolution.h b/lightllm-kernel/cutlass/include/cutlass/conv/device/direct_convolution.h
new file mode 100755
index 000000000..43ab94b5f
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/device/direct_convolution.h
@@ -0,0 +1,270 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/* \file
+   \brief Template for device-level Depthwise Convolution
+*/
+
+#pragma once
+
+#include <limits>
+
+#include "cutlass/cutlass.h"
+#include "cutlass/device_kernel.h"
+#include "cutlass/conv/convolution.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<typename DirectConvolutionKernel_>
+class DirectConvolution {
+public:
+
+  using UnderlyingKernel = DirectConvolutionKernel_;
+
+  using ElementA = typename UnderlyingKernel::ElementA;
+  using LayoutA = typename UnderlyingKernel::LayoutA;
+  using ElementB = typename UnderlyingKernel::ElementB;
+  using LayoutB = typename UnderlyingKernel::LayoutB;
+  using ElementC = typename UnderlyingKernel::ElementC;
+  using LayoutC = typename UnderlyingKernel::LayoutC;
+  using ElementAccumulator = typename UnderlyingKernel::ElementAccumulator;
+  using ElementCompute = typename UnderlyingKernel::ElementCompute;
+  using OperatorClass = typename UnderlyingKernel::OperatorClass;
+  using ArchTag = typename UnderlyingKernel::ArchTag;
+  using ThreadblockShape = typename UnderlyingKernel::ThreadblockShape;
+  using WarpShape = typename UnderlyingKernel::WarpShape;
+  using InstructionShape = typename UnderlyingKernel::InstructionShape;
+  using ThreadblockSwizzle = typename UnderlyingKernel::ThreadblockSwizzle;
+  using EpilogueOutputOp = typename UnderlyingKernel::EpilogueOutputOp;
+  static int const kStages = UnderlyingKernel::kStages;
+  static int const kConvDim = UnderlyingKernel::kConvDim;
+  using WarpMmaOperator = typename UnderlyingKernel::WarpMmaOperator;
+  using ArchMmaOperator = typename UnderlyingKernel::ArchMmaOperator;
+  using MathOperator = typename UnderlyingKernel::MathOperator; 
+
+  static cutlass::conv::Operator const kConvolutionalOperator = UnderlyingKernel::kConvolutionalOperator;
+  static cutlass::conv::IteratorAlgorithm const kIteratorAlgorithm = UnderlyingKernel::kIteratorAlgorithm;
+  static cutlass::conv::StrideSupport const kStrideSupport = UnderlyingKernel::kStrideSupport;
+  static cutlass::conv::GroupMode const kGroupMode = UnderlyingKernel::kGroupMode;
+
+  static int const kWarpCount = 
+    (ThreadblockShape::kM / WarpShape::kM) * 
+    (ThreadblockShape::kN / WarpShape::kN) *
+    (ThreadblockShape::kK / WarpShape::kK);
+
+  /// Argument structure
+  using Arguments = typename UnderlyingKernel::Arguments;
+
+  using ReorderKernel = typename UnderlyingKernel::ReorderKernel;
+
+ private:
+
+  /// Kernel parameters object
+  typename UnderlyingKernel::Params params_;
+
+public:
+
+  /// Default constructor with an empty body: params_ is only default-constructed; initialize() assigns it before run() reads it.
+  DirectConvolution() { }
+
+  /// Determines whether this direct-convolution kernel can execute args.problem_size; returns kSuccess or the Status of the first failed check.
+  static Status can_implement(Arguments const &args) {
+
+    // dispatch to iterators: the A and B operand iterators validate the problem first
+    Status status = UnderlyingKernel::Mma::IteratorA::can_implement(args.problem_size);
+    if (Status::kSuccess != status) {
+      return status;
+    }
+
+    status = UnderlyingKernel::Mma::IteratorB::can_implement(args.problem_size);
+    if (Status::kSuccess != status) {
+      return status;
+    }
+
+    if (kGroupMode != conv::GroupMode::kDepthwise) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    // Rejects only when NEITHER K nor C equals groups. NOTE(review): with '&&', K == groups and C != groups (or the reverse) is accepted; confirm '||' was not intended
+    if (args.problem_size.K != args.problem_size.groups &&
+      args.problem_size.C != args.problem_size.groups) {
+      return Status::kErrorInvalidProblem;
+    }
+    // Alignment: Fprop checks K, Dgrad and Wgrad check C, against the epilogue output iterator's kElementsPerAccess
+
+    static int const kAlignmentC = UnderlyingKernel::Epilogue::OutputTileIterator::kElementsPerAccess;
+    if (kConvolutionalOperator == conv::Operator::kFprop) {
+      if (args.problem_size.K % kAlignmentC)
+        return Status::kErrorMisalignedOperand;
+    } else if (kConvolutionalOperator == conv::Operator::kDgrad) {
+       if (args.problem_size.C % kAlignmentC)
+        return Status::kErrorMisalignedOperand;
+    } else if (kConvolutionalOperator == conv::Operator::kWgrad) {
+       if (args.problem_size.C % kAlignmentC)
+        return Status::kErrorMisalignedOperand;
+    }
+
+    // Determine grid shape; problems whose grid.y or grid.z exceeds the uint16_t maximum are rejected below
+    ThreadblockSwizzle threadblock_swizzle;
+
+    dim3 grid = threadblock_swizzle.get_grid_shape(
+      threadblock_swizzle.get_tiled_shape(
+        kConvolutionalOperator,
+        args.problem_size,
+        {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+        args.problem_size.split_k_slices));
+
+    if (!(grid.y <= std::numeric_limits<uint16_t>::max() &&
+          grid.z <= std::numeric_limits<uint16_t>::max())) {
+
+      return Status::kErrorInvalidProblem;
+    }
+
+    return Status::kSuccess;
+  }
+
+  /// Gets the workspace size in bytes: always 0 for this class; `args` is not consulted.
+  static size_t get_workspace_size(Arguments const &args) {  
+    return 0;
+  }
+
+  /// Builds params_ from `args` (with `workspace` viewed as int*) and, when sizeof(SharedStorage) is 48 KiB or more,
+  /// raises the kernel's max dynamic shared-memory attribute. `stream` is accepted but not used by this method.
+  Status initialize(
+    Arguments const &args,
+    void *workspace = nullptr,
+    cudaStream_t stream = nullptr) {
+
+    // Rebuild the kernel parameters from the arguments
+    params_ = typename UnderlyingKernel::Params(
+      args,
+      static_cast<int *>(workspace)
+    );
+
+    int const shared_bytes = int(sizeof(typename UnderlyingKernel::SharedStorage));
+    int const opt_in_threshold = 48 << 10;
+
+    if (shared_bytes < opt_in_threshold) {
+      return Status::kSuccess;
+    }
+
+    cudaError_t const attr_result = cudaFuncSetAttribute(cutlass::Kernel<UnderlyingKernel>,
+                                                         cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                                         shared_bytes);
+
+    return attr_result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
+  }
+
+  /// Refreshes params_ in place from `args`: the A/B/C/D and reordered-B pointers, the output op, and the semaphore (set to `workspace`). Always returns kSuccess.
+  Status update(Arguments const &args, void *workspace = nullptr) {
+
+    // update the params structure from the arguments (no other params_ member is assigned here)
+    params_.ptr_A = args.ref_A.data();
+    params_.ptr_B = args.ref_B.data();
+    params_.ptr_C = args.ref_C.data();
+    params_.ptr_D = args.ref_D.data();
+    params_.output_op = args.output_op;
+    params_.ptr_reordered_B = args.ref_reordered_B.data();
+    params_.semaphore = static_cast<int *>(workspace);
+
+    return Status::kSuccess;
+  }
+
+  /// Launches the optional B-reorder kernel, then the main kernel, on `stream` using the current params_; returns kErrorInternal if a launch or the shared-memory attribute call fails, else kSuccess.
+  Status run(cudaStream_t stream = nullptr) {
+
+    // Launch reorder kernel (only when a reordered-B buffer is set in params_)
+    if (params_.ptr_reordered_B != nullptr) {
+      dim3 grid = ReorderKernel::get_grid_shape(params_);
+      dim3 block = ReorderKernel::get_block_shape();
+
+      cutlass::arch::synclog_setup();
+      cutlass::Kernel<ReorderKernel><<<grid, block, 0, stream>>>(params_);
+      // Fail fast: previously the main kernel was still enqueued after a failed reorder launch
+      if (cudaGetLastError() != cudaSuccess)
+        return Status::kErrorInternal;
+    }
+
+    // Launch main kernel
+    ThreadblockSwizzle threadblock_swizzle;
+
+    dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
+    dim3 block(32 * kWarpCount, 1, 1);
+
+    // Dynamic SMEM size based on input params.
+    int smem_size = int(params_.get_smem_size());
+
+    // Make sure we can use that much shared memory.
+    cudaError_t status = cudaFuncSetAttribute(cutlass::Kernel<UnderlyingKernel>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size);
+    if (status != cudaSuccess)
+      return Status::kErrorInternal;
+
+    cutlass::arch::synclog_setup();
+    cutlass::Kernel<UnderlyingKernel><<<grid, block, smem_size, stream>>>(params_);
+
+    return cudaGetLastError() == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
+  }
+
+  /// Call-operator shorthand for run(stream): launches with the current params_ and returns run()'s Status.
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Calls initialize(args, workspace, stream) and, only if that succeeds, run(stream); returns the first non-success Status.
+  Status operator()(
+    Arguments const &args,
+    void *workspace = nullptr,
+    cudaStream_t stream = nullptr) {
+
+    Status const init_status = initialize(args, workspace, stream);
+
+    if (init_status != Status::kSuccess) {
+      return init_status;
+    }
+
+    return run(stream);
+  }
+
+  int get_smem_size() { return int(params_.get_smem_size()); }  // value of params_.get_smem_size(), converted to int
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}
+}
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/device/implicit_gemm_convolution.h b/lightllm-kernel/cutlass/include/cutlass/conv/device/implicit_gemm_convolution.h
new file mode 100755
index 000000000..a1cb06e98
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/device/implicit_gemm_convolution.h
@@ -0,0 +1,361 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/* \file
+   \brief Template for device-level Implicit GEMM Convolution
+*/
+
+#pragma once
+
+#include <limits>
+
+#include "cutlass/cutlass.h"
+#include "cutlass/device_kernel.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/cuda_host_adapter.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<typename ImplicitGemmKernel_>
+class ImplicitGemmConvolution {
+public:
+
+  using UnderlyingKernel = GetUnderlyingKernel_t<ImplicitGemmKernel_>;
+
+  using ElementA = typename UnderlyingKernel::ElementA;
+  using LayoutA = typename UnderlyingKernel::LayoutA;
+  using ElementB = typename UnderlyingKernel::ElementB;
+  using LayoutB = typename UnderlyingKernel::LayoutB;
+  using ElementC = typename UnderlyingKernel::ElementC;
+  using LayoutC = typename UnderlyingKernel::LayoutC;
+  using ElementAccumulator = typename UnderlyingKernel::ElementAccumulator;
+  using ElementCompute = typename UnderlyingKernel::ElementCompute;
+  using OperatorClass = typename UnderlyingKernel::OperatorClass;
+  using ArchTag = typename UnderlyingKernel::ArchTag;
+  using ThreadblockShape = typename UnderlyingKernel::ThreadblockShape;
+  using WarpShape = typename UnderlyingKernel::WarpShape;
+  using InstructionShape = typename UnderlyingKernel::InstructionShape;
+  using ThreadblockSwizzle = typename UnderlyingKernel::ThreadblockSwizzle;
+  using EpilogueOutputOp = typename UnderlyingKernel::EpilogueOutputOp;
+  static int const kStages = UnderlyingKernel::kStages;
+  static int const kConvDim = UnderlyingKernel::kConvDim;
+  using WarpMmaOperator = typename UnderlyingKernel::WarpMmaOperator;
+  using ArchMmaOperator = typename UnderlyingKernel::ArchMmaOperator;
+  using MathOperator = typename UnderlyingKernel::MathOperator; 
+
+  static cutlass::conv::Operator const kConvolutionalOperator = UnderlyingKernel::kConvolutionalOperator;
+  static cutlass::conv::IteratorAlgorithm const kIteratorAlgorithm = UnderlyingKernel::kIteratorAlgorithm;
+  static cutlass::conv::StrideSupport const kStrideSupport = UnderlyingKernel::kStrideSupport;
+  static cutlass::conv::GroupMode const kGroupMode = UnderlyingKernel::kGroupMode;
+
+  static bool const kEnableCudaHostAdapter = CUTLASS_ENABLE_CUDA_HOST_ADAPTER;
+
+  static int const kWarpCount = 
+    (ThreadblockShape::kM / WarpShape::kM) * 
+    (ThreadblockShape::kN / WarpShape::kN) *
+    (ThreadblockShape::kK / WarpShape::kK);
+
+  /// Argument structure
+  using Arguments = typename UnderlyingKernel::Arguments;
+
+private:
+
+  /// Kernel parameters object
+  typename UnderlyingKernel::Params params_;
+
+public:
+
+  /// Default constructor with an empty body: params_ is only default-constructed; initialize() assigns it before run() reads it.
+  ImplicitGemmConvolution() { }
+
+  /// Determines whether the Implicit GEMM can execute args.problem_size: returns kSuccess, or the Status of the first failed check (iterator support, group constraints, alignment, strided dgrad/deconv limits, grid size).
+  static Status can_implement(Arguments const &args) {
+    // dispatch to iterators: the A and B operand iterators validate the problem first
+    Status status = UnderlyingKernel::Mma::IteratorA::can_implement(args.problem_size);
+    if (Status::kSuccess != status) {
+      return status;
+    }
+
+    status = UnderlyingKernel::Mma::IteratorB::can_implement(args.problem_size);
+    if (Status::kSuccess != status) {
+      return status;
+    }
+
+    // check group conv constraint (a group count below 1 is rejected before it is used as a divisor below)
+    if (args.problem_size.groups != 1) {
+      if (kGroupMode == conv::GroupMode::kNone || args.problem_size.groups < 1) {
+        return Status::kErrorInvalidProblem;
+      }
+
+      // C and K should be multiple of groups
+      if (args.problem_size.K % args.problem_size.groups ||
+        args.problem_size.C % args.problem_size.groups) {
+        return Status::kErrorInvalidProblem;
+      }
+
+      // split-k is not supported
+      if (args.problem_size.split_k_slices != 1) {
+        return Status::kErrorInvalidProblem;
+      }
+
+      int k_per_group = args.problem_size.K / args.problem_size.groups;
+      // k_per_group should be multiple of ThreadblockShape N, one CTA calculate one group
+      if (kGroupMode == conv::GroupMode::kSingleGroup && k_per_group % ThreadblockShape::kN) {
+        return Status::kErrorInvalidProblem;
+      }
+      // ThreadblockShape::kN should be divisible by k_per_group, one CTA calculate multiple groups (k_per_group < 1 is rejected before the modulo)
+      if (kGroupMode == conv::GroupMode::kMultipleGroup && (k_per_group < 1 || ThreadblockShape::kN % k_per_group)) {
+        return Status::kErrorInvalidProblem;
+      }
+
+      // current optimized iterator algo only supports SingleGroup mode
+      if (kIteratorAlgorithm == IteratorAlgorithm::kOptimized &&
+        kGroupMode != conv::GroupMode::kSingleGroup) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+
+    static int const kAlignmentC = UnderlyingKernel::Epilogue::OutputTileIterator::kElementsPerAccess;
+    if (kConvolutionalOperator == conv::Operator::kFprop) {
+      if (args.problem_size.K % kAlignmentC)
+        return Status::kErrorMisalignedOperand;
+    } else if (kConvolutionalOperator == conv::Operator::kDgrad || kConvolutionalOperator == conv::Operator::kDeconv) {
+      if (args.problem_size.C % kAlignmentC)
+        return Status::kErrorMisalignedOperand;
+    } else if (kConvolutionalOperator == conv::Operator::kWgrad) {
+      if (args.problem_size.C % kAlignmentC)
+        return Status::kErrorMisalignedOperand;
+    }
+
+    // check for unsupported problem sizes for strided dgrad / deconv implementation
+    if ((kConvolutionalOperator == conv::Operator::kDgrad || kConvolutionalOperator == conv::Operator::kDeconv) &&
+      kStrideSupport == conv::StrideSupport::kStrided) {
+      // split-k (serial or parallel) is not supported for strided dgrad / deconv
+      if(args.problem_size.split_k_slices > 1 && (args.problem_size.stride().at(args.problem_size.stride().max_dim_index()) > 1)) {
+        return Status::kErrorNotSupported;
+      }
+
+      // dilation > {1x1} is not supported for strided dgrad / deconv
+      if(args.problem_size.dilation_h > 1 || args.problem_size.dilation_w > 1) {
+        return Status::kErrorNotSupported;
+      }
+    }
+
+    // Determine grid shape; problems whose grid.y or grid.z exceeds the uint16_t maximum are rejected below
+    ThreadblockSwizzle threadblock_swizzle;
+
+    dim3 grid = threadblock_swizzle.get_grid_shape(
+      threadblock_swizzle.get_tiled_shape(
+        kConvolutionalOperator,
+        args.problem_size,
+        {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+        args.problem_size.split_k_slices));
+
+    if (!(grid.y <= std::numeric_limits<uint16_t>::max() &&
+          grid.z <= std::numeric_limits<uint16_t>::max())) {
+
+      return Status::kErrorInvalidProblem;
+    }
+
+    return Status::kSuccess;
+  }
+
+  /// Gets the workspace size in bytes for args.split_k_mode: accumulator-sized partial outputs (one per k-tile) for parallel split-K,
+  /// one int per (m, n) tile for serial split-K with more than one slice, and 0 otherwise.
+  static size_t get_workspace_size(Arguments const &args) {
+
+    // Tile the problem to learn how many threadblock tiles cover each GEMM dimension
+    ThreadblockSwizzle swizzle;
+
+    cutlass::gemm::GemmCoord tiles = swizzle.get_tiled_shape(
+        kConvolutionalOperator,
+        args.problem_size,
+        {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+        args.problem_size.split_k_slices);
+
+    bool const is_parallel = (args.split_k_mode == SplitKMode::kParallel);
+    bool const is_serial = (args.split_k_mode == SplitKMode::kSerial);
+
+    if (is_parallel) {
+      // Split-K parallel: CTAs in k-dimension write the partial results in a temporary workspace.
+      // The user needs to call a reduction operator to obtain the final output tensor
+      size_t const output_elements =
+        size_t(cutlass::conv::implicit_gemm_tensor_c_size(kConvolutionalOperator, args.problem_size));
+      return sizeof(ElementAccumulator) * output_elements * size_t(tiles.k());
+    }
+
+    if (is_serial && args.problem_size.split_k_slices > 1) {
+      // Split-K serial: The user workspace is used to store semaphore and serialize writing the
+      // final reduced output to user's output tensor
+      return sizeof(int) * size_t(tiles.m()) * size_t(tiles.n());
+    }
+
+    // Neither split-K case applies: no workspace is requested
+    return 0;
+  }
+
+  /// Initializes params_ from `args`: zeroes `workspace` when split_k_slices > 1 (kErrorWorkspaceNull if it is null), then either asserts the host adapter or raises the max dynamic shared-memory attribute.
+  Status initialize(
+    Arguments const &args, 
+    void *workspace = nullptr, 
+    cudaStream_t stream = nullptr,
+    CudaHostAdapter *cuda_adapter = nullptr) {
+    // Split-K path: the workspace must exist and is cleared asynchronously on `stream`
+    if (args.problem_size.split_k_slices > 1) {
+
+      if (!workspace) {
+        return Status::kErrorWorkspaceNull;
+      }
+
+      cudaError_t status = cudaMemsetAsync(workspace, 0, get_workspace_size(args), stream);
+
+      if (status != cudaSuccess) {
+        return Status::kErrorInternal;
+      }
+    }
+
+    // initialize the params structure from the arguments
+    params_ = typename UnderlyingKernel::Params(
+    	args,
+    	static_cast<int *>(workspace)
+    );
+
+    if constexpr (kEnableCudaHostAdapter) {
+      CUTLASS_ASSERT(cuda_adapter);
+      return Status::kSuccess;
+    }
+    else {
+      int smem_size = int(sizeof(typename UnderlyingKernel::SharedStorage));
+      // The attribute is only raised when SharedStorage is at least 48 KiB (48 << 10 bytes)
+      if (smem_size >= (48 << 10)) {
+        cudaError_t result = cudaFuncSetAttribute(cutlass::Kernel<UnderlyingKernel>,
+                                      cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                      smem_size);
+        // A failed attribute call is reported as kErrorInternal
+        if (result != cudaSuccess) {
+          return Status::kErrorInternal;
+        }
+      }
+    }
+    // Reached only on the non-adapter path; the adapter branch above has already returned
+    return Status::kSuccess;
+  }
+
+  /// Refreshes params_ in place from `args`: the A/B/C/D pointers, the output op, and the semaphore (set to `workspace`). Always returns kSuccess.
+  Status update(Arguments const &args, void *workspace = nullptr) {
+
+    // update the params structure from the arguments (no other params_ member is assigned here)
+    params_.ptr_A = args.ref_A.data();
+    params_.ptr_B = args.ref_B.data();
+    params_.ptr_C = args.ref_C.data();
+    params_.ptr_D = args.ref_D.data();
+    params_.output_op = args.output_op;
+    params_.semaphore = static_cast<int *>(workspace);
+
+    return Status::kSuccess;
+  }
+
+  /// Runs the kernel using the current params_: launches through `cuda_adapter` when kEnableCudaHostAdapter is set, otherwise directly; returns kSuccess only if both the launch result and cudaGetLastError() are clean.
+  Status run(cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr, int32_t kernel_index = 0) {
+    // Grid comes from the tiled shape stored in params_; the block has 32 * kWarpCount threads
+
+    ThreadblockSwizzle threadblock_swizzle;
+
+    dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
+    dim3 block(32 * kWarpCount, 1, 1);
+
+    int smem_size = int(sizeof(typename UnderlyingKernel::SharedStorage));
+    cutlass::Status launch_result = cutlass::Status::kSuccess ;
+
+    if constexpr (kEnableCudaHostAdapter) {
+        //
+        // Use the cuda host adapter; a null adapter asserts and is then reported as kErrorInternal
+        //
+        CUTLASS_ASSERT(cuda_adapter);
+        if (cuda_adapter) {
+          // NOTE(review): the dim3(1,1,1) second argument is presumably the cluster shape; confirm against CudaHostAdapter::launch
+          void* kernel_params[] = {&params_};
+          launch_result = cuda_adapter->launch(
+              grid, dim3(1,1,1), block, smem_size, stream, kernel_params, kernel_index
+              );
+        }
+        else {
+          launch_result = Status::kErrorInternal;
+        }
+    }
+    else {
+      cutlass::arch::synclog_setup();
+      cutlass::Kernel<UnderlyingKernel><<<grid, block, smem_size, stream>>>(params_);      
+    }
+    // Both paths are followed by a cudaGetLastError() query; either failure source yields kErrorInternal
+    cudaError_t result = cudaGetLastError();
+    if (cudaSuccess == result && Status::kSuccess == launch_result) {
+      return Status::kSuccess;
+    }
+    else {
+      CUTLASS_TRACE_HOST("  Kernel launch failed. Reason: " << result);
+      return Status::kErrorInternal;
+    }
+  }
+
+  /// Call-operator shorthand for run(stream, cuda_adapter, kernel_index): launches with the current params_ and returns run()'s Status.
+  Status operator()(cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr, int32_t kernel_index = 0) {
+    return run(stream, cuda_adapter, kernel_index);
+  }
+
+  /// Calls initialize(args, workspace, stream, cuda_adapter) and, only if that succeeds, run(stream, cuda_adapter, kernel_index); returns the first non-success Status.
+  Status operator()(
+    Arguments const &args,
+    void *workspace = nullptr,
+    cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr, int32_t kernel_index = 0) {
+
+    Status const init_status = initialize(args, workspace, stream, cuda_adapter);
+
+    if (init_status != Status::kSuccess) {
+      return init_status;
+    }
+
+    return run(stream, cuda_adapter, kernel_index);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}
+}
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/device/implicit_gemm_convolution_fusion.h b/lightllm-kernel/cutlass/include/cutlass/conv/device/implicit_gemm_convolution_fusion.h
new file mode 100755
index 000000000..265156cc5
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/device/implicit_gemm_convolution_fusion.h
@@ -0,0 +1,269 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/* \file
+   \brief Template for device-level fused activation's scale+bias+relu and Implicit GEMM Convolution
+*/
+
+#pragma once
+
+#include <limits>
+
+#include "cutlass/cutlass.h"
+#include "cutlass/device_kernel.h"
+#include "cutlass/conv/convolution.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<typename ImplicitGemmFusionKernel_>
+class ImplicitGemmConvolutionFusion {
+public:
+
+  using ImplicitGemmFusionKernel = ImplicitGemmFusionKernel_;
+
+  using ElementA = typename ImplicitGemmFusionKernel::ElementA;
+  using LayoutA = typename ImplicitGemmFusionKernel::LayoutA;
+  using ElementB = typename ImplicitGemmFusionKernel::ElementB;
+  using LayoutB = typename ImplicitGemmFusionKernel::LayoutB;
+
+//  using ElementScaleBias = typename ImplicitGemmFusionKernel::ElementScaleBias;
+//  using LayoutScaleBias = typename ImplicitGemmFusionKernel::LayoutScaleBias;
+
+  using ElementC = typename ImplicitGemmFusionKernel::ElementC;
+  using LayoutC = typename ImplicitGemmFusionKernel::LayoutC;
+  using ElementAccumulator = typename ImplicitGemmFusionKernel::ElementAccumulator;
+  using ElementCompute = typename ImplicitGemmFusionKernel::ElementCompute;
+  using OperatorClass = typename ImplicitGemmFusionKernel::OperatorClass;
+  using ArchTag = typename ImplicitGemmFusionKernel::ArchTag;
+  using ThreadblockShape = typename ImplicitGemmFusionKernel::ThreadblockShape;
+  using WarpShape = typename ImplicitGemmFusionKernel::WarpShape;
+  using InstructionShape = typename ImplicitGemmFusionKernel::InstructionShape;
+  using ThreadblockSwizzle = typename ImplicitGemmFusionKernel::ThreadblockSwizzle;
+  using EpilogueOutputOp = typename ImplicitGemmFusionKernel::EpilogueOutputOp;
+  static int const kStages = ImplicitGemmFusionKernel::kStages;
+  static int const kConvDim = ImplicitGemmFusionKernel::kConvDim;
+  using WarpMmaOperator = typename ImplicitGemmFusionKernel::WarpMmaOperator;
+  using ArchMmaOperator = typename ImplicitGemmFusionKernel::ArchMmaOperator;
+  using MathOperator = typename ImplicitGemmFusionKernel::MathOperator; 
+
+  static cutlass::conv::Operator const kConvolutionalOperator = ImplicitGemmFusionKernel::kConvolutionalOperator;
+  static cutlass::conv::IteratorAlgorithm const kIteratorAlgorithm = ImplicitGemmFusionKernel::kIteratorAlgorithm;
+
+  static int const kWarpCount = 
+    (ThreadblockShape::kM / WarpShape::kM) * 
+    (ThreadblockShape::kN / WarpShape::kN) *
+    (ThreadblockShape::kK / WarpShape::kK);
+
+  /// Argument structure
+  using Arguments = typename ImplicitGemmFusionKernel::Arguments;
+
+private:
+
+  /// Kernel parameters object
+  typename ImplicitGemmFusionKernel::Params params_;
+
+public:
+
+  /// Default constructor with an empty body: params_ is only default-constructed; initialize() assigns it before run() reads it.
+  ImplicitGemmConvolutionFusion() { }
+
+  /// Determines whether the fused Implicit GEMM can execute args.problem_size: returns the first non-success Status from the
+  /// A/B iterators, kErrorInvalidProblem when grid.y or grid.z exceeds the uint16_t maximum, and kSuccess otherwise.
+  static Status can_implement(Arguments const &args) {
+
+    // Each operand iterator validates the problem size first
+    Status const a_status = ImplicitGemmFusionKernel::Mma::IteratorA::can_implement(args.problem_size);
+    if (a_status != Status::kSuccess) {
+      return a_status;
+    }
+
+    Status const b_status = ImplicitGemmFusionKernel::Mma::IteratorB::can_implement(args.problem_size);
+    if (b_status != Status::kSuccess) {
+      return b_status;
+    }
+
+    // Determine grid shape
+    ThreadblockSwizzle swizzle;
+
+    dim3 const launch_grid = swizzle.get_grid_shape(
+      swizzle.get_tiled_shape(
+        cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size),
+        {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+        args.problem_size.split_k_slices));
+
+    // Only the y and z extents are checked against the limit
+    unsigned int const dim_limit = std::numeric_limits<uint16_t>::max();
+    bool const grid_fits =
+      launch_grid.y <= dim_limit && launch_grid.z <= dim_limit;
+
+    return grid_fits ? Status::kSuccess : Status::kErrorInvalidProblem;
+  }
+
+  /// Gets the workspace size in bytes for args.split_k_mode: accumulator-sized partial outputs (one per k-tile) for parallel split-K,
+  /// one int per (m, n) tile for serial split-K with more than one slice, and 0 otherwise.
+  static size_t get_workspace_size(Arguments const &args) {
+
+    // Tile the problem to learn how many threadblock tiles cover each GEMM dimension
+    ThreadblockSwizzle swizzle;
+
+    cutlass::gemm::GemmCoord tiles = swizzle.get_tiled_shape(
+        cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size),
+        {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+        args.problem_size.split_k_slices);
+
+    bool const is_parallel = (args.split_k_mode == SplitKMode::kParallel);
+    bool const is_serial = (args.split_k_mode == SplitKMode::kSerial);
+
+    if (is_parallel) {
+      // Split-K parallel: CTAs in k-dimension write the partial results in a temporary workspace.
+      // The user needs to call a reduction operator to obtain the final output tensor
+      size_t const output_elements =
+        size_t(cutlass::conv::implicit_gemm_tensor_c_size(kConvolutionalOperator, args.problem_size));
+      return sizeof(ElementAccumulator) * output_elements * size_t(tiles.k());
+    }
+
+    if (is_serial && args.problem_size.split_k_slices > 1) {
+      // Split-K serial: The user workspace is used to store semaphore and serialize writing the
+      // final reduced output to user's output tensor
+      return sizeof(int) * size_t(tiles.m()) * size_t(tiles.n());
+    }
+
+    // Neither split-K case applies: no workspace is requested
+    return 0;
+  }
+
+  /// Initializes params_ from `args`: zeroes `workspace` when split_k_slices > 1 (kErrorWorkspaceNull if it is null), then raises the max dynamic shared-memory attribute when SharedStorage is 48 KiB or more.
+  Status initialize(
+    Arguments const &args, 
+    void *workspace = nullptr, 
+    cudaStream_t stream = nullptr) {
+    // Split-K path: the workspace must exist and is cleared asynchronously on `stream`
+    if (args.problem_size.split_k_slices > 1) {
+
+      if (!workspace) {
+        return Status::kErrorWorkspaceNull;
+      }
+
+      cudaError_t status = cudaMemsetAsync(workspace, 0, get_workspace_size(args), stream);
+
+      if (status != cudaSuccess) {
+        return Status::kErrorInternal;
+      }
+    }
+
+    // initialize the params structure from the arguments
+    params_ = typename ImplicitGemmFusionKernel::Params(
+    	args,
+    	static_cast<int *>(workspace)
+    );
+    // Shared-memory footprint is taken from the static SharedStorage type
+    int smem_size = int(sizeof(typename ImplicitGemmFusionKernel::SharedStorage));
+
+    if (smem_size >= (48 << 10)) {
+      cudaError_t result = cudaFuncSetAttribute(cutlass::Kernel<ImplicitGemmFusionKernel>,
+                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                    smem_size);
+
+      if (result != cudaSuccess) {
+        return Status::kErrorInternal;
+      }
+    }
+    // All steps succeeded
+    return Status::kSuccess;
+  }
+
+  /// Refreshes params_ in place from `args`: the A/B pointers, the scale and bias pointers, the C/D pointers, the output op, and the semaphore (set to `workspace`). Always returns kSuccess.
+  Status update(Arguments const &args, void *workspace = nullptr) {
+
+    // update the params structure from the arguments (no other params_ member is assigned here)
+    params_.ptr_A = args.ref_A.data();
+    params_.ptr_B = args.ref_B.data();
+    params_.ptr_scale = args.ref_A_scale.data();
+    params_.ptr_bias = args.ref_A_bias.data();
+    params_.ptr_C = args.ref_C.data();
+    params_.ptr_D = args.ref_D.data();
+    params_.output_op = args.output_op;
+    params_.semaphore = static_cast<int *>(workspace);
+
+    return Status::kSuccess;
+  }
+
+  /// Runs the kernel using initialized state.
+  Status run(cudaStream_t stream = nullptr) {
+
+    ThreadblockSwizzle threadblock_swizzle;
+
+    dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
+    dim3 block(32 * kWarpCount, 1, 1);
+
+    int smem_size = int(sizeof(typename ImplicitGemmFusionKernel::SharedStorage));
+
+    cutlass::arch::synclog_setup();
+    cutlass::Kernel<ImplicitGemmFusionKernel><<<grid, block, smem_size, stream>>>(params_);
+
+    cudaError_t result = cudaGetLastError();
+
+    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
+  }
+
+  /// Call-operator shorthand for run(stream): launches with the current params_ and returns run()'s Status.
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Calls initialize(args, workspace, stream) and, only if that succeeds, run(stream); returns the first non-success Status.
+  Status operator()(
+    Arguments const &args,
+    void *workspace = nullptr,
+    cudaStream_t stream = nullptr) {
+
+    Status const init_status = initialize(args, workspace, stream);
+
+    if (init_status != Status::kSuccess) {
+      return init_status;
+    }
+
+    return run(stream);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}
+}
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/dispatch_policy.hpp b/lightllm-kernel/cutlass/include/cutlass/conv/dispatch_policy.hpp
new file mode 100755
index 000000000..b8b5eb2bf
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/dispatch_policy.hpp
@@ -0,0 +1,90 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/conv/convolution.h"
+#include "cutlass/epilogue/thread/activation.h"
+#include "cutlass/arch/arch.h"
+
+#include "cute/layout.hpp"
+#include "cute/numeric/integral_constant.hpp"
+
+#include "cutlass/gemm/dispatch_policy.hpp"
+
+//////////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::conv {
+
+//////////////////////////////////////////////////////////////////////////////
+
+//
+// Policies for categorical dispatch of mainloop against kernel grid schedules
+//
+struct KernelImplicitTmaWarpSpecializedSm90 : cutlass::gemm::KernelTmaWarpSpecialized { };  // derives from the GEMM TMA warp-specialized schedule tag
+struct KernelImplicitTmaWarpSpecializedSm90Cooperative { };  // empty tag; rejected by the static_assert in MainloopSm90TmaGmmaWarpSpecializedImplicitGemm
+struct KernelImplicitTmaWarpSpecializedSm90Pingpong { };  // empty tag; rejected by the same static_assert
+
+//
+// Collective Mainloop Policies
+//
+
+// Mainloop policy for fprop: n-buffer in smem (Hopper TMA), pipelined with Hopper GMMA and TMA,
+// with a static schedule between TMA and GMMA. Carries only compile-time constants and type tags.
+template<
+  conv::Operator ConvOp_,
+  int Stages_,
+  int NumSpatialDimensions_,
+  class ClusterShape_ = cute::Shape<cute::C<1>,cute::C<1>,cute::C<1>>,
+  class KernelSchedule_ = KernelImplicitTmaWarpSpecializedSm90,
+  int PipelineAsyncMmaStages_ = 1
+>
+struct MainloopSm90TmaGmmaWarpSpecializedImplicitGemm {
+  using ArchTag = arch::Sm90;
+  using ClusterShape = ClusterShape_;
+  using Schedule = KernelSchedule_;
+  static constexpr Operator ConvOp = ConvOp_;
+  static constexpr int NumSpatialDimensions = NumSpatialDimensions_;
+  static constexpr int Stages = Stages_;
+  static constexpr int PipelineAsyncMmaStages = PipelineAsyncMmaStages_;
+
+  static_assert(NumSpatialDimensions >= 1);
+  static_assert((!cute::is_same_v<Schedule,KernelImplicitTmaWarpSpecializedSm90Cooperative>) &&
+                (!cute::is_same_v<Schedule,KernelImplicitTmaWarpSpecializedSm90Pingpong>),
+    "Persistent schedules not support for conv yet.");
+};
+
+//////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::conv 
+
+//////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/conv_universal.hpp b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/conv_universal.hpp
new file mode 100755
index 000000000..23ccea2f8
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/conv_universal.hpp
@@ -0,0 +1,65 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/conv/convnd_problem_shape.hpp"
+#include "cutlass/detail/dependent_false.hpp"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::conv::kernel {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/*
+ * Stateless universal device CONV kernel type that treats CONV as a composition of a collective
+ * mainloop and a collective epilogue. This primary template holds only the static_assert below.
+**/
+template <
+  class ProblemShape_,
+  class CollectiveMainloop_,
+  class CollectiveEpilogue_,
+  class TileSchedulerTag_ = void,
+  class Enable = void
+>
+class ConvUniversal {
+  static_assert(cutlass::detail::dependent_false<Enable>,  // condition depends on Enable, a template parameter
+      "Could not find a valid specialization at the kernel layer to dispatch against.");
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::conv::kernel
+
+////////////////////////////////////////////////////////////////////////////////
+
+#include "cutlass/conv/kernel/sm90_implicit_gemm_tma_warpspecialized.hpp"
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d.h
new file mode 100755
index 000000000..79bedb2c8
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d.h
@@ -0,0 +1,322 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief
+      Default kernel-level implicit GEMM convolution definitions for threadblock-scoped epilogue.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/threadblock/default_mma.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/conv/threadblock/threadblock_swizzle.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_with_reduction.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/threadblock/conv2d_tile_iterator.h"
+#include "cutlass/conv/threadblock/implicit_gemm_pipelined.h"
+#include "cutlass/conv/threadblock/implicit_gemm_multistage.h"
+#include "cutlass/conv/threadblock/implicit_gemm_fprop_fusion_multistage.h"
+#include "cutlass/conv/threadblock/implicit_gemm_wgrad_fusion_multistage.h"
+#include "cutlass/conv/kernel/implicit_gemm_convolution.h"
+#include "cutlass/conv/kernel/implicit_gemm_convolution_fusion.h"
+#include "cutlass/conv/kernel/implicit_gemm_convolution_strided_dgrad.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+template <
+  typename ArchTag,  // not referenced by this primary template; arch::Sm70 is specialized separately
+  typename Shape,
+  typename WarpMmaTensorOp,
+  int PartitionsK,
+  typename OutputOp
+>
+struct DefaultConvEpilogue {  // generic case: exposes the Epilogue of DefaultEpilogueTensorOp
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
+    Shape,
+    WarpMmaTensorOp,
+    PartitionsK,
+    OutputOp,
+    OutputOp::kCount  // last template argument is taken from the output op's kCount
+  >::Epilogue;
+};
+
+template <
+  typename Shape,
+  typename WarpMmaTensorOp,
+  int PartitionsK,
+  typename OutputOp
+>
+struct DefaultConvEpilogue<
+  arch::Sm70,
+  Shape,
+  WarpMmaTensorOp,
+  PartitionsK,
+  OutputOp
+> {
+  // Specialization for arch::Sm70: uses DefaultEpilogueVoltaTensorOp rather than DefaultEpilogueTensorOp.
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueVoltaTensorOp<
+    Shape,
+    WarpMmaTensorOp,
+    PartitionsK,
+    OutputOp,
+    OutputOp::kCount
+  >::Epilogue;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <
+  typename ArchTag,  // not referenced in the body below
+  typename Shape,
+  typename WarpMmaSimt,
+  typename ElementOutput,
+  typename ElementTensor,
+  typename ElementVector,
+  typename OutputOp,
+  int ElementsPerAccess,
+  typename PermuteDLayout = layout::NoPermute,
+  conv::StrideSupport StrideSupport = conv::StrideSupport::kUnity,
+  int Rank = 4
+>
+struct DefaultConvEpilogueWithBroadcastSimt {  // exposes the Epilogue of DefaultEpilogueWithBroadcastSimt
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueWithBroadcastSimt<
+    Shape,
+    WarpMmaSimt,
+    ElementOutput,
+    ElementTensor,
+    ElementVector,
+    OutputOp,
+    ElementsPerAccess,
+    false,  // NOTE(review): positional flag defined by DefaultEpilogueWithBroadcastSimt; confirm its meaning there
+    PermuteDLayout,
+    StrideSupport,
+    Rank
+  >::Epilogue;
+};
+
+template <
+  typename ArchTag,  // not referenced in the body below
+  typename Shape,
+  typename WarpMmaSimt,
+  typename ElementOutput,
+  typename ElementTensor,
+  typename ElementVector,
+  typename OutputOp,
+  int ElementsPerAccess
+>
+struct DefaultConvEpilogueWithBroadcastSimtStridedDgrad {  // exposes the Epilogue of the same-named epilogue::threadblock trait
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueWithBroadcastSimtStridedDgrad<
+    Shape,
+    WarpMmaSimt,
+    ElementOutput,
+    ElementTensor,
+    ElementVector,
+    OutputOp,
+    ElementsPerAccess
+  >::Epilogue;
+};
+
+template <
+  typename ArchTag,  // not referenced by this primary template; arch::Sm70 is specialized separately
+  typename Shape,
+  typename WarpMmaTensorOp,
+  int PartitionsK,
+  typename ElementOutput,
+  typename ElementTensor,
+  typename ElementVector,
+  typename OutputOp,
+  int ElementsPerAccess
+>
+struct DefaultConvEpilogueWithBroadcastTensorOp {  // generic case: exposes the Epilogue of DefaultEpilogueWithBroadcastTensorOp
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueWithBroadcastTensorOp<
+    Shape,
+    WarpMmaTensorOp,
+    PartitionsK,
+    ElementOutput,
+    ElementTensor,
+    ElementVector,
+    OutputOp,
+    ElementsPerAccess
+  >::Epilogue;
+};
+
+template <
+  typename Shape,
+  typename WarpMmaTensorOp,
+  int PartitionsK,
+  typename ElementOutput,
+  typename ElementTensor,
+  typename ElementVector,
+  typename OutputOp,
+  int ElementsPerAccess
+>
+struct DefaultConvEpilogueWithBroadcastTensorOp<
+  arch::Sm70,  // specialization for arch::Sm70: uses the ...VoltaTensorOp variant below
+  Shape,
+  WarpMmaTensorOp,
+  PartitionsK,
+  ElementOutput,
+  ElementTensor,
+  ElementVector,
+  OutputOp,
+  ElementsPerAccess
+  > {
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueWithBroadcastVoltaTensorOp<
+    Shape,
+    WarpMmaTensorOp,
+    PartitionsK,
+    ElementOutput,
+    ElementTensor,
+    ElementVector,
+    OutputOp,
+    ElementsPerAccess
+  >::Epilogue;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename ArchTag,  // not referenced by this primary template; arch::Sm70 is specialized separately
+  typename Shape,
+  typename WarpMmaTensorOp,
+  int PartitionsK,
+  typename ElementOutput,
+  typename OutputOp,
+  typename ReductionOp,
+  int ElementsPerAccess
+>
+struct DefaultConvEpilogueWithReductionTensorOp {  // generic case: exposes the Epilogue of DefaultEpilogueWithReductionTensorOp
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueWithReductionTensorOp<
+    Shape,
+    WarpMmaTensorOp,
+    PartitionsK,
+    ElementOutput,
+    OutputOp,
+    ReductionOp,
+    ElementsPerAccess
+  >::Epilogue;
+};
+
+template <
+  typename Shape,
+  typename WarpMmaTensorOp,
+  int PartitionsK,
+  typename ElementOutput,
+  typename OutputOp,
+  typename ReductionOp,
+  int ElementsPerAccess
+>
+struct DefaultConvEpilogueWithReductionTensorOp<
+  arch::Sm70,  // specialization for arch::Sm70: uses the ...VoltaTensorOp variant below
+  Shape,
+  WarpMmaTensorOp,
+  PartitionsK,
+  ElementOutput,
+  OutputOp,
+  ReductionOp,
+  ElementsPerAccess
+  > {
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueWithReductionVoltaTensorOp<
+    Shape,
+    WarpMmaTensorOp,
+    PartitionsK,
+    ElementOutput,
+    OutputOp,
+    ReductionOp,
+    ElementsPerAccess
+  >::Epilogue;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Defaults for strided Dgrad: generic case, exposes the Epilogue of DefaultEpilogueTensorOpStridedDgrad.
+template <
+  typename ArchTag,  // not referenced by this primary template; arch::Sm70 is specialized separately
+  typename Shape,
+  typename WarpMmaTensorOp,
+  int PartitionsK,
+  typename OutputOp
+>
+struct DefaultConvEpilogueStridedDgrad {
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOpStridedDgrad<
+    Shape,
+    WarpMmaTensorOp,
+    PartitionsK,
+    OutputOp,
+    OutputOp::kCount  // last template argument is taken from the output op's kCount
+  >::Epilogue;
+};
+
+template <
+  typename Shape,
+  typename WarpMmaTensorOp,
+  int PartitionsK,
+  typename OutputOp
+>
+struct DefaultConvEpilogueStridedDgrad<
+  arch::Sm70,
+  Shape,
+  WarpMmaTensorOp,
+  PartitionsK,
+  OutputOp
+> {
+  // Specialization for arch::Sm70: uses DefaultEpilogueVoltaTensorOpStridedDgrad rather than the generic trait.
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueVoltaTensorOpStridedDgrad<
+    Shape,
+    WarpMmaTensorOp,
+    PartitionsK,
+    OutputOp,
+    OutputOp::kCount
+  >::Epilogue;
+};
+
+} // namespace detail
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_dgrad.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_dgrad.h
new file mode 100755
index 000000000..c5a8b1315
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_dgrad.h
@@ -0,0 +1,1927 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+    Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped 
+      matrix multiply-add with the appropriate threadblock-scoped epilogue.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/conv/kernel/default_conv2d.h"
+
+#include "cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h"
+#include "cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h" 
+#include "cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h"
+#include "cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h"
+#include "cutlass/conv/threadblock/conv2d_tile_iterator.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Defines a kernel for Conv2dDgrad. Declaration only: no body is given for the primary template here.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename OperatorClass,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,
+  conv::StrideSupport StrideSupport = StrideSupport::kStrided,
+  /// Access granularity of A matrix in units of elements (default: 128 bits divided by the element width)
+  int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value,
+  /// Access granularity of B matrix in units of elements (default: 128 bits divided by the element width)
+  int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value
+> struct DefaultConv2dDgrad;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//                               OpClassTensorOp convolutions 
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dDgrad specialization for Analytic IteratorAlgorithm Dgrad Strided and
+/// multistage pipeline.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dDgrad <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic,
+  StrideSupport::kStrided,
+  AlignmentA,
+  AlignmentB
+> {
+  // Define the core components from GEMM. MmaCore is instantiated with RowMajor for the A, B
+  // and accumulator layouts; LayoutA and LayoutB are not referenced in this body.
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand (the dgrad output-gradient access iterator)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA,
+      ThreadMapA,
+      StrideSupport::kStrided,
+      AccessTypeA
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand (the dgrad filter access iterator)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB,
+      ThreadMapB,
+      StrideSupport::kStrided,
+      AccessTypeB
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+  // CacheOpB is Global only when sizeof_bits<ElementB> * AlignmentB is exactly 128; otherwise Always.
+  static cutlass::arch::CacheOperation::Kind const CacheOpB =
+      ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  // Define the Mma (CacheOperation::Always follows the A iterators, CacheOpB follows the B iterators)
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    CacheOpB,
+    MmaPolicy,
+    Stages 
+  >;
+  // Threadblock K extent divided by warp K extent.
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  // Define the epilogue (DefaultEpilogueTensorOpStridedDgrad is named directly; ArchTag is not consulted)
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOpStridedDgrad<
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Define the kernel
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDgrad
+  >;
+};
+
+/// Defines a kernel for Conv2dDgrad specialization for Analytic IteratorAlgorithm Dgrad Strided
+/// and 2 stage pipeline.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dDgrad <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic,
+  StrideSupport::kStrided,
+  AlignmentA,
+  AlignmentB
+> {
+  // Define the core components from GEMM. MmaCore is instantiated with RowMajor for the A, B
+  // and accumulator layouts and a stage count of 2; LayoutA and LayoutB are not referenced here.
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      2, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand (access iterator wrapped in TileIteratorStridedDgrad)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
+  using IteratorA =
+    cutlass::conv::threadblock::TileIteratorStridedDgrad<
+      cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA,
+        ThreadMapA,
+        StrideSupport::kStrided,
+        AccessTypeA 
+      >
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand (access iterator wrapped in TileIteratorStridedDgrad)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
+  using IteratorB =
+    cutlass::conv::threadblock::TileIteratorStridedDgrad<
+      cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB,
+        ThreadMapB,
+        StrideSupport::kStrided,
+        AccessTypeB 
+      >
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Define the Mma (ImplicitGemmPipelined; this is the one place ElementC and LayoutC are used)
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+  // Threadblock K extent divided by warp K extent.
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  // Define the epilogue (chosen through detail::DefaultConvEpilogueStridedDgrad, keyed on ArchTag)
+  using Epilogue = typename detail::DefaultConvEpilogueStridedDgrad<
+    ArchTag,
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    EpilogueOutputOp
+  >::Epilogue;
+
+  // Define the kernel
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDgrad
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dDgrad specialization for Analytic IteratorAlgorithm Dgrad Unity Strided
+/// and multistage pipeline.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dDgrad <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic,
+  StrideSupport::kUnity,
+  AlignmentA,
+  AlignmentB
+> {
+  // Define the core components from GEMM. MmaCore is instantiated with RowMajor for the A, B
+  // and accumulator layouts; LayoutA and LayoutB are not referenced in this body.
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand (output-gradient access iterator, StrideSupport::kUnity)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA,
+      ThreadMapA,
+      StrideSupport::kUnity,
+      AccessTypeA
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand (filter access iterator, StrideSupport::kUnity)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB,
+      ThreadMapB,
+      StrideSupport::kUnity,
+      AccessTypeB
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+  // CacheOpB is Global only when sizeof_bits<ElementB> * AlignmentB is exactly 128; otherwise Always.
+  static cutlass::arch::CacheOperation::Kind const CacheOpB =
+      ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  // Define the Mma (CacheOperation::Always follows the A iterators, CacheOpB follows the B iterators)
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    CacheOpB,
+    MmaPolicy,
+    Stages 
+  >;
+  // Threadblock K extent divided by warp K extent.
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  // Define the epilogue (DefaultEpilogueTensorOp is named directly; ArchTag is not consulted)
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Define the kernel (the non-strided ImplicitGemmConvolution kernel with Operator::kDgrad)
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDgrad
+  >;
+};
+
+/// Defines a kernel for Conv2dDgrad specialization for Analytic IteratorAlgorithm, unity stride,
+/// tensor-op math (arch::OpClassTensorOp), and a 2 stage pipeline.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dDgrad <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic,
+  StrideSupport::kUnity,
+  AlignmentA,
+  AlignmentB
+> {
+
+  // MMA core from GEMM for a 2-stage tensor-op mainloop; all three layout arguments are RowMajor
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      2, MathOperatorTag>;
+
+  // Iterator over tiles of the A operand (output gradient): Analytic access iterator in TileIterator
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
+  using IteratorA =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA,
+        ThreadMapA,
+        StrideSupport::kUnity,
+        AccessTypeA
+      >
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Iterator over tiles of the B operand (filter): Analytic access iterator wrapped in TileIterator
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
+  using IteratorB =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB,
+        ThreadMapB,
+        StrideSupport::kUnity,
+        AccessTypeB
+      >
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components (tensor-op warp MMA and its policy, both taken from MmaCore)
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Define the Mma: ImplicitGemmPipelined mainloop (instantiated without a Stages argument)
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  // Define the epilogue; ArchTag and kPartitionsK (threadblock K / warp K) are forwarded to it
+  using Epilogue = typename detail::DefaultConvEpilogue<
+    ArchTag,
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    EpilogueOutputOp
+  >::Epilogue;
+
+  // Define the kernel: implicit-GEMM convolution tagged with conv::Operator::kDgrad
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDgrad
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dDgrad specialization for Optimized IteratorAlgorithm, unity stride,
+/// tensor-op math (arch::OpClassTensorOp), and a multistage pipeline.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dDgrad <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized,
+  StrideSupport::kUnity,
+  AlignmentA,
+  AlignmentB
+> {
+
+  // MMA core from GEMM for a Stages-deep tensor-op mainloop; all three layout arguments are RowMajor
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, MathOperatorTag>;
+
+  // Iterator over tiles of the A operand (output gradient); AccessTypeA holds AlignmentA elements
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA,
+      ThreadMapA,
+      StrideSupport::kUnity,
+      AccessTypeA
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Iterator over tiles of the B operand (filter); AccessTypeB holds AlignmentB elements
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB,
+      ThreadMapB,
+      StrideSupport::kUnity,
+      AccessTypeB
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components (tensor-op warp MMA and its policy, both taken from MmaCore)
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  static cutlass::arch::CacheOperation::Kind const CacheOpB =
+      ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  // Define the Mma. A uses CacheOperation::Always; B uses CacheOpB (Global iff an access is 128 bits)
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    CacheOpB,
+    MmaPolicy,
+    Stages 
+  >;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  // Define the epilogue; kPartitionsK (threadblock K / warp K) is forwarded to it
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Define the kernel: implicit-GEMM convolution tagged with conv::Operator::kDgrad
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDgrad
+  >;
+};
+
+/// Defines a kernel for Conv2dDgrad specialization for Optimized IteratorAlgorithm, strided dgrad,
+/// tensor-op math (arch::OpClassTensorOp), and a multistage pipeline.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dDgrad <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized,
+  StrideSupport::kStrided,
+  AlignmentA,
+  AlignmentB
+> {
+
+  // MMA core from GEMM for a Stages-deep tensor-op mainloop; all three layout arguments are RowMajor
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, MathOperatorTag>;
+
+  // Iterator over tiles of the A operand (output gradient); AccessTypeA holds AlignmentA elements
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA,
+      ThreadMapA,
+      StrideSupport::kStrided,
+      AccessTypeA
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Iterator over tiles of the B operand (filter); AccessTypeB holds AlignmentB elements
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB,
+      ThreadMapB,
+      StrideSupport::kStrided,
+      AccessTypeB
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components (tensor-op warp MMA and its policy, both taken from MmaCore)
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  static cutlass::arch::CacheOperation::Kind const CacheOpB =
+      ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  // Define the Mma. A uses CacheOperation::Always; B uses CacheOpB (Global iff an access is 128 bits)
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    CacheOpB,
+    MmaPolicy,
+    Stages 
+  >;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  // Define the epilogue (strided-dgrad variant); kPartitionsK (threadblock K / warp K) is forwarded
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOpStridedDgrad<
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Define the kernel: strided-dgrad implicit-GEMM convolution tagged with conv::Operator::kDgrad
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDgrad
+  >;
+};
+
+/// Defines a kernel for Conv2dDgrad specialization for Optimized IteratorAlgorithm, strided dgrad,
+/// tensor-op math (arch::OpClassTensorOp), and a 2 stage pipeline.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dDgrad <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized,
+  StrideSupport::kStrided,
+  AlignmentA,
+  AlignmentB
+> {
+
+  // MMA core from GEMM for a 2-stage tensor-op mainloop; all three layout arguments are RowMajor
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      2, MathOperatorTag>;
+
+  // Iterator over tiles of the A operand (output gradient), wrapped in TileIteratorStridedDgrad
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
+  using IteratorA =
+    cutlass::conv::threadblock::TileIteratorStridedDgrad<
+      cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA,
+        ThreadMapA,
+        StrideSupport::kStrided,
+        AccessTypeA
+      >
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Iterator over tiles of the B operand (filter), wrapped in TileIteratorStridedDgrad
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
+  using IteratorB =
+    cutlass::conv::threadblock::TileIteratorStridedDgrad<
+      cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB,
+        ThreadMapB,
+        StrideSupport::kStrided,
+        AccessTypeB
+      >
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components (tensor-op warp MMA and its policy, both taken from MmaCore)
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Define the Mma: ImplicitGemmPipelined mainloop (instantiated without a Stages argument)
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  // Define the epilogue (strided-dgrad variant); ArchTag and kPartitionsK are forwarded to it
+  using Epilogue = typename detail::DefaultConvEpilogueStridedDgrad<
+    ArchTag,
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    EpilogueOutputOp
+  >::Epilogue;
+
+  // Define the kernel: strided-dgrad implicit-GEMM convolution tagged with conv::Operator::kDgrad
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDgrad
+  >;
+};
+
+/// Defines a kernel for Conv2dDgrad specialization for Optimized IteratorAlgorithm, unity stride,
+/// tensor-op math (arch::OpClassTensorOp), and a 2 stage pipeline.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dDgrad <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized,
+  StrideSupport::kUnity,
+  AlignmentA,
+  AlignmentB
+> {
+
+  // MMA core from GEMM for a 2-stage tensor-op mainloop; all three layout arguments are RowMajor
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      2, MathOperatorTag>;
+
+  // Iterator over tiles of the A operand (output gradient): Optimized access iterator in TileIterator
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
+  using IteratorA =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA,
+        ThreadMapA,
+        StrideSupport::kUnity,
+        AccessTypeA
+      >
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Iterator over tiles of the B operand (filter): Optimized access iterator wrapped in TileIterator
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
+  using IteratorB =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB,
+        ThreadMapB,
+        StrideSupport::kUnity,
+        AccessTypeB
+      >
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components (tensor-op warp MMA and its policy, both taken from MmaCore)
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Define the Mma: ImplicitGemmPipelined mainloop (instantiated without a Stages argument)
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  // Define the epilogue; ArchTag and kPartitionsK (threadblock K / warp K) are forwarded to it
+  using Epilogue = typename detail::DefaultConvEpilogue<
+    ArchTag,
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    EpilogueOutputOp
+  >::Epilogue;
+
+  // Define the kernel: implicit-GEMM convolution tagged with conv::Operator::kDgrad
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDgrad
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//                            OpClassSimt convolutions 
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Defines a kernel for Conv2dDgrad specialization for Analytic IteratorAlgorithm, unity stride,
+/// multi-stage pipeline, and FFMA-based (arch::OpClassSimt) mainloop for SM80
+
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dDgrad <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic,
+  conv::StrideSupport::kUnity,
+  AlignmentA,
+  AlignmentB
+> {
+
+  // MMA core from GEMM for a Stages-deep SIMT mainloop; all three layout arguments are RowMajor
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      Stages, MathOperatorTag>;
+
+  // Iterator over tiles of the A operand (output gradient); no AccessType passed, AlignmentA unused
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA,
+      ThreadMapA,
+      conv::StrideSupport::kUnity
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Iterator over tiles of the B operand (filter); no AccessType passed, AlignmentB unused
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB,
+      ThreadMapB,
+      conv::StrideSupport::kUnity
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components (SIMT warp MMA and its policy, both taken from MmaCore)
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Define the Mma: multistage mainloop with CacheOperation::Always for both A and B
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    arch::CacheOperation::Always,
+    MmaPolicy,
+    Stages 
+  >;
+
+  // Define the epilogue (SIMT variant, built from the SIMT warp MMA and EpilogueOutputOp)
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Define the kernel: implicit-GEMM convolution tagged with conv::Operator::kDgrad
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDgrad
+  >;
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dDgrad <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic,
+  conv::StrideSupport::kStrided,
+  AlignmentA,
+  AlignmentB
+> {
+  // Conv2dDgrad specialization: SIMT math, Analytic iterators, strided dgrad, multistage pipeline.
+  // MMA core from GEMM for a Stages-deep SIMT mainloop; all three layout arguments are RowMajor
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      Stages, MathOperatorTag>;
+
+  // Iterator over tiles of the A operand (output gradient); no AccessType passed, AlignmentA unused
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA,
+      ThreadMapA,
+      conv::StrideSupport::kStrided
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Iterator over tiles of the B operand (filter); no AccessType passed, AlignmentB unused
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB,
+      ThreadMapB,
+      conv::StrideSupport::kStrided
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components (SIMT warp MMA and its policy, both taken from MmaCore)
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Define the Mma: multistage mainloop with CacheOperation::Always for both A and B
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    arch::CacheOperation::Always,
+    MmaPolicy,
+    Stages 
+  >;
+
+  // Define the epilogue (SIMT strided-dgrad variant, built from the SIMT warp MMA)
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimtStridedDgrad<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Define the kernel: strided-dgrad implicit-GEMM convolution tagged with conv::Operator::kDgrad
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDgrad
+  >;
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dDgrad specialization for Optimized IteratorAlgorithm, unity stride,
+/// multi-stage pipeline, and FFMA-based (arch::OpClassSimt) mainloop for SM80
+
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dDgrad <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized,
+  StrideSupport::kUnity,
+  AlignmentA,
+  AlignmentB
+> {
+
+  // MMA core from GEMM for a Stages-deep SIMT mainloop; all three layout arguments are RowMajor
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      Stages, MathOperatorTag>;
+
+  // Iterator over tiles of the A operand (output gradient); no AccessType passed, AlignmentA unused
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA,
+      ThreadMapA,
+      StrideSupport::kUnity
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Iterator over tiles of the B operand (filter); no AccessType passed, AlignmentB unused
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB,
+      ThreadMapB,
+      StrideSupport::kUnity
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components (SIMT warp MMA and its policy, both taken from MmaCore)
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Define the Mma: multistage mainloop with CacheOperation::Always for both A and B
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    arch::CacheOperation::Always,
+    MmaPolicy,
+    Stages 
+  >;
+
+  // Define the epilogue (SIMT variant, built from the SIMT warp MMA and EpilogueOutputOp)
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Define the kernel: implicit-GEMM convolution tagged with conv::Operator::kDgrad
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDgrad
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dDgrad <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized,
+  conv::StrideSupport::kStrided,
+  AlignmentA,
+  AlignmentB
+> {
+  // Conv2dDgrad specialization: SIMT math, Optimized iterators, strided dgrad, multistage pipeline.
+  // MMA core from GEMM for a Stages-deep SIMT mainloop; all three layout arguments are RowMajor
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      Stages, MathOperatorTag>;
+
+  // Iterator over tiles of the A operand (output gradient); no AccessType passed, AlignmentA unused
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA,
+      ThreadMapA,
+      conv::StrideSupport::kStrided
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Iterator over tiles of the B operand (filter); no AccessType passed, AlignmentB unused
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB,
+      ThreadMapB,
+      conv::StrideSupport::kStrided
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components (SIMT warp MMA and its policy, both taken from MmaCore)
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Define the Mma: multistage mainloop with CacheOperation::Always for both A and B
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    arch::CacheOperation::Always,
+    MmaPolicy,
+    Stages 
+  >;
+
+  // Define the epilogue (SIMT strided-dgrad variant, built from the SIMT warp MMA)
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimtStridedDgrad<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Define the kernel: strided-dgrad implicit-GEMM convolution tagged with conv::Operator::kDgrad
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDgrad
+  >;
+
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dDgrad specialization for Analytic IteratorAlgorithm, unity stride,
+/// 2 stage pipeline, and FFMA-based (arch::OpClassSimt) mainloop for SM50
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dDgrad <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic,
+  conv::StrideSupport::kUnity,
+  AlignmentA,
+  AlignmentB
+> {
+
+  // MMA core from GEMM for a 2-stage SIMT mainloop; all three layout arguments are RowMajor
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      2, MathOperatorTag>;
+
+  // Iterator over tiles of the A operand (output gradient) in TileIterator; AlignmentA is unused
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA,
+        ThreadMapA,
+        conv::StrideSupport::kUnity
+      >
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Iterator over tiles of the B operand (filter) in TileIterator; AlignmentB is unused
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB,
+        ThreadMapB,
+        conv::StrideSupport::kUnity
+      >
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components (SIMT warp MMA and its policy, both taken from MmaCore)
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Define the Mma: ImplicitGemmPipelined mainloop (instantiated without a Stages argument)
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+
+  // Define the epilogue (SIMT variant, built from the SIMT warp MMA and EpilogueOutputOp)
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Define the kernel: implicit-GEMM convolution tagged with conv::Operator::kDgrad
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDgrad
+  >;
+
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dDgrad <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic,
+  conv::StrideSupport::kStrided,
+  AlignmentA,
+  AlignmentB
+> {
+  // Conv2dDgrad specialization: OpClassSimt, 2 stages, Analytic iterators, Strided StrideSupport.
+  // Define the core components from GEMM (A, B and accumulator are all given as RowMajor)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      2, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand (output-gradient access iterator, strided)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::TileIteratorStridedDgrad<
+      cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA,
+        ThreadMapA,
+        conv::StrideSupport::kStrided
+      >
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand (filter access iterator, strided)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::TileIteratorStridedDgrad<
+      cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB,
+        ThreadMapB,
+        conv::StrideSupport::kStrided
+      >
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Define the Mma (pipelined implicit GEMM mainloop)
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+
+  // Define the epilogue (SIMT strided-dgrad variant)
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimtStridedDgrad<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Define the kernel (strided-dgrad implicit GEMM convolution, tagged Operator::kDgrad)
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDgrad
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dDgrad specialization for Optimized IteratorAlgorithm, Unity
+/// StrideSupport, 2 stage pipeline, and SIMT (FFMA-based) mainloop; ArchTag is not constrained
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dDgrad <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized,
+  StrideSupport::kUnity,
+  AlignmentA,
+  AlignmentB
+> {
+
+  // Define the core components from GEMM (A, B and accumulator are all given as RowMajor)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      2, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand (output-gradient access iterator, unity stride)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA,
+        ThreadMapA,
+        StrideSupport::kUnity
+      >
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand (filter access iterator, unity stride)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB,
+        ThreadMapB,
+        StrideSupport::kUnity
+      >
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Define the Mma (pipelined implicit GEMM mainloop)
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+
+  // Define the epilogue (default SIMT epilogue)
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Define the kernel (implicit GEMM convolution tagged Operator::kDgrad)
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDgrad
+  >;
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dDgrad <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized,
+  conv::StrideSupport::kStrided,
+  AlignmentA,
+  AlignmentB
+> {
+  // Conv2dDgrad specialization: OpClassSimt, 2 stages, Optimized iterators, Strided StrideSupport.
+  // Define the core components from GEMM (A, B and accumulator are all given as RowMajor)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      2, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand (output-gradient access iterator, strided)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::TileIteratorStridedDgrad<
+      cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA,
+        ThreadMapA,
+        conv::StrideSupport::kStrided
+      >
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand (filter access iterator, strided)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::TileIteratorStridedDgrad<
+      cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB,
+        ThreadMapB,
+        conv::StrideSupport::kStrided
+      >
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Define the Mma (pipelined implicit GEMM mainloop)
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+
+  // Define the epilogue (SIMT strided-dgrad variant)
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimtStridedDgrad<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Define the kernel (strided-dgrad implicit GEMM convolution, tagged Operator::kDgrad)
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDgrad
+  >;
+
+};
+
+} // namespace kernel
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop.h
new file mode 100755
index 000000000..9fbd97e58
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop.h
@@ -0,0 +1,2007 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+    Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped 
+      matrix multiply-add with the appropriate threadblock-scoped epilogue.  
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/conv/kernel/default_conv2d.h"
+
+#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h"
+#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h"
+#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h"
+#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h"
+
+#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h"
+#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h"
+#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_fixed_channels.h"
+#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_few_channels.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Primary template for Conv2dFprop kernels: declared only; the partial specializations define it
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename OperatorClass,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,
+  conv::StrideSupport StrideSupport = StrideSupport::kUnity,
+  /// Access granularity of A matrix in units of elements (default: elements per 128 bits)
+  int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value,
+  /// Access granularity of B matrix in units of elements (default: elements per 128 bits)
+  int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value
+> struct DefaultConv2dFprop;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//                         OpClassTensorOp convolutions 
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm, OpClassTensorOp,
+/// and multistage pipeline (Stages is left as a template parameter).
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport, 
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic,
+  StrideSupport,
+  AlignmentA,
+  AlignmentB
+> {
+
+  // Define the core components from GEMM (A as RowMajor, B as ColumnMajor, accumulator RowMajor)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand (accesses are AlignmentA elements wide)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA, LayoutA,
+      ThreadMapA,
+      AccessTypeA
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand (accesses are AlignmentB elements wide)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB, LayoutB,
+      ThreadMapB,
+      AccessTypeB
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+  // B uses CacheOperation::Global only when one access (AlignmentB elements) is exactly 128 bits
+  static cutlass::arch::CacheOperation::Kind const CacheOpB =
+      ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  // Define the Mma (A always uses CacheOperation::Always; B uses CacheOpB)
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    CacheOpB,
+    MmaPolicy,
+    Stages 
+  >;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  // Define the epilogue (kPartitionsK is forwarded to the TensorOp epilogue)
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Define the kernel (implicit GEMM convolution tagged Operator::kFprop)
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dFprop specialization for FixedChannels IteratorAlgorithm and
+/// multistage pipeline.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kFixedChannels,
+  StrideSupport,
+  AlignmentA,
+  AlignmentB
+> {
+
+  // Define the core components from GEMM (A as RowMajor, B as ColumnMajor, accumulator RowMajor)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand (accesses are AlignmentA elements wide)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorFixedChannels<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA, LayoutA,
+      ThreadMapA,
+      AccessTypeA
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand (accesses are AlignmentB elements wide)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorFixedChannels<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB, LayoutB,
+      ThreadMapB,
+      AccessTypeB
+    >;
+
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+  // B uses CacheOperation::Global only when one access (AlignmentB elements) is exactly 128 bits
+  static cutlass::arch::CacheOperation::Kind const CacheOpB =
+      ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  // Define the Mma (A always uses CacheOperation::Always; B uses CacheOpB)
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    CacheOpB,
+    MmaPolicy,
+    Stages
+  >;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  // Define the epilogue (kPartitionsK is forwarded to the TensorOp epilogue)
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Define the kernel (implicit GEMM convolution tagged Operator::kFprop)
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dFprop specialization for FixedChannels IteratorAlgorithm and two
+/// stage pipeline.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kFixedChannels,
+  StrideSupport,
+  AlignmentA,
+  AlignmentB
+> {
+
+  // Define the core components from GEMM (A as RowMajor, B as ColumnMajor, accumulator RowMajor)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      2, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand (access iterator wrapped in TileIterator)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
+  using IteratorA =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorFixedChannels<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA, LayoutA,
+        ThreadMapA,
+        AccessTypeA
+      >
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand (access iterator wrapped in TileIterator)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
+  using IteratorB =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorFixedChannels<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB, LayoutB,
+        ThreadMapB,
+        AccessTypeB
+      >
+    >;
+
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Define the Mma (pipelined implicit GEMM mainloop)
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  // Define the epilogue (kPartitionsK is forwarded to the TensorOp epilogue)
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Define the kernel (implicit GEMM convolution tagged Operator::kFprop)
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dFprop specialization for FewChannels IteratorAlgorithm and
+/// multistage pipeline.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kFewChannels,
+  StrideSupport,
+  AlignmentA,
+  AlignmentB
+> {
+
+  // Define the core components from GEMM (A as RowMajor, B as ColumnMajor, accumulator RowMajor)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand (accesses are AlignmentA elements wide)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorFewChannels<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA, LayoutA,
+      ThreadMapA,
+      AccessTypeA
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand (accesses are AlignmentB elements wide)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorFewChannels<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB, LayoutB,
+      ThreadMapB,
+      AccessTypeB
+    >;
+
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+  // B uses CacheOperation::Global only when one access (AlignmentB elements) is exactly 128 bits
+  static cutlass::arch::CacheOperation::Kind const CacheOpB =
+      ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  // Define the Mma (A always uses CacheOperation::Always; B uses CacheOpB)
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    CacheOpB,
+    MmaPolicy,
+    Stages
+  >;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  // Define the epilogue (kPartitionsK is forwarded to the TensorOp epilogue)
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Define the kernel (implicit GEMM convolution tagged Operator::kFprop)
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+/// Defines a kernel for Conv2dFprop specialization for FewChannels IteratorAlgorithm and two
+/// stage pipeline.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kFewChannels,
+  StrideSupport,
+  AlignmentA,
+  AlignmentB
+> {
+
+  // Define the core components from GEMM (A as RowMajor, B as ColumnMajor, accumulator RowMajor)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      2, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand (access iterator wrapped in TileIterator)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
+  using IteratorA =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorFewChannels<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA, LayoutA,
+        ThreadMapA,
+        AccessTypeA
+      >
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand (access iterator wrapped in TileIterator)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
+  using IteratorB =
+
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorFewChannels<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB, LayoutB,
+        ThreadMapB,
+        AccessTypeB
+      >
+    >;
+
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+  // NOTE(review): CacheOpB is not passed to ImplicitGemmPipelined below; appears unused here
+  static cutlass::arch::CacheOperation::Kind const CacheOpB =
+      ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  // Define the Mma (pipelined implicit GEMM mainloop; takes no cache-operation arguments)
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  // Define the epilogue (kPartitionsK is forwarded to the TensorOp epilogue)
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Define the kernel (implicit GEMM convolution tagged Operator::kFprop)
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and multistage
+/// pipeline with interleaved layout (TensorNCxHWx activations, TensorCxRSKx filters, OpClassTensorOp).
+template <
+  typename ElementA,
+  typename ElementB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport,
+  int AlignmentA,
+  int AlignmentB,
+  int InterleavedK
+>
+struct DefaultConv2dFprop <
+  ElementA,
+  layout::TensorNCxHWx<InterleavedK>,
+  ElementB,
+  layout::TensorCxRSKx<InterleavedK>,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic,
+  StrideSupport,
+  AlignmentA,
+  AlignmentB
+> {
+
+  // Define the core components from GEMM (A: ColumnMajorInterleaved, B: RowMajorInterleaved)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
+      ElementB, layout::RowMajorInterleaved<InterleavedK>, 
+      ElementAccumulator, LayoutC, arch::OpClassTensorOp,
+      Stages, MathOperatorTag, true>;
+
+  // Define iterators over tiles from the A operand (activations).
+  // Note: the GEMM shared-memory thread map (MmaCore::SmemThreadMapA) is used here
+  // because the conv global-memory layout, once mapped to fprop, is similar to the
+  // crosswise layout used by the interleaved GEMM shared-memory thread map.
+  // The interleaved GEMM global-memory layout is similar to the congruous
+  // layout.
+  using ThreadMapA = typename MmaCore::SmemThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA, layout::TensorNCxHWx<InterleavedK>,
+      ThreadMapA
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand (filters).
+  // Note: the GEMM shared-memory thread map (MmaCore::SmemThreadMapB) is used here
+  // because the conv global-memory layout, once mapped to fprop, is similar to the
+  // crosswise layout used by the interleaved GEMM shared-memory thread map.
+  // The interleaved GEMM global-memory layout is similar to the congruous
+  // layout.
+  using ThreadMapB = typename MmaCore::SmemThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB, layout::TensorCxRSKx<InterleavedK>,
+      ThreadMapB
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components, both taken from MmaCore
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Define the Mma: multistage implicit GEMM mainloop (A: CacheOperation::Always, B: CacheOperation::Global)
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    arch::CacheOperation::Global,
+    MmaPolicy,
+    Stages 
+  >;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  // Define the epilogue (interleaved conv epilogue, parameterized by kPartitionsK and InterleavedK)
+  using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue<
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount,
+    InterleavedK
+  >::Epilogue;
+
+  // Define the kernel: ImplicitGemmConvolution combining Mma and Epilogue for conv::Operator::kFprop
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm
+/// and 2 stage pipeline (Stages fixed to 2, OpClassTensorOp).
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic,
+  StrideSupport,
+  AlignmentA,
+  AlignmentB
+> {
+
+  // Define the core components from GEMM (A: RowMajor, B: ColumnMajor, accumulator: RowMajor, 2 stages)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      2, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand (activations); accesses are AlignmentA elements wide
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
+  using IteratorA =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA, LayoutA,
+        ThreadMapA,
+        AccessTypeA
+      >
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand (filters); accesses are AlignmentB elements wide
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
+  using IteratorB =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB, LayoutB,
+        ThreadMapB,
+        AccessTypeB
+      >
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components, both taken from MmaCore
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Define the Mma: pipelined implicit GEMM mainloop built from the TileIterator-wrapped A/B iterators
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  // Define the epilogue (chosen by detail::DefaultConvEpilogue from ArchTag and the warp-level MMA)
+  using Epilogue = typename detail::DefaultConvEpilogue<
+    ArchTag,
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    EpilogueOutputOp
+  >::Epilogue;
+
+  // Define the kernel: ImplicitGemmConvolution combining Mma and Epilogue for conv::Operator::kFprop
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and 2 stage
+/// pipeline with interleaved layout (TensorNCxHWx activations, TensorCxRSKx filters, OpClassTensorOp).
+template <
+  typename ElementA,
+  typename ElementB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport,
+  int AlignmentA,
+  int AlignmentB,
+  int InterleavedK
+>
+struct DefaultConv2dFprop <
+  ElementA,
+  layout::TensorNCxHWx<InterleavedK>,
+  ElementB,
+  layout::TensorCxRSKx<InterleavedK>,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic,
+  StrideSupport,
+  AlignmentA,
+  AlignmentB
+> {
+
+  // Define the core components from GEMM (A: ColumnMajorInterleaved, B: RowMajorInterleaved, 2 stages)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
+      ElementB, layout::RowMajorInterleaved<InterleavedK>, 
+      ElementAccumulator, LayoutC, arch::OpClassTensorOp,
+      2, MathOperatorTag, true>;
+
+  // Define iterators over tiles from the A operand (activations).
+  // Note: the GEMM shared-memory thread map (MmaCore::SmemThreadMapA) is used here
+  // because the conv global-memory layout, once mapped to fprop, is similar to the
+  // crosswise layout used by the interleaved GEMM shared-memory thread map.
+  // The interleaved GEMM global-memory layout is similar to the congruous
+  // layout.
+  using ThreadMapA = typename MmaCore::SmemThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA, layout::TensorNCxHWx<InterleavedK>,
+        ThreadMapA
+      >
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand (filters).
+  // Note: the GEMM shared-memory thread map (MmaCore::SmemThreadMapB) is used here
+  // because the conv global-memory layout, once mapped to fprop, is similar to the
+  // crosswise layout used by the interleaved GEMM shared-memory thread map.
+  // The interleaved GEMM global-memory layout is similar to the congruous
+  // layout.
+  using ThreadMapB = typename MmaCore::SmemThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB, layout::TensorCxRSKx<InterleavedK>,
+        ThreadMapB
+      >
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components, both taken from MmaCore
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Define the Mma: pipelined implicit GEMM mainloop built from the TileIterator-wrapped A/B iterators
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  // Define the epilogue (interleaved conv epilogue, parameterized by kPartitionsK and InterleavedK)
+  using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue<
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount,
+    InterleavedK
+  >::Epilogue;
+
+  // Define the kernel: ImplicitGemmConvolution combining Mma and Epilogue for conv::Operator::kFprop
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm and
+/// multistage pipeline (OpClassTensorOp).
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized,
+  StrideSupport,
+  AlignmentA,
+  AlignmentB
+> {
+
+  // Define the core components from GEMM (A: RowMajor, B: ColumnMajor, accumulator: RowMajor)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+    ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+    ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+    Stages, MathOperatorTag
+  >;
+
+  // Define iterators over tiles from the A operand (activations); accesses are AlignmentA elements wide
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA,
+      LayoutA,
+      ThreadMapA,
+      AccessTypeA
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand (filters); accesses are AlignmentB elements wide
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB,
+      LayoutB,
+      ThreadMapB,
+      AccessTypeB
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components, both taken from MmaCore
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  static cutlass::arch::CacheOperation::Kind const CacheOpB =
+      ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  // Define the Mma: multistage mainloop; B uses CacheOpB (Global when one B access is 128 bits, else Always)
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    CacheOpB,
+    MmaPolicy,
+    Stages 
+  >;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  // Define the epilogue (tensor-op epilogue; StrideSupport is forwarded from this specialization)
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount,
+    false,
+    layout::NoPermute,
+    StrideSupport,
+    4
+  >::Epilogue;
+
+  // Define the kernel: ImplicitGemmConvolution combining Mma and Epilogue for conv::Operator::kFprop
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm and
+/// multistage pipeline with interleaved layout (TensorNCxHWx activations, TensorCxRSKx filters).
+template <
+  typename ElementA,
+  typename ElementB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport,
+  int AlignmentA,
+  int AlignmentB,
+  int InterleavedK
+>
+struct DefaultConv2dFprop <
+  ElementA,
+  layout::TensorNCxHWx<InterleavedK>,
+  ElementB,
+  layout::TensorCxRSKx<InterleavedK>,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized,
+  StrideSupport,
+  AlignmentA,
+  AlignmentB
+> {
+
+  // Define the core components from GEMM (A: ColumnMajorInterleaved, B: RowMajorInterleaved)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+    ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
+    ElementB, layout::RowMajorInterleaved<InterleavedK>, ElementAccumulator, LayoutC, arch::OpClassTensorOp,
+    Stages, MathOperatorTag, true
+  >;
+
+  // Define iterators over tiles from the A operand (activations), using MmaCore::SmemThreadMapA
+  using ThreadMapA = typename MmaCore::SmemThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA,
+      layout::TensorNCxHWx<InterleavedK>,
+      ThreadMapA
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand (filters), using MmaCore::SmemThreadMapB
+  using ThreadMapB = typename MmaCore::SmemThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB,
+      layout::TensorCxRSKx<InterleavedK>,
+      ThreadMapB
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components, both taken from MmaCore
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Define the Mma: multistage implicit GEMM mainloop (A: CacheOperation::Always, B: CacheOperation::Global)
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    arch::CacheOperation::Global,
+    MmaPolicy,
+    Stages 
+  >;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  // Define the epilogue (interleaved conv epilogue, parameterized by kPartitionsK and InterleavedK)
+  using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue<
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount,
+    InterleavedK
+  >::Epilogue;
+
+  // Define the kernel: ImplicitGemmConvolution combining Mma and Epilogue for conv::Operator::kFprop
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm
+/// and 2 stage pipeline (Stages fixed to 2, OpClassTensorOp).
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized,
+  StrideSupport,
+  AlignmentA,
+  AlignmentB
+> {
+
+  // Define the core components from GEMM (A: RowMajor, B: ColumnMajor, accumulator: RowMajor, 2 stages)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      2, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand (activations); accesses are AlignmentA elements wide
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
+  using IteratorA =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA,
+        LayoutA,
+        ThreadMapA,
+        AccessTypeA 
+      >
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand (filters); accesses are AlignmentB elements wide
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
+  using IteratorB =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB,
+        LayoutB,
+        ThreadMapB,
+        AccessTypeB
+      >
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components, both taken from MmaCore
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Define the Mma: pipelined implicit GEMM mainloop built from the TileIterator-wrapped A/B iterators
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  // Define the epilogue (chosen by detail::DefaultConvEpilogue from ArchTag and the warp-level MMA)
+  using Epilogue = typename detail::DefaultConvEpilogue<
+    ArchTag,
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    EpilogueOutputOp
+  >::Epilogue;
+
+  // Define the kernel: ImplicitGemmConvolution combining Mma and Epilogue for conv::Operator::kFprop
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm and 2 stage
+/// pipeline with interleaved layout (TensorNCxHWx activations, TensorCxRSKx filters, OpClassTensorOp).
+template <
+  typename ElementA,
+  typename ElementB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport,
+  int AlignmentA,
+  int AlignmentB,
+  int InterleavedK
+>
+struct DefaultConv2dFprop <
+  ElementA,
+  layout::TensorNCxHWx<InterleavedK>,
+  ElementB,
+  layout::TensorCxRSKx<InterleavedK>,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized,
+  StrideSupport,
+  AlignmentA,
+  AlignmentB
+> {
+
+  // Define the core components from GEMM (A: ColumnMajorInterleaved, B: RowMajorInterleaved, 2 stages)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
+      ElementB, layout::RowMajorInterleaved<InterleavedK>, 
+      ElementAccumulator, LayoutC, arch::OpClassTensorOp,
+      2, MathOperatorTag, true>;
+
+  // Define iterators over tiles from the A operand (activations), using MmaCore::SmemThreadMapA
+  using ThreadMapA = typename MmaCore::SmemThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA, layout::TensorNCxHWx<InterleavedK>,
+        ThreadMapA
+      >
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand (filters), using MmaCore::SmemThreadMapB
+  using ThreadMapB = typename MmaCore::SmemThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB, layout::TensorCxRSKx<InterleavedK>,
+        ThreadMapB
+      >
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components, both taken from MmaCore
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Define the Mma: pipelined implicit GEMM mainloop built from the TileIterator-wrapped A/B iterators
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  // Define the epilogue (interleaved conv epilogue, parameterized by kPartitionsK and InterleavedK)
+  using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue<
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount,
+    InterleavedK
+  >::Epilogue;
+
+  // Define the kernel: ImplicitGemmConvolution combining Mma and Epilogue for conv::Operator::kFprop
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//                            OpClassSimt convolutions
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm,
+/// multi-stage pipeline, and FFMA-based mainloop for SM80 (OpClassSimt).
+
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic,
+  StrideSupport,
+  AlignmentA,
+  AlignmentB
+> {
+
+  // Define the core components from GEMM (A: RowMajor, B: ColumnMajor, accumulator: RowMajor, SIMT)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      Stages, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand (activations); no explicit access type is passed
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA, LayoutA,
+      ThreadMapA
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand (filters); no explicit access type is passed
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB, LayoutB,
+      ThreadMapB
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components (SIMT warp MMA), both taken from MmaCore
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Define the Mma: multistage implicit GEMM mainloop (both A and B use CacheOperation::Always)
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    arch::CacheOperation::Always,
+    MmaPolicy,
+    Stages 
+  >;
+
+  // Define the epilogue (SIMT epilogue; StrideSupport is forwarded from this specialization)
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount,
+    false,
+    layout::NoPermute,
+    StrideSupport,
+    4
+  >::Epilogue;
+
+  // Define the kernel: ImplicitGemmConvolution combining Mma and Epilogue for conv::Operator::kFprop
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm,
+/// multi-stage pipeline, and FFMA-based mainloop for SM80 (OpClassSimt).
+
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized,
+  StrideSupport,
+  AlignmentA,
+  AlignmentB
+> {
+
+  // Define the core components from GEMM (A: RowMajor, B: ColumnMajor, accumulator: RowMajor, SIMT)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      Stages, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand (activations); no explicit access type is passed
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA,
+      LayoutA,
+      ThreadMapA
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand (filters); no explicit access type is passed
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB,
+      LayoutB,
+      ThreadMapB
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components (SIMT warp MMA), both taken from MmaCore
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Define the Mma: multistage implicit GEMM mainloop (both A and B use CacheOperation::Always)
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    arch::CacheOperation::Always,
+    MmaPolicy,
+    Stages 
+  >;
+
+  // Define the epilogue (SIMT epilogue; StrideSupport is forwarded from this specialization)
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount,
+    false,
+    layout::NoPermute,
+    StrideSupport,
+    4
+  >::Epilogue;
+
+  // Define the kernel: ImplicitGemmConvolution combining Mma and Epilogue for conv::Operator::kFprop
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm,
+/// 2 stage pipeline, and FFMA-based mainloop for SM50 (Stages fixed to 2, OpClassSimt).
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic,
+  StrideSupport,
+  AlignmentA,
+  AlignmentB
+> {
+
+  // Define the core components from GEMM (A: RowMajor, B: ColumnMajor, accumulator: RowMajor, SIMT, 2 stages)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      2, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand (activations); no explicit access type is passed
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA, LayoutA,
+        ThreadMapA
+      >
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand (filters); no explicit access type is passed
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB, LayoutB,
+        ThreadMapB
+      >
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components (SIMT warp MMA), both taken from MmaCore
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Define the Mma: pipelined implicit GEMM mainloop built from the TileIterator-wrapped A/B iterators
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+
+  // Define the epilogue (SIMT epilogue; StrideSupport is forwarded from this specialization)
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount,
+    false,
+    layout::NoPermute,
+    StrideSupport,
+    4
+  >::Epilogue;
+
+  // Define the kernel: ImplicitGemmConvolution combining Mma and Epilogue for conv::Operator::kFprop
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm, 
+/// 2 stage pipeline, and FFMA-based mainloop for SM50
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized,
+  StrideSupport,
+  AlignmentA,
+  AlignmentB
+> {
+  // Note: AlignmentA / AlignmentB are accepted for matching but are not referenced in this body.
+  // Core GEMM components: SIMT math, 2 stages; A row-major, B column-major, accumulator row-major
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      2, MathOperatorTag>;
+
+  // Activation (A) operand: Optimized tile-access iterator over a kM x kK tile, wrapped in TileIterator
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA,
+        LayoutA,
+        ThreadMapA
+      >
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Filter (B) operand: Optimized tile-access iterator over a kK x kN tile, wrapped in TileIterator
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB,
+        LayoutB,
+        ThreadMapB
+      >
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level SIMT MMA operator and MMA policy, both supplied by MmaCore
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Threadblock-scoped mainloop: pipelined implicit GEMM over the A/B iterators defined above
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+
+  // Define the epilogue (SIMT variant, configured from EpilogueOutputOp and StrideSupport)
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount,
+    false,
+    layout::NoPermute,
+    StrideSupport,
+    4
+  >::Epilogue;
+
+  // Final kernel: implicit GEMM convolution for forward propagation (conv::Operator::kFprop)
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_fusion.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_fusion.h
new file mode 100755
index 000000000..8589ace02
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_fusion.h
@@ -0,0 +1,357 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief
+   Default kernel-level fused activation's scale+bias+relu and implicit GEMM convolution
+   definitions that combine threadblock-scoped matrix multiply-add with the
+   appropriate threadblock-scoped epilogue.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/conv/kernel/default_conv2d.h"
+
+#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h"
+#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h"
+#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h"
+#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h"
+#include "cutlass/conv/threadblock/predicated_scale_bias_vector_access_iterator.h"
+#include "cutlass/transform/threadblock/regular_scale_bias_vector_access_iterator.h"
+#include "cutlass/gemm/warp/scale_bias_tile_iterator.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Primary template (declaration only) for the fused batch norm + Conv2dFprop kernel; see the specializations below
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementScaleBias,
+  typename LayoutScaleBias,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename OperatorClass,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,
+  conv::StrideSupport StrideSupport = StrideSupport::kUnity
+> struct DefaultConv2dFpropFusion;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//                         OpClassTensorOp convolutions 
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and multistage 
+/// pipeline.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementScaleBias,
+  typename LayoutScaleBias,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag
+>
+struct DefaultConv2dFpropFusion <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementScaleBias,
+  LayoutScaleBias,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic
+> {
+
+  // Core GEMM components: TensorOp math with `Stages` stages; A row-major, B column-major
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, MathOperatorTag>;
+
+  // Activation (A) operand: Analytic tile-access iterator over a kM x kK threadblock tile
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA, LayoutA,
+      ThreadMapA
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Filter (B) operand: Analytic tile-access iterator over a kK x kN threadblock tile
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB, LayoutB,
+      ThreadMapB
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  /// Predicated iterator over the scale/bias vectors, shaped 1 x ThreadblockShape::kK
+  using IteratorScaleBias =
+      cutlass::conv::threadblock::PredicatedScaleBiasVectorAccessIterator<
+          cutlass::MatrixShape<1, ThreadblockShape::kK>, ElementScaleBias,
+          LayoutScaleBias>;
+  /// Counterpart iterator with the same 1 x kK shape (shared-memory side, judging by the alias name)
+  using SmemIteratorScaleBias =
+      cutlass::transform::threadblock::RegularScaleBiasVectorAccessIterator<
+          cutlass::MatrixShape<1, ThreadblockShape::kK>, ElementScaleBias,
+          LayoutScaleBias>;
+
+  // Warp-level tensor-op MMA operator and MMA policy, both supplied by MmaCore
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+  // Thread count handed to WarpIteratorScaleBias below (presumably the warp size -- confirm)
+  static int const kThreadCount = 32;
+
+  // Warp-level scale/bias iterator, sized by the WarpShape and InstructionShape kM x kK extents
+  using WarpIteratorScaleBias = cutlass::gemm::warp::ScaleBiasTileIterator<
+      MatrixShape<WarpShape::kM, WarpShape::kK>, ElementScaleBias,
+      LayoutScaleBias, MatrixShape<InstructionShape::kM, InstructionShape::kK>,
+      typename WarpMmaTensorOp::IteratorA::Base::Policy, kThreadCount,
+      MmaCore::WarpCount::kK>;
+
+  // Threadblock-scoped mainloop: multistage fprop fusion over A, B and the scale/bias iterators
+  using Mma = threadblock::ImplicitGemmFpropFusionMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    arch::CacheOperation::Global,
+    IteratorScaleBias,
+    SmemIteratorScaleBias,
+    arch::CacheOperation::Always,
+    MmaPolicy,
+    WarpIteratorScaleBias,
+    Stages 
+  >;
+
+  // Define the epilogue (TensorOp variant, configured from EpilogueOutputOp and its kCount)
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    1,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Final kernel: fused implicit GEMM convolution for forward propagation (conv::Operator::kFprop)
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionFusion<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm and
+/// multistage pipeline.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementScaleBias,
+  typename LayoutScaleBias,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag
+>
+struct DefaultConv2dFpropFusion <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementScaleBias,
+  LayoutScaleBias,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized
+> {
+
+  // Core GEMM components: TensorOp math with `Stages` stages; A row-major, B column-major
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+    ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+    ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+    Stages, MathOperatorTag
+  >;
+
+  // Activation (A) operand: Optimized tile-access iterator over a kM x kK threadblock tile
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA,
+      LayoutA,
+      ThreadMapA
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Filter (B) operand: Optimized tile-access iterator over a kK x kN threadblock tile
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB,
+      LayoutB,
+      ThreadMapB
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  /// Predicated iterator over the scale/bias vectors, shaped 1 x ThreadblockShape::kK
+  using IteratorScaleBias =
+      cutlass::conv::threadblock::PredicatedScaleBiasVectorAccessIterator<
+          cutlass::MatrixShape<1, ThreadblockShape::kK>, ElementScaleBias,
+          LayoutScaleBias>;
+  /// Counterpart iterator with the same 1 x kK shape (shared-memory side, judging by the alias name)
+  using SmemIteratorScaleBias =
+      cutlass::transform::threadblock::RegularScaleBiasVectorAccessIterator<
+          cutlass::MatrixShape<1, ThreadblockShape::kK>, ElementScaleBias,
+          LayoutScaleBias>;
+
+  // Warp-level tensor-op MMA operator and MMA policy, both supplied by MmaCore
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+  // Thread count handed to WarpIteratorScaleBias below (presumably the warp size -- confirm)
+  static int const kThreadCount = 32;
+
+  // Warp-level scale/bias iterator, sized by the WarpShape and InstructionShape kM x kK extents
+  using WarpIteratorScaleBias = cutlass::gemm::warp::ScaleBiasTileIterator<
+      MatrixShape<WarpShape::kM, WarpShape::kK>, ElementScaleBias,
+      LayoutScaleBias, MatrixShape<InstructionShape::kM, InstructionShape::kK>,
+      typename WarpMmaTensorOp::IteratorA::Base::Policy, kThreadCount,
+      MmaCore::WarpCount::kK>;
+
+  // Threadblock-scoped mainloop: multistage fprop fusion over A, B and the scale/bias iterators
+  using Mma = threadblock::ImplicitGemmFpropFusionMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    arch::CacheOperation::Global,
+    IteratorScaleBias,
+    SmemIteratorScaleBias,
+    arch::CacheOperation::Always,
+    MmaPolicy,
+    WarpIteratorScaleBias,
+    Stages 
+  >;
+
+  // Define the epilogue (TensorOp variant, configured from EpilogueOutputOp and its kCount)
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    1,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Final kernel: fused implicit GEMM convolution for forward propagation (conv::Operator::kFprop)
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionFusion<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_with_absmax.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_with_absmax.h
new file mode 100755
index 000000000..76bc12886
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_with_absmax.h
@@ -0,0 +1,127 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+  \brief Defines a default configuration for convolution with absolute maximum calculation.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/conv/kernel/default_conv2d_fprop.h"
+#include "cutlass/conv/kernel/implicit_gemm_convolution_with_absmax.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_with_absmax.h"
+#include "cutlass/epilogue/threadblock/epilogue_with_absmax.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename OperatorClass,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,
+  conv::StrideSupport StrideSupport = StrideSupport::kUnity,
+  /// Access granularity of A matrix in units of elements (default: number of ElementA in 128 bits)
+  int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value,
+  /// Access granularity of B matrix in units of elements (default: number of ElementB in 128 bits)
+  int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value
+>
+struct DefaultConv2dFpropWithAbsMax {
+  // Base Conv2dFprop kernel type; its Mma and Epilogue members are reused below
+  using ImplicitGemmBase = typename DefaultConv2dFprop<
+    ElementA, LayoutA,
+    ElementB, LayoutB,
+    ElementC, LayoutC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    MathOperatorTag,
+    IteratorAlgorithm,
+    StrideSupport,
+    AlignmentA,
+    AlignmentB
+  >::Kernel;
+
+  // Abs-max epilogue: shape, warp operator, partitions and access width come from the base epilogue
+  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWithAbsMax<
+    typename ImplicitGemmBase::Epilogue::Shape,
+    typename ImplicitGemmBase::Epilogue::WarpMmaOperator,
+    ImplicitGemmBase::Epilogue::kPartitionsK,
+    ElementC,
+    typename EpilogueOutputOp::ElementAuxOutput,
+    ElementC,
+    EpilogueOutputOp,
+    ImplicitGemmBase::Epilogue::kElementsPerAccess
+  >::Epilogue;
+
+  // Final kernel: pairs the base Mma with the abs-max epilogue for forward propagation (kFprop)
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithAbsMax<
+    typename ImplicitGemmBase::Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace conv
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_with_broadcast.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_with_broadcast.h
new file mode 100755
index 000000000..0825789ce
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_with_broadcast.h
@@ -0,0 +1,221 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+  \brief 
+    Defines a Conv2dFprop kernel with a broadcast epilogue based on an existing DefaultConv2dFprop kernel.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/conv/kernel/default_conv2d_fprop.h"
+#include "cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h"
+#include "cutlass/epilogue/threadblock/epilogue_with_broadcast.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename OperatorClass,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,
+  conv::StrideSupport StrideSupport = StrideSupport::kUnity,
+  /// Access granularity of A matrix in units of elements (default: number of ElementA in 128 bits)
+  int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value,
+  /// Access granularity of B matrix in units of elements (default: number of ElementB in 128 bits)
+  int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value
+>
+struct DefaultConv2dFpropWithBroadcast {
+  // Base Conv2dFprop kernel type; its Mma and Epilogue members are reused below
+  using ImplicitGemmBase = typename DefaultConv2dFprop<
+    ElementA, LayoutA,
+    ElementB, LayoutB,
+    ElementC, LayoutC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    MathOperatorTag,
+    IteratorAlgorithm,
+    StrideSupport,
+    AlignmentA,
+    AlignmentB
+  >::Kernel;
+
+  // TensorOp broadcast epilogue: shape, warp operator, partitions and access width come from the base epilogue
+  using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastTensorOp<
+    ArchTag,
+    typename ImplicitGemmBase::Epilogue::Shape,
+    typename ImplicitGemmBase::Epilogue::WarpMmaOperator,
+    ImplicitGemmBase::Epilogue::kPartitionsK,
+    ElementC,
+    typename EpilogueOutputOp::ElementT,
+    typename EpilogueOutputOp::ElementVector,
+    EpilogueOutputOp,
+    ImplicitGemmBase::Epilogue::kElementsPerAccess
+  >::Epilogue;
+
+  // Final kernel: pairs the base Mma with the broadcast epilogue for forward propagation (kFprop)
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue<
+    typename ImplicitGemmBase::Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//                            OpClassSimt convolutions
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Defines a kernel for Conv2dFprop with broadcast, specialized for OpClassSimt; the iterator
+/// algorithm, stage count, and architecture tag all remain template parameters
+
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::IteratorAlgorithm IteratorAlgorithm,
+  conv::StrideSupport StrideSupport,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dFpropWithBroadcast <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm,
+  StrideSupport,
+  AlignmentA,
+  AlignmentB
+> {
+  // Base SIMT Conv2dFprop kernel type; its Mma and Epilogue members are reused below
+  using ImplicitGemmBase = typename DefaultConv2dFprop<
+    ElementA, LayoutA,
+    ElementB, LayoutB,
+    ElementC, LayoutC,
+    ElementAccumulator,
+    arch::OpClassSimt,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    MathOperatorTag,
+    IteratorAlgorithm,
+    StrideSupport,
+    AlignmentA,
+    AlignmentB
+  >::Kernel;
+
+  // SIMT broadcast epilogue; unlike the generic template above, no kPartitionsK argument is passed
+  using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastSimt<
+    ArchTag,
+    typename ImplicitGemmBase::Epilogue::Shape,
+    typename ImplicitGemmBase::Epilogue::WarpMmaOperator,
+    ElementC,
+    typename EpilogueOutputOp::ElementT,
+    typename EpilogueOutputOp::ElementVector,
+    EpilogueOutputOp,
+    ImplicitGemmBase::Epilogue::kElementsPerAccess
+  >::Epilogue;
+
+  // Final kernel: pairs the base Mma with the broadcast epilogue for forward propagation (kFprop)
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue<
+    typename ImplicitGemmBase::Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace conv
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_with_reduction.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_with_reduction.h
new file mode 100755
index 000000000..e6e8a8220
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_with_reduction.h
@@ -0,0 +1,130 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+  \brief 
+    Defines a Conv2d Fprop kernel with Reduction based on an existing DefaultConv2dFprop kernel.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/conv/kernel/default_conv2d_fprop.h"
+#include "cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_with_reduction.h"
+#include "cutlass/epilogue/threadblock/epilogue_with_reduction.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename OperatorClass,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename EpilogueReductionOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,
+  conv::StrideSupport StrideSupport = StrideSupport::kUnity,
+  /// Access granularity of A matrix in units of elements (defaults to 128 bits' worth of ElementA)
+  int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value,
+  /// Access granularity of B matrix in units of elements (defaults to 128 bits' worth of ElementB)
+  int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value
+>
+struct DefaultConv2dFpropWithReduction {
+  // Composes a Conv2d fprop kernel: the Mma of DefaultConv2dFprop plus an epilogue that also takes EpilogueReductionOp.
+  using ImplicitGemmBase = typename DefaultConv2dFprop<
+    ElementA, LayoutA,
+    ElementB, LayoutB,
+    ElementC, LayoutC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    MathOperatorTag,
+    IteratorAlgorithm,
+    StrideSupport,
+    AlignmentA,
+    AlignmentB
+  >::Kernel;
+  // NOTE(review): the *TensorOp reduction epilogue below is used for every OperatorClass - confirm this is intended.
+  // Reduction epilogue, configured from the base kernel's epilogue (Shape, WarpMmaOperator, kPartitionsK, kElementsPerAccess)
+  using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithReductionTensorOp<
+    ArchTag,
+    typename ImplicitGemmBase::Epilogue::Shape,
+    typename ImplicitGemmBase::Epilogue::WarpMmaOperator,
+    ImplicitGemmBase::Epilogue::kPartitionsK,
+    ElementC,
+    EpilogueOutputOp,
+    EpilogueReductionOp,
+    ImplicitGemmBase::Epilogue::kElementsPerAccess
+  >::Epilogue;
+
+  // Final kernel: the base kernel's Mma combined with the reduction epilogue, for the kFprop operator
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue<
+    typename ImplicitGemmBase::Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace conv
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_group_fprop.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_group_fprop.h
new file mode 100755
index 000000000..e2deaf6fe
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_group_fprop.h
@@ -0,0 +1,622 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+    Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped 
+      matrix multiply-add with the appropriate threadblock-scoped epilogue.  
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/conv/kernel/default_conv2d.h"
+
+#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h"
+#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h"
+#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h"
+#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h"
+
+#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h"
+#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h"
+#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_fixed_channels.h"
+#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_few_channels.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Primary template for Conv2dGroupFprop kernels; declared only, the definitions come from partial specializations
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename OperatorClass,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::GroupMode GroupMode,
+  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,
+  conv::StrideSupport StrideSupport = StrideSupport::kUnity,
+  /// Access granularity of A matrix in units of elements (defaults to 128 bits' worth of ElementA)
+  int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value,
+  /// Access granularity of B matrix in units of elements (defaults to 128 bits' worth of ElementB)
+  int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value
+> struct DefaultConv2dGroupFprop;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//                         OpClassTensorOp convolutions 
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization of DefaultConv2dGroupFprop for OpClassTensorOp with the Analytic IteratorAlgorithm
+/// and a multistage pipeline (Stages is a parameter); GroupMode is left open, so any GroupMode is accepted.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::GroupMode GroupMode,
+  conv::StrideSupport StrideSupport, 
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dGroupFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  GroupMode,
+  IteratorAlgorithm::kAnalytic,
+  StrideSupport,
+  AlignmentA,
+  AlignmentB
+> {
+  // All three tensor layouts must be TensorNHWC; any other layout is rejected at compile time.
+  static_assert(platform::is_same<LayoutA, cutlass::layout::TensorNHWC>::value,
+    "Current group conv only support NHWC layout");
+  static_assert(platform::is_same<LayoutB, cutlass::layout::TensorNHWC>::value,
+    "Current group conv only support NHWC layout");
+  static_assert(platform::is_same<LayoutC, cutlass::layout::TensorNHWC>::value,
+    "Current group conv only support NHWC layout");
+
+  // Core GEMM components (thread maps, shared-memory iterators, warp-level MMA, policy) come from DefaultMmaCore
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, MathOperatorTag>;
+
+  // Analytic access iterator over activation (A) tiles of shape kM x kK, instantiated with GroupMode
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA, LayoutA,
+      ThreadMapA,
+      AccessTypeA,
+      GroupMode
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Analytic access iterator over filter (B) tiles of shape kK x kN, instantiated with GroupMode
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB, LayoutB,
+      ThreadMapB,
+      AccessTypeB,
+      GroupMode
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level MMA operator and policy, taken from the core
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+  // Cache operation for B: Global only when one access is exactly 128 bits wide, otherwise Always.
+  static cutlass::arch::CacheOperation::Kind const CacheOpB =
+      ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  // Threadblock-scoped multistage implicit-GEMM Mma; operand A always uses CacheOperation::Always
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    CacheOpB,
+    MmaPolicy,
+    Stages 
+  >;
+  // Ratio of the threadblock tile extent to the warp tile extent along K.
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  // Epilogue taken from DefaultEpilogueTensorOp, with EpilogueOutputOp::kCount as its last argument
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Final kernel: kFprop implicit-GEMM convolution over Conv2dProblemSize with the requested GroupMode
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop,
+    Conv2dProblemSize,
+    GroupMode
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization of DefaultConv2dGroupFprop for OpClassTensorOp with the Analytic IteratorAlgorithm
+/// and a 2 stage pipeline (Stages fixed to 2); GroupMode is left open, so any GroupMode is accepted.
+
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag,
+  conv::GroupMode GroupMode,
+  conv::StrideSupport StrideSupport,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dGroupFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  GroupMode,
+  IteratorAlgorithm::kAnalytic,
+  StrideSupport,
+  AlignmentA,
+  AlignmentB
+> {
+  // All three tensor layouts must be TensorNHWC; any other layout is rejected at compile time.
+  static_assert(platform::is_same<LayoutA, cutlass::layout::TensorNHWC>::value,
+    "Current group conv only support NHWC layout");
+  static_assert(platform::is_same<LayoutB, cutlass::layout::TensorNHWC>::value,
+    "Current group conv only support NHWC layout");
+  static_assert(platform::is_same<LayoutC, cutlass::layout::TensorNHWC>::value,
+    "Current group conv only support NHWC layout");
+
+  // Core GEMM components come from DefaultMmaCore, instantiated with a stage count of 2
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      2, MathOperatorTag>;
+
+  // Analytic access iterator over activation (A) tiles (kM x kK, with GroupMode), wrapped in TileIterator
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
+  using IteratorA =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA, LayoutA,
+        ThreadMapA,
+        AccessTypeA,
+        GroupMode
+      >
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Analytic access iterator over filter (B) tiles (kK x kN, with GroupMode), wrapped in TileIterator
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
+  using IteratorB =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB, LayoutB,
+        ThreadMapB,
+        AccessTypeB,
+        GroupMode
+      >
+    >;
+
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level MMA operator and policy, taken from the core
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Threadblock-scoped pipelined implicit-GEMM Mma; it additionally takes ElementC and LayoutC
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+  // Ratio of the threadblock tile extent to the warp tile extent along K.
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  // Epilogue chosen through detail::DefaultConvEpilogue, which is also given ArchTag
+  using Epilogue = typename detail::DefaultConvEpilogue<
+    ArchTag,
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    EpilogueOutputOp
+  >::Epilogue;
+
+  // Final kernel: kFprop implicit-GEMM convolution over Conv2dProblemSize with the requested GroupMode
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop,
+    Conv2dProblemSize,
+    GroupMode
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization of DefaultConv2dGroupFprop for OpClassTensorOp with the Optimized IteratorAlgorithm
+/// and a multistage pipeline (Stages is a parameter); it matches GroupMode::kSingleGroup only.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dGroupFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  GroupMode::kSingleGroup,
+  IteratorAlgorithm::kOptimized,
+  StrideSupport,
+  AlignmentA,
+  AlignmentB
+> {
+  // All three tensor layouts must be TensorNHWC; any other layout is rejected at compile time.
+  static_assert(platform::is_same<LayoutA, cutlass::layout::TensorNHWC>::value,
+    "Current group conv only support NHWC layout");
+  static_assert(platform::is_same<LayoutB, cutlass::layout::TensorNHWC>::value,
+    "Current group conv only support NHWC layout");
+  static_assert(platform::is_same<LayoutC, cutlass::layout::TensorNHWC>::value,
+    "Current group conv only support NHWC layout");
+
+  // Core GEMM components (thread maps, shared-memory iterators, warp-level MMA, policy) come from DefaultMmaCore
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, MathOperatorTag>;
+
+  // Optimized access iterator over activation (A) tiles of shape kM x kK; no GroupMode argument is passed
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA, LayoutA,
+      ThreadMapA,
+      AccessTypeA
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Optimized access iterator over filter (B) tiles of shape kK x kN; no GroupMode argument is passed
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB, LayoutB,
+      ThreadMapB,
+      AccessTypeB
+    >;
+
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level MMA operator and policy, taken from the core
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+  // Cache operation for B: Global only when one access is exactly 128 bits wide, otherwise Always.
+  static cutlass::arch::CacheOperation::Kind const CacheOpB =
+      ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  // Threadblock-scoped multistage implicit-GEMM Mma; operand A always uses CacheOperation::Always
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    CacheOpB,
+    MmaPolicy,
+    Stages
+  >;
+  // Ratio of the threadblock tile extent to the warp tile extent along K.
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  // Epilogue taken from DefaultEpilogueTensorOp, with EpilogueOutputOp::kCount as its last argument
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Final kernel: kFprop implicit-GEMM convolution over Conv2dProblemSize, fixed to GroupMode::kSingleGroup
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop,
+    Conv2dProblemSize,
+    GroupMode::kSingleGroup
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization of DefaultConv2dGroupFprop for OpClassTensorOp with the Optimized IteratorAlgorithm
+/// and a 2 stage pipeline (Stages fixed to 2); it matches GroupMode::kSingleGroup only.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dGroupFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  GroupMode::kSingleGroup,
+  IteratorAlgorithm::kOptimized,
+  StrideSupport,
+  AlignmentA,
+  AlignmentB
+> {
+  // All three tensor layouts must be TensorNHWC; any other layout is rejected at compile time.
+  static_assert(platform::is_same<LayoutA, cutlass::layout::TensorNHWC>::value,
+    "Current group conv only support NHWC layout");
+  static_assert(platform::is_same<LayoutB, cutlass::layout::TensorNHWC>::value,
+    "Current group conv only support NHWC layout");
+  static_assert(platform::is_same<LayoutC, cutlass::layout::TensorNHWC>::value,
+    "Current group conv only support NHWC layout");
+
+  // Core GEMM components come from DefaultMmaCore, instantiated with a stage count of 2
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+    ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+    ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+    2, MathOperatorTag>;
+
+  // Optimized access iterator over activation (A) tiles (kM x kK, no GroupMode argument), wrapped in TileIterator
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
+  using IteratorA =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA,
+        LayoutA,
+        ThreadMapA,
+        AccessTypeA
+      >
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Optimized access iterator over filter (B) tiles (kK x kN, no GroupMode argument), wrapped in TileIterator
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
+  using IteratorB =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB,
+        LayoutB,
+        ThreadMapB,
+        AccessTypeB
+      >
+    >;
+
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level MMA operator and policy, taken from the core
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Threadblock-scoped pipelined implicit-GEMM Mma; it additionally takes ElementC and LayoutC
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+  // Ratio of the threadblock tile extent to the warp tile extent along K.
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  // Epilogue chosen through detail::DefaultConvEpilogue, which is also given ArchTag
+  using Epilogue = typename detail::DefaultConvEpilogue<
+    ArchTag,
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    EpilogueOutputOp
+  >::Epilogue;
+
+  // Final kernel: kFprop implicit-GEMM convolution over Conv2dProblemSize, fixed to GroupMode::kSingleGroup
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop,
+    Conv2dProblemSize,
+    GroupMode::kSingleGroup
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_wgrad.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_wgrad.h
new file mode 100755
index 000000000..d0e52dfe3
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_wgrad.h
@@ -0,0 +1,1011 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+    Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped 
+      matrix multiply-add with the appropriate threadblock-scoped epilogue.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/conv/kernel/default_conv2d.h"
+
+#include "cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h"
+#include "cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h"
+#include "cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h"
+#include "cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h"
+#include "cutlass/conv/threadblock/conv2d_tile_iterator.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Primary template for the default Conv2dWgrad kernel (declaration only; see the partial specializations below)
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename OperatorClass,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,
+  conv::StrideSupport StrideSupport = StrideSupport::kStrided,
+  /// Access granularity of A matrix in units of elements (default: as many ElementA as fit in 128 bits)
+  int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value,
+  /// Access granularity of B matrix in units of elements (default: as many ElementB as fit in 128 bits)
+  int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value
+> struct DefaultConv2dWgrad;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//                          OpClassTensorOp convolutions
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dWgrad: specialization for the Analytic IteratorAlgorithm and a multistage
+/// pipeline (Stages stays a template parameter and is forwarded to ImplicitGemmMultistage).
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename OperatorClass,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dWgrad <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  OperatorClass,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic,
+  StrideSupport,
+  AlignmentA,
+  AlignmentB
+>  {
+
+  // Define the core components from GEMM (A is passed as ColumnMajor, B as RowMajor; LayoutA/LayoutB are not used here)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass,
+      Stages, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand (output gradient, kM x kK tile, AlignmentA-wide accesses)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA,
+      ThreadMapA,
+      AccessTypeA
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand (activation, kK x kN tile, AlignmentB-wide accesses)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB,
+      ThreadMapB,
+      AccessTypeB
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Define the Mma (multistage mainloop; both operands use arch::CacheOperation::Always)
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    arch::CacheOperation::Always,
+    MmaPolicy,
+    Stages 
+  >;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  // Define the epilogue (DefaultEpilogueTensorOp, parameterized by kPartitionsK and EpilogueOutputOp::kCount)
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Define the kernel (implicit GEMM convolution for conv::Operator::kWgrad)
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kWgrad
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dWgrad: specialization for the Analytic IteratorAlgorithm and a two-stage
+/// pipeline (Stages is fixed to 2; the mainloop is ImplicitGemmPipelined).
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename OperatorClass,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dWgrad <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  OperatorClass,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic,
+  StrideSupport,
+  AlignmentA,
+  AlignmentB
+>  {
+
+  // Define the core components from GEMM (A is passed as ColumnMajor, B as RowMajor; LayoutA/LayoutB are not used here)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass,
+      2, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand (output gradient, kM x kK tile, wrapped in TileIterator)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
+  using IteratorA =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA,
+        ThreadMapA,
+        AccessTypeA
+      >
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand (activation, kK x kN tile, wrapped in TileIterator)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
+  using IteratorB =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB,
+        ThreadMapB,
+        AccessTypeB
+      >
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Define the Mma (two-stage pipelined mainloop; ElementC and LayoutC are passed through to it)
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  // Define the epilogue (detail::DefaultConvEpilogue, which also receives ArchTag and kPartitionsK)
+  using Epilogue = typename detail::DefaultConvEpilogue<
+    ArchTag,
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    EpilogueOutputOp
+  >::Epilogue;
+
+  // Define the kernel (implicit GEMM convolution for conv::Operator::kWgrad)
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kWgrad
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dWgrad: specialization for the Optimized IteratorAlgorithm and a multistage
+/// pipeline (Stages stays a template parameter and is forwarded to ImplicitGemmMultistage).
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename OperatorClass,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dWgrad <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  OperatorClass,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized,
+  StrideSupport,
+  AlignmentA,
+  AlignmentB
+>  {
+
+  // Define the core components from GEMM (A is passed as ColumnMajor, B as RowMajor; LayoutA/LayoutB are not used here)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass,
+      Stages, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand (output gradient, kM x kK tile, AlignmentA-wide accesses)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA,
+      ThreadMapA,
+      AccessTypeA
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand (activation, kK x kN tile, AlignmentB-wide accesses)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB,
+      ThreadMapB,
+      AccessTypeB
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Define the Mma (multistage mainloop; both operands use arch::CacheOperation::Always)
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    arch::CacheOperation::Always,
+    MmaPolicy,
+    Stages 
+  >;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  // Define the epilogue (DefaultEpilogueTensorOp, parameterized by kPartitionsK and EpilogueOutputOp::kCount)
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Define the kernel (implicit GEMM convolution for conv::Operator::kWgrad)
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kWgrad
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dWgrad: specialization for the Optimized IteratorAlgorithm and a two-stage
+/// pipeline (Stages is fixed to 2; the mainloop is ImplicitGemmPipelined).
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename OperatorClass,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dWgrad <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  OperatorClass,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized,
+  StrideSupport,
+  AlignmentA,
+  AlignmentB
+>  {
+
+  // Define the core components from GEMM (A is passed as ColumnMajor, B as RowMajor; LayoutA/LayoutB are not used here)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass,
+      2, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand (output gradient, kM x kK tile, wrapped in TileIterator)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
+  using IteratorA =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA,
+        ThreadMapA,
+        AccessTypeA
+      >
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand (activation, kK x kN tile, wrapped in TileIterator)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
+  using IteratorB =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB,
+        ThreadMapB,
+        AccessTypeB
+      >
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Define the Mma (two-stage pipelined mainloop; ElementC and LayoutC are passed through to it)
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  // Define the epilogue (detail::DefaultConvEpilogue, which also receives ArchTag and kPartitionsK)
+  using Epilogue = typename detail::DefaultConvEpilogue<
+    ArchTag,
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    EpilogueOutputOp
+  >::Epilogue;
+
+  // Define the kernel (implicit GEMM convolution for conv::Operator::kWgrad)
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kWgrad
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//                         OpClassSimt convolutions
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Defines a kernel for Conv2dWgrad specialization for Analytic IteratorAlgorithm, arch::OpClassSimt,
+/// multi-stage pipeline, and FFMA-based mainloop for SM80 (int AccessTypeA/AccessTypeB are unused in the body)
+
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport,
+  int AccessTypeA,
+  int AccessTypeB
+>
+struct DefaultConv2dWgrad <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic,
+  StrideSupport,
+  AccessTypeA,
+  AccessTypeB
+> {
+
+  // Define the core components from GEMM (A is passed as ColumnMajor, B as RowMajor; LayoutA/LayoutB are not used here)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      Stages, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand (output gradient, kM x kK tile; no explicit access type is passed)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA,
+      ThreadMapA
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand (activation, kK x kN tile; no explicit access type is passed)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB,
+      ThreadMapB
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Define the Mma (multistage mainloop; both operands use arch::CacheOperation::Always)
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    arch::CacheOperation::Always,
+    MmaPolicy,
+    Stages 
+  >;
+
+  // Define the epilogue (DefaultEpilogueSimt, parameterized by EpilogueOutputOp::kCount)
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Define the kernel (implicit GEMM convolution for conv::Operator::kWgrad)
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kWgrad
+  >;
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dWgrad specialization for Optimized IteratorAlgorithm, arch::OpClassSimt,
+/// multi-stage pipeline, and FFMA-based mainloop for SM80 (int AccessTypeA/AccessTypeB are unused in the body)
+
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport,
+  int AccessTypeA,
+  int AccessTypeB
+>
+struct DefaultConv2dWgrad <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized,
+  StrideSupport,
+  AccessTypeA,
+  AccessTypeB
+> {
+
+  // Define the core components from GEMM (A is passed as ColumnMajor, B as RowMajor; LayoutA/LayoutB are not used here)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      Stages, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand (output gradient, kM x kK tile; no explicit access type is passed)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA,
+      ThreadMapA
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand (activation, kK x kN tile; no explicit access type is passed)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB,
+      ThreadMapB
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Define the Mma (multistage mainloop; both operands use arch::CacheOperation::Always)
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    arch::CacheOperation::Always,
+    MmaPolicy,
+    Stages 
+  >;
+
+  // Define the epilogue (DefaultEpilogueSimt, parameterized by EpilogueOutputOp::kCount)
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Define the kernel (implicit GEMM convolution for conv::Operator::kWgrad)
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kWgrad
+  >;
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dWgrad specialization for Analytic IteratorAlgorithm, arch::OpClassSimt,
+/// 2 stage pipeline, and FFMA-based mainloop for SM50 (int AccessTypeA/AccessTypeB are unused in the body)
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport,
+  int AccessTypeA,
+  int AccessTypeB
+>
+struct DefaultConv2dWgrad <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic,
+  StrideSupport,
+  AccessTypeA,
+  AccessTypeB
+> {
+
+  // Define the core components from GEMM (A is passed as ColumnMajor, B as RowMajor; LayoutA/LayoutB are not used here)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      2, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand (output gradient, kM x kK tile, wrapped in TileIterator)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA,
+        ThreadMapA
+      >
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand (activation, kK x kN tile, wrapped in TileIterator)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB,
+        ThreadMapB
+      >
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Define the Mma (two-stage pipelined mainloop; ElementC and LayoutC are passed through to it)
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+
+  // Define the epilogue (DefaultEpilogueSimt, parameterized by EpilogueOutputOp::kCount)
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Define the kernel (implicit GEMM convolution for conv::Operator::kWgrad)
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kWgrad
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dWgrad specialization for Optimized IteratorAlgorithm, arch::OpClassSimt,
+/// 2 stage pipeline, and FFMA-based mainloop for SM50 (int AccessTypeA/AccessTypeB are unused in the body)
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport,
+  int AccessTypeA,
+  int AccessTypeB
+>
+struct DefaultConv2dWgrad <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized,
+  StrideSupport,
+  AccessTypeA,
+  AccessTypeB
+> {
+
+  // Define the core components from GEMM (A is passed as ColumnMajor, B as RowMajor; LayoutA/LayoutB are not used here)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      2, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand (output gradient, kM x kK tile, wrapped in TileIterator)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA,
+        ThreadMapA
+      >
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand (activation, kK x kN tile, wrapped in TileIterator)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB,
+        ThreadMapB
+      >
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Define the Mma (two-stage pipelined mainloop; ElementC and LayoutC are passed through to it)
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+
+  // Define the epilogue (DefaultEpilogueSimt, parameterized by EpilogueOutputOp::kCount)
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Define the kernel (implicit GEMM convolution for conv::Operator::kWgrad)
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kWgrad
+  >;
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_wgrad_fusion.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_wgrad_fusion.h
new file mode 100755
index 000000000..110e07db9
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_wgrad_fusion.h
@@ -0,0 +1,325 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief 
+    Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped 
+      matrix multiply-add with the appropriate threadblock-scoped epilogue.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/conv/kernel/default_conv2d.h"
+
+#include "cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h"
+#include "cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h"
+#include "cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h"
+#include "cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h"
+#include "cutlass/conv/threadblock/conv2d_tile_iterator.h"
+#include "cutlass/conv/threadblock/predicated_scale_bias_vector_iterator.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Primary template for the fused Conv2dWgrad kernel (declaration only; the partial specializations supply the members).
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementScaleBias,
+  typename LayoutScaleBias,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename OperatorClass,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,
+  conv::StrideSupport StrideSupport = StrideSupport::kStrided
+> struct DefaultConv2dWgradFusion;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//                          OpClassTensorOp convolutions
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization of DefaultConv2dWgradFusion for IteratorAlgorithm::kAnalytic with a multistage
+/// mainloop. StrideSupport is not listed, so only the primary template's default (kStrided) is matched.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementScaleBias,
+  typename LayoutScaleBias,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename OperatorClass,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag
+>
+struct DefaultConv2dWgradFusion <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementScaleBias,
+  LayoutScaleBias,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  OperatorClass,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic
+>  {
+
+  // Core GEMM components. Operand layouts are hard-coded (A column-major, B row-major); LayoutA/LayoutB are unused.
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass,
+      Stages, MathOperatorTag>;
+
+  // A operand: analytic output-gradient access iterator over a ThreadblockShape::kM x kK tile
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA,
+      ThreadMapA
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // B operand: analytic activation access iterator over a ThreadblockShape::kK x kN tile
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB,
+      ThreadMapB
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Scale/bias vector iterator; its tile shape is 1 x WarpShape::kN
+  using IteratorScaleBias =
+      cutlass::conv::threadblock::PredicatedScaleBiasVectorIterator<
+          cutlass::MatrixShape<1, WarpShape::kN>,
+          ElementScaleBias,
+          LayoutScaleBias>;
+
+  // Warp-level tensor-op MMA and its policy, both taken from MmaCore
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Threadblock-level mainloop: fused-wgrad multistage, CacheOperation::Always for both A and B
+  using Mma = threadblock::ImplicitGemmWgradFusionMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    arch::CacheOperation::Always,
+    IteratorScaleBias,
+    MmaPolicy,
+    Stages 
+  >;
+
+  // Epilogue derived from the warp-level tensor op and the caller-supplied EpilogueOutputOp
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    1,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Resulting kernel type (fusion variant), tagged as conv::Operator::kWgrad
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionFusion<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kWgrad
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization of DefaultConv2dWgradFusion for IteratorAlgorithm::kOptimized with a multistage
+/// mainloop. StrideSupport is not listed, so only the primary template's default (kStrided) is matched.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementScaleBias,
+  typename LayoutScaleBias,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename OperatorClass,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag
+>
+struct DefaultConv2dWgradFusion <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementScaleBias,
+  LayoutScaleBias,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  OperatorClass,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized
+>  {
+
+  // Core GEMM components. Operand layouts are hard-coded (A column-major, B row-major); LayoutA/LayoutB are unused.
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass,
+      Stages, MathOperatorTag>;
+
+  // A operand: optimized output-gradient access iterator over a ThreadblockShape::kM x kK tile
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA,
+      ThreadMapA
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // B operand: optimized activation access iterator over a ThreadblockShape::kK x kN tile
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB,
+      ThreadMapB
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Scale/bias vector iterator; its tile shape is 1 x WarpShape::kN
+  using IteratorScaleBias =
+      cutlass::conv::threadblock::PredicatedScaleBiasVectorIterator<
+          cutlass::MatrixShape<1, WarpShape::kN>,
+          ElementScaleBias,
+          LayoutScaleBias>;
+
+  // Warp-level tensor-op MMA and its policy, both taken from MmaCore
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Threadblock-level mainloop: fused-wgrad multistage, CacheOperation::Always for both A and B
+  using Mma = threadblock::ImplicitGemmWgradFusionMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    arch::CacheOperation::Always,
+    IteratorScaleBias,
+    MmaPolicy,
+    Stages 
+  >;
+
+  // Epilogue derived from the warp-level tensor op and the caller-supplied EpilogueOutputOp
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    1,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Resulting kernel type (fusion variant), tagged as conv::Operator::kWgrad
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionFusion<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kWgrad
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_dgrad.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_dgrad.h
new file mode 100755
index 000000000..cb50ba49b
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_dgrad.h
@@ -0,0 +1,736 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+    Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped 
+      matrix multiply-add with the appropriate threadblock-scoped epilogue.  
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/conv/kernel/default_conv2d.h"
+
+#include "cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h"
+#include "cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_optimized.h"
+
+#include "cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h"
+#include "cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h"
+#include "cutlass/conv/threadblock/conv2d_tile_iterator.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Primary template for the Conv3dDgrad kernel (declaration only; the partial specializations supply the members).
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename OperatorClass,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,
+  conv::StrideSupport StrideSupport = StrideSupport::kStrided
+> struct DefaultConv3dDgrad;
+
+/// Partial specialization of DefaultConv3dDgrad for IteratorAlgorithm::kAnalytic, StrideSupport::kStrided
+/// and a multistage pipeline. OperatorClass stays generic, but the body relies on MmaCore::MmaTensorOp.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename OperatorClass,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag
+>
+struct DefaultConv3dDgrad <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  OperatorClass,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic,
+  StrideSupport::kStrided
+> {
+
+  // Core GEMM components. Operand layouts are hard-coded (A and B row-major); LayoutA/LayoutB are unused.
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass,
+      Stages, MathOperatorTag>;
+
+  // A operand: analytic output-gradient access iterator (kStrided) over a ThreadblockShape::kM x kK tile
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA,
+      ThreadMapA,
+      StrideSupport::kStrided
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // B operand: analytic filter access iterator over a ThreadblockShape::kK x kN tile
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv3dDgradFilterTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB,
+      ThreadMapB
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level tensor-op MMA and its policy, both taken from MmaCore
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Threadblock-level multistage mainloop: CacheOperation::Always for A, CacheOperation::Global for B
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    arch::CacheOperation::Global,
+    MmaPolicy,
+    Stages 
+  >;
+
+  // Epilogue derived from the warp-level tensor op and the caller-supplied EpilogueOutputOp
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    1,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Resulting kernel type, tagged as conv::Operator::kDgrad and parameterized on Conv3dProblemSize
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDgrad,
+    Conv3dProblemSize
+  >;
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization of DefaultConv3dDgrad for IteratorAlgorithm::kOptimized, StrideSupport::kUnity
+/// and a multistage pipeline. OperatorClass stays generic, but the body relies on MmaCore::MmaTensorOp.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename OperatorClass,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag
+>
+struct DefaultConv3dDgrad <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  OperatorClass,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized,
+  StrideSupport::kUnity
+> {
+
+  // Core GEMM components. Operand layouts are hard-coded (A and B row-major); LayoutA/LayoutB are unused.
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass,
+      Stages, MathOperatorTag>;
+
+  // A operand: optimized output-gradient access iterator (kUnity) over a ThreadblockShape::kM x kK tile
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA,
+      ThreadMapA,
+      StrideSupport::kUnity
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // B operand: optimized filter access iterator over a ThreadblockShape::kK x kN tile
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+
+  using IteratorB =
+    cutlass::conv::threadblock::Conv3dDgradFilterTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB,
+      ThreadMapB
+    >;
+
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level tensor-op MMA and its policy, both taken from MmaCore
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Threadblock-level multistage mainloop: CacheOperation::Always for A, CacheOperation::Global for B
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    arch::CacheOperation::Global,
+    MmaPolicy,
+    Stages 
+  >;
+
+  // Epilogue derived from the warp-level tensor op and the caller-supplied EpilogueOutputOp
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    1,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Resulting kernel type, tagged as conv::Operator::kDgrad and parameterized on Conv3dProblemSize
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDgrad,
+    Conv3dProblemSize
+  >;
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//                            OpClassSimt convolutions 
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// DefaultConv3dDgrad for arch::OpClassSimt: Analytic IteratorAlgorithm, kStrided, multistage mainloop (Stages is a parameter).
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag
+>
+struct DefaultConv3dDgrad <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic,
+  conv::StrideSupport::kStrided
+> {
+
+  // Core GEMM components. Operand layouts are hard-coded (A and B row-major); LayoutA/LayoutB are unused.
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      Stages, MathOperatorTag>;
+
+  // A operand: analytic output-gradient access iterator (kStrided) over a ThreadblockShape::kM x kK tile
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA,
+      ThreadMapA,
+      conv::StrideSupport::kStrided
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // B operand: analytic filter access iterator over a ThreadblockShape::kK x kN tile
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv3dDgradFilterTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB,
+      ThreadMapB
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level SIMT MMA and its policy, both taken from MmaCore
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Threadblock-level multistage mainloop: CacheOperation::Always for both A and B
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    arch::CacheOperation::Always,
+    MmaPolicy,
+    Stages 
+  >;
+
+  // Epilogue derived from the warp-level SIMT op and the caller-supplied EpilogueOutputOp
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Resulting kernel type, tagged as conv::Operator::kDgrad and parameterized on Conv3dProblemSize
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDgrad,
+    Conv3dProblemSize
+  >;
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv3dDgrad specialization for Optimized IteratorAlgorithm, StrideSupport::kUnity,
+/// multi-stage pipeline, and FFMA-based (arch::OpClassSimt) mainloop for SM80
+
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag
+>
+struct DefaultConv3dDgrad <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized,
+  StrideSupport::kUnity
+> {
+
+  // Core GEMM components. Operand layouts are hard-coded (A and B row-major); LayoutA/LayoutB are unused.
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      Stages, MathOperatorTag>;
+
+  // A operand: optimized output-gradient access iterator (kUnity) over a ThreadblockShape::kM x kK tile
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA,
+      ThreadMapA,
+      StrideSupport::kUnity
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // B operand: optimized filter access iterator over a ThreadblockShape::kK x kN tile
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv3dDgradFilterTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB,
+      ThreadMapB
+      // NOTE(review): leftover disabled arguments (ThreadMapB, StrideSupport::kUnity) were here;
+      // the filter iterator is instantiated without a StrideSupport argument, unlike IteratorA.
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level SIMT MMA and its policy, both taken from MmaCore
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Threadblock-level multistage mainloop: CacheOperation::Always for both A and B
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    arch::CacheOperation::Always,
+    MmaPolicy,
+    Stages 
+  >;
+
+  // Epilogue derived from the warp-level SIMT op and the caller-supplied EpilogueOutputOp
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Resulting kernel type, tagged as conv::Operator::kDgrad and parameterized on Conv3dProblemSize
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDgrad,
+    Conv3dProblemSize
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// DefaultConv3dDgrad for arch::OpClassSimt: Analytic IteratorAlgorithm, kStrided, Stages fixed to 2 (pipelined mainloop).
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag
+>
+struct DefaultConv3dDgrad <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic,
+  conv::StrideSupport::kStrided
+> {
+
+  // Core GEMM components. Operand layouts are hard-coded (A and B row-major); LayoutA/LayoutB are unused.
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      2, MathOperatorTag>;
+
+  // A operand: analytic output-gradient access iterator (kStrided) over a ThreadblockShape::kM x kK tile
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    // NOTE(review): TileIteratorStridedDgrad<...> wrapper disabled; confirm ImplicitGemmPipelined accepts the bare access iterator
+      cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA,
+        ThreadMapA,
+        conv::StrideSupport::kStrided
+      // (closing '>' of the disabled wrapper)
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // B operand: analytic filter access iterator over a ThreadblockShape::kK x kN tile
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    // NOTE(review): TileIteratorStridedDgrad<...> wrapper disabled; confirm ImplicitGemmPipelined accepts the bare access iterator
+      cutlass::conv::threadblock::Conv3dDgradFilterTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB,
+        ThreadMapB
+      // (closing '>' of the disabled wrapper)
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level SIMT MMA and its policy, both taken from MmaCore
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Threadblock-level pipelined mainloop; takes ElementC/LayoutC and no cache-operation or stage-count arguments
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+
+  // Epilogue derived from the warp-level SIMT op and the caller-supplied EpilogueOutputOp
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Resulting kernel type, tagged as conv::Operator::kDgrad and parameterized on Conv3dProblemSize
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDgrad,
+    Conv3dProblemSize
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv3dDgrad specialization for Optimized IteratorAlgorithm, StrideSupport::kUnity,
+/// 2 stage pipeline, and FFMA-based (arch::OpClassSimt) mainloop for SM50
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag
+>
+struct DefaultConv3dDgrad <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized,
+  StrideSupport::kUnity
+> {
+
+  // Core GEMM components. Operand layouts are hard-coded (A and B row-major); LayoutA/LayoutB are unused.
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      2, MathOperatorTag>;
+
+  // A operand: optimized output-gradient access iterator (kUnity) over a ThreadblockShape::kM x kK tile
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    // NOTE(review): TileIterator<...> wrapper disabled; confirm ImplicitGemmPipelined accepts the bare access iterator
+      cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA,
+        ThreadMapA,
+        StrideSupport::kUnity
+      // (closing '>' of the disabled wrapper)
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // B operand: optimized filter access iterator over a ThreadblockShape::kK x kN tile
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    // NOTE(review): TileIterator<...> wrapper disabled; confirm ImplicitGemmPipelined accepts the bare access iterator
+      cutlass::conv::threadblock::Conv3dDgradFilterTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB,
+        ThreadMapB
+        // NOTE(review): leftover disabled arguments (ThreadMapB, StrideSupport::kUnity) were here;
+        // the filter iterator is instantiated without a StrideSupport argument, unlike IteratorA.
+      // (closing '>' of the disabled wrapper)
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level SIMT MMA and its policy, both taken from MmaCore
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Threadblock-level pipelined mainloop; takes ElementC/LayoutC and no cache-operation or stage-count arguments
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+
+  // Epilogue derived from the warp-level SIMT op and the caller-supplied EpilogueOutputOp
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Resulting kernel type, tagged as conv::Operator::kDgrad and parameterized on Conv3dProblemSize
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDgrad,
+    Conv3dProblemSize
+  >;
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_fprop.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_fprop.h
new file mode 100755
index 000000000..41fdd64a5
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_fprop.h
@@ -0,0 +1,981 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+    Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped 
+      matrix multiply-add with the appropriate threadblock-scoped epilogue.    
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/conv/kernel/default_conv2d.h"
+
+#include "cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h"
+#include "cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h"
+
+
+#include "cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h"
+#include "cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Declares the primary DefaultConv3dFprop template (declaration only); IteratorAlgorithm defaults to kOptimized and StrideSupport to kUnity.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename OperatorClass,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,
+  conv::StrideSupport StrideSupport = StrideSupport::kUnity
+> struct DefaultConv3dFprop;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines the Conv3dFprop kernel specialization for OpClassTensorOp with the Analytic
+/// iterator algorithm and a 2-stage pipeline (the Stages argument is fixed to 2).
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport
+>
+struct DefaultConv3dFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic,
+  StrideSupport
+> {
+
+  // Core GEMM components; MmaCore is given fixed RowMajor/ColumnMajor/RowMajor layouts, not LayoutA/LayoutB/LayoutC
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      2, MathOperatorTag>;
+
+  // Iterator over A-operand (activation) tiles of ThreadblockShape::kM x kK, wrapped in TileIterator
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA,
+        ThreadMapA
+      >
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Iterator over B-operand (filter) tiles of ThreadblockShape::kK x kN, wrapped in TileIterator
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB,
+        ThreadMapB
+      >
+    >;
+
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level MMA operator and policy, both taken from MmaCore
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Threadblock-scoped mainloop: ImplicitGemmPipelined
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+
+  // Epilogue obtained from detail::DefaultConvEpilogue, parameterized on ArchTag
+  using Epilogue = typename detail::DefaultConvEpilogue<
+    ArchTag,
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    1,
+    EpilogueOutputOp
+  >::Epilogue;
+
+  // Final kernel: implicit-GEMM convolution with Operator::kFprop over a Conv3dProblemSize
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop,
+    Conv3dProblemSize
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines the Conv3dFprop kernel specialization for OpClassTensorOp with the Analytic
+/// iterator algorithm and a multistage pipeline (Stages is a template parameter).
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport
+>
+struct DefaultConv3dFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic,
+  StrideSupport
+> {
+
+  // Core GEMM components; MmaCore is given fixed RowMajor/ColumnMajor/RowMajor layouts, not LayoutA/LayoutB/LayoutC
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, MathOperatorTag>;
+
+  // Iterator over A-operand (activation) tiles of ThreadblockShape::kM x kK (no TileIterator wrapper here)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA,
+      ThreadMapA
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Iterator over B-operand (filter) tiles of ThreadblockShape::kK x kN (no TileIterator wrapper here)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB,
+      ThreadMapB
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level MMA operator and policy, both taken from MmaCore
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Threadblock-scoped mainloop: ImplicitGemmMultistage with CacheOperation::Always for A and ::Global for B
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    arch::CacheOperation::Global,
+    MmaPolicy,
+    Stages 
+  >;
+
+  // Define the epilogue. NOTE(review): StrideSupport is not forwarded here, unlike the Optimized multistage specialization -- confirm intended
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    1,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Final kernel: implicit-GEMM convolution with Operator::kFprop over a Conv3dProblemSize
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop,
+    Conv3dProblemSize
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines the Conv3dFprop kernel specialization for OpClassTensorOp with the Optimized
+/// iterator algorithm and a 2-stage pipeline (the Stages argument is fixed to 2).
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport
+>
+struct DefaultConv3dFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized,
+  StrideSupport
+> {
+
+  // Core GEMM components; MmaCore is given fixed RowMajor/ColumnMajor/RowMajor layouts, not LayoutA/LayoutB/LayoutC
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      2, MathOperatorTag>;
+
+  // Iterator over A-operand (activation) tiles of kM x kK; takes LayoutA and is wrapped in TileIterator
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA,
+        LayoutA,
+        ThreadMapA
+      >
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Iterator over B-operand (filter) tiles of kK x kN; takes LayoutB and is wrapped in TileIterator
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB,
+        LayoutB,
+        ThreadMapB
+      >
+    >;
+
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level MMA operator and policy, both taken from MmaCore
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Threadblock-scoped mainloop: ImplicitGemmPipelined
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+
+  // Epilogue obtained from detail::DefaultConvEpilogue, parameterized on ArchTag
+  using Epilogue = typename detail::DefaultConvEpilogue<
+    ArchTag,
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    1,
+    EpilogueOutputOp
+  >::Epilogue;
+
+  // Final kernel: implicit-GEMM convolution with Operator::kFprop over a Conv3dProblemSize
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop,
+    Conv3dProblemSize
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines the Conv3dFprop kernel specialization for OpClassTensorOp with the Optimized
+/// iterator algorithm and a multistage pipeline (Stages is a template parameter).
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport
+>
+struct DefaultConv3dFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized,
+  StrideSupport
+> {
+
+  // Core GEMM components; MmaCore is given fixed RowMajor/ColumnMajor/RowMajor layouts, not LayoutA/LayoutB/LayoutC
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, MathOperatorTag>;
+
+  // Iterator over A-operand (activation) tiles of kM x kK; takes LayoutA (no TileIterator wrapper here)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA,
+      LayoutA,
+      ThreadMapA
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Iterator over B-operand (filter) tiles of kK x kN; takes LayoutB (no TileIterator wrapper here)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB; 
+
+  using IteratorB =
+    cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB,
+      LayoutB,
+      ThreadMapB
+    >;
+
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level MMA operator and policy, both taken from MmaCore
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Threadblock-scoped mainloop: ImplicitGemmMultistage with CacheOperation::Always for A and ::Global for B
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    arch::CacheOperation::Global,
+    MmaPolicy,
+    Stages 
+  >;
+
+  // Epilogue: DefaultEpilogueTensorOp with StrideSupport forwarded; the trailing 5 is presumably the tensor rank -- confirm
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    1,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount,
+    false,
+    layout::NoPermute,
+    StrideSupport,
+    5
+  >::Epilogue;
+
+  // Final kernel: implicit-GEMM convolution with Operator::kFprop over a Conv3dProblemSize
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop,
+    Conv3dProblemSize
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//                            OpClassSimt convolutions
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Defines a kernel for Conv3dFprop specialization for OpClassSimt with the Analytic
+/// IteratorAlgorithm and a multi-stage pipeline (FFMA-based mainloop for SM80)
+
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport
+>
+struct DefaultConv3dFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic,
+  StrideSupport
+> {
+
+  // Core GEMM components; MmaCore is given fixed RowMajor/ColumnMajor/RowMajor layouts, not LayoutA/LayoutB/LayoutC
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      Stages, MathOperatorTag>;
+
+  // Iterator over A-operand (activation) tiles of ThreadblockShape::kM x kK (no TileIterator wrapper here)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA,
+      ThreadMapA
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Iterator over B-operand (filter) tiles of ThreadblockShape::kK x kN (no TileIterator wrapper here)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB,
+      ThreadMapB
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level SIMT MMA operator and policy, both taken from MmaCore
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Threadblock-scoped mainloop: ImplicitGemmMultistage with CacheOperation::Always for both A and B
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    arch::CacheOperation::Always,
+    MmaPolicy,
+    Stages 
+  >;
+
+  // Epilogue: DefaultEpilogueSimt with StrideSupport forwarded; the trailing 5 is presumably the tensor rank -- confirm
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount,
+    false,
+    layout::NoPermute,
+    StrideSupport,
+    5
+  >::Epilogue;
+
+  // Final kernel: implicit-GEMM convolution with Operator::kFprop over a Conv3dProblemSize
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop,
+    Conv3dProblemSize
+  >;
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv3dFprop specialization for OpClassSimt with the Optimized
+/// IteratorAlgorithm and a multi-stage pipeline (FFMA-based mainloop for SM80)
+
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport
+>
+struct DefaultConv3dFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized,
+  StrideSupport
+> {
+
+  // Core GEMM components; MmaCore is given fixed RowMajor/ColumnMajor/RowMajor layouts, not LayoutA/LayoutB/LayoutC
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      Stages, MathOperatorTag>;
+
+  // Iterator over A-operand (activation) tiles of kM x kK; takes LayoutA (no TileIterator wrapper here)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA,
+      LayoutA,
+      ThreadMapA
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Iterator over B-operand (filter) tiles of kK x kN; takes LayoutB (no TileIterator wrapper here)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB,
+      LayoutB,
+      ThreadMapB
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level SIMT MMA operator and policy, both taken from MmaCore
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Threadblock-scoped mainloop: ImplicitGemmMultistage with CacheOperation::Always for both A and B
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    arch::CacheOperation::Always,
+    MmaPolicy,
+    Stages 
+  >;
+
+  // Epilogue: DefaultEpilogueSimt with StrideSupport forwarded; the trailing 5 is presumably the tensor rank -- confirm
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount,
+    false,
+    layout::NoPermute,
+    StrideSupport,
+    5
+  >::Epilogue;
+
+  // Final kernel: implicit-GEMM convolution with Operator::kFprop over a Conv3dProblemSize
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop,
+    Conv3dProblemSize
+  >;
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv3dFprop specialization for OpClassSimt with the Analytic
+/// IteratorAlgorithm and a 2 stage pipeline (FFMA-based mainloop for SM50)
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport
+>
+struct DefaultConv3dFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic,
+  StrideSupport
+> {
+
+  // Core GEMM components; MmaCore is given fixed RowMajor/ColumnMajor/RowMajor layouts, not LayoutA/LayoutB/LayoutC
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      2, MathOperatorTag>;
+
+  // Iterator over A-operand (activation) tiles of ThreadblockShape::kM x kK, wrapped in TileIterator
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA,
+        ThreadMapA
+      >
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Iterator over B-operand (filter) tiles of ThreadblockShape::kK x kN, wrapped in TileIterator
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB,
+        ThreadMapB
+      >
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level SIMT MMA operator and policy, both taken from MmaCore
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Threadblock-scoped mainloop: ImplicitGemmPipelined
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+
+  // Epilogue: DefaultEpilogueSimt with StrideSupport forwarded; the trailing 5 is presumably the tensor rank -- confirm
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount,
+    false,
+    layout::NoPermute,
+    StrideSupport,
+    5
+  >::Epilogue;
+
+  // Final kernel: implicit-GEMM convolution with Operator::kFprop over a Conv3dProblemSize
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop,
+    Conv3dProblemSize
+  >;
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv3dFprop specialization for OpClassSimt with the Optimized
+/// IteratorAlgorithm and a 2 stage pipeline (FFMA-based mainloop for SM50)
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport
+>
+struct DefaultConv3dFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized,
+  StrideSupport
+> {
+
+  // Core GEMM components; MmaCore is given fixed RowMajor/ColumnMajor/RowMajor layouts, not LayoutA/LayoutB/LayoutC
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      2, MathOperatorTag>;
+
+  // Iterator over A-operand (activation) tiles of kM x kK; takes LayoutA and is wrapped in TileIterator
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA,
+        LayoutA,
+        ThreadMapA
+      >
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Iterator over B-operand (filter) tiles of kK x kN; takes LayoutB and is wrapped in TileIterator
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB,
+        LayoutB,
+        ThreadMapB
+      >
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level SIMT MMA operator and policy, both taken from MmaCore
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Threadblock-scoped mainloop: ImplicitGemmPipelined
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+
+  // Epilogue: DefaultEpilogueSimt with StrideSupport forwarded; the trailing 5 is presumably the tensor rank -- confirm
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount,
+    false,
+    layout::NoPermute,
+    StrideSupport,
+    5
+  >::Epilogue;
+
+  // Final kernel: implicit-GEMM convolution with Operator::kFprop over a Conv3dProblemSize
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop,
+    Conv3dProblemSize
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_fprop_fusion.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_fprop_fusion.h
new file mode 100755
index 000000000..d0457d572
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_fprop_fusion.h
@@ -0,0 +1,360 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief
+   Default kernel-level fused activation's scale+bias+relu and implicit GEMM convolution
+   definitions that combine threadblock-scoped matrix multiply-add with the
+   appropriate threadblock-scoped epilogue.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/conv/kernel/default_conv2d.h"
+
+#include "cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h"
+#include "cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h"
+#include "cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h"
+#include "cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h"
+#include "cutlass/conv/threadblock/predicated_scale_bias_vector_access_iterator.h"
+#include "cutlass/transform/threadblock/regular_scale_bias_vector_access_iterator.h"
+#include "cutlass/gemm/warp/scale_bias_tile_iterator.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Primary template (declaration only) of the kernel fusing batch-norm style scale/bias with Conv3dFprop
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementScaleBias,
+  typename LayoutScaleBias,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename OperatorClass,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,  // default iterator algorithm
+  conv::StrideSupport StrideSupport = StrideSupport::kUnity                   // default stride support
+> struct DefaultConv3dFpropFusion;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//                         OpClassTensorOp convolutions 
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization of DefaultConv3dFpropFusion for OpClassTensorOp, the Analytic
+/// IteratorAlgorithm, and a multistage pipeline.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementScaleBias,
+  typename LayoutScaleBias,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag
+>
+struct DefaultConv3dFpropFusion <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementScaleBias,
+  LayoutScaleBias,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic  // StrideSupport is omitted, so only the primary template's default matches
+> {
+
+  // Define the core components from GEMM (A treated as RowMajor, B as ColumnMajor)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand (activation, analytic variant)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA,
+      ThreadMapA
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand (filter, analytic variant)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB,
+      ThreadMapB
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  /// Define iterators over 1 x kK tiles from the scale/bias vectors
+  using IteratorScaleBias =
+      cutlass::conv::threadblock::PredicatedScaleBiasVectorAccessIterator<
+          cutlass::MatrixShape<1, ThreadblockShape::kK>, ElementScaleBias,
+          LayoutScaleBias>;
+
+  using SmemIteratorScaleBias =
+      cutlass::transform::threadblock::RegularScaleBiasVectorAccessIterator<
+          cutlass::MatrixShape<1, ThreadblockShape::kK>, ElementScaleBias,
+          LayoutScaleBias>;
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  static int const kThreadCount = 32;  // thread count handed to WarpIteratorScaleBias; presumably the warp size
+
+  // Warp-level iterators to load scale and bias vectors
+  using WarpIteratorScaleBias = cutlass::gemm::warp::ScaleBiasTileIterator<
+      MatrixShape<WarpShape::kM, WarpShape::kK>, ElementScaleBias,
+      LayoutScaleBias, MatrixShape<InstructionShape::kM, InstructionShape::kK>,
+      typename WarpMmaTensorOp::IteratorA::Base::Policy, kThreadCount,
+      MmaCore::WarpCount::kK>;
+
+  // Define the Mma: threadblock-scoped multistage mainloop that also takes the scale/bias iterators
+  using Mma = threadblock::ImplicitGemmFpropFusionMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    arch::CacheOperation::Global,
+    IteratorScaleBias,
+    SmemIteratorScaleBias,
+    arch::CacheOperation::Always,
+    MmaPolicy,
+    WarpIteratorScaleBias,
+    Stages 
+  >;
+
+  // Define the epilogue
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    1,  // NOTE(review): presumably PartitionsK - confirm against DefaultEpilogueTensorOp
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Define the kernel (forward propagation over a Conv3dProblemSize)
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionFusion<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop,
+    Conv3dProblemSize
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization of DefaultConv3dFpropFusion for OpClassTensorOp, the Optimized
+/// IteratorAlgorithm, and a multistage pipeline.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementScaleBias,
+  typename LayoutScaleBias,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag
+>
+struct DefaultConv3dFpropFusion <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementScaleBias,
+  LayoutScaleBias,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized  // StrideSupport is omitted, so only the primary template's default matches
+> {
+
+  // Define the core components from GEMM (A treated as RowMajor, B as ColumnMajor)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+    ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+    ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+    Stages, MathOperatorTag
+  >;
+
+  // Define iterators over tiles from the A operand (activation, optimized variant; also takes LayoutA)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA,
+      LayoutA,
+      ThreadMapA
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand (filter, optimized variant; also takes LayoutB)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB,
+      LayoutB,
+      ThreadMapB
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  /// Define iterators over 1 x kK tiles from the scale/bias vectors
+  using IteratorScaleBias =
+      cutlass::conv::threadblock::PredicatedScaleBiasVectorAccessIterator<
+          cutlass::MatrixShape<1, ThreadblockShape::kK>, ElementScaleBias,
+          LayoutScaleBias>;
+
+  using SmemIteratorScaleBias =
+      cutlass::transform::threadblock::RegularScaleBiasVectorAccessIterator<
+          cutlass::MatrixShape<1, ThreadblockShape::kK>, ElementScaleBias,
+          LayoutScaleBias>;
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  static int const kThreadCount = 32;  // thread count handed to WarpIteratorScaleBias; presumably the warp size
+
+  // Warp-level iterators to load scale and bias vectors
+  using WarpIteratorScaleBias = cutlass::gemm::warp::ScaleBiasTileIterator<
+      MatrixShape<WarpShape::kM, WarpShape::kK>, ElementScaleBias,
+      LayoutScaleBias, MatrixShape<InstructionShape::kM, InstructionShape::kK>,
+      typename WarpMmaTensorOp::IteratorA::Base::Policy, kThreadCount,
+      MmaCore::WarpCount::kK>;
+
+  // Define the Mma: threadblock-scoped multistage mainloop that also takes the scale/bias iterators
+  using Mma = threadblock::ImplicitGemmFpropFusionMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    arch::CacheOperation::Global,
+    IteratorScaleBias,
+    SmemIteratorScaleBias,
+    arch::CacheOperation::Always,
+    MmaPolicy,
+    WarpIteratorScaleBias,
+    Stages 
+  >;
+
+  // Define the epilogue
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    1,  // NOTE(review): presumably PartitionsK - confirm against DefaultEpilogueTensorOp
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Define the kernel (forward propagation over a Conv3dProblemSize)
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionFusion<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop,
+    Conv3dProblemSize
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_fprop_with_broadcast.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_fprop_with_broadcast.h
new file mode 100755
index 000000000..0fc291e60
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_fprop_with_broadcast.h
@@ -0,0 +1,222 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+  \brief 
+    Defines a Conv3d fprop implicit GEMM kernel with a broadcast epilogue, built on DefaultConv3dFprop.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/conv/kernel/default_conv3d_fprop.h"
+#include "cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h"
+#include "cutlass/epilogue/threadblock/epilogue_with_broadcast.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename OperatorClass,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,
+  conv::StrideSupport StrideSupport = StrideSupport::kUnity,
+  /// Access granularity of A matrix in units of elements (not referenced in this struct's body)
+  int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value,
+  /// Access granularity of B matrix in units of elements (not referenced in this struct's body)
+  int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value
+>
+struct DefaultConv3dFpropWithBroadcast {  // primary template: keeps the base kernel's Mma, replaces its epilogue
+
+  using ImplicitGemmBase = typename DefaultConv3dFprop<  // note: this alias names the base ::Kernel type
+    ElementA, LayoutA,
+    ElementB, LayoutB,
+    ElementC, LayoutC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    MathOperatorTag,
+    IteratorAlgorithm,
+    StrideSupport
+  >::Kernel;
+
+  // Define epilogue (TensorOp broadcast variant, parameterized from the base kernel's epilogue)
+  using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastTensorOp<
+    ArchTag,
+    typename ImplicitGemmBase::Epilogue::Shape,
+    typename ImplicitGemmBase::Epilogue::WarpMmaOperator,
+    ImplicitGemmBase::Epilogue::kPartitionsK,
+    ElementC,
+    typename EpilogueOutputOp::ElementT,
+    typename EpilogueOutputOp::ElementVector,
+    EpilogueOutputOp,
+    ImplicitGemmBase::Epilogue::kElementsPerAccess
+  >::Epilogue;
+
+  // Define the kernel: base mainloop + broadcast epilogue, forward propagation over Conv3dProblemSize
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue<
+    typename ImplicitGemmBase::Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop,
+    Conv3dProblemSize
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//                            OpClassSimt convolutions
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Partial specialization of DefaultConv3dFpropWithBroadcast for arch::OpClassSimt.
+/// IteratorAlgorithm, Stages, ArchTag and StrideSupport all remain template parameters here.
+
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::IteratorAlgorithm IteratorAlgorithm,
+  conv::StrideSupport StrideSupport,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv3dFpropWithBroadcast <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm,
+  StrideSupport,
+  AlignmentA,  // matched but not referenced in this struct's body
+  AlignmentB   // matched but not referenced in this struct's body
+> {
+
+  using ImplicitGemmBase = typename DefaultConv3dFprop<  // note: this alias names the base ::Kernel type
+    ElementA, LayoutA,
+    ElementB, LayoutB,
+    ElementC, LayoutC,
+    ElementAccumulator,
+    arch::OpClassSimt,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    MathOperatorTag,
+    IteratorAlgorithm,
+    StrideSupport
+  >::Kernel;
+
+  // Define epilogue (SIMT broadcast variant, parameterized from the base kernel's epilogue)
+  using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastSimt<
+    ArchTag,
+    typename ImplicitGemmBase::Epilogue::Shape,
+    typename ImplicitGemmBase::Epilogue::WarpMmaOperator,
+    ElementC,
+    typename EpilogueOutputOp::ElementT,
+    typename EpilogueOutputOp::ElementVector,
+    EpilogueOutputOp,
+    ImplicitGemmBase::Epilogue::kElementsPerAccess,
+    layout::NoPermute,
+    StrideSupport,
+    5  // NOTE(review): presumably the output tensor rank for Conv3d - confirm against the epilogue template
+  >::Epilogue;
+
+  // Define the kernel: base mainloop + broadcast epilogue, forward propagation over Conv3dProblemSize
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue<
+    typename ImplicitGemmBase::Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop,
+    Conv3dProblemSize
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace conv
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_wgrad.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_wgrad.h
new file mode 100755
index 000000000..4ed5e0c1b
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_wgrad.h
@@ -0,0 +1,936 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+    Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped 
+      matrix multiply-add with the appropriate threadblock-scoped epilogue.  
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/conv/kernel/default_conv2d.h"
+
+#include "cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h"
+#include "cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h"
+#include "cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h"
+#include "cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Primary template (declaration only) of the Conv3dWgrad kernel; specializations supply the definition
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename OperatorClass,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,  // default iterator algorithm
+  conv::StrideSupport StrideSupport = StrideSupport::kStrided                 // default stride support
+> struct DefaultConv3dWgrad;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv3dWgrad specialization for Analytic IteratorAlgorithm and multistage
+/// pipeline.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename OperatorClass,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag
+>
+struct DefaultConv3dWgrad <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  OperatorClass,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic  // StrideSupport is omitted, so only the primary template's default matches
+>  {
+
+  // Define the core components from GEMM (A treated as ColumnMajor, B as RowMajor)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass,
+      Stages, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand (output gradient, analytic variant)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv3dWgradOutputGradientTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA,
+      ThreadMapA
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand (activation, analytic variant)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv3dWgradActivationTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB,
+      ThreadMapB
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components. NOTE(review): OperatorClass is generic, yet MmaTensorOp is required here
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Define the Mma (threadblock-scoped multistage mainloop)
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    arch::CacheOperation::Always,
+    MmaPolicy,
+    Stages 
+  >;
+
+  // Define the epilogue
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    1,  // NOTE(review): presumably PartitionsK - confirm against DefaultEpilogueTensorOp
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Define the kernel (weight gradient over a Conv3dProblemSize)
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kWgrad,
+    Conv3dProblemSize
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Defines a kernel for Conv3dWgrad specialization for Analytic IteratorAlgorithm and a two-stage
+/// pipeline (Stages fixed to 2).
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename OperatorClass,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag
+>
+struct DefaultConv3dWgrad <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  OperatorClass,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,  // Stages
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic  // StrideSupport is omitted, so only the primary template's default matches
+>  {
+
+  // Define the core components from GEMM (A treated as ColumnMajor, B as RowMajor, 2 stages)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass,
+      2, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand (output gradient, analytic, wrapped in TileIterator)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv3dWgradOutputGradientTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA,
+        ThreadMapA
+      >
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand (activation, analytic, wrapped in TileIterator)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv3dWgradActivationTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB,
+        ThreadMapB
+      >
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components. NOTE(review): OperatorClass is generic, yet MmaTensorOp is required here
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Define the Mma (threadblock-scoped pipelined mainloop)
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+
+  // Define the epilogue (selected through detail::DefaultConvEpilogue, keyed on ArchTag)
+  using Epilogue = typename detail::DefaultConvEpilogue<
+    ArchTag,
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    1,  // NOTE(review): presumably PartitionsK - confirm against detail::DefaultConvEpilogue
+    EpilogueOutputOp
+  >::Epilogue;
+
+  // Define the kernel (weight gradient over a Conv3dProblemSize)
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kWgrad,
+    Conv3dProblemSize
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv3dWgrad specialization for Optimized IteratorAlgorithm and multistage
+/// pipeline.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename OperatorClass,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag
+>
+struct DefaultConv3dWgrad <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  OperatorClass,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized  // StrideSupport is omitted, so only the primary template's default matches
+>  {
+
+  // Define the core components from GEMM (A treated as ColumnMajor, B as RowMajor)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass,
+      Stages, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand (output gradient, optimized variant)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv3dWgradOutputGradientTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA,
+      ThreadMapA
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand (activation, optimized variant)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv3dWgradActivationTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB,
+      ThreadMapB
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components. NOTE(review): OperatorClass is generic, yet MmaTensorOp is required here
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Define the Mma (threadblock-scoped multistage mainloop)
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    arch::CacheOperation::Always,
+    MmaPolicy,
+    Stages 
+  >;
+
+  // Define the epilogue
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    1,  // NOTE(review): presumably PartitionsK - confirm against DefaultEpilogueTensorOp
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Define the kernel (weight gradient over a Conv3dProblemSize)
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kWgrad,
+    Conv3dProblemSize
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Defines a kernel for Conv3dWgrad specialization for Optimized IteratorAlgorithm and a
+/// two-stage pipeline (the Stages argument is fixed to 2; OperatorClass stays generic).
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename OperatorClass,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag
+>
+struct DefaultConv3dWgrad <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  OperatorClass,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized
+>  {
+
+  // Core GEMM components: A is mapped as column-major, B as row-major, with 2 stages
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass,
+      2, MathOperatorTag>;
+
+  // A operand: output-gradient tiles (kM x kK); TileIterator wraps the access iterator
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv3dWgradOutputGradientTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA,
+        ThreadMapA
+      >
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // B operand: activation tiles (kK x kN); TileIterator wraps the access iterator
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv3dWgradActivationTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB,
+        ThreadMapB
+      >
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level MMA operator and threadblock MMA policy, both taken from MmaCore
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Threadblock-scoped mainloop: ImplicitGemmPipelined (it takes no Stages argument here)
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+
+  // Epilogue chosen per ArchTag by detail::DefaultConvEpilogue (the literal 1 is presumably PartitionsK - confirm)
+  using Epilogue = typename detail::DefaultConvEpilogue<
+    ArchTag,
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    1,
+    EpilogueOutputOp
+  >::Epilogue;
+
+  // Final kernel: implicit GEMM convolution in kWgrad mode over Conv3dProblemSize
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kWgrad,
+    Conv3dProblemSize
+  >;
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//                         OpClassSimt convolutions
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Defines a kernel for Conv3dWgrad specialization for OpClassSimt, Analytic IteratorAlgorithm,
+/// multi-stage pipeline, and FFMA-based mainloop for SM80 (ArchTag itself is left generic)
+
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag
+>
+struct DefaultConv3dWgrad <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic
+> {
+
+  // Core GEMM components: A is mapped as column-major, B as row-major, SIMT operator class
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      Stages, MathOperatorTag>;
+
+  // A operand: output-gradient tiles (kM x kK) read through the Analytic access iterator
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv3dWgradOutputGradientTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA,
+      ThreadMapA
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // B operand: activation tiles (kK x kN) read through the Analytic access iterator
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv3dWgradActivationTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB,
+      ThreadMapB
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level SIMT MMA operator and threadblock MMA policy, both taken from MmaCore
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Multistage mainloop: both operands use CacheOperation::Always; depth is `Stages`
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    arch::CacheOperation::Always,
+    MmaPolicy,
+    Stages 
+  >;
+
+  // SIMT epilogue, parameterized by the output op and its EpilogueOutputOp::kCount
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Final kernel: implicit GEMM convolution in kWgrad mode over Conv3dProblemSize
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kWgrad,
+    Conv3dProblemSize
+  >;
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv3dWgrad specialization for OpClassSimt, Optimized IteratorAlgorithm,
+/// multi-stage pipeline, and FFMA-based mainloop for SM80 (ArchTag itself is left generic)
+
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag
+>
+struct DefaultConv3dWgrad <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized
+> {
+
+  // Core GEMM components: A is mapped as column-major, B as row-major, SIMT operator class
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      Stages, MathOperatorTag>;
+
+  // A operand: output-gradient tiles (kM x kK) read through the Optimized access iterator
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv3dWgradOutputGradientTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA,
+      ThreadMapA
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // B operand: activation tiles (kK x kN) read through the Optimized access iterator
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv3dWgradActivationTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB,
+      ThreadMapB
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level SIMT MMA operator and threadblock MMA policy, both taken from MmaCore
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Multistage mainloop: both operands use CacheOperation::Always; depth is `Stages`
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    arch::CacheOperation::Always,
+    MmaPolicy,
+    Stages 
+  >;
+
+  // SIMT epilogue, parameterized by the output op and its EpilogueOutputOp::kCount
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Final kernel: implicit GEMM convolution in kWgrad mode over Conv3dProblemSize
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kWgrad,
+    Conv3dProblemSize
+  >;
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv3dWgrad specialization for OpClassSimt, Analytic IteratorAlgorithm,
+/// 2 stage pipeline (Stages fixed to 2), and FFMA-based mainloop for SM50 (ArchTag is generic)
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag
+>
+struct DefaultConv3dWgrad <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic
+> {
+
+  // Core GEMM components: A is mapped as column-major, B as row-major, SIMT, 2 stages
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      2, MathOperatorTag>;
+
+  // A operand: output-gradient tiles (kM x kK); TileIterator wraps the Analytic access iterator
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv3dWgradOutputGradientTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA,
+        ThreadMapA
+      >
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // B operand: activation tiles (kK x kN); TileIterator wraps the Analytic access iterator
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv3dWgradActivationTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB,
+        ThreadMapB
+      >
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level SIMT MMA operator and threadblock MMA policy, both taken from MmaCore
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Threadblock-scoped mainloop: ImplicitGemmPipelined (it takes no Stages argument here)
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+
+  // SIMT epilogue, parameterized by the output op and its EpilogueOutputOp::kCount
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Final kernel: implicit GEMM convolution in kWgrad mode over Conv3dProblemSize
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kWgrad,
+    Conv3dProblemSize
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv3dWgrad specialization for OpClassSimt, Optimized IteratorAlgorithm,
+/// 2 stage pipeline (Stages fixed to 2), and FFMA-based mainloop for SM50 (ArchTag is generic)
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag
+>
+struct DefaultConv3dWgrad <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized
+> {
+
+  // Core GEMM components: A is mapped as column-major, B as row-major, SIMT, 2 stages
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor,
+      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      2, MathOperatorTag>;
+
+  // A operand: output-gradient tiles (kM x kK); TileIterator wraps the Optimized access iterator
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv3dWgradOutputGradientTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA,
+        ThreadMapA
+      >
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // B operand: activation tiles (kK x kN); TileIterator wraps the Optimized access iterator
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv3dWgradActivationTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB,
+        ThreadMapB
+      >
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level SIMT MMA operator and threadblock MMA policy, both taken from MmaCore
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Threadblock-scoped mainloop: ImplicitGemmPipelined (it takes no Stages argument here)
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+
+  // SIMT epilogue, parameterized by the output op and its EpilogueOutputOp::kCount
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Final kernel: implicit GEMM convolution in kWgrad mode over Conv3dProblemSize
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kWgrad,
+    Conv3dProblemSize
+  >;
+
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_deconv2d.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_deconv2d.h
new file mode 100755
index 000000000..4db152cd7
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_deconv2d.h
@@ -0,0 +1,999 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+    Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped 
+      matrix multiply-add with the appropriate threadblock-scoped epilogue.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/conv/kernel/default_conv2d.h"
+
+#include "cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h"
+#include "cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h" 
+#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h"
+#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h"
+#include "cutlass/conv/threadblock/conv2d_tile_iterator.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Primary template selecting a Deconv2d kernel; declared only, with no definition at this point.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename OperatorClass,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,
+  conv::StrideSupport StrideSupport = StrideSupport::kStrided,
+  /// Access granularity of A matrix in units of elements (default: 128 bits' worth of ElementA)
+  int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value,
+  /// Access granularity of B matrix in units of elements (default: 128 bits' worth of ElementB)
+  int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value
+> struct DefaultDeconv2d;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//                            OpClassSimt convolutions 
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Defines a kernel for Deconv2d specialization for OpClassSimt, Analytic IteratorAlgorithm, unity
+/// stride, multi-stage pipeline, and FFMA-based mainloop for SM80 (ArchTag is left generic)
+
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultDeconv2d <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic,
+  conv::StrideSupport::kUnity,
+  AlignmentA,
+  AlignmentB
+> {
+
+  // Core GEMM components: A row-major, B column-major (AlignmentA/B are not referenced in this body)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      Stages, MathOperatorTag>;
+
+  // A operand: reuses the Conv2d dgrad output-gradient Analytic iterator (kM x kK), unity stride
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA,
+      ThreadMapA,
+      conv::StrideSupport::kUnity
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // B operand: reuses the Conv2d fprop filter Analytic iterator (kK x kN) with IsDeconv = true
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB, LayoutB,
+      ThreadMapB,
+      cutlass::AlignedArray<ElementB, ThreadMapB::kElementsPerAccess>,
+      conv::GroupMode::kNone,
+      true /*IsDeconv*/
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level SIMT MMA operator and threadblock MMA policy, both taken from MmaCore
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Multistage mainloop: both operands use CacheOperation::Always; depth is `Stages`
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    arch::CacheOperation::Always,
+    MmaPolicy,
+    Stages 
+  >;
+
+  // SIMT epilogue. NOTE(review): StrideSupport::kStrided is passed although this is the kUnity case - confirm
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount,
+    false,
+    layout::NoPermute,
+    StrideSupport::kStrided,
+    4
+  >::Epilogue;
+
+  // Final kernel: implicit GEMM convolution in kDeconv mode (no explicit problem-size argument)
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDeconv
+  >;
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultDeconv2d <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic,
+  conv::StrideSupport::kStrided,
+  AlignmentA,
+  AlignmentB
+> {
+
+  // Deconv2d for SIMT / Analytic / kStrided / multistage. Core GEMM components: A row-major, B column-major
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      Stages, MathOperatorTag>;
+
+  // A operand: reuses the Conv2d dgrad output-gradient Analytic iterator (kM x kK), strided variant
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA,
+      ThreadMapA,
+      conv::StrideSupport::kStrided
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // B operand: reuses the Conv2d fprop filter Analytic iterator (kK x kN) with IsDeconv = true
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB, LayoutB,
+      ThreadMapB,
+      cutlass::AlignedArray<ElementB, ThreadMapB::kElementsPerAccess>,
+      conv::GroupMode::kNone,
+      true /*IsDeconv*/
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level SIMT MMA operator and threadblock MMA policy, both taken from MmaCore
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Multistage mainloop: both operands use CacheOperation::Always; depth is `Stages`
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    arch::CacheOperation::Always,
+    MmaPolicy,
+    Stages 
+  >;
+
+  // Strided-dgrad flavour of the SIMT epilogue, parameterized by EpilogueOutputOp::kCount
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimtStridedDgrad<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Final kernel: the StridedDgrad implicit GEMM convolution kernel, run in kDeconv mode
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDeconv
+  >;
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Deconv2d specialization for OpClassSimt, Optimized IteratorAlgorithm, unity
+/// stride, multi-stage pipeline, and FFMA-based mainloop for SM80 (ArchTag is left generic)
+
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultDeconv2d <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized,
+  StrideSupport::kUnity,
+  AlignmentA,
+  AlignmentB
+> {
+
+  // Core GEMM components: A row-major, B column-major (AlignmentA/B are not referenced in this body)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      Stages, MathOperatorTag>;
+
+  // A operand: reuses the Conv2d dgrad output-gradient Optimized iterator (kM x kK), unity stride
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA,
+      ThreadMapA,
+      StrideSupport::kUnity
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // B operand: reuses the Conv2d fprop filter Optimized iterator (kK x kN) with IsDeconv = true
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB, LayoutB,
+      ThreadMapB,
+      cutlass::AlignedArray<ElementB, ThreadMapB::kElementsPerAccess>,
+      true /*IsDeconv*/
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level SIMT MMA operator and threadblock MMA policy, both taken from MmaCore
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Multistage mainloop: both operands use CacheOperation::Always; depth is `Stages`
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    arch::CacheOperation::Always,
+    MmaPolicy,
+    Stages 
+  >;
+
+  // SIMT epilogue. NOTE(review): StrideSupport::kStrided is passed although this is the kUnity case - confirm
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount,
+    false,
+    layout::NoPermute,
+    StrideSupport::kStrided,
+    4
+  >::Epilogue;
+
+  // Final kernel: implicit GEMM convolution in kDeconv mode (no explicit problem-size argument)
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDeconv
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultDeconv2d <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized,
+  conv::StrideSupport::kStrided,
+  AlignmentA,
+  AlignmentB
+> {
+
+  // Deconv2d for SIMT / Optimized / kStrided / multistage. Core GEMM components: A row-major, B column-major
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      Stages, MathOperatorTag>;
+
+  // A operand: reuses the Conv2d dgrad output-gradient Optimized iterator (kM x kK), strided variant
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA,
+      ThreadMapA,
+      conv::StrideSupport::kStrided
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // B operand: reuses the Conv2d fprop filter Optimized iterator (kK x kN) with IsDeconv = true
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB, LayoutB,
+      ThreadMapB,
+      cutlass::AlignedArray<ElementB, ThreadMapB::kElementsPerAccess>,
+      true /*IsDeconv*/
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level SIMT MMA operator and threadblock MMA policy, both taken from MmaCore
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Multistage mainloop: both operands use CacheOperation::Always; depth is `Stages`
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    arch::CacheOperation::Always,
+    MmaPolicy,
+    Stages 
+  >;
+
+  // Strided-dgrad flavour of the SIMT epilogue, parameterized by EpilogueOutputOp::kCount
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimtStridedDgrad<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Final kernel: the StridedDgrad implicit GEMM convolution kernel, run in kDeconv mode
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDeconv
+  >;
+
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Deconv2d specialization for Analytic IteratorAlgorithm, unity stride,
+/// 2 stage pipeline, and FFMA-based (arch::OpClassSimt) mainloop for SM50
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultDeconv2d <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic,
+  conv::StrideSupport::kUnity,
+  AlignmentA,
+  AlignmentB
+> {
+
+  // Core GEMM components: A is passed as row-major, B as column-major, SIMT op class, 2 stages
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      2, MathOperatorTag>;
+
+  // Operand A: kM x kK tiles via the dgrad output-gradient analytic iterator, wrapped in TileIterator
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA,
+        ThreadMapA,
+        conv::StrideSupport::kUnity
+      >
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Operand B: kK x kN tiles via the fprop filter analytic iterator (no groups, IsDeconv = true)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB, LayoutB,
+        ThreadMapB,
+        cutlass::AlignedArray<ElementB, ThreadMapB::kElementsPerAccess>,
+        conv::GroupMode::kNone,
+        true /*IsDeconv*/
+      >
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Threadblock-scoped mainloop: ImplicitGemmPipelined (this specialization fixes Stages to 2)
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+
+  // Epilogue: EpilogueOutputOp::kCount elements per access; the trailing 4 is presumably the tensor rank - TODO confirm
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount,
+    false,
+    layout::NoPermute,
+    StrideSupport::kStrided,
+    4
+  >::Epilogue;
+
+  // Kernel: mainloop + epilogue composed as an implicit GEMM convolution tagged conv::Operator::kDeconv
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDeconv
+  >;
+
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Defines a kernel for Deconv2d specialization for Analytic IteratorAlgorithm, strided problems, 2 stage pipeline, SIMT mainloop
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultDeconv2d <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic,
+  conv::StrideSupport::kStrided,
+  AlignmentA,
+  AlignmentB
+> {
+
+  // Core GEMM components: A is passed as row-major, B as column-major, SIMT op class, 2 stages
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      2, MathOperatorTag>;
+
+  // Operand A: kM x kK tiles via the dgrad output-gradient analytic iterator, wrapped in TileIteratorStridedDgrad
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::TileIteratorStridedDgrad<
+      cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA,
+        ThreadMapA,
+        conv::StrideSupport::kStrided
+      >
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Operand B: kK x kN tiles via the fprop filter analytic iterator (no groups, IsDeconv = true)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::TileIteratorStridedDgrad<
+      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB, LayoutB,
+        ThreadMapB,
+        cutlass::AlignedArray<ElementB, ThreadMapB::kElementsPerAccess>,
+        conv::GroupMode::kNone,
+        true /*IsDeconv*/
+      >
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Threadblock-scoped mainloop: ImplicitGemmPipelined (this specialization fixes Stages to 2)
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+
+  // Epilogue: the strided-dgrad SIMT default, with EpilogueOutputOp::kCount elements per access
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimtStridedDgrad<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Kernel: mainloop + epilogue composed by the strided-dgrad kernel template, tagged conv::Operator::kDeconv
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDeconv
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Deconv2d specialization for Optimized IteratorAlgorithm, unity stride,
+/// 2 stage pipeline, and FFMA-based (arch::OpClassSimt) mainloop for SM50
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultDeconv2d <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized,
+  StrideSupport::kUnity,
+  AlignmentA,
+  AlignmentB
+> {
+
+  // Core GEMM components: A is passed as row-major, B as column-major, SIMT op class, 2 stages
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      2, MathOperatorTag>;
+
+  // Operand A: kM x kK tiles via the dgrad output-gradient optimized iterator, wrapped in TileIterator
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA,
+        ThreadMapA,
+        StrideSupport::kUnity
+      >
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Operand B: kK x kN tiles via the fprop filter optimized iterator (IsDeconv = true, no GroupMode argument)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB, LayoutB,
+        ThreadMapB,
+        cutlass::AlignedArray<ElementB, ThreadMapB::kElementsPerAccess>,
+        true /*IsDeconv*/
+      >
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Threadblock-scoped mainloop: ImplicitGemmPipelined (this specialization fixes Stages to 2)
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+
+  // Epilogue: EpilogueOutputOp::kCount elements per access; the trailing 4 is presumably the tensor rank - TODO confirm
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount,
+    false,
+    layout::NoPermute,
+    StrideSupport::kStrided,
+    4
+  >::Epilogue;
+
+  // Kernel: mainloop + epilogue composed as an implicit GEMM convolution tagged conv::Operator::kDeconv
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDeconv
+  >;
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultDeconv2d <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized,
+  conv::StrideSupport::kStrided,
+  AlignmentA,
+  AlignmentB
+> {
+  // Deconv2d specialization: Optimized IteratorAlgorithm, strided problems, 2 stage pipeline, SIMT mainloop
+  // Core GEMM components: A is passed as row-major, B as column-major, SIMT op class, 2 stages
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      2, MathOperatorTag>;
+
+  // Operand A: kM x kK tiles via the dgrad output-gradient optimized iterator, wrapped in TileIteratorStridedDgrad
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::TileIteratorStridedDgrad<
+      cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA,
+        ThreadMapA,
+        conv::StrideSupport::kStrided
+      >
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Operand B: kK x kN tiles via the fprop filter optimized iterator (IsDeconv = true, no GroupMode argument)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::TileIteratorStridedDgrad<
+      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB, LayoutB,
+        ThreadMapB,
+        cutlass::AlignedArray<ElementB, ThreadMapB::kElementsPerAccess>,
+        true /*IsDeconv*/
+      >
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Threadblock-scoped mainloop: ImplicitGemmPipelined (this specialization fixes Stages to 2)
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+
+  // Epilogue: the strided-dgrad SIMT default, with EpilogueOutputOp::kCount elements per access
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimtStridedDgrad<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Kernel: mainloop + epilogue composed by the strided-dgrad kernel template, tagged conv::Operator::kDeconv
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDeconv
+  >;
+
+};
+
+} // namespace kernel
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_deconv2d_with_broadcast.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_deconv2d_with_broadcast.h
new file mode 100755
index 000000000..d11432ed3
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_deconv2d_with_broadcast.h
@@ -0,0 +1,305 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+  \brief 
+    Defines a Deconv2d kernel with a broadcast-fused epilogue, built on an existing DefaultDeconv2d kernel.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/conv/kernel/default_deconv2d.h"
+#include "cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h"
+#include "cutlass/epilogue/threadblock/epilogue_with_broadcast.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Primary template: reuses the DefaultDeconv2d mainloop and substitutes the WithBroadcastTensorOp conv epilogue
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename OperatorClass,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,
+  conv::StrideSupport StrideSupport = StrideSupport::kStrided,
+  /// Access granularity of A matrix in units of elements
+  int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value,
+  /// Access granularity of B matrix in units of elements
+  int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value
+>
+struct DefaultDeconv2dWithBroadcast {
+  // The plain (non-broadcast) Deconv2d kernel; only its Mma and Epilogue traits are read below
+  using ImplicitGemmBase = typename DefaultDeconv2d<
+    ElementA, LayoutA,
+    ElementB, LayoutB,
+    ElementC, LayoutC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    MathOperatorTag,
+    IteratorAlgorithm,
+    StrideSupport,
+    AlignmentA,
+    AlignmentB
+  >::Kernel;
+
+  // Broadcast epilogue: shape, warp operator, K partitions and access width are taken from the base epilogue
+  using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastTensorOp<
+    ArchTag,
+    typename ImplicitGemmBase::Epilogue::Shape,
+    typename ImplicitGemmBase::Epilogue::WarpMmaOperator,
+    ImplicitGemmBase::Epilogue::kPartitionsK,
+    ElementC,
+    typename EpilogueOutputOp::ElementT,
+    typename EpilogueOutputOp::ElementVector,
+    EpilogueOutputOp,
+    ImplicitGemmBase::Epilogue::kElementsPerAccess
+  >::Epilogue;
+
+  // Kernel: base mainloop + broadcast epilogue in the fused-epilogue kernel template, tagged conv::Operator::kDeconv
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue<
+    typename ImplicitGemmBase::Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDeconv
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//                            OpClassSimt convolutions
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Defines a Deconv2d-with-broadcast kernel specialization for arch::OpClassSimt and
+/// conv::StrideSupport::kUnity; Stages and IteratorAlgorithm are forwarded to DefaultDeconv2d
+
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::IteratorAlgorithm IteratorAlgorithm,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultDeconv2dWithBroadcast <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm,
+  conv::StrideSupport::kUnity,
+  AlignmentA,
+  AlignmentB
+> {
+  // The plain (non-broadcast) SIMT unity-stride Deconv2d kernel; only its Mma and Epilogue traits are read below
+  using ImplicitGemmBase = typename DefaultDeconv2d<
+    ElementA, LayoutA,
+    ElementB, LayoutB,
+    ElementC, LayoutC,
+    ElementAccumulator,
+    arch::OpClassSimt,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    MathOperatorTag,
+    IteratorAlgorithm,
+    conv::StrideSupport::kUnity,
+    AlignmentA,
+    AlignmentB
+  >::Kernel;
+
+  // SIMT broadcast epilogue: shape, warp operator and access width are taken from the base epilogue
+  using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastSimt<
+    ArchTag,
+    typename ImplicitGemmBase::Epilogue::Shape,
+    typename ImplicitGemmBase::Epilogue::WarpMmaOperator,
+    ElementC,
+    typename EpilogueOutputOp::ElementT,
+    typename EpilogueOutputOp::ElementVector,
+    EpilogueOutputOp,
+    ImplicitGemmBase::Epilogue::kElementsPerAccess
+  >::Epilogue;
+
+  // Kernel: base mainloop + broadcast epilogue in the fused-epilogue kernel template, tagged conv::Operator::kDeconv
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue<
+    typename ImplicitGemmBase::Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDeconv
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Deconv2d-with-broadcast specialization for arch::OpClassSimt and conv::StrideSupport::kStrided
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::IteratorAlgorithm IteratorAlgorithm,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultDeconv2dWithBroadcast <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm,
+  conv::StrideSupport::kStrided,
+  AlignmentA,
+  AlignmentB
+> {
+  // The plain (non-broadcast) SIMT strided Deconv2d kernel; only its Mma and Epilogue traits are read below
+  using ImplicitGemmBase = typename DefaultDeconv2d<
+    ElementA, LayoutA,
+    ElementB, LayoutB,
+    ElementC, LayoutC,
+    ElementAccumulator,
+    arch::OpClassSimt,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    MathOperatorTag,
+    IteratorAlgorithm,
+    conv::StrideSupport::kStrided,
+    AlignmentA,
+    AlignmentB
+  >::Kernel;
+
+  // SIMT strided-dgrad broadcast epilogue: shape, warp operator and access width come from the base epilogue
+  using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastSimtStridedDgrad<
+    ArchTag,
+    typename ImplicitGemmBase::Epilogue::Shape,
+    typename ImplicitGemmBase::Epilogue::WarpMmaOperator,
+    ElementC,
+    typename EpilogueOutputOp::ElementT,
+    typename EpilogueOutputOp::ElementVector,
+    EpilogueOutputOp,
+    ImplicitGemmBase::Epilogue::kElementsPerAccess
+  >::Epilogue;
+
+  // Kernel: base mainloop + broadcast epilogue in the fused-epilogue kernel template, tagged conv::Operator::kDeconv
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue<
+    typename ImplicitGemmBase::Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDeconv
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+}  // namespace kernel
+}  // namespace conv
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_deconv3d.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_deconv3d.h
new file mode 100755
index 000000000..70800c7af
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_deconv3d.h
@@ -0,0 +1,541 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+    Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped 
+      matrix multiply-add with the appropriate threadblock-scoped epilogue.  
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/conv/kernel/default_conv2d.h"
+
+#include "cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h"
+#include "cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h"
+
+#include "cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h"
+#include "cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h"
+#include "cutlass/conv/threadblock/conv2d_tile_iterator.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Primary template for Deconv3d kernels (declaration only); kernels are supplied by partial specializations
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename OperatorClass,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,
+  conv::StrideSupport StrideSupport = StrideSupport::kStrided  // NOTE(review): confirm a specialization exists for the defaults (kOptimized + kStrided)
+> struct DefaultDeconv3d;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//                            OpClassSimt convolutions 
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Defines a kernel for Deconv3d specialization for Analytic IteratorAlgorithm, strided problems, SIMT multistage mainloop
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag
+>
+struct DefaultDeconv3d <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic,
+  conv::StrideSupport::kStrided
+> {
+
+  // Core GEMM components: A is passed as row-major, B as column-major, SIMT op class, Stages pipeline stages
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      Stages, MathOperatorTag>;
+
+  // Operand A: kM x kK tiles via the 3-D dgrad output-gradient analytic iterator (used directly, no TileIterator wrapper)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA,
+      ThreadMapA,
+      conv::StrideSupport::kStrided
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Operand B: kK x kN tiles via the 3-D fprop filter analytic iterator with IsDeconv = true
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB,
+      ThreadMapB,
+      true /*IsDeconv*/
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Threadblock-scoped mainloop: ImplicitGemmMultistage, CacheOperation::Always for both operands
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    arch::CacheOperation::Always,
+    MmaPolicy,
+    Stages 
+  >;
+
+  // Epilogue: EpilogueOutputOp::kCount elements per access; the trailing 5 is presumably the tensor rank - TODO confirm
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount,
+    false,
+    layout::NoPermute,
+    StrideSupport::kStrided,
+    5
+  >::Epilogue;
+
+  // Kernel: implicit GEMM convolution tagged conv::Operator::kDeconv, using Conv3dProblemSize
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDeconv,
+    Conv3dProblemSize
+  >;
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Deconv3d specialization for Optimized IteratorAlgorithm, unity stride,
+/// multi-stage pipeline, and FFMA-based (arch::OpClassSimt) mainloop for SM80
+
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag
+>
+struct DefaultDeconv3d <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized,
+  StrideSupport::kUnity
+> {
+
+  // Core GEMM components: A is passed as row-major, B as column-major, SIMT op class, Stages pipeline stages
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      Stages, MathOperatorTag>;
+
+  // Operand A: kM x kK tiles via the 3-D dgrad output-gradient optimized iterator (used directly, no TileIterator wrapper)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA,
+      ThreadMapA,
+      StrideSupport::kUnity
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Operand B: kK x kN tiles via the 3-D fprop filter optimized iterator with IsDeconv = true
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB,
+      LayoutB,
+      ThreadMapB,
+      true /*IsDeconv*/
+      // NOTE(review): this argument list includes LayoutB, while the Analytic Conv3d filter
+      // iterator is instantiated without it - confirm against the iterator's declaration
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Threadblock-scoped mainloop: ImplicitGemmMultistage, CacheOperation::Always for both operands
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    arch::CacheOperation::Always,
+    MmaPolicy,
+    Stages 
+  >;
+
+  // Epilogue: EpilogueOutputOp::kCount elements per access; the trailing 5 is presumably the tensor rank - TODO confirm
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount,
+    false,
+    layout::NoPermute,
+    StrideSupport::kStrided,
+    5
+  >::Epilogue;
+
+  // Kernel: implicit GEMM convolution tagged conv::Operator::kDeconv, using Conv3dProblemSize
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDeconv,
+    Conv3dProblemSize
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag
+>
+struct DefaultDeconv3d <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic,
+  conv::StrideSupport::kStrided
+> {
+  // Deconv3d specialization for OpClassSimt, a 2 stage pipeline, kAnalytic iterators and kStrided stride support.
+  // Core components (thread maps, shared-memory iterators, warp-level MMA, policy) taken from the GEMM DefaultMmaCore
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      2, MathOperatorTag>;
+
+  // Iterator over tiles of the A operand: the analytic Dgrad output-gradient access iterator
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    // NOTE: the TileIteratorStridedDgrad<> wrapper is commented out; the access iterator is used directly
+      cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA,
+        ThreadMapA,
+        conv::StrideSupport::kStrided
+      // (closing bracket of the disabled wrapper)
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Iterator over tiles of the B operand: the analytic Fprop filter access iterator with IsDeconv set to true
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    // NOTE: the TileIteratorStridedDgrad<> wrapper is commented out; the access iterator is used directly
+      cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB,
+        ThreadMapB,
+        true /*IsDeconv*/
+      // (closing bracket of the disabled wrapper)
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components, re-exported from MmaCore
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Define the Mma: the pipelined implicit-GEMM mainloop (this specialization fixes Stages to 2)
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+
+  // Define the epilogue. NOTE(review): (false, NoPermute, kStrided, 5) are positional - confirm against DefaultEpilogueSimt
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount,
+    false,
+    layout::NoPermute,
+    StrideSupport::kStrided,
+    5
+  >::Epilogue;
+
+  // Define the kernel: implicit-GEMM convolution tagged conv::Operator::kDeconv over a Conv3dProblemSize
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDeconv,
+    Conv3dProblemSize
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Deconv3d specialization for Optimized IteratorAlgorithm, unity stride support,
+/// 2 stage pipeline, and FFMA-based (OpClassSimt) mainloop for SM50; ArchTag itself stays a template parameter
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag
+>
+struct DefaultDeconv3d <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized,
+  StrideSupport::kUnity
+> {
+
+  // Core components (thread maps, shared-memory iterators, warp-level MMA, policy) taken from the GEMM DefaultMmaCore
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      2, MathOperatorTag>;
+
+  // Iterator over tiles of the A operand: the optimized Dgrad output-gradient access iterator (kUnity)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    // NOTE: the TileIterator<> wrapper is commented out; the access iterator is used directly
+      cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA,
+        ThreadMapA,
+        StrideSupport::kUnity
+      // (closing bracket of the disabled wrapper)
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Iterator over tiles of the B operand: the optimized Fprop filter access iterator with IsDeconv set to true
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using IteratorB =
+    // NOTE: the TileIterator<> wrapper is commented out; the access iterator is used directly
+      cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB,
+        LayoutB,
+        ThreadMapB,
+        true /*IsDeconv*/
+        // NOTE(review): a disabled alternative argument list (ThreadMapB, StrideSupport::kUnity) was left here;
+        // the live argument list ends with the IsDeconv flag instead.
+      // (closing bracket of the disabled wrapper)
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components, re-exported from MmaCore
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Define the Mma: the pipelined implicit-GEMM mainloop (this specialization fixes Stages to 2)
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+
+  // Define the epilogue. NOTE(review): it is built with kStrided although this specialization is kUnity - confirm intent
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount,
+    false,
+    layout::NoPermute,
+    StrideSupport::kStrided,
+    5
+  >::Epilogue;
+
+  // Define the kernel: implicit-GEMM convolution tagged conv::Operator::kDeconv over a Conv3dProblemSize
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDeconv,
+    Conv3dProblemSize
+  >;
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_deconv3d_with_broadcast.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_deconv3d_with_broadcast.h
new file mode 100755
index 000000000..affe7a06f
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_deconv3d_with_broadcast.h
@@ -0,0 +1,309 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+  \brief 
+    Defines a Deconv3d kernel with a broadcast epilogue, built on top of the matching DefaultDeconv3d kernel.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/conv/kernel/default_deconv3d.h"
+#include "cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h"
+#include "cutlass/epilogue/threadblock/epilogue_with_broadcast.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename OperatorClass,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,
+  conv::StrideSupport StrideSupport = StrideSupport::kStrided,
+  /// Access granularity of A matrix in units of elements (not referenced in the body of this primary template)
+  int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value,
+  /// Access granularity of B matrix in units of elements (not referenced in the body of this primary template)
+  int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value
+>
+struct DefaultDeconv3dWithBroadcast {
+  // Primary template: take the matching DefaultDeconv3d kernel as the base and swap in a broadcast epilogue.
+  using ImplicitGemmBase = typename DefaultDeconv3d<
+    ElementA, LayoutA,
+    ElementB, LayoutB,
+    ElementC, LayoutC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    MathOperatorTag,
+    IteratorAlgorithm,
+    StrideSupport
+  >::Kernel;
+
+  // Define epilogue: TensorOp broadcast epilogue reusing the base epilogue's shape, warp operator, K partitions and access width
+  using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastTensorOp<
+    ArchTag,
+    typename ImplicitGemmBase::Epilogue::Shape,
+    typename ImplicitGemmBase::Epilogue::WarpMmaOperator,
+    ImplicitGemmBase::Epilogue::kPartitionsK,
+    ElementC,
+    typename EpilogueOutputOp::ElementT,
+    typename EpilogueOutputOp::ElementVector,
+    EpilogueOutputOp,
+    ImplicitGemmBase::Epilogue::kElementsPerAccess
+  >::Epilogue;
+
+  // Define the kernel: fused-epilogue implicit GEMM that reuses the base kernel's Mma, tagged kDeconv over Conv3dProblemSize
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue<
+    typename ImplicitGemmBase::Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDeconv,
+    Conv3dProblemSize
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//                            OpClassSimt convolutions
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Defines a kernel for Deconv3d-with-broadcast specialization for OpClassSimt and unity stride support;
+/// IteratorAlgorithm, Stages and ArchTag are left as template parameters (not fixed to Analytic or SM80)
+
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::IteratorAlgorithm IteratorAlgorithm,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultDeconv3dWithBroadcast <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm,
+  conv::StrideSupport::kUnity,
+  AlignmentA,
+  AlignmentB
+> {
+  // Base kernel: the DefaultDeconv3d selected by the same arguments (AlignmentA/AlignmentB are not forwarded)
+  using ImplicitGemmBase = typename DefaultDeconv3d<
+    ElementA, LayoutA,
+    ElementB, LayoutB,
+    ElementC, LayoutC,
+    ElementAccumulator,
+    arch::OpClassSimt,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    MathOperatorTag,
+    IteratorAlgorithm,
+    conv::StrideSupport::kUnity
+  >::Kernel;
+
+  // Define epilogue: SIMT broadcast epilogue. NOTE(review): built with kStrided and 5 in a kUnity specialization - confirm
+  using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastSimt<
+    ArchTag,
+    typename ImplicitGemmBase::Epilogue::Shape,
+    typename ImplicitGemmBase::Epilogue::WarpMmaOperator,
+    ElementC,
+    typename EpilogueOutputOp::ElementT,
+    typename EpilogueOutputOp::ElementVector,
+    EpilogueOutputOp,
+    ImplicitGemmBase::Epilogue::kElementsPerAccess,
+    layout::NoPermute,
+    StrideSupport::kStrided,
+    5
+  >::Epilogue;
+
+  // Define the kernel: fused-epilogue implicit GEMM that reuses the base kernel's Mma, tagged kDeconv over Conv3dProblemSize
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue<
+    typename ImplicitGemmBase::Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDeconv,
+    Conv3dProblemSize
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::IteratorAlgorithm IteratorAlgorithm,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultDeconv3dWithBroadcast <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm,
+  conv::StrideSupport::kStrided,
+  AlignmentA,
+  AlignmentB
+> {
+  // OpClassSimt + kStrided specialization. Base kernel: the matching DefaultDeconv3d (AlignmentA/AlignmentB are not forwarded)
+  using ImplicitGemmBase = typename DefaultDeconv3d<
+    ElementA, LayoutA,
+    ElementB, LayoutB,
+    ElementC, LayoutC,
+    ElementAccumulator,
+    arch::OpClassSimt,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    MathOperatorTag,
+    IteratorAlgorithm,
+    conv::StrideSupport::kStrided
+  >::Kernel;
+
+  // Define epilogue: SIMT broadcast epilogue reusing the base epilogue's shape, warp operator and access width
+  using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastSimt<
+    ArchTag,
+    typename ImplicitGemmBase::Epilogue::Shape,
+    typename ImplicitGemmBase::Epilogue::WarpMmaOperator,
+    ElementC,
+    typename EpilogueOutputOp::ElementT,
+    typename EpilogueOutputOp::ElementVector,
+    EpilogueOutputOp,
+    ImplicitGemmBase::Epilogue::kElementsPerAccess,
+    layout::NoPermute,
+    StrideSupport::kStrided,
+    5
+  >::Epilogue;
+
+  // Define the kernel: fused-epilogue implicit GEMM that reuses the base kernel's Mma, tagged kDeconv over Conv3dProblemSize
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue<
+    typename ImplicitGemmBase::Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kDeconv,
+    Conv3dProblemSize
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace conv
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_depthwise_fprop.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_depthwise_fprop.h
new file mode 100755
index 000000000..aa4f2c359
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_depthwise_fprop.h
@@ -0,0 +1,588 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+    Default kernel-level Depthwise implicit GEMM convolution definitions combine threadblock-scoped 
+      matrix multiply-add with the appropriate threadblock-scoped epilogue.  
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/conv/kernel/default_conv2d.h"
+#include "cutlass/conv/kernel/direct_convolution.h"
+
+#include "cutlass/conv/threadblock/depthwise_mma_core_with_lane_access_size.h"
+
+#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h"
+#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h"
+#include "cutlass/conv/threadblock/depthwise_fprop_pipelined.h"
+
+// Direct Conv Related Header files
+#include "cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_optimized.h"
+#include "cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_fixed_stride_dilation.h"
+
+#include "cutlass/conv/threadblock/depthwise_fprop_filter_tile_access_iterator_direct_conv_optimized.h"
+#include "cutlass/conv/threadblock/depthwise_fprop_direct_conv_multistage.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Declares the DepthwiseFprop kernel builder (declaration only; partial specializations provide the definitions)
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename OperatorClass,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kAnalytic,
+  conv::StrideSupport StrideSupport = StrideSupport::kUnity,
+  /// Access granularity of A matrix in units of elements (default: as many elements as fit in 128 bits)
+  int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value,
+  /// Access granularity of B matrix in units of elements (the default expression below evaluates to 1)
+  int AlignmentB = cutlass::sizeof_bits<ElementB>::value / cutlass::sizeof_bits<ElementB>::value
+> struct DefaultDepthwiseFprop;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Declares the DepthwiseFprop direct-convolution kernel builder (declaration only; specializations define it)
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename OperatorClass,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename ThreadBlockOutputShape,
+  typename FilterShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kAnalytic,
+  conv::StrideSupport StrideSupport = StrideSupport::kUnity,
+  // Stride as MatrixShape<Height, Width>; the default <-1, -1> fails the ">= 0" checks of kFixedStrideDilation
+  typename StrideShape = cutlass::MatrixShape<-1, -1>,
+  // Dilation as MatrixShape<Height, Width>; the default <-1, -1> fails the ">= 0" checks of kFixedStrideDilation
+  typename DilationShape =  cutlass::MatrixShape<-1, -1>, 
+  /// Access granularity of A matrix in units of elements (default: as many elements as fit in 128 bits)
+  int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value,
+  /// Access granularity of B matrix in units of elements (default: as many elements as fit in 128 bits)
+  int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value
+> struct DefaultDepthwiseDirect2dConvFprop;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//                            OpClassSimt convolutions
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Defines a kernel for Depthwise specialization for Analytic IteratorAlgorithm, OpClassSimt and a 2 stage pipeline
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultDepthwiseFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag, //   cutlass::arch::OpMultiplyAdd
+  IteratorAlgorithm::kAnalytic,
+  StrideSupport,
+  AlignmentA,
+  AlignmentB
+> {
+  // StrideSupport and AlignmentA are accepted but not referenced below; AlignmentB only sizes AccessTypeB.
+  // Core components from the depthwise MMA core. NOTE(review): 128 and sizeof_bits<ElementB> look like A/B lane access sizes - confirm
+  using MmaCore = typename cutlass::conv::threadblock::DepthwiseMmaCoreWithLaneAccessSize<
+      ThreadblockShape,
+      WarpShape,
+      InstructionShape,
+      ElementA,
+      layout::RowMajor,
+      ElementB,
+      layout::ColumnMajor,
+      ElementAccumulator,
+      layout::RowMajor,
+      arch::OpClassSimt,
+      128,
+      sizeof_bits<ElementB>::value,
+      2,
+      MathOperatorTag>;
+
+  // Iterator over tiles of the A operand: the analytic Conv2d Fprop activation access iterator wrapped in TileIterator
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA, LayoutA,
+        ThreadMapA
+      >
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Iterator over tiles of the B operand: the analytic Conv2d Fprop filter access iterator in GroupMode::kDepthwise
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
+  using IteratorB =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB, LayoutB,
+        ThreadMapB,
+        AccessTypeB,
+        cutlass::conv::GroupMode::kDepthwise
+      >
+    >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components, re-exported from MmaCore
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Define the Mma: the pipelined depthwise Fprop mainloop (this specialization fixes Stages to 2)
+  using Mma = threadblock::DepthwiseFpropPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+
+  // Define the epilogue: the default SIMT epilogue with EpilogueOutputOp::kCount elements per access
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
+    ThreadblockShape,
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Define the kernel: implicit-GEMM convolution tagged kFprop over Conv2dProblemSize in GroupMode::kDepthwise
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop,
+    Conv2dProblemSize,
+    cutlass::conv::GroupMode::kDepthwise
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Defines a kernel for Depthwise specialization for direct 2d conv implementation with the Optimized
+/// IteratorAlgorithm, multiple stage pipeline, and SIMT-based mainloop
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename ThreadBlockOutputShape,
+  typename FilterShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport,
+  typename StrideShape,
+  typename DilationShape,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultDepthwiseDirect2dConvFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  ThreadBlockOutputShape,
+  FilterShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized,
+  StrideSupport,
+  StrideShape,
+  DilationShape,
+  AlignmentA,
+  AlignmentB
+> {
+  // One warp handles the entire set of groups per CTA; the checks below pin the tile shapes accordingly.
+  static_assert(ThreadblockShape::kN == WarpShape::kN,
+                "ThreadblockShape::kN should be same as WarpShape::kN ");
+  static_assert(ThreadblockShape::kK == FilterShape::kCount && WarpShape::kK == FilterShape::kCount,
+                "ThreadblockShape::kK and WarpShape::kK should be same as filter size");
+  static_assert(ThreadblockShape::kM % WarpShape::kM == 0,
+                "ThreadblockShape::kM must be divisible by WarpShape shape::kM");
+  static_assert(ThreadBlockOutputShape::kN == 1, "ThreadBlockOutputShape::kN should be 1");
+
+  // Core components (thread maps, shared-memory iterators, warp-level MMA, policy, ThreadOutputShape) for direct conv
+  using MmaCore = typename cutlass::conv::threadblock::DepthwiseDirectConvMmaCoreWithLaneAccessSize<
+      ThreadblockShape,
+      ThreadBlockOutputShape,
+      FilterShape,
+      WarpShape,
+      InstructionShape,
+      ElementA,
+      layout::RowMajor,
+      ElementB,
+      layout::ColumnMajor,
+      ElementAccumulator,
+      layout::RowMajor,
+      arch::OpClassSimt,
+      128,
+      128,
+      Stages,
+      MathOperatorTag>;
+
+  // Iterator over tiles of the A operand: the optimized direct-conv activation access iterator
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::DepthwiseFpropActivationDirect2dConvTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape::kM,ThreadblockShape::kN>, // < outputShape:KMNK, groups per cta>
+      ThreadBlockOutputShape,
+      ElementA, LayoutA,
+      ThreadMapA
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Iterator over tiles of the B operand (AccessTypeB is declared here but is not passed to IteratorB)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
+  using IteratorB =
+      cutlass::conv::threadblock::DepthwiseFpropFilterDirectConvTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape::kN, FilterShape::kCount>,
+        ElementB, LayoutB,
+        ThreadMapB
+      >;
+  
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components, then the cache operation for A: Global for 128-bit accesses, Always otherwise
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+  using ThreadOutputShape = typename MmaCore::ThreadOutputShape;
+  static cutlass::arch::CacheOperation::Kind const CacheOpA =
+      ((sizeof_bits<ElementA>::value * AlignmentA) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+  // Same rule for operand B: Global when an access is exactly 128 bits wide, Always otherwise
+  static cutlass::arch::CacheOperation::Kind const CacheOpB =
+      ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  // Define the epilogue first: the Mma below takes the Epilogue type as its last template argument
+  using Epilogue = typename epilogue::threadblock::DefaultDirectConvEpilogueSimt<
+    ThreadblockShape, // < outputShape:KMNK, groups per cta>
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount,
+    ThreadOutputShape,
+    ThreadBlockOutputShape
+  >::Epilogue;
+
+  // Define the Mma: the multistage direct-conv depthwise mainloop, parameterized on Stages and the Epilogue
+  using Mma = threadblock::DepthwiseFpropDirectConvMultipleStage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    CacheOpA,
+    IteratorB,
+    SmemIteratorB,
+    CacheOpB,
+    MmaPolicy,
+    Stages,
+    Epilogue
+  >;
+
+  // Define the kernel: direct convolution tagged kFprop over Conv2dProblemSize in GroupMode::kDepthwise
+  using Kernel = cutlass::conv::kernel::DirectConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop,
+    Conv2dProblemSize,
+    cutlass::conv::GroupMode::kDepthwise,
+    ThreadBlockOutputShape
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Partial specialization of DefaultDepthwiseDirect2dConvFprop for arch::OpClassSimt with
+/// IteratorAlgorithm::kFixedStrideDilation: direct 2d depthwise fprop, multistage pipeline, SIMT mainloop.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename ThreadBlockOutputShape,
+  typename FilterShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport,
+  typename StrideShape,
+  typename DilationShape,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultDepthwiseDirect2dConvFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassSimt,
+  ArchTag,
+  ThreadblockShape,
+  ThreadBlockOutputShape,
+  FilterShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kFixedStrideDilation,
+  StrideSupport,
+  StrideShape,
+  DilationShape,
+  AlignmentA,
+  AlignmentB
+> {
+
+  // Stride and dilation are compile-time shapes in this specialization, so the activation tile
+  // loaded per threadblock (ActivationShape below) is a compile-time quantity as well.
+  // One warp handles the entire groups per cta.
+  static_assert(ThreadblockShape::kN == WarpShape::kN,
+                "ThreadblockShape::kN should be same as WarpShape::kN ");
+  static_assert(ThreadblockShape::kK == FilterShape::kCount && WarpShape::kK == FilterShape::kCount,
+                "ThreadblockShape::kK and WarpShape::kK should be same as filter size");
+  static_assert(ThreadblockShape::kM % WarpShape::kM == 0,
+                "ThreadblockShape::kM must be divisible by WarpShape shape::kM");
+  static_assert(ThreadBlockOutputShape::kN == 1, "ThreadBlockOutputShape::kN should be 1");
+
+  static_assert(StrideShape::kRow >= 0 && StrideShape::kColumn >= 0, "Stride should be fixed");
+  static_assert(DilationShape::kRow >= 0 && DilationShape::kColumn >= 0, "Dilation should be fixed");
+
+  // Activations loaded by threadblock: input extent needed to produce the output tile (H direction)
+  static int const ActivationShapeH = (ThreadBlockOutputShape::kH - 1) * StrideShape::kRow +
+                             (FilterShape::kRow - 1) * DilationShape::kRow + 1;
+
+  static int const ActivationShapeW = (ThreadBlockOutputShape::kW - 1) * StrideShape::kColumn +
+                             (FilterShape::kColumn - 1) * DilationShape::kColumn + 1;
+
+  using ActivationShape =
+      cutlass::conv::TensorNHWCShape<1, ActivationShapeH, ActivationShapeW, ThreadblockShape::kN >;
+
+  // Define the core components from GEMM
+  using MmaCore = typename cutlass::conv::threadblock::DepthwiseDirectConvMmaCoreWithLaneAccessSize<
+      ThreadblockShape,
+      ThreadBlockOutputShape,
+      FilterShape,
+      WarpShape,
+      InstructionShape,
+      ElementA,
+      layout::RowMajor,
+      ElementB,
+      layout::ColumnMajor,
+      ElementAccumulator,
+      layout::RowMajor,
+      arch::OpClassSimt,
+      128,
+      128,
+      Stages,
+      MathOperatorTag,
+      IteratorAlgorithm::kFixedStrideDilation,
+      StrideShape,
+      DilationShape,
+      ActivationShape>;
+
+  // Define iterators over tiles from the A operand
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using IteratorA =
+    cutlass::conv::threadblock::DepthwiseFpropActivationDirect2dConvTileAccessIteratorFixedStrideDilation<
+      cutlass::MatrixShape<ThreadblockShape::kM,ThreadblockShape::kN>, // < outputShape:KMNK, groups per cta>
+      ThreadBlockOutputShape,
+      StrideShape,
+      DilationShape,
+      ActivationShape,
+      ElementA, LayoutA,
+      ThreadMapA
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
+  using IteratorB =
+      cutlass::conv::threadblock::DepthwiseFpropFilterDirectConvTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape::kN, FilterShape::kCount>,
+        ElementB, LayoutB,
+        ThreadMapB
+      >;
+
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+  using ThreadOutputShape = typename MmaCore::ThreadOutputShape;
+  static cutlass::arch::CacheOperation::Kind const CacheOpA =
+      ((sizeof_bits<ElementA>::value * AlignmentA) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  static cutlass::arch::CacheOperation::Kind const CacheOpB =
+      ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  // Define the epilogue
+  using Epilogue = typename epilogue::threadblock::DefaultDirectConvEpilogueSimt<
+    ThreadblockShape, // < outputShape:KMNK, groups per cta>
+    WarpMmaSimtOp,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount,
+    ThreadOutputShape,
+    ThreadBlockOutputShape
+  >::Epilogue;
+
+  // Define the Mma (the epilogue type is passed in because it is fused into the mainloop)
+  using Mma = threadblock::DepthwiseFpropDirectConvMultipleStage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    CacheOpA,
+    IteratorB,
+    SmemIteratorB,
+    CacheOpB,
+    MmaPolicy,
+    Stages,
+    Epilogue,
+    IteratorAlgorithm::kFixedStrideDilation
+  >;
+
+  // Define the kernel
+  using Kernel = cutlass::conv::kernel::DirectConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop,
+    Conv2dProblemSize,
+    cutlass::conv::GroupMode::kDepthwise,
+    ThreadBlockOutputShape
+  >;
+};
+
+} // namespace kernel
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/direct_convolution.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/direct_convolution.h
new file mode 100755
index 000000000..5e4299564
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/direct_convolution.h
@@ -0,0 +1,505 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a multi-staged Depthwise Convolution kernel.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/array.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/semaphore.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+#include "cutlass/conv/conv3d_problem_size.h"
+#include "cutlass/epilogue/threadblock/output_iterator_parameter.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Kernel parameters for DirectConvolution, derived from an Arguments_ object.
+template <typename Mma_,                 ///! Threadblock-scoped matrix multiply-accumulate
+          typename Epilogue_,            ///! Epilogue
+          typename ThreadblockSwizzle_,  ///! Threadblock swizzling function
+          conv::Operator ConvOperator,   ///! Convolutional operator (Fprop, Dgrad, Wgrad)
+          typename Arguments_,           ///! Kernel Arguments
+          typename ConvOutputIteratorParameter_, ///! Output Iterator Params
+          typename ConvProblemSize_ = Conv2dProblemSize,  ///! Convolutional operator on 2D or 3D problem
+          conv::GroupMode GroupMode_ = conv::GroupMode::kNone,  ///! Group mode
+          typename ThreadBlockOutputShape_ = cutlass::conv::TensorNHWCShape<1, 1, 1, 1> >  ///! OutputShape per ThreadBlock
+struct DirectConvolutionParams {
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using EpilogueOutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using ThreadBlockOutputShape = ThreadBlockOutputShape_;
+  static Operator const kConvolutionalOperator = ConvOperator;
+  using ConvProblemSize = ConvProblemSize_;
+  using Arguments = Arguments_;
+  using ConvOutputIteratorParameter = ConvOutputIteratorParameter_;
+
+  using ThreadblockShape = typename Mma::Shape;
+  static IteratorAlgorithm const kIteratorAlgorithm = Mma::IteratorA::kIteratorAlgorithm;
+  static conv::GroupMode const kGroupMode = GroupMode_;
+  static int const kStages = Mma::kStages;
+
+  ConvProblemSize problem_size;
+  cutlass::gemm::GemmCoord grid_tiled_shape;
+  gemm::GemmCoord implicit_gemm_problem_size;
+  int swizzle_log_tile;
+  int smem_size_;  // dynamic shared-memory size; set by the Arguments constructor
+
+  int gemm_k_iterations;
+  int gemm_k_iterations_per_channel;
+  typename Mma::IteratorA::Params iterator_A;
+  typename Mma::IteratorA::Element const *ptr_A;
+  typename Mma::IteratorB::Params iterator_B;
+  typename Mma::IteratorB::Element const *ptr_B;
+  typename Mma::IteratorB::Element *ptr_reordered_B;  // taken from args.ref_reordered_B
+  typename Epilogue::OutputTileIterator::Params iterator_C;
+  typename Epilogue::OutputTileIterator::Element *ptr_C;
+  typename Epilogue::OutputTileIterator::Params iterator_D;
+  typename Epilogue::OutputTileIterator::Element *ptr_D;
+  typename EpilogueOutputOp::Params output_op;
+  int *semaphore;
+  SplitKMode split_k_mode;
+  int split_k_slices;
+
+  /// Default ctor: gives every scalar and pointer member a defined value (sizes 0, pointers null).
+  CUTLASS_HOST_DEVICE
+  DirectConvolutionParams()
+      : swizzle_log_tile(0), smem_size_(0), gemm_k_iterations(0), gemm_k_iterations_per_channel(0),
+        ptr_A(nullptr), ptr_B(nullptr), ptr_reordered_B(nullptr), ptr_C(nullptr), ptr_D(nullptr),
+        semaphore(nullptr), split_k_mode(SplitKMode::kSerial), split_k_slices(1) {}
+
+  /// Builds iterator params, iteration counts, the tiled grid shape and the shared-memory size from args.
+  CUTLASS_HOST_DEVICE
+  DirectConvolutionParams(Arguments const &args, int *semaphore = nullptr)
+      : problem_size(args.problem_size),
+        implicit_gemm_problem_size(
+            cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size)),
+        iterator_A(Mma::IteratorA::getParams(args.problem_size, args.ref_A.layout())),
+        ptr_A(args.ref_A.data()),
+        iterator_B(Mma::IteratorB::getParams(args.problem_size, args.ref_B.layout())),
+        ptr_B(args.ref_B.data()),
+        ptr_reordered_B(args.ref_reordered_B.data()),
+        iterator_C(ConvOutputIteratorParameter::layout(args.ref_C), args.problem_size),
+        ptr_C(args.ref_C.data()),
+        iterator_D(ConvOutputIteratorParameter::layout(args.ref_D), args.problem_size),
+        ptr_D(args.ref_D.data()),
+        output_op(args.output_op),
+        semaphore(semaphore),
+        split_k_mode(args.split_k_mode),
+        split_k_slices(args.problem_size.split_k_slices) {
+    gemm_k_iterations =
+        depthwise_gemm_k_iterations<ThreadBlockOutputShape::kN,
+                                    ThreadBlockOutputShape::kH,
+                                    ThreadBlockOutputShape::kW>(kConvolutionalOperator,
+                                                                ThreadblockShape::kK,
+                                                                args.problem_size,
+                                                                kIteratorAlgorithm,
+                                                                kGroupMode,
+                                                                ThreadblockShape::kN);
+
+    gemm_k_iterations_per_channel = implicit_gemm_k_iterations_per_channel(
+        kConvolutionalOperator, args.problem_size, kIteratorAlgorithm);
+
+    ThreadblockSwizzle threadblock_swizzle;
+
+    grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
+        kConvolutionalOperator,
+        problem_size,
+        {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+        args.problem_size.split_k_slices);
+
+    swizzle_log_tile = threadblock_swizzle.get_log_tile(grid_tiled_shape);
+
+    // Dynamic SMEM usage because stride and dilation are runtime params.
+    smem_size_ = (max(iterator_A.activation_size, int(sizeof(typename Epilogue::SharedStorage))) * kStages + iterator_B.filter_size);
+  }
+
+  /// Returns the dynamic shared-memory size computed by the Arguments constructor:
+  /// max(activation size, sizeof(Epilogue::SharedStorage)) * kStages + filter size.
+  /// A default-constructed object reports 0.
+  CUTLASS_HOST_DEVICE
+  int get_smem_size() const { return smem_size_; }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <typename Params_, typename ElementB_>
+struct ReorderKernel {
+  using Params = Params_;
+  using ElementB = ElementB_;
+
+  union SharedStorage {};  // empty: operator() does not use shared_storage
+  static unsigned int const kReorderKernelThreadPerCTA = 128;  // threads per block, one element each
+
+  CUTLASS_HOST_DEVICE
+  ReorderKernel() {}
+
+  /// 1-D grid of ceil(filter_size() / kReorderKernelThreadPerCTA) blocks.
+  CUTLASS_HOST_DEVICE
+  static dim3 get_grid_shape(Params const &params) {
+    return dim3{static_cast<unsigned int>(
+                    (params.problem_size.filter_size() + kReorderKernelThreadPerCTA - 1) /
+                    kReorderKernelThreadPerCTA),
+                1, 1};
+  }
+
+  CUTLASS_HOST_DEVICE
+  static dim3 get_block_shape() { return dim3{kReorderKernelThreadPerCTA, 1, 1}; }
+
+  /// Copies params.ptr_B into params.ptr_reordered_B, transposing it from a row-major m x n matrix
+  /// (m = groups, n = filter_size() / K) into n x m. Each thread moves at most one element.
+  CUTLASS_HOST_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+    int64_t m = static_cast<int64_t>(params.problem_size.groups);
+    int64_t n = static_cast<int64_t>(params.problem_size.filter_size() / params.problem_size.K);
+    const ElementB *src_with_type = static_cast<const ElementB *>(params.ptr_B);
+    ElementB *dst_with_type = static_cast<ElementB *>(params.ptr_reordered_B);
+
+    // Widen before multiplying so the product is formed in 64 bits, not in 32-bit unsigned.
+    int64_t linear_index = static_cast<int64_t>(blockIdx.x) * kReorderKernelThreadPerCTA + threadIdx.x;
+    // Guard first: out-of-range threads do nothing, and n is never a divisor when m * n == 0.
+    if (linear_index < m * n) {
+      int64_t index_m = linear_index / n;
+      int64_t index_n = linear_index % n;
+      dst_with_type[index_m + index_n * m] = src_with_type[linear_index];
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Kernel-level direct convolution: for one threadblock tile, builds the A/B tile iterators, the
+/// output iterators and the epilogue, then invokes the threadblock-scoped Mma with the epilogue fused in.
+template <
+  typename Mma_,                                  ///! Threadblock-scoped matrix multiply-accumulate
+  typename Epilogue_,                             ///! Epilogue
+  typename ThreadblockSwizzle_,                   ///! Threadblock swizzling function
+  conv::Operator ConvOperator,                    ///! Convolutional operator (Fprop, Dgrad, Wgrad)
+  typename ConvProblemSize_ = Conv2dProblemSize,  ///! Convolutional operator on 2D or 3D problem
+  conv::GroupMode GroupMode_ = conv::GroupMode::kNone,    ///! Group mode
+  typename ThreadBlockOutputShape_ = cutlass::conv::TensorNHWCShape<1, 1, 1, 1>  ///! Output shape per threadblock
+>
+struct DirectConvolution {
+
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using EpilogueOutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using ThreadBlockOutputShape = ThreadBlockOutputShape_;
+  static Operator const kConvolutionalOperator = ConvOperator;
+
+  using ElementA = typename Mma::IteratorA::Element;
+  using LayoutA = typename Mma::IteratorA::Layout;
+  using ElementB = typename Mma::IteratorB::Element;
+  using LayoutB = typename Mma::IteratorB::Layout;
+  using ElementC = typename EpilogueOutputOp::ElementOutput;
+
+  /// Set output tensor C layout
+  using LayoutC = LayoutA;
+
+  using ElementAccumulator = typename EpilogueOutputOp::ElementAccumulator;
+  using ElementCompute = typename EpilogueOutputOp::ElementCompute;
+
+  using WarpMmaOperator = typename Mma::Policy::Operator;
+
+  using ArchMmaOperator = typename WarpMmaOperator::ArchMmaOperator;
+  using MathOperator = typename ArchMmaOperator::Operator;
+
+  using OperatorClass = typename WarpMmaOperator::OperatorClass;
+  using ArchTag = typename WarpMmaOperator::ArchTag;
+
+  using ThreadblockShape = typename Mma::Shape;
+  using WarpShape = typename WarpMmaOperator::Shape;
+  /// Hard-coded to 1x1x1 here rather than taken from ArchMmaOperator
+  using InstructionShape = typename cutlass::gemm::GemmShape<1, 1, 1>;
+
+  static int const kStages = Mma::kStages;
+  static IteratorAlgorithm const kIteratorAlgorithm = Mma::IteratorA::kIteratorAlgorithm;
+  static StrideSupport const kStrideSupport = Mma::IteratorA::kStrideSupport;
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma::WarpCount;
+  /// Threads per threadblock: 32 per warp
+  static int const kThreadCount = 32 * WarpCount::kCount;
+
+  using TensorRefA = typename Mma::IteratorA::TensorRef;
+  using TensorRefB = typename Mma::IteratorB::TensorRef;
+  using TensorRefC = cutlass::TensorRef<ElementC, LayoutC>;
+
+  /// Iterators A and B must agree on the convolution dimension;
+  /// the common value is exported as kConvDim
+  static_assert(Mma::IteratorA::kConvDim == Mma::IteratorB::kConvDim,
+    "Convolution on different different dimensions is not supported");
+  static int const kConvDim = Mma::IteratorA::kConvDim;
+
+  /// Conv dimension and problem size structure (Conv2d or Conv3d)
+  using ConvProblemSize = ConvProblemSize_;
+
+  static conv::GroupMode const kGroupMode = GroupMode_;
+
+  /// Supplies the layout() and extent() used below to construct the C/D output tile iterators
+  using ConvOutputIteratorParameter = epilogue::threadblock::ConvOutputIteratorParameter<
+    LayoutC,
+    typename Epilogue::OutputTileIterator::Layout,
+    TensorRefC,
+    ConvOperator,
+    ConvProblemSize
+    >;
+
+  /// Argument structure
+  struct Arguments {
+
+    //
+    // Data members
+    //
+
+    ConvProblemSize problem_size;
+    TensorRefA ref_A;
+    TensorRefB ref_B;
+    TensorRefB ref_reordered_B;
+    TensorRefC ref_C;
+    TensorRefC ref_D;
+    typename EpilogueOutputOp::Params output_op;
+    SplitKMode split_k_mode;
+
+    //
+    // Methods
+    //
+
+    /// Default ctor; split_k_mode gets the same default as the full ctor
+    CUTLASS_HOST_DEVICE
+    Arguments() : split_k_mode(SplitKMode::kSerial) { }
+
+    /// Problem-size-only ctor; tensor refs and output_op are default-constructed
+    CUTLASS_HOST_DEVICE
+    Arguments(
+      ConvProblemSize const & problem_size
+    ):
+      problem_size(problem_size),
+      split_k_mode(SplitKMode::kSerial) { }
+
+    /// Full ctor. ref_reordered_B is the buffer operator() reads the B operand from
+    /// (via Params::ptr_reordered_B). Members are initialized in declaration order.
+    CUTLASS_HOST_DEVICE
+    Arguments(
+      ConvProblemSize const & problem_size,
+      TensorRefA const & ref_A,
+      TensorRefB const & ref_B,
+      TensorRefC const & ref_C,
+      TensorRefC const & ref_D,
+      typename EpilogueOutputOp::Params const & output_op,
+      TensorRefB const & ref_reordered_B = nullptr,
+      SplitKMode const & split_k_mode = SplitKMode::kSerial
+    ):
+      problem_size(problem_size),
+      ref_A(ref_A),
+      ref_B(ref_B),
+      ref_reordered_B(ref_reordered_B),
+      ref_C(ref_C),
+      ref_D(ref_D),
+      output_op(output_op),
+      split_k_mode(split_k_mode)
+    { }
+
+  };
+
+  /// Kernel parameters, constructed from Arguments
+  using Params =
+      typename cutlass::conv::kernel::DirectConvolutionParams<Mma,
+                                                              Epilogue,
+                                                              ThreadblockSwizzle,
+                                                              kConvolutionalOperator,
+                                                              Arguments,
+                                                              ConvOutputIteratorParameter,
+                                                              ConvProblemSize,
+                                                              kGroupMode,
+                                                              ThreadBlockOutputShape>;
+
+  /// Companion kernel that writes Params::ptr_reordered_B from Params::ptr_B
+  using ReorderKernel = typename cutlass::conv::kernel::ReorderKernel<Params, ElementB>;
+
+  /// Shared memory storage structure
+  union SharedStorage {
+    typename Mma::SharedStorage main_loop;
+    typename Epilogue::SharedStorage epilogue;
+  };
+
+  //
+  // Methods
+  //
+
+  CUTLASS_HOST_DEVICE
+  DirectConvolution() { }
+
+  /// Executes one direct-convolution threadblock tile (mainloop with the epilogue fused in).
+  /// \param params          kernel parameters built from Arguments
+  /// \param shared_storage  shared memory used by both the mainloop and the epilogue
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Compute threadblock location
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord threadblock_tile_idx =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if threadblock is out of range
+    if (params.grid_tiled_shape.m() <= threadblock_tile_idx.m() ||
+      params.grid_tiled_shape.n() <= threadblock_tile_idx.n()) {
+
+      return;
+    }
+
+    // Compute position within threadblock
+    int thread_idx = threadIdx.x;
+    int iterator_column_offset = 0;
+    int filter_row_offset = 0;  // never modified below
+    // Only depthwise mode offsets the column by the tile's N index; other modes keep 0
+    if (kGroupMode == GroupMode::kDepthwise) {
+      iterator_column_offset += threadblock_tile_idx.n() * Mma::Shape::kN;
+    }
+
+    // Construct iterators to A and B operands
+    typename Mma::IteratorA iterator_A(
+      params.iterator_A,
+      params.problem_size,
+      params.ptr_A,
+      thread_idx,
+      MatrixCoord(
+        threadblock_tile_idx.m() + threadblock_tile_idx.k(),
+        iterator_column_offset
+      )
+    );
+
+    typename Mma::IteratorB iterator_B(
+      params.iterator_B,
+      params.problem_size,
+      params.ptr_reordered_B,
+      thread_idx,
+      MatrixCoord(
+        filter_row_offset,
+        iterator_column_offset
+      )
+    );
+
+    // Broadcast the warp_id computed by lane 0 to ensure dependent code
+    // is compiled as warp-uniform.
+    int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
+    int lane_idx = threadIdx.x % 32;
+
+    //
+    // Main loop
+    //
+
+    // Construct thread-scoped matrix multiply
+    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
+
+    typename Mma::FragmentC accumulators;
+
+    accumulators.clear();
+
+    //
+    // Epilogue
+    //
+
+    EpilogueOutputOp output_op(params.output_op);
+
+    // Output tile origin; reuses threadblock_tile_idx from above instead of a second identical query
+    MatrixCoord threadblock_offset(
+      threadblock_tile_idx.m() + threadblock_tile_idx.k(),
+      threadblock_tile_idx.n() * Mma::Shape::kN
+    );
+
+    // Tile iterator writing to destination tensor
+    typename Epilogue::OutputTileIterator iterator_D(
+      params.iterator_D,
+      params.ptr_D,
+      ConvOutputIteratorParameter::extent(params.problem_size),
+      thread_idx,
+      threadblock_offset
+    );
+
+    // Tile iterator reading from source accumulator tensor
+    typename Epilogue::OutputTileIterator iterator_C(
+      params.iterator_C,
+      params.ptr_C,
+      ConvOutputIteratorParameter::extent(params.problem_size),
+      thread_idx,
+      threadblock_offset
+    );
+
+    // Construct the epilogue
+    Epilogue epilogue(
+      shared_storage.epilogue,
+      thread_idx,
+      warp_idx,
+      lane_idx);
+
+    // Compute threadblock-scoped matrix multiply-add
+    // Epilogue is fused in the mainloop, so the epilogue, output op and C/D iterators are
+    // handed to mma(); accumulators is passed as both the destination and the source.
+    mma(params.gemm_k_iterations,
+        accumulators,
+        iterator_A,
+        params.iterator_A,
+        iterator_B,
+        params.iterator_B,
+        accumulators,
+        epilogue,
+        output_op,
+        iterator_D,
+        iterator_C,
+        params.split_k_slices);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution.h
new file mode 100755
index 000000000..b1e0b477a
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution.h
@@ -0,0 +1,455 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a pipelined Implicit GEMM kernel.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/array.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/semaphore.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+#include "cutlass/conv/conv3d_problem_size.h"
+#include "cutlass/epilogue/threadblock/output_iterator_parameter.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Mma_,                                  ///! Threadblock-scoped matrix multiply-accumulate 
+  typename Epilogue_,                             ///! Epilogue
+  typename ThreadblockSwizzle_,                   ///! Threadblock swizzling function
+  conv::Operator ConvOperator,                    ///! Convolutional operator (Fprop, Dgrad, Wgrad, Deconv)
+  typename ConvProblemSize_ = Conv2dProblemSize,  ///! Convolutional operator on 2D or 3D problem
+  conv::GroupMode GroupMode_ = conv::GroupMode::kNone    ///! Group mode
+>
+struct ImplicitGemmConvolution {
+
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using EpilogueOutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  static Operator const kConvolutionalOperator = ConvOperator;
+
+  using ElementA = typename Mma::IteratorA::Element;
+  using LayoutA = typename Mma::IteratorA::Layout;
+  using ElementB = typename Mma::IteratorB::Element;
+  using LayoutB = typename Mma::IteratorB::Layout;
+  using ElementC = typename EpilogueOutputOp::ElementOutput;
+
+  /// Set output tensor C layout
+  using LayoutC = LayoutA;
+
+  using ElementAccumulator = typename EpilogueOutputOp::ElementAccumulator;
+  using ElementCompute = typename EpilogueOutputOp::ElementCompute;
+
+  using WarpMmaOperator = typename Mma::Policy::Operator;
+
+  using ArchMmaOperator = typename WarpMmaOperator::ArchMmaOperator;
+  using MathOperator = typename ArchMmaOperator::Operator;
+  
+  using OperatorClass = typename WarpMmaOperator::OperatorClass;
+  using ArchTag = typename WarpMmaOperator::ArchTag;
+
+  using ThreadblockShape = typename Mma::Shape;
+  using WarpShape = typename WarpMmaOperator::Shape;
+  using InstructionShape = typename ArchMmaOperator::Shape;
+
+  static int const kStages = Mma::kStages;
+  static IteratorAlgorithm const kIteratorAlgorithm = Mma::IteratorA::kIteratorAlgorithm; 
+  static StrideSupport const kStrideSupport = Mma::IteratorA::kStrideSupport;
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;
+
+  using TensorRefA = typename Mma::IteratorA::TensorRef;
+  using TensorRefB = typename Mma::IteratorB::TensorRef;
+  using TensorRefC = cutlass::TensorRef<ElementC, LayoutC>;
+
+  /// Check iterator A and B convolution dimension are the same and 
+  // set device::ImplicitGemmConvolution::kConvDim
+  static_assert(Mma::IteratorA::kConvDim == Mma::IteratorB::kConvDim, 
+    "Convolution on different different dimensions is not supported");
+  static int const kConvDim = Mma::IteratorA::kConvDim;
+
+  /// Conv dimension and problem size structure (Conv2d or Conv3d)
+  using ConvProblemSize = ConvProblemSize_;
+
+  static conv::GroupMode const kGroupMode = GroupMode_;
+
+  /// Wgrad C stride idx for implicit gemm algorithm 
+  // Conv2d row-major matrix C (KxRSC) 
+  // Conv3d row-major matrix C (KxTRSC)
+  static int const kWgradCStrideIdx = 
+    platform::is_same<LayoutC, cutlass::layout::TensorNHWC>::value ? 2 : 3;
+
+  /// This chooses the appropriate stride element of the C tensor.
+  static int const kTensorCStrideIdx = 
+    (kConvolutionalOperator == conv::Operator::kWgrad ? kWgradCStrideIdx : 0);
+
+  //
+  //
+  //
+  using ConvOutputIteratorParameter = epilogue::threadblock::ConvOutputIteratorParameter<
+    LayoutC,
+    typename Epilogue::OutputTileIterator::Layout, 
+    TensorRefC,
+    ConvOperator,
+    ConvProblemSize
+    >;
+
+  /// Argument structure
+  struct Arguments {
+
+    //
+    // Data members
+    //
+
+    ConvProblemSize problem_size;
+    TensorRefA ref_A;
+    TensorRefB ref_B;
+    TensorRefC ref_C;
+    TensorRefC ref_D;
+    typename EpilogueOutputOp::Params output_op;
+    SplitKMode split_k_mode;
+
+    //
+    // Methods
+    //
+
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Arguments() { }
+   
+    CUTLASS_HOST_DEVICE 
+    Arguments(
+      ConvProblemSize const & problem_size
+    ):
+      problem_size(problem_size) { }
+
+    CUTLASS_HOST_DEVICE
+    Arguments(
+      ConvProblemSize const & problem_size,
+      TensorRefA const & ref_A,
+      TensorRefB const & ref_B,
+      TensorRefC const & ref_C,
+      TensorRefC const & ref_D,
+      typename EpilogueOutputOp::Params const & output_op,
+      SplitKMode const & split_k_mode = SplitKMode::kSerial
+    ):
+      problem_size(problem_size),
+      ref_A(ref_A),
+      ref_B(ref_B),
+      ref_C(ref_C),
+      ref_D(ref_D),
+      output_op(output_op),
+      split_k_mode(split_k_mode)
+    {
+
+    }
+
+  };
+
+  /// Parameters structure: the kernel-side state derived from Arguments
+  struct Params {
+    ConvProblemSize problem_size;
+    cutlass::gemm::GemmCoord grid_tiled_shape;
+    gemm::GemmCoord implicit_gemm_problem_size;
+    int swizzle_log_tile = 0;
+
+    int gemm_k_iterations = 0;
+    int gemm_k_iterations_per_channel = 0;
+    typename Mma::IteratorA::Params iterator_A;
+    typename Mma::IteratorA::Element const *ptr_A = nullptr;
+    typename Mma::IteratorB::Params iterator_B;
+    typename Mma::IteratorB::Element const *ptr_B = nullptr;
+    typename Epilogue::OutputTileIterator::Params iterator_C;
+    typename Epilogue::OutputTileIterator::Element *ptr_C = nullptr;
+    typename Epilogue::OutputTileIterator::Params iterator_D;
+    typename Epilogue::OutputTileIterator::Element *ptr_D = nullptr;
+    typename EpilogueOutputOp::Params output_op;
+    int *semaphore = nullptr;
+    SplitKMode split_k_mode = SplitKMode::kSerial;
+
+    //
+    // Methods. The default member initializers above give a default-constructed Params
+    // null pointers and zero counters instead of indeterminate values.
+
+    CUTLASS_HOST_DEVICE
+    Params() { }
+
+    /// Derives every kernel parameter from args; semaphore is stored as given (may be null).
+    CUTLASS_HOST_DEVICE
+    Params(
+      Arguments const &args,
+      int *semaphore = nullptr
+    ):
+      problem_size(args.problem_size),
+      implicit_gemm_problem_size(cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size)),
+      iterator_A(Mma::IteratorA::getParams(args.problem_size, args.ref_A.layout())),
+      ptr_A(args.ref_A.data()),
+      iterator_B(args.problem_size, args.ref_B.layout()),
+      ptr_B(args.ref_B.data()),
+      iterator_C(ConvOutputIteratorParameter::layout(args.ref_C), implicit_gemm_tensor_c_extent(kConvolutionalOperator, args.problem_size)),
+      ptr_C(args.ref_C.data()),
+      iterator_D(ConvOutputIteratorParameter::layout(args.ref_D), implicit_gemm_tensor_c_extent(kConvolutionalOperator, args.problem_size)),
+      ptr_D(args.ref_D.data()),
+      output_op(args.output_op),
+      semaphore(semaphore),
+      split_k_mode(args.split_k_mode)
+    {
+      gemm_k_iterations = implicit_gemm_k_iterations(
+        kConvolutionalOperator,
+        ThreadblockShape::kK,
+        args.problem_size,
+        kIteratorAlgorithm,
+        kGroupMode,
+        ThreadblockShape::kN);
+
+      gemm_k_iterations_per_channel = implicit_gemm_k_iterations_per_channel(
+          kConvolutionalOperator, args.problem_size, kIteratorAlgorithm);
+
+      ThreadblockSwizzle threadblock_swizzle;
+
+      grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
+        implicit_gemm_problem_size,
+        {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+        args.problem_size.split_k_slices);
+
+      swizzle_log_tile = threadblock_swizzle.get_log_tile(grid_tiled_shape);
+    }
+  };
+
+  /// Shared memory storage: a union, so the main-loop and epilogue storage occupy the same bytes
+  union SharedStorage {
+    typename Mma::SharedStorage main_loop;
+    typename Epilogue::SharedStorage epilogue;
+  };
+
+  //
+  // Methods
+  // Default constructor: empty body, performs no initialization.
+
+  CUTLASS_HOST_DEVICE
+  ImplicitGemmConvolution() { } 
+
+  /// Executes one ImplicitGEMM tile: main-loop MMA into accumulators, then the epilogue with split-K handling
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Compute this threadblock's tile coordinates (m, n, k) via the swizzle function
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord threadblock_tile_idx =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if this CTA's tile lies outside the tiled grid in M or N
+    if (params.grid_tiled_shape.m() <= threadblock_tile_idx.m() ||
+      params.grid_tiled_shape.n() <= threadblock_tile_idx.n()) {
+
+      return;
+    }
+
+    // Thread index and A's column offset: the tile's K offset, shifted per group when kGroupMode != kNone
+    int thread_idx = threadIdx.x;
+    int iterator_A_column_offset = threadblock_tile_idx.k() * Mma::Shape::kK;
+    if (kGroupMode != GroupMode::kNone) {
+      if (kGroupMode != GroupMode::kDepthwise) {
+        int k_per_group = params.problem_size.K / params.problem_size.groups;
+        int group_idx = threadblock_tile_idx.n() * Mma::Shape::kN / k_per_group;
+        int channels_per_group = params.problem_size.C / params.problem_size.groups;
+        iterator_A_column_offset += group_idx * channels_per_group;
+      } else {
+        iterator_A_column_offset += threadblock_tile_idx.n() * Mma::Shape::kN;
+      }
+    } 
+
+    // Construct iterators: A at (tile M offset, column offset computed above), B at (tile K offset, tile N offset)
+    typename Mma::IteratorA iterator_A(
+      params.iterator_A,
+      params.problem_size,
+      params.ptr_A,
+      thread_idx,
+      MatrixCoord(
+        threadblock_tile_idx.m() * Mma::Shape::kM,
+        iterator_A_column_offset
+      )
+    );
+    
+    typename Mma::IteratorB iterator_B(
+      params.iterator_B,
+      params.problem_size,
+      params.ptr_B,
+      thread_idx,
+      MatrixCoord(
+        threadblock_tile_idx.k() * Mma::Shape::kK,
+        threadblock_tile_idx.n() * Mma::Shape::kN
+      )
+    );
+
+    // Broadcast the warp_id computed by lane 0 to ensure dependent code
+    // is compiled as warp-uniform.
+    int warp_idx = canonical_warp_idx_sync();
+    int lane_idx = threadIdx.x % 32;
+
+    //
+    // Main loop
+    //
+
+    // Construct the Mma object on the main_loop member of the shared storage union
+    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
+
+    typename Mma::FragmentC accumulators;
+
+    accumulators.clear();
+
+    // Run the main loop; accumulators (cleared above) is passed in both accumulator argument positions
+    mma(params.gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators, params.gemm_k_iterations_per_channel);
+
+    //
+    // Epilogue
+    //
+
+    EpilogueOutputOp output_op(params.output_op);
+
+    // Construct the semaphore on the int slot at index block_idx (one slot per (m, n) output tile)
+    int block_idx = threadblock_tile_idx.m() + threadblock_tile_idx.n() * params.grid_tiled_shape.m();
+
+    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
+    
+    // Compute logical position within grid (same swizzle call as at the top of this function)
+    threadblock_tile_idx =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // If performing a serial reduction via split-K (more than one K slice), fetch the initial synchronization
+    if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) {
+        
+      // Fetch the synchronization lock initially but do not block.
+      semaphore.fetch();
+
+      // Indicate which position in a serial reduction the output operator is currently updating
+      output_op.set_k_partition(threadblock_tile_idx.k(), params.grid_tiled_shape.k());
+    }
+
+    MatrixCoord threadblock_offset(
+      threadblock_tile_idx.m() * Mma::Shape::kM,
+      threadblock_tile_idx.n() * Mma::Shape::kN
+    );
+
+    // Tile iterator writing to destination tensor D
+    typename Epilogue::OutputTileIterator iterator_D(
+      params.iterator_D,
+      params.ptr_D,
+      ConvOutputIteratorParameter::extent(params.problem_size),
+      thread_idx,
+      threadblock_offset
+    );
+    
+    // Tile iterator reading from source tensor C
+    typename Epilogue::OutputTileIterator iterator_C(
+      params.iterator_C,
+      params.ptr_C,
+      ConvOutputIteratorParameter::extent(params.problem_size),
+      thread_idx,
+      threadblock_offset
+    );
+
+    // Construct the epilogue on the epilogue member of the shared storage union
+    Epilogue epilogue(
+      shared_storage.epilogue, 
+      thread_idx, 
+      warp_idx, 
+      lane_idx);
+
+    // Wait on the semaphore - this latency may have been covered by iterator construction
+    if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) {
+        
+      // For subsequent threadblocks (k index > 0), the source matrix is held in the 'D' tensor.
+      if (threadblock_tile_idx.k()) {
+        iterator_C = iterator_D;
+      }
+
+      semaphore.wait(threadblock_tile_idx.k());
+
+    }
+    // Parallel split-K: each slice writes to its own location, k * implicit_gemm_tensor_c_size(...) past ptr_D
+    else if (params.split_k_mode == SplitKMode::kParallel) {
+      iterator_D.add_pointer_offset(threadblock_tile_idx.k() * 
+        cutlass::conv::implicit_gemm_tensor_c_size(ConvOperator, params.problem_size));
+    }
+
+    // Run the epilogue: output_op combines accumulators with the source read through iterator_C
+    epilogue(output_op, iterator_D, accumulators, iterator_C);
+  
+    //
+    // Release the semaphore
+    //
+
+    if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { 
+
+      int lock = 0;
+      if (params.grid_tiled_shape.k() == threadblock_tile_idx.k() + 1) {
+
+        // The final threadblock resets the semaphore to 0 for subsequent grids.
+        lock = 0;
+      }
+      else {
+        // Otherwise, the semaphore is set to the next k index (the value the next slice waits for)
+        lock = threadblock_tile_idx.k() + 1;
+      }
+      
+      semaphore.release(lock);
+    }
+  } 
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_fusion.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_fusion.h
new file mode 100755
index 000000000..74ecae401
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_fusion.h
@@ -0,0 +1,461 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a pipelined Implicit GEMM kernel fused with scale+bias+relu applied to the activation operand.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/array.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/semaphore.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+#include "cutlass/conv/conv3d_problem_size.h"
+#include "cutlass/epilogue/threadblock/output_iterator_parameter.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Mma_,                                  ///< Threadblock-scoped matrix multiply-accumulate
+  typename Epilogue_,                             ///< Epilogue
+  typename ThreadblockSwizzle_,                   ///< Threadblock swizzling function
+  conv::Operator ConvOperator,                    ///< Convolutional operator (Fprop, Dgrad, Wgrad)
+  typename ConvProblemSize_ = Conv2dProblemSize   ///< Convolutional operator on 2D or 3D problem
+>
+struct ImplicitGemmConvolutionFusion {
+  // Kernel template: operator() runs one threadblock tile of the implicit GEMM with a scale/bias iterator.
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using EpilogueOutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  static Operator const kConvolutionalOperator = ConvOperator;
+
+  using ElementA = typename Mma::IteratorA::Element;
+  using LayoutA = typename Mma::IteratorA::Layout;
+  using ElementB = typename Mma::IteratorB::Element;
+  using LayoutB = typename Mma::IteratorB::Layout;
+
+  using ElementScaleBias = typename Mma::IteratorScaleBias::Element;
+  using LayoutScaleBias = typename Mma::IteratorScaleBias::Layout;
+
+  using ElementC = typename EpilogueOutputOp::ElementOutput;
+  using LayoutC = LayoutA;
+
+  using ElementAccumulator = typename EpilogueOutputOp::ElementAccumulator;
+  using ElementCompute = typename EpilogueOutputOp::ElementCompute;
+
+  using WarpMmaOperator = typename Mma::Policy::Operator;
+
+  using ArchMmaOperator = typename WarpMmaOperator::ArchMmaOperator;
+  using MathOperator = typename ArchMmaOperator::Operator;
+  
+  using OperatorClass = typename WarpMmaOperator::OperatorClass;
+  using ArchTag = typename WarpMmaOperator::ArchTag;
+
+  using ThreadblockShape = typename Mma::Shape;
+  using WarpShape = typename WarpMmaOperator::Shape;
+  using InstructionShape = typename ArchMmaOperator::Shape;
+
+  static int const kStages = Mma::kStages;
+  static IteratorAlgorithm const kIteratorAlgorithm = Mma::IteratorA::kIteratorAlgorithm; 
+ 
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;
+
+  using TensorRefA = typename Mma::IteratorA::TensorRef;
+  using TensorRefB = typename Mma::IteratorB::TensorRef;
+  using TensorRefScaleBias = typename Mma::IteratorScaleBias::TensorRef;
+  using TensorRefC = cutlass::TensorRef<ElementC, LayoutC>;
+
+  /// Check iterator A and B convolution dimension are the same and 
+  // set device::ImplicitGemmConvolution::kConvDim
+  static_assert(Mma::IteratorA::kConvDim == Mma::IteratorB::kConvDim, 
+    "Convolution on different dimensions is not supported");
+  static int const kConvDim = Mma::IteratorA::kConvDim;
+
+  /// Conv dimension and problem size structure (Conv2d or Conv3d)
+  using ConvProblemSize = ConvProblemSize_;
+
+  static conv::GroupMode const kGroupMode = conv::GroupMode::kNone;
+
+  /// Wgrad C stride idx for implicit gemm algorithm 
+  // Conv2d row-major matrix C (KxRSC) 
+  // Conv3d row-major matrix C (KxTRSC)
+  static int const kWgradCStrideIdx = 
+    platform::is_same<LayoutC, cutlass::layout::TensorNHWC>::value ? 2 : 3;
+
+  /// Stride index used for tensor C: kWgradCStrideIdx for Wgrad, 0 for every other operator.
+  static int const kTensorCStrideIdx = 
+    (kConvolutionalOperator == conv::Operator::kWgrad ? kWgradCStrideIdx : 0);
+
+  //
+  // Adapter between the C/D tensor refs and the epilogue's output tile iterator: its
+  // layout() feeds the iterator Params and its extent() feeds the iterator constructors.
+  using ConvOutputIteratorParameter = epilogue::threadblock::ConvOutputIteratorParameter<
+    LayoutC,
+    typename Epilogue::OutputTileIterator::Layout, 
+    TensorRefC,
+    ConvOperator,
+    ConvProblemSize
+    >;
+
+  /// Argument structure: the problem description and tensor references supplied by the caller
+  struct Arguments {
+
+    //
+    // Data members
+    //
+
+    ConvProblemSize problem_size;
+    TensorRefA ref_A;
+    TensorRefB ref_B;
+    TensorRefScaleBias ref_scale;
+    TensorRefScaleBias ref_bias;
+    TensorRefC ref_C;
+    TensorRefC ref_D;
+    typename EpilogueOutputOp::Params output_op;
+    SplitKMode split_k_mode;
+
+    //
+    // Methods: every ctor sets split_k_mode (kSerial unless the caller passes another mode),
+    // so Params(Arguments const &) never copies an indeterminate enum value.
+
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Arguments(): split_k_mode(SplitKMode::kSerial) { }
+   
+    CUTLASS_HOST_DEVICE 
+    Arguments(
+      ConvProblemSize const & problem_size
+    ):
+      problem_size(problem_size), split_k_mode(SplitKMode::kSerial) { }
+
+    CUTLASS_HOST_DEVICE
+    Arguments(
+      ConvProblemSize const & problem_size,
+      TensorRefA const & ref_A,
+      TensorRefB const & ref_B,
+      TensorRefScaleBias const & ref_scale,
+      TensorRefScaleBias const & ref_bias,
+      TensorRefC const & ref_C,
+      TensorRefC const & ref_D,
+      typename EpilogueOutputOp::Params const & output_op,
+      SplitKMode const & split_k_mode = SplitKMode::kSerial
+    ):
+      problem_size(problem_size),
+      ref_A(ref_A),
+      ref_B(ref_B),
+      ref_scale(ref_scale),
+      ref_bias(ref_bias),
+      ref_C(ref_C),
+      ref_D(ref_D),
+      output_op(output_op),
+      split_k_mode(split_k_mode)
+    {
+
+    }
+
+  };
+
+  /// Parameters structure: the kernel-side state derived from Arguments
+  struct Params {
+    ConvProblemSize problem_size{};
+    cutlass::gemm::GemmCoord grid_tiled_shape{};
+    gemm::GemmCoord implicit_gemm_problem_size{};
+    int swizzle_log_tile{0};
+    int gemm_k_iterations{0};
+    typename Mma::IteratorA::Params iterator_A{};
+    typename Mma::IteratorA::Element const *ptr_A = nullptr;
+    typename Mma::IteratorB::Params iterator_B{};
+    typename Mma::IteratorB::Element const *ptr_B = nullptr;
+    typename Mma::IteratorScaleBias::Params iterator_scale_bias{};
+    typename Mma::IteratorScaleBias::Element const *ptr_scale = nullptr;
+    typename Mma::IteratorScaleBias::Element const *ptr_bias = nullptr;
+    typename Epilogue::OutputTileIterator::Params iterator_C {};
+    typename Epilogue::OutputTileIterator::Element *ptr_C = nullptr;
+    typename Epilogue::OutputTileIterator::Params iterator_D {};
+    typename Epilogue::OutputTileIterator::Element *ptr_D = nullptr;
+    typename EpilogueOutputOp::Params output_op {};
+    int *semaphore = nullptr;
+    SplitKMode split_k_mode {};
+
+    //
+    // Methods
+    //
+    Params() = default;
+
+    /// Derives every kernel parameter from args; semaphore is stored as given (may be null).
+    CUTLASS_HOST_DEVICE
+    Params(
+      Arguments const &args,
+      int *semaphore = nullptr
+    ):
+      problem_size(args.problem_size),
+      implicit_gemm_problem_size(cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size)),
+      iterator_A(Mma::IteratorA::getParams(args.problem_size, args.ref_A.layout())),
+      ptr_A(args.ref_A.data()),
+      iterator_B(args.problem_size, args.ref_B.layout()),
+      ptr_B(args.ref_B.data()),
+      iterator_scale_bias(args.problem_size, args.ref_scale.layout()),
+      ptr_scale(args.ref_scale.data()),
+      ptr_bias(args.ref_bias.data()),
+      iterator_C(ConvOutputIteratorParameter::layout(args.ref_C)),
+      ptr_C(args.ref_C.data()),
+      iterator_D(ConvOutputIteratorParameter::layout(args.ref_D)),
+      ptr_D(args.ref_D.data()),
+      output_op(args.output_op),
+      semaphore(semaphore),
+      split_k_mode(args.split_k_mode)
+    {
+      gemm_k_iterations = implicit_gemm_k_iterations(kConvolutionalOperator, ThreadblockShape::kK, args.problem_size);
+
+      ThreadblockSwizzle threadblock_swizzle;
+
+      grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
+        implicit_gemm_problem_size,
+        {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+        args.problem_size.split_k_slices);
+
+      swizzle_log_tile = threadblock_swizzle.get_log_tile(grid_tiled_shape);
+    }
+  };
+
+  /// Shared memory storage: a union, so the main-loop and epilogue storage occupy the same bytes
+  union SharedStorage {
+    typename Mma::SharedStorage main_loop;
+    typename Epilogue::SharedStorage epilogue;
+  };
+
+  //
+  // Methods
+  //
+
+  CUTLASS_HOST_DEVICE
+  ImplicitGemmConvolutionFusion() { } 
+
+  /// Executes one ImplicitGEMM tile: main-loop MMA into accumulators, then the epilogue with split-K handling
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Compute this threadblock's tile coordinates (m, n, k) via the swizzle function
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord threadblock_tile_idx =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if this CTA's tile lies outside the tiled grid in M or N
+    if (params.grid_tiled_shape.m() <= threadblock_tile_idx.m() ||
+      params.grid_tiled_shape.n() <= threadblock_tile_idx.n()) {
+
+      return;
+    }
+
+    // Compute position within threadblock
+    int thread_idx = threadIdx.x;
+
+    // Construct iterator to the A operand at (tile M offset, tile K offset)
+    typename Mma::IteratorA iterator_A(
+      params.iterator_A,
+      params.problem_size,
+      params.ptr_A,
+      thread_idx,
+      MatrixCoord(
+        threadblock_tile_idx.m() * Mma::Shape::kM,
+        threadblock_tile_idx.k() * Mma::Shape::kK
+      )
+    );
+    
+    // Construct iterator to the B operand at (tile K offset, tile N offset)
+    typename Mma::IteratorB iterator_B(
+      params.iterator_B,
+      params.problem_size,
+      params.ptr_B,
+      thread_idx,
+      MatrixCoord(
+        threadblock_tile_idx.k() * Mma::Shape::kK,
+        threadblock_tile_idx.n() * Mma::Shape::kN
+      )
+    );
+ 
+    // Construct iterator to the scale/bias vectors; its column is the tile K offset for Fprop, else the tile N offset
+    typename Mma::IteratorScaleBias iterator_scale_bias(
+      params.iterator_scale_bias,
+      params.problem_size,
+      params.ptr_scale,
+      params.ptr_bias,
+      thread_idx,
+      MatrixCoord(
+        0, (kConvolutionalOperator == conv::Operator::kFprop) ?
+                  (threadblock_tile_idx.k() * Mma::Shape::kK) :
+                  // Wgrad
+                  (threadblock_tile_idx.n() * Mma::Shape::kN)
+      )
+    );
+
+    // Broadcast the warp_id computed by lane 0 to ensure dependent code
+    // is compiled as warp-uniform.
+    int warp_idx = canonical_warp_idx_sync();
+    int lane_idx = threadIdx.x % 32;
+
+    //
+    // Main loop
+    //
+
+    // Construct the Mma object on the main_loop member of the shared storage union
+    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
+
+    typename Mma::FragmentC accumulators;
+
+    accumulators.clear();
+
+    // Run the main loop; accumulators (cleared above) is passed in both accumulator argument positions
+    mma(params.gemm_k_iterations, accumulators, iterator_A,
+        iterator_B, iterator_scale_bias, accumulators);
+
+    //
+    // Epilogue
+    //
+
+    EpilogueOutputOp output_op(params.output_op);
+
+    // Construct the semaphore on the int slot at index block_idx (one slot per (m, n) output tile)
+    int block_idx = threadblock_tile_idx.m() + threadblock_tile_idx.n() * params.grid_tiled_shape.m();
+
+    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
+    
+    // Compute logical position within grid (same swizzle call as at the top of this function)
+    threadblock_tile_idx =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // If performing a serial reduction via split-K (more than one K slice), fetch the initial synchronization
+    if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) {
+        
+      // Fetch the synchronization lock initially but do not block.
+      semaphore.fetch();
+
+      // Indicate which position in a serial reduction the output operator is currently updating
+      output_op.set_k_partition(threadblock_tile_idx.k(), params.grid_tiled_shape.k());
+    }
+
+    MatrixCoord threadblock_offset(
+      threadblock_tile_idx.m() * Mma::Shape::kM,
+      threadblock_tile_idx.n() * Mma::Shape::kN
+    );
+
+    // Tile iterator writing to destination tensor D
+    typename Epilogue::OutputTileIterator iterator_D(
+      params.iterator_D,
+      params.ptr_D,
+      ConvOutputIteratorParameter::extent(params.problem_size),
+      thread_idx,
+      threadblock_offset
+    );
+    
+    // Tile iterator reading from source tensor C
+    typename Epilogue::OutputTileIterator iterator_C(
+      params.iterator_C,
+      params.ptr_C,
+      ConvOutputIteratorParameter::extent(params.problem_size),
+      thread_idx,
+      threadblock_offset
+    );
+
+    // Construct the epilogue on the epilogue member of the shared storage union
+    Epilogue epilogue(
+      shared_storage.epilogue, 
+      thread_idx, 
+      warp_idx, 
+      lane_idx);
+
+    // Wait on the semaphore - this latency may have been covered by iterator construction
+    if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) {
+        
+      // For subsequent threadblocks (k index > 0), the source matrix is held in the 'D' tensor.
+      if (threadblock_tile_idx.k()) {
+        iterator_C = iterator_D;
+      }
+
+      semaphore.wait(threadblock_tile_idx.k());
+
+    }
+    // Parallel split-K: each slice writes to its own location, k * implicit_gemm_tensor_c_size(...) past ptr_D
+    else if (params.split_k_mode == SplitKMode::kParallel) {
+      iterator_D.add_pointer_offset(threadblock_tile_idx.k() * 
+        cutlass::conv::implicit_gemm_tensor_c_size(ConvOperator, params.problem_size));
+    }
+
+    // Run the epilogue: output_op combines accumulators with the source read through iterator_C
+    epilogue(output_op, iterator_D, accumulators, iterator_C);
+  
+    //
+    // Release the semaphore
+    //
+
+    if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { 
+
+      int lock = 0;
+      if (params.grid_tiled_shape.k() == threadblock_tile_idx.k() + 1) {
+
+        // The final threadblock resets the semaphore to 0 for subsequent grids.
+        lock = 0;
+      }
+      else {
+        // Otherwise, the semaphore is set to the next k index (the value the next slice waits for)
+        lock = threadblock_tile_idx.k() + 1;
+      }
+      
+      semaphore.release(lock);
+    }
+  } 
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_strided_dgrad.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_strided_dgrad.h
new file mode 100755
index 000000000..bf00f90ba
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_strided_dgrad.h
@@ -0,0 +1,492 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a pipelined Implicit GEMM kernel specialized for strided dgrad convolution.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/array.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/semaphore.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+#include "cutlass/conv/conv3d_problem_size.h"
+#include "cutlass/epilogue/threadblock/output_iterator_parameter.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Mma_,                                  ///! Threadblock-scoped matrix multiply-accumulate
+  typename Epilogue_,                             ///! Epilogue
+  typename ThreadblockSwizzle_,                   ///! Threadblock swizzling function (a StridedDgrad* swizzle is enforced by static_assert)
+  conv::Operator ConvOperator,                    ///! Convolutional operator (Fprop, Dgrad, Wgrad)
+  typename ConvProblemSize_ = Conv2dProblemSize   ///! Problem size type for a 2D or 3D convolution (defaults to Conv2dProblemSize)
+>
+struct ImplicitGemmConvolutionStridedDgrad {
+  // Kernel functor: runs one implicit GEMM main loop plus epilogue per threadblock for strided dgrad.
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using EpilogueOutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  static Operator const kConvolutionalOperator = ConvOperator;
+
+  using ElementA = typename Mma::IteratorA::Element;
+  using LayoutA = typename Mma::IteratorA::Layout;
+  using ElementB = typename Mma::IteratorB::Element;
+  using LayoutB = typename Mma::IteratorB::Layout;
+  using ElementC = typename EpilogueOutputOp::ElementOutput;
+
+  /// Output tensor C uses the same layout as operand A
+  using LayoutC = LayoutA;
+
+  using ElementAccumulator = typename EpilogueOutputOp::ElementAccumulator;
+  using ElementCompute = typename EpilogueOutputOp::ElementCompute;
+
+  using WarpMmaOperator = typename Mma::Policy::Operator;
+
+  using ArchMmaOperator = typename WarpMmaOperator::ArchMmaOperator;
+  using MathOperator = typename ArchMmaOperator::Operator;
+  
+  using OperatorClass = typename WarpMmaOperator::OperatorClass;
+  using ArchTag = typename WarpMmaOperator::ArchTag;
+
+  using ThreadblockShape = typename Mma::Shape;
+  using WarpShape = typename WarpMmaOperator::Shape;
+  using InstructionShape = typename ArchMmaOperator::Shape;
+
+  static int const kStages = Mma::kStages;
+  static IteratorAlgorithm const kIteratorAlgorithm = Mma::IteratorA::kIteratorAlgorithm; 
+  static StrideSupport const kStrideSupport = Mma::IteratorA::kStrideSupport;
+  
+  /// Warp count (concept: GemmShape); kThreadCount assumes 32 threads per warp
+  using WarpCount = typename Mma::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;
+
+  using TensorRefA = typename Mma::IteratorA::TensorRef;
+  using TensorRefB = typename Mma::IteratorB::TensorRef;
+  using TensorRefC = cutlass::TensorRef<ElementC, LayoutC>;
+
+  /// Check that iterators A and B have the same convolution dimension, then
+  // expose it as kConvDim (cf. device::ImplicitGemmConvolution::kConvDim)
+  static_assert(Mma::IteratorA::kConvDim == Mma::IteratorB::kConvDim, 
+    "Convolution on different different dimensions is not supported");
+  static int const kConvDim = Mma::IteratorA::kConvDim;
+
+  /// Conv dimension and problem size structure (Conv2d or Conv3d)
+  using ConvProblemSize = ConvProblemSize_;
+
+  static conv::GroupMode const kGroupMode = conv::GroupMode::kNone;
+
+  /// Wgrad C stride idx for implicit gemm algorithm: 2 when LayoutC is TensorNHWC, otherwise 3
+  // Conv2d row-major matrix C (KxRSC)
+  // Conv3d row-major matrix C (KxTRSC)
+  static int const kWgradCStrideIdx = 
+    platform::is_same<LayoutC, cutlass::layout::TensorNHWC>::value ? 2 : 3;
+
+  /// This chooses the appropriate stride element of the C tensor (0 unless the operator is Wgrad).
+  static int const kTensorCStrideIdx = 
+    (kConvolutionalOperator == conv::Operator::kWgrad ? kWgradCStrideIdx : 0);
+
+  // Strided dgrad uses a specialized threadblock swizzle for functionality and performance;
+  static_assert((platform::is_same<ThreadblockSwizzle,
+                      threadblock::StridedDgradHorizontalThreadblockSwizzle>::value) ||
+                (platform::is_same<ThreadblockSwizzle,
+                      threadblock::StridedDgradIdentityThreadblockSwizzle<1>>::value) ||
+                (platform::is_same<ThreadblockSwizzle,
+                      threadblock::StridedDgradIdentityThreadblockSwizzle<4>>::value) ||
+                (platform::is_same<ThreadblockSwizzle,
+                      threadblock::StridedDgradIdentityThreadblockSwizzle<8>>::value),
+    "Needs ThreadblockSwizzle type specialized for strided dgrad");
+
+  //
+  // Supplies the layout() and extent() helpers used below to set up the C/D output tile iterators
+  //
+  using ConvOutputIteratorParameter = epilogue::threadblock::ConvOutputIteratorParameter<
+    LayoutC,
+    typename Epilogue::OutputTileIterator::Layout, 
+    TensorRefC,
+    ConvOperator,
+    ConvProblemSize
+    >;
+
+  /// Argument structure: the user-facing description of one convolution launch
+  struct Arguments {
+
+    //
+    // Data members (all value-initialized by default)
+    //
+
+    ConvProblemSize problem_size{};
+    TensorRefA ref_A{};
+    TensorRefB ref_B{};
+    TensorRefC ref_C{};
+    TensorRefC ref_D{};
+    typename EpilogueOutputOp::Params output_op{};
+    SplitKMode split_k_mode{};
+
+    //
+    // Methods
+    //
+
+    /// Default ctor
+    Arguments() = default;
+    /// Constructs arguments carrying only the problem size; all other members keep their defaults
+    CUTLASS_HOST_DEVICE 
+    Arguments(
+      ConvProblemSize const & problem_size
+    ):
+      problem_size(problem_size) { }
+    /// Constructs fully-specified arguments; split_k_mode defaults to SplitKMode::kSerial
+    CUTLASS_HOST_DEVICE
+    Arguments(
+      ConvProblemSize const & problem_size,
+      TensorRefA const & ref_A,
+      TensorRefB const & ref_B,
+      TensorRefC const & ref_C,
+      TensorRefC const & ref_D,
+      typename EpilogueOutputOp::Params const & output_op,
+      SplitKMode const & split_k_mode = SplitKMode::kSerial
+    ):
+      problem_size(problem_size),
+      ref_A(ref_A),
+      ref_B(ref_B),
+      ref_C(ref_C),
+      ref_D(ref_D),
+      output_op(output_op),
+      split_k_mode(split_k_mode)
+    {
+
+    }
+
+  };
+
+  /// Parameters structure: the precomputed state read by operator() on the device
+  struct Params {
+    ConvProblemSize problem_size{};
+    cutlass::gemm::GemmCoord grid_tiled_shape{};
+    int swizzle_log_tile{0};
+    FastDivmod stride_h_divmod{};
+    FastDivmod stride_w_divmod{};
+    int gemm_k_iterations{0};
+    typename Mma::IteratorA::Params iterator_A{};
+    typename Mma::IteratorA::Element const *ptr_A = nullptr;
+    typename Mma::IteratorB::Params iterator_B{};
+    typename Mma::IteratorB::Element const *ptr_B = nullptr;
+    typename Epilogue::OutputTileIterator::Params iterator_C{};
+    typename Epilogue::OutputTileIterator::Element *ptr_C = nullptr;
+    typename Epilogue::OutputTileIterator::Params iterator_D{};
+    typename Epilogue::OutputTileIterator::Element *ptr_D = nullptr;
+    typename EpilogueOutputOp::Params output_op {};
+    int *semaphore = nullptr;
+    SplitKMode split_k_mode {};
+
+    //
+    // Methods
+    //
+    Params() = default;
+
+    /// Builds kernel parameters from Arguments; 'semaphore' (may be nullptr) is consumed by the serial split-K path
+    CUTLASS_HOST_DEVICE
+    Params(
+      Arguments const &args,
+      int *semaphore = nullptr
+    ):
+      problem_size(args.problem_size),
+      stride_h_divmod(args.problem_size.stride_h),
+      stride_w_divmod(args.problem_size.stride_w),
+      iterator_A(Mma::IteratorA::getParams(args.problem_size, args.ref_A.layout())),
+      ptr_A(args.ref_A.data()),
+      iterator_B(args.problem_size, args.ref_B.layout()),
+      ptr_B(args.ref_B.data()),
+      iterator_C(ConvOutputIteratorParameter::layout(args.ref_C), args.problem_size, ThreadblockShape::kM),
+      ptr_C(args.ref_C.data()),
+      iterator_D(ConvOutputIteratorParameter::layout(args.ref_D), args.problem_size, ThreadblockShape::kM),
+      ptr_D(args.ref_D.data()),
+      output_op(args.output_op),
+      semaphore(semaphore),
+      split_k_mode(args.split_k_mode)
+    {
+      gemm_k_iterations = implicit_gemm_k_iterations(kConvolutionalOperator, ThreadblockShape::kK, args.problem_size);
+
+      ThreadblockSwizzle threadblock_swizzle;
+
+      grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
+        kConvolutionalOperator,
+        args.problem_size,
+        {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+        args.problem_size.split_k_slices);
+      
+      swizzle_log_tile = threadblock_swizzle.get_log_tile(grid_tiled_shape);
+    }
+  };
+
+  /// Shared memory storage structure; a union, so main loop and epilogue storage overlap
+  union SharedStorage {
+    typename Mma::SharedStorage main_loop;
+    typename Epilogue::SharedStorage epilogue;
+  };
+  
+  //
+  // Methods
+  //
+
+  CUTLASS_HOST_DEVICE
+  ImplicitGemmConvolutionStridedDgrad() { } 
+
+  /// Executes one ImplicitGEMM
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Compute threadblock location
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord threadblock_tile_idx =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if CTA is out of range
+    if (params.grid_tiled_shape.m() <= threadblock_tile_idx.m() ||
+      params.grid_tiled_shape.n() <= threadblock_tile_idx.n()) {
+
+      return;
+    }
+
+    // Compute position within threadblock
+    int thread_idx = threadIdx.x;
+
+    // Compute starting filter position for strided dgrad
+    int tile_m_per_filter = strided_dgrad_tile_m_per_filter(params.problem_size, 
+                                                            ThreadblockShape::kM);
+    int filter_tile_m = (threadblock_tile_idx.m() / tile_m_per_filter);
+    
+
+    // The subsequent fast_divmod() operations are equivalent to the following logical computation:
+    //
+    // int start_r = filter_tile_m / (params.problem_size.stride_w);
+    // int start_s = filter_tile_m % (params.problem_size.stride_w);
+
+    int start_r, start_s;
+    params.stride_w_divmod(start_r, start_s, filter_tile_m);
+
+    int filter_r = start_r;
+    int filter_s = start_s;
+
+    if (params.problem_size.mode == Mode::kConvolution) {
+      filter_r = (params.problem_size.R - 1 - filter_r);
+      filter_s = (params.problem_size.S - 1 - filter_s);
+    }
+
+    // Starting h, w positions for filter position in gemm_k=0
+    int start_h, start_w;
+    strided_dgrad_starting_coords(
+      params.problem_size,
+      params.stride_h_divmod, params.stride_w_divmod,
+      filter_r, filter_s,
+      start_h, start_w);
+    // Nothing to do for this CTA when start_h / start_w are not below problem_size.H / problem_size.W
+    if (start_h >= params.problem_size.H || start_w >= params.problem_size.W) {
+      return;
+    }
+
+    typename Mma::FragmentC accumulators;
+
+    accumulators.clear();
+
+    // Broadcast the warp_id computed by lane 0 to ensure dependent code
+    // is compiled as warp-uniform.
+    int warp_idx = canonical_warp_idx_sync();
+    int lane_idx = threadIdx.x % 32;
+
+    // Check if CTA contributes valid MMA (Dy * w) and accumulator will be non-zero after MMA
+    if (start_r < params.problem_size.R && start_s < params.problem_size.S) {
+      // Scale gemm_k_iterations for strided dgrad
+      int gemm_k_iterations = (params.gemm_k_iterations / (params.problem_size.R * params.problem_size.S)
+                              ) * params.problem_size.num_gemm_k_filter_positions(start_r, start_s);
+      
+      // Construct iterators to A and B operands
+      typename Mma::IteratorA iterator_A(
+        params.iterator_A,
+        params.problem_size,
+        params.ptr_A,
+        thread_idx,
+        params.stride_h_divmod, params.stride_w_divmod,
+        start_r, start_s,
+        MatrixCoord(
+          threadblock_tile_idx.m() * Mma::Shape::kM,
+          threadblock_tile_idx.k() * Mma::Shape::kK
+        ) 
+      );
+      
+      typename Mma::IteratorB iterator_B(
+        params.iterator_B,
+        params.problem_size,
+        params.ptr_B,
+        thread_idx,
+        start_r, start_s,
+        MatrixCoord(
+          threadblock_tile_idx.k() * Mma::Shape::kK,
+          threadblock_tile_idx.n() * Mma::Shape::kN
+        )
+      );
+
+      //
+      // Main loop
+      //
+
+      // Construct threadblock-scoped matrix multiply
+      Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
+
+      // Compute threadblock-scoped matrix multiply-add
+      mma(gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators);
+    }
+
+    //
+    // Epilogue
+    // (accumulators are still in their cleared state if the main loop above was skipped)
+
+    EpilogueOutputOp output_op(params.output_op);
+
+    // Construct the semaphore; the lock is selected per (m, n) output tile via block_idx.
+    int block_idx = threadblock_tile_idx.m() + threadblock_tile_idx.n() * params.grid_tiled_shape.m();
+    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
+
+    // Compute logical position within grid (same call as at the top of the kernel)
+    threadblock_tile_idx =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // If performing a reduction via split-K, fetch the initial synchronization
+    if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) {
+
+      // Fetch the synchronization lock initially but do not block.
+      semaphore.fetch();
+
+      // Indicate which position in a serial reduction the output operator is currently updating
+      output_op.set_k_partition(threadblock_tile_idx.k(), params.grid_tiled_shape.k());
+    }
+
+    MatrixCoord threadblock_offset(
+      threadblock_tile_idx.m() * Mma::Shape::kM,
+      threadblock_tile_idx.n() * Mma::Shape::kN
+    );
+    // NOTE(review): no SplitKMode::kParallel pointer offset is applied to iterator_D in this kernel - confirm intended
+    // Tile iterator writing to destination tensor
+    typename Epilogue::OutputTileIterator iterator_D(
+      params.iterator_D,
+      params.ptr_D,
+      ConvOutputIteratorParameter::extent(params.problem_size),
+      thread_idx,
+      params.stride_h_divmod, params.stride_w_divmod,
+      start_r, start_s,
+      threadblock_offset
+    );
+
+    // Construct the epilogue
+    Epilogue epilogue(
+      shared_storage.epilogue,
+      thread_idx,
+      warp_idx,
+      lane_idx);
+
+    if (output_op.is_source_needed())
+    {
+      // Tile iterator reading from source accumulator tensor
+      typename Epilogue::OutputTileIterator iterator_C(
+        params.iterator_C,
+        params.ptr_C,
+        ConvOutputIteratorParameter::extent(params.problem_size),
+        thread_idx,
+        params.stride_h_divmod, params.stride_w_divmod,
+        start_r, start_s,
+        threadblock_offset);
+
+      // Wait on the semaphore - this latency may have been covered by iterator construction
+      if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) {
+
+        // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
+        if (threadblock_tile_idx.k()) {
+          iterator_C = iterator_D;
+        }
+
+        semaphore.wait(threadblock_tile_idx.k());
+      }
+
+      // Run epilogue with addend source iterator
+      epilogue(output_op, iterator_D, accumulators, iterator_C);
+    }
+    else
+    {
+      // Run epilogue without addend source iterator
+      epilogue(output_op, iterator_D, accumulators);
+    }
+
+    //
+    // Release the semaphore (serial split-K only)
+    //
+
+    if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) {
+
+      int lock = 0;
+      if (params.grid_tiled_shape.k() == threadblock_tile_idx.k() + 1) {
+
+        // The final threadblock resets the semaphore for subsequent grids.
+        lock = 0;
+      }
+      else {
+        // Otherwise, the semaphore is incremented
+        lock = threadblock_tile_idx.k() + 1;
+      }
+
+      semaphore.release(lock);
+    }
+
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_with_absmax.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_with_absmax.h
new file mode 100755
index 000000000..b05fd2d3e
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_with_absmax.h
@@ -0,0 +1,494 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Convolution kernel with an epilogue that computes the absolute maximum value of the output
+    and a pre-activation-function auxiliary output. The auxiliary output is also (optionally)
+    stored to global memory.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/array.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/semaphore.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+#include "cutlass/conv/conv3d_problem_size.h"
+#include "cutlass/epilogue/threadblock/output_iterator_parameter.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Mma_,                                  ///! Threadblock-scoped matrix multiply-accumulate
+  typename Epilogue_,                             ///! Epilogue (must provide AuxOutputTileIterator and ElementVector)
+  typename ThreadblockSwizzle_,                   ///! Threadblock swizzling function
+  conv::Operator ConvOperator,                    ///! Convolutional operator (Fprop, Dgrad, Wgrad)
+  typename ConvProblemSize_ = Conv2dProblemSize   ///! Problem size type for a 2D or 3D convolution (defaults to Conv2dProblemSize)
+>
+struct ImplicitGemmConvolutionWithAbsMax {
+  // Kernel functor: implicit GEMM convolution whose epilogue also receives an auxiliary output iterator and a vector pointer.
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using EpilogueOutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  static Operator const kConvolutionalOperator = ConvOperator;
+
+  using ElementA = typename Mma::IteratorA::Element;
+  using LayoutA = typename Mma::IteratorA::Layout;
+  using ElementB = typename Mma::IteratorB::Element;
+  using LayoutB = typename Mma::IteratorB::Layout;
+  using ElementC = typename EpilogueOutputOp::ElementOutput;
+
+  /// Output tensor C uses the same layout as operand A
+  using LayoutC = LayoutA;
+
+  using ElementAccumulator = typename EpilogueOutputOp::ElementAccumulator;
+  using ElementCompute = typename EpilogueOutputOp::ElementCompute;
+
+  using WarpMmaOperator = typename Mma::Policy::Operator;
+
+  using ArchMmaOperator = typename WarpMmaOperator::ArchMmaOperator;
+  using MathOperator = typename ArchMmaOperator::Operator;
+  
+  using OperatorClass = typename WarpMmaOperator::OperatorClass;
+  using ArchTag = typename WarpMmaOperator::ArchTag;
+
+  using ThreadblockShape = typename Mma::Shape;
+  using WarpShape = typename WarpMmaOperator::Shape;
+  using InstructionShape = typename ArchMmaOperator::Shape;
+
+  static int const kStages = Mma::kStages;
+  static IteratorAlgorithm const kIteratorAlgorithm = Mma::IteratorA::kIteratorAlgorithm; 
+  static StrideSupport const kStrideSupport = Mma::IteratorA::kStrideSupport;
+
+  /// Warp count (concept: GemmShape); kThreadCount assumes 32 threads per warp
+  using WarpCount = typename Mma::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;
+
+  using TensorRefA = typename Mma::IteratorA::TensorRef;
+  using TensorRefB = typename Mma::IteratorB::TensorRef;
+  using TensorRefC = cutlass::TensorRef<ElementC, LayoutC>;
+  using TensorRefAux = cutlass::TensorRef<typename EpilogueOutputOp::ElementAuxOutput, LayoutC>;
+
+  /// Check that iterators A and B have the same convolution dimension, then
+  // expose it as kConvDim (cf. device::ImplicitGemmConvolution::kConvDim)
+  static_assert(Mma::IteratorA::kConvDim == Mma::IteratorB::kConvDim, 
+    "Convolution on different different dimensions is not supported");
+  static int const kConvDim = Mma::IteratorA::kConvDim;
+
+  /// Conv dimension and problem size structure (Conv2d or Conv3d)
+  using ConvProblemSize = ConvProblemSize_;
+
+  static conv::GroupMode const kGroupMode = conv::GroupMode::kNone;
+
+  /// Wgrad C stride idx for implicit gemm algorithm: 2 when LayoutC is TensorNHWC, otherwise 3
+  // Conv2d row-major matrix C (KxRSC)
+  // Conv3d row-major matrix C (KxTRSC)
+  static int const kWgradCStrideIdx = 
+    platform::is_same<LayoutC, cutlass::layout::TensorNHWC>::value ? 2 : 3;
+
+  /// This chooses the appropriate stride element of the C tensor (0 unless the operator is Wgrad).
+  static int const kTensorCStrideIdx = 
+    (kConvolutionalOperator == conv::Operator::kWgrad ? kWgradCStrideIdx : 0);
+
+  //
+  // Supplies the layout() and extent() helpers used below to set up the C, D and Aux tile iterators
+  //
+  using ConvOutputIteratorParameter = epilogue::threadblock::ConvOutputIteratorParameter<
+    LayoutC,
+    typename Epilogue::OutputTileIterator::Layout, 
+    TensorRefC,
+    ConvOperator,
+    ConvProblemSize
+    >;
+
+  /// Argument structure: the user-facing description of one convolution launch
+  struct Arguments {
+
+    //
+    // Data members
+    //
+
+    ConvProblemSize problem_size;
+    TensorRefA ref_A;
+    TensorRefB ref_B;
+    TensorRefC ref_C;
+    TensorRefC ref_D;
+    TensorRefC ref_Aux;  // NOTE(review): declared TensorRefC although the ctor below takes a TensorRefAux - confirm
+
+    typename EpilogueOutputOp::Params output_op;
+    SplitKMode split_k_mode;
+    // Untyped pointer that operator() casts to Epilogue::ElementVector* and forwards to the epilogue
+    void * ptr_Vector;
+    // Multiplied by the threadblock's m tile index when ptr_Vector is offset in operator()
+    typename LayoutC::Stride::Index ldr;
+
+    //
+    // Methods
+    //
+
+    /// Default ctor; does not explicitly initialize any member
+    CUTLASS_HOST_DEVICE
+    Arguments() { }
+    /// Initializes only problem_size
+    CUTLASS_HOST_DEVICE 
+    Arguments(
+      ConvProblemSize const & problem_size
+    ):
+      problem_size(problem_size) { }
+    /// Fully-specified arguments; defaults: split_k_mode = kSerial, ptr_Vector = nullptr, ldr = 0
+    CUTLASS_HOST_DEVICE
+    Arguments(
+      ConvProblemSize const & problem_size,
+      TensorRefA const & ref_A,
+      TensorRefB const & ref_B,
+      TensorRefC const & ref_C,
+      TensorRefC const & ref_D,
+      TensorRefAux const & ref_Aux,
+      typename EpilogueOutputOp::Params const & output_op,
+      SplitKMode const & split_k_mode = SplitKMode::kSerial,
+      void * ptr_Vector = nullptr,
+      typename LayoutC::Stride::Index ldr = 0
+    ):
+      problem_size(problem_size),
+      ref_A(ref_A),
+      ref_B(ref_B),
+      ref_C(ref_C),
+      ref_D(ref_D),
+      ref_Aux(ref_Aux),
+      output_op(output_op),
+      split_k_mode(split_k_mode),
+      ptr_Vector(ptr_Vector),
+      ldr(ldr)
+    {
+
+    }
+
+  };
+
+  /// Parameters structure: the precomputed state read by operator() on the device
+  struct Params {
+    ConvProblemSize problem_size;
+    cutlass::gemm::GemmCoord grid_tiled_shape;
+    gemm::GemmCoord implicit_gemm_problem_size;
+    int swizzle_log_tile;
+
+    int gemm_k_iterations;
+    typename Mma::IteratorA::Params iterator_A;
+    typename Mma::IteratorA::Element const *ptr_A;
+    typename Mma::IteratorB::Params iterator_B;
+    typename Mma::IteratorB::Element const *ptr_B;
+    typename Epilogue::OutputTileIterator::Params iterator_C;
+    typename Epilogue::OutputTileIterator::Element *ptr_C;
+    typename Epilogue::OutputTileIterator::Params iterator_D;
+    typename Epilogue::OutputTileIterator::Element *ptr_D;
+    typename Epilogue::AuxOutputTileIterator::Params iterator_Aux;
+    typename Epilogue::AuxOutputTileIterator::Element *ptr_Aux;
+    typename EpilogueOutputOp::Params output_op;
+    int *semaphore;
+    SplitKMode split_k_mode;
+
+    void * ptr_Vector;
+    typename LayoutC::Stride::Index ldr;
+
+    //
+    // Methods
+    //
+    // NOTE(review): this default ctor explicitly initializes only swizzle_log_tile, gemm_k_iterations, ptr_Vector and ldr
+    CUTLASS_HOST_DEVICE
+    Params():
+      swizzle_log_tile(0), 
+      gemm_k_iterations(0),
+      ptr_Vector(nullptr),
+      ldr(0)
+    { }
+
+    /// Builds kernel parameters from Arguments; 'semaphore' (may be nullptr) is consumed by the serial split-K path
+    CUTLASS_HOST_DEVICE
+    Params(
+      Arguments const &args,
+      int *semaphore = nullptr
+    ):
+      problem_size(args.problem_size),
+      implicit_gemm_problem_size(cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size)),
+      iterator_A(Mma::IteratorA::getParams(args.problem_size, args.ref_A.layout())),
+      ptr_A(args.ref_A.data()),
+      iterator_B(args.problem_size, args.ref_B.layout()),
+      ptr_B(args.ref_B.data()),
+      iterator_C(ConvOutputIteratorParameter::layout(args.ref_C)),
+      ptr_C(args.ref_C.data()),
+      iterator_D(ConvOutputIteratorParameter::layout(args.ref_D)),
+      ptr_D(args.ref_D.data()),
+      iterator_Aux(ConvOutputIteratorParameter::layout(args.ref_Aux)),
+      ptr_Aux(args.ref_Aux.data()),
+      output_op(args.output_op),
+      semaphore(semaphore),
+      split_k_mode(args.split_k_mode),
+      ptr_Vector(args.ptr_Vector), 
+      ldr(args.ldr)
+
+    {
+      gemm_k_iterations = implicit_gemm_k_iterations(kConvolutionalOperator, ThreadblockShape::kK, args.problem_size);
+
+      ThreadblockSwizzle threadblock_swizzle;
+
+      grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
+        implicit_gemm_problem_size,
+        {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+        args.problem_size.split_k_slices);
+
+      swizzle_log_tile = threadblock_swizzle.get_log_tile(grid_tiled_shape);
+    }
+  };
+
+  /// Shared memory storage structure; a union, so main loop and epilogue storage overlap
+  union SharedStorage {
+    typename Mma::SharedStorage main_loop;
+    typename Epilogue::SharedStorage epilogue;
+  };
+
+  //
+  // Methods
+  //
+
+  CUTLASS_HOST_DEVICE
+  ImplicitGemmConvolutionWithAbsMax() { } 
+
+  /// Executes one ImplicitGEMM
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Compute threadblock location
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord threadblock_tile_idx =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if CTA is out of range
+    if (params.grid_tiled_shape.m() <= threadblock_tile_idx.m() ||
+      params.grid_tiled_shape.n() <= threadblock_tile_idx.n()) {
+
+      return;
+    }
+
+    // Compute position within threadblock
+    int thread_idx = threadIdx.x;
+
+    // Construct iterators to A and B operands
+    typename Mma::IteratorA iterator_A(
+      params.iterator_A,
+      params.problem_size,
+      params.ptr_A,
+      thread_idx,
+      MatrixCoord(
+        threadblock_tile_idx.m() * Mma::Shape::kM,
+        threadblock_tile_idx.k() * Mma::Shape::kK
+      )
+    );
+    
+    typename Mma::IteratorB iterator_B(
+      params.iterator_B,
+      params.problem_size,
+      params.ptr_B,
+      thread_idx,
+      MatrixCoord(
+        threadblock_tile_idx.k() * Mma::Shape::kK,
+        threadblock_tile_idx.n() * Mma::Shape::kN
+      )
+    );
+
+    // Broadcast the warp_id computed by lane 0 to ensure dependent code
+    // is compiled as warp-uniform.
+    int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
+    int lane_idx = threadIdx.x % 32;
+
+    //
+    // Main loop
+    //
+
+    // Construct threadblock-scoped matrix multiply
+    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
+
+    typename Mma::FragmentC accumulators;
+
+    accumulators.clear();
+
+    // Compute threadblock-scoped matrix multiply-add
+    mma(params.gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators);
+
+    //
+    // Epilogue
+    //
+
+    EpilogueOutputOp output_op(params.output_op);
+
+    // Construct the semaphore; the lock is selected per (m, n) output tile via block_idx.
+    int block_idx = threadblock_tile_idx.m() + threadblock_tile_idx.n() * params.grid_tiled_shape.m();
+
+    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
+    
+    // Compute logical position within grid (same call as at the top of the kernel)
+    threadblock_tile_idx =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // If performing a reduction via split-K, fetch the initial synchronization
+    if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) {
+        
+      // Fetch the synchronization lock initially but do not block.
+      semaphore.fetch();
+
+      // Indicate which position in a serial reduction the output operator is currently updating
+      output_op.set_k_partition(threadblock_tile_idx.k(), params.grid_tiled_shape.k());
+    }
+
+    MatrixCoord threadblock_offset(
+      threadblock_tile_idx.m() * Mma::Shape::kM,
+      threadblock_tile_idx.n() * Mma::Shape::kN
+    );
+
+    // Tile iterator writing to destination tensor
+    typename Epilogue::OutputTileIterator iterator_D(
+      params.iterator_D,
+      params.ptr_D,
+      ConvOutputIteratorParameter::extent(params.problem_size),
+      thread_idx,
+      threadblock_offset
+    );
+
+    // Tile iterator writing to auxiliary tensor.
+    typename Epilogue::AuxOutputTileIterator iterator_Aux(
+      params.iterator_Aux,
+      params.ptr_Aux,
+      ConvOutputIteratorParameter::extent(params.problem_size),
+      thread_idx,
+      threadblock_offset
+    );
+
+    // Tile iterator reading from source accumulator tensor
+    typename Epilogue::OutputTileIterator iterator_C(
+      params.iterator_C,
+      params.ptr_C,
+      ConvOutputIteratorParameter::extent(params.problem_size),
+      thread_idx,
+      threadblock_offset
+    );
+
+    // Define the reduction output pointer (it is moved to this tile's location further below)
+    typename Epilogue::ElementVector *ptr_Vector = 
+      static_cast<typename Epilogue::ElementVector *>(params.ptr_Vector);
+
+
+    // Construct the epilogue
+    Epilogue epilogue(
+      shared_storage.epilogue, 
+      thread_idx, 
+      warp_idx, 
+      lane_idx);
+
+    // Move to appropriate location for this output tile (skipped when no vector pointer was supplied)
+    if (ptr_Vector) {
+      ptr_Vector += threadblock_offset.column() + threadblock_tile_idx.m() * params.ldr;
+    }
+
+    // Wait on the semaphore - this latency may have been covered by iterator construction
+    if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) {
+        
+      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
+      if (threadblock_tile_idx.k()) {
+        iterator_C = iterator_D;
+      }
+
+      semaphore.wait(threadblock_tile_idx.k());
+
+    }
+    // Each split-k-slice writes to a unique tensor location
+    else if (params.split_k_mode == SplitKMode::kParallel) {
+      iterator_D.add_pointer_offset(threadblock_tile_idx.k() * 
+        cutlass::conv::implicit_gemm_tensor_c_size(ConvOperator, params.problem_size));
+    }
+
+    // Execute the epilogue operator to update the destination tensor.
+    epilogue(output_op,
+             // In serial split-K only the final k-slice passes ptr_Vector; earlier slices pass nullptr
+             ((params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) &&
+              (params.grid_tiled_shape.k() != threadblock_tile_idx.k() + 1))
+                 ? nullptr
+                 : ptr_Vector,
+             iterator_D,
+             accumulators,
+             iterator_C,
+             iterator_Aux,
+             ConvOutputIteratorParameter::extent(params.problem_size),
+             threadblock_offset);
+  
+    //
+    // Release the semaphore (serial split-K only)
+    //
+
+    if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { 
+
+      int lock = 0;
+      if (params.grid_tiled_shape.k() == threadblock_tile_idx.k() + 1) {
+
+        // The final threadblock resets the semaphore for subsequent grids.
+        lock = 0;
+      }
+      else {
+        // Otherwise, the semaphore is incremented
+        lock = threadblock_tile_idx.k() + 1;
+      }
+      
+      semaphore.release(lock);
+    }
+  } 
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h
new file mode 100755
index 000000000..1f27e0686
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h
@@ -0,0 +1,499 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a pipelined Implicit GEMM kernel.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/array.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/semaphore.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+#include "cutlass/conv/conv3d_problem_size.h"
+#include "cutlass/epilogue/threadblock/output_iterator_parameter.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Mma_,                                  ///! Threadblock-scoped matrix multiply-accumulate 
+  typename Epilogue_,                             ///! Epilogue
+  typename ThreadblockSwizzle_,                   ///! Threadblock swizzling function
+  conv::Operator ConvOperator,                    ///! Convolutional operator (Fprop, Dgrad, Wgrad, Deconv)
+  typename ConvProblemSize_ = Conv2dProblemSize   ///! Convolutional operator on 2D or 3D problem
+>
+struct ImplicitGemmConvolutionWithFusedEpilogue {
+
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using EpilogueOutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  static Operator const kConvolutionalOperator = ConvOperator;
+
+  using ElementA = typename Mma::IteratorA::Element;
+  using LayoutA = typename Mma::IteratorA::Layout;
+  using ElementB = typename Mma::IteratorB::Element;
+  using LayoutB = typename Mma::IteratorB::Layout;
+  using ElementC = typename EpilogueOutputOp::ElementOutput;
+
+  /// Set output tensor C layout
+  using LayoutC = LayoutA;
+
+  using ElementAccumulator = typename EpilogueOutputOp::ElementAccumulator;
+  using ElementCompute = typename EpilogueOutputOp::ElementCompute;
+
+  using WarpMmaOperator = typename Mma::Policy::Operator;
+
+  using ArchMmaOperator = typename WarpMmaOperator::ArchMmaOperator;
+  using MathOperator = typename ArchMmaOperator::Operator;
+  
+  using OperatorClass = typename WarpMmaOperator::OperatorClass;
+  using ArchTag = typename WarpMmaOperator::ArchTag;
+
+  using ThreadblockShape = typename Mma::Shape;
+  using WarpShape = typename WarpMmaOperator::Shape;
+  using InstructionShape = typename ArchMmaOperator::Shape;
+
+  static int const kStages = Mma::kStages;
+  static IteratorAlgorithm const kIteratorAlgorithm = Mma::IteratorA::kIteratorAlgorithm; 
+  static StrideSupport const kStrideSupport = Mma::IteratorA::kStrideSupport;
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;
+
+  using TensorRefA = typename Mma::IteratorA::TensorRef;
+  using TensorRefB = typename Mma::IteratorB::TensorRef;
+  using TensorRefC = cutlass::TensorRef<ElementC, LayoutC>;
+
+  /// Check iterator A and B convolution dimension are the same and 
+  // set device::ImplicitGemmConvolution::kConvDim
+  static_assert(Mma::IteratorA::kConvDim == Mma::IteratorB::kConvDim, 
+    "Convolution on different different dimensions is not supported");
+  static int const kConvDim = Mma::IteratorA::kConvDim;
+
+  /// Conv dimension and problem size structure (Conv2d or Conv3d)
+  using ConvProblemSize = ConvProblemSize_;
+
+  static conv::GroupMode const kGroupMode = conv::GroupMode::kNone;
+
+  /// Wgrad C stride idx for implicit gemm algorithm 
+  // Conv2d row-major matrix C (KxRSC) 
+  // Conv3d row-major matrix C (KxTRSC)
+  static int const kWgradCStrideIdx = 
+    platform::is_same<LayoutC, cutlass::layout::TensorNHWC>::value ? 2 : 3;
+
+  /// This chooses the appropriate stride element of the C tensor.
+  static int const kTensorCStrideIdx = 
+    (kConvolutionalOperator == conv::Operator::kWgrad ? kWgradCStrideIdx : 0);
+
+  //
+  //
+  //
+  using ConvOutputIteratorParameter = epilogue::threadblock::ConvOutputIteratorParameter<
+    LayoutC,
+    typename Epilogue::OutputTileIterator::Layout, 
+    TensorRefC,
+    ConvOperator,
+    ConvProblemSize
+    >;
+
+  /// Argument structure (problem size, operand refs, epilogue params, split-K mode, aux buffers)
+  struct Arguments {
+
+    //
+    // Data members (initializers below apply to any constructor that does not set the member)
+    //
+
+    ConvProblemSize problem_size;
+    TensorRefA ref_A;
+    TensorRefB ref_B;
+    TensorRefC ref_C;
+    TensorRefC ref_D;
+
+    typename EpilogueOutputOp::Params output_op;
+    SplitKMode split_k_mode = SplitKMode::kSerial;
+    // Auxiliary buffers default to null so a partially constructed Arguments never carries a wild pointer
+    void * ptr_Vector = nullptr;
+    void * ptr_Tensor = nullptr;
+    // Strides paired with the buffers above (ldr with ptr_Vector, ldt with ptr_Tensor)
+    typename LayoutC::Stride::Index ldr = 0;
+    typename LayoutC::Stride::Index ldt = 0;
+
+    //
+    // Methods
+    //
+
+    /// Default ctor: members take the defaults declared above
+    CUTLASS_HOST_DEVICE
+    Arguments() { }
+    /// Problem-size-only ctor: all other members take the defaults declared above
+    CUTLASS_HOST_DEVICE 
+    Arguments(
+      ConvProblemSize const & problem_size
+    ):
+      problem_size(problem_size) { }
+    /// Fully specified ctor; the trailing parameters mirror the member defaults
+    CUTLASS_HOST_DEVICE
+    Arguments(
+      ConvProblemSize const & problem_size,
+      TensorRefA const & ref_A,
+      TensorRefB const & ref_B,
+      TensorRefC const & ref_C,
+      TensorRefC const & ref_D,
+      typename EpilogueOutputOp::Params const & output_op,
+      SplitKMode const & split_k_mode = SplitKMode::kSerial,
+      void * ptr_Vector = nullptr,
+      void * ptr_Tensor = nullptr,
+      typename LayoutC::Stride::Index ldr = 0,
+      typename LayoutC::Stride::Index ldt = 0
+    ):
+      problem_size(problem_size),
+      ref_A(ref_A),
+      ref_B(ref_B),
+      ref_C(ref_C),
+      ref_D(ref_D),
+      output_op(output_op),
+      split_k_mode(split_k_mode),
+      ptr_Vector(ptr_Vector),
+      ptr_Tensor(ptr_Tensor),
+      ldr(ldr),
+      ldt(ldt)
+    {
+
+    }
+
+  };
+
+  /// Parameters structure: the state operator() reads, derived from Arguments
+  struct Params {
+    ConvProblemSize problem_size;
+    cutlass::gemm::GemmCoord grid_tiled_shape;
+    gemm::GemmCoord implicit_gemm_problem_size;
+    int swizzle_log_tile;
+
+    int gemm_k_iterations;
+    typename Mma::IteratorA::Params iterator_A;
+    typename Mma::IteratorA::Element const *ptr_A = nullptr;
+    typename Mma::IteratorB::Params iterator_B;
+    typename Mma::IteratorB::Element const *ptr_B = nullptr;
+    typename Epilogue::OutputTileIterator::Params iterator_C;
+    typename Epilogue::OutputTileIterator::Element *ptr_C = nullptr;
+    typename Epilogue::OutputTileIterator::Params iterator_D;
+    typename Epilogue::OutputTileIterator::Element *ptr_D = nullptr;
+    typename EpilogueOutputOp::Params output_op;
+    int *semaphore = nullptr;
+    SplitKMode split_k_mode = SplitKMode::kSerial;
+    // Fused-epilogue extras: iterator params for the auxiliary tensor plus the raw aux pointers
+    typename Epilogue::TensorTileIterator::Params params_Tensor;
+    void * ptr_Vector;
+    typename LayoutC::Stride::Index ldr;
+    void * ptr_Tensor;
+
+    //
+    // Methods
+    //
+    /// Default ctor: members not listed here fall back to the initializers declared above
+    CUTLASS_HOST_DEVICE
+    Params():
+      swizzle_log_tile(0), 
+      gemm_k_iterations(0),
+      ptr_Vector(nullptr),
+      ldr(0),
+      ptr_Tensor(nullptr)
+    { }
+
+    /// Builds the parameters from Arguments; `semaphore` is the pointer operator() wraps in a Semaphore
+    CUTLASS_HOST_DEVICE
+    Params(
+      Arguments const &args,
+      int *semaphore = nullptr
+    ):
+      problem_size(args.problem_size),
+      implicit_gemm_problem_size(cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size)),
+      iterator_A(Mma::IteratorA::getParams(args.problem_size, args.ref_A.layout())),
+      ptr_A(args.ref_A.data()),
+      iterator_B(args.problem_size, args.ref_B.layout()),
+      ptr_B(args.ref_B.data()),
+      iterator_C(ConvOutputIteratorParameter::layout(args.ref_C), implicit_gemm_tensor_c_extent(kConvolutionalOperator, args.problem_size)),
+      ptr_C(args.ref_C.data()),
+      iterator_D(ConvOutputIteratorParameter::layout(args.ref_D), implicit_gemm_tensor_c_extent(kConvolutionalOperator, args.problem_size)),
+      ptr_D(args.ref_D.data()),
+      output_op(args.output_op),
+      semaphore(semaphore),
+      split_k_mode(args.split_k_mode),
+      params_Tensor(args.ldt),
+      ptr_Vector(args.ptr_Vector), 
+      ldr(args.ldr),
+      ptr_Tensor(args.ptr_Tensor)
+
+    {
+      gemm_k_iterations = implicit_gemm_k_iterations(kConvolutionalOperator, ThreadblockShape::kK, args.problem_size);
+
+      ThreadblockSwizzle threadblock_swizzle;
+
+      grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
+        implicit_gemm_problem_size,
+        {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+        args.problem_size.split_k_slices);
+
+      swizzle_log_tile = threadblock_swizzle.get_log_tile(grid_tiled_shape);
+    }
+  };
+
+  /// Shared memory storage structure
+  union SharedStorage {
+    typename Mma::SharedStorage main_loop;
+    typename Epilogue::SharedStorage epilogue;
+  };
+
+  //
+  // Methods
+  //
+
+  CUTLASS_HOST_DEVICE
+  ImplicitGemmConvolutionWithFusedEpilogue() { } 
+
+  /// Executes one ImplicitGEMM threadblock tile: mainloop, optional serial split-K handshake, fused epilogue
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Compute threadblock location
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord threadblock_tile_idx =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if CTA is out of range (tile index at or beyond the M or N grid extent)
+    if (params.grid_tiled_shape.m() <= threadblock_tile_idx.m() ||
+      params.grid_tiled_shape.n() <= threadblock_tile_idx.n()) {
+
+      return;
+    }
+
+    // Compute position within threadblock
+    int thread_idx = threadIdx.x;
+
+    // Construct iterators to A and B operands (A starts at tile (m, k), B at tile (k, n))
+    typename Mma::IteratorA iterator_A(
+      params.iterator_A,
+      params.problem_size,
+      params.ptr_A,
+      thread_idx,
+      MatrixCoord(
+        threadblock_tile_idx.m() * Mma::Shape::kM,
+        threadblock_tile_idx.k() * Mma::Shape::kK
+      )
+    );
+    
+    typename Mma::IteratorB iterator_B(
+      params.iterator_B,
+      params.problem_size,
+      params.ptr_B,
+      thread_idx,
+      MatrixCoord(
+        threadblock_tile_idx.k() * Mma::Shape::kK,
+        threadblock_tile_idx.n() * Mma::Shape::kN
+      )
+    );
+
+    // Broadcast the warp_id computed by lane 0 to ensure dependent code
+    // is compiled as warp-uniform.
+    int warp_idx = canonical_warp_idx_sync();
+    int lane_idx = threadIdx.x % 32;
+
+    //
+    // Main loop
+    //
+
+    // Construct thread-scoped matrix multiply
+    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
+
+    typename Mma::FragmentC accumulators;
+
+    accumulators.clear();
+
+    // Compute threadblock-scoped matrix multiply-add (accumulators is both source and destination)
+    mma(params.gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators);
+
+    //
+    // Epilogue
+    //
+
+    EpilogueOutputOp output_op(params.output_op);
+
+    // Construct the semaphore: one int slot per (m, n) tile, at index m + n * grid_m
+    int block_idx = threadblock_tile_idx.m() + threadblock_tile_idx.n() * params.grid_tiled_shape.m();
+
+    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
+    
+    // Compute logical position within grid (repeats the get_tile_offset call made at the top)
+    threadblock_tile_idx =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // If performing a reduction via split-K, fetch the initial synchronization
+    if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) {
+        
+      // Fetch the synchronization lock initially but do not block.
+      semaphore.fetch();
+
+      // Indicate which position in a serial reduction the output operator is currently updating
+      output_op.set_k_partition(threadblock_tile_idx.k(), params.grid_tiled_shape.k());
+    }
+
+    MatrixCoord threadblock_offset(
+      threadblock_tile_idx.m() * Mma::Shape::kM,
+      threadblock_tile_idx.n() * Mma::Shape::kN
+    );
+
+    // Tile iterator writing to destination tensor
+    typename Epilogue::OutputTileIterator iterator_D(
+      params.iterator_D,
+      params.ptr_D,
+      ConvOutputIteratorParameter::extent(params.problem_size),
+      thread_idx,
+      threadblock_offset
+    );
+    
+    // Tile iterator reading from source accumulator tensor
+    typename Epilogue::OutputTileIterator iterator_C(
+      params.iterator_C,
+      params.ptr_C,
+      ConvOutputIteratorParameter::extent(params.problem_size),
+      thread_idx,
+      threadblock_offset
+    );
+
+    typename Epilogue::ElementTensor *ptr_Tensor = 
+      static_cast<typename Epilogue::ElementTensor *>(params.ptr_Tensor);
+
+    // Define the reduction output pointer and move to the appropriate place
+    typename Epilogue::ElementVector *ptr_Vector = 
+      static_cast<typename Epilogue::ElementVector *>(params.ptr_Vector);
+
+    // Additional tensor to load from
+    typename Epilogue::TensorTileIterator tensor_iterator(
+        params.params_Tensor,
+        // Only the final block outputs Tensor (earlier serial split-K slices get nullptr)
+        ((params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) &&
+         (params.grid_tiled_shape.k() != threadblock_tile_idx.k() + 1))
+            ? nullptr
+            : ptr_Tensor,
+        ConvOutputIteratorParameter::extent(params.problem_size),
+        thread_idx,
+        threadblock_offset);
+
+    // Construct the epilogue
+    Epilogue epilogue(
+      shared_storage.epilogue, 
+      thread_idx, 
+      warp_idx, 
+      lane_idx);
+
+    // Move to this tile's location: column offset plus tile-row index times ldr (skipped when null)
+    if (ptr_Vector) {
+      ptr_Vector += threadblock_offset.column() + threadblock_tile_idx.m() * params.ldr;
+    }
+
+    // Wait on the semaphore - this latency may have been covered by iterator construction
+    if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) {
+        
+      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
+      if (threadblock_tile_idx.k()) {
+        iterator_C = iterator_D;
+      }
+
+      semaphore.wait(threadblock_tile_idx.k());
+
+    }
+    // Each split-k-slice writes to a unique tensor location
+    else if (params.split_k_mode == SplitKMode::kParallel) {
+      iterator_D.add_pointer_offset(threadblock_tile_idx.k() * 
+        cutlass::conv::implicit_gemm_tensor_c_size(ConvOperator, params.problem_size));
+    }
+
+    // Execute the epilogue operator to update the destination tensor.
+    epilogue(output_op,
+             // Only the final block uses Vector (earlier serial split-K slices pass nullptr)
+             ((params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) &&
+              (params.grid_tiled_shape.k() != threadblock_tile_idx.k() + 1))
+                 ? nullptr
+                 : ptr_Vector,
+             iterator_D,
+             accumulators,
+             iterator_C,
+             tensor_iterator,
+            ConvOutputIteratorParameter::extent(params.problem_size),
+             threadblock_offset);
+  
+    //
+    // Release the semaphore
+    //
+
+    if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { 
+
+      int lock = 0;
+      if (params.grid_tiled_shape.k() == threadblock_tile_idx.k() + 1) {
+
+        // The final threadblock resets the semaphore for subsequent grids.
+        lock = 0;
+      }
+      else {
+        // Otherwise, the semaphore is incremented to the next k index (the value that slice waits on)
+        lock = threadblock_tile_idx.k() + 1;
+      }
+      
+      semaphore.release(lock);
+    }
+  } 
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/sm90_implicit_gemm_tma_warpspecialized.hpp b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/sm90_implicit_gemm_tma_warpspecialized.hpp
new file mode 100755
index 000000000..657ac6b3e
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/sm90_implicit_gemm_tma_warpspecialized.hpp
@@ -0,0 +1,76 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/kernel_hardware_info.hpp"
+
+#include "cute/tensor.hpp"
+#include "cute/arch/cluster_sm90.hpp"
+
+#include "cutlass/conv/detail.hpp"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/dispatch_policy.hpp"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/pipeline/sm90_pipeline.hpp"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+#include "cutlass/gemm/kernel/tile_scheduler.hpp"
+
+///////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::conv::kernel {
+
+///////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization of ConvUniversal, enabled when the mainloop's
+/// DispatchPolicy::Schedule is, or derives from, KernelImplicitTmaWarpSpecializedSm90.
+/// It declares no members of its own: everything is inherited from
+/// cutlass::gemm::kernel::GemmUniversal instantiated with the same four arguments.
+template <
+  typename ProblemShape,
+  typename Mainloop,
+  typename Epilogue,
+  typename Scheduler
+>
+class ConvUniversal<
+  ProblemShape, Mainloop, Epilogue, Scheduler,
+  cute::enable_if_t<
+    cute::is_base_of_v<
+      KernelImplicitTmaWarpSpecializedSm90,
+      typename Mainloop::DispatchPolicy::Schedule>>
+> : public cutlass::gemm::kernel::GemmUniversal<
+      ProblemShape, Mainloop, Epilogue, Scheduler>
+{};
+///////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::conv::kernel
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/thread/depthwise_mma.h b/lightllm-kernel/cutlass/include/cutlass/conv/thread/depthwise_mma.h
new file mode 100755
index 000000000..37ece7927
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/thread/depthwise_mma.h
@@ -0,0 +1,325 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates exposing architecture support for depthwise convolution
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/arch/mma.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/thread/mma.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// MMA operation
+template <
+  /// Size of the matrix product (concept: GemmShape)
+  typename Shape_,
+  /// Number of threads participating
+  int kThreads_,
+  /// Data type of A elements
+  typename ElementA,
+  /// Data type of B elements
+  typename ElementB,
+  /// Element type of C matrix
+  typename ElementC,
+  /// Inner product operator
+  typename Operator
+>
+struct ElementwiseInnerProduct;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// General implementation
+template <
+    /// Size of the matrix product (concept: GemmShape)
+    typename Shape_,
+    /// Data type of A elements
+    typename ElementA_,
+    /// Data type of B elements
+    typename ElementB_,
+    /// Element type of C matrix
+    typename ElementC_>
+struct ElementwiseInnerProduct<Shape_, 1, ElementA_, ElementB_, ElementC_, arch::OpMultiplyAdd> {
+  using Shape = Shape_;
+  using Operator = arch::OpMultiplyAdd;
+  using ElementC = ElementC_;
+
+  CUTLASS_HOST_DEVICE
+  void operator()(Array<ElementC_, Shape::kN> &d,
+                  Array<ElementA_, Shape::kN> const &a,
+                  Array<ElementB_, Shape::kN> const &b,
+                  Array<ElementC_, Shape::kN> const &c) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < Shape::kN; ++i) {
+      d[i] = a[i] * b[i] + c[i];
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Specialization for half_t operands with GemmShape<2, 2, 1> on a single thread
+template <>
+struct ElementwiseInnerProduct<
+  gemm::GemmShape<2, 2, 1>,   // Shape_
+  1,                          // kThreads_
+  half_t, half_t, half_t,     // ElementA, ElementB, ElementC
+  arch::OpMultiplyAdd> {      // Operator
+
+  using Shape = gemm::GemmShape<2, 2, 1>;
+  using ElementC = half_t;
+  using Operator = arch::OpMultiplyAdd;
+
+  /// Computes d = a * b + c on two half_t values: one __hfma2 when compiled for
+  /// __CUDA_ARCH__ >= 600, an explicit two-iteration loop otherwise
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    Array<half_t, 2> &d,
+    Array<half_t, 2> const &a,
+    Array<half_t, 2> const &b,
+    Array<half_t, 2> const &c
+  ) {
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600))
+    // Device path: view each two-element array as a __half2 and issue one __hfma2
+    __half2 const &lhs = reinterpret_cast<__half2 const &>(a);
+    __half2 const &rhs = reinterpret_cast<__half2 const &>(b);
+    __half2 const &addend = reinterpret_cast<__half2 const &>(c);
+
+    __half2 const packed = __hfma2(lhs, rhs, addend);
+
+    // Reinterpret the packed result as the output array type and copy it out
+    d = reinterpret_cast<Array<half_t, 2> const &>(packed);
+#else
+    // Fallback when __CUDA_ARCH__ is undefined or below 600: per-element multiply-add
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < 2; ++idx) {
+      d[idx] = a[idx] * b[idx] + c[idx];
+    }
+#endif
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the matrix product
+template <
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape,
+  /// Data type of A elements
+  typename ElementA,
+  /// Data type of B elements
+  typename ElementB,
+  /// Element type of C matrix
+  typename ElementC,
+  /// Concept: arch::OpMultiplyAdd or arch::Mma<>
+  typename Operator = arch::OpMultiplyAdd,
+  /// Used for partial specialization
+  typename Enable = bool
+>
+struct DepthwiseDirectConvElementwiseInnerProduct;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Template that handles all packed matrix layouts
+template <
+  typename Shape_,     ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename ElementA_,  ///< Data type of A elements
+  typename ElementB_,  ///< Data type of B elements
+  typename ElementC_,  ///< Element type of C matrix
+  typename Operator_   ///< Operator used to compute GEMM
+>
+struct DepthwiseDirectConvElementwiseInnerProductGeneric {
+
+  //
+  // Type definitions
+  //
+
+  using Shape = Shape_;        ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using ElementA = ElementA_;  ///< Data type of operand A
+  using ElementB = ElementB_;  ///< Data type of operand B
+  using ElementC = ElementC_;  ///< Element type of operand C
+  using Operator = Operator_;  ///< Underlying mathematical operator
+
+  /// A operand storage (Shape::kMN elements)
+  using FragmentA = Array<ElementA, Shape::kMN>;
+
+  /// B operand storage (Shape::kN elements)
+  using FragmentB = Array<ElementB, Shape::kN>;
+
+  /// C operand storage (Shape::kMN elements)
+  using FragmentC = Array<ElementC, Shape::kMN>;
+
+  /// Instruction: single-thread element-wise multiply-add over Shape::kN elements
+  using MmaOp = cutlass::conv::thread::ElementwiseInnerProduct<
+    gemm::GemmShape<Shape::kN, Shape::kN, 1>,  // Shape_
+    1,                                         // kThreads_
+    ElementA, ElementB, ElementC,              // operand element types
+    Operator>;
+
+  //
+  // Methods
+  //
+
+  /// Computes a matrix product D = A * B + C
+  ///
+  /// D, A and B are reinterpreted as arrays of Shape::kN-element chunks. For every
+  /// (m, n), chunk (m * Shape::kN / MmaOp::Shape::kN + n) of A is multiplied
+  /// element-wise by chunk n of B and added into the same-numbered chunk of D,
+  /// which starts out as a copy of C.
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC & D,
+    FragmentA const & A,
+    FragmentB const & B,
+    FragmentC const & C) {
+
+    // Shape::kN-element views used to address the fragments chunk-wise
+    using ChunkA = Array<ElementA, Shape::kN>;
+    using ChunkB = Array<ElementB, Shape::kN>;
+    using ChunkC = Array<ElementC, Shape::kN>;
+
+    ChunkC *chunks_D = reinterpret_cast<ChunkC *>(&D);
+    ChunkA const *chunks_A = reinterpret_cast<ChunkA const *>(&A);
+    ChunkB const *chunks_B = reinterpret_cast<ChunkB const *>(&B);
+
+    MmaOp mma_op;
+
+    // Seed the output with the incoming accumulators
+    D = C;
+
+    // Accumulate the element-wise products chunk by chunk
+    CUTLASS_PRAGMA_UNROLL
+    for (int n = 0; n < Shape::kN / MmaOp::Shape::kN; ++n) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int m = 0; m < Shape::kM; ++m) {
+
+        // Chunk index shared by the A operand and the D accumulator
+        int const chunk = m * Shape::kN / MmaOp::Shape::kN + n;
+
+        Array<ElementC, MmaOp::Shape::kN> acc = chunks_D[chunk];
+        Array<ElementA, MmaOp::Shape::kN> lhs = chunks_A[chunk];
+        Array<ElementB, MmaOp::Shape::kN> rhs = chunks_B[n];
+
+        // acc is both the addend and the destination
+        mma_op(acc, lhs, rhs, acc);
+
+        chunks_D[chunk] = acc;
+
+      }
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the matrix product
+///
+/// Partial specialization for Operator == arch::OpMultiplyAdd (the trailing Enable
+/// parameter keeps its default). It exposes the fragment types and forwards
+/// operator() to DepthwiseDirectConvElementwiseInnerProductGeneric.
+template <
+  typename Shape_,     ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename ElementA_,  ///< Data type of A elements
+  typename ElementB_,  ///< Data type of B elements
+  typename ElementC_   ///< Element type of C matrix
+>
+struct DepthwiseDirectConvElementwiseInnerProduct<
+  Shape_, ElementA_, ElementB_, ElementC_, arch::OpMultiplyAdd> {
+
+  //
+  // Type definitions
+  //
+
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+
+  /// Data type of operand A
+  using ElementA = ElementA_;
+
+  /// Data type of operand B
+  using ElementB = ElementB_;
+
+  /// Element type of operand C
+  using ElementC = ElementC_;
+
+  /// Underlying mathematical operator
+  using Operator = arch::OpMultiplyAdd;
+
+  /// A operand storage: output_tile_size per thread * groups_per_thread
+  using FragmentA = Array<ElementA, Shape::kMN>;
+
+  /// B operand storage: 1 * groups_per_thread
+  using FragmentB = Array<ElementB, Shape::kN>;
+
+  /// C operand storage: output_tile_size per thread * groups_per_thread
+  using FragmentC = Array<ElementC, Shape::kMN>;
+
+  /// Always false in this specialization
+  static bool const use_optimized = false;
+
+  /// Implementation that operator() forwards to
+  using ArchMmaOperator = DepthwiseDirectConvElementwiseInnerProductGeneric<
+    Shape,
+    ElementA,
+    ElementB,
+    ElementC,
+    Operator>;
+
+  //
+  // Methods
+  //
+
+  /// Computes a matrix product D = A * B + C by delegating to ArchMmaOperator
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC & D,
+    FragmentA const & A,
+    FragmentB const & B,
+    FragmentC const & C) {
+
+    ArchMmaOperator impl;
+
+    impl(D, A, B, C);
+  }
+};
+
+} // namespace thread
+} // namespace conv
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h
new file mode 100755
index 000000000..978c14feb
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h
@@ -0,0 +1,485 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) 
+    matrix from memory.
+
+    This iterator assumes TensorNHWC layout of tensors in Global Memory.
+
+    The iterators in this file serve the backward data gradient (Dgrad) operator only; they are
+    specialized on conv::StrideSupport (kStrided and kUnity).
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+#include "cutlass/conv/threadblock/conv2d_params.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Shape_,       ///< tile shape; Shape::kRow scales the K step taken by advance() in the specializations
+  typename Element_,     ///< element type of the filter tensor
+  typename ThreadMap_,   ///< thread map supplying Iterations, Delta, kElementsPerAccess and initial_offset()
+  conv::StrideSupport StrideSupport_ = conv::StrideSupport::kUnity,   ///< selects the specialization
+  typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess>   ///< type of one access
+>
+class Conv2dDgradFilterTileAccessIteratorAnalytic;   // declared only; defined by the StrideSupport specializations
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Conv2dDgradFilterTileAccessIteratorAnalytic strided dgrad needs special handling to skip MMAs
+// on non-contributing w positions
+template <
+  typename Shape_,
+  typename Element_,
+  typename ThreadMap_,
+  typename AccessType_
+>
+class Conv2dDgradFilterTileAccessIteratorAnalytic <
+  Shape_,
+  Element_,
+  ThreadMap_,
+  conv::StrideSupport::kStrided,
+  AccessType_
+> {
+public:
+  // Analytic filter (w) iterator for strided dgrad: the tensor coordinate is recomputed on every access.
+  //
+  // Types
+  //
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::TensorNHWC;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
+  static int const kConvDim = 2;
+  using ConvProblemSize = typename conv::Conv2dProblemSize;
+
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
+    "Vectors implied by the thread map must be divisible by the access type.");
+
+  static_assert(sizeof_bits<Element>::value >= 8,
+    "DGRAD requires elements of size 8b or larger.");
+
+  //
+  // Parameters structure
+  //
+
+  using Params = Conv2dAnalyticParams<Layout>;
+
+private:
+
+  Params const &params_;                   // reference member: the referenced Params must outlive the iterator
+  Conv2dProblemSize const &problem_size_;  // reference member: the referenced problem size must outlive it too
+  LongIndex iteration_contiguous_;         // current access index along the contiguous (C) dimension
+  LongIndex iteration_strided_;            // current access index along the strided (K) dimension
+  LongIndex iteration_vector_;             // current AccessType-sized piece within one thread-map access
+  char const *pointer_;                    // byte pointer to the tensor base, moved only by add_pointer_offset()
+
+  // Current filter position (r, s), where it restarts from, and the K / C coordinate of each iteration
+  int filter_r_;
+  int filter_s_;
+  int start_r_;
+  int start_s_;
+  int offset_k_[ThreadMap::Iterations::kStrided];
+  int offset_c_[ThreadMap::Iterations::kContiguous];
+
+public:
+  /// Builds the iterator for thread thread_idx; (start_r, start_s) is the first filter position visited.
+  CUTLASS_HOST_DEVICE
+  Conv2dDgradFilterTileAccessIteratorAnalytic(
+    Params const &params,
+    Conv2dProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    int start_r, int start_s,
+    MatrixCoord const &threadblock_offset = MatrixCoord()
+  ):
+    params_(params),
+    problem_size_(problem_size),
+    pointer_(reinterpret_cast<char const *>(ptr)),
+    filter_r_(start_r),
+    filter_s_(start_s),
+    start_r_(start_r),
+    start_s_(start_s) {
+
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+      offset_c_[c] = threadblock_offset.column() + thread_coord.contiguous()
+        + c * ThreadMap::Delta::kContiguous;
+    }
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      offset_k_[s] =
+        threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
+    }
+    set_iteration_index(0);  // start at access 0 so at()/valid()/get() never index with indeterminate state
+  }
+
+  /// Overrides the internal iteration index (decomposed as vector fastest, then contiguous, then strided)
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iteration_vector_ = index % kAccessesPerVector;
+    int residual_access = index / kAccessesPerVector;
+    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+
+  /// Steps to the next filter position: s by stride_w, then r by stride_h (each wrapping to its start
+  /// offset); once r wraps as well, every K offset moves on by Shape::kRow * split_k_slices.
+  CUTLASS_HOST_DEVICE
+  void advance() {
+    // Step s; nothing else changes while it is still below problem_size_.S.
+    filter_s_ += problem_size_.stride_w;
+    if (filter_s_ < problem_size_.S) {
+      return;
+    }
+    filter_s_ = start_s_;  // s wrapped: restart it and carry into r
+
+    // Step r; nothing else changes while it is still below problem_size_.R.
+    filter_r_ += problem_size_.stride_h;
+    if (filter_r_ < problem_size_.R) {
+      return;
+    }
+    filter_r_ = start_r_;  // r wrapped: restart it and carry into K
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      offset_k_[s] += Shape::kRow * problem_size_.split_k_slices;
+    }
+  }
+
+  /// Returns the coordinate in the filter tensor w that is currently pointed to
+  /// by the iterator, built as TensorCoord(k, r, s, c).
+  CUTLASS_HOST_DEVICE
+  TensorCoord at() const {
+
+    int k = offset_k_[iteration_strided_];
+    int c = offset_c_[iteration_contiguous_] + iteration_vector_ * AccessType::kElements;
+
+    return TensorCoord(k, filter_r_, filter_s_, c);
+  }
+
+  /// Returns true if the current coordinate is within the filter tensor w (k < K and c < C)
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+
+    TensorCoord coord = at();
+    // k was passed as the first coordinate component in at(); it is read back here through n()
+    return coord.n() < problem_size_.K && coord.c() < problem_size_.C;
+  }
+
+  /// Returns a pointer to the vector starting at the current coordinate
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+
+    TensorCoord coord = at();
+    LongIndex offset = params_.layout(coord);
+    // the layout offset is scaled by the element size in bits / 8 to get a byte offset for pointer_
+    return reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
+
+  }
+
+  /// Increments to the next memory access (vector, then contiguous, then strided; wraps to 0 at the end)
+  CUTLASS_HOST_DEVICE
+  Conv2dDgradFilterTileAccessIteratorAnalytic &operator++() {
+    ++iteration_vector_;
+    if (iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+    iteration_vector_ = 0;
+
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;
+
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    iteration_strided_ = 0;
+
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv2dProblemSize const &problem_size) {
+
+    // check alignment constraint on iterator's contiguous dimension
+    if (problem_size.C % AccessType::kElements) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    return Status::kSuccess;
+  }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Conv2dDgradFilterTileAccessIteratorAnalytic unity-stride specialization: more performant for dgrad
+// on problem sizes with stride = {1x1}
+template <
+  typename Shape_,
+  typename Element_,
+  typename ThreadMap_,
+  typename AccessType_
+>
+class Conv2dDgradFilterTileAccessIteratorAnalytic <
+  Shape_,
+  Element_,
+  ThreadMap_,
+  conv::StrideSupport::kUnity,
+  AccessType_
+>{
+public:
+  // Analytic filter (w) iterator for unity-stride dgrad: visits every (r, s) starting from (0, 0).
+  //
+  // Types
+  //
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::TensorNHWC;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kUnity;
+  static int const kConvDim = 2;
+  using ConvProblemSize = typename conv::Conv2dProblemSize;
+
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
+    "Vectors implied by the thread map must be divisible by the access type.");
+
+  static_assert(sizeof_bits<Element>::value >= 8,
+    "DGRAD requires elements of size 8b or larger.");
+
+  //
+  // Parameters structure
+  //
+
+  using Params = Conv2dAnalyticParams<Layout>;
+
+private:
+
+  Params const &params_;                   // reference member: the referenced Params must outlive the iterator
+  Conv2dProblemSize const &problem_size_;  // reference member: the referenced problem size must outlive it too
+  LongIndex iteration_contiguous_;         // current access index along the contiguous (C) dimension
+  LongIndex iteration_strided_;            // current access index along the strided (K) dimension
+  LongIndex iteration_vector_;             // current AccessType-sized piece within one thread-map access
+  char const *pointer_;                    // byte pointer to the tensor base, moved only by add_pointer_offset()
+
+  // Current filter position (r, s) and the K / C coordinate of each strided / contiguous iteration
+  int filter_r_;
+  int filter_s_;
+  int offset_k_[ThreadMap::Iterations::kStrided];
+  int offset_c_[ThreadMap::Iterations::kContiguous];
+
+public:
+  /// Builds the iterator for thread thread_idx, starting at filter position (0, 0).
+  CUTLASS_HOST_DEVICE
+  Conv2dDgradFilterTileAccessIteratorAnalytic(
+    Params const &params,
+    Conv2dProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    MatrixCoord const &threadblock_offset = MatrixCoord()
+  ):
+    params_(params),
+    problem_size_(problem_size),
+    pointer_(reinterpret_cast<char const *>(ptr)),
+    filter_r_(0),
+    filter_s_(0) {
+
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+      offset_c_[c] = threadblock_offset.column() + thread_coord.contiguous()
+        + c * ThreadMap::Delta::kContiguous;
+    }
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      offset_k_[s] =
+        threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
+    }
+    set_iteration_index(0);  // start at access 0 so at()/valid()/get() never index with indeterminate state
+  }
+
+  /// Overrides the internal iteration index (decomposed as vector fastest, then contiguous, then strided)
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iteration_vector_ = index % kAccessesPerVector;
+    int residual_access = index / kAccessesPerVector;
+    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+
+  /// Steps to the next filter position (s fastest, then r); after the last (r, s) the K offsets move on.
+  CUTLASS_HOST_DEVICE
+  void advance() {
+    ++filter_s_;
+    if (filter_s_ < problem_size_.S) {
+      return;
+    }
+    filter_s_ = 0;
+    ++filter_r_;
+    if (filter_r_ < problem_size_.R) {
+      return;
+    }
+    filter_r_ = 0;
+    // Both s and r wrapped: step each K offset by Shape::kRow * split_k_slices.
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      offset_k_[s] += Shape::kRow * problem_size_.split_k_slices;
+    }
+  }
+
+  /// Returns the coordinate in the filter tensor w that is currently pointed to
+  /// by the iterator, built as TensorCoord(k, r, s, c).
+  CUTLASS_HOST_DEVICE
+  TensorCoord at() const {
+
+    int k = offset_k_[iteration_strided_];
+    int c = offset_c_[iteration_contiguous_] + iteration_vector_ * AccessType::kElements;
+
+    return TensorCoord(k, filter_r_, filter_s_, c);
+  }
+
+  /// Returns true if the current coordinate is within the filter tensor w (k < K and c < C)
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+
+    TensorCoord coord = at();
+    // k was passed as the first coordinate component in at(); it is read back here through n()
+    return coord.n() < problem_size_.K && coord.c() < problem_size_.C;
+  }
+
+  /// Returns a pointer to the vector starting at the current coordinate
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+
+    TensorCoord coord = at();
+    LongIndex offset = params_.layout(coord);
+    // the layout offset is scaled by the element size in bits / 8 to get a byte offset for pointer_
+    return reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
+  }
+
+  /// Increments to the next memory access (vector, then contiguous, then strided; wraps to 0 at the end)
+  CUTLASS_HOST_DEVICE
+  Conv2dDgradFilterTileAccessIteratorAnalytic &operator++() {
+    ++iteration_vector_;
+    if (iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+    iteration_vector_ = 0;
+
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    iteration_strided_ = 0;
+
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv2dProblemSize const &problem_size) {
+
+    // check alignment constraint on iterator's contiguous dimension
+    if (problem_size.C % AccessType::kElements) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    return Status::kSuccess;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h
new file mode 100755
index 000000000..6fb1cb18e
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h
@@ -0,0 +1,619 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) 
+    matrix from memory.
+
+    This iterator assumes TensorNHWC layout of tensors in Global Memory.
+
+    The iterators in this file serve the backward data gradient (Dgrad) operator only; they are
+    specialized on conv::StrideSupport (kStrided and kUnity).
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+
+#include "cutlass/conv/threadblock/conv2d_params.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Shape_,       ///< tile shape; the specializations pass {Shape::kRow, Shape::kColumn} to their Params
+  typename Element_,     ///< element type of the filter tensor
+  typename ThreadMap_,   ///< thread map supplying kThreads, kElementsPerAccess, Iterations, Delta and initial_offset()
+  conv::StrideSupport StrideSupport_ = conv::StrideSupport::kUnity,   ///< selects the specialization
+  typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess>   ///< type of one access
+>
+class Conv2dDgradFilterTileAccessIteratorOptimized;   // declared only; defined by the StrideSupport specializations
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Conv2dDgradFilterTileAccessIteratorOptimized strided dgrad specialization: steps through the
+// filter positions by the convolution stride, starting at (start_r, start_s)
+template <
+  typename Shape_,
+  typename Element_,
+  typename ThreadMap_,
+  typename AccessType_
+>
+class Conv2dDgradFilterTileAccessIteratorOptimized <
+  Shape_,
+  Element_,
+  ThreadMap_,
+  conv::StrideSupport::kStrided,
+  AccessType_
+  > {
+public:
+  // Strided-dgrad filter iterator: validity bits are precomputed and the pointer moves by byte deltas.
+  //
+  // Types
+  //
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::TensorNHWC;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
+  static int const kConvDim = 2;
+  using ConvProblemSize = typename conv::Conv2dProblemSize;
+
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
+    "Vectors implied by the thread map must be divisible by the access type.");
+
+  //
+  // Parameters structure
+  //
+
+  struct Params : Conv2dStridedDgradFilterIteratorOptimizedParams {
+    // Adds no state of its own; the constructors only construct the base parameter struct.
+    //
+    // Methods
+    //
+    CUTLASS_HOST_DEVICE
+    Params() { }
+
+    CUTLASS_HOST_DEVICE
+    Params(Conv2dStridedDgradFilterIteratorOptimizedParams const &base):
+      Conv2dStridedDgradFilterIteratorOptimizedParams(base) { }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      Conv2dProblemSize const &problem_size,
+      Layout const &layout
+    ):
+      Conv2dStridedDgradFilterIteratorOptimizedParams(
+        problem_size,
+        layout,
+        sizeof_bits<Element>::value,
+        {Shape::kRow, Shape::kColumn},
+        ThreadMap::kThreads,
+        ThreadMap::kElementsPerAccess,
+        {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided},
+        {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided}
+      ) { }
+
+  };
+
+private:
+
+  Conv2dStridedDgradFilterIteratorOptimizedParams const &params_;  // reference member: must outlive the iterator
+  Conv2dProblemSize const &problem_size_;                          // reference member: must outlive the iterator
+  LongIndex iteration_contiguous_;  // current access index along the contiguous dimension
+  LongIndex iteration_strided_;     // current access index along the strided dimension
+  LongIndex iteration_vector_;      // current AccessType-sized piece within one thread-map access
+  char const *pointer_;             // byte pointer, pre-offset in the constructor, moved by advance()/operator++
+
+  uint32_t predicates_[kAccessesPerVector];  // bit (c + s * Iterations::kContiguous) of word v: access is valid
+  int filter_k_;                             // k of strided row 0; row s adds s * Delta::kStrided
+  int filter_r_;
+  int filter_s_;
+
+  int start_r_;
+  int start_s_;
+
+  int64_t reset_bytes_s_;  // extra rewind applied by advance() when s wraps
+  int64_t reset_bytes_r_;  // extra rewind applied by advance() when r wraps as well
+
+  //
+  // Assertions
+  //
+
+  // One validity bit per (strided, contiguous) access is packed into each uint32_t word predicates_[v],
+  // so the bound is the bit width of a single word rather than of the whole predicates_ array.
+  static_assert(ThreadMap::Iterations::kStrided * ThreadMap::Iterations::kContiguous < sizeof(predicates_[0]) * 8,
+    "Currently, the number of loads per iteration is limited by the size of the predicates container.");
+
+public:
+  /// Builds the iterator for thread thread_idx, starting at filter position (start_r, start_s).
+  CUTLASS_HOST_DEVICE
+  Conv2dDgradFilterTileAccessIteratorOptimized(
+    Conv2dStridedDgradFilterIteratorOptimizedParams const &params,
+    Conv2dProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    int start_r, int start_s,
+    MatrixCoord const &threadblock_offset = MatrixCoord()
+  ):
+    params_(params),
+    problem_size_(problem_size),
+    pointer_(reinterpret_cast<char const *>(ptr)),
+    predicates_{0},
+    filter_r_(start_r),
+    filter_s_(start_s),
+    start_r_(start_r),
+    start_s_(start_s) {
+
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+
+    filter_k_ = threadblock_offset.row() + thread_coord.strided();
+    Index column = threadblock_offset.column() + thread_coord.contiguous();
+    // Rewind amounts that advance() subtracts when s wraps (reset_bytes_s_) or r wraps too (reset_bytes_r_).
+    reset_bytes_s_ = (problem_size_.num_gemm_k_filter_s(start_s_) - 1) * params_.inc_next[0];
+    reset_bytes_r_ = reset_bytes_s_ +
+                      (problem_size_.num_gemm_k_filter_r(start_r_) - 1) * params_.inc_next[1];
+    // Precompute one validity bit per access: set when k < K and c < C.
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+
+        int filter_k = filter_k_ + s * ThreadMap::Delta::kStrided;
+        int filter_c = column + c * ThreadMap::Delta::kContiguous;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kAccessesPerVector; ++v) {
+
+          uint32_t pred = ((filter_k < problem_size_.K && (filter_c + v * AccessType::kElements) < problem_size_.C) ? 1u : 0);
+
+          int pred_idx = c + s * ThreadMap::Iterations::kContiguous;
+
+          predicates_[v] |= (pred << pred_idx);
+        }
+      }
+    }
+    // Pre-offset the pointer to this thread's first coordinate (k, start_r, start_s, column).
+    TensorCoord coord{filter_k_, filter_r_, filter_s_, column};
+
+    pointer_ += params_.layout(coord) * sizeof_bits<Element>::value / 8;
+
+    set_iteration_index(0);
+  }
+
+  /// Overrides the internal iteration index (decomposed as vector fastest, then contiguous, then strided)
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iteration_vector_ = index % kAccessesPerVector;
+    int residual_access = index / kAccessesPerVector;
+    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+
+    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+
+  /// Steps to the next filter position (s by stride_w, then r by stride_h, then on in K) and updates
+  /// the pointer and predicates to match. Device-only: the r-wrap selection is written in inline PTX.
+  CUTLASS_DEVICE
+  void advance() {
+    int next_idx = 0;
+    LongIndex reset_bytes = params_.reset_bytes;
+
+    // Move filter_s by stride_w
+    filter_s_ +=  problem_size_.stride_w;
+    if (filter_s_ >= problem_size_.S) {
+      // Restore filter_s
+      filter_s_ = start_s_;
+
+      // Move filter_r by stride_h
+      filter_r_ += problem_size_.stride_h;
+#if 0  // readable reference for the hand-written PTX in the #else branch
+      bool check = (filter_r_ < problem_size_.R);
+
+      filter_r_ = check ? filter_r_ : start_r_;
+      next_idx = check ? 1 : 2;
+      reset_bytes += (check ? reset_bytes_s_ : reset_bytes_r_);
+#else  // same three selections expressed with setp/selp
+    asm volatile(
+        "{\n\t"
+        " .reg .pred %%p;\n\t"
+        " .reg .s64 t1;\n\t"
+        " setp.lt.s32 %%p, %3, %4;\n\t"
+        " selp.s32 %0, %3, %5, %%p;\n\t"
+        " selp.s32 %1, 1, 2, %%p;\n\t"
+        " selp.s64 t1, %6, %7, %%p;\n\t"
+        " add.s64 %2, %8, t1;\n\t"
+        "}\n"
+        : "=r"(filter_r_), "=r"(next_idx), "=l"(reset_bytes)
+        : "r"(filter_r_), "r"(problem_size_.R), "r"(start_r_),
+          "l"(reset_bytes_s_), "l"(reset_bytes_r_), "l"(reset_bytes));
+#endif
+    }
+
+    // next_idx: 0 = s stepped, 1 = s wrapped and r stepped, 2 = r wrapped too (move on in K)
+    pointer_ += (params_.inc_next[next_idx] - reset_bytes);
+
+    if (next_idx == 2) {
+      filter_k_ += params_.filter_k_delta;
+    }
+
+    // Drop the validity bits of every strided row whose k is now at or beyond K
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      if (filter_k_ + s * ThreadMap::Delta::kStrided >= problem_size_.K) {
+        uint32_t kClearMask = ((1u << ThreadMap::Iterations::kContiguous) - 1) << (s * ThreadMap::Iterations::kContiguous);
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kAccessesPerVector; ++v) {
+          predicates_[v] = (predicates_[v] & (~kClearMask));
+        }
+      }
+    }
+  }
+
+  /// Returns true if the current coordinate is within the filter tensor W (reads the precomputed bit)
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+    LongIndex pred_idx = iteration_contiguous_ + iteration_strided_ * ThreadMap::Iterations::kContiguous;
+    return (predicates_[iteration_vector_] & (1u << pred_idx));
+  }
+
+  /// Returns a pointer to the vector starting at the current coordinate
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+    return reinterpret_cast<AccessType const *>(pointer_ +
+      iteration_contiguous_ * ThreadMap::Delta::kContiguous * sizeof_bits<Element>::value / 8) + iteration_vector_;
+  }
+
+  /// Increments to the next memory access (vector, then contiguous, then strided; wraps to 0 at the end)
+  CUTLASS_HOST_DEVICE
+  Conv2dDgradFilterTileAccessIteratorOptimized &operator++() {
+    ++iteration_vector_;
+    if (iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+    iteration_vector_ = 0;
+
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;
+
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+
+      // Move to the next K coordinate within the tile
+      pointer_ += params_.inc_next_strided;
+
+      return *this;
+    }
+    iteration_strided_ = 0;
+
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv2dProblemSize const &problem_size) {
+
+    // check alignment constraint on iterator's contiguous dimension
+    if (problem_size.C % AccessType::kElements) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    return Status::kSuccess;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Conv2dDgradFilterTileAccessIteratorOptimized unity-stride specialization: more performant for dgrad
+// on problem sizes with stride = {1x1}
+template <
+  typename Shape_,
+  typename Element_,
+  typename ThreadMap_,
+  typename AccessType_
+>
+class Conv2dDgradFilterTileAccessIteratorOptimized <
+  Shape_,
+  Element_,
+  ThreadMap_,
+  conv::StrideSupport::kUnity,
+  AccessType_
+  > {
+public:
+  
+  //
+  // Types
+  //
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::TensorNHWC;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kUnity;
+  static int const kConvDim = 2;
+  using ConvProblemSize = typename conv::Conv2dProblemSize;
+ 
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+  
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), 
+    "Vectors implied by the thread map must be divisible by the access type.");
+ 
+  //
+  // Parameters structure
+  //
+
+  struct Params : Conv2dDgradFilterIteratorOptimizedParams {
+
+    //
+    // Methods
+    //
+    CUTLASS_HOST_DEVICE
+    Params() { }
+
+    CUTLASS_HOST_DEVICE
+    Params(Conv2dDgradFilterIteratorOptimizedParams const &base): 
+      Conv2dDgradFilterIteratorOptimizedParams(base) { }
+      
+    CUTLASS_HOST_DEVICE
+    Params(
+      Conv2dProblemSize const &problem_size, 
+      Layout const &layout
+    ):
+      Conv2dDgradFilterIteratorOptimizedParams(
+        problem_size,
+        layout,
+        sizeof_bits<Element>::value,
+        {Shape::kRow, Shape::kColumn},
+        ThreadMap::kThreads,
+        ThreadMap::kElementsPerAccess,
+        {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided},
+        {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided}
+      ) { }
+
+  };
+
+private:
+
+  Conv2dDgradFilterIteratorOptimizedParams const &params_;
+  Conv2dProblemSize const &problem_size_;
+  LongIndex iteration_contiguous_;
+  LongIndex iteration_strided_;
+  LongIndex iteration_vector_;
+  char const *pointer_;
+
+  uint32_t predicates_[kAccessesPerVector];
+  int filter_rs_;
+  int filter_k_;
+
+  //
+  // Assertions
+  //
+
+  // We map predicates into bits packed in this uint32_t container
+  static_assert(ThreadMap::Iterations::kStrided *
+    ThreadMap::Iterations::kContiguous < sizeof(predicates_) * 8,
+    "Currently, the number of loads per iteration is limited by the size of the predicates container.");
+
+public:
+
+  CUTLASS_HOST_DEVICE
+  Conv2dDgradFilterTileAccessIteratorOptimized(
+    Conv2dDgradFilterIteratorOptimizedParams const &params,
+    Conv2dProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    MatrixCoord const &threadblock_offset = MatrixCoord()
+  ):
+    params_(params), 
+    problem_size_(problem_size),
+    pointer_(reinterpret_cast<char const *>(ptr)),
+    predicates_{0},
+    filter_rs_(0),
+    filter_k_(0) {
+    // Builds the predicate masks and the starting pointer for this thread.
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+    // Threadblock offset plus thread offset: row feeds filter_k_, column feeds the C coordinate.
+    filter_k_ = threadblock_offset.row() + thread_coord.strided();
+    Index column = threadblock_offset.column() + thread_coord.contiguous();
+    // Visit every (strided, contiguous, vector) access once and record whether it is in bounds.
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+
+        int filter_k = filter_k_ + s * ThreadMap::Delta::kStrided;
+        int filter_c = column + c * ThreadMap::Delta::kContiguous;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kAccessesPerVector; ++v) {
+          // In bounds only when both the K and the C coordinate are below the problem extents.
+          uint32_t pred = ((filter_k < problem_size_.K && (filter_c + v * AccessType::kElements) < problem_size_.C) ? 1u : 0);
+          // Bit position within predicates_[v]: contiguous index fastest, strided index slowest.
+          int pred_idx = c + s * ThreadMap::Iterations::kContiguous;
+          
+          predicates_[v] |= (pred << pred_idx);
+        }
+      }
+    }
+    // Element offset of the first access, converted to bytes via the element bit width.
+    pointer_ += (
+      filter_k_ * params.layout.stride()[2] + column
+    ) * sizeof_bits<Element>::value / 8;
+    // Leaves the three iteration indices in a defined state (all zero).
+    set_iteration_index(0);
+  }
+
+  /// Splits a linear access index into the (vector, contiguous, strided) iteration indices
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    int const access = index / kAccessesPerVector;   // index with the vector part removed
+    iteration_vector_ = index % kAccessesPerVector;
+    iteration_strided_ = access / ThreadMap::Iterations::kContiguous;
+    iteration_contiguous_ = access % ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Moves the pointer by `pointer_offset`, which is given in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    LongIndex const offset_in_bytes = pointer_offset * sizeof_bits<Element>::value / 8;
+    pointer_ += offset_in_bytes;
+  }
+
+  CUTLASS_HOST_DEVICE
+  void advance() {
+    // Moves to the next tile; the default pointer increment is params_.inc_next_rs.
+    LongIndex byte_step = params_.inc_next_rs;
+
+    // When filter_rs_ reaches params_.RS it wraps to zero and the K coordinate moves instead.
+    if (++filter_rs_ == params_.RS) {
+      filter_rs_ = 0;
+      byte_step = params_.inc_next_k;
+      filter_k_ += params_.filter_k_delta;
+    }
+
+    // Clear the predicate bits of every strided row whose K coordinate is now out of range.
+    CUTLASS_PRAGMA_UNROLL
+    for (int row = 0; row < ThreadMap::Iterations::kStrided; ++row) {
+      bool const row_out_of_range = (filter_k_ + row * ThreadMap::Delta::kStrided >= problem_size_.K);
+      if (row_out_of_range) {
+        // Mask of the kContiguous bits that belong to this strided row.
+        uint32_t const row_bits = ((1u << ThreadMap::Iterations::kContiguous) - 1) << (row * ThreadMap::Iterations::kContiguous);
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int vec = 0; vec < kAccessesPerVector; ++vec) {
+          predicates_[vec] &= ~row_bits;
+        }
+      }
+    }
+
+    pointer_ += byte_step;
+  }
+
+  /// Returns true if the current coordinate is within the filter tensor W (reads the predicate bit only)
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+    // Bit position inside the mask word: contiguous index fastest, strided index slowest.
+    LongIndex pred_idx = iteration_contiguous_ + iteration_strided_ * ThreadMap::Iterations::kContiguous;
+    return (predicates_[iteration_vector_] & (1u << pred_idx)) != 0;
+  }
+
+  /// Returns a pointer to the access vector at the current iteration indices
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+    LongIndex const byte_offset = iteration_contiguous_ * ThreadMap::Delta::kContiguous * sizeof_bits<Element>::value / 8;
+    return reinterpret_cast<AccessType const *>(pointer_ + byte_offset) + iteration_vector_;
+  }
+
+  /// Increments to the next memory access
+  CUTLASS_HOST_DEVICE
+  Conv2dDgradFilterTileAccessIteratorOptimized &operator++() {
+    // Fastest-moving index: the access within the current vector
+    if (++iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+    iteration_vector_ = 0;
+
+    // Next: the contiguous iteration
+    if (++iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;
+
+    // Slowest: the strided iteration
+    if (++iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      // Still inside the tile, so move the pointer to the next K coordinate
+      pointer_ += params_.inc_next_strided;
+      return *this;
+    }
+
+    // Every index has wrapped; restart the strided index without touching the pointer
+    iteration_strided_ = 0;
+
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv2dProblemSize const &problem_size) {
+
+    // The iterator's contiguous dimension (C) must hold a whole number of access
+    // vectors; any remainder violates the alignment constraint.
+    bool const c_is_aligned = (problem_size.C % AccessType::kElements) == 0;
+
+    return c_is_aligned ? Status::kSuccess
+                        : Status::kErrorInvalidProblem;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h
new file mode 100755
index 000000000..1de41f3f7
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h
@@ -0,0 +1,606 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile) 
+    matrix from memory.
+
+    This iterator assumes TensorNHWC layout of tensors in Global Memory.
+
+    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
+    backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/functional.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+#include "cutlass/conv/threadblock/conv2d_params.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <  // primary template: declared only, never defined here
+  typename Shape_,
+  typename Element_,
+  typename ThreadMap_,
+  conv::StrideSupport StrideSupport_ = conv::StrideSupport::kStrided,  // selects the specialization; strided by default
+  typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess>  // defaults to one full thread-map vector
+>
+class Conv2dDgradOutputGradientTileAccessIteratorAnalytic;
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Conv2dDgradOutputGradientTileAccessIteratorAnalytic, kStrided specialization: strided dgrad needs
+// special handling using unscaled coordinates
+template <
+  typename Shape_,       // tile shape; kRow / kColumn are read by getParams() and advance()
+  typename Element_,     // element type of the tensor addressed through `ptr`
+  typename ThreadMap_,   // supplies initial_offset(), Iterations, Delta and kElementsPerAccess
+  typename AccessType_   // vector type that get() returns a pointer to
+>
+class Conv2dDgradOutputGradientTileAccessIteratorAnalytic <
+  Shape_,
+  Element_,
+  ThreadMap_,
+  conv::StrideSupport::kStrided,
+  AccessType_
+> {
+public:
+
+  //
+  // Types
+  //
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::TensorNHWC;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
+  static int const kConvDim = 2;
+  using ConvProblemSize = typename conv::Conv2dProblemSize;
+
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), 
+    "Vectors implied by the thread map must be divisible by the access type.");
+
+  static_assert(sizeof_bits<Element>::value >= 8,
+    "DGRAD requires elements of size 8b or greater.");
+
+  //
+  // Simplifying assertions
+  //
+
+  static_assert(ThreadMap::Iterations::kContiguous == 1,
+    "Require Iterations::kContiguous == 1");
+
+  //
+  // Parameters structure
+  //
+
+  using Params = Conv2dDgradOutputGradientTileAccessIteratorAnalyticParams;
+
+private:
+
+  Params const &params_;                     // stored by reference, not copied
+  Conv2dProblemSize const &problem_size_;    // stored by reference, not copied
+  LongIndex iteration_contiguous_;
+  LongIndex iteration_strided_;
+  LongIndex iteration_vector_;
+  char const *pointer_;                      // base pointer; only add_pointer_offset() moves it
+
+  int filter_k_;
+  int filter_r_;
+  int filter_s_;
+  int start_r_;                              // value filter_r_ is restored to in advance()
+  int start_s_;                              // value filter_s_ is restored to in advance()
+
+  int offset_n_[ThreadMap::Iterations::kStrided];
+  int offset_p_[ThreadMap::Iterations::kStrided];
+  int offset_q_[ThreadMap::Iterations::kStrided];
+
+public:
+
+  CUTLASS_HOST_DEVICE
+  Conv2dDgradOutputGradientTileAccessIteratorAnalytic(
+    Params const &params, 
+    Conv2dProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    FastDivmod const &stride_h_divmod, FastDivmod const &stride_w_divmod,
+    int start_r, int start_s,
+    MatrixCoord const &threadblock_offset = MatrixCoord()     // threadblock offset - units are whole CTA tiles
+  ):
+    params_(params), problem_size_(problem_size),
+    iteration_contiguous_(0), iteration_strided_(0), iteration_vector_(0),  // at()/valid()/get() index with these
+    pointer_(reinterpret_cast<char const *>(ptr)), 
+    filter_k_(0),
+    filter_r_(start_r),
+    filter_s_(start_s),
+    start_r_(start_r),
+    start_s_(start_s) {
+    // Precomputes the per-strided-iteration (n, p, q) offsets used later by at().
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+
+    filter_k_ = threadblock_offset.column() + thread_coord.contiguous();
+
+    int filter_r = filter_r_;
+    int filter_s = filter_s_;
+
+    if (problem_size_.mode == Mode::kConvolution) {
+      filter_r = (problem_size_.R - 1 - filter_r);
+      filter_s = (problem_size_.S - 1 - filter_s);
+    }
+
+    // Starting h, w positions for filter position in gemm_k=0
+    int start_h, start_w;
+    strided_dgrad_starting_coords(
+      problem_size_, 
+      stride_h_divmod, stride_w_divmod, 
+      filter_r, filter_s, 
+      start_h, start_w);
+
+    // Effective P and Q for filter position required for remapping NHW rows
+    int P = (problem_size_.H - start_h + problem_size_.stride_h - 1) / problem_size_.stride_h;
+    int Q = (problem_size_.W - start_w + problem_size_.stride_w - 1) / problem_size_.stride_w;
+
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      int offset_npq = (threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided) % params_.tiled_rows_per_filter;
+
+      // (STEP 1) [reorder NHW rows to start with same filter positions]
+      offset_n_[s] = offset_npq / (P * Q);
+      int residual = offset_npq % (P * Q);
+
+      int p = (residual / Q);
+      int q = (residual % Q);
+
+      int mapped_h = (start_h + p * problem_size_.stride_h);
+      int mapped_w = (start_w + q * problem_size_.stride_w);
+
+      // Access (p, q) coordinates for Dy tensor and a filter position in gemm_k=0
+      // note that (h + pad_h - filter_r) and (w + pad_w - filter_s) are divisible 
+      // by stride_h and stride_w
+      offset_p_[s] = (mapped_h + problem_size_.pad_h - filter_r) / problem_size_.stride_h;
+      offset_q_[s] = (mapped_w + problem_size_.pad_w - filter_s) / problem_size_.stride_w;
+    }
+  }
+
+  CUTLASS_HOST_DEVICE
+  static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) {
+    return Params(problem_size, 
+                  layout,
+                  sizeof_bits<Element>::value,
+                  {Shape::kRow, Shape::kColumn});
+  }
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iteration_vector_ = index % kAccessesPerVector;
+    int residual_access = index / kAccessesPerVector;
+    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+
+  CUTLASS_HOST_DEVICE
+  void advance() {
+
+    // Move filter_s by stride_w
+    filter_s_ +=  problem_size_.stride_w;
+    if (filter_s_ < problem_size_.S) {
+      return;
+    }
+
+    // Restore filter_s 
+    filter_s_ = start_s_;
+
+    // Move filter_r by stride_h
+    filter_r_ +=  problem_size_.stride_h;
+    if (filter_r_ < problem_size_.R) {
+      return;
+    }
+
+    // Restore filter_r 
+    filter_r_ = start_r_;
+
+    // Move filter_k
+    filter_k_ += Shape_::kColumn * problem_size_.split_k_slices;
+  }
+
+  /// Returns the coordinate in the output tensor Dy that is currently pointed to
+  /// by the iterator.
+  CUTLASS_HOST_DEVICE
+  TensorCoord at() const {
+    int n = offset_n_[iteration_strided_];
+    int p = offset_p_[iteration_strided_]; 
+    int q = offset_q_[iteration_strided_];
+
+    int conv_sign = (problem_size_.mode == Mode::kConvolution ? 1 : -1);
+
+    p += (conv_sign * (filter_r_ / problem_size_.stride_h));
+    q += (conv_sign * (filter_s_ / problem_size_.stride_w));
+
+    int k = filter_k_ + iteration_vector_ * AccessType::kElements; 
+
+    return TensorCoord(
+      n, 
+      p, 
+      q, 
+      k);
+  }
+
+
+  /// Returns true if the current coordinate is within the output tensor Dy
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+
+    TensorCoord coord = at();
+    // Only h and w are tested against a lower bound; n and c are tested against the upper bound only.
+    return 
+      coord.n() < problem_size_.N &&
+      coord.h() >= 0 && coord.h() < problem_size_.P &&
+      coord.w() >= 0 && coord.w() < problem_size_.Q &&
+      coord.c() < problem_size_.K;
+  }
+
+  /// Returns a pointer to the vector starting at the current coordinate
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+
+    TensorCoord coord = at();
+    LongIndex offset = params_.layout(coord);
+
+    return reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
+  }
+
+  /// Increments to the next memory access
+  CUTLASS_HOST_DEVICE
+  Conv2dDgradOutputGradientTileAccessIteratorAnalytic &operator++() {
+    ++iteration_vector_;
+    if (iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+    iteration_vector_ = 0;
+
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;
+
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    iteration_strided_ = 0;
+
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv2dProblemSize const &problem_size) {
+
+    // check alignment constraint on iterator's contiguous dimension
+    if (problem_size.K % AccessType::kElements) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    return Status::kSuccess;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Conv2dDgradOutputGradientTileAccessIteratorAnalytic for unity strides can be optimized by 
+// eliminating modulo arithmetic to compute unscaled coordinates 
+template <
+  typename Shape_,       // tile shape; kColumn is read by advance()
+  typename Element_,     // element type of the tensor addressed through `ptr`
+  typename ThreadMap_,   // supplies initial_offset(), Iterations, Delta and kElementsPerAccess
+  typename AccessType_   // vector type that get() returns a pointer to
+>
+class Conv2dDgradOutputGradientTileAccessIteratorAnalytic < 
+  Shape_,
+  Element_,
+  ThreadMap_,
+  conv::StrideSupport::kUnity,
+  AccessType_
+> {
+public:
+
+  //
+  // Types
+  //
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::TensorNHWC;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kUnity;
+  static int const kConvDim = 2;
+  using ConvProblemSize = typename conv::Conv2dProblemSize;
+
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), 
+    "Vectors implied by the thread map must be divisible by the access type.");
+
+  static_assert(sizeof_bits<Element>::value >= 8,
+    "DGRAD requires elements of size 8b or greater.");
+
+  //
+  // Simplifying assertions
+  //
+
+  static_assert(ThreadMap::Iterations::kContiguous == 1,
+    "Require Iterations::kContiguous == 1");
+
+  //
+  // Parameters structure
+  //
+
+  struct Params {
+
+    Layout layout;
+
+    //
+    // Methods
+    //
+    CUTLASS_HOST_DEVICE
+    Params() { }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      Conv2dProblemSize const &problem_size, 
+      Layout const &layout
+    ): layout(layout) {
+      // problem_size is accepted but not used; only the layout is stored.
+    }
+  };
+
+private:
+
+  Params const &params_;                     // stored by reference, not copied
+  Conv2dProblemSize const &problem_size_;    // stored by reference, not copied
+  LongIndex iteration_contiguous_;
+  LongIndex iteration_strided_;
+  LongIndex iteration_vector_;
+  char const *pointer_;                      // base pointer; only add_pointer_offset() moves it
+
+  int filter_k_;
+  int filter_r_;
+  int filter_s_;
+
+  int offset_n_[ThreadMap::Iterations::kStrided];
+  int offset_w_[ThreadMap::Iterations::kStrided];
+  int offset_h_[ThreadMap::Iterations::kStrided];
+
+public:
+
+  CUTLASS_HOST_DEVICE
+  Conv2dDgradOutputGradientTileAccessIteratorAnalytic(
+    Params const &params, 
+    Conv2dProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    MatrixCoord const &threadblock_offset = MatrixCoord()     // threadblock offset - units are whole CTA tiles
+  ):
+    params_(params), problem_size_(problem_size),
+    iteration_contiguous_(0), iteration_strided_(0), iteration_vector_(0),  // at()/valid()/get() index with these
+    pointer_(reinterpret_cast<char const *>(ptr)), 
+    filter_k_(0), 
+    filter_r_(0), 
+    filter_s_(0) {
+    // Precomputes the per-strided-iteration (n, h, w) offsets used later by at().
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+
+    filter_k_ = threadblock_offset.column() + thread_coord.contiguous();
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      int offset_nhw = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
+      // Split the linear row index into n, then h and w within one H x W image.
+      offset_n_[s] = offset_nhw / (problem_size_.H * problem_size_.W);
+      int residual = offset_nhw % (problem_size_.H * problem_size_.W);
+
+      offset_h_[s] = residual / problem_size_.W;
+      offset_w_[s] = residual % problem_size_.W;
+    }
+  }
+
+  CUTLASS_HOST_DEVICE
+  static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) {
+    return Params(problem_size, layout);
+  }
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iteration_vector_ = index % kAccessesPerVector;
+    int residual_access = index / kAccessesPerVector;
+    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+
+  CUTLASS_HOST_DEVICE
+  void advance() {
+    // move to the next tile
+    ++filter_s_;
+    if (filter_s_ < problem_size_.S) {
+      return;
+    }
+    filter_s_  = 0;
+    ++filter_r_;
+    if (filter_r_ < problem_size_.R) {
+      return;
+    }
+    filter_r_ = 0;
+
+    filter_k_ += Shape_::kColumn * problem_size_.split_k_slices;
+  }
+
+  /// Returns the coordinate in the output tensor Dy that is currently pointed to
+  /// by the iterator.
+  CUTLASS_HOST_DEVICE
+  TensorCoord at() const {
+
+    int n = offset_n_[iteration_strided_];
+    int h = offset_h_[iteration_strided_];
+    int w = offset_w_[iteration_strided_];
+
+    int r = filter_r_;
+    int s = filter_s_;
+    // In convolution mode the filter indices are mirrored.
+    if (problem_size_.mode == Mode::kConvolution) {
+      r = (problem_size_.R - 1 - r);
+      s = (problem_size_.S - 1 - s);
+    }
+
+    int p = (h + problem_size_.pad_h - r * problem_size_.dilation_h) / problem_size_.stride_h;
+    int q = (w + problem_size_.pad_w - s * problem_size_.dilation_w) / problem_size_.stride_w;
+
+    int k = filter_k_ + iteration_vector_ * AccessType::kElements;
+
+    return TensorCoord(n, p, q, k);
+  }
+
+  /// Returns true if the current coordinate is within the output tensor Dy
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+
+    TensorCoord coord = at();
+    // Only h and w are tested against a lower bound; n and c are tested against the upper bound only.
+    return coord.n() < problem_size_.N &&
+      coord.h() >= 0 && coord.h() < problem_size_.P &&
+      coord.w() >= 0 && coord.w() < problem_size_.Q &&
+      coord.c() < problem_size_.K;
+  }
+
+  /// Returns a pointer to the vector starting at the current coordinate
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+
+    TensorCoord coord = at();
+    LongIndex offset = params_.layout(coord);
+
+    return reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
+  }
+
+  /// Increments to the next memory access
+  CUTLASS_HOST_DEVICE
+  Conv2dDgradOutputGradientTileAccessIteratorAnalytic &operator++() {
+    ++iteration_vector_;
+    if (iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+    iteration_vector_ = 0;
+
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    iteration_strided_ = 0;
+
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv2dProblemSize const &problem_size) {
+
+    // Conv2dDgradOutputGradientTileAccessIteratorAnalytic unity stride specialization 
+    // only supports (stride_h, stride_w) = (1, 1)
+    if (problem_size.stride() != MatrixCoord({1, 1})) {
+      return Status::kErrorNotSupported;
+    }
+
+    // check alignment constraint on iterator's contiguous dimension
+    if (problem_size.K % AccessType::kElements) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    return Status::kSuccess;
+  }
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h
new file mode 100755
index 000000000..ffa13c934
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h
@@ -0,0 +1,821 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile) 
+    matrix from memory.
+
+    This iterator assumes TensorNHWC layout of tensors in Global Memory.
+
+    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
+    backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/
+
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+#include "cutlass/conv/threadblock/conv2d_params.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <  // primary template: declared only, never defined here
+  typename Shape_,
+  typename Element_,
+  typename ThreadMap_,
+  conv::StrideSupport StrideSupport_ = conv::StrideSupport::kUnity,  // selects the specialization; unity by default
+  typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess>  // defaults to one full thread-map vector
+>
+class Conv2dDgradOutputGradientTileAccessIteratorOptimized;
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Conv2dDgradOutputGradientTileAccessIteratorOptimized strided dgrad needs special handling 
+// to skip MMAs (Dx = Dy * w) on invalid filter positions
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <
+  typename Shape_,
+  typename Element_,
+  typename ThreadMap_,
+  typename AccessType_
+>
+class Conv2dDgradOutputGradientTileAccessIteratorOptimized <
+  Shape_,
+  Element_,
+  ThreadMap_,
+  conv::StrideSupport::kStrided,
+  AccessType_
+> {
+public:
+
+  //
+  // Types
+  //
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::TensorNHWC;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
+  static int const kConvDim = 2;
+  using ConvProblemSize = typename conv::Conv2dProblemSize;
+ 
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+  
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), 
+    "Vectors implied by the thread map must be divisible by the access type.");
+ 
+  using Mask = uint64_t;
+  
+  static_assert(sizeof_bits<Element>::value >= 8,
+    "DGRAD requires elements of size 8b or greater.");
+ 
+  //
+  // Simplifying assertions
+  //
+
+  static_assert(ThreadMap::Iterations::kContiguous == 1,
+    "Require Iterations::kContiguous == 1");
+
+  //
+  // Parameters structure
+  //
+
+  using Params = Conv2dStridedDgradOutputGradientIteratorOptimizedParams;
+
+private:
+
+  Params const &params_;                     // stored by reference, not copied
+  Conv2dProblemSize const &problem_size_;    // stored by reference, not copied
+  LongIndex iteration_contiguous_;
+  LongIndex iteration_strided_;
+  LongIndex iteration_vector_;
+  
+  // One pointer per strided iteration (Iterations::kContiguous is asserted to be 1 above)
+  char const *pointer_[ThreadMap::Iterations::kStrided];
+  
+  int filter_k_;
+  int filter_r_;                             // constructor initializes this to start_r
+  int filter_s_;                             // constructor initializes this to start_s
+  int start_r_;
+  int start_s_;
+  int64_t reset_bytes_s_;                    // set in the constructor from num_gemm_k_filter_s() and inc_next[0]
+  int64_t reset_bytes_r_;                    // reset_bytes_s_ plus the num_gemm_k_filter_r() / inc_next[1] term
+
+  Index masks_[ThreadMap::Iterations::kStrided][kAccessesPerVector][2];  // NOTE(review): meaning of the last dimension is not visible here
+
+public:
+
+  CUTLASS_HOST_DEVICE
+  Conv2dDgradOutputGradientTileAccessIteratorOptimized(
+    Params const &params, 
+    Conv2dProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    FastDivmod const &stride_h_divmod, FastDivmod const &stride_w_divmod,
+    int start_r, int start_s,
+    MatrixCoord const &threadblock_offset = MatrixCoord()     // threadblock offset - added directly to the per-thread row/column coordinates
+  ):
+    params_(params), 
+    problem_size_(problem_size), 
+    filter_k_(0),
+    filter_r_(start_r),
+    filter_s_(start_s),
+    start_r_(start_r),
+    start_s_(start_s) {
+    // Set up this thread's base pointers (pointer_) and per-filter-position validity bitmasks (masks_).
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+
+    filter_k_ = threadblock_offset.column() + thread_coord.contiguous();
+    // Bytes subtracted again in advance() when filter_s_ wraps back to start_s_ (next_idx == 1)
+    reset_bytes_s_ = (problem_size_.num_gemm_k_filter_s(start_s_) - 1) * params_.inc_next[0];
+    // Bytes subtracted again in advance() when filter_r_ also wraps back to start_r_ (next_idx == 2)
+    reset_bytes_r_ = (problem_size_.num_gemm_k_filter_s(start_s_) - 1) * params_.inc_next[0] +
+                      (problem_size_.num_gemm_k_filter_r(start_r_) - 1) * params_.inc_next[1];
+
+    int offset_n[ThreadMap::Iterations::kStrided];
+    int offset_p[ThreadMap::Iterations::kStrided];
+    int offset_q[ThreadMap::Iterations::kStrided];
+
+    int filter_r = filter_r_;
+    int filter_s = filter_s_;
+
+    if (problem_size_.mode == Mode::kConvolution) {
+      filter_r = (problem_size_.R - 1 - filter_r);
+      filter_s = (problem_size_.S - 1 - filter_s);
+    }
+
+    // Starting h, w positions for filter position in gemm_k=0
+    int start_h, start_w;
+    strided_dgrad_starting_coords(
+      problem_size_, 
+      stride_h_divmod, stride_w_divmod, 
+      filter_r, filter_s, 
+      start_h, start_w);
+
+
+    // Effective P and Q (rounded-up counts of stride steps from start_h/start_w to H/W) used to remap NHW rows
+    int P = (problem_size_.H - start_h + problem_size_.stride_h - 1) / problem_size_.stride_h;
+    int Q = (problem_size_.W - start_w + problem_size_.stride_w - 1) / problem_size_.stride_w;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+
+      pointer_[s] = reinterpret_cast<char const *>(ptr);      
+
+      int offset_npq = (threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided) % params_.tiled_rows_per_filter;
+
+      // (STEP 1) [reorder NHW rows to start with same filter positions]
+      offset_n[s] = offset_npq / (P * Q);
+      int residual = offset_npq % (P * Q);
+
+      int p = (residual / Q);
+      int q = (residual % Q);
+
+      int mapped_h = (start_h + p * problem_size_.stride_h);
+      int mapped_w = (start_w + q * problem_size_.stride_w);
+      
+      // Access (p, q) coordinates for Dy tensor for filter position in gemm_k=0
+      // note that (h + pad_h - filter_r) and (w + pad_w - filter_s) are ensured to be 
+      // divisible by stride_h and stride_w
+      offset_p[s] = (mapped_h + problem_size_.pad_h - filter_r) / problem_size_.stride_h;
+      offset_q[s] = (mapped_w + problem_size_.pad_w - filter_s) / problem_size_.stride_w;
+
+      // Initialize pointers for gemm_k=0
+      TensorCoord coord{offset_n[s], offset_p[s], offset_q[s], filter_k_};
+
+      pointer_[s] += params_.layout(coord) * sizeof_bits<Element>::value / 8;
+    }
+
+    //
+    // Precompute mask predicates
+    //
+    clear_mask();
+    // Bit r of masks_[..][..][0]: filter row r maps to an in-bounds (n, p) for this strided iteration
+    CUTLASS_PRAGMA_NO_UNROLL
+    for (int r = start_r; r < problem_size_.R; r += problem_size_.stride_h) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) {
+
+        int p = offset_p[s_idx] ;
+
+        p += (params_.conv_sign * (r / problem_size_.stride_h));
+
+        bool pred = (offset_n[s_idx] < problem_size_.N && p >= 0 && p < problem_size_.P);
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
+          masks_[s_idx][v_idx][0] |= (pred << r);
+        }
+      }
+    }
+    // Bit s of masks_[..][..][1]: filter column s maps to an in-bounds q for this strided iteration
+    CUTLASS_PRAGMA_NO_UNROLL
+    for(int s = start_s; s < problem_size_.S; s += problem_size_.stride_w) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) {
+
+        int q = offset_q[s_idx];
+        q += (params_.conv_sign * (s / problem_size_.stride_w));
+
+        bool pred = (q >=0 && q < problem_size_.Q);
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
+          masks_[s_idx][v_idx][1] |= (pred << s);
+        }
+      }
+    }
+    // Mask off vector accesses whose first channel index is at or past problem_size.K
+    CUTLASS_PRAGMA_UNROLL
+    for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
+      clear_mask(v_idx, (filter_k_ + v_idx * AccessType::kElements) >= problem_size.K);
+    }
+
+    set_iteration_index(0);
+  }
+  /// Builds the Params object from the problem size, layout, element width in bits, and the Shape extent {kRow, kColumn}.
+  CUTLASS_HOST_DEVICE
+  static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) {
+    return Params(problem_size, 
+                  layout,
+                  sizeof_bits<Element>::value,
+                  {Shape::kRow, Shape::kColumn});
+  }
+
+private:
+
+  /// Adds (byte_offset - byte_reset) bytes to every per-strided-iteration pointer
+  CUTLASS_HOST_DEVICE
+  void add_byte_offset_(LongIndex byte_offset, LongIndex byte_reset = 0) {
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      pointer_[s] += byte_offset - byte_reset;
+    }
+  }
+
+public:
+
+  /// Sets the (vector, contiguous, strided) iteration counters from a linear access index (vector index varies fastest)
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iteration_vector_ = index % kAccessesPerVector;
+    int residual_access = index / kAccessesPerVector;
+    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Advances every pointer by pointer_offset Elements (converted to bytes via sizeof_bits<Element>)
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    add_byte_offset_(pointer_offset * sizeof_bits<Element>::value / 8);
+  }
+
+  CUTLASS_DEVICE
+  void advance() {
+    // Steps to the next filter position: filter_s_ first, then filter_r_, then filter_k_ (next_idx = 0, 1, 2 picks the increment).
+    int next_idx = 0;
+    int64_t reset_bytes = 0;
+
+    // Move filter_s by stride_w
+    filter_s_ +=  problem_size_.stride_w;
+    if (filter_s_ >= problem_size_.S) {
+      // NOTE: the inline PTX below is the setp/selp (select-based) form of the disabled `#if 0` branch
+      // Restore filter_s
+      filter_s_ = start_s_;
+
+      // Move filter_r by stride_h
+      filter_r_ += problem_size_.stride_h;
+#if 0
+      if (filter_r_ < problem_size_.R) {
+
+        next_idx = 1;
+
+        // Restore bytes in q coordinate (Mma in filter s dimension)
+        reset_bytes = reset_bytes_s_;
+
+      } else {
+
+        // Restore filter_r
+        filter_r_ = start_r_;
+
+        next_idx = 2;
+
+        // Restore bytes in p and q coordinate (Mma in filter s and r dimension)
+        reset_bytes = reset_bytes_r_;
+      }
+#else
+      asm volatile(
+          "{\n\t"
+          " .reg .pred %%p;\n\t"
+          " setp.lt.s32 %%p, %3, %4;\n\t"
+          " selp.s32 %0, %3, %5, %%p;\n\t"
+          " selp.s32 %1, 1, 2, %%p;\n\t"
+          " selp.s64 %2, %6, %7, %%p;\n\t"
+          "}\n"
+          : "=r"(filter_r_), "=r"(next_idx), "=l"(reset_bytes)
+          : "r"(filter_r_), "r"(problem_size_.R), "r"(start_r_),
+            "l"(reset_bytes_s_), "l"(reset_bytes_r_));
+#endif
+    }
+
+    // offset pointers by the selected increment minus the bytes to rewind
+    add_byte_offset_(params_.inc_next[next_idx] - reset_bytes);
+
+    if (next_idx == 2) {  
+      filter_k_ += params_.filter_k_delta;
+    }
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
+      clear_mask(v_idx, (filter_k_ + v_idx * AccessType::kElements) >= problem_size_.K);
+    }
+  }
+
+  /// Zeroes both mask words of every (strided, vector) access when `clear` is true; otherwise leaves them unchanged
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool clear = true) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int v = 0; v < kAccessesPerVector; ++v) {
+        masks_[s][v][0] = clear ? Mask(0) : masks_[s][v][0];
+        masks_[s][v][1] = clear ? Mask(0) : masks_[s][v][1];
+      }
+    }
+  }
+
+  /// Zeroes both mask words of vector access `v` in every strided iteration when `clear` is true; otherwise a no-op
+  CUTLASS_HOST_DEVICE
+  void clear_mask(int v, bool clear = true) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      masks_[s][v][0] = clear ? Mask(0) : masks_[s][v][0];
+      masks_[s][v][1] = clear ? Mask(0) : masks_[s][v][1];
+    }
+  }
+
+  /// Returns true if the precomputed mask bits for filter_r_ and filter_s_ are both set for the current access (i.e. within Dy)
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+    return 
+      (masks_[iteration_strided_][iteration_vector_][0] & (Index(1) << filter_r_)) &&
+      (masks_[iteration_strided_][iteration_vector_][1] & (Index(1) << filter_s_));
+  }
+
+  /// Returns the pointer of the current strided iteration, offset by iteration_vector_ AccessType-sized elements
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+
+    return reinterpret_cast<AccessType const *>(pointer_[iteration_strided_]) + iteration_vector_;
+  }
+
+  /// Advances to the next memory access: vector index fastest, then contiguous, then strided; all wrap to zero at the end
+  CUTLASS_HOST_DEVICE
+  Conv2dDgradOutputGradientTileAccessIteratorOptimized &operator++() {
+    ++iteration_vector_;
+    if (iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+    iteration_vector_ = 0;
+
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    iteration_strided_ = 0;
+ 
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem (K alignment and filter-size limits).
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv2dProblemSize const &problem_size) {
+
+    // check alignment constraint on iterator's contiguous dimension
+    if (problem_size.K % AccessType::kElements) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    // Limit on filter size: R, S <= 32 (presumably one mask bit per filter position - TODO confirm Index width)
+    if (problem_size.R > 32 || problem_size.S > 32) {
+      return Status::kErrorNotSupported;
+    }
+    
+    return Status::kSuccess;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Conv2dDgradOutputGradientTileAccessIteratorOptimized unity stride dgrad is optimized for dgrad
+// with problem stride = {1x1}
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Shape_,
+  typename Element_,
+  typename ThreadMap_,
+  typename AccessType_
+>
+class Conv2dDgradOutputGradientTileAccessIteratorOptimized <
+  Shape_,
+  Element_,
+  ThreadMap_,
+  conv::StrideSupport::kUnity,
+  AccessType_
+> {
+public:
+  
+  //
+  // Types
+  //
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::TensorNHWC;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kUnity;
+  static int const kConvDim = 2;
+  using ConvProblemSize = typename conv::Conv2dProblemSize;
+ 
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+  
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), 
+    "Vectors implied by the thread map must be divisible by the access type.");
+ 
+  using Mask = uint64_t;
+
+  //
+  // Simplifying assertions
+  //
+  static_assert(ThreadMap::Iterations::kContiguous == 1,
+    "Require Iterations::kContiguous == 1");
+
+  //
+  // Parameters structure
+  //
+
+  using Params = Conv2dDgradOutputGradientIteratorOptimizedParams;
+
+private:
+
+  Conv2dDgradOutputGradientIteratorOptimizedParams const &params_;
+  Conv2dProblemSize const &problem_size_;
+  LongIndex iteration_contiguous_;
+  LongIndex iteration_strided_;
+  LongIndex iteration_vector_;
+
+  // One pointer per strided iteration
+  char const *pointer_[ThreadMap::Iterations::kStrided];
+
+  // current filter position (r, s)
+  int filter_r_;
+  int filter_s_;
+  int filter_k_;
+
+  Index masks_[ThreadMap::Iterations::kStrided][kAccessesPerVector][2];
+
+public:
+
+  CUTLASS_HOST_DEVICE
+  Conv2dDgradOutputGradientTileAccessIteratorOptimized(
+    Conv2dDgradOutputGradientIteratorOptimizedParams const &params,
+    Conv2dProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    MatrixCoord const &threadblock_offset = MatrixCoord()       // threadblock offset - added directly to the per-thread row/column coordinates
+  ):
+    params_(params), 
+    problem_size_(problem_size),
+    filter_k_(0), 
+    filter_r_(0), 
+    filter_s_(0) {
+    // Set up this thread's base pointers (pointer_) and per-filter-position validity bitmasks (masks_).
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+
+    filter_k_ = threadblock_offset.column() + thread_coord.contiguous();
+
+    int offset_n[ThreadMap::Iterations::kStrided];
+    int offset_h[ThreadMap::Iterations::kStrided];
+    int offset_w[ThreadMap::Iterations::kStrided];
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+
+      pointer_[s] = reinterpret_cast<char const *>(ptr);
+ 
+      int offset_nhw = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
+
+      // The subsequent fast_divmod() operations are equivalent to the following logical computation:
+      //
+      //
+      //  offset_n[s] = offset_nhw / (problem_size_.H * problem_size_.W);
+      //  int residual = offset_nhw % (problem_size_.H * problem_size_.W);
+      //
+      //  offset_h[s] = residual / problem_size_.W;
+      //  offset_w[s] = residual % problem_size_.W;
+      //
+
+      int residual;
+
+      params_.hw_divmod(offset_n[s], residual, offset_nhw);
+      params_.w_divmod(offset_h[s], offset_w[s], residual);
+
+      TensorCoord coord = at_(offset_n[s], offset_h[s], offset_w[s], 0, 0);
+
+      pointer_[s] += params_.layout(coord) * sizeof_bits<Element>::value / 8;
+    }
+
+    clear_mask();
+    // Bit r of masks_[..][..][0]: filter row r maps to an in-bounds (n, p) for this strided iteration
+    CUTLASS_PRAGMA_NO_UNROLL
+    for (int r = 0; r < problem_size_.R; ++r) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) {
+
+        int r_ = r;
+        if (problem_size_.mode == Mode::kConvolution) {
+          r_ = problem_size_.R - 1 - r;
+        }
+
+        int p = offset_h[s_idx] + problem_size_.pad_h - r_ * problem_size_.dilation_h;
+
+        bool pred = (offset_n[s_idx] < problem_size_.N && p >= 0 && p < problem_size_.P);
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
+          masks_[s_idx][v_idx][0] |= (pred << r);
+        }
+      }
+    }
+    // Bit s of masks_[..][..][1]: filter column s maps to an in-bounds q for this strided iteration
+    CUTLASS_PRAGMA_NO_UNROLL
+    for (int s = 0; s < problem_size_.S; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) {
+
+        int s_ = s;
+        if (problem_size_.mode == Mode::kConvolution) {
+          s_ = problem_size_.S - 1 - s;
+        }
+
+        int q = offset_w[s_idx] + problem_size_.pad_w - s_ * problem_size_.dilation_w;
+
+        bool pred = (q >= 0 && q < problem_size_.Q);
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
+          masks_[s_idx][v_idx][1] |= (pred << s);
+        }
+      }
+    }
+    // Mask off vector accesses whose first channel index is at or past problem_size.K
+    CUTLASS_PRAGMA_UNROLL
+    for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
+      clear_mask(v_idx, filter_k_ + v_idx * AccessType::kElements >= problem_size.K);
+    }
+
+    set_iteration_index(0);
+  }
+
+  CUTLASS_HOST_DEVICE
+  static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) {
+    return Params(problem_size,
+                  layout,
+                  sizeof_bits<Element>::value,
+                  {Shape::kRow, Shape::kColumn},
+                  ThreadMap::kThreads,
+                  ThreadMap::kElementsPerAccess,
+                  {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided},
+                  {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided});
+  }
+
+private:
+
+  /// Returns the coordinate in the output gradient tensor dy that corresponds to
+  /// activation (n, h, w) and filter position (r, s); the channel coordinate is the current filter_k_
+  CUTLASS_HOST_DEVICE
+  TensorCoord at_(int n, int h, int w, int r, int s) const {
+
+    if (problem_size_.mode == Mode::kConvolution) {
+      r = problem_size_.R - 1 - r;
+      s = problem_size_.S - 1 - s;
+    }
+
+    int p = h + problem_size_.pad_h - r * problem_size_.dilation_h;
+    int q = w + problem_size_.pad_w - s * problem_size_.dilation_w;
+
+    return TensorCoord(n, p, q, filter_k_);
+  }
+  
+  /// Adds byte_offset bytes to every per-strided-iteration pointer
+  CUTLASS_HOST_DEVICE
+  void add_byte_offset_(LongIndex byte_offset) {
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      pointer_[s] += byte_offset;
+    }
+  }
+  
+public:
+
+  /// Sets the (vector, contiguous, strided) iteration counters from a linear access index (vector index varies fastest)
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iteration_vector_ = index % kAccessesPerVector;
+    int residual_access = index / kAccessesPerVector;
+    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Advances every pointer by pointer_offset Elements (converted to bytes via sizeof_bits<Element>)
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    add_byte_offset_(pointer_offset * sizeof_bits<Element>::value / 8);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void advance() { 
+    // Steps to the next filter position; next_idx (0: next s, 1: next r, 2: next k) picks the pointer increment.
+    int next_idx = 0;
+ 
+    // moves to the next filter position (s fastest, then r)
+    ++filter_s_;
+    if (filter_s_ == problem_size_.S) {
+      filter_s_ = 0;
+      ++filter_r_;
+ 
+      if (filter_r_ < problem_size_.R) {
+        next_idx = 1;
+      }
+      else {
+        filter_r_ = 0;
+        next_idx = 2;
+      }
+    }
+    
+    add_byte_offset_(params_.inc_next[next_idx]);
+      
+    if (next_idx == 2) {  
+      filter_k_ += params_.filter_k_delta;
+    }
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
+      clear_mask(v_idx, (filter_k_ + v_idx * AccessType::kElements) >= problem_size_.K);
+    }
+  }
+
+  /// Zeroes both mask words of every (strided, vector) access when `clear` is true; otherwise leaves them unchanged
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool clear = true) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int v = 0; v < kAccessesPerVector; ++v) {
+        masks_[s][v][0] = clear ? Mask(0) : masks_[s][v][0];
+        masks_[s][v][1] = clear ? Mask(0) : masks_[s][v][1];
+      }
+    }
+  }
+
+  /// Zeroes both mask words of vector access `v` in every strided iteration when `clear` is true; otherwise a no-op
+  CUTLASS_HOST_DEVICE
+  void clear_mask(int v, bool clear = true) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      masks_[s][v][0] = clear ? Mask(0) : masks_[s][v][0];
+      masks_[s][v][1] = clear ? Mask(0) : masks_[s][v][1];
+    }
+  }
+
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    // True only if the mask bits for filter_r_ and filter_s_ are both set for the current access
+    return 
+      (masks_[iteration_strided_][iteration_vector_][0] & (Index(1) << filter_r_)) &&
+      (masks_[iteration_strided_][iteration_vector_][1] & (Index(1) << filter_s_));
+  }
+
+  /// Returns the pointer of the current strided iteration, offset by iteration_vector_ AccessType-sized elements
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+
+    return reinterpret_cast<AccessType const *>(pointer_[iteration_strided_]) + iteration_vector_;
+  }
+
+  /// Advances to the next memory access: vector index fastest, then contiguous, then strided; all wrap to zero at the end
+  CUTLASS_HOST_DEVICE
+  Conv2dDgradOutputGradientTileAccessIteratorOptimized &operator++() {
+    ++iteration_vector_;
+    if (iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+    iteration_vector_ = 0;
+
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;
+
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    iteration_strided_ = 0;
+ 
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv2dProblemSize const &problem_size) {
+
+    // This is specialized for unit stride
+    if (problem_size.stride() != MatrixCoord({1, 1})) {
+      return Status::kErrorNotSupported;
+    }
+
+    // check alignment constraint on iterator's contiguous dimension
+    if (problem_size.K % AccessType::kElements) {
+      return Status::kErrorNotSupported;
+    }
+
+    // Limit on filter size: R, S <= 32 (presumably one mask bit per filter position - TODO confirm Index width)
+    if (problem_size.R > 32 || problem_size.S > 32) {
+      return Status::kErrorNotSupported;
+    }
+    return Status::kSuccess;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h
new file mode 100755
index 000000000..9317ea0cd
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h
@@ -0,0 +1,332 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM A (activation tile) 
+    matrix from memory.
+
+    This iterator assumes TensorNHWC or TensorNCxHWx<Interleave> layout of tensors in Global Memory.
+
+    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
+    backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+#include "cutlass/conv/threadblock/conv2d_params.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Shape_,
+  typename Element_,
+  typename Layout_,
+  typename ThreadMap_,
+  typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess>,
+  conv::GroupMode GroupMode_ = conv::GroupMode::kNone
+>
+class Conv2dFpropActivationTileAccessIteratorAnalytic {
+public:
+  
+  //
+  // Types
+  //
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = Layout_;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
+  static int const kConvDim = 2;
+  using ConvProblemSize = typename conv::Conv2dProblemSize;
+  static conv::GroupMode const kGroupMode = GroupMode_;
+ 
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+  
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), 
+    "Vectors implied by the thread map must be divisible by the access type.");
+ 
+  //
+  // Simplifying assertions
+  //
+  static_assert(ThreadMap::Iterations::kContiguous == 1,
+    "Require Iterations::kContiguous == 1");
+
+  //
+  // Parameters structure
+  //
+
+  using Params = Conv2dAnalyticParams<Layout>;
+
+private:
+
+  Params const &params_;
+  Conv2dProblemSize const &problem_size_;
+  LongIndex iteration_contiguous_;
+  LongIndex iteration_strided_;
+  LongIndex iteration_vector_;
+  char const *pointer_;
+
+  int filter_c_;
+  int filter_r_;
+  int filter_s_;
+  int filter_c_init_;
+  int group_idx_offset_;
+  int channels_per_group_;
+  int crs_cnt_;
+  int crs_per_group_;
+
+  int offset_n_[ThreadMap::Iterations::kStrided];
+  int offset_p_[ThreadMap::Iterations::kStrided];
+  int offset_q_[ThreadMap::Iterations::kStrided];
+
+public:
+
+  CUTLASS_HOST_DEVICE
+  Conv2dFpropActivationTileAccessIteratorAnalytic(
+    Params const &params, 
+    Conv2dProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    MatrixCoord const &threadblock_offset = MatrixCoord()       // threadblock offset - added directly to the per-thread row/column coordinates
+  ):
+    params_(params), 
+    problem_size_(problem_size), 
+    pointer_(reinterpret_cast<char const *>(ptr)), 
+    crs_cnt_(0),
+    group_idx_offset_(0),
+    filter_c_(0), 
+    filter_r_(0), 
+    filter_s_(0) {
+    // Record this thread's starting channel and the (n, p, q) output coordinate of each strided iteration.
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+
+    filter_c_ = threadblock_offset.column() + thread_coord.contiguous();
+    // Grouped-convolution bookkeeping; these members are only assigned when a group mode is active
+    if (kGroupMode != conv::GroupMode::kNone) {
+      filter_c_init_ = filter_c_;
+      channels_per_group_ = problem_size_.C / problem_size_.groups;
+      crs_per_group_ = problem_size_.S * problem_size_.R * ((channels_per_group_ + Shape::kColumn - 1) / Shape::kColumn);
+    }
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      int offset_npq = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
+    
+      offset_n_[s] = offset_npq / (problem_size_.P * problem_size_.Q);
+      int residual = offset_npq % (problem_size_.P * problem_size_.Q);
+
+      offset_p_[s] = residual / problem_size_.Q;
+      offset_q_[s] = residual % problem_size_.Q;
+    }
+
+    set_iteration_index(0);
+  }
+
+  CUTLASS_HOST_DEVICE
+  static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) {
+    return Params(problem_size, layout);
+  }
+
+  /// Sets the (vector, contiguous, strided) iteration counters from a linear access index (vector index varies fastest)
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iteration_vector_ = index % kAccessesPerVector;
+    int residual_access = index / kAccessesPerVector;
+    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+
+  CUTLASS_HOST_DEVICE
+  void advance() {
+    // moves to the next filter position; group modes also count the positions visited in the current group
+    if (kGroupMode != conv::GroupMode::kNone) {
+      ++crs_cnt_;
+    }
+
+    ++filter_s_;
+    if (filter_s_ < problem_size_.S) {
+      return;
+    }
+    filter_s_ = 0;
+    ++filter_r_;
+    if (filter_r_ < problem_size_.R) {
+      return;
+    }
+    filter_r_ = 0;
+    // (r, s) wrapped around: step the channel offset, or jump to the next group's channels
+    if (kGroupMode == conv::GroupMode::kNone) {
+      filter_c_ += Shape::kColumn * problem_size_.split_k_slices;
+    } else {
+      if (crs_cnt_ == crs_per_group_) {
+        // moves to next group
+        crs_cnt_ = 0;
+        ++group_idx_offset_;
+        filter_c_ = group_idx_offset_ * channels_per_group_ + filter_c_init_;
+      } else {
+        filter_c_ += Shape::kColumn * problem_size_.split_k_slices;
+      }
+    }
+  }
+
+  /// Returns the coordinate in the activations tensor X that is currently pointed to
+  /// by the iterator.
+  CUTLASS_HOST_DEVICE
+  TensorCoord at() const {
+    int n = offset_n_[iteration_strided_];
+    int p = offset_p_[iteration_strided_];
+    int q = offset_q_[iteration_strided_];
+
+    int r = filter_r_;
+    int s = filter_s_;
+
+    if (problem_size_.mode == Mode::kConvolution) {
+      r = (problem_size_.R - 1 - filter_r_);
+      s = (problem_size_.S - 1 - filter_s_);
+    }
+
+    int h = p * problem_size_.stride_h - problem_size_.pad_h + r * problem_size_.dilation_h;
+    int w = q * problem_size_.stride_w - problem_size_.pad_w + s * problem_size_.dilation_w;
+
+    int c = filter_c_ + iteration_vector_ * AccessType::kElements; 
+
+    return TensorCoord(n, h, w, c);
+  }
+
+  /// Returns true if the current coordinate is within the activations tensor X
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+
+    TensorCoord coord = at();
+
+    return coord.n() < problem_size_.N &&
+      coord.h() >= 0 && coord.h() < problem_size_.H &&
+      coord.w() >= 0 && coord.w() < problem_size_.W &&
+      coord.c() < problem_size_.C;
+  }
+
+  /// Returns a pointer to the vector starting at the current coordinate
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+
+    TensorCoord coord = at();
+    LongIndex offset = params_.layout(coord);
+    
+    AccessType const *ptr = reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
+
+    return ptr;
+  }
+
+  /// Advances to the next memory access: vector index fastest, then contiguous, then strided; all wrap to zero at the end
+  CUTLASS_HOST_DEVICE
+  Conv2dFpropActivationTileAccessIteratorAnalytic &operator++() {
+    ++iteration_vector_;
+    if (iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+    iteration_vector_ = 0;
+
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;
+
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    iteration_strided_ = 0;
+ 
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv2dProblemSize const &problem_size) {
+
+    // check alignment constraint on iterator's contiguous dimension
+    if ((problem_size.C / problem_size.groups) % AccessType::kElements) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    if (platform::is_same<Layout, layout::TensorNCxHWx<32>>::value) {
+      if (problem_size.C % 32) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+
+    if (platform::is_same<Layout, layout::TensorNCxHWx<64>>::value) {
+      if (problem_size.C % 64) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+
+    return Status::kSuccess;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h
new file mode 100755
index 000000000..5a4489c01
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h
@@ -0,0 +1,360 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM A (activation tile)
+    matrix from memory.
+
+    This iterator assumes TensorNHWC or TensorNCxHWx<Interleave> layout of tensors in Global Memory.
+
+    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
+    backward data gradient (Dgrad), and backward weight gradient (Wgrad).
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+#include "cutlass/conv/threadblock/conv2d_params.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Shape_,       ///< Tile shape; Shape::kColumn supplies kPositionsPerTile
+  typename Element_,     ///< Element type of the activation tensor
+  typename Layout_,      ///< Tensor layout; supplies TensorCoord, Index and LongIndex
+  typename ThreadMap_,   ///< Thread map; supplies initial_offset(), Iterations and Delta
+  typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess>
+>
+class Conv2dFpropActivationTileAccessIteratorFewChannels {
+public:
+  // Activation tile access iterator that decodes a flattened (r, s, c) index on every access.
+  //
+  // Types
+  //
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = Layout_;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kFewChannels;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
+  static int const kConvDim = 2;
+  using ConvProblemSize = typename conv::Conv2dProblemSize;
+
+  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
+  static int const kPositionsPerTile = Shape::kColumn;
+  // Number of AccessType-sized accesses that make up one thread-map vector
+  static int const kAccessesPerVector = kElementsPerAccess / AccessType::kElements;
+  // Select FastDivmod (from Params) over plain / and % in the constructor and in at()
+  static bool const kUseFastDivmodPrologue = true;
+  static bool const kUseFastDivmodMainloop = true;
+  // Compile-time stride/dilation; 0 means "use the value from the problem size" (see at())
+  static int const kStrideH = 0;
+  static int const kStrideW = 0;
+  static int const kDilationH = 0;
+  static int const kDilationW = 0;
+
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
+    "Vectors implied by the thread map must be divisible by the access type.");
+
+  //
+  // Simplifying assertions
+  //
+  static_assert(ThreadMap::Iterations::kContiguous == 1,
+    "Require Iterations::kContiguous == 1");
+
+  //
+  // Parameters structure
+  //
+
+  using Params = Conv2dFewChannelsParams<Layout>;
+
+private:
+
+  Params const &params_;                            // held by reference, not copied
+  Conv2dProblemSize const &problem_size_;           // held by reference, not copied
+  LongIndex iteration_contiguous_;
+  LongIndex iteration_strided_;
+  LongIndex iteration_vector_;
+  char const *pointer_;                             // byte pointer to the tensor (see add_pointer_offset)
+
+  int rsc_index_;                                   // this thread's flattened (r, s, c) index; moved by advance()
+  int offset_n_[ThreadMap::Iterations::kStrided];   // output n of each strided access
+  int offset_p_[ThreadMap::Iterations::kStrided];   // output p of each strided access
+  int offset_q_[ThreadMap::Iterations::kStrided];   // output q of each strided access
+
+public:
+  /// Computes this thread's starting rsc index and the output (n, p, q) of each strided access.
+  CUTLASS_HOST_DEVICE
+  Conv2dFpropActivationTileAccessIteratorFewChannels(
+    Params const &params,
+    Conv2dProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    MatrixCoord const &threadblock_offset = MatrixCoord()       // tile index - units are threadblock-scoped tiles
+  ):
+    params_(params),
+    problem_size_(problem_size),
+    pointer_(reinterpret_cast<char const *>(ptr)),
+    rsc_index_(0) {
+
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+
+    rsc_index_ = (threadblock_offset.column() + thread_coord.contiguous());
+    // Decompose each linear output index: offset_npq = (n * P + p) * Q + q
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      int offset_npq = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
+
+      if (kUseFastDivmodPrologue) {
+        int residual = params_.divmod_Q.divmod(offset_q_[s], offset_npq);
+        offset_n_[s] = params_.divmod_P.divmod(offset_p_[s], residual);
+      }
+      else {
+        offset_n_[s] = offset_npq / (problem_size_.P * problem_size_.Q);
+        int residual = offset_npq % (problem_size_.P * problem_size_.Q);
+
+        offset_p_[s] = residual / problem_size_.Q;
+        offset_q_[s] = residual % problem_size_.Q;
+      }
+    }
+
+    set_iteration_index(0);
+  }
+  /// Builds the Params object from the problem size and the tensor layout
+  CUTLASS_HOST_DEVICE
+  static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) {
+    return Params(problem_size, layout);
+  }
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iteration_vector_ = index % kAccessesPerVector;
+    int residual_access = index / kAccessesPerVector;
+    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+  /// Moves the iterator to its next tile along the flattened rsc dimension
+  CUTLASS_HOST_DEVICE
+  void advance() {
+    // The step is one tile of positions times the split-k slice count
+    rsc_index_ += kPositionsPerTile * problem_size_.split_k_slices;
+  }
+
+  /// Returns the coordinate in the activations tensor X that is currently pointed to
+  /// by the iterator.
+  CUTLASS_HOST_DEVICE
+  TensorCoord at() const {
+    int n = offset_n_[iteration_strided_];
+    int p = offset_p_[iteration_strided_];
+    int q = offset_q_[iteration_strided_];
+
+    int rsc_index = rsc_index_ + iteration_vector_ * AccessType::kElements;
+
+    int r = 0;
+    int s = 0;
+    int c = 0;
+    // Decode rsc_index = (r * S + s) * C + c
+    if (kUseFastDivmodMainloop) {
+      int rs_index = params_.divmod_C.divmod(c, rsc_index);
+      r = params_.divmod_S.divmod(s, rs_index);
+    }
+    else {
+      c = (rsc_index % problem_size_.C);
+
+      int rs_index = (rsc_index / problem_size_.C);
+      s = (rs_index % problem_size_.S);
+      r = (rs_index / problem_size_.S);
+    }
+    // Convolution mode mirrors the filter coordinates
+    if (problem_size_.mode == Mode::kConvolution) {
+      r = (problem_size_.R - 1 - r);
+      s = (problem_size_.S - 1 - s);
+    }
+    // A zero compile-time constant defers to the runtime problem size
+    int stride_h = kStrideH;
+    if (!kStrideH) {
+      stride_h = problem_size_.stride_h;
+    }
+
+    int stride_w = kStrideW;
+    if (!kStrideW) {
+      stride_w = problem_size_.stride_w;
+    }
+
+    int dilation_h = kDilationH;
+    if (!kDilationH) {
+      dilation_h = problem_size_.dilation_h;
+    }
+
+    int dilation_w = kDilationW;
+    if (!kDilationW) {
+      dilation_w = problem_size_.dilation_w;
+    }
+    // Map output (p, q) and filter (r, s) to input (h, w); the result may lie outside the tensor
+    int h = p * stride_h - problem_size_.pad_h + r * dilation_h;
+    int w = q * stride_w - problem_size_.pad_w + s * dilation_w;
+
+    return TensorCoord(n, h, w, c);
+  }
+
+  /// Returns true if the current coordinate is within the activations tensor X
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+
+    TensorCoord coord = at();
+
+    bool in_bounds =
+      coord.n() < problem_size_.N &&
+      coord.h() >= 0 && coord.h() < problem_size_.H &&
+      coord.w() >= 0 && coord.w() < problem_size_.W &&
+      coord.c() < problem_size_.C;
+
+    return in_bounds;
+  }
+
+  /// Returns a pointer to the vector starting at the current coordinate
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+    // The offset is accumulated in LongIndex: 32-bit arithmetic overflows on large tensors
+    TensorCoord coord = at();
+
+    LongIndex offset =
+      LongIndex(coord.n()) * params_.stride_n +
+      LongIndex(coord.h()) * params_.stride_h +
+      LongIndex(coord.w()) * params_.stride_w +
+      coord.c();
+    // Scale the element offset to bytes (pointer_ is a byte pointer)
+    AccessType const *ptr = reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
+
+    return ptr;
+  }
+
+  /// Increments to the next memory access
+  CUTLASS_HOST_DEVICE
+  Conv2dFpropActivationTileAccessIteratorFewChannels &operator++() {
+    ++iteration_vector_;
+    if (iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+    iteration_vector_ = 0;
+
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;
+
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    iteration_strided_ = 0;
+
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv2dProblemSize const &problem_size) {
+
+    // check alignment constraint on iterator's contiguous dimension
+    if (problem_size.C % AccessType::kElements) {
+      return Status::kErrorInvalidProblem;
+    }
+    // A nonzero compile-time stride/dilation must match the problem's runtime value
+    if (kDilationH && problem_size.dilation_h != kDilationH) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    if (kDilationW && problem_size.dilation_w != kDilationW) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    if (kStrideH && problem_size.stride_h != kStrideH) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    if (kStrideW && problem_size.stride_w != kStrideW) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    if (platform::is_same<Layout, layout::TensorNCxHWx<32>>::value) {
+      if (problem_size.C % 32) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+
+    if (platform::is_same<Layout, layout::TensorNCxHWx<64>>::value) {
+      if (problem_size.C % 64) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+
+    return Status::kSuccess;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h
new file mode 100755
index 000000000..3f1f2bc14
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h
@@ -0,0 +1,353 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM A (activation tile)
+    matrix from memory.
+
+    This iterator assumes TensorNHWC or TensorNCxHWx<Interleave> layout of tensors in Global Memory.
+
+    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
+    backward data gradient (Dgrad), and backward weight gradient (Wgrad).
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+#include "cutlass/conv/threadblock/conv2d_params.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Shape_,       ///< Tile shape; Shape::kColumn is used to derive kFilterPositionsPerTile
+  typename Element_,     ///< Element type of the activation tensor
+  typename Layout_,      ///< Tensor layout; supplies TensorCoord, Index and LongIndex
+  typename ThreadMap_,   ///< Thread map; supplies initial_offset(), Iterations and Delta
+  typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess>
+>
+class Conv2dFpropActivationTileAccessIteratorFixedChannels {
+public:
+  // Activation tile access iterator for problems with C == AccessType::kElements (see can_implement()).
+  //
+  // Types
+  //
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = Layout_;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kFixedChannels;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
+  static int const kConvDim = 2;
+  using ConvProblemSize = typename conv::Conv2dProblemSize;
+
+  static int const kFilterPositionsPerTile = Shape::kColumn / AccessType::kElements;
+  // Number of AccessType-sized accesses that make up one thread-map vector
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+  // Select FastDivmod (from Params) over plain / and % in the constructor and in at()
+  static bool const kUseFastDivmodPrologue = true;
+  static bool const kUseFastDivmodMainloop = true;
+  // Compile-time stride/dilation; 0 means "use the value from the problem size" (see at())
+  static int const kStrideH = 0;
+  static int const kStrideW = 0;
+  static int const kDilationH = 0;
+  static int const kDilationW = 0;
+
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
+    "Vectors implied by the thread map must be divisible by the access type.");
+
+  //
+  // Simplifying assertions
+  //
+  static_assert(ThreadMap::Iterations::kContiguous == 1,
+    "Require Iterations::kContiguous == 1");
+
+  //
+  // Parameters structure
+  //
+
+  using Params = Conv2dFewChannelsParams<Layout>;
+
+private:
+
+  Params const &params_;                            // held by reference, not copied
+  Conv2dProblemSize const &problem_size_;           // held by reference, not copied
+  LongIndex iteration_contiguous_;
+  LongIndex iteration_strided_;
+  LongIndex iteration_vector_;
+  char const *pointer_;                             // byte pointer to the tensor (see add_pointer_offset)
+
+  int rs_index_;                                    // this thread's flattened (r, s) index; moved by advance()
+  int offset_n_[ThreadMap::Iterations::kStrided];   // output n of each strided access
+  int offset_p_[ThreadMap::Iterations::kStrided];   // output p of each strided access
+  int offset_q_[ThreadMap::Iterations::kStrided];   // output q of each strided access
+
+public:
+  /// Computes this thread's starting rs index and the output (n, p, q) of each strided access.
+  CUTLASS_HOST_DEVICE
+  Conv2dFpropActivationTileAccessIteratorFixedChannels(
+    Params const &params,
+    Conv2dProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    MatrixCoord const &threadblock_offset = MatrixCoord()       // tile index - units are threadblock-scoped tiles
+  ):
+    params_(params),
+    problem_size_(problem_size),
+    pointer_(reinterpret_cast<char const *>(ptr)),
+    rs_index_(0) {
+
+    //
+    // This requires problem_size.C == AccessType::kElements
+    //
+
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+
+    rs_index_ = (threadblock_offset.column() + thread_coord.contiguous()) / AccessType::kElements;
+    // Decompose each linear output index: offset_npq = (n * P + p) * Q + q
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      int offset_npq = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
+
+      if (kUseFastDivmodPrologue) {
+        int residual = params_.divmod_Q.divmod(offset_q_[s], offset_npq);
+        offset_n_[s] = params_.divmod_P.divmod(offset_p_[s], residual);
+      }
+      else {
+        offset_n_[s] = offset_npq / (problem_size_.P * problem_size_.Q);
+        int residual = offset_npq % (problem_size_.P * problem_size_.Q);
+
+        offset_p_[s] = residual / problem_size_.Q;
+        offset_q_[s] = residual % problem_size_.Q;
+      }
+    }
+
+    set_iteration_index(0);
+  }
+  /// Builds the Params object from the problem size and the tensor layout
+  CUTLASS_HOST_DEVICE
+  static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) {
+    return Params(problem_size, layout);
+  }
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iteration_vector_ = index % kAccessesPerVector;
+    int residual_access = index / kAccessesPerVector;
+    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+  /// Moves the iterator to its next tile along the flattened rs dimension
+  CUTLASS_HOST_DEVICE
+  void advance() {
+    // The step is one tile of filter positions times the split-k slice count
+    rs_index_ += kFilterPositionsPerTile * problem_size_.split_k_slices;
+  }
+
+  /// Returns the coordinate in the activations tensor X that is currently pointed to
+  /// by the iterator.
+  CUTLASS_HOST_DEVICE
+  TensorCoord at() const {
+    int n = offset_n_[iteration_strided_];
+    int p = offset_p_[iteration_strided_];
+    int q = offset_q_[iteration_strided_];
+
+    int rs_index = rs_index_ + iteration_vector_;
+
+    int r = 0;
+    int s = 0;
+    // Decode rs_index = r * S + s
+    if (kUseFastDivmodMainloop) {
+      r = params_.divmod_S.divmod(s, rs_index);
+    }
+    else {
+      s = (rs_index % problem_size_.S);
+      r = (rs_index / problem_size_.S);
+    }
+    // Convolution mode mirrors the filter coordinates
+    if (problem_size_.mode == Mode::kConvolution) {
+      r = (problem_size_.R - 1 - r);
+      s = (problem_size_.S - 1 - s);
+    }
+    // A zero compile-time constant defers to the runtime problem size
+    int stride_h = kStrideH;
+    if (!kStrideH) {
+      stride_h = problem_size_.stride_h;
+    }
+
+    int stride_w = kStrideW;
+    if (!kStrideW) {
+      stride_w = problem_size_.stride_w;
+    }
+
+    int dilation_h = kDilationH;
+    if (!kDilationH) {
+      dilation_h = problem_size_.dilation_h;
+    }
+
+    int dilation_w = kDilationW;
+    if (!kDilationW) {
+      dilation_w = problem_size_.dilation_w;
+    }
+    // Map output (p, q) and filter (r, s) to input (h, w); the result may lie outside the tensor
+    int h = p * stride_h - problem_size_.pad_h + r * dilation_h;
+    int w = q * stride_w - problem_size_.pad_w + s * dilation_w;
+
+    return TensorCoord(n, h, w, 0);   // c is always 0; can_implement() requires C == AccessType::kElements
+  }
+
+  /// Returns true if the current coordinate is within the activations tensor X
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+
+    TensorCoord coord = at();
+
+    return coord.n() < problem_size_.N &&
+      coord.h() >= 0 && coord.h() < problem_size_.H &&
+      coord.w() >= 0 && coord.w() < problem_size_.W;
+  }
+
+  /// Returns a pointer to the vector starting at the current coordinate
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+    // The offset is accumulated in LongIndex: 32-bit arithmetic overflows on large tensors
+    TensorCoord coord = at();
+
+    LongIndex offset =
+      LongIndex(coord.n()) * params_.stride_n +
+      LongIndex(coord.h()) * params_.stride_h +
+      LongIndex(coord.w()) * params_.stride_w + coord.c();
+    // Scale the element offset to bytes (pointer_ is a byte pointer)
+    AccessType const *ptr = reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
+
+    return ptr;
+  }
+
+  /// Increments to the next memory access
+  CUTLASS_HOST_DEVICE
+  Conv2dFpropActivationTileAccessIteratorFixedChannels &operator++() {
+    ++iteration_vector_;
+    if (iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+    iteration_vector_ = 0;
+
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;
+
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    iteration_strided_ = 0;
+
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv2dProblemSize const &problem_size) {
+
+    // the channel count must equal the access width exactly (one access covers all channels)
+    if (problem_size.C != AccessType::kElements) {
+      return Status::kErrorInvalidProblem;
+    }
+    // A nonzero compile-time stride/dilation must match the problem's runtime value
+    if (kDilationH && problem_size.dilation_h != kDilationH) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    if (kDilationW && problem_size.dilation_w != kDilationW) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    if (kStrideH && problem_size.stride_h != kStrideH) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    if (kStrideW && problem_size.stride_w != kStrideW) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    if (platform::is_same<Layout, layout::TensorNCxHWx<32>>::value) {
+      if (problem_size.C % 32) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+
+    if (platform::is_same<Layout, layout::TensorNCxHWx<64>>::value) {
+      if (problem_size.C % 64) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+
+    return Status::kSuccess;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h
new file mode 100755
index 000000000..243d724b3
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h
@@ -0,0 +1,422 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM A (activation tile) 
+    matrix from memory.
+
+    This iterator assumes TensorNHWC or TensorNCxHWx<Interleave> layout of tensors in Global Memory.
+    
+    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
+    backward data gradient (Dgrad), and backward weight gradient (Wgrad).
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+#include "cutlass/conv/threadblock/conv2d_params.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Shape_,
+  typename Element_,
+  typename Layout_,
+  typename ThreadMap_,
+  typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess>
+>
+class Conv2dFpropActivationTileAccessIteratorOptimized {
+public:
+  
+  //
+  // Types
+  //
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = Layout_;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
+  static int const kConvDim = 2;
+  using ConvProblemSize = typename conv::Conv2dProblemSize;
+  
+  using Mask = uint64_t;
+
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+  
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), 
+    "Vectors implied by the thread map must be divisible by the access type.");
+
+  //
+  // Simplifying assertions
+  //
+  static_assert(ThreadMap::Iterations::kContiguous == 1,
+    "Require Iterations::kContiguous == 1");
+
+  //
+  // Parameters structure
+  //
+
+  using Params = Conv2dFpropActivationIteratorOptimizedParams<Layout>;
+
+private:
+
+  Params const &params_;
+  Conv2dProblemSize const &problem_size_;
+  LongIndex iteration_contiguous_;
+  LongIndex iteration_strided_;
+  LongIndex iteration_vector_;
+
+  // One pointer per access
+  char const *pointer_[ThreadMap::Iterations::kStrided];
+
+  // current filter position (r, s)
+  int filter_r_;
+  int filter_s_;
+  int filter_c_;
+
+  Index masks_[ThreadMap::Iterations::kStrided][kAccessesPerVector][2];
+
+public:
+
+  CUTLASS_HOST_DEVICE
+  Conv2dFpropActivationTileAccessIteratorOptimized(
+    Params const &params,
+    Conv2dProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    MatrixCoord const &threadblock_offset = MatrixCoord()       // tile index - units are threadblock-scoped tiles
+  ):
+    params_(params),
+    problem_size_(problem_size),
+    filter_r_(0),       // initializers follow the member declaration order (r, s, c)
+    filter_s_(0),
+    filter_c_(0) {
+    // Precomputes one base pointer per strided access and the filter row/column validity bitmasks.
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+
+    filter_c_ = threadblock_offset.column() + thread_coord.contiguous();
+
+    int offset_n[ThreadMap::Iterations::kStrided];
+    int offset_p[ThreadMap::Iterations::kStrided];
+    int offset_q[ThreadMap::Iterations::kStrided];
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+
+      pointer_[s] = reinterpret_cast<char const *>(ptr);
+
+      int offset_npq = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
+
+      // The subsequent fast_divmod() operations are equivalent to the following logical computation:
+      //
+      //
+      //  offset_n[s] = offset_npq / (problem_size_.P * problem_size_.Q);
+      //  int residual = offset_npq % (problem_size_.P * problem_size_.Q);
+      //
+      //  offset_p[s] = residual / problem_size_.Q;
+      //  offset_q[s] = residual % problem_size_.Q;
+      //
+
+      int residual;
+
+      params.pq_divmod(offset_n[s], residual, offset_npq);
+      params.q_divmod(offset_p[s], offset_q[s], residual);
+
+      TensorCoord coord = at_(offset_n[s], offset_p[s], offset_q[s], 0, 0);
+      // Move the base pointer to the element addressed with filter position (r, s) = (0, 0)
+      pointer_[s] += params_.layout(coord) * sizeof_bits<Element>::value / 8;
+    }
+
+    clear_mask();
+    // Bit r of masks_[..][..][0] is set when filter row r reads a valid input row h and n < N
+    CUTLASS_PRAGMA_NO_UNROLL
+    for (int r = 0; r < problem_size_.R; ++r) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) {
+
+        int r_ = r;
+        if (problem_size_.mode == Mode::kConvolution) {
+          r_ = problem_size_.R - 1 - r;
+        }
+
+        int h = offset_p[s_idx] * problem_size_.stride_h - problem_size_.pad_h + r_ * problem_size_.dilation_h;
+
+        bool pred = (offset_n[s_idx] < problem_size_.N && h >= 0 && h < problem_size_.H);
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
+          masks_[s_idx][v_idx][0] |= (pred << r);
+        }
+      }
+    }
+    // Bit s of masks_[..][..][1] is set when filter column s reads a valid input column w
+    CUTLASS_PRAGMA_NO_UNROLL
+    for (int s = 0; s < problem_size_.S; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) {
+
+        int s_ = s;
+        if (problem_size_.mode == Mode::kConvolution) {
+          s_ = problem_size_.S - 1 - s;
+        }
+
+        int w = offset_q[s_idx] * problem_size_.stride_w - problem_size_.pad_w + s_ * problem_size_.dilation_w;
+
+        bool pred = (w >= 0 && w < problem_size_.W);
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
+          masks_[s_idx][v_idx][1] |= (pred << s);
+        }
+      }
+    }
+    // NOTE(review): clear_mask(v, flag) is defined outside this view; presumably it clears vector v's masks when flag is true - confirm
+    CUTLASS_PRAGMA_UNROLL
+    for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
+      clear_mask(v_idx, filter_c_ + v_idx * AccessType::kElements >= problem_size_.C);
+    }
+
+    set_iteration_index(0);
+  }
+
+  /// Builds the Params object from the problem size, layout, element width and thread-map constants
+  CUTLASS_HOST_DEVICE
+  static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) {
+    return Params(
+      problem_size, layout,
+      sizeof_bits<Element>::value,
+      {Shape::kRow, Shape::kColumn},
+      ThreadMap::kThreads, ThreadMap::kElementsPerAccess,
+      {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided},
+      {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided});
+  }
+
+private:
+
+  /// Returns the coordinate in the activations tensor X that corresponds to
+  /// output position (n, p, q) and filter position (r, s).
+  CUTLASS_HOST_DEVICE
+  TensorCoord at_(int n, int p, int q, int r, int s) const {
+
+    // In convolution mode the filter indices are mirrored (R - 1 - r, S - 1 - s).
+    bool const flip = (problem_size_.mode == Mode::kConvolution);
+    int const r_eff = flip ? (problem_size_.R - 1 - r) : r;
+    int const s_eff = flip ? (problem_size_.S - 1 - s) : s;
+
+    int const h = p * problem_size_.stride_h - problem_size_.pad_h + r_eff * problem_size_.dilation_h;
+    int const w = q * problem_size_.stride_w - problem_size_.pad_w + s_eff * problem_size_.dilation_w;
+
+    return TensorCoord(n, h, w, filter_c_);
+  }
+  
+  /// Advances every per-strided-iteration pointer by `byte_offset` bytes
+  CUTLASS_HOST_DEVICE
+  void add_byte_offset_(LongIndex byte_offset) {
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int strided = 0; strided < ThreadMap::Iterations::kStrided; ++strided) {
+      pointer_[strided] += byte_offset;
+    }
+  }
+
+public:
+
+  /// Overrides the internal iteration index by splitting it into vector, contiguous and strided counters
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    int const access = index / kAccessesPerVector;
+    iteration_vector_ = index % kAccessesPerVector;
+    // the vector counter varies fastest, then contiguous, then strided
+    iteration_contiguous_ = access % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = access / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset given in units of Element (converted to bytes via sizeof_bits<Element>)
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    add_byte_offset_(pointer_offset * sizeof_bits<Element>::value / 8);
+  }
+
+  /// Moves to the next tile: steps filter_s_, then filter_r_, then the channel offset filter_c_
+  CUTLASS_HOST_DEVICE
+  void advance() {
+
+    // Index into params_.inc_next: 0 = only s stepped, 1 = r stepped, 2 = both wrapped
+    int increment_kind = 0;
+
+    if (++filter_s_ == problem_size_.S) {
+      // s wrapped around: step r
+      filter_s_ = 0;
+      increment_kind = 1;
+
+      if (++filter_r_ >= problem_size_.R) {
+        // r wrapped around as well: the channel offset is stepped below
+        filter_r_ = 0;
+        increment_kind = 2;
+      }
+    }
+
+    add_byte_offset_(params_.inc_next[increment_kind]);
+
+    if (increment_kind == 2) {
+      filter_c_ += params_.filter_c_delta;
+    }
+
+    // Clear the masks of vector accesses whose channel index is now >= C
+    CUTLASS_PRAGMA_UNROLL
+    for (int vec = 0; vec < kAccessesPerVector; ++vec) {
+      clear_mask(vec, filter_c_ + vec * AccessType::kElements >= problem_size_.C);
+    }
+  }
+   
+  /// Zeroes both mask words of every (strided, vector) entry when `clear` is true; otherwise a no-op
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool clear = true) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int strided = 0; strided < ThreadMap::Iterations::kStrided; ++strided) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int vec = 0; vec < kAccessesPerVector; ++vec) {
+        masks_[strided][vec][0] = clear ? 0 : masks_[strided][vec][0];
+        masks_[strided][vec][1] = clear ? 0 : masks_[strided][vec][1];
+      }
+    }
+  }
+   
+  /// Zeroes both mask words of vector access `v` in every strided iteration when `clear` is true
+  CUTLASS_HOST_DEVICE
+  void clear_mask(int v, bool clear = true) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int strided = 0; strided < ThreadMap::Iterations::kStrided; ++strided) {
+      masks_[strided][v][0] = clear ? 0 : masks_[strided][v][0];
+      masks_[strided][v][1] = clear ? 0 : masks_[strided][v][1];
+    }
+  }
+
+  /// Returns true when the current access has its mask bit set for both filter_r_ and filter_s_
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    bool const r_ok = (masks_[iteration_strided_][iteration_vector_][0] & (Index(1) << filter_r_)) != 0;
+    bool const s_ok = (masks_[iteration_strided_][iteration_vector_][1] & (Index(1) << filter_s_)) != 0;
+    return r_ok && s_ok;
+  }
+
+  /// Returns a pointer to the vector selected by the current strided and vector iteration counters
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+    AccessType const *base = reinterpret_cast<AccessType const *>(pointer_[iteration_strided_]);
+    return base + iteration_vector_;
+  }
+
+  /// Increments to the next memory access.
+  ///
+  /// The vector counter advances fastest; when it wraps it carries into the
+  /// contiguous counter, which in turn carries into the strided counter.
+  CUTLASS_HOST_DEVICE
+  Conv2dFpropActivationTileAccessIteratorOptimized &operator++() {
+
+    if (++iteration_vector_ >= kAccessesPerVector) {
+      // vector counter wrapped: carry into the contiguous counter
+      iteration_vector_ = 0;
+
+      if (++iteration_contiguous_ >= ThreadMap::Iterations::kContiguous) {
+        // contiguous counter wrapped: carry into the strided counter
+        iteration_contiguous_ = 0;
+
+        if (++iteration_strided_ >= ThreadMap::Iterations::kStrided) {
+          // every counter wrapped: start again from the first access
+          iteration_strided_ = 0;
+        }
+      }
+    }
+
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv2dProblemSize const &problem_size) {
+
+    // Alignment constraint: channels per group must be a multiple of the access vector width.
+    auto const channels_per_group = problem_size.C / problem_size.groups;
+    if (channels_per_group % AccessType::kElements) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    // With the interleaved layouts, C must be a multiple of the interleave factor.
+    bool const is_interleaved_32 = platform::is_same<Layout, layout::TensorNCxHWx<32>>::value;
+    bool const is_interleaved_64 = platform::is_same<Layout, layout::TensorNCxHWx<64>>::value;
+
+    if (is_interleaved_32 && (problem_size.C % 32)) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    if (is_interleaved_64 && (problem_size.C % 64)) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    // Filter positions are limited by the number of mask bits, so R and S may not exceed 32.
+    if (problem_size.R > 32 || problem_size.S > 32) {
+      return Status::kErrorNotSupported;
+    }
+    return Status::kSuccess;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h
new file mode 100755
index 000000000..1725db5af
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h
@@ -0,0 +1,330 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) 
+    matrix from memory.
+
+    This iterator assumes TensorNHWC or TensorCxRSKx<Interleave> layout of tensors in Global Memory.
+
+    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
+    backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+#include "cutlass/conv/threadblock/conv2d_params.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Shape_,
+  typename Element_,
+  typename Layout_,
+  typename ThreadMap_,
+  typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess>,
+  conv::GroupMode GroupMode_ = conv::GroupMode::kNone,
+  bool IsDeconv_ = false
+>
+class Conv2dFpropFilterTileAccessIteratorAnalytic {
+public:
+
+  //
+  // Types
+  //
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = Layout_;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static bool const IsDeconv = IsDeconv_;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
+  static int const kConvDim = 2;
+  using ConvProblemSize = typename conv::Conv2dProblemSize;
+  static conv::GroupMode const kGroupMode = GroupMode_;
+
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), 
+    "Vectors implied by the thread map must be divisible by the access type.");
+
+  //
+  // Simplifying assertions
+  //
+  static_assert(ThreadMap::Iterations::kContiguous == 1,
+    "Require Iterations::kContiguous == 1");
+
+  //
+  // Parameters structure
+  //
+
+  using Params = Conv2dAnalyticParams<Layout>;
+
+private:
+
+  Params const &params_;
+  Conv2dProblemSize const &problem_size_;
+  LongIndex iteration_contiguous_;
+  LongIndex iteration_strided_;
+  LongIndex iteration_vector_;
+  char const *pointer_;
+
+  int filter_r_;
+  int filter_s_;
+  int filter_c_;
+  int filter_c_init_;
+  int crs_cnt_;
+  int crs_per_group_;  
+  int group_idx_offset_c_;
+  int channels_per_group_;
+
+  int offset_k_[ThreadMap::Iterations::kStrided];
+  int group_idx_offset_k_[ThreadMap::Iterations::kStrided];
+
+public:
+  /// Builds the iterator for thread `thread_idx`; `threadblock_offset` gives the tile's (row, column) origin
+  CUTLASS_HOST_DEVICE
+  Conv2dFpropFilterTileAccessIteratorAnalytic(
+    Params const &params, 
+    Conv2dProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    MatrixCoord const &threadblock_offset = MatrixCoord()
+  ):
+    params_(params), 
+    problem_size_(problem_size), 
+    pointer_(reinterpret_cast<char const *>(ptr)), 
+    filter_r_(0),
+    filter_s_(0),
+    filter_c_(0),
+    crs_cnt_(0),
+    group_idx_offset_c_(0) {
+
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+
+    filter_c_ = threadblock_offset.row() + thread_coord.contiguous();
+
+    auto input_channels = (IsDeconv ? problem_size_.K : problem_size_.C);
+    auto output_channels = (IsDeconv ? problem_size_.C : problem_size_.K);
+    // Grouped modes: remember the starting channel and how many advance() steps make up one group
+    if (kGroupMode != conv::GroupMode::kNone) {
+      filter_c_init_ = filter_c_;
+      if (kGroupMode == conv::GroupMode::kDepthwise){
+        channels_per_group_ = 1;
+        crs_per_group_ = problem_size_.S * problem_size_.R;
+      } else {
+        channels_per_group_ = input_channels / problem_size_.groups;
+        crs_per_group_ = problem_size_.S * problem_size_.R * ((channels_per_group_ + Shape::kRow - 1) / Shape::kRow);
+      }
+    }
+    // Per strided access: its k coordinate and, for non-depthwise grouped modes, the group index checked in valid()
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      offset_k_[s] = threadblock_offset.column() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
+      if (kGroupMode != conv::GroupMode::kNone && kGroupMode != conv::GroupMode::kDepthwise) {
+        group_idx_offset_k_[s] = (thread_coord.strided() + s * ThreadMap::Delta::kStrided) / (output_channels / problem_size_.groups);
+      }
+    }
+
+    set_iteration_index(0);
+  }
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iteration_vector_ = index % kAccessesPerVector;
+    int residual_access = index / kAccessesPerVector;
+    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element (pointer_ is a byte pointer, so convert elements to bytes)
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+
+  CUTLASS_HOST_DEVICE
+  void advance() {
+    // moves to the next tile: steps s, then r, then the channel offset (or the group, in grouped modes)
+    if (kGroupMode != conv::GroupMode::kNone) {
+      ++crs_cnt_;
+    }
+
+    ++filter_s_;
+    if (filter_s_ < problem_size_.S) {
+      return;
+    }
+    filter_s_ = 0;
+
+    ++filter_r_;
+    if (filter_r_ < problem_size_.R) {
+      return;
+    }
+    filter_r_ = 0;
+
+    if (kGroupMode == conv::GroupMode::kNone) {
+      filter_c_ += Shape::kRow * problem_size_.split_k_slices;
+    } else {
+      if (crs_cnt_ == crs_per_group_) {
+        crs_cnt_ = 0;
+        filter_c_ = filter_c_init_;
+        if (kGroupMode != conv::GroupMode::kDepthwise) {
+          // moves to next group
+          ++group_idx_offset_c_;
+        }
+      } else {
+        filter_c_ += Shape::kRow * problem_size_.split_k_slices;
+      }
+    }
+  }
+
+  /// Returns the coordinate in the filter tensor W that is currently pointed to
+  /// by the iterator.
+  CUTLASS_HOST_DEVICE
+  TensorCoord at() const {
+
+    int k = offset_k_[iteration_strided_];
+    int c = filter_c_ + iteration_vector_ * AccessType::kElements;
+
+    return TensorCoord(k, filter_r_, filter_s_, c);
+  }
+
+  /// Returns true if the current coordinate is within the filter tensor W
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+
+    TensorCoord coord = at();
+
+    auto input_channels = (IsDeconv ? problem_size_.K : problem_size_.C);
+    auto output_channels = (IsDeconv ? problem_size_.C : problem_size_.K);
+
+    if (kGroupMode == conv::GroupMode::kNone) {
+      return coord.n() < output_channels && coord.c() < input_channels;
+    } else if (kGroupMode == conv::GroupMode::kDepthwise) {
+      return coord.n() < output_channels && coord.c() < 1; // channels_per_group_ is always equal to ONE.
+    } else {
+      return coord.n() < output_channels && coord.c() < channels_per_group_ &&
+             group_idx_offset_c_ == group_idx_offset_k_[iteration_strided_];
+    }
+  }
+
+  /// Returns a pointer to the vector starting at the current coordinate
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+
+    TensorCoord coord = at();
+    LongIndex offset = params_.layout(coord);
+
+    return reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
+  }
+
+  /// Increments to the next memory access
+  CUTLASS_HOST_DEVICE
+  Conv2dFpropFilterTileAccessIteratorAnalytic &operator++() {
+    ++iteration_vector_;
+    if (iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+    iteration_vector_ = 0;
+
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;
+
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    iteration_strided_ = 0;
+
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv2dProblemSize const &problem_size) {
+
+    auto input_channels = (IsDeconv ? problem_size.K : problem_size.C);
+    auto output_channels = (IsDeconv ? problem_size.C : problem_size.K);
+
+    // check alignment constraint on iterator's contiguous dimension
+    if ((input_channels / problem_size.groups) % AccessType::kElements) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    if (platform::is_same<Layout, layout::TensorCxRSKx<32>>::value) {
+      if (output_channels % 32) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+
+    if (platform::is_same<Layout, layout::TensorCxRSKx<64>>::value) {
+      if (output_channels % 64) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+
+    return Status::kSuccess;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_few_channels.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_few_channels.h
new file mode 100755
index 000000000..a1291aa01
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_few_channels.h
@@ -0,0 +1,289 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile)
+    matrix from memory.
+
+    This iterator assumes TensorNHWC or TensorCxRSKx<Interleave> layout of tensors in Global Memory.
+
+    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
+    backward data gradient (Dgrad), and backward weight gradient (Wgrad).
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+#include "cutlass/conv/threadblock/conv2d_params.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Shape_,
+  typename Element_,
+  typename Layout_,
+  typename ThreadMap_,
+  typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess>
+>
+class Conv2dFpropFilterTileAccessIteratorFewChannels {
+public:
+
+  //
+  // Types
+  //
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = Layout_;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kFewChannels;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
+  static int const kConvDim = 2;
+  using ConvProblemSize = typename conv::Conv2dProblemSize;
+
+  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
+  static int const kPositionsPerTile = Shape::kRow;
+
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+
+  static bool const kUseFastDivmodPrologue = true;
+  static bool const kUseFastDivmodMainloop = true;
+
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
+    "Vectors implied by the thread map must be divisible by the access type.");
+
+  //
+  // Simplifying assertions
+  //
+  static_assert(ThreadMap::Iterations::kContiguous == 1,
+    "Require Iterations::kContiguous == 1");
+
+  //
+  // Parameters structure
+  //
+
+  using Params = Conv2dFewChannelsParams<Layout>;
+
+private:
+
+  Params const &params_;
+  Conv2dProblemSize const &problem_size_;
+  LongIndex iteration_contiguous_;
+  LongIndex iteration_strided_;
+  LongIndex iteration_vector_;
+  char const *pointer_;
+
+  int rsc_index_;
+
+  int offset_k_[ThreadMap::Iterations::kStrided];
+
+public:
+  /// Builds the iterator for thread `thread_idx`; `threadblock_offset` gives the tile's (row, column) origin
+  CUTLASS_HOST_DEVICE
+  Conv2dFpropFilterTileAccessIteratorFewChannels(
+    Params const &params,
+    Conv2dProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    MatrixCoord const &threadblock_offset = MatrixCoord()
+  ):
+    params_(params),
+    problem_size_(problem_size),
+    pointer_(reinterpret_cast<char const *>(ptr)),
+    rsc_index_(0) {
+
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+
+    rsc_index_ = (threadblock_offset.row() + thread_coord.contiguous());
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      offset_k_[s] = threadblock_offset.column() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
+    }
+
+    set_iteration_index(0);
+  }
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iteration_vector_ = index % kAccessesPerVector;
+    int residual_access = index / kAccessesPerVector;
+    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element (pointer_ is a byte pointer, so convert elements to bytes)
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+
+  CUTLASS_HOST_DEVICE
+  void advance() {
+    // moves to the next tile along the flattened (r, s, c) index
+    rsc_index_ += kPositionsPerTile * problem_size_.split_k_slices;
+  }
+
+  /// Returns the coordinate in the filter tensor W that is currently pointed to
+  /// by the iterator.
+  CUTLASS_HOST_DEVICE
+  TensorCoord at() const {
+
+    int rsc_index = rsc_index_ + iteration_vector_ * AccessType::kElements;
+
+    int c = 0;
+    int s = 0;
+    int r = 0;
+    // Decompose the flattened index as rsc_index = (r * S + s) * C + c
+    if (kUseFastDivmodMainloop) {
+      int rs_index = params_.divmod_C.divmod(c, rsc_index);
+      r = params_.divmod_S.divmod(s, rs_index);
+    }
+    else {
+      c = (rsc_index % problem_size_.C);
+      int rs_index = (rsc_index / problem_size_.C);
+
+      s = (rs_index % problem_size_.S);
+      r = (rs_index / problem_size_.S);
+    }
+
+    int k = offset_k_[iteration_strided_];
+
+    return TensorCoord(k, r, s, c);
+  }
+
+  /// Returns true if the current coordinate is within the filter tensor W
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+
+    TensorCoord coord = at();
+
+    bool in_bounds =
+      coord.n() < problem_size_.K &&
+      coord.h() >= 0 &&
+      coord.h() < problem_size_.R &&
+      coord.c() < problem_size_.C;
+
+    return in_bounds;
+  }
+
+  /// Returns a pointer to the vector starting at the current coordinate
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+
+    TensorCoord coord = at();
+
+    int32_t offset =
+      coord.n() * params_.stride_n +
+      coord.h() * params_.stride_h +
+      coord.w() * params_.stride_w +
+      coord.c();
+
+    return reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
+  }
+
+  /// Increments to the next memory access
+  CUTLASS_HOST_DEVICE
+  Conv2dFpropFilterTileAccessIteratorFewChannels &operator++() {
+    ++iteration_vector_;
+    if (iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+    iteration_vector_ = 0;
+
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;
+
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    iteration_strided_ = 0;
+
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv2dProblemSize const &problem_size) {
+
+    // check alignment constraint on iterator's contiguous dimension
+    if (problem_size.C % AccessType::kElements) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    if (platform::is_same<Layout, layout::TensorCxRSKx<32>>::value) {
+      if (problem_size.K % 32) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+
+    if (platform::is_same<Layout, layout::TensorCxRSKx<64>>::value) {
+      if (problem_size.K % 64) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+
+    return Status::kSuccess;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_fixed_channels.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_fixed_channels.h
new file mode 100755
index 000000000..e90d50174
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_fixed_channels.h
@@ -0,0 +1,275 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile)
+    matrix from memory.
+
+    This iterator assumes TensorNHWC or TensorCxRSKx<Interleave> layout of tensors in Global Memory.
+
+    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
+    backward data gradient (Dgrad), and backward weight gradient (Wgrad).
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+#include "cutlass/conv/threadblock/conv2d_params.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Shape_,
+  typename Element_,
+  typename Layout_,
+  typename ThreadMap_,
+  typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess>
+>
+class Conv2dFpropFilterTileAccessIteratorFixedChannels {
+public:
+
+  //
+  // Types
+  //
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = Layout_;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kFixedChannels;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
+  static int const kConvDim = 2;
+  using ConvProblemSize = typename conv::Conv2dProblemSize;
+
+  static int const kFilterPositionsPerTile = Shape::kRow / AccessType::kElements;
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+
+  static bool const kUseFastDivmodPrologue = true;
+  static bool const kUseFastDivmodMainloop = true;
+
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
+    "Vectors implied by the thread map must be divisible by the access type.");
+
+  //
+  // Simplifying assertions
+  //
+  static_assert(ThreadMap::Iterations::kContiguous == 1,
+    "Require Iterations::kContiguous == 1");
+
+  //
+  // Parameters structure
+  //
+
+  using Params = Conv2dFewChannelsParams<Layout>;
+
+private:
+
+  Params const &params_;
+  Conv2dProblemSize const &problem_size_;
+  LongIndex iteration_contiguous_;
+  LongIndex iteration_strided_;
+  LongIndex iteration_vector_;
+  char const *pointer_;
+
+  int rs_index_;
+
+  int offset_k_[ThreadMap::Iterations::kStrided];
+
+public:
+
+  CUTLASS_HOST_DEVICE
+  Conv2dFpropFilterTileAccessIteratorFixedChannels(
+    Params const &params,
+    Conv2dProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    MatrixCoord const &threadblock_offset = MatrixCoord()
+  ):
+    params_(params),
+    problem_size_(problem_size),
+    pointer_(reinterpret_cast<char const *>(ptr)),
+    rs_index_(0) {
+
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+
+    rs_index_ = (threadblock_offset.row() + thread_coord.contiguous()) / AccessType::kElements;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      offset_k_[s] = threadblock_offset.column() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
+    }
+
+    set_iteration_index(0);
+  }
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iteration_vector_ = index % kAccessesPerVector;
+    int residual_access = index / kAccessesPerVector;
+    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element (pointer_ is a byte pointer, so convert elements -> bytes)
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;  // bytes = elements * bits / 8, same conversion as get()
+  }
+
+  CUTLASS_HOST_DEVICE
+  void advance() {
+    // moves to the next tile
+    rs_index_ += kFilterPositionsPerTile * problem_size_.split_k_slices;
+  }
+
+  /// Returns the coordinate in the filter tensor W that is currently pointed to
+  /// by the iterator.
+  CUTLASS_HOST_DEVICE
+  TensorCoord at() const {
+
+    int rs_index = rs_index_ + iteration_vector_;
+
+    int r = 0;
+    int s = 0;
+
+    if (kUseFastDivmodMainloop) {
+      r = params_.divmod_S.divmod(s, rs_index);
+    }
+    else {
+      s = (rs_index % problem_size_.S);
+      r = (rs_index / problem_size_.S);
+    }
+
+    int k = offset_k_[iteration_strided_];
+
+    return TensorCoord(k, r, s, 0);
+  }
+
+  /// Returns true if the current coordinate is within the filter tensor W
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+
+    TensorCoord coord = at();
+    // Only the n() and h() fields are bounds-checked (against K and R); w() and c() are not tested here.
+    return coord.n() < problem_size_.K && coord.h() >= 0 && coord.h() < problem_size_.R;
+  }
+
+  /// Returns a pointer to the vector starting at the current coordinate
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+
+    TensorCoord coord = at();
+
+    int32_t offset =
+      coord.n() * params_.stride_n +
+      coord.h() * params_.stride_h +
+      coord.w() * params_.stride_w + coord.c();
+
+    return reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
+  }
+
+  /// Increments to the next memory access
+  CUTLASS_HOST_DEVICE
+  Conv2dFpropFilterTileAccessIteratorFixedChannels &operator++() {
+    ++iteration_vector_;
+    if (iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+    iteration_vector_ = 0;
+
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;
+
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    iteration_strided_ = 0;
+
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem (kSuccess or kErrorInvalidProblem).
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv2dProblemSize const &problem_size) {
+
+    // fixed-channels constraint: C must equal the access vector width exactly, not merely be a multiple of it
+    if (problem_size.C != AccessType::kElements) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    if (platform::is_same<Layout, layout::TensorCxRSKx<32>>::value) {  // 32-interleaved filter layout
+      if (problem_size.K % 32) {  // K must be a multiple of 32
+        return Status::kErrorInvalidProblem;
+      }
+    }
+
+    if (platform::is_same<Layout, layout::TensorCxRSKx<64>>::value) {  // 64-interleaved filter layout
+      if (problem_size.K % 64) {  // K must be a multiple of 64
+        return Status::kErrorInvalidProblem;
+      }
+    }
+
+    return Status::kSuccess;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h
new file mode 100755
index 000000000..4c2343c32
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h
@@ -0,0 +1,322 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) 
+    matrix from memory.
+
+    This iterator assumes TensorNHWC or TensorCxRSKx<Interleave> layout of tensors in Global Memory.
+    
+    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
+    backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+
+#include "cutlass/conv/threadblock/conv2d_params.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Shape_,
+  typename Element_,
+  typename Layout_,
+  typename ThreadMap_,
+  typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess>,
+  bool IsDeconv_ = false
+>
+class Conv2dFpropFilterTileAccessIteratorOptimized{
+public:
+  
+  //
+  // Types
+  //
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = Layout_;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static bool const IsDeconv = IsDeconv_;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
+  static int const kConvDim = 2;
+  using ConvProblemSize = typename conv::Conv2dProblemSize;
+ 
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+  
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), 
+    "Vectors implied by the thread map must be divisible by the access type.");
+ 
+  //
+  // Simplifying assertions
+  //
+  static_assert(ThreadMap::Iterations::kContiguous == 1,
+    "Require Iterations::kContiguous == 1");
+
+  //
+  // Parameters structure
+  //
+
+  struct Params : Conv2dFpropFilterIteratorOptimizedParams<Layout> {
+
+    CUTLASS_HOST_DEVICE
+    Params() { }
+    
+    CUTLASS_HOST_DEVICE
+    Params(Conv2dFpropFilterIteratorOptimizedParams<Layout> const &base): 
+      Conv2dFpropFilterIteratorOptimizedParams<Layout>(base) { }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      Conv2dProblemSize const &problem_size,
+      Layout const &layout
+    ):
+      Conv2dFpropFilterIteratorOptimizedParams<Layout>(
+        problem_size,
+        layout,
+        sizeof_bits<Element>::value,
+        {Shape::kRow, Shape::kColumn},
+        ThreadMap::kThreads,
+        ThreadMap::kElementsPerAccess,
+        {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided},
+        {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided}
+      ) {
+
+    }
+  };
+
+private:
+
+  Conv2dFpropFilterIteratorOptimizedParams<Layout> const &params_;
+  Conv2dProblemSize const &problem_size_;
+  LongIndex iteration_contiguous_;
+  LongIndex iteration_strided_;
+  LongIndex iteration_vector_;
+  char const *pointer_;
+
+  uint32_t predicates_[kAccessesPerVector];
+  int filter_rs_;
+  int filter_c_;
+  int channels_per_group_;
+
+  //
+  // Assertions
+  //
+
+  // Each predicates_[v] is one uint32_t word holding one bit per strided iteration, so bound by a single word
+  static_assert(ThreadMap::Iterations::kStrided < sizeof(uint32_t) * 8,
+    "Currently, the number of loads per iteration is limited by the size of the predicates container.");
+
+public:
+
+  CUTLASS_HOST_DEVICE
+  Conv2dFpropFilterTileAccessIteratorOptimized(
+    Conv2dFpropFilterIteratorOptimizedParams<Layout> const &params,
+    Conv2dProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    MatrixCoord const &threadblock_offset = MatrixCoord()
+  ):
+    params_(params), 
+    problem_size_(problem_size),
+    pointer_(reinterpret_cast<char const *>(ptr)),
+    predicates_{0},
+    filter_rs_(0),
+    filter_c_(0) {
+
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+
+    filter_c_ = threadblock_offset.row() + thread_coord.contiguous();
+    Index column = threadblock_offset.column() + thread_coord.strided();
+    channels_per_group_ = (IsDeconv ? problem_size_.K : problem_size_.C) / problem_size_.groups;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      uint32_t pred = ((column + s * ThreadMap::Delta::kStrided < (IsDeconv ? problem_size_.C : problem_size_.K)) ? 1u : 0);
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
+        predicates_[v_idx] |= (pred << s);
+      }
+    }
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
+      clear_mask(v_idx, filter_c_ + v_idx * AccessType::kElements >= channels_per_group_);
+    }
+
+    pointer_ += (
+      params_.layout({filter_c_, column}) 
+    ) * sizeof_bits<Element>::value / 8;
+
+    set_iteration_index(0);
+  }
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iteration_vector_ = index % kAccessesPerVector;
+    int residual_access = index / kAccessesPerVector;
+    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+
+  CUTLASS_HOST_DEVICE
+  void advance() {
+
+    LongIndex next = params_.inc_next_rs;
+
+    // moves to the next tile
+    ++filter_rs_;
+    if (filter_rs_ == params_.RS) {
+
+      filter_rs_ = 0;
+      next = params_.inc_next_c;
+      filter_c_ += params_.filter_c_delta;
+    }
+ 
+    CUTLASS_PRAGMA_UNROLL
+    for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
+      clear_mask(v_idx, filter_c_ + v_idx * AccessType::kElements >= channels_per_group_);
+    }
+      
+    pointer_ += next;
+  }
+
+  /// Clears the predicates
+  CUTLASS_HOST_DEVICE
+  void clear_mask(int v, bool clear = true) {
+    predicates_[v] = clear ? 0u : predicates_[v];
+  }
+
+  /// Returns true if the current coordinate is within the filter tensor W (reads the packed predicate bit)
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+    return (predicates_[iteration_vector_] & (1u << iteration_strided_)) != 0u;  // bit s of word v: strided access s, vector v
+  }
+
+  /// Returns a pointer to the vector starting at the current coordinate
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+    return reinterpret_cast<AccessType const *>(pointer_) + iteration_vector_;
+  }
+
+  /// Increments to the next memory access
+  CUTLASS_HOST_DEVICE
+  Conv2dFpropFilterTileAccessIteratorOptimized &operator++() {
+    ++iteration_vector_;
+    if (iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+    iteration_vector_ = 0;
+
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;
+    
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+
+      // Move to the next K coordinate within the tile
+      pointer_ += params_.inc_next_k;
+
+      return *this;
+    }
+    iteration_strided_ = 0;
+ 
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv2dProblemSize const &problem_size) {
+
+    auto input_channels = (IsDeconv ? problem_size.K : problem_size.C);
+    auto output_channels = (IsDeconv ? problem_size.C : problem_size.K);
+
+    // check alignment constraint on iterator's contiguous dimension
+    if ((input_channels / problem_size.groups) % AccessType::kElements) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    if (platform::is_same<Layout, layout::TensorCxRSKx<32>>::value) {
+      if (output_channels % 32) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+
+    if (platform::is_same<Layout, layout::TensorCxRSKx<64>>::value) {
+      if (output_channels % 64) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+
+    return Status::kSuccess;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_params.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_params.h
new file mode 100755
index 000000000..d34bc9faf
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_params.h
@@ -0,0 +1,893 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! 
+  \file 
+  \brief Extracts the host-params objects into non-template code.
+*/
+
+#pragma once
+
+#define TRACE_CONV_PARAMS_INITIALIZERS_ENABLED 0
+
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+
+#if TRACE_CONV_PARAMS_INITIALIZERS_ENABLED
+#include <fstream>
+#endif
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Params structure used for all Conv2d analytic tile iterators
+template< typename Layout_ = layout::TensorNHWC >
+struct Conv2dAnalyticParams {
+
+  using Layout = Layout_;
+
+  Layout layout;
+
+  //
+  // Methods
+  //
+
+  CUTLASS_HOST_DEVICE
+  Conv2dAnalyticParams() { }
+
+  CUTLASS_HOST_DEVICE
+  Conv2dAnalyticParams(
+    Conv2dProblemSize const &,  // unused; placeholder to match other Params interfaces.
+    Layout const &layout
+  ): layout(layout) {
+
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Params structure for the Conv2d few-channels tile iterators (also used by the fixed-channels filter iterator)
+template< typename Layout_ = layout::TensorNHWC >
+struct Conv2dFewChannelsParams {
+
+  using Layout = Layout_;
+
+  // Tensor strides copied from the layout object and narrowed to 32 bits
+  int32_t stride_w;  // layout.stride()[0]
+  int32_t stride_h;  // layout.stride()[1]
+  int32_t stride_n;  // layout.stride()[2]
+  // Precomputed fast divide/modulo helpers, one per problem extent
+  FastDivmod divmod_P;
+  FastDivmod divmod_Q;
+  FastDivmod divmod_S;
+  FastDivmod divmod_C;
+
+  //
+  // Methods
+  //
+
+  CUTLASS_HOST_DEVICE
+  Conv2dFewChannelsParams() { }
+
+  CUTLASS_HOST_DEVICE
+  Conv2dFewChannelsParams(
+    Conv2dProblemSize const &problem_size,  // supplies the P, Q, S and C divisors for the FastDivmod members
+    Layout const &layout
+  ):
+    stride_w(int32_t(layout.stride()[0])),
+    stride_h(int32_t(layout.stride()[1])),
+    stride_n(int32_t(layout.stride()[2])),
+    divmod_P(problem_size.P),
+    divmod_Q(problem_size.Q),
+    divmod_S(problem_size.S),
+    divmod_C(problem_size.C)
+  {
+
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Parameters structure used for Conv2dDgradOutputGradientTileAccessIteratorAnalyticParams
+struct Conv2dDgradOutputGradientTileAccessIteratorAnalyticParams {
+  
+  using Layout = layout::TensorNHWC;
+
+  Layout layout;
+  int tiled_rows_per_filter;
+
+  //
+  // Methods
+  //
+
+  CUTLASS_HOST_DEVICE
+  Conv2dDgradOutputGradientTileAccessIteratorAnalyticParams() { }
+
+  CUTLASS_HOST_DEVICE
+  Conv2dDgradOutputGradientTileAccessIteratorAnalyticParams(
+    Conv2dProblemSize const &problem_size,
+    Layout const &layout,                            ///< layout object
+    int element_size_bits,                           ///< size of each element in bits
+    MatrixCoord threadblock_shape
+  ): layout(layout) {
+    
+    int tile_m_per_filter = strided_dgrad_tile_m_per_filter(problem_size, threadblock_shape.row());
+  
+    tiled_rows_per_filter = tile_m_per_filter * threadblock_shape.row();
+    
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if TRACE_CONV_PARAMS_INITIALIZERS_ENABLED
+
+CUTLASS_HOST_DEVICE
+void TraceIteratorParams(
+  char const *conv_operator, 
+  char const *operand,
+  int element_size_bits,
+  MatrixCoord threadblock_shape,
+  int thread_count,
+  int access_size,
+  layout::PitchLinearCoord threadmap_iterations,
+  layout::PitchLinearCoord threadmap_delta
+) {
+ 
+#if !defined(__CUDA_ARCH__)
+
+  char const *fname = "conv_iterator_params.csv";
+
+  std::ifstream test(fname);
+  bool file_exists = test.is_open();
+
+  if (file_exists) {
+    test.close();
+  }
+ 
+  std::ofstream trace("conv_iterator_params.csv", std::ofstream::app);
+
+  if (!file_exists) {
+    trace 
+      << "Operator,Operand,ElementSize,CtaRows,CtaColumns,ThreadCount,AccessSize,"
+      << "IterationsContiguous,IterationsStrided,DeltaContiguous,DeltaStrided\n";
+  }
+
+  trace << conv_operator << "," << operand << "," << element_size_bits << "," 
+    << threadblock_shape.row() << "," << threadblock_shape.column()
+    << "," << thread_count << "," << access_size 
+    << "," << threadmap_iterations.contiguous() << "," << threadmap_iterations.strided()
+    << "," << threadmap_delta.contiguous() << "," << threadmap_delta.strided() << "\n";
+#endif
+}
+
+#define TRACE_CONV_INITIALIZERS(conv_op, operand, element_size, cta_shape, thread_count, access_size, iterations, delta) \
+  TraceIteratorParams(conv_op, operand, element_size, cta_shape, thread_count, access_size, iterations, delta);
+
+#else
+
+#define TRACE_CONV_INITIALIZERS(conv_op, operand, element_size, cta_shape, thread_count, access_size, iterations, delta) {}
+
+#endif
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Parameters structure used for Conv2dFpropActivationTileIteratorOptimized
+template< typename Layout_ = layout::TensorNHWC >
+struct Conv2dFpropActivationIteratorOptimizedParams;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Parameters structure used for Conv2dFpropActivationTileIteratorOptimized
+template<>
+struct Conv2dFpropActivationIteratorOptimizedParams<layout::TensorNHWC> {
+  
+  using Layout = layout::TensorNHWC;
+
+  Layout layout;
+
+  int64_t inc_next[3];    // {next S, next R, next C}
+  int filter_c_delta;     // number of logical elements to add to filter_c_
+  int PQ;                 // product of P*Q
+
+  FastDivmod pq_divmod;
+  FastDivmod q_divmod;
+
+  //
+  // Methods
+  //
+
+  CUTLASS_HOST_DEVICE
+  Conv2dFpropActivationIteratorOptimizedParams() { }
+
+  CUTLASS_HOST_DEVICE
+  Conv2dFpropActivationIteratorOptimizedParams(
+    Conv2dProblemSize const &problem_size,
+    Layout const &layout,                             ///< layout object
+    int element_size_bits,                            ///< size of each element in bits
+    MatrixCoord threadblock_shape,
+    int thread_count,
+    int access_size,
+    layout::PitchLinearCoord threadmap_iterations,
+    layout::PitchLinearCoord threadmap_delta
+  ): 
+    layout(layout), 
+    PQ(problem_size.P * problem_size.Q), 
+    pq_divmod(PQ), 
+    q_divmod(problem_size.Q) {
+
+    TRACE_CONV_INITIALIZERS("conv2d_fprop", "activation", 
+      element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);
+
+    int conv_sign = (problem_size.mode == Mode::kConvolution ? -1 : 1);
+
+    // next S
+    inc_next[0] = conv_sign * (
+      int64_t(layout.stride()[0]) * problem_size.dilation_w
+    ) * element_size_bits / 8;
+
+    // next R
+    inc_next[1] = conv_sign * (
+        int64_t(layout.stride()[1]) * problem_size.dilation_h
+        - (problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w
+      ) * element_size_bits / 8;
+
+    // next C
+    inc_next[2] = (
+        threadblock_shape.column() * problem_size.split_k_slices
+        - conv_sign * int64_t(problem_size.R - 1) * layout.stride()[1] * problem_size.dilation_h
+        - conv_sign * int64_t(problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w
+      ) * element_size_bits / 8;
+
+    // logical offset added to internal channel counter - units are elements, not bytes
+    filter_c_delta = threadblock_shape.column() * problem_size.split_k_slices;
+  }
+
+#if ENABLE_CONV2D_PARAMS_PRINT
+  /// Prints internal state (compiled only when ENABLE_CONV2D_PARAMS_PRINT is non-zero).
+  CUTLASS_HOST_DEVICE
+  void print() {
+    auto stride = layout.stride();
+    printf(
+      "Conv2dFpropActivationIteratorOptimizedParams:\n"
+      "  layout(w: %d, h: %d, n: %d)\n"
+      "  inc_next[%lld, %lld, %lld]\n"  // inc_next is int64_t, which is not 'long' on every ABI: print as long long
+      "  filter_c_delta(%d) - PQ(%d)\n"
+      "  pq_divmod(divisor: %d, multiplier: %u, shift_right: %u)\n"
+      "  q_divmod(divisor: %d, multiplier: %u, shift_right: %u)\n",
+      stride[0], stride[1], stride[2],
+      static_cast<long long>(inc_next[0]), static_cast<long long>(inc_next[1]), static_cast<long long>(inc_next[2]),
+      filter_c_delta,
+      PQ,
+      pq_divmod.divisor,
+      pq_divmod.multiplier,
+      pq_divmod.shift_right,
+      q_divmod.divisor,
+      q_divmod.multiplier,
+      q_divmod.shift_right
+    );
+  }
+#endif
+};
+
+/// Precomputed pointer increments and divmod helpers for Conv2dFpropActivationTileIteratorOptimized (TensorNCxHWx specialization)
+template <int Interleaved_>
+struct Conv2dFpropActivationIteratorOptimizedParams<layout::TensorNCxHWx<Interleaved_>> {
+  static int const kInterleaved = Interleaved_;
+ 
+  using Layout = layout::TensorNCxHWx<kInterleaved>;
+
+  Layout layout;
+
+  int64_t inc_next[3];    // {next S, next R, next C} pointer increments, in bytes
+  int filter_c_delta;     // number of logical elements to add to filter_c_
+  int PQ;                 // product of P*Q
+
+  FastDivmod pq_divmod;   // built from PQ
+  FastDivmod q_divmod;    // built from problem_size.Q
+
+  //
+  // Methods
+  //
+  /// Default constructor; computes none of the increments.
+  CUTLASS_HOST_DEVICE
+  Conv2dFpropActivationIteratorOptimizedParams() { }
+  /// Computes inc_next[] and filter_c_delta from the problem size, layout and threadblock tile shape.
+  CUTLASS_HOST_DEVICE
+  Conv2dFpropActivationIteratorOptimizedParams(
+    Conv2dProblemSize const &problem_size,
+    Layout const &layout,                             ///< layout object
+    int element_size_bits,                            ///< size of each element in bits
+    MatrixCoord threadblock_shape,                    ///< only column() is read here
+    int thread_count,                                 ///< only passed to TRACE_CONV_INITIALIZERS here
+    int access_size,                                  ///< only passed to TRACE_CONV_INITIALIZERS here
+    layout::PitchLinearCoord threadmap_iterations,    ///< only passed to TRACE_CONV_INITIALIZERS here
+    layout::PitchLinearCoord threadmap_delta          ///< only passed to TRACE_CONV_INITIALIZERS here
+  ): 
+    layout(layout), PQ(problem_size.P * problem_size.Q), pq_divmod(PQ), q_divmod(problem_size.Q) {
+    // PQ is declared before pq_divmod, so pq_divmod(PQ) reads an already-initialized member.
+    TRACE_CONV_INITIALIZERS("conv2d_fprop", "activation", 
+      element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);
+    // Sign applied to the S and R steps: -1 for Mode::kConvolution, +1 for any other mode.
+    int conv_sign = (problem_size.mode == Mode::kConvolution ? -1 : 1);
+
+    // next S: kInterleaved * dilation_w elements, converted from elements to bytes
+    inc_next[0] = conv_sign * (kInterleaved * problem_size.dilation_w) * element_size_bits / 8;
+
+    // next R: step stride()[0] * dilation_h, minus the (S - 1) steps already taken along S
+    inc_next[1] = conv_sign * (
+        int64_t(layout.stride()[0]) * problem_size.dilation_h
+        - (problem_size.S - 1) * kInterleaved * problem_size.dilation_w
+      ) * element_size_bits / 8;
+
+    // next C: step stride()[1] per kInterleaved channels of the tile step, minus the (R - 1) and (S - 1) steps
+    inc_next[2] = (
+        threadblock_shape.column() * problem_size.split_k_slices / kInterleaved * int64_t(layout.stride()[1])
+        - conv_sign * int64_t(problem_size.R - 1) * layout.stride()[0] * problem_size.dilation_h
+        - conv_sign * int64_t(problem_size.S - 1) * kInterleaved * problem_size.dilation_w
+      ) * element_size_bits / 8;
+
+    // logical offset added to internal channel counter - units are elements, not bytes
+    filter_c_delta = threadblock_shape.column() * problem_size.split_k_slices;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template< typename Layout_ = layout::TensorNHWC >
+struct Conv2dFpropFilterIteratorOptimizedParams;  // primary template: declaration only, defined per layout via specializations
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<>
+struct Conv2dFpropFilterIteratorOptimizedParams<layout::TensorNHWC>
+{
+  // Precomputed byte increments for stepping a fprop filter iterator over a TensorNHWC filter tensor.
+  using Layout = layout::TensorNHWC;
+
+  Layout layout;
+  int RS;                     // product R * S
+  int filter_c_delta;         // threadblock_shape.row() * split_k_slices, in elements
+
+  int64_t inc_next_k;         // offset in units of bytes to next K position
+  int64_t inc_next_rs;        // offset in units of bytes to next RS position
+  int64_t inc_next_c;         // offset in units of bytes to next C position
+
+  //
+  // Methods
+  //
+  CUTLASS_HOST_DEVICE
+  Conv2dFpropFilterIteratorOptimizedParams() { }
+  /// Computes RS, the three byte increments and filter_c_delta.
+  CUTLASS_HOST_DEVICE
+  Conv2dFpropFilterIteratorOptimizedParams(
+    Conv2dProblemSize const &problem_size,
+    Layout const &layout,
+    int element_size_bits,                        ///< size of each element in bits
+    MatrixCoord threadblock_shape,                ///< only row() is read here
+    int thread_count,                             ///< only passed to TRACE_CONV_INITIALIZERS here
+    int access_size,                              ///< only passed to TRACE_CONV_INITIALIZERS here
+    layout::PitchLinearCoord threadmap_iterations,
+    layout::PitchLinearCoord threadmap_delta
+  ): 
+    layout(layout) {
+    
+    TRACE_CONV_INITIALIZERS("conv2d_fprop", "filter", 
+      element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);
+
+    RS = problem_size.R * problem_size.S;
+    // next K: one strided thread-map step (delta.strided() multiples of stride()[2]), in bytes
+    inc_next_k = (int64_t(layout.stride()[2]) * threadmap_delta.strided() * element_size_bits) / 8;
+    // next RS: one stride()[0] step, minus the (iterations - 1) strided steps taken within the tile
+    inc_next_rs =
+      ( int64_t(layout.stride()[0])
+        - int64_t(layout.stride()[2]) * (threadmap_iterations.strided() - 1) * threadmap_delta.strided()
+      ) * element_size_bits / 8;
+    // next C: tile step along C, minus the (RS - 1) filter steps and the strided steps within the tile
+    inc_next_c =
+      (
+        threadblock_shape.row() * problem_size.split_k_slices
+        - int64_t(RS - 1) * layout.stride()[0]
+        - int64_t(threadmap_iterations.strided() - 1) * threadmap_delta.strided() * layout.stride()[2]
+      ) * element_size_bits / 8;
+    // logical offset added to the channel counter - units are elements, not bytes
+    filter_c_delta = threadblock_shape.row() * problem_size.split_k_slices;
+  }
+
+#if ENABLE_CONV2D_PARAMS_PRINT
+  /// Prints internal state. int64_t values are cast to long long so %lld is correct on every ABI.
+  CUTLASS_HOST_DEVICE
+  void print() {
+    auto stride = layout.stride();
+    printf(
+      "Conv2dFpropFilterIteratorOptimizedParams:\n"
+      "  layout[%d, %d, %d]\n"
+      "  RS(%d), filter_c_delta(%d), inc_next(k: %lld, rs: %lld, c: %lld)\n",
+      stride[0], stride[1], stride[2],
+      RS,
+      filter_c_delta,
+      (long long)inc_next_k, (long long)inc_next_rs, (long long)inc_next_c
+    );
+  }
+#endif
+};
+
+template<int Interleaved_>
+struct Conv2dFpropFilterIteratorOptimizedParams<layout::TensorCxRSKx<Interleaved_>>
+{  // Precomputed byte increments for a fprop filter iterator over an interleaved TensorCxRSKx filter.
+  static int const kInterleaved = Interleaved_;
+  using Layout = layout::TensorCxRSKx<kInterleaved>;
+
+  Layout layout;
+  int RS;                     // product R * S
+  int filter_c_delta;         // threadblock_shape.row() * split_k_slices, in elements
+
+  int64_t inc_next_k;         // offset in units of bytes to next K position
+  int64_t inc_next_rs;        // offset in units of bytes to next RS position
+  int64_t inc_next_c;         // offset in units of bytes to next C position
+
+  //
+  // Methods
+  //
+  CUTLASS_HOST_DEVICE
+  Conv2dFpropFilterIteratorOptimizedParams() { }
+  /// Computes RS, the three byte increments and filter_c_delta.
+  CUTLASS_HOST_DEVICE
+  Conv2dFpropFilterIteratorOptimizedParams(
+    Conv2dProblemSize const &problem_size,
+    Layout const &layout,
+    int element_size_bits,                        ///< size of each element in bits
+    MatrixCoord threadblock_shape,                ///< only row() is read here
+    int thread_count,                             ///< only passed to TRACE_CONV_INITIALIZERS here
+    int access_size,                              ///< only passed to TRACE_CONV_INITIALIZERS here
+    layout::PitchLinearCoord threadmap_iterations,
+    layout::PitchLinearCoord threadmap_delta
+  ): 
+    layout(layout) {
+    
+    TRACE_CONV_INITIALIZERS("conv2d_fprop", "filter", 
+      element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);
+
+    RS = problem_size.R * problem_size.S;
+    // next K: kInterleaved elements per strided thread-map step, in bytes (computed in int arithmetic)
+    inc_next_k = (kInterleaved * threadmap_delta.strided() * element_size_bits) / 8;
+    // next RS: one stride()[0] step, minus the (iterations - 1) strided steps taken within the tile
+    inc_next_rs =
+      (  int64_t(layout.stride()[0])
+        - kInterleaved * (threadmap_iterations.strided() - 1) * threadmap_delta.strided()
+      ) * element_size_bits / 8;
+    // next C: stride()[2] per kInterleaved channels of the tile step, minus the RS and strided steps
+    inc_next_c =
+      (
+        threadblock_shape.row() * problem_size.split_k_slices / kInterleaved * int64_t(layout.stride()[2])
+        - int64_t(RS - 1) * layout.stride()[0]
+        - int64_t(threadmap_iterations.strided() - 1) * threadmap_delta.strided() * kInterleaved 
+      ) * element_size_bits / 8;
+    // logical offset added to the channel counter - units are elements, not bytes
+    filter_c_delta = threadblock_shape.row() * problem_size.split_k_slices;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Dgrad Optimized Dy params (layout::TensorNHWC)
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Parameters object for Conv2d DGRAD OutputGradient (dy) iterator: precomputed byte increments and divmod helpers
+struct Conv2dDgradOutputGradientIteratorOptimizedParams {
+
+  using Layout = layout::TensorNHWC;
+
+  Layout layout;
+
+  int64_t inc_next[3];    // {next S, next R, next K} pointer increments, in bytes
+
+  int filter_k_delta;     // number of logical elements to add to filter_k_
+
+  int HW;                  // product of H*W
+
+  FastDivmod hw_divmod;    // built from HW
+  FastDivmod w_divmod;     // built from problem_size.W
+
+  //
+  // Methods
+  //
+  /// Default constructor; computes none of the increments.
+  CUTLASS_HOST_DEVICE
+  Conv2dDgradOutputGradientIteratorOptimizedParams() { }
+  /// Computes inc_next[] and filter_k_delta from the problem size, layout and threadblock tile shape.
+  CUTLASS_HOST_DEVICE
+  Conv2dDgradOutputGradientIteratorOptimizedParams(
+    Conv2dProblemSize const &problem_size,
+    Layout const &layout,
+    int element_size_bits,                        ///< size of each element in bits
+    MatrixCoord threadblock_shape,                ///< only column() is read here
+    int thread_count,                             ///< only passed to TRACE_CONV_INITIALIZERS here
+    int access_size,                              ///< only passed to TRACE_CONV_INITIALIZERS here
+    layout::PitchLinearCoord threadmap_iterations, ///< only passed to TRACE_CONV_INITIALIZERS here
+    layout::PitchLinearCoord threadmap_delta      ///< only passed to TRACE_CONV_INITIALIZERS here
+  ): 
+    layout(layout), 
+    HW(problem_size.H *problem_size.W), 
+    hw_divmod(HW), 
+    w_divmod(problem_size.W) {
+    // HW is declared before hw_divmod, so hw_divmod(HW) reads an already-initialized member.
+    TRACE_CONV_INITIALIZERS("conv2d_dgrad", "output_gradient", 
+      element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);
+    // Sign applied to the S and R steps: +1 for Mode::kConvolution, -1 for any other mode.
+    int conv_sign = (problem_size.mode == Mode::kConvolution ? 1 : -1);
+
+    // next S: stride()[0] * dilation_w elements, converted to bytes
+    inc_next[0] = conv_sign * (
+      (int64_t)layout.stride()[0] * problem_size.dilation_w
+    ) * element_size_bits / 8;
+
+    // next R: step stride()[1] * dilation_h, minus the (S - 1) steps already taken along S
+    inc_next[1] = conv_sign * (
+        (int64_t)layout.stride()[1] * problem_size.dilation_h
+        - (problem_size.S - 1) * (int64_t)layout.stride()[0] * problem_size.dilation_w
+      ) * element_size_bits / 8;
+
+    // next K: tile step along K, minus the (R - 1) and (S - 1) steps already taken
+    inc_next[2] = (
+        threadblock_shape.column() * problem_size.split_k_slices
+        - conv_sign * (problem_size.R - 1) * (int64_t)layout.stride()[1] * problem_size.dilation_h
+        - conv_sign * (problem_size.S - 1) * (int64_t)layout.stride()[0] * problem_size.dilation_w
+      ) * element_size_bits / 8;
+
+    // logical offset added to internal channel counter - units are elements, not bytes
+    filter_k_delta = threadblock_shape.column() * problem_size.split_k_slices;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Strided Dgrad Optimized Dy params (layout::TensorNHWC)
+/////////////////////////////////////////////////////////////////////////////////////////////////
+struct Conv2dStridedDgradOutputGradientIteratorOptimizedParams {
+  // Precomputed byte increments for the strided-dgrad output gradient (dy) iterator over TensorNHWC.
+  using Layout = layout::TensorNHWC;
+
+  Layout layout;
+  
+  int64_t inc_next[3];    // {next S, next R, next K} pointer increments, in bytes
+
+  int filter_k_delta;     // number of logical elements to add to filter_k_
+
+  int tiled_rows_per_filter;  // strided_dgrad_tile_m_per_filter(...) * threadblock_shape.row()
+
+  int conv_sign;          // +1 for Mode::kConvolution, -1 for any other mode
+  //
+  // Methods
+  //
+  /// Default constructor; computes none of the members.
+  CUTLASS_HOST_DEVICE
+  Conv2dStridedDgradOutputGradientIteratorOptimizedParams() { }
+  /// Computes tiled_rows_per_filter, conv_sign, inc_next[] and filter_k_delta.
+  CUTLASS_HOST_DEVICE
+  Conv2dStridedDgradOutputGradientIteratorOptimizedParams(
+    Conv2dProblemSize const &problem_size,
+    Layout const &layout,                            ///< layout object
+    int element_size_bits,                           ///< size of each element in bits
+    MatrixCoord threadblock_shape
+  ): layout(layout) {
+    // Tiles along M per filter position, as reported by strided_dgrad_tile_m_per_filter().
+    int tile_m_per_filter = strided_dgrad_tile_m_per_filter(problem_size, threadblock_shape.row());
+  
+    tiled_rows_per_filter = tile_m_per_filter * threadblock_shape.row();
+
+    conv_sign = (problem_size.mode == Mode::kConvolution ? 1 : -1);
+
+    // next S: stride()[0] * dilation_w elements, converted to bytes
+    inc_next[0] = conv_sign * (
+      (int64_t)layout.stride()[0] * problem_size.dilation_w
+    ) * element_size_bits / 8;
+
+    // next R: stride()[1] * dilation_h elements; no S rewind is subtracted in this variant
+    inc_next[1] = conv_sign * (
+        (int64_t)layout.stride()[1] * problem_size.dilation_h
+      ) * element_size_bits / 8;
+
+    // next K: plain tile step along K; no R or S rewind is subtracted in this variant
+    inc_next[2] = (
+        threadblock_shape.column() * problem_size.split_k_slices
+      ) * element_size_bits / 8;
+
+    // logical offset added to internal channel counter - units are elements, not bytes
+    filter_k_delta = threadblock_shape.column() * problem_size.split_k_slices;
+  }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////////////////////
+// Dgrad Optimized w params (layout::TensorNHWC)
+/////////////////////////////////////////////////////////////////////////////////////////////////
+struct Conv2dDgradFilterIteratorOptimizedParams {
+  // Precomputed byte increments for the dgrad filter (w) iterator over a TensorNHWC filter tensor.
+  using Layout = layout::TensorNHWC;
+
+  Layout layout;
+  int RS;                     // product R * S
+  int filter_k_delta;         // threadblock_shape.row() * split_k_slices, in elements
+
+  int64_t inc_next_strided;   // offset in units of bytes to next K coordinate within tile
+  int64_t inc_next_rs;        // offset in units of bytes to next RS position
+  int64_t inc_next_k;         // offset in units of bytes to next K position in subsequent tile
+
+  //
+  // Methods
+  //
+  CUTLASS_HOST_DEVICE
+  Conv2dDgradFilterIteratorOptimizedParams() { }
+  /// Computes RS, the three byte increments and filter_k_delta.
+  CUTLASS_HOST_DEVICE
+  Conv2dDgradFilterIteratorOptimizedParams(
+    Conv2dProblemSize const &problem_size,
+    Layout const &layout,    
+    int element_size_bits,                        ///< size of each element in bits
+    MatrixCoord threadblock_shape,                ///< only row() is read here
+    int thread_count,                             ///< only passed to TRACE_CONV_INITIALIZERS here
+    int access_size,                              ///< only passed to TRACE_CONV_INITIALIZERS here
+    layout::PitchLinearCoord threadmap_iterations,
+    layout::PitchLinearCoord threadmap_delta
+  ): 
+    layout(layout), RS(problem_size.R * problem_size.S) {
+
+    TRACE_CONV_INITIALIZERS("conv2d_dgrad", "filter", 
+      element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);
+    // one strided thread-map step: delta.strided() multiples of stride()[2], widened to 64 bits first
+    inc_next_strided = ((int64_t)layout.stride()[2] * threadmap_delta.strided() * element_size_bits) / 8;
+    // next RS: one stride()[0] step, minus the (iterations - 1) strided steps taken within the tile
+    inc_next_rs =
+      ( (int64_t)layout.stride()[0]
+        - (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * (int64_t)layout.stride()[2]
+      ) * element_size_bits / 8;
+    // next K tile: tile step along K, minus the (RS - 1) filter steps and the strided steps within the tile
+    inc_next_k =
+      (
+        threadblock_shape.row() * problem_size.split_k_slices * (int64_t)layout.stride()[2]
+        - (problem_size.R * problem_size.S - 1) * (int64_t)layout.stride()[0]
+        - (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * (int64_t)layout.stride()[2]
+      ) * element_size_bits / 8;
+    // logical offset added to the K counter - units are elements, not bytes
+    filter_k_delta = threadblock_shape.row() * problem_size.split_k_slices;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////////////////////
+// StridedDgrad Optimized w params (layout::TensorNHWC)
+/////////////////////////////////////////////////////////////////////////////////////////////////
+struct Conv2dStridedDgradFilterIteratorOptimizedParams {
+  // Precomputed byte increments for the strided-dgrad filter (w) iterator over a TensorNHWC filter.
+  using Layout = layout::TensorNHWC;
+
+  Layout layout;
+  int RS;                     // product R * S
+  int filter_k_delta;         // threadblock_shape.row() * split_k_slices, in elements
+
+  int64_t inc_next_strided;   // offset in units of bytes to next K coordinate within tile
+  int64_t inc_next[3];        // {next S, next R, next K}
+  int64_t reset_bytes;        // offset in units of bytes to move back the pointer
+  //
+  // Methods
+  //
+  CUTLASS_HOST_DEVICE
+  Conv2dStridedDgradFilterIteratorOptimizedParams() { }
+  /// Computes RS, inc_next_strided, inc_next[], reset_bytes and filter_k_delta.
+  CUTLASS_HOST_DEVICE
+  Conv2dStridedDgradFilterIteratorOptimizedParams(
+    Conv2dProblemSize const &problem_size,
+    Layout const &layout,    
+    int element_size_bits,                        ///< size of each element in bits
+    MatrixCoord threadblock_shape,                ///< only row() is read here
+    int thread_count,                             ///< only passed to TRACE_CONV_INITIALIZERS here
+    int access_size,                              ///< only passed to TRACE_CONV_INITIALIZERS here
+    layout::PitchLinearCoord threadmap_iterations,
+    layout::PitchLinearCoord threadmap_delta
+  ): 
+    layout(layout), RS(problem_size.R * problem_size.S) {
+
+    TRACE_CONV_INITIALIZERS("conv2d_dgrad", "filter", 
+      element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);
+    // Widen to 64 bits before multiplying, as the other increments do, so a large stride cannot overflow int.
+    inc_next_strided = ((int64_t)layout.stride()[2] * threadmap_delta.strided() * element_size_bits) / 8;
+
+    // next S
+    inc_next[0] =
+      ( (int64_t)layout.stride()[0] * problem_size.stride_w
+        // the strided-iteration rewind is not folded in here; it is kept separately in reset_bytes
+      ) * element_size_bits / 8;
+
+    // next R
+    inc_next[1] =
+      ( (int64_t)layout.stride()[1] * problem_size.stride_h
+        // the strided-iteration rewind is not folded in here; it is kept separately in reset_bytes
+      ) * element_size_bits / 8;
+
+    // next K
+    inc_next[2] =
+      (
+        threadblock_shape.row() * problem_size.split_k_slices * (int64_t)layout.stride()[2]
+        // unlike Conv2dDgradFilterIteratorOptimizedParams::inc_next_k, neither the (RS - 1) filter
+        // rewind nor the strided-iteration rewind is subtracted here
+      ) * element_size_bits / 8;
+
+    // offset in units of bytes to move the pointer in backward direction
+    reset_bytes = (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * (int64_t)layout.stride()[2]
+            * element_size_bits / 8;
+    // logical offset added to the K counter - units are elements, not bytes
+    filter_k_delta = threadblock_shape.row() * problem_size.split_k_slices;
+  }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Parameters object for Conv2d WGRAD Output Gradient (dy) iterator: divmod helpers and precomputed byte offsets
+struct Conv2dWgradOutputGradientIteratorOptimizedParams {
+
+  using Layout = layout::TensorNHWC;
+
+  Layout layout;
+
+  int NPQ;                      // precomputed product of N*P*Q for clearing predicates
+
+  FastDivmod pq_divmod;         // built from P * Q
+  FastDivmod q_divmod;          // built from Q
+
+  int64_t offset_next_strided;    // offset in units of bytes to next npq coordinate within tile
+  int64_t offset_next_contiguous; // offset in units of bytes to next k coordinate within tile
+  int64_t inc_next_npq;           // offset in units of bytes to next npq position in subsequent tile
+
+  //
+  // Methods
+  //
+  /// Default constructor; computes none of the offsets.
+  CUTLASS_HOST_DEVICE
+  Conv2dWgradOutputGradientIteratorOptimizedParams() { }
+  /// Computes NPQ, the divmod helpers and the three byte offsets.
+  CUTLASS_HOST_DEVICE
+  Conv2dWgradOutputGradientIteratorOptimizedParams(
+    Conv2dProblemSize const &problem_size,
+    Layout const &layout,    
+    int element_size_bits,                        ///< size of each element in bits
+    MatrixCoord threadblock_shape,                ///< only column() is read here
+    int thread_count,                             ///< only passed to TRACE_CONV_INITIALIZERS here
+    int access_size,                              ///< only passed to TRACE_CONV_INITIALIZERS here
+    layout::PitchLinearCoord threadmap_iterations, ///< only passed to TRACE_CONV_INITIALIZERS here
+    layout::PitchLinearCoord threadmap_delta
+  ):
+    layout(layout),
+    NPQ(problem_size.N * problem_size.P * problem_size.Q),
+    pq_divmod(problem_size.P * problem_size.Q),
+    q_divmod(problem_size.Q) {
+    
+    TRACE_CONV_INITIALIZERS("conv2d_wgrad", "output_gradient", 
+      element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);
+
+    // Incremental offsets in units of bytes: (number of elements) * element_size_bits / 8
+    offset_next_strided = (threadmap_delta.strided() * (int64_t)layout.stride()[0])
+                        * element_size_bits / 8;
+    // contiguous step: delta.contiguous() elements, converted to bytes
+    offset_next_contiguous = (threadmap_delta.contiguous())
+                            * element_size_bits / 8;
+    // step to the next tile: column() * split_k_slices multiples of stride()[0], in bytes
+    inc_next_npq = (threadblock_shape.column() * problem_size.split_k_slices * (int64_t)layout.stride()[0])
+                      * element_size_bits / 8;
+  }
+};
+
+struct Conv2dWgradActivationIteratorOptimizedParams {
+  // Divmod helpers and one offset derived from the problem size; no byte increments are stored here.
+  using Layout = layout::TensorNHWC;
+
+  Layout layout;
+
+  FastDivmod sc_divmod;   // built from S * C
+  FastDivmod pq_divmod;   // built from P * Q
+  FastDivmod q_divmod;    // built from Q
+  FastDivmod c_divmod;    // built from C
+  FastDivmod s_divmod;    // built from S * dilation_w
+  int small_channel_conv_s_offset;  // (S - 1) * dilation_w - pad_w
+
+  //
+  // Methods
+  //
+  CUTLASS_HOST_DEVICE
+  Conv2dWgradActivationIteratorOptimizedParams() { }
+  /// Builds every member from the problem size and layout alone.
+  CUTLASS_HOST_DEVICE
+  Conv2dWgradActivationIteratorOptimizedParams(
+    Conv2dProblemSize const &problem_size,
+    Layout const &layout
+  ):
+    layout(layout),
+    sc_divmod(problem_size.S * problem_size.C),
+    pq_divmod(problem_size.P * problem_size.Q),
+    q_divmod(problem_size.Q),
+    c_divmod(problem_size.C),
+    s_divmod(problem_size.S * problem_size.dilation_w),
+    small_channel_conv_s_offset((problem_size.S - 1) * problem_size.dilation_w - problem_size.pad_w) {
+  }
+  /// Same signature as the other Params constructors; delegates to the two-argument constructor and only traces the rest.
+  CUTLASS_HOST_DEVICE
+  Conv2dWgradActivationIteratorOptimizedParams(
+    Conv2dProblemSize const &problem_size,
+    Layout const &layout,
+    int element_size_bits,                        ///< size of each element in bits
+    MatrixCoord threadblock_shape,
+    int thread_count,
+    int access_size,
+    layout::PitchLinearCoord threadmap_iterations,
+    layout::PitchLinearCoord threadmap_delta
+  ):
+    Conv2dWgradActivationIteratorOptimizedParams(
+      problem_size,
+      layout
+    ) { 
+      // The extra arguments are only passed to the trace macro; none of them is stored.
+      TRACE_CONV_INITIALIZERS("conv2d_wgrad", "activation", 
+        element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);
+    }
+};
+
+struct PredicatedScaleBiasVectorAccessIteratorParams {  // empty parameter object: declares no data members
+  public:
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    PredicatedScaleBiasVectorAccessIteratorParams() { }
+
+    /// Ctor taking a problem size and a PitchLinear layout; both arguments are ignored
+    CUTLASS_HOST_DEVICE
+    PredicatedScaleBiasVectorAccessIteratorParams(
+      Conv2dProblemSize const &problem_size,
+      layout::PitchLinear const &layout) {}
+
+    /// Ctor taking a problem size and a RowMajor layout; both arguments are ignored
+    CUTLASS_HOST_DEVICE
+    PredicatedScaleBiasVectorAccessIteratorParams(
+      Conv2dProblemSize const &problem_size,
+      layout::RowMajor const &layout) {}
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_tile_iterator.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_tile_iterator.h
new file mode 100755
index 000000000..17f4594ba
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_tile_iterator.h
@@ -0,0 +1,337 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template wraps the tile access iterator concept to load whole tiles from tensors in
+      memory used for implicit GEMM convolution.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename TileAccessIterator_>
+class TileIterator {  // loads a whole Fragment per call by driving the wrapped TileAccessIterator
+public:
+  using TileAccessIterator = TileAccessIterator_;
+  // Types and constants below are re-exported unchanged from the wrapped access iterator.
+  using Shape = typename TileAccessIterator::Shape;
+  using Element = typename TileAccessIterator::Element;
+  using Layout = typename TileAccessIterator::Layout;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = typename TileAccessIterator::ThreadMap;
+  using AccessType = typename TileAccessIterator::AccessType;
+  using TensorRef = typename TileAccessIterator::TensorRef;
+  using Index = typename TileAccessIterator::Index;
+  using LongIndex = typename TileAccessIterator::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = TileAccessIterator::kIteratorAlgorithm;
+  static StrideSupport const kStrideSupport = TileAccessIterator::kStrideSupport;
+  using Params = typename TileAccessIterator::Params;
+  static int const kConvDim = TileAccessIterator::kConvDim;
+  using ConvProblemSize = typename TileAccessIterator::ConvProblemSize;
+  static int const kAccessesPerVector = TileAccessIterator::kAccessesPerVector;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<
+    Element, 
+    ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+
+private:
+
+  /// Internal state
+  TileAccessIterator tile_access_iterator_;
+
+public:
+
+  /// Constructor; forwards every argument to the wrapped access iterator
+  CUTLASS_HOST_DEVICE
+  TileIterator(
+    Params const &params,
+    ConvProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    MatrixCoord const &threadblock_offset = MatrixCoord()
+  ):
+    tile_access_iterator_(params, problem_size, ptr, thread_idx, threadblock_offset) { }
+  /// Returns the Params produced by TileAccessIterator::getParams for this problem and layout
+  CUTLASS_HOST_DEVICE
+  static Params getParams(ConvProblemSize const &problem_size, Layout const &layout) {
+    return TileAccessIterator::getParams(problem_size, layout);
+  }
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    tile_access_iterator_.set_iteration_index(index);
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    tile_access_iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory (pre-increment); calls advance() on the access iterator.
+  CUTLASS_HOST_DEVICE
+  TileIterator &operator++() {
+    tile_access_iterator_.advance();
+    return *this;
+  }
+
+  /// Advances to the next tile in memory (post-increment); returns a copy taken before advancing.
+  CUTLASS_HOST_DEVICE
+  TileIterator operator++(int) {
+    TileIterator self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Loads a fragment from memory, adding pointer_offset to each pointer returned by the access iterator
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    // Clear first: global_load receives valid() as a guard, so some slots may not be written by a load.
+    frag.clear();
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kAccessesPerVector; ++v) {
+          // Fragment slot order: vector access fastest, then contiguous, then strided.
+          int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
+          // NOTE(review): the offset is added to get()'s pointer, so its unit is get()'s pointee type - confirm.
+          cutlass::arch::global_load<
+            AccessType,
+            sizeof(AccessType)
+          >(
+            frag_ptr[idx],
+            tile_access_iterator_.get() + pointer_offset,
+            tile_access_iterator_.valid()
+          );
+          // Step the access iterator once per access issued.
+          ++tile_access_iterator_;
+        }
+      }
+    }
+  }
+
+  /// Loads a fragment from memory after resetting the access iterator's iteration index to 0
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    tile_access_iterator_.set_iteration_index(0);
+    load_with_pointer_offset(frag, 0);
+  }
+  /// Advances the wrapped access iterator; same call that operator++ makes
+  CUTLASS_DEVICE
+  void advance() {
+    tile_access_iterator_.advance();
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(ConvProblemSize const &problem_size) {
+
+    // dispatch to iterator implementation
+    return TileAccessIterator::can_implement(problem_size);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Strided Dgrad Tile Iterator: loads a whole Fragment per call by driving the wrapped TileAccessIterator
+template <typename TileAccessIterator_>
+class TileIteratorStridedDgrad {
+public:
+  using TileAccessIterator = TileAccessIterator_;
+  // Types and constants below are re-exported unchanged from the wrapped access iterator.
+  using Shape = typename TileAccessIterator::Shape;
+  using Element = typename TileAccessIterator::Element;
+  using Layout = typename TileAccessIterator::Layout;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = typename TileAccessIterator::ThreadMap;
+  using AccessType = typename TileAccessIterator::AccessType;
+  using TensorRef = typename TileAccessIterator::TensorRef;
+  using Index = typename TileAccessIterator::Index;
+  using LongIndex = typename TileAccessIterator::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = TileAccessIterator::kIteratorAlgorithm;
+  static StrideSupport const kStrideSupport = TileAccessIterator::kStrideSupport;
+  using Params = typename TileAccessIterator::Params;
+  static int const kConvDim = TileAccessIterator::kConvDim;
+  using ConvProblemSize = typename TileAccessIterator::ConvProblemSize;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<
+    Element, 
+    ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+
+private:
+
+  /// Internal state
+  TileAccessIterator tile_access_iterator_;
+
+public:
+
+  /// Constructor (output gradient (Dy) OperandA ctor); forwards every argument to the access iterator
+  CUTLASS_HOST_DEVICE
+  TileIteratorStridedDgrad(
+    Params const &params,
+    ConvProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    FastDivmod const &stride_h_divmod, FastDivmod const &stride_w_divmod,
+    int start_r, int start_s,
+    MatrixCoord const &threadblock_offset = MatrixCoord()
+  ):
+    tile_access_iterator_(
+      params, 
+      problem_size, 
+      ptr, 
+      thread_idx, 
+      stride_h_divmod, stride_w_divmod, 
+      start_r, start_s, 
+      threadblock_offset) { }
+
+  /// Constructor (filter (w) OperandB ctor); same as above without the stride divmod arguments
+  CUTLASS_HOST_DEVICE
+  TileIteratorStridedDgrad(
+    Params const &params,
+    ConvProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    int start_r, int start_s,
+    MatrixCoord const &threadblock_offset = MatrixCoord()
+  ):
+    tile_access_iterator_(params, 
+      problem_size, 
+      ptr, 
+      thread_idx, 
+      start_r, start_s, 
+      threadblock_offset) { }
+  /// Returns the Params produced by TileAccessIterator::getParams for this problem and layout
+  CUTLASS_HOST_DEVICE
+  static Params getParams(ConvProblemSize const &problem_size, Layout const &layout) {
+    return TileAccessIterator::getParams(problem_size, layout);
+  }
+
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    tile_access_iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory (pre-increment); calls advance() on the access iterator.
+  CUTLASS_HOST_DEVICE
+  TileIteratorStridedDgrad &operator++() {
+    tile_access_iterator_.advance();
+    return *this;
+  }
+
+  /// Advances to the next tile in memory (post-increment); returns a copy taken before advancing.
+  CUTLASS_HOST_DEVICE
+  TileIteratorStridedDgrad operator++(int) {
+    TileIteratorStridedDgrad self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Loads a fragment from memory, adding pointer_offset to each pointer returned by the access iterator
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    // Clear first: global_load receives valid() as a guard, so some slots may not be written by a load.
+    frag.clear();
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        // One access per (s, c) pair; fragment slots are ordered contiguous-fastest, then strided.
+        cutlass::arch::global_load<
+          AccessType,
+          sizeof(AccessType)
+        >(
+          frag_ptr[c + s * ThreadMap::Iterations::kContiguous],
+          tile_access_iterator_.get() + pointer_offset,
+          tile_access_iterator_.valid()
+        );
+        // Step the access iterator once per access issued.
+        ++tile_access_iterator_;
+      }
+    }
+  }
+
+  /// Loads a fragment from memory after resetting the access iterator's iteration index to 0
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    tile_access_iterator_.set_iteration_index(0);
+    load_with_pointer_offset(frag, 0);
+  }
+  /// Advances the wrapped access iterator; same call that operator++ makes
+  CUTLASS_DEVICE
+  void advance() {
+    tile_access_iterator_.advance();
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(ConvProblemSize const &problem_size) {
+
+    // dispatch to iterator implementation
+    return TileAccessIterator::can_implement(problem_size);
+  }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h
new file mode 100755
index 000000000..3e3a4f155
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h
@@ -0,0 +1,285 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM B (activation tile) 
+    matrix from memory.
+
+    This iterator assumes TensorNHWC layout of tensors in Global Memory.
+
+    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
+    backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+#include "cutlass/conv/threadblock/conv2d_params.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Shape_,
+  typename Element_,
+  typename ThreadMap_,
+  typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess>
+>
+class Conv2dWgradActivationTileAccessIteratorAnalytic {
+public:
+  // Access iterator for the activation tile (GEMM B) of a 2-D weight-gradient convolution.
+  //
+  // Types
+  //
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::TensorNHWC;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
+  static int const kConvDim = 2;
+  using ConvProblemSize = typename conv::Conv2dProblemSize;
+
+  // Number of AccessType-sized accesses that make up one thread-map vector
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
+    "Vectors implied by the thread map must be divisible by the access type.");
+
+  static_assert(sizeof_bits<Element>::value >= 8,
+    "WGRAD requires elements of size 8b or greater.");
+
+  //
+  // Parameters structure
+  //
+
+  using Params = Conv2dAnalyticParams<Layout>;
+
+private:
+
+  // params_ and problem_size_ are references: the caller's objects must outlive this iterator
+  Params const &params_;
+  Conv2dProblemSize const &problem_size_;
+  LongIndex iteration_contiguous_;
+  LongIndex iteration_strided_;
+  LongIndex iteration_vector_;
+  char const *pointer_;
+
+  // Filter position (r,s,c) in contiguous dimension stays constant for each gemm_iteration_k
+  int filter_r_[ThreadMap::Iterations::kContiguous];
+  int filter_s_[ThreadMap::Iterations::kContiguous];
+  int filter_c_[ThreadMap::Iterations::kContiguous];
+
+  // Linearized (n,p,q) index for each strided iteration; stepped by advance()
+  int offset_npq_[ThreadMap::Iterations::kStrided];
+
+public:
+
+  /// Binds the iterator to `params` and `problem_size` and precomputes this thread's
+  /// filter positions and npq offsets relative to `threadblock_offset`.
+  CUTLASS_HOST_DEVICE
+  Conv2dWgradActivationTileAccessIteratorAnalytic(
+    Params const &params,
+    Conv2dProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    MatrixCoord const &threadblock_offset = MatrixCoord()
+  ):
+    params_(params),
+    problem_size_(problem_size),
+    // Start at access 0 so at()/valid()/get()/operator++() never read
+    // indeterminate indices if set_iteration_index() has not been called yet.
+    iteration_contiguous_(0), iteration_strided_(0), iteration_vector_(0),
+    pointer_(reinterpret_cast<char const *>(ptr))
+  {
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+
+    // initialize r,s,c filter position for every contiguous iteration
+    CUTLASS_PRAGMA_UNROLL
+    for(int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+      int rsc_offset = threadblock_offset.column() + thread_coord.contiguous()
+                        + c * ThreadMap::Delta::kContiguous;
+
+      filter_r_[c] = rsc_offset / (problem_size_.S * problem_size_.C);
+      int residual = rsc_offset % (problem_size_.S * problem_size_.C);
+
+      filter_s_[c] = residual / problem_size_.C;
+      filter_c_[c] = residual % problem_size_.C;
+    }
+
+    // initialize n, p, q offset for every strided iteration
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      offset_npq_[s] = threadblock_offset.row() + thread_coord.strided()
+                      + s * ThreadMap::Delta::kStrided;
+    }
+  }
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iteration_vector_ = index % kAccessesPerVector;
+    int residual_access = index / kAccessesPerVector;
+    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+
+  /// Steps every strided npq offset by Shape::kRow * split_k_slices
+  CUTLASS_HOST_DEVICE
+  void advance() {
+    // moves to the next GEMM-K offset (offset_npq_) in GEMM-B by a CTA-K tile
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      offset_npq_[s] += Shape::kRow * problem_size_.split_k_slices;
+    }
+  }
+
+  /// Returns the coordinate in the activation tensor x that is currently pointed to
+  /// by the iterator.
+  CUTLASS_HOST_DEVICE
+  TensorCoord at() const {
+    int r, s, c;
+
+    if (kAccessesPerVector == 1) {
+      /// One 128b aligned access fetching more than one element
+      c = filter_c_[iteration_contiguous_];
+      r = filter_r_[iteration_contiguous_];
+      s = filter_s_[iteration_contiguous_];
+    }
+    else {
+      /// Multiple access to support non-128b alignment in contiguous dimension
+      c = (filter_c_[iteration_contiguous_] + iteration_vector_ * AccessType::kElements) % problem_size_.C;
+      int wrap_c = (filter_c_[iteration_contiguous_] + iteration_vector_ * AccessType::kElements) / problem_size_.C;
+      s = (filter_s_[iteration_contiguous_] + wrap_c) % problem_size_.S;
+      int wrap_s = (filter_s_[iteration_contiguous_] + wrap_c) / problem_size_.S;
+      r = filter_r_[iteration_contiguous_] + wrap_s;
+    }
+
+    if (problem_size_.mode == Mode::kConvolution) {
+      r = (problem_size_.R - 1 - r);
+      s = (problem_size_.S - 1 - s);
+    }
+
+    int n = offset_npq_[iteration_strided_] / (problem_size_.P * problem_size_.Q);
+    int residual = offset_npq_[iteration_strided_] % (problem_size_.P * problem_size_.Q);
+    int p = residual / problem_size_.Q;
+    int q = residual % problem_size_.Q;
+
+    int h = p * problem_size_.stride_h - problem_size_.pad_h + r * problem_size_.dilation_h;
+    int w = q * problem_size_.stride_w - problem_size_.pad_w + s * problem_size_.dilation_w;
+
+    return TensorCoord(n, h, w, c);
+  }
+
+  /// Returns true if the current coordinate is within the activation tensor x (tests n, h and w)
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+    TensorCoord coord = at();
+    return coord.n() < problem_size_.N &&
+      coord.h() >= 0 && coord.h() < problem_size_.H &&
+      coord.w() >= 0 && coord.w() < problem_size_.W;
+  }
+
+  /// Returns a pointer to the vector starting at the current coordinate
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+    TensorCoord coord = at();
+    LongIndex offset = params_.layout(coord);
+
+    return reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
+  }
+
+  /// Increments to the next memory access (vector, then contiguous, then strided; each wraps to 0)
+  CUTLASS_HOST_DEVICE
+  Conv2dWgradActivationTileAccessIteratorAnalytic &operator++() {
+    ++iteration_vector_;
+    if (iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+    iteration_vector_ = 0;
+
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    iteration_strided_ = 0;
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv2dProblemSize const &problem_size) {
+    // check alignment constraint on iterator's contiguous dimension
+    if (problem_size.C % AccessType::kElements) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    return Status::kSuccess;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h
new file mode 100755
index 000000000..8cbcc3d9f
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h
@@ -0,0 +1,321 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM B (activation tile) 
+    matrix from memory.
+
+    This iterator assumes TensorNHWC layout of tensors in Global Memory.
+
+    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
+    backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Shape_,
+  typename Element_,
+  typename ThreadMap_,
+  typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess>
+>
+class Conv2dWgradActivationTileAccessIteratorOptimized {
+public:
+  // Access iterator for the activation tile (GEMM B) of a 2-D weight-gradient convolution.
+  //
+  // Types
+  //
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::TensorNHWC;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
+  static int const kConvDim = 2;
+  using ConvProblemSize = typename conv::Conv2dProblemSize;
+
+  // Number of AccessType-sized accesses that make up one thread-map vector
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
+    "Vectors implied by the thread map must be divisible by the access type.");
+
+  static_assert(sizeof_bits<Element>::value >= 8,
+    "WGRAD requires elements of size 8b or greater.");
+
+  //
+  // Parameters structure
+  //
+
+  using Params = Conv2dWgradActivationIteratorOptimizedParams;
+
+private:
+
+  // params_ and problem_size_ are references: the caller's objects must outlive this iterator
+  Conv2dWgradActivationIteratorOptimizedParams const &params_;
+  Conv2dProblemSize const &problem_size_;
+  LongIndex iteration_contiguous_;
+  LongIndex iteration_strided_;
+  LongIndex iteration_vector_;
+  char const *pointer_;
+
+  // Precomputed effective filter position (r,s) in contiguous dimension stays constant for each gemm_iteration_k
+  // required for npq -> nhw translation
+  int precomputed_filter_r_[ThreadMap::Iterations::kContiguous];
+  int precomputed_filter_s_[ThreadMap::Iterations::kContiguous];
+
+  // Channel dimension in contiguous dimension stays constant for each gemm_iteration_k
+  int filter_c_[ThreadMap::Iterations::kContiguous];
+
+  // Linearized (n,p,q) index for each strided iteration; stepped by advance()
+  int offset_npq_[ThreadMap::Iterations::kStrided];
+
+public:
+
+  /// Binds the iterator to `params` and `problem_size` and precomputes this thread's
+  /// effective filter offsets, channel indices and npq offsets.
+  CUTLASS_HOST_DEVICE
+  Conv2dWgradActivationTileAccessIteratorOptimized(
+    Conv2dWgradActivationIteratorOptimizedParams const &params,
+    Conv2dProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    MatrixCoord const &threadblock_offset = MatrixCoord()
+  ):
+    params_(params),
+    problem_size_(problem_size),
+    // Start at access 0 so at()/valid()/get()/operator++() never read
+    // indeterminate indices if set_iteration_index() has not been called yet.
+    iteration_contiguous_(0), iteration_strided_(0), iteration_vector_(0),
+    pointer_(reinterpret_cast<char const *>(ptr))
+  {
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+
+    // initialize r,s,c filter position for every contiguous iteration
+    CUTLASS_PRAGMA_UNROLL
+    for(int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+      int rsc_offset = threadblock_offset.column() + thread_coord.contiguous()
+                        + c * ThreadMap::Delta::kContiguous;
+
+      // The subsequent fast_divmod() operations are equivalent to the following logical computation:
+      //
+      // filter_r_[c] = rsc_offset / (problem_size_.S * problem_size_.C);
+      // int residual = rsc_offset % (problem_size_.S * problem_size_.C);
+      //
+      // filter_s_[c] = residual / problem_size_.C;
+      // filter_c_[c] = residual % problem_size_.C;
+
+      int residual;
+      params_.sc_divmod(precomputed_filter_r_[c], residual, rsc_offset);
+      params_.c_divmod(precomputed_filter_s_[c], filter_c_[c], residual);
+
+      int r = precomputed_filter_r_[c];
+      int s = precomputed_filter_s_[c];
+
+      if (problem_size_.mode == Mode::kConvolution) {
+        r = (problem_size_.R - 1 - r);
+        s = (problem_size_.S - 1 - s);
+      }
+
+      precomputed_filter_r_[c] =  -problem_size_.pad_h + r * problem_size_.dilation_h;
+      precomputed_filter_s_[c] =  -problem_size_.pad_w + s * problem_size_.dilation_w;
+    }
+
+    // initialize n, p, q offset for every strided iteration
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      offset_npq_[s] = threadblock_offset.row() + thread_coord.strided()
+                      + s * ThreadMap::Delta::kStrided;
+    }
+  }
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iteration_vector_ = index % kAccessesPerVector;
+    int residual_access = index / kAccessesPerVector;
+    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+
+  /// Steps every strided npq offset by Shape::kRow * split_k_slices
+  CUTLASS_HOST_DEVICE
+  void advance() {
+    // moves to the next GEMM-K offset (offset_npq_) in GEMM-B by a CTA-K tile
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      offset_npq_[s] += Shape::kRow * problem_size_.split_k_slices;
+    }
+  }
+
+  /// Returns the coordinate in the activation tensor x that is currently pointed to
+  /// by the iterator.
+  CUTLASS_HOST_DEVICE
+  TensorCoord at() const {
+    int r = precomputed_filter_r_[iteration_contiguous_];
+    int s = precomputed_filter_s_[iteration_contiguous_];
+    int c = filter_c_[iteration_contiguous_];
+
+    if (kAccessesPerVector > 1) {
+      // This code section is only to support non-128b alignment
+      // Multiple access to support non-128b alignment in contiguous dimension
+      int wrap_c;
+      params_.c_divmod(wrap_c, c, c + iteration_vector_ * AccessType::kElements);
+
+      if (problem_size_.mode == Mode::kConvolution) {
+        s -= (problem_size_.dilation_w * wrap_c);
+
+        int wrap_s;
+        params_.s_divmod(wrap_s, s, params_.small_channel_conv_s_offset - s);
+        s = params_.small_channel_conv_s_offset - s;
+
+        r -= (problem_size_.dilation_h * wrap_s);
+      } else {
+        s += (problem_size_.dilation_w * wrap_c);
+
+        int wrap_s;
+        params_.s_divmod(wrap_s, s, s + problem_size_.pad_w);
+        s -= problem_size_.pad_w;
+
+        r += (problem_size_.dilation_h * wrap_s);
+      }
+    }
+
+    // The subsequent fast_divmod() operations are equivalent to the following logical computation:
+    //
+    // int n = offset_npq_[iteration_strided_] / (problem_size_.P * problem_size_.Q);
+    // int residual = offset_npq_[iteration_strided_] % (problem_size_.P * problem_size_.Q);
+    //
+    // int p = residual / problem_size_.Q;
+    // int q = residual % problem_size_.Q;
+
+    int residual, n, p, q;
+
+    params_.pq_divmod(n, residual, offset_npq_[iteration_strided_]);
+    params_.q_divmod(p, q, residual);
+
+    int h = p * problem_size_.stride_h + r;
+    int w = q * problem_size_.stride_w + s;
+
+    return TensorCoord(n, h, w, c);
+  }
+
+  /// Returns true if the current coordinate is within the activation tensor x (tests n, h and w)
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+    TensorCoord coord = at();
+
+    return coord.n() < problem_size_.N &&
+      coord.h() >= 0 && coord.h() < problem_size_.H &&
+      coord.w() >= 0 && coord.w() < problem_size_.W;
+  }
+
+  /// Returns a pointer to the vector starting at the current coordinate
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+    TensorCoord coord = at();
+    LongIndex offset = params_.layout(coord);
+
+    return reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
+  }
+
+  /// Increments to the next memory access (vector, then contiguous, then strided; each wraps to 0)
+  CUTLASS_HOST_DEVICE
+  Conv2dWgradActivationTileAccessIteratorOptimized &operator++() {
+    ++iteration_vector_;
+    if (iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+    iteration_vector_ = 0;
+
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    iteration_strided_ = 0;
+
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv2dProblemSize const &problem_size) {
+    // check alignment constraint on iterator's contiguous dimension
+    if (problem_size.C % AccessType::kElements) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    return Status::kSuccess;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h
new file mode 100755
index 000000000..793649dbe
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h
@@ -0,0 +1,260 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile) 
+    matrix from memory.
+
+    This iterator assumes TensorNHWC layout of tensors in Global Memory.
+
+    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
+    backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv3d_problem_size.h"
+#include "cutlass/conv/threadblock/conv2d_params.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Shape_,
+  typename Element_,
+  typename ThreadMap_,
+  typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess>
+>
+class Conv2dWgradOutputGradientTileAccessIteratorAnalytic {
+public:
+  // Access iterator for the output-gradient tile (GEMM A) of a 2-D weight-gradient convolution.
+  //
+  // Types
+  //
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::TensorNHWC;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
+  static int const kConvDim = 2;
+  using ConvProblemSize = typename conv::Conv2dProblemSize;
+
+  // Number of AccessType-sized accesses that make up one thread-map vector
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
+    "Vectors implied by the thread map must be divisible by the access type.");
+
+  static_assert(sizeof_bits<Element>::value >= 8,
+    "WGRAD requires elements of size 8b or greater.");
+
+  //
+  // Parameters structure
+  //
+
+  using Params = Conv2dAnalyticParams<Layout>;
+
+private:
+
+  // params_ and problem_size_ are references: the caller's objects must outlive this iterator
+  Params const &params_;
+  Conv2dProblemSize const &problem_size_;
+  LongIndex iteration_contiguous_;
+  LongIndex iteration_strided_;
+  LongIndex iteration_vector_;
+  char const *pointer_;
+
+  // Starting k coordinate for each contiguous iteration
+  int filter_k_[ThreadMap::Iterations::kContiguous];
+
+  // Linearized (n,p,q) index for each strided iteration; stepped by advance()
+  int offset_npq_[ThreadMap::Iterations::kStrided];
+
+public:
+
+  /// Binds the iterator to `params` and `problem_size` and precomputes this thread's
+  /// k coordinates and npq offsets relative to `threadblock_offset`.
+  CUTLASS_HOST_DEVICE
+  Conv2dWgradOutputGradientTileAccessIteratorAnalytic(
+    Params const &params,
+    Conv2dProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    MatrixCoord const &threadblock_offset = MatrixCoord()
+  ):
+    params_(params),
+    problem_size_(problem_size),
+    // Start at access 0 so at()/valid()/get()/operator++() never read indeterminate indices
+    iteration_contiguous_(0), iteration_strided_(0), iteration_vector_(0),
+    pointer_(reinterpret_cast<char const *>(ptr)) {
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+
+    // initialize filter_k for every contiguous iteration
+    CUTLASS_PRAGMA_UNROLL
+    for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+      filter_k_[c] = threadblock_offset.row() + thread_coord.contiguous()
+                        + c * ThreadMap::Delta::kContiguous;
+    }
+
+    // initialize n, p, q offset for every strided iteration
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      offset_npq_[s] = threadblock_offset.column() + thread_coord.strided()
+                      + s * ThreadMap::Delta::kStrided;
+    }
+  }
+
+  /// Builds the Params object for `problem_size` and `layout`
+  CUTLASS_HOST_DEVICE
+  static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) {
+    return Params(problem_size, layout);
+  }
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iteration_vector_ = index % kAccessesPerVector;
+    int residual_access = index / kAccessesPerVector;
+    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+
+  /// Steps every strided npq offset by Shape::kColumn * split_k_slices
+  CUTLASS_HOST_DEVICE
+  void advance() {
+    // moves to the next GEMM-K offset (offset_npq_) in GEMM-A by a CTA-K tile
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      offset_npq_[s] += Shape::kColumn * problem_size_.split_k_slices;
+    }
+  }
+
+  /// Returns the coordinate in the output gradient tensor Dy that is currently pointed to
+  /// by the iterator.
+  CUTLASS_HOST_DEVICE
+  TensorCoord at() const {
+    int npq = offset_npq_[iteration_strided_];
+    int n = npq / (problem_size_.P * problem_size_.Q);
+    int residual = npq % (problem_size_.P * problem_size_.Q);
+    int p = residual / problem_size_.Q;
+    int q = residual % problem_size_.Q;
+
+    int k = filter_k_[iteration_contiguous_] + iteration_vector_ * AccessType::kElements;
+
+    return TensorCoord(n, p, q, k);
+  }
+
+  /// Returns true if the current coordinate is within the output gradient tensor Dy
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+    TensorCoord coord = at();
+    return coord.n() < problem_size_.N &&
+      coord.h() < problem_size_.P &&
+      coord.w() < problem_size_.Q &&
+      coord.c() < problem_size_.K;
+  }
+
+  /// Returns a pointer to the vector starting at the current coordinate
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+    TensorCoord coord = at();
+    LongIndex offset = params_.layout(coord);
+
+    return reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
+  }
+
+  /// Increments to the next memory access (vector, then contiguous, then strided; each wraps to 0)
+  CUTLASS_HOST_DEVICE
+  Conv2dWgradOutputGradientTileAccessIteratorAnalytic &operator++() {
+    ++iteration_vector_;
+    if (iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+    iteration_vector_ = 0;
+
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    iteration_strided_ = 0;
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv2dProblemSize const &problem_size) {
+    // check alignment constraint on iterator's contiguous dimension
+    if (problem_size.K % AccessType::kElements) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    return Status::kSuccess;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h
new file mode 100755
index 000000000..07233d892
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h
@@ -0,0 +1,310 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile) 
+    matrix from memory.
+
+    This iterator assumes TensorNHWC layout of tensors in Global Memory.
+
+    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
+    backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Shape_,      // tile shape; Shape::kRow and Shape::kColumn are read by this iterator
+  typename Element_,    // element type of the tensor; must be at least 8 bits wide (asserted below)
+  typename ThreadMap_,  // supplies Iterations, Delta, kThreads, kElementsPerAccess, initial_offset()
+  typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess>
+>
+class Conv2dWgradOutputGradientTileAccessIteratorOptimized {
+public:
+  // Tile access iterator over the output gradient tensor (TensorNHWC) for weight-gradient conv.
+  // Access validity is evaluated once in the constructor and cached as bit-packed predicates.
+  //
+  // Types
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::TensorNHWC;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
+  static int const kConvDim = 2;
+  using ConvProblemSize = typename conv::Conv2dProblemSize;
+  // Number of AccessType-sized accesses that make up one thread-map vector
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+  
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), 
+    "Vectors implied by the thread map must be divisible by the access type.");
+
+  static_assert(sizeof_bits<Element>::value >= 8,
+    "WGRAD requires elements of size 8b or greater.");
+
+  //
+  // Parameters structure
+  //
+
+  using Params = Conv2dWgradOutputGradientIteratorOptimizedParams;
+
+private:
+  // params_ and problem_size_ are held by reference; the referenced objects must outlive the iterator.
+  Conv2dWgradOutputGradientIteratorOptimizedParams const &params_;
+  Conv2dProblemSize const &problem_size_;
+  LongIndex iteration_contiguous_;  // current access index along the contiguous dimension
+  LongIndex iteration_strided_;     // current access index along the strided dimension
+  LongIndex iteration_vector_;      // current access index within a thread-map vector
+  char const *pointer_;             // byte address of access (strided 0, contiguous 0) in the current tile
+  // Bit (c + s * Iterations::kContiguous) of predicates_[v] is 1 when access (s, c, v) is in bounds.
+  uint32_t predicates_[kAccessesPerVector];
+  int filter_k_;                    // output channel k of this thread's first access
+  int offset_npq_;                  // flattened (n, p, q) index of this thread's first access; moved by advance()
+
+public:
+  /// Positions the iterator at this thread's first access and precomputes all access predicates.
+  CUTLASS_HOST_DEVICE
+  Conv2dWgradOutputGradientTileAccessIteratorOptimized(
+    Conv2dWgradOutputGradientIteratorOptimizedParams const &params,
+    Conv2dProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    MatrixCoord const &threadblock_offset = MatrixCoord()
+  ):
+    params_(params), 
+    problem_size_(problem_size), 
+    pointer_(reinterpret_cast<char const *>(ptr)),
+    predicates_{0},
+    filter_k_(0),
+    offset_npq_(0) {
+
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+    // GEMM row maps to output channel k; GEMM column maps to the flattened (n, p, q) index.
+    filter_k_ = threadblock_offset.row() + thread_coord.contiguous();
+    offset_npq_ = threadblock_offset.column() + thread_coord.strided();
+    // Evaluate the bounds check of every (s, c, v) access once and pack the results into bits.
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+
+        int filter_k = filter_k_ + c * ThreadMap::Delta::kContiguous;
+        int offset_npq = offset_npq_ + s * ThreadMap::Delta::kStrided;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kAccessesPerVector; ++v) {
+          bool predicate = valid_(at_(offset_npq, filter_k + v * AccessType::kElements));
+  
+          uint32_t pred = (predicate ? 1u : 0);
+  
+          int pred_idx = c + s * ThreadMap::Iterations::kContiguous;
+          // NOTE(review): this shift needs pred_idx < 32; no static_assert in this class enforces it.
+          predicates_[v] |= (pred << pred_idx);
+        }
+      }
+    }
+
+    // Offset pointer to (iteration_strided_, iteration_contiguous_) = (0, 0) 
+    pointer_ += (
+      offset_npq_ * params.layout.stride()[0] + filter_k_
+    ) * sizeof_bits<Element>::value / 8;
+
+    set_iteration_index(0);
+  }
+  /// Assembles Params from the problem size, the layout, and this iterator's compile-time configuration.
+  CUTLASS_HOST_DEVICE
+  static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) {
+    return Params(problem_size,
+                  layout,
+                  sizeof_bits<Element>::value,
+                  {Shape::kRow, Shape::kColumn},
+                  ThreadMap::kThreads,
+                  ThreadMap::kElementsPerAccess,
+                  {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided},
+                  {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided});
+  }
+
+  /// Overrides the internal iteration index (vector index fastest, then contiguous, then strided)
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iteration_vector_ = index % kAccessesPerVector;
+    int residual_access = index / kAccessesPerVector;
+    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+  /// Moves to the next tile along GEMM-K and clears the predicates of strided iterations at or beyond NPQ.
+  CUTLASS_HOST_DEVICE
+  void advance() {
+    // moves to the next GEMM-K offset (offset_npq_) in GEMM-A by a CTA-K tile
+    offset_npq_ += Shape::kColumn * problem_size_.split_k_slices;
+
+    // Clear predicates if needed
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      if (offset_npq_ + s * ThreadMap::Delta::kStrided >= params_.NPQ) {
+        uint32_t kClearMask = ((1u << ThreadMap::Iterations::kContiguous) - 1) << (s * ThreadMap::Iterations::kContiguous); 
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kAccessesPerVector; ++v) {
+          predicates_[v] = (predicates_[v] & (~kClearMask));
+        }
+      }
+    }
+
+    pointer_ += params_.inc_next_npq; 
+  }
+
+private:
+  /// Returns the coordinate in the output gradient tensor Dy that is pointed to
+  /// by offset_npq and k.
+  CUTLASS_HOST_DEVICE
+  TensorCoord at_(int offset_npq, int k) const {
+
+    // The subsequent fast_divmod() operations are equivalent to the following logical computation:
+    //
+    //
+    // int npq = offset_npq;
+    // int n = npq / (problem_size_.P * problem_size_.Q);
+    // int residual = npq % (problem_size_.P * problem_size_.Q);
+    // 
+    // int p = residual / problem_size_.Q;
+    // int q = residual % problem_size_.Q;
+    
+    int residual, n, p, q;
+    
+    params_.pq_divmod(n, residual, offset_npq);
+    params_.q_divmod(p, q, residual);
+
+    return TensorCoord(n, p, q, k);
+  }
+  
+  /// Returns true if the coord is within the output gradient tensor Dy
+  CUTLASS_HOST_DEVICE
+  bool valid_(TensorCoord coord) const {
+
+    return coord.n() < problem_size_.N &&
+      coord.c() < problem_size_.K;
+  }
+
+public:
+
+  /// Returns true if the current coordinate is within the output gradient tensor Dy
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+
+    LongIndex pred_idx = iteration_contiguous_ + iteration_strided_ * ThreadMap::Iterations::kContiguous;
+    return (predicates_[iteration_vector_] & (1u << pred_idx));
+  }
+
+  /// Returns a pointer to the vector starting at the current coordinate
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+    // Strided/contiguous offsets are added to the byte pointer; iteration_vector_ then steps by whole AccessType.
+    return reinterpret_cast<AccessType const *>(
+      pointer_ +
+      iteration_strided_ * params_.offset_next_strided + 
+      iteration_contiguous_ * params_.offset_next_contiguous
+    ) + iteration_vector_;
+  }
+
+  /// Increments to the next memory access (wraps back to the first access after the last one)
+  CUTLASS_HOST_DEVICE
+  Conv2dWgradOutputGradientTileAccessIteratorOptimized &operator++() {
+    ++iteration_vector_;
+    if (iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+    iteration_vector_ = 0;
+
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    iteration_strided_ = 0;
+ 
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv2dProblemSize const &problem_size) {
+
+    // check alignment constraint on iterator's contiguous dimension
+    if (problem_size.K % AccessType::kElements) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    return Status::kSuccess;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h
new file mode 100755
index 000000000..943ab88cf
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h
@@ -0,0 +1,268 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) 
+    matrix from memory.
+
+    This iterator assumes TensorNDHWC layout of tensors in Global Memory.
+
+    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
+    backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv3d_problem_size.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Shape_,
+  typename Element_,
+  typename ThreadMap_
+>
+class Conv3dDgradFilterTileAccessIteratorAnalytic {
+public:
+  // Tile access iterator over the filter tensor (TensorNDHWC, coordinates (k, t, r, s, c)) for Dgrad.
+  // The tensor coordinate is recomputed on every access from the filter position and per-thread offsets.
+  //
+  // Types
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::TensorNDHWC;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
+  static int const kConvDim = 3;
+  using ConvProblemSize = typename conv::Conv3dProblemSize;
+  static int const kAccessesPerVector = 1;
+  
+  static_assert(sizeof_bits<Element>::value >= 8, 
+    "DGRAD requires elements of size 8b or larger.");
+  
+  //
+  // Parameters structure
+  //
+
+  struct Params {
+
+    Layout layout;
+
+    //
+    // Methods (problem_size is accepted by the constructor but not used by this Params)
+    //
+    CUTLASS_HOST_DEVICE
+    Params() { }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      Conv3dProblemSize const &problem_size, 
+      Layout const &layout
+    ): layout(layout) {
+    }
+  };
+
+private:
+  // params_ and problem_size_ are held by reference; the referenced objects must outlive the iterator.
+  Params const &params_;
+  Conv3dProblemSize const &problem_size_;
+  LongIndex iteration_contiguous_;  // current access index along the contiguous (c) dimension
+  LongIndex iteration_strided_;     // current access index along the strided (k) dimension
+  char const *pointer_;             // byte pointer to the start of the filter tensor (plus any added offset)
+
+  // For a fixed filter position (t,r,s) find and fill offset_k_, offset_c_ in strided and contiguous dimension 
+  int filter_t_;
+  int filter_r_;
+  int filter_s_;
+  int offset_k_[ThreadMap::Iterations::kStrided]; 
+  int offset_c_[ThreadMap::Iterations::kContiguous];
+
+public:
+  /// Records this thread's channel (c) and filter (k) offsets and starts at the first access.
+  CUTLASS_HOST_DEVICE
+  Conv3dDgradFilterTileAccessIteratorAnalytic(
+    Params const &params, 
+    Conv3dProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    MatrixCoord const &threadblock_offset = MatrixCoord()
+  ):
+    params_(params), 
+    problem_size_(problem_size), 
+    pointer_(reinterpret_cast<char const *>(ptr)),
+    filter_t_(0), 
+    filter_r_(0),
+    filter_s_(0) {
+
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+      offset_c_[c] = threadblock_offset.column() + thread_coord.contiguous() 
+        + c * ThreadMap::Delta::kContiguous;
+    }
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      offset_k_[s] = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
+    }
+
+    // Begin at access 0 so at()/valid()/get() never read indeterminate iteration indices.
+    set_iteration_index(0);
+  }
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+  /// Steps to the next filter position (s fastest, then r, then t); after the last one, moves on in K.
+  CUTLASS_HOST_DEVICE
+  void advance() {
+    // moves to the next tile
+    ++filter_s_;
+    if (filter_s_ < problem_size_.S) {
+      return;
+    }
+    filter_s_ = 0;
+    ++filter_r_;
+    if (filter_r_ < problem_size_.R) {
+      return;
+    }
+    filter_r_ = 0;
+    ++filter_t_;
+    if (filter_t_ < problem_size_.T) {
+      return;
+    }
+    filter_t_ = 0;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      offset_k_[s] += Shape::kRow * problem_size_.split_k_slices;
+    }
+  }
+
+  /// Returns the coordinate in the filter tensor w that is currently pointed to
+  /// by the iterator.
+  CUTLASS_HOST_DEVICE
+  TensorCoord at() const {
+
+    int c = offset_c_[iteration_contiguous_];
+    int k = offset_k_[iteration_strided_];
+
+    return TensorCoord(k, filter_t_, filter_r_, filter_s_, c);
+  }
+
+  /// Returns true if the current coordinate is within the filter tensor w
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+
+    TensorCoord coord = at();
+
+    return coord.n() < problem_size_.K && coord.c() < problem_size_.C;
+  }
+
+  /// Returns a pointer to the vector starting at the current coordinate
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+
+    TensorCoord coord = at();
+    LongIndex offset = params_.layout(coord);
+
+    return reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
+  }
+
+  /// Increments to the next memory access
+  CUTLASS_HOST_DEVICE
+  Conv3dDgradFilterTileAccessIteratorAnalytic &operator++() {
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    iteration_strided_ = 0;
+ 
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv3dProblemSize const &problem_size) {
+
+    // check alignment constraint on iterator's contiguous dimension
+    if (problem_size.C % AccessType::kElements) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    return Status::kSuccess;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_optimized.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_optimized.h
new file mode 100755
index 000000000..2d5837dd3
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_optimized.h
@@ -0,0 +1,289 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) 
+    matrix from memory.
+
+    This iterator assumes TensorNDHWC layout of tensors in Global Memory.
+
+    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
+    backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv3d_problem_size.h"
+
+#include "cutlass/conv/threadblock/conv3d_params.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Shape_,
+  typename Element_,
+  typename ThreadMap_,
+  conv::StrideSupport StrideSupport_ = conv::StrideSupport::kUnity
+>
+class Conv3dDgradFilterTileAccessIteratorOptimized {
+public:
+  // Tile access iterator over the filter tensor (TensorNDHWC) for Dgrad. Bounds checks are evaluated
+  // once in the constructor and cached as one predicate bit per (strided, contiguous) access.
+  //
+  // Types
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::TensorNDHWC;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized;
+  static StrideSupport const kStrideSupport = StrideSupport_;
+  static int const kConvDim = 3;
+  using ConvProblemSize = typename conv::Conv3dProblemSize;
+  static int const kAccessesPerVector = 1;
+  
+  //
+  // Parameters structure
+  //
+
+  struct Params : Conv3dDgradFilterIteratorOptimizedParams {
+
+    //
+    // Methods
+    //
+    CUTLASS_HOST_DEVICE
+    Params() { }
+
+    CUTLASS_HOST_DEVICE
+    Params(Conv3dDgradFilterIteratorOptimizedParams const &base): 
+      Conv3dDgradFilterIteratorOptimizedParams(base) { }
+      
+    CUTLASS_HOST_DEVICE
+    Params(
+      Conv3dProblemSize const &problem_size, 
+      Layout const &layout
+    ):
+      Conv3dDgradFilterIteratorOptimizedParams(
+        problem_size,
+        layout,
+        sizeof_bits<Element>::value,
+        {Shape::kRow, Shape::kColumn},
+        ThreadMap::kThreads,
+        ThreadMap::kElementsPerAccess,
+        {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided},
+        {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided}
+      ) { }
+
+  };
+
+private:
+  // params_ and problem_size_ are held by reference; the referenced objects must outlive the iterator.
+  Conv3dDgradFilterIteratorOptimizedParams const &params_;
+  Conv3dProblemSize const &problem_size_;
+  LongIndex iteration_contiguous_;  // current access index along the contiguous (c) dimension
+  LongIndex iteration_strided_;     // current access index along the strided (k) dimension
+  char const *pointer_;             // byte pointer; moved by advance() and by operator++ on strided steps
+
+  uint32_t predicates_;             // bit (c + s * Iterations::kContiguous) is 1 when access (s, c) is in bounds
+  int filter_trs_;                  // filter position counter in [0, params_.TRS)
+  int filter_k_;                    // filter index k of this thread's first strided access
+
+  //
+  // Assertions
+  //
+
+  // We map predicates into bits packed in this uint32_t container
+  static_assert(ThreadMap::Iterations::kStrided *
+    ThreadMap::Iterations::kContiguous < sizeof(predicates_) * 8,
+    "Currently, the number of loads per iteration is limited by the size of the predicates container.");
+
+public:
+  /// Positions the iterator at this thread's first access and precomputes all access predicates.
+  CUTLASS_HOST_DEVICE
+  Conv3dDgradFilterTileAccessIteratorOptimized(
+    Conv3dDgradFilterIteratorOptimizedParams const &params,
+    Conv3dProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    MatrixCoord const &threadblock_offset = MatrixCoord()
+  ):
+    params_(params), 
+    problem_size_(problem_size),
+    pointer_(reinterpret_cast<char const *>(ptr)),
+    predicates_(0),
+    filter_trs_(0),
+    filter_k_(0) {
+
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+
+    filter_k_ = threadblock_offset.row() + thread_coord.strided();
+    Index column = threadblock_offset.column() + thread_coord.contiguous();
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+
+        int filter_k = filter_k_ + s * ThreadMap::Delta::kStrided;
+        int filter_c = column + c * ThreadMap::Delta::kContiguous;
+
+        uint32_t pred = ((filter_k < problem_size_.K && filter_c < problem_size_.C) ? 1u : 0);
+
+        int pred_idx = c + s * ThreadMap::Iterations::kContiguous;
+        
+        predicates_ |= (pred << pred_idx);
+      }
+    }
+
+    pointer_ += (
+      filter_k_ * params.layout.stride()[3] + column
+    ) * sizeof_bits<Element>::value / 8;
+
+    set_iteration_index(0);
+  }
+
+  /// Overrides the internal iteration index; NOTE(review): pointer_ is left untouched by this call
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+
+    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+  /// Moves to the next filter position; after the last of params_.TRS positions, moves on in K.
+  CUTLASS_HOST_DEVICE
+  void advance() {
+
+    LongIndex next = params_.inc_next_trs;
+
+    // moves to the next tile
+    ++filter_trs_;
+    if (filter_trs_ == params_.TRS) {
+
+      filter_trs_ = 0;
+      next = params_.inc_next_k;
+      filter_k_ += params_.filter_k_delta;
+    }
+
+    // Clear predicates if needed
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      if (filter_k_ + s * ThreadMap::Delta::kStrided >= problem_size_.K) {
+        uint32_t kClearMask = ((1u << ThreadMap::Iterations::kContiguous) - 1) << (s * ThreadMap::Iterations::kContiguous);
+
+        predicates_ = (predicates_ & (~kClearMask));
+      }
+    }
+      
+    pointer_ += next;
+  }
+
+  /// Returns true if the current coordinate is within the filter tensor W
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+    LongIndex pred_idx = iteration_contiguous_ + iteration_strided_ * ThreadMap::Iterations::kContiguous;
+    return (predicates_ & (1u << pred_idx));
+  }
+
+  /// Returns a pointer to the vector starting at the current coordinate
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+    return reinterpret_cast<AccessType const *>(pointer_ + 
+      iteration_contiguous_ * ThreadMap::Delta::kContiguous * sizeof_bits<Element>::value / 8);
+  }
+
+  /// Increments to the next memory access
+  CUTLASS_HOST_DEVICE
+  Conv3dDgradFilterTileAccessIteratorOptimized &operator++() {
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;
+    
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      // get() only adds the contiguous offset, so the strided step is applied to pointer_ itself
+      // Move to the next K coordinate within the tile
+      pointer_ += params_.inc_next_strided;
+
+      return *this;
+    }
+    iteration_strided_ = 0;
+ 
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv3dProblemSize const &problem_size) {
+
+    // check alignment constraint on iterator's contiguous dimension
+    if (problem_size.C % AccessType::kElements) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    return Status::kSuccess;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h
new file mode 100755
index 000000000..30b7f2fcf
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h
@@ -0,0 +1,343 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile) 
+    matrix from memory.
+
+    This iterator assumes TensorNDHWC layout of tensors in Global Memory.
+
+    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
+    backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv3d_problem_size.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <
+  typename Shape_,
+  typename Element_,
+  typename ThreadMap_,
+  conv::StrideSupport StrideSupport_ = conv::StrideSupport::kStrided
+>
+class Conv3dDgradOutputGradientTileAccessIteratorAnalytic;
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Strided dgrad specialization of Conv3dDgradOutputGradientTileAccessIteratorAnalytic: it needs
+// special handling using unscaled coordinates (see unscaled_at_() and valid()).
+template <
+  typename Shape_,
+  typename Element_,
+  typename ThreadMap_
+>
+class Conv3dDgradOutputGradientTileAccessIteratorAnalytic <
+  Shape_,
+  Element_,
+  ThreadMap_,
+  conv::StrideSupport::kStrided
+> {
+public:
+
+  //
+  // Types
+  //
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::TensorNDHWC;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
+  static int const kConvDim = 3;
+  using ConvProblemSize = typename conv::Conv3dProblemSize;
+  static int const kAccessesPerVector = 1;
+
+  static_assert(sizeof_bits<Element>::value >= 8,
+    "DGRAD requires elements of size 8b or greater.");
+
+  //
+  // Simplifying assertions
+  //
+
+  static_assert(ThreadMap::Iterations::kContiguous == 1,
+    "Require Iterations::kContiguous == 1");
+
+  //
+  // Parameters structure
+  //
+
+  struct Params {
+
+    /// Layout functor; get() applies it to an NDHWC coordinate to obtain a linear offset
+    Layout layout;
+
+    //
+    // Methods
+    //
+    CUTLASS_HOST_DEVICE
+    Params() { }
+
+    /// Stores the layout; problem_size is not used by this Params
+    CUTLASS_HOST_DEVICE
+    Params(
+      ConvProblemSize const &problem_size,
+      Layout const &layout
+    ): layout(layout) { }
+  };
+
+private:
+  // params_ and problem_size_ are references: the caller's objects must outlive the iterator
+  Params const &params_;
+  ConvProblemSize const &problem_size_;
+  LongIndex iteration_contiguous_;
+  LongIndex iteration_strided_;
+  char const *pointer_;
+
+  int filter_k_;
+  int filter_t_;
+  int filter_r_;
+  int filter_s_;
+
+  // (n, d, h, w) addressed by each strided access; filled in once by the constructor
+  int offset_n_[ThreadMap::Iterations::kStrided];
+  int offset_d_[ThreadMap::Iterations::kStrided];
+  int offset_w_[ThreadMap::Iterations::kStrided];
+  int offset_h_[ThreadMap::Iterations::kStrided];
+
+private:
+  /// Returns the coordinate in the output tensor Dy that is currently pointed to
+  /// by the iterator but DOES NOT scale by the convolution stride. This is needed
+  /// to compute predicates in the valid() method. The return value of the public at()
+  /// method is correctly scaled.
+  CUTLASS_HOST_DEVICE
+  TensorCoord unscaled_at_() const {
+    int n = offset_n_[iteration_strided_];
+    int d = offset_d_[iteration_strided_];
+    int h = offset_h_[iteration_strided_];
+    int w = offset_w_[iteration_strided_];
+
+    int t = filter_t_;
+    int r = filter_r_;
+    int s = filter_s_;
+    // In convolution mode the filter indices are mirrored in t, r and s
+    if (problem_size_.mode == Mode::kConvolution) {
+      t = (problem_size_.T - 1 - t);
+      r = (problem_size_.R - 1 - r);
+      s = (problem_size_.S - 1 - s);
+    }
+
+    int z = (d + problem_size_.pad_d - t * problem_size_.dilation_d);
+    int p = (h + problem_size_.pad_h - r * problem_size_.dilation_h);
+    int q = (w + problem_size_.pad_w - s * problem_size_.dilation_w);
+
+    return TensorCoord(n, z, p, q, filter_k_);
+  }
+
+public:
+  /// Builds the iterator for thread thread_idx; params and problem_size are kept by reference
+  CUTLASS_HOST_DEVICE
+  Conv3dDgradOutputGradientTileAccessIteratorAnalytic(
+    Params const &params,
+    ConvProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    MatrixCoord const &threadblock_offset = MatrixCoord()     // threadblock offset - units are whole CTA tiles
+  ):
+    params_(params),
+    problem_size_(problem_size),
+    iteration_contiguous_(0),   // start at access 0 rather than leaving the indices
+    iteration_strided_(0),      // indeterminate until set_iteration_index() is called
+    pointer_(reinterpret_cast<char const *>(ptr)),
+    filter_k_(0), filter_t_(0),
+    filter_r_(0), filter_s_(0) {
+
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+
+    filter_k_ = threadblock_offset.column() + thread_coord.contiguous();
+    // Split each linear NDHW offset of this thread into its (n, d, h, w) components
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      int offset_ndhw = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
+
+      offset_n_[s] = offset_ndhw / (problem_size_.D * problem_size_.H * problem_size_.W);
+      int residual = offset_ndhw % (problem_size_.D * problem_size_.H * problem_size_.W);
+
+      offset_d_[s] = residual / (problem_size_.H * problem_size_.W);
+      residual     = residual % (problem_size_.H * problem_size_.W);
+
+      offset_h_[s] = residual / problem_size_.W;
+      offset_w_[s] = residual % problem_size_.W;
+    }
+  }
+
+  /// Convenience factory: forwards problem_size and layout to the Params constructor
+  CUTLASS_HOST_DEVICE
+  static Params getParams(Conv3dProblemSize const &problem_size, Layout const &layout) {
+    return Params(problem_size, layout);
+  }
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+
+  CUTLASS_HOST_DEVICE
+  void advance() {
+    // Step the filter position: s first, then r, then t; K moves only once all three wrap
+    ++filter_s_;
+    if (filter_s_ < problem_size_.S) {
+      return;
+    }
+    filter_s_  = 0;
+    ++filter_r_;
+    if (filter_r_ < problem_size_.R) {
+      return;
+    }
+    filter_r_ = 0;
+    ++filter_t_;
+    if (filter_t_ < problem_size_.T) {
+      return;
+    }
+    filter_t_ = 0;
+
+    filter_k_ += Shape_::kColumn * problem_size_.split_k_slices;
+  }
+
+  /// Returns the coordinate in the output tensor Dy that is currently pointed to
+  /// by the iterator.
+  CUTLASS_HOST_DEVICE
+  TensorCoord at() const {
+
+    TensorCoord coord = unscaled_at_();
+
+    return TensorCoord(
+      coord.n(),
+      coord.d() / problem_size_.stride_d,
+      coord.h() / problem_size_.stride_h,
+      coord.w() / problem_size_.stride_w,
+      coord.c());
+  }
+
+  /// Returns true if the current coordinate is within the output tensor Dy
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+    // Rejects positions whose unscaled d/h/w is not a stride multiple, then bounds-checks the scaled coordinate
+    TensorCoord unscaled_coord = unscaled_at_();
+    TensorCoord coord = at();
+
+    return
+      !(unscaled_coord.d() % problem_size_.stride_d) &&
+      !(unscaled_coord.h() % problem_size_.stride_h) &&
+      !(unscaled_coord.w() % problem_size_.stride_w) &&
+      coord.n() < problem_size_.N &&
+      coord.d() >= 0 && coord.d() < problem_size_.Z &&
+      coord.h() >= 0 && coord.h() < problem_size_.P &&
+      coord.w() >= 0 && coord.w() < problem_size_.Q &&
+      coord.c() < problem_size_.K;
+  }
+
+  /// Returns a pointer to the vector starting at the current coordinate
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+
+    TensorCoord coord = at();
+    LongIndex offset = params_.layout(coord);
+
+    return reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
+  }
+
+  /// Increments to the next memory access
+  CUTLASS_HOST_DEVICE
+  Conv3dDgradOutputGradientTileAccessIteratorAnalytic &operator++() {
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    iteration_strided_ = 0;
+
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(ConvProblemSize const &problem_size) {
+
+    // check alignment constraint on iterator's contiguous dimension
+    if (problem_size.K % AccessType::kElements) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    return Status::kSuccess;
+  }
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h
new file mode 100755
index 000000000..5a53c8cbd
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h
@@ -0,0 +1,489 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile) 
+    matrix from memory.
+
+    This iterator assumes TensorNDHWC layout of tensors in Global Memory.
+
+    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
+    backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv3d_problem_size.h"
+#include "cutlass/conv/threadblock/conv3d_params.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Shape_,
+  typename Element_,
+  typename ThreadMap_,
+  conv::StrideSupport StrideSupport_ = conv::StrideSupport::kUnity
+>
+class Conv3dDgradOutputGradientTileAccessIteratorOptimized {
+public:
+
+  static_assert(StrideSupport_ == conv::StrideSupport::kUnity,
+    "Only unit-stride dgrad is supported at this time.");
+
+  //
+  // Types
+  //
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::TensorNDHWC;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kUnity;
+  static int const kConvDim = 3;
+  using ConvProblemSize = typename conv::Conv3dProblemSize;
+  using Coord3D = Coord<3>;
+  static int const kAccessesPerVector = 1;
+  using Mask = uint64_t;
+
+  //
+  // Simplifying assertions
+  //
+  static_assert(ThreadMap::Iterations::kContiguous == 1,
+    "Require Iterations::kContiguous == 1");
+
+  //
+  // Parameters structure
+  //
+
+  using Params = Conv3dDgradOutputGradientIteratorOptimizedParams;
+
+private:
+
+  Params const &params_;                   // reference: the caller's object must outlive the iterator
+  ConvProblemSize const &problem_size_;    // reference: the caller's object must outlive the iterator
+  LongIndex iteration_contiguous_;
+  LongIndex iteration_strided_;
+
+  // One pointer per access
+  char const *pointer_[ThreadMap::Iterations::kStrided];
+
+  // current filter position (t, r, s) and K coordinate
+  int filter_t_;
+  int filter_r_;
+  int filter_s_;
+  int filter_k_;
+
+  // Per-access validity bitmasks: [0] over t, [1] over r, [2] over s; bit i covers filter index i
+  Index masks_[ThreadMap::Iterations::kStrided][3];
+
+public:
+  /// Builds the iterator for thread thread_idx: sets up one pointer per strided access and the predicate masks
+  CUTLASS_HOST_DEVICE
+  Conv3dDgradOutputGradientTileAccessIteratorOptimized(
+    Params const &params,
+    ConvProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    MatrixCoord const &threadblock_offset = MatrixCoord()       // tile index - units are threadblock-scoped tiles
+  ):
+    params_(params),
+    problem_size_(problem_size),
+    filter_t_(0),
+    filter_r_(0),
+    filter_s_(0),
+    filter_k_(0) {
+
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+
+    filter_k_ = threadblock_offset.column() + thread_coord.contiguous();
+
+    int offset_n[ThreadMap::Iterations::kStrided];
+    int offset_d[ThreadMap::Iterations::kStrided];
+    int offset_h[ThreadMap::Iterations::kStrided];
+    int offset_w[ThreadMap::Iterations::kStrided];
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+
+      pointer_[s] = reinterpret_cast<char const *>(ptr);
+
+      int offset_ndhw = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
+
+      // The subsequent fast_divmod() operations are equivalent to the following logical computation:
+      //
+      //
+      //  offset_n[s] = offset_ndhw / (problem_size_.D * problem_size_.H * problem_size_.W);
+      //  int residual = offset_ndhw % (problem_size_.D * problem_size_.H * problem_size_.W);
+      //
+      //
+      //  offset_d[s] = residual / (problem_size_.H * problem_size_.W);
+      //  residual    = residual % (problem_size_.H * problem_size_.W);
+      //
+      //  offset_h[s] = residual / problem_size_.W;
+      //  offset_w[s] = residual % problem_size_.W;
+      //
+
+      int residual;
+
+      // input: (ndhw offset) output: (n offset and residual (dhw offset))
+      params_.dhw_divmod(offset_n[s], residual, offset_ndhw);
+      // input: (dhw offset) output: (d offset and residual (hw))
+      params_.hw_divmod(offset_d[s], residual, residual);
+      // input: (hw offset) output: (h offset and residual (w offset))
+      params_.w_divmod(offset_h[s], offset_w[s], residual);
+
+      TensorCoord coord = at_(offset_n[s], offset_d[s], offset_h[s], offset_w[s], 0, 0, 0);
+
+      pointer_[s] += params_.layout(coord) * sizeof_bits<Element>::value / 8;
+    }
+
+    clear_mask();
+    // Build the masks: bit t / r / s is set when that filter offset lands inside Dy along z / p / q
+    CUTLASS_PRAGMA_NO_UNROLL
+    for (int t = 0; t < problem_size_.T; ++t) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) {
+
+        int t_ = t;
+        if (problem_size_.mode == Mode::kConvolution) {
+          t_ = problem_size_.T - 1 - t;
+        }
+
+        int z = offset_d[s_idx] + problem_size_.pad_d - t_ * problem_size_.dilation_d;
+
+        bool pred = (offset_n[s_idx] < problem_size_.N && z >= 0 && z < problem_size_.Z);
+        masks_[s_idx][0] |= (pred << t);
+      }
+    }
+
+    CUTLASS_PRAGMA_NO_UNROLL
+    for (int r = 0; r < problem_size_.R; ++r) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) {
+
+        int r_ = r;
+        if (problem_size_.mode == Mode::kConvolution) {
+          r_ = problem_size_.R - 1 - r;
+        }
+
+        int p = offset_h[s_idx] + problem_size_.pad_h - r_ * problem_size_.dilation_h;
+
+        bool pred = (p >= 0 && p < problem_size_.P);
+        masks_[s_idx][1] |= (pred << r);
+      }
+    }
+
+    CUTLASS_PRAGMA_NO_UNROLL
+    for (int s = 0; s < problem_size_.S; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) {
+
+        int s_ = s;
+        if (problem_size_.mode == Mode::kConvolution) {
+          s_ = problem_size_.S - 1 - s;
+        }
+
+        int q = offset_w[s_idx] + problem_size_.pad_w - s_ * problem_size_.dilation_w;
+
+        bool pred = (q >= 0 && q < problem_size_.Q);
+        masks_[s_idx][2] |= (pred << s);
+      }
+    }
+
+    if (filter_k_ >= problem_size.K) {
+      clear_mask();
+    }
+
+    set_iteration_index(0);
+
+  }
+
+  /// Packs the problem size, layout and this iterator's tile/thread-map constants into a Params object
+  CUTLASS_HOST_DEVICE
+  static Params getParams(Conv3dProblemSize const &problem_size, Layout const &layout) {
+    return Params(problem_size,
+                  layout,
+                  sizeof_bits<Element>::value,
+                  {Shape::kRow, Shape::kColumn},
+                  ThreadMap::kThreads,
+                  ThreadMap::kElementsPerAccess,
+                  {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided},
+                  {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided});
+  }
+
+private:
+
+  /// Returns the coordinate in the output gradient tensor dy that corresponds to
+  /// activation ndhw and filter position k, t, r, s
+  CUTLASS_HOST_DEVICE
+  TensorCoord at_(int n, int d, int h, int w, int t, int r, int s) const {
+
+    if (problem_size_.mode == Mode::kConvolution) {
+      t = problem_size_.T - 1 - t;
+      r = problem_size_.R - 1 - r;
+      s = problem_size_.S - 1 - s;
+    }
+
+    int z = d + problem_size_.pad_d - t * problem_size_.dilation_d;
+    int p = h + problem_size_.pad_h - r * problem_size_.dilation_h;
+    int q = w + problem_size_.pad_w - s * problem_size_.dilation_w;
+
+    return TensorCoord(n, z, p, q, filter_k_);
+  }
+
+
+  /// Adds a byte offset to the pointer of every strided access
+  CUTLASS_HOST_DEVICE
+  void add_byte_offset_(LongIndex byte_offset) {
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      pointer_[s] += byte_offset;
+    }
+  }
+
+  /// Clears the predicates when `clear` is true; leaves them unchanged otherwise
+  CUTLASS_HOST_DEVICE
+  void clear_mask_(bool clear) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+
+      // We are using inline PTX assembly here to avoid an CUDA C++ compilation
+      // artifact in which control flow instructions are generated. Instead, our
+      // intent is to predicate the mov instructions.
+      #if defined(__CUDA_ARCH__)
+      asm volatile(
+          "{\n"
+          "  .reg .pred p;\n"
+          "  .reg .u32  m;"
+          "  mov.u32 m, %2;"
+          "  setp.ne.b32 p, %1, 0;\n"
+          "  @p mov.u32 m, 0;\n"
+          "  mov.u32 %0, m;\n"
+          "}\n"
+        :
+          "=r"(masks_[s][0])
+       :
+          "r"((int)clear),
+          "r"(masks_[s][0])
+      );
+      asm volatile(
+          "{\n"
+          "  .reg .pred p;\n"
+          "  .reg .u32  m;"
+          "  mov.u32 m, %2;"
+          "  setp.ne.b32 p, %1, 0;\n"
+          "  @p mov.u32 m, 0;\n"
+          "  mov.u32 %0, m;\n"
+          "}\n"
+        :
+          "=r"(masks_[s][1])
+       :
+          "r"((int)clear),
+          "r"(masks_[s][1])
+      );
+      asm volatile(
+          "{\n"
+          "  .reg .pred p;\n"
+          "  .reg .u32  m;"
+          "  mov.u32 m, %2;"
+          "  setp.ne.b32 p, %1, 0;\n"
+          "  @p mov.u32 m, 0;\n"
+          "  mov.u32 %0, m;\n"
+          "}\n"
+        :
+          "=r"(masks_[s][2])
+       :
+          "r"((int)clear),
+          "r"(masks_[s][2])
+      );
+      #else
+        if (clear) {
+          masks_[s][0] = 0;
+          masks_[s][1] = 0;
+          masks_[s][2] = 0;
+        }
+      #endif
+    }
+  }
+
+public:
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    add_byte_offset_(pointer_offset * sizeof_bits<Element>::value / 8);
+  }
+
+  /// Moves to the next filter position (s, then r, then t) and, after the last one, to the next K step
+  CUTLASS_HOST_DEVICE
+  void advance() {
+
+    int next_idx = 0;   // which dimension advanced: 0 = s, 1 = r, 2 = t, 3 = k; selects params_.inc_next[]
+
+    // moves to the next tile
+    ++filter_s_;
+    if (filter_s_ == problem_size_.S) {
+
+      filter_s_ = 0;
+      ++filter_r_;
+      next_idx = 1;
+
+      if (filter_r_ == problem_size_.R) {
+        filter_r_ = 0;
+        ++filter_t_;
+
+        if (filter_t_ < problem_size_.T) {
+          next_idx = 2;
+        }
+        else {
+          filter_t_ = 0;
+          next_idx = 3;
+        }
+      }
+    }
+
+    add_byte_offset_(params_.inc_next[next_idx]);
+
+    if (next_idx == 3) {
+      filter_k_ += params_.filter_k_delta;
+    }
+
+    clear_mask_(filter_k_ >= problem_size_.K);
+  }
+
+  /// Clears the predicates
+  CUTLASS_HOST_DEVICE
+  void clear_mask() {
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      masks_[s][0] = Index(0);
+      masks_[s][1] = Index(0);
+      masks_[s][2] = Index(0);
+    }
+  }
+
+  /// Returns true if the current access is enabled in all three (t, r, s) predicate masks
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+
+    return
+      (masks_[iteration_strided_][0] & (Index(1) << filter_t_)) &&
+      (masks_[iteration_strided_][1] & (Index(1) << filter_r_)) &&
+      (masks_[iteration_strided_][2] & (Index(1) << filter_s_));
+  }
+
+  /// Returns a pointer to the vector starting at the current coordinate
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+
+    return reinterpret_cast<AccessType const *>(pointer_[iteration_strided_]);
+  }
+
+  /// Increments to the next memory access
+  CUTLASS_HOST_DEVICE
+  Conv3dDgradOutputGradientTileAccessIteratorOptimized &operator++() {
+
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;
+
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    iteration_strided_ = 0;
+
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(ConvProblemSize const &problem_size) {
+
+    // This is specialized for unit stride
+    if (problem_size.stride() != Coord3D({1, 1, 1})) {
+      return Status::kErrorNotSupported;
+    }
+
+    // check alignment constraint on iterator's contiguous dimension
+    if (problem_size.K % AccessType::kElements) {
+      return Status::kErrorNotSupported;
+    }
+
+    // Limit on filter size: the predicate masks are manipulated as 32-bit values (see clear_mask_())
+    if (problem_size.T > 32 || problem_size.R > 32 || problem_size.S > 32) {
+      return Status::kErrorNotSupported;
+    }
+    return Status::kSuccess;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h
new file mode 100755
index 000000000..f0f9a86a3
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h
@@ -0,0 +1,291 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM A (activation tile) 
+    matrix from memory.
+
+    This iterator assumes TensorNDHWC layout of tensors in Global Memory.
+
+    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
+    backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv3d_problem_size.h"
+#include "cutlass/conv/threadblock/conv3d_params.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Shape_,
+  typename Element_,
+  typename ThreadMap_
+>
+class Conv3dFpropActivationTileAccessIteratorAnalytic {
+public:
+  
+  //
+  // Types
+  //
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::TensorNDHWC;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
+  static int const kConvDim = 3;
+  using ConvProblemSize = typename conv::Conv3dProblemSize;
+  static int const kAccessesPerVector = 1;
+  
+  //
+  // Simplifying assertions
+  //
+  static_assert(ThreadMap::Iterations::kContiguous == 1,
+    "Require Iterations::kContiguous == 1");
+
+  //
+  // Parameters structure
+  //
+
+  using Params = Conv3dAnalyticParams<Layout>;
+
+private:
+
+  Params const &params_;
+  ConvProblemSize const &problem_size_;
+  LongIndex iteration_contiguous_;
+  LongIndex iteration_strided_;
+  char const *pointer_;
+
+  int filter_t_;
+  int filter_r_;
+  int filter_s_;
+  int filter_c_;
+
+  int offset_n_[ThreadMap::Iterations::kStrided];
+  int offset_z_[ThreadMap::Iterations::kStrided];
+  int offset_p_[ThreadMap::Iterations::kStrided];
+  int offset_q_[ThreadMap::Iterations::kStrided];
+
+public:
+
+  CUTLASS_HOST_DEVICE
+  Conv3dFpropActivationTileAccessIteratorAnalytic(
+    Params const &params, 
+    ConvProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    MatrixCoord const &threadblock_offset = MatrixCoord()       // tile index - units are threadblock-scoped tiles
+  ):
+    params_(params), 
+    problem_size_(problem_size), 
+    pointer_(reinterpret_cast<char const *>(ptr)), 
+    filter_t_(0),
+    filter_r_(0), 
+    filter_s_(0),
+    filter_c_(0) {
+
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+
+    filter_c_ = threadblock_offset.column() + thread_coord.contiguous();
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      int offset_nzpq = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
+    
+      offset_n_[s] = offset_nzpq / (problem_size_.Z * problem_size_.P * problem_size_.Q);
+      int residual = offset_nzpq % (problem_size_.Z * problem_size_.P * problem_size_.Q);
+
+      offset_z_[s] = residual / (problem_size_.P * problem_size_.Q);
+      residual     = residual % (problem_size_.P * problem_size_.Q);
+
+      offset_p_[s] = residual / problem_size_.Q;
+      offset_q_[s] = residual % problem_size_.Q;
+    }
+
+    set_iteration_index(0);
+  }
+
+  CUTLASS_HOST_DEVICE
+  static Params getParams(Conv3dProblemSize const &problem_size, Layout const &layout) {
+    return Params(problem_size, layout);
+  }
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+
+  CUTLASS_HOST_DEVICE
+  void advance() {
+    // Moves to the next tile: the filter position (s, r, t) is stepped like an
+    // odometer with s varying fastest; each digit carries into the next one
+    // only when it wraps back to zero.
+
+    if (++filter_s_ >= problem_size_.S) {
+      filter_s_ = 0;
+
+      if (++filter_r_ >= problem_size_.R) {
+        filter_r_ = 0;
+
+        if (++filter_t_ >= problem_size_.T) {
+          filter_t_ = 0;
+
+          // All filter positions visited: move on to the next channel tile.
+          filter_c_ += Shape::kColumn * problem_size_.split_k_slices;
+        }
+      }
+    }
+  }
+
+  /// Returns the coordinate in the activations tensor X that is currently pointed to
+  /// by the iterator.
+  CUTLASS_HOST_DEVICE
+  TensorCoord at() const {
+    int n = offset_n_[iteration_strided_];
+    int z = offset_z_[iteration_strided_];
+    int p = offset_p_[iteration_strided_];
+    int q = offset_q_[iteration_strided_];
+
+    int t = filter_t_;
+    int r = filter_r_;
+    int s = filter_s_;
+
+    if (problem_size_.mode == Mode::kConvolution) {
+      t = (problem_size_.T - 1 - filter_t_);
+      r = (problem_size_.R - 1 - filter_r_);
+      s = (problem_size_.S - 1 - filter_s_);
+    }
+
+    int d = z * problem_size_.stride_d - problem_size_.pad_d + t * problem_size_.dilation_d;
+    int h = p * problem_size_.stride_h - problem_size_.pad_h + r * problem_size_.dilation_h;
+    int w = q * problem_size_.stride_w - problem_size_.pad_w + s * problem_size_.dilation_w;
+
+    return TensorCoord(n, d, h, w, filter_c_);
+  }
+
+  /// Returns true if the current coordinate is within the activations tensor X
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+
+    TensorCoord coord = at();
+
+    return coord.n() < problem_size_.N &&
+      coord.d() >= 0 && coord.d() < problem_size_.D &&
+      coord.h() >= 0 && coord.h() < problem_size_.H &&
+      coord.w() >= 0 && coord.w() < problem_size_.W &&
+      coord.c() < problem_size_.C;
+  }
+
+  /// Returns a pointer to the access vector located at the current coordinate
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+    // Element offset of the current coordinate within the tensor.
+    LongIndex element_offset = params_.layout(at());
+
+    // pointer_ is a byte pointer, so scale the element offset to bytes.
+    LongIndex byte_offset = element_offset * sizeof_bits<Element>::value / 8;
+
+    return reinterpret_cast<AccessType const *>(pointer_ + byte_offset);
+  }
+
+  /// Increments to the next memory access
+  CUTLASS_HOST_DEVICE
+  Conv3dFpropActivationTileAccessIteratorAnalytic &operator++() {
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;
+
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    iteration_strided_ = 0;
+ 
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(ConvProblemSize const &problem_size) {
+
+    // check alignment constraint on iterator's contiguous dimension
+    if (problem_size.C % AccessType::kElements) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    return Status::kSuccess;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h
new file mode 100755
index 000000000..78b270eb9
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h
@@ -0,0 +1,478 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM A (activation tile) 
+    matrix from memory.
+
+    This iterator assumes TensorNDHWC layout of tensors in Global Memory.
+    
+    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
+    backward data gradient (Dgrad), and backward weight gradient (Wgrad).
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv3d_problem_size.h"
+#include "cutlass/conv/threadblock/conv3d_params.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Shape_,
+  typename Element_,
+  typename Layout_,
+  typename ThreadMap_
+>
+class Conv3dFpropActivationTileAccessIteratorOptimized {
+public:
+
+  //
+  // Types
+  //
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = Layout_;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
+  static int const kConvDim = 3;
+  using ConvProblemSize = typename conv::Conv3dProblemSize;
+  static int const kAccessesPerVector = 1;  
+  using Mask = uint64_t;
+
+  //
+  // Simplifying assertions
+  //
+  static_assert(ThreadMap::Iterations::kContiguous == 1,
+    "Require Iterations::kContiguous == 1");
+
+  //
+  // Parameters structure
+  //
+
+  using Params = Conv3dFpropActivationIteratorOptimizedParams<Layout>;
+
+private:
+
+  Conv3dFpropActivationIteratorOptimizedParams<Layout> const &params_;
+  Conv3dProblemSize const &problem_size_;
+  LongIndex iteration_contiguous_;
+  LongIndex iteration_strided_;
+
+  // One pointer per access
+  char const *pointer_[ThreadMap::Iterations::kStrided];
+
+  // current filter position (t, r, s)
+  int filter_t_;
+  int filter_r_;
+  int filter_s_;
+  int filter_c_;
+
+  // mask for t, r, and s
+  Index masks_[ThreadMap::Iterations::kStrided][3];
+
+public:
+
+  CUTLASS_HOST_DEVICE
+  Conv3dFpropActivationTileAccessIteratorOptimized(
+    Conv3dFpropActivationIteratorOptimizedParams<Layout> const &params,
+    Conv3dProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    MatrixCoord const &threadblock_offset = MatrixCoord()       // tile index - units are threadblock-scoped tiles
+  ) :    
+  params_(params), 
+  problem_size_(problem_size),
+  filter_t_(0), 
+  filter_r_(0), 
+  filter_s_(0),
+  filter_c_(0) {
+
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+
+    filter_c_ = threadblock_offset.column() + thread_coord.contiguous();
+
+    int offset_n[ThreadMap::Iterations::kStrided];
+    int offset_z[ThreadMap::Iterations::kStrided];
+    int offset_p[ThreadMap::Iterations::kStrided];
+    int offset_q[ThreadMap::Iterations::kStrided];
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+
+      pointer_[s] = reinterpret_cast<char const *>(ptr);
+ 
+      int offset_nzpq = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
+
+      // The subsequent fast_divmod() operations are equivalent to the following logical computation:
+      //
+      //
+      //  offset_n[s] = offset_nzpq / (problem_size_.Z * problem_size_.P * problem_size_.Q);
+      //  int residual = offset_nzpq % (problem_size_.Z * problem_size_.P * problem_size_.Q);
+      //
+      //  offset_z[s] = residual / (problem_size_.P * problem_size_.Q);
+      //  residual = residual % (problem_size_.P * problem_size_.Q);
+      //
+      //  offset_p[s] = residual / problem_size_.Q;
+      //  offset_q[s] = residual % problem_size_.Q;
+      //
+
+      int residual;
+
+      // input: (nzpq offset) output: (n offset and residual (zpq offset))
+      params.zpq_divmod(offset_n[s], residual, offset_nzpq);
+      // input: (zpq offset) output: (z offset and residual (pq offset))
+      params.pq_divmod(offset_z[s], residual, residual);
+      // input: (pq offset) output: (p offset and residual (q offset))
+      params.q_divmod(offset_p[s], offset_q[s], residual);
+
+      TensorCoord coord = at_(offset_n[s], offset_z[s], offset_p[s], offset_q[s], 0, 0, 0);
+
+      pointer_[s] += params_.layout(coord) * sizeof_bits<Element>::value / 8;
+    }
+
+    clear_mask();
+
+    // mask predicates for filter position T
+    CUTLASS_PRAGMA_NO_UNROLL
+    for (int t = 0; t < problem_size_.T; ++t) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) {
+
+        int t_ = t;
+        if (problem_size_.mode == Mode::kConvolution) {
+          t_ = problem_size_.T - 1 - t;
+        }
+
+        int d = offset_z[s_idx] * problem_size_.stride_d - problem_size_.pad_d + t_ * problem_size_.dilation_d;
+
+        bool pred = (offset_n[s_idx] < problem_size_.N && d >= 0 && d < problem_size_.D);
+        masks_[s_idx][0] |= (pred << t);
+      }
+    }   
+
+    // mask predicates for filter position R
+    CUTLASS_PRAGMA_NO_UNROLL
+    for (int r = 0; r < problem_size_.R; ++r) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) {
+
+        int r_ = r;
+        if (problem_size_.mode == Mode::kConvolution) {
+          r_ = problem_size_.R - 1 - r;
+        }
+
+        int h = offset_p[s_idx] * problem_size_.stride_h - problem_size_.pad_h + r_ * problem_size_.dilation_h;
+
+        bool pred = (h >= 0 && h < problem_size_.H);
+        masks_[s_idx][1] |= (pred << r);
+      }
+    }  
+
+    // mask predicates for filter position S
+    CUTLASS_PRAGMA_NO_UNROLL
+    for (int s = 0; s < problem_size_.S; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) {
+
+        int s_ = s;
+        if (problem_size_.mode == Mode::kConvolution) {
+          s_ = problem_size_.S - 1 - s;
+        }
+
+        int w = offset_q[s_idx] * problem_size_.stride_w - problem_size_.pad_w + s_ * problem_size_.dilation_w;
+
+        bool pred = (w >= 0 && w < problem_size_.W);
+        masks_[s_idx][2] |= (pred << s);
+      }
+    }
+
+    if (filter_c_ >= problem_size.C) {
+      clear_mask();
+    }
+
+    set_iteration_index(0);
+  }
+
+  CUTLASS_HOST_DEVICE
+  static Params getParams(Conv3dProblemSize const &problem_size, Layout const &layout) {
+    return Params(problem_size,
+                  layout,
+                  sizeof_bits<Element>::value,
+                  {Shape::kRow, Shape::kColumn},
+                  ThreadMap::kThreads,
+                  ThreadMap::kElementsPerAccess,
+                  {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided},
+                  {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided});
+  }
+
+private:
+
+  /// Returns the coordinate in the activations tensor X that corresponds to
+  /// output position (n, z, p, q) and filter position (t, r, s)
+  CUTLASS_HOST_DEVICE
+  TensorCoord at_(int n, int z, int p, int q, int t, int r, int s) const {
+    // In kConvolution mode the filter indices are mirrored.
+    if (problem_size_.mode == Mode::kConvolution) {
+      t = problem_size_.T - 1 - t;
+      r = problem_size_.R - 1 - r;
+      s = problem_size_.S - 1 - s;
+    }
+    // Map output position and filter position to input coordinates (stride, padding, dilation).
+    int d = z * problem_size_.stride_d - problem_size_.pad_d + t * problem_size_.dilation_d;
+    int h = p * problem_size_.stride_h - problem_size_.pad_h + r * problem_size_.dilation_h;
+    int w = q * problem_size_.stride_w - problem_size_.pad_w + s * problem_size_.dilation_w;
+
+    return TensorCoord(n, d, h, w, filter_c_);
+  }
+
+  /// Adds an offset, in bytes, to every per-access pointer
+  CUTLASS_HOST_DEVICE
+  void add_byte_offset_(LongIndex byte_offset) {
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      pointer_[s] += byte_offset;
+    }
+  }
+
+
+  /// Clears the predicates
+  CUTLASS_HOST_DEVICE
+  void clear_mask_(bool clear) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+
+      // We are using inline PTX assembly here to avoid an CUDA C++ compilation
+      // artifact in which control flow instructions are generated. Instead, our
+      // intent is to predicate the mov instructions.
+      #if defined(__CUDA_ARCH__)
+      asm volatile(
+          "{\n"
+          "  .reg .pred p;\n"
+          "  .reg .u32  m;"
+          "  mov.u32 m, %2;"
+          "  setp.ne.b32 p, %1, 0;\n"
+          "  @p mov.u32 m, 0;\n"
+          "  mov.u32 %0, m;\n"
+          "}\n" 
+        :
+          "=r"(masks_[s][0])
+       : 
+          "r"((int)clear),
+          "r"(masks_[s][0])
+      );
+      asm volatile(
+          "{\n"
+          "  .reg .pred p;\n"
+          "  .reg .u32  m;"
+          "  mov.u32 m, %2;"
+          "  setp.ne.b32 p, %1, 0;\n"
+          "  @p mov.u32 m, 0;\n"
+          "  mov.u32 %0, m;\n"
+          "}\n" 
+        :
+          "=r"(masks_[s][1])
+       : 
+          "r"((int)clear),
+          "r"(masks_[s][1])
+      );
+      asm volatile(
+          "{\n"
+          "  .reg .pred p;\n"
+          "  .reg .u32  m;"
+          "  mov.u32 m, %2;"
+          "  setp.ne.b32 p, %1, 0;\n"
+          "  @p mov.u32 m, 0;\n"
+          "  mov.u32 %0, m;\n"
+          "}\n" 
+        :
+          "=r"(masks_[s][2])
+       : 
+          "r"((int)clear),
+          "r"(masks_[s][2])
+      );
+      #else
+        if (clear) {
+          masks_[s][0] = 0;
+          masks_[s][1] = 0;
+          masks_[s][2] = 0;
+        }
+      #endif
+    }
+  }
+
+public:
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    add_byte_offset_(pointer_offset * sizeof_bits<Element>::value / 8);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void advance() { 
+
+    int next_idx = 0;
+ 
+    // moves to the next tile
+    ++filter_s_;
+    if (filter_s_ == problem_size_.S) {
+      
+      filter_s_ = 0;
+      ++filter_r_;
+      next_idx = 1;
+
+      if (filter_r_ == problem_size_.R) {
+        filter_r_ = 0;
+        ++filter_t_;
+
+        if (filter_t_ < problem_size_.T) {
+          next_idx = 2;
+        } 
+        else {
+          filter_t_ = 0;
+          next_idx = 3;
+        } 
+      }
+    }
+
+    add_byte_offset_(params_.inc_next[next_idx]);
+      
+    if (next_idx == 3) {  
+      filter_c_ += params_.filter_c_delta;
+    }
+
+    clear_mask_(filter_c_ >= problem_size_.C);
+  }
+
+  /// Clears the predicates
+  CUTLASS_HOST_DEVICE
+  void clear_mask() {
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      masks_[s][0] = Mask(0);
+      masks_[s][1] = Mask(0);
+      masks_[s][2] = Mask(0);
+    }
+  }
+
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+
+    return 
+      (masks_[iteration_strided_][0] & (Index(1) << filter_t_)) &&
+      (masks_[iteration_strided_][1] & (Index(1) << filter_r_)) &&
+      (masks_[iteration_strided_][2] & (Index(1) << filter_s_));
+  }
+
+  /// Returns a pointer to the vector starting at the current coordinate
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+
+    return reinterpret_cast<AccessType const *>(pointer_[iteration_strided_]);
+  }
+
+  /// Increments to the next memory access
+  CUTLASS_HOST_DEVICE
+  Conv3dFpropActivationTileAccessIteratorOptimized &operator++() {
+
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;
+
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    iteration_strided_ = 0;
+ 
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv3dProblemSize const &problem_size) {
+
+    // check alignment constraint on iterator's contiguous dimension
+    if (problem_size.C % AccessType::kElements) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    // Conv3dFpropActivationTileAccessIteratorOptimized has constraint on filter positions 
+    // due to the number of mask bits.
+    if (problem_size.T > 32 || problem_size.R > 32 || problem_size.S > 32) {
+      return Status::kErrorNotSupported;
+    }
+    return Status::kSuccess;
+  }
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h
new file mode 100755
index 000000000..9f04adc40
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h
@@ -0,0 +1,259 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) 
+    matrix from memory.
+
+    This iterator assumes TensorNDHWC layout of tensors in Global Memory.
+
+    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
+    backward data gradient (Dgrad), and backward weight gradient (Wgrad).
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv3d_problem_size.h"
+#include "cutlass/conv/threadblock/conv3d_params.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Shape_,
+  typename Element_,
+  typename ThreadMap_,
+  bool IsDeconv_ = false
+>
+class Conv3dFpropFilterTileAccessIteratorAnalytic {
+public:
+  
+  //
+  // Types
+  //
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::TensorNDHWC;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static bool const IsDeconv = IsDeconv_;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
+  static int const kConvDim = 3;
+  using ConvProblemSize = typename conv::Conv3dProblemSize;
+  static int const kAccessesPerVector = 1;
+  
+  //
+  // Simplifying assertions
+  //
+  static_assert(ThreadMap::Iterations::kContiguous == 1,
+    "Require Iterations::kContiguous == 1");
+
+  //
+  // Parameters structure
+  //
+
+  using Params = Conv3dAnalyticParams<Layout>;
+
+private:
+
+  Params const &params_;
+  ConvProblemSize const &problem_size_;
+  LongIndex iteration_contiguous_;
+  LongIndex iteration_strided_;
+  char const *pointer_;
+
+  int filter_t_;
+  int filter_r_;
+  int filter_s_;
+  int filter_c_;
+
+  int offset_k_[ThreadMap::Iterations::kStrided];
+
+public:
+
+  CUTLASS_HOST_DEVICE
+  Conv3dFpropFilterTileAccessIteratorAnalytic(
+    Params const &params, 
+    ConvProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    MatrixCoord const &threadblock_offset = MatrixCoord()
+  ):
+    params_(params), 
+    problem_size_(problem_size), 
+    pointer_(reinterpret_cast<char const *>(ptr)),
+    filter_t_(0),
+    filter_r_(0),
+    filter_s_(0),
+    filter_c_(0) {
+
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+
+    filter_c_ = threadblock_offset.row() + thread_coord.contiguous();
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      offset_k_[s] = threadblock_offset.column() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
+    }
+
+    set_iteration_index(0);
+  }
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element (converted to bytes, because pointer_ is a byte pointer)
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+
+  CUTLASS_HOST_DEVICE
+  void advance() {
+    // moves to the next tile
+    ++filter_s_;
+    if (filter_s_ < problem_size_.S) {
+      return;
+    }
+    filter_s_ = 0;
+    
+    ++filter_r_;
+    if (filter_r_ < problem_size_.R) {
+      return;
+    }
+    filter_r_ = 0;
+
+    ++filter_t_;
+    if (filter_t_ < problem_size_.T) {
+      return;
+    }
+    filter_t_ = 0;
+
+    filter_c_ += Shape::kRow * problem_size_.split_k_slices;
+  }
+
+  /// Returns the coordinate in the filter tensor W that is currently pointed to
+  /// by the iterator.
+  CUTLASS_HOST_DEVICE
+  TensorCoord at() const {
+
+    int k = offset_k_[iteration_strided_];
+
+    return TensorCoord(k, filter_t_, filter_r_, filter_s_, filter_c_);
+  }
+
+  /// Returns true if the current coordinate is within the filter tensor W
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+
+    TensorCoord coord = at();
+    // For deconvolution the roles of K and C are swapped.
+    auto input_channels = (IsDeconv ? problem_size_.K : problem_size_.C);
+    auto output_channels = (IsDeconv ? problem_size_.C : problem_size_.K);
+
+    return coord.n() < output_channels &&
+      coord.c() < input_channels;
+  }
+
+  /// Returns a pointer to the vector starting at the current coordinate
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+
+    TensorCoord coord = at();
+    LongIndex offset = params_.layout(coord);
+    
+    return reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
+  }
+
+  /// Increments to the next memory access
+  CUTLASS_HOST_DEVICE
+  Conv3dFpropFilterTileAccessIteratorAnalytic &operator++() {
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;
+    
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    iteration_strided_ = 0;
+ 
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(ConvProblemSize const &problem_size) {
+    // Input channels are K for deconvolution and C otherwise.
+    auto input_channels = (IsDeconv ? problem_size.K : problem_size.C);
+    // check alignment constraint on iterator's contiguous dimension
+    if (input_channels % AccessType::kElements) {
+      return Status::kErrorInvalidProblem;
+    }
+    return Status::kSuccess;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h
new file mode 100755
index 000000000..efe34497f
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h
@@ -0,0 +1,279 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) 
+    matrix from memory.
+
+    This iterator assumes TensorNDHWC layout of tensors in Global Memory.
+    
+    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
+    backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv3d_problem_size.h"
+
+#include "cutlass/conv/threadblock/conv3d_params.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Shape_,
+  typename Element_,
+  typename Layout_,
+  typename ThreadMap_,
+  bool IsDeconv_ = false
+>
+class Conv3dFpropFilterTileAccessIteratorOptimized{
+public:
+  // Tile access iterator for the Conv3d Fprop filter operand (kOptimized iterator algorithm).
+  //
+  // Types
+  //
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = Layout_;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static bool const IsDeconv = IsDeconv_;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
+  static int const kConvDim = 3;
+  using ConvProblemSize = typename conv::Conv3dProblemSize;
+  static int const kAccessesPerVector = 1;
+  
+  //
+  // Simplifying assertions
+  //
+  static_assert(ThreadMap::Iterations::kContiguous == 1,
+    "Require Iterations::kContiguous == 1");
+
+  //
+  // Parameters structure
+  //
+
+  struct Params : Conv3dFpropFilterIteratorOptimizedParams<Layout> {
+    /// Default constructor with an empty body
+    CUTLASS_HOST_DEVICE
+    Params() { }
+    /// Wraps an already-constructed base parameters object
+    CUTLASS_HOST_DEVICE
+    Params(Conv3dFpropFilterIteratorOptimizedParams<Layout> const &base): 
+      Conv3dFpropFilterIteratorOptimizedParams<Layout>(base) { }
+    /// Builds the base parameters from the problem size, the layout and the ThreadMap constants
+    CUTLASS_HOST_DEVICE
+    Params(
+      Conv3dProblemSize const &problem_size,
+      Layout const &layout
+    ):
+      Conv3dFpropFilterIteratorOptimizedParams<Layout>(
+        problem_size,
+        layout,
+        sizeof_bits<Element>::value,
+        {Shape::kRow, Shape::kColumn},
+        ThreadMap::kThreads,
+        ThreadMap::kElementsPerAccess,
+        {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided},
+        {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided}
+      ) {
+
+    }
+  };
+
+private:
+
+  Conv3dFpropFilterIteratorOptimizedParams<Layout> const &params_;  // precomputed increments (held by reference)
+  Conv3dProblemSize const &problem_size_;                           // problem extents (held by reference)
+  LongIndex iteration_contiguous_;                                  // current contiguous access index
+  LongIndex iteration_strided_;                                     // current strided access index; selects the predicate bit
+  char const *pointer_;                                             // byte pointer to the current access
+
+  uint32_t predicates_;   // bit s is set when strided access s is in bounds
+  int filter_trs_;        // position counter that wraps at params_.TRS
+  int filter_c_;          // channel coordinate compared against C (K when IsDeconv)
+
+  //
+  // Assertions
+  //
+
+  // We map predicates into bits packed in this uint32_t container
+  static_assert(ThreadMap::Iterations::kStrided < sizeof(predicates_) * 8,
+    "Currently, the number of loads per iteration is limited by the size of the predicates container.");
+
+public:
+  /// Computes the per-access predicates and positions pointer_ at this thread's first access
+  CUTLASS_HOST_DEVICE
+  Conv3dFpropFilterTileAccessIteratorOptimized(
+    Conv3dFpropFilterIteratorOptimizedParams<Layout> const &params,
+    Conv3dProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    MatrixCoord const &threadblock_offset = MatrixCoord()
+  ):
+    params_(params), 
+    problem_size_(problem_size),
+    pointer_(reinterpret_cast<char const *>(ptr)),
+    predicates_{0},
+    filter_trs_(0),
+    filter_c_(0) {
+    // Offset of this thread within the threadblock tile.
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+
+    filter_c_ = threadblock_offset.row() + thread_coord.contiguous();
+    Index column = threadblock_offset.column() + thread_coord.strided();
+    // One predicate bit per strided access: set when its column is below K (C when IsDeconv).
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      uint32_t pred = ((column + s * ThreadMap::Delta::kStrided < (IsDeconv ? problem_size_.C : problem_size_.K)) ? 1u : 0);
+      predicates_ |= (pred << s);
+    }
+    // A channel coordinate at or beyond C (K when IsDeconv) invalidates every access.
+    if (filter_c_ >= (IsDeconv ? problem_size_.K : problem_size_.C)) {
+      predicates_ = 0u;
+    }
+    // Move the byte pointer to the thread's starting (filter_c_, column) coordinate.
+    pointer_ += (
+      params_.layout({filter_c_, column}) 
+    ) * sizeof_bits<Element>::value / 8;
+
+    set_iteration_index(0);
+  }
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+  /// Steps to the next filter position; after params_.TRS steps, advances filter_c_ by filter_c_delta
+  CUTLASS_HOST_DEVICE
+  void advance() {
+
+    LongIndex next = params_.inc_next_trs;
+
+    // moves to the next tile
+    ++filter_trs_;
+    if (filter_trs_ == params_.TRS) {
+
+      filter_trs_ = 0;
+      next = params_.inc_next_c;
+      filter_c_ += params_.filter_c_delta;
+    }
+      
+    if (filter_c_ >= (IsDeconv ? problem_size_.K : problem_size_.C)) {
+      predicates_ = 0;
+    }
+      
+    pointer_ += next;
+  }
+
+  /// Returns true if the current coordinate is within the filter tensor W
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+    return (predicates_ & (1u << iteration_strided_));
+  }
+
+  /// Returns a pointer to the vector starting at the current coordinate
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+    return reinterpret_cast<AccessType const *>(pointer_);
+  }
+
+  /// Increments to the next memory access
+  CUTLASS_HOST_DEVICE
+  Conv3dFpropFilterTileAccessIteratorOptimized &operator++() {
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;
+    
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+
+      // Move to the next K coordinate within the tile
+      pointer_ += params_.inc_next_k;
+
+      return *this;
+    }
+    iteration_strided_ = 0;
+ 
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv3dProblemSize const &problem_size) {
+    auto input_channels = (IsDeconv ? problem_size.K : problem_size.C);
+
+    // check alignment constraint on iterator's contiguous dimension
+    if (input_channels % AccessType::kElements) {
+      return Status::kErrorInvalidProblem;
+    }
+    return Status::kSuccess;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_params.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_params.h
new file mode 100755
index 000000000..ac422b8f0
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_params.h
@@ -0,0 +1,508 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! 
+  \file 
+  \brief Extracts the host-params objects into non-template code.
+*/
+
+#pragma once
+
+#define TRACE_CONV_PARAMS_INITIALIZERS_ENABLED 0
+
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/threadblock/conv2d_params.h"
+#include "cutlass/conv/conv3d_problem_size.h"
+
+#if TRACE_CONV_PARAMS_INITIALIZERS_ENABLED
+#include <fstream>
+#endif
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Parameters shared by the Conv3d analytic tile iterators; holds only the tensor layout
+template< typename Layout_ = layout::TensorNDHWC >
+struct Conv3dAnalyticParams {
+
+  using Layout = Layout_;
+
+  /// Layout object stored for use by the iterators
+  Layout layout;
+  //
+  // Methods
+  //
+
+  /// Default constructor with an empty body
+  CUTLASS_HOST_DEVICE
+  Conv3dAnalyticParams() { }
+  /// Stores the layout; the problem size argument is accepted but ignored
+  CUTLASS_HOST_DEVICE
+  Conv3dAnalyticParams(
+    Conv3dProblemSize const &,  // ignored; keeps the signature aligned with other Params interfaces.
+    Layout const &layout_
+  ):
+    layout(layout_) { }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Parameters structure used for Conv3dFpropActivationTileIteratorOptimized
+template< typename Layout_ = layout::TensorNDHWC >
+struct Conv3dFpropActivationIteratorOptimizedParams;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Parameters structure used for Conv3dFpropActivationTileIteratorOptimized
+template<>
+struct Conv3dFpropActivationIteratorOptimizedParams<layout::TensorNDHWC> {
+  
+  using Layout = layout::TensorNDHWC;
+
+  Layout layout;
+
+  int64_t inc_next[4];    // {next S, next R, next T, next C}, each in units of bytes
+  int filter_c_delta;     // number of logical elements to add to filter_c_
+  int ZPQ;                // product of Z*P*Q
+  int PQ;                 // product of P*Q
+
+  FastDivmod zpq_divmod;  // fast div/mod by ZPQ
+  FastDivmod pq_divmod;   // fast div/mod by PQ
+  FastDivmod q_divmod;    // fast div/mod by Q
+
+  //
+  // Methods
+  //
+
+  CUTLASS_HOST_DEVICE
+  Conv3dFpropActivationIteratorOptimizedParams() { }
+
+  CUTLASS_HOST_DEVICE
+  Conv3dFpropActivationIteratorOptimizedParams(
+    Conv3dProblemSize const &problem_size,
+    Layout const &layout,                             ///< layout object
+    int element_size_bits,                            ///< size of each element in bits
+    MatrixCoord threadblock_shape,
+    int thread_count,
+    int access_size,
+    layout::PitchLinearCoord threadmap_iterations,
+    layout::PitchLinearCoord threadmap_delta
+  ): 
+    layout(layout), 
+    ZPQ(problem_size.Z * problem_size.P * problem_size.Q),
+    PQ(problem_size.P * problem_size.Q),
+    zpq_divmod(ZPQ),
+    pq_divmod(PQ), 
+    q_divmod(problem_size.Q) {
+
+    TRACE_CONV_INITIALIZERS("conv3d_fprop", "activation", 
+      element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);
+    // The initializers above are listed in member declaration order (ZPQ before PQ).
+    // conv_sign is -1 for Mode::kConvolution and +1 otherwise; it scales the S/R/T increments.
+    int conv_sign = (problem_size.mode == Mode::kConvolution ? -1 : 1);
+
+    // next S
+    inc_next[0] = conv_sign * (
+      int64_t(layout.stride()[0]) * problem_size.dilation_w
+    ) * element_size_bits / 8;
+    // Subtracted terms are widened to 64 bits before multiplying, matching the "next C" terms.
+    // next R
+    inc_next[1] = conv_sign * (
+        int64_t(layout.stride()[1]) * problem_size.dilation_h
+        - int64_t(problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w
+      ) * element_size_bits / 8;
+
+    // next T
+    inc_next[2] = conv_sign * (
+      int64_t(layout.stride()[2]) * problem_size.dilation_d
+      - int64_t(problem_size.R - 1) * layout.stride()[1] * problem_size.dilation_h
+      - int64_t(problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w
+      ) * element_size_bits / 8;
+
+    // next C
+    inc_next[3] = (
+        threadblock_shape.column() * problem_size.split_k_slices
+        - conv_sign * int64_t(problem_size.T - 1) * layout.stride()[2] * problem_size.dilation_d
+        - conv_sign * int64_t(problem_size.R - 1) * layout.stride()[1] * problem_size.dilation_h
+        - conv_sign * int64_t(problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w
+      ) * element_size_bits / 8;
+
+    // logical offset added to internal channel counter - units are elements, not bytes
+    filter_c_delta = threadblock_shape.column() * problem_size.split_k_slices;
+  }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+template< typename Layout_ = layout::TensorNDHWC >
+struct Conv3dFpropFilterIteratorOptimizedParams;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<>
+struct Conv3dFpropFilterIteratorOptimizedParams<layout::TensorNDHWC>
+{
+
+  using Layout = layout::TensorNDHWC;
+
+  Layout layout;
+  int TRS;                    // product of T*R*S
+  int filter_c_delta;         // threadblock rows times split-K slices, in elements
+
+  int64_t inc_next_k;         // byte offset to the next K position
+  int64_t inc_next_trs;       // byte offset to the next TRS position
+  int64_t inc_next_c;         // byte offset to the next C position
+
+  //
+  // Methods
+  //
+  CUTLASS_HOST_DEVICE
+  Conv3dFpropFilterIteratorOptimizedParams() { }
+
+  CUTLASS_HOST_DEVICE
+  Conv3dFpropFilterIteratorOptimizedParams(
+    Conv3dProblemSize const &problem_size,
+    Layout const &layout,
+    int element_size_bits,                        ///< size of each element in bits
+    MatrixCoord threadblock_shape,
+    int thread_count,
+    int access_size,
+    layout::PitchLinearCoord threadmap_iterations,
+    layout::PitchLinearCoord threadmap_delta
+  ):
+    layout(layout) {
+
+    TRACE_CONV_INITIALIZERS("conv3d_fprop", "filter",
+      element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);
+
+    // 64-bit copies of the two layout strides used below.
+    int64_t const stride_trs = layout.stride()[0];
+    int64_t const stride_k = layout.stride()[3];
+
+    // (strided iterations - 1) * strided delta, as a 64-bit multiplier for stride_k.
+    int64_t const strided_span = int64_t(threadmap_iterations.strided() - 1) * threadmap_delta.strided();
+
+    TRS = problem_size.T * problem_size.R * problem_size.S;
+    filter_c_delta = threadblock_shape.row() * problem_size.split_k_slices;
+
+    // Element offsets are scaled by element_size_bits / 8 to yield bytes.
+    inc_next_k = (stride_k * threadmap_delta.strided() * element_size_bits) / 8;
+    inc_next_trs = (stride_trs - stride_k * strided_span) * element_size_bits / 8;
+
+    inc_next_c =
+      (int64_t(filter_c_delta) - int64_t(TRS - 1) * stride_trs - strided_span * stride_k
+      ) * element_size_bits / 8;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Parameters object for Conv3d DGRAD OutputGradient (dy) iterator
+struct Conv3dDgradOutputGradientIteratorOptimizedParams {
+
+  using Layout = layout::TensorNDHWC;
+
+  Layout layout;
+
+  int64_t inc_next[4];    // {next S, next R, next T, next K}, each in units of bytes
+  int filter_k_delta;     // number of logical elements to add to filter_k_
+
+  FastDivmod dhw_divmod;  // fast div/mod by D*H*W
+  FastDivmod hw_divmod;   // fast div/mod by H*W
+  FastDivmod w_divmod;    // fast div/mod by W
+
+  //
+  // Methods
+  //
+
+  CUTLASS_HOST_DEVICE
+  Conv3dDgradOutputGradientIteratorOptimizedParams() { }
+
+  CUTLASS_HOST_DEVICE
+  Conv3dDgradOutputGradientIteratorOptimizedParams(
+    Conv3dProblemSize const &problem_size,
+    Layout const &layout,                             ///< layout object
+    int element_size_bits,                            ///< size of each element in bits
+    MatrixCoord threadblock_shape,
+    int thread_count,
+    int access_size,
+    layout::PitchLinearCoord threadmap_iterations,
+    layout::PitchLinearCoord threadmap_delta
+  ): 
+    layout(layout), 
+    dhw_divmod(problem_size.D * problem_size.H * problem_size.W),
+    hw_divmod(problem_size.H * problem_size.W), 
+    w_divmod(problem_size.W) {
+
+    TRACE_CONV_INITIALIZERS("conv3d_dgrad", "output_gradient", 
+      element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);
+    // conv_sign is +1 for Mode::kConvolution and -1 otherwise; it scales the S/R/T increments.
+    int conv_sign = (problem_size.mode == Mode::kConvolution ? 1 : -1);
+
+    // next S
+    inc_next[0] = conv_sign * (
+      int64_t(layout.stride()[0]) * problem_size.dilation_w
+    ) * element_size_bits / 8;
+    // Subtracted terms are widened to 64 bits before multiplying, matching the "next K" terms.
+    // next R
+    inc_next[1] = conv_sign * (
+        int64_t(layout.stride()[1]) * problem_size.dilation_h
+        - int64_t(problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w
+      ) * element_size_bits / 8;
+
+    // next T
+    inc_next[2] = conv_sign * (
+      int64_t(layout.stride()[2]) * problem_size.dilation_d
+      - int64_t(problem_size.R - 1) * layout.stride()[1] * problem_size.dilation_h
+      - int64_t(problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w
+      ) * element_size_bits / 8;
+
+    // next K
+    inc_next[3] = (
+        threadblock_shape.column() * problem_size.split_k_slices
+        - conv_sign * int64_t(problem_size.T - 1) * layout.stride()[2] * problem_size.dilation_d
+        - conv_sign * int64_t(problem_size.R - 1) * layout.stride()[1] * problem_size.dilation_h
+        - conv_sign * int64_t(problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w
+      ) * element_size_bits / 8;
+
+    // logical offset added to internal channel counter - units are elements, not bytes
+    filter_k_delta = threadblock_shape.column() * problem_size.split_k_slices;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Parameters object for Conv3d DGRAD Filter (w) iterator
+struct Conv3dDgradFilterIteratorOptimizedParams {
+
+  using Layout = layout::TensorNDHWC;
+
+  Layout layout;
+  int TRS;                    // product of T*R*S
+  int filter_k_delta;         // threadblock rows times split-K slices, in elements
+
+  int64_t inc_next_strided;   // byte offset to the next K coordinate within the tile
+  int64_t inc_next_trs;       // byte offset to the next TRS position
+  int64_t inc_next_k;         // byte offset to the next K position in the subsequent tile
+
+  //
+  // Methods
+  //
+  CUTLASS_HOST_DEVICE
+  Conv3dDgradFilterIteratorOptimizedParams() { }
+
+  CUTLASS_HOST_DEVICE
+  Conv3dDgradFilterIteratorOptimizedParams(
+    Conv3dProblemSize const &problem_size,
+    Layout const &layout,
+    int element_size_bits,                        ///< size of each element in bits
+    MatrixCoord threadblock_shape,
+    int thread_count,
+    int access_size,
+    layout::PitchLinearCoord threadmap_iterations,
+    layout::PitchLinearCoord threadmap_delta
+  ):
+    layout(layout), TRS(problem_size.T * problem_size.R * problem_size.S) {
+
+    TRACE_CONV_INITIALIZERS("conv3d_dgrad", "filter",
+      element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);
+
+    // 64-bit copies of the two layout strides used below.
+    int64_t const stride_trs = layout.stride()[0];
+    int64_t const stride_k = layout.stride()[3];
+
+    // (strided iterations - 1) * strided delta, kept in the type the coordinate arithmetic yields.
+    auto const strided_span = (threadmap_iterations.strided() - 1) * threadmap_delta.strided();
+
+    filter_k_delta = threadblock_shape.row() * problem_size.split_k_slices;
+
+    // Element offsets are scaled by element_size_bits / 8 to yield bytes.
+    inc_next_strided = (stride_k * threadmap_delta.strided() * element_size_bits) / 8;
+    inc_next_trs = (stride_trs - strided_span * stride_k) * element_size_bits / 8;
+    inc_next_k =
+      (filter_k_delta * stride_k - (TRS - 1) * stride_trs - strided_span * stride_k
+      ) * element_size_bits / 8;
+  }
+};
+
+/// Parameters object for Conv3d WGRAD OutputGradient iterator
+struct Conv3dWgradOutputGradientIteratorOptimizedParams {
+
+  using Layout = layout::TensorNDHWC;
+  using LongIndex = typename Layout::LongIndex;
+
+  Layout layout;
+
+  int NZPQ;                // precomputed product of N*Z*P*Q for clearing predicates
+  int ZPQ;                 // product of Z*P*Q
+  unsigned zpq_mul;        // precomputed quantities for fast computation of div/% by ZPQ
+  unsigned zpq_shr;        //    in device code.
+
+  int PQ;                  // product of P*Q
+  unsigned pq_mul;         // precomputed quantities for fast computation of div/% by PQ
+  unsigned pq_shr;         //    in device code.
+
+  unsigned q_mul;          // precomputed quantities for fast computation of div/% by Q
+  unsigned q_shr;          //    in device code.
+
+  LongIndex offset_next_strided;     // offset in units of bytes to next nzpq coordinate within tile
+  LongIndex offset_next_contiguous;  // offset in units of bytes to next k coordinate within tile
+  LongIndex inc_next_nzpq;           // offset in units of bytes to next nzpq position in subsequent tile
+
+  //
+  // Methods
+  //
+
+  CUTLASS_HOST_DEVICE
+  Conv3dWgradOutputGradientIteratorOptimizedParams() { }
+
+  CUTLASS_HOST_DEVICE
+  Conv3dWgradOutputGradientIteratorOptimizedParams(
+    Conv3dProblemSize const &problem_size,
+    Layout const &layout,
+    int element_size_bits,
+    MatrixCoord threadblock_shape,
+    int thread_count,
+    int access_size,
+    layout::PitchLinearCoord threadmap_iterations,
+    layout::PitchLinearCoord threadmap_delta
+  ): layout(layout) {
+
+    TRACE_CONV_INITIALIZERS("conv3d_wgrad", "output_gradient",
+      element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);
+
+    // layout.stride()[0] widened to 64 bits; it scales the strided and nzpq offsets below.
+    int64_t const stride_nzpq = layout.stride()[0];
+
+    // Incremental offsets in units of bytes: (number of elements) * element_size_bits / 8
+    offset_next_strided = (threadmap_delta.strided() * stride_nzpq) * element_size_bits / 8;
+
+    offset_next_contiguous = (threadmap_delta.contiguous()) * element_size_bits / 8;
+
+    inc_next_nzpq =
+      (threadblock_shape.column() * problem_size.split_k_slices * stride_nzpq) * element_size_bits / 8;
+
+    // Products of problem extents stored for use by the iterator.
+    NZPQ = problem_size.N * problem_size.Z * problem_size.P * problem_size.Q;
+    ZPQ = problem_size.Z * problem_size.P * problem_size.Q;
+    PQ = problem_size.P * problem_size.Q;
+
+    // Precompute the multiplier/shift pairs for fast division and modulo.
+    find_divisor(zpq_mul, zpq_shr, ZPQ);
+    find_divisor(pq_mul, pq_shr, PQ);
+    find_divisor(q_mul, q_shr, problem_size.Q);
+  }
+};
+
+/// Parameters object for Conv3d WGRAD Activation Tile Access Iterator
+struct Conv3dWgradActivationIteratorOptimizedParams {
+
+  using Layout = layout::TensorNDHWC;
+
+  Layout layout;
+
+  int RSC;                 // product of R*S*C
+  unsigned rsc_mul;        // precomputed quantities for fast computation of div/% by RSC
+  unsigned rsc_shr;        //    in device code.
+
+  int SC;                  // product of S*C
+  unsigned sc_mul;         // precomputed quantities for fast computation of div/% by SC
+  unsigned sc_shr;         //    in device code.
+
+  unsigned c_mul;          // precomputed quantities for fast computation of div/% by C
+  unsigned c_shr;          //    in device code.
+
+  int ZPQ;                 // product of Z*P*Q
+  unsigned zpq_mul;        // precomputed quantities for fast computation of div/% by ZPQ
+  unsigned zpq_shr;        //    in device code.
+
+  int PQ;                  // product of P*Q
+  unsigned pq_mul;         // precomputed quantities for fast computation of div/% by PQ
+  unsigned pq_shr;         //    in device code.
+
+  unsigned q_mul;          // precomputed quantities for fast computation of div/% by Q
+  unsigned q_shr;          //    in device code.
+
+  //
+  // Methods
+  //
+  CUTLASS_HOST_DEVICE
+  Conv3dWgradActivationIteratorOptimizedParams() { }
+
+  CUTLASS_HOST_DEVICE
+  Conv3dWgradActivationIteratorOptimizedParams(
+    Conv3dProblemSize const &problem_size,
+    Layout const &layout,
+    int element_size_bits,
+    MatrixCoord threadblock_shape,
+    int thread_count,
+    int access_size,
+    layout::PitchLinearCoord threadmap_iterations,
+    layout::PitchLinearCoord threadmap_delta
+  ): layout(layout) {
+
+    TRACE_CONV_INITIALIZERS("conv3d_wgrad", "activation",
+      element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);
+
+    // Products of problem extents that serve as divisors.
+    RSC = problem_size.R * problem_size.S * problem_size.C;
+    SC = problem_size.S * problem_size.C;
+    ZPQ = problem_size.Z * problem_size.P * problem_size.Q;
+    PQ = problem_size.P * problem_size.Q;
+
+    // Precompute the multiplier/shift pairs for fast division and modulo:
+    // first for the R*S*C, S*C and C divisors ...
+    find_divisor(rsc_mul, rsc_shr, RSC);
+    find_divisor(sc_mul, sc_shr, SC);
+    find_divisor(c_mul, c_shr, problem_size.C);
+
+    // ... then for the Z*P*Q, P*Q and Q divisors.
+    find_divisor(zpq_mul, zpq_shr, ZPQ);
+    find_divisor(pq_mul, pq_shr, PQ);
+    find_divisor(q_mul, q_shr, problem_size.Q);
+
+  }
+};
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h
new file mode 100755
index 000000000..cc8faea70
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h
@@ -0,0 +1,289 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM B (activation tile) 
+    matrix from memory.
+
+    This iterator assumes TensorNDHWC layout of tensors in Global Memory.
+
+    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
+    backward data gradient (Dgrad), and backward weight gradient (Wgrad).
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv3d_problem_size.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Shape_,     // threadblock tile shape; advance() steps by Shape::kRow
+  typename Element_,   // element type; must be at least 8 bits wide (static_assert below)
+  typename ThreadMap_  // supplies Iterations, Delta, kElementsPerAccess and initial_offset()
+>
+class Conv3dWgradActivationTileAccessIteratorAnalytic {
+public:
+  // Access iterator that turns (nzpq, trsc) offsets into NDHWC activation coordinates with plain div/mod.
+  //
+  // Types
+  //
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::TensorNDHWC;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;  // vector type returned by get()
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
+  static int const kConvDim = 3;
+  using ConvProblemSize = typename conv::Conv3dProblemSize;
+
+  static int const kAccessesPerVector = 1;
+
+  static_assert(sizeof_bits<Element>::value >= 8,
+    "WGRAD requires elements of size 8b or greater.");
+
+  //
+  // Parameters structure
+  //
+
+  struct Params {
+
+    Layout layout;  // maps a TensorCoord to a linear element offset in get()
+
+    //
+    // Methods
+    //
+    CUTLASS_HOST_DEVICE
+    Params() { }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      Conv3dProblemSize const &problem_size,  // not read by this constructor
+      Layout const &layout
+    ): layout(layout) {
+
+    }
+  };
+
+private:
+
+  Params const &params_;                   // reference member: the Params object must outlive the iterator
+  Conv3dProblemSize const &problem_size_;  // reference member: must outlive the iterator as well
+  LongIndex iteration_contiguous_;         // index into the filter_*_ arrays for the current access
+  LongIndex iteration_strided_;            // index into offset_nzpq_ for the current access
+  char const *pointer_;                    // base address kept as bytes; see add_pointer_offset()
+
+  // Filter position (t,r,s,c) in contiguous dimension stays constant for each gemm_iteration_k
+  int filter_t_[ThreadMap::Iterations::kContiguous];
+  int filter_r_[ThreadMap::Iterations::kContiguous];
+  int filter_s_[ThreadMap::Iterations::kContiguous];
+  int filter_c_[ThreadMap::Iterations::kContiguous];
+
+  int offset_nzpq_[ThreadMap::Iterations::kStrided];  // linear n*Z*P*Q offset per strided iteration, decoded in at()
+
+public:
+  /// Binds params/problem_size by reference, zeroes the iteration indices and computes this thread's offsets.
+  CUTLASS_HOST_DEVICE
+  Conv3dWgradActivationTileAccessIteratorAnalytic(
+    Params const &params,
+    Conv3dProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    MatrixCoord const &threadblock_offset = MatrixCoord()
+  ):
+    params_(params), problem_size_(problem_size),
+    iteration_contiguous_(0), iteration_strided_(0),  // defined start: at()/valid()/get() index arrays with these
+    pointer_(reinterpret_cast<char const *>(ptr)) {
+
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+
+    // initialize t,r,s,c filter position for every contiguous iteration
+    CUTLASS_PRAGMA_UNROLL
+    for(int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+
+      int trsc_offset = threadblock_offset.column() + thread_coord.contiguous()
+                        + c * ThreadMap::Delta::kContiguous;
+
+      filter_t_[c] = trsc_offset / (problem_size_.R * problem_size_.S * problem_size_.C);
+      int residual = trsc_offset % (problem_size_.R * problem_size_.S * problem_size_.C);
+
+      filter_r_[c] = residual / (problem_size_.S * problem_size_.C);
+      residual = residual % (problem_size_.S * problem_size_.C);
+
+      filter_s_[c] = residual / problem_size_.C;
+      filter_c_[c] = residual % problem_size_.C;
+
+    }
+
+    // initialize n, z, p, q offset for every strided iteration
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+
+      offset_nzpq_[s] = threadblock_offset.row() + thread_coord.strided()
+                      + s * ThreadMap::Delta::kStrided;
+    }
+  }
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;  // contiguous index varies fastest
+    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;  // element count -> byte count
+  }
+
+  CUTLASS_HOST_DEVICE
+  void advance() {
+    // Only the strided offsets change here; the filter_*_ arrays are left untouched.
+    // moves to the next GEMM-K offset (offset_nzpq_) in GEMM-B by a CTA-K tile
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      offset_nzpq_[s] += Shape::kRow * problem_size_.split_k_slices;
+    }
+  }
+
+  /// Returns the coordinate in the activation tensor x that is currently pointed to
+  /// by the iterator.
+  CUTLASS_HOST_DEVICE
+  TensorCoord at() const {
+
+    int t = filter_t_[iteration_contiguous_];
+    int r = filter_r_[iteration_contiguous_];
+    int s = filter_s_[iteration_contiguous_];
+
+    if (problem_size_.mode == Mode::kConvolution) {  // convolution mode mirrors the filter indices
+      t = (problem_size_.T - 1 - t);
+      r = (problem_size_.R - 1 - r);
+      s = (problem_size_.S - 1 - s);
+    }
+
+    int n = offset_nzpq_[iteration_strided_] / (problem_size_.Z * problem_size_.P * problem_size_.Q);
+    int residual = offset_nzpq_[iteration_strided_] % (problem_size_.Z * problem_size_.P * problem_size_.Q);
+
+    int z = residual / (problem_size_.P * problem_size_.Q);
+    residual = residual % (problem_size_.P * problem_size_.Q);
+
+    int p = residual / problem_size_.Q;
+    int q = residual % problem_size_.Q;
+
+    int d = z * problem_size_.stride_d - problem_size_.pad_d + t * problem_size_.dilation_d;
+    int h = p * problem_size_.stride_h - problem_size_.pad_h + r * problem_size_.dilation_h;
+    int w = q * problem_size_.stride_w - problem_size_.pad_w + s * problem_size_.dilation_w;
+
+    return TensorCoord(n, d, h, w, filter_c_[iteration_contiguous_]);
+  }
+
+  /// Returns true if the current coordinate is within the activation tensor x
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+    TensorCoord coord = at();
+    // d/h/w are checked on both sides (the pad term can make them negative); n and c only from above.
+    return coord.n() < problem_size_.N &&
+      coord.d() >= 0 && coord.d() < problem_size_.D &&
+      coord.h() >= 0 && coord.h() < problem_size_.H &&
+      coord.w() >= 0 && coord.w() < problem_size_.W &&
+      coord.c() < problem_size_.C;
+  }
+
+  /// Returns a pointer to the vector starting at the current coordinate
+  CUTLASS_DEVICE
+  AccessType const *get() const {
+
+    TensorCoord coord = at();
+    LongIndex offset = params_.layout(coord);  // linear offset in elements
+
+    return reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
+  }
+
+  /// Increments to the next memory access
+  CUTLASS_HOST_DEVICE
+  Conv3dWgradActivationTileAccessIteratorAnalytic &operator++() {
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;  // contiguous index wrapped: carry into the strided index
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    iteration_strided_ = 0;  // both indices wrapped: back to the first access
+
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv3dProblemSize const &problem_size) {
+
+    // check alignment constraint on iterator's contiguous dimension
+    if (problem_size.C % AccessType::kElements) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    return Status::kSuccess;
+  }
+
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h
new file mode 100755
index 000000000..2b10d207f
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h
@@ -0,0 +1,319 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM B (activation tile) 
+    matrix from memory.
+
+    This iterator assumes TensorNDHWC layout of tensors in Global Memory.
+
+    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
+    backward data gradient (Dgrad), and backward weight gradient (Wgrad).
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv3d_problem_size.h"
+#include "cutlass/conv/threadblock/conv3d_params.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Shape_,     // threadblock tile shape; advance() steps by Shape::kRow
+  typename Element_,   // element type; must be at least 8 bits wide (static_assert below)
+  typename ThreadMap_  // supplies Iterations, Delta, kThreads, kElementsPerAccess and initial_offset()
+>
+class Conv3dWgradActivationTileAccessIteratorOptimized {
+public:
+  // Same coordinate mapping as the commented div/mod formulas below, done with fast_divmod() and Params constants.
+  //
+  // Types
+  //
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::TensorNDHWC;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;  // vector type returned by get()
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
+  static int const kConvDim = 3;
+  using ConvProblemSize = typename conv::Conv3dProblemSize;
+  static int const kAccessesPerVector = 1;
+  static_assert(sizeof_bits<Element>::value >= 8,
+    "WGRAD requires elements of size 8b or greater.");
+
+  //
+  // Parameters structure
+  //
+
+  struct Params : Conv3dWgradActivationIteratorOptimizedParams {
+    //
+    // Methods
+    //
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    CUTLASS_HOST_DEVICE
+    Params(Conv3dWgradActivationIteratorOptimizedParams const &base)
+          : Conv3dWgradActivationIteratorOptimizedParams(base) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(Conv3dProblemSize const &problem_size, Layout const &layout)
+          : Conv3dWgradActivationIteratorOptimizedParams(
+          problem_size,
+          layout,
+          sizeof_bits<Element>::value,                                            // element_size_bits
+          {Shape::kRow, Shape::kColumn},                                          // threadblock_shape
+          ThreadMap::kThreads,                                                    // thread_count
+          ThreadMap::kElementsPerAccess,                                          // access_size
+          {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided},  // threadmap_iterations
+          {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided}) {}         // threadmap_delta
+  };
+
+private:
+
+  Params const &params_;                   // reference member: the Params object must outlive the iterator
+  Conv3dProblemSize const &problem_size_;  // reference member: must outlive the iterator as well
+  LongIndex iteration_contiguous_;         // index into the per-contiguous arrays for the current access
+  LongIndex iteration_strided_;            // index into offset_nzpq_ for the current access
+  char const *pointer_;                    // base address kept as bytes; see add_pointer_offset()
+
+  // Precomputed effective filter position (t,r,s) in contiguous dimension stays constant for each gemm_iteration_k
+  // required for nzpq -> ndhw translation
+  int precomputed_filter_t_[ThreadMap::Iterations::kContiguous];
+  int precomputed_filter_r_[ThreadMap::Iterations::kContiguous];
+  int precomputed_filter_s_[ThreadMap::Iterations::kContiguous];
+
+  // Channel dimension in contiguous dimension stays constant for each gemm_iteration_k
+  int filter_c_[ThreadMap::Iterations::kContiguous];
+
+  int offset_nzpq_[ThreadMap::Iterations::kStrided];  // linear n*Z*P*Q offset per strided iteration, decoded in at()
+
+public:
+  /// Binds params/problem_size by reference, zeroes the iteration indices and precomputes this thread's offsets.
+  CUTLASS_HOST_DEVICE
+  Conv3dWgradActivationTileAccessIteratorOptimized(
+    Params const &params,
+    Conv3dProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    MatrixCoord const &threadblock_offset = MatrixCoord()
+  ):
+    params_(params), problem_size_(problem_size),
+    iteration_contiguous_(0), iteration_strided_(0),  // defined start: at()/valid()/get() index arrays with these
+    pointer_(reinterpret_cast<char const *>(ptr)) {
+
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+
+    // initialize t,r,s,c filter position for every contiguous iteration
+    CUTLASS_PRAGMA_UNROLL
+    for(int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+
+      int trsc_offset = threadblock_offset.column() + thread_coord.contiguous()
+                        + c * ThreadMap::Delta::kContiguous;
+
+      // The subsequent fast_divmod() operations are equivalent to the following logical computation:
+      //
+      //
+      // filter_t_[c] = trsc_offset / (problem_size_.R * problem_size_.S * problem_size_.C);
+      // int residual = trsc_offset % (problem_size_.R * problem_size_.S * problem_size_.C);
+      //
+      // filter_r_[c] = residual / (problem_size_.S * problem_size_.C);
+      // residual = residual % (problem_size_.S * problem_size_.C);
+      //
+      // filter_s_[c] = residual / problem_size_.C;
+      // filter_c_[c] = residual % problem_size_.C;
+
+      int residual;
+      fast_divmod(precomputed_filter_t_[c], residual, trsc_offset, params_.RSC, params_.rsc_mul, params_.rsc_shr);
+      fast_divmod(precomputed_filter_r_[c], residual, residual, params_.SC, params_.sc_mul, params_.sc_shr);
+      fast_divmod(precomputed_filter_s_[c], filter_c_[c], residual, problem_size_.C, params_.c_mul, params_.c_shr);
+
+      int t = precomputed_filter_t_[c];
+      int r = precomputed_filter_r_[c];
+      int s = precomputed_filter_s_[c];
+
+      if (problem_size_.mode == Mode::kConvolution) {  // convolution mode mirrors the filter indices
+        t = (problem_size_.T - 1 - t);
+        r = (problem_size_.R - 1 - r);
+        s = (problem_size_.S - 1 - s);
+      }
+
+      // effective t,r,s for every contiguous dimension
+      precomputed_filter_t_[c] = - problem_size_.pad_d + t * problem_size_.dilation_d;
+      precomputed_filter_r_[c] = - problem_size_.pad_h + r * problem_size_.dilation_h;
+      precomputed_filter_s_[c] = - problem_size_.pad_w + s * problem_size_.dilation_w;
+
+
+    }
+
+    // initialize n, z, p, q offset for every strided iteration
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+
+      offset_nzpq_[s] = threadblock_offset.row() + thread_coord.strided()
+                      + s * ThreadMap::Delta::kStrided;
+    }
+  }
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;  // contiguous index varies fastest
+    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;  // element count -> byte count
+  }
+
+  CUTLASS_HOST_DEVICE
+  void advance() {
+    // Only the strided offsets change here; the precomputed filter terms are left untouched.
+    // moves to the next GEMM-K offset (offset_nzpq_) in GEMM-B by a CTA-K tile
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      offset_nzpq_[s] += Shape::kRow * problem_size_.split_k_slices;
+    }
+  }
+
+  /// Returns the coordinate in the activation tensor x that is currently pointed to
+  /// by the iterator.
+  /// d/h/w are stride * {z,p,q} plus the pad/dilation term precomputed in the constructor.
+  CUTLASS_HOST_DEVICE
+  TensorCoord at() const {
+
+    // The subsequent fast_divmod() operations are equivalent to the following logical computation:
+    //
+    //
+    // int n = offset_nzpq_[iteration_strided_] / (problem_size_.Z * problem_size_.P * problem_size_.Q);
+    // int residual = offset_nzpq_[iteration_strided_] % (problem_size_.Z * problem_size_.P * problem_size_.Q);
+    //
+    // int z = residual / (problem_size_.P * problem_size_.Q);
+    // residual = residual % (problem_size_.P * problem_size_.Q);
+    //
+    // int p = residual / problem_size_.Q;
+    // int q = residual % problem_size_.Q;
+
+    int residual, n, z, p, q;
+    fast_divmod(n, residual, offset_nzpq_[iteration_strided_], params_.ZPQ, params_.zpq_mul, params_.zpq_shr);
+    fast_divmod(z, residual, residual, params_.PQ, params_.pq_mul, params_.pq_shr);
+    fast_divmod(p, q, residual, problem_size_.Q, params_.q_mul, params_.q_shr);
+
+    int d = z * problem_size_.stride_d + precomputed_filter_t_[iteration_contiguous_];
+    int h = p * problem_size_.stride_h + precomputed_filter_r_[iteration_contiguous_];
+    int w = q * problem_size_.stride_w + precomputed_filter_s_[iteration_contiguous_];
+
+    return TensorCoord(n, d, h, w, filter_c_[iteration_contiguous_]);
+  }
+
+  /// Returns true if the current coordinate is within the activation tensor x
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+    TensorCoord coord = at();
+    // d/h/w are checked on both sides (the pad term can make them negative); n and c only from above.
+    return coord.n() < problem_size_.N &&
+      coord.d() >= 0 && coord.d() < problem_size_.D &&
+      coord.h() >= 0 && coord.h() < problem_size_.H &&
+      coord.w() >= 0 && coord.w() < problem_size_.W &&
+      coord.c() < problem_size_.C;
+  }
+
+  /// Returns a pointer to the vector starting at the current coordinate
+  CUTLASS_DEVICE
+  AccessType const *get() const {
+
+    TensorCoord coord = at();
+    LongIndex offset = params_.layout(coord);  // linear offset in elements
+
+    return reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
+  }
+
+  /// Increments to the next memory access
+  CUTLASS_HOST_DEVICE
+  Conv3dWgradActivationTileAccessIteratorOptimized &operator++() {
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;  // contiguous index wrapped: carry into the strided index
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    iteration_strided_ = 0;  // both indices wrapped: back to the first access
+
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv3dProblemSize const &problem_size) {
+
+    // check alignment constraint on iterator's contiguous dimension
+    if (problem_size.C % AccessType::kElements) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    return Status::kSuccess;
+  }
+
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h
new file mode 100755
index 000000000..be9d4fb7a
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h
@@ -0,0 +1,267 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile) 
+    matrix from memory.
+
+    This iterator assumes TensorNDHWC layout of tensors in Global Memory.
+
+    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
+    backward data gradient (Dgrad), and backward weight gradient (Wgrad).
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv3d_problem_size.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Shape_,     // threadblock tile shape; advance() steps by Shape::kColumn
+  typename Element_,   // element type; must be at least 8 bits wide (static_assert below)
+  typename ThreadMap_  // supplies Iterations, Delta, kElementsPerAccess and initial_offset()
+>
+class Conv3dWgradOutputGradientTileAccessIteratorAnalytic {
+public:
+  // Access iterator that turns (k, nzpq) offsets into NDHWC output-gradient coordinates with plain div/mod.
+  //
+  // Types
+  //
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::TensorNDHWC;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;  // vector type returned by get()
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
+  static int const kConvDim = 3;
+  using ConvProblemSize = typename conv::Conv3dProblemSize;
+  static int const kAccessesPerVector = 1;
+  static_assert(sizeof_bits<Element>::value >= 8,
+    "WGRAD requires elements of size 8b or greater.");
+
+  //
+  // Parameters structure
+  //
+
+  struct Params {
+
+    Layout layout;  // maps a TensorCoord to a linear element offset in get()
+
+    //
+    // Methods
+    //
+
+    CUTLASS_HOST_DEVICE
+    Params() { }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      Conv3dProblemSize const &problem_size,  // not read by this constructor
+      Layout const &layout
+    ): layout(layout) {
+
+    }
+  };
+
+private:
+
+  Params const &params_;                   // reference member: the Params object must outlive the iterator
+  Conv3dProblemSize const &problem_size_;  // reference member: must outlive the iterator as well
+  LongIndex iteration_contiguous_;         // index into filter_k_ for the current access
+  LongIndex iteration_strided_;            // index into offset_nzpq_ for the current access
+  char const *pointer_;                    // base address kept as bytes; see add_pointer_offset()
+
+  int filter_k_[ThreadMap::Iterations::kContiguous];  // k coordinate per contiguous iteration
+
+  int offset_nzpq_[ThreadMap::Iterations::kStrided];  // linear n*Z*P*Q offset per strided iteration, decoded in at()
+
+public:
+  /// Binds params/problem_size by reference, zeroes the iteration indices and computes this thread's offsets.
+  CUTLASS_HOST_DEVICE
+  Conv3dWgradOutputGradientTileAccessIteratorAnalytic(
+    Params const &params,
+    Conv3dProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    MatrixCoord const &threadblock_offset = MatrixCoord()
+  ):
+    params_(params), problem_size_(problem_size),
+    iteration_contiguous_(0), iteration_strided_(0),  // defined start: at()/valid()/get() index arrays with these
+    pointer_(reinterpret_cast<char const *>(ptr)) {
+
+
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+
+    // initialize filter_k for every contiguous iteration
+    CUTLASS_PRAGMA_UNROLL
+    for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+      filter_k_[c] = threadblock_offset.row() + thread_coord.contiguous()
+                        + c * ThreadMap::Delta::kContiguous;
+    }
+
+    // initialize n, z, p, q offset for every strided iteration
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      offset_nzpq_[s] = threadblock_offset.column() + thread_coord.strided()
+                      + s * ThreadMap::Delta::kStrided;
+
+    }
+  }
+
+  CUTLASS_HOST_DEVICE
+  static Params getParams(Conv3dProblemSize const &problem_size, Layout const &layout) {
+    return Params(problem_size, layout);  // thin wrapper around the two-argument Params constructor
+  }
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;  // contiguous index varies fastest
+    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;  // element count -> byte count
+  }
+
+  CUTLASS_HOST_DEVICE
+  void advance() {
+    // moves to the next GEMM-K offset (offset_nzpq_) in GEMM-A by a CTA-K tile
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      offset_nzpq_[s] += Shape::kColumn * problem_size_.split_k_slices;
+    }
+  }
+
+  /// Returns the coordinate in the output gradient tensor Dy that is currently pointed to
+  /// by the iterator.
+  CUTLASS_HOST_DEVICE
+  TensorCoord at() const {
+
+    int nzpq = offset_nzpq_[iteration_strided_];
+
+    int n = nzpq / (problem_size_.Z * problem_size_.P * problem_size_.Q);
+    int residual = nzpq % (problem_size_.Z * problem_size_.P * problem_size_.Q);
+
+    int z = residual / (problem_size_.P * problem_size_.Q);
+    residual = residual % (problem_size_.P * problem_size_.Q);
+
+    int p = residual / problem_size_.Q;
+    int q = residual % problem_size_.Q;
+
+    return TensorCoord(n, z, p, q, filter_k_[iteration_contiguous_]);  // (n, z, p, q, k) stored in the (n, d, h, w, c) slots
+  }
+
+
+  /// Returns true if the current coordinate is within the output gradient tensor Dy
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+    TensorCoord coord = at();
+    // Only upper bounds are tested; d()/h()/w()/c() carry z/p/q/k here.
+    return coord.n() < problem_size_.N &&
+      coord.d() < problem_size_.Z &&
+      coord.h() < problem_size_.P &&
+      coord.w() < problem_size_.Q &&
+      coord.c() < problem_size_.K;
+  }
+
+  /// Returns a pointer to the vector starting at the current coordinate
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+
+    TensorCoord coord = at();
+    LongIndex offset = params_.layout(coord);  // linear offset in elements
+
+    return reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
+  }
+
+  /// Increments to the next memory access
+  CUTLASS_HOST_DEVICE
+  Conv3dWgradOutputGradientTileAccessIteratorAnalytic &operator++() {
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;  // contiguous index wrapped: carry into the strided index
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    iteration_strided_ = 0;  // both indices wrapped: back to the first access
+
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv3dProblemSize const &problem_size) {
+
+    // check alignment constraint on iterator's contiguous dimension
+    if (problem_size.K % AccessType::kElements) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    return Status::kSuccess;
+  }
+
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h
new file mode 100755
index 000000000..0ef145f19
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h
@@ -0,0 +1,310 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile) 
+    matrix from memory.
+
+    This iterator assumes TensorNDHWC layout of tensors in Global Memory.
+
+    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
+    backward data gradient (Dgrad), and backward weight gradient (Wgrad).
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv3d_problem_size.h"
+#include "cutlass/conv/threadblock/conv3d_params.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Shape_,
+  typename Element_,
+  typename ThreadMap_
+>
+class Conv3dWgradOutputGradientTileAccessIteratorOptimized {
+public:
+
+  //
+  // Types
+  //
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::TensorNDHWC;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
+  static int const kConvDim = 3;
+  using ConvProblemSize = typename conv::Conv3dProblemSize;
+  static int const kAccessesPerVector = 1;  
+  static_assert(sizeof_bits<Element>::value >= 8,
+    "WGRAD requires elements of size 8b or greater.");
+
+  //
+  // Parameters structure
+  //
+
+  struct Params : Conv3dWgradOutputGradientIteratorOptimizedParams {
+    //
+    // Methods
+    //
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    CUTLASS_HOST_DEVICE
+    Params(Conv3dWgradOutputGradientIteratorOptimizedParams const &base)
+          : Conv3dWgradOutputGradientIteratorOptimizedParams(base) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(Conv3dProblemSize const &problem_size, Layout const &layout)
+          : Conv3dWgradOutputGradientIteratorOptimizedParams(
+            problem_size,
+            layout,
+            sizeof_bits<Element>::value,
+            {Shape::kRow, Shape::kColumn},
+            ThreadMap::kThreads,
+            ThreadMap::kElementsPerAccess,
+            {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided},
+            {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided}) {}
+    };
+
+private:
+
+  Params const &params_;
+  Conv3dProblemSize const &problem_size_;
+  LongIndex iteration_contiguous_;
+  LongIndex iteration_strided_;
+  char const *pointer_;
+    
+  uint32_t predicates_;
+  int filter_k_;
+  int offset_nzpq_;
+
+public:
+
+  CUTLASS_HOST_DEVICE
+  Conv3dWgradOutputGradientTileAccessIteratorOptimized(
+    Params const &params, 
+    Conv3dProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    MatrixCoord const &threadblock_offset = MatrixCoord()
+  ):
+    params_(params), 
+    problem_size_(problem_size),
+    pointer_(reinterpret_cast<char const *>(ptr)),
+    predicates_(0),
+    filter_k_(0),
+    offset_nzpq_(0) {
+
+
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+
+    filter_k_ = threadblock_offset.row() + thread_coord.contiguous();
+    offset_nzpq_ = threadblock_offset.column() + thread_coord.strided();
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+
+        int filter_k = filter_k_ + c * ThreadMap::Delta::kContiguous;
+        int offset_nzpq = offset_nzpq_ + s * ThreadMap::Delta::kStrided;
+
+        bool predicate = valid_(at_(offset_nzpq, filter_k));
+
+        uint32_t pred = (predicate ? 1u : 0);
+
+        int pred_idx = c + s * ThreadMap::Iterations::kContiguous;
+        
+        predicates_ |= (pred << pred_idx);
+      }
+    }
+
+    // Offset pointer to (iteration_strided_, iteration_contiguous_) = (0, 0) 
+    pointer_ += (
+      offset_nzpq_ * params.layout.stride()[0] + filter_k_
+    ) * sizeof_bits<Element>::value / 8;
+
+    set_iteration_index(0);
+  }
+
+  CUTLASS_HOST_DEVICE
+  static Params getParams(Conv3dProblemSize const &problem_size, Layout const &layout) {
+    return Params(problem_size, layout);
+  }
+
+  /// Overrides the internal iteration index: index is split into a contiguous part (index % kContiguous) and a strided part (index / kContiguous)
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+
+  CUTLASS_HOST_DEVICE
+  void advance() {
+    // moves to the next GEMM-K offset (offset_nzpq_) in GEMM-A by a CTA-K tile, strided by the number of split-K slices
+    offset_nzpq_ += Shape::kColumn * problem_size_.split_k_slices;
+
+    // Clear the predicate bits of every strided iteration whose offset is now at or beyond params_.NZPQ
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      if (offset_nzpq_ + s * ThreadMap::Delta::kStrided >= params_.NZPQ) {
+        uint32_t kClearMask = ((1u << ThreadMap::Iterations::kContiguous) - 1) << (s * ThreadMap::Iterations::kContiguous); 
+        predicates_ = (predicates_ & (~kClearMask));
+      }
+    }
+    pointer_ += params_.inc_next_nzpq; 
+  }
+
+private:
+  /// Returns the coordinate in the output gradient tensor Dy that corresponds to the given
+  /// (offset_nzpq, k) pair; offset_nzpq is decomposed into (n, z, p, q).
+  CUTLASS_HOST_DEVICE
+  TensorCoord at_(int offset_nzpq, int k) const {
+
+    // The subsequent fast_divmod() operations are equivalent to the following logical computation:
+    //
+    //
+    // int nzpq = offset_nzpq;
+    // int n = nzpq / (problem_size_.Z * problem_size_.P * problem_size_.Q);
+    // int residual = nzpq % (problem_size_.Z * problem_size_.P * problem_size_.Q);
+    //
+    // int z = residual / (problem_size_.P * problem_size_.Q);
+    // residual = residual % (problem_size_.P * problem_size_.Q);
+    //
+    // int p = residual / problem_size_.Q;
+    // int q = residual % problem_size_.Q;
+
+    int residual, n, z, p, q;
+    fast_divmod(n, residual, offset_nzpq, params_.ZPQ, params_.zpq_mul, params_.zpq_shr);
+    fast_divmod(z, residual, residual, params_.PQ, params_.pq_mul, params_.pq_shr);
+    fast_divmod(p, q, residual, problem_size_.Q, params_.q_mul, params_.q_shr);
+
+    return TensorCoord(n, z, p, q, k);
+  }
+
+  /// Returns true if the coord is within the output gradient tensor Dy; only n (against N) and the channel slot c() (against K) are compared here
+  CUTLASS_HOST_DEVICE
+  bool valid_(TensorCoord coord) const {
+
+    return coord.n() < problem_size_.N &&
+      coord.c() < problem_size_.K;
+  }
+
+public:
+
+  /// Returns true if the current access is valid, read from the predicates_ bit at index (iteration_contiguous_ + iteration_strided_ * kContiguous)
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+
+    LongIndex pred_idx = iteration_contiguous_ + iteration_strided_ * ThreadMap::Iterations::kContiguous;
+    return (predicates_ & (1u << pred_idx));
+  }
+
+  /// Returns a pointer to the vector for the current iteration: the char pointer_ plus the per-iteration strided and contiguous offsets held in params_
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+    
+    return reinterpret_cast<AccessType const *>(
+      pointer_ +
+      iteration_strided_ * params_.offset_next_strided + 
+      iteration_contiguous_ * params_.offset_next_contiguous
+    );
+
+  }
+
+  /// Increments to the next memory access: the contiguous index advances first and carries into the strided index; both wrap to zero after the last access
+  CUTLASS_HOST_DEVICE
+  Conv3dWgradOutputGradientTileAccessIteratorOptimized &operator++() {
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    iteration_strided_ = 0;
+ 
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem: kErrorInvalidProblem unless K is a multiple of AccessType::kElements, else kSuccess.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv3dProblemSize const &problem_size) {
+
+    // check alignment constraint on iterator's contiguous dimension: K must hold a whole number of access vectors
+    if (problem_size.K % AccessType::kElements) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    return Status::kSuccess;
+  }
+
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_direct_conv_params.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_direct_conv_params.h
new file mode 100755
index 000000000..802318349
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_direct_conv_params.h
@@ -0,0 +1,230 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! 
+  \file 
+  \brief Extracts the host-params objects into non-template code.
+*/
+
+#pragma once
+
+#define TRACE_CONV_PARAMS_INITIALIZERS_ENABLED 0
+
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+
+#if TRACE_CONV_PARAMS_INITIALIZERS_ENABLED
+#include <fstream>
+#endif
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Parameters structure used for DepthwiseFpropActivationDirect2dConvTileAccessIteratorOptimized
+template<typename Layout_ = layout::TensorNHWC >
+struct Depthwise2dFpropDirectConvParams;
+
+/// Parameters structure used for DepthwiseFpropActivationDirect2dConvTileAccessIteratorFixedStrideDilation
+template<typename Layout_ = layout::TensorNHWC >
+struct Depthwise2dFpropDirectConvActivationIteratorFixedStrideDilationParams;
+
+/// Parameters structure used for DepthwiseFpropFilterDirectConvTileAccessIteratorOptimized
+template<typename Layout_ = layout::TensorNHWC >
+struct Depthwise2dFpropDirectConvFilterIteratorParams;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Parameters structure used for DepthwiseFpropActivationDirect2dConvTileAccessIteratorOptimized
+template<>
+struct Depthwise2dFpropDirectConvParams<layout::TensorNHWC> {
+  
+  using Layout = layout::TensorNHWC;
+
+  Layout layout;
+
+  int32_t activation_tile_h;
+  int32_t activation_tile_w;
+  int32_t activation_tile_hw;
+  FastDivmod activation_tile_w_divmod;
+  
+  int filter[2];
+  int stride[2];
+  int dilation[2];
+  int inc_next[2];
+  FastDivmod pq_divmod;
+  FastDivmod q_divmod;
+
+  int activation_load_count;
+  int activation_storage_elements;
+  int activation_size;
+  //
+  // Methods
+  //
+
+  CUTLASS_HOST_DEVICE
+  Depthwise2dFpropDirectConvParams() { }
+
+  CUTLASS_HOST_DEVICE
+  Depthwise2dFpropDirectConvParams(
+      Conv2dProblemSize const &problem_size,
+      Layout const &layout,             ///< layout object
+      MatrixCoord threadblock_shape,    ///< CTA threadblock Shape (not read by this constructor)
+      Layout::TensorCoord threadblock_output_shape,  ///< Output tile Shape per threadblock
+      const int element_size_bits,      ///< bits of activation element
+      const int thread_count,           ///< threads per threadblock
+      const int thread_count_contiguous, ///< number of threads for contiguous dimension
+      const int element_per_load)       ///< elements per load
+      : layout(layout) {
+          
+    filter[0] = problem_size.S;
+    filter[1] = problem_size.R;
+    
+    stride[0] =  problem_size.stride_w;
+    stride[1] =  problem_size.stride_h;
+
+    dilation[0] = problem_size.dilation_w;
+    dilation[1] = problem_size.dilation_h;
+
+    // Compute activation_tile size per threadblock at run time because stride and dilation are runtime params.
+    activation_tile_h = (threadblock_output_shape.h() - 1) * problem_size.stride_h +
+                        (problem_size.R - 1) * problem_size.dilation_h + 1;
+    activation_tile_w = (threadblock_output_shape.w() - 1) * problem_size.stride_w +
+                        (problem_size.S - 1) * problem_size.dilation_w + 1;
+    activation_tile_hw = activation_tile_h * activation_tile_w;
+
+    activation_tile_w_divmod = FastDivmod(activation_tile_w);
+
+    /// The values below cannot be compile-time constants because the stride and dilation are runtime params
+    activation_load_count = (thread_count_contiguous * activation_tile_hw + (thread_count - 1)) / thread_count;
+    activation_storage_elements = activation_load_count * element_per_load * thread_count;
+    activation_size =  activation_storage_elements * element_size_bits / 8;
+
+    // FastDivmod over the output tile counts along P and Q (each extent ceil-divided by the tile's h / w)
+    int tiles_p =
+        (problem_size.P + (threadblock_output_shape.h() - 1)) / (threadblock_output_shape.h());
+    int tiles_q = (problem_size.Q + (threadblock_output_shape.w() - 1)) /
+                  (threadblock_output_shape.w());
+
+    pq_divmod = FastDivmod(tiles_p * tiles_q);
+    q_divmod = FastDivmod(tiles_q);
+
+    // increment to the next S: one dilation step along w
+    inc_next[0] = problem_size.dilation_w;
+    // increment to the next R: dilation_h activation-tile rows forward, minus the (S - 1) dilated w steps already taken
+    inc_next[1] = (activation_tile_w * problem_size.dilation_h - (problem_size.S - 1) * problem_size.dilation_w);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Parameters structure used for DepthwiseFpropActivationDirect2dConvTileAccessIteratorFixedStrideDilation
+template <>
+struct Depthwise2dFpropDirectConvActivationIteratorFixedStrideDilationParams<layout::TensorNHWC> {
+  using Layout = layout::TensorNHWC;
+
+  Layout layout;
+
+  FastDivmod pq_divmod;
+  FastDivmod q_divmod;
+
+  int activation_size;
+
+  //
+  // Methods
+  //
+
+  CUTLASS_HOST_DEVICE
+  Depthwise2dFpropDirectConvActivationIteratorFixedStrideDilationParams() {}
+
+  CUTLASS_HOST_DEVICE
+  Depthwise2dFpropDirectConvActivationIteratorFixedStrideDilationParams(
+      Conv2dProblemSize const &problem_size,
+      Layout const &layout,                          ///< Layout object
+      MatrixCoord threadblock_shape,                 ///< Threadblock Shape (not read by this constructor)
+      Layout::TensorCoord threadblock_output_shape,  ///< Output tile Shape per threadblock
+      const int activation_size_                     ///< Activation size loaded by iterator
+      )
+      : layout(layout),
+        activation_size(activation_size_) {
+    // FastDivmod over the output tile counts along P and Q (each extent ceil-divided by the tile's h / w)
+    int tiles_p =
+        (problem_size.P + (threadblock_output_shape.h() - 1)) / (threadblock_output_shape.h());
+    int tiles_q =
+        (problem_size.Q + (threadblock_output_shape.w() - 1)) / (threadblock_output_shape.w());
+
+    pq_divmod = FastDivmod(tiles_p * tiles_q);
+    q_divmod = FastDivmod(tiles_q);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Parameters structure used for DepthwiseFpropFilterDirectConvTileAccessIteratorOptimized: holds the layout, the filter size, and whether the problem mode is kConvolution
+template <>
+struct Depthwise2dFpropDirectConvFilterIteratorParams<layout::TensorNHWC> {
+  using Layout = layout::TensorNHWC;
+
+  Layout layout;
+
+  int filter_size;
+
+  bool is_convolution;
+  //
+  // Methods
+  //
+
+  CUTLASS_HOST_DEVICE
+  Depthwise2dFpropDirectConvFilterIteratorParams() {}
+
+  CUTLASS_HOST_DEVICE
+  Depthwise2dFpropDirectConvFilterIteratorParams(
+      Conv2dProblemSize const &problem_size,
+      Layout const &layout,           ///< Layout object
+      MatrixCoord threadblock_shape,  ///< Threadblock Shape (not read by this constructor)
+      const int filter_size_)         ///< Filter size loaded by iterator
+      : layout(layout),
+        filter_size(filter_size_),
+        is_convolution(problem_size.mode == Mode::kConvolution){}
+};
+
+}  // namespace threadblock
+}  // namespace conv
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_fixed_stride_dilation.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_fixed_stride_dilation.h
new file mode 100755
index 000000000..192d96105
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_fixed_stride_dilation.h
@@ -0,0 +1,314 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM A (activation tile)
+    matrix from memory.
+
+    This iterator assumes TensorNHWC layout of tensors in Global Memory.
+*/
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/threadblock/depthwise_direct_conv_params.h"
+#include "cutlass/coord.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Shape_,
+          typename OutputTileShape_,
+          typename StrideShape_,
+          typename DilationShape_,
+          typename ActivationShape_,
+          typename Element_,
+          typename Layout_,
+          typename ThreadMap_,
+          typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess> >
+class DepthwiseFpropActivationDirect2dConvTileAccessIteratorFixedStrideDilation {
+ public:
+  //
+  // Types
+  //
+
+  using Shape = Shape_;
+  using OutputTileShape = OutputTileShape_;
+  using Element = Element_;
+  using Layout = Layout_;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
+  static int const kConvDim = 2;
+  using ConvProblemSize = typename conv::Conv2dProblemSize;
+
+  // Compile-time values of stride, dilation and activation shape
+  using StrideShape = StrideShape_;
+  using DilationShape = DilationShape_;
+  using ActivationShape = ActivationShape_;
+
+
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+  static int const kActivationSize = ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess * ThreadMap::kThreads *
+           sizeof_bits<Element>::value / 8;
+
+
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
+                "Vectors implied by the thread map must be divisible by the access type.");
+
+  //
+  // Simplifying assertions
+  //
+  static_assert(ThreadMap::Iterations::kContiguous == 1, "Require Iterations::kContiguous == 1");
+  
+  static_assert(OutputTileShape::kN == 1, "Require OutputTileShape::kN == 1");
+  static_assert(OutputTileShape::kC == Shape::kColumn, "Require OutputTile shape == channels per threadblock");
+
+  //
+  // Parameters structure
+  //
+
+  using Params = Depthwise2dFpropDirectConvActivationIteratorFixedStrideDilationParams<Layout>;
+
+ private:
+  Conv2dProblemSize const &problem_size_;
+  Params const &params_;
+  char const *pointer_;
+
+  // Base channels for current threadblock
+  int base_c_;
+  // Base activation index for current threadblock
+  int offset_intial_npq_;
+  // Base activation coord for current threadblock
+  TensorCoord activatioin_base_;
+  // Initial thread position
+  int offset_initial_hwc_;
+  // Overall load instruction per thread.
+  int iterator_load_;
+  // thread loading position.
+  int iterator_hwc_;
+  // activation N is inside the Tensor or not
+  bool valid_n_;
+
+ public:
+
+  /// Binds params and problem size, records this threadblock's npq offset (row) and channel offset (column), and positions the iterator at index 0.
+  CUTLASS_HOST_DEVICE
+  DepthwiseFpropActivationDirect2dConvTileAccessIteratorFixedStrideDilation(
+      Params const &params,
+      Conv2dProblemSize const &problem_size,
+      Element const *ptr,
+      int thread_idx,
+      MatrixCoord const &threadblock_offset =
+          MatrixCoord()
+      )
+      : problem_size_(problem_size),  // initializers listed in member declaration order (avoids -Wreorder)
+        params_(params),
+        pointer_(reinterpret_cast<char const *>(ptr)),
+        offset_intial_npq_(threadblock_offset.row()),
+        offset_initial_hwc_(thread_idx),
+        iterator_load_(0) {
+    // base_c_ must be assigned before set_activation_coord(), which reads it to build the base coordinate.
+    base_c_ = threadblock_offset.column();
+
+    set_iteration_index(0);
+
+    set_activation_coord(offset_intial_npq_);
+
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_activation_coord(int offset_npq) {
+    int offset_inital_n, offset_inital_p, offset_inital_q;
+    int residual;
+
+    params_.pq_divmod(offset_inital_n, residual, offset_npq);
+    params_.q_divmod(offset_inital_p, offset_inital_q, residual);
+
+    int base_n = offset_inital_n;
+
+    int base_h =
+        offset_inital_p * OutputTileShape::kH * StrideShape::kRow - problem_size_.pad_h;
+
+    int base_w =
+        offset_inital_q * OutputTileShape::kW * StrideShape::kColumn - problem_size_.pad_w;
+
+    activatioin_base_ = TensorCoord(base_n, base_h, base_w, base_c_);
+
+    valid_n_ = activatioin_base_.n() < problem_size_.N;
+  }
+
+  CUTLASS_HOST_DEVICE
+  static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) {
+    return Params(
+        problem_size,
+        layout,
+        {Shape::kRow, Shape::kColumn},
+        {OutputTileShape::kN, OutputTileShape::kH, OutputTileShape::kW, OutputTileShape::kC},
+        kActivationSize);
+  }
+
+  /// Overrides the internal iteration index: load number `index` of this thread sits at offset_initial_hwc_ + index * ThreadMap::kThreads
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iterator_hwc_ = offset_initial_hwc_ + index * ThreadMap::kThreads;
+    iterator_load_ = index;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+
+  CUTLASS_HOST_DEVICE
+  void advance() {
+    // Go to the next tile: step the npq tile index by split_k_slices, then restart iteration and recompute the base coordinate
+    offset_intial_npq_ += problem_size_.split_k_slices;
+
+    set_iteration_index(0);
+
+    set_activation_coord(offset_intial_npq_);
+  }
+
+  /// Returns the coordinate in the activations tensor X that is currently pointed to by the iterator:
+  /// iterator_hwc_ is unpacked into (h, w, channel vector) and added to the threadblock's base coordinate.
+  CUTLASS_HOST_DEVICE
+  TensorCoord at() const {
+    int c = iterator_hwc_ %  ThreadMap::Detail::ShapeVec::kContiguous ;
+    int next = iterator_hwc_ /  ThreadMap::Detail::ShapeVec::kContiguous ;
+    int h = next / ActivationShape::kW;
+    int w = next % ActivationShape::kW;
+
+    c = c * AccessType::kElements;
+
+    return activatioin_base_ + TensorCoord(0, h, w, c);
+  }
+
+  /// Returns true if the current coordinate is within the activations tensor X: requires valid_n_ plus c < C, 0 <= h < H and 0 <= w < W
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+    TensorCoord coord = at();
+    bool valid_c = coord.c() < problem_size_.C;
+    bool valid_h = coord.h() >= 0 && coord.h() < problem_size_.H;
+    bool valid_w = coord.w() >= 0 && coord.w() < problem_size_.W;
+    return valid_n_ ? valid_c & valid_h & valid_w : 0;
+  }
+
+  /// Returns a pointer to the vector starting at the current coordinate
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+    TensorCoord coord = at();
+    LongIndex offset = params_.layout(coord);
+
+    AccessType const *ptr =
+        reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
+
+    return ptr;
+  }
+
+  /// Increments to the next memory access (ThreadMap::kThreads further on); after ThreadMap::Iterations::kCount loads it wraps back to the thread's initial position
+  CUTLASS_HOST_DEVICE
+  DepthwiseFpropActivationDirect2dConvTileAccessIteratorFixedStrideDilation &operator++() {
+
+    ++iterator_load_;
+    iterator_hwc_ += ThreadMap::kThreads;
+
+    if (iterator_load_ < ThreadMap::Iterations::kCount) {
+       return *this;
+    }
+    
+    iterator_load_ = 0;
+    iterator_hwc_ = offset_initial_hwc_;
+
+    return *this;
+  }
+
+  /// Determines the activation size loaded by iterator
+  CUTLASS_HOST_DEVICE
+  int get_load_size() {
+    return kActivationSize;
+  }
+
+  /// Determines the iterations needed
+  CUTLASS_HOST_DEVICE
+  int get_iteration_num() {
+    return ThreadMap::Iterations::kCount;
+  }
+
+  /// Determines whether the Depthwise fprop can execute the given problem; returns kErrorInvalidProblem if any check below fails, else kSuccess.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv2dProblemSize const &problem_size) {
+
+    // runtime stride and dilation must equal the compile-time StrideShape / DilationShape
+    if (problem_size.stride_h != StrideShape::kRow || problem_size.stride_w != StrideShape::kColumn) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    if (problem_size.dilation_h != DilationShape::kRow || problem_size.dilation_w != DilationShape::kColumn) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    // check alignment constraint on iterator's contiguous dimension: C must be a multiple of AccessType::kElements
+    if (problem_size.C % AccessType::kElements) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    return Status::kSuccess;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace conv
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_optimized.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_optimized.h
new file mode 100755
index 000000000..a858a23f9
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_optimized.h
@@ -0,0 +1,291 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM A (activation tile)
+    matrix from memory.
+
+    This iterator assumes TensorNHWC layout of tensors in Global Memory.
+*/
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/threadblock/depthwise_direct_conv_params.h"
+#include "cutlass/coord.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Shape_,
+          typename OutputTileShape_,
+          typename Element_,
+          typename Layout_,
+          typename ThreadMap_,
+          typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess> >
+class DepthwiseFpropActivationDirect2dConvTileAccessIteratorOptimized {
+ public:
+  //
+  // Types
+  //
+
+  using Shape = Shape_;  // tile shape; kColumn is the channel count per threadblock (asserted below)
+  using OutputTileShape = OutputTileShape_;  // output tile extents kN / kH / kW / kC
+  using Element = Element_;  // element type of the activation tensor
+  using Layout = Layout_;  // tensor layout; supplies TensorCoord, Index and LongIndex
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;  // supplies kThreads, kElementsPerAccess, Iterations, Detail::ShapeVec
+  using AccessType = AccessType_;  // vector type returned by get(); kElements wide
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
+  static int const kConvDim = 2;  // this iterator serves 2-D convolutions
+  using ConvProblemSize = typename conv::Conv2dProblemSize;
+  // Number of AccessType-sized accesses that make up one thread-map vector.
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+  // The division above has to be exact.
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
+                "Vectors implied by the thread map must be divisible by the access type.");
+
+  //
+  // Simplifying assertions (restrictions this implementation relies on)
+  //
+  static_assert(ThreadMap::Iterations::kContiguous == 1, "Require Iterations::kContiguous == 1");
+  
+  static_assert(OutputTileShape::kN == 1, "Require OutputTileShape::kN == 1");
+  static_assert(OutputTileShape::kC == Shape::kColumn, "Require OutputTile shape == channels per threadblock");
+
+  //
+  // Parameters structure (built by getParams(); held by reference in params_)
+  //
+
+  using Params = Depthwise2dFpropDirectConvParams<Layout>;
+
+ private:
+  Conv2dProblemSize const &problem_size_;  // held by reference: must outlive the iterator
+  Params const &params_;  // held by reference: must outlive the iterator
+  char const *pointer_;  // byte pointer to the activation tensor (offsets are applied in bytes)
+
+  // Base channel for the current threadblock
+  int base_c_;
+  // Base linear activation (tile) index for the current threadblock
+  int offset_intial_npq_;
+  // Base activation coordinate for the current threadblock
+  TensorCoord activatioin_base_;
+  // Initial thread position (set from thread_idx in the constructor)
+  int offset_initial_hwc_;
+  // Count of loads issued so far by this thread; wraps to 0 in operator++
+  int iterator_load_;
+  // Current loading position of this thread; advances by ThreadMap::kThreads per load
+  int iterator_hwc_;
+  // Number of loads for activations tensor X (params.activation_load_count).
+  const int number_of_loads_;
+
+ public:
+
+
+  /// Builds the iterator for thread `thread_idx` of the threadblock located at
+  /// `threadblock_offset` (row = initial linear tile offset, column = base channel).
+  /// `params` and `problem_size` are stored by reference and must outlive the iterator.
+  CUTLASS_HOST_DEVICE
+  DepthwiseFpropActivationDirect2dConvTileAccessIteratorOptimized(
+      Params const &params,
+      Conv2dProblemSize const &problem_size,
+      Element const *ptr,
+      int thread_idx,
+      MatrixCoord const &threadblock_offset = MatrixCoord())
+      // Initializers are listed in member declaration order, the order C++ executes them in.
+      : problem_size_(problem_size),
+        params_(params),
+        pointer_(reinterpret_cast<char const *>(ptr)),
+        base_c_(threadblock_offset.column()),
+        offset_intial_npq_(threadblock_offset.row()),
+        offset_initial_hwc_(thread_idx),
+        iterator_load_(0),
+        number_of_loads_(params.activation_load_count) {
+    // Derive the base activation coordinate, then set iterator_hwc_ for the first access.
+    set_activation_coord(offset_intial_npq_);
+    set_iteration_index(0);
+  }
+
+  /// Recomputes the base activation coordinate of the tile selected by `offset_npq`.
+  CUTLASS_HOST_DEVICE
+  void set_activation_coord(int offset_npq) {
+    // Decompose the linear offset with the divmod helpers held in params_:
+    // pq_divmod fills (tile_n, pq_remainder); q_divmod splits that into (tile_p, tile_q).
+    int tile_n, tile_p, tile_q, pq_remainder;
+    params_.pq_divmod(tile_n, pq_remainder, offset_npq);
+    params_.q_divmod(tile_p, tile_q, pq_remainder);
+
+    // Tile index -> first input row/column: scale by tile extent and stride, subtract padding.
+    // The result may be negative; valid() rejects coordinates outside the tensor.
+    int const first_h =
+        tile_p * OutputTileShape::kH * problem_size_.stride_h - problem_size_.pad_h;
+    int const first_w =
+        tile_q * OutputTileShape::kW * problem_size_.stride_w - problem_size_.pad_w;
+
+    activatioin_base_ = TensorCoord(tile_n, first_h, first_w, base_c_);
+  }
+  /// Builds the Params object for this iterator from the problem size and tensor layout.
+  CUTLASS_HOST_DEVICE
+  static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) {
+    return Params(
+        problem_size,
+        layout,
+        {Shape::kRow, Shape::kColumn},  // tile shape
+        {OutputTileShape::kN, OutputTileShape::kH, OutputTileShape::kW, OutputTileShape::kC},
+        sizeof_bits<Element>::value,  // element width in bits
+        ThreadMap::kThreads,  // thread count of the thread map
+        ThreadMap::Detail::ShapeVec::kContiguous,  // same divisor at() applies to iterator_hwc_
+        ThreadMap::kElementsPerAccess);  // elements per thread-map access
+  }
+
+  /// Repositions the iterator at load number `index`, overriding the internal counters.
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iterator_load_ = index;  // loads counted as already issued
+    iterator_hwc_ = index * ThreadMap::kThreads + offset_initial_hwc_;  // kThreads per load step
+  }
+
+  /// Adds a pointer offset in units of Element (converted to bytes before being applied)
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;  // elements -> bits -> bytes
+  }
+  /// Moves the iterator's base position to the next tile and recomputes the base coordinate.
+  CUTLASS_HOST_DEVICE
+  void advance() {
+    // Go to the next tile: the linear offset steps by problem_size_.split_k_slices
+    offset_intial_npq_ += problem_size_.split_k_slices;
+    // Refresh activatioin_base_ for the new offset
+    set_activation_coord(offset_intial_npq_);
+  }
+
+  /// Returns the coordinate in the activations tensor X that is currently pointed to
+  /// by the iterator.
+  CUTLASS_HOST_DEVICE
+  TensorCoord at() const {
+    // iterator_hwc_ is a flat index whose fastest-varying part selects the channel vector.
+    int const vectors_per_row = ThreadMap::Detail::ShapeVec::kContiguous;
+    int const channel = (iterator_hwc_ % vectors_per_row) * AccessType::kElements;
+    int spatial_index = iterator_hwc_ / vectors_per_row;
+
+    // The remaining part is split into an (h, w) pair by the divmod helper held in params_.
+    int tile_h, tile_w;
+    params_.activation_tile_w_divmod(tile_h, tile_w, spatial_index);
+    return activatioin_base_ + TensorCoord(0, tile_h, tile_w, channel);
+  }
+
+  /// Returns true if the current coordinate is within the activations tensor X
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+    TensorCoord pos = at();
+    bool const batch_and_channel_ok = pos.n() < problem_size_.N && pos.c() < problem_size_.C;
+    bool const row_ok = pos.h() >= 0 && pos.h() < problem_size_.H;  // h, w are also checked >= 0
+    return batch_and_channel_ok && row_ok && pos.w() >= 0 && pos.w() < problem_size_.W;
+  }
+
+  /// Returns a pointer to the vector starting at the current coordinate
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+    TensorCoord coord = at();
+    LongIndex element_offset = params_.layout(coord);  // linear offset counted in elements
+
+    // pointer_ is a byte pointer, so the element offset is converted
+    // elements -> bits -> bytes before it is added.
+    char const *byte_address = pointer_ + element_offset * sizeof_bits<Element>::value / 8;
+    return reinterpret_cast<AccessType const *>(byte_address);
+  }
+
+  /// Increments to the next memory access
+  CUTLASS_HOST_DEVICE
+  DepthwiseFpropActivationDirect2dConvTileAccessIteratorOptimized &operator++() {
+    // A thread's consecutive accesses are ThreadMap::kThreads positions apart.
+    iterator_hwc_ += ThreadMap::kThreads;
+    ++iterator_load_;
+
+    // Once all number_of_loads_ loads have been stepped through, wrap both counters
+    // back to their starting values.
+    if (iterator_load_ >= number_of_loads_) {
+      iterator_hwc_ = offset_initial_hwc_;
+      iterator_load_ = 0;
+    }
+
+    return *this;
+  }
+
+  /// Returns the activation size loaded by the iterator (params_.activation_size).
+  CUTLASS_HOST_DEVICE
+  int get_load_size() const {  // const-qualified like at()/valid()/get(): read-only query
+    return params_.activation_size;
+  }
+
+  /// Returns the number of load iterations needed (a runtime value, number_of_loads_).
+  CUTLASS_HOST_DEVICE
+  int get_iteration_num() const {  // const-qualified like at()/valid()/get(): read-only query
+    return number_of_loads_;
+  }
+
+  /// Determines whether the Depthwise fprop can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv2dProblemSize const &problem_size) {
+    // Alignment constraint on the iterator's contiguous dimension:
+    // C must be a whole multiple of AccessType::kElements.
+    bool const channels_aligned = (problem_size.C % AccessType::kElements) == 0;
+
+    // Channel misalignment is the only condition rejected here.
+    return channels_aligned ? Status::kSuccess : Status::kErrorInvalidProblem;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace conv
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_direct_conv_multistage.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_direct_conv_multistage.h
new file mode 100755
index 000000000..50aeee006
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_direct_conv_multistage.h
@@ -0,0 +1,551 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a multistage threadblock-scoped Implicit GEMM Convolution kernel.
+*/
+
+#pragma once
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/cache_operation.h"
+#include "cutlass/conv/threadblock/depthwise_mma_base.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the matrix product targeting CUDA cores and SIMT math
+/// instructions.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    /// Iterates over tiles of A operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorA_,
+    /// Iterates over tiles of A operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorA_,
+    /// Cache operation for operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB_,
+    /// Cache operation for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// Number of stages,
+    int Stages,
+    /// Epilogue stores the data into global memory
+    typename Epilogue_,
+    /// iterator implementation variants
+    conv::IteratorAlgorithm IteratorAlgorithm_ = conv::IteratorAlgorithm::kOptimized,
+    /// Used for partial specialization
+    typename Enable = bool>
+class DepthwiseFpropDirectConvMultipleStage :
+   public DepthwiseDirectConvMmaBase<Shape_, Policy_, Stages> {
+public:
+  /// Base class
+  using Base = DepthwiseDirectConvMmaBase<Shape_, Policy_, Stages>;
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+  /// Iterates over tiles of A operand in global memory
+  using IteratorA = IteratorA_;
+  /// Iterates over tiles of B operand in global memory
+  using IteratorB = IteratorB_;
+  /// Policy describing tuning details
+  using Policy = Policy_;
+  /// Epilogue type invoked once per tile inside operator()
+  using Epilogue = Epilogue_;
+  /// Iterators that write the A / B operand tiles to shared memory
+  using SmemIteratorA = SmemIteratorA_;
+  using SmemIteratorB = SmemIteratorB_;
+  /// Cache operations passed to cp_async_zfill for operands A and B
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
+  /// Iterator algorithm variant; the identifier's spelling (sic) is part of the public interface
+  static conv::IteratorAlgorithm const kItertorAlgorithm = IteratorAlgorithm_;
+
+  //
+  // Dependent types
+  //
+
+  /// Element type and fragment of the accumulator tile
+
+  using ElementC = typename Policy::Operator::ElementC;
+  using FragmentC = typename Policy::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+  
+  /// Internal structure exposed for introspection.
+  struct Detail {
+
+    /// Number of cp.async instructions to load one stage of operand A
+    static int const AsyncCopyIterationsPerStageA =
+        IteratorA::ThreadMap::Iterations::kCount;
+
+    /// Number of cp.async instructions to load one stage of operand B
+    static int const AsyncCopyIterationsPerStageB =
+        IteratorB::ThreadMap::Iterations::kCount;
+
+    /// Number of stages
+    static int const kStages = Stages;
+
+    /// Number of cp.async instructions to load one group of operand B (ceiling division)
+    static int const kAccessesPerGroupB = 
+        (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
+  };
+
+ private:
+
+  using WarpLoadedFragmentA = typename Operator::FragmentA;  // as loaded by the warp tile iterator
+  using WarpLoadedFragmentB = typename Operator::FragmentB;
+  using WarpTransformedFragmentA = typename Operator::TransformedFragmentA;  // output of Operator::transform
+  using WarpTransformedFragmentB = typename Operator::TransformedFragmentB;
+
+ private:
+
+  //
+  // Data members
+  //
+
+  /// Iterator to write threadblock-scoped tile of A operand to shared memory
+  SmemIteratorA smem_iterator_A_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB smem_iterator_B_;
+
+public:
+
+  /// Constructs the operator on top of the shared storage and positions the
+  /// warp-level tile iterators for the calling warp.
+  CUTLASS_DEVICE
+  DepthwiseFpropDirectConvMultipleStage(
+      ///< Shared storage needed for internal use by threadblock-scoped GEMM
+      typename Base::SharedStorage &shared_storage,
+      ///< ID within the threadblock
+      int thread_idx,
+      ///< ID of warp
+      int warp_idx,
+      ///< ID of each thread within a warp
+      int lane_idx)
+      : Base(shared_storage, thread_idx, warp_idx, lane_idx),
+        smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
+        smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) {
+    // Decompose the linear warp index into coordinates within the threadblock tile.
+    // Warps are numbered with M varying fastest, then N, then K:
+    //   warp_m: the warp's position within the threadblock along the M dimension
+    //   warp_n: the warp's position within the threadblock along the N dimension
+    //   warp_k: the warp's position within the threadblock along the K dimension
+    int const warps_per_k_slice = Base::WarpCount::kM * Base::WarpCount::kN;
+    int const warp_k = warp_idx / warps_per_k_slice;
+    int const warp_in_slice = warp_idx % warps_per_k_slice;
+
+    int const warp_m = warp_in_slice % Base::WarpCount::kM;
+    int const warp_n = warp_in_slice / Base::WarpCount::kM;
+
+    // Each step along K skips kWarpGemmIterations warp-level tiles.
+    int const k_tile_offset = Base::kWarpGemmIterations * warp_k;
+
+    // Add per-warp offsets in units of warp-level tiles
+    this->warp_tile_iterator_A_.add_tile_offset({warp_m, k_tile_offset});
+    this->warp_tile_iterator_B_.add_tile_offset({k_tile_offset, warp_n});
+  }
+  /// Issues the cp.async copies that load one stage of operand A into shared memory.
+  CUTLASS_DEVICE
+  void copy_tiles_and_advance(IteratorA &iterator_A,
+                              IteratorB &iterator_B,  // not referenced in this function body
+                              int group_start_A = 0,
+                              int group_start_B = 0) {  // group_start_B is not referenced either
+    if (kItertorAlgorithm == conv::IteratorAlgorithm::kFixedStrideDilation) {
+      // The number of iterations is a compile-time constant (AsyncCopyIterationsPerStageA).
+      iterator_A.set_iteration_index(group_start_A * IteratorA::kAccessesPerVector);
+      this->smem_iterator_A_.set_iteration_index(group_start_A);
+
+      // Async Copy for operand A
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) {
+        typename IteratorA::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorA::AccessType *>(this->smem_iterator_A_.get());
+
+        int const kSrcBytes = sizeof_bits<typename IteratorA::Element>::value *
+                              IteratorA::ThreadMap::kElementsPerAccess /
+                              IteratorA::kAccessesPerVector / 8;  // bytes moved per access
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
+              dst_ptr + v, iterator_A.get(), iterator_A.valid());  // valid() is the copy predicate
+
+          ++iterator_A;
+        }
+        ++this->smem_iterator_A_;
+      }
+    } else {
+      // The number of iterations is a runtime value (iterator_A.get_iteration_num()).
+      iterator_A.set_iteration_index(group_start_A * IteratorA::kAccessesPerVector);
+      this->smem_iterator_A_.set_iteration_index(group_start_A);
+
+      // Async Copy for operand A
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < iterator_A.get_iteration_num(); ++j) {
+        typename IteratorA::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorA::AccessType *>(this->smem_iterator_A_.get());
+
+        int const kSrcBytes = sizeof_bits<typename IteratorA::Element>::value *
+                              IteratorA::ThreadMap::kElementsPerAccess /
+                              IteratorA::kAccessesPerVector / 8;  // bytes moved per access
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
+              dst_ptr + v, iterator_A.get(), iterator_A.valid());  // valid() is the copy predicate
+
+          ++iterator_A;
+        }
+        ++this->smem_iterator_A_;
+      }
+    }
+  }
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate, running the epilogue once per tile
+  CUTLASS_DEVICE
+  void operator()(
+      ///< number of tile iterations; decremented once per prologue stage and per mainloop tile
+      int gemm_k_iterations,
+      ///< destination accumulator tile (cleared at the start of every mainloop tile)
+      FragmentC &accum,
+      ///< iterator over A operand in global memory
+      IteratorA &iterator_A,
+      ///< Params of the A global memory iterator (passed to setup_initial_status)
+      typename IteratorA::Params const &iterator_a_params,
+      ///< iterator over B operand in global memory
+      IteratorB &iterator_B,
+      ///< Params of the B global memory iterator (not referenced in this function body)
+      typename IteratorB::Params const &iterator_b_params,
+      ///< initial value of accumulator (not referenced in this function body)
+      FragmentC const &src_accum,
+      ///< Epilogue invoked after each tile
+      Epilogue &epilogue, 
+      ///< Output operator
+      typename Epilogue::OutputOp const &output_op, 
+      ///< Tile iterator for destination 
+      typename Epilogue::OutputTileIterator &destination_iterator,
+      ///< Tile iterator handed to the epilogue as its source operand
+      typename Epilogue::OutputTileIterator &source_iterator,
+      ///< Multiplier applied to the tile index given to both output tile iterators
+      int split_k_slices = 1
+      ) {
+
+    //
+    // Prologue
+    //
+
+    // Issue kStages - 1 complete stages; each one also consumes one tile iteration
+    CUTLASS_PRAGMA_UNROLL
+    for (int stage = 0; stage < Base::kStages - 1; ++stage, --gemm_k_iterations) {
+
+      if (stage == 0) {  // operand B is loaded only once, during the first stage
+        iterator_B.set_iteration_index(0);
+        this->smem_iterator_B_.set_iteration_index(0);
+
+        // Async Copy for operand B
+        CUTLASS_PRAGMA_UNROLL
+        for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) {
+          typename IteratorB::AccessType *dst_ptr =
+              reinterpret_cast<typename IteratorB::AccessType *>(this->smem_iterator_B_.get());
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
+            int const kSrcBytes = sizeof_bits<typename IteratorB::Element>::value *
+                                  IteratorB::ThreadMap::kElementsPerAccess /
+                                  IteratorB::kAccessesPerVector / 8;  // bytes moved per access
+
+            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
+                dst_ptr + v, iterator_B.get(), iterator_B.valid());
+            
+            ++iterator_B;
+          }
+
+          ++this->smem_iterator_B_;
+        }
+      }
+
+      if(kItertorAlgorithm == conv::IteratorAlgorithm::kFixedStrideDilation){
+        // The number of iterations is a compile-time constant.
+        iterator_A.set_iteration_index(0);
+        this->smem_iterator_A_.set_iteration_index(0);
+
+        // Async Copy for operand A
+        CUTLASS_PRAGMA_UNROLL
+        for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) {
+          typename IteratorA::AccessType *dst_ptr =
+              reinterpret_cast<typename IteratorA::AccessType *>(this->smem_iterator_A_.get());
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
+            int const kSrcBytes = sizeof_bits<typename IteratorA::Element>::value *
+                                  IteratorA::ThreadMap::kElementsPerAccess /
+                                  IteratorA::kAccessesPerVector / 8;  // bytes moved per access
+
+            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
+                dst_ptr + v, iterator_A.get(), iterator_A.valid());
+          
+            ++iterator_A;
+          }
+
+          ++this->smem_iterator_A_;
+        }
+
+      } else {
+        // The number of iterations is a runtime value; the shared-memory iterator is told it too.
+        iterator_A.set_iteration_index(0);
+        this->smem_iterator_A_.set_iteration_num(iterator_A.get_iteration_num());
+        this->smem_iterator_A_.set_iteration_index(0);
+
+
+        // Async Copy for operand A
+        CUTLASS_PRAGMA_UNROLL
+        for (int j = 0; j < iterator_A.get_iteration_num(); ++j) {
+          typename IteratorA::AccessType *dst_ptr =
+              reinterpret_cast<typename IteratorA::AccessType *>(this->smem_iterator_A_.get());
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
+            int const kSrcBytes = sizeof_bits<typename IteratorA::Element>::value *
+                                  IteratorA::ThreadMap::kElementsPerAccess /
+                                  IteratorA::kAccessesPerVector / 8;  // bytes moved per access
+
+            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
+                dst_ptr + v, iterator_A.get(), iterator_A.valid());
+
+            ++iterator_A;
+          }
+
+          ++this->smem_iterator_A_;
+        }
+      }
+
+      // Move to the next stage: next tile in global memory, next slot in shared memory
+      iterator_A.advance();
+
+      this->smem_iterator_A_.add_tile_offset({1, 0});
+
+      // Inserts a fence to group cp.async instructions into stages.
+      cutlass::arch::cp_async_fence();
+    }
+
+    /////////////////////////////////////////////////////////////////////////////
+    // Waits until kStages-2 stages have committed. 
+    cutlass::arch::cp_async_wait<Base::kStages - 2>();
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math
+    // instructions
+    WarpLoadedFragmentA warp_loaded_frag_A[2];
+    WarpLoadedFragmentB warp_loaded_frag_B[2];
+    WarpTransformedFragmentA warp_transformed_frag_A[2];
+    WarpTransformedFragmentB warp_transformed_frag_B[2];
+
+    Operator warp_mma;
+
+    this->warp_tile_iterator_A_.set_kgroup_index(0);
+    this->warp_tile_iterator_B_.set_kgroup_index(0);
+
+    this->warp_tile_iterator_A_.setup_initial_status(iterator_a_params);
+
+    // Load and transform the first pair of fragments before entering the mainloop
+    this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]);
+    this->warp_tile_iterator_B_.load(warp_loaded_frag_B[0]);
+
+    ++this->warp_tile_iterator_A_;
+    ++this->warp_tile_iterator_B_;
+
+    int smem_write_stage_idx = Base::kStages - 1;  // the prologue already filled kStages - 1 slots
+    int smem_read_stage_idx = 0;
+
+    warp_mma.transform(warp_transformed_frag_A[0], warp_transformed_frag_B[0],
+                       warp_loaded_frag_A[0], warp_loaded_frag_B[0]);
+
+    //
+    // Mainloop
+    //
+
+    unsigned int iterations = 0;  // number of tiles completed so far
+    constexpr int inner_loop_iterations = round_up(Base::kWarpGemmIterations, 2);  // presumably the next multiple of 2
+
+    CUTLASS_GEMM_LOOP
+    for (; gemm_k_iterations > (-Base::kStages + 1);) {   // Each iteration is a cta tile.
+
+      accum.clear();  // every tile starts from a zeroed accumulator
+    
+      //
+      // Loop over GEMM K dimension
+      //
+
+      // Computes a warp-level GEMM on data held in shared memory
+      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < inner_loop_iterations; ++warp_mma_k) {
+        if (Base::kWarpGemmIterations % 2 == 0 || warp_mma_k + 1 != Base::kWarpGemmIterations) {
+          // Load warp-level tiles from shared memory, wrapping to k offset if
+          // this is the last group as the case may be.
+
+          this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Shape::kK);
+          this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Shape::kK);
+
+          this->warp_tile_iterator_A_.load(warp_loaded_frag_A[(warp_mma_k + 1) % 2]);
+          this->warp_tile_iterator_B_.load(warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
+
+          ++this->warp_tile_iterator_A_;
+          ++this->warp_tile_iterator_B_;
+        }
+
+        if (warp_mma_k > 0)  // fragment 0 was already transformed before the loop / at the loop tail
+          warp_mma.transform(warp_transformed_frag_A[warp_mma_k % 2],
+                              warp_transformed_frag_B[warp_mma_k % 2],
+                              warp_loaded_frag_A[warp_mma_k % 2],
+                              warp_loaded_frag_B[warp_mma_k % 2]);
+
+        // Issue global->shared copies for the next stage
+        int group_start_iteration_A, group_start_iteration_B;
+
+        if (warp_mma_k == 0) {  // the whole next stage is issued at once, on the first k step
+          group_start_iteration_A = 0;
+          group_start_iteration_B = 0;
+          copy_tiles_and_advance(
+              iterator_A, iterator_B, group_start_iteration_A, group_start_iteration_B);
+        }
+
+        if (warp_mma_k < Base::kWarpGemmIterations) {  // skip the padding step added by round_up
+          warp_mma(
+            accum, 
+            warp_transformed_frag_A[warp_mma_k % 2],
+            warp_transformed_frag_B[warp_mma_k % 2], 
+            accum
+          );
+        }
+
+        if (warp_mma_k + 1 == inner_loop_iterations)  // last step: pre-transform for the next tile
+          warp_mma.transform(warp_transformed_frag_A[(warp_mma_k + 1) % 2],
+                              warp_transformed_frag_B[(warp_mma_k + 1) % 2],
+                              warp_loaded_frag_A[(warp_mma_k + 1) % 2],
+                              warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
+
+        if (warp_mma_k + 2 == inner_loop_iterations) {
+          // Inserts a fence to group cp.async instructions into stages.
+          cutlass::arch::cp_async_fence();
+
+          // Waits until kStages-2 stages of cp.async have committed
+          arch::cp_async_wait<Base::kStages - 2>();
+          __syncthreads();
+
+          // Move to the next cta
+          iterator_A.advance();
+
+          this->smem_iterator_A_.add_tile_offset({1, 0});
+
+          // Add negative offsets to return iterators to the 'start' of the
+          // circular buffer in shared memory
+          if (smem_write_stage_idx == (Base::kStages - 1)) {
+            this->smem_iterator_A_.add_tile_offset({-Base::kStages, 0});
+   
+            smem_write_stage_idx = 0;
+          } else {
+            ++smem_write_stage_idx;
+          }
+
+          if (smem_read_stage_idx == (Base::kStages - 1)) {  // wrap the read side as well
+            this->warp_tile_iterator_A_.advance(- (Base::kStages-1) * iterator_A.get_load_size());
+            smem_read_stage_idx = 0;
+          } else {
+            this->warp_tile_iterator_A_.advance(iterator_A.get_load_size());
+            ++smem_read_stage_idx;
+          }
+
+          if (kItertorAlgorithm == conv::IteratorAlgorithm::kFixedStrideDilation) {
+            this->warp_tile_iterator_A_.setup_initial_status(iterator_a_params);
+          }
+
+          // Go back to the start position; operand B is not multi-staged
+          this->warp_tile_iterator_B_.add_tile_offset({-Policy::kPartitionsK * Shape::kK, 0});
+
+          --gemm_k_iterations;
+        }
+      }
+
+      //
+      // Epilogue (runs once per tile; offset = B's load size + stage slot * A's load size)
+      //
+      int32_t smem_base_offset = iterator_B.get_load_size() + (iterations % Base::kStages) * iterator_A.get_load_size();
+
+      destination_iterator.set_tile_index(iterations * split_k_slices);
+      
+      source_iterator.set_tile_index(iterations * split_k_slices);
+    
+      epilogue(output_op, destination_iterator, accum, source_iterator, smem_base_offset);
+
+      ++iterations;
+    }
+
+    // Insert fence and wait for all outstanding cp.async operations to commit.
+    cutlass::arch::cp_async_fence();
+    cutlass::arch::cp_async_wait<0>();
+    __syncthreads();
+
+  }
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace conv
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_filter_tile_access_iterator_direct_conv_optimized.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_filter_tile_access_iterator_direct_conv_optimized.h
new file mode 100755
index 000000000..52d604e43
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_filter_tile_access_iterator_direct_conv_optimized.h
@@ -0,0 +1,261 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) 
+    matrix from memory.
+
+    This iterator assumes TensorNHWC layout of tensors in Global Memory.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+#include "cutlass/conv/threadblock/conv2d_params.h"
+#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+template <typename Shape_,
+          typename Element_,
+          typename Layout_,
+          typename ThreadMap_,
+          typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess> >
+class DepthwiseFpropFilterDirectConvTileAccessIteratorOptimized {
+public:   
+  //
+  // Types
+  //
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = Layout_;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
+  static int const kConvDim = 2;
+  using ConvProblemSize = typename conv::Conv2dProblemSize;
+ 
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+  
+  static int const kFilterSize = ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess * ThreadMap::kThreads *
+           sizeof_bits<Element>::value / 8;
+
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), 
+    "Vectors implied by the thread map must be divisible by the access type.");
+ 
+  //
+  // Simplifying assertions
+  //
+  static_assert(ThreadMap::Iterations::kContiguous == 1,
+    "Require Iterations::kContiguous == 1");
+
+  //
+  // Parameters structure
+  //
+  using Params = Depthwise2dFpropDirectConvFilterIteratorParams<Layout>;
+
+ protected:
+
+  Conv2dProblemSize const &problem_size_;
+  Params const &params_;
+  LongIndex iteration_contiguous_;
+  LongIndex iteration_strided_;
+  LongIndex iteration_vector_;
+  char const *pointer_;
+
+  int filter_k_;
+  int offset_trs_[ThreadMap::Iterations::kStrided];
+
+public:
+  /// Constructs the iterator for one thread: stores a byte pointer to the filter and derives the
+  /// thread's starting column (filter_k_) and row offsets (offset_trs_) from its ThreadMap position.
+  /// `params` and `problem_size` are held by reference, so they must outlive the iterator.
+  CUTLASS_HOST_DEVICE
+  DepthwiseFpropFilterDirectConvTileAccessIteratorOptimized(
+    Params const &params,
+    Conv2dProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    MatrixCoord const &threadblock_offset = MatrixCoord()
+  ):
+    problem_size_(problem_size),  // initializers listed in member declaration order (avoids -Wreorder)
+    params_(params),
+    pointer_(reinterpret_cast<char const *>(ptr)),
+    filter_k_(0) {
+
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+    // Column handled by this thread; valid() compares it against problem_size_.K.
+    filter_k_ = threadblock_offset.column() + thread_coord.contiguous();
+    // One row offset per strided iteration, spaced ThreadMap::Delta::kStrided apart.
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      offset_trs_[s] = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
+    }
+
+    set_iteration_index(0);
+  }
+
+  CUTLASS_HOST_DEVICE
+  static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) {
+      return Params(problem_size, layout, {Shape::kRow, Shape::kColumn}, kFilterSize);
+  }
+
+  /// Overrides the internal iteration index by splitting a linear `index` into (vector, contiguous, strided) parts
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iteration_vector_ = index % kAccessesPerVector;  // fastest-varying component
+    int residual_access = index / kAccessesPerVector;
+    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;  // always 0: kContiguous is asserted to be 1
+    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;  // slowest-varying component
+  }
+
+  /// Adds a pointer offset in units of Element; pointer_ is a byte pointer, so the offset is scaled to bytes as in get()
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+
+  CUTLASS_HOST_DEVICE
+  void advance() {
+    // Do nothing because the filter is persistent in the SMEM
+  }
+
+  /// Returns the coordinate in the filter tensor W that is currently pointed to by the iterator,
+  /// packed as (k, trs, 0, 0); valid() and get() read k back through n() and trs through h().
+  CUTLASS_HOST_DEVICE
+  TensorCoord at() const {
+
+    int k = filter_k_ + iteration_vector_ * AccessType::kElements;
+    int trs =  offset_trs_[iteration_strided_];
+
+    return TensorCoord(k, trs, 0 , 0);  // As a 2D-matrix
+  }
+
+  /// Returns true if the current coordinate is within the filter tensor W
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+
+    TensorCoord coord = at();
+
+    return coord.n() < problem_size_.K &&
+            coord.h() < Shape::kColumn;
+  }
+
+  /// Returns a pointer to the vector starting at the current coordinate (no bounds check; see valid())
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+    TensorCoord coord = at();
+    int64_t offset = coord.n();  // element offset, converted to bytes in the return statement
+    if (params_.is_convolution) {
+      offset += (Shape::kColumn - coord.h() - 1)* problem_size_.K;  // mirrored filter position
+    } else {
+      offset += coord.h() * problem_size_.K;
+    }
+
+    return reinterpret_cast<AccessType const *>(pointer_ +
+                                                offset * sizeof_bits<Element>::value / 8);
+  }
+
+  /// Increments to the next memory access
+  CUTLASS_HOST_DEVICE
+  DepthwiseFpropFilterDirectConvTileAccessIteratorOptimized &operator++() {
+    ++iteration_vector_;
+    if (iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+    iteration_vector_ = 0;
+
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;
+    
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    iteration_strided_ = 0;
+ 
+    return *this;
+  }
+
+  /// Returns kFilterSize, the compile-time size in bytes of the filter data loaded by this iterator
+  CUTLASS_HOST_DEVICE
+  int get_load_size() {
+    return kFilterSize;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv2dProblemSize const &problem_size) {
+
+    // check alignment constraint on iterator's contiguous dimension
+    if (problem_size.K % AccessType::kElements) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    // check whether runtime filter size is same as templated filter size.
+    if ((problem_size.R * problem_size.S) != Shape::kColumn) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    return Status::kSuccess;
+  }
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_pipelined.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_pipelined.h
new file mode 100755
index 000000000..c2825fa60
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_pipelined.h
@@ -0,0 +1,336 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/numeric_conversion.h"
+
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/mma_base.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions.
+template <
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape_,
+  /// Iterates over tiles of A operand in global memory 
+  //  (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
+  typename IteratorA_,
+  /// Iterates over tiles of A operand in shared memory
+  /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+  typename SmemIteratorA_,
+  /// Iterates over tiles of B operand in global memory
+  //  (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
+  typename IteratorB_,
+  /// Iterates over tiles of B operand in shared memory
+  /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+  typename SmemIteratorB_,
+  /// Data type of accumulator matrix
+  typename ElementC_,
+  /// Layout of accumulator matrix
+  typename LayoutC_,
+  /// Policy describing tuning details (concept: MmaPolicy)
+  typename Policy_,
+  /// Transformation applied to A operand
+  typename TransformA_ = NumericArrayConverter<
+    typename SmemIteratorA_::Element, 
+    typename IteratorA_::Element, 
+    IteratorA_::Fragment::kElements>,
+  ///
+  /// Transformation applied to B operand
+  typename TransformB_ = NumericArrayConverter<
+    typename SmemIteratorB_::Element, 
+    typename IteratorB_::Element, 
+    IteratorB_::Fragment::kElements>,
+  /// Used for partial specialization
+  typename Enable = bool
+>
+class DepthwiseFpropPipelined : public gemm::threadblock::MmaBase<Shape_, Policy_, 2> {
+public:
+
+  ///< Base class
+  using Base = gemm::threadblock::MmaBase<Shape_, Policy_, 2>;
+
+  using Shape = Shape_;             ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using IteratorA = IteratorA_;     ///< Iterates over tiles of A operand in global memory
+  using IteratorB = IteratorB_;     ///< Iterates over tiles of B operand in global memory
+  using ElementC = ElementC_;       ///< Data type of accumulator matrix
+  using LayoutC = LayoutC_;         ///< Layout of accumulator matrix
+  using Policy = Policy_;           ///< Policy describing tuning details
+
+  using SmemIteratorA = SmemIteratorA_;
+  using SmemIteratorB = SmemIteratorB_;
+
+  using TransformA = TransformA_;
+  using TransformB = TransformB_;
+
+  //
+  // Dependent types
+  //
+
+  /// Fragment of operand A loaded from global memory
+  using FragmentA = typename IteratorA::Fragment;
+
+  /// Fragment of operand B loaded from global memory
+  using FragmentB = typename IteratorB::Fragment;
+
+  /// Fragment of accumulator tile
+  using FragmentC = typename Policy::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+
+  /// Obtain the arch tag from the warp-level operator
+  using ArchTag = typename Policy::Operator::ArchTag;
+
+  /// Complex transform on A operand
+  static ComplexTransform const kTransformA = Operator::kTransformA;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB = Operator::kTransformB;
+
+  // statically assert kStages for MmaPipelined is two (Double-buffered pipeline)
+  static_assert((Base::kStages==2), "MmaPipelined requires kStages set to value 2");
+
+private:
+
+  using WarpFragmentA = typename Operator::FragmentA;
+  using WarpFragmentB = typename Operator::FragmentB;
+
+protected:
+
+  /// Iterator to write threadblock-scoped tile of A operand to shared memory
+  SmemIteratorA smem_iterator_A_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB smem_iterator_B_;
+
+public:
+
+  /// Construct from tensor references
+  CUTLASS_DEVICE
+  DepthwiseFpropPipelined(
+    typename Base::SharedStorage &shared_storage,       ///< Shared storage needed for internal use by threadblock-scoped GEMM
+    int thread_idx,                                     ///< ID within the threadblock
+    int warp_idx,                                       ///< ID of warp
+    int lane_idx                                        ///< ID of each thread within a warp
+  ):
+    Base(shared_storage, thread_idx, warp_idx, lane_idx),
+    smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
+    smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) {
+
+    // Compute warp location within threadblock tile by mapping the warp_id to
+    // three coordinates:
+    //   _m: the warp's position within the threadblock along the M dimension
+    //   _n: the warp's position within the threadblock along the N dimension
+    //   _k: the warp's position within the threadblock along the K dimension
+
+    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
+    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
+
+    int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
+    int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
+
+    // Add per-warp offsets in units of warp-level tiles
+    this->warp_tile_iterator_A_.add_tile_offset({warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
+    this->warp_tile_iterator_B_.add_tile_offset({Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
+  }
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate
+  CUTLASS_DEVICE
+  void operator()(
+    int gemm_k_iterations,                            ///< number of iterations of the mainloop
+    FragmentC &accum,                                 ///< destination accumulator tile
+    IteratorA iterator_A,                             ///< iterator over A operand in global memory
+    IteratorB iterator_B,                             ///< iterator over B operand in global memory
+    FragmentC const &src_accum,                       ///< source accumulator tile
+    int gemm_k_iterations_per_channel = 0,            ///< number of iterations per channel
+    TransformA transform_A = TransformA(),            ///< transformation applied to A fragment
+    TransformB transform_B = TransformB()) {          ///< transformation applied to B fragment
+
+    //
+    // Prologue
+    //
+
+    // Perform accumulation in the 'd' output operand
+    accum = src_accum;
+
+    FragmentA tb_frag_A;
+    FragmentB tb_frag_B;
+
+    tb_frag_A.clear();
+    tb_frag_B.clear();
+
+    // The last kblock is loaded in the prolog
+    iterator_A.load(tb_frag_A);
+    iterator_B.load(tb_frag_B);
+
+    ++iterator_A;
+    ++iterator_B;
+
+    this->smem_iterator_A_.store(transform_A(tb_frag_A));
+    this->smem_iterator_B_.store(transform_B(tb_frag_B));
+
+    ++this->smem_iterator_A_;
+    ++this->smem_iterator_B_;
+
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math instructions
+    WarpFragmentA warp_frag_A[2];
+    WarpFragmentB warp_frag_B[2];
+
+    this->warp_tile_iterator_A_.set_kgroup_index(0);
+    this->warp_tile_iterator_B_.set_kgroup_index(0);
+
+    this->warp_tile_iterator_A_.load(warp_frag_A[0]);
+    this->warp_tile_iterator_B_.load(warp_frag_B[0]);
+
+    ++this->warp_tile_iterator_A_;
+    ++this->warp_tile_iterator_B_;
+
+    Operator warp_mma;
+
+    int smem_write_stage_idx = 1;
+    // Depthwise specific
+    int channel_start_index = 0;
+    int rs_plane_idx = 0;
+
+    // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing 
+    // shared memory loads (which have the tightest latency requirement).
+
+    //
+    // Mainloop
+    //
+
+    // Note: The main loop does not support Base::kWarpGemmIterations == 2.
+    CUTLASS_GEMM_LOOP
+    for (; gemm_k_iterations > 0; --gemm_k_iterations) {
+      //
+      // Loop over GEMM K dimension
+      //
+
+      if(rs_plane_idx == gemm_k_iterations_per_channel - 1){
+        // Reset iteration index.
+        iterator_B.set_iteration_index(0);
+      }
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) {
+
+        // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group
+        // as the case may be.
+
+        if (warp_mma_k == Base::kWarpGemmIterations - 1) {
+
+          // Write fragments to shared memory
+          this->smem_iterator_A_.store(transform_A(tb_frag_A));
+
+          this->smem_iterator_B_.store(transform_B(tb_frag_B));
+
+          __syncthreads();
+          
+          if(rs_plane_idx == gemm_k_iterations_per_channel - 1){
+            // Move to next set of filter groups.
+            channel_start_index += Base::kWarpGemmIterations;
+          }
+
+          ++this->smem_iterator_A_;
+          ++this->smem_iterator_B_;
+
+          // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
+          if (smem_write_stage_idx == 1) {
+            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
+            this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
+          }
+          else {
+            this->warp_tile_iterator_A_.add_tile_offset(
+                {0, -Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations});
+            this->warp_tile_iterator_B_.add_tile_offset(
+                {-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations,
+                 0});
+          }
+
+          smem_write_stage_idx ^= 1;
+        }
+
+        this->warp_tile_iterator_A_.set_kgroup_index(channel_start_index + (warp_mma_k + 1) % Base::kWarpGemmIterations);
+        this->warp_tile_iterator_B_.set_kgroup_index(channel_start_index + (warp_mma_k + 1) % Base::kWarpGemmIterations);
+        
+        this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k + 1) % 2]);
+        this->warp_tile_iterator_B_.load(warp_frag_B[(warp_mma_k + 1) % 2]);
+
+        ++this->warp_tile_iterator_A_;
+        ++this->warp_tile_iterator_B_;
+
+        if (warp_mma_k == 0) {
+
+          iterator_A.load(tb_frag_A);
+          iterator_B.load(tb_frag_B);
+    
+          ++iterator_A;
+          ++iterator_B;
+        }
+
+        warp_mma(accum, warp_frag_A[warp_mma_k % 2],
+                 warp_frag_B[warp_mma_k % 2], accum);
+      }
+
+      rs_plane_idx = (rs_plane_idx == gemm_k_iterations_per_channel - 1) ? 0: (rs_plane_idx + 1);
+
+    }
+
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_mma_base.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_mma_base.h
new file mode 100755
index 000000000..967587be0
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_mma_base.h
@@ -0,0 +1,229 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a directconv threadblock-scoped Depthwise kernel.
+*/
+
+#pragma once
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Policy object for the depthwise direct-conv threadblock MMA: a carrier of types and one constant, with no behavior
+template <
+    /// Warp-level GEMM operator (concept: gemm::warp::Mma)
+    typename Operator_,
+    /// Padding used for A operand in shared memory (concept: MatrixShape)
+    typename SmemPaddingA_,
+    /// Padding used for B operand in shared memory (concept: MatrixShape)
+    typename SmemPaddingB_,
+    /// Thread map type for the A operand (re-exported as ThreadMapA)
+    typename ThreadMapA_,
+    /// Thread map type for the B operand (re-exported as ThreadMapB)
+    typename ThreadMapB_,
+    /// Number of partitions of K dimension of GEMM
+    int PartitionsK = 1>
+struct DepthwiseDirectConvMmaPolicy {
+  /// Warp-level GEMM operator (concept: gemm::warp::MmaTensorOp or gemm::warp::MmaSimt)
+  using Operator = Operator_;
+
+  /// Padding used for A operand in shared memory
+  using SmemPaddingA = SmemPaddingA_;
+
+  /// Padding used for B operand in shared memory
+  using SmemPaddingB = SmemPaddingB_;
+  /// Thread maps for the A and B operands
+  using ThreadMapA = ThreadMapA_;
+  using ThreadMapB = ThreadMapB_;
+
+  /// Number of partitions of K dimension
+  static int const kPartitionsK = PartitionsK;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the matrix product targeting CUDA cores and SIMT math
+/// instructions.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// Number of stages,
+    int Stages,
+    /// Used for partial specialization
+    typename Enable = bool>
+class DepthwiseDirectConvMmaBase {
+ public:
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+
+  ///< Policy describing tuning details
+  using Policy = Policy_;
+
+  //
+  // Dependent types
+  //
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+
+  /// Shape describing the overall GEMM computed from shared memory
+  /// by each warp.
+  using WarpGemm = typename Policy::Operator::Shape;
+
+  /// Shape describing the number of warps filling the CTA
+  using WarpCount = cutlass::gemm::
+      GemmShape<Shape::kM / WarpGemm::kM, Shape::kN / WarpGemm::kN, Shape::kK / WarpGemm::kK>;
+
+  /// Number of warp-level GEMM operations
+  /// kWarpGemmIterations may be even or odd.
+  static int const kWarpGemmIterations = (WarpGemm::kK / Operator::Policy::MmaShape::kK);
+
+  /// Number of stages
+  static int const kStages = Stages;
+
+  /// Tensor reference to the A operand
+  using TensorRefA = TensorRef<typename Operator::ElementA, typename Operator::LayoutA>;
+
+  /// Tensor reference to the B operand
+  using TensorRefB = TensorRef<typename Operator::ElementB, typename Operator::LayoutB>;
+
+  static_assert(kWarpGemmIterations > 1,
+                "The pipelined structure requires at least two warp-level "
+                "GEMM operations.");
+
+  //
+  // Nested structs
+  //
+
+  /// Shared storage object needed by threadblock-scoped GEMM
+  class SharedStorage {
+   public:
+    //
+    // Type definitions
+    //
+
+    /// Shape of the A matrix operand in shared memory
+    using ShapeA = MatrixShape<1,  // Not determined at compile-time :(
+                               Shape::kN + Policy::SmemPaddingA::kRow>;
+
+    /// Shape of the B matrix operand in shared memory
+    using ShapeB = MatrixShape<Policy::ThreadMapB::StorageShape::kStrided +
+                                   Policy::SmemPaddingB::kRow,  // filter_rs_size
+                               Policy::ThreadMapB::StorageShape::kContiguous +
+                                   Policy::SmemPaddingB::kColumn>;  // Tile N = 64?
+
+   public:
+    //
+    // Data members
+    //
+
+    // Let persistent B matrix in front of dynamic matrix A
+    /// Buffer for B operand
+    AlignedBuffer<typename Operator::ElementB, ShapeB::kCount> operand_B;
+
+    /// Buffer for A operand
+    /// Not be determined at compile-time -- Just to get a Smem start address.
+    AlignedBuffer<typename Operator::ElementA, 1> operand_A;  
+   public:
+    //
+    // Methods
+    //
+
+    /// Returns a layout object for the A matrix (host/device callable, matching LayoutB() and operand_A_ref())
+    CUTLASS_HOST_DEVICE
+    static typename Operator::LayoutA LayoutA() {
+      return Operator::LayoutA::packed({ShapeA::kRow, ShapeA::kColumn});
+    }
+
+    /// Returns a layout object for the B matrix
+    CUTLASS_HOST_DEVICE
+    static typename Operator::LayoutB LayoutB() {
+      return Operator::LayoutB::packed({ShapeB::kRow, ShapeB::kColumn});
+    }
+
+    /// Returns a TensorRef to the A operand
+    CUTLASS_HOST_DEVICE
+    TensorRefA operand_A_ref() { return TensorRefA{operand_A.data(), LayoutA()}; }
+
+    /// Returns a TensorRef to the B operand
+    CUTLASS_HOST_DEVICE
+    TensorRefB operand_B_ref() { return TensorRefB{operand_B.data(), LayoutB()}; }
+  };
+
+ protected:
+  //
+  // Data members
+  //
+
+  /// Iterator to load a warp-scoped tile of A operand from shared memory
+  typename Operator::IteratorA warp_tile_iterator_A_;
+
+  /// Iterator to load a warp-scoped tile of B operand from shared memory
+  typename Operator::IteratorB warp_tile_iterator_B_;
+
+ public:
+  /// Constructs the warp-level A and B tile iterators over the shared-storage operand references
+  CUTLASS_DEVICE
+  DepthwiseDirectConvMmaBase(
+      ///< Shared storage needed for internal use by threadblock-scoped GEMM
+      SharedStorage &shared_storage,
+      ///< ID within the threadblock (not used by this constructor)
+      int thread_idx,
+      ///< ID of warp (not used by this constructor)
+      int warp_idx,
+      ///< ID of each thread within a warp; passed to both warp tile iterators
+      int lane_idx)
+      : warp_tile_iterator_A_(shared_storage.operand_A_ref(), lane_idx),
+        warp_tile_iterator_B_(shared_storage.operand_B_ref(), lane_idx) {}
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace conv
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_mma_core_with_lane_access_size.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_mma_core_with_lane_access_size.h
new file mode 100755
index 000000000..de84180f3
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_mma_core_with_lane_access_size.h
@@ -0,0 +1,952 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines basic properties needed by CTA-level GEMMs assuming expectations about data
+      layout of the global memory fragments, data types, and internal tile sizes.
+
+      Partial specializations for threadblock::Mma operations targeting depthwise related simt instructions.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/gemm/warp/mma.h"
+
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/warp/mma_depthwise_simt.h"
+
+#include "cutlass/gemm/threadblock/mma_pipelined.h"
+#include "cutlass/gemm/threadblock/mma_singlestage.h"
+
+#include "cutlass/gemm/threadblock/mma_base.h"
+#include "cutlass/conv/threadblock/depthwise_mma_base.h"
+
+#include "cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear_direct_conv.h"
+
+#include "cutlass/arch/cache_operation.h" 
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+namespace detail {
+//
+// Maps WarpShapeM, the warp's whole tile of elements (1D), to the 2D extent (kP x kQ) held by
+// each of the WarpNumThreadsM partitions within the warp.
+// The goal is for each thread's tile of elements to be as square as
+// possible for performance (4x4 will be faster than 2x8).
+template<int WarpShapeM,  // The number of elements (1D) contained in the entire warp
+         int WarpNumThreadsM> // The number of partitions within the warp
+struct SimtWarpShape {
+  // Invariant kept by every specialization: kP * kQ * WarpNumThreadsM == WarpShapeM.
+  // The primary template defines no kP/kQ; add a specialization for each further combination needed.
+};
+template <>
+struct SimtWarpShape<4, 4> {
+  static constexpr int kP = 1;
+  static constexpr int kQ = 1;
+};
+
+template <>
+struct SimtWarpShape<4, 2> {
+  static constexpr int kP = 2;
+  static constexpr int kQ = 1;
+};
+
+template <>
+struct SimtWarpShape<4, 1> {
+  static constexpr int kP = 2;
+  static constexpr int kQ = 2;
+};
+
+template <>
+struct SimtWarpShape<8, 1> {
+  static constexpr int kP = 2;
+  static constexpr int kQ = 4;
+};
+template <>
+struct SimtWarpShape<8, 2> {
+  static constexpr int kP = 2;
+  static constexpr int kQ = 2;
+};
+template <>
+struct SimtWarpShape<8, 4> {
+  static constexpr int kP = 1;
+  static constexpr int kQ = 2;
+};
+
+template <>
+struct SimtWarpShape<16, 1> {
+  static constexpr int kP = 4;
+  static constexpr int kQ = 4;
+};
+template <>
+struct SimtWarpShape<16, 2> {
+  static constexpr int kP = 2;
+  static constexpr int kQ = 4;
+};
+template <>
+struct SimtWarpShape<16, 4> {
+  static constexpr int kP = 2;
+  static constexpr int kQ = 2;
+};
+// WarpShapeM == 25 is a 5x5 tile and is accepted only with WarpNumThreadsM == 1 (static_assert below).
+template <int WarpNumThreadsM>
+struct SimtWarpShape<25, WarpNumThreadsM> {
+  static_assert(WarpNumThreadsM == 1, "WarpShapeM could not be evenly splited by threads");
+  static constexpr int kP = 5;
+  static constexpr int kQ = 5;
+};
+
+template <>
+struct SimtWarpShape<32, 1> {
+  static constexpr int kP = 4;
+  static constexpr int kQ = 8;
+};
+
+template <>
+struct SimtWarpShape<32, 2> {
+  static constexpr int kP = 4;
+  static constexpr int kQ = 4;
+};
+
+template <>
+struct SimtWarpShape<32, 4> {
+  static constexpr int kP = 2;
+  static constexpr int kQ = 4;
+};
+
+}  // namespace detail
+/// Primary template (declaration only) of the depthwise MMA core selected by per-lane access sizes; see the partial specializations.
+template <
+    /// Shape of threadblock-scoped matrix multiply operator
+    typename Shape,
+    /// Shape of warp-level matrix multiply operator
+    typename WarpShape,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape,
+    /// Element data type of A operand
+    typename ElementA,
+    /// Layout of operand A
+    typename LayoutA,
+    /// Element data type of B operand
+    typename ElementB,
+    /// Layout of operand B
+    typename LayoutB,
+    /// Data type of accumulator
+    typename ElementC,
+    /// Layout of accumulator
+    typename LayoutC,
+    /// Indicates type of math operator (arch::OpClassSimt or arch::OpClassTensorOp)
+    typename OperatorClass,
+    /// Size of a warp-scoped per thread access of operand A (-1 selects the DefaultMmaCore-based specialization)
+    int kLaneAccessSizeA_ = 0,
+    /// Size of a warp-scoped per thread access of operand B (-1 selects the DefaultMmaCore-based specialization)
+    int kLaneAccessSizeB_ = 0,
+    /// Number of stages
+    int Stages = 2,
+    /// Operation performed by MMA: saturating multiply-add for TensorOp with 8-bit/4-bit integer A, plain multiply-add otherwise
+    typename Operator = typename platform::conditional<
+        (platform::is_same<OperatorClass,
+                           cutlass::arch::OpClassTensorOp>::value) &&
+            (platform::is_same<ElementA, int8_t>::value ||
+             platform::is_same<ElementA, int4b_t>::value ||
+             platform::is_same<ElementA, uint8_t>::value ||
+             platform::is_same<ElementA, uint4b_t>::value),
+        cutlass::arch::OpMultiplyAddSaturate,
+        cutlass::arch::OpMultiplyAdd>::type,
+    /// Store the accumulators in row major or column major.  Row major is used
+    /// when output layout is interleaved.
+    bool AccumulatorsInRowMajor = false,
+    /// Cache operation of operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA =
+        cutlass::arch::CacheOperation::Global,
+    /// Cache operation of operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB =
+        cutlass::arch::CacheOperation::Global,
+    /// per-element transformation for elements of A
+    ComplexTransform TransformA = ComplexTransform::kNone,
+    /// per-element transformation for elements of B
+    ComplexTransform TransformB = ComplexTransform::kNone,
+    bool IsComplex = false // (is_complex<ElementA>::value || is_complex<ElementB>::value)
+>
+struct DepthwiseMmaCoreWithLaneAccessSize;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Primary template (declaration only) of the depthwise direct-convolution MMA core; see the partial specializations.
+template <
+    /// Shape of threadblock-scoped matrix multiply operator
+    typename Shape,
+    /// Shape of threadblock-scoped output tile
+    typename ThreadBlockOutputShape,
+    /// Shape of filter shape per threadblock
+    typename FilterShape,
+    /// Shape of warp-level matrix multiply operator
+    typename WarpShape,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape,
+    /// Element data type of A operand
+    typename ElementA,
+    /// Layout of operand A
+    typename LayoutA,
+    /// Element data type of B operand
+    typename ElementB,
+    /// Layout of operand B
+    typename LayoutB,
+    /// Data type of accumulator
+    typename ElementC,
+    /// Layout of accumulator
+    typename LayoutC,
+    /// Indicates type of math operator (arch::OpClassSimt or arch::OpClassTensorOp)
+    typename OperatorClass,
+    /// Size of a warp-scoped per thread access of operand A
+    int kLaneAccessSizeA_ = 0,
+    /// Size of a warp-scoped per thread access of operand B
+    int kLaneAccessSizeB_ = 0,
+    /// Number of stages
+    int Stages = 2,
+    /// Operation performed by MMA: saturating multiply-add for TensorOp with 8-bit/4-bit integer A, plain multiply-add otherwise
+    typename Operator = typename platform::conditional<
+        (platform::is_same<OperatorClass,
+                           cutlass::arch::OpClassTensorOp>::value) &&
+            (platform::is_same<ElementA, int8_t>::value ||
+             platform::is_same<ElementA, int4b_t>::value ||
+             platform::is_same<ElementA, uint8_t>::value ||
+             platform::is_same<ElementA, uint4b_t>::value),
+        cutlass::arch::OpMultiplyAddSaturate,
+        cutlass::arch::OpMultiplyAdd>::type,
+    /// Iterator algo type
+    conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kAnalytic,
+    /// Stride ( MatrixShape<Height, Width> ); defaults to MatrixShape<-1, -1>
+    typename StrideShape = cutlass::MatrixShape<-1, -1>,   
+    /// Dilation ( MatrixShape<Height, Width> ); defaults to MatrixShape<-1, -1>
+    typename DilationShape =  cutlass::MatrixShape<-1, -1>,
+    /// Activation Shape loaded by threadblock; defaults to TensorNHWCShape<-1,-1,-1,-1>
+    typename ActivationShape = cutlass::conv::TensorNHWCShape<-1,-1,-1,-1>,
+    /// Store the accumulators in row major or column major.  Row major is used
+    /// when output layout is interleaved.
+    bool AccumulatorsInRowMajor = false,
+    /// Cache operation of operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA =
+        cutlass::arch::CacheOperation::Global,
+    /// Cache operation of operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB =
+        cutlass::arch::CacheOperation::Global,
+    /// per-element transformation for elements of A
+    ComplexTransform TransformA = ComplexTransform::kNone,
+    /// per-element transformation for elements of B
+    ComplexTransform TransformB = ComplexTransform::kNone,
+    bool IsComplex = false // (is_complex<ElementA>::value || is_complex<ElementB>::value)
+>
+struct DepthwiseDirectConvMmaCoreWithLaneAccessSize;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Partial specialization: lane access sizes of -1 for both A and B inherit gemm::threadblock::DefaultMmaCore unchanged.
+template <
+    /// Shape of threadblock-scoped matrix multiply operator
+    typename Shape,
+    /// Shape of warp-level matrix multiply operator
+    typename WarpShape,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape,
+    /// Element data type of A operand
+    typename ElementA,
+    /// Layout of operand A
+    typename LayoutA,
+    /// Element data type of B operand
+    typename ElementB,
+    /// Layout of operand B
+    typename LayoutB,
+    /// Data type of accumulator
+    typename ElementC,
+    /// Layout of accumulator
+    typename LayoutC,
+    /// Indicates type of math operator (arch::OpClassSimt or arch::OpClassTensorOp)
+    typename OperatorClass,
+    /// Number of stages
+    int Stages,
+    /// Operation performed by MMA
+    typename Operator,
+    /// Store the accumulators in row major or column major.  Row major is used
+    /// when output layout is interleaved.
+    bool AccumulatorsInRowMajor,
+    /// Cache operation of operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation of operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB,
+    /// per-element transformation for elements of A
+    ComplexTransform TransformA,
+    /// per-element transformation for elements of B
+    ComplexTransform TransformB,
+    bool IsComplex
+>
+struct DepthwiseMmaCoreWithLaneAccessSize<
+    Shape, WarpShape, InstructionShape,
+    ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
+    OperatorClass, -1, -1, Stages, Operator, AccumulatorsInRowMajor,
+    CacheOpA, CacheOpB, TransformA, TransformB, IsComplex
+> : cutlass::gemm::threadblock::DefaultMmaCore<
+    Shape, WarpShape, InstructionShape,
+    ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
+    OperatorClass, Stages, Operator, AccumulatorsInRowMajor,
+    CacheOpA, CacheOpB, TransformA, TransformB, IsComplex
+> {};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization:
+///
+///   A: row-major
+///   B: column-major
+///   Operator: simt class
+///   Stages: 2, InstructionShape: GemmShape<1, 1, 1>
+/// Inherits gemm::threadblock::DefaultMmaCore and redefines the warp-level operator as MmaDepthwiseSimt
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Size of a warp-scoped per thread access (a value of -1 indicates the default)
+    int kLaneAccessSizeA_,
+    /// Size of a warp-scoped per thread access (a value of -1 indicates the default)
+    int kLaneAccessSizeB_,
+    /// Operation performed by GEMM
+    typename Operator_>
+struct DepthwiseMmaCoreWithLaneAccessSize<Shape_,
+                                        WarpShape_,
+                                        cutlass::gemm::GemmShape<1, 1, 1>,
+                                        ElementA_,
+                                        layout::RowMajor,
+                                        ElementB_,
+                                        layout::ColumnMajor,
+                                        ElementC_,
+                                        LayoutC_,
+                                        arch::OpClassSimt,
+                                        kLaneAccessSizeA_,
+                                        kLaneAccessSizeB_,
+                                        2,
+                                        Operator_> : public cutlass::gemm::threadblock::DefaultMmaCore<Shape_,
+                                                                           WarpShape_,
+                                                                           cutlass::gemm::GemmShape<1, 1, 1>,
+                                                                           ElementA_,
+                                                                           layout::RowMajor,
+                                                                           ElementB_,
+                                                                           layout::ColumnMajor,
+                                                                           ElementC_,
+                                                                           LayoutC_,
+                                                                           arch::OpClassSimt,
+                                                                           2,
+                                                                           Operator_> {
+  using Base = cutlass::gemm::threadblock::DefaultMmaCore<Shape_,
+                              WarpShape_,
+                              cutlass::gemm::GemmShape<1, 1, 1>,
+                              ElementA_,
+                              layout::RowMajor,
+                              ElementB_,
+                              layout::ColumnMajor,
+                              ElementC_,
+                              LayoutC_,
+                              arch::OpClassSimt,
+                              2,
+                              Operator_>;
+
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+  using ElementA = ElementA_;
+  using LayoutA = layout::RowMajor;
+  using ElementB = ElementB_;
+  using LayoutB = layout::ColumnMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using OperatorClass = arch::OpClassSimt;
+
+  static int const kLaneAccessSizeA = kLaneAccessSizeA_;
+  static int const kLaneAccessSizeB = kLaneAccessSizeB_;
+
+  // Lane access sizes must be positive (they are divided by the element bit width below)
+  static_assert( kLaneAccessSizeA > 0 && kLaneAccessSizeB > 0,
+    "Size of a warp-scoped per thread access should be larger then ZERO" );
+
+  /// Default Operator
+  using Operator = Operator_;
+
+  /// Number of warps present (taken from the DefaultMmaCore base)
+  using WarpCount = typename Base::WarpCount;
+
+  // Divisibility requirements
+  static_assert(
+    !(Shape::kM % WarpShape::kM) &&
+    !(Shape::kN % WarpShape::kN),
+    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
+  );
+
+  /// Number of threads per warp
+  static int const kWarpSize = cutlass::gemm::warp::WarpSize<arch::OpClassSimt>::value;
+
+  static int const kElementsPerAccess = 1;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::ColumnMajor;
+  using SmemLayoutB = layout::RowMajor;
+
+  //
+  // Iterators to write to shared memory are same as base class
+  //
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level op: thread arrangement within the warp and the per-thread tile
+  static const int WarpNumThreadsM = cutlass::gemm::threadblock::detail::simt_get_warp_threads_m<WarpShape>(); 
+  static const int WarpNumThreadsN = kWarpSize / WarpNumThreadsM;
+  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
+  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
+  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
+      "WarpShape must be divisible by ThreadTile shape.");
+  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
+  static const int numElementsA = kLaneAccessSizeA / sizeof_bits<ElementA>::value;
+  static const int numElementsB = kLaneAccessSizeB / sizeof_bits<ElementB>::value;
+  static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM);
+  static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN);
+
+  static int const kPaddingM = cutlass::gemm::threadblock::detail::simt_transpose_padding(kWarpSize, Shape::kK, sizeof_bits<ElementA>::value);
+  static int const kPaddingN = cutlass::gemm::threadblock::detail::simt_transpose_padding(kWarpSize, Shape::kK, sizeof_bits<ElementB>::value);
+
+  static_assert(!(kPaddingM % LaneM) && !(kPaddingN % LaneN),
+                "Padding must be divisible by Lane");
+
+  // Per-lane MMA shape; LaneM and LaneN are already clamped to the thread tile by const_min above
+  using LaneMmaShape = cutlass::gemm::GemmShape<
+      LaneM,
+      LaneN,
+      1>;
+  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
+      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
+      cutlass::layout::RowMajorInterleaved<LaneLayout>,         // LaneLayout
+      LaneMmaShape
+  >;
+
+  using MmaWarpSimt = cutlass::conv::warp::MmaDepthwiseSimt<
+      WarpShape,      /// Size of the Gemm problem - concept: gemm::GemmShape<>
+      ElementA,       /// Data type of A elements
+      SmemLayoutA,    /// Layout of A matrix (concept: MatrixLayout)
+      ElementB,       /// Data type of B elements
+      SmemLayoutB,    /// Layout of B matrix (concept: MatrixLayout)
+      ElementC,       /// Element type of C matrix
+      LayoutC,        /// Layout of C matrix (concept: MatrixLayout)
+      Policy          /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy)
+  >;
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy = cutlass::gemm::threadblock::MmaPolicy<
+    MmaWarpSimt,
+    MatrixShape<kPaddingM, 0>,    // skew for A matrix to avoid SMEM bank conflicts
+    MatrixShape<0, kPaddingN>,    // skew for B matrix to avoid SMEM bank conflicts
+    WarpCount::kK
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization:
+///
+///   A: row-major
+///   B: column-major (shared-memory layout SmemLayoutB is row-major)
+///   Operator: simt class
+///   Lane access size B: 128; remaining parameters match the primary template's defaults
+/// Defines the shared-memory iterators and the MmaDepthwiseDirectConvSimt warp-level operator
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of threadblock-scoped output tile (concept: TensorNHWCShape)
+    typename ThreadBlockOutputShape_,
+    /// Shape of filter shape per threadblock
+    typename FilterShape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Size of a warp-scoped per thread access
+    int kLaneAccessSizeA_,
+    /// Number of stages
+    int Stages_,
+    /// Operation performed by GEMM
+    typename Operator_>
+struct DepthwiseDirectConvMmaCoreWithLaneAccessSize<Shape_,
+                                                    ThreadBlockOutputShape_,
+                                                    FilterShape_,
+                                                    WarpShape_,
+                                                    cutlass::gemm::GemmShape<1, 1, 1>,
+                                                    ElementA_,
+                                                    layout::RowMajor,
+                                                    ElementB_,
+                                                    layout::ColumnMajor,
+                                                    ElementC_,
+                                                    LayoutC_,
+                                                    arch::OpClassSimt,
+                                                    kLaneAccessSizeA_,
+                                                    128,
+                                                    Stages_,
+                                                    Operator_> {
+  using Shape = Shape_;
+  using FilterShape = FilterShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+  using ElementA = ElementA_;
+  using LayoutA = layout::RowMajor;
+  using ElementB = ElementB_;
+  using LayoutB = layout::ColumnMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using OperatorClass = arch::OpClassSimt;
+
+  static int const kLaneAccessSizeB = 128;
+
+  // Lane access size must be positive
+  static_assert( kLaneAccessSizeB > 0,
+    "Size of a warp-scoped per thread access should be larger then ZERO" );
+
+  /// Default Operator
+  using Operator = Operator_;
+
+  /// Number of warps present (the K extent is fixed to 1)
+  using WarpCount = cutlass::gemm::GemmShape<
+    Shape::kM / WarpShape::kM,
+    Shape::kN / WarpShape::kN,
+    1
+  >;
+
+  // Divisibility requirements
+  static_assert(
+    !(Shape::kM % WarpShape::kM) &&
+    !(Shape::kN % WarpShape::kN),
+    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
+  );
+
+  /// Number of threads per warp
+  static int const kWarpSize = cutlass::gemm::warp::WarpSize<arch::OpClassSimt>::value;
+
+  /// Number of threads total
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+  
+  // For Gmem load: number of elements in one 128-bit access
+  static int const kElementsPerAccessA = 128 / sizeof_bits<ElementA>::value;
+  static int const kElementsPerAccessB = 128 / sizeof_bits<ElementB>::value;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::RowMajor;
+  using SmemLayoutB = layout::RowMajor;
+
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kN, 1>, // Set kStrided = 1 because activation shape is runtime value.
+    kThreads,
+    kElementsPerAccessA
+  >;
+
+  /// Shared-memory ThreadMap of A (alias of IteratorThreadMapA)
+  using SmemThreadMapA = IteratorThreadMapA;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileAccessIteratorDirectConv<
+    MatrixShape<1, Shape::kN>, // kRow is set to 1 because the row count is a runtime value
+    ElementA, 
+    SmemLayoutA,
+    0,
+    SmemThreadMapA, // was IteratorThreadMapA
+    true  // Dynamic iterations.
+  >;
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kN, FilterShape::kCount>,
+    kThreads,
+    kElementsPerAccessB
+  >;
+
+  /// Shared-memory ThreadMap of B (alias of IteratorThreadMapB; no transpose is applied)
+  using SmemThreadMapB = IteratorThreadMapB;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileAccessIteratorDirectConv<
+    MatrixShape<FilterShape::kCount, Shape::kN>,
+    ElementB, 
+    SmemLayoutB,
+    0,
+    SmemThreadMapB, // was IteratorThreadMapB
+    false   // static iterations.
+  >;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+  // Groups per thread
+  // sizeof(ElementB) > 1 (e.g. Fp32, Fp16): 2 groups
+  // otherwise: 4 groups
+  static const int GroupsPerThread = sizeof(ElementB) > 1 ? 2 : 4;
+  // Define the warp-level op
+  static const int WarpNumThreadsN = cutlass::const_min(WarpShape::kN / GroupsPerThread, kWarpSize);
+  static const int WarpNumThreadsM = kWarpSize / WarpNumThreadsN; 
+
+  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
+      "WarpShape must be divisible by ThreadTile shape.");
+
+  // Get output P, Q per thread
+  static const int TileP = cutlass::conv::threadblock::detail::SimtWarpShape<WarpShape::kM, WarpNumThreadsM>::kP;
+  static const int TileQ = cutlass::conv::threadblock::detail::SimtWarpShape<WarpShape::kM, WarpNumThreadsM>::kQ;
+
+  static const int LaneLayout = 1;
+  static const int numElementsB = kLaneAccessSizeB / sizeof_bits<ElementB>::value;
+  static const int LaneN = cutlass::const_min(numElementsB, WarpShape::kN / WarpNumThreadsN);
+  
+  // Define the output tile computed by each thread
+  using ThreadOutputShape = cutlass::conv::TensorNHWCShape<1, TileP, TileQ, LaneN>;
+
+  // Fetch the channel with same access size
+  static const int LaneM = LaneN;
+
+  // No paddings
+  static int const kPaddingM = 0;
+  static int const kPaddingN = 0;
+
+  static_assert(!(kPaddingM % LaneM) && !(kPaddingN % LaneN),
+                "Padding must be divisible by Lane");
+
+  // Per-lane MMA shape (LaneM equals LaneN in this specialization)
+  using LaneMmaShape = cutlass::gemm::GemmShape<
+      LaneM,
+      LaneN,
+      1>;
+  
+  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
+      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
+      cutlass::layout::RowMajorInterleaved<LaneLayout>,         // LaneLayout
+      LaneMmaShape
+  >;
+
+  using MmaWarpSimt = cutlass::conv::warp::MmaDepthwiseDirectConvSimt<
+      WarpShape,      /// Size of the Gemm problem - concept: gemm::GemmShape<>
+      FilterShape,    /// Shape of filter shape per threadblock - concept: gemm::GemmShape<Depth, Height, Width>
+      ThreadOutputShape, /// Size of the output tile computed by thread - concept: conv::TensorNHWCShape<>
+      ThreadBlockOutputShape_, /// Size of the output tile computed by threadblock - concept: conv::TensorNHWCShape<>
+      ElementA,       /// Data type of A elements
+      SmemLayoutA,    /// Layout of A matrix (concept: MatrixLayout)
+      ElementB,       /// Data type of B elements
+      SmemLayoutB,    /// Layout of B matrix (concept: MatrixLayout)
+      ElementC,       /// Element type of C matrix
+      LayoutC,        /// Layout of C matrix (concept: MatrixLayout)
+      Policy          /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy)
+  >;
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy = cutlass::conv::threadblock::DepthwiseDirectConvMmaPolicy<
+    MmaWarpSimt,
+    MatrixShape<kPaddingM, 0>,    // skew for A matrix to avoid SMEM bank conflicts
+    MatrixShape<0, kPaddingN>,    // skew for B matrix to avoid SMEM bank conflicts
+    IteratorThreadMapA,
+    IteratorThreadMapB,
+    WarpCount::kK
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization:
+///
+///   A: row-major
+///   B: column-major (shared-memory layout SmemLayoutB is row-major)
+///   Operator: simt class
+///   IteratorAlgorithm: kFixedStrideDilation
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of threadblock-scoped output tile (concept: TensorNHWCShape)
+    typename ThreadBlockOutputShape_,
+    /// Shape of filter shape per threadblock
+    typename FilterShape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Size of a warp-scoped per thread access
+    int kLaneAccessSizeA_,
+    /// Number of stages
+    int Stages_,
+    /// Operation performed by GEMM
+    typename Operator_,
+    /// Stride ( MatrixShape<Height, Width> )
+    typename StrideShape_,   
+    /// Dilation ( MatrixShape<Height, Width> )
+    typename DilationShape_,
+    /// Activation Shape loaded by threadblock
+    typename ActivationShape_>
+struct DepthwiseDirectConvMmaCoreWithLaneAccessSize<Shape_,
+                                                    ThreadBlockOutputShape_,
+                                                    FilterShape_,
+                                                    WarpShape_,
+                                                    cutlass::gemm::GemmShape<1, 1, 1>,
+                                                    ElementA_,
+                                                    layout::RowMajor,
+                                                    ElementB_,
+                                                    layout::ColumnMajor,
+                                                    ElementC_,
+                                                    LayoutC_,
+                                                    arch::OpClassSimt,
+                                                    kLaneAccessSizeA_,
+                                                    128,
+                                                    Stages_,
+                                                    Operator_,
+                                                    IteratorAlgorithm::kFixedStrideDilation,
+                                                    StrideShape_,
+                                                    DilationShape_,
+                                                    ActivationShape_> {
+  using Shape = Shape_;
+  using FilterShape = FilterShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+  using ElementA = ElementA_;
+  using LayoutA = layout::RowMajor;
+  using ElementB = ElementB_;
+  using LayoutB = layout::ColumnMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using OperatorClass = arch::OpClassSimt;
+  using StrideShape = StrideShape_;
+  using DilationShape = DilationShape_; 
+  using ThreadBlockOutputShape = ThreadBlockOutputShape_;
+  using ActivationShape = ActivationShape_;
+
+  static int const kLaneAccessSizeB = 128;
+
+  // Divisility requirements
+  static_assert( kLaneAccessSizeB > 0,
+    "Size of a warp-scoped per thread access should be larger then ZERO" );
+
+  /// Default Operator
+  using Operator = Operator_;
+
+  /// Number of warps present
+  using WarpCount = cutlass::gemm::GemmShape<
+    Shape::kM / WarpShape::kM,
+    Shape::kN / WarpShape::kN,
+    1
+  >;
+
+  // Divisibility requirements
+  static_assert(
+    !(Shape::kM % WarpShape::kM) &&
+    !(Shape::kN % WarpShape::kN),
+    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
+  );
+
+  /// Number of threads per warp
+  static int const kWarpSize = cutlass::gemm::warp::WarpSize<arch::OpClassSimt>::value;
+
+  /// Number of threads total
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+  
+  // For Gmem load
+  static int const kElementsPerAccessA = 128 / sizeof_bits<ElementA>::value;
+  static int const kElementsPerAccessB = 128 / sizeof_bits<ElementB>::value;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::RowMajor;
+  using SmemLayoutB = layout::RowMajor;
+
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<ActivationShape::kC, ActivationShape::kNHW>,
+    kThreads,
+    kElementsPerAccessA
+  >;
+
+  /// ThreadMap of iterator A
+  using SmemThreadMapA = IteratorThreadMapA;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileAccessIteratorDirectConv<
+    MatrixShape<ActivationShape::kNHW, ActivationShape::kC>,
+    ElementA,
+    SmemLayoutA,
+    0,
+    SmemThreadMapA, // was IteratorThreadMapA
+    false  // static iterations.
+  >;
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kN, FilterShape::kCount>,
+    kThreads,
+    kElementsPerAccessB
+  >;
+
+  /// Transpose the ThreadMap of iterator B
+  using SmemThreadMapB = IteratorThreadMapB;
+
+  /// Shared memory iterator to B operand                                                  
+  using SmemIteratorB = transform::threadblock::RegularTileAccessIteratorDirectConv<
+    MatrixShape<FilterShape::kCount, Shape::kN>,
+    ElementB, 
+    SmemLayoutB,
+    0,
+    SmemThreadMapB, // was IteratorThreadMapB
+    false   // static iterations.
+  >;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+  // Groups per threads
+  // Fp32: 2 groups
+  // Fp16: 2 groups
+  static const int GroupsPerThread = sizeof(ElementB) > 1 ? 2 : 4;
+  // Define the warp-level op  
+  static const int WarpNumThreadsN = cutlass::const_min(WarpShape::kN / GroupsPerThread, kWarpSize);
+  static const int WarpNumThreadsM = kWarpSize / WarpNumThreadsN; 
+
+  static const int TileP = cutlass::conv::threadblock::detail::SimtWarpShape<WarpShape::kM, WarpNumThreadsM>::kP;
+  static const int TileQ = cutlass::conv::threadblock::detail::SimtWarpShape<WarpShape::kM, WarpNumThreadsM>::kQ;
+
+  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
+      "WarpShape must be divisible by ThreadTile shape.");
+
+  static const int LaneLayout = 1;
+  static const int numElementsB = kLaneAccessSizeB / sizeof_bits<ElementB>::value;
+  static const int LaneN = cutlass::const_min(numElementsB, WarpShape::kN / WarpNumThreadsN);
+  
+  // Define the output tile computed by each thread
+  using ThreadOutputShape = cutlass::conv::TensorNHWCShape<1, TileP, TileQ, LaneN>;
+
+  // Fetch the channel with same access size
+  static const int LaneM = LaneN;
+
+  // No paddings
+  static int const kPaddingM = 0;
+  static int const kPaddingN = 0;
+
+  static_assert(!(kPaddingM % LaneM) && !(kPaddingN % LaneN),
+                "Padding must be divisible by Lane");
+
+  // these should have max of thread tile also
+  using LaneMmaShape = cutlass::gemm::GemmShape<
+      LaneM,
+      LaneN,
+      1>;
+  
+  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
+      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
+      cutlass::layout::RowMajorInterleaved<LaneLayout>,         // LaneLayout
+      LaneMmaShape
+  >;
+
+  using MmaWarpSimt = cutlass::conv::warp::MmaDepthwiseDirectConvSimt<
+      WarpShape,      /// Size of the Gemm problem - concept: gemm::GemmShape<>
+      FilterShape,    /// Shape of filter shape per threadblock - concept: gemm::GemmShape<Depth, Height, Width>
+      ThreadOutputShape, /// Size of the output tile computed by thread - concept: conv::TensorNHWCShape<>
+      ThreadBlockOutputShape, /// Size of the output tile computed by threadblock - concept: conv::TensorNHWCShape<>
+      ElementA,       /// Data type of A elements
+      SmemLayoutA,    /// Layout of A matrix (concept: MatrixLayout)
+      ElementB,       /// Data type of B elements
+      SmemLayoutB,    /// Layout of B matrix (concept: MatrixLayout)
+      ElementC,       /// Element type of C matrix
+      LayoutC,        /// Layout of C matrix (concept: MatrixLayout)
+      Policy,          /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy)
+      IteratorAlgorithm::kFixedStrideDilation, /// Iterator algo type
+      StrideShape,   /// Stride ( MatrixShape<Height, Width> )
+      DilationShape,  /// Dilation ( MatrixShape<Height, Width> )
+      ActivationShape /// Activation Shape loaded by threadblock
+  >;
+
+  /// Policy used to define MmaPipelined 
+  using MmaPolicy = cutlass::conv::threadblock::DepthwiseDirectConvMmaPolicy<
+    MmaWarpSimt,
+    MatrixShape<kPaddingM, 0>,    // skew for A matrix to avoid SMEM bank conflicts
+    MatrixShape<0, kPaddingN>,    // skew for B matrix to avoid SMEM bank conflicts
+    IteratorThreadMapA,
+    IteratorThreadMapB,
+    WarpCount::kK
+  >;
+};
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/implicit_gemm_fprop_fusion_multistage.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/implicit_gemm_fprop_fusion_multistage.h
new file mode 100755
index 000000000..3bee07d0a
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/implicit_gemm_fprop_fusion_multistage.h
@@ -0,0 +1,802 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a multistage threadblock-scoped fused activation's 
+   scale+bias+relu and Implicit GEMM Convolution kernel.
+
+   The original implicit gemm stores out-of-bound data as zeroes in shared
+   memory: zeroes fed into the tensor cores produce zeroes out of the tensor
+   cores, so the result is unchanged.  When fusing scale+bias+relu into the
+   mainloop, this is no longer true because
+
+     0 x scale + bias = bias
+
+   which is no longer always 0.  So, instead of storing zeroes, this fused
+   kernel stores the out-of-bound data as a special NaN (0x7eff).  When applying
+   scale+bias+relu, the code is like
+
+     if (data == 0x7eff)
+       data = 0;
+     else
+       data = scale+bias+relu(data, scale, bias);
+
+  See include/cutlass/conv/warp/scale_bias_relu_transform.h for the
+  elementwise computation.  See include/cutlass/arch/memory_sm80.h for nan fill.
+*/
+
+#pragma once
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/cache_operation.h"
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/gemm/warp/scale_bias_tile_iterator.h"
+#include "cutlass/conv/warp/scale_bias_relu_transform.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/// Base class of the threadblock-scoped fused scale+bias+relu implicit GEMM:
+/// declares the shared storage and the warp-level iterators for A, scale/bias and B.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    /// Element type of scale and bias vectors
+    typename ElementScaleBias_,
+    /// Layout of scale and bias vectors
+    typename LayoutScaleBias_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// WarpIterator to load Scale or Bias vector from the shared memory
+    typename WarpIteratorScaleBias_,
+    /// Number of stages,
+    int Stages,
+    /// Used for partial specialization
+    typename Enable = bool>
+class MmaFpropFusionBase {
+ public:
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+
+  ///< Element type of scale and bias vectors
+  using ElementScaleBias = ElementScaleBias_;
+
+  /// Layout of scale and bias vectors
+  using LayoutScaleBias = LayoutScaleBias_;
+
+  ///< Policy describing tuning details
+  using Policy = Policy_;
+
+  ///< WarpIterator to load Scale or Bias vector from the shared memory
+  using WarpIteratorScaleBias = WarpIteratorScaleBias_;
+
+  //
+  // Dependent types
+  //
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+
+  /// Shape describing the overall GEMM computed from shared memory
+  /// by each warp.
+  using WarpGemm = typename Policy::Operator::Shape;
+
+  /// Shape describing the number of warps filling the CTA
+  using WarpCount = cutlass::gemm::GemmShape<Shape::kM / WarpGemm::kM,
+                                             Shape::kN / WarpGemm::kN,
+                                             Shape::kK / WarpGemm::kK>;
+
+  /// Number of warp-level GEMM operations
+  static int const kWarpGemmIterations =
+      (WarpGemm::kK / Operator::Policy::MmaShape::kK);
+
+  /// Number of stages
+  static int const kStages = Stages;
+
+  /// Tensor reference to the A operand
+  using TensorRefA = TensorRef<typename Operator::ElementA, typename Operator::LayoutA>;
+
+  /// Tensor reference to the scale and bias vectors
+  using TensorRefScaleBias = TensorRef<ElementScaleBias, LayoutScaleBias>;
+
+  /// Tensor reference to the B operand
+  using TensorRefB = TensorRef<typename Operator::ElementB, typename Operator::LayoutB>;
+
+  static_assert(kWarpGemmIterations > 1,
+                "The pipelined structure requires at least two warp-level "
+                "GEMM operations.");
+
+  static_assert((kWarpGemmIterations % 2) == 0,
+                "Inner loop iteration must be an even number.");
+
+  //
+  // Nested structs
+  //
+
+  /// Shared storage object needed by threadblock-scoped GEMM
+  class SharedStorage {
+   public:
+    //
+    // Type definitions
+    //
+
+    /// Shape of the A matrix operand in shared memory (kStages tiles along K)
+    using ShapeA = MatrixShape<Shape::kM + Policy::SmemPaddingA::kRow,
+                               Shape::kK * kStages +
+                                   Policy::SmemPaddingA::kColumn>;
+
+    /// Shape of the A scale and bias vectors in shared memory (2 * Shape::kK columns per stage)
+    using ShapeScaleBias =
+        MatrixShape<1 + Policy::SmemPaddingA::kRow,
+                    2 * Shape::kK * kStages + Policy::SmemPaddingA::kColumn>;
+
+    /// Shape of the B matrix operand in shared memory (kStages tiles along K)
+    using ShapeB =
+        MatrixShape<Shape::kK * kStages + Policy::SmemPaddingB::kRow,
+                    Shape::kN + Policy::SmemPaddingB::kColumn>;
+
+   public:
+    //
+    // Data members
+    //
+
+    /// Buffer for A operand
+    AlignedBuffer<typename Operator::ElementA, ShapeA::kCount> operand_A;
+
+    /// Buffer for B operand
+    AlignedBuffer<typename Operator::ElementB, ShapeB::kCount> operand_B;
+
+    /// Buffer for A operand Scale and Bias
+    AlignedBuffer<ElementScaleBias, ShapeScaleBias::kCount> operand_A_scale_bias;
+
+   public:
+
+    //
+    // Methods
+    //
+
+    /// Returns a packed layout object for the A matrix (host/device, like LayoutB, since operand_A_ref() calls it)
+    CUTLASS_HOST_DEVICE
+    static typename Operator::LayoutA LayoutA() {
+      return Operator::LayoutA::packed({ShapeA::kRow, ShapeA::kColumn});
+    }
+
+    /// Returns a packed layout object for the B matrix
+    CUTLASS_HOST_DEVICE
+    static typename Operator::LayoutB LayoutB() {
+      return Operator::LayoutB::packed({ShapeB::kRow, ShapeB::kColumn});
+    }
+
+    /// Returns a packed layout object for the A scale and bias vectors (host/device, since operand_A_scale_bias_ref() calls it)
+    CUTLASS_HOST_DEVICE
+    static LayoutScaleBias LayoutScaleBias() {
+      return LayoutScaleBias::packed(
+          {ShapeScaleBias::kRow, ShapeScaleBias::kColumn});
+    }
+
+    /// Returns a TensorRef to the A operand
+    CUTLASS_HOST_DEVICE
+    TensorRefA operand_A_ref() {
+      return TensorRefA{operand_A.data(), LayoutA()};
+    }
+
+    /// Returns a TensorRef to the B operand
+    CUTLASS_HOST_DEVICE
+    TensorRefB operand_B_ref() {
+      return TensorRefB{operand_B.data(), LayoutB()};
+    }
+
+    /// Returns a TensorRef to the A operand scale and bias vectors
+    CUTLASS_HOST_DEVICE
+    TensorRefScaleBias operand_A_scale_bias_ref() {
+      return TensorRefScaleBias{operand_A_scale_bias.data(), LayoutScaleBias()};
+    }
+  };
+
+ protected:
+
+  //
+  // Data members
+  //
+
+  /// Iterator to load a warp-scoped tile of A operand from shared memory
+  typename Operator::IteratorA warp_tile_iterator_A_;
+
+  /// Iterator to load a warp-scoped tile of A operand scale and bias vector
+  /// from shared memory
+  WarpIteratorScaleBias warp_tile_iterator_A_scale_bias_;
+
+  /// Iterator to load a warp-scoped tile of B operand from shared memory
+  typename Operator::IteratorB warp_tile_iterator_B_;
+
+public:
+
+  /// Construct the three warp-level iterators over the shared storage
+  CUTLASS_DEVICE
+  MmaFpropFusionBase(
+      ///< Shared storage needed for internal use by threadblock-scoped GEMM
+      SharedStorage &shared_storage,
+      ///< ID within the threadblock (not used by this base class)
+      int thread_idx,
+      ///< ID of warp (not used by this base class)
+      int warp_idx,
+      ///< ID of each thread within a warp
+      int lane_idx)
+      : warp_tile_iterator_A_(shared_storage.operand_A_ref(), lane_idx),
+        warp_tile_iterator_A_scale_bias_(
+            shared_storage.operand_A_scale_bias_ref(), lane_idx),
+        warp_tile_iterator_B_(shared_storage.operand_B_ref(), lane_idx) {}
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Multistage threadblock-scoped implicit GEMM fprop that fuses the
+/// scale+bias+relu of operand A into the mainloop.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    /// Iterates over tiles of A operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorA_,
+    /// Iterates over tiles of A operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorA_,
+    /// Cache operation for operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB_,
+    /// Cache operation for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB,
+    /// Iterates over vectors of scale and bias vector in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorScaleBias_,
+    /// Iterates over vectors of scale and bias vector in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorScaleBias_,
+    /// Cache operation for scale/bias operand 
+    cutlass::arch::CacheOperation::Kind CacheOpScaleBias,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// WarpIterator to load Scale or Bias vector from the shared memory
+    typename WarpIteratorScaleBias_,
+    /// Number of stages,
+    int Stages,
+    /// Used for partial specialization
+    typename Enable = bool>
+class ImplicitGemmFpropFusionMultistage
+    : public MmaFpropFusionBase<Shape_, typename IteratorScaleBias_::Element,
+                       typename IteratorScaleBias_::Layout, Policy_,
+                       WarpIteratorScaleBias_, Stages> {
+ public:
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+  ///< Iterates over tiles of A operand in global memory
+  using IteratorA = IteratorA_;
+  ///< Iterates over tiles of B operand in global memory
+  using IteratorB = IteratorB_;
+  ///< Iterates over tiles of the scale and bias vectors in global memory
+  using IteratorScaleBias = IteratorScaleBias_;
+  ///< WarpIterator to load Scale or Bias vector from the shared memory
+  using WarpIteratorScaleBias = WarpIteratorScaleBias_;
+  ///< Policy describing tuning details
+  using Policy = Policy_;
+  ///< Base class
+  using Base = MmaFpropFusionBase<Shape_, typename IteratorScaleBias::Element,
+                         typename IteratorScaleBias::Layout, Policy,
+                         WarpIteratorScaleBias, Stages>;
+
+  using SmemIteratorA = SmemIteratorA_;
+  using SmemIteratorB = SmemIteratorB_;
+  using SmemIteratorScaleBias = SmemIteratorScaleBias_;
+
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpScaleBias =
+      CacheOpScaleBias;
+
+  //
+  // Dependent types
+  //
+
+  /// Fragment of accumulator tile
+
+  using ElementC = typename Policy::Operator::ElementC;
+  using FragmentC = typename Policy::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+  
+  /// Compile-time copy-scheduling constants, exposed for introspection.
+  struct Detail {
+    // The mainloop overlaps copies with math, so it needs at least two warp-level steps
+    static_assert(Base::kWarpGemmIterations > 1,
+                  "The pipelined structure requires at least two warp-level "
+                  "GEMM operations.");
+
+    /// Number of cp.async instructions to load one stage of operand A
+    static int const AsyncCopyIterationsPerStageA =
+        IteratorA::ThreadMap::Iterations::kCount;
+
+    /// Number of cp.async instructions to load one stage of operand B
+    static int const AsyncCopyIterationsPerStageB =
+        IteratorB::ThreadMap::Iterations::kCount;
+
+    /// Number of stages
+    static int const kStages = Stages;
+
+    /// cp.async instructions per group of operand A: per-stage count / kWarpGemmIterations, rounded up
+    static int const kAccessesPerGroupA =
+        (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
+
+    /// cp.async instructions per group of operand B: per-stage count / kWarpGemmIterations, rounded up
+    static int const kAccessesPerGroupB =
+        (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
+  };
+
+ private:
+
+  using WarpLoadedFragmentA = typename Operator::FragmentA;
+  using WarpLoadedFragmentB = typename Operator::FragmentB;
+  using WarpLoadedFragmentScaleBias =
+      typename WarpIteratorScaleBias::Fragment;
+
+  using WarpTransformedFragmentA = typename Operator::TransformedFragmentA;
+  using WarpTransformedFragmentB = typename Operator::TransformedFragmentB;
+
+ private:
+
+  //
+  // Data members
+  //
+
+  /// Iterator to write threadblock-scoped tile of A operand to shared memory
+  SmemIteratorA smem_iterator_A_;
+
+  /// Iterator to write threadblock-scoped tile of A operand scale vector to shared memory
+  SmemIteratorScaleBias smem_iterator_A_scale_bias_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB smem_iterator_B_;
+  
+public:
+
+  /// Builds the shared-memory write iterators from the shared storage and
+  /// moves the inherited warp-level tile iterators to this warp's tiles.
+  CUTLASS_DEVICE
+  ImplicitGemmFpropFusionMultistage(
+      ///< Shared storage needed for internal use by threadblock-scoped GEMM
+      typename Base::SharedStorage &shared_storage,
+      ///< ID within the threadblock
+      int thread_idx,
+      ///< ID of warp
+      int warp_idx,
+      ///< ID of each thread within a warp
+      int lane_idx)
+      : Base(shared_storage, thread_idx, warp_idx, lane_idx),
+        smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
+        smem_iterator_A_scale_bias_(shared_storage.operand_A_scale_bias_ref(),
+                                    thread_idx),
+        smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) {
+    // Split the linear warp index into its (m, n, k) position within the
+    // threadblock tile: m varies fastest, then n, then k.
+    int const kWarpsPerKSlice = Base::WarpCount::kM * Base::WarpCount::kN;
+
+    int const warp_k = warp_idx / kWarpsPerKSlice;
+    int const warp_mn = warp_idx % kWarpsPerKSlice;
+    int const warp_m = warp_mn % Base::WarpCount::kM;
+    int const warp_n = warp_mn / Base::WarpCount::kM;
+
+    // Offset along K, in units of warp-level tiles, shared by all three iterators
+    int const k_tile_offset = Base::kWarpGemmIterations * warp_k;
+
+    // Move each warp-level iterator to this warp's first tile
+    this->warp_tile_iterator_A_.add_tile_offset({warp_m, k_tile_offset});
+    this->warp_tile_iterator_A_scale_bias_.add_tile_offset(
+        {warp_m, k_tile_offset});
+    this->warp_tile_iterator_B_.add_tile_offset(
+        {k_tile_offset, warp_n});
+  }
+
+  /// Issues one group of global->shared cp.async copies for operand A, the A
+  /// scale/bias vectors and operand B, advancing the A and B iterators as it goes.
+  CUTLASS_DEVICE
+  void copy_tiles_and_advance(IteratorA &iterator_A,
+                              IteratorScaleBias &iterator_A_scale_bias,
+                              IteratorB &iterator_B, int group_start_A = 0,
+                              int group_start_B = 0) {
+    using AccessTypeA = typename IteratorA::AccessType;
+    using AccessTypeB = typename IteratorB::AccessType;
+    using AccessTypeScaleBias = typename IteratorScaleBias::AccessType;
+
+    // Bytes transferred by a single cp.async of each operand
+    int const kBytesPerAccessA =
+        sizeof_bits<typename IteratorA::Element>::value *
+        IteratorA::ThreadMap::kElementsPerAccess / 8;
+    int const kBytesPerAccessB =
+        sizeof_bits<typename IteratorB::Element>::value *
+        IteratorB::ThreadMap::kElementsPerAccess / 8;
+    int const kBytesPerAccessScaleBias =
+        sizeof_bits<typename IteratorScaleBias::Element>::value *
+        IteratorScaleBias::kElementsPerAccess / 8;
+
+    // Operand A: out-of-bound accesses are NaN-filled
+    iterator_A.set_iteration_index(group_start_A);
+    this->smem_iterator_A_.set_iteration_index(group_start_A);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int access = 0; access < Detail::kAccessesPerGroupA; ++access) {
+      // Accesses past the end of the stage are not issued
+      bool const in_stage_A =
+          (group_start_A + access < Detail::AsyncCopyIterationsPerStageA);
+      if (in_stage_A) {
+        AccessTypeA *smem_dst =
+            reinterpret_cast<AccessTypeA *>(this->smem_iterator_A_.get());
+        cutlass::arch::cp_async_nan<kBytesPerAccessA, kCacheOpA>(
+            smem_dst, iterator_A.get(), iterator_A.valid());
+        ++iterator_A;
+        ++this->smem_iterator_A_;
+      }
+    }
+
+    // Scale and bias vectors of operand A: copied only when group_start_A == 0
+    if (group_start_A == 0) {
+      AccessTypeScaleBias *smem_dst = reinterpret_cast<AccessTypeScaleBias *>(
+          this->smem_iterator_A_scale_bias_.get());
+      cutlass::arch::cp_async<kBytesPerAccessScaleBias, kCacheOpScaleBias>(
+          smem_dst, iterator_A_scale_bias.get(), iterator_A_scale_bias.valid());
+    }
+
+    // Operand B: out-of-bound accesses are zero-filled
+    iterator_B.set_iteration_index(group_start_B);
+    this->smem_iterator_B_.set_iteration_index(group_start_B);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int access = 0; access < Detail::kAccessesPerGroupB; ++access) {
+      // Accesses past the end of the stage are not issued
+      bool const in_stage_B =
+          (group_start_B + access < Detail::AsyncCopyIterationsPerStageB);
+      if (in_stage_B) {
+        AccessTypeB *smem_dst =
+            reinterpret_cast<AccessTypeB *>(this->smem_iterator_B_.get());
+        cutlass::arch::cp_async_zfill<kBytesPerAccessB, kCacheOpB>(
+            smem_dst, iterator_B.get(), iterator_B.valid());
+        ++iterator_B;
+        ++this->smem_iterator_B_;
+      }
+    }
+  }
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate
+  CUTLASS_DEVICE
+  void operator()(
+      ///< problem size of GEMM
+      int gemm_k_iterations,
+      ///< destination accumulator tile
+      FragmentC &accum,
+      ///< iterator over A operand in global memory
+      IteratorA iterator_A,
+      ///< iterator over B operand in global memory
+      IteratorB iterator_B,
+      ///< iterator over scale and bias vectors in global memory
+      IteratorScaleBias iterator_A_scale_bias,
+      ///< initial value of accumulator
+      FragmentC const &src_accum,
+      ///< number of iterations per channel
+      int gemm_k_iterations_per_channel = 0,  
+      ///< Imaginary strides used for planar-complex only - ignored here
+      int64_t imag_stride_A = 0,
+      int64_t imag_stride_B = 0) {
+
+    //
+    // Prologue
+    //
+
+    // Issue several complete stages
+    CUTLASS_PRAGMA_UNROLL
+    for (int stage = 0; stage < Base::kStages - 1;
+         ++stage, --gemm_k_iterations) {
+
+      iterator_A.set_iteration_index(0);
+      this->smem_iterator_A_.set_iteration_index(0);
+
+      // Async Copy for operand A
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) {
+        typename IteratorA::AccessType *dst_ptr =
+          reinterpret_cast<typename IteratorA::AccessType *>(
+            this->smem_iterator_A_.get());
+
+        int const kSrcBytes =
+            sizeof_bits<typename IteratorA::Element>::value *
+            IteratorA::ThreadMap::kElementsPerAccess / 8;
+        
+        // Uses Nan fill for out of bound data
+        cutlass::arch::cp_async_nan<kSrcBytes, kCacheOpA>(
+            dst_ptr, iterator_A.get(), iterator_A.valid());
+
+        ++iterator_A;
+        ++this->smem_iterator_A_;
+      }
+
+      // Async Copy for operand A scale and bias vectors.  Scale and bias
+      // vectors are small.  One iteration is enough.
+      {
+        typename IteratorScaleBias::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorScaleBias::AccessType *>(
+                this->smem_iterator_A_scale_bias_.get());
+
+        int const kSrcBytes =
+            sizeof_bits<typename IteratorScaleBias::Element>::value *
+            IteratorScaleBias::kElementsPerAccess / 8;
+
+        cutlass::arch::cp_async<kSrcBytes, kCacheOpScaleBias>(
+            dst_ptr, iterator_A_scale_bias.get(), iterator_A_scale_bias.valid());
+      }
+
+      iterator_B.set_iteration_index(0);
+      this->smem_iterator_B_.set_iteration_index(0);
+
+      // Async Copy for operand B
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) {
+        typename IteratorB::AccessType *dst_ptr =
+          reinterpret_cast<typename IteratorB::AccessType *>(
+              this->smem_iterator_B_.get());
+
+        int const kSrcBytes =
+            sizeof_bits<typename IteratorB::Element>::value *
+            IteratorB::ThreadMap::kElementsPerAccess / 8;
+
+        cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
+            dst_ptr, iterator_B.get(), iterator_B.valid());
+
+        ++iterator_B;
+        ++this->smem_iterator_B_;
+      }
+
+      // Move to the next stage
+      iterator_A.advance();
+      iterator_A_scale_bias.advance();
+      iterator_B.advance();
+
+      this->smem_iterator_A_.add_tile_offset({0, 1});
+      this->smem_iterator_A_scale_bias_.add_tile_offset({0, 1});
+      this->smem_iterator_B_.add_tile_offset({1, 0});
+
+      // Inserts a fence to group cp.async instructions into stages.
+      cutlass::arch::cp_async_fence();
+    }
+
+    // Perform accumulation in the 'd' output operand
+    accum = src_accum;
+
+    // Waits until kStages-2 stages have committed. 
+    cutlass::arch::cp_async_wait<Base::kStages - 2>();
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math
+    // instructions
+    WarpLoadedFragmentA warp_loaded_frag_A[2];
+    WarpLoadedFragmentB warp_loaded_frag_B[2];
+    WarpLoadedFragmentScaleBias warp_loaded_frag_A_scale_bias[2];
+    WarpTransformedFragmentA warp_transformed_frag_A[2];
+    WarpTransformedFragmentB warp_transformed_frag_B[2];
+
+    Operator warp_mma;
+    cutlass::conv::warp::FpropScaleBiasReluTransform<WarpTransformedFragmentA,
+                                            WarpLoadedFragmentScaleBias>
+        elementwise_transform;
+
+    this->warp_tile_iterator_A_.set_kgroup_index(0);
+    this->warp_tile_iterator_A_scale_bias_.set_kgroup_index(0);
+    this->warp_tile_iterator_B_.set_kgroup_index(0);
+
+    this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]);
+    this->warp_tile_iterator_A_scale_bias_.load(
+        warp_loaded_frag_A_scale_bias[0]);
+    this->warp_tile_iterator_B_.load(warp_loaded_frag_B[0]);
+
+    ++this->warp_tile_iterator_A_;
+    ++this->warp_tile_iterator_A_scale_bias_;
+    ++this->warp_tile_iterator_B_;
+
+    // Start issuing the first group of the next stage outside of the mainloop
+    copy_tiles_and_advance(iterator_A, iterator_A_scale_bias, iterator_B);
+
+    int smem_write_stage_idx = Base::kStages - 1;
+    int smem_read_stage_idx = 0;
+
+    warp_mma.transform(warp_transformed_frag_A[0], warp_transformed_frag_B[0],
+                       warp_loaded_frag_A[0], warp_loaded_frag_B[0]);
+
+    elementwise_transform(warp_transformed_frag_A[0],
+                         warp_loaded_frag_A_scale_bias[0]);
+
+    //
+    // Mainloop
+    //
+
+    CUTLASS_GEMM_LOOP
+    for (; gemm_k_iterations > (-Base::kStages + 1);) {
+      //
+      // Loop over GEMM K dimension
+      //
+
+      // Computes a warp-level GEMM on data held in shared memory
+      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations;
+           ++warp_mma_k) {
+
+        // Load warp-level tiles from shared memory, wrapping to k offset if
+        // this is the last group as the case may be.
+        this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
+        this->warp_tile_iterator_A_scale_bias_.set_kgroup_index(
+            (warp_mma_k + 1) % Base::kWarpGemmIterations);
+        this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
+
+        this->warp_tile_iterator_A_.load(warp_loaded_frag_A[(warp_mma_k + 1) % 2]);
+        this->warp_tile_iterator_A_scale_bias_.load(
+            warp_loaded_frag_A_scale_bias[(warp_mma_k + 1) % 2]);
+        this->warp_tile_iterator_B_.load(warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
+
+        ++this->warp_tile_iterator_A_;
+        ++this->warp_tile_iterator_A_scale_bias_;
+        ++this->warp_tile_iterator_B_;
+
+        if (warp_mma_k > 0) {
+          warp_mma.transform(warp_transformed_frag_A[warp_mma_k % 2],
+                             warp_transformed_frag_B[warp_mma_k % 2],
+                             warp_loaded_frag_A[warp_mma_k % 2],
+                             warp_loaded_frag_B[warp_mma_k % 2]);
+
+          elementwise_transform(warp_transformed_frag_A[warp_mma_k % 2],
+                               warp_loaded_frag_A_scale_bias[warp_mma_k % 2]);
+        }
+
+        warp_mma(
+                 accum, 
+                 warp_transformed_frag_A[warp_mma_k % 2],
+                 warp_transformed_frag_B[warp_mma_k % 2],
+                 accum
+                );
+
+        // Issue global->shared copies for the next stage
+        int group_start_iteration_A, group_start_iteration_B;
+
+        if (warp_mma_k + 1 == Base::kWarpGemmIterations) {
+          group_start_iteration_A = 0;
+          group_start_iteration_B = 0;
+        } else {
+          group_start_iteration_A =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupA;
+          group_start_iteration_B =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupB;
+        }
+
+        copy_tiles_and_advance(iterator_A, iterator_A_scale_bias, iterator_B,
+                               group_start_iteration_A,
+                               group_start_iteration_B);
+
+
+        if (warp_mma_k + 1 == Base::kWarpGemmIterations) {
+          warp_mma.transform(warp_transformed_frag_A[(warp_mma_k + 1) % 2],
+                             warp_transformed_frag_B[(warp_mma_k + 1) % 2],
+                             warp_loaded_frag_A[(warp_mma_k + 1) % 2],
+                             warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
+
+          elementwise_transform(
+              warp_transformed_frag_A[(warp_mma_k + 1) % 2],
+              warp_loaded_frag_A_scale_bias[(warp_mma_k + 1) % 2]);
+        }
+
+        if (warp_mma_k + 2 == Base::kWarpGemmIterations) {
+          // Inserts a fence to group cp.async instructions into stages.
+          cutlass::arch::cp_async_fence();
+
+          // Waits until kStages-2 stages of cp.async have committed
+          arch::cp_async_wait<Base::kStages - 2>();
+          __syncthreads();
+
+          // Move to the next stage
+          iterator_A.advance();
+          iterator_A_scale_bias.advance();
+          iterator_B.advance();
+
+          this->smem_iterator_A_.add_tile_offset({0, 1});
+          this->smem_iterator_A_scale_bias_.add_tile_offset({0, 1});
+          this->smem_iterator_B_.add_tile_offset({1, 0});
+
+          // Add negative offsets to return iterators to the 'start' of the
+          // circular buffer in shared memory
+          if (smem_write_stage_idx == (Base::kStages - 1)) {
+            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
+            this->smem_iterator_A_scale_bias_.add_tile_offset(
+                {0, -Base::kStages});
+            this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
+            smem_write_stage_idx = 0;
+          } else {
+            ++smem_write_stage_idx;
+          }
+
+          if (smem_read_stage_idx == (Base::kStages - 1)) {
+            this->warp_tile_iterator_A_.add_tile_offset(
+                {0, -Base::kStages * Policy::kPartitionsK *
+                        Base::kWarpGemmIterations});
+            this->warp_tile_iterator_A_scale_bias_.add_tile_offset(
+                {0, -Base::kStages * Policy::kPartitionsK *
+                        Base::kWarpGemmIterations});
+            this->warp_tile_iterator_B_.add_tile_offset(
+                {-Base::kStages * Policy::kPartitionsK *
+                     Base::kWarpGemmIterations,
+                 0});
+            smem_read_stage_idx = 0;
+          } else {
+            ++smem_read_stage_idx;
+          }
+
+          --gemm_k_iterations;
+        }
+      }
+
+    }
+
+    // Insert fence and wait for all outstanding cp.async operations to commit.
+    cutlass::arch::cp_async_fence();
+    cutlass::arch::cp_async_wait<0>();
+    __syncthreads();
+
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/implicit_gemm_multistage.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/implicit_gemm_multistage.h
new file mode 100755
index 000000000..eea7743a4
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/implicit_gemm_multistage.h
@@ -0,0 +1,539 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a multistage threadblock-scoped Implicit GEMM Convolution kernel.
+*/
+
+#pragma once
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/cache_operation.h"
+#include "cutlass/gemm/threadblock/mma_base.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Threadblock-scoped multistage mainloop for implicit GEMM convolution: stages
+/// operand tiles into shared memory with cp.async and runs the warp-level MMA.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    /// Iterates over tiles of A operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorA_,
+    /// Iterates over tiles of A operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorA_,
+    /// Cache operation for operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB_,
+    /// Cache operation for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// Number of pipeline stages (forwarded to MmaBase as its Stages argument)
+    int Stages,
+    /// Used for partial specialization
+    typename Enable = bool>
+class ImplicitGemmMultistage : 
+  public gemm::threadblock::MmaBase<Shape_, Policy_, Stages> {
+public:
+  /// Base class
+  using Base = gemm::threadblock::MmaBase<Shape_, Policy_, Stages>;
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+  /// Iterates over tiles of A operand in global memory
+  using IteratorA = IteratorA_;
+  /// Iterates over tiles of B operand in global memory
+  using IteratorB = IteratorB_;
+  /// Policy describing tuning details
+  using Policy = Policy_;
+
+  using SmemIteratorA = SmemIteratorA_;
+  using SmemIteratorB = SmemIteratorB_;
+
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
+
+  //
+  // Dependent types
+  //
+
+  /// Accumulator element and fragment types, taken from the warp-level operator
+
+  using ElementC = typename Policy::Operator::ElementC;
+  using FragmentC = typename Policy::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+  
+  /// Internal structure exposed for introspection.
+  struct Detail {
+
+    /// Number of cp.async instructions to load one stage of operand A
+    static int const AsyncCopyIterationsPerStageA =
+        IteratorA::ThreadMap::Iterations::kCount;
+
+    /// Number of cp.async instructions to load one stage of operand B
+    static int const AsyncCopyIterationsPerStageB =
+        IteratorB::ThreadMap::Iterations::kCount;
+
+    /// Number of stages
+    static int const kStages = Stages;
+
+    /// Copy iterations per group of A: ceil(AsyncCopyIterationsPerStageA / kWarpGemmIterations)
+    static int const kAccessesPerGroupA =
+        (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
+
+    /// Copy iterations per group of B: ceil(AsyncCopyIterationsPerStageB / kWarpGemmIterations)
+    static int const kAccessesPerGroupB =
+        (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
+
+    // Optional staged-accumulation (e.g., tf32x3 kernels) for improved numerical
+    // accuracy, where each mainloop iteration first accumulates into a temporary
+    // set of freshly-cleared accumulators, which are subsequently added to the
+    // final accumulator set.
+    static bool const kStagedAccumulation = arch::detail::UseStagedAccumulation<Operator>::value;
+  };
+
+ private:
+
+  using WarpLoadedFragmentA = typename Operator::FragmentA;
+  using WarpLoadedFragmentB = typename Operator::FragmentB;
+  using WarpTransformedFragmentA = typename Operator::TransformedFragmentA;
+  using WarpTransformedFragmentB = typename Operator::TransformedFragmentB;
+
+ private:
+
+  //
+  // Data members
+  //
+
+  /// Iterator to write threadblock-scoped tile of A operand to shared memory
+  SmemIteratorA smem_iterator_A_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB smem_iterator_B_;
+
+public:
+
+  /// Construct from shared storage and the thread, warp, and lane indices
+  CUTLASS_DEVICE
+  ImplicitGemmMultistage(
+      ///< Shared storage needed for internal use by threadblock-scoped GEMM
+      typename Base::SharedStorage &shared_storage,
+      ///< ID within the threadblock
+      int thread_idx,
+      ///< ID of warp
+      int warp_idx,
+      ///< ID of each thread within a warp
+      int lane_idx
+    ):
+      Base(shared_storage, thread_idx, warp_idx, lane_idx),
+      smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
+      smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx)
+  {
+    // Compute warp location within threadblock tile by mapping the warp_id to
+    // three coordinates:
+    //   _m: the warp's position within the threadblock along the M dimension
+    //   _n: the warp's position within the threadblock along the N dimension
+    //   _k: the warp's position within the threadblock along the K dimension
+
+    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
+    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
+
+    int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
+    int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
+
+    // Add per-warp offsets in units of warp-level tiles
+    this->warp_tile_iterator_A_.add_tile_offset(
+        {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
+    this->warp_tile_iterator_B_.add_tile_offset(
+        {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
+  }
+  /// Issue one group of cp.async copies for operands A and B, advancing the iterators
+  CUTLASS_DEVICE
+  void copy_tiles_and_advance(
+    IteratorA &iterator_A, IteratorB &iterator_B,
+    int group_start_A = 0, int group_start_B = 0) {
+
+    iterator_A.set_iteration_index(group_start_A *
+                                   IteratorA::kAccessesPerVector);
+    this->smem_iterator_A_.set_iteration_index(group_start_A);
+      
+    // Async copy for operand A; the bound check skips accesses past the end of the stage
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) {
+
+      if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) {
+        typename IteratorA::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorA::AccessType *>(
+                this->smem_iterator_A_.get());
+
+        int const kSrcBytes = sizeof_bits<typename IteratorA::Element>::value *
+                              IteratorA::ThreadMap::kElementsPerAccess /
+                              IteratorA::kAccessesPerVector / 8;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
+                  dst_ptr + v, iterator_A.get(), iterator_A.valid());
+
+          ++iterator_A;
+        }
+
+        ++this->smem_iterator_A_;
+      }
+    }
+
+    iterator_B.set_iteration_index(group_start_B *
+                                   IteratorB::kAccessesPerVector);
+
+    this->smem_iterator_B_.set_iteration_index(group_start_B);
+    
+    // Async copy for operand B; the bound check skips accesses past the end of the stage
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) {
+      if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) {
+        typename IteratorB::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorB::AccessType *>(
+                this->smem_iterator_B_.get());
+        
+        int const kSrcBytes = sizeof_bits<typename IteratorB::Element>::value *
+                              IteratorB::ThreadMap::kElementsPerAccess /
+                              IteratorB::kAccessesPerVector / 8;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
+                  dst_ptr + v, iterator_B.get(), iterator_B.valid());
+
+          ++iterator_B;
+        }
+        ++this->smem_iterator_B_;
+      }
+    }
+  }
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate
+  CUTLASS_DEVICE
+  void operator()(
+      ///< number of threadblock-level iterations along the GEMM K dimension
+      int gemm_k_iterations,
+      ///< destination accumulator tile
+      FragmentC &accum,
+      ///< iterator over A operand in global memory
+      IteratorA iterator_A,
+      ///< iterator over B operand in global memory
+      IteratorB iterator_B,
+      ///< initial value of accumulator
+      FragmentC const &src_accum,
+      ///< number of iterations per channel (not referenced by this implementation)
+      int gemm_k_iterations_per_channel = 0,
+      ///< Imaginary strides used for planar-complex only - ignored here
+      int64_t imag_stride_A = 0,
+      int64_t imag_stride_B = 0) {
+
+    //
+    // Prologue
+    //
+
+    // Issue kStages - 1 complete stages of cp.async copies before any math
+    CUTLASS_PRAGMA_UNROLL
+    for (int stage = 0; stage < Base::kStages - 1;
+         ++stage, --gemm_k_iterations) {
+
+      iterator_A.set_iteration_index(0);
+      this->smem_iterator_A_.set_iteration_index(0);
+
+      // Async Copy for operand A
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) {
+        typename IteratorA::AccessType *dst_ptr =
+          reinterpret_cast<typename IteratorA::AccessType *>(
+            this->smem_iterator_A_.get());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
+          int const kSrcBytes =
+            sizeof_bits<typename IteratorA::Element>::value *
+            IteratorA::ThreadMap::kElementsPerAccess /
+            IteratorA::kAccessesPerVector / 8;
+
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
+            dst_ptr + v, iterator_A.get(), iterator_A.valid());
+
+          ++iterator_A;
+        }
+
+        ++this->smem_iterator_A_;
+      }
+
+      iterator_B.set_iteration_index(0);
+      this->smem_iterator_B_.set_iteration_index(0);
+
+      // Async Copy for operand B
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) {
+        typename IteratorB::AccessType *dst_ptr =
+          reinterpret_cast<typename IteratorB::AccessType *>(
+              this->smem_iterator_B_.get());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
+          int const kSrcBytes =
+              sizeof_bits<typename IteratorB::Element>::value *
+              IteratorB::ThreadMap::kElementsPerAccess /
+              IteratorB::kAccessesPerVector / 8;
+
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
+              dst_ptr + v, iterator_B.get(), iterator_B.valid());
+  
+          ++iterator_B;
+        }
+
+        ++this->smem_iterator_B_;
+      }
+
+      // Move to the next stage
+      iterator_A.advance();
+      iterator_B.advance();
+
+      this->smem_iterator_A_.add_tile_offset({0, 1});
+      this->smem_iterator_B_.add_tile_offset({1, 0});
+
+      // Inserts a fence to group cp.async instructions into stages.
+      cutlass::arch::cp_async_fence();
+    }
+
+    // Perform accumulation in the 'd' output operand
+    accum = src_accum;
+
+    // Waits until kStages-2 stages have committed. 
+    cutlass::arch::cp_async_wait<Base::kStages - 2>();
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math
+    // instructions
+    WarpLoadedFragmentA warp_loaded_frag_A[2];
+    WarpLoadedFragmentB warp_loaded_frag_B[2];
+    WarpTransformedFragmentA warp_transformed_frag_A[2];
+    WarpTransformedFragmentB warp_transformed_frag_B[2];
+
+    Operator warp_mma;
+
+    this->warp_tile_iterator_A_.set_kgroup_index(0);
+    this->warp_tile_iterator_B_.set_kgroup_index(0);
+
+    this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]);
+    this->warp_tile_iterator_B_.load(warp_loaded_frag_B[0]);
+
+    ++this->warp_tile_iterator_A_;
+    ++this->warp_tile_iterator_B_;
+
+    // Start issuing the first group of the next stage outside of the mainloop
+    copy_tiles_and_advance(iterator_A, iterator_B);
+
+    int smem_write_stage_idx = Base::kStages - 1;
+    int smem_read_stage_idx = 0;
+
+    warp_mma.transform(warp_transformed_frag_A[0], warp_transformed_frag_B[0],
+                       warp_loaded_frag_A[0], warp_loaded_frag_B[0]);
+
+    // tf32x3 kernels use staging accumulation. warp_mma uses a temporary
+    // accumulator and this temporary accumulator is added to the final
+    // accumulator once in every mainloop iteration.
+    plus<FragmentC> plus_accum;
+
+    FragmentC tmp_accum;
+
+    if (Detail::kStagedAccumulation) {
+      tmp_accum.clear();
+    }
+
+    //
+    // Mainloop
+    //
+
+    CUTLASS_GEMM_LOOP
+    for (; gemm_k_iterations > (-Base::kStages + 1);) {
+      //
+      // Loop over GEMM K dimension
+      //
+
+      // Computes a warp-level GEMM on data held in shared memory
+      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations;
+           ++warp_mma_k) {
+
+        // Load warp-level tiles from shared memory, wrapping to k offset if
+        // this is the last group as the case may be.
+
+        this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
+        this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
+
+        this->warp_tile_iterator_A_.load(warp_loaded_frag_A[(warp_mma_k + 1) % 2]);
+        this->warp_tile_iterator_B_.load(warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
+
+        ++this->warp_tile_iterator_A_;
+        ++this->warp_tile_iterator_B_;
+
+        if (warp_mma_k > 0)
+          warp_mma.transform(warp_transformed_frag_A[warp_mma_k % 2],
+                             warp_transformed_frag_B[warp_mma_k % 2],
+                             warp_loaded_frag_A[warp_mma_k % 2],
+                             warp_loaded_frag_B[warp_mma_k % 2]);
+
+        // Issue global->shared copies for the next stage
+        int group_start_iteration_A, group_start_iteration_B;
+
+        if (warp_mma_k + 1 == Base::kWarpGemmIterations) {
+          group_start_iteration_A = 0;
+          group_start_iteration_B = 0;
+        } else {
+          group_start_iteration_A =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupA;
+          group_start_iteration_B =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupB;
+        }
+
+        copy_tiles_and_advance(iterator_A, iterator_B, group_start_iteration_A,
+                               group_start_iteration_B);
+
+        if (Detail::kStagedAccumulation) {
+          warp_mma(
+            tmp_accum, 
+            warp_transformed_frag_A[warp_mma_k % 2],
+            warp_transformed_frag_B[warp_mma_k % 2], 
+            tmp_accum
+          );
+
+          if (warp_mma_k == 0) {
+            accum = plus_accum(accum, tmp_accum);
+            tmp_accum.clear();
+          }
+        } else {
+          warp_mma(
+            accum, 
+            warp_transformed_frag_A[warp_mma_k % 2],
+            warp_transformed_frag_B[warp_mma_k % 2], 
+            accum
+          );
+        }
+
+        if (warp_mma_k + 1 == Base::kWarpGemmIterations)
+          warp_mma.transform(warp_transformed_frag_A[(warp_mma_k + 1) % 2],
+                             warp_transformed_frag_B[(warp_mma_k + 1) % 2],
+                             warp_loaded_frag_A[(warp_mma_k + 1) % 2],
+                             warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
+
+        if (warp_mma_k + 2 == Base::kWarpGemmIterations) {
+          // Inserts a fence to group cp.async instructions into stages.
+          cutlass::arch::cp_async_fence();
+
+          // Waits until kStages-2 stages of cp.async have committed
+          arch::cp_async_wait<Base::kStages - 2>();
+          __syncthreads();
+
+          // Move to the next stage
+          iterator_A.advance();
+          iterator_B.advance();
+
+          this->smem_iterator_A_.add_tile_offset({0, 1});
+          this->smem_iterator_B_.add_tile_offset({1, 0});
+
+          // Add negative offsets to return iterators to the 'start' of the
+          // circular buffer in shared memory
+          if (smem_write_stage_idx == (Base::kStages - 1)) {
+            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
+            this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
+            smem_write_stage_idx = 0;
+          } else {
+            ++smem_write_stage_idx;
+          }
+
+          if (smem_read_stage_idx == (Base::kStages - 1)) {
+            this->warp_tile_iterator_A_.add_tile_offset(
+                {0, -Base::kStages * Policy::kPartitionsK *
+                        Base::kWarpGemmIterations});
+            this->warp_tile_iterator_B_.add_tile_offset(
+                {-Base::kStages * Policy::kPartitionsK *
+                     Base::kWarpGemmIterations,
+                 0});
+            smem_read_stage_idx = 0;
+          } else {
+            ++smem_read_stage_idx;
+          }
+
+          --gemm_k_iterations;
+        }
+      }
+
+    }
+
+    if (Detail::kStagedAccumulation) {
+      accum = plus_accum(accum, tmp_accum); 
+    }
+  
+    // Insert fence and wait for all outstanding cp.async operations to commit.
+    cutlass::arch::cp_async_fence();
+    cutlass::arch::cp_async_wait<0>();
+    __syncthreads();
+
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace conv
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h
new file mode 100755
index 000000000..79bcb78aa
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h
@@ -0,0 +1,320 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/numeric_conversion.h"
+
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/mma_base.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Double-buffered threadblock-scoped implicit GEMM mainloop; warp-level math is delegated to Policy::Operator.
+template <
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape_,
+  /// Iterates over tiles of A operand in global memory
+  /// (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
+  typename IteratorA_,
+  /// Iterates over tiles of A operand in shared memory
+  /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+  typename SmemIteratorA_,
+  /// Iterates over tiles of B operand in global memory
+  /// (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
+  typename IteratorB_,
+  /// Iterates over tiles of B operand in shared memory
+  /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+  typename SmemIteratorB_,
+  /// Data type of accumulator matrix
+  typename ElementC_,
+  /// Layout of accumulator matrix
+  typename LayoutC_,
+  /// Policy describing tuning details (concept: MmaPolicy)
+  typename Policy_,
+  /// Transformation applied to A operand
+  typename TransformA_ = NumericArrayConverter<
+    typename SmemIteratorA_::Element, 
+    typename IteratorA_::Element, 
+    IteratorA_::Fragment::kElements>,
+  ///
+  /// Transformation applied to B operand
+  typename TransformB_ = NumericArrayConverter<
+    typename SmemIteratorB_::Element, 
+    typename IteratorB_::Element, 
+    IteratorB_::Fragment::kElements>,
+  /// Used for partial specialization
+  typename Enable = bool
+>
+class ImplicitGemmPipelined : public gemm::threadblock::MmaBase<Shape_, Policy_, 2> {
+public:
+
+  /// Base class
+  using Base = gemm::threadblock::MmaBase<Shape_, Policy_, 2>;
+
+  using Shape = Shape_;             ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using IteratorA = IteratorA_;     ///< Iterates over tiles of A operand in global memory
+  using IteratorB = IteratorB_;     ///< Iterates over tiles of B operand in global memory
+  using ElementC = ElementC_;       ///< Data type of accumulator matrix
+  using LayoutC = LayoutC_;         ///< Layout of accumulator matrix
+  using Policy = Policy_;           ///< Policy describing tuning details
+
+  using SmemIteratorA = SmemIteratorA_;
+  using SmemIteratorB = SmemIteratorB_;
+
+  using TransformA = TransformA_;
+  using TransformB = TransformB_;
+
+  //
+  // Dependent types
+  //
+
+  /// Fragment of operand A loaded from global memory
+  using FragmentA = typename IteratorA::Fragment;
+
+  /// Fragment of operand B loaded from global memory
+  using FragmentB = typename IteratorB::Fragment;
+
+  /// Fragment of accumulator tile
+  using FragmentC = typename Policy::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+
+  /// Obtain the arch tag from the warp-level operator
+  using ArchTag = typename Policy::Operator::ArchTag;
+
+  /// Complex transform on A operand
+  static ComplexTransform const kTransformA = Operator::kTransformA;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB = Operator::kTransformB;
+
+  // Statically assert that kStages is two (double-buffered pipeline)
+  static_assert((Base::kStages==2), "MmaPipelined requires kStages set to value 2");
+
+private:
+
+  using WarpFragmentA = typename Operator::FragmentA;
+  using WarpFragmentB = typename Operator::FragmentB;
+
+protected:
+
+  /// Iterator to write threadblock-scoped tile of A operand to shared memory
+  SmemIteratorA smem_iterator_A_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB smem_iterator_B_;
+
+public:
+
+  /// Construct from tensor references
+  CUTLASS_DEVICE
+  ImplicitGemmPipelined(
+    typename Base::SharedStorage &shared_storage,       ///< Shared storage needed for internal use by threadblock-scoped GEMM
+    int thread_idx,                                     ///< ID within the threadblock
+    int warp_idx,                                       ///< ID of warp
+    int lane_idx                                        ///< ID of each thread within a warp
+  ):
+    Base(shared_storage, thread_idx, warp_idx, lane_idx),
+    smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
+    smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) {
+
+    // Compute warp location within threadblock tile by mapping the warp_id to
+    // three coordinates:
+    //   _m: the warp's position within the threadblock along the M dimension
+    //   _n: the warp's position within the threadblock along the N dimension
+    //   _k: the warp's position within the threadblock along the K dimension
+
+    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
+    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
+
+    int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
+    int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
+
+    // Add per-warp offsets in units of warp-level tiles
+    this->warp_tile_iterator_A_.add_tile_offset({warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
+    this->warp_tile_iterator_B_.add_tile_offset({Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
+  }
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate
+  CUTLASS_DEVICE
+  void operator()(
+    int gemm_k_iterations,                            ///< number of iterations of the mainloop
+    FragmentC &accum,                                 ///< destination accumulator tile
+    IteratorA iterator_A,                             ///< iterator over A operand in global memory
+    IteratorB iterator_B,                             ///< iterator over B operand in global memory
+    FragmentC const &src_accum,                       ///< source accumulator tile
+    int gemm_k_iterations_per_channel = 0,             ///< number of iterations per channel (not referenced in this body)
+    TransformA transform_A = TransformA(),            ///< transformation applied to A fragment
+    TransformB transform_B = TransformB()) {          ///< transformation applied to B fragment
+
+    //
+    // Prologue
+    //
+
+    // Perform accumulation in the 'd' output operand
+    accum = src_accum;
+
+    FragmentA tb_frag_A;
+    FragmentB tb_frag_B;
+
+    tb_frag_A.clear();
+    tb_frag_B.clear();
+
+    // The last kblock is loaded in the prolog
+    iterator_A.load(tb_frag_A);
+    iterator_B.load(tb_frag_B);
+
+    ++iterator_A;
+    ++iterator_B;
+
+    this->smem_iterator_A_.store(transform_A(tb_frag_A));
+    this->smem_iterator_B_.store(transform_B(tb_frag_B));
+
+    ++this->smem_iterator_A_;
+    ++this->smem_iterator_B_;
+
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math instructions
+    WarpFragmentA warp_frag_A[2];
+    WarpFragmentB warp_frag_B[2];
+
+    this->warp_tile_iterator_A_.set_kgroup_index(0);
+    this->warp_tile_iterator_B_.set_kgroup_index(0);
+
+    this->warp_tile_iterator_A_.load(warp_frag_A[0]);
+    this->warp_tile_iterator_B_.load(warp_frag_B[0]);
+
+    ++this->warp_tile_iterator_A_;
+    ++this->warp_tile_iterator_B_;
+
+    Operator warp_mma;
+
+    int smem_write_stage_idx = 1;  // the prologue already stored one tile and advanced the smem iterators
+
+    // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
+    // shared memory loads (which have the tightest latency requirement).
+
+    //
+    // Mainloop
+    //
+
+    // Note: The main loop does not support Base::kWarpGemmIterations == 2.
+    CUTLASS_GEMM_LOOP
+    for (; gemm_k_iterations > 0; --gemm_k_iterations) {
+      //
+      // Loop over GEMM K dimension
+      //
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) {
+
+        // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group
+        // as the case may be.
+
+        if (warp_mma_k == Base::kWarpGemmIterations - 1) {
+
+          // Write fragments to shared memory
+          this->smem_iterator_A_.store(transform_A(tb_frag_A));
+
+          this->smem_iterator_B_.store(transform_B(tb_frag_B));
+
+          __syncthreads();
+          
+          ++this->smem_iterator_A_;
+          ++this->smem_iterator_B_;
+
+          // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
+          if (smem_write_stage_idx == 1) {
+            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
+            this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
+          }
+          else {
+            this->warp_tile_iterator_A_.add_tile_offset(
+                {0, -Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations});
+            this->warp_tile_iterator_B_.add_tile_offset(
+                {-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations,
+                 0});
+          }
+
+          smem_write_stage_idx ^= 1;  // toggle between 0 and 1
+        }
+
+        this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
+        this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
+        
+        this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k + 1) % 2]);
+        this->warp_tile_iterator_B_.load(warp_frag_B[(warp_mma_k + 1) % 2]);
+
+        ++this->warp_tile_iterator_A_;
+        ++this->warp_tile_iterator_B_;
+
+        if (warp_mma_k == 0) {
+
+          iterator_A.load(tb_frag_A);
+          iterator_B.load(tb_frag_B);
+    
+          ++iterator_A;
+          ++iterator_B;
+        }
+
+        warp_mma(accum, warp_frag_A[warp_mma_k % 2],
+                 warp_frag_B[warp_mma_k % 2], accum);
+      }
+    }
+
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/implicit_gemm_wgrad_fusion_multistage.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/implicit_gemm_wgrad_fusion_multistage.h
new file mode 100755
index 000000000..1ec0c61dd
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/implicit_gemm_wgrad_fusion_multistage.h
@@ -0,0 +1,729 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a multistage threadblock-scoped fused activation's scale+bias+relu and
+   Implicit GEMM Convolution kernel.
+
+   The original implicit gemm stores out-of-bound data as zeroes in shared
+   memory because zeroes fed into the tensor cores yield zeroes out of the
+   tensor cores, so the result stays the same.  When fusing scale+bias+relu
+   into the mainloop, this is no longer true because
+
+     0 x scale + bias = bias
+
+   which is no longer always 0.  So, instead of storing zeroes, this fused
+   kernel stores the out-of-bound data as a special NaN (0x7eff), when applying
+   scale+bias+relu, the code is like
+
+     if (data == 0x7eff)
+       data = 0;
+     else
+       data = scale+bias+relu(data, scale, bias);
+
+  The biggest difference compared with the fused Fprop and scale+bias+relu is
+  that scale and bias are loop invariant in Wgrad so that they only need to
+  be loaded once before the mainloop.
+
+  See include/cutlass/conv/warp/scale_bias_relu_transform.h for the
+  elementwise computation.  See include/cutlass/arch/memory_sm80.h for nan fill.
+
+
+*/
+
+#pragma once
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/cache_operation.h"
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/gemm/warp/scale_bias_tile_iterator.h"
+#include "cutlass/conv/warp/scale_bias_relu_transform.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/// Base class for the fused Wgrad mainloop: declares the SharedStorage operand
+/// buffers and the warp-scoped tile iterators for operands A and B.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    /// Element type of scale and bias vectors
+    typename ElementScaleBias_,
+    /// Layout of scale and bias vectors
+    typename LayoutScaleBias_,
+    /// Policy describing tuning details (concept: MmaPolicy);
+    /// its Operator provides the warp-level Mma and the operand types
+    typename Policy_,
+    /// Number of stages,
+    int Stages,
+    /// Used for partial specialization
+    typename Enable = bool>
+class MmaWgradFusionBase {
+ public:
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+
+  /// Element type of scale and bias vectors
+  using ElementScaleBias = ElementScaleBias_;
+
+  /// Layout of scale and bias vectors
+  using LayoutScaleBias = LayoutScaleBias_;
+
+  /// Policy describing tuning details
+  using Policy = Policy_;
+
+  //
+  // Dependent types
+  //
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+
+  /// Shape describing the overall GEMM computed from shared memory
+  /// by each warp.
+  using WarpGemm = typename Policy::Operator::Shape;
+
+  /// Shape describing the number of warps filling the CTA
+  using WarpCount = cutlass::gemm::GemmShape<Shape::kM / WarpGemm::kM,
+                                             Shape::kN / WarpGemm::kN,
+                                             Shape::kK / WarpGemm::kK>;
+
+  /// Number of warp-level GEMM operations
+  static int const kWarpGemmIterations =
+      (WarpGemm::kK / Operator::Policy::MmaShape::kK);
+
+  /// Number of stages
+  static int const kStages = Stages;
+
+  /// Tensor reference to the A operand
+  using TensorRefA = TensorRef<typename Operator::ElementA, typename Operator::LayoutA>;
+
+  /// Tensor reference to the B operand
+  using TensorRefB = TensorRef<typename Operator::ElementB, typename Operator::LayoutB>;
+
+  static_assert(kWarpGemmIterations > 1,
+                "The pipelined structure requires at least two warp-level "
+                "GEMM operations.");
+
+  static_assert((kWarpGemmIterations % 2) == 0,
+                "Inner loop iteration must be an even number.");
+
+  //
+  // Nested structs
+  //
+
+  /// Shared storage object needed by threadblock-scoped GEMM
+  class SharedStorage {
+   public:
+    //
+    // Type definitions
+    //
+
+    /// Shape of the A matrix operand in shared memory
+    using ShapeA = MatrixShape<Shape::kM + Policy::SmemPaddingA::kRow,
+                               Shape::kK * kStages +
+                                   Policy::SmemPaddingA::kColumn>;
+
+    /// Shape of the B matrix operand in shared memory
+    using ShapeB =
+        MatrixShape<Shape::kK * kStages + Policy::SmemPaddingB::kRow,
+                    Shape::kN + Policy::SmemPaddingB::kColumn>;
+
+   public:
+    //
+    // Data members
+    //
+
+    /// Buffer for A operand
+    AlignedBuffer<typename Operator::ElementA, ShapeA::kCount> operand_A;
+
+    /// Buffer for B operand
+    AlignedBuffer<typename Operator::ElementB, ShapeB::kCount> operand_B;
+
+   public:
+
+    //
+    // Methods
+    //
+
+    /// Returns a layout object for the A matrix. NOTE(review): marked CUTLASS_DEVICE while LayoutB() is CUTLASS_HOST_DEVICE - confirm intended
+    CUTLASS_DEVICE
+    static typename Operator::LayoutA LayoutA() {
+      return Operator::LayoutA::packed({ShapeA::kRow, ShapeA::kColumn});
+    }
+
+    /// Returns a layout object for the B matrix
+    CUTLASS_HOST_DEVICE
+    static typename Operator::LayoutB LayoutB() {
+      return Operator::LayoutB::packed({ShapeB::kRow, ShapeB::kColumn});
+    }
+
+    /// Returns a TensorRef to the A operand
+    CUTLASS_HOST_DEVICE
+    TensorRefA operand_A_ref() {
+      return TensorRefA{operand_A.data(), LayoutA()};
+    }
+
+    /// Returns a TensorRef to the B operand
+    CUTLASS_HOST_DEVICE
+    TensorRefB operand_B_ref() {
+      return TensorRefB{operand_B.data(), LayoutB()};
+    }
+  };
+
+ protected:
+
+  //
+  // Data members
+  //
+
+  /// Iterator to load a warp-scoped tile of A operand from shared memory
+  typename Operator::IteratorA warp_tile_iterator_A_;
+
+  /// Iterator to load a warp-scoped tile of B operand from shared memory
+  typename Operator::IteratorB warp_tile_iterator_B_;
+
+public:
+
+  /// Construct from tensor references
+  CUTLASS_DEVICE
+  MmaWgradFusionBase(
+      /// Shared storage needed for internal use by threadblock-scoped GEMM
+      SharedStorage &shared_storage,
+      /// ID within the threadblock (not used by this constructor)
+      int thread_idx,
+      /// ID of warp (not used by this constructor)
+      int warp_idx,
+      /// ID of each thread within a warp
+      int lane_idx)
+      : warp_tile_iterator_A_(shared_storage.operand_A_ref(), lane_idx),
+        warp_tile_iterator_B_(shared_storage.operand_B_ref(), lane_idx) {}
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+/// Structure to compute the matrix product targeting CUDA cores and SIMT math
+/// instructions.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    /// Iterates over tiles of A operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorA_,
+    /// Iterates over tiles of A operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorA_,
+    /// Cache operation for operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB_,
+    /// Cache operation for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB,
+    /// Iterates over vectors of scale and bias vector in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorScaleBias_,
+    /// Iterates over vectors of scale and bias vector i
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// Number of stages,
+    int Stages,
+    /// Used for partial specialization
+    typename Enable = bool>
+class ImplicitGemmWgradFusionMultistage
+    : public MmaWgradFusionBase<Shape_, typename IteratorScaleBias_::Element,
+                       typename IteratorScaleBias_::Layout, Policy_, Stages> {
+ public:
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+  ///< Iterates over tiles of A operand in global memory
+  using IteratorA = IteratorA_;
+  ///< Iterates over tiles of B operand in global memory
+  using IteratorB = IteratorB_;
+  ///< Iterates over tiles of the scale and bias vectors in global memory
+  using IteratorScaleBias = IteratorScaleBias_;
+  ///< Policy describing tuning details
+  using Policy = Policy_;
+  ///< Base class
+  using Base = MmaWgradFusionBase<Shape_, typename IteratorScaleBias::Element,
+                         typename IteratorScaleBias::Layout, Policy_, Stages>;
+
+  using SmemIteratorA = SmemIteratorA_;
+  using SmemIteratorB = SmemIteratorB_;
+
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
+
+  //
+  // Dependent types
+  //
+
+  /// Fragment of accumulator tile
+
+  using ElementC = typename Policy::Operator::ElementC;
+  using FragmentC = typename Policy::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+  
+  /// Internal structure exposed for introspection.
+  struct Detail {
+
+    /// Number of cp.async instructions to load one stage of operand A
+    static int const AsyncCopyIterationsPerStageA =
+        IteratorA::ThreadMap::Iterations::kCount;
+
+    /// Number of cp.async instructions to load one stage of operand B
+    static int const AsyncCopyIterationsPerStageB =
+        IteratorB::ThreadMap::Iterations::kCount;
+
+    /// Number of stages
+    static int const kStages = Stages;
+
+    /// Number of cp.async instructions to load on group of operand A
+    static int const kAccessesPerGroupA =
+        (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
+
+    /// Number of cp.async instructions to load on group of operand B
+    static int const kAccessesPerGroupB =
+        (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
+
+    static int const kBBufferSize =
+        ((sizeof(typename Operator::ElementC) == 4) &&
+         ((platform::is_same<typename Operator::Policy::Operator::ElementA,
+                             typename Operator::ElementA>::value &&
+           platform::is_same<typename Operator::Policy::Operator::ElementB,
+                             typename Operator::ElementB>::value)) &&
+         (Operator::Shape::kM >= 64 && Operator::Shape::kN >= 64))
+            ? 1
+            : 2;
+  };
+
+ private:
+
+  using WarpLoadedFragmentA = typename Operator::FragmentA;
+  using WarpLoadedFragmentB = typename Operator::FragmentB;
+  using WarpLoadedFragmentScaleBias = typename IteratorScaleBias::Fragment;
+
+  using WarpTransformedFragmentA = typename Operator::TransformedFragmentA;
+  using WarpTransformedFragmentB = typename Operator::TransformedFragmentB;
+
+ private:
+
+  //
+  // Data members
+  //
+
+  /// Iterator to write threadblock-scoped tile of A operand to shared memory
+  SmemIteratorA smem_iterator_A_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB smem_iterator_B_;
+
+  int warp_idx_m_;
+
+  int warp_idx_n_;
+  
+public:
+
+  /// Construct from tensor references
+  CUTLASS_DEVICE
+  ImplicitGemmWgradFusionMultistage(
+      ///< Shared storage needed for internal use by threadblock-scoped GEMM
+      typename Base::SharedStorage &shared_storage,
+      ///< ID within the threadblock
+      int thread_idx,
+      ///< ID of warp
+      int warp_idx,
+      ///< ID of each thread within a warp
+      int lane_idx)
+      : Base(shared_storage, thread_idx, warp_idx, lane_idx),
+        smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
+        smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) {
+
+    // Compute warp location within threadblock tile by mapping the warp_id to
+    // three coordinates:
+    //   _m: the warp's position within the threadblock along the M dimension
+    //   _n: the warp's position within the threadblock along the N dimension
+    //   _k: the warp's position within the threadblock along the K dimension
+
+    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
+    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
+
+    warp_idx_m_ = warp_idx_mn % Base::WarpCount::kM;
+    warp_idx_n_ = warp_idx_mn / Base::WarpCount::kM;
+
+    // Add per-warp offsets in units of warp-level tiles
+    this->warp_tile_iterator_A_.add_tile_offset(
+        {warp_idx_m_, Base::kWarpGemmIterations * warp_idx_k});
+    this->warp_tile_iterator_B_.add_tile_offset(
+        {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n_});
+  }
+
+  CUTLASS_DEVICE
+  void copy_tiles_and_advance(IteratorA &iterator_A,
+                              IteratorB &iterator_B,
+                              int group_start_A = 0, int group_start_B = 0) {
+
+    iterator_A.set_iteration_index(group_start_A);
+    this->smem_iterator_A_.set_iteration_index(group_start_A);
+      
+    // Async Copy for operand A
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) {
+
+      if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) {
+        typename IteratorA::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorA::AccessType *>(
+                this->smem_iterator_A_.get());
+
+        int const kSrcBytes = sizeof_bits<typename IteratorA::Element>::value *
+                              IteratorA::ThreadMap::kElementsPerAccess / 8;
+
+        cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
+            dst_ptr, iterator_A.get(), iterator_A.valid());
+
+        ++iterator_A;
+
+        ++this->smem_iterator_A_;
+      }
+    }
+
+    iterator_B.set_iteration_index(group_start_B);
+
+    this->smem_iterator_B_.set_iteration_index(group_start_B);
+    
+    // Async Copy for operand B
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) {
+      if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) {
+        typename IteratorB::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorB::AccessType *>(
+                this->smem_iterator_B_.get());
+        
+        int const kSrcBytes = sizeof_bits<typename IteratorB::Element>::value *
+                              IteratorB::ThreadMap::kElementsPerAccess / 8;
+
+        // Uses nan fill for out of bound data
+        cutlass::arch::cp_async_nan<kSrcBytes, kCacheOpB>(
+                dst_ptr, iterator_B.get(), iterator_B.valid());
+
+        ++iterator_B;
+        ++this->smem_iterator_B_;
+      }
+    }
+  }
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate
+  CUTLASS_DEVICE
+  void operator()(
+      ///< problem size of GEMM
+      int gemm_k_iterations,
+      ///< destination accumulator tile
+      FragmentC &accum,
+      ///< iterator over A operand in global memory
+      IteratorA iterator_A,
+      ///< iterator over B operand in global memory
+      IteratorB iterator_B,
+      ///< iterator over scale and bias vectors in global memory
+      IteratorScaleBias iterator_B_scale_bias,
+      ///< initial value of accumulator
+      FragmentC const &src_accum,
+      ///< number of iterations per channel
+      int gemm_k_iterations_per_channel = 0, 
+      ///< Imaginary strides used for planar-complex only - ignored here
+      int64_t imag_stride_A = 0,
+      int64_t imag_stride_B = 0) {
+
+    //
+    // Prologue
+    //
+
+    WarpLoadedFragmentScaleBias warp_loaded_frag_B_scale_bias;
+    iterator_B_scale_bias.add_tile_offset({0, warp_idx_n_});
+    iterator_B_scale_bias.load(warp_loaded_frag_B_scale_bias);
+
+    // Issue several complete stages
+    CUTLASS_PRAGMA_UNROLL
+    for (int stage = 0; stage < Base::kStages - 1;
+         ++stage, --gemm_k_iterations) {
+
+      iterator_A.set_iteration_index(0);
+      this->smem_iterator_A_.set_iteration_index(0);
+
+      // Async Copy for operand A
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) {
+        typename IteratorA::AccessType *dst_ptr =
+          reinterpret_cast<typename IteratorA::AccessType *>(
+            this->smem_iterator_A_.get());
+
+        int const kSrcBytes =
+            sizeof_bits<typename IteratorA::Element>::value *
+            IteratorA::ThreadMap::kElementsPerAccess / 8;
+        
+        cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
+            dst_ptr, iterator_A.get(), iterator_A.valid());
+
+        ++iterator_A;
+        ++this->smem_iterator_A_;
+      }
+
+      iterator_B.set_iteration_index(0);
+      this->smem_iterator_B_.set_iteration_index(0);
+
+      // Async Copy for operand B
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) {
+        typename IteratorB::AccessType *dst_ptr =
+          reinterpret_cast<typename IteratorB::AccessType *>(
+              this->smem_iterator_B_.get());
+
+        int const kSrcBytes =
+            sizeof_bits<typename IteratorB::Element>::value *
+            IteratorB::ThreadMap::kElementsPerAccess / 8;
+
+        // Uses Nan fill for out of bound data
+        cutlass::arch::cp_async_nan<kSrcBytes, kCacheOpB>(
+            dst_ptr, iterator_B.get(), iterator_B.valid());
+
+        ++iterator_B;
+        ++this->smem_iterator_B_;
+      }
+
+      // Move to the next stage
+      iterator_A.advance();
+      iterator_B.advance();
+
+      this->smem_iterator_A_.add_tile_offset({0, 1});
+      this->smem_iterator_B_.add_tile_offset({1, 0});
+
+      // Inserts a fence to group cp.async instructions into stages.
+      cutlass::arch::cp_async_fence();
+    }
+
+    // Perform accumulation in the 'd' output operand
+    accum = src_accum;
+
+    // Waits until kStages-2 stages have committed. 
+    cutlass::arch::cp_async_wait<Base::kStages - 2>();
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math
+    // instructions
+    WarpLoadedFragmentA warp_loaded_frag_A[Detail::kBBufferSize];
+    WarpLoadedFragmentB warp_loaded_frag_B[2];
+    WarpTransformedFragmentA warp_transformed_frag_A[Detail::kBBufferSize];
+    WarpTransformedFragmentB warp_transformed_frag_B[2];
+
+    Operator warp_mma;
+    cutlass::conv::warp::WgradScaleBiasReluTransform<WarpTransformedFragmentB,
+                                            WarpLoadedFragmentScaleBias>
+        elementwise_transform;
+
+    this->warp_tile_iterator_A_.set_kgroup_index(0);
+    this->warp_tile_iterator_B_.set_kgroup_index(0);
+
+    this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]);
+    this->warp_tile_iterator_B_.load(warp_loaded_frag_B[0]);
+
+    ++this->warp_tile_iterator_A_;
+    ++this->warp_tile_iterator_B_;
+
+    // Start issuing the first group of the next stage outside of the mainloop
+    copy_tiles_and_advance(iterator_A, iterator_B);
+
+    int smem_write_stage_idx = Base::kStages - 1;
+    int smem_read_stage_idx = 0;
+
+    warp_mma.transform(warp_transformed_frag_A[0], warp_transformed_frag_B[0],
+                       warp_loaded_frag_A[0], warp_loaded_frag_B[0]);
+
+    elementwise_transform(warp_transformed_frag_B[0],
+                         warp_loaded_frag_B_scale_bias);
+
+    //
+    // Mainloop
+    //
+
+    CUTLASS_GEMM_LOOP
+    for (; gemm_k_iterations > (-Base::kStages + 1);) {
+      //
+      // Loop over GEMM K dimension
+      //
+
+      // Computes a warp-level GEMM on data held in shared memory
+      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations;
+           ++warp_mma_k) {
+
+        // Load warp-level tiles from shared memory, wrapping to k offset if
+        // this is the last group as the case may be.
+
+        if (Detail::kBBufferSize == 2) {
+          this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
+          this->warp_tile_iterator_A_.load(warp_loaded_frag_A[(warp_mma_k + 1) % Detail::kBBufferSize]);
+          ++this->warp_tile_iterator_A_;
+        }
+
+        this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
+        this->warp_tile_iterator_B_.load(warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
+
+        ++this->warp_tile_iterator_B_;
+
+        if (warp_mma_k > 0) {
+          warp_mma.transform(warp_transformed_frag_A[warp_mma_k % Detail::kBBufferSize],
+                             warp_transformed_frag_B[warp_mma_k % 2],
+                             warp_loaded_frag_A[warp_mma_k % Detail::kBBufferSize],
+                             warp_loaded_frag_B[warp_mma_k % 2]);
+
+          elementwise_transform(warp_transformed_frag_B[warp_mma_k % 2],
+                               warp_loaded_frag_B_scale_bias);
+        }
+
+        warp_mma(
+                 accum, 
+                 warp_transformed_frag_A[warp_mma_k % Detail::kBBufferSize],
+                 warp_transformed_frag_B[warp_mma_k % 2],
+                 accum
+                );
+
+        if (Detail::kBBufferSize == 1) {
+          this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
+          this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]);
+          ++this->warp_tile_iterator_A_;
+  
+        }
+
+        if (warp_mma_k + 1 == Base::kWarpGemmIterations) {
+          warp_mma.transform(warp_transformed_frag_A[(warp_mma_k + 1) % Detail::kBBufferSize],
+                             warp_transformed_frag_B[(warp_mma_k + 1) % 2],
+                             warp_loaded_frag_A[(warp_mma_k + 1) % Detail::kBBufferSize],
+                             warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
+
+          elementwise_transform(
+              warp_transformed_frag_B[(warp_mma_k + 1) % 2],
+              warp_loaded_frag_B_scale_bias);
+        }
+
+        // Issue global->shared copies for the next stage
+        int group_start_iteration_A, group_start_iteration_B;
+
+        if (warp_mma_k + 1 == Base::kWarpGemmIterations) {
+          group_start_iteration_A = 0;
+          group_start_iteration_B = 0;
+        } else {
+          group_start_iteration_A =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupA;
+          group_start_iteration_B =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupB;
+        }
+
+        copy_tiles_and_advance(iterator_A, iterator_B,
+                               group_start_iteration_A,
+                               group_start_iteration_B);
+
+        if (warp_mma_k + 2 == Base::kWarpGemmIterations) {
+          // Inserts a fence to group cp.async instructions into stages.
+          cutlass::arch::cp_async_fence();
+
+          // Waits until kStages-2 stages of cp.async have committed
+          arch::cp_async_wait<Base::kStages - 2>();
+          __syncthreads();
+
+          // Move to the next stage
+          iterator_A.advance();
+          iterator_B.advance();
+
+          this->smem_iterator_A_.add_tile_offset({0, 1});
+          this->smem_iterator_B_.add_tile_offset({1, 0});
+
+          // Add negative offsets to return iterators to the 'start' of the
+          // circular buffer in shared memory
+          if (smem_write_stage_idx == (Base::kStages - 1)) {
+            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
+            this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
+            smem_write_stage_idx = 0;
+          } else {
+            ++smem_write_stage_idx;
+          }
+
+          if (smem_read_stage_idx == (Base::kStages - 1)) {
+            this->warp_tile_iterator_A_.add_tile_offset(
+                {0, -Base::kStages * Policy::kPartitionsK *
+                        Base::kWarpGemmIterations});
+            this->warp_tile_iterator_B_.add_tile_offset(
+                {-Base::kStages * Policy::kPartitionsK *
+                     Base::kWarpGemmIterations,
+                 0});
+            smem_read_stage_idx = 0;
+          } else {
+            ++smem_read_stage_idx;
+          }
+
+          --gemm_k_iterations;
+        }
+      }
+
+    }
+
+    // Insert fence and wait for all outstanding cp.async operations to commit.
+    cutlass::arch::cp_async_fence();
+    cutlass::arch::cp_async_wait<0>();
+    __syncthreads();
+
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/predicated_scale_bias_vector_access_iterator.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/predicated_scale_bias_vector_access_iterator.h
new file mode 100755
index 000000000..bfe9a3981
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/predicated_scale_bias_vector_access_iterator.h
@@ -0,0 +1,470 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Templates calculating the address and predicates to the load of scale and bias vectors.
+
+    This iterator uses masks to guard out-of-bounds accesses.
+
+    A precomputed "Params" object minimizes the amount of state that must be
+   stored in registers, and integer addition is used to advance the pointer
+   through memory.
+*/
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/conv/threadblock/conv2d_params.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Primary template of PredicatedScaleBiasVectorAccessIterator; it is only declared here,
+/// and the partial specializations that follow supply the layout-specific definitions.
+template <typename ThreadblockShape,
+          typename Element,
+          typename Layout>
+class PredicatedScaleBiasVectorAccessIterator;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedScaleBiasVectorAccessIterator for pitch-linear data.
+/// Threads with thread_id < kThreads address the scale vector; all other threads address the bias vector.
+template <typename ThreadblockShape_, typename Element_>
+class PredicatedScaleBiasVectorAccessIterator<ThreadblockShape_,
+                                              Element_,
+                                              layout::PitchLinear> {
+ public:
+
+  using ThreadblockShape = ThreadblockShape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ConstPointer = const Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  static int const kElementsPerAccess = 128 / sizeof_bits<Element>::value;  // elements in one 128-bit access
+  static int const kThreads = ThreadblockShape::kContiguous / kElementsPerAccess;  // accesses needed to span one tile
+
+  using AccessType = AlignedArray<Element, kElementsPerAccess>;
+
+  using Params = PredicatedScaleBiasVectorAccessIteratorParams;
+
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char *;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Parameters object with precomputed internal state (held by reference, not copied)
+  Params const &params_;
+
+  /// Byte pointer to the start of the scale or bias vector, chosen per thread in the constructor
+  BytePointer pointer_;
+
+  int problem_size_trs;  // R * S (Conv2d) or T * R * S (Conv3d), set by the constructors
+  int problem_size_c;    // problem_size.C; upper bound checked by valid()
+  int filter_trs_;       // number of advance() calls since the last wrap; reset to 0 at problem_size_trs
+
+  TensorCoord thread_offset_;  // this thread's logical offset; get() and valid() read only contiguous()
+
+ public:
+  /// Constructs the iterator for a Conv2d problem from its precomputed state,
+  /// threadblock offset, and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedScaleBiasVectorAccessIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Extent of tensor
+      Conv2dProblemSize const &problem_size,
+      /// Pointer to the start of the scale vector
+      ConstPointer scale_pointer,
+      /// Pointer to the start of the bias vector
+      ConstPointer bias_pointer,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : params_(params),
+        problem_size_trs(problem_size.R * problem_size.S),
+        problem_size_c(problem_size.C),
+        filter_trs_(0) {
+    pointer_ = (thread_id < kThreads)
+                   ? reinterpret_cast<BytePointer>(
+                         const_cast<NonConstPointer>(scale_pointer))
+                   : reinterpret_cast<BytePointer>(
+                         const_cast<NonConstPointer>(bias_pointer));
+
+    // Per-thread offset in logical coordinates of tensor; bias threads are rebased by kThreads
+    int thread_base = (thread_id < kThreads) ? 0 : kThreads;
+
+    thread_offset_ =
+        threadblock_offset +
+        TensorCoord((thread_id - thread_base) * kElementsPerAccess, 0);
+
+    set_iteration_index(0);
+  }
+
+  CUTLASS_HOST_DEVICE
+  PredicatedScaleBiasVectorAccessIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Extent of tensor (Conv3d overload: the filter extent is T * R * S)
+      Conv3dProblemSize const &problem_size,
+      /// Pointer to the start of the scale vector
+      ConstPointer scale_pointer,
+      /// Pointer to the start of the bias vector
+      ConstPointer bias_pointer,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : params_(params),
+        problem_size_trs(problem_size.T * problem_size.R * problem_size.S),
+        problem_size_c(problem_size.C),
+        filter_trs_(0) {
+    pointer_ = (thread_id < kThreads)
+                   ? reinterpret_cast<BytePointer>(
+                         const_cast<NonConstPointer>(scale_pointer))
+                   : reinterpret_cast<BytePointer>(
+                         const_cast<NonConstPointer>(bias_pointer));
+
+    // Per-thread offset in logical coordinates of tensor; bias threads are rebased by kThreads
+    int thread_base = (thread_id < kThreads) ? 0 : kThreads;
+
+    thread_offset_ =
+        threadblock_offset +
+        TensorCoord((thread_id - thread_base) * kElementsPerAccess, 0);
+
+    set_iteration_index(0);
+  }
+
+  /// Construct a PredicatedScaleBiasVectorAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedScaleBiasVectorAccessIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Extent of tensor
+      Conv2dProblemSize const &problem_size,
+      /// Pointer to start of scale vector
+      ConstPointer scale_pointer,
+      /// Pointer to start of bias vector
+      ConstPointer bias_pointer,
+      ///< ID of each participating thread
+      int thread_id)
+      : PredicatedScaleBiasVectorAccessIterator(params, problem_size,
+                                                scale_pointer, bias_pointer,
+                                                thread_id, make_Coord(0, 0)) {}
+
+  CUTLASS_HOST_DEVICE
+  PredicatedScaleBiasVectorAccessIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Extent of tensor
+      Conv3dProblemSize const &problem_size,
+      /// Pointer to start of scale vector
+      ConstPointer scale_pointer,
+      /// Pointer to start of bias vector
+      ConstPointer bias_pointer,
+      ///< ID of each participating thread
+      int thread_id)
+      : PredicatedScaleBiasVectorAccessIterator(params, problem_size,
+                                                scale_pointer, bias_pointer,
+                                                thread_id, make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index (no-op: this iterator keeps no iteration index)
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {}
+
+  /// Advances along the contiguous dimension in units of whole threadblock tiles; tile_offset.strided() is ignored
+  CUTLASS_DEVICE
+  void add_tile_offset(
+      TensorCoord const &tile_offset) {
+    thread_offset_ =
+        thread_offset_ +
+        TensorCoord(ThreadblockShape::kContiguous * tile_offset.contiguous(), 0);
+  }
+
+  /// Returns a pointer to this thread's access: pointer_ plus the contiguous offset converted to bytes
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+
+    return reinterpret_cast<AccessType *>(
+        pointer_ +
+        (thread_offset_.contiguous() * sizeof_bits<Element>::value / 8));
+  }
+
+  /// Pre-increment: a no-op that returns a reference to self.
+  CUTLASS_HOST_DEVICE
+  PredicatedScaleBiasVectorAccessIterator &operator++() {
+    return *this;
+  }
+
+  /// Counts one call; every problem_size_trs calls, steps one threadblock tile along the contiguous dimension.
+  CUTLASS_HOST_DEVICE
+  void advance() {
+    // moves to the next tile only when the call counter reaches problem_size_trs
+    ++filter_trs_;
+    if (filter_trs_ == problem_size_trs) {
+      filter_trs_ = 0;
+      add_tile_offset(TensorCoord(1, 0));
+    }
+  }
+
+  /// Post-increment: returns a copy of the iterator taken before operator++() is applied.
+  CUTLASS_DEVICE
+  PredicatedScaleBiasVectorAccessIterator operator++(int) {
+    PredicatedScaleBiasVectorAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Returns true when the contiguous offset is below problem_size_c and threadIdx.x < 2 * kThreads
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    uint32_t enabled = 0;
+
+#if defined(_MSC_VER) || (__CUDACC_VER_MAJOR__ < 11)
+    enabled = threadIdx.x < kThreads * 2;
+#else
+    asm volatile(
+        "{\n"
+        "  .reg .u32 tid_reg;\n"
+        "  .reg .pred p;\n"
+        "  mov.u32 tid_reg, %%tid.x;\n"
+        "  setp.lt.u32 p, tid_reg, %1;\n"
+        "  selp.u32 %0, 1, 0, p;\n"
+        "}\n" : "+r"(enabled) :"n"(kThreads * 2));
+#endif
+
+    return ((thread_offset_.contiguous() < problem_size_c) && enabled);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedScaleBiasVectorAccessIterator for row-major data.
+/// Adapter over the pitch-linear specialization: each (row, column) coordinate is forwarded column-first.
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename ThreadblockShape_,
+          typename Element_>
+class PredicatedScaleBiasVectorAccessIterator<ThreadblockShape_,
+                                        Element_,
+                                        layout::RowMajor> {
+ public:
+
+  using ThreadblockShape = ThreadblockShape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ConstPointer = const Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  using UnderlyingIterator = PredicatedScaleBiasVectorAccessIterator<
+      layout::PitchLinearShape<ThreadblockShape::kColumn, ThreadblockShape::kRow>,
+      Element,
+      layout::PitchLinear>;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+  static int const kElementsPerAccess = UnderlyingIterator::kElementsPerAccess;
+
+  using Params = PredicatedScaleBiasVectorAccessIteratorParams;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear iterator that every member below delegates to
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs the iterator for a Conv2d problem from its precomputed state,
+  /// threadblock offset (forwarded column-first), and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedScaleBiasVectorAccessIterator(
+      ///< Precomputed parameters object
+      Params const &params,
+      ///< Extent of tensor
+      Conv2dProblemSize const &problem_size,
+      ///< Pointer to the start of the scale vector
+      ConstPointer scale_pointer,
+      ///< Pointer to the start of the bias vector
+      ConstPointer bias_pointer,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : iterator_(params, problem_size, scale_pointer, bias_pointer,
+                  thread_id,
+                  layout::PitchLinearCoord(threadblock_offset.column(),
+                                           threadblock_offset.row())) {}
+
+  CUTLASS_HOST_DEVICE
+  PredicatedScaleBiasVectorAccessIterator(
+      ///< Precomputed parameters object
+      Params const &params,
+      ///< Extent of tensor (Conv3d overload)
+      Conv3dProblemSize const &problem_size,
+      ///< Pointer to the start of the scale vector
+      ConstPointer scale_pointer,
+      ///< Pointer to the start of the bias vector
+      ConstPointer bias_pointer,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : iterator_(params, problem_size, scale_pointer, bias_pointer,
+                  thread_id,
+                  layout::PitchLinearCoord(threadblock_offset.column(),
+                                           threadblock_offset.row())) {}
+
+  /// Construct a PredicatedScaleBiasVectorAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedScaleBiasVectorAccessIterator(
+      Params const &params,                   ///< Precomputed parameters object
+      Conv2dProblemSize const &problem_size,  ///< Extent of tensor
+      ConstPointer scale_pointer,  ///< Pointer to the start of the scale vector
+      ConstPointer bias_pointer,   ///< Pointer to the start of the bias vector
+      int thread_id                ///< ID of each participating thread
+      )
+      : PredicatedScaleBiasVectorAccessIterator(params, problem_size,
+                                                scale_pointer, bias_pointer,
+                                                thread_id, make_Coord(0, 0)) {}
+
+  CUTLASS_HOST_DEVICE
+  PredicatedScaleBiasVectorAccessIterator(
+      Params const &params,                   ///< Precomputed parameters object
+      Conv3dProblemSize const &problem_size,  ///< Extent of tensor
+      ConstPointer scale_pointer,  ///< Pointer to the start of the scale vector
+      ConstPointer bias_pointer,   ///< Pointer to the start of the bias vector
+      int thread_id                ///< ID of each participating thread
+      )
+      : PredicatedScaleBiasVectorAccessIterator(params, problem_size,
+                                                scale_pointer, bias_pointer,
+                                                thread_id, make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index (forwarded to the underlying iterator)
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// threadblock tiles; the offset is forwarded to the underlying iterator column-first
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
+  }
+
+  /// Returns the access pointer reported by the underlying iterator
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  /// Pre-increment: forwards to the underlying iterator's operator++() and
+  /// returns a reference to self.
+  ///
+  /// NOTE(review): the pitch-linear specialization's operator++() is a no-op,
+  /// so this call does not move the iterator; movement between tiles is done
+  /// by advance() and add_tile_offset().
+  CUTLASS_HOST_DEVICE
+  PredicatedScaleBiasVectorAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment.
+  ///
+  /// Copies the iterator, applies the pre-increment operator to self, and
+  /// returns the copy taken beforehand. Pre-increment only forwards to the
+  /// underlying iterator's operator++(), so the copy and self differ only by
+  /// whatever that call changes.
+  CUTLASS_HOST_DEVICE
+  PredicatedScaleBiasVectorAccessIterator operator++(int) {
+    PredicatedScaleBiasVectorAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Forwards to the underlying iterator's advance(); returns nothing.
+  CUTLASS_HOST_DEVICE
+  void advance() {
+    iterator_.advance();
+  }
+
+  /// Returns whether access is valid or not, as reported by the underlying iterator
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace conv 
+}  // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/predicated_scale_bias_vector_iterator.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/predicated_scale_bias_vector_iterator.h
new file mode 100755
index 000000000..24f0de4c2
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/predicated_scale_bias_vector_iterator.h
@@ -0,0 +1,371 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Templates calculating the address and predicates to the load of scale and bias vectors.
+
+    This iterator uses masks to guard out-of-bounds accesses.
+
+    A precomputed "Params" object minimizes the amount of state that must be
+   stored in registers, and integer addition is used to advance the pointer
+   through memory.
+*/
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// PredicatedScaleBiasVectorIterator
+///
+template <typename WarpShape,
+          typename Element,
+          typename Layout>
+class PredicatedScaleBiasVectorIterator;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedScaleBiasVectorIterator for wgrad pitch-linear data.
+///
+template <typename WarpShape_, typename Element_>
+class PredicatedScaleBiasVectorIterator<WarpShape_,
+                                        Element_,
+                                        layout::PitchLinear> {
+ public:
+
+  using WarpShape = WarpShape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ConstPointer = const Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  static int const kElementsPerAccess = 1;
+
+  using AccessType = AlignedArray<Element, kElementsPerAccess>;
+
+  static int const kIterations = WarpShape::kContiguous / 8;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<__half2, 2 * kIterations * kElementsPerAccess>;
+
+  /// Parameters object is precomputed state and is host-constructible
+  using Params = Conv2dWgradActivationIteratorOptimizedParams;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Parameters object with precomputed internal state
+  Params const &params_;
+
+  /// Internal pointer to first access of tile
+  ConstPointer scale_pointer_;
+  ConstPointer bias_pointer_;
+
+  /// Size of tensor
+  Conv2dProblemSize problem_size_;
+
+  int32_t thread_offset_;
+
+  // Channel dimension in contiguous dimension stays constant for each gemm_iteration_k
+  int32_t filter_c_[kIterations];
+
+ public:
+  /// Constructs the iterator from its precomputed state, threadblock offset,
+  /// and thread ID. `params` is held by reference and must outlive the iterator.
+  CUTLASS_HOST_DEVICE
+  PredicatedScaleBiasVectorIterator(
+      /// Precomputed parameters object (stored as a reference, not copied)
+      Params const &params,
+      /// Extent of tensor (copied into the iterator)
+      Conv2dProblemSize const &problem_size,
+      /// Pointer to the start of the scale vector
+      ConstPointer scale_pointer,
+      /// Pointer to the start of the bias vector
+      ConstPointer bias_pointer,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : params_(params),
+        scale_pointer_(scale_pointer),
+        bias_pointer_(bias_pointer),
+        problem_size_(problem_size) {
+    // Initializers are listed in member declaration order (avoids -Wreorder).
+    thread_offset_ = threadblock_offset.contiguous() + (thread_id % 32) / 4;
+  }
+
+  /// Delegating constructor: builds the iterator with a (0, 0) threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedScaleBiasVectorIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Extent of tensor
+      Conv2dProblemSize const &problem_size,
+      /// Pointer to start of scale vector
+      ConstPointer scale_pointer,
+      /// Pointer to start of bias vector
+      ConstPointer bias_pointer,
+      /// ID of each participating thread
+      int thread_id)
+      : PredicatedScaleBiasVectorIterator(params, problem_size,
+                                          scale_pointer, bias_pointer,
+                                          thread_id, make_Coord(0, 0)) {}
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole warp tiles
+  CUTLASS_DEVICE
+  void add_tile_offset(
+      TensorCoord const &tile_offset) {
+
+    thread_offset_ += (WarpShape::kContiguous * tile_offset.contiguous());
+
+    CUTLASS_PRAGMA_UNROLL
+    for(int c = 0; c < kIterations; ++c) {
+      int rsc_offset = thread_offset_ + c * 8;
+
+      int residual, tmp;
+      params_.sc_divmod(tmp, residual, rsc_offset);
+      params_.c_divmod(tmp, filter_c_[c], residual);
+    }
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+
+    frag.fill(__float2half2_rn(0.0f));
+    __half2 *frag_ptr = reinterpret_cast<__half2 *>(&frag);
+
+    // load scale
+    CUTLASS_PRAGMA_UNROLL
+    for (int c = 0; c < kIterations; ++c) {
+
+      cutlass::arch::global_load<
+        __half,
+        sizeof(AccessType)
+      >(
+        frag_ptr[c * 2].x,
+        scale_pointer_ + filter_c_[c],
+        true
+      );
+    }
+
+    // load bias
+    CUTLASS_PRAGMA_UNROLL
+    for (int c = 0; c < kIterations; ++c) {
+
+      cutlass::arch::global_load<
+        __half,
+        sizeof(AccessType)
+      >(
+        frag_ptr[c * 2 + 1].x,
+        bias_pointer_ + filter_c_[c],
+        true 
+      );
+    }
+
+    // duplicate scale
+    CUTLASS_PRAGMA_UNROLL
+    for (int c = 0; c < kIterations; ++c) {
+      frag_ptr[c * 2].y = frag_ptr[c * 2].x;
+    }
+
+    // duplicate bias
+    CUTLASS_PRAGMA_UNROLL
+    for (int c = 0; c < kIterations; ++c) {
+      frag_ptr[c * 2 + 1].y = frag_ptr[c * 2 + 1].x;
+    }
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedScaleBiasVectorIterator for row-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename WarpShape_,
+          typename Element_>
+class PredicatedScaleBiasVectorIterator<WarpShape_,
+                                        Element_,
+                                        layout::RowMajor> {
+ public:
+
+  using WarpShape = WarpShape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ConstPointer = const Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  using UnderlyingIterator = PredicatedScaleBiasVectorIterator<
+      layout::PitchLinearShape<WarpShape::kColumn, WarpShape::kRow>,
+      Element,
+      layout::PitchLinear>;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+  static int const kElementsPerAccess = UnderlyingIterator::kElementsPerAccess;
+  using Fragment = typename UnderlyingIterator::Fragment;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedScaleBiasVectorIterator;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Params() { }
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Conv2dProblemSize const &problem_size, Layout const &layout)
+        : params_(problem_size, layout::TensorNHWC(0, 0, 0)){};
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedScaleBiasVectorIterator(
+      ///< Precomputed parameters object
+      Params const &params,
+      ///< Extent of tensor
+      Conv2dProblemSize const &problem_size,
+      ///< Pointer to the start of the scale vector
+      ConstPointer scale_pointer,
+      ///< Pointer to the start of the bias vector
+      ConstPointer bias_pointer,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : iterator_(params.params_, problem_size, scale_pointer, bias_pointer,
+                  thread_id,
+                  layout::PitchLinearCoord(threadblock_offset.column(),
+                                           threadblock_offset.row())) {}
+
+  /// Construct a PredicatedTileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedScaleBiasVectorIterator(
+      Params const &params,                   ///< Precomputed parameters object
+      Conv2dProblemSize const &problem_size,  ///< Extent of tensor
+      ConstPointer scale_pointer,  ///< Pointer to the start of the scale vector
+      ConstPointer bias_pointer,   ///< Pointer to the start of the bias vector
+      int thread_id                ///< ID of each participating thread
+      )
+      : PredicatedScaleBiasVectorIterator(params, problem_size,
+                                          scale_pointer, bias_pointer,
+                                          thread_id, make_Coord(0, 0)) {}
+
+  /// Forwards the index to iterator_. NOTE(review): the PitchLinear specialization in this file declares no set_iteration_index.
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Advances the iterator by tile_offset, forwarded to the pitch-linear iterator
+  /// as (column, row); that iterator steps in units of WarpShape::kContiguous
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    iterator_.load(frag);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace conv 
+}  // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/threadblock_swizzle.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/threadblock_swizzle.h
new file mode 100755
index 000000000..67418e689
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/threadblock_swizzle.h
@@ -0,0 +1,193 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Implements several possible threadblock-swizzling functions mapping blockIdx to 
+      Convolution problems.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/platform/platform.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+CUTLASS_HOST_DEVICE
+static int get_strided_dgrad_tile_m(
+  cutlass::conv::Conv2dProblemSize const &problem_size,
+  int tile_size_m) {
+  // Returns the number of tiles (CTAs) along M used by the strided dgrad swizzles.
+  // CTAs in M dimension per starting filter position
+  int tile_m_per_filter = strided_dgrad_tile_m_per_filter(problem_size, tile_size_m);
+
+  // Inflate number of CTAs in M dimension to cover every starting filter position, even those that
+  // may fall out of valid MMA (Dy * w) but are needed to apply epilogue (beta * Dx_source)
+  // and point-wise fusion
+  int tile_m = tile_m_per_filter * int(problem_size.stride().product());
+
+  // There is a possible performance optimization here that could be up to 2x faster than the current
+  // CUTLASS strided dgrad performance for stride > filter (e.g., stride={2x2} and filter={1x1})
+  //
+  // * Optimization *
+  // Only launch CTAs in M dimension which contribute to a row in Dx output
+  //
+  //
+  // * Constraints *
+  // (A) stride <= filter, for example, stride={2x2} and filter={3x3}:
+  //       - (A.1): There are no constraints for this case and the optimization does
+  //                not affect this case's functionality or performance.
+  // (B) stride > filter, for example, stride={2x2} and filter={1x1}:
+  //       - (B.1): Dx output tensor should be zero initialized
+  //       - (B.2): The kernel epilogue cannot apply beta. Thus, beta should be zero
+
+  return tile_m;
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Threadblock swizzling function for strided dgrad convolution
+struct StridedDgradHorizontalThreadblockSwizzle : 
+  public gemm::threadblock::GemmHorizontalThreadblockSwizzle {
+
+  using Base = gemm::threadblock::GemmHorizontalThreadblockSwizzle;
+
+  CUTLASS_HOST_DEVICE
+  StridedDgradHorizontalThreadblockSwizzle() { }
+
+  /// Returns the shape of the problem in units of logical tiles
+  /// For ImplicitGemmConvolution Conv2d problem size: conv_operator(NPQK, NHWC, KRSC)
+  CUTLASS_HOST_DEVICE
+  static gemm::GemmCoord get_tiled_shape(
+    cutlass::conv::Operator conv_operator,
+    cutlass::conv::Conv2dProblemSize const &problem_size,
+    gemm::GemmCoord tile_size,
+    int split_k_slices) {
+
+    gemm::GemmCoord implicit_gemm_problem_size = 
+    cutlass::conv::implicit_gemm_problem_size(conv_operator, problem_size);
+
+    // compute number of tiles in m dimension
+    int tile_m = get_strided_dgrad_tile_m(problem_size, tile_size.m());
+
+    // compute number of tiles in n dimension 
+    int tile_n = (implicit_gemm_problem_size.n() + tile_size.n() - 1) / tile_size.n();
+
+    return gemm::GemmCoord(
+      tile_m,
+      tile_n,
+      split_k_slices);
+  }
+
+  /// Returns the shape of the problem in units of logical tiles
+  /// For GEMM problem size (MxNxK) (Do not use base class get_tiled_shape())
+  private:
+    using Base::get_tiled_shape;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Threadblock swizzling function for strided dgrad convolution
+template <int N = 1>
+struct StridedDgradIdentityThreadblockSwizzle : 
+  public gemm::threadblock::GemmIdentityThreadblockSwizzle<N> {
+
+  using Base = gemm::threadblock::GemmIdentityThreadblockSwizzle<N>;
+
+  CUTLASS_HOST_DEVICE
+  StridedDgradIdentityThreadblockSwizzle() { }
+
+  /// Returns the shape of the problem in units of logical tiles
+  /// For ImplicitGemmConvolution Conv2d problem size: conv_operator(NPQK, NHWC, KRSC)
+  CUTLASS_HOST_DEVICE
+  static gemm::GemmCoord get_tiled_shape(
+    cutlass::conv::Operator conv_operator,
+    cutlass::conv::Conv2dProblemSize const &problem_size,
+    gemm::GemmCoord tile_size,
+    int split_k_slices) {
+
+    gemm::GemmCoord implicit_gemm_problem_size = 
+    cutlass::conv::implicit_gemm_problem_size(conv_operator, problem_size);
+
+    // compute number of tiles in m dimension
+    int tile_m = get_strided_dgrad_tile_m(problem_size, tile_size.m());
+
+    // compute number of tiles in n dimension 
+    int tile_n = (implicit_gemm_problem_size.n() + tile_size.n() - 1) / tile_size.n();
+
+    return gemm::GemmCoord(
+      tile_m,
+      tile_n,
+      split_k_slices);
+  }
+
+  /// Returns the shape of the problem in units of logical tiles
+  /// For GEMM problem size (MxNxK) (Do not use base class get_tiled_shape())
+  private:
+    using Base::get_tiled_shape;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Threadblock swizzling function for depthwise direct 2D convolution
+template <int N = 1, int Output_N = 1, int Output_P = 1, int Output_Q = 1>
+struct DepthwiseDirect2dConvIdentityThreadblockSwizzle
+    : public gemm::threadblock::GemmIdentityThreadblockSwizzle<N> {
+  CUTLASS_HOST_DEVICE
+  DepthwiseDirect2dConvIdentityThreadblockSwizzle() {}
+
+  /// Returns the tiled shape: M fixed to 1, N = ceil(implicit GEMM n / tile_size.n()), K = split_k_slices
+  CUTLASS_HOST_DEVICE
+  static gemm::GemmCoord get_tiled_shape(cutlass::conv::Operator conv_operator,
+                            cutlass::conv::Conv2dProblemSize const &problem_size,
+                            gemm::GemmCoord tile_size,
+                            int split_k_slices) {
+        
+    gemm::GemmCoord implicit_gemm_problem_size =
+        cutlass::conv::implicit_gemm_problem_size(conv_operator, problem_size);
+
+    return gemm::GemmCoord(1,
+                     (implicit_gemm_problem_size.n() + tile_size.n() - 1) / tile_size.n(),
+                     split_k_slices);
+  }
+};
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/warp/mma_depthwise_simt.h b/lightllm-kernel/cutlass/include/cutlass/conv/warp/mma_depthwise_simt.h
new file mode 100755
index 000000000..ed385df03
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/warp/mma_depthwise_simt.h
@@ -0,0 +1,380 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing warp-level matrix multiply-accumulate operations.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/warp/mma.h"
+
+#include "cutlass/gemm/thread/mma.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/thread/depthwise_mma.h"
+
+
+#include "cutlass/gemm/warp/mma_simt_tile_iterator.h"
+#include "cutlass/gemm/warp/mma_simt_policy.h"
+
+#include "cutlass/gemm/warp/mma_simt.h"
+#include "cutlass/conv/warp/mma_depthwise_simt_tile_iterator.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace warp {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    /// Data type of A elements
+    typename ElementA_,
+    /// Layout of A matrix (concept: MatrixLayout)
+    typename LayoutA_,
+    /// Data type of B elements
+    typename ElementB_,
+    /// Layout of B matrix (concept: MatrixLayout)
+    typename LayoutB_,
+    /// Element type of C matrix
+    typename ElementC_,
+    /// Layout of C matrix (concept: MatrixLayout)
+    typename LayoutC_,
+    /// Shape of the warp in units of thread (concept: MmaSimtPolicy)
+    typename Policy_,
+    /// Number of partitions along K dimension
+    int PartitionsK = 1,
+    /// Complex transformation on operand A
+    ComplexTransform TransformA = ComplexTransform::kNone,
+    /// Complex transformation on operand B
+    ComplexTransform TransformB = ComplexTransform::kNone,
+    /// Used for partial specialization
+    typename Enable = bool>
+class MmaDepthwiseSimt
+    : public cutlass::gemm::warp::
+          MmaSimt<Shape_, ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, Policy_> {
+  using Base = cutlass::gemm::warp::
+      MmaSimt<Shape_, ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, Policy_>;
+      
+public:
+  /// Shape of warp-level matrix operation (concept: GemmShape)
+  using Shape = Shape_;
+
+  /// Data type of multiplicand A
+  using ElementA = ElementA_;
+
+  /// Layout of multiplicand A
+  using LayoutA = LayoutA_;
+
+  /// Data type of multiplicand B
+  using ElementB = ElementB_;
+
+  /// Layout of multiplicand B
+  using LayoutB = LayoutB_;
+
+  /// Data type of accumulator matrix C
+  using ElementC = ElementC_;
+
+  /// Layout of accumulator matrix C
+  using LayoutC = LayoutC_;
+
+  /// Shape of the warp in units of thread (concept: MmaLanePolicySimt)
+  using Policy = Policy_;
+
+  /// Indicates class of matrix operator
+  using OperatorClass = arch::OpClassSimt;
+
+  /// Hard-coded for now
+  using ArchTag = arch::Sm50;
+
+  /// Complex transform on A operand
+  static ComplexTransform const kTransformA = TransformA;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB = TransformB;
+
+public:
+
+  /// Iterates over the B operand in memory
+  using IteratorB = cutlass::conv::warp::DepthwiseMmaSimtTileIterator<
+    MatrixShape<Policy::LaneMmaShape::kK, Shape::kN>,
+    cutlass::gemm::Operand::kB,
+    ElementB,
+    LayoutB,
+    Policy,
+    PartitionsK,
+    Shape::kK
+  >;
+
+  /// Storage for B tile
+  using FragmentB = typename IteratorB::Fragment;
+
+  /// Storage for transformed B tile
+  using TransformedFragmentB = FragmentB;
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Ctor
+  CUTLASS_DEVICE
+  MmaDepthwiseSimt():Base() {}
+};
+
+/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    /// Shape of filter shape per threadblock - concept: gemm::GemmShape<Depth, Height, Width>
+    typename FilterShape_,
+    /// Shape of the output tile computed by thread- concept: conv::TensorNHWCShape<>
+    typename ThreadOutputShape_,
+    /// Shape of the output tile computed by threadblock - concept: conv::TensorNHWCShape<>
+    typename ThreadBlockOutputShape_,
+    /// Data type of A elements
+    typename ElementA_,
+    /// Layout of A matrix (concept: MatrixLayout)
+    typename LayoutA_,
+    /// Data type of B elements
+    typename ElementB_,
+    /// Layout of B matrix (concept: MatrixLayout)
+    typename LayoutB_,
+    /// Element type of C matrix
+    typename ElementC_,
+    /// Layout of C matrix (concept: MatrixLayout)
+    typename LayoutC_,
+    /// Shape of the warp in units of thread (concept: MmaSimtPolicy)
+    typename Policy_,
+    /// Iterator algo type
+    conv::IteratorAlgorithm IteratorAlgorithm_ = IteratorAlgorithm::kAnalytic,
+    /// Stride ( MatrixShape<Height, Width> )
+    typename StrideShape_ = cutlass::MatrixShape<-1, -1>,   
+    /// Dilation ( MatrixShape<Height, Width> )
+    typename DilationShape_ =  cutlass::MatrixShape<-1, -1>,
+    /// Activation Shape loaded by threadblock
+    typename ActivationShape_ = cutlass::conv::TensorNHWCShape<-1,-1,-1,-1>,
+    /// Number of partitions along K dimension
+    int PartitionsK = 1,
+    /// Complex transformation on operand A
+    ComplexTransform TransformA = ComplexTransform::kNone,
+    /// Complex transformation on operand B
+    ComplexTransform TransformB = ComplexTransform::kNone,
+    /// Used for partial specialization
+    typename Enable = bool>
+class MmaDepthwiseDirectConvSimt {
+ public:
+  /// Shape of warp-level matrix operation (concept: GemmShape)
+  using Shape = Shape_;
+
+  /// Shape of filter shape per threadblock - concept: gemm::GemmShape<Depth, Height, Width>
+  using FilterShape = FilterShape_;
+
+  /// Shape of the output tile computed by thread- concept: conv::TensorNHWCShape<>
+  using ThreadOutputShape = ThreadOutputShape_;
+
+  /// Shape of the output tile computed by threadblock - concept: conv::TensorNHWCShape<>
+  using ThreadBlockOutputShape = ThreadBlockOutputShape_;
+
+  /// Data type of multiplicand A
+  using ElementA = ElementA_;
+
+  /// Layout of multiplicand A
+  using LayoutA = LayoutA_;
+
+  /// Data type of multiplicand B
+  using ElementB = ElementB_;
+
+  /// Layout of multiplicand B
+  using LayoutB = LayoutB_;
+
+  /// Data type of accumulator matrix C
+  using ElementC = ElementC_;
+
+  /// Layout of accumulator matrix C
+  using LayoutC = LayoutC_;
+
+  /// Shape of the warp in units of thread (concept: MmaLanePolicySimt)
+  using Policy = Policy_;
+
+  /// Iterator algo type
+  static conv::IteratorAlgorithm const IteratorAlgorithm = IteratorAlgorithm_;
+
+  /// Stride ( MatrixShape<Height, Width> )
+  using StrideShape = StrideShape_; 
+
+  /// Dilation ( MatrixShape<Height, Width> )
+  using DilationShape = DilationShape_;
+  
+  /// Activation Shape loaded by threadblock
+  using ActivationShape = ActivationShape_;
+
+  /// Indicates class of matrix operator
+  using OperatorClass = arch::OpClassSimt;
+
+  /// Hard-coded for now
+  using ArchTag = arch::Sm50;
+
+  /// Complex transform on A operand
+  static ComplexTransform const kTransformA = TransformA;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB = TransformB;
+
+  static constexpr bool use_dp4a = (platform::is_same< layout::ColumnMajorInterleaved<4>, LayoutA>::value || 
+                                    platform::is_same< layout::RowMajorInterleaved<4>, LayoutA >::value) && 
+                                    platform::is_same< ElementA, int8_t >::value && 
+                                    platform::is_same< ElementB, int8_t >::value;
+
+  using dp4a_type = typename platform::conditional< use_dp4a , int8_t, bool >::type;
+
+  /// Thread-level matrix multiply accumulate operator
+  using ThreadMma = cutlass::conv::thread::DepthwiseDirectConvElementwiseInnerProduct<
+    cutlass::gemm::GemmShape<
+      Shape::kM / Policy::WarpShape::kRow,    // number of output pixels processed per thread
+      Shape::kN / Policy::WarpShape::kColumn, // number of channels processed per thread
+      1>,
+    ElementA,
+    ElementB,
+    ElementC,
+    arch::OpMultiplyAdd,
+    dp4a_type
+  >;
+
+  /// Underlying matrix multiply operator (concept: arch::Mma)
+  using ArchMmaOperator = typename ThreadMma::ArchMmaOperator;
+
+  /// Indicates math operator 
+  using MathOperator = typename ArchMmaOperator::Operator;
+  
+  /// Shape of the underlying instruction
+  using InstructionShape = cutlass::gemm::GemmShape<1,1,use_dp4a ? 4 : 1>;
+
+public:
+
+  /// Iterates over the A operand in memory
+  using IteratorA = cutlass::conv::warp::DepthwiseDirect2dConvSimtTileIterator<
+    MatrixShape<Shape::kM, Shape::kN>, // <output tile=(P*Q), output channels> per warp
+    FilterShape,
+    ThreadOutputShape,
+    ThreadBlockOutputShape,
+    cutlass::gemm::Operand::kA,
+    ElementA,
+    Policy,
+    IteratorAlgorithm,
+    StrideShape,
+    DilationShape,
+    ActivationShape,
+    PartitionsK,
+    Shape::kK
+  >;
+
+  /// Storage for A tile
+  using FragmentA = typename IteratorA::Fragment;
+
+  /// Storage for transformed A tile
+  using TransformedFragmentA = FragmentA;
+
+  /// Iterates over the B operand in memory
+  using IteratorB = cutlass::gemm::warp::MmaSimtTileIterator<
+    MatrixShape<1, Shape::kN>,
+    cutlass::gemm::Operand::kB,
+    ElementB,
+    LayoutB,
+    Policy,
+    PartitionsK,
+    Shape::kK
+  >;
+
+  /// Storage for B tile
+  using FragmentB = typename IteratorB::Fragment;
+
+  /// Storage for transformed B tile
+  using TransformedFragmentB = FragmentB;
+
+  /// Iterates over the C operand in memory
+  using IteratorC = cutlass::gemm::warp::MmaSimtTileIterator<
+    MatrixShape<Shape::kM, Shape::kN>,
+    cutlass::gemm::Operand::kC,
+    ElementC,
+    LayoutC,
+    Policy
+  >;
+
+  /// Storage for C tile
+  using FragmentC = typename ThreadMma::FragmentC;
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Ctor
+  CUTLASS_DEVICE
+  MmaDepthwiseDirectConvSimt() {}
+
+  /// Performs a warp-level multiply-accumulate by invoking ThreadMma on (d, a, b, c); group_idx is unused
+  CUTLASS_DEVICE
+  void operator()(
+    FragmentC &d, 
+    FragmentA a, 
+    FragmentB b, 
+    FragmentC const &c, int group_idx = 0) const {
+
+    ThreadMma mma;
+
+    mma(d, a, b, c);
+  }
+
+  /// Transform the mma operands to the required types
+  CUTLASS_DEVICE
+  void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B,
+                 FragmentA const &A, FragmentB const &B) const {
+    dst_A = A;
+    dst_B = B;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace conv
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/warp/mma_depthwise_simt_tile_iterator.h b/lightllm-kernel/cutlass/include/cutlass/conv/warp/mma_depthwise_simt_tile_iterator.h
new file mode 100755
index 000000000..26d9638ba
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/warp/mma_depthwise_simt_tile_iterator.h
@@ -0,0 +1,862 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines warp-level tile iterators that feed depthwise convolution operands to SIMT
+      matrix multiply operators
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/conv/convolution.h"
+
+#include "cutlass/arch/memory_sm75.h"
+
+#include "cutlass/layout/matrix.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/warp/mma_simt_policy.h"
+#include "cutlass/gemm/warp/mma_simt_tile_iterator.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace warp {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Primary template (declaration only): iterates over operands to warp-level SIMT matrix multiplies
+///
+/// concept: MutableRandomAccessContiguousTileIteratorConcept
+///
+template <
+  /// Size of the matrix to load (concept: MatrixShape)
+  typename Shape_,
+  /// Operand identity
+  cutlass::gemm::Operand Operand,
+  /// Data type of the operand's elements
+  typename Element_,
+  /// Layout of operand
+  typename Layout_,
+  /// Shape of the warp in units of thread (concept: MmaSimtPolicy)
+  typename Policy_,
+  /// Number of partitions along K dimension - used in sliced-K
+  int PartitionsK = 1,
+  /// Group Size along kPartition - used in sliced-K
+  int PartitionGroupSize = 1
+>
+class DepthwiseMmaSimtTileIterator;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for B operands of row-major layouts
+///
+/// Concept: MutableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Data type of B elements
+    typename Element_,
+    /// Shape of the warp in units of thread (concept: MmaSimtPolicy)
+    typename Policy_,
+    /// Number of partitions along K dimension
+    int PartitionsK,
+    /// Group Size along kPartition - used in sliced-K
+    int PartitionGroupSize>
+class DepthwiseMmaSimtTileIterator<Shape_,
+                                   cutlass::gemm::Operand::kB,
+                                   Element_,
+                                   layout::RowMajor,
+                                   Policy_,
+                                   PartitionsK,
+                                   PartitionGroupSize>
+    : public cutlass::gemm::warp::MmaSimtTileIterator<Shape_,
+                                               cutlass::gemm::Operand::kB,
+                                               Element_,
+                                               layout::RowMajor,
+                                               Policy_,
+                                               PartitionsK,
+                                               PartitionGroupSize> {
+
+  using Base = cutlass::gemm::warp::MmaSimtTileIterator<Shape_,
+                                               cutlass::gemm::Operand::kB,
+                                               Element_,
+                                               layout::RowMajor,
+                                               Policy_,
+                                               PartitionsK,
+                                               PartitionGroupSize>;
+ public:
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static cutlass::gemm::Operand const kOperand = cutlass::gemm::Operand::kB;
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of policy
+  using Layout = layout::RowMajor;
+
+  /// Decomposition of elements among threads
+  using Policy = Policy_;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = typename Base::TensorRef;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Thread-level shape of a fragment
+  using ThreadShape = typename Base::ThreadShape;
+
+  /// Number of individual loads
+  using Iterations =  typename Base::Iterations;
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = typename Base::Fragment;
+
+  static_assert(Policy::LaneMmaShape::kN == 1, "Each thread should be 1 element per LDS along the k-dim");
+  
+private:
+
+  MatrixCoord lane_offset_;
+  int channel_idx_;
+  int base_channel_idx_;
+  int warps_n_;
+
+ public:
+  
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  DepthwiseMmaSimtTileIterator():Base() { }
+
+  /// Constructor from TensorRef: forwards (ref, lane_id) to Base and resets the channel-tracking members
+  CUTLASS_HOST_DEVICE
+  DepthwiseMmaSimtTileIterator(
+    TensorRef ref, 
+    int lane_id
+  ) : Base(ref, lane_id) {
+
+    // compute offset based on thread ID and lane layout
+    typename Policy::LaneLayout lane_layout = Policy::get_lane_layout();
+    // -1 marks warps_n_ as "not set yet"; add_tile_offset() fills it in on its first call.
+    warps_n_ = -1;
+    channel_idx_ = 0;
+    base_channel_idx_ = 0;
+    lane_offset_ = lane_layout.inverse(lane_id) * MatrixCoord(0, Policy::LaneMmaShape::kN);
+  }
+  
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  DepthwiseMmaSimtTileIterator &add_tile_offset(TensorCoord const &coord) {
+    // Latch coord.column() while warps_n_ still holds the -1 sentinel; afterwards it is left as is.
+    if(warps_n_ == -1){
+        warps_n_ = coord.column();
+    }
+    
+    Base::add_tile_offset(coord);
+    return *this;
+  }
+
+  /// Fills frag slot by slot: a slot is read with arch::shared_load when its k index matches this thread's channel, otherwise it is zero-filled.
+  CUTLASS_HOST_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
+    Array<Element, Policy::LaneMmaShape::kN> *dst_ptr =
+        reinterpret_cast<Array<Element, Policy::LaneMmaShape::kN> *>(&frag);
+    // Visit every (k, n) slot of the fragment; slot index is n + k * Iterations::kColumn.
+    CUTLASS_PRAGMA_UNROLL
+    for (int k = 0; k < Iterations::kRow; ++k) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = 0; n < Iterations::kColumn; ++n) {
+        // Source address: row stepped back by (channel_idx_ - base_channel_idx_), column chosen by n.
+        void const *ptr = this->ref_.data() +
+                          this->ref_.offset({-(channel_idx_ - base_channel_idx_),
+                                             n * Policy::WarpShape::kColumn}) +
+                          pointer_offset / Policy::LaneMmaShape::kN;
+
+        // Base_k of a warp +  Base_k of current threads.
+        int thread_k_base_idx =
+            warps_n_ * Shape::kColumn / Policy::LaneMmaShape::kN + lane_offset_.column();
+
+        if (channel_idx_ + k == thread_k_base_idx + n * Policy::WarpShape::kColumn) {
+          // Depthwise kernel would only do computation when channel == k.
+          // Loads an element when the current computation channel == the k corresponding to this thread.
+          arch::shared_load(dst_ptr[n + k * Iterations::kColumn], ptr);
+        } else {
+          // Skip the shared-memory load for a non-matching channel and zero-fill the slot instead.
+          dst_ptr[n + k * Iterations::kColumn].fill(Element(0));
+        }
+      }
+    }
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    load_with_pointer_offset(frag, 0);
+  }
+  
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator: it records k_group as channel_idx_, and also as
+  /// base_channel_idx_ when k_group is a non-zero multiple of PartitionGroupSize.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    if(k_group % PartitionGroupSize == 0 && k_group != 0){
+      base_channel_idx_ = k_group;
+    }
+    channel_idx_ = k_group;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Size of filter (the specializations read FilterShape::kRow and FilterShape::kColumn)
+    typename FilterShape_,
+    /// Output tile computed by each thread (concept: TensorNHWC)
+    typename ThreadOutputShape_,
+    /// Output tile of the threadblock (concept: TensorNHWC)
+    typename ThreadBlockOutputShape_,
+    /// Operand identity
+    cutlass::gemm::Operand Operand,
+    /// Data type of the operand's elements
+    typename Element_,
+    /// Shape of the warp in units of thread (concept: MmaSimtPolicy)
+    typename Policy_,
+    /// Iterator algo type
+    conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kAnalytic,
+    /// Stride ( MatrixShape<Height, Width> )
+    typename StrideShape = cutlass::MatrixShape<-1, -1>,   
+    /// Dilation ( MatrixShape<Height, Width> )
+    typename DilationShape =  cutlass::MatrixShape<-1, -1>,
+    /// Activation Shape loaded by threadblock
+    typename ActivationShape = cutlass::conv::TensorNHWCShape<-1,-1,-1,-1>,
+    /// Number of partitions along K dimension - used in sliced-K
+    int PartitionsK = 1,
+    /// Group Size along kPartition - used in sliced-K
+    int PartitionGroupSize = 1>
+class DepthwiseDirect2dConvSimtTileIterator;
+
+
+/// Specialization for A operands of row-major layouts
+///
+/// Concept: MutableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Size of filter (concept: gemm::GemmShape<Depth, Height, Width>)
+    typename FilterShape_,
+    /// Size of the matrix to load (concept: TensorNHWC)
+    typename ThreadOutputShape_,
+    /// Size of the matrix to load (concept: TensorNHWC)
+    typename ThreadBlockOutputShape_,
+    /// Data type of A elements
+    typename Element_,
+    /// Shape of the warp in units of thread (concept: MmaSimtPolicy)
+    typename Policy_,
+    /// Iterator algo type
+    conv::IteratorAlgorithm IteratorAlgorithm,
+    /// Stride ( MatrixShape<Height, Width> )
+    typename StrideShape,   
+    /// Dilation ( MatrixShape<Height, Width> )
+    typename DilationShape,
+    /// Activation Shape loaded by threadblock
+    typename ActivationShape,
+    /// Number of partitions along K dimension - used in sliced-K
+    int PartitionsK,
+    /// Group Size along kPartition - used in sliced-K
+    int PartitionGroupSize>
+class DepthwiseDirect2dConvSimtTileIterator<Shape_,
+                                            FilterShape_,
+                                            ThreadOutputShape_,
+                                            ThreadBlockOutputShape_,
+                                            cutlass::gemm::Operand::kA,
+                                            Element_,
+                                            Policy_,
+                                            IteratorAlgorithm,
+                                            StrideShape,   
+                                            DilationShape,
+                                            ActivationShape,
+                                            PartitionsK,
+                                            PartitionGroupSize> {
+ public:
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Shape of filter (concept: gemm::GemmShape<Depth, Height, Width>)
+  using FilterShape = FilterShape_;
+
+  /// Shape of tile to load (concept: TensorNHWC)
+  using ThreadOutputShape = ThreadOutputShape_;
+
+  /// Shape of tile to load (concept: TensorNHWC)
+  using ThreadBlockOutputShape = ThreadBlockOutputShape_;
+
+  /// Operand tag
+  static cutlass::gemm::Operand const kOperand = cutlass::gemm::Operand::kA;
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of policy
+  using Layout = layout::RowMajor;
+
+  /// Decomposition of elements among threads
+  using Policy = Policy_;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  //
+  // Derived quantities
+  //
+
+  static_assert(!(Shape::kRow % Policy::WarpShape::kRow), 
+    "The warp-level GEMM M size must be divisible by the number of threads arranged along the M dimension.");
+
+  static_assert(Shape::kRow > 0, "Shape::kRow must be greater than zero.");
+  static_assert(Shape::kColumn > 0, "Shape::kColumn must be greater than zero.");
+  static_assert(Policy::WarpShape::kRow > 0, "Policy::WarpShape::kRow must be greater than zero.");
+  static_assert(Shape::kRow / Policy::WarpShape::kRow > 0, "Shape::kRow / Policy::WarpShape::kRow must be greater than zero.");
+
+// Thread-level shape of a fragment
+  using ThreadShape = MatrixShape<
+    ThreadOutputShape::kNHW, // Output tile shape Computed by current threads
+    ThreadOutputShape::kC
+  >;
+
+  static_assert(!(ThreadShape::kColumn % Policy::LaneMmaShape::kN), 
+    "Thread-level GEMM must be divisible by Policy::LaneMmaShape.");
+
+  /// Number of individual loads
+  using Iterations = MatrixShape<
+    ThreadShape::kRow,
+    ThreadShape::kColumn / Policy::LaneMmaShape::kN
+  >;
+
+  using ThreadTileCount = MatrixShape<
+    ThreadBlockOutputShape::kH / ThreadOutputShape::kH,
+    ThreadBlockOutputShape::kW / ThreadOutputShape::kW
+  >;
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = Array<Element, ThreadShape::kCount>;
+
+protected:
+
+  /// Internal reference
+  cutlass::TensorRef<Array<Element, Policy::LaneMmaShape::kN>, layout::RowMajor> ref_;
+
+  int activation_offset[ThreadOutputShape::kH][ThreadOutputShape::kW][Iterations::kColumn];
+  int iterator_r_;
+  int iterator_s_;
+  int iterator_offset_;
+
+  int inc_next_s_ ;
+  int inc_next_r_ ;
+  
+  MatrixCoord lane_offset_;
+public:
+  
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  DepthwiseDirect2dConvSimtTileIterator() { }
+
+  /// Constructor from TensorRef; inc_next_s_, inc_next_r_ and activation_offset are not set here (see setup_initial_status)
+  CUTLASS_HOST_DEVICE
+  DepthwiseDirect2dConvSimtTileIterator(
+    TensorRef ref, 
+    int lane_id
+  ) {
+
+    // compute offset based on thread ID and lane layout
+    typename Policy::LaneLayout lane_layout = Policy::get_lane_layout();
+
+    // Set channel offset
+    lane_offset_ = lane_layout.inverse(lane_id) * MatrixCoord(0, Policy::LaneMmaShape::kN);
+    // Shift the caller's ref by the lane offset before re-typing it.
+    ref.add_coord_offset(lane_offset_);
+    // Re-view the data as LaneMmaShape::kN-wide vectors, scaling the stride to match.
+    ref_.reset(reinterpret_cast<Array<Element, Policy::LaneMmaShape::kN> *>(ref.data()),
+               ref.stride(0) / Policy::LaneMmaShape::kN);
+    // Start at the first filter position with no accumulated offset.
+    iterator_r_ = 0;
+    iterator_s_ = 0;
+    iterator_offset_ = 0;
+  }
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  DepthwiseDirect2dConvSimtTileIterator &add_pointer_offset(LongIndex offset) {
+    ref_.add_pointer_offset(offset);
+    return *this;
+  }
+
+  /// Caches params.inc_next and precomputes activation_offset[p][q][col] for this thread's output tile.
+  template<typename Params>
+  CUTLASS_HOST_DEVICE
+  void setup_initial_status(Params const& params)  {
+    // Increments that operator++ adds to iterator_offset_ when stepping s ([0]) and r ([1]).
+    inc_next_s_ = params.inc_next[0];
+    inc_next_r_ = params.inc_next[1];
+    // NOTE(review): reads threadIdx.x although the method is annotated CUTLASS_HOST_DEVICE.
+    // Get base HW offset of current threads
+    int threadgroup = threadIdx.x / (ThreadBlockOutputShape::kC / ThreadOutputShape::kC);
+    int base_p_ =
+        (threadgroup / (ThreadTileCount::kColumn)) * ThreadOutputShape::kH;
+    int base_q_ =
+        (threadgroup % (ThreadTileCount::kColumn)) * ThreadOutputShape::kW;
+    // The offset below does not depend on col, so all col entries of a (p, q) pair are equal.
+    CUTLASS_PRAGMA_UNROLL
+    for (int p = 0; p < ThreadOutputShape::kH; ++p) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int q = 0; q < ThreadOutputShape::kW; ++q) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int col = 0; col < Iterations::kColumn; ++col) {
+          int base_w = (base_q_ + q) * params.stride[0];
+          int base_h = (base_p_ + p) * params.stride[1];
+
+          int offset = base_h * params.activation_tile_w + base_w;
+          activation_offset[p][q][col] = offset;
+        }
+      }
+    }
+  }
+
+
+  /// Records a tile offset in lane_offset_ only; ref_ is not moved and coord.column() is not read
+  CUTLASS_HOST_DEVICE
+  DepthwiseDirect2dConvSimtTileIterator &add_tile_offset(TensorCoord const &coord) {
+    // Shift the stored row by coord.row() whole tiles (Shape::kRow each); the column is kept as is.
+    lane_offset_ = MatrixCoord({lane_offset_.row() + coord.row() * Shape::kRow, lane_offset_.column()});
+    return *this;
+  }
+
+  /// Rebases ref_ by pointer_offset / sizeof(Element) / LaneMmaShape::kN slots and resets the filter position
+  CUTLASS_HOST_DEVICE
+  void advance(int32_t pointer_offset) {
+    // sizeof() is unsigned: cast it so that a negative pointer_offset is divided as a signed value.
+    ref_.reset(ref_.data() + pointer_offset / int32_t(sizeof(Element)) / Policy::LaneMmaShape::kN);
+    iterator_s_ = iterator_r_ = 0;
+    iterator_offset_ = 0;
+  }
+
+  /// Steps to the next filter position: s first, then r, wrapping back to (0, 0) with a zero offset
+  CUTLASS_HOST_DEVICE
+  DepthwiseDirect2dConvSimtTileIterator &operator++() {
+    ++iterator_s_;
+    if (iterator_s_ < FilterShape::kColumn) {
+      iterator_offset_ += inc_next_s_;
+      // s has not reached FilterShape::kColumn yet: done.
+      return *this;
+    }
+    // s wrapped: restart it and step r instead.
+    iterator_s_ = 0;
+
+    ++iterator_r_;
+    if (iterator_r_ < FilterShape::kRow) {
+      iterator_offset_ += inc_next_r_;
+      return *this;
+    }
+    // r wrapped as well: return to the first position and clear the accumulated offset.
+    iterator_r_ = 0;
+    iterator_offset_ = 0;
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  DepthwiseDirect2dConvSimtTileIterator & operator--() {
+    // Do nothing
+    return *this;
+  }
+
+  /// Loads one LaneMmaShape::kN-wide vector per (p, q, n) from ref_ into frag via arch::shared_load.
+  CUTLASS_HOST_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
+    // View the fragment as an array of LaneMmaShape::kN-wide vectors.
+    Array<Element, Policy::LaneMmaShape::kN> *dst_ptr = 
+      reinterpret_cast<Array<Element, Policy::LaneMmaShape::kN> *>(&frag);
+    // Slot layout: output position (q + p * kW) major, column iteration n minor, so that
+    // distinct (p, q, n) never share a slot when Iterations::kColumn > 1 (same index when it is 1).
+    CUTLASS_PRAGMA_UNROLL
+    for (int p = 0; p < ThreadOutputShape::kH; ++p) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int q = 0; q < ThreadOutputShape::kW; ++q) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int n = 0; n < Iterations::kColumn; ++n) {
+          void const *ptr = ref_.data() +
+                            ref_.offset({activation_offset[p][q][n] + (iterator_offset_),
+                                         n * Policy::WarpShape::kColumn}) +
+                            pointer_offset / Policy::LaneMmaShape::kN;
+          arch::shared_load(dst_ptr[n + (q + p * ThreadOutputShape::kW) * Iterations::kColumn], ptr);
+        }
+      }
+    }
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    load_with_pointer_offset(frag, 0);
+  }
+  
+  /// Stores a fragment to memory at the location pointed to by the iterator
+  CUTLASS_HOST_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const {
+    // Do nothing at present.
+  }
+
+  /// Forwards to store_with_pointer_offset(frag, 0); the pointer_offset argument is ignored
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag, Index pointer_offset) const {
+    store_with_pointer_offset(frag, 0);
+  }
+
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    // no operation here
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/// Specialization for A operands of row-major layouts with IteratorAlgorithm::kFixedStrideDilation
+///
+/// Concept: MutableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Size of filter (concept: gemm::GemmShape<Depth, Height, Width>)
+    typename FilterShape_,
+    /// Size of the matrix to load (concept: TensorNHWC)
+    typename ThreadOutputShape_,
+    /// Size of the matrix to load (concept: TensorNHWC)
+    typename ThreadBlockOutputShape_,
+    /// Data type of A elements
+    typename Element_,
+    /// Shape of the warp in units of thread (concept: MmaSimtPolicy)
+    typename Policy_,
+    /// Stride ( MatrixShape<Height, Width> )
+    typename StrideShape_,
+    /// Dilation ( MatrixShape<Height, Width> )
+    typename DilationShape_,
+    /// Activation Shape loaded by threadblock
+    typename ActivationShape_,
+    /// Number of partitions along K dimension - used in sliced-K
+    int PartitionsK,
+    /// Group Size along kPartition - used in sliced-K
+    int PartitionGroupSize>
+class DepthwiseDirect2dConvSimtTileIterator<Shape_,
+                                            FilterShape_,
+                                            ThreadOutputShape_,
+                                            ThreadBlockOutputShape_,
+                                            cutlass::gemm::Operand::kA,
+                                            Element_,
+                                            Policy_,
+                                            IteratorAlgorithm::kFixedStrideDilation,
+                                            StrideShape_,
+                                            DilationShape_,
+                                            ActivationShape_,
+                                            PartitionsK,
+                                            PartitionGroupSize> {
+ public:
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Shape of filter (concept: gemm::GemmShape<Depth, Height, Width>)
+  using FilterShape = FilterShape_;
+
+  /// Shape of tile to load (concept: TensorNHWC)
+  using ThreadOutputShape = ThreadOutputShape_;
+
+  /// Shape of tile to load (concept: TensorNHWC)
+  using ThreadBlockOutputShape = ThreadBlockOutputShape_;
+
+  /// Stride ( MatrixShape<Height, Width> )
+  using StrideShape = StrideShape_;
+
+  /// Dilation ( MatrixShape<Height, Width> )
+  using DilationShape = DilationShape_;
+
+  /// Activation Shape loaded by threadblock
+  using ActivationShape = ActivationShape_;
+
+  /// Operand tag
+  static cutlass::gemm::Operand const kOperand = cutlass::gemm::Operand::kA;
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of policy
+  using Layout = layout::RowMajor;
+
+  /// Decomposition of elements among threads
+  using Policy = Policy_;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  //
+  // Derived quantities
+  //
+
+  static_assert(!(Shape::kRow % Policy::WarpShape::kRow),
+                "The warp-level GEMM M size must be divisible by the number of threads arranged "
+                "along the M dimension.");
+
+  static_assert(Shape::kRow > 0, "Shape::kRow must be greater than zero.");
+  static_assert(Shape::kColumn > 0, "Shape::kColumn must be greater than zero.");
+  static_assert(Policy::WarpShape::kRow > 0, "Policy::WarpShape::kRow must be greater than zero.");
+  static_assert(Shape::kRow / Policy::WarpShape::kRow > 0,
+                "Shape::kRow / Policy::WarpShape::kRow must be greater than zero.");
+
+  // Activations loaded by threadblock
+  static int const ThreadActivationShapeH = (ThreadOutputShape::kH - 1) * StrideShape::kRow +
+                                            (FilterShape::kRow - 1) * DilationShape::kRow + 1;
+
+  static int const ThreadActivationShapeW = (ThreadOutputShape::kW - 1) * StrideShape::kColumn +
+                                            (FilterShape::kColumn - 1) * DilationShape::kColumn + 1;
+
+  using ThreadActivationShape = cutlass::conv::
+      TensorNHWCShape<1, ThreadActivationShapeH, ThreadActivationShapeW, ThreadOutputShape::kC>;
+
+  // Thread-level shape of a fragment
+  using ThreadShape =
+      MatrixShape<ThreadOutputShape::kNHW,
+                  ThreadOutputShape::kC>;
+
+  static_assert(!(ThreadShape::kColumn % Policy::LaneMmaShape::kN),
+                "Thread-level GEMM must be divisible by Policy::LaneMmaShape.");
+
+  /// Number of individual loads
+  using Iterations =
+      MatrixShape<ThreadShape::kRow, ThreadShape::kColumn / Policy::LaneMmaShape::kN>;
+
+  using ThreadTileCount = MatrixShape<ThreadBlockOutputShape::kH / ThreadOutputShape::kH,
+                                      ThreadBlockOutputShape::kW / ThreadOutputShape::kW>;
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = Array<Element, ThreadShape::kCount>;
+
+ protected:
+  /// Internal reference
+  cutlass::TensorRef<Array<Element, Policy::LaneMmaShape::kN>, layout::RowMajor> ref_;
+
+  Array<Element, Policy::LaneMmaShape::kN>
+      activation[ThreadActivationShape::kH][ThreadActivationShape::kW][Iterations::kColumn];
+  int iterator_r_;
+  int iterator_s_;
+
+
+  MatrixCoord lane_offset_;
+
+ public:
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  DepthwiseDirect2dConvSimtTileIterator() {}
+
+  /// Constructor from TensorRef
+  CUTLASS_HOST_DEVICE
+  DepthwiseDirect2dConvSimtTileIterator(TensorRef ref, int lane_id) {
+    // compute offset based on thread ID and lane layout
+    typename Policy::LaneLayout lane_layout = Policy::get_lane_layout();
+
+    // Set channel offset
+    lane_offset_ = lane_layout.inverse(lane_id) * MatrixCoord(0, Policy::LaneMmaShape::kN);
+
+    ref.add_coord_offset(lane_offset_);
+
+    ref_.reset(reinterpret_cast<Array<Element, Policy::LaneMmaShape::kN> *>(ref.data()),
+               ref.stride(0) / Policy::LaneMmaShape::kN);
+
+    iterator_r_ = 0;
+    iterator_s_ = 0;
+  }
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  DepthwiseDirect2dConvSimtTileIterator &add_pointer_offset(LongIndex offset) {
+    ref_.add_pointer_offset(offset);
+    return *this;
+  }
+
+  /// Preloads this thread's activation window from ref_ into the activation member array.
+  template <typename Params>
+  CUTLASS_HOST_DEVICE void setup_initial_status(
+      Params const &params) {
+    // NOTE: params is not read here; stride and activation width come from template shapes.
+    // Get base HW offset of current threads
+    int threadgroup = threadIdx.x / (ThreadBlockOutputShape::kC / ThreadOutputShape::kC);
+    int base_h =
+        (threadgroup / (ThreadTileCount::kColumn)) * ThreadOutputShape::kH * StrideShape::kRow;
+    int base_w =
+        (threadgroup % (ThreadTileCount::kColumn)) * ThreadOutputShape::kW * StrideShape::kColumn;
+    // Walk the ThreadActivationShape kH x kW window, one vector per column iteration.
+    CUTLASS_PRAGMA_UNROLL
+    for (int h = 0; h < ThreadActivationShape::kH; ++h) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int w = 0; w < ThreadActivationShape::kW; ++w) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int col = 0; col < Iterations::kColumn; ++col) {
+          int offset = (base_h + h) * ActivationShape::kW + (base_w + w);
+          // Row = linearised (h, w) position, column = col-th block of Policy::WarpShape::kColumn.
+          void const *ptr = ref_.data() + ref_.offset({offset, col * Policy::WarpShape::kColumn});
+          arch::shared_load(activation[h][w][col], ptr);
+        }
+      }
+    }
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  DepthwiseDirect2dConvSimtTileIterator &add_tile_offset(TensorCoord const &coord) {
+    // Set warp row and col start
+    lane_offset_ =
+        MatrixCoord({lane_offset_.row() + coord.row() * Shape::kRow, lane_offset_.column()});
+    return *this;
+  }
+
+  /// Rebases ref_ by pointer_offset / sizeof(Element) / LaneMmaShape::kN slots and resets the filter position
+  CUTLASS_HOST_DEVICE
+  void advance(int32_t pointer_offset) {
+    // sizeof() is unsigned: cast it so that a negative pointer_offset is divided as a signed value.
+    ref_.reset(ref_.data() + pointer_offset / int32_t(sizeof(Element)) / Policy::LaneMmaShape::kN);
+    iterator_s_ = iterator_r_ = 0;
+  }
+
+  /// Steps to the next filter position: s first, then r, wrapping back to (0, 0) after the last one
+  CUTLASS_HOST_DEVICE
+  DepthwiseDirect2dConvSimtTileIterator &operator++() {
+    ++iterator_s_;
+    if (iterator_s_ < FilterShape::kColumn) {
+      return *this;
+    }
+    // s wrapped: restart it and step r instead.
+    iterator_s_ = 0;
+
+    ++iterator_r_;
+    if (iterator_r_ < FilterShape::kRow) {
+      return *this;
+    }
+    // r wrapped as well: return to the first position.
+    iterator_r_ = 0;
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  DepthwiseDirect2dConvSimtTileIterator &operator--() {
+    // Do nothing
+    return *this;
+  }
+
+  /// Copies the preloaded activation vectors for filter position (iterator_r_, iterator_s_) into frag; pointer_offset is not used.
+  CUTLASS_HOST_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
+    Array<Element, Policy::LaneMmaShape::kN> *dst_ptr =
+        reinterpret_cast<Array<Element, Policy::LaneMmaShape::kN> *>(&frag);
+    // Slot layout: output position (q + p * kW) major, column iteration n minor (see index below).
+    CUTLASS_PRAGMA_UNROLL
+    for (int p = 0; p < ThreadOutputShape::kH; ++p) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int q = 0; q < ThreadOutputShape::kW; ++q) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int n = 0; n < Iterations::kColumn; ++n) {
+          const int h = p * StrideShape::kRow + iterator_r_ * DilationShape::kRow;
+          const int w = q * StrideShape::kColumn + iterator_s_ * DilationShape::kColumn;
+          // Scale the position by Iterations::kColumn so distinct (p, q, n) never share a slot.
+          dst_ptr[n + (q + p * ThreadOutputShape::kW) * Iterations::kColumn] = activation[h][w][n];
+        }
+      }
+    }
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const { load_with_pointer_offset(frag, 0); }
+
+  /// Stores a fragment to memory at the location pointed to by the iterator
+  CUTLASS_HOST_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const {
+    // Do nothing at present.
+  }
+
+  /// Forwards to store_with_pointer_offset(frag, 0); the pointer_offset argument is ignored
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag, Index pointer_offset) const {
+    store_with_pointer_offset(frag, 0);
+  }
+
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    // no operation here
+  }
+};
+
+} // namespace warp
+} // namespace conv
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/warp/scale_bias_relu_transform.h b/lightllm-kernel/cutlass/include/cutlass/conv/warp/scale_bias_relu_transform.h
new file mode 100755
index 000000000..4da31ab81
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/conv/warp/scale_bias_relu_transform.h
@@ -0,0 +1,221 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing warp-level per channel scale+bias+relu before
+   matrix multiply-accumulate operations targeting Tensor Cores.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/platform/platform.h"
+
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/arch/memory_sm75.h"
+#include "cutlass/arch/mma_sm75.h" 
+#include "cutlass/arch/mma_sm80.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/warp/mma.h"
+
+#include "cutlass/gemm/warp/mma_tensor_op_policy.h"
+
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h"
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace warp {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename FragmentActivations, typename FragmentScaleBias>
+struct FpropScaleBiasReluTransform {
+
+  using T = typename FragmentActivations::Element;
+
+  static int const NumActivations = FragmentActivations::kElements;
+  static int const NumScaleBias = FragmentScaleBias::kElements;
+  static int const MmaElements = 2;
+  // One element has one scale and one bias
+  static int const MmaScaleBiasPair = 2;
+  // 16816 has 2 columns
+  static int const MmaCols = 2;
+
+  using MmaOperand = Array<T, MmaElements>;
+  using ScaleBiasOperand = Array<T, MmaElements * MmaScaleBiasPair>;
+
+  CUTLASS_DEVICE
+  void transform(MmaOperand &activations, ScaleBiasOperand const &scale_bias) {
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
+    uint32_t *ptr_activations = reinterpret_cast<uint32_t *>(&activations);
+    uint32_t const *ptr_scale_bias = reinterpret_cast<uint32_t const *>(&scale_bias);
+
+    // Apply per-channel scale+bias+relu if the data is not the special NaN
+    // (0x7eff).  If it is the special NaN (0x7eff), hard-code the output to 0.
+
+    // We assume the two FP16 values of a pair are either both in bounds or both
+    // out of bounds, which requires C to be an even number.
+    asm volatile(
+        "{\n\t"
+        " .reg .pred %%p;\n\t"
+        " .reg .b32 t1;\n\t"
+        " setp.eq.u32 %%p, %2, %4;\n\t"
+        " fma.rn.f16x2.relu t1, %1, %2, %3;\n"
+        " selp.u32 %0, 0, t1, %%p;\n\t"
+        "}\n"
+        : "=r"(ptr_activations[0])
+        : "r"(ptr_scale_bias[0]), "r"(ptr_activations[0]),
+          "r"(ptr_scale_bias[1]), "n"(cutlass::arch::OOB_NAN_F16x2));
+#else
+    assert(0);
+#endif
+  }
+
+  CUTLASS_DEVICE
+  void operator()(FragmentActivations &activations,
+                  FragmentScaleBias const &scale_bias) {
+    MmaOperand *ptr_activations = reinterpret_cast<MmaOperand *>(&activations);
+    ScaleBiasOperand const *ptr_scale_bias =
+        reinterpret_cast<ScaleBiasOperand const *>(&scale_bias);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < (NumActivations / MmaElements); ++i) {
+      transform(ptr_activations[i], ptr_scale_bias[(i / MmaScaleBiasPair) % MmaCols]);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename FragmentActivations, typename FragmentScaleBias>
+struct WgradScaleBiasReluTransform {
+
+  using T = typename FragmentActivations::Element;
+
+  static int const NumActivations = FragmentActivations::kElements;
+  static int const NumScaleBias = FragmentScaleBias::kElements;
+  static int const MmaElements = 2;
+  // One element has one scale and one bias
+  static int const MmaScaleBiasPair = 2;
+  // 16816 has 2 rows
+  static int const MmaRows = 2;
+
+  using MmaOperand = Array<T, MmaElements>;
+  using ScaleBiasOperand = Array<__half2, MmaScaleBiasPair>;
+
+  CUTLASS_DEVICE
+  void transform(MmaOperand &activations, ScaleBiasOperand const &scale_bias) {
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
+
+    __half2 *ptr_activations = reinterpret_cast<__half2 *>(&activations);
+    uint32_t const *ptr_scale_bias = reinterpret_cast<uint32_t const *>(&scale_bias);
+
+#if 1 
+    // CUDA + PTX version
+
+    bool h1_oob = (reinterpret_cast<uint16_t &>(ptr_activations[0].x) == cutlass::arch::OOB_NAN_F16);
+    bool h2_oob = (reinterpret_cast<uint16_t &>(ptr_activations[0].y) == cutlass::arch::OOB_NAN_F16);
+
+    // Apply per channel scale+bias+relu if the data is not a special NaN
+    // (0x7eff).  If it is a special NaN (0x7eff), hard code the output to 0.
+
+    // We cannot guarantee that the pair of F16 values are both in bounds or both
+    // out of bounds because C x R x S can be an odd number.
+    asm volatile(
+        "{\n\t"
+        " fma.rn.f16x2.relu %0, %1, %2, %3;\n"
+        "}"
+        : "=r"(reinterpret_cast<uint32_t &>(ptr_activations[0]))
+        : "r"(ptr_scale_bias[0]), "r"(reinterpret_cast<uint32_t &>(ptr_activations[0])),
+          "r"(ptr_scale_bias[1]));
+
+    reinterpret_cast<uint32_t &>(ptr_activations[0]) = h1_oob ?
+            (reinterpret_cast<uint32_t &>(ptr_activations[0]) & 0xffff0000) :
+            reinterpret_cast<uint32_t &>(ptr_activations[0]);
+
+    reinterpret_cast<uint32_t &>(ptr_activations[0]) = h2_oob ?
+            (reinterpret_cast<uint32_t &>(ptr_activations[0]) & 0xffff) :
+            reinterpret_cast<uint32_t &>(ptr_activations[0]);
+#else
+    // pure PTX version
+
+    // Apply per channel scale+bias+relu if the data is not a special NaN
+    // (0x7eff).  If it is a special NaN (0x7eff), hard code the output to 0.
+    asm volatile(
+        "{\n"
+        " .reg .b16 t1, t2;\n"
+        " .reg .b32 t3, t4, t5, t6;\n"
+        " .reg .pred p1, p2;\n"
+        " mov.b32 {t1, t2}, %2;\n"
+        " setp.eq.s16 p1, t1, %4;\n"
+        " setp.eq.s16 p2, t2, %4;\n"
+        " fma.rn.f16x2.relu t3, %1, %2, %3;\n"
+        " and.b32 t4, t3, %5;\n"
+        " selp.b32 t5, t4, t3, p1;\n"
+        " and.b32 t6, t5, %6;\n"
+        " selp.b32 %0, t6, t5, p2;\n"
+        "}\n"
+        : "=r"(reinterpret_cast<uint32_t &>(ptr_activations[0]))
+        : "r"(ptr_scale_bias[0]), "r"(reinterpret_cast<uint32_t &>(ptr_activations[0])),
+          "r"(ptr_scale_bias[1]), "n"(cutlass::arch::OOB_NAN_F16), "n"(0xffff0000), "n"(0x0000ffff));
+#endif
+#else
+    assert(0);
+#endif
+  }
+
+  CUTLASS_DEVICE
+  void operator()(FragmentActivations &activations,
+                  FragmentScaleBias const &scale_bias) {
+    MmaOperand *ptr_activations = reinterpret_cast<MmaOperand *>(&activations);
+    ScaleBiasOperand const *ptr_scale_bias =
+        reinterpret_cast<ScaleBiasOperand const *>(&scale_bias);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < (NumActivations / MmaElements); ++i) {
+      transform(ptr_activations[i], ptr_scale_bias[(i / MmaRows)]);
+    }
+  }
+};
+} // namespace warp
+} // namespace conv 
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/coord.h b/lightllm-kernel/cutlass/include/cutlass/coord.h
new file mode 100755
index 000000000..d778046c2
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/coord.h
@@ -0,0 +1,480 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief A Coord is a coordinate of arbitrary rank into a tensor or matrix
+*/
+
+#pragma once
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/cstdint>
+#else
+#include <stdint.h>
+#endif
+
+#include "cutlass/cutlass.h"
+
+namespace cutlass {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Statically-sized array specifying Coords within a tensor
+template <
+  int Rank_,                          ///< Logical rank of coordinate
+  typename Index_ = int,              ///< Index type used for each dimension
+  typename LongIndex_ = int64_t       ///< Long index type used for linear offsets
+>
+struct Coord {
+
+public:
+
+  //
+  // Type and constant definitions
+  //
+
+  /// Number of elements in Coord
+  static int const kRank = Rank_;
+
+  /// Index type used to store elements
+  using Index = Index_;
+
+  /// Type used to represent linear offsets
+  using LongIndex = LongIndex_;
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Indices
+  Index idx[kRank];
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Default ctor initializes uniformly
+  CUTLASS_HOST_DEVICE
+  explicit Coord(Index value = Index(0)) {
+    for (int i = 0; i < kRank; ++i) {
+      idx[i] = value;
+    }
+  }
+
+  /// Constructs from an array of integers
+  CUTLASS_HOST_DEVICE
+  Coord(Index const (&_idx)[kRank]) {
+    for (int i = 0; i < kRank; ++i) {
+      idx[i] = _idx[i];
+    }
+  }
+
+  /// Constructs from some other Coord; dimensions beyond the source rank R are zero-filled
+  template <int R, typename I, typename L>
+  CUTLASS_HOST_DEVICE
+  Coord(Coord<R, I, L> other) {
+    for (int i = 0; i < kRank; ++i) {
+      idx[i] = (i < R) ? other[i] : I(0);  // never read past the R elements held by `other`
+    }
+  }
+
+  /// Returns a slice of the Coord which may be larger or smaller in rank
+  /// than this.
+  template <int Slice>
+  CUTLASS_HOST_DEVICE
+  Coord<Slice, Index, LongIndex> slice(int start = 0, Index identity = 0) const {
+    Coord<Slice, Index, LongIndex> result;
+    for (int i = 0; i < Slice; ++i) {
+      if (i + start < kRank) {
+        result[i] = idx[i + start];
+      }
+      else {
+        result[i] = identity;
+      }
+    }
+    return result;
+  }
+
+  /// Returns the index of the dimension with least value
+  CUTLASS_HOST_DEVICE
+  int min_dim_index() const {
+    int i = 0;
+    for (int j = 1; j < kRank; ++j) {
+      if (idx[j] < idx[i]) {
+        i = j;
+      }
+    }
+    return i;
+  }
+
+  /// Returns the index of the dimension with greatest value
+  CUTLASS_HOST_DEVICE
+  int max_dim_index() const {
+    int i = 0;
+    for (int j = 1; j < kRank; ++j) {
+      if (idx[j] > idx[i]) {
+        i = j;
+      }
+    }
+    return i;
+  }
+
+  /// Returns true if Coord is non-zero.
+  CUTLASS_HOST_DEVICE
+  explicit operator bool() const {
+    for (int i = 0; i < kRank; ++i) {
+      if (idx[i]) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /// Returns true if Coord is uniformly zero.
+  CUTLASS_HOST_DEVICE
+  bool operator!() const {
+    for (int i = 0; i < kRank; ++i) {
+      if (idx[i]) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /// Element-wise addition
+  CUTLASS_HOST_DEVICE
+  Coord operator+(Coord const& b) const {
+    Coord c;
+    for (int i = 0; i < kRank; ++i) {
+      c.idx[i] = idx[i] + b.idx[i];
+    }
+    return c;
+  }
+
+  /// Element-wise subtraction
+  CUTLASS_HOST_DEVICE
+  Coord operator-(Coord const& b) const {
+    Coord c;
+    for (int i = 0; i < kRank; ++i) {
+      c.idx[i] = idx[i] - b.idx[i];
+    }
+    return c;
+  }
+
+  /// Element-wise multiplication
+  CUTLASS_HOST_DEVICE
+  Coord operator*(Coord const& b) const {
+    Coord c;
+    for (int i = 0; i < kRank; ++i) {
+      c.idx[i] = idx[i] * b.idx[i];
+    }
+    return c;
+  }
+
+  /// Element-wise division
+  CUTLASS_HOST_DEVICE
+  Coord operator/(Coord const& b) const {
+    Coord c;
+    for (int i = 0; i < kRank; ++i) {
+      c.idx[i] = idx[i] / b.idx[i];
+    }
+    return c;
+  }
+
+  /// In-place addition
+  CUTLASS_HOST_DEVICE
+  Coord& operator+=(Coord const& b) {
+    for (int i = 0; i < kRank; ++i) {
+      idx[i] += b.idx[i];
+    }
+    return *this;
+  }
+
+  /// In-place subtraction
+  CUTLASS_HOST_DEVICE
+  Coord& operator-=(Coord const& b) {
+    for (int i = 0; i < kRank; ++i) {
+      idx[i] -= b.idx[i];
+    }
+    return *this;
+  }
+
+  /// In-place multiplication
+  CUTLASS_HOST_DEVICE
+  Coord& operator*=(Coord const& b) {
+    for (int i = 0; i < kRank; ++i) {
+      idx[i] *= b.idx[i];
+    }
+    return *this;
+  }
+
+  /// In-place division
+  CUTLASS_HOST_DEVICE
+  Coord& operator/=(Coord const& b) {
+    for (int i = 0; i < kRank; ++i) {
+      idx[i] /= b.idx[i];
+    }
+    return *this;
+  }
+
+  /// Member access operator
+  CUTLASS_HOST_DEVICE Index& operator[](int dim) { return idx[dim]; }
+
+  /// Member access operator
+  CUTLASS_HOST_DEVICE Index const& operator[](int dim) const { return idx[dim]; }
+
+  /// Computes the dot product with another Coord object, accumulating onto the initial value `sum`
+  CUTLASS_HOST_DEVICE
+  LongIndex dot(Coord const& b, LongIndex sum = LongIndex(0)) const {
+    for (int i = 0; i < kRank; ++i) {
+      sum += LongIndex(idx[i]) * LongIndex(b.idx[i]);  // widen before multiplying so the product cannot overflow Index
+    }
+    return sum;
+  }
+
+  /// Gets the index of a given Coord element
+  template <int Dim>
+  CUTLASS_HOST_DEVICE Index& at() {
+    return idx[Dim];
+  }
+
+  /// Access via index; may limit unrolling potential
+  CUTLASS_HOST_DEVICE
+  Index& at(int dim) { return idx[dim]; }
+
+  /// Gets the index of a given Coord element
+  template <int Dim>
+  CUTLASS_HOST_DEVICE Index const& at() const {
+    return idx[Dim];
+  }
+
+  /// Access via index; may limit unrolling potential
+  CUTLASS_HOST_DEVICE
+  Index const& at(int dim) const { return idx[dim]; }
+
+  /// Determines if two Coord<> objects are equal
+  CUTLASS_HOST_DEVICE
+  bool operator==(Coord const& b) const {
+    bool equal = true;
+    for (int i = 0; equal && i < kRank; ++i) {
+      equal = (idx[i] == b.idx[i]);
+    }
+    return equal;
+  }
+
+  /// Not equal
+  CUTLASS_HOST_DEVICE
+  bool operator!=(Coord const& b) const { return !(*this == b); }
+
+  /// Clamps a coordinate to a range specified by maximum and minimum values
+  CUTLASS_HOST_DEVICE
+  Coord& clamp(Coord const& max, Coord const& min = Coord()) {
+    for (int i = 0; i < kRank; ++i) {
+      idx[i] = __NV_STD_MAX(__NV_STD_MIN(idx[i], max.idx[i]), min.idx[i]);
+    }
+    return *this;
+  }
+
+  /// Returns the sum of all elements
+  CUTLASS_HOST_DEVICE
+  Index sum() const {
+    Index sum_(idx[0]);
+    for (int i = 1; i < kRank; ++i) {
+      sum_ += idx[i];
+    }
+    return sum_;
+  }
+
+  /// Returns the product of all elements
+  CUTLASS_HOST_DEVICE
+  LongIndex product() const {
+    LongIndex product_(idx[0]);
+    for (int i = 1; i < kRank; ++i) {
+      product_ *= idx[i];
+    }
+    return product_;
+  }
+
+  /// Less than operator
+  CUTLASS_HOST_DEVICE
+  bool operator<(Coord const &b) const {
+    for (int i = 0; i < kRank; ++i) {
+      if (!(idx[i] < b[i])) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /// Less than or equals operator
+  CUTLASS_HOST_DEVICE
+  bool operator<=(Coord const &b) const {
+    for (int i = 0; i < kRank; ++i) {
+      if (!(idx[i] <= b[i])) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /// Greater than operator
+  CUTLASS_HOST_DEVICE
+  bool operator>(Coord const &b) const {
+    return !(*this <= b);
+  }
+
+  /// Greater than or equals operator
+  CUTLASS_HOST_DEVICE
+  bool operator>=(Coord const &b) const {
+    return !(*this < b);
+  }
+};
+
+} // namespace cutlass 
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+
+
+/// Scalar multiplication
+template <int Rank, typename Index>
+CUTLASS_HOST_DEVICE
+Coord<Rank, Index> operator*(Index s, Coord<Rank, Index> coord) {
+  CUTLASS_PRAGMA_UNROLL
+  for (int i = 0; i < Rank; ++i) {
+    coord[i] *= s;
+  }
+  return coord;
+}
+
+/// Scalar multiplication
+template <int Rank, typename Index>
+CUTLASS_HOST_DEVICE
+Coord<Rank, Index> operator*(Coord<Rank, Index> coord, Index s) {
+  CUTLASS_PRAGMA_UNROLL
+  for (int i = 0; i < Rank; ++i) {
+    coord[i] *= s;
+  }
+  return coord;
+}
+
+/// Scalar division
+template <int Rank, typename Index>
+CUTLASS_HOST_DEVICE
+Coord<Rank, Index> operator/(Index s, Coord<Rank, Index> coord) {
+  CUTLASS_PRAGMA_UNROLL
+  for (int i = 0; i < Rank; ++i) {
+    coord[i] = s / coord[i];
+  }
+  return coord;
+}
+
+/// Scalar division
+template <int Rank, typename Index>
+CUTLASS_HOST_DEVICE
+Coord<Rank, Index> operator/(Coord<Rank, Index> coord, Index s) {
+  CUTLASS_PRAGMA_UNROLL
+  for (int i = 0; i < Rank; ++i) {
+    coord[i] /= s;
+  }
+  return coord;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Integer-valued make_Coord
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Helper to make a 1-element coordinate
+template <typename T> 
+CUTLASS_HOST_DEVICE
+Coord<1, T> make_Coord(T _0) {
+  T values[1] = {_0};
+  return Coord<1, T>(values);
+}
+
+/// Helper to make a 2-element coordinate
+template <typename T> 
+CUTLASS_HOST_DEVICE
+Coord<2, T> make_Coord(T _0, T _1) {
+  T values[2] = {_0, _1};
+  return Coord<2, T>(values);
+}
+
+/// Helper to make a 3-element coordinate
+template <typename T> 
+CUTLASS_HOST_DEVICE
+Coord<3, T> make_Coord(T _0, T _1, T _2) {
+  T values[3] = {_0, _1, _2};
+  return Coord<3, T>(values);
+}
+
+/// Helper to make a 4-element coordinate
+template <typename T> 
+CUTLASS_HOST_DEVICE
+Coord<4, T> make_Coord(T _0, T _1, T _2, T _3) {
+  T values[4] = {_0, _1, _2, _3};
+  return Coord<4, T>(values);
+}
+
+/// Helper to make a 5-element coordinate
+template <typename T> 
+CUTLASS_HOST_DEVICE
+Coord<5, T> make_Coord(T _0, T _1, T _2, T _3, T _4) {
+  T values[5] = {_0, _1, _2, _3, _4};
+  return Coord<5, T>(values);
+}
+
+/// Helper to make an N-element coordinate whose first element is _0 and whose remaining elements are zero
+template <int N, typename T> 
+CUTLASS_HOST_DEVICE
+Coord<N, T>make_Coord_with_padding(T _0) {
+  Coord<N, T> coord;
+
+  CUTLASS_PRAGMA_UNROLL
+  for (int i = N - 1; i > 0; --i) {
+    coord[i] = 0;
+  }
+
+  coord[0] = _0;
+
+  return coord;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace cutlass
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/core_io.h b/lightllm-kernel/cutlass/include/cutlass/core_io.h
new file mode 100755
index 000000000..40ae22246
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/core_io.h
@@ -0,0 +1,286 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Helpers for printing cutlass/core objects
+*/
+#pragma once
+
+#include <iostream>
+#include <typeinfo>
+
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix.h"
+#include "cutlass/quaternion.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/gemm/gemm_enumerated_types.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+#include "cutlass/conv/conv3d_problem_size.h"
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Output operator for CUDA built-in dim3 type
+inline std::ostream &operator<<(std::ostream &out, dim3 d) {
+  return out << d.x << ", " << d.y << ", " << d.z;
+}
+
+/// Output operator for CUDA built-in error type
+inline std::ostream &operator<<(std::ostream &out, cudaError_t error) {
+  return out << cudaGetErrorString(error);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                    stream operators for cutlass namespace                                     //
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Element, int Rank>
+inline
+std::ostream& operator<<(std::ostream& out, Array<Element, Rank> const& v) {
+  for (int i = 0; i < Rank; ++i) {
+    out << (i ? ", " : "") << v[i];
+  }
+  return out;
+}
+
+template <int Rank>
+inline
+std::ostream& operator<<(std::ostream& out, Coord<Rank> const& coord) {
+  for (int i = 0; i < Rank; ++i) {
+    out << (i ? ", " : "") << coord[i];
+  }
+  return out;
+}
+
+inline
+std::istream & operator>>(std::istream &stream, half_t &x) {
+  float tmp;
+  stream >> tmp;
+  x = static_cast<cutlass::half_t>(tmp);
+  return stream;
+}
+
+inline
+std::ostream & operator<<(std::ostream &out, half_t const &x) {
+  return out << float(x);
+}
+
+inline
+std::ostream & operator<<(std::ostream &out, bfloat16_t const &x) {
+  return out << float(x);
+}
+
+inline
+std::ostream & operator<<(std::ostream &out, tfloat32_t const &x) {
+  return out << float(x);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Helper to enable formatted printing of CUTLASS scalar types to an ostream
+template <typename T>
+struct ScalarIO {
+
+  /// Value to print
+  T value;
+
+  /// Default ctor
+  ScalarIO() { }
+
+  /// Constructs from a value
+  ScalarIO(T value): value(value) {}
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Default printing to ostream
+template <typename T>
+inline std::ostream &operator<<(std::ostream &out, ScalarIO<T> const &scalar) {
+  return out << scalar.value;
+}
+
+/// Printing to ostream of int8_t as integer rather than character
+template <>
+inline std::ostream &operator<<(std::ostream &out, ScalarIO<int8_t> const &scalar) {
+  return out << int(scalar.value);
+}
+
+/// Printing to ostream of uint8_t as integer rather than character
+template <>
+inline std::ostream &operator<<(std::ostream &out, ScalarIO<uint8_t> const &scalar) {
+  return out << unsigned(scalar.value);
+}
+
+
+/// Default printing to ostream for MatrixShape
+template <int Row, int Column>
+inline
+std::ostream & operator<<(std::ostream &out, MatrixShape<Row, Column> const &matrix_shape) {
+  out << "cutlass::MatrixShape::(kRow, kColumn) {"
+    << cutlass::MatrixShape<Row,Column>::kRow <<","
+    << cutlass::MatrixShape<Row,Column>::kColumn <<"}";
+  return out;
+}
+
+
+/// Prints matrix to ostream: one row per line, elements separated by ", "
+template <typename Element, int Rows, int Columns>
+std::ostream & operator<<(std::ostream &out, Matrix<Element, Rows, Columns> const &rhs) {
+
+  for (int i = 0; i < Rows; ++i) {
+    for (int j = 0; j < Columns; ++j) {
+      ScalarIO<Element> element(rhs.at(i, j));  // ScalarIO prints 8-bit integers as numbers, not characters
+      out << (j ? ", " : "") << element;
+    }
+    out << "\n";  // terminate the row with a newline character, not a literal backslash + 'n'
+  }
+
+  return out;
+}
+
+template <typename T>
+std::ostream &operator<<(std::ostream &out, Quaternion<T> const &rhs) {
+
+  out << ScalarIO<T>(rhs.w()) << " ";
+  if (rhs.x() >= 0) {
+    out << "+";
+  }
+
+  out << ScalarIO<T>(rhs.x()) << "*i ";
+  if (rhs.y() >= 0) {
+    out << "+";
+  }
+
+  out << ScalarIO<T>(rhs.y()) << "*j ";
+  if (rhs.z() >= 0) {
+    out << "+";
+  }
+
+  out << ScalarIO<T>(rhs.z()) << "*k";
+
+  return out;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                         stream operators for cutlass::gemm namespace                          //
+///////////////////////////////////////////////////////////////////////////////////////////////////
+namespace gemm {
+
+/// Default printing to ostream for GemmShape
+template <int M, int N, int K>
+inline
+std::ostream & operator<<(std::ostream &out, GemmShape<M,N,K> const &gemm_shape) {
+  out << "cutlass::gemm::GemmShape::(kM, kN, kK) {"
+    << cutlass::gemm::GemmShape<M,N,K>::kM <<","
+    << cutlass::gemm::GemmShape<M,N,K>::kN <<","
+    << cutlass::gemm::GemmShape<M,N,K>::kK << "}";
+  return out;
+}
+
+/// Default printing to ostream for GemmCoord
+inline
+std::ostream & operator<<(std::ostream &out, GemmCoord const &gemm_coord) {
+  out << "cutlass::gemm::GemmCoord {"
+    << gemm_coord.m() <<","
+    << gemm_coord.n() <<","
+    << gemm_coord.k() << "}";
+  return out;
+}
+
+} //namespace gemm
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                       stream operators for cutlass namespace                          //
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Default printing to ostream for PitchLinearShape
+template < int Contiguous, int Strided>
+inline
+std::ostream & operator<<(std::ostream &out, PitchLinearShape<Contiguous, Strided> const &pitch_linear_shape) {
+  out << "cutlass::PitchLinearShape:(kContiguous, kStrided) {"
+    << cutlass::layout::PitchLinearShape<Contiguous,Strided>::kContiguous <<","
+    << cutlass::layout::PitchLinearShape<Contiguous,Strided>::kStrided <<"}";
+  return out;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                         stream operators for cutlass::conv namespace                          //
+///////////////////////////////////////////////////////////////////////////////////////////////////
+namespace conv {
+/// Default printing to ostream for Conv2dProblemSize
+inline
+std::ostream& operator<<(std::ostream& out, Conv2dProblemSize const& problem) {
+  out << "NHWC: (" << problem.N << ", " << problem.H << ", " << problem.W << ", " << problem.C << ")" << std::endl
+      << "KRSC: (" << problem.K << ", " << problem.R << ", " << problem.S << ", " << problem.C / problem.groups << ")" << std::endl
+      << "NPQK: (" << problem.N << ", " << problem.P << ", " << problem.Q << ", " << problem.K << ")" << std::endl
+      << "groups: (" << problem.groups << ")" << std::endl
+      << "Pad_h, Pad_w: (" << problem.pad_h << ", " << problem.pad_w << ")" << std::endl
+      << "Stride_h, Stride_w: (" << problem.stride_h << ", " << problem.stride_w << ")" << std::endl
+      << "Dilation_h, Dilation_w: (" << problem.dilation_h << ", " << problem.dilation_w << ")" << std::endl
+      << "split_k_slices: (" << problem.split_k_slices << ")" << std::endl
+      << "mode: (" << ((problem.mode==conv::Mode::kConvolution) ? "conv" : "xcross") << ")";
+
+  return out;
+}
+
+
+/// Default printing to ostream for Conv3dProblemSize
+inline
+std::ostream& operator<<(std::ostream& out, Conv3dProblemSize const& problem) {
+  out << "NDHWC: (" << problem.N << ", " << problem.D << ", " << problem.H << ", " << problem.W << ", " << problem.C << ")" << std::endl
+      << "KTRSC: (" << problem.K << ", " << problem.T << ", " << problem.R << ", " << problem.S << ", " << problem.C << ")" << std::endl
+      << "NZPQK: (" << problem.N << ", " << problem.Z << ", " << problem.P << ", " << problem.Q << ", " << problem.K << ")" << std::endl
+      << "pad_d, pad_h, pad_w: ("  << problem.pad_d << ", " << problem.pad_h << ", " << problem.pad_w << ")" << std::endl
+      << "stride_d, stride_h, stride_w: ("  << problem.stride_d << ", " << problem.stride_h << ", " << problem.stride_w << ")" << std::endl
+      << "dilation_d, dilation_h, dilation_w: ("  << problem.dilation_d << ", " << problem.dilation_h << ", " << problem.dilation_w << ")" << std::endl
+      << "split_k_slices: (" << problem.split_k_slices << ") " << std::endl
+      << "mode: (" << ((problem.mode==conv::Mode::kConvolution) ? "conv" : "xcross") << ")";
+
+  return out;
+}
+
+} // namespace conv
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass
+///////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/cuda_host_adapter.hpp b/lightllm-kernel/cutlass/include/cutlass/cuda_host_adapter.hpp
new file mode 100755
index 000000000..1c8f56a65
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/cuda_host_adapter.hpp
@@ -0,0 +1,407 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Interface between a CUTLASS device-wide operator and CUDA.
+*/
+
+#pragma once
+
+#include <cuda_runtime_api.h>
+#include "cutlass/cutlass.h"
+#include "cutlass/trace.h"
+
+#include "cutlass/platform/platform.h"
+#if ! defined(__CUDACC_RTC__)
+#include <cstdio>
+#endif
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// NVRTC doesn't need definitions for these host classes
+
+#if ((__CUDACC_VER_MAJOR__ >= 12) ||                               \
+    ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 8))) \
+    && !defined(__CUDACC_RTC__)
+#define CUDA_HOST_ADAPTER_LAUNCH_ATTRIBUTES_ENABLED
+#endif
+
+#if ((__CUDACC_VER_MAJOR__ >= 12) && !defined(__CUDACC_RTC__))
+#define CUDA_HOST_ADAPTER_TENSORMAP_ENABLED
+#endif
+
+// Include <cuda.h> for CUDA Driver API calls if any of these capabilities are enabled.
+#if defined(CUDA_HOST_ADAPTER_LAUNCH_ATTRIBUTES_ENABLED) ||        \
+    defined(CUDA_HOST_ADAPTER_TENSORMAP_ENABLED)
+
+#include <cuda.h>
+
+#endif // defined(CUDA_HOST_ADAPTER_LAUNCH_ATTRIBUTES_ENABLED) ||
+       // defined(CUDA_HOST_ADAPTER_TENSORMAP_ENABLED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+//
+// Macro-level guard for CUDA Host Adapter
+//
+#if !defined(CUTLASS_ENABLE_CUDA_HOST_ADAPTER)
+#define CUTLASS_ENABLE_CUDA_HOST_ADAPTER false
+#endif
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+#if !defined(__CUDACC_RTC__)
+
+#include <cudaTypedefs.h>
+#include <driver_types.h>
+
+#define CUTLASS_CUDA_DRIVER_STRINGIFY(tok) #tok
+// CUTLASS_CUDA_DRIVER_WRAPPER_DECL(func, ver) defines call_<func>(args...), which forwards to driver API <func>.
+#if defined(CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL)
+// Direct mode: invoke the driver symbol itself; `ver` is not used.
+#define CUTLASS_CUDA_DRIVER_WRAPPER_DECL(func, ver) \
+  template <typename... Args>                       \
+  CUresult call_##func(Args... args) {              \
+    return func(args...);                           \
+  }
+
+#else // defined(CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL)
+// Indirect mode: look the entry point up through the CUDA runtime on every call.
+#if ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 5)))
+// Toolkit 12.5 or newer (any minor of a later major): request driver ABI version `ver` explicitly.
+#define CUTLASS_CUDA_DRIVER_WRAPPER_DECL(func, ver)             \
+  template <typename... Args>                                   \
+  CUresult call_##func(Args... args) {                          \
+    cudaDriverEntryPointQueryResult cuda_status;                \
+    void* pfn = nullptr;                                        \
+    cudaError_t cuda_err = cudaGetDriverEntryPointByVersion(    \
+        CUTLASS_CUDA_DRIVER_STRINGIFY(func),                    \
+        &pfn, ver,                                              \
+        cudaEnableDefault,                                      \
+        &cuda_status);                                          \
+    if (cuda_err != cudaSuccess ||                              \
+        cuda_status != cudaDriverEntryPointSuccess) {           \
+      return CUDA_ERROR_UNKNOWN;                                \
+    }                                                           \
+    return reinterpret_cast<PFN_##func##_v##ver>(pfn)(args...); \
+  }
+
+#else
+// Older toolkits: unversioned lookup. cuda_err is tested first so cuda_status is only read after a successful call.
+#define CUTLASS_CUDA_DRIVER_WRAPPER_DECL(func, ver)             \
+  template <typename... Args>                                   \
+  CUresult call_##func(Args... args) {                          \
+    cudaDriverEntryPointQueryResult cuda_status;                \
+    void* pfn = nullptr;                                        \
+    cudaError_t cuda_err = cudaGetDriverEntryPoint(             \
+        CUTLASS_CUDA_DRIVER_STRINGIFY(func),                    \
+        &pfn,                                                   \
+        cudaEnableDefault,                                      \
+        &cuda_status);                                          \
+    if (cuda_err != cudaSuccess ||                              \
+        cuda_status != cudaDriverEntryPointSuccess) {           \
+      return CUDA_ERROR_UNKNOWN;                                \
+    }                                                           \
+    return reinterpret_cast<PFN_##func>(pfn)(args...);          \
+  }
+
+#endif // toolkit version >= 12.5
+
+#endif // defined(CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL)
+
+#if (__CUDACC_VER_MAJOR__ >= 12)
+CUTLASS_CUDA_DRIVER_WRAPPER_DECL(cuTensorMapEncodeTiled, 12000);
+CUTLASS_CUDA_DRIVER_WRAPPER_DECL(cuTensorMapEncodeIm2col, 12000);
+#endif
+
+#undef CUTLASS_CUDA_DRIVER_STRINGIFY
+
+#define CUTLASS_CUDA_DRIVER_WRAPPER_CALL(func) cutlass::call_##func
+
+#endif // !defined(__CUDACC_RTC__)
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Fixed-capacity holder for the runtime CUlaunchAttribute values that can be supplied to
+/// CudaHostAdapter. When CUDA_HOST_ADAPTER_LAUNCH_ATTRIBUTES_ENABLED is not defined (earlier CTK
+/// where CUlaunchAttribute is not introduced) this is an empty struct.
+struct CudaHostLaunchAttributes {
+
+#if defined(CUDA_HOST_ADAPTER_LAUNCH_ATTRIBUTES_ENABLED)
+
+  /// Reasonable maximum launch attributes that are commonly applied
+  static constexpr int32_t kMaximumAttributeCount = 5;
+
+  /// Launch attributes
+  CUlaunchAttribute launch_attributes[kMaximumAttributeCount];
+  int32_t      attribute_count = 0;
+  /// Copies attribute_count_ entries from launch_attributes_; a completely filled array is valid.
+  CUTLASS_HOST_DEVICE
+  CudaHostLaunchAttributes(CUlaunchAttribute *launch_attributes_ = nullptr,
+                           int32_t attribute_count_ = 0) {
+    CUTLASS_ASSERT(attribute_count_ >= 0 && attribute_count_ <= kMaximumAttributeCount);
+    for (int32_t i = 0; i < attribute_count_ && i < kMaximumAttributeCount; ++i) {
+      launch_attributes[i] = launch_attributes_[i];
+    }
+    attribute_count = attribute_count_;
+  }
+  /// Returns a pointer to the first stored launch attribute.
+  CUTLASS_HOST_DEVICE
+  CUlaunchAttribute const* data() const {
+    return launch_attributes;
+  }
+  /// Returns attribute_count converted to size_t.
+  CUTLASS_HOST_DEVICE
+  size_t size() const {
+    return attribute_count;
+  }
+
+#endif // (CUDA_HOST_ADAPTER_LAUNCH_ATTRIBUTES_ENABLED)
+
+};
+
+
+/// This class defines an object which abstracts interactions between the CUTLASS device-wide GEMM and
+/// CUDA. The intention is to enable CUTLASS to be used with both the CUDA Runtime API and CUDA Driver API.
+struct CudaHostAdapter {
+
+  /// Limit the number of kernels (capacity of kernel_handles)
+  static constexpr int32_t kMaximumKernelCount = 4;
+
+  /// Maximum cluster size
+  static constexpr int MaxClusterSize = 32;
+
+  //
+  // Data members
+  //
+
+  /// Handles; only the first kernel_count entries are copied by the constructors and assignments
+  void        *kernel_handles[kMaximumKernelCount];
+  int32_t      kernel_count = 0;
+
+  CudaHostLaunchAttributes launch_attributes;
+
+  //
+  // Methods
+  //
+
+  /// Ctor
+  CudaHostAdapter() = default;
+
+  /// Dtor
+  virtual ~CudaHostAdapter() = default;
+
+  /// Copy Ctor. A completely filled handle array (kernel_count == kMaximumKernelCount) is valid.
+  CUTLASS_HOST_DEVICE
+  CudaHostAdapter(const CudaHostAdapter & rhs)
+      : kernel_count(rhs.kernel_count),
+        launch_attributes(rhs.launch_attributes) {
+    CUTLASS_ASSERT(rhs.kernel_count >= 0 && rhs.kernel_count <= kMaximumKernelCount);
+
+    for (int32_t i = 0; i < rhs.kernel_count && i < kMaximumKernelCount; ++i) {
+      kernel_handles[i] = rhs.kernel_handles[i];
+    }
+  }
+
+  /// Copy Assignment
+  CUTLASS_HOST_DEVICE
+  CudaHostAdapter& operator=(const CudaHostAdapter & rhs) {
+    CUTLASS_ASSERT(rhs.kernel_count >= 0 && rhs.kernel_count <= kMaximumKernelCount);
+    for (int32_t i = 0; i < rhs.kernel_count && i < kMaximumKernelCount; ++i) {
+      kernel_handles[i] = rhs.kernel_handles[i];
+    }
+    kernel_count = rhs.kernel_count;
+
+    launch_attributes = rhs.launch_attributes;
+
+    return *this;
+  }
+
+
+  /// Move ctor (handles are plain pointers and are copied; rhs.kernel_count is left unchanged)
+  CUTLASS_HOST_DEVICE
+  CudaHostAdapter(CudaHostAdapter && rhs)
+      : kernel_count(rhs.kernel_count),
+        launch_attributes(std::move(rhs.launch_attributes)) {
+    CUTLASS_ASSERT(rhs.kernel_count >= 0 && rhs.kernel_count <= kMaximumKernelCount);
+
+    for (int32_t i = 0; i < rhs.kernel_count && i < kMaximumKernelCount; ++i) {
+      kernel_handles[i] = rhs.kernel_handles[i];
+    }
+  }
+
+  /// Move assignment
+  CUTLASS_HOST_DEVICE
+  CudaHostAdapter& operator=(CudaHostAdapter && rhs) {
+    CUTLASS_ASSERT(rhs.kernel_count >= 0 && rhs.kernel_count <= kMaximumKernelCount);
+    for (int32_t i = 0; i < rhs.kernel_count && i < kMaximumKernelCount; ++i) {
+      kernel_handles[i] = rhs.kernel_handles[i];
+    }
+    kernel_count = rhs.kernel_count;
+    launch_attributes = std::move(rhs.launch_attributes);
+    return *this;
+  }
+
+  /// Ctor: copies the first kernel_count_ handles from kernel_handles_ and stores launch_attributes_
+  CUTLASS_HOST_DEVICE
+  CudaHostAdapter(void **kernel_handles_,
+                  int32_t kernel_count_,
+                  CudaHostLaunchAttributes const &launch_attributes_ = { })
+      : kernel_count(kernel_count_),
+        launch_attributes(launch_attributes_) {
+    CUTLASS_ASSERT(kernel_count >= 0 && kernel_count <= kMaximumKernelCount);
+
+    for (int32_t i = 0; i < kernel_count && i < kMaximumKernelCount; ++i) {
+      kernel_handles[i] = kernel_handles_[i];
+    }
+  }
+
+  /// Returns true if the CudaHostAdapter is empty (kernel_count == 0)
+  CUTLASS_HOST_DEVICE
+  bool empty() const { return !kernel_count; }
+
+  /// Returns kernel_count
+  CUTLASS_HOST_DEVICE
+  size_t size() const { return static_cast<size_t>(kernel_count); }
+
+  /// Queries the occupancy of a kernel
+  virtual Status query_occupancy(
+    int32_t *device_sms,
+    int32_t *sm_occupancy,
+    int32_t kernel_index,
+    int32_t thread_count,
+    int32_t smem_size) const = 0;
+
+  /// Launches a kernel without using Threadblock Clusters.
+  virtual Status launch(
+    dim3 const grid_dims,
+    dim3 const block_dims,
+    size_t const smem_size,
+    cudaStream_t cuda_stream,
+    void** kernel_params,
+    int32_t kernel_index) const = 0;
+
+  /// Launches a kernel using the CUDA Extensible Launch API and Threadblock Clusters.
+  virtual Status launch(
+    dim3 const grid_dims,
+    dim3 const cluster_dims,
+    dim3 const block_dims,
+    size_t const smem_size,
+    cudaStream_t cuda_stream,
+    void** kernel_params,
+    int32_t kernel_index) const = 0;
+
+#if defined(CUDA_HOST_ADAPTER_TENSORMAP_ENABLED)
+
+  /// Create a tensor map descriptor object representing im2col memory region.
+  virtual CUresult tensorMapEncodeIm2col (
+    CUtensorMap* tensorMap,
+    CUtensorMapDataType tensorDataType,
+    cuuint32_t tensorRank,
+    void* globalAddress,
+    const cuuint64_t* globalDim,
+    const cuuint64_t* globalStrides,
+    const int* pixelBoxLowerCorner,
+    const int* pixelBoxUpperCorner,
+    cuuint32_t channelsPerPixel,
+    cuuint32_t pixelsPerColumn,
+    const cuuint32_t* elementStrides,
+    CUtensorMapInterleave interleave,
+    CUtensorMapSwizzle swizzle,
+    CUtensorMapL2promotion l2Promotion,
+    CUtensorMapFloatOOBfill oobFill) const = 0;
+
+  /// Create a tensor map descriptor object representing tiled memory region.
+  virtual CUresult tensorMapEncodeTiled (
+    CUtensorMap* tensorMap,
+    CUtensorMapDataType tensorDataType,
+    cuuint32_t tensorRank,
+    void* globalAddress,
+    const cuuint64_t* globalDim,
+    const cuuint64_t* globalStrides,
+    const cuuint32_t* boxDim,
+    const cuuint32_t* elementStrides,
+    CUtensorMapInterleave interleave,
+    CUtensorMapSwizzle swizzle,
+    CUtensorMapL2promotion l2Promotion,
+    CUtensorMapFloatOOBfill oobFill) const = 0;
+
+  /// Modify an existing tensor map descriptor with an updated global address.
+  virtual CUresult tensorMapReplaceAddress(
+    CUtensorMap* tensorMap,
+    void* globalAddress)  const = 0;
+
+#endif // defined(CUDA_HOST_ADAPTER_TENSORMAP_ENABLED)
+
+protected:
+
+  /**
+   * Fills a buffer in Global Memory with a byte sequence copied from host memory.
+   * This function can be overridden to dispatch to the appropriate cuMemsetD*Async API
+  */
+  virtual Status memsetDeviceImpl(
+    void* destination, ///< Device memory pointer to be filled
+    void const* fill_value, ///< Value to be filled in the buffer
+    size_t fill_size, ///< Size of the data type to be used for filling the buffer
+    size_t count, ///< Number of elements of size fill_size
+    cudaStream_t stream) const = 0;
+
+public:
+
+  /// Fills a buffer in Global Memory with a byte sequence copied from host memory
+  template<class FillValueType>
+  CUTLASS_HOST_DEVICE
+  Status memsetDevice(
+      void* destination,
+      FillValueType fill_value,
+      size_t count,
+      cudaStream_t stream) const {
+    return this->memsetDeviceImpl(
+      destination,
+      &fill_value,
+      sizeof(FillValueType),
+      count,
+      stream);
+  }
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/cutlass.h b/lightllm-kernel/cutlass/include/cutlass/cutlass.h
new file mode 100755
index 000000000..e12616a20
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/cutlass.h
@@ -0,0 +1,160 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Basic include for CUTLASS.
+*/
+
+#pragma once
+
+#include "cutlass/arch/synclog.hpp"
+#include "cutlass/detail/helper_macros.hpp"
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+
+/// Status code returned by CUTLASS operations; cutlassGetStatusString() maps each value to text.
+enum class Status {
+  kSuccess,                    ///< Operation was successful.
+  kErrorMisalignedOperand,     ///< Operands fail alignment requirements.
+  kErrorInvalidDataType,       ///< DataType fails requirement.
+  kErrorInvalidLayout,         ///< Layout fails alignment requirement.
+  kErrorInvalidProblem,        ///< Specified problem size is not supported by operator.
+  kErrorNotSupported,          ///< Operation is not supported on current device.
+  kErrorWorkspaceNull,         ///< The given workspace is null when it is required to be non-null.
+  kErrorInternal,              ///< An error within CUTLASS occurred.
+  kErrorArchMismatch,          ///< CUTLASS runs on a device that it was not compiled for.
+  kErrorInsufficientDriver,    ///< CUTLASS runs with a driver that is too old.
+  kErrorMemoryAllocation,      ///< Kernel launch failed due to insufficient device memory.
+  kInvalid                     ///< Status is unspecified.
+};
+
+/// Maps a cutlass::Status to a constant string; kInvalid and unlisted values give "Invalid status".
+CUTLASS_HOST_DEVICE
+static char const* cutlassGetStatusString(cutlass::Status status) {
+  char const* text = "Invalid status";
+  switch (status) {
+    case cutlass::Status::kSuccess:
+      text = "Success"; break;
+    case cutlass::Status::kErrorMisalignedOperand:
+      text = "Error Misaligned Operand"; break;
+    case cutlass::Status::kErrorInvalidDataType:
+      text = "Error Invalid Data Type"; break;
+    case cutlass::Status::kErrorInvalidLayout:
+      text = "Error Invalid Layout"; break;
+    case cutlass::Status::kErrorInvalidProblem:
+      text = "Error Invalid Problem"; break;
+    case cutlass::Status::kErrorNotSupported:
+      text = "Error Not Supported"; break;
+    case cutlass::Status::kErrorWorkspaceNull:
+      text = "Error Workspace Null"; break;
+    case cutlass::Status::kErrorInternal:
+      text = "Error Internal"; break;
+    case cutlass::Status::kErrorArchMismatch:
+      text = "Error Architecture Mismatch"; break;
+    case cutlass::Status::kErrorInsufficientDriver:
+      text = "Error Insufficient Driver"; break;
+    case cutlass::Status::kErrorMemoryAllocation:
+      text = "Error Memory Allocation failed"; break;
+    case cutlass::Status::kInvalid: break;
+  }
+  return text;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+static const int NumThreadsPerWarp = 32;  ///< Divisor used by the canonical_*_idx helpers in this header.
+static const int NumThreadsPerWarpGroup = 128;  ///< Divisor used by canonical_warp_group_idx().
+static const int NumWarpsPerWarpGroup = NumThreadsPerWarpGroup / NumThreadsPerWarp;  ///< 128 / 32 = 4.
+static const int NumThreadsPerHalfWarp = NumThreadsPerWarp / 2;  ///< 32 / 2 = 16.
+static const int NumThreadsPerQuad = 4;
+static const int NumThreadsPerQuadPair = NumThreadsPerQuad * 2;  ///< 4 * 2 = 8.
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Returns true only for thread (0,0,0) of threadblock (0,0,0); always false when compiled for the host.
+CUTLASS_HOST_DEVICE bool thread0() {
+  #if !defined(__CUDA_ARCH__)
+    return false;
+  #else
+    return (threadIdx.x | threadIdx.y | threadIdx.z | blockIdx.x | blockIdx.y | blockIdx.z) == 0u;
+  #endif
+}
+
+/// Lane index of the calling thread (threadIdx.x modulo NumThreadsPerWarp); the warp need not be converged.
+CUTLASS_DEVICE int canonical_lane_idx() {
+  #if !defined(__CUDA_ARCH__)
+    return 0;
+  #else
+    unsigned int const lane = threadIdx.x % NumThreadsPerWarp;
+    return lane;
+  #endif
+}
+
+/// Warp-uniform canonical warp index: lane 0's threadIdx.x / NumThreadsPerWarp, shared via __shfl_sync.
+/// Threads within the warp must be converged.
+CUTLASS_DEVICE int canonical_warp_idx_sync() {
+  #if !defined(__CUDA_ARCH__)
+    return 0;
+  #else
+    unsigned int const warp_in_cta = threadIdx.x / NumThreadsPerWarp;
+    return __shfl_sync(0xffffffff, warp_in_cta, 0);
+  #endif
+}
+
+/// Returns the warp index within the CTA (threadIdx.x / NumThreadsPerWarp). The warp may be divergent.
+/// Because it does not synchronize the warp, it is faster and allows forward progress.
+CUTLASS_DEVICE int canonical_warp_idx() {
+  #if !defined(__CUDA_ARCH__)
+    return 0;
+  #else
+    unsigned int const warp_in_cta = threadIdx.x / NumThreadsPerWarp;
+    return warp_in_cta;
+  #endif
+}
+
+/// Warp-uniform canonical warp group index: lane 0's threadIdx.x / NumThreadsPerWarpGroup, via __shfl_sync.
+/// Threads within the warp must be converged.
+CUTLASS_DEVICE int canonical_warp_group_idx() {
+  #if !defined(__CUDA_ARCH__)
+    return 0;
+  #else
+    unsigned int const group_in_cta = threadIdx.x / NumThreadsPerWarpGroup;
+    return __shfl_sync(0xffffffff, group_in_cta, 0);
+  #endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/detail/collective.hpp b/lightllm-kernel/cutlass/include/cutlass/detail/collective.hpp
new file mode 100755
index 000000000..a4b288e7c
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/detail/collective.hpp
@@ -0,0 +1,63 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cute/container/tuple.hpp"
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+template <size_t I, class Tuple>
+struct deduce_mixed_width_dtype {
+static_assert(I <= 2u, "Valid indices are 0, 1, and 2, which represent Operand, Scale, and Bias, respectively.");
+
+private:
+  // A non-tuple argument is wrapped into a one-element tuple; the index is clamped to the last element.
+  using wrapped_tuple = cute::conditional_t<cute::is_tuple<Tuple>::value, Tuple, cute::tuple<Tuple>>;
+  static constexpr size_t clamped_index = cute::min(I, cute::tuple_size_v<wrapped_tuple> - 1);
+public:
+  // Element I of the (wrapped) tuple, or void when the tuple has no element I.
+  using type = cute::conditional_t<(I < cute::tuple_size_v<wrapped_tuple>),
+                                   cute::tuple_element_t<clamped_index, wrapped_tuple>, void>;
+};
+
+template <size_t I, class Tuple>
+using deduce_mixed_width_dtype_t = typename deduce_mixed_width_dtype<I, Tuple>::type;
+
+} // namespace detail
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::gemm::collective
diff --git a/lightllm-kernel/cutlass/include/cutlass/detail/dependent_false.hpp b/lightllm-kernel/cutlass/include/cutlass/detail/dependent_false.hpp
new file mode 100755
index 000000000..76e52d2bf
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/detail/dependent_false.hpp
@@ -0,0 +1,86 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::detail {
+
+/// @brief A bool constant equal to `Value` that depends on one or more template parameters.
+///
+/// The `Args` pack is not used in the value; it only makes the constant template-dependent.
+/// For more detailed documentation and use cases, please see `dependent_false` below.
+template <bool Value, class... Args>
+inline constexpr bool dependent_bool_value = Value;
+
+/// @brief An always-false variable template that depends on one or more template parameters.
+///
+/// This exists because `static_assert(false);` always fails,
+/// even if it occurs in the `else` branch of an `if constexpr`.
+/// The following example shows how to use `dependent_false` in that case.
+///
+/// @code
+/// template<class T>
+/// void foo (T t)
+/// {
+///     if constexpr (std::is_integral_v<T>) {
+///         do_integer_stuff(t);
+///     }
+///     else if constexpr (std::is_floating_point_v<T>) {
+///         do_floating_point_stuff(t);
+///     }
+///     else {
+///         static_assert(dependent_false<T>, "T must be "
+///             "an integral or floating-point type.");
+///     }
+/// }
+/// @endcode
+///
+/// This implements the C++ Standard Library proposal P1830R1.
+///
+/// https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2019/p1830r1.pdf
+///
+/// That proposal is under review as of 2022/12/05.
+/// The following link shows P1830's current review status.
+///
+/// https://github.com/cplusplus/papers/issues/572
+///
+/// P2593R0 proposes an alternate solution to this problem,
+/// that would change the C++ language itself.
+///
+/// https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/p2593r0.html
+///
+/// For headers in this library, however, we only consider library solutions
+/// as work-arounds for future C++ features.
+template <class... Args>
+inline constexpr bool dependent_false = dependent_bool_value<false, Args...>;
+
+}  // end namespace cutlass::detail
diff --git a/lightllm-kernel/cutlass/include/cutlass/detail/helper_macros.hpp b/lightllm-kernel/cutlass/include/cutlass/detail/helper_macros.hpp
new file mode 100755
index 000000000..4cd895f14
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/detail/helper_macros.hpp
@@ -0,0 +1,205 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Helper macros for the CUTLASS library
+*/
+
+#pragma once
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Optional renaming: when CUTLASS_NAMESPACE is defined, `cutlass` becomes a macro for `cutlass_<CUTLASS_NAMESPACE>`.
+#ifdef CUTLASS_NAMESPACE
+#define concat_tok(a, b) a ## b
+#define mkcutlassnamespace(pre, ns) concat_tok(pre, ns)
+#define cutlass mkcutlassnamespace(cutlass_, CUTLASS_NAMESPACE)
+#endif
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Function qualifiers: nvcc / clang-CUDA -> host+device, NVRTC -> device only, otherwise plain `inline`.
+#if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
+#define CUTLASS_HOST_DEVICE __forceinline__ __device__ __host__
+#define CUTLASS_DEVICE __forceinline__ __device__
+#elif defined(__CUDACC_RTC__)
+#define CUTLASS_HOST_DEVICE __forceinline__ __device__
+#define CUTLASS_DEVICE __forceinline__ __device__
+#else
+#define CUTLASS_HOST_DEVICE inline
+#define CUTLASS_DEVICE inline
+#endif
+// NOTE(review): the two macros below are defined unconditionally, including for non-CUDA compilers.
+#define CUTLASS_HOST __host__
+#define CUTLASS_GLOBAL __global__ static
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// No-op sink: passing a value here references it without doing any work.
+template<typename T>
+CUTLASS_HOST_DEVICE void __CUTLASS_UNUSED(T const &) { }
+
+#if defined(__GNUC__)
+  #define CUTLASS_UNUSED(expr) __CUTLASS_UNUSED(expr)
+#else  // macro argument is parenthesized so compound expressions keep their meaning
+  #define CUTLASS_UNUSED(expr) do { ; } while (&(expr) != &(expr))
+#endif
+
+#ifdef _MSC_VER
+// Provides support for alternative operators 'and', 'or', and 'not'
+#include <iso646.h>
+#endif // _MSC_VER
+
+#if !defined(__CUDACC_RTC__)
+#include <assert.h>
+#endif
+
+#if defined(__CUDA_ARCH__)
+  #if defined(_MSC_VER)
+    #define CUTLASS_NOT_IMPLEMENTED() { printf("%s not implemented\n", __FUNCSIG__); asm volatile ("brkpt;\n"); }
+  #else
+    #define CUTLASS_NOT_IMPLEMENTED() { printf("%s not implemented\n", __PRETTY_FUNCTION__); asm volatile ("brkpt;\n"); }
+  #endif
+#else
+  #if defined(_MSC_VER)
+    #define CUTLASS_NOT_IMPLEMENTED() assert(0 && __FUNCSIG__)
+  #else
+    #define CUTLASS_NOT_IMPLEMENTED() assert(0 && __PRETTY_FUNCTION__)
+  #endif
+#endif
+
+// CUTLASS_CMATH_NAMESPACE is the namespace where code can find
+// <cmath> functions like isnan and log.  Such functions are in
+// the std namespace in host code, but in the global namespace
+// in device code.
+//
+// The intended use case for this macro is in "using" declarations
+// for making argument-dependent lookup (ADL) work in generic code.
+// For example, if T is cutlass::half_t, the following code will
+// invoke cutlass::isnan(half_t).  If T is float, it will invoke
+// std::isnan on host and ::isnan on device.  (CUTLASS's support
+// for NVRTC prevents it from using things in the std namespace
+// in device code.)  Correct use of "using" declarations can help
+// avoid unexpected implicit conversions, like from half_t to float.
+//
+// template<class T>
+// bool foo(T x) {
+//   using CUTLASS_CMATH_NAMESPACE :: isnan;
+//   return isnan(x);
+// }
+//
+// Without this macro, one would need to write the following.
+//
+// template<class T>
+// bool foo(T x) {
+// #if defined(__CUDA_ARCH__)
+//   using ::isnan;
+// #else
+//   using std::isnan;
+// #endif
+//   return isnan(x);
+// }
+
+#if defined(__CUDA_ARCH__)
+#  define CUTLASS_CMATH_NAMESPACE  // empty: `CUTLASS_CMATH_NAMESPACE :: isnan` becomes `::isnan`
+#else
+#  define CUTLASS_CMATH_NAMESPACE std
+#endif
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+
+// NOTE: preprocessor macros are not scoped by namespaces; every macro below is visible globally.
+#ifndef CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED
+#define CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED 0
+#endif
+
+
+// CUDA 10.1 introduces the mma instruction
+#if !defined(CUTLASS_ENABLE_TENSOR_CORE_MMA)
+#define CUTLASS_ENABLE_TENSOR_CORE_MMA 0
+#endif
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#define CUTLASS_ASSERT(x) assert(x)
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// CUTLASS_PRAGMA_(UNROLL|NO_UNROLL) optimization directives for the CUDA compiler.
+#if defined(__CUDA_ARCH__) && !defined(__INTELLISENSE__)
+  #if defined(__CUDACC_RTC__) || (defined(__clang__) && defined(__CUDA__))
+    #define CUTLASS_PRAGMA_UNROLL _Pragma("unroll")
+    #define CUTLASS_PRAGMA_NO_UNROLL _Pragma("unroll 1")
+  #else
+    #define CUTLASS_PRAGMA_UNROLL #pragma unroll
+    #define CUTLASS_PRAGMA_NO_UNROLL #pragma unroll 1
+  #endif
+
+  #define CUTLASS_GEMM_LOOP CUTLASS_PRAGMA_NO_UNROLL
+
+#else
+
+    #define CUTLASS_PRAGMA_UNROLL
+    #define CUTLASS_PRAGMA_NO_UNROLL
+    #define CUTLASS_GEMM_LOOP
+
+#endif
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// CUTLASS_THREAD_LOCAL: `thread_local`, or empty when compiling with NVRTC.
+#if !defined(__CUDACC_RTC__)
+#define CUTLASS_THREAD_LOCAL thread_local
+#else
+#define CUTLASS_THREAD_LOCAL
+#endif
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// CUTLASS_CPLUSPLUS: the language version, taken from _MSVC_LANG when that macro is defined.
+#if defined(_MSVC_LANG)
+#  define CUTLASS_CPLUSPLUS _MSVC_LANG
+#else
+#  define CUTLASS_CPLUSPLUS __cplusplus
+#endif
+
+#if (201700L <= CUTLASS_CPLUSPLUS)
+#define CUTLASS_CONSTEXPR_IF_CXX17 constexpr
+#define CUTLASS_CXX17_OR_LATER 1
+#else
+#define CUTLASS_CONSTEXPR_IF_CXX17
+#define CUTLASS_CXX17_OR_LATER 0
+#endif
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/detail/layout.hpp b/lightllm-kernel/cutlass/include/cutlass/detail/layout.hpp
new file mode 100755
index 000000000..cbed61f68
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/detail/layout.hpp
@@ -0,0 +1,406 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cute/layout.hpp"
+#include "cute/pointer_sparse.hpp"       // cute::is_sparse
+#include "cute/swizzle.hpp"              // cute::Swizzle
+#include "cute/swizzle_layout.hpp"       // cute::detail::get_swizzle_portion
+#include "cute/util/type_traits.hpp"
+#include "cute/arch/copy_sm90_tma.hpp"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/detail/collective.hpp"
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::detail {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// For each cutlass::layout, provides its corresponding cute stride types, 64b by default
+// Primary template: any type L without a specialization is passed through unchanged as `type`.
+template <class L>
+struct TagToStrideA {
+  using type = L;
+};
+
+// Row-major A, modes [M, K, L]: K has a static unit stride; M and L strides are dynamic int64_t
+template <>
+struct TagToStrideA<layout::RowMajor> {
+  using type = cute::Stride<int64_t, cute::Int<1>, int64_t>;
+  using tag = layout::RowMajor;
+};
+
+// Column-major A, modes [M, K, L]: M has a static unit stride; K and L strides are dynamic int64_t
+template <>
+struct TagToStrideA<layout::ColumnMajor> {
+  using type = cute::Stride<cute::Int<1>, int64_t, int64_t>;
+  using tag = layout::ColumnMajor;
+};
+// Primary template: any type L without a specialization is passed through unchanged as `type`.
+template <class L>
+struct TagToStrideB {
+  using type = L;
+};
+
+// Row-major B, modes [N, K, L]: N has a static unit stride; K and L strides are dynamic int64_t
+template <>
+struct TagToStrideB<layout::RowMajor> {
+  using type = cute::Stride<cute::Int<1>, int64_t, int64_t>;
+  using tag = layout::RowMajor;
+};
+
+// Column-major B, modes [N, K, L]: K has a static unit stride; N and L strides are dynamic int64_t
+template <>
+struct TagToStrideB<layout::ColumnMajor> {
+  using type = cute::Stride<int64_t, cute::Int<1>, int64_t>;
+  using tag = layout::ColumnMajor;
+};
+
+// Pointer-to-tag specializations (layout tag *): `type` is a pointer to the cute stride type, 64b by default.
+// Used by pointer array and grouped gemm; the L (batch) stride is a static zero in every case below.
+// Maps to modes [M, K, L]
+template <>
+struct TagToStrideA<layout::RowMajor *> {
+  using UnderlyingType = cute::Stride<int64_t, cute::Int<1>, cute::Int<0>>;
+  using type = UnderlyingType*;
+  using tag = layout::RowMajor;
+};
+
+// Maps to modes [M, K, L]
+template <>
+struct TagToStrideA<layout::ColumnMajor *> {
+  using UnderlyingType = cute::Stride<cute::Int<1>, int64_t, cute::Int<0>>;
+  using type = UnderlyingType*;
+  using tag = layout::ColumnMajor;
+};
+
+// Maps to modes [N, K, L]
+template <>
+struct TagToStrideB<layout::RowMajor *> {
+  using UnderlyingType = cute::Stride<cute::Int<1>, int64_t, cute::Int<0>>;
+  using type = UnderlyingType*;
+  using tag = layout::RowMajor;
+};
+
+// Maps to modes [N, K, L]
+template <>
+struct TagToStrideB<layout::ColumnMajor *> {
+  using UnderlyingType = cute::Stride<int64_t, cute::Int<1>, cute::Int<0>>;
+  using type = UnderlyingType*;
+  using tag = layout::ColumnMajor;
+};
+
+// Maps to modes [M, N, L]; reuses the TagToStrideA mapping for the same layout tag
+template <class LayoutTag>
+struct TagToStrideC : TagToStrideA<LayoutTag> { };
+
+// Conv: Maps to modes ((P,N), C, _0) for compatibility with GEMM epilogues expecting a batch mode stride
+template <>
+struct TagToStrideC<cutlass::layout::TensorNWC> {
+  using type = cute::Stride<cute::Stride<int64_t, int64_t>, cute::Int<1>, cute::Int<0>>;
+};
+
+// Conv: Maps to modes ((P,Q,N), C, _0) for compatibility with GEMM epilogues expecting a batch mode stride
+template <>
+struct TagToStrideC<cutlass::layout::TensorNHWC> {
+  using type = cute::Stride<cute::Stride<int64_t, int64_t, int64_t>, cute::Int<1>, cute::Int<0>>;
+};
+
+// Conv: Maps to modes ((P,Q,Z,N), C, _0) for compatibility with GEMM epilogues expecting a batch mode stride
+template <>
+struct TagToStrideC<cutlass::layout::TensorNDHWC> {
+  using type = cute::Stride<cute::Stride<int64_t, int64_t, int64_t, int64_t>, cute::Int<1>, cute::Int<0>>;
+};
+
+// Conv: Maps to modes (K, (C,S), _0) for compatibility with GEMM epilogues expecting a batch mode stride
+template <>
+struct TagToStrideC<cutlass::layout::TensorKCS> {
+  using type = cute::Stride<int64_t, cute::Stride<cute::Int<1>, int64_t>, cute::Int<0>>;
+};
+
+// Conv: Maps to modes (K, (C,S,R), _0) for compatibility with GEMM epilogues expecting a batch mode stride
+template <>
+struct TagToStrideC<cutlass::layout::TensorKCSR> {
+  using type = cute::Stride<int64_t, cute::Stride<cute::Int<1>, int64_t, int64_t>, cute::Int<0>>;
+};
+
+// Conv: Maps to modes (K, (C,S,R,T), _0) for compatibility with GEMM epilogues expecting a batch mode stride
+template <>
+struct TagToStrideC<cutlass::layout::TensorKCSRT> {
+  using type = cute::Stride<int64_t, cute::Stride<cute::Int<1>, int64_t, int64_t, int64_t>, cute::Int<0>>;
+};
+
+// Conv: Maps to modes ((C,S), K, _0) for compatibility with GEMM epilogues expecting a batch mode stride
+template <>
+struct TagToStrideC<cutlass::layout::TensorCSK> {
+  using type = cute::Stride<cute::Stride<cute::Int<1>, int64_t>, int64_t, cute::Int<0>>;
+};
+
+// Conv: Maps to modes ((C,S,R), K, _0) for compatibility with GEMM epilogues expecting a batch mode stride
+template <>
+struct TagToStrideC<cutlass::layout::TensorCSRK> {
+  using type = cute::Stride<cute::Stride<cute::Int<1>, int64_t, int64_t>, int64_t, cute::Int<0>>;
+};
+
+// Conv: Maps to modes ((C,S,R,T), K, _0) for compatibility with GEMM epilogues expecting a batch mode stride
+template <>
+struct TagToStrideC<cutlass::layout::TensorCSRTK> {
+  using type = cute::Stride<cute::Stride<cute::Int<1>, int64_t, int64_t, int64_t>, int64_t, cute::Int<0>>;
+};
+
+// Convenience aliases for the `::type` member of TagToStrideA / TagToStrideB / TagToStrideC
+template<class LayoutTag>
+using TagToStrideA_t = typename TagToStrideA<LayoutTag>::type;
+
+template<class LayoutTag>
+using TagToStrideB_t = typename TagToStrideB<LayoutTag>::type;
+
+template<class LayoutTag>
+using TagToStrideC_t = typename TagToStrideC<LayoutTag>::type;
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// For 2.x compatibility APIs, provide stride->layout tag mappers
+
+// True when the first element of mode `ModeIndex` is a static 1. Pointers to stride types are unwrapped first,
+// and stride types with or without a batch mode (including static-zero batch strides) are handled alike.
+template<int ModeIndex, class Stride>
+constexpr bool is_major(Stride = {}) {
+  using LeadingStride = decltype(cute::front(cute::get<ModeIndex>(cute::remove_pointer_t<Stride>{})));
+  return cute::is_constant<1, LeadingStride>::value;
+}
+
+// Layout overload: applies the same check to the layout's stride type.
+template<int ModeIndex, class Shape, class Stride>
+constexpr bool is_major(cute::Layout<Shape,Stride> = {}) {
+  return is_major<ModeIndex>(Stride{}); }
+
+// Deduces the layout tag of A from its stride type (can also be used for the C and D matrices).
+template<class StrideA>
+constexpr
+auto
+stride_to_layout_tag_A() {
+  using InternalStrideA = cute::remove_pointer_t<StrideA>;
+  if constexpr (cute::is_layout<InternalStrideA>::value) {
+    return stride_to_layout_tag_A<decltype(cute::stride(InternalStrideA{}))>();
+  }
+  else if constexpr (is_major<0, StrideA>()) { // M major
+    return layout::ColumnMajor{};
+  }
+  // Sparse layout: mode-0 stride equals static 2 and mode 1 is rank-2 with a static unit first stride
+  else if constexpr (cute::get<0>(InternalStrideA{}) == cute::_2{} && 
+                     cute::rank(cute::get<1>(InternalStrideA{})) == 2 && 
+                     cute::is_same_v<cute::_1, cute::remove_cvref_t<decltype(cute::get<1,0>(InternalStrideA{}))>>) {
+    return layout::ColumnMajor{};
+  }
+  else { // K major
+    return layout::RowMajor{};
+  }
+  // Every branch above returns. NOTE(review): the macro below presumably silences a missing-return diagnostic.
+  CUTE_GCC_UNREACHABLE;
+}
+// Deduces the layout tag of B from its stride type: RowMajor when mode 0 (N) is major, ColumnMajor otherwise.
+template<class StrideB>
+constexpr
+auto
+stride_to_layout_tag_B() {
+  using InternalStrideB = cute::remove_pointer_t<StrideB>;
+  if constexpr (cute::is_layout<InternalStrideB>::value) {
+    return stride_to_layout_tag_B<decltype(cute::stride(InternalStrideB{}))>();
+  }
+  else if constexpr (is_major<0, StrideB>()) { // N major
+    return layout::RowMajor{};
+  }
+  else { // K major
+    return layout::ColumnMajor{};
+  }
+  // Every branch above returns. NOTE(review): the macro below presumably silences a missing-return diagnostic.
+  CUTE_GCC_UNREACHABLE;
+}
+// Deduces the layout tag of C from its stride type: ColumnMajor when mode 0 (M) is major, RowMajor otherwise.
+template<class StrideC>
+constexpr
+auto
+stride_to_layout_tag_C() {
+  using InternalStrideC = cute::remove_pointer_t<StrideC>;
+  if constexpr (cute::is_layout<InternalStrideC>::value) {
+    return stride_to_layout_tag_C<decltype(cute::stride(InternalStrideC{}))>();
+  }
+  else if constexpr (is_major<0, StrideC>()) { // M major
+    return layout::ColumnMajor{};
+  }
+  else { // N major
+    return layout::RowMajor{};
+  }
+  // Every branch above returns. NOTE(review): the macro below presumably silences a missing-return diagnostic.
+  CUTE_GCC_UNREACHABLE;
+}
+
+// Type-level wrappers around stride_to_layout_tag_{A,B,C}: `type` is the layout tag deduced from stride type S
+template <class S>
+struct StrideToLayoutTagA {
+  using type = decltype(detail::stride_to_layout_tag_A<S>());
+};
+
+template <class S>
+struct StrideToLayoutTagB {
+  using type = decltype(detail::stride_to_layout_tag_B<S>());
+};
+
+template <class S>
+struct StrideToLayoutTagC {
+  using type = decltype(detail::stride_to_layout_tag_C<S>());
+};
+
+// Convenience aliases for the `::type` member of StrideToLayoutTagA / B / C
+template<class S>
+using StrideToLayoutTagA_t = typename StrideToLayoutTagA<S>::type;
+
+template<class S>
+using StrideToLayoutTagB_t = typename StrideToLayoutTagB<S>::type;
+
+template<class S>
+using StrideToLayoutTagC_t = typename StrideToLayoutTagC<S>::type;
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Reports whether the copy engine of a gmem tiled copy is TMA, i.e. whether
+// GmemTiledCopy derives from one of the SM90 TMA load/store operations listed below.
+template<class GmemTiledCopy>
+constexpr bool is_tma_copy_engine() {
+  // A void GmemTiledCopy is reported as non-TMA.
+  bool uses_tma = false;
+  if constexpr (!cute::is_void_v<GmemTiledCopy>) {
+    constexpr bool is_tma_load =
+         cute::is_base_of_v<cute::SM90_TMA_LOAD,                  GmemTiledCopy>
+      || cute::is_base_of_v<cute::SM90_TMA_LOAD_MULTICAST,        GmemTiledCopy>
+      || cute::is_base_of_v<cute::SM90_TMA_LOAD_IM2COL,           GmemTiledCopy>
+      || cute::is_base_of_v<cute::SM90_TMA_LOAD_IM2COL_MULTICAST, GmemTiledCopy>;
+    constexpr bool is_tma_store =
+         cute::is_base_of_v<cute::SM90_TMA_STORE,                 GmemTiledCopy>
+      || cute::is_base_of_v<cute::SM90_TMA_STORE_IM2COL,          GmemTiledCopy>;
+    uses_tma = is_tma_load || is_tma_store;
+  }
+  return uses_tma;
+}
+// RawDtype<X>::type is X::raw_type when X declares a nested `raw_type`, and X itself otherwise.
+template <class X, class = void>
+struct RawDtype { using type = X; };
+
+template <class X>
+struct RawDtype<X,cute::void_t<typename X::raw_type>> { using type = typename X::raw_type; };
+
+
+// Returns the alignment of a gmem TiledCopy, measured in elements:
+//   * void GmemTiledCopy -> 1; void Element (ElementC = void kernels) -> 0
+//   * TMA copies -> the number of Elements in 128 bits (times the sparsity factor for sparse ElementMma)
+//   * any other copy -> GmemTiledCopy::NumValSrc
+template <class GmemTiledCopy, class Element, class ElementMma = Element>
+constexpr int get_alignment_count_from_gmem_tiled_copy() {
+  if constexpr (cute::is_void_v<GmemTiledCopy>) {
+    return 1;
+  }
+  else if constexpr (cute::is_void_v<Element>) {
+    return 0;
+  }
+  else if constexpr (!is_tma_copy_engine<GmemTiledCopy>()) {
+    // Non-TMA tiled copies hold the alignment count directly.
+    return GmemTiledCopy::NumValSrc;
+  }
+  else {
+    // TMA tiled copies: the alignment has to be 128 bits, expressed here as a count of Elements.
+    constexpr auto tma_alignment_count = 128 / sizeof_bits<Element>::value;
+
+    if constexpr (cute::is_sparse_v<ElementMma>) {
+      // Sparse MMA: alignment in logical elements is increased by the sparsity factor.
+      return tma_alignment_count * ElementMma::sparsity;
+    }
+    else {
+      return tma_alignment_count;
+    }
+  }
+}
+
+// Alignment requirement, in bits, for the GEMM inputs.
+// The result is 128 for every ElementType.
+template <class ElementType>
+constexpr int get_input_alignment_bits() {
+  constexpr int input_alignment_bits = 128;
+  return input_alignment_bits;
+}
+
+// Alignment requirement, in bits, for the GEMM outputs.
+// The result is 128 for every ElementType.
+template <class ElementType>
+constexpr int get_output_alignment_bits() {
+  constexpr int output_alignment_bits = 128;
+  return output_alignment_bits;
+}
+
+// Checks whether a tensor layout satisfies an alignment of `Alignment` along its major mode.
+template<int Alignment, class Shape, class Stride>
+CUTLASS_HOST_DEVICE constexpr bool
+check_alignment(cute::Layout<Shape,Stride> const& layout) {
+  // Every dynamic stride must be a multiple of Alignment; static strides are accepted as they are.
+  bool strides_ok = cute::all_of(cute::flatten(layout.stride()),
+    [](auto s){ return cute::is_static<decltype(s)>::value || (s % Alignment == 0); });
+  // The shape must divide by Alignment without rounding.
+  bool size_ok = cute::size(layout.shape()) == Alignment * cute::size(cute::upcast<Alignment>(layout));
+  return size_ok && strides_ok;
+}
+
+// Overload taking shape and stride separately; builds a layout and forwards to the overload above.
+template<int Alignment, class Shape, class Stride>
+CUTLASS_HOST_DEVICE constexpr bool
+check_alignment(Shape const& shape, Stride const& stride) {
+  auto layout = cute::make_layout(shape, stride);
+  return check_alignment<Alignment>(layout);
+}
+
+template<int B, int M, int S>
+CUTLASS_HOST_DEVICE constexpr size_t
+alignment_for_swizzle(cute::Swizzle<B, M, S>) {
+  static_assert(B >= 0 && M >= 0);
+  size_t const log2_alignment = size_t(B + M + cute::abs(S));
+  return size_t(1) << log2_alignment;  // 2^(B + M + |S|)
+}
+
+// Layout overload: applies the Swizzle overload to the swizzle portion of `layout`.
+template<class Layout>
+CUTLASS_HOST_DEVICE constexpr size_t alignment_for_swizzle(Layout layout) {
+  auto swizzle = cute::detail::get_swizzle_portion(layout);
+  return alignment_for_swizzle(swizzle);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::detail
diff --git a/lightllm-kernel/cutlass/include/cutlass/detail/mma.hpp b/lightllm-kernel/cutlass/include/cutlass/detail/mma.hpp
new file mode 100755
index 000000000..0e491b9c4
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/detail/mma.hpp
@@ -0,0 +1,71 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/arch/mma.h"
+#include "cute/layout.hpp"
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::detail {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Trait: true_type when TiledMma declares a nested `ValTypeE` type, false_type otherwise.
+template <class TiledMma, class = void>
+struct IsSparseTensorOp : cute::false_type { };
+
+// TiledMma for sparse must have ValTypeE
+template <class TiledMma>
+struct IsSparseTensorOp<TiledMma, cute::void_t<typename TiledMma::ValTypeE>>
+    : cute::true_type { };
+
+// Extracts the OperatorClass tag from the TiledMma of a CUTLASS 3.x kernel:
+//   size<0>(AtomShape_MNK) <  8                      -> OpClassSimt
+//   size<0>(AtomShape_MNK) >= 8 and ValTypeE present -> OpClassSparseTensorOp
+//   size<0>(AtomShape_MNK) >= 8 otherwise            -> OpClassTensorOp
+template <class TiledMma>
+struct get_operator_class {
+  static constexpr bool is_sparse_op = IsSparseTensorOp<TiledMma>::value;
+  static constexpr bool is_tensor_op = cute::size<0>(typename TiledMma::AtomShape_MNK{}) >= 8;
+  using type = cute::conditional_t<
+      is_tensor_op,
+      cute::conditional_t<is_sparse_op,
+                          cutlass::arch::OpClassSparseTensorOp, cutlass::arch::OpClassTensorOp>,
+      cutlass::arch::OpClassSimt>;
+};
+
+// Shorthand for get_operator_class<T>::type.
+template <class T>
+using get_operator_class_t = typename get_operator_class<T>::type;
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::detail
diff --git a/lightllm-kernel/cutlass/include/cutlass/device_kernel.h b/lightllm-kernel/cutlass/include/cutlass/device_kernel.h
new file mode 100755
index 000000000..7af5d96cf
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/device_kernel.h
@@ -0,0 +1,125 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for generic CUTLASS kernel.
+*/
+
+#pragma once
+
+// __grid_constant__ was introduced in CUDA 11.7.
+#if ((__CUDACC_VER_MAJOR__ >= 12) || ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 7)))
+#  define CUTLASS_GRID_CONSTANT_SUPPORTED
+#endif
+
+// __grid_constant__ can be enabled only on SM70+
+#if defined(CUTLASS_GRID_CONSTANT_SUPPORTED) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700)
+#  define CUTLASS_GRID_CONSTANT_ENABLED
+#endif
+// CUTLASS_GRID_CONSTANT: `__grid_constant__` when enabled above, empty otherwise; an existing definition is kept.
+#if ! defined(CUTLASS_GRID_CONSTANT)
+#  if defined(CUTLASS_GRID_CONSTANT_ENABLED)
+#    define CUTLASS_GRID_CONSTANT __grid_constant__
+#  else
+#    define CUTLASS_GRID_CONSTANT
+#  endif
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+
+template <typename T> struct Type2Type { using type = T; };
+// GetUnderlyingKernel: identity for ordinary types; a Wrapper<shader_guid, index> inherits from the wrapper itself. Replaces the complex type with a simple one to reduce symbol size.
+template <typename T> struct GetUnderlyingKernel : public Type2Type<T> {};
+template <uint64_t shader_guid, unsigned index, template <uint64_t, unsigned> class Wrapper> struct GetUnderlyingKernel<Wrapper<shader_guid, index>> : public Wrapper<shader_guid, index> {};
+template <typename T> using GetUnderlyingKernel_t = typename GetUnderlyingKernel<T>::type;
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Generic CUTLASS kernel entry point: default-constructs an Operator and calls it
+/// with the launch parameters and a SharedStorage view of the dynamic shared memory.
+template <typename Operator>
+CUTLASS_GLOBAL
+void Kernel(typename Operator::Params params) {
+  using SharedStorage = typename Operator::SharedStorage;
+
+  // Dynamic shared memory base, reinterpreted as the operator's SharedStorage.
+  extern __shared__ int SharedStorageBase[];
+  SharedStorage &shared_storage = *reinterpret_cast<SharedStorage *>(SharedStorageBase);
+
+  Operator op;
+  op(params, shared_storage);
+  cutlass::arch::synclog_print();
+}
+
+
+/// Generic CUTLASS kernel entry point that dispatches through Operator::invoke
+/// instead of constructing an Operator instance.
+template <typename Operator>
+CUTLASS_GLOBAL
+void Kernel2(typename Operator::Params params) {
+  using SharedStorage = typename Operator::SharedStorage;
+
+  // Dynamic shared memory base, reinterpreted as the operator's SharedStorage.
+  extern __shared__ int SharedStorageBase[];
+  SharedStorage &shared_storage = *reinterpret_cast<SharedStorage *>(SharedStorageBase);
+
+  Operator::invoke(params, shared_storage);
+  cutlass::arch::synclog_print();
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// 3.0 specific launch
+//
+////////////////////////////////////////////////////////////////////////////////
+
+/// CUTLASS 3.x kernel entry point: calls a default-constructed Operator with the params and the raw shared-memory buffer.
+template <typename Operator>
+CUTLASS_GLOBAL
+#ifdef __CUDACC__
+// Enclosing this in __CUDACC__ suppresses MSVC warnings.
+__launch_bounds__(Operator::MaxThreadsPerBlock, Operator::MinBlocksPerMultiprocessor)
+#endif // __CUDACC__
+void device_kernel(CUTLASS_GRID_CONSTANT typename Operator::Params const params)
+{
+  // Dynamic shared memory base pointer; handed to the operator untyped, as a char array
+  extern __shared__ char smem[];
+  Operator op;
+  op(params, smem);
+  cutlass::arch::synclog_print();
+
+}
+
+////////////////////////////////////////////////////////////////////////////////
+} /// namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/builders/sm90_builder.inl b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/builders/sm90_builder.inl
new file mode 100755
index 000000000..759591b5d
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/builders/sm90_builder.inl
@@ -0,0 +1,812 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cute/atom/mma_traits_sm90.hpp"
+#include "cute/atom/mma_traits_sm90_gmma.hpp"
+#include "cute/atom/copy_traits_sm90.hpp"
+
+#include "cutlass/detail/dependent_false.hpp"
+#include "cutlass/detail/layout.hpp"
+#include "cutlass/gemm/collective/builders/sm90_common.inl"
+#include "cutlass/epilogue/dispatch_policy.hpp"
+#include "cutlass/epilogue/collective/collective_epilogue.hpp"
+#include "cutlass/epilogue/collective/builders/sm90_common.inl"
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/epilogue/thread/linear_combination_generic.h"
+#include "cutlass/epilogue/thread/linear_combination_bias_elementwise.h"
+#include "cutlass/epilogue/fusion/callbacks.hpp"
+#include "cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp"
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/type_traits>
+#else
+#include <type_traits>
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::epilogue::collective {
+
+///////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+// Returns the parameterized dispatch policy (stage counts, fragment size, smem reuse, store delay) for the TMA epilogue
+template<class TileShapeMNK, class EpilogueTileMN, class ElementC, class ElementD, class Schedule>
+constexpr auto
+sm90_get_tma_dispatch_policy() {
+  using namespace cute;
+
+  constexpr int EpiTiles = size(shape_div(take<0,2>(TileShapeMNK{}), EpilogueTileMN{}));  // epilogue tiles per (M,N) CTA tile
+  constexpr int FragmentSize = size(EpilogueTileMN{}) / (detail::sm90_is_cooperative_v<Schedule> ? 256 : 128);
+  // 8b residuals load fast and consume little smem, so the perf cost of waiting on stores to finish outweighs the cost of extra allocation
+  constexpr bool ReuseSmem = (sizeof_bits_v<ElementC> == sizeof_bits_v<ElementD>) && (sizeof_bits_v<ElementD> > 8);
+  // TMA store delay performs worse with residual loads and complicates tensormap updates for Ptr-Array GEMMs
+  constexpr bool DelayTmaStore = is_void_v<ElementC> && !detail::sm90_is_ptr_array_tma_v<Schedule>;
+  constexpr int StagesD = cute::min(EpiTiles, 2);
+  constexpr int StagesC = ReuseSmem ? cute::max(cute::min(EpiTiles, 4), StagesD+1)
+                                    : cute::min(EpiTiles, 4);  // with ReuseSmem, at least StagesD+1 C stages
+
+  if constexpr (detail::sm90_is_ptr_array_tma_v<Schedule>) {
+      return Sm90PtrArrayTmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmem, 
+                                            DelayTmaStore, Schedule::NumEpilogueWarpGroups>{};
+  } 
+  else {
+    return Sm90TmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmem, DelayTmaStore>{};
+  }
+}
+
+// Returns the smem layout atom to be used for the C or D matrix, chosen from which mode of the gmem stride is major
+template<class GmemStrideType, class Element, class EpilogueTile_MN>
+constexpr auto
+sm90_get_epilogue_smem_swizzle_layout_atom() {
+  using namespace cute;
+
+  // ColMajor C/D (M-major): mode 0 is the major mode
+  if constexpr (cutlass::gemm::detail::is_major<0>(GmemStrideType{})) {
+    return cutlass::gemm::collective::detail::ss_smem_selector<
+      cute::GMMA::Major::MN, Element, decltype(get<0>(EpilogueTile_MN{})), decltype(get<1>(EpilogueTile_MN{}))
+    >();
+  }
+  // RowMajor C/D (N-major): mode 1 is the major mode
+  else if constexpr (cutlass::gemm::detail::is_major<1>(GmemStrideType{})) {
+    return cutlass::gemm::collective::detail::ss_smem_selector<
+      cute::GMMA::Major::K , Element, decltype(get<0>(EpilogueTile_MN{})), decltype(get<1>(EpilogueTile_MN{}))
+    >();
+  }
+  else {
+    static_assert(cutlass::detail::dependent_false<GmemStrideType>, "Unsupported gmem layout.");  // neither mode 0 nor mode 1 is major
+  }
+}
+
+// Computes a default epilogue tile from the CTA tile shape and schedule (EpilogueTileAuto), or validates and returns a user-provided tile.
+template <class ElementD, class EpilogueTileType, class Schedule, class TileShape_MNK>
+constexpr auto
+sm90_compute_tile_shape_or_override() {
+  if constexpr (cute::is_same_v<EpilogueTileType, EpilogueTileAuto>) {
+    auto epi_tile = [&] () {
+      if constexpr (detail::sm90_is_cooperative_v<Schedule>) {
+        auto tile_m = cute::min(_128{}, size<0>(TileShape_MNK{}));  // cooperative: at most 128 x 32
+        auto tile_n = cute::min(_32{}, size<1>(TileShape_MNK{}));
+        return make_shape(tile_m, tile_n);
+      }
+      else if constexpr (detail::sm90_is_warp_specialized_v<Schedule>) {
+        constexpr int N_perf = sizeof_bits_v<ElementD> == 8 ? 64 : 32;  // wider N cap for 8-bit D
+        auto tile_m = cute::min(_64{}, size<0>(TileShape_MNK{}));
+        auto tile_n = cute::min(Int<N_perf>{}, size<1>(TileShape_MNK{}));
+        return make_shape(tile_m, tile_n);
+      }
+      else {
+        static_assert(cutlass::detail::dependent_false<Schedule>, "Unsupported schedule.");
+      }
+    }();
+
+    return cute::transform(epi_tile, seq<0,1>{},  // visit the M and N modes independently
+      [] (auto epi_tiler, auto I) {
+        auto cta_tiler = make_layout(get<I>(TileShape_MNK{}));
+        // This is a multimodal CTA tiler, transform before returning
+        if constexpr (depth(cta_tiler) > 0) {
+          // This is an implicit multimodal tiler, match profile and return
+          if constexpr (tuple_size_v<decltype(shape(cta_tiler))> == 1) {
+            return make_tile(epi_tiler);
+          }
+          // This is an explicit multimodal tiler, compose out epi tiler
+          else {
+            return composition(cta_tiler, epi_tiler);
+          }
+        }
+        // This is a flat CTA tiler, no need for transformation
+        else {
+          return epi_tiler;
+        }
+      });
+  }
+  else if constexpr (cute::is_tuple<EpilogueTileType>::value) {
+    EpilogueTileType epi_tile;
+    constexpr int M = size<0>(shape(epi_tile));
+    constexpr int N = size<1>(shape(epi_tile));
+
+    static_assert(!is_layout<EpilogueTileType>::value, "EpilogueTile must be a cute::Tile or cute::Shape");
+    static_assert(M ==  64 && detail::sm90_is_warp_specialized_v<Schedule> ||
+                  M == 128 && detail::sm90_is_cooperative_v<Schedule>, "Unsupported tile shape");  // M is tied to the schedule kind
+    static_assert(N % 16 == 0, "Unsupported tile shape");
+
+    return epi_tile;  // user-provided tile is returned unchanged once validated
+  }
+  else {
+    static_assert(cutlass::detail::dependent_false<EpilogueTileType>, "Invalid type for EpilogueTileType.");
+  }
+}
+
+// Callbacks builder for fusions with exactly one non-subbyte aux tensor (aux in or aux out); supplies its smem layout atom and copy op
+template <
+  int StagesC,
+  int StagesD,
+  int FragmentSize,
+  bool ReuseSmemC,
+  bool DelayTmaStore,
+  class FusionOp,
+  class TileShape_MNK,
+  class EpilogueTile_MN,
+  class ElementAccumulator
+>
+struct CallbacksBuilder<
+  Sm90TmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmemC, DelayTmaStore>,
+  FusionOp,
+  TileShape_MNK,
+  EpilogueTile_MN,
+  ElementAccumulator,
+  cute::enable_if_t<(FusionOp::IsAuxOutSupported ^ FusionOp::IsAuxInSupported) // only one aux tensor
+              && not cute::is_subbyte_v<typename FusionOp::ElementAux>>
+> {
+  using GmemStrideTypeAux = gemm::TagToStrideC_t<typename FusionOp::GmemLayoutTagAux>;
+  using SmemLayoutAtomAux = decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom<
+    GmemStrideTypeAux, typename FusionOp::ElementAux, EpilogueTile_MN>());
+  using CopyOpR2S = decltype(detail::sm90_get_smem_store_op_for_accumulator<
+    GmemStrideTypeAux, typename FusionOp::ElementAux>());
+  using CopyOpS2R = decltype(detail::sm90_get_smem_load_op_for_source<
+    GmemStrideTypeAux, typename FusionOp::ElementAux>());
+  using SmemCopyOpAux = cute::conditional_t<FusionOp::IsAuxOutSupported, CopyOpR2S, CopyOpS2R>;  // R2S for aux out, S2R for aux in
+
+  using Callbacks = fusion::FusionCallbacks<
+    Sm90TmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmemC, DelayTmaStore>,
+    FusionOp, TileShape_MNK, EpilogueTile_MN,
+    SmemLayoutAtomAux, SmemCopyOpAux
+  >;
+};
+
+template <
+  int StagesC,
+  int StagesD,
+  int FragmentSize,
+  bool ReuseSmemC,
+  bool DelayTmaStore,
+  class FusionOp,
+  class TileShape_MNK,
+  class EpilogueTile_MN,
+  class ElementAccumulator
+>
+struct CallbacksBuilder<
+  Sm90TmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmemC, DelayTmaStore>,
+  FusionOp,
+  TileShape_MNK,
+  EpilogueTile_MN,
+  ElementAccumulator,
+  cute::enable_if_t<(FusionOp::IsAuxOutSupported ^ FusionOp::IsAuxInSupported) // only one aux tensor
+              && sizeof_bits_v<typename FusionOp::ElementAux> == 1> // and its elements are exactly 1 bit wide
+> {
+  using Callbacks = fusion::FusionCallbacks<
+    Sm90TmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmemC, DelayTmaStore>,
+    FusionOp, TileShape_MNK, EpilogueTile_MN,
+    Layout<_1,_0>, DefaultCopy // aux bit tensor doesn't use smem, so a trivial layout and default copy are passed
+  >;
+};
+
+// Helper for building TMA warp-specialized collective epilogues, specialized by
+// the fusion operation performed and the dispatch policy to use.
+template <
+  class TileShape_MNK,
+  class EpilogueTile_MN,
+  class ElementAccumulator,
+  class ElementCompute,
+  class ElementC_,
+  class GmemLayoutTagC_,
+  int AlignmentC,
+  class ElementD_,
+  class GmemLayoutTagD,
+  int AlignmentD,
+  class FusionOpOrCallbacks,
+  class DispatchPolicy
+>
+struct Sm90TmaBuilderImpl {
+  // Passing void D disables destination store + smem allocation
+  using ElementD = cute::conditional_t<cute::is_void_v<ElementD_>,
+                     fusion::get_element_aux_t<FusionOpOrCallbacks>, ElementD_>;  // void D falls back to the fusion's aux element type
+
+  // Passing void C disables source load + smem allocation
+  using ElementC = cute::conditional_t<cute::is_void_v<ElementC_>,ElementD,ElementC_>; // prevents void ref breakages
+  using GmemLayoutTagC = cute::conditional_t<cute::is_void_v<ElementC_>,GmemLayoutTagD,GmemLayoutTagC_>;
+
+  using GmemStrideTypeC = cutlass::detail::TagToStrideC_t<GmemLayoutTagC>;
+  using GmemStrideTypeD = cutlass::detail::TagToStrideC_t<GmemLayoutTagD>;
+  
+  using UnderlyingGmemStrideTypeC = cute::remove_pointer_t<GmemStrideTypeC>;  // stride type with any pointer stripped
+  using UnderlyingGmemStrideTypeD = cute::remove_pointer_t<GmemStrideTypeD>;
+
+  using CopyOpS2G = cute::conditional_t<detail::is_im2col_mode<GmemLayoutTagD>,
+      SM90_TMA_STORE_IM2COL,
+      SM90_TMA_STORE
+    >;
+  using CopyOpG2S = cute::conditional_t<detail::is_im2col_mode<GmemLayoutTagC>,
+      SM90_TMA_LOAD_IM2COL,
+      SM90_TMA_LOAD
+    >;
+
+  // Get the smallest tiled copy we can use to retile the accumulators
+  using CopyAtomC = Copy_Atom<SM90_U32x4_STSM_N, cutlass::half_t>;
+  // Register-to-register tiled copy that happens before the shared memory store.
+  // Set to void because no register transform op is currently needed.
+  using CopyOpR2R = void;
+
+  // TMA builder allows for passing callbacks directly, which is either a fusion::FusionCallbacks
+  // instance or a direct visitor implementation, e.g. fusion::Sm90LinearCombination
+  using FusionCallbacks = 
+    typename CallbacksBuilder<
+      DispatchPolicy,
+      FusionOpOrCallbacks,
+      TileShape_MNK,
+      EpilogueTile_MN,
+      ElementAccumulator
+    >::Callbacks;
+
+  using CollectiveOp = cutlass::epilogue::collective::CollectiveEpilogue<
+      DispatchPolicy,
+      TileShape_MNK,
+      EpilogueTile_MN,
+      ElementC_, // Need to pass void through to expose via GemmUniversal
+      GmemStrideTypeC,
+      ElementD_, // likewise passed through unsubstituted
+      GmemStrideTypeD,
+      FusionCallbacks,
+      CopyOpG2S,
+      decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom<UnderlyingGmemStrideTypeC, ElementC, EpilogueTile_MN>()),
+      decltype(detail::sm90_get_smem_load_op_for_source<UnderlyingGmemStrideTypeC, ElementC>()),
+      CopyOpS2G,
+      decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom<UnderlyingGmemStrideTypeD, ElementD, EpilogueTile_MN>()),
+      decltype(detail::sm90_get_smem_store_op_for_accumulator<UnderlyingGmemStrideTypeD, ElementD>()),
+      CopyAtomC,
+      CopyOpR2R
+    >;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// Descriptor classes for defining EVT nodes.
+// Some of the epilogue visitor nodes require non-intuitive template arguments
+// such as CopyOpS2R for AuxLoad node. Traditionally, these are resolved by the
+// builder classes. Here we provide a set of descriptor classes that resolve
+// these template arguments from more intuitive types such as Stride and Layout.
+
+// Get TileShape, EpilogueTile, DispatchPolicy, StagesC, and StagesD
+template<
+  typename TileShape_MNK,
+  typename EpilogueTileType, 
+  typename ElementC,
+  typename ElementD,
+  typename Schedule
+>
+struct EpilogueDescriptor {
+  using TileShape = TileShape_MNK;  // passed through unchanged
+  using EpilogueTile = 
+    decltype(
+      detail::sm90_compute_tile_shape_or_override<
+        ElementD, EpilogueTileType, Schedule, TileShape_MNK
+      >()
+    );
+  using DispatchPolicy = 
+    decltype(
+      detail::sm90_get_tma_dispatch_policy<
+        TileShape_MNK, EpilogueTile, 
+        ElementC, ElementD, Schedule
+      >()
+    );
+  constexpr static int StagesC = DispatchPolicy::StagesC;  // re-exported from the deduced dispatch policy
+  constexpr static int StagesD = DispatchPolicy::StagesD;
+};
+
+// Get Stages, Stride, SmemLayoutAtom, and CopyOpS2R for AuxLoad node
+template<
+  typename EpilogueDescriptor,
+  typename StrideOrLayoutTag,
+  typename ElementAux
+>
+struct AuxLoadDescriptor {
+  constexpr static int Stages = EpilogueDescriptor::StagesC;  // aux load uses the C stage count
+  using EpilogueTile = typename EpilogueDescriptor::EpilogueTile;
+  using Element = ElementAux;
+  using Stride = cutlass::detail::TagToStrideC_t<StrideOrLayoutTag>;
+  using SmemLayoutAtom =
+    decltype(
+      detail::sm90_get_epilogue_smem_swizzle_layout_atom<
+        Stride, ElementAux, typename EpilogueDescriptor::EpilogueTile
+      >()
+    );
+  using CopyOpS2R =
+    decltype(detail::sm90_get_smem_load_op_for_source<Stride, ElementAux>());
+};
+
+// Get Stages, Stride, SmemLayoutAtom, and CopyOpR2S for AuxStore node
+template<
+  typename EpilogueDescriptor,
+  typename StrideOrLayoutTag,
+  typename ElementAux
+>
+struct AuxStoreDescriptor {
+  constexpr static int Stages = EpilogueDescriptor::StagesD;  // aux store uses the D stage count
+  using EpilogueTile = typename EpilogueDescriptor::EpilogueTile;
+  using Element = ElementAux;
+  using Stride = cutlass::detail::TagToStrideC_t<StrideOrLayoutTag>;
+  using SmemLayoutAtom =
+    decltype(
+      detail::sm90_get_epilogue_smem_swizzle_layout_atom<
+        Stride, ElementAux, typename EpilogueDescriptor::EpilogueTile
+      >()
+    );
+  using CopyOpR2S =
+    decltype(detail::sm90_get_smem_store_op_for_accumulator<Stride, ElementAux>());
+};
+
+} // namespace detail
+
+///////////////////////////////////////////////////////////////////////////////
+
+// No-smem builder: LinearCombination epilogue for the NoSmemWarpSpecialized and PtrArrayNoSmemWarpSpecialized schedules
+template <
+  class OpClass,
+  class TileShape_MNK,
+  class ClusterShape_MNK,
+  class EpilogueTileType,
+  class ElementAccumulator,
+  class ElementCompute,
+  class ElementC_,
+  class GmemLayoutTagC_,
+  int AlignmentC,
+  class ElementD,
+  class GmemLayoutTagD,
+  int AlignmentD,
+  class Schedule,
+  FloatRoundStyle RoundStyle
+>
+struct CollectiveBuilder<
+    arch::Sm90,
+    OpClass,
+    TileShape_MNK,
+    ClusterShape_MNK,
+    EpilogueTileType,
+    ElementAccumulator,
+    ElementCompute,
+    ElementC_,
+    GmemLayoutTagC_,
+    AlignmentC,
+    ElementD,
+    GmemLayoutTagD,
+    AlignmentD,
+    Schedule,
+    fusion::LinearCombination<ElementD,ElementCompute,ElementC_,ElementCompute,RoundStyle>,
+    cute::enable_if_t<cute::is_same_v<Schedule, NoSmemWarpSpecialized> ||
+                      cute::is_same_v<Schedule, PtrArrayNoSmemWarpSpecialized> >> {
+
+  // Passing void C disables source load
+  using ElementC = cute::conditional_t<cute::is_void_v<ElementC_>,
+      ElementD, ElementC_>; // prevents cute breakages
+  using GmemLayoutTagC = cute::conditional_t<cute::is_void_v<ElementC_>,
+      GmemLayoutTagD, GmemLayoutTagC_>;
+  static constexpr thread::ScaleType::Kind ScaleType = cute::is_void_v<ElementC_> ?
+      thread::ScaleType::OnlyAlphaScaling : thread::ScaleType::Default;  // void C selects alpha-only scaling
+
+  static constexpr int FragmentSize = 1;
+  using ThreadOp = thread::LinearCombination<
+    ElementD, FragmentSize, ElementAccumulator, ElementCompute,
+    ScaleType, RoundStyle, ElementC>;
+
+  using CollectiveOp = cute::conditional_t<
+    cute::is_same_v<Schedule, NoSmemWarpSpecialized>,
+    cutlass::epilogue::collective::detail::Sm90TmaWarpSpecializedAdapter<  // NoSmemWarpSpecialized: DefaultEpilogue
+      cutlass::epilogue::collective::DefaultEpilogue<
+        cutlass::detail::TagToStrideC_t<GmemLayoutTagC>,
+        cutlass::detail::TagToStrideC_t<GmemLayoutTagD>,
+        ThreadOp,
+        cutlass::gemm::EpilogueDefault>>,
+    // Epilogue for Ptr-Array and Grouped Gemm
+    cutlass::epilogue::collective::detail::Sm90TmaWarpSpecializedAdapter<
+      cutlass::epilogue::collective::DefaultEpilogueArray<
+        cutlass::detail::TagToStrideC_t<GmemLayoutTagC>,
+        cutlass::detail::TagToStrideC_t<GmemLayoutTagD>,
+        ThreadOp,
+        Schedule>>
+    >;
+};
+
+// TMA warp-specialized builder: deduces the epilogue tile and dispatch policy, then defers to detail::Sm90TmaBuilderImpl
+template <
+  class OpClass,
+  class TileShape_MNK,
+  class ClusterShape_MNK,
+  class EpilogueTileType,
+  class ElementAccumulator,
+  class ElementCompute,
+  class ElementC,
+  class GmemLayoutTagC,
+  int AlignmentC,
+  class ElementD_,
+  class GmemLayoutTagD,
+  int AlignmentD,
+  class Schedule,
+  class FusionOperation
+>
+struct CollectiveBuilder<
+    arch::Sm90,
+    OpClass,
+    TileShape_MNK,
+    ClusterShape_MNK,
+    EpilogueTileType,
+    ElementAccumulator,
+    ElementCompute,
+    ElementC,
+    GmemLayoutTagC,
+    AlignmentC,
+    ElementD_,
+    GmemLayoutTagD,
+    AlignmentD,
+    Schedule,
+    FusionOperation,
+    cute::enable_if_t<cute::is_same_v<Schedule, TmaWarpSpecialized> ||
+                      cute::is_same_v<Schedule, TmaWarpSpecializedCooperative> ||
+                      detail::sm90_is_ptr_array_tma_v<Schedule>>> {
+private:
+  using ElementD = cute::conditional_t<cute::is_void_v<ElementD_>,
+                     fusion::get_element_aux_t<FusionOperation>, ElementD_>;  // void D falls back to the fusion's aux element type
+  using EpilogueTile_MN =
+    decltype(detail::sm90_compute_tile_shape_or_override<ElementD, EpilogueTileType, Schedule, TileShape_MNK>());
+  using DispatchPolicy =
+    decltype(detail::sm90_get_tma_dispatch_policy<TileShape_MNK,EpilogueTile_MN,ElementC,ElementD,Schedule>());
+
+public:
+  using CollectiveOp =
+    typename detail::Sm90TmaBuilderImpl<
+      TileShape_MNK,
+      EpilogueTile_MN,
+      ElementAccumulator,
+      ElementCompute,
+      ElementC,
+      GmemLayoutTagC,
+      AlignmentC,
+      ElementD_, // the original (possibly void) D is forwarded, not the substituted ElementD
+      GmemLayoutTagD,
+      AlignmentD,
+      FusionOperation,
+      DispatchPolicy
+    >::CollectiveOp;
+};
+
+// Auto builder: maps EpilogueScheduleAuto onto the NoSmemWarpSpecialized builder; only the default LinearCombination fusion is accepted
+template <
+  class OpClass,
+  class TileShape_MNK,
+  class ClusterShape_MNK,
+  class EpilogueTileType,
+  class ElementAccumulator,
+  class ElementCompute,
+  class ElementC,
+  class GmemLayoutTagC,
+  int AlignmentC,
+  class ElementD,
+  class GmemLayoutTagD,
+  int AlignmentD,
+  class FusionOperation
+>
+struct CollectiveBuilder<
+    arch::Sm90,
+    OpClass,
+    TileShape_MNK,
+    ClusterShape_MNK,
+    EpilogueTileType,
+    ElementAccumulator,
+    ElementCompute,
+    ElementC,
+    GmemLayoutTagC,
+    AlignmentC,
+    ElementD,
+    GmemLayoutTagD,
+    AlignmentD,
+    EpilogueScheduleAuto,
+    FusionOperation,
+    void> {
+private:
+  static_assert(cute::is_same_v<FusionOperation, fusion::LinearCombination<ElementD,ElementCompute,ElementC,ElementCompute>>,
+                "Auto schedule doesn't support fusion. Use one of the TmaWarpSpecialized schedules instead.");
+
+  // Pick No-Smem epilogue as the Auto Epilogue Schedule (Auto schedules do not guarantee best performance)
+  // since TMA epilogues are not compatible with non-TMA non-WS mainloops
+  using EpilogueSchedule = NoSmemWarpSpecialized;
+  using _CollectiveBuilder = CollectiveBuilder<
+    arch::Sm90,
+    OpClass,
+    TileShape_MNK,
+    ClusterShape_MNK,
+    EpilogueTileType,
+    ElementAccumulator,
+    ElementCompute,
+    ElementC,
+    GmemLayoutTagC,
+    AlignmentC,
+    ElementD,
+    GmemLayoutTagD,
+    AlignmentD,
+    EpilogueSchedule,
+    FusionOperation
+  >;
+
+public:
+  using CollectiveOp = typename _CollectiveBuilder::CollectiveOp;  // re-export the forwarded builder's result
+};
+
+// DEPRECATED Tma warp-specialized builder for elementwise fusion; forwards to the non-deprecated builder with fusion::LinCombEltAct
+template <
+  class OpClass,
+  class TileShape_MNK,
+  class ClusterShape_MNK,
+  class EpilogueTileType,
+  class ElementAccumulator,
+  class ElementCompute,
+  class ElementC,
+  class GmemLayoutTagC,
+  int AlignmentC,
+  class ElementD,
+  class GmemLayoutTagD,
+  int AlignmentD,
+  class Schedule,
+  class UnusedFusionOp
+>
+struct [[deprecated("Use TmaWarpSpecialized with fusion::LinCombEltAct instead")]]
+CollectiveBuilder<
+    arch::Sm90,
+    OpClass,
+    TileShape_MNK,
+    ClusterShape_MNK,
+    EpilogueTileType,
+    ElementAccumulator,
+    ElementCompute,
+    ElementC,
+    GmemLayoutTagC,
+    AlignmentC,
+    ElementD,
+    GmemLayoutTagD,
+    AlignmentD,
+    Schedule,
+    UnusedFusionOp,
+    cute::enable_if_t<cute::is_base_of_v<TmaWarpSpecializedElementwiseBase, Schedule> ||
+                      cute::is_base_of_v<TmaWarpSpecializedCooperativeElementwiseBase, Schedule> >> {
+private:
+  using FusionOp =  // built from the activation functor and rounding mode carried by the schedule
+    fusion::LinCombEltAct<Schedule::template ActivationFunctor, ElementD, ElementCompute, ElementC, ElementCompute, Schedule::Round>;
+  using ImplSchedule =  // non-cooperative base maps to TmaWarpSpecialized, otherwise the cooperative schedule
+    cute::conditional_t<cute::is_base_of_v<TmaWarpSpecializedElementwiseBase, Schedule>,
+      TmaWarpSpecialized, TmaWarpSpecializedCooperative>;
+
+public:
+  using CollectiveOp =
+    typename CollectiveBuilder<
+      arch::Sm90,
+      OpClass,
+      TileShape_MNK,
+      ClusterShape_MNK,
+      EpilogueTileType,
+      ElementAccumulator,
+      ElementCompute,
+      ElementC,
+      GmemLayoutTagC,
+      AlignmentC,
+      ElementD,
+      GmemLayoutTagD,
+      AlignmentD,
+      ImplSchedule,
+      FusionOp
+    >::CollectiveOp;
+};
+
+// DEPRECATED Tma warp-specialized builder for bias + elementwise fusion (with or without an aux output, per Schedule::StoreT)
+template <
+  class OpClass,
+  class TileShape_MNK,
+  class ClusterShape_MNK,
+  class EpilogueTileType,
+  class ElementAccumulator,
+  class ElementCompute,
+  class ElementC_,
+  class GmemLayoutTagC_,
+  int AlignmentC,
+  class ElementD,
+  class GmemLayoutTagD,
+  int AlignmentD,
+  class Schedule,
+  class UnusedFusionOp
+>
+struct [[deprecated("Use TmaWarpSpecialized with fusion::LinCombPerRowBiasEltAct or fusion::LinCombPerRowBiasEltActAux instead")]]
+CollectiveBuilder<
+    arch::Sm90,
+    OpClass,
+    TileShape_MNK,
+    ClusterShape_MNK,
+    EpilogueTileType,
+    ElementAccumulator,
+    ElementCompute,
+    ElementC_,
+    GmemLayoutTagC_,
+    AlignmentC,
+    ElementD,
+    GmemLayoutTagD,
+    AlignmentD,
+    Schedule,
+    UnusedFusionOp,
+    cute::enable_if_t<cute::is_base_of_v<TmaWarpSpecializedBiasElementwiseBase, Schedule> ||
+                      cute::is_base_of_v<TmaWarpSpecializedCooperativeBiasElementwiseBase, Schedule> >> {
+private:
+  using EpilogueTile_MN = decltype(detail::sm90_compute_tile_shape_or_override<
+    ElementD, EpilogueTileType, Schedule, TileShape_MNK>());
+  // MSVC doesn't seem to be able to deduce DispatchPolicy correctly if it's
+  // defined as decltype of a detail::sm90_get_tma_dispatch_policy call.
+  // Instead, the policy is spelled out by hand here.  A natural refactoring
+  // would be to create a type alias in the detail namespace.
+  using DispatchPolicy = Sm90TmaWarpSpecialized<
+    /* StagesC = */ size(shape_div(take<0, 2>(TileShape_MNK{}), EpilogueTile_MN{})),
+    /* StagesD = */ 2,
+    /* FragmentSize = */ size(EpilogueTile_MN{}) / (detail::sm90_is_cooperative_v<Schedule> ? 256 : 128),
+    /* ReuseSmemC = */ sizeof_bits_v<ElementC_> == sizeof_bits_v<ElementD>,
+    /* DelayTmaStore = */ false
+  >;
+
+  using GmemStrideTypeAux = gemm::TagToStrideC_t<GmemLayoutTagD>;  // aux tensor shares D's layout tag
+  using SmemLayoutAtomAux = decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom<
+    GmemStrideTypeAux, typename Schedule::ElementT, EpilogueTile_MN>());
+  using SmemCopyOpAux = decltype(detail::sm90_get_smem_store_op_for_accumulator<
+    GmemStrideTypeAux, typename Schedule::ElementT>());
+  using FusionOperationAux = fusion::LinCombPerRowBiasEltActAux<
+    GmemLayoutTagD, Schedule::template ActivationFunctor, ElementD, ElementCompute,
+    typename Schedule::ElementT, typename Schedule::ElementBias, ElementC_, ElementCompute
+  >;
+  using FusionCallbacksAux = fusion::FusionCallbacks<
+    DispatchPolicy, FusionOperationAux, TileShape_MNK, EpilogueTile_MN, SmemLayoutAtomAux, SmemCopyOpAux
+  >;
+
+  using FusionOperationNoAux = fusion::LinCombPerRowBiasEltAct<
+    Schedule::template ActivationFunctor, ElementD, ElementCompute,
+    typename Schedule::ElementBias, ElementC_, ElementCompute
+  >;
+  using FusionCallbacksNoAux = fusion::FusionCallbacks<
+    DispatchPolicy, FusionOperationNoAux, TileShape_MNK, EpilogueTile_MN
+  >;
+
+  using ElementC = cute::conditional_t<cute::is_void_v<ElementC_>,ElementD,ElementC_>; // prevents void ref breakages
+  using GmemLayoutTagC = cute::conditional_t<cute::is_void_v<ElementC_>,GmemLayoutTagD,GmemLayoutTagC_>;
+
+  using GmemStrideTypeC = gemm::TagToStrideC_t<GmemLayoutTagC>;
+  using GmemStrideTypeD = gemm::TagToStrideC_t<GmemLayoutTagD>;
+
+  // Get the smallest tiled copy we can use to retile the accumulators
+  using CopyAtomC = Copy_Atom<SM90_U32x4_STSM_N, cutlass::half_t>;
+  // Register-to-register tiled copy that happens before the shared memory store.
+  // Set to void because no register transform op is needed.
+  using CopyOpR2R = void;
+
+public:
+  using CollectiveOp = cutlass::epilogue::collective::Sm90EpilogueTmaWarpSpecializedBiasElementwise<
+      DispatchPolicy::StagesC,
+      DispatchPolicy::StagesD,
+      DispatchPolicy::FragmentSize,
+      TileShape_MNK,
+      EpilogueTile_MN,
+      ElementC_, // Need to pass void through to expose via GemmUniversal
+      GmemStrideTypeC,
+      ElementD,
+      GmemStrideTypeD,
+      cute::conditional_t<Schedule::StoreT, FusionCallbacksAux, FusionCallbacksNoAux>,  // StoreT selects the aux variant
+      SM90_TMA_LOAD,
+      decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom<GmemStrideTypeC, ElementC, EpilogueTile_MN>()),
+      decltype(detail::sm90_get_smem_load_op_for_source<GmemStrideTypeC, ElementC>()),
+      SM90_TMA_STORE,
+      decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom<GmemStrideTypeD, ElementD, EpilogueTile_MN>()),
+      decltype(detail::sm90_get_smem_store_op_for_accumulator<GmemStrideTypeD, ElementD>()),
+      CopyAtomC,
+      CopyOpR2R
+    >;
+};
+
+// The CollectiveBuilder below uses a transposed epilogue and serves sm90 GMMA RS TT kernels:
+// swapping the input matrices of an NNN kernel and transposing its output at the same time
+// yields a TTN kernel.
+template <
+  class OpClass,
+  class TileShape_MNK,
+  class ClusterShape_MNK,
+  class EpilogueTileType,
+  class ElementAccumulator,
+  class ElementCompute,
+  class ElementC_,
+  class GmemLayoutTagC_,
+  int AlignmentC,
+  class ElementD,
+  class GmemLayoutTagD,
+  int AlignmentD,
+  FloatRoundStyle RoundStyle
+>
+struct CollectiveBuilder<
+    arch::Sm90,
+    OpClass,
+    TileShape_MNK,
+    ClusterShape_MNK,
+    EpilogueTileType,
+    ElementAccumulator,
+    ElementCompute,
+    ElementC_,
+    GmemLayoutTagC_,
+    AlignmentC,
+    ElementD,
+    GmemLayoutTagD,
+    AlignmentD,
+    cutlass::gemm::EpilogueTransposed,
+    fusion::LinearCombination<ElementD,ElementCompute,ElementC_,ElementCompute,RoundStyle>,
+    void> {
+  // Passing void C disables source load
+  using ElementC = cute::conditional_t<cute::is_void_v<ElementC_>,
+      ElementD, ElementC_>; // prevents cute breakages
+  using GmemLayoutTagC = cute::conditional_t<cute::is_void_v<ElementC_>,
+      GmemLayoutTagD, GmemLayoutTagC_>;
+  static constexpr thread::ScaleType::Kind ScaleType = cute::is_void_v<ElementC_> ?
+      thread::ScaleType::OnlyAlphaScaling : thread::ScaleType::Default;  // void C selects alpha-only scaling
+
+  static constexpr int FragmentSize = 1;
+  using ThreadOp = thread::LinearCombination<
+    ElementD, FragmentSize, ElementAccumulator, ElementCompute,
+    ScaleType, RoundStyle, ElementC>;
+
+  using CollectiveOp = cutlass::epilogue::collective::detail::Sm90TmaWarpSpecializedAdapter<
+    cutlass::epilogue::collective::DefaultEpilogue<
+      cutlass::detail::TagToStrideC_t<GmemLayoutTagC>,
+      cutlass::detail::TagToStrideC_t<GmemLayoutTagD>,
+      ThreadOp,
+      cutlass::gemm::EpilogueTransposed>  // the schedule tag is forwarded to DefaultEpilogue
+    >;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::epilogue::collective
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/builders/sm90_common.inl b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/builders/sm90_common.inl
new file mode 100755
index 000000000..cd2639c5d
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/builders/sm90_common.inl
@@ -0,0 +1,80 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+///////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::epilogue::collective::detail {
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Picks the smem store atom for the accumulator: STSM for 2-byte D, else an auto-vectorized copy
+template <class GmemStrideTypeD, class ElementD>
+constexpr auto
+sm90_get_smem_store_op_for_accumulator() {
+  using namespace cute;
+  constexpr bool is_16bit_output = (sizeof(ElementD) == 2);
+  if constexpr (is_16bit_output && size<0>(GmemStrideTypeD{}) == 1) {
+    return SM90_U16x8_STSM_T{};
+  }
+  else if constexpr (is_16bit_output && size<1>(GmemStrideTypeD{}) == 1) {
+    return SM90_U32x4_STSM_N{};
+  }
+  else {
+    // neither STSM case applies: auto-vectorizing store
+    return AutoVectorizingCopyWithAssumedAlignment{};
+  }
+}
+
+// Picks the smem load atom for the source C by mirroring the store-atom choice
+template <class GmemStrideTypeC, class ElementC>
+constexpr auto
+sm90_get_smem_load_op_for_source() {
+  using namespace cute;
+
+  // Ask the store selector which atom it would pick for the same stride/element pair
+  using MatchingStoreAtom = decltype(sm90_get_smem_store_op_for_accumulator<GmemStrideTypeC, ElementC>());
+  constexpr bool wants_ldsm_t = cute::is_same_v<MatchingStoreAtom, SM90_U16x8_STSM_T>;
+  constexpr bool wants_ldsm_n = cute::is_same_v<MatchingStoreAtom, SM90_U32x4_STSM_N>;
+  if constexpr (wants_ldsm_t) {
+    return SM75_U16x8_LDSM_T{};
+  }
+  else if constexpr (wants_ldsm_n) {
+    return SM75_U32x4_LDSM_N{};
+  }
+  else {
+    return AutoVectorizingCopyWithAssumedAlignment<128>{};  // auto-vectorizing load
+  }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::epilogue::collective::detail
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/collective_builder.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/collective_builder.hpp
new file mode 100755
index 000000000..d54cd0a8f
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/collective_builder.hpp
@@ -0,0 +1,120 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/arch/copy.hpp>         // cute::DefaultCopy
+#include <cute/util/type_traits.hpp>  // cute::is_base_of_v
+
+#include "cutlass/detail/dependent_false.hpp"
+#include "cutlass/epilogue/fusion/callbacks.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::epilogue::collective {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Used to specify epilogue subtile shape or dispatch to automatic computation of subtile shape
+struct EpilogueTileAuto {};  // empty tag type (no members)
+
+// Used to let the builder pick the epilogue schedule automatically.
+// Can be overridden with kernel schedule tags in cutlass/gemm/dispatch_policy.hpp
+struct EpilogueScheduleAuto {};  // empty tag type (no members)
+struct EpilogueIm2ColScheduleAuto {};  // NOTE(review): presumably the im2col (conv) counterpart of the tag above - confirm
+
+template <
+  class ArchTag,
+  class OpClass,
+  class TileShape_MNK,
+  class ClusterShape_MNK,
+  class EpilogueTileType,
+  class ElementAccumulator,
+  class ElementCompute,
+  class ElementC,
+  class GmemLayoutTagC,
+  int AlignmentC,
+  class ElementD,
+  class GmemLayoutTagD,
+  int AlignmentD,
+  class EpilogueScheduleType,
+  class FusionOpOrCallbacks = cutlass::epilogue::fusion::LinearCombination<ElementD,ElementCompute,ElementC,ElementCompute>,
+  class Enable = void  // NOTE(review): presumably a SFINAE slot for partial specializations - confirm
+>
+struct CollectiveBuilder {  // primary template: per the assert message, reached only when no builder matches the parameters
+  static_assert(cutlass::detail::dependent_false<ArchTag>,  // condition depends on ArchTag (assumed always false - confirm)
+      "Could not build a collective epilogue for given parameters.");
+};
+
+// helper sub-builder for epilogue fusion callbacks (for internal use by CollectiveBuilder only)
+namespace detail {
+
+// callbacks builder with operation tag: wraps FusionOp in fusion::FusionCallbacks for the given policy and tiles
+template<
+  class DispatchPolicy,
+  class FusionOp,
+  class TileShape_MNK,
+  class EpilogueTile_MN,
+  class ElementAccumulator,  // not referenced by this primary template
+  class = void  // unnamed slot; the passthrough specialization fills it with an enable_if_t condition
+>
+struct CallbacksBuilder {
+  using Callbacks = fusion::FusionCallbacks<DispatchPolicy, FusionOp, TileShape_MNK, EpilogueTile_MN>;
+};
+
+// callbacks builder with callbacks passthrough: the supplied type is exposed unchanged as Callbacks
+template <
+  class DispatchPolicy,
+  class FusionCallbacks,
+  class TileShape_MNK,
+  class EpilogueTile_MN,
+  class ElementAccumulator
+>
+struct CallbacksBuilder<
+  DispatchPolicy,
+  FusionCallbacks,
+  TileShape_MNK,
+  EpilogueTile_MN,
+  ElementAccumulator,
+  cute::enable_if_t<not cute::is_base_of_v<fusion::FusionOperation, FusionCallbacks>>  // only when NOT derived from FusionOperation
+> {
+  using Callbacks = FusionCallbacks;
+};
+
+} // namespace detail
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::epilogue::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "builders/sm90_builder.inl"
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/collective_epilogue.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/collective_epilogue.hpp
new file mode 100755
index 000000000..8fb1a9588
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/collective_epilogue.hpp
@@ -0,0 +1,71 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cutlass/detail/dependent_false.hpp>
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::epilogue::collective {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  class DispatchPolicy,
+  class... Args
+>
+class CollectiveEpilogue {  // primary template: per the assert message, reached only when no specialization matches
+  static_assert(cutlass::detail::dependent_false<DispatchPolicy>, "Could not find an epilogue specialization.");
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::epilogue::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "detail.hpp"
+
+//
+// Gemm
+//
+#include "default_epilogue.hpp"
+#include "default_epilogue_array.hpp"
+#include "epilogue_tensor_broadcast.hpp"
+#include "sm70_epilogue_vectorized.hpp"
+#include "sm70_epilogue_vectorized_array.hpp"
+#include "sm90_epilogue_tma_warpspecialized.hpp"
+#include "sm90_epilogue_tma_warpspecialized_bias_elementwise.hpp"
+#include "sm90_epilogue_array_tma_warpspecialized.hpp"
+//
+// Conv
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/default_epilogue.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/default_epilogue.hpp
new file mode 100755
index 000000000..cd4a6ccdd
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/default_epilogue.hpp
@@ -0,0 +1,242 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Functor performing elementwise operations used by epilogues.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/epilogue/collective/detail.hpp"
+
+#include "cute/tensor.hpp"
+#include "cute/numeric/numeric_types.hpp"
+#include "cutlass/cuda_host_adapter.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace collective {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Applies an element wise operation to all elements within the fragment
+/// and writes them out to destination storage (global memory D; no shared-memory staging).
+template <
+  class StrideC_,
+  class StrideD_,
+  class ThreadEpilogueOp_,
+  class EpilogueSchedule_
+>
+class DefaultEpilogue {
+public:
+  //
+  // Type Aliases
+  //
+  using EpilogueSchedule = EpilogueSchedule_;
+  using DispatchPolicy = EpilogueSchedule_;
+
+  // derived types of output thread level operator
+  using ThreadEpilogueOp = ThreadEpilogueOp_;
+  using ElementOutput = typename ThreadEpilogueOp::ElementOutput;
+  using ElementAccumulator = typename ThreadEpilogueOp::ElementAccumulator;
+  using ElementCompute = typename ThreadEpilogueOp::ElementCompute;
+  using ElementScalar = ElementCompute;
+  using ElementC = typename ThreadEpilogueOp::ElementC;
+  using StrideC = StrideC_;
+  using ElementD = typename ThreadEpilogueOp::ElementD;
+  using StrideD = StrideD_;
+
+  using GmemTiledCopyC = void;
+  using GmemTiledCopyD = void;
+
+  static const int kOutputAlignment = ThreadEpilogueOp::kCount;
+  using AlignmentType = typename cute::uint_bit<sizeof_bits<ElementOutput>::value * kOutputAlignment>::type;
+
+  static_assert(cute::rank(StrideC{}) == 3, "StrideCD must be rank-3: [M, N, L]");
+  static_assert(cute::rank(StrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]");
+
+  struct SharedStorage { };
+
+  using TensorStorage = SharedStorage;
+
+  // Host side epilogue arguments: thread-op params plus the C/D pointers and their strides
+  struct Arguments {
+    typename ThreadEpilogueOp::Params thread{};
+    ElementC const* ptr_C = nullptr;
+    StrideC dC{};
+    ElementD* ptr_D = nullptr;
+    StrideD dD{};
+  };
+
+  // Device side epilogue params (same type as the host Arguments; no conversion is performed)
+  using Params = Arguments;
+
+  //
+  // Methods
+  //
+
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(
+      [[maybe_unused]] ProblemShape const& _,
+      Arguments const& args,
+      [[maybe_unused]] void* workspace) {
+    return args;
+  }
+
+  template <class ProblemShape>
+  static size_t
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
+    return 0;
+  }
+
+  template <class ProblemShape>
+  static cutlass::Status
+  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    return cutlass::Status::kSuccess;
+  }
+
+  template<class ProblemShape>
+  static bool
+  can_implement(
+      [[maybe_unused]] ProblemShape const& problem_shape,
+      [[maybe_unused]] Arguments const& args) {
+    return true;
+  }
+
+  // Note: SharedStorage is unused for DefaultEpilogue
+  CUTLASS_HOST_DEVICE
+  DefaultEpilogue(Params const& params_, SharedStorage const& shared_storage = SharedStorage())
+      : params(params_), epilogue_op(params_.thread) { }
+
+  CUTLASS_DEVICE
+  bool
+  is_source_needed() {
+    return epilogue_op.is_source_needed();
+  }
+
+  template<
+    class ProblemShapeMNKL,
+    class BlockShapeMNK,
+    class BlockCoordMNKL,
+    class FrgEngine, class FrgLayout,
+    class TiledMma,
+    class ResidueMNK
+  >
+  CUTLASS_HOST_DEVICE void
+  operator()(
+      ProblemShapeMNKL problem_shape_mnkl,
+      BlockShapeMNK blk_shape_MNK,
+      BlockCoordMNKL blk_coord_mnkl,
+      cute::Tensor<FrgEngine, FrgLayout> const& accumulators,
+      TiledMma tiled_mma,
+      ResidueMNK residue_mnk,
+      int thread_idx,
+      [[maybe_unused]] char* smem_buf)
+  {
+    using namespace cute;
+    using X = Underscore;
+
+    static_assert(cute::rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4");
+    static_assert(is_static<BlockShapeMNK>::value, "ThreadBlock tile shape must be static");
+    static_assert(cute::rank(BlockShapeMNK{}) == 3, "BlockShapeMNK must be rank 3");
+    static_assert(cute::rank(BlockCoordMNKL{}) == 4, "BlockCoordMNKL must be rank 4");
+
+    // Separate out problem shape for convenience (mode 2, K, is not read by the epilogue)
+    auto M = get<0>(problem_shape_mnkl);
+    auto N = get<1>(problem_shape_mnkl);
+    auto L = get<3>(problem_shape_mnkl);
+
+    auto stride_c = detail::get_epilogue_stride<EpilogueSchedule>(params.dC);
+    auto stride_d = detail::get_epilogue_stride<EpilogueSchedule>(params.dD);
+
+    // Represent the full output tensor
+    Tensor mC_mnl = make_tensor(make_gmem_ptr(params.ptr_C), make_shape(M,N,L), stride_c);                 // (m,n,l)
+    Tensor mD_mnl = make_tensor(make_gmem_ptr(params.ptr_D), make_shape(M,N,L), stride_d);                 // (m,n,l)
+    Tensor gC_mnl = local_tile(mC_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{});    // (BLK_M,BLK_N,m,n,l)
+    Tensor gD_mnl = local_tile(mD_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{});    // (BLK_M,BLK_N,m,n,l)
+
+    // Slice to get the tile this CTA is responsible for
+    auto [m_coord, n_coord, k_coord, l_coord] = blk_coord_mnkl;
+    Tensor gC = gC_mnl(_,_,m_coord,n_coord,l_coord);                                                 // (BLK_M,BLK_N)
+    Tensor gD = gD_mnl(_,_,m_coord,n_coord,l_coord);                                                 // (BLK_M,BLK_N)
+
+    // Partition source and destination tiles to match the accumulator partitioning
+    auto thr_mma = tiled_mma.get_thread_slice(thread_idx);
+    Tensor tCgD = thr_mma.partition_C(gD);                                       // (VEC,THR_M,THR_N)
+    Tensor tCgC = thr_mma.partition_C(gC);                                       // (VEC,THR_M,THR_N)
+
+    static_assert(is_static<FrgLayout>::value, "Accumulator layout must be static");
+    CUTE_STATIC_ASSERT_V(size(tCgC) == size(tCgD),
+        "Source and destination must have the same number of elements.");
+    CUTE_STATIC_ASSERT_V(size(tCgD) == size(accumulators),
+        "Accumulator count must have the same destination element count.");
+
+    // Make an identity coordinate tensor for predicating our output MN tile
+    auto cD = make_identity_tensor(make_shape(unwrap(shape<0>(gD)), unwrap(shape<1>(gD))));
+    Tensor tCcD = thr_mma.partition_C(cD);
+
+    // source is needed: combine accumulator with C, storing only coordinates inside the (M,N) residue
+    if (epilogue_op.is_source_needed()) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < size(accumulators); ++i) {
+        if (elem_less(tCcD(i), make_coord(get<0>(residue_mnk), get<1>(residue_mnk)))) {
+          tCgD(i) = epilogue_op(accumulators(i), tCgC(i));
+        }
+      }
+    }
+    // source is not needed, avoid load
+    else {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < size(accumulators); ++i) {
+        if (elem_less(tCcD(i), make_coord(get<0>(residue_mnk), get<1>(residue_mnk)))) {
+          tCgD(i) = epilogue_op(accumulators(i));
+        }
+      }
+    }
+  }
+
+private:
+  Params params;
+  ThreadEpilogueOp epilogue_op;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace collective
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/default_epilogue_array.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/default_epilogue_array.hpp
new file mode 100755
index 000000000..0f6f32931
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/default_epilogue_array.hpp
@@ -0,0 +1,273 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Functor performing elementwise operations used by epilogues.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/epilogue/collective/detail.hpp"
+
+#include "cute/tensor.hpp"
+#include "cute/numeric/numeric_types.hpp"
+#include "cutlass/trace.h"
+
+#include "cutlass/cuda_host_adapter.hpp"
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace collective {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Applies an element wise operation to all elements within the fragment
+// and writes them out to destination storage, using one C/D pointer per batch/group index.
+template <
+  class StrideC_,
+  class StrideD_,
+  class ThreadEpilogueOp_,
+  class EpilogueSchedule_
+>
+class DefaultEpilogueArray {
+public:
+  //
+  // Type Aliases
+  //
+  using EpilogueSchedule = EpilogueSchedule_;
+  using DispatchPolicy = EpilogueSchedule_;
+
+  // derived types of output thread level operator
+  using ThreadEpilogueOp = ThreadEpilogueOp_;
+  using ElementOutput = typename ThreadEpilogueOp::ElementOutput;
+  using ElementAccumulator = typename ThreadEpilogueOp::ElementAccumulator;
+  using ElementCompute = typename ThreadEpilogueOp::ElementCompute;
+  using ElementScalar = ElementCompute;
+  using ElementC = typename ThreadEpilogueOp::ElementC;
+  using StrideC = StrideC_;
+  using InternalStrideC = cute::remove_pointer_t<StrideC>;
+  using ElementD = typename ThreadEpilogueOp::ElementD;
+  using StrideD = StrideD_;
+  using InternalStrideD = cute::remove_pointer_t<StrideD>;
+
+  using GmemTiledCopyC = void;
+  using GmemTiledCopyD = void;
+
+  static const int kOutputAlignment = ThreadEpilogueOp::kCount;
+  using AlignmentType = typename cute::uint_bit<sizeof_bits<ElementOutput>::value * kOutputAlignment>::type;
+
+  static_assert(cute::is_same_v<EpilogueSchedule, PtrArrayNoSmemWarpSpecialized> || cute::is_same_v<EpilogueSchedule, PtrArrayDefault>, "Incompatible epilogue schedule.");
+  static_assert(rank(InternalStrideC{}) == 3, "StrideCD must be rank-3: [M, N, L]");
+  static_assert(rank(InternalStrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]");
+
+  struct SharedStorage { };
+
+  using TensorMapStorage = SharedStorage;
+
+  // Host side epilogue arguments: thread-op params plus arrays of C/D pointers and their strides
+  struct Arguments {
+    typename ThreadEpilogueOp::Params thread{};
+    ElementC const** ptr_C = nullptr;
+    StrideC dC{};
+    ElementD** ptr_D = nullptr;
+    StrideD dD{};
+  };
+
+  // Device side epilogue params (same type as the host Arguments; no conversion is performed)
+  using Params = Arguments;
+
+  //
+  // Methods
+  //
+
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(
+      ProblemShape const&,
+      Arguments const& args,
+      [[maybe_unused]] void* workspace) {
+    return args;
+  }
+
+  template <class ProblemShape>
+  static size_t
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args, int sm_count) {
+    return 0;
+  }
+
+  template <class ProblemShape>
+  static cutlass::Status
+  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    return cutlass::Status::kSuccess;
+  }
+
+  template<class ProblemShape>
+  static bool
+  can_implement(
+      [[maybe_unused]] ProblemShape const& problem_shape,
+      [[maybe_unused]] Arguments const& args) {
+    return true;
+  }
+
+  CUTLASS_HOST_DEVICE
+  DefaultEpilogueArray(Params const& params_)
+      : params(params_) { }
+
+  CUTLASS_DEVICE
+  bool
+  is_source_needed() {
+    // For Ptr-Array or Grouped Gemm we cannot determine if source is needed based on first beta.
+    return true;
+  }
+
+  template<
+    class ProblemShapeMNKL,
+    class BlockShapeMNK,
+    class BlockCoordMNKL,
+    class FrgEngine, class FrgLayout,
+    class TiledMma,
+    class ResidueMNK
+  >
+  CUTLASS_HOST_DEVICE void
+  operator()(
+      ProblemShapeMNKL problem_shape_mnkl,
+      BlockShapeMNK blk_shape_MNK,
+      BlockCoordMNKL blk_coord_mnkl,
+      cute::Tensor<FrgEngine, FrgLayout> const& accumulators,
+      TiledMma tiled_mma,
+      ResidueMNK residue_mnk,
+      int thread_idx,
+      [[maybe_unused]] char* smem_buf)
+  {
+    using namespace cute;
+    using X = Underscore;
+
+    static_assert(rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4");
+    static_assert(is_static<BlockShapeMNK>::value, "ThreadBlock tile shape must be static");
+    static_assert(rank(BlockShapeMNK{}) == 3, "BlockShapeMNK must be rank 3");
+    static_assert(rank(BlockCoordMNKL{}) == 4, "BlockCoordMNKL must be rank 4");
+
+    // Separate out problem shape for convenience
+    auto M = get<0>(problem_shape_mnkl);
+    auto N = get<1>(problem_shape_mnkl);
+    // The L extent (mode 3 of the problem shape) is not read: each tensor below is built with mock_L = 1.
+    // Batches are managed by using appropriate pointers to C and D matrices
+    const int32_t mock_L = 1;
+    const int32_t mock_l_coord = 0;
+    // Slice to get the tile this CTA is responsible for
+    auto [m_coord, n_coord, k_coord, l_coord] = blk_coord_mnkl;
+
+    // If scalar alpha/beta are provided, i.e., same alpha/beta applies to all batches/groups.
+    // If pointers to alpha/beta are provided, i.e., alpha/beta can differ between batches/groups,
+    // we get the correct alpha/beta values for the current batch/group using group index.
+    ThreadEpilogueOp epilogue_op = ThreadEpilogueOp(params.thread, l_coord);
+
+    if (epilogue_op.is_source_needed() && params.dC == nullptr) {
+      // Beta value is non-zero while C is missing. NOTE(review): this tests the stride dC, not ptr_C - confirm intent
+      assert(0);
+    }
+
+    InternalStrideC stride_c;
+    InternalStrideD stride_d;
+    if constexpr (!cute::is_same_v<InternalStrideC, StrideC>) {
+      // If grouped gemm (StrideC is a pointer type): strides are looked up per group
+      if (epilogue_op.is_source_needed()) {
+        stride_c = detail::get_epilogue_stride<EpilogueSchedule>(params.dC[l_coord]);
+      }
+      stride_d = detail::get_epilogue_stride<EpilogueSchedule>(params.dD[l_coord]);
+    }
+    else {
+      stride_c = detail::get_epilogue_stride<EpilogueSchedule>(params.dC);
+      stride_d = detail::get_epilogue_stride<EpilogueSchedule>(params.dD);
+    }
+
+    // Represent the full output tensor
+    ElementC const* ptr_C_l = nullptr;
+    if (epilogue_op.is_source_needed()) {
+      ptr_C_l = params.ptr_C[l_coord];
+    }
+    Tensor mC_mnl = make_tensor(make_gmem_ptr(ptr_C_l), make_shape(M,N,mock_L), stride_c);      // (m,n,l)
+    Tensor mD_mnl = make_tensor(make_gmem_ptr(params.ptr_D[l_coord]), make_shape(M,N,mock_L), stride_d);      // (m,n,l)
+    Tensor gC_mnl = local_tile(mC_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{});    // (BLK_M,BLK_N,m,n,l)
+    Tensor gD_mnl = local_tile(mD_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{});    // (BLK_M,BLK_N,m,n,l)
+
+    Tensor gC = gC_mnl(_,_,m_coord,n_coord, mock_l_coord);                                                 // (BLK_M,BLK_N)
+    Tensor gD = gD_mnl(_,_,m_coord,n_coord, mock_l_coord);                                                 // (BLK_M,BLK_N)
+
+    // Partition source and destination tiles to match the accumulator partitioning
+    auto thr_mma = tiled_mma.get_thread_slice(thread_idx);
+    Tensor tCgD = thr_mma.partition_C(gD);                                       // (VEC,THR_M,THR_N)
+    Tensor tCgC = thr_mma.partition_C(gC);                                       // (VEC,THR_M,THR_N)
+
+    static_assert(is_static<FrgLayout>::value, "Accumulator layout must be static");
+    CUTE_STATIC_ASSERT_V(size(tCgC) == size(tCgD),
+        "Source and destination must have the same number of elements.");
+    CUTE_STATIC_ASSERT_V(size(tCgD) == size(accumulators),
+        "Accumulator count must have the same destination element count.");
+
+    // Make an identity coordinate tensor for predicating our output MN tile
+    auto cD = make_identity_tensor(make_shape(unwrap(shape<0>(gD)), unwrap(shape<1>(gD))));
+    Tensor tCcD = thr_mma.partition_C(cD);
+
+    // source is needed: combine accumulator with C, storing only coordinates inside the (M,N) residue
+    if (epilogue_op.is_source_needed()) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < size(accumulators); ++i) {
+        if (elem_less(tCcD(i), make_coord(get<0>(residue_mnk), get<1>(residue_mnk)))) {
+          tCgD(i) = epilogue_op(accumulators(i), tCgC(i));
+        }
+      }
+    }
+    // source is not needed, avoid load
+    else {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < size(accumulators); ++i) {
+        if (elem_less(tCcD(i), make_coord(get<0>(residue_mnk), get<1>(residue_mnk)))) {
+          tCgD(i) = epilogue_op(accumulators(i));
+        }
+      }
+    }
+  }
+
+private:
+  Params params;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace collective
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/detail.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/detail.hpp
new file mode 100755
index 000000000..6c0368e09
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/detail.hpp
@@ -0,0 +1,491 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/pipeline/pipeline.hpp"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/epilogue/dispatch_policy.hpp"
+
+#include "cute/tensor.hpp"
+#include "cute/numeric/numeric_types.hpp"
+#include "cute/util/type_traits.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace collective {
+
+namespace detail {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Whether mode 0 (M) of Stride is reported as major by cutlass::gemm::detail::is_major.
+template <class Stride>
+constexpr bool is_m_major() {
+  return cutlass::gemm::detail::is_major<0, Stride>();
+}
+
+// Whether mode 1 (N) of Stride is reported as major by cutlass::gemm::detail::is_major.
+template <class Stride>
+constexpr bool is_n_major() {
+  return cutlass::gemm::detail::is_major<1, Stride>();
+}
+
+// Whether Stride is exactly the TagToStrideC_t stride of the TensorNWC, TensorNHWC or TensorNDHWC tag.
+template <class Stride>
+constexpr bool is_im2col() {
+  return cute::is_same_v<Stride, cutlass::detail::TagToStrideC_t<cutlass::layout::TensorNWC>> ||
+         cute::is_same_v<Stride, cutlass::detail::TagToStrideC_t<cutlass::layout::TensorNHWC>> ||
+         cute::is_same_v<Stride, cutlass::detail::TagToStrideC_t<cutlass::layout::TensorNDHWC>>;
+}
+
+template<class Schedule>  // Trait: is Schedule one of the three PtrArrayTmaWarpSpecialized* schedule tags below?
+struct sm90_is_ptr_array_tma : cute::false_type {};  // primary template: every other schedule is false
+
+template<>
+struct sm90_is_ptr_array_tma<PtrArrayTmaWarpSpecializedCooperative> : cute::true_type {};
+
+template<>
+struct sm90_is_ptr_array_tma<PtrArrayTmaWarpSpecializedPingpong> : cute::true_type {};
+
+template<>
+struct sm90_is_ptr_array_tma<PtrArrayTmaWarpSpecialized> : cute::true_type {};
+
+template<class Schedule>  // Convenience bool for sm90_is_ptr_array_tma<Schedule>::value
+static constexpr bool sm90_is_ptr_array_tma_v = sm90_is_ptr_array_tma<Schedule>::value;
+
+template<class Schedule>  // Trait: true only for the PtrArrayTmaWarpSpecializedCooperative schedule tag
+struct sm90_is_ptr_array_tma_cooperative : cute::false_type {};
+
+template<>
+struct sm90_is_ptr_array_tma_cooperative<PtrArrayTmaWarpSpecializedCooperative> : cute::true_type {};
+
+template<class Schedule>  // Convenience bool for sm90_is_ptr_array_tma_cooperative<Schedule>::value
+static constexpr bool sm90_is_ptr_array_tma_cooperative_v = sm90_is_ptr_array_tma_cooperative<Schedule>::value;
+
+template<class Schedule>  // Trait: true only for the PtrArrayTmaWarpSpecializedPingpong schedule tag
+struct sm90_is_ptr_array_tma_pingpong : cute::false_type {};
+
+template<>
+struct sm90_is_ptr_array_tma_pingpong<PtrArrayTmaWarpSpecializedPingpong> : cute::true_type {};
+
+template<class Schedule>  // Convenience bool for sm90_is_ptr_array_tma_pingpong<Schedule>::value
+static constexpr bool sm90_is_ptr_array_tma_pingpong_v = sm90_is_ptr_array_tma_pingpong<Schedule>::value;
+
+template<class DispatchPolicy>  // Trait: is DispatchPolicy an instance of Sm90PtrArrayTmaWarpSpecialized<...>?
+struct sm90_is_ptr_array_tma_dispatch_policy : cute::false_type {};  // primary template: false
+
+template<
+  int StagesC,
+  int StagesD,
+  int FragmentSize,
+  bool ReuseSmemC,
+  bool DelayTmaStore,
+  int NumEpilogueWarpGroups
+>
+struct sm90_is_ptr_array_tma_dispatch_policy<
+    Sm90PtrArrayTmaWarpSpecialized<StagesC, 
+                                   StagesD, 
+                                   FragmentSize,
+                                   ReuseSmemC, 
+                                   DelayTmaStore, 
+                                   NumEpilogueWarpGroups>> 
+    : cute::true_type {};  // matches any parameterization of Sm90PtrArrayTmaWarpSpecialized
+
+template<class DispatchPolicy>  // Convenience bool for sm90_is_ptr_array_tma_dispatch_policy<DispatchPolicy>::value
+static constexpr bool sm90_is_ptr_array_tma_dispatch_policy_v = sm90_is_ptr_array_tma_dispatch_policy<DispatchPolicy>::value;
+
+using cutlass::atomic_maximum;
+
+template <class T>  // Bit width of uint32_t divided (integer division) by the bit width of T
+static constexpr int elements_per_access_v = cutlass::sizeof_bits<uint32_t>::value / cutlass::sizeof_bits<T>::value;
+
+template <class EpilogueSchedule>  // True if the schedule is/derives from TmaWarpSpecializedCooperative, or is the ptr-array cooperative tag
+static constexpr bool sm90_is_cooperative_v =
+  cute::is_base_of_v<cutlass::epilogue::TmaWarpSpecializedCooperative, EpilogueSchedule> ||
+  sm90_is_ptr_array_tma_cooperative_v<EpilogueSchedule>;
+
+template <class EpilogueSchedule>  // True for non-cooperative ptr-array TMA schedules, or if the schedule is/derives from TmaWarpSpecialized
+static constexpr bool sm90_is_warp_specialized_v =
+  (!sm90_is_ptr_array_tma_cooperative_v<EpilogueSchedule> && sm90_is_ptr_array_tma_v<EpilogueSchedule>) ||
+  cute::is_base_of_v<cutlass::epilogue::TmaWarpSpecialized, EpilogueSchedule>;
+
+template <class GmemLayoutTag>  // True when the layout tag is exactly TensorNWC, TensorNHWC or TensorNDHWC
+static constexpr bool is_im2col_mode =
+  cute::is_same_v<GmemLayoutTag, cutlass::layout::TensorNWC> ||
+  cute::is_same_v<GmemLayoutTag, cutlass::layout::TensorNHWC> ||
+  cute::is_same_v<GmemLayoutTag, cutlass::layout::TensorNDHWC>;
+
+template <class T>  // Storage placeholder with no data members
+struct EmptyStorage {
+  CUTLASS_HOST_DEVICE
+  T* data() { return nullptr; }  // always a null T*; there is nothing to point at
+};
+
+// Returns `stride` as-is, or with modes 0 and 1 swapped when EpilogueSchedule is/derives from EpilogueTransposed.
+template<class EpilogueSchedule, class Stride>
+CUTLASS_HOST_DEVICE auto get_epilogue_stride(Stride stride) {
+  if constexpr (!cute::is_base_of_v<cutlass::gemm::EpilogueTransposed, EpilogueSchedule>) {
+    return stride;
+  }
+  else {
+    return cute::make_stride(cute::get<1>(stride), cute::get<0>(stride), cute::get<2>(stride));
+  }
+}
+
+template <typename ThreadEpilogueOp, typename = void>  // Detects a nested ThreadEpilogueOp::ElementBias type
+struct IsThreadEpilogueOpWithBias { 
+  static constexpr bool value = false; 
+  using type = typename ThreadEpilogueOp::ElementCompute;  // no ElementBias: type falls back to ElementCompute
+};
+
+template <typename ThreadEpilogueOp>
+struct IsThreadEpilogueOpWithBias <ThreadEpilogueOp, cute::void_t<typename ThreadEpilogueOp::ElementBias>> { 
+  static constexpr bool value = true; 
+  using type = typename ThreadEpilogueOp::ElementBias;  // selected via void_t when ElementBias names a type
+};
+
+template <typename ThreadEpilogueOp, typename = void>  // Detects ThreadEpilogueOp::IsPerChannelScalingSupported
+struct IsThreadEpilogueOpWithPerChannelScaling {
+  static constexpr bool value = false;  // flag missing, or present but false
+};
+
+template <typename ThreadEpilogueOp>
+struct IsThreadEpilogueOpWithPerChannelScaling <ThreadEpilogueOp, cute::enable_if_t<ThreadEpilogueOp::IsPerChannelScalingSupported>> {
+  static constexpr bool value = true;  // chosen only when enable_if_t accepts the flag, i.e. it is true
+};
+
+template <typename ThreadEpilogueOp, typename = void>  // Detects ThreadEpilogueOp::IsEltActSupported
+struct IsThreadEpilogueOpWithActivation {
+  static constexpr bool value = false;  // flag missing, or present but false
+  using type = void;  // no activation functor type in this case
+};
+
+template <typename ThreadEpilogueOp>
+struct IsThreadEpilogueOpWithActivation <ThreadEpilogueOp, cute::enable_if_t<ThreadEpilogueOp::IsEltActSupported>> {
+  static constexpr bool value = true;
+  using type = typename ThreadEpilogueOp::ActivationFn;  // exposes the op's ActivationFn when the flag is true
+};
+
+template <typename ThreadEpilogueOp, typename = void>  // Detects a nested ThreadEpilogueOp::ElementwiseOp::Arguments type
+struct IsThreadEpilogueOpWithElementwiseArguments : cute::false_type {};
+
+template <typename ThreadEpilogueOp>
+struct IsThreadEpilogueOpWithElementwiseArguments<
+        ThreadEpilogueOp,
+        cute::void_t<typename ThreadEpilogueOp::ElementwiseOp::Arguments>> : cute::true_type {};
+
+// Adapts an operator()-style EpilogueOp to the load/store interface of sm90 TMA warp-specialized kernels; load hooks are no-ops.
+template <class EpilogueOp>
+class Sm90TmaWarpSpecializedAdapter : public EpilogueOp {
+public:
+  using GmemTiledCopyC = void;
+  using GmemTiledCopyD = void;
+
+  using LoadPipeline = cutlass::PipelineTransactionAsync<0>;
+  using LoadPipelineState = cutlass::PipelineState<0>;
+  constexpr static uint32_t TmaTransactionBytes = 0;
+  constexpr static bool RequiresTransactionBytes = false;
+
+  using StorePipeline = cutlass::PipelineTmaStore<0>;
+  using StorePipelineState = cutlass::PipelineState<0>;
+
+  using TensorStorage = typename EpilogueOp::SharedStorage;
+  using TensorMapStorage = typename EpilogueOp::SharedStorage;
+  using PipelineStorage = typename LoadPipeline::SharedStorage;
+
+  template<class CtaTileMNK>
+  CUTLASS_HOST_DEVICE
+  static constexpr int
+  get_load_pipe_increment(CtaTileMNK) {
+    return 1;  // constant, independent of the tile shape
+  }
+
+  template<class CtaTileMNK>
+  CUTLASS_HOST_DEVICE
+  static constexpr int
+  get_store_pipe_increment(CtaTileMNK) {
+    return 1;  // constant, independent of the tile shape
+  }
+
+  CUTLASS_DEVICE
+  static void prefetch_tma_descriptors([[maybe_unused]] typename EpilogueOp::Params const&) {
+  }
+
+  // Inherit all of EpilogueOp's constructors
+  using EpilogueOp::EpilogueOp;
+
+  CUTLASS_HOST_DEVICE
+  Sm90TmaWarpSpecializedAdapter(
+      typename EpilogueOp::Params const& params,
+      [[maybe_unused]] TensorStorage& shared_tensors)
+    : EpilogueOp(params) { }
+
+  CUTLASS_DEVICE
+  bool
+  is_producer_load_needed() const {
+    return false;  // load() and load_tail() below only hand back the pipeline state
+  }
+
+  CUTLASS_DEVICE auto
+  load_init(
+    [[maybe_unused]] typename EpilogueOp::Params const& params,
+    [[maybe_unused]] TensorMapStorage& shared_tensormaps,
+    [[maybe_unused]] int32_t sm_count,
+    [[maybe_unused]] int32_t sm_idx) {
+    return cute::make_tuple(nullptr);
+  }
+
+  template<
+    class ProblemShapeMNKL,
+    class CtaTileMNK,
+    class CtaCoordMNKL,
+    class TiledMma
+  >
+  CUTLASS_DEVICE auto
+  load(
+      [[maybe_unused]] LoadPipeline load_pipeline,
+      LoadPipelineState load_pipe_producer_state,
+      [[maybe_unused]] ProblemShapeMNKL problem_shape_mnkl,
+      [[maybe_unused]] CtaTileMNK cta_tile_mnk,
+      [[maybe_unused]] CtaCoordMNKL cta_coord_mnkl,
+      [[maybe_unused]] TiledMma tiled_mma,
+      [[maybe_unused]] int thread_idx,
+      [[maybe_unused]] TensorStorage& shared_tensors,
+      [[maybe_unused]] int subtile_idx=-1)
+  {
+    return load_pipe_producer_state;  // state is returned unmodified
+  }
+
+  template<
+    class ProblemShapeMNKL,
+    class TileShapeMNK,
+    class TileCoordMNKL,
+    class TiledMma,
+    class TensorMapC
+  >
+  CUTLASS_DEVICE auto
+  load(
+      [[maybe_unused]] LoadPipeline load_pipeline,
+      LoadPipelineState load_pipe_producer_state,
+      [[maybe_unused]] ProblemShapeMNKL problem_shape_mnkl,
+      [[maybe_unused]] TileShapeMNK tile_shape_MNK,
+      [[maybe_unused]] TileCoordMNKL tile_coord_mnkl,
+      [[maybe_unused]] TiledMma tiled_mma,
+      [[maybe_unused]] int thread_idx,
+      [[maybe_unused]] TensorStorage& shared_tensors,
+      [[maybe_unused]] TensorMapC const& load_tensormap,
+      [[maybe_unused]] int subtile_idx=-1,
+      [[maybe_unused]] bool wait = false)
+  {
+    return load_pipe_producer_state;  // state is returned unmodified
+  }
+
+  CUTLASS_DEVICE auto
+  load_tail(
+      [[maybe_unused]] LoadPipeline load_pipeline,
+      LoadPipelineState load_pipe_producer_state)
+  {
+    return load_pipe_producer_state;
+  }
+
+  CUTLASS_DEVICE auto
+  store_init(
+    [[maybe_unused]] typename EpilogueOp::Params const& params,
+    [[maybe_unused]] TensorMapStorage& shared_tensormaps,
+    [[maybe_unused]] int32_t sm_count,
+    [[maybe_unused]] int32_t sm_idx,
+    [[maybe_unused]] int32_t warp_group_idx) {
+    return cute::make_tuple(nullptr);
+  }
+
+  template<
+    class ProblemShapeMNKL,
+    class CtaTileMNK,
+    class CtaCoordMNKL,
+    class AccEngine, class AccLayout,
+    class TiledMma
+  >
+  CUTLASS_DEVICE auto
+  store(
+      [[maybe_unused]] LoadPipeline load_pipeline,
+      LoadPipelineState load_pipe_consumer_state,
+      [[maybe_unused]] StorePipeline store_pipeline,
+      StorePipelineState store_pipe_producer_state,
+      ProblemShapeMNKL problem_shape_mnkl,
+      CtaTileMNK cta_tile_mnk,
+      CtaCoordMNKL cta_coord_mnkl,
+      cute::Tensor<AccEngine,AccLayout> accumulators,
+      TiledMma tiled_mma,
+      int thread_idx,
+      TensorStorage& shared_tensors,
+      [[maybe_unused]] int subtile_index = -1)
+  {
+    constexpr int BLK_M_RANK = cute::rank<0>(cta_tile_mnk);
+    auto m_max_coord = unwrap(cute::transform(make_seq<BLK_M_RANK>{}, [&](auto i) {
+        return get<0,i>(problem_shape_mnkl) - get<0,i>(cta_tile_mnk) * get<0,i>(cta_coord_mnkl);  // problem extent minus tile origin
+      }));
+
+    constexpr int BLK_N_RANK = cute::rank<1>(cta_tile_mnk);
+    auto n_max_coord = unwrap(cute::transform(make_seq<BLK_N_RANK>{}, [&](auto i) {
+        return get<1,i>(problem_shape_mnkl) - get<1,i>(cta_tile_mnk) * get<1,i>(cta_coord_mnkl);  // problem extent minus tile origin
+      }));
+
+    auto residue_mnk = make_tuple(m_max_coord, n_max_coord, Int<0>{});  // K entry is a static 0
+
+    (*this)(
+        problem_shape_mnkl,
+        cta_tile_mnk,
+        cta_coord_mnkl,
+        accumulators,
+        tiled_mma,
+        residue_mnk,
+        thread_idx,
+        reinterpret_cast<char*>(&shared_tensors));  // the wrapped operator() receives shared storage as a raw char*
+
+    return cute::make_tuple(load_pipe_consumer_state, store_pipe_producer_state);
+  }
+
+  template<
+    class ProblemShapeMNKL,
+    class TileShapeMNK,
+    class TileCoordMNKL,
+    class AccEngine, class AccLayout,
+    class TiledMma,
+    class TensorMapD
+  >
+  CUTLASS_DEVICE auto
+  store(
+      [[maybe_unused]] LoadPipeline load_pipeline,
+      LoadPipelineState load_pipe_consumer_state,
+      [[maybe_unused]] StorePipeline store_pipeline,
+      StorePipelineState store_pipe_producer_state,
+      ProblemShapeMNKL problem_shape_mnkl,
+      TileShapeMNK tile_shape_MNK,
+      TileCoordMNKL tile_coord_mnkl,
+      cute::Tensor<AccEngine,AccLayout> accumulators,
+      TiledMma tiled_mma,
+      int thread_idx,
+      TensorStorage& shared_tensors,
+      [[maybe_unused]] TensorMapD const& store_tensormap,
+      [[maybe_unused]] int subtile_index = -1)
+  {
+    constexpr int BLK_M_RANK = cute::rank<0>(tile_shape_MNK);
+    auto m_max_coord = unwrap(cute::transform(make_seq<BLK_M_RANK>{}, [&](auto i) {
+        return get<0,i>(problem_shape_mnkl) - get<0,i>(tile_shape_MNK) * get<0,i>(tile_coord_mnkl);  // problem extent minus tile origin
+      }));
+
+    constexpr int BLK_N_RANK = cute::rank<1>(tile_shape_MNK);
+    auto n_max_coord = unwrap(cute::transform(make_seq<BLK_N_RANK>{}, [&](auto i) {
+        return get<1,i>(problem_shape_mnkl) - get<1,i>(tile_shape_MNK) * get<1,i>(tile_coord_mnkl);  // problem extent minus tile origin
+      }));
+
+    auto residue_mnk = make_tuple(m_max_coord, n_max_coord, Int<0>{});  // K entry is a static 0
+
+    (*this)(
+        problem_shape_mnkl,
+        tile_shape_MNK,
+        tile_coord_mnkl,
+        accumulators,
+        tiled_mma,
+        residue_mnk,
+        thread_idx,
+        reinterpret_cast<char*>(&shared_tensors));  // the wrapped operator() receives shared storage as a raw char*
+
+    return cute::make_tuple(load_pipe_consumer_state, store_pipe_producer_state);
+  }
+
+  CUTLASS_DEVICE auto
+  store_tail(
+      [[maybe_unused]] LoadPipeline load_pipeline,
+      LoadPipelineState load_pipe_consumer_state,
+      [[maybe_unused]] StorePipeline store_pipeline,
+      StorePipelineState store_pipe_producer_state) {
+    return cute::make_tuple(load_pipe_consumer_state, store_pipe_producer_state);
+  }
+
+  // Dummy (empty-bodied) methods standing in for the different parts of TMA/Tensormap modifications
+
+  template <bool IsLoad,
+            class ProblemShapeMNKL>
+  CUTLASS_DEVICE
+  void
+  tensormaps_perform_update(
+      [[maybe_unused]] TensorMapStorage& shared_tensormaps,
+      [[maybe_unused]] typename EpilogueOp::Params const& params,
+      [[maybe_unused]] cute::TmaDescriptor const* tensormap,
+      [[maybe_unused]] ProblemShapeMNKL problem_shape,
+      [[maybe_unused]] int32_t next_batch,
+      [[maybe_unused]] int32_t warp_group_idx) { }
+
+  template <bool IsLoad>
+  CUTLASS_DEVICE
+  void
+  tensormaps_cp_fence_release(
+      [[maybe_unused]] TensorMapStorage& shared_tensormaps,
+      [[maybe_unused]] cute::TmaDescriptor const* tensormap,
+      [[maybe_unused]] int32_t warp_group_idx) { }
+
+  template <bool IsLoad>
+  CUTLASS_DEVICE
+  void
+  tensormaps_fence_acquire([[maybe_unused]] cute::TmaDescriptor const* tensormap) { }
+};
+
+// SFINAE helpers for detecting beta/beta_ptr in EVT arguments (a `thread.beta` / `thread.beta_ptr` member of `Arguments{}`).
+template <class Arguments, class = void>
+struct has_beta {
+  static constexpr bool value = false;  // primary template: `Arguments{}.thread.beta` is not a valid expression
+};
+
+template <class Arguments>
+struct has_beta<Arguments, cute::void_t<decltype(Arguments{}.thread.beta)>> {
+  static constexpr bool value = true;  // selected when `Arguments{}.thread.beta` is well-formed
+};
+
+template <class Arguments, class = void>  // Detects whether `Arguments{}.thread.beta_ptr` is a valid expression
+struct has_beta_ptr {
+  static constexpr bool value = false;  // primary template: expression is ill-formed
+};
+
+template <class Arguments>
+struct has_beta_ptr<Arguments, cute::void_t<decltype(Arguments{}.thread.beta_ptr)>> {
+  static constexpr bool value = true;  // selected when the expression is well-formed
+};
+
+} // namespace detail
+} // namespace collective
+} // namespace epilogue
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/epilogue_tensor_broadcast.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/epilogue_tensor_broadcast.hpp
new file mode 100755
index 000000000..48833ecf1
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/epilogue_tensor_broadcast.hpp
@@ -0,0 +1,271 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+  \brief Functor for performing tensor-tensor broadcasts atop existing epilogues.
+
+  Concretely, the operation performed is the following:
+    UnaryOp(
+        BinaryOp1(
+            BinaryOp0(
+                Activation((alpha * A @ B) + bias),
+                beta * C0
+            ),
+            beta * C1
+        )
+    )
+
+    where:
+        - C0 and C1 have the same extents as the output
+        - BinaryOp0 and BinaryOp1 perform elementwise binary operations
+        - UnaryOp is an elementwise operation
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/epilogue/collective/detail.hpp"
+
+#include "cute/tensor.hpp"
+#include "cutlass/cuda_host_adapter.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace collective {
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Collective epilogue that applies elementwise tensor-tensor operations atop other epilogues
+/// (per element, ThreadEpilogueOp combines the accumulator with source tensors C0, C1 and a bias).
+template <
+  class StrideC_,
+  class StrideD_,
+  class ThreadEpilogueOp_,
+  class EpilogueSchedule_,
+  bool PerColumnBias_ = false
+>
+class EpilogueTensorBroadcast {
+public:
+  //
+  // Type Aliases
+  //
+  using EpilogueSchedule = EpilogueSchedule_;
+
+  // derived types of output thread level operator
+  using ThreadEpilogueOp = ThreadEpilogueOp_;
+  using ElementOutput = typename ThreadEpilogueOp::ElementOutput;
+  using ElementAccumulator = typename ThreadEpilogueOp::ElementAccumulator;
+  using ElementCompute = typename ThreadEpilogueOp::ElementCompute;
+  using ElementScalar = ElementCompute;
+  using ElementBias = typename ThreadEpilogueOp::ElementBias;
+  using ElementC = typename ThreadEpilogueOp::ElementC;
+  using StrideC = StrideC_;
+  using ElementD = typename ThreadEpilogueOp::ElementD;
+  using StrideD = StrideD_;
+  using ActivationFunctor = typename ThreadEpilogueOp::ActivationFunctor;
+
+  static_assert(cute::rank(StrideC{}) == 3, "StrideCD must be rank-3: [M, N, L]");
+  static_assert(cute::rank(StrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]");
+
+  static constexpr int kOutputAlignment = ThreadEpilogueOp::kCount;
+  using AlignmentType = typename cute::uint_bit<sizeof_bits<ElementOutput>::value * kOutputAlignment>::type;
+
+  static constexpr bool IsBinaryOp0Enabled = ThreadEpilogueOp::IsBinaryOp0Enabled;
+  static constexpr bool IsBinaryOp1Enabled = ThreadEpilogueOp::IsBinaryOp1Enabled;
+  static constexpr bool IsUnaryOpEnabled = ThreadEpilogueOp::IsUnaryOpEnabled;
+
+  static constexpr bool PerColumnBias = PerColumnBias_;
+  using BiasStride = typename cute::conditional_t<PerColumnBias, Stride<_0, _1, _0>, Stride<_1, _0, _0>>;  // unit stride on N if per-column, else on M
+
+  struct SharedStorage { };  // empty: this epilogue declares no shared-memory members
+
+  // Host side epilogue arguments
+  struct Arguments {
+    typename ThreadEpilogueOp::Params thread{};
+    StrideC dC{};
+    ElementD* ptr_D = nullptr;
+    StrideD dD{};
+    ElementBias* ptr_Bias = nullptr;
+    ElementC* ptr_C0 = nullptr;
+    ElementC* ptr_C1 = nullptr;
+  };
+
+  // Device side epilogue params
+  using Params = Arguments;
+
+  //
+  // Methods
+  //
+
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(
+      [[maybe_unused]] ProblemShape const& _,
+      Arguments const& args,
+      [[maybe_unused]] void* workspace) {
+    return args;  // Params is an alias of Arguments, so no conversion is needed
+  }
+
+  template <class ProblemShape>
+  static size_t
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
+    return 0;  // no workspace is requested
+  }
+
+  template <class ProblemShape>
+  static cutlass::Status
+  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    return cutlass::Status::kSuccess;  // nothing to initialize
+  }
+
+  template <class ProblemShape>
+  static bool
+  can_implement(
+      [[maybe_unused]] ProblemShape const& problem_shape,
+      [[maybe_unused]] Arguments const& args) {
+    return true;  // no restrictions are checked here
+  }
+
+  CUTLASS_HOST_DEVICE
+  EpilogueTensorBroadcast(Params const& params_)
+      : params(params_), epilogue_op(params_.thread) { }
+
+  CUTLASS_DEVICE
+  bool
+  is_source_needed() {
+    return epilogue_op.is_source0_needed() || epilogue_op.is_source1_needed();
+  }
+
+  template<
+    class ProblemShapeMNKL,
+    class BlockShapeMNK,
+    class BlockCoordMNKL,
+    class FrgEngine, class FrgLayout,
+    class TiledMma,
+    class ResidueMNK
+  >
+  CUTLASS_HOST_DEVICE void
+  operator()(
+      ProblemShapeMNKL problem_shape_mnkl,
+      BlockShapeMNK blk_shape_MNK,
+      BlockCoordMNKL blk_coord_mnkl,
+      cute::Tensor<FrgEngine, FrgLayout> const& accumulators,
+      TiledMma tiled_mma,
+      ResidueMNK residue_mnk,
+      int thread_idx,
+      [[maybe_unused]] char* smem_buf)
+  {
+    using namespace cute;
+    using X = Underscore;
+
+    static_assert(cute::rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4");
+    static_assert(is_static<BlockShapeMNK>::value, "ThreadBlock tile shape must be static");
+    static_assert(cute::rank(BlockShapeMNK{}) == 3, "BlockShapeMNK must be rank 3");
+    static_assert(cute::rank(BlockCoordMNKL{}) == 4, "BlockCoordMNKL must be rank 4");
+
+    // Separate out problem shape for convenience
+    auto M = get<0>(problem_shape_mnkl);
+    auto N = get<1>(problem_shape_mnkl);
+    auto L = get<3>(problem_shape_mnkl);
+
+    auto stride_c    = detail::get_epilogue_stride<EpilogueSchedule>(params.dC);
+    auto stride_d    = detail::get_epilogue_stride<EpilogueSchedule>(params.dD);
+    auto stride_bias = detail::get_epilogue_stride<EpilogueSchedule>(BiasStride{});
+
+    // Represent the full output tensor
+    Tensor mC0_mnl = make_tensor(make_gmem_ptr(params.ptr_C0), make_shape(M,N,L), stride_c);                   // (m,n,l)
+    Tensor mC1_mnl = make_tensor(make_gmem_ptr(params.ptr_C1), make_shape(M,N,L), stride_c);                   // (m,n,l)
+    Tensor mD_mnl = make_tensor(make_gmem_ptr(params.ptr_D), make_shape(M,N,L), stride_d);                     // (m,n,l)
+    Tensor mBias_mnl = make_tensor(make_gmem_ptr(params.ptr_Bias), make_shape(M,N,L), stride_bias);            // (m,n,l)
+
+    Tensor gC0_mnl = local_tile(mC0_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{});      // (BLK_M,BLK_N,m,n,l)
+    Tensor gC1_mnl = local_tile(mC1_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{});      // (BLK_M,BLK_N,m,n,l)
+
+    Tensor gD_mnl = local_tile(mD_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{});        // (BLK_M,BLK_N,m,n,l)
+    Tensor gBias_mnl = local_tile(mBias_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{});  // (BLK_M,BLK_N,m,n,l)
+
+    // Slice to get the tile this thread block is responsible for
+    auto [m_coord, n_coord, k_coord, l_coord] = blk_coord_mnkl;
+    Tensor gC0 = gC0_mnl(_,_,m_coord,n_coord,l_coord);                                                   // (BLK_M,BLK_N)
+    Tensor gC1 = gC1_mnl(_,_,m_coord,n_coord,l_coord);                                                   // (BLK_M,BLK_N)
+    Tensor gD = gD_mnl(_,_,m_coord,n_coord,l_coord);                                                     // (BLK_M,BLK_N)
+    Tensor gBias = gBias_mnl(_,_,m_coord,n_coord,l_coord);                                               // (BLK_M,BLK_N)
+
+    // Partition source and destination tiles to match the accumulator partitioning
+    auto thr_mma = tiled_mma.get_thread_slice(thread_idx);
+    Tensor tCgD = thr_mma.partition_C(gD);                                                           // (VEC,THR_M,THR_N)
+    Tensor tCgC0 = thr_mma.partition_C(gC0);                                                         // (VEC,THR_M,THR_N)
+    Tensor tCgC1 = thr_mma.partition_C(gC1);                                                         // (VEC,THR_M,THR_N)
+    Tensor tCgBias = thr_mma.partition_C(gBias);                                                     // (VEC,THR_M,THR_N)
+
+    static_assert(is_static<FrgLayout>::value,
+        "Accumulator layout must be static");
+    CUTE_STATIC_ASSERT_V(size(tCgC0) == size(tCgD),
+        "Source and destination must have the same number of elements.");
+    CUTE_STATIC_ASSERT_V(size(tCgC1) == size(tCgD),
+        "Source and destination must have the same number of elements.");
+    CUTE_STATIC_ASSERT_V(size(tCgD) == size(accumulators),
+        "Accumulator count must have the same destination element count.");
+    CUTE_STATIC_ASSERT_V(size(tCgBias) == size(accumulators),
+        "Bias and accumulators must have the same number of elements.");
+
+    auto cD = make_identity_tensor(make_shape(unwrap(shape<0>(gD)), unwrap(shape<1>(gD))));
+    Tensor tCcD = thr_mma.partition_C(cD);
+
+    bool bias_needed = params.ptr_Bias != nullptr;  // a null operand pointer is never read; zero is used instead
+    bool c0_needed = (params.ptr_C0 != nullptr) && epilogue_op.is_source0_needed();
+    bool c1_needed = (params.ptr_C1 != nullptr) && epilogue_op.is_source1_needed();
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < size(accumulators); ++i) {
+      if (elem_less(tCcD(i), make_coord(get<0>(residue_mnk), get<1>(residue_mnk)))) {  // predicate on the (M, N) residue
+        ElementBias bias = bias_needed ? tCgBias(i) : ElementBias(0);
+        ElementC c0 = c0_needed ? tCgC0(i) : ElementC(0);
+        ElementC c1 = c1_needed ? tCgC1(i) : ElementC(0);
+
+        tCgD(i) = epilogue_op(accumulators(i), c0, c1, bias);
+      }
+    }
+  }
+
+private:
+  Params params;
+  ThreadEpilogueOp epilogue_op;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace collective
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp
new file mode 100755
index 000000000..a8083dab1
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp
@@ -0,0 +1,549 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Collective SIMT vectorized epilogue: stages accumulators through shared memory, applies the thread-level epilogue op, and stores D to global memory.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cute/tensor.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace collective {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  class StrideC,
+  class StrideD,
+  class ThreadEpilogueOp,
+  class SmemLayout,
+  class CopyAtomR2S,
+  class TiledCopyS2R,
+  class CopyAtomR2G,
+  class EpilogueScheduleType = EpilogueSimtVectorized,
+  class Enable = void  // slot filled with cute::enable_if_t<...> by the partial specializations
+>
+class Epilogue {  // Primary template: declares no members; the partial specializations provide the implementations.
+  static_assert(cute::is_same_v<EpilogueScheduleType, EpilogueSimtVectorized> ||
+                cute::is_same_v<EpilogueScheduleType, EpiloguePtrArraySimtVectorized>, 
+                "Could not find an epilogue specialization.");  // fires for any other schedule tag
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Epilogue Vectorized
+/// Applies an element wise operation to all elements within the fragment
+/// and writes it out to destination storage.
+///
+/// Ways to generalize this:
+/// - CTA tile shape
+/// - vectorization requirements (GMEM)
+/// - vectoriz(able) transform()
+///
+template <
+  class StrideC_,
+  class StrideD_,
+  class ThreadEpilogueOp_,
+  class SmemLayout_,
+  class CopyAtomR2S_,
+  class TiledCopyS2R_,
+  class CopyAtomR2G_,
+  class EpilogueScheduleType_
+>
+class Epilogue<
+        StrideC_,
+        StrideD_,
+        ThreadEpilogueOp_,
+        SmemLayout_,
+        CopyAtomR2S_,
+        TiledCopyS2R_,
+        CopyAtomR2G_,
+        EpilogueScheduleType_,
+        cute::enable_if_t<
+          cute::is_same_v<EpilogueScheduleType_, EpilogueSimtVectorized>
+        >
+      > {
+public:
+  //
+  // Type Aliases
+  //
+  // Element types are re-exported from the thread-level epilogue operator
+  using ThreadEpilogueOp = ThreadEpilogueOp_;
+  using ElementAccumulator = typename ThreadEpilogueOp::ElementAccumulator;
+  using ElementCompute = typename ThreadEpilogueOp::ElementCompute;
+  using ElementScalar = ElementCompute;
+  using ElementOutput = typename ThreadEpilogueOp::ElementOutput;
+  using ElementC = typename ThreadEpilogueOp::ElementC;
+  using StrideC = StrideC_;
+  using ElementD = typename ThreadEpilogueOp::ElementD;
+  using StrideD = StrideD_;
+  using ElementBias = typename detail::IsThreadEpilogueOpWithBias<ThreadEpilogueOp>::type;
+  using SmemLayout   = SmemLayout_;
+  using CopyAtomR2S  = CopyAtomR2S_;
+  using TiledCopyS2R = TiledCopyS2R_;
+  using CopyAtomR2G  = CopyAtomR2G_;
+
+  using GmemTiledCopyC = void;
+  using GmemTiledCopyD = CopyAtomR2G;
+
+  static constexpr bool IsEpilogueBiasSupported = detail::IsThreadEpilogueOpWithBias<ThreadEpilogueOp>::value;
+  using StrideBias = cute::conditional_t<detail::is_m_major<StrideD>(), Stride<_1,_0,int64_t>, Stride<_0,_1,int64_t>>;
+
+  static_assert(cute::rank(StrideC{}) == 3, "StrideCD must be rank-3: [M, N, L]");
+  static_assert(cute::rank(StrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]");
+
+  struct SharedStorage  // staging buffer of accumulator-typed elements, sized by cosize_v<SmemLayout>
+  {
+    cute::array_aligned<ElementAccumulator, cute::cosize_v<SmemLayout>> smem_epilogue;
+  };
+
+  static constexpr bool IsActHasArgs = detail::IsThreadEpilogueOpWithElementwiseArguments<ThreadEpilogueOp>::value;
+
+  // Host-side thread-op arguments: scalar or pointer alpha/beta plus an optional bias pointer and its stride
+  template<class ThreadEpiOp, class = void>
+  struct ThreadEpilogueOpArguments {
+    ElementScalar alpha{0};
+    ElementScalar beta{0};
+    ElementScalar const* alpha_ptr = nullptr;
+    ElementScalar const* beta_ptr = nullptr;
+    ElementBias const* bias_ptr = nullptr;
+    StrideBias dBias{};
+  };
+
+  template<class ThreadEpiOp>
+  struct ThreadEpilogueOpArguments<  // variant that additionally carries the op's ElementwiseArguments
+          ThreadEpiOp,
+          cute::enable_if_t<detail::IsThreadEpilogueOpWithElementwiseArguments<ThreadEpiOp>::value>> {
+    ElementScalar alpha{0};
+    ElementScalar beta{0};
+    ElementScalar const* alpha_ptr = nullptr;
+    ElementScalar const* beta_ptr = nullptr;
+    ElementBias const* bias_ptr = nullptr;
+    StrideBias dBias{};
+    typename ThreadEpiOp::ElementwiseArguments activation{};
+  };
+
+  struct Arguments {  // host-side arguments: thread-op settings plus C/D pointers and strides
+    ThreadEpilogueOpArguments<ThreadEpilogueOp> thread{};
+    using StrideBias = decltype(thread.dBias);
+    ElementC const* ptr_C = nullptr;
+    StrideC dC{};
+    ElementD* ptr_D = nullptr;
+    StrideD dD{};
+  };
+
+  // Device-side epilogue params: the thread op's Params plus C, D and bias pointers/strides
+  template<class ThreadEpiOp, class = void>
+  struct ParamsType {
+    typename ThreadEpiOp::Params thread{};
+    ElementC const* ptr_C = nullptr;
+    StrideC dC{};
+    ElementD* ptr_D = nullptr;
+    StrideD dD{};
+    ElementBias const* ptr_Bias = nullptr;
+    StrideBias dBias{};
+  };
+
+  template<class ThreadEpiOp>
+  struct ParamsType<  // variant with 'activation' as second member; member order must match to_underlying_arguments
+          ThreadEpiOp,
+          cute::enable_if_t<detail::IsThreadEpilogueOpWithElementwiseArguments<ThreadEpiOp>::value>> {
+    typename ThreadEpiOp::Params thread{};
+    typename ThreadEpiOp::ElementwiseArguments activation{};
+    ElementC const* ptr_C = nullptr;
+    StrideC dC{};
+    ElementD* ptr_D = nullptr;
+    StrideD dD{};
+    ElementBias const* ptr_Bias = nullptr;
+    StrideBias dBias{};
+  };
+
+  using Params = ParamsType<ThreadEpilogueOp>;
+
+  //
+  // Methods
+  //
+
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(  // converts host Arguments into device Params; shape and workspace are unused
+      [[maybe_unused]] ProblemShape const& _,
+      Arguments const& args,
+      [[maybe_unused]] void* workspace) {
+    typename ThreadEpilogueOp::Params thread_op_args;
+    thread_op_args.alpha = args.thread.alpha;
+    thread_op_args.beta = args.thread.beta;
+    thread_op_args.alpha_ptr = args.thread.alpha_ptr;
+    thread_op_args.beta_ptr = args.thread.beta_ptr;
+
+    if constexpr (IsActHasArgs) {  // brace-init follows the member order of the matching ParamsType
+      return {
+        thread_op_args,
+        args.thread.activation,
+        args.ptr_C,
+        args.dC,
+        args.ptr_D,
+        args.dD,
+        args.thread.bias_ptr,
+        args.thread.dBias
+      };
+    }
+    else {
+      return {
+        thread_op_args,
+        args.ptr_C,
+        args.dC,
+        args.ptr_D,
+        args.dD,
+        args.thread.bias_ptr,
+        args.thread.dBias
+      };
+    }
+  }
+
+  template <class ProblemShape>
+  static size_t
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
+    return 0;  // this epilogue requests no workspace
+  }
+
+  template <class ProblemShape>
+  static cutlass::Status
+  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    return cutlass::Status::kSuccess;  // nothing to initialize
+  }
+
+  template <class ProblemShape>
+  static bool
+  can_implement(
+      [[maybe_unused]] ProblemShape const& problem_shape,
+      [[maybe_unused]] Arguments const& args) {
+    return true;  // no constraints are checked here
+  }
+
+  CUTLASS_HOST_DEVICE
+  Epilogue(Params const& params_)
+      : params(params_), epilogue_op(params_.thread) { }  // the thread op is built from the thread params
+
+  CUTLASS_DEVICE
+  bool
+  is_source_needed() {
+    return epilogue_op.is_source_needed();  // delegates to the thread-level op
+  }
+
+  template<
+    class ProblemShapeMNKL,
+    class BlockShapeMNK,
+    class BlockCoordMNKL,
+    class FrgEngine, class FrgLayout,
+    class TiledMma,
+    class ResidueMNK
+  >
+  CUTLASS_DEVICE void
+  operator()(  // stages accumulators through SMEM, applies epilogue_op, stores D with bounds predication
+      ProblemShapeMNKL problem_shape_mnkl,                                     // rank-4 (M,N,K,L); M, N and L are read
+      BlockShapeMNK blk_shape_MNK,                                             // static rank-3 tile shape
+      BlockCoordMNKL blk_coord_mnkl,                                           // rank-4 tile coordinate (m,n,k,l)
+      cute::Tensor<FrgEngine,FrgLayout> const& accumulators,                   // (MMA,MMA_M,MMA_N)
+      TiledMma tiled_mma,
+      ResidueMNK residue_mnk,                                                  // modes 0-1 bound the valid (m,n) coords
+      int thread_idx,
+      char* smem_buf) {                                                        // reinterpreted as SharedStorage
+    using namespace cute;
+    using X = Underscore;
+
+    static_assert(cute::rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4");
+    static_assert(is_static<BlockShapeMNK>::value, "ThreadBlock tile shape must be static");
+    static_assert(cute::rank(BlockShapeMNK{}) == 3, "BlockShapeMNK must be rank 3");
+    static_assert(cute::rank(BlockCoordMNKL{}) == 4, "BlockCoordMNKL must be rank 4");
+
+    // synchronizing function for smem reads/writes
+#if CUDA_BARRIER_ENABLED
+    auto synchronize = [] () { cutlass::arch::NamedBarrier::sync(typename TiledCopyS2R::TiledNumThr{}, cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); };
+#else
+    auto synchronize = [] () { __syncthreads(); };
+#endif
+
+    // Separate out problem shape for convenience
+    auto M = get<0>(problem_shape_mnkl);
+    auto N = get<1>(problem_shape_mnkl);
+    auto L = get<3>(problem_shape_mnkl);
+
+    // Represent the full output tensor
+    Tensor mC_mnl = make_tensor(make_gmem_ptr(params.ptr_C), make_shape(M,N,L), params.dC);             //             (m,n,l)
+    Tensor mD_mnl = make_tensor(make_gmem_ptr(params.ptr_D), make_shape(M,N,L), params.dD);             //             (m,n,l)
+    Tensor mBias_mnl = make_tensor(make_gmem_ptr(params.ptr_Bias), make_shape(M,N,L), params.dBias);    //             (m,n,l)
+
+    Tensor gC_mnl = local_tile(mC_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{});             // (BLK_M,BLK_N,m,n,l)
+    Tensor gD_mnl = local_tile(mD_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{});             // (BLK_M,BLK_N,m,n,l)
+    Tensor gBias_mnl = local_tile(mBias_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{});       // (BLK_M,BLK_N,m,n,l)
+
+    // Slice to get the tile this CTA is responsible for
+    auto [m_coord, n_coord, k_coord, l_coord] = blk_coord_mnkl;
+    Tensor gC = gC_mnl(_,_,m_coord,n_coord,l_coord);                                                   // (BLK_M,BLK_N)
+    Tensor gD = gD_mnl(_,_,m_coord,n_coord,l_coord);                                                   // (BLK_M,BLK_N)
+    Tensor gBias = gBias_mnl(_,_,m_coord,n_coord,l_coord);                                             // (BLK_M,BLK_N)
+
+    // Construct a tensor in SMEM that we can partition for rearranging data
+    SharedStorage& storage = *reinterpret_cast<SharedStorage*>(smem_buf);
+    Tensor sAcc = make_tensor(make_smem_ptr(storage.smem_epilogue.data()), SmemLayout{});            // (SMEM_M,SMEM_N)
+
+    // Partition sAcc to match the accumulator partitioning
+    auto tiled_r2s = make_tiled_copy_C(CopyAtomR2S{}, tiled_mma);
+    auto thread_r2s     = tiled_r2s.get_thread_slice(thread_idx);
+    Tensor tRS_rAcc = thread_r2s.retile_S(accumulators);                              // ((Atom,AtomNum), MMA_M, MMA_N)
+    Tensor tRS_sAcc = thread_r2s.partition_D(sAcc);                                   // ((Atom,AtomNum),PIPE_M,PIPE_N)
+
+    // Tile gD and gC by the shape of SmemLayout first
+    auto tile  = make_shape(size<0>(sAcc), size<1>(sAcc));
+    Tensor gCt = flat_divide(gC, tile);                                                // (SMEM_M,SMEM_N,TILE_M,TILE_N)
+    Tensor gDt = flat_divide(gD, tile);                                                // (SMEM_M,SMEM_N,TILE_M,TILE_N)
+    Tensor gBiast = flat_divide(gBias, tile);                                          // (SMEM_M,SMEM_N,TILE_M,TILE_N)
+
+    // Partition sAcc, gC, and gD for the output
+    auto tiled_s2r = TiledCopyS2R{};
+    auto thread_s2r     = tiled_s2r.get_thread_slice(thread_idx);
+    Tensor tSR_sAcc = thread_s2r.partition_S(sAcc);                      //               ((Atom,AtomNum),ATOM_M,ATOM_N)
+    Tensor tSR_gC = thread_s2r.partition_D(gCt);                         // ((Atom,AtomNum),ATOM_M,ATOM_N,TILE_M,TILE_N)
+    Tensor tSR_gD = thread_s2r.partition_D(gDt);                         // ((Atom,AtomNum),ATOM_M,ATOM_N,TILE_M,TILE_N)
+    Tensor tSR_gBias = thread_s2r.partition_D(gBiast);                   // ((Atom,AtomNum),ATOM_M,ATOM_N,TILE_M,TILE_N)
+
+    // Allocate intermediate registers on the dst tensors
+    Tensor tSR_rAcc = make_tensor<ElementAccumulator>(take<0,3>(shape(tSR_gC)));       // ((Atom,AtomNum),ATOM_M,ATOM_N)
+    Tensor tSR_rC = make_tensor<ElementC>(shape(tSR_rAcc));                            // ((Atom,AtomNum),ATOM_M,ATOM_N)
+    Tensor tSR_rD = make_tensor<ElementD>(shape(tSR_rAcc));                            // ((Atom,AtomNum),ATOM_M,ATOM_N)
+    Tensor tSR_rBias = make_tensor_like(tSR_gBias);                      // ((Atom,AtomNum),ATOM_M,ATOM_N,TILE_M,TILE_N)
+
+    // Repeat the D-partitioning for coordinates and predication
+    Tensor cD   = make_identity_tensor(make_shape(size<0>(gD),size<1>(gD)));           // (BLK_M,BLK_N) -> (blk_m,blk_n)
+    Tensor cDt  = flat_divide(cD, tile);                                 //                (SMEM_M,SMEM_N,TILE_M,TILE_N)
+    Tensor tSR_cD = thread_s2r.partition_D(cDt);                         // ((Atom,AtomNum),ATOM_M,ATOM_N,TILE_M,TILE_N)
+
+    CUTE_STATIC_ASSERT(size<1>(tRS_rAcc) % size<3>(tSR_gC) == 0);  // TILE_M divides MMA_M
+    CUTE_STATIC_ASSERT(size<2>(tRS_rAcc) % size<4>(tSR_gC) == 0);  // TILE_N divides MMA_N
+
+#if 0
+    if (thread_idx == 0 && m_coord == 0 && n_coord == 0) {
+      print("aC   : "); print(accumulators.layout()); print("\n");
+      print("gC   : "); print(gC.layout()); print("\n");
+      print("gD   : "); print(gD.layout()); print("\n");
+      print("gBias   : "); print(gBias.layout()); print("\n");
+      print("sAcc   : "); print(sAcc.layout()); print("\n");
+      print("\n");
+      print("tRS_sAcc : "); print(tRS_sAcc.layout()); print("\n");
+      print("tRS_rAcc : "); print(tRS_rAcc.layout()); print("\n");
+      print("\n");
+      print("gDt  : "); print(gDt.layout()); print("\n");
+      print("tSR_sAcc : "); print(tSR_sAcc.layout()); print("\n");
+      print("tSR_rAcc : "); print(tSR_rAcc.layout()); print("\n");
+      print("\n");
+      print("tSR_rC : "); print(tSR_rC.layout()); print("\n");
+      print("tSR_rD : "); print(tSR_rD.layout()); print("\n");
+      print("tSR_gC : "); print(tSR_gC.layout()); print("\n");
+      print("tSR_gD : "); print(tSR_gD.layout()); print("\n");
+      print("\n");
+      print("gBiast  : "); print(gBiast.layout()); print("\n");
+      print("tSR_gBias  : "); print(tSR_gBias.layout()); print("\n");
+      print("tSR_rBias  : "); print(tSR_rBias.layout()); print("\n");
+    }
+#endif
+
+    if constexpr (IsEpilogueBiasSupported) {
+      if (params.ptr_Bias) {  // NOTE(review): if null, tSR_rBias is never filled here yet is still passed to epilogue_op below - confirm intent
+        // Filter so we don't issue redundant copies over stride-0 modes
+        // (only works if 0-strides are in same location, which is by construction)
+        Tensor tSR_gBias_flt = filter_zeros(tSR_gBias);
+        Tensor tSR_rBias_flt = filter_zeros(tSR_rBias);
+        Tensor tSR_cD_flt = filter_zeros(tSR_cD, tSR_gBias.stride());
+
+        // Step 0. Copy Bias from GMEM to fragment
+        auto pred_fn = [&] (auto const&... coords) { return elem_less(tSR_cD_flt(coords...), take<0, 2>(residue_mnk)); };
+        copy_if(pred_fn, tSR_gBias_flt, tSR_rBias_flt);
+      }
+    }
+
+    // For each tiling needed for SmemLayout to cover shape(gD)
+    CUTLASS_PRAGMA_UNROLL
+    for (int step_m = 0; step_m < size<2>(cDt); ++step_m) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int step_n = 0; step_n < size<3>(cDt); ++step_n) {
+        // Step 1. Copy to SMEM
+        CUTLASS_PRAGMA_UNROLL
+        for (int pipe_m = 0; pipe_m < size<1>(tRS_sAcc); ++pipe_m) {
+          CUTLASS_PRAGMA_UNROLL
+          for (int pipe_n = 0; pipe_n < size<2>(tRS_sAcc); ++pipe_n) {
+            int mma_m = step_m * size<1>(tRS_sAcc) + pipe_m;
+            int mma_n = step_n * size<2>(tRS_sAcc) + pipe_n;
+
+            copy(tiled_r2s, tRS_rAcc(_,mma_m,mma_n), tRS_sAcc(_,pipe_m,pipe_n));
+          }
+        }
+
+        // Step 2. Wait for SMEM writes to complete
+        synchronize();
+
+        // Step 3. Copy from SMEM into a fragment
+        copy(tiled_s2r, tSR_sAcc, tSR_rAcc);
+
+        // Step 4. Wait for SMEM reads to complete
+        synchronize();
+
+        Tensor tSR_gDmn = tSR_gD(_,_,_,step_m,step_n);
+        Tensor tSR_cDmn = tSR_cD(_,_,_,step_m,step_n);
+
+        if constexpr (IsEpilogueBiasSupported) {
+          Tensor tSR_rBiasmn = tSR_rBias(_,_,_,step_m,step_n);
+
+          if (epilogue_op.is_source_needed()) {
+            // source is needed
+            Tensor tSR_gCmn = tSR_gC(_,_,_,step_m,step_n);
+
+            // Step 5. Copy C from GMEM to a fragment
+            CUTLASS_PRAGMA_UNROLL
+            for (int m = 0; m < size<1>(tSR_gDmn); ++m) {
+              CUTLASS_PRAGMA_UNROLL
+              for (int n = 0; n < size<2>(tSR_gDmn); ++n) {
+                // Predication
+                if (elem_less(tSR_cDmn(0,m,n), take<0,2>(residue_mnk))) {
+                  CUTLASS_PRAGMA_UNROLL
+                  for (int i = 0; i < size<0>(tSR_rAcc); ++i) {
+                    tSR_rC(i,m,n) = tSR_gCmn(i,m,n);
+                  }
+                }
+              }
+            }
+
+            // Step 6. Elementwise operation with conversion
+            CUTLASS_PRAGMA_UNROLL
+            for (int i = 0; i < size(tSR_rAcc); ++i) {
+              if constexpr (IsActHasArgs) {
+                epilogue_op(tSR_rD(i), tSR_rD(i), tSR_rAcc(i), tSR_rC(i), tSR_rBiasmn(i), params.activation);
+              } else {
+                epilogue_op(tSR_rD(i), tSR_rD(i), tSR_rAcc(i), tSR_rC(i), tSR_rBiasmn(i));
+              }
+            }
+          }
+          else {
+            // source is not needed, avoid load and lift compute
+
+            // Step 5. Elementwise operation with conversion
+            CUTLASS_PRAGMA_UNROLL
+            for (int i = 0; i < size(tSR_rAcc); ++i) {
+              if constexpr (IsActHasArgs) {
+                epilogue_op(tSR_rD(i), tSR_rD(i), tSR_rAcc(i), tSR_rBiasmn(i), params.activation);
+              } else {
+                epilogue_op(tSR_rD(i), tSR_rD(i), tSR_rAcc(i), tSR_rBiasmn(i));
+              }
+            }
+          }
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int m = 0; m < size<1>(tSR_gDmn); ++m) {
+            CUTLASS_PRAGMA_UNROLL
+            for (int n = 0; n < size<2>(tSR_gDmn); ++n) {
+              // Predication
+              if (elem_less(tSR_cDmn(0,m,n), take<0,2>(residue_mnk))) {
+                // The Last Step. Copy to GMEM
+                copy(CopyAtomR2G{}, tSR_rD(_,m,n), tSR_gDmn(_,m,n));
+              }
+            }
+          }
+        } else {
+          if (epilogue_op.is_source_needed()) {
+            // source is needed
+            Tensor tSR_gCmn = tSR_gC(_,_,_,step_m,step_n);
+
+            // Step 5. Copy C from GMEM to a fragment
+            CUTLASS_PRAGMA_UNROLL
+            for (int m = 0; m < size<1>(tSR_gDmn); ++m) {
+              CUTLASS_PRAGMA_UNROLL
+              for (int n = 0; n < size<2>(tSR_gDmn); ++n) {
+                // Predication
+                if (elem_less(tSR_cDmn(0,m,n), take<0,2>(residue_mnk))) {
+                  CUTLASS_PRAGMA_UNROLL
+                  for (int i = 0; i < size<0>(tSR_rAcc); ++i) {
+                    tSR_rC(i,m,n) = tSR_gCmn(i,m,n);
+                  }
+                }
+              }
+            }
+
+            // Step 6. Elementwise operation with conversion
+            CUTLASS_PRAGMA_UNROLL
+            for (int i = 0; i < size(tSR_rAcc); ++i) {
+              tSR_rD(i) = epilogue_op(tSR_rAcc(i), tSR_rC(i));
+            }
+          }
+          else {
+            // source is not needed, avoid load and lift compute
+
+            // Step 5. Elementwise operation with conversion
+            CUTLASS_PRAGMA_UNROLL
+            for (int i = 0; i < size(tSR_rAcc); ++i) {
+              tSR_rD(i) = epilogue_op(tSR_rAcc(i));
+            }
+          }
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int m = 0; m < size<1>(tSR_gDmn); ++m) {
+            CUTLASS_PRAGMA_UNROLL
+            for (int n = 0; n < size<2>(tSR_gDmn); ++n) {
+              // Predication
+              if (elem_less(tSR_cDmn(0,m,n), take<0,2>(residue_mnk))) {
+                // The Last Step. Copy to GMEM
+                copy(CopyAtomR2G{}, tSR_rD(_,m,n), tSR_gDmn(_,m,n));
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+private:
+  Params params;
+  ThreadEpilogueOp epilogue_op;
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace collective
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm70_epilogue_vectorized_array.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm70_epilogue_vectorized_array.hpp
new file mode 100755
index 000000000..8a70370b2
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm70_epilogue_vectorized_array.hpp
@@ -0,0 +1,412 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Pointer-array variant of the SIMT vectorized collective epilogue: C and D are addressed through per-batch pointer arrays.
+*/
+
+#pragma once
+
+#include "cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace collective {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Ptr Array Epilogue Vectorized
+/// Applies an element wise operation to all elements within the fragment
+/// and writes it out to destination storage.
+///
+/// Ways to generalize this:
+/// - CTA tile shape
+/// - vectorization requirements (GMEM)
+/// - vectoriz(able) transform()
+///
+template <
+  class StrideC_,
+  class StrideD_,
+  class ThreadEpilogueOp_,
+  class SmemLayout_,
+  class CopyAtomR2S_,
+  class TiledCopyS2R_,
+  class CopyAtomR2G_,
+  class EpilogueScheduleType_
+>
+class Epilogue<
+        StrideC_,
+        StrideD_,
+        ThreadEpilogueOp_,
+        SmemLayout_,
+        CopyAtomR2S_,
+        TiledCopyS2R_,
+        CopyAtomR2G_,
+        EpilogueScheduleType_,
+        cute::enable_if_t<
+          cute::is_same_v<EpilogueScheduleType_, EpiloguePtrArraySimtVectorized>
+        >
+      > {
+public:
+  //
+  // Type Aliases
+  //
+  // derived types of output thread level operator
+  using ThreadEpilogueOp = ThreadEpilogueOp_;
+  using ElementAccumulator = typename ThreadEpilogueOp::ElementAccumulator;
+  using ElementCompute = typename ThreadEpilogueOp::ElementCompute;
+  using ElementScalar = ElementCompute;
+  using ElementOutput = typename ThreadEpilogueOp::ElementOutput;
+  using ElementC = typename ThreadEpilogueOp::ElementC;
+  using StrideC = StrideC_;
+  using InternalStrideC = cute::remove_pointer_t<StrideC>;
+  using ElementD = typename ThreadEpilogueOp::ElementD;
+  using StrideD = StrideD_;
+  using InternalStrideD = cute::remove_pointer_t<StrideD>;
+
+  using SmemLayout   = SmemLayout_;
+  using CopyAtomR2S  = CopyAtomR2S_;
+  using TiledCopyS2R = TiledCopyS2R_;
+  using CopyAtomR2G  = CopyAtomR2G_;
+
+  using GmemTiledCopyC = TiledCopyS2R;
+  using GmemTiledCopyD = TiledCopyS2R;
+
+  static const int kOutputAlignment = ThreadEpilogueOp::kCount;
+
+  using AlignmentType = typename cute::uint_bit<sizeof_bits<ElementOutput>::value * kOutputAlignment>::type;
+
+  static_assert(cute::rank(InternalStrideC{}) == 3, "StrideCD must be rank-3: [M, N, L]");
+  static_assert(cute::rank(InternalStrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]");
+
+  struct SharedStorage
+  {
+    cute::array_aligned<ElementAccumulator, cute::cosize_v<SmemLayout>> smem_epilogue;
+  };
+
+  using TensorMapStorage = SharedStorage;
+
+  // Host side epilogue arguments
+  struct Arguments {
+    typename ThreadEpilogueOp::Params thread{};
+    ElementC const** ptr_C = nullptr;
+    StrideC dC{};
+    ElementD** ptr_D = nullptr;
+    StrideD dD{};
+  };
+
+  // Device side epilogue params
+  using Params = Arguments;
+
+  //
+  // Methods
+  //
+
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(
+      ProblemShape const&,
+      Arguments const& args,
+      [[maybe_unused]] void* workspace) {
+    return args;
+  }
+
+  template <class ProblemShape>
+  static size_t
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args, int sm_count) {
+    return 0;
+  }
+
+  template <class ProblemShape>
+  static cutlass::Status
+  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    return cutlass::Status::kSuccess;
+  }
+
+  template <class ProblemShape>
+  static bool
+  can_implement(
+      [[maybe_unused]] ProblemShape const& problem_shape,
+      [[maybe_unused]] Arguments const& args) {
+    return true;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Epilogue(Params const& params_)
+      : params(params_) { }
+
+  CUTLASS_DEVICE
+  bool
+  is_source_needed() {
+    // For Ptr-Array or Grouped Gemm we cannot determine if source is needed based on first beta.
+    return true;
+  }
+
+  template<
+    class ProblemShapeMNKL,
+    class BlockShapeMNK,
+    class BlockCoordMNKL,
+    class FrgEngine, class FrgLayout,
+    class TiledMma,
+    class ResidueMNK
+  >
+  CUTLASS_DEVICE void
+  operator()(
+      ProblemShapeMNKL problem_shape_mnkl,
+      BlockShapeMNK blk_shape_MNK,
+      BlockCoordMNKL blk_coord_mnkl,
+      cute::Tensor<FrgEngine,FrgLayout> const& accumulators,                   // (MMA,MMA_M,MMA_N)
+      TiledMma tiled_mma,
+      ResidueMNK residue_mnk,
+      int thread_idx,
+      char* smem_buf) {
+    using namespace cute;
+    using X = Underscore;
+
+    static_assert(cute::rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4");
+    static_assert(is_static<BlockShapeMNK>::value, "ThreadBlock tile shape must be static");
+    static_assert(cute::rank(BlockShapeMNK{}) == 3, "BlockShapeMNK must be rank 3");
+    static_assert(cute::rank(BlockCoordMNKL{}) == 4, "BlockCoordMNKL must be rank 3");
+
+    // synchronizing function for smem reads/writes
+#if CUDA_BARRIER_ENABLED
+    auto synchronize = [] () { cutlass::arch::NamedBarrier::sync(typename TiledCopyS2R::TiledNumThr{}, cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); };
+#else
+    auto synchronize = [] () { __syncthreads(); };
+#endif
+
+    // Separate out problem shape for convenience
+    auto M = get<0>(problem_shape_mnkl);
+    auto N = get<1>(problem_shape_mnkl);
+    auto L = get<3>(problem_shape_mnkl);
+    // Batches are managed by using appropriate pointers to C and D matrices
+    const int32_t mock_L = 1;
+    const int32_t mock_l_coord = 0;
+    // Slice to get the tile this CTA is responsible for
+    auto [m_coord, n_coord, k_coord, l_coord] = blk_coord_mnkl;
+
+    // If scalar alpha/beta are provided, i.e., same alpha/beta applies to all batches/groups.
+    // If pointers to alpha/beta are provided, i.e., alpha/beta can differ between batches/groups,
+    // we get the correct alpha/beta values for the current batch/group using group index.
+    ThreadEpilogueOp epilogue_op = ThreadEpilogueOp(params.thread, l_coord);
+
+    if (epilogue_op.is_source_needed() && params.dC == nullptr) {
+      // Beta value is non-zero while pointer to C is a nullptr
+      assert(0);
+    }
+
+    InternalStrideC stride_c;
+    InternalStrideD stride_d;
+    if constexpr (!cute::is_same_v<InternalStrideC, StrideC>) {
+      // If grouped gemm
+      if (epilogue_op.is_source_needed()) {
+        stride_c = params.dC[l_coord];
+      }
+      stride_d = params.dD[l_coord];
+    }
+    else {
+      stride_c = params.dC;
+      stride_d = params.dD;
+    }
+
+    // Represent the full output tensor
+    ElementC const* ptr_C_l = nullptr;
+    if (epilogue_op.is_source_needed()) {
+      ptr_C_l = params.ptr_C[l_coord];
+    }
+    Tensor mC_mnl = make_tensor(make_gmem_ptr(ptr_C_l), make_shape(M,N,mock_L), stride_c);      //             (m,n,l)
+    Tensor mD_mnl = make_tensor(make_gmem_ptr(params.ptr_D[l_coord]), make_shape(M,N,mock_L), stride_d);      //             (m,n,l)
+    Tensor gC_mnl = local_tile(mC_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{});      // (BLK_M,BLK_N,m,n,l)
+    Tensor gD_mnl = local_tile(mD_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{});      // (BLK_M,BLK_N,m,n,l)
+
+    Tensor gC = gC_mnl(_,_,m_coord,n_coord,mock_l_coord);                                                   // (BLK_M,BLK_N)
+    Tensor gD = gD_mnl(_,_,m_coord,n_coord,mock_l_coord);                                                   // (BLK_M,BLK_N)
+
+    // Construct a tensor in SMEM that we can partition for rearranging data
+    SharedStorage& storage = *reinterpret_cast<SharedStorage*>(smem_buf);
+    Tensor sAcc = make_tensor(make_smem_ptr(storage.smem_epilogue.data()), SmemLayout{});            // (SMEM_M,SMEM_N)
+
+    // Partition sAcc to match the accumulator partitioning
+    auto tiled_r2s = make_tiled_copy_C(CopyAtomR2S{}, tiled_mma);
+    auto thread_r2s     = tiled_r2s.get_thread_slice(thread_idx);
+    Tensor tRS_rAcc = thread_r2s.retile_S(accumulators);                              // ((Atom,AtomNum), MMA_M, MMA_N)
+    Tensor tRS_sAcc = thread_r2s.partition_D(sAcc);                                   // ((Atom,AtomNum),PIPE_M,PIPE_N)
+
+    // Tile gD and gC by the shape of SmemLayout first
+    auto tile  = make_shape(size<0>(sAcc), size<1>(sAcc));
+    Tensor gCt = flat_divide(gC, tile);                                                // (SMEM_M,SMEM_N,TILE_M,TILE_N)
+    Tensor gDt = flat_divide(gD, tile);                                                // (SMEM_M,SMEM_N,TILE_M,TILE_N)
+
+    // Partition sAcc, gC, and gD for the output
+    auto tiled_s2r = TiledCopyS2R{};
+    auto thread_s2r     = tiled_s2r.get_thread_slice(thread_idx);
+    Tensor tSR_sAcc = thread_s2r.partition_S(sAcc);                      //               ((Atom,AtomNum),ATOM_M,ATOM_N)
+    Tensor tSR_gC = thread_s2r.partition_D(gCt);                         // ((Atom,AtomNum),ATOM_M,ATOM_N,TILE_M,TILE_N)
+    Tensor tSR_gD = thread_s2r.partition_D(gDt);                         // ((Atom,AtomNum),ATOM_M,ATOM_N,TILE_M,TILE_N)
+
+    // Allocate intermediate registers on the dst tensors
+    Tensor tSR_rAcc = make_tensor<ElementAccumulator>(take<0,3>(shape(tSR_gC)));       // ((Atom,AtomNum),ATOM_M,ATOM_N)
+    Tensor tSR_rD = make_tensor<ElementOutput>(shape(tSR_rAcc));                       // ((Atom,AtomNum),ATOM_M,ATOM_N)
+
+    // Repeat the D-partitioning for coordinates and predication
+    Tensor cD   = make_identity_tensor(make_shape(size<0>(gD),size<1>(gD)));           // (BLK_M,BLK_N) -> (blk_m,blk_n)
+    Tensor cDt  = flat_divide(cD, tile);                                 //                (SMEM_M,SMEM_N,TILE_M,TILE_N)
+    Tensor tSR_cD = thread_s2r.partition_D(cDt);                         // ((Atom,AtomNum),ATOM_M,ATOM_N,TILE_M,TILE_N)
+
+    CUTE_STATIC_ASSERT(size<1>(tRS_rAcc) % size<3>(tSR_gC) == 0);  // TILE_M divides MMA_M
+    CUTE_STATIC_ASSERT(size<2>(tRS_rAcc) % size<4>(tSR_gC) == 0);  // TILE_N divides MMA_N
+
+#if 0
+    if (thread_idx == 0 && m_coord == 0 && n_coord == 0) {
+      print("aC   : "); print(accumulators.layout()); print("\n");
+      print("gC   : "); print(gC.layout()); print("\n");
+      print("gD   : "); print(gD.layout()); print("\n");
+      print("sAcc   : "); print(sAcc.layout()); print("\n");
+      print("\n");
+      print("tRS_sAcc : "); print(tRS_sAcc.layout()); print("\n");
+      print("tRS_rAcc : "); print(tRS_rAcc.layout()); print("\n");
+      print("\n");
+      print("gDt  : "); print(gDt.layout()); print("\n");
+      print("tSR_sAcc : "); print(tSR_sAcc.layout()); print("\n");
+      print("tSR_rAcc : "); print(tSR_rAcc.layout()); print("\n");
+      print("\n");
+      print("tSR_rD : "); print(tSR_rD.layout()); print("\n");
+      print("tSR_gC : "); print(tSR_gC.layout()); print("\n");
+      print("tSR_gD : "); print(tSR_gD.layout()); print("\n");
+      print("\n");
+    }
+#endif
+
+    // For each tiling needed for SmemLayout to cover shape(gD)
+    CUTLASS_PRAGMA_UNROLL
+    for (int step_m = 0; step_m < size<2>(cDt); ++step_m) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int step_n = 0; step_n < size<3>(cDt); ++step_n) {
+        // Step 1. Copy to SMEM
+        CUTLASS_PRAGMA_UNROLL
+        for (int pipe_m = 0; pipe_m < size<1>(tRS_sAcc); ++pipe_m) {
+          CUTLASS_PRAGMA_UNROLL
+          for (int pipe_n = 0; pipe_n < size<2>(tRS_sAcc); ++pipe_n) {
+            int mma_m = step_m * size<1>(tRS_sAcc) + pipe_m;
+            int mma_n = step_n * size<2>(tRS_sAcc) + pipe_n;
+
+            copy(tiled_r2s, tRS_rAcc(_,mma_m,mma_n), tRS_sAcc(_,pipe_m,pipe_n));
+          }
+        }
+
+        // Step 2. Wait for SMEM writes to complete
+        synchronize();
+
+        // Step 3. Copy from SMEM into a fragment
+        copy(tiled_s2r, tSR_sAcc, tSR_rAcc);
+
+        // Step 4. Wait for SMEM reads to complete
+        synchronize();
+
+        Tensor tSR_gDmn = tSR_gD(_,_,_,step_m,step_n);
+        Tensor tSR_cDmn = tSR_cD(_,_,_,step_m,step_n);
+
+        if (epilogue_op.is_source_needed()) {
+          // source is needed
+          Tensor tSR_gCmn = tSR_gC(_,_,_,step_m,step_n);
+
+          Tensor tSR_rCmn = make_tensor<ElementC>(shape(tSR_gCmn));                     // ((Atom,AtomNum),ATOM_M,ATOM_N)
+
+          // Step 5. Copy C from GMEM to a fragment
+          CUTLASS_PRAGMA_UNROLL
+          for (int m = 0; m < size<1>(tSR_gDmn); ++m) {
+            CUTLASS_PRAGMA_UNROLL
+            for (int n = 0; n < size<2>(tSR_gDmn); ++n) {
+              // Predication
+              if (elem_less(tSR_cDmn(0,m,n), take<0,2>(residue_mnk))) {
+                CUTLASS_PRAGMA_UNROLL
+                for (int i = 0; i < size<0>(tSR_rAcc); ++i) {
+                  tSR_rCmn(i,m,n) = tSR_gCmn(i,m,n);
+                }
+              }
+            }
+          }
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int m = 0; m < size<1>(tSR_gDmn); ++m) {
+            CUTLASS_PRAGMA_UNROLL
+            for (int n = 0; n < size<2>(tSR_gDmn); ++n) {
+              // Predication
+              if (elem_less(tSR_cDmn(0,m,n), take<0,2>(residue_mnk))) {
+                // Step 6. Elementwise operation with conversion
+                CUTLASS_PRAGMA_UNROLL
+                for (int i = 0; i < size<0>(tSR_rAcc); ++i) {
+                  tSR_rD(i,m,n) = epilogue_op(tSR_rAcc(i,m,n), tSR_rCmn(i,m,n));
+                }
+                // Step 7. Copy to GMEM
+                copy(CopyAtomR2G{}, tSR_rD(_,m,n), tSR_gDmn(_,m,n));
+              }
+            }
+          }
+        }
+        else {
+          // source is not needed, avoid load and lift compute
+
+          // Step 5. Elementwise operation with conversion
+          CUTLASS_PRAGMA_UNROLL
+          for (int i = 0; i < size(tSR_rAcc); ++i) {
+            tSR_rD(i) = epilogue_op(tSR_rAcc(i));
+          }
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int m = 0; m < size<1>(tSR_gDmn); ++m) {
+            CUTLASS_PRAGMA_UNROLL
+            for (int n = 0; n < size<2>(tSR_gDmn); ++n) {
+              // Predication
+              if (elem_less(tSR_cDmn(0,m,n), take<0,2>(residue_mnk))) {
+                // Step 6. Copy to GMEM
+                copy(CopyAtomR2G{}, tSR_rD(_,m,n), tSR_gDmn(_,m,n));
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+private:
+  Params params;
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace collective
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp
new file mode 100755
index 000000000..84b6e14ee
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp
@@ -0,0 +1,1191 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Collective epilogue specialization for the Sm90PtrArrayTmaWarpSpecialized dispatch policy (ptr-array / grouped GEMM).
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/arch/barrier.h"
+#include "cutlass/epilogue/dispatch_policy.hpp"
+#include "cutlass/epilogue/collective/detail.hpp"
+#include "cutlass/epilogue/thread/scale_type.h"
+#include "cutlass/epilogue/fusion/callbacks.hpp"
+#include "cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp"
+#include "cutlass/detail/collective.hpp"
+#include "cutlass/detail/layout.hpp"
+#include "cutlass/trace.h"
+#include "cutlass/cuda_host_adapter.hpp"
+
+#include "cute/tensor.hpp"
+#include "cute/atom/copy_traits_sm90_tma.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace collective {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  int StagesC_,
+  int StagesD_,
+  int FragmentSize_,
+  bool ReuseSmemC_,
+  bool DelayTmaStore_,
+  int NumEpilogueWarpGroups_,
+  class CtaTileMNK_,   //     (CTA_M,CTA_N,CTA_K)
+  class EpilogueTile_, // (EPI_TILE_M,EPI_TILE_N)
+  class ElementC_,
+  class StrideC_,
+  class ElementD_,
+  class StrideD_,
+  class FusionCallbacks_,
+  class CopyOpG2S_,
+  class SmemLayoutAtomC_,
+  class CopyOpS2R_,
+  class CopyOpS2G_,
+  class SmemLayoutAtomD_,
+  class CopyOpR2S_,
+  class CopyAtomC_,
+  class CopyOpR2R_
+>
+class CollectiveEpilogue<
+    Sm90PtrArrayTmaWarpSpecialized<StagesC_,
+                                   StagesD_,
+                                   FragmentSize_,
+                                   ReuseSmemC_,
+                                   DelayTmaStore_,
+                                   NumEpilogueWarpGroups_
+                                  >,
+    CtaTileMNK_,
+    EpilogueTile_,
+    ElementC_,
+    StrideC_,
+    ElementD_,
+    StrideD_,
+    FusionCallbacks_,
+    CopyOpG2S_,
+    SmemLayoutAtomC_,
+    CopyOpS2R_,
+    CopyOpS2G_,
+    SmemLayoutAtomD_,
+    CopyOpR2S_,
+    CopyAtomC_,
+    CopyOpR2R_
+> {
+public:
+  //
+  // Type Aliases
+  //
+  using DispatchPolicy = Sm90PtrArrayTmaWarpSpecialized<StagesC_,  // policy tag this specialization is selected by
+                                                        StagesD_,
+                                                        FragmentSize_,
+                                                        ReuseSmemC_,
+                                                        DelayTmaStore_, 
+                                                        NumEpilogueWarpGroups_
+                                                       >;
+  using CtaTileMNK = CtaTileMNK_;  // asserted rank-3 below: (CTA_M,CTA_N,CTA_K)
+  using EpilogueTile = EpilogueTile_;  // asserted rank-2 below: (EPI_TILE_M,EPI_TILE_N)
+  using FusionCallbacks = FusionCallbacks_;
+  using ElementC = ElementC_;  // may be void, see is_source_supported
+  using StrideC = StrideC_;
+  using InternalStrideC = cute::remove_pointer_t<StrideC>;  // StrideC with one pointer level removed; differs from StrideC only if StrideC is a pointer type
+  using ElementD = ElementD_;  // may be void, see is_destination_supported
+  using StrideD = StrideD_;
+  using InternalStrideD = cute::remove_pointer_t<StrideD>;  // same treatment as InternalStrideC
+  using CopyOpG2S = CopyOpG2S_;
+  using SmemLayoutAtomC = SmemLayoutAtomC_;
+  using CopyOpS2R = CopyOpS2R_;
+  using CopyOpS2G = CopyOpS2G_;
+  using SmemLayoutAtomD = SmemLayoutAtomD_;
+  using CopyOpR2S = CopyOpR2S_;
+  using CopyAtomC = CopyAtomC_;
+  using CopyOpR2R = CopyOpR2R_;  // void disables the register-to-register step, see IsUseR2R
+
+  using ThreadEpilogueOp = typename epilogue::fusion::FusionCallbacksTraits<FusionCallbacks>::Operation;
+  using GmemTiledCopyC = CopyOpG2S;  // alias of the op used to load C
+  using GmemTiledCopyD = CopyOpS2G;  // alias of the op used to store D
+
+  static_assert(!is_layout<EpilogueTile>::value && is_tuple<EpilogueTile>::value, "EpilogueTile must be a cute::Tile or cute::Shape");
+  static_assert(cute::rank(CtaTileMNK{}) == 3, "CtaTileMNK must be rank-3: [CTA_M, CTA_N, CTA_K]");
+  static_assert(cute::rank(EpilogueTile{}) == 2, "EpilogueTile must be rank-2: [EPI_TILE_M, EPI_TILE_N]");
+  static_assert(size<0>(CtaTileMNK{}) % size<0>(shape(EpilogueTile{})) == 0, "EPI_TILE_M must divide CTA_M");
+  static_assert(size<1>(CtaTileMNK{}) % size<1>(shape(EpilogueTile{})) == 0, "EPI_TILE_N must divide CTA_N");
+  static_assert(cute::rank(InternalStrideC{}) == 3, "StrideC must be rank-3: [M, N, L]");
+  static_assert(cute::rank(InternalStrideD{}) == 3, "StrideD must be rank-3: [M, N, L]");
+
+private:
+  constexpr static bool is_source_supported = not cute::is_void_v<ElementC>;  // false for a void-C (sourceless) epilogue
+  constexpr static bool is_destination_supported = not cute::is_void_v<ElementD>;  // false for a void-D epilogue
+  using NonVoidElementD = cute::conditional_t<not is_destination_supported,fusion::get_element_aux_t<FusionCallbacks>, ElementD>;  // void D falls back to the fusion's aux element type
+  static_assert(not cute::is_void_v<NonVoidElementD>, "SmemElementD is void");
+  using NonVoidElementC = cute::conditional_t<not is_source_supported,NonVoidElementD,ElementC>; // prevents void ref breakages
+
+  using SmemElementC = typename cutlass::detail::get_unpacked_element_type<NonVoidElementC>::type;
+  using SmemElementD = typename cutlass::detail::get_unpacked_element_type<NonVoidElementD>::type;
+
+  constexpr static int StagesC = StagesC_;
+  constexpr static int StagesD = StagesD_;
+  constexpr static bool ReuseSmemC = ReuseSmemC_ and is_destination_supported;  // the request is dropped when there is no D
+  constexpr static bool DelayTmaStore = DelayTmaStore_;
+
+  constexpr static bool is_m_major_C = detail::is_m_major<InternalStrideC>();
+  constexpr static bool is_m_major_D = detail::is_m_major<InternalStrideD>();
+
+  constexpr static bool is_im2col_C = cute::is_same_v<CopyOpG2S, SM90_TMA_LOAD_IM2COL>;
+  constexpr static bool is_im2col_D = cute::is_same_v<CopyOpS2G, SM90_TMA_STORE_IM2COL>;
+
+  // Check if register transformation is needed before copying register to shared memory.
+  constexpr static bool IsUseR2R = !cute::is_void_v<CopyOpR2R>;
+
+  using SmemLayoutC = decltype(tile_to_shape(  // atom tiled to (EPI_TILE_M,EPI_TILE_N,StagesC)
+      SmemLayoutAtomC{},
+      make_shape(size<0>(EpilogueTile{}), size<1>(EpilogueTile{}), Int<StagesC>{}),
+      cute::conditional_t<is_m_major_C, Step<_2,_1,_3>, Step<_1,_2,_3>>{} ));  // tiling order depends on C majorness
+  using SmemLayoutD = decltype(tile_to_shape(  // atom tiled to (EPI_TILE_M,EPI_TILE_N,stages)
+      SmemLayoutAtomD{},
+      make_shape(size<0>(EpilogueTile{}), size<1>(EpilogueTile{}), Int<ReuseSmemC ? StagesC : StagesD>{}),  // stage count follows C when smem is reused
+      cute::conditional_t<is_m_major_D, Step<_2,_1,_3>, Step<_1,_2,_3>>{} ));
+
+  constexpr static bool support_smem_reuse = is_source_supported && is_destination_supported && StagesD <= StagesC
+                                            && cosize(take<0,2>(SmemLayoutC{})) == cosize(take<0,2>(SmemLayoutD{}));  // one stage of C and of D must have equal cosize
+  static_assert(not (ReuseSmemC && not support_smem_reuse), "Smem reuse requirements not met");
+
+  constexpr static size_t SmemAlignmentD = cutlass::detail::alignment_for_swizzle(SmemLayoutD{});
+  constexpr static size_t SmemAlignmentC = cutlass::detail::alignment_for_swizzle(SmemLayoutC{});
+  constexpr static size_t MaxSmemAlignment = cute::max(SmemAlignmentC, SmemAlignmentD);  // used where C and D share storage
+
+  using SmemArrayTypeC = cute::ArrayEngine<SmemElementC, cosize_v<SmemLayoutC>>;
+  using SmemArrayTypeD = cute::ArrayEngine<SmemElementD, cosize_v<SmemLayoutD>>;
+
+  using EmptyType = cute::tuple<>;  // placeholder chosen when a buffer is not needed
+  using SmemCStorage = cute::conditional_t<is_source_supported and (not ReuseSmemC),  // C buffer only when C exists and is not sharing D's storage
+                         SmemArrayTypeC,
+                         EmptyType>;
+  using SmemDStorage = cute::conditional_t<is_destination_supported,  // D buffer only when D exists
+                         SmemArrayTypeD,
+                         EmptyType>;
+
+  struct CollectiveStorageWithC {  // struct: C and D buffers are distinct members, each with its own alignment
+    alignas(SmemAlignmentC) ArrayEngine<SmemElementC, cosize_v<SmemLayoutC>> smem_C;
+    alignas(SmemAlignmentD) ArrayEngine<SmemElementD, cosize_v<SmemLayoutD>> smem_D;
+  };
+
+  union CollectiveStorageWithoutC {  // selected by TensorStorage when is_source_supported is false
+    cute::array<SmemElementC, 0> smem_C;  // zero-length array; keeps the smem_C member name available
+    alignas(SmemAlignmentD) ArrayEngine<SmemElementD, cosize_v<SmemLayoutD>> smem_D;
+  };
+
+  union CollectiveStorageReuseC {  // union: smem_C and smem_D overlap; selected when ReuseSmemC is true
+    alignas(MaxSmemAlignment) ArrayEngine<SmemElementC, cosize_v<SmemLayoutC>> smem_C;
+    alignas(MaxSmemAlignment) ArrayEngine<SmemElementD, cosize_v<SmemLayoutD>> smem_D;
+  };
+
+public:
+  // TMA pipeline for loading C (StagesC stages)
+  using LoadPipeline = cutlass::PipelineTransactionAsync<StagesC>;
+  using LoadPipelineState = cutlass::PipelineState<StagesC>;
+  constexpr static uint32_t TmaTransactionBytes =  // bytes in one (EPI_TILE_M,EPI_TILE_N) stage of SmemLayoutC
+    (size(take<0,2>(SmemLayoutC{})) * static_cast<uint32_t>(sizeof_bits<SmemElementC>::value)) / 8;  // element count * bits per element / 8
+  constexpr static bool RequiresTransactionBytes = true;
+
+  constexpr static int NumEpilogueWarpGroups = NumEpilogueWarpGroups_;  // also sizes the D tensormap array in TensorMapStorage
+
+  // TMA pipeline for storing D; with ReuseSmemC it is sized by StagesC and takes StagesD-1 as its second argument
+  using StorePipeline = cute::conditional_t<ReuseSmemC,
+                          cutlass::PipelineTmaStore<StagesC, StagesD-1>,
+                          cutlass::PipelineTmaStore<StagesD>>;
+  using StorePipelineState = cutlass::PipelineState<ReuseSmemC ? StagesC : StagesD>;
+
+  struct SharedStorage {  // groups the tensor buffers, the tensormap descriptors and the load-pipeline storage
+    struct TensorStorage {
+      using CollectiveStorage = cute::conditional_t<not is_source_supported, CollectiveStorageWithoutC,  // no C: D buffer only
+                                  cute::conditional_t<ReuseSmemC, CollectiveStorageReuseC, CollectiveStorageWithC>>;  // otherwise overlapping or separate C/D buffers
+      CollectiveStorage collective;
+
+      using FusionStorage = typename FusionCallbacks::SharedStorage;
+      FusionStorage thread;  // storage handed to the FusionCallbacks constructor
+    } tensors;
+
+    struct TensorMapStorage : cute::aligned_struct<128, _0> {  // 128-byte aligned base
+      cute::TmaDescriptor smem_tensormap_C;  // a single descriptor for C
+      cute::array<cute::TmaDescriptor, NumEpilogueWarpGroups> smem_tensormap_D;  // one D descriptor per epilogue warp group
+    } tensormaps;
+
+    using PipelineStorage = typename LoadPipeline::SharedStorage;
+    PipelineStorage pipeline;  // storage for the C load pipeline only
+  };
+  using TensorStorage = typename SharedStorage::TensorStorage;
+  using TensorMapStorage = typename SharedStorage::TensorMapStorage;
+  using PipelineStorage = typename SharedStorage::PipelineStorage;
+
+  static constexpr bool IsGroupedGemmKernel = !cute::is_same_v<InternalStrideC, StrideC>;  // true exactly when StrideC is a pointer type
+
+  // Host side epilogue arguments; converted to Params by to_underlying_arguments()
+  struct Arguments {
+    typename FusionCallbacks::Arguments thread{};  // forwarded to FusionCallbacks
+    ElementC const** ptr_C = nullptr;  // pointer to an array of C pointers (presumably one per batch/group - confirm with caller)
+    StrideC dC;  // a pointer type when IsGroupedGemmKernel is true
+    ElementD ** ptr_D = nullptr;  // pointer to an array of D pointers (presumably one per batch/group - confirm with caller)
+    StrideD dD;
+  };
+
+  // Device side epilogue params; produced by to_underlying_arguments()
+  struct Params {
+    using TMA_C = decltype(make_tma_copy(  // type only: deduced from a null-pointer placeholder tensor
+        CopyOpG2S{},
+        make_tensor(make_gmem_ptr(static_cast<NonVoidElementC const*>(nullptr)),
+            repeat_like(InternalStrideC{}, int32_t(0)), InternalStrideC{}),
+        take<0,2>(SmemLayoutC{}),
+        EpilogueTile{},
+        _1{}));
+
+    using TMA_D = decltype(make_tma_copy(  // type only: deduced from a null-pointer placeholder tensor
+        CopyOpS2G{},
+        make_tensor(make_gmem_ptr(static_cast<NonVoidElementD const*>(nullptr)),
+            repeat_like(InternalStrideD{}, int32_t(0)), InternalStrideD{}),
+        take<0,2>(SmemLayoutD{}),
+        EpilogueTile{},
+        _1{}));
+
+    typename FusionCallbacks::Params thread{};
+    TMA_C tma_load_c;
+    TMA_D tma_store_d;
+    cute::TmaDescriptor* tensormaps;  // to_underlying_arguments() points this just past the fusion workspace
+    ElementC const** ptr_C;  // copied from Arguments::ptr_C
+    StrideC dC;  // copied from Arguments::dC
+    ElementD** ptr_D;  // copied from Arguments::ptr_D
+    StrideD dD;  // copied from Arguments::dD
+    uint32_t tma_transaction_bytes = TmaTransactionBytes;
+  };
+
+  //
+  // Methods
+  //
+
+  template <class ProblemShape>  // Builds Params (TMA copies, workspace pointers, forwarded pointers/strides) from Arguments
+  static constexpr Params
+  to_underlying_arguments(
+      ProblemShape const& problem_shape,  // queried for host shape 0 only when not a grouped GEMM
+      Arguments const& args,  // ptr_C/dC/ptr_D/dD are forwarded into the result unchanged
+      [[maybe_unused]] void* workspace) {  // base of the fusion workspace; TMA descriptors follow it
+    // These tensor shapes (only applicable for grouped gemm) and pointers are only used to create tensormap/tma desc.
+    // These will be replaced with correct values before the initial tma load.
+    auto init_shape = repeat_like(append<4>(typename ProblemShape::UnderlyingProblemShape{}, 1), int32_t(1));  // all-ones placeholder shape
+    auto init_M = get<0>(init_shape);
+    auto init_N = get<1>(init_shape);
+    auto init_L = get<3>(init_shape);
+
+    static_assert(!is_im2col_C and !is_im2col_D, "Im2Col not supported on C or D");
+
+    InternalStrideC stride_c;
+    InternalStrideD stride_d;
+    if constexpr (IsGroupedGemmKernel) {
+      // Strides for Grouped Gemm will be replaced prior to the first access regardless.
+      stride_c = InternalStrideC{};
+      stride_d = InternalStrideD{};
+    } 
+    else {
+      // Tensor shapes for Ptr-Array are initialized correctly only here.
+      auto problem_shape_MNKL = append<4>(problem_shape.get_host_problem_shape(0), 1);
+      init_M = get<0>(problem_shape_MNKL);
+      init_N = get<1>(problem_shape_MNKL);
+      init_L = get<3>(problem_shape_MNKL);
+
+      stride_c = args.dC;
+      stride_d = args.dD;
+    }
+
+    uint32_t transaction_bytes = TmaTransactionBytes;
+    typename Params::TMA_C tma_load_c = {};  // stays value-initialized when C is void
+    if constexpr (is_source_supported) {
+      ElementC const* ptr_C_first_batch = reinterpret_cast<ElementC const*>(args.ptr_C);  // placeholder address for descriptor creation (see note above)
+      Tensor tensor_c = make_tensor(ptr_C_first_batch, make_layout(make_shape(init_M,init_N,init_L), append<3>(stride_c, _0{})));
+      tma_load_c = make_tma_copy(
+          CopyOpG2S{},
+          tensor_c,
+          take<0,2>(SmemLayoutC{}),
+          EpilogueTile{},
+          _1{});
+    }
+
+    typename Params::TMA_D tma_store_d = {};  // value-initialized like tma_load_c so the void-D path never returns an indeterminate descriptor
+    if constexpr (is_destination_supported) {
+      ElementD const* ptr_D_first_batch = reinterpret_cast<ElementD const*>(args.ptr_D);  // placeholder address for descriptor creation (see note above)
+      Tensor tensor_d = make_tensor(ptr_D_first_batch, make_layout(make_shape(init_M,init_N,init_L), append<3>(stride_d, _0{})));
+      tma_store_d = make_tma_copy(
+          CopyOpS2G{},
+          tensor_d,
+          take<0,2>(SmemLayoutD{}),
+          EpilogueTile{},
+          _1{});
+    }
+
+    auto fusion_workspace = static_cast<char*>(workspace);  // fusion workspace starts at the workspace base
+    auto fusion_workspace_size = FusionCallbacks::get_workspace_size(problem_shape, args.thread);
+    auto tma_descriptor_workspace = reinterpret_cast<cute::TmaDescriptor*>(  // TMA descriptors start right after the fusion workspace
+                                      static_cast<char*>(workspace) + fusion_workspace_size);
+
+    return {  // aggregate-initialized in Params member order
+      FusionCallbacks::to_underlying_arguments(problem_shape, args.thread, fusion_workspace),
+      tma_load_c,
+      tma_store_d,
+      tma_descriptor_workspace,
+      args.ptr_C,
+      args.dC,
+      args.ptr_D,
+      args.dD,
+      transaction_bytes,
+    };
+  }
+
+  template <class ProblemShape>  // Returns the workspace size in bytes: tensormap copies plus the fusion workspace
+  static size_t
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args, int sm_count) {
+    
+    constexpr uint32_t NumInputTensors = NumEpilogueWarpGroups + (cute::is_void_v<ElementC> ? 0 : 1);  // presumably the per-warp-group D descriptors plus one for C (cf. TensorMapStorage)
+    auto descriptors_shape = cute::make_shape(sm_count, Int<NumInputTensors>{});  // sm_count x NumInputTensors descriptors
+    constexpr size_t SizeOfCuTensorMap = sizeof(cute::TmaDescriptor);
+
+    // Gmem space for the epilogue tensormap copies of every SM, plus whatever FusionCallbacks needs
+    return (size(descriptors_shape) * SizeOfCuTensorMap) + FusionCallbacks::get_workspace_size(problem_shape, args.thread);
+  }
+
+  template <class ProblemShape>  // Workspace initialization is delegated entirely to FusionCallbacks
+  static cutlass::Status
+  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    return FusionCallbacks::initialize_workspace(problem_shape, args.thread, workspace, stream, cuda_adapter);  // status is returned as-is
+  }
+
+  template <class ProblemShape>  // True only if the TMA alignment, FusionCallbacks and beta checks all pass
+  static bool
+  can_implement(
+      ProblemShape problem_shape,
+      [[maybe_unused]] Arguments const& args) {
+
+    bool implementable = true;  // TMA alignment verdict, accumulated over all groups
+    bool fusion_implementable = true;  // FusionCallbacks verdict, accumulated over all groups
+
+    if (problem_shape.is_host_problem_shape_available()) {
+      for (int i = 0; i < problem_shape.groups(); ++i) {
+        auto problem_shape_MNKL = append<4>(problem_shape.get_host_problem_shape(i), 1);
+        auto [M,N,K,L] = problem_shape_MNKL;
+
+        if constexpr (is_destination_supported) {
+          constexpr int tma_alignment_bits_D = cutlass::detail::get_output_alignment_bits<ElementD>();
+          constexpr int min_tma_aligned_elements_D = tma_alignment_bits_D / cutlass::sizeof_bits<ElementD>::value;
+          implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_D>(cute::make_shape(M,N,L), InternalStrideD{});  // uses a default-constructed stride, not args.dD
+        }
+
+        if constexpr (not cute::is_void_v<ElementC>) {
+          constexpr int tma_alignment_bits_C = cutlass::detail::get_input_alignment_bits<ElementC>();
+          constexpr int min_tma_aligned_elements_C = tma_alignment_bits_C / cutlass::sizeof_bits<ElementC>::value;
+          implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_C>(cute::make_shape(M,N,L), InternalStrideC{});  // uses a default-constructed stride, not args.dC
+        }
+
+        fusion_implementable = fusion_implementable && FusionCallbacks::can_implement(problem_shape_MNKL, args.thread);
+      }
+    }
+    else {
+      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Ignoring check to can implement because host problem shape is not available.\n");  // both verdicts stay true in this case
+    }
+
+    if (!implementable) {
+      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
+    }
+
+    if (!fusion_implementable) {
+      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Problem Size doesn't meet the minimum requirements for FusionCallbacks.\n");
+    }
+
+    bool beta_implementable = true;
+
+    if constexpr (cute::is_void_v<ElementC>) {  // sourceless epilogue: beta must be zero and beta_ptr must be null
+      if constexpr (detail::has_beta<Arguments>::value) {
+        beta_implementable = args.thread.beta == 0.0;
+      }
+      if constexpr (detail::has_beta_ptr<Arguments>::value) {
+        beta_implementable = beta_implementable && args.thread.beta_ptr == nullptr;
+      }
+    }
+
+    if (!beta_implementable) {
+      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Beta/beta pointer was set, but epilogue is sourceless (void-C).\n");
+    }
+
+    return implementable && fusion_implementable && beta_implementable;
+  }
+
+  template<class TileShapeMNK>  // Number of load-pipeline stages consumed per tile of shape tile_shape_MNK
+  CUTLASS_HOST_DEVICE
+  static constexpr int
+  get_load_pipe_increment(TileShapeMNK tile_shape_MNK) {
+    // Compute number of epilogue subtiles: how many EpilogueTile-sized pieces cover the (M,N) modes of the tile
+    return size<1>(zipped_divide(make_layout(take<0,2>(tile_shape_MNK)), EpilogueTile{}));
+  }
+
+  template<class TileShapeMNK>  // Store-pipeline increment per tile; identical to the load increment
+  CUTLASS_HOST_DEVICE
+  static constexpr int
+  get_store_pipe_increment(TileShapeMNK tile_shape_MNK) {
+    return get_load_pipe_increment(tile_shape_MNK);  // one store step per epilogue subtile, same count as loads
+  }
+
+  CUTLASS_HOST_DEVICE  // Constructor: initializes `params` from params_ and builds the fusion callbacks
+  CollectiveEpilogue(Params const& params_, TensorStorage& shared_tensors)
+      : params(params_), fusion_callbacks(params_.thread, shared_tensors.thread) {}  // callbacks receive the fusion params and the fusion storage
+
+  CUTLASS_DEVICE  // Reports whether a producer load is required
+  bool
+  is_producer_load_needed() const {
+    return fusion_callbacks.is_producer_load_needed();  // the decision is made entirely by the fusion callbacks
+  }
+
+  CUTLASS_DEVICE auto  // Returns whatever tensormaps_init<true>() produces for the load path
+  load_init(
+      Params const& params,
+      TensorMapStorage& shared_tensormaps,
+      int32_t sm_count,
+      int32_t sm_idx) {
+    // Initialize tma for loading
+    constexpr bool IsLoad = true;
+    auto load_tensormaps = tensormaps_init<IsLoad>(params, shared_tensormaps, sm_count, sm_idx, 0);  // NOTE(review): meaning of the trailing 0 is defined by tensormaps_init, outside this view
+    return load_tensormaps;
+  }
+
+  template<  // Producer-side load: one pipeline stage per epilogue subtile, with a TMA load of C when needed
+    class ProblemShapeMNKL,
+    class TileShapeMNK,
+    class TileCoordMNKL,
+    class TiledMma,
+    class TensorMapC,
+    __CUTE_REQUIRES(std::is_pointer_v<TensorMapC>)  // TensorMapC must be a pointer type
+  >
+  CUTLASS_DEVICE auto
+  load(
+      LoadPipeline load_pipeline,
+      LoadPipelineState load_pipe_producer_state,  // advanced once per processed subtile and returned
+      ProblemShapeMNKL problem_shape_mnkl,
+      TileShapeMNK tile_shape_MNK,  // not referenced in this body
+      TileCoordMNKL tile_coord_mnkl,
+      TiledMma tiled_mma,
+      int thread_idx,
+      TensorStorage& shared_tensors,
+      TensorMapC const& load_tensormap,  // passed to tma_load_c.with() for every TMA load
+      int subtile_idx=-1,  // -1 processes every subtile; otherwise only the subtile with this linear index
+      bool wait_until_load_finishes = false) {  // when true, waits on the last stage that actually loaded C
+    using namespace cute;
+
+    // Indexing variables
+    auto [M, N, K, L] = problem_shape_mnkl;
+    auto [m_coord, n_coord, k_coord, l_coord] = tile_coord_mnkl;
+
+    static_assert(!is_im2col_D, "Do not support im2col");
+
+    auto coord_shape = append<3>(make_shape(m_coord, n_coord), Int<0>{});  // (m,n,0): the third coordinate is fixed to 0
+
+    // Represent the full source tensor, slice to get the tile this CTA is currently responsible for
+    Tensor mC_mn = params.tma_load_c.get_tma_tensor(append<3>(make_shape(M,N), Int<1>{}));             //       (M,N,L)
+    Tensor mC = coalesce(mC_mn, take<0,2>(CtaTileMNK{}));
+    Tensor gC = local_tile(mC, take<0,2>(CtaTileMNK{}), coord_shape);                                  // (CTA_M,CTA_N)
+
+    // Apply epilogue subtile, get matching smem tensor
+    auto ptr_sC = shared_tensors.collective.smem_C.begin();
+    Tensor gC_epi = flat_divide(gC, EpilogueTile{});                             // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N)
+    Tensor sC_epi = make_tensor(make_smem_ptr(ptr_sC), SmemLayoutC{});           //      (EPI_TILE_M,EPI_TILE_N,PIPE_C)
+
+    // Prepare the thread(b)lock's (G)mem to (S)mem TMA tiled copy (bGS_)
+    ThrCopy thrblk_g2s = params.tma_load_c.get_slice(Int<0>{});
+    Tensor bGS_gC = thrblk_g2s.partition_S(gC_epi);                                    // (G2S,G2S_M,G2S_N,EPI_M,EPI_N)
+    Tensor bGS_sC = thrblk_g2s.partition_D(sC_epi);                                    // (G2S,G2S_M,G2S_N,PIPE_C)
+
+    // Get the fusion callbacks for the producer load warp
+    auto pld_args = cutlass::epilogue::fusion::detail::ProducerLoadArgs{
+                      problem_shape_mnkl,
+                      CtaTileMNK{},
+                      tile_coord_mnkl,
+                      tiled_mma,
+                      EpilogueTile{},
+                      thread_idx
+                    };
+    auto pld_callbacks = fusion_callbacks.get_producer_load_callbacks(pld_args);
+    bool is_C_load_needed = is_source_supported && fusion_callbacks.is_C_load_needed();  // never true for a void-C epilogue
+
+    LoadPipelineState last_load_producer_state = load_pipe_producer_state;  // tracks the most recent stage that loaded C
+
+    // Predication for TMA load (one thread issues TMA load)
+    bool issue_tma_load = cute::elect_one_sync();
+
+    // Pre-loop fusion callback entry point
+    pld_callbacks.begin();
+
+    LoadPipelineState prior_state = load_pipe_producer_state;  // NOTE(review): not referenced again in this function
+
+    bool did_load = false;  // set once any stage has loaded C
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int epi_n = 0; epi_n < size<3>(gC_epi); ++epi_n) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int epi_m = 0; epi_m < size<2>(gC_epi); ++epi_m) {
+        if (subtile_idx != -1 && (epi_n * static_cast<int>(size<2>(gC_epi)) + epi_m) != subtile_idx) {  // skip all but the requested subtile
+          continue;
+        }
+
+        // Acquire the lock for this stage
+        constexpr uint16_t mcast_mask = 0;
+        uint64_t* tma_barrier = load_pipeline.producer_get_barrier(load_pipe_producer_state);
+
+        load_pipeline.producer_acquire(load_pipe_producer_state);
+
+        // Loop fusion callback entry point
+        pld_callbacks.step(tma_barrier, epi_m, epi_n, load_pipe_producer_state.count(), issue_tma_load);
+
+        // Execute the TMA load for C if needed
+        if (is_C_load_needed) {
+          if (issue_tma_load) {  // only the elected thread issues the copy
+            copy(params.tma_load_c.with(load_tensormap, *tma_barrier, mcast_mask),
+                bGS_gC(_,_,_,epi_m,epi_n), bGS_sC(_,_,_,load_pipe_producer_state.index()));
+            load_pipeline.producer_expect_transaction(load_pipe_producer_state);
+          }
+          last_load_producer_state = load_pipe_producer_state;
+          did_load = true;
+        }
+
+        // Commit TMA loads for this stage and release the lock
+        load_pipeline.producer_commit(load_pipe_producer_state);
+        ++load_pipe_producer_state;
+      }
+    }
+
+    // Post-loop fusion callback entry point
+    pld_callbacks.end();
+
+    if (wait_until_load_finishes && did_load) {  // optional wait, skipped when no C load was issued
+      typename CollectiveEpilogue::LoadPipelineState epi_load_pipe_tma_consumer_state =
+        {last_load_producer_state.index(), !last_load_producer_state.phase(), last_load_producer_state.count()};  // same index and count as the last loading stage, with the phase bit inverted
+      load_pipeline.consumer_wait(epi_load_pipe_tma_consumer_state);
+    }
+
+    return load_pipe_producer_state;  // state after all processed subtiles
+  }
+
+  CUTLASS_DEVICE auto
+  load_tail(
+      LoadPipeline load_pipeline,
+      LoadPipelineState load_pipe_producer_state) {
+    // Issues the load pipeline's producer_tail() when the fusion needs a producer load.
+    // The pipeline state is handed back to the caller unmodified on every path.
+    if (fusion_callbacks.is_producer_load_needed()) {
+      // The election only runs when there is a tail to issue; the thread it selects
+      // is the only one that makes the call.
+      if (cute::elect_one_sync()) {
+        load_pipeline.producer_tail(load_pipe_producer_state);
+      }
+    }
+
+    return load_pipe_producer_state;
+  }
+
+  template<
+    class ProblemShapeMNKL,
+    class TileShapeMNK,
+    class TileCoordMNKL,
+    class AccEngine, class AccLayout,
+    class TiledMma,
+    class TensorMapD
+  >
+  CUTLASS_DEVICE auto
+  store(
+      LoadPipeline load_pipeline,
+      LoadPipelineState load_pipe_consumer_state,
+      StorePipeline store_pipeline,
+      StorePipelineState store_pipe_producer_state,
+      ProblemShapeMNKL problem_shape_mnkl,
+      TileShapeMNK tile_shape_MNK,
+      TileCoordMNKL tile_coord_mnkl,
+      cute::Tensor<AccEngine,AccLayout> accumulators,
+      TiledMma tiled_mma,
+      int thread_idx,
+      TensorStorage& shared_tensors,
+      TensorMapD const& store_tensormap,
+      int subtile_idx=-1) {  // -1: process every epilogue subtile; otherwise only the subtile with this linear index
+    // Epilogue store for one CTA tile: per subtile, optionally pull C from smem, run the fusion visitor, stage D in smem, then TMA-store it to gmem.
+    using namespace cute;
+    using ElementAccumulator = typename AccEngine::value_type;
+    using ElementCompute_ = typename epilogue::fusion::FusionCallbacksTraits<FusionCallbacks>::ElementCompute;
+    using ElementCompute = cute::conditional_t<cute::is_void_v<ElementCompute_>,ElementAccumulator,ElementCompute_>;
+
+    static_assert(is_rmem<AccEngine>::value, "Accumulator must be RF resident.");
+    static_assert(rank(AccLayout{}) == 3, "Accumulator must be MMA-partitioned: (MMA,MMA_M,MMA_N)");
+    static_assert(rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4");
+    static_assert(is_static<TileShapeMNK>::value, "TileShapeMNK must be static");
+    static_assert(rank(TileShapeMNK{}) == 3, "TileShapeMNK must be rank 3");
+    static_assert(rank(TileCoordMNKL{}) == 4, "TileCoordMNKL must be rank 4");
+
+    // Indexing variables
+    auto [M, N, K, L] = problem_shape_mnkl;
+    auto [m_coord, n_coord, k_coord, l_coord] = tile_coord_mnkl;
+
+    // Compile-time guard: this store path has no handling for an im2col D layout
+    static_assert(!is_im2col_D, "Do not support im2col");
+
+    auto coord_shape = append<3>(make_shape(m_coord, n_coord), Int<0>{});
+
+    // Represent the full output tensor, slice to get the tile this CTA is responsible for
+    Tensor mD_mn = params.tma_store_d.get_tma_tensor(append<3>(make_shape(M,N), Int<1>{}));            //       (M,N,L)
+
+    Tensor mD = coalesce(mD_mn, take<0,2>(CtaTileMNK{}));
+    Tensor gD = local_tile(mD, take<0,2>(CtaTileMNK{}), coord_shape);                                  // (CTA_M,CTA_N)
+
+    // Apply epilogue subtiling
+    Tensor gD_epi = flat_divide(gD, EpilogueTile{});                             // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N)
+
+    // Construct the corresponding pipelined smem tensors
+    auto ptr_sC = shared_tensors.collective.smem_C.begin();
+    auto ptr_sD = shared_tensors.collective.smem_D.begin();
+    Tensor sC_epi = cute::as_position_independent_swizzle_tensor(
+                      make_tensor(make_smem_ptr(ptr_sC), SmemLayoutC{}));             // (EPI_TILE_M,EPI_TILE_N,PIPE_C)
+    Tensor sD_epi = cute::as_position_independent_swizzle_tensor(
+                      make_tensor(make_smem_ptr(ptr_sD), SmemLayoutD{}));             // (EPI_TILE_M,EPI_TILE_N,PIPE_D)
+
+    TiledCopy tiled_copy_C_atom = make_tiled_copy_C_atom(CopyAtomC{}, tiled_mma);
+
+    // (t)hread-partition for (r)egister to (r)egister copy (tRR_)
+    TiledCopy tiled_r2r = [&]() {
+      if constexpr (IsUseR2R) {
+        return make_tiled_copy_S(Copy_Atom<CopyOpR2R, ElementCompute>{}, tiled_copy_C_atom);
+      }
+      else {
+        return make_tiled_copy_S(Copy_Atom<AutoVectorizingCopyWithAssumedAlignment<128>,
+          ElementCompute>{}, tiled_copy_C_atom);
+      }
+    }();
+    ThrCopy thread_r2r = tiled_r2r.get_slice(thread_idx);
+
+    // (t)hread-partition for (r)egister to (s)mem copy (tRS_)
+    TiledCopy tiled_r2s = [&]() {
+      if constexpr (IsUseR2R) {
+        return make_tiled_copy_D(Copy_Atom<CopyOpR2S,SmemElementD>{}, tiled_r2r);
+      }
+      else {
+        return make_tiled_copy_S(Copy_Atom<CopyOpR2S,SmemElementD>{}, tiled_copy_C_atom);
+      }
+    }();
+    ThrCopy thread_r2s = tiled_r2s.get_slice(thread_idx);
+    Tensor tRS_rAcc = thread_r2s.retile_S(accumulators);                                   // ((R2S,R2S_V),MMA_M,MMA_N)
+    Tensor tRS_sD   = thread_r2s.partition_D(sD_epi);                                       // (R2S,R2S_M,R2S_N,PIPE_D)
+
+    auto mma_tile_m = size<0>(TileShapeMNK{}) / size<1>(tRS_rAcc);
+    auto mma_tile_n = size<1>(TileShapeMNK{}) / size<2>(tRS_rAcc);
+    auto epi_tile_m = size<0>(EpilogueTile{});
+    auto epi_tile_n = size<1>(EpilogueTile{});
+
+    // Allocate D registers
+    Layout tRS_rD_layout = make_layout(take<0,3>(shape(thread_r2s.partition_S(sD_epi))));
+    Tensor tRS_rD = make_tensor<SmemElementD>(tRS_rD_layout);                                          // (R2S,R2S_M,R2S_N)
+
+    // Vectorized fragment view
+    constexpr int FragmentSize = DispatchPolicy::FragmentSize;
+    Tensor tRS_rAcc_frg = recast<Array<ElementAccumulator, FragmentSize>>(tRS_rAcc);
+    Tensor tRS_rD_frg   = recast<Array<SmemElementD      , FragmentSize>>(tRS_rD);
+    CUTE_STATIC_ASSERT(size<0>(tRS_rAcc) % FragmentSize == 0, "Fragment size does not vectorize properly");
+
+    // (t)hread-partition for (s)mem to (r)egister copy (tSR_)
+    TiledCopy tiled_s2r = make_tiled_copy_S(Copy_Atom<CopyOpS2R, SmemElementC>{}, tiled_copy_C_atom);
+    ThrCopy thread_s2r = tiled_s2r.get_slice(thread_idx);
+    Tensor tSR_sC        = thread_s2r.partition_S(sC_epi);                                  // (S2R,S2R_M,S2R_N,PIPE_C)
+    Layout tSR_rC_layout = thread_s2r.retile_D(tRS_rD).layout();                            // (S2R,S2R_M,S2R_N)
+
+    // Allocate C registers
+    // If C smem load is a non-vectorized dst(i) = src(i) then we can allocate C registers directly in the compute type
+    // to eliminate some redundant pack+unpack instruction sequences for sub-word types
+    constexpr bool IsDirectS2R = cute::is_same_v<CopyOpS2R, AutoVectorizingCopyWithAssumedAlignment<128>>
+                                && decltype(max_common_vector(tSR_rC_layout, tSR_sC.layout()))::value <= 1;
+    using RegisterElementC = cute::conditional_t<IsDirectS2R, ElementCompute, SmemElementC>;
+    Tensor tRS_rC = make_tensor<RegisterElementC>(tRS_rD_layout);                                  // (R2S,R2S_M,R2S_N)
+    Tensor tSR_rC = thread_s2r.retile_D(tRS_rC);                                                   // (S2R,S2R_M,S2R_N)
+
+    // thread(b)lock-partition for (s)mem to (g)mem copy (bSG_)
+    ThrCopy thrblk_s2g = params.tma_store_d.get_slice(Int<0>{});
+    Tensor bSG_sD = thrblk_s2g.partition_S(sD_epi);                                    // (S2G,S2G_M,S2G_N,PIPE_D)
+    Tensor bSG_gD = thrblk_s2g.partition_D(gD_epi);                                    // (S2G,S2G_M,S2G_N,EPI_M,EPI_N)
+
+    // OOB predication for tile quantization "residue"
+    // Absolute coordinate tensors (dynamic)
+    Tensor mD_crd = make_identity_tensor(make_shape(M,N));                                                     // (M,N)
+    Tensor cD_mn = local_tile(mD_crd, take<0,2>(CtaTileMNK{}), make_coord(m_coord, n_coord));          // (CTA_M,CTA_N)
+    Tensor tRS_cD_mn = thread_r2s.partition_S(flat_divide(cD_mn, EpilogueTile{}));     // (R2S,R2S_M,R2S_N,EPI_M,EPI_N)
+    // Relative coordinate tensors (static)
+    Tensor cD = make_counting_tensor(cD_mn.layout());                                                  // (CTA_M,CTA_N)
+    Tensor tRS_cD = make_counting_tensor(tRS_cD_mn.layout());                          // (R2S,R2S_M,R2S_N,EPI_M,EPI_N)
+    // Subtract the global "bottom right" corner from the local "top left" corner to get the max relative coordinate
+    auto residue_cD = make_coord(M,N) - cD_mn(_0{});                                                           // (m,n)
+    auto residue_tRS_cD = make_coord(M,N) - tRS_cD_mn(_0{});                                                   // (m,n)
+
+    CUTE_STATIC_ASSERT(epi_tile_m % mma_tile_m == 0, "MMA_TILE_M must divide EPI_TILE_M");
+
+    CUTE_STATIC_ASSERT(mma_tile_n % epi_tile_n == 0, "EPI_TILE_N must divide MMA_TILE_N");
+    // Get TiledCopy for partition reference when consumer store.
+    TiledCopy tiled_copy_partition_ref = make_tiled_copy_S(Copy_Atom<CopyOpR2S,SmemElementD>{}, tiled_copy_C_atom);
+    // Get the fusion callbacks for the consumer store warps
+    constexpr bool RefSrc = true; // Register tensors reference R2S copy src layout
+    auto cst_args = cutlass::epilogue::fusion::detail::ConsumerStoreArgs{
+                      problem_shape_mnkl,
+                      CtaTileMNK{},
+                      tile_coord_mnkl,
+                      tiled_mma,
+                      EpilogueTile{},
+                      tiled_copy_partition_ref,
+                      cD,
+                      residue_cD,
+                      tRS_cD,
+                      residue_tRS_cD,
+                      tRS_rC,
+                      thread_idx
+                    };
+    auto cst_callbacks = fusion_callbacks.get_consumer_store_callbacks<RefSrc>(cst_args);
+    bool is_producer_load_needed = fusion_callbacks.is_producer_load_needed();
+    bool is_C_load_needed = is_source_supported && fusion_callbacks.is_C_load_needed();
+
+    // Thread synchronizer for previously issued waits or fences
+    // to ensure visibility of smem reads/writes to threads or TMA unit
+    auto synchronize = [&] () { cutlass::arch::NamedBarrier::sync(size(TiledMma{}), cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); };
+
+    // Predication for TMA store (one warp issues TMA store)
+    bool issue_tma_store = (thread_idx / NumThreadsPerWarp) == 0;
+
+    // In the reuse smem configuration we have StagesC smem buffers and at most StagesD committed TMA stores in flight.
+    // The TMA store pipeline producer acquire returns when at most StagesD-1 committed stores are in-flight, so we can
+    // only guarantee store completion after StagesD iterations, then we can begin issuing releases on the smem buffer locks.
+    // store_pipe_producer_state tracks the acquire and load_pipe_consumer_state tracks the release, in circular buffer fashion.
+    LoadPipelineState load_wait_state = load_pipe_consumer_state;
+    if constexpr (ReuseSmemC) {
+      load_wait_state = store_pipe_producer_state;
+      load_wait_state.phase_ ^= 1;
+    }
+
+    // We can delay issue of TMA store by one iteration to achieve better interleaving of non-TMA instructions
+    // Sync requirements of smem reuse may preclude this optimization
+    // Delayed stores cause delayed stage releases which causes deadlock when StagesC == StagesD
+    int epi_m_prev = 0, epi_n_prev = 0;
+    static_assert(not (DelayTmaStore and ReuseSmemC and StagesC <= StagesD), "This TMA epilogue configuration will deadlock");
+
+    // The TMA store sequence for one subtile iteration
+    auto tma_store_fn = [&] (int epi_m, int epi_n) {
+      // Write the tile from smem to gmem with TMA
+      cutlass::arch::fence_view_async_shared(); // ensure smem writes are visible to TMA
+      synchronize(); // ensure all threads have issued their async fence
+      if constexpr (is_destination_supported) {
+        if (issue_tma_store) {
+          copy(params.tma_store_d.with(store_tensormap), bSG_sD(_,_,_,store_pipe_producer_state.index()), bSG_gD(_,_,_,epi_m,epi_n));
+        }
+      }
+
+      // Post async fence, pre TMA commit callback entry point
+      cst_callbacks.tma_store(epi_m, epi_n, store_pipe_producer_state.count(), issue_tma_store);
+
+      // Commit the TMA stores for this stage
+      if (issue_tma_store) {
+        store_pipeline.producer_commit(store_pipe_producer_state);
+      }
+      ++store_pipe_producer_state;
+      ++issued_stores;  // member counter (zeroed again in store_tail) compared against UnacquiredStages below
+
+      // Wait for the next smem buffer to be available
+      if (issue_tma_store) {
+        store_pipeline.producer_acquire(store_pipe_producer_state);
+      }
+      synchronize();
+
+      if constexpr (ReuseSmemC) {
+        // producer_acquire returns when at most StagesD-1 committed stores are pending
+        bool store_finished = issued_stores > StorePipeline::UnacquiredStages;
+        // Let dma warp know earliest smem buffer is consumed and empty after StagesD producer commits
+        if (store_finished) {
+          if (is_producer_load_needed) {
+            load_pipeline.consumer_release(load_pipe_consumer_state);
+          }
+          ++load_pipe_consumer_state;
+        }
+      }
+    };
+
+    //
+    // BEGIN EPILOGUE
+    //
+
+    // Pre-loop fusion callback entry point
+    cst_callbacks.begin();
+
+    // For each output tile
+    CUTLASS_PRAGMA_UNROLL
+    for (int epi_n = 0; epi_n < size<3>(gD_epi); ++epi_n) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int epi_m = 0; epi_m < size<2>(gD_epi); ++epi_m) {
+        bool is_first_iteration = epi_m == 0 && epi_n == 0;
+        bool is_last_iteration = epi_m == size<2>(gD_epi)-1 && epi_n == size<3>(gD_epi)-1;
+
+        if (subtile_idx != -1 && (epi_n * static_cast<int>(size<2>(gD_epi)) + epi_m) != subtile_idx) {
+          continue;
+        }
+
+        cst_callbacks.begin_loop(epi_m, epi_n);
+
+        if (is_producer_load_needed) {
+          // Wait for the producer load to fill smem
+          load_pipeline.consumer_wait(load_wait_state);
+
+          if (is_C_load_needed) {
+            // Copy source tile from smem to register
+            copy(tiled_s2r, tSR_sC(_,_,_,load_wait_state.index()), tSR_rC);
+          }
+        }
+
+        // First loop fusion callback entry point
+        cst_callbacks.previsit(epi_m, epi_n, load_wait_state.count(), is_producer_load_needed);
+
+        if (is_producer_load_needed) {
+          if constexpr (not ReuseSmemC) {
+            // Let producer load warp know smem buffers are consumed and empty
+            cutlass::arch::fence_view_async_shared();
+            load_pipeline.consumer_release(load_pipe_consumer_state);
+            ++load_pipe_consumer_state;
+          }
+          ++load_wait_state;
+        }
+
+        int mma_m = epi_m;
+        int mma_n = (epi_n * size<1>(EpilogueTile{})) / mma_tile_n;
+        Tensor tRS_rAcc_frg_mn = tRS_rAcc_frg(_,mma_m,mma_n);
+
+        // Vectorized fragment loop with visitor callback entry point
+        int epi_n_in_mma = epi_n % (mma_tile_n / epi_tile_n);
+        int r2s_v = epi_n_in_mma * size(tRS_rD_frg);
+        CUTLASS_PRAGMA_UNROLL
+        for (int epi_v = 0; epi_v < size(tRS_rD_frg); ++epi_v) {
+          tRS_rD_frg(epi_v) = cst_callbacks.visit(tRS_rAcc_frg_mn(r2s_v + epi_v), epi_v, epi_m, epi_n);
+        }
+        // The latest we can delay the TMA store is right before the smem store of the next iteration
+        // since the current TMA store needs to be committed before we can acquire the next smem buffer
+        if constexpr (DelayTmaStore) {
+          // Issue TMA stores for the previous subtile
+          if (not is_first_iteration and subtile_idx == -1) {
+            tma_store_fn(epi_m_prev, epi_n_prev);
+          }
+          epi_m_prev = epi_m;
+          epi_n_prev = epi_n;
+        }
+
+        // Smem reduction callback entry point using current store buffer for workspace
+        cst_callbacks.reduce(sD_epi(_,_,store_pipe_producer_state.index()),
+                              synchronize, epi_m, epi_n, is_last_iteration, tRS_rD_frg);
+
+        // Copy tile from register to register if needed
+        if constexpr (IsUseR2R) {
+          // retile source and destination for tiled_r2r
+          Tensor tRR_rD_src = thread_r2r.retile_S(tRS_rD);                             // (R2R,R2R_M,R2R_N,EPI_M,EPI_N)
+          Tensor tRR_rD_dst = thread_r2r.retile_D(tRS_rD);                             // (R2R,R2R_M,R2R_N,EPI_M,EPI_N)
+
+          // Output needs register shuffling before copying to shared memory.
+          copy(tiled_r2r, tRR_rD_src, tRR_rD_dst);
+        }
+
+        // Copy tile from register to smem
+        if constexpr (is_destination_supported) {
+          copy(tiled_r2s, tRS_rD, tRS_sD(_,_,_,store_pipe_producer_state.index()));
+        }
+
+        // Post reduction, pre TMA store callback entry point
+        constexpr bool issue_smem_store = true; // No smem store predication
+        cst_callbacks.postreduce(epi_m, epi_n, store_pipe_producer_state.count(), issue_smem_store);
+
+        if constexpr (not DelayTmaStore) {
+          // Issue TMA stores for this subtile
+          tma_store_fn(epi_m, epi_n);
+        }
+
+        cst_callbacks.end_loop(epi_m, epi_n);
+
+      } // for epi_m
+    } // for epi_n
+
+
+    if constexpr (DelayTmaStore) {
+      // Issue TMA stores for the last subtile
+      tma_store_fn(epi_m_prev, epi_n_prev);
+    }
+
+    // Post-loop fusion callback entry point
+    cst_callbacks.end();
+    // Hand the pipeline states, as advanced by the loop above, back to the caller
+    return cute::make_tuple(load_pipe_consumer_state, store_pipe_producer_state);
+  }
+
+  CUTLASS_DEVICE auto
+  store_tail(
+      LoadPipeline load_pipeline,
+      LoadPipelineState load_pipe_consumer_state,
+      StorePipeline store_pipeline,
+      StorePipelineState store_pipe_producer_state) {
+    // Wait for all TMA stores to complete
+    store_pipeline.producer_tail(store_pipe_producer_state);
+    // Reset the member store counter that store() compares against StorePipeline::UnacquiredStages
+    issued_stores = 0;
+    // With smem reuse, store() holds back load-pipeline releases until enough stores were issued; issue the remaining ones here
+    if constexpr (ReuseSmemC) {
+      if (fusion_callbacks.is_producer_load_needed()) {
+        // Issue releases on up to StagesD-1 previously issued TMA stores
+        constexpr int release_stages = cute::min(StorePipeline::UnacquiredStages, get_load_pipe_increment(CtaTileMNK{}));
+        CUTLASS_PRAGMA_UNROLL
+        for (int stage = 0; stage < release_stages; ++stage) {
+          load_pipeline.consumer_release(load_pipe_consumer_state);
+          ++load_pipe_consumer_state;
+        }
+      }
+    }
+    // Hand both pipeline states back to the caller as a tuple
+    return cute::make_tuple(load_pipe_consumer_state, store_pipe_producer_state);
+  }
+
+  CUTLASS_DEVICE auto
+  store_init(
+      Params const& params,
+      TensorMapStorage& shared_tensormaps,
+      int32_t sm_count,
+      int32_t sm_idx,
+      int32_t warp_group_idx) {
+    // Only warp 0 of each warp group sets up the store (D) tensormap: TMA stores are issued
+    // by a single warp, so every other warp simply receives a null descriptor.
+    bool const is_first_warp_in_group = (canonical_warp_idx_sync() % NumWarpsPerWarpGroup) == 0;
+    if (not is_first_warp_in_group) {
+      TmaDescriptor* no_tensormap = nullptr;
+      return cute::make_tuple(no_tensormap);
+    }
+
+    // Shared tensormap setup routine, selected for the store side (IsLoad == false)
+    return tensormaps_init</*IsLoad=*/false>(params, shared_tensormaps, sm_count, sm_idx, warp_group_idx);
+  }
+
+  //
+  // Methods to perform different parts of TMA/Tensormap modifications
+  //
+
+  template <bool IsLoad>
+  CUTLASS_DEVICE auto
+  tensormaps_init(
+      Params const& params,
+      TensorMapStorage& shared_tensormaps,
+      int32_t sm_count,
+      int32_t sm_idx,
+      int32_t warp_group_idx) {
+    // Seeds the smem tensormap (C when IsLoad, else this warp group's D) from params; returns a 1-tuple with this SM's gmem descriptor slot
+    constexpr uint32_t NumInputTensors = NumEpilogueWarpGroups + (cute::is_void_v<ElementC> ? 0 : 1);
+    Layout desc_layout = make_layout(make_shape(sm_count, Int<NumInputTensors>{}));
+    // Per-SM slots: one D descriptor per epilogue warp group, followed by one C descriptor when ElementC is not void
+    Tensor gmem_tensormap = make_tensor(params.tensormaps, desc_layout);                      // (SMs, NumInputTensors)
+
+    if constexpr (IsLoad) {
+      if (not cute::is_void_v<ElementC>) {
+        constexpr int C_tensormap_index = NumEpilogueWarpGroups;
+        Tensor pC_tensormap = make_tensor(params.tma_load_c.get_tma_descriptor(), Int<1>{}, Int<1>{});
+        Tensor sC_tensormap = make_tensor(make_smem_ptr(&shared_tensormaps.smem_tensormap_C), Int<1>{}, Int<1>{});
+
+        if (cute::elect_one_sync()) {
+          // Bringing tensormaps from params to smem for modification later
+          copy(recast<uint128_t>(pC_tensormap), recast<uint128_t>(sC_tensormap));
+        }
+        __syncwarp();
+        return cute::make_tuple(&gmem_tensormap(sm_idx, C_tensormap_index));
+
+      }
+      TmaDescriptor* null_tma_desc = nullptr;  // ElementC is void: no C slot exists, so hand back a null descriptor
+      return cute::make_tuple(null_tma_desc);
+    }
+    else {
+      Tensor pD_tensormap = make_tensor(params.tma_store_d.get_tma_descriptor(), Int<1>{}, Int<1>{});
+      Tensor sD_tensormap = make_tensor(make_smem_ptr(&shared_tensormaps.smem_tensormap_D[warp_group_idx]), Int<1>{}, Int<1>{});
+
+      if (cute::elect_one_sync()) {
+        // Bringing tensormaps from params to smem for modification later
+        copy(recast<uint128_t>(pD_tensormap), recast<uint128_t>(sD_tensormap));
+      }
+      __syncwarp();
+      return cute::make_tuple(&gmem_tensormap(sm_idx, warp_group_idx));
+    }
+  }
+
+  // Point the smem copy of the C (IsLoad) or D (!IsLoad) tensormap at the next batch's global tensor.
+  template <bool IsLoad>
+  CUTLASS_DEVICE
+  void
+  tensormaps_replace_global_address(
+      TensorMapStorage& shared_tensormaps,
+      Params const& params,
+      int32_t next_batch,
+      int32_t warp_group_idx) {
+    // To be done by a single thread; compiles to nothing when the selected operand (C or D) is unsupported.
+    if constexpr (IsLoad and is_source_supported) {
+      // Load side: the single C descriptor is given the address ptr_C[next_batch]
+      cute::tma_descriptor_replace_addr_in_shared_mem(
+          shared_tensormaps.smem_tensormap_C, params.ptr_C[next_batch]);
+    }
+    else if constexpr ((not IsLoad) and is_destination_supported) {
+      // Store side: this warp group's own D descriptor is given the address ptr_D[next_batch]
+      cute::tma_descriptor_replace_addr_in_shared_mem(
+          shared_tensormaps.smem_tensormap_D[warp_group_idx], params.ptr_D[next_batch]);
+    }
+  }
+
+  // Replace dim and strides for the global tensor - used only for Grouped GEMM (to be done by single thread)
+  template <bool IsLoad, class ProblemShape_MNKL>
+  CUTLASS_DEVICE
+  void
+  tensormaps_replace_global_tensor_properties(
+      TensorMapStorage& shared_tensormaps,
+      Params const& params,
+      int32_t next_group,
+      ProblemShape_MNKL problem_shape_mnkl,
+      int32_t warp_group_idx) {
+    const uint32_t M = get<0>(problem_shape_mnkl);
+    const uint32_t N = get<1>(problem_shape_mnkl);
+    // Replace all dims for consistency
+    constexpr int MaxTensorRank = 5;
+    cute::array<uint32_t, MaxTensorRank> prob_shape  = {1,1,1,1,1};
+    cute::array<uint64_t, MaxTensorRank> prob_stride = {0,0,0,0,0};
+    // Both arrays start at extent 1 / stride 0 in every slot and are then passed to fill_tma_gmem_shape_stride below
+    if constexpr (IsLoad) {
+      if constexpr (is_source_supported) {
+        ElementC const* ptr_C = nullptr;
+        Tensor tensor_c = make_tensor(ptr_C, make_layout(make_shape(M,N,Int<1>{}), params.dC[next_group]));
+        // tensor_c is built on a null pointer, so it carries only the (M,N,1) shape and the dC[next_group] strides
+        cute::detail::fill_tma_gmem_shape_stride(params.tma_load_c, tensor_c, 
+                                                 prob_shape, prob_stride);
+        // Convert strides to byte strides (element stride * bits per element / 8)
+        for (uint64_t& stride : prob_stride) {
+          stride = (stride * sizeof_bits_v<ElementC>) / 8;
+        }
+        cute::tma_descriptor_replace_dims_strides_in_shared_mem(shared_tensormaps.smem_tensormap_C,
+                                                                prob_shape,
+                                                                prob_stride);
+      }
+    }
+    else if constexpr (is_destination_supported) {
+      ElementD const* ptr_D = nullptr;
+      Tensor tensor_d = make_tensor(ptr_D, make_layout(make_shape(M,N,Int<1>{}), params.dD[next_group]));
+      // tensor_d is built on a null pointer, so it carries only the (M,N,1) shape and the dD[next_group] strides
+      cute::detail::fill_tma_gmem_shape_stride(params.tma_store_d, tensor_d, 
+                                               prob_shape, prob_stride);
+      // Convert strides to byte strides (element stride * bits per element / 8)
+      for (uint64_t& stride : prob_stride) {
+        stride = (stride * sizeof_bits_v<ElementD>) / 8;
+      }
+
+      cute::tma_descriptor_replace_dims_strides_in_shared_mem(shared_tensormaps.smem_tensormap_D[warp_group_idx],
+                                                              prob_shape,
+                                                              prob_stride);
+    }
+  }
+
+  template <bool IsLoad, class ProblemShape_MNKL>
+  CUTLASS_DEVICE
+  void
+  tensormaps_perform_update(
+      TensorMapStorage& shared_tensormaps,
+      Params const& params,
+      cute::TmaDescriptor const* tensormap,
+      ProblemShape_MNKL problem_shape_mnkl,
+      int32_t next_batch,
+      int32_t warp_group_idx) {
+    // One elected thread rewrites the smem tensormap for the next batch; `tensormap` is accepted but not read here.
+    bool const is_elected_thread = cute::elect_one_sync();
+    if (not is_elected_thread) {
+      return;
+    }
+    // Always retarget the global address for the next batch ...
+    tensormaps_replace_global_address<IsLoad>(shared_tensormaps, params, next_batch, warp_group_idx);
+    if constexpr (IsGroupedGemmKernel) {
+      // ... and, for grouped GEMM kernels only, the global dims and strides as well
+      tensormaps_replace_global_tensor_properties<IsLoad>(
+          shared_tensormaps, params, next_batch, problem_shape_mnkl, warp_group_idx);
+    }
+  }
+
+  template <bool IsLoad>
+  CUTLASS_DEVICE
+  void
+  tensormaps_cp_fence_release(
+      TensorMapStorage& shared_tensormaps,
+      cute::TmaDescriptor const* tensormap,
+      const int32_t warp_group_idx = 0) {
+    // Calls tma_descriptor_cp_fence_release on `tensormap` with the smem tensormap (C when IsLoad, else this warp group's D).
+    // The entire warp must execute this (it is warp-aligned); nothing is emitted for an unsupported operand.
+    if constexpr (IsLoad and is_source_supported) {
+      auto& smem_tensormap_c = shared_tensormaps.smem_tensormap_C;
+      tma_descriptor_cp_fence_release(tensormap, smem_tensormap_c);
+    }
+    else if constexpr ((not IsLoad) and is_destination_supported) {
+      auto& smem_tensormap_d = shared_tensormaps.smem_tensormap_D[warp_group_idx];
+      tma_descriptor_cp_fence_release(tensormap, smem_tensormap_d);
+    }
+  }
+
+  template <bool IsLoad>
+  CUTLASS_DEVICE
+  void
+  tensormaps_fence_acquire(cute::TmaDescriptor const* tensormap) {
+    // The store side always issues the acquire fence on `tensormap`. The load side
+    // issues it only when a C operand exists (ElementC is not void); with a void
+    // ElementC, tensormaps_init<true> hands out a null descriptor instead.
+    constexpr bool SkipFence = IsLoad and cute::is_void_v<ElementC>;
+
+    if constexpr (not SkipFence) {
+      cute::tma_descriptor_fence_acquire(tensormap);
+    }
+  }
+
+private:
+  Params const& params;
+  FusionCallbacks fusion_callbacks;
+  int issued_stores = 0;
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace collective
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp
new file mode 100755
index 000000000..b96c4aea0
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp
@@ -0,0 +1,904 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Functor performing elementwise operations used by epilogues.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/arch/barrier.h"
+#include "cutlass/epilogue/dispatch_policy.hpp"
+#include "cutlass/epilogue/collective/detail.hpp"
+#include "cutlass/epilogue/thread/scale_type.h"
+#include "cutlass/epilogue/fusion/callbacks.hpp"
+#include "cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp"
+#include "cutlass/detail/collective.hpp"
+#include "cutlass/detail/layout.hpp"
+#include "cutlass/trace.h"
+
+#include "cute/tensor.hpp"
+#include "cutlass/cuda_host_adapter.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace collective {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Partial specialization of CollectiveEpilogue for the Sm90TmaWarpSpecialized dispatch policy.
+// The source tensor C is staged through shared memory by a TMA load pipeline and the output
+// tensor D is written from shared memory by TMA stores.
+//   StagesC_ / StagesD_ : number of shared-memory pipeline stages for C and D
+//   FragmentSize_       : element count of the vectorized register fragments
+//   ReuseSmemC_         : let C and D share one shared-memory buffer
+//   DelayTmaStore_      : issue each TMA store one subtile iteration late
+template <
+  int StagesC_,
+  int StagesD_,
+  int FragmentSize_,
+  bool ReuseSmemC_,
+  bool DelayTmaStore_,
+  class CtaTileMNK_,   //     (CTA_M,CTA_N,CTA_K)
+  class EpilogueTile_, // (EPI_TILE_M,EPI_TILE_N)
+  class ElementC_,
+  class StrideC_,
+  class ElementD_,
+  class StrideD_,
+  class FusionCallbacks_,
+  class CopyOpG2S_,
+  class SmemLayoutAtomC_,
+  class CopyOpS2R_,
+  class CopyOpS2G_,
+  class SmemLayoutAtomD_,
+  class CopyOpR2S_,
+  class CopyAtomC_,
+  class CopyOpR2R_
+>
+class CollectiveEpilogue<
+    Sm90TmaWarpSpecialized<StagesC_,StagesD_,FragmentSize_,ReuseSmemC_,DelayTmaStore_>,
+    CtaTileMNK_,
+    EpilogueTile_,
+    ElementC_,
+    StrideC_,
+    ElementD_,
+    StrideD_,
+    FusionCallbacks_,
+    CopyOpG2S_,
+    SmemLayoutAtomC_,
+    CopyOpS2R_,
+    CopyOpS2G_,
+    SmemLayoutAtomD_,
+    CopyOpR2S_,
+    CopyAtomC_,
+    CopyOpR2R_
+> {
+public:
+  //
+  // Type Aliases
+  //
+  // Re-export the template parameters under their public, underscore-free names.
+  using DispatchPolicy = Sm90TmaWarpSpecialized<StagesC_,StagesD_,FragmentSize_,ReuseSmemC_,DelayTmaStore_>;
+  using CtaTileMNK = CtaTileMNK_;
+  using EpilogueTile = EpilogueTile_;
+  using FusionCallbacks = FusionCallbacks_;
+  using ElementC = ElementC_;
+  using StrideC = StrideC_;
+  using ElementD = ElementD_;
+  using StrideD = StrideD_;
+  using CopyOpG2S = CopyOpG2S_;
+  using SmemLayoutAtomC = SmemLayoutAtomC_;
+  using CopyOpS2R = CopyOpS2R_;
+  using CopyOpS2G = CopyOpS2G_;
+  using SmemLayoutAtomD = SmemLayoutAtomD_;
+  using CopyOpR2S = CopyOpR2S_;
+  using CopyAtomC = CopyAtomC_;
+  using CopyOpR2R = CopyOpR2R_;
+
+  // Operation type reported by the fusion callbacks traits, plus the gmem copy ops for C and D
+  // exposed under alternative names.
+  using ThreadEpilogueOp = typename epilogue::fusion::FusionCallbacksTraits<FusionCallbacks>::Operation;
+  using GmemTiledCopyC = CopyOpG2S;
+  using GmemTiledCopyD = CopyOpS2G;
+
+  // Compile-time validation of tile and stride ranks, and of epilogue-tile divisibility.
+  static_assert(!is_layout<EpilogueTile>::value && is_tuple<EpilogueTile>::value, "EpilogueTile must be a cute::Tile or cute::Shape");
+  static_assert(cute::rank(CtaTileMNK{}) == 3, "CtaTileMNK must be rank-3: [CTA_M, CTA_N, CTA_K]");
+  static_assert(cute::rank(EpilogueTile{}) == 2, "EpilogueTile must be rank-2: [EPI_TILE_M, EPI_TILE_N]");
+  static_assert(size<0>(CtaTileMNK{}) % size<0>(shape(EpilogueTile{})) == 0, "EPI_TILE_M must divide CTA_M");
+  static_assert(size<1>(CtaTileMNK{}) % size<1>(shape(EpilogueTile{})) == 0, "EPI_TILE_N must divide CTA_N");
+  static_assert(cute::rank(StrideC{}) == 3, "StrideC must be rank-3: [M, N, L]");
+  static_assert(cute::rank(StrideD{}) == 3, "StrideD must be rank-3: [M, N, L]");
+
+private:
+  // A void ElementC / ElementD means the epilogue has no source / no destination tensor.
+  constexpr static bool is_source_supported = not cute::is_void_v<ElementC>;
+  constexpr static bool is_destination_supported = not cute::is_void_v<ElementD>;
+  // Stand-in element types so that smem layouts and TMA types stay well-formed for void C/D:
+  // D falls back to the fusion's aux element type, C falls back to the (non-void) D type.
+  using NonVoidElementD = cute::conditional_t<not is_destination_supported,fusion::get_element_aux_t<FusionCallbacks>, ElementD>;
+  static_assert(not cute::is_void_v<NonVoidElementD>, "SmemElementD is void");
+  using NonVoidElementC = cute::conditional_t<not is_source_supported,NonVoidElementD,ElementC>; // prevents void ref breakages
+
+  // Element types used for the shared-memory buffers.
+  using SmemElementC = typename cutlass::detail::get_unpacked_element_type<NonVoidElementC>::type;
+  using SmemElementD = typename cutlass::detail::get_unpacked_element_type<NonVoidElementD>::type;
+
+  constexpr static int StagesC = StagesC_;
+  constexpr static int StagesD = StagesD_;
+  // Smem reuse is only honoured when there is a destination tensor.
+  constexpr static bool ReuseSmemC = ReuseSmemC_ and is_destination_supported;
+  constexpr static bool DelayTmaStore = DelayTmaStore_;
+
+  constexpr static bool is_m_major_C = detail::is_m_major<StrideC>();
+  constexpr static bool is_m_major_D = detail::is_m_major<StrideD>();
+
+  // True when the C load / D store uses the im2col flavour of the TMA copy op.
+  constexpr static bool is_im2col_C = cute::is_same_v<CopyOpG2S, SM90_TMA_LOAD_IM2COL>;
+  constexpr static bool is_im2col_D = cute::is_same_v<CopyOpS2G, SM90_TMA_STORE_IM2COL>;
+
+  // Check if register transformation is needed before copying register to shared memory.
+  constexpr static bool IsUseR2R = !cute::is_void_v<CopyOpR2R>;
+
+  // Staged smem layouts: the layout atom tiled over (EPI_TILE_M, EPI_TILE_N, stages).
+  // D uses StagesC stages when it shares the C buffer, StagesD otherwise.
+  using SmemLayoutC = decltype(tile_to_shape(
+      SmemLayoutAtomC{},
+      make_shape(size<0>(EpilogueTile{}), size<1>(EpilogueTile{}), Int<StagesC>{}),
+      cute::conditional_t<is_m_major_C, Step<_2,_1,_3>, Step<_1,_2,_3>>{} ));
+  using SmemLayoutD = decltype(tile_to_shape(
+      SmemLayoutAtomD{},
+      make_shape(size<0>(EpilogueTile{}), size<1>(EpilogueTile{}), Int<ReuseSmemC ? StagesC : StagesD>{}),
+      cute::conditional_t<is_m_major_D, Step<_2,_1,_3>, Step<_1,_2,_3>>{} ));
+
+  // Reuse needs both tensors, no more D stages than C stages, and equal per-stage footprints.
+  constexpr static bool support_smem_reuse = is_source_supported && is_destination_supported && StagesD <= StagesC
+                                            && cosize(take<0,2>(SmemLayoutC{})) == cosize(take<0,2>(SmemLayoutD{}));
+  static_assert(not (ReuseSmemC && not support_smem_reuse), "Smem reuse requirements not met");
+
+  // Alignment required by each (possibly swizzled) smem layout, and the larger of the two.
+  constexpr static size_t SmemAlignmentD = cutlass::detail::alignment_for_swizzle(SmemLayoutD{});
+  constexpr static size_t SmemAlignmentC = cutlass::detail::alignment_for_swizzle(SmemLayoutC{});
+  constexpr static size_t MaxSmemAlignment = cute::max(SmemAlignmentC, SmemAlignmentD);
+
+  // Array types sized to hold every stage of C / D.
+  using SmemArrayTypeC = cute::ArrayEngine<SmemElementC, cosize_v<SmemLayoutC>>;
+  using SmemArrayTypeD = cute::ArrayEngine<SmemElementD, cosize_v<SmemLayoutD>>;
+
+  // Storage selectors that collapse to an empty tuple when the buffer is not needed.
+  // NOTE(review): not referenced in the visible part of this class -- confirm whether still used.
+  using EmptyType = cute::tuple<>;
+  using SmemCStorage = cute::conditional_t<is_source_supported and (not ReuseSmemC),
+                         SmemArrayTypeC,
+                         EmptyType>;
+  using SmemDStorage = cute::conditional_t<is_destination_supported,
+                         SmemArrayTypeD,
+                         EmptyType>;
+
+  // Storage with separate, individually aligned buffers for C and D.
+  struct CollectiveStorageWithC {
+    alignas(SmemAlignmentC) SmemArrayTypeC smem_C;
+    alignas(SmemAlignmentD) SmemArrayTypeD smem_D;
+  };
+
+  // Storage for a sourceless epilogue: smem_C is a zero-length placeholder that keeps the
+  // member name available, overlaid with the real D buffer.
+  union CollectiveStorageWithoutC {
+    cute::array<SmemElementC, 0> smem_C;
+    alignas(SmemAlignmentD) SmemArrayTypeD smem_D;
+  };
+
+  // Storage for the smem-reuse configuration: C and D overlay the same memory, both aligned
+  // to the stricter of the two layout alignments.
+  union CollectiveStorageReuseC {
+    alignas(MaxSmemAlignment) SmemArrayTypeC smem_C;
+    alignas(MaxSmemAlignment) SmemArrayTypeD smem_D;
+  };
+
+public:
+  // TMA pipeline for loading C
+  using LoadPipeline = cutlass::PipelineTransactionAsync<StagesC>;
+  using LoadPipelineState = cutlass::PipelineState<StagesC>;
+  // Bytes in one stage of the C buffer: element count of an (EPI_TILE_M,EPI_TILE_N) slice
+  // times the element width in bits, divided by 8.
+  constexpr static uint32_t TmaTransactionBytes =
+    (size(take<0,2>(SmemLayoutC{})) * static_cast<uint32_t>(sizeof_bits<SmemElementC>::value)) / 8;
+  constexpr static bool RequiresTransactionBytes = true;
+
+  // TMA pipeline for storing D
+  // With smem reuse the pipeline has StagesC stages and is parameterized with StagesD-1 as its
+  // second argument; otherwise it simply has StagesD stages.
+  using StorePipeline = cute::conditional_t<ReuseSmemC,
+                          cutlass::PipelineTmaStore<StagesC, StagesD-1>,
+                          cutlass::PipelineTmaStore<StagesD>>;
+  using StorePipelineState = cutlass::PipelineState<ReuseSmemC ? StagesC : StagesD>;
+
+  // Shared-memory footprint of the epilogue: tensor buffers (C/D plus fusion storage) and the
+  // load pipeline's own storage.
+  struct SharedStorage {
+    struct TensorStorage {
+      // Pick the C/D buffer arrangement: no C buffer, C and D overlaid, or C and D separate.
+      using CollectiveStorage = cute::conditional_t<not is_source_supported, CollectiveStorageWithoutC,
+                                  cute::conditional_t<ReuseSmemC, CollectiveStorageReuseC, CollectiveStorageWithC>>;
+      CollectiveStorage collective;
+
+      // Shared storage requested by the fusion callbacks.
+      using FusionStorage = typename FusionCallbacks::SharedStorage;
+      FusionStorage thread;
+    } tensors;
+
+    using PipelineStorage = typename LoadPipeline::SharedStorage;
+    PipelineStorage pipeline;
+  };
+  // Convenience aliases for the nested storage types.
+  using TensorStorage = typename SharedStorage::TensorStorage;
+  using PipelineStorage = typename SharedStorage::PipelineStorage;
+
+  // Host side epilogue arguments
+  struct Arguments {
+    // Arguments forwarded to the fusion callbacks.
+    typename FusionCallbacks::Arguments thread{};
+    // Global-memory pointer and stride of the source tensor C.
+    ElementC const* ptr_C;
+    StrideC dC;
+    // Global-memory pointer and stride of the output tensor D.
+    ElementD const* ptr_D;
+    StrideD dD;
+  };
+
+  // Device side epilogue params
+  struct Params {
+    // Type of the TMA copy object for C, deduced from a make_tma_copy call on a null-pointer
+    // tensor that has the right element type, stride type, smem layout and epilogue tile.
+    using TMA_C = decltype(make_tma_copy(
+        CopyOpG2S{},
+        make_tensor(make_gmem_ptr(static_cast<NonVoidElementC const*>(nullptr)),
+            repeat_like(StrideC{}, int32_t(0)), StrideC{}),
+        take<0,2>(SmemLayoutC{}),
+        EpilogueTile{},
+        _1{}));
+    // Same deduction for the TMA copy object that stores D.
+    using TMA_D = decltype(make_tma_copy(
+        CopyOpS2G{},
+        make_tensor(make_gmem_ptr(static_cast<NonVoidElementD const*>(nullptr)),
+            repeat_like(StrideD{}, int32_t(0)), StrideD{}),
+        take<0,2>(SmemLayoutD{}),
+        EpilogueTile{},
+        _1{}));
+
+    // Fusion callback params, the two TMA copy objects, and the per-stage C byte count.
+    typename FusionCallbacks::Params thread{};
+    TMA_C tma_load_c;
+    TMA_D tma_store_d;
+    uint32_t tma_transaction_bytes = TmaTransactionBytes;
+  };
+
+  //
+  // Methods
+  //
+
+  // Converts the host-side Arguments into device-side Params.
+  //   problem_shape : rank-3 (M,N,K) or rank-4 (M,N,K,L) problem extents
+  //   args          : host arguments (fusion arguments plus C/D pointers and strides)
+  //   workspace     : forwarded to FusionCallbacks::to_underlying_arguments
+  // Returns a Params holding the fusion params, the TMA copy objects for C and D, and the
+  // per-stage transaction byte count. A TMA copy object is only built for a tensor that is
+  // present; otherwise it stays value-initialized.
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(
+      ProblemShape const& problem_shape,
+      Arguments const& args,
+      [[maybe_unused]] void* workspace) {
+    // Optionally append 1s until problem shape is rank-4 in case it is only rank-3 (MNK)
+    auto problem_shape_MNKL = append<4>(problem_shape, 1);
+    auto [M, N, K, L] = problem_shape_MNKL;
+
+    uint32_t transaction_bytes = TmaTransactionBytes;
+    typename Params::TMA_C tma_load_c = {};
+    if constexpr (is_source_supported) {
+      Tensor tensor_c = make_tensor(make_gmem_ptr(args.ptr_C), make_layout(make_shape(M,N,L), args.dC));
+      tma_load_c = make_tma_copy_C_sm90(
+          CopyOpG2S{},
+          tensor_c,
+          take<0,2>(SmemLayoutC{}),
+          EpilogueTile{});
+    }
+
+    // Value-initialize like tma_load_c: with a void ElementD the branch below is discarded and
+    // the object would otherwise be copied into Params with indeterminate contents.
+    typename Params::TMA_D tma_store_d = {};
+    if constexpr (is_destination_supported) {
+      Tensor tensor_d = make_tensor(make_gmem_ptr(args.ptr_D), make_layout(make_shape(M,N,L), args.dD));
+      tma_store_d = make_tma_copy_C_sm90(
+          CopyOpS2G{},
+          tensor_d,
+          take<0,2>(SmemLayoutD{}),
+          EpilogueTile{});
+    }
+
+    return {
+      FusionCallbacks::to_underlying_arguments(problem_shape, args.thread, workspace),
+      tma_load_c,
+      tma_store_d,
+      transaction_bytes
+    };
+  }
+
+  // Workspace bytes needed by the epilogue; this is entirely whatever the fusion callbacks
+  // request for `problem_shape` and `args.thread`.
+  template <class ProblemShape>
+  static size_t
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
+    size_t const fusion_workspace_bytes = FusionCallbacks::get_workspace_size(problem_shape, args.thread);
+    return fusion_workspace_bytes;
+  }
+
+  // Initializes the epilogue workspace by delegating to the fusion callbacks and returns
+  // their status unchanged.
+  template <class ProblemShape>
+  static cutlass::Status
+  initialize_workspace(
+      ProblemShape const& problem_shape,
+      Arguments const& args,
+      void* workspace,
+      cudaStream_t stream,
+      CudaHostAdapter* cuda_adapter = nullptr) {
+    cutlass::Status const fusion_status =
+        FusionCallbacks::initialize_workspace(problem_shape, args.thread, workspace, stream, cuda_adapter);
+    return fusion_status;
+  }
+
+  // Host-side check that this epilogue can run `problem_shape` with `args`.
+  // Returns true only if (1) the (M,N,L) extents pass the TMA alignment check against the
+  // stride types of every tensor that is present, (2) FusionCallbacks::can_implement accepts
+  // the problem, and (3) for a sourceless (void-C) epilogue, no beta / beta pointer is set.
+  // Every failed check emits a host trace message.
+  template <class ProblemShape>
+  static bool
+  can_implement(
+      ProblemShape const& problem_shape,
+      [[maybe_unused]] Arguments const& args) {
+    // Pad the problem shape to rank 4 and keep the (M,N,L) extents for the alignment checks
+    auto [M,N,K,L] = append<4>(problem_shape, 1);
+    auto extent_MNL = cute::make_shape(M,N,L);
+
+    bool tma_aligned = true;
+
+    // Output tensor D
+    if constexpr (is_destination_supported) {
+      constexpr int alignment_bits_D = cutlass::detail::get_output_alignment_bits<ElementD>();
+      constexpr int min_elements_D = alignment_bits_D / cutlass::sizeof_bits<ElementD>::value;
+      if constexpr (is_im2col_D) { // ignore L stride for implicit gemm
+        tma_aligned = cutlass::detail::check_alignment<min_elements_D>(take<0,2>(extent_MNL), take<0,2>(StrideD{}));
+      }
+      else {
+        tma_aligned = cutlass::detail::check_alignment<min_elements_D>(extent_MNL, StrideD{});
+      }
+    }
+
+    // Source tensor C (only evaluated when the D check has not already failed)
+    if constexpr (is_source_supported) {
+      constexpr int alignment_bits_C = cutlass::detail::get_input_alignment_bits<ElementC>();
+      constexpr int min_elements_C = alignment_bits_C / cutlass::sizeof_bits<ElementC>::value;
+      if constexpr (is_im2col_C) { // ignore L stride for implicit gemm
+        tma_aligned = tma_aligned && cutlass::detail::check_alignment<min_elements_C>(take<0,2>(extent_MNL), take<0,2>(StrideC{}));
+      }
+      else {
+        tma_aligned = tma_aligned && cutlass::detail::check_alignment<min_elements_C>(extent_MNL, StrideC{});
+      }
+    }
+
+    if (not tma_aligned) {
+      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
+    }
+
+    // Let the fusion callbacks veto the problem
+    bool fusion_ok = FusionCallbacks::can_implement(problem_shape, args.thread);
+    if (not fusion_ok) {
+      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Problem Size doesn't meet the minimum requirements for FusionCallbacks.\n");
+    }
+
+    // A sourceless epilogue cannot honour a non-zero beta or a beta pointer
+    bool beta_ok = true;
+    if constexpr (not is_source_supported) {
+      if constexpr (detail::has_beta<Arguments>::value) {
+        beta_ok = args.thread.beta == 0.0;
+      }
+      if constexpr (detail::has_beta_ptr<Arguments>::value) {
+        beta_ok = beta_ok && args.thread.beta_ptr == nullptr;
+      }
+    }
+    if (not beta_ok) {
+      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Beta/beta pointer was set, but epilogue is sourceless (void-C).\n");
+    }
+
+    return tma_aligned && fusion_ok && beta_ok;
+  }
+
+  // Number of load-pipeline stages consumed per output tile, i.e. how many epilogue subtiles
+  // the (M,N) part of `tile_shape_MNK` splits into.
+  template<class TileShapeMNK>
+  CUTLASS_HOST_DEVICE
+  static constexpr int
+  get_load_pipe_increment(TileShapeMNK tile_shape_MNK) {
+    auto tile_layout_mn = make_layout(take<0,2>(tile_shape_MNK));
+    auto tile_by_subtile = zipped_divide(tile_layout_mn, EpilogueTile{});
+    // Mode 1 of the zipped division enumerates the subtiles
+    return size<1>(tile_by_subtile);
+  }
+
+  // Number of store-pipeline stages consumed per output tile; identical to the load increment.
+  template<class TileShapeMNK>
+  CUTLASS_HOST_DEVICE
+  static constexpr int
+  get_store_pipe_increment(TileShapeMNK tile_shape_MNK) {
+    int const subtiles_per_tile = get_load_pipe_increment(tile_shape_MNK);
+    return subtiles_per_tile;
+  }
+
+  /// Prefetch the TMA descriptors of whichever of C and D exist.
+  /// Best issued from a single thread for performance.
+  CUTLASS_DEVICE
+  static void
+  prefetch_tma_descriptors(Params const& epilogue_params) {
+    if constexpr (is_source_supported) {
+      auto descriptor_C = epilogue_params.tma_load_c.get_tma_descriptor();
+      cute::prefetch_tma_descriptor(descriptor_C);
+    }
+    if constexpr (is_destination_supported) {
+      auto descriptor_D = epilogue_params.tma_store_d.get_tma_descriptor();
+      cute::prefetch_tma_descriptor(descriptor_D);
+    }
+  }
+
+  // Initializes the `params` member from params_ and constructs the fusion callbacks from
+  // params_.thread and the fusion part (`thread`) of the shared tensor storage.
+  // NOTE(review): `params` is presumably a reference member, so params_ must outlive this object -- confirm.
+  CUTLASS_HOST_DEVICE
+  CollectiveEpilogue(Params const& params_, TensorStorage& shared_tensors)
+      : params(params_), fusion_callbacks(params_.thread, shared_tensors.thread) {}
+
+  // Whether the producer load path has any work to do, as reported by the fusion callbacks.
+  CUTLASS_DEVICE
+  bool
+  is_producer_load_needed() const {
+    bool const load_needed = fusion_callbacks.is_producer_load_needed();
+    return load_needed;
+  }
+
+  // Producer-side routine. Walks the epilogue subtiles of this CTA's output tile and, for each
+  // one, acquires a load-pipeline stage, runs the fusion producer-load callback, optionally
+  // issues the TMA load of the C subtile into that stage, and commits the stage.
+  //
+  //   subtile_idx : when not -1, only the subtile whose linear index (epi_n * EPI_M + epi_m)
+  //                 equals subtile_idx is processed; every other subtile is skipped.
+  // Returns the producer pipeline state advanced past every processed subtile.
+  template<
+    class ProblemShapeMNKL,
+    class TileShapeMNK,
+    class TileCoordMNKL,
+    class TiledMma
+  >
+  CUTLASS_DEVICE auto
+  load(
+      LoadPipeline load_pipeline,
+      LoadPipelineState load_pipe_producer_state,
+      ProblemShapeMNKL problem_shape_mnkl,
+      TileShapeMNK tile_shape_MNK,
+      TileCoordMNKL tile_coord_mnkl,
+      TiledMma tiled_mma,
+      int thread_idx,
+      TensorStorage& shared_tensors,
+      int subtile_idx=-1) {
+    using namespace cute;
+
+    // Indexing variables
+    auto [M, N, K, L] = problem_shape_mnkl;
+    auto [m_coord, n_coord, k_coord, l_coord] = tile_coord_mnkl;
+
+    // The tma tensor C under im2col mode only has two modes (M, N) which
+    // should be local tiled with only (m_coord, n_coord).
+    auto coord_shape = conditional_return<is_im2col_C>(
+      make_coord(m_coord, n_coord),
+      make_coord(m_coord, n_coord, l_coord));
+
+    // Represent the full source tensor, slice to get the tile this CTA is currently responsible for
+    Tensor mC_mn = params.tma_load_c.get_tma_tensor(make_shape(M,N,L));                                //       (M,N,L)
+    Tensor mC = coalesce(mC_mn, take<0,2>(CtaTileMNK{}));
+    Tensor gC = local_tile(mC, take<0,2>(CtaTileMNK{}), coord_shape);                                  // (CTA_M,CTA_N)
+
+    // Apply epilogue subtile, get matching smem tensor
+    auto ptr_sC = shared_tensors.collective.smem_C.begin();
+    Tensor gC_epi = flat_divide(gC, EpilogueTile{});                             // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N)
+    Tensor sC_epi = make_tensor(make_smem_ptr(ptr_sC), SmemLayoutC{});           //      (EPI_TILE_M,EPI_TILE_N,PIPE_C)
+
+    // Prepare the thread(b)lock's (G)mem to (S)mem TMA tiled copy (bGS_)
+    ThrCopy thrblk_g2s = params.tma_load_c.get_slice(Int<0>{});
+    Tensor bGS_gC = thrblk_g2s.partition_S(gC_epi);                                    // (G2S,G2S_M,G2S_N,EPI_M,EPI_N)
+    Tensor bGS_sC = thrblk_g2s.partition_D(sC_epi);                                    // (G2S,G2S_M,G2S_N,PIPE_C)
+
+    // Get the fusion callbacks for the producer load warp
+    auto pld_args = cutlass::epilogue::fusion::detail::ProducerLoadArgs(
+                      problem_shape_mnkl,
+                      CtaTileMNK{},
+                      tile_coord_mnkl,
+                      tiled_mma,
+                      EpilogueTile{},
+                      thread_idx
+                    );
+    auto pld_callbacks = fusion_callbacks.get_producer_load_callbacks(pld_args);
+    // C is only fetched when the epilogue has a source tensor and the fusion actually reads it
+    bool is_C_load_needed = is_source_supported && fusion_callbacks.is_C_load_needed();
+
+    // Predication for TMA load (one thread issues TMA load)
+    bool issue_tma_load = cute::elect_one_sync();
+
+    // Pre-loop fusion callback entry point
+    pld_callbacks.begin();
+
+    // Visit subtiles with epi_m as the fastest-varying index
+    CUTLASS_PRAGMA_UNROLL
+    for (int epi_n = 0; epi_n < size<3>(gC_epi); ++epi_n) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int epi_m = 0; epi_m < size<2>(gC_epi); ++epi_m) {
+        // Skipped subtiles neither acquire a stage nor advance the pipeline state
+        if (subtile_idx != -1 && (epi_n * static_cast<int>(size<2>(gC_epi)) + epi_m) != subtile_idx) {
+          continue;
+        }
+        // Acquire the lock for this stage
+        constexpr uint16_t mcast_mask = 0;
+        uint64_t* tma_barrier = load_pipeline.producer_get_barrier(load_pipe_producer_state);
+        load_pipeline.producer_acquire(load_pipe_producer_state);
+
+        // Loop fusion callback entry point
+        pld_callbacks.step(tma_barrier, epi_m, epi_n, load_pipe_producer_state.count(), issue_tma_load);
+
+        // Execute the TMA load for C if needed
+        if (issue_tma_load && is_C_load_needed) {
+          copy(params.tma_load_c.with(*tma_barrier, mcast_mask),
+              bGS_gC(_,_,_,epi_m,epi_n), bGS_sC(_,_,_,load_pipe_producer_state.index()));
+          load_pipeline.producer_expect_transaction(load_pipe_producer_state);
+        }
+
+        // Commit TMA loads for this stage and release the lock
+        load_pipeline.producer_commit(load_pipe_producer_state);
+        ++load_pipe_producer_state;
+      }
+    }
+
+    // Post-loop fusion callback entry point
+    pld_callbacks.end();
+
+    return load_pipe_producer_state;
+  }
+
+  // Runs the load pipeline's producer_tail from a single elected thread and hands the
+  // producer state back unchanged.
+  CUTLASS_DEVICE auto
+  load_tail(
+      LoadPipeline load_pipeline,
+      LoadPipelineState load_pipe_producer_state) {
+    if (cute::elect_one_sync()) {
+      load_pipeline.producer_tail(load_pipe_producer_state);
+    }
+    return load_pipe_producer_state;
+  }
+
+  template<
+    class ProblemShapeMNKL,
+    class TileShapeMNK,
+    class TileCoordMNKL,
+    class AccEngine, class AccLayout,
+    class TiledMma
+  >
+  CUTLASS_DEVICE auto
+  store(
+      LoadPipeline load_pipeline,
+      LoadPipelineState load_pipe_consumer_state,
+      StorePipeline store_pipeline,
+      StorePipelineState store_pipe_producer_state,
+      ProblemShapeMNKL problem_shape_mnkl,
+      TileShapeMNK tile_shape_MNK,
+      TileCoordMNKL tile_coord_mnkl,
+      cute::Tensor<AccEngine,AccLayout> accumulators,
+      TiledMma tiled_mma,
+      int thread_idx,
+      TensorStorage& shared_tensors,
+      int subtile_idx=-1) {
+    using namespace cute;
+    using ElementAccumulator = typename AccEngine::value_type;
+    using ElementCompute_ = typename epilogue::fusion::FusionCallbacksTraits<FusionCallbacks>::ElementCompute;
+    using ElementCompute = cute::conditional_t<cute::is_void_v<ElementCompute_>,ElementAccumulator,ElementCompute_>;
+
+    static_assert(is_rmem<AccEngine>::value, "Accumulator must be RF resident.");
+    static_assert(rank(AccLayout{}) == 3, "Accumulator must be MMA-partitioned: (MMA,MMA_M,MMA_N)");
+    static_assert(rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4");
+    static_assert(is_static<TileShapeMNK>::value, "TileShapeMNK must be static");
+    static_assert(rank(TileShapeMNK{}) == 3, "TileShapeMNK must be rank 3");
+    static_assert(rank(TileCoordMNKL{}) == 4, "TileCoordMNKL must be rank 4");
+
+    // Indexing variables
+    auto [M, N, K, L] = problem_shape_mnkl;
+    auto [m_coord, n_coord, k_coord, l_coord] = tile_coord_mnkl;
+
+    // The tma tensor D under im2col mode only has two modes (M, N) which
+    // should be local tiled with only (m_coord, n_coord).
+    auto coord_shape = conditional_return<is_im2col_D>( 
+        make_coord(m_coord, n_coord),
+        make_coord(m_coord, n_coord, l_coord));
+
+    // Represent the full output tensor, slice to get the tile this CTA is responsible for
+    Tensor mD_mn = params.tma_store_d.get_tma_tensor(make_shape(M,N,L));                               //       (M,N,L)
+    Tensor mD = coalesce(mD_mn, take<0,2>(CtaTileMNK{}));
+    Tensor gD = local_tile(mD, take<0,2>(CtaTileMNK{}), coord_shape);                                  // (CTA_M,CTA_N)
+
+    // Apply epilogue subtiling
+    Tensor gD_epi = flat_divide(gD, EpilogueTile{});                             // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N)
+
+    // Construct the corresponding pipelined smem tensors
+    auto ptr_sC = shared_tensors.collective.smem_C.begin();
+    auto ptr_sD = shared_tensors.collective.smem_D.begin();
+    Tensor sC_epi = cute::as_position_independent_swizzle_tensor(
+                      make_tensor(make_smem_ptr(ptr_sC), SmemLayoutC{}));             // (EPI_TILE_M,EPI_TILE_N,PIPE_C)
+    Tensor sD_epi = cute::as_position_independent_swizzle_tensor(
+                      make_tensor(make_smem_ptr(ptr_sD), SmemLayoutD{}));             // (EPI_TILE_M,EPI_TILE_N,PIPE_D)
+
+    TiledCopy tiled_copy_C_atom = make_tiled_copy_C_atom(CopyAtomC{}, tiled_mma);
+
+    // (t)hread-partition for (r)egister to (r)egister copy (tRR_)
+    TiledCopy tiled_r2r = [&]() {
+      if constexpr (IsUseR2R) {
+        return make_tiled_copy_S(Copy_Atom<CopyOpR2R, ElementCompute>{}, tiled_copy_C_atom);
+      }
+      else {
+        return make_tiled_copy_S(Copy_Atom<AutoVectorizingCopyWithAssumedAlignment<128>,
+          ElementCompute>{}, tiled_copy_C_atom);
+      }
+    }();
+    ThrCopy thread_r2r = tiled_r2r.get_slice(thread_idx);
+
+    // (t)hread-partition for (r)egister to (s)mem copy (tRS_)
+    TiledCopy tiled_r2s = [&]() {
+      if constexpr (IsUseR2R) {
+        return make_tiled_copy_D(Copy_Atom<CopyOpR2S,SmemElementD>{}, tiled_r2r);
+      }
+      else {
+        return make_tiled_copy_S(Copy_Atom<CopyOpR2S,SmemElementD>{}, tiled_copy_C_atom);
+      }
+    }();
+    ThrCopy thread_r2s = tiled_r2s.get_slice(thread_idx);
+    Tensor tRS_rAcc = thread_r2s.retile_S(accumulators);                                   // ((R2S,R2S_V),MMA_M,MMA_N)
+    Tensor tRS_sD   = thread_r2s.partition_D(sD_epi);                                       // (R2S,R2S_M,R2S_N,PIPE_D)
+
+    auto mma_tile_m = size<0>(TileShapeMNK{}) / size<1>(tRS_rAcc);
+    auto mma_tile_n = size<1>(TileShapeMNK{}) / size<2>(tRS_rAcc);
+    auto epi_tile_m = size<0>(EpilogueTile{});
+    auto epi_tile_n = size<1>(EpilogueTile{});
+
+    // Allocate D registers
+    Layout tRS_rD_layout = make_layout(take<0,3>(shape(thread_r2s.partition_S(sD_epi))));
+    Tensor tRS_rD = make_tensor<SmemElementD>(tRS_rD_layout);                                      // (R2S,R2S_M,R2S_N)
+
+    // Vectorized fragment view
+    constexpr int FragmentSize = DispatchPolicy::FragmentSize;
+    Tensor tRS_rAcc_frg = recast<Array<ElementAccumulator, FragmentSize>>(tRS_rAcc);
+    Tensor tRS_rD_frg   = recast<Array<SmemElementD      , FragmentSize>>(tRS_rD);
+    CUTE_STATIC_ASSERT(size<0>(tRS_rAcc) % FragmentSize == 0, "Fragment size does not vectorize properly");
+
+    // (t)hread-partition for (s)mem to (r)egister copy (tSR_)
+    TiledCopy tiled_s2r = make_tiled_copy_S(Copy_Atom<CopyOpS2R, SmemElementC>{}, tiled_copy_C_atom);
+    ThrCopy thread_s2r = tiled_s2r.get_slice(thread_idx);
+    Tensor tSR_sC        = thread_s2r.partition_S(sC_epi);                                  // (S2R,S2R_M,S2R_N,PIPE_C)
+    Layout tSR_rC_layout = thread_s2r.retile_D(tRS_rD).layout();                            // (S2R,S2R_M,S2R_N)
+
+    // Allocate C registers
+    // If C smem load is a non-vectorized dst(i) = src(i) then we can allocate C registers directly in the compute type
+    // to eliminate some redundant pack+unpack instruction sequences for sub-word types
+    constexpr bool IsDirectS2R = cute::is_same_v<CopyOpS2R, AutoVectorizingCopyWithAssumedAlignment<128>>
+                                && decltype(max_common_vector(tSR_rC_layout, tSR_sC.layout()))::value <= 1;
+    using RegisterElementC = cute::conditional_t<IsDirectS2R, ElementCompute, SmemElementC>;
+    Tensor tRS_rC = make_tensor<RegisterElementC>(tRS_rD_layout);                                  // (R2S,R2S_M,R2S_N)
+    Tensor tSR_rC = thread_s2r.retile_D(tRS_rC);                                                   // (S2R,S2R_M,S2R_N)
+
+    // thread(b)lock-partition for (s)mem to (g)mem copy (bSG_)
+    ThrCopy thrblk_s2g = params.tma_store_d.get_slice(Int<0>{});
+    Tensor bSG_sD = thrblk_s2g.partition_S(sD_epi);                                    // (S2G,S2G_M,S2G_N,PIPE_D)
+    Tensor bSG_gD = thrblk_s2g.partition_D(gD_epi);                                    // (S2G,S2G_M,S2G_N,EPI_M,EPI_N)
+
+    // OOB predication for tile quantization "residue"
+    // Absolute coordinate tensors (dynamic)
+    Tensor mD_crd = make_identity_tensor(make_shape(M,N));                                                     // (M,N)
+    Tensor cD_mn = local_tile(mD_crd, take<0,2>(CtaTileMNK{}), make_coord(m_coord, n_coord));          // (CTA_M,CTA_N)
+    Tensor tRS_cD_mn = thread_r2s.partition_S(flat_divide(cD_mn, EpilogueTile{}));     // (R2S,R2S_M,R2S_N,EPI_M,EPI_N)
+    // Relative coordinate tensors (static)
+    Tensor cD = make_counting_tensor(cD_mn.layout());                                                  // (CTA_M,CTA_N)
+    Tensor tRS_cD = make_counting_tensor(tRS_cD_mn.layout());                          // (R2S,R2S_M,R2S_N,EPI_M,EPI_N)
+    // Subtract the global "bottom right" corner from the local "top left" corner to get the max relative coordinate
+    auto residue_cD = make_coord(M,N) - cD_mn(_0{});                                                           // (m,n)
+    auto residue_tRS_cD = make_coord(M,N) - tRS_cD_mn(_0{});                                                   // (m,n)
+
+    CUTE_STATIC_ASSERT(epi_tile_m % mma_tile_m == 0, "MMA_TILE_M must divide EPI_TILE_M");
+
+    CUTE_STATIC_ASSERT(mma_tile_n % epi_tile_n == 0, "EPI_TILE_N must divide MMA_TILE_N");
+    // Get TiledCopy for partition reference when consumer store.
+    TiledCopy tiled_copy_partition_ref = make_tiled_copy_S(Copy_Atom<CopyOpR2S,SmemElementD>{}, tiled_copy_C_atom);
+    // Get the fusion callbacks for the consumer store warps
+    constexpr bool RefSrc = true; // Register tensors reference tiled copy src layout
+    auto cst_args = cutlass::epilogue::fusion::detail::ConsumerStoreArgs(
+                      problem_shape_mnkl,
+                      CtaTileMNK{},
+                      tile_coord_mnkl,
+                      tiled_mma,
+                      EpilogueTile{},
+                      tiled_copy_partition_ref,
+                      cD,
+                      residue_cD,
+                      tRS_cD,
+                      residue_tRS_cD,
+                      tRS_rC,
+                      thread_idx
+                    );
+    auto cst_callbacks = fusion_callbacks.template get_consumer_store_callbacks<RefSrc>(cst_args);
+    bool is_producer_load_needed = fusion_callbacks.is_producer_load_needed();
+    bool is_C_load_needed = is_source_supported && fusion_callbacks.is_C_load_needed();
+
+    using FragmentVisit = decltype(cst_callbacks.visit(tRS_rAcc_frg(0), 0, 0, 0));
+    constexpr bool IsDirectR2S = cute::is_same_v<FragmentVisit, Array<SmemElementD, FragmentSize>>;
+    using RegisterElementD = cute::conditional_t<!IsDirectR2S, ElementCompute, SmemElementD>;
+    Tensor tRS_rCompute = make_tensor<RegisterElementD>(tRS_rD_layout);                            // (R2S,R2S_M,R2S_N)
+    Tensor tRS_rCompute_frg = recast<Array<RegisterElementD, FragmentSize>>(tRS_rCompute);
+
+    // Thread synchronizer for previously issued waits or fences
+    // to ensure visibility of smem reads/writes to threads or TMA unit
+    auto synchronize = [&] () { cutlass::arch::NamedBarrier::sync(size(TiledMma{}), cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); };
+
+    // Predication for TMA store (one warp issues TMA store)
+    bool issue_tma_store = (thread_idx / NumThreadsPerWarp) == 0;
+
+    // In the reuse smem configuration we have StagesC smem buffers and at most StagesD committed TMA stores in flight.
+    // The TMA store pipeline producer acquire returns when at most StagesD-1 committed stores are in-flight, so we can
+    // only guarantee store completion after StagesD iterations, then we can begin issuing releases on the smem buffer locks.
+    // store_pipe_producer_state tracks the acquire and load_pipe_consumer_state tracks the release, in circular buffer fashion.
+    LoadPipelineState load_wait_state = load_pipe_consumer_state;
+    if constexpr (ReuseSmemC) {
+      load_wait_state = store_pipe_producer_state;
+      load_wait_state.phase_ ^= 1;
+    }
+
+    // We can delay issue of TMA store by one iteration to achieve better interleaving of non-TMA instructions
+    // Sync requirements of smem reuse may preclude this optimization
+    // Delayed stores cause delayed stage releases which causes deadlock when StagesC == StagesD
+    [[maybe_unused]] int epi_m_prev = 0;
+    [[maybe_unused]] int epi_n_prev = 0;
+    static_assert(not (DelayTmaStore and ReuseSmemC and StagesC <= StagesD), "This TMA epilogue configuration will deadlock");
+
+    // The TMA store sequence for one subtile iteration
+    auto tma_store_fn = [&] (int epi_m, int epi_n) {
+      // Write the tile from smem to gmem with TMA
+      cutlass::arch::fence_view_async_shared(); // ensure smem writes are visible to TMA
+      synchronize(); // ensure all threads have issued their async fence
+      if constexpr (is_destination_supported) {
+        if (issue_tma_store) {
+          copy(params.tma_store_d, bSG_sD(_,_,_,store_pipe_producer_state.index()), bSG_gD(_,_,_,epi_m,epi_n));
+        }
+      }
+
+      // Post async fence, pre TMA commit callback entry point
+      cst_callbacks.tma_store(epi_m, epi_n, store_pipe_producer_state.count(), issue_tma_store);
+
+      // Commit the TMA stores for this stage
+      if (issue_tma_store) {
+        store_pipeline.producer_commit(store_pipe_producer_state);
+      }
+      ++store_pipe_producer_state;
+      ++issued_stores;
+
+      // Wait for the next smem buffer to be available
+      if (issue_tma_store) {
+        store_pipeline.producer_acquire(store_pipe_producer_state);
+      }
+      synchronize();
+
+      if constexpr (ReuseSmemC) {
+        // producer_acquire returns when at most StagesD-1 committed stores are pending
+        bool store_finished = issued_stores > StorePipeline::UnacquiredStages;
+        // Let dma warp know earliest smem buffer is consumed and empty after StagesD producer commits
+        if (store_finished) {
+          if (is_producer_load_needed) {
+            load_pipeline.consumer_release(load_pipe_consumer_state);
+          }
+          ++load_pipe_consumer_state;
+        }
+      }
+    };
+
+    //
+    // BEGIN EPILOGUE
+    //
+
+    // Pre-loop fusion callback entry point
+    cst_callbacks.begin();
+
+    // For each output tile
+    CUTLASS_PRAGMA_UNROLL
+    for (int epi_n = 0; epi_n < size<3>(gD_epi); ++epi_n) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int epi_m = 0; epi_m < size<2>(gD_epi); ++epi_m) {
+        [[maybe_unused]] bool is_first_iteration = epi_m == 0 && epi_n == 0;
+        bool is_last_iteration = epi_m == size<2>(gD_epi)-1 && epi_n == size<3>(gD_epi)-1;
+
+        if (subtile_idx != -1 && (epi_n * static_cast<int>(size<2>(gD_epi)) + epi_m) != subtile_idx) {
+          continue;
+        }
+
+        cst_callbacks.begin_loop(epi_m, epi_n);
+
+        if (is_producer_load_needed) {
+          // Wait for the producer load to fill smem
+          load_pipeline.consumer_wait(load_wait_state);
+
+          if (is_C_load_needed) {
+            // Copy source tile from smem to register
+            copy(tiled_s2r, tSR_sC(_,_,_,load_wait_state.index()), tSR_rC);
+          }
+        }
+
+        // First loop fusion callback entry point
+        cst_callbacks.previsit(epi_m, epi_n, load_wait_state.count(), is_producer_load_needed);
+
+        if (is_producer_load_needed) {
+          if constexpr (not ReuseSmemC) {
+            // Let producer load warp know smem buffers are consumed and empty
+            cutlass::arch::fence_view_async_shared();
+            load_pipeline.consumer_release(load_pipe_consumer_state);
+            ++load_pipe_consumer_state;
+          }
+          ++load_wait_state;
+        }
+
+        int mma_m = epi_m;
+        int mma_n = (epi_n * size<1>(EpilogueTile{})) / mma_tile_n;
+        Tensor tRS_rAcc_frg_mn = tRS_rAcc_frg(_,mma_m,mma_n);
+
+        // Vectorized fragment loop with visitor callback entry point
+        int epi_n_in_mma = epi_n % (mma_tile_n / epi_tile_n);
+        int r2s_v = epi_n_in_mma * size(tRS_rCompute_frg);
+        CUTLASS_PRAGMA_UNROLL
+        for (int epi_v = 0; epi_v < size(tRS_rCompute_frg); ++epi_v) {
+          tRS_rCompute_frg(epi_v) = cst_callbacks.visit(tRS_rAcc_frg_mn(r2s_v + epi_v), epi_v, epi_m, epi_n);
+        }
+        // The latest we can delay the TMA store is right before the smem store of the next iteration
+        // since the current TMA store needs to be committed before we can acquire the next smem buffer
+        if constexpr (DelayTmaStore) {
+          // Issue TMA stores for the previous subtile
+          if (not is_first_iteration and subtile_idx == -1) {
+            tma_store_fn(epi_m_prev, epi_n_prev);
+          }
+          epi_m_prev = epi_m;
+          epi_n_prev = epi_n;
+        }
+
+        // Smem reduction callback entry point using current store buffer for workspace
+        cst_callbacks.reduce(sD_epi(_,_,store_pipe_producer_state.index()),
+                              synchronize, epi_m, epi_n, is_last_iteration, tRS_rCompute_frg);
+
+        // Copy tile from register to register if needed
+        if constexpr (IsUseR2R) {
+          // retile source and destination for tiled_r2r
+          Tensor tRR_rD_src = thread_r2r.retile_S(tRS_rCompute);                             // (R2R,R2R_M,R2R_N,EPI_M,EPI_N)
+          Tensor tRR_rD_dst = thread_r2r.retile_D(tRS_rCompute);                             // (R2R,R2R_M,R2R_N,EPI_M,EPI_N)
+
+          // Output register transformation before copying to shared memory.
+          copy(tiled_r2r, tRR_rD_src, tRR_rD_dst);
+        }
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < size(tRS_rD_frg); ++i) {
+          tRS_rD_frg(i) = cutlass::NumericArrayConverter<SmemElementD, RegisterElementD, FragmentSize>{}(tRS_rCompute_frg(i));
+        }
+
+        // Copy tile from register to smem
+        if constexpr (is_destination_supported) {
+          copy(tiled_r2s, tRS_rD, tRS_sD(_,_,_,store_pipe_producer_state.index()));
+        }
+
+        // Post reduction, pre TMA store callback entry point
+        constexpr bool issue_smem_store = true; // No smem store predication
+        cst_callbacks.postreduce(epi_m, epi_n, store_pipe_producer_state.count(), issue_smem_store);
+
+        if constexpr (not DelayTmaStore) {
+          // Issue TMA stores for this subtile
+          tma_store_fn(epi_m, epi_n);
+        }
+
+        cst_callbacks.end_loop(epi_m, epi_n);
+
+      } // for epi_m
+    } // for epi_n
+
+    if constexpr (DelayTmaStore) {
+      // Issue TMA stores for the last subtile
+      tma_store_fn(epi_m_prev, epi_n_prev);
+    }
+
+    // Post-loop fusion callback entry point
+    cst_callbacks.end();
+
+    return cute::make_tuple(load_pipe_consumer_state, store_pipe_producer_state);
+  }
+
+  CUTLASS_DEVICE auto
+  store_tail(
+      LoadPipeline load_pipeline,
+      LoadPipelineState load_pipe_consumer_state,
+      StorePipeline store_pipeline,
+      StorePipelineState store_pipe_producer_state) {
+    // Store-phase tail: wait for all TMA stores to complete (producer_tail on the store pipeline)
+    store_pipeline.producer_tail(store_pipe_producer_state);
+    // Reset the issued-store counter that the TMA store sequence (tma_store_fn) increments on every call
+    issued_stores = 0;
+
+    if constexpr (ReuseSmemC) {
+      if (fusion_callbacks.is_producer_load_needed()) { // same condition that guards consumer_release in tma_store_fn
+        // Issue releases on up to StagesD-1 previously issued TMA stores, capped by the load pipe increment for the CTA tile
+        constexpr int release_stages = cute::min(StorePipeline::UnacquiredStages, get_load_pipe_increment(CtaTileMNK{}));
+        CUTLASS_PRAGMA_UNROLL
+        for (int stage = 0; stage < release_stages; ++stage) {
+          load_pipeline.consumer_release(load_pipe_consumer_state);
+          ++load_pipe_consumer_state;
+        }
+      }
+    }
+
+    return cute::make_tuple(load_pipe_consumer_state, store_pipe_producer_state); // (load consumer state, store producer state)
+  }
+
+private:
+  Params const& params;
+  FusionCallbacks fusion_callbacks;
+  int issued_stores = 0;
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace collective
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized_bias_elementwise.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized_bias_elementwise.hpp
new file mode 100755
index 000000000..974904008
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized_bias_elementwise.hpp
@@ -0,0 +1,164 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Functor performing pipelined epilogues with bias add and elementwise activation functions.
+         This collective is now DEPRECATED, will be removed in the next release. Use EVT instead.
+*/
+
+#pragma once
+
+#include "sm90_epilogue_tma_warpspecialized.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace collective {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  int StagesC_,
+  int StagesD_,
+  int FragmentSize_,
+  class BlockTileShape_,    //     (BLK_M,BLK_N,BLK_K)
+  class EpilogueTileShape_, // (EPI_TILE_M,EPI_TILE_N)
+  class ElementC_,
+  class StrideC_,
+  class ElementD_,
+  class StrideD_,
+  class FusionCallbacks_,
+  class CopyOpG2S_,
+  class SmemLayoutAtomC_,
+  class CopyOpS2R_,
+  class CopyOpS2G_,
+  class SmemLayoutAtomD_,
+  class CopyOpR2S_,
+  class CopyAtomC_,
+  class CopyOpR2R_
+>
+class Sm90EpilogueTmaWarpSpecializedBiasElementwise // DEPRECATED wrapper (see file brief): CollectiveEpilogue plus a legacy Arguments struct
+  : public CollectiveEpilogue<
+      Sm90TmaWarpSpecialized<StagesC_, StagesD_, FragmentSize_, false, false>, // ReuseSmemC = false, DelayTmaStore = false
+      BlockTileShape_,
+      EpilogueTileShape_,
+      ElementC_,
+      StrideC_,
+      ElementD_,
+      StrideD_,
+      FusionCallbacks_,
+      CopyOpG2S_,
+      SmemLayoutAtomC_,
+      CopyOpS2R_,
+      CopyOpS2G_,
+      SmemLayoutAtomD_,
+      CopyOpR2S_,
+      CopyAtomC_,
+      CopyOpR2R_
+> {
+private:
+  using Impl = // shorthand for the base class; the argument list below repeats the base-specifier exactly
+    CollectiveEpilogue<
+      Sm90TmaWarpSpecialized<StagesC_, StagesD_, FragmentSize_, false, false>, // ReuseSmemC = false, DelayTmaStore = false
+      BlockTileShape_,
+      EpilogueTileShape_,
+      ElementC_,
+      StrideC_,
+      ElementD_,
+      StrideD_,
+      FusionCallbacks_,
+      CopyOpG2S_,
+      SmemLayoutAtomC_,
+      CopyOpS2R_,
+      CopyOpS2G_,
+      SmemLayoutAtomD_,
+      CopyOpR2S_,
+      CopyAtomC_,
+      CopyOpR2R_
+    >;
+public:
+  using DispatchPolicy = Sm90TmaWarpSpecializedBiasElementwise<StagesC_, StagesD_, FragmentSize_>; // deprecated policy tag from dispatch_policy.hpp
+  using ElementCompute = typename Impl::ThreadEpilogueOp::ElementCompute;
+  using ElementBias = typename Impl::ThreadEpilogueOp::ElementBias;
+  using ElementT = typename Impl::ThreadEpilogueOp::ElementAux; // "T" is the thread op's auxiliary element type
+
+  // Inherit the constructors of the underlying collective (Impl)
+  using Impl::Impl;
+
+  // Host side epilogue arguments (deprecated flat layout); implicitly convertible to Impl::Arguments
+  struct [[deprecated("use Sm90TmaWarpSpecialized Arguments instead")]]
+  Arguments {
+    struct ThreadArgs {
+      ElementCompute alpha{1};
+      ElementCompute beta{0};
+      ElementCompute const *alpha_ptr{nullptr};
+      ElementCompute const *beta_ptr{nullptr};
+    } thread;
+    ElementC_ const* ptr_C{nullptr};
+    StrideC_ dC{};
+    ElementD_* ptr_D{nullptr};
+    StrideD_ dD{};
+    ElementBias const* ptr_Bias{nullptr};
+    ElementT* ptr_T{nullptr};
+
+    CUTLASS_HOST_DEVICE
+    operator typename Impl::Arguments() const { // copies each field into the layout the wrapped collective expects
+      typename Impl::Arguments arguments;
+      arguments.thread.alpha = thread.alpha;
+      arguments.thread.beta = thread.beta;
+      arguments.thread.alpha_ptr = thread.alpha_ptr;
+      arguments.thread.beta_ptr = thread.beta_ptr;
+      if constexpr (not cute::is_void_v<ElementBias>) { // bias pointer is forwarded only when ElementBias is non-void
+        arguments.thread.bias_ptr = ptr_Bias;
+      }
+      if constexpr (not cute::is_void_v<ElementT>) { // aux pointer and stride are forwarded only when ElementT is non-void
+        arguments.thread.aux_ptr = ptr_T;
+        arguments.thread.dAux = dD; // the aux tensor is given D's stride
+      }
+      arguments.ptr_C = ptr_C;
+      arguments.dC = dC;
+      arguments.ptr_D = ptr_D;
+      arguments.dD = dD;
+
+      return arguments;
+    }
+  };
+
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace collective
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/dispatch_policy.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/dispatch_policy.hpp
new file mode 100755
index 000000000..f829a2ff5
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/dispatch_policy.hpp
@@ -0,0 +1,195 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/epilogue/thread/scale_type.h"
+
+//////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::epilogue {
+
+//////////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Builder Epilogue Schedules
+//
+//////////////////////////////////////////////////////////////////////////////
+
+struct PtrArrayDefault {}; // the schedules in this section are tag types: empty, or holding only compile-time constants
+struct EpilogueSimtVectorized {};
+struct EpiloguePtrArraySimtVectorized {};
+struct NoSmemWarpSpecialized {};
+struct PtrArrayNoSmemWarpSpecialized {};
+struct PtrArrayPlanarComplexNoSmemWarpSpecialized {};
+struct TmaWarpSpecialized {};
+struct TmaWarpSpecializedCooperative {};
+struct PtrArrayTmaWarpSpecializedCooperative {
+  static constexpr int NumEpilogueWarpGroups = 2; // same value as PtrArrayTmaWarpSpecializedPingpong
+};
+
+// Standard warp specialized epilogue (ptr-array schedule): one epilogue warp group
+struct PtrArrayTmaWarpSpecialized {
+  static constexpr int NumEpilogueWarpGroups = 1;
+};
+
+// Pingpong kernel epilogue (ptr-array schedule): two epilogue warp groups
+struct PtrArrayTmaWarpSpecializedPingpong {
+  static constexpr int NumEpilogueWarpGroups = 2;
+};
+
+// DEPRECATED schedules, will be removed in next release; each [[deprecated]] message below names the replacement
+struct TmaWarpSpecializedElementwiseBase : public TmaWarpSpecialized {}; // non-template base of TmaWarpSpecializedElementwise
+struct TmaWarpSpecializedCooperativeElementwiseBase : public TmaWarpSpecializedCooperative {}; // non-template base of TmaWarpSpecializedCooperativeElementwise
+template <
+  template <class T> class ActivationFunctor_,
+  thread::ScaleType::Kind Scale_ = thread::ScaleType::Default,
+  FloatRoundStyle Round_ = FloatRoundStyle::round_to_nearest
+>
+struct [[deprecated("Use TmaWarpSpecialized with fusion::LinCombEltAct instead")]]
+TmaWarpSpecializedElementwise : public TmaWarpSpecializedElementwiseBase { // re-exports its template arguments as members
+  template <class T>
+  using ActivationFunctor = ActivationFunctor_<T>; // alias template forwarding to the supplied functor template
+  static constexpr thread::ScaleType::Kind Scale = Scale_; // thread::ScaleType::Default unless overridden
+  static constexpr FloatRoundStyle Round = Round_; // FloatRoundStyle::round_to_nearest unless overridden
+};
+
+template <
+  template <class T> class ActivationFunctor_,
+  thread::ScaleType::Kind Scale_ = thread::ScaleType::Default,
+  FloatRoundStyle Round_ = FloatRoundStyle::round_to_nearest
+>
+struct [[deprecated("Use TmaWarpSpecializedCooperative with fusion::LinCombEltAct instead")]]
+TmaWarpSpecializedCooperativeElementwise : public TmaWarpSpecializedCooperativeElementwiseBase { // re-exports its template arguments as members
+  template <class T>
+  using ActivationFunctor = ActivationFunctor_<T>; // alias template forwarding to the supplied functor template
+  static constexpr thread::ScaleType::Kind Scale = Scale_; // thread::ScaleType::Default unless overridden
+  static constexpr FloatRoundStyle Round = Round_; // FloatRoundStyle::round_to_nearest unless overridden
+};
+
+struct TmaWarpSpecializedBiasElementwiseBase : public TmaWarpSpecialized{}; // non-template base of TmaWarpSpecializedBiasElementwise
+struct TmaWarpSpecializedCooperativeBiasElementwiseBase : public TmaWarpSpecializedCooperative {}; // non-template base of TmaWarpSpecializedCooperativeBiasElementwise
+
+template <
+  template <class T> class ActivationFunctor_,
+  class ElementT_,
+  template <class T> class BiasOp_,
+  bool StoreT_,
+  class ElementBias_
+>
+struct [[deprecated("Use TmaWarpSpecialized with fusion::LinCombPerRowBiasEltActAux instead")]]
+TmaWarpSpecializedBiasElementwise : public TmaWarpSpecializedBiasElementwiseBase { // re-exports its template arguments as members
+  template <class T>
+  using ActivationFunctor = ActivationFunctor_<T>; // alias template forwarding to the supplied functor template
+  using ElementT = ElementT_;
+
+  template <class T>
+  using BiasOp = BiasOp_<T>; // alias template forwarding to the supplied bias op template
+
+  static constexpr bool StoreT = StoreT_; // NOTE(review): presumably whether the T tensor is written out — confirm
+  using ElementBias = ElementBias_;
+};
+
+template <
+  template <class T> class ActivationFunctor_,
+  class ElementT_,
+  template <class T> class BiasOp_,
+  bool StoreT_,
+  class ElementBias_
+>
+struct [[deprecated("Use TmaWarpSpecializedCooperative with fusion::LinCombPerRowBiasEltActAux instead")]]
+TmaWarpSpecializedCooperativeBiasElementwise : public TmaWarpSpecializedCooperativeBiasElementwiseBase { // re-exports its template arguments as members
+  template <class T>
+  using ActivationFunctor = ActivationFunctor_<T>; // alias template forwarding to the supplied functor template
+
+  using ElementT = ElementT_;
+
+  template <class T>
+  using BiasOp = BiasOp_<T>; // alias template forwarding to the supplied bias op template
+
+  static constexpr bool StoreT = StoreT_; // NOTE(review): presumably whether the T tensor is written out — confirm
+  using ElementBias = ElementBias_;
+};
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Collective Dispatch Policies
+//
+//////////////////////////////////////////////////////////////////////////////
+
+template<
+  int StagesC_,
+  int StagesD_,
+  int FragmentSize_,
+  bool ReuseSmemC_,
+  bool DelayTmaStore_
+>
+struct Sm90TmaWarpSpecialized { // compile-time configuration only: each template argument is re-exported as a constant
+  constexpr static int StagesC = StagesC_; // presumably the number of C smem buffers (load stages) — confirm against the collective
+  constexpr static int StagesD = StagesD_; // presumably the max number of committed TMA stores in flight — confirm against the collective
+  constexpr static int FragmentSize = FragmentSize_; // presumably the element count of each register fragment Array — confirm
+  constexpr static bool ReuseSmemC = ReuseSmemC_; // selects the collective's smem-reuse code paths (if constexpr (ReuseSmemC))
+  constexpr static bool DelayTmaStore = DelayTmaStore_; // selects the collective's delayed-TMA-store code paths (if constexpr (DelayTmaStore))
+};
+
+template<
+  int StagesC_,
+  int StagesD_,
+  int FragmentSize_,
+  bool ReuseSmemC_,
+  bool DelayTmaStore_,
+  int NumEpilogueWarpGroups_
+>
+struct Sm90PtrArrayTmaWarpSpecialized { // same parameter list as Sm90TmaWarpSpecialized plus NumEpilogueWarpGroups_
+  constexpr static int StagesC = StagesC_;
+  constexpr static int StagesD = StagesD_;
+  constexpr static int FragmentSize = FragmentSize_;
+  constexpr static bool ReuseSmemC = ReuseSmemC_;
+  constexpr static bool DelayTmaStore = DelayTmaStore_;
+  constexpr static int NumEpilogueWarpGroups = NumEpilogueWarpGroups_; // same constant name the PtrArrayTmaWarpSpecialized* schedule tags declare
+};
+
+// DEPRECATED policies, will be removed in next release (DispatchPolicy of Sm90EpilogueTmaWarpSpecializedBiasElementwise)
+template<
+  int StagesC_,
+  int StagesD_,
+  int FragmentSize_ = 2
+>
+struct Sm90TmaWarpSpecializedBiasElementwise { // compile-time configuration only: each template argument is re-exported as a constant
+  constexpr static int StagesC = StagesC_;
+  constexpr static int StagesD = StagesD_;
+  constexpr static int FragmentSize = FragmentSize_; // 2 unless the third template argument is given
+};
+
+//////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::epilogue
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/callbacks.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/callbacks.hpp
new file mode 100755
index 000000000..9ee37234c
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/callbacks.hpp
@@ -0,0 +1,89 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include "cutlass/detail/dependent_false.hpp"
+#include "cutlass/epilogue/fusion/operations.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::epilogue::fusion {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Dispatch interface for epilogue fusion callbacks
+// For visitor fusions, this is just a convenience wrapper to provide metadata and non-nested args.
+// It is also valid to just pass visitor callbacks directly to the collective, e.g. fusion::Sm90LinearCombination,
+// provided the collective supports a visitor callbacks interface. This is useful for implementing custom fusions.
+template <
+  class DispatchPolicy,  // specialize on collective's dispatch policy since callbacks API will depend on collective's algorithm
+  class Operation,       // the fusion operation being performed, e.g. fusion::LinearCombination
+  class CtaTile_MNK,     // computed tile per CTA
+  class EpilogueTile_MN, // epilogue subtile size
+  class... Args          // callbacks implementation dependent args (e.g. copy atoms, smem layouts)
+>
+struct FusionCallbacks { // primary template: instantiating it means no specialization matched, which is a compile error
+  static_assert(cutlass::detail::dependent_false<DispatchPolicy, Operation>, "Could not find a callbacks specialization.");
+};
+
+// Metadata helper to handle custom EVTs or other non-FusionCallbacks types
+template <class T>
+struct FusionCallbacksTraits { // primary template: used when T is not a FusionCallbacks<...> instantiation
+  using DispatchPolicy = void; // void marks metadata this fallback does not provide
+  using Operation = T; // the type itself stands in as the operation
+  using CtaTile_MNK = void;
+  using EpilogueTile_MN = void;
+  using ElementCompute = void;
+};
+
+template <
+  class DispatchPolicy_,
+  class Operation_,
+  class CtaTile_MNK_,
+  class EpilogueTile_MN_,
+  class... Args
+>
+struct FusionCallbacksTraits< // partial specialization: recovers the template arguments of a FusionCallbacks instantiation
+  FusionCallbacks<DispatchPolicy_, Operation_, CtaTile_MNK_, EpilogueTile_MN_, Args...>
+> {
+  using DispatchPolicy = DispatchPolicy_;
+  using Operation = Operation_;
+  using CtaTile_MNK = CtaTile_MNK_;
+  using EpilogueTile_MN = EpilogueTile_MN_;
+  using ElementCompute = typename Operation::ElementCompute; // requires Operation_ to declare a nested ElementCompute type
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::epilogue::fusion
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/operations.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/operations.hpp
new file mode 100755
index 000000000..3aed32710
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/operations.hpp
@@ -0,0 +1,351 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include <cutlass/numeric_conversion.h>
+#include <cutlass/layout/matrix.h>
+#include <cute/numeric/numeric_types.hpp>
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::epilogue::fusion {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Fusion Operations
+// Template args must not be implementation dependent
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct FusionOperation {
+  // Default metadata types/queries; derived operations shadow the members they support (can be overridden).
+  using ElementOutput = void;
+  using ElementCompute = void;
+  // Source operand (C)
+  using ElementSource = void;
+  static constexpr bool IsSourceSupported = false;
+  // Scalars (alpha / beta / scale factors)
+  using ElementScalar = void;
+  static constexpr int AlignmentScalar = 0;
+  static constexpr bool IsScaleFactorSupported = false;
+  static constexpr bool IsPerRowScaleSupported = false;
+  using ElementBias = void;  // bias operand
+  static constexpr int AlignmentBias = 0;
+  static constexpr bool IsPerRowBiasSupported = false;
+  static constexpr bool IsPerColBiasSupported = false;  // default for the flag LinCombPerColBias sets to true
+  static constexpr bool IsDePerRowBiasSupported = false;
+  // Element-wise activation and its backward (De) form
+  using ActivationFn = void;
+  static constexpr bool IsEltActSupported = false;
+  static constexpr bool IsDeEltActSupported = false;
+  // Auxiliary tensor (input or output)
+  using ElementAux = void;
+  using GmemLayoutTagAux = void;
+  static constexpr int AlignmentAux = 0;
+  static constexpr bool IsAuxOutSupported = false;
+  static constexpr bool IsAuxInSupported = false;
+  // Absolute-maximum reduction
+  using ElementAmax = void;
+  static constexpr bool IsAbsMaxSupported = false;
+};
+
+// Scaled accumulator: D = alpha * acc
+template<
+  class OutputT,
+  class ComputeT,
+  class ScalarT = ComputeT,
+  FloatRoundStyle Rounding = FloatRoundStyle::round_to_nearest
+>
+struct ScaledAcc : FusionOperation {
+  static constexpr FloatRoundStyle RoundStyle = Rounding;
+  static constexpr int AlignmentScalar = 1;
+  using ElementScalar = ScalarT;
+  using ElementCompute = ComputeT;
+  using ElementOutput = OutputT;
+};
+
+// Linear combination with a source operand: D = alpha * acc + beta * C
+template<
+  class OutputT,
+  class ComputeT,
+  class SourceT = OutputT,
+  class ScalarT = ComputeT,
+  FloatRoundStyle Rounding = FloatRoundStyle::round_to_nearest
+>
+struct LinearCombination
+    : ScaledAcc<OutputT, ComputeT, ScalarT, Rounding> {
+  static constexpr bool IsSourceSupported = true;
+  using ElementSource = SourceT;  // element type of C
+};
+
+// Element-wise activation of a linear combination: D = activation(alpha * acc + beta * C)
+template<
+  template <class> class Activation,
+  class OutputT,
+  class ComputeT,
+  class SourceT = OutputT,
+  class ScalarT = ComputeT,
+  FloatRoundStyle Rounding = FloatRoundStyle::round_to_nearest
+>
+struct LinCombEltAct
+    : LinearCombination<OutputT, ComputeT, SourceT, ScalarT, Rounding> {
+  static constexpr bool IsEltActSupported = true;
+  using ActivationFn = Activation<ComputeT>;  // the activation is instantiated on the compute type
+};
+
+// D = softmax(top_k(alpha * acc + beta * C)); the top-k count lives only in the type, no member exposes it
+template<
+  int K,
+  class OutputT,
+  class ComputeT,
+  class SourceT = OutputT,
+  class ScalarT = ComputeT,
+  FloatRoundStyle Rounding = FloatRoundStyle::round_to_nearest
+>
+struct LinCombTopKSoftmaxCol
+    : LinearCombination<OutputT, ComputeT, SourceT, ScalarT, Rounding> {
+};
+
+
+// Linear combination plus a per-row bias: D = alpha * acc + beta * C + per-row bias
+template<
+  class OutputT,
+  class ComputeT,
+  class BiasT = OutputT,
+  class SourceT = OutputT,
+  class ScalarT = ComputeT,
+  int BiasAlignment = 128 / cute::sizeof_bits_v<BiasT>,  // default: 128 divided by the bit width of the bias element
+  FloatRoundStyle Rounding = FloatRoundStyle::round_to_nearest
+>
+struct LinCombPerRowBias
+    : LinearCombination<OutputT, ComputeT, SourceT, ScalarT, Rounding> {
+  static constexpr bool IsPerRowBiasSupported = true;
+  static constexpr int AlignmentBias = BiasAlignment;
+  using ElementBias = BiasT;
+};
+
+// Linear combination plus a per-column bias: D = alpha * acc + beta * C + per-column bias
+template<
+  class OutputT,
+  class ComputeT,
+  class BiasT = OutputT,
+  class SourceT = OutputT,
+  class ScalarT = ComputeT,
+  int BiasAlignment = 128 / cute::sizeof_bits_v<BiasT>,  // default: 128 divided by the bit width of the bias element
+  FloatRoundStyle Rounding = FloatRoundStyle::round_to_nearest
+>
+struct LinCombPerColBias
+    : LinearCombination<OutputT, ComputeT, SourceT, ScalarT, Rounding> {
+  static constexpr bool IsPerColBiasSupported = true;
+  static constexpr int AlignmentBias = BiasAlignment;
+  using ElementBias = BiasT;
+};
+
+// Activation on top of LinCombPerRowBias: D = activation(alpha * acc + beta * C + per-row bias)
+template<
+  template <class> class Activation,
+  class OutputT,
+  class ComputeT,
+  class BiasT = OutputT,
+  class SourceT = OutputT,
+  class ScalarT = ComputeT,
+  int BiasAlignment = 128 / cute::sizeof_bits_v<BiasT>,
+  FloatRoundStyle Rounding = FloatRoundStyle::round_to_nearest
+>
+struct LinCombPerRowBiasEltAct
+    : LinCombPerRowBias<OutputT, ComputeT,
+        BiasT, SourceT, ScalarT, BiasAlignment, Rounding> {
+  static constexpr bool IsEltActSupported = true;
+  using ActivationFn = Activation<ComputeT>;
+};
+
+// D   = activation(alpha * acc + beta * C + per-row bias)
+// aux = alpha * acc + beta * C + per-row bias   (the pre-activation value, exposed as an aux output)
+template<
+  class AuxLayoutTag,
+  template <class> class Activation,
+  class OutputT,
+  class ComputeT,
+  class AuxT = OutputT,
+  class BiasT = OutputT,
+  class SourceT = OutputT,
+  class ScalarT = ComputeT,
+  int AuxAlignment = 128 / cute::sizeof_bits_v<AuxT>,
+  int BiasAlignment = 128 / cute::sizeof_bits_v<BiasT>,
+  FloatRoundStyle Rounding = FloatRoundStyle::round_to_nearest
+>
+struct LinCombPerRowBiasEltActAux
+    : LinCombPerRowBiasEltAct<Activation, OutputT, ComputeT,
+        BiasT, SourceT, ScalarT, BiasAlignment, Rounding> {
+  static constexpr bool IsAuxOutSupported = true;
+  static constexpr int AlignmentAux = AuxAlignment;
+  using GmemLayoutTagAux = AuxLayoutTag;
+  using ElementAux = AuxT;
+};
+
+// Per-row scaling variant: D = activation(per-row alpha * acc + per-row beta * C + per-row bias)
+template<
+  template <class> class Activation,
+  class OutputT,
+  class ComputeT,
+  class BiasT = OutputT,
+  class SourceT = OutputT,
+  class ScalarT = ComputeT, // element type of the per-row alpha/beta
+  int BiasAlignment = 128 / cute::sizeof_bits_v<BiasT>,
+  int ScalarAlignment = 128 / cute::sizeof_bits_v<ScalarT>,
+  FloatRoundStyle Rounding = FloatRoundStyle::round_to_nearest
+>
+struct PerRowLinCombPerRowBiasEltAct
+    : LinCombPerRowBiasEltAct<Activation, OutputT, ComputeT,
+        BiasT, SourceT, ScalarT, BiasAlignment, Rounding> {
+  static constexpr bool IsPerRowScaleSupported = true;
+  static constexpr int AlignmentScalar = ScalarAlignment;  // shadows the AlignmentScalar inherited through the base chain
+};
+
+// Z = scale_a * scale_b * alpha * acc + beta * scale_c * C + per-row bias
+// when D is fp8:
+//   D = scale_d * activation(Z)
+// otherwise:
+//   D = activation(Z)
+template<
+  template <class> class Activation,
+  class OutputT,
+  class ComputeT,
+  class BiasT = OutputT,
+  class SourceT = OutputT,
+  class ScalarT = ComputeT,
+  int BiasAlignment = 128 / cute::sizeof_bits_v<BiasT>,
+  FloatRoundStyle Rounding = FloatRoundStyle::round_to_nearest
+>
+struct ScaledLinCombPerRowBiasEltAct
+    : LinCombPerRowBiasEltAct<Activation, OutputT, ComputeT,
+        BiasT, SourceT, ScalarT, BiasAlignment, Rounding> {
+  static constexpr bool IsScaleFactorSupported = true;  // the only trait added on top of the base operation
+};
+
+// Z = scale_a * scale_b * alpha * acc + scale_c * beta * C + per-row bias
+// when D is fp8:
+//   amax_d = max(abs(elements in activation(Z)))
+//   D = scale_d * activation(Z)
+// otherwise:
+//   D = activation(Z)
+// when Aux is fp8:
+//   amax_aux = max(abs(elements in Z))
+//   Aux = scale_aux * Z
+// otherwise:
+//   Aux = Z
+template<
+  class AuxLayoutTag,
+  template <class> class Activation,
+  class OutputT,
+  class ComputeT,
+  class AuxT = OutputT,
+  class AmaxT = ComputeT,
+  class BiasT = OutputT,
+  class SourceT = OutputT,
+  class ScalarT = ComputeT,
+  int AuxAlignment = 128 / cute::sizeof_bits_v<AuxT>,
+  int BiasAlignment = 128 / cute::sizeof_bits_v<BiasT>,
+  FloatRoundStyle Rounding = FloatRoundStyle::round_to_nearest
+>
+struct ScaledLinCombPerRowBiasEltActAmaxAux
+    : ScaledLinCombPerRowBiasEltAct<Activation, OutputT, ComputeT,
+        BiasT, SourceT, ScalarT, BiasAlignment, Rounding> {
+  // auxiliary output tensor
+  static constexpr bool IsAuxOutSupported = true;
+  static constexpr int AlignmentAux = AuxAlignment;
+  using GmemLayoutTagAux = AuxLayoutTag;
+  using ElementAux = AuxT;
+  static constexpr bool IsAbsMaxSupported = true;  // abs-max outputs (amax_d / amax_aux above)
+  using ElementAmax = AmaxT;
+};
+
+// Backward of an element-wise activation, with Z read from the aux input:
+//   Z = Aux;  dY = alpha * acc + beta * C
+//   D = d_activation(dY, Z)
+template<
+  class AuxLayoutTag,
+  template <class> class Activation,
+  class OutputT,
+  class ComputeT,
+  class AuxT = OutputT,
+  class SourceT = OutputT,
+  class ScalarT = ComputeT,
+  int AuxAlignment = 128 / cute::sizeof_bits_v<AuxT>,
+  FloatRoundStyle Rounding = FloatRoundStyle::round_to_nearest
+>
+struct LinCombDeEltAct
+    : LinearCombination<OutputT, ComputeT, SourceT, ScalarT, Rounding> {
+  // auxiliary input tensor (Z)
+  static constexpr bool IsAuxInSupported = true;
+  static constexpr int AlignmentAux = AuxAlignment;
+  using GmemLayoutTagAux = AuxLayoutTag;
+  using ElementAux = AuxT;
+  static constexpr bool IsDeEltActSupported = true;  // activation is used in its backward form
+  using ActivationFn = Activation<ComputeT>;
+};
+
+// LinCombDeEltAct plus a per-row bias gradient:
+//   Z = Aux;  dY = alpha * acc + beta * C
+//   D = d_activation(dY, Z)
+//   dBias = sum of columns of D
+template<
+  class AuxLayoutTag,
+  template <class> class Activation,
+  class OutputT,
+  class ComputeT,
+  class AuxT = OutputT,
+  class BiasT = ComputeT,
+  class SourceT = OutputT,
+  class ScalarT = ComputeT,
+  int AuxAlignment = 128 / cute::sizeof_bits_v<AuxT>,
+  int BiasAlignment = 128 / cute::sizeof_bits_v<BiasT>,
+  FloatRoundStyle Rounding = FloatRoundStyle::round_to_nearest
+>
+struct LinCombDeEltActDePerRowBias
+    : LinCombDeEltAct<AuxLayoutTag, Activation, OutputT, ComputeT,
+        AuxT, SourceT, ScalarT, AuxAlignment, Rounding> {
+  static constexpr bool IsDePerRowBiasSupported = true;
+  static constexpr int AlignmentBias = BiasAlignment;
+  using ElementBias = BiasT;  // note: defaults to the compute type, not the output type
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::epilogue::fusion
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp
new file mode 100755
index 000000000..e028846a4
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp
@@ -0,0 +1,1787 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+  \brief Fusion callbacks specializations for the sm90 TMA warp-specialized (ws) epilogue
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cute/tensor.hpp"
+
+#include "cutlass/epilogue/dispatch_policy.hpp"
+#include "cutlass/epilogue/fusion/callbacks.hpp"
+#include "cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp"
+#include "cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp"
+#include "cutlass/epilogue/fusion/sm90_visitor_store_tma_warpspecialized.hpp"
+#include "cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp"
+
+#include "cutlass/epilogue/fusion/sm90_visitor_topk_softmax.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::epilogue::fusion {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <class Node, class... Children>
+using Sm90EVT = Sm90TreeVisitor<Node, Children...>;  // shorthand alias for Sm90TreeVisitor
+
+// Callbacks for fusion::ScaledAcc with the Sm90TmaWarpSpecialized epilogue policy: D = alpha * acc
+template <
+  int StagesC,
+  int StagesD,
+  int FragmentSize,
+  bool ReuseSmemC,
+  bool DelayTmaStore,
+  class ElementOutput,
+  class ElementCompute,
+  class ElementScalar,
+  FloatRoundStyle RoundStyle,
+  class CtaTileShapeMNK,
+  class EpilogueTile
+>
+struct FusionCallbacks<
+    epilogue::Sm90TmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmemC, DelayTmaStore>,
+    fusion::ScaledAcc<ElementOutput, ElementCompute, ElementScalar, RoundStyle>,
+    CtaTileShapeMNK,
+    EpilogueTile
+> : Sm90EVT<Sm90Compute<multiplies, ElementOutput, ElementCompute, RoundStyle>, // root : multiplies
+      Sm90ScalarBroadcast<ElementScalar, Stride<_0,_0,int64_t>>, // child 0 : alpha
+      Sm90AccFetch // child 1 : acc
+    > {
+  using Impl = // spelled identically to the base class above
+    Sm90EVT<Sm90Compute<multiplies, ElementOutput, ElementCompute, RoundStyle>,
+      Sm90ScalarBroadcast<ElementScalar, Stride<_0,_0,int64_t>>,
+      Sm90AccFetch
+    >;
+  using Operation = fusion::ScaledAcc<ElementOutput, ElementCompute, ElementScalar, RoundStyle>;
+
+  struct Arguments {
+    // Give a name and flat ordering to the fusion callback args
+    ElementScalar alpha = ElementScalar(1);
+    ElementScalar beta = ElementScalar(0); // declared but not forwarded by the conversion below
+    ElementScalar const* alpha_ptr = nullptr;
+    ElementScalar const* beta_ptr = nullptr; // declared but not forwarded by the conversion below
+
+    using StrideAlpha = Stride<_0,_0,int64_t>;
+    StrideAlpha dAlpha = {_0{}, _0{}, 0};
+
+    // Conversion to the args expected by the visitor implementation
+    // to_underlying_arguments will implicitly call this
+    operator typename Impl::Arguments() const {
+      return
+        {    // binary op : alpha * acc
+          {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha
+          {},                     // leaf args : acc
+          {} // binary args : multiplies
+        };   // end binary op
+    }
+  };
+
+  // Ctor inheritance
+  using Impl::Impl;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Visitor tree for D = alpha * acc + beta * C : multiply_add(beta, C, multiplies(alpha, acc))
+template<
+  class OutputT,
+  class ComputeT,
+  class SourceT = OutputT,
+  class ScalarT = ComputeT,
+  FloatRoundStyle Rounding = FloatRoundStyle::round_to_nearest
+>
+using Sm90LinearCombination =
+  Sm90EVT<Sm90Compute<homogeneous_multiply_add, OutputT, ComputeT, Rounding>, // root : beta * C + (alpha * acc)
+    Sm90ScalarBroadcast<ScalarT, Stride<_0,_0,int64_t>>, // child 0 : beta
+    Sm90SrcFetch<SourceT>, // child 1 : C
+    Sm90EVT<Sm90Compute<multiplies, ComputeT, ComputeT, Rounding>, // child 2 : alpha * acc
+      Sm90ScalarBroadcast<ScalarT, Stride<_0,_0,int64_t>>, // alpha
+      Sm90AccFetch // acc
+    >
+  >;
+
+template <
+  int StagesC,
+  int StagesD,
+  int FragmentSize,
+  bool ReuseSmemC,
+  bool DelayTmaStore,
+  class ElementOutput,
+  class ElementCompute,
+  class ElementSource,
+  class ElementScalar,
+  FloatRoundStyle RoundStyle,
+  class CtaTileShapeMNK,
+  class EpilogueTile
+>
+struct FusionCallbacks<
+    epilogue::Sm90TmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmemC, DelayTmaStore>,
+    fusion::LinearCombination<ElementOutput, ElementCompute, ElementSource, ElementScalar, RoundStyle>,
+    CtaTileShapeMNK,
+    EpilogueTile
+> : Sm90LinearCombination<typename cutlass::detail::get_unpacked_element_type<ElementOutput>::type, ElementCompute, ElementSource, ElementScalar, RoundStyle> {
+  // Callbacks for fusion::LinearCombination (D = alpha * acc + beta * C); Impl repeats the base class type.
+  using Impl = Sm90LinearCombination<typename cutlass::detail::get_unpacked_element_type<ElementOutput>::type, ElementCompute, ElementSource, ElementScalar, RoundStyle>;
+  using Operation = fusion::LinearCombination<ElementOutput, ElementCompute, ElementSource, ElementScalar, RoundStyle>;
+
+  struct Arguments {
+    ElementScalar alpha = ElementScalar(1);
+    ElementScalar beta = ElementScalar(0);
+    ElementScalar const* alpha_ptr = nullptr; // forwarded to the alpha leaf next to the value
+    ElementScalar const* beta_ptr = nullptr;  // forwarded to the beta leaf next to the value
+
+    using StrideAlpha = Stride<_0,_0,int64_t>;
+    using StrideBeta  = Stride<_0,_0,int64_t>;
+    StrideAlpha dAlpha = {_0{}, _0{}, 0};
+    StrideBeta  dBeta  = {_0{}, _0{}, 0};
+    // Flat-to-tree conversion: the brace nesting follows the child order of Sm90LinearCombination.
+    operator typename Impl::Arguments() const {
+      return
+        {    // ternary op : beta * C + (alpha * acc)
+          {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta
+          {},                   // leaf args : C
+          {                     // binary op : alpha * acc
+            {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha
+            {},                     // leaf args : acc
+            {}                  // binary args : multiplies
+          },                    // end binary op
+          {} // ternary args : multiply_add
+        };   // end ternary op
+    }
+  };
+
+  // Ctor inheritance
+  using Impl::Impl;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Visitor tree for D = alpha * acc + beta * C, where alpha and beta can be vectors for each batch
+template<
+  class OutputT,
+  class ComputeT,
+  class SourceT = OutputT,
+  class ScalarT = ComputeT,
+  FloatRoundStyle Rounding = FloatRoundStyle::round_to_nearest
+>
+using Sm90LinearCombinationPtrArray =
+  Sm90EVT<Sm90Compute<homogeneous_multiply_add, OutputT, ComputeT, Rounding>, // root : beta * C + (alpha * acc)
+    Sm90ScalarBroadcastPtrArray<ScalarT, Stride<_0,_0,int64_t>>, // child 0 : beta
+    Sm90SrcFetch<SourceT>, // child 1 : C
+    Sm90EVT<Sm90Compute<multiplies, ComputeT, ComputeT, Rounding>, // child 2 : alpha * acc
+      Sm90ScalarBroadcastPtrArray<ScalarT, Stride<_0,_0,int64_t>>, // alpha
+      Sm90AccFetch // acc
+    >
+  >;
+
+template <
+  int StagesC,
+  int StagesD,
+  int FragmentSize,
+  bool ReuseSmemC,
+  bool DelayTmaStore,
+  int NumEpilogueWarpGroups,
+  class ElementOutput,
+  class ElementCompute,
+  class ElementSource,
+  class ElementScalar,
+  FloatRoundStyle RoundStyle,
+  class CtaTileShapeMNK,
+  class EpilogueTile
+>
+struct FusionCallbacks<
+    epilogue::Sm90PtrArrayTmaWarpSpecialized<StagesC, 
+                                             StagesD, 
+                                             FragmentSize, 
+                                             ReuseSmemC, 
+                                             DelayTmaStore, 
+                                             NumEpilogueWarpGroups
+                                            >,
+    fusion::LinearCombination<ElementOutput, ElementCompute, ElementSource, ElementScalar, RoundStyle>,
+    CtaTileShapeMNK,
+    EpilogueTile
+> : Sm90LinearCombinationPtrArray<typename cutlass::detail::get_unpacked_element_type<ElementOutput>::type, ElementCompute, ElementSource, ElementScalar, RoundStyle> {
+  // Callbacks for fusion::LinearCombination under the pointer-array policy; Impl repeats the base class type.
+  using Impl = Sm90LinearCombinationPtrArray<typename cutlass::detail::get_unpacked_element_type<ElementOutput>::type, ElementCompute, ElementSource, ElementScalar, RoundStyle>;
+  using Operation = fusion::LinearCombination<ElementOutput, ElementCompute, ElementSource, ElementScalar, RoundStyle>;
+
+  struct Arguments {
+    ElementScalar alpha = ElementScalar(1);
+    ElementScalar beta = ElementScalar(0);
+    ElementScalar const* alpha_ptr = nullptr;
+    ElementScalar const* beta_ptr = nullptr;
+    ElementScalar const* const* alpha_ptr_array = nullptr; // array of alpha pointers, forwarded to the alpha leaf
+    ElementScalar const* const* beta_ptr_array = nullptr;  // array of beta pointers, forwarded to the beta leaf
+
+    using StrideAlpha = Stride<_0,_0,int64_t>;
+    using StrideBeta  = Stride<_0,_0,int64_t>;
+    StrideAlpha dAlpha = {_0{}, _0{}, 0};
+    StrideBeta  dBeta  = {_0{}, _0{}, 0};
+    // Flat-to-tree conversion: each scalar leaf receives {value, pointer, pointer array, stride}.
+    operator typename Impl::Arguments() const {
+      return
+        {    // ternary op : beta * C + (alpha * acc)
+          {{beta}, {beta_ptr}, {beta_ptr_array}, {dBeta}}, // leaf args : beta
+          {},                   // leaf args : C
+          {                     // binary op : alpha * acc
+            {{alpha}, {alpha_ptr}, {alpha_ptr_array}, {dAlpha}}, // leaf args : alpha
+            {},                     // leaf args : acc
+            {}                  // binary args : multiplies
+          },                    // end binary op
+          {} // ternary args : multiply_add
+        };   // end ternary op
+    }
+  };
+
+  // Ctor inheritance
+  using Impl::Impl;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Visitor tree for D = activation(alpha * acc + beta * C): a unary activation node over Sm90LinearCombination
+template<
+  template <class> class Activation,
+  class OutputT,
+  class ComputeT,
+  class SourceT = OutputT,
+  class ScalarT = ComputeT,
+  FloatRoundStyle Rounding = FloatRoundStyle::round_to_nearest
+>
+using Sm90LinCombEltAct =
+  Sm90EVT<Sm90Compute<Activation, OutputT, ComputeT, Rounding>, // root : activation(beta * C + (alpha * acc))
+    Sm90LinearCombination<ComputeT, ComputeT, SourceT, ScalarT, Rounding> // child : beta * C + (alpha * acc), in the compute type
+  >;
+
+template <
+  int StagesC,
+  int StagesD,
+  int FragmentSize,
+  bool ReuseSmemC,
+  bool DelayTmaStore,
+  template <class> class ActivationFn,
+  class ElementOutput,
+  class ElementCompute,
+  class ElementSource,
+  class ElementScalar,
+  FloatRoundStyle RoundStyle,
+  class CtaTileShapeMNK,
+  class EpilogueTile
+>
+struct FusionCallbacks<
+    epilogue::Sm90TmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmemC, DelayTmaStore>,
+    fusion::LinCombEltAct<ActivationFn, ElementOutput, ElementCompute, ElementSource, ElementScalar, RoundStyle>,
+    CtaTileShapeMNK,
+    EpilogueTile
+> : Sm90LinCombEltAct<ActivationFn, typename cutlass::detail::get_unpacked_element_type<ElementOutput>::type, ElementCompute, ElementSource, ElementScalar, RoundStyle> {
+  // Callbacks for fusion::LinCombEltAct. The base class must be exactly Impl: its ctors are inherited via `using Impl::Impl`.
+  using Impl = Sm90LinCombEltAct<ActivationFn, typename cutlass::detail::get_unpacked_element_type<ElementOutput>::type, ElementCompute, ElementSource, ElementScalar, RoundStyle>;
+  using Operation = fusion::LinCombEltAct<ActivationFn, ElementOutput, ElementCompute, ElementSource, ElementScalar, RoundStyle>;
+
+  struct Arguments {
+    ElementScalar alpha = ElementScalar(1);
+    ElementScalar beta = ElementScalar(0);
+    ElementScalar const* alpha_ptr = nullptr; // forwarded to the alpha leaf next to the value
+    ElementScalar const* beta_ptr = nullptr;  // forwarded to the beta leaf next to the value
+
+    using StrideAlpha = Stride<_0,_0,int64_t>;
+    using StrideBeta  = Stride<_0,_0,int64_t>;
+    StrideAlpha dAlpha = {_0{}, _0{}, 0};
+    StrideBeta  dBeta  = {_0{}, _0{}, 0};
+    // Arguments of the activation node, default-constructed unless the caller sets them
+    using ActivationArguments = typename Sm90Compute<ActivationFn, ElementOutput, ElementCompute, RoundStyle>::Arguments;
+    ActivationArguments activation = ActivationArguments();
+    // Flat-to-tree conversion: the brace nesting follows the child order of Sm90LinCombEltAct.
+    operator typename Impl::Arguments() const {
+      return
+        {    // unary op: activation(beta * C + (alpha * acc))
+          {    // ternary op : beta * C + (alpha * acc)
+            {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta
+            {},                   // leaf args : C
+            {                     // binary op : alpha * acc
+              {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha
+              {},                     // leaf args : acc
+              {}                  // binary args : multiplies
+            },                    // end binary op
+            {} // ternary args : multiply_add
+          },   // end ternary op
+          activation // unary args: activation
+        };   // end unary op
+    }
+  };
+
+  // Ctor inheritance
+  using Impl::Impl;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Visitor tree for D = activation(alpha * acc + beta * C), where alpha and beta can be vectors for each batch
+template<
+  template <class> class Activation,
+  class OutputT,
+  class ComputeT,
+  class SourceT = OutputT,
+  class ScalarT = ComputeT,
+  FloatRoundStyle Rounding = FloatRoundStyle::round_to_nearest
+>
+using Sm90LinCombEltActPtrArray =
+  Sm90EVT<Sm90Compute<Activation, OutputT, ComputeT, Rounding>, // root : activation(beta * C + (alpha * acc))
+    Sm90LinearCombinationPtrArray<ComputeT, ComputeT, SourceT, ScalarT, Rounding> // child : beta * C + (alpha * acc), in the compute type
+  >;
+
+template <
+  int StagesC,
+  int StagesD,
+  int FragmentSize,
+  bool ReuseSmemC,
+  bool DelayTmaStore,
+  int NumEpilogueWarpGroups,
+  template <class> class ActivationFn,
+  class ElementOutput,
+  class ElementCompute,
+  class ElementSource,
+  class ElementScalar,
+  FloatRoundStyle RoundStyle,
+  class CtaTileShapeMNK,
+  class EpilogueTile
+>
+struct FusionCallbacks<
+    epilogue::Sm90PtrArrayTmaWarpSpecialized<StagesC,
+                                             StagesD,
+                                             FragmentSize,
+                                             ReuseSmemC,
+                                             DelayTmaStore,
+                                             NumEpilogueWarpGroups
+                                            >,
+    fusion::LinCombEltAct<ActivationFn, ElementOutput, ElementCompute, ElementSource, ElementScalar, RoundStyle>,
+    CtaTileShapeMNK,
+    EpilogueTile
+> : Sm90LinCombEltActPtrArray<ActivationFn, typename cutlass::detail::get_unpacked_element_type<ElementOutput>::type, ElementCompute, ElementSource, ElementScalar, RoundStyle> {
+  // Pointer-array callbacks for fusion::LinCombEltAct. The base class must be exactly Impl: its ctors are inherited below.
+  using Impl = Sm90LinCombEltActPtrArray<ActivationFn, typename cutlass::detail::get_unpacked_element_type<ElementOutput>::type, ElementCompute, ElementSource, ElementScalar, RoundStyle>;
+  using Operation = fusion::LinCombEltAct<ActivationFn, ElementOutput, ElementCompute, ElementSource, ElementScalar, RoundStyle>;
+
+  struct Arguments {
+    ElementScalar alpha = ElementScalar(1);
+    ElementScalar beta = ElementScalar(0);
+    ElementScalar const* alpha_ptr = nullptr;
+    ElementScalar const* beta_ptr = nullptr;
+    ElementScalar const* const* alpha_ptr_array = nullptr; // array of alpha pointers, forwarded to the alpha leaf
+    ElementScalar const* const* beta_ptr_array = nullptr;  // array of beta pointers, forwarded to the beta leaf
+
+    using StrideAlpha = Stride<_0,_0,int64_t>;
+    using StrideBeta  = Stride<_0,_0,int64_t>;
+    StrideAlpha dAlpha = {_0{}, _0{}, 0};
+    StrideBeta  dBeta  = {_0{}, _0{}, 0};
+    // Arguments of the activation node, default-constructed unless the caller sets them
+    using ActivationArguments = typename Sm90Compute<ActivationFn, ElementOutput, ElementCompute, RoundStyle>::Arguments;
+    ActivationArguments activation = ActivationArguments();
+    // Flat-to-tree conversion: each scalar leaf receives {value, pointer, pointer array, stride}.
+    operator typename Impl::Arguments() const {
+      return
+        {    // unary op: activation(beta * C + (alpha * acc))
+          {    // ternary op : beta * C + (alpha * acc)
+            {{beta}, {beta_ptr}, {beta_ptr_array}, {dBeta}}, // leaf args : beta
+            {},                   // leaf args : C
+            {                     // binary op : alpha * acc
+              {{alpha}, {alpha_ptr}, {alpha_ptr_array}, {dAlpha}}, // leaf args : alpha
+              {},                     // leaf args : acc
+              {}                  // binary args : multiplies
+            },                    // end binary op
+            {} // ternary args : multiply_add
+          },   // end ternary op
+          activation // unary args: activation
+        };   // end unary op
+    }
+  };
+
+  // Ctor inheritance
+  using Impl::Impl;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// D = alpha * acc + beta * C + per-row bias, evaluated as beta * C + (alpha * acc + bias)
+template<
+  class CtaTileShapeMNK,
+  class ElementOutput,
+  class ElementCompute,
+  class ElementBias = ElementOutput,
+  class ElementSource = ElementOutput,
+  class ElementScalar = ElementCompute,
+  int AlignmentBias = 128 / sizeof_bits_v<ElementBias>,
+  FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest
+>
+using Sm90LinCombPerRowBias =
+  Sm90EVT<Sm90Compute<homogeneous_multiply_add, ElementOutput, ElementCompute, RoundStyle>, // outer node : beta * C + (alpha * acc + bias)
+    Sm90ScalarBroadcast<ElementScalar, Stride<_0,_0,int64_t>>, // beta
+    Sm90SrcFetch<ElementSource>, // C
+    Sm90EVT<Sm90Compute<homogeneous_multiply_add, ElementCompute, ElementCompute, RoundStyle>, // inner node : alpha * acc + bias (output type is ElementCompute)
+      Sm90ScalarBroadcast<ElementScalar, Stride<_0,_0,int64_t>>, // alpha
+      Sm90AccFetch, // acc
+      Sm90ColBroadcast<0, CtaTileShapeMNK, ElementBias, ElementCompute, Stride<_1,_0,int64_t>, AlignmentBias> // bias
+    >
+  >;
+
+template <
+  int StagesC,
+  int StagesD,
+  int FragmentSize,
+  bool ReuseSmemC,
+  bool DelayTmaStore,
+  class ElementOutput,
+  class ElementCompute,
+  class ElementBias,
+  class ElementSource,
+  class ElementScalar,
+  int AlignmentBias,
+  FloatRoundStyle RoundStyle,
+  class CtaTileShapeMNK,
+  class EpilogueTile
+>
+struct FusionCallbacks< // specialization : Sm90TmaWarpSpecialized dispatch policy + fusion::LinCombPerRowBias
+    epilogue::Sm90TmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmemC, DelayTmaStore>,
+    fusion::LinCombPerRowBias<ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle>,
+    CtaTileShapeMNK,
+    EpilogueTile
+> : Sm90LinCombPerRowBias<
+      CtaTileShapeMNK, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle> {
+  using Impl = Sm90LinCombPerRowBias<
+    CtaTileShapeMNK, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle>;
+  using Operation = fusion::LinCombPerRowBias<
+    ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle>;
+
+  struct Arguments { // flat argument struct; converted below into Impl's nested argument tree
+    ElementScalar alpha = ElementScalar(1);
+    ElementScalar beta = ElementScalar(0);
+    ElementScalar const* alpha_ptr = nullptr;
+    ElementScalar const* beta_ptr = nullptr;
+
+    using StrideAlpha = Stride<_0,_0,int64_t>;
+    using StrideBeta  = Stride<_0,_0,int64_t>;
+    StrideAlpha dAlpha = {_0{}, _0{}, 0};
+    StrideBeta  dBeta  = {_0{}, _0{}, 0};
+
+    using StrideBias = Stride<_1,_0,int64_t>;
+    ElementBias const* bias_ptr = nullptr;
+    StrideBias dBias = {};
+
+    operator typename Impl::Arguments() const { // non-explicit conversion; brace nesting follows the Sm90EVT nesting of Impl
+      return
+        {     // ternary op : beta * C + (alpha * acc + bias)
+          {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta
+          {},                   // leaf args : C
+          {                     // ternary op : alpha * acc + bias
+            {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha
+            {},                     // leaf args : acc
+            {bias_ptr, ElementBias(0), dBias}, // leaf args : bias {ptr, default value (presumably used when ptr is null; confirm), stride}
+            {}                  // ternary args : multiply_add
+          },                    // end ternary op
+          {} // ternary args : multiply_add
+        };   // end ternary op
+    }
+  };
+
+  // Inherit all constructors of Impl
+  using Impl::Impl;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// D = alpha * acc + beta * C + per-column bias, evaluated as beta * C + (alpha * acc + bias)
+template<
+  int StagesC, // not referenced by the alias body below
+  class CtaTileShapeMNK,
+  class EpilogueTile, // not referenced by the alias body below
+  class ElementOutput,
+  class ElementCompute,
+  class ElementBias = ElementOutput,
+  class ElementSource = ElementOutput,
+  class ElementScalar = ElementCompute,
+  int AlignmentBias = 128 / sizeof_bits_v<ElementBias>,
+  FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest
+>
+using Sm90LinCombPerColBias =
+  Sm90EVT<Sm90Compute<homogeneous_multiply_add, ElementOutput, ElementCompute, RoundStyle>, // outer node : beta * C + (alpha * acc + bias)
+    Sm90ScalarBroadcast<ElementScalar, Stride<_0,_0,int64_t>>, // beta
+    Sm90SrcFetch<ElementSource>, // C
+    Sm90EVT<Sm90Compute<homogeneous_multiply_add, ElementCompute, ElementCompute, RoundStyle>, // inner node : alpha * acc + bias (output type is ElementCompute)
+      Sm90ScalarBroadcast<ElementScalar, Stride<_0,_0,int64_t>>, // alpha
+      Sm90AccFetch, // acc
+      Sm90RowBroadcast<0, CtaTileShapeMNK, ElementBias, ElementCompute, Stride<_0,_1,int64_t>, AlignmentBias> // bias
+    >
+  >;
+
+template <
+  int StagesC,
+  int StagesD,
+  int FragmentSize,
+  bool ReuseSmemC,
+  bool DelayTmaStore,
+  class ElementOutput,
+  class ElementCompute,
+  class ElementBias,
+  class ElementSource,
+  class ElementScalar,
+  int AlignmentBias,
+  FloatRoundStyle RoundStyle,
+  class CtaTileShapeMNK,
+  class EpilogueTile
+>
+struct FusionCallbacks< // specialization : Sm90TmaWarpSpecialized dispatch policy + fusion::LinCombPerColBias
+    epilogue::Sm90TmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmemC, DelayTmaStore>,
+    fusion::LinCombPerColBias<ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle>,
+    CtaTileShapeMNK,
+    EpilogueTile
+> : Sm90LinCombPerColBias<
+      StagesC, CtaTileShapeMNK, EpilogueTile, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle> {
+  using Impl = Sm90LinCombPerColBias<
+    StagesC, CtaTileShapeMNK, EpilogueTile, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle>;
+  using Operation = fusion::LinCombPerColBias<
+    ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle>;
+
+  struct Arguments { // flat argument struct; converted below into Impl's nested argument tree
+    ElementScalar alpha = ElementScalar(1);
+    ElementScalar beta = ElementScalar(0);
+    ElementScalar const* alpha_ptr = nullptr;
+    ElementScalar const* beta_ptr = nullptr;
+
+    using StrideAlpha = Stride<_0,_0,int64_t>;
+    using StrideBeta  = Stride<_0,_0,int64_t>;
+    StrideAlpha dAlpha = {_0{}, _0{}, 0};
+    StrideBeta  dBeta  = {_0{}, _0{}, 0};
+
+    using StrideBias = Stride<_0,_1,int64_t>; // same stride type as the Sm90RowBroadcast bias leaf of Impl
+    ElementBias const* bias_ptr = nullptr;
+    StrideBias dBias = {};
+
+    operator typename Impl::Arguments() const { // non-explicit conversion; brace nesting follows the Sm90EVT nesting of Impl
+      return
+        {     // ternary op : beta * C + (alpha * acc + bias)
+          {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta
+          {},                   // leaf args : C
+          {                     // ternary op : alpha * acc + bias
+            {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha
+            {},                     // leaf args : acc
+            {bias_ptr, ElementBias(0), dBias}, // leaf args : bias {ptr, default value (presumably used when ptr is null; confirm), stride}
+            {}                  // ternary args : multiply_add
+          },                    // end ternary op
+          {} // ternary args : multiply_add
+        };   // end ternary op
+    }
+  };
+
+  // Inherit all constructors of Impl
+  using Impl::Impl;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// D = activation(alpha * acc + beta * C + per-row bias); only the activation node is instantiated with ElementOutput
+template<
+  class CtaTileShapeMNK,
+  template <class> class ActivationFn,
+  class ElementOutput,
+  class ElementCompute,
+  class ElementBias = ElementOutput,
+  class ElementSource = ElementOutput,
+  class ElementScalar = ElementCompute,
+  int AlignmentBias = 128 / sizeof_bits_v<ElementBias>,
+  FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest
+>
+using Sm90LinCombPerRowBiasEltAct =
+  Sm90EVT<Sm90Compute<ActivationFn, ElementOutput, ElementCompute, RoundStyle>, // unary node : activation(...)
+    Sm90LinCombPerRowBias<CtaTileShapeMNK, ElementCompute, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle> // child : linear combination with ElementCompute as its output type
+  >;
+
+template <
+  int StagesC,
+  int StagesD,
+  int FragmentSize,
+  bool ReuseSmemC,
+  bool DelayTmaStore,
+  template <class> class ActivationFn,
+  class ElementOutput,
+  class ElementCompute,
+  class ElementBias,
+  class ElementSource,
+  class ElementScalar,
+  int AlignmentBias,
+  FloatRoundStyle RoundStyle,
+  class CtaTileShapeMNK,
+  class EpilogueTile
+>
+struct FusionCallbacks< // specialization : Sm90TmaWarpSpecialized dispatch policy + fusion::LinCombPerRowBiasEltAct
+    epilogue::Sm90TmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmemC, DelayTmaStore>,
+    fusion::LinCombPerRowBiasEltAct<
+      ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle
+    >,
+    CtaTileShapeMNK,
+    EpilogueTile
+> : Sm90LinCombPerRowBiasEltAct<
+      CtaTileShapeMNK, ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle
+    > {
+
+  using Impl =
+    Sm90LinCombPerRowBiasEltAct<
+      CtaTileShapeMNK, ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle
+    >;
+  using Operation =
+    fusion::LinCombPerRowBiasEltAct<
+      ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle
+    >;
+
+  struct Arguments { // flat argument struct; converted below into Impl's nested argument tree
+    ElementScalar alpha = ElementScalar(1);
+    ElementScalar beta = ElementScalar(0);
+    ElementScalar const* alpha_ptr = nullptr;
+    ElementScalar const* beta_ptr = nullptr;
+
+    using StrideAlpha = Stride<_0,_0,int64_t>;
+    using StrideBeta  = Stride<_0,_0,int64_t>;
+    StrideAlpha dAlpha = {_0{}, _0{}, 0};
+    StrideBeta  dBeta  = {_0{}, _0{}, 0};
+
+    using StrideBias = Stride<_1,_0,int64_t>;
+    ElementBias const* bias_ptr = nullptr;
+    StrideBias dBias = {};
+
+    using ActivationArguments = typename Sm90Compute<ActivationFn, ElementOutput, ElementCompute, RoundStyle>::Arguments;
+    ActivationArguments activation = ActivationArguments(); // value-initialized by default; forwarded as the activation node's args
+
+    operator typename Impl::Arguments() const { // non-explicit conversion; brace nesting follows the Sm90EVT nesting of Impl
+      return
+        {    // unary op : activation(beta * C + (alpha * acc + bias))
+          {    // ternary op : beta * C + (alpha * acc + bias)
+            {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta
+            {},                   // leaf args : C
+            {                     // ternary op : alpha * acc + bias
+              {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha
+              {},                     // leaf args : acc
+              {bias_ptr, ElementBias(0), dBias}, // leaf args : bias {ptr, default value (presumably used when ptr is null; confirm), stride}
+              {}                  // ternary args : multiply_add
+            },                    // end ternary op
+            {} // ternary args : multiply_add
+          },   // end ternary op
+          activation // unary args : activation
+        };   // end unary op
+    }
+  };
+
+  // Inherit all constructors of Impl
+  using Impl::Impl;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// D = activation(alpha * acc + beta * C + per-row bias)
+// Aux = alpha * acc + beta * C + per-row bias (the value stored before the activation is applied)
+template<
+  class CtaTileShapeMNK,
+  class EpilogueTile,
+  int Stages,
+  class StrideAux,
+  class SmemLayoutAtom,
+  class CopyOpR2S,
+  template <class> class ActivationFn,
+  class ElementOutput,
+  class ElementCompute,
+  class ElementAux = ElementOutput,
+  class ElementBias = ElementOutput,
+  class ElementSource = ElementOutput,
+  class ElementScalar = ElementCompute,
+  int AlignmentAux = 128 / sizeof_bits_v<ElementAux>,
+  int AlignmentBias = 128 / sizeof_bits_v<ElementBias>,
+  FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest
+>
+using Sm90LinCombPerRowBiasEltActAux =
+  Sm90EVT<Sm90Compute<ActivationFn, ElementOutput, ElementCompute, RoundStyle>, // unary node : activation(...)
+    Sm90EVT<Sm90AuxStore<Stages, EpilogueTile, ElementAux, RoundStyle, StrideAux, SmemLayoutAtom, CopyOpR2S, AlignmentAux>, // unary node : store(Aux), sits between the linear combination and the activation
+      Sm90LinCombPerRowBias<CtaTileShapeMNK, ElementCompute, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle> // child : linear combination with ElementCompute as its output type
+    >
+  >;
+
+template <
+  int StagesC,
+  int StagesD,
+  int FragmentSize,
+  bool ReuseSmemC,
+  bool DelayTmaStore,
+  class GmemLayoutTagAux,
+  template <class> class ActivationFn,
+  class ElementOutput,
+  class ElementCompute,
+  class ElementAux,
+  class ElementBias,
+  class ElementSource,
+  class ElementScalar,
+  int AlignmentAux,
+  int AlignmentBias,
+  FloatRoundStyle RoundStyle,
+  class CtaTileShapeMNK,
+  class EpilogueTile,
+  class SmemLayoutAtom,
+  class CopyOpR2S
+>
+struct FusionCallbacks< // specialization : Sm90TmaWarpSpecialized dispatch policy + fusion::LinCombPerRowBiasEltActAux
+    epilogue::Sm90TmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmemC, DelayTmaStore>,
+    fusion::LinCombPerRowBiasEltActAux<
+      GmemLayoutTagAux, ActivationFn, ElementOutput, ElementCompute,
+      ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle
+    >,
+    CtaTileShapeMNK,
+    EpilogueTile,
+    SmemLayoutAtom,
+    CopyOpR2S
+> : Sm90LinCombPerRowBiasEltActAux<
+      CtaTileShapeMNK, EpilogueTile, StagesD, cutlass::gemm::TagToStrideC_t<GmemLayoutTagAux>, SmemLayoutAtom, CopyOpR2S, ActivationFn,
+      ElementOutput, ElementCompute, ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle
+    > {
+
+  using Impl = // StagesD is passed as the alias' Stages parameter; the aux stride type is derived from GmemLayoutTagAux
+    Sm90LinCombPerRowBiasEltActAux<
+      CtaTileShapeMNK, EpilogueTile, StagesD, cutlass::gemm::TagToStrideC_t<GmemLayoutTagAux>, SmemLayoutAtom, CopyOpR2S, ActivationFn,
+      ElementOutput, ElementCompute, ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle
+    >;
+  using Operation =
+    fusion::LinCombPerRowBiasEltActAux<
+      GmemLayoutTagAux, ActivationFn,
+      ElementOutput, ElementCompute, ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle
+    >;
+
+  struct Arguments { // flat argument struct; converted below into Impl's nested argument tree
+    ElementScalar alpha = ElementScalar(1);
+    ElementScalar beta = ElementScalar(0);
+    ElementScalar const* alpha_ptr = nullptr;
+    ElementScalar const* beta_ptr = nullptr;
+
+    using StrideAlpha = Stride<_0,_0,int64_t>;
+    using StrideBeta  = Stride<_0,_0,int64_t>;
+    StrideAlpha dAlpha = {_0{}, _0{}, 0};
+    StrideBeta  dBeta  = {_0{}, _0{}, 0};
+
+    using StrideBias = Stride<_1,_0,int64_t>;
+    ElementBias const* bias_ptr = nullptr;
+    StrideBias dBias = {};
+
+    using ActivationArguments = typename Sm90Compute<ActivationFn, ElementOutput, ElementCompute, RoundStyle>::Arguments;
+    ActivationArguments activation = ActivationArguments(); // value-initialized by default; forwarded as the activation node's args
+
+    using StrideAux = cutlass::gemm::TagToStrideC_t<GmemLayoutTagAux>; // same stride type that Impl receives as StrideAux
+    ElementAux* aux_ptr = nullptr;
+    StrideAux dAux = {};
+
+    operator typename Impl::Arguments() const { // non-explicit conversion; brace nesting follows the Sm90EVT nesting of Impl
+      return
+        {    // unary op : activation(store(beta * C + (alpha * acc + bias)))
+          {                 // unary op : store(beta * C + (alpha * acc + bias))
+            {                  // ternary op : beta * C + (alpha * acc + bias)
+              {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta
+              {},                   // leaf args : C
+              {                     // ternary op : alpha * acc + bias
+                {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha
+                {},                     // leaf args : acc
+                {bias_ptr, ElementBias(0), dBias}, // leaf args : bias {ptr, default value (presumably used when ptr is null; confirm), stride}
+                {}                  // ternary args : multiply_add
+              },                    // end ternary op
+              {}               // ternary args : multiply_add
+            },                 // end ternary op
+            {aux_ptr, dAux} // unary args : store {destination pointer, stride}
+          },                // end unary op
+          activation // unary args : activation
+        };   // end unary op
+    }
+  };
+
+  // Inherit all constructors of Impl
+  using Impl::Impl;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// D = per-row alpha * acc + per-row beta * C + per-row bias (alpha/beta are Sm90ColBroadcast leaves whose first stride mode is a runtime bool)
+template<
+  class CtaTileShapeMNK,
+  class ElementOutput,
+  class ElementCompute,
+  class ElementBias = ElementOutput,
+  class ElementSource = ElementOutput,
+  class ElementScalar = ElementCompute,
+  int AlignmentBias = 128 / sizeof_bits_v<ElementBias>,
+  int AlignmentScalar = 128 / sizeof_bits_v<ElementScalar>,
+  FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest
+>
+using Sm90PerRowLinCombPerRowBias =
+  Sm90EVT<Sm90Compute<homogeneous_multiply_add, ElementOutput, ElementCompute, RoundStyle>, // outer node : beta * C + (alpha * acc + bias)
+    Sm90ColBroadcast<0, CtaTileShapeMNK, ElementScalar, ElementCompute, Stride<bool,_0,int64_t>, AlignmentScalar>, // beta, dynamic scalar/vector broadcast
+    Sm90SrcFetch<ElementSource>, // C
+    Sm90EVT<Sm90Compute<homogeneous_multiply_add, ElementCompute, ElementCompute, RoundStyle>, // inner node : alpha * acc + bias (output type is ElementCompute)
+      Sm90ColBroadcast<0, CtaTileShapeMNK, ElementScalar, ElementCompute, Stride<bool,_0,int64_t>, AlignmentScalar>, // alpha, dynamic scalar/vector broadcast
+      Sm90AccFetch, // acc
+      Sm90ColBroadcast<0, CtaTileShapeMNK, ElementBias, ElementCompute, Stride<_1,_0,int64_t>, AlignmentBias> // bias
+    >
+  >;
+
+// D = activation(per-row alpha * acc + per-row beta * C + per-row bias); only the activation node is instantiated with ElementOutput
+template<
+  class CtaTileShapeMNK,
+  template <class> class ActivationFn,
+  class ElementOutput,
+  class ElementCompute,
+  class ElementBias = ElementOutput,
+  class ElementSource = ElementOutput,
+  class ElementScalar = ElementCompute,
+  int AlignmentBias = 128 / sizeof_bits_v<ElementBias>,
+  int AlignmentScalar = 128 / sizeof_bits_v<ElementScalar>,
+  FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest
+>
+using Sm90PerRowLinCombPerRowBiasEltAct =
+  Sm90EVT<Sm90Compute<ActivationFn, ElementOutput, ElementCompute, RoundStyle>, // unary node : activation(...)
+    Sm90PerRowLinCombPerRowBias<CtaTileShapeMNK, ElementCompute, ElementCompute, // child : linear combination with ElementCompute as its output type
+                                ElementBias, ElementSource, ElementScalar, AlignmentBias, AlignmentScalar, RoundStyle>
+  >;
+
+template <
+  int StagesC,
+  int StagesD,
+  int FragmentSize,
+  bool ReuseSmemC,
+  bool DelayTmaStore,
+  template <class> class ActivationFn,
+  class ElementOutput,
+  class ElementCompute,
+  class ElementBias,
+  class ElementSource,
+  class ElementScalar,
+  int AlignmentBias,
+  int AlignmentScalar,
+  FloatRoundStyle RoundStyle,
+  class CtaTileShapeMNK,
+  class EpilogueTile
+>
+struct FusionCallbacks< // specialization : Sm90TmaWarpSpecialized dispatch policy + fusion::PerRowLinCombPerRowBiasEltAct
+    epilogue::Sm90TmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmemC, DelayTmaStore>,
+    fusion::PerRowLinCombPerRowBiasEltAct<
+      ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, AlignmentScalar, RoundStyle
+    >,
+    CtaTileShapeMNK,
+    EpilogueTile
+> : Sm90PerRowLinCombPerRowBiasEltAct<
+      CtaTileShapeMNK, ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, AlignmentScalar, RoundStyle
+    > {
+
+  using Impl =
+    Sm90PerRowLinCombPerRowBiasEltAct<
+      CtaTileShapeMNK, ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, AlignmentScalar, RoundStyle
+    >;
+  using Operation =
+    fusion::PerRowLinCombPerRowBiasEltAct<
+      ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, AlignmentScalar, RoundStyle
+    >;
+
+  struct Arguments { // flat argument struct; converted below into Impl's nested argument tree
+    using StrideAlpha = Stride<bool,_0,int64_t>;
+    using StrideBeta  = Stride<bool,_0,int64_t>;
+    ElementScalar alpha = ElementScalar(1);
+    ElementScalar beta = ElementScalar(0);
+    ElementScalar const* alpha_ptr = nullptr;
+    ElementScalar const* beta_ptr = nullptr;
+    StrideAlpha dAlpha = {bool(1), _0{}, 0}; // default first mode is bool(1); presumably selects the per-row (vector) broadcast - confirm against Sm90ColBroadcast
+    StrideBeta  dBeta  = {bool(1), _0{}, 0}; // same default as dAlpha
+
+    using StrideBias = Stride<_1,_0,int64_t>;
+    ElementBias const* bias_ptr = nullptr;
+    StrideBias dBias = {};
+
+    using ActivationArguments = typename Sm90Compute<ActivationFn, ElementOutput, ElementCompute, RoundStyle>::Arguments;
+    ActivationArguments activation = ActivationArguments(); // value-initialized by default; forwarded as the activation node's args
+
+    operator typename Impl::Arguments() const { // non-explicit conversion; brace nesting follows the Sm90EVT nesting of Impl
+      return
+        {    // unary op : activation(beta * C + (alpha * acc + bias))
+          {    // ternary op : beta * C + (alpha * acc + bias)
+            {beta_ptr, beta, dBeta}, // leaf args : beta, same {ptr, value, stride} layout as the bias leaf
+            {},                      // leaf args : C
+            {                        // ternary op : alpha * acc + bias
+              {alpha_ptr, alpha, dAlpha}, // leaf args : alpha, same {ptr, value, stride} layout as the bias leaf
+              {},                         // leaf args : acc
+              {bias_ptr, ElementBias(0), dBias}, // leaf args : bias
+              {}                     // ternary args : multiply_add
+            },                       // end ternary op
+            {} // ternary args : multiply_add
+          },   // end ternary op
+          activation // unary args : activation
+        };   // end unary op
+    }
+  };
+
+  // Inherit all constructors of Impl
+  using Impl::Impl;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+template <typename T> // true only when T is float_e4m3_t or float_e5m2_t
+constexpr bool is_fp8_v = cute::is_same_v<T,float_e4m3_t> || cute::is_same_v<T,float_e5m2_t>;
+
+// We only apply the scaling factor if output is fp8: the fp8 specializations select multiplies, every other type selects first
+template <typename ElementOutput>
+struct ScaleOutOp { template <typename T> using Op = cutlass::first<T>; };
+template <>
+struct ScaleOutOp<float_e4m3_t> { template <typename T> using Op = cutlass::multiplies<T>; };
+template <>
+struct ScaleOutOp<float_e5m2_t> { template <typename T> using Op = cutlass::multiplies<T>; };
+
+template <typename T>
+using amax = cutlass::maximum_absolute_value_reduction<T, true>; // propagate NaNs
+
+}; // end namespace detail
+
+// D = scale_a * scale_b * alpha * acc + scale_c * beta * C + per-row bias, evaluated as (scale_c * beta) * C + ((scale_a * scale_b * alpha) * acc + bias)
+template<
+  class CtaTileShapeMNK,
+  class ElementOutput,
+  class ElementCompute,
+  class ElementBias = ElementOutput,
+  class ElementSource = ElementOutput,
+  class ElementScalar = ElementCompute,
+  int AlignmentBias = 128 / sizeof_bits_v<ElementBias>,
+  FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest
+>
+using Sm90ScaledLinCombPerRowBias =
+  Sm90EVT<Sm90Compute<homogeneous_multiply_add, ElementOutput, ElementCompute, RoundStyle>, // outer node : beta * C + (alpha * acc + bias)
+    Sm90ScalarBroadcast<ElementScalar, Stride<_0,_0,int64_t>, 2>, // scale_c * beta (broadcast leaf instantiated with 2)
+    Sm90SrcFetch<ElementSource>, // C
+    Sm90EVT<Sm90Compute<homogeneous_multiply_add, ElementCompute, ElementCompute, RoundStyle>, // inner node : alpha * acc + bias (output type is ElementCompute)
+      Sm90ScalarBroadcast<ElementScalar, Stride<_0,_0,int64_t>, 3>, // scale_a * scale_b * alpha (broadcast leaf instantiated with 3)
+      Sm90AccFetch, // acc
+      Sm90ColBroadcast<0, CtaTileShapeMNK, ElementBias, ElementCompute, Stride<_1,_0,int64_t>, AlignmentBias> // bias
+    >
+  >;
+
+// Z = scale_a * scale_b * alpha * acc + beta * scale_c * C + per-row bias
+// if D is fp8 (ElementOutput is float_e4m3_t or float_e5m2_t)
+//   D = scale_d * activation(Z)
+// else
+//   D = activation(Z)
+template<
+  class CtaTileShapeMNK,
+  template <class> class ActivationFn,
+  class ElementOutput,
+  class ElementCompute,
+  class ElementBias = ElementOutput,
+  class ElementSource = ElementOutput,
+  class ElementScalar = ElementCompute,
+  int AlignmentBias = 128 / sizeof_bits_v<ElementBias>,
+  FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest
+>
+using Sm90ScaledLinCombPerRowBiasEltAct =
+  Sm90EVT<Sm90Compute<detail::ScaleOutOp<ElementOutput>::template Op, ElementOutput, ElementCompute, RoundStyle>, // activation(Z) * scale_d; the op is chosen by detail::ScaleOutOp<ElementOutput>
+    Sm90EVT<Sm90Compute<ActivationFn, ElementCompute, ElementCompute, RoundStyle>, // activation(Z)
+      // Z = scale_a * scale_b * alpha * acc + beta * scale_c * C + per-row bias
+      Sm90ScaledLinCombPerRowBias<CtaTileShapeMNK, ElementCompute, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle>
+    >,
+    Sm90ScalarBroadcast<ElementScalar> // scale_d
+  >;
+
+template <
+  int StagesC,
+  int StagesD,
+  int FragmentSize,
+  bool ReuseSmemC,
+  bool DelayTmaStore,
+  template <class> class ActivationFn,
+  class ElementOutput,
+  class ElementCompute,
+  class ElementBias,
+  class ElementSource,
+  class ElementScalar,
+  int AlignmentBias,
+  FloatRoundStyle RoundStyle,
+  class CtaTileShapeMNK,
+  class EpilogueTile
+>
+struct FusionCallbacks< // specialization : Sm90TmaWarpSpecialized dispatch policy + fusion::ScaledLinCombPerRowBiasEltAct
+    epilogue::Sm90TmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmemC, DelayTmaStore>,
+    fusion::ScaledLinCombPerRowBiasEltAct<
+      ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle
+    >,
+    CtaTileShapeMNK,
+    EpilogueTile
+> : Sm90ScaledLinCombPerRowBiasEltAct<
+      CtaTileShapeMNK, ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle
+    > {
+
+  using Impl =
+    Sm90ScaledLinCombPerRowBiasEltAct<
+      CtaTileShapeMNK, ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle
+    >;
+  using Operation =
+    fusion::ScaledLinCombPerRowBiasEltAct<
+      ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle
+    >;
+
+  struct Arguments { // flat argument struct; converted below into Impl's nested argument tree
+    ElementScalar alpha = ElementScalar(1);
+    ElementScalar beta = ElementScalar(0);
+    ElementScalar const* alpha_ptr = nullptr;
+    ElementScalar const* beta_ptr = nullptr;
+
+    ElementScalar scale_a = ElementScalar(1); // all four scale factors default to 1
+    ElementScalar scale_b = ElementScalar(1);
+    ElementScalar scale_c = ElementScalar(1);
+    ElementScalar scale_d = ElementScalar(1);
+    ElementScalar const* scale_a_ptr = nullptr;
+    ElementScalar const* scale_b_ptr = nullptr;
+    ElementScalar const* scale_c_ptr = nullptr;
+    ElementScalar const* scale_d_ptr = nullptr;
+
+    using StrideAlpha = Stride<_0,_0,int64_t>;
+    using StrideBeta  = Stride<_0,_0,int64_t>;
+    StrideAlpha dAlpha = {_0{}, _0{}, 0};
+    StrideBeta  dBeta  = {_0{}, _0{}, 0};
+
+    using StrideBias = Stride<_1,_0,int64_t>;
+    ElementBias const* bias_ptr = nullptr;
+    StrideBias dBias = {};
+
+    using ActivationArguments = typename Sm90Compute<ActivationFn, ElementOutput, ElementCompute, RoundStyle>::Arguments;
+    ActivationArguments activation = ActivationArguments(); // value-initialized by default; forwarded as the activation node's args
+
+    operator typename Impl::Arguments() const { // non-explicit conversion; brace nesting follows the Sm90EVT nesting of Impl
+      return
+        {    // binary op : activation((scale_c * beta) * C + ((scale_a * scale_b * alpha) * acc + bias)) * scale_d
+          {    // unary op : activation((scale_c * beta) * C + ((scale_a * scale_b * alpha) * acc + bias))
+            {    // ternary op : (scale_c * beta) * C + ((scale_a * scale_b * alpha) * acc + bias)
+              {{beta, scale_c},
+               {beta_ptr, scale_c_ptr},
+               {dBeta, {_0{}, _0{}, 0}} // strides : dBeta for beta, all-zero stride for scale_c
+               },  // leaf args : (scale_c * beta)
+              {},  // leaf args : C
+              {    // ternary op : (scale_a * scale_b * alpha) * acc + bias
+                {{alpha, scale_a, scale_b}, 
+                 {alpha_ptr, scale_a_ptr, scale_b_ptr},
+                 {dAlpha, {_0{}, _0{}, 0}, {_0{}, _0{}, 0}} // strides : dAlpha for alpha, all-zero strides for scale_a and scale_b
+                 },                   // leaf args : (scale_a * scale_b * alpha)
+                {},                   // leaf args : acc
+                {bias_ptr, ElementBias(0), dBias}, // leaf args : bias {ptr, default value (presumably used when ptr is null; confirm), stride}
+                {} // ternary args : multiply_add
+              },   // end ternary op
+              {} // ternary args : multiply_add
+            },   // end ternary op
+            activation // unary args : activation
+          },   // end unary op
+          {{scale_d},
+           {scale_d_ptr}
+           },   // leaf args : scale_d
+          {} // binary args : multiplies or first
+        };   // end binary op
+    }
+  };
+
+  // Inherit all constructors of Impl
+  using Impl::Impl;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Z = scale_a * scale_b * alpha * acc + scale_c * beta * C + per-row bias
+// if D is fp8 
+//   amax_d = max(abs(elements in activation(Z)))
+//   D = scale_d * activation(Z)
+// else
+//   D = activation(Z)
+// if Aux is fp8 
+//   amax_aux = max(abs(elements in Z))
+//   Aux = scale_aux * Z
+// else
+//   Aux = Z
+
+// fp8 aux specialization: Z is computed once and fetched by two branches (D and Aux)
+template<
+  class CtaTileShapeMNK,
+  class EpilogueTile,
+  int StagesD,
+  class StrideAux,
+  class SmemLayoutAtom,
+  class CopyOpR2S,
+  template <class> class ActivationFn,
+  class ElementOutput,
+  class ElementCompute,
+  class ElementAux = ElementOutput,
+  class ElementAmax = ElementCompute,
+  class ElementBias = ElementOutput,
+  class ElementSource = ElementOutput,
+  class ElementScalar = ElementCompute,
+  int AlignmentAux = 128 / sizeof_bits_v<ElementAux>,
+  int AlignmentBias = 128 / sizeof_bits_v<ElementBias>,
+  FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest
+>
+using Sm90ScaledLinCombPerRowBiasEltActAmaxAuxFp8 =
+  Sm90SplitTreeVisitor<
+    // Z = scale_a * scale_b * alpha * acc + scale_c * beta * C + per-row bias
+    Sm90ScaledLinCombPerRowBias<CtaTileShapeMNK, ElementCompute, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle>,
+    // D = activation(Z) * scale_d, amax_d = max(abs(elements in activation(Z))), i.e. reduced before scale_d is applied
+    Sm90EVT<Sm90Compute<detail::ScaleOutOp<ElementOutput>::template Op, ElementOutput, ElementCompute, RoundStyle>, // activation(Z) * scale_d
+      Sm90EVT<Sm90ScalarReduction<detail::amax, atomic_maximum, ElementAmax, ElementCompute, RoundStyle>, // amax_d
+        Sm90EVT<Sm90Compute<ActivationFn, ElementCompute, ElementCompute, RoundStyle>, // activation(Z)
+          Sm90SplitTreeFetch // Z
+        >
+      >,
+      Sm90ScalarBroadcast<ElementScalar> // scale_d
+    >,
+    // Aux = Z * scale_aux, amax_aux = max(abs(elements in Z)), i.e. reduced before scale_aux is applied
+    Sm90EVT<Sm90AuxStore<StagesD, EpilogueTile, ElementAux, RoundStyle, StrideAux, SmemLayoutAtom, CopyOpR2S, AlignmentAux>, // store(Aux)
+      Sm90EVT<Sm90Compute<cutlass::multiplies, ElementCompute, ElementCompute, RoundStyle>, // Z * scale_aux
+        Sm90EVT<Sm90ScalarReduction<detail::amax, atomic_maximum, ElementAmax, ElementCompute, RoundStyle>, // amax_aux
+          Sm90SplitTreeFetch // Z
+        >,
+        Sm90ScalarBroadcast<ElementScalar> // scale_aux
+      >
+    >
+  >;
+
+// non-fp8 aux specialization: Aux stores Z directly (no scale_aux and no amax_aux node in this tree)
+// lets us use some EVT specializations such as relu + uint1b_t aux
+template<
+  class CtaTileShapeMNK,
+  class EpilogueTile,
+  int StagesD,
+  class StrideAux,
+  class SmemLayoutAtom,
+  class CopyOpR2S,
+  template <class> class ActivationFn,
+  class ElementOutput,
+  class ElementCompute,
+  class ElementAux = ElementOutput,
+  class ElementAmax = ElementCompute,
+  class ElementBias = ElementOutput,
+  class ElementSource = ElementOutput,
+  class ElementScalar = ElementCompute,
+  int AlignmentAux = 128 / sizeof_bits_v<ElementAux>,
+  int AlignmentBias = 128 / sizeof_bits_v<ElementBias>,
+  FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest
+>
+using Sm90ScaledLinCombPerRowBiasEltActAmaxAuxNotFp8 =
+  // D = activation(Z) * scale_d, amax_d = max(abs(elements in activation(Z))), i.e. reduced before scale_d is applied
+  Sm90EVT<Sm90Compute<detail::ScaleOutOp<ElementOutput>::template Op, ElementOutput, ElementCompute, RoundStyle>, // activation(Z) * scale_d
+    Sm90EVT<Sm90ScalarReduction<detail::amax, atomic_maximum, ElementAmax, ElementCompute, RoundStyle>, // amax_d
+      Sm90EVT<Sm90Compute<ActivationFn, ElementCompute, ElementCompute, RoundStyle>, // activation(Z)
+        Sm90EVT<Sm90AuxStore<StagesD, EpilogueTile, ElementAux, RoundStyle, StrideAux, SmemLayoutAtom, CopyOpR2S, AlignmentAux>, // Aux = Z
+          // Z = scale_a * scale_b * alpha * acc + scale_c * beta * C + per-row bias
+          Sm90ScaledLinCombPerRowBias<CtaTileShapeMNK, ElementCompute, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle>
+        >
+      >
+    >,
+    Sm90ScalarBroadcast<ElementScalar> // scale_d
+  >;
+
+// dispatcher: picks the Fp8 alias when detail::is_fp8_v<ElementAux> is true, otherwise the NotFp8 alias
+template<
+  class CtaTileShapeMNK,
+  class EpilogueTile,
+  int StagesD,
+  class StrideAux,
+  class SmemLayoutAtom,
+  class CopyOpR2S,
+  template <class> class ActivationFn,
+  class ElementOutput,
+  class ElementCompute,
+  class ElementAux = ElementOutput,
+  class ElementAmax = ElementCompute,
+  class ElementBias = ElementOutput,
+  class ElementSource = ElementOutput,
+  class ElementScalar = ElementCompute,
+  int AlignmentAux = 128 / sizeof_bits_v<ElementAux>,
+  int AlignmentBias = 128 / sizeof_bits_v<ElementBias>,
+  FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest
+>
+using Sm90ScaledLinCombPerRowBiasEltActAmaxAux = conditional_t<detail::is_fp8_v<ElementAux>,
+  Sm90ScaledLinCombPerRowBiasEltActAmaxAuxFp8< // selected when ElementAux is an fp8 type
+    CtaTileShapeMNK, EpilogueTile, StagesD, StrideAux, SmemLayoutAtom, CopyOpR2S, ActivationFn,
+    ElementOutput, ElementCompute, ElementAux, ElementAmax, ElementBias, ElementSource, ElementScalar,AlignmentAux, AlignmentBias, RoundStyle
+  >,
+  Sm90ScaledLinCombPerRowBiasEltActAmaxAuxNotFp8< // selected for every other ElementAux; receives the same template arguments
+    CtaTileShapeMNK, EpilogueTile, StagesD, StrideAux, SmemLayoutAtom, CopyOpR2S, ActivationFn,
+    ElementOutput, ElementCompute, ElementAux, ElementAmax, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle
+  >
+>;
+
+
+template <
+  int StagesC,
+  int StagesD,
+  int FragmentSize,
+  bool ReuseSmemC,
+  bool DelayTmaStore,
+  class GmemLayoutTagAux,
+  template <class> class ActivationFn,
+  class ElementOutput,
+  class ElementCompute,
+  class ElementAux,
+  class ElementAmax,
+  class ElementBias,
+  class ElementSource,
+  class ElementScalar,
+  int AlignmentAux,
+  int AlignmentBias,
+  FloatRoundStyle RoundStyle,
+  class CtaTileShapeMNK,
+  class EpilogueTile,
+  class SmemLayoutAtom,
+  class CopyOpR2S
+>
+struct FusionCallbacks<
+    epilogue::Sm90TmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmemC, DelayTmaStore>,
+    fusion::ScaledLinCombPerRowBiasEltActAmaxAux<
+      GmemLayoutTagAux, ActivationFn, ElementOutput, ElementCompute,
+      ElementAux, ElementAmax, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle
+    >,
+    CtaTileShapeMNK,
+    EpilogueTile,
+    SmemLayoutAtom,
+    CopyOpR2S
+> : Sm90ScaledLinCombPerRowBiasEltActAmaxAux<
+      CtaTileShapeMNK, EpilogueTile, StagesD, cutlass::gemm::TagToStrideC_t<GmemLayoutTagAux>,
+      SmemLayoutAtom, CopyOpR2S, ActivationFn,
+      ElementOutput, ElementCompute, ElementAux, ElementAmax, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle
+    > {
+
+  using Impl =
+    Sm90ScaledLinCombPerRowBiasEltActAmaxAux<
+      CtaTileShapeMNK, EpilogueTile, StagesD, cutlass::gemm::TagToStrideC_t<GmemLayoutTagAux>,
+      SmemLayoutAtom, CopyOpR2S, ActivationFn,
+      ElementOutput, ElementCompute, ElementAux, ElementAmax, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle
+    >;
+  using Operation =
+    fusion::ScaledLinCombPerRowBiasEltActAmaxAux<
+      GmemLayoutTagAux, ActivationFn, ElementOutput, ElementCompute,
+      ElementAux, ElementAmax, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle
+    >;
+
+  struct Arguments {
+    ElementScalar alpha = ElementScalar(1);
+    ElementScalar beta = ElementScalar(0);
+    ElementScalar const* alpha_ptr = nullptr;
+    ElementScalar const* beta_ptr = nullptr;
+
+    ElementScalar scale_a = ElementScalar(1);
+    ElementScalar scale_b = ElementScalar(1);
+    ElementScalar scale_c = ElementScalar(1);
+    ElementScalar scale_d = ElementScalar(1);
+    ElementScalar const* scale_a_ptr = nullptr;
+    ElementScalar const* scale_b_ptr = nullptr;
+    ElementScalar const* scale_c_ptr = nullptr;
+    ElementScalar const* scale_d_ptr = nullptr;
+
+    ElementScalar scale_aux = ElementScalar(1);
+    ElementScalar const* scale_aux_ptr = nullptr;
+
+    using StrideAlpha = Stride<_0,_0,int64_t>;
+    using StrideBeta  = Stride<_0,_0,int64_t>;
+    StrideAlpha dAlpha = {_0{}, _0{}, 0};
+    StrideBeta  dBeta  = {_0{}, _0{}, 0};
+
+    using StrideBias = Stride<_1,_0,int64_t>;
+    ElementBias const* bias_ptr = nullptr;
+    StrideBias dBias = {};
+
+    using ActivationArguments = typename Sm90Compute<ActivationFn, ElementOutput, ElementCompute, RoundStyle>::Arguments;
+    ActivationArguments activation = ActivationArguments();
+
+    ElementAmax* amax_D_ptr = nullptr;
+    ElementAmax* amax_aux_ptr = nullptr;
+
+    using StrideAux = cutlass::gemm::TagToStrideC_t<GmemLayoutTagAux>;
+    ElementAux* aux_ptr = nullptr;
+    StrideAux dAux = {};
+
+    operator typename Impl::Arguments() const {
+      // Only compute amax_d if D is fp8; otherwise the reduction receives a null pointer
+      ElementAmax* amax_D_ptr_ = nullptr;
+      if constexpr (detail::is_fp8_v<ElementOutput>) {
+        amax_D_ptr_ = amax_D_ptr;
+      }
+
+      // Aux is fp8 -> DAG arguments
+      if constexpr (detail::is_fp8_v<ElementAux>) {
+        typename Impl::Arguments args;
+        // always use structured binding to unpack DAG args since it may or may not be a tuple
+        auto& [Z_args, aux_args, D_args] = args;
+
+        Z_args =
+          {    // ternary op : (scale_c * beta) * C + ((scale_a * scale_b * alpha) * acc + bias)
+            {{beta, scale_c},
+             {beta_ptr, scale_c_ptr},
+             {dBeta, {_0{}, _0{}, 0}}
+             },  // leaf args : (scale_c * beta)
+            {},  // leaf args : C
+            {    // ternary op : (scale_a * scale_b * alpha) * acc + bias
+              {{alpha, scale_a, scale_b}, 
+               {alpha_ptr, scale_a_ptr, scale_b_ptr},
+               {dAlpha ,{_0{}, _0{}, 0}, {_0{}, _0{}, 0}}
+               },                   // leaf args : (scale_a * scale_b * alpha)
+              {},                   // leaf args : acc
+              {bias_ptr, ElementBias(0), dBias}, // leaf args : bias
+              {} // ternary args : multiply_add
+            },   // end ternary op
+            {} // ternary args : multiply_add
+          };   // end ternary op
+
+        D_args =
+          {    // binary op : activation(Z) * scale_d or activation(Z)
+            {    // unary op : reduce(activation(Z))
+              {             // unary op : activation(Z)
+                {},             // leaf args : Z
+                activation      // unary args : activation
+              },                // end unary op
+              {amax_D_ptr_} // unary args : reduce
+            },              // end unary op
+            {{scale_d},
+             {scale_d_ptr}
+             },  // leaf args : scale_d
+            {} // binary args : multiplies or first
+          };   // end binary op
+
+        aux_args =
+          {    // unary op : store(Aux)
+            {    // binary op : Z * scale_aux
+              {    // unary op : reduce(Z)
+                {},            // leaf args : Z
+                {amax_aux_ptr} // unary args : reduce
+              },   // end unary op
+              {{scale_aux},
+               {scale_aux_ptr}
+               },  // leaf args : scale_aux
+              {} // binary args : multiplies
+            },   // end binary op
+            {aux_ptr, dAux} // unary args : store
+          };   // end unary op
+
+        return args;
+      }
+
+      // Aux is not fp8 -> Tree arguments
+      else {
+        return
+          {  // binary op : activation(Z) * scale_d or activation(Z)
+            {  // unary op : reduce(activation(Z))
+              {  // unary op : activation(Z)
+                {  // unary op : store(Z)
+                  {  // ternary op : (scale_c * beta) * C + ((scale_a * scale_b * alpha) * acc + bias)
+                    {{beta, scale_c},
+                     {beta_ptr, scale_c_ptr},
+                     {dBeta, {_0{}, _0{}, 0}}
+                    },                // leaf args : (scale_c * beta)
+                    {},               // leaf args : C
+                    {                 // ternary op : (scale_a * scale_b * alpha) * acc + bias
+                      {{alpha, scale_a, scale_b}, 
+                       {alpha_ptr, scale_a_ptr, scale_b_ptr},
+                       {dAlpha, {_0{}, _0{}, 0}} // NOTE(review): the fp8 branch lists three strides (one per scalar); confirm the omitted third defaults as intended
+                      },                // leaf args : (scale_a * scale_b * alpha)
+                      {},               // leaf args : acc
+                      {bias_ptr, ElementBias(0), dBias
+                      },                // leaf args : bias
+                      {}              // ternary args : multiply_add
+                    },                // end ternary op
+                    {}              // ternary args : multiply_add
+                  },                // end ternary op
+                  {aux_ptr, dAux} // unary args : store
+                },                // end unary op
+                activation     // unary args : activation
+              },               // end unary op
+              {amax_D_ptr_} // unary args : reduce
+            },              // end unary op
+            {{scale_d},{scale_d_ptr}}, // leaf args : scale_d
+            {} // binary args : multiplies or first
+          };   // end binary op
+      }
+    }
+  };
+
+  // Ctor inheritance
+  using Impl::Impl;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<
+  class CtaTileShapeMNK,
+  class EpilogueTile,
+  int Stages,
+  class StrideAux,
+  class SmemLayoutAtom,
+  class CopyOpS2R,
+  template <class> class ActivationFn,
+  class ElementOutput,
+  class ElementCompute,
+  class ElementAux = ElementOutput,
+  class ElementSource = ElementOutput,
+  class ElementScalar = ElementCompute,
+  int AlignmentAux = 128 / sizeof_bits_v<ElementAux>,
+  FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest
+>
+using Sm90LinCombDeEltAct = // binary EVT node: ActivationFn applied to (linear combination, loaded aux)
+  Sm90EVT<Sm90Compute<ActivationFn, ElementOutput, ElementCompute, RoundStyle>, // activation(beta * C + (alpha * acc), aux)
+    Sm90LinearCombination<ElementCompute, ElementCompute, ElementSource, ElementScalar, RoundStyle>, // beta * C + (alpha * acc)
+    Sm90AuxLoad<Stages, EpilogueTile, ElementAux, StrideAux, SmemLayoutAtom, CopyOpS2R, AlignmentAux> // aux, read as the activation's second operand
+  >;
+
+template <
+  int StagesC,
+  int StagesD,
+  int FragmentSize,
+  bool ReuseSmemC,
+  bool DelayTmaStore,
+  class GmemLayoutTagAux,
+  template <class> class ActivationFn,
+  class ElementOutput,
+  class ElementCompute,
+  class ElementAux,
+  class ElementSource,
+  class ElementScalar,
+  int AlignmentAux,
+  FloatRoundStyle RoundStyle,
+  class CtaTileShapeMNK,
+  class EpilogueTile,
+  class SmemLayoutAtom,
+  class CopyOpS2R
+>
+struct FusionCallbacks<
+    epilogue::Sm90TmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmemC, DelayTmaStore>,
+    fusion::LinCombDeEltAct<
+      GmemLayoutTagAux, ActivationFn, ElementOutput, ElementCompute,
+      ElementAux, ElementSource, ElementScalar, AlignmentAux, RoundStyle
+    >,
+    CtaTileShapeMNK,
+    EpilogueTile,
+    SmemLayoutAtom,
+    CopyOpS2R
+> : Sm90LinCombDeEltAct< // the aux load is staged with StagesC; StagesD only participates in matching the dispatch policy
+      CtaTileShapeMNK, EpilogueTile, StagesC, cutlass::gemm::TagToStrideC_t<GmemLayoutTagAux>, SmemLayoutAtom, CopyOpS2R, ActivationFn,
+      ElementOutput, ElementCompute, ElementAux, ElementSource, ElementScalar, AlignmentAux, RoundStyle
+    > {
+
+  using Impl =
+    Sm90LinCombDeEltAct<
+      CtaTileShapeMNK, EpilogueTile, StagesC, cutlass::gemm::TagToStrideC_t<GmemLayoutTagAux>, SmemLayoutAtom, CopyOpS2R, ActivationFn,
+      ElementOutput, ElementCompute, ElementAux, ElementSource, ElementScalar, AlignmentAux, RoundStyle
+    >;
+  using Operation =
+    fusion::LinCombDeEltAct<
+      GmemLayoutTagAux, ActivationFn, ElementOutput, ElementCompute,
+      ElementAux, ElementSource, ElementScalar, AlignmentAux, RoundStyle
+    >;
+
+  struct Arguments {
+    ElementScalar alpha = ElementScalar(1);
+    ElementScalar beta = ElementScalar(0);
+    ElementScalar const* alpha_ptr = nullptr;
+    ElementScalar const* beta_ptr = nullptr;
+
+    using StrideAlpha = Stride<_0,_0,int64_t>;
+    using StrideBeta  = Stride<_0,_0,int64_t>;
+    StrideAlpha dAlpha = {_0{}, _0{}, 0};
+    StrideBeta  dBeta  = {_0{}, _0{}, 0};
+
+    using ActivationArguments = typename Sm90Compute<ActivationFn, ElementOutput, ElementCompute, RoundStyle>::Arguments;
+    ActivationArguments activation = ActivationArguments();
+
+    using StrideAux = cutlass::gemm::TagToStrideC_t<GmemLayoutTagAux>;
+    ElementAux const* aux_ptr = nullptr; // const: this fusion only reads aux (it feeds the Sm90AuxLoad leaf)
+    StrideAux dAux = {};
+
+    operator typename Impl::Arguments() const { // nesting below mirrors the Sm90LinCombDeEltAct tree, children first, node args last
+      return
+        {    // binary op : activation(beta * C + (alpha * acc), aux)
+          {                  // ternary op : beta * C + (alpha * acc)
+            {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta
+            {},                   // leaf args : C
+            {                     // binary op : alpha * acc
+              {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha
+              {},                     // leaf args : acc
+              {}                  // binary args : multiplies
+            },                    // end binary op
+            {}               // ternary args : multiply_add
+          },                 // end ternary op
+          {aux_ptr, ElementAux(0), dAux}, // leaf args : aux
+          activation // binary args : activation
+        };   // end binary op
+    }
+  };
+
+  // Ctor inheritance
+  using Impl::Impl;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<
+  class CtaTileShapeMNK,
+  class EpilogueTile,
+  int Stages,
+  class StrideAux,
+  class SmemLayoutAtom,
+  class CopyOpS2R,
+  template <class> class ActivationFn,
+  class ElementOutput,
+  class ElementCompute,
+  class ElementAux = ElementOutput,
+  class ElementBias = ElementOutput,
+  class ElementSource = ElementOutput,
+  class ElementScalar = ElementCompute,
+  int AlignmentAux = 128 / sizeof_bits_v<ElementAux>,
+  int AlignmentBias = 128 / sizeof_bits_v<ElementBias>,
+  FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest
+>
+using Sm90LinCombDeEltActDePerRowBias =
+  Sm90EVT<Sm90Compute<cutlass::epilogue::thread::Identity, ElementOutput, ElementCompute, RoundStyle>, // Identity: only converts ElementCompute -> ElementOutput
+    Sm90EVT<Sm90ColReduction<plus, plus, plus, 0, CtaTileShapeMNK,
+                             ElementBias, ElementCompute, RoundStyle, Stride<_1,_0,int64_t>, AlignmentBias>, // plus-reduction of the activation result, written as ElementBias
+      Sm90LinCombDeEltAct<CtaTileShapeMNK, EpilogueTile, Stages, StrideAux, SmemLayoutAtom, CopyOpS2R, ActivationFn,
+                          ElementCompute, ElementCompute, ElementAux, ElementSource, ElementScalar, AlignmentAux, RoundStyle> // kept in ElementCompute; conversion happens at the root
+    >
+  >;
+
+template <
+  int StagesC,
+  int StagesD,
+  int FragmentSize,
+  bool ReuseSmemC,
+  bool DelayTmaStore,
+  class GmemLayoutTagAux,
+  template <class> class ActivationFn,
+  class ElementOutput,
+  class ElementCompute,
+  class ElementAux,
+  class ElementBias,
+  class ElementSource,
+  class ElementScalar,
+  int AlignmentAux,
+  int AlignmentBias,
+  FloatRoundStyle RoundStyle,
+  class CtaTileShapeMNK,
+  class EpilogueTile,
+  class SmemLayoutAtom,
+  class CopyOpS2R
+>
+struct FusionCallbacks<
+    epilogue::Sm90TmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmemC, DelayTmaStore>,
+    fusion::LinCombDeEltActDePerRowBias<
+      GmemLayoutTagAux, ActivationFn, ElementOutput, ElementCompute,
+      ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle
+    >,
+    CtaTileShapeMNK,
+    EpilogueTile,
+    SmemLayoutAtom,
+    CopyOpS2R
+> : Sm90LinCombDeEltActDePerRowBias< // the aux load is staged with StagesC; StagesD only participates in matching the dispatch policy
+      CtaTileShapeMNK, EpilogueTile, StagesC, cutlass::gemm::TagToStrideC_t<GmemLayoutTagAux>, SmemLayoutAtom, CopyOpS2R, ActivationFn,
+      ElementOutput, ElementCompute, ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle
+    > {
+
+  using Impl =
+    Sm90LinCombDeEltActDePerRowBias<
+      CtaTileShapeMNK, EpilogueTile, StagesC, cutlass::gemm::TagToStrideC_t<GmemLayoutTagAux>, SmemLayoutAtom, CopyOpS2R, ActivationFn,
+      ElementOutput, ElementCompute, ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle
+    >;
+  using Operation =
+    fusion::LinCombDeEltActDePerRowBias<
+      GmemLayoutTagAux, ActivationFn, ElementOutput, ElementCompute,
+      ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle
+    >;
+
+  struct Arguments {
+    ElementScalar alpha = ElementScalar(1);
+    ElementScalar beta = ElementScalar(0);
+    ElementScalar const* alpha_ptr = nullptr;
+    ElementScalar const* beta_ptr = nullptr;
+
+    using StrideAlpha = Stride<_0,_0,int64_t>;
+    using StrideBeta  = Stride<_0,_0,int64_t>;
+    StrideAlpha dAlpha = {_0{}, _0{}, 0};
+    StrideBeta  dBeta  = {_0{}, _0{}, 0};
+
+    using ActivationArguments = typename Sm90Compute<ActivationFn, ElementOutput, ElementCompute, RoundStyle>::Arguments;
+    ActivationArguments activation = ActivationArguments();
+
+    using StrideAux = cutlass::gemm::TagToStrideC_t<GmemLayoutTagAux>;
+    ElementAux const* aux_ptr = nullptr; // const: aux is only read here
+    StrideAux dAux = {};
+
+    using StrideBias = Stride<_1,_0,int64_t>;
+    ElementBias* dbias_ptr = nullptr; // non-const: destination handed to the reduction node below
+    StrideBias dDbias = {};
+
+    operator typename Impl::Arguments() const { // nesting below mirrors the Sm90LinCombDeEltActDePerRowBias tree, children first, node args last
+      return
+      {   // unary op : identity/convert
+        {    // unary op : reduce(activation(beta * C + (alpha * acc), aux))
+          {    // binary op : activation(beta * C + (alpha * acc), aux)
+            {                  // ternary op : beta * C + (alpha * acc)
+              {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta
+              {},                   // leaf args : C
+              {                     // binary op : alpha * acc
+                {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha
+                {},                     // leaf args : acc
+                {}                  // binary args : multiplies
+              },                    // end binary op
+              {}               // ternary args : multiply_add
+            },                 // end ternary op
+            {aux_ptr, ElementAux(0), dAux}, // leaf args : aux
+            activation // binary args : activation
+          },   // end binary op
+          {dbias_ptr, ElementCompute(0), dDbias} // unary args : reduce
+        },   // end unary op
+        {} // unary args : identity/convert
+      };   // end unary op
+    }
+  };
+
+  // Ctor inheritance
+  using Impl::Impl;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// D = softmax(top_k(alpha * acc + beta * C)), built as a Sm90TopKSoftmaxColReduction node over a linear combination
+template<
+  int TopK,
+  int FragmentSize,
+  class CtaTileShapeMNK,
+  class EpilogueTile,
+  class ElementOutput,
+  class ElementCompute,
+  class ElementSource = ElementOutput,
+  class ElementScalar = ElementCompute,
+  FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest
+>
+using Sm90LinCombTopKSoftmaxCol =
+  Sm90EVT<Sm90TopKSoftmaxColReduction<TopK, FragmentSize, CtaTileShapeMNK, EpilogueTile, ElementOutput, ElementCompute, RoundStyle>, // softmax(top_k(beta * C + (alpha * acc)))
+    Sm90LinearCombination<ElementCompute, ElementCompute, ElementSource, ElementScalar, RoundStyle> // beta * C + (alpha * acc)
+  >;
+
+template <
+  int TopK,
+  int StagesC,
+  int StagesD,
+  int FragmentSize,
+  bool ReuseSmemC,
+  bool DelayTmaStore,
+  class ElementOutput,
+  class ElementCompute,
+  class ElementSource,
+  class ElementScalar,
+  FloatRoundStyle RoundStyle,
+  class CtaTileShapeMNK,
+  class EpilogueTile
+>
+struct FusionCallbacks<
+    epilogue::Sm90TmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmemC, DelayTmaStore>,
+    fusion::LinCombTopKSoftmaxCol<TopK, ElementOutput, ElementCompute, ElementSource, ElementScalar, RoundStyle>,
+    CtaTileShapeMNK,
+    EpilogueTile
+> : Sm90LinCombTopKSoftmaxCol<TopK, FragmentSize, CtaTileShapeMNK, EpilogueTile, ElementOutput, ElementCompute, ElementSource, ElementScalar, RoundStyle> {
+
+  using Impl = Sm90LinCombTopKSoftmaxCol<TopK, FragmentSize, CtaTileShapeMNK, EpilogueTile, typename cutlass::detail::get_unpacked_element_type<ElementOutput>::type, ElementCompute, ElementSource, ElementScalar, RoundStyle>; // NOTE(review): the base class above is spelled with ElementOutput directly; `using Impl::Impl` below needs both to name the same type - confirm
+  using Operation = fusion::LinCombTopKSoftmaxCol<TopK, ElementOutput, ElementCompute, ElementSource, ElementScalar, RoundStyle>;
+
+  struct Arguments {
+    ElementScalar alpha = ElementScalar(1);
+    ElementScalar beta = ElementScalar(0);
+    ElementScalar const* alpha_ptr = nullptr;
+    ElementScalar const* beta_ptr = nullptr;
+
+    operator typename Impl::Arguments() const {
+      return
+        {    // unary op : softmax(top_k(beta * C + (alpha * acc)))
+          {    // ternary op : beta * C + (alpha * acc)
+            {{beta}, {beta_ptr}}, // leaf args : beta
+            {},                   // leaf args : C
+            {                     // binary op : alpha * acc
+              {{alpha}, {alpha_ptr}}, // leaf args : alpha
+              {},                     // leaf args : acc
+              {}                  // binary args : multiplies
+            },                    // end binary op
+            {} // ternary args : multiply_add
+          },   // end ternary op
+          {} // unary args : softmax(top_k(...)), left empty
+        };   // end unary op
+    }
+  };
+
+  // Ctor inheritance
+  using Impl::Impl;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+template <class FusionOpOrCallbacks, class = cute::void_t<>>
+struct get_element_aux { // fallback: type is void when none of the specializations below matches
+  using type = void;
+};
+
+template <class FusionOpOrCallbacks>
+struct get_element_aux<FusionOpOrCallbacks, cute::void_t<typename FusionOpOrCallbacks::ElementAux>> { // selected when a nested ElementAux type exists
+  using type = typename FusionOpOrCallbacks::ElementAux;
+};
+
+template <class NodeOp, class... ChildOps>
+struct get_element_aux<Sm90TreeVisitor<NodeOp, ChildOps...>, cute::void_t<>> { // tree visitor: defer to its node op, children are not inspected
+  using type = typename get_element_aux<NodeOp>::type;
+};
+
+template <class... Ts>
+struct get_element_aux<FusionCallbacks<Ts...>, cute::void_t<typename FusionCallbacks<Ts...>::Operation>> { // callbacks: defer to the nested Operation
+ private:
+  using Operation = typename FusionCallbacks<Ts...>::Operation;
+ public:
+  using type = typename get_element_aux<Operation>::type;
+};
+} // namespace cutlass::epilogue::fusion::detail
+
+template <class Callbacks> // shorthand for detail::get_element_aux<Callbacks>::type
+using get_element_aux_t = typename detail::get_element_aux<Callbacks>::type;
+
+} // namespace cutlass::epilogue::fusion
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp
new file mode 100755
index 000000000..131d0ba5b
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp
@@ -0,0 +1,839 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+  \brief Visitor tree compute operations for the sm90 TMA warp-specialized (ws) epilogue
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/epilogue/thread/activation.h"
+
+#include "cute/tensor.hpp"
+
+#include "cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp"
+#include "cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp"
+#include "cutlass/epilogue/fusion/sm90_visitor_store_tma_warpspecialized.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::epilogue::fusion {
+
+using namespace cute;
+using namespace detail;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// N-ary Elementwise Compute Operation
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// The template argument provided for ComputeFn must be able to accept
+// exactly one template parameter.  In Standard C++, it's OK for
+// ComputeFn to have other template parameters, as long as those have
+// defaults.  For example, the following struct Foo would work.
+//
+// template<class A, class B = A>
+// struct Foo {
+//   CUTLASS_HOST_DEVICE auto operator() (A a, B b);
+// };
+//
+// However, some compilers, such as Clang, require that the argument
+// take _exactly_ one template parameter.  This is nonstandard C++
+// behavior.  One work-around for this case is to create a subclass
+// with exactly one template parameter, and then use that subclass as
+// the template argument.
+//
+// template<class A>
+// struct FooHomogeneous : public Foo<A, A> {};
+//
+template<
+  template <class> class ComputeFn,
+  class ElementOutput,
+  class ElementCompute,
+  FloatRoundStyle RoundStyle,
+  class = void
+>
+struct Sm90Compute { // visitor node that applies ComputeFn elementwise to the fragments produced by its child nodes
+private:
+  using EmptyArguments = typename Sm90VisitorImpl<>::Arguments;
+
+  template <class Fn, class = void>
+  struct ComputeArguments { // default: compute fn declares no Arguments member
+    using type = EmptyArguments;
+  };
+
+  // partial specialization for compute fns that define an Arguments member, e.g. activation hyperparameters
+  template <class Fn>
+  struct ComputeArguments<Fn, platform::void_t<typename Fn::Arguments>> {
+    using type = typename Fn::Arguments;
+  };
+
+public:
+  struct SharedStorage { };
+
+  using Arguments = typename ComputeArguments<ComputeFn<ElementCompute>>::type;
+
+  using Params = Arguments; // no host-side transformation: to_underlying_arguments returns args unchanged
+
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const&, Arguments const& args, void*) {
+    return args;
+  }
+
+  template <class ProblemShape>
+  static bool
+  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
+    return true;
+  }
+
+  template <class ProblemShape>
+  static size_t
+  get_workspace_size(ProblemShape const&, Arguments const&) {
+    return 0;
+  }
+
+  template <class ProblemShape>
+  static cutlass::Status
+  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    return cutlass::Status::kSuccess;
+  }
+
+  CUTLASS_DEVICE bool
+  is_producer_load_needed() const {
+    return false;
+  }
+
+  CUTLASS_DEVICE bool
+  is_C_load_needed() const {
+    return false;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Sm90Compute()
+      : params() {}
+
+  CUTLASS_HOST_DEVICE
+  Sm90Compute(Params const& params, SharedStorage const& shared_storage)
+      : params(params) {}
+
+  Params const params;
+
+  template <class... Args>
+  CUTLASS_DEVICE auto
+  get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
+    return EmptyProducerLoadCallbacks{};
+  }
+
+  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
+    CUTLASS_DEVICE
+    ConsumerStoreCallbacks(Params const& params)
+      : params(params) {}
+
+    Params const& params;
+
+    template <typename ElementAccumulator, typename... ElementInputs, int FragmentSize>
+    CUTLASS_DEVICE Array<ElementOutput, FragmentSize>
+    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n,
+          Array<ElementInputs, FragmentSize> const&... frg_inputs) {
+      return transform_apply(cute::make_tuple(frg_inputs...), // first lambda runs per input fragment, second receives all converted fragments
+        [&] (auto&& frg_input) { // convert one input fragment to ElementCompute
+          using ElementInput = typename cute::remove_cvref_t<decltype(frg_input)>::Element;
+          using ConvertInput = NumericArrayConverter<ElementCompute, ElementInput, FragmentSize, RoundStyle>;
+          ConvertInput convert_input{};
+
+          return convert_input(frg_input);
+        },
+        [&] (auto&&... cvt_frg_inputs) { // apply ComputeFn, then convert its result to ElementOutput
+          using ComputeOutput = ComputeFn<Array<ElementCompute, FragmentSize>>;
+          ComputeOutput compute_output{};
+
+          if constexpr (cute::is_same_v<Arguments, EmptyArguments>) { // compute fn has no Arguments: call with inputs only
+            using ElementComputeOutput =
+                typename cute::remove_cvref_t<decltype(compute_output(cvt_frg_inputs...))>::Element;
+            using ConvertOutput = NumericArrayConverter<ElementOutput, ElementComputeOutput, FragmentSize, RoundStyle>;
+            ConvertOutput convert_output{};
+            return convert_output(compute_output(cvt_frg_inputs...));
+          }
+          else { // compute fn declares Arguments: pass params as the trailing argument
+            using ElementComputeOutput =
+                typename cute::remove_cvref_t<decltype(compute_output(cvt_frg_inputs..., params))>::Element;
+            using ConvertOutput = NumericArrayConverter<ElementOutput, ElementComputeOutput, FragmentSize, RoundStyle>;
+            ConvertOutput convert_output{};
+            return convert_output(compute_output(cvt_frg_inputs..., params));
+          }
+        }
+      );
+    }
+
+  };
+
+  template <
+    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
+    class... Args
+  >
+  CUTLASS_DEVICE auto
+  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
+    return ConsumerStoreCallbacks(params);
+  }
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Performance Optimized Specializations
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// beta * C + Z, evaluated in ElementCompute and converted to ElementOutput
+template <
+  class ElementOutput,
+  class ElementCompute,
+  FloatRoundStyle RoundStyle,
+  class InputScaleOp,  // beta
+  class ElementSource, // C
+  class InputAddOp     // Z
+>
+struct Sm90TreeVisitor<
+  Sm90Compute<homogeneous_multiply_add, ElementOutput, ElementCompute, RoundStyle,
+              cute::void_t<decltype(declval<InputScaleOp>().is_zero())>>, // matches only if InputScaleOp has is_zero()
+  InputScaleOp,
+  Sm90SrcFetch<ElementSource>,
+  InputAddOp
+> : Sm90VisitorImpl<
+      InputScaleOp,
+      Sm90SrcFetch<ElementSource>,
+      InputAddOp,
+      Sm90Compute<homogeneous_multiply_add, ElementOutput, ElementCompute, RoundStyle>
+    >
+{
+  using Impl =
+    Sm90VisitorImpl<
+      InputScaleOp,
+      Sm90SrcFetch<ElementSource>,
+      InputAddOp,
+      Sm90Compute<homogeneous_multiply_add, ElementOutput, ElementCompute, RoundStyle>
+    >;
+  using Params = typename Impl::Params;
+  using SharedStorage = typename Impl::SharedStorage;
+  // Default constructor: no params or shared storage are bound.
+  CUTLASS_HOST_DEVICE
+  Sm90TreeVisitor() {}
+  // Forwards params and shared storage to the underlying Sm90VisitorImpl.
+  CUTLASS_HOST_DEVICE
+  Sm90TreeVisitor(
+      Params const& params,
+      SharedStorage const& shared_storage)
+    : Impl(params, shared_storage) {}
+  // Reports whether any of the beta, C or Z nodes requires the producer load path.
+  CUTLASS_DEVICE bool
+  is_producer_load_needed() const {
+    auto const& scale_op = get<0>(Impl::ops);
+    auto const& added_op = get<2>(Impl::ops);
+    if constexpr (detail::IsScalarBroadcast<InputScaleOp>::value && not is_void_v<ElementSource>) { // a non-null scalar pointer with non-zero get<2>(dScalar[0]) also forces the load
+      return (get<2>(scale_op.params_ptr->dScalar[0]) != 0 && scale_op.params_ptr->scalar_ptrs[0] != nullptr) || 
+              is_C_load_needed() || 
+              added_op.is_producer_load_needed();
+    }
+    else {
+      return is_C_load_needed() || added_op.is_producer_load_needed();
+    }
+  }
+  // C is needed when beta is not zero and the source node needs C, or when the Z node itself needs C.
+  CUTLASS_DEVICE bool
+  is_C_load_needed() const {
+    auto const& scale_op = get<0>(Impl::ops);
+    auto const& src_op = get<1>(Impl::ops);
+    auto const& added_op = get<2>(Impl::ops);
+    return (not scale_op.is_zero() && src_op.is_C_load_needed()) || added_op.is_C_load_needed();
+  }
+  // Consumer-side callbacks: wraps the child callbacks and evaluates beta * C + Z once per fragment.
+  template <class CallbacksImpl>
+  struct ConsumerStoreCallbacks : CallbacksImpl {
+    CUTLASS_DEVICE
+    ConsumerStoreCallbacks(bool is_C_load_needed, CallbacksImpl&& impl)
+      : is_C_load_needed(is_C_load_needed), CallbacksImpl(cute::forward<CallbacksImpl>(impl)) { }
+
+    bool is_C_load_needed; // captured when the callbacks are created; not read by visit() below
+
+    template <typename ElementAccumulator, int FragmentSize>
+    CUTLASS_DEVICE Array<ElementOutput, FragmentSize>
+    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
+      Array frg_added = get<2>(CallbacksImpl::callbacks_tuple).visit(frg_acc, epi_v, epi_m, epi_n); // Z
+
+      using ElementZ = typename decltype(frg_added)::Element;
+      using ConvertZ = NumericArrayConverter<ElementCompute, ElementZ, FragmentSize, RoundStyle>;
+      using ConvertI = NumericArrayConverter<ElementOutput, ElementCompute, FragmentSize, RoundStyle>;
+      ConvertZ convert_Z{};
+      ConvertI convert_I{};
+
+      Array frg_I = convert_Z(frg_added); // start from Z converted to ElementCompute
+
+      if constexpr (!is_void_v<ElementSource>) { // with a void source, only Z is converted and returned
+        Array frg_scalar = get<0>(CallbacksImpl::callbacks_tuple).visit(frg_acc, epi_v, epi_m, epi_n); // beta
+        Array frg_source = get<1>(CallbacksImpl::callbacks_tuple).visit(frg_acc, epi_v, epi_m, epi_n); // C
+
+        using ElementX = typename decltype(frg_scalar)::Element;
+        using ElementY = typename decltype(frg_source)::Element;
+        using ConvertX = NumericArrayConverter<ElementCompute, ElementX, FragmentSize, RoundStyle>;
+        using ConvertY = NumericArrayConverter<ElementCompute, ElementY, FragmentSize, RoundStyle>;
+        using ComputeI = multiply_add<Array<ElementCompute, FragmentSize>>;
+        ConvertX convert_X{};
+        ConvertY convert_Y{};
+        ComputeI compute_I{};
+
+        frg_I = compute_I(convert_X(frg_scalar), convert_Y(frg_source), frg_I); // beta * C + Z in ElementCompute
+      }
+
+      return convert_I(frg_I); // final conversion to ElementOutput
+    }
+  };
+  // Builds the child callbacks and wraps them; clears the C fragment first when C is not needed.
+  template <
+    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
+    class... Args
+  >
+  CUTLASS_DEVICE auto
+  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
+    auto callbacks_tuple = Impl::template get_consumer_store_callbacks<ReferenceSrc>(args);
+    bool is_C_load_needed = this->is_C_load_needed();
+    if (not is_C_load_needed) {
+      cute::clear(args.tCrC); // NOTE(review): presumably so visit() reads zeros for C when it was never loaded - confirm
+    }
+    return ConsumerStoreCallbacks<decltype(callbacks_tuple)>(
+        is_C_load_needed, std::move(callbacks_tuple));
+  }
+};
+
+// ReLU with aux bit tensor dReLU/dZ
+// Aux(i) = Z(i) >= 0 ? 1 : 0
+namespace detail {
+// Placeholder node so we can retain standard EVT structure; it only carries the aux pointer and stride
+template <class StrideMNL>
+struct Sm90ReLUAuxStore : Sm90VisitorImpl<> {
+  struct SharedStorage {}; // empty: this node declares no shared storage
+
+  struct Arguments {
+    cutlass::uint1b_t* ptr_aux = nullptr; // 1-bit aux tensor pointer
+    StrideMNL dAux = {};                  // stride of the aux tensor
+  };
+
+  using Params = Arguments; // arguments are used as params unchanged
+  // Returns args verbatim; problem_shape and workspace are unused.
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    return args;
+  }
+  // Always reports the node as implementable.
+  template <class ProblemShape>
+  static bool
+  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
+    return true;
+  }
+  // No workspace is requested.
+  template <class ProblemShape>
+  static size_t
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
+    return 0;
+  }
+  // Nothing to initialize; always returns kSuccess.
+  template <class ProblemShape>
+  static cutlass::Status
+  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    return cutlass::Status::kSuccess;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Sm90ReLUAuxStore() { }
+  // Params and shared storage are accepted for interface compatibility and ignored.
+  CUTLASS_HOST_DEVICE
+  Sm90ReLUAuxStore(Params const& params, SharedStorage const& shared_storage) { }
+};
+} // namespace detail
+
+// Specialization on the generic compute+aux EVT for ReLu/Clamp activations with a uint1b_t aux tensor
+template <
+  // Compute node
+  template <class> class Activation,
+  class ElementOutput,
+  class ElementCompute,
+  FloatRoundStyle RoundStyle,
+  // Aux node
+  int Stages,
+  class EpilogueTile,
+  class StrideMNL,
+  class SmemLayoutAtom,
+  class CopyOpR2S,
+  int Alignment,
+  bool EnableNullptr,
+  // Input node
+  class InputOp
+>
+struct Sm90TreeVisitor<
+  Sm90Compute<Activation, ElementOutput, ElementCompute, RoundStyle,
+              cute::enable_if_t<cute::is_same_v<Activation<ElementCompute>, cutlass::epilogue::thread::ReLu<ElementCompute>> ||
+                                cute::is_same_v<Activation<ElementCompute>, cutlass::epilogue::thread::Clamp<ElementCompute>>  >>,
+  Sm90TreeVisitor<
+    Sm90AuxStore<
+      Stages,
+      EpilogueTile,
+      cutlass::uint1b_t,
+      RoundStyle,
+      StrideMNL,
+      SmemLayoutAtom,
+      CopyOpR2S,
+      Alignment,
+      EnableNullptr
+    >,
+    InputOp
+  >
+> : Sm90VisitorImpl<
+      Sm90VisitorImpl<
+        InputOp,
+        detail::Sm90ReLUAuxStore<StrideMNL> // placeholder standing in for the matched Sm90AuxStore node
+      >,
+      Sm90Compute<Activation, ElementOutput, ElementCompute, RoundStyle>
+    >
+{
+  using Impl =
+    Sm90VisitorImpl<
+      Sm90VisitorImpl<
+        InputOp,
+        detail::Sm90ReLUAuxStore<StrideMNL>
+      >,
+      Sm90Compute<Activation, ElementOutput, ElementCompute, RoundStyle>
+    >;
+  using Params = typename Impl::Params;
+  using SharedStorage = typename Impl::SharedStorage;
+
+  CUTLASS_HOST_DEVICE
+  Sm90TreeVisitor() {}
+  // Forwards params_ to Impl and also keeps a reference to it; the referenced Params must outlive this visitor.
+  CUTLASS_HOST_DEVICE
+  Sm90TreeVisitor(Params const& params_, SharedStorage const& shared_storage)
+    : params(params_), Impl(params_, shared_storage) {}
+
+  Params const& params; // reference to the caller's Params, unpacked in the callbacks below
+  // Consumer callbacks: apply the activation, record one aux bit per element, and write the bits out in end().
+  template <class RTensor, class GTensor, class CTensor, class ThrResidue, class CallbacksImpl>
+  struct ConsumerStoreCallbacks : CallbacksImpl {
+    CUTLASS_DEVICE
+    ConsumerStoreCallbacks(
+        RTensor&& tC_rAux,
+        GTensor&& tC_gAux,
+        CTensor tC_cAux,
+        ThrResidue residue_tC_cAux,
+        Params const& params,
+        CallbacksImpl&& impl)
+      : tC_rAux(cute::forward<RTensor>(tC_rAux)),
+        tC_gAux(cute::forward<GTensor>(tC_gAux)),
+        tC_cAux(tC_cAux),
+        residue_tC_cAux(residue_tC_cAux),
+        params(params),
+        CallbacksImpl(cute::forward<CallbacksImpl>(impl)) {}
+
+    RTensor tC_rAux;                                                                   // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
+    GTensor tC_gAux;                                                                   // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
+    CTensor tC_cAux;                                                                   // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
+    ThrResidue residue_tC_cAux; // bound compared against tC_cAux coordinates to predicate the copies in end()
+    Params const& params;
+    // Returns the activated fragment converted to ElementOutput; the aux predicates are written into tC_rAux.
+    template <typename ElementAccumulator, int FragmentSize>
+    CUTLASS_DEVICE Array<ElementOutput, FragmentSize>
+    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
+      // Unpack callbacks + params
+      auto& [callbacks_input_aux, callbacks_compute] = CallbacksImpl::callbacks_tuple;
+      auto& [callbacks_input, callbacks_aux] = callbacks_input_aux.callbacks_tuple;
+      auto const& [params_input_aux, params_compute] = params;
+      auto const& [params_input, params_aux] = params_input_aux;
+
+      // Visit the input node
+      Array frg_input = callbacks_input.visit(frg_acc, epi_v, epi_m, epi_n);
+
+      // Compute activation + aux
+      using ElementInput = typename decltype(frg_input)::Element;
+      using ConvertInput = NumericArrayConverter<ElementCompute, ElementInput, FragmentSize, RoundStyle>;
+      using ConvertAux = PackPredicates<FragmentSize>;
+      using ComputeOutput = Activation<ElementCompute>;
+      using ConvertOutput = NumericArrayConverter<ElementOutput, ElementCompute, FragmentSize, RoundStyle>;
+      ConvertInput convert_input{};
+      ComputeOutput relu{};
+      ConvertAux convert_aux{};
+      ConvertOutput convert_output{};
+
+      Array frg_compute = convert_input(frg_input);
+      bool frg_aux[FragmentSize]; // one predicate per element: true when the activation left the value unchanged
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < FragmentSize; ++i) {
+        ElementCompute pre_relu = frg_compute[i];
+        if constexpr (cute::is_same_v<Activation<ElementCompute>, cutlass::epilogue::thread::Clamp<ElementCompute>>) {
+          frg_compute[i] = relu(frg_compute[i], params_compute); // Clamp takes the compute node's params
+        }
+        else {
+          frg_compute[i] = relu(frg_compute[i]);
+        }
+        if constexpr (cute::is_same_v<ElementCompute, float>) {
+          uint32_t aux;
+          asm volatile("set.equ.u32.f32 %0, %1, %2;\n" : "=r"(aux) : "f"(frg_compute[i]), "f"(pre_relu)); // NaN outputs 1 in Aux
+          frg_aux[i] = static_cast<bool>(aux);
+        } else if constexpr (cute::is_same_v<ElementCompute, cutlass::half_t>) {
+          uint32_t aux;
+          cutlass::half_t compute = frg_compute[i];
+          asm volatile("set.equ.u32.f16 %0, %1, %2;\n" : "=r"(aux) : "h"(compute.raw()), "h"(pre_relu.raw())); // NaN outputs 1 in Aux
+          frg_aux[i] = static_cast<bool>(aux);
+        } else {
+          frg_aux[i] = frg_compute[i] == pre_relu; // generic path: plain equality between activated and original value
+        }
+      }
+
+      static_assert(FragmentSize % 8 == 0, "Predicate vector must be byte-aligned");
+      Tensor tC_rAux_frg = recast<typename ConvertAux::result_type>(coalesce(tC_rAux(_,_,_,epi_m,epi_n)));   // (EPI_V)
+      tC_rAux_frg(epi_v) = convert_aux(frg_aux); // store the converted predicates at this fragment's index
+
+      return convert_output(frg_compute);
+    }
+    // Ends the input node, then copies tC_rAux to tC_gAux with predication (skipped for a null pointer when EnableNullptr).
+    CUTLASS_DEVICE void
+    end() {
+      // Unpack callbacks + params
+      auto& [callbacks_input_aux, callbacks_compute] = CallbacksImpl::callbacks_tuple;
+      auto& [callbacks_input, callbacks_aux] = callbacks_input_aux.callbacks_tuple;
+      auto const& [params_input_aux, params_compute] = params;
+      auto const& [params_input, params_aux] = params_input_aux;
+
+      // Visit the input node
+      callbacks_input.end();
+
+      // Nullptr is no-op
+      if constexpr (EnableNullptr) {
+        if (params_aux.ptr_aux == nullptr) {
+          return;
+        }
+      }
+
+      // Compute vectorization
+      constexpr auto MCL = decltype(max_common_layout(tC_rAux, tC_gAux)){};
+      constexpr int V = cute::min(Alignment, size(MCL));
+      // Copy vectorizes into byte-aligned stores
+      if constexpr (V > 1 && V % 8 == 0) {
+        using VecType = uint_bit_t<V>;
+        Tensor tC_rAux_vec = recast<VecType>(tC_rAux);
+        Tensor tC_gAux_vec = recast<VecType>(tC_gAux);
+        Tensor tC_cAux_vec = tensor<1>(zipped_divide(tC_cAux, MCL.compose(Int<V>{})));
+        auto predicate_fn = [&] (auto&&... coords) { return elem_less(tC_cAux_vec(coords...), residue_tC_cAux); };
+        copy_if(predicate_fn, tC_rAux_vec, tC_gAux_vec);
+      }
+      // sub-byte vectorization, must serialize threads
+      else {
+        // Assumes no inter-warp sharing of bytes (most copy layouts should satisfy this)
+        int lane_idx = canonical_lane_idx();
+        auto predicate_fn = [&] (auto&&... coords) { return elem_less(tC_cAux(coords...), residue_tC_cAux); };
+        CUTLASS_PRAGMA_NO_UNROLL
+        for (int i = 0; i < NumThreadsPerWarp; ++i) { // one lane stores per iteration, with a warp sync in between
+          if (lane_idx == i) {
+            copy_if(predicate_fn, tC_rAux, tC_gAux);
+          }
+          __syncwarp();
+        }
+      }
+    }
+  };
+  // Builds the aux tensors for this tile and wraps the child callbacks together with them.
+  template <
+    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
+    class... Args
+  >
+  CUTLASS_DEVICE auto
+  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
+    // Unpack params
+    auto const& [params_input_aux, params_compute] = params;
+    auto const& [params_input, params_aux] = params_input_aux;
+
+    auto [M, N, K, L] = args.problem_shape_mnkl;
+    auto [m, n, k, l] = args.tile_coord_mnkl;
+    gmem_ptr ptr_aux = make_gmem_ptr(subbyte_iterator<cutlass::uint1b_t>(params_aux.ptr_aux)); // sub-byte iterator over the 1-bit aux data
+    Tensor mAux = make_tensor(ptr_aux, make_layout(make_shape(M,N,L), params_aux.dAux));                     // (M,N,L)
+    Tensor gAux = local_tile(mAux, take<0,2>(args.tile_shape_mnk), make_coord(m,n,l));                 // (CTA_M,CTA_N)
+
+    Tensor tC_gAux = sm90_partition_for_epilogue<ReferenceSrc>(                        // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
+                      gAux, args.epi_tile, args.tiled_copy, args.thread_idx);
+    Tensor tC_rAux = make_tensor<cutlass::uint1b_t>(shape(tC_gAux));                   // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
+
+    auto callbacks_impl = Impl::template get_consumer_store_callbacks<ReferenceSrc>(args);
+    return ConsumerStoreCallbacks<decltype(tC_rAux), decltype(tC_gAux), decltype(args.tCcD), decltype(args.residue_tCcD), decltype(callbacks_impl)>(
+        cute::move(tC_rAux), cute::move(tC_gAux), args.tCcD, args.residue_tCcD, params, cute::move(callbacks_impl));
+  }
+};
+
+// Aux load for uint1b_t: the consumer copies the bits from the gmem tensor itself; no producer load, no shared storage
+template <
+  int Stages,
+  class EpilogueTile,
+  class StrideMNL,
+  class SmemLayoutAtom,
+  class CopyOpS2R,
+  int Alignment,
+  bool EnableNullptr
+>
+struct Sm90AuxLoad<
+  Stages,
+  EpilogueTile,
+  cutlass::uint1b_t,
+  StrideMNL,
+  SmemLayoutAtom,
+  CopyOpS2R,
+  Alignment,
+  EnableNullptr
+> {
+  static_assert(Alignment % 128 == 0, "sub-16B alignment not supported yet"); // elements are 1 bit wide, so 128 elements == 16 bytes
+
+  struct SharedStorage {};
+
+  struct Arguments {
+    cutlass::uint1b_t const* ptr_aux = nullptr;            // aux bit tensor to read
+    cutlass::uint1b_t null_default = cutlass::uint1b_t(0); // fill value used when ptr_aux is null and EnableNullptr is set
+    StrideMNL dAux = {};                                   // stride of the aux tensor
+  };
+
+  using Params = Arguments; // arguments are used as params unchanged
+  // Returns args verbatim; problem_shape and workspace are unused.
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    return args;
+  }
+  // Always reports the node as implementable.
+  template <class ProblemShape>
+  static bool
+  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
+    return true;
+  }
+  // No workspace is requested.
+  template <class ProblemShape>
+  static size_t
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
+    return 0;
+  }
+  // Nothing to initialize; always returns kSuccess.
+  template <class ProblemShape>
+  static cutlass::Status
+  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    return cutlass::Status::kSuccess;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Sm90AuxLoad() { }
+  // Copies params into the member below; the shared storage argument is unused.
+  CUTLASS_HOST_DEVICE
+  Sm90AuxLoad(Params const& params, SharedStorage const&)
+      : params(params) { }
+
+  Params const params; // held by value; the consumer callbacks keep a reference to this member
+  // The producer warp has nothing to load for this node.
+  CUTLASS_DEVICE bool
+  is_producer_load_needed() const {
+    return false;
+  }
+  // This node never requires the C tensor.
+  CUTLASS_DEVICE bool
+  is_C_load_needed() const {
+    return false;
+  }
+  // Producer side is a no-op.
+  template <class... Args>
+  CUTLASS_DEVICE auto
+  get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
+    return EmptyProducerLoadCallbacks{};
+  }
+  // Consumer callbacks: copy aux bits from tC_gAux into tC_rAux (in begin() or begin_loop()) and return them from visit().
+  template <class RTensor, class GTensor, class CTensor, class ThrResidue>
+  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
+    CUTLASS_DEVICE
+    ConsumerStoreCallbacks(RTensor&& tC_rAux_, GTensor&& tC_gAux_, CTensor tC_cAux_, ThrResidue residue_tC_cAux_, Params const& params_)
+      : tC_rAux(cute::forward<RTensor>(tC_rAux_)),
+        tC_gAux(cute::forward<GTensor>(tC_gAux_)),
+        tC_cAux(tC_cAux_),
+        residue_tC_cAux(residue_tC_cAux_),
+        params(params_) {}
+
+    RTensor tC_rAux;                                                                   // (CPY,CPY_M,CPY_N,{EPI_M,EPI_N})
+    GTensor tC_gAux;                                                                   // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
+    CTensor tC_cAux;                                                                   // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
+    ThrResidue residue_tC_cAux; // bound compared against tC_cAux coordinates to predicate the copies
+    Params const& params;
+    // Rank-5 register tensor: the whole tile is copied once, before the epilogue loop.
+    CUTLASS_DEVICE void
+    begin() {
+      if constexpr (decltype(cute::rank(tC_rAux))::value == 5) {
+        if constexpr (EnableNullptr) {
+          if (params.ptr_aux == nullptr) { // nothing to read; tC_rAux keeps whatever it was filled with at creation
+            return;
+          }
+        }
+
+        constexpr auto MCL = decltype(max_common_layout(tC_rAux, tC_gAux)){};
+        constexpr int V = cute::min(Alignment, size(MCL));
+        if constexpr (V > 1) { // vectorized copy through V-bit unsigned integers
+          using VecType = uint_bit_t<V>;
+          Tensor tC_gAux_vec = recast<VecType>(tC_gAux);
+          Tensor tC_rAux_vec = recast<VecType>(tC_rAux);
+          Tensor tC_cAux_vec = tensor<1>(zipped_divide(tC_cAux, MCL.compose(Int<V>{})));
+          auto predicate_fn = [&] (auto&&... coords) { return elem_less(tC_cAux_vec(coords...), residue_tC_cAux); };
+          copy_if(predicate_fn, tC_gAux_vec, tC_rAux_vec);
+        }
+        else {
+          auto predicate_fn = [&] (auto&&... coords) { return elem_less(tC_cAux(coords...), residue_tC_cAux); };
+          copy_if(predicate_fn, tC_gAux, tC_rAux);
+        }
+      }
+    }
+    // Rank-3 register tensor: only the (epi_m, epi_n) subtile is copied, once per loop iteration.
+    CUTLASS_DEVICE void
+    begin_loop(int epi_m, int epi_n) {
+      if constexpr (decltype(cute::rank(tC_rAux))::value == 3) {
+        if constexpr (EnableNullptr) {
+          if (params.ptr_aux == nullptr) {
+            return;
+          }
+        }
+
+        auto predicate_fn = [&] (auto&&... coords) { return elem_less(tC_cAux(_,_,_,epi_m,epi_n)(coords...), residue_tC_cAux); };
+        copy_if(predicate_fn, tC_gAux(_,_,_,epi_m,epi_n), tC_rAux);
+      }
+    }
+    // Returns the epi_v-th fragment of the register tensor; frg_acc is not used.
+    template <typename ElementAccumulator, int FragmentSize>
+    CUTLASS_DEVICE auto
+    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
+      using ElementRegister = typename remove_cvref_t<RTensor>::value_type;
+      if constexpr (decltype(cute::rank(tC_rAux))::value == 3) {
+        return recast<Array<ElementRegister, FragmentSize>>(coalesce(tC_rAux))(epi_v); // tensor already holds just the current subtile
+      }
+      else {
+        return recast<Array<ElementRegister, FragmentSize>>(coalesce(tC_rAux(_,_,_,epi_m,epi_n)))(epi_v);
+      }
+    }
+  };
+  // Builds the gmem aux tensor for this tile, chooses the register representation, and creates the callbacks.
+  template <
+    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
+    class... Args
+  >
+  CUTLASS_DEVICE auto
+  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
+
+    auto [M, N, K, L] = args.problem_shape_mnkl;
+    auto [m, n, k, l] = args.tile_coord_mnkl;
+    gmem_ptr ptr_aux = make_gmem_ptr(subbyte_iterator<cutlass::uint1b_t const>(params.ptr_aux));
+    Tensor mAux = make_tensor(ptr_aux, make_layout(make_shape(M,N,L), params.dAux));                         // (M,N,L)
+    Tensor gAux = local_tile(mAux, take<0,2>(args.tile_shape_mnk), make_coord(m,n,l));                 // (CTA_M,CTA_N)
+
+    Tensor tC_gAux = sm90_partition_for_epilogue<ReferenceSrc>(                        // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
+                      gAux, args.epi_tile, args.tiled_copy, args.thread_idx);
+
+    // If byte-unaligned vectorization, store in registers as uint32_t to reduce redundant pack+unpack instruction sequences
+    constexpr int V = decltype(max_common_vector(tC_gAux.layout(), make_layout(tC_gAux.shape())))::value;
+    Tensor tC_rAux = [&] () {
+      if constexpr (V % 8 != 0) {
+        return make_tensor<uint32_t>(take<0,3>(shape(tC_gAux)));                       // (CPY,CPY_M,CPY_N)
+      } else {
+        return make_tensor<cutlass::uint1b_t>(shape(tC_gAux));                         // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
+      }
+    }();
+
+    if constexpr (EnableNullptr) {
+      if (params.ptr_aux == nullptr) {
+        fill(tC_rAux, params.null_default); // null pointer: registers are pre-filled and the copies above are skipped
+      }
+    }
+
+    return ConsumerStoreCallbacks<decltype(tC_rAux), decltype(tC_gAux), decltype(args.tCcD), decltype(args.residue_tCcD)>(
+        cute::move(tC_rAux), cute::move(tC_gAux), args.tCcD, args.residue_tCcD, params);
+  }
+};
+
+// dReLU specialization: visit() takes the input fragment and the aux fragment, and only the input is converted
+template<
+  class ElementOutput,
+  class ElementCompute,
+  FloatRoundStyle RoundStyle
+>
+struct Sm90Compute<
+  cutlass::epilogue::thread::dReLU,
+  ElementOutput,
+  ElementCompute,
+  RoundStyle
+> : Sm90VisitorImpl<> {
+
+  using Sm90VisitorImpl<>::Sm90VisitorImpl; // inherit the base constructors
+
+  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
+    template <typename ElementAccumulator, typename ElementInput, typename ElementAux, int FragmentSize>
+    CUTLASS_DEVICE Array<ElementOutput, FragmentSize>
+    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n,
+          Array<ElementInput      , FragmentSize> const& frg_input,
+          Array<ElementAux        , FragmentSize> const& frg_aux) {
+      using ConvertInput = NumericArrayConverter<ElementCompute, ElementInput, FragmentSize, RoundStyle>;
+      using ComputeOutput = cutlass::epilogue::thread::dReLU<Array<ElementCompute, FragmentSize>>;
+      using ConvertOutput = NumericArrayConverter<ElementOutput, ElementCompute, FragmentSize, RoundStyle>;
+      ConvertInput convert_input{};
+      ComputeOutput compute_output{};
+      ConvertOutput convert_output{};
+      // Input goes to ElementCompute, dReLU combines it with the raw aux fragment, result goes to ElementOutput.
+      return convert_output(compute_output(convert_input(frg_input), frg_aux)); // don't convert frg_aux for dReLU
+    }
+  };
+  // The callbacks hold no state, so args is unused.
+  template <
+    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
+    class... Args
+  >
+  CUTLASS_DEVICE auto
+  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
+    return ConsumerStoreCallbacks();
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::epilogue::fusion
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp
new file mode 100755
index 000000000..a22bed4e0
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp
@@ -0,0 +1,1415 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+  \brief Visitor tree load operations for the sm90 TMA warp-specialized (ws) epilogue
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/arch/barrier.h"
+#include "cutlass/epilogue/collective/detail.hpp"
+
+#include "cute/tensor.hpp"
+#include "sm90_visitor_tma_warpspecialized.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::epilogue::fusion {
+
+using namespace cute;
+using namespace detail;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Elementwise Fetch Operations
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// returns accumulator: visit() hands back the accumulator fragment it receives, unchanged
+struct Sm90AccFetch : Sm90VisitorImpl<> {
+
+  using Sm90VisitorImpl<>::Sm90VisitorImpl; // inherit the base constructors
+
+  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
+    template <typename ElementAccumulator, int FragmentSize>
+    CUTLASS_DEVICE Array<ElementAccumulator, FragmentSize>
+    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
+      return frg_acc; // epi_v, epi_m and epi_n are not used
+    }
+  };
+  // The callbacks hold no state, so args is unused.
+  template <
+    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
+    class... Args
+  >
+  CUTLASS_DEVICE auto
+  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
+    return ConsumerStoreCallbacks{};
+  }
+};
+
+// Split tree visitor fetches intermediate results from temporary accumulators (plain alias of Sm90AccFetch)
+using Sm90SplitTreeFetch = Sm90AccFetch;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// returns C: visit() reads fragments from the source tensor args.tCrC; Element == void means there is no source
+template <class Element>
+struct Sm90SrcFetch : Sm90VisitorImpl<> {
+  // A producer load is needed exactly when C is needed.
+  CUTLASS_DEVICE bool
+  is_producer_load_needed() const {
+    return is_C_load_needed();
+  }
+  // C is needed unless Element is void.
+  CUTLASS_DEVICE bool
+  is_C_load_needed() const {
+    return not is_void_v<Element>;
+  }
+  // The node counts as zero only when Element is void.
+  CUTLASS_DEVICE bool
+  is_zero() const {
+    return is_void_v<Element>;
+  }
+
+  using Sm90VisitorImpl<>::Sm90VisitorImpl; // inherit the base constructors
+
+  template<class SrcTensor>
+  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
+    CUTLASS_DEVICE
+    ConsumerStoreCallbacks(SrcTensor const& tCrC)
+      : tCrC(tCrC) {}
+
+    SrcTensor const& tCrC;                                                                         // (CPY,CPY_M,CPY_N)
+    // Returns the epi_v-th fragment of tCrC, typed by the tensor's own value_type; frg_acc, epi_m and epi_n are unused.
+    template <typename ElementAccumulator, int FragmentSize>
+    CUTLASS_DEVICE Array<typename SrcTensor::value_type, FragmentSize>
+    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
+      return recast<Array<typename SrcTensor::value_type, FragmentSize>>(tCrC)(epi_v);
+    }
+
+  };
+  // The callbacks keep a reference to args.tCrC rather than a copy.
+  template <
+    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
+    class... Args
+  >
+  CUTLASS_DEVICE auto
+  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
+    // register type may differ from logical type so we can't assert matching types here
+    return ConsumerStoreCallbacks(args.tCrC);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Elementwise Load Operations
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  int Stages,
+  class EpilogueTile,
+  class Element,
+  class StrideMNL,
+  class SmemLayoutAtom,
+  class CopyOpS2R,
+  int Alignment = 128 / sizeof_bits_v<Element>,
+  bool EnableNullptr = true // Fallback scalar broadcast for nullptr params
+>
+struct Sm90AuxLoad {
+  static_assert(Alignment * sizeof_bits_v<Element> % 128 == 0, "sub-16B alignment not supported yet");
+
+  constexpr static bool is_m_major = epilogue::collective::detail::is_m_major<StrideMNL>();
+  // Find the max contiguous layout usable by TMA (if EpilogueTile is a non-compact tiler)
+  using SmemShapeTma = decltype(make_shape(
+      max_common_vector(make_layout(get<0>(EpilogueTile{})),make_layout(get<0>(EpilogueTile{}))),
+      max_common_vector(make_layout(get<1>(EpilogueTile{})),make_layout(get<1>(EpilogueTile{})))));
+  using SmemLayoutTma = decltype(tile_to_shape(
+      SmemLayoutAtom{}, SmemShapeTma{},
+      cute::conditional_t<is_m_major, Step<_2,_1>, Step<_1,_2>>{} ));
+  using SmemLayout = decltype(tile_to_shape(
+      SmemLayoutTma{},
+      make_shape(size<0>(shape(EpilogueTile{})), size<1>(shape(EpilogueTile{})), Int<Stages>{}),
+      cute::conditional_t<is_m_major, Step<_2,_1,_3>, Step<_1,_2,_3>>{} ));
+  using CopyOpG2S =
+      SM90_TMA_LOAD
+    ;
+
+  struct SharedStorage {
+    alignas(cutlass::detail::alignment_for_swizzle(SmemLayout{}))
+    array_aligned<Element, size(SmemLayout{})> smem_aux;
+  };
+
+  struct Arguments {
+    Element const* ptr_aux = nullptr;
+    Element null_default = Element(0);
+    StrideMNL dAux = {};
+  };
+
+  struct Params {
+    using TMA_Aux = decltype(make_tma_copy(
+        CopyOpG2S{},
+        make_tensor(make_gmem_ptr(static_cast<Element const*>(nullptr)), repeat_like(StrideMNL{}, int32_t(0)), append<3>(StrideMNL{}, _0{})),
+        take<0,2>(SmemLayoutTma{})));
+    TMA_Aux tma_load_aux;
+    Element null_default = Element(0);
+    bool use_default = false;
+  };
+
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    // Optionally append 1s until problem shape is rank-4 in case it is only rank-3 (MNK)
+    auto problem_shape_mnkl = append<4>(problem_shape, 1);
+    auto [M, N, K, L] = problem_shape_mnkl;
+    auto M_AUX =
+        size(M)
+      ;
+    Tensor tensor_aux = make_tensor(make_gmem_ptr(args.ptr_aux), make_layout(make_shape(M_AUX,N,L), append<3>(args.dAux, _0{})));
+    typename Params::TMA_Aux tma_load_aux = make_tma_copy(CopyOpG2S{}, tensor_aux, take<0,2>(SmemLayoutTma{}));
+
+    bool use_default = false;
+    if constexpr (EnableNullptr) {
+      use_default = args.ptr_aux == nullptr;
+    }
+
+    return Params{tma_load_aux, args.null_default, use_default};
+  }
+
+  template <class ProblemShape>
+  static bool
+  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
+    return true;
+  }
+
+  template <class ProblemShape>
+  static size_t
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
+    return 0;
+  }
+
+  template <class ProblemShape>
+  static cutlass::Status
+  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    return cutlass::Status::kSuccess;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Sm90AuxLoad() { }
+
+  // Stores the address of `params` (it is not copied, so it must outlive this object) and a mutable
+  // pointer to the smem_aux buffer inside `shared_storage` (const is cast away so it can be written).
+  CUTLASS_HOST_DEVICE
+  Sm90AuxLoad(Params const& params, SharedStorage const& shared_storage)
+      : params_ptr(&params),
+        smem_aux(const_cast<Element*>(shared_storage.smem_aux.data())) { }
+
+  Params const* params_ptr;
+  Element* smem_aux;
+
+  // Always true for this variant: the aux tile is loaded by ProducerLoadCallbacks::step.
+  CUTLASS_DEVICE bool
+  is_producer_load_needed() const {
+    return true;
+  }
+
+  // Always false: this node does not request a load of C.
+  CUTLASS_DEVICE bool
+  is_C_load_needed() const {
+    return false;
+  }
+
+  // True only when the default-fill path is active and the fill value equals Element(0).
+  CUTLASS_DEVICE bool
+  is_zero() const {
+    if (!params_ptr->use_default) {
+      return false;
+    }
+    return params_ptr->null_default == Element(0);
+  }
+
+  // Producer-side callbacks: hold the TMA-partitioned gmem and smem views of the aux operand and issue
+  // one TMA load per epilogue subtile.
+  template <class GTensor, class STensor>
+  struct ProducerLoadCallbacks : EmptyProducerLoadCallbacks {
+    CUTLASS_DEVICE
+    ProducerLoadCallbacks(GTensor&& gAux_, STensor&& sAux_, Params const* params_)
+      : bGS_gAux(cute::forward<GTensor>(gAux_)),
+        bGS_sAux(cute::forward<STensor>(sAux_)),
+        params_ptr(params_) {}
+
+    GTensor bGS_gAux;                                                                  // (TMA,TMA_M,TMA_N,EPI_M,EPI_N)
+    STensor bGS_sAux;                                                                  // (TMA,TMA_M,TMA_N,PIPE)
+    Params const* params_ptr;
+
+    // Loads subtile (epi_m, epi_n) into pipeline stage `load_iteration % Stages`.
+    // Does nothing on the default-fill path or when issue_tma_load is false.
+    CUTLASS_DEVICE void
+    step(uint64_t* full_mbarrier_ptr, int epi_m, int epi_n, int load_iteration, bool issue_tma_load) {
+      if constexpr (EnableNullptr) {
+        if (params_ptr->use_default) {
+          return;
+        }
+      }
+
+      if (!issue_tma_load) {
+        return;
+      }
+
+      // Tell the current stage's mbarrier how many bytes this subtile's transaction will deliver
+      constexpr uint32_t subtile_bytes = size(take<0,2>(SmemLayout{})) * sizeof_bits_v<Element> / 8;
+      cutlass::arch::ClusterTransactionBarrier::expect_transaction(full_mbarrier_ptr, subtile_bytes);
+
+      // Issue the TMA load with an empty multicast mask
+      constexpr uint16_t mcast_mask = 0;
+      int stage = load_iteration % Stages;
+      copy(params_ptr->tma_load_aux.with(*full_mbarrier_ptr, mcast_mask),
+        bGS_gAux(_,_,_,epi_m,epi_n), bGS_sAux(_,_,_,stage));
+    }
+  };
+
+  // Builds the producer callbacks for the current CTA tile: takes the TMA tensor of the aux operand,
+  // tiles it down to this CTA and its epilogue subtiles, and partitions both the gmem view and the smem
+  // staging buffer with slice 0 of the TMA copy.
+  template <class... Args>
+  CUTLASS_DEVICE auto
+  get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
+    auto [M, N, K, L] = args.problem_shape_mnkl;
+    auto [m, n, k, l] = args.tile_coord_mnkl;
+    auto tile_shape_mn = take<0,2>(args.tile_shape_mnk);
+    auto tile_coord_mnl = make_coord(m, n, l);
+
+    Tensor aux_tma_mnl = params_ptr->tma_load_aux.get_tma_tensor(make_shape(M,N,L));   // (M,N,L)
+    Tensor aux_mnl = coalesce(aux_tma_mnl, tile_shape_mn);
+    Tensor aux_cta = local_tile(aux_mnl, tile_shape_mn, tile_coord_mnl);               // (CTA_M,CTA_N)
+
+    Tensor aux_gmem_epi = flat_divide(aux_cta, args.epi_tile);                         // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N)
+    Tensor aux_smem_epi = make_tensor(make_smem_ptr(smem_aux), SmemLayout{});          // (EPI_TILE_M,EPI_TILE_N,PIPE)
+
+    ThrCopy g2s_slice = params_ptr->tma_load_aux.get_slice(_0{});
+    Tensor src_gmem = g2s_slice.partition_S(aux_gmem_epi);                             // (TMA,TMA_M,TMA_N,EPI_M,EPI_N)
+    Tensor dst_smem = g2s_slice.partition_D(aux_smem_epi);                             // (TMA,TMA_M,TMA_N,PIPE)
+
+    return ProducerLoadCallbacks<decltype(src_gmem), decltype(dst_smem)>(
+      cute::move(src_gmem), cute::move(dst_smem), params_ptr);
+  }
+
+  // Consumer-side callbacks: copy the staged aux subtile from shared memory into registers (or fill the
+  // registers with the default value) and hand it out fragment by fragment.
+  template <class RTensor, class TiledS2R, class STensorS2R>
+  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
+    // Initializers are listed in member declaration order
+    CUTLASS_DEVICE
+    ConsumerStoreCallbacks(RTensor&& rAux_, TiledS2R s2r_copy_, STensorS2R&& sAux_, Params const* params_)
+      : tiled_s2r(s2r_copy_),
+        tC_rAux(cute::forward<RTensor>(rAux_)),
+        tSR_sAux(cute::forward<STensorS2R>(sAux_)),
+        params_ptr(params_) { }
+
+    TiledS2R tiled_s2r;
+    RTensor tC_rAux;                                                                          // (CPY,CPY_M,CPY_N)
+    STensorS2R tSR_sAux;                                                                      // (S2R,S2R_M,S2R_N,PIPE)
+    Params const* params_ptr;
+
+    // Populates tC_rAux for one subtile before visit() is called on it.
+    CUTLASS_DEVICE void
+    previsit(int epi_m, int epi_n, int load_iteration, bool is_producer_load_needed) {
+      // Default-fill path: nothing was loaded, so broadcast the fallback value instead
+      if constexpr (EnableNullptr) {
+        if (params_ptr->use_default) {
+          fill(tC_rAux, params_ptr->null_default);
+          return;
+        }
+      }
+
+      // View the register fragment through the layout the smem->register copy expects as its source retile
+      using S2RRegLayout = decltype(cute::layout(TiledS2R{}.get_slice(0).retile_S(RTensor{})));
+      Tensor rAux_s2r_view = make_tensor(tC_rAux.data(), S2RRegLayout{});                    // (S2R,S2R_M,S2R_N)
+
+      int stage = load_iteration % Stages;
+      copy(tiled_s2r, tSR_sAux(_,_,_,stage), rAux_s2r_view);
+    }
+
+    // Returns fragment `epi_v` of the register-resident aux data; the accumulator fragment is not read.
+    template <typename ElementAccumulator, int FragmentSize>
+    CUTLASS_DEVICE Array<Element, FragmentSize>
+    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
+      Tensor aux_fragments = recast<Array<Element, FragmentSize>>(coalesce(tC_rAux));        // (EPI_V)
+
+      return aux_fragments(epi_v);
+    }
+  };
+
+  // Builds the consumer callbacks for the calling thread: a register fragment shaped like this thread's
+  // share of one epilogue subtile, a tiled smem->register copy derived from args.tiled_copy, and the
+  // thread's partition of the shared-memory staging buffer.
+  template <
+    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
+    class... Args
+  >
+  CUTLASS_DEVICE auto
+  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
+
+    auto [M, N, K, L] = args.problem_shape_mnkl;
+
+    // tC_gAux is only used below for its shape, which sizes the register fragment
+    Tensor mAux_mn = params_ptr->tma_load_aux.get_tma_tensor(make_shape(M,N,L));                             // (M,N,L)
+    Tensor mAux = coalesce(mAux_mn, take<0,2>(args.tile_shape_mnk));
+    Tensor tC_gAux = sm90_partition_for_epilogue<ReferenceSrc                          // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
+      >(mAux, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx);
+    Tensor tC_rAux = make_tensor<Element>(take<0,3>(shape(tC_gAux)));                  // (CPY,CPY_M,CPY_N)
+
+    // Pick the S- or D-based tiled copy depending on which side the register tensors reference
+    auto tiled_s2r = conditional_return<ReferenceSrc>(
+      make_tiled_copy_S(Copy_Atom<CopyOpS2R,Element>{}, args.tiled_copy),
+      make_tiled_copy_D(Copy_Atom<CopyOpS2R,Element>{}, args.tiled_copy)
+    );
+    Tensor sAux_epi = cute::as_position_independent_swizzle_tensor(
+                        make_tensor(make_smem_ptr(smem_aux), SmemLayout{}));            // (EPI_TILE_M,EPI_TILE_N,PIPE)
+    auto tSR_sAux = tiled_s2r.get_slice(args.thread_idx).partition_S(sAux_epi);               // (S2R,S2R_M,S2R_N,PIPE)
+
+    return ConsumerStoreCallbacks<decltype(tC_rAux), decltype(tiled_s2r), decltype(tSR_sAux)>(
+        cute::move(tC_rAux), tiled_s2r, cute::move(tSR_sAux), params_ptr);
+  }
+};
+
+// Specialization of Sm90AuxLoad for Stages == 0.
+// No shared memory and no producer load are used; the consumer copies the aux operand from global
+// memory into registers itself in ConsumerStoreCallbacks::begin_loop.
+template <
+  class Element,
+  class EpilogueTile,   // Unused
+  class LayoutOrStrideMNL,
+  class SmemLayoutAtom, // Unused
+  class CopyOpS2R,      // Unused
+  int Alignment,
+  bool EnableNullptr
+>
+struct Sm90AuxLoad<
+  0, EpilogueTile, Element, LayoutOrStrideMNL, 
+  SmemLayoutAtom, CopyOpS2R, Alignment, EnableNullptr
+> {
+  using ElementAux = Element;
+  using StrideMNL = cutlass::gemm::TagToStrideC_t<LayoutOrStrideMNL>;
+
+  // Empty: this variant stages nothing in shared memory
+  struct SharedStorage { };
+
+  struct Arguments {
+    // Global-memory pointer to the aux operand
+    Element const* ptr_aux = nullptr;
+    // Fill value used when EnableNullptr is set and ptr_aux is null
+    Element null_default = Element(0);
+    // Strides of the aux operand, applied over the (M,N,L) shape
+    StrideMNL dAux = {};
+  };
+
+  // Arguments are passed to the device unchanged
+  using Params = Arguments;
+
+  // Returns args as-is; problem_shape and workspace are not used.
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    return args;
+  }
+
+  // Always reports the configuration as implementable.
+  template <class ProblemShape>
+  static bool
+  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
+    return true;
+  }
+
+  // No workspace is requested.
+  template <class ProblemShape>
+  static size_t
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
+    return 0;
+  }
+
+  // No-op; always returns kSuccess.
+  template <class ProblemShape>
+  static cutlass::Status
+  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    return cutlass::Status::kSuccess;
+  }
+
+  // Default constructor; params_ptr is left uninitialized.
+  CUTLASS_HOST_DEVICE
+  Sm90AuxLoad() { }
+
+  // Stores the address of `params` (not a copy); shared_storage is unused.
+  CUTLASS_HOST_DEVICE
+  Sm90AuxLoad(Params const& params, SharedStorage const& shared_storage)
+    : params_ptr(&params) { }
+  
+  Params const* params_ptr;
+
+  // Always false: the consumer reads global memory directly.
+  CUTLASS_DEVICE bool
+  is_producer_load_needed() const {
+    return false;
+  }
+
+  // Always false: this node does not request a load of C.
+  CUTLASS_DEVICE bool
+  is_C_load_needed() const {
+    return false;
+  }
+
+  // The producer has nothing to do for this variant.
+  template <class... Args>
+  CUTLASS_DEVICE auto
+  get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
+    return EmptyProducerLoadCallbacks{};
+  }
+
+  // Consumer-side callbacks holding this thread's gmem partition, register fragment, matching
+  // coordinate partition (for bounds predication) and the (M,N,L) problem extents.
+  template<
+    class GTensorG2R,
+    class RTensor,
+    class CTensorG2R,
+    class ProblemShapeMNL
+  >
+  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
+    CUTLASS_DEVICE
+    ConsumerStoreCallbacks(GTensorG2R&& tC_gAux,
+        RTensor&& tC_rAux,
+        CTensorG2R&& tC_cAux,
+        ProblemShapeMNL problem_shape_mnl,
+        Params const* params_ptr)
+      : tC_gAux(cute::forward<GTensorG2R>(tC_gAux)),
+        tC_rAux(cute::forward<RTensor>(tC_rAux)),
+        tC_cAux(cute::forward<CTensorG2R>(tC_cAux)),
+        problem_shape_mnl(problem_shape_mnl),
+        params_ptr(params_ptr) {}
+    
+    GTensorG2R tC_gAux;
+    RTensor tC_rAux;
+    CTensorG2R tC_cAux;
+    ProblemShapeMNL problem_shape_mnl;
+    Params const* params_ptr;
+
+    // Loads subtile (epi_m, epi_n) of the aux operand into tC_rAux.
+    CUTLASS_DEVICE void
+    begin_loop(int epi_m, int epi_n) {
+      // Null aux pointer: fill the fragment with the default value instead of reading memory
+      if constexpr (EnableNullptr) {
+        if (params_ptr->ptr_aux == nullptr) {
+          fill(tC_rAux, params_ptr->null_default);
+          return;
+        }
+      }
+      // Vector width V: the common layout of the gmem subtile and the register fragment, capped at Alignment
+      constexpr auto MCL = decltype(max_common_layout(tC_gAux(_,_,_,_0{},_0{}), tC_rAux)){};
+      constexpr int V = cute::min(Alignment, size(MCL));
+
+      // Coordinate tensor regrouped so each predicate covers one V-wide vector
+      Tensor tC_cAux_mn = tC_cAux(_,_,_,epi_m,epi_n);
+      Tensor tC_cAux_vec = tensor<1>(zipped_divide(coalesce(tC_cAux_mn), MCL.compose(Int<V>{})));
+      
+      // Reinterpret source and destination as arrays of V elements
+      Tensor tC_gAux_vec = recast<Array<Element, V>>(coalesce(tC_gAux(_,_,_,epi_m,epi_n)));
+      Tensor tC_rAux_vec = recast<Array<Element, V>>(coalesce(tC_rAux));
+
+      // Copy a vector only when its coordinate lies inside the problem extents
+      auto pred_fn = [&] (auto const&... coords) {
+        return elem_less(tC_cAux_vec(coords...), problem_shape_mnl);
+      };
+
+      copy_if(pred_fn, tC_gAux_vec, tC_rAux_vec);
+    }
+
+    // Returns fragment `epi_v` of the register-resident aux data; the accumulator fragment is not read.
+    template <typename ElementAccumulator, int FragmentSize>
+    CUTLASS_DEVICE Array<Element, FragmentSize>
+    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
+      return recast<Array<Element, FragmentSize>>(tC_rAux)(epi_v);
+    }
+  };
+
+  // Builds the consumer callbacks for the calling thread: partitions the gmem aux tensor and an
+  // identity (coordinate) tensor of the same shape the same way, and allocates a register fragment
+  // sized for one epilogue subtile.
+  template <
+    bool ReferenceSrc,
+    class... Args
+  >
+  CUTLASS_DEVICE auto
+  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
+    auto [M, N, K, L] = args.problem_shape_mnkl;
+    auto [m, n, k, l] = args.tile_coord_mnkl;
+
+    auto problem_shape_mnl = make_shape(M,N,L);
+
+    // Gmem Tensor
+    Tensor mAux = make_tensor(
+      make_gmem_ptr(params_ptr->ptr_aux), make_shape(M,N,L), params_ptr->dAux
+    );
+    Tensor tC_gAux = sm90_partition_for_epilogue<ReferenceSrc>(
+      mAux, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx);
+
+    // Register Tensor
+    Tensor tC_rAux = make_tensor<Element>(take<0,3>(shape(tC_gAux)));
+
+    // Predication support
+    Tensor coordAux = make_identity_tensor(shape(mAux));
+    Tensor tC_cAux = sm90_partition_for_epilogue<ReferenceSrc>(
+      coordAux, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx);
+
+    return ConsumerStoreCallbacks<decltype(tC_gAux), decltype(tC_rAux), decltype(tC_cAux), decltype(problem_shape_mnl)>(
+      cute::move(tC_gAux),
+      cute::move(tC_rAux),
+      cute::move(tC_cAux),
+      problem_shape_mnl,
+      params_ptr
+    );
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Broadcast Load Operations
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Scalar broadcast
+// Produces a single Element value and replicates it across every fragment returned by visit().
+// Up to BroadcastCount scalars, each given by value or by pointer, are folded together with ReductionFn,
+// which supports fusions such as fp8 scaling factors.
+template<
+  class Element,
+  class StrideMNL_ = Stride<_0,_0,_0>,
+  int BroadcastCount = 1,
+  template <class> class ReductionFn = multiplies
+>
+struct Sm90ScalarBroadcast {
+  using StrideMNL = StrideMNL_;
+  // The M and N strides must be static zeros; the batch (L) stride can be dynamic or static
+  static_assert(is_static_v<decltype(take<0,2>(StrideMNL{}))>);
+  static_assert(take<0,2>(StrideMNL{}) == Stride<_0,_0>{});
+
+  // Empty: no shared memory is used
+  struct SharedStorage { };
+
+  struct Arguments {
+    // By-value scalars, read when the matching entry of scalar_ptrs is null
+    Element scalars[BroadcastCount] = {};
+    // Optional pointers to the scalars; a non-null entry takes precedence over the by-value scalar
+    Element const* scalar_ptrs[BroadcastCount] = {};
+    // Per-broadcast strides; the L stride scales the batch coordinate into an element offset
+    StrideMNL dScalar[BroadcastCount] = {};
+  };
+
+  // Arguments are passed to the device unchanged
+  using Params = Arguments;
+
+  // Returns args as-is; problem_shape and workspace are not used.
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    return args;
+  }
+
+  // Always reports the configuration as implementable.
+  template <class ProblemShape>
+  static bool
+  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
+    return true;
+  }
+
+  // No workspace is requested.
+  template <class ProblemShape>
+  static size_t
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
+    return 0;
+  }
+
+  // No-op; always returns kSuccess.
+  template <class ProblemShape>
+  static cutlass::Status
+  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
+    CudaHostAdapter *cuda_adapter = nullptr) {
+    return cutlass::Status::kSuccess;
+  }
+
+  // Always false: the scalar is read directly, not through the producer pipeline.
+  CUTLASS_DEVICE bool
+  is_producer_load_needed() const {
+    return false;
+  }
+
+  // Always false: this node does not request a load of C.
+  CUTLASS_DEVICE bool
+  is_C_load_needed() const {
+    return false;
+  }
+
+  // This must be called after update_scalar is called
+  CUTLASS_DEVICE bool
+  is_zero() const {
+    bool const single_batch = (get<2>(params_ptr->dScalar[0]) == 0);
+
+    // Multiple batches and no scalar fetched yet:
+    // for stridedBatch kernel, if ptr has a valid address, we need to enable the epi_load warps.
+    if (!single_batch && !valid_scalar) {
+      return params_ptr->scalar_ptrs[0] == nullptr;
+    }
+
+    // Either there is only 1 batch, or the scalar of the current batch has already been fetched
+    return scalar == Element(0);
+  }
+
+  // Default constructor; params_ptr and scalar are left uninitialized.
+  CUTLASS_HOST_DEVICE
+  Sm90ScalarBroadcast() { }
+
+  // Stores the address of `params` (not a copy); shared_storage is unused.
+  CUTLASS_HOST_DEVICE
+  Sm90ScalarBroadcast(Params const& params, SharedStorage const& shared_storage)
+      : params_ptr(&params) {
+    // A zero batch stride means the scalar is the same for every batch, so fetch it once here
+    if (size<2>(params_ptr->dScalar[0]) == 0) {
+      update_scalar();
+    }
+  }
+
+  Element scalar;
+  bool valid_scalar = false;
+  Params const* params_ptr;
+
+  // Refreshes the scalar for this tile's batch when the batch stride is non-zero; the producer itself
+  // has no work to do.
+  template <class... Args>
+  CUTLASS_DEVICE auto
+  get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
+    if (size<2>(params_ptr->dScalar[0]) != 0) {
+      auto [tile_m, tile_n, tile_k, tile_l] = args.tile_coord_mnkl;
+      update_scalar(tile_l);
+    }
+
+    return EmptyProducerLoadCallbacks{};
+  }
+
+  // Consumer-side callbacks: carry a copy of the scalar and splat it into each fragment.
+  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
+    CUTLASS_DEVICE
+    ConsumerStoreCallbacks(Element scalar)
+      : scalar(scalar) {}
+
+    Element scalar;
+
+    // Returns a fragment with every element set to the scalar; the accumulator fragment is not read.
+    template <typename ElementAccumulator, int FragmentSize>
+    CUTLASS_DEVICE Array<Element, FragmentSize>
+    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
+      Array<Element, FragmentSize> splat;
+      splat.fill(scalar);
+
+      return splat;
+    }
+
+  };
+
+  // Refreshes the scalar for this tile's batch when the batch stride is non-zero, then returns
+  // callbacks holding a copy of it.
+  template <
+    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
+    class... Args
+  >
+  CUTLASS_DEVICE auto
+  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
+    if (get<2>(params_ptr->dScalar[0]) != 0) {
+      auto [tile_m, tile_n, tile_k, tile_l] = args.tile_coord_mnkl;
+      update_scalar(tile_l);
+    }
+
+    return ConsumerStoreCallbacks(scalar);
+  }
+
+private:
+  // Recomputes `scalar` for batch l_coord: seeds it from broadcast 0, then folds in broadcasts
+  // 1..BroadcastCount-1 with ReductionFn. Marks the scalar as valid.
+  CUTLASS_DEVICE void
+  update_scalar(int l_coord = 0) {
+    valid_scalar = true;
+
+    Element const* first_ptr = params_ptr->scalar_ptrs[0];
+    if (first_ptr == nullptr) {
+      // batch stride is ignored for nullptr fallback
+      scalar = params_ptr->scalars[0];
+    }
+    else {
+      int first_offset = l_coord * size<2>(params_ptr->dScalar[0]);
+      scalar = first_ptr[first_offset];
+    }
+
+    // Do reduction over multiple broadcasts if necessary
+    ReductionFn<Element> combine;
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 1; idx < BroadcastCount; ++idx) {
+      Element const* next_ptr = params_ptr->scalar_ptrs[idx];
+      if (next_ptr == nullptr) {
+        // batch stride is ignored for nullptr fallback
+        scalar = combine(scalar, params_ptr->scalars[idx]);
+      }
+      else {
+        int next_offset = l_coord * size<2>(params_ptr->dScalar[idx]);
+        scalar = combine(scalar, next_ptr[next_offset]);
+      }
+    }
+  }
+
+  // Overload selected when the batch coordinate is a tuple (multiple L-modes).
+  template<class... Xs>
+  CUTLASS_DEVICE void
+  update_scalar(cute::tuple<Xs...>) {
+    // Only support multiple L-modes with fully-broadcast scalar
+    valid_scalar = true;
+    scalar = params_ptr->scalars[0];
+  }
+};
+
+// Scalar broadcast with optional per-batch pointer arrays
+// Each broadcast can be supplied, in decreasing priority, as an array of per-batch pointers
+// (scalar_ptr_arrays), as a single pointer indexed by batch offset (scalar_ptrs), or by value (scalars).
+// Supports reduction over multiple broadcasts to support fusions such as fp8 scaling factors
+template<
+  class Element,
+  class StrideMNL = Stride<_0,_0,_0>,
+  int BroadcastCount = 1,
+  template <class> class ReductionFn = multiplies
+>
+struct Sm90ScalarBroadcastPtrArray {
+  static_assert(is_static_v<decltype(take<0,2>(StrideMNL{}))>); // batch stride can be dynamic or static
+  static_assert(take<0,2>(StrideMNL{}) == Stride<_0,_0>{});
+
+  // Empty: no shared memory is used
+  struct SharedStorage { };
+
+  struct Arguments {
+    // By-value scalars, used when both pointer forms of the same broadcast are null
+    Element scalars[BroadcastCount] = {};
+    // Optional pointers to the scalars, indexed by batch offset
+    Element const* scalar_ptrs[BroadcastCount] = {};
+    // Optional arrays of pointers, one pointer per batch offset; highest priority when non-null
+    Element const* const* scalar_ptr_arrays[BroadcastCount] = {};
+    // Per-broadcast strides; the L stride scales the batch coordinate into an offset
+    StrideMNL dScalar[BroadcastCount] = {};
+  };
+
+  // Arguments are passed to the device unchanged
+  using Params = Arguments;
+
+  // Returns args as-is; problem_shape and workspace are not used.
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    return args;
+  }
+
+  // Always reports the configuration as implementable.
+  template <class ProblemShape>
+  static bool
+  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
+    return true;
+  }
+
+  // No workspace is requested.
+  template <class ProblemShape>
+  static size_t
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
+    return 0;
+  }
+
+  // No-op; always returns kSuccess.
+  template <class ProblemShape>
+  static cutlass::Status
+  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
+    CudaHostAdapter *cuda_adapter = nullptr) {
+    return cutlass::Status::kSuccess;
+  }
+
+  CUTLASS_DEVICE bool
+  is_producer_load_needed() const {
+    // producer load is needed if Element is not void and we have multiple scalars
+    return !cute::is_void_v<Element> and size<2>(params_ptr->dScalar[0]) != 0;
+  }
+
+  // Always false: this node does not request a load of C.
+  CUTLASS_DEVICE bool
+  is_C_load_needed() const {
+    return false;
+  }
+
+  // This must be called after update_scalar is called
+  CUTLASS_DEVICE bool
+  is_zero() const {
+    return scalar == Element(0);
+  }
+
+  // Default constructor; params_ptr and scalar are left uninitialized.
+  CUTLASS_HOST_DEVICE
+  Sm90ScalarBroadcastPtrArray() { }
+
+  // Stores the address of `params` (not a copy); shared_storage is unused.
+  CUTLASS_HOST_DEVICE
+  Sm90ScalarBroadcastPtrArray(Params const& params, SharedStorage const& shared_storage)
+      : params_ptr(&params) {
+    // Get the scalar for non-batched broadcast
+    if (size<2>(params_ptr->dScalar[0]) == 0) {
+      update_scalar();
+    }
+  }
+
+  Element scalar;
+  Params const* params_ptr;
+
+  // Refreshes the scalar for this tile's batch when the batch stride is non-zero; no producer
+  // callbacks are needed beyond that.
+  template <class... Args>
+  CUTLASS_DEVICE auto
+  get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
+    // Get the scalar for batched broadcast
+    if (get<2>(params_ptr->dScalar[0]) != 0) {
+      auto [m_coord, n_coord, k_coord, l_coord] = args.tile_coord_mnkl;
+      update_scalar(l_coord);
+    }
+
+    return EmptyProducerLoadCallbacks{};
+  }
+
+  // Consumer-side callbacks: carry a copy of the scalar and splat it into each fragment.
+  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
+    CUTLASS_DEVICE
+    ConsumerStoreCallbacks(Element scalar)
+      : scalar(scalar) {}
+
+    Element scalar;
+
+    // Returns a fragment with every element set to the scalar; the accumulator fragment is not read.
+    template <typename ElementAccumulator, int FragmentSize>
+    CUTLASS_DEVICE Array<Element, FragmentSize>
+    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
+      Array<Element, FragmentSize> frg_scalar;
+      frg_scalar.fill(scalar);
+
+      return frg_scalar;
+    }
+
+  };
+
+  // Refreshes the scalar for this tile's batch when the batch stride is non-zero, then returns
+  // callbacks holding a copy of it.
+  template <
+    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
+    class... Args
+  >
+  CUTLASS_DEVICE auto
+  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
+
+    // Get the scalar for batched broadcast
+    if (get<2>(params_ptr->dScalar[0]) != 0) {
+      auto [m_coord, n_coord, k_coord, l_coord] = args.tile_coord_mnkl;
+      update_scalar(l_coord);
+    }
+
+    return ConsumerStoreCallbacks(scalar);
+  }
+
+private:
+  // Recomputes `scalar` for batch l_coord.
+  // For every broadcast exactly one source is used, in priority order:
+  // scalar_ptr_arrays[i], then scalar_ptrs[i], then the by-value scalars[i].
+  CUTLASS_DEVICE void
+  update_scalar(int l_coord = 0) {
+    int l_offset = l_coord * size<2>(params_ptr->dScalar[0]);
+
+    if (params_ptr->scalar_ptr_arrays[0] != nullptr) {
+      scalar = *(params_ptr->scalar_ptr_arrays[0][l_offset]);
+    }
+    else if (params_ptr->scalar_ptrs[0] != nullptr) {
+      scalar = params_ptr->scalar_ptrs[0][l_offset];
+    }
+    else {
+      // batch stride is ignored for nullptr fallback
+      scalar = params_ptr->scalars[0];
+    }
+
+    // Do reduction over multiple broadcasts if necessary
+    ReductionFn<Element> reduction_fn;
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 1; i < BroadcastCount; ++i) {
+
+      if (params_ptr->scalar_ptr_arrays[i] != nullptr) {
+        int rest_l_offset = l_coord * size<2>(params_ptr->dScalar[i]);
+        scalar = reduction_fn(scalar, *(params_ptr->scalar_ptr_arrays[i][rest_l_offset]));
+      }
+      // The branches must be mutually exclusive, as for broadcast 0 above. Without this `else`, a
+      // broadcast given only through a pointer array was reduced a second time with the by-value
+      // scalars[i], which defaults to zero.
+      else if (params_ptr->scalar_ptrs[i] != nullptr) {
+        int rest_l_offset = l_coord * size<2>(params_ptr->dScalar[i]);
+        scalar = reduction_fn(scalar, params_ptr->scalar_ptrs[i][rest_l_offset]);
+      }
+      else {
+        // batch stride is ignored for nullptr fallback
+        scalar = reduction_fn(scalar, params_ptr->scalars[i]);
+      }
+    }
+  }
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+// Deprecated stage-count helper for row broadcast (see the deprecation message).
+// Returns ceil(StagesC / size<1> of the CTA (M,N) tile divided by EpilogueTile) + 1.
+template <int StagesC, class CtaTileShapeMNK, class EpilogueTile>
+[[deprecated("row broadcast only uses 0 stages")]] constexpr int
+compute_row_broadcast_stages() {
+  // Layout over the CTA's (M,N) extents, split by the epilogue tile
+  auto cta_tile_mn = make_layout(take<0,2>(CtaTileShapeMNK{}));
+  auto epi_subtiles = zipped_divide(cta_tile_mn, EpilogueTile{});
+  return ceil_div(StagesC, size<1>(epi_subtiles)) + 1;
+}
+
+} // namespace detail
+
+// Row vector broadcast
+// The consumer threads stage one CTA_N-wide row of the input in shared memory (begin), convert it to
+// ElementCompute in registers once per column of epilogue subtiles (begin_loop), and return it
+// fragment by fragment (visit). A null row pointer falls back to a scalar default when EnableNullptr is set.
+template<
+  int Stages,
+  class CtaTileShapeMNK,
+  class ElementInput,
+  class ElementCompute = ElementInput,
+  class StrideMNL_ = Stride<_0,_1,_0>,
+  int Alignment = 128 / sizeof_bits_v<ElementInput>,
+  bool EnableNullptr = true // Fallback scalar broadcast for nullptr params
+>
+struct Sm90RowBroadcast {
+  using StrideMNL = StrideMNL_;
+  static_assert(Stages == 0, "Row broadcast doesn't support smem pipelining");
+
+  static constexpr bool IsDynamicBroadcast = is_same_v<remove_cvref_t<decltype(get<1>(StrideMNL{}))>, bool>; // row vector or scalar broadcast
+  static_assert(is_static_v<decltype(take<0,2>(StrideMNL{}))> || IsDynamicBroadcast); // batch stride can be dynamic or static
+  static_assert(take<0,2>(StrideMNL{}) == Stride<_0,_1>{} || IsDynamicBroadcast);
+
+  // Staging buffer holding one row of CTA_N input elements
+  struct SharedStorage {
+    array_aligned<ElementInput, size<1>(CtaTileShapeMNK{})> smem;
+  };
+
+  struct Arguments {
+    // Global-memory pointer to the row vector
+    ElementInput const* ptr_row = nullptr;
+    // Value broadcast instead when EnableNullptr is set and ptr_row is null
+    ElementInput null_default = ElementInput(0);
+    // Strides of the row vector
+    StrideMNL dRow = {};
+  };
+
+  // Same as Arguments, except that null_default is already converted to ElementCompute
+  struct Params {
+    ElementInput const* ptr_row = nullptr;
+    ElementCompute null_default = ElementCompute(0);
+    StrideMNL dRow = {};
+  };
+
+  // Converts Arguments to Params; problem_shape and workspace are not used.
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    return {args.ptr_row, ElementCompute(args.null_default), args.dRow};
+  }
+
+  // Always reports the configuration as implementable.
+  template <class ProblemShape>
+  static bool
+  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
+    return true;
+  }
+
+  // No workspace is requested.
+  template <class ProblemShape>
+  static size_t
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
+    return 0;
+  }
+
+  // No-op; always returns kSuccess.
+  template <class ProblemShape>
+  static cutlass::Status
+  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    return cutlass::Status::kSuccess;
+  }
+
+  // Default constructor; members keep their in-class defaults.
+  CUTLASS_HOST_DEVICE
+  Sm90RowBroadcast() { }
+
+  // Copies `params`, records a mutable pointer to the staging buffer, and precomputes is_zero_.
+  CUTLASS_HOST_DEVICE
+  Sm90RowBroadcast(Params const& params, SharedStorage const& shared_storage)
+      : params(params), is_zero_(false),
+        smem(const_cast<ElementInput*>(shared_storage.smem.data())) {
+    auto const& [stride_M, stride_N, stride_L] = params.dRow;
+    // Nullptr default
+    if (EnableNullptr && params.ptr_row == nullptr) {
+      is_zero_ = params.null_default == ElementCompute(0);
+    }
+    // Dynamic non-batched scalar broadcast
+    else if (IsDynamicBroadcast && stride_N == bool(0) && stride_L == repeat_like(stride_L, 0)) {
+      is_zero_ = params.ptr_row[0] == ElementInput(0);
+    }
+  }
+
+  Params params;
+  bool is_zero_ = false;
+  ElementInput *smem = nullptr;
+
+  // Always false: the consumer threads load the row themselves.
+  CUTLASS_DEVICE bool
+  is_producer_load_needed() const {
+    return false;
+  }
+
+  // Always false: this node does not request a load of C.
+  CUTLASS_DEVICE bool
+  is_C_load_needed() const {
+    return false;
+  }
+
+  // Returns the value precomputed in the constructor.
+  CUTLASS_DEVICE bool
+  is_zero() const {
+    return is_zero_;
+  }
+
+  // The producer has nothing to do for this node.
+  template <class... Args>
+  CUTLASS_DEVICE auto
+  get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
+    return EmptyProducerLoadCallbacks{};
+  }
+
+  // Consumer-side callbacks. The GS_* tensors describe the gmem->smem staging copy and the SR_*
+  // tensors describe the smem->register copy.
+  template <class GS_GTensor, class GS_STensor, class GS_CTensor, class Tiled_G2S, class SR_STensor, class SR_RTensor, class Residue, class ThrNum>
+  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
+    CUTLASS_DEVICE
+    ConsumerStoreCallbacks(
+        GS_GTensor tGS_gRow_, GS_STensor tGS_sRow_,
+        GS_CTensor tGS_cRow_, Tiled_G2S tiled_g2s_,
+        SR_STensor tSR_sRow_, SR_RTensor tSR_rRow_,
+        Residue residue_cRow_, ThrNum thr_num_, Params const& params_)
+      : tGS_gRow(tGS_gRow_)
+      , tGS_sRow(tGS_sRow_)
+      , tGS_cRow(tGS_cRow_)
+      , tiled_G2S(tiled_g2s_)
+      , tSR_sRow(tSR_sRow_)
+      , tSR_rRow(tSR_rRow_)
+      , residue_cRow(residue_cRow_)
+      // Store the thread count that was passed in. Previously the argument was dropped and thr_num was
+      // default-initialized, which is only correct while ThrNum is a stateless static integer type.
+      , thr_num(thr_num_)
+      , params(params_)
+      , is_nullptr(EnableNullptr && params_.ptr_row == nullptr) {
+      // Null row pointer: the register row is filled once with the default and never reloaded
+      if (is_nullptr) {
+        fill(tSR_rRow, params.null_default);
+      }
+    }
+
+    GS_GTensor tGS_gRow;                                                         // (CPY,CPY_M,CPY_N)
+    GS_STensor tGS_sRow;                                                         // (CPY,CPY_M,CPY_N)
+    GS_CTensor tGS_cRow;                                                         // (CPY,CPY_M,CPY_N)
+    Tiled_G2S tiled_G2S;
+
+    SR_STensor tSR_sRow;                                                         // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
+    SR_RTensor tSR_rRow;                                                         // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
+
+    Residue residue_cRow;                                                        // (m, n)
+    ThrNum thr_num;                                                              // thread count passed to the named barrier in begin()
+    Params const& params;
+    bool is_nullptr;
+
+    // Stages this CTA's slice of the row vector in shared memory and synchronizes the participating threads.
+    CUTLASS_DEVICE void
+    begin() {
+      if (is_nullptr) {
+        return;
+      }
+
+      auto synchronize = [&] () { cutlass::arch::NamedBarrier::sync(thr_num, cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); };
+      // Drop the stride-0 (broadcast) modes so each row element is visited once
+      Tensor tGS_gRow_flt = filter_zeros(tGS_gRow);
+      Tensor tGS_sRow_flt = filter_zeros(tGS_sRow);
+      Tensor tGS_cRow_flt = filter_zeros(tGS_cRow, tGS_gRow.stride());
+
+      for (int i = 0; i < size(tGS_gRow_flt); ++i) {
+        if (get<1>(tGS_cRow_flt(i)) >= size<1>(CtaTileShapeMNK{})) {
+          continue; // OOB of SMEM
+        }
+        if (elem_less(tGS_cRow_flt(i), residue_cRow)) {
+          tGS_sRow_flt(i) = tGS_gRow_flt(i);
+        }
+        else {
+          tGS_sRow_flt(i) = ElementInput(0); // Set to Zero when OOB so LDS can be issued without any preds.
+        }
+      }
+      synchronize();
+    }
+
+    // Loads the row values of the current column of subtiles from shared memory into registers and
+    // converts them to ElementCompute. Runs only for the first subtile along M.
+    CUTLASS_DEVICE void
+    begin_loop(int epi_m, int epi_n) {
+      if (epi_m == 0 and not is_nullptr) { // Assumes M-major subtile loop
+        Tensor tSR_sRow_flt = filter_zeros(tSR_sRow(_,_,_,epi_m,epi_n));
+        Tensor tSR_rRow_flt = make_tensor_like<ElementInput>(tSR_sRow_flt);
+        copy_aligned(tSR_sRow_flt, tSR_rRow_flt);
+
+        constexpr int FrgSize = size(tSR_rRow_flt);
+        using FrgInput = Array<ElementInput, FrgSize>;
+        using FrgCompute = Array<ElementCompute, FrgSize>;
+        using ConvertInput = NumericArrayConverter<ElementCompute, ElementInput, FrgSize>;
+
+        // Convert the whole filtered row in one array conversion
+        Tensor tSR_rRow_input_frg = recast<FrgInput>(coalesce(tSR_rRow_flt));
+        Tensor tSR_rRow_compute_frg = recast<FrgCompute>(filter(tSR_rRow));
+        ConvertInput convert_input{};
+
+        tSR_rRow_compute_frg(_0{}) = convert_input(tSR_rRow_input_frg(_0{}));
+      }
+    }
+
+    // Returns fragment `epi_v` of the register-resident row; the accumulator fragment is not read.
+    template <typename ElementAccumulator, int FragmentSize>
+    CUTLASS_DEVICE Array<ElementCompute, FragmentSize>
+    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
+      Array<ElementCompute, FragmentSize> frg_row;
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < FragmentSize; ++i) {
+        frg_row[i] = tSR_rRow(epi_v * FragmentSize + i);
+      }
+
+      return frg_row;
+    }
+  };
+
+  // Builds the consumer callbacks for the calling thread: the gmem and smem views of this CTA's row
+  // slice, their partitions for the gmem->smem staging copy (with matching coordinates for bounds
+  // checks), and the smem/register partitions used by the epilogue.
+  template <
+    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
+    class... Args
+  >
+  CUTLASS_DEVICE auto
+  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
+    auto [M, N, K, L] = args.problem_shape_mnkl;
+    auto [m, n, k, l] = args.tile_coord_mnkl;
+    using ThreadCount = decltype(size(args.tiled_copy));
+
+    // N-mode layout: compact strides for a row vector; all-zero strides when a dynamic broadcast
+    // selects scalar broadcast (runtime stride flag is false)
+    auto layout_N = [&] () {
+      auto shape_N = get<1>(args.problem_shape_mnkl);
+      if constexpr (IsDynamicBroadcast) {
+        auto stride_N = repeat_like(shape_N, int(0));
+        if (get<1>(params.dRow) == bool(1)) {
+          stride_N = transform_leaf(compact_major<LayoutLeft>(shape_N),
+            [] (auto const& stride) { return static_cast<int>(stride); }
+          );
+        }
+        return make_layout(shape_N, stride_N);
+      }
+      else {
+        return make_layout(shape_N);
+      }
+    }();
+
+    // The M mode is a pure broadcast (stride 0); the L mode uses the batch stride from dRow
+    auto layout_M = make_layout(M, repeat_like(M, _0{}));
+    auto layout_L = make_layout(L, get<2>(params.dRow));
+    Tensor mRow = make_tensor(make_gmem_ptr(params.ptr_row), make_layout(layout_M,layout_N,layout_L));
+    Tensor gRow = local_tile(mRow(_,_,l), take<0,2>(args.tile_shape_mnk), make_coord(m, n));          // (CTA_M, CTA_N)
+    Tensor sRow = make_tensor(make_smem_ptr(smem),
+        make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{})), make_shape(_0{}, _1{}));  // (CTA_M, CTA_N)
+    //// G2S: Gmem to Smem
+    auto tiled_g2s = make_tiled_copy(Copy_Atom<DefaultCopy, ElementInput>{},
+                                     Layout< Shape<_1, ThreadCount>,
+                                            Stride<_0,          _1>>{},
+                                     Layout<_1>{});
+    auto thr_g2s = tiled_g2s.get_slice(args.thread_idx);
+    Tensor tGS_gRow = thr_g2s.partition_S(gRow);
+    Tensor tGS_sRow = thr_g2s.partition_D(sRow);
+
+    //// G2S: Coord
+    Tensor tGS_cRow = thr_g2s.partition_S(args.cD);
+
+    //// S2R: Smem to Reg
+    Tensor tSR_sRow = sm90_partition_for_epilogue<ReferenceSrc>(sRow, args.epi_tile, args.tiled_copy, args.thread_idx);
+    Tensor tSR_rRow = make_tensor_like<ElementCompute>(take<0,3>(tSR_sRow));                        // (CPY,CPY_M,CPY_N)
+
+    return ConsumerStoreCallbacks(
+      tGS_gRow,
+      tGS_sRow,
+      tGS_cRow, tiled_g2s,
+      tSR_sRow,
+      tSR_rRow,
+      args.residue_cD,
+      ThreadCount{},
+      params);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Column vector broadcast: values vary along M (and optionally batch L) and are broadcast across N (stride-0 N mode)
+template<
+  int Stages,
+  class CtaTileShapeMNK,
+  class ElementInput,
+  class ElementCompute = ElementInput,
+  class StrideMNL_ = Stride<_1,_0,_0>,
+  int Alignment = 128 / sizeof_bits_v<ElementInput>,
+  bool EnableNullptr = true // Fallback scalar broadcast for nullptr params
+>
+struct Sm90ColBroadcast {
+  using StrideMNL = StrideMNL_;
+  static_assert(Stages == 0, "Column broadcast doesn't support smem pipelining");
+
+  static constexpr bool IsDynamicBroadcast = is_same_v<remove_cvref_t<decltype(get<0>(StrideMNL{}))>, bool>; // Column vector or scalar broadcast
+  static_assert(is_static_v<decltype(take<0,2>(StrideMNL{}))> || IsDynamicBroadcast); // batch stride can be dynamic or static
+  static_assert(take<0,2>(StrideMNL{}) == Stride<_1,_0>{} || IsDynamicBroadcast);
+
+  // Accumulator distributes col elements evenly amongst threads so we can just directly load from gmem
+  struct SharedStorage { };
+
+  struct Arguments {
+    ElementInput const* ptr_col = nullptr;        // source values; nullptr selects null_default when EnableNullptr is set
+    ElementInput null_default = ElementInput(0);  // value broadcast when ptr_col is nullptr
+    StrideMNL dCol = {};                          // (M,N,L) strides of the source
+  };
+
+  struct Params {
+    ElementInput const* ptr_col = nullptr;
+    ElementCompute null_default = ElementCompute(0); // already converted to the compute type
+    StrideMNL dCol = {};
+  };
+
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    return {args.ptr_col, ElementCompute(args.null_default), args.dCol};
+  }
+
+  template <class ProblemShape>
+  static bool
+  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
+    return true;
+  }
+
+  template <class ProblemShape>
+  static size_t
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
+    return 0;
+  }
+
+  template <class ProblemShape>
+  static cutlass::Status
+  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    return cutlass::Status::kSuccess;
+  }
+
+  CUTLASS_DEVICE bool
+  is_producer_load_needed() const {
+    return false;
+  }
+
+  CUTLASS_DEVICE bool
+  is_C_load_needed() const {
+    return false;
+  }
+
+  CUTLASS_DEVICE bool
+  is_zero() const {
+    return is_zero_;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Sm90ColBroadcast() { }
+
+  CUTLASS_HOST_DEVICE
+  Sm90ColBroadcast(Params const& params, SharedStorage const& shared_storage)
+      : params(params), is_zero_(false) {
+    auto const& [stride_M, stride_N, stride_L] = params.dCol;
+    // Nullptr default
+    if (EnableNullptr && params.ptr_col == nullptr) {
+      is_zero_ = params.null_default == ElementCompute(0);
+    }
+    // Dynamic non-batched scalar broadcast
+    else if (IsDynamicBroadcast && stride_M == bool(0) && stride_L == repeat_like(stride_L, 0)) {
+      is_zero_ = params.ptr_col[0] == ElementInput(0);
+    }
+  }
+
+  Params params;
+  bool is_zero_ = false; // initialized here so is_zero() is well-defined on a default-constructed node
+
+  template <class... Args>
+  CUTLASS_DEVICE auto
+  get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
+    return EmptyProducerLoadCallbacks{};
+  }
+  // Consumer callbacks: begin() loads and converts the values, visit() returns the slice for each fragment
+  template<class GTensor, class RTensor, class CTensor, class ThrResidue>
+  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
+    CUTLASS_DEVICE
+    ConsumerStoreCallbacks(GTensor tCgCol_, RTensor tCrCol_, CTensor tCcCol_, ThrResidue residue_tCcCol_, Params const& params_)
+      : tCgCol(tCgCol_),
+        tCrCol(tCrCol_),
+        tCcCol(tCcCol_),
+        residue_tCcCol(residue_tCcCol_),
+        params(params_) {
+      if (EnableNullptr && params.ptr_col == nullptr) {
+        fill(tCrCol, params.null_default);
+      }
+    }
+
+    GTensor tCgCol;                                                                    // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
+    RTensor tCrCol;                                                                    // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
+    CTensor tCcCol;                                                                    // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
+    ThrResidue residue_tCcCol;
+    Params const& params;
+
+    CUTLASS_DEVICE void
+    begin() {
+      if (EnableNullptr && params.ptr_col == nullptr) {
+        return;
+      }
+
+      // Filter so we don't issue redundant copies over stride-0 modes
+      // (only works if 0-strides are in same location, which is by construction)
+      Tensor tCgCol_flt = filter_zeros(tCgCol);
+      Tensor tCrCol_flt = make_tensor_like<ElementInput>(filter_zeros(tCrCol));
+      Tensor tCcCol_flt = filter_zeros(tCcCol, tCgCol.stride());
+
+      constexpr auto MCL = decltype(max_common_layout(tCgCol_flt, tCrCol_flt)){};
+      constexpr int V = cute::min(Alignment, size(MCL));
+      if constexpr (V > 1) {
+        using VecType = uint_bit_t<V * sizeof_bits_v<ElementInput>>;
+        Tensor tCgCol_vec = recast<VecType>(coalesce(tCgCol_flt));
+        Tensor tCrCol_vec = recast<VecType>(coalesce(tCrCol_flt));
+        Tensor tCcCol_vec = tensor<1>(zipped_divide(tCcCol_flt, MCL.compose(Int<V>{})));
+        auto pred_fn = [&] (auto const&... coords) { return elem_less(tCcCol_vec(coords...), residue_tCcCol); };
+        copy_if(pred_fn, tCgCol_vec, tCrCol_vec);
+      }
+      else {
+        auto pred_fn = [&] (auto const&... coords) { return elem_less(tCcCol_flt(coords...), residue_tCcCol); };
+        copy_if(pred_fn, tCgCol_flt, tCrCol_flt);
+      }
+      // Convert the loaded ElementInput values to ElementCompute as a single array and write them into tCrCol
+      constexpr int FrgSize = size(tCrCol_flt);
+      using FrgInput = Array<ElementInput, FrgSize>;
+      using FrgCompute = Array<ElementCompute, FrgSize>;
+      using ConvertInput = NumericArrayConverter<ElementCompute, ElementInput, FrgSize>;
+
+      Tensor tCrCol_input_frg = recast<FrgInput>(coalesce(tCrCol_flt));
+      Tensor tCrCol_compute_frg = recast<FrgCompute>(filter(tCrCol));
+      ConvertInput convert_input{};
+
+      tCrCol_compute_frg(_0{}) = convert_input(tCrCol_input_frg(_0{}));
+    }
+
+    template <typename ElementAccumulator, int FragmentSize>
+    CUTLASS_DEVICE Array<ElementCompute, FragmentSize>
+    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
+      Array<ElementCompute, FragmentSize> frg_col;
+      Tensor tCrCol_mn = tCrCol(_,_,_,epi_m,epi_n);
+      // Copy out fragment number epi_v of the (epi_m, epi_n) subtile; frg_acc is not used by this node
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < FragmentSize; ++i) {
+        frg_col[i] = tCrCol_mn(epi_v * FragmentSize + i);
+      }
+
+      return frg_col;
+    }
+
+  };
+
+  template <
+    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
+    class... Args
+  >
+  CUTLASS_DEVICE auto
+  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
+    // Build the (M,N,L) gmem view: N is stride-0; the M stride is compact or, if dynamic, chosen from params.dCol
+    auto [M, N, K, L] = args.problem_shape_mnkl;
+    auto layout_M = [&] () {
+      auto shape_M = get<0>(args.problem_shape_mnkl);
+      if constexpr (IsDynamicBroadcast) {
+        auto stride_M = repeat_like(shape_M, int(0));
+        if (get<0>(params.dCol) == bool(1)) {
+          stride_M = transform_leaf(compact_major<LayoutLeft>(shape_M),
+            [] (auto const& stride) { return static_cast<int>(stride); }
+          );
+        }
+        return make_layout(shape_M, stride_M);
+      }
+      else {
+        return make_layout(shape_M);
+      }
+    }();
+
+    auto layout_N = make_layout(N, repeat_like(N, _0{}));
+    auto layout_L = make_layout(L, get<2>(params.dCol));
+    Tensor mCol = make_tensor(make_gmem_ptr(params.ptr_col), make_layout(layout_M,layout_N,layout_L));
+    Tensor tCgCol = sm90_partition_for_epilogue<ReferenceSrc>(                         // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
+      mCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx);
+    // NOTE(review): the register tensor is shaped from a compact-M view, presumably so its layout ignores the runtime M stride
+    Tensor mCol_static = make_tensor(make_gmem_ptr(params.ptr_col), make_layout(make_layout(M),layout_N,layout_L));
+    Tensor tCgCol_static = sm90_partition_for_epilogue<ReferenceSrc>(                  // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
+      mCol_static, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx);
+    Tensor tCrCol = make_tensor_like<ElementCompute>(tCgCol_static);                   // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
+
+    return ConsumerStoreCallbacks(tCgCol, tCrCol, args.tCcD, args.residue_tCcD, params);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Batch matrix broadcast: alias of Sm90AuxLoad. Only need to redefine this if we can multicast across cluster L
+// NOTE(review): Alignment is now forwarded; assumes Sm90AuxLoad takes (..., CopyOpS2R, Alignment, EnableNullptr) - confirm
+template <
+  int Stages,
+  class EpilogueTile,
+  class Element,
+  class StrideMNL,
+  class SmemLayoutAtom,
+  class CopyOpS2R,
+  int Alignment = 128 / sizeof_bits_v<Element>,
+  bool EnableNullptr = true // Fallback scalar broadcast for nullptr params
+>
+using Sm90MatrixBroadcast
+  = Sm90AuxLoad<Stages, EpilogueTile, Element, StrideMNL, SmemLayoutAtom, CopyOpS2R, Alignment, EnableNullptr>;
+
+namespace detail {
+
+// Trait: value is true only when Op::StrideMNL has static zero strides in both the M and N modes.
+template <class Op, class Enable = void>
+struct IsScalarBroadcast { static constexpr bool value = false; };
+
+template <class Op>
+struct IsScalarBroadcast<
+  Op,
+  cute::enable_if_t<is_same_v<decltype(take<0,2>(typename Op::StrideMNL{})), Stride<_0,_0>>>
+> { static constexpr bool value = true; };
+
+} // namespace detail
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::epilogue::fusion
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_store_tma_warpspecialized.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_store_tma_warpspecialized.hpp
new file mode 100755
index 000000000..f9ebe7393
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_store_tma_warpspecialized.hpp
@@ -0,0 +1,1736 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+  \brief Visitor tree store operations for the sm90 TMA warp-specialized (ws) epilogue
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/workspace.h"
+
+#include "cute/tensor.hpp"
+#include "sm90_visitor_tma_warpspecialized.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::epilogue::fusion {
+
+using namespace cute;
+using namespace detail;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Elementwise Store Operations
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Aux tensor store, primary template (Stages != 0): fragments go registers -> smem pipeline stage -> gmem via TMA store
+template <
+  int Stages,
+  class EpilogueTile,
+  class Element,
+  FloatRoundStyle RoundStyle,
+  class StrideMNL,
+  class SmemLayoutAtom,
+  class CopyOpR2S,
+  int Alignment = 128 / sizeof_bits_v<Element>,
+  bool EnableNullptr = true // Noop on nullptr params
+>
+struct Sm90AuxStore {
+  using ElementAux = Element;
+  static_assert(Alignment * sizeof_bits_v<Element> % 128 == 0, "sub-16B alignment not supported yet");
+
+  constexpr static bool is_m_major = epilogue::collective::detail::is_m_major<StrideMNL>();
+  // Find the max contiguous layout usable by TMA (if EpilogueTile is a non-compact tiler)
+  using SmemShapeTma = decltype(make_shape(
+      max_common_vector(make_layout(get<0>(EpilogueTile{})),make_layout(get<0>(EpilogueTile{}))),
+      max_common_vector(make_layout(get<1>(EpilogueTile{})),make_layout(get<1>(EpilogueTile{})))));
+  using SmemLayoutTma = decltype(tile_to_shape(
+      SmemLayoutAtom{}, SmemShapeTma{},
+      cute::conditional_t<is_m_major, Step<_2,_1>, Step<_1,_2>>{} ));
+  using SmemLayout = decltype(tile_to_shape(
+      SmemLayoutTma{},
+      make_shape(size<0>(shape(EpilogueTile{})), size<1>(shape(EpilogueTile{})), Int<Stages>{}),
+      cute::conditional_t<is_m_major, Step<_2,_1,_3>, Step<_1,_2,_3>>{} ));
+
+  struct SharedStorage {
+    alignas(cutlass::detail::alignment_for_swizzle(SmemLayout{}))
+    array_aligned<Element, size(SmemLayout{})> smem_aux; // staging buffer, one epilogue tile per pipeline stage
+  };
+
+  struct Arguments {
+    Element* ptr_aux = nullptr; // destination tensor; nullptr makes the store a no-op when EnableNullptr is set
+    StrideMNL dAux = {};        // (M,N,L) strides of the destination
+  };
+
+  struct Params {
+    using TMA_Aux = decltype(make_tma_copy(
+        SM90_TMA_STORE{},
+        make_tensor(static_cast<Element*>(nullptr), repeat_like(StrideMNL{}, int32_t(0)), StrideMNL{}),
+        SmemLayoutTma{}));
+    TMA_Aux tma_store_aux;
+    bool is_nullptr = false; // set when EnableNullptr is on and Arguments::ptr_aux was nullptr
+  };
+
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    // Optionally append 1s until problem shape is rank-4 in case it is only rank-3 (MNK)
+    auto problem_shape_mnkl = append<4>(problem_shape, 1);
+    auto [M, N, K, L] = problem_shape_mnkl;
+
+    bool is_nullptr = false;
+    if constexpr (EnableNullptr) {
+      is_nullptr = args.ptr_aux == nullptr;
+    }
+    // The TMA copy object is only built from the real tensor when a destination pointer was supplied
+    typename Params::TMA_Aux tma_store_aux;
+    if (not is_nullptr) {
+      Tensor tensor_aux = make_tensor(args.ptr_aux, make_layout(make_shape(M,N,L), args.dAux));
+      tma_store_aux = make_tma_copy(SM90_TMA_STORE{}, tensor_aux, SmemLayoutTma{});
+    }
+
+    return {tma_store_aux, is_nullptr};
+  }
+
+  template <class ProblemShape>
+  static bool
+  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
+    return true;
+  }
+
+  template <class ProblemShape>
+  static size_t
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
+    return 0;
+  }
+
+  template <class ProblemShape>
+  static cutlass::Status
+  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    return cutlass::Status::kSuccess;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Sm90AuxStore() { }
+
+  CUTLASS_HOST_DEVICE
+  Sm90AuxStore(Params const& params, SharedStorage const& shared_storage)
+      : params_ptr(&params),
+        smem_aux(const_cast<Element*>(shared_storage.smem_aux.data())) { }
+
+  Params const* params_ptr;
+  Element* smem_aux;
+
+  CUTLASS_DEVICE bool
+  is_producer_load_needed() const {
+    return false;
+  }
+
+  CUTLASS_DEVICE bool
+  is_C_load_needed() const {
+    return false;
+  }
+
+  template <class... Args>
+  CUTLASS_DEVICE auto
+  get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
+    return EmptyProducerLoadCallbacks{};
+  }
+  // Callbacks bundling the register fragment with the smem/gmem partitions built in get_consumer_store_callbacks
+  template <
+    class RTensor,
+    class TiledR2S,
+    class STensorR2S,
+    class STensorS2G,
+    class GTensorS2G
+  >
+  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
+    CUTLASS_DEVICE
+    ConsumerStoreCallbacks(
+          RTensor&& tC_rAux,
+          TiledR2S tiled_r2s,
+          STensorR2S&& tRS_sAux,
+          STensorS2G&& bSG_sAux,
+          GTensorS2G&& bSG_gAux,
+          Params const* params_ptr)
+      : tiled_r2s(tiled_r2s),
+        tC_rAux(cute::forward<RTensor>(tC_rAux)),
+        tRS_sAux(cute::forward<STensorR2S>(tRS_sAux)),
+        bSG_sAux(cute::forward<STensorS2G>(bSG_sAux)),
+        bSG_gAux(cute::forward<GTensorS2G>(bSG_gAux)),
+        params_ptr(params_ptr) {}
+
+    TiledR2S tiled_r2s;
+    RTensor tC_rAux;                                                                   // (CPY,CPY_M,CPY_N)
+    STensorR2S tRS_sAux;                                                               // (R2S,R2S_M,R2S_N,PIPE)
+    STensorS2G bSG_sAux;                                                               // (S2G,S2G_M,S2G_N,PIPE)
+    GTensorS2G bSG_gAux;                                                               // (S2G,S2G_M,S2G_N,EPI_M,EPI_N)
+    Params const* params_ptr;
+    // Converts frg_input to Element, stores it at index epi_v of tC_rAux, and returns frg_input unchanged
+    template <typename ElementAccumulator, typename ElementInput, int FragmentSize>
+    CUTLASS_DEVICE auto
+    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n,
+          Array<ElementInput, FragmentSize> const& frg_input) {
+      using ConvertInput = NumericArrayConverter<Element, ElementInput, FragmentSize, RoundStyle>;
+      ConvertInput convert_input{};
+
+      Tensor tC_rAux_frg = recast<Array<Element, FragmentSize>>(coalesce(tC_rAux));                          // (EPI_V)
+      tC_rAux_frg(epi_v) = convert_input(frg_input);
+
+      return frg_input;
+    }
+    // Copies the register fragment into smem stage (store_iteration % Stages) when issue_smem_store is set
+    CUTLASS_DEVICE void
+    postreduce(int epi_m, int epi_n, int store_iteration, bool issue_smem_store) {
+      if constexpr (EnableNullptr) {
+        if (params_ptr->is_nullptr) {
+          return;
+        }
+      }
+
+      using RLayoutR2S = decltype(cute::layout(TiledR2S{}.get_slice(0).retile_S(RTensor{})));
+      Tensor tRS_rAux = make_tensor(tC_rAux.data(), RLayoutR2S{});                                 // (R2S,R2S_M,R2S_N)
+
+      if (issue_smem_store) {
+        int store_pipe_index = store_iteration % Stages;
+        copy(tiled_r2s, tRS_rAux, tRS_sAux(_,_,_,store_pipe_index));
+      }
+    }
+    // Issues the TMA store of smem stage (store_iteration % Stages) to the (epi_m, epi_n) gmem subtile
+    CUTLASS_DEVICE void
+    tma_store(int epi_m, int epi_n, int store_iteration, bool issue_tma_store) {
+      if constexpr (EnableNullptr) {
+        if (params_ptr->is_nullptr) {
+          return;
+        }
+      }
+
+      if (issue_tma_store) {
+        // Issue the TMA store
+        int store_pipe_index = store_iteration % Stages;
+        copy(params_ptr->tma_store_aux, bSG_sAux(_,_,_,store_pipe_index), bSG_gAux(_,_,_,epi_m,epi_n));
+      }
+    }
+  };
+
+  template <
+    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
+    class... Args
+  >
+  CUTLASS_DEVICE auto
+  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
+    // Slice this CTA's tile out of the TMA tensor, then build the register, smem (R2S) and TMA (S2G) partitions
+    auto [M, N, K, L] = args.problem_shape_mnkl;
+    auto [m, n, k, l] = args.tile_coord_mnkl;
+    Tensor mAux = params_ptr->tma_store_aux.get_tma_tensor(make_shape(M,N,L));                               // (M,N,L)
+    Tensor gAux = local_tile(mAux, take<0,2>(args.tile_shape_mnk), make_coord(m,n,l));                 // (CTA_M,CTA_N)
+
+    Tensor tC_gAux = sm90_partition_for_epilogue<ReferenceSrc>(                        // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
+                      gAux, args.epi_tile, args.tiled_copy, args.thread_idx);
+    Tensor tC_rAux = make_tensor<Element>(take<0,3>(shape(tC_gAux)));                  // (CPY,CPY_M,CPY_N)
+
+    Tensor sAux_epi = cute::as_position_independent_swizzle_tensor(
+                        make_tensor(make_smem_ptr(smem_aux), SmemLayout{}));     // (EPI_TILE_M,EPI_TILE_N,PIPE)
+    Tensor gAux_epi = flat_divide(gAux, args.epi_tile);                          // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N)
+
+    auto tiled_r2s = conditional_return<ReferenceSrc>(
+      make_tiled_copy_S(Copy_Atom<CopyOpR2S,Element>{}, args.tiled_copy),
+      make_tiled_copy_D(Copy_Atom<CopyOpR2S,Element>{}, args.tiled_copy)
+    );
+    auto tRS_sAux = tiled_r2s.get_slice(args.thread_idx).partition_D(sAux_epi);               // (R2S,R2S_M,R2S_N,PIPE)
+
+    ThrCopy thrblk_s2g = params_ptr->tma_store_aux.get_slice(_0{});
+    Tensor bSG_sAux = thrblk_s2g.partition_S(sAux_epi);                                // (TMA,TMA_M,TMA_N,PIPE)
+    Tensor bSG_gAux = thrblk_s2g.partition_D(gAux_epi);                                // (TMA,TMA_M,TMA_N,EPI_M,EPI_N)
+
+    return ConsumerStoreCallbacks<decltype(tC_rAux), decltype(tiled_r2s), decltype(tRS_sAux), decltype(bSG_sAux), decltype(bSG_gAux)>(
+            cute::move(tC_rAux),
+            tiled_r2s,
+            cute::move(tRS_sAux),
+            cute::move(bSG_sAux),
+            cute::move(bSG_gAux),
+            params_ptr);
+  }
+};
+// Aux tensor store, Stages == 0 specialization: no smem staging or TMA; registers are copied to gmem with predication
+template <
+  class Element,
+  class EpilogueTile,   // Unused
+  FloatRoundStyle RoundStyle,
+  class LayoutOrStrideMNL,
+  class SmemLayoutAtom, // Unused
+  class CopyOpR2S,      // Unused
+  int Alignment, 
+  bool EnableNullptr
+>
+struct Sm90AuxStore<
+  0, EpilogueTile, Element, RoundStyle, LayoutOrStrideMNL, 
+  SmemLayoutAtom, CopyOpR2S, Alignment, EnableNullptr
+> {
+  using ElementAux = Element;
+  using StrideMNL = cutlass::gemm::TagToStrideC_t<LayoutOrStrideMNL>;
+
+  struct SharedStorage { };
+
+  struct Arguments {
+    Element* ptr_aux = nullptr; // destination tensor; nullptr makes end_loop() a no-op when EnableNullptr is set
+    StrideMNL dAux = {};        // (M,N,L) strides of the destination
+  };
+
+  using Params = Arguments;
+
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    return args;
+  }
+
+  template <class ProblemShape>
+  static bool
+  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
+    return true;
+  }
+
+  template <class ProblemShape>
+  static size_t
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
+    return 0;
+  }
+
+  template <class ProblemShape>
+  static cutlass::Status
+  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    return cutlass::Status::kSuccess;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Sm90AuxStore() { }
+
+  CUTLASS_HOST_DEVICE
+  Sm90AuxStore(Params const& params, SharedStorage const& shared_storage)
+    : params_ptr(&params) { }
+  
+  Params const* params_ptr;
+
+  CUTLASS_DEVICE bool
+  is_producer_load_needed() const {
+    return false;
+  }
+
+  CUTLASS_DEVICE bool
+  is_C_load_needed() const {
+    return false;
+  }
+
+  template <class... Args>
+  CUTLASS_DEVICE auto
+  get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
+    return EmptyProducerLoadCallbacks{};
+  }
+  // Callbacks holding the gmem partition, the register fragment and the coordinate tensor used for predication
+  template<
+    class GTensorR2G,
+    class RTensor,
+    class CTensorR2G,
+    class ProblemShapeMNL
+  >
+  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
+    CUTLASS_DEVICE
+    ConsumerStoreCallbacks(
+        GTensorR2G&& tC_gAux,
+        RTensor&& tC_rAux,
+        CTensorR2G&& tC_cAux,
+        ProblemShapeMNL problem_shape_mnl,
+        Params const* params_ptr)
+      : tC_gAux(cute::forward<GTensorR2G>(tC_gAux)),
+        tC_rAux(cute::forward<RTensor>(tC_rAux)),
+        tC_cAux(cute::forward<CTensorR2G>(tC_cAux)),
+        problem_shape_mnl(problem_shape_mnl),
+        params_ptr(params_ptr) {}
+    
+    GTensorR2G tC_gAux;
+    RTensor tC_rAux;
+    CTensorR2G tC_cAux;
+    ProblemShapeMNL problem_shape_mnl;
+    Params const* params_ptr;
+    // Converts frg_input to Element, stores it at index epi_v of tC_rAux, and returns frg_input unchanged
+    template <typename ElementAccumulator, typename ElementInput, int FragmentSize>
+    CUTLASS_DEVICE auto
+    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n,
+          Array<ElementInput, FragmentSize> const& frg_input) {
+      using ConvertInput = NumericArrayConverter<Element, ElementInput, FragmentSize, RoundStyle>;
+      ConvertInput convert_input{};
+
+      Tensor tC_rAux_frg = recast<Array<Element, FragmentSize>>(coalesce(tC_rAux));
+      tC_rAux_frg(epi_v) = convert_input(frg_input);
+
+      return frg_input;
+    }
+    // Writes the register fragment to the (epi_m, epi_n) gmem subtile, skipping coordinates outside the problem shape
+    CUTLASS_DEVICE void
+    end_loop(int epi_m, int epi_n) {
+      if constexpr (EnableNullptr) {
+        if (params_ptr->ptr_aux == nullptr) {
+          return;
+        }
+      }
+      // V = number of elements moved per vectorized access: capped by Alignment and by the common contiguous layout
+      constexpr auto MCL = decltype(max_common_layout(tC_gAux(_,_,_,_0{},_0{}), tC_rAux)){};
+      constexpr int V = cute::min(Alignment, size(MCL));
+
+      Tensor tC_cAux_mn = tC_cAux(_,_,_,epi_m,epi_n);
+      Tensor tC_cAux_vec = tensor<1>(zipped_divide(coalesce(tC_cAux_mn), MCL.compose(Int<V>{})));
+      
+      Tensor tC_gAux_vec = recast<Array<Element, V>>(coalesce(tC_gAux(_,_,_,epi_m,epi_n)));
+      Tensor tC_rAux_vec = recast<Array<Element, V>>(coalesce(tC_rAux));
+
+      auto pred_fn = [&] (auto const&... coords) {
+        return elem_less(tC_cAux_vec(coords...), problem_shape_mnl);
+      };
+
+      copy_if(pred_fn, tC_rAux_vec, tC_gAux_vec);
+    }
+  };
+
+  template <
+    bool ReferenceSrc,
+    class... Args
+  >
+  CUTLASS_DEVICE auto
+  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
+
+    auto [M, N, K, L] = args.problem_shape_mnkl;
+    auto [m, n, k, l] = args.tile_coord_mnkl;
+
+    auto problem_shape_mnl = make_shape(M,N,L);
+
+    // Gmem Tensor
+    Tensor mAux = make_tensor(
+      make_gmem_ptr(params_ptr->ptr_aux), make_shape(M,N,L), params_ptr->dAux
+    );
+    Tensor tC_gAux = sm90_partition_for_epilogue<ReferenceSrc>(
+                      mAux, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx);
+
+    // Register Tensor
+    Tensor tC_rAux = make_tensor<Element>(take<0,3>(shape(tC_gAux)));
+
+    // Predication support
+    Tensor coordAux = make_identity_tensor(shape(mAux));
+    Tensor tC_cAux = sm90_partition_for_epilogue<ReferenceSrc>(
+                      coordAux, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx);   
+
+    return ConsumerStoreCallbacks<decltype(tC_gAux), decltype(tC_rAux), decltype(tC_cAux), decltype(problem_shape_mnl)>(
+      cute::move(tC_gAux),
+      cute::move(tC_rAux),
+      cute::move(tC_cAux),
+      problem_shape_mnl,
+      params_ptr
+    );
+
+  }
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Reduction Store Operations
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Scalar reduction
+template <
+  template <class> class RegReduceFn,
+  template <class> class GmemReduceFn,
+  class ElementOutput,
+  class ElementCompute,
+  FloatRoundStyle RoundStyle,
+  class StrideMNL = Stride<_0,_0,_0>,
+  bool EnableNullptr = true // Noop on nullptr params
+>
+struct Sm90ScalarReduction {
+private:
+  static_assert(is_static_v<decltype(take<0,2>(StrideMNL{}))>); // batch stride can be dynamic or static
+  static_assert(take<0,2>(StrideMNL{}) == Stride<_0,_0>{});
+  static constexpr bool IsAtomic = is_atomic<GmemReduceFn<ElementCompute>>::value;
+  static_assert(IsAtomic, "non-atomic scalar reduction not supported yet");
+
+public:
+  struct SharedStorage { };
+
+  struct Arguments {
+    ElementOutput* ptr_scalar = nullptr;
+    ElementCompute reduction_identity = ElementCompute(0);
+    StrideMNL dScalar = {};
+  };
+
+  using Params = Arguments;
+
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    return args;
+  }
+
+  template <class ProblemShape>
+  static bool
+  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
+    return true;
+  }
+
+  template <class ProblemShape>
+  static size_t
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
+    return 0;
+  }
+
+  template <class ProblemShape>
+  static cutlass::Status
+  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+  #if !defined(CUTLASS_SKIP_REDUCTION_INIT)
+    if constexpr (IsAtomic) {
+      auto problem_shape_mnkl = append<4>(problem_shape, 1);
+      auto [M, N, K, L] = problem_shape_mnkl;
+      Layout mScalar_layout = make_layout(make_shape(M,N,L), args.dScalar);
+      if (args.ptr_scalar != nullptr) {
+        return fill_workspace(args.ptr_scalar, ElementOutput(args.reduction_identity), cosize(mScalar_layout), stream, cuda_adapter);
+      }
+    }
+  #endif
+
+    return cutlass::Status::kSuccess;
+  }
+
+  CUTLASS_DEVICE bool
+  is_producer_load_needed() const {
+    return false;
+  }
+
+  CUTLASS_DEVICE bool
+  is_C_load_needed() const {
+    return false;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Sm90ScalarReduction() { }
+
+  CUTLASS_HOST_DEVICE
+  Sm90ScalarReduction(Params const& params, SharedStorage const& shared_storage)
+      : params(params) { }
+
+  Params const params;
+
+  template <class... Args>
+  CUTLASS_DEVICE auto
+  get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
+    return EmptyProducerLoadCallbacks{};
+  }
+
+  template<class CTensor, class ThrResidue>
+  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { // folds visited values into `scalar`, then publishes it in end()
+    CUTLASS_DEVICE
+    ConsumerStoreCallbacks(
+        int l_coord,
+        CTensor tCcScalar,
+        ThrResidue residue_tCcScalar,
+        Params const& params)
+      : scalar(params.reduction_identity), // the running value starts at the reduction identity
+        l_coord(l_coord),
+        tCcScalar(tCcScalar),
+        residue_tCcScalar(residue_tCcScalar),
+        params(params) {}
+
+    ElementCompute scalar; // running reduction value owned by this callbacks object
+    int l_coord; // scaled by get<2>(params.dScalar) in end() to offset the output pointer
+    CTensor tCcScalar;                                                                 // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
+    ThrResidue residue_tCcScalar; // bound that coordinates are compared against with elem_less() in visit()
+    Params params; // held by value
+    // visit(): reduces one input fragment into `scalar` and returns the fragment unchanged; `frg_acc` is not read.
+    template <typename ElementAccumulator, typename ElementInput, int FragmentSize>
+    CUTLASS_DEVICE auto
+    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n,
+          Array<ElementInput, FragmentSize> const& frg_input) {
+      if constexpr (EnableNullptr) {
+        if (params.ptr_scalar == nullptr) {
+          return frg_input; // null output pointer: pass the input through without reducing
+        }
+      }
+      // Inputs are converted to ElementCompute before being combined with RegReduceFn.
+      using ConvertInput = NumericArrayConverter<ElementCompute, ElementInput, FragmentSize, RoundStyle>;
+      using ReduceInput = RegReduceFn<ElementCompute>;
+      ConvertInput convert_input{};
+      ReduceInput reduce_input{};
+
+      Array frg_I = convert_input(frg_input);
+      Tensor tCcScalar_mn = tCcScalar(_,_,_,epi_m,epi_n); // coordinates for this (epi_m, epi_n) subtile
+      // Only elements whose coordinate passes the elem_less(..., residue) check contribute.
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < FragmentSize; ++i) {
+        if (elem_less(tCcScalar_mn(epi_v * FragmentSize + i), residue_tCcScalar)) {
+          scalar = reduce_input(scalar, frg_I[i]);
+        }
+      }
+
+      return frg_input;
+    }
+    // end(): converts `scalar` to ElementOutput and applies GmemReduceFn at the selected output address.
+    CUTLASS_DEVICE void
+    end() {
+      if constexpr (EnableNullptr) {
+        if (params.ptr_scalar == nullptr) {
+          return; // nothing to publish when no output pointer was given
+        }
+      }
+
+      using ConvertI = NumericConverter<ElementOutput, ElementCompute, RoundStyle>;
+      using ReduceInput = GmemReduceFn<ElementOutput>;
+
+      ConvertI convert_I{};
+      ReduceInput reduce_input{};
+      // Output address = base pointer + l_coord * get<2>(dScalar).
+      ElementOutput* ptr_scalar = params.ptr_scalar + l_coord * get<2>(params.dScalar);
+      reduce_input(ptr_scalar, convert_I(scalar));
+    }
+
+  };
+
+  template <
+    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
+    class... Args
+  >
+  CUTLASS_DEVICE auto // builds the consumer-store callbacks from the epilogue arguments and this node's params
+  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
+    return ConsumerStoreCallbacks<decltype(args.tCcD), decltype(args.residue_tCcD)>( // ReferenceSrc is not used in this body
+      get<3>(args.tile_coord_mnkl), args.tCcD, args.residue_tCcD, params); // get<3>: the l entry of the (m,n,k,l) tile coordinate
+  }
+
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Row vector reduction
+template <
+  template <class> class RegReduceFn,
+  template <class> class ShuffleReduceFn,
+  template <class> class GmemReduceFn,
+  int Stages,
+  class CtaTileShapeMNK,
+  class ElementOutput,
+  class ElementCompute,
+  FloatRoundStyle RoundStyle,
+  class StrideMNL = Stride<_0,_1,_0>,
+  int Alignment = 128 / sizeof_bits_v<ElementOutput>,
+  bool EnableNullptr = true, // Noop on nullptr params
+  // If this is false, ptr_row is assumed to point to a compact n-major (ceil_div(M,CTA_M), round_nearest(N,CTA_N), L)
+  // tensor of ElementCompute. It is the user's responsibility to reduce this to a (N, L) tensor of ElementOutput
+  bool FinalReduction = true,
+  // False means skip OOB predication if OOB inputs are known to be the reduction identity
+  bool VisitCheckOOB = true,
+  // Indicate the parameter order when calling RegReduceFn
+  // Seq length equals the number of RegReduceFn parameters
+  // No.0 represents tCrRow; No.1 and subsequent numbers sequentially represent frg_inputs in `visit`
+  class RegReduceSeq = cute::seq<0, 1>
+>
+struct Sm90RowReduction {
+private:
+  static_assert(Stages == 0, "Smem usage not supported yet");
+  static_assert(Alignment * sizeof_bits_v<ElementOutput> % 128 == 0, "sub-16B alignment not supported yet");
+  static_assert(is_static_v<decltype(take<0,2>(StrideMNL{}))>); // batch stride can be dynamic or static
+  static_assert(take<0,2>(StrideMNL{}) == Stride<_0,_1>{});
+  static constexpr bool IsAtomic = is_atomic<GmemReduceFn<ElementCompute>>::value;
+  static_assert(not (IsAtomic && not FinalReduction), "atomic reduction must be final");
+
+public:
+  struct SharedStorage { }; // empty: this node declares no shared-storage members (Stages must be 0, see static_assert above)
+
+  struct Arguments { // user-facing arguments; turned into Params by to_underlying_arguments()
+    void* ptr_row = nullptr; // ElementOutput* if FinalReduction, else ElementCompute*
+    ElementCompute reduction_identity = 0; // value the accumulators are filled with before reducing
+    StrideMNL dRow = {}; // stride of the (M,N,L) view built over ptr_row
+  };
+
+  struct Params { // kernel-side parameters produced by to_underlying_arguments()
+    void* ptr_row = nullptr; // copied from Arguments::ptr_row
+    ElementCompute reduction_identity = 0; // copied from Arguments::reduction_identity
+    StrideMNL dRow = {}; // copied from Arguments::dRow
+    ElementCompute* reduction_buffer = nullptr; // nullptr (atomic), workspace (final reduction) or ptr_row (non-final)
+    int* tile_counters = nullptr; // set only for non-atomic final reduction; placed after the buffer in the workspace
+  };
+
+  template <class ProblemShape>
+  static constexpr Params // maps Arguments (+ workspace pointer) to Params; never dereferences the workspace
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    ElementCompute* reduction_buffer = nullptr; // initialized: C++17 forbids uninitialized locals in constexpr functions
+    int* tile_counters = nullptr;
+    if constexpr (IsAtomic) {
+      // Atomic mode reduces straight into ptr_row: no buffer and no counters (both stay nullptr).
+    }
+    else if constexpr (FinalReduction) {
+      auto problem_shape_mnkl = append<4>(problem_shape, 1);
+      auto [M, N, K, L] = problem_shape_mnkl;
+      // Workspace layout: [reduction buffer | padding to MinWorkspaceAlignment | tile counters]
+      auto [tile_M, tile_N, tile_K] = CtaTileShapeMNK{};
+      size_t tile_counters_offset = product(ceil_div(make_shape(size<>(M), size<>(N), L), make_shape(tile_M, tile_N))) * tile_N * sizeof(ElementCompute);
+      tile_counters_offset = round_nearest(tile_counters_offset, MinWorkspaceAlignment);
+
+      reduction_buffer = reinterpret_cast<ElementCompute*>(workspace);
+      tile_counters = reinterpret_cast<int*>(reinterpret_cast<uint8_t*>(workspace) + tile_counters_offset);
+    }
+    else {
+      reduction_buffer = reinterpret_cast<ElementCompute*>(args.ptr_row); // non-final: partial results are written to the user buffer
+    }
+
+    return {
+      args.ptr_row,
+      args.reduction_identity,
+      args.dRow,
+      reduction_buffer,
+      tile_counters
+    };
+  }
+
+  template <class ProblemShape>
+  static bool // no argument validation is performed by this node
+  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
+    return true; // unconditionally accepted; neither parameter is read
+  }
+
+  template <class ProblemShape>
+  static size_t // bytes of workspace needed: partial-result buffer, alignment padding, then tile counters
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
+    if constexpr (IsAtomic || not FinalReduction) {
+      return 0; // these modes use no workspace
+    }
+
+    auto [M, N, K, L] = append<4>(problem_shape, 1);
+    auto [tile_M, tile_N, tile_K] = CtaTileShapeMNK{};
+    // Reduction buffer: tile_N ElementCompute values for every (M tile, N tile, batch) triple
+    auto num_tiles = product(ceil_div(make_shape(size<>(M),size<>(N),L), make_shape(tile_M, tile_N)));
+    size_t buffer_bytes = num_tiles * tile_N * sizeof(ElementCompute);
+    // Counters begin at the next MinWorkspaceAlignment boundary: one int per N tile
+    size_t counters_offset = round_nearest(buffer_bytes, MinWorkspaceAlignment);
+    size_t counters_bytes = cute::ceil_div(size<>(N), tile_N) * sizeof(int);
+    size_t workspace_size = counters_offset + counters_bytes;
+    return workspace_size;
+  }
+
+  template <class ProblemShape>
+  static cutlass::Status // atomic: fill the output with the identity; final reduction: zero the tile counters; else no-op
+  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    auto problem_shape_mnkl = append<4>(problem_shape, 1); // kept outside the #if: the counter branch below needs M/N/L as well
+    auto [M, N, K, L] = problem_shape_mnkl;
+#if !defined(CUTLASS_SKIP_REDUCTION_INIT)
+    if constexpr (IsAtomic) {
+      Layout mRow_layout = make_layout(make_shape(size<>(M),size<>(N),size<>(L)), args.dRow);
+      if (args.ptr_row != nullptr) {
+        return fill_workspace(args.ptr_row, ElementOutput(args.reduction_identity), cosize(mRow_layout), stream, cuda_adapter);
+      }
+      return Status::kSuccess;
+    }
+    else
+#endif
+    if constexpr (not IsAtomic && FinalReduction) { // atomic mode has a zero-size workspace, so it must never reach the counter reset
+      auto [tile_M, tile_N, tile_K] = CtaTileShapeMNK{};
+      size_t tile_counters_offset = product(ceil_div(make_shape(size<>(M),size<>(N),L), make_shape(tile_M, tile_N))) * tile_N * sizeof(ElementCompute);
+      tile_counters_offset = round_nearest(tile_counters_offset, MinWorkspaceAlignment); // same layout as to_underlying_arguments()
+
+      int* tile_counters = reinterpret_cast<int*>(reinterpret_cast<uint8_t*>(workspace) + tile_counters_offset);
+      size_t tile_counters_size = cute::ceil_div(size<>(N), tile_N) * sizeof(int); // one counter per N tile
+      return zero_workspace(tile_counters, tile_counters_size, stream, cuda_adapter);
+    }
+    else {
+      return Status::kSuccess;
+    }
+  }
+
+  CUTLASS_DEVICE bool // query used by the epilogue; this node never asks for a producer load
+  is_producer_load_needed() const {
+    return false;
+  }
+
+  CUTLASS_DEVICE bool // query used by the epilogue; this node never asks for the C operand
+  is_C_load_needed() const {
+    return false;
+  }
+
+  CUTLASS_HOST_DEVICE // default constructor: `params` keeps its in-class default member values
+  Sm90RowReduction() { }
+
+  CUTLASS_HOST_DEVICE // copies `params`; `shared_storage` is accepted but not used
+  Sm90RowReduction(Params const& params, SharedStorage const& shared_storage)
+      : params(params) { }
+
+  Params params;
+
+  template <class... Args>
+  CUTLASS_DEVICE auto // producer-load hook of this node; `args` is not read
+  get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
+    return EmptyProducerLoadCallbacks{}; // always the empty callback set
+  }
+
+  template<class ArgsTuple>
+  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
+    CUTLASS_DEVICE // takes ownership of the argument tuple; `params` is bound by reference (the member is `Params const&`)
+    ConsumerStoreCallbacks(ArgsTuple&& args_tuple, Params const& params)
+      : args_tuple(cute::forward<ArgsTuple>(args_tuple)),
+        params(params) {}
+
+    ArgsTuple args_tuple;
+    Params const& params;
+    bool do_final_reduction = false;
+
+    template <typename ElementAccumulator, typename... ElementInputs, int FragmentSize>
+    CUTLASS_DEVICE auto // folds the input fragment(s) into the tCrRow accumulators; returns the first input fragment unchanged
+    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n,
+          Array<ElementInputs, FragmentSize> const&... frg_inputs) {
+      if constexpr (EnableNullptr) {
+        if (params.ptr_row == nullptr) {
+          return cute::get<0>(cute::make_tuple(frg_inputs...)); // null output pointer: pass through without reducing
+        }
+      }
+      // Unpack the state assembled in get_consumer_store_callbacks(); only a few entries are used here. `frg_acc` is not read.
+      auto& [ref_src, tCrRow, tCcRow, gRow_l, cRow, gBuf_ml, sBuf_layout,
+        lane_layout_MN, lane_mn, warp_layout_MN, warp_mn,
+        tile_coord_mnkl, residue_cRow, residue_tCcRow, epi_tile, tiled_copy, thread_idx] = args_tuple;
+      Tensor tCrRow_mn = tCrRow(_,_,_,epi_m,epi_n); // accumulators of this (epi_m, epi_n) subtile
+      Tensor tCcRow_mn = tCcRow(_,_,_,epi_m,epi_n); // matching coordinates
+      // Predicated path: element by element, skipping coordinates that fail elem_less(..., residue_tCcRow).
+      if constexpr (VisitCheckOOB) {
+        using ReduceInput = RegReduceFn<ElementCompute>;
+        ReduceInput reduce_input{};
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < FragmentSize; ++i) {
+          if (elem_less(tCcRow_mn(epi_v * FragmentSize + i), residue_tCcRow)) {
+            ElementCompute& tCrRow_vmn = tCrRow_mn(epi_v * FragmentSize + i);
+            tCrRow_vmn = transform_apply(cute::make_tuple(frg_inputs...),
+                [&] (auto&& frg_input) {
+                  return ElementCompute(frg_input[i]); // element i of each input, converted to ElementCompute
+                },
+                [&] (auto&&... cvt_frg_inputs) {
+                  auto frg_compute_tuple = cute::make_tuple(tCrRow_vmn, cvt_frg_inputs...); // slot 0 = accumulator, 1.. = inputs
+                  return cute::detail::apply(frg_compute_tuple, reduce_input, RegReduceSeq{}); // RegReduceSeq picks the argument order
+                });
+          }
+        }
+      }
+      else { // unpredicated path: no coordinate check; works on sub-arrays of RegFragSize elements
+        constexpr int RegFragSize = cute::max(1, static_cast<int>(sizeof(uint32_t) / sizeof(ElementCompute)));
+        using ReduceInput = RegReduceFn<Array<ElementCompute, RegFragSize>>;
+        ReduceInput reduce_input{};
+        Tensor tCrRow_mn_frg = recast<Array<ElementCompute, RegFragSize>>(tCrRow_mn);
+
+        constexpr int RegFragArraySize = FragmentSize / RegFragSize;
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < RegFragArraySize; ++i) {
+          Array<ElementCompute, RegFragSize>& tCrRow_vmn_frg = tCrRow_mn_frg(epi_v * RegFragArraySize + i);
+          tCrRow_vmn_frg = transform_apply(cute::make_tuple(frg_inputs...),
+              [&] (auto&& frg_input) {
+                using ElementInput = typename cute::remove_cvref_t<decltype(frg_input)>::Element;
+                using ConvertInput = NumericArrayConverter<ElementCompute, ElementInput, RegFragSize, RoundStyle>;
+                using RegFragArr = Array<Array<ElementCompute, RegFragSize>, RegFragArraySize>;
+                ConvertInput convert_input{};
+                return convert_input(reinterpret_cast<RegFragArr&>(frg_input)[i]); // NOTE(review): the view type uses ElementCompute, not ElementInput - confirm this is intended when the two differ
+              },
+              [&] (auto&&... cvt_frg_inputs) {
+                auto frg_compute_tuple = cute::make_tuple(tCrRow_vmn_frg, cvt_frg_inputs...); // slot 0 = accumulator, 1.. = inputs
+                return cute::detail::apply(frg_compute_tuple, reduce_input, RegReduceSeq{}); // RegReduceSeq picks the argument order
+              });
+        }
+      }
+      return cute::get<0>(cute::make_tuple(frg_inputs...));
+    }
+
+    template <class STensor, class SyncFn, class VTensor>
+    CUTLASS_DEVICE void // cross-thread stage: combines the accumulators built by visit(); acts only on the last iteration
+    reduce(STensor&& smem_buffer, SyncFn const& sync_fn, int epi_m, int epi_n, bool is_last_iteration, VTensor visit_results) {
+      if (not is_last_iteration) {
+        return;
+      }
+      // sync_fn is a caller-supplied callable invoked between the phases below (presumably a CTA-wide barrier - confirm at the call site).
+      auto& [ref_src, tCrRow, tCcRow, gRow_l, cRow, gBuf_ml, sBuf_layout,
+        lane_layout_MN, lane_mn, warp_layout_MN, warp_mn,
+        tile_coord_mnkl, residue_cRow, residue_tCcRow, epi_tile, tiled_copy, thread_idx] = args_tuple;
+      auto [m, n, k, l] = tile_coord_mnkl;
+      constexpr bool ReferenceSrc = decltype(ref_src)::value;
+      if constexpr (EnableNullptr) {
+        if (params.ptr_row == nullptr) {
+          return;
+        }
+      }
+
+      // fully OOB CTA in partially OOB cluster
+      if (not elem_less(cRow(_0{},_0{}), residue_cRow)) {
+        return;
+      }
+
+      int lane_m = get<0>(lane_mn);
+      [[maybe_unused]] bool is_reduced_lane = lane_m == 0; // in the non-swap path only lanes with M coordinate 0 write results out
+
+      //
+      // 1. Warp shuffle reduction
+      //
+      using FragmentShuffle = Array<ElementCompute, sizeof(uint64_t) / sizeof(ElementCompute)>; // 8 bytes of accumulators per shuffle
+      Tensor tCrRow_frg = recast<FragmentShuffle>(filter(tCrRow));
+      using ReduceShuffle = ShuffleReduceFn<FragmentShuffle>;
+      ReduceShuffle reduce_shuffle{};
+
+      auto FrgSizePerLaneM = size(tCrRow_frg) / size<0>(lane_layout_MN);
+      constexpr bool SwapShuffle = FrgSizePerLaneM > 0; // swap-shuffle needs at least one fragment per lane row
+
+      //
+      // Swap Shuffle
+      //
+      // The usual way to reduce across threads:
+      // use shuffle to let *** the first half of threads *** have *** whole data *** from the second half of threads.
+      // After each step of reduction, half of the threads do no work in the following steps.
+      // That is, as the reduction progresses, the efficiency of shuffle & reduction instructions gradually changes from 1/2, 1/4 to 1/32 (the worst case).
+      //
+      // To overcome this shortcoming, for an NxN matrix to be reduced among N threads into a 1xN vector,
+      // we use swap & shuffle aiming to let *** each half of threads *** have *** a half of data *** from the other half of threads.
+      // After reduction, each half of threads should deal with a (N/2)x(N/2) sub-matrix independently in the following step.
+      // We can recursively do this until the problem size is 1.
+      //
+      if constexpr (SwapShuffle) { // for an NxN matrix to be reduced among N threads into a 1xN vector
+        Tensor tCrRow_frg_ = logical_divide(tCrRow_frg, FrgSizePerLaneM);                       // (FrgSizePerLaneM, M)
+        CUTLASS_PRAGMA_UNROLL
+        for (int m = size<1>(tCrRow_frg_) / 2; m > 0; m /= 2) {
+          CUTLASS_PRAGMA_UNROLL
+          for (int r = 0; r < m; ++r) {
+            auto frg_A = tCrRow_frg_(_,r);
+            auto frg_B = tCrRow_frg_(_,r + m);
+            CUTLASS_PRAGMA_UNROLL
+            for (int v = 0; v < size(frg_A); ++v) {
+              // Step1: swap
+              if (not (lane_m & m)) { // the first half of threads swap fragments from the first half of data to the second
+                swap(frg_A(v), frg_B(v));
+              }
+
+              // Step2: shuffle
+              uint64_t frg_shfl = reinterpret_cast<uint64_t&>(frg_A(v));
+              // each half of threads get a half of data from the other half of threads
+              frg_shfl = __shfl_xor_sync(0xFFFFFFFF, frg_shfl, lane_layout_MN(m, _0{}));
+
+              // Step3: reduction
+              frg_A(v) = reduce_shuffle(frg_B(v), reinterpret_cast<FragmentShuffle&>(frg_shfl));
+            }
+          }
+        }
+      }
+      else { // plain shuffle-down reduction: the shuffle offset comes from lane_layout_MN(reduction_rows, 0) and halves each step
+        CUTLASS_PRAGMA_UNROLL
+        for (int reduction_rows = size<0>(lane_layout_MN) / 2; reduction_rows > 0; reduction_rows /= 2) {
+          CUTLASS_PRAGMA_UNROLL
+          for (int frg_idx = 0; frg_idx < size(tCrRow_frg); ++frg_idx) {
+            uint64_t frg_shfl = reinterpret_cast<uint64_t&>(tCrRow_frg(frg_idx));
+            frg_shfl = __shfl_down_sync(0xFFFFFFFF, frg_shfl, lane_layout_MN(reduction_rows, _0{}));
+            tCrRow_frg(frg_idx) = reduce_shuffle(tCrRow_frg(frg_idx), reinterpret_cast<FragmentShuffle&>(frg_shfl));
+          }
+        }
+      }
+
+      //
+      // 2. Atomic reduction
+      //
+      if constexpr (IsAtomic) {
+        // Filter so we don't issue redundant copies over stride-0 modes
+        Tensor tCrRow_flt = filter_zeros(tCrRow);
+        Tensor tCcRow_flt = make_tensor(tCcRow.data(), make_layout(tCrRow_flt.shape(), tCcRow.stride()));
+        auto FltFrgSizePerLaneM = size(tCrRow_flt) / size<0>(lane_layout_MN);
+
+        Tensor tCgRow = sm90_partition_for_epilogue<ReferenceSrc>(gRow_l(_,_,l), epi_tile, tiled_copy, thread_idx);
+        Tensor tCgRow_flt = filter_zeros(tCgRow);
+        // NOTE: atomic reduction is performed in the output type
+        using ConvertOutput = NumericConverter<ElementOutput, ElementCompute, RoundStyle>;
+        using ReduceOutput = GmemReduceFn<ElementOutput>;
+        ConvertOutput convert_output{};
+        ReduceOutput reduce_output{};
+
+        if constexpr (SwapShuffle) { // each lane row publishes its own slice: source index i, destination index lane_m * FltFrgSizePerLaneM + i
+          CUTLASS_PRAGMA_UNROLL
+          for (int i = 0; i < FltFrgSizePerLaneM; ++i) {
+            int idx = lane_m * FltFrgSizePerLaneM + i;
+            // Only care about OOB for N mode
+            if (get<1>(tCcRow_flt(idx)) < get<1>(residue_tCcRow)) {
+              reduce_output(&tCgRow_flt(idx), convert_output(tCrRow_flt(i)));
+            }
+          }
+        }
+        else {
+          if (is_reduced_lane) {
+            CUTLASS_PRAGMA_UNROLL
+            for (int i = 0; i < size(tCrRow_flt); ++i) {
+              if (elem_less(tCcRow_flt(i), residue_tCcRow)) {
+                reduce_output(&tCgRow_flt(i), convert_output(tCrRow_flt(i)));
+              }
+            }
+          }
+        }
+        sync_fn();
+      }
+
+      //
+      // 2. One warp in M, skip threadblock smem reduction
+      //
+      else if constexpr (decltype(size<0>(warp_layout_MN))::value <= 1) {
+        // Dump warp reduction to gmem workspace
+        using ElementGmem = cute::conditional_t<FinalReduction, ElementCompute volatile, ElementCompute>; // volatile when end() re-reads the buffer
+        Tensor tCgBuf = sm90_partition_for_epilogue<ReferenceSrc>(gBuf_ml(_,_,m,l), epi_tile, tiled_copy, thread_idx);
+
+        if constexpr (SwapShuffle) {
+          Tensor tCrRow_flt = filter(tCrRow);
+          Tensor tCgBuf_flt = recast<ElementGmem>(filter(tCgBuf));
+          auto FltFrgSizePerLaneM = size(tCrRow_flt) / size<0>(lane_layout_MN);
+          Tensor tCgBuf_flt_ = logical_divide(tCgBuf_flt, FltFrgSizePerLaneM);               // (FltFrgSizePerLaneM, M)
+          Tensor tCrRow_flt_ = logical_divide(tCrRow_flt, FltFrgSizePerLaneM);               // (FltFrgSizePerLaneM, M)
+          copy_aligned(tCrRow_flt_(_,_0{}), tCgBuf_flt_(_,lane_m));
+        }
+        else {
+          if (is_reduced_lane) {
+            // Filter so we don't issue redundant copies over stride-0 modes
+            // (only works if 0-strides are in same location, which is by construction)
+            copy_aligned(filter(tCrRow), recast<ElementGmem>(filter(tCgBuf)));
+          }
+        }
+        sync_fn();
+      }
+
+      //
+      // 2. Multiple warps in M, do threadblock smem reduction
+      //
+      else {
+        Tensor sBuf = make_tensor(make_smem_ptr<ElementCompute>(raw_pointer_cast(smem_buffer.data())), sBuf_layout);
+        static_assert(decltype(cosize(sBuf.layout()))::value * sizeof(ElementCompute) <=
+                      decltype(cosize(smem_buffer.layout()))::value * sizeof(typename remove_cvref_t<STensor>::value_type),
+                      "smem reduction buffer not large enough, use a larger epilogue tile");
+        sync_fn();
+
+        // Dump warp reduction to smem workspace
+        Tensor tCsBuf = sm90_partition_for_epilogue<ReferenceSrc>(sBuf(_,_,get<0>(warp_mn)), epi_tile, tiled_copy, thread_idx);
+
+        if constexpr (SwapShuffle) {
+          Tensor tCrRow_flt = filter(tCrRow);
+          Tensor tCsBuf_flt = filter(tCsBuf);
+          auto FltFrgSizePerLaneM = size(tCrRow_flt) / size<0>(lane_layout_MN);
+          Tensor tCsBuf_flt_ = logical_divide(tCsBuf_flt, FltFrgSizePerLaneM);               // (FltFrgSizePerLaneM, M)
+          Tensor tCrRow_flt_ = logical_divide(tCrRow_flt, FltFrgSizePerLaneM);               // (FltFrgSizePerLaneM, M)
+          copy_aligned(tCrRow_flt_(_,_0{}), tCsBuf_flt_(_,lane_m));
+        }
+        else {
+          if (is_reduced_lane) {
+            // Filter so we don't issue redundant copies over stride-0 modes
+            // (only works if 0-strides are in same location, which is by construction)
+            copy_aligned(filter(tCrRow), filter(tCsBuf));
+          }
+        }
+        sync_fn();
+        // Smem is processed in fragments of at least 4 bytes (sizeof(uint32_t)), or one element if ElementCompute is larger.
+        constexpr int SmemFragSize = cute::max(size_t{1}, sizeof(uint32_t) / sizeof(ElementCompute));
+        using FragmentSmem = Array<ElementCompute, SmemFragSize>;
+        using VectorSmem = uint_bit_t<sizeof_bits_v<FragmentSmem>>;
+        using ReduceSmem = GmemReduceFn<FragmentSmem>;
+        ReduceSmem reduce_smem{};
+
+        Tensor sBuf_frg = recast<FragmentSmem>(filter_zeros(sBuf));
+        Tensor sBuf_vec = recast<VectorSmem>(filter_zeros(sBuf));
+        constexpr int FragsPerRow = decltype(size<1>(sBuf_frg))::value;
+
+        constexpr int RowNum = decltype(size<0>(warp_layout_MN))::value; // one smem row per warp along M
+        using FragmentSmemArray = Array<FragmentSmem, RowNum>;
+
+        // Do the threadblock smem reduction
+        using VectorGmem = cute::conditional_t<FinalReduction, VectorSmem volatile, VectorSmem>;
+        Tensor gBuf_vec = recast<VectorGmem>(filter(gBuf_ml(_,_,m,l)));
+        CUTLASS_PRAGMA_UNROLL
+        for (int frg_idx = thread_idx; frg_idx < FragsPerRow; frg_idx += size(tiled_copy)) {
+          FragmentSmemArray frg_smem;
+          // Gather this fragment column from every warp row ...
+          CUTLASS_PRAGMA_UNROLL
+          for (int reduction_rows = 0; reduction_rows < RowNum; ++reduction_rows) {
+            int FragsCurrRows = reduction_rows * FragsPerRow;
+            frg_smem[reduction_rows] = sBuf_frg(FragsCurrRows + frg_idx);
+          }
+          // ... then combine the rows pairwise, halving the row count each step, into frg_smem[0].
+          CUTLASS_PRAGMA_UNROLL
+          for (int reduction_rows = RowNum / 2; reduction_rows > 0; reduction_rows /= 2) {
+            CUTLASS_PRAGMA_UNROLL
+            for (int row_idx = 0; row_idx < reduction_rows; ++row_idx) {
+              frg_smem[row_idx] = reduce_smem(frg_smem[row_idx], frg_smem[row_idx + reduction_rows]);
+            }
+          }
+          gBuf_vec(frg_idx) = reinterpret_cast<VectorSmem&>(frg_smem[0]);
+        }
+        sync_fn();
+      }
+
+      //
+      // 3. Increment atomic counters to signal final gmem reduction
+      //
+      if constexpr (not IsAtomic && FinalReduction) {
+        // Ensure gmem writes are visible to other threads before incrementing counter
+        __threadfence();
+        sync_fn();
+        // Collective thread 0 increments atomic tile counter and copies value to smem
+        int* prev_tile_count = reinterpret_cast<int*>(raw_pointer_cast(smem_buffer.data()));
+        if (thread_idx == 0) {
+          *prev_tile_count = atomicAdd(&params.tile_counters[n], 1);
+        }
+        sync_fn();
+        // Broadcast tile count to other threads in CTA and determine final reduction status
+        do_final_reduction = *prev_tile_count == size<2>(gBuf_ml) * size<3>(gBuf_ml) - 1; // true only for the last tile to bump counter n
+        sync_fn();
+      }
+    }
+
+    CUTLASS_DEVICE void // final pass: when reduce() set do_final_reduction, fold the buffered partials into gRow_l
+    end() {
+      //
+      // 4. Do final gmem reduction if necessary
+      //
+      if constexpr (not IsAtomic && FinalReduction) {
+        if (not do_final_reduction) {
+          return;
+        }
+
+        auto& [ref_src, tCrRow, tCcRow, gRow_l, cRow, gBuf_ml, sBuf_layout,
+          lane_layout_MN, lane_mn, warp_layout_MN, warp_mn,
+          tile_coord_mnkl, residue_cRow, residue_tCcRow, epi_tile, tiled_copy, thread_idx] = args_tuple;
+        // Partials are combined in ElementCompute and converted to ElementOutput only when stored.
+        using ReduceOutput = GmemReduceFn<ElementCompute>;
+        using ConvertOutput = NumericConverter<ElementOutput, ElementCompute, RoundStyle>;
+        ReduceOutput reduce_output{};
+        ConvertOutput convert_output{};
+
+        // Reduction over batches
+        if (size<2>(stride(gRow_l)) == 0) { // batch stride 0: every batch maps to the same output, so fold over (m tile, l) together
+          CUTLASS_PRAGMA_NO_UNROLL
+          for (int n = thread_idx; n < size<1>(gBuf_ml); n += size(tiled_copy)) { // columns are strided across the threads
+            Tensor tRgBuf_ml = gBuf_ml(_0{},n,_,_);
+            ElementCompute output = tRgBuf_ml(_0{});
+            CUTLASS_PRAGMA_NO_UNROLL
+            for (int ml = 1; ml < size(tRgBuf_ml); ++ml) {
+              output = reduce_output(output, tRgBuf_ml(ml));
+            }
+            if (elem_less(cRow(_0{},n), residue_cRow)) { // store only columns that pass the bounds check
+              gRow_l(_0{},n,_0{}) = convert_output(output);
+            }
+          }
+        }
+        // No reduction over batches
+        else {
+          CUTLASS_PRAGMA_NO_UNROLL
+          for (int n = thread_idx; n < size<1>(gBuf_ml); n += size(tiled_copy)) {
+            bool do_store = elem_less(cRow(_0{},n), residue_cRow);
+            CUTLASS_PRAGMA_NO_UNROLL
+            for (int l = 0; l < size<3>(gBuf_ml); ++l) { // each batch gets its own output value
+              Tensor tRgBuf_m = gBuf_ml(_0{},n,_,l);
+              ElementCompute output = tRgBuf_m(_0{});
+              CUTLASS_PRAGMA_NO_UNROLL
+              for (int m = 1; m < size(tRgBuf_m); ++m) {
+                output = reduce_output(output, tRgBuf_m(m));
+              }
+              if (do_store) {
+                gRow_l(_0{},n,l) = convert_output(output);
+              }
+            }
+          }
+        }
+
+      }
+    }
+  };
+
+  template <
+    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
+    class... Args
+  >
+  CUTLASS_DEVICE auto // derives lane/warp layouts, partitions output and workspace tensors, and packs them into the callbacks
+  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
+    Layout ref_layout_MN = [&] () {
+      if constexpr (ReferenceSrc) { return get<0>(args.tiled_copy.get_layoutS_MN()); }
+      else                        { return get<0>(args.tiled_copy.get_layoutD_MN()); }
+    }();                                                                                         // tile_mn -> tv_idx
+
+    // Get the MN layout + coord of lanes to determine shuffle reduction iterations
+    using _W = Int<decltype(args.tiled_copy)::TiledNumThr::value / NumThreadsPerWarp>;
+    Layout tv2lane = Layout<Shape<Int<NumThreadsPerWarp>,_W,_1>,Stride<_1,_0,_0>>{};            //   tv_idx -> lane_idx
+    Layout ref2lane = composition(tv2lane, ref_layout_MN);                                      //  tile_mn -> lane_idx
+    Layout lane_layout_MN = make_layout(filter(get<0>(ref2lane)), filter(get<1>(ref2lane)));    //  lane_mn -> lane_idx
+    Layout inv_lane_layout_MN = right_inverse(lane_layout_MN);                                  // lane_idx -> lane_mn
+    int lane_idx = canonical_lane_idx();
+    auto lane_mn = idx2crd(inv_lane_layout_MN(lane_idx), shape(lane_layout_MN));
+
+    // Get the MN layout + coord of warps to determine smem reduction iterations
+    Layout tv2warp = Layout<Shape<Int<NumThreadsPerWarp>,_W,_1>,Stride<_0,_1,_0>>{};            //   tv_idx -> warp_idx
+    Layout ref2warp = composition(tv2warp, ref_layout_MN);                                      //  tile_mn -> warp_idx
+    Layout warp_layout_MN = make_layout(filter(get<0>(ref2warp)), filter(get<1>(ref2warp)));    //  warp_mn -> warp_idx
+    Layout inv_warp_layout_MN = right_inverse(warp_layout_MN);                                  // warp_idx -> warp_mn
+
+    int warp_idx = args.thread_idx / NumThreadsPerWarp;
+    auto warp_mn = idx2crd(inv_warp_layout_MN(warp_idx), shape(warp_layout_MN));
+
+    // Partition output gmem and register tensors
+    auto [tile_M, tile_N, tile_K] = args.tile_shape_mnk;
+    auto [M, N, K, L] = args.problem_shape_mnkl;
+    auto [m, n, k, l] = args.tile_coord_mnkl;
+    // NOTE(review): mRow is typed as ElementOutput even when FinalReduction is false (ptr_row then holds ElementCompute) - confirm it is only dereferenced in final/atomic mode.
+    Tensor mRow = make_tensor(make_gmem_ptr<ElementOutput>(params.ptr_row), make_shape(M,N,L), params.dRow); // (M,N,L)
+    Tensor gRow_l = local_tile(mRow, take<0,2>(args.tile_shape_mnk), make_coord(m,n,_));             // (CTA_M,CTA_N,L)
+    Tensor tCgRow = sm90_partition_for_epilogue<ReferenceSrc>(                         // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
+      gRow_l(_,_,l), args.epi_tile, args.tiled_copy, args.thread_idx);
+    Tensor tCrRow = make_tensor_like<ElementCompute>(tCgRow);                          // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
+    // Accumulators start at the reduction identity.
+    fill(tCrRow, params.reduction_identity);
+
+    // Partition gmem+smem reduction buffer tensors
+    Layout gBuf_layout = make_layout(take<0,2>(args.tile_shape_mnk), make_stride(_0{}, _1{}));
+    auto block_shape = ceil_div(make_shape(M,N,L), shape(gBuf_layout)); // (M_CNT, N_CNT, L_CNT)
+
+    // Let the M_CNT (the num of partial reduction results) become the outer mode
+    Layout block_layout = make_layout(block_shape, make_stride(get<1>(block_shape), _1{}, get<0>(block_shape) * get<1>(block_shape)));
+    Layout mBuf_layout = blocked_product(gBuf_layout, block_layout);
+    Tensor mBuf = make_tensor(make_gmem_ptr(params.reduction_buffer), mBuf_layout);                // (ceil_M,ceil_N,L)
+    Tensor gBuf_ml = local_tile(mBuf, take<0,2>(args.tile_shape_mnk), make_coord(_,n,_));     // (CTA_M,CTA_N,REST_M,L)
+    Layout sBuf_layout = blocked_product(gBuf_layout,                                          // (CTA_M,CTA_N,WARPS_M)
+      make_layout(make_shape(_1{},_1{},size<0>(warp_layout_MN))));
+    // The tuple order must match the structured bindings in ConsumerStoreCallbacks::visit/reduce/end.
+    auto args_tuple = make_tuple(
+        bool_constant<ReferenceSrc>{}, cute::move(tCrRow), args.tCcD, gRow_l, args.cD, gBuf_ml, sBuf_layout,
+        lane_layout_MN, lane_mn, warp_layout_MN, warp_mn,
+        args.tile_coord_mnkl, args.residue_cD, args.residue_tCcD, args.epi_tile, args.tiled_copy, args.thread_idx);
+    return ConsumerStoreCallbacks<decltype(args_tuple)>(cute::move(args_tuple), params);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Col vector reduction
+template <
+  template <class> class RegReduceFn,
+  template <class> class ShuffleReduceFn,
+  template <class> class GmemReduceFn,
+  int Stages,
+  class CtaTileShapeMNK,
+  class ElementOutput,
+  class ElementCompute,
+  FloatRoundStyle RoundStyle,
+  class StrideMNL = Stride<_1,_0,_0>,
+  int Alignment = 128 / sizeof_bits_v<ElementOutput>,
+  bool EnableNullptr = true, // Noop on nullptr params
+  // If this is false, ptr_col is assumed to point to a compact m-major (round_nearest(M,CTA_M), ceil_div(N,CTA_N), L)
+  // tensor of ElementCompute. It is the user's responsibility to reduce this to a (M, L) tensor of ElementOutput
+  bool FinalReduction = true,
+  // False means skip OOB predication if OOB inputs are known to be the reduction identity
+  bool VisitCheckOOB = true
+>
+struct Sm90ColReduction {
+private:
+  static_assert(Stages == 0, "Smem usage not supported yet");
+  static_assert(Alignment * sizeof_bits_v<ElementOutput> % 128 == 0, "sub-16B alignment not supported yet");
+  static_assert(is_static_v<decltype(take<0,2>(StrideMNL{}))>); // batch stride can be dynamic or static
+  static_assert(take<0,2>(StrideMNL{}) == Stride<_1,_0>{});
+  static constexpr bool IsAtomic = is_atomic<GmemReduceFn<ElementCompute>>::value;
+  static_assert(not (IsAtomic && not FinalReduction), "atomic reduction must be final");
+
+public:
+  struct SharedStorage { }; // empty: this node declares no shared-storage members (Stages must be 0, see static_assert above)
+
+  struct Arguments { // user-facing arguments; turned into Params by to_underlying_arguments()
+    void* ptr_col = nullptr; // ElementOutput* if FinalReduction, else ElementCompute*
+    ElementCompute reduction_identity = 0; // in atomic mode initialize_workspace() fills the output with this value
+    StrideMNL dCol = {}; // stride paired with the (M,N,L) shape in initialize_workspace()
+  };
+
+  struct Params { // kernel-side parameters produced by to_underlying_arguments()
+    void* ptr_col = nullptr; // copied from Arguments::ptr_col
+    ElementCompute reduction_identity = 0; // copied from Arguments::reduction_identity
+    StrideMNL dCol = {}; // copied from Arguments::dCol
+    ElementCompute* reduction_buffer = nullptr; // nullptr (atomic), workspace (final reduction) or ptr_col (non-final)
+    int* tile_counters = nullptr; // set only for non-atomic final reduction; placed after the buffer in the workspace
+  };
+
+  template <class ProblemShape>
+  static constexpr Params // maps Arguments (+ workspace pointer) to Params; never dereferences the workspace
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    ElementCompute* reduction_buffer = nullptr; // initialized: C++17 forbids uninitialized locals in constexpr functions
+    int* tile_counters = nullptr;
+    if constexpr (IsAtomic) {
+      // Atomic mode reduces straight into ptr_col: no buffer and no counters (both stay nullptr).
+    }
+    else if constexpr (FinalReduction) {
+      auto problem_shape_mnkl = append<4>(problem_shape, 1);
+      auto [M, N, K, L] = problem_shape_mnkl;
+      // Workspace layout: [reduction buffer | padding to MinWorkspaceAlignment | tile counters]
+      auto [tile_M, tile_N, tile_K] = CtaTileShapeMNK{};
+      size_t tile_counters_offset = product(ceil_div(make_shape(M,N,L), make_shape(tile_M, tile_N))) * tile_M * sizeof(ElementCompute);
+      tile_counters_offset = round_nearest(tile_counters_offset, MinWorkspaceAlignment);
+
+      reduction_buffer = reinterpret_cast<ElementCompute*>(workspace);
+      tile_counters = reinterpret_cast<int*>(reinterpret_cast<uint8_t*>(workspace) + tile_counters_offset);
+    }
+    else {
+      reduction_buffer = reinterpret_cast<ElementCompute*>(args.ptr_col); // non-final: partial results are written to the user buffer
+    }
+
+    return {
+      args.ptr_col,
+      args.reduction_identity,
+      args.dCol,
+      reduction_buffer,
+      tile_counters
+    };
+  }
+
+  template <class ProblemShape>
+  static bool // no argument validation is performed by this node
+  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
+    return true; // unconditionally accepted; neither parameter is read
+  }
+
+  template <class ProblemShape>
+  static size_t // bytes of workspace needed: partial-result buffer, alignment padding, then tile counters
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
+    if constexpr (IsAtomic || not FinalReduction) {
+      return 0; // these modes use no workspace
+    }
+
+    auto [M, N, K, L] = append<4>(problem_shape, 1);
+    auto [tile_M, tile_N, tile_K] = CtaTileShapeMNK{};
+
+    // Reduction buffer: tile_M ElementCompute values for every (M tile, N tile, batch) triple
+    auto num_tiles = product(ceil_div(make_shape(M,N,L), make_shape(tile_M, tile_N)));
+    size_t buffer_bytes = num_tiles * tile_M * sizeof(ElementCompute);
+    // Counters begin at the next MinWorkspaceAlignment boundary: one int per M tile
+    size_t counters_offset = round_nearest(buffer_bytes, MinWorkspaceAlignment);
+    size_t counters_bytes = cute::ceil_div(M, tile_M) * sizeof(int);
+    size_t workspace_size = counters_offset + counters_bytes;
+
+    return workspace_size;
+  }
+
+  template <class ProblemShape> // Host-side, pre-launch initialization of the memory this op reduces into
+  static cutlass::Status
+  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    auto problem_shape_mnkl = append<4>(problem_shape, 1); // declared outside the #if: the FinalReduction branch needs M/N/L too
+    auto [M, N, K, L] = problem_shape_mnkl;
+#if !defined(CUTLASS_SKIP_REDUCTION_INIT)
+    if constexpr (IsAtomic) { // atomic mode: pre-fill the output with the reduction identity (skipped when ptr_col is null)
+      Layout mCol_layout = make_layout(make_shape(size<>(M),size<>(N),size<>(L)), args.dCol);
+      if (args.ptr_col != nullptr) {
+        return fill_workspace(args.ptr_col, ElementOutput(args.reduction_identity), cosize(mCol_layout), stream, cuda_adapter);
+      }
+      return Status::kSuccess;
+    }
+    else
+#endif // !defined(CUTLASS_SKIP_REDUCTION_INIT)
+    if constexpr (not IsAtomic && FinalReduction) { // atomic mode owns no workspace (get_workspace_size returns 0 for it)
+      auto [tile_M, tile_N, tile_K] = CtaTileShapeMNK{};
+      size_t tile_counters_offset = product(ceil_div(make_shape(M,N,L), make_shape(tile_M, tile_N))) * tile_M * sizeof(ElementCompute);
+      tile_counters_offset = round_nearest(tile_counters_offset, MinWorkspaceAlignment);
+      // The tile counters sit after the aligned reduction buffer and must start at zero
+      int* tile_counters = reinterpret_cast<int*>(reinterpret_cast<uint8_t*>(workspace) + tile_counters_offset);
+      size_t tile_counters_size = cute::ceil_div(M, tile_M) * sizeof(int);
+      return zero_workspace(tile_counters, tile_counters_size, stream, cuda_adapter);
+    }
+    else {
+      return Status::kSuccess; // nothing to initialize
+    }
+  }
+
+  CUTLASS_DEVICE bool
+  is_producer_load_needed() const {
+    return false; // always false for this op
+  }
+
+  CUTLASS_DEVICE bool
+  is_C_load_needed() const {
+    return false; // always false for this op
+  }
+
+  CUTLASS_HOST_DEVICE
+  Sm90ColReduction() { } // leaves params at its member defaults (ptr_col == nullptr)
+
+  CUTLASS_HOST_DEVICE
+  Sm90ColReduction(Params const& params, SharedStorage const& shared_storage) // shared_storage is not used
+      : params(params) { }
+
+  Params params;
+
+  template <class... Args>
+  CUTLASS_DEVICE auto
+  get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
+    return EmptyProducerLoadCallbacks{}; // args is ignored; this op has no producer-side work
+  }
+
+  template<class ArgsTuple>
+  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
+    CUTLASS_DEVICE
+    ConsumerStoreCallbacks(ArgsTuple&& args_tuple, Params const& params) // args_tuple: bundle built by get_consumer_store_callbacks
+      : args_tuple(cute::forward<ArgsTuple>(args_tuple)),
+        params(params) {} // the params member is a const reference, so no copy is made here
+
+    ArgsTuple args_tuple;
+    Params const& params;
+    bool do_final_reduction = false;
+
+    template <typename ElementAccumulator, typename ElementInput, int FragmentSize>
+    CUTLASS_DEVICE auto // returns frg_input unchanged; the reduction happens as a side effect on tCrCol
+    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n,
+          Array<ElementInput, FragmentSize> const& frg_input) {
+      if constexpr (EnableNullptr) {
+        if (params.ptr_col == nullptr) {
+          return frg_input; // reduction disabled at runtime
+        }
+      }
+      // Unpack the callback state; visit() only uses tCrCol, tCcCol and residue_tCcCol
+      auto& [ref_src, tCrCol, tCcCol, gCol_l, cCol, gBuf_nl, sBuf_layout,
+              lane_layout_MN, lane_mn, warp_layout_MN, warp_mn,
+              tile_coord_mnkl, residue_cCol, residue_tCcCol, epi_tile, tiled_copy, thread_idx] = args_tuple;
+      Tensor tCrCol_mn = tCrCol(_,_,_,epi_m,epi_n);
+      Tensor tCcCol_mn = tCcCol(_,_,_,epi_m,epi_n);
+
+      using ConvertInput = NumericArrayConverter<ElementCompute, ElementInput, FragmentSize, RoundStyle>;
+      using ReduceInput = RegReduceFn<ElementCompute>;
+      ConvertInput convert_input{};
+      ReduceInput reduce_input{};
+      // Convert to ElementCompute, then fold each element into this thread's register accumulator
+      Array frg_I = convert_input(frg_input);
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < FragmentSize; ++i) {
+        if (!VisitCheckOOB || elem_less(tCcCol_mn(epi_v * FragmentSize + i), residue_tCcCol)) { // skip out-of-bounds coords when checking is on
+          ElementCompute& tCrCol_vmn = tCrCol_mn(epi_v * FragmentSize + i);
+          tCrCol_vmn = reduce_input(tCrCol_vmn, frg_I[i]);
+        }
+      }
+
+      return frg_input;
+    }
+
+    template <class STensor, class SyncFn, class VTensor> // Combines per-thread accumulators across lanes, warps and (optionally) CTAs
+    CUTLASS_DEVICE void
+    reduce(STensor&& smem_buffer, SyncFn const& sync_fn, int epi_m, int epi_n, bool is_last_iteration, VTensor visit_results) {
+      if (not is_last_iteration) {
+        return; // all work below is deferred to the call flagged as the last iteration
+      }
+
+      auto& [ref_src, tCrCol, tCcCol, gCol_l, cCol, gBuf_nl, sBuf_layout,
+              lane_layout_MN, lane_mn, warp_layout_MN, warp_mn,
+              tile_coord_mnkl, residue_cCol, residue_tCcCol, epi_tile, tiled_copy, thread_idx] = args_tuple;
+      auto [m, n, k, l] = tile_coord_mnkl;
+      constexpr bool ReferenceSrc = decltype(ref_src)::value;
+
+      // Runtime nullptr is noop
+      if constexpr (EnableNullptr) {
+        if (params.ptr_col == nullptr) {
+          return;
+        }
+      }
+
+      // fully OOB CTA in partially OOB cluster
+      if (not elem_less(cCol(_0{},_0{}), residue_cCol)) {
+        return;
+      }
+
+      //
+      // 1. Warp shuffle reduction
+      //
+      using FragmentShuffle = Array<ElementCompute, sizeof(uint64_t) / sizeof(ElementCompute)>;
+      using ReduceShuffle = ShuffleReduceFn<FragmentShuffle>;
+      ReduceShuffle reduce_shuffle{};
+      Tensor tCrCol_frg = recast<FragmentShuffle>(filter(tCrCol));
+      CUTLASS_PRAGMA_UNROLL
+      for (int reduction_cols = size<1>(lane_layout_MN) / 2; reduction_cols > 0; reduction_cols /= 2) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int frg_idx = 0; frg_idx < size(tCrCol_frg); ++frg_idx) {
+          uint64_t frg_shfl = reinterpret_cast<uint64_t&>(tCrCol_frg(frg_idx)); // shuffle 64 bits at a time (FragmentShuffle is sized from sizeof(uint64_t))
+          frg_shfl = __shfl_down_sync(0xFFFFFFFF, frg_shfl, lane_layout_MN(_0{},reduction_cols));
+          tCrCol_frg(frg_idx) = reduce_shuffle(tCrCol_frg(frg_idx), reinterpret_cast<FragmentShuffle&>(frg_shfl));
+        }
+      }
+      bool is_reduced_lane = get<1>(lane_mn) == 0; // only lanes with N coordinate 0 write results in the steps below
+
+      //
+      // 2. Atomic reduction
+      //
+      if constexpr (IsAtomic) {
+        // Filter so we don't issue redundant copies over stride-0 modes
+        Tensor tCrCol_flt = filter_zeros(tCrCol);
+        Tensor tCcCol_flt = make_tensor(tCcCol.data(), make_layout(tCrCol_flt.shape(), tCcCol.stride()));
+
+        Tensor tCgCol = sm90_partition_for_epilogue<ReferenceSrc>(gCol_l(_,_,l), epi_tile, tiled_copy, thread_idx);
+        Tensor tCgCol_flt = filter_zeros(tCgCol);
+
+        // NOTE: atomic reduction is performed in the output type
+        using ConvertOutput = NumericConverter<ElementOutput, ElementCompute, RoundStyle>;
+        using ReduceOutput = GmemReduceFn<ElementOutput>;
+        ConvertOutput convert_output{};
+        ReduceOutput reduce_output{};
+
+        if (is_reduced_lane) {
+          CUTLASS_PRAGMA_UNROLL
+          for (int i = 0; i < size(tCrCol_flt); ++i) {
+            if (elem_less(tCcCol_flt(i), residue_tCcCol)) {
+              reduce_output(&tCgCol_flt(i), convert_output(tCrCol_flt(i)));
+            }
+          }
+        }
+        sync_fn();
+      }
+
+      //
+      // 2. One warp in N, skip threadblock smem reduction
+      //
+      else if constexpr (decltype(size<1>(warp_layout_MN))::value <= 1) {
+        // Dump warp reduction to gmem workspace
+        using ElementGmem = cute::conditional_t<FinalReduction, ElementCompute volatile, ElementCompute>;
+        Tensor tCgBuf = sm90_partition_for_epilogue<ReferenceSrc>(gBuf_nl(_,_,n,l), epi_tile, tiled_copy, thread_idx);
+        if (is_reduced_lane) {
+          // Filter so we don't issue redundant copies over stride-0 modes
+          // (only works if 0-strides are in same location, which is by construction)
+          copy_aligned(filter(tCrCol), recast<ElementGmem>(filter(tCgBuf)));
+        }
+        sync_fn();
+      }
+
+      //
+      // 2. Multiple warps in N, do threadblock smem reduction
+      //
+      else {
+        Tensor sBuf = make_tensor(make_smem_ptr<ElementCompute>(raw_pointer_cast(smem_buffer.data())), sBuf_layout);
+        static_assert(decltype(cosize(sBuf.layout()))::value * sizeof(ElementCompute) <=
+                      decltype(cosize(smem_buffer.layout()))::value * sizeof(typename remove_cvref_t<STensor>::value_type),
+                      "smem reduction buffer not large enough, use a larger epilogue tile");
+        sync_fn();
+
+        // Dump warp reduction to smem workspace
+        Tensor tCsBuf = sm90_partition_for_epilogue<ReferenceSrc>(sBuf(_,_,get<1>(warp_mn)), epi_tile, tiled_copy, thread_idx);
+        if (is_reduced_lane) {
+          // Filter so we don't issue redundant copies over stride-0 modes
+          // (only works if 0-strides are in same location, which is by construction)
+          copy_aligned(filter(tCrCol), filter(tCsBuf));
+        }
+        sync_fn();
+
+        constexpr int SmemFragSize = cute::max(size_t{1}, sizeof(uint32_t) / sizeof(ElementCompute));
+        using FragmentSmem = Array<ElementCompute, SmemFragSize>;
+        using VectorSmem = uint_bit_t<sizeof_bits_v<FragmentSmem>>;
+        using ReduceSmem = GmemReduceFn<FragmentSmem>;
+        ReduceSmem reduce_smem{};
+
+        Tensor sBuf_frg = recast<FragmentSmem>(filter_zeros(sBuf));
+        Tensor sBuf_vec = recast<VectorSmem>(filter_zeros(sBuf));
+        constexpr int FragsPerCol = decltype(size<0>(sBuf_frg))::value;
+
+        // Do the threadblock smem reduction
+        CUTLASS_PRAGMA_UNROLL
+        for (int reduction_cols = size<1>(warp_layout_MN) / 2; reduction_cols > 1; reduction_cols /= 2) { // stops at 2 columns; the last pair is combined in the gmem loop below
+          int FragsPerReduction = reduction_cols * FragsPerCol;
+          CUTLASS_PRAGMA_NO_UNROLL
+          for (int frg_idx = thread_idx; frg_idx < FragsPerReduction; frg_idx += size(tiled_copy)) {
+            FragmentSmem frg_smem = reduce_smem(sBuf_frg(frg_idx), sBuf_frg(frg_idx + FragsPerReduction));
+            sBuf_vec(frg_idx) = reinterpret_cast<VectorSmem&>(frg_smem);
+          }
+          sync_fn();
+        }
+
+        // Do final smem reduction and dump to gmem workspace
+        using VectorGmem = cute::conditional_t<FinalReduction, VectorSmem volatile, VectorSmem>;
+        Tensor gBuf_vec = recast<VectorGmem>(filter(gBuf_nl(_,_,n,l)));
+        CUTLASS_PRAGMA_NO_UNROLL
+        for (int frg_idx = thread_idx; frg_idx < FragsPerCol; frg_idx += size(tiled_copy)) {
+          FragmentSmem frg_smem = reduce_smem(sBuf_frg(frg_idx), sBuf_frg(frg_idx + FragsPerCol));
+          gBuf_vec(frg_idx) = reinterpret_cast<VectorSmem&>(frg_smem);
+        }
+        sync_fn();
+      }
+
+      //
+      // 3. Increment atomic counters to signal final gmem reduction
+      //
+      if constexpr (not IsAtomic && FinalReduction) {
+        // Ensure gmem writes are visible to other threads before incrementing counter
+        __threadfence();
+        sync_fn();
+        // Collective thread 0 increments atomic tile counter and copies value to smem
+        int* prev_tile_count = reinterpret_cast<int*>(raw_pointer_cast(smem_buffer.data()));
+        if (thread_idx == 0) {
+          *prev_tile_count = atomicAdd(&params.tile_counters[m], 1);
+        }
+        sync_fn();
+        // Broadcast tile count to other threads in CTA and determine final reduction status
+        do_final_reduction = *prev_tile_count == size<2>(gBuf_nl) * size<3>(gBuf_nl) - 1; // true only for the last-arriving tile of this M tile; consumed by end()
+        sync_fn();
+      }
+    }
+
+    CUTLASS_DEVICE void
+    end() { // does work only when reduce() set do_final_reduction for this CTA
+      //
+      // 4. Do final gmem reduction if necessary
+      //
+      if constexpr (not IsAtomic && FinalReduction) {
+        if (not do_final_reduction) {
+          return;
+        }
+
+        auto& [ref_src, tCrCol, tCcCol, gCol_l, cCol, gBuf_nl, sBuf_layout,
+                lane_layout_MN, lane_mn, warp_layout_MN, warp_mn,
+                tile_coord_mnkl, residue_cCol, residue_tCcCol, epi_tile, tiled_copy, thread_idx] = args_tuple;
+
+        using ReduceOutput = GmemReduceFn<ElementCompute>;
+        using ConvertOutput = NumericConverter<ElementOutput, ElementCompute, RoundStyle>;
+        ReduceOutput reduce_output{};
+        ConvertOutput convert_output{};
+
+        // Reduction over batches
+        if (size<2>(stride(gCol_l)) == 0) { // batch stride of the output is zero
+          CUTLASS_PRAGMA_NO_UNROLL
+          for (int m = thread_idx; m < size<0>(gBuf_nl); m += size(tiled_copy)) { // each thread handles rows m, m + size(tiled_copy), ...
+            Tensor tRgBuf_nl = gBuf_nl(m,_0{},_,_);
+            ElementCompute output = tRgBuf_nl(_0{});
+            CUTLASS_PRAGMA_NO_UNROLL
+            for (int nl = 1; nl < size(tRgBuf_nl); ++nl) {
+              output = reduce_output(output, tRgBuf_nl(nl));
+            }
+            if (elem_less(cCol(m,_0{}), residue_cCol)) { // store only in-bounds rows
+              gCol_l(m,_0{},_0{}) = convert_output(output);
+            }
+          }
+        }
+        // No reduction over batches
+        else {
+          CUTLASS_PRAGMA_NO_UNROLL
+          for (int m = thread_idx; m < size<0>(gBuf_nl); m += size(tiled_copy)) {
+            bool do_store = elem_less(cCol(m,_0{}), residue_cCol);
+            CUTLASS_PRAGMA_NO_UNROLL
+            for (int l = 0; l < size<3>(gBuf_nl); ++l) { // one output element per batch index l
+              Tensor tRgBuf_n = gBuf_nl(m,_0{},_,l);
+              ElementCompute output = tRgBuf_n(_0{});
+              CUTLASS_PRAGMA_NO_UNROLL
+              for (int n = 1; n < size(tRgBuf_n); ++n) {
+                output = reduce_output(output, tRgBuf_n(n));
+              }
+              if (do_store) {
+                gCol_l(m,_0{},l) = convert_output(output);
+              }
+            }
+          }
+        }
+
+      }
+    }
+
+  };
+
+  template <
+    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
+    class... Args
+  >
+  CUTLASS_DEVICE auto // builds the per-thread tensors and layouts consumed by ConsumerStoreCallbacks
+  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
+    Layout ref_layout_MN = [&] () {
+      if constexpr (ReferenceSrc) { return get<0>(args.tiled_copy.get_layoutS_MN()); }
+      else                        { return get<0>(args.tiled_copy.get_layoutD_MN()); }
+    }();                                                                                         // tile_mn -> tv_idx
+
+    // Get the MN layout + coord of lanes to determine shuffle reduction iterations
+    using _W = Int<decltype(args.tiled_copy)::TiledNumThr::value / NumThreadsPerWarp>;
+    Layout tv2lane = Layout<Shape<Int<NumThreadsPerWarp>,_W,_1>,Stride<_1,_0,_0>>{};            //   tv_idx -> lane_idx
+    Layout ref2lane = composition(tv2lane, ref_layout_MN);                                      //  tile_mn -> lane_idx
+    Layout lane_layout_MN = make_layout(filter(get<0>(ref2lane)), filter(get<1>(ref2lane)));    //  lane_mn -> lane_idx
+    Layout inv_lane_layout_MN = right_inverse(lane_layout_MN);                                  // lane_idx -> lane_mn
+    int lane_idx = canonical_lane_idx();
+    auto lane_mn = idx2crd(inv_lane_layout_MN(lane_idx), shape(lane_layout_MN));
+
+    // Get the MN layout + coord of warps to determine smem reduction iterations
+    Layout tv2warp = Layout<Shape<Int<NumThreadsPerWarp>,_W,_1>,Stride<_0,_1,_0>>{};            //   tv_idx -> warp_idx
+    Layout ref2warp = composition(tv2warp, ref_layout_MN);                                      //  tile_mn -> warp_idx
+    Layout warp_layout_MN = make_layout(filter(get<0>(ref2warp)), filter(get<1>(ref2warp)));    //  warp_mn -> warp_idx
+    Layout inv_warp_layout_MN = right_inverse(warp_layout_MN);                                  // warp_idx -> warp_mn
+    int warp_idx = args.thread_idx / NumThreadsPerWarp;
+    auto warp_mn = idx2crd(inv_warp_layout_MN(warp_idx), shape(warp_layout_MN));
+
+    // Partition output gmem and register tensors
+    auto [tile_M, tile_N, tile_K] = args.tile_shape_mnk;
+    auto [M, N, K, L] = args.problem_shape_mnkl;
+    auto [m, n, k, l] = args.tile_coord_mnkl;
+
+    Tensor mCol = make_tensor(make_gmem_ptr<ElementOutput>(params.ptr_col), make_shape(M,N,L), params.dCol); // (M,N,L)
+    Tensor gCol_l = local_tile(mCol, take<0,2>(args.tile_shape_mnk), make_coord(m,n,_));             // (CTA_M,CTA_N,L)
+    Tensor tCgCol = sm90_partition_for_epilogue<ReferenceSrc>(                         // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
+                      gCol_l(_,_,l), args.epi_tile, args.tiled_copy, args.thread_idx);
+    Tensor tCrCol = make_tensor_like<ElementCompute>(tCgCol);                          // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
+    fill(tCrCol, params.reduction_identity); // register accumulators start from the reduction identity
+
+    // Partition gmem+smem reduction buffer tensors
+    Layout gBuf_layout = make_layout(take<0,2>(args.tile_shape_mnk), make_stride(_1{}, _0{}));
+    Layout mBuf_layout = blocked_product(gBuf_layout, make_layout(ceil_div(make_shape(M,N,L), shape(gBuf_layout))));
+    Tensor mBuf = make_tensor(make_gmem_ptr(params.reduction_buffer), mBuf_layout);                // (ceil_M,ceil_N,L)
+    Tensor gBuf_nl = local_tile(mBuf, take<0,2>(args.tile_shape_mnk), make_coord(m,_,_));     // (CTA_M,CTA_N,REST_N,L)
+    Layout sBuf_layout = blocked_product(gBuf_layout,make_layout(make_shape(_1{},_1{},size<1>(warp_layout_MN)))); // (CTA_M,CTA_N,WARPS_N)
+
+    auto args_tuple = make_tuple( // element order must match the structured bindings in ConsumerStoreCallbacks
+        bool_constant<ReferenceSrc>{}, cute::move(tCrCol), args.tCcD, gCol_l, args.cD, gBuf_nl, sBuf_layout,
+        lane_layout_MN, lane_mn, warp_layout_MN, warp_mn,
+        args.tile_coord_mnkl, args.residue_cD, args.residue_tCcD, args.epi_tile, args.tiled_copy, args.thread_idx);
+    return ConsumerStoreCallbacks<decltype(args_tuple)>(std::move(args_tuple), params);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Batch matrix reduction (declaration only: no definition follows before the namespace is closed below)
+template <
+  int Stages,
+  class EpilogueTile,
+  class Element,
+  class StrideMNL,
+  class CopyOpR2S,
+  class SmemLayoutAtom,
+  int Alignment = 128 / sizeof_bits_v<Element>, // default: number of Elements that fit in 128 bits
+  bool EnableNullptr = true // Noop on nullptr params
+>
+struct Sm90MatrixReduction;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::epilogue::fusion
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp
new file mode 100755
index 000000000..4f7d99fa3
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp
@@ -0,0 +1,1139 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+  \brief Visitor tree operation base implementation to enable composable fusions
+         for the sm90 TMA warp-specialized (ws) epilogue
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/workspace.h"
+
+#include "cute/tensor.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::epilogue::fusion {
+
+using namespace cute;
+using cute::tuple;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Partitioning Helpers
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template < // Divides a CTA tile into epilogue subtiles and returns one thread's partition of the result
+  bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
+  class CtaTileMN,
+  class EpilogueTile,
+  class TiledCopy
+>
+CUTLASS_HOST_DEVICE
+constexpr auto
+sm90_partition_for_epilogue(
+    CtaTileMN cT,          // (CTA_M,CTA_N,...)
+    EpilogueTile epi_tile, // (EPI_TILE_M,EPI_TILE_N)
+    TiledCopy tiled_copy,  // copy whose per-thread slice defines the partitioning
+    int thread_idx) {      // thread whose slice is taken
+  ThrCopy thread_copy = tiled_copy.get_thread_slice(thread_idx);
+  Tensor cT_epi = flat_divide(cT, epi_tile);                                 // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N,...)
+  if constexpr (ReferenceSrc) {
+    return thread_copy.partition_S(cT_epi);                                        // (CPY,CPY_M,CPY_N,EPI_M,EPI_N,...)
+  }
+  else {
+    return thread_copy.partition_D(cT_epi);                                        // (CPY,CPY_M,CPY_N,EPI_M,EPI_N,...)
+  }
+}
+
+template < // Overload for a full (M,N,L) tensor: picks the CTA tile at tile_coord_mnkl, then partitions it
+  bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
+  class Engine, class LayoutMNL,
+  class TileShapeMNK,
+  class TileCoordMNKL,
+  class EpilogueTile,
+  class TiledCopy
+>
+CUTLASS_HOST_DEVICE
+constexpr auto
+sm90_partition_for_epilogue(
+    Tensor<Engine, LayoutMNL> mT,  // (M,N,L)
+    TileShapeMNK tile_shape_mnk,   // (CTA_M,CTA_N,CTA_K)
+    TileCoordMNKL tile_coord_mnkl, // (m,n,k,l)
+    EpilogueTile epi_tile,         // (EPI_TILE_M,EPI_TILE_N)
+    TiledCopy tiled_copy,
+    int thread_idx) {
+  auto [m, n, k, l] = tile_coord_mnkl;
+  auto coord_shape =
+      make_coord(m, n, l) // the k coordinate is dropped: mT has no K mode
+    ;
+  Tensor cT = local_tile(mT, take<0,2>(tile_shape_mnk), coord_shape);                                  // (CTA_M,CTA_N)
+  Tensor tCcT =
+    sm90_partition_for_epilogue<ReferenceSrc>(cT, epi_tile, tiled_copy, thread_idx);   // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
+
+  return tCcT;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Visitor Implementation
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template< // Plain bundle of values passed to each op's get_producer_load_callbacks
+  class ProblemShapeMNKL,
+  class TileShapeMNK,
+  class TileCoordMNKL,
+  class TiledMma,
+  class EpilogueTile
+>
+struct ProducerLoadArgs {
+  ProblemShapeMNKL problem_shape_mnkl;
+  TileShapeMNK tile_shape_mnk;
+  TileCoordMNKL tile_coord_mnkl;
+  TiledMma tiled_mma;
+  EpilogueTile epi_tile;
+  int thread_idx;
+  // The constructor copies every argument into the member of the same name
+  CUTLASS_DEVICE
+  ProducerLoadArgs(
+      ProblemShapeMNKL problem_shape_mnkl,
+      TileShapeMNK tile_shape_mnk,
+      TileCoordMNKL tile_coord_mnkl,
+      TiledMma tiled_mma,
+      EpilogueTile epi_tile,
+      int thread_idx)
+  : problem_shape_mnkl(problem_shape_mnkl),
+    tile_shape_mnk(tile_shape_mnk),
+    tile_coord_mnkl(tile_coord_mnkl),
+    tiled_mma(tiled_mma),
+    epi_tile(epi_tile),
+    thread_idx(thread_idx) {}
+};
+
+template< // Plain bundle of values passed to each op's get_consumer_store_callbacks
+  class ProblemShapeMNKL,
+  class TileShapeMNK,
+  class TileCoordMNKL,
+  class TiledMma,
+  class EpilogueTile,
+  class TiledCopy,
+  class CoordTensor,
+  class Residue,
+  class ThrCoordTensor,
+  class ThrResidue,
+  class ThrSrcTensor
+>
+struct ConsumerStoreArgs {
+  ProblemShapeMNKL problem_shape_mnkl;
+  TileShapeMNK tile_shape_mnk;
+  TileCoordMNKL tile_coord_mnkl;
+  TiledMma tiled_mma;
+  EpilogueTile epi_tile;
+  TiledCopy tiled_copy;
+  CoordTensor cD;
+  Residue residue_cD;
+  ThrCoordTensor tCcD;
+  ThrResidue residue_tCcD;
+  ThrSrcTensor & tCrC; // the only member held by reference: the referenced tensor must outlive this object
+  int thread_idx;
+  // The constructor copies every argument into the member of the same name (tCrC is bound as a reference)
+  CUTLASS_DEVICE
+  ConsumerStoreArgs(
+      ProblemShapeMNKL problem_shape_mnkl,
+      TileShapeMNK tile_shape_mnk,
+      TileCoordMNKL tile_coord_mnkl,
+      TiledMma tiled_mma,
+      EpilogueTile epi_tile,
+      TiledCopy tiled_copy,
+      CoordTensor cD,
+      Residue residue_cD,
+      ThrCoordTensor tCcD,
+      ThrResidue residue_tCcD,
+      ThrSrcTensor & tCrC,
+      int thread_idx)
+  : problem_shape_mnkl(problem_shape_mnkl),
+    tile_shape_mnk(tile_shape_mnk),
+    tile_coord_mnkl(tile_coord_mnkl),
+    tiled_mma(tiled_mma),
+    epi_tile(epi_tile),
+    tiled_copy(tiled_copy),
+    cD(cD),
+    residue_cD(residue_cD),
+    tCcD(tCcD),
+    residue_tCcD(residue_tCcD),
+    tCrC(tCrC),
+    thread_idx(thread_idx) {}
+};
+
+template <class... Ops> // Aggregates a pack of ops: tuples of their storage/arguments/params plus fan-out of the static host API
+struct Sm90VisitorImplBase {
+  // Shared memory allocation
+  using SharedStorage = tuple<typename Ops::SharedStorage...>;
+  // Host side fusion arguments
+  using Arguments = tuple<typename Ops::Arguments...>;
+  // Device side fusion params (Kernel-entry API)
+  using Params = tuple<typename Ops::Params...>;
+
+  template <class ProblemShape>
+  static constexpr Params // converts each op's Arguments to its Params, handing each op its own slice of workspace
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    uint8_t* op_workspace = reinterpret_cast<uint8_t*>(workspace);
+    return transform_apply(tuple<Ops...>{}, args,
+      [&] (auto&& op, auto const& op_args) { // NOTE(review): relies on the ops being visited in tuple order, since op_workspace advances as a side effect
+        using Op = cute::remove_cvref_t<decltype(op)>;
+        auto ret = Op::to_underlying_arguments(problem_shape, op_args, op_workspace);
+        if (op_workspace != nullptr) { // a null workspace is passed through unchanged to every op
+          size_t op_workspace_size = Op::get_workspace_size(problem_shape, op_args);
+          op_workspace += round_nearest(op_workspace_size, MinWorkspaceAlignment);
+        }
+        return ret;
+      },
+      [] (auto&&... op_params) { return cute::make_tuple(op_params...); }
+    );
+  }
+
+  template <class ProblemShape>
+  static bool // true only if every op reports that it can implement the problem
+  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
+    return transform_apply(tuple<Ops...>{}, args,
+      [&] (auto&& op, auto const& op_args) {
+        using Op = cute::remove_cvref_t<decltype(op)>;
+        return Op::can_implement(problem_shape, op_args);
+      },
+      [&] (auto&&... implementable) {
+        return (true && ... && implementable);
+      }
+    );
+  }
+
+  template <class ProblemShape>
+  static size_t // sum of the ops' workspace sizes, each rounded up to MinWorkspaceAlignment
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
+    return transform_apply(tuple<Ops...>{}, args,
+      [&] (auto&& op, auto const& op_args) {
+        using Op = cute::remove_cvref_t<decltype(op)>;
+        size_t op_workspace_size = Op::get_workspace_size(problem_shape, op_args);
+        return round_nearest(op_workspace_size, MinWorkspaceAlignment); // same rounding as the pointer advance in to_underlying_arguments
+      },
+      [&] (auto&&... op_workspace_size) {
+        return (0 + ... + op_workspace_size);
+      }
+    );
+  }
+
+  template <class ProblemShape>
+  static cutlass::Status // initializes each op's workspace slice; returns the first non-success status, if any
+  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    Status status = Status::kSuccess;
+    uint8_t* op_workspace = reinterpret_cast<uint8_t*>(workspace);
+    return transform_apply(tuple<Ops...>{}, args,
+      // Initialize each operation's workspace, stopping at the first error
+      [&] (auto&& op, auto const& op_args) {
+        if (status != Status::kSuccess) {
+          return status;
+        }
+
+        using Op = cute::remove_cvref_t<decltype(op)>;
+        status = Op::initialize_workspace(problem_shape, op_args, op_workspace, stream, cuda_adapter);
+        if (op_workspace != nullptr) {
+          size_t op_workspace_size = Op::get_workspace_size(problem_shape, op_args);
+          op_workspace += round_nearest(op_workspace_size, MinWorkspaceAlignment);
+        }
+        return status;
+      },
+      // Return the final status
+      [&] (auto const&...ops) { return status; }
+    );
+  }
+
+  CUTLASS_HOST_DEVICE
+  Sm90VisitorImplBase() {}
+
+  CUTLASS_HOST_DEVICE
+  Sm90VisitorImplBase(Params const& params, SharedStorage const& shared_storage) // builds each op from its own params and storage entry
+    : ops(transform_apply(tuple<Ops...>{}, params, shared_storage,
+        [] (auto&& op, auto const& op_params, auto&& op_storage) {
+          using Op = cute::remove_cvref_t<decltype(op)>;
+          return Op(op_params, op_storage);
+        },
+        [] (auto&&... ops) { return cute::make_tuple(ops...); }
+      )) {}
+
+  // Ops can store kernel persistent variables (e.g. descriptors, scalars, wave counters)
+  tuple<Ops...> ops;
+};
+
+
+template <class... Ops>
+struct Sm90VisitorImpl : Sm90VisitorImplBase<Ops...> {
+
+  using Impl = Sm90VisitorImplBase<Ops...>;
+  using Params = typename Impl::Params;
+  using SharedStorage = typename Impl::SharedStorage;
+
+  CUTLASS_HOST_DEVICE
+  Sm90VisitorImpl() {}
+
+  CUTLASS_HOST_DEVICE
+  Sm90VisitorImpl(Params const& params, SharedStorage const& shared_storage)
+    : Impl(params, shared_storage) {}
+
+  using Impl::ops;
+
+  //
+  // Queries for kernel runtime
+  //
+
+  // Is a specialized warp for producer TMA loads needed
+  // e.g. Aux tensor loads, broadcasts using TMA bulk copy
+  // This condition cannot change between work tiles because it is used
+  // to determine whether the load warp should exit early or not
+  // e.g. for batched beta this must always be true regardless of current batch idx
+  CUTLASS_DEVICE bool
+  is_producer_load_needed() const {
+    return cute::apply(ops,
+      [] (auto const&... op) {
+        return (false || ... || op.is_producer_load_needed());
+      }
+    );
+  }
+
+  // Is a producer TMA load specifically for C needed
+  // If this is true then is_producer_load_needed must also be true
+  // This condition can change between work tiles because it is only used
+  // to determine whether the TMA and smem loads for C of a given tile should happen
+  // e.g. for batched beta this can be false depending on current batch idx
+  CUTLASS_DEVICE bool
+  is_C_load_needed() const {
+    return cute::apply(ops,
+      [] (auto const&... op) {
+        return (false || ... || op.is_C_load_needed());
+      }
+    );
+  }
+
+  //
+  // Producer load callbacks, called by the epilogue load warp.
+  // Operations usually only define this if TMA load is needed. Most operations will reuse this empty implementation
+  // Load callbacks are responsible for issuing corresponding mbarrier expect-tx ops for any TMA loads issued, but
+  // are not responsible for issuing the producer_commit barrier arrival, which is issued by the collective instead
+  // If this is non-empty, is_producer_load_needed must be true.
+  //
+  template <class CallbacksTuple>
+  struct ProducerLoadCallbacks {
+    // Callbacks can store non-persistent variables (e.g. tensors) or copies of persistent variables
+    CallbacksTuple callbacks_tuple;
+
+    // Before entry of the subtile load loop
+    CUTLASS_DEVICE void
+    begin() {
+      for_each(callbacks_tuple,
+        [&] (auto& callbacks) {
+          callbacks.begin();
+        }
+      );
+    }
+
+    // Entry of the subtile load loop. Aux loads usually performed here
+    // Upon entry the producer acquire of the current subtile lock has completed.
+    // Upon exit all TMA loads for this subtile must have been issued, with corresponding expect-tx operations
+    CUTLASS_DEVICE void
+    step(uint64_t* full_mbarrier_ptr, int epi_m, int epi_n, int load_iteration, bool issue_tma_load) {
+      for_each(callbacks_tuple,
+        [&] (auto& callbacks) {
+          callbacks.step(full_mbarrier_ptr, epi_m, epi_n, load_iteration, issue_tma_load);
+        }
+      );
+    }
+
+    // Exit of the subtile load loop.
+    CUTLASS_DEVICE void
+    end() {
+      for_each(callbacks_tuple,
+        [] (auto& callbacks) {
+          callbacks.end();
+        }
+      );
+    }
+  };
+
+  // Producer load callbacks factory: gathers each op's producer load callbacks into one ProducerLoadCallbacks
+  // All operations must redefine this, but most can just dispatch to the base impl
+  template <class... Args>
+  CUTLASS_DEVICE auto
+  get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
+    return transform_apply(ops,
+      [&] (auto& node) {
+        return node.get_producer_load_callbacks(args);
+      },
+      [] (auto&&... per_op) {
+        auto packed = cute::make_tuple(per_op...);
+        return ProducerLoadCallbacks<decltype(packed)>{packed};
+      }
+    );
+  }
+
+  //
+  // Consumer store callbacks, called by the epilogue store warps.
+  // All operations must redefine this, with optional inheritance from this empty implementation.
+  //
+  template <class CallbacksTuple>
+  struct ConsumerStoreCallbacks {
+    // Per-op callbacks; they can store non-persistent variables (e.g. tensors) or copies of persistent variables
+    CallbacksTuple callbacks_tuple;
+
+    // Called before entry of the subtile store loop. Gmem broadcasts are usually performed here.
+    CUTLASS_DEVICE void
+    begin() {
+      for_each(callbacks_tuple,
+        [] (auto& cb) {
+          cb.begin();
+        }
+      );
+    }
+
+    // Called at the start of each subtile store iteration
+    CUTLASS_DEVICE void
+    begin_loop(int epi_m, int epi_n) {
+      for_each(callbacks_tuple,
+        [&] (auto& cb) {
+          cb.begin_loop(epi_m, epi_n);
+        }
+      );
+    }
+
+    // Called before the visit callback. Smem broadcasts are usually performed here.
+    // On entry, all producer loads for this subtile are completed and visible.
+    CUTLASS_DEVICE void
+    previsit(int epi_m, int epi_n, int load_iteration, bool is_producer_load_needed) {
+      for_each(callbacks_tuple,
+        [&] (auto& cb) {
+          cb.previsit(epi_m, epi_n, load_iteration, is_producer_load_needed);
+        }
+      );
+    }
+
+    // Performs the fused elementwise computation
+    template <typename ElementAccumulator, typename... ElementInputs, int FragmentSize>
+    CUTLASS_DEVICE auto // returns an Array
+    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n,
+          Array<ElementInputs, FragmentSize> const&... frg_inputs) // depends on the N-naryness of the op
+      = delete; // Must be implemented for each operation
+
+    // Called after the visit call. Smem reductions are usually performed here.
+    // reduction_buffer is an arbitrary smem tensor that can be used as workspace.
+    // It is each node's responsibility to assert that this buffer is sufficiently sized
+    // and to ensure that the buffer is no longer needed when the callback exits,
+    // i.e. results are synchronized and no longer live in the reduction buffer.
+    //
+    // visit_results is an rmem tensor that contains the results of visit()
+    // for the entire current epilogue subtile
+    template <class STensor, class SyncFn, class VTensor>
+    CUTLASS_DEVICE void
+    reduce(STensor&& reduction_buffer, SyncFn const& sync_fn, int epi_m, int epi_n, bool is_last_iteration, VTensor visit_results) {
+      for_each(callbacks_tuple,
+        [&] (auto& cb) {
+          cb.reduce(reduction_buffer, sync_fn, epi_m, epi_n, is_last_iteration, visit_results);
+        }
+      );
+    }
+
+    // Called after the reduce call, before the smem async fence. Smem stores are usually performed here.
+    // On exit, all smem stores for TMA must have been issued.
+    CUTLASS_DEVICE void
+    postreduce(int epi_m, int epi_n, int store_iteration, bool issue_smem_store) {
+      for_each(callbacks_tuple,
+        [&] (auto& cb) {
+          cb.postreduce(epi_m, epi_n, store_iteration, issue_smem_store);
+        }
+      );
+    }
+
+    // Called after the smem async fence, before the TMA store commit. Aux stores are usually performed here.
+    // On exit, all TMA stores for this subtile must have been issued.
+    // Because of the TMA store delay optimization, this entry point must ONLY be used for TMA stores;
+    // other gmem stores can be placed in the reduce or postreduce entry points.
+    CUTLASS_DEVICE void
+    tma_store(int epi_m, int epi_n, int store_iteration, bool issue_tma_store) {
+      for_each(callbacks_tuple,
+        [&] (auto& cb) {
+          cb.tma_store(epi_m, epi_n, store_iteration, issue_tma_store);
+        }
+      );
+    }
+
+    // Called at the end of each subtile store iteration
+    CUTLASS_DEVICE void
+    end_loop(int epi_m, int epi_n) {
+      for_each(callbacks_tuple,
+        [&] (auto& cb) {
+          cb.end_loop(epi_m, epi_n);
+        }
+      );
+    }
+
+    // Called on exit of the subtile store loop. Gmem reductions are usually performed here.
+    CUTLASS_DEVICE void
+    end() {
+      for_each(callbacks_tuple,
+        [] (auto& cb) {
+          cb.end();
+        }
+      );
+    }
+  };
+
+  // Consumer store callbacks factory: gathers each op's consumer store callbacks into one ConsumerStoreCallbacks
+  // All operations must redefine this
+  template <
+    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
+    class... Args
+  >
+  CUTLASS_DEVICE auto
+  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
+    return transform_apply(ops,
+      [&] (auto& node) {
+        return node.template get_consumer_store_callbacks<ReferenceSrc>(args);
+      },
+      [] (auto&&... per_op) {
+        auto packed = cute::make_tuple(per_op...);
+        return ConsumerStoreCallbacks<decltype(packed)>{packed};
+      }
+    );
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Convenience aliases for the callback types of a visitor without ops (empty callbacks tuple)
+using EmptyProducerLoadCallbacks = Sm90VisitorImpl<>::ProducerLoadCallbacks<cute::tuple<>>;
+using EmptyConsumerStoreCallbacks = Sm90VisitorImpl<>::ConsumerStoreCallbacks<cute::tuple<>>;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace detail
+
+using namespace detail;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Tree visitor: every child op is visited first and the resulting fragments
+// are then passed as the inputs of the node op's visit
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <class NodeOp, class... ChildOps>
+struct Sm90TreeVisitor : Sm90VisitorImpl<ChildOps..., NodeOp> {
+
+  using Impl = Sm90VisitorImpl<ChildOps..., NodeOp>; // children are stored first, the node op last
+  using Params = typename Impl::Params;
+  using SharedStorage = typename Impl::SharedStorage;
+
+  CUTLASS_HOST_DEVICE
+  Sm90TreeVisitor() {}
+
+  CUTLASS_HOST_DEVICE
+  Sm90TreeVisitor(
+      Params const& params,
+      SharedStorage const& shared_storage)
+    : Impl(params, shared_storage) {}
+
+  template<class CallbacksImpl>
+  struct ConsumerStoreCallbacks : CallbacksImpl {
+    CUTLASS_DEVICE
+    ConsumerStoreCallbacks(CallbacksImpl&& base_callbacks)
+      : CallbacksImpl(cute::forward<CallbacksImpl>(base_callbacks)) {}
+
+    using CallbacksImpl::callbacks_tuple;
+
+    template <typename ElementAccumulator, int FragmentSize>
+    CUTLASS_DEVICE auto
+    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
+      constexpr int NodeIdx = sizeof...(ChildOps); // the node op's callbacks come after the children's
+      return cute::detail::tapply(callbacks_tuple,
+        [&] (auto& child) {
+          return child.visit(frg_acc, epi_v, epi_m, epi_n); // child ops must be nullary (e.g. loads, trees)
+        },
+        [&] (auto&&... child_frgs) {
+          return get<NodeIdx>(callbacks_tuple).visit(frg_acc, epi_v, epi_m, epi_n, child_frgs...);
+        },
+        make_seq<NodeIdx>{} // restrict the transform to the child ops, apply is for the node op
+      );
+    }
+  };
+
+  template <
+    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
+    class... Args
+  >
+  CUTLASS_DEVICE auto
+  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
+    auto impl_callbacks = Impl::
+      template get_consumer_store_callbacks<ReferenceSrc>(args);
+    return ConsumerStoreCallbacks<decltype(impl_callbacks)>(std::move(impl_callbacks));
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// DAG visitors
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Most DAG fusions can be expressed as a set of output trees that share a common input tree.
+// The common input is evaluated first and its result is handed to the output trees as their acc fragment.
+template <class InputTree, class OutputTree, class... AuxOutTrees>
+struct Sm90SplitTreeVisitor : Sm90VisitorImpl<InputTree, AuxOutTrees..., OutputTree> {
+
+  using Sm90VisitorImpl<InputTree, AuxOutTrees..., OutputTree>::Sm90VisitorImpl;
+
+  template<class CallbacksImpl>
+  struct ConsumerStoreCallbacks : CallbacksImpl {
+    CUTLASS_DEVICE
+    ConsumerStoreCallbacks(CallbacksImpl&& base_callbacks)
+      : CallbacksImpl(cute::forward<CallbacksImpl>(base_callbacks)) {}
+
+    using CallbacksImpl::callbacks_tuple;
+
+    template <typename ElementAccumulator, int FragmentSize>
+    CUTLASS_DEVICE auto
+    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
+      Array frg_shared = get<0>(callbacks_tuple).visit(frg_acc, epi_v, epi_m, epi_n); // input tree result
+      // Aux output trees sit at indices 1..NumAux; their visit results are discarded
+      constexpr int NumAux = sizeof...(AuxOutTrees);
+      cute::for_each(make_seq<NumAux>{}, // restrict the sequence to aux out trees
+        [&] (auto AuxIdx) {
+          get<AuxIdx+1>(callbacks_tuple).visit(frg_shared, epi_v, epi_m, epi_n);
+        }
+      );
+      // The main output tree is stored last and provides the returned fragment
+      return get<NumAux+1>(callbacks_tuple).visit(frg_shared, epi_v, epi_m, epi_n);
+    }
+  };
+
+  template <
+    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
+    class... Args
+  >
+  CUTLASS_DEVICE auto
+  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
+    auto impl_callbacks = Sm90VisitorImpl<InputTree, AuxOutTrees..., OutputTree>::
+      template get_consumer_store_callbacks<ReferenceSrc>(args);
+    return ConsumerStoreCallbacks<decltype(impl_callbacks)>(std::move(impl_callbacks));
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<
+  // Visits a DAG of ops in topological order. Deducing the output type of every node is tricky, so the results
+  // of all ops but the last are converted to ElementCompute; if multiple compute types are needed, split into subgraphs by type
+  class ElementCompute,
+  class EdgeTuple, // tuple of int_sequence, each sequence is the children indices (indexed by topological order) for each node
+  class... Ops     // in topological order, last op is the output. EdgeTuple must match this order
+>
+struct Sm90TopologicalVisitor : Sm90VisitorImpl<Ops...> {
+  static_assert(is_static_v<EdgeTuple>); // the edges must be known at compile time
+  static_assert(cute::rank(EdgeTuple{}) == sizeof...(Ops)); // exactly one edge sequence per op
+  static_assert(sizeof...(Ops) > 1); // at least one op besides the output op
+
+  using Sm90VisitorImpl<Ops...>::Sm90VisitorImpl;
+
+  template<class CallbacksImpl>
+  struct ConsumerStoreCallbacks : CallbacksImpl {
+    CUTLASS_DEVICE
+    ConsumerStoreCallbacks(CallbacksImpl&& impl)
+      : CallbacksImpl(cute::forward<CallbacksImpl>(impl)) {}
+
+    using CallbacksImpl::callbacks_tuple;
+
+    template <typename ElementAccumulator, int FragmentSize>
+    CUTLASS_DEVICE auto
+    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
+      constexpr int Rm1 = sizeof...(Ops) - 1; // index of the last (output) op
+      auto frg_compute_tuple = cute::repeat<Rm1>(Array<ElementCompute, FragmentSize>{}); // one fragment per non-output op
+      // Each of the first R-1 ops writes its converted result into its slot so that later ops can take it as input
+      return cute::detail::tapply(EdgeTuple{}, callbacks_tuple, frg_compute_tuple,
+        // Visit the first R-1 ops in topological order
+        [&] (auto&& edge_seq, auto& callbacks, auto& frg_compute) {
+          frg_compute = cute::detail::apply(frg_compute_tuple,
+            // Compute the current op with children inputs
+            [&] (auto const&... frg_inputs) {
+              auto frg_output = callbacks.visit(frg_acc, epi_v, epi_m, epi_n, frg_inputs...);
+              using ElementOutput = typename decltype(frg_output)::Element;
+              using ConvertOutput = NumericArrayConverter<ElementCompute, ElementOutput, FragmentSize>;
+              ConvertOutput convert_output{};
+
+              return convert_output(frg_output); // stored as ElementCompute regardless of the op's own output type
+            },
+            // Get inputs in the sequence given by the children indices of the current op
+            edge_seq
+          );
+          return frg_compute; // unused
+        },
+        // Visit the last op
+        [&] (auto const&...ops) {
+          return cute::detail::apply(frg_compute_tuple,
+            // Compute the last op with children inputs
+            [&] (auto const&... frg_inputs) {
+              return get<Rm1>(callbacks_tuple).visit(frg_acc, epi_v, epi_m, epi_n, frg_inputs...); // returned without conversion
+            },
+            // Get inputs in the sequence given by the children indices of the last op
+            get<Rm1>(EdgeTuple{})
+          );
+        },
+        // Transform to visit R-1 ops, apply to visit last op
+        make_seq<Rm1>{}
+      );
+    }
+  };
+
+  template <
+    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
+    class... Args
+  >
+  CUTLASS_DEVICE auto
+  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
+    auto callbacks_tuple = Sm90VisitorImpl<Ops...>::
+      template get_consumer_store_callbacks<ReferenceSrc>(args); // base callbacks, wrapped below to add visit()
+    return ConsumerStoreCallbacks<decltype(callbacks_tuple)>(std::move(callbacks_tuple));
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Base specializations so we can have standard layout params and simple aggregate initializers
+namespace detail {
+
+template <class Op0>
+struct Sm90VisitorImplBase<Op0> {
+  // Single-op specialization: Arguments and Params are plain aggregates holding one member for the op.
+  // SharedStorage stays a tuple because empty structs have 1B alignment;
+  // tuples use multiple inheritance, which avoids this problem
+  using SharedStorage = tuple<
+    typename Op0::SharedStorage
+  >;
+
+  struct Arguments {
+    typename Op0::Arguments op_0;
+  };
+
+  struct Params {
+    typename Op0::Params op_0;
+  };
+
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    return Params{ // the single op receives the workspace pointer unchanged
+      Op0::to_underlying_arguments(problem_shape, args.op_0, workspace)
+    };
+  }
+
+  template <class ProblemShape>
+  static bool
+  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
+    return Op0::can_implement(problem_shape, args.op_0);
+  }
+
+  template <class ProblemShape>
+  static size_t
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
+    size_t total_bytes = 0;
+    total_bytes += Op0::get_workspace_size(problem_shape, args.op_0);
+    total_bytes = round_nearest(total_bytes, MinWorkspaceAlignment);
+
+    return total_bytes;
+  }
+
+  template <class ProblemShape>
+  static cutlass::Status
+  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    Status result = Status::kSuccess;
+    uint8_t* base_ptr = reinterpret_cast<uint8_t*>(workspace);
+    size_t offset = 0;
+
+    result = Op0::initialize_workspace(problem_shape, args.op_0, base_ptr + offset, stream, cuda_adapter);
+    offset += Op0::get_workspace_size(problem_shape, args.op_0);
+    offset = round_nearest(offset, MinWorkspaceAlignment);
+    if (result != Status::kSuccess) {
+      return result;
+    }
+
+    return result;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Sm90VisitorImplBase() {}
+
+  CUTLASS_HOST_DEVICE
+  Sm90VisitorImplBase(Params const& params, SharedStorage const& shared_storage)
+    : ops({
+        Op0(params.op_0, get<0>(shared_storage))
+      }) {}
+
+  tuple<Op0> ops;
+};
+
+template <class Op0, class Op1>
+struct Sm90VisitorImplBase<Op0, Op1> {
+  // Two-op specialization: Arguments and Params are plain aggregates holding one member per op
+  using SharedStorage = tuple<
+    typename Op0::SharedStorage,
+    typename Op1::SharedStorage
+  >;
+
+  struct Arguments {
+    typename Op0::Arguments op_0;
+    typename Op1::Arguments op_1;
+  };
+
+  struct Params {
+    typename Op0::Params op_0;
+    typename Op1::Params op_1;
+  };
+
+  // Builds Params. Each op's workspace starts at the same MinWorkspaceAlignment-rounded offset as in initialize_workspace
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    size_t op_0_workspace_size = Op0::get_workspace_size(problem_shape, args.op_0);
+    uint8_t* op_0_workspace = reinterpret_cast<uint8_t*>(workspace);
+    uint8_t* op_1_workspace = op_0_workspace + round_nearest(op_0_workspace_size, MinWorkspaceAlignment);
+    return Params{
+      Op0::to_underlying_arguments(problem_shape, args.op_0, op_0_workspace),
+      Op1::to_underlying_arguments(problem_shape, args.op_1, op_1_workspace)
+    };
+  }
+
+  // Returns true only if every op can implement the problem
+  template <class ProblemShape>
+  static bool
+  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
+    return Op0::can_implement(problem_shape, args.op_0) &&
+           Op1::can_implement(problem_shape, args.op_1);
+  }
+
+  // Returns the total workspace size; the running total is rounded to MinWorkspaceAlignment after each op
+  template <class ProblemShape>
+  static size_t
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
+    size_t workspace_size = 0;
+    workspace_size += Op0::get_workspace_size(problem_shape, args.op_0);
+    workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment);
+
+    workspace_size += Op1::get_workspace_size(problem_shape, args.op_1);
+    workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment);
+
+    return workspace_size;
+  }
+
+  // Initializes each op's workspace slice in order; stops at and returns the first non-success status
+  template <class ProblemShape>
+  static cutlass::Status
+  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    Status status = Status::kSuccess;
+    uint8_t* workspace_ptr = reinterpret_cast<uint8_t*>(workspace);
+    size_t workspace_offset = 0;
+
+    status = Op0::initialize_workspace(problem_shape, args.op_0, workspace_ptr + workspace_offset, stream, cuda_adapter);
+    workspace_offset += Op0::get_workspace_size(problem_shape, args.op_0);
+    workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment);
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    status = Op1::initialize_workspace(problem_shape, args.op_1, workspace_ptr + workspace_offset, stream, cuda_adapter);
+    workspace_offset += Op1::get_workspace_size(problem_shape, args.op_1);
+    workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment);
+    return status; // last op: its status is the overall result
+  }
+
+  CUTLASS_HOST_DEVICE
+  Sm90VisitorImplBase() {}
+
+  CUTLASS_HOST_DEVICE
+  Sm90VisitorImplBase(Params const& params, SharedStorage const& shared_storage)
+    : ops({
+        Op0(params.op_0, get<0>(shared_storage)),
+        Op1(params.op_1, get<1>(shared_storage))
+      }) {}
+
+  tuple<Op0, Op1> ops;
+};
+
+template <class Op0, class Op1, class Op2>
+struct Sm90VisitorImplBase<Op0, Op1, Op2> {
+  // Three-op specialization: Arguments and Params are plain aggregates holding one member per op
+  using SharedStorage = tuple<
+    typename Op0::SharedStorage,
+    typename Op1::SharedStorage,
+    typename Op2::SharedStorage
+  >;
+
+  struct Arguments {
+    typename Op0::Arguments op_0;
+    typename Op1::Arguments op_1;
+    typename Op2::Arguments op_2;
+  };
+
+  struct Params {
+    typename Op0::Params op_0;
+    typename Op1::Params op_1;
+    typename Op2::Params op_2;
+  };
+
+  // Builds Params. Each op's workspace starts at the same MinWorkspaceAlignment-rounded offset as in initialize_workspace
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    size_t op_0_workspace_size = Op0::get_workspace_size(problem_shape, args.op_0);
+    size_t op_1_workspace_size = Op1::get_workspace_size(problem_shape, args.op_1);
+    uint8_t* op_0_workspace = reinterpret_cast<uint8_t*>(workspace);
+    uint8_t* op_1_workspace = op_0_workspace + round_nearest(op_0_workspace_size, MinWorkspaceAlignment);
+    uint8_t* op_2_workspace = op_1_workspace + round_nearest(op_1_workspace_size, MinWorkspaceAlignment);
+    return Params{
+      Op0::to_underlying_arguments(problem_shape, args.op_0, op_0_workspace),
+      Op1::to_underlying_arguments(problem_shape, args.op_1, op_1_workspace),
+      Op2::to_underlying_arguments(problem_shape, args.op_2, op_2_workspace)
+    };
+  }
+
+  // Returns true only if every op can implement the problem
+  template <class ProblemShape>
+  static bool
+  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
+    return Op0::can_implement(problem_shape, args.op_0) &&
+           Op1::can_implement(problem_shape, args.op_1) &&
+           Op2::can_implement(problem_shape, args.op_2);
+  }
+
+  // Returns the total workspace size; the running total is rounded to MinWorkspaceAlignment after each op
+  template <class ProblemShape>
+  static size_t
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
+    size_t workspace_size = 0;
+    workspace_size += Op0::get_workspace_size(problem_shape, args.op_0);
+    workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment);
+
+    workspace_size += Op1::get_workspace_size(problem_shape, args.op_1);
+    workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment);
+
+    workspace_size += Op2::get_workspace_size(problem_shape, args.op_2);
+    workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment);
+
+    return workspace_size;
+  }
+
+  // Initializes each op's workspace slice in order; stops at and returns the first non-success status
+  template <class ProblemShape>
+  static cutlass::Status
+  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    Status status = Status::kSuccess;
+    uint8_t* workspace_ptr = reinterpret_cast<uint8_t*>(workspace);
+    size_t workspace_offset = 0;
+
+    status = Op0::initialize_workspace(problem_shape, args.op_0, workspace_ptr + workspace_offset, stream, cuda_adapter);
+    workspace_offset += Op0::get_workspace_size(problem_shape, args.op_0);
+    workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment);
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    status = Op1::initialize_workspace(problem_shape, args.op_1, workspace_ptr + workspace_offset, stream, cuda_adapter);
+    workspace_offset += Op1::get_workspace_size(problem_shape, args.op_1);
+    workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment);
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    status = Op2::initialize_workspace(problem_shape, args.op_2, workspace_ptr + workspace_offset, stream, cuda_adapter);
+    workspace_offset += Op2::get_workspace_size(problem_shape, args.op_2);
+    workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment);
+    return status; // last op: its status is the overall result
+  }
+
+  CUTLASS_HOST_DEVICE
+  Sm90VisitorImplBase() {}
+
+  CUTLASS_HOST_DEVICE
+  Sm90VisitorImplBase(Params const& params, SharedStorage const& shared_storage)
+    : ops({
+        Op0(params.op_0, get<0>(shared_storage)),
+        Op1(params.op_1, get<1>(shared_storage)),
+        Op2(params.op_2, get<2>(shared_storage))
+      }) {}
+
+  tuple<Op0, Op1, Op2> ops;
+};
+
+template <class Op0, class Op1, class Op2, class Op3>
+struct Sm90VisitorImplBase<Op0, Op1, Op2, Op3> {
+  // Four-op specialization: Arguments and Params are plain aggregates holding one member per op
+  using SharedStorage = tuple<
+    typename Op0::SharedStorage,
+    typename Op1::SharedStorage,
+    typename Op2::SharedStorage,
+    typename Op3::SharedStorage
+  >;
+
+  struct Arguments {
+    typename Op0::Arguments op_0;
+    typename Op1::Arguments op_1;
+    typename Op2::Arguments op_2;
+    typename Op3::Arguments op_3;
+  };
+
+  struct Params {
+    typename Op0::Params op_0;
+    typename Op1::Params op_1;
+    typename Op2::Params op_2;
+    typename Op3::Params op_3;
+  };
+
+  // Builds Params. Each op's workspace starts at the same MinWorkspaceAlignment-rounded offset as in initialize_workspace
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    size_t op_0_workspace_size = Op0::get_workspace_size(problem_shape, args.op_0);
+    size_t op_1_workspace_size = Op1::get_workspace_size(problem_shape, args.op_1);
+    size_t op_2_workspace_size = Op2::get_workspace_size(problem_shape, args.op_2);
+    uint8_t* op_0_workspace = reinterpret_cast<uint8_t*>(workspace);
+    uint8_t* op_1_workspace = op_0_workspace + round_nearest(op_0_workspace_size, MinWorkspaceAlignment);
+    uint8_t* op_2_workspace = op_1_workspace + round_nearest(op_1_workspace_size, MinWorkspaceAlignment);
+    uint8_t* op_3_workspace = op_2_workspace + round_nearest(op_2_workspace_size, MinWorkspaceAlignment);
+    return Params{
+      Op0::to_underlying_arguments(problem_shape, args.op_0, op_0_workspace),
+      Op1::to_underlying_arguments(problem_shape, args.op_1, op_1_workspace),
+      Op2::to_underlying_arguments(problem_shape, args.op_2, op_2_workspace),
+      Op3::to_underlying_arguments(problem_shape, args.op_3, op_3_workspace)
+    };
+  }
+
+  // Returns true only if every op can implement the problem
+  template <class ProblemShape>
+  static bool
+  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
+    return Op0::can_implement(problem_shape, args.op_0) &&
+           Op1::can_implement(problem_shape, args.op_1) &&
+           Op2::can_implement(problem_shape, args.op_2) &&
+           Op3::can_implement(problem_shape, args.op_3);
+  }
+
+  // Returns the total workspace size; the running total is rounded to MinWorkspaceAlignment after each op
+  template <class ProblemShape>
+  static size_t
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
+    size_t workspace_size = 0;
+    workspace_size += Op0::get_workspace_size(problem_shape, args.op_0);
+    workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment);
+
+    workspace_size += Op1::get_workspace_size(problem_shape, args.op_1);
+    workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment);
+
+    workspace_size += Op2::get_workspace_size(problem_shape, args.op_2);
+    workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment);
+
+    workspace_size += Op3::get_workspace_size(problem_shape, args.op_3);
+    workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment);
+
+    return workspace_size;
+  }
+
+  // Initializes each op's workspace slice in order; stops at and returns the first non-success status
+  template <class ProblemShape>
+  static cutlass::Status
+  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    Status status = Status::kSuccess;
+    uint8_t* workspace_ptr = reinterpret_cast<uint8_t*>(workspace);
+    size_t workspace_offset = 0;
+
+    status = Op0::initialize_workspace(problem_shape, args.op_0, workspace_ptr + workspace_offset, stream, cuda_adapter);
+    workspace_offset += Op0::get_workspace_size(problem_shape, args.op_0);
+    workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment);
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    status = Op1::initialize_workspace(problem_shape, args.op_1, workspace_ptr + workspace_offset, stream, cuda_adapter);
+    workspace_offset += Op1::get_workspace_size(problem_shape, args.op_1);
+    workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment);
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    status = Op2::initialize_workspace(problem_shape, args.op_2, workspace_ptr + workspace_offset, stream, cuda_adapter);
+    workspace_offset += Op2::get_workspace_size(problem_shape, args.op_2);
+    workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment);
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    status = Op3::initialize_workspace(problem_shape, args.op_3, workspace_ptr + workspace_offset, stream, cuda_adapter);
+    workspace_offset += Op3::get_workspace_size(problem_shape, args.op_3);
+    workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment);
+    return status; // last op: its status is the overall result
+  }
+
+  CUTLASS_HOST_DEVICE
+  Sm90VisitorImplBase() {}
+
+  CUTLASS_HOST_DEVICE
+  Sm90VisitorImplBase(Params const& params, SharedStorage const& shared_storage)
+    : ops({
+        Op0(params.op_0, get<0>(shared_storage)),
+        Op1(params.op_1, get<1>(shared_storage)),
+        Op2(params.op_2, get<2>(shared_storage)),
+        Op3(params.op_3, get<3>(shared_storage))
+      }) {}
+
+  tuple<Op0, Op1, Op2, Op3> ops;
+};
+
+} // namespace detail
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::epilogue::fusion
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_topk_softmax.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_topk_softmax.hpp
new file mode 100755
index 000000000..53c0dce8b
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_topk_softmax.hpp
@@ -0,0 +1,759 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+  \brief Visitor tree Top-K + Softmax fusion operation for sm90 TMA warp-specialized epilogue
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/workspace.h"
+
+#include "cute/tensor.hpp"
+#include "sm90_visitor_tma_warpspecialized.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::epilogue::fusion {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Top-K + Softmax reduction across columns
+// Performs a reduction of top-K values across N, and finally performs a softmax on them,
+// and sets values not in the top-K to 0.
+//
+//   Assumptions:
+//     1. CTA_N >= N (single tile across N, the mode which is reduced)
+//     2. EPI_N >= N (single epilogue tile across N, because we can reduce and revisit one
+//        epilogue tile at a time.)
+//     3. Top-K value is either 2 or 4.
+//
+
+namespace detail {
+
+// Implementations for add to sorted list and merging sorted lists,
+// with fast paths for lists of size 2 and 4 (Top-2 and Top-4).
+// Generic implementations may result in greater register use and branching,
+// and should be avoided.
+// Fast paths for Top-2 and Top-4 are written in inline PTX directly.
+
+CUTLASS_DEVICE
+Array<float, 2> top_2_reduce_scalar(Array<float, 2> a, float scalar) {  // inserts scalar into top-2 list a (a0 first)
+  Array<float, 2> out;
+  asm volatile(
+      "{\n"
+      "  .reg .f32 mx;\n"                   // max(a1, scalar)
+      "  .reg .pred p;\n"                   // a0 > scalar
+      "  max.f32 mx, %3, %4;\n"             // max(a1, scalar)
+      "  setp.gtu.f32 p, %2, %4;\n"         // a0 > scalar (gtu is also true for unordered operands)
+      "  selp.f32 %1, mx, %2, p;\n"         // out1 = a0 > scalar ? max(a1, scalar) : a0
+      "  selp.f32 %0, %2, %4, p;\n"         // out0 = a0 > scalar ? a0 : scalar
+      "}\n" : "=f"(out[0]), "=f"(out[1]) : "f"(a[0]), "f"(a[1]), "f"(scalar));
+  return out;
+}
+
+CUTLASS_DEVICE
+Array<float, 2> top_2_reduce(Array<float, 2> a, Array<float, 2> b) {  // merges two top-2 lists (a0, b0 first) into one
+  Array<float, 2> out;
+  asm volatile(
+      "{\n"
+      "  .reg .v2 .f32 mx;\n"               // mx.x = max(a1, b0), mx.y = max(a0, b1)
+      "  .reg .pred p;\n"                   // a0 > b0
+      "  max.f32 mx.x, %3, %4;\n"           // max(a1, b0)
+      "  max.f32 mx.y, %2, %5;\n"           // max(a0, b1)
+      "  setp.gtu.f32 p, %2, %4;\n"         // a0 > b0 (gtu is also true for unordered operands)
+      "  selp.f32 %1, mx.x, mx.y, p;\n"     // out1 = a0 > b0 ? max(a1, b0) : max(a0, b1)
+      "  selp.f32 %0, %2, %4, p;\n"         // out0 = a0 > b0 ? a0 : b0
+      "}\n" : "=f"(out[0]), "=f"(out[1]) : 
+      "f"(a[0]), "f"(a[1]), "f"(b[0]), "f"(b[1]));
+  return out;
+}
+
+CUTLASS_DEVICE
+Array<float, 4> top_4_reduce_scalar(Array<float, 4> a, float scalar) {  // inserts scalar (b below) into top-4 list a
+  Array<float, 4> out;
+  asm volatile(
+      "{\n"
+      "  .reg .f32 mx;\n"                   // max(a3, b)
+      "  .reg .pred p0;\n"                  // a0 > b
+      "  .reg .pred p1;\n"                  // a1 > b
+      "  .reg .pred p2;\n"                  // a2 > b
+      "  max.f32 mx, %7, %8;\n"             // max(a3, b)
+      "  setp.gtu.f32 p0, %4, %8;\n"        // a0 > b
+      "  setp.gtu.f32 p1, %5, %8;\n"        // a1 > b
+      "  setp.gtu.f32 p2, %6, %8;\n"        // a2 > b
+      "  selp.f32 %3, mx, %6, p2;\n"        // out3 = a2 > b ? max(a3, b) : a2
+      "  selp.f32 %2, %6, %8, p2;\n"        // out2 = a2 > b ? a2 : b   (intermediate: larger of a2, b)
+      "  selp.f32 %2, %2, %5, p1;\n"        // out2 = a1 > b ? out2 : a1
+      "  selp.f32 %1, %5, %8, p1;\n"        // out1 = a1 > b ? a1 : b   (intermediate: larger of a1, b)
+      "  selp.f32 %1, %1, %4, p0;\n"        // out1 = a0 > b ? out1 : a0
+      "  selp.f32 %0, %4, %8, p0;\n"        // out0 = a0 > b ? a0 : b
+      "}\n" : 
+      "=f"(out[0]), "=f"(out[1]), "=f"(out[2]), "=f"(out[3]) : 
+      "f"(a[0]), "f"(a[1]), "f"(a[2]), "f"(a[3]), "f"(scalar));
+  return out;
+}
+
+CUTLASS_DEVICE
+Array<float, 4> top_4_reduce(Array<float, 4> a, Array<float, 4> b) {  // merges two top-4 lists (a0, b0 first) into one
+  Array<float, 4> out;
+  asm volatile(
+      "{\n"
+      "  .reg .f32 mxa0b1;\n"                          // max(a0, b1)
+      "  .reg .f32 mxa1b0;\n"                          // max(a1, b0)
+
+      "  .reg .f32 mxa2b0;\n"                          // max(a2, b0)
+      "  .reg .f32 mxa1b1;\n"                          // max(a1, b1)
+      "  .reg .f32 mxa0b2;\n"                          // max(a0, b2)
+
+      "  .reg .f32 mxa1b2;\n"                          // max(a1, b2)
+      "  .reg .f32 mxa2b1;\n"                          // max(a2, b1)
+      "  max.f32 mxa1b2, %5, %10;\n"
+      "  max.f32 mxa2b1, %6, %9;\n"
+
+      "  .reg .f32 mxa3b0;\n"                          // max(a3, b0)
+      "  .reg .f32 mxa0b3;\n"                          // max(a0, b3)
+      "  max.f32 mxa3b0, %7, %8;\n"
+      "  max.f32 mxa0b3, %4, %11;\n"
+
+      "  .reg .pred pa0b0;\n"                          // a0 > b0
+      "  .reg .pred pa1b0;\n"                          // a1 > b0
+      "  .reg .pred pa2b0;\n"                          // a2 > b0
+      "  .reg .pred pa0b1;\n"                          // a0 > b1
+      "  .reg .pred pa1b1;\n"                          // a1 > b1
+      "  .reg .pred pa0b2;\n"                          // a0 > b2
+      "  .reg .pred pb2a0;\n"                          // !(a0 > b2), i.e. b2 >= a0
+      "  .reg .pred pb1a0;\n"                          // !(a0 > b1), i.e. b1 >= a0
+
+      "  setp.gtu.f32 pa0b0, %4, %8;\n"                // a0 > b0
+      "  setp.gtu.f32 pa1b0, %5, %8;\n"                // a1 > b0
+      "  setp.gtu.f32 pa2b0, %6, %8;\n"                // a2 > b0
+      "  setp.gtu.f32 pa0b1, %4, %9;\n"                // a0 > b1
+      "  setp.gtu.f32 pa1b1, %5, %9;\n"                // a1 > b1
+      "  setp.gtu.f32 pa0b2, %4, %10;\n"               // a0 > b2
+
+      "  not.pred pb2a0, pa0b2;\n"
+      "  not.pred pb1a0, pa0b1;\n"
+
+      "  selp.f32 mxa1b0, %5, %8, pa1b0;\n"            // max(a1, b0)
+      "  selp.f32 mxa0b1, %4, %9, pa0b1;\n"            // max(a0, b1)
+
+      "  selp.f32 mxa1b1, %5, %9, pa1b1;\n"            // max(a1, b1)
+      "  selp.f32 mxa2b0, %6, %8, pa2b0;\n"            // max(a2, b0)
+      "  selp.f32 mxa0b2, %4, %10, pa0b2;\n"           // max(a0, b2)
+
+      // out0
+      "  selp.f32 %0, %4, %8, pa0b0;\n"                // out0 = a0 > b0 ? a0 : b0
+
+      // out1
+      "  selp.f32 %1, mxa1b0, mxa0b1, pa0b0;\n"        // out1 = a0 > b0 ? max(a1, b0) : max(a0, b1)
+
+      // out2
+      "  mov.f32 %2, mxa1b1;\n"                        // out2 = max(a1, b1) ** most likely case
+      "  selp.f32 %2, mxa2b0, %2, pa1b0;\n"            // a0 > a1 > b0: out2 = max(a2, b0)
+      "  selp.f32 %2, mxa0b2, %2, pb1a0;\n"            // b0 > b1 > a0: out2 = max(a0, b2)
+
+      // out3
+      "  mov.f32 %3, mxa1b2;\n"                        // out3 = max(a1, b2) ** one of the most likely cases
+      "  selp.f32 %3, mxa2b1, %3, pa1b1;\n"            // out3 = a1 > b1 ? max(a2, b1) ** second most likely case
+      "  selp.f32 %3, mxa3b0, %3, pa2b0;\n"            // a0 > a1 > a2 > b0: out3 = max(a3, b0)
+      "  selp.f32 %3, mxa0b3, %3, pb2a0;\n"            // b0 > b1 > b2 > a0: out3 = max(a0, b3)
+      "}\n" : 
+      "=f"(out[0]), "=f"(out[1]), "=f"(out[2]), "=f"(out[3]) : 
+      "f"(a[0]), "f"(a[1]), "f"(a[2]), "f"(a[3]),
+      "f"(b[0]), "f"(b[1]), "f"(b[2]), "f"(b[3]));
+  return out;
+}
+
+// Inserts `b` into `a`, which must be sorted in descending order (a[0] is the largest
+// element in a[]); the smallest of the N + 1 candidates is dropped so `a` keeps N values.
+template <typename Element, int N>
+CUTLASS_DEVICE
+void add_element_to_desc_sorted_array(cutlass::Array<Element, N>& a, Element b) {
+  if constexpr (N == 2 && is_same_v<Element, float>) {
+    a = top_2_reduce_scalar(a, b);
+  }
+  else if constexpr (N == 4 && is_same_v<Element, float>) {
+    a = top_4_reduce_scalar(a, b);
+  }
+  else {
+    // Slower generic path: branches, and can cause register spill.
+    // Single pass that carries the value still to be placed in `b`: once `b` lands in a slot,
+    // the displaced (smaller or equal) element is carried on to the following slots, so the new
+    // value is inserted exactly once. Whatever is carried past the last slot is discarded.
+    CUTLASS_PRAGMA_UNROLL
+    for (int k = 0; k < N; ++k) {
+      if (a[k] <= b) {
+        Element displaced = a[k];
+        a[k] = b;
+        b = displaced;
+      }
+    }
+  }
+}
+
+// Merges `b` into `a`, keeping the N largest values of the two. Both arrays must be sorted in
+// descending order (a[0] and b[0] are the largest elements in a[] and b[].)
+template <typename Element, int N>
+CUTLASS_DEVICE
+void merge_desc_sorted_arrays(cutlass::Array<Element, N>& a, const cutlass::Array<Element, N>& b) {
+  if constexpr (N == 2 && is_same_v<Element, float>) {
+    a = top_2_reduce(a, b);
+  }
+  else if constexpr (N == 4 && is_same_v<Element, float>) {
+    a = top_4_reduce(a, b);
+  }
+  else {
+    // Generic fallback: branches, is slower, and can cause register spill.
+    int next_b = 0;  // index of the first element of b[] not yet merged into a[]
+    CUTLASS_PRAGMA_UNROLL
+    for (int slot = 0; slot < N; ++slot) {
+      if (a[slot] <= b[next_b]) {
+        // Make room at `slot` by moving the tail of a[] one position down.
+        CUTLASS_PRAGMA_UNROLL
+        for (int dst = N - 1; dst > slot; --dst) {
+          a[dst] = a[dst-1];
+        }
+        a[slot] = b[next_b];
+        ++next_b;
+      }
+    }
+  }
+}
+
+// Returns the logsumexp of the elements of `a`.
+// Assumption: `a` is sorted in descending order (a[0] is the largest element in a[].)
+template <typename Element, int N>
+CUTLASS_DEVICE
+Element topk_logsumexp(cutlass::Array<Element, N> a) {
+  // With x_m the maximum of the set of `x_i`s:
+  //   logsumexp(x) = log(sum_i exp(x_i)) = x_m + log(sum_i exp(x_i - x_m))
+  //                = x_m + log(1 + sum_{i != m} exp(x_i - x_m))
+  // The i == m term is exp(0) = 1, so the sum starts at 1 and we do one less `exp`.
+  Element exp_sum = Element(1.0);
+  CUTLASS_PRAGMA_UNROLL
+  for (int idx = 1; idx < N; ++idx) {
+    exp_sum += fast_exp(a[idx] - a[0]);
+  }
+  return a[0] + fast_log(exp_sum);
+}
+
+CUTLASS_DEVICE
+float fast_masked_softmax(float value, float minimum, float logsumexp) {  // value >= minimum ? ~exp(value - logsumexp) : 0
+  float new_value;
+  asm volatile(
+      "{\n"
+      "  .reg .pred p0;\n"
+      // value >= minimum (geu is also true for unordered operands)
+      "  setp.geu.f32 p0, %1, %2;\n"
+
+      "  .reg .f32 x_lse;\n"
+      "  .reg .f32 %%f<11>;\n"
+      "  .reg .b32 %%r<3>;\n"
+
+      // x_lse = value - logsumexp
+      "  sub.rn.f32  x_lse, %1, %3;\n"
+
+      // exp(x_lse)
+      // The following is derived from a ptx dump of expf.
+      // exp requires a base conversion from exp2.
+      "  fma.rn.f32 %%f1, x_lse, 0f3BBB989D, 0f3F000000;\n"
+      "  cvt.sat.f32.f32 %%f2, %%f1;\n"
+      "  fma.rm.f32 %%f3, %%f2, 0f437C0000, 0f4B400001;\n"
+      "  add.f32 %%f4, %%f3, 0fCB40007F;\n"
+      "  neg.f32 %%f5, %%f4;\n"
+      "  fma.rn.f32 %%f6, x_lse, 0f3FB8AA3B, %%f5;\n"
+      "  fma.rn.f32 %%f7, x_lse, 0f32A57060, %%f6;\n"
+      "  mov.b32 %%r1, %%f3;\n"
+      "  shl.b32 %%r2, %%r1, 23;\n"
+      "  mov.b32 %%f8, %%r2;\n"
+      "  ex2.approx.ftz.f32 %%f9, %%f7;\n"
+      "  mul.f32 %%f10, %%f9, %%f8;\n"
+
+      // Mask or softmax: keep the exponential when p0 holds, otherwise write 0
+      "  selp.f32 %0, %%f10, 0f00000000, p0;\n"
+      "}\n" : "=f"(new_value) : "f"(value), "f"(minimum), "f"(logsumexp));
+  return new_value;
+}
+
+template <typename Element>
+CUTLASS_DEVICE
+Element masked_softmax(Element value, Element minimum, Element logsumexp) {
+  if constexpr (not is_same_v<Element, float>) {
+    // Generic path: anything below `minimum` is masked to 0, the rest become exp(value - logsumexp).
+    return value < minimum ? Element(0.0) : fast_exp(value - logsumexp);
+  }
+  else {
+    // FP32 path: inline PTX implementation, which significantly reduces register requirements.
+    return fast_masked_softmax(value, minimum, logsumexp);
+  }
+}
+
+} // namespace detail
+
+template <
+  int TopK,
+  int FragmentSize,
+  class CtaTileShapeMNK,
+  class EpilogueTile,
+  class ElementOutput,
+  class ElementCompute,
+  FloatRoundStyle RoundStyle,
+  int Alignment = 128 / sizeof_bits_v<ElementOutput>,
+  bool UseButterflyReduce = true
+>
+struct Sm90TopKSoftmaxColReduction {
+private:
+  static_assert(is_same_v<ElementCompute, float>, "Fused Top-K + Softmax reduction requires FP32 accumulation.");
+  static_assert(TopK == 2 || TopK == 4, "Fused Top-K + Softmax reduction only supports K=2 and K=4.");
+  static_assert(Alignment * sizeof_bits_v<ElementOutput> % 128 == 0, "sub-16B alignment not supported yet");
+
+  // Reduction tensors
+  //   We have two tensors for this EVT node: a reduction tensor and a tensor holding
+  //   final reduction values (tCrSoftmax). The reason for this is that Top-K and Softmax
+  //   require different reductions, but those luckily overlap. Top-K obviously needs at least
+  //   two values (K >= 2), and softmax needs one value: logsumexp. Logsumexp is simply the log
+  //   of sum of exponents over the set, and is equivalent to m + log(sum(exp(x_i - m))), where m is
+  //   the maximum of all x_i elements. Since safe softmax for any element x_i is computed as
+  //   softmax(x_i) = exp(x_i - m) / sum_j(exp(x_j - m))
+  //   we can track logsumexp instead of tracking two variables (sum of exps and the max).
+  //   In addition, subtracting logsumexp from any element and taking its exp is equivalent to
+  //   computing its softmax.
+  //   
+  //   The overlap between softmax and top-K is that we don't need to reduce logsumexp along the
+  //   way at all, because any element not in the top-K is going to be masked out and set to 0.
+  //   Therefore, we only reduce the top-K elements, and when done, compute their logsumexp and
+  //   keep it, and the smallest element in the top-K for masking out non-top-K elements.
+  //
+  //   This means that our final reduction result will always be 2 elements, regardless of the value
+  //   of K: minimum of top-K, and logsumexp.
+  //
+  //   For each reduction tensor, we define a new struct for readability.
+
+  struct ReductionResult {
+    ElementCompute min_;        // smallest value kept in the top-K
+    ElementCompute logsumexp_;  // logsumexp over the top-K values
+
+    CUTLASS_DEVICE
+    ReductionResult() { }
+
+    CUTLASS_DEVICE
+    ReductionResult(ElementCompute min, ElementCompute logsumexp):
+      min_(min), logsumexp_(logsumexp) { }  // listed in declaration order, which is the order members are initialized in
+
+    // Warp shuffle broadcast: adopt the value shuffled up by `delta` lanes; keep the current one when lane_id - delta < 0
+    CUTLASS_DEVICE
+    void shuffle_up_sync(uint32_t delta, int lane_id) {
+      static_assert(sizeof(ReductionResult) == sizeof(uint64_t));  // both fields travel in a single 64-bit shuffle
+      uint64_t r = reinterpret_cast<uint64_t&>(*this);
+      r = __shfl_up_sync(0xFFFFFFFF, r, delta);
+      *this = (lane_id - static_cast<int>(delta) >= 0) ? reinterpret_cast<ReductionResult&>(r) : *this;
+    }
+  };
+
+  struct TopKResult {
+    Array<ElementCompute, TopK> top_k_;  // running top-K candidates, largest first
+
+    CUTLASS_DEVICE
+    TopKResult() {
+      top_k_.fill(-cutlass::platform::numeric_limits<ElementCompute>::infinity());  // -inf marks a slot as not yet filled
+    }
+
+    // This is where we do the "final" reduction, where we compute
+    // the logsumexp for softmax, keep the smallest value in top-K,
+    // and discard the rest.
+    CUTLASS_DEVICE
+    ReductionResult reduce_final() const {
+      return ReductionResult(top_k_[TopK - 1], detail::topk_logsumexp(top_k_));
+    }
+
+    // Butterfly reduction: merge in the top-K held by the lane whose id differs by `laneMask` (XOR)
+    CUTLASS_DEVICE
+    void shuffle_xor_sync(int laneMask) {
+      if constexpr (TopK == 2) {
+        // Two floats are exchanged as one 64-bit word.
+        static_assert(sizeof(TopKResult) == sizeof(uint64_t));
+        uint64_t top_k = reinterpret_cast<uint64_t&>(*this);
+        top_k = __shfl_xor_sync(0xFFFFFFFF, top_k, laneMask);
+        auto synced_v = reinterpret_cast<TopKResult&>(top_k);
+        detail::merge_desc_sorted_arrays(top_k_, synced_v.top_k_);
+      }
+      else if constexpr (TopK == 4) {
+        // Four floats are exchanged as two 64-bit words.
+        static_assert(sizeof(TopKResult) == 2 * sizeof(uint64_t));
+        uint64_t* top_k_ptr = reinterpret_cast<uint64_t*>(this);
+        uint64_t top_k_arr[2];
+        top_k_arr[0] = __shfl_xor_sync(0xFFFFFFFF, top_k_ptr[0], laneMask);
+        top_k_arr[1] = __shfl_xor_sync(0xFFFFFFFF, top_k_ptr[1], laneMask);
+        auto synced_v = reinterpret_cast<TopKResult&>(top_k_arr);
+        detail::merge_desc_sorted_arrays(top_k_, synced_v.top_k_);
+      }
+      else {
+        // Generic fallback: one shuffle per element.
+        TopKResult synced_v;
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < TopK; ++i) {
+          synced_v.top_k_[i] = __shfl_xor_sync(0xFFFFFFFF, top_k_[i], laneMask);
+        }
+        detail::merge_desc_sorted_arrays(top_k_, synced_v.top_k_);
+      }
+    }
+
+    // Warp shuffle reduction: merge in the top-K shuffled down by `delta` lanes
+    CUTLASS_DEVICE
+    void shuffle_down_sync(uint32_t delta) {
+      if constexpr (TopK == 2) {
+        // Two floats are exchanged as one 64-bit word.
+        static_assert(sizeof(TopKResult) == sizeof(uint64_t));
+        uint64_t top_k = reinterpret_cast<uint64_t&>(*this);
+        top_k = __shfl_down_sync(0xFFFFFFFF, top_k, delta);
+        auto synced_v = reinterpret_cast<TopKResult&>(top_k);
+        detail::merge_desc_sorted_arrays(top_k_, synced_v.top_k_);
+      }
+      else if constexpr (TopK == 4) {
+        // Four floats are exchanged as two 64-bit words.
+        static_assert(sizeof(TopKResult) == 2 * sizeof(uint64_t));
+        uint64_t* top_k_ptr = reinterpret_cast<uint64_t*>(this);
+        uint64_t top_k_arr[2];
+        top_k_arr[0] = __shfl_down_sync(0xFFFFFFFF, top_k_ptr[0], delta);
+        top_k_arr[1] = __shfl_down_sync(0xFFFFFFFF, top_k_ptr[1], delta);
+        auto synced_v = reinterpret_cast<TopKResult&>(top_k_arr);
+        detail::merge_desc_sorted_arrays(top_k_, synced_v.top_k_);
+      }
+      else {
+        // Generic fallback: one shuffle per element.
+        TopKResult synced_v;
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < TopK; ++i) {
+          synced_v.top_k_[i] = __shfl_down_sync(0xFFFFFFFF, top_k_[i], delta);
+        }
+        detail::merge_desc_sorted_arrays(top_k_, synced_v.top_k_);
+      }
+    }
+  };
+
+public:
+  struct SharedStorage { };
+
+  struct Arguments { };
+
+  struct Params { };
+
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    return {};
+  }
+
+  template <class ProblemShape>
+  static bool
+  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
+    // Cross CTA reduction is not possible because there is no guarantee that all CTAs run
+    // concurrently. Cross epilogue tile reduction is possible, but re-visiting and applying
+    // reduction to accumulators is only possible for the current epilogue tile. Hence N has to
+    // fit in a single CTA tile and a single epilogue tile, and must provide at least TopK values.
+    auto [M, N, K, L] = problem_shape;
+    auto [tile_M, tile_N, tile_K] = CtaTileShapeMNK{};
+    auto [epi_M, epi_N] = EpilogueTile{};
+    return N <= tile_N && N <= epi_N && N >= TopK;
+  }
+
+  template <class ProblemShape>
+  static size_t
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
+    return 0;
+  }
+
+  template <class ProblemShape>
+  static cutlass::Status
+  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    return Status::kSuccess;
+  }
+
+  CUTLASS_DEVICE bool
+  is_producer_load_needed() const {
+    return false;
+  }
+
+  CUTLASS_DEVICE bool
+  is_C_load_needed() const {
+    return false;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Sm90TopKSoftmaxColReduction() { }
+
+  CUTLASS_HOST_DEVICE
+  Sm90TopKSoftmaxColReduction(Params const& params, SharedStorage const& shared_storage)
+      : params(params) { }
+
+  Params params;
+
+  template <class... Args>
+  CUTLASS_DEVICE auto
+  get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
+    return EmptyProducerLoadCallbacks{};
+  }
+
+  template<class ArgsTuple>
+  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
+    CUTLASS_DEVICE
+    ConsumerStoreCallbacks(ArgsTuple&& args_tuple, Params const& params)
+      : args_tuple(cute::forward<ArgsTuple>(args_tuple)),
+        params(params) {}
+
+    ArgsTuple args_tuple;
+    Params const& params;
+
+    template <typename ElementAccumulator, typename ElementInput>
+    CUTLASS_DEVICE auto
+    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n,
+          Array<ElementInput, FragmentSize> const& frg_input) {
+
+      auto& [tCrTopK, tCrSoftmax, tCcCol, cCol, 
+              lane_layout_MN, lane_mn,
+              residue_cCol, residue_tCcCol] = args_tuple;
+      Tensor tCcCol_mn = tCcCol(_,_,_,epi_m,epi_n);  // coordinates for epilogue tile (epi_m, epi_n)
+
+      using ConvertInput = NumericArrayConverter<ElementCompute, ElementInput, FragmentSize, RoundStyle>;
+      ConvertInput convert_input{};
+
+      Array frg_I = convert_input(frg_input);  // input fragment converted to ElementCompute
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < FragmentSize; ++i) {
+        auto thread_crd = tCcCol_mn(epi_v * FragmentSize + i);
+        if (elem_less(thread_crd, residue_tCcCol)) {  // only coordinates below the residue bound take part
+          TopKResult& tCrCol_vmn = tCrTopK(epi_v * FragmentSize + i);
+          detail::add_element_to_desc_sorted_array(tCrCol_vmn.top_k_, frg_I[i]);
+        }
+      }
+
+      return frg_input;  // the input fragment is returned unmodified; only tCrTopK is updated here
+    }
+
+    template <class STensor, class SyncFn, class VTensor>
+    CUTLASS_DEVICE void
+    reduce(STensor&& smem_buffer, SyncFn const& sync_fn, int epi_m, int epi_n, bool is_last_iteration, VTensor visit_results) {
+
+      auto& [tCrTopK, tCrSoftmax, tCcCol, cCol, 
+              lane_layout_MN, lane_mn,
+              residue_cCol, residue_tCcCol] = args_tuple;
+
+      // fully OOB CTA in partially OOB cluster
+      if (not elem_less(cCol(_0{},_0{}), residue_cCol)) {
+        return;
+      }
+      Tensor tCcCol_mn = tCcCol(_,_,_,epi_m,epi_n);  // NOTE(review): not referenced again in this function
+
+      // `tCrTopK` and `tCrSoftmax` have 0-strides along modes that correspond to N,
+      // in order to reduce along modes in the `R2S` sublayout that correspond to N.
+      // This means we should modify and warp-reduce them according to their co-domain instead of
+      // their domain. Therefore we keep a filtered view of both and use them as necessary.
+      auto tCrTopK_f = filter(tCrTopK);
+      auto tCrSoftmax_f = filter(tCrSoftmax);
+
+      // The pattern here is: reduce Top-K first, then compute logsumexp, keep it and the
+      // last element of Top-K, use the latter to mask the visited results, and the former
+      // to apply softmax.
+      //
+      // This gives us two options: reduce the Top-K with warp shuffles, have the reduced
+      // lanes compute logsumexp and pair it with the last Top-K element, and broadcast
+      // the result back using warp shuffles.
+      //
+      // Alternatively, we can do a butterfly reduction over Top-K, and have all lanes
+      // compute their own logsumexp and skip the broadcast.
+      if constexpr (UseButterflyReduce) {
+        //
+        // 1. Butterfly reduction
+        //
+        CUTLASS_PRAGMA_UNROLL
+        for (int j = 1; j < size<1>(lane_layout_MN); j *= 2) {  // XOR masks 1, 2, 4, ... below size<1>(lane_layout_MN)
+          CUTLASS_PRAGMA_UNROLL
+          for (int i = 0; i < size(tCrTopK_f); ++i) {
+            tCrTopK_f(i).shuffle_xor_sync(j);
+          }
+        }
+
+        //
+        // 2. Strip down reduced value and compute sum of exps
+        //
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < size(tCrSoftmax_f); ++i) {
+          tCrSoftmax_f(i) = tCrTopK_f(i).reduce_final();
+        }
+      }
+      else {
+        //
+        // 1. Warp shuffle reduction
+        //
+        CUTLASS_PRAGMA_UNROLL
+        for (int reduction_cols = size<1>(lane_layout_MN) / 2; reduction_cols > 0; reduction_cols /= 2) {
+          CUTLASS_PRAGMA_UNROLL
+          for (int i = 0; i < size(tCrTopK_f); ++i) {
+            tCrTopK_f(i).shuffle_down_sync(lane_layout_MN(_0{},reduction_cols));
+          }
+        }
+
+        //
+        // 2. Strip down reduced value and compute sum of exps
+        //
+        bool is_reduced_lane = get<1>(lane_mn) == 0;  // lanes at N-coordinate 0 hold the reduced top-K
+        if (is_reduced_lane) {
+          CUTLASS_PRAGMA_UNROLL
+          for (int i = 0; i < size(tCrSoftmax_f); ++i) {
+            tCrSoftmax_f(i) = tCrTopK_f(i).reduce_final();
+          }
+        }
+
+        //
+        // 3. Broadcast reduced values to all participants
+        //
+        CUTLASS_PRAGMA_UNROLL
+        for (int broadcast_cols = 1; broadcast_cols <= size<1>(lane_layout_MN) / 2; broadcast_cols *= 2) {
+          CUTLASS_PRAGMA_UNROLL
+          for (int i = 0; i < size(tCrSoftmax_f); ++i) {
+            tCrSoftmax_f(i).shuffle_up_sync(lane_layout_MN(_0{},broadcast_cols), get<1>(lane_mn));
+          }
+        }
+      }
+
+      //
+      // 4. Re-visit and apply top-K and softmax
+      //
+      CUTLASS_PRAGMA_UNROLL
+      for (int epi_v = 0; epi_v < size(visit_results); ++epi_v) {
+        auto& visit_frag = visit_results(epi_v);
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < FragmentSize; ++i) {
+          visit_frag[i] = detail::masked_softmax(  // overwrites the visited value in place
+            visit_frag[i],
+            tCrSoftmax(epi_v * FragmentSize + i).min_,
+            tCrSoftmax(epi_v * FragmentSize + i).logsumexp_
+          );
+        }
+      }
+
+    }
+
+    CUTLASS_DEVICE void
+    end_loop(int epi_m, int epi_n) {
+      auto& [tCrTopK, tCrSoftmax, tCcCol, cCol, 
+              lane_layout_MN, lane_mn,
+              residue_cCol, residue_tCcCol] = args_tuple;  // only tCrTopK is used below
+
+      // Reset reduced top-K values for next tile
+      // This must be done because we only assume a single epilogue tile across N,
+      // but not M.
+      fill(tCrTopK, TopKResult());  // a default TopKResult holds -infinity in every slot
+    }
+
+    CUTLASS_DEVICE void
+    end() { }
+
+  };
+
+  template <
+    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
+    class... Args
+  >
+  CUTLASS_DEVICE auto
+  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
+    Layout ref_layout_MN = [&] () {
+      if constexpr (ReferenceSrc) { return get<0>(args.tiled_copy.get_layoutS_MN()); }
+      else                        { return get<0>(args.tiled_copy.get_layoutD_MN()); }
+    }();                                                                                         // tile_mn -> tv_idx
+
+    // Get the MN layout + coord of lanes to determine shuffle reduction iterations
+    using _W = Int<decltype(args.tiled_copy)::TiledNumThr::value / NumThreadsPerWarp>;
+    Layout tv2lane = Layout<Shape<Int<NumThreadsPerWarp>,_W,_1>,Stride<_1,_0,_0>>{};            //   tv_idx -> lane_idx
+    Layout ref2lane = composition(tv2lane, ref_layout_MN);                                      //  tile_mn -> lane_idx
+    Layout lane_layout_MN = make_layout(filter(get<0>(ref2lane)), filter(get<1>(ref2lane)));    //  lane_mn -> lane_idx
+    Layout inv_lane_layout_MN = right_inverse(lane_layout_MN);                                  // lane_idx -> lane_mn
+    int lane_idx = canonical_lane_idx();
+    auto lane_mn = idx2crd(inv_lane_layout_MN(lane_idx), shape(lane_layout_MN));
+
+    // Get the MN layout + coord of warps to determine smem reduction iterations
+    Layout tv2warp = Layout<Shape<Int<NumThreadsPerWarp>,_W,_1>,Stride<_0,_1,_0>>{};            //   tv_idx -> warp_idx
+    Layout ref2warp = composition(tv2warp, ref_layout_MN);                                      //  tile_mn -> warp_idx
+    Layout warp_layout_MN = make_layout(filter(get<0>(ref2warp)), filter(get<1>(ref2warp)));    //  warp_mn -> warp_idx
+
+    // Make sure there's only one warp across N so we can use warp shuffle intrinsics for reduction.
+    static_assert(decltype(size<1>(warp_layout_MN))::value <= 1);
+
+    // Reduction layout
+    //   We're assuming all elements in a row (over which we're performing the reduction) are
+    //   visited in the same corresponding epilogue tile, and this is what allows us to apply the
+    //   top-K + softmax operation within `reduce()`, by re-visiting the accumulated results.
+    //
+    //   This presents a challenge, because the layout of the accumulated results is typically
+    //   in the register to shared memory shape, or: (R2S,R2S_M,R2S_N).
+    //   This means that we still need to reduce this tensor along N.
+    //
+    //   The solution is simple: we need to flatten the layout, identify modes that correspond to
+    //   N and set their strides to 0, in order to map fragment indices corresponding to the same
+    //   row back to the same element in the tensor.
+    //
+    //   This requires some extra layout manipulation, which is as follows.
+
+    // Create new accumulator layout with column broadcast
+    auto [M, N, K] = args.tile_shape_mnk;
+    auto thr_mma = args.tiled_mma.get_thread_slice(args.thread_idx);
+    auto gColReduce = make_tensor<ElementCompute>(
+        make_layout(make_shape(M, N), make_stride(_1{}, 0_c)));                                                // (M,N)
+    auto tCrColReduce = make_tensor_like<ElementCompute>(                                       // (FrgV, MMA_M, MMA_N)
+        thr_mma.partition_C(gColReduce).layout());
+
+    // Tile the new accumulator tensor according to R2S
+    ThrCopy thread_r2s = args.tiled_copy.get_slice(args.thread_idx);
+    Tensor tRS_rSoftmax = thread_r2s.retile_S(tCrColReduce);                               // ((R2S,R2S_V),MMA_M,MMA_N)
+    auto tCrC_layout = args.tCrC.layout();                                                         // (R2S,R2S_M,R2S_N)
+
+    // Compose the new accumulator R2S layout with the expected tCrC layout to get final 
+    // reduction tensor layout.
+    auto tCrSoftmax_layout = take<0, 3>(tRS_rSoftmax.layout()).compose(tCrC_layout); // (R2S,R2S_V) o (R2S,R2S_M,R2S_N)
+
+    Tensor tCrTopK = make_tensor<TopKResult>(tCrSoftmax_layout);                                   // (R2S,R2S_M,R2S_N)
+    Tensor tCrSoftmax = make_tensor<ReductionResult>(tCrSoftmax_layout);                           // (R2S,R2S_M,R2S_N)
+    fill(tCrTopK, TopKResult());
+
+    auto args_tuple = make_tuple(
+        cute::move(tCrTopK), cute::move(tCrSoftmax), args.tCcD, args.cD,
+        lane_layout_MN, lane_mn,
+        args.residue_cD, args.residue_tCcD);
+    return ConsumerStoreCallbacks<decltype(args_tuple)>(std::move(args_tuple), params);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::epilogue::fusion
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/activation.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/activation.h
new file mode 100755
index 000000000..9f1cd7743
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/activation.h
@@ -0,0 +1,758 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief This extends the contents of cutlass/functional.h with frequently used activation functions.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/constants.h"
+#include "cutlass/complex.h"
+#include "cutlass/array.h"
+#include "cutlass/half.h"
+#include "cutlass/functional.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Identity operator: hands its argument back unchanged.
+template <typename T>
+struct Identity {
+  static bool const kIsHeavy = false;
+  /// Returns `input` as-is; the argument is taken and returned by value.
+  CUTLASS_HOST_DEVICE
+  T operator()(T input) const {
+    return input;
+  }
+};
+
+template <typename T, int N>
+struct Identity<Array<T, N> > {
+  // Same flag as the scalar Identity<T>, so `Identity<Array<T, N>>::kIsHeavy` is well-formed.
+  static const bool kIsHeavy = false;
+  CUTLASS_HOST_DEVICE  // returns the fragment unchanged
+  Array<T, N> operator()(Array<T, N> value) const { return value; }
+};
+
+/// Scale operator: multiplies its input by a scalar factor.
+template <typename T>
+struct Scale {
+  struct Arguments {
+    using scale_type = T;
+    T scale = T(1);  // default factor
+  };
+  /// Returns multiplies<T>(scale, value); the factor is passed as the left operand.
+  CUTLASS_HOST_DEVICE
+  T operator()(T value, T scale) const {
+    multiplies<T> product;
+    return product(scale, value);
+  }
+  /// Convenience overload that reads the factor from an Arguments struct.
+  CUTLASS_HOST_DEVICE
+  T operator()(T value, Arguments args = Arguments()) const {
+    return (*this)(value, args.scale);
+  }
+};
+
+template <typename T, int N>
+struct Scale<Array<T, N>> {
+  using Arguments = typename Scale<T>::Arguments;
+  /// Forwards to multiplies<Array<T, N>> with the scalar `scale` as the left operand.
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> values, T scale) const {
+    multiplies<Array<T, N>> product;
+    return product(scale, values);
+  }
+  /// Same operation with the factor taken from `args.scale`.
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> values, Arguments args = Arguments()) const {
+    return (*this)(values, args.scale);
+  }
+};
+
+/// Specialization to compose other activations with a defined unary operator
+/// e.g. Scale<Identity<T>>
+template <template <class> class Activation, typename T>
+struct Scale<Activation<T>> {
+  using Arguments = typename Scale<T>::Arguments;
+  /// Evaluates Activation<T> on `value`, then multiplies the result by `scale`.
+  CUTLASS_HOST_DEVICE
+  T operator()(T value, typename Arguments::scale_type scale) const {
+    Activation<T> activation;
+    multiplies<T> product;
+    return product(scale, activation(value));
+  }
+  /// Overload reading the factor from `args.scale`.
+  CUTLASS_HOST_DEVICE
+  T operator()(T value, Arguments args = Arguments()) const {
+    return (*this)(value, args.scale);
+  }
+};
+
+/// ReLu operator - propagates NaNs
+/// Always put threshold in the right hand side of max to propagate NaN.
+template <typename T>
+struct ReLu {
+  static bool const kIsHeavy = false;
+  /// Thresholded form; note that `threshold` is the FIRST parameter.
+  CUTLASS_HOST_DEVICE
+  T operator()(T threshold, T value) const {
+    maximum<T> max_op;
+    // operand order (value, threshold) is deliberate, see the note above
+    return max_op(value, threshold);
+  }
+  /// Plain form: maximum of `value` and T(0).
+  CUTLASS_HOST_DEVICE
+  T operator()(T value) const {
+    maximum<T> max_op;
+    // zero stays on the right-hand side for the same reason
+    return max_op(value, T(0));
+  }
+};
+
+template <typename T>
+using ReLU = ReLu<T>;  // alternate capitalization; names the same template as ReLu
+
+template <typename T, int N>
+struct ReLu<Array<T, N>> {
+  static bool const kIsHeavy = false;
+  /// Thresholded form on a fragment; the scalar `threshold` is the FIRST parameter.
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(T const & threshold, Array<T, N> const &frag) const {
+    maximum<Array<T, N>> max_op;
+    // fragment on the left, scalar threshold on the right
+    return max_op(frag, threshold);
+  }
+  /// Plain form: maximum of the fragment and the scalar T(0).
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &frag) const {
+    maximum<Array<T, N>> max_op;
+    return max_op(frag, T(0));
+  }
+};
+
+// Generic clamp
+template <typename T>
+struct Clamp {
+  struct Arguments {
+    T lower_bound = CUTLASS_STL_NAMESPACE::numeric_limits<T>::lowest();
+    T upper_bound = CUTLASS_STL_NAMESPACE::numeric_limits<T>::max();
+  };
+  /// Applies `lower_bound` via maximum, then `upper_bound` via minimum.
+  CUTLASS_HOST_DEVICE
+  T operator()(T const& value, T const& lower_bound, T const& upper_bound) const {
+    constexpr bool PropagateNaN = true;
+    minimum<T, PropagateNaN> clip_high;
+    maximum<T, PropagateNaN> clip_low;
+    // inner call enforces the lower bound, outer call the upper bound
+    return clip_high(clip_low(value, lower_bound), upper_bound);
+  }
+  /// Overload taking both bounds from `args`; the defaults are lowest() and max() of T.
+  CUTLASS_HOST_DEVICE
+  T operator()(T const& value, Arguments const& args = Arguments()) const {
+    return (*this)(value, args.lower_bound, args.upper_bound);
+  }
+};
+
+template <typename T, int N>
+struct Clamp<Array<T,N>> {
+  using Arguments = typename Clamp<T>::Arguments;
+  /// Fragment form: the same two-step clamp with scalar bounds.
+  CUTLASS_HOST_DEVICE
+  Array<T,N> operator()(Array<T,N> const& values, T const& lower_bound, T const& upper_bound) const {
+    constexpr bool PropagateNaN = true;
+    minimum<Array<T,N>, PropagateNaN> clip_high;
+    maximum<Array<T,N>, PropagateNaN> clip_low;
+    // lower bound first (inner), upper bound second (outer)
+    return clip_high(clip_low(values, lower_bound), upper_bound);
+  }
+  /// Overload taking both bounds from `args`.
+  CUTLASS_HOST_DEVICE
+  Array<T,N> operator()(Array<T,N> const& values, Arguments const& args = Arguments()) const {
+    return (*this)(values, args.lower_bound, args.upper_bound);
+  }
+};
+
+// Leaky Relu operator
+template <typename T>
+struct LeakyReLU {
+  static bool const kIsHeavy = false;
+  struct Arguments {
+    T leaky_alpha = T(0);
+  };
+  /// Returns `value` when it is greater than zero, otherwise `value * leaky_alpha`.
+  CUTLASS_HOST_DEVICE
+  T operator()(T const& value, T const& leaky_alpha) const {
+    if (value > T(0)) {
+      return value;
+    }
+    return value * leaky_alpha;
+  }
+  /// Overload reading the slope from `args.leaky_alpha`.
+  CUTLASS_HOST_DEVICE
+  T operator()(T const& value, Arguments const& args = Arguments()) const {
+    return (*this)(value, args.leaky_alpha);
+  }
+};
+
+template <typename T, int N>
+struct LeakyReLU<Array<T, N> > {
+
+  static bool const kIsHeavy = false;
+
+  using Arguments = typename LeakyReLU<T>::Arguments;
+  /// Applies the scalar LeakyReLU<T> to each element of `values`.
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const& values, T const& leaky_alpha) const {
+    LeakyReLU<T> scalar_op;
+    Array<T, N> result;
+    // element-wise; iterations are independent of each other
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < int(values.size()); ++idx) {
+      result[idx] = scalar_op(values[idx], leaky_alpha);
+    }
+
+    return result;
+  }
+  /// Overload reading the slope from `args.leaky_alpha`.
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const& values, Arguments const& args = Arguments()) const {
+    return (*this)(values, args.leaky_alpha);
+  }
+};
+
+// Tanh operator
+template <typename T>
+struct Tanh {
+  static bool const kIsHeavy = true;
+  /// Delegates to fast_tanh.
+  CUTLASS_HOST_DEVICE
+  T operator()(T const &x) const {
+    return fast_tanh(x);
+  }
+};
+
+template <typename T, int N>
+struct Tanh<Array<T, N> > {
+  static bool const kIsHeavy = true;
+  /// Applies the scalar Tanh<T> to each of the N elements.
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &value) const {
+    Tanh<T> scalar_op;
+    Array<T, N> result;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      result[idx] = scalar_op(value[idx]);
+    }
+
+    return result;
+  }
+};
+
+template <int N>
+struct Tanh<Array<half_t, N>> {
+  using T = half_t;
+  static bool const kIsHeavy = true;
+  /// half_t fragments go through the array-level fast_tanh_op instead of a per-element loop.
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const& z) const {
+    fast_tanh_op<Array<T, N>> array_tanh;
+    return array_tanh(z);
+  }
+};
+
+// Sigmoid operator
+template <typename T>
+struct Sigmoid {
+  static bool const kIsHeavy = true;
+  CUTLASS_HOST_DEVICE  // computes 1 / (1 + fast_exp(-value))
+  T operator()(T const &value) const {
+    T const one = T(1);
+    return one / (one + fast_exp(-value));
+  }
+};
+
+template <typename T, int N>
+struct Sigmoid<Array<T, N> > {
+  static bool const kIsHeavy = true;
+  /// Applies the scalar Sigmoid<T> to each of the N elements.
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &value) const {
+    Sigmoid<T> scalar_op;
+    Array<T, N> result;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      result[idx] = scalar_op(value[idx]);
+    }
+
+    return result;
+  }
+};
+
+template <int N>
+struct Sigmoid<Array<half_t, N>> {
+  using T = half_t;
+  static const bool kIsHeavy = true;
+  /// Sigmoid on a half_t fragment using array-level functors; the formula is picked at compile time.
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const& z) const {
+    plus<Array<T, N>> add;
+    // With CUTLASS_USE_TANH_FOR_SIGMOID defined: 0.5 * (tanh(0.5 * z) + 1).
+#if defined(CUTLASS_USE_TANH_FOR_SIGMOID)
+    multiplies<Array<T, N>> mul;
+    fast_tanh_op<Array<T, N>> tanh;
+    return mul(add(tanh(mul(z, cutlass::constants::half<T>())), cutlass::constants::one<T>()),
+               cutlass::constants::half<T>());
+#else
+    divides<Array<T, N>> div;  // default path: 1 / (1 + fast_exp(-z))
+    negate<Array<T, N>> neg;
+    fast_exp_op<Array<T, N>> fast_exp;
+    return div(cutlass::constants::one<T>(),
+               add(cutlass::constants::one<T>(),
+                   fast_exp(neg(z))));
+#endif
+  }
+};
+
+// SiLu (swish) operator introduced by Elfwing et al. in the following paper
+// "Sigmoid-Weighted Linear Units for Neural Network Function Approximation in Reinforcement Learning" (2017)
+// https://arxiv.org/pdf/1702.03118.pdf
+// It is used in EfficientNet and YOLOv5, for example.
+// Reference: https://pytorch.org/docs/stable/generated/torch.nn.SiLU.html
+template <typename T>
+struct SiLu {
+  static bool const kIsHeavy = true;
+  /// Computes value * Sigmoid<T>(value).
+  CUTLASS_HOST_DEVICE
+  T operator()(T const &value) const {
+    Sigmoid<T> gate;
+    return value * gate(value);
+  }
+};
+
+template <typename T, int N>
+struct SiLu<Array<T, N>> {
+  static bool const kIsHeavy = true;
+  /// Fragment form: multiplies `value` by Sigmoid<Array<T, N>>(value).
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &value) const {
+    multiplies<Array<T, N>> product;
+    Sigmoid<Array<T, N>> gate;
+    return product(value, gate(value));
+  }
+};
+
+template <typename T>
+using ScaledSiLu = Scale<SiLu<T>>;  // SiLu followed by a scalar multiply (Scale<Activation<T>> specialization)
+
+// Hardswish operator introduced by Howard et al. in the following paper
+// "Searching for MobileNetV3" (2019)
+// https://arxiv.org/pdf/1905.02244.pdf
+// It is used in models based on MobileNetV3.
+// Reference: https://pytorch.org/docs/stable/generated/torch.nn.Hardswish.html
+template <typename T>
+struct HardSwish {
+  static bool const kIsHeavy = false;
+  /// Computes x * min(max(x + 3, 0), 6) / 6.
+  CUTLASS_HOST_DEVICE
+  T operator()(T const &x) const {
+    maximum<T> floor_at;
+    minimum<T> cap_at;
+    T relu6 = cap_at(floor_at(x + T(3), T(0)), T(6));
+    return x * relu6 / T(6);
+  }
+};
+
+template <>
+struct HardSwish<float> {
+  using T = float;
+  static bool const kIsHeavy = false;
+  /// float form: multiplies by the constant 0.16666667f instead of dividing by 6.
+  CUTLASS_HOST_DEVICE
+  T operator()(T const &x) const {
+    maximum<T> floor_at;
+    minimum<T> cap_at;
+    T relu6 = cap_at(floor_at(x + T(3), T(0)), T(6));
+    return x * relu6 * 0.16666667f;
+  }
+};
+
+template <typename T, int N>
+struct HardSwish<Array<T, N> > {
+  static bool const kIsHeavy = false;
+  /// Applies the scalar HardSwish<T> to each of the N elements.
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &value) const {
+    HardSwish<T> scalar_op;
+    Array<T, N> result;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      result[idx] = scalar_op(value[idx]);
+    }
+
+    return result;
+  }
+};
+
+template <int N>
+struct HardSwish<Array<half_t, N> > {
+  using T = half_t;
+  static bool const kIsHeavy = false;
+  /// half_t fragment form built from array-level functors with scalar constants.
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &value) const {
+    plus<Array<T, N> > sum;
+    maximum<Array<T, N> > floor_at;
+    minimum<Array<T, N> > cap_at;
+    multiplies<Array<T, N> > product;
+    // (min(max(value + 3, 0), 6) * value) * 0.16666667
+    return product(product(cap_at(floor_at(sum(value, T(3)), T(0)), T(6)), value), T(0.16666667f));
+  }
+};
+
+//
+// GELU function definitions implemented as described by
+//   Hendrycks, D., and Gimpel, K. in
+//   "Gaussian Error Linear Units (GELUs)." (2020)
+//   https://arxiv.org/pdf/1606.08415.pdf
+//
+// Floating-point constants are Taylor coefficients described in the paper.
+//
+
+// GELU operator
+template <typename T>
+struct GELU {
+  static const bool kIsHeavy = true;
+  /// Generic form: half * value * (one + erff(value * half_root_two)); erff is called on a float cast.
+  CUTLASS_HOST_DEVICE
+  T operator()(T const &value) const {
+    return T(cutlass::constants::half<T>() * value *
+      (cutlass::constants::one<T>() + (T)erff((float)(value * cutlass::constants::half_root_two<T>()))));
+  }
+};
+
+template <>
+struct GELU<float> {
+  static const bool kIsHeavy = true;
+  /// float form: half * value * (one + erff(value * half_root_two)), with no intermediate casts.
+  CUTLASS_HOST_DEVICE
+  float operator()(float const &value) const {
+    return cutlass::constants::half<float>() * value *
+      (cutlass::constants::one<float>() + erff(value * cutlass::constants::half_root_two<float>() ));
+  }
+};
+
+template <>
+struct GELU<double> {
+  static const bool kIsHeavy = true;
+  /// double form: same expression as GELU<float> but calls erf rather than erff.
+  CUTLASS_HOST_DEVICE
+  double operator()(double const &value) const {
+    return cutlass::constants::half<double>() * value *
+      (cutlass::constants::one<double>() + erf( value * cutlass::constants::half_root_two<double>() ));
+  }
+};
+
+template <typename T, int N>
+struct GELU<Array<T, N> > {
+  static bool const kIsHeavy = true;
+  /// Applies the scalar GELU<T> to each of the N elements.
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &value) const {
+    GELU<T> scalar_op;
+    Array<T, N> result;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      result[idx] = scalar_op(value[idx]);
+    }
+
+    return result;
+  }
+};
+
+template <typename T>
+using ScaledGELU = Scale<GELU<T>>;  // GELU followed by a scalar multiply (Scale<Activation<T>> specialization)
+
+// GELU operator implemented using the Taylor series approximation
+template <typename T>
+struct GELU_taylor {
+  static const bool kIsHeavy = true;
+  /// Computes 0.5 * z * (1 + fast_tanh(k0 * z * (1 + k1 * z * z))).
+  CUTLASS_HOST_DEVICE
+  T operator()(T const &z) const {
+    // k0 is numerically sqrt(2 / pi); k1 weights the cubic term
+    T k0 = T(0.7978845608028654);
+    T k1 = T(0.044715);
+
+    return T(cutlass::constants::half<T>() * z *
+      (cutlass::constants::one<T>() + fast_tanh(k0 * z * (cutlass::constants::one<T>() + k1 * z * z))));
+  }
+};
+
+template <int N>
+struct GELU_taylor<Array<half_t, N> > {
+  static const bool kIsHeavy = true;
+  /// half_t fragment form of the tanh-based GELU approximation, built from array-level functors.
+  CUTLASS_HOST_DEVICE
+  Array<half_t, N> operator()(Array<half_t, N> const &z) const {
+
+    using T = half_t;
+    Array<half_t, N> y;
+    // same literal coefficients as the scalar GELU_taylor, converted to half_t
+    half_t k0 = half_t(0.7978845608028654);
+    half_t k1 = half_t(0.044715);
+
+    multiply_add<Array<half_t, N>> fma;
+    multiplies<Array<half_t, N>>     mul;
+    plus<Array<half_t, N>>         add;
+
+    fast_tanh_op<Array<half_t, N>> tanh;
+    // u = (k0 * z) * fma(k1 * z, z, 1); presumably fma(a, b, c) is a * b + c -- confirm in functional.h
+    Array<half_t, N> u = mul(mul(k0, z), fma(mul(k1, z), z, cutlass::constants::one<T>()));
+    // y = (z * 0.5) * (1 + tanh(u))
+    y = mul(mul(z, cutlass::constants::half<T>()), add(cutlass::constants::one<T>(), tanh(u)));
+
+    return y;
+  }
+};
+
+template <typename T, int N>
+struct GELU_taylor<Array<T, N> > {
+  static bool const kIsHeavy = true;
+  /// Applies the scalar GELU_taylor<T> to each of the N elements.
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &value) const {
+    GELU_taylor<T> scalar_op;
+    Array<T, N> result;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      result[idx] = scalar_op(value[idx]);
+    }
+
+    return result;
+  }
+};
+
+template <typename T>
+using ScaledGELU_taylor = Scale<GELU_taylor<T>>;  // GELU_taylor followed by a scalar multiply
+
+/// Computes backwards pass for GELU operator assuming d_t is the layer gradient and
+/// z is computed from the forward pass.
+template <typename T>
+struct dGELU {
+  static const bool kIsHeavy = true;
+  /// Returns d_t multiplied by the derivative of 0.5 * z * (1 + tanh(k0 * z * (1 + k1 * z^2))) at z.
+  CUTLASS_HOST_DEVICE
+  T operator()(T const &d_t, T const &z) const {
+    // k0 and k1 are the GELU_taylor coefficients; k2 is numerically 3 * k0 * k1
+    T k0 = T(0.7978845608028654);
+    T k1 = T(0.044715);
+    T k2 = T(0.1070322243);
+    // tanh term of the forward approximation
+    T tanh_out = fast_tanh(k0 * z * (1 + k1 * z * z));
+    // ff = 0.5 * z * (1 - tanh_out^2) * (k0 + k2 * z^2) + 0.5 * (1 + tanh_out)
+    T ff = constants::half<T>() * z * ((1 - tanh_out * tanh_out) * (k0 + k2 * z * z)) +
+      constants::half<T>() * (1 + tanh_out);
+
+    return ff * d_t;
+  }
+};
+
+template <typename T, int N>
+struct dGELU<Array<T, N> > {
+  static bool const kIsHeavy = true;
+  /// Applies the scalar dGELU<T> to matching elements of `d_t` and `z`.
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &d_t, Array<T, N> const &z) const {
+    dGELU<T> scalar_op;
+    Array<T, N> result;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      result[idx] = scalar_op(d_t[idx], z[idx]);
+    }
+
+    return result;
+  }
+};
+
+template <typename T>
+struct dReLU {
+  CUTLASS_HOST_DEVICE  // yields d_t where d_relu is true, T(0) otherwise
+  T operator()(T d_t, bool d_relu) const {
+    if (!d_relu) { return T(0); }
+    return d_t;
+  }
+  template <typename U>  // any predicate type usable with static_cast<bool>
+  CUTLASS_HOST_DEVICE
+  T operator()(T d_t, U d_relu) const {
+    return (*this)(d_t, static_cast<bool>(d_relu));
+  }
+};
+
+template <typename T, int N>
+struct dReLU<Array<T, N>> {
+  CUTLASS_HOST_DEVICE  // predicates supplied as a plain bool[N]
+  Array<T, N> operator()(Array<T, N> const& d_t, bool const (&d_relu)[N]) const {
+    Array<T, N> y;
+    dReLU<T> relu_op;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      y[i] = relu_op(d_t[i], d_relu[i]);
+    }
+
+    return y;
+  }
+  /// Predicates supplied as Array<uint1b_t, N>: unpacked into bool[N], then forwarded to the overload above.
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const& d_t, Array<uint1b_t, N> const& d_relu) const {
+    UnpackPredicates<N> unpack_op;
+
+    bool preds[N];
+    unpack_op(preds, d_relu);
+
+    return operator()(d_t, preds);
+  }
+  /// Predicates supplied as any other Array<U, N>: each element is passed to the scalar dReLU<T>.
+  template <typename U>
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const& d_t, Array<U, N> const& d_relu) const {
+    Array<T, N> y;
+    dReLU<T> relu_op;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      y[i] = relu_op(d_t[i], d_relu[i]);
+    }
+
+    return y;
+  }
+};
+
+/// Computes backwards pass for ReLU operator assuming d_t is the layer gradient and
+/// z is computed from the forward pass.
+template <typename T>
+struct dReLU_Z {
+  CUTLASS_HOST_DEVICE
+  T operator()(T d_t, T z) const {
+    return z < 0 ? T(0) : d_t;  // T(0) only for z < 0; z == 0 passes d_t through
+  }
+};
+
+template <typename T, int N>
+struct dReLU_Z<Array<T, N>> {
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const& d_t, Array<T, N> const& z) const {
+    dReLU_Z<T> scalar_op;
+    Array<T, N> result;
+    // pair each gradient element with the forward value at the same index
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      result[idx] = scalar_op(d_t[idx], z[idx]);
+    }
+
+    return result;
+  }
+};
+
+// ElementwiseFilter operator
+// Filters by a specific value and maps it to 0.0
+// Used in GEMM + comm
+template <typename T>
+struct ElementwiseFilter {
+  static bool const kIsHeavy = false;
+  struct Arguments {
+    T value_to_filter = T(-0.0);
+    T filtered_value = T(0.0);
+  };
+  /// Returns `filtered_value` when `value == value_to_filter`, otherwise `value`.
+  CUTLASS_HOST_DEVICE
+  T operator()(T const& value, T const& value_to_filter, T const& filtered_value) const {
+    if (value == value_to_filter) {
+      return filtered_value;
+    }
+    return value;
+  }
+  /// Overload reading both values from `args`.
+  CUTLASS_HOST_DEVICE
+  T operator()(T const& value, Arguments const& args = Arguments()) const {
+    return (*this)(value, args.value_to_filter, args.filtered_value);
+  }
+};
+
+template <typename T, int N>
+struct ElementwiseFilter<Array<T, N> > {
+
+  static bool const kIsHeavy = false;
+
+  using Arguments = typename ElementwiseFilter<T>::Arguments;
+  /// Applies the scalar ElementwiseFilter<T> to each element of `values`.
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const& values, T const& value_to_filter, T const& filtered_value) const {
+    ElementwiseFilter<T> scalar_op;
+    Array<T, N> result;
+    // element-wise; iterations are independent of each other
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < int(values.size()); ++idx) {
+      result[idx] = scalar_op(values[idx], value_to_filter, filtered_value);
+    }
+
+    return result;
+  }
+  /// Overload reading both values from `args`.
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const& values, Arguments const& args = Arguments()) const {
+    return (*this)(values, args.value_to_filter, args.filtered_value);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/conversion_op.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/conversion_op.h
new file mode 100755
index 000000000..86200b413
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/conversion_op.h
@@ -0,0 +1,132 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Functor performing conversion operations used by epilogues.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/functional.h"
+#include "cutlass/numeric_conversion.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Converts the result without other operations
+/// (the accumulator fragment is passed through NumericArrayConverter and nothing else).
+template <
+  typename ElementOutput_,                             ///< Data type used to load and store tensors
+  int Count,                                           ///< Number of elements computed per operation
+  typename ElementAccumulator_ = ElementOutput_,       ///< Accumulator data type
+  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
+>
+class Convert {
+public:
+
+  using ElementOutput = ElementOutput_;
+  using ElementAccumulator = ElementAccumulator_;
+  using ElementCompute = ElementAccumulator_;
+
+  static int const kCount = Count;
+
+  using FragmentOutput = Array<ElementOutput, kCount>;
+  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
+  using ComputeFragment = FragmentAccumulator;
+
+  static FloatRoundStyle const kRound = Round;
+
+  static bool const kIsHeavy = false;
+
+  /// Host-constructable parameters structure (declares no data members)
+  struct Params {
+
+    //
+    // Methods
+    //
+
+    CUTLASS_HOST_DEVICE
+    Params() {}
+  };
+
+public:
+
+  /// Constructs the function object; `params` is accepted but not read
+  CUTLASS_HOST_DEVICE
+  Convert(Params const &params = Params()) {
+
+  }
+
+  /// Functionally required for serial reduction in the epilogue; the body is empty here
+  CUTLASS_HOST_DEVICE
+  void set_k_partition(int k_partition, int k_partition_count) {
+
+  }
+
+  /// Returns true if source is needed based on state of runtime arguments (always false here)
+  CUTLASS_HOST_DEVICE
+  constexpr bool is_source_needed() const {
+    return false;
+  }
+
+  /// Constexpr function to enable the compiler to optimize away the source loading if it is
+  /// never needed.
+  CUTLASS_HOST_DEVICE
+  constexpr bool is_source_ever_needed() const {
+    return false;
+  }
+
+  /// Converts the accumulator fragment to ElementOutput; `source` and `uniform` are accepted but not used
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(
+    FragmentAccumulator const &accumulator, 
+    FragmentOutput const &source = FragmentOutput(),
+    ElementCompute uniform = ElementCompute(0)) const {
+
+    // Convert to destination numeric type
+    NumericArrayConverter<ElementOutput, ElementAccumulator, kCount, Round> destination_converter;
+
+    return destination_converter(accumulator);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace epilogue
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/detail.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/detail.hpp
new file mode 100755
index 000000000..775630027
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/detail.hpp
@@ -0,0 +1,52 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Utilities for thread-level epilogues
+*/
+
+#pragma once
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+namespace detail {
+
+/// Class used to identify cases in which no operation is performed
+template <typename T_>
+struct NoOp {};  // empty type with no members; T_ is not referenced in the body
+
+} // namespace detail
+} // namespace thread
+} // namespace epilogue
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination.h
new file mode 100755
index 000000000..f74a36af4
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination.h
@@ -0,0 +1,523 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Functor performing linear combination operations used by epilogues.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/functional.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/epilogue/thread/scale_type.h"
+#include "cutlass/epilogue/thread/linear_combination_params.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Applies a linear combination operator to an array of elements.
+///
+/// D = alpha * accumulator + beta * source
+///
+template <
+  typename ElementOutput_,                             ///< Data type used to load and store tensors
+  int Count,                                           ///< Number of elements computed per operation.
+                                                       ///< Usually it is 128/sizeof_bits<ElementOutput_>,
+                                                       ///< but we sometimes use 64 or 32 when there is not enough data to store
+  typename ElementAccumulator_ = ElementOutput_,       ///< Accumulator data type
+  typename ElementCompute_ = ElementOutput_,           ///< Data type used to compute linear combination
+  ScaleType::Kind Scale = ScaleType::Default,          ///< Control Alpha and Beta scaling
+  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest,
+  typename ElementSource_ = ElementOutput_
+>
+class LinearCombination {
+public:
+
+  using ElementOutput = ElementOutput_;
+  using ElementSource = ElementSource_;
+  using ElementAccumulator = ElementAccumulator_;
+  using ElementCompute = ElementCompute_;
+  using ElementScalar = ElementCompute;
+  using ElementC = ElementSource_;
+  using ElementD = ElementOutput_;
+
+  static int const kCount = Count;
+  static const ScaleType::Kind kScale = Scale;
+  using FragmentOutput = Array<ElementOutput, kCount>;
+  using FragmentSource = Array<ElementSource, kCount>;
+  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
+  using FragmentCompute = Array<ElementCompute, kCount>;
+
+  static FloatRoundStyle const kRound = Round;
+
+  /// Host-constructable parameters structure
+  struct Params 
+  {
+    ElementCompute alpha;                         ///< scales accumulators
+    ElementCompute beta;                          ///< scales source tensor
+    ElementCompute const *alpha_ptr;              ///< pointer to accumulator scalar - if not null, loads it from memory
+    ElementCompute const *beta_ptr;               ///< pointer to source scalar - if not null, loads it from memory
+    ElementCompute const* const* alpha_ptr_array; ///< array of pointers to accumulator scalar per group/batch
+    ElementCompute const* const* beta_ptr_array;  ///< array of pointers to source scalar per group/batch
+    /// Default: alpha = 1, beta = 0, every pointer null
+    CUTLASS_HOST_DEVICE
+    Params():
+      alpha(ElementCompute(1)),
+      beta(ElementCompute(0)),
+      alpha_ptr(nullptr),
+      beta_ptr(nullptr),
+      alpha_ptr_array(nullptr),
+      beta_ptr_array(nullptr) { }
+    /// alpha and beta passed by value; every pointer null
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute alpha,
+      ElementCompute beta
+    ):
+      alpha(alpha), beta(beta),
+      alpha_ptr(nullptr), beta_ptr(nullptr),
+      alpha_ptr_array(nullptr), beta_ptr_array(nullptr) { }
+    /// alpha passed by value; beta = 0; every pointer null
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute alpha
+    ):
+      alpha(alpha), beta(0),
+      alpha_ptr(nullptr), beta_ptr(nullptr),
+      alpha_ptr_array(nullptr), beta_ptr_array(nullptr) { }
+    /// alpha and beta supplied through pointers; the by-value fields are set to 0
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute const *alpha_ptr,
+      ElementCompute const *beta_ptr
+    ):
+      alpha(0), beta(0),
+      alpha_ptr(alpha_ptr), beta_ptr(beta_ptr),
+      alpha_ptr_array(nullptr), beta_ptr_array(nullptr) { }
+    /// alpha supplied through a pointer; beta_ptr is null and both by-value fields are set to 0
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute const *alpha_ptr
+    ):
+      alpha(0), beta(0),
+      alpha_ptr(alpha_ptr), beta_ptr(nullptr),
+      alpha_ptr_array(nullptr), beta_ptr_array(nullptr) { }
+    /// Per-group/batch pointer arrays for alpha and beta; scalar pointers null, by-value fields set to 0
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute const* const* alpha_ptr_array,
+      ElementCompute const* const* beta_ptr_array
+    ):
+      alpha(0), beta(0),
+      alpha_ptr(nullptr), beta_ptr(nullptr),
+      alpha_ptr_array(alpha_ptr_array), beta_ptr_array(beta_ptr_array) { }
+    /// Per-group/batch pointer array for alpha only; beta_ptr_array is null and by-value fields are set to 0
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute const* const* alpha_ptr_array
+    ):
+      alpha(0), beta(0),
+      alpha_ptr(nullptr), beta_ptr(nullptr),
+      alpha_ptr_array(alpha_ptr_array), beta_ptr_array(nullptr) { }
+  };
+
+private:
+
+  //
+  // Data members
+  //
+
+  ElementCompute alpha_;
+  ElementCompute beta_;
+
+public:
+
+  /// Builds the functor; each scalar comes from its pointer-array entry, else its pointer, else its by-value field
+  CUTLASS_HOST_DEVICE
+  LinearCombination(Params const &params, int group_idx = 0) {
+    // alpha: prefer params.alpha_ptr_array[group_idx], then params.alpha_ptr, then params.alpha
+    ElementCompute const *alpha_src = nullptr;
+    if (params.alpha_ptr_array != nullptr) {
+      alpha_src = params.alpha_ptr_array[group_idx];
+    }
+    if (alpha_src == nullptr) {
+      alpha_src = params.alpha_ptr;
+    }
+    alpha_ = (alpha_src != nullptr) ? *alpha_src : params.alpha;
+    // beta: same priority order
+    ElementCompute const *beta_src = nullptr;
+    if (params.beta_ptr_array != nullptr) {
+      beta_src = params.beta_ptr_array[group_idx];
+    }
+    if (beta_src == nullptr) {
+      beta_src = params.beta_ptr;
+    }
+    beta_ = (beta_src != nullptr) ? *beta_src : params.beta;
+  }
+
+  /// Returns true if source is needed
+  CUTLASS_HOST_DEVICE
+  bool is_source_needed() const {
+    // NoBetaScaling always consumes the source
+    if (Scale == ScaleType::NoBetaScaling) {
+      return true;
+    }
+    // OnlyAlphaScaling and Nothing never consume it; otherwise it depends on beta being non-zero
+    bool const source_ignored = (Scale == ScaleType::OnlyAlphaScaling) || (Scale == ScaleType::Nothing);
+    return !source_ignored && (beta_ != ElementCompute(0));
+  }
+
+  /// Required for serial reduction in the epilogue: any non-zero k_partition overwrites beta with 1 (k_partition_count is not read)
+  CUTLASS_HOST_DEVICE
+  void set_k_partition(int k_partition, int k_partition_count) {
+    if (k_partition != 0) {
+      beta_ = ElementCompute(1);
+    }
+  }
+
+  /// Fragment form with source: D = alpha * accumulator + beta * source.
+  /// ScaleType::Nothing returns the converted accumulator; NoBetaScaling adds the source unscaled.
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(
+      FragmentAccumulator const &accumulator,
+      FragmentSource const &source) const {
+
+    using SourceToCompute = NumericArrayConverter<ElementCompute, ElementSource, kCount, Round>;
+    using AccumulatorToCompute = NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round>;
+    using ComputeToOutput = NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round>;
+
+    // Bring both operands into the compute type before any arithmetic
+    FragmentCompute compute_source = SourceToCompute()(source);
+    FragmentCompute compute_accumulator = AccumulatorToCompute()(accumulator);
+    ComputeToOutput to_output;
+
+    if (Scale == ScaleType::Nothing) {
+      return to_output(compute_accumulator);
+    }
+
+    multiplies<FragmentCompute> scale;
+    multiply_add<FragmentCompute> scale_and_add;
+
+    // X = C for NoBetaScaling, otherwise X = beta * C
+    FragmentCompute addend;
+    if (Scale == ScaleType::NoBetaScaling) {
+      addend = compute_source;
+    }
+    else {
+      addend = scale(beta_, compute_source);
+    }
+    // D = alpha * Accum + X
+    return to_output(scale_and_add(alpha_, compute_accumulator, addend));
+  }
+
+  /// Computes linear scaling without a source operand: D = alpha * accumulator
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(
+      FragmentAccumulator const &accumulator) const {
+
+    // Convert accumulator to internal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
+
+    // Convert to destination numeric type
+    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
+
+    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
+
+    if (Scale == ScaleType::Nothing)
+      return destination_converter(converted_accumulator);
+
+    // Perform binary operations
+    FragmentCompute intermediate;
+    multiplies<FragmentCompute> mul_accumulator;
+
+    intermediate = mul_accumulator(alpha_, converted_accumulator);    // D = alpha * Accum
+
+    return destination_converter(intermediate);
+  }
+
+  //
+  // Specializations for scalar (for use with cute::collective::DefaultEpilogue)
+  //
+  CUTLASS_HOST_DEVICE
+  ElementD operator()(ElementAccumulator const accumulator, ElementC const source) const {
+    // Scalar form of D = alpha * accumulator + beta * source: convert to ElementCompute, compute, convert to ElementD
+    NumericConverter<ElementCompute, ElementAccumulator, Round> accumulator_converter;
+    NumericConverter<ElementCompute, ElementC, Round> source_converter;
+    NumericConverter<ElementD, ElementCompute, Round> destination_converter;
+
+    // ScaleType::Nothing bypasses alpha/beta and only converts the accumulator
+    ElementCompute converted_accumulator = accumulator_converter(accumulator);
+    if constexpr (Scale == ScaleType::Nothing) {
+      return destination_converter(converted_accumulator);
+    }
+
+    // Perform binary operations
+    ElementCompute intermediate;
+    multiplies<ElementCompute> multiply;
+    multiply_add<ElementCompute> madd;
+
+    if constexpr (Scale == ScaleType::NoBetaScaling) {
+      intermediate = source_converter(source);
+    }
+    else {
+      // Route the source through source_converter, as the fragment overload does, instead of an implicit conversion
+      intermediate = multiply(beta_, source_converter(source));          // X = beta * C
+    }
+
+    intermediate = madd(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
+    return destination_converter(intermediate);
+  }
+
+  CUTLASS_HOST_DEVICE
+  ElementD operator()(ElementAccumulator const accumulator) const {
+    // Scalar form of D = alpha * accumulator: convert to ElementCompute, compute, convert to ElementD
+    NumericConverter<ElementCompute, ElementAccumulator, Round> accumulator_converter;
+    NumericConverter<ElementD, ElementCompute, Round> destination_converter;
+    ElementCompute converted_accumulator = accumulator_converter(accumulator);
+
+    // ScaleType::Nothing bypasses alpha and only converts the accumulator
+    if constexpr (Scale == ScaleType::Nothing) {
+      return destination_converter(converted_accumulator);
+    }
+
+    // Perform binary operations on the converted value, not on the raw ElementAccumulator
+    ElementCompute intermediate;
+    multiplies<ElementCompute> multiply;
+
+    intermediate = multiply(alpha_, converted_accumulator);    // D = alpha * Accum
+    return destination_converter(intermediate);
+  }
+};
+
+/// Applies a linear combination operator to an array of elements.
+///
+/// D = vector_alpha * accumulator + (optional) vector_beta/scalar_beta * source
+///
+template <
+  typename ElementOutput_,            ///< Data type used to load and store tensors
+  int Count,                          ///< Number of elements computed per operation.
+  typename ElementAccumulator_,       ///< Accumulator data type
+  typename ElementCompute_,           ///< Data type used to compute linear combination
+  FloatRoundStyle Round,
+  typename ElementSource_
+>
+class LinearCombination<ElementOutput_,
+                        Count,
+                        ElementAccumulator_,
+                        ElementCompute_,
+                        ScaleType::PerChannelScaling,
+                        Round,
+                        ElementSource_> {
+public:
+        
+  using ElementOutput = ElementOutput_;
+  using ElementSource = ElementSource_;
+  using ElementAccumulator = ElementAccumulator_;
+  using ElementCompute = ElementCompute_;
+  using ElementC = ElementSource_;
+  using ElementD = ElementOutput_;
+
+  static int const kCount = Count;
+  static const ScaleType::Kind kScale = ScaleType::PerChannelScaling;
+  static constexpr bool IsPerChannelScalingSupported = true;
+
+  using FragmentOutput = Array<ElementOutput, kCount>;
+  using FragmentSource = Array<ElementSource, kCount>;
+  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
+  using FragmentCompute = Array<ElementCompute, kCount>;
+
+  static FloatRoundStyle const kRound = Round;
+
+  /// Host-constructable parameters structure
+  struct Params
+  {
+    ElementCompute const *alpha_ptr;       ///< pointer to accumulator vector
+    ElementCompute const *beta_ptr;        ///< pointer to source vector
+    ElementCompute beta;                   ///< scales source tensor
+    /// Default: both pointers null, scalar beta = 0
+    CUTLASS_HOST_DEVICE
+    Params():
+      alpha_ptr(nullptr),
+      beta_ptr(nullptr),
+      beta(ElementCompute(0)) { }
+    /// alpha and beta both supplied through pointers; scalar beta is set to 0
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute const *alpha_ptr,
+      ElementCompute const *beta_ptr
+    ):
+      alpha_ptr(alpha_ptr), beta_ptr(beta_ptr), beta(ElementCompute(0)) { }
+    /// Only alpha_ptr supplied; beta_ptr is null and scalar beta is set to 0
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute const *alpha_ptr
+    ):
+      alpha_ptr(alpha_ptr), beta_ptr(nullptr), beta(ElementCompute(0)) { }
+    /// alpha_ptr plus a by-value scalar beta; beta_ptr is null
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute const *alpha_ptr,
+      ElementCompute beta
+    ):
+      alpha_ptr(alpha_ptr), beta_ptr(nullptr), beta(beta) { }
+
+  };
+
+private:
+
+  //
+  // Data members
+  //
+
+  ElementCompute const* beta_ptr_ = nullptr;
+  ElementCompute beta_ = ElementCompute(0);  // explicit construction: also compiles when the integer constructor is explicit
+
+public:
+
+  /// Constructs the function object: keeps params.beta_ptr when non-null, otherwise params.beta (alpha_ptr is not read)
+  CUTLASS_HOST_DEVICE
+  LinearCombination(Params const& params) {
+    if (params.beta_ptr) {
+      beta_ptr_ = params.beta_ptr;
+    }
+    else {
+      beta_ = params.beta;
+    }
+  }
+
+  /// Source is needed when a beta pointer was supplied or the scalar beta is non-zero
+  CUTLASS_HOST_DEVICE
+  bool is_source_needed() const {
+    return is_beta_vector() || (beta_ != ElementCompute(0));
+  }
+
+  CUTLASS_HOST_DEVICE
+  bool is_beta_vector() const {
+    return beta_ptr_ != nullptr;  // true when the constructor stored a non-null Params::beta_ptr
+  }
+
+  /// Per-channel form with source: D = vector_alpha * accumulator + vector_beta * source
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(
+      FragmentAccumulator const& accumulator,
+      FragmentSource const& source,
+      FragmentCompute const& valpha,
+      FragmentCompute const& vbeta) const {
+    using SourceToCompute = NumericArrayConverter<ElementCompute, ElementSource, kCount, Round>;
+    using AccumulatorToCompute = NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round>;
+    using ComputeToOutput = NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round>;
+
+    // Bring both operands into the compute type before any arithmetic
+    FragmentCompute compute_source = SourceToCompute()(source);
+    FragmentCompute compute_accumulator = AccumulatorToCompute()(accumulator);
+
+    multiplies<FragmentCompute> scale;
+    multiply_add<FragmentCompute> scale_and_add;
+
+    // X = vector_beta * C
+    FragmentCompute scaled_source = scale(vbeta, compute_source);
+
+    // D = vector_alpha * Accum + X
+    FragmentCompute combined = scale_and_add(valpha, compute_accumulator, scaled_source);
+
+    // Convert the result to the output element type
+    ComputeToOutput to_output;
+
+    return to_output(combined);
+  }
+
+  /// Computes linear scaling with source: D = vector_alpha * accumulator + scalar_beta * source (scalar beta is the stored beta_)
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(
+      FragmentAccumulator const& accumulator,
+      FragmentSource const& source,
+      FragmentCompute const& valpha) const {
+    // Convert source and accumulator to internal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementSource, kCount, Round> source_converter;
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
+
+    // Convert to destination numeric type
+    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
+
+    FragmentCompute converted_source = source_converter(source);
+    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
+
+    // Perform binary operations
+    FragmentCompute intermediate;
+
+    multiplies<FragmentCompute> mul_add_source;
+    multiply_add<FragmentCompute> mul_add_accumulator;
+
+    // beta_ is a single scalar here, applied to every element of the source fragment
+    intermediate = mul_add_source(beta_, converted_source);                           // X = scalar_beta * C
+
+    intermediate = mul_add_accumulator(valpha, converted_accumulator, intermediate);    // D = vector_alpha * Accum + X
+
+    return destination_converter(intermediate);
+  }
+
+  /// Computes linear scaling without a source operand: D = vector_alpha * accumulator
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(
+      FragmentAccumulator const& accumulator,
+      FragmentCompute const& valpha) const {
+    // Convert accumulator to internal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
+
+    // Convert to destination numeric type
+    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
+
+    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
+
+    // Perform binary operations
+    FragmentCompute intermediate;
+    multiplies<FragmentCompute> mul_accumulator;
+
+    intermediate = mul_accumulator(valpha, converted_accumulator);    // D = vector_alpha * Accum
+
+    return destination_converter(intermediate);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_bias_elementwise.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_bias_elementwise.h
new file mode 100755
index 000000000..c5ffdaa03
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_bias_elementwise.h
@@ -0,0 +1,524 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+  \brief Functor performing linear combination operations used by epilogues.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/functional.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/platform/platform.h"
+
+#include "cutlass/epilogue/thread/activation.h"
+#include "cutlass/epilogue/thread/scale_type.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Trait: value is Op::kIsHeavy when Op declares that member, and false otherwise.
+namespace { // (anonymous)
+template<class Op, class Enable = void>
+struct kIsHeavy_member_or_false {
+  static constexpr bool value = false;
+};
+template<class Op>
+struct kIsHeavy_member_or_false<Op, typename cutlass::platform::enable_if<Op::kIsHeavy>::type> {
+  static constexpr bool value = Op::kIsHeavy;
+};
+// The partial specialization is only selected when Op::kIsHeavy is well-formed and true.
+} // namespace (anonymous)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+/// Placeholder Arguments type used by the dispatcher for ops that declare no nested Arguments type
+struct EmptyArguments {};
+/// Primary template: chosen when T has no nested Arguments type; calls op(value) and ignores the EmptyArguments it is given
+template<class T, class = void>
+struct ElementwiseOpDispatcher {
+  using Arguments = EmptyArguments;
+
+  T op;
+
+  CUTLASS_HOST_DEVICE
+  ElementwiseOpDispatcher(Arguments) {}
+
+  template <typename ValueType>
+  CUTLASS_HOST_DEVICE
+  ValueType operator()(ValueType value) {
+    return op(value);
+  }
+};
+/// Specialization for ops that declare a nested Arguments type: stores a copy of the arguments and calls op(value, args)
+template<class T>
+struct ElementwiseOpDispatcher<T, std::void_t<typename T::Arguments>> {
+  using Arguments = typename T::Arguments;
+
+  Arguments args;
+  T op;
+
+  CUTLASS_HOST_DEVICE
+  ElementwiseOpDispatcher(Arguments args_):args(args_) {}
+
+  template <typename ValueType>
+  CUTLASS_HOST_DEVICE
+  ValueType operator()(ValueType value) {
+    return op(value, args);
+  }
+};
+
+} // namespace detail
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// This base class is meant to define the concept required of the
+/// EpilogueWithBroadcast::OutputOp
+template <
+  typename ElementC_,
+  typename ElementAccumulator_,
+  typename ElementCompute_,
+  typename ElementZ_,
+  typename ElementT_,
+  int ElementsPerAccess,
+  typename ElementwiseOp_ = Identity<ElementCompute_>,
+  typename BinaryOp_ = plus<ElementCompute_>,
+  bool StoreT_ = true,
+  typename ElementVector_ = ElementC_
+>
+class LinearCombinationBiasElementwise {
+public:
+
+  using ElementOutput = ElementC_;
+  using ElementD = ElementOutput;
+  using ElementC = ElementC_;
+  using ElementAccumulator = ElementAccumulator_;
+  using ElementCompute = ElementCompute_;
+  using ElementScalar = ElementCompute;
+  using ElementZ = ElementZ_;
+  using ElementT = ElementT_;
+  using ElementVector = ElementVector_;
+  static int const kElementsPerAccess = ElementsPerAccess;
+  static int const kCount = kElementsPerAccess;
+
+  /// Follow cutlass3x EVT aliases
+  static bool const IsEltActSupported = true;
+
+  using ElementwiseOp = ElementwiseOp_;
+  using BinaryOp = BinaryOp_;
+
+  using ElementwiseOpDispatcher = detail::ElementwiseOpDispatcher<ElementwiseOp>;
+  using ElementwiseArguments = typename ElementwiseOpDispatcher::Arguments;
+
+  // Indicates that this epilogue applies only one binary operation
+  static bool const kIsSingleSource = true;
+
+
+  using FragmentAccumulator = Array<ElementAccumulator, kElementsPerAccess>;
+  using FragmentCompute = Array<ElementCompute, kElementsPerAccess>;
+  using FragmentC = Array<ElementC, kElementsPerAccess>;
+  using FragmentZ = Array<ElementZ, kElementsPerAccess>;
+  using FragmentT = Array<ElementT, kElementsPerAccess>;
+
+  // Definitions needed for collective epilogue
+  using FragmentSource = FragmentC;
+  using FragmentOutput = FragmentZ;
+  using ElementBias = ElementVector;
+  using FragmentBias = Array<ElementBias, kElementsPerAccess>;
+  using ActivationFn = ElementwiseOp;
+  static const ScaleType::Kind kScale = ScaleType::Default;
+
+  static bool const kIsHeavy = kIsHeavy_member_or_false<ElementwiseOp>::value;
+
+  /// If true, the 'Z' tensor is stored
+  static bool const kStoreZ = true;
+
+  /// If true, the 'T' tensor is stored
+  static bool const kStoreT = StoreT_;
+
+  /// Host-constructable parameters structure
+  struct Params {
+
+    ElementCompute alpha;                  ///< scales accumulators
+    ElementCompute beta;                   ///< scales source tensor
+    ElementCompute const *alpha_ptr;       ///< pointer to accumulator scalar - if not null, loads it from memory
+    ElementCompute const *beta_ptr;        ///< pointer to source scalar - if not null, loads it from memory
+    ElementwiseArguments  elementwise;     ///< Arguments for elementwise operation
+
+    //
+    // Methods
+    //
+    /// Default: alpha = 1, beta = 0, null pointers, value-initialized elementwise arguments
+    CUTLASS_HOST_DEVICE
+    Params():
+      alpha(ElementCompute(1)),
+      beta(ElementCompute(0)),
+      alpha_ptr(nullptr),
+      beta_ptr(nullptr), elementwise() { }
+    /// alpha and beta by value, with optional elementwise arguments; null pointers
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute alpha,
+      ElementCompute beta,
+      ElementwiseArguments  elementwise_ = ElementwiseArguments{}
+    ): alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr), elementwise(elementwise_) {
+
+    }
+    /// alpha by value; beta = 0; null pointers; elementwise arguments value-initialized rather than left indeterminate
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute alpha
+    ): alpha(alpha), beta(0), alpha_ptr(nullptr), beta_ptr(nullptr), elementwise() {
+
+    }
+    /// alpha and beta through pointers, with optional elementwise arguments; by-value fields set to 0
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute const *alpha_ptr,
+      ElementCompute const *beta_ptr,
+      ElementwiseArguments  elementwise_ = ElementwiseArguments{}
+    ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr), elementwise(elementwise_) {
+
+    }
+    /// alpha through a pointer; beta_ptr null; by-value fields set to 0; elementwise arguments value-initialized
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute const *alpha_ptr
+    ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(nullptr), elementwise() {
+
+    }
+  };
+
+private:
+
+  //
+  // Data members
+  //
+
+  ElementCompute alpha_;
+  ElementCompute beta_;
+  ElementwiseArguments const &elementwise_;
+  bool skip_elementwise_;
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Constructor from Params: alpha and beta are read through their pointers when non-null, otherwise taken by value
+  CUTLASS_HOST_DEVICE
+  LinearCombinationBiasElementwise(Params const &params): elementwise_(params.elementwise) {
+    // NOTE(review): elementwise_ is a reference member bound to params.elementwise, so params must outlive this object
+    alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
+    beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
+    skip_elementwise_ = false;
+  }
+
+  /// Returns true if source is needed, i.e. when the stored beta is non-zero
+  CUTLASS_HOST_DEVICE
+  bool is_source_needed() const {
+    return beta_ != ElementCompute(0);
+  }
+
+  /// Functionally required for serial reduction in the epilogue
+  CUTLASS_HOST_DEVICE
+  void set_k_partition(int k_partition, int k_partition_count) {
+    if (k_partition) {
+      beta_ = ElementCompute(1);  // any non-zero partition index overwrites beta with 1
+    }
+    // Every partition except the last (k_partition_count - 1) skips the elementwise op
+    if (k_partition != k_partition_count - 1) {
+      skip_elementwise_ = true;
+    }
+  }
+
+  /// Applies the operation when elementwise_op requires arguments and is_source_needed() is true
+  template <typename ElementwiseArgs>
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentZ &frag_Z,
+    FragmentT &frag_T,
+    FragmentAccumulator const &AB,
+    FragmentC const &frag_C,
+    FragmentCompute const &V,
+    ElementwiseArgs const &elementwise_args) const {
+    // The op is default-constructed and receives the caller's elementwise_args; the stored elementwise_ is not used here
+    ElementwiseOp elementwise_op;
+    BinaryOp binary_op;
+
+    FragmentCompute tmp_Accum = NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess>()(AB);
+    FragmentCompute tmp_C = NumericArrayConverter<ElementCompute, ElementC, kElementsPerAccess>()(frag_C);
+    FragmentCompute result_Z;
+    FragmentCompute result_T;
+    // z = binary_op(alpha * AB + beta * C, V); T keeps z, Z gets elementwise_op(z, args) unless skip_elementwise_ is set
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kElementsPerAccess; ++i) {
+      ElementCompute z = binary_op(alpha_ * tmp_Accum[i] + beta_ * tmp_C[i], V[i]);
+      result_T[i] = z;
+      result_Z[i] = skip_elementwise_ ? z : elementwise_op(z, elementwise_args);
+    }
+
+    NumericArrayConverter<ElementZ, ElementCompute, kElementsPerAccess> convert_z;
+    frag_Z = convert_z(result_Z);
+    // frag_T is only written when kStoreT is true
+    if constexpr (kStoreT) {
+      NumericArrayConverter<ElementT, ElementCompute, kElementsPerAccess> convert_t;
+      frag_T = convert_t(result_T);
+    }
+  }
+
+  /// Applies the operation when elementwise_op requires arguments and is_source_needed() is false
+  template <typename ElementwiseArgs>
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentZ &frag_Z,
+    FragmentT &frag_T,
+    FragmentAccumulator const &AB,
+    FragmentCompute const &V,
+    ElementwiseArgs const &elementwise_args) const {
+    // The op is default-constructed and receives the caller's elementwise_args; the stored elementwise_ is not used here
+    ElementwiseOp elementwise_op;
+    BinaryOp binary_op;
+
+    FragmentCompute tmp_Accum = NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess>()(AB);
+    FragmentCompute result_Z;
+    FragmentCompute result_T;
+    // z = binary_op(alpha * AB, V); T keeps z, Z gets elementwise_op(z, args) unless skip_elementwise_ is set
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kElementsPerAccess; ++i) {
+      ElementCompute z = binary_op(alpha_ * tmp_Accum[i], V[i]);
+      result_T[i] = z;
+      result_Z[i] = skip_elementwise_ ? z : elementwise_op(z, elementwise_args);
+    }
+
+    NumericArrayConverter<ElementZ, ElementCompute, kElementsPerAccess> convert_z;
+    frag_Z = convert_z(result_Z);
+    // frag_T is only written when kStoreT is true
+    if constexpr (kStoreT) {
+      NumericArrayConverter<ElementT, ElementCompute, kElementsPerAccess> convert_t;
+      frag_T = convert_t(result_T);
+    }
+  }
+
+  /// Applies the operation when is_source_needed() is true
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentZ &frag_Z,
+    FragmentT &frag_T,
+    FragmentAccumulator const &AB,
+    FragmentC const &frag_C,
+    FragmentCompute const &V) const {
+    // The dispatcher is built from the stored elementwise_ arguments
+    ElementwiseOpDispatcher elementwise_op(elementwise_);
+    BinaryOp binary_op;
+
+    FragmentCompute tmp_Accum = NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess>()(AB);
+    FragmentCompute tmp_C = NumericArrayConverter<ElementCompute, ElementC, kElementsPerAccess>()(frag_C);
+    FragmentCompute result_Z;
+    FragmentCompute result_T;
+    // z = binary_op(alpha * AB + beta * C, V); T keeps z, Z gets elementwise_op(z) unless skip_elementwise_ is set
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kElementsPerAccess; ++i) {
+      ElementCompute z = binary_op(alpha_ * tmp_Accum[i] + beta_ * tmp_C[i], V[i]);
+      result_T[i] = z;
+      result_Z[i] = skip_elementwise_ ? z : elementwise_op(z);
+    }
+
+    NumericArrayConverter<ElementZ, ElementCompute, kElementsPerAccess> convert_z;
+    frag_Z = convert_z(result_Z);
+    // frag_T is only written when kStoreT is true
+    if constexpr (kStoreT) {
+      NumericArrayConverter<ElementT, ElementCompute, kElementsPerAccess> convert_t;
+      frag_T = convert_t(result_T);
+    }
+  }
+
+  /// Fragment overload without source C, used when is_source_needed() is false
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentZ &frag_Z,
+    FragmentT &frag_T,
+    FragmentAccumulator const &AB,
+    FragmentCompute const &V) const {
+
+    BinaryOp combine;
+    ElementwiseOpDispatcher activation(elementwise_);
+
+    FragmentCompute accum = NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess>()(AB);
+    FragmentCompute pre_act;     // becomes T: result of the binary op
+    FragmentCompute post_act;    // becomes Z: activation(T), or T itself when skipped
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < kElementsPerAccess; ++idx) {
+      ElementCompute t = combine(alpha_ * accum[idx], V[idx]);
+      pre_act[idx] = t;
+      post_act[idx] = skip_elementwise_ ? t : activation(t);
+    }
+
+    NumericArrayConverter<ElementZ, ElementCompute, kElementsPerAccess> to_z;
+    frag_Z = to_z(post_act);
+
+    if constexpr (kStoreT) {
+      NumericArrayConverter<ElementT, ElementCompute, kElementsPerAccess> to_t;
+      frag_T = to_t(pre_act);
+    }
+  }
+
+  /// Scalar overload for activations that take extra arguments, used when is_source_needed() is true
+  template <typename ElementwiseArgs>
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    ElementZ &Z,
+    ElementT &T,
+    ElementAccumulator const &AB,
+    ElementC const &C,
+    ElementCompute const &V,
+    ElementwiseArgs const &elementwise_args) const {
+
+    BinaryOp combine;
+    ElementwiseOp activation;
+
+    ElementCompute accum = NumericConverter<ElementCompute, ElementAccumulator>()(AB);
+    ElementCompute source = NumericConverter<ElementCompute, ElementC>()(C);
+
+    ElementCompute pre_act = combine(alpha_ * accum + beta_ * source, V);
+    ElementCompute post_act = skip_elementwise_ ? pre_act : activation(pre_act, elementwise_args);
+
+    NumericConverter<ElementZ, ElementCompute> to_z;
+    Z = to_z(post_act);
+
+    if constexpr (kStoreT) {
+      // T receives the value from before the elementwise op.
+      NumericConverter<ElementT, ElementCompute> to_t;
+      T = to_t(pre_act);
+    }
+  }
+
+  /// Scalar overload for activations that take extra arguments, used when is_source_needed() is false
+  template <typename ElementwiseArgs>
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    ElementZ &Z,
+    ElementT &T,
+    ElementAccumulator const &AB,
+    ElementCompute const &V,
+    ElementwiseArgs const &elementwise_args) const {
+
+    BinaryOp combine;
+    ElementwiseOp activation;
+
+    ElementCompute accum = NumericConverter<ElementCompute, ElementAccumulator>()(AB);
+
+    ElementCompute pre_act = combine(alpha_ * accum, V);
+    ElementCompute post_act = skip_elementwise_ ? pre_act : activation(pre_act, elementwise_args);
+
+    NumericConverter<ElementZ, ElementCompute> to_z;
+    Z = to_z(post_act);
+
+    if constexpr (kStoreT) {
+      // T receives the value from before the elementwise op.
+      NumericConverter<ElementT, ElementCompute> to_t;
+      T = to_t(pre_act);
+    }
+  }
+
+  /// Scalar overload with source C, used when is_source_needed() is true
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    ElementZ &Z,
+    ElementT &T,
+    ElementAccumulator const &AB,
+    ElementC const &C,
+    ElementCompute const &V) const {
+
+    BinaryOp combine;
+    ElementwiseOpDispatcher activation(elementwise_);
+
+    ElementCompute accum = NumericConverter<ElementCompute, ElementAccumulator>()(AB);
+    ElementCompute source = NumericConverter<ElementCompute, ElementC>()(C);
+
+    ElementCompute pre_act = combine(alpha_ * accum + beta_ * source, V);
+    ElementCompute post_act = skip_elementwise_ ? pre_act : activation(pre_act);
+
+    NumericConverter<ElementZ, ElementCompute> to_z;
+    Z = to_z(post_act);
+
+    if constexpr (kStoreT) {
+      // T receives the value from before the elementwise op.
+      NumericConverter<ElementT, ElementCompute> to_t;
+      T = to_t(pre_act);
+    }
+  }
+
+  /// Scalar overload without source C, used when is_source_needed() is false
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    ElementZ &Z,
+    ElementT &T,
+    ElementAccumulator const &AB,
+    ElementCompute const &V) const {
+
+    BinaryOp combine;
+    ElementwiseOpDispatcher activation(elementwise_);
+
+    ElementCompute accum = NumericConverter<ElementCompute, ElementAccumulator>()(AB);
+
+    ElementCompute pre_act = combine(alpha_ * accum, V);
+    ElementCompute post_act = skip_elementwise_ ? pre_act : activation(pre_act);
+
+    NumericConverter<ElementZ, ElementCompute> to_z;
+    Z = to_z(post_act);
+
+    if constexpr (kStoreT) {
+      // T receives the value from before the elementwise op.
+      NumericConverter<ElementT, ElementCompute> to_t;
+      T = to_t(pre_act);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_bias_relu.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_bias_relu.h
new file mode 100755
index 000000000..ead1123ca
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_bias_relu.h
@@ -0,0 +1,610 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Functor performing linear combination operations used by epilogues.
+*/
+
+#pragma once
+
+#include <cuda_fp16.h>
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/functional.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/epilogue/thread/activation.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+template <typename Element, int ElementsPerAccess>
+struct ArrayMaximum {
+  /// Generic element-wise maximum: fragment vs. fragment, and fragment vs. one scalar.
+  CUTLASS_HOST_DEVICE
+  Array<Element, ElementsPerAccess> operator()(
+    Array<Element, ElementsPerAccess>  const &lhs,
+    Array<Element, ElementsPerAccess>  const &rhs) const {
+    // NOTE(review): lhs[i].get() presumes the Array accessor yields a proxy exposing get() - confirm.
+    Array<Element, ElementsPerAccess> result;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < ElementsPerAccess; ++i) {
+      result[i] = platform::max(lhs[i].get(), rhs[i]);
+    }
+
+    return result;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Array<Element, ElementsPerAccess> operator()(
+    Array<Element, ElementsPerAccess>  const &lhs,
+    Element                                   rhs) const {
+    // Same as above, with every element of lhs compared against the single value rhs.
+    Array<Element, ElementsPerAccess> result;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < ElementsPerAccess; ++i) {
+      result[i] = platform::max(lhs[i].get(), rhs);
+    }
+
+    return result;
+  }
+};
+
+
+/// Partial specialization: Element=float
+template <int ElementsPerAccess>
+struct ArrayMaximum<float, ElementsPerAccess> {
+
+  using Fragment = Array<float, ElementsPerAccess>;
+
+  /// Element-wise fmax of two fragments.
+  CUTLASS_HOST_DEVICE
+  Fragment operator()(Fragment const &lhs, Fragment const &rhs) const {
+
+    Fragment out;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int k = 0; k < ElementsPerAccess; ++k) {
+      out[k] = fmax(lhs[k], rhs[k]);
+    }
+
+    return out;
+  }
+
+  /// Element-wise fmax of a fragment against one scalar.
+  CUTLASS_HOST_DEVICE
+  Fragment operator()(Fragment const &lhs, float rhs) const {
+
+    Fragment out;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int k = 0; k < ElementsPerAccess; ++k) {
+      out[k] = fmax(lhs[k], rhs);
+    }
+
+    return out;
+  }
+};
+
+/// Partial specialization: Element=half
+template <int ElementsPerAccess>
+struct ArrayMaximum<half_t, ElementsPerAccess> {
+
+  CUTLASS_DEVICE
+  Array<half_t, ElementsPerAccess> operator()(
+    Array<half_t, ElementsPerAccess>  const &lhs,
+    Array<half_t, ElementsPerAccess>  const &rhs) const {
+    // Element-wise maximum of lhs and rhs, computed on the raw __half storage of the fragments.
+    Array<half_t, ElementsPerAccess> result;
+
+    #if __CUDA_ARCH__ >= 800
+    int const kVectorCount = ElementsPerAccess / 2;
+
+    // Packed path: each __hmax2 call handles one __half2 pair, hence the even-count static_assert below.
+    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(lhs.raw_data());
+    __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(rhs.raw_data());
+    __half2       *res_ptr = reinterpret_cast<__half2 *>(result.raw_data());
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kVectorCount; ++i) {
+      res_ptr[i] = __hmax2(lhs_ptr[i], rhs_ptr[i]);
+    }
+
+    static_assert(!(ElementsPerAccess % 2), "Output array must be divisible by vector length.");
+
+    #else
+    __half const *lhs_ptr = reinterpret_cast<__half const *>(lhs.raw_data());
+    __half const *rhs_ptr = reinterpret_cast<__half const *>(rhs.raw_data());
+    __half       *res_ptr = reinterpret_cast<__half       *>(result.raw_data());
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < ElementsPerAccess; ++i) {
+      res_ptr[i] = ((lhs_ptr[i] < rhs_ptr[i]) ? rhs_ptr[i] : lhs_ptr[i]);
+    }
+
+    #endif
+
+    return result;
+  }
+
+  CUTLASS_DEVICE
+  Array<half_t, ElementsPerAccess> operator()(
+    Array<half_t, ElementsPerAccess>  const &lhs,
+    half_t const &rhs) const {
+    // Element-wise maximum of lhs against the single value rhs.
+    Array<half_t, ElementsPerAccess> result;
+
+    #if __CUDA_ARCH__ >= 800
+    int const kVectorCount = ElementsPerAccess / 2;
+
+    // Packed path: duplicate rhs into both halves of a __half2 so one __hmax2 covers two elements.
+    __half rhs_raw = reinterpret_cast<__half const &>(rhs);
+    __half2 rhs_pair = __half2half2(rhs_raw);
+
+    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(lhs.raw_data());
+    __half2       *res_ptr = reinterpret_cast<__half2 *>(result.raw_data());
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kVectorCount; ++i) {
+      res_ptr[i] = __hmax2(lhs_ptr[i], rhs_pair);
+    }
+
+    static_assert(!(ElementsPerAccess % 2), "Output array must be divisible by vector length.");
+
+    #else
+    // Fallback path: one scalar comparison per element.
+    __half const *lhs_ptr = reinterpret_cast<__half const *>(lhs.raw_data());
+    __half const  rhs_raw = reinterpret_cast<__half const &>(rhs);
+    __half       *res_ptr = reinterpret_cast<__half       *>(result.raw_data());
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < ElementsPerAccess; ++i) {
+      res_ptr[i] = ((lhs_ptr[i] < rhs_raw) ? rhs_raw : lhs_ptr[i]);
+    }
+
+    #endif
+
+    return result;
+  }
+};
+
+/// Partial specialization: Element=bfloat16_t
+template <int ElementsPerAccess>
+struct ArrayMaximum<bfloat16_t, ElementsPerAccess> {
+
+  using NvType   = __nv_bfloat16;
+  using NvTypeV2 = __nv_bfloat162;
+
+  CUTLASS_DEVICE
+  Array<bfloat16_t, ElementsPerAccess> operator()(
+    Array<bfloat16_t, ElementsPerAccess>  const &lhs,
+    Array<bfloat16_t, ElementsPerAccess>  const &rhs) const {
+    // Element-wise maximum of lhs and rhs, computed on the raw bfloat16 storage of the fragments.
+    Array<bfloat16_t, ElementsPerAccess> result;
+
+    #if __CUDA_ARCH__ >= 800
+    int const kVectorCount = ElementsPerAccess / 2;
+    // The packed loop writes 2 * kVectorCount elements, so an odd count would leave one element unset.
+    static_assert(!(ElementsPerAccess % 2), "Output array must be divisible by vector length.");
+    NvTypeV2 const *lhs_ptr = reinterpret_cast<NvTypeV2 const *>(lhs.raw_data());
+    NvTypeV2 const *rhs_ptr = reinterpret_cast<NvTypeV2 const *>(rhs.raw_data());
+    NvTypeV2       *res_ptr = reinterpret_cast<NvTypeV2 *>(result.raw_data());
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kVectorCount; ++i) {
+      res_ptr[i] = __hmax2(lhs_ptr[i], rhs_ptr[i]);
+    }
+
+    #else
+    NvType const *lhs_ptr = reinterpret_cast<NvType const *>(lhs.raw_data());
+    NvType const *rhs_ptr = reinterpret_cast<NvType const *>(rhs.raw_data());
+    NvType       *res_ptr = reinterpret_cast<NvType       *>(result.raw_data());
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < ElementsPerAccess; ++i) {
+      res_ptr[i] = ((lhs_ptr[i] < rhs_ptr[i]) ? rhs_ptr[i] : lhs_ptr[i]);
+    }
+
+    #endif
+
+    return result;
+  }
+
+  CUTLASS_DEVICE
+  Array<bfloat16_t, ElementsPerAccess> operator()(
+    Array<bfloat16_t, ElementsPerAccess>  const &lhs,
+    bfloat16_t                                   rhs) const {
+    // Element-wise maximum of lhs against the single value rhs.
+    Array<bfloat16_t, ElementsPerAccess> result;
+
+    #if __CUDA_ARCH__ >= 800
+    int const kVectorCount = ElementsPerAccess / 2;
+
+    // Packed path: duplicate rhs into both halves of a pair so one __hmax2 covers two elements.
+    NvType rhs_raw = reinterpret_cast<NvType const &>(rhs);
+    NvTypeV2 rhs_pair = __bfloat162bfloat162(rhs_raw);
+
+    NvTypeV2 const *lhs_ptr = reinterpret_cast<NvTypeV2 const *>(lhs.raw_data());
+    NvTypeV2       *res_ptr = reinterpret_cast<NvTypeV2 *>(result.raw_data());
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kVectorCount; ++i) {
+      res_ptr[i] = __hmax2(lhs_ptr[i], rhs_pair);
+    }
+
+    static_assert(!(ElementsPerAccess % 2), "Output array must be divisible by vector length.");
+
+    #else
+    // Fallback path: one scalar comparison per element.
+    NvType const *lhs_ptr = reinterpret_cast<NvType const *>(lhs.raw_data());
+    NvType const  rhs_raw = reinterpret_cast<NvType const &>(rhs);
+    NvType       *res_ptr = reinterpret_cast<NvType       *>(result.raw_data());
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < ElementsPerAccess; ++i) {
+      res_ptr[i] = ((lhs_ptr[i] < rhs_raw) ? rhs_raw : lhs_ptr[i]);
+    }
+
+    #endif
+
+    return result;
+  }
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Element, int ElementsPerAccess>
+struct ReluConditional {
+  /// Writes conditional[i] = true for every element of fragment that is not less than threshold.
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    bool conditional[],
+    Array<Element, ElementsPerAccess> const &fragment, 
+    Element threshold) const {
+    // The predicate is the negation of operator<, not operator>=; conditional[] needs ElementsPerAccess slots.
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < ElementsPerAccess; ++i) {
+      conditional[i] = !(fragment[i] < threshold);
+    }
+  }
+};
+
+template <int ElementsPerAccess>
+struct ReluConditional<half_t, ElementsPerAccess> {
+
+  CUTLASS_DEVICE
+  void operator()(
+    bool conditional[],
+    Array<half_t, ElementsPerAccess> const &fragment,
+    half_t threshold) const {
+    // View both operands as native __half so the __hlt intrinsic can compare them.
+    __half const *values = reinterpret_cast<__half const *>(fragment.raw_data());
+    __half limit = reinterpret_cast<__half const &>(threshold);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int k = 0; k < ElementsPerAccess; ++k) {
+      conditional[k] = !__hlt(values[k], limit);
+    }
+  }
+};
+
+template <int ElementsPerAccess>
+struct ReluConditional<bfloat16_t, ElementsPerAccess> {
+
+  CUTLASS_DEVICE
+  void operator()(
+    bool conditional[],
+    Array<bfloat16_t, ElementsPerAccess> const &fragment,
+    bfloat16_t threshold) const {
+    // View both operands as native __nv_bfloat16 so the __hlt intrinsic can compare them.
+    __nv_bfloat16 const *values = reinterpret_cast<__nv_bfloat16 const *>(fragment.raw_data());
+    __nv_bfloat16 limit = reinterpret_cast<__nv_bfloat16 const &>(threshold);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int k = 0; k < ElementsPerAccess; ++k) {
+      conditional[k] = !__hlt(values[k], limit);
+    }
+  }
+};
+
+} // namespace detail
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Fused bias + ReLU epilogue functor (declared here as a primary class template). It supports the option of
+/// packing ReLU conditionals in a bit vector that may be used by backwards passes as an optimization.
+///
+/// This class can only be used with cutlass::epilogue::threadblock::EpilogueWithBroadcast<>.
+///
+/// This base class is meant to define the concept required of the
+/// EpilogueWithBroadcast::OutputOp
+template <
+  typename ElementC_,
+  typename ElementAccumulator_,
+  typename ElementCompute_,
+  typename ElementZ_,
+  int ElementsPerAccess,
+  bool StoreT_ = true,
+  typename ElementVector_ = ElementC_
+>
+class LinearCombinationBiasRelu {
+public:
+
+  using ElementOutput = ElementC_;
+  using ElementC = ElementC_;
+  using ElementAccumulator = ElementAccumulator_;
+  using ElementCompute = ElementCompute_;
+  using ElementZ = ElementZ_;
+  using ElementVector = ElementVector_;
+  /// One-bit element type: frag_T receives the packed ReLU predicates
+  using ElementT = uint1b_t;
+
+  static int const kElementsPerAccess = ElementsPerAccess;
+  static int const kCount = kElementsPerAccess;
+
+  using ElementwiseOp = ReLu<ElementCompute>;
+  using BinaryOp = plus<ElementCompute>;
+
+  // Indicates that this epilogue applies only one binary operation
+  static bool const kIsSingleSource = true;
+
+  using FragmentAccumulator = Array<ElementAccumulator, kElementsPerAccess>;
+  using FragmentCompute = Array<ElementCompute, kElementsPerAccess>;
+  using FragmentC = Array<ElementOutput, kElementsPerAccess>;
+  using FragmentZ = Array<ElementZ, kElementsPerAccess>;
+  using FragmentT = Array<ElementT, kElementsPerAccess>;
+
+  /// If true, the 'Z' tensor is stored
+  static bool const kStoreZ = true;
+
+  /// If true, the 'T' tensor is stored
+  static bool const kStoreT = StoreT_;
+
+  /// Host-constructable parameters structure
+  struct Params {
+
+    ElementCompute alpha;                  ///< scales accumulators
+    ElementCompute beta;                   ///< scales source tensor
+    ElementCompute const *alpha_ptr;       ///< pointer to accumulator scalar - if not null, loads it from memory
+    ElementCompute const *beta_ptr;        ///< pointer to source scalar - if not null, loads it from memory
+    ElementZ threshold;                    ///< ReLu threshold
+
+    //
+    // Methods
+    //
+    // Every constructor leaves `threshold` stored as ElementZ: it is either converted from an
+    // ElementCompute value or value-initialized with ElementZ().
+    //
+
+    CUTLASS_HOST_DEVICE
+    Params(): 
+      alpha(ElementCompute(1)), 
+      beta(ElementCompute()), 
+      alpha_ptr(nullptr), 
+      beta_ptr(nullptr),
+      threshold(ElementCompute()) { }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute alpha,
+      ElementCompute beta,
+      ElementCompute threshold_ = ElementCompute()
+    ): 
+      alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) {
+
+      NumericConverter<ElementZ, ElementCompute> convert_threshold;
+
+      threshold = convert_threshold(threshold_);
+    }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute alpha
+    ): alpha(alpha), beta(0), alpha_ptr(nullptr), beta_ptr(nullptr), threshold(ElementZ()) {
+
+    }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute const *alpha_ptr,
+      ElementCompute const *beta_ptr,
+      ElementCompute threshold_ = ElementCompute()
+    ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {
+
+      NumericConverter<ElementZ, ElementCompute> convert_threshold;
+
+      threshold = convert_threshold(threshold_);
+    }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute const *alpha_ptr
+    ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(nullptr), threshold(ElementZ()) {
+    }
+
+  };
+
+private:
+
+  //
+  // Data members
+  //
+
+  ElementCompute alpha_;
+  ElementCompute beta_;
+  ElementZ threshold_;
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Constructor from Params: a non-null alpha_ptr / beta_ptr is dereferenced in place of the scalar
+  CUTLASS_HOST_DEVICE
+  LinearCombinationBiasRelu(Params const &params) {
+
+    alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
+    beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
+    threshold_ = params.threshold;
+  }
+
+  /// Returns true if source is needed
+  CUTLASS_HOST_DEVICE
+  bool is_source_needed() const {
+    return beta_ != ElementCompute(0);
+  }
+
+  /// Functionally required for serial reduction in the epilogue
+  CUTLASS_HOST_DEVICE
+  void set_k_partition(int k_partition, int k_partition_count) {
+    if (k_partition) {
+      beta_ = ElementCompute(1);
+    }
+    // The threshold is overwritten with an all-ones bit pattern reinterpreted as ElementZ.
+    if (k_partition != k_partition_count - 1) {
+      // set to NaN to make ReLU no-op for all except last k partitions
+      int64_t allones = -1;
+      threshold_ = reinterpret_cast<ElementZ const &>(allones);
+    }
+  }
+
+  /// Applies the operation when is_source_needed() is true
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentZ &frag_Z, 
+    FragmentT &frag_T, 
+    FragmentAccumulator const &AB,
+    FragmentC const &frag_C,
+    FragmentCompute const &V) const {
+
+    BinaryOp binary_op;
+
+    FragmentCompute tmp_Accum = NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess>()(AB);
+    FragmentCompute tmp_C = NumericArrayConverter<ElementCompute, ElementC, kElementsPerAccess>()(frag_C);
+    FragmentCompute result_Z;
+
+    bool conditions[kElementsPerAccess];
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kElementsPerAccess; ++i) {
+
+      ElementCompute z = alpha_ * tmp_Accum[i];
+      z += beta_ * tmp_C[i];
+
+      z = binary_op(z, V[i]);
+      result_Z[i] = z;
+    }
+
+    NumericArrayConverter<ElementZ, ElementCompute, kElementsPerAccess> convert_z;
+    frag_Z = convert_z(result_Z);
+
+    //
+    // Compute condition (on the values already converted to ElementZ)
+    //
+
+    detail::ReluConditional<ElementZ, kElementsPerAccess> relu_conditional;
+    relu_conditional(conditions, frag_Z, threshold_);
+
+    detail::ArrayMaximum<ElementZ, kElementsPerAccess> maximum_op;
+    frag_Z = maximum_op(frag_Z, threshold_);
+
+    if (kStoreT) {
+      PackPredicates<kElementsPerAccess> pack_predicates;
+      frag_T = pack_predicates(conditions); 
+    }
+  }
+
+  /// Applies the operation when is_source_needed() is false
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentZ &frag_Z, 
+    FragmentT &frag_T, 
+    FragmentAccumulator const &AB,
+    FragmentCompute const &V) const {
+
+    BinaryOp binary_op;
+
+    FragmentCompute tmp_Accum = NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess>()(AB);
+    FragmentCompute result_Z;
+
+    bool conditions[kElementsPerAccess];
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kElementsPerAccess; ++i) {
+      ElementCompute z = binary_op(alpha_ * tmp_Accum[i], V[i]);
+      result_Z[i] = z;
+    }
+
+    NumericArrayConverter<ElementZ, ElementCompute, kElementsPerAccess> convert_z;
+    frag_Z = convert_z(result_Z);
+
+    //
+    // Compute condition (on the values already converted to ElementZ)
+    //
+
+    detail::ReluConditional<ElementZ, kElementsPerAccess> relu_conditional;
+    relu_conditional(conditions, frag_Z, threshold_);
+
+    detail::ArrayMaximum<ElementZ, kElementsPerAccess> maximum_op;
+    frag_Z = maximum_op(frag_Z, threshold_);
+
+    //
+    // frag_Z now holds max(Z, threshold_); conditions[] holds the per-element predicates
+    //
+
+    //
+    // Store
+    //
+    if (kStoreT) {
+      PackPredicates<kElementsPerAccess> pack_predicates;
+      frag_T = pack_predicates(conditions);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_clamp.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_clamp.h
new file mode 100755
index 000000000..aad9b5238
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_clamp.h
@@ -0,0 +1,685 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Functor performing linear scaling operations used by epilogues. Values are clamped before
+         converting to the output element type.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/functional.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/epilogue/thread/scale_type.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+/// Single source of truth for whether to unroll for `LinearCombinationClamp()`; feeds kIsHeavy, always false
+constexpr bool LinearCombinationClampIsHeavy() {
+  return false;
+}
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Applies a linear combination operator to an array of elements then clamps the output before
+/// converting to the output element type.
+///
+/// D = alpha * accumulator + beta * source + uniform
+///
+template <
+  typename ElementOutput_,                             ///< Data type used to load and store tensors
+  int Count,                                           ///< Number of elements computed per operation
+                                                       ///< Usually it is 128/sizeof_bits<ElementOutput_>,
+                                                       ///< but we use 64 or 32 sometimes when there are not enough data to store
+  typename ElementAccumulator_ = ElementOutput_,       ///< Accumulator data type
+  typename ElementCompute_ = ElementOutput_,           ///< Data type used to compute linear combination
+  ScaleType::Kind Scale = ScaleType::Default,          ///< Control Alpha and Beta scaling
+  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
+>
+class LinearCombinationClamp {
+public:
+
+  using ElementOutput = ElementOutput_;
+  using ElementAccumulator = ElementAccumulator_;
+  using ElementCompute = ElementCompute_;
+
+  static int const kCount = Count;
+
+  using FragmentOutput = Array<ElementOutput, kCount>;
+  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
+  using ComputeFragment = Array<ElementCompute, kCount>;
+  using FragmentSource = Array<ElementOutput, kCount>;
+
+  static FloatRoundStyle const kRound = Round;
+
+  static bool const kIsHeavy = detail::LinearCombinationClampIsHeavy();
+
+  /// Host-constructable parameters structure
+  struct Params {
+
+    ElementCompute alpha;                  ///< scales accumulators
+    ElementCompute beta;                   ///< scales source tensor
+    ElementCompute const *alpha_ptr;       ///< pointer to accumulator scalar - if not null, loads it from memory
+    ElementCompute const *beta_ptr;        ///< pointer to source scalar - if not null, loads it from memory
+
+    //
+    // Methods: the default constructor sets alpha = 1, beta = 0 and both pointers to null
+    //
+
+    CUTLASS_HOST_DEVICE
+    Params(): 
+      alpha(ElementCompute(1)), 
+      beta(ElementCompute(0)), 
+      alpha_ptr(nullptr), 
+      beta_ptr(nullptr) { }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute alpha,
+      ElementCompute beta
+    ): alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) {
+      // Scalars supplied by value; the pointer fields stay null.
+    }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute alpha
+    ): alpha(alpha), beta(0), alpha_ptr(nullptr), beta_ptr(nullptr) {
+      // Alpha by value, beta fixed to 0.
+    }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute const *alpha_ptr,
+      ElementCompute const *beta_ptr
+    ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {
+      // Scalars supplied through pointers; the by-value fields are zeroed.
+    }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute const *alpha_ptr
+    ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(nullptr) {
+      // Alpha through a pointer; beta is 0 and has no pointer.
+    }
+  };
+
+private:
+
+  //
+  // Data members
+  //
+
+  ElementCompute alpha_;
+  ElementCompute beta_;
+
+public:
+
+  /// Constructs the function object; a non-null alpha_ptr / beta_ptr is dereferenced in place of the scalar
+  CUTLASS_HOST_DEVICE
+  LinearCombinationClamp(Params const &params) {
+    // Each scalar: the pointed-to value when its pointer is set, otherwise the by-value field.
+    alpha_ = (params.alpha_ptr != nullptr) ? *params.alpha_ptr : params.alpha;
+    beta_ = (params.beta_ptr != nullptr) ? *params.beta_ptr : params.beta;
+  }
+
+  /// Returns true if source is needed
+  CUTLASS_HOST_DEVICE
+  bool is_source_needed() const {
+    // Scale is a template argument, so the scale-type checks are fixed per instantiation.
+    if (Scale == ScaleType::NoBetaScaling) return true;
+    if (Scale == ScaleType::OnlyAlphaScaling || Scale == ScaleType::Nothing) {
+      return false;
+    }
+    // Remaining scale kinds: the source matters only for a non-zero beta.
+    return beta_ != ElementCompute(0);
+  }
+
+  /// Functionally required for serial reduction in the epilogue
+  CUTLASS_HOST_DEVICE
+  void set_k_partition(int k_partition, int k_partition_count) {
+    // Partition 0 keeps the configured beta; k_partition_count is not consulted here.
+    bool const later_partition = (k_partition != 0);
+    if (later_partition) { beta_ = ElementCompute(1); }
+  }
+
+  /// Computes the clamped linear scaling: D = clamp(alpha * accumulator + beta * source)
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(
+    FragmentAccumulator const &accumulator,
+    FragmentOutput const &source,
+    ElementCompute uniform = ElementCompute(0)) const {
+
+    // NOTE(review): `uniform` is never read in this body, so it does not affect the result.
+
+    // Convert both operands to the internal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementOutput, kCount, Round> to_compute_source;
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> to_compute_accum;
+
+    ComputeFragment source_c = to_compute_source(source);
+    ComputeFragment accum_c = to_compute_accum(accumulator);
+
+    // Functors for the scaling step and for the two clamp bounds
+    multiplies<ComputeFragment> scale_op;
+    multiply_add<ComputeFragment> fma_op;
+    maximum<ComputeFragment> clamp_low;
+    minimum<ComputeFragment> clamp_high;
+
+    // Scale is a template argument, so the branch taken is fixed per instantiation
+    ComputeFragment scaled;
+
+    if (Scale == ScaleType::Nothing) {
+      scaled = accum_c;                                 // D = Accum
+    } else if (Scale == ScaleType::NoBetaScaling) {
+      scaled = fma_op(alpha_, accum_c, source_c);       // D = alpha * Accum + C
+    } else {
+      scaled = scale_op(beta_, source_c);               // X = beta * C
+      scaled = fma_op(alpha_, accum_c, scaled);         // D = alpha * Accum + X
+    }
+
+    // Clamp bounds: the limits of ElementOutput, expressed in ElementCompute
+    ElementCompute const kClampMin =
+        ElementCompute(cutlass::platform::numeric_limits<ElementOutput>::lowest());
+
+    ElementCompute const kClampMax =
+        ElementCompute(cutlass::platform::numeric_limits<ElementOutput>::max());
+
+    scaled = clamp_low(scaled, kClampMin);
+    scaled = clamp_high(scaled, kClampMax);
+
+    // Convert to destination numeric type
+    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> to_output;
+
+    return to_output(scaled);
+  }
+
+  /// Computes the clamped scaling without a source: D = clamp(alpha * accumulator)
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(
+    FragmentAccumulator const &accumulator) const {
+
+    // Bring the accumulators into the compute type
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accum_to_compute;
+
+    ComputeFragment accum_c = accum_to_compute(accumulator);
+
+    // Element-wise functors applied to whole fragments
+    multiplies<ComputeFragment> scale;
+    maximum<ComputeFragment> clamp_below;
+    minimum<ComputeFragment> clamp_above;
+
+    // ScaleType::Nothing forwards the accumulators untouched; every other
+    // mode multiplies them by alpha
+    ComputeFragment result;
+
+    if (Scale != ScaleType::Nothing) {
+      result = scale(alpha_, accum_c);                       // D = alpha * Accum
+    } else {
+      result = accum_c;                                      // D = Accum
+    }
+
+    // Bounds of the output type, expressed in the compute type
+    ElementCompute const lower_bound =
+        ElementCompute(cutlass::platform::numeric_limits<ElementOutput>::lowest());
+    ElementCompute const upper_bound =
+        ElementCompute(cutlass::platform::numeric_limits<ElementOutput>::max());
+
+    // Clamp from below first, then from above
+    result = clamp_below(result, lower_bound);
+    result = clamp_above(result, upper_bound);
+
+    // Convert to the destination numeric type
+    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> compute_to_output;
+
+    return compute_to_output(result);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Conditional guards to enable partial specialization for packed integers
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 720) && ((__CUDACC_VER_MAJOR__ > 10) || ((__CUDACC_VER_MAJOR__ >= 10) && (__CUDACC_VER_MINOR__ >= 2)))
+
+/// Applies a linear combination operator to an array of elements then clamps the output before
+/// converting to the output element type.
+///
+/// D = alpha * accumulator + beta * source   (the `uniform` argument is accepted but not added)
+/// Partial specialization for int accumulators with float compute and an integer output type.
+template <
+  typename ElementOutput_,                             ///< Data type used to load and store tensors
+  int Count,                                           ///< Number of elements computed per operation
+  ScaleType::Kind Scale,                               ///< Control Alpha and Beta scaling
+  FloatRoundStyle Round                                ///< Rounding style forwarded to the numeric converters
+>
+class LinearCombinationClamp<ElementOutput_, Count, int, float, Scale, Round> {
+public:
+
+  using ElementOutput = ElementOutput_;
+  using ElementAccumulator = int;
+  using ElementCompute = float;
+
+  static_assert(
+      cutlass::platform::numeric_limits<ElementOutput>::is_integer,
+      "This elementwise op expects the output to be int.");
+
+  static int const kCount = Count;
+
+  using FragmentOutput = Array<ElementOutput, kCount>;
+  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
+  using ComputeFragment = Array<ElementCompute, kCount>;
+
+  static FloatRoundStyle const kRound = Round;
+
+  static bool const kIsHeavy = detail::LinearCombinationClampIsHeavy();
+
+  /// Host-constructable parameters structure
+  struct Params {
+
+    ElementCompute alpha;                  ///< scales accumulators
+    ElementCompute beta;                   ///< scales source tensor
+    ElementCompute const *alpha_ptr;       ///< pointer to accumulator scalar - if not null, loads it from memory
+    ElementCompute const *beta_ptr;        ///< pointer to source scalar - if not null, loads it from memory
+
+    //
+    // Methods
+    //
+
+    CUTLASS_HOST_DEVICE
+    Params(): 
+      alpha(ElementCompute(1)), 
+      beta(ElementCompute(0)), 
+      alpha_ptr(nullptr), 
+      beta_ptr(nullptr) { }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute alpha,
+      ElementCompute beta
+    ): alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) {
+
+    }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute alpha
+    ): alpha(alpha), beta(0), alpha_ptr(nullptr), beta_ptr(nullptr) {
+
+    }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute const *alpha_ptr,
+      ElementCompute const *beta_ptr
+    ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {
+
+    }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute const *alpha_ptr
+    ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(nullptr) {
+
+    }
+  };
+
+private:
+
+  //
+  // Data members
+  //
+
+  ElementCompute alpha_;
+  ElementCompute beta_;
+
+public:
+
+  /// Constructs the function object, possibly loading from pointers in host memory
+  CUTLASS_HOST_DEVICE
+  LinearCombinationClamp(Params const &params) {
+    // A non-null scalar pointer takes precedence over the corresponding by-value scalar
+    alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
+    beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
+  }
+
+  /// Returns true if source is needed
+  CUTLASS_HOST_DEVICE
+  bool is_source_needed() const {
+    if (Scale == ScaleType::NoBetaScaling) return true;
+
+    if (Scale == ScaleType::OnlyAlphaScaling) return false;
+
+    if (Scale == ScaleType::Nothing) return false;
+
+    return beta_ != ElementCompute(0);
+  }
+
+  /// Functionally required for serial reduction in the epilogue
+  CUTLASS_HOST_DEVICE
+  void set_k_partition(int k_partition, int k_partition_count) {
+    if (k_partition) {
+      beta_ = ElementCompute(1);
+    }
+  }
+  
+  /// Computes linear scaling: D = alpha * accumulator + beta * source
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(
+    FragmentAccumulator const &accumulator, 
+    FragmentOutput const &source,
+    ElementCompute uniform = ElementCompute(0)) const {
+
+    // Convert source and accumulator to internal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementOutput, kCount, Round> source_converter;
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
+
+    ComputeFragment converted_source = source_converter(source);
+    ComputeFragment converted_accumulator = accumulator_converter(accumulator);
+
+    // Compute linear scaling in floating point
+    ComputeFragment intermediate;
+
+    multiplies<ComputeFragment> mul_add_source;
+    multiply_add<ComputeFragment> mul_add_accumulator;
+    
+    // Select the scaling form from Scale (no explicit min/max is applied in this overload)
+    if (Scale == ScaleType::NoBetaScaling) {
+      intermediate = converted_source;
+      intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
+    } else if (Scale == ScaleType::Nothing) {
+      intermediate = converted_accumulator;
+    } else {
+      intermediate = mul_add_source(beta_, converted_source);                             // X =  beta * C
+      intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
+    }
+
+    //
+    // Convert float => ElementOutput_; any clamping is left to the converter
+    //
+    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
+
+    return destination_converter(intermediate);
+  }
+
+  /// Computes linear scaling: D = alpha * accumulator
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(FragmentAccumulator const &accumulator) const {
+
+    // Convert accumulator to internal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
+
+    ComputeFragment converted_accumulator = accumulator_converter(accumulator);
+
+    // Compute linear scaling in floating point
+    ComputeFragment intermediate;
+
+    multiplies<ComputeFragment> mul_add_accumulator;
+    
+    // Select the scaling form from Scale (no explicit min/max is applied in this overload)
+    if (Scale == ScaleType::Nothing) {
+      intermediate = converted_accumulator;
+    } else {
+      intermediate = mul_add_accumulator(alpha_, converted_accumulator);    // D = alpha * Accum
+    }
+
+    //
+    // Convert float => ElementOutput_; any clamping is left to the converter
+    //
+    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
+
+    return destination_converter(intermediate);
+  }
+};
+
+#endif // Conditional guards to enable partial specialization for packed integers
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Applies a linear combination operator to an array of elements then clamps
+/// the output before converting to the output element type.
+///
+/// D = alpha * accumulator + beta * source   (`uniform` is accepted but not added)
+///
+/// Note: The method below only works when problem_size_K <= 256 for signed int8 gemm
+/// or problem_size_K <= 128 for unsigned int8 gemm. The default approach is
+/// above.
+/// TODO: Add logic to fallback to the default approach
+template <
+    /// Data type used to load and store tensors
+    typename ElementOutput_,
+    /// Number of elements computed per operation
+    int Count,
+    /// Control Alpha and Beta scaling
+    ScaleType::Kind Scale = ScaleType::Default,
+    /// Rounding mode
+    FloatRoundStyle Round = FloatRoundStyle::round_to_nearest>
+class FastLinearCombinationClamp {
+ public:
+  using ElementOutput = ElementOutput_;
+  using ElementAccumulator = int;
+  using ElementCompute = float;
+
+  static_assert(
+      cutlass::platform::numeric_limits<ElementOutput>::is_integer,
+      "This elementwise op expects the output to be int.");
+
+  static int const kCount = Count;
+
+  using FragmentOutput = Array<ElementOutput, kCount>;
+  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
+  using ComputeFragment = Array<ElementCompute, kCount>;
+
+  static FloatRoundStyle const kRound = Round;
+
+  static bool const kIsHeavy = false;
+
+  /// Host-constructable parameters structure
+  struct Params {
+    /// scales accumulators
+    ElementCompute alpha;
+    /// scales source tensor
+    ElementCompute beta;
+    /// pointer to accumulator scalar - if not null, loads it from memory
+    ElementCompute const *alpha_ptr;
+    /// pointer to source scalar - if not null, loads it from memory
+    ElementCompute const *beta_ptr;
+
+    //
+    // Methods
+    //
+
+    CUTLASS_HOST_DEVICE
+    Params()
+        : alpha(ElementCompute(1)),
+          beta(ElementCompute(0)),
+          alpha_ptr(nullptr),
+          beta_ptr(nullptr) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(ElementCompute alpha, ElementCompute beta)
+        : alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(ElementCompute alpha)
+        : alpha(alpha), beta(0), alpha_ptr(nullptr), beta_ptr(nullptr) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(ElementCompute const *alpha_ptr, ElementCompute const *beta_ptr)
+        : alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(ElementCompute const *alpha_ptr)
+        : alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(nullptr) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  ElementCompute alpha_;
+  ElementCompute beta_;
+
+ public:
+  /// Constructs the function object; a non-null scalar pointer takes
+  /// precedence over the corresponding by-value scalar
+  CUTLASS_HOST_DEVICE
+  FastLinearCombinationClamp(Params const &params) {
+    alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
+    beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
+  }
+
+  /// Returns true if source is needed
+  CUTLASS_HOST_DEVICE
+  bool is_source_needed() const {
+    if (Scale == ScaleType::NoBetaScaling) return true;
+
+    if (Scale == ScaleType::OnlyAlphaScaling) return false;
+
+    if (Scale == ScaleType::Nothing) return false;
+
+    return beta_ != ElementCompute(0);
+  }
+
+  /// Functionally required for serial reduction in the epilogue
+  CUTLASS_HOST_DEVICE
+  void set_k_partition(int k_partition, int k_partition_count) {
+    if (k_partition) {
+      beta_ = ElementCompute(1);
+    }
+  }
+
+  /// Computes linear scaling: D = alpha * accumulator + beta * source
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(FragmentAccumulator const &accumulator,
+                            FragmentOutput const &source,
+                            ElementCompute uniform = ElementCompute(0)) const {
+    // Convert source and accumulator to internal compute numeric type
+    FastNumericArrayConverter<ElementCompute, ElementOutput, kCount, Round>
+        source_converter;
+    FastNumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round>
+        accumulator_converter;
+
+    ComputeFragment converted_source = source_converter(source);
+    ComputeFragment converted_accumulator = accumulator_converter(accumulator);
+
+    // Compute linear scaling in floating point
+    ComputeFragment intermediate;
+
+    multiplies<ComputeFragment> mul_add_source;
+    multiply_add<ComputeFragment> mul_add_accumulator;
+
+    minimum<ComputeFragment> min_accumulator;
+    maximum<ComputeFragment> max_accumulator;
+
+    // Linear scaling selected by Scale; the float min/max clamp follows
+    if (Scale == ScaleType::NoBetaScaling) {
+      intermediate = converted_source;
+      intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
+    } else if (Scale == ScaleType::Nothing) {
+      intermediate = converted_accumulator;
+    } else {
+      intermediate =
+          mul_add_source(beta_, converted_source);  // X =  beta * C
+      intermediate = mul_add_accumulator(alpha_, converted_accumulator,
+                                         intermediate);  // D = alpha * Accum + X
+    }
+
+    /// Clamping constant 2^(bits - 1); shifted as unsigned 64-bit so wide outputs cannot overflow int
+    ElementCompute const kClamp =
+        ElementCompute(1ull << (sizeof_bits<ElementOutput>::value - 1));
+
+    intermediate = max_accumulator(intermediate, -kClamp);
+    intermediate = min_accumulator(intermediate, kClamp - ElementCompute(1));
+
+    // Convert to destination numeric type
+    FastNumericArrayConverter<ElementOutput, ElementCompute, kCount, Round>
+        destination_converter;
+
+    return destination_converter(intermediate);
+  }
+
+  /// Computes linear scaling: D = alpha * accumulator
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(FragmentAccumulator const &accumulator) const {
+
+    // Convert accumulator to internal compute numeric type
+    FastNumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round>
+        accumulator_converter;
+
+    ComputeFragment converted_accumulator = accumulator_converter(accumulator);
+
+    // Compute linear scaling in floating point
+    ComputeFragment intermediate;
+
+    multiplies<ComputeFragment> mul_accumulator;
+
+    minimum<ComputeFragment> min_accumulator;
+    maximum<ComputeFragment> max_accumulator;
+
+    // Linear scaling selected by Scale; the float min/max clamp follows
+    if (Scale == ScaleType::Nothing) {
+      intermediate = converted_accumulator;
+    } else {
+      intermediate = mul_accumulator(alpha_, converted_accumulator);
+    }
+
+    /// Clamping constant 2^(bits - 1); shifted as unsigned 64-bit so wide outputs cannot overflow int
+    ElementCompute const kClamp =
+        ElementCompute(1ull << (sizeof_bits<ElementOutput>::value - 1));
+
+    intermediate = max_accumulator(intermediate, -kClamp);
+    intermediate = min_accumulator(intermediate, kClamp - ElementCompute(1));
+
+    // Convert to destination numeric type
+    FastNumericArrayConverter<ElementOutput, ElementCompute, kCount, Round>
+        destination_converter;
+
+    return destination_converter(intermediate);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace epilogue
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_dgelu.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_dgelu.h
new file mode 100755
index 000000000..74eb8213e
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_dgelu.h
@@ -0,0 +1,250 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  
+  \brief Functor performing linear combination followed by dGelu operation
+*/
+
+#pragma once
+
+#include "cutlass/half.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/constants.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/functional.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/epilogue/thread/activation.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Applies a linear combination operator to an array of elements, then dGELU.
+///
+/// D[i] = dGELU<ElementCompute>()(alpha * accumulator[i] + beta * source[i], tensor[i])
+///
+template <
+  typename ElementCompute_,                            ///< Data type returned by this functor
+  typename ElementAccumulator_,                        ///< Data type of accumulators
+  typename ElementSource_,                             ///< Data type of source tensor
+  typename ElementTensor_,                             ///< Data type of additional tensor
+  int Count,                                           ///< Number of elements computed per operation
+                                                       ///< Usually it is 128/sizeof_bits<ElementOutput_>,
+                                                       ///< but we use 64 or 32 sometimes when there are not enough data to store
+  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
+>
+class LinearCombinationDGelu {
+public:
+
+  using ElementOutput = ElementSource_;
+  using ElementCompute = ElementCompute_;
+  using ElementAccumulator = ElementAccumulator_;
+  using ElementSource = ElementSource_;
+  using ElementTensor = ElementTensor_;
+
+  static bool const kIsHeavy = true;
+
+  static int const kCount = Count;
+
+  using FragmentCompute = Array<ElementCompute, kCount>;
+  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
+  using FragmentSource = Array<ElementSource, kCount>;
+  using FragmentTensor = Array<ElementTensor, kCount>;
+
+  static FloatRoundStyle const kRound = Round;
+
+  /// Host-constructable parameters structure
+  struct Params {
+
+    ElementCompute alpha;                  ///< scales accumulators
+    ElementCompute beta;                   ///< scales source tensor
+    ElementCompute const *alpha_ptr;       ///< pointer to accumulator scalar - if not null, loads it from memory
+    ElementCompute const *beta_ptr;        ///< pointer to source scalar - if not null, loads it from memory
+    ElementCompute threshold;              ///< minimum value that is output
+    //
+    // Methods (initializer lists follow the member declaration order above)
+    //
+
+    CUTLASS_HOST_DEVICE
+    Params():
+      alpha(ElementCompute(1)),
+      beta(ElementCompute(0)),
+      alpha_ptr(nullptr),
+      beta_ptr(nullptr),
+      threshold(ElementCompute(0)) { }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute alpha,
+      ElementCompute beta,
+      ElementCompute threshold = ElementCompute(0)
+    ): alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr), threshold(threshold) {
+
+    }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute const *alpha_ptr,
+      ElementCompute const *beta_ptr,
+      ElementCompute threshold = ElementCompute(0)
+    ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr), threshold(threshold) {
+
+    }
+  };
+
+private:
+
+  //
+  // Data members
+  //
+
+  ElementCompute alpha_;
+  ElementCompute beta_;
+  ElementCompute threshold_;
+  bool participates_in_reduction_;
+
+public:
+
+  /// Constructs the function object; a non-null scalar pointer takes precedence over the value
+  CUTLASS_HOST_DEVICE
+  LinearCombinationDGelu(Params const &params) {
+
+    alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
+    beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
+    threshold_ = params.threshold;
+    participates_in_reduction_ = true;
+  }
+
+  /// Returns true if source is needed
+  CUTLASS_HOST_DEVICE
+  bool is_source_needed() const {
+    return beta_ != ElementCompute(0);
+  }
+
+  /// Returns true if the threadblock computes the reduction
+  CUTLASS_HOST_DEVICE
+  bool participates_in_reduction() const {
+    return participates_in_reduction_;
+  }
+
+  /// Functionally required for serial reduction in the epilogue
+  CUTLASS_HOST_DEVICE
+  void set_k_partition(int k_partition, int k_partition_count) {
+    if (k_partition) {
+      beta_ = ElementCompute(1);
+    }
+
+    if (k_partition != k_partition_count - 1) {
+      // all-ones bit pattern for every partition but the last; threshold_ is not read by operator() here
+      int64_t allones = -1;
+      threshold_ = reinterpret_cast<ElementCompute const &>(allones);
+      // Avoid computing the reduction if this isn't the final Split-K slice
+      participates_in_reduction_ = false;
+    }
+  }
+
+  /// Computes D = dGELU(alpha * accumulator + beta * source, tensor), element by element
+  CUTLASS_HOST_DEVICE
+  FragmentCompute operator()(
+    FragmentAccumulator const &accumulator,
+    FragmentSource const &source,
+    FragmentTensor const &tensor) const {
+
+    // Convert source and accumulator to internal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementSource, kCount, Round> source_converter;
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
+
+    FragmentCompute converted_source = source_converter(source);
+    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
+
+    // Perform binary operations
+    FragmentCompute intermediate;
+
+    multiplies<FragmentCompute> mul_add_source;
+    multiply_add<FragmentCompute> mul_add_accumulator;
+
+    intermediate = mul_add_source(beta_, converted_source);                             // X =  beta * C
+    intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
+
+    dGELU<ElementCompute>  gelu_op;
+
+    // dGelu, with each tensor element converted to the compute type
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kCount; ++i) {
+      intermediate[i] = gelu_op(intermediate[i], ElementCompute(tensor[i]));
+    }
+
+    return intermediate;
+  }
+
+  /// Computes D = dGELU(alpha * accumulator, tensor), element by element (no source operand)
+  CUTLASS_HOST_DEVICE
+  FragmentCompute operator()(
+    FragmentAccumulator const &accumulator,
+    FragmentTensor const &tensor) const {
+
+    // Convert accumulator to internal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
+
+    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
+
+    // Perform binary operations
+    FragmentCompute intermediate;
+
+    multiplies<FragmentCompute> mul_accumulator;
+
+    intermediate = mul_accumulator(alpha_, converted_accumulator);    // D = alpha * Accum
+
+    dGELU<ElementCompute>  gelu_op;
+
+    // dGelu, with each tensor element converted to the compute type
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kCount; ++i) {
+      intermediate[i] = gelu_op(intermediate[i], ElementCompute(tensor[i]));
+    }
+
+    return intermediate;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_drelu.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_drelu.h
new file mode 100755
index 000000000..aed173056
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_drelu.h
@@ -0,0 +1,452 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file  
+  \brief Functors performing linear combination followed by dReLU (ReLU backward) masking, used by epilogues.
+*/
+
+#pragma once
+
+#include "cutlass/half.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/functional.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/epilogue/thread/activation.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Applies a linear combination followed by a dReLU mask to an array of elements.
+///
+/// D = (tensor <= threshold) ? ElementCompute() : alpha * accumulator + beta * source
+///
+template <
+  typename ElementCompute_,                            ///< Data type returned by this functor
+  typename ElementAccumulator_,                        ///< Data type of accumulators
+  typename ElementSource_,                             ///< Data type of source tensor
+  typename ElementTensor_,                             ///< Data type of additional tensor
+  int Count,                                           ///< Number of elements computed per operation
+                                                       ///< Usually it is 128/sizeof_bits<ElementOutput_>,
+                                                       ///< but we use 64 or 32 sometimes when there are not enough data to store
+  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
+>
+class LinearCombinationDRelu {
+public:
+
+  using ElementOutput = ElementSource_;
+  using ElementCompute = ElementCompute_;
+  using ElementAccumulator = ElementAccumulator_;
+  using ElementSource = ElementSource_;
+  using ElementTensor = ElementTensor_;
+
+  static int const kCount = Count;
+
+  using FragmentCompute = Array<ElementCompute, kCount>;
+  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
+  using FragmentSource = Array<ElementSource, kCount>;
+  using FragmentTensor = Array<ElementTensor, kCount>;
+
+  static FloatRoundStyle const kRound = Round;
+
+  /// Host-constructable parameters structure
+  struct Params {
+
+    ElementCompute alpha;                  ///< scales accumulators
+    ElementCompute beta;                   ///< scales source tensor
+    ElementCompute const *alpha_ptr;       ///< pointer to accumulator scalar - if not null, loads it from memory
+    ElementCompute const *beta_ptr;        ///< pointer to source scalar - if not null, loads it from memory
+    ElementCompute threshold;              ///< output elements whose tensor value is <= threshold are zeroed
+    //
+    // Methods (mem-initializers are listed in member declaration order to avoid -Wreorder)
+    //
+
+    CUTLASS_HOST_DEVICE
+    Params():
+      alpha(ElementCompute(1)),
+      beta(ElementCompute(0)),
+      alpha_ptr(nullptr),
+      beta_ptr(nullptr),
+      threshold(ElementCompute(0)) { }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute alpha,
+      ElementCompute beta,
+      ElementCompute threshold = ElementCompute(0)
+    ): alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr), threshold(threshold) {
+
+    }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute const *alpha_ptr,
+      ElementCompute const *beta_ptr,
+      ElementCompute threshold = ElementCompute(0)
+    ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr), threshold(threshold) {
+
+    }
+  };
+
+private:
+
+  //
+  // Data members
+  //
+
+  ElementCompute alpha_;
+  ElementCompute beta_;
+  ElementTensor threshold_;
+  bool participates_in_reduction_;
+
+public:
+
+  /// Constructs the function object, possibly loading from pointers in host memory
+  CUTLASS_HOST_DEVICE
+  LinearCombinationDRelu(Params const &params) {
+
+    alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
+    beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
+    threshold_ = ElementTensor(params.threshold);
+    participates_in_reduction_  = true;
+  }
+
+  /// Returns true if source is needed
+  CUTLASS_HOST_DEVICE
+  bool is_source_needed() const {
+    return beta_ != ElementCompute(0);
+  }
+
+  /// Returns true if the threadblock computes the reduction
+  CUTLASS_HOST_DEVICE
+  bool participates_in_reduction() const {
+    return participates_in_reduction_;
+  }
+
+  /// Functionally required for serial reduction in the epilogue
+  CUTLASS_DEVICE
+  void set_k_partition(int k_partition, int k_partition_count) {
+    if (k_partition) {
+      beta_ = ElementCompute(1);
+    }
+
+    if (k_partition != k_partition_count - 1) {
+      // set to NaN to make ReLU no-op for all except last k partitions
+      int64_t allones = -1;
+      threshold_ = reinterpret_cast<ElementTensor const &>(allones);
+      participates_in_reduction_ = false;
+    }
+  }
+
+  /// Computes alpha * accumulator + beta * source, then zeroes elements whose tensor value <= threshold
+  CUTLASS_HOST_DEVICE
+  FragmentCompute operator()(
+    FragmentAccumulator const &accumulator,
+    FragmentSource const &source,
+    FragmentTensor const &tensor) const {
+
+    // Convert source and accumulator to internal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementSource, kCount, Round> source_converter;
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
+
+    FragmentCompute converted_source = source_converter(source);
+    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
+
+    // Perform binary operations
+    FragmentCompute intermediate;
+
+    multiplies<FragmentCompute> mul_add_source;
+    multiply_add<FragmentCompute> mul_add_accumulator;
+
+    intermediate = mul_add_source(beta_, converted_source);                             // X =  beta * C
+    intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
+
+    // dReLU = (cond ? dy : 0), where cond is (tensor[i] > threshold_)
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kCount; ++i) {
+      ElementTensor cond = tensor[i];
+      if (cond <= threshold_) {
+        intermediate[i] = ElementCompute();
+      }
+    }
+
+    return intermediate;
+  }
+
+  /// Source-free overload: computes alpha * accumulator, then zeroes elements whose tensor value <= threshold
+  CUTLASS_HOST_DEVICE
+  FragmentCompute operator()(
+    FragmentAccumulator const &accumulator,
+    FragmentTensor const &tensor) const {
+
+    // Convert accumulator to internal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
+
+    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
+
+    // Perform binary operations
+    FragmentCompute intermediate;
+
+    multiplies<FragmentCompute> mul_accumulator;
+
+    intermediate = mul_accumulator(alpha_, converted_accumulator);    // D = alpha * Accum
+
+    // dReLU = (cond ? dy : 0), where cond is (tensor[i] > threshold_)
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kCount; ++i) {
+      ElementTensor cond = tensor[i];
+      if (cond <= threshold_) {
+        intermediate[i] = ElementCompute();
+      }
+    }
+
+    return intermediate;
+  }
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Applies a linear combination followed by a dReLU mask whose per-element condition is a packed bit.
+///
+/// D = cond ? alpha * accumulator + beta * source : ElementCompute()
+///
+template <
+  typename ElementCompute_,                            ///< Data type returned by this functor
+  typename ElementAccumulator_,                        ///< Data type of accumulators
+  typename ElementSource_,                             ///< Data type of source tensor
+  int Count,                                           ///< Number of elements computed per operation
+  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
+>
+class LinearCombinationDReluConditionalBits {
+public:
+
+  using ElementOutput = ElementSource_;
+  using ElementCompute = ElementCompute_;
+  using ElementAccumulator = ElementAccumulator_;
+  using ElementSource = ElementSource_;
+  using ElementTensor = uint1b_t;  // conditions arrive as 1-bit elements, one per output element
+
+  static bool const kIsHeavy = false;
+
+  static int const kCount = Count;
+
+  using FragmentCompute = Array<ElementCompute, kCount>;
+  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
+  using FragmentSource = Array<ElementSource, kCount>;
+  using FragmentTensor = Array<ElementTensor, kCount>;
+
+  static FloatRoundStyle const kRound = Round;
+
+  /// Host-constructable parameters structure
+  struct Params {
+
+    ElementCompute alpha;                  ///< scales accumulators
+    ElementCompute beta;                   ///< scales source tensor
+    ElementCompute const *alpha_ptr;       ///< pointer to accumulator scalar - if not null, loads it from memory
+    ElementCompute const *beta_ptr;        ///< pointer to source scalar - if not null, loads it from memory
+    //
+    // Methods
+    //
+
+    CUTLASS_HOST_DEVICE
+    Params(): 
+      alpha(ElementCompute(1)), 
+      beta(ElementCompute(0)),
+      alpha_ptr(nullptr), 
+      beta_ptr(nullptr) { }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute alpha,
+      ElementCompute beta
+    ): alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) {
+
+    }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute const *alpha_ptr,
+      ElementCompute const *beta_ptr
+    ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {
+
+    }
+  };
+
+private:
+
+  //
+  // Data members
+  //
+
+  ElementCompute alpha_;
+  ElementCompute beta_;
+  FragmentTensor predicate_mask_;  // OR-ed into the incoming condition bits by operator()
+  bool participates_in_reduction_;
+
+public:
+
+  /// Constructs the function object, possibly loading from pointers in host memory
+  CUTLASS_HOST_DEVICE
+  LinearCombinationDReluConditionalBits(Params const &params) {
+
+    alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
+    beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
+    participates_in_reduction_ = true;
+    predicate_mask_.clear();
+  }
+
+  /// Returns true if source is needed
+  CUTLASS_HOST_DEVICE
+  bool is_source_needed() const {
+    return beta_ != ElementCompute(0);
+  }
+
+  /// Returns true if the threadblock computes the reduction
+  CUTLASS_HOST_DEVICE
+  bool participates_in_reduction() const {
+    return participates_in_reduction_;
+  }
+
+  /// Functionally required for serial reduction in the epilogue
+  CUTLASS_HOST_DEVICE
+  void set_k_partition(int k_partition, int k_partition_count) {
+    predicate_mask_.clear();
+
+    if (k_partition) {
+      beta_ = ElementCompute(1);
+    }
+
+    if (k_partition != k_partition_count - 1) {
+      // Avoid computing the reduction if this isn't the final Split-K slice
+      participates_in_reduction_ = false;
+      // Invert the just-cleared mask so that OR-ing it into the conditions keeps every element
+      bit_not<FragmentTensor> not_op;
+      predicate_mask_ = not_op(predicate_mask_);
+    }
+  }
+  
+  /// Computes alpha * accumulator + beta * source, then zeroes elements whose condition bit is false
+  CUTLASS_DEVICE
+  FragmentCompute operator()(
+    FragmentAccumulator const &accumulator, 
+    FragmentSource const &source,
+    FragmentTensor const &tensor) const {
+
+    // Convert source and accumulator to internal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementSource, kCount, Round> source_converter;
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
+
+    FragmentCompute converted_source = source_converter(source);
+    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
+
+    // Perform binary operations
+    FragmentCompute intermediate;
+
+    multiplies<FragmentCompute> mul_add_source;
+    multiply_add<FragmentCompute> mul_add_accumulator;
+
+    intermediate = mul_add_source(beta_, converted_source);                             // X =  beta * C
+    intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
+
+    bit_or<FragmentTensor> or_op;
+
+    FragmentTensor predicates = or_op(tensor, predicate_mask_);
+
+    // Unpack the combined predicate bits into one bool per element
+    bool conditions[kCount];
+    UnpackPredicates<kCount> unpack_predicates;
+
+    unpack_predicates(conditions, predicates);
+
+    // dReLU = (cond ? dy : 0)
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kCount; ++i) {
+      if (!conditions[i]) {
+        intermediate[i] = ElementCompute();
+      }
+    }
+
+    return intermediate;
+  }
+
+  /// Source-free overload: computes alpha * accumulator, then zeroes elements whose condition bit is false
+  CUTLASS_HOST_DEVICE
+  FragmentCompute operator()(
+    FragmentAccumulator const &accumulator,
+    FragmentTensor const &tensor) const {
+
+    // Convert accumulator to internal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
+
+    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
+
+    // Perform binary operations
+    FragmentCompute intermediate;
+
+    multiplies<FragmentCompute> mul_accumulator;
+
+    intermediate = mul_accumulator(alpha_, converted_accumulator);    // D = alpha * Accum
+
+    bit_or<FragmentTensor> or_op;
+
+    FragmentTensor predicates = or_op(tensor, predicate_mask_);
+
+    // Unpack the combined predicate bits into one bool per element
+    bool conditions[kCount];
+    UnpackPredicates<kCount> unpack_predicates;
+
+    unpack_predicates(conditions, predicates);
+
+    // dReLU = (cond ? dy : 0)
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kCount; ++i) {
+      if (!conditions[i]) {
+        intermediate[i] = ElementCompute();
+      }
+    }
+
+    return intermediate;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_gelu.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_gelu.h
new file mode 100755
index 000000000..818b21aa8
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_gelu.h
@@ -0,0 +1,70 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Functor performing linear combination with GELU operations used by epilogues.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/epilogue/thread/activation.h"
+#include "cutlass/epilogue/thread/linear_combination_generic.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Alias of LinearCombinationGeneric that applies the GELU activation after the linear combination,
+/// with the trailing IsHeavy template argument set to true.
+/// D = gelu(alpha * accumulator + beta * source)
+///
+template <
+  typename ElementOutput_,                             ///< Data type used to load and store tensors
+  int Count,                                           ///< Number of elements computed per operation
+                                                       ///< Usually it is 128/sizeof_bits<ElementOutput_>,
+                                                       ///< but we use 64 or 32 sometimes when there are not enough data to store
+  typename ElementAccumulator_ = ElementOutput_,       ///< Accumulator data type
+  typename ElementCompute_ = ElementOutput_,           ///< Data type used to compute linear combination
+  ScaleType::Kind Scale = ScaleType::Default,          ///< Control Alpha and Beta scaling
+  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
+>
+using LinearCombinationGELU = LinearCombinationGeneric<GELU, ElementOutput_, Count, ElementAccumulator_,
+                                                       ElementCompute_, Scale, Round, true>;  // true => IsHeavy
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace epilogue
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_generic.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_generic.h
new file mode 100755
index 000000000..a6bd9d672
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_generic.h
@@ -0,0 +1,265 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Functor performing linear combination operations used by epilogues.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/functional.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/epilogue/thread/scale_type.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <class Activation, class = void>  // primary template: used when the specialization below does not match
+struct GenericActivationTraits {
+  static constexpr bool IsArgumentsNeeded = false;
+  struct Arguments {};  // empty stand-in so code can name Arguments unconditionally
+};
+
+template <class Activation>  // matches only when `typename Activation::Arguments()` is a valid expression
+struct GenericActivationTraits<Activation, decltype(typename Activation::Arguments(), void())> {
+  static constexpr bool IsArgumentsNeeded = true;
+  using Arguments = typename Activation::Arguments;  // re-export the activation's own argument type
+};
+
+template <typename T>  // T is the scalar type of alpha and beta
+struct LinearCombinationGenericParams {
+  T alpha;                  ///< scales accumulators
+  T beta;                   ///< scales source tensor
+  T const *alpha_ptr;       ///< pointer to accumulator scalar - if not null, loads it from memory
+  T const *beta_ptr;        ///< pointer to source scalar - if not null, loads it from memory
+
+  //
+  // Constructors: defaults (alpha = 1, beta = 0), scalars passed by value, or scalars passed by pointer
+  //
+
+  CUTLASS_HOST_DEVICE
+  LinearCombinationGenericParams():
+    alpha(T(1)),
+    beta(T(0)),
+    alpha_ptr(nullptr),
+    beta_ptr(nullptr) { }
+
+  CUTLASS_HOST_DEVICE
+  LinearCombinationGenericParams(
+    T alpha,
+    T beta = T(0)
+  ): alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) { }
+
+  CUTLASS_HOST_DEVICE
+  LinearCombinationGenericParams(
+    T const *alpha_ptr,
+    T const *beta_ptr = nullptr
+  ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) { }  // by-value scalars are zeroed here
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Applies a linear combination operator followed by an activation function to an array of elements.
+///
+/// D = activation(alpha * accumulator + beta * source)
+///
+template <
+  template<typename T> class ActivationFunctor,
+  typename ElementOutput_,                             ///< Data type used to load and store tensors
+  int Count,                                           ///< Number of elements computed per operation
+                                                       ///< Usually it is 128/sizeof_bits<ElementOutput_>,
+                                                       ///< but we use 64 or 32 sometimes when there are not enough data to store
+  typename ElementAccumulator_ = ElementOutput_,       ///< Accumulator data type
+  typename ElementCompute_ = ElementOutput_,           ///< Data type used to compute linear combination
+  ScaleType::Kind Scale = ScaleType::Default,          ///< Control Alpha and Beta scaling
+  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest,
+  bool IsHeavy = false
+>
+class LinearCombinationGeneric {
+public:
+
+  using ElementOutput = ElementOutput_;
+  using ElementAccumulator = ElementAccumulator_;
+  using ElementCompute = ElementCompute_;
+
+  static bool const kIsHeavy = IsHeavy;
+  static int const kCount = Count;
+  static const ScaleType::Kind kScale = Scale;
+
+  using FragmentOutput = Array<ElementOutput, kCount>;
+  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
+  using FragmentSource = Array<ElementOutput, kCount>;
+  using FragmentCompute = Array<ElementCompute, kCount>;
+
+  static FloatRoundStyle const kRound = Round;
+
+  /// Host-constructable parameters structure: alpha/beta scalars plus the activation's Arguments (if any)
+  struct Params
+    : LinearCombinationGenericParams<ElementCompute>,
+      GenericActivationTraits<ActivationFunctor<ElementCompute>>::Arguments {
+    using LinearCombinationGenericParams<ElementCompute>::LinearCombinationGenericParams;
+  };
+
+private:
+
+  //
+  // Data members
+  //
+
+  Params params_;
+  bool skip_elementwise_;  // when true, operator() returns the linear combination without the activation
+
+public:
+
+  /// Constructs the function object, possibly loading from pointers in host memory
+  CUTLASS_HOST_DEVICE
+  LinearCombinationGeneric(Params const &params) {
+    params_ = params;
+    params_.alpha = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
+    params_.beta = (params.beta_ptr ? *params.beta_ptr : params.beta);
+    skip_elementwise_ = false;
+  }
+
+  /// Returns true if source is needed
+  CUTLASS_HOST_DEVICE
+  bool is_source_needed() const {
+    if (Scale == ScaleType::NoBetaScaling) return true;
+
+    if (Scale == ScaleType::OnlyAlphaScaling) return false;
+
+    if (Scale == ScaleType::Nothing) return false;
+
+    return params_.beta != ElementCompute(0);
+  }
+
+  /// Functionally required for serial reduction in the epilogue
+  CUTLASS_HOST_DEVICE
+  void set_k_partition(int k_partition, int k_partition_count) {
+    if (k_partition) {
+      params_.beta = ElementCompute(1);
+    }
+
+    if (k_partition != k_partition_count - 1) {
+      skip_elementwise_ = true;
+    }
+  }
+
+  /// Computes D = activation(alpha * accumulator + beta * source), subject to the Scale policy
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(
+    FragmentAccumulator const &accumulator,
+    FragmentOutput const &source) const {
+
+    // Convert source and accumulator to internal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementOutput, kCount, Round> source_converter;
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
+
+    FragmentCompute converted_source = source_converter(source);
+    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
+
+    // Perform binary operations
+
+    FragmentCompute intermediate;
+
+    multiplies<FragmentCompute> mul_add_source;
+    multiply_add<FragmentCompute> mul_add_accumulator;
+    ActivationFunctor<FragmentCompute> activation;
+
+    if (Scale == ScaleType::NoBetaScaling) {
+      intermediate = converted_source;
+      intermediate = mul_add_accumulator(params_.alpha, converted_accumulator, intermediate);    // D = alpha * Accum + X
+    }  else if (Scale == ScaleType::Nothing) {
+      intermediate = converted_accumulator;
+    } else {
+      intermediate = mul_add_source(params_.beta, converted_source);                             // X =  beta * C
+      intermediate = mul_add_accumulator(params_.alpha, converted_accumulator, intermediate);    // D = alpha * Accum + X
+    }
+    // Forward params_ to the activation only when the scalar functor declares an Arguments type
+    if constexpr (GenericActivationTraits<ActivationFunctor<ElementCompute>>::IsArgumentsNeeded) {
+      intermediate = skip_elementwise_ ? intermediate : activation(intermediate, params_);
+    } else {
+      intermediate = skip_elementwise_ ? intermediate : activation(intermediate);
+    }
+
+    // Convert to destination numeric type
+    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
+
+    return destination_converter(intermediate);
+  }
+
+  /// Source-free overload: computes D = activation(alpha * accumulator), subject to the Scale policy
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(
+    FragmentAccumulator const &accumulator) const {
+
+    // Convert accumulator to internal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
+
+    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
+
+    // Perform binary operations
+
+    FragmentCompute intermediate;
+
+    multiplies<FragmentCompute> mul_add_accumulator;
+    ActivationFunctor<FragmentCompute> activation;
+
+    if (Scale == ScaleType::Nothing) {
+      intermediate = converted_accumulator;
+    } else {
+      intermediate = mul_add_accumulator(params_.alpha, converted_accumulator);    // D = alpha * Accum
+    }
+    // Probe the scalar functor, matching the Arguments base of Params and the two-operand overload
+    if constexpr (GenericActivationTraits<ActivationFunctor<ElementCompute>>::IsArgumentsNeeded) {
+      intermediate = skip_elementwise_ ? intermediate : activation(intermediate, params_);
+    } else {
+      intermediate = skip_elementwise_ ? intermediate : activation(intermediate);
+    }
+
+    // Convert to destination numeric type
+    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
+
+    return destination_converter(intermediate);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace epilogue
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_generic_with_scaling.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_generic_with_scaling.h
new file mode 100755
index 000000000..e1dde1a6a
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_generic_with_scaling.h
@@ -0,0 +1,325 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+  \brief Functor performing linear combination operations with a generic element-wise activation
+  function. Scaling factors are applied to operands A, B, and C. The pre-activation auxiliary
+  output is also returned.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/functional.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/epilogue/thread/scale_type.h"
+#include "cutlass/epilogue/thread/linear_combination_generic.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Applies a scaled linear combination and a generic activation to an array of elements.
+///
+/// Aux = ((alpha * scale_a * scale_b) * accumulator) + ((beta * scale_c) * source) + bias
+///   D = activation(Aux)
+///
+template <
+  template<typename T> class ActivationFunctor,        ///< Element-wise activation applied to Aux
+  typename ElementOutput_,                             ///< Data type used to load and store tensors
+  typename ElementAuxOutput_,                          ///< Data type used to store auxiliary output
+  int Count,                                           ///< Number of elements computed per operation
+                                                       ///< Usually it is 128/sizeof_bits<ElementOutput_>,
+                                                       ///< but we use 64 or 32 sometimes when there are not enough data to store
+  typename ElementAccumulator_ = ElementOutput_,       ///< Accumulator data type
+  typename ElementCompute_ = ElementOutput_,           ///< Data type used to compute linear combination
+  ScaleType::Kind Scale = ScaleType::Default,          ///< Control Alpha and Beta scaling
+  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest,  ///< Rounding style handed to the numeric converters
+  bool IsHeavy = false                                 ///< Exposed unchanged as kIsHeavy
+>
+class LinearCombinationGenericWithScalingAndAbsMax {
+public:
+
+  using ElementOutput = ElementOutput_;
+  using ElementAuxOutput = ElementAuxOutput_;
+  using ElementAccumulator = ElementAccumulator_;
+  using ElementCompute = ElementCompute_;
+  using ElementScalingFactor = ElementAccumulator_;
+
+  /// Data type used for absolute maximum value
+  using ElementAbsmax = float;
+  /// True when the respective element type (aux output / output) is float_e4m3_t or float_e5m2_t
+  static bool const kIsScalingAndAmaxAuxOutputNeeded = (platform::is_same<ElementAuxOutput, cutlass::float_e4m3_t>::value ||
+                                                        platform::is_same<ElementAuxOutput, cutlass::float_e5m2_t>::value);
+  static bool const kIsScalingAndAmaxOutputNeeded    = (platform::is_same<ElementOutput, cutlass::float_e4m3_t>::value ||
+                                                        platform::is_same<ElementOutput, cutlass::float_e5m2_t>::value);
+
+  static bool const kIsHeavy = IsHeavy;
+  static int const kCount = Count;
+  static const ScaleType::Kind kScale = Scale;
+
+  using FragmentOutput = Array<ElementOutput, kCount>;
+  using FragmentAuxOutput = Array<ElementAuxOutput, kCount>;
+  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
+  using FragmentCompute = Array<ElementCompute, kCount>;
+
+  static FloatRoundStyle const kRound = Round;
+
+  /// Host-constructable parameters structure
+  struct Params {
+    struct ActivationParams
+      : LinearCombinationGenericParams<ElementCompute>,
+        GenericActivationTraits<ActivationFunctor<ElementCompute>>::Arguments {
+      using LinearCombinationGenericParams<ElementCompute>::LinearCombinationGenericParams;
+    };
+
+    ActivationParams activation;                         ///< alpha/beta (value or pointer) plus the activation's own arguments
+    ElementScalingFactor const* scale_a_ptr = nullptr;   ///< pointer to a scalar - if not null, loads it from memory
+    ElementScalingFactor const* scale_b_ptr = nullptr;   ///< pointer to b scalar - if not null, loads it from memory
+    ElementScalingFactor const* scale_c_ptr = nullptr;   ///< pointer to c scalar - if not null, loads it from memory
+    ElementScalingFactor const* scale_d_ptr = nullptr;   ///< pointer to d scalar - if not null, loads it from memory
+    ElementScalingFactor const* scale_aux_ptr = nullptr; ///< pointer to aux scalar - if not null, loads it from memory
+
+    ElementAbsmax * abs_max_aux_ptr = nullptr;      ///< pointer to location to store amax of Aux
+    ElementAbsmax * abs_max_D_ptr   = nullptr;      ///< pointer to location to store amax of D
+
+    CUTLASS_HOST_DEVICE
+    Params() :
+      scale_a_ptr(nullptr),
+      scale_b_ptr(nullptr),
+      scale_c_ptr(nullptr),
+      scale_d_ptr(nullptr),
+      scale_aux_ptr(nullptr),
+      abs_max_aux_ptr(nullptr),
+      abs_max_D_ptr(nullptr) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(ActivationParams activation_params,
+           ElementScalingFactor const* scale_a_ptr,
+           ElementScalingFactor const* scale_b_ptr,
+           ElementScalingFactor const* scale_c_ptr,
+           ElementScalingFactor const* scale_d_ptr,
+           ElementScalingFactor const* scale_aux_ptr,
+           ElementAbsmax * abs_max_aux_ptr,
+           ElementAbsmax * abs_max_D_ptr) :
+           activation(activation_params),
+           scale_a_ptr(scale_a_ptr),
+           scale_b_ptr(scale_b_ptr),
+           scale_c_ptr(scale_c_ptr),
+           scale_d_ptr(scale_d_ptr),
+           scale_aux_ptr(scale_aux_ptr),
+           abs_max_aux_ptr(abs_max_aux_ptr),
+           abs_max_D_ptr(abs_max_D_ptr) {}
+  };
+
+private:
+
+  //
+  // Data members
+  //
+
+  Params params_;            ///< copy of the caller's Params; alpha/beta are overwritten with their effective values
+  bool skip_elementwise_;    ///< when true, operator() writes Aux to the output without applying the activation
+
+  // Scaling factors for output and auxiliary output
+  ElementCompute scale_d_;
+  ElementCompute scale_aux_;
+
+public:
+
+  /// Constructs the function object. alpha, beta, and every scale factor are read through
+  /// their pointer when it is non-null; a null scale pointer stands for a factor of 1.
+  CUTLASS_HOST_DEVICE
+  LinearCombinationGenericWithScalingAndAbsMax(Params const &params) :
+    params_(params),
+    skip_elementwise_(false),
+    scale_d_(ElementCompute(params.scale_d_ptr ? *(params.scale_d_ptr) : ElementScalingFactor(1))),
+    scale_aux_(ElementCompute(params.scale_aux_ptr ? *(params.scale_aux_ptr) : ElementScalingFactor(1)))
+  {
+    params_.activation.alpha = (params.activation.alpha_ptr ? *params.activation.alpha_ptr : params.activation.alpha);
+    params_.activation.beta = (params.activation.beta_ptr ? *params.activation.beta_ptr : params.activation.beta);
+    auto scale_a = ElementCompute(params.scale_a_ptr ? *(params.scale_a_ptr) : ElementScalingFactor(1));
+    auto scale_b = ElementCompute(params.scale_b_ptr ? *(params.scale_b_ptr) : ElementScalingFactor(1));
+    auto scale_c = ElementCompute(params.scale_c_ptr ? *(params.scale_c_ptr) : ElementScalingFactor(1));
+
+    // Fold the operand scales into the alpha/beta already resolved into params_, so that
+    // values loaded through alpha_ptr/beta_ptr are kept rather than overwritten.
+    multiplies<ElementCompute> multiply;
+    params_.activation.alpha = multiply(params_.activation.alpha, multiply(scale_a, scale_b));
+    params_.activation.beta = multiply(params_.activation.beta, scale_c);
+  }
+
+  /// Returns true if source is needed
+  CUTLASS_HOST_DEVICE
+  bool is_source_needed() const {
+    if (Scale == ScaleType::NoBetaScaling) return true;
+
+    if (Scale == ScaleType::OnlyAlphaScaling) return false;
+
+    if (Scale == ScaleType::Nothing) return false;
+
+    return params_.activation.beta != ElementCompute(0);
+  }
+
+  /// Functionally required for serial reduction in the epilogue
+  CUTLASS_HOST_DEVICE
+  void set_k_partition(int k_partition, int k_partition_count) {
+    if (k_partition) {
+      params_.activation.beta = ElementCompute(1);
+    }
+
+    // Only the final partition should perform the activation function
+    // and scale the output and auxiliary output values.
+    if (k_partition != k_partition_count - 1) {
+      skip_elementwise_ = true;
+      scale_d_ = ElementCompute(1.);
+      scale_aux_ = ElementCompute(1.);
+    }
+  }
+
+  /// Computes linear scaling:
+  ///    Aux = (alpha * scale_a * scale_b * accumulator) + (beta * scale_c * source) + bias
+  ///      D = activation(Aux)
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentCompute& output,
+    FragmentCompute& aux_output,
+    FragmentAccumulator const &accumulator,
+    FragmentCompute const& bias,
+    FragmentOutput const &source) {
+
+    // Convert source and accumulator to the internal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementOutput, kCount, Round> source_converter;
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
+
+    FragmentCompute converted_source = source_converter(source);
+    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
+
+    // Perform binary operations
+
+    FragmentCompute intermediate;
+
+    multiplies<FragmentCompute> multiply;
+    plus<FragmentCompute> add;
+    multiply_add<FragmentCompute> mul_add_accumulator;
+    ActivationFunctor<FragmentCompute> activation;
+    // alpha and beta below already include scale_a * scale_b and scale_c (folded in the constructor)
+    if (Scale == ScaleType::NoBetaScaling) {
+      intermediate = converted_source;
+      intermediate = mul_add_accumulator(params_.activation.alpha, converted_accumulator, intermediate);
+    }  else if (Scale == ScaleType::Nothing) {
+      intermediate = converted_accumulator;
+    } else {
+      intermediate = multiply(params_.activation.beta, converted_source);
+      intermediate = mul_add_accumulator(params_.activation.alpha, converted_accumulator, intermediate);
+    }
+
+    intermediate = add(intermediate, bias);
+    // Aux is the pre-activation value; D equals Aux while skip_elementwise_ is set
+    aux_output = intermediate;
+    if constexpr (GenericActivationTraits<ActivationFunctor<ElementCompute>>::IsArgumentsNeeded) {
+      output = skip_elementwise_ ? intermediate : activation(intermediate, params_.activation);
+    } else {
+      output = skip_elementwise_ ? intermediate : activation(intermediate);
+    }
+  }
+
+  /// Computes linear scaling:
+  ///    Aux = (alpha * scale_a * scale_b * accumulator) + bias
+  ///      D = activation(Aux)
+  CUTLASS_DEVICE
+  void operator()(
+    FragmentCompute& output,
+    FragmentCompute& aux_output,
+    FragmentAccumulator const &accumulator,
+    FragmentCompute const& bias) {
+
+    // Convert accumulator to the internal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
+
+    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
+
+    // Perform binary operations
+
+    FragmentCompute intermediate;
+
+    multiplies<FragmentCompute> multiply;
+    plus<FragmentCompute> add;
+    ActivationFunctor<FragmentCompute> activation;
+
+    if (Scale == ScaleType::Nothing) {
+      intermediate = converted_accumulator;
+    } else {
+      intermediate = multiply(params_.activation.alpha, converted_accumulator);
+    }
+
+    intermediate = add(intermediate, bias);
+    // Dispatch on the same traits type that defines Params::ActivationParams
+    aux_output = intermediate;
+    if constexpr (GenericActivationTraits<ActivationFunctor<ElementCompute>>::IsArgumentsNeeded) {
+      output = skip_elementwise_ ? intermediate : activation(intermediate, params_.activation);
+    } else {
+      output = skip_elementwise_ ? intermediate : activation(intermediate);
+    }
+  }
+
+  /// Returns the pointer given in Params as the location to store the amax of D
+  /// (nullptr unless the caller supplied one)
+  CUTLASS_HOST_DEVICE
+  ElementAbsmax* get_ptr_output_abs_max() const { return params_.abs_max_D_ptr; }
+
+  /// Returns the pointer given in Params as the location to store the amax of Aux
+  /// (nullptr unless the caller supplied one)
+  CUTLASS_HOST_DEVICE
+  ElementAbsmax* get_ptr_aux_output_abs_max() const { return params_.abs_max_aux_ptr; }
+
+  /// Returns the scaling factor for D: the value read through scale_d_ptr (1 when null),
+  /// reset to 1 by set_k_partition on every partition except the last
+  CUTLASS_HOST_DEVICE
+  ElementCompute get_scale_d() const { return scale_d_; }
+
+  /// Returns the scaling factor for Aux: the value read through scale_aux_ptr (1 when null),
+  /// reset to 1 by set_k_partition on every partition except the last
+  CUTLASS_HOST_DEVICE
+  ElementCompute get_scale_aux() const { return scale_aux_; }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace epilogue
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_hardswish.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_hardswish.h
new file mode 100755
index 000000000..ef51a318b
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_hardswish.h
@@ -0,0 +1,69 @@
+/*************************************************************************************************** 
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Functor performing linear combination with HardSwish operations used by epilogues.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/epilogue/thread/activation.h"
+#include "cutlass/epilogue/thread/linear_combination_generic.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Alias of LinearCombinationGeneric instantiated with the HardSwish activation functor.
+///
+/// D = hardswish(alpha * accumulator + beta * source + uniform)
+///
+template <
+  typename ElementOutput_,                             ///< Data type used to load and store tensors
+  int Count,                                           ///< Number of elements computed per operation
+                                                       ///< Usually it is 128/sizeof_bits<ElementOutput_>,
+                                                       ///< but we use 64 or 32 sometimes when there are not enough data to store
+  typename ElementAccumulator_ = ElementOutput_,       ///< Accumulator data type
+  typename ElementCompute_ = ElementOutput_,           ///< Data type used to compute linear combination
+  ScaleType::Kind Scale = ScaleType::Default,          ///< Control Alpha and Beta scaling
+  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
+>
+using LinearCombinationHardSwish =
+  LinearCombinationGeneric<HardSwish, ElementOutput_, Count, ElementAccumulator_, ElementCompute_, Scale, Round>;
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace epilogue
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_leaky_relu.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_leaky_relu.h
new file mode 100755
index 000000000..5989f09ed
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_leaky_relu.h
@@ -0,0 +1,231 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/functional.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/epilogue/thread/activation.h"
+#include "cutlass/epilogue/thread/scale_type.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Applies a linear combination followed by LeakyReLU to an array of elements.
+///
+/// D = leaky_relu(alpha * accumulator + beta * source, leaky_alpha)
+///
+template <
+  typename ElementOutput_,                             ///< Data type used to load and store tensors
+  int Count,                                           ///< Number of elements computed per operation
+  typename ElementAccumulator_ = ElementOutput_,       ///< Accumulator data type
+  typename ElementCompute_ = ElementOutput_,           ///< Data type used to compute linear combination
+  ScaleType::Kind Scale = ScaleType::Default,          ///< Control Alpha and Beta scaling
+  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
+>
+class LinearCombinationLeakyRelu {
+public:
+
+  using ElementOutput = ElementOutput_;
+  using ElementAccumulator = ElementAccumulator_;
+  using ElementCompute = ElementCompute_;
+
+  static int const kCount = Count;
+  static const ScaleType::Kind kScale = Scale;
+
+  using FragmentOutput = Array<ElementOutput, kCount>;
+  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
+  using ComputeFragment = Array<ElementCompute, kCount>;
+  using FragmentSource = Array<ElementOutput, kCount>;
+
+  static FloatRoundStyle const kRound = Round;
+
+  /// Host-constructable parameters structure
+  struct Params {
+
+    ElementCompute alpha;                  ///< scales accumulators
+    ElementCompute beta_bias;              ///< scales the source operand
+    ElementCompute leaky_alpha;            ///< value forwarded to the LeakyReLU functor
+
+    //
+    // Methods
+    //
+
+    /// Defaults: alpha = 1, beta_bias = 0, leaky_alpha = 1
+    CUTLASS_HOST_DEVICE
+    Params() :
+      alpha(ElementCompute(1)),
+      beta_bias(ElementCompute(0)),
+      leaky_alpha(ElementCompute(1)) { }
+
+    /// Explicit scalars; leaky_alpha is 1 when omitted
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute alpha,
+      ElementCompute beta_bias,
+      ElementCompute leaky_alpha = ElementCompute(1)
+    ) :
+      alpha(alpha), beta_bias(beta_bias), leaky_alpha(leaky_alpha) { }
+  };
+
+private:
+
+  //
+  // Data members
+  //
+
+  ElementCompute alpha_;                   ///< copy of Params::alpha
+  ElementCompute beta_bias_;               ///< copy of Params::beta_bias; set to 1 by set_k_partition when k_partition != 0
+  ElementCompute leaky_alpha_;             ///< copy of Params::leaky_alpha
+
+public:
+
+  /// Constructs the function object from the scalars held in Params
+  CUTLASS_HOST_DEVICE
+  LinearCombinationLeakyRelu(Params const &params) :
+    alpha_(params.alpha),
+    beta_bias_(params.beta_bias),
+    leaky_alpha_(ElementCompute(params.leaky_alpha)) {
+  }
+
+  /// Returns true if source is needed
+  CUTLASS_HOST_DEVICE
+  bool is_source_needed() const {
+    // NoBetaScaling always reads the source; OnlyAlphaScaling and Nothing never do
+    if (Scale == ScaleType::NoBetaScaling) return true;
+
+    if (Scale == ScaleType::OnlyAlphaScaling || Scale == ScaleType::Nothing) return false;
+
+    // Every other scale kind needs the source only for a non-zero beta
+    return beta_bias_ != ElementCompute(0);
+  }
+
+  /// Functionally required for serial reduction in the epilogue
+  CUTLASS_HOST_DEVICE
+  void set_k_partition(int k_partition) {
+    if (k_partition != 0) {
+      beta_bias_ = ElementCompute(1);
+    }
+  }
+
+  /// Same as the one-argument form; k_partition_count is not consulted
+  CUTLASS_HOST_DEVICE
+  void set_k_partition(int k_partition, int k_partition_count) {
+    set_k_partition(k_partition);
+  }
+
+  /// Computes the linear combination with a source operand, then LeakyReLU:
+  ///   D = leaky_relu(alpha * accumulator + beta * source, leaky_alpha)
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(
+    FragmentAccumulator const &accumulator,
+    FragmentOutput const &source) const {
+
+    // Converters between the storage types and the internal compute type
+    NumericArrayConverter<ElementCompute, ElementOutput, kCount, Round> source_to_compute;
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_to_compute;
+    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> compute_to_output;
+
+    ComputeFragment source_frag = source_to_compute(source);
+    ComputeFragment accum_frag = accumulator_to_compute(accumulator);
+
+    // Element-wise functors used below
+    multiplies<ComputeFragment> scale;
+    multiply_add<ComputeFragment> scale_and_add;
+    LeakyReLU<ComputeFragment> leaky_relu;
+
+    // Linear combination selected by the compile-time scale kind; ScaleType::Nothing
+    // forwards the converted accumulator unscaled
+    ComputeFragment combined;
+    if (Scale == ScaleType::Nothing) {
+      combined = accum_frag;
+    } else if (Scale == ScaleType::NoBetaScaling) {
+      combined = scale_and_add(alpha_, accum_frag, source_frag);    // D = alpha * Accum + C
+    } else {
+      combined = scale(beta_bias_, source_frag);                    // X = beta * C
+      combined = scale_and_add(alpha_, accum_frag, combined);       // D = alpha * Accum + X
+    }
+
+    // Apply LeakyReLU with the stored leaky_alpha
+    combined = leaky_relu(combined, leaky_alpha_);
+
+    // Convert to the destination numeric type
+    return compute_to_output(combined);
+  }
+
+  /// Computes the linear combination without a source operand, then LeakyReLU:
+  ///   D = leaky_relu(alpha * accumulator, leaky_alpha)
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(
+    FragmentAccumulator const &accumulator) const {
+
+    // Converters between the storage types and the internal compute type
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_to_compute;
+    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> compute_to_output;
+
+    ComputeFragment accum_frag = accumulator_to_compute(accumulator);
+
+    // Element-wise functors used below
+    multiplies<ComputeFragment> scale;
+    LeakyReLU<ComputeFragment> leaky_relu;
+
+    // Every scale kind except ScaleType::Nothing multiplies the converted
+    // accumulator by alpha
+    ComputeFragment scaled;
+    if (Scale != ScaleType::Nothing) {
+      scaled = scale(alpha_, accum_frag);    // D = alpha * Accum
+    } else {
+      scaled = accum_frag;
+    }
+
+    // Apply LeakyReLU with the stored leaky_alpha
+    scaled = leaky_relu(scaled, leaky_alpha_);
+
+    // Convert to the destination numeric type
+    return compute_to_output(scaled);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_params.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_params.h
new file mode 100755
index 000000000..271055676
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_params.h
@@ -0,0 +1,75 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Type-erased storage for the alpha and beta scalars of a linear combination epilogue.
+*/
+
+#pragma once
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct LinearCombinationParams {
+  uint64_t alpha_data[2];   ///< raw bytes of alpha; bytes past sizeof(ElementCompute) stay zero
+  uint64_t beta_data[2];    ///< raw bytes of beta; bytes past sizeof(ElementCompute) stay zero
+
+  CUTLASS_HOST_DEVICE
+  LinearCombinationParams() : alpha_data {0lu, 0lu}, beta_data {0lu, 0lu} { }   // both buffers zeroed
+
+  /// Copies the bytes of alpha and beta into the zeroed buffers; ElementCompute must fit in 16 bytes
+  template <typename ElementCompute>
+  CUTLASS_HOST_DEVICE
+  LinearCombinationParams(ElementCompute alpha, ElementCompute beta)
+  : alpha_data {0lu, 0lu}, beta_data {0lu, 0lu}
+  {
+    static_assert(sizeof(ElementCompute) <= sizeof(uint64_t[2]), "ElementCompute exceeds the 16-byte alpha/beta storage");
+#if defined(__CUDA_ARCH__)
+    reinterpret_cast<ElementCompute&>(alpha_data) = alpha;
+    reinterpret_cast<ElementCompute&>(beta_data) = beta;
+#else
+    memcpy( alpha_data, &alpha, sizeof(ElementCompute) );
+    memcpy( beta_data, &beta, sizeof(ElementCompute) );
+#endif
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_planar_complex.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_planar_complex.h
new file mode 100755
index 000000000..ff32f13b0
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_planar_complex.h
@@ -0,0 +1,236 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Functor performing linear combination operations on planar-complex arrays
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/complex.h"
+#include "cutlass/array_planar_complex.h"
+#include "cutlass/functional.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/epilogue/thread/scale_type.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Applies a linear combination operator to arrays of planar-complex elements.
+///
+/// D = alpha * accumulator + beta * source, where alpha and beta are complex scalars
+///
+/// Note, as with most CUTLASS components for planar complex, the template arguments describe
+/// the underlying real data type.
+template <
+  typename ElementOutput_,                             ///< Data type used to load and store tensors
+  int Count,                                           ///< Number of elements computed per operation
+                                                       ///< Usually it is 128/sizeof_bits<ElementOutput_>,
+                                                       ///< but we use 64 or 32 sometimes when there are not enough data to store
+  typename ElementAccumulator_ = ElementOutput_,       ///< Accumulator data type
+  typename ElementCompute_ = ElementOutput_,           ///< Data type used to compute linear combination
+  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest,  ///< Rounding style handed to the numeric converters
+  ScaleType::Kind Scale = ScaleType::Default           ///< Control Alpha and Beta scaling
+>
+class LinearCombinationPlanarComplex {
+public:
+
+  using ElementOutput = ElementOutput_;
+  using ElementAccumulator = ElementAccumulator_;
+  using ElementCompute = ElementCompute_;
+  using ElementScalar = complex<ElementCompute>;
+
+  static int const kCount = Count;
+  static const ScaleType::Kind kScale = Scale;
+
+  using FragmentOutput = ArrayPlanarComplex<ElementOutput, kCount>;
+  using FragmentAccumulator = ArrayPlanarComplex<ElementAccumulator, kCount>;
+  using ComputeFragment = ArrayPlanarComplex<ElementCompute, kCount>;
+
+  static FloatRoundStyle const kRound = Round;
+
+  /// Host-constructable parameters structure
+  struct Params {
+
+    ElementScalar alpha{ElementCompute(1)};         ///< scales accumulators
+    ElementScalar beta{ElementCompute(0)};          ///< scales source tensor
+    ElementScalar const* alpha_ptr{nullptr};        ///< pointer to accumulator scalar - if not null, loads it from memory
+    ElementScalar const* beta_ptr{nullptr};         ///< pointer to source scalar - if not null, loads it from memory
+
+    //
+    // Methods
+    //
+
+    Params() = default;  // uses the member initializers above: alpha = 1, beta = 0, both pointers null
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementScalar alpha,
+      ElementScalar beta
+    ): alpha(alpha), beta(beta)  // scalars passed by value; both pointers stay null
+    {}
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementScalar const *alpha_ptr,
+      ElementScalar const *beta_ptr
+    ): alpha_ptr(alpha_ptr), beta_ptr(beta_ptr)  // by-value alpha/beta keep their member-initializer defaults
+    {}
+  };
+
+private:
+
+  //
+  // Data members
+  //
+
+  ElementScalar alpha_;
+  ElementScalar beta_;
+
+public:
+
+  /// Constructs the function object; a non-null alpha_ptr/beta_ptr is dereferenced here and overrides the by-value scalar
+  CUTLASS_HOST_DEVICE
+  LinearCombinationPlanarComplex(Params const &params) {
+    alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
+    beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
+  }
+
+  /// Returns true if source is needed: never for OnlyAlphaScaling, otherwise whenever beta has a non-zero part
+  CUTLASS_HOST_DEVICE
+  bool is_source_needed() const {
+    if (Scale == ScaleType::OnlyAlphaScaling) return false;
+
+    return beta_.real() != ElementCompute(0) || beta_.imag() != ElementCompute(0);
+  }
+
+  /// Functionally required for serial reduction in the epilogue
+  CUTLASS_HOST_DEVICE
+  void set_k_partition(int k_partition, int k_partition_count) {
+    if (k_partition) {
+      beta_ = ElementCompute(1);  // every partition other than 0 replaces beta with ElementCompute(1)
+    }
+  }
+
+  /// Computes linear scaling: D = alpha * accumulator + beta * source
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(
+    FragmentAccumulator const &accumulator, 
+    FragmentOutput const &source) const {
+
+    // Convert source and accumulator to internal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementOutput, kCount, Round> source_converter;
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
+
+    ComputeFragment converted_source{
+      source_converter(source.real), 
+      source_converter(source.imag)};
+
+    ComputeFragment converted_accumulator{
+      accumulator_converter(accumulator.real), 
+      accumulator_converter(accumulator.imag)};
+
+    multiplies<Array<ElementCompute, kCount> > mul_op;
+    multiply_add<Array<ElementCompute, kCount> > mul_add_op;
+
+    // Perform binary operations
+  
+    // complex multiply: I = beta * C (real-part products first, imaginary-part products added below)
+    ComputeFragment intermediate {
+      mul_op(beta_.real(), converted_source.real),
+      mul_op(beta_.real(), converted_source.imag)
+    };
+
+    intermediate.real = mul_add_op(-beta_.imag(), converted_source.imag, intermediate.real);
+    intermediate.imag = mul_add_op( beta_.imag(), converted_source.real, intermediate.imag);
+
+    // complex multiply-add: I = alpha * AB + I
+    intermediate.real = mul_add_op(alpha_.real(), converted_accumulator.real, intermediate.real);
+    intermediate.imag = mul_add_op(alpha_.real(), converted_accumulator.imag, intermediate.imag);
+
+    intermediate.real = mul_add_op(-alpha_.imag(), converted_accumulator.imag, intermediate.real);
+    intermediate.imag = mul_add_op( alpha_.imag(), converted_accumulator.real, intermediate.imag);
+
+    // Convert to destination numeric type
+    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
+
+    return FragmentOutput{
+      destination_converter(intermediate.real), 
+      destination_converter(intermediate.imag)};
+  }
+
+  /// Computes linear scaling without a source operand: D = alpha * accumulator
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(
+    FragmentAccumulator const &accumulator) const {
+
+    // Convert accumulator to internal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
+
+    ComputeFragment converted_accumulator{
+      accumulator_converter(accumulator.real), 
+      accumulator_converter(accumulator.imag)};
+
+    // Perform binary operations
+    multiplies<Array<ElementCompute, kCount> > mul_op;
+    multiply_add<Array<ElementCompute, kCount> > mul_add_op;
+
+    // complex multiply: I = alpha * AB (real-part products first, imaginary-part products added below)
+    ComputeFragment intermediate {
+      mul_op(alpha_.real(), converted_accumulator.real),
+      mul_op(alpha_.real(), converted_accumulator.imag)
+    };
+
+    intermediate.real = mul_add_op(-alpha_.imag(), converted_accumulator.imag, intermediate.real);
+    intermediate.imag = mul_add_op( alpha_.imag(), converted_accumulator.real, intermediate.imag);
+
+    // Convert to destination numeric type
+    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
+
+    return FragmentOutput{
+      destination_converter(intermediate.real), 
+      destination_converter(intermediate.imag)};
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_relu.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_relu.h
new file mode 100755
index 000000000..bbdc49862
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_relu.h
@@ -0,0 +1,572 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Functor performing linear combination with a maximum operation used by epilogues.
+*/
+
+#pragma once
+
+#include "cutlass/half.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/functional.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/epilogue/thread/activation.h"
+#include "cutlass/epilogue/thread/scale_type.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+/// Single source of truth for `LinearCombinationRelu::kIsHeavy` (presumably an unrolling hint for the epilogue - confirm)
+constexpr bool LinearCombinationReluIsHeavy() {
+  return false;
+}
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Applies a linear combination followed by a thresholded ReLU to an array of elements.
+///
+/// D = relu(threshold, alpha * accumulator + beta * source)
+///
+template <
+  typename ElementOutput_,                             ///< Data type used to load and store tensors
+  int Count,                                           ///< Number of elements computed per operation
+                                                       ///< Usually it is 128/sizeof_bits<ElementOutput_>,
+                                                       ///< but we use 64 or 32 sometimes when there are not enough data to store
+  typename ElementAccumulator_ = ElementOutput_,       ///< Accumulator data type
+  typename ElementCompute_ = ElementOutput_,           ///< Data type used to compute linear combination
+  ScaleType::Kind Scale = ScaleType::Default,          ///< Control Alpha and Beta scaling
+  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest  ///< Rounding style handed to the numeric converters
+>
+class LinearCombinationRelu {
+public:
+
+  using ElementOutput = ElementOutput_;
+  using ElementAccumulator = ElementAccumulator_;
+  using ElementCompute = ElementCompute_;
+
+  static int const kCount = Count;
+  static const ScaleType::Kind kScale = Scale;
+
+  using FragmentOutput = Array<ElementOutput, kCount>;
+  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
+  using FragmentCompute = Array<ElementCompute, kCount>;
+  using FragmentScaleBias = Array<ElementCompute, kCount>;
+  using FragmentSource = Array<ElementOutput, kCount>;
+
+  static FloatRoundStyle const kRound = Round;
+
+  static bool const kIsHeavy = detail::LinearCombinationReluIsHeavy();
+
+  /// Host-constructable parameters structure
+  struct Params {
+
+    ElementCompute alpha;                  ///< scales accumulators
+    ElementCompute beta;                   ///< scales source tensor
+    ElementCompute threshold;              ///< minimum value that is output 
+    ElementCompute const *alpha_ptr;       ///< pointer to accumulator scalar - if not null, loads it from memory
+    ElementCompute const *beta_ptr;        ///< pointer to source scalar - if not null, loads it from memory
+    //
+    // Methods
+    //
+
+    CUTLASS_HOST_DEVICE
+    Params():   // defaults: alpha = 1, beta = 0, threshold = 0, both pointers null
+      alpha(ElementCompute(1)), 
+      beta(ElementCompute(0)),
+      threshold(ElementCompute(0)), 
+      alpha_ptr(nullptr), 
+      beta_ptr(nullptr) { }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute alpha,
+      ElementCompute beta = ElementCompute(0),
+      ElementCompute threshold = ElementCompute(0)
+    ): alpha(alpha), beta(beta), threshold(threshold), alpha_ptr(nullptr), beta_ptr(nullptr) {
+      // scalars passed by value; both pointers are null
+    }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute const *alpha_ptr,
+      ElementCompute const *beta_ptr = nullptr,
+      ElementCompute threshold = ElementCompute(0)
+    ): alpha(0), beta(0), threshold(threshold), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {
+      // scalars supplied through pointers; by-value alpha/beta are zero and are used only for a null pointer
+    }
+  };
+
+private:
+
+  //
+  // Data members
+  //
+
+  ElementCompute alpha_;
+  ElementCompute beta_;
+  ElementCompute threshold_;
+
+public:
+
+  /// Constructs the function object; a non-null alpha_ptr/beta_ptr is dereferenced here and overrides the by-value scalar
+  CUTLASS_HOST_DEVICE
+  LinearCombinationRelu(Params const &params) {
+
+    alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
+    beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
+    threshold_ = params.threshold;
+  }
+
+  /// Returns true if source is needed; decided by Scale first, and by beta != 0 only when no Scale rule applies
+  CUTLASS_HOST_DEVICE
+  bool is_source_needed() const {
+    if (Scale == ScaleType::NoBetaScaling) return true;
+
+    if (Scale == ScaleType::OnlyAlphaScaling) return false;
+
+    if (Scale == ScaleType::OnlyAlphaPerChannelScaling) return false;
+
+    if (Scale == ScaleType::Nothing) return false;
+
+    return beta_ != ElementCompute(0);
+  }
+
+  /// Functionally required for serial reduction in the epilogue
+  CUTLASS_HOST_DEVICE
+  void set_k_partition(int k_partition, int k_partition_count) {
+    if (k_partition) {
+      beta_ = ElementCompute(1);  // every partition other than 0 replaces beta with 1
+    }
+
+    if (k_partition != k_partition_count - 1) {
+      // all-ones bit pattern (a NaN for IEEE floating-point ElementCompute), meant to make ReLU a no-op for all except the last k partition
+      int64_t allones = -1;
+      threshold_ = reinterpret_cast<ElementCompute const &>(allones);
+    }
+  }
+  
+  /// Computes linear scaling followed by ReLU: D = relu(threshold, alpha * accumulator + beta * source)
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(
+    FragmentAccumulator const &accumulator, 
+    FragmentOutput const &source) const {
+
+    // Convert source and accumulator to internal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementOutput, kCount, Round> source_converter;
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
+
+    FragmentCompute converted_source = source_converter(source);
+    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
+
+    // Perform binary operations
+    FragmentCompute intermediate;
+
+    multiplies<FragmentCompute> mul_add_source;
+    multiply_add<FragmentCompute> mul_add_accumulator;
+    ReLu<FragmentCompute> relu;
+
+    if (Scale == ScaleType::NoBetaScaling) {
+      intermediate = converted_source;
+      intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
+    } else if (Scale == ScaleType::Nothing) {
+      intermediate = converted_accumulator;
+    } else {
+      intermediate = mul_add_source(beta_, converted_source);                             // X =  beta * C
+      intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
+    }
+
+    // Apply ReLU with the stored threshold
+    intermediate = relu(threshold_, intermediate);
+
+    // Convert to destination numeric type
+    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
+
+    return destination_converter(intermediate);
+  }
+
+  /// Computes linear scaling followed by ReLU, without a source operand: D = relu(threshold, alpha * accumulator)
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(
+    FragmentAccumulator const &accumulator) const {
+
+    // Convert accumulator to internal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
+
+    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
+
+    // Perform binary operations
+    FragmentCompute intermediate;
+
+    multiplies<FragmentCompute> mul_accumulator;
+    ReLu<FragmentCompute> relu;
+
+    if (Scale == ScaleType::Nothing) {
+      intermediate = converted_accumulator;
+    } else {
+      intermediate = mul_accumulator(alpha_, converted_accumulator);    // D = alpha * Accum
+    }
+
+    // Apply ReLU with the stored threshold
+    intermediate = relu(threshold_, intermediate);
+
+    // Convert to destination numeric type
+    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
+
+    return destination_converter(intermediate);
+  }
+
+  /// Computes per-channel linear scaling and bias, then ReLU : D = relu(threshold, scale * accumulator + bias)
+  /// Scale and Bias are from input Fragment; scale is used only for OnlyAlphaPerChannelScaling, otherwise alpha is
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(
+    FragmentAccumulator const &accumulator,
+    FragmentScaleBias const &scale,
+    FragmentScaleBias const &bias) const {
+    
+    // Convert accumulator to internal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
+
+    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
+
+    // Perform per-channel scale and bias
+    FragmentCompute intermediate;
+
+    multiply_add<FragmentCompute> mul_add_accumulator;
+
+    if(Scale == ScaleType::OnlyAlphaPerChannelScaling)
+      intermediate = mul_add_accumulator(scale, converted_accumulator, bias);    // D = scale * Accum + bias
+    else
+      intermediate = mul_add_accumulator(alpha_, converted_accumulator, bias);   // D = alpha * Accum + bias
+
+    ReLu<FragmentCompute> relu;
+
+    // Apply ReLU with the stored threshold
+    intermediate = relu(threshold_, intermediate);
+
+    // Convert to destination numeric type
+    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
+
+    return destination_converter(intermediate);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Conditional guards to enable partial specialization for packed integers
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 720) && ((__CUDACC_VER_MAJOR__ > 10) || ((__CUDACC_VER_MAJOR__ >= 10) && (__CUDACC_VER_MINOR__ >= 2)))
+
+/// Applies a linear combination followed by a thresholded ReLU to an array of elements.
+///
+/// D = relu(threshold, alpha * accumulator + beta * source)
+///
+/// Specialization for int accumulators with float compute: integer outputs are converted float -> int -> ElementOutput
+
+template <
+  typename ElementOutput_,                             ///< Data type used to load and store tensors
+  int Count,                                           ///< Number of elements computed per operation
+  ScaleType::Kind Scale,                               ///< Control Alpha and Beta scaling
+  FloatRoundStyle Round                                ///< Rounding style handed to the numeric converters
+>
+class LinearCombinationRelu <ElementOutput_, Count, int, float, Scale, Round> {
+public:
+
+  using ElementOutput = ElementOutput_;
+  using ElementAccumulator = int;
+  using ElementCompute = float;
+
+  static bool const kIsHeavy = detail::LinearCombinationReluIsHeavy();
+
+  static int const kCount = Count;
+  static const ScaleType::Kind kScale = Scale;
+
+  using FragmentOutput = Array<ElementOutput, kCount>;
+  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
+  using FragmentCompute = Array<ElementCompute, kCount>;
+  using FragmentScaleBias = Array<ElementCompute, kCount>;
+  using FragmentSource = Array<ElementOutput, kCount>;
+
+  static FloatRoundStyle const kRound = Round;
+
+  /// Host-constructable parameters structure
+  struct Params {
+
+    ElementCompute alpha;                  ///< scales accumulators
+    ElementCompute beta;                   ///< scales source tensor
+    ElementCompute threshold;              ///< minimum value that is output 
+    ElementCompute const *alpha_ptr;       ///< pointer to accumulator scalar - if not null, loads it from memory
+    ElementCompute const *beta_ptr;        ///< pointer to source scalar - if not null, loads it from memory
+    //
+    // Methods
+    //
+
+    CUTLASS_HOST_DEVICE
+    Params():   // defaults: alpha = 1, beta = 0, threshold = 0, both pointers null
+      alpha(ElementCompute(1)), 
+      beta(ElementCompute(0)),
+      threshold(ElementCompute(0)), 
+      alpha_ptr(nullptr), 
+      beta_ptr(nullptr) { }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute alpha,
+      ElementCompute beta = ElementCompute(0),
+      ElementCompute threshold = ElementCompute(0)
+    ): alpha(alpha), beta(beta), threshold(threshold), alpha_ptr(nullptr), beta_ptr(nullptr) {
+      // scalars passed by value; both pointers are null
+    }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute const *alpha_ptr,
+      ElementCompute const *beta_ptr = nullptr,
+      ElementCompute threshold = ElementCompute(0)
+    ): alpha(0), beta(0), threshold(threshold), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {
+      // scalars supplied through pointers; by-value alpha/beta are zero and are used only for a null pointer
+    }
+  };
+
+private:
+
+  //
+  // Data members
+  //
+
+  ElementCompute alpha_;
+  ElementCompute beta_;
+  ElementCompute threshold_;
+
+public:
+
+  /// Constructs the function object; a non-null alpha_ptr/beta_ptr is dereferenced here and overrides the by-value scalar
+  CUTLASS_HOST_DEVICE
+  LinearCombinationRelu(Params const &params) {
+
+    alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
+    beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
+    threshold_ = params.threshold;
+  }
+
+  /// Returns true if source is needed; decided by Scale first, and by beta != 0 only when no Scale rule applies
+  CUTLASS_HOST_DEVICE
+  bool is_source_needed() const {
+    if (Scale == ScaleType::NoBetaScaling) return true;
+
+    if (Scale == ScaleType::OnlyAlphaScaling) return false;
+
+    if (Scale == ScaleType::OnlyAlphaPerChannelScaling) return false;
+
+    if (Scale == ScaleType::Nothing) return false;
+
+    return beta_ != ElementCompute(0);
+  }
+
+  /// Functionally required for serial reduction in the epilogue
+  CUTLASS_HOST_DEVICE
+  void set_k_partition(int k_partition, int k_partition_count) {
+    if (k_partition) {
+      beta_ = ElementCompute(1);  // every partition other than 0 replaces beta with 1
+    }
+
+    if (k_partition != k_partition_count - 1) {
+      // all-ones bit pattern (a float NaN), meant to make ReLU a no-op for all except the last k partition
+      int64_t allones = -1;
+      threshold_ = reinterpret_cast<ElementCompute const &>(allones);
+    }
+  }
+  
+  /// Computes linear scaling followed by ReLU: D = relu(threshold, alpha * accumulator + beta * source)
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(
+    FragmentAccumulator const &accumulator, 
+    FragmentOutput const &source) const {
+
+    // Convert source and accumulator to internal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementOutput, kCount, Round> source_converter;
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
+
+    FragmentCompute converted_source = source_converter(source);
+    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
+
+    // Perform binary operations
+    FragmentCompute intermediate;
+
+    multiplies<FragmentCompute> mul_add_source;
+    multiply_add<FragmentCompute> mul_add_accumulator;
+    ReLu<FragmentCompute> relu;
+
+    if (Scale == ScaleType::NoBetaScaling) {
+      intermediate = converted_source;
+      intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
+    }  else if (Scale == ScaleType::Nothing) {
+      intermediate = converted_accumulator;
+    } else {
+      intermediate = mul_add_source(beta_, converted_source);                             // X =  beta * C
+      intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
+    }
+
+    // Apply ReLU with the stored threshold
+    intermediate = relu(threshold_, intermediate);
+
+    if (cutlass::platform::numeric_limits<ElementOutput>::is_integer) {
+      // Integer output: convert float -> int first, then int -> ElementOutput
+      FragmentAccumulator scaled_accumulator;
+
+      NumericArrayConverter<int, ElementCompute, kCount, Round> compute_converter;
+
+      scaled_accumulator = compute_converter(intermediate);
+
+      // Convert to destination numeric type
+      NumericArrayConverter<ElementOutput, int, kCount, Round>
+          destination_converter;
+
+      return destination_converter(scaled_accumulator);
+    } else {
+      NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round>
+          destination_converter;
+      return destination_converter(intermediate);
+    }
+  }
+
+  /// Computes linear scaling followed by ReLU, without a source operand: D = relu(threshold, alpha * accumulator)
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(
+    FragmentAccumulator const &accumulator) const {
+
+    // Convert accumulator to internal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
+
+    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
+
+    // Perform binary operations
+    FragmentCompute intermediate;
+
+    multiplies<FragmentCompute> mul_accumulator;
+    ReLu<FragmentCompute> relu;
+
+    if (Scale == ScaleType::Nothing) {
+      intermediate = converted_accumulator;
+    } else {
+      intermediate = mul_accumulator(alpha_, converted_accumulator);    // D = alpha * Accum
+    }
+
+    // Apply ReLU with the stored threshold
+    intermediate = relu(threshold_, intermediate);
+
+    if (cutlass::platform::numeric_limits<ElementOutput>::is_integer) {
+      // Integer output: convert float -> int first, then int -> ElementOutput
+      FragmentAccumulator scaled_accumulator;
+
+      NumericArrayConverter<int, ElementCompute, kCount, Round> compute_converter;
+
+      scaled_accumulator = compute_converter(intermediate);
+
+      // Convert to destination numeric type
+      NumericArrayConverter<ElementOutput, int, kCount, Round>
+          destination_converter;
+
+      return destination_converter(scaled_accumulator);
+    } else {
+      NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round>
+          destination_converter;
+      return destination_converter(intermediate);
+    }
+  }
+
+  /// Computes per-channel linear scaling and bias, then ReLU : D = relu(threshold, scale * accumulator + bias)
+  /// Scale and Bias are from input Fragment; scale is used only for OnlyAlphaPerChannelScaling, otherwise alpha is
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(
+    FragmentAccumulator const &accumulator,
+    FragmentScaleBias const &scale,
+    FragmentScaleBias const &bias) const {
+    
+    // Convert accumulator to internal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
+
+    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
+
+    // Perform per-channel scale and bias
+    FragmentCompute intermediate;
+
+    multiply_add<FragmentCompute> mul_add_accumulator;
+
+    if(Scale == ScaleType::OnlyAlphaPerChannelScaling)
+      intermediate = mul_add_accumulator(scale, converted_accumulator, bias);    // D = scale * Accum + bias
+    else
+      intermediate = mul_add_accumulator(alpha_, converted_accumulator, bias);   // D = alpha * Accum + bias
+
+    ReLu<FragmentCompute> relu;
+
+    // Apply ReLU with the stored threshold
+    intermediate = relu(threshold_, intermediate);
+
+    if (cutlass::platform::numeric_limits<ElementOutput>::is_integer) {
+      // Integer output: convert float -> int first, then int -> ElementOutput
+      FragmentAccumulator scaled_accumulator;
+
+      NumericArrayConverter<int, ElementCompute, kCount, Round> compute_converter;
+
+      scaled_accumulator = compute_converter(intermediate);
+
+      // Convert to destination numeric type
+      NumericArrayConverter<ElementOutput, int, kCount, Round>
+          destination_converter;
+
+      return destination_converter(scaled_accumulator);
+    } else {
+      NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round>
+          destination_converter;
+      return destination_converter(intermediate);
+    }
+  }
+};
+
+#endif // Conditional guards to enable partial specialization for packed integers
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_relu0.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_relu0.h
new file mode 100755
index 000000000..76ad59244
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_relu0.h
@@ -0,0 +1,543 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Functor performing linear combination with a relu operation used by epilogues.
+  This one only supports relu0 and tries to fold relu into other instructions.  Thus,
+  serial split-K is not supported by this one.  For example, relu can be folded into
+  hfma2/hmul2 on sm80+.
+*/
+
+#pragma once
+
+#include "cutlass/half.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/functional.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/epilogue/thread/activation.h"
+#include "cutlass/epilogue/thread/scale_type.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+/// Single source of truth for the `kIsHeavy` flag of `LinearCombinationRelu0` (whether to unroll)
+constexpr bool LinearCombinationRelu0IsHeavy() {
+  return false;
+}
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Applies a linear combination followed by the ReLu functor to an array of elements.
+///
+/// D = relu(alpha * accumulator + beta * source)
+///
+template <
+  typename ElementOutput_,                             ///< Data type used to load and store tensors
+  int Count,                                           ///< Number of elements computed per operation
+                                                       ///< Usually it is 128/sizeof_bits<ElementOutput_>,
+                                                       ///< but we use 64 or 32 sometimes when there is not enough data to store
+  typename ElementAccumulator_ = ElementOutput_,       ///< Accumulator data type
+  typename ElementCompute_ = ElementOutput_,           ///< Data type used to compute linear combination
+  ScaleType::Kind Scale = ScaleType::Default,          ///< Control Alpha and Beta scaling
+  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
+>
+class LinearCombinationRelu0 {
+public:
+
+  using ElementOutput = ElementOutput_;
+  using ElementAccumulator = ElementAccumulator_;
+  using ElementCompute = ElementCompute_;
+
+  static int const kCount = Count;
+  static const ScaleType::Kind kScale = Scale;
+
+  using FragmentOutput = Array<ElementOutput, kCount>;
+  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
+  using FragmentCompute = Array<ElementCompute, kCount>;
+  using FragmentScaleBias = Array<ElementCompute, kCount>;
+  using FragmentSource = Array<ElementOutput, kCount>;
+
+  static FloatRoundStyle const kRound = Round;
+
+  static bool const kIsHeavy = detail::LinearCombinationRelu0IsHeavy();
+
+  /// Host-constructible parameters structure
+  struct Params {
+
+    ElementCompute alpha;                  ///< scales accumulators
+    ElementCompute beta;                   ///< scales source tensor
+    ElementCompute const *alpha_ptr;       ///< pointer to accumulator scalar - if not null, loads it from memory
+    ElementCompute const *beta_ptr;        ///< pointer to source scalar - if not null, loads it from memory
+    //
+    // Methods
+    //
+
+    CUTLASS_HOST_DEVICE
+    Params(): 
+      alpha(ElementCompute(1)), 
+      beta(ElementCompute(0)),
+      alpha_ptr(nullptr), 
+      beta_ptr(nullptr) { }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute alpha,
+      ElementCompute beta = ElementCompute(0)
+    ): alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) {
+
+    }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute const *alpha_ptr,
+      ElementCompute const *beta_ptr = nullptr
+    ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {
+
+    }
+  };
+
+private:
+
+  //
+  // Data members
+  //
+
+  ElementCompute alpha_;
+  ElementCompute beta_;
+
+public:
+
+  /// Constructs the function object; non-null alpha_ptr/beta_ptr are dereferenced here and override alpha/beta
+  CUTLASS_HOST_DEVICE
+  LinearCombinationRelu0(Params const &params) {
+
+    alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
+    beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
+  }
+
+  /// Returns true if source is needed: always for NoBetaScaling, never for OnlyAlphaScaling/Nothing, else when beta != 0
+  CUTLASS_HOST_DEVICE
+  bool is_source_needed() const {
+    if (Scale == ScaleType::NoBetaScaling) return true;
+
+    if (Scale == ScaleType::OnlyAlphaScaling) return false;
+
+    if (Scale == ScaleType::Nothing) return false;
+
+    return beta_ != ElementCompute(0);
+  }
+
+  /// Serial (split-K) reduction hook; not supported by Relu0, so only partition 0 is accepted (assert)
+  CUTLASS_HOST_DEVICE
+  void set_k_partition(int k_partition, int k_partition_count) {
+    assert(k_partition == 0);
+  }
+  
+  /// Computes linear scaling with ReLu: D = relu(alpha * accumulator + beta * source)
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(
+    FragmentAccumulator const &accumulator, 
+    FragmentOutput const &source) const {
+
+    // Convert source and accumulator to internal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementOutput, kCount, Round> source_converter;
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
+
+    FragmentCompute converted_source = source_converter(source);
+    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
+
+    // Perform binary operations
+    FragmentCompute intermediate;
+
+    multiplies<FragmentCompute> mul_add_source;
+    multiply_add_relu0<FragmentCompute> mul_add_relu0_accumulator;
+    ReLu<FragmentCompute> relu;
+
+    if (Scale == ScaleType::NoBetaScaling) {
+      intermediate = converted_source;
+      intermediate = mul_add_relu0_accumulator(alpha_, converted_accumulator, intermediate);    // D = relu0(alpha * Accum + X)
+    } else if (Scale == ScaleType::Nothing) {
+      intermediate = converted_accumulator;
+
+      // No scaling in this mode: apply the ReLu functor to the raw accumulator
+      intermediate = relu(intermediate);
+    } else {
+      intermediate = mul_add_source(beta_, converted_source);                             // X = beta * C
+      intermediate = mul_add_relu0_accumulator(alpha_, converted_accumulator, intermediate);    // D = relu0(alpha * Accum + X)
+    }
+
+    // Convert to destination numeric type
+    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
+
+    return destination_converter(intermediate);
+  }
+
+  /// Computes linear scaling with ReLu: D = relu(alpha * accumulator)
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(
+    FragmentAccumulator const &accumulator) const {
+
+    // Convert accumulator to internal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
+
+    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
+
+    // Perform binary operations
+    FragmentCompute intermediate;
+
+    multiplies<FragmentCompute> mul_accumulator;
+    ReLu<FragmentCompute> relu;
+
+    if (Scale == ScaleType::Nothing) {
+      intermediate = converted_accumulator;
+    } else {
+      intermediate = mul_accumulator(alpha_, converted_accumulator);    // D = alpha * Accum
+    }
+
+    // Apply the ReLu functor (unconditional; no threshold is involved)
+    intermediate = relu(intermediate);
+
+    // Convert to destination numeric type
+    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
+
+    return destination_converter(intermediate);
+  }
+
+  /// Computes per-channel linear scaling and bias with ReLu: D = relu(scale * accumulator + bias)
+  /// Scale and Bias are from input Fragment; scalar alpha replaces scale unless Scale is OnlyAlphaPerChannelScaling
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(
+    FragmentAccumulator const &accumulator,
+    FragmentScaleBias const &scale,
+    FragmentScaleBias const &bias) const {
+    
+    // Convert accumulator to internal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
+
+    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
+
+    // Perform per-channel scale and bias
+    FragmentCompute intermediate;
+
+    multiply_add<FragmentCompute> mul_add_accumulator;
+
+    if(Scale == ScaleType::OnlyAlphaPerChannelScaling)
+      intermediate = mul_add_accumulator(scale, converted_accumulator, bias);    // D = scale * Accum + bias
+    else
+      intermediate = mul_add_accumulator(alpha_, converted_accumulator, bias);   // D = alpha * Accum + bias
+
+    ReLu<FragmentCompute> relu;
+
+    // Apply the ReLu functor (unconditional; no threshold is involved)
+    intermediate = relu(intermediate);
+
+    // Convert to destination numeric type
+    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
+
+    return destination_converter(intermediate);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Conditional guards to enable partial specialization for packed integers
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 720) && ((__CUDACC_VER_MAJOR__ > 10) || ((__CUDACC_VER_MAJOR__ >= 10) && (__CUDACC_VER_MINOR__ >= 2)))
+
+/// Applies a linear combination followed by the ReLu functor to an array of elements.
+///
+/// D = relu(alpha * accumulator + beta * source)
+///
+/// Specialization for int accumulators computed in float; integral outputs are converted via int
+
+template <
+  typename ElementOutput_,                             ///< Data type used to load and store tensors
+  int Count,                                           ///< Number of elements computed per operation
+  ScaleType::Kind Scale,                               ///< Control Alpha and Beta scaling
+  FloatRoundStyle Round
+>
+class LinearCombinationRelu0 <ElementOutput_, Count, int, float, Scale, Round> {
+public:
+
+  using ElementOutput = ElementOutput_;
+  using ElementAccumulator = int;
+  using ElementCompute = float;
+
+  static bool const kIsHeavy = detail::LinearCombinationRelu0IsHeavy();
+
+  static int const kCount = Count;
+  static const ScaleType::Kind kScale = Scale;
+
+  using FragmentOutput = Array<ElementOutput, kCount>;
+  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
+  using FragmentCompute = Array<ElementCompute, kCount>;
+  using FragmentScaleBias = Array<ElementCompute, kCount>;
+  using FragmentSource = Array<ElementOutput, kCount>;
+
+  static FloatRoundStyle const kRound = Round;
+
+  /// Host-constructible parameters structure
+  struct Params {
+
+    ElementCompute alpha;                  ///< scales accumulators
+    ElementCompute beta;                   ///< scales source tensor
+    ElementCompute const *alpha_ptr;       ///< pointer to accumulator scalar - if not null, loads it from memory
+    ElementCompute const *beta_ptr;        ///< pointer to source scalar - if not null, loads it from memory
+    //
+    // Methods
+    //
+
+    CUTLASS_HOST_DEVICE
+    Params(): 
+      alpha(ElementCompute(1)), 
+      beta(ElementCompute(0)),
+      alpha_ptr(nullptr), 
+      beta_ptr(nullptr) { }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute alpha,
+      ElementCompute beta = ElementCompute(0)
+    ): alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) {
+
+    }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute const *alpha_ptr,
+      ElementCompute const *beta_ptr = nullptr
+    ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {
+
+    }
+  };
+
+private:
+
+  //
+  // Data members
+  //
+
+  ElementCompute alpha_;
+  ElementCompute beta_;
+
+public:
+
+  /// Constructs the function object; non-null alpha_ptr/beta_ptr are dereferenced here and override alpha/beta
+  CUTLASS_HOST_DEVICE
+  LinearCombinationRelu0(Params const &params) {
+
+    alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
+    beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
+  }
+
+  /// Returns true if source is needed: always for NoBetaScaling, never for OnlyAlphaScaling/Nothing, else when beta != 0
+  CUTLASS_HOST_DEVICE
+  bool is_source_needed() const {
+    if (Scale == ScaleType::NoBetaScaling) return true;
+
+    if (Scale == ScaleType::OnlyAlphaScaling) return false;
+
+    if (Scale == ScaleType::Nothing) return false;
+
+    return beta_ != ElementCompute(0);
+  }
+
+  /// Serial (split-K) reduction hook; not supported by Relu0, so only partition 0 is accepted (assert)
+  CUTLASS_HOST_DEVICE
+  void set_k_partition(int k_partition, int k_partition_count) {
+    assert(k_partition == 0);
+  }
+  
+  /// Computes linear scaling with ReLu: D = relu(alpha * accumulator + beta * source)
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(
+    FragmentAccumulator const &accumulator, 
+    FragmentOutput const &source) const {
+
+    // Convert source and accumulator to internal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementOutput, kCount, Round> source_converter;
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
+
+    FragmentCompute converted_source = source_converter(source);
+    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
+
+    // Perform binary operations
+    FragmentCompute intermediate;
+
+    multiplies<FragmentCompute> mul_add_source;
+    multiply_add<FragmentCompute> mul_add_accumulator;
+    ReLu<FragmentCompute> relu;
+
+    if (Scale == ScaleType::NoBetaScaling) {
+      intermediate = converted_source;
+      intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
+    }  else if (Scale == ScaleType::Nothing) {
+      intermediate = converted_accumulator;
+    } else {
+      intermediate = mul_add_source(beta_, converted_source);                             // X = beta * C
+      intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
+    }
+
+    // Apply the ReLu functor (unconditional; no threshold is involved)
+    intermediate = relu(intermediate);
+
+    if (cutlass::platform::numeric_limits<ElementOutput>::is_integer) {
+      // Integral output: convert floats back to int first
+      FragmentAccumulator scaled_accumulator;
+
+      NumericArrayConverter<int, ElementCompute, kCount, Round> compute_converter;
+
+      scaled_accumulator = compute_converter(intermediate);
+
+      // Then convert int to the destination numeric type
+      NumericArrayConverter<ElementOutput, int, kCount, Round>
+          destination_converter;
+
+      return destination_converter(scaled_accumulator);
+    } else {
+      NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round>
+          destination_converter;
+      return destination_converter(intermediate);
+    }
+  }
+
+  /// Computes linear scaling with ReLu: D = relu(alpha * accumulator)
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(
+    FragmentAccumulator const &accumulator) const {
+
+    // Convert accumulator to internal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
+
+    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
+
+    // Perform binary operations
+    FragmentCompute intermediate;
+
+    multiplies<FragmentCompute> mul_accumulator;
+    ReLu<FragmentCompute> relu;
+
+    if (Scale == ScaleType::Nothing) {
+      intermediate = converted_accumulator;
+    } else {
+      intermediate = mul_accumulator(alpha_, converted_accumulator);    // D = alpha * Accum
+    }
+
+    // Apply the ReLu functor (unconditional; no threshold is involved)
+    intermediate = relu(intermediate);
+
+    if (cutlass::platform::numeric_limits<ElementOutput>::is_integer) {
+      // Integral output: convert floats back to int first
+      FragmentAccumulator scaled_accumulator;
+
+      NumericArrayConverter<int, ElementCompute, kCount, Round> compute_converter;
+
+      scaled_accumulator = compute_converter(intermediate);
+
+      // Then convert int to the destination numeric type
+      NumericArrayConverter<ElementOutput, int, kCount, Round>
+          destination_converter;
+
+      return destination_converter(scaled_accumulator);
+    } else {
+      NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round>
+          destination_converter;
+      return destination_converter(intermediate);
+    }
+  }
+
+  /// Computes per-channel linear scaling and bias with ReLu: D = relu(scale * accumulator + bias)
+  /// Scale and Bias are from input Fragment; scalar alpha replaces scale unless Scale is OnlyAlphaPerChannelScaling
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(
+    FragmentAccumulator const &accumulator,
+    FragmentScaleBias const &scale,
+    FragmentScaleBias const &bias) const {
+    
+    // Convert accumulator to internal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
+
+    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
+
+    // Perform per-channel scale and bias
+    FragmentCompute intermediate;
+
+    multiply_add<FragmentCompute> mul_add_accumulator;
+
+    if(Scale == ScaleType::OnlyAlphaPerChannelScaling)
+      intermediate = mul_add_accumulator(scale, converted_accumulator, bias);    // D = scale * Accum + bias
+    else
+      intermediate = mul_add_accumulator(alpha_, converted_accumulator, bias);   // D = alpha * Accum + bias
+
+    ReLu<FragmentCompute> relu;
+
+    // Apply the ReLu functor (unconditional; no threshold is involved)
+    intermediate = relu(intermediate);
+
+    if (cutlass::platform::numeric_limits<ElementOutput>::is_integer) {
+      // Integral output: convert floats back to int first
+      FragmentAccumulator scaled_accumulator;
+
+      NumericArrayConverter<int, ElementCompute, kCount, Round> compute_converter;
+
+      scaled_accumulator = compute_converter(intermediate);
+
+      // Then convert int to the destination numeric type
+      NumericArrayConverter<ElementOutput, int, kCount, Round>
+          destination_converter;
+
+      return destination_converter(scaled_accumulator);
+    } else {
+      NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round>
+          destination_converter;
+      return destination_converter(intermediate);
+    }
+  }
+};
+
+#endif // Conditional guards to enable partial specialization for packed integers
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_residual_block.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_residual_block.h
new file mode 100755
index 000000000..ec4083de6
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_residual_block.h
@@ -0,0 +1,301 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+  \brief Epilogue functor specialized for residual blocks in deep neural networks.
+*/
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/functional.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/epilogue/thread/detail.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+/// Models a two-residual block: UnaryOp(BinaryOp2(BinaryOp1(ActivationOp(alpha * TensorOp(X) + bias), beta * residual1), beta * residual2))
+template <typename ElementOutput_, typename ElementAccumulator_,
+          typename ElementCompute_, typename ElementC_, int ElementsPerAccess,
+          template <typename T> class ActivationOp_,
+          template <typename T> class BinaryOp1_,
+          template <typename T> class UnaryOp_,
+          template <typename T> class BinaryOp2_ = detail::NoOp,
+          bool StoreT_ = false,
+          typename ElementVector_ = ElementC_>
+class LinearCombinationResidualBlock {
+public:
+  static bool const kIsSingleSource = false;
+
+  using ElementOutput = ElementC_;
+  using ElementC = ElementC_;
+  using ElementAccumulator = ElementAccumulator_;
+  using ElementCompute = ElementCompute_;
+  using ElementVector = ElementVector_;
+  static int const kElementsPerAccess = ElementsPerAccess;
+  static int const kCount = kElementsPerAccess;
+
+  using UnaryOp = UnaryOp_<Array<ElementCompute, kCount>>;
+  using BinaryOp1 = BinaryOp1_<Array<ElementCompute, kCount>>;
+  using BinaryOp2 = BinaryOp2_<Array<ElementCompute, kCount>>;
+  using ActivationOp = ActivationOp_<Array<ElementCompute, kCount>>;
+
+  using FragmentAccumulator = Array<ElementAccumulator, kElementsPerAccess>;
+  using FragmentCompute = Array<ElementCompute, kElementsPerAccess>;
+  using FragmentC = Array<ElementC, kElementsPerAccess>;
+  using FragmentOutput = Array<ElementOutput, kElementsPerAccess>;
+
+  using ElementZ = ElementOutput_;
+  using ElementT = ElementZ;
+  using FragmentZ = Array<ElementZ, kElementsPerAccess>;
+  using FragmentT = Array<ElementT, kElementsPerAccess>;
+
+  static bool const kIsHeavy = true;
+  static bool const kStoreZ = true;
+  static bool const kStoreT = StoreT_;
+
+  /// Host-constructible parameters structure
+  struct Params {
+
+    ElementCompute alpha;                  ///< scales accumulators
+    ElementCompute beta;                   ///< scales residual input
+    ElementCompute const *alpha_ptr{nullptr};       ///< pointer to accumulator scalar - if not null, loads it from memory
+    ElementCompute const *beta_ptr{nullptr};        ///< pointer to residual scalar - if not null, loads it from memory
+
+    CUTLASS_HOST_DEVICE
+    Params() : alpha(ElementCompute(1)), beta(ElementCompute(1)) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(ElementCompute alpha, ElementCompute beta)
+        : alpha(alpha), beta(beta) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(ElementCompute const *alpha_ptr, ElementCompute const *beta_ptr)
+        : alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {}
+  };
+
+private:
+
+  ElementCompute alpha_;
+  ElementCompute beta_;
+  bool skip_elementwise_;
+
+public:
+
+  /// Constructor from Params; non-null alpha_ptr/beta_ptr are dereferenced here and override alpha/beta
+  CUTLASS_HOST_DEVICE
+  LinearCombinationResidualBlock(Params const &params) {
+    alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
+    beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
+    skip_elementwise_ = false;
+  }
+
+  /// The "source" tensor corresponds to the residual input
+  CUTLASS_HOST_DEVICE
+  bool is_source_needed() const { return true; }
+
+  /// Serial reduction hook: partitions after the first force beta to 1; all but the last skip UnaryOp.
+  /// IMPORTANT: Split-k is supported only when ActivationOp is Identity.
+  CUTLASS_HOST_DEVICE
+  void set_k_partition(int k_partition, int k_partition_count) {
+    if (k_partition) {
+      beta_ = ElementCompute(1);
+    }
+
+    if (k_partition != k_partition_count - 1) {
+      skip_elementwise_ = true;
+    }
+  }
+
+  /// Writes frag_Z = UnaryOp(BinaryOp2(BinaryOp1(ActivationOp(alpha * AB + bias), beta * residual1), beta * residual2)); UnaryOp is skipped when skip_elementwise_ is set
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentOutput &frag_Z, FragmentOutput &, FragmentAccumulator const &AB,
+                  FragmentC const &residual1, FragmentC const &residual2,
+                  FragmentCompute const &bias) const {
+    UnaryOp unary_op;
+    BinaryOp1 binary_op1;
+    BinaryOp2 binary_op2;
+    ActivationOp activation;
+
+    FragmentCompute tmp_Accum =
+        NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess>()(AB);
+    FragmentCompute tmp_residual1 =
+        NumericArrayConverter<ElementCompute, ElementC, kElementsPerAccess>()(residual1);
+    FragmentCompute tmp_residual2 =
+        NumericArrayConverter<ElementCompute, ElementC, kElementsPerAccess>()(residual2);
+
+    FragmentCompute z =
+        binary_op2(binary_op1(activation(alpha_ * tmp_Accum + bias), beta_ * tmp_residual1), beta_ * tmp_residual2);
+    FragmentCompute result_Z = skip_elementwise_ ? z : unary_op(z);
+
+    NumericArrayConverter<ElementOutput, ElementCompute, kElementsPerAccess> convert_z;
+    frag_Z = convert_z(result_Z);
+  }
+
+  /// Should never be called: overload without residual inputs, intentionally a no-op
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentOutput &, FragmentOutput &, FragmentAccumulator const &,
+                  FragmentCompute const &) const {}
+};
+
+/// Models a single-residual block (BinaryOp2 = NoOp): UnaryOp(BinaryOp(ActivationOp(alpha * TensorOp(X) + bias), beta * residual))
+template <typename ElementOutput_, typename ElementAccumulator_,
+          typename ElementCompute_, typename ElementC_, int ElementsPerAccess,
+          template <typename T> class ActivationOp_,
+          template <typename T> class BinaryOp1_,
+          template <typename T> class UnaryOp_,
+          bool StoreT_,
+          typename ElementVector_>
+class LinearCombinationResidualBlock<ElementOutput_, ElementAccumulator_,
+          ElementCompute_, ElementC_, ElementsPerAccess,
+          ActivationOp_, BinaryOp1_, UnaryOp_,
+          detail::NoOp, StoreT_, ElementVector_> {
+public:
+  static bool const kIsSingleSource = true;
+
+  using ElementOutput = ElementC_;
+  using ElementC = ElementC_;
+  using ElementAccumulator = ElementAccumulator_;
+  using ElementCompute = ElementCompute_;
+  using ElementVector = ElementVector_;
+  static int const kElementsPerAccess = ElementsPerAccess;
+  static int const kCount = kElementsPerAccess;
+
+  using UnaryOp = UnaryOp_<Array<ElementCompute, kCount>>;
+  using BinaryOp = BinaryOp1_<Array<ElementCompute, kCount>>;
+  using ActivationOp = ActivationOp_<Array<ElementCompute, kCount>>;
+
+  using FragmentAccumulator = Array<ElementAccumulator, kElementsPerAccess>;
+  using FragmentCompute = Array<ElementCompute, kElementsPerAccess>;
+  using FragmentC = Array<ElementC, kElementsPerAccess>;
+  using FragmentOutput = Array<ElementOutput, kElementsPerAccess>;
+
+  using ElementZ = ElementOutput_;
+  using ElementT = ElementZ;
+  using FragmentZ = Array<ElementZ, kElementsPerAccess>;
+  using FragmentT = Array<ElementT, kElementsPerAccess>;
+
+  static bool const kIsHeavy = true;
+  static bool const kStoreZ = true;
+  static bool const kStoreT = StoreT_;
+
+  /// Host-constructible parameters structure
+  struct Params {
+
+    ElementCompute alpha;                  ///< scales accumulators
+    ElementCompute beta;                   ///< scales residual input
+    ElementCompute const *alpha_ptr{nullptr};       ///< pointer to accumulator scalar - if not null, loads it from memory
+    ElementCompute const *beta_ptr{nullptr};        ///< pointer to residual scalar - if not null, loads it from memory
+
+    CUTLASS_HOST_DEVICE
+    Params() : alpha(ElementCompute(1)), beta(ElementCompute(1)) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(ElementCompute alpha, ElementCompute beta)
+        : alpha(alpha), beta(beta) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(ElementCompute const *alpha_ptr, ElementCompute const *beta_ptr)
+        : alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {}
+  };
+
+private:
+
+  ElementCompute alpha_;
+  ElementCompute beta_;
+  bool skip_elementwise_;
+
+public:
+
+  /// Constructor from Params; non-null alpha_ptr/beta_ptr are dereferenced here and override alpha/beta
+  CUTLASS_HOST_DEVICE
+  LinearCombinationResidualBlock(Params const &params) {
+    alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
+    beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
+    skip_elementwise_ = false;
+  }
+
+  /// The "source" tensor corresponds to the residual input
+  CUTLASS_HOST_DEVICE
+  bool is_source_needed() const { return true; }
+
+  /// Serial reduction hook: partitions after the first force beta to 1; all but the last skip UnaryOp.
+  /// IMPORTANT: Split-k is supported only when ActivationOp is Identity.
+  CUTLASS_HOST_DEVICE
+  void set_k_partition(int k_partition, int k_partition_count) {
+    if (k_partition) {
+      beta_ = ElementCompute(1);
+    }
+
+    if (k_partition != k_partition_count - 1) {
+      skip_elementwise_ = true;
+    }
+  }
+
+  /// Writes frag_Z = UnaryOp(BinaryOp(ActivationOp(alpha * AB + bias), beta * residual)); UnaryOp is skipped when skip_elementwise_ is set
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentOutput &frag_Z, FragmentOutput &, FragmentAccumulator const &AB,
+                  FragmentC const &residual,
+                  FragmentCompute const &bias) const {
+    UnaryOp unary_op;
+    BinaryOp binary_op;
+    ActivationOp activation;
+
+    FragmentCompute tmp_Accum =
+        NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess>()(AB);
+    FragmentCompute tmp_residual =
+        NumericArrayConverter<ElementCompute, ElementC, kElementsPerAccess>()(residual);
+
+    FragmentCompute z =
+        binary_op(activation(alpha_ * tmp_Accum + bias), beta_ * tmp_residual);
+    FragmentCompute result_Z = skip_elementwise_ ? z : unary_op(z);
+
+    NumericArrayConverter<ElementOutput, ElementCompute, kElementsPerAccess> convert_z;
+    frag_Z = convert_z(result_Z);
+  }
+
+  /// Should never be called: overload without a residual input, intentionally a no-op
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentOutput &, FragmentOutput &, FragmentAccumulator const &,
+                  FragmentCompute const &) const {}
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_sigmoid.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_sigmoid.h
new file mode 100755
index 000000000..35251177f
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_sigmoid.h
@@ -0,0 +1,70 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Functor performing linear combination with Sigmoid operations used by epilogues.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/epilogue/thread/activation.h"
+#include "cutlass/epilogue/thread/linear_combination_generic.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Applies a linear combination operator followed by the Sigmoid activation to an array of elements.
+///
+/// D = sigmoid(alpha * accumulator + beta * source + uniform)
+/// Pure alias: all behavior comes from LinearCombinationGeneric instantiated with Sigmoid.
+template <
+  typename ElementOutput_,                             ///< Data type used to load and store tensors
+  int Count,                                           ///< Number of elements computed per operation
+                                                       ///< Usually it is 128/sizeof_bits<ElementOutput_>,
+                                                       ///< but we use 64 or 32 sometimes when there are not enough data to store
+  typename ElementAccumulator_ = ElementOutput_,       ///< Accumulator data type
+  typename ElementCompute_ = ElementOutput_,           ///< Data type used to compute linear combination
+  ScaleType::Kind Scale = ScaleType::Default,          ///< Control Alpha and Beta scaling
+  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest  ///< Rounding style, forwarded unchanged
+>
+using LinearCombinationSigmoid = LinearCombinationGeneric<Sigmoid, ElementOutput_, Count, ElementAccumulator_,
+                                                          ElementCompute_, Scale, Round, true>;  // NOTE(review): trailing `true` is presumably the generic functor's "is heavy" flag -- confirm
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace epilogue
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_silu.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_silu.h
new file mode 100755
index 000000000..fa346b068
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_silu.h
@@ -0,0 +1,69 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Functor performing linear combination with SiLU operations used by epilogues.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/epilogue/thread/activation.h"
+#include "cutlass/epilogue/thread/linear_combination_generic.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Applies a linear combination operator followed by the SiLU activation to an array of elements.
+///
+/// D = silu(alpha * accumulator + beta * source + uniform)
+/// Pure alias: all behavior comes from LinearCombinationGeneric instantiated with SiLu.
+template <
+  typename ElementOutput_,                             ///< Data type used to load and store tensors
+  int Count,                                           ///< Number of elements computed per operation
+                                                       ///< Usually it is 128/sizeof_bits<ElementOutput_>,
+                                                       ///< but we use 64 or 32 sometimes when there are not enough data to store
+  typename ElementAccumulator_ = ElementOutput_,       ///< Accumulator data type
+  typename ElementCompute_ = ElementOutput_,           ///< Data type used to compute linear combination
+  ScaleType::Kind Scale = ScaleType::Default,          ///< Control Alpha and Beta scaling
+  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest  ///< Rounding style, forwarded unchanged
+>
+using LinearCombinationSilu = LinearCombinationGeneric<SiLu, ElementOutput_, Count, ElementAccumulator_,
+                                                       ElementCompute_, Scale, Round, true>;  // NOTE(review): trailing `true` is presumably the generic functor's "is heavy" flag -- confirm
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace epilogue
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_tensor_broadcast.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_tensor_broadcast.hpp
new file mode 100755
index 000000000..c3ceea0ab
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_tensor_broadcast.hpp
@@ -0,0 +1,253 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+  \brief Functor performing linear combination operation, bias addition, and tensor-tensor
+  elementwise operations
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/functional.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/epilogue/thread/activation.h"
+#include "cutlass/epilogue/thread/detail.hpp"
+#include "cutlass/epilogue/thread/scale_type.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+namespace detail {
+
+/// Returns whether a source operand is needed for a combination of binary operation and scale
+/// type. Simple specialized checks are made for cases in which 0 is an identity element of
+/// the binary operation. `scale` is the runtime factor applied to the source (callers pass beta_).
+template <class BinaryOp, class ElementCompute, ScaleType::Kind Scale>
+CUTLASS_HOST_DEVICE
+bool is_binary_op_source_needed(ElementCompute scale) {
+  if constexpr (cute::is_same_v<BinaryOp, NoOp<ElementCompute>>) {
+    return false;  // a NoOp binary operation never needs its source
+  }
+  else if constexpr (cute::is_same_v<BinaryOp, plus<ElementCompute>> || cute::is_same_v<BinaryOp, minus<ElementCompute>>) {
+    // Cases for binary operators for which 0 is an identity element
+    if constexpr (Scale == ScaleType::NoBetaScaling) return true;
+    // For the next two scale kinds the source is reported as unneeded regardless of `scale`.
+    if constexpr (Scale == ScaleType::OnlyAlphaScaling) return false;
+
+    if constexpr (Scale == ScaleType::Nothing) return false;
+    // Remaining scale kinds: the source matters only when the runtime scale is non-zero.
+    return scale != ElementCompute(0);
+  }
+  // Any other binary operation: always report the source as needed.
+  return true;
+}
+
+} // namespace detail
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/** Compute a tensor-tensor broadcast epilogue.
+ *
+ * @param ElementOutput_ Data type used to load and store tensors
+ * @param ElementAccumulator_ Accumulator data type
+ * @param ElementCompute_ Data type used to compute linear combination
+ * @param ElementBias_ Data type of Bias elements
+ * @param ActivationFunctor_ Fused Activation
+ * @param BinaryOp0_ Binary operation to perform on O0 and C0. detail::NoOp means no operation
+ * @param BinaryOp1_ Binary operation to perform on O1 and C1. detail::NoOp means no operation
+ * @param UnaryOp_ Unary operation to perform on final result. Identity or detail::NoOp skips it
+ * @param Scale Controls the type of Alpha and Beta scaling to perform
+ * @param Round How values should be rounded in conversions
+ * @param ElementSource_ Data type used for source operands
+ *
+ *  Computes the following:
+ *      O0 = ActivationFunctor(alpha * accumulator + bias)
+ *      O1 = BinaryOp0(O0, beta * C0)
+ *      O2 = BinaryOp1(O1, beta * C1)
+ *      D  = UnaryOp(O2)
+ */
+template <
+  class ElementOutput_,
+  class ElementAccumulator_ = ElementOutput_,
+  class ElementCompute_ = ElementOutput_,
+  class ElementBias_ = ElementCompute_,
+  template <class T> class ActivationFunctor_ = Identity,
+  template <class T> class BinaryOp0_ = plus,
+  template <class T> class BinaryOp1_ = detail::NoOp,
+  template <class T> class UnaryOp_ = Identity,
+  ScaleType::Kind Scale = ScaleType::Default,
+  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest,
+  class ElementSource_ = ElementOutput_
+>
+class LinearCombinationTensorBroadcast {
+public:
+
+  using ElementOutput = ElementOutput_;
+  using ElementAccumulator = ElementAccumulator_;
+  using ElementCompute = ElementCompute_;
+  using ElementScalar = ElementCompute;
+  using ElementBias = ElementBias_;
+  using ElementC = ElementSource_;
+  using ElementD = ElementOutput_;
+  using ElementScalingFactor = ElementAccumulator_;
+  // All functors are instantiated on ElementCompute: the arithmetic happens in the compute type.
+  using UnaryOp = UnaryOp_<ElementCompute>;
+  using BinaryOp0 = BinaryOp0_<ElementCompute>;
+  using BinaryOp1 = BinaryOp1_<ElementCompute>;
+  using ActivationFunctor = ActivationFunctor_<ElementCompute>;
+  // One element per fragment: operator() below works on scalars.
+  static constexpr int kCount = 1;
+  static constexpr ScaleType::Kind kScale = Scale;
+
+  using FragmentOutput = Array<ElementOutput, kCount>;
+  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
+  using ComputeFragment = Array<ElementCompute, kCount>;
+  using FragmentBias = Array<ElementBias, kCount>;
+  // Compile-time switches: a NoOp binary op, or a NoOp/Identity unary op, is dropped from operator().
+  static constexpr FloatRoundStyle kRound = Round;
+  using NoOpType = detail::NoOp<ElementCompute>;
+  static constexpr bool IsBinaryOp0Enabled = !cute::is_same_v<BinaryOp0, NoOpType>;
+  static constexpr bool IsBinaryOp1Enabled = !cute::is_same_v<BinaryOp1, NoOpType>;
+  static constexpr bool IsUnaryOpEnabled = !cute::is_same_v<UnaryOp, NoOpType> && !cute::is_same_v<UnaryOp, Identity<ElementCompute>>;
+
+  /// Host-constructable parameters structure
+  struct Params {
+
+    ElementCompute alpha{};                          ///< scales accumulators
+    ElementCompute beta{};                           ///< scales source tensor
+    ElementCompute const* alpha_ptr = nullptr;       ///< pointer to accumulator scalar - if not null, loads it from memory
+    ElementCompute const* beta_ptr = nullptr;        ///< pointer to source scalar - if not null, loads it from memory
+
+    //
+    // Methods
+    //
+    Params() = default;  // alpha and beta are value-initialized; both pointers are null
+
+    CUTLASS_HOST_DEVICE
+    Params(ElementCompute const* alpha_ptr, ElementCompute const* beta_ptr)
+        : alpha_ptr(alpha_ptr),
+          beta_ptr(beta_ptr) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(ElementCompute const* alpha_ptr)  // beta and beta_ptr keep their default member initializers
+        : alpha_ptr(alpha_ptr) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(ElementCompute alpha,
+           ElementCompute beta)
+        : alpha(alpha),
+          beta(beta) {}
+  };
+
+private:
+  //
+  // Data members
+  //
+
+  ElementCompute alpha_;  ///< accumulator scale, resolved once at construction
+  ElementCompute beta_;   ///< source scale, applied to both source operands
+
+public:
+
+  /// Constructs the function object; a non-null alpha_ptr/beta_ptr is dereferenced and overrides the scalar value
+  CUTLASS_HOST_DEVICE
+  LinearCombinationTensorBroadcast(Params const& params)
+      : alpha_(params.alpha_ptr ? *params.alpha_ptr : params.alpha),
+        beta_(params.beta_ptr ? *params.beta_ptr : params.beta) {}
+
+  /// Returns true if source 0 is needed
+  CUTLASS_HOST_DEVICE
+  bool is_source0_needed() const {
+    return detail::is_binary_op_source_needed<BinaryOp0, ElementCompute, Scale>(beta_);
+  }
+
+  /// Returns true if source 1 is needed
+  CUTLASS_HOST_DEVICE
+  bool is_source1_needed() const {
+    return detail::is_binary_op_source_needed<BinaryOp1, ElementCompute, Scale>(beta_);
+  }
+
+  //
+  // Scalar evaluation of the epilogue: activation, then BinaryOp0, BinaryOp1 and UnaryOp where enabled
+  //
+  CUTLASS_HOST_DEVICE
+  ElementD operator()(ElementAccumulator const accumulator, ElementC const source0, ElementC source1, ElementBias const bias) {
+    // Convert everything to Compute type, do compute, and then store to output type
+    NumericConverter<ElementCompute, ElementAccumulator, Round> accumulator_converter;
+    NumericConverter<ElementCompute, ElementBias, Round> bias_converter;
+    NumericConverter<ElementCompute, ElementC, Round> source_converter;
+    NumericConverter<ElementD, ElementCompute, Round> destination_converter;
+
+    ActivationFunctor act;
+    multiplies<ElementCompute> mul;
+    multiply_add<ElementCompute> madd;
+    // O0: scale the accumulator by alpha_, add the bias, then apply the activation.
+    ElementCompute intermediate = accumulator_converter(accumulator);
+    intermediate = madd(alpha_, intermediate, bias_converter(bias));
+    intermediate = act(intermediate);
+
+    // Apply BinaryOp0, if needed
+    if constexpr (IsBinaryOp0Enabled) {
+      BinaryOp0 bin0;
+      ElementCompute converted_source = source_converter(source0);
+      intermediate = bin0(intermediate, mul(beta_, converted_source));
+    }
+
+    // Apply BinaryOp1, if needed
+    if constexpr (IsBinaryOp1Enabled) {
+      BinaryOp1 bin1;
+      ElementCompute converted_source = source_converter(source1);
+      intermediate = bin1(intermediate, mul(beta_, converted_source));
+    }
+
+    // Apply UnaryOp, if needed
+    if constexpr (IsUnaryOpEnabled) {
+      UnaryOp unary;
+      intermediate = unary(intermediate);
+    }
+    // Convert the compute-type result to the destination element type.
+    return destination_converter(intermediate);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_with_elementwise.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_with_elementwise.h
new file mode 100755
index 000000000..8a2ce5a2a
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_with_elementwise.h
@@ -0,0 +1,234 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  
+  \brief Functor performing linear combination with elementwise
+*/
+
+#pragma once
+
+#include "cutlass/half.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/constants.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/functional.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/epilogue/thread/activation.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Applies a linear combination operator to an array of elements.
+///
+/// D = alpha * accumulator + beta * source
+/// The additional-tensor fragment is accepted by operator() but not consumed by this functor.
+template <
+  typename ElementCompute_,                            ///< Data type returned by this functor
+  typename ElementAccumulator_,                        ///< Data type of accumulators
+  typename ElementSource_,                             ///< Data type of source tensor
+  typename ElementTensor_,                             ///< Data type of additional tensor
+  int Count,                                           ///< Number of elements computed per operation
+                                                       ///< Usually it is 128/sizeof_bits<ElementOutput_>,
+                                                       ///< but we use 64 or 32 sometimes when there are not enough data to store
+  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
+>
+class LinearCombinationWithElementwise {
+public:
+
+  using ElementOutput = ElementSource_;  // note: aliases the source element type
+  using ElementCompute = ElementCompute_;
+  using ElementAccumulator = ElementAccumulator_;
+  using ElementSource = ElementSource_;
+  using ElementTensor = ElementTensor_;
+
+  static bool const kIsHeavy = true;
+
+  static int const kCount = Count;
+
+  using FragmentCompute = Array<ElementCompute, kCount>;
+  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
+  using FragmentSource = Array<ElementSource, kCount>;
+  using FragmentTensor = Array<ElementTensor, kCount>;
+
+  static FloatRoundStyle const kRound = Round;
+
+  /// Host-constructable parameters structure
+  struct Params {
+
+    ElementCompute alpha;                  ///< scales accumulators
+    ElementCompute beta;                   ///< scales source tensor
+    ElementCompute threshold;              ///< threshold; stored by the functor but not read by its operator() overloads
+    ElementCompute const *alpha_ptr;       ///< pointer to accumulator scalar - if not null, loads it from memory
+    ElementCompute const *beta_ptr;        ///< pointer to source scalar - if not null, loads it from memory
+    //
+    // Methods
+    //
+
+    CUTLASS_HOST_DEVICE
+    Params(): 
+      alpha(ElementCompute(1)), 
+      beta(ElementCompute(0)),
+      threshold(ElementCompute(0)), 
+      alpha_ptr(nullptr), 
+      beta_ptr(nullptr) { }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute alpha,
+      ElementCompute beta,
+      ElementCompute threshold = ElementCompute(0)
+    ): alpha(alpha), beta(beta), threshold(threshold), alpha_ptr(nullptr), beta_ptr(nullptr) {
+
+    }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute const *alpha_ptr,
+      ElementCompute const *beta_ptr,
+      ElementCompute threshold = ElementCompute(0)
+    ): alpha(0), beta(0), threshold(threshold), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {
+
+    }
+  };
+
+private:
+
+  //
+  // Data members
+  //
+
+  ElementCompute alpha_;
+  ElementCompute beta_;
+  ElementCompute threshold_;
+  bool participates_in_reduction_;
+
+public:
+
+  /// Constructs the function object; a non-null alpha_ptr/beta_ptr is dereferenced and overrides the scalar value
+  CUTLASS_HOST_DEVICE
+  LinearCombinationWithElementwise(Params const &params) {
+
+    alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
+    beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
+    threshold_ = params.threshold;
+    participates_in_reduction_ = true;  // cleared by set_k_partition() for every partition except the last
+  }
+
+  /// Returns true if source is needed
+  CUTLASS_HOST_DEVICE
+  bool is_source_needed() const {
+    return beta_ != ElementCompute(0);
+  }
+
+  /// Returns true if the threadblock computes the reduction
+  CUTLASS_HOST_DEVICE
+  bool participates_in_reduction() const {
+    return participates_in_reduction_;
+  }
+
+  /// Functionally required for serial reduction in the epilogue
+  CUTLASS_HOST_DEVICE
+  void set_k_partition(int k_partition, int k_partition_count) {
+    if (k_partition) {  // any partition other than index 0
+      beta_ = ElementCompute(1);
+    }
+
+    if (k_partition != k_partition_count - 1) {
+      // set to NaN to make ReLU no-op for all except last k partitions
+      int64_t allones = -1;
+      threshold_ = reinterpret_cast<ElementCompute const &>(allones);  // reads the all-ones bytes as an ElementCompute
+      // Avoid computing the reduction if this isn't the final Split-K slice
+      participates_in_reduction_ = false;
+    }
+  }
+  
+  /// Computes linear scaling: D = alpha * accumulator + beta * source (`tensor` is accepted but unused)
+  CUTLASS_HOST_DEVICE
+  FragmentCompute operator()(
+    FragmentAccumulator const &accumulator, 
+    FragmentSource const &source,
+    FragmentTensor const &tensor) const {
+
+    // Convert source and accumulator to the internal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementSource, kCount, Round> source_converter;
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
+
+    FragmentCompute converted_source = source_converter(source);
+    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
+
+    // Perform binary operations
+    FragmentCompute intermediate;
+
+    multiplies<FragmentCompute> mul_add_source;
+    multiply_add<FragmentCompute> mul_add_accumulator;
+
+    intermediate = mul_add_source(beta_, converted_source);                             // X =  beta * C
+    intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
+
+    return intermediate;
+  }
+
+  /// Computes linear scaling: D = alpha * accumulator (`tensor` is accepted but unused)
+  CUTLASS_HOST_DEVICE
+  FragmentCompute operator()(
+    FragmentAccumulator const &accumulator,
+    FragmentTensor const &tensor) const {
+
+    // Convert accumulator to the internal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
+
+    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
+
+    // Perform binary operations
+    FragmentCompute intermediate;
+
+    multiplies<FragmentCompute> mul_accumulator;
+
+    intermediate = mul_accumulator(alpha_, converted_accumulator);    // D = alpha * Accum
+
+    return intermediate;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/reduction_op.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/reduction_op.h
new file mode 100755
index 000000000..b24d4f953
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/reduction_op.h
@@ -0,0 +1,97 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Functor performing reduction operations used by epilogues.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/functional.h"
+#include "cutlass/numeric_conversion.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Applies a reduction sum to an array of elements.
+///
+/// Combines two fragments of Count elements with plus<Fragment>; the functor takes no runtime parameters.
+template <
+  typename Element_,                             ///< Data type used to load and store tensors
+  int Count                                      ///< Number of elements computed per operation
+>
+class ReductionOpPlus {
+public:
+
+  using Element = Element_;
+  static int const kCount = Count;
+
+  using Fragment = Array<Element, kCount>;
+  using Operator = plus<Fragment>;
+
+  /// Host-constructable parameters structure (empty: nothing to configure)
+  struct Params { };
+
+private:
+
+  /// reduction operator
+  Operator operator_;
+
+public:
+
+  /// Constructs the function object; `params` is accepted but not used
+  CUTLASS_HOST_DEVICE
+  ReductionOpPlus(Params const &params) {
+
+  }
+
+  /// Returns the result of applying plus<Fragment> to lhs and rhs
+  CUTLASS_HOST_DEVICE
+  Fragment operator()(
+    Fragment const &lhs,
+    Fragment const &rhs) const {
+
+    return operator_(lhs, rhs);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace epilogue
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/scale_type.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/scale_type.h
new file mode 100755
index 000000000..d1a466213
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/scale_type.h
@@ -0,0 +1,66 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Enum defines the behaviors of the epilogue.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Selects the alpha/beta scaling formula the epilogue uses to compute D from Acc and C
+/// Notes:
+///  1. Scalar means alpha/beta is a single value from host (constant param) or device memory.
+///  2. Vector means alpha/beta is a vector always from device memory.
+struct ScaleType {
+  enum Kind {
+    Default,                           // D = scalar_alpha x Acc + scalar_beta x C
+    NoBetaScaling,                     // D = scalar_alpha x Acc + C
+    OnlyAlphaScaling,                  // D = scalar_alpha x Acc
+    PerChannelScaling,                 // D = vector_alpha x Acc + vector_beta x C
+    OnlyAlphaPerChannelScaling,        // D = vector_alpha x Acc
+    Nothing                            // D = Acc
+  };
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace epilogue
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h
new file mode 100755
index 000000000..30af039bc
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h
@@ -0,0 +1,255 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue for threadblock scoped complex GEMMs using Tensor Ops.
+
+  The epilogue rearranges the result of a matrix product through shared memory to match canonical
+  tensor layouts in global memory. Epilogues support conversion and reduction operations.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/epilogue/thread/linear_combination_relu.h"
+#include "cutlass/epilogue/thread/linear_combination_gelu.h"
+#include "cutlass/epilogue/thread/linear_combination_sigmoid.h"
+#include "cutlass/epilogue/thread/linear_combination_planar_complex.h"
+
+#include "cutlass/epilogue/thread/conversion_op.h"
+#include "cutlass/epilogue/thread/reduction_op.h"
+
+#include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h"
+
+#include "cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h"
+#include "cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h"
+#include "cutlass/epilogue/warp/tile_iterator_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_thread_map_tensor_op.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
+#include "cutlass/epilogue/threadblock/shared_load_iterator.h"
+
+#include "cutlass/epilogue/threadblock/epilogue.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Primary template: defines sensible defaults for epilogues for the complex*complex case
+//  4 real-valued mma operations (Complex)
+//  A = (ar + j ai), B = (br + j bi), D = AB
+//  D = dr + j di = (ar*br - ai*bi) + j (ar*bi + ai*br)
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <
+  /// Epilogue Shape
+  typename Shape_,
+  /// Warp-level mma operator
+  typename WarpMmaTensorOp_,
+  /// Number of k partitions
+  int PartitionsK,
+  /// Epilogue output operator
+  typename OutputOp_,
+  /// Elements accessed by inner-most loop of AccumulatorFragmentIterator::load()
+  int ElementsPerAccess,
+  /// Multiply-add operator tag (defaults to arch::OpMultiplyAddComplex)
+  /// Selects between (arch::OpMultiplyAddComplex, arch::OpMultiplyAddGaussianComplex)
+  typename Operator_ = arch::OpMultiplyAddComplex
+> 
+struct DefaultEpilogueComplexTensorOp {
+
+  using Shape = Shape_;
+  using WarpMmaTensorOp = WarpMmaTensorOp_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputOp = OutputOp_;
+  static int const kElementsPerAccess = ElementsPerAccess;
+  using Operator = Operator_;
+
+  using ElementOutput = typename OutputOp::ElementOutput;
+  using LayoutC = typename WarpMmaTensorOp::LayoutC;
+  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
+
+  //
+  // Thread map and the iterators built from it
+  //
+
+  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
+    Shape,
+    typename WarpMmaTensorOp::Shape,
+    kPartitionsK,
+    ElementOutput,
+    kElementsPerAccess
+  >::Type;
+
+  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
+    OutputTileThreadMap,
+    ElementOutput
+  >;
+
+  using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorComplexTensorOp<
+    typename WarpMmaTensorOp::Shape,
+    typename WarpMmaTensorOp::Policy::Operator::Shape,
+    typename WarpMmaTensorOp::Policy::Operator::ElementC,
+    typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+    LayoutC
+  >;
+
+  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp<
+    typename WarpMmaTensorOp::Shape,
+    typename WarpMmaTensorOp::Policy::Operator::Shape,
+    ElementAccumulator,
+    LayoutC
+  >;
+
+  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
+    typename OutputTileThreadMap::CompactedThreadMap,
+    ElementAccumulator
+  >;
+
+  /// Padding passed to the epilogue; hard-coded to MatrixShape<0, 0> here
+  using Padding = cutlass::MatrixShape<0, 0>;
+
+  //
+  // Define the epilogue from the component types declared above
+  //
+  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
+    Shape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    OutputTileIterator,
+    AccumulatorFragmentIterator,
+    WarpTileIterator,
+    SharedLoadIterator,
+    OutputOp,
+    Padding
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Partial specialization for arch::OpMultiplyAddGaussianComplex (complex*complex case)
+//  3 real-valued mma operations (Gaussian Complex)
+//  A  = (ar + j ai), B = (br + j bi), D = AB
+//  P1 = (ar + ai) * br, P2 = - ar * (br - bi), P3 = ai * (br + bi)
+//  D  = dr + j di = (P1 - P3) + j (P1 + P2)
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <
+  typename Shape_,
+  typename WarpMmaTensorOp_,
+  int PartitionsK,
+  typename OutputOp_,
+  int ElementsPerAccess
+>
+struct DefaultEpilogueComplexTensorOp <Shape_, WarpMmaTensorOp_, PartitionsK, 
+                                      OutputOp_, ElementsPerAccess, 
+                                      arch::OpMultiplyAddGaussianComplex
+> {
+
+  using Shape = Shape_;
+  using WarpMmaTensorOp = WarpMmaTensorOp_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputOp = OutputOp_;
+  static int const kElementsPerAccess = ElementsPerAccess;
+  using Operator = arch::OpMultiplyAddGaussianComplex;
+
+  using ElementOutput = typename OutputOp::ElementOutput;
+  using LayoutC = typename WarpMmaTensorOp::LayoutC;
+  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
+
+  //
+  // Thread map and the iterators built from it
+  //
+
+  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
+    Shape,
+    typename WarpMmaTensorOp::Shape,
+    kPartitionsK,
+    ElementOutput,
+    kElementsPerAccess
+  >::Type;
+
+  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
+    OutputTileThreadMap,
+    ElementOutput
+  >;
+
+  using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorGaussianComplexTensorOp<
+    typename WarpMmaTensorOp::Shape,
+    typename WarpMmaTensorOp::Policy::Operator::Shape,
+    typename WarpMmaTensorOp::Policy::Operator::ElementC,
+    typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+    LayoutC
+  >;
+
+  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp<
+    typename WarpMmaTensorOp::Shape,
+    typename WarpMmaTensorOp::Policy::Operator::Shape,
+    ElementAccumulator,
+    LayoutC
+  >;
+
+  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
+    typename OutputTileThreadMap::CompactedThreadMap,
+    ElementAccumulator
+  >;
+
+  /// Padding passed to the epilogue; hard-coded to MatrixShape<0, 0> here
+  using Padding = cutlass::MatrixShape<0, 0>;
+
+  //
+  // Define the epilogue from the component types declared above
+  //
+  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
+    Shape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    OutputTileIterator,
+    AccumulatorFragmentIterator,
+    WarpTileIterator,
+    SharedLoadIterator,
+    OutputOp,
+    Padding
+  >;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op_blas3.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op_blas3.h
new file mode 100755
index 000000000..e86e4f92b
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op_blas3.h
@@ -0,0 +1,264 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue for threadblock scoped complex GEMMs using Tensor Ops.
+
+  The epilogue rearranges the result of a matrix product through shared memory to match canonical
+  tensor layouts in global memory. Epilogues support conversion and reduction operations.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/epilogue/thread/linear_combination_relu.h"
+#include "cutlass/epilogue/thread/linear_combination_gelu.h"
+#include "cutlass/epilogue/thread/linear_combination_sigmoid.h"
+#include "cutlass/epilogue/thread/linear_combination_planar_complex.h"
+
+#include "cutlass/epilogue/thread/conversion_op.h"
+#include "cutlass/epilogue/thread/reduction_op.h"
+
+#include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h"
+
+#include "cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h"
+#include "cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h"
+#include "cutlass/epilogue/warp/tile_iterator_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_thread_map_tensor_op.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator_blas3.h"
+#include "cutlass/epilogue/threadblock/shared_load_iterator.h"
+
+#include "cutlass/epilogue/threadblock/epilogue.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Primary template: defines sensible defaults for epilogues for the complex*complex case
+//  4 real-valued mma operations (Complex)
+//  A = (ar + j ai), B = (br + j bi), D = AB
+//  D = dr + j di = (ar*br - ai*bi) + j (ar*bi + ai*br)
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <
+  /// Epilogue Shape
+  typename Shape_,
+  /// Warp-level mma operator
+  typename WarpMmaTensorOp_,
+  /// Number of k partitions
+  int PartitionsK,
+  /// Epilogue output operator
+  typename OutputOp_,
+  /// Elements accessed by inner-most loop of AccumulatorFragmentIterator::load()
+  int ElementsPerAccess,
+  /// Multiply-add operator tag (defaults to arch::OpMultiplyAddComplex)
+  /// Selects between (arch::OpMultiplyAddComplex, arch::OpMultiplyAddGaussianComplex)
+  typename Operator_ = arch::OpMultiplyAddComplex,
+  /// BLAS mode forwarded to the output tile iterator (defaults to BlasMode::kGemm)
+  BlasMode BlasMode_ = BlasMode::kGemm
+> 
+struct DefaultEpilogueComplexTensorOpBlas3 {
+
+  using Shape = Shape_;
+  using WarpMmaTensorOp = WarpMmaTensorOp_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputOp = OutputOp_;
+  static int const kElementsPerAccess = ElementsPerAccess;
+  using Operator = Operator_;
+  static BlasMode const kBlasMode = BlasMode_;
+
+  using ElementOutput = typename OutputOp::ElementOutput;
+  using LayoutC = typename WarpMmaTensorOp::LayoutC;
+  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
+
+  //
+  // Thread map and the iterators built from it
+  //
+
+  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
+    Shape,
+    typename WarpMmaTensorOp::Shape,
+    kPartitionsK,
+    ElementOutput,
+    kElementsPerAccess
+  >::Type;
+
+  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorBlas3<
+    OutputTileThreadMap,
+    ElementOutput
+    , kBlasMode
+  >;
+
+  using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorComplexTensorOp<
+    typename WarpMmaTensorOp::Shape,
+    typename WarpMmaTensorOp::Policy::Operator::Shape,
+    typename WarpMmaTensorOp::Policy::Operator::ElementC,
+    typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+    LayoutC
+  >;
+
+  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp<
+    typename WarpMmaTensorOp::Shape,
+    typename WarpMmaTensorOp::Policy::Operator::Shape,
+    ElementAccumulator,
+    LayoutC
+  >;
+
+  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
+    typename OutputTileThreadMap::CompactedThreadMap,
+    ElementAccumulator
+  >;
+
+  /// Padding passed to the epilogue; hard-coded to MatrixShape<0, 0> here
+  using Padding = cutlass::MatrixShape<0, 0>;
+
+  //
+  // Define the epilogue from the component types declared above
+  //
+  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
+    Shape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    OutputTileIterator,
+    AccumulatorFragmentIterator,
+    WarpTileIterator,
+    SharedLoadIterator,
+    OutputOp,
+    Padding
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Partial specialization for arch::OpMultiplyAddGaussianComplex (complex*complex case)
+//  3 real-valued mma operations (Gaussian Complex)
+//  A  = (ar + j ai), B = (br + j bi), D = AB
+//  P1 = (ar + ai) * br, P2 = - ar * (br - bi), P3 = ai * (br + bi)
+//  D  = dr + j di = (P1 - P3) + j (P1 + P2)
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <
+  typename Shape_,
+  typename WarpMmaTensorOp_,
+  int PartitionsK,
+  typename OutputOp_,
+  int ElementsPerAccess, 
+  BlasMode BlasMode_
+>
+struct DefaultEpilogueComplexTensorOpBlas3 <Shape_, WarpMmaTensorOp_, PartitionsK, 
+                                      OutputOp_, ElementsPerAccess, 
+                                      arch::OpMultiplyAddGaussianComplex
+                                      , BlasMode_
+> {
+
+  using Shape = Shape_;
+  using WarpMmaTensorOp = WarpMmaTensorOp_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputOp = OutputOp_;
+  static int const kElementsPerAccess = ElementsPerAccess;
+  using Operator = arch::OpMultiplyAddGaussianComplex;
+  static BlasMode const kBlasMode = BlasMode_;
+
+  using ElementOutput = typename OutputOp::ElementOutput;
+  using LayoutC = typename WarpMmaTensorOp::LayoutC;
+  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
+
+  //
+  // Thread map and the iterators built from it
+  //
+
+  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
+    Shape,
+    typename WarpMmaTensorOp::Shape,
+    kPartitionsK,
+    ElementOutput,
+    kElementsPerAccess
+  >::Type;
+
+  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorBlas3<
+    OutputTileThreadMap,
+    ElementOutput,
+    kBlasMode
+  >;
+
+  using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorGaussianComplexTensorOp<
+    typename WarpMmaTensorOp::Shape,
+    typename WarpMmaTensorOp::Policy::Operator::Shape,
+    typename WarpMmaTensorOp::Policy::Operator::ElementC,
+    typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+    LayoutC
+  >;
+
+  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp<
+    typename WarpMmaTensorOp::Shape,
+    typename WarpMmaTensorOp::Policy::Operator::Shape,
+    ElementAccumulator,
+    LayoutC
+  >;
+
+  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
+    typename OutputTileThreadMap::CompactedThreadMap,
+    ElementAccumulator
+  >;
+
+  /// Padding passed to the epilogue; hard-coded to MatrixShape<0, 0> here
+  using Padding = cutlass::MatrixShape<0, 0>;
+
+  //
+  // Define the epilogue from the component types declared above
+  //
+  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
+    Shape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    OutputTileIterator,
+    AccumulatorFragmentIterator,
+    WarpTileIterator,
+    SharedLoadIterator,
+    OutputOp,
+    Padding
+  >;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_direct_store.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_direct_store.h
new file mode 100755
index 000000000..8770f6196
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_direct_store.h
@@ -0,0 +1,74 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Direct store epilogue
+*/
+
+#pragma once
+
+////////////////////////////////////////////////////////////////////////////////
+
+#include "cutlass/epilogue/threadblock/epilogue_direct_store.h"
+#include "cutlass/epilogue/threadblock/direct_store_epilogue_iterator.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Given an epilogue type, derives an EpilogueDirectStore that reuses all of its components except the output iterator
+template <typename EpilogueTensorOp>
+struct DefaultEpilogueDirectStore {
+
+  using OutputTileIterator = DirectStoreEpilogueIterator<typename EpilogueTensorOp::OutputTileIterator::Element>;  // keeps only the element type of the source iterator
+
+  using Epilogue = EpilogueDirectStore<
+    typename EpilogueTensorOp::Shape,
+    typename EpilogueTensorOp::WarpMmaOperator,
+    EpilogueTensorOp::kPartitionsK,
+    OutputTileIterator,
+    typename EpilogueTensorOp::AccumulatorFragmentIterator,
+    typename EpilogueTensorOp::WarpTileIterator,
+    typename EpilogueTensorOp::SharedLoadIterator,
+    typename EpilogueTensorOp::OutputOp
+  >;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_planar_complex.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_planar_complex.h
new file mode 100755
index 000000000..e38e0ff6d
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_planar_complex.h
@@ -0,0 +1,241 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Constructs a default epilogue for planar complex outputs.
+
+  This template reuses components for real-valued epilogues and applies them to planar complex
+  output matrices.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/array_planar_complex.h"
+
+#include "cutlass/arch/arch.h"
+
+#include "cutlass/epilogue/thread/linear_combination_planar_complex.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
+
+#include "cutlass/epilogue/threadblock/epilogue_planar_complex.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Primary template (declaration only); partial specializations select on OpcodeClass_ and ArchTag_.
+template <
+  typename ThreadblockShape_,
+  typename WarpMma_,
+  typename OpcodeClass_,
+  typename ArchTag_,
+  int PartitionsK,
+  typename OutputOp_,
+  int ElementsPerAccess
+>
+struct DefaultEpiloguePlanarComplex;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Planar complex epilogue for arch::OpClassTensorOp on arch::Sm70; reuses the DefaultEpilogueVoltaTensorOp components.
+template <
+  typename ThreadblockShape_,
+  typename WarpMmaOperator_,
+  int PartitionsK,
+  typename OutputOp_,
+  int ElementsPerAccess
+>
+struct DefaultEpiloguePlanarComplex<
+  ThreadblockShape_, 
+  WarpMmaOperator_, 
+  arch::OpClassTensorOp, 
+  arch::Sm70,
+  PartitionsK, 
+  OutputOp_, 
+  ElementsPerAccess> {
+
+  using RealEpilogue = DefaultEpilogueVoltaTensorOp<  // source of the iterator and padding types used below
+    ThreadblockShape_,
+    WarpMmaOperator_,
+    PartitionsK,
+    OutputOp_,
+    ElementsPerAccess
+  >;
+
+  using Epilogue = EpiloguePlanarComplex<
+    ThreadblockShape_,
+    WarpMmaOperator_,
+    PartitionsK,
+    typename RealEpilogue::OutputTileIterator,
+    typename RealEpilogue::AccumulatorFragmentIterator,
+    typename RealEpilogue::WarpTileIterator,
+    typename RealEpilogue::SharedLoadIterator,
+    OutputOp_,
+    typename RealEpilogue::Padding
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Planar complex epilogue for arch::OpClassTensorOp on arch::Sm75; reuses the DefaultEpilogueTensorOp components.
+template <
+  typename ThreadblockShape_,
+  typename WarpMmaOperator_,
+  int PartitionsK,
+  typename OutputOp_,
+  int ElementsPerAccess
+>
+struct DefaultEpiloguePlanarComplex<
+  ThreadblockShape_, 
+  WarpMmaOperator_, 
+  arch::OpClassTensorOp, 
+  arch::Sm75,
+  PartitionsK, 
+  OutputOp_, 
+  ElementsPerAccess> {
+
+  using RealEpilogue = DefaultEpilogueTensorOp<  // source of the iterator and padding types used below
+    ThreadblockShape_,
+    WarpMmaOperator_,
+    PartitionsK,
+    OutputOp_,
+    ElementsPerAccess
+  >;
+
+  using Epilogue = EpiloguePlanarComplex<
+    ThreadblockShape_,
+    WarpMmaOperator_,
+    PartitionsK,
+    typename RealEpilogue::OutputTileIterator,
+    typename RealEpilogue::AccumulatorFragmentIterator,
+    typename RealEpilogue::WarpTileIterator,
+    typename RealEpilogue::SharedLoadIterator,
+    OutputOp_,
+    typename RealEpilogue::Padding
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Planar complex epilogue for arch::OpClassTensorOp on arch::Sm80; reuses the DefaultEpilogueTensorOp components.
+template <
+  typename ThreadblockShape_,
+  typename WarpMmaOperator_,
+  int PartitionsK,
+  typename OutputOp_,
+  int ElementsPerAccess
+>
+struct DefaultEpiloguePlanarComplex<
+  ThreadblockShape_, 
+  WarpMmaOperator_, 
+  arch::OpClassTensorOp, 
+  arch::Sm80,
+  PartitionsK, 
+  OutputOp_, 
+  ElementsPerAccess> {
+
+  using RealEpilogue = DefaultEpilogueTensorOp<  // source of the iterator and padding types used below
+    ThreadblockShape_,
+    WarpMmaOperator_,
+    PartitionsK,
+    OutputOp_,
+    ElementsPerAccess
+  >;
+
+  using Epilogue = EpiloguePlanarComplex<
+    ThreadblockShape_,
+    WarpMmaOperator_,
+    PartitionsK,
+    typename RealEpilogue::OutputTileIterator,
+    typename RealEpilogue::AccumulatorFragmentIterator,
+    typename RealEpilogue::WarpTileIterator,
+    typename RealEpilogue::SharedLoadIterator,
+    OutputOp_,
+    typename RealEpilogue::Padding
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Planar complex epilogue for arch::OpClassSimt on any ArchTag_; reuses the DefaultEpilogueSimt components.
+template <
+  typename ThreadblockShape_,
+  typename WarpMmaOperator_,
+  typename ArchTag_,
+  int PartitionsK,
+  typename OutputOp_,
+  int ElementsPerAccess
+>
+struct DefaultEpiloguePlanarComplex<
+  ThreadblockShape_, 
+  WarpMmaOperator_, 
+  arch::OpClassSimt, 
+  ArchTag_,
+  PartitionsK, 
+  OutputOp_, 
+  ElementsPerAccess> {
+
+  using RealEpilogue = DefaultEpilogueSimt<  // note: PartitionsK is not passed to DefaultEpilogueSimt
+    ThreadblockShape_,
+    WarpMmaOperator_,
+    OutputOp_,
+    ElementsPerAccess
+  >;
+
+  using Epilogue = EpiloguePlanarComplex<
+    ThreadblockShape_,
+    WarpMmaOperator_,
+    PartitionsK,
+    typename RealEpilogue::OutputTileIterator,
+    typename RealEpilogue::AccumulatorFragmentIterator,
+    typename RealEpilogue::WarpTileIterator,
+    typename RealEpilogue::SharedLoadIterator,
+    OutputOp_,
+    typename RealEpilogue::Padding
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_simt.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_simt.h
new file mode 100755
index 000000000..f3119fa40
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_simt.h
@@ -0,0 +1,443 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue for threadblock scoped GEMMs using SIMT.
+
+  The epilogue rearranges the result of a matrix product through shared memory to match canonical
+  tensor layouts in global memory. Epilogues support conversion and reduction operations.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+
+#include "cutlass/arch/mma.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/warp/mma.h"
+
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/epilogue/thread/linear_combination_clamp.h"
+#include "cutlass/epilogue/thread/linear_combination_relu.h"
+#include "cutlass/epilogue/thread/linear_combination_gelu.h"
+#include "cutlass/epilogue/thread/linear_combination_sigmoid.h"
+#include "cutlass/epilogue/thread/linear_combination_planar_complex.h"
+#include "cutlass/epilogue/thread/conversion_op.h"
+#include "cutlass/epilogue/thread/reduction_op.h"
+
+#include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h"
+
+#include "cutlass/epilogue/warp/fragment_iterator_simt.h"
+#include "cutlass/epilogue/warp/tile_iterator_simt.h"
+#include "cutlass/epilogue/threadblock/default_thread_map_simt.h"
+#include "cutlass/transform/pitch_linear_thread_map.h"
+
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator_conv.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator_strided_dgrad.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator_affine.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator_direct_conv.h" 
+#include "cutlass/epilogue/threadblock/shared_load_iterator.h"
+#include "cutlass/epilogue/threadblock/shared_load_iterator_pitch_linear.h"
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/threadblock/epilogue_depthwise.h"
+
+#include "cutlass/layout/permute.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Selects the default thread map, tile iterators, and Epilogue type for SIMT threadblock epilogues.
+template <
+  typename Shape_,
+  typename WarpMmaSimt_,
+  typename OutputOp_,
+  int ElementsPerAccess,
+  bool ScatterD = false,
+  typename PermuteDLayout = layout::NoPermute,
+  conv::StrideSupport StrideSupport = conv::StrideSupport::kUnity,
+  int Rank = 4
+>
+struct DefaultEpilogueSimt {
+
+  using Shape = Shape_;
+  using WarpMmaSimt = WarpMmaSimt_;
+  using OutputOp = OutputOp_;
+  static int const kElementsPerAccess = ElementsPerAccess;
+  static const int kPartitionsK = Shape::kK / WarpMmaSimt::Shape::kK;  // threadblock K extent over warp K extent
+
+  using ElementOutput = typename OutputOp::ElementOutput;
+  using LayoutC = typename WarpMmaSimt::LayoutC;
+  using ElementAccumulator = typename WarpMmaSimt::ElementC;
+  static conv::StrideSupport const kStrideSupport = StrideSupport;
+  static int const kRank = Rank;
+
+  //
+  // Thread map
+  //
+
+  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapSimt<
+    Shape,
+    typename WarpMmaSimt::Shape,
+    typename WarpMmaSimt::Policy,
+    kPartitionsK,
+    ElementOutput,
+    kElementsPerAccess
+  >::Type;
+
+  static bool const UseCUDAStore = platform::is_same<ElementOutput, double>::value;  // true only for double output
+
+  using PackedOutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
+    OutputTileThreadMap,
+    ElementOutput,
+    ScatterD,
+    PermuteDLayout,
+    UseCUDAStore
+  >;
+
+  using StridedOutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorConv<
+    OutputTileThreadMap,
+    ElementOutput,
+    ScatterD,
+    PermuteDLayout,
+    UseCUDAStore,
+    kRank
+  >;
+
+  using OutputTileIterator = typename platform::conditional<StrideSupport == cutlass::conv::StrideSupport::kUnity,
+                                                            PackedOutputTileIterator,
+                                                            StridedOutputTileIterator>::type;  // packed for kUnity, conv iterator otherwise
+
+  using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorSimt<
+    typename WarpMmaSimt::Shape,
+    typename WarpMmaSimt::ThreadMma,
+    layout::RowMajor,
+    typename WarpMmaSimt::Policy
+  >;
+
+  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorSimt<
+    typename WarpMmaSimt::Shape,
+    typename WarpMmaSimt::ThreadMma,
+    ElementAccumulator,
+    layout::RowMajor,
+    typename WarpMmaSimt::Policy
+  >;
+
+  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
+    typename OutputTileThreadMap::CompactedThreadMap,
+    ElementAccumulator
+  >;
+
+  /// Padding is taken from the warp tile iterator rather than specified here.
+  using Padding = typename WarpTileIterator::Padding;
+
+  //
+  // Define the epilogue
+  //
+  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
+    Shape,
+    WarpMmaSimt,
+    kPartitionsK,
+    OutputTileIterator,
+    AccumulatorFragmentIterator,
+    WarpTileIterator,
+    SharedLoadIterator,
+    OutputOp,
+    Padding
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// SIMT epilogue defaults whose output tile iterator is PredicatedTileIteratorStridedDgrad.
+template <
+  typename Shape_,
+  typename WarpMmaSimt_,
+  typename OutputOp_,
+  int ElementsPerAccess
+>
+struct DefaultEpilogueSimtStridedDgrad {
+
+  using Shape = Shape_;
+  using WarpMmaSimt = WarpMmaSimt_;
+  using OutputOp = OutputOp_;
+  static int const kElementsPerAccess = ElementsPerAccess;
+  static const int kPartitionsK = Shape::kK / WarpMmaSimt::Shape::kK;  // threadblock K extent over warp K extent
+
+  using ElementOutput = typename OutputOp::ElementOutput;
+  using LayoutC = typename WarpMmaSimt::LayoutC;
+  using ElementAccumulator = typename WarpMmaSimt::ElementC;
+
+  //
+  // Thread map
+  //
+
+  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapSimt<
+    Shape,
+    typename WarpMmaSimt::Shape,
+    typename WarpMmaSimt::Policy,
+    kPartitionsK,
+    ElementOutput,
+    kElementsPerAccess
+  >::Type;
+
+  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorStridedDgrad<
+    OutputTileThreadMap,
+    ElementOutput
+  >;
+
+  using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorSimt<
+    typename WarpMmaSimt::Shape,
+    typename WarpMmaSimt::ThreadMma,
+    layout::RowMajor,
+    typename WarpMmaSimt::Policy
+  >;
+
+  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorSimt<
+    typename WarpMmaSimt::Shape,
+    typename WarpMmaSimt::ThreadMma,
+    ElementAccumulator,
+    layout::RowMajor,
+    typename WarpMmaSimt::Policy
+  >;
+
+  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
+    typename OutputTileThreadMap::CompactedThreadMap,
+    ElementAccumulator
+  >;
+
+  /// Padding is taken from the warp tile iterator rather than specified here.
+  using Padding = typename WarpTileIterator::Padding;
+
+  //
+  // Define the epilogue
+  //
+  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
+    Shape,
+    WarpMmaSimt,
+    kPartitionsK,
+    OutputTileIterator,
+    AccumulatorFragmentIterator,
+    WarpTileIterator,
+    SharedLoadIterator,
+    OutputOp,
+    Padding
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// SIMT epilogue defaults whose output tile iterator is PredicatedTileIteratorAffineRankN of the given Rank.
+template <
+  int Rank,
+  typename Shape_,
+  typename WarpMmaSimt_,
+  typename OutputOp_,
+  int ElementsPerAccess
+>
+struct DefaultEpilogueSimtAffineRankN {
+
+  using Shape = Shape_;
+  using WarpMmaSimt = WarpMmaSimt_;
+  using OutputOp = OutputOp_;
+  static int const kElementsPerAccess = ElementsPerAccess;
+  static const int kPartitionsK = Shape::kK / WarpMmaSimt::Shape::kK;  // threadblock K extent over warp K extent
+
+  using ElementOutput = typename OutputOp::ElementOutput;
+  using LayoutC = typename WarpMmaSimt::LayoutC;
+  using ElementAccumulator = typename WarpMmaSimt::ElementC;
+
+  //
+  // Thread map
+  //
+
+  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapSimt<
+    Shape,
+    typename WarpMmaSimt::Shape,
+    typename WarpMmaSimt::Policy,
+    kPartitionsK,
+    ElementOutput,
+    kElementsPerAccess
+  >::Type;
+
+  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorAffineRankN<
+    OutputTileThreadMap,
+    ElementOutput,
+    Rank
+  >;
+
+  using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorSimt<
+    typename WarpMmaSimt::Shape,
+    typename WarpMmaSimt::ThreadMma,
+    layout::RowMajor,
+    typename WarpMmaSimt::Policy
+  >;
+
+  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorSimt<
+    typename WarpMmaSimt::Shape,
+    typename WarpMmaSimt::ThreadMma,
+    ElementAccumulator,
+    layout::RowMajor,
+    typename WarpMmaSimt::Policy
+  >;
+
+  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
+    typename OutputTileThreadMap::CompactedThreadMap,
+    ElementAccumulator
+  >;
+
+  /// Padding is taken from the warp tile iterator rather than specified here.
+  using Padding = typename WarpTileIterator::Padding;
+
+  //
+  // Define the epilogue
+  //
+  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
+    Shape,
+    WarpMmaSimt,
+    kPartitionsK,
+    OutputTileIterator,
+    AccumulatorFragmentIterator,
+    WarpTileIterator,
+    SharedLoadIterator,
+    OutputOp,
+    Padding
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// SIMT epilogue defaults for direct convolution; assembles an EpilogueDepthwise from pitch-linear iterators.
+template <typename Shape_,        // ThreadBlock Shape
+          typename WarpMmaSimt_,  // mma_depthwise_simt
+          typename OutputOp_,
+          int ElementsPerAccess_,
+          typename ThreadOutputShape_ = cutlass::conv::TensorNHWCShape<1, 1, 1, 1>,
+          typename ThreadBlockOutputShape_ = cutlass::conv::TensorNHWCShape<1, 1, 1, 1> >
+struct DefaultDirectConvEpilogueSimt {
+  using Shape = Shape_;
+  using WarpMmaSimt = WarpMmaSimt_;
+  using WarpShape = typename WarpMmaSimt::Shape;
+  using OutputOp = OutputOp_;
+  using ThreadOutputShape = ThreadOutputShape_;
+  using ThreadBlockOutputShape = ThreadBlockOutputShape_;
+  static int const kElementsPerAccess = ElementsPerAccess_;
+
+
+  using ElementOutput = typename OutputOp::ElementOutput;
+  using LayoutC = typename WarpMmaSimt::LayoutC;
+  using ElementAccumulator = typename WarpMmaSimt::ElementC;
+
+  /// Number of warps along M and N (threadblock extent divided by warp extent)
+  using WarpCount = gemm::GemmShape<
+    Shape::kM / WarpShape::kM,
+    Shape::kN / WarpShape::kN
+  >;
+
+  static int const kWarpSize = cutlass::gemm::warp::WarpSize<arch::OpClassSimt>::value;
+
+  static int const kThreads = WarpCount::kCount * kWarpSize;  // total threads: warp count times warp size
+
+  //
+  // Thread map
+  //
+  
+  using OutputTileThreadMap = cutlass::transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<ThreadBlockOutputShape::kC, ThreadBlockOutputShape::kNHW>,
+    kThreads,
+    kElementsPerAccess
+  >;
+
+
+  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorDirectConv<
+    OutputTileThreadMap,
+    ElementOutput,
+    ThreadOutputShape,
+    ThreadBlockOutputShape 
+  >;
+
+  using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorSimt<
+    typename WarpMmaSimt::Shape,
+    typename WarpMmaSimt::ThreadMma,
+    layout::RowMajor,
+    typename WarpMmaSimt::Policy
+  >;
+  
+  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorSimtDirect2dConv<
+    typename WarpMmaSimt::Shape,
+    ThreadOutputShape,
+    ThreadBlockOutputShape,
+    typename WarpMmaSimt::ThreadMma,
+    ElementAccumulator,
+    layout::RowMajor,
+    typename WarpMmaSimt::Policy
+  >;
+
+  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIteratorPitchLinear<
+    OutputTileThreadMap,
+    ElementAccumulator
+  >;
+
+  /// Padding is taken from the warp tile iterator rather than specified here.
+  using Padding = typename WarpTileIterator::Padding;
+  //
+  // Define the epilogue
+  //
+  using Epilogue = cutlass::epilogue::threadblock::EpilogueDepthwise<
+    Shape,
+    ThreadOutputShape,
+    ThreadBlockOutputShape,
+    WarpMmaSimt,
+    OutputTileIterator,
+    AccumulatorFragmentIterator,
+    WarpTileIterator,
+    SharedLoadIterator,
+    OutputOp,
+    Padding
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h
new file mode 100755
index 000000000..1d62f4fc3
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h
@@ -0,0 +1,904 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
+
+  The epilogue rearranges the result of a matrix product through shared memory to match canonical
+  tensor layouts in global memory. Epilogues support conversion and reduction operations.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+
+#include "cutlass/platform/platform.h"
+
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/epilogue/thread/linear_combination_clamp.h"
+#include "cutlass/epilogue/thread/linear_combination_relu.h"
+#include "cutlass/epilogue/thread/linear_combination_relu0.h"
+#include "cutlass/epilogue/thread/linear_combination_gelu.h"
+#include "cutlass/epilogue/thread/linear_combination_sigmoid.h"
+#include "cutlass/epilogue/thread/linear_combination_hardswish.h"
+#include "cutlass/epilogue/thread/linear_combination_planar_complex.h"
+
+#include "cutlass/epilogue/thread/conversion_op.h"
+#include "cutlass/epilogue/thread/reduction_op.h"
+
+#include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h"
+
+#include "cutlass/epilogue/warp/fragment_iterator_tensor_op.h"
+#include "cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h"
+#include "cutlass/epilogue/warp/tile_iterator_tensor_op.h"
+#include "cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h"
+#include "cutlass/epilogue/threadblock/default_thread_map_tensor_op.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator_conv.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator_strided_dgrad.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator_affine.h"
+#include "cutlass/epilogue/threadblock/shared_load_iterator.h"
+#include "cutlass/epilogue/threadblock/shared_load_iterator_mixed.h"
+
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/threadblock/interleaved_epilogue.h"
+
+#include "cutlass/layout/permute.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+template <
+  typename ElementOutput,
+  typename ElementAccumulator,
+  int ElementsPerAccess,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename ThreadMap
+>
+struct DefaultIteratorsTensorOp {
+  // Primary template: row-major warp tile iterator and plain shared load iterator on the accumulator type.
+  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp<
+    WarpShape,
+    InstructionShape,
+    ElementAccumulator,
+    layout::RowMajor
+  >;
+
+  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
+    ThreadMap,
+    ElementAccumulator
+  >;
+
+  static int const kFragmentsPerIteration = 1;
+};
+
+/// Partial specialization for float <= float x 4; same iterator kinds as the primary template, two fragments per iteration.
+template <
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename ThreadMap
+>
+struct DefaultIteratorsTensorOp<float, float, 4, ThreadblockShape, WarpShape, InstructionShape, ThreadMap> {
+  // Matches only ElementOutput = float, ElementAccumulator = float, ElementsPerAccess = 4.
+  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp<
+    WarpShape,
+    InstructionShape,
+    float,
+    layout::RowMajor
+  >;
+
+  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
+    ThreadMap,
+    float
+  >;
+
+  static int const kFragmentsPerIteration = 2;
+};
+
+/// Partial specialization for int32_t <= int32_t with any ElementsPerAccess; uses the non-mixed iterators.
+template <
+  int ElementsPerAccess,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename ThreadMap
+>
+struct DefaultIteratorsTensorOp<int32_t, int32_t, ElementsPerAccess, ThreadblockShape, WarpShape, InstructionShape, ThreadMap> {
+  // Both iterators operate on int32_t elements.
+  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp<
+    WarpShape,
+    InstructionShape,
+    int32_t,
+    layout::RowMajor
+  >;
+
+  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
+    ThreadMap,
+    int32_t
+  >;
+
+  static int const kFragmentsPerIteration = 1;
+};
+
+/// Partial specialization for float <= int32_t with any ElementsPerAccess; iterators stay on the int32_t accumulator type.
+template <
+  int ElementsPerAccess,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename ThreadMap
+>
+struct DefaultIteratorsTensorOp<float, int32_t, ElementsPerAccess, ThreadblockShape, WarpShape, InstructionShape, ThreadMap> {
+
+  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp<
+    WarpShape,
+    InstructionShape,
+    int32_t,
+    layout::RowMajor
+  >;
+
+  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
+    ThreadMap,
+    int32_t
+  >;
+
+  static int const kFragmentsPerIteration = 1;
+};
+
+/// Partial specialization for half <= float x 8 epilogues avoids shared memory bank conflicts.
+template <
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename ThreadMap
+>
+struct DefaultIteratorsTensorOp<
+  half_t, 
+  float, 
+  8, 
+  ThreadblockShape, 
+  WarpShape, 
+  InstructionShape, 
+  ThreadMap> {
+  // NOTE(review): 32, 16, 8, 8 look like accumulator bits, output bits, elements per access, then another size - confirm.
+  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOpMixed<
+    WarpShape,
+    InstructionShape,
+    float,
+    32,
+    16,
+    8,
+    8
+  >;
+
+  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIteratorMixed<
+    ThreadMap,
+    float,
+    32,
+    16,
+    8,
+    8
+  >;
+
+  static int const kFragmentsPerIteration = 2;
+};
+
+/// Partial specialization for bfloat16 <= int32_t x 8 epilogues avoids shared memory bank conflicts.
+template <
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename ThreadMap
+>
+struct DefaultIteratorsTensorOp<
+  bfloat16_t,
+  int32_t,
+  8,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  ThreadMap> {
+
+  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOpMixed<
+    WarpShape,
+    InstructionShape,
+    int32_t,
+    32,
+    16,
+    8,
+    8
+  >;
+
+  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIteratorMixed<
+    ThreadMap,
+    int32_t,
+    32,
+    16,
+    8,
+    8
+  >;
+
+  static int const kFragmentsPerIteration = 2;
+};
+
+/// Partial specialization for half <= int32_t x 8 epilogues avoids shared memory bank conflicts.
+template <
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename ThreadMap
+>
+struct DefaultIteratorsTensorOp<
+  half_t, 
+  int32_t, 
+  8, 
+  ThreadblockShape, 
+  WarpShape, 
+  InstructionShape, 
+  ThreadMap> {
+  // Uses the same mixed iterator arguments as the bfloat16_t <= int32_t x 8 specialization.
+  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOpMixed<
+    WarpShape,
+    InstructionShape,
+    int32_t,
+    32,
+    16,
+    8,
+    8
+  >;
+
+  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIteratorMixed<
+    ThreadMap,
+    int32_t,
+    32,
+    16,
+    8,
+    8
+  >;
+
+  static int const kFragmentsPerIteration = 2;
+};
+
+/// Partial specialization for int8/int4b_t <= int32 x 16/8/4 epilogues; the mixed iterators avoid shared memory bank conflicts.
+/// Threadblock::kN = 256 still has bank conflicts.
+template <
+  typename ElementOutput,
+  int ElementsPerAccess,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename ThreadMap
+>
+struct DefaultIteratorsTensorOp<
+  ElementOutput, 
+  int32_t, 
+  ElementsPerAccess,
+  ThreadblockShape, 
+  WarpShape, 
+  InstructionShape, 
+  ThreadMap> {
+
+  static_assert(platform::is_same<ElementOutput, cutlass::int4b_t>::value ||
+                platform::is_same<ElementOutput, cutlass::uint4b_t>::value ||
+                platform::is_same<ElementOutput, int8_t>::value ||
+                platform::is_same<ElementOutput, uint8_t>::value,
+                "ElementOutput needs to be 4 or 8 bit (unsigned) int.");
+
+  static_assert((ElementsPerAccess == 16 || ElementsPerAccess == 8 || ElementsPerAccess == 4),
+                "ElementsPerAccess needs to be 16, 8, or 4.");
+  // Two candidate warp tile iterators are declared; the conditional below picks one.
+  using WarpTileIteratorMixed = cutlass::epilogue::warp::TileIteratorTensorOpMixed<
+    WarpShape,
+    InstructionShape,
+    int32_t,
+    32,
+    cutlass::sizeof_bits<ElementOutput>::value,
+    ElementsPerAccess,
+    8
+  >;
+
+  using WarpTileIteratorNotMixed =  cutlass::epilogue::warp::TileIteratorTensorOp<
+    WarpShape,
+    InstructionShape,
+    int32_t,
+    layout::RowMajor
+  >;
+
+  using WarpTileIterator = typename platform::conditional<  // non-mixed for kN == 256, kN == 128 with x8, or x4 accesses
+                             (ThreadblockShape::kN == 256) || (ThreadblockShape::kN == 128 && ElementsPerAccess == 8) || (ElementsPerAccess == 4),
+                             WarpTileIteratorNotMixed,
+                             WarpTileIteratorMixed>::type;
+
+  using SharedLoadIteratorMixed = cutlass::epilogue::threadblock::SharedLoadIteratorMixed<
+    ThreadMap,
+    int32_t,
+    32,
+    cutlass::sizeof_bits<ElementOutput>::value,
+    ElementsPerAccess,
+    8
+  >;
+
+  using SharedLoadIteratorNotMixed = cutlass::epilogue::threadblock::SharedLoadIterator<
+    ThreadMap,
+    int32_t
+  >;
+
+  using SharedLoadIterator = typename platform::conditional<  // same predicate as WarpTileIterator so the pair stays matched
+                             (ThreadblockShape::kN == 256) || (ThreadblockShape::kN == 128 && ElementsPerAccess == 8) || (ElementsPerAccess == 4),
+                             SharedLoadIteratorNotMixed,
+                             SharedLoadIteratorMixed>::type;
+
+  static int const kFragmentsPerIteration = 1;
+};
+
+/// Partial specialization for float_e4m3_t <= float x 16/8/4 epilogues; the mixed iterators avoid shared memory bank conflicts.
+/// Threadblock::kN = 256 still has bank conflicts.
+template <
+  int ElementsPerAccess,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename ThreadMap
+>
+struct DefaultIteratorsTensorOp<
+  cutlass::float_e4m3_t,
+  float, 
+  ElementsPerAccess,
+  ThreadblockShape, 
+  WarpShape, 
+  InstructionShape, 
+  ThreadMap> {
+
+  using ElementOutput = cutlass::float_e4m3_t;
+
+  static_assert((ElementsPerAccess == 16 || ElementsPerAccess == 8 || ElementsPerAccess == 4),
+              "ElementsPerAccess needs to be 16, 8, or 4.");
+  // Two candidate warp tile iterators are declared; the conditional below picks one.
+  using WarpTileIteratorMixed = cutlass::epilogue::warp::TileIteratorTensorOpMixed<
+    WarpShape,
+    InstructionShape,
+    float,
+    32,
+    cutlass::sizeof_bits<ElementOutput>::value,
+    ElementsPerAccess,
+    8
+  >;
+
+  using WarpTileIteratorNotMixed =  cutlass::epilogue::warp::TileIteratorTensorOp<
+    WarpShape,
+    InstructionShape,
+    float,
+    layout::RowMajor
+  >;
+
+  using WarpTileIterator = typename platform::conditional<  // non-mixed for kN == 256, kN == 128 with x8, or x4 accesses
+                             (ThreadblockShape::kN == 256) || (ThreadblockShape::kN == 128 && ElementsPerAccess == 8) || (ElementsPerAccess == 4),
+                             WarpTileIteratorNotMixed,
+                             WarpTileIteratorMixed>::type;
+
+  using SharedLoadIteratorMixed = cutlass::epilogue::threadblock::SharedLoadIteratorMixed<
+    ThreadMap,
+    float,
+    32,
+    cutlass::sizeof_bits<ElementOutput>::value,
+    ElementsPerAccess,
+    8
+  >;
+
+  using SharedLoadIteratorNotMixed = cutlass::epilogue::threadblock::SharedLoadIterator<
+    ThreadMap,
+    float
+  >;
+
+  using SharedLoadIterator = typename platform::conditional<  // same predicate as WarpTileIterator so the pair stays matched
+                             (ThreadblockShape::kN == 256) || (ThreadblockShape::kN == 128 && ElementsPerAccess == 8) || (ElementsPerAccess == 4),
+                             SharedLoadIteratorNotMixed,
+                             SharedLoadIteratorMixed>::type;
+
+  static int const kFragmentsPerIteration = 1;
+};
+
+/// Partial specialization for float_e5m2_t <= float x 16/8/4 epilogues; the mixed iterators avoid shared memory bank conflicts.
+/// Threadblock::kN = 256 still has bank conflicts.
+template <
+  int ElementsPerAccess,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename ThreadMap
+>
+struct DefaultIteratorsTensorOp<
+  cutlass::float_e5m2_t,
+  float, 
+  ElementsPerAccess,
+  ThreadblockShape, 
+  WarpShape, 
+  InstructionShape, 
+  ThreadMap> {
+
+  using ElementOutput = cutlass::float_e5m2_t;
+
+  static_assert((ElementsPerAccess == 16 || ElementsPerAccess == 8 || ElementsPerAccess == 4),
+              "ElementsPerAccess needs to be 16, 8, or 4.");
+  // Two candidate warp tile iterators are declared; the conditional below picks one.
+  using WarpTileIteratorMixed = cutlass::epilogue::warp::TileIteratorTensorOpMixed<
+    WarpShape,
+    InstructionShape,
+    float,
+    32,
+    cutlass::sizeof_bits<ElementOutput>::value,
+    ElementsPerAccess,
+    8
+  >;
+
+  using WarpTileIteratorNotMixed =  cutlass::epilogue::warp::TileIteratorTensorOp<
+    WarpShape,
+    InstructionShape,
+    float,
+    layout::RowMajor
+  >;
+
+  using WarpTileIterator = typename platform::conditional<  // non-mixed for kN == 256, kN == 128 with x8, or x4 accesses
+                             (ThreadblockShape::kN == 256) || (ThreadblockShape::kN == 128 && ElementsPerAccess == 8) || (ElementsPerAccess == 4),
+                             WarpTileIteratorNotMixed,
+                             WarpTileIteratorMixed>::type;
+
+  using SharedLoadIteratorMixed = cutlass::epilogue::threadblock::SharedLoadIteratorMixed<
+    ThreadMap,
+    float,
+    32,
+    cutlass::sizeof_bits<ElementOutput>::value,
+    ElementsPerAccess,
+    8
+  >;
+
+  using SharedLoadIteratorNotMixed = cutlass::epilogue::threadblock::SharedLoadIterator<
+    ThreadMap,
+    float
+  >;
+
+  using SharedLoadIterator = typename platform::conditional<  // same predicate as WarpTileIterator so the pair stays matched
+                             (ThreadblockShape::kN == 256) || (ThreadblockShape::kN == 128 && ElementsPerAccess == 8) || (ElementsPerAccess == 4),
+                             SharedLoadIteratorNotMixed,
+                             SharedLoadIteratorMixed>::type;
+
+  static int const kFragmentsPerIteration = 1;
+};
+
+} // namespace detail
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Defines sensible defaults for epilogues for TensorOps; StrideSupport picks the packed or conv output iterator.
+template <
+  typename Shape_,
+  typename WarpMmaTensorOp_,
+  int PartitionsK,
+  typename OutputOp_,
+  int ElementsPerAccess,
+  bool ScatterD = false,
+  typename PermuteDLayout = layout::NoPermute,
+  conv::StrideSupport StrideSupport = conv::StrideSupport::kUnity,
+  int Rank = 4
+>
+struct DefaultEpilogueTensorOp {
+
+  using Shape = Shape_;
+  using WarpMmaTensorOp = WarpMmaTensorOp_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputOp = OutputOp_;
+  static int const kElementsPerAccess = ElementsPerAccess;
+
+  using ElementOutput = typename OutputOp::ElementOutput;
+  using LayoutC = typename WarpMmaTensorOp::LayoutC;
+  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
+  static conv::StrideSupport const kStrideSupport = StrideSupport;
+  static int const kRank = Rank;
+
+  //
+  // Thread map
+  //
+
+  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
+    Shape,
+    typename WarpMmaTensorOp::Shape,
+    kPartitionsK,
+    ElementOutput,
+    kElementsPerAccess
+  >::Type;
+  // UseCUDAStore is true only when ElementOutput is double; it is forwarded to both output tile iterators below.
+  static bool const UseCUDAStore = platform::is_same<ElementOutput, double>::value;
+
+  using PackedOutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
+    OutputTileThreadMap,
+    ElementOutput,
+    ScatterD,
+    PermuteDLayout,
+    UseCUDAStore
+  >;
+
+  using StridedOutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorConv<
+    OutputTileThreadMap,
+    ElementOutput,
+    ScatterD,
+    PermuteDLayout,
+    UseCUDAStore,
+    kRank
+  >;
+  // StrideSupport::kUnity selects the packed iterator; every other StrideSupport value selects the conv one.
+  using OutputTileIterator = typename platform::conditional<StrideSupport == cutlass::conv::StrideSupport::kUnity,
+                                                            PackedOutputTileIterator,
+                                                            StridedOutputTileIterator>::type;
+  // A complex ElementOutput selects the complex fragment iterator; otherwise the regular TensorOp one is used.
+  using AccumulatorFragmentIterator = typename platform::conditional<is_complex<ElementOutput>::value,
+                                    cutlass::epilogue::warp::FragmentIteratorComplexTensorOp<
+                                        typename WarpMmaTensorOp::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
+                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+                                        LayoutC>,
+                                    cutlass::epilogue::warp::FragmentIteratorTensorOp<
+                                        typename WarpMmaTensorOp::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
+                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+                                        LayoutC> >::type;
+
+  /// Support several implementations depending on structure of epilogue
+  using DefaultIterators = detail::DefaultIteratorsTensorOp<
+    ElementOutput,
+    ElementAccumulator,
+    kElementsPerAccess,
+    Shape,
+    typename WarpMmaTensorOp::Shape,
+    typename WarpMmaTensorOp::Policy::Operator::Shape,
+    typename OutputTileThreadMap::CompactedThreadMap
+  >;
+
+  using WarpTileIterator = typename DefaultIterators::WarpTileIterator;
+  using SharedLoadIterator = typename DefaultIterators::SharedLoadIterator;
+
+  /// Hard-coded padding elements added: MatrixShape<0, (64 / bits per ElementAccumulator) * 4>
+  using Padding = cutlass::MatrixShape<0, 64 / sizeof_bits<ElementAccumulator>::value * 4>;
+  // DefaultIterators::kFragmentsPerIteration is used only when kPartitionsK == 1; otherwise the value is 1.
+  static int const kFragmentsPerIteration = (kPartitionsK == 1 ? DefaultIterators::kFragmentsPerIteration : 1);
+
+  //
+  // Define the epilogue
+  //
+  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
+    Shape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    OutputTileIterator,
+    AccumulatorFragmentIterator,
+    WarpTileIterator,
+    SharedLoadIterator,
+    OutputOp,
+    Padding,
+    kFragmentsPerIteration
+  >;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Defines sensible defaults for TensorOp epilogues that write through PredicatedTileIteratorStridedDgrad.
+template <
+  typename Shape_,
+  typename WarpMmaTensorOp_,
+  int PartitionsK,
+  typename OutputOp_,
+  int ElementsPerAccess
+>
+struct DefaultEpilogueTensorOpStridedDgrad {
+
+  using Shape = Shape_;
+  using WarpMmaTensorOp = WarpMmaTensorOp_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputOp = OutputOp_;
+  static int const kElementsPerAccess = ElementsPerAccess;
+
+  using ElementOutput = typename OutputOp::ElementOutput;
+  using LayoutC = typename WarpMmaTensorOp::LayoutC;
+  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
+
+  //
+  // Thread map
+  //
+
+  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
+    Shape,
+    typename WarpMmaTensorOp::Shape,
+    kPartitionsK,
+    ElementOutput,
+    kElementsPerAccess
+  >::Type;
+
+  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorStridedDgrad<
+    OutputTileThreadMap,
+    ElementOutput
+  >;
+  // A complex ElementOutput selects the complex fragment iterator; otherwise the regular TensorOp one is used.
+  using AccumulatorFragmentIterator = typename platform::conditional<is_complex<ElementOutput>::value,
+                                    cutlass::epilogue::warp::FragmentIteratorComplexTensorOp<
+                                        typename WarpMmaTensorOp::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
+                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+                                        LayoutC>,
+                                    cutlass::epilogue::warp::FragmentIteratorTensorOp<
+                                        typename WarpMmaTensorOp::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
+                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+                                        LayoutC> >::type;
+
+  /// Support several implementations depending on structure of epilogue
+  using DefaultIterators = detail::DefaultIteratorsTensorOp<
+    ElementOutput,
+    ElementAccumulator,
+    kElementsPerAccess,
+    Shape,
+    typename WarpMmaTensorOp::Shape,
+    typename WarpMmaTensorOp::Policy::Operator::Shape,
+    typename OutputTileThreadMap::CompactedThreadMap
+  >;
+
+  using WarpTileIterator = typename DefaultIterators::WarpTileIterator;
+  using SharedLoadIterator = typename DefaultIterators::SharedLoadIterator;
+
+  /// Hard-coded padding elements added: MatrixShape<0, (64 / bits per ElementAccumulator) * 4>
+  using Padding = cutlass::MatrixShape<0, 64 / sizeof_bits<ElementAccumulator>::value * 4>;
+  // DefaultIterators::kFragmentsPerIteration is used only when kPartitionsK == 1; otherwise the value is 1.
+  static int const kFragmentsPerIteration = (kPartitionsK == 1 ? DefaultIterators::kFragmentsPerIteration : 1);
+
+  //
+  // Define the epilogue
+  //
+  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
+    Shape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    OutputTileIterator,
+    AccumulatorFragmentIterator,
+    WarpTileIterator,
+    SharedLoadIterator,
+    OutputOp,
+    Padding,
+    kFragmentsPerIteration
+  >;
+};
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Defines sensible defaults for TensorOp epilogues that write through PredicatedTileIteratorAffineRankN.
+template <
+  int Rank,
+  typename Shape_,
+  typename WarpMmaTensorOp_,
+  int PartitionsK,
+  typename OutputOp_,
+  int ElementsPerAccess
+>
+struct DefaultEpilogueTensorOpAffineRankN {
+
+  using Shape = Shape_;
+  using WarpMmaTensorOp = WarpMmaTensorOp_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputOp = OutputOp_;
+  static int const kElementsPerAccess = ElementsPerAccess;
+
+  using ElementOutput = typename OutputOp::ElementOutput;
+  using LayoutC = typename WarpMmaTensorOp::LayoutC;
+  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
+
+  //
+  // Thread map
+  //
+
+  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
+    Shape,
+    typename WarpMmaTensorOp::Shape,
+    kPartitionsK,
+    ElementOutput,
+    kElementsPerAccess
+  >::Type;
+
+  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorAffineRankN<
+    OutputTileThreadMap,
+    ElementOutput,
+    Rank
+  >;
+
+  // Map to the row major iterator since the iterator selection for affineN is the same (LayoutC is not used here).
+  using AccumulatorFragmentIterator = typename platform::conditional<is_complex<ElementOutput>::value,
+                                    cutlass::epilogue::warp::FragmentIteratorComplexTensorOp<
+                                        typename WarpMmaTensorOp::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
+                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+                                        layout::RowMajor>,
+                                    cutlass::epilogue::warp::FragmentIteratorTensorOp<
+                                        typename WarpMmaTensorOp::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
+                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+                                        layout::RowMajor> >::type;
+
+  /// Support several implementations depending on structure of epilogue
+  using DefaultIterators = detail::DefaultIteratorsTensorOp<
+    ElementOutput,
+    ElementAccumulator,
+    kElementsPerAccess,
+    Shape,
+    typename WarpMmaTensorOp::Shape,
+    typename WarpMmaTensorOp::Policy::Operator::Shape,
+    typename OutputTileThreadMap::CompactedThreadMap
+  >;
+
+  using WarpTileIterator = typename DefaultIterators::WarpTileIterator;
+  using SharedLoadIterator = typename DefaultIterators::SharedLoadIterator;
+
+  /// Hard-coded padding elements added: MatrixShape<0, (64 / bits per ElementAccumulator) * 4>
+  using Padding = cutlass::MatrixShape<0, 64 / sizeof_bits<ElementAccumulator>::value * 4>;
+  // DefaultIterators::kFragmentsPerIteration is used only when kPartitionsK == 1; otherwise the value is 1.
+  static int const kFragmentsPerIteration = (kPartitionsK == 1 ? DefaultIterators::kFragmentsPerIteration : 1);
+
+  //
+  // Define the epilogue
+  //
+  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
+    Shape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    OutputTileIterator,
+    AccumulatorFragmentIterator,
+    WarpTileIterator,
+    SharedLoadIterator,
+    OutputOp,
+    Padding,
+    kFragmentsPerIteration
+  >;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+/// Defines sensible defaults for epilogues for TensorOps which use an
+/// interleaved output layout. For this case, shared memory is not needed.
+template <typename Shape_, typename WarpMmaTensorOp_, int PartitionsK,
+          typename OutputOp_, int ElementsPerAccess, int InterleavedK,
+          bool isSplitK = false>
+struct DefaultInterleavedEpilogueTensorOp {
+  using Shape = Shape_;
+  using WarpMmaTensorOp = WarpMmaTensorOp_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputOp = OutputOp_;
+  static int const kElementsPerAccess = ElementsPerAccess;
+
+  using ElementOutput = typename OutputOp::ElementOutput;
+  using LayoutC = typename WarpMmaTensorOp::LayoutC;
+  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
+  // NOTE(review): the isSplitK template parameter is not referenced anywhere inside this struct.
+  //
+  // Thread map
+  //
+  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::
+      DefaultInterleavedThreadMapTensorOp<
+          Shape, typename WarpMmaTensorOp::Shape, kPartitionsK, ElementOutput,
+          kElementsPerAccess, InterleavedK>::Type;
+
+  using OutputTileIterator =
+      cutlass::epilogue::threadblock::InterleavedPredicatedTileIterator<
+          OutputTileThreadMap, ElementOutput, InterleavedK>;
+
+  using AccumulatorFragmentIterator =
+      cutlass::epilogue::warp::FragmentIteratorTensorOp<
+          typename WarpMmaTensorOp::Shape,
+          typename WarpMmaTensorOp::Policy::Operator::Shape,
+          typename WarpMmaTensorOp::Policy::Operator::ElementC,
+          typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+          LayoutC>;
+
+  //
+  // Define the epilogue (InterleavedEpilogue takes no warp-tile or shared-load iterator)
+  //
+  using Epilogue = cutlass::epilogue::threadblock::InterleavedEpilogue<
+      Shape, WarpMmaTensorOp, kPartitionsK, OutputTileIterator,
+      AccumulatorFragmentIterator, OutputOp, InterleavedK>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Defines sensible defaults for conv epilogues for TensorOps which use an
+/// interleaved output layout. For this case, shared memory is not needed.
+template <typename Shape_, typename WarpMmaTensorOp_, int PartitionsK,
+          typename OutputOp_, int ElementsPerAccess, int InterleavedK,
+          bool isSplitK = false>
+struct DefaultInterleavedConvEpilogue {
+  using Shape = Shape_;
+  using WarpMmaTensorOp = WarpMmaTensorOp_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputOp = OutputOp_;
+  static int const kElementsPerAccess = ElementsPerAccess;
+
+  using ElementOutput = typename OutputOp::ElementOutput;
+  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
+  // NOTE(review): the isSplitK template parameter is not referenced anywhere inside this struct.
+  //
+  // Thread map
+  //
+  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::
+      DefaultInterleavedConvThreadMapTensorOp<
+          Shape, typename WarpMmaTensorOp::Shape, kPartitionsK, ElementOutput,
+          kElementsPerAccess, InterleavedK>::Type;
+
+  using OutputTileIterator =
+      cutlass::epilogue::threadblock::InterleavedConvPredicatedTileIterator<
+          OutputTileThreadMap, ElementOutput, InterleavedK>;
+
+  using AccumulatorFragmentIterator =
+      cutlass::epilogue::warp::FragmentIteratorTensorOp<
+          typename WarpMmaTensorOp::Shape,
+          typename WarpMmaTensorOp::Policy::Operator::Shape,
+          typename WarpMmaTensorOp::Policy::Operator::ElementC,
+          typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+          // can reuse the gemm version here to do element selection
+          layout::ColumnMajorInterleaved<InterleavedK>>;
+
+  //
+  // Define the epilogue (InterleavedEpilogue takes no warp-tile or shared-load iterator)
+  //
+  using Epilogue = cutlass::epilogue::threadblock::InterleavedEpilogue<
+      Shape, WarpMmaTensorOp, kPartitionsK, OutputTileIterator,
+      AccumulatorFragmentIterator, OutputOp, InterleavedK>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op_blas3.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op_blas3.h
new file mode 100755
index 000000000..e1ae5a24c
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op_blas3.h
@@ -0,0 +1,175 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
+
+  The epilogue rearranges the result of a matrix product through shared memory to match canonical
+  tensor layouts in global memory. Epilogues support conversion and reduction operations.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/epilogue/thread/linear_combination_clamp.h"
+#include "cutlass/epilogue/thread/linear_combination_relu.h"
+#include "cutlass/epilogue/thread/linear_combination_gelu.h"
+#include "cutlass/epilogue/thread/linear_combination_sigmoid.h"
+#include "cutlass/epilogue/thread/linear_combination_planar_complex.h"
+
+#include "cutlass/epilogue/thread/conversion_op.h"
+#include "cutlass/epilogue/thread/reduction_op.h"
+
+#include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h"
+
+#include "cutlass/epilogue/warp/fragment_iterator_tensor_op.h"
+#include "cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h"
+#include "cutlass/epilogue/warp/tile_iterator_tensor_op.h"
+#include "cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h"
+#include "cutlass/epilogue/threadblock/default_thread_map_tensor_op.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator_blas3.h"
+#include "cutlass/epilogue/threadblock/shared_load_iterator.h"
+#include "cutlass/epilogue/threadblock/shared_load_iterator_mixed.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/threadblock/interleaved_epilogue.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Defines sensible defaults for TensorOp epilogues that write through PredicatedTileIteratorBlas3.
+template <
+  typename Shape_,
+  typename WarpMmaTensorOp_,
+  int PartitionsK,
+  typename OutputOp_,
+  int ElementsPerAccess,
+  /// BLAS mode forwarded to the output tile iterator (defaults to BlasMode::kGemm)
+  BlasMode BlasMode_ = BlasMode::kGemm
+>
+struct DefaultEpilogueTensorOpBlas3 {
+
+  using Shape = Shape_;
+  using WarpMmaTensorOp = WarpMmaTensorOp_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputOp = OutputOp_;
+  static int const kElementsPerAccess = ElementsPerAccess;
+  static BlasMode const kBlasMode = BlasMode_;
+
+  using ElementOutput = typename OutputOp::ElementOutput;
+  using LayoutC = typename WarpMmaTensorOp::LayoutC;
+  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
+
+  //
+  // Thread map
+  //
+
+  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
+    Shape,
+    typename WarpMmaTensorOp::Shape,
+    kPartitionsK,
+    ElementOutput,
+    kElementsPerAccess
+  >::Type;
+
+  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorBlas3<
+    OutputTileThreadMap,
+    ElementOutput,
+    kBlasMode
+  >;
+  // A complex ElementOutput selects the complex fragment iterator; otherwise the regular TensorOp one is used.
+  using AccumulatorFragmentIterator = typename platform::conditional<is_complex<ElementOutput>::value,
+                                    cutlass::epilogue::warp::FragmentIteratorComplexTensorOp<
+                                        typename WarpMmaTensorOp::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
+                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+                                        LayoutC>,
+                                    cutlass::epilogue::warp::FragmentIteratorTensorOp<
+                                        typename WarpMmaTensorOp::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
+                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+                                        LayoutC> >::type;
+
+  /// Support several implementations depending on structure of epilogue
+  using DefaultIterators = detail::DefaultIteratorsTensorOp<
+    ElementOutput,
+    ElementAccumulator,
+    kElementsPerAccess,
+    Shape,
+    typename WarpMmaTensorOp::Shape,
+    typename WarpMmaTensorOp::Policy::Operator::Shape,
+    typename OutputTileThreadMap::CompactedThreadMap
+  >;
+
+  using WarpTileIterator = typename DefaultIterators::WarpTileIterator;
+  using SharedLoadIterator = typename DefaultIterators::SharedLoadIterator;
+
+  /// Hard-coded padding elements added: MatrixShape<0, (64 / bits per ElementAccumulator) * 4>
+  using Padding = cutlass::MatrixShape<0, 64 / sizeof_bits<ElementAccumulator>::value * 4>;
+
+  //
+  // Define the epilogue
+  //
+  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
+    Shape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    OutputTileIterator,
+    AccumulatorFragmentIterator,
+    WarpTileIterator,
+    SharedLoadIterator,
+    OutputOp,
+    Padding
+  >;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h
new file mode 100755
index 000000000..f73edfdec
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h
@@ -0,0 +1,337 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops on Volta.
+
+  The epilogue rearranges the result of a matrix product through shared memory to match canonical
+  tensor layouts in global memory. Epilogues support conversion and reduction operations.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/epilogue/thread/linear_combination_clamp.h"
+#include "cutlass/epilogue/thread/linear_combination_relu.h"
+#include "cutlass/epilogue/thread/linear_combination_gelu.h"
+#include "cutlass/epilogue/thread/linear_combination_sigmoid.h"
+#include "cutlass/epilogue/thread/linear_combination_planar_complex.h"
+
+#include "cutlass/epilogue/thread/conversion_op.h"
+#include "cutlass/epilogue/thread/reduction_op.h"
+
+#include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator_strided_dgrad.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator_affine.h"
+#include "cutlass/epilogue/threadblock/shared_load_iterator.h"
+
+#include "cutlass/epilogue/warp/fragment_iterator_volta_tensor_op.h"
+#include "cutlass/epilogue/warp/tile_iterator_volta_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_thread_map_volta_tensor_op.h"
+
+#include "cutlass/epilogue/threadblock/epilogue.h"
+
+#include "cutlass/layout/permute.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines sensible defaults for Volta TensorOp epilogues; ScatterD and PermuteDLayout go to the output iterator.
+template <
+  typename Shape_,
+  typename WarpMmaTensorOp_,
+  int PartitionsK,
+  typename OutputOp_,
+  int ElementsPerAccess,
+  bool ScatterD = false,
+  typename PermuteDLayout = layout::NoPermute
+>
+struct DefaultEpilogueVoltaTensorOp {
+
+  using Shape = Shape_;
+  using WarpMmaTensorOp = WarpMmaTensorOp_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputOp = OutputOp_;
+  static int const kElementsPerAccess = ElementsPerAccess;
+
+  using ElementOutput = typename OutputOp::ElementOutput;
+  using LayoutC = typename WarpMmaTensorOp::LayoutC;
+  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
+
+  //
+  // Thread map
+  //
+
+  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapVoltaTensorOp<
+    Shape,
+    typename WarpMmaTensorOp::Shape,
+    kPartitionsK,
+    ElementOutput,
+    kElementsPerAccess,
+    ElementAccumulator
+  >::Type;
+
+  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
+    OutputTileThreadMap,
+    ElementOutput,
+    ScatterD,
+    PermuteDLayout
+  >;
+
+  using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorVoltaTensorOp<
+    typename WarpMmaTensorOp::Shape,
+    gemm::GemmShape<32, 32, 4>,
+    ElementAccumulator,
+    LayoutC
+  >;
+
+  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorVoltaTensorOp<
+    typename WarpMmaTensorOp::Shape,
+    gemm::GemmShape<32, 32, 4>,
+    ElementAccumulator,
+    LayoutC
+  >;
+  // Size in bytes of one warp-tile access: bits per accumulator element * elements per access / 8.
+  static int const kSharedMemAlignment = sizeof_bits<ElementAccumulator>::value * WarpTileIterator::kElementsPerAccess / 8;
+
+  static_assert(kSharedMemAlignment == 8, "Shared memory alignment must be 8B");
+
+  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
+    typename OutputTileThreadMap::CompactedThreadMap,
+    ElementAccumulator,
+    kSharedMemAlignment
+  >;
+
+  /// Padding is taken from the warp tile iterator rather than being hard-coded here
+  using Padding = typename WarpTileIterator::Padding;
+
+  //
+  // Define the epilogue
+  //
+  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
+    Shape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    OutputTileIterator,
+    AccumulatorFragmentIterator,
+    WarpTileIterator,
+    SharedLoadIterator,
+    OutputOp,
+    Padding
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines sensible defaults for Volta TensorOp epilogues that write through PredicatedTileIteratorStridedDgrad.
+template <
+  typename Shape_,
+  typename WarpMmaTensorOp_,
+  int PartitionsK,
+  typename OutputOp_,
+  int ElementsPerAccess
+>
+struct DefaultEpilogueVoltaTensorOpStridedDgrad {
+
+  using Shape = Shape_;
+  using WarpMmaTensorOp = WarpMmaTensorOp_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputOp = OutputOp_;
+  static int const kElementsPerAccess = ElementsPerAccess;
+
+  using ElementOutput = typename OutputOp::ElementOutput;
+  using LayoutC = typename WarpMmaTensorOp::LayoutC;
+  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
+
+  //
+  // Thread map
+  //
+
+  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapVoltaTensorOp<
+    Shape,
+    typename WarpMmaTensorOp::Shape,
+    kPartitionsK,
+    ElementOutput,
+    kElementsPerAccess,
+    ElementAccumulator
+  >::Type;
+
+  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorStridedDgrad<
+    OutputTileThreadMap,
+    ElementOutput
+  >;
+
+  using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorVoltaTensorOp<
+    typename WarpMmaTensorOp::Shape,
+    gemm::GemmShape<32, 32, 4>,
+    ElementAccumulator,
+    LayoutC
+  >;
+
+  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorVoltaTensorOp<
+    typename WarpMmaTensorOp::Shape,
+    gemm::GemmShape<32, 32, 4>,
+    ElementAccumulator,
+    LayoutC
+  >;
+  // Size in bytes of one warp-tile access: bits per accumulator element * elements per access / 8.
+  static int const kSharedMemAlignment = sizeof_bits<ElementAccumulator>::value * WarpTileIterator::kElementsPerAccess / 8;
+
+  static_assert(kSharedMemAlignment == 8, "Shared memory alignment must be 8B");
+
+  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
+    typename OutputTileThreadMap::CompactedThreadMap,
+    ElementAccumulator,
+    kSharedMemAlignment
+  >;
+
+  /// Padding is taken from the warp tile iterator rather than being hard-coded here
+  using Padding = typename WarpTileIterator::Padding;
+
+  //
+  // Define the epilogue
+  //
+  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
+    Shape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    OutputTileIterator,
+    AccumulatorFragmentIterator,
+    WarpTileIterator,
+    SharedLoadIterator,
+    OutputOp,
+    Padding
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines sensible defaults for Volta TensorOp epilogues that write through PredicatedTileIteratorAffineRankN.
+template <
+  int Rank,
+  typename Shape_,
+  typename WarpMmaTensorOp_,
+  int PartitionsK,
+  typename OutputOp_,
+  int ElementsPerAccess
+>
+struct DefaultEpilogueVoltaTensorOpAffineRankN {
+
+  using Shape = Shape_;
+  using WarpMmaTensorOp = WarpMmaTensorOp_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputOp = OutputOp_;
+  static int const kElementsPerAccess = ElementsPerAccess;
+
+  using ElementOutput = typename OutputOp::ElementOutput;
+  using LayoutC = typename WarpMmaTensorOp::LayoutC;
+  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
+
+  //
+  // Thread map
+  //
+
+  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapVoltaTensorOp<
+    Shape,
+    typename WarpMmaTensorOp::Shape,
+    kPartitionsK,
+    ElementOutput,
+    kElementsPerAccess,
+    ElementAccumulator
+  >::Type;
+
+  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorAffineRankN<
+    OutputTileThreadMap,
+    ElementOutput,
+    Rank
+  >;
+
+  using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorVoltaTensorOp<
+    typename WarpMmaTensorOp::Shape,
+    gemm::GemmShape<32, 32, 4>,
+    ElementAccumulator,
+    LayoutC
+  >;
+
+  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorVoltaTensorOp<
+    typename WarpMmaTensorOp::Shape,
+    gemm::GemmShape<32, 32, 4>,
+    ElementAccumulator,
+    LayoutC
+  >;
+  // Size in bytes of one warp-tile access: bits per accumulator element * elements per access / 8.
+  static int const kSharedMemAlignment = sizeof_bits<ElementAccumulator>::value * WarpTileIterator::kElementsPerAccess / 8;
+
+  static_assert(kSharedMemAlignment == 8, "Shared memory alignment must be 8B");
+
+  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
+    typename OutputTileThreadMap::CompactedThreadMap,
+    ElementAccumulator,
+    kSharedMemAlignment
+  >;
+
+  /// Padding is taken from the warp tile iterator rather than being hard-coded here
+  using Padding = typename WarpTileIterator::Padding;
+
+  //
+  // Define the epilogue
+  //
+  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
+    Shape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    OutputTileIterator,
+    AccumulatorFragmentIterator,
+    WarpTileIterator,
+    SharedLoadIterator,
+    OutputOp,
+    Padding
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_with_absmax.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_with_absmax.h
new file mode 100755
index 000000000..b0e89a4ed
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_with_absmax.h
@@ -0,0 +1,126 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+  \brief Default configuration for epilogue computing absolute maximum of output and auxiliary outputs.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/threadblock/epilogue_with_absmax.h"
+
+#include "cutlass/layout/permute.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Defines sensible defaults for absolute-maximum-computing epilogues with TensorOps.
+template <
+  typename Shape,
+  typename WarpMmaTensorOp,
+  int PartitionsK,
+  typename ElementOutput,      ///< Element type of OutputTileIterator
+  typename ElementAuxOutput,   ///< Element type of AuxOutputTileIterator
+  typename ElementVector,      ///< Forwarded unchanged to EpilogueWithAbsMax
+  typename OutputOp,
+  int ElementsPerAccess,
+  bool ScatterD = false,
+  typename PermuteDLayout = layout::NoPermute
+>
+struct DefaultEpilogueWithAbsMax {
+
+  /// Base TensorOp epilogue defaults; supplies the thread map, accumulator/warp/shared-load iterators and padding
+  using Base = DefaultEpilogueTensorOp<
+    Shape,
+    WarpMmaTensorOp,
+    PartitionsK,
+    OutputOp,
+    ElementsPerAccess
+  >;
+
+  //
+  // Stores the output: ElementOutput tiles on Base::OutputTileThreadMap, parameterized by ScatterD/PermuteDLayout
+  //
+  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
+    typename Base::OutputTileThreadMap,
+    ElementOutput,
+    ScatterD,
+    PermuteDLayout
+  >;
+
+  //
+  // Stores the auxiliary output: same thread map and ScatterD/PermuteDLayout, but with ElementAuxOutput elements
+  //
+  using AuxOutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
+    typename Base::OutputTileThreadMap,
+    ElementAuxOutput,
+    ScatterD,
+    PermuteDLayout
+  >;
+
+  /// Define the epilogue from the two tile iterators above and the remaining components of Base
+  using Epilogue = EpilogueWithAbsMax<
+    Shape,
+    WarpMmaTensorOp,
+    PartitionsK,
+    OutputTileIterator,
+    AuxOutputTileIterator,
+    ElementVector,
+    typename Base::AccumulatorFragmentIterator,
+    typename Base::WarpTileIterator,
+    typename Base::SharedLoadIterator,
+    OutputOp,
+    typename Base::Padding,
+    Base::kFragmentsPerIteration
+  >;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h
new file mode 100755
index 000000000..16e045e1e
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h
@@ -0,0 +1,376 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Default epilogue-with-broadcast configurations for threadblock scoped GEMMs using SIMT and Tensor Ops.
+
+  The epilogue rearranges the result of a matrix product through shared memory to match canonical
+  tensor layouts in global memory. Epilogues support conversion and reduction operations.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/threadblock/epilogue_with_broadcast.h"
+#include "cutlass/epilogue/threadblock/epilogue_streamk_with_broadcast.h"
+
+#include "cutlass/layout/permute.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+////////////////////////////////////////////////////////////////////////////////
+
+/// Defines sensible defaults for broadcast epilogues for SimtOps.
+template <
+  typename Shape,
+  typename WarpMmaSimt,
+  typename ElementOutput,      ///< Element type of OutputTileIterator
+  typename ElementTensor,      ///< Element type of TensorTileIterator
+  typename ElementVector,      ///< Forwarded unchanged to EpilogueWithBroadcast
+  typename OutputOp,
+  int ElementsPerAccess,
+  bool ScatterD = false,
+  typename PermuteDLayout = layout::NoPermute,
+  conv::StrideSupport StrideSupport = conv::StrideSupport::kUnity,  ///< Selects packed vs. strided output iterator
+  int Rank = 4                                                      ///< Forwarded to the strided (conv) output iterator
+>
+struct DefaultEpilogueWithBroadcastSimt {
+
+  static conv::StrideSupport const kStrideSupport = StrideSupport;
+  static int const kRank = Rank;
+
+  static bool const UseCUDAStore = platform::is_same<ElementOutput, double>::value;  // true only when ElementOutput is double
+
+  /// Base SIMT epilogue defaults; supplies the thread map, kPartitionsK, iterators and padding
+  using Base = DefaultEpilogueSimt<
+    Shape,
+    WarpMmaSimt,
+    OutputOp,
+    ElementsPerAccess
+  >;
+
+  using PackedOutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
+    typename Base::OutputTileThreadMap,
+    ElementOutput,
+    ScatterD,
+    PermuteDLayout,
+    UseCUDAStore
+  >;
+
+  using StridedOutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorConv<
+    typename Base::OutputTileThreadMap,
+    ElementOutput,
+    ScatterD,
+    PermuteDLayout,
+    UseCUDAStore,
+    kRank
+  >;
+
+  //
+  // Stores the result z = (y = GEMM(A, B, C), broadcast); packed iterator for kUnity stride support, strided otherwise
+  //
+  using OutputTileIterator = typename platform::conditional<StrideSupport == cutlass::conv::StrideSupport::kUnity,
+                                                            PackedOutputTileIterator,
+                                                            StridedOutputTileIterator>::type;
+
+  //
+  // Additional tensor tile iterator - stores t = Elementwise(z); ScatterD/PermuteDLayout are not forwarded here
+  //
+  using TensorTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
+    typename Base::OutputTileThreadMap,
+    ElementTensor
+  >;
+  /// Define the epilogue from the iterators above and the remaining components of Base
+  using Epilogue = EpilogueWithBroadcast<
+    Shape,
+    WarpMmaSimt,
+    Base::kPartitionsK,
+    OutputTileIterator,
+    TensorTileIterator,
+    ElementVector,
+    typename Base::AccumulatorFragmentIterator,
+    typename Base::WarpTileIterator,
+    typename Base::SharedLoadIterator,
+    OutputOp,
+    typename Base::Padding
+  >;
+};
+////////////////////////////////////////////////////////////////////////////////
+
+/// Defines sensible defaults for strided dgrad broadcast epilogues for SimtOps.
+template <
+  typename Shape,
+  typename WarpMmaSimt,
+  typename ElementOutput,      ///< Element type of OutputTileIterator
+  typename ElementTensor,      ///< Element type of TensorTileIterator
+  typename ElementVector,      ///< Forwarded unchanged to EpilogueWithBroadcast
+  typename OutputOp,
+  int ElementsPerAccess,
+  bool ScatterD = false,                        ///< Not referenced inside this struct
+  typename PermuteDLayout = layout::NoPermute   ///< Not referenced inside this struct
+>
+struct DefaultEpilogueWithBroadcastSimtStridedDgrad {
+
+  /// Base SIMT strided-dgrad epilogue defaults; supplies the thread map, kPartitionsK, iterators and padding
+  using Base = DefaultEpilogueSimtStridedDgrad<
+    Shape,
+    WarpMmaSimt,
+    OutputOp,
+    ElementsPerAccess
+  >;
+
+  //
+  // Stores the result z = (y = GEMM(A, B, C), broadcast) through the strided-dgrad tile iterator
+  //
+  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorStridedDgrad<
+    typename Base::OutputTileThreadMap,
+    ElementOutput
+  >;
+
+  //
+  // Additional tensor tile iterator - stores t = Elementwise(z) through the strided-dgrad tile iterator
+  //
+  using TensorTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorStridedDgrad<
+    typename Base::OutputTileThreadMap,
+    ElementTensor
+  >;
+
+  /// Define the epilogue from the iterators above and the remaining components of Base
+  using Epilogue = EpilogueWithBroadcast<
+    Shape,
+    WarpMmaSimt,
+    Base::kPartitionsK,
+    OutputTileIterator,
+    TensorTileIterator,
+    ElementVector,
+    typename Base::AccumulatorFragmentIterator,
+    typename Base::WarpTileIterator,
+    typename Base::SharedLoadIterator,
+    OutputOp,
+    typename Base::Padding
+  >;
+};
+////////////////////////////////////////////////////////////////////////////////
+
+/// Defines sensible defaults for broadcast epilogues for TensorOps.
+template <
+  typename Shape,
+  typename WarpMmaTensorOp,
+  int PartitionsK,
+  typename ElementOutput,      ///< Element type of OutputTileIterator
+  typename ElementTensor,      ///< Element type of TensorTileIterator
+  typename ElementVector,      ///< Forwarded unchanged to EpilogueWithBroadcast
+  typename OutputOp,
+  int ElementsPerAccess,
+  bool ScatterD = false,
+  typename PermuteDLayout = layout::NoPermute
+>
+struct DefaultEpilogueWithBroadcastTensorOp {
+
+  /// Base TensorOp epilogue defaults; supplies the thread map, accumulator/warp/shared-load iterators and padding
+  using Base = DefaultEpilogueTensorOp<
+    Shape,
+    WarpMmaTensorOp,
+    PartitionsK,
+    OutputOp,
+    ElementsPerAccess
+  >;
+
+  //
+  // Stores the result z = (y = GEMM(A, B, C), broadcast); parameterized by ScatterD/PermuteDLayout
+  //
+  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
+    typename Base::OutputTileThreadMap,
+    ElementOutput,
+    ScatterD,
+    PermuteDLayout
+  >;
+
+  //
+  // Additional tensor tile iterator - stores t = Elementwise(z); ScatterD/PermuteDLayout are not forwarded here
+  //
+  using TensorTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
+    typename Base::OutputTileThreadMap,
+    ElementTensor
+  >;
+
+  /// Define the epilogue from the iterators above and the remaining components of Base
+  using Epilogue = EpilogueWithBroadcast<
+    Shape,
+    WarpMmaTensorOp,
+    PartitionsK,
+    OutputTileIterator,
+    TensorTileIterator,
+    ElementVector,
+    typename Base::AccumulatorFragmentIterator,
+    typename Base::WarpTileIterator,
+    typename Base::SharedLoadIterator,
+    OutputOp,
+    typename Base::Padding,
+    Base::kFragmentsPerIteration
+  >;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Defines sensible defaults for streamk broadcast epilogues for TensorOps.
+template <
+  typename Shape,
+  typename WarpMmaTensorOp,
+  int PartitionsK,
+  typename ElementOutput,      ///< Element type of OutputTileIterator
+  typename ElementTensor,      ///< Element type of TensorTileIterator
+  typename ElementVector,      ///< Forwarded unchanged to EpilogueStreamkWithBroadcast
+  typename OutputOp,
+  int ElementsPerAccess,
+  bool ScatterD = false,
+  typename PermuteDLayout = layout::NoPermute
+>
+struct DefaultStreamkEpilogueWithBroadcastTensorOp {
+
+  /// Base TensorOp epilogue defaults; supplies the thread map, accumulator/warp/shared-load iterators and padding
+  using Base = DefaultEpilogueTensorOp<
+    Shape,
+    WarpMmaTensorOp,
+    PartitionsK,
+    OutputOp,
+    ElementsPerAccess
+  >;
+
+  //
+  // Stores the result z = (y = GEMM(A, B, C), broadcast); parameterized by ScatterD/PermuteDLayout
+  //
+  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
+    typename Base::OutputTileThreadMap,
+    ElementOutput,
+    ScatterD,
+    PermuteDLayout
+  >;
+
+  //
+  // Additional tensor tile iterator - stores t = Elementwise(z); ScatterD/PermuteDLayout are not forwarded here
+  //
+  using TensorTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
+    typename Base::OutputTileThreadMap,
+    ElementTensor
+  >;
+
+  /// Define the epilogue; same argument list as the non-streamk variant but built on EpilogueStreamkWithBroadcast
+  using Epilogue = EpilogueStreamkWithBroadcast<
+    Shape,
+    WarpMmaTensorOp,
+    PartitionsK,
+    OutputTileIterator,
+    TensorTileIterator,
+    ElementVector,
+    typename Base::AccumulatorFragmentIterator,
+    typename Base::WarpTileIterator,
+    typename Base::SharedLoadIterator,
+    OutputOp,
+    typename Base::Padding,
+    Base::kFragmentsPerIteration
+  >;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Defines sensible defaults for broadcast epilogues for VoltaTensorOps.
+template <
+  typename Shape,
+  typename WarpMmaTensorOp,
+  int PartitionsK,
+  typename ElementOutput,      ///< Element type of OutputTileIterator
+  typename ElementTensor,      ///< Element type of TensorTileIterator
+  typename ElementVector,      ///< Forwarded unchanged to EpilogueWithBroadcast
+  typename OutputOp,
+  int ElementsPerAccess
+>
+struct DefaultEpilogueWithBroadcastVoltaTensorOp {
+
+  /// Base Volta TensorOp epilogue defaults; supplies the thread map, accumulator/warp/shared-load iterators and padding
+  using Base = DefaultEpilogueVoltaTensorOp<
+    Shape,
+    WarpMmaTensorOp,
+    PartitionsK,
+    OutputOp,
+    ElementsPerAccess
+  >;
+
+  //
+  // Stores the result z = (y = GEMM(A, B, C), broadcast); remaining iterator parameters keep their defaults
+  //
+  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
+    typename Base::OutputTileThreadMap,
+    ElementOutput
+  >;
+
+  //
+  // Additional tensor tile iterator - stores t = Elementwise(z); remaining iterator parameters keep their defaults
+  //
+  using TensorTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
+    typename Base::OutputTileThreadMap,
+    ElementTensor
+  >;
+
+  /// Define the epilogue from the iterators above and the remaining components of Base
+  using Epilogue = EpilogueWithBroadcast<
+    Shape,
+    WarpMmaTensorOp,
+    PartitionsK,
+    OutputTileIterator,
+    TensorTileIterator,
+    ElementVector,
+    typename Base::AccumulatorFragmentIterator,
+    typename Base::WarpTileIterator,
+    typename Base::SharedLoadIterator,
+    OutputOp,
+    typename Base::Padding
+  >;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_with_reduction.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_with_reduction.h
new file mode 100755
index 000000000..34ecfb741
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_with_reduction.h
@@ -0,0 +1,177 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+
+  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
+
+  The epilogue rearranges the result of a matrix product through shared memory to match canonical
+  tensor layouts in global memory. Epilogues support conversion and reduction operations.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/threadblock/epilogue_with_reduction.h"
+
+#include "cutlass/layout/permute.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Defines sensible defaults for reduction epilogues for TensorOps.
+template <
+  typename Shape,
+  typename WarpMmaTensorOp,
+  int PartitionsK,
+  typename ElementOutput,      ///< Element type of OutputTileIterator
+  typename OutputOp,           ///< Must expose a nested ElementTensor type (used by TensorTileIterator)
+  typename ReductionOp,        ///< Forwarded unchanged to EpilogueWithReduction
+  int ElementsPerAccess,
+  bool ScatterD = false,
+  typename PermuteDLayout = layout::NoPermute
+>
+struct DefaultEpilogueWithReductionTensorOp {
+
+  /// Base TensorOp epilogue defaults; supplies the thread map, iterators, OutputOp and padding
+  using Base = DefaultEpilogueTensorOp<
+    Shape,
+    WarpMmaTensorOp,
+    PartitionsK,
+    OutputOp,
+    ElementsPerAccess
+  >;
+
+  /// Additional tensor tile iterator; element type is OutputOp::ElementTensor, ScatterD/PermuteDLayout not forwarded
+  using TensorTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
+    typename Base::OutputTileThreadMap,
+    typename OutputOp::ElementTensor
+  >;
+
+  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
+    typename Base::OutputTileThreadMap,
+    ElementOutput,
+    ScatterD,
+    PermuteDLayout
+  >;
+
+  /// Define the epilogue; WarpMmaTensorOp::ElementC is passed in the slot following TensorTileIterator
+  using Epilogue = EpilogueWithReduction<
+    Shape,
+    WarpMmaTensorOp,
+    PartitionsK,
+    OutputTileIterator,
+    TensorTileIterator,
+    typename WarpMmaTensorOp::ElementC,
+    typename Base::AccumulatorFragmentIterator,
+    typename Base::WarpTileIterator,
+    typename Base::SharedLoadIterator,
+    typename Base::OutputOp,
+    ReductionOp,
+    typename Base::Padding
+  >;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Defines sensible defaults for reduction epilogues for VoltaTensorOps.
+template <
+  typename Shape,
+  typename WarpMmaTensorOp,
+  int PartitionsK,
+  typename ElementOutput,      ///< Element type of OutputTileIterator
+  typename OutputOp,           ///< Must expose a nested ElementTensor type (used by TensorTileIterator)
+  typename ReductionOp,        ///< Forwarded unchanged to EpilogueWithReduction
+  int ElementsPerAccess,
+  bool ScatterD = false,
+  typename PermuteDLayout = layout::NoPermute
+>
+struct DefaultEpilogueWithReductionVoltaTensorOp {
+
+  /// Base Volta TensorOp epilogue defaults; supplies the thread map, iterators, OutputOp and padding
+  using Base = DefaultEpilogueVoltaTensorOp<
+    Shape,
+    WarpMmaTensorOp,
+    PartitionsK,
+    OutputOp,
+    ElementsPerAccess
+  >;
+
+  /// Additional tensor tile iterator; element type is OutputOp::ElementTensor, ScatterD/PermuteDLayout not forwarded
+  using TensorTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
+    typename Base::OutputTileThreadMap,
+    typename OutputOp::ElementTensor
+  >;
+
+  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
+    typename Base::OutputTileThreadMap,
+    ElementOutput,
+    ScatterD,
+    PermuteDLayout
+  >;
+
+  /// Define the epilogue; WarpMmaTensorOp::ElementC is passed in the slot following TensorTileIterator
+  using Epilogue = EpilogueWithReduction<
+    Shape,
+    WarpMmaTensorOp,
+    PartitionsK,
+    OutputTileIterator,
+    TensorTileIterator,
+    typename WarpMmaTensorOp::ElementC,
+    typename Base::AccumulatorFragmentIterator,
+    typename Base::WarpTileIterator,
+    typename Base::SharedLoadIterator,
+    typename Base::OutputOp,
+    ReductionOp,
+    typename Base::Padding
+  >;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h
new file mode 100755
index 000000000..3b1c5dc19
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h
@@ -0,0 +1,165 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue for threadblock scoped GEMMs using WMMA.
+
+  The epilogue rearranges the result of a matrix product through shared memory to match canonical
+  tensor layouts in global memory. Epilogues support conversion and reduction operations.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/epilogue/thread/linear_combination_clamp.h"
+#include "cutlass/epilogue/thread/linear_combination_relu.h"
+#include "cutlass/epilogue/thread/linear_combination_gelu.h"
+#include "cutlass/epilogue/thread/linear_combination_sigmoid.h"
+#include "cutlass/epilogue/thread/linear_combination_planar_complex.h"
+
+#include "cutlass/epilogue/thread/conversion_op.h"
+#include "cutlass/epilogue/thread/reduction_op.h"
+
+#include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h"
+
+#include "cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h"
+#include "cutlass/epilogue/warp/tile_iterator_wmma_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
+#include "cutlass/epilogue/threadblock/shared_load_iterator.h"
+
+#include "cutlass/epilogue/threadblock/epilogue.h"
+
+#include "cutlass/layout/permute.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Defines sensible defaults for epilogues for WMMA TensorOps.
+template <
+  typename Shape_,
+  typename WarpMmaTensorOp_,
+  int PartitionsK,
+  typename OutputOp_,
+  int ElementsPerAccess,
+  bool ScatterD = false,
+  typename PermuteDLayout = layout::NoPermute
+>
+struct DefaultEpilogueWmmaTensorOp {
+
+  using Shape = Shape_;
+  using WarpMmaTensorOp = WarpMmaTensorOp_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputOp = OutputOp_;
+  static int const kElementsPerAccess = ElementsPerAccess;
+
+  using ElementOutput = typename OutputOp::ElementOutput;       // output element type comes from the output functor
+  using LayoutC = typename WarpMmaTensorOp::LayoutC;
+  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;  // accumulator element type comes from the warp-level MMA
+
+  //
+  // Thread map: built from the threadblock shape, warp shape, and the WMMA operator shape (Policy::Operator::Shape)
+  //
+
+  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapWmmaTensorOp<
+    Shape,
+    typename WarpMmaTensorOp::Shape,
+    typename WarpMmaTensorOp::Policy::Operator::Shape,
+    kPartitionsK,
+    ElementOutput,
+    kElementsPerAccess
+  >::Type;
+
+  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
+    OutputTileThreadMap,
+    ElementOutput,
+    ScatterD,
+    PermuteDLayout
+  >;
+
+  using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorWmmaTensorOp<
+    typename WarpMmaTensorOp::Shape,
+    typename WarpMmaTensorOp::Policy::Operator::Shape,
+    typename WarpMmaTensorOp::Policy::Operator::ElementC,
+    typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+    LayoutC
+  >;
+
+  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorWmmaTensorOp<
+    typename WarpMmaTensorOp::Shape,
+    typename WarpMmaTensorOp::Policy::Operator::Shape,
+    typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+    LayoutC
+  >;
+
+  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
+    typename OutputTileThreadMap::CompactedThreadMap,
+    ElementAccumulator
+  >;
+
+  /// Padding elements, taken directly from WarpTileIterator::Padding
+  using Padding = typename WarpTileIterator::Padding;
+
+  //
+  // Define the epilogue from the thread-map-driven iterators and padding declared above
+  //
+  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
+    Shape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    OutputTileIterator,
+    AccumulatorFragmentIterator,
+    WarpTileIterator,
+    SharedLoadIterator,
+    OutputOp,
+    Padding
+  >;
+};
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_simt.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_simt.h
new file mode 100755
index 000000000..2092caf4d
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_simt.h
@@ -0,0 +1,127 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Defines DefaultThreadMapSimt, the default epilogue thread map for SIMT accumulator layouts.
+
+*/
+
+#pragma once
+
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
+#include "cutlass/gemm/gemm.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Default epilogue output-tile thread map for SIMT accumulator layouts.
+///
+/// `Type` is the resulting OutputTileOptimalThreadMap; `Detail` holds the
+/// compile-time constants (warp count, thread count, iteration counts) that
+/// parameterize it.
+template <
+  typename ThreadblockShape_,   ///< threadblock tile shape; kM and kN are read
+  typename WarpShape_,          ///< warp tile shape; kM and kN are read
+  typename MmaSimtPolicy_,      ///< supplies WarpShape::kRow and LaneMmaShape::kM
+  int PartitionsK_,             ///< third extent of Detail::WarpCount
+  typename Element_,            ///< element type; only sizeof_bits<Element> is used
+  int ElementsPerAccess_        ///< forwarded unchanged to the thread map
+>
+struct DefaultThreadMapSimt {
+
+  // Template arguments re-exported as member types...
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using MmaSimtPolicy = MmaSimtPolicy_;
+  using Element = Element_;
+
+  // ...and as member constants.
+  static constexpr int kPartitionsK = PartitionsK_;
+  static constexpr int kElementsPerAccess = ElementsPerAccess_;
+
+  // Compile-time helper quantities from which the thread map is assembled.
+  struct Detail {
+
+    static constexpr int kWarpSize = 32;
+
+    static_assert(
+      (ThreadblockShape::kM % WarpShape::kM == 0) &&
+      (ThreadblockShape::kN % WarpShape::kN == 0), "Divisibility");
+
+    /// Warp counts along M and N, with kPartitionsK as the K extent
+    using WarpCount = gemm::GemmShape<
+      ThreadblockShape::kM / WarpShape::kM,
+      ThreadblockShape::kN / WarpShape::kN,
+      kPartitionsK
+    >;
+
+    /// Number of thread-level matrix multiplies needed to span a warp
+    static constexpr int kGroupCount =
+      WarpShape::kM / (MmaSimtPolicy::WarpShape::kRow * MmaSimtPolicy::LaneMmaShape::kM);
+
+    /// Number of participating threads (kWarpSize per counted warp)
+    static constexpr int kThreads = kWarpSize * WarpCount::kCount;
+
+    /// Number of iterations
+    static constexpr int kIterations = kGroupCount * MmaSimtPolicy::LaneMmaShape::kM;
+  };
+
+  /// Thread map intended for epilogue::PredicatedTileIterator (OutputTileThreadMap concept).
+  using Type = OutputTileOptimalThreadMap<
+    OutputTileShape<                          // Shape
+      ThreadblockShape::kN,                   //   column
+      1,                                      //   row
+      MmaSimtPolicy::WarpShape::kRow,         //   group
+      Detail::WarpCount::kM,                  //   cluster
+      1>,                                     //   tile
+    OutputTileShape<                          // Count
+      1,                                      //   column
+      MmaSimtPolicy::LaneMmaShape::kM,        //   row
+      Detail::kGroupCount,                    //   group
+      1,                                      //   cluster
+      Detail::kIterations>,                   //   tile (iterations)
+    Detail::kThreads,
+    kElementsPerAccess,
+    sizeof_bits<Element>::value
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h
new file mode 100755
index 000000000..e39ca9d53
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h
@@ -0,0 +1,208 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
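+  \brief Defines the default epilogue thread maps for TensorOp accumulator layouts, including the interleaved variants.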
+
+*/
+
+#pragma once
+
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/layout/pitch_linear.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Default epilogue output-tile thread map for TensorOp accumulator layouts.
+///
+/// `Type` is the resulting OutputTileOptimalThreadMap; `Detail` holds the
+/// compile-time constants that parameterize it.
+template <
+  typename ThreadblockShape_,   ///< threadblock tile shape; kM and kN are read
+  typename WarpShape_,          ///< warp tile shape; kM and kN are read
+  int PartitionsK_,             ///< third extent of Detail::WarpCount
+  typename Element_,            ///< element type; only sizeof_bits<Element> is used
+  int ElementsPerAccess_        ///< forwarded unchanged to the thread map
+>
+struct DefaultThreadMapTensorOp {
+
+  // Template arguments re-exported as member types...
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using Element = Element_;
+
+  // ...and as member constants.
+  static constexpr int kPartitionsK = PartitionsK_;
+  static constexpr int kElementsPerAccess = ElementsPerAccess_;
+
+  // Compile-time helper quantities from which the thread map is assembled.
+  struct Detail {
+
+    /// Tensor operations fundamentally work on 8 rows at a time
+    static constexpr int kTensorOpRows = 8;
+    static constexpr int kWarpSize = 32;
+
+    static_assert(
+      (ThreadblockShape::kM % WarpShape::kM == 0) &&
+      (ThreadblockShape::kN % WarpShape::kN == 0), "Divisibility");
+
+    /// Warp counts along M and N, with kPartitionsK as the K extent
+    using WarpCount = gemm::GemmShape<
+      ThreadblockShape::kM / WarpShape::kM,
+      ThreadblockShape::kN / WarpShape::kN,
+      kPartitionsK
+    >;
+
+    /// Number of participating threads (kWarpSize per counted warp)
+    static constexpr int kThreads = kWarpSize * WarpCount::kCount;
+  };
+
+  /// Thread map intended for epilogue::PredicatedTileIterator (OutputTileThreadMap concept).
+  /// The first OutputTileShape is the tile shape, the second the per-dimension iteration counts.
+  using Type = OutputTileOptimalThreadMap<
+    OutputTileShape<ThreadblockShape::kN, Detail::kTensorOpRows, Detail::WarpCount::kM, 1, 1>,
+    OutputTileShape<1, WarpShape::kM / Detail::kTensorOpRows, 1, 1, WarpShape::kM / Detail::kTensorOpRows>,
+    Detail::kThreads,
+    kElementsPerAccess,
+    sizeof_bits<Element>::value
+  >;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Default interleaved epilogue thread map for TensorOp accumulator layouts.
+///
+/// `Type` is the resulting InterleavedOutputTileThreadMap; `Detail` holds the
+/// compile-time constants that parameterize it. The interleave factor divides
+/// WarpShape::kN in the second shape argument of `Type`.
+template <typename ThreadblockShape_, typename WarpShape_, int PartitionsK_,
+          typename Element_, int ElementsPerAccess_, int InterleavedK_>
+struct DefaultInterleavedThreadMapTensorOp {
+  // Template arguments re-exported as member types...
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using Element = Element_;
+
+  // ...and as member constants.
+  static constexpr int kPartitionsK = PartitionsK_;
+  static constexpr int kElementsPerAccess = ElementsPerAccess_;
+  static constexpr int kInterleavedK = InterleavedK_;
+
+  // Compile-time helper quantities from which the thread map is assembled.
+  struct Detail {
+    /// Tensor operations fundamentally work on 8 rows at a time
+    static constexpr int kTensorOpRows = 8;
+    static constexpr int kWarpSize = 32;
+
+    static_assert((ThreadblockShape::kM % WarpShape::kM == 0) &&
+                      (ThreadblockShape::kN % WarpShape::kN == 0),
+                  "Divisibility");
+
+    /// Warp counts along M and N, with kPartitionsK as the K extent
+    using WarpCount =
+        gemm::GemmShape<ThreadblockShape::kM / WarpShape::kM,
+                        ThreadblockShape::kN / WarpShape::kN, kPartitionsK>;
+
+    /// Number of participating threads (kWarpSize per counted warp)
+    static constexpr int kThreads = kWarpSize * WarpCount::kCount;
+  };
+
+  /// Thread map intended for epilogue::PredicatedTileIterator
+  /// (InterleavedOutputTileThreadMap concept).
+  using Type = InterleavedOutputTileThreadMap<
+      layout::PitchLinearShape<Detail::WarpCount::kM, Detail::WarpCount::kN>,
+      layout::PitchLinearShape<WarpShape::kM / Detail::kTensorOpRows,
+                               WarpShape::kN / kInterleavedK>,
+      Detail::kThreads, kElementsPerAccess, sizeof_bits<Element>::value>;
+};
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Conv variant of the default interleaved epilogue thread map for TensorOp accumulators.
+///
+/// `Type` is the resulting InterleavedConvOutputTileThreadMap, built from
+/// MatrixShape extents; `Detail` holds the compile-time constants behind it.
+/// The interleave factor divides WarpShape::kN in the second shape argument.
+template <typename ThreadblockShape_, typename WarpShape_, int PartitionsK_,
+          typename Element_, int ElementsPerAccess_, int InterleavedK_>
+struct DefaultInterleavedConvThreadMapTensorOp {
+  // Template arguments re-exported as member types...
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using Element = Element_;
+
+  // ...and as member constants.
+  static constexpr int kPartitionsK = PartitionsK_;
+  static constexpr int kElementsPerAccess = ElementsPerAccess_;
+  static constexpr int kInterleavedK = InterleavedK_;
+
+  // Compile-time helper quantities from which the thread map is assembled.
+  struct Detail {
+    /// Tensor operations fundamentally work on 8 rows at a time
+    static constexpr int kTensorOpRows = 8;
+    static constexpr int kWarpSize = 32;
+
+    static_assert((ThreadblockShape::kM % WarpShape::kM == 0) &&
+                      (ThreadblockShape::kN % WarpShape::kN == 0),
+                  "Divisibility");
+
+    /// Warp counts along M and N, with kPartitionsK as the K extent
+    using WarpCount =
+        gemm::GemmShape<ThreadblockShape::kM / WarpShape::kM,
+                        ThreadblockShape::kN / WarpShape::kN, kPartitionsK>;
+
+    /// Number of participating threads (kWarpSize per counted warp)
+    static constexpr int kThreads = kWarpSize * WarpCount::kCount;
+  };
+
+  /// Thread map intended for epilogue::MaskedTileIterator
+  /// (InterleavedOutputTileThreadMap concept).
+  using Type = InterleavedConvOutputTileThreadMap<
+      MatrixShape<Detail::WarpCount::kM, Detail::WarpCount::kN>,
+      MatrixShape<WarpShape::kM / Detail::kTensorOpRows,
+                  WarpShape::kN / kInterleavedK>,
+      Detail::kThreads, kElementsPerAccess, sizeof_bits<Element>::value>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_volta_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_volta_tensor_op.h
new file mode 100755
index 000000000..1eac4a183
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_volta_tensor_op.h
@@ -0,0 +1,228 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
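+  \brief Defines the default epilogue thread map for Volta TensorOp accumulator layouts (half_t and float accumulators).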
+
+*/
+
+#pragma once
+
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
+#include "cutlass/gemm/gemm.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Primary template of the Volta TensorOp epilogue thread map.
+///
+/// Declared only; the definitions in this file are partial specializations
+/// selected by the accumulator type (last parameter): half_t and float.
+template <
+  typename ThreadblockShape_, typename WarpShape_, int PartitionsK_,
+  typename ElementOutput_, int ElementsPerAccess_,
+  typename ElementAccumulator_
+>
+struct DefaultThreadMapVoltaTensorOp;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Volta TensorOp epilogue thread map, specialized for half_t accumulators.
+///
+/// `Type` is the resulting OutputTileOptimalThreadMap; `Detail` holds the tile
+/// shape, iteration counts and thread count that parameterize it.
+template <
+  typename ThreadblockShape_,   ///< threadblock tile shape; kM and kN are read
+  typename WarpShape_,          ///< warp tile shape; kM and kN are read
+  int PartitionsK_,             ///< third extent of Detail::WarpCount
+  typename ElementOutput_,      ///< output element type; only sizeof_bits<> is used
+  int ElementsPerAccess_        ///< forwarded unchanged to the thread map
+>
+struct DefaultThreadMapVoltaTensorOp<
+  ThreadblockShape_,
+  WarpShape_,
+  PartitionsK_,
+  ElementOutput_,
+  ElementsPerAccess_,
+  half_t> {
+  // Template arguments (and the fixed accumulator type) re-exported as member types...
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using ElementOutput = ElementOutput_;
+  using ElementAccumulator = half_t;
+
+  // ...and as member constants.
+  static constexpr int kPartitionsK = PartitionsK_;
+  static constexpr int kElementsPerAccess = ElementsPerAccess_;
+
+  // Compile-time helper quantities from which the thread map is assembled.
+  struct Detail {
+    /// Row granularity used to derive the iteration count in Count
+    static constexpr int kTensorOpRows = 16;
+    static constexpr int kWarpSize = 32;
+    /// Number of whole 32-row tiles in the warp tile's M extent
+    static constexpr int kInterleavedTilesM = WarpShape::kM / 32;
+
+    static_assert(
+      (ThreadblockShape::kM % WarpShape::kM == 0) &&
+      (ThreadblockShape::kN % WarpShape::kN == 0), "Divisibility");
+
+    /// Warp counts along M and N, with kPartitionsK as the K extent
+    using WarpCount = gemm::GemmShape<
+      ThreadblockShape::kM / WarpShape::kM,
+      ThreadblockShape::kN / WarpShape::kN,
+      kPartitionsK
+    >;
+
+    /// Number of participating threads (kWarpSize per counted warp)
+    static constexpr int kThreads = kWarpSize * WarpCount::kCount;
+
+    /// Output tile extents per subspace
+    using Shape = cutlass::epilogue::threadblock::OutputTileShape<
+      ThreadblockShape::kN,             // column
+      4,                                // row
+      4,                                // group
+      WarpCount::kM,                    // cluster
+      1                                 // tile
+    >;
+
+    /// Iteration counts per subspace
+    using Count = cutlass::epilogue::threadblock::OutputTileShape<
+      1,                                // column
+      2,                                // row
+      kInterleavedTilesM,               // group
+      1,                                // cluster
+      WarpShape::kM / kTensorOpRows     // iterations
+    >;
+  };
+
+  /// Thread map intended for epilogue::PredicatedTileIterator (OutputTileThreadMap concept).
+  using Type = OutputTileOptimalThreadMap<
+    typename Detail::Shape,
+    typename Detail::Count,
+    Detail::kThreads,
+    kElementsPerAccess,
+    sizeof_bits<ElementOutput>::value
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Volta TensorOp epilogue thread map, specialized for float accumulators.
+///
+/// `Type` is the resulting OutputTileOptimalThreadMap; `Detail` holds the tile
+/// shape, iteration counts and thread count that parameterize it.
+template <
+  typename ThreadblockShape_,   ///< threadblock tile shape; kM and kN are read
+  typename WarpShape_,          ///< warp tile shape; kM and kN are read
+  int PartitionsK_,             ///< third extent of Detail::WarpCount
+  typename ElementOutput_,      ///< output element type; only sizeof_bits<> is used
+  int ElementsPerAccess_        ///< forwarded unchanged to the thread map
+>
+struct DefaultThreadMapVoltaTensorOp<
+  ThreadblockShape_,
+  WarpShape_,
+  PartitionsK_,
+  ElementOutput_,
+  ElementsPerAccess_,
+  float> {
+  // Template arguments (and the fixed accumulator type) re-exported as member types...
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using ElementOutput = ElementOutput_;
+  using ElementAccumulator = float;
+
+  // ...and as member constants.
+  static constexpr int kPartitionsK = PartitionsK_;
+  static constexpr int kElementsPerAccess = ElementsPerAccess_;
+
+  // Compile-time helper quantities from which the thread map is assembled.
+  struct Detail {
+    /// Row granularity used to derive the iteration count in Count
+    static constexpr int kTensorOpRows = 16;
+    static constexpr int kWarpSize = 32;
+    /// Number of whole 32-row tiles in the warp tile's M extent
+    static constexpr int kInterleavedTilesM = WarpShape::kM / 32;
+
+    static_assert(
+      (ThreadblockShape::kM % WarpShape::kM == 0) &&
+      (ThreadblockShape::kN % WarpShape::kN == 0), "Divisibility");
+
+    /// Warp counts along M and N, with kPartitionsK as the K extent
+    using WarpCount = gemm::GemmShape<
+      ThreadblockShape::kM / WarpShape::kM,
+      ThreadblockShape::kN / WarpShape::kN,
+      kPartitionsK
+    >;
+
+    /// Number of participating threads (kWarpSize per counted warp)
+    static constexpr int kThreads = kWarpSize * WarpCount::kCount;
+
+    /// Output tile extents per subspace
+    using Shape = cutlass::epilogue::threadblock::OutputTileShape<
+      ThreadblockShape::kN,             // column
+      4,                                // row
+      4,                                // group
+      WarpCount::kM,                    // cluster
+      1                                 // tile
+    >;
+
+    /// Iteration counts per subspace
+    using Count = cutlass::epilogue::threadblock::OutputTileShape<
+      1,                                // column
+      2,                                // row
+      kInterleavedTilesM,               // group
+      1,                                // cluster
+      WarpShape::kM / kTensorOpRows     // iterations
+    >;
+  };
+
+  /// Thread map intended for epilogue::PredicatedTileIterator (OutputTileThreadMap concept).
+  using Type = OutputTileOptimalThreadMap<
+    typename Detail::Shape,
+    typename Detail::Count,
+    Detail::kThreads,
+    kElementsPerAccess,
+    sizeof_bits<ElementOutput>::value
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h
new file mode 100755
index 000000000..0dccf6525
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h
@@ -0,0 +1,113 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
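+  \brief Defines DefaultThreadMapWmmaTensorOp, the default epilogue thread map for WMMA TensorOp accumulator layouts.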
+
+*/
+
+#pragma once
+
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/layout/pitch_linear.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Default epilogue output-tile thread map for WMMA TensorOp accumulator layouts.
+///
+/// `Type` is the resulting OutputTileOptimalThreadMap; `Detail` holds the
+/// compile-time constants that parameterize it.
+template <
+  typename ThreadblockShape_,   ///< threadblock tile shape; kM and kN are read
+  typename WarpShape_,          ///< warp tile shape; kM and kN are read
+  typename InstructionShape_,   ///< instruction shape; kM sets the row granularity
+  int PartitionsK_,             ///< third extent of Detail::WarpCount
+  typename Element_,            ///< element type; only sizeof_bits<Element> is used
+  int ElementsPerAccess_        ///< forwarded unchanged to the thread map
+>
+struct DefaultThreadMapWmmaTensorOp {
+
+  // Template arguments re-exported as member types...
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using Element = Element_;
+
+  // ...and as member constants.
+  static constexpr int kPartitionsK = PartitionsK_;
+  static constexpr int kElementsPerAccess = ElementsPerAccess_;
+
+  // Compile-time helper quantities from which the thread map is assembled.
+  struct Detail {
+
+    /// WMMA tensor operations fundamentally work on InstructionShape::kM rows at a time
+    static constexpr int kTensorOpRows = InstructionShape::kM;
+    static constexpr int kWarpSize = 32;
+
+    static_assert(
+      (ThreadblockShape::kM % WarpShape::kM == 0) &&
+      (ThreadblockShape::kN % WarpShape::kN == 0), "Divisibility");
+
+    /// Warp counts along M and N, with kPartitionsK as the K extent
+    using WarpCount = gemm::GemmShape<
+      ThreadblockShape::kM / WarpShape::kM,
+      ThreadblockShape::kN / WarpShape::kN,
+      kPartitionsK
+    >;
+
+    /// Number of participating threads (kWarpSize per counted warp)
+    static constexpr int kThreads = kWarpSize * WarpCount::kCount;
+  };
+
+  /// Thread map intended for epilogue::PredicatedTileIterator (OutputTileThreadMap concept).
+  /// The first OutputTileShape is the tile shape, the second the per-dimension iteration counts.
+  using Type = OutputTileOptimalThreadMap<
+    OutputTileShape<ThreadblockShape::kN, Detail::kTensorOpRows, Detail::WarpCount::kM, 1, 1>,
+    OutputTileShape<1, WarpShape::kM / Detail::kTensorOpRows, 1, 1, WarpShape::kM / Detail::kTensorOpRows>,
+    Detail::kThreads,
+    kElementsPerAccess,
+    sizeof_bits<Element>::value
+  >;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/direct_store_epilogue_iterator.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/direct_store_epilogue_iterator.h
new file mode 100755
index 000000000..11f89b658
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/direct_store_epilogue_iterator.h
@@ -0,0 +1,142 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
+
+  The epilogue rearranges the result of a matrix product through shared memory to match canonical
+  tensor layouts in global memory. Epilogues support conversion and reduction operations.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/transform/pitch_linear_thread_map.h"
+#include "cutlass/epilogue/threadblock/output_tile_thread_map.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator_params.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace epilogue {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <typename Element_>
+class DirectStoreEpilogueIterator {
+public:
+  // State holder: output pointer, row stride, extent, thread index and threadblock offset (no methods besides the constructor).
+  using Element = Element_;
+
+  using Layout = layout::RowMajor;
+  using TensorRef = TensorRef<Element, Layout>;
+  using ConstTensorRef = typename TensorRef::ConstTensorRef;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorCoord = MatrixCoord;
+
+  static int const kElementsPerAccess = 1;
+
+  /// Parameters object: derives from the non-template PredicatedTileIteratorParams and adds no data members
+  struct Params : PredicatedTileIteratorParams {
+    using Base = PredicatedTileIteratorParams;
+    /// Default constructor; leaves all initialization to the base class
+    CUTLASS_HOST_DEVICE
+    Params() { }
+    /// Builds from a row-major layout: stride is layout.stride(0) scaled by sizeof(Element)
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout) {
+      stride = layout.stride(0) * sizeof(Element);
+    }
+    /// Copies an existing base parameters object
+    CUTLASS_HOST_DEVICE
+    Params(Base const &base) : 
+      Base(base) { }
+  };
+
+public:
+
+  //
+  // Data members (all public; each is set by the constructor below)
+  //
+
+  Element *pointer;     // pointer to the output matrix
+
+  LongIndex stride;     // stride in elements between rows (params.stride / sizeof(Element))
+
+  TensorCoord extent;   // extent of output matrix
+
+  int thread_idx;       // thread index
+
+  TensorCoord threadblock_offset;   // threadblock offset as supplied to the constructor
+
+public:
+
+  /// Constructor: copies the arguments into the members, dividing params.stride by sizeof(Element); `indices` is accepted but not used
+  CUTLASS_DEVICE
+  DirectStoreEpilogueIterator(
+    PredicatedTileIteratorParams const & params,
+    Element *pointer_,
+    TensorCoord extent_,
+    int thread_idx_,
+    TensorCoord threadblock_offset_ = TensorCoord(),
+    int const * indices = nullptr
+  ): 
+    pointer(pointer_),
+    stride(params.stride / sizeof(Element)),
+    extent(extent_),
+    thread_idx(thread_idx_),
+    threadblock_offset(threadblock_offset_)
+  {
+    // nothing to do: every member is set in the initializer list
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue.h
new file mode 100755
index 000000000..48b66a144
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue.h
@@ -0,0 +1,543 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
+
+  The epilogue rearranges the result of a matrix product through shared memory to match canonical
+  tensor layouts in global memory. Epilogues support conversion and reduction operations.
+
+  The shared memory resource is time-sliced across warps.
+*/
+
+#pragma once
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/cassert>
+#else
+#include <assert.h>
+#endif
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/layout/vector.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/tensor_coord.h"
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/functional.h"
+
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/transform/pitch_linear_thread_map.h"
+#include "cutlass/transform/threadblock/regular_tile_iterator.h"
+
+#include "cutlass/epilogue/threadblock/epilogue_base.h"
+#include "cutlass/epilogue/threadblock/epilogue_base_streamk.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Threadblock-scoped epilogue: stages accumulators via SMEM, applies the output operator, and stores through the destination iterator
+template <
+  typename Shape_,                          ///< Shape of threadblock tile (concept: GemmShape)
+  typename WarpMmaOperator_,                ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp)
+  int PartitionsK,                          ///< Number of partitions of the K dimension
+  typename OutputTileIterator_,             ///< Tile iterator reading and writing output tensors
+  typename AccumulatorFragmentIterator_,    ///< Fragment iterator selecting accumulators
+  typename WarpTileIterator_,               ///< Warp-scoped tile iterator writing accumulators to SMEM
+  typename SharedLoadIterator_,             ///< Threadblock-scoped tile iterator loading from SMEM
+  typename OutputOp_,                       ///< Output operator
+  typename Padding_,                        ///< Padding added to SMEM allocation to avoid bank conflicts (concept: MatrixShape)
+  int FragmentsPerPartition = 1,            ///< Used to coarsen the epilogue granularity
+  int IterationsUnroll =                    ///< Used to reduce binary size when epilogue op is large
+    (!IsEpilogueFunctorHeavy<OutputOp_>::value)
+>
+class Epilogue :
+  public EpilogueBase<
+    Shape_,
+    typename WarpMmaOperator_::Shape,
+    PartitionsK,
+    AccumulatorFragmentIterator_,
+    WarpTileIterator_,
+    Padding_,
+    FragmentsPerPartition>,
+  public EpilogueBaseStreamK<
+    Shape_,
+    PartitionsK,
+    WarpMmaOperator_,
+    AccumulatorFragmentIterator_>
+{
+
+public:
+
+  using Base = EpilogueBase<
+    Shape_,
+    typename WarpMmaOperator_::Shape,
+    PartitionsK,
+    AccumulatorFragmentIterator_,
+    WarpTileIterator_,
+    Padding_,
+    FragmentsPerPartition>;
+
+  using BaseStreamK = EpilogueBaseStreamK<
+    Shape_,
+    PartitionsK,
+    WarpMmaOperator_,
+    AccumulatorFragmentIterator_>;
+
+  using Shape = Shape_;
+  using WarpMmaOperator = WarpMmaOperator_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputTileIterator = OutputTileIterator_;
+  using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
+  using WarpTileIterator = WarpTileIterator_;
+  using SharedLoadIterator = SharedLoadIterator_;
+  using OutputOp = OutputOp_;
+  using Padding = Padding_;
+  using Layout = layout::RowMajor;
+  using LongIndex = typename Layout::LongIndex;
+
+  /// Number of warps per block
+  using WarpCount = typename Base::WarpCount;
+
+  /// Number of threads per block
+  static int const kBlockThreads = 32 * WarpCount::kCount;
+
+  /// Per-thread accumulator tile type
+  using AccumulatorTile = typename Base::AccumulatorTile;
+
+  /// Numerical accumulation element type
+  using ElementAccumulator = typename WarpMmaOperator::ElementC;
+
+  /// Fragment type used by the accumulator tile's fragment iterator
+  using AccumulatorFragment = typename AccumulatorFragmentIterator::Fragment;
+
+  /// Output element
+  using ElementOutput = typename OutputTileIterator::Element;
+
+  /// Output access size
+  static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
+
+  /// Tensor reference to destination tensor
+  using TensorRef = typename OutputTileIterator::TensorRef;
+
+  /// Tensor reference to sync tensor
+  using SyncTensorRef = typename cutlass::TensorRef<int, cutlass::layout::PackedVectorLayout>;
+
+  /// Const tensor reference to source tensor
+  using ConstTensorRef = typename OutputTileIterator::ConstTensorRef;
+
+  /// Vector type used by the global output iterator
+  using OutputAccessType = Array<
+    typename OutputTileIterator::Element, OutputTileIterator::kElementsPerAccess>;
+
+  /// Vector type used by the shared output iterator
+  using AccumulatorAccessType = Array<typename WarpTileIterator::Element, OutputTileIterator::kElementsPerAccess>;
+
+  static int constexpr kSmemTiles = Base::kFragmentsPerIteration > 1 ? Base::kFragmentsPerIteration : kPartitionsK;
+
+  static int constexpr kSmemPointerOffset = Base::SharedStorage::StorageShape::kCount / kSmemTiles;
+
+
+public:
+
+  static_assert(SharedLoadIterator::Fragment::kElements == OutputTileIterator::Fragment::kElements,
+    "Mismatch between shared load iterator and output tile iterator.");
+
+  static_assert(OutputTileIterator::kElementsPerAccess, "OutputTileIterator::kElementsPerAccess must not be zero.");
+
+  static_assert(!(OutputTileIterator::Fragment::kElements % OutputTileIterator::kElementsPerAccess), 
+    "Divisibility");
+
+  static_assert(kPartitionsK == 1 || Base::kFragmentsPerIteration == 1, "One of these must be exactly 1.");
+
+
+public:
+
+  /// Aspect for when epilogue source is not needed
+  struct SourceAspectNotNeeded
+  {
+    /// Constructor
+    CUTLASS_DEVICE
+    SourceAspectNotNeeded()
+    {}
+
+    /// No-op: this aspect has no source operand to load
+    CUTLASS_DEVICE
+    void load() { }
+
+    /// Invoke the output functor over each vector of output (accumulators only, no source operand)
+    CUTLASS_DEVICE
+    void apply_output_operator(
+      typename OutputTileIterator::Fragment &output_fragment,
+      OutputOp const &output_op,
+      typename SharedLoadIterator::Fragment const &aligned_accum_fragment)
+    {
+      OutputAccessType *output_frag_ptr =
+        reinterpret_cast<OutputAccessType *>(&output_fragment);
+
+      AccumulatorAccessType const *compute_frag_ptr =
+        reinterpret_cast<AccumulatorAccessType const *>(&aligned_accum_fragment);
+
+      int const kOutputOpIterations =
+        OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kOutputOpIterations; ++i)
+      {
+        // Call the output operator
+        output_frag_ptr[i] = output_op(compute_frag_ptr[i]);
+      }
+    }
+  };
+
+
+  /// Aspect for when epilogue source is needed
+  struct SourceAspectNeeded
+  {
+    OutputTileIterator source_iterator;                       ///< Iterator over the addend source
+
+    typename OutputTileIterator::Fragment source_fragment;    ///< Fragment filled by the most recent load()
+
+    /// Invoke the output functor over each vector of output, pairing accumulators with an explicit source fragment
+    CUTLASS_DEVICE
+    static void apply_output_operator(
+      typename OutputTileIterator::Fragment &output_fragment,
+      OutputOp const &output_op,
+      typename SharedLoadIterator::Fragment const &aligned_accum_fragment,
+      typename OutputTileIterator::Fragment const &source_fragment)
+    {
+      OutputAccessType *output_frag_ptr =
+        reinterpret_cast<OutputAccessType *>(&output_fragment);
+
+      AccumulatorAccessType const *compute_frag_ptr =
+        reinterpret_cast<AccumulatorAccessType const *>(&aligned_accum_fragment);
+
+      OutputAccessType const *source_frag_ptr =
+        reinterpret_cast<OutputAccessType const *>(&source_fragment);
+
+      int const kOutputOpIterations =
+        OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kOutputOpIterations; ++i)
+      {
+        // Call the output operator
+        output_frag_ptr[i] = output_op(compute_frag_ptr[i], source_frag_ptr[i]);
+      }
+    }
+
+    /// Constructor: copies the source iterator and zero-initializes the source fragment
+    CUTLASS_DEVICE
+    SourceAspectNeeded(OutputTileIterator source_iterator) :
+      source_iterator(source_iterator)
+    {
+      source_fragment.clear();
+    }
+
+    // Load addend source fragment from global memory, then advance the source iterator
+    CUTLASS_DEVICE
+    void load() {
+      source_iterator.load(source_fragment);
+      ++source_iterator;
+    }
+
+    /// Invoke the output functor over each vector of output, using the member source_fragment
+    CUTLASS_DEVICE
+    void apply_output_operator(
+      typename OutputTileIterator::Fragment &output_fragment,
+      OutputOp const &output_op,
+      typename SharedLoadIterator::Fragment const &aligned_accum_fragment)
+    {
+      apply_output_operator(output_fragment, output_op, aligned_accum_fragment, source_fragment);
+    }
+  };
+
+
+private:
+
+  /// Loads fragment from shared memory aligned with output tensor
+  SharedLoadIterator shared_load_iterator_;
+
+  /// Thread index in the threadblock
+  int thread_idx;
+
+  /// Warp index in the threadblock
+  int warp_idx;
+
+public:
+
+  /// Constructor
+  CUTLASS_DEVICE
+  Epilogue(
+      typename Base::SharedStorage &shared_storage,   ///< Shared storage object
+      int thread_idx,                                 ///< ID of a thread within the threadblock
+      int warp_idx,                                   ///< ID of warp within threadblock
+      int lane_idx)                                   ///< Id of thread within warp
+  :
+      Base(shared_storage, thread_idx, warp_idx, lane_idx),
+      BaseStreamK(thread_idx),
+      shared_load_iterator_(shared_storage.reference(), thread_idx),
+      thread_idx(thread_idx),
+      warp_idx(warp_idx)
+  {}
+
+
+  /// Aggregates the accumulator sets shared by peer blocks in the global workspace,
+  /// performing epilogue computations, writing to output
+  CUTLASS_DEVICE
+  void reduce(
+      int peer_idx_begin,                             ///< Forwarded to BaseStreamK::reduce
+      int peer_idx_end,                               ///< Forwarded to BaseStreamK::reduce
+      int reduce_fragment_idx,                        ///< Forwarded to BaseStreamK::reduce; also added to both tile iterators
+      void *element_workspace,                        ///< Forwarded to BaseStreamK::reduce
+      OutputOp const &output_op,                      ///< Output operator
+      OutputTileIterator destination_iterator,        ///< Tile iterator for destination
+      OutputTileIterator source_iterator)             ///< Tile iterator for addend source
+  {
+    // Reduce peer accumulator fragments into one fragment
+    AccumulatorFragment accum_fragment;
+    BaseStreamK::reduce(accum_fragment, peer_idx_begin, peer_idx_end, reduce_fragment_idx, element_workspace);
+
+    // Store fragment to shared memory
+    this->warp_tile_iterator_.store(accum_fragment);
+
+    __syncthreads();
+
+    // Initialize/load source-fragment data
+    typename OutputTileIterator::Fragment source_fragment;
+    source_fragment.clear();
+
+    if (output_op.is_source_needed())
+    {
+      source_iterator += reduce_fragment_idx;
+      source_iterator.load(source_fragment);
+    }
+
+    // Load fragment from shared memory
+    typename SharedLoadIterator::Fragment aligned_accum_fragment;
+    shared_load_iterator_.load(aligned_accum_fragment);
+
+    // Add fragments shared by other k partitions. NOTE(review): pointer is not rewound here (operator() rewinds) - confirm single call per instance
+    if (kPartitionsK > 1)
+    {
+      plus <typename SharedLoadIterator::Fragment> add_fragments;
+
+      CUTLASS_PRAGMA_UNROLL
+      for ( int i = 1; i < kPartitionsK; ++i) {
+        typename SharedLoadIterator::Fragment aligned_addend_fragment;
+        shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
+        shared_load_iterator_.load(aligned_addend_fragment);
+        aligned_accum_fragment = add_fragments(aligned_accum_fragment, aligned_addend_fragment);
+      }
+    }
+
+    // Compute the output result
+    typename OutputTileIterator::Fragment output_fragment;
+
+    // Apply the output operator (source_fragment stays zero-cleared when the source is not needed)
+    SourceAspectNeeded::apply_output_operator(
+        output_fragment,
+        output_op,
+        aligned_accum_fragment,
+        source_fragment);
+
+    // Store the final result
+    destination_iterator += reduce_fragment_idx;
+    destination_iterator.store(output_fragment);
+  }
+
+
+  /// Perform the epilogue computations and stream the result to global memory (no addend source).
+  CUTLASS_DEVICE
+  void operator()(
+    OutputOp const &output_op,                      ///< Output operator
+    OutputTileIterator destination_iterator,        ///< Tile iterator for destination
+    AccumulatorTile const &accumulators)            ///< Complete warp-level accumulator tile
+  {
+    operator()(output_op, destination_iterator, accumulators, SourceAspectNotNeeded());
+  }
+
+
+  /// Perform the epilogue computations and stream the result to global memory.  Implements
+  /// two alternative codepaths, depending on whether the output op requires addend data to be loaded.
+  CUTLASS_DEVICE
+  void operator()(
+    OutputOp const &output_op,                      ///< Output operator
+    OutputTileIterator destination_iterator,        ///< Tile iterator for destination
+    AccumulatorTile const &accumulators,            ///< Complete warp-level accumulator tile
+    OutputTileIterator source_iterator )            ///< Tile iterator for addend source
+  {
+    if (output_op.is_source_needed())
+    {
+      operator()(output_op, destination_iterator, accumulators, SourceAspectNeeded(source_iterator));
+    }
+    else
+    {
+      operator()(output_op, destination_iterator, accumulators, SourceAspectNotNeeded());
+    }
+  }
+
+
+  /// Perform the epilogue computations and stream the result to global memory.  Implements a
+  /// single codepath, regardless of whether the output op requires addend data to be loaded
+  CUTLASS_DEVICE
+  void unified(
+    OutputOp const &output_op,                      ///< Output operator
+    OutputTileIterator destination_iterator,        ///< Tile iterator for destination
+    AccumulatorTile const &accumulators,            ///< Complete warp-level accumulator tile
+    OutputTileIterator source_iterator )            ///< Tile iterator for addend source
+  {
+    if (!output_op.is_source_needed())
+    {
+      source_iterator.clear_mask();
+      __syncthreads();  // Dummy (CUDA 11.0)
+    }
+
+    operator()(output_op, destination_iterator, accumulators, SourceAspectNeeded(source_iterator));
+  }
+
+  template<class Seq>
+  struct acc2smem;  // Dispatches a run-time iteration index to the matching compile-time helper<Seq>
+
+  template <size_t... Seq>
+  struct acc2smem<cutlass::index_sequence<Seq...>> {
+    template<int Advance>
+    CUTLASS_DEVICE
+    static void helper(AccumulatorFragmentIterator accum_fragment_iterator,
+                      WarpTileIterator &warp_tile_iterator) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < Advance; i++) {  // skip ahead to fragment number `Advance` (iterator is a by-value copy)
+        ++accum_fragment_iterator;
+      }
+
+      typename AccumulatorFragmentIterator::Fragment accum_fragment;
+
+      accum_fragment_iterator.load(accum_fragment);
+      ++accum_fragment_iterator;
+      warp_tile_iterator.store(accum_fragment);
+    }
+
+    CUTLASS_DEVICE
+    static void push(size_t pos,
+                    AccumulatorFragmentIterator const &iterator_begin,
+                    WarpTileIterator &warp_tile_iterator) {
+      int dummy[] = {(pos == Seq) && (helper<Seq>(iterator_begin, warp_tile_iterator), 0)...};  // && short-circuits: only helper<pos> runs
+    }
+  };
+
+
+  /// Streams the result to global memory
+  template <typename SourceAspect>
+  CUTLASS_DEVICE
+  void operator()(
+    OutputOp const &output_op,                      ///< Output operator
+    OutputTileIterator destination_iterator,        ///< Tile iterator for destination
+    AccumulatorTile const &accumulators,            ///< Complete warp-level accumulator tile
+    SourceAspect source)                            ///< SourceAspectNeeded or SourceAspectNotNeeded
+  {
+    // Iterator over warp-level accumulator fragment
+    AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
+
+    //
+    // Iterate over accumulator tile
+    //
+
+    #pragma unroll(IterationsUnroll ? OutputTileIterator::kIterations : 1)
+    for (int iter = 0; iter < OutputTileIterator::kIterations; ++iter)
+    {
+      //
+      // Load the source (a no-op for SourceAspectNotNeeded)
+      //
+
+        source.load();
+      //
+      // Convert and store fragment
+      //
+
+      __syncthreads();
+
+      acc2smem<cutlass::make_index_sequence<OutputTileIterator::kIterations>>::push(
+        iter, accum_fragment_iterator, this->warp_tile_iterator_);
+
+      __syncthreads();
+
+      //
+      // Load fragments from shared memory
+      //
+
+      typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];
+      shared_load_iterator_.load(aligned_accum_fragment[0]);
+
+      if (kPartitionsK > 1) {
+        plus <typename SharedLoadIterator::Fragment> add_fragments;
+
+        CUTLASS_PRAGMA_UNROLL
+        for ( int i = 1; i < kPartitionsK; ++i) {
+          shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
+          shared_load_iterator_.load(aligned_accum_fragment[i]);
+          aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
+        }
+
+        shared_load_iterator_.add_pointer_offset((1 - kPartitionsK) * kSmemPointerOffset);  // undo the (kPartitionsK - 1) advances above
+      }
+
+      //
+      // Compute the output result
+      //
+
+      typename OutputTileIterator::Fragment output_fragment;
+      source.apply_output_operator(output_fragment, output_op, aligned_accum_fragment[0]);
+
+      //
+      // Store the final result
+      //
+
+      destination_iterator.store(output_fragment);
+      ++destination_iterator;
+    }
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_base.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_base.h
new file mode 100755
index 000000000..6853f5f04
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_base.h
@@ -0,0 +1,240 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
+
+  The epilogue rearranges the result of a matrix product through shared memory to match canonical
+  tensor layouts in global memory. Epilogues support conversion and reduction operations.
+
+*/
+
+#pragma once
+
+#if !defined(__CUDACC_RTC__)
+#include <type_traits>
+#include <utility>
+#endif
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/cassert>
+#else
+#include <assert.h>
+#endif
+
+#include "cutlass/cutlass.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/layout/vector.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/tensor_coord.h"
+#include "cutlass/aligned_buffer.h"
+
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/transform/pitch_linear_thread_map.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+//
+// This is used for metaprogramming epilogue functors. If they define 
+// `static bool const kIsHeavy = true;`, then the epilogue functor itself is
+// not inlined. This results in smaller code and is advantageous if the epilogue
+// functor consists of many instructions.
+//
+// If the epilogue functor does not define `kIsHeavy` or if it is `false`, then
+// the behavior from CUTLASS 2.5 and before is retained. The epilogue is fully
+// unrolled and inlined.
+//
+
+template<class> 
+struct TypeSink {  typedef void type; };  // Maps any type argument to `void`
+
+template<class T> using TypeSinkT = typename TypeSink<T>::type;  // Alias for TypeSink<T>::type, i.e. always `void` when T is well-formed
+
+template<class T, class=void> struct IsEpilogueFunctorHeavy {  // Primary template: used when T::kIsHeavy is not a valid expression
+  static bool const value = false;
+};
+
+template<class T> struct IsEpilogueFunctorHeavy<T, TypeSinkT< decltype( T::kIsHeavy ) > > {  // Selected when T declares kIsHeavy
+  static bool const value = T::kIsHeavy;  // Report the functor's own setting
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Base class for epilogues: defines the warp arrangement, the SMEM staging storage, and the warp-level tile iterator
+template <
+  typename Shape_,                          ///< Shape of threadblock tile (concept: GemmShape)
+  typename WarpShape_,                      ///< Shape of the warp-level tile; only kM and kN are read here
+  int PartitionsK,                          ///< Number of partitions of the K dimension
+  typename AccumulatorFragmentIterator_,    ///< Fragment iterator selecting accumulators
+  typename WarpTileIterator_,               ///< Warp-scoped tile iterator writing accumulators to SMEM
+  typename Padding_,                        ///< Padding added to SMEM allocation to avoid bank conflicts (concept: MatrixShape)
+  int FragmentsPerIteration = 1             ///< Fragments per epilogue 'iteration'; multiplies the SMEM allocation's row count
+>
+class EpilogueBase {
+public:
+
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  static int const kPartitionsK = PartitionsK;
+  using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
+  using WarpTileIterator = WarpTileIterator_;
+  using Padding = Padding_;
+
+  /// Output layout is always row-major
+  using Layout = layout::RowMajor;
+
+  /// The complete warp-level accumulator tile
+  using AccumulatorTile = typename AccumulatorFragmentIterator::AccumulatorTile;
+
+  /// Accumulator element
+  using ElementAccumulator = typename AccumulatorTile::Element;
+
+  /// Number of warps along M and N (threadblock extent / warp extent) and along K (number of K partitions)
+  using WarpCount = gemm::GemmShape<
+    Shape::kM / WarpShape::kM,
+    Shape::kN / WarpShape::kN,
+    kPartitionsK
+  >;
+
+  /// Use this to control the granularity of one epilogue 'iteration'
+  static int const kFragmentsPerIteration = FragmentsPerIteration;
+
+public:
+
+  /// Shared storage allocation needed by the epilogue
+  struct SharedStorage {
+    
+    //
+    // Type definitions
+    //
+
+    /// Element type of shared memory
+    using Element = typename WarpTileIterator::Element;
+
+    /// Tensor reference to shared memory allocation
+    using TensorRef = typename WarpTileIterator::TensorRef;
+
+    /// Layout of shared memory allocation
+    using Layout = typename WarpTileIterator::Layout;
+    
+    /// Logical shape of the shared memory tile written to by all warps (K partitions add rows).
+    using Shape = MatrixShape<
+      WarpCount::kM * WarpTileIterator::Shape::kRow * WarpCount::kK,
+      WarpCount::kN * WarpTileIterator::Shape::kColumn
+    >;
+
+    /// Shape of the shared memory allocation: Shape plus Padding, rows multiplied by kFragmentsPerIteration
+    using StorageShape = MatrixShape<
+      (Shape::kRow + Padding::kRow) * kFragmentsPerIteration, 
+      Shape::kColumn + Padding::kColumn
+    >;
+
+    //
+    // Data members
+    //
+
+    AlignedBuffer<Element, StorageShape::kCount> storage;
+
+    //
+    // Methods
+    //
+
+    /// Returns a pointer to the shared memory buffer
+    CUTLASS_DEVICE
+    Element *data() {
+      return storage.data();
+    }
+
+    /// Returns a tensor reference to the shared memory buffer, packed to StorageShape
+    CUTLASS_DEVICE
+    TensorRef reference() {
+      return TensorRef(
+        storage.data(), 
+        Layout::packed({StorageShape::kRow, StorageShape::kColumn}));
+    }
+  };
+
+protected:
+
+  //
+  // Data members
+  //
+
+  SharedStorage &shared_storage_;   ///< Reference to the storage object passed to the constructor
+
+  /// Stores a warp's fragment of accumulators to SMEM
+  WarpTileIterator warp_tile_iterator_;
+
+public:
+
+  /// Constructor: binds the storage and moves the warp tile iterator to this warp's tile
+  CUTLASS_DEVICE
+  EpilogueBase(
+    SharedStorage &shared_storage,    ///< Shared storage object    
+    int thread_idx,                   ///< ID of a thread within the threadblock (not read by this constructor)
+    int warp_idx,                     ///< ID of warp within threadblock
+    int lane_idx                      ///< Id of thread within warp
+  ):
+    shared_storage_(shared_storage),
+    warp_tile_iterator_(shared_storage.reference(), lane_idx) {
+
+    // Compute warp location within threadblock tile by mapping the warp_id to three coordinates:
+    //
+    //   _m: the warp's position within the threadblock along the M dimension
+    //   _n: the warp's position within the threadblock along the N dimension
+    //   _k: the warp's position within the threadblock along the K dimension
+
+    int warp_k = warp_idx / (WarpCount::kM * WarpCount::kN);
+    int warp_mn = warp_idx % (WarpCount::kM * WarpCount::kN);
+    int warp_m = warp_mn % WarpCount::kM;
+    int warp_n = warp_mn / WarpCount::kM;
+
+    MatrixCoord warp_offset{warp_k * WarpCount::kM + warp_m, warp_n};  // K partitions are stacked along the row coordinate
+
+    warp_tile_iterator_.add_tile_offset(warp_offset);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_base_streamk.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_base_streamk.h
new file mode 100755
index 000000000..294e9a514
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_base_streamk.h
@@ -0,0 +1,197 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Basic subset of epilogue functionality for supporting StreamK decompositions
+*/
+
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/functional.h"
+#include "cutlass/block_striped.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+
+/// StreamK epilogue functionality for cross-block accumulator fragment reduction
+template <
+  typename Shape,                          ///< Shape of threadblock tile (concept: GemmShape)
+  int PartitionsK,                         ///< Number of partitions of the K dimension
+  typename WarpMmaOperator,                ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp)
+  typename AccumulatorFragmentIterator>    ///< Iterator for enumerating fragments within the per-thread tile of raw accumulators
+class EpilogueBaseStreamK
+{
+
+protected:
+
+  /// The per-thread tile of raw accumulators
+  using AccumulatorTile = typename AccumulatorFragmentIterator::AccumulatorTile;
+
+  /// Number of warps along M, N, and K (the K extent is PartitionsK)
+  using WarpCount = gemm::GemmShape<
+                        Shape::kM / WarpMmaOperator::Shape::kM,
+                        Shape::kN / WarpMmaOperator::Shape::kN,
+                        PartitionsK>;
+
+  /// Number of threads per block (32 threads for each warp counted by WarpCount)
+  static int const kBlockThreads = 32 * WarpCount::kCount;
+
+  /// Numerical accumulation element type
+  using ElementAccumulator = typename WarpMmaOperator::ElementC;
+
+  /// Fragment type used by the accumulator tile's fragment iterator
+  using AccumulatorFragment = typename AccumulatorFragmentIterator::Fragment;
+
+public:
+
+  /// Number of AccumulatorTile fragments per thread
+  static int const kAccumulatorFragments = AccumulatorFragmentIterator::Policy::kIterations;
+
+protected:
+
+  /// Number of AccumulatorTile fragments per block output tile (all threads times fragments per thread)
+  static int const kOutputTileFragments = kBlockThreads * kAccumulatorFragments;
+
+  /// Block-striped transfer utility for sharing AccumulatorFragment
+  using BlockStripedT = BlockStriped<kBlockThreads, AccumulatorFragment>;
+
+  /// AccumulatorFragment stride in the shared workspace between different peer blocks (each thread block can share accumulators for up to two block output tiles)
+  static const int kPeerFragmentStride = kOutputTileFragments * 2;
+
+public:
+
+  /// Workspace bytes per thread block: room for kPeerFragmentStride AccumulatorFragment objects
+  static size_t const kWorkspaceBytesPerBlock =sizeof(AccumulatorFragment) * kPeerFragmentStride;
+
+public:
+
+  /// Thread index in the threadblock (passed to the block-striped loads and stores)
+  int thread_idx;
+
+public:
+
+  /// Constructor: records the calling thread's index within the threadblock
+  CUTLASS_DEVICE
+  EpilogueBaseStreamK(
+      int thread_idx)                                       ///< ID of a thread within the threadblock
+  :
+      thread_idx(thread_idx)
+  {}
+
+
+  /// Aggregates the accumulator sets shared by peer blocks in the global workspace
+  CUTLASS_DEVICE
+  void reduce(
+      AccumulatorFragment &accum_fragment,                  ///< [out] sum of all shared accumulator fragments for these peer partials
+      int peer_idx_begin,                                   ///< Index of the first peer slot to read
+      int peer_idx_end,                                     ///< One past the index of the last peer slot to read
+      int reduce_fragment_idx,                              ///< Which fragment of the output tile to reduce (scaled by kBlockThreads below)
+      void *workspace_ptr)                                  ///< Workspace base, reinterpreted as an array of AccumulatorFragment
+  {
+    plus<AccumulatorFragment> add_fragments;
+
+    AccumulatorFragment *fragment_workspace = reinterpret_cast<AccumulatorFragment *>(workspace_ptr);
+    // Start of the first peer's slot plus the offset of the requested fragment within a tile
+    int fragment_offset = (peer_idx_begin * kPeerFragmentStride) + (reduce_fragment_idx * kBlockThreads);
+
+    // Load first peer fragment (from the first half of its slot, where share() writes when started_tile is true)
+    BlockStripedT::load(accum_fragment, fragment_workspace + fragment_offset, this->thread_idx);
+
+    fragment_offset += kPeerFragmentStride;         // Move to next peer
+    fragment_offset += kOutputTileFragments;        // Move to the set of fragments for this peer's "non-started" output tile
+
+    // Reduce fragments from additional peers (each read from the second half of its slot)
+    #pragma unroll 2
+    for (; fragment_offset < peer_idx_end * kPeerFragmentStride; fragment_offset += kPeerFragmentStride)
+    {
+      // Load peer fragment
+      AccumulatorFragment addend_fragment;
+      BlockStripedT::load(addend_fragment, fragment_workspace + fragment_offset, this->thread_idx);
+
+      // Add peer fragment into the running sum
+      accum_fragment = add_fragments(accum_fragment, addend_fragment);
+    }
+  }
+
+
+  /// Shares the accumulator set with peers in the global workspace
+  CUTLASS_DEVICE
+  void share(
+      int peer_idx,                           ///< Index of the workspace slot to write (scaled by kPeerFragmentStride below)
+      void *workspace_ptr,                    ///< Workspace base, reinterpreted as an array of AccumulatorFragment
+      AccumulatorTile const &accumulators,    ///< Raw accumulator tile to convert to fragments and store
+      bool started_tile)                      ///< Whether this thread block computed the first work volume for the current output tile
+  {
+    AccumulatorFragment *fragment_workspace = reinterpret_cast<AccumulatorFragment *>(workspace_ptr);
+
+    int fragment_offset = peer_idx * kPeerFragmentStride;
+
+    if (!started_tile) {
+      // Move to the set of fragments for the "non-started" output tile (second half of the slot)
+      fragment_offset += kOutputTileFragments;
+    }
+
+    AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
+
+    // Convert raw accumulator tile to fragments and store
+    CUTLASS_PRAGMA_UNROLL
+    for (int iter = 0; iter < kAccumulatorFragments; ++iter)
+    {
+      // Acquire reordered accumulator fragment
+      AccumulatorFragment accum_fragment;
+      accum_fragment_iterator.load(accum_fragment);
+      ++accum_fragment_iterator;
+
+      // Store accumulator fragment
+      BlockStripedT::store(fragment_workspace + fragment_offset, accum_fragment, this->thread_idx);
+
+      fragment_offset += kBlockThreads;  // consecutive fragments are kBlockThreads entries apart
+    }
+  }
+
+};
+
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_depthwise.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_depthwise.h
new file mode 100755
index 000000000..83cbc8ab3
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_depthwise.h
@@ -0,0 +1,335 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue for depthwise convolution
+
+  The epilogue rearranges the result of a matrix product through shared memory to match canonical
+  tensor layouts in global memory. Epilogues support conversion and reduction operations.
+
+*/
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/epilogue/thread/conversion_op.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/epilogue/thread/reduction_op.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/numeric_types.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Epilogue operator for depthwise convolution
+template <typename Shape_,                   ///< Shape of threadblock tile (concept: GemmShape)
+          typename ThreadOutputShape_,       ///< Output shape per thread, going by the name (concept: TensorNHWC)
+          typename ThreadBlockOutputShape_,  ///< Output shape per threadblock; sizes the SMEM tile (concept: TensorNHWC)
+          typename WarpMmaOperator_,         ///< Warp-level MMA operator (concept:
+                                             ///< gemm::warp::MmaTensorOp)
+          typename OutputTileIterator_,      ///< Tile iterator reading and writing output tensors
+          typename AccumulatorFragmentIterator_,  ///< Fragment iterator selecting accumulators
+          typename WarpTileIterator_,    ///< Warp-scoped tile iterator writing accumulators to SMEM
+          typename SharedLoadIterator_,  ///< Threadblock-scoped tile iterator loading from SMEM
+          typename OutputOp_,            ///< Output operator
+          typename Padding_  ///< Padding for the SMEM allocation (concept: MatrixShape); exposed as
+                             ///< Padding but not applied by SharedStorage in this class
+          >
+class EpilogueDepthwise {
+ public:
+  using Shape = Shape_;
+  using WarpShape = typename WarpMmaOperator_::Shape;
+  using ThreadOutputShape = ThreadOutputShape_;
+  using ThreadBlockOutputShape = ThreadBlockOutputShape_;
+  using WarpMmaOperator = WarpMmaOperator_;
+  using OutputTileIterator = OutputTileIterator_;
+  using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
+  using WarpTileIterator = WarpTileIterator_;
+  using SharedLoadIterator = SharedLoadIterator_;
+  using OutputOp = OutputOp_;
+  using Padding = Padding_;
+
+  using Layout = layout::RowMajor;
+  using LongIndex = typename Layout::LongIndex;
+
+  /// The complete warp-level accumulator tile
+  using AccumulatorTile = typename AccumulatorFragmentIterator::AccumulatorTile;
+
+  /// Accumulator element
+  using ElementAccumulator = typename WarpTileIterator::Element;
+
+  /// Output element
+  using ElementOutput = typename OutputTileIterator::Element;
+
+  /// Output access size (elements per access of the output tile iterator)
+  static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
+
+  /// Tensor reference to destination tensor
+  using TensorRef = typename OutputTileIterator::TensorRef;
+
+  /// Tensor reference to sync tensor
+  using SyncTensorRef = typename cutlass::TensorRef<int, cutlass::layout::PackedVectorLayout>;
+
+  /// Const tensor reference to source tensor
+  using ConstTensorRef = typename OutputTileIterator::ConstTensorRef;
+
+  /// Array type used to output (one access worth of output elements)
+  using OutputAccessType =
+      Array<typename OutputTileIterator::Element, OutputTileIterator::kElementsPerAccess>;
+
+  /// Array type used by output functor (accumulator elements, same width as one output access)
+  using AccumulatorAccessType =
+      Array<typename WarpTileIterator::Element, OutputTileIterator::kElementsPerAccess>;
+
+  /// Number of warps along M and N
+  using WarpCount =
+      gemm::GemmShape<Shape::kM / WarpShape::kM, Shape::kN / WarpShape::kN>;
+
+ public:
+  static_assert(SharedLoadIterator::Fragment::kElements ==
+  OutputTileIterator::Fragment::kElements,
+    "Mismatch between shared load iterator and output tile iterator.");
+
+  static_assert(OutputTileIterator::kElementsPerAccess,
+                "OutputTileIterator::kElementsPerAccess must not be zero.");
+
+  static_assert(!(OutputTileIterator::Fragment::kElements % OutputTileIterator::kElementsPerAccess),
+                "Divisibility");
+
+  /// Shared storage allocation needed by the epilogue: a single tile of WarpTileIterator elements
+  struct SharedStorage {
+    //
+    // Type definitions
+    //
+
+    /// Element type of shared memory
+    using Element = typename WarpTileIterator::Element;
+
+    /// Tensor reference to shared memory allocation
+    using TensorRef = typename WarpTileIterator::TensorRef;
+
+    /// Layout of shared memory allocation
+    using Layout = typename WarpTileIterator::Layout;
+
+    /// Logical shape of the shared memory tile written to by all warps (kNHW rows by kC columns).
+    using Shape = MatrixShape<ThreadBlockOutputShape::kNHW, ThreadBlockOutputShape::kC>;
+
+    /// Shape of the shared memory allocation for the epilogue (same extents as Shape; no padding is added)
+    using StorageShape = MatrixShape<Shape::kRow, Shape::kColumn>;
+
+    //
+    // Data members
+    //
+    /// Buffer of StorageShape::kCount elements backing the tile
+    AlignedBuffer<Element, StorageShape::kCount> storage;
+
+    //
+    // Methods
+    //
+
+    /// Returns a pointer to the shared memory buffer
+    CUTLASS_DEVICE
+    Element *data() { return storage.data(); }
+
+    /// Returns a tensor reference to the shared memory buffer with a packed StorageShape layout
+    CUTLASS_DEVICE
+    TensorRef reference() {
+      return TensorRef(storage.data(), Layout::packed({StorageShape::kRow, StorageShape::kColumn}));
+    }
+  };
+
+ private:
+  /// Loads fragment from shared memory aligned with output tensor
+  SharedLoadIterator shared_load_iterator_;
+
+  /// Stores a warp's fragment of accumulators to SMEM
+  WarpTileIterator warp_tile_iterator_;
+
+  LongIndex warp_offset;  // not set by the constructor and not read by this class
+  int thread_idx;         // ID of the thread within the threadblock
+  int warp_idx;           // ID of the warp within the threadblock
+  int lane_idx;           // ID of the thread within its warp
+  int warp_m, warp_n;  // warp coordinates within a cta (not set by the constructor)
+  int tid_m, tid_n;    // thread coordinates within a warp (not set by the constructor)
+
+ public:
+  /// Constructor. Initializers are listed in member declaration order, the order C++ runs them.
+  CUTLASS_DEVICE
+  EpilogueDepthwise(SharedStorage &shared_storage,  ///< Shared storage object
+                    int thread_idx_,                ///< ID of a thread within the threadblock
+                    int warp_idx_,                  ///< ID of warp within threadblock
+                    int lane_idx_                   ///< Id of thread within warp
+                    )
+      : shared_load_iterator_(shared_storage.reference(), thread_idx_),
+        warp_tile_iterator_(shared_storage.reference(), thread_idx_, lane_idx_),
+        thread_idx(thread_idx_),
+        warp_idx(warp_idx_),
+        lane_idx(lane_idx_) {}
+
+  /// Runs the epilogue for one output tile and writes the result to global memory
+  CUTLASS_DEVICE
+  void operator()(OutputOp const &output_op,                ///< Output operator
+                  OutputTileIterator destination_iterator,  ///< Tile iterator for destination
+                  AccumulatorTile const &accumulators,  ///< Complete warp-level accumulator tile
+                  OutputTileIterator source_iterator,   ///< Tile iterator for the source tensor;
+                                                        ///< read only when the output op needs it
+                  const int smem_base_offset) {         ///< SMEM base offset for epilogue operation
+    // Point both shared-memory iterators at the region used for this output tile.
+    warp_tile_iterator_.set_smem_base_address(smem_base_offset);
+    shared_load_iterator_.set_smem_base_address(smem_base_offset);
+
+    // Choose the path according to whether the output operator consumes a source operand.
+    if (output_op.is_source_needed()) {
+      compute_source_needed_(output_op, destination_iterator, accumulators, source_iterator);
+    } else {
+      compute_source_not_needed_(output_op, destination_iterator, accumulators);
+    }
+  }
+
+ private:
+  /// Combines the accumulators with the source tile and streams the result to global memory
+  CUTLASS_DEVICE
+  void compute_source_needed_(
+      OutputOp const &output_op,                ///< Output operator
+      OutputTileIterator destination_iterator,  ///< Tile iterator for destination
+      AccumulatorTile const &accumulators,      ///< Complete warp-level accumulator tile
+      OutputTileIterator source_iterator) {     ///< Tile iterator for the source tensor
+    // Fetch the source operand; the fragment is cleared before the load
+    typename OutputTileIterator::Fragment source_fragment;
+
+    source_fragment.clear();
+
+    source_iterator.load(source_fragment);
+
+    // Store this warp's accumulators to SMEM
+    warp_tile_iterator_.store(accumulators);
+
+    __syncthreads();  // every store must complete before the threadblock-scoped load below
+
+    typename SharedLoadIterator::Fragment aligned_accum_fragment;
+
+    // Load from SMEM through the threadblock-scoped iterator
+    shared_load_iterator_.load(aligned_accum_fragment);
+
+    typename OutputTileIterator::Fragment output_fragment;
+    // Apply the output functor to each vector of accumulators and source values
+    apply_output_operator_(output_fragment, output_op, aligned_accum_fragment, source_fragment);
+
+    // Store to GMEM
+    destination_iterator.store(output_fragment);
+  }
+
+  /// Transforms the accumulators without a source operand and streams the result to global memory
+  CUTLASS_DEVICE
+  void compute_source_not_needed_(
+      OutputOp const &output_op,                ///< Output operator
+      OutputTileIterator destination_iterator,  ///< Tile iterator for destination
+      AccumulatorTile const &accumulators) {    ///< Complete warp-level accumulator tile
+
+    // Store this warp's accumulators to SMEM
+    warp_tile_iterator_.store(accumulators);
+
+    __syncthreads();  // every store must complete before the threadblock-scoped load below
+
+    typename SharedLoadIterator::Fragment aligned_accum_fragment;
+
+    // Load from SMEM through the threadblock-scoped iterator
+    shared_load_iterator_.load(aligned_accum_fragment);
+
+    typename OutputTileIterator::Fragment output_fragment;
+    // Apply the output functor to each vector of accumulators
+    apply_output_operator_source_not_needed_(output_fragment, output_op, aligned_accum_fragment);
+
+    // Store to GMEM
+    destination_iterator.store(output_fragment);
+  }
+
+  /// Applies the output functor to the accumulator and source fragments one access-sized
+  /// vector at a time, writing each result vector into the output fragment
+  CUTLASS_DEVICE
+  void apply_output_operator_(
+    typename OutputTileIterator::Fragment &output_fragment,               ///< [out] Results
+    OutputOp const &output_op,                                            ///< Output operator
+    typename SharedLoadIterator::Fragment const &aligned_accum_fragment,  ///< Accumulators from SMEM
+    typename OutputTileIterator::Fragment const &source_fragment) {       ///< Source operand
+
+    // Number of access-sized vectors contained in one output fragment
+    int const kVectorsPerFragment =
+      OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
+
+    // View each fragment as an array of access-sized vectors
+    auto *result_vectors = reinterpret_cast<OutputAccessType *>(&output_fragment);
+    auto const *accum_vectors =
+      reinterpret_cast<AccumulatorAccessType const *>(&aligned_accum_fragment);
+    auto const *source_vectors =
+      reinterpret_cast<OutputAccessType const *>(&source_fragment);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int vec = 0; vec < kVectorsPerFragment; ++vec) {
+      // Transform one vector of accumulators together with its source vector
+      result_vectors[vec] = output_op(accum_vectors[vec], source_vectors[vec]);
+    }
+  }
+
+  /// Applies the output functor to the accumulator fragment alone (no source operand), one
+  /// access-sized vector at a time, writing each result vector into the output fragment
+  CUTLASS_DEVICE
+  void apply_output_operator_source_not_needed_(
+      typename OutputTileIterator::Fragment &output_fragment,  ///< [out] Results
+      OutputOp const &output_op,                               ///< Output operator
+      typename SharedLoadIterator::Fragment const &aligned_accum_fragment) {  ///< Accumulators
+    // Number of access-sized vectors contained in one output fragment
+    int const kVectorsPerFragment =
+        OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
+    // View each fragment as an array of access-sized vectors
+    auto *result_vectors = reinterpret_cast<OutputAccessType *>(&output_fragment);
+    auto const *accum_vectors =
+        reinterpret_cast<AccumulatorAccessType const *>(&aligned_accum_fragment);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int vec = 0; vec < kVectorsPerFragment; ++vec) {
+      result_vectors[vec] = output_op(accum_vectors[vec]);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace epilogue
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_direct_store.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_direct_store.h
new file mode 100755
index 000000000..02de00dd6
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_direct_store.h
@@ -0,0 +1,347 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue for threadblock scoped GEMMs and convolution using Tensor Ops.
+
+  The epilogue rearranges the result of a matrix product through shared memory to match canonical
+  tensor layouts in global memory. Epilogues support conversion and reduction operations.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/epilogue/thread/conversion_op.h"
+#include "cutlass/epilogue/thread/reduction_op.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Epilogue operator
+template <
+  typename Shape_,                          ///< Shape of threadblock tile (concept: GemmShape)
+  typename WarpMmaOperator_,                ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp)
+  int PartitionsK,                          ///< Number of partitions of the K dimension
+  typename OutputTileIterator_,             ///< Tile iterator reading and writing output tensors
+  typename AccumulatorFragmentIterator_,    ///< Fragment iterator selecting accumulators
+  typename WarpTileIterator_,               ///< Warp-scoped tile iterator writing accumulators to SMEM
+  typename SharedLoadIterator_,             ///< Threadblock-scoped tile iterator loading from SMEM
+  typename OutputOp_                        ///< Output operator
+>
+class EpilogueDirectStore {
+public:
+
+  using Shape = Shape_;
+  using WarpMmaOperator = WarpMmaOperator_;
+  using WarpShape = typename WarpMmaOperator_::Shape;
+  static int const kPartitionsK = PartitionsK;
+  using OutputTileIterator = OutputTileIterator_;
+  using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
+  using WarpTileIterator = WarpTileIterator_;
+  using OutputOp = OutputOp_;
+  using Padding = MatrixShape<0, 0>;
+
+  using Layout = layout::RowMajor;
+  using LongIndex = typename Layout::LongIndex;
+
+  /// The complete warp-level accumulator tile
+  using AccumulatorTile = typename AccumulatorFragmentIterator::AccumulatorTile;
+
+  /// Accumulator element
+  using ElementAccumulator = typename WarpTileIterator::Element;
+
+  /// Output element
+  using ElementOutput = typename OutputTileIterator::Element;
+
+  /// Output access size
+  static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
+
+  /// Tensor reference to destination tensor
+  using TensorRef = typename OutputTileIterator::TensorRef;
+
+  /// Tensor reference to sync tensor
+  using SyncTensorRef = typename cutlass::TensorRef<int, cutlass::layout::PackedVectorLayout>;
+
+  /// Const tensor reference to source tensor
+  using ConstTensorRef = typename OutputTileIterator::ConstTensorRef;
+
+  /// Array type used to output
+  using OutputAccessType = Array<
+    typename OutputTileIterator::Element, OutputTileIterator::kElementsPerAccess>;
+
+  /// Array type used by output functor
+  using AccumulatorAccessType = Array<typename WarpTileIterator::Element, OutputTileIterator::kElementsPerAccess>; 
+  
+  /// Number of warps
+  using WarpCount = gemm::GemmShape<
+    Shape::kM / WarpShape::kM,
+    Shape::kN / WarpShape::kN,
+    kPartitionsK
+  >;
+
+  /// Use this to control the granularity of one epilogue 'iteration'
+  static int const kFragmentsPerIteration = 1;
+
+  static int constexpr kSmemTiles = 1;
+  static int constexpr kSmemPointerOffset = 0;
+
+  /// Shared storage allocation needed by the epilogue
+  struct SharedStorage { } ;
+
+private:
+
+  // Assume the accumulator tile consists of multiple interleaved 32x32 tiles.
+  static int const kElementsPerPartial = 4;
+  using EleShapePerPatial = typename platform::conditional<
+                              platform::is_same<ElementAccumulator, float>::value,
+                              MatrixShape<2, 2>,
+                              MatrixShape<1, 4> >::type;
+  static int const kElementsPerMma = 8;
+  static int const kAccumulatorPatials = 2;
+  using QuadShapePerPatialMma = MatrixShape<4, 4>;
+
+  static_assert(OutputOp::kCount >= 2, 
+    "The direct store epilogue for Tensor Ops requires the output functor have kCount >= 2.");
+
+private:
+
+  LongIndex warp_offset;
+  int thread_idx;
+  int warp_idx;
+  int lane_idx;
+  int warp_m, warp_n; // warp coordinates within a cta
+  int tid_m, tid_n;   // thread coordinates within a warp
+
+public:
+
+  /// Constructor: records the indices and derives warp and thread coordinates for direct stores
+  CUTLASS_DEVICE
+  EpilogueDirectStore(
+    SharedStorage &shared_storage,    ///< Shared storage object (empty for this epilogue)
+    int thread_idx_,                  ///< ID of a thread within the threadblock
+    int warp_idx_,                    ///< ID of warp within threadblock
+    int lane_idx_                     ///< Id of thread within warp
+  ):
+    thread_idx(thread_idx_),
+    warp_idx(warp_idx_),
+    lane_idx(lane_idx_)
+  {
+    // Warp offsetting calculations. The M/N position is taken modulo the number of warps in
+    // one K partition (WarpCount::kM * WarpCount::kN) so that warps of every K partition map
+    // back onto the same M/N grid; warps are numbered with M varying fastest.
+    warp_offset = warp_idx * WarpShape::kM * WarpShape::kN;
+    int warp_id_mn = warp_idx % (WarpCount::kM * WarpCount::kN);
+    warp_m = warp_id_mn % WarpCount::kM;
+    warp_n = warp_id_mn / WarpCount::kM;
+
+    // Thread offsetting calculations: lanes are grouped into quads of 4 consecutive lanes.
+    int quad = (lane_idx >> 2);
+    int lane_in_quad = (lane_idx & 3);
+
+    // Row offset is the quad index; column offset is twice the lane's index within its quad.
+    tid_m = quad;
+    tid_n = 2 * lane_in_quad;
+  }
+
+  /// Writes the epilogue result for this threadblock tile to global memory
+  CUTLASS_DEVICE
+  void operator()(
+    OutputOp const &output_op,                    ///< Output operator
+    OutputTileIterator destination_iterator,      ///< Tile iterator for destination
+    AccumulatorTile const &accumulators,          ///< Complete warp-level accumulator tile
+    OutputTileIterator source_iterator) {         ///< Tile iterator for the source tensor (used only if the output op needs it)
+
+    if (output_op.is_source_needed()) {
+      compute_source_needed_(output_op, destination_iterator, accumulators, source_iterator);
+      return;
+    }
+    // No source operand: transform and store the accumulators alone.
+    compute_source_not_needed_(output_op, destination_iterator, accumulators);
+  }
+
+private:
+
+  /// Applies the output operator to accumulators and source values and stores the result to global memory
+  CUTLASS_DEVICE
+  void compute_source_needed_(
+    OutputOp const &output_op,                    ///< Output operator
+    OutputTileIterator destination_iterator,      ///< Tile iterator for destination
+    AccumulatorTile const &accumulators,          ///< Complete warp-level accumulator tile
+    OutputTileIterator source_iterator) {         ///< Tile iterator for the source tensor
+
+    const int kAccumBlockN = 2;                     // adjacent N elements handled per access
+    const int kThreadsM = 8;                        // row step between successive accum_m_idx iterations
+    const int kThreadsN = 4;                        // used with kAccumBlockN to derive kIterationsN
+    const int kBlockM = WarpShape::kM / kThreadsM;  // number of accum_m_idx iterations per warp tile
+
+    /// Array type used to output (shadows the class-level OutputAccessType with a kAccumBlockN-wide aligned array)
+    using OutputAccessType = AlignedArray<ElementOutput, kAccumBlockN>;
+
+    /// Array type passed to the output operator - unused elements are optimized away
+    using OutputFragmentType = Array<ElementOutput, OutputOp::kCount>;
+
+    /// Array type used by output functor (shadows the class-level AccumulatorAccessType)
+    using AccumulatorAccessType = Array<ElementAccumulator, kAccumBlockN>;
+
+    /// Array type used by output functor
+    using AccumulatorFragmentType = Array<ElementAccumulator, OutputOp::kCount>;
+    // View the accumulator tile as an array of kAccumBlockN-element groups
+    AccumulatorAccessType const *accumulator_pair = reinterpret_cast<AccumulatorAccessType const *>(&accumulators);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int accum_m_idx = 0; accum_m_idx < WarpShape::kM / kThreadsM; accum_m_idx++) {
+
+      int accum_m = kThreadsM * accum_m_idx;
+      int mL = destination_iterator.threadblock_offset.row() + WarpShape::kM * warp_m + tid_m + accum_m;
+      int nL_base = destination_iterator.threadblock_offset.column() + WarpShape::kN * warp_n + tid_n;
+      // Row base pointers; the same row index mL is used for destination and source
+      ElementOutput *output_ptr = destination_iterator.pointer + mL * destination_iterator.stride;
+      ElementOutput *source_ptr = source_iterator.pointer + mL * source_iterator.stride;
+
+      int const kIterationsN = WarpShape::kN / kThreadsN / kAccumBlockN;
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int accum_n_idx = 0; accum_n_idx < kIterationsN; accum_n_idx++) {
+
+        int accum_idx = accum_m_idx + kBlockM * accum_n_idx;
+        int accum_n = kThreadsM * accum_n_idx;  // NOTE(review): kThreadsN * kAccumBlockN has the same value (8); confirm which was intended
+        
+        // mL and nL are logical coordinates in the 2D mapping of the epilogue's 4D output
+        int nL = nL_base + accum_n;
+        // Memory is only read and written when (mL, nL) lies inside the destination extent
+        bool guard = (mL < destination_iterator.extent.row()) && (nL < destination_iterator.extent.column());
+        // Only the first kAccumBlockN elements of the kCount-wide fragment are populated
+        AccumulatorFragmentType accum_fragment;
+        reinterpret_cast<AccumulatorAccessType &>(accum_fragment) = accumulator_pair[accum_idx];
+
+        OutputFragmentType output_fragment;  // left unset when guard is false; the result is then discarded
+
+        if(guard) {
+          reinterpret_cast<OutputAccessType &>(output_fragment) = 
+            *reinterpret_cast<OutputAccessType const *>(source_ptr + nL);
+        }
+
+        // Perform output operator (output_fragment holds the source values on input)
+        output_fragment = output_op(accum_fragment, output_fragment);
+
+        if(guard) {
+          // Store the first kAccumBlockN elements of the result
+          *reinterpret_cast<OutputAccessType *>(output_ptr + nL) = reinterpret_cast<OutputAccessType const &>(output_fragment);
+        }
+      }
+    }
+  }
+
+  /// Applies output_op to the accumulators alone (no source operand) and streams the result to global memory
+  CUTLASS_DEVICE
+  void compute_source_not_needed_(
+    OutputOp const &output_op,                    ///< Output operator
+    OutputTileIterator destination_iterator,      ///< Tile iterator for destination
+    AccumulatorTile const &accumulators) {         ///< Complete warp-level accumulator tile
+
+    const int kAccumBlockN = 2;    // elements handled per store along N
+    const int kThreadsM = 8;       // row step between consecutive accum_m_idx iterations
+    const int kThreadsN = 4;       // divisor (with kAccumBlockN) giving the N iteration count
+    const int kBlockM = WarpShape::kM / kThreadsM;  // M iteration count; also the accum_idx stride per N step
+
+    /// Aligned array type used to store to the destination
+    using OutputAccessType = AlignedArray<ElementOutput, kAccumBlockN>;
+
+    /// Array type returned by the output operator - unused elements are optimized away
+    using OutputFragmentType = Array<ElementOutput, OutputOp::kCount>;
+
+    /// Array type used to view the accumulator tile in groups of kAccumBlockN elements
+    using AccumulatorAccessType = Array<ElementAccumulator, kAccumBlockN>;
+
+    /// Accumulator array type passed to the output operator; only the first kAccumBlockN elements are set
+    using AccumulatorFragmentType = Array<ElementAccumulator, OutputOp::kCount>;
+
+    AccumulatorAccessType const *accumulator_pair = reinterpret_cast<AccumulatorAccessType const *>(&accumulators);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int accum_m_idx = 0; accum_m_idx < WarpShape::kM / kThreadsM; accum_m_idx++) {
+      // warp_m, warp_n, tid_m and tid_n are not declared in this method (presumably class members)
+      int accum_m = kThreadsM * accum_m_idx;
+      int mL = destination_iterator.threadblock_offset.row() + WarpShape::kM * warp_m + tid_m + accum_m;
+      int nL_base = destination_iterator.threadblock_offset.column() + WarpShape::kN * warp_n + tid_n;
+
+      ElementOutput *output_ptr = destination_iterator.pointer + mL * destination_iterator.stride;
+
+      int const kIterationsN = WarpShape::kN / kThreadsN / kAccumBlockN;
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int accum_n_idx = 0; accum_n_idx < kIterationsN; accum_n_idx++) {
+
+        int accum_idx = accum_m_idx + kBlockM * accum_n_idx;
+        int accum_n = kThreadsM * accum_n_idx;  // NOTE(review): column step uses kThreadsM (8), not kThreadsN - confirm intended
+        
+        // mL and nL are logical coordinates in the 2D mapping of the epilogue's 4D output
+        int nL = nL_base + accum_n;
+        // Stores whose first element lies outside the destination extent are skipped
+        bool guard = (mL < destination_iterator.extent.row()) && (nL < destination_iterator.extent.column());
+                   
+        AccumulatorFragmentType accum_fragment;
+        reinterpret_cast<AccumulatorAccessType &>(accum_fragment) = accumulator_pair[accum_idx];
+
+        OutputFragmentType output_fragment;
+
+        // Perform output operator
+        output_fragment = output_op(accum_fragment);
+
+        if(guard) { 
+
+          // Store
+          *reinterpret_cast<OutputAccessType *>(output_ptr + nL) = 
+            reinterpret_cast<OutputAccessType const &>(output_fragment);      
+        }
+      }
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_gemm_k_reduction.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_gemm_k_reduction.h
new file mode 100755
index 000000000..43b14c356
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_gemm_k_reduction.h
@@ -0,0 +1,212 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue writing the K-reduction accumulators of a threadblock-scoped GEMM.
+
+  Per-thread partial sums are combined across groups of four lanes with warp shuffles, optionally
+  added to the values already in the output vector, converted, and stored with bounds guards.
+
+*/
+
+#pragma once
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/cassert>
+#else
+#include <assert.h>
+#endif
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/layout/vector.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/tensor_coord.h"
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/functional.h"
+
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/transform/pitch_linear_thread_map.h"
+#include "cutlass/transform/threadblock/regular_tile_iterator.h"
+
+#include "cutlass/epilogue/threadblock/epilogue_base.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
+#include "cutlass/numeric_types.h"
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Epilogue that sums K-reduction partials across each group of four lanes and writes them to a vector
+template <
+  typename ElementAccumulator_,                        ///< Element type of the reduction accumulators
+  typename ElementOutput_,                             ///< Element type read from and written to the output pointer
+  typename ThreadBlockShape_,                          ///< Shape of threadblock tile (concept: GemmShape)
+  typename WarpMmaOperator_,                ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp)
+  bool ReduceKForA_                                    ///< true: extents are taken from M; false: from N
+>
+class EpilogueGemmKReduction {
+
+public:
+
+  using ThreadBlockShape = ThreadBlockShape_;
+  using WarpMmaOperator = WarpMmaOperator_;
+  using WarpShape = typename WarpMmaOperator::Shape;
+  using Layout = layout::RowMajor;
+  using LongIndex = typename Layout::LongIndex;
+
+  /// Accumulator element
+  using ElementAccumulator = ElementAccumulator_;
+
+  /// Output element
+  using ElementOutput = ElementOutput_;
+
+  /// Output access size
+  static int const kElementsPerAccess = 1;
+  /// Selects whether the M (true) or N (false) extents size this epilogue
+  static bool const kReduceKForA = ReduceKForA_;
+  /// Extent of one threadblock tile along the selected dimension
+  static int const kThreadBlockSize = kReduceKForA ? ThreadBlockShape::kM : ThreadBlockShape::kN;
+  /// Extent of one warp tile along the selected dimension (not the number of lanes in a warp)
+  static int const kWarpSize = kReduceKForA ? WarpShape::kM : WarpShape::kN;
+  /// Number of partial sums held by each thread: one per 8 elements of the warp extent
+  static int const kIterations = kWarpSize / 8;
+  /// Per-thread fragment of reduction partial sums
+  using FragmentAccumulator = Array<ElementAccumulator, kIterations>;
+
+private:
+
+  int thread_offset_;          ///< Element offset of this thread's first output, relative to the base pointer
+  ElementOutput* pointer_;     ///< Base pointer advanced by thread_offset_
+  int col_;                    ///< lane_idx % 4; selects which partial sums this lane writes
+public:
+
+  /// Constructor
+  CUTLASS_DEVICE
+  EpilogueGemmKReduction(
+    int thread_idx,                   ///< ID of a thread within the threadblock (not used by this constructor)
+    int warp_idx,                     ///< ID of warp within threadblock
+    int lane_idx,                     ///< Id of thread within warp
+    int threadblock_offset,           ///< Threadblock index along the output vector; scaled by kThreadBlockSize
+    ElementOutput* pointer            ///< Base pointer of the output vector
+  )
+  {
+     col_ = lane_idx % 4;
+     thread_offset_ = threadblock_offset * kThreadBlockSize
+                    + warp_idx * kWarpSize 
+                    + lane_idx / 4 + col_ * 8;
+
+     pointer_ = pointer + LongIndex(thread_offset_);
+  }
+
+  /// Reduces across each 4-lane group, optionally adds the existing output, and stores to global memory
+  CUTLASS_DEVICE
+  void operator()(
+    int size,                                                   ///< Exclusive upper bound on valid output offsets
+    FragmentAccumulator &gemm_k_with_reduction_accumulation,    ///< This thread's partial sums (read only here)
+    bool LoadForSerialSplitK                                    ///< If true, existing output values are loaded and added
+  ) {
+      bool guard[kIterations / 4];  // NOTE(review): zero-length when kIterations < 4 - confirm supported warp shapes
+      // guard[i]: whether this thread's i-th output (32 elements apart) lies below `size`
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kIterations / 4; ++i) {
+        guard[i] = ((thread_offset_ + i * 32) < size);
+      }
+      // source starts zeroed; NOTE(review): presumably global_load leaves it untouched when its predicate is false
+      Array<ElementOutput, kIterations / 4> source;
+      source.clear();
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kIterations / 4; ++i) {
+        ElementOutput *source_ptr = reinterpret_cast<ElementOutput *>(&source);
+        cutlass::arch::global_load<ElementOutput, sizeof(ElementOutput)>(
+                                                  source_ptr[i],
+                                                  (void *)(pointer_ + i * 32),
+                                                  guard[i] && LoadForSerialSplitK);
+
+      }
+
+      FragmentAccumulator sum = gemm_k_with_reduction_accumulation;
+      // Butterfly sum over lanes differing in bits 0 and 1; the full mask requires all 32 lanes to call this
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kIterations; ++i) {
+        sum[i] += __shfl_xor_sync(0xffffffff, sum[i], 1);
+        sum[i] += __shfl_xor_sync(0xffffffff, sum[i], 2);
+      }
+
+      Array<ElementAccumulator, kIterations / 4> intermediate;
+      // Each lane keeps element col_ of every group of 4; presumably branched so sum[] is indexed by constants
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kIterations / 4; ++i) {
+        if (col_ == 0) {
+          intermediate[i] = sum[0 + i * 4];
+        }
+  
+        if (col_ == 1) {
+          intermediate[i] = sum[1 + i * 4];
+        }
+  
+        if (col_ == 2) {
+          intermediate[i] = sum[2 + i * 4];
+        }
+  
+        if (col_ == 3) {
+          intermediate[i] = sum[3 + i * 4];
+        }
+      }
+      // Add the loaded source (converted to the accumulator type), then convert the total to ElementOutput
+      NumericArrayConverter<ElementAccumulator, ElementOutput, kIterations / 4> source_converter;
+      Array<ElementAccumulator, kIterations / 4> converted_source = source_converter(source);
+
+      plus<Array<ElementAccumulator, kIterations / 4>> plus_source;
+      intermediate = plus_source(intermediate, converted_source);
+
+      NumericArrayConverter<ElementOutput, ElementAccumulator, kIterations / 4> converter;
+      Array<ElementOutput, kIterations / 4> result = converter(intermediate);
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kIterations / 4; ++i) {
+        cutlass::arch::global_store<ElementOutput, sizeof(ElementOutput)>(result[i], 
+                                                (void *)(pointer_ + i * 32), guard[i]);
+      }
+    }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_planar_complex.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_planar_complex.h
new file mode 100755
index 000000000..b294244cd
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_planar_complex.h
@@ -0,0 +1,401 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
+
+  The epilogue rearranges the result of a matrix product through shared memory to match canonical
+  tensor layouts in global memory. Epilogues support conversion and reduction operations.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/array_planar_complex.h"
+#include "cutlass/layout/vector.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/tensor_coord.h"
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/functional.h"
+
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/transform/pitch_linear_thread_map.h"
+#include "cutlass/transform/threadblock/regular_tile_iterator.h"
+
+#include "cutlass/epilogue/threadblock/epilogue_base.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Epilogue operator for planar-complex output representations.
+///
+/// Note, as with most CUTLASS components for planar complex, the template arguments describe
+/// the underlying real data type.
+template <
+  typename Shape_,                          ///< Shape of threadblock tile (concept: GemmShape)
+  typename WarpMmaOperator_,                ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp)
+  int PartitionsK,                          ///< Number of partitions of the K dimension
+  typename OutputTileIterator_,             ///< Tile iterator reading and writing output tensors
+  typename AccumulatorFragmentIterator_,    ///< Fragment iterator selecting accumulators
+  typename WarpTileIterator_,               ///< Warp-scoped tile iterator writing accumulators to SMEM
+  typename SharedLoadIterator_,             ///< Threadblock-scoped tile iterator loading from SMEM
+  typename OutputOp_,                       ///< Output operator
+  typename Padding_                         ///< Padding added to SMEM allocation to avoid bank conflicts (concept: MatrixShape)
+>
+class EpiloguePlanarComplex {
+public:
+  
+  using Shape = Shape_;
+  using WarpMmaOperator = WarpMmaOperator_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputTileIterator = OutputTileIterator_;
+  using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
+  using WarpTileIterator = WarpTileIterator_;
+  using SharedLoadIterator = SharedLoadIterator_;
+  using OutputOp = OutputOp_;
+  using Padding = Padding_;
+
+  /// Output layout is always row-major
+  using Layout = layout::RowMajor;
+  using LongIndex = typename Layout::LongIndex;
+
+  /// The complete warp-level accumulator tile
+  using AccumulatorTile = ArrayPlanarComplex<
+    typename WarpMmaOperator::FragmentC::Element, 
+    WarpMmaOperator::FragmentC::kElements
+  >;
+
+  /// Accumulator element
+  using ElementAccumulator = typename WarpTileIterator::Element;
+
+  /// Output element
+  using ElementOutput = typename OutputTileIterator::Element;
+
+  /// Output access size
+  static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
+
+  /// Tensor reference to destination tensor
+  using TensorRef = typename OutputTileIterator::TensorRef;
+
+  /// Tensor reference to sync tensor
+  using SyncTensorRef = typename cutlass::TensorRef<int, cutlass::layout::PackedVectorLayout>;
+
+  /// Const tensor reference to source tensor
+  using ConstTensorRef = typename OutputTileIterator::ConstTensorRef;
+
+  /// Array type used to output
+  using OutputAccessType = Array<
+    typename OutputTileIterator::Element, OutputTileIterator::kElementsPerAccess>;
+
+  /// Array type used by output functor
+  using AccumulatorAccessType = Array<typename WarpTileIterator::Element, OutputTileIterator::kElementsPerAccess>; 
+  
+  /// Shape of each warp-level operation
+  using WarpShape = typename WarpMmaOperator::Shape;
+
+  /// Number of warps
+  using WarpCount = gemm::GemmShape<
+    Shape::kM / WarpShape::kM,
+    Shape::kN / WarpShape::kN,
+    kPartitionsK
+  >;
+
+  /// Shared memory allocation
+  struct SharedStorage {
+
+    //
+    // Type definitions
+    //
+
+    /// Element type of shared memory
+    using Element = typename WarpTileIterator::Element;
+
+    /// Tensor reference to shared memory allocation
+    using TensorRef = typename WarpTileIterator::TensorRef;
+
+    /// Layout of shared memory allocation
+    using Layout = typename WarpTileIterator::Layout;
+    
+    /// Logical shape of the shared memory tile written to by all warps.
+    using Shape = MatrixShape<
+      WarpCount::kM * WarpTileIterator::Shape::kRow * WarpCount::kK,
+      WarpCount::kN * WarpTileIterator::Shape::kColumn
+    >;
+
+    /// Shape of the shared memory allocation for the epilogue    
+    using StorageShape = MatrixShape<
+      Shape::kRow + Padding::kRow, 
+      Shape::kColumn + Padding::kColumn
+    >;
+    /// Element offset from the real plane to the imaginary plane within the buffer
+    static int const kImaginaryStride = StorageShape::kCount;
+
+    //
+    // Data members
+    //
+    /// Backing buffer sized for two planes of StorageShape::kCount elements each
+    AlignedBuffer<Element, kImaginaryStride * 2> storage;
+
+    //
+    // Methods
+    //
+
+    /// Returns a pointer to the shared memory buffer
+    CUTLASS_DEVICE
+    Element *data() {
+      return storage.data();
+    }
+
+    /// Returns a tensor reference to the shared memory buffer
+    CUTLASS_DEVICE
+    TensorRef reference() {
+      return TensorRef(
+        storage.data(), 
+        Layout::packed({StorageShape::kRow, StorageShape::kColumn}));
+    }
+  };
+
+private:
+
+  //
+  // Data members
+  //
+  /// Reference to the shared storage object passed to the constructor
+  SharedStorage &shared_storage_;
+
+  /// Loads fragment from shared memory aligned with output tensor
+  SharedLoadIterator shared_load_iterator_;
+
+  /// Stores a warp's fragment of accumulators to SMEM
+  WarpTileIterator warp_tile_iterator_;
+
+public:
+
+  /// Constructor
+  CUTLASS_DEVICE
+  EpiloguePlanarComplex(
+    SharedStorage &shared_storage,    ///< Shared storage object    
+    int thread_idx,                   ///< ID of a thread within the threadblock
+    int warp_idx,                     ///< ID of warp within threadblock
+    int lane_idx                      ///< Id of thread within warp
+  ):
+    shared_storage_(shared_storage),
+    shared_load_iterator_(shared_storage.reference(), thread_idx),
+    warp_tile_iterator_(shared_storage.reference(), lane_idx) {
+
+    // Compute warp location within threadblock tile by mapping the warp_id to three coordinates:
+    //
+    //   _m: the warp's position within the threadblock along the M dimension
+    //   _n: the warp's position within the threadblock along the N dimension
+    //   _k: the warp's position within the threadblock along the K dimension
+
+    int warp_k = warp_idx / (WarpCount::kM * WarpCount::kN);
+    int warp_mn = warp_idx % (WarpCount::kM * WarpCount::kN);
+    int warp_m = warp_mn % WarpCount::kM;
+    int warp_n = warp_mn / WarpCount::kM;
+
+    MatrixCoord warp_offset{warp_k * WarpCount::kM + warp_m, warp_n};
+
+    warp_tile_iterator_.add_tile_offset(warp_offset);
+  }
+
+  /// Streams the result to global memory
+  CUTLASS_DEVICE
+  void operator()(
+    OutputOp const &output_op,                        ///< Output operator
+    OutputTileIterator destination_iterator_real,     ///< Tile iterator for destination (real part)
+    OutputTileIterator destination_iterator_imag,     ///< Tile iterator for destination (imaginary part)
+    AccumulatorTile const &accumulators,              ///< Complete warp-level accumulator tile
+    OutputTileIterator source_iterator_real,          ///< Tile iterator for source (real part)
+    OutputTileIterator source_iterator_imag) {        ///< Tile iterator for source (imaginary part)
+
+    typename OutputTileIterator::Fragment source_fragment_real;
+    typename OutputTileIterator::Fragment source_fragment_imag;
+    // NOTE(review): clear_mask() presumably disables the source loads, so the cleared fragments below stay zero
+    if (!output_op.is_source_needed()) {
+      source_iterator_real.clear_mask();
+      source_iterator_imag.clear_mask();
+    }
+
+    source_fragment_real.clear();
+    source_fragment_imag.clear();
+
+    //
+    // Iterator over warp-level accumulator fragment
+    //
+
+    AccumulatorFragmentIterator accum_fragment_iterator_real(accumulators.real);
+    AccumulatorFragmentIterator accum_fragment_iterator_imag(accumulators.imag);
+
+    //
+    // Iterate over accumulator tile
+    // 
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int iter = 0; iter < OutputTileIterator::kIterations; ++iter) {
+
+      //
+      // Load the source
+      //
+
+      source_iterator_real.load(source_fragment_real);
+      source_iterator_imag.load(source_fragment_imag);
+
+      ++source_iterator_real;
+      ++source_iterator_imag;
+
+      //
+      // Convert and store fragment
+      //
+      // Barrier: presumably keeps this iteration's stores from overwriting data the previous iteration still reads
+      __syncthreads();
+
+      typename AccumulatorFragmentIterator::Fragment accum_fragment_real;
+      typename AccumulatorFragmentIterator::Fragment accum_fragment_imag;
+
+      accum_fragment_iterator_real.load(accum_fragment_real);
+      accum_fragment_iterator_imag.load(accum_fragment_imag);
+      
+      ++accum_fragment_iterator_real;
+      ++accum_fragment_iterator_imag;
+
+      this->warp_tile_iterator_.store(accum_fragment_real);
+      this->warp_tile_iterator_.store_with_pointer_offset(accum_fragment_imag, SharedStorage::kImaginaryStride);
+      // Barrier between the warp-level stores above and the threadblock-level loads below
+      __syncthreads();
+
+      //
+      // Load fragments from shared memory
+      //
+
+      typename SharedLoadIterator::Fragment aligned_accum_fragment_real[kPartitionsK];
+      typename SharedLoadIterator::Fragment aligned_accum_fragment_imag[kPartitionsK];
+
+      shared_load_iterator_.load(aligned_accum_fragment_real[0]);
+      shared_load_iterator_.load_with_pointer_offset(aligned_accum_fragment_imag[0], SharedStorage::kImaginaryStride);
+
+      // Only a single k-slice is supported, so no reduction amongst k-slices is performed
+      static_assert(kPartitionsK  == 1, "Sliced-K not supported for planar complex at this time");
+    
+      //
+      // Compute the output result
+      //
+     
+      typename OutputTileIterator::Fragment output_fragment_real;
+      typename OutputTileIterator::Fragment output_fragment_imag;
+
+      apply_output_operator_(
+        output_fragment_real, 
+        output_fragment_imag, 
+        output_op, 
+        aligned_accum_fragment_real[0],
+        aligned_accum_fragment_imag[0], 
+        source_fragment_real,
+        source_fragment_imag);
+
+      //
+      // Store the final result
+      //
+
+      destination_iterator_real.store(output_fragment_real);
+      destination_iterator_imag.store(output_fragment_imag);
+
+      ++destination_iterator_real;
+      ++destination_iterator_imag;
+    }
+  }
+
+private:
+
+  /// Helper to invoke the output functor over each vector of output
+  CUTLASS_DEVICE
+  void apply_output_operator_(
+    typename OutputTileIterator::Fragment &output_fragment_real,
+    typename OutputTileIterator::Fragment &output_fragment_imag,
+    OutputOp const &output_op,                    ///< Output operator
+    typename SharedLoadIterator::Fragment const &aligned_accum_fragment_real,
+    typename SharedLoadIterator::Fragment const &aligned_accum_fragment_imag,
+    typename OutputTileIterator::Fragment const &source_fragment_real,
+    typename OutputTileIterator::Fragment const &source_fragment_imag) {
+
+    OutputAccessType *output_frag_real_ptr = 
+      reinterpret_cast<OutputAccessType *>(&output_fragment_real);
+
+    OutputAccessType *output_frag_imag_ptr = 
+      reinterpret_cast<OutputAccessType *>(&output_fragment_imag);
+
+    AccumulatorAccessType const *compute_frag_real_ptr = 
+      reinterpret_cast<AccumulatorAccessType const *>(&aligned_accum_fragment_real);
+
+    AccumulatorAccessType const *compute_frag_imag_ptr = 
+      reinterpret_cast<AccumulatorAccessType const *>(&aligned_accum_fragment_imag);
+
+    OutputAccessType const *source_frag_real_ptr = 
+      reinterpret_cast<OutputAccessType const *>(&source_fragment_real);
+
+    OutputAccessType const *source_frag_imag_ptr = 
+      reinterpret_cast<OutputAccessType const *>(&source_fragment_imag);
+    // Number of kElementsPerAccess-wide vectors in one output fragment
+    int const kOutputOpIterations = 
+      OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kOutputOpIterations; ++i) {
+
+      // Call the output operator
+      auto result_fragment = output_op(
+        make_ArrayPlanarComplex(compute_frag_real_ptr[i], compute_frag_imag_ptr[i]), 
+        make_ArrayPlanarComplex(source_frag_real_ptr[i], source_frag_imag_ptr[i])
+      );
+
+      output_frag_real_ptr[i] = result_fragment.real;
+      output_frag_imag_ptr[i] = result_fragment.imag;
+    }
+  }
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_smem_accumulator.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_smem_accumulator.h
new file mode 100755
index 000000000..2be1fa55a
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_smem_accumulator.h
@@ -0,0 +1,230 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue for threadblock scoped GEMM/CONV to store accumulator in shared memory after
+    applying scale, bias loaded from global memory and element-wise operations.
+
+    This Epilogue is typically used in fused GEMM/CONV to stage the intermediate accumulator.
+
+*/
+
+#pragma once
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/cassert>
+#else
+#include <assert.h>
+#endif
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/layout/vector.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/tensor_coord.h"
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/functional.h"
+
+#include "cutlass/epilogue/warp/fragment_iterator_tensor_op.h"
+#include "cutlass/epilogue/warp/tile_iterator_tensor_op.h"
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Epilogue that applies OutputOp to each slice of a warp-level accumulator tile and stores the results through SmemTileIterator
+template <
+  typename SmemTileIterator_,               ///< Shared memory Tile iterator to output to shared memory
+  typename AccumulatorFragmentIterator_,    ///< Fragment iterator selecting accumulators
+  typename ScaleBiasIterator_,              ///< Iterator to load scale and bias from global memory
+  typename OutputOp_                        ///< Output operator
+>
+class EpilogueSmemAccumulator {
+
+public:
+
+  using SmemTileIterator = SmemTileIterator_;
+
+  using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
+
+  using ScaleBiasIterator = ScaleBiasIterator_;
+
+  using OutputOp = OutputOp_;
+
+  /// Fragment of accumulator tile
+  using FragmentAccumulator = typename AccumulatorFragmentIterator::Fragment;
+
+  /// The complete warp-level accumulator tile
+  using AccumulatorTile = typename AccumulatorFragmentIterator::AccumulatorTile;
+
+  /// Fragment of Scale and Bias loaded from global memory
+  using FragmentScaleBias = typename ScaleBiasIterator::Fragment;
+  /// True when OutputOp::kScale selects per-channel alpha scaling; only then is the scale vector loaded
+  static const bool PerChannelScale = (OutputOp::kScale ==
+      epilogue::thread::ScaleType::OnlyAlphaPerChannelScaling);
+
+  /// Constructor
+  CUTLASS_DEVICE
+  EpilogueSmemAccumulator() {}
+
+  /// Streams the result to shared memory, combining each accumulator access with scale and bias via output_op
+  CUTLASS_DEVICE
+  void operator()(
+    OutputOp const &output_op,                    ///< Output operator
+    SmemTileIterator smem_iterator,               ///< Tile iterator for destination in shared memory
+    AccumulatorTile const &accumulator,          ///< Complete warp-level accumulator tile
+    ScaleBiasIterator scale_iterator,             ///< iterator for scale vector in global memory
+    ScaleBiasIterator bias_iterator) {            ///< iterator for bias vector in global memory
+ 
+  
+    // Fragments receiving the scale and bias vectors loaded from global memory
+    FragmentScaleBias tb_frag_scale;
+    FragmentScaleBias tb_frag_bias;
+      
+    /// Fragment Iterator to load slice of accumulator tile
+    AccumulatorFragmentIterator frag_iterator_accum(accumulator);
+    FragmentAccumulator tb_frag_accum;
+  
+    /// Epilogue output fragment
+    typename SmemTileIterator::Fragment tb_frag_smem;
+  
+    /// Load scale and bias from global memory
+    // (the scale vector is loaded only when per-channel scaling is selected; bias is always loaded)
+    if(PerChannelScale)
+        scale_iterator.load(tb_frag_scale);
+    // NOTE(review): when PerChannelScale is false, tb_frag_scale is never loaded yet is still handed to output_op below; presumably the operator ignores it in that mode - confirm
+    bias_iterator.load(tb_frag_bias);
+  
+    /// Iterate over the accumulator tile and store to shared memory
+    CUTLASS_PRAGMA_UNROLL
+    for (int rid = 0; rid < AccumulatorFragmentIterator::TileIterations::kRow; ++rid) {
+    
+      CUTLASS_PRAGMA_UNROLL
+      for (int cid = 0; cid < AccumulatorFragmentIterator::TileIterations::kColumn; ++cid) {
+  
+        using AccumulatorAccessType = typename OutputOp::FragmentAccumulator;
+        using ScaleBiasAccessType = typename OutputOp::FragmentScaleBias;
+        using FragmentSmemAccessType = typename OutputOp::FragmentOutput;
+  
+        // Reinterpret the thread-local fragments as arrays of the access types declared by the output operator
+        ScaleBiasAccessType const * scale_frag_ptr =
+          reinterpret_cast<ScaleBiasAccessType const *>(&tb_frag_scale);
+        ScaleBiasAccessType const * bias_frag_ptr =
+          reinterpret_cast<ScaleBiasAccessType const *>(&tb_frag_bias);
+   
+        FragmentSmemAccessType * smem_frag_ptr =  
+          reinterpret_cast<FragmentSmemAccessType *>(&tb_frag_smem);
+  
+        CUTLASS_PRAGMA_UNROLL
+        for (int idx = 0; idx < AccumulatorFragmentIterator::kIterationsPerTile; ++idx) {
+          frag_iterator_accum.load(tb_frag_accum);
+          ++frag_iterator_accum;
+          // View the just-loaded accumulators as an array of OutputOp::FragmentAccumulator
+          AccumulatorAccessType const * accumulator_frag_ptr = 
+            reinterpret_cast<AccumulatorAccessType const *>(&tb_frag_accum);
+          const int kOutputIterations = FragmentAccumulator::kElements / OutputOp::kCount;
+          // The output slot is selected by idx; the scale/bias access is selected by the column iteration cid
+          CUTLASS_PRAGMA_UNROLL
+          for (int it = 0; it < kOutputIterations; it++) {
+            smem_frag_ptr[idx * kOutputIterations + it] = output_op(accumulator_frag_ptr[it],
+                scale_frag_ptr[cid * kOutputIterations + it], bias_frag_ptr[cid * kOutputIterations + it]);
+          }
+        }
+        // Store the assembled fragment and advance the shared-memory iterator
+        smem_iterator.store(tb_frag_smem);
+        ++smem_iterator;
+  
+      }
+    }
+  }
+
+  /// Streams the result to shared memory, applying the single-argument output_op to each accumulator access (no scale or bias)
+  CUTLASS_DEVICE
+  void operator()(
+    OutputOp const &output_op,                    ///< Output operator
+    SmemTileIterator smem_iterator,               ///< Tile iterator for destination in shared memory
+    AccumulatorTile const &accumulator) {          ///< Complete warp-level accumulator tile
+ 
+    /// Fragment Iterator to load slice of accumulator tile
+    AccumulatorFragmentIterator frag_iterator_accum(accumulator);
+    FragmentAccumulator tb_frag_accum;
+  
+    /// Epilogue output fragment
+    typename SmemTileIterator::Fragment tb_frag_smem;
+  
+    /// Iterate over the accumulator tile and store to shared memory
+    CUTLASS_PRAGMA_UNROLL
+    for (int rid = 0; rid < AccumulatorFragmentIterator::TileIterations::kRow; ++rid) {
+    
+      CUTLASS_PRAGMA_UNROLL
+      for (int cid = 0; cid < AccumulatorFragmentIterator::TileIterations::kColumn; ++cid) {
+  
+        using AccumulatorAccessType = typename OutputOp::FragmentAccumulator;
+        using FragmentSmemAccessType = typename OutputOp::FragmentOutput;
+        // Reinterpret the output fragment as an array of OutputOp::FragmentOutput
+        FragmentSmemAccessType * smem_frag_ptr =  
+          reinterpret_cast<FragmentSmemAccessType *>(&tb_frag_smem);
+  
+        CUTLASS_PRAGMA_UNROLL
+        for (int idx = 0; idx < AccumulatorFragmentIterator::kIterationsPerTile; ++idx) {
+          frag_iterator_accum.load(tb_frag_accum);
+          ++frag_iterator_accum;
+          // View the just-loaded accumulators as an array of OutputOp::FragmentAccumulator
+          AccumulatorAccessType const * accumulator_frag_ptr = 
+            reinterpret_cast<AccumulatorAccessType const *>(&tb_frag_accum);
+          const int kOutputIterations = FragmentAccumulator::kElements / OutputOp::kCount;
+  
+          CUTLASS_PRAGMA_UNROLL
+          for (int it = 0; it < kOutputIterations; it++) {
+            smem_frag_ptr[idx * kOutputIterations + it] = output_op(accumulator_frag_ptr[it]);
+          }
+        }
+        // Store the assembled fragment and advance the shared-memory iterator
+        smem_iterator.store(tb_frag_smem);
+        ++smem_iterator;
+  
+      }
+    }
+  }
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
+ 
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_streamk_with_broadcast.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_streamk_with_broadcast.h
new file mode 100755
index 000000000..9efbee477
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_streamk_with_broadcast.h
@@ -0,0 +1,443 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+
+  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
+
+  The epilogue rearranges the result of a matrix product through shared memory to match canonical
+  tensor layouts in global memory. Epilogues support conversion and reduction operations.
+
+*/
+
+#pragma once
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/cassert>
+#include <cuda/std/utility>
+#else
+#include <assert.h>
+#include <utility>
+#endif
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/tensor_coord.h"
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/functional.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/layout/vector.h"
+#include "cutlass/layout/tensor.h"
+
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/transform/pitch_linear_thread_map.h"
+#include "cutlass/transform/threadblock/regular_tile_iterator.h"
+
+#include "cutlass/epilogue/threadblock/epilogue_base.h"
+#include "cutlass/epilogue/threadblock/epilogue_base_streamk.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
+
+#include "cutlass/numeric_types.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// This base class is meant to define the concept required of the
+/// EpilogueStreamkWithBroadcast::OutputOp. All template arguments are forwarded unchanged to EpilogueWithBroadcastOpBase.
+template <
+  typename ElementC_,
+  typename ElementAccumulator_,
+  typename ElementCompute_,
+  typename ElementZ_,
+  typename ElementT_,
+  int ElementsPerAccess,
+  bool StoreZ = true,
+  bool StoreT = true
+>
+struct EpilogueStreamkWithBroadcastOpBase : EpilogueWithBroadcastOpBase<
+                                            ElementC_,
+                                            ElementAccumulator_,
+                                            ElementCompute_,
+                                            ElementZ_,
+                                            ElementT_,
+                                            ElementsPerAccess,
+                                            StoreZ,
+                                            StoreT
+                                            > 
+{
+
+  /// Parameters structure - required (declared empty here)
+  struct Params { };
+
+  //
+  // Methods
+  //
+  // NOTE(review): no base initializer is given, so the base sub-object is default-initialized - confirm the base permits that
+  /// Constructor from Params (params_ is unused)
+  EpilogueStreamkWithBroadcastOpBase(Params const &params_) { }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Epilogue operator with bias vector broadcast over columns.
+/// Primary template, declared only; the partial specializations below are selected by IsSingleSource.
+///
+/// Computes the following:
+///
+///  Z, T = OutputOp(AB, C, Broadcast)
+///
+///  if (ElementwiseOp::kStoreZ) {
+///    store(Z);
+///  }
+///
+///  if (ElementwiseOp::kStoreT) {
+///    store(T);
+///  }
+///
+template <
+  typename Shape_,                          ///< Shape of threadblock tile (concept: GemmShape)
+  typename WarpMmaOperator_,                ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp)
+  int PartitionsK,                          ///< Number of partitions of the K dimension
+  typename OutputTileIterator_,             ///< Tile iterator reading and writing output tensors (z)
+  typename TensorTileIterator_,             ///< Additional tile iterator for tensor-valued operands (t)
+  typename ElementVector_,                  ///< Element type of the broadcast vector
+  typename AccumulatorFragmentIterator_,    ///< Fragment iterator selecting accumulators
+  typename WarpTileIterator_,               ///< Warp-scoped tile iterator writing accumulators to SMEM
+  typename SharedLoadIterator_,             ///< Threadblock-scoped tile iterator loading from SMEM
+  typename OutputOp_,                       ///< Output operator - concept is EpilogueWithBroadcastOp
+  typename Padding_,                        ///< Padding added to SMEM allocation to avoid bank conflicts (concept: MatrixShape)
+  int FragmentsPerPartition = 1,            ///< Used to coarsen the epilogue granularity
+  int IterationsUnroll =                    ///< Used to reduce binary size when epilogue op is large
+    (!IsEpilogueFunctorHeavy<OutputOp_>::value),
+  bool IsSingleSource = OutputOp_::kIsSingleSource  ///< Selects the single-source (true) or two-source (false) specialization
+>
+class EpilogueStreamkWithBroadcast;
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// EpilogueStreamkWithBroadcast: Two sources (partial specialization for IsSingleSource == false)
+
+template <
+  typename Shape_,
+  typename WarpMmaOperator_,
+  int PartitionsK,
+  typename OutputTileIterator_,
+  typename TensorTileIterator_,
+  typename ElementVector_,
+  typename AccumulatorFragmentIterator_,
+  typename WarpTileIterator_,
+  typename SharedLoadIterator_,
+  typename OutputOp_,
+  typename Padding_,
+  int FragmentsPerPartition,
+  int IterationsUnroll
+>
+class EpilogueStreamkWithBroadcast<
+  Shape_,
+  WarpMmaOperator_,
+  PartitionsK,
+  OutputTileIterator_,
+  TensorTileIterator_,
+  ElementVector_,
+  AccumulatorFragmentIterator_,
+  WarpTileIterator_,
+  SharedLoadIterator_,
+  OutputOp_,
+  Padding_,
+  FragmentsPerPartition,
+  IterationsUnroll,
+  false
+> : 
+  public EpilogueWithBroadcast<
+    Shape_,
+    WarpMmaOperator_,
+    PartitionsK,
+    OutputTileIterator_,
+    TensorTileIterator_,
+    ElementVector_,
+    AccumulatorFragmentIterator_,
+    WarpTileIterator_,
+    SharedLoadIterator_,
+    OutputOp_,
+    Padding_,
+    FragmentsPerPartition,
+    IterationsUnroll,
+    false>,
+  public EpilogueBaseStreamK<
+    Shape_,
+    PartitionsK,
+    WarpMmaOperator_,
+    AccumulatorFragmentIterator_>
+{
+
+public:
+  /// Two-source EpilogueWithBroadcast base, instantiated with the same template arguments
+  using Base = EpilogueWithBroadcast<
+    Shape_,
+    WarpMmaOperator_,
+    PartitionsK,
+    OutputTileIterator_,
+    TensorTileIterator_,
+    ElementVector_,
+    AccumulatorFragmentIterator_,
+    WarpTileIterator_,
+    SharedLoadIterator_,
+    OutputOp_,
+    Padding_,
+    FragmentsPerPartition,
+    IterationsUnroll,
+    false>;
+  /// Stream-K base whose reduce() is used below to combine peer accumulator fragments
+  using BaseStreamK = EpilogueBaseStreamK<
+    Shape_,
+    PartitionsK,
+    WarpMmaOperator_,
+    AccumulatorFragmentIterator_>;
+
+  using Shape = Shape_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputTileIterator = OutputTileIterator_;
+  using TensorTileIterator = TensorTileIterator_;
+  using ElementVector = ElementVector_;
+  using SharedLoadIterator = SharedLoadIterator_;
+  using OutputOp = OutputOp_;
+
+  /// Fragment type used by the accumulator tile's fragment iterator
+  using AccumulatorFragment = typename Base::AccumulatorFragmentIterator::Fragment;
+
+  /// Shared storage structure, taken unchanged from the EpilogueWithBroadcast base
+  using SharedStorage = typename Base::SharedStorage;
+
+public:
+
+  /// Constructor: forwards all arguments to Base; BaseStreamK receives only thread_idx
+  CUTLASS_DEVICE
+  EpilogueStreamkWithBroadcast(
+    SharedStorage &shared_storage,                    ///< Shared storage object    
+    int thread_idx,                                   ///< ID of a thread within the threadblock
+    int warp_idx,                                     ///< ID of warp within threadblock
+    int lane_idx                                      ///< Id of thread within warp
+  ):
+    Base(shared_storage, thread_idx, warp_idx, lane_idx),
+    BaseStreamK(thread_idx)
+  { }
+
+
+  /// Aggregates the accumulator sets shared by peer blocks in the global workspace,
+  /// performing epilogue computations, writing to output
+  CUTLASS_DEVICE
+  void reduce(
+      int peer_idx_begin,                             ///< Forwarded to BaseStreamK::reduce; presumably the first peer to aggregate - confirm
+      int peer_idx_end,                               ///< Forwarded to BaseStreamK::reduce; presumably the end of the peer range - confirm
+      int reduce_fragment_idx,                        ///< Fragment index, passed to both BaseStreamK::reduce and Base::reduce
+      void *element_workspace,                        ///< Workspace pointer forwarded to BaseStreamK::reduce
+      OutputOp const &output_op,                      ///< Output operator
+      ElementVector const * broadcast_ptr,            ///< Broadcast vector
+      OutputTileIterator destination_iterator,        ///< Tile iterator for destination
+      OutputTileIterator source_iterator1,            ///< Tile iterator for first  source accumulator matrix
+      OutputTileIterator source_iterator2,            ///< Tile iterator for second source accumulator matrix
+      TensorTileIterator tensor_iterator,             ///< Threadblock tile iterator for additional tensor operand
+      MatrixCoord const &problem_size =               ///< Problem size needed to guard against out-of-bounds accesses
+          MatrixCoord(Shape::kM, Shape::kN),
+      MatrixCoord const &threadblock_offset =         ///< Threadblock's initial offset within the problem size space
+          MatrixCoord()) 
+  {
+    // Reduce peer accumulator fragments into one fragment
+    AccumulatorFragment accum_fragment;
+    BaseStreamK::reduce(accum_fragment, peer_idx_begin, peer_idx_end, reduce_fragment_idx, element_workspace);
+
+    // Store fragment to shared memory
+    this->warp_tile_iterator_.store(accum_fragment);
+    // Block-wide barrier: all threads finish their stores before the base epilogue continues
+    __syncthreads();
+    // Delegate the remaining epilogue work (two source iterators) to the EpilogueWithBroadcast base
+    Base::reduce(reduce_fragment_idx, output_op, broadcast_ptr, destination_iterator, source_iterator1, source_iterator2, tensor_iterator, problem_size, threadblock_offset);
+    
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// EpilogueStreamkWithBroadcast: Single source (partial specialization for IsSingleSource == true)
+
+template <
+  typename Shape_,
+  typename WarpMmaOperator_,
+  int PartitionsK,
+  typename OutputTileIterator_,
+  typename TensorTileIterator_,
+  typename ElementVector_,
+  typename AccumulatorFragmentIterator_,
+  typename WarpTileIterator_,
+  typename SharedLoadIterator_,
+  typename OutputOp_,
+  typename Padding_,
+  int FragmentsPerPartition,
+  int IterationsUnroll
+>
+class EpilogueStreamkWithBroadcast<
+  Shape_,
+  WarpMmaOperator_,
+  PartitionsK,
+  OutputTileIterator_,
+  TensorTileIterator_,
+  ElementVector_,
+  AccumulatorFragmentIterator_,
+  WarpTileIterator_,
+  SharedLoadIterator_,
+  OutputOp_,
+  Padding_,
+  FragmentsPerPartition,
+  IterationsUnroll,
+  true
+> : 
+  public EpilogueWithBroadcast<
+    Shape_,
+    WarpMmaOperator_,
+    PartitionsK,
+    OutputTileIterator_,
+    TensorTileIterator_,
+    ElementVector_,
+    AccumulatorFragmentIterator_,
+    WarpTileIterator_,
+    SharedLoadIterator_,
+    OutputOp_,
+    Padding_,
+    FragmentsPerPartition,
+    IterationsUnroll,
+    true>,
+  public EpilogueBaseStreamK<
+    Shape_,
+    PartitionsK,
+    WarpMmaOperator_,
+    AccumulatorFragmentIterator_>
+{
+
+public:
+  /// Single-source EpilogueWithBroadcast base, instantiated with the same template arguments
+  using Base = EpilogueWithBroadcast<
+    Shape_,
+    WarpMmaOperator_,
+    PartitionsK,
+    OutputTileIterator_,
+    TensorTileIterator_,
+    ElementVector_,
+    AccumulatorFragmentIterator_,
+    WarpTileIterator_,
+    SharedLoadIterator_,
+    OutputOp_,
+    Padding_,
+    FragmentsPerPartition,
+    IterationsUnroll,
+    true>;
+  /// Stream-K base whose reduce() is used below to combine peer accumulator fragments
+  using BaseStreamK = EpilogueBaseStreamK<
+    Shape_,
+    PartitionsK,
+    WarpMmaOperator_,
+    AccumulatorFragmentIterator_>;
+
+  using Shape = Shape_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputTileIterator = OutputTileIterator_;
+  using TensorTileIterator = TensorTileIterator_;
+  using ElementVector = ElementVector_;
+  using SharedLoadIterator = SharedLoadIterator_;
+  using OutputOp = OutputOp_;
+
+  /// Fragment type used by the accumulator tile's fragment iterator
+  using AccumulatorFragment = typename Base::AccumulatorFragmentIterator::Fragment;
+
+  /// Shared storage structure, taken unchanged from the EpilogueWithBroadcast base
+  using SharedStorage = typename Base::SharedStorage;
+
+public:
+
+  /// Constructor: forwards all arguments to Base; BaseStreamK receives only thread_idx
+  CUTLASS_DEVICE
+  EpilogueStreamkWithBroadcast(
+    SharedStorage &shared_storage,                    ///< Shared storage object    
+    int thread_idx,                                   ///< ID of a thread within the threadblock
+    int warp_idx,                                     ///< ID of warp within threadblock
+    int lane_idx                                      ///< Id of thread within warp
+  ):
+    Base(shared_storage, thread_idx, warp_idx, lane_idx),
+    BaseStreamK(thread_idx)
+  { }
+
+
+  /// Aggregates the accumulator sets shared by peer blocks in the global workspace,
+  /// performing epilogue computations, writing to output
+  CUTLASS_DEVICE
+  void reduce(
+      int peer_idx_begin,                             ///< Forwarded to BaseStreamK::reduce; presumably the first peer to aggregate - confirm
+      int peer_idx_end,                               ///< Forwarded to BaseStreamK::reduce; presumably the end of the peer range - confirm
+      int reduce_fragment_idx,                        ///< Fragment index, passed to both BaseStreamK::reduce and Base::reduce
+      void *element_workspace,                        ///< Workspace pointer forwarded to BaseStreamK::reduce
+      OutputOp const &output_op,                      ///< Output operator
+      ElementVector const * broadcast_ptr,            ///< Broadcast vector
+      OutputTileIterator destination_iterator,        ///< Tile iterator for destination
+      OutputTileIterator source_iterator,             ///< Tile iterator for the source accumulator matrix
+      TensorTileIterator tensor_iterator,             ///< Threadblock tile iterator for additional tensor operand
+      MatrixCoord const &problem_size =               ///< Problem size needed to guard against out-of-bounds accesses
+          MatrixCoord(Shape::kM, Shape::kN),
+      MatrixCoord const &threadblock_offset =         ///< Threadblock's initial offset within the problem size space
+          MatrixCoord()) 
+  {
+    // Reduce peer accumulator fragments into one fragment
+    AccumulatorFragment accum_fragment;
+    BaseStreamK::reduce(accum_fragment, peer_idx_begin, peer_idx_end, reduce_fragment_idx, element_workspace);
+
+    // Store fragment to shared memory
+    this->warp_tile_iterator_.store(accum_fragment);
+    // Block-wide barrier: all threads finish their stores before the base epilogue continues
+    __syncthreads();
+    // Delegate the remaining epilogue work (single source iterator) to the EpilogueWithBroadcast base
+    Base::reduce(reduce_fragment_idx, output_op, broadcast_ptr, destination_iterator, source_iterator, tensor_iterator, problem_size, threadblock_offset);
+    
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_visitor_with_softmax.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_visitor_with_softmax.h
new file mode 100755
index 000000000..8202284b6
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_visitor_with_softmax.h
@@ -0,0 +1,513 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue visitor for threadblock scoped GEMMs that process softmax computations in epilogue.
+
+  The epilogue finds max values in each row of the row-major output matrix and stores them.
+  The max values are also used for a further round of threadblock scoped reduction operation, where
+  the partial reduction results are stored in a pre-allocated array and used for further full reduction.
+
+*/
+
+#pragma once
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "cutlass/cutlass.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/arch/memory_sm75.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/fast_math.h"
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+template <
+  typename ThreadblockShape_,
+  int ThreadCount,
+  typename OutputTileIterator_,
+  typename ElementAccumulator_,
+  typename ElementNorm_,
+  typename ElementSum_,
+  typename ElementSoftmaxCompute_,
+  typename ElementwiseFunctor_,
+  bool UseMasking_ = false
+>
+class EpilogueVisitorSoftmax {
+public:
+
+  using ThreadblockShape   = ThreadblockShape_;
+  static int const kThreadCount = ThreadCount;
+
+  using OutputTileIterator = OutputTileIterator_;
+  using ElementwiseFunctor = ElementwiseFunctor_;
+
+  static int const kIterations = OutputTileIterator::kIterations;
+  static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
+
+  using ElementOutput = typename OutputTileIterator::Element;
+  using LayoutOutput = cutlass::layout::RowMajor;
+  using ElementAccumulator = ElementAccumulator_;
+
+  using ElementNorm = ElementNorm_;
+  using ElementSum = ElementSum_;
+  using ElementSoftmaxCompute = ElementSoftmaxCompute_;
+
+  using AccumulatorFragment = Array<ElementAccumulator, kElementsPerAccess>;
+  using SoftmaxFragment = Array<ElementSoftmaxCompute, kElementsPerAccess>;
+  using OutputVector = Array<ElementOutput, kElementsPerAccess>;
+  using TensorRefD = TensorRef<ElementOutput, LayoutOutput>;
+
+  static int const kThreadsPerRow = OutputTileIterator::ThreadMap::Detail::kAccessWidth;
+  static bool const kHasMultiStepsInRow = (OutputTileIterator::ThreadMap::Iterations::kColumn > 1);
+  static bool const kUseMasking = UseMasking_;
+
+  /// Argument structure
+  struct Arguments {
+
+    typename ElementwiseFunctor::Params   elementwise;
+    int64_t                               batch_stride_C;
+    int64_t                               batch_stride_D;
+    int64_t                               batch_stride_Max;
+    int64_t                               batch_stride_Sum;
+
+    //
+    // Methods
+    //
+    Arguments():
+      batch_stride_C(0),
+      batch_stride_D(0),
+      batch_stride_Max(0),
+      batch_stride_Sum(0)
+    {
+
+    }
+
+    Arguments(
+      typename ElementwiseFunctor::Params   elementwise_
+    ):
+      elementwise(elementwise_),
+      batch_stride_C(0),
+      batch_stride_D(0),
+      batch_stride_Max(0),
+      batch_stride_Sum(0)
+    {
+
+    }
+
+    Arguments(
+      typename ElementwiseFunctor::Params   elementwise_,
+      int64_t                               batch_stride_C_,
+      int64_t                               batch_stride_D_,
+      int64_t                               batch_stride_Max_,
+      int64_t                               batch_stride_Sum_
+    ):
+      elementwise(elementwise_),
+      batch_stride_C(batch_stride_C_),
+      batch_stride_D(batch_stride_D_),
+      batch_stride_Max(batch_stride_Max_),
+      batch_stride_Sum(batch_stride_Sum_)
+    {
+
+    }
+
+  };
+
+  /// Parameters read by the visitor; a field-by-field copy of Arguments.
+  struct Params {
+
+    typename ElementwiseFunctor::Params   elementwise;        ///< Parameters of the elementwise functor
+    int64_t                               batch_stride_C;     ///< Multiplied by the batch index to offset iterator C
+    int64_t                               batch_stride_D;     ///< Multiplied by the batch index to offset iterator D
+    int64_t                               batch_stride_Max;   ///< Multiplied by blockIdx.z to offset the Max pointer
+    int64_t                               batch_stride_Sum;   ///< Multiplied by blockIdx.z to offset the Sum pointer
+    //
+    // Methods
+    //
+
+    /// Default constructor: `elementwise` is default-initialized and every batch stride is
+    /// zero, matching what a default-constructed Arguments holds.
+    CUTLASS_HOST_DEVICE
+    Params():
+      batch_stride_C(0),
+      batch_stride_D(0),
+      batch_stride_Max(0),
+      batch_stride_Sum(0)
+    {
+
+    }
+
+    /// Copies every field of `args`.
+    CUTLASS_HOST_DEVICE
+    Params(Arguments const &args):
+      elementwise(args.elementwise),
+      batch_stride_C(args.batch_stride_C),
+      batch_stride_D(args.batch_stride_D),
+      batch_stride_Max(args.batch_stride_Max),
+      batch_stride_Sum(args.batch_stride_Sum)
+    {
+
+    }
+  };
+
+  /// Shared storage
+  ///
+  /// Declares no members. The visitor's constructor still takes a SharedStorage reference
+  /// and keeps it in shared_storage_.
+  struct SharedStorage {
+
+  };
+
+private:
+
+  // Data members. C++ initializes them in the order they are declared here, independent of
+  // the order used in a constructor's initializer list.
+
+  Params const &                        params_;            // reference to the Params given to the constructor (not a copy)
+  SharedStorage &                       shared_storage_;    // reference to the SharedStorage given to the constructor
+  MatrixCoord                           extent_;            // problem_size; bounds the row/column guards
+  MatrixCoord                           extent_real_;       // problem_size_real; column bound used when kUseMasking is set
+  ElementwiseFunctor                    elementwise_;       // functor built from params.elementwise
+
+  OutputTileIterator                    iterator_C_;        // loads the source tile in begin_step()
+  OutputTileIterator                    iterator_D_;        // stores the destination tile in end_step()
+  typename OutputTileIterator::Fragment fragment_C_;        // source fragment of the current step
+  typename OutputTileIterator::Fragment fragment_D_;        // destination fragment of the current step
+
+  ElementAccumulator                    alpha_;             // resolved in the constructor body from alpha_ptr or alpha
+  ElementAccumulator                    beta_;              // resolved in the constructor body from beta_ptr or beta
+
+  ElementNorm                           *ptr_Max_;          // destination of the per-row maximum (constructor default: nullptr)
+  ElementSum                            *ptr_Sum_;          // destination of the per-row sum (constructor default: nullptr)
+
+  int                                   column_offset_;     // added to the row index when addressing Max/Sum in end_row()
+
+  ElementSoftmaxCompute                 accum_max_;         // running row maximum; reset by clear_accum_()
+  ElementSoftmaxCompute                 accum_sum_;         // running row sum; reset by clear_accum_()
+
+  MatrixCoord                           thread_offset_;     // coordinate computed in visit() and read again in end_row()
+
+  float                                 infinity_;          // masked elements are overwritten with -infinity_
+
+public:
+
+  /// Constructs the visitor.
+  ///
+  /// \param params              visitor parameters; kept by reference, so the object must outlive the visitor
+  /// \param shared_storage      shared storage object; kept by reference
+  /// \param problem_size        extent handed to both tile iterators and used for the row/column guards
+  /// \param thread_idx          thread index handed to both tile iterators
+  /// \param warp_idx            not used by this constructor
+  /// \param lane_idx            not used by this constructor
+  /// \param params_C, params_D  iterator parameters for the source and destination tiles
+  /// \param ptr_C, ptr_D        base pointers for the source and destination tiles
+  /// \param ptr_Max, ptr_Sum    destinations of the per-row maximum and sum (default: nullptr)
+  /// \param threadblock_offset  offset handed to both tile iterators
+  /// \param column_offset       added to the row index when addressing Max/Sum
+  /// \param problem_size_real   extent whose column bound is used for masking
+  /// \param infinity            magnitude written (negated) into masked elements
+  CUTLASS_DEVICE
+  EpilogueVisitorSoftmax(
+    Params const &params,
+    SharedStorage &shared_storage,
+    cutlass::MatrixCoord const &problem_size,
+    int thread_idx,
+    int warp_idx,
+    int lane_idx,
+    typename OutputTileIterator::Params params_C,
+    typename OutputTileIterator::Params params_D,
+    typename OutputTileIterator::Element *ptr_C,
+    typename OutputTileIterator::Element *ptr_D,
+    ElementNorm *ptr_Max = nullptr,
+    ElementSum *ptr_Sum = nullptr,
+    cutlass::MatrixCoord const &threadblock_offset = cutlass::MatrixCoord(0, 0),
+    int column_offset = 0,
+    cutlass::MatrixCoord const &problem_size_real = cutlass::MatrixCoord(0, 0),
+    float infinity = 10000.0f
+  ):
+    // Listed in member declaration order, which is the order the initializers actually run in.
+    params_(params),
+    shared_storage_(shared_storage),
+    extent_(problem_size),
+    extent_real_(problem_size_real),
+    elementwise_(params.elementwise),
+    iterator_C_(params_C, ptr_C, problem_size, thread_idx, threadblock_offset),
+    iterator_D_(params_D, ptr_D, problem_size, thread_idx, threadblock_offset),
+    ptr_Max_(ptr_Max),
+    ptr_Sum_(ptr_Sum),
+    column_offset_(column_offset),
+    infinity_(infinity)
+  {
+    // A non-null alpha_ptr / beta_ptr takes precedence over the value stored in the params.
+    alpha_ = (params.elementwise.alpha_ptr ? *params.elementwise.alpha_ptr : params.elementwise.alpha);
+    beta_ =  (params.elementwise.beta_ptr ? *params.elementwise.beta_ptr : params.elementwise.beta);
+
+    // With beta equal to zero the source iterator's mask is cleared.
+    if (beta_ == ElementAccumulator()) {
+      iterator_C_.clear_mask();
+    }
+  }
+
+  /// Helper to indicate split-K behavior
+  ///
+  /// The body is empty: both arguments are ignored by this visitor.
+  CUTLASS_DEVICE
+  void set_k_partition(
+    int split_k_index,                                            ///< Index of this threadblock within split-K partitioned scheme
+    int split_k_slices) {                                         ///< Total number of split-K slices
+
+  }
+
+  /// Called to set the batch index
+  ///
+  /// Shifts the C and D iterators by `batch_idx` times their respective batch strides.
+  CUTLASS_DEVICE
+  void set_batch_index(int batch_idx) {
+    auto const pointer_offset_C = batch_idx * params_.batch_stride_C;
+    auto const pointer_offset_D = batch_idx * params_.batch_stride_D;
+
+    iterator_C_.add_pointer_offset(pointer_offset_C);
+    iterator_D_.add_pointer_offset(pointer_offset_D);
+  }
+
+  /// Called at the start of the epilogue just before iterating over accumulator slices
+  ///
+  /// The body is empty: this visitor does nothing at this point.
+  CUTLASS_DEVICE
+  void begin_epilogue() {
+
+  }
+
+  /// Called at the start of one step before starting accumulator exchange
+  ///
+  /// Clears both fragments and, unless the functor is alpha-only, loads the next source
+  /// fragment from C and advances the C iterator. `step_idx` is not used.
+  CUTLASS_DEVICE
+  void begin_step(int step_idx) {
+    fragment_C_.clear();
+    fragment_D_.clear();
+
+    bool const alpha_only =
+      (elementwise_.kScale == cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling);
+
+    // Alpha-only scaling never reads the source: leave fragment_C_ cleared and C untouched.
+    if (alpha_only) {
+      return;
+    }
+
+    iterator_C_.load(fragment_C_);
+    ++iterator_C_;
+  }
+
+  /// Called at the start of a row
+  ///
+  /// Resets the running maximum and sum via clear_accum_(). `row_idx` is not used.
+  CUTLASS_DEVICE
+  void begin_row(int row_idx) {
+    // Clear accumulators for max and sum when starting a whole row
+    clear_accum_();
+
+  }
+
+  /// Called after accumulators have been exchanged for each accumulator vector
+  ///
+  /// Applies the elementwise functor to `accum`, optionally overwrites out-of-range columns
+  /// with -infinity_, updates the running row maximum (accum_max_) and running row sum of
+  /// exponentials (accum_sum_), and writes the elementwise result into fragment_D_.
+  /// The value written to fragment_D_ is the functor output, not the exponentiated value.
+  /// `iter_idx` and `row_idx` are not read.
+  CUTLASS_DEVICE
+  void visit(
+    int iter_idx,
+    int row_idx,
+    int column_idx,
+    int frag_idx,
+    AccumulatorFragment const &accum) {
+
+    // NOTE(review): Mul is declared but never used in this function.
+    using Mul = cutlass::multiplies<SoftmaxFragment>;
+    using Minus = cutlass::minus<SoftmaxFragment>;
+    using Exp   = cutlass::fast_exp_op<SoftmaxFragment>;
+
+    Minus     minus;
+    Exp       exponential;
+
+    SoftmaxFragment result;
+
+    // Converts the functor output (ElementOutput) to the softmax compute type.
+    NumericArrayConverter<ElementSoftmaxCompute, ElementOutput, kElementsPerAccess> source_converter;
+    // View fragment_C_ as an array of OutputVector and pick the vector for this access.
+    OutputVector &source_vector = reinterpret_cast<OutputVector *>(&fragment_C_)[frag_idx];
+
+    // Alpha-only scaling calls the functor without the source operand.
+    if (elementwise_.kScale == cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling) {
+      result = source_converter(elementwise_(accum));
+    }else{
+      result = source_converter(elementwise_(accum, source_vector));
+    }
+
+    // Coordinate of this access; end_row() reads thread_offset_ again after the last visit().
+    thread_offset_ =
+      iterator_D_.thread_start() +
+      OutputTileIterator::ThreadMap::iteration_offset(frag_idx);
+
+    // True when the first column of this access lies inside extent_.
+    bool column_guard = (thread_offset_.column() < extent_.column());
+
+    if (kUseMasking) {
+      // Number of elements of this access that lie inside extent_real_, capped at the access
+      // width. The value can be zero or negative, in which case every element is padded.
+      int elements_in_boundary = extent_real_.column() - thread_offset_.column();
+      elements_in_boundary = (elements_in_boundary > kElementsPerAccess) ? kElementsPerAccess : elements_in_boundary;
+      elementwise_padding_(result, elements_in_boundary);
+    }
+
+    // Remember the maximum before this fragment so the running sum can be rescaled below.
+    ElementSoftmaxCompute accum_max_prev = accum_max_;
+
+    // Compute the maximum within one row
+    if (!column_idx) {
+      // This is the first fragment in a new row
+      if (column_guard) {
+        accum_max_ = maximum_accumulator_(result);
+      }
+    }
+    else {
+      // This is an additional fragment in the same row
+      if (column_guard) {
+        accum_max_ = maximum_accumulator_(result, accum_max_);
+      }
+    }
+
+    // proactively compute max in warps
+    // (executed unconditionally, outside the column_guard branches above)
+    accum_max_ = warp_reduce_max_(accum_max_);
+
+    // Factor exp(previous max - new max) used to rescale the sum accumulated so far.
+    ElementSoftmaxCompute updater = fast_exp(accum_max_prev - accum_max_);
+
+    // exp(result - accum_max_), element by element.
+    SoftmaxFragment intermediate = exponential(minus(result, accum_max_));
+
+    if (kHasMultiStepsInRow) {
+      if (!column_idx) {
+        accum_sum_ = (column_guard) ? \
+          sum_accumulator_(intermediate) : ElementSoftmaxCompute(0);
+      } else {
+        // Algorithm in $3.1, https://arxiv.org/pdf/2205.14135v1.pdf
+        // S* = S* x updater + sum_row(P'), where updater = exp(M* - M_row)
+        accum_sum_ = (column_guard) ? \
+          sum_accumulator_(intermediate, accum_sum_ * updater) : accum_sum_ * updater;
+      }
+    } else {
+      // NOTE(review): when column_guard is false this resets accum_sum_ to zero rather than
+      // keeping the value accumulated so far, and `updater` is not applied in this branch.
+      // Confirm that this is intended for the single-step configuration.
+      accum_sum_ = (column_guard) ? sum_accumulator_(intermediate, accum_sum_) : ElementSoftmaxCompute(0);
+    }
+
+    // Convert to the output
+    // (stores `result`, i.e. the functor output after optional masking, not `intermediate`)
+    NumericArrayConverter<ElementOutput, ElementSoftmaxCompute, kElementsPerAccess> output_converter;
+    OutputVector &output = reinterpret_cast<OutputVector *>(&fragment_D_)[frag_idx];
+    output = output_converter(result);
+  }
+
+  /// Called at the end of a row
+  ///
+  /// Combines the running sum across the kThreadsPerRow threads that share a row, then has
+  /// the first of those threads store the row maximum and the row sum to global memory when
+  /// the row lies inside extent_. A store is skipped when its base pointer (ptr_Max_ or
+  /// ptr_Sum_) is null, which is the constructor's default for both. Finally the running
+  /// accumulators are reset. `row_idx` is not used; the row coordinate comes from the
+  /// thread_offset_ left behind by the last visit() call.
+  CUTLASS_DEVICE
+  void end_row(int row_idx) {
+
+    using ConvertSumOutput = cutlass::NumericConverter<ElementSum, ElementSoftmaxCompute>;
+    using ConvertNormOutput = cutlass::NumericConverter<ElementNorm, ElementSoftmaxCompute>;
+
+    ConvertSumOutput   convert_sum_output;
+    ConvertNormOutput  convert_norm_output;
+
+    // Compute accumulate sum only in the last step
+    // (the maximum was already reduced across threads inside visit())
+    accum_sum_ = warp_reduce_sum_(accum_sum_);
+
+    bool is_first_thread_in_tile = ((threadIdx.x % kThreadsPerRow) == 0);
+    bool row_guard = thread_offset_.row() < extent_.row();
+    bool is_write_thread = row_guard && is_first_thread_in_tile;
+
+    // Never store through a null base pointer: both outputs are optional in the constructor.
+    bool store_max = is_write_thread && (ptr_Max_ != nullptr);
+    bool store_sum = is_write_thread && (ptr_Sum_ != nullptr);
+
+    int block_batch = blockIdx.z;
+
+    // Element index = row + column_offset_ + batch * batch stride. The addresses are only
+    // dereferenced when the matching predicate below is true.
+    ElementNorm *curr_ptr_max = ptr_Max_ + thread_offset_.row() + column_offset_ + block_batch * params_.batch_stride_Max;
+    ElementSum *curr_ptr_sum = ptr_Sum_ + thread_offset_.row() + column_offset_ + block_batch * params_.batch_stride_Sum;
+
+    arch::global_store<ElementNorm, sizeof(ElementNorm)>(
+              convert_norm_output(accum_max_),
+              (void *)curr_ptr_max,
+              store_max);
+
+    arch::global_store<ElementSum, sizeof(ElementSum)>(
+              convert_sum_output(accum_sum_),
+              (void *)curr_ptr_sum,
+              store_sum);
+
+    // Clear accumulators for max and sum when finishing a whole row
+    clear_accum_();
+
+  }
+
+  /// Called after all accumulator elements have been visited
+  ///
+  /// Stores fragment_D_ through the D iterator and advances that iterator.
+  /// `step_idx` is not used.
+  CUTLASS_DEVICE
+  void end_step(int step_idx) {
+
+    iterator_D_.store(fragment_D_);
+    ++iterator_D_;
+  }
+
+  /// Called after all steps have been completed
+  ///
+  /// The body is empty: this visitor does nothing at this point.
+  CUTLASS_DEVICE
+  void end_epilogue() {
+
+  }
+
+private:
+
+  /// Overwrites every element of `result` whose index is >= `elements_in_boundary` with
+  /// -infinity_. Elements below that index are left as they are.
+  CUTLASS_DEVICE
+  void elementwise_padding_(SoftmaxFragment &result, int elements_in_boundary) {
+    ElementSoftmaxCompute const pad_value = ElementSoftmaxCompute(-infinity_);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < SoftmaxFragment::kElements; ++idx) {
+      if (idx >= elements_in_boundary) {
+        result[idx] = pad_value;
+      }
+    }
+  }
+
+  /// Adds `sum_` to the values held by partner lanes using xor shuffles.
+  /// The lane mask starts at kThreadsPerRow / 2 and is halved after every exchange.
+  CUTLASS_DEVICE
+  ElementSoftmaxCompute warp_reduce_sum_(ElementSoftmaxCompute sum_) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int lane_mask = (kThreadsPerRow >> 1); lane_mask > 0; lane_mask >>= 1) {
+      ElementSoftmaxCompute partner_value = __shfl_xor_sync(0xFFFFFFFF, sum_, lane_mask);
+      sum_ += partner_value;
+    }
+
+    return sum_;
+  }
+
+  /// Takes the maximum of `max_` and the values held by partner lanes using xor shuffles.
+  /// The lane mask starts at kThreadsPerRow / 2 and is halved after every exchange.
+  CUTLASS_DEVICE
+  ElementSoftmaxCompute warp_reduce_max_(ElementSoftmaxCompute max_) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int lane_mask = (kThreadsPerRow >> 1); lane_mask > 0; lane_mask >>= 1) {
+      ElementSoftmaxCompute partner_value = __shfl_xor_sync(0xFFFFFFFF, max_, lane_mask);
+      max_ = fast_max(max_, partner_value);
+    }
+
+    return max_;
+  }
+
+  /// Resets the running row maximum to -FLT_MAX and the running row sum to zero.
+  CUTLASS_DEVICE
+  void clear_accum_() {
+
+    // Bit pattern of -FLT_MAX, reinterpreted as a float below.
+    uint32_t const lowest_float_bits = 0xff7fffff;
+    float const lowest_float = reinterpret_cast<float const &>(lowest_float_bits);
+
+    accum_sum_ = ElementSoftmaxCompute(0);
+    accum_max_ = ElementSoftmaxCompute(lowest_float);
+  }
+
+  /// Returns the sum of all elements of `accum`, starting from zero.
+  CUTLASS_DEVICE
+  ElementSoftmaxCompute sum_accumulator_(SoftmaxFragment const &accum) {
+    // Same loop as the two-argument overload, seeded with zero.
+    return sum_accumulator_(accum, ElementSoftmaxCompute(0));
+  }
+
+  /// Returns `sum_` plus the sum of all elements of `accum`.
+  CUTLASS_DEVICE
+  ElementSoftmaxCompute sum_accumulator_(SoftmaxFragment const &accum, ElementSoftmaxCompute sum_) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < SoftmaxFragment::kElements; ++idx) {
+      ElementSoftmaxCompute const term = ElementSoftmaxCompute(accum[idx]);
+      sum_ += term;
+    }
+
+    return sum_;
+  }
+
+  /// Returns the largest element of `accum`, seeded with its first element.
+  CUTLASS_DEVICE
+  ElementSoftmaxCompute maximum_accumulator_(SoftmaxFragment const &accum) {
+    ElementSoftmaxCompute running_max = accum[0];
+
+    // Element 0 is the seed, so the scan starts at index 1.
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 1; idx < SoftmaxFragment::kElements; ++idx) {
+      ElementSoftmaxCompute const candidate = ElementSoftmaxCompute(accum[idx]);
+      running_max = fast_max(running_max, candidate);
+    }
+
+    return running_max;
+  }
+
+  /// Returns the larger of `max_` and the largest element of `accum`.
+  CUTLASS_DEVICE
+  ElementSoftmaxCompute maximum_accumulator_(SoftmaxFragment const &accum, ElementSoftmaxCompute max_) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < SoftmaxFragment::kElements; ++idx) {
+      ElementSoftmaxCompute const candidate = ElementSoftmaxCompute(accum[idx]);
+      max_ = fast_max(max_, candidate);
+    }
+
+    return max_;
+  }
+};
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_absmax.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_absmax.h
new file mode 100755
index 000000000..9bae7a742
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_absmax.h
@@ -0,0 +1,923 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+
+  \brief Threadblock-level epilogue computing:
+    Aux = ((alpha * scale_a * scale_b) * accumulator) + ((beta * scale_c) * source) + bias
+    D = activation(Aux)
+
+    if Aux is fp8 type:
+        abs_max_output = max( abs(aux) | (for every aux in Aux))
+        Aux = scale_aux * Aux
+    endif
+
+    if D is fp8 type:
+        abs_max_output = max( abs(d) | (for every d in D))
+        D = scale_d * D
+    endif
+
+    Parameter Aux is optionally stored to global memory
+*/
+
+#pragma once
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/cassert>
+#include <cuda/std/utility>
+#else
+#include <assert.h>
+#include <utility>
+#endif
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/tensor_coord.h"
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/functional.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/layout/vector.h"
+#include "cutlass/layout/tensor.h"
+
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/transform/pitch_linear_thread_map.h"
+#include "cutlass/transform/threadblock/regular_tile_iterator.h"
+
+#include "cutlass/epilogue/threadblock/epilogue_base.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
+
+#include "cutlass/numeric_types.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+/// Helper class for keeping track of absolute maximums and performing scaling
+///
+/// Only declared here; the two partial specializations on ScalingAndAmaxNeeded that follow
+/// provide the implementations.
+template <
+  typename Iterator,        // Iterator type used for storing the data for which absolute maximum and scaling
+                            // will be computed. This type is used for predicating absolute maximum calculations.
+  typename Fragment,        // Type of input to be computed on
+  bool ScalingAndAmaxNeeded // Whether to perform absolute maximum and scaling operations
+>
+struct ScalingAndAmaxHelper;
+
+/// Partial specialization that does not perform scaling or calculate an absolute maximum.
+/// It exposes the same members as the scaling specialization, but none of them does any work.
+template <typename Iterator, typename Fragment>
+struct ScalingAndAmaxHelper<Iterator, Fragment, false> {
+  using Element = typename Fragment::Element;
+
+  /// The scale factor is accepted and ignored.
+  CUTLASS_HOST_DEVICE
+  ScalingAndAmaxHelper(Element /*scale*/) { }
+
+  /// Returns `inp` unchanged; the iterator is ignored.
+  CUTLASS_DEVICE
+  Fragment operator()(Iterator const & /*iterator*/, Fragment const &inp) {
+    return inp;
+  }
+
+  /// Always returns zero.
+  CUTLASS_HOST_DEVICE
+  Element get_abs_max() const {
+    return Element(0.);
+  }
+
+  /// The scale factor is accepted and ignored.
+  CUTLASS_HOST_DEVICE
+  void set_scaling_factor(Element /*scale_*/) { }
+};
+
+/// Partial specialization that keeps track of an absolute maximum value of inputs seen
+/// and scales inputs
+template <typename Iterator, typename Fragment>
+struct ScalingAndAmaxHelper<Iterator, Fragment, true> {
+  using Element = typename Fragment::Element;
+  using AccessType = typename Iterator::AccessType;
+  using ThreadMap = typename Iterator::ThreadMap;
+
+  // Running absolute maximum over all guarded elements seen so far (starts at zero).
+  Element abs_max;
+  // Factor every fragment element is multiplied by in operator().
+  Element scale;
+
+  // Operators
+  maximum_with_nan_propogation<Element> max_op;
+  absolute_value_op<Element> abs_op;
+  multiplies<Fragment> multiply;
+
+  /// Starts with an absolute maximum of zero and the given scale factor.
+  CUTLASS_HOST_DEVICE
+  ScalingAndAmaxHelper(Element scale_) : abs_max(0.), scale(scale_) { }
+
+  // Compute the absolute maximum value between `abs_max` and the entries
+  // of `frag` for predicated-on entries of `iterator`. Return a scaled
+  // version of `frag`.
+  //
+  // Only accesses whose row lies inside the iterator's extent and whose column predicate is
+  // set contribute to `abs_max`. The scaling is applied to the whole fragment, guarded or not.
+  CUTLASS_DEVICE
+  Fragment operator()(const Iterator& iterator, const Fragment& frag) {
+    // View the fragment as an array of per-access element groups.
+    using PredicateGroup = Array<Element, Iterator::ThreadMap::kElementsPerAccess>;
+    PredicateGroup const *frag_ptr = reinterpret_cast<PredicateGroup const *>(&frag);
+
+    // Column predicates of the iterator.
+    typename Iterator::Mask mask;
+    iterator.get_mask(mask);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+          // Linear row index of this (cluster, group, row) iteration within the fragment.
+          int frag_row_idx =
+            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
+
+          // Row distance of this iteration from the thread's starting row.
+          int row_offset = row * ThreadMap::Delta::kRow
+            + group * ThreadMap::Delta::kGroup
+            + cluster * ThreadMap::Delta::kCluster;
+
+          bool row_guard = ((row_offset + iterator.thread_start_row()) < iterator.extent_row());
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
+            bool guard = row_guard && mask.predicates[column];
+
+            if (guard) {
+              int access_idx = frag_row_idx * ThreadMap::Iterations::kColumn + column;
+              CUTLASS_PRAGMA_UNROLL
+              for (int i = 0; i < PredicateGroup::kElements; ++i) {
+                abs_max = max_op(abs_max, abs_op(frag_ptr[access_idx][i]));
+              }
+            }
+          }
+        }
+      }
+    }
+
+    // Perform scaling
+    return multiply(scale, frag);
+  }
+
+  /// Returns the absolute maximum accumulated so far.
+  CUTLASS_HOST_DEVICE
+  Element get_abs_max() const {
+    return abs_max;
+  }
+
+  /// Replaces the scale factor used by subsequent operator() calls.
+  CUTLASS_HOST_DEVICE
+  void set_scaling_factor(Element scale_) {
+    scale = scale_;
+  }
+};
+
+} // namespace detail
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Shape_,                          ///< Shape of threadblock tile (concept: GemmShape)
+  typename WarpMmaOperator_,                ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp)
+  int PartitionsK,                          ///< Number of partitions of the K dimension
+  typename OutputTileIterator_,             ///< Tile iterator reading and writing output tensors
+  typename AuxOutputTileIterator_,          ///< Tile iterator writing auxiliary output tensors
+  typename ElementVector_,                  ///< Data type of bias vector
+  typename AccumulatorFragmentIterator_,    ///< Fragment iterator selecting accumulators
+  typename WarpTileIterator_,               ///< Warp-scoped tile iterator writing accumulators to SMEM
+  typename SharedLoadIterator_,             ///< Threadblock-scoped tile iterator loading from SMEM
+  typename OutputOp_,                       ///< Output operator
+  typename Padding_,                        ///< Padding added to SMEM allocation to avoid bank conflicts (concept: MatrixShape)
+  int FragmentsPerPartition = 1,            ///< Used to coarsen the epilogue granularity
+  int IterationsUnroll =                    ///< Used to reduce binary size when epilogue op is large
+    (!IsEpilogueFunctorHeavy<OutputOp_>::value)
+>
+class EpilogueWithAbsMax :
+  public EpilogueBase<
+    Shape_,
+    typename WarpMmaOperator_::Shape,
+    PartitionsK,
+    AccumulatorFragmentIterator_,
+    WarpTileIterator_,
+    Padding_,
+    FragmentsPerPartition> {
+
+public:
+
+  using Base = EpilogueBase<
+    Shape_,
+    typename WarpMmaOperator_::Shape,
+    PartitionsK,
+    AccumulatorFragmentIterator_,
+    WarpTileIterator_,
+    Padding_,
+    FragmentsPerPartition>;
+
+  static bool const kIsSingleSource = true;
+  using Shape = Shape_;
+  using WarpMmaOperator = WarpMmaOperator_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputTileIterator = OutputTileIterator_;
+  using AuxOutputTileIterator = AuxOutputTileIterator_;
+  using ElementVector = ElementVector_;
+  using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
+  using WarpTileIterator = WarpTileIterator_;
+  using SharedLoadIterator = SharedLoadIterator_;
+  using OutputOp = OutputOp_;
+  using Padding = Padding_;
+
+  using Layout = layout::RowMajor;
+  using LongIndex = typename Layout::LongIndex;
+
+  /// The complete warp-level accumulator tile
+  using AccumulatorTile = typename Base::AccumulatorTile;
+
+  /// Accumulator element
+  using ElementAccumulator = typename WarpTileIterator::Element;
+
+  /// Data type used for absolute maximum value
+  using ElementAbsmax = typename OutputOp::ElementAbsmax;
+
+  /// Compute data type produced by the output op
+  using ElementCompute = typename OutputOp::ElementCompute;
+
+  /// Compute fragment
+  using FragmentCompute = Array<ElementCompute, OutputTileIterator::Fragment::kElements>;
+
+  /// Helpers for (optionally) computing absolute maximums and scaling output and auxiliary output
+  using OutputScaler = detail::ScalingAndAmaxHelper<OutputTileIterator,
+                                                    FragmentCompute,
+                                                    OutputOp::kIsScalingAndAmaxOutputNeeded>;
+
+  using AuxOutputScaler = detail::ScalingAndAmaxHelper<AuxOutputTileIterator,
+                                                       FragmentCompute,
+                                                       OutputOp::kIsScalingAndAmaxAuxOutputNeeded>;
+
+  /// Thread map used by output tile iterators
+  using ThreadMap = typename OutputTileIterator::ThreadMap;
+
+  /// Fragment object used to store the broadcast values
+  using BroadcastFragment = Array<
+    ElementCompute,
+    ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess>;
+
+  /// Output element
+  using ElementOutput = typename OutputTileIterator::Element;
+
+  /// Data type of auxiliary output
+  using ElementAuxOutput = typename AuxOutputTileIterator::Element;
+
+  /// Output access size
+  static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
+
+  /// Tensor reference to destination tensor
+  using TensorRef = typename OutputTileIterator::TensorRef;
+
+  /// Tensor reference to sync tensor
+  using SyncTensorRef = typename cutlass::TensorRef<int, cutlass::layout::PackedVectorLayout>;
+
+  /// Const tensor reference to source tensor
+  using ConstTensorRef = typename OutputTileIterator::ConstTensorRef;
+
+  /// Array type used to output
+  using OutputAccessType = Array<
+    typename OutputTileIterator::Element, OutputTileIterator::kElementsPerAccess>;
+
+  /// Array type used by output functor
+  using AccumulatorAccessType = Array<typename WarpTileIterator::Element, OutputTileIterator::kElementsPerAccess>;
+
+  /// Array of ElementCompute values, one output access wide, used by output functor
+  using ComputeAccessType = Array<ElementCompute, OutputTileIterator::kElementsPerAccess>;
+
+  /// Auxiliary output access type
+  using AuxAccessType = Array<ElementAuxOutput, OutputTileIterator::kElementsPerAccess>;
+
+  /// Number of warps
+  using WarpCount = typename Base::WarpCount;
+
+  /// Shared memory allocation from epilogue base class
+  using BaseSharedStorage = typename Base::SharedStorage;
+
+  static int constexpr kSmemTiles = Base::kFragmentsPerIteration > 1 ? Base::kFragmentsPerIteration : kPartitionsK;
+  static int constexpr kSmemPointerOffset = Base::SharedStorage::StorageShape::kCount / kSmemTiles;
+
+  /// Used for the broadcast
+  ///
+  /// Compile-time constants derived from ThreadMap, Shape and WarpCount, plus a debug printer
+  /// whose body is currently compiled out.
+  struct BroadcastDetail {
+
+    /// Number of threads per warp
+    static int const kWarpSize = 32;
+
+    static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
+
+    /// Number of distinct scalar column indices handled by each thread
+    static int const kColumnsPerThread = ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess;
+
+    /// Number of distinct scalar row indices handled by each thread
+    static int const kRowsPerThread = ThreadMap::Iterations::kCount / ThreadMap::Iterations::kColumn;
+
+    /// Number of threads per threadblock
+    static int const kThreadCount = kWarpSize * WarpCount::kCount;
+
+    /// Number of distinct threads per row of output tile
+    static int const kThreadsPerRow = (Shape::kN / kColumnsPerThread);
+
+    /// Number of distinct threads which must be reduced during the final reduction phase within the threadblock.
+    static int const kThreadRows = kThreadCount / kThreadsPerRow;
+
+    /// Shape::kN divided by kThreadCount, rounded up, and never less than one.
+    static int const kThreadAccessesPerRow = const_max(1, (Shape::kN + kThreadCount - 1) / kThreadCount);
+
+    /// Shape of the shared memory allocation for the epilogue
+    using StorageShape = MatrixShape<
+      kThreadRows,
+      Shape::kN
+    >;
+
+    /// Debug printing
+    /// (the body is wrapped in `#if 0`, so calling this currently prints nothing)
+    CUTLASS_DEVICE
+    static void print() {
+#if 0
+      printf("BroadcastDetail {\n");
+      printf(
+        "  kColumnsPerThread: %d\nkRowsPerThread: %d\n,kThreadCount: %d\nkThreadsPerRow: %d\n"
+        "kThreadRows: %d\nThreadAccessesPerRow: %d\nStorageShape: %d x %d (count: %d)\n",
+        kColumnsPerThread,
+        kRowsPerThread,
+        kThreadCount,
+        kThreadsPerRow,
+        kThreadRows,
+        kThreadAccessesPerRow,
+        StorageShape::kRow,
+        StorageShape::kColumn,
+        StorageShape::kCount
+      );
+      printf("};\n");
+#endif
+    }
+  };
+
+  /// Shared storage structure (shadows base).
+  /// The union holds a single member, the base epilogue's shared storage; no additional
+  /// buffer is declared here.
+  struct SharedStorage {
+    union {
+      BaseSharedStorage base;
+    };
+
+    /// Empty constructor: `base` is not explicitly initialized here.
+    CUTLASS_HOST_DEVICE
+    SharedStorage() { }
+  };
+
+public:
+
+
+  // The fragment loaded from shared memory must have as many elements as the output fragment.
+  static_assert(SharedLoadIterator::Fragment::kElements == OutputTileIterator::Fragment::kElements,
+    "Mismatch between shared load iterator and output tile iterator.");
+
+  // The access width is used as a divisor in the check below, so it must be non-zero.
+  static_assert(OutputTileIterator::kElementsPerAccess, "OutputTileIterator::kElementsPerAccess must not be zero.");
+
+  // An output fragment must consist of a whole number of accesses.
+  static_assert(!(OutputTileIterator::Fragment::kElements % OutputTileIterator::kElementsPerAccess),
+    "Divisibility");
+
+private:
+
+  /// Loads fragment from shared memory aligned with output tensor
+  SharedLoadIterator shared_load_iterator_;
+
+  /// Thread index within the threadblock
+  int thread_idx_;
+
+public:
+
+  /// Constructor
+  ///
+  /// Forwards the base part of the shared storage and the thread/warp/lane indices to the
+  /// base epilogue, builds the shared-memory load iterator on the same storage, and records
+  /// the thread index for later use.
+  CUTLASS_DEVICE
+  EpilogueWithAbsMax(
+    SharedStorage &shared_storage,                    ///< Shared storage object
+    int thread_idx,                                   ///< ID of a thread within the threadblock
+    int warp_idx,                                     ///< ID of warp within threadblock
+    int lane_idx                                      ///< Id of thread within warp
+  ):
+    Base(shared_storage.base, thread_idx, warp_idx, lane_idx),
+    shared_load_iterator_(shared_storage.base.reference(), thread_idx),
+    thread_idx_(thread_idx)
+  {
+
+  }
+
+  /// Streams the result to global memory
+  ///
+  /// Loads the broadcast vector, runs the epilogue with or without the source tensor
+  /// (as reported by `output_op.is_source_needed()`), and finally merges the absolute maxima
+  /// tracked for the output and the auxiliary output into the locations supplied by
+  /// `output_op`, skipping any location that is null.
+  CUTLASS_DEVICE
+  void operator()(
+    OutputOp &output_op,                              ///< Output operator
+    ElementVector const * broadcast_ptr,              ///< Broadcast vector
+    OutputTileIterator destination_iterator,          ///< Tile iterator for destination
+    AccumulatorTile const &accumulators,              ///< Complete warp-level accumulator tile
+    OutputTileIterator source_iterator,               ///< Tile iterator for source accumulator matrix
+    AuxOutputTileIterator aux_iterator,               ///< Tile iterator for destination auxiliary output
+    MatrixCoord const &problem_size =                 ///< Problem size needed to guard against out-of-bounds accesses
+        MatrixCoord(Shape::kM, Shape::kN),
+    MatrixCoord const &threadblock_offset =           ///< Threadblock's initial offset within the problem size space
+        MatrixCoord()) {
+
+    using AbsMaxConverter = NumericConverter<ElementAbsmax, ElementCompute, OutputOp::kRound>;
+    using AbsMaxReducer = atomic_maximum<ElementAbsmax>;
+
+    BroadcastFragment broadcast_fragment;
+    load_broadcast_fragment_(broadcast_fragment, broadcast_ptr, problem_size, threadblock_offset);
+
+    // One scaler per output stream; each also tracks the absolute maximum it has seen.
+    OutputScaler output_scaler(output_op.get_scale_d());
+    AuxOutputScaler aux_scaler(output_op.get_scale_aux());
+
+    if (output_op.is_source_needed()) {
+      compute_source_needed_(
+        output_op,
+        broadcast_fragment,
+        destination_iterator,
+        accumulators,
+        source_iterator,
+        aux_iterator,
+        output_scaler,
+        aux_scaler);
+    }
+    else {
+      compute_source_not_needed_(
+        output_op,
+        broadcast_fragment,
+        destination_iterator,
+        accumulators,
+        aux_iterator,
+        output_scaler,
+        aux_scaler);
+    }
+
+    // Store the absolute maximum values of the output and auxiliary tensors, if needed.
+    if (output_op.get_ptr_output_abs_max() != nullptr) {
+      ElementAbsmax output_abs_max = AbsMaxConverter{}(output_scaler.get_abs_max());
+      AbsMaxReducer{}(output_op.get_ptr_output_abs_max(), output_abs_max);
+    }
+
+    if (output_op.get_ptr_aux_output_abs_max() != nullptr) {
+      ElementAbsmax aux_abs_max = AbsMaxConverter{}(aux_scaler.get_abs_max());
+      AbsMaxReducer{}(output_op.get_ptr_aux_output_abs_max(), aux_abs_max);
+    }
+  }
+
+private:
+
+  /// Fills `broadcast_fragment` with this thread's slice of the broadcast vector, converted
+  /// to ElementCompute. The fragment is all zeros when `broadcast_ptr` is null, and an access
+  /// stays zero when its first column lies at or beyond `problem_size.column()`.
+  CUTLASS_DEVICE
+  void load_broadcast_fragment_(
+    BroadcastFragment & broadcast_fragment,      ///< Fragment receiving the broadcast values (output)
+    ElementVector const * broadcast_ptr,         ///< Broadcast vector
+    MatrixCoord const &problem_size,             ///< Problem size needed to guard against out-of-bounds accesses
+    MatrixCoord const &threadblock_offset        ///< Threadblock's initial offset within the problem size space
+    ) {
+
+    broadcast_fragment.clear();
+
+    // If no pointer is supplied, set with all zeros and avoid memory accesses
+    if (!broadcast_ptr) {
+      return;
+    }
+
+    int thread_initial_column = ThreadMap::initial_offset(thread_idx_).column();
+
+    // The bounds check uses the column within the whole problem, while the pointer is advanced
+    // only by the thread's column within the tile.
+    // NOTE(review): this presumably relies on the caller passing a broadcast_ptr that already
+    // points at the threadblock's first column; confirm against the calling kernel.
+    int thread_column_idx = threadblock_offset.column() + thread_initial_column;
+    broadcast_ptr += thread_initial_column;
+
+    NumericArrayConverter<ElementCompute, ElementVector, BroadcastDetail::kElementsPerAccess> converter;
+    using AccessType = AlignedArray<ElementVector, BroadcastDetail::kElementsPerAccess>;
+    using ComputeFragmentType = Array<ElementCompute, BroadcastDetail::kElementsPerAccess>;
+
+    // View the fragment as one ComputeFragmentType per column iteration.
+    ComputeFragmentType *frag_ptr = reinterpret_cast<ComputeFragmentType *>(&broadcast_fragment);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < ThreadMap::Iterations::kColumn; ++j) {
+
+      AccessType loaded;
+
+      loaded.clear();
+
+      // Only the first column of the access is compared against the problem size; the whole
+      // access is then read in a single load.
+      if (thread_column_idx < problem_size.column()) {
+        loaded = *reinterpret_cast<AccessType const *>(broadcast_ptr);
+      }
+
+      ComputeFragmentType cvt = converter(loaded);
+      frag_ptr[j] = cvt;
+
+      // Step to the next column iteration of this thread.
+      thread_column_idx += ThreadMap::Delta::kColumn;
+      broadcast_ptr += ThreadMap::Delta::kColumn;
+    }
+  }
+
+  template <class Seq>
+  struct acc2smem_source_not_needed;
+
+  template <size_t... Seq>
+  struct acc2smem_source_not_needed<cutlass::index_sequence<Seq...>> {
+    template <int Advance>  // Advance: number of accumulator fragments to skip before staging
+    CUTLASS_DEVICE static void helper(AccumulatorFragmentIterator accum_fragment_iterator,
+                                      WarpTileIterator &warp_tile_iterator) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < Advance; i++) {
+        ++accum_fragment_iterator;
+      }
+      // Store kFragmentsPerIteration consecutive fragments, advancing the warp tile iterator by kSmemPointerOffset between stores
+      CUTLASS_PRAGMA_UNROLL
+      for (int p = 0; p < Base::kFragmentsPerIteration; ++p) {
+        typename AccumulatorFragmentIterator::Fragment accum_fragment;
+
+        accum_fragment_iterator.load(accum_fragment);
+        ++accum_fragment_iterator;
+
+        warp_tile_iterator.store(accum_fragment);
+        if (p < Base::kFragmentsPerIteration - 1) {
+          warp_tile_iterator.add_pointer_offset(kSmemPointerOffset);
+        }
+      }
+      // Undo the (kFragmentsPerIteration - 1) pointer advances so the iterator ends where it started
+      if (Base::kFragmentsPerIteration > 1) {
+        warp_tile_iterator.add_pointer_offset(kSmemPointerOffset *
+                                              (1 - Base::kFragmentsPerIteration));
+      }
+    }
+    /// Runs helper<Seq * kFragmentsPerIteration> for the one Seq whose scaled value equals pos; others short-circuit
+    CUTLASS_DEVICE
+    static void push(size_t pos,
+                     AccumulatorFragmentIterator const &iterator_begin,
+                     WarpTileIterator &warp_tile_iterator) {
+      int dummy[] = {
+          (pos == (Seq * Base::kFragmentsPerIteration)) &&
+          (helper<Seq * Base::kFragmentsPerIteration>(iterator_begin, warp_tile_iterator), 0)...};
+
+      CUTLASS_UNUSED(dummy[0]);
+    }
+  };
+
+  /// Streams the result to global memory without reading a source tensor (no source iterator is taken)
+  CUTLASS_DEVICE
+  void compute_source_not_needed_(
+    OutputOp &output_op,                              ///< Output operator
+    BroadcastFragment const &broadcast_fragment,      ///< Per-thread broadcast values forwarded to the output operator
+    OutputTileIterator destination_iterator,          ///< Tile iterator for destination
+    AccumulatorTile const &accumulators,              ///< Complete warp-level accumulator tile
+    AuxOutputTileIterator aux_iterator,               ///< Tile iterator for destination auxiliary output
+    OutputScaler& output_scaler,                      ///< Helper for (optionally) computing the absolute maximum and scaling output
+    AuxOutputScaler& aux_scaler                       ///< Helper for (optionally) computing the absolute maximum and scaling the auxiliary output
+    ) {
+
+    //
+    // Iterator over warp-level accumulator fragment
+    //
+
+    AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
+
+    //
+    // Iterate over accumulator tile
+    //
+
+    // Fully unroll only when IterationsUnroll is nonzero; otherwise the unroll factor is 1 (loop stays rolled)
+    #pragma unroll(IterationsUnroll ? OutputTileIterator::kIterations / Base::kFragmentsPerIteration : 1)
+    for (int iter = 0; iter < OutputTileIterator::kIterations; iter += Base::kFragmentsPerIteration) {
+
+      //
+      // Convert and store fragment
+      //
+
+
+      __syncthreads();
+
+      acc2smem_source_not_needed<
+          cutlass::make_index_sequence<OutputTileIterator::kIterations /
+                                   Base::kFragmentsPerIteration>>::push(iter,
+                                                                        accum_fragment_iterator,
+                                                                        this->warp_tile_iterator_);
+
+      __syncthreads();
+
+      //
+      // Load fragments from shared memory
+      //
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int p = 0; p < Base::kFragmentsPerIteration; ++p) {
+
+
+        typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];
+
+        shared_load_iterator_.load(aligned_accum_fragment[0]);
+
+        if (p < Base::kFragmentsPerIteration - 1) {
+          shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
+        }
+        else if (kPartitionsK > 1) {
+          // Last fragment with kPartitionsK > 1: sum the partial tiles into element 0, then rewind the load pointer
+          plus <typename SharedLoadIterator::Fragment> add_fragments;
+
+          CUTLASS_PRAGMA_UNROLL
+          for ( int i = 1; i < kPartitionsK; ++i) {
+            shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
+            shared_load_iterator_.load(aligned_accum_fragment[i]);
+            aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
+          }
+
+          shared_load_iterator_.add_pointer_offset((1 - kPartitionsK) * kSmemPointerOffset);
+        }
+
+        //
+        // Apply output operation
+        //
+
+        FragmentCompute frag_Z_compute;
+        FragmentCompute frag_Aux_compute;
+
+        apply_output_operator_source_not_needed_(
+          frag_Z_compute,
+          frag_Aux_compute,
+          output_op,
+          aligned_accum_fragment[0],
+          broadcast_fragment);
+
+        //
+        // Conditionally store fragments
+        //
+
+        // (Optionally) compute the absolute maximum of frag_Z and scale frag_Z
+        frag_Z_compute = output_scaler(destination_iterator, frag_Z_compute);
+        NumericArrayConverter<typename OutputTileIterator::Fragment::Element, ElementCompute,
+                              OutputTileIterator::Fragment::kElements> cvt_to_dst;
+        typename OutputTileIterator::Fragment frag_Z = cvt_to_dst(frag_Z_compute);
+
+        // Always store the output
+        destination_iterator.store(frag_Z);
+        ++destination_iterator;
+
+        // Only store the auxiliary output if scaling and absolute-maximum calculation were needed
+        if (OutputOp::kIsScalingAndAmaxAuxOutputNeeded) {
+          frag_Aux_compute = aux_scaler(aux_iterator, frag_Aux_compute);
+
+          NumericArrayConverter<typename AuxOutputTileIterator::Fragment::Element, ElementCompute,
+                                AuxOutputTileIterator::Fragment::kElements> cvt_to_aux;
+          typename AuxOutputTileIterator::Fragment frag_Aux = cvt_to_aux(frag_Aux_compute);
+          aux_iterator.store(frag_Aux);
+          ++aux_iterator;
+        }
+      }
+
+      if (Base::kFragmentsPerIteration > 1) {
+        shared_load_iterator_.add_pointer_offset(kSmemPointerOffset * (1 - Base::kFragmentsPerIteration));
+      }
+    }
+  }
+
+
+  template<class Seq>
+  struct acc2smem_source_needed;
+
+  template <size_t... Seq>
+  struct acc2smem_source_needed<cutlass::index_sequence<Seq...>> {
+    /// Loads the accumulator fragment Advance steps past the given iterator and stores it via the warp tile iterator.
+    template<int Advance>
+    CUTLASS_DEVICE static void helper(AccumulatorFragmentIterator accum_fragment_iterator,
+                                      WarpTileIterator &warp_tile_iterator) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < Advance; i++) {
+        ++accum_fragment_iterator;
+      }
+
+      typename AccumulatorFragmentIterator::Fragment accum_fragment;
+      accum_fragment_iterator.load(accum_fragment);
+      warp_tile_iterator.store(accum_fragment);
+    }
+
+    /// Runs helper<Seq> for the single Seq equal to pos; all other pack elements short-circuit on the comparison.
+    CUTLASS_DEVICE static void push(size_t pos, AccumulatorFragmentIterator const &iterator_begin,
+                                    WarpTileIterator &warp_tile_iterator) {
+      int dummy[] = {(pos == Seq) && (helper<Seq>(iterator_begin, warp_tile_iterator), 0)...};
+      CUTLASS_UNUSED(dummy[0]);  // dummy exists only to expand the pack; mark it referenced to avoid a warning
+    }
+  };
+
+
+  /// Streams the result to global memory, loading one source fragment per iteration for the output operator
+  CUTLASS_DEVICE
+  void compute_source_needed_(
+    OutputOp &output_op,                          ///< Output operator
+    BroadcastFragment const &broadcast_fragment,  ///< Per-thread broadcast values forwarded to the output operator
+    OutputTileIterator destination_iterator,      ///< Tile iterator for destination
+    AccumulatorTile const &accumulators,          ///< Complete warp-level accumulator tile
+    OutputTileIterator source_iterator,           ///< Tile iterator for source accumulator matrix
+    AuxOutputTileIterator aux_iterator,               ///< Tile iterator for destination auxiliary output
+    OutputScaler& output_scaler,                      ///< Helper for (optionally) computing the absolute maximum and scaling output
+    AuxOutputScaler& aux_scaler                       ///< Helper for (optionally) computing the absolute maximum and scaling the auxiliary output
+    ) {
+
+    typename OutputTileIterator::Fragment source_fragment;
+    source_fragment.clear();
+
+    //
+    // Iterator over warp-level accumulator fragment
+    //
+
+    AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
+
+    //
+    // Iterate over accumulator tile
+    //
+
+    #pragma unroll(IterationsUnroll ? OutputTileIterator::kIterations : 1)
+    for (int iter = 0; iter < OutputTileIterator::kIterations; ++iter) {
+
+      //
+      // Load the source
+      //
+
+      source_iterator.load(source_fragment);
+      ++source_iterator;
+
+      //
+      // Convert and store fragment
+      //
+
+      __syncthreads();
+
+      acc2smem_source_needed<cutlass::make_index_sequence<OutputTileIterator::kIterations>>::push(
+          iter, accum_fragment_iterator, this->warp_tile_iterator_);
+
+      __syncthreads();
+
+      //
+      // Load fragments from shared memory
+      //
+
+      typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];
+
+      shared_load_iterator_.load(aligned_accum_fragment[0]);
+
+      // If the number of k-slices is > 1 - perform a reduction amongst the k-slices
+      if (kPartitionsK > 1)
+      {
+        plus <typename SharedLoadIterator::Fragment> add_fragments;
+        const int tile_row_offset = Base::SharedStorage::StorageShape::kRow / PartitionsK;
+
+        CUTLASS_PRAGMA_UNROLL
+        for ( int i = 1; i < kPartitionsK; ++i) {
+          shared_load_iterator_.add_tile_offset({tile_row_offset , 0});
+          shared_load_iterator_.load(aligned_accum_fragment[i]);
+          aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
+        }
+
+        shared_load_iterator_.add_tile_offset({-1 * (kPartitionsK-1) * tile_row_offset, 0});
+      }
+
+      //
+      // Apply output operation
+      //
+
+      FragmentCompute frag_Z_compute;
+      FragmentCompute frag_Aux_compute;
+
+      apply_output_operator_(
+        frag_Z_compute,
+        frag_Aux_compute,
+        output_op,
+        aligned_accum_fragment[0],
+        source_fragment,
+        broadcast_fragment);
+
+      //
+      // Conditionally store fragments
+      //
+
+      // (Optionally) compute the absolute maximum of frag_Z and scale frag_Z
+      frag_Z_compute = output_scaler(destination_iterator, frag_Z_compute);
+      NumericArrayConverter<typename OutputTileIterator::Fragment::Element, ElementCompute,
+                            OutputTileIterator::Fragment::kElements> cvt_to_dst;
+      typename OutputTileIterator::Fragment frag_Z = cvt_to_dst(frag_Z_compute);
+
+      // Always store the output
+      destination_iterator.store(frag_Z);
+      ++destination_iterator;
+
+      // Only store the auxiliary output if scaling and absolute-maximum calculation were needed
+      if (OutputOp::kIsScalingAndAmaxAuxOutputNeeded) {
+        frag_Aux_compute = aux_scaler(aux_iterator, frag_Aux_compute);
+
+        NumericArrayConverter<typename AuxOutputTileIterator::Fragment::Element, ElementCompute,
+                              AuxOutputTileIterator::Fragment::kElements> cvt_to_aux;
+        typename AuxOutputTileIterator::Fragment frag_Aux = cvt_to_aux(frag_Aux_compute);
+        aux_iterator.store(frag_Aux);
+        ++aux_iterator;
+      }
+    }
+  }
+
+  /// Invokes the output functor once per kElementsPerAccess-wide vector, writing frag_Z and frag_Aux (source-needed form)
+  CUTLASS_DEVICE
+  void apply_output_operator_(
+    FragmentCompute &frag_Z,
+    FragmentCompute &frag_Aux,
+    OutputOp &output_op,
+    typename SharedLoadIterator::Fragment const &frag_AB,
+    typename OutputTileIterator::Fragment const &frag_C,
+    BroadcastFragment const &frag_Broadcast) {
+    // View every fragment as an array of per-access vectors so the functor can be applied vector by vector
+    using AccessTypeZ = Array<ElementCompute, kElementsPerAccess>;
+    using AccessTypeAux = Array<ElementCompute, kElementsPerAccess>;
+    using AccessTypeBroadcast = Array<ElementCompute, kElementsPerAccess>;
+
+    AccessTypeZ *frag_Z_ptr = reinterpret_cast<AccessTypeZ *>(&frag_Z);
+    AccessTypeAux *frag_Aux_ptr = reinterpret_cast<AccessTypeAux *>(&frag_Aux);
+
+    AccumulatorAccessType const *frag_AB_ptr =
+      reinterpret_cast<AccumulatorAccessType const *>(&frag_AB);
+
+    OutputAccessType const *frag_C_ptr =
+      reinterpret_cast<OutputAccessType const *>(&frag_C);
+
+    AccessTypeBroadcast const *frag_Broadcast_ptr =
+      reinterpret_cast<AccessTypeBroadcast const *>(&frag_Broadcast);
+
+    int const kOutputOpIterations =
+      OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
+    // The broadcast index wraps every Iterations::kColumn accesses, matching what load_broadcast_fragment_ fills
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kOutputOpIterations; ++i) {
+        output_op(
+          frag_Z_ptr[i],
+          frag_Aux_ptr[i],
+          frag_AB_ptr[i],
+          frag_Broadcast_ptr[i % ThreadMap::Iterations::kColumn],
+          frag_C_ptr[i]);
+    }
+  }
+
+  /// Invokes the output functor once per kElementsPerAccess-wide vector, writing frag_Z and frag_Aux (no source operand)
+  CUTLASS_DEVICE
+  void apply_output_operator_source_not_needed_(
+    FragmentCompute &frag_Z,
+    FragmentCompute &frag_Aux,
+    OutputOp &output_op,
+    typename SharedLoadIterator::Fragment const &frag_AB,
+    BroadcastFragment const &frag_Broadcast) {
+    // View every fragment as an array of per-access vectors so the functor can be applied vector by vector
+    using AccessTypeZ = Array<ElementCompute, kElementsPerAccess>;
+    using AccessTypeAux = Array<ElementCompute, kElementsPerAccess>;
+    using AccessTypeBroadcast = Array<ElementCompute, kElementsPerAccess>;
+
+    AccessTypeZ *frag_Z_ptr = reinterpret_cast<AccessTypeZ *>(&frag_Z);
+    AccessTypeAux *frag_Aux_ptr = reinterpret_cast<AccessTypeAux *>(&frag_Aux);
+
+    AccumulatorAccessType const *frag_AB_ptr =
+      reinterpret_cast<AccumulatorAccessType const *>(&frag_AB);
+
+    AccessTypeBroadcast const *frag_Broadcast_ptr =
+      reinterpret_cast<AccessTypeBroadcast const *>(&frag_Broadcast);
+
+    int const kOutputOpIterations =
+      OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
+    // The broadcast index wraps every Iterations::kColumn accesses, matching what load_broadcast_fragment_ fills
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kOutputOpIterations; ++i) {
+
+      output_op(
+        frag_Z_ptr[i],
+        frag_Aux_ptr[i],
+        frag_AB_ptr[i],
+        frag_Broadcast_ptr[i % ThreadMap::Iterations::kColumn]);
+    }
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_broadcast.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_broadcast.h
new file mode 100755
index 000000000..7e6d2a698
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_broadcast.h
@@ -0,0 +1,1718 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+
+  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
+
+  The epilogue rearranges the result of a matrix product through shared memory to match canonical
+  tensor layouts in global memory. Epilogues support conversion and reduction operations.
+
+*/
+
+#pragma once
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/cassert>
+#include <cuda/std/utility>
+#else
+#include <assert.h>
+#include <utility>
+#endif
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/tensor_coord.h"
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/functional.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/layout/vector.h"
+#include "cutlass/layout/tensor.h"
+
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/transform/pitch_linear_thread_map.h"
+#include "cutlass/transform/threadblock/regular_tile_iterator.h"
+
+#include "cutlass/epilogue/threadblock/epilogue_base.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
+
+#include "cutlass/numeric_types.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// This base class is meant to define the concept required of the
+/// EpilogueWithBroadcast::OutputOp
+template <
+  typename ElementC_,
+  typename ElementAccumulator_,
+  typename ElementCompute_,
+  typename ElementZ_,
+  typename ElementT_,
+  int ElementsPerAccess,
+  bool StoreZ = true,
+  bool StoreT = true
+>
+struct EpilogueWithBroadcastOpBase {
+
+  using ElementOutput = ElementC_;
+  using ElementAccumulator = ElementAccumulator_;
+  using ElementCompute = ElementCompute_;
+  using ElementZ = ElementZ_;
+  using ElementT = ElementT_;
+  static int const kElementsPerAccess = ElementsPerAccess;
+
+  using FragmentAccumulator = Array<ElementAccumulator, kElementsPerAccess>;
+  using FragmentCompute = Array<ElementCompute, kElementsPerAccess>;
+  using FragmentC = Array<ElementOutput, kElementsPerAccess>;
+  using FragmentZ = Array<ElementZ, kElementsPerAccess>;
+  using FragmentT = Array<ElementT, kElementsPerAccess>;
+
+  /// If true, the 'Z' tensor is stored
+  static bool const kStoreZ = StoreZ;
+
+  /// If true, the 'T' tensor is stored
+  static bool const kStoreT = StoreT;
+
+  /// Parameters structure - required
+  struct Params { };
+
+  // Methods (all host/device callable, because the epilogue invokes the functor from device code)
+
+  /// Constructor from Params
+  CUTLASS_HOST_DEVICE
+  EpilogueWithBroadcastOpBase(Params const &params_) { }
+
+  /// Determine if the source is needed. Returning false selects the source-free operator() overload.
+  CUTLASS_HOST_DEVICE
+  bool is_source_needed() const {
+    return true;
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_k_partition(int k_partition, int k_partition_count) { }  // intentionally a no-op in this concept class
+
+  /// Applies the operation when is_source_needed() is true
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentZ &frag_Z,
+    FragmentT &frag_T,
+    FragmentAccumulator const &AB,
+    FragmentC const &frag_C1,
+    FragmentC const &frag_C2,
+    FragmentCompute const &V) const {
+
+  }
+
+  /// Applies the operation when is_source_needed() is false
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentZ &frag_Z,
+    FragmentT &frag_T,
+    FragmentAccumulator const &AB,
+    FragmentCompute const &V) const {
+
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Epilogue operator with bias vector broadcast over columns.
+///
+/// Computes the following:
+///
+///
+///  Z, T = OutputOp(AB, C, Broadcast)
+///
+///  if (OutputOp::kStoreZ) {
+///    store(Z);
+///  }
+///
+///  if (OutputOp::kStoreT) {
+///    store(T);
+///  }
+///
+template <
+  typename Shape_,                          ///< Shape of threadblock tile (concept: GemmShape)
+  typename WarpMmaOperator_,                ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp)
+  int PartitionsK,                          ///< Number of partitions of the K dimension
+  typename OutputTileIterator_,             ///< Tile iterator reading and writing output tensors (z)
+  typename TensorTileIterator_,             ///< Additional tile iterator for tensor-valued operands (t)
+  typename ElementVector_,                  ///< Element type of the broadcast vector
+  typename AccumulatorFragmentIterator_,    ///< Fragment iterator selecting accumulators
+  typename WarpTileIterator_,               ///< Warp-scoped tile iterator writing accumulators to SMEM
+  typename SharedLoadIterator_,             ///< Threadblock-scoped tile iterator loading from SMEM
+  typename OutputOp_,                       ///< Output operator - concept is EpilogueWithBroadcastOp
+  typename Padding_,                        ///< Padding added to SMEM allocation to avoid bank conflicts (concept: MatrixShape)
+  int FragmentsPerPartition = 1,            ///< Used to coarsen the epilogue granularity
+  int IterationsUnroll =                    ///< Used to reduce binary size when epilogue op is large
+    (!IsEpilogueFunctorHeavy<OutputOp_>::value),
+  bool IsSingleSource = OutputOp_::kIsSingleSource  ///< Specialization selector; the `false` case takes two source iterators
+>
+class EpilogueWithBroadcast;
+
+template <
+  typename Shape_,
+  typename WarpMmaOperator_,
+  int PartitionsK,
+  typename OutputTileIterator_,
+  typename TensorTileIterator_,
+  typename ElementVector_,
+  typename AccumulatorFragmentIterator_,
+  typename WarpTileIterator_,
+  typename SharedLoadIterator_,
+  typename OutputOp_,
+  typename Padding_,
+  int FragmentsPerPartition,
+  int IterationsUnroll
+>
+class EpilogueWithBroadcast<
+  Shape_,
+  WarpMmaOperator_,
+  PartitionsK,
+  OutputTileIterator_,
+  TensorTileIterator_,
+  ElementVector_,
+  AccumulatorFragmentIterator_,
+  WarpTileIterator_,
+  SharedLoadIterator_,
+  OutputOp_,
+  Padding_,
+  FragmentsPerPartition,
+  IterationsUnroll,
+  false
+> : 
+  public EpilogueBase<
+    Shape_, 
+    typename WarpMmaOperator_::Shape, 
+    PartitionsK, 
+    AccumulatorFragmentIterator_, 
+    WarpTileIterator_, 
+    Padding_,
+    FragmentsPerPartition> {
+
+public:
+
+  using Base = EpilogueBase<
+    Shape_, 
+    typename WarpMmaOperator_::Shape, 
+    PartitionsK, 
+    AccumulatorFragmentIterator_, 
+    WarpTileIterator_, 
+    Padding_,
+    FragmentsPerPartition>;
+
+  static bool const kIsSingleSource = false;
+  using Shape = Shape_;
+  using WarpMmaOperator = WarpMmaOperator_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputTileIterator = OutputTileIterator_;
+  using TensorTileIterator = TensorTileIterator_;
+  using ElementVector = ElementVector_;
+  using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
+  using WarpTileIterator = WarpTileIterator_;
+  using SharedLoadIterator = SharedLoadIterator_;
+  using OutputOp = OutputOp_;
+  using Padding = Padding_;
+
+  using Layout = layout::RowMajor;
+  using LongIndex = typename Layout::LongIndex;
+
+  /// The complete warp-level accumulator tile
+  using AccumulatorTile = typename Base::AccumulatorTile;
+
+  /// Accumulator element
+  using ElementAccumulator = typename WarpTileIterator::Element;
+
+  /// Compute data type produced by the output op
+  using ElementCompute = typename OutputOp::ElementCompute;
+
+  /// Compute fragment
+  using FragmentCompute = Array<ElementCompute, OutputTileIterator::Fragment::kElements>;
+
+  /// Thread map used by output tile iterators
+  using ThreadMap = typename OutputTileIterator::ThreadMap;
+
+  /// Fragment object used to store the broadcast values
+  using BroadcastFragment = Array<
+    ElementCompute, 
+    ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess>;
+
+  /// Output element
+  using ElementOutput = typename OutputTileIterator::Element;
+
+  /// Data type of additional tensor
+  using ElementTensor = typename TensorTileIterator::Element;
+
+  /// Output access size
+  static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
+
+  /// Tensor reference to destination tensor
+  using TensorRef = typename OutputTileIterator::TensorRef;
+
+  /// Tensor reference to sync tensor
+  using SyncTensorRef = typename cutlass::TensorRef<int, cutlass::layout::PackedVectorLayout>;
+
+  /// Const tensor reference to source tensor
+  using ConstTensorRef = typename OutputTileIterator::ConstTensorRef;
+
+  /// Array type used to output
+  using OutputAccessType = Array<
+    typename OutputTileIterator::Element, OutputTileIterator::kElementsPerAccess>;
+
+  /// Array type used by output functor
+  using AccumulatorAccessType = Array<typename WarpTileIterator::Element, OutputTileIterator::kElementsPerAccess>; 
+
+  /// Array type used by output functor
+  using ComputeAccessType = Array<ElementCompute, OutputTileIterator::kElementsPerAccess>;
+
+  /// Tensor access type
+  using TensorAccessType = Array<ElementTensor, OutputTileIterator::kElementsPerAccess>;
+  
+  /// Number of warps
+  using WarpCount = typename Base::WarpCount;
+
+  /// Shared memory allocation from epilogue base class
+  using BaseSharedStorage = typename Base::SharedStorage;
+
+  static int constexpr kSmemTiles = Base::kFragmentsPerIteration > 1 ? Base::kFragmentsPerIteration : kPartitionsK;
+  static int constexpr kSmemPointerOffset = Base::SharedStorage::StorageShape::kCount / kSmemTiles;
+
+  /// Compile-time quantities describing how the broadcast vector maps onto threads of the threadblock
+  struct BroadcastDetail {
+
+    /// Number of threads per warp
+    static int const kWarpSize = 32;
+
+    static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
+
+    /// Number of distinct scalar column indices handled by each thread
+    static int const kColumnsPerThread = ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess;
+
+    /// Number of distinct scalar row indices handled by each thread
+    static int const kRowsPerThread = ThreadMap::Iterations::kCount / ThreadMap::Iterations::kColumn;
+
+    /// Number of threads per threadblock
+    static int const kThreadCount = kWarpSize * WarpCount::kCount;
+
+    /// Number of distinct threads per row of output tile
+    static int const kThreadsPerRow = (Shape::kN / kColumnsPerThread);
+
+    /// Number of distinct threads which must be reduced during the final reduction phase within the threadblock.
+    static int const kThreadRows = kThreadCount / kThreadsPerRow;
+
+    /// Ceiling of Shape::kN / kThreadCount, clamped to at least 1 (presumably accesses per thread to cover one row - confirm)
+    static int const kThreadAccessesPerRow = const_max(1, (Shape::kN + kThreadCount - 1) / kThreadCount);
+
+    /// Shape of the shared memory allocation for the epilogue
+    using StorageShape = MatrixShape<
+      kThreadRows,
+      Shape::kN
+    >;
+
+    /// Debug printing (body is compiled out by the `#if 0` guard, so this is currently a no-op)
+    CUTLASS_DEVICE
+    static void print() {
+#if 0
+      printf("BroadcastDetail {\n");
+      printf(
+        "  kColumnsPerThread: %d\nkRowsPerThread: %d\n,kThreadCount: %d\nkThreadsPerRow: %d\n"
+        "kThreadRows: %d\nThreadAccessesPerRow: %d\nStorageShape: %d x %d (count: %d)\n",
+        kColumnsPerThread,
+        kRowsPerThread,
+        kThreadCount,
+        kThreadsPerRow,
+        kThreadRows,
+        kThreadAccessesPerRow,
+        StorageShape::kRow,
+        StorageShape::kColumn,
+        StorageShape::kCount
+      );
+      printf("};\n");
+#endif
+    }
+  };
+
+  /// Shared storage structure (shadows base); the union currently holds only the base epilogue storage
+  struct SharedStorage {
+    union {
+      BaseSharedStorage base;
+    };
+
+    CUTLASS_HOST_DEVICE
+    SharedStorage() { }
+  };
+
+public:
+
+
+  static_assert(SharedLoadIterator::Fragment::kElements == OutputTileIterator::Fragment::kElements,
+    "Mismatch between shared load iterator and output tile iterator.");
+
+  static_assert(OutputTileIterator::kElementsPerAccess, "OutputTileIterator::kElementsPerAccess must not be zero.");
+
+  static_assert(!(OutputTileIterator::Fragment::kElements % OutputTileIterator::kElementsPerAccess), 
+    "Divisibility");
+
+private:
+
+  /// Loads fragment from shared memory aligned with output tensor
+  SharedLoadIterator shared_load_iterator_;
+
+  /// Thread index within the threadblock
+  int thread_idx_;
+
+public:
+
+  /// Constructor: initializes the base epilogue and the shared load iterator from shared_storage.base
+  CUTLASS_DEVICE
+  EpilogueWithBroadcast(
+    SharedStorage &shared_storage,                    ///< Shared storage object    
+    int thread_idx,                                   ///< ID of a thread within the threadblock
+    int warp_idx,                                     ///< ID of warp within threadblock
+    int lane_idx                                      ///< Id of thread within warp
+  ):
+    Base(shared_storage.base, thread_idx, warp_idx, lane_idx),
+    shared_load_iterator_(shared_storage.base.reference(), thread_idx),
+    thread_idx_(thread_idx)
+  {
+
+  }
+
+  /// Streams the result to global memory, choosing the source-free or two-source path via output_op.is_source_needed()
+  CUTLASS_DEVICE
+  void operator()(
+    OutputOp const &output_op,                        ///< Output operator
+    ElementVector const * broadcast_ptr,              ///< Broadcast vector
+    OutputTileIterator destination_iterator,          ///< Tile iterator for destination
+    AccumulatorTile const &accumulators,              ///< Complete warp-level accumulator tile
+    OutputTileIterator source_iterator1,              ///< Tile iterator for first source accumulator matrix
+    OutputTileIterator source_iterator2,              ///< Tile iterator for second source accumulator matrix
+    TensorTileIterator tensor_iterator,               ///< Threadblock tile iterator for additional tensor operand
+    MatrixCoord const &problem_size =                 ///< Problem size needed to guard against out-of-bounds accesses
+        MatrixCoord(Shape::kM, Shape::kN),
+    MatrixCoord const &threadblock_offset =           ///< Threadblock's initial offset within the problem size space
+        MatrixCoord()) {
+    
+    BroadcastFragment broadcast_fragment;
+    // Load this thread's broadcast values once; both paths below receive the same fragment
+    load_broadcast_fragment_(broadcast_fragment, broadcast_ptr, problem_size, threadblock_offset);
+
+    if (!output_op.is_source_needed()) {
+      compute_source_not_needed_(
+        output_op, 
+        broadcast_fragment, 
+        destination_iterator, 
+        accumulators,
+        tensor_iterator);
+    }
+    else {
+      compute_source_needed_(
+        output_op, 
+        broadcast_fragment, 
+        destination_iterator, 
+        accumulators, 
+        source_iterator1,
+        source_iterator2,
+        tensor_iterator);
+    }
+  }
+
+private:
+
+  CUTLASS_DEVICE
+  void load_broadcast_fragment_(
+    BroadcastFragment & broadcast_fragment,      ///< [out] Receives this thread's broadcast vector values converted to ElementCompute
+    ElementVector const * broadcast_ptr,         ///< Broadcast vector; if null the fragment is left zero-filled
+    MatrixCoord const &problem_size,             ///< Problem size needed to guard against out-of-bounds accesses
+    MatrixCoord const &threadblock_offset        ///< Threadblock's initial offset within the problem size space
+    ) {
+    // Fills broadcast_fragment with one vector access per ThreadMap column iteration; starts from all zeros.
+    broadcast_fragment.clear();
+    
+    // If no pointer is supplied, set with all zeros and avoid memory accesses
+    if (!broadcast_ptr) {
+      return;
+    }
+
+    int thread_initial_column = ThreadMap::initial_offset(thread_idx_).column();
+    // NOTE(review): the pointer advances by the in-tile column only; presumably the caller pre-offsets it per threadblock - confirm
+    int thread_column_idx = threadblock_offset.column() + thread_initial_column;
+    broadcast_ptr += thread_initial_column;
+
+    NumericArrayConverter<ElementCompute, ElementVector, BroadcastDetail::kElementsPerAccess> converter;
+    using AccessType = AlignedArray<ElementVector, BroadcastDetail::kElementsPerAccess>;
+    using ComputeFragmentType = Array<ElementCompute, BroadcastDetail::kElementsPerAccess>;
+
+    ComputeFragmentType *frag_ptr = reinterpret_cast<ComputeFragmentType *>(&broadcast_fragment);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < ThreadMap::Iterations::kColumn; ++j) {
+
+      AccessType loaded;
+
+      loaded.clear();
+      // Only the first column of each vector access is bounds-checked; out-of-range accesses keep the zero fill
+      if (thread_column_idx < problem_size.column()) {
+        loaded = *reinterpret_cast<AccessType const *>(broadcast_ptr);
+      }
+
+      ComputeFragmentType cvt = converter(loaded);
+      frag_ptr[j] = cvt;
+
+      thread_column_idx += ThreadMap::Delta::kColumn;
+      broadcast_ptr += ThreadMap::Delta::kColumn;
+    }
+  }
+
+  template <class Seq>
+  struct acc2smem_source_not_needed;
+  /// Specialization whose push() expands one helper instantiation per index in Seq
+  template <size_t... Seq>
+  struct acc2smem_source_not_needed<cutlass::index_sequence<Seq...>> {
+    template <int Advance>
+    CUTLASS_DEVICE static void helper(AccumulatorFragmentIterator accum_fragment_iterator,
+                                      WarpTileIterator &warp_tile_iterator) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < Advance; i++) {
+        ++accum_fragment_iterator;
+      }
+      // The by-value iterator copy is now Advance fragments in; store the next kFragmentsPerIteration fragments
+      CUTLASS_PRAGMA_UNROLL
+      for (int p = 0; p < Base::kFragmentsPerIteration; ++p) {
+        typename AccumulatorFragmentIterator::Fragment accum_fragment;
+
+        accum_fragment_iterator.load(accum_fragment);
+        ++accum_fragment_iterator;
+
+        warp_tile_iterator.store(accum_fragment);
+        if (p < Base::kFragmentsPerIteration - 1) {
+          warp_tile_iterator.add_pointer_offset(kSmemPointerOffset);
+        }
+      }
+      // Undo the pointer offsets applied in the loop so the iterator ends where it started
+      if (Base::kFragmentsPerIteration > 1) {
+        warp_tile_iterator.add_pointer_offset(kSmemPointerOffset *
+                                              (1 - Base::kFragmentsPerIteration));
+      }
+    }
+    /// Runs helper<Seq * kFragmentsPerIteration> only for the pack element matching pos; && short-circuits the rest
+    CUTLASS_DEVICE
+    static void push(size_t pos,
+                     AccumulatorFragmentIterator const &iterator_begin,
+                     WarpTileIterator &warp_tile_iterator) {
+      int dummy[] = {
+          (pos == (Seq * Base::kFragmentsPerIteration)) &&
+          (helper<Seq * Base::kFragmentsPerIteration>(iterator_begin, warp_tile_iterator), 0)...};
+      // dummy only hosts the pack expansion; CUTLASS_UNUSED presumably silences the unused-variable warning
+      CUTLASS_UNUSED(dummy[0]);
+    }
+  };
+
+  /// Streams the result to global memory when the output operator does not need a source tensor
+  CUTLASS_DEVICE
+  void compute_source_not_needed_(
+    OutputOp const &output_op,                        ///< Output operator
+    BroadcastFragment const &broadcast_fragment,      ///< Per-thread broadcast values passed through to the output operator
+    OutputTileIterator destination_iterator,          ///< Tile iterator for destination
+    AccumulatorTile const &accumulators,              ///< Complete warp-level accumulator tile
+    TensorTileIterator tensor_iterator                ///< Threadblock tile iterator for additional tensor operand
+    ) { 
+
+    //
+    // Iterator over warp-level accumulator fragment
+    //
+
+    AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
+
+    //
+    // Iterate over accumulator tile, Base::kFragmentsPerIteration output fragments per trip
+    //
+
+    // Full unrolling is requested only when IterationsUnroll is non-zero; otherwise the factor is 1
+    #pragma unroll(IterationsUnroll ? OutputTileIterator::kIterations / Base::kFragmentsPerIteration : 1)
+    for (int iter = 0; iter < OutputTileIterator::kIterations; iter += Base::kFragmentsPerIteration) {
+
+      //
+      // Convert and store fragment
+      //
+      // Barrier before this trip's fragments are stored through warp_tile_iterator_
+      // (accum_fragment_iterator is never advanced here; push advances a copy according to iter)
+      __syncthreads();
+
+      acc2smem_source_not_needed<
+          cutlass::make_index_sequence<OutputTileIterator::kIterations /
+                                   Base::kFragmentsPerIteration>>::push(iter,
+                                                                        accum_fragment_iterator,
+                                                                        this->warp_tile_iterator_);
+      // Barrier between the stores above and the shared-memory loads below
+      __syncthreads();
+
+      //
+      // Load fragments from shared memory
+      //
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int p = 0; p < Base::kFragmentsPerIteration; ++p) {
+
+        // Slot 0 receives the loaded fragment and, on the last p, the sum over k-partitions
+        typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];
+
+        shared_load_iterator_.load(aligned_accum_fragment[0]);
+
+        if (p < Base::kFragmentsPerIteration - 1) {
+          shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
+        }
+        else if (kPartitionsK > 1) {
+          // Reached only for the last fragment of the trip (p == kFragmentsPerIteration - 1)
+          plus <typename SharedLoadIterator::Fragment> add_fragments;
+
+          CUTLASS_PRAGMA_UNROLL
+          for ( int i = 1; i < kPartitionsK; ++i) {
+            shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
+            shared_load_iterator_.load(aligned_accum_fragment[i]);
+            aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
+          }
+          // Rewind the (kPartitionsK - 1) pointer offsets applied by the loop above
+          shared_load_iterator_.add_pointer_offset((1 - kPartitionsK) * kSmemPointerOffset);
+        }
+
+        //
+        // Apply output operation
+        //
+
+        typename OutputTileIterator::Fragment frag_Z;
+        typename TensorTileIterator::Fragment frag_T;
+
+        apply_output_operator_source_not_needed_(
+          frag_Z,
+          frag_T,
+          output_op,
+          aligned_accum_fragment[0],
+          broadcast_fragment);
+
+        //
+        // Conditionally store fragments
+        //
+
+        if (OutputOp::kStoreZ) {
+          destination_iterator.store(frag_Z);
+          ++destination_iterator;
+        }
+
+        if (OutputOp::kStoreT) {
+          tensor_iterator.store(frag_T);
+          ++tensor_iterator;
+        }
+      }
+      // Rewind the offsets added between fragments so the next trip starts at the first slot
+      if (Base::kFragmentsPerIteration > 1) {
+        shared_load_iterator_.add_pointer_offset(kSmemPointerOffset * (1 - Base::kFragmentsPerIteration));
+      }
+    }
+  }
+
+
+  template<class Seq>
+  struct acc2smem_source_needed;
+  /// Stores the accumulator fragment whose index equals a run-time position through the warp tile iterator
+  template <size_t... Seq>
+  struct acc2smem_source_needed<cutlass::index_sequence<Seq...>> {
+    /// Advances a copy of the iterator by Advance fragments, then loads and stores that single fragment
+    template<int Advance>
+    CUTLASS_DEVICE static void helper(AccumulatorFragmentIterator accum_fragment_iterator,
+                                      WarpTileIterator &warp_tile_iterator) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < Advance; i++) {
+        ++accum_fragment_iterator;
+      }
+      typename AccumulatorFragmentIterator::Fragment accum_fragment;
+      accum_fragment_iterator.load(accum_fragment);
+      warp_tile_iterator.store(accum_fragment);
+    }
+    /// Invokes helper<Seq> only for the pack element equal to pos; && short-circuits all other elements
+    CUTLASS_DEVICE
+    static void push(size_t pos,
+                     AccumulatorFragmentIterator const &iterator_begin,
+                     WarpTileIterator &warp_tile_iterator) {
+      int dummy[] = {(pos == Seq) && (helper<Seq>(iterator_begin, warp_tile_iterator), 0)...};
+      // dummy only hosts the pack expansion; mark it used, matching acc2smem_source_not_needed::push
+      CUTLASS_UNUSED(dummy[0]);
+    }
+  };
+
+  
+  /// Streams the result to global memory when the output operator reads both source tensors
+  CUTLASS_DEVICE
+  void compute_source_needed_(
+    OutputOp const &output_op,                    ///< Output operator
+    BroadcastFragment const &broadcast_fragment,  ///< Per-thread broadcast values passed through to the output operator
+    OutputTileIterator destination_iterator,      ///< Tile iterator for destination
+    AccumulatorTile const &accumulators,          ///< Complete warp-level accumulator tile
+    OutputTileIterator source_iterator1,          ///< Tile iterator for first source accumulator matrix
+    OutputTileIterator source_iterator2,          ///< Tile iterator for second source accumulator matrix
+    TensorTileIterator tensor_iterator            ///< Threadblock tile iterator for additional tensor operand
+    ) { 
+    
+    typename OutputTileIterator::Fragment source_fragment1;
+    source_fragment1.clear();
+    typename OutputTileIterator::Fragment source_fragment2;
+    source_fragment2.clear();
+
+    //
+    // Iterator over warp-level accumulator fragment
+    //
+
+    AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
+
+    //
+    // Iterate over accumulator tile, one output fragment per trip
+    //
+
+    #pragma unroll(IterationsUnroll ? OutputTileIterator::kIterations : 1)
+    for (int iter = 0; iter < OutputTileIterator::kIterations; ++iter) {
+
+      //
+      // Load the source
+      //
+
+      source_iterator1.load(source_fragment1);
+      ++source_iterator1;
+
+      source_iterator2.load(source_fragment2);
+      ++source_iterator2;
+
+      //
+      // Convert and store fragment
+      //
+      // Barrier before fragment number iter is stored through warp_tile_iterator_
+      __syncthreads();
+      // accum_fragment_iterator is never advanced here; push advances a copy according to iter
+      acc2smem_source_needed<cutlass::make_index_sequence<OutputTileIterator::kIterations>>::push(
+          iter, accum_fragment_iterator, this->warp_tile_iterator_);
+      // Barrier between the store above and the shared-memory loads below
+      __syncthreads();
+
+      //
+      // Load fragments from shared memory
+      //
+
+      typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];
+
+      shared_load_iterator_.load(aligned_accum_fragment[0]);
+
+      // If the number of k-slices is > 1 - perform a reduction amongst the k-slices
+      if (kPartitionsK > 1)
+      {
+        plus <typename SharedLoadIterator::Fragment> add_fragments;
+        const int tile_row_offset = Base::SharedStorage::StorageShape::kRow / PartitionsK;
+        // Step through the remaining k-slices by tile rows, summing each into fragment 0
+        CUTLASS_PRAGMA_UNROLL
+        for ( int i = 1; i < kPartitionsK; ++i) {
+          shared_load_iterator_.add_tile_offset({tile_row_offset , 0});
+          shared_load_iterator_.load(aligned_accum_fragment[i]);
+          aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
+        }
+        // Rewind the (kPartitionsK - 1) tile offsets applied by the loop above
+        shared_load_iterator_.add_tile_offset({-1 * (kPartitionsK-1) * tile_row_offset, 0});
+      }
+
+      //
+      // Apply output operation
+      //
+
+      typename OutputTileIterator::Fragment frag_Z;
+      typename TensorTileIterator::Fragment frag_T;
+
+      apply_output_operator_(
+        frag_Z,
+        frag_T,
+        output_op,
+        aligned_accum_fragment[0],
+        source_fragment1,
+        source_fragment2,
+        broadcast_fragment);
+
+      //
+      // Conditionally store fragments
+      //
+
+      if (OutputOp::kStoreZ) {
+        destination_iterator.store(frag_Z);
+        ++destination_iterator;
+      }
+
+      if (OutputOp::kStoreT) {
+        tensor_iterator.store(frag_T);
+        ++tensor_iterator;
+      }
+    }
+  }
+
+  /// Applies the output functor to each access-sized vector of the two-source fragments
+  CUTLASS_DEVICE
+  void apply_output_operator_(
+    typename OutputTileIterator::Fragment &frag_Z,
+    typename TensorTileIterator::Fragment &frag_T,
+    OutputOp const &output_op,
+    typename SharedLoadIterator::Fragment const &frag_AB,
+    typename OutputTileIterator::Fragment const &frag_C1,
+    typename OutputTileIterator::Fragment const &frag_C2,
+    BroadcastFragment const &frag_Broadcast) {
+
+    // Vector types through which the fragments are viewed, kElementsPerAccess elements apiece
+    using VectorZ = Array<typename OutputTileIterator::Element, kElementsPerAccess>;
+    using VectorT = Array<typename TensorTileIterator::Element, kElementsPerAccess>;
+    using VectorBroadcast = Array<ElementCompute, kElementsPerAccess>;
+
+    // Number of vectors in one output fragment
+    int const kVectorCount =
+      OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
+
+    // Mutable views of the two result fragments
+    auto *z_vectors = reinterpret_cast<VectorZ *>(&frag_Z);
+    auto *t_vectors = reinterpret_cast<VectorT *>(&frag_T);
+
+    // Read-only views of the operand fragments
+    auto const *ab_vectors = reinterpret_cast<AccumulatorAccessType const *>(&frag_AB);
+    auto const *c1_vectors = reinterpret_cast<OutputAccessType const *>(&frag_C1);
+    auto const *c2_vectors = reinterpret_cast<OutputAccessType const *>(&frag_C2);
+    auto const *broadcast_vectors = reinterpret_cast<VectorBroadcast const *>(&frag_Broadcast);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int v = 0; v < kVectorCount; ++v) {
+      // The broadcast index wraps every ThreadMap::Iterations::kColumn vectors
+      VectorBroadcast const &broadcast = broadcast_vectors[v % ThreadMap::Iterations::kColumn];
+
+      output_op(
+        z_vectors[v],
+        t_vectors[v],
+        ab_vectors[v],
+        c1_vectors[v],
+        c2_vectors[v],
+        broadcast);
+    }
+  }
+
+  /// Applies the output functor to each access-sized vector when no source fragment is supplied
+  CUTLASS_DEVICE
+  void apply_output_operator_source_not_needed_(
+    typename OutputTileIterator::Fragment &frag_Z,
+    typename TensorTileIterator::Fragment &frag_T,
+    OutputOp const &output_op,
+    typename SharedLoadIterator::Fragment const &frag_AB,
+    BroadcastFragment const &frag_Broadcast) {
+
+    // Vector types through which the fragments are viewed, kElementsPerAccess elements apiece
+    using VectorZ = Array<typename OutputTileIterator::Element, kElementsPerAccess>;
+    using VectorT = Array<typename TensorTileIterator::Element, kElementsPerAccess>;
+    using VectorBroadcast = Array<ElementCompute, kElementsPerAccess>;
+
+    // Number of vectors in one output fragment
+    int const kVectorCount =
+      OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
+
+    // Mutable views of the two result fragments
+    auto *z_vectors = reinterpret_cast<VectorZ *>(&frag_Z);
+    auto *t_vectors = reinterpret_cast<VectorT *>(&frag_T);
+
+    // Read-only views of the operand fragments
+    auto const *ab_vectors = reinterpret_cast<AccumulatorAccessType const *>(&frag_AB);
+    auto const *broadcast_vectors = reinterpret_cast<VectorBroadcast const *>(&frag_Broadcast);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int v = 0; v < kVectorCount; ++v) {
+
+      // The broadcast index wraps every ThreadMap::Iterations::kColumn vectors
+      VectorBroadcast const &broadcast = broadcast_vectors[v % ThreadMap::Iterations::kColumn];
+
+      output_op(z_vectors[v], t_vectors[v], ab_vectors[v], broadcast);
+    }
+  }
+
+  public:
+    /// Stream-K reduce helper: applies the output operator to one fragment read from shared memory
+    CUTLASS_DEVICE
+    void reduce(
+        int reduce_fragment_idx,                        ///< Reduce fragment index; added to each global-memory iterator before it is used
+        OutputOp const &output_op,                      ///< Output operator
+        ElementVector const * broadcast_ptr,            ///< Broadcast vector
+        OutputTileIterator destination_iterator,        ///< Tile iterator for destination
+        OutputTileIterator source_iterator1,            ///< Tile iterator for first  source accumulator matrix
+        OutputTileIterator source_iterator2,            ///< Tile iterator for second source accumulator matrix
+        TensorTileIterator tensor_iterator,             ///< Threadblock tile iterator for additional tensor operand
+        MatrixCoord const &problem_size =               ///< Problem size needed to guard against out-of-bounds accesses
+            MatrixCoord(Shape::kM, Shape::kN),
+        MatrixCoord const &threadblock_offset =         ///< Threadblock's initial offset within the problem size space
+            MatrixCoord()) 
+    {
+      // No accumulator tile is taken: the fragment is read directly through shared_load_iterator_
+      BroadcastFragment broadcast_fragment;
+      load_broadcast_fragment_(broadcast_fragment, broadcast_ptr, problem_size, threadblock_offset);
+
+      // Initialize/load source-fragment data
+      typename OutputTileIterator::Fragment source_fragment1;
+      source_fragment1.clear();
+      typename OutputTileIterator::Fragment source_fragment2;
+      source_fragment2.clear();
+
+      if (output_op.is_source_needed())
+      {
+        source_iterator1 += reduce_fragment_idx;
+        source_iterator1.load(source_fragment1);
+        // The second source is advanced to the same fragment index as the first
+        source_iterator2 += reduce_fragment_idx;
+        source_iterator2.load(source_fragment2);
+      }
+
+      // Load fragment from shared memory
+      typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];
+      shared_load_iterator_.load(aligned_accum_fragment[0]);
+
+      // Add fragments shared by other k partitions
+      if (kPartitionsK > 1)
+      {
+        plus <typename SharedLoadIterator::Fragment> add_fragments;
+        // NOTE(review): unlike the compute_* paths, the pointer offsets added here are not rewound afterwards
+        CUTLASS_PRAGMA_UNROLL
+        for ( int i = 1; i < kPartitionsK; ++i) {
+          shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
+          shared_load_iterator_.load(aligned_accum_fragment[i]);
+          aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
+        }
+      }
+
+      //
+      // Apply output operation (the source fragments stay zero-filled when the source is not needed)
+      //
+
+      typename OutputTileIterator::Fragment frag_Z;
+      typename TensorTileIterator::Fragment frag_T;
+
+      if (!output_op.is_source_needed()) {
+        apply_output_operator_source_not_needed_(
+          frag_Z,
+          frag_T,
+          output_op,
+          aligned_accum_fragment[0],
+          broadcast_fragment);
+      } else {
+        apply_output_operator_(
+          frag_Z,
+          frag_T,
+          output_op,
+          aligned_accum_fragment[0],
+          source_fragment1,
+          source_fragment2,
+          broadcast_fragment);
+      }
+
+      //
+      // Conditionally store fragments at position reduce_fragment_idx
+      //
+
+      if (OutputOp::kStoreZ) {
+        destination_iterator += reduce_fragment_idx;
+        destination_iterator.store(frag_Z);
+      }
+
+      if (OutputOp::kStoreT) {
+        tensor_iterator += reduce_fragment_idx;
+        tensor_iterator.store(frag_T);
+      }
+    }
+};
+
+
+template <
+  typename Shape_,
+  typename WarpMmaOperator_,
+  int PartitionsK,
+  typename OutputTileIterator_,
+  typename TensorTileIterator_,
+  typename ElementVector_,
+  typename AccumulatorFragmentIterator_,
+  typename WarpTileIterator_,
+  typename SharedLoadIterator_,
+  typename OutputOp_,
+  typename Padding_,
+  int FragmentsPerPartition,
+  int IterationsUnroll
+>
+class EpilogueWithBroadcast<
+  Shape_,
+  WarpMmaOperator_,
+  PartitionsK,
+  OutputTileIterator_,
+  TensorTileIterator_,
+  ElementVector_,
+  AccumulatorFragmentIterator_,
+  WarpTileIterator_,
+  SharedLoadIterator_,
+  OutputOp_,
+  Padding_,
+  FragmentsPerPartition,
+  IterationsUnroll,
+  true
+> : 
+  public EpilogueBase<
+    Shape_, 
+    typename WarpMmaOperator_::Shape, 
+    PartitionsK, 
+    AccumulatorFragmentIterator_, 
+    WarpTileIterator_, 
+    Padding_,
+    FragmentsPerPartition> {
+
+public:
+
+  using Base = EpilogueBase<
+    Shape_, 
+    typename WarpMmaOperator_::Shape, 
+    PartitionsK, 
+    AccumulatorFragmentIterator_, 
+    WarpTileIterator_, 
+    Padding_,
+    FragmentsPerPartition>;
+
+  static bool const kIsSingleSource = true;
+  using Shape = Shape_;
+  using WarpMmaOperator = WarpMmaOperator_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputTileIterator = OutputTileIterator_;
+  using TensorTileIterator = TensorTileIterator_;
+  using ElementVector = ElementVector_;
+  using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
+  using WarpTileIterator = WarpTileIterator_;
+  using SharedLoadIterator = SharedLoadIterator_;
+  using OutputOp = OutputOp_;
+  using Padding = Padding_;
+
+  using Layout = layout::RowMajor;
+  using LongIndex = typename Layout::LongIndex;
+
+  /// The complete warp-level accumulator tile
+  using AccumulatorTile = typename Base::AccumulatorTile;
+
+  /// Accumulator element
+  using ElementAccumulator = typename WarpTileIterator::Element;
+
+  /// Compute data type produced by the output op
+  using ElementCompute = typename OutputOp::ElementCompute;
+
+  /// Compute fragment
+  using FragmentCompute = Array<ElementCompute, OutputTileIterator::Fragment::kElements>;
+
+  /// Thread map used by output tile iterators
+  using ThreadMap = typename OutputTileIterator::ThreadMap;
+
+  /// Fragment object used to store the broadcast values
+  using BroadcastFragment = Array<
+    ElementCompute, 
+    ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess>;
+
+  /// Output element
+  using ElementOutput = typename OutputTileIterator::Element;
+
+  /// Data type of additional tensor
+  using ElementTensor = typename TensorTileIterator::Element;
+
+  /// Output access size
+  static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
+
+  /// Tensor reference to destination tensor
+  using TensorRef = typename OutputTileIterator::TensorRef;
+
+  /// Tensor reference to sync tensor
+  using SyncTensorRef = typename cutlass::TensorRef<int, cutlass::layout::PackedVectorLayout>;
+
+  /// Const tensor reference to source tensor
+  using ConstTensorRef = typename OutputTileIterator::ConstTensorRef;
+
+  /// Array type used to output
+  using OutputAccessType = Array<
+    typename OutputTileIterator::Element, OutputTileIterator::kElementsPerAccess>;
+
+  /// Array type used by output functor
+  using AccumulatorAccessType = Array<typename WarpTileIterator::Element, OutputTileIterator::kElementsPerAccess>; 
+
+  /// Array type used by output functor
+  using ComputeAccessType = Array<ElementCompute, OutputTileIterator::kElementsPerAccess>;
+
+  /// Tensor access type
+  using TensorAccessType = Array<ElementTensor, OutputTileIterator::kElementsPerAccess>;
+  
+  /// Number of warps
+  using WarpCount = typename Base::WarpCount;
+
+  /// Shared memory allocation from epilogue base class
+  using BaseSharedStorage = typename Base::SharedStorage;
+
+  static int constexpr kSmemTiles = Base::kFragmentsPerIteration > 1 ? Base::kFragmentsPerIteration : kPartitionsK;
+  static int constexpr kSmemPointerOffset = Base::SharedStorage::StorageShape::kCount / kSmemTiles;
+
+  /// Compile-time quantities derived from ThreadMap, Shape and WarpCount, used for the broadcast
+  struct BroadcastDetail {
+
+    /// Number of threads per warp
+    static int const kWarpSize = 32;
+    /// Elements per access, taken from the output thread map
+    static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
+
+    /// Number of distinct scalar column indices handled by each thread
+    static int const kColumnsPerThread = ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess;
+
+    /// Number of distinct scalar row indices handled by each thread
+    static int const kRowsPerThread = ThreadMap::Iterations::kCount / ThreadMap::Iterations::kColumn;
+
+    /// Number of threads per threadblock
+    static int const kThreadCount = kWarpSize * WarpCount::kCount;
+
+    /// Number of distinct threads per row of output tile
+    static int const kThreadsPerRow = (Shape::kN / kColumnsPerThread);
+
+    /// Number of distinct threads which must be reduced during the final reduction phase within the threadblock.
+    static int const kThreadRows = kThreadCount / kThreadsPerRow;
+
+    /// Ceiling of Shape::kN / kThreadCount, clamped to at least 1
+    static int const kThreadAccessesPerRow = const_max(1, (Shape::kN + kThreadCount - 1) / kThreadCount);
+
+    /// Matrix shape of kThreadRows x Shape::kN (described upstream as the epilogue's shared memory allocation)
+    using StorageShape = MatrixShape<
+      kThreadRows,
+      Shape::kN
+    >;
+
+    /// Debug printing; the body is compiled out by `#if 0`, so this is currently a no-op
+    CUTLASS_DEVICE
+    static void print() {
+#if 0
+      printf("BroadcastDetail {\n");
+      printf(
+        "  kColumnsPerThread: %d\nkRowsPerThread: %d\n,kThreadCount: %d\nkThreadsPerRow: %d\n"
+        "kThreadRows: %d\nThreadAccessesPerRow: %d\nStorageShape: %d x %d (count: %d)\n",
+        kColumnsPerThread,
+        kRowsPerThread,
+        kThreadCount,
+        kThreadsPerRow,
+        kThreadRows,
+        kThreadAccessesPerRow,
+        StorageShape::kRow,
+        StorageShape::kColumn,
+        StorageShape::kCount
+      );
+      printf("};\n");
+#endif
+    }
+  };
+
+  /// Shared storage structure (shadows base); the union currently holds only the base epilogue storage
+  struct SharedStorage {
+    union {
+      BaseSharedStorage base;
+    };
+    /// Empty user-provided constructor: it does not initialize the union member
+    CUTLASS_HOST_DEVICE
+    SharedStorage() { }
+  };
+
+public:
+
+
+  static_assert(SharedLoadIterator::Fragment::kElements == OutputTileIterator::Fragment::kElements,
+    "Mismatch between shared load iterator and output tile iterator.");
+
+  static_assert(OutputTileIterator::kElementsPerAccess, "OutputTileIterator::kElementsPerAccess must not be zero.");
+
+  static_assert(!(OutputTileIterator::Fragment::kElements % OutputTileIterator::kElementsPerAccess), 
+    "Divisibility");
+
+private:
+
+  /// Loads fragment from shared memory aligned with output tensor
+  SharedLoadIterator shared_load_iterator_;
+
+  /// Thread index within the threadblock
+  int thread_idx_;
+
+public:
+
+  /// Constructor: forwards to the base epilogue, binds the shared load iterator, and records thread_idx
+  CUTLASS_DEVICE
+  EpilogueWithBroadcast(
+    SharedStorage &shared_storage,                    ///< Shared storage object; only its `base` member is used
+    int thread_idx,                                   ///< ID of a thread within the threadblock
+    int warp_idx,                                     ///< ID of warp within threadblock
+    int lane_idx                                      ///< Id of thread within warp
+  ):
+    Base(shared_storage.base, thread_idx, warp_idx, lane_idx),
+    shared_load_iterator_(shared_storage.base.reference(), thread_idx),
+    thread_idx_(thread_idx)
+  {
+    // Nothing to do here: every member is set in the initializer list above
+  }
+
+  /// Runs the epilogue for one tile: loads the broadcast values, then dispatches on
+  /// whether the output operator needs the source tensor
+  CUTLASS_DEVICE
+  void operator()(
+    OutputOp const &output_op,                        ///< Output operator
+    ElementVector const * broadcast_ptr,              ///< Broadcast vector
+    OutputTileIterator destination_iterator,          ///< Tile iterator for destination
+    AccumulatorTile const &accumulators,              ///< Complete warp-level accumulator tile
+    OutputTileIterator source_iterator,               ///< Tile iterator for source accumulator matrix
+    TensorTileIterator tensor_iterator,               ///< Threadblock tile iterator for additional tensor operand
+    MatrixCoord const &problem_size =                 ///< Problem size needed to guard against out-of-bounds accesses
+        MatrixCoord(Shape::kM, Shape::kN),
+    MatrixCoord const &threadblock_offset =           ///< Threadblock's initial offset within the problem size space
+        MatrixCoord()) {
+
+    // Per-thread broadcast values (all zeros when broadcast_ptr is null)
+    BroadcastFragment broadcast_fragment;
+    load_broadcast_fragment_(broadcast_fragment, broadcast_ptr, problem_size, threadblock_offset);
+
+    if (output_op.is_source_needed()) {
+      // Path that receives source_iterator
+      compute_source_needed_(
+        output_op,
+        broadcast_fragment,
+        destination_iterator,
+        accumulators,
+        source_iterator,
+        tensor_iterator);
+      return;
+    }
+
+    // Path that does not receive source_iterator
+    compute_source_not_needed_(
+      output_op, broadcast_fragment, destination_iterator,
+      accumulators, tensor_iterator);
+  }
+
+private:
+
+  CUTLASS_DEVICE
+  void load_broadcast_fragment_(
+    BroadcastFragment & broadcast_fragment,      ///< [out] Receives the broadcast vector elements converted to ElementCompute
+    ElementVector const * broadcast_ptr,         ///< Broadcast vector; may be null, in which case the fragment stays zero
+    MatrixCoord const &problem_size,             ///< Problem size needed to guard against out-of-bounds accesses
+    MatrixCoord const &threadblock_offset        ///< Threadblock's initial offset within the problem size space
+    ) {
+    // Fills broadcast_fragment with this thread's slice of the broadcast vector, starting from all zeros
+    broadcast_fragment.clear();
+    
+    // If no pointer is supplied, set with all zeros and avoid memory accesses
+    if (!broadcast_ptr) {
+      return;
+    }
+    // Column of this thread's first access inside the tile, as given by the output thread map
+    int thread_initial_column = ThreadMap::initial_offset(thread_idx_).column();
+    // The bounds-check index includes threadblock_offset; the pointer advances by the in-tile column only
+    int thread_column_idx = threadblock_offset.column() + thread_initial_column;
+    broadcast_ptr += thread_initial_column;
+
+    NumericArrayConverter<ElementCompute, ElementVector, BroadcastDetail::kElementsPerAccess> converter;
+    using AccessType = AlignedArray<ElementVector, BroadcastDetail::kElementsPerAccess>;
+    using ComputeFragmentType = Array<ElementCompute, BroadcastDetail::kElementsPerAccess>;
+    // View the fragment as one converted vector per column iteration
+    ComputeFragmentType *frag_ptr = reinterpret_cast<ComputeFragmentType *>(&broadcast_fragment);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < ThreadMap::Iterations::kColumn; ++j) {
+      // Start from zeros so a skipped load contributes a zero vector
+      AccessType loaded;
+
+      loaded.clear();
+      // Only the first column of each vector access is compared against the problem width
+      if (thread_column_idx < problem_size.column()) {
+        loaded = *reinterpret_cast<AccessType const *>(broadcast_ptr);
+      }
+
+      ComputeFragmentType cvt = converter(loaded);
+      frag_ptr[j] = cvt;
+      // Step the bounds-check index and the pointer together to the next column iteration
+      thread_column_idx += ThreadMap::Delta::kColumn;
+      broadcast_ptr += ThreadMap::Delta::kColumn;
+    }
+  }
+
+  template <class Seq>
+  struct acc2smem_source_not_needed;
+  /// Specialization whose push() expands one helper instantiation per index in Seq
+  template <size_t... Seq>
+  struct acc2smem_source_not_needed<cutlass::index_sequence<Seq...>> {
+    template <int Advance>
+    CUTLASS_DEVICE static void helper(AccumulatorFragmentIterator accum_fragment_iterator,
+                                      WarpTileIterator &warp_tile_iterator) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < Advance; i++) {
+        ++accum_fragment_iterator;
+      }
+      // The by-value iterator copy is now Advance fragments in; store the next kFragmentsPerIteration fragments
+      CUTLASS_PRAGMA_UNROLL
+      for (int p = 0; p < Base::kFragmentsPerIteration; ++p) {
+        typename AccumulatorFragmentIterator::Fragment accum_fragment;
+
+        accum_fragment_iterator.load(accum_fragment);
+        ++accum_fragment_iterator;
+
+        warp_tile_iterator.store(accum_fragment);
+        if (p < Base::kFragmentsPerIteration - 1) {
+          warp_tile_iterator.add_pointer_offset(kSmemPointerOffset);
+        }
+      }
+      // Undo the pointer offsets applied in the loop so the iterator ends where it started
+      if (Base::kFragmentsPerIteration > 1) {
+        warp_tile_iterator.add_pointer_offset(kSmemPointerOffset *
+                                              (1 - Base::kFragmentsPerIteration));
+      }
+    }
+    /// Runs helper<Seq * kFragmentsPerIteration> only for the pack element matching pos; && short-circuits the rest
+    CUTLASS_DEVICE
+    static void push(size_t pos,
+                     AccumulatorFragmentIterator const &iterator_begin,
+                     WarpTileIterator &warp_tile_iterator) {
+      int dummy[] = {
+          (pos == (Seq * Base::kFragmentsPerIteration)) &&
+          (helper<Seq * Base::kFragmentsPerIteration>(iterator_begin, warp_tile_iterator), 0)...};
+      // dummy only hosts the pack expansion; CUTLASS_UNUSED presumably silences the unused-variable warning
+      CUTLASS_UNUSED(dummy[0]);
+    }
+  };
+
+  /// Streams the result to global memory when the output operator does not need a source tensor
+  CUTLASS_DEVICE
+  void compute_source_not_needed_(
+    OutputOp const &output_op,                        ///< Output operator
+    BroadcastFragment const &broadcast_fragment,      ///< Per-thread broadcast values passed through to the output operator
+    OutputTileIterator destination_iterator,          ///< Tile iterator for destination
+    AccumulatorTile const &accumulators,              ///< Complete warp-level accumulator tile
+    TensorTileIterator tensor_iterator                ///< Threadblock tile iterator for additional tensor operand
+    ) { 
+
+    //
+    // Iterator over warp-level accumulator fragment
+    //
+
+    AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
+
+    //
+    // Iterate over accumulator tile, Base::kFragmentsPerIteration output fragments per trip
+    //
+
+    // Full unrolling is requested only when IterationsUnroll is non-zero; otherwise the factor is 1
+    #pragma unroll(IterationsUnroll ? OutputTileIterator::kIterations / Base::kFragmentsPerIteration : 1)
+    for (int iter = 0; iter < OutputTileIterator::kIterations; iter += Base::kFragmentsPerIteration) {
+
+      //
+      // Convert and store fragment
+      //
+      // Barrier before this trip's fragments are stored through warp_tile_iterator_
+      // (accum_fragment_iterator is never advanced here; push advances a copy according to iter)
+      __syncthreads();
+
+      acc2smem_source_not_needed<
+          cutlass::make_index_sequence<OutputTileIterator::kIterations /
+                                   Base::kFragmentsPerIteration>>::push(iter,
+                                                                        accum_fragment_iterator,
+                                                                        this->warp_tile_iterator_);
+      // Barrier between the stores above and the shared-memory loads below
+      __syncthreads();
+
+      //
+      // Load fragments from shared memory
+      //
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int p = 0; p < Base::kFragmentsPerIteration; ++p) {
+
+        // Slot 0 receives the loaded fragment and, on the last p, the sum over k-partitions
+        typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];
+
+        shared_load_iterator_.load(aligned_accum_fragment[0]);
+
+        if (p < Base::kFragmentsPerIteration - 1) {
+          shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
+        }
+        else if (kPartitionsK > 1) {
+          // Reached only for the last fragment of the trip (p == kFragmentsPerIteration - 1)
+          plus <typename SharedLoadIterator::Fragment> add_fragments;
+
+          CUTLASS_PRAGMA_UNROLL
+          for ( int i = 1; i < kPartitionsK; ++i) {
+            shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
+            shared_load_iterator_.load(aligned_accum_fragment[i]);
+            aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
+          }
+          // Rewind the (kPartitionsK - 1) pointer offsets applied by the loop above
+          shared_load_iterator_.add_pointer_offset((1 - kPartitionsK) * kSmemPointerOffset);
+        }
+
+        //
+        // Apply output operation
+        //
+
+        typename OutputTileIterator::Fragment frag_Z;
+        typename TensorTileIterator::Fragment frag_T;
+
+        apply_output_operator_source_not_needed_(
+          frag_Z,
+          frag_T,
+          output_op,
+          aligned_accum_fragment[0],
+          broadcast_fragment);
+
+        //
+        // Conditionally store fragments
+        //
+
+        if (OutputOp::kStoreZ) {
+          destination_iterator.store(frag_Z);
+          ++destination_iterator;
+        }
+
+        if (OutputOp::kStoreT) {
+          tensor_iterator.store(frag_T);
+          ++tensor_iterator;
+        }
+      }
+      // Rewind the offsets added between fragments so the next trip starts at the first slot
+      if (Base::kFragmentsPerIteration > 1) {
+        shared_load_iterator_.add_pointer_offset(kSmemPointerOffset * (1 - Base::kFragmentsPerIteration));
+      }
+    }
+  }
+
+
+  template<class Seq>
+  struct acc2smem_source_needed;  // Primary template is only declared; the index_sequence specialization follows.
+  /// Maps a runtime position onto helper<Advance>, whose Advance is a compile-time constant.
+  template <size_t... Seq>
+  struct acc2smem_source_needed<cutlass::index_sequence<Seq...>> {
+    template<int Advance>
+    CUTLASS_DEVICE
+    static void helper(AccumulatorFragmentIterator accum_fragment_iterator,
+                       WarpTileIterator &warp_tile_iterator) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < Advance; i++) {
+        ++accum_fragment_iterator;
+      }
+      // The iterator was passed by value, so only this local copy has been advanced.
+      typename AccumulatorFragmentIterator::Fragment accum_fragment;
+      accum_fragment_iterator.load(accum_fragment);
+      warp_tile_iterator.store(accum_fragment);
+    }
+    /// Calls helper<Seq> only for the Seq equal to pos; && short-circuits every other pack element.
+    CUTLASS_DEVICE
+    static void push(size_t pos,
+                     AccumulatorFragmentIterator const &iterator_begin,
+                     WarpTileIterator &warp_tile_iterator) {
+      int dummy[] = {(pos == Seq) && (helper<Seq>(iterator_begin, warp_tile_iterator), 0)...};
+    }
+  };
+
+  
+  /// Streams the result to global memory, passing each source fragment to the output operator
+  CUTLASS_DEVICE
+  void compute_source_needed_(
+    OutputOp const &output_op,                    ///< Output operator
+    BroadcastFragment const &broadcast_fragment,  ///< Fragment holding the broadcast values handed to the output operator
+    OutputTileIterator destination_iterator,      ///< Tile iterator for destination
+    AccumulatorTile const &accumulators,          ///< Complete warp-level accumulator tile
+    OutputTileIterator source_iterator,           ///< Tile iterator for source accumulator matrix
+    TensorTileIterator tensor_iterator            ///< Threadblock tile iterator for additional tensor operand
+    ) { 
+    // Cleared once here; every loop iteration below reloads it from source_iterator.
+    typename OutputTileIterator::Fragment source_fragment;
+    source_fragment.clear();
+
+    //
+    // Iterator over warp-level accumulator fragment
+    //
+
+    AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
+
+    //
+    // Iterate over accumulator tile, one output-tile iteration per pass
+    //
+
+    #pragma unroll(IterationsUnroll ? OutputTileIterator::kIterations : 1)
+    for (int iter = 0; iter < OutputTileIterator::kIterations; ++iter) {
+
+      //
+      // Load this iteration's source fragment and advance the source iterator
+      //
+
+      source_iterator.load(source_fragment);
+      ++source_iterator;
+
+      //
+      // Store the accumulator fragment selected by iter through the warp tile iterator;
+      // the barriers on both sides separate these stores from the surrounding loads.
+      
+      __syncthreads();
+
+      acc2smem_source_needed<cutlass::make_index_sequence<OutputTileIterator::kIterations>>::push(
+          iter, accum_fragment_iterator, this->warp_tile_iterator_);
+
+      __syncthreads();
+
+      //
+      // Load fragments from shared memory
+      //
+
+      typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];
+
+      shared_load_iterator_.load(aligned_accum_fragment[0]);
+
+      // If the number of k-slices is > 1 - perform a reduction amongst the k-slices
+      if (kPartitionsK > 1)
+      {
+        plus <typename SharedLoadIterator::Fragment> add_fragments;
+        const int tile_row_offset = Base::SharedStorage::StorageShape::kRow / PartitionsK;
+
+        CUTLASS_PRAGMA_UNROLL
+        for ( int i = 1; i < kPartitionsK; ++i) {
+          shared_load_iterator_.add_tile_offset({tile_row_offset , 0});
+          shared_load_iterator_.load(aligned_accum_fragment[i]);
+          aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
+        }
+        // Undo the (kPartitionsK - 1) tile offsets applied by the loop above.
+        shared_load_iterator_.add_tile_offset({-1 * (kPartitionsK-1) * tile_row_offset, 0});
+      }
+
+      //
+      // Apply output operation to the k-reduced accumulators, the source and the broadcast fragment
+      //
+
+      typename OutputTileIterator::Fragment frag_Z;
+      typename TensorTileIterator::Fragment frag_T;
+
+      apply_output_operator_(
+        frag_Z,
+        frag_T,
+        output_op,
+        aligned_accum_fragment[0],
+        source_fragment,
+        broadcast_fragment);
+
+      //
+      // Store Z and T only when the output operator enables them (kStoreZ / kStoreT)
+      //
+
+      if (OutputOp::kStoreZ) {
+        destination_iterator.store(frag_Z);
+        ++destination_iterator;
+      }
+
+      if (OutputOp::kStoreT) {
+        tensor_iterator.store(frag_T);
+        ++tensor_iterator;
+      }
+    }
+  }
+
+  /// Invokes the output functor once per access-sized vector of frag_Z, frag_T, frag_AB, frag_C and frag_Broadcast
+  CUTLASS_DEVICE
+  void apply_output_operator_(
+    typename OutputTileIterator::Fragment &frag_Z,
+    typename TensorTileIterator::Fragment &frag_T,
+    OutputOp const &output_op,
+    typename SharedLoadIterator::Fragment const &frag_AB,
+    typename OutputTileIterator::Fragment const &frag_C,
+    BroadcastFragment const &frag_Broadcast) {
+    // Vector types holding kElementsPerAccess elements each.
+    using AccessTypeZ = Array<typename OutputTileIterator::Element, kElementsPerAccess>;
+    using AccessTypeT = Array<typename TensorTileIterator::Element, kElementsPerAccess>;
+    using AccessTypeBroadcast = Array<ElementCompute, kElementsPerAccess>;
+    // Reinterpret each fragment as an array of vectors so it can be indexed per access.
+    AccessTypeZ *frag_Z_ptr = reinterpret_cast<AccessTypeZ *>(&frag_Z);
+    AccessTypeT *frag_T_ptr = reinterpret_cast<AccessTypeT *>(&frag_T);
+    
+    AccumulatorAccessType const *frag_AB_ptr = 
+      reinterpret_cast<AccumulatorAccessType const *>(&frag_AB);
+
+    OutputAccessType const *frag_C_ptr =
+      reinterpret_cast<OutputAccessType const *>(&frag_C);
+
+    AccessTypeBroadcast const *frag_Broadcast_ptr =
+      reinterpret_cast<AccessTypeBroadcast const *>(&frag_Broadcast);
+    // Number of access-sized vectors in one output fragment.
+    int const kOutputOpIterations = 
+      OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
+    // The broadcast vector index wraps every ThreadMap::Iterations::kColumn accesses.
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kOutputOpIterations; ++i) {
+        output_op(
+          frag_Z_ptr[i],
+          frag_T_ptr[i],
+          frag_AB_ptr[i],
+          frag_C_ptr[i],
+          frag_Broadcast_ptr[i % ThreadMap::Iterations::kColumn]);
+    }
+  }
+
+  /// Invokes the output functor once per access-sized vector of frag_Z, frag_T, frag_AB and frag_Broadcast (no source operand)
+  CUTLASS_DEVICE
+  void apply_output_operator_source_not_needed_(
+    typename OutputTileIterator::Fragment &frag_Z,
+    typename TensorTileIterator::Fragment &frag_T,
+    OutputOp const &output_op,
+    typename SharedLoadIterator::Fragment const &frag_AB,
+    BroadcastFragment const &frag_Broadcast) {
+    // Vector types holding kElementsPerAccess elements each.
+    using AccessTypeZ = Array<typename OutputTileIterator::Element, kElementsPerAccess>;
+    using AccessTypeT = Array<typename TensorTileIterator::Element, kElementsPerAccess>;
+    using AccessTypeBroadcast = Array<ElementCompute, kElementsPerAccess>;
+    // Reinterpret each fragment as an array of vectors so it can be indexed per access.
+    AccessTypeZ *frag_Z_ptr = reinterpret_cast<AccessTypeZ *>(&frag_Z);
+    AccessTypeT *frag_T_ptr = reinterpret_cast<AccessTypeT *>(&frag_T);
+    
+    AccumulatorAccessType const *frag_AB_ptr = 
+      reinterpret_cast<AccumulatorAccessType const *>(&frag_AB);
+
+    AccessTypeBroadcast const *frag_Broadcast_ptr =
+      reinterpret_cast<AccessTypeBroadcast const *>(&frag_Broadcast);
+    // Number of access-sized vectors in one output fragment.
+    int const kOutputOpIterations = 
+      OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
+    // The broadcast vector index wraps every ThreadMap::Iterations::kColumn accesses.
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kOutputOpIterations; ++i) {
+
+      output_op(
+        frag_Z_ptr[i], 
+        frag_T_ptr[i], 
+        frag_AB_ptr[i], 
+        frag_Broadcast_ptr[i % ThreadMap::Iterations::kColumn]);
+    }
+  }
+
+
+  public:
+    /// Stream-K reduce helper: applies the output operator to one fragment loaded from shared memory
+    CUTLASS_DEVICE
+    void reduce(
+        int reduce_fragment_idx,                        ///< Reduce fragment index; added to source_iterator when the source is needed
+        OutputOp const &output_op,                      ///< Output operator
+        ElementVector const * broadcast_ptr,            ///< Broadcast vector
+        OutputTileIterator destination_iterator,        ///< Tile iterator for destination
+        OutputTileIterator source_iterator,             ///< Tile iterator for source accumulator matrix
+        TensorTileIterator tensor_iterator,             ///< Threadblock tile iterator for additional tensor operand
+        MatrixCoord const &problem_size =               ///< Problem size needed to guard against out-of-bounds accesses
+            MatrixCoord(Shape::kM, Shape::kN),
+        MatrixCoord const &threadblock_offset =         ///< Threadblock's initial offset within the problem size space
+            MatrixCoord()) 
+    {
+
+      BroadcastFragment broadcast_fragment;
+      load_broadcast_fragment_(broadcast_fragment, broadcast_ptr, problem_size, threadblock_offset);
+
+      // Initialize the source fragment; it stays cleared unless the output operator needs the source
+      typename OutputTileIterator::Fragment source_fragment;
+      source_fragment.clear();
+
+      if (output_op.is_source_needed())
+      {
+        source_iterator += reduce_fragment_idx;
+        source_iterator.load(source_fragment);
+      }
+
+      // Load fragment from shared memory
+      typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];
+      shared_load_iterator_.load(aligned_accum_fragment[0]);
+
+      // Add fragments shared by other k partitions; the sum accumulates in aligned_accum_fragment[0]
+      if (kPartitionsK > 1)
+      {
+        plus <typename SharedLoadIterator::Fragment> add_fragments;
+
+        CUTLASS_PRAGMA_UNROLL
+        for ( int i = 1; i < kPartitionsK; ++i) {
+          shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
+          shared_load_iterator_.load(aligned_accum_fragment[i]);
+          aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
+        }
+      }
+      // NOTE(review): the pointer offsets added in the loop above are not undone afterwards - confirm this is intended.
+      //
+      // Apply output operation, with or without the source fragment
+      //
+
+      typename OutputTileIterator::Fragment frag_Z;
+      typename TensorTileIterator::Fragment frag_T;
+
+      if (!output_op.is_source_needed()) {
+        apply_output_operator_source_not_needed_(
+          frag_Z,
+          frag_T,
+          output_op,
+          aligned_accum_fragment[0],
+          broadcast_fragment);
+      } else {
+        apply_output_operator_(
+          frag_Z,
+          frag_T,
+          output_op,
+          aligned_accum_fragment[0],
+          source_fragment,
+          broadcast_fragment);
+      }
+
+      //
+      // Store Z and T only when the output operator enables them (kStoreZ / kStoreT)
+      //
+
+      if (OutputOp::kStoreZ) {
+        destination_iterator.store(frag_Z);
+        ++destination_iterator;
+      }
+
+      if (OutputOp::kStoreT) {
+        tensor_iterator.store(frag_T);
+        ++tensor_iterator;
+      }
+    }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_reduction.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_reduction.h
new file mode 100755
index 000000000..1d4c7016b
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_reduction.h
@@ -0,0 +1,823 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+
+  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
+
+  The epilogue rearranges the result of a matrix product through shared memory to match canonical
+  tensor layouts in global memory. Epilogues support conversion and reduction operations.
+
+*/
+
+#pragma once
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/cassert>
+#else
+#include <assert.h>
+#endif
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/tensor_coord.h"
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/functional.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/layout/vector.h"
+#include "cutlass/layout/tensor.h"
+
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/transform/pitch_linear_thread_map.h"
+#include "cutlass/transform/threadblock/regular_tile_iterator.h"
+
+#include "cutlass/epilogue/threadblock/epilogue_base.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Epilogue operator with reduction over each column 
+template <
+  typename Shape_,                          ///< Shape of threadblock tile (concept: GemmShape)
+  typename WarpMmaOperator_,                ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp)
+  int PartitionsK,                          ///< Number of partitions of the K dimension
+  typename OutputTileIterator_,             ///< Tile iterator reading and writing output tensors
+  typename TensorTileIterator_,             ///< Additional tile iterator for tensor-valued operands
+  typename ElementVector_,                  ///< Pointer to reduction vector
+  typename AccumulatorFragmentIterator_,    ///< Fragment iterator selecting accumulators
+  typename WarpTileIterator_,               ///< Warp-scoped tile iterator writing accumulators to SMEM
+  typename SharedLoadIterator_,             ///< Threadblock-scoped tile iterator loading from SMEM
+  typename OutputOp_,                       ///< Output operator
+  typename ReductionOp_,                    ///< Reduction operator
+  typename Padding_,                        ///< Padding added to SMEM allocation to avoid bank conflicts (concept: MatrixShape)
+  int IterationsUnroll =                    ///< Used to reduce binary size when epilogue op is large
+    (!IsEpilogueFunctorHeavy<OutputOp_>::value)
+>
+class EpilogueWithReduction : 
+  public EpilogueBase<
+    Shape_, 
+    typename WarpMmaOperator_::Shape, 
+    PartitionsK, 
+    AccumulatorFragmentIterator_, 
+    WarpTileIterator_, 
+    Padding_> {
+
+public:
+
+  using Base = EpilogueBase<
+    Shape_, 
+    typename WarpMmaOperator_::Shape, 
+    PartitionsK, 
+    AccumulatorFragmentIterator_, 
+    WarpTileIterator_, 
+    Padding_>;
+
+  using Shape = Shape_;
+  using WarpMmaOperator = WarpMmaOperator_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputTileIterator = OutputTileIterator_;
+  using TensorTileIterator = TensorTileIterator_;
+  using ElementVector = ElementVector_;
+  using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
+  using WarpTileIterator = WarpTileIterator_;
+  using SharedLoadIterator = SharedLoadIterator_;
+  using OutputOp = OutputOp_;
+  using ReductionOp = ReductionOp_;
+  using Padding = Padding_;
+
+  using Layout = layout::RowMajor;
+  using LongIndex = typename Layout::LongIndex;
+
+  static bool const kIsSingleSource = true;
+
+  /// The complete warp-level accumulator tile
+  using AccumulatorTile = typename Base::AccumulatorTile;
+
+  /// Accumulator element
+  using ElementAccumulator = typename WarpTileIterator::Element;
+
+  /// Compute data type produced by the output op
+  using ElementCompute = typename OutputOp::ElementCompute;
+
+  /// Compute fragment
+  using FragmentCompute = Array<ElementCompute, OutputTileIterator::Fragment::kElements>;
+
+  /// Thread map used by output tile iterators
+  using ThreadMap = typename OutputTileIterator::ThreadMap;
+
+  /// Fragment object used in reduction
+  using ReductionFragment = Array<
+    ElementAccumulator, 
+    ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess>;
+
+  /// Output element
+  using ElementOutput = typename OutputTileIterator::Element;
+
+  /// Data type of additional tensor
+  using ElementTensor = typename TensorTileIterator::Element;
+
+  /// Output access size
+  static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
+
+  /// Tensor reference to destination tensor
+  using TensorRef = typename OutputTileIterator::TensorRef;
+
+  /// Tensor reference to sync tensor
+  using SyncTensorRef = typename cutlass::TensorRef<int, cutlass::layout::PackedVectorLayout>;
+
+  /// Const tensor reference to source tensor
+  using ConstTensorRef = typename OutputTileIterator::ConstTensorRef;
+
+  /// Array type used to output
+  using OutputAccessType = Array<
+    typename OutputTileIterator::Element, OutputTileIterator::kElementsPerAccess>;
+
+  /// Array type used by output functor
+  using AccumulatorAccessType = Array<typename WarpTileIterator::Element, OutputTileIterator::kElementsPerAccess>; 
+
+  /// Array type used by output functor
+  using ComputeAccessType = Array<ElementCompute, OutputTileIterator::kElementsPerAccess>;
+
+  /// Tensor access type
+  using TensorAccessType = Array<ElementTensor, OutputTileIterator::kElementsPerAccess>;
+  
+  /// Number of warps
+  using WarpCount = typename Base::WarpCount;
+
+  /// Shared memory allocation from epilogue base class
+  using BaseSharedStorage = typename Base::SharedStorage;
+
+  /// Compile-time constants describing the thread arrangement and SMEM tile used for the reduction
+  struct ReductionDetail {
+
+    /// If true, accumulator coordinates are computed and out-of-bounds checks are enabled when
+    /// performing the reduction.
+    static bool const kOobCheck = false;
+
+    /// Number of threads per warp
+    static int const kWarpSize = 32;
+
+    /// Number of distinct scalar column indices handled by each thread
+    static int const kColumnsPerThread = ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess;
+
+    /// Number of distinct scalar row indices handled by each thread
+    static int const kRowsPerThread = ThreadMap::Iterations::kCount / ThreadMap::Iterations::kColumn;
+
+    /// Number of threads per threadblock
+    static int const kThreadCount = kWarpSize * WarpCount::kCount;
+
+    /// Number of distinct threads per row of output tile
+    static int const kThreadsPerRow = (Shape::kN / kColumnsPerThread);
+
+    /// Number of distinct threads which must be reduced during the final reduction phase within the threadblock.
+    static int const kThreadRows = kThreadCount / kThreadsPerRow;
+
+    /// Columns each thread visits so that kThreadCount threads cover Shape::kN columns: ceil(kN / kThreadCount), at least 1.
+    static int const kThreadAccessesPerRow = const_max(1, (Shape::kN + kThreadCount - 1) / kThreadCount);
+
+    /// Shape of the shared memory allocation for the reduction: kThreadRows rows of Shape::kN columns
+    using StorageShape = MatrixShape<
+      kThreadRows,
+      Shape::kN
+    >;
+
+    /// Debug printing (the body is compiled out by the #if 0 guard)
+    CUTLASS_DEVICE
+    static void print() {
+#if 0
+      printf("ReductionDetail {\n");
+      printf(
+        "  kElementsPerAccess:%d\nkColumnsPerThread: %d\nkRowsPerThread: %d\n,kThreadCount: %d\nkThreadsPerRow: %d\n"
+        "kThreadRows: %d\nThreadAccessesPerRow: %d\nStorageShape: %d x %d (count: %d)\n",
+        kElementsPerAccess,
+        kColumnsPerThread,
+        kRowsPerThread,
+        kThreadCount,
+        kThreadsPerRow,
+        kThreadRows,
+        kThreadAccessesPerRow,
+        StorageShape::kRow,
+        StorageShape::kColumn,
+        StorageShape::kCount
+      );
+      printf("};\n");
+#endif
+    }
+  };
+
+  /// Shared storage structure (shadows base) with additional SMEM buffer for reduction
+  struct SharedStorage {
+    union {  // base and reduction are union members, so they occupy the same storage
+      BaseSharedStorage base;                                                                   ///< Storage used by the base epilogue
+      AlignedArray<ElementAccumulator, ReductionDetail::StorageShape::kCount, 16> reduction;    ///< Shared storage for reduction
+    };
+
+    CUTLASS_HOST_DEVICE
+    SharedStorage() { }
+  };
+
+public:
+
+
+  static_assert(SharedLoadIterator::Fragment::kElements == OutputTileIterator::Fragment::kElements,
+    "Mismatch between shared load iterator and output tile iterator.");
+
+  static_assert(OutputTileIterator::kElementsPerAccess, "OutputTileIterator::kElementsPerAccess must not be zero.");
+
+  static_assert(!(OutputTileIterator::Fragment::kElements % OutputTileIterator::kElementsPerAccess), 
+    "Divisibility");
+
+private:
+
+  /// Loads fragment from shared memory aligned with output tensor
+  SharedLoadIterator shared_load_iterator_;
+
+  /// Shared memory pointer for reduction
+  ElementAccumulator *reduction_ptr_;
+
+  /// Thread index within the threadblock
+  int thread_idx_;
+
+public:
+
+  /// Constructor: binds the base epilogue and shared load iterator to shared_storage.base, and reduction_ptr_ to shared_storage.reduction
+  CUTLASS_DEVICE
+  EpilogueWithReduction(
+    SharedStorage &shared_storage,                    ///< Shared storage object
+    int thread_idx,                                   ///< ID of a thread within the threadblock
+    int warp_idx,                                     ///< ID of warp within threadblock
+    int lane_idx                                      ///< ID of thread within warp
+  ):
+    Base(shared_storage.base, thread_idx, warp_idx, lane_idx),
+    shared_load_iterator_(shared_storage.base.reference(), thread_idx),
+    reduction_ptr_(shared_storage.reduction.data()),
+    thread_idx_(thread_idx)
+  {
+
+  }
+
+  /// Streams the result to global memory, then runs the threadblock reduction if the output operator participates in it
+  CUTLASS_DEVICE
+  void operator()(
+    OutputOp const &output_op,                        ///< Output operator
+    ElementVector * reduction_output_ptr,             ///< Reduction output vector
+    OutputTileIterator destination_iterator,          ///< Tile iterator for destination
+    AccumulatorTile const &accumulators,              ///< Complete warp-level accumulator tile
+    OutputTileIterator source_iterator,               ///< Tile iterator for source accumulator matrix
+    TensorTileIterator tensor_iterator,               ///< Threadblock tile iterator for additional tensor operand
+    MatrixCoord const &problem_size =                 ///< Problem size needed to guard against out-of-bounds accesses
+        MatrixCoord(Shape::kM, Shape::kN),
+    MatrixCoord const &threadblock_offset =           ///< Threadblock's initial offset within the problem size space
+        MatrixCoord()) {
+    // Per-thread partial reduction, cleared before the accumulator tile is processed.
+    ReductionFragment reduction_fragment;
+    reduction_fragment.clear();
+
+    if (!output_op.is_source_needed()) {
+      compute_source_not_needed_(
+        output_op, 
+        reduction_fragment, 
+        destination_iterator, 
+        accumulators,
+        tensor_iterator,
+        problem_size,
+        threadblock_offset);
+    }
+    else {
+      compute_source_needed_(
+        output_op, 
+        reduction_fragment, 
+        destination_iterator, 
+        accumulators, 
+        source_iterator,
+        tensor_iterator,
+        problem_size,
+        threadblock_offset);
+    }
+    // Combine the per-thread partials across the threadblock and write reduction_output_ptr.
+    if (output_op.participates_in_reduction()) {
+      reduction_(problem_size, threadblock_offset, reduction_output_ptr, reduction_fragment);
+    }
+  }
+
+private:
+
+  /// Perform the reduction: stage per-thread partials in SMEM, then reduce each column over all rows of the SMEM tile
+  CUTLASS_DEVICE
+  void reduction_(
+    MatrixCoord const &problem_size,                  ///< Problem size needed to guard against out-of-bounds accesses
+    MatrixCoord const &threadblock_offset,            ///< Threadblock's initial offset within the problem size space
+    ElementVector * reduction_output_ptr,             ///< Reduction output vector
+    ReductionFragment const & reduction_fragment) {
+
+    //
+    // Store the partially reduced value to SMEM
+    //
+
+    // Guard against uses of the existing SMEM tile
+    __syncthreads();
+    
+    using AccessType = AlignedArray<ElementAccumulator, ThreadMap::kElementsPerAccess>;
+
+    //
+    // Determine a compacted thread arrangement to store to SMEM.
+    //
+    int const kThreadsPerRow = Shape::kN / (ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess);
+
+    MatrixCoord thread_offset(
+      thread_idx_ / kThreadsPerRow, 
+      (thread_idx_ % kThreadsPerRow) * ThreadMap::kElementsPerAccess);
+   
+    //
+    // Each thread stores its fragment to SMEM; rows of the tile are Shape::kN elements apart
+    //
+
+    AccessType *aligned_reduction_ptr = reinterpret_cast<AccessType *>(
+      &reduction_ptr_[thread_offset.row() * Shape::kN + thread_offset.column()]);
+
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&reduction_fragment);
+    
+    CUTLASS_PRAGMA_UNROLL
+    for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
+      int col_idx = column * ThreadMap::Delta::kColumn / ThreadMap::kElementsPerAccess;
+
+      aligned_reduction_ptr[col_idx] = frag_ptr[column];
+    }
+
+    __syncthreads();
+
+    //
+    // Now, threads are assigned several columns of the output. They fetch over all rows from
+    // the compacted SMEM tile and perform a reduction.
+    //
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < ReductionDetail::kThreadAccessesPerRow; ++j) {
+      int column_idx = thread_idx_ + j * ReductionDetail::kThreadCount;
+      // Successive columns of one thread are kThreadCount apart.
+      ReductionOp reduction_op;
+      ElementAccumulator reduction_element = ElementAccumulator();
+
+      int output_column_idx = threadblock_offset.column() + column_idx;
+      // Skip columns outside the threadblock tile or outside the problem.
+      if (column_idx < Shape::kN && output_column_idx < problem_size.column()) {
+        // Fold this column over all kThreadRows rows of the SMEM tile.
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ReductionDetail::kThreadRows; ++row) {
+          if (row) {
+            auto frag = reduction_ptr_[row * Shape::kN + column_idx];
+
+            reduction_element = reduction_op(reduction_element, frag);
+          }
+          else {
+            // Row 0 seeds the running value; later rows are combined with reduction_op.
+            reduction_element = reduction_ptr_[column_idx];
+          }
+        }
+
+        // Store the reduced column; the output is indexed by the tile-local column_idx
+        reduction_output_ptr[column_idx] = ElementVector(reduction_element);
+      }
+    }
+  }
+
+  template<class Seq>
+  struct acc2smem;  // Primary template is only declared; the index_sequence specialization follows.
+  /// Maps a runtime position onto helper<Advance>, whose Advance is a compile-time constant.
+  template <size_t... Seq>
+  struct acc2smem<cutlass::index_sequence<Seq...>> {
+    template<int Advance>
+    CUTLASS_DEVICE
+    static void helper(AccumulatorFragmentIterator accum_fragment_iterator,
+                       WarpTileIterator &warp_tile_iterator) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < Advance; i++) {
+        ++accum_fragment_iterator;
+      }
+      // The iterator was passed by value, so only this local copy has been advanced.
+      typename AccumulatorFragmentIterator::Fragment accum_fragment;
+      accum_fragment_iterator.load(accum_fragment);
+      warp_tile_iterator.store(accum_fragment);
+    }
+    /// Calls helper<Seq> only for the Seq equal to pos; && short-circuits every other pack element.
+    CUTLASS_DEVICE
+    static void push(size_t pos,
+                     AccumulatorFragmentIterator const &iterator_begin,
+                     WarpTileIterator &warp_tile_iterator) {
+      int dummy[] = {(pos == Seq) && (helper<Seq>(iterator_begin, warp_tile_iterator), 0)...};
+    }
+  };
+
+  /// Streams the result to global memory without reading a source tensor
+  CUTLASS_DEVICE
+  void compute_source_not_needed_(
+    OutputOp const &output_op,                        ///< Output operator
+    ReductionFragment &reduction_fragment,            ///< Fragment containing the accumulated partial reduction over columns
+    OutputTileIterator destination_iterator,          ///< Tile iterator for destination
+    AccumulatorTile const &accumulators,              ///< Complete warp-level accumulator tile 
+    TensorTileIterator tensor_iterator,               ///< Threadblock tile iterator for additional tensor operand
+    MatrixCoord const &problem_size,                  ///< Problem size needed to guard against out-of-bounds accesses
+    MatrixCoord const &threadblock_offset             ///< Threadblock's initial offset within the problem size space
+    ) { 
+
+    //
+    // Fragment of the additional tensor operand, and iterator over warp-level accumulator fragment
+    //
+
+    typename TensorTileIterator::Fragment tensor_fragment;
+    tensor_fragment.clear();
+
+    AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
+
+    //
+    // Iterate over accumulator tile, one output-tile iteration per pass
+    //
+
+    #pragma unroll(IterationsUnroll ? OutputTileIterator::kIterations : 1)
+    for (int iter = 0; iter < OutputTileIterator::kIterations; ++iter) {
+
+      //
+      // Load the additional tensor fragment, then store the accumulator fragment selected by iter
+      //
+
+      tensor_iterator.load(tensor_fragment);
+      ++tensor_iterator;
+      
+      __syncthreads();
+
+      acc2smem<cutlass::make_index_sequence<OutputTileIterator::kIterations>>::push(
+          iter, accum_fragment_iterator, this->warp_tile_iterator_);
+
+      __syncthreads();
+
+      //
+      // Load fragments from shared memory
+      //
+
+      typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];
+
+      shared_load_iterator_.load(aligned_accum_fragment[0]);
+
+      //
+      // If the number of k-slices is > 1 - perform a reduction amongst the k-slices
+      //
+      if (kPartitionsK > 1)
+      {
+        plus <typename SharedLoadIterator::Fragment> add_fragments;
+        const int tile_row_offset = Base::SharedStorage::StorageShape::kRow / PartitionsK;
+
+        CUTLASS_PRAGMA_UNROLL
+        for ( int i = 1; i < kPartitionsK; ++i) {
+          shared_load_iterator_.add_tile_offset({tile_row_offset , 0});
+          shared_load_iterator_.load(aligned_accum_fragment[i]);
+          aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
+        }
+        // Undo the (kPartitionsK - 1) tile offsets applied by the loop above.
+        shared_load_iterator_.add_tile_offset({-1 * (kPartitionsK-1) * tile_row_offset, 0});
+      }
+
+      //
+      // Compute the output result in the compute element type
+      //
+     
+      FragmentCompute compute_fragment;
+
+      apply_output_operator_source_not_needed_(
+        reduction_fragment,
+        compute_fragment, 
+        output_op, 
+        aligned_accum_fragment[0],
+        tensor_fragment,
+        destination_iterator);
+
+      //
+      // Convert the compute fragment to the output element type and store the final result
+      //
+      
+      NumericArrayConverter<ElementOutput, ElementCompute, FragmentCompute::kElements> converter;
+
+      typename OutputTileIterator::Fragment output_fragment = converter(compute_fragment);
+
+      destination_iterator.store(output_fragment);
+      ++destination_iterator;
+    }
+  }
+
+  
+  /// Streams the result to global memory, combining accumulators with the source and the additional tensor operand
+  CUTLASS_DEVICE
+  void compute_source_needed_(
+    OutputOp const &output_op,                    ///< Output operator
+    ReductionFragment &reduction_fragment,        ///< Fragment containing the accumulated partial reduction over columns
+    OutputTileIterator destination_iterator,      ///< Tile iterator for destination
+    AccumulatorTile const &accumulators,          ///< Complete warp-level accumulator tile
+    OutputTileIterator source_iterator,           ///< Tile iterator for the source operand (loaded once per iteration)
+    TensorTileIterator tensor_iterator,            ///< Threadblock tile iterator for additional tensor operand
+    MatrixCoord const &problem_size,                  ///< Problem size needed to guard against out-of-bounds accesses
+    MatrixCoord const &threadblock_offset             ///< Threadblock's initial offset within the problem size space
+    ) { 
+    
+    typename OutputTileIterator::Fragment source_fragment;
+    source_fragment.clear();
+
+    typename TensorTileIterator::Fragment tensor_fragment;
+    tensor_fragment.clear();
+
+    //
+    // Iterator over warp-level accumulator fragment
+    //
+
+    AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
+
+    //
+    // Iterate over accumulator tile
+    // 
+
+    #pragma unroll(IterationsUnroll ? OutputTileIterator::kIterations : 1)
+    for (int iter = 0; iter < OutputTileIterator::kIterations; ++iter) {
+
+      //
+      // Load the source and the additional tensor operand for this iteration, then advance both iterators
+      //
+
+      source_fragment.clear();
+      source_iterator.load(source_fragment);
+      ++source_iterator;
+
+      tensor_iterator.load(tensor_fragment);
+      ++tensor_iterator;
+
+      //
+      // Convert and store fragment (accumulators for `iter` go through warp_tile_iterator_, between two barriers)
+      //
+      
+      __syncthreads();
+
+      acc2smem<cutlass::make_index_sequence<OutputTileIterator::kIterations>>::push(
+          iter, accum_fragment_iterator, this->warp_tile_iterator_);
+
+      __syncthreads();
+
+      //
+      // Load fragments from shared memory
+      //
+
+      typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];
+
+      shared_load_iterator_.load(aligned_accum_fragment[0]);
+
+      // If the number of k-slices is > 1 - perform a reduction amongst the k-slices
+      if (kPartitionsK > 1)
+      {
+        plus <typename SharedLoadIterator::Fragment> add_fragments;
+        const int tile_row_offset = Base::SharedStorage::StorageShape::kRow / PartitionsK;
+
+        CUTLASS_PRAGMA_UNROLL
+        for ( int i = 1; i < kPartitionsK; ++i) {
+          shared_load_iterator_.add_tile_offset({tile_row_offset , 0});
+          shared_load_iterator_.load(aligned_accum_fragment[i]);
+          aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
+        }
+        // Undo the (kPartitionsK - 1) forward tile offsets so the next iteration starts at the first k-slice
+        shared_load_iterator_.add_tile_offset({-1 * (kPartitionsK-1) * tile_row_offset, 0});
+      }
+
+      //
+      // Compute the output result and fold it into reduction_fragment
+      //
+     
+      FragmentCompute compute_fragment;
+
+      apply_output_operator_(
+        reduction_fragment, 
+        compute_fragment, 
+        output_op, 
+        aligned_accum_fragment[0], 
+        source_fragment,
+        tensor_fragment,
+        destination_iterator);
+
+      //
+      // Convert and store the final result
+      //
+
+      NumericArrayConverter<ElementOutput, ElementCompute, FragmentCompute::kElements> converter;
+
+      typename OutputTileIterator::Fragment output_fragment = converter(compute_fragment);
+
+      destination_iterator.store(output_fragment);      
+      ++destination_iterator;
+    }
+  }
+
+  /// Helper to invoke the output functor over each vector of output, then reduce the results per column
+  CUTLASS_DEVICE
+  void apply_output_operator_(
+    ReductionFragment &reduction_fragment,        ///< In/out: per-column partial reduction, updated in place
+    FragmentCompute &compute_fragment,            ///< Out: results of the output operator
+    OutputOp const &output_op,                    ///< Output operator
+    typename SharedLoadIterator::Fragment const &aligned_accum_fragment,   ///< Accumulators loaded from shared memory
+    typename OutputTileIterator::Fragment const &source_fragment,          ///< Source operand fragment
+    typename TensorTileIterator::Fragment const &tensor_fragment,          ///< Additional tensor operand fragment
+    OutputTileIterator const & destination_iterator) {                     ///< Supplies the mask and row extent for guards
+      
+    ComputeAccessType *compute_frag_ptr = 
+      reinterpret_cast<ComputeAccessType *>(&compute_fragment);
+
+    AccumulatorAccessType const *accum_frag_ptr = 
+      reinterpret_cast<AccumulatorAccessType const *>(&aligned_accum_fragment);
+
+    OutputAccessType const *source_frag_ptr = 
+      reinterpret_cast<OutputAccessType const *>(&source_fragment);
+
+    TensorAccessType const *tensor_frag_ptr =
+      reinterpret_cast<TensorAccessType const *>(&tensor_fragment);
+
+    int const kOutputOpIterations = 
+      OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kOutputOpIterations; ++i) {
+
+      // Call the output operator
+      compute_frag_ptr[i] = output_op(accum_frag_ptr[i], source_frag_ptr[i], tensor_frag_ptr[i]);
+    }
+
+    //
+    // Partial reduction over each column
+    //
+
+    ReductionOp reduction_op;
+
+    typename OutputTileIterator::Mask mask;
+    destination_iterator.get_mask(mask);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int column = 0; column < ReductionDetail::kColumnsPerThread; ++column) {
+
+      int column_vector_idx = column / ThreadMap::kElementsPerAccess;
+      bool column_guard = mask.predicates[column_vector_idx];
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int row = 0; row < ReductionDetail::kRowsPerThread; ++row) {
+        // `fetch` decides whether this element takes part in the reduction
+        bool fetch;
+        if (ReductionDetail::kOobCheck) {  // split the flat row index into row / group / cluster iteration indices
+          int row_idx = (row % ThreadMap::Iterations::kRow);
+          int residual = (row / ThreadMap::Iterations::kRow);
+
+          int group_idx = (residual % ThreadMap::Iterations::kGroup);
+          residual = (residual / ThreadMap::Iterations::kGroup);
+
+          int cluster_idx = (residual % ThreadMap::Iterations::kCluster);
+
+          int row_offset = row_idx * ThreadMap::Delta::kRow 
+            + group_idx * ThreadMap::Delta::kGroup 
+            + cluster_idx * ThreadMap::Delta::kCluster;
+
+          int output_row = destination_iterator.thread_start_row() + row_offset;
+
+          fetch = (output_row < destination_iterator.extent_row() && column_guard);
+        }
+        else {
+          fetch = true;
+        }
+        // Elements that are not fetched contribute a value-initialized ElementCompute
+        ElementCompute value = ElementCompute();
+        if (fetch) {
+          value = compute_fragment[row * ReductionDetail::kColumnsPerThread + column];
+        }
+
+        reduction_fragment[column] = reduction_op(
+          reduction_fragment[column], 
+          value);
+      }
+    }
+  }
+
+  /// Helper to invoke the output functor (without a source operand) over each vector of output, then reduce per column
+  CUTLASS_DEVICE
+  void apply_output_operator_source_not_needed_(
+    ReductionFragment &reduction_fragment,        ///< In/out: per-column partial reduction, updated in place
+    FragmentCompute &compute_fragment,            ///< Out: results of the output operator
+    OutputOp const &output_op,                    ///< Output operator
+    typename SharedLoadIterator::Fragment const &aligned_accum_fragment,   ///< Accumulators loaded from shared memory
+    typename TensorTileIterator::Fragment const &tensor_fragment,          ///< Additional tensor operand fragment
+    OutputTileIterator const & destination_iterator                        ///< Supplies the mask and row extent for guards
+  ) {
+    
+    ComputeAccessType *compute_frag_ptr = 
+      reinterpret_cast<ComputeAccessType *>(&compute_fragment);
+
+    AccumulatorAccessType const *accum_frag_ptr = 
+      reinterpret_cast<AccumulatorAccessType const *>(&aligned_accum_fragment);
+
+    TensorAccessType const *tensor_frag_ptr =
+      reinterpret_cast<TensorAccessType const *>(&tensor_fragment);
+
+    int const kOutputOpIterations = 
+      OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kOutputOpIterations; ++i) {
+
+      // Call the output operator
+      compute_frag_ptr[i] = output_op(accum_frag_ptr[i], tensor_frag_ptr[i]);
+    }
+
+    //
+    // Partial reduction over each column
+    //
+
+    ReductionOp reduction_op;
+
+    typename OutputTileIterator::Mask mask;
+    destination_iterator.get_mask(mask);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int column = 0; column < ReductionDetail::kColumnsPerThread; ++column) {
+
+      int column_vector_idx = column / ThreadMap::kElementsPerAccess;
+      bool column_guard = mask.predicates[column_vector_idx];
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int row = 0; row < ReductionDetail::kRowsPerThread; ++row) {
+        // `fetch` decides whether this element takes part in the reduction
+        bool fetch;
+        if (ReductionDetail::kOobCheck) {  // split the flat row index into row / group / cluster iteration indices
+          int row_idx = (row % ThreadMap::Iterations::kRow);
+          int residual = (row / ThreadMap::Iterations::kRow);
+
+          int group_idx = (residual % ThreadMap::Iterations::kGroup);
+          residual = (residual / ThreadMap::Iterations::kGroup);
+
+          int cluster_idx = (residual % ThreadMap::Iterations::kCluster);
+
+          int row_offset = row_idx * ThreadMap::Delta::kRow 
+            + group_idx * ThreadMap::Delta::kGroup 
+            + cluster_idx * ThreadMap::Delta::kCluster;
+
+          int output_row = destination_iterator.thread_start_row() + row_offset;
+
+          fetch = (output_row < destination_iterator.extent_row() && column_guard);
+        }
+        else {
+          fetch = true;
+        }
+        // Elements that are not fetched contribute a value-initialized ElementCompute
+        ElementCompute value = ElementCompute();
+        if (fetch) {
+          value = compute_fragment[row * ReductionDetail::kColumnsPerThread + column];
+        }
+
+        reduction_fragment[column] = reduction_op(
+          reduction_fragment[column], 
+          value);
+      }
+    }
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_visitor.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_visitor.h
new file mode 100755
index 000000000..6ab9cf069
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_visitor.h
@@ -0,0 +1,409 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Generic epilogue for implementing certain kinds of fused epilogue behavior.
+*/
+
+#pragma once
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/semaphore.h"
+#include "cutlass/epilogue/threadblock/epilogue_base.h"
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+class EpilogueFusedVisitorConcept {
+public:
+  // Skeleton of the visitor interface used by EpilogueWithVisitor; every hook in this class has an empty body.
+  static int const kIterations = 1;             ///< Number of epilogue steps (EpilogueWithVisitor loops iter_idx over this)
+  static int const kElementsPerAccess = 4;      ///< Elements per accumulator access vector
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+  using AccumulatorFragment = Array<ElementAccumulator, kElementsPerAccess>;
+
+  /// Arguments structure
+  struct Arguments {  };
+
+  /// Params structure
+  struct Params {
+
+    Params() { }
+    Params(Arguments const &args) { }
+  };
+
+  /// Shared storage
+  struct SharedStorage { };
+
+public:
+  /// Constructor; the arguments are accepted but unused in this skeleton
+  CUTLASS_DEVICE
+  EpilogueFusedVisitorConcept(
+    Params const &params,                                         ///< Parameters routed to the epilogue
+    SharedStorage &shared_storage,                                ///< Shared storage needed by the functors here
+    MatrixCoord const &problem_size,                              ///< Problem size of the output
+    int thread_idx,                                               ///< Thread index within the threadblock
+    int warp_idx,                                                 ///< Warp index within the threadblock
+    int lane_idx,                                                 ///< Lane index within the warp
+    MatrixCoord const &threadblock_offset = MatrixCoord(0, 0)) {  ///< Coordinate
+
+  }
+
+  /// Helper to indicate split-K behavior
+  CUTLASS_DEVICE
+  void set_k_partition(
+    int split_k_index,                                            ///< Index of this threadblock within split-K partitioned scheme
+    int split_k_slices) {                                         ///< Total number of split-K slices
+
+  }
+
+  /// Called to set the batch index
+  CUTLASS_DEVICE
+  void set_batch_index(int batch_idx) {
+
+  }
+
+  /// Called at the start of the epilogue just before iterating over accumulator slices
+  CUTLASS_DEVICE
+  void begin_epilogue() {
+
+  }
+
+  /// Called at the start of one step before starting accumulator exchange
+  CUTLASS_DEVICE
+  void begin_step(int step_idx) {
+
+  }
+
+  /// Called at the start of a row
+  CUTLASS_DEVICE
+  void begin_row(int row_idx) {
+
+  }
+
+  /// Called after accumulators have been exchanged for each accumulator vector
+  CUTLASS_DEVICE
+  void visit(
+    int iter_idx,                                                 ///< Epilogue step index
+    int row_idx,                                                  ///< Row of the access vector within the step's fragment
+    int column_idx,                                               ///< Column of the access vector within the row
+    int frag_idx,                                                 ///< Flat index of the access vector within the fragment
+    AccumulatorFragment const &accum) {                           ///< Accumulator values of this access vector
+
+  }
+
+  /// Called at the end of a row
+  CUTLASS_DEVICE
+  void end_row(int row_idx) {
+
+  }
+
+  /// Called after all accumulator elements have been visited
+  CUTLASS_DEVICE
+  void end_step(int step_idx) {
+
+  }
+
+  /// Called after all steps have been completed
+  CUTLASS_DEVICE
+  void end_epilogue() {
+
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Epilogue operator that exchanges accumulators through shared memory and passes each access vector to a visitor
+template <
+  typename Visitor_,                        ///< Functor containing fused operations (satisfies EpilogueFusedVisitorConcept)
+  typename Shape_,                          ///< Shape of threadblock tile (concept: GemmShape)
+  typename WarpMmaOperator_,                ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp)
+  int PartitionsK,                          ///< Number of partitions of the K dimension
+  typename AccumulatorFragmentIterator_,    ///< Fragment iterator selecting accumulators
+  typename WarpTileIterator_,               ///< Warp-scoped tile iterator writing accumulators to SMEM
+  typename SharedLoadIterator_,             ///< Threadblock-scoped tile iterator loading from SMEM
+  typename Padding_,                        ///< Padding added to SMEM allocation to avoid bank conflicts (concept: MatrixShape)
+  int FragmentsPerPartition = 1,            ///< Used to coarsen the epilogue granularity
+  int IterationsUnroll =                    ///< Used to reduce binary size when epilogue op is large (default is always true)
+    (true || !IsEpilogueFunctorHeavy<Visitor_>::value)
+>
+class EpilogueWithVisitor :
+  public EpilogueBase<
+    Shape_,
+    typename WarpMmaOperator_::Shape,
+    PartitionsK,
+    AccumulatorFragmentIterator_,
+    WarpTileIterator_,
+    Padding_,
+    FragmentsPerPartition> {
+
+public:
+
+  using Visitor = Visitor_;
+
+  using Base = EpilogueBase<
+    Shape_,
+    typename WarpMmaOperator_::Shape,
+    PartitionsK,
+    AccumulatorFragmentIterator_,
+    WarpTileIterator_,
+    Padding_,
+    FragmentsPerPartition>;
+
+  using Shape = Shape_;
+  using WarpMmaOperator = WarpMmaOperator_;
+  static int const kPartitionsK = PartitionsK;
+
+  using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
+  using WarpTileIterator = WarpTileIterator_;
+  using SharedLoadIterator = SharedLoadIterator_;
+  using Padding = Padding_;
+
+  using Layout = layout::RowMajor;
+  using LongIndex = typename Layout::LongIndex;
+
+  /// The complete warp-level accumulator tile
+  using AccumulatorTile = typename Base::AccumulatorTile;
+
+  /// Accumulator element
+  using ElementAccumulator = typename WarpTileIterator::Element;
+
+  /// Output access size
+  static int const kElementsPerAccess = Visitor::kElementsPerAccess;
+
+  /// Tensor reference to sync tensor
+  using SyncTensorRef = typename cutlass::TensorRef<int, cutlass::layout::PackedVectorLayout>;
+
+  /// Array type used by output functor
+  using AccumulatorAccessType = Array<
+    typename WarpTileIterator::Element, kElementsPerAccess>;
+
+  /// Number of warps
+  using WarpCount = typename Base::WarpCount;
+  /// Shared-memory tile count and the element offset between consecutive tiles
+  static int constexpr kSmemTiles = Base::kFragmentsPerIteration > 1 ? Base::kFragmentsPerIteration : kPartitionsK;
+  static int constexpr kSmemPointerOffset = Base::SharedStorage::StorageShape::kCount / kSmemTiles;
+
+  using SharedStorage = typename Base::SharedStorage;
+
+private:
+
+  /// Loads fragment from shared memory aligned with output tensor
+  SharedLoadIterator shared_load_iterator_;
+
+public:
+
+  /// Constructor
+  CUTLASS_DEVICE
+  EpilogueWithVisitor(
+    SharedStorage &shared_storage,    ///< Shared storage object
+    int thread_idx,                   ///< ID of a thread within the threadblock
+    int warp_idx,                     ///< ID of warp within threadblock
+    int lane_idx                      ///< Id of thread within warp
+  ):
+    Base(shared_storage, thread_idx, warp_idx, lane_idx),
+    shared_load_iterator_(shared_storage.reference(), thread_idx)
+  {
+
+  }
+
+  /// Runs the epilogue: for each step, exchanges accumulators through shared memory and calls the visitor hooks
+  CUTLASS_DEVICE
+  void operator()(
+    Visitor & visitor,                             ///< Visitor whose hooks are invoked for every step, row and access vector
+    AccumulatorTile const &accumulators) {         ///< Complete warp-level accumulator tile
+
+    visitor.begin_epilogue();
+
+    //
+    // Iterator over warp-level accumulator fragment
+    //
+
+    AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
+
+    //
+    // Iterate over accumulator tile
+    //
+
+    #pragma unroll(IterationsUnroll ? Visitor::kIterations : 1)
+    for (int iter_idx = 0; iter_idx < Visitor::kIterations; ++iter_idx) {
+
+      //
+      // Notify the visitor that a new step begins
+      //
+
+      visitor.begin_step(iter_idx);
+
+      //
+      // Store this step's accumulator fragment to shared memory (barriers on both sides of the store)
+      //
+
+      __syncthreads();
+
+      acc2smem_source_needed<cutlass::make_index_sequence<Visitor::kIterations>>::push(
+          iter_idx, accum_fragment_iterator, this->warp_tile_iterator_);
+
+      __syncthreads();
+
+      //
+      // Load fragments from shared memory
+      //
+
+      typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];
+
+      shared_load_iterator_.load(aligned_accum_fragment[0]);
+
+      // If the number of k-slices is > 1 - perform a reduction amongst the k-slices
+      if (kPartitionsK > 1) {
+
+        plus <typename SharedLoadIterator::Fragment> add_fragments;
+
+        CUTLASS_PRAGMA_UNROLL
+        for ( int i = 1; i < kPartitionsK; ++i) {
+          shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
+          shared_load_iterator_.load(aligned_accum_fragment[i]);
+          aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
+        }
+        // Undo the (kPartitionsK - 1) forward pointer offsets applied by the loop above
+        shared_load_iterator_.add_pointer_offset((1 - kPartitionsK) * kSmemPointerOffset);
+      }
+
+      //
+      // Iterate over output fragments
+      //
+
+      AccumulatorAccessType const *accum_frag_ptr =
+        reinterpret_cast<AccumulatorAccessType const *>(&aligned_accum_fragment[0]);
+      // NOTE(review): this count is not divided by kPartitionsK - confirm the loop bound when kPartitionsK > 1
+      int const kAccumulatorFragmentCount = AccumulatorTile::kElements / (Visitor::kIterations * AccumulatorAccessType::kElements);
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int idx = 0; idx < kAccumulatorFragmentCount; ++idx) {
+
+        int row_idx = idx / SharedLoadIterator::ThreadMap::Iterations::kColumn;
+        int col_idx = idx % SharedLoadIterator::ThreadMap::Iterations::kColumn;
+
+        // Start a new row of the output fragment
+        if (!col_idx) {
+          visitor.begin_row(row_idx);
+        }
+
+        visitor.visit(
+          iter_idx,
+          row_idx,
+          col_idx,
+          idx,
+          accum_frag_ptr[idx]
+        );
+
+        // End the row of the output fragment
+        if (col_idx + 1 == SharedLoadIterator::ThreadMap::Iterations::kColumn) {
+          visitor.end_row(row_idx);
+        }
+      }
+
+      //
+      // Conclude the step
+      //
+
+      visitor.end_step(iter_idx);
+    }
+
+    visitor.end_epilogue();
+  }
+
+private:
+
+  /// Maps a runtime step index onto a compile-time iterator advance count, one helper instantiation per index
+  template<class Seq>
+  struct acc2smem_source_needed;
+
+  template <size_t... Seq>
+  struct acc2smem_source_needed<cutlass::index_sequence<Seq...>> {
+    template<int Advance>
+    CUTLASS_DEVICE
+    static void helper(AccumulatorFragmentIterator accum_fragment_iterator,
+                       WarpTileIterator &warp_tile_iterator) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < Advance; i++) {
+        ++accum_fragment_iterator;  // advances a local copy: the iterator is passed by value
+      }
+
+      typename AccumulatorFragmentIterator::Fragment accum_fragment;
+      accum_fragment_iterator.load(accum_fragment);
+      warp_tile_iterator.store(accum_fragment);
+    }
+    /// Calls helper<Seq> only for the Seq equal to pos; the && short-circuits for every other index
+    CUTLASS_DEVICE
+    static void push(size_t pos,
+                     AccumulatorFragmentIterator const &iterator_begin,
+                     WarpTileIterator &warp_tile_iterator) {
+      int dummy[] = {(pos == Seq) && (helper<Seq>(iterator_begin, warp_tile_iterator), 0)...};
+    }
+  };
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Helper to create an EpilogueWithVisitor from an existing epilogue by reusing that epilogue's member types and constants
+template <typename Visitor_, typename Existing_, bool IterationsUnroll = true>
+struct EpilogueWithVisitorFromExistingEpilogue  {
+  // Every argument except Visitor_ and IterationsUnroll is taken from Existing_
+  using Epilogue = EpilogueWithVisitor<
+    Visitor_,
+    typename Existing_::Shape,
+    typename Existing_::WarpMmaOperator,
+    Existing_::kPartitionsK,
+    typename Existing_::AccumulatorFragmentIterator,
+    typename Existing_::WarpTileIterator,
+    typename Existing_::SharedLoadIterator,
+    typename Existing_::Padding,
+    Existing_::kFragmentsPerIteration,
+    IterationsUnroll
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_visitor_callbacks.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_visitor_callbacks.h
new file mode 100755
index 000000000..259f0729c
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_visitor_callbacks.h
@@ -0,0 +1,504 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+ /*! \file
+  \brief Functor performing elementwise operations used by epilogues.
+*/
+
+#pragma once
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "cutlass/epilogue/threadblock/epilogue_base.h"
+
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+namespace detail {
+
+struct EVT2xBase { };  // empty tag type; EpilogueWithVisitorCallbacks lists it as a public base
+// True when T derives from EVT2xBase
+template <class T>
+static constexpr bool is_2x_evt_v = platform::is_base_of<EVT2xBase, T>::value;
+
+} // namespace detail
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Epilogue operator
+template <
+  typename DefaultEpilogue,                 ///< Default Epilogue Descriptor
+  typename FusionCallbacks_,                ///< The called fusion callbacks
+  int Stages = 2,                           ///< Software pipeline stages for epilogue
+  int IterationsUnroll = true               ///< Used to reduce binary size when epilogue op is large
+>
+class EpilogueWithVisitorCallbacks :
+  public EpilogueBase<
+    typename DefaultEpilogue::Shape,
+    typename DefaultEpilogue::WarpMmaOperator::Shape,
+    DefaultEpilogue::kPartitionsK,
+    typename DefaultEpilogue::AccumulatorFragmentIterator,
+    typename DefaultEpilogue::WarpTileIterator,
+    typename DefaultEpilogue::Padding,
+    DefaultEpilogue::kFragmentsPerIteration>,
+  public EpilogueBaseStreamK<
+    typename DefaultEpilogue::Shape,
+    DefaultEpilogue::kPartitionsK,
+    typename DefaultEpilogue::WarpMmaOperator,
+    typename DefaultEpilogue::AccumulatorFragmentIterator>,
+  public detail::EVT2xBase
+   {
+
+public:
+
+  static_assert(Stages <= 2, "Sm80 EVT only support upto 2 Stages.");
+
+  // Whether the epilogue is pipelined
+  static bool constexpr Pipelined = Stages > 1;
+
+  using FusionCallbacks = FusionCallbacks_;
+
+  using OutputTileIterator = typename DefaultEpilogue::OutputTileIterator;
+  // Number of epilogue iterations. 
+  // Each iteration processes a 8xThreadblockTile::kN output tile
+  static const int kIterations = OutputTileIterator::kIterations;
+
+  using Base = EpilogueBase<
+    typename DefaultEpilogue::Shape,
+    typename DefaultEpilogue::WarpMmaOperator::Shape,
+    DefaultEpilogue::kPartitionsK,
+    typename DefaultEpilogue::AccumulatorFragmentIterator,
+    typename DefaultEpilogue::WarpTileIterator,
+    typename DefaultEpilogue::Padding,
+    DefaultEpilogue::kFragmentsPerIteration>;
+  
+  using BaseStreamK = EpilogueBaseStreamK<
+    typename DefaultEpilogue::Shape,
+    DefaultEpilogue::kPartitionsK,
+    typename DefaultEpilogue::WarpMmaOperator,
+    typename DefaultEpilogue::AccumulatorFragmentIterator>;
+
+  static int const kPartitionsK = DefaultEpilogue::kPartitionsK;
+
+  using AccumulatorFragmentIterator = typename DefaultEpilogue::AccumulatorFragmentIterator;
+  using WarpTileIterator = typename DefaultEpilogue::WarpTileIterator;
+  using SharedLoadIterator = typename DefaultEpilogue::SharedLoadIterator;
+
+  /// The complete warp-level accumulator tile
+  using AccumulatorTile = typename Base::AccumulatorTile;
+
+  /// Accumulator element
+  using ElementAccumulator = typename WarpTileIterator::Element;
+
+  struct OutputOp{
+    using ElementAccumulator = ElementAccumulator;
+    using Params = typename FusionCallbacks::Arguments;
+  };
+
+  /// Fragment type used by the accumulator tile's fragment iterator
+  using AccumulatorFragment = typename AccumulatorFragmentIterator::Fragment;
+
+  // Output access size
+  static int const kElementsPerAccess = DefaultEpilogue::kElementsPerAccess;
+
+  /// Array type used by output functor
+  using AccumulatorAccessType = Array<
+    typename WarpTileIterator::Element, kElementsPerAccess>;
+
+  static int constexpr kSmemTiles = Base::kFragmentsPerIteration > 1 ? Base::kFragmentsPerIteration : kPartitionsK;
+  static int constexpr kSmemPointerOffset = Base::SharedStorage::StorageShape::kCount / kSmemTiles;
+
+  using Params = typename FusionCallbacks::Params;
+
+  static size_t constexpr kSmemStageOffset = sizeof(Base::SharedStorage) / sizeof(ElementAccumulator);
+  static int constexpr kAccumulatorFragmentCount = AccumulatorTile::kElements / (kIterations * AccumulatorAccessType::kElements) / kPartitionsK;
+
+  struct SharedStorage {
+    typename Base::SharedStorage acc_smem[Stages];
+    typename FusionCallbacks::SharedStorage callback_smem;
+  };
+
+private:
+
+  /// Loads fragment from shared memory aligned with output tensor
+  SharedLoadIterator shared_load_iterator_;
+  FusionCallbacks fusion_callbacks;
+
+public:
+
+  /// Constructor
+  CUTLASS_DEVICE
+  EpilogueWithVisitorCallbacks(
+    const Params &params_callbacks,   ///< Epilogue Visitor params
+    SharedStorage &shared_storage,    ///< Shared storage object
+    int thread_idx,                   ///< ID of a thread within the threadblock
+    int warp_idx,                     ///< ID of warp within threadblock
+    int lane_idx                      ///< Id of thread within warp
+  ):
+    Base(shared_storage.acc_smem[0], thread_idx, warp_idx, lane_idx),
+    BaseStreamK(thread_idx),
+    shared_load_iterator_(shared_storage.acc_smem[0].reference(), thread_idx),
+    fusion_callbacks(params_callbacks, shared_storage.callback_smem)
+  { }
+
+  /// Aggregates the accumulator sets shared by peer blocks in the global workspace,
+  /// performing epilogue computations, writing to output
+  template <class ProblemShape>
+  CUTLASS_DEVICE
+  void reduce(
+      int peer_idx_begin,                                ///< Forwarded to BaseStreamK::reduce
+      int peer_idx_end,                                  ///< Forwarded to BaseStreamK::reduce
+      int reduce_fragment_idx,                           ///< Fragment index for BaseStreamK::reduce; also the step index given to the callbacks
+      void *element_workspace,                           ///< Workspace pointer forwarded to BaseStreamK::reduce
+      cutlass::gemm::GemmCoord threadblock_tile_offset,  ///< Forwarded to get_callbacks
+      ProblemShape problem_shape,                        ///< Forwarded to get_callbacks
+      int thread_idx)                                    ///< Forwarded to get_callbacks
+  {
+    auto callbacks = fusion_callbacks.get_callbacks(
+      threadblock_tile_offset,
+      thread_idx,
+      problem_shape
+    );
+
+    callbacks.begin_epilogue();
+    // Reduce peer accumulator fragments into one fragment
+    AccumulatorFragment accum_fragment;
+    BaseStreamK::reduce(accum_fragment, peer_idx_begin, peer_idx_end, reduce_fragment_idx, element_workspace);
+
+    // Store fragment to shared memory
+    this->warp_tile_iterator_.store(accum_fragment);
+
+    __syncthreads();  // barrier between the store above and the load below
+
+    callbacks.begin_step(reduce_fragment_idx);
+
+    // Load fragment from shared memory
+    typename SharedLoadIterator::Fragment aligned_accum_fragment;
+    shared_load_iterator_.load(aligned_accum_fragment);
+
+    // Add fragments shared by other k partitions
+    if (kPartitionsK > 1)
+    {
+      plus <typename SharedLoadIterator::Fragment> add_fragments;
+
+      CUTLASS_PRAGMA_UNROLL
+      for ( int i = 1; i < kPartitionsK; ++i) {
+        typename SharedLoadIterator::Fragment aligned_addend_fragment;
+        shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
+        shared_load_iterator_.load(aligned_addend_fragment);
+        aligned_accum_fragment = add_fragments(aligned_accum_fragment, aligned_addend_fragment);
+      }
+    }
+    // NOTE(review): unlike operator(), the iterator is not moved back by (1 - kPartitionsK) * kSmemPointerOffset here - confirm reduce() runs once per epilogue object
+    //
+    // Iterate over output fragment
+    //
+
+    AccumulatorAccessType const *accum_frag_ptr =
+      reinterpret_cast<AccumulatorAccessType const*>(&aligned_accum_fragment);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < kAccumulatorFragmentCount; ++idx) {
+      int row_idx = idx / SharedLoadIterator::ThreadMap::Iterations::kColumn;
+      int col_idx = idx % SharedLoadIterator::ThreadMap::Iterations::kColumn;
+
+      // Start a new row of the output fragment
+      if (!col_idx) {
+        callbacks.begin_row(row_idx);
+      }
+
+      callbacks.visit(
+        reduce_fragment_idx,
+        row_idx,
+        col_idx,
+        idx,
+        accum_frag_ptr[idx]
+      );
+
+      // End the row of the output fragment
+      if (col_idx + 1 == SharedLoadIterator::ThreadMap::Iterations::kColumn) {
+        callbacks.end_row(row_idx);
+      }
+    }
+
+    callbacks.end_step(reduce_fragment_idx);
+    callbacks.end_epilogue();
+  }
+
+  /// Runs the epilogue for one accumulator tile: every step stages a fragment through shared storage and passes the reloaded vectors to the fusion callbacks
+  template <class ProblemShape>
+  CUTLASS_DEVICE
+  void operator()(
+    AccumulatorTile const &accumulators,               ///< Accumulator tile to process
+    cutlass::gemm::GemmCoord threadblock_tile_offset,  ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles)
+    ProblemShape problem_shape,                        ///< Forwarded to get_callbacks
+    int thread_idx                                     ///< Forwarded to get_callbacks
+    ) {
+
+    auto callbacks = fusion_callbacks.get_callbacks(
+      threadblock_tile_offset,
+      thread_idx,
+      problem_shape
+    );
+
+    callbacks.begin_epilogue();
+
+    //
+    // Iterator over warp-level accumulator fragment
+    //
+
+    AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
+
+    //
+    // Iterate over accumulator tile
+    //
+
+    if constexpr(Pipelined){
+      __syncthreads();
+
+      //
+      // Pipeline Prologue: store the fragment of step 0, then move the warp iterator by one stage offset
+      //
+      size_t warp_iterator_offset = kSmemStageOffset;
+      size_t smem_iterator_offset = kSmemStageOffset;
+      callbacks.begin_step(0);
+    
+      acc2smem_source_needed<cutlass::make_index_sequence<kIterations>>::push(
+            0, accum_fragment_iterator, this->warp_tile_iterator_);
+      
+      this->warp_tile_iterator_.add_pointer_offset(warp_iterator_offset);
+      warp_iterator_offset = -warp_iterator_offset;  // NOTE(review): unary minus on size_t wraps; relies on add_pointer_offset turning it back into a negative offset - confirm
+
+      //
+      // Pipeline Loop: iteration iter_idx stores fragment iter_idx (if any) and visits fragment iter_idx - 1
+      //
+
+      #pragma unroll(IterationsUnroll ? kIterations : 1)
+      for (int iter_idx = 1; iter_idx < kIterations + 1; ++iter_idx) {
+
+        __syncthreads();
+
+        // The final iteration (iter_idx == kIterations) has no fragment left to store; it only visits the last one
+        if (iter_idx < kIterations) {
+          callbacks.begin_step(iter_idx);
+
+          acc2smem_source_needed<cutlass::make_index_sequence<kIterations>>::push(
+              iter_idx, accum_fragment_iterator, this->warp_tile_iterator_);
+
+          this->warp_tile_iterator_.add_pointer_offset(warp_iterator_offset);
+          warp_iterator_offset = -warp_iterator_offset;
+        }
+        
+        typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];
+
+        shared_load_iterator_.load(aligned_accum_fragment[0]);
+        // If the number of k-slices is > 1 - perform a reduction amongst the k-slices
+        if (kPartitionsK > 1) {
+
+          plus <typename SharedLoadIterator::Fragment> add_fragments;
+
+          CUTLASS_PRAGMA_UNROLL
+          for ( int i = 1; i < kPartitionsK; ++i) {
+            shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
+            shared_load_iterator_.load(aligned_accum_fragment[i]);
+            aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
+          }
+
+          shared_load_iterator_.add_pointer_offset((1 - kPartitionsK) * kSmemPointerOffset);
+        }
+        shared_load_iterator_.add_pointer_offset(smem_iterator_offset);
+        smem_iterator_offset = -smem_iterator_offset;
+        
+        //
+        // Iterate over output fragments
+        // (&aligned_accum_fragment below is the same address as &aligned_accum_fragment[0] used in the non-pipelined branch)
+
+        AccumulatorAccessType const *accum_frag_ptr =
+          reinterpret_cast<AccumulatorAccessType const *>(&aligned_accum_fragment);
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int idx = 0; idx < kAccumulatorFragmentCount; ++idx) {
+
+          int row_idx = idx / SharedLoadIterator::ThreadMap::Iterations::kColumn;
+          int col_idx = idx % SharedLoadIterator::ThreadMap::Iterations::kColumn;
+
+          // Start a new row of the output fragment
+          if (!col_idx) {
+            callbacks.begin_row(row_idx);
+          }
+
+          callbacks.visit(
+            iter_idx-1,
+            row_idx,
+            col_idx,
+            idx,
+            accum_frag_ptr[idx]
+          );
+
+          // End the row of the output fragment
+          if (col_idx + 1 == SharedLoadIterator::ThreadMap::Iterations::kColumn) {
+            callbacks.end_row(row_idx);
+          }
+        }
+
+        //
+        // Conclude the step
+        //
+
+        callbacks.end_step(iter_idx-1);
+      }
+    } else {
+
+      #pragma unroll(IterationsUnroll ? kIterations : 1)
+      for (int iter_idx = 0; iter_idx < kIterations; ++iter_idx) {
+
+        //
+        // Load the source
+        //
+
+        callbacks.begin_step(iter_idx);
+
+        //
+        // Convert and store fragment
+        //
+
+        __syncthreads();
+
+        acc2smem_source_needed<cutlass::make_index_sequence<kIterations>>::push(
+            iter_idx, accum_fragment_iterator, this->warp_tile_iterator_);
+
+        __syncthreads();
+
+        //
+        // Load fragments from shared memory
+        //
+
+        typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];
+
+        shared_load_iterator_.load(aligned_accum_fragment[0]);
+        // If the number of k-slices is > 1 - perform a reduction amongst the k-slices
+        if (kPartitionsK > 1) {
+
+          plus <typename SharedLoadIterator::Fragment> add_fragments;
+
+          CUTLASS_PRAGMA_UNROLL
+          for ( int i = 1; i < kPartitionsK; ++i) {
+            shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
+            shared_load_iterator_.load(aligned_accum_fragment[i]);
+            aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
+          }
+
+          shared_load_iterator_.add_pointer_offset((1 - kPartitionsK) * kSmemPointerOffset);
+        }
+
+        //
+        // Iterate over output fragments
+        //
+
+        AccumulatorAccessType const *accum_frag_ptr =
+          reinterpret_cast<AccumulatorAccessType const *>(&aligned_accum_fragment[0]);
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int idx = 0; idx < kAccumulatorFragmentCount; ++idx) {
+
+          int row_idx = idx / SharedLoadIterator::ThreadMap::Iterations::kColumn;
+          int col_idx = idx % SharedLoadIterator::ThreadMap::Iterations::kColumn;
+
+          // Start a new row of the output fragment
+          if (!col_idx) {
+            callbacks.begin_row(row_idx);
+          }
+
+          callbacks.visit(
+            iter_idx,
+            row_idx,
+            col_idx,
+            idx,
+            accum_frag_ptr[idx]
+          );
+
+          // End the row of the output fragment
+          if (col_idx + 1 == SharedLoadIterator::ThreadMap::Iterations::kColumn) {
+            callbacks.end_row(row_idx);
+          }
+        }
+
+        //
+        // Conclude the step
+        //
+
+        callbacks.end_step(iter_idx);
+      }
+    }
+
+    callbacks.end_epilogue();
+  }
+
+private:
+
+
+  template<class Seq>
+  struct acc2smem_source_needed;  // primary template; only the index_sequence specialization below is defined here
+  /// Maps a runtime step index onto a compile-time advance count: one helper<> instantiation exists per value in Seq
+  template <size_t... Seq>
+  struct acc2smem_source_needed<cutlass::index_sequence<Seq...>> {
+    template<int Advance>
+    CUTLASS_DEVICE
+    static void helper(AccumulatorFragmentIterator accum_fragment_iterator,  // taken by value: the caller's iterator is not advanced
+                       WarpTileIterator &warp_tile_iterator) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < Advance; i++) {
+        ++accum_fragment_iterator;
+      }
+      // Load the fragment the copy now points at and store it through the warp tile iterator
+      typename AccumulatorFragmentIterator::Fragment accum_fragment;
+      accum_fragment_iterator.load(accum_fragment);
+      warp_tile_iterator.store(accum_fragment);
+    }
+    /// Calls helper<S> for the S in Seq equal to pos; the && short-circuit skips every other S
+    CUTLASS_DEVICE
+    static void push(size_t pos,
+                     AccumulatorFragmentIterator const &iterator_begin,
+                     WarpTileIterator &warp_tile_iterator) {
+      int dummy[] = {(pos == Seq) && (helper<Seq>(iterator_begin, warp_tile_iterator), 0)...};  // array exists only to expand the pack; its values are never read
+    }
+  };
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_workspace.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_workspace.h
new file mode 100755
index 000000000..d41a0fa43
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_workspace.h
@@ -0,0 +1,197 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue for threadblock scoped GEMMs.
+
+  This does not attempt to target any particular output layout. Instead, each threadblock
+  streams out its accumulator elements using 128b store operations. This assumes all threadblocks
+  have unique output tiles.
+
+  The target data layout is:
+  - threadblock indices mapped to linear offsets as (m, n, k), where m is fastest-changing
+  - threadblock output space partitioned into warps; each warp's region is contiguous
+  - per-thread accumulators partitioned into 128b accesses
+  - output memory striped across the threads of a warp
+
+  This enables very fast streaming of data, completely limited by the memory system. No predication
+  or data exchange is performed, and each threadblock is assumed to have a full region of memory
+  to write to.
+
+  This epilogue establishes an upper bound for epilogue performance and is suitable for
+  reductions across the GEMM K dimension which require a separate workspace.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Streams each thread's accumulator fragment to a workspace using vector stores striped across the lanes of a warp.
+template <
+  typename Shape_,      ///< shape of accumulator tile (concept: MatrixShape)
+  int WarpCount,        ///< number of warps
+  typename FragmentC_   ///< accumulator fragment type (must provide value_type and kElements)
+>
+class EpilogueWorkspace {
+public:
+
+  using Shape = Shape_;
+  using FragmentC = FragmentC_;
+  using ElementC = typename FragmentC::value_type;
+
+  static int const kWarpCount = WarpCount;
+
+  /// Optimize for 128b accesses
+  static int const kAccessSizeInBits = 128;
+
+  /// Warp size from the perspective of memory operations
+  static int const kWarpSize = 32;
+
+  /// Vector length of accesses
+  static int const kElementsPerAccess = 
+    kAccessSizeInBits / sizeof_bits<ElementC>::value;
+
+  /// Number of stores per thread
+  static int const kIterations = FragmentC::kElements / kElementsPerAccess;
+
+  static_assert(
+    !(FragmentC::kElements % kElementsPerAccess), 
+    "The number of accumulators must be divisible by the access size.");
+
+  /// Total number of vectorized accesses in warp (in units of vector)
+  static int const kWarpAccesses = kIterations * kWarpSize;
+
+  /// Total number of vectorized accesses in threadblock tile (in units of vector)
+  static int const kThreadblockAccesses = kWarpAccesses * kWarpCount;
+
+  /// Parameters structure
+  struct Params {
+
+    /// Pointer to C matrix
+    ElementC *ptr_C;
+
+    /// Stride between tiles along the GEMM N dimension (in units of vectors)
+    int stride_n;
+
+    /// Stride between tiles along the GEMM K dimension (in units of vectors)
+    int stride_k;
+
+    //
+    // Methods
+    //
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementC *ptr_C,   ///< Pointer to C matrix
+      int stride_n_,      ///< Stride between tiles along the GEMM N dimension (in units of ElementC)
+      int stride_k_       ///< Stride between tiles along the GEMM K dimension (in units of ElementC)
+    ):
+      ptr_C(ptr_C), stride_n(stride_n_ / kElementsPerAccess), stride_k(stride_k_ / kElementsPerAccess) {
+      // Strides arrive in ElementC units and are stored in vector units (integer division by kElementsPerAccess)
+    }
+  };
+
+  /// Shared storage allocation needed by the epilogue
+  struct SharedStorage {
+    // Intentionally empty
+  };
+
+private:
+  /// One vector access: kElementsPerAccess elements aligned to kAccessSizeInBits / 8 bytes
+  struct alignas((kAccessSizeInBits / 8)) AccessType {
+    Array<ElementC, kElementsPerAccess> storage;
+  };
+
+  /// Output pointer viewed as AccessType vectors, with this thread's lane and warp offset already applied
+  AccessType *pointer_;
+
+  /// Stride between tiles along the n dimension (in vectors)
+  int stride_n_;
+
+  /// Stride between tiles along the k dimension (in vectors)
+  int stride_k_;
+
+public:
+
+  /// Constructor
+  CUTLASS_DEVICE
+  EpilogueWorkspace(
+    Params const &params,     ///< Host-constructable params object
+    SharedStorage &,          ///< Shared storage object
+    int warp_idx,             ///< ID of warp within threadblock
+    int lane_idx              ///< Id of thread within warp
+
+  ):
+    pointer_(reinterpret_cast<AccessType *>(params.ptr_C)),
+    stride_n_(params.stride_n), 
+    stride_k_(params.stride_k) {
+
+    // Add per-thread offset
+    pointer_ += lane_idx + warp_idx * kWarpAccesses;
+  }
+
+  /// Streams the result to global memory
+  CUTLASS_DEVICE
+  void operator()(
+    cutlass::gemm::GemmCoord problem_size,       ///< Problem size of GEMM (units of ElementC); not read by this function
+    cutlass::gemm::GemmCoord tb_tile_coord,      ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles)
+    FragmentC const &accum) {     ///< Accumulator tile
+    
+    // Compute offset for entire threadblock (note, per-thread offset has been folded in already)
+    AccessType *pointer = pointer_ + 
+      tb_tile_coord.m() * kThreadblockAccesses + 
+      tb_tile_coord.n() * stride_n_ +
+      tb_tile_coord.k() * stride_k_;
+
+    // Cast to vectorized view of accumulator fragments
+    AccessType const * src_pointer = reinterpret_cast<AccessType const *>(&accum);
+
+    // Write out accumulators at full speed: vector i of this thread goes to slot i * kWarpSize
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kIterations; ++i) {
+      pointer[i * kWarpSize] = src_pointer[i];
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitor_2x.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitor_2x.hpp
new file mode 100755
index 000000000..8b1cd4fd3
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitor_2x.hpp
@@ -0,0 +1,433 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+  \brief Visitor tree operation base implementation to enable composable fusions
+         for the CUTLASS 2x epilogue
+*/
+
+#pragma once
+
+#include "cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::epilogue::threadblock {
+
+using namespace cute;
+using cute::tuple;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+/// Base for 2.x visitor nodes: inherits the op storage of Sm90VisitorImplBase and forwards every epilogue hook to each op's callbacks.
+template <class... Ops>
+struct VisitorImpl2x: fusion::detail::Sm90VisitorImplBase<Ops...> {
+  using fusion::detail::Sm90VisitorImplBase<Ops...>::Sm90VisitorImplBase;
+  using fusion::detail::Sm90VisitorImplBase<Ops...>::ops;
+
+  template <class CallbacksTuple>
+  struct Callbacks {
+    // Callbacks can store non-persistent variables (e.g. tensors) or copies of persistent variables
+    CallbacksTuple callbacks_tuple;
+
+    /// Called at the start of the epilogue just before iterating over accumulator slices
+    CUTLASS_DEVICE void
+    begin_epilogue() {
+      for_each(callbacks_tuple,
+        [] (auto& callbacks) {
+          callbacks.begin_epilogue();
+        }
+      );
+    }
+
+    /// Called at the start of one step before starting accumulator exchange
+    CUTLASS_DEVICE void
+    begin_step(int step_idx) {
+      for_each(callbacks_tuple,
+        [&] (auto& callbacks) {
+          callbacks.begin_step(step_idx);
+        }
+      );
+    }
+
+    /// Called at the start of a row
+    CUTLASS_DEVICE void
+    begin_row(int row_idx) {
+      for_each(callbacks_tuple,
+        [&] (auto& callbacks) {
+          callbacks.begin_row(row_idx);
+        }
+      );
+    }
+
+    /// Called after accumulators have been exchanged for each accumulator vector
+    template <typename ElementAccumulator, typename... ElementInputs, int FragmentSize>
+    CUTLASS_DEVICE auto // returns an Array
+    visit(int iter_idx, int row_idx, int column_idx, int frg_idx,
+          Array<ElementAccumulator, FragmentSize> const& frg_acc,
+          Array<ElementInputs, FragmentSize> const&... frg_inputs) // depends on the N-naryness of the op
+      = delete; // Must be implemented for each operation
+
+    /// Called at the end of a row
+    CUTLASS_DEVICE void
+    end_row(int row_idx) {
+      for_each(callbacks_tuple,
+        [&] (auto& callbacks) {
+          callbacks.end_row(row_idx);
+        }
+      );
+    }
+
+    /// Called after all accumulator elements have been visited
+    CUTLASS_DEVICE void
+    end_step(int step_idx) {
+      for_each(callbacks_tuple,
+        [&] (auto& callbacks) {
+          callbacks.end_step(step_idx);
+        }
+      );
+    }
+
+    /// Called after all steps have been completed
+    CUTLASS_DEVICE void
+    end_epilogue() {
+      for_each(callbacks_tuple,
+        [] (auto& callbacks) {
+          callbacks.end_epilogue();
+        }
+      );
+    }
+  };
+
+  // Callbacks factory: asks every op for its callbacks and bundles the results into one Callbacks tuple
+  // All operations must redefine this
+  template <class ProblemShape>
+  CUTLASS_DEVICE auto
+  get_callbacks(
+    gemm::GemmCoord threadblock_tile_offset,
+    int thread_idx,
+    ProblemShape problem_shape
+  ) {
+    return transform_apply(ops,
+      [&] (auto& op) {
+        return op.get_callbacks(
+          threadblock_tile_offset,
+          thread_idx,
+          problem_shape);
+      },
+      [] (auto&&... callbacks) {
+        auto callbacks_tuple = cute::make_tuple(callbacks...);
+        return Callbacks<decltype(callbacks_tuple)>{callbacks_tuple};
+      }
+    );
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Convenience aliases
+using EmptyCallbacks = VisitorImpl2x<>::Callbacks<cute::tuple<>>;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace detail
+
+using namespace detail;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Tree visitor
+//   The child ops visit the accumulator fragment first; their results are then passed,
+//   in order, to the node op (stored last in the callbacks tuple) as additional inputs.
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <class NodeOp, class... ChildOps>
+struct TreeVisitor2x : VisitorImpl2x<ChildOps..., NodeOp> {
+
+  using VisitorImpl2x<ChildOps..., NodeOp>::VisitorImpl2x;
+
+  template<class CallbacksImpl>
+  struct Callbacks : CallbacksImpl {
+    CUTLASS_DEVICE
+    Callbacks(CallbacksImpl&& impl)
+      : CallbacksImpl(cute::forward<CallbacksImpl>(impl)) {}
+
+    using CallbacksImpl::callbacks_tuple;
+
+    template <typename ElementAccumulator, int FragmentSize>
+    CUTLASS_DEVICE auto
+    visit(int iter_idx, int row_idx, int column_idx, int frg_idx,
+          Array<ElementAccumulator, FragmentSize> const& frg_acc) {
+      constexpr int Rm1 = sizeof...(ChildOps);  // index of the node op's callbacks (the children occupy 0 .. Rm1-1)
+      return cute::detail::tapply(callbacks_tuple,
+        [&] (auto& child_callbacks) {  // applied to each child selected by make_seq<Rm1>
+          return child_callbacks.visit(iter_idx, row_idx, column_idx, frg_idx, frg_acc);
+        },
+        [&] (auto&&... frg_inputs) {   // receives the children's results and evaluates the node op
+          return get<Rm1>(callbacks_tuple).visit(iter_idx, row_idx, column_idx, frg_idx, frg_acc, frg_inputs...);
+        },
+        make_seq<Rm1>{}
+      );
+    }
+  };
+
+  // Callbacks factory: wraps the base-class callbacks in this struct's Callbacks (the decltype operand is not evaluated)
+  template <class ProblemShape>
+  CUTLASS_DEVICE auto
+  get_callbacks(
+    gemm::GemmCoord threadblock_tile_offset,
+    int thread_idx,
+    ProblemShape problem_shape
+  ) {
+    return Callbacks<
+    decltype(VisitorImpl2x<ChildOps..., NodeOp>::
+      get_callbacks(
+        threadblock_tile_offset,
+        thread_idx,
+        problem_shape
+      ))>(
+      VisitorImpl2x<ChildOps..., NodeOp>::
+      get_callbacks(
+        threadblock_tile_offset,
+        thread_idx,
+        problem_shape
+      )
+    );
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template<
+  class ElementCompute,   // element type every intermediate result is converted to
+  class EdgeTuple,        // static tuple with one entry per op: the indices of the results fed to that op
+  class... Ops            // ops in topological order; the result of the last one is what visit() returns
+>
+struct TopologicalVisitor2x : VisitorImpl2x<Ops...> {
+  static_assert(is_static_v<EdgeTuple>);
+  static_assert(cute::rank(EdgeTuple{}) == sizeof...(Ops));
+  static_assert(sizeof...(Ops) > 1);
+
+  using VisitorImpl2x<Ops...>::VisitorImpl2x;
+
+  template<class CallbacksImpl>
+  struct Callbacks : CallbacksImpl {
+    CUTLASS_DEVICE
+    Callbacks(CallbacksImpl&& impl)
+      : CallbacksImpl(cute::forward<CallbacksImpl>(impl)) {}
+
+    using CallbacksImpl::callbacks_tuple;
+
+    template <typename ElementAccumulator, int FragmentSize>
+    CUTLASS_DEVICE auto
+    visit(int iter_idx, int row_idx, int column_idx, int frg_idx,
+          Array<ElementAccumulator, FragmentSize> const& frg_acc) {
+      constexpr int Rm1 = sizeof...(Ops) - 1;  // index of the last op
+      auto frg_compute_tuple = cute::repeat<Rm1>(Array<ElementCompute, FragmentSize>{});  // one result slot per non-final op
+
+      return cute::detail::tapply(EdgeTuple{}, callbacks_tuple, frg_compute_tuple,
+        // Visit the first R-1 ops in topological order
+        [&] (auto&& edge_seq, auto& callbacks, auto& frg_compute) {
+          frg_compute = cute::detail::apply(frg_compute_tuple,
+          // Compute the current op with children inputs
+          [&] (auto const&... frg_inputs) {
+            auto frg_output = callbacks.visit(iter_idx, row_idx, column_idx, frg_idx, frg_acc, frg_inputs...);
+            using ElementOutput = typename decltype(frg_output)::Element;
+            using ConvertOutput = NumericArrayConverter<ElementCompute, ElementOutput, FragmentSize>;
+            ConvertOutput convert_output{};
+
+            return convert_output(frg_output);
+          },
+          // Get inputs in the sequence given by the children indices of the current op
+          edge_seq
+        );
+        return frg_compute;
+      },
+      // Visit the last op
+      [&] (auto const&...ops) {  // the pack is not read; the last op is reached through get<Rm1>(callbacks_tuple)
+        return cute::detail::apply(frg_compute_tuple,
+          // Compute the last op with children inputs
+          [&] (auto const&... frg_inputs) {
+            return get<Rm1>(callbacks_tuple).visit(iter_idx, row_idx, column_idx, frg_idx, frg_acc, frg_inputs...);
+          },
+          // Get inputs in the sequence given by the children indices of the last op
+          get<Rm1>(EdgeTuple{})
+        );
+      },
+      // Transform to visit R-1 ops, apply to visit last op
+      make_seq<Rm1>{}
+      );
+    }
+  };
+
+  // Callbacks factory: wraps the base-class callbacks in this struct's Callbacks (the decltype operand is not evaluated)
+  template <class ProblemShape>
+  CUTLASS_DEVICE auto
+  get_callbacks(
+    gemm::GemmCoord threadblock_tile_offset,
+    int thread_idx,
+    ProblemShape problem_shape
+  ) {
+    return Callbacks<decltype(
+      VisitorImpl2x<Ops...>::
+      get_callbacks(
+        threadblock_tile_offset,
+        thread_idx,
+        problem_shape
+      ))>(
+      VisitorImpl2x<Ops...>::
+      get_callbacks(
+        threadblock_tile_offset,
+        thread_idx,
+        problem_shape
+      )
+    );
+  }
+};
+
+// Sm80-prefixed alias for TreeVisitor2x
+template <class NodeOp, class... ChildOps>
+using Sm80EVT = TreeVisitor2x<NodeOp, ChildOps...>;
+// Sm80-prefixed alias for TopologicalVisitor2x
+template<
+  class ElementCompute,
+  class EdgeTuple,
+  class... Ops
+>
+using Sm80TopologicalVisitor = TopologicalVisitor2x<ElementCompute, EdgeTuple, Ops...>;
+
+// Shorthand for Underscore; used in Step<_1,_1, X> inside OutputTileThreadLayout::partition
+using X = Underscore;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// OutputTileThreadLayout translates the CUTLASS 2.X OutputTileOptimalThreadMap into the cute layout
+// used by the CUTLASS 3.X Epilogue
+template <
+  typename ThreadblockShape_,
+  typename WarpShape_,
+  typename Element_,
+  int ElementsPerAccess,
+  int Stages_=1
+>
+struct OutputTileThreadLayout: DefaultThreadMapTensorOp<
+  ThreadblockShape_,
+  WarpShape_,
+  ThreadblockShape_::kK/WarpShape_::kK,
+  Element_,
+  ElementsPerAccess>::Type {
+
+  using Base = typename DefaultThreadMapTensorOp<
+    ThreadblockShape_,
+    WarpShape_,
+    ThreadblockShape_::kK/WarpShape_::kK,
+    Element_,
+    ElementsPerAccess>::Type;
+  using Base::Base;
+
+  // Software pipeline stages in epilogue
+  static_assert(Stages_ <= 2, "Sm80 EVT only support upto 2 Stages.");
+  static const int Stages = Stages_;
+  // Shape used by tid2coord() to split a linear thread index into five coordinates
+  using ThreadShape = cute::Shape<
+    cute::Int<Base::Detail::kAccessWidth>,                 // lane col idx
+    cute::Int<Base::Detail::kAccessRows>,                  // lane row idx
+    cute::Int<Base::Detail::kWarpsRemainingForRows>,       // warp row idx
+    cute::Int<Base::Shape::kGroup>,                        // group idx
+    cute::Int<Base::Shape::kCluster>                       // cluster idx
+  >;
+
+  using Shape = typename Base::Shape;
+  using Count = typename Base::Count;
+  // Eleven-mode shape composed with the tile layout in partition(); the thread coordinates index modes 1, 3, 5, 8 and 10
+  using ThreadMapShape = cute::Shape<
+    // Column
+    Int<Base::kElementsPerAccess>,                // vector
+    Int<Base::Detail::kAccessWidth>,              // lane_col_coord
+    Int<Base::Iterations::kColumn>,               // iteration::column
+    // Row
+    Int<Base::Detail::kAccessRows>,               // lane_row_coord
+    Int<Base::Iterations::kRow>,                  // iterations in row
+    Int<Base::Detail::kWarpsRemainingForRows>,    // warp_row_coord
+    Int<Count::kRow>,                             // iteration::row
+    Int<Count::kGroup>,                           // iteration::group
+    Int<Shape::kGroup>,                           // group_coord
+    Int<Count::kCluster>,                         // iteration::cluster
+    Int<Shape::kCluster>                          // cluster_coord
+  >;
+
+  // The shape of CTA Tile
+  using CtaShapeMNL = cute::Shape<
+    Int<
+      Shape::kRow * Count::kRow *
+      Shape::kGroup * Count::kGroup *
+      Shape::kCluster * Count::kCluster
+    >,
+    Int<Shape::kColumn * Count::kColumn>,
+    _1
+  >;
+
+  static const int kElementsPerAccess = ElementsPerAccess;
+
+  //
+  // Methods
+  //
+  /// Splits a linear thread index into (lane col, lane row, warp row, group, cluster) coordinates of ThreadShape
+  CUTLASS_DEVICE
+  static auto tid2coord(int thread_idx) {
+    return cute::idx2crd(thread_idx, ThreadShape{});
+  }
+  /// Returns the part of xT that thread_idx addresses inside the tile selected by threadblock_tile_offset
+  template <class TensorInput>
+  CUTLASS_DEVICE
+  static auto partition(TensorInput &&xT, int thread_idx, gemm::GemmCoord threadblock_tile_offset) {
+
+    // (BLK_M,BLK_N)
+    Tensor bCxT = local_tile(
+      xT, CtaShapeMNL{}, make_coord(_,_,_), Step<_1,_1, X>{}
+    )(_,_,threadblock_tile_offset.m(),threadblock_tile_offset.n(),threadblock_tile_offset.k());
+
+    auto [lane_col_coord, lane_row_coord, warp_row_coord, group_coord, cluster_coord] = tid2coord(thread_idx);
+
+    // transform to column-major: swap the two tile modes, then compose with the thread-map shape
+    Tensor bCxT_nm = make_tensor(
+      std::forward<decltype(bCxT)>(bCxT).data(), make_layout(get<1>(bCxT.layout()), get<0>(bCxT.layout()))
+    ).compose(make_layout(ThreadMapShape{}));
+    // VECTOR, FRAGMENT_COLUMN, FRAGMENT_ROW, ITERATION_ROW, ITERATION_GROUP, ITERATION_CLUSTER
+    return bCxT_nm(_,lane_col_coord,_,lane_row_coord,_,warp_row_coord,_,_,group_coord,_,cluster_coord);
+  }
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::epilogue::threadblock
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitor_compute.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitor_compute.hpp
new file mode 100755
index 000000000..69a0feab2
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitor_compute.hpp
@@ -0,0 +1,109 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+  \brief Visitor tree compute operations for the CUTLASS 2x epilogue
+*/
+
+#pragma once
+
+#include "cutlass/epilogue/threadblock/fusion/visitor_2x.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::epilogue::threadblock {
+
+using namespace cute;
+using namespace detail;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// N-nary Elementwise Compute Operation
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<
+  template <class> class ComputeFn,
+  class ElementOutput,
+  class ElementCompute,
+  FloatRoundStyle RoundStyle,
+  class = void
+>
+struct VisitorCompute : VisitorImpl2x<> {
+  // Elementwise node: converts its input fragments to ElementCompute, applies ComputeFn, converts to ElementOutput.
+  using VisitorImpl2x<>::VisitorImpl2x;
+
+  struct Callbacks : EmptyCallbacks {
+    template <typename ElementAccumulator, typename... ElementInputs, int FragmentSize>
+    CUTLASS_DEVICE Array<ElementOutput, FragmentSize>
+    visit(int iter_idx, int row_idx, int column_idx, int frg_idx,
+          Array<ElementAccumulator, FragmentSize> const& frg_acc,
+          Array<ElementInputs, FragmentSize> const&... frg_inputs) {
+      // Applied to each input fragment: convert it to ElementCompute.
+      auto to_compute_type = [&] (auto&& frg_input) {
+        using ElementInput = typename cute::remove_cvref_t<decltype(frg_input)>::Element;
+        using InputConverter = NumericArrayConverter<ElementCompute, ElementInput, FragmentSize, RoundStyle>;
+        InputConverter input_converter{};
+        return input_converter(frg_input);
+      };
+      // Applied once to all converted fragments: run ComputeFn, then convert to ElementOutput.
+      auto compute_then_convert = [&] (auto&&... cvt_frg_inputs) {
+        using Compute = ComputeFn<Array<ElementCompute, FragmentSize>>;
+        using OutputConverter = NumericArrayConverter<ElementOutput, ElementCompute, FragmentSize, RoundStyle>;
+        Compute compute{};
+        OutputConverter output_converter{};
+        return output_converter(compute(cvt_frg_inputs...));
+      };
+      return transform_apply(cute::make_tuple(frg_inputs...), to_compute_type, compute_then_convert);
+    }
+
+  };
+
+  // The callbacks hold no state, so none of the arguments are consulted.
+  template <class ProblemShape>
+  CUTLASS_DEVICE auto
+  get_callbacks(
+    gemm::GemmCoord threadblock_tile_offset,
+    int thread_idx,
+    ProblemShape problem_shape
+  ) {
+    return Callbacks{};
+  }
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::epilogue::threadblock
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitor_load.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitor_load.hpp
new file mode 100755
index 000000000..7a332f11f
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitor_load.hpp
@@ -0,0 +1,583 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+  \brief Visitor tree load operations for the CUTLASS 2x epilogue
+*/
+
+#pragma once
+
+#include "cutlass/epilogue/threadblock/fusion/visitor_2x.hpp"
+#include "cute/tensor.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::epilogue::threadblock {
+
+using namespace cute;
+using namespace detail;
+
+using X = Underscore;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Elementwise Fetch Operations
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Visitor node that hands the accumulator fragment through unchanged.
+struct VisitorAccFetch : VisitorImpl2x<> {
+  // Inherit the constructors of the operand-less VisitorImpl2x<> base.
+  using VisitorImpl2x<>::VisitorImpl2x;
+
+  struct Callbacks : EmptyCallbacks {
+    // Returns a copy of frg_acc; the iteration/row/column/fragment indices are ignored.
+    template <class ElementAccumulator, int FragmentSize>
+    CUTLASS_DEVICE Array<ElementAccumulator, FragmentSize>
+    visit(int iter_idx, int row_idx, int column_idx, int frg_idx,
+          Array<ElementAccumulator, FragmentSize> const& frg_acc) {
+      return frg_acc;
+    }
+  };
+  // Builds the per-thread callbacks; none of the arguments are used.
+  template <class ProblemShape>
+  CUTLASS_DEVICE auto
+  get_callbacks(gemm::GemmCoord threadblock_tile_offset,
+                int thread_idx,
+                ProblemShape problem_shape) {
+    return Callbacks{};
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Broadcast Load Operations
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Scalar broadcast: visit() returns a fragment whose elements all hold the same scalar.
+template<
+  class Element,
+  class StrideMNL = Stride<_0,_0,_0>,
+  int BroadcastCount = 1,
+  template <class> class ReductionFn = multiplies
+>
+struct VisitorScalarBroadcast {
+  static_assert(
+    (cute::is_same_v<StrideMNL, Stride<_0,_0,_0>>) || // scalar broadcast, e.g. alpha
+    (cute::is_same_v<StrideMNL, Stride<_0,_0,_1>>) ||
+    (cute::is_same_v<StrideMNL, Stride<_0,_0,int>>));  // batched scalar broadcast, e.g. per-batch alpha
+
+  struct SharedStorage { };
+
+  struct Arguments {
+    Element scalars[BroadcastCount] = {};             // immediate values, used where the matching pointer is null
+    Element const* scalar_ptrs[BroadcastCount] = {};  // optional pointers, read instead of `scalars` when non-null
+    StrideMNL dScalar = {};                           // only its third mode is read (as the batch stride)
+  };
+
+  using Params = Arguments;
+
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    return args;
+  }
+
+  template <class ProblemShape>
+  static size_t
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
+    return 0;
+  }
+
+  CUTLASS_HOST_DEVICE
+  VisitorScalarBroadcast() { }
+
+  CUTLASS_HOST_DEVICE
+  VisitorScalarBroadcast(Params const& params, SharedStorage const& shared_storage)
+      : params_ptr(&params) {
+    constexpr bool kNonBatched = cute::is_same_v<StrideMNL, Stride<_0,_0,_0>>;
+    if constexpr (kNonBatched) {  // independent of the tile offset: resolve the scalar once here
+      update_scalar();
+    }
+  }
+
+  Element scalar;
+  Params const* params_ptr;
+
+  struct Callbacks: EmptyCallbacks {
+    CUTLASS_DEVICE
+    Callbacks(Element scalar)
+      : scalar(scalar) {}
+
+    Element scalar;
+
+    template <class ElementAccumulator, int FragmentSize>
+    CUTLASS_DEVICE auto // returns an Array
+    visit(int iter_idx, int row_idx, int column_idx, int frg_idx,
+          Array<ElementAccumulator, FragmentSize> const& frg_acc) {
+      // Every element of the returned fragment carries the same scalar.
+      Array<Element, FragmentSize> replicated;
+      replicated.fill(scalar);
+      return replicated;
+    }
+  };
+
+  template <class ProblemShape>
+  CUTLASS_DEVICE auto
+  get_callbacks(
+    gemm::GemmCoord threadblock_tile_offset,
+    int thread_idx,
+    ProblemShape problem_shape
+  ) {
+    // Batched strides select the scalar by the tile offset's k() coordinate, so resolve it per call.
+    constexpr bool kBatched =
+      cute::is_same_v<StrideMNL, Stride<_0,_0,_1>> ||
+      cute::is_same_v<StrideMNL, Stride<_0,_0,int>>;
+    if constexpr (kBatched) { update_scalar(threadblock_tile_offset.k()); }
+    // The callbacks receive a copy of the resolved scalar.
+    return Callbacks(scalar);
+  }
+
+private:
+  // Resolves `scalar` for batch entry `l_coord`. Each broadcast contributes the value
+  // behind its pointer when that pointer is non-null, and its immediate value otherwise;
+  // contributions 1..BroadcastCount-1 are folded into contribution 0 with ReductionFn.
+  CUTLASS_DEVICE void
+  update_scalar(int l_coord = 0) {
+    Params const& args = *params_ptr;
+    // Element offset applied to every non-null scalar pointer.
+    int const l_offset = l_coord * size<2>(args.dScalar);
+
+    // Broadcast 0 seeds the result (batch stride is ignored for nullptr fallback).
+    Element const* seed_ptr = args.scalar_ptrs[0];
+    scalar = (seed_ptr != nullptr) ? seed_ptr[l_offset] : args.scalars[0];
+
+    // Do reduction over multiple broadcasts if necessary
+    ReductionFn<Element> reduction_fn;
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 1; i < BroadcastCount; ++i) {
+      // Same pointer-or-immediate choice as for the seed.
+      Element const* operand_ptr = args.scalar_ptrs[i];
+      Element const operand = (operand_ptr != nullptr) ? operand_ptr[l_offset] : args.scalars[i];
+      scalar = reduction_fn(scalar, operand);
+    }
+  }
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Elementwise Load Operations
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<
+  class ThreadMap,
+  class Element,
+  class StrideMNL
+>
+struct VisitorAuxLoad{
+  // Reads an auxiliary tensor from global memory: begin_step() stages one step's vectors, visit() returns them per fragment.
+  struct Arguments {
+    Element* ptr_aux = nullptr;         // base pointer of the auxiliary tensor (wrapped with make_gmem_ptr below)
+    Element null_default = Element(0);  // not read by this visitor
+    StrideMNL dAux = {};                // stride paired with the problem shape to build the (M,N,L) tensor
+  };
+
+  using Params = Arguments;
+
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    return args;  // Params is Arguments, so the arguments pass through unchanged
+  }
+
+  template <class ProblemShape>
+  static size_t
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
+    return 0;  // no workspace is requested
+  }
+
+  // Software pipeline stages: the staging tensor has this many slots, selected with (index % Stages)
+  static const int Stages = ThreadMap::Stages;
+
+  struct SharedStorage {};
+
+  // Global load type: an unsigned integer of min(128, vec_bits) bits, i.e. VecLength elements per access
+  static int constexpr vec_bits = ThreadMap::kElementsPerAccess * sizeof_bits<Element>::value;
+  using VecType = uint_bit_t<cute::min(128, vec_bits)>;
+  static int constexpr VecLength = sizeof(VecType) / sizeof(Element);
+
+  CUTLASS_HOST_DEVICE
+  VisitorAuxLoad() { }
+
+  CUTLASS_HOST_DEVICE
+  VisitorAuxLoad(Params const& params, SharedStorage const& shared_storage)
+    : params_ptr(&params) { }
+
+  Params const* params_ptr;
+  // Per-thread state: the global source (g), the staging copy (r) and the matching coordinates (c).
+  template <class GTensor, class RTensor, class CTensor, class ProblemShape>
+  struct Callbacks : EmptyCallbacks {
+    CUTLASS_DEVICE
+    Callbacks(
+      GTensor&& tC_gAux,
+      RTensor&& tC_rAux,
+      CTensor&& tC_cAux,
+      ProblemShape problem_shape,
+      Params const* params_ptr
+    ):  // initializers are listed in member declaration order (params_ptr before problem_shape)
+      tC_gAux(cute::forward<GTensor>(tC_gAux)),
+      tC_rAux(cute::forward<RTensor>(tC_rAux)),
+      tC_cAux(cute::forward<CTensor>(tC_cAux)),
+      params_ptr(params_ptr),
+      problem_shape(problem_shape) { }
+
+    GTensor tC_gAux;
+    RTensor tC_rAux;
+    CTensor tC_cAux;
+    Params const* params_ptr;
+    ProblemShape problem_shape;
+    // Loads the vectors of step `step_idx` into staging slot (step_idx % Stages).
+    CUTLASS_DEVICE void
+    begin_step(int step_idx) {
+      clear(tC_rAux(_,_,_,step_idx%Stages));  // zero the slot first; the guarded loads below then fill it
+      auto src_v = filter(tC_gAux(_,_,_,step_idx));
+      auto coord_v = filter(tC_cAux(_,_,_,step_idx));
+      auto dst_v = filter(tC_rAux(_,_,_,step_idx%Stages));
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < size(src_v); ++i) {
+        bool guard = elem_less(coord_v(i), problem_shape);  // true only when the coordinate lies inside problem_shape
+        cutlass::arch::global_load<VecType, sizeof(VecType)>(dst_v(i), (void const*)&src_v(i), guard);
+      }
+    }
+    // Returns fragment `frg_idx` of the staging slot selected by (iter_idx % Stages).
+    template <class ElementAccumulator, int FragmentSize>
+    CUTLASS_DEVICE auto // returns an Array
+    visit(int iter_idx, int row_idx, int column_idx, int frg_idx,
+          Array<ElementAccumulator, FragmentSize> const& frg_acc) {
+      Tensor tC_rAux_frg = recast<Array<Element, FragmentSize>>(coalesce(tC_rAux(_,_,_,iter_idx%Stages)));
+      return tC_rAux_frg(frg_idx);
+    }
+  };
+  // Partitions the auxiliary tensor and its coordinate tensor for this thread and wraps them in Callbacks.
+  template <class ProblemShape>
+  CUTLASS_DEVICE auto
+  get_callbacks(
+    gemm::GemmCoord threadblock_tile_offset,
+    int thread_idx,
+    ProblemShape problem_shape
+  ) {
+    Tensor mAux = make_tensor(
+      make_gmem_ptr(params_ptr->ptr_aux),
+      problem_shape,
+      params_ptr->dAux);   // (M,N,L)
+    // VECTOR, FRAGMENT_COLUMN, FRAGMENT_ROW, (ITERATION_ROW, ITERATION_GROUP, ITERATION_CLUSTER) grouped into one mode
+    Tensor tC_gAux = recast<VecType>(
+      group_modes<3,6>(ThreadMap::partition(mAux, thread_idx, threadblock_tile_offset)));
+    // VECTOR, FRAGMENT_COLUMN, FRAGMENT_ROW, Stages
+    Tensor tC_rAux = make_tensor<VecType>(
+      make_layout(flatten(make_shape(take<0,3>(tC_gAux.shape()), Int<Stages>{}))));
+
+    // Generate the pred tensor: identity coordinates partitioned like mAux, one coordinate per VecLength elements
+    Tensor cAux = make_identity_tensor(mAux.shape());
+    Tensor tC_cAux = outer_partition(
+      group_modes<3,6>(ThreadMap::partition(cAux, thread_idx, threadblock_tile_offset)),
+      Shape<Int<VecLength>>{},
+      (_0{})
+    );
+
+    return Callbacks<
+      decltype(tC_gAux), decltype(tC_rAux),
+      decltype(tC_cAux), ProblemShape>(
+      cute::move(tC_gAux),
+      cute::move(tC_rAux),
+      cute::move(tC_cAux),
+      problem_shape,
+      params_ptr
+    );
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Row vector broadcast: the vector is loaded once in begin_epilogue() and visit() returns its fragment for column_idx
+template<
+  class ThreadMap,
+  class Element,
+  class StrideMNL
+>
+struct VisitorRowBroadcast {
+
+  struct Arguments {
+    Element const* ptr_row = nullptr;   // base pointer of the row vector (wrapped with make_gmem_ptr below)
+    Element null_default = Element(0);  // not read by this visitor
+    StrideMNL dRow = {};                // stride paired with the problem shape to build the tensor view
+  };
+
+  using Params = Arguments;
+
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    return args;  // Params is Arguments, so the arguments pass through unchanged
+  }
+
+  template <class ProblemShape>
+  static size_t
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
+    return 0;  // no workspace is requested
+  }
+
+  struct SharedStorage {};
+
+  // Global load type: an unsigned integer of min(128, vec_bits) bits, i.e. VecLength elements per access
+  static int constexpr vec_bits = ThreadMap::kElementsPerAccess * sizeof_bits<Element>::value;
+  using VecType = uint_bit_t<cute::min(128, vec_bits)>;
+  static int constexpr VecLength = sizeof(VecType) / sizeof(Element);
+
+  CUTLASS_HOST_DEVICE
+  VisitorRowBroadcast() { }
+
+  CUTLASS_HOST_DEVICE
+  VisitorRowBroadcast(Params const& params, SharedStorage const& shared_storage)
+    : params_ptr(&params) { }
+
+  Params const* params_ptr;
+  // Per-thread state: the global source (g), the staging copy (r) and the matching coordinates (c).
+  template <class GTensor, class RTensor, class CTensor, class ProblemShape>
+  struct Callbacks : EmptyCallbacks {
+    CUTLASS_DEVICE
+    Callbacks(
+      GTensor&& tC_gRow,
+      RTensor&& tC_rRow,
+      CTensor&& tC_cRow,
+      ProblemShape problem_shape,
+      Params const* params_ptr
+    ):  // initializers are listed in member declaration order (params_ptr before n)
+      tC_gRow(cute::forward<GTensor>(tC_gRow)),
+      tC_rRow(cute::forward<RTensor>(tC_rRow)),
+      tC_cRow(cute::forward<CTensor>(tC_cRow)),
+      params_ptr(params_ptr),
+      n(get<1>(problem_shape)) { }
+
+    GTensor tC_gRow;
+    RTensor tC_rRow;
+    CTensor tC_cRow;
+    Params const* params_ptr;
+    int n;  // second extent of the problem shape; bound for the column check below
+    // Loads this thread's part of the row vector; only the second (N) coordinate is bounds-checked.
+    CUTLASS_DEVICE void
+    begin_epilogue() {
+      clear(tC_rRow);  // zero the staging copy first; the guarded loads below then fill it
+      auto src_v = filter(tC_gRow);
+      auto coord_v = filter(tC_cRow);
+      auto dst_v = filter(tC_rRow);
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < size(src_v); ++i) {
+        bool guard = get<1>(coord_v(i)) < n;
+        cutlass::arch::global_load<VecType, sizeof(VecType)>(dst_v(i), (void const*)&src_v(i), guard);
+      }
+    }
+    // Returns the staged fragment for `column_idx`; the iteration, row and fragment indices are not used.
+    template <class ElementAccumulator, int FragmentSize>
+    CUTLASS_DEVICE auto // returns an Array
+    visit(int iter_idx, int row_idx, int column_idx, int frg_idx,
+          Array<ElementAccumulator, FragmentSize> const& frg_acc) {
+      Tensor rRow_frg = recast<Array<Element, FragmentSize>>(coalesce(tC_rRow));
+      return rRow_frg(column_idx);
+    }
+  };
+  // Partitions the row vector and its coordinate tensor for this thread and wraps them in Callbacks.
+  template <class ProblemShape>
+  CUTLASS_DEVICE auto
+  get_callbacks(
+    gemm::GemmCoord threadblock_tile_offset,
+    int thread_idx,
+    ProblemShape problem_shape
+  ) {
+    Tensor mRow = make_tensor(
+      make_gmem_ptr(params_ptr->ptr_row),
+      problem_shape,
+      params_ptr->dRow);
+
+    // VECTOR, FRAGMENT_COLUMN (the four remaining partition modes are fixed at index 0)
+    Tensor tC_gRow = recast<VecType>(
+      ThreadMap::partition(mRow, thread_idx, threadblock_tile_offset)
+    )(_,_,_0{},_0{},_0{},_0{});
+    Tensor tC_rRow = make_tensor_like(tC_gRow);
+
+    // Generate the pred tensor: identity coordinates sliced like tC_gRow, one coordinate per VecLength elements
+    Tensor cRow = make_identity_tensor(mRow.shape());
+    Tensor tC_cRow = outer_partition(
+      ThreadMap::partition(cRow, thread_idx, threadblock_tile_offset)(_,_,_0{},_0{},_0{},_0{}),
+      Shape<Int<VecLength>>{},
+      (_0{})
+    );
+
+    return Callbacks<
+      decltype(tC_gRow), decltype(tC_rRow),
+      decltype(tC_cRow), ProblemShape>(
+      cute::move(tC_gRow),
+      cute::move(tC_rRow),
+      cute::move(tC_cRow),
+      problem_shape,
+      params_ptr
+    );
+  }
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Column vector broadcast: visit() fills the whole fragment with the single value staged for (row_idx, iter_idx)
+template<
+  class ThreadMap,
+  class Element,
+  class StrideMNL = Stride<_1,_0,_0>
+>
+struct VisitorColBroadcast {
+
+  struct Arguments {
+    Element const* ptr_col = nullptr;   // base pointer of the column vector (wrapped with make_gmem_ptr below)
+    Element null_default = Element(0);  // not read by this visitor
+    StrideMNL dCol = {};                // stride paired with the problem shape to build the tensor view
+  };
+
+  using Params = Arguments;
+
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    return args;  // Params is Arguments, so the arguments pass through unchanged
+  }
+
+  template <class ProblemShape>
+  static size_t
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
+    return 0;  // no workspace is requested
+  }
+
+  struct SharedStorage { };
+
+  CUTLASS_HOST_DEVICE
+  VisitorColBroadcast() { }
+
+  CUTLASS_HOST_DEVICE
+  VisitorColBroadcast(Params const& params, SharedStorage const& shared_storage)
+    : params_ptr(&params) { }
+
+  Params const* params_ptr;
+  // Per-thread state: the global source (g), the staging copy (r) and the matching coordinates (c).
+  template <class GTensor, class RTensor, class CTensor, class ProblemShape>
+  struct Callbacks : EmptyCallbacks {
+    CUTLASS_DEVICE
+    Callbacks(
+      GTensor&& tC_gCol,
+      RTensor&& tC_rCol,
+      CTensor&& tC_cCol,
+      ProblemShape problem_shape,
+      Params const* params_ptr
+    ):  // initializers are listed in member declaration order (params_ptr before m)
+      tC_gCol(cute::forward<GTensor>(tC_gCol)),
+      tC_rCol(cute::forward<RTensor>(tC_rCol)),
+      tC_cCol(cute::forward<CTensor>(tC_cCol)),
+      params_ptr(params_ptr),
+      m(get<0>(problem_shape)) { }
+
+    GTensor tC_gCol;
+    RTensor tC_rCol;
+    CTensor tC_cCol;
+    Params const* params_ptr;
+    int m;  // first extent of the problem shape; bound for the row check below
+    // Copies this thread's column values into the staging tensor; only the first (M) coordinate is bounds-checked.
+    CUTLASS_DEVICE void
+    begin_epilogue() {
+      clear(tC_rCol);  // zero the staging copy first; copy_if below fills the entries whose predicate is true
+      Tensor pred = make_tensor<bool>(shape(tC_gCol));
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < size(pred); ++i) {
+        pred(i) = get<0>(tC_cCol(i)) < m;
+      }
+      copy_if(pred, tC_gCol, tC_rCol);
+    }
+    // Returns a fragment filled with the staged value at (row_idx, iter_idx); column and fragment indices are not used.
+    template <class ElementAccumulator, int FragmentSize>
+    CUTLASS_DEVICE auto // returns an Array
+    visit(int iter_idx, int row_idx, int column_idx, int frg_idx,
+          Array<ElementAccumulator, FragmentSize> const& frg_acc) {
+      Array<Element, FragmentSize> frg_col;
+      frg_col.fill(tC_rCol(row_idx,iter_idx));
+      return frg_col;
+    }
+  };
+  // Partitions the column vector and its coordinate tensor for this thread and wraps them in Callbacks.
+  template <class ProblemShape>
+  CUTLASS_DEVICE auto
+  get_callbacks(
+    gemm::GemmCoord threadblock_tile_offset,
+    int thread_idx,
+    ProblemShape problem_shape
+  ) {
+    Tensor mCol = make_tensor(
+      make_gmem_ptr(params_ptr->ptr_col),
+      problem_shape,
+      params_ptr->dCol);
+
+    // VECTOR and FRAGMENT_COLUMN are fixed at 0; kept: FRAGMENT_ROW, (ITERATION_ROW, ITERATION_GROUP, ITERATION_CLUSTER)
+    Tensor tC_gCol = group_modes<1,4>(
+      ThreadMap::partition(mCol, thread_idx, threadblock_tile_offset)(_0{},_0{},_,_,_,_));
+    Tensor tC_rCol = make_tensor_like(tC_gCol);
+
+    // Generate the pred tensor: identity coordinates sliced and grouped exactly like tC_gCol
+    Tensor cCol = make_identity_tensor(mCol.shape());
+    Tensor tC_cCol = group_modes<1,4>(
+      ThreadMap::partition(cCol, thread_idx, threadblock_tile_offset)(_0{},_0{},_,_,_,_));
+
+    return Callbacks<
+      decltype(tC_gCol), decltype(tC_rCol),
+      decltype(tC_cCol), ProblemShape>(
+      cute::move(tC_gCol),
+      cute::move(tC_rCol),
+      cute::move(tC_cCol),
+      problem_shape,
+      params_ptr
+    );
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::epilogue::threadblock
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitor_store.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitor_store.hpp
new file mode 100755
index 000000000..1c24e22d5
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitor_store.hpp
@@ -0,0 +1,805 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+  \brief Visitor tree store operations for the CUTLASS 2x epilogue
+*/
+
+#pragma once
+
+#include "cutlass/epilogue/threadblock/fusion/visitor_2x.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::epilogue::threadblock {
+
+using namespace cute;
+using namespace detail;
+using X = Underscore;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Elementwise Store Operations
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<
+  class ThreadMap,
+  class Element,
+  FloatRoundStyle RoundStyle,
+  class StrideMNL
+>
+struct VisitorAuxStore{
+  // Writes the incoming fragments to an auxiliary tensor in global memory and returns them unchanged.
+  struct Arguments {
+    Element* ptr_aux = nullptr;  // base pointer of the auxiliary tensor (wrapped with make_gmem_ptr below)
+    StrideMNL dAux = {};         // stride paired with the problem shape to build the (M,N,L) tensor
+  };
+
+  using Params = Arguments;
+
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    return args;  // Params is Arguments, so the arguments pass through unchanged
+  }
+
+  template <class ProblemShape>
+  static size_t
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
+    return 0;  // no workspace is requested
+  }
+
+  struct SharedStorage {};
+  // Global store type: an unsigned integer of min(128, vec_bits) bits, i.e. VecLength elements per access
+  static int constexpr vec_bits = ThreadMap::kElementsPerAccess * sizeof_bits<Element>::value;
+  using VecType = uint_bit_t<cute::min(128, vec_bits)>;
+  static int constexpr VecLength = sizeof(VecType) / sizeof(Element);
+
+  CUTLASS_HOST_DEVICE
+  VisitorAuxStore() { }
+
+  CUTLASS_HOST_DEVICE
+  VisitorAuxStore(Params const& params, SharedStorage const& shared_storage)
+    : params_ptr(&params) { }
+
+  Params const* params_ptr;
+  // Per-thread state: the global destination (g), the staging copy (r) and the matching coordinates (c).
+  template <class GTensor, class RTensor, class CTensor, class ProblemShape>
+  struct Callbacks : EmptyCallbacks {
+    CUTLASS_DEVICE
+    Callbacks(
+      GTensor&& tC_gAux,
+      RTensor&& tC_rAux,
+      CTensor&& tC_cAux,
+      ProblemShape problem_shape,
+      Params const* params_ptr
+    ):  // initializers are listed in member declaration order (params_ptr before problem_shape)
+      tC_gAux(cute::forward<GTensor>(tC_gAux)),
+      tC_rAux(cute::forward<RTensor>(tC_rAux)),
+      tC_cAux(cute::forward<CTensor>(tC_cAux)),
+      params_ptr(params_ptr),
+      problem_shape(problem_shape) { }
+
+    GTensor tC_gAux;
+    RTensor tC_rAux;
+    CTensor tC_cAux;
+    Params const* params_ptr;
+    ProblemShape problem_shape;
+    // Resets the staging copy at the start of every step; `step_idx` itself is not used here.
+    CUTLASS_DEVICE void
+    begin_step(int step_idx) {
+      clear(tC_rAux);
+    }
+    // Stages a converted copy of `frg_input` at position `frg_idx` and returns the unconverted input.
+    template <class ElementAccumulator, class ElementInput, int FragmentSize>
+    CUTLASS_DEVICE auto // returns an Array
+    visit(int iter_idx, int row_idx, int column_idx, int frg_idx,
+          Array<ElementAccumulator, FragmentSize> const& frg_acc,
+          Array<ElementInput, FragmentSize> const& frg_input) {
+      using ConvertInput = NumericArrayConverter<Element, ElementInput, FragmentSize, RoundStyle>;
+      ConvertInput convert_input{};
+      // View the staging copy as fragments of FragmentSize elements so frg_idx addresses one fragment.
+      Tensor tC_rAux_frg = recast<Array<Element, FragmentSize>>(coalesce(tC_rAux));
+      tC_rAux_frg(frg_idx) = convert_input(frg_input);
+
+      return frg_input;
+    }
+    // Stores the staged vectors of step `step_idx`; each store is guarded by a bounds check against problem_shape.
+    CUTLASS_DEVICE void
+    end_step(int step_idx) {
+      auto src_v = filter(tC_rAux);
+      auto coord_v = filter(tC_cAux(_,_,_,step_idx));
+      auto dst_v = filter(tC_gAux(_,_,_,step_idx));
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < size(src_v); ++i) {
+        bool guard = elem_less(coord_v(i), problem_shape);  // true only when the coordinate lies inside problem_shape
+        cutlass::arch::global_store<VecType, sizeof(VecType)>(src_v(i), (void*)&dst_v(i), guard);
+      }
+    }
+
+  };
+  // Partitions the auxiliary tensor and its coordinate tensor for this thread and wraps them in Callbacks.
+  template <class ProblemShape>
+  CUTLASS_DEVICE auto
+  get_callbacks(
+    gemm::GemmCoord threadblock_tile_offset,
+    int thread_idx,
+    ProblemShape problem_shape
+  ) {
+    Tensor mAux = make_tensor(
+      make_gmem_ptr(params_ptr->ptr_aux),
+      problem_shape,
+      params_ptr->dAux);   // (M,N,L)
+    // VECTOR, FRAGMENT_COLUMN, FRAGMENT_ROW, (ITERATION_ROW, ITERATION_GROUP, ITERATION_CLUSTER) grouped into one mode
+    Tensor tC_gAux = recast<VecType>(group_modes<3,6>(ThreadMap::partition(mAux, thread_idx, threadblock_tile_offset)));
+    Tensor tC_rAux = make_tensor_like(take<0,3>(tC_gAux));
+
+    // Generate the pred tensor: identity coordinates partitioned like mAux, one coordinate per VecLength elements
+    Tensor cAux = make_identity_tensor(mAux.shape());
+    Tensor tC_cAux = outer_partition(
+      group_modes<3,6>(ThreadMap::partition(cAux, thread_idx, threadblock_tile_offset)),
+      Shape<Int<VecLength>>{},
+      (_0{})
+    );
+
+    return Callbacks<
+      decltype(tC_gAux), decltype(tC_rAux),
+      decltype(tC_cAux), ProblemShape>(
+      cute::move(tC_gAux),
+      cute::move(tC_rAux),
+      cute::move(tC_cAux),
+      problem_shape,
+      params_ptr
+    );
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Reduction Store Operations
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Helper functions
+template <
+  template <class> class ReduceFn,
+  int kThreads, class T>
+CUTLASS_DEVICE
+void intra_warp_row_reduce(T& value) {
+  // XOR-shuffle reduction: combine with the lane at distance kThreads/2, then kThreads/4, ..., then 1.
+  ReduceFn<T> combine{};
+  CUTLASS_PRAGMA_UNROLL
+  for (int lane_mask = kThreads >> 1; lane_mask > 0; lane_mask >>= 1) {
+    auto const partner_value = __shfl_xor_sync(0xFFFFFFFF, value, lane_mask);
+    value = combine(value, partner_value);
+  }
+}
+
+template <
+  template <class> class ReduceFn,
+  FloatRoundStyle RoundStyle,
+  class ElementCompute,
+  class ElementFragment, int FragmentSize>
+CUTLASS_DEVICE
+void fragment_reduce(ElementCompute& value, Array<ElementFragment, FragmentSize> const& frg) {
+  // Folds every element of `frg` into `value`, converting each one to ElementCompute first.
+  ReduceFn<ElementCompute> fold{};
+  NumericConverter<ElementCompute, ElementFragment, RoundStyle> to_compute{};
+
+  CUTLASS_PRAGMA_UNROLL
+  for (int idx = 0; idx < FragmentSize; ++idx) {
+    ElementCompute const element = to_compute(frg[idx]);
+    value = fold(value, element);
+  }
+}
+
+template<
+  template <class> class AtomicReduceFn,
+  FloatRoundStyle RoundStyle,
+  class ElementCompute,
+  class ElementOutput>
+CUTLASS_DEVICE
+void atomic_reduce(ElementOutput* ptr, ElementCompute const& value) {
+  // Converts `value` to ElementOutput and hands it to AtomicReduceFn together with `ptr`.
+  NumericConverter<ElementOutput, ElementCompute, RoundStyle> to_output{};
+  ElementOutput const converted = to_output(value);
+
+  AtomicReduceFn<ElementOutput> reduce_into{};
+  reduce_into(ptr, converted);
+}
+
+// Col vector reduction: folds the fragments of each row into one value and atomically reduces it into ptr_col
+template <
+  template <class> class RegReduceFn,
+  template <class> class AtomicReduceFn,
+  class ThreadMap,
+  class ElementOutput,
+  class ElementCompute,
+  FloatRoundStyle RoundStyle,
+  class StrideMNL = Stride<_1,_0,_0>
+>
+struct VisitorColReduction {
+
+  struct Arguments {
+    ElementOutput* ptr_col = nullptr; // base pointer of the reduction output (wrapped by make_gmem_ptr)
+    ElementCompute reduction_identity = 0; // value the accumulator is reset to at the start of every row
+    StrideMNL dCol = {}; // stride used to view ptr_col over problem_shape
+  };
+
+  using Params = Arguments;
+
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    return args; // Params aliases Arguments, so the arguments pass through unchanged
+  }
+
+  template <class ProblemShape>
+  static size_t
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
+    return 0; // no workspace is requested
+  }
+
+  struct SharedStorage { }; // empty: this visitor keeps no shared-memory state
+
+  CUTLASS_HOST_DEVICE
+  VisitorColReduction() { }
+
+  CUTLASS_HOST_DEVICE
+  VisitorColReduction(Params const& params, SharedStorage const& shared_storage)
+    : params_ptr(&params) { } // keeps the address of params, not a copy; shared_storage is unused
+
+  Params const* params_ptr;
+
+  template <class GTensor, class CTensor, class ProblemShape>
+  struct Callbacks : EmptyCallbacks {
+    CUTLASS_DEVICE
+    Callbacks(
+      GTensor&& tC_gCol,
+      CTensor&& tC_cCol,
+      ProblemShape problem_shape,
+      Params const* params_ptr,
+      int thread_idx
+    ):
+      tC_gCol(cute::forward<GTensor>(tC_gCol)),
+      tC_cCol(cute::forward<CTensor>(tC_cCol)),
+      m(get<0>(problem_shape)),
+      n(get<1>(problem_shape)),
+      params_ptr(params_ptr) {
+        // visit() runs an XOR-shuffle reduction across the kAccessWidth threads
+        // that share a row, so each of them ends up with the row's partial result.
+        // Only the first thread of each such group performs the atomic write.
+        is_writing_thread = thread_idx % ThreadMap::Detail::kAccessWidth == 0;
+      }
+
+    GTensor tC_gCol; // this thread's view of the output, indexed (row_idx, iter_idx) in end_row()
+    CTensor tC_cCol; // coordinate tensor used for the bounds checks against m and n
+    Params const* params_ptr;
+    int m; // get<0>(problem_shape)
+    int n; // get<1>(problem_shape)
+    int curr_iter_idx; // iter_idx of the most recent visit(); read by end_row()
+    bool is_writing_thread;
+
+    ElementCompute reduction_accum; // running reduction of the current row
+
+    CUTLASS_DEVICE void
+    begin_row(int row_idx) {
+      reduction_accum = ElementCompute(params_ptr->reduction_identity); // restart the accumulator for this row
+    }
+
+    template <class ElementAccumulator, class ElementInput, int FragmentSize>
+    CUTLASS_DEVICE auto // returns an Array
+    visit(int iter_idx, int row_idx, int column_idx, int frg_idx,
+          Array<ElementAccumulator, FragmentSize> const& frg_acc,
+          Array<ElementInput, FragmentSize> const& frg_input) {
+
+      curr_iter_idx = iter_idx; // remembered because end_row() is not given iter_idx
+
+      int coord_n = get<1>(tC_cCol(column_idx, row_idx, iter_idx));
+      if (coord_n < n) { // fragments whose N coordinate is out of range do not contribute
+        fragment_reduce<RegReduceFn, RoundStyle>(reduction_accum, frg_input);
+      }
+
+      // Intra-warp reduction, performed once the last column fragment of the row has been folded in
+      if (column_idx + 1 == ThreadMap::Iterations::kColumn) {
+        intra_warp_row_reduce<RegReduceFn, ThreadMap::Detail::kAccessWidth>(reduction_accum);
+      }
+
+      return frg_input; // the input fragment is passed through unmodified
+    }
+
+    CUTLASS_DEVICE auto
+    end_row(int row_idx) {
+      bool guard = get<0>(tC_cCol(_0{}, row_idx,curr_iter_idx)) < m; // row coordinate lies inside the problem
+
+      if (guard && is_writing_thread) {
+        atomic_reduce<AtomicReduceFn, RoundStyle>(&tC_gCol(row_idx,curr_iter_idx), reduction_accum);
+      }
+    }
+  };
+
+  template <class ProblemShape>
+  CUTLASS_DEVICE auto
+  get_callbacks(
+    gemm::GemmCoord threadblock_tile_offset,
+    int thread_idx,
+    ProblemShape problem_shape
+  ) {
+
+    Tensor mCol = make_tensor(
+      make_gmem_ptr(params_ptr->ptr_col),
+      problem_shape,
+      params_ptr->dCol);
+    // FRAGMENT_ROW, (ITERATION_ROW, ITERATION_GROUP, ITERATION_CLUSTER)
+    Tensor tC_gCol = group_modes<1,4>(
+      ThreadMap::partition(mCol, thread_idx, threadblock_tile_offset)(_0{},_0{},_,_,_,_));
+
+    // Generate the pred tensor: an identity (coordinate) tensor partitioned the same way as mCol
+    Tensor cCol = make_identity_tensor(mCol.shape());
+    // FRAGMENT_COL, FRAGMENT_ROW, (ITERATION_ROW, ITERATION_GROUP, ITERATION_CLUSTER)
+    Tensor tC_cCol = group_modes<2,5>(
+      ThreadMap::partition(cCol, thread_idx, threadblock_tile_offset)(_0{},_,_,_,_,_));
+
+    return Callbacks<
+      decltype(tC_gCol), decltype(tC_cCol),
+      ProblemShape>(
+      cute::move(tC_gCol),
+      cute::move(tC_cCol),
+      problem_shape,
+      params_ptr,
+      thread_idx
+    );
+  }
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Row vector reduction: per-thread register partials -> shared memory -> reduction over rows -> atomic update of ptr_row
+template <
+  template <class> class RegReduceFn,
+  template <class> class AtomicReduceFn,
+  class ThreadMap,
+  class ElementOutput,
+  class ElementCompute,
+  FloatRoundStyle RoundStyle,
+  class StrideMNL = Stride<_0,_1,_0>
+>
+struct VisitorRowReduction {
+
+  struct Arguments {
+    ElementOutput* ptr_row = nullptr; // base pointer of the reduction output (wrapped by make_gmem_ptr)
+    ElementCompute reduction_identity = 0; // value the register partials are filled with before reducing
+    StrideMNL dRow = {}; // stride used to view ptr_row over problem_shape
+  };
+
+  using Params = Arguments;
+
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    return args; // Params aliases Arguments, so the arguments pass through unchanged
+  }
+
+  template <class ProblemShape>
+  static size_t
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
+    return 0; // no workspace is requested
+  }
+
+  using SharedStorageShape = decltype(select<0,1,2,3,5,8,10>(typename ThreadMap::ThreadMapShape{}));
+
+  struct SharedStorage {
+    AlignedArray<ElementCompute, size(SharedStorageShape{}), 16> reduction; // staging buffer for the partials
+  };
+
+  static int constexpr vec_bits = ThreadMap::kElementsPerAccess * sizeof_bits<ElementOutput>::value;
+  using VecType = uint_bit_t<cute::min(128, vec_bits)>; // vector type used by copy_vec, capped at 128 bits
+
+  CUTLASS_HOST_DEVICE
+  VisitorRowReduction() { }
+
+  CUTLASS_HOST_DEVICE
+  VisitorRowReduction(Params const& params, SharedStorage const& shared_storage)
+    : params_ptr(&params),
+      smem_reduce(const_cast<ElementCompute*>(shared_storage.reduction.data())) { }
+
+  Params const* params_ptr; // address of the caller's Params (not a copy)
+  ElementCompute* smem_reduce; // writable pointer into shared_storage.reduction
+
+  template <
+    class RTensorR2S, class STensorR2S, class CTensorR2S,
+    class STensorS2R, class RTensorS2R, class CTensorS2R,
+    class GTensor, class CTensor, class ProblemShape>
+  struct Callbacks : EmptyCallbacks {
+    CUTLASS_DEVICE
+    Callbacks(
+      // R->S
+      RTensorR2S&& tRS_rSrc,
+      STensorR2S&& tRS_sRows,
+      CTensorR2S&& tRS_cSrc,
+      // S->R
+      STensorS2R&& tSR_sRows,
+      RTensorS2R&& tSR_rRows,
+      CTensorS2R&& tSR_cRows,
+      // R->G
+      GTensor&& tC_gRow,
+      CTensor&& tC_cRow,
+      ProblemShape problem_shape,
+      Params const* params_ptr
+    ):
+      // R->S
+      tRS_rSrc(cute::forward<RTensorR2S>(tRS_rSrc)),
+      tRS_sRows(cute::forward<STensorR2S>(tRS_sRows)),
+      tRS_cSrc(cute::forward<CTensorR2S>(tRS_cSrc)),
+      // S->R
+      tSR_sRows(cute::forward<STensorS2R>(tSR_sRows)),
+      tSR_rRows(cute::forward<RTensorS2R>(tSR_rRows)),
+      tSR_cRows(cute::forward<CTensorS2R>(tSR_cRows)),
+      // R->G
+      tC_gRow(cute::forward<GTensor>(tC_gRow)),
+      tC_cRow(cute::forward<CTensor>(tC_cRow)),
+      m(get<0>(problem_shape)),
+      n(get<1>(problem_shape)),
+      params_ptr(params_ptr) { }
+
+    // R->S
+    RTensorR2S tRS_rSrc;
+    STensorR2S tRS_sRows;
+    CTensorR2S tRS_cSrc;
+    // S->R
+    STensorS2R tSR_sRows;
+    RTensorS2R tSR_rRows;
+    CTensorS2R tSR_cRows;
+    // R->G
+    GTensor tC_gRow;
+    CTensor tC_cRow;
+
+    Params const* params_ptr;
+    int n; // get<1>(problem_shape)
+    int m; // get<0>(problem_shape)
+
+    CUTLASS_DEVICE void
+    begin_epilogue() {
+      fill(tRS_rSrc, params_ptr->reduction_identity); // reset the per-thread register partials
+    }
+
+    template <class ElementAccumulator, class ElementInput, int FragmentSize>
+    CUTLASS_DEVICE auto // returns an Array
+    visit(int iter_idx, int row_idx, int column_idx, int frg_idx,
+          Array<ElementAccumulator, FragmentSize> const& frg_acc,
+          Array<ElementInput, FragmentSize> const& frg_input) {
+
+      using ConvertInput = NumericArrayConverter<ElementCompute, ElementInput, FragmentSize, RoundStyle>;
+      ConvertInput convert_input{};
+      Tensor tRS_rRow_frg = recast<Array<ElementCompute, FragmentSize>>(coalesce(tRS_rSrc));
+
+      int coord_m = get<0>(tRS_cSrc(column_idx,row_idx,iter_idx));
+      if (coord_m < m) // fragments whose M coordinate is out of range do not contribute
+        reduction(tRS_rRow_frg[column_idx], convert_input(frg_input));
+
+      return frg_input; // the input fragment is passed through unmodified
+    }
+
+    CUTLASS_DEVICE void
+    end_epilogue() {
+      //
+      // Store the partially reduced value to SMEM
+      //
+
+      // Guard against uses of the existing SMEM tile
+      __syncthreads();
+
+      // copy_vec transfers the whole (filtered) register tensor in a single call,
+      // so it is issued exactly once; wrapping it in a per-element loop would only
+      // repeat the same full-tensor copy.
+      copy_vec<VecType>(filter(tRS_rSrc), filter(tRS_sRows));
+
+      __syncthreads();
+
+      //
+      // Now, threads are assigned several columns of the output. They fetch over all rows from
+      // the compacted SMEM tile and perform a reduction.
+      //
+
+      fill(tSR_rRows, params_ptr->reduction_identity);
+
+      using ReduceInputReg = RegReduceFn<ElementCompute>;
+      ReduceInputReg reduce_input_reg{};
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < size(tSR_rRows); ++j) {
+        if (get<0>(tSR_cRows(j)) < get<1>(typename ThreadMap::CtaShapeMNL{}) && get<1>(tC_cRow(j)) < n) {
+          CUTLASS_PRAGMA_UNROLL
+          for (int i = 0; i < size(tSR_sRows) / size(tSR_rRows); ++i) {
+            tSR_rRows(j) = reduce_input_reg(tSR_rRows(j), tSR_sRows(i + j * size(tSR_sRows) / size(tSR_rRows)));
+          }
+          atomic_reduce<AtomicReduceFn, RoundStyle>(&tC_gRow(j), tSR_rRows(j));
+        }
+
+      }
+    }
+
+  private:
+    // Element-wise in-place reduce: reduce_buffer[i] = RegReduceFn(reduce_buffer[i], result[i]).
+    template <int FragmentSize>
+    CUTLASS_DEVICE void
+    reduction(Array<ElementCompute, FragmentSize>& reduce_buffer, Array<ElementCompute, FragmentSize> const& result) {
+      using ReduceInput = RegReduceFn<ElementCompute>;
+      ReduceInput reduce_input{};
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < FragmentSize; ++i) {
+        reduce_buffer[i] = reduce_input(reduce_buffer[i], result[i]);
+      }
+    }
+  };
+
+  template <class ProblemShape>
+  CUTLASS_DEVICE auto
+  get_callbacks(
+    gemm::GemmCoord threadblock_tile_offset,
+    int thread_idx,
+    ProblemShape problem_shape
+  ) {
+    Tensor mRow = make_tensor(
+      make_gmem_ptr(params_ptr->ptr_row),
+      problem_shape,
+      params_ptr->dRow);
+
+    //
+    // Step 1: reduce fragment input (Src) into tRS_rSrc
+    //
+
+    // VECTOR,FRAGMENT_COL
+    Tensor tRS_rSrc = make_tensor<ElementCompute>(select<0,2>(typename ThreadMap::ThreadMapShape{}));
+
+    Tensor cSrc = make_identity_tensor(mRow.shape());
+    // FRAGMENT_COLUMN, FRAGMENT_ROW, (ITERATION_ROW, ITERATION_GROUP, ITERATION_CLUSTER)
+    Tensor tRS_cSrc = group_modes<2,5>(ThreadMap::partition(cSrc, thread_idx, threadblock_tile_offset)(_0{},_,_,_,_,_));
+
+    //
+    // Step 2: copy the partial results in tRS_rSrc to sRows in shared memory
+    //
+
+    // VECTOR,ACCESS_WIDTH,FRAGMENT_COL,ACCESS_ROWS,WARPS_PER_ROW,GROUPS,CLUSTERS
+    Tensor sRows = make_tensor(
+      make_smem_ptr(smem_reduce), SharedStorageShape{}
+    );
+
+    auto [lane_col_coord, lane_row_coord, warp_row_coord, group_coord, cluster_coord] = ThreadMap::tid2coord(thread_idx);
+    Tensor tRS_sRows = sRows(_,lane_col_coord,_,lane_row_coord,warp_row_coord,group_coord,cluster_coord);
+
+    //
+    // Step 3: copy the partial results in sRows to tSR_sRow for reduction
+    //
+
+    // VECTOR*ACCESS_WIDTH*FRAGMENT_COL,ACCESS_ROWS*WARPS_PER_ROW*GROUPS*CLUSTERS
+    Tensor sRows_nm = coalesce(group_modes<1,5>(group_modes<0,3>(sRows)), Shape<_1,_1>{});
+    // SMEM_ROW/THREADS,ACCESS_ROWS*WARPS_PER_ROW*GROUPS*CLUSTERS
+    Tensor tSR_sRows = outer_partition(sRows_nm, Shape<Int<ThreadMap::kThreads>,_1>{}, thread_idx);
+    // SMEM_ROW/THREADS
+    Tensor tSR_rRows = make_tensor_like(tSR_sRows(_,_0{}));
+    // Coord
+    Tensor cRows_nm = make_identity_tensor(sRows_nm.shape());
+    Tensor tSR_cRows = outer_partition(cRows_nm, Shape<Int<ThreadMap::kThreads>,_1>{}, thread_idx)(_,_0{});
+
+    //
+    // Step 4: atomically reduce the results to global memory
+    //
+
+    Tensor tC_gRow = outer_partition(
+      // Cta tile
+      local_tile(
+        mRow, typename ThreadMap::CtaShapeMNL{}, make_coord(_,_,_),Step<_1,_1, X>{}
+      )(_,_,threadblock_tile_offset.m(),threadblock_tile_offset.n(),threadblock_tile_offset.k()),
+      // Partition to threads
+      Shape<_1,Int<ThreadMap::kThreads>>{}, thread_idx
+    )(_0{},_);
+
+    Tensor cRow = make_identity_tensor(mRow.shape());
+    Tensor tC_cRow = outer_partition(
+      // Cta tile
+      local_tile(
+        cRow, typename ThreadMap::CtaShapeMNL{}, make_coord(_,_,_), Step<_1,_1, X>{}
+      )(_,_,threadblock_tile_offset.m(),threadblock_tile_offset.n(),threadblock_tile_offset.k()),
+      // Partition to threads
+      Shape<_1,Int<ThreadMap::kThreads>>{}, thread_idx
+    )(_0{},_);
+
+    return Callbacks<
+      decltype(tRS_rSrc), decltype(tRS_sRows),
+      decltype(tRS_cSrc), decltype(tSR_sRows),
+      decltype(tSR_rRows), decltype(tSR_cRows),
+      decltype(tC_gRow), decltype(tC_cRow),
+      ProblemShape>(
+      // R->S
+      cute::move(tRS_rSrc),
+      cute::move(tRS_sRows),
+      cute::move(tRS_cSrc),
+      // S->R
+      cute::move(tSR_sRows),
+      cute::move(tSR_rRows),
+      cute::move(tSR_cRows),
+      // R->G
+      cute::move(tC_gRow),
+      cute::move(tC_cRow),
+      problem_shape,
+      params_ptr
+    );
+  }
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Scalar reduction: folds all in-bounds fragments into one value per thread, reduces across the warp, then atomically updates ptr_scalar
+template <
+  template <class> class RegReduceFn,
+  template <class> class AtomicReduceFn,
+  class ThreadMap,
+  class ElementOutput,
+  class ElementCompute,
+  FloatRoundStyle RoundStyle,
+  class StrideMNL = Stride<_0,_0,_0>
+>
+struct VisitorScalarReduction {
+  static_assert(
+    (cute::is_same_v<StrideMNL, Stride<_0,_0, _0>>) || // scalar reduction, e.g. tensor max element
+    (cute::is_same_v<StrideMNL, Stride<_0,_0, _1>>) || // batched scalar reduction, e.g. per-batch max element
+    (cute::is_same_v<StrideMNL, Stride<_0,_0,int>>));
+
+  struct Arguments {
+    ElementOutput* ptr_scalar = nullptr; // base pointer of the reduction output (wrapped by make_gmem_ptr)
+    ElementCompute reduction_identity = 0; // value the accumulator starts from in begin_epilogue()
+    StrideMNL dScalar = {}; // stride used to view ptr_scalar over problem_shape
+  };
+
+  using Params = Arguments;
+
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    return args; // Params aliases Arguments, so the arguments pass through unchanged
+  }
+
+  template <class ProblemShape>
+  static size_t
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
+    return 0; // no workspace is requested
+  }
+
+  struct SharedStorage { }; // empty: this visitor keeps no shared-memory state
+
+  CUTLASS_HOST_DEVICE
+  VisitorScalarReduction() { }
+
+  CUTLASS_HOST_DEVICE
+  VisitorScalarReduction(Params const& params, SharedStorage const& shared_storage)
+    : params_ptr(&params) { } // keeps the address of params, not a copy; shared_storage is unused
+
+  Params const* params_ptr;
+
+  template <class CTensor, class GTensor, class ProblemShape>
+  struct Callbacks : EmptyCallbacks {
+    CUTLASS_DEVICE
+    Callbacks(
+      CTensor&& tC_cSrc,
+      GTensor&& tC_gScalar,
+      ProblemShape problem_shape,
+      Params const* params_ptr,
+      int thread_idx
+    ):
+      tC_cSrc(cute::forward<CTensor>(tC_cSrc)),
+      tC_gScalar(cute::forward<GTensor>(tC_gScalar)),
+      problem_shape(problem_shape),
+      params_ptr(params_ptr) {
+        // The partial reduction results of each warp are further
+        // reduced to this first thread.
+        // Only the first thread of each warp is the writing thread
+        is_writing_thread = thread_idx % ThreadMap::kWarpSize == 0;
+      }
+
+      GTensor tC_gScalar; // view of the output scalar; element (0,0) is the atomic destination
+      CTensor tC_cSrc; // coordinate tensor used for the bounds check against problem_shape
+      Params const* params_ptr;
+      ProblemShape problem_shape;
+      bool is_writing_thread; // true for the lane that issues the atomic in end_epilogue()
+
+      ElementCompute reduction_accum; // this thread's running reduction
+
+      CUTLASS_DEVICE void
+      begin_epilogue() {
+        reduction_accum = ElementCompute(params_ptr->reduction_identity);
+      }
+
+      template <class ElementAccumulator, class ElementInput, int FragmentSize>
+      CUTLASS_DEVICE auto
+      visit(int iter_idx, int row_idx, int column_idx, int frg_idx,
+            Array<ElementAccumulator, FragmentSize> const& frg_acc,
+            Array<ElementInput, FragmentSize> const& frg_input) {
+
+        auto coord = tC_cSrc(column_idx, row_idx, iter_idx);
+        if (elem_less(coord, problem_shape)) { // out-of-bounds fragments do not contribute
+          fragment_reduce<RegReduceFn, RoundStyle>(reduction_accum, frg_input);
+        }
+
+        return frg_input; // the input fragment is passed through unmodified
+      }
+
+      CUTLASS_DEVICE auto
+      end_epilogue() {
+        // Intra-warp reduction; the XOR shuffle leaves the warp-wide result in every lane
+        intra_warp_row_reduce<RegReduceFn, ThreadMap::kWarpSize>(reduction_accum);
+        if (is_writing_thread) { // one atomic per warp: more lanes would re-apply the same value
+          atomic_reduce<AtomicReduceFn, RoundStyle>(&tC_gScalar(_0{},_0{}), reduction_accum);
+        }
+      }
+  };
+
+  template <class ProblemShape>
+  CUTLASS_DEVICE auto
+  get_callbacks(
+    gemm::GemmCoord threadblock_tile_offset,
+    int thread_idx,
+    ProblemShape problem_shape
+  ) {
+    Tensor cSrc = make_identity_tensor(problem_shape);
+    // FRAGMENT_COL, FRAGMENT_ROW, (ITERATION_ROW, ITERATION_GROUP, ITERATION_CLUSTER)
+    Tensor tC_cSrc = group_modes<2,5>(
+      ThreadMap::partition(cSrc, thread_idx, threadblock_tile_offset)(_0{},_,_,_,_,_)
+    );
+
+    Tensor mScalar = make_tensor(
+      make_gmem_ptr(params_ptr->ptr_scalar),
+      problem_shape,
+      params_ptr->dScalar
+    );
+
+    Tensor tC_gScalar = mScalar(_,_,threadblock_tile_offset.k()); // slice selected by the tile offset's k() index
+
+    return Callbacks<
+      decltype(tC_cSrc), decltype(tC_gScalar),
+      ProblemShape>(
+      cute::move(tC_cSrc),
+      cute::move(tC_gScalar),
+      problem_shape,
+      params_ptr,
+      thread_idx
+    );
+  }
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::epilogue::threadblock
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitors.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitors.hpp
new file mode 100755
index 000000000..96fbc01d7
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitors.hpp
@@ -0,0 +1,38 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+  \brief Higher-level header file that includes all the CUTLASS 2x visitors
+*/
+#include "cutlass/epilogue/threadblock/fusion/visitor_2x.hpp"
+#include "cutlass/epilogue/threadblock/fusion/visitor_load.hpp"
+#include "cutlass/epilogue/threadblock/fusion/visitor_store.hpp"
+#include "cutlass/epilogue/threadblock/fusion/visitor_compute.hpp"
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/interleaved_epilogue.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/interleaved_epilogue.h
new file mode 100755
index 000000000..305f5d783
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/interleaved_epilogue.h
@@ -0,0 +1,407 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
+
+  The epilogue rearranges the result of a matrix product through shared memory to match canonical
+  tensor layouts in global memory. Epilogues support conversion and reduction operations.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/layout/vector.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/tensor_coord.h"
+#include "cutlass/aligned_buffer.h"
+
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/transform/pitch_linear_thread_map.h"
+#include "cutlass/transform/threadblock/regular_tile_iterator.h"
+
+#include "cutlass/epilogue/threadblock/epilogue_base_streamk.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Epilogue operator without splitk
+template <
+    /// Shape of threadblock tile (concept: GemmShape)
+    typename Shape_,
+    /// Warp-level MMA operator (concept: gemm::warp::MmaTensorOp)
+    typename WarpMmaOperator_,
+    /// Number of partitions of the K dimension
+    int PartitionsK,
+    /// Tile iterator reading and writing output tensors
+    typename OutputTileIterator_,
+    /// Fragment iterator selecting accumulators
+    typename AccumulatorFragmentIterator_,
+    /// Output operator
+    typename OutputOp_,
+    /// Number of interleaved k (not referenced inside this class body)
+    int InterleavedK>
+class InterleavedEpilogue :
+  public EpilogueBaseStreamK<
+    Shape_,
+    PartitionsK,
+    WarpMmaOperator_,
+    AccumulatorFragmentIterator_>
+{
+public:
+
+  using BaseStreamK = EpilogueBaseStreamK<
+    Shape_,
+    PartitionsK,
+    WarpMmaOperator_,
+    AccumulatorFragmentIterator_>;
+
+  using Shape = Shape_;
+  using WarpMmaOperator = WarpMmaOperator_;
+  static int const kPartitionsK = PartitionsK;
+  using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
+  using OutputTileIterator = OutputTileIterator_;
+  using OutputOp = OutputOp_;
+
+  /// The complete warp-level accumulator tile
+  using AccumulatorTile = typename AccumulatorFragmentIterator::AccumulatorTile;
+
+  /// Fragment type used by the accumulator tile's fragment iterator
+  using AccumulatorFragment = typename AccumulatorFragmentIterator::Fragment;
+
+  /// Accumulator element
+  using ElementAccumulator = typename AccumulatorTile::Element;
+
+  /// Output element
+  using ElementOutput = typename OutputTileIterator::Element;
+
+  /// Output access size
+  static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
+
+  /// Tensor reference to destination tensor
+  using TensorRef = typename OutputTileIterator::TensorRef;
+
+  /// Tensor reference to sync tensor
+  using SyncTensorRef =
+      typename cutlass::TensorRef<int, cutlass::layout::PackedVectorLayout>;
+
+  /// Const tensor reference to source tensor
+  using ConstTensorRef = typename OutputTileIterator::ConstTensorRef;
+
+  /// Array type used to output
+  using OutputAccessType = Array<typename OutputTileIterator::Element,
+                                 OutputTileIterator::kElementsPerAccess>;
+
+  /// Array type used by output functor
+  using AccumulatorAccessType =
+      Array<ElementAccumulator, OutputTileIterator::kElementsPerAccess>;
+
+  /// Number of warps
+  using WarpCount =
+      gemm::GemmShape<Shape::kM / WarpMmaOperator::Shape::kM,
+                      Shape::kN / WarpMmaOperator::Shape::kN, kPartitionsK>;
+
+public:
+
+  static_assert(OutputTileIterator::kElementsPerAccess,
+                "This must not be zero.");
+
+  static_assert(!(OutputTileIterator::Fragment::kElements %
+                  OutputTileIterator::kElementsPerAccess),
+                "Divisibility");
+
+public:
+
+  /// Aspect for when epilogue source is not needed
+  struct SourceAspectNotNeeded
+  {
+    /// Constructor
+    CUTLASS_DEVICE
+    SourceAspectNotNeeded()
+    {}
+
+    /// Invoke the output functor over each vector of output
+    CUTLASS_DEVICE
+    void apply_output_operator(
+      typename OutputTileIterator::Fragment &output_fragment,
+      OutputOp const &output_op,
+      typename AccumulatorFragmentIterator::Fragment const &aligned_accum_fragment)
+    {
+      // Both fragments are walked as arrays of kElementsPerAccess-wide vectors
+      OutputAccessType *output_frag_ptr =
+        reinterpret_cast<OutputAccessType *>(&output_fragment);
+
+      AccumulatorAccessType const *compute_frag_ptr =
+        reinterpret_cast<AccumulatorAccessType const *>(&aligned_accum_fragment);
+
+      int const kOutputOpIterations =
+        OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kOutputOpIterations; ++i)
+      {
+        // Call the output operator (accumulator-only overload)
+        output_frag_ptr[i] = output_op(compute_frag_ptr[i]);
+      }
+    }
+  };
+
+
+  /// Aspect for when epilogue source is needed
+  struct SourceAspectNeeded
+  {
+    OutputTileIterator source_iterator;
+
+    typename OutputTileIterator::Fragment source_fragment;
+
+    /// Invoke the output functor over each vector of output
+    CUTLASS_DEVICE
+    static void apply_output_operator(
+      typename OutputTileIterator::Fragment &output_fragment,
+      OutputOp const &output_op,
+      typename AccumulatorFragmentIterator::Fragment const &aligned_accum_fragment,
+      typename OutputTileIterator::Fragment const &source_fragment)
+    {
+      // All three fragments are walked as arrays of kElementsPerAccess-wide vectors
+      OutputAccessType *output_frag_ptr =
+        reinterpret_cast<OutputAccessType *>(&output_fragment);
+
+      AccumulatorAccessType const *compute_frag_ptr =
+        reinterpret_cast<AccumulatorAccessType const *>(&aligned_accum_fragment);
+
+      OutputAccessType const *source_frag_ptr =
+        reinterpret_cast<OutputAccessType const *>(&source_fragment);
+
+      int const kOutputOpIterations =
+        OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kOutputOpIterations; ++i)
+      {
+        // Call the output operator (accumulator + source overload)
+        output_frag_ptr[i] = output_op(compute_frag_ptr[i], source_frag_ptr[i]);
+      }
+    }
+
+    /// Constructor
+    CUTLASS_DEVICE
+    SourceAspectNeeded(OutputTileIterator source_iterator) :
+      source_iterator(source_iterator)
+    {
+      source_fragment.clear();
+    }
+
+    /// Invoke the output functor over each vector of output
+    CUTLASS_DEVICE
+    void apply_output_operator(
+      typename OutputTileIterator::Fragment &output_fragment,
+      OutputOp const &output_op,
+      typename AccumulatorFragmentIterator::Fragment const &aligned_accum_fragment)
+    {
+      // Load addend source fragment from global memory, then advance to the next one
+      source_iterator.load(source_fragment);
+      ++source_iterator;
+
+      apply_output_operator(output_fragment, output_op, aligned_accum_fragment, source_fragment);
+    }
+  };
+
+
+  /// Shared storage allocation needed by the epilogue
+  struct SharedStorage {};
+
+
+public:
+
+  /// Constructor
+  CUTLASS_DEVICE
+  InterleavedEpilogue(
+      SharedStorage &shared_storage,  ///< Shared storage object
+      int thread_idx,                 ///< ID of a thread within the threadblock
+      int warp_idx,                   ///< ID of warp within threadblock
+      int lane_idx)                   ///< Id of thread within warp
+  :
+      BaseStreamK(thread_idx) // only thread_idx is used; the other arguments are ignored here
+  {}
+
+
+  /// Aggregates the accumulator sets shared by peer blocks in the global workspace,
+  /// performing epilogue computations, writing to output
+  CUTLASS_DEVICE
+  void reduce(
+      int peer_idx_begin,
+      int peer_idx_end,
+      int reduce_fragment_idx,
+      void *element_workspace,
+      OutputOp const &output_op,                      ///< Output operator
+      OutputTileIterator destination_iterator,        ///< Tile iterator for destination
+      OutputTileIterator source_iterator)             ///< Tile iterator for addend source
+  {
+    // Reduce peer accumulator fragments into one fragment
+    AccumulatorFragment accum_fragment;
+    BaseStreamK::reduce(accum_fragment, peer_idx_begin, peer_idx_end, reduce_fragment_idx, element_workspace);
+
+    // Source-fragment data (zero-initialized for scenarios where the
+    // output operator allows us to skip loading it from global input)
+    typename OutputTileIterator::Fragment source_fragment;
+    source_fragment.clear();
+
+    if (output_op.is_source_needed())
+    {
+      source_iterator += reduce_fragment_idx;
+      source_iterator.load(source_fragment);
+    }
+
+    // Compute the output result
+    typename OutputTileIterator::Fragment output_fragment;
+
+    // Apply the output operator
+    SourceAspectNeeded::apply_output_operator(output_fragment, output_op, accum_fragment, source_fragment);
+
+    // Store the final result
+    destination_iterator += reduce_fragment_idx;
+    destination_iterator.store(output_fragment);
+  }
+
+
+  /// Perform the epilogue computations and stream the result to global memory.
+  CUTLASS_DEVICE
+  void operator()(
+    OutputOp const &output_op,                      ///< Output operator
+    OutputTileIterator destination_iterator,        ///< Tile iterator for destination
+    AccumulatorTile const &accumulators)            ///< Complete warp-level accumulator tile
+  {
+    operator()(output_op, destination_iterator, accumulators, SourceAspectNotNeeded());
+  }
+
+
+  /// Perform the epilogue computations and stream the result to global memory.  Implements
+  /// two alternative codepaths, depending on whether the output op requires addend data to be loaded.
+  CUTLASS_DEVICE
+  void operator()(
+    OutputOp const &output_op,                      ///< Output operator
+    OutputTileIterator destination_iterator,        ///< Tile iterator for destination
+    AccumulatorTile const &accumulators,            ///< Complete warp-level accumulator tile
+    OutputTileIterator source_iterator )            ///< Tile iterator for addend source
+  {
+    if (output_op.is_source_needed())
+    {
+      operator()(output_op, destination_iterator, accumulators, SourceAspectNeeded(source_iterator));
+    }
+    else
+    {
+      operator()(output_op, destination_iterator, accumulators, SourceAspectNotNeeded());
+    }
+  }
+
+
+  /// Perform the epilogue computations and stream the result to global memory.  Implements a
+  /// single codepath, regardless of whether the output op requires addend data to be loaded
+  CUTLASS_DEVICE
+  void unified(
+    OutputOp const &output_op,                      ///< Output operator
+    OutputTileIterator destination_iterator,        ///< Tile iterator for destination
+    AccumulatorTile const &accumulators,            ///< Complete warp-level accumulator tile
+    OutputTileIterator source_iterator )            ///< Tile iterator for addend source
+  {
+    if (!output_op.is_source_needed())
+    {
+      source_iterator.clear_mask();
+      __syncthreads();  // Dummy (CUDA 11.0)
+    }
+
+    operator()(output_op, destination_iterator, accumulators, SourceAspectNeeded(source_iterator));
+  }
+
+
+  /// Streams the result to global memory
+  template <typename SourceAspect>
+  CUTLASS_DEVICE
+  void operator()(
+    OutputOp const &output_op,                      ///< Output operator
+    OutputTileIterator destination_iterator,        ///< Tile iterator for destination
+    AccumulatorTile const &accumulators,            ///< Complete warp-level accumulator tile
+    SourceAspect source)                            ///< SourceAspectNeeded or SourceAspectNotNeeded instance
+  {
+    //
+    // Iterator over warp-level accumulator fragment
+    //
+
+    AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
+
+    //
+    // Iterate over accumulator tile
+    //
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int iter = 0; iter < OutputTileIterator::kIterations; ++iter) {
+
+      //
+      // Convert fragment
+      //
+
+      typename AccumulatorFragmentIterator::Fragment accum_fragment;
+
+      accum_fragment_iterator.load(accum_fragment);
+      ++accum_fragment_iterator;
+
+      //
+      // Compute the output result
+      //
+
+      typename OutputTileIterator::Fragment output_fragment;
+      source.apply_output_operator(output_fragment, output_op, accum_fragment);
+
+      //
+      // Store the final result
+      //
+
+      destination_iterator.set_iteration_index(iter);
+      destination_iterator.store(output_fragment);
+      ++destination_iterator;
+    }
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/output_iterator_parameter.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/output_iterator_parameter.h
new file mode 100755
index 000000000..730088273
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/output_iterator_parameter.h
@@ -0,0 +1,223 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+#include "cutlass/conv/conv3d_problem_size.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/tensor_ref.h"
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+template<
+  typename TensorLayout_,                             ///< The original output tensor layout
+  typename OutputIteratorLayout_,                     ///< Layout used by epilogue output iterator
+  typename TensorRef_,                                ///< Input tensor to epilogue output iterator
+  conv::Operator ConvOperator,                        ///< Convolutional operator (Fprop, Dgrad, Wgrad)
+  typename ConvProblemSize_                          ///< Convolution problem-size type (2D or 3D); passed to extent()
+>
+struct ConvOutputIteratorParameter {
+  // Primary template: derives the output-iterator layout and extent from a conv output TensorRef and problem size.
+  using TensorLayout = TensorLayout_;
+  using OutputIteratorLayout = OutputIteratorLayout_;
+  using OutputTensorCoord = typename OutputIteratorLayout::TensorCoord;
+  using TensorRef = TensorRef_;
+  static conv::Operator const kConvolutionalOperator = ConvOperator;
+  using ConvProblemSize = ConvProblemSize_;
+
+  /// Index of the TensorRef stride element used for Wgrad: 2 when TensorLayout is
+  /// layout::TensorNHWC, otherwise 3 (upstream note: the Conv2d row-major matrix
+  /// is KxRSC, the Conv3d row-major matrix is KxTRSC).
+  static int const kWgradStrideIdx = 
+    platform::is_same<TensorLayout, layout::TensorNHWC>::value ? 2 : 3;
+
+  /// Stride element handed to layout(): kWgradStrideIdx for Wgrad, element 0 for every other operator.
+  static int const kTensorStrideIdx = 
+    (kConvolutionalOperator == conv::Operator::kWgrad ? kWgradStrideIdx : 0);
+  /// Builds the output-iterator layout from stride element kTensorStrideIdx of ref.
+  CUTLASS_HOST_DEVICE
+  static OutputIteratorLayout layout(const TensorRef & ref) {
+    return ref.stride(kTensorStrideIdx);
+  }
+  /// Returns the .mn() coordinate of the implicit-GEMM problem size for this operator.
+  CUTLASS_HOST_DEVICE
+  static OutputTensorCoord extent(ConvProblemSize problem_size) {
+    return conv::implicit_gemm_problem_size(kConvolutionalOperator, problem_size).mn();
+  }
+};
+
+template<
+  typename TensorRef_,                                ///< Input tensor to epilogue output iterator
+  typename ConvProblemSize_                          ///< Convolution problem-size type; passed to extent()
+>
+struct ConvOutputIteratorParameter<layout::TensorNHWC, layout::TensorNHWC, TensorRef_, conv::Operator::kFprop, ConvProblemSize_> {
+  // Specialization: NHWC tensor, NHWC iterator layout, Fprop. The extent is reported in RowMajor (MappedLayout) coordinates.
+  using TensorLayout = layout::TensorNHWC;
+  using OutputIteratorLayout = layout::TensorNHWC;
+  using MappedLayout = layout::RowMajor;
+  using OutputTensorCoord = typename OutputIteratorLayout::TensorCoord;
+  using MappedTensorCoord = typename MappedLayout::TensorCoord;
+  using TensorRef = TensorRef_;
+  static conv::Operator const kConvolutionalOperator = conv::Operator::kFprop;
+  using ConvProblemSize = ConvProblemSize_;
+  /// Builds the output-iterator layout from the whole stride of ref (no single element is picked).
+  CUTLASS_HOST_DEVICE
+  static OutputIteratorLayout layout(const TensorRef & ref) {
+    return ref.stride();
+  }
+  /// Returns the .mn() coordinate of the implicit-GEMM problem size, typed as MappedTensorCoord.
+  CUTLASS_HOST_DEVICE
+  static MappedTensorCoord extent(ConvProblemSize problem_size) {
+    return conv::implicit_gemm_problem_size(kConvolutionalOperator, problem_size).mn();
+  }
+};
+
+template<
+  typename TensorRef_,                                ///< Input tensor to epilogue output iterator
+  typename ConvProblemSize_                          ///< Convolution problem-size type; passed to extent()
+>
+struct ConvOutputIteratorParameter<layout::TensorNHWC, layout::TensorNHWC, TensorRef_, conv::Operator::kDeconv, ConvProblemSize_> {
+  // Specialization: NHWC tensor, NHWC iterator layout, Deconv. The extent is reported in RowMajor (MappedLayout) coordinates.
+  using TensorLayout = layout::TensorNHWC;
+  using OutputIteratorLayout = layout::TensorNHWC;
+  using MappedLayout = layout::RowMajor;
+  using OutputTensorCoord = typename OutputIteratorLayout::TensorCoord;
+  using MappedTensorCoord = typename MappedLayout::TensorCoord;
+  using TensorRef = TensorRef_;
+  static conv::Operator const kConvolutionalOperator = conv::Operator::kDeconv;
+  using ConvProblemSize = ConvProblemSize_;
+  /// Builds the output-iterator layout from the whole stride of ref (no single element is picked).
+  CUTLASS_HOST_DEVICE
+  static OutputIteratorLayout layout(const TensorRef & ref) {
+    return ref.stride();
+  }
+  /// Returns the .mn() coordinate of the implicit-GEMM problem size, typed as MappedTensorCoord.
+  CUTLASS_HOST_DEVICE
+  static MappedTensorCoord extent(ConvProblemSize problem_size) {
+    return conv::implicit_gemm_problem_size(kConvolutionalOperator, problem_size).mn();
+  }
+};
+
+template<
+  typename TensorRef_,                                ///< Input tensor to epilogue output iterator
+  typename ConvProblemSize_                          ///< Convolution problem-size type; passed to extent()
+>
+struct ConvOutputIteratorParameter<layout::TensorNDHWC, layout::TensorNDHWC, TensorRef_, conv::Operator::kFprop, ConvProblemSize_> {
+  // Specialization: NDHWC tensor, NDHWC iterator layout, Fprop. The extent is reported in RowMajor (MappedLayout) coordinates.
+  using TensorLayout = layout::TensorNDHWC;
+  using OutputIteratorLayout = layout::TensorNDHWC;
+  using MappedLayout = layout::RowMajor;
+  using OutputTensorCoord = typename OutputIteratorLayout::TensorCoord;
+  using MappedTensorCoord = typename MappedLayout::TensorCoord;
+  using TensorRef = TensorRef_;
+  static conv::Operator const kConvolutionalOperator = conv::Operator::kFprop;
+  using ConvProblemSize = ConvProblemSize_;
+  /// Builds the output-iterator layout from the whole stride of ref (no single element is picked).
+  CUTLASS_HOST_DEVICE
+  static OutputIteratorLayout layout(const TensorRef & ref) {
+    return ref.stride();
+  }
+  /// Returns the .mn() coordinate of the implicit-GEMM problem size, typed as MappedTensorCoord.
+  CUTLASS_HOST_DEVICE
+  static MappedTensorCoord extent(ConvProblemSize problem_size) {
+    return conv::implicit_gemm_problem_size(kConvolutionalOperator, problem_size).mn();
+  }
+};
+
+template<
+  typename TensorRef_,                                ///< Input tensor to epilogue output iterator
+  typename ConvProblemSize_                          ///< Convolution problem-size type; passed to extent()
+>
+struct ConvOutputIteratorParameter<layout::TensorNDHWC, layout::TensorNDHWC, TensorRef_, conv::Operator::kDeconv, ConvProblemSize_> {
+  // Specialization: NDHWC tensor, NDHWC iterator layout, Deconv. The extent is reported in RowMajor (MappedLayout) coordinates.
+  using TensorLayout = layout::TensorNDHWC;
+  using OutputIteratorLayout = layout::TensorNDHWC;
+  using MappedLayout = layout::RowMajor;
+  using OutputTensorCoord = typename OutputIteratorLayout::TensorCoord;
+  using MappedTensorCoord = typename MappedLayout::TensorCoord;
+  using TensorRef = TensorRef_;
+  static conv::Operator const kConvolutionalOperator = conv::Operator::kDeconv;
+  using ConvProblemSize = ConvProblemSize_;
+  /// Builds the output-iterator layout from the whole stride of ref (no single element is picked).
+  CUTLASS_HOST_DEVICE
+  static OutputIteratorLayout layout(const TensorRef & ref) {
+    return ref.stride();
+  }
+  /// Returns the .mn() coordinate of the implicit-GEMM problem size, typed as MappedTensorCoord.
+  CUTLASS_HOST_DEVICE
+  static MappedTensorCoord extent(ConvProblemSize problem_size) {
+    return conv::implicit_gemm_problem_size(kConvolutionalOperator, problem_size).mn();
+  }
+};
+
+template <
+  int InterleavedK,                 ///< Template argument of layout::TensorNCxHWx
+  typename TensorRef_,              ///< Input tensor to epilogue output iterator
+  conv::Operator ConvOperator,      ///< Convolutional operator (not restricted by this specialization)
+  typename ConvProblemSize_         ///< Problem-size type; extent() calls its output_extent()
+>
+struct ConvOutputIteratorParameter<
+  layout::TensorNCxHWx<InterleavedK>, 
+  layout::TensorNCxHWx<InterleavedK>,
+  TensorRef_,
+  ConvOperator,
+  ConvProblemSize_>
+{ 
+  // Specialization for layout::TensorNCxHWx<InterleavedK> as both tensor and iterator layout.
+  using TensorLayout = typename layout::TensorNCxHWx<InterleavedK>;
+  using OutputIteratorLayout = typename layout::TensorNCxHWx<InterleavedK>;
+  using OutputTensorCoord = typename OutputIteratorLayout::TensorCoord;
+  using TensorRef = TensorRef_;
+  static conv::Operator const kConvolutionalOperator = ConvOperator;
+  using ConvProblemSize = ConvProblemSize_;
+  /// Builds the output-iterator layout from the whole stride of ref.
+  CUTLASS_HOST_DEVICE
+  static OutputIteratorLayout layout(const TensorRef & ref) {
+    return ref.stride();
+  }
+  /// Returns problem_size.output_extent() directly; no implicit-GEMM mapping is applied here.
+  CUTLASS_HOST_DEVICE
+  static OutputTensorCoord extent(ConvProblemSize problem_size) {
+    return problem_size.output_extent();
+  }
+
+};
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/output_tile_thread_map.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/output_tile_thread_map.h
new file mode 100755
index 000000000..617b8e39f
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/output_tile_thread_map.h
@@ -0,0 +1,628 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Metaprogram for determining the mapping of output elements to threads for epilogue tiles.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/fast_math.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Compile-time 5-tuple (column, row, group, cluster, tile) defining a point or extent in an output tile
+template <
+  int Column,
+  int Row,
+  int Group,
+  int Cluster,
+  int Tile
+>
+struct OutputTileShape {
+  static int const kColumn = Column;
+  static int const kRow = Row;
+  static int const kGroup = Group;
+  static int const kCluster = Cluster;
+  static int const kTile = Tile;
+  /// Product of the five components
+  static int const kCount = kColumn * kRow * kGroup * kCluster * kTile;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <typename Iterations, typename Delta>
+struct OutputTileThreadMapHelpers {
+
+  /// Splits a linear iteration index into its (column, row, group, cluster, tile)
+  /// components. Column varies fastest and tile slowest; the radix of each digit
+  /// is the matching Iterations::k* constant.
+  CUTLASS_HOST_DEVICE
+  static void iteration_index(
+    int &column_idx,
+    int &row_idx,
+    int &group_idx,
+    int &cluster_idx,
+    int &tile_idx,
+    int iter_idx) {
+
+    // Peel off one digit at a time, fastest-varying first.
+    int remaining = iter_idx;
+
+    column_idx = remaining % Iterations::kColumn;
+    remaining /= Iterations::kColumn;
+
+    row_idx = remaining % Iterations::kRow;
+    remaining /= Iterations::kRow;
+
+    group_idx = remaining % Iterations::kGroup;
+    remaining /= Iterations::kGroup;
+
+    cluster_idx = remaining % Iterations::kCluster;
+    tile_idx = remaining / Iterations::kCluster;
+  }
+
+  /// Converts a linear iteration index into a (row, column) offset by scaling each
+  /// component produced by iteration_index() with the matching Delta::k* constant.
+  CUTLASS_HOST_DEVICE
+  static MatrixCoord iteration_offset(int iter_idx) {
+    int column_idx, row_idx, group_idx, cluster_idx, tile_idx;
+    iteration_index(column_idx, row_idx, group_idx, cluster_idx, tile_idx, iter_idx);
+
+    // Row, group, cluster and tile all contribute to the row coordinate.
+    int const row_offset = row_idx * Delta::kRow + group_idx * Delta::kGroup +
+                           cluster_idx * Delta::kCluster + tile_idx * Delta::kTile;
+
+    // Only the column component contributes to the column coordinate.
+    int const column_offset = column_idx * Delta::kColumn;
+
+    return MatrixCoord(row_offset, column_offset);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+template <
+  typename ThreadMap_,
+  typename Shape_,
+  typename Iterations_,
+  typename Delta_,
+  typename Count_
+>
+struct OutputTileThreadMap : public OutputTileThreadMapHelpers<Iterations_, Delta_> {
+
+  /// Underlying thread map whose initial_offset() yields a pitch-linear coordinate
+  using ThreadMap = ThreadMap_;
+
+  /// Thread count, forwarded from ThreadMap
+  static int const kThreads = ThreadMap::kThreads;
+
+  /// Scalar elements per access, forwarded from ThreadMap
+  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
+
+  /// Tile shape (initial_offset reads kRow and kGroup)
+  using Shape = Shape_;
+
+  /// Per-thread iteration counts
+  using Iterations = Iterations_;
+
+  /// Delta between accesses
+  using Delta = Delta_;
+
+  /// Iterator iteration counts (initial_offset reads kRow and kGroup)
+  using Count = Count_;
+
+  /// Maps thread_idx to its starting (row, column) coordinate
+  CUTLASS_HOST_DEVICE
+  static MatrixCoord initial_offset(int thread_idx) {
+
+    using Index = typename layout::PitchLinearCoord::Index;
+
+    layout::PitchLinearCoord base = ThreadMap::initial_offset(thread_idx);
+
+    // Split the strided coordinate into (cluster, group, row), row varying fastest.
+    Index cluster_id = base.strided() / (Shape::kGroup * Shape::kRow);
+    Index within_cluster = base.strided() % (Shape::kGroup * Shape::kRow);
+    Index group_id = within_cluster / (Shape::kRow);
+    Index row_id = within_cluster % (Shape::kRow);
+
+    // Re-expand group and cluster with the Count:: factors applied.
+    Index group_offset = group_id * Shape::kRow * Count::kRow;
+    Index cluster_offset = cluster_id * Shape::kGroup * Count::kGroup * Shape::kRow * Count::kRow;
+
+    return MatrixCoord{row_id + group_offset + cluster_offset, base.contiguous()};
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+/// RowArrangement determines how one or more warps cover a region of consecutive rows.
+template <
+  typename Shape,            ///< Tile shape; the specializations read Shape::kRow and Shape::kColumn
+  int WarpsRemaining,        ///< Number of warps left to cover the rows
+  int ElementsPerAccess,     ///< Elements per vector access
+  int ElementSize,           ///< Element size; presumably in bits (the 2D case divides by 8) - TODO confirm
+  bool Is2dTile              ///< true selects the 2D tiled specialization, false the 1D tiled one
+>
+struct RowArrangement;
+
+/// RowArrangement in which each warp's access is a 1D tiled arrangement.
+template <
+  typename Shape,
+  int WarpsRemaining,
+  int ElementsPerAccess,
+  int ElementSize
+>
+struct RowArrangement<Shape, WarpsRemaining, ElementsPerAccess, ElementSize, false> {
+  static int const kWarpSize = 32;
+  static int const kElementsPerAccess = ElementsPerAccess;
+  static int const kElementSize = ElementSize;    // recorded only; not used in this specialization
+  // A warp handles one row; it steps across the columns kWarpSize * kElementsPerAccess elements at a time.
+  static int const kIterationsRow = 1;
+  static int const kDeltaRow = 1;
+  static int const kIterationsColumn = Shape::kColumn / kElementsPerAccess / kWarpSize;
+  static int const kDeltaColumn = kWarpSize * kElementsPerAccess;
+  // One access spans all 32 lanes in a single row.
+  static int const kAccessWidth = kWarpSize;
+  static int const kAccessRows = 1;
+  static int const kWarpPartitionsRow = 1;
+  static int const kWarpPartitionsColumn = WarpsRemaining;
+};
+
+/// RowArrangement in which each warp's access is a 2D tiled arrangement.
+template <
+  typename Shape,
+  int WarpsRemaining,
+  int ElementsPerAccess,
+  int ElementSize
+>
+struct RowArrangement<Shape, WarpsRemaining, ElementsPerAccess, ElementSize, true> {
+
+  static int const kMemoryAccessSize = 256; // Preferred access size (presumably bytes - TODO confirm)
+  static int const kWarpSize = 32;
+
+  static int const kElementsPerAccess = ElementsPerAccess;
+  static int const kElementSize = ElementSize;
+  // Intermediate values; all divisions are integer divisions.
+  struct Detail {
+    static int const kShapeRow = Shape::kRow / WarpsRemaining;            // rows per remaining warp
+    static int const kShapeWidth = Shape::kColumn / kElementsPerAccess;   // vector accesses per row
+    // Number of vector accesses that make up kMemoryAccessSize.
+    static int const kTargetMemoryAccessWidth = 
+      kMemoryAccessSize / (kElementsPerAccess * kElementSize / 8);
+    // Rows one warp would cover at the target width.
+    static int const kTargetAccessRows = kWarpSize / kTargetMemoryAccessWidth;
+  };
+  // Lanes per row: if the target row count exceeds kShapeRow, spread the warp over kShapeRow rows; otherwise cap at the row width, warp size and target width.
+  static int const kAccessWidth = 
+    (Detail::kTargetAccessRows > Detail::kShapeRow ?
+      kWarpSize / Detail::kShapeRow
+      : const_min(
+          Detail::kShapeWidth,
+        const_min(kWarpSize, kMemoryAccessSize / (kElementsPerAccess * kElementSize / 8))
+        ));
+  // Rows covered by one warp-wide access.
+  static int const kAccessRows =
+    (Detail::kTargetAccessRows > Detail::kShapeRow ?
+      Detail::kShapeRow
+      : const_min(Shape::kRow, kWarpSize / kAccessWidth));
+
+  static int const kIterationsRow = Detail::kShapeRow / kAccessRows;
+  static int const kDeltaRow = kAccessRows;
+
+  static int const kIterationsColumn = Detail::kShapeWidth / kAccessWidth;
+  static int const kDeltaColumn = kAccessWidth * kElementsPerAccess;
+  // Compile-time checks: an access must fit in the tile width and both iteration counts must be non-zero.
+  static_assert( kAccessWidth * kElementsPerAccess <= Shape::kColumn, "Accessing too many elements per access");
+  static_assert( kIterationsColumn > 0, "Iteration Count Column must be > 0" );
+  static_assert( kIterationsRow > 0, "Iteration Count Row must be > 0" );
+
+  static int const kWarpPartitionsRow = 1;
+  static int const kWarpPartitionsColumn = 1;
+};
+
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Template metaprogram for partitioning a 4D space across warps to achieve several performance
+/// objectives:
+///
+///   - coalesced memory accesses in units of 128 Byte lines
+///   - minimal address arithmetic
+///   - minimal predicate calculations
+///
+template <
+  typename Shape_,            ///< Tile extents; kColumn, kRow, kGroup, kCluster and kTile are read
+  typename Count_,            ///< Counts; only kRow and kGroup are read
+  int Threads,                ///< Number of threads; kWarpCount = Threads / 32
+  int ElementsPerAccess,      ///< Elements per vector access
+  int ElementSize             ///< Element size, forwarded to detail::RowArrangement (presumably bits - TODO confirm)
+>
+struct OutputTileOptimalThreadMap {
+
+  using Shape = Shape_;
+  using Count = Count_;
+
+  static int const kWarpSize = 32;
+  static int const kThreads = Threads;
+  static int const kWarpCount = kThreads / kWarpSize;
+
+  static int const kElementsPerAccess = ElementsPerAccess;
+  static int const kElementSize = ElementSize;
+
+  //
+  // Metaprogram computation
+  //
+
+  struct Detail {
+
+    // Clusters: with more clusters than warps every warp iterates over several clusters; otherwise the warps are split among the clusters
+    static int const kIterationsCluster = 
+      ((Shape::kCluster > kWarpCount) ?
+        Shape::kCluster / kWarpCount
+        : 1);
+
+    static int const kDeltaCluster =
+      ((Shape::kCluster > kWarpCount) ?
+        Shape::kRow * Count::kRow * Shape::kGroup * Count::kGroup * Shape::kCluster / kIterationsCluster
+        : 1);
+
+    static int const kCompactedDeltaCluster =
+      ((Shape::kCluster > kWarpCount) ?
+        Shape::kRow * Shape::kGroup * Shape::kCluster / kIterationsCluster
+        : 1);
+
+    static int const kWarpPartitionsCluster =
+      ((Shape::kCluster > kWarpCount) ?
+        kWarpCount
+        : kWarpCount / Shape::kCluster);
+
+    static int const kWarpsRemainingForGroups =
+      ((Shape::kCluster > kWarpCount) ? 1 : kWarpCount / Shape::kCluster);
+
+    // Groups: the same split, applied to the warps left over after the cluster level
+    static int const kIterationsGroup =
+      ((Shape::kGroup > kWarpsRemainingForGroups) ?
+        Shape::kGroup / kWarpsRemainingForGroups
+        : 1);
+
+    static int const kDeltaGroup =
+      ((Shape::kGroup > kWarpsRemainingForGroups) ?
+        Shape::kRow * Count::kRow * Shape::kGroup / kIterationsGroup
+        : 1);
+
+    static int const kCompactedDeltaGroup =
+      ((Shape::kGroup > kWarpsRemainingForGroups) ?
+        Shape::kRow * Shape::kGroup / kIterationsGroup
+        : 1);
+
+    static int const kWarpPartitionsGroup =
+      ((Shape::kGroup > kWarpsRemainingForGroups) ?
+        1
+        : kWarpsRemainingForGroups / Shape::kGroup);
+
+    static int const kWarpsRemainingForRows =
+      ((Shape::kGroup > kWarpsRemainingForGroups) ?
+        1
+        : kWarpsRemainingForGroups / Shape::kGroup);
+    
+    // Rows: 2D arrangement when the tile has more rows than remaining warps, 1D otherwise
+    using RowArrangement = detail::RowArrangement<
+      Shape,
+      kWarpsRemainingForRows,
+      kElementsPerAccess,
+      kElementSize,
+      (Shape::kRow > kWarpsRemainingForRows)
+    >;
+
+    // Warp partitions (column, row, group, cluster, tile); the tile component is fixed at 1
+    using WarpPartitions = OutputTileShape<
+      RowArrangement::kWarpPartitionsColumn,
+      RowArrangement::kWarpPartitionsRow,
+      kWarpPartitionsGroup,
+      kWarpPartitionsCluster,
+      1>;
+
+    static int const kAccessWidth = RowArrangement::kAccessWidth;
+    static int const kAccessRows = RowArrangement::kAccessRows;
+  };
+
+  //
+  // Output
+  //
+
+  using Iterations = OutputTileShape<
+    Detail::RowArrangement::kIterationsColumn, 
+    Detail::RowArrangement::kIterationsRow, 
+    Detail::kIterationsGroup, 
+    Detail::kIterationsCluster, 
+    1>;
+
+  using Delta = OutputTileShape<
+    Detail::RowArrangement::kDeltaColumn,
+    Detail::RowArrangement::kDeltaRow,
+    Detail::kDeltaGroup,
+    Detail::kDeltaCluster,
+    1>;
+
+  /// Returns the (row, column) coordinate at which thread_idx starts (device-only)
+  CUTLASS_DEVICE
+  static MatrixCoord initial_offset(int thread_idx) {
+
+    // NOTE(review): a __shfl_sync-based broadcast of the warp index was left disabled here; plain division is used.
+    int warp_idx = thread_idx / kWarpSize;
+    int lane_idx = thread_idx % kWarpSize;
+
+    // Compute warp location
+    int cluster_idx = warp_idx / Detail::WarpPartitions::kCluster;
+    int residual_cluster = warp_idx % Detail::WarpPartitions::kCluster;
+
+    int group_idx = residual_cluster / Detail::WarpPartitions::kGroup;
+    int residual_group = residual_cluster % Detail::WarpPartitions::kGroup;
+
+    int row_idx = residual_group / Detail::WarpPartitions::kRow;
+    int col_idx = residual_group % Detail::WarpPartitions::kRow;
+    // NOTE(review): both use WarpPartitions::kRow (1 in both RowArrangement specializations), so col_idx is 0; kColumn is never consulted - confirm intended.
+    // Compute per-lane offset
+    int lane_row_offset = lane_idx / Detail::kAccessWidth;
+    int lane_col_offset = lane_idx % Detail::kAccessWidth;
+
+    // Compute coordinate in output space
+    int cluster_offset = cluster_idx * Shape::kRow * Count::kRow * Shape::kGroup * Count::kGroup;
+    int group_offset = group_idx * Shape::kRow * Count::kRow;
+    int row_offset = row_idx * Iterations::kRow * Detail::kAccessRows;
+    int column_offset = col_idx * Iterations::kColumn * Detail::kAccessWidth * kElementsPerAccess;
+
+    return MatrixCoord(
+      cluster_offset + group_offset + row_offset + lane_row_offset,
+      column_offset + lane_col_offset * kElementsPerAccess
+    );
+  }
+
+  /// Computes the offset of a given vector access (delegates to OutputTileThreadMapHelpers)
+  CUTLASS_HOST_DEVICE
+  static MatrixCoord iteration_offset(int iter_idx) {
+    return OutputTileThreadMapHelpers<Iterations, Delta>::iteration_offset(iter_idx);
+  }
+
+  /// Compacted thread map in which the 4D region is contiguous
+  struct CompactedThreadMap {
+    // Delta below uses the kCompactedDelta* values, which leave out the Count:: factors.
+
+    using Shape = Shape_;
+
+    using TileShape = MatrixShape<
+      Shape::kTile * Shape::kCluster * Shape::kGroup * Shape::kRow,
+      Shape::kColumn
+    >;
+
+    using Iterations = OutputTileShape<
+      Detail::RowArrangement::kIterationsColumn,
+      Detail::RowArrangement::kIterationsRow,
+      Detail::kIterationsGroup,
+      Detail::kIterationsCluster,
+      1>;
+
+    using Delta = OutputTileShape<
+      Detail::RowArrangement::kDeltaColumn,
+      Detail::RowArrangement::kDeltaRow,
+      Detail::kCompactedDeltaGroup,
+      Detail::kCompactedDeltaCluster,
+      1>;
+
+    /// Number of elements within each vector access
+    static int const kElementsPerAccess = ElementsPerAccess;
+
+    /// Number of threads
+    static int const kThreads = Threads;
+
+    /// Function to compute each thread's initial offset (device-only)
+    CUTLASS_DEVICE
+    static MatrixCoord initial_offset(int thread_idx) {
+
+      // NOTE(review): a __shfl_sync-based broadcast of the warp index was left disabled here; plain division is used.
+      int warp_idx = thread_idx / kWarpSize;
+      int lane_idx = thread_idx % kWarpSize;
+
+      // Compute warp location
+      int cluster_idx = warp_idx / Detail::WarpPartitions::kCluster;
+      int residual_cluster = warp_idx % Detail::WarpPartitions::kCluster;
+
+      int group_idx = residual_cluster / Detail::WarpPartitions::kGroup;
+      int residual_group = residual_cluster % Detail::WarpPartitions::kGroup;
+
+      int row_idx = residual_group / Detail::WarpPartitions::kRow;
+      int col_idx = residual_group % Detail::WarpPartitions::kRow;
+
+      // Compute per-lane offset
+      int lane_row_offset = lane_idx / Detail::kAccessWidth;
+      int lane_col_offset = lane_idx % Detail::kAccessWidth;
+
+      // Compute coordinate in compacted space (cluster and group offsets carry no Count:: factors)
+      int cluster_offset = cluster_idx * Shape::kRow * Shape::kGroup;
+      int group_offset = group_idx * Shape::kRow;
+      int row_offset = row_idx * Iterations::kRow * Detail::kAccessRows;
+      int column_offset = col_idx * Iterations::kColumn * Detail::kAccessWidth * kElementsPerAccess;
+
+      MatrixCoord coord(
+        cluster_offset + group_offset + row_offset + lane_row_offset,
+        column_offset + lane_col_offset * kElementsPerAccess
+      );
+
+      return coord;
+    }
+  };
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Thread map for an interleaved output tile, expressed in pitch-linear
+/// (contiguous, strided) coordinates.
+///
+/// Each warp owns a footprint of Delta * Iterations elements; warps are numbered
+/// along the contiguous dimension first (WarpCount::kContiguous per row), and the
+/// 32 lanes of a warp sit side by side, kElementsPerAccess elements apart.
+///
+/// kWarpCount and kElementSize are exposed but not consumed inside this struct.
+///
+template <typename WarpCount_, typename Iterations_, int Threads,
+          int ElementsPerAccess, int ElementSize>
+struct InterleavedOutputTileThreadMap {
+
+  /// Arrangement of warps; only kContiguous is read here
+  using WarpCount = WarpCount_;
+
+  /// Per-thread iteration counts (kContiguous, kStrided)
+  using Iterations = Iterations_;
+
+  /// Empty: this map defines no intermediate metaprogram values
+  struct Detail {};
+
+  static int const kWarpSize = 32;
+  static int const kThreads = Threads;
+  static int const kWarpCount = kThreads / kWarpSize;
+
+  static int const kElementsPerAccess = ElementsPerAccess;
+  static int const kElementSize = ElementSize;
+
+  /// Delta between accesses: one full warp's worth of vector accesses, one row
+  using Delta = layout::PitchLinearShape<kWarpSize * kElementsPerAccess, 1>;
+
+  /// Computes the pitch-linear coordinate at which thread_idx starts
+  CUTLASS_HOST_DEVICE
+  static layout::PitchLinearCoord initial_offset(int thread_idx) {
+
+    int const warp_id = thread_idx / kWarpSize;
+    int const lane_id = thread_idx % kWarpSize;
+
+    // Extent covered by a single warp across all of its iterations.
+    layout::PitchLinearCoord per_warp_extent{
+        Delta::kContiguous * Iterations::kContiguous,
+        Delta::kStrided * Iterations::kStrided};
+
+    // Position of this warp on the warp grid.
+    layout::PitchLinearCoord warp_grid_pos{warp_id % WarpCount::kContiguous,
+                                           warp_id / WarpCount::kContiguous};
+
+    // Position of this lane inside its warp: lanes advance along the
+    // contiguous dimension only.
+    layout::PitchLinearCoord lane_pos{
+        lane_id * kElementsPerAccess, 0};
+
+    // Scale the warp position by the footprint, then add the lane position.
+    return per_warp_extent * warp_grid_pos + lane_pos;
+  }
+};
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Thread map for an interleaved convolution output tile, expressed in
+/// (row, column) matrix coordinates.
+///
+/// Each warp owns a footprint of Delta * Iterations elements; warps are numbered
+/// along the row dimension first (WarpCount::kRow per column). Inside a warp the
+/// lanes form an 8 x 4 grid: lane / 4 selects the row and lane % 4 the column
+/// slot, each slot being kElementsPerAccess elements wide.
+///
+/// kWarpCount and kElementSize are exposed but not consumed inside this struct.
+///
+template <typename WarpCount_, typename Iterations_, int Threads,
+          int ElementsPerAccess, int ElementSize>
+struct InterleavedConvOutputTileThreadMap {
+
+  /// Arrangement of warps; only kRow is read here
+  using WarpCount = WarpCount_;
+
+  /// Per-thread iteration counts (kRow, kColumn)
+  using Iterations = Iterations_;
+
+  /// Empty: this map defines no intermediate metaprogram values
+  struct Detail {};
+
+  static int const kWarpSize = 32;
+  static int const kThreads = Threads;
+  static int const kWarpCount = kThreads / kWarpSize;
+
+  static int const kElementsPerAccess = ElementsPerAccess;
+  static int const kElementSize = ElementSize;
+
+  /// Delta between accesses: kWarpSize / 4 rows by 4 vector accesses
+  using Delta = MatrixShape<kWarpSize / 4, 4 * kElementsPerAccess>;
+
+  /// Computes the (row, column) coordinate at which thread_idx starts
+  CUTLASS_HOST_DEVICE
+  static MatrixCoord initial_offset(int thread_idx) {
+
+    int const warp_id = thread_idx / kWarpSize;
+    int const lane_id = thread_idx % kWarpSize;
+
+    // Extent covered by a single warp across all of its iterations.
+    MatrixCoord per_warp_extent{Delta::kRow * Iterations::kRow,
+                                Delta::kColumn * Iterations::kColumn};
+
+    // Position of this warp on the warp grid.
+    MatrixCoord warp_grid_pos{warp_id % WarpCount::kRow,
+                              warp_id / WarpCount::kRow};
+
+    // Position of this lane inside its warp: four lanes per row, each lane
+    // kElementsPerAccess columns apart.
+    MatrixCoord lane_pos{lane_id / 4,
+                         (lane_id % 4) * kElementsPerAccess};
+
+    // Scale the warp position by the footprint, then add the lane position.
+    return per_warp_extent * warp_grid_pos + lane_pos;
+
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h
new file mode 100755
index 000000000..9943ea256
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h
@@ -0,0 +1,1387 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
+
+  The epilogue rearranges the result of a matrix product through shared memory to match canonical
+  tensor layouts in global memory. Epilogues support conversion and reduction operations.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/permute.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/transform/pitch_linear_thread_map.h"
+#include "cutlass/epilogue/threadblock/output_tile_thread_map.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator_params.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+#include "cutlass/conv/conv3d_problem_size.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace epilogue {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile iterator used to load and store output tile from global memory in epilogue.
+///
+/// Satisfies: ReadableTileIterator | PredicatedTileIterator | ForwardTileIterator
+///
+template <
+  typename ThreadMap_,       ///< Thread map (concept: OutputTileThreadMap)
+  typename Element_,         ///< Element data type
+  bool ScatterD = false,     ///< Scatter D operand or not
+  typename PermuteDLayout = layout::NoPermute, ///< Permute D operand or not
+  bool UseCUDAStore = false
+>
+class PredicatedTileIterator {
+public:
+  using ThreadMap = ThreadMap_;
+  using Shape = typename ThreadMap::Shape;
+
+  using Element = Element_;
+
+  using Layout = layout::RowMajor;
+  using TensorRef = TensorRef<Element, Layout>;
+  using ConstTensorRef = typename TensorRef::ConstTensorRef;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorCoord = MatrixCoord;
+
+  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
+  static int const kThreads = ThreadMap::kThreads;
+  static int const kIterations = ThreadMap::Count::kTile;
+
+  static bool constexpr PermuteD = !layout::is_trivial_permute<PermuteDLayout>;
+
+  static_assert( ThreadMap::Iterations::kRow > 0,"ThreadMap::Iterations::kRow must be > 0");
+  static_assert( ThreadMap::Iterations::kGroup > 0,"ThreadMap::Iterations::kGroup must be > 0");
+  static_assert( ThreadMap::Iterations::kCluster > 0,"ThreadMap::Iterations::kCluster must be > 0");
+  static_assert( ThreadMap::Iterations::kColumn > 0,"ThreadMap::Iterations::kColumn must be > 0");
+
+  /// Fragment object
+  using Fragment = Array<
+    Element,
+    ThreadMap::Iterations::kColumn *
+    ThreadMap::Iterations::kRow *
+    ThreadMap::Iterations::kGroup *
+    ThreadMap::Iterations::kCluster * ThreadMap::kElementsPerAccess>;
+
+  /// Memory access size
+  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
+
+  //
+  // Parameters struct
+  //
+
+  /// Iterator parameters; a thin wrapper over the non-template
+  /// PredicatedTileIteratorParams base.
+  struct Params : PredicatedTileIteratorParams {
+    using Base = PredicatedTileIteratorParams;
+
+    /// Default construction defers entirely to the base class
+    CUTLASS_HOST_DEVICE
+    Params() { }
+
+    /// Builds parameters from a row-major layout: the leading stride is scaled
+    /// to bytes and paired with the thread-map descriptor.
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+      : Base(layout.stride(0) * int(sizeof(AccessType)) / kElementsPerAccess,
+             make_OutputTileThreadMapDesc<ThreadMap>()) { }
+
+    /// Same as Params(layout); `tensor_extent` is ignored and only exists for
+    /// compatibility with the strided conv epilogue.
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout, cutlass::Tensor4DCoord const &tensor_extent)
+      : Params(layout) { }
+
+    /// Same as Params(layout); `tensor_extent` is ignored and only exists for
+    /// compatibility with the strided conv epilogue.
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout, cutlass::Tensor5DCoord const &tensor_extent)
+      : Params(layout) { }
+
+    /// Wraps an already-computed base parameter object
+    CUTLASS_HOST_DEVICE
+    Params(Base const &base)
+      : Base(base) { }
+  };
+
+  /// Per-thread column predicates: one flag for each column iteration of the
+  /// thread map. A false entry suppresses the guarded memory access.
+  struct Mask {
+    /// Number of predicates held (one per column iteration)
+    static int const kCount = ThreadMap::Iterations::kColumn;
+
+    /// Predicate state
+    bool predicates[kCount];
+
+    /// Constructs a mask with every access enabled
+    CUTLASS_HOST_DEVICE
+    Mask() {
+      enable();
+    }
+
+    /// Efficiently disables all accesses guarded by mask
+    CUTLASS_HOST_DEVICE void clear() {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kCount; ++i) {
+        predicates[i] = false;
+      }
+    }
+
+    /// Efficiently enables all accesses guarded by mask. Must be callable from
+    /// host code as well, because the host/device constructor invokes it.
+    CUTLASS_HOST_DEVICE void enable() {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kCount; ++i) {
+        predicates[i] = true;
+      }
+    }
+  };
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Parameters structure containing reference and precomputed state.
+  PredicatedTileIteratorParams params_;
+
+  /// Byte-level pointer. This pointer is usually for both load() and store(), unless PermuteD is performed. When having PermuteD, byte_pointer_ is only for load().
+  uint8_t *byte_pointer_;
+
+  /// Byte-level pointer for store(). Due to PermuteD Op, store_byte_pointer_ may be with different address computation compared to byte_pointer_.
+  uint8_t *store_byte_pointer_;
+
+  /// Array of boolean values to contain steady-state predicates
+  Mask mask_;
+
+  /// Extent of the matrix tile in rows
+  Index extent_row_;
+
+  /// Extent of the matrix tile in columns
+  Index extent_column_;
+
+  /// A thread's starting row position (assuming steady-state predicates have been computed)
+  Index thread_start_row_;
+
+  /// A thread's starting column
+  Index thread_start_column_;
+
+  /// Internal state counter
+  int state_[3];
+
+  /// Scatter indices
+  int const *indices_;
+
+  /// PermuteDLayout
+  PermuteDLayout permute_layout_;
+
+  //
+  // Static asserts about internal strides
+  //
+
+  static_assert(sizeof(extent_row_) == 4, "Expected 32b extents");
+  static_assert(sizeof(thread_start_row_) == 4, "Expected 32b extents");
+  static_assert(sizeof(PredicatedTileIteratorParams::stride) == 8, "Expected 64b strides");
+
+private:
+
+  //
+  // Methods
+  //
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Constructs the iterator for thread `thread_idx`.
+  ///
+  /// \param params             precomputed strides and increments
+  /// \param pointer            base of the output tensor; null disables all accesses
+  /// \param extent             (rows, columns) bound used for predication
+  /// \param thread_idx         index of the calling thread, fed to the thread map
+  /// \param threadblock_offset coordinate added to the thread map's initial offset
+  /// \param indices            row indices used when ScatterD is set; null then
+  ///                           disables all accesses
+  CUTLASS_DEVICE
+  PredicatedTileIterator(
+    PredicatedTileIteratorParams const & params,
+    Element *pointer,
+    TensorCoord extent,
+    int thread_idx,
+    TensorCoord threadblock_offset = TensorCoord(),
+    int const *indices = nullptr
+  ):
+    params_(params), indices_(indices),
+    permute_layout_(PitchLinearCoord(extent.column(), extent.row()), params_.stride * kElementsPerAccess / sizeof(AccessType))
+  {
+    TensorCoord origin = ThreadMap::initial_offset(thread_idx) + threadblock_offset;
+
+    thread_start_row_ = origin.row();
+    thread_start_column_ = origin.column();
+    extent_row_ = extent.row();
+    extent_column_ = extent.column();
+
+    // Column predicates: an access is live only while it starts inside the extent
+    CUTLASS_PRAGMA_UNROLL
+    for (int c = 0; c < ThreadMap::Iterations::kColumn; ++c) {
+      Index access_column = origin.column() + ThreadMap::Delta::kColumn * c;
+      mask_.predicates[c] = (access_column < extent.column());
+    }
+
+    // A null tensor pointer, or scatter mode without indices, performs no accesses
+    if (!pointer || (ScatterD && !indices)) {
+      mask_.clear();
+    }
+
+    // Start from the thread's column; outside scatter mode also step down to its
+    // row (scatter mode resolves the row through indices_ at access time)
+    uint8_t *base = reinterpret_cast<uint8_t *>(pointer);
+    byte_pointer_ = base +
+      LongIndex(origin.column()) * sizeof(AccessType) / kElementsPerAccess;
+    if (!ScatterD) {
+      byte_pointer_ += LongIndex(origin.row()) * LongIndex(params_.stride);
+    }
+
+    // With PermuteD, stores address from the tensor base instead
+    store_byte_pointer_ = PermuteD ? base : byte_pointer_;
+
+    // Reset the row / group / cluster position counters
+    state_[0] = state_[1] = state_[2] = 0;
+  }
+
+  /// Shifts both the load and the store pointer by `pointer_offset` Elements
+  CUTLASS_HOST_DEVICE void add_pointer_offset(LongIndex pointer_offset) {
+    auto const delta_bytes = pointer_offset * sizeof_bits<Element>::value / 8;
+    byte_pointer_ += delta_bytes;
+    store_byte_pointer_ += delta_bytes;
+  }
+
+  /// Loads a fragment from memory, reading from byte_pointer_ + byte_offset.
+  ///
+  /// Accesses whose row lies outside extent_row_ or whose column predicate is
+  /// cleared are predicated off. With ScatterD every row address is resolved
+  /// through indices_, so no linear row/group/cluster increment is applied.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment &frag, int64_t byte_offset) const {
+    uint8_t *byte_pointer = byte_pointer_;
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+
+          int frag_row_idx =
+            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
+
+          int row_offset = row * ThreadMap::Delta::kRow
+            + group * ThreadMap::Delta::kGroup
+            + cluster * ThreadMap::Delta::kCluster;
+
+          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
+
+          AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset);
+
+          if (ScatterD && row_guard) {
+            assert(indices_);
+            memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset +
+              LongIndex(indices_[row_offset + thread_start_row_]) * LongIndex(params_.stride));
+          }
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
+
+            bool guard = row_guard && mask_.predicates[column];
+
+            cutlass::arch::global_load<
+              AccessType,
+              sizeof(AccessType)
+            >(
+                frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn +
+                         column],
+                (void *)&memory_pointer[column * ThreadMap::Delta::kColumn /
+                                        kElementsPerAccess],
+                guard);
+          }
+
+          if (!ScatterD && row + 1 < ThreadMap::Iterations::kRow) {
+            byte_pointer += params_.increment_row;
+          }
+        }
+
+        if (!ScatterD && group + 1 < ThreadMap::Iterations::kGroup) {
+          byte_pointer += params_.increment_group;
+        }
+      }
+
+      if (!ScatterD && cluster + 1 < ThreadMap::Iterations::kCluster) {
+        byte_pointer += params_.increment_cluster;
+      }
+    }
+  }
+
+  /// Loads a fragment from the iterator's current position, i.e.
+  /// load_with_byte_offset with a zero byte offset
+  CUTLASS_DEVICE
+  void load(Fragment &frag) const {
+    this->load_with_byte_offset(frag, 0);
+  }
+
+  /// Stores a fragment to memory, writing to store_byte_pointer_ + byte_offset.
+  ///
+  /// Rows at or beyond extent_row_ and columns whose predicate is cleared are
+  /// skipped. ScatterD resolves each row through indices_; PermuteD recomputes
+  /// the address of every access from permute_layout_. UseCUDAStore selects a
+  /// plain guarded assignment instead of cutlass::arch::global_store.
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const &frag, int64_t byte_offset) const {
+    // Linear pointer stepping applies only without scatter and without permute
+    bool const linear_advance = !ScatterD && !PermuteD;
+
+    uint8_t *cursor = store_byte_pointer_;
+    AccessType const *src = reinterpret_cast<AccessType const *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+
+          // Index of this row's first access inside the fragment
+          int frag_base = ThreadMap::Iterations::kColumn *
+            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
+
+          // Absolute row addressed by this iteration
+          int tile_row = thread_start_row_
+            + row * ThreadMap::Delta::kRow
+            + group * ThreadMap::Delta::kGroup
+            + cluster * ThreadMap::Delta::kCluster;
+
+          bool row_in_bounds = (tile_row < extent_row_);
+
+          AccessType *dst = reinterpret_cast<AccessType *>(cursor + byte_offset);
+
+          if (ScatterD && row_in_bounds) {
+            assert(indices_);
+
+            dst = reinterpret_cast<AccessType *>(cursor + byte_offset +
+              LongIndex(indices_[tile_row]) * LongIndex(params_.stride));
+          }
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
+
+            bool guard = row_in_bounds && mask_.predicates[column];
+
+            if (PermuteD) {
+              // Locate the permuted destination of (column, row)
+              int tile_col = thread_start_column_ + column * ThreadMap::Delta::kColumn;
+
+              dst = reinterpret_cast<AccessType *>(cursor + byte_offset
+                 + permute_layout_(PitchLinearCoord(tile_col, tile_row)) * sizeof(AccessType) / kElementsPerAccess);
+            }
+
+            AccessType const &value = src[frag_base + column];
+
+            // Predicated store through the arch helper, or a plain guarded assignment
+            if (!UseCUDAStore) {
+              cutlass::arch::global_store<AccessType, sizeof(AccessType)>(
+                  value, (void *)&dst[0], guard);
+            } else if (guard) {
+              dst[0] = value;
+            }
+
+            // Without permute, walk to the next column access
+            if (!PermuteD) {
+              dst += (ThreadMap::Delta::kColumn / kElementsPerAccess);
+            }
+          }
+
+          if (linear_advance && row + 1 < ThreadMap::Iterations::kRow) {
+            cursor += params_.increment_row;
+          }
+        }
+
+        if (linear_advance && group + 1 < ThreadMap::Iterations::kGroup) {
+          cursor += params_.increment_group;
+        }
+      }
+
+      if (linear_advance && cluster + 1 < ThreadMap::Iterations::kCluster) {
+        cursor += params_.increment_cluster;
+      }
+    }
+  }
+
+  /// Stores a fragment at the iterator's current position, i.e.
+  /// store_with_byte_offset with a zero byte offset
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) const {
+    this->store_with_byte_offset(frag, 0);
+  }
+
+  /// Loads a fragment through a 2x-downsampling row mapping.
+  ///
+  /// Output row (n, p, q) of an N x convolution_P x convolution_Q grid reads
+  /// source row (n, 2p + add_P, 2q + add_Q) of a grid twice as large in P and Q.
+  /// Rows are taken to be problem_N * sizeof(float) bytes apart (NOTE(review):
+  /// confirm for non-float Element). The `byte_offset` argument is not applied.
+  CUTLASS_DEVICE
+  void downsample_load_with_byte_offset(Fragment &frag, int64_t byte_offset, int convolution_P, int convolution_Q, int add_P, int add_Q, int problem_N) const {
+    uint8_t *byte_pointer = byte_pointer_;
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+          int frag_row_idx =
+            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
+
+          int row_offset = row * ThreadMap::Delta::kRow
+            + group * ThreadMap::Delta::kGroup
+            + cluster * ThreadMap::Delta::kCluster;
+
+          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
+
+          int output_row = row_offset + thread_start_row_;
+          int output_N = output_row / (convolution_P * convolution_Q);
+          int output_PQ = output_row % (convolution_P * convolution_Q);
+          int output_P = output_PQ / convolution_Q;
+          int output_Q = output_PQ % convolution_Q;
+
+          int input_row = output_N * 2 * convolution_P * 2 * convolution_Q +
+            (2 * output_P + add_P) * 2 * convolution_Q + 2 * output_Q + add_Q;
+
+          // Widen before multiplying so large tensors cannot overflow 32-bit int
+          int64_t row_byte_offset =
+            int64_t(input_row - output_row) * int64_t(problem_N) * int64_t(sizeof(float));
+          AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + row_byte_offset);
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
+            bool guard = row_guard && mask_.predicates[column];
+
+            cutlass::arch::global_load<
+              AccessType,
+              sizeof(AccessType)
+            >(
+                frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn +
+                         column],
+                (void *)&memory_pointer[column * ThreadMap::Delta::kColumn /
+                                        kElementsPerAccess],
+                guard);
+          }
+
+          if (row + 1 < ThreadMap::Iterations::kRow) {
+            byte_pointer += params_.increment_row;
+          }
+        }
+
+        if (group + 1 < ThreadMap::Iterations::kGroup) {
+          byte_pointer += params_.increment_group;
+        }
+      }
+
+      if (cluster + 1 < ThreadMap::Iterations::kCluster) {
+        byte_pointer += params_.increment_cluster;
+      }
+    }
+  }
+
+  /// Loads a fragment through a 2x-upsampling row mapping.
+  ///
+  /// Output row (n, p, q) of an N x convolution_P x convolution_Q grid reads
+  /// source row (n, (p + add_P) / 2, (q + add_Q) / 2) of a grid half as large in
+  /// P and Q; add_P / add_Q are dropped on the last row / column of the output
+  /// grid. Rows are taken to be problem_N * sizeof(float) bytes apart
+  /// (NOTE(review): confirm for non-float Element). The `byte_offset` argument
+  /// is not applied.
+  CUTLASS_DEVICE
+  void upsample_load_with_byte_offset(Fragment &frag, int64_t byte_offset, int convolution_P, int convolution_Q, int add_P, int add_Q, int problem_N) const {
+    uint8_t *byte_pointer = byte_pointer_;
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+          int frag_row_idx =
+            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
+
+          int row_offset = row * ThreadMap::Delta::kRow
+            + group * ThreadMap::Delta::kGroup
+            + cluster * ThreadMap::Delta::kCluster;
+
+          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
+
+          int output_row = row_offset + thread_start_row_;
+          int output_N = output_row / (convolution_P * convolution_Q);
+          int output_PQ = output_row % (convolution_P * convolution_Q);
+          int output_P = output_PQ / convolution_Q;
+          int output_Q = output_PQ % convolution_Q;
+          int row_add_P = (output_P > convolution_P - 2) ? 0 : add_P;
+          int row_add_Q = (output_Q > convolution_Q - 2) ? 0 : add_Q;
+
+          int input_row = output_N * (convolution_P/2) * (convolution_Q/2) +
+            ((output_P + row_add_P)/2) * (convolution_Q/2) + (output_Q + row_add_Q)/2;
+
+          // Widen before multiplying so large tensors cannot overflow 32-bit int
+          int64_t row_byte_offset =
+            int64_t(input_row - output_row) * int64_t(problem_N) * int64_t(sizeof(float));
+          AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + row_byte_offset);
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
+            bool guard = row_guard && mask_.predicates[column];
+
+            cutlass::arch::global_load<
+              AccessType,
+              sizeof(AccessType)
+            >(
+                frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn +
+                         column],
+                (void *)&memory_pointer[column * ThreadMap::Delta::kColumn /
+                                        kElementsPerAccess],
+                guard);
+          }
+
+          if (row + 1 < ThreadMap::Iterations::kRow) {
+            byte_pointer += params_.increment_row;
+          }
+        }
+
+        if (group + 1 < ThreadMap::Iterations::kGroup) {
+          byte_pointer += params_.increment_group;
+        }
+      }
+
+      if (cluster + 1 < ThreadMap::Iterations::kCluster) {
+        byte_pointer += params_.increment_cluster;
+      }
+    }
+  }
+
+  /// Returns the (row, column) coordinate at which this thread currently starts
+  CUTLASS_DEVICE MatrixCoord thread_start() const {
+    return MatrixCoord(thread_start_row_, thread_start_column_);
+  }
+
+  /// Returns the row at which this thread's accesses currently start
+  CUTLASS_DEVICE
+  int32_t thread_start_row() const {
+    return this->thread_start_row_;
+  }
+
+  /// Returns the column at which this thread's accesses start
+  CUTLASS_DEVICE
+  int32_t thread_start_column() const {
+    return this->thread_start_column_;
+  }
+
+  /// Returns the row extent captured at construction
+  CUTLASS_DEVICE
+  Index extent_row() const {
+    return this->extent_row_;
+  }
+
+  /// Returns the column extent captured at construction
+  CUTLASS_DEVICE
+  Index extent_column() const {
+    return this->extent_column_;
+  }
+
+  /// Advances to the next position to load or store.
+  ///
+  /// Steps the row counter and carries into the group, cluster and tile levels
+  /// whenever a counter reaches its count; thread_start_row_ is moved at every
+  /// level reached. The load pointer moves unless ScatterD is set; the store
+  /// pointer moves unless ScatterD or PermuteD is set.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator &operator++() {
+    bool const move_load = !ScatterD;
+    bool const move_store = !ScatterD && !PermuteD;
+
+    // Row step
+    ++state_[0];
+    thread_start_row_ += ThreadMap::Shape::kRow;
+    if (move_load) {
+      byte_pointer_ += params_.advance_row;
+    }
+    if (move_store) {
+      store_byte_pointer_ += params_.advance_row;
+    }
+    if (state_[0] != ThreadMap::Count::kRow) {
+      return *this;
+    }
+
+    // Row counter wrapped: carry into the group level
+    state_[0] = 0;
+    ++state_[1];
+    thread_start_row_ += (ThreadMap::Shape::kGroup - 1) *
+      ThreadMap::Shape::kRow * ThreadMap::Count::kRow;
+    if (move_load) {
+      byte_pointer_ += params_.advance_group;
+    }
+    if (move_store) {
+      store_byte_pointer_ += params_.advance_group;
+    }
+    if (state_[1] != ThreadMap::Count::kGroup) {
+      return *this;
+    }
+
+    // Group counter wrapped: carry into the cluster level
+    state_[1] = 0;
+    ++state_[2];
+    thread_start_row_ += ThreadMap::Count::kGroup *
+      ThreadMap::Shape::kGroup * ThreadMap::Count::kRow * ThreadMap::Shape::kRow;
+    if (move_load) {
+      byte_pointer_ += params_.advance_cluster;
+    }
+    if (move_store) {
+      store_byte_pointer_ += params_.advance_cluster;
+    }
+    if (state_[2] != ThreadMap::Count::kCluster) {
+      return *this;
+    }
+
+    // Cluster counter wrapped: move on to the next tile
+    state_[2] = 0;
+    thread_start_row_ += ThreadMap::Shape::kGroup * ThreadMap::Shape::kRow
+      * ThreadMap::Shape::kCluster * ThreadMap::Shape::kTile;
+    if (move_load) {
+      byte_pointer_ += params_.advance_tile;
+    }
+    if (move_store) {
+      store_byte_pointer_ += params_.advance_tile;
+    }
+
+    return *this;
+  }
+
+  /// Advances a number of positions to load or store.
+  ///
+  /// Mirrors operator++: counters carry from row to group, cluster and tile, the
+  /// load pointer moves unless ScatterD is set, and the store pointer moves
+  /// unless ScatterD or PermuteD is set, so the pointers stay consistent with
+  /// repeated operator++ in those modes.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator &operator+=(int increment)
+  {
+    // Row
+    state_[0] += increment;
+    int increment_row = state_[0] / ThreadMap::Count::kRow;
+    state_[0] = state_[0] % ThreadMap::Count::kRow;
+    thread_start_row_ += (ThreadMap::Shape::kRow * increment);
+
+    // Group
+    state_[1] += increment_row;
+    int increment_group = state_[1] / ThreadMap::Count::kGroup;
+    state_[1] = state_[1] % ThreadMap::Count::kGroup;
+    thread_start_row_ += (ThreadMap::Shape::kGroup - 1) *
+        ThreadMap::Shape::kRow * ThreadMap::Count::kRow * increment_row;
+
+    // Cluster
+    state_[2] += increment_group;
+    int increment_cluster = state_[2] / ThreadMap::Count::kCluster;
+    state_[2] = state_[2] % ThreadMap::Count::kCluster;
+    thread_start_row_ += ThreadMap::Count::kGroup * ThreadMap::Shape::kGroup *
+        ThreadMap::Count::kRow * ThreadMap::Shape::kRow * increment_group;
+
+    // Tile
+    thread_start_row_ +=
+        ThreadMap::Shape::kGroup *
+        ThreadMap::Shape::kRow *
+        ThreadMap::Shape::kCluster *
+        ThreadMap::Shape::kTile *
+        increment_cluster;
+
+    // Pointer movement, gated exactly as in operator++
+    if (!ScatterD) {
+      byte_pointer_ += (params_.advance_row * increment);
+      byte_pointer_ += (params_.advance_group * increment_row);
+      byte_pointer_ += (params_.advance_cluster * increment_group);
+      byte_pointer_ += (params_.advance_tile * increment_cluster);
+    }
+    if (!ScatterD && !PermuteD) {
+      store_byte_pointer_ += (params_.advance_row * increment);
+      store_byte_pointer_ += (params_.advance_group * increment_row);
+      store_byte_pointer_ += (params_.advance_cluster * increment_group);
+      store_byte_pointer_ += (params_.advance_tile * increment_cluster);
+    }
+
+    return *this;
+  }
+
+  /// Disables every access guarded by the column mask; loads and stores are
+  /// then predicated off until the mask is enabled or replaced
+  CUTLASS_DEVICE
+  void clear_mask() { mask_.clear(); }
+
+  /// Enables every access guarded by the column mask, leaving only the row
+  /// bound check to predicate loads and stores
+  CUTLASS_DEVICE
+  void enable_mask() { mask_.enable(); }
+
+  /// Copies the iterator's current mask into `mask` (the iterator itself is
+  /// left unchanged)
+  CUTLASS_DEVICE
+  void get_mask(Mask &mask) const { mask = mask_; }
+
+  /// Replaces the iterator's mask with a copy of `mask`
+  /// (for example one obtained earlier from get_mask)
+  CUTLASS_DEVICE
+  void set_mask(Mask const &mask) { mask_ = mask; }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile iterator used to load output tile from global memory in epilogue.
+///
+/// Satisfies: ReadableTileIterator | InterleavedPredicatedTileIterator | ForwardTileIterator
+///
+template <
+  typename ThreadMap_,       ///< Thread map (concept: OutputTileThreadMap)
+  typename Element_,         ///< Element data type
+  int InterleavedN           ///< Number of Interleaved N 
+>
+class InterleavedPredicatedTileIterator {
+public:
+  using ThreadMap = ThreadMap_;
+
+  using Element = Element_;
+
+  using Layout = layout::ColumnMajorInterleaved<InterleavedN>;
+  using TensorRef = TensorRef<Element, Layout>;
+  using ConstTensorRef = typename TensorRef::ConstTensorRef;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorCoord = layout::PitchLinearCoord;
+
+  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
+  static int const kThreads = ThreadMap::kThreads;
+  static int const kIterations = ThreadMap::Iterations::kCount;
+
+  /// Fragment object
+  using Fragment = Array<Element, ThreadMap::kElementsPerAccess>;
+
+  /// Memory access size
+  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
+
+  /// Uses a non-template class
+  struct Params : InterleavedPredicatedTileIteratorParams {
+    using Base = InterleavedPredicatedTileIteratorParams;
+
+    CUTLASS_HOST_DEVICE
+    Params() { }
+
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout): 
+      Base(
+        layout.stride(0) * int(sizeof(AccessType)) / kElementsPerAccess,
+        make_InterleavedPredicatedTileIteratorDesc<Element, ThreadMap>()
+      ) { }
+
+    CUTLASS_HOST_DEVICE
+    Params(Base const &base) : 
+      Base(base) { }
+  };
+
+  /// Mask object
+  struct Mask {
+    static int const kCount = (ThreadMap::Iterations::kContiguous < 8)
+                                  ? 8
+                                  : ThreadMap::Iterations::kContiguous;
+
+    /// Predicate state
+    bool predicates[kCount];
+
+    //
+    // Mask
+    //
+    CUTLASS_HOST_DEVICE
+    Mask() {
+      enable();
+    }
+
+    ///< Efficiently disables all accesses guarded by mask
+    CUTLASS_HOST_DEVICE void clear() {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kCount; ++i) {
+        predicates[i] = false;
+      }
+    }
+
+    ///< CUTLASS_HOST_DEVICE enables all accesses guarded by mask
+    CUTLASS_DEVICE void enable() {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kCount; ++i) {
+        predicates[i] = true;
+      }
+    }
+  };
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Parameters structure containing reference and precomputed state.
+  Params params_;
+
+  /// Byte-level pointer
+  uint8_t *byte_pointer_;
+
+  /// Array of boolean values to contain steady-state predicates
+  Mask mask_;
+
+  /// Extent of the matrix tile in columns
+  Index extent_col_;
+
+  /// A thread's starting column position (assuming steady-state predicates have
+  /// been computed)
+  Index thread_start_col_;
+
+  /// Internal iteration counter
+  int iteration_contiguous_;
+
+  int iteration_strided_;
+
+private:
+
+  //
+  // Methods
+  //
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Constructor
+  CUTLASS_DEVICE
+  InterleavedPredicatedTileIterator(
+    Params const & params,
+    Element *pointer,
+    TensorCoord extent,
+    int thread_idx,
+    TensorCoord threadblock_offset,
+    int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
+  ):
+    params_(params) {
+    TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx) +
+                                TensorCoord(threadblock_offset.contiguous() * InterleavedN,
+                                 threadblock_offset.strided() / InterleavedN);
+
+    extent_col_ = extent.strided() / InterleavedN;
+    thread_start_col_ = thread_offset.strided();
+
+    // Initialize predicates
+    CUTLASS_PRAGMA_UNROLL
+    for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+      mask_.predicates[c] =
+          ((thread_offset.contiguous() + ThreadMap::Delta::kContiguous * c) <
+           (extent.contiguous() * InterleavedN));
+    }
+
+    // Initialize pointer
+    byte_pointer_ = reinterpret_cast<uint8_t *>(pointer) + 
+      LongIndex(thread_offset.strided()) * LongIndex(params_.stride) + 
+      LongIndex(thread_offset.contiguous()) * sizeof(AccessType) / kElementsPerAccess;
+
+    // Initialize internal state counter
+    iteration_contiguous_ = iteration_strided_ = 0;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    byte_pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+
+    uint8_t *byte_pointer = byte_pointer_;
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+    AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer);
+
+    int col_offset = iteration_strided_ * ThreadMap::Delta::kStrided;
+
+    bool col_guard = ((thread_start_col_ + col_offset) < extent_col_);
+
+    bool guard = col_guard && mask_.predicates[iteration_contiguous_];
+
+    cutlass::arch::global_load<
+      AccessType, 
+      sizeof(AccessType)
+    >(
+        *frag_ptr,
+        (void *)memory_pointer,
+        guard);
+  }
+
+  /// Stores a fragment to memory at the current iteration position
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) {
+    uint8_t *byte_pointer = byte_pointer_;
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
+    AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer);
+
+    int col_offset = iteration_strided_ * ThreadMap::Delta::kStrided;
+    // Strided guard: the column visited by this iteration must be below extent_col_
+    bool col_guard = ((thread_start_col_ + col_offset) < extent_col_);
+    // Contiguous guard: predicate stored in mask_ for the current contiguous iteration
+    bool guard = col_guard && mask_.predicates[iteration_contiguous_];
+    // `guard` is forwarded to global_store as its last argument
+    cutlass::arch::global_store<AccessType, sizeof(AccessType)>(
+        *frag_ptr, (void *)memory_pointer, guard);
+  }
+
+  /// Overrides the internal iteration index (linear index; the contiguous index varies fastest)
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int iteration) {
+    iteration_contiguous_ = iteration % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = iteration / ThreadMap::Iterations::kContiguous;  // byte_pointer_ is not moved here
+  }
+
+  /// Steps the iterator one access forward: contiguous index first, then strided.
+  CUTLASS_HOST_DEVICE
+  InterleavedPredicatedTileIterator &operator++() {
+
+    // Every step moves the pointer by params_.advance_row.
+    byte_pointer_ += params_.advance_row;
+
+    // Still inside the current contiguous run: nothing else to update.
+    if (++iteration_contiguous_ != ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+
+    // Contiguous run finished: restart it and move on to the next strided position.
+    iteration_contiguous_ = 0;
+    byte_pointer_ += params_.advance_column;
+
+    bool const strided_done = (++iteration_strided_ == ThreadMap::Iterations::kStrided);
+    if (strided_done) { iteration_strided_ = 0; }
+    return *this;
+  }
+
+  /// Advances `increment` positions to load or store in a single step
+  CUTLASS_HOST_DEVICE
+  InterleavedPredicatedTileIterator &operator+=(int increment)
+  {
+    // Contiguous: fold the increment into the contiguous index; whole wrap-arounds carry over
+    iteration_contiguous_ += increment;
+    int increment_strided = iteration_contiguous_ / ThreadMap::Iterations::kContiguous;  // carry
+    iteration_contiguous_ = iteration_contiguous_ % ThreadMap::Iterations::kContiguous;
+    byte_pointer_ += (params_.advance_row * increment);
+
+    // Strided: apply the carry (note: unlike operator++, iteration_strided_ is not wrapped here)
+    iteration_strided_ += increment_strided;
+    byte_pointer_ += (params_.advance_column * increment_strided);
+
+    return *this;
+  }
+
+  /// Disables all accesses guarded by the mask (delegates to mask_.clear())
+  CUTLASS_DEVICE void clear_mask() {
+    mask_.clear();
+  }
+
+  /// Enables all accesses guarded by the mask (delegates to mask_.enable())
+  CUTLASS_DEVICE void enable_mask() {
+    mask_.enable();
+  }
+
+  /// Gets the mask: copies the iterator's current mask into `mask`
+  CUTLASS_DEVICE void get_mask(Mask &mask) {
+    mask = mask_;
+  }
+
+  /// Sets the mask: replaces the iterator's mask with a copy of `mask`
+  CUTLASS_DEVICE void set_mask(Mask const &mask) {
+    mask_ = mask;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+/// Tile iterator used to load and store output tile from global memory in epilogue.
+///
+/// Satisfies: ReadableTileIterator | InterleavedMaskedTileIterator | ForwardTileIterator
+///
+template <
+  typename ThreadMap_,       ///< Thread map (conept: OutputTileThreadMap)
+  typename Element_,         ///< Element data type
+  int InterleavedN           ///< Number of Interleaved N
+>
+class InterleavedConvPredicatedTileIterator {
+public:
+  using ThreadMap = ThreadMap_;
+
+  using Element = Element_;
+
+  using Layout = layout::TensorNCxHWx<InterleavedN>;
+  using TensorRef = TensorRef<Element, Layout>;
+  using ConstTensorRef = typename TensorRef::ConstTensorRef;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorCoord = Tensor4DCoord;
+
+  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
+  static int const kThreads = ThreadMap::kThreads;
+  static int const kIterations = ThreadMap::Iterations::kCount;
+
+  /// Fragment object
+  using Fragment = Array<Element, ThreadMap::kElementsPerAccess>;
+
+  /// Memory access size
+  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
+
+  //
+  // Parameters struct
+  //
+
+  struct Params {
+    // Holds two strides copied from the layout; see initialize() for which components are taken.
+    //
+    // Data members
+    //
+
+    LongIndex stride_col;           ///< layout stride [1]; NOTE(review): documented as bytes, but the iterator ctor scales it by sizeof_bits<Element>/8 while operator++ adds it unscaled - confirm units
+    LongIndex stride_row;           ///< layout stride [2]; NOTE(review): documented as bytes, but load()/store() scale it by sizeof_bits<Element>/8 - confirm units
+
+    //
+    // Methods
+    //
+    /// Copies components [1] and [2] of `stride_` into stride_col / stride_row; returns kSuccess.
+    CUTLASS_HOST_DEVICE
+    Status initialize(typename Layout::Stride stride_) {
+      stride_col = stride_[1];
+      stride_row = stride_[2];
+
+      return Status::kSuccess;
+    }
+    /// Default construction: both strides are zero.
+    CUTLASS_HOST_DEVICE
+    Params() {
+      initialize(cutlass::make_Coord(0, 0, 0));
+    }
+    /// Construction from a layout: strides come from layout.stride().
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout) {
+
+      initialize(layout.stride());
+    }
+    /// Same as Params(layout); `tensor_extent` is accepted but not used.
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout,
+           // Not needed.  Added to be compatible with strided conv epilogue.
+           cutlass::Tensor4DCoord const &tensor_extent):
+      Params(layout)
+    { }
+
+  };
+
+  /// Mask object: one boolean predicate per row iteration (at least 8 entries are allocated)
+  struct Mask {
+    static int const kCount =
+        (ThreadMap::Iterations::kRow < 8) ? 8 : ThreadMap::Iterations::kRow;
+
+    /// Predicate state
+    bool predicates[kCount];
+
+    //
+    // Mask
+    //
+    CUTLASS_HOST_DEVICE
+    Mask() {
+      enable();  // a freshly constructed mask allows every access
+    }
+
+    /// Efficiently disables all accesses guarded by mask
+    CUTLASS_HOST_DEVICE void clear() {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kCount; ++i) {
+        predicates[i] = false;
+      }
+    }
+
+    /// Efficiently enables all accesses guarded by mask (host+device: the constructor above calls it)
+    CUTLASS_HOST_DEVICE void enable() {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kCount; ++i) {
+        predicates[i] = true;
+      }
+    }
+  };
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Parameters structure containing reference and precomputed state.
+  Params params_;
+
+  /// Byte-level pointer
+  uint8_t *byte_pointer_;
+
+  /// Array of boolean values to contain steady-state predicates
+  Mask mask_;
+
+  /// Extent of the matrix tile in columns
+  Index extent_col_;
+
+  /// Extent of the matrix tile in rows
+  Index extent_row_;
+
+  /// Extent of the matrix tile in pq 
+  Index extent_pq_;
+
+  /// A thread's starting row position (assuming steady-state predicates have
+  /// been computed)
+  Index thread_start_row_;
+
+  /// A thread's starting column position (assuming steady-state predicates have
+  /// been computed)
+  Index thread_start_col_;
+
+  /// Internal iteration counter
+  LongIndex iteration_row_;
+  LongIndex iteration_col_;
+
+  uint32_t pq_mul_;
+
+  uint32_t pq_shr_;
+
+private:
+
+  //
+  // Methods
+  //
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Constructor: records extents, computes row predicates and the thread's starting byte pointer
+  CUTLASS_DEVICE
+  InterleavedConvPredicatedTileIterator(
+    Params const & params,           ///< precomputed strides
+    Element *pointer,                ///< base pointer of the tensor in memory
+    TensorCoord extent,              ///< 4-D extent; n(), h(), w() and c() are read below
+    int thread_idx,                  ///< passed to ThreadMap::initial_offset
+    MatrixCoord threadblock_offset   ///< added to the thread's initial offset
+  ):
+    params_(params) {
+    MatrixCoord thread_offset = ThreadMap::initial_offset(thread_idx) + threadblock_offset;
+    // The 4-D extent is flattened to 2-D: columns = C, rows = N * (H * W)
+    extent_col_ = extent.c();
+    extent_pq_ = extent.h() * extent.w();
+    extent_row_ = extent.n() * extent_pq_;
+    // pq_mul_ / pq_shr_ are later handed to fast_divmod together with extent_pq_ in load()/store()
+    find_divisor(pq_mul_, pq_shr_, extent_pq_);
+
+    thread_start_row_ = thread_offset.row();
+    thread_start_col_ = thread_offset.column();
+
+    // Initialize predicates: one per row iteration, true while the row is inside extent_row_
+    CUTLASS_PRAGMA_UNROLL
+    for (int r = 0; r < ThreadMap::Iterations::kRow; ++r) {
+      mask_.predicates[r] =
+          ((thread_offset.row() + ThreadMap::Delta::kRow * r) < extent_row_);
+    }
+
+    // Initialize pointer: only the column part is applied here; load()/store() add the row part
+    byte_pointer_ = reinterpret_cast<uint8_t *>(pointer) +
+                    ((thread_start_col_ / InterleavedN) * params_.stride_col +
+                     (thread_start_col_ % InterleavedN)) *
+                        sizeof_bits<Element>::value / 8;
+
+    // Initialize internal state counter
+    iteration_row_ = iteration_col_ = 0;
+  }
+
+  /// Adds a pointer offset in units of Element (converted to bytes before it is applied)
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    byte_pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;  // element bits -> bytes
+  }
+
+  /// Loads a fragment from memory for the current (iteration_row_, iteration_col_) position
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    // Guard = column inside extent_col_ AND the row predicate computed by the constructor
+    int col_offset = iteration_col_ * ThreadMap::Delta::kColumn;
+    bool col_guard = ((thread_start_col_ + col_offset) < extent_col_);
+    bool guard = col_guard && mask_.predicates[iteration_row_];
+    // n / pq_rem receive fast_divmod's outputs for the flattened row index and extent_pq_
+    int n, pq_rem;  // presumably quotient and remainder - confirm against fast_divmod's definition
+
+    fast_divmod(n, pq_rem,
+                thread_start_row_ + iteration_row_ * ThreadMap::Delta::kRow,
+                extent_pq_, pq_mul_, pq_shr_);
+    // Row contribution to the address, scaled from element bits to bytes
+    uint8_t *byte_pointer =
+        byte_pointer_ + (n * params_.stride_row + pq_rem * InterleavedN) *
+                            sizeof_bits<Element>::value / 8;
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+    AccessType const *memory_pointer =
+        reinterpret_cast<AccessType const *>(byte_pointer);
+    // `guard` is forwarded to global_load as its last argument
+    cutlass::arch::global_load<
+      AccessType, 
+      sizeof(AccessType)
+    >(
+        *frag_ptr,
+        (void *)memory_pointer,
+        guard);
+  }
+
+  /// Stores a fragment to memory for the current (iteration_row_, iteration_col_) position
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) {
+    // Guard = column inside extent_col_ AND the row predicate computed by the constructor
+    int col_offset = iteration_col_ * ThreadMap::Delta::kColumn;
+    bool col_guard = ((thread_start_col_ + col_offset) < extent_col_);
+    bool guard = col_guard && mask_.predicates[iteration_row_];
+    // n / pq_rem receive fast_divmod's outputs for the flattened row index and extent_pq_
+    int n, pq_rem;  // presumably quotient and remainder - confirm against fast_divmod's definition
+
+    fast_divmod(n, pq_rem,
+                thread_start_row_ + iteration_row_ * ThreadMap::Delta::kRow,
+                extent_pq_, pq_mul_, pq_shr_);
+    // Row contribution to the address, scaled from element bits to bytes
+    uint8_t *byte_pointer =
+        byte_pointer_ + (n * params_.stride_row + pq_rem * InterleavedN) *
+                            sizeof_bits<Element>::value / 8;
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
+    AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer);
+    // `guard` is forwarded to global_store as its last argument
+    cutlass::arch::global_store<AccessType, sizeof(AccessType)>(
+        *frag_ptr, (void *)memory_pointer, guard);
+  }
+
+  /// Overrides the internal iteration index (linear index; the row index varies fastest)
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int iteration) {
+    iteration_row_ = iteration % ThreadMap::Iterations::kRow;
+    iteration_col_ = iteration / ThreadMap::Iterations::kRow;  // byte_pointer_ is not moved here
+  }
+
+  /// Steps to the next access: rows advance fastest, columns advance on row wrap-around.
+  CUTLASS_HOST_DEVICE
+  InterleavedConvPredicatedTileIterator &operator++() {
+
+    // Common case: still walking the rows of the current column.
+    if (++iteration_row_ != ThreadMap::Iterations::kRow) {
+      return *this;
+    }
+
+    // Row index wrapped: restart rows and move the base pointer by params_.stride_col.
+    iteration_row_ = 0;
+    byte_pointer_ += params_.stride_col;
+
+    // The column index wraps back to zero after the last column.
+    bool const last_column = (++iteration_col_ == ThreadMap::Iterations::kColumn);
+    if (last_column) { iteration_col_ = 0; }
+
+    return *this;
+  }
+
+  /// Disables all accesses guarded by the mask (Mask::clear sets every predicate to false)
+  CUTLASS_DEVICE void clear_mask() {
+    mask_.clear();
+  }
+
+  /// Enables all accesses guarded by the mask (Mask::enable sets every predicate to true)
+  CUTLASS_DEVICE void enable_mask() {
+    mask_.enable();
+  }
+
+  /// Gets the mask: copies the iterator's current mask into `mask`
+  CUTLASS_DEVICE void get_mask(Mask &mask) {
+    mask = mask_;
+  }
+
+  /// Sets the mask: replaces the iterator's mask with a copy of `mask`
+  CUTLASS_DEVICE void set_mask(Mask const &mask) {
+    mask_ = mask;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_affine.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_affine.h
new file mode 100755
index 000000000..2b86ac0ea
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_affine.h
@@ -0,0 +1,615 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  
+  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
+
+  The epilogue rearranges the result of a matrix product through shared memory to match canonical
+  tensor layouts in global memory. Epilogues support conversion and reduction operations.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/transform/pitch_linear_thread_map.h"
+#include "cutlass/epilogue/threadblock/output_tile_thread_map.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator_params.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace epilogue {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile iterator used to load and store output tile from global memory in epilogue.
+///
+/// Satisfies: ReadableTileIterator | PredicatedTileIterator | ForwardTileIterator
+///
+/// It provides a fast path for the case Rank = 2 which does not need div/rem to 
+/// calculate modes.
+
+template <
+  typename ThreadMap_,       ///< Thread map (conept: OutputTileThreadMap)
+  typename Element_,         ///< Element data type
+  int Rank
+>
+class PredicatedTileIteratorAffineRankN {
+public:
+  using ThreadMap = ThreadMap_;
+  using Shape = typename ThreadMap::Shape;
+
+  using Element = Element_;
+
+  using Layout = layout::AffineRankN<Rank>;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using ConstTensorRef = typename TensorRef::ConstTensorRef;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
+  static int const kThreads = ThreadMap::kThreads;
+  static int const kIterations = ThreadMap::Count::kTile;
+
+  static_assert( ThreadMap::Iterations::kRow > 0,"ThreadMap::Iterations::kRow must be > 0");
+  static_assert( ThreadMap::Iterations::kGroup > 0,"ThreadMap::Iterations::kGroup must be > 0");
+  static_assert( ThreadMap::Iterations::kCluster > 0,"ThreadMap::Iterations::kCluster must be > 0");
+  static_assert( ThreadMap::Iterations::kColumn > 0,"ThreadMap::Iterations::kColumn must be > 0");
+  static_assert( !(Layout::kRank % 2), 
+    "Layout rank must be even. This assumes the first half of the modes correspond to the 'row' "
+    "and the second half of the modes correspond to the 'column'");
+
+  static bool const kBigEndian = false;
+
+  /// Fragment object
+  using Fragment = Array<
+    Element, 
+    ThreadMap::Iterations::kColumn * 
+    ThreadMap::Iterations::kRow * 
+    ThreadMap::Iterations::kGroup * 
+    ThreadMap::Iterations::kCluster * ThreadMap::kElementsPerAccess>;
+
+  /// Memory access size
+  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
+
+  //
+  // Parameters struct
+  //
+
+  /// Parameters structure: layout, per-mode byte strides, divmod tables and rank-2 increments
+  struct Params {
+
+    //
+    // Data members
+    //
+
+    Layout layout;
+
+    /// Stride in units of bytes along M modes
+    Coord<Layout::kRank/2, typename Layout::LongIndex> stride_m;
+
+    /// Stride in units of bytes along N modes
+    Coord<Layout::kRank/2, typename Layout::LongIndex> stride_n;
+
+    /// Fast divmod objects divided by tensor extents
+    FastDivmod divmod_m[(Layout::kRank == 2) ? 1 : (Layout::kRank/2 - 1)];
+
+    /// Fast divmod objects divided by tensor extents
+    FastDivmod divmod_n[(Layout::kRank == 2) ? 1 : (Layout::kRank/2 - 1)];
+
+    int64_t rank2_inc_col;  ///< ThreadMap::Delta::kColumn * stride_n[0]; read by load/store when kRank == 2
+    int64_t rank2_inc_row;  ///< ThreadMap::Delta::kRow * stride_m[0]; read by load/store when kRank == 2
+
+    //
+    // Methods
+    //
+    CUTLASS_HOST_DEVICE
+    Params() { }  // note: rank2_inc_col / rank2_inc_row are left uninitialized by this overload
+
+    CUTLASS_HOST_DEVICE
+    Params(TensorCoord const &extent, Layout const &layout_): Params(layout_) {
+      // Delegating to Params(layout_) fills layout, stride_m, stride_n and the rank-2
+      // increments (rank2_inc_col / rank2_inc_row). Previously this overload repeated
+      // the stride loop but left the rank-2 increments uninitialized, although
+      // load/store read them whenever Layout::kRank == 2.
+      //
+      // Only the FastDivmod tables depend on the extent; they are built below
+      // (zero iterations when Layout::kRank == 2).
+      if (kBigEndian) {
+        // "Big Endian" scheme
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < Layout::kRank / 2 - 1; ++i) {
+          divmod_m[i] = FastDivmod(extent[i + 1]);
+          divmod_n[i] = FastDivmod(extent[i + Layout::kRank / 2 + 1]);
+        }
+      }
+      else {
+        // "Little Endian" scheme
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < Layout::kRank / 2 - 1; ++i) {
+          divmod_m[i] = FastDivmod(extent[i]);
+          divmod_n[i] = FastDivmod(extent[i + Layout::kRank / 2]);
+        }
+      }
+
+      #if 0
+      //
+      // Debug print statements to verify extents and strides are passed correctly.
+      //
+      printf("PredicatedTileIteratorAffine::Params() entered\n");
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < Layout::kRank; ++i) {
+        printf("  extent[%d]: %d\n", i, extent[i]);
+      }
+      for (int i = 0; i < Layout::kRank; ++i) {
+        printf("  stride[%d]: %ld\n", i, layout_.stride()[i]);
+      }
+      printf("PredicatedTileIteratorAffine::Params() returning\n");
+      #endif
+    }
+
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout_): layout(layout_) {
+      // First half of the layout strides belongs to the M modes, second half to the N modes
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < Layout::kRank / 2; ++i) {
+        stride_m[i] = OffsetBytes<Element>(layout_.stride()[i]);
+        stride_n[i] = OffsetBytes<Element>(layout_.stride()[i + Layout::kRank / 2]);
+      }
+      // Byte steps between consecutive column / row iterations on the rank-2 fast path
+      rank2_inc_col = ThreadMap::Delta::kColumn * stride_n[0];
+      rank2_inc_row = ThreadMap::Delta::kRow * stride_m[0];
+    }
+  };
+
+  /// Mask object: one boolean predicate per column iteration
+  struct Mask {
+
+    static int const kCount = ThreadMap::Iterations::kColumn;
+
+    /// Predicate state
+    bool predicates[kCount];
+
+    //
+    // Mask
+    //
+    CUTLASS_HOST_DEVICE
+    Mask() {
+      enable();  // a freshly constructed mask allows every access
+    }
+
+    /// Efficiently disables all accesses guarded by mask
+    CUTLASS_HOST_DEVICE void clear() {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kCount; ++i) {
+        predicates[i] = false;
+      }
+    }
+
+    /// Efficiently enables all accesses guarded by mask (host+device: the constructor above calls it)
+    CUTLASS_HOST_DEVICE void enable() {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kCount; ++i) {
+        predicates[i] = true;
+      }
+    }
+  };
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Parameters structure containing reference and precomputed state.
+  Params params_;
+
+  /// Byte-level pointer
+  uint8_t *byte_pointer_;
+
+  /// Array of boolean values to contain steady-state predicates
+  Mask mask_;
+
+  /// Extent of the matrix tile in rows
+  Index extent_row_;
+
+  /// Extent of the matrix tile in columns
+  Index extent_col_;
+
+  /// A thread's starting row position (assuming steady-state predicates have been computed)
+  Index thread_start_row_;
+
+  /// A thread's starting column position (assuming steady-state predicates have been computed)
+  Index thread_start_column_;
+
+  /// Internal state counter
+  int state_[3];
+
+  /// Offsets in columns, cached for performance
+  int64_t offset_modes_n_[ThreadMap::Iterations::kColumn];
+ 
+  //
+  // Static asserts about internal strides
+  //
+
+  static_assert(sizeof(extent_row_) == 4, "Expected 32b extents");
+  static_assert(sizeof(thread_start_row_) == 4, "Expected 32b extents");
+
+private:
+
+  //
+  // Methods
+  //
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Constructor: records extents and thread origin; for rank > 2 also caches column predicates/offsets
+  CUTLASS_DEVICE
+  PredicatedTileIteratorAffineRankN(
+    Params const & params,
+    Element *pointer,
+    MatrixCoord extent,
+    int thread_idx,
+    MatrixCoord threadblock_offset = MatrixCoord(),
+    int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
+  ): 
+    params_(params)
+  {
+    // `indices` is not read anywhere in this constructor.
+    MatrixCoord thread_offset = ThreadMap::initial_offset(thread_idx) + threadblock_offset;
+
+    extent_row_ = extent.row();
+    extent_col_ = extent.column();
+
+    thread_start_row_ = thread_offset.row();
+    thread_start_column_ = thread_offset.column();
+    // Rank 2 skips this whole block: load/store then compute column guards and offsets on the fly
+    if (Layout::kRank > 2) {
+      // Initialize predicates
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kColumn; ++c) {
+
+        // 
+        // Compute coordinate and decompose into N modes
+        //
+
+        int coord_n = thread_start_column_ + c * ThreadMap::Delta::kColumn;
+        // Column predicate: true while this column lies inside the extent
+        mask_.predicates[c] = coord_n < extent.column();
+        
+        Coord<Layout::kRank / 2, Index> modes_n;
+
+        int64_t offset_modes_n = 0;
+        // Both branches take the dot product of the decomposed modes with the N strides
+        if (kBigEndian) {
+          modes_n = CoordinateDecomposition<Layout::kRank / 2>(coord_n, params_.divmod_n);
+
+          offset_modes_n = dot(modes_n, params_.stride_n);
+        }
+        else {
+          modes_n = CoordinateDecompositionLittleEndian<Layout::kRank / 2>(coord_n, params_.divmod_n);
+
+          offset_modes_n = dot(modes_n, params_.stride_n);
+        }
+        // Cache the per-column byte offset so load/store need not recompute it
+        offset_modes_n_[c] = offset_modes_n;
+
+      }
+      // A null base pointer disables every access (this check only exists on the rank > 2 path)
+      if (!pointer) {
+        mask_.clear();
+      }
+    }
+
+    // Initialize pointer: the base pointer is stored as-is; thread offsets are applied per access
+    byte_pointer_ = reinterpret_cast<uint8_t *>(pointer);
+
+    // Initialize internal state counter
+    state_[0] = state_[1] = state_[2] = 0;
+  }
+
+  /// Adds a pointer offset in units of Element (converted to bytes before it is applied)
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    byte_pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;  // element bits -> bytes
+  }
+
+  /// Loads a whole fragment from memory; `byte_offset` is added to every access address
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment &frag, int64_t byte_offset) {
+    uint8_t const *byte_pointer = byte_pointer_;
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+    // Loop nest: cluster -> group -> row -> column, one guarded AccessType load per innermost step
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+        // First row of this (cluster, group) and its byte offset (used directly only when kRank == 2)
+        int row_begin = thread_start_row_ + group * ThreadMap::Delta::kGroup + cluster * ThreadMap::Delta::kCluster;
+        int64_t offset_modes_m = row_begin * params_.stride_m[0];
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+          // Linearized (cluster, group, row) index into the fragment
+          int frag_row_idx = 
+            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
+
+          // 
+          // Compute coordinate and decompose into M modes
+          //
+
+          int coord_m = row * ThreadMap::Delta::kRow + row_begin;
+
+          Coord<Layout::kRank / 2, Index> modes_m;
+          // Rank > 2: recompute the M offset from the decomposed modes for every row
+          if (Layout::kRank > 2) {
+            if (kBigEndian) {
+              modes_m = CoordinateDecomposition<Layout::kRank / 2>(coord_m, params_.divmod_m);
+            } else {
+              modes_m = CoordinateDecompositionLittleEndian<Layout::kRank / 2>(coord_m, params_.divmod_m);
+            }
+
+            offset_modes_m = dot(modes_m, params_.stride_m);
+          }
+
+          //
+          // Row predicate and the starting column byte offset (the latter is kept only when kRank == 2)
+          //
+
+          bool row_guard = (coord_m < extent_row_);
+          int64_t offset_modes_n = thread_start_column_ * params_.stride_n[0];
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
+
+            // 
+            // Rank > 2: reuse the N-mode byte offset cached by the constructor
+            //
+            
+            if (Layout::kRank > 2) {
+              offset_modes_n = offset_modes_n_[column];
+            }
+
+            //
+            // Compute the pointer and access
+            //
+            bool guard;
+
+            if (Layout::kRank > 2) {
+              guard = row_guard && mask_.predicates[column];
+            } else {  // rank-2 path: bounds are checked directly; mask_ is not consulted
+              guard = (coord_m < extent_row_) && 
+              ((thread_start_column_ + ThreadMap::Delta::kColumn * column) < extent_col_);
+            }
+
+            cutlass::arch::global_load<
+              AccessType, 
+              sizeof(AccessType)
+            >(
+              frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column],
+              (void *)(byte_pointer + offset_modes_m + offset_modes_n + byte_offset),
+              guard
+            );
+            // Rank 2: step to the next column using the increment stored in Params
+            if (Layout::kRank == 2) {
+              offset_modes_n += params_.rank2_inc_col;
+            }
+          }
+          // Rank 2: step to the next row using the increment stored in Params
+          if (Layout::kRank == 2) {
+            offset_modes_m += params_.rank2_inc_row;
+          }
+        }
+      }
+    }
+  }
+
+  /// Loads a fragment from memory (load_with_byte_offset with a zero byte offset)
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+
+    load_with_byte_offset(frag, 0);
+  }
+
+  /// Stores a whole fragment to memory; `byte_offset` is added to every access address
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const &frag, int64_t byte_offset) {
+    uint8_t *byte_pointer = byte_pointer_;
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
+    // Loop nest: cluster -> group -> row -> column, one guarded AccessType store per innermost step
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+        // First row of this (cluster, group) and its byte offset (used directly only when kRank == 2)
+        int row_begin = thread_start_row_ + group * ThreadMap::Delta::kGroup + cluster * ThreadMap::Delta::kCluster;
+        int64_t offset_modes_m = row_begin * params_.stride_m[0];
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+          // Linearized (cluster, group, row) index into the fragment
+          int frag_row_idx = 
+            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
+
+          // 
+          // Compute coordinate and decompose into M modes
+          //
+
+          int coord_m = row * ThreadMap::Delta::kRow + row_begin;
+
+          Coord<Layout::kRank / 2, Index> modes_m;
+          // Rank > 2: recompute the M offset from the decomposed modes for every row
+          if (Layout::kRank > 2) {
+            if (kBigEndian) {
+              modes_m = CoordinateDecomposition<Layout::kRank / 2>(coord_m, params_.divmod_m);
+            } else {
+              modes_m = CoordinateDecompositionLittleEndian<Layout::kRank / 2>(coord_m, params_.divmod_m);
+            }
+
+            offset_modes_m = dot(modes_m, params_.stride_m);
+          }
+
+          //
+          // Row predicate and the starting column byte offset (the latter is kept only when kRank == 2)
+          //
+
+          bool row_guard = (coord_m < extent_row_);
+          int64_t offset_modes_n = thread_start_column_ * params_.stride_n[0];
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
+
+            // 
+            // Rank > 2: reuse the N-mode byte offset cached by the constructor
+            //
+            
+            if (Layout::kRank > 2) {
+              offset_modes_n = offset_modes_n_[column];
+            } 
+
+            //
+            // Compute the pointer and access
+            //
+            bool guard;
+            if (Layout::kRank > 2) {            
+              guard = row_guard && mask_.predicates[column];
+            } else {  // rank-2 path: bounds are checked directly; mask_ is not consulted
+              guard = (coord_m < extent_row_) && ((thread_start_column_ + ThreadMap::Delta::kColumn * column) < extent_col_);
+            }
+
+            cutlass::arch::global_store<AccessType, sizeof(AccessType)>(
+                frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column],
+                (void *)(byte_pointer + offset_modes_m + offset_modes_n + byte_offset),
+                guard);
+            // Rank 2: step to the next column using the increment stored in Params
+            if (Layout::kRank == 2) {
+              offset_modes_n += params_.rank2_inc_col;
+            }
+          }
+          // Rank 2: step to the next row using the increment stored in Params
+          if (Layout::kRank == 2) {
+            offset_modes_m += params_.rank2_inc_row;
+          }
+        }
+      }
+    }
+  }
+
+  /// Stores a fragment to memory (store_with_byte_offset with a zero byte offset)
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) {
+
+    store_with_byte_offset(frag, 0);
+  }
+
+  /// Moves the iterator to the next tile position (row, then group, then cluster counters).
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorAffineRankN &operator++() {
+
+    // Level 0 (row counter): every step moves the starting row by ThreadMap::Shape::kRow.
+    thread_start_row_ += ThreadMap::Shape::kRow;
+
+    if (++state_[0] != ThreadMap::Count::kRow) {
+      return *this;
+    }
+
+    // Level 1 (group counter): the row counter wrapped.
+    state_[0] = 0;
+
+    thread_start_row_ +=
+      (ThreadMap::Shape::kGroup - 1) * ThreadMap::Shape::kRow * ThreadMap::Count::kRow;
+
+    if (++state_[1] != ThreadMap::Count::kGroup) {
+      return *this;
+    }
+
+    // Level 2 (cluster counter): the group counter wrapped as well.
+    state_[1] = 0;
+
+    thread_start_row_ += ThreadMap::Count::kGroup *
+      ThreadMap::Shape::kGroup * ThreadMap::Count::kRow * ThreadMap::Shape::kRow;
+
+    bool const clusters_done = (++state_[2] == ThreadMap::Count::kCluster);
+    if (clusters_done) { state_[2] = 0; }
+    return *this;
+  }
+
+  /// Disables all accesses guarded by the mask (Mask::clear sets every predicate to false)
+  CUTLASS_DEVICE void clear_mask() {
+    mask_.clear();
+  }
+
+  /// Enables all accesses guarded by the mask (Mask::enable sets every predicate to true)
+  CUTLASS_DEVICE void enable_mask() {
+    mask_.enable();
+  }
+
+  /// Gets the mask: copies the iterator's current mask into `mask`
+  CUTLASS_DEVICE void get_mask(Mask &mask) {
+    mask = mask_;
+  }
+
+  /// Sets the mask: replaces the iterator's mask with a copy of `mask`
+  CUTLASS_DEVICE void set_mask(Mask const &mask) {
+    mask_ = mask;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_affine_layout_params.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_affine_layout_params.h
new file mode 100755
index 000000000..7f7f17b5a
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_affine_layout_params.h
@@ -0,0 +1,156 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Parameters structure for epilogue predicated tile iterators over layout::AffineRankN tensors.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/fast_math.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  int Rank
+>
+struct PredicatedTileIteratorAffineLayoutRankNParams {
+  using Layout = layout::AffineRankN<Rank>;
+  using TensorCoord = typename Layout::TensorCoord;
+  /// Compile-time switch choosing which extents feed divmod_m / divmod_n in the extent-taking constructor
+  static bool const kBigEndian = false;
+  
+  //
+  // Data members
+  //
+  /// Layout object, copied from the constructor argument by the two non-default constructors
+  Layout layout;
+
+  /// Stride in units of bytes along M modes
+  Coord<Layout::kRank/2, typename Layout::LongIndex> stride_m;
+
+  /// Stride in units of bytes along N modes
+  Coord<Layout::kRank/2, typename Layout::LongIndex> stride_n;
+
+  /// Fast divmod objects built from the M-mode tensor extents (a single placeholder entry when kRank == 2)
+  FastDivmod divmod_m[(Layout::kRank == 2) ? 1 : (Layout::kRank/2 - 1)];
+
+  /// Fast divmod objects built from the N-mode tensor extents (a single placeholder entry when kRank == 2)
+  FastDivmod divmod_n[(Layout::kRank == 2) ? 1 : (Layout::kRank/2 - 1)];
+  /// Byte increments per thread-map column / row delta; assigned only by the constructor taking the deltas
+  int64_t rank2_inc_col;
+  int64_t rank2_inc_row;
+
+  //
+  // Methods
+  //
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorAffineLayoutRankNParams() { }
+  /// Builds byte strides and the FastDivmod tables from `extent`; rank2_inc_col / rank2_inc_row are not assigned
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorAffineLayoutRankNParams(TensorCoord const &extent, 
+                                                Layout const &layout_,
+                                                int64_t element_sizeof_bits)
+  : layout(layout_) 
+  {
+    // OffsetBytes() converts each layout stride: the first kRank/2 modes go to stride_m, the rest to stride_n.
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < Layout::kRank / 2; ++i) {
+      stride_m[i] = OffsetBytes(layout_.stride()[i], element_sizeof_bits);
+      stride_n[i] = OffsetBytes(layout_.stride()[i + Layout::kRank / 2], element_sizeof_bits);
+    }
+    // kBigEndian is a constant false, so only the "Little Endian" branch below takes effect.
+    if (kBigEndian) {
+      // "Big Endian" scheme
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < Layout::kRank / 2 - 1; ++i) {
+        divmod_m[i] = FastDivmod(extent[i + 1]);
+        divmod_n[i] = FastDivmod(extent[i + Layout::kRank / 2 + 1]);
+      }
+    }
+    else {
+      // "Little Endian" scheme
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < Layout::kRank / 2 - 1; ++i) {
+        divmod_m[i] = FastDivmod(extent[i]);
+        divmod_n[i] = FastDivmod(extent[i + Layout::kRank / 2]);
+      }
+    }
+    // The block below is compiled out by `#if 0`; it only prints extents and strides.
+    #if 0
+    //
+    // Debug print statements to verify extents and strides are passed correctly.
+    //
+    printf("PredicatedTileIteratorAffine::Params() entered\n");
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < Layout::kRank; ++i) {
+      printf("  extent[%d]: %d\n", i, extent[i]);
+    }
+    for (int i = 0; i < Layout::kRank; ++i) {
+      printf("  stride[%d]: %ld\n", i, layout_.stride()[i]);
+    }
+    printf("PredicatedTileIteratorAffine::Params() returning\n");
+    #endif
+  }
+  /// Builds byte strides and rank2_inc_col / rank2_inc_row from the thread-map deltas; divmod_m / divmod_n are not assigned
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorAffineLayoutRankNParams(Layout const &layout_,
+                                                int32_t threadmap_delta_kColumn,
+                                                int32_t threadmap_delta_kRow,
+                                                int64_t element_sizeof_bits)
+  : layout(layout_) 
+  {
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < Layout::kRank / 2; ++i) {
+      stride_m[i] = OffsetBytes(layout_.stride()[i], element_sizeof_bits);
+      stride_n[i] = OffsetBytes(layout_.stride()[i + Layout::kRank / 2], element_sizeof_bits);
+    }
+    // Pre-multiply the first N-mode / M-mode byte stride by the thread-map column / row delta.
+    rank2_inc_col = threadmap_delta_kColumn * stride_n[0];
+    rank2_inc_row = threadmap_delta_kRow * stride_m[0];
+  }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_blas3.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_blas3.h
new file mode 100755
index 000000000..c2583674c
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_blas3.h
@@ -0,0 +1,633 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
+
+  The epilogue rearranges the result of a matrix product through shared memory to match canonical
+  tensor layouts in global memory. Epilogues support conversion and reduction operations.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/transform/pitch_linear_thread_map.h"
+#include "cutlass/epilogue/threadblock/output_tile_thread_map.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator_params.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace epilogue {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile iterator used to load and store output tile from global memory in epilogue.
+///
+/// Satisfies: ReadableTileIterator | PredicatedTileIterator | ForwardTileIterator
+///
+template <
+  typename ThreadMap_,                     ///< Thread map (concept: OutputTileThreadMap)
+  typename Element_,                        ///< Element data type
+  BlasMode BlasMode_ = BlasMode::kGemm   ///< Tile Iterator for a Symmetric or Hermitian Kernel
+>
+class PredicatedTileIteratorBlas3 {
+public:
+  using ThreadMap = ThreadMap_;
+  using Shape = typename ThreadMap::Shape;
+
+  using Element = Element_;
+
+  using Layout = layout::RowMajor;
+  using TensorRef = TensorRef<Element, Layout>;
+  using ConstTensorRef = typename TensorRef::ConstTensorRef;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorCoord = MatrixCoord;
+
+  static BlasMode const kBlasMode = BlasMode_;
+
+  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
+  static int const kThreads = ThreadMap::kThreads;
+  static int const kIterations = ThreadMap::Count::kTile;
+
+  static_assert( ThreadMap::Iterations::kRow > 0,"ThreadMap::Iterations::kRow must be > 0");
+  static_assert( ThreadMap::Iterations::kGroup > 0,"ThreadMap::Iterations::kGroup must be > 0");
+  static_assert( ThreadMap::Iterations::kCluster > 0,"ThreadMap::Iterations::kCluster must be > 0");
+  static_assert( ThreadMap::Iterations::kColumn > 0,"ThreadMap::Iterations::kColumn must be > 0");
+
+  /// Fragment object
+  using Fragment = Array<
+    Element, 
+    ThreadMap::Iterations::kColumn * 
+    ThreadMap::Iterations::kRow * 
+    ThreadMap::Iterations::kGroup * 
+    ThreadMap::Iterations::kCluster * ThreadMap::kElementsPerAccess>;
+
+  /// Memory access size
+  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
+  static_assert( AccessType::kElements == 1, "BLAS3 Epilogue must use AccessType::kElements as 1");
+
+  //
+  // Parameters struct
+  //
+
+  /// Parameters object; derives from the non-template PredicatedTileIteratorParams
+  struct Params : PredicatedTileIteratorParams {
+    /// Default constructor: adds nothing beyond default-constructing the base class.
+    CUTLASS_HOST_DEVICE
+    Params() { }
+    /// Forwards layout.stride(0) scaled by sizeof(AccessType) / kElementsPerAccess, plus the ThreadMap descriptor, to the base.
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout): 
+      PredicatedTileIteratorParams(
+        layout.stride(0) * int(sizeof(AccessType)) / kElementsPerAccess,
+        make_OutputTileThreadMapDesc<ThreadMap>()
+      ) 
+    {
+        
+    }
+  };
+
+  /// Mask object: one boolean predicate per column iteration of the thread map
+  struct Mask {
+
+    static int const kCount = ThreadMap::Iterations::kColumn;
+
+    /// Predicate state
+    bool predicates[kCount];
+
+    //
+    // Mask
+    //
+    CUTLASS_HOST_DEVICE
+    Mask() {
+      enable();
+    }
+
+    /// Disables all accesses guarded by mask (sets every predicate to false)
+    CUTLASS_HOST_DEVICE void clear() {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kCount; ++i) {
+        predicates[i] = false;
+      }
+    }
+
+    /// Enables all accesses guarded by mask; host/device because the host/device constructor calls it
+    CUTLASS_HOST_DEVICE void enable() {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kCount; ++i) {
+        predicates[i] = true;
+      }
+    }
+  };
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Parameters structure containing reference and precomputed state.
+  PredicatedTileIteratorParams params_;
+
+  /// Byte-level pointer
+  uint8_t *byte_pointer_;
+
+  /// Fill Mode for a tile on diagonal of a symmetric kernel
+  cutlass::FillMode fill_mode;
+
+  /// Array of boolean values to contain steady-state predicates
+  Mask mask_;
+
+  /// Extent of the matrix tile in rows
+  Index extent_row_;
+
+  /// A thread's starting row position (assuming steady-state predicates have been computed)
+  Index thread_start_row_;
+
+  /// Internal state counter
+  int state_[3];
+
+  /// Starting address of the matrix  
+  size_t matrix_start_addr; 
+ 
+  static_assert((kBlasMode == BlasMode::kSymmetric || kBlasMode == BlasMode::kHermitian), 
+    "Unsupported blas3 mode.");
+
+private:
+
+  //
+  // Methods
+  //
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Constructor: stores params and fill mode, computes column predicates and this thread's starting byte pointer
+  CUTLASS_DEVICE
+  PredicatedTileIteratorBlas3(
+    PredicatedTileIteratorParams const & params,
+    Element *pointer,
+    TensorCoord extent,
+    int thread_idx,
+    TensorCoord threadblock_offset
+    , cutlass::FillMode fill_mode
+  ): 
+    params_(params), fill_mode(fill_mode)
+  {
+    // This thread's first access coordinate: its ThreadMap offset plus the threadblock offset.
+    TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx) + threadblock_offset;
+
+    extent_row_ = extent.row();
+    thread_start_row_ = thread_offset.row();
+
+    // Initialize predicates: a column iteration is enabled only if it lies inside extent.column()
+    CUTLASS_PRAGMA_UNROLL
+    for (int c = 0; c < ThreadMap::Iterations::kColumn; ++c) {
+
+      mask_.predicates[c] = ((thread_offset.column() 
+        + ThreadMap::Delta::kColumn * c) < extent.column());
+    }
+
+    // Check Symmetric kernel modes (Lower and Upper - for diagonal CTAs, None for rest CTAs)
+    if ((kBlasMode == BlasMode::kSymmetric || kBlasMode == BlasMode::kHermitian) && 
+        fill_mode == cutlass::FillMode::kInvalid) {
+      arch::device_breakpoint();
+    }
+
+    // Starting address of the matrix, kept so the symmetric load/store paths can recover row and column indices
+    matrix_start_addr =  reinterpret_cast<size_t>(pointer); 
+
+    // Initialize pointer: row offset scaled by the byte stride, column offset scaled by the per-element access size
+    byte_pointer_ = reinterpret_cast<uint8_t *>(pointer) + 
+      LongIndex(thread_offset.row()) * LongIndex(params_.stride) + 
+      LongIndex(thread_offset.column()) * sizeof(AccessType) / kElementsPerAccess;
+
+    // Initialize internal state counter
+    state_[0] = state_[1] = state_[2] = 0;
+  }
+
+  /// Adds a pointer offset in units of Element (converted to bytes via sizeof_bits<Element>::value / 8)
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    byte_pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+
+  /// Loads a fragment from memory at the current position plus `byte_offset`, guarded by row extent and column mask
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment &frag, int64_t byte_offset) {
+    // Walk a local copy of the pointer; byte_pointer_ itself is not modified here.
+    uint8_t *byte_pointer = byte_pointer_;
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+          // Linearized (cluster, group, row) index used to address the fragment.
+          int frag_row_idx = 
+            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
+          // Row displacement of this access relative to thread_start_row_.
+          int row_offset = row * ThreadMap::Delta::kRow 
+            + group * ThreadMap::Delta::kGroup 
+            + cluster * ThreadMap::Delta::kCluster;
+          // Rows at or beyond extent_row_ are predicated off.
+          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
+
+          AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset);
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
+            // The access is enabled only when both the row guard and the column predicate hold.
+            bool guard = row_guard && mask_.predicates[column];
+
+            cutlass::arch::global_load<
+              AccessType, 
+              sizeof(AccessType)
+            >(
+                frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn +
+                         column],
+                (void *)&memory_pointer[column * ThreadMap::Delta::kColumn /
+                                        kElementsPerAccess],
+                guard);
+          }
+          // Step to the next row, except after the last row of this group.
+          if (row + 1 < ThreadMap::Iterations::kRow) {
+            byte_pointer += params_.increment_row;
+          }
+        }
+
+        if (group + 1 < ThreadMap::Iterations::kGroup) {
+          byte_pointer += params_.increment_group;
+        }
+      }
+
+      if (cluster + 1 < ThreadMap::Iterations::kCluster) {
+        byte_pointer += params_.increment_cluster;
+      }
+    }
+  }
+
+  /// Loads a fragment from memory, restricted to the triangle selected by fill_mode (diagonal included)
+  CUTLASS_DEVICE
+  void load_symmetric_with_byte_offset(Fragment &frag, int64_t byte_offset) {
+    // Same traversal as load_with_byte_offset(), plus a per-access row/column comparison.
+    uint8_t *byte_pointer = byte_pointer_;
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    bool isLowerMode = (fill_mode == cutlass::FillMode::kLower) ? true : false;
+    // Any fill mode other than kLower takes the row_index <= col_index branch below.
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+
+          int frag_row_idx = 
+            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
+
+          int row_offset = row * ThreadMap::Delta::kRow 
+            + group * ThreadMap::Delta::kGroup 
+            + cluster * ThreadMap::Delta::kCluster;
+
+          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
+
+          AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset);
+
+          // Byte offset of this thread's row access from the beginning of the matrix
+          size_t row_start_offset = (size_t)memory_pointer - matrix_start_addr;
+
+          // Absolute row index (byte offset divided by the row stride in bytes)
+          int row_index = int(row_start_offset/params_.stride);
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
+
+            bool guard = row_guard && mask_.predicates[column];
+
+            // Byte offset of this column access from the beginning of the matrix
+            size_t col_start_offset = row_start_offset + 
+                        (column * ThreadMap::Delta::kColumn / kElementsPerAccess) * sizeof(AccessType);
+
+            // Absolute column index (byte offset within the row divided by the access size)
+            size_t col_index = (col_start_offset%params_.stride)/sizeof(AccessType);
+            guard = guard && ( (isLowerMode && row_index >= col_index) ||
+                               (!isLowerMode && row_index <= col_index) );
+
+            cutlass::arch::global_load<
+              AccessType, 
+              sizeof(AccessType)
+            >(
+                frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn +
+                         column],
+                (void *)&memory_pointer[column * ThreadMap::Delta::kColumn /
+                                        kElementsPerAccess],
+                guard);
+
+            // The imaginary parts of the diagonal elements of a complex element are assumed and set to zero
+            if (guard && kBlasMode == BlasMode::kHermitian && cutlass::is_complex<Element>::value) {
+              Element *scalar_ptr = reinterpret_cast<Element *>(frag_ptr);
+
+              if (row_index == col_index) {
+                scalar_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column] = 
+                  real(scalar_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column]);
+              }
+            }
+          }
+
+          if (row + 1 < ThreadMap::Iterations::kRow) {
+            byte_pointer += params_.increment_row;
+          }
+        }
+
+        if (group + 1 < ThreadMap::Iterations::kGroup) {
+          byte_pointer += params_.increment_group;
+        }
+      }
+
+      if (cluster + 1 < ThreadMap::Iterations::kCluster) {
+        byte_pointer += params_.increment_cluster;
+      }
+    }
+  }
+
+  /// Loads a fragment from memory, dispatching on fill_mode
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    // kNone takes the unrestricted path; every other fill mode takes the symmetric (triangular) path.
+    if (fill_mode == cutlass::FillMode::kNone) {
+      load_with_byte_offset(frag, 0);
+    }
+    else {
+      load_symmetric_with_byte_offset(frag, 0);
+    }
+  }
+
+  /// Stores a fragment to memory at the current position plus `byte_offset`, guarded by row extent and column mask
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const &frag, int64_t byte_offset) {
+    uint8_t *byte_pointer = byte_pointer_;
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
+    // byte_pointer is a local copy; byte_pointer_ itself is not modified here.
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+          // Linearized (cluster, group, row) index used to address the fragment.
+          int frag_row_idx = 
+            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
+          // Row displacement of this access relative to thread_start_row_.
+          int row_offset = row * ThreadMap::Delta::kRow 
+            + group * ThreadMap::Delta::kGroup 
+            + cluster * ThreadMap::Delta::kCluster;
+          // Rows at or beyond extent_row_ are predicated off.
+          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
+
+          AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset);
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
+            // The access is enabled only when both the row guard and the column predicate hold.
+            bool guard = row_guard && mask_.predicates[column];
+
+            cutlass::arch::global_store<AccessType, sizeof(AccessType)>(
+                frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column],
+                (void *)&memory_pointer[column * ThreadMap::Delta::kColumn / kElementsPerAccess],
+                guard);
+          }
+          // Step to the next row, except after the last row of this group.
+          if (row + 1 < ThreadMap::Iterations::kRow) {
+            byte_pointer += params_.increment_row;
+          }
+        }
+
+        if (group + 1 < ThreadMap::Iterations::kGroup) {
+          byte_pointer += params_.increment_group;
+        }
+      }
+
+      if (cluster + 1 < ThreadMap::Iterations::kCluster) {
+        byte_pointer += params_.increment_cluster;
+      }
+    }
+  }
+
+  /// Stores a fragment to memory, restricted to the triangle selected by fill_mode (diagonal included)
+  CUTLASS_DEVICE
+  void store_symmetric_with_byte_offset(Fragment const &frag, int64_t byte_offset) {
+    uint8_t *byte_pointer = byte_pointer_;
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
+
+    bool isLowerMode = (fill_mode == cutlass::FillMode::kLower) ? true : false;
+    // Any fill mode other than kLower takes the row_index <= col_index branch below.
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+
+          int frag_row_idx = 
+            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
+
+          int row_offset = row * ThreadMap::Delta::kRow 
+            + group * ThreadMap::Delta::kGroup 
+            + cluster * ThreadMap::Delta::kCluster;
+
+          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
+
+          AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset);
+
+          // Byte offset of this thread's row access from the beginning of the matrix
+          size_t row_start_offset = (size_t)memory_pointer - matrix_start_addr;
+
+          // Absolute row index (byte offset divided by the row stride in bytes)
+          int row_index = int(row_start_offset/params_.stride);
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
+
+            bool guard = row_guard && mask_.predicates[column];
+
+            // Byte offset of this column access from the beginning of the matrix
+            size_t col_start_offset = row_start_offset + 
+                        (column * ThreadMap::Delta::kColumn / kElementsPerAccess) * sizeof(AccessType);
+
+            // Absolute column index (byte offset within the row divided by the access size)
+            size_t col_index = (col_start_offset%params_.stride)/sizeof(AccessType);
+
+            guard = guard && ( (isLowerMode && row_index >= col_index) ||
+                               (!isLowerMode && row_index <= col_index) );
+
+            // The imaginary parts of the diagonal elements of a complex element are assumed and set to zero
+            if (guard && kBlasMode == BlasMode::kHermitian && cutlass::is_complex<Element>::value) {
+              // NOTE(review): casts away const and edits the caller's fragment in place - confirm no caller passes a truly const object.
+              AccessType *frag_ptr_modify = const_cast<AccessType *>(frag_ptr);
+              Element *scalar_ptr = reinterpret_cast<Element *>(frag_ptr_modify);
+
+              if (row_index == col_index) {
+                scalar_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column] = 
+                  real(scalar_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column]);
+              }
+            }
+
+            cutlass::arch::global_store<AccessType, sizeof(AccessType)>(
+                frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn +
+                         column],
+                (void *)&memory_pointer[column * ThreadMap::Delta::kColumn /
+                                        kElementsPerAccess],
+                guard);
+          }
+
+          if (row + 1 < ThreadMap::Iterations::kRow) {
+            byte_pointer += params_.increment_row;
+          }
+        }
+
+        if (group + 1 < ThreadMap::Iterations::kGroup) {
+          byte_pointer += params_.increment_group;
+        }
+      }
+
+      if (cluster + 1 < ThreadMap::Iterations::kCluster) {
+        byte_pointer += params_.increment_cluster;
+      }
+    }
+  }
+
+  /// Stores a fragment to memory, dispatching on fill_mode
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) {
+    // kNone takes the unrestricted path; every other fill mode takes the symmetric (triangular) path.
+    if (fill_mode == cutlass::FillMode::kNone) {
+      store_with_byte_offset(frag, 0);
+    }
+    else {
+      store_symmetric_with_byte_offset(frag, 0); 
+    }
+
+  }
+
+  /// Advances the byte pointer and row bookkeeping to the next position to load or store
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorBlas3 &operator++() {
+    // state_[0], state_[1], state_[2] count row, group and cluster steps; each wraps at its ThreadMap::Count.
+    ++state_[0];
+    byte_pointer_ += params_.advance_row;
+    thread_start_row_ += ThreadMap::Shape::kRow;
+    
+    if (state_[0] == ThreadMap::Count::kRow) {
+      // Row counter wrapped: reset it, carry into the group counter and apply the group advance.
+      state_[0] = 0;
+      ++state_[1];
+      byte_pointer_ += params_.advance_group;
+
+      thread_start_row_ += (ThreadMap::Shape::kGroup - 1) * 
+        ThreadMap::Shape::kRow * ThreadMap::Count::kRow;
+
+      if (state_[1] == ThreadMap::Count::kGroup) {
+        // Group counter wrapped: reset it, carry into the cluster counter and apply the cluster advance.
+        state_[1] = 0;
+        ++state_[2];
+        byte_pointer_ += params_.advance_cluster;
+
+        thread_start_row_ += ThreadMap::Count::kGroup * 
+          ThreadMap::Shape::kGroup * ThreadMap::Count::kRow * ThreadMap::Shape::kRow;
+
+        if (state_[2] == ThreadMap::Count::kCluster) {
+          state_[2] = 0;
+          byte_pointer_ += params_.advance_tile;
+        }
+      }
+    }
+
+    return *this;
+  }
+
+  /// Disables all accesses guarded by the mask (delegates to mask_.clear())
+  CUTLASS_DEVICE void clear_mask() {
+    mask_.clear();
+  }
+
+  /// Enables all accesses guarded by the mask (delegates to mask_.enable())
+  CUTLASS_DEVICE void enable_mask() {
+    mask_.enable();
+  }
+
+  /// Gets the mask: copies the iterator's current mask into the caller-provided `mask`
+  CUTLASS_DEVICE void get_mask(Mask &mask) {
+    mask = mask_;
+  }
+
+  /// Sets the mask: replaces the iterator's mask with a copy of `mask`
+  CUTLASS_DEVICE void set_mask(Mask const &mask) {
+    mask_ = mask;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_conv.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_conv.h
new file mode 100755
index 000000000..a59437c09
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_conv.h
@@ -0,0 +1,562 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
+
+  The epilogue rearranges the result of a matrix product through shared memory to match canonical
+  tensor layouts in global memory. Epilogues support conversion and reduction operations.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/permute.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/transform/pitch_linear_thread_map.h"
+#include "cutlass/epilogue/threadblock/output_tile_thread_map.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator_params.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+#include "cutlass/conv/conv3d_problem_size.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace epilogue {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile iterator used to load and store the output tile from global memory in the epilogue.
+/// Row indices are decomposed into tensor coordinates (TensorNHWC if Rank == 4, else TensorNDHWC).
+/// Satisfies: ReadableTileIterator | PredicatedTileIteratorConv | ForwardTileIterator
+///
+template <
+  typename ThreadMap_,       ///< Thread map (concept: OutputTileThreadMap)
+  typename Element_,         ///< Element data type
+  bool ScatterD = false,     ///< Scatter D operand or not (not referenced in this class body)
+  typename PermuteDLayout = layout::NoPermute, ///< Permute D operand or not
+  bool UseCUDAStore = false, ///< Store with a guarded plain assignment instead of arch::global_store
+  int Rank = 4               ///< Tensor rank: 4 selects TensorNHWC, any other value TensorNDHWC
+>
+class PredicatedTileIteratorConv {
+public:
+  using ThreadMap = ThreadMap_;
+  using Shape = typename ThreadMap::Shape;
+
+  using Element = Element_;
+  /// Rank of the output tensor; selects the global-memory layout below.
+  static int const kRank = Rank;
+  using Layout = typename platform::conditional<kRank == 4,
+                                       layout::TensorNHWC,
+                                       layout::TensorNDHWC>::type;
+
+  using Stride = typename Layout::Stride;
+  static int const kStrideRank = Layout::kStrideRank;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using ConstTensorRef = typename TensorRef::ConstTensorRef;
+  /// The iterator itself is addressed with row-major (row, column) coordinates.
+  using MappedLayout = layout::RowMajor;
+  using Index = typename MappedLayout::Index;
+  using LongIndex = typename MappedLayout::LongIndex;
+  using TensorCoord = typename MappedLayout::TensorCoord;
+
+  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
+  static int const kThreads = ThreadMap::kThreads;
+  static int const kIterations = ThreadMap::Count::kTile;
+  /// True when PermuteDLayout is not a trivial permutation.
+  static bool constexpr PermuteD = !layout::is_trivial_permute<PermuteDLayout>;
+
+  static_assert( ThreadMap::Iterations::kRow > 0,"ThreadMap::Iterations::kRow must be > 0");
+  static_assert( ThreadMap::Iterations::kGroup > 0,"ThreadMap::Iterations::kGroup must be > 0");
+  static_assert( ThreadMap::Iterations::kCluster > 0,"ThreadMap::Iterations::kCluster must be > 0");
+  static_assert( ThreadMap::Iterations::kColumn > 0,"ThreadMap::Iterations::kColumn must be > 0");
+
+  /// Fragment object: one AccessType-sized vector per (cluster, group, row, column) iteration
+  using Fragment = Array<
+    Element,
+    ThreadMap::Iterations::kColumn *
+    ThreadMap::Iterations::kRow *
+    ThreadMap::Iterations::kGroup *
+    ThreadMap::Iterations::kCluster * ThreadMap::kElementsPerAccess>;
+
+  /// Memory access size: the vector of kElementsPerAccess elements moved by each load/store
+  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
+
+  //
+  // Parameters struct
+  //
+
+  /// Iterator parameters: the non-template base params plus divmod helpers and tensor strides.
+  struct Params : PredicatedTileIteratorParams {
+    using Base = PredicatedTileIteratorParams;
+
+    /// Fast divmod objects, one per tensor extent used to decompose a linear row index
+    FastDivmod divmod[kStrideRank - 1];
+    Stride tensor_stride;
+    /// Default constructor; sets nothing explicitly.
+    CUTLASS_HOST_DEVICE
+    Params() { }
+    /// Rank-4 construction: divides by tensor_extent[2] then tensor_extent[1].
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout, cutlass::Tensor4DCoord const &tensor_extent):
+      PredicatedTileIteratorParams(
+        layout.stride()[0] * int(sizeof(AccessType)) / kElementsPerAccess,
+        make_OutputTileThreadMapDesc<ThreadMap>()
+      ) {
+      divmod[0] = FastDivmod(tensor_extent[2] /* Q for Fprop & W for Deconv*/);
+      divmod[1] = FastDivmod(tensor_extent[1] /* P for Fprop & H for Deconv*/);
+      // Keep a copy of every layout stride for the per-row offset computation.
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kStrideRank; ++i) {
+        tensor_stride[i] = layout.stride()[i];
+      }
+    }
+    /// Rank-5 construction: writes divmod[0..2], so it requires kStrideRank >= 4.
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout, cutlass::Tensor5DCoord const &tensor_extent):
+      PredicatedTileIteratorParams(
+        layout.stride()[0] * int(sizeof(AccessType)) / kElementsPerAccess,
+        make_OutputTileThreadMapDesc<ThreadMap>()
+      ) {
+      divmod[0] = FastDivmod(tensor_extent[3] /* Q for Fprop & W for Deconv*/);
+      divmod[1] = FastDivmod(tensor_extent[2] /* P for Fprop & H for Deconv*/);
+      divmod[2] = FastDivmod(tensor_extent[1] /* Z for Fprop & D for Deconv*/);
+      // Keep a copy of every layout stride for the per-row offset computation.
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kStrideRank; ++i) {
+        tensor_stride[i] = layout.stride()[i];
+      }
+    }
+    /// Wraps existing base params; divmod and tensor_stride are not assigned here.
+    CUTLASS_HOST_DEVICE
+    Params(Base const &base) :
+      Base(base) { }
+  };
+
+  /// Mask object: one boolean predicate per column iteration of the thread map.
+  struct Mask {
+
+    static int const kCount = ThreadMap::Iterations::kColumn;
+
+    /// Predicate state; predicates[i] == true allows accesses for column iteration i
+    bool predicates[kCount];
+
+    //
+    // Mask
+    //
+    CUTLASS_HOST_DEVICE
+    Mask() {
+      enable();
+    }
+
+    /// Efficiently disables all accesses guarded by mask
+    CUTLASS_HOST_DEVICE void clear() {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kCount; ++i) {
+        predicates[i] = false;
+      }
+    }
+
+    /// Efficiently enables all accesses guarded by mask (host+device: the constructor calls it)
+    CUTLASS_HOST_DEVICE void enable() {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kCount; ++i) {
+        predicates[i] = true;
+      }
+    }
+  };
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Parameters structure containing reference and precomputed state.
+  Params params_;
+
+  /// Byte-level base pointer used by load and store; the constructor offsets it by the thread's starting column.
+  uint8_t *byte_pointer_;
+
+  /// Array of boolean values to contain steady-state predicates
+  Mask mask_;
+
+  /// Extent of the matrix tile in rows
+  Index extent_row_;
+
+  /// Extent of the matrix tile in columns
+  Index extent_column_;
+
+  /// A thread's starting row position (assuming steady-state predicates have been computed)
+  Index thread_start_row_;
+
+  /// A thread's starting column
+  Index thread_start_column_;
+
+  /// Internal state counters advanced by operator++ / operator+=: [0] row, [1] group, [2] cluster
+  int state_[3];
+
+  //
+  // Static asserts about internal strides
+  //
+
+  static_assert(sizeof(extent_row_) == 4, "Expected 32b extents");
+  static_assert(sizeof(thread_start_row_) == 4, "Expected 32b extents");
+  static_assert(sizeof(PredicatedTileIteratorParams::stride) == 8, "Expected 64b strides");
+
+private:
+
+  //
+  // Methods
+  //
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Constructor: records extents and this thread's start position, builds column predicates.
+  CUTLASS_DEVICE
+  PredicatedTileIteratorConv(
+    Params const & params,                            ///< precomputed parameters (copied)
+    Element *pointer,                                 ///< base pointer; null disables all accesses
+    TensorCoord extent,                               ///< (row, column) extent used for bounds checks
+    int thread_idx,                                   ///< index passed to ThreadMap::initial_offset
+    TensorCoord threadblock_offset = TensorCoord()    ///< added to the thread's initial offset
+  ):
+    params_(params)
+  {
+    // This thread's first (row, column) position, including the threadblock offset.
+    TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx) + threadblock_offset;
+
+    extent_row_ = extent.row();
+    extent_column_ = extent.column();
+
+    thread_start_row_ = thread_offset.row();
+    thread_start_column_ = thread_offset.column();
+
+    // Initialize predicates: column iteration c is valid when its column lies inside the extent
+    CUTLASS_PRAGMA_UNROLL
+    for (int c = 0; c < ThreadMap::Iterations::kColumn; ++c) {
+
+      mask_.predicates[c] = ((thread_offset.column()
+        + ThreadMap::Delta::kColumn * c) < extent.column());
+    }
+
+    // Null pointer performs no accesses
+    if (!pointer) {
+      mask_.clear();
+    }
+
+    // Initialize byte_pointer_ with the column offset only; load/store add the row offset per access
+    byte_pointer_ = reinterpret_cast<uint8_t *>(pointer) +
+      LongIndex(thread_offset.column()) * sizeof(AccessType) / kElementsPerAccess;
+
+    // Initialize internal state counter
+    state_[0] = state_[1] = state_[2] = 0;
+  }
+
+  /// Adds a pointer offset in units of Element (converted to bytes via sizeof_bits<Element>)
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    byte_pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+
+  /// Loads a fragment from memory, adding `byte_offset` bytes to the iterator's base pointer
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment &frag, int64_t byte_offset) const {
+    // The fragment is viewed as an array of AccessType vectors.
+    uint8_t *byte_pointer = byte_pointer_;
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+          // Linearized (cluster, group, row) index selecting this iteration's slot in the fragment.
+          int frag_row_idx =
+            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
+          // Row displacement of this iteration relative to thread_start_row_.
+          int row_offset = row * ThreadMap::Delta::kRow
+            + group * ThreadMap::Delta::kGroup
+            + cluster * ThreadMap::Delta::kCluster;
+          // Rows at or beyond extent_row_ are predicated off.
+          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
+
+          AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset);
+          // Decompose the linear row index using the divmod objects held in params_.
+          Stride tensor_coord = CoordinateDecompositionLittleEndian<kStrideRank>(row_offset + thread_start_row_, params_.divmod);
+          // Element offset of that row, obtained from dot() of the coordinate and the tensor strides.
+          LongIndex tensor_offset = dot(tensor_coord, params_.tensor_stride);
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
+            // An access is issued only when both the row and the column predicate allow it.
+            bool guard = row_guard && mask_.predicates[column];
+
+            cutlass::arch::global_load<
+              AccessType,
+              sizeof(AccessType)
+            >(
+                frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn +
+                         column],
+                (void *)&memory_pointer[column * ThreadMap::Delta::kColumn /
+                                        kElementsPerAccess + tensor_offset / kElementsPerAccess],
+                guard);
+          }
+        }
+      }
+    }
+  }
+
+  /// Loads a fragment from memory; equivalent to load_with_byte_offset(frag, 0)
+  CUTLASS_DEVICE
+  void load(Fragment &frag) const {
+
+    load_with_byte_offset(frag, 0);
+  }
+
+  /// Stores a fragment to memory, adding `byte_offset` bytes to the iterator's base pointer
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const &frag, int64_t byte_offset) const {
+    uint8_t *byte_pointer = byte_pointer_;
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+          // Linearized (cluster, group, row) index selecting this iteration's slot in the fragment.
+          int frag_row_idx =
+            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
+          // Row displacement of this iteration relative to thread_start_row_.
+          int row_offset = row * ThreadMap::Delta::kRow
+            + group * ThreadMap::Delta::kGroup
+            + cluster * ThreadMap::Delta::kCluster;
+          // Rows at or beyond extent_row_ are predicated off.
+          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
+          // Decompose the linear row index using the divmod objects held in params_.
+          Stride tensor_coord = CoordinateDecompositionLittleEndian<kStrideRank>((row_offset + thread_start_row_), params_.divmod);
+          // Element offset of that row, obtained from dot() of the coordinate and the tensor strides.
+          LongIndex tensor_offset = dot(tensor_coord, params_.tensor_stride);
+
+          AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset);
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
+            // An access is issued only when both the row and the column predicate allow it.
+            bool guard = row_guard && mask_.predicates[column];
+
+            if (UseCUDAStore) {
+              if (guard) {
+                memory_pointer[tensor_offset / kElementsPerAccess] =
+                    frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column];
+              }
+            } else {
+              cutlass::arch::global_store<AccessType, sizeof(AccessType)>(
+                  frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column],
+                  (void *)&memory_pointer[tensor_offset / kElementsPerAccess],
+                  guard);
+            }
+            // Unlike load, the column step is applied by advancing the pointer each iteration.
+            memory_pointer += (ThreadMap::Delta::kColumn / kElementsPerAccess);
+          }
+        }
+      }
+    }
+  }
+
+  /// Stores a fragment to memory; equivalent to store_with_byte_offset(frag, 0)
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) const {
+
+    store_with_byte_offset(frag, 0);
+  }
+
+  CUTLASS_DEVICE
+  MatrixCoord thread_start() const {  // returns (thread_start_row_, thread_start_column_)
+    return MatrixCoord(thread_start_row_, thread_start_column_);
+  }
+
+  /// Returns the thread's current starting row (thread_start_row_)
+  CUTLASS_DEVICE
+  int32_t thread_start_row() const {
+    return thread_start_row_;
+  }
+
+  /// Returns the thread's starting column (thread_start_column_)
+  CUTLASS_DEVICE
+  int32_t thread_start_column() const {
+    return thread_start_column_;
+  }
+
+  /// Extent of the matrix in rows, as captured from `extent` by the constructor
+  CUTLASS_DEVICE
+  Index extent_row() const {
+    return extent_row_;
+  }
+
+  /// Extent of the matrix in columns, as captured from `extent` by the constructor
+  CUTLASS_DEVICE
+  Index extent_column() const {
+    return extent_column_;
+  }
+
+  /// Steps to the next output position by moving thread_start_row_ only (byte_pointer_ is
+  /// untouched). state_[0..2] (row, group, cluster) carry like an odometer, and each carry
+  /// adds its own row adjustment.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorConv &operator++() {
+
+    // One row iteration.
+    thread_start_row_ += ThreadMap::Shape::kRow;
+    if (++state_[0] != ThreadMap::Count::kRow) {
+      return *this;
+    }
+
+    // Rows exhausted: carry into the group counter.
+    state_[0] = 0;
+    thread_start_row_ += (ThreadMap::Shape::kGroup - 1) *
+      ThreadMap::Shape::kRow * ThreadMap::Count::kRow;
+    if (++state_[1] != ThreadMap::Count::kGroup) {
+      return *this;
+    }
+
+    // Groups exhausted: carry into the cluster counter.
+    state_[1] = 0;
+    thread_start_row_ += ThreadMap::Count::kGroup *
+      ThreadMap::Shape::kGroup * ThreadMap::Count::kRow * ThreadMap::Shape::kRow;
+    if (++state_[2] != ThreadMap::Count::kCluster) {
+      return *this;
+    }
+
+    // Clusters exhausted: wrap the counter and skip ahead by the tile step.
+    state_[2] = 0;
+    thread_start_row_ += ThreadMap::Shape::kGroup * ThreadMap::Shape::kRow
+      * ThreadMap::Shape::kCluster * ThreadMap::Shape::kTile;
+
+    return *this;
+  }
+
+  /// Advances `increment` positions at once (assumes increment >= 0 — TODO confirm with callers)
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorConv &operator+=(int increment)
+  {
+    // Row: add the increment, keep the remainder, and carry whole wraps (increment_row) upward
+    state_[0] += increment;
+    int increment_row = state_[0] / ThreadMap::Count::kRow;
+    state_[0] = state_[0] % ThreadMap::Count::kRow;
+
+    thread_start_row_ += (ThreadMap::Shape::kRow * increment);
+
+    // Group: absorb the row carries and carry whole wraps (increment_group) upward
+    state_[1] += increment_row;
+    int increment_group = state_[1] / ThreadMap::Count::kGroup;
+    state_[1] = state_[1] % ThreadMap::Count::kGroup;
+
+    thread_start_row_ +=
+        (ThreadMap::Shape::kGroup - 1) *
+        ThreadMap::Shape::kRow *
+        ThreadMap::Count::kRow *
+        increment_row;
+
+    // Cluster: absorb the group carries and carry whole wraps (increment_cluster) upward
+    state_[2] += increment_group;
+    int increment_cluster = state_[2] / ThreadMap::Count::kCluster;
+    state_[2] = state_[2] % ThreadMap::Count::kCluster;
+
+    thread_start_row_ +=
+        ThreadMap::Count::kGroup *
+        ThreadMap::Shape::kGroup *
+        ThreadMap::Count::kRow *
+        ThreadMap::Shape::kRow *
+        increment_group;
+
+    // Tile: one tile step per cluster wrap; there is no tile counter in state_
+    thread_start_row_ +=
+        ThreadMap::Shape::kGroup *
+        ThreadMap::Shape::kRow *
+        ThreadMap::Shape::kCluster *
+        ThreadMap::Shape::kTile *
+        increment_cluster;
+
+    return *this;
+  }
+
+  /// Disables all accesses guarded by the mask (every column predicate becomes false)
+  CUTLASS_DEVICE void clear_mask() {
+    mask_.clear();
+  }
+
+  /// Enables all accesses guarded by the mask (every column predicate becomes true)
+  CUTLASS_DEVICE void enable_mask() {
+    mask_.enable();
+  }
+
+  /// Gets the mask: copies the iterator's current mask into `mask`
+  CUTLASS_DEVICE void get_mask(Mask &mask) const {
+    mask = mask_;
+  }
+
+  /// Sets the mask: replaces the iterator's mask with a copy of `mask`
+  CUTLASS_DEVICE void set_mask(Mask const &mask) {
+    mask_ = mask;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_direct_conv.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_direct_conv.h
new file mode 100755
index 000000000..8d7bf7edb
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_direct_conv.h
@@ -0,0 +1,445 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
+
+  The epilogue rearranges the result of a matrix product through shared memory to match canonical
+  tensor layouts in global memory. Epilogues support conversion and reduction operations.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/permute.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/transform/pitch_linear_thread_map.h"
+#include "cutlass/epilogue/threadblock/output_tile_thread_map.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator_params.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace epilogue {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile iterator used to load and store the output tile from global memory in the epilogue.
+/// Addresses are derived from an (n, p, q) output coordinate of a 2-D convolution problem.
+/// Satisfies: ReadableTileIterator | PredicatedTileIterator | ForwardTileIterator
+///
+template <
+  typename ThreadMap_,       ///< Thread map (concept: PitchLinearThreadMap)
+  typename Element_,         ///< Element data type
+  typename ThreadOutputShape_ = cutlass::conv::TensorNHWCShape<1, 1, 1, 1>,
+  typename ThreadBlockOutputShape_ = cutlass::conv::TensorNHWCShape<1, 1, 1, 1>
+>
+class PredicatedTileIteratorDirectConv {
+public:
+  using ThreadMap = ThreadMap_;
+  using Shape = typename ThreadMap::Shape;
+  using ThreadOutputShape = ThreadOutputShape_;
+  using ThreadBlockOutputShape = ThreadBlockOutputShape_;
+
+  using Element = Element_;
+
+  using Layout = layout::RowMajor;
+  using TensorRef = TensorRef<Element, Layout>;
+  using ConstTensorRef = typename TensorRef::ConstTensorRef;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorCoord = MatrixCoord;
+
+  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
+  static int const kThreads = ThreadMap::kThreads;
+
+  using ConvProblemSize = typename cutlass::conv::Conv2dProblemSize;
+
+  /// Fragment object: one AccessType-sized vector per thread-map iteration
+  using Fragment = Array<Element, ThreadMap::Iterations::kCount * kElementsPerAccess>;
+
+  /// Memory access size: the vector of kElementsPerAccess elements moved by each load/store
+  using AccessType = AlignedArray<Element, kElementsPerAccess>;
+  /// NOTE(review): numerator and denominator are identical, so this is always 1 — confirm intent.
+  static int const kLoadsPerAccess = AccessType::kElements / AccessType::kElements;
+  /// Ratio of threadblock output shape to thread output shape along H and W.
+  using ThreadTileCount = MatrixShape<
+    ThreadBlockOutputShape::kH / ThreadOutputShape::kH,
+    ThreadBlockOutputShape::kW / ThreadOutputShape::kW
+  >;
+
+  //
+  // Parameters struct
+  //
+
+  /// Iterator parameters; a thin wrapper over the non-template PredicatedTileIteratorDirect2dConvParams
+  struct Params : PredicatedTileIteratorDirect2dConvParams {
+    using Base = PredicatedTileIteratorDirect2dConvParams;
+    /// Default constructor; sets nothing explicitly.
+    CUTLASS_HOST_DEVICE
+    Params() { }
+    /// Builds base params from the byte stride of layout.stride(0), the problem size and the threadblock (kH, kW).
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout, cutlass::conv::Conv2dProblemSize const &problem_size): 
+      PredicatedTileIteratorDirect2dConvParams(
+        layout.stride(0) * int(sizeof(AccessType)) / kElementsPerAccess,
+        problem_size,
+        {ThreadBlockOutputShape::kH, ThreadBlockOutputShape::kW}
+      ) 
+    { }
+    /// Wraps an existing base-params object.
+    CUTLASS_HOST_DEVICE
+    Params(Base const &base) : 
+      Base(base) { }
+  };
+
+  /// Mask object: one boolean predicate per contiguous-dimension iteration of the thread map.
+  struct Mask {
+
+    static int const kCount = ThreadMap::Iterations::kContiguous;
+
+    /// Predicate state; predicates[i] == true allows accesses for contiguous iteration i
+    bool predicates[kCount];
+
+    //
+    // Mask
+    //
+    CUTLASS_HOST_DEVICE
+    Mask() {
+      enable();
+    }
+
+    /// Efficiently disables all accesses guarded by mask
+    CUTLASS_HOST_DEVICE void clear() {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kCount; ++i) {
+        predicates[i] = false;
+      }
+    }
+
+    /// Efficiently enables all accesses guarded by mask (host+device: the constructor calls it)
+    CUTLASS_HOST_DEVICE void enable() {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kCount; ++i) {
+        predicates[i] = true;
+      }
+    }
+  };
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Parameters structure containing reference and precomputed state.
+  PredicatedTileIteratorDirect2dConvParams params_;
+
+  /// Byte-level pointer adjusted by add_pointer_offset(); load/store address memory through pointer_ instead
+  uint8_t *byte_pointer_;
+
+  /// Base element pointer passed to the constructor; load/store compute addresses from it
+  Element *pointer_;
+
+
+  /// Array of boolean values to contain steady-state predicates
+  Mask mask_;
+
+  /// Extent of the matrix tile in rows
+  Index extent_row_;
+
+  /// Extent of the matrix tile in columns
+  Index extent_column_;
+
+  /// A thread's starting row position (assuming steady-state predicates have been computed)
+  Index thread_start_row_;
+
+  /// A thread's starting column
+  Index thread_start_column_;
+
+  /// Base (n, p, q) output coordinate of the current threadblock tile, computed by set_tile_index()
+  int thread_start_n_, thread_start_p_, thread_start_q_;
+
+  /// Threadblock tile index, taken from threadblock_offset.row() in the constructor
+  int tile_index_;
+
+  //
+  // Static asserts about internal strides
+  //
+
+  static_assert(sizeof(extent_row_) == 4, "Expected 32b extents");
+  static_assert(sizeof(thread_start_row_) == 4, "Expected 32b extents");
+  static_assert(sizeof(PredicatedTileIteratorDirect2dConvParams::stride) == 8, "Expected 64b strides");
+
+private:
+
+  //
+  // Methods
+  //
+
+
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Constructor: records extents and thread start position, then positions on tile offset 0.
+  CUTLASS_DEVICE
+  PredicatedTileIteratorDirectConv(
+    PredicatedTileIteratorDirect2dConvParams const & params,  ///< precomputed parameters (copied)
+    Element *pointer,                                         ///< base pointer; null disables all accesses
+    TensorCoord extent,                                       ///< (row, column) extent for bounds checks
+    int thread_idx,                                           ///< index passed to ThreadMap::initial_offset
+    TensorCoord threadblock_offset = TensorCoord()            ///< row: tile index, column: column offset
+  ): 
+    params_(params), byte_pointer_(reinterpret_cast<uint8_t *>(pointer)), pointer_(pointer)
+  {
+    // byte_pointer_ is seeded from `pointer` so add_pointer_offset() never offsets an indeterminate value.
+    TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx);
+
+    extent_row_ = extent.row();
+    extent_column_ = extent.column();
+
+    // stride dim (PQ)
+    thread_start_row_ = thread_offset.column();
+    // contiguous dim (Channels)
+    thread_start_column_ = threadblock_offset.column() + thread_offset.row();
+
+    tile_index_ = threadblock_offset.row();
+
+    set_tile_index(0);
+  }
+
+  /// Positions the iterator on tile (tile_index_ + index): recomputes the base (n, p, q) and the predicates
+  CUTLASS_HOST_DEVICE
+  void set_tile_index(const int index) { 
+    // pq_divmod / q_divmod fill (n, residual) and (p, q) — presumably quotient and remainder; confirm.
+    int residual;
+    params_.pq_divmod(thread_start_n_, residual, tile_index_ + index);
+    params_.q_divmod(thread_start_p_, thread_start_q_, residual);
+
+    // Compute the base output coord of ThreadBlock
+    thread_start_p_ *= ThreadBlockOutputShape::kH;
+    thread_start_q_ *= ThreadBlockOutputShape::kW;
+
+    // Initialize predicates (they depend only on the column position, not on `index`)
+    CUTLASS_PRAGMA_UNROLL
+    for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+      mask_.predicates[c] = ((thread_start_column_ 
+        + c * ThreadMap::Delta::kContiguous) < extent_column_);
+    }
+
+    // Null pointer performs no accesses
+    if (!pointer_) {
+      mask_.clear();
+    }
+
+  }
+
+  /// Adds an offset in units of Element to byte_pointer_ (load/store in this class address via pointer_)
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    byte_pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+
+  /// Loads a fragment from memory, adding `byte_offset` bytes to every computed address
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment &frag, int64_t byte_offset) const {
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        int frag_base_idx = s * ThreadMap::Iterations::kContiguous + c;
+        // Split the row inside the threadblock tile into (p, q) using the tile width kW.
+        int current_row = thread_start_row_ + s * ThreadMap::Delta::kStrided;
+        int p = current_row / ThreadBlockOutputShape::kW;
+        int q = current_row % ThreadBlockOutputShape::kW;
+        // Output coordinate = threadblock base coordinate + position inside the tile.
+        int current_p = thread_start_p_ + p;
+        int current_q = thread_start_q_ + q;
+        // Valid only inside the problem's P/Q/N bounds and inside the thread map's strided shape.
+        bool row_guard = (current_p) < params_.P && (current_q) < params_.Q &&
+                         (thread_start_n_ < params_.N) && current_row < ThreadMap::Shape::kStrided;
+        // Linear output row built from (n, p, q) with the strides stored in params_.
+        int output_row_offset =
+            thread_start_n_ * params_.stride_n + current_p * params_.stride_p + current_q;
+        // Address = base pointer + row * byte stride + column offset converted to bytes.
+        uint8_t *byte_pointer =
+            reinterpret_cast<uint8_t *>(pointer_) +
+            LongIndex(output_row_offset) * LongIndex(params_.stride) +
+            LongIndex(thread_start_column_ + c * ThreadMap::Delta::kContiguous) *
+                sizeof(AccessType) / kElementsPerAccess;
+
+        AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+
+        AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset);
+        // An access is issued only when both the row and the column predicate allow it.
+        bool guard = row_guard && mask_.predicates[c];
+
+        cutlass::arch::global_load<AccessType, sizeof(AccessType)>(
+            frag_ptr[frag_base_idx], (void *)&memory_pointer[0], guard);
+      }
+    }
+  }
+
+  /// Loads a fragment from memory; equivalent to load_with_byte_offset(frag, 0)
+  CUTLASS_DEVICE
+  void load(Fragment &frag) const {
+    load_with_byte_offset(frag, 0);
+  }
+
+  /// Stores a fragment to memory, adding `byte_offset` bytes to every computed address
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const &frag, int64_t byte_offset) const {
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        int frag_base_idx = s * ThreadMap::Iterations::kContiguous + c;
+        // Split the row inside the threadblock tile into (p, q) using the tile width kW.
+        int current_row = thread_start_row_ + s * ThreadMap::Delta::kStrided;
+        int p = current_row / ThreadBlockOutputShape::kW;
+        int q = current_row % ThreadBlockOutputShape::kW;
+        // Output coordinate = threadblock base coordinate + position inside the tile.
+        int current_p = thread_start_p_ + p;
+        int current_q = thread_start_q_ + q;
+        // Valid only inside the problem's P/Q/N bounds and inside the thread map's strided shape.
+        bool row_guard = (current_p) < params_.P && (current_q) < params_.Q &&
+                         (thread_start_n_ < params_.N) && current_row < ThreadMap::Shape::kStrided;
+        // Linear output row built from (n, p, q) with the strides stored in params_.
+        int output_row_offset =
+            thread_start_n_ * params_.stride_n + current_p * params_.stride_p + current_q;
+        // Address = base pointer + row * byte stride + column offset converted to bytes.
+        uint8_t *byte_pointer =
+            reinterpret_cast<uint8_t *>(pointer_) +
+            LongIndex(output_row_offset) * LongIndex(params_.stride) +
+            LongIndex(thread_start_column_ + c * ThreadMap::Delta::kContiguous) *
+                sizeof(AccessType) / kElementsPerAccess;
+
+        AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
+
+        AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset);
+        // An access is issued only when both the row and the column predicate allow it.
+        bool guard = row_guard && mask_.predicates[c];
+
+        cutlass::arch::global_store<AccessType, sizeof(AccessType)>(
+            frag_ptr[frag_base_idx], (void *)&memory_pointer[0], guard);
+      }
+    }
+  }
+
+  /// Stores a fragment to memory; equivalent to store_with_byte_offset(frag, 0)
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) const {
+
+    store_with_byte_offset(frag, 0);
+  }
+
+  CUTLASS_DEVICE
+  MatrixCoord thread_start() const {  // returns (thread_start_row_, thread_start_column_)
+    return MatrixCoord(thread_start_row_, thread_start_column_);
+  }
+
+  /// Returns the thread's starting row (thread_start_row_)
+  CUTLASS_DEVICE
+  int32_t thread_start_row() const {
+    return thread_start_row_;
+  }
+
+  /// Returns the thread's starting column (thread_start_column_)
+  CUTLASS_DEVICE
+  int32_t thread_start_column() const {
+    return thread_start_column_;
+  }
+
+  /// Extent of the matrix in rows, as captured from `extent` by the constructor
+  CUTLASS_DEVICE
+  Index extent_row() const {
+    return extent_row_;
+  }
+
+  /// Extent of the matrix in columns, as captured from `extent` by the constructor
+  CUTLASS_DEVICE
+  Index extent_column() const {
+    return extent_column_;
+  }
+
+  /// Pre-increment is a no-op for this iterator; the position is changed via set_tile_index()
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorDirectConv &operator++() {
+    // do nothing
+
+    return *this;
+  }
+
+  /// Disables all accesses guarded by the mask (every contiguous-dimension predicate becomes false)
+  CUTLASS_DEVICE void clear_mask() {
+    mask_.clear();
+  }
+
+  /// Enables all accesses guarded by the mask (every contiguous-dimension predicate becomes true)
+  CUTLASS_DEVICE void enable_mask() {
+    mask_.enable();
+  }
+
+  /// Gets the mask: copies the iterator's current mask into `mask`
+  CUTLASS_DEVICE void get_mask(Mask &mask) const {
+    mask = mask_;
+  }
+
+  /// Sets the mask: replaces the iterator's mask with a copy of `mask`
+  CUTLASS_DEVICE void set_mask(Mask const &mask) {
+    mask_ = mask;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_params.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_params.h
new file mode 100755
index 000000000..5e9aa22bd
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_params.h
@@ -0,0 +1,483 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Parameter and descriptor structures used by the epilogue's predicated tile iterators.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/matrix.h"
+
+#include "cutlass/conv/conv2d_problem_size.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct OutputTileShapeDesc {
+  // Plain-integer description of a 5D shape: (column, row, group, cluster, tile).
+  int column;
+  int row;
+  int group;
+  int cluster;
+  int tile;
+
+  //
+  // Methods
+  //
+
+  /// Default ctor: all five extents are zero
+  CUTLASS_HOST_DEVICE
+  OutputTileShapeDesc(): column(0), row(0), group(0), cluster(0), tile(0) { }
+
+  /// Ctor taking explicit extents in (column, row, group, cluster, tile) order
+  CUTLASS_HOST_DEVICE
+  OutputTileShapeDesc(
+    int column_,
+    int row_,
+    int group_,
+    int cluster_,
+    int tile_
+  ):
+    column(column_),
+    row(row_),
+    group(group_),
+    cluster(cluster_),
+    tile(tile_) { }
+
+  /// Total number of points in the 5D space (product of all five extents, computed in int)
+  CUTLASS_HOST_DEVICE
+  int count() const {
+    return column * row * group * cluster * tile;
+  }
+
+  #if 0
+  CUTLASS_HOST_DEVICE
+  void print() const {
+    printf("{%d, %d, %d, %d, %d}", column, row, group, cluster, tile);
+  }
+  #endif
+};
+
+/// Builds an OutputTileShapeDesc from a Shape type's static kColumn/kRow/kGroup/kCluster/kTile members.
+template <typename Shape>
+CUTLASS_HOST_DEVICE
+OutputTileShapeDesc make_OutputTileShapeDesc() {
+  return OutputTileShapeDesc(
+    Shape::kColumn,
+    Shape::kRow,
+    Shape::kGroup,
+    Shape::kCluster,
+    Shape::kTile
+  );
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Runtime (non-template) description of an output tile thread map
+struct OutputTileThreadMapDesc {
+
+  int threads;                     ///< taken from ThreadMap::kThreads by make_OutputTileThreadMapDesc
+  int elements_per_access;         ///< taken from ThreadMap::kElementsPerAccess
+  OutputTileShapeDesc shape;       ///< taken from ThreadMap::Shape
+  OutputTileShapeDesc iterations;  ///< taken from ThreadMap::Iterations
+  OutputTileShapeDesc delta;       ///< taken from ThreadMap::Delta
+  OutputTileShapeDesc count;       ///< taken from ThreadMap::Count
+
+  //
+  // Methods
+  //
+  /// Default ctor: zeroes the scalar fields so by-value copies never read indeterminate ints
+  CUTLASS_HOST_DEVICE
+  OutputTileThreadMapDesc(): threads(0), elements_per_access(0) { }
+
+  CUTLASS_HOST_DEVICE
+  OutputTileThreadMapDesc(
+    int threads_,
+    int elements_per_access_,
+    OutputTileShapeDesc shape_,
+    OutputTileShapeDesc iterations_,
+    OutputTileShapeDesc delta_,
+    OutputTileShapeDesc count_
+  ):
+    threads(threads_), 
+    elements_per_access(elements_per_access_),
+    shape(shape_),
+    iterations(iterations_),
+    delta(delta_),
+    count(count_) 
+  {
+    // Nothing to do: every member is set in the initializer list
+  }
+};
+
+/// Helper template to construct an OutputTileThreadMapDesc from an OutputTileThreadMap template.
+template <typename ThreadMap>
+CUTLASS_HOST_DEVICE
+OutputTileThreadMapDesc make_OutputTileThreadMapDesc() {
+  return OutputTileThreadMapDesc(
+    ThreadMap::kThreads,
+    ThreadMap::kElementsPerAccess,
+    make_OutputTileShapeDesc<typename ThreadMap::Shape>(),
+    make_OutputTileShapeDesc<typename ThreadMap::Iterations>(),
+    make_OutputTileShapeDesc<typename ThreadMap::Delta>(),
+    make_OutputTileShapeDesc<typename ThreadMap::Count>()
+  );
+}
+///////////////////////////////////////////////////////////////////////////////
+
+//
+// Parameters struct for PredicatedTileIterator
+//
+
+struct PredicatedTileIteratorParams {
+
+  using Index = int32_t;
+  using LongIndex = int64_t;
+
+  //
+  // Data members
+  //
+
+  LongIndex stride;               ///< stride in bytes between rows
+
+  LongIndex increment_row;        ///< increment quantity (in bytes) to advance when moving between rows
+  LongIndex increment_group;      ///< increment quantity (in bytes) to advance when moving to the next group
+  LongIndex increment_cluster;    ///< increment quantity (in bytes) to advance when moving to the next cluster
+
+  LongIndex advance_row;          ///< amount to add to move to the next 'row' position
+  LongIndex advance_group;        ///< amount to add to move to the next 'group' position
+  LongIndex advance_cluster;      ///< amount to add to move to the next 'cluster' position
+  LongIndex advance_tile;         ///< amount to add to move to the next 'tile'
+
+  //
+  // Methods
+  //
+  /// Computes every member from the row stride and the thread map description; always returns kSuccess
+  CUTLASS_HOST_DEVICE
+  Status initialize(LongIndex stride_, OutputTileThreadMapDesc thread_map) {
+    
+    stride = stride_;
+
+    increment_row = stride * thread_map.delta.row;
+
+    increment_group = stride * thread_map.delta.group
+      - stride * thread_map.delta.row * (thread_map.iterations.row - 1);  // minus (iterations.row - 1) row increments
+
+    increment_cluster = stride * thread_map.delta.cluster
+      - stride * thread_map.delta.group * (thread_map.iterations.group - 1)
+      - stride * thread_map.delta.row * (thread_map.iterations.row - 1);
+
+    advance_row = stride * thread_map.shape.row;
+
+    advance_group = 
+      stride * 
+      (thread_map.shape.group - 1) * thread_map.shape.row * thread_map.count.row;
+    
+    advance_cluster = 
+      stride * 
+      thread_map.count.group * 
+      thread_map.shape.group * 
+      thread_map.count.row * 
+      thread_map.shape.row;
+    
+    advance_tile =
+      stride * 
+      thread_map.shape.group * 
+      thread_map.shape.row * 
+      thread_map.shape.cluster * 
+      thread_map.shape.tile;
+
+    return Status::kSuccess;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Status initialize(Index stride_, OutputTileThreadMapDesc thread_map) {
+    return initialize(LongIndex(stride_), thread_map);  // widen the 32-bit stride, then delegate
+  }
+
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorParams() {
+    initialize(LongIndex(0), OutputTileThreadMapDesc());  // zero stride, default-constructed thread map
+  }
+
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorParams(Index stride, OutputTileThreadMapDesc thread_map) {
+    initialize(stride, thread_map);  // returned Status is ignored
+  }
+
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorParams(LongIndex stride, OutputTileThreadMapDesc thread_map) {
+    initialize(stride, thread_map);  // returned Status is ignored
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+//
+// Parameters struct for PredicatedTileIteratorDirect2dConv
+//
+
+struct PredicatedTileIteratorDirect2dConvParams{
+  using Index = int32_t;
+  using LongIndex = int64_t;
+
+  //
+  // Data members
+  //
+  FastDivmod pq_divmod;   ///< divmod by (tiles_p * tiles_q); only set when both tile extents are non-zero
+  FastDivmod q_divmod;    ///< divmod by tiles_q; only set when both tile extents are non-zero
+
+  LongIndex stride;       ///< stride per row of the output tensor (bytes)
+  LongIndex stride_n;     ///< problem_size.P * problem_size.Q
+  LongIndex stride_p;     ///< problem_size.Q
+
+  int N;                  ///< copy of problem_size.N
+  int P;                  ///< copy of problem_size.P
+  int Q;                  ///< copy of problem_size.Q
+
+  //
+  // Methods
+  //
+  /// Sets strides and sizes from the problem; the divmods are built only for a non-zero tile shape
+  CUTLASS_HOST_DEVICE
+  Status initialize(LongIndex stride_,
+                    cutlass::conv::Conv2dProblemSize const &problem_size,
+                    MatrixCoord threadblock_output_shape) {
+    stride = stride_; // The stride per row of output tensor (bytes)
+    stride_n = LongIndex(problem_size.P) * problem_size.Q; // widen first: int * int could overflow
+    stride_p = problem_size.Q ;
+
+    N = problem_size.N;
+    P = problem_size.P;
+    Q = problem_size.Q;
+
+    // Fast divmod objects for the number of threadblock tiles along P and Q
+    if(threadblock_output_shape.row() != 0 && threadblock_output_shape.column() !=0 ){
+      // MSVC emits a "potential divide by 0" warning as error
+      // if the code just divides without a check and substitution.
+
+      CUTLASS_ASSERT(threadblock_output_shape.row() != 0);
+      const auto row_denom = threadblock_output_shape.row() != 0 ?
+        threadblock_output_shape.row() : cutlass::MatrixCoord::Index(1);
+      int tiles_p =
+          (problem_size.P + (threadblock_output_shape.row() - 1)) / row_denom;
+
+      CUTLASS_ASSERT(threadblock_output_shape.column() != 0);
+      const auto col_denom = threadblock_output_shape.column() != 0 ?
+        threadblock_output_shape.column() : cutlass::MatrixCoord::Index(1);
+      int tiles_q = (problem_size.Q + (threadblock_output_shape.column() - 1)) /
+                    col_denom;
+
+      pq_divmod = FastDivmod(tiles_p * tiles_q);
+      q_divmod = FastDivmod(tiles_q);
+    }
+
+    return Status::kSuccess;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Status initialize(
+      Index stride_,
+      cutlass::conv::Conv2dProblemSize const &problem_size = cutlass::conv::Conv2dProblemSize(),
+      MatrixCoord threadblock_output_shape = MatrixCoord()) {
+    return initialize(LongIndex(stride_), problem_size, threadblock_output_shape);
+  }
+
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorDirect2dConvParams() { initialize(LongIndex(0)); }
+
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorDirect2dConvParams(Index stride,
+                               cutlass::conv::Conv2dProblemSize const &problem_size,
+                               MatrixCoord threadblock_output_shape) {
+    initialize(stride, problem_size, threadblock_output_shape);
+  }
+
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorDirect2dConvParams(LongIndex stride,
+                               cutlass::conv::Conv2dProblemSize const &problem_size,
+                               MatrixCoord threadblock_output_shape) {
+    initialize(stride, problem_size, threadblock_output_shape);
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+//  InterleavedPredicatedTileIterator
+///////////////////////////////////////////////////////////////////////////////
+
+
+/// Predicated tile access iterator descriptor object containing template dependent state
+struct InterleavedPredicatedTileIteratorDesc {
+
+  int element_size_bits;                          ///< sizeof_bits<Element>::value in the make_ helper
+  int elements_per_access;                        ///< ThreadMap::kElementsPerAccess in the make_ helper
+  int threadmap_warp_size;                        ///< ThreadMap::kWarpSize in the make_ helper
+  layout::PitchLinearCoord threadmap_iterations;  ///< ThreadMap::Iterations {kContiguous, kStrided}
+  layout::PitchLinearCoord threadmap_delta;       ///< ThreadMap::Delta {kContiguous, kStrided}
+
+  //
+  // Methods
+  //
+  /// Default ctor: zeroes the ints, which InterleavedPredicatedTileIteratorParams() reads in arithmetic
+  CUTLASS_HOST_DEVICE
+  InterleavedPredicatedTileIteratorDesc(): element_size_bits(0), elements_per_access(0), threadmap_warp_size(0) { }
+
+  CUTLASS_HOST_DEVICE
+  InterleavedPredicatedTileIteratorDesc(
+    int element_size_bits_,
+    int elements_per_access_,
+    int threadmap_warp_size_,
+    layout::PitchLinearCoord threadmap_iterations_,
+    layout::PitchLinearCoord threadmap_delta_
+  ):
+    element_size_bits(element_size_bits_),
+    elements_per_access(elements_per_access_),
+    threadmap_warp_size(threadmap_warp_size_),
+    threadmap_iterations(threadmap_iterations_),
+    threadmap_delta(threadmap_delta_) { }
+};
+
+//
+// Parameters struct InterleavedPredicatedTileIterator
+//
+
+struct InterleavedPredicatedTileIteratorParams {
+
+  using Index = int32_t;
+  using LongIndex = int64_t;
+
+  //
+  // Data members
+  //
+
+  LongIndex stride;               ///< stride in bytes between rows
+  LongIndex advance_row;          ///< amount to add to move to the next 'row' position
+  LongIndex advance_column;       ///< amount to add to move to the next 'column' position
+
+  //
+  // Methods
+  //
+  /// Computes advance_row / advance_column from the stride and descriptor; always returns kSuccess
+  CUTLASS_HOST_DEVICE
+  Status initialize(LongIndex stride_, InterleavedPredicatedTileIteratorDesc desc) {
+    
+    stride = stride_;
+
+    advance_row = desc.threadmap_delta.contiguous() * desc.element_size_bits / 8;  // bits / 8
+
+    advance_column = stride_ - desc.threadmap_iterations.contiguous() *
+                               desc.elements_per_access *
+                               desc.element_size_bits *
+                               desc.threadmap_warp_size / 8;
+
+    return Status::kSuccess;
+  }
+
+  CUTLASS_HOST_DEVICE
+  InterleavedPredicatedTileIteratorParams() {
+    initialize(LongIndex(0), InterleavedPredicatedTileIteratorDesc());  // NOTE(review): reads the fields of a default-constructed desc
+  }
+
+  CUTLASS_HOST_DEVICE
+  InterleavedPredicatedTileIteratorParams(Index stride, InterleavedPredicatedTileIteratorDesc desc) {
+    initialize(stride, desc);  // Index stride converts implicitly to LongIndex
+  }
+
+  CUTLASS_HOST_DEVICE
+  InterleavedPredicatedTileIteratorParams(LongIndex stride, InterleavedPredicatedTileIteratorDesc desc) {
+    initialize(stride, desc);  // returned Status is ignored
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Helper template to construct an InterleavedPredicatedTileIteratorDesc from an Element type and a ThreadMap.
+template <typename Element, typename ThreadMap>
+CUTLASS_HOST_DEVICE
+InterleavedPredicatedTileIteratorDesc make_InterleavedPredicatedTileIteratorDesc() {
+  return InterleavedPredicatedTileIteratorDesc(
+    sizeof_bits<Element>::value,
+    ThreadMap::kElementsPerAccess,
+    ThreadMap::kWarpSize,
+    {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided},
+    {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided}
+  );
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Primary template (declaration only) of a functor that builds the iterator descriptor
+/// matching an output Layout; the layout-specific specializations provide operator().
+template <typename Element, typename Layout,
+   typename ThreadMap>
+  struct MakePredicatedTileIteratorDesc;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of MakePredicatedTileIteratorDesc for layout::RowMajor output data.
+template <typename Element, typename ThreadMap>
+struct MakePredicatedTileIteratorDesc <
+    Element, layout::RowMajor, ThreadMap> {
+
+  CUTLASS_HOST_DEVICE
+  OutputTileThreadMapDesc operator()() {
+    // Row-major output is described by an OutputTileThreadMapDesc built from ThreadMap
+    return make_OutputTileThreadMapDesc<ThreadMap>();
+  }
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of MakePredicatedTileIteratorDesc for layout::ColumnMajorInterleaved<InterleavedN> output data.
+template <typename Element, typename ThreadMap, int InterleavedN>
+struct MakePredicatedTileIteratorDesc <
+    Element, layout::ColumnMajorInterleaved<InterleavedN>, ThreadMap> {
+
+  CUTLASS_HOST_DEVICE
+  InterleavedPredicatedTileIteratorDesc operator()() {
+    // Interleaved output is described by an InterleavedPredicatedTileIteratorDesc
+    return make_InterleavedPredicatedTileIteratorDesc<Element, ThreadMap>();
+  }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_predicates.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_predicates.h
new file mode 100755
index 000000000..2fbbc9a4f
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_predicates.h
@@ -0,0 +1,309 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief PredicatedTileIteratorPredicates.
+
+  PredicatedTileIteratorPredicates enables both upper and lower bounds for predicates.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/transform/pitch_linear_thread_map.h"
+#include "cutlass/epilogue/threadblock/output_tile_thread_map.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator_params.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace epilogue {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile iterator predicates used to bound computations in epilogue.
+///
+/// Satisfies: ReadableTileIterator | PredicatedTileIterator | ForwardTileIterator
+///
+template <
+  typename ThreadMap_,       ///< Thread map (concept: OutputTileThreadMap)
+  typename Element_          ///< Element data type
+>
+class PredicatedTileIteratorPredicates {
+public:
+  using ThreadMap = ThreadMap_;
+  using Shape = typename ThreadMap::Shape;
+
+  using Element = Element_;
+
+  using Layout = layout::RowMajor;
+  using TensorRef = TensorRef<Element, Layout>;
+  using ConstTensorRef = typename TensorRef::ConstTensorRef;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorCoord = MatrixCoord;
+
+  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
+  static int const kThreads = ThreadMap::kThreads;
+  static int const kIterations = ThreadMap::Count::kTile;
+
+  static_assert( ThreadMap::Iterations::kRow > 0,"ThreadMap::Iterations::kRow must be > 0");
+  static_assert( ThreadMap::Iterations::kGroup > 0,"ThreadMap::Iterations::kGroup must be > 0");
+  static_assert( ThreadMap::Iterations::kCluster > 0,"ThreadMap::Iterations::kCluster must be > 0");
+  static_assert( ThreadMap::Iterations::kColumn > 0,"ThreadMap::Iterations::kColumn must be > 0");
+
+  /// Fragment object
+  using Fragment = Array<
+    Element, 
+    ThreadMap::Iterations::kColumn * 
+    ThreadMap::Iterations::kRow * 
+    ThreadMap::Iterations::kGroup * 
+    ThreadMap::Iterations::kCluster * ThreadMap::kElementsPerAccess>;
+
+  /// Memory access size
+  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
+
+  //
+  // Parameters struct
+  //
+
+  /// Uses a non-template class
+  struct Params : PredicatedTileIteratorParams {
+
+    CUTLASS_HOST_DEVICE
+    Params() { }
+
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout): 
+      PredicatedTileIteratorParams(
+        layout.stride(0) * int(sizeof(AccessType)) / kElementsPerAccess,
+        make_OutputTileThreadMapDesc<ThreadMap>()
+      ) 
+    {
+        
+    }
+  };
+
+  /// Mask object: one boolean predicate per column iteration
+  struct Mask {
+
+    static int const kCount = ThreadMap::Iterations::kColumn;
+
+    /// Predicate state
+    bool predicates[kCount];
+
+    //
+    // Mask
+    //
+    CUTLASS_HOST_DEVICE
+    Mask() {
+      enable();
+    }
+
+    /// Efficiently disables all accesses guarded by mask
+    CUTLASS_HOST_DEVICE void clear() {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kCount; ++i) {
+        predicates[i] = false;
+      }
+    }
+
+    /// Efficiently enables all accesses guarded by mask (host+device: the host+device ctor calls it)
+    CUTLASS_HOST_DEVICE void enable() {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kCount; ++i) {
+        predicates[i] = true;
+      }
+    }
+  };
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Parameters structure containing reference and precomputed state.
+  PredicatedTileIteratorParams params_;
+
+  /// Array of boolean values to contain steady-state predicates
+  Mask mask_;
+
+  /// Lower and upper row bounds, taken from the constructor's lower_extent / upper_extent
+  Index lower_extent_row_;
+  Index upper_extent_row_;
+
+  /// A thread's starting row position (assuming steady-state predicates have been computed)
+  Index thread_start_row_;
+
+  /// Internal state counters: [0] row, [1] group, [2] cluster (see operator++)
+  int state_[3];
+ 
+  //
+  // Static asserts about internal strides
+  //
+
+  static_assert(sizeof(lower_extent_row_) == 4, "Expected 32b extents");
+  static_assert(sizeof(upper_extent_row_) == 4, "Expected 32b extents");
+  static_assert(sizeof(thread_start_row_) == 4, "Expected 32b extents");
+  static_assert(sizeof(PredicatedTileIteratorParams::stride) == 8, "Expected 64b strides");
+
+private:
+
+  //
+  // Methods
+  //
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Constructor: records the row bounds and thread start row, and precomputes column predicates
+  CUTLASS_DEVICE
+  PredicatedTileIteratorPredicates(
+    PredicatedTileIteratorParams const & params,
+    TensorCoord lower_extent,
+    TensorCoord upper_extent,
+    int thread_idx,
+    TensorCoord threadblock_offset = TensorCoord()
+  ): 
+    params_(params)
+  {
+
+    TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx) + threadblock_offset;
+
+    lower_extent_row_ = lower_extent.row();
+    upper_extent_row_ = upper_extent.row();
+    thread_start_row_ = thread_offset.row();
+
+    // Initialize predicates: true iff lower_extent.column() <= column < upper_extent.column()
+    CUTLASS_PRAGMA_UNROLL
+    for (int c = 0; c < ThreadMap::Iterations::kColumn; ++c) {
+
+      mask_.predicates[c] = ((thread_offset.column() 
+        + ThreadMap::Delta::kColumn * c) < upper_extent.column()) &&
+        ((thread_offset.column() + ThreadMap::Delta::kColumn * c) >= lower_extent.column());
+    }
+
+    // Initialize internal state counter
+    state_[0] = state_[1] = state_[2] = 0;
+  }
+
+  /// Advances to the next position to load or store
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorPredicates &operator++() {
+    // Step one row tile; carry into the group and cluster counters as each one wraps
+    ++state_[0];
+    thread_start_row_ += ThreadMap::Shape::kRow;
+
+    if (state_[0] == ThreadMap::Count::kRow) {
+
+      state_[0] = 0;
+      ++state_[1];
+
+      thread_start_row_ += (ThreadMap::Shape::kGroup - 1) *
+        ThreadMap::Shape::kRow * ThreadMap::Count::kRow;
+
+      if (state_[1] == ThreadMap::Count::kGroup) {
+
+        state_[1] = 0;
+        ++state_[2];
+
+        thread_start_row_ += ThreadMap::Count::kGroup *
+          ThreadMap::Shape::kGroup * ThreadMap::Count::kRow * ThreadMap::Shape::kRow;
+
+        if (state_[2] == ThreadMap::Count::kCluster) {
+          state_[2] = 0;
+        }
+      }
+    }
+
+    return *this;
+  }
+
+  /// Efficiently disables all accesses guarded by mask
+  CUTLASS_DEVICE void clear_mask() {
+    mask_.clear();
+  }
+
+  /// Efficiently enables all accesses guarded by mask
+  CUTLASS_DEVICE void enable_mask() {
+    mask_.enable();
+  }
+
+  /// Gets the mask
+  CUTLASS_DEVICE void get_mask(Mask &mask) {
+    mask = mask_;
+  }
+
+  /// Sets the mask
+  CUTLASS_DEVICE void set_mask(Mask const &mask) {
+    mask_ = mask;
+  }
+
+  /// Gets lower_extent_row_
+  CUTLASS_DEVICE Index get_lower_extent_row() {
+    return lower_extent_row_;
+  }
+
+  /// Gets upper_extent_row_
+  CUTLASS_DEVICE Index get_upper_extent_row() {
+    return upper_extent_row_;
+  }
+
+  /// Gets thread_start_row_
+  CUTLASS_DEVICE Index get_thread_start_row() {
+    return thread_start_row_;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_strided_dgrad.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_strided_dgrad.h
new file mode 100755
index 000000000..94b71b9b8
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_strided_dgrad.h
@@ -0,0 +1,479 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
+
+  The epilogue rearranges the result of a matrix product through shared memory to match canonical
+  tensor layouts in global memory. Epilogues support conversion and reduction operations.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/transform/pitch_linear_thread_map.h"
+#include "cutlass/epilogue/threadblock/output_tile_thread_map.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator_params.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace epilogue {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile iterator used to load and store output tile from global memory in epilogue.
+///
+/// Satisfies: ReadableTileIterator | PredicatedTileIterator | ForwardTileIterator
+///
+template <
+  typename ThreadMap_,       ///< Thread map (conept: OutputTileThreadMap)
+  typename Element_          ///< Element data type
+>
+class PredicatedTileIteratorStridedDgrad {
+public:
+  using ThreadMap = ThreadMap_;
+  using Shape = typename ThreadMap::Shape;
+
+  using Element = Element_;
+
+  using Layout = layout::RowMajor;
+  using TensorRef = TensorRef<Element, Layout>;
+  using ConstTensorRef = typename TensorRef::ConstTensorRef;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorCoord = MatrixCoord;
+
+  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
+  static int const kThreads = ThreadMap::kThreads;
+  static int const kIterations = ThreadMap::Count::kTile;
+
+  static_assert( ThreadMap::Iterations::kRow > 0,"ThreadMap::Iterations::kRow must be > 0");
+  static_assert( ThreadMap::Iterations::kGroup > 0,"ThreadMap::Iterations::kGroup must be > 0");
+  static_assert( ThreadMap::Iterations::kCluster > 0,"ThreadMap::Iterations::kCluster must be > 0");
+  static_assert( ThreadMap::Iterations::kColumn > 0,"ThreadMap::Iterations::kColumn must be > 0");
+
+  /// Fragment object
+  using Fragment = Array<
+    Element, 
+    ThreadMap::Iterations::kColumn * 
+    ThreadMap::Iterations::kRow * 
+    ThreadMap::Iterations::kGroup * 
+    ThreadMap::Iterations::kCluster * ThreadMap::kElementsPerAccess>;
+
+  /// Memory access size
+  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
+
+  //
+  // Parameters struct
+  //
+
+  /// Iterator parameters: the non-template base class plus strided-dgrad specific state
+  struct Params : PredicatedTileIteratorParams {
+
+    /// Convolution problem size
+    cutlass::conv::Conv2dProblemSize problem_size;
+    int tiled_rows_per_filter;  ///< tile_m_per_filter * threadblock_row, computed in the ctor below
+
+    CUTLASS_HOST_DEVICE
+    Params(): tiled_rows_per_filter(0) { }
+
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout, cutlass::conv::Conv2dProblemSize problem_size_, int threadblock_row): 
+      PredicatedTileIteratorParams(
+        layout.stride(0) * int(sizeof(AccessType)) / kElementsPerAccess,
+        make_OutputTileThreadMapDesc<ThreadMap>()
+      ),
+      problem_size(problem_size_)
+    {
+      // The base is listed first above to match the actual initialization order (base, then members)
+      int tile_m_per_filter = strided_dgrad_tile_m_per_filter(problem_size, threadblock_row);
+
+      tiled_rows_per_filter = tile_m_per_filter * threadblock_row;
+    }
+  };
+
+  /// Mask object: one boolean predicate per column iteration
+  struct Mask {
+
+    static int const kCount = ThreadMap::Iterations::kColumn;
+
+    /// Predicate state
+    bool predicates[kCount];
+
+    //
+    // Mask
+    //
+    CUTLASS_HOST_DEVICE
+    Mask() {
+      enable();
+    }
+
+    /// Efficiently disables all accesses guarded by mask
+    CUTLASS_HOST_DEVICE void clear() {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kCount; ++i) {
+        predicates[i] = false;
+      }
+    }
+
+    /// Efficiently enables all accesses guarded by mask (host+device: the host+device ctor calls it)
+    CUTLASS_HOST_DEVICE void enable() {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kCount; ++i) {
+        predicates[i] = true;
+      }
+    }
+  };
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Parameters structure containing reference and precomputed state.
+  Params params_;
+
+  /// Byte-level pointer
+  uint8_t *byte_pointer_;
+
+  /// Array of boolean values to contain steady-state predicates
+  Mask mask_;
+
+  /// Extent of the matrix tile in rows
+  Index extent_row_;
+
+  /// Starting Dx h and w dimension for strided dgrad mapping
+  int start_h_, start_w_;
+
+  /// Effective Dy P and Q dimensions for strided dgrad mapping
+  int p_, q_;
+
+  /// A thread's starting row position (assuming steady-state predicates have been computed)
+  Index thread_start_row_;
+
+  /// A thread's starting column position (assuming steady-state predicates have been computed)
+  Index thread_start_column_;
+
+  /// Internal state counter
+  int state_[3];
+ 
+  //
+  // Static asserts about internal strides
+  //
+
+  static_assert(sizeof(extent_row_) == 4, "Expected 32b extents");
+  static_assert(sizeof(thread_start_row_) == 4, "Expected 32b extents");
+  static_assert(sizeof(PredicatedTileIteratorParams::stride) == 8, "Expected 64b strides");
+
+private:
+
+  //
+  // Methods
+  //
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Constructor: derives the Dx starting coordinates, the effective P/Q extents and the column predicates
+  CUTLASS_DEVICE
+  PredicatedTileIteratorStridedDgrad(
+    Params const & params,
+    Element *pointer,
+    TensorCoord extent,
+    int thread_idx,
+    FastDivmod const &stride_h_divmod, FastDivmod const &stride_w_divmod,
+    int start_r, int start_s,
+    TensorCoord threadblock_offset = TensorCoord()
+  ): 
+    params_(params)
+  {
+
+    TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx) + threadblock_offset;
+
+    int r = start_r;
+    int s = start_s;
+
+    if (params_.problem_size.mode == cutlass::conv::Mode::kConvolution) {  // mirror the filter position in kConvolution mode
+      r = (params_.problem_size.R - 1 - r);
+      s = (params_.problem_size.S - 1 - s);
+    }
+
+    // compute starting coordinates in Dx start_h_ and start_w_
+    strided_dgrad_starting_coords(
+      params_.problem_size, 
+      stride_h_divmod, stride_w_divmod, 
+      r, s, 
+      start_h_, start_w_);
+    // p_ and q_: remaining extent (H - start_h_, W - start_w_) divided by the stride, rounding up
+    p_ = (params_.problem_size.H - start_h_ + params_.problem_size.stride_h - 1) / params_.problem_size.stride_h;
+    q_ = (params_.problem_size.W - start_w_ + params_.problem_size.stride_w - 1) / params_.problem_size.stride_w;
+
+    extent_row_ = extent.row();
+    thread_start_row_ = thread_offset.row();
+    thread_start_column_ = thread_offset.column();
+
+    // Initialize predicates: a column iteration is enabled only if it starts before extent.column()
+    CUTLASS_PRAGMA_UNROLL
+    for (int c = 0; c < ThreadMap::Iterations::kColumn; ++c) {
+
+      mask_.predicates[c] = ((thread_offset.column() 
+        + ThreadMap::Delta::kColumn * c) < extent.column());
+    }
+
+    // Null pointer performs no accesses
+    if (!pointer) {
+      mask_.clear();
+    }
+
+    // Initialize pointer
+    byte_pointer_ = reinterpret_cast<uint8_t *>(pointer);
+
+    // Initialize internal state counter
+    state_[0] = state_[1] = state_[2] = 0;
+  }
+
+  /// Adds a pointer offset in units of Element (scaled to bytes via sizeof_bits<Element>)
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    byte_pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+
+  /// Loads a fragment from memory, remapping each tile row to a Dx row; byte_offset is added to every address
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment &frag, int64_t byte_offset) {
+
+    uint8_t *byte_pointer = byte_pointer_;
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+
+          int frag_row_idx = 
+            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
+
+          int row_offset = row * ThreadMap::Delta::kRow 
+            + group * ThreadMap::Delta::kGroup 
+            + cluster * ThreadMap::Delta::kCluster;
+
+          // remapping rows to find the mapped_row_offset: wrap the row index by tiled_rows_per_filter
+          int npq_offset = (row_offset + thread_start_row_) % params_.tiled_rows_per_filter;
+
+          // (STEP 4.a) [order NHW rows to be loaded and stored in output Dx NHWxC layout]
+          int n = npq_offset / (p_ * q_); 
+          int residual = npq_offset % (p_ * q_);
+          int p = residual / q_;
+          int q = residual % q_;
+        
+          int mapped_row_offset = n * (params_.problem_size.H * params_.problem_size.W) +
+                                  (start_h_ + p * params_.problem_size.stride_h) * params_.problem_size.W +
+                                  (start_w_ + q * params_.problem_size.stride_w);
+          bool row_guard = mapped_row_offset < extent_row_;  // row predicate: mapped row must lie before extent_row_
+
+          int64_t row_byte_offset = mapped_row_offset * params_.stride;
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
+
+            int64_t column_byte_offset = (thread_start_column_ + column * ThreadMap::Delta::kColumn) * (sizeof_bits<Element>::value / 8);
+
+            bool guard = row_guard && mask_.predicates[column];  // combined row and column predicate passed to global_load
+
+            cutlass::arch::global_load<
+              AccessType, 
+              sizeof(AccessType)
+            >(
+                frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn +
+                         column],
+                (void *)(byte_pointer + row_byte_offset + column_byte_offset + byte_offset),
+                guard);
+          }
+        }
+      }
+    }
+  }
+
+
+  /// Loads a fragment from memory with a byte offset of zero
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+
+    load_with_byte_offset(frag, 0);
+  }
+
+  /// Stores a fragment to memory, remapping each tile row to a Dx row; byte_offset is added to every address
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const &frag, int64_t byte_offset) {
+    uint8_t *byte_pointer = byte_pointer_;
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+
+          int frag_row_idx = 
+            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
+
+          int row_offset = row * ThreadMap::Delta::kRow 
+            + group * ThreadMap::Delta::kGroup 
+            + cluster * ThreadMap::Delta::kCluster;
+
+          // remapping rows to find the mapped_row_offset: wrap the row index by tiled_rows_per_filter
+          int npq_offset = (row_offset + thread_start_row_) % params_.tiled_rows_per_filter;
+
+          // (STEP 4.a) [order NHW rows to be loaded and stored in output Dx NHWxC layout]
+          int n = npq_offset / (p_ * q_); 
+          int residual = npq_offset % (p_ * q_);
+          int p = residual / q_;
+          int q = residual % q_;
+        
+          int mapped_row_offset = n * (params_.problem_size.H * params_.problem_size.W) +
+                                  (start_h_ + p * params_.problem_size.stride_h) * params_.problem_size.W +
+                                  (start_w_ + q * params_.problem_size.stride_w);
+          bool row_guard = mapped_row_offset < extent_row_;  // row predicate: mapped row must lie before extent_row_
+
+          int64_t row_byte_offset = mapped_row_offset * params_.stride;
+          
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
+
+            int64_t column_byte_offset = (thread_start_column_ + column * ThreadMap::Delta::kColumn) * (sizeof_bits<Element>::value / 8);
+
+            bool guard = row_guard && mask_.predicates[column];  // combined row and column predicate passed to global_store
+
+            cutlass::arch::global_store<AccessType, sizeof(AccessType) >(
+                frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column],
+                (void *)(byte_pointer + row_byte_offset + column_byte_offset + byte_offset),
+                guard);            
+          }
+        }
+      }
+    }
+  }
+
+
+  /// Stores a fragment to memory with a byte offset of zero
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) {
+
+    store_with_byte_offset(frag, 0);
+  }
+
+  /// Advances to the next position to load or store; updates only thread_start_row_ and state_
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorStridedDgrad &operator++() {
+
+    ++state_[0];  // state_[0]: row counter, compared against Count::kRow below
+
+    thread_start_row_ += ThreadMap::Shape::kRow;
+    
+    if (state_[0] == ThreadMap::Count::kRow) {
+
+      state_[0] = 0;
+      ++state_[1];  // state_[1]: group counter, compared against Count::kGroup below
+
+      thread_start_row_ += (ThreadMap::Shape::kGroup - 1) * 
+        ThreadMap::Shape::kRow * ThreadMap::Count::kRow;
+
+      if (state_[1] == ThreadMap::Count::kGroup) {
+
+        state_[1] = 0;
+        ++state_[2];  // state_[2]: cluster counter, reset to 0 on reaching Count::kCluster
+
+        thread_start_row_ += ThreadMap::Count::kGroup * 
+          ThreadMap::Shape::kGroup * ThreadMap::Count::kRow * ThreadMap::Shape::kRow;
+
+        if (state_[2] == ThreadMap::Count::kCluster) {
+          state_[2] = 0;
+        }
+      }
+    }
+
+    return *this;
+  }
+
+  /// Efficiently disables all accesses guarded by mask
+  CUTLASS_DEVICE void clear_mask() {
+    mask_.clear();
+  }
+
+  /// Efficiently enables all accesses guarded by mask
+  CUTLASS_DEVICE void enable_mask() {
+    mask_.enable();
+  }
+
+  /// Gets the mask: copies the iterator's current mask into the caller-provided `mask`
+  CUTLASS_DEVICE void get_mask(Mask &mask) {
+    mask = mask_;
+  }
+
+  /// Sets the mask: replaces the iterator's mask with a copy of `mask`
+  CUTLASS_DEVICE void set_mask(Mask const &mask) {
+    mask_ = mask;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/shared_load_iterator.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/shared_load_iterator.h
new file mode 100755
index 000000000..ccdb4a9f8
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/shared_load_iterator.h
@@ -0,0 +1,223 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
+
+  The epilogue rearranges the result of a matrix product through shared memory to match canonical
+  tensor layouts in global memory. Epilogues support conversion and reduction operations.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/tensor_ref.h"
+
+#include "cutlass/epilogue/threadblock/output_tile_thread_map.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Tile iterator used to load output tile from shared memory in epilogue.
+///
+/// Satisfies: ReadableTileIterator
+///
+template <
+  typename ThreadMap_,       ///< Thread map (concept: OutputTileThreadMap)
+  typename Element_,         ///< Element data type
+  int MaxAlignment = ThreadMap_::kElementsPerAccess * sizeof_bits<Element_>::value / 8  ///< Upper bound on access alignment, in bytes
+>
+class SharedLoadIterator {
+public:
+  using ThreadMap = ThreadMap_;
+  using Shape = typename ThreadMap::TileShape;
+
+  using Element = Element_;
+
+  using Layout = layout::RowMajor;
+  using TensorRef = TensorRef<Element, Layout>;
+  using ConstTensorRef = typename TensorRef::ConstTensorRef;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorCoord = MatrixCoord;
+
+  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
+
+  static int const kMinAlignment = ThreadMap_::kElementsPerAccess * sizeof_bits<Element_>::value / 8;
+
+  static int const kAlignment = (MaxAlignment < kMinAlignment ? MaxAlignment : kMinAlignment);
+
+  static int const kThreads = ThreadMap::kThreads;
+
+  /// Fragment object
+  using Fragment = Array<
+    Element, 
+    ThreadMap::Iterations::kColumn * 
+    ThreadMap::Iterations::kRow * 
+    ThreadMap::Iterations::kGroup * 
+    ThreadMap::Iterations::kCluster * 
+    ThreadMap::kElementsPerAccess>;
+
+  /// Memory access size
+  using AccessType = AlignedArray<
+    Element, 
+    ThreadMap::kElementsPerAccess, 
+    kAlignment>;
+
+  /// Vector type used for SMEM loads (at most 128 bits of elements and 16-byte alignment)
+  using LoadType = AlignedArray<
+    Element,
+    const_min(128 / sizeof_bits<Element>::value, ThreadMap::kElementsPerAccess),
+    const_min(16, kAlignment)
+  >;
+
+  static int const kLoadsPerAccess = AccessType::kElements / LoadType::kElements;
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Byte-level pointer
+  uint8_t *byte_pointer_;
+
+  /// Stride along adjacent rows, in bytes
+  int stride_;
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Constructor
+  CUTLASS_DEVICE
+  SharedLoadIterator(
+    TensorRef ref,
+    int thread_idx
+  ):
+    byte_pointer_(reinterpret_cast<uint8_t *>(ref.data())),
+    stride_((ref.stride(0) * sizeof_bits<Element>::value) / 8) {
+
+    TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx);
+
+    // Initialize pointer: advance to this thread's initial (row, column) offset
+    byte_pointer_ +=
+      thread_offset.row() * stride_ + 
+      thread_offset.column() * sizeof(AccessType) / kElementsPerAccess;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    byte_pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+  /// Adds an offset in units of whole tiles (Shape::kRow rows by Shape::kColumn columns)
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &offset) {
+    byte_pointer_ += 
+      offset.row() * Shape::kRow * stride_ + 
+      offset.column() * Shape::kColumn * sizeof_bits<Element>::value / 8;
+  }
+
+  /// Loads a fragment from memory; pointer_offset is in units of Element
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
+
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+
+          uint8_t const *byte_pointer = byte_pointer_ + 
+            row * ThreadMap::Delta::kRow * stride_ + 
+            group * ThreadMap::Delta::kGroup* stride_ + 
+            cluster * ThreadMap::Delta::kCluster * stride_ +
+            pointer_offset * sizeof_bits<Element>::value / 8;
+
+          int frag_row_idx = 
+            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
+
+          LoadType *frag_ptr = reinterpret_cast<LoadType *>(&frag);
+          LoadType const *memory_pointer = reinterpret_cast<LoadType const *>(byte_pointer);
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
+            
+            int frag_idx = frag_row_idx * ThreadMap::Iterations::kColumn + column;
+
+            CUTLASS_PRAGMA_UNROLL
+            for (int v = 0; v < kLoadsPerAccess; ++v) {  // each access is split into kLoadsPerAccess LoadType copies
+              frag_ptr[frag_idx * kLoadsPerAccess + v] = 
+                memory_pointer[(column * ThreadMap::Delta::kColumn / kElementsPerAccess) * kLoadsPerAccess + v];
+            }
+          }
+        }
+      }
+    }
+  }
+
+  /// Set base smem address (no-op in this iterator: the body is empty)
+  CUTLASS_DEVICE
+  void set_smem_base_address(Index address) {
+  }
+
+  /// Loads a fragment with a pointer offset of zero
+  CUTLASS_DEVICE
+  void load(Fragment &frag) const {
+
+    load_with_pointer_offset(frag, 0);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/shared_load_iterator_mixed.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/shared_load_iterator_mixed.h
new file mode 100755
index 000000000..eef4d22bd
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/shared_load_iterator_mixed.h
@@ -0,0 +1,594 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops optimized for mixed-precision.
+
+  This assumes the shared memory tile is in a permuted layout which avoids bank conflicts on loading.
+
+  When the fragment is loaded into registers, it matches the row-major thread map assumed by
+  the predicated tile iterator writing to global memory.
+
+  The epilogue rearranges the result of a matrix product through shared memory to match canonical
+  tensor layouts in global memory. Epilogues support conversion and reduction operations.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/tensor_ref.h"
+
+#include "cutlass/epilogue/threadblock/output_tile_thread_map.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Tile iterator used to load output tile from shared memory in epilogue.
+///
+/// Satisfies: ReadableTileIterator
+///
+template <
+  typename ThreadMap_,       ///< Thread map (concept: OutputTileThreadMap)
+  typename Element_,         ///< Accumulator data type
+  int ElementSizeBits_,      ///< Size of accumulator in bits
+  int OutputSizeBits_,       ///< Size of output element in bits
+  int ElementsPerAccess,     ///< Vector length of output vector
+  int ContiguousLanes,       ///< Number of lanes in the warp writing to contiguous elements
+                             ///  in the global memory tensor
+  bool EightBitsOutputOrLess = (OutputSizeBits_ <= 8)  ///< Defaults to true when the output element is 8 bits or narrower
+>
+class SharedLoadIteratorMixed;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Tile iterator used to load output tile from shared memory in epilogue.
+/// Partial specialization: 32-bit accumulator, 16-bit output, 8 elements per access, 8 contiguous lanes.
+/// Satisfies: ReadableTileIterator
+///
+template <
+  typename ThreadMap_,       ///< Thread map (concept: OutputTileThreadMap)
+  typename Element_          ///< Accumulator data type
+>
+class SharedLoadIteratorMixed<ThreadMap_, Element_, 32, 16, 8, 8, false> {
+public:
+  using ThreadMap = ThreadMap_;
+  using Shape = typename ThreadMap::Shape;
+
+  using Element = Element_;
+
+  using Layout = layout::RowMajor;
+  using TensorRef = TensorRef<Element, Layout>;
+  using ConstTensorRef = typename TensorRef::ConstTensorRef;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorCoord = MatrixCoord;
+
+  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
+
+  static int const kAlignment = ThreadMap::kElementsPerAccess * sizeof_bits<Element_>::value / 8;
+
+  static int const kThreads = ThreadMap::kThreads;
+
+  /// Fragment object
+  using Fragment = Array<
+    Element, 
+    ThreadMap::Iterations::kColumn * 
+    ThreadMap::Iterations::kRow * 
+    ThreadMap::Iterations::kGroup * 
+    ThreadMap::Iterations::kCluster * 
+    ThreadMap::kElementsPerAccess>;
+
+  /// Memory access size
+  using AccessType = AlignedArray<
+    Element, 
+    ThreadMap::kElementsPerAccess, 
+    kAlignment>;
+
+  /// Vector type used for SMEM loads
+  using LoadType = AlignedArray<
+    Element,
+    const_min(128 / sizeof_bits<Element>::value, ThreadMap::kElementsPerAccess),
+    const_min(16, kAlignment)
+  >;
+
+  static int const kLoadsPerAccess = AccessType::kElements / LoadType::kElements;
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// One LoadType pointer per load within an access (kLoadsPerAccess entries)
+  LoadType const *pointers_[kLoadsPerAccess];
+
+  /// Stride along adjacent rows in units of LoadType
+  int stride_;
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Constructor
+  CUTLASS_DEVICE
+  SharedLoadIteratorMixed(
+    TensorRef ref,
+    int thread_idx
+  ):
+    stride_((ref.stride(0) / LoadType::kElements)) {
+
+    TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx);
+
+    // Initialize pointers: pointer i is rotated by (bank_offset + i) % kLoadsPerAccess within the access
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kLoadsPerAccess; ++i) {
+      pointers_[i] = reinterpret_cast<LoadType const *>(ref.data());
+
+      int col_idx = (thread_offset.column() / kElementsPerAccess) * kLoadsPerAccess;
+      int bank_offset = (col_idx * int(sizeof(LoadType)) / 128) % kLoadsPerAccess;
+
+      col_idx += (bank_offset + i) % kLoadsPerAccess;
+
+      pointers_[i] += thread_offset.row() * stride_ + col_idx;
+    }
+  }
+
+  /// Adds a pointer offset in units of Element (converted to LoadType units by integer division)
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kLoadsPerAccess; ++i) {
+      pointers_[i] += pointer_offset / LoadType::kElements;
+    }
+  }
+  /// Adds an offset in units of whole tiles (Shape::kRow rows by Shape::kColumn columns)
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &offset) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kLoadsPerAccess; ++i) {
+      pointers_[i] += 
+        offset.row() * Shape::kRow * stride_ + 
+        offset.column() * Shape::kColumn / LoadType::kElements;
+    }
+  }
+
+  /// Loads a fragment from memory; pointer_offset is in units of Element
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+
+          int row_ptr_offset =
+            row * ThreadMap::Delta::kRow * stride_ + 
+            group * ThreadMap::Delta::kGroup* stride_ + 
+            cluster * ThreadMap::Delta::kCluster * stride_ +
+            pointer_offset / LoadType::kElements;
+
+          int frag_row_idx = (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
+
+          LoadType *frag_ptr = reinterpret_cast<LoadType *>(&frag);
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
+            
+            int frag_idx = frag_row_idx * ThreadMap::Iterations::kColumn + column;
+
+            CUTLASS_PRAGMA_UNROLL
+            for (int v = 0; v < kLoadsPerAccess; ++v) {
+           
+              int vector_idx = (column * ThreadMap::Delta::kColumn / kElementsPerAccess * kLoadsPerAccess); 
+
+              LoadType const *memory_pointer = pointers_[v] + row_ptr_offset;
+            
+              frag_ptr[frag_idx * kLoadsPerAccess + v] = memory_pointer[vector_idx];
+            }
+          }
+        }
+      }
+    }
+  }
+
+  /// Set base smem address (no-op in this iterator: the body is empty)
+  CUTLASS_DEVICE
+  void set_smem_base_address(Index address) {}
+
+  /// Loads a fragment with a pointer offset of zero
+  CUTLASS_DEVICE
+  void load(Fragment &frag) const {
+
+    load_with_pointer_offset(frag, 0);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for
+///   int32_t x 16 => int8_t/int4b_t x 16 and
+///   float x 16 => float_e4m3_t/float_e5m2_t x 16
+template <
+  typename ThreadMap_,      ///< Thread map (concept: OutputTileThreadMap)
+  typename Element_,        ///< Accumulator data type; must be 32 bits wide (see static_assert below)
+  int OutputSizeBits_       ///< Size of output element in bits
+>
+class SharedLoadIteratorMixed<ThreadMap_, Element_, 32, OutputSizeBits_, 16, 8, true> {
+public:
+  using ThreadMap = ThreadMap_;
+  using Shape = typename ThreadMap::Shape;
+
+  using Element = Element_;
+  static_assert(sizeof_bits<Element>::value == 32, "Element size in bits must be 32.");
+
+  using Layout = layout::RowMajor;
+  using TensorRef = TensorRef<Element, Layout>;
+  using ConstTensorRef = typename TensorRef::ConstTensorRef;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorCoord = MatrixCoord;
+
+  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
+
+  static int const kAlignment = 16;
+
+  static int const kThreads = ThreadMap::kThreads;
+
+  /// Fragment object
+  using Fragment = Array<
+    Element, 
+    ThreadMap::Iterations::kColumn * 
+    ThreadMap::Iterations::kRow * 
+    ThreadMap::Iterations::kGroup * 
+    ThreadMap::Iterations::kCluster * 
+    ThreadMap::kElementsPerAccess>;
+
+  /// Memory access size
+  using AccessType = AlignedArray<
+    Element, 
+    16, 
+    kAlignment>;
+
+  /// Vector type used for SMEM loads
+  using LoadType = AlignedArray<
+    Element,
+    4,
+    16
+  >;
+
+  static int const kLoadsPerAccess = 4;
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// One LoadType pointer per load within an access (kLoadsPerAccess entries)
+  LoadType const *pointers_[kLoadsPerAccess];
+
+  /// Stride along adjacent rows in units of LoadType
+  int stride_;
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Constructor
+  CUTLASS_DEVICE
+  SharedLoadIteratorMixed(
+    TensorRef ref,
+    int thread_idx
+  ):
+    stride_((ref.stride(0) / LoadType::kElements)) {
+
+    TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx);
+    
+    // Initialize pointers
+    LoadType const *base_ptr = reinterpret_cast<LoadType const *>(ref.data()) + thread_offset.row() * stride_;
+      
+    int lane_col_idx = thread_offset.column() / 16;  // index of this thread's 16-element column group
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kLoadsPerAccess; ++i) {
+      int lane_offset = (lane_col_idx % 2) * 4 | ((lane_col_idx / 2) * 8) | ((lane_col_idx / 2) ^ i);
+ 
+      pointers_[i] = base_ptr + lane_offset;
+    }
+  }
+
+  /// Adds a pointer offset in units of Element (converted to LoadType units by integer division)
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kLoadsPerAccess; ++i) {
+      pointers_[i] += pointer_offset / LoadType::kElements;
+    }
+  }
+  /// Adds an offset in units of whole tiles (Shape::kRow rows by Shape::kColumn columns)
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &offset) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kLoadsPerAccess; ++i) {
+      pointers_[i] += 
+        offset.row() * Shape::kRow * stride_ + 
+        offset.column() * Shape::kColumn / LoadType::kElements;
+    }
+  }
+
+  /// Loads a fragment from memory; pointer_offset is in units of Element
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+
+          int row_ptr_offset =
+            row * ThreadMap::Delta::kRow * stride_ + 
+            group * ThreadMap::Delta::kGroup* stride_ + 
+            cluster * ThreadMap::Delta::kCluster * stride_ +
+            pointer_offset / LoadType::kElements;
+
+          int frag_row_idx = (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
+
+          LoadType *frag_ptr = reinterpret_cast<LoadType *>(&frag);
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
+            
+            int frag_idx = frag_row_idx * ThreadMap::Iterations::kColumn + column;
+
+            CUTLASS_PRAGMA_UNROLL
+            for (int v = 0; v < kLoadsPerAccess; ++v) {
+           
+              LoadType const *memory_pointer = pointers_[v];
+            
+              frag_ptr[frag_idx * kLoadsPerAccess + v] = memory_pointer[row_ptr_offset];
+            }
+          }
+        }
+      }
+    }
+  }
+
+  /// Set base smem address (no-op in this iterator: the body is empty)
+  CUTLASS_DEVICE
+  void set_smem_base_address(Index address) {}
+
+  /// Loads a fragment with a pointer offset of zero
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+
+    load_with_pointer_offset(frag, 0);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for:
+///   int32_t x 8 => int8_t/int4b_t x 8 and
+///   float x 8 => float_e4m3_t/float_e5m2_t x 8
+template <
+  typename ThreadMap_,      ///< Thread map (concept: OutputTileThreadMap)
+  typename Element_,        ///< Element type loaded by the iterator (must be 32 bits wide, see static_assert)
+  int OutputSizeBits_       ///< Output element size in bits (not referenced inside this class body)
+>
+class SharedLoadIteratorMixed<ThreadMap_, Element_, 32, OutputSizeBits_, 8, 8, true> {
+public:
+  using ThreadMap = ThreadMap_;
+  using Shape = typename ThreadMap::Shape;
+
+  using Element = Element_;
+  static_assert(sizeof_bits<Element>::value == 32, "Element size in bits must be 32.");
+
+  using Layout = layout::RowMajor;
+  using TensorRef = TensorRef<Element, Layout>;
+  using ConstTensorRef = typename TensorRef::ConstTensorRef;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorCoord = MatrixCoord;
+
+  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
+  /// Alignment passed to AccessType below
+  static int const kAlignment = 8;
+
+  static int const kThreads = ThreadMap::kThreads;
+
+  /// Fragment object
+  using Fragment = Array<
+    Element, 
+    ThreadMap::Iterations::kColumn * 
+    ThreadMap::Iterations::kRow * 
+    ThreadMap::Iterations::kGroup * 
+    ThreadMap::Iterations::kCluster * 
+    ThreadMap::kElementsPerAccess>;
+
+  /// Memory access size
+  using AccessType = AlignedArray<
+    Element, 
+    8, 
+    kAlignment>;
+
+  /// Vector type used for SMEM loads
+  using LoadType = AlignedArray<
+    Element,
+    4,
+    16
+  >;
+  /// Number of LoadType loads per AccessType (8 elements / 4 elements)
+  static int const kLoadsPerAccess = 2;
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Base pointers in units of LoadType, one for each of the kLoadsPerAccess loads
+  LoadType const *pointers_[kLoadsPerAccess];
+
+  /// Stride along adjacent rows in units of LoadType
+  int stride_;
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Constructor: derives the per-thread base pointers from ref and thread_idx
+  CUTLASS_DEVICE
+  SharedLoadIteratorMixed(
+    TensorRef ref,
+    int thread_idx
+  ):
+    stride_((ref.stride(0) / LoadType::kElements)) {
+
+    TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx);
+    
+    // Initialize pointers
+    LoadType const *base_ptr = reinterpret_cast<LoadType const *>(ref.data()) + thread_offset.row() * stride_;
+      
+    int lane_col_idx = thread_offset.column() / 8;
+    // The XOR with i makes the two lane offsets differ in bit 0; presumably this matches a swizzled smem layout - confirm.
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kLoadsPerAccess; ++i) {
+      int lane_offset = (lane_col_idx % 8) * 2 | ((lane_col_idx / 4) ^ i);
+
+      pointers_[i] = base_ptr + lane_offset;
+    }
+  }
+
+  /// Adds a pointer offset in units of Element (integer-divided by LoadType::kElements)
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kLoadsPerAccess; ++i) {
+      pointers_[i] += pointer_offset / LoadType::kElements;
+    }
+  }
+  /// Advances every pointer by a tile offset: rows scaled by Shape::kRow * stride_, columns by Shape::kColumn / LoadType::kElements
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &offset) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kLoadsPerAccess; ++i) {
+      pointers_[i] += 
+        offset.row() * Shape::kRow * stride_ + 
+        offset.column() * Shape::kColumn / LoadType::kElements;
+    }
+  }
+
+  /// Loads a fragment from memory; pointer_offset is integer-divided by LoadType::kElements before use
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+          // Index (in LoadType units) applied to each pointers_[v] for this (cluster, group, row) iteration.
+          int row_ptr_offset =
+            row * ThreadMap::Delta::kRow * stride_ + 
+            group * ThreadMap::Delta::kGroup* stride_ + 
+            cluster * ThreadMap::Delta::kCluster * stride_ +
+            pointer_offset / LoadType::kElements;
+          // Linear row index within the fragment: row varies fastest, then group, then cluster.
+          int frag_row_idx = (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
+          // View the fragment as an array of LoadType vectors.
+          LoadType *frag_ptr = reinterpret_cast<LoadType *>(&frag);
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
+            // NOTE(review): the address read below does not depend on 'column'; presumably Iterations::kColumn == 1 here - confirm.
+            int frag_idx = frag_row_idx * ThreadMap::Iterations::kColumn + column;
+
+            CUTLASS_PRAGMA_UNROLL
+            for (int v = 0; v < kLoadsPerAccess; ++v) {
+              // Each of the kLoadsPerAccess loads reads through its own base pointer.
+              LoadType const *memory_pointer = pointers_[v];
+            
+              frag_ptr[frag_idx * kLoadsPerAccess + v] = memory_pointer[row_ptr_offset];
+            }
+          }
+        }
+      }
+    }
+  }
+
+  /// Set base smem address (no-op in this iterator: the body is empty and the argument is ignored)
+  CUTLASS_DEVICE
+  void set_smem_base_address(Index address) {}
+
+  /// Loads a fragment with a zero pointer offset (forwards to load_with_pointer_offset)
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+
+    load_with_pointer_offset(frag, 0);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/shared_load_iterator_pitch_linear.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/shared_load_iterator_pitch_linear.h
new file mode 100755
index 000000000..5af6997ed
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/shared_load_iterator_pitch_linear.h
@@ -0,0 +1,194 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
+
+  This assumes the shared memory tile is in a permuted layout which avoids bank conflicts on loading.
+  
+  When the fragment is loaded into registers, it matches the row-major thread map assumed by
+  the predicated tile iterator writing to global memory.
+
+  The epilogue rearranges the result of a matrix product through shared memory to match canonical
+  tensor layouts in global memory. Epilogues support conversion and reduction operations.
+*/
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/epilogue/threadblock/output_tile_thread_map.h"
+#include "cutlass/transform/pitch_linear_thread_map.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/tensor_ref.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Tile iterator used to load output tile from shared memory in epilogue.
+///
+/// Satisfies: ReadableTileIterator
+///
+template <typename ThreadMap_,  ///< Thread map (concept: PitchLinearThreadMap)
+          typename Element_,    ///< Element data type
+          int MaxAlignment = ThreadMap_::kElementsPerAccess *sizeof_bits<Element_>::value / 8>
+class SharedLoadIteratorPitchLinear {
+ public:
+  using ThreadMap = ThreadMap_;
+  using Element = Element_;
+
+  using Layout = layout::RowMajor;
+  using TensorRef = TensorRef<Element, Layout>;
+  using ConstTensorRef = typename TensorRef::ConstTensorRef;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorCoord = MatrixCoord;
+
+  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
+  /// Size in bytes of one access of kElementsPerAccess elements
+  static int const kMinAlignment =
+      ThreadMap_::kElementsPerAccess * sizeof_bits<Element_>::value / 8;
+  /// Effective alignment: the smaller of MaxAlignment and kMinAlignment
+  static int const kAlignment = (MaxAlignment < kMinAlignment ? MaxAlignment : kMinAlignment);
+
+  static int const kThreads = ThreadMap::kThreads;
+
+  /// Fragment object
+  using Fragment = Array<Element, ThreadMap::Iterations::kCount * kElementsPerAccess>;
+
+  /// Memory access size
+  using AccessType = AlignedArray<Element, kElementsPerAccess, kAlignment>;
+
+  /// Vector type used for SMEM loads
+  using LoadType =
+      AlignedArray<Element,
+                   const_min(128 / sizeof_bits<Element>::value, ThreadMap::kElementsPerAccess),
+                   const_min(16, kAlignment)>;
+  /// Number of LoadType vectors that make up one AccessType
+  static int const kLoadsPerAccess = AccessType::kElements / LoadType::kElements;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Byte-level pointer
+  uint8_t *byte_pointer_;
+
+  /// Stride along adjacent rows, in bytes
+  int stride_;
+
+  /// Base address offset, added (in bytes) to every load address
+  Index base_smem_address_;
+
+ public:
+  //
+  // Methods
+  //
+
+  /// Constructor: positions byte_pointer_ at this thread's initial offset within ref
+  CUTLASS_DEVICE
+  SharedLoadIteratorPitchLinear(TensorRef ref, int thread_idx)
+      : byte_pointer_(reinterpret_cast<uint8_t *>(ref.data())),
+        stride_((ref.stride(0) * sizeof_bits<Element>::value) / 8),
+        base_smem_address_(0) {
+    TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx);
+
+    // Initialize pointer
+    // thread_offset.row() is contiguous dim
+    // thread_offset.column() is stride dim
+    byte_pointer_ += thread_offset.row() * sizeof(AccessType) / kElementsPerAccess+
+                     thread_offset.column() * stride_ ;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    byte_pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+  /// Advances the pointer by a tile offset: offset.row() along the contiguous dimension, offset.column() along the strided one
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &offset) {
+    byte_pointer_ +=
+        offset.row() * ThreadMap::StorageShape::kContiguous * sizeof(AccessType) / kElementsPerAccess +
+        offset.column() * ThreadMap::StorageShape::kStrided * stride_;
+  }
+
+  /// Loads a fragment from memory; pointer_offset is in units of Element
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        uint8_t const *byte_pointer =
+            byte_pointer_ + s * ThreadMap::Delta::kStrided * stride_ +
+            c * ThreadMap::Delta::kContiguous * ThreadMap::kElementsPerAccess *
+                sizeof_bits<Element>::value / 8 +
+            pointer_offset * sizeof_bits<Element>::value / 8 + base_smem_address_;
+        // Fragment slots are ordered with the contiguous iteration varying fastest.
+        int frag_base_idx = s * ThreadMap::Iterations::kContiguous + c;
+
+        LoadType *frag_ptr = reinterpret_cast<LoadType *>(&frag);
+
+        LoadType const *memory_pointer = reinterpret_cast<LoadType const *>(byte_pointer);
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kLoadsPerAccess; ++v) {
+          frag_ptr[frag_base_idx * kLoadsPerAccess + v] = memory_pointer[v];
+        }
+      }
+    }
+  }
+
+  /// Sets the base smem address offset that load_with_pointer_offset adds to every address
+  CUTLASS_DEVICE
+  void set_smem_base_address(Index address) { base_smem_address_ = address; }
+
+  /// Loads a fragment with a zero pointer offset
+  CUTLASS_DEVICE
+  void load(Fragment &frag) const { load_with_pointer_offset(frag, 0); }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace epilogue
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h
new file mode 100755
index 000000000..84a096c65
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h
@@ -0,0 +1,187 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief This defines a "fragment" iterator for visiting the fragments of an accumulator tile
+      that participate in one warp-level store operation.
+
+      Typically, the accumulator tile is the largest single block of register-backed storage 
+      within the kernel. Storing it to memory is best accomplished by partitioning it into
+      smaller tiles and storing these sequentially.
+
+      Round trips through shared memory during the Epilogue phase require partitioning, as
+      shared memory capacity is typically insufficient for a threadblock's total accumulator
+      size.
+*/
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/layout/matrix.h"
+
+#include "cutlass/epilogue/warp/tensor_op_policy.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace warp {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Primary template, declared only; see the partial specialization for layout::RowMajor below.
+template <
+  typename WarpShape,         ///< shape of warp-level GEMM (concept: MatrixShape)
+  typename OperatorShape,     ///< matrix multiply operation shape (concept: gemm::GemmShape)
+  typename OperatorElementC,  ///< matrix multiply operation data type (concept: data type)
+  typename OperatorFragmentC, ///< matrix multiply operation fragment (concept: Array)
+  typename Layout             ///< target shared memory layout
+>
+class FragmentIteratorComplexTensorOp;
+
+////////////////////////////////////////////////////////////////////////////////
+
+
+/// Partial specialization for row-major shared memory
+template <
+  typename WarpShape_,         ///< shape of the warp-level GEMM tile
+  typename OperatorShape_,     ///< underlying real-valued matrix multiply operation shape (concept: gemm::GemmShape)
+  typename OperatorElementC_,  ///< underlying real-valued matrix multiply operation data type
+  typename OperatorFragmentC_  ///< underlying real-valued matrix multiply operation fragment (concept: Array)
+>
+class FragmentIteratorComplexTensorOp<WarpShape_, OperatorShape_, OperatorElementC_, OperatorFragmentC_, layout::RowMajor> {
+public:
+
+  using WarpShape = WarpShape_;
+  using OperatorShape = OperatorShape_;
+  using OperatorElementC = OperatorElementC_;
+  using OperatorFragmentC = OperatorFragmentC_;
+  using Layout = layout::RowMajor;
+
+  using Policy = TensorOpPolicy<WarpShape, OperatorShape, Layout>;
+
+  /// This is the fragment size produced by one access of the iterator.
+  using Fragment = Array<
+    complex<OperatorElementC>, 
+    Policy::OperatorCount::kColumn * Policy::kElementsPerAccess>;
+  /// Offset of the real part within the accumulator tile
+  static int const kRealIndex = 0;
+
+  /// Offset, in OperatorElementC elements, of the imaginary part within the accumulator tile
+  static int const kImaginaryIndex = 
+    OperatorFragmentC::kElements * Policy::OperatorCount::kRow * Policy::OperatorCount::kColumn;
+
+  /// This is the complete warp-level accumulator tile (real part followed by imaginary part).
+  using AccumulatorTile = Array<OperatorElementC, 2 * kImaginaryIndex>;
+
+  /// Warp-level accumulator tile expressed as kImaginaryIndex complex-valued elements.
+  using OutputAccumulatorTile = Array<complex<OperatorElementC>, kImaginaryIndex>;
+
+  /// Number of times this iterator can be incremented
+  static int const kIterations = Policy::kIterations;
+
+private:
+
+  /// Internal access type
+  using AccessType = Array<OperatorElementC, Policy::kElementsPerAccess>;
+  /// Complex-valued counterpart of AccessType used to index the output fragment
+  using FragmentAccessType = Array<complex<OperatorElementC>, Policy::kElementsPerAccess>;
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Accumulator tile, viewed as an array of AccessType
+  AccessType const *accumulators_;
+
+  /// Internal index
+  int index_;
+
+public:
+
+  /// Constructs an iterator over accum, starting at index 0 (keeps a pointer to accum, does not copy it)
+  CUTLASS_HOST_DEVICE
+  FragmentIteratorComplexTensorOp(AccumulatorTile const &accum): 
+    accumulators_(reinterpret_cast<AccessType const *>(&accum)), 
+    index_(0) {
+
+  }
+
+  /// Increments
+  CUTLASS_HOST_DEVICE
+  FragmentIteratorComplexTensorOp &operator++() {
+    ++index_;
+    return *this;
+  }
+
+  /// Decrements
+  CUTLASS_HOST_DEVICE
+  FragmentIteratorComplexTensorOp &operator--() {
+    --index_;
+    return *this;
+  }
+
+  /// Loads a fragment from the referenced part of the accumulator tile
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag, int index_offset = 0) const {
+
+    int index = index_ + index_offset;
+
+    FragmentAccessType *frag_ptr = reinterpret_cast<FragmentAccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int n = 0; n < Policy::OperatorCount::kColumn; ++n) {
+
+      int accumulator_access_offset = 
+        index + n * Policy::kAccumulatorColumnStride / Policy::kElementsPerAccess;
+      // kImaginaryIndex counts elements, so it is converted to AccessType units; kRealIndex is 0 and needs no conversion.
+      auto const & real_accum_array = accumulators_[accumulator_access_offset + kRealIndex];
+      auto const & imag_accum_array = accumulators_[accumulator_access_offset + kImaginaryIndex / Policy::kElementsPerAccess];
+
+      // Pack real and imaginary parts into a structure. This is likely to result in MOVs
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < Policy::kElementsPerAccess; ++i) {
+
+        frag_ptr[n][i].real() = real_accum_array[i];
+        frag_ptr[n][i].imag() = imag_accum_array[i]; 
+      }
+    }
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h
new file mode 100755
index 000000000..13b00762e
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h
@@ -0,0 +1,194 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief This defines a "fragment" iterator for visiting the fragments of an accumulator tile
+      that participate in one warp-level store operation.
+
+      Typically, the accumulator tile is the largest single block of register-backed storage 
+      within the kernel. Storing it to memory is best accomplished by partitioning it into
+      smaller tiles and storing these sequentially.
+
+      Round trips through shared memory during the Epilogue phase require partitioning, as
+      shared memory capacity is typically insufficient for a threadblock's total accumulator
+      size.
+*/
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/layout/matrix.h"
+
+#include "cutlass/epilogue/warp/tensor_op_policy.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace warp {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Primary template, declared only; see the partial specialization for layout::RowMajor below.
+template <
+  typename WarpShape,         ///< shape of warp-level GEMM (concept: MatrixShape)
+  typename OperatorShape,     ///< matrix multiply operation shape (concept: gemm::GemmShape)
+  typename OperatorElementC,  ///< matrix multiply operation data type (concept: data type)
+  typename OperatorFragmentC, ///< matrix multiply operation fragment (concept: Array)
+  typename Layout             ///< target shared memory layout
+>
+class FragmentIteratorGaussianComplexTensorOp;
+
+////////////////////////////////////////////////////////////////////////////////
+
+
+/// Partial specialization for row-major shared memory
+template <
+  typename WarpShape_,         ///< shape of the warp-level GEMM tile
+  typename OperatorShape_,     ///< underlying real-valued matrix multiply operation shape (concept: gemm::GemmShape)
+  typename OperatorElementC_,  ///< underlying real-valued matrix multiply operation data type
+  typename OperatorFragmentC_  ///< underlying real-valued matrix multiply operation fragment (concept: Array)
+>
+class FragmentIteratorGaussianComplexTensorOp<WarpShape_, OperatorShape_, OperatorElementC_, OperatorFragmentC_, layout::RowMajor> {
+public:
+
+  using WarpShape = WarpShape_;
+  using OperatorShape = OperatorShape_;
+  using OperatorElementC = OperatorElementC_;
+  using OperatorFragmentC = OperatorFragmentC_;
+  using Layout = layout::RowMajor;
+
+  using Policy = TensorOpPolicy<WarpShape, OperatorShape, Layout>;
+
+  /// This is the fragment size produced by one access of the iterator.
+  using Fragment = Array<
+    complex<OperatorElementC>, 
+    Policy::OperatorCount::kColumn * Policy::kElementsPerAccess>;
+
+  /// Size of one part of accumulator of 3-part accumulator in units of number of OperatorElementC
+  static int const kElementsAccumulatorPerPart = 
+    OperatorFragmentC::kElements * Policy::OperatorCount::kRow * Policy::OperatorCount::kColumn;
+
+  /// Offset into the accumulator fragment part 1
+  static int const kPart1Index = kElementsAccumulatorPerPart * 0;
+
+  /// Offset into the accumulator fragment part 2
+  static int const kPart2Index = kElementsAccumulatorPerPart * 1;
+
+  /// Offset into the accumulator fragment part 3
+  static int const kPart3Index = kElementsAccumulatorPerPart * 2;
+
+  /// This is the complete warp-level accumulator tile holding part1, part2, and part3
+  using AccumulatorTile = Array<OperatorElementC, kElementsAccumulatorPerPart * 3>;
+
+  /// This is the complete warp-level accumulator tile holding final output of complex<T> type 
+  using OutputAccumulatorTile = Array<complex<OperatorElementC>, kElementsAccumulatorPerPart>;
+
+  /// Number of times this iterator can be incremented
+  static int const kIterations = Policy::kIterations;
+
+private:
+
+  /// Internal access type
+  using AccessType = Array<OperatorElementC, Policy::kElementsPerAccess>;
+  /// Complex-valued counterpart of AccessType used to index the output fragment
+  using FragmentAccessType = Array<complex<OperatorElementC>, Policy::kElementsPerAccess>;
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Accumulator tile, viewed as an array of AccessType
+  AccessType const *accumulators_;
+
+  /// Internal index
+  int index_;
+
+public:
+
+  /// Constructs an iterator over accum, starting at index 0 (keeps a pointer to accum, does not copy it)
+  CUTLASS_HOST_DEVICE
+  FragmentIteratorGaussianComplexTensorOp(AccumulatorTile const &accum): 
+    accumulators_(reinterpret_cast<AccessType const *>(&accum)), 
+    index_(0) {
+  }
+
+  /// Increments
+  CUTLASS_HOST_DEVICE
+  FragmentIteratorGaussianComplexTensorOp &operator++() {
+    ++index_;
+    return *this;
+  }
+
+  /// Decrements
+  CUTLASS_HOST_DEVICE
+  FragmentIteratorGaussianComplexTensorOp &operator--() {
+    --index_;
+    return *this;
+  }
+
+  /// Loads a fragment from the referenced part of the accumulator tile
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag, int index_offset = 0) const {
+
+    int index = index_ + index_offset;
+
+    FragmentAccessType *frag_ptr = reinterpret_cast<FragmentAccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int n = 0; n < Policy::OperatorCount::kColumn; ++n) {
+
+      int accumulator_access_offset = 
+        index + n * Policy::kAccumulatorColumnStride / Policy::kElementsPerAccess;
+      // Part offsets count elements, so parts 2 and 3 are converted to AccessType units; kPart1Index is 0 and needs no conversion.
+      auto const & part1_accum_array = accumulators_[accumulator_access_offset + kPart1Index];
+      auto const & part2_accum_array = accumulators_[accumulator_access_offset + kPart2Index / Policy::kElementsPerAccess];
+      auto const & part3_accum_array = accumulators_[accumulator_access_offset + kPart3Index / Policy::kElementsPerAccess];
+
+      // Combine the three parts into complex values: real = part1 - part3, imag = part1 + part2
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < Policy::kElementsPerAccess; ++i) {
+
+        frag_ptr[n][i].real() = part1_accum_array[i] - part3_accum_array[i];
+        frag_ptr[n][i].imag() = part1_accum_array[i] + part2_accum_array[i]; 
+      }
+    }
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_simt.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_simt.h
new file mode 100755
index 000000000..92d3bf582
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_simt.h
@@ -0,0 +1,164 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief This defines a "fragment" iterator for visiting the fragments of an accumulator tile
+      that participate in one warp-level store operation.
+
+      Typically, the accumulator tile is the largest single block of register-backed storage 
+      within the kernel. Storing it to memory is best accomplished by partitioning it into
+      smaller tiles and storing these sequentially.
+
+      Round trips through shared memory during the Epilogue phase require partitioning, as
+      shared memory capacity is typically insufficient for a threadblock's total accumulator
+      size.
+*/
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/layout/matrix.h"
+
+#include "cutlass/epilogue/warp/simt_policy.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace warp {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Fragment iterator for SIMT accumulator arrangements (primary template: declared only, a layout-specific partial specialization supplies the definition)
+template <
+  typename WarpShape,             ///< shape of warp-level GEMM (concept: MatrixShape)
+  typename Operator,              ///< matrix multiply operation (concept: arch::Mma)
+  typename Layout,                ///< target shared memory layout
+  typename MmaSimtPolicy          ///< policy defining lane arrangement (concept: MmaSimtPolicy)
+>
+class FragmentIteratorSimt;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for row-major shared memory
+template <
+  typename WarpShape_,     ///< shape of the warp-level GEMM tile
+  typename Operator_ ,     ///< matrix multiply operator (concept: arch::Mma)
+  typename MmaSimtPolicy_  ///< policy defining lane arrangement (concept: MmaSimtPolicy)
+>
+class FragmentIteratorSimt<WarpShape_, Operator_, layout::RowMajor, MmaSimtPolicy_> {
+public:
+
+  using WarpShape = WarpShape_;
+  using Operator = Operator_;
+  using Layout = layout::RowMajor;
+
+  /// Policy for warp-level epilogue components
+  using Policy = SimtPolicy<WarpShape, Operator, Layout, MmaSimtPolicy_>;
+
+  /// This is the fragment size produced by one access of the iterator.
+  using Fragment = Array<
+    typename Operator::ElementC,
+    Policy::kElementsPerIteration>;
+
+  /// This is the complete warp-level accumulator tile.
+  using AccumulatorTile = Array<
+    typename Operator::ElementC,
+    Policy::kAccumulatorElementCount>;
+
+  using OutputAccumulatorTile = AccumulatorTile;
+
+  /// Number of times this iterator can be incremented
+  static int const kIterations = Policy::kIterations;
+
+private:
+
+  /// Internal access type
+  using AccessType = Array<typename Operator::ElementC, Policy::kElementsPerAccess>;
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Accumulator tile, viewed as an array of AccessType
+  AccessType const *accumulators_;
+
+  /// Internal index: increments minus decrements applied since construction
+  int index_;
+
+public:
+
+  /// Constructs an iterator over `accum`, positioned at index 0.
+  /// Only the address of `accum` is stored, so the tile must outlive the iterator.
+  CUTLASS_HOST_DEVICE
+  FragmentIteratorSimt(AccumulatorTile const &accum):
+    accumulators_(reinterpret_cast<AccessType const *>(&accum)),
+    index_(0) {
+  }
+
+  /// Pre-increment: advances the internal index by one and returns this iterator
+  CUTLASS_HOST_DEVICE
+  FragmentIteratorSimt &operator++() {
+    ++index_;
+    return *this;
+  }
+
+  /// Pre-decrement: lowers the internal index by one and returns this iterator
+  CUTLASS_HOST_DEVICE
+  FragmentIteratorSimt &operator--() {
+    --index_;
+    return *this;
+  }
+
+  /// Loads into `frag` the fragment at (internal index + index_offset); the iterator itself is not modified
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag, int index_offset = 0) const {
+    // Apply the caller's offset; it used to be accepted but ignored, unlike the tensor-op iterators.
+    int index = index_ + index_offset;
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int n = 0; n < Policy::kAccessesPerIteration; ++n) {
+      // Each index position covers kAccessesPerIteration consecutive accesses.
+      int accumulator_access_offset = index * Policy::kAccessesPerIteration + n;
+      frag_ptr[n] = accumulators_[accumulator_access_offset];
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_tensor_op.h
new file mode 100755
index 000000000..a69f0fd25
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_tensor_op.h
@@ -0,0 +1,378 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief This defines a "fragment" iterator for visiting the fragments of an accumulator tile
+      that participate in one warp-level store operation.
+
+      Typically, the accumulator tile is the largest single block of register-backed storage 
+      within the kernel. Storing it to memory is best accomplished by partitioning it into
+      smaller tiles and storing these sequentially.
+
+      Round trips through shared memory during the Epilogue phase require partitioning, as
+      shared memory capacity is typically insufficient for a threadblock's total accumulator
+      size.
+*/
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/layout/matrix.h"
+
+#include "cutlass/epilogue/warp/tensor_op_policy.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace warp {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Fragment iterator for tensor-op accumulator arrangements (primary template: declared only, layout-specific partial specializations supply the definitions)
+template <
+  typename WarpShape,         ///< shape of warp-level GEMM (concept: MatrixShape)
+  typename OperatorShape,     ///< matrix multiply operation shape (concept: gemm::GemmShape)
+  typename OperatorElementC,  ///< matrix multiply operation data type (concept: data type)
+  typename OperatorFragmentC, ///< matrix multiply operation fragment (concept: Array)
+  typename Layout             ///< target shared memory layout
+>
+class FragmentIteratorTensorOp;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Row-major specialization: each load gathers one access per operator column
+template <
+  typename WarpShape_,         ///< warp-level GEMM tile shape
+  typename OperatorShape_,     ///< shape of one matrix multiply operation (concept: gemm::GemmShape)
+  typename OperatorElementC_,  ///< element type produced by the matrix multiply operation
+  typename OperatorFragmentC_  ///< fragment produced by the matrix multiply operation (concept: Array)
+>
+class FragmentIteratorTensorOp<WarpShape_, OperatorShape_, OperatorElementC_, OperatorFragmentC_, layout::RowMajor> {
+public:
+
+  using WarpShape = WarpShape_;
+  using OperatorShape = OperatorShape_;
+  using OperatorElementC = OperatorElementC_;
+  using OperatorFragmentC = OperatorFragmentC_;
+  using Layout = layout::RowMajor;
+
+  using Policy = TensorOpPolicy<WarpShape, OperatorShape, Layout>;
+
+  /// Fragment yielded by a single load(): one access for every operator column.
+  using Fragment = Array<
+    OperatorElementC,
+    Policy::OperatorCount::kColumn * Policy::kElementsPerAccess>;
+
+  /// Full warp-level accumulator tile the iterator reads from.
+  using AccumulatorTile = Array<
+    OperatorElementC,
+    OperatorFragmentC::kElements * Policy::OperatorCount::kRow * Policy::OperatorCount::kColumn>;
+
+  using OutputAccumulatorTile = AccumulatorTile;
+
+  /// Iteration counts taken from the policy
+  static int const kIterations = Policy::kIterations;
+  using TileIterations = typename Policy::TileIterations;
+  static int const kIterationsPerTile = kIterations / TileIterations::kCount;
+
+private:
+
+  /// Unit of transfer between the accumulator tile and a fragment
+  using AccessType = Array<OperatorElementC, Policy::kElementsPerAccess>;
+
+private:
+
+  //
+  // State
+  //
+
+  /// Accumulator tile viewed as an array of accesses
+  AccessType const *accumulators_;
+
+  /// Current position
+  int index_;
+
+public:
+
+  /// Binds the iterator to an accumulator tile, starting at position zero
+  CUTLASS_HOST_DEVICE
+  FragmentIteratorTensorOp(AccumulatorTile const &accum)
+    : accumulators_(reinterpret_cast<AccessType const *>(&accum)),
+      index_(0) {
+  }
+
+  /// Advances to the next position
+  CUTLASS_HOST_DEVICE
+  FragmentIteratorTensorOp &operator++() {
+    index_ += 1;
+    return *this;
+  }
+
+  /// Steps back to the previous position
+  CUTLASS_HOST_DEVICE
+  FragmentIteratorTensorOp &operator--() {
+    index_ -= 1;
+    return *this;
+  }
+
+  /// Copies the fragment at (current position + index_offset) out of the accumulator tile
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag, int index_offset = 0) const {
+
+    int const position = index_ + index_offset;
+
+    AccessType *dst = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int column = 0; column < Policy::OperatorCount::kColumn; ++column) {
+
+      // The per-column stride is divided by kElementsPerAccess to index whole accesses.
+      int const src =
+        position + column * Policy::kAccumulatorColumnStride / Policy::kElementsPerAccess;
+      dst[column] = accumulators_[src];
+    }
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization targeting column-major shared memory
+/// Only works for 168x tensor core kernels
+template <
+  typename WarpShape_,         ///< warp-level GEMM tile shape
+  typename OperatorShape_,     ///< shape of one matrix multiply operation (concept: gemm::GemmShape)
+  typename OperatorElementC_,  ///< element type produced by the matrix multiply operation
+  typename OperatorFragmentC_  ///< fragment produced by the matrix multiply operation (concept: Array)
+>
+class FragmentIteratorTensorOp<WarpShape_, OperatorShape_, OperatorElementC_, OperatorFragmentC_, layout::ColumnMajor> {
+public:
+
+  using WarpShape = WarpShape_;
+  using OperatorShape = OperatorShape_;
+  using OperatorElementC = OperatorElementC_;
+  using OperatorFragmentC = OperatorFragmentC_;
+  using Layout = layout::ColumnMajor;
+
+  using Policy = TensorOpPolicy<WarpShape, OperatorShape, Layout>;
+
+  /// Fragment yielded by a single load().
+  using Fragment = Array<
+    OperatorElementC,
+    4 * Policy::OperatorCount::kRow * Policy::kElementsPerAccess>;
+
+  /// Full warp-level accumulator tile the iterator reads from.
+  using AccumulatorTile = Array<
+    OperatorElementC,
+    OperatorFragmentC::kElements * Policy::OperatorCount::kRow * Policy::OperatorCount::kColumn>;
+
+  using OutputAccumulatorTile = AccumulatorTile;
+
+  /// Iteration counts taken from the policy
+  static int const kIterations = Policy::kIterations;
+  using TileIterations = typename Policy::TileIterations;
+  static int const kIterationsPerTile = kIterations / TileIterations::kCount;
+
+private:
+
+  /// Unit of transfer between the accumulator tile and a fragment
+  using AccessType = Array<OperatorElementC, Policy::kElementsPerAccess>;
+
+private:
+
+  //
+  // State
+  //
+
+  /// Accumulator tile viewed as an array of accesses
+  AccessType const *accumulators_;
+
+  /// Current position
+  int index_;
+
+public:
+
+  /// Binds the iterator to an accumulator tile, starting at position zero
+  CUTLASS_HOST_DEVICE
+  FragmentIteratorTensorOp(AccumulatorTile const &accum)
+    : accumulators_(reinterpret_cast<AccessType const *>(&accum)),
+      index_(0) {
+  }
+
+  /// Advances to the next position
+  CUTLASS_HOST_DEVICE
+  FragmentIteratorTensorOp &operator++() {
+    index_ += 1;
+    return *this;
+  }
+
+  /// Steps back to the previous position
+  CUTLASS_HOST_DEVICE
+  FragmentIteratorTensorOp &operator--() {
+    index_ -= 1;
+    return *this;
+  }
+
+  /// Copies the fragment at (current position + index_offset) out of the accumulator tile
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag, int index_offset = 0) const {
+
+    int const position = index_ + index_offset;
+    int const kAccessesPerStep = Policy::OperatorCount::kRow * 2;
+    AccessType *dst = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int step = 0; step < Policy::kAccumulatorRowStride; ++step) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int row = 0; row < kAccessesPerStep; ++row) {
+
+        int const src =
+          position * Policy::kAccumulatorColumnStride + row * Policy::kAccumulatorRowStride / Policy::kElementsPerAccess + step;
+        // Destination is filled step-major: kAccessesPerStep accesses per step.
+        dst[row + step * kAccessesPerStep] = accumulators_[src];
+      }
+    }
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Dedicated to interleaved layout
+template <
+    /// shape of the warp-level GEMM tile
+    typename WarpShape_,
+    /// matrix multiply operator shape (concept: gemm::GemmShape)
+    typename OperatorShape_,
+    /// matrix multiply operator data type (concept: data type)
+    typename OperatorElementC_,
+    /// matrix multiply operator fragment (concept: Array)
+    typename OperatorFragmentC_,
+    /// number of interleaved k
+    int InterleavedK>
+class FragmentIteratorTensorOp<WarpShape_, OperatorShape_, OperatorElementC_, OperatorFragmentC_,
+                               layout::ColumnMajorInterleaved<InterleavedK>> {
+ public:
+  using WarpShape = WarpShape_;
+  using OperatorShape = OperatorShape_;
+  using OperatorElementC = OperatorElementC_;
+  using OperatorFragmentC = OperatorFragmentC_;
+  static int const kInterleavedK = InterleavedK;
+  using Layout = layout::ColumnMajorInterleaved<kInterleavedK>;
+
+  using Policy = TensorOpPolicy<WarpShape, OperatorShape, Layout>;
+
+  /// This is the fragment size produced by one access of the iterator.
+  using Fragment =
+      Array<OperatorElementC,
+            Policy::kElementsPerAccess * InterleavedK / OperatorShape::kN>;
+
+  /// This is the complete warp-level accumulator tile.
+  using AccumulatorTile =
+      Array<OperatorElementC, OperatorFragmentC::kElements *
+                                  Policy::OperatorCount::kRow *
+                                  Policy::OperatorCount::kColumn>;
+
+  using OutputAccumulatorTile = AccumulatorTile;  ///< same alias the RowMajor/ColumnMajor specializations expose
+
+  /// Number of times this iterator can be incremented
+  static int const kIterations = Policy::kIterations;
+  using TileIterations = typename Policy::TileIterations;
+  static int const kIterationsPerTile = kIterations / TileIterations::kCount;
+
+ private:
+  /// Internal access type
+  using AccessType =
+      Array<OperatorElementC, Policy::kElementsPerAccess>;
+
+  //
+  // Data members
+  //
+  /// Accumulator tile, viewed as an array of AccessType
+  AccessType const *accumulators_;
+
+  /// Internal index: increments minus decrements applied since construction
+  int index_;
+
+ public:
+  /// Constructs an iterator at index 0; only the address of `accum` is stored, so it must outlive the iterator
+  CUTLASS_HOST_DEVICE
+  FragmentIteratorTensorOp(AccumulatorTile const &accum)
+      : accumulators_(reinterpret_cast<AccessType const *>(&accum)),
+        index_(0) {}
+
+  /// Pre-increment: advances the internal index by one and returns this iterator
+  CUTLASS_HOST_DEVICE
+  FragmentIteratorTensorOp &operator++() {
+    ++index_;
+    return *this;
+  }
+
+  /// Pre-decrement: lowers the internal index by one and returns this iterator
+  CUTLASS_HOST_DEVICE
+  FragmentIteratorTensorOp &operator--() {
+    --index_;
+    return *this;
+  }
+
+  /// Loads into `frag` the fragment at (internal index + index_offset); the iterator itself is not modified
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag, int index_offset = 0) const {
+    int index = index_ + index_offset;
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    // Split the index into a remainder/quotient pair; neither depends on n, so compute them once.
+    int index_m = index % (Policy::OperatorCount::kRow *
+                           Policy::kIterationsPerInstruction);
+    int index_n = index / (Policy::OperatorCount::kRow *
+                           Policy::kIterationsPerInstruction);
+    // Access offset for n == 0; each further n adds kIterationsPerInstruction accesses.
+    int base_access_offset =
+        (index_m / Policy::kIterationsPerInstruction) *
+            (Policy::OperatorCount::kColumn *
+             Policy::kIterationsPerInstruction) +
+        (index_m % Policy::kIterationsPerInstruction) +
+        index_n * (InterleavedK / OperatorShape::kN) *
+            Policy::kIterationsPerInstruction;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int n = 0; n < (InterleavedK / OperatorShape::kN); ++n) {
+      frag_ptr[n] = accumulators_[base_access_offset + n * Policy::kIterationsPerInstruction];
+    }
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_volta_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_volta_tensor_op.h
new file mode 100755
index 000000000..4979a3803
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_volta_tensor_op.h
@@ -0,0 +1,269 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief This defines a "fragment" iterator for visiting the fragments of an accumulator tile
+      that participate in one warp-level store operation.
+
+      Typically, the accumulator tile is the largest single block of register-backed storage 
+      within the kernel. Storing it to memory is best accomplished by partitioning it into
+      smaller tiles and storing these sequentially.
+
+      Round trips through shared memory during the Epilogue phase require partitioning, as
+      shared memory capacity is typically insufficient for a threadblock's total accumulator
+      size.
+*/
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/epilogue/warp/volta_tensor_op_policy.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace warp {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Fragment iterator for Volta tensor-op accumulators (primary template: declared only, partial specializations supply the definitions)
+template <
+  typename WarpShape,             ///< shape of warp-level GEMM (concept: MatrixShape)
+  typename InterleavedTileShape,  ///< shape of indivisible instruction-level arrangement (concept: GemmShape)
+  typename ElementC,              ///< Accumulator element type (the specializations use half_t and float)
+  typename Layout                 ///< target shared memory layout
+>
+class FragmentIteratorVoltaTensorOp;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for row-major shared memory with half_t accumulators
+template <
+  typename WarpShape_         ///< shape of warp-level GEMM (concept: MatrixShape)
+>
+class FragmentIteratorVoltaTensorOp<WarpShape_, gemm::GemmShape<32, 32, 4>, half_t, layout::RowMajor> {
+public:
+
+  using WarpShape = WarpShape_;
+  using InterleavedTileShape = gemm::GemmShape<32, 32, 4>;
+  using ElementC = half_t;
+  using Layout = layout::RowMajor;
+
+  /// Policy operator
+  using Policy = VoltaTensorOpPolicy<WarpShape, InterleavedTileShape, ElementC, Layout>;
+
+  /// Array type for aligned memory accesses
+  using AccessType = typename Policy::AccessType;
+
+  /// This is the fragment size produced by one access of the iterator.
+  using Fragment = typename Policy::Fragment;
+
+  /// This is the complete warp-level accumulator tile.
+  using AccumulatorTile = typename Policy::AccumulatorTile;
+
+  using OutputAccumulatorTile = AccumulatorTile;
+
+  /// Number of times this iterator can be incremented
+  static int const kIterations = Policy::kIterations;
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Accumulator tile, viewed as an array of AccessType
+  AccessType const *accumulators_;
+
+  /// Internal index: increments minus decrements applied since construction
+  int index_;
+
+public:
+
+  /// Constructs an iterator over `accum`, positioned at index 0.
+  /// Only the address of `accum` is stored, so the tile must outlive the iterator.
+  CUTLASS_HOST_DEVICE
+  FragmentIteratorVoltaTensorOp(AccumulatorTile const &accum):
+    accumulators_(reinterpret_cast<AccessType const *>(&accum)),
+    index_(0) {
+  }
+
+  /// Pre-increment: advances the internal index by one and returns this iterator
+  CUTLASS_HOST_DEVICE
+  FragmentIteratorVoltaTensorOp &operator++() {
+    ++index_;
+    return *this;
+  }
+
+  /// Pre-decrement: lowers the internal index by one and returns this iterator
+  CUTLASS_HOST_DEVICE
+  FragmentIteratorVoltaTensorOp &operator--() {
+    --index_;
+    return *this;
+  }
+
+  /// Loads into `frag` the fragment at (internal index + index_offset).
+  /// The iterator itself is not modified. Only bits 0 and 1 of the effective
+  /// index take part in the address computation below.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag, int index_offset = 0) const {
+    // Apply the caller's offset; it used to be accepted but ignored.
+    int index = index_ + index_offset;
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+    static int const kAccessesPerMma = Policy::kElementsPerMma / Policy::kElementsPerAccess;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int tile_n = 0; tile_n < Policy::TileIterations::kColumn; ++tile_n) {
+      // Bit 1 of the index is added to the tile-row term.
+      int tile_access_idx =
+        (tile_n * Policy::TileIterations::kRow + (index & 2) / 2) * Policy::MmaIterations::kCount * kAccessesPerMma;
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn * kAccessesPerMma; ++mma_n) {
+        // Bit 0 of the index and both low bits of mma_n select the access within the tile.
+        int mma_access_idx = ((mma_n & 1) * 2 + (index & 1)) * kAccessesPerMma + (mma_n & 2) / 2;
+
+        frag_ptr[tile_n * Policy::MmaIterations::kColumn * kAccessesPerMma +
+          mma_n] = accumulators_[tile_access_idx + mma_access_idx];
+      }
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for row-major shared memory with float accumulators
+template <
+  typename WarpShape_         ///< shape of warp-level GEMM (concept: MatrixShape)
+>
+class FragmentIteratorVoltaTensorOp<WarpShape_, gemm::GemmShape<32, 32, 4>, float, layout::RowMajor> {
+public:
+
+  using WarpShape = WarpShape_;
+  using InterleavedTileShape = gemm::GemmShape<32, 32, 4>;
+  using ElementC = float;
+  using Layout = layout::RowMajor;
+
+  /// Policy operator
+  using Policy = VoltaTensorOpPolicy<WarpShape, InterleavedTileShape, ElementC, Layout>;
+
+  /// Array type for aligned memory accesses
+  using AccessType = typename Policy::AccessType;
+
+  /// This is the fragment size produced by one access of the iterator.
+  using Fragment = typename Policy::Fragment;
+
+  /// This is the complete warp-level accumulator tile.
+  using AccumulatorTile = typename Policy::AccumulatorTile;
+
+  using OutputAccumulatorTile = AccumulatorTile;  ///< same alias the half_t specialization exposes
+
+  /// Number of times this iterator can be incremented
+  static int const kIterations = Policy::kIterations;
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Accumulator tile, viewed as an array of AccessType
+  AccessType const *accumulators_;
+
+  /// Internal index: increments minus decrements applied since construction
+  int index_;
+
+public:
+
+  /// Constructs an iterator at index 0; only the address of `accum` is stored, so it must outlive the iterator
+  CUTLASS_HOST_DEVICE
+  FragmentIteratorVoltaTensorOp(AccumulatorTile const &accum):
+    accumulators_(reinterpret_cast<AccessType const *>(&accum)),
+    index_(0) {
+  }
+
+  /// Pre-increment: advances the internal index by one and returns this iterator
+  CUTLASS_HOST_DEVICE
+  FragmentIteratorVoltaTensorOp &operator++() {
+    ++index_;
+    return *this;
+  }
+
+  /// Pre-decrement: lowers the internal index by one and returns this iterator
+  CUTLASS_HOST_DEVICE
+  FragmentIteratorVoltaTensorOp &operator--() {
+    --index_;
+    return *this;
+  }
+
+  /// Loads into `frag` the fragment at (internal index + index_offset); the iterator itself is not modified
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag, int index_offset = 0) const {
+    // Apply the caller's offset; it used to be accepted but ignored.
+    int index = index_ + index_offset;
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+    int const kRegsPerMmaRow = 2;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int reg_row = 0; reg_row < Policy::kRowsPerMmaTile; ++reg_row) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int tile_n = 0; tile_n < Policy::TileIterations::kColumn; ++tile_n) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn * 2; ++mma_n) {
+          // Only bits 0 and 1 of the index contribute to the MMA selection.
+          int mma_idx = (index & 1) + (index & 2) * Policy::MmaIterations::kCount / 2 +
+            (tile_n * Policy::TileIterations::kRow) * Policy::MmaIterations::kCount + (mma_n & 1) * 2;
+
+          int reg_offset = reg_row * kRegsPerMmaRow + (mma_n & 2) * 2;
+          int reg_idx = mma_idx * Policy::kElementsPerMma + reg_offset;
+          // reg_idx counts elements; dividing by kElementsPerAccess indexes whole accesses.
+          *frag_ptr = accumulators_[reg_idx / Policy::kElementsPerAccess];
+          ++frag_ptr;
+        }
+      }
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+} // namespace warp
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h
new file mode 100755
index 000000000..955409f32
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h
@@ -0,0 +1,164 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief This defines a "fragment" iterator for visiting the fragments of an accumulator tile
+      that participate in one warp-level store operation.
+
+      Typically, the accumulator tile is the largest single block of register-backed storage 
+      within the kernel. Storing it to memory is best accomplished by partitioning it into
+      smaller tiles and storing these sequentially.
+
+      Round trips through shared memory during the Epilogue phase require partitioning, as
+      shared memory capacity is typically insufficient for a threadblock's total accumulator
+      size.
+*/
+
+#pragma once
+
+#if !(defined(__clang__) && defined(__CUDA__))
+
+#include "cutlass/wmma_array.h"
+#include "cutlass/layout/matrix.h"
+
+#include "cutlass/epilogue/warp/wmma_tensor_op_policy.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace warp {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Fragment iterator over a warp-level WMMA accumulator tile (primary template, declared only).
+template <
+  typename WarpShape,         ///< shape of warp-level GEMM (concept: MatrixShape)
+  typename OperatorShape,     ///< matrix multiply operation shape (concept: gemm::GemmShape)
+  typename OperatorElementC,  ///< matrix multiply operation data type (concept: data type)
+  typename OperatorFragmentC, ///< matrix multiply operation fragment (concept: nvcuda::wmma::fragment)
+  typename Layout             ///< target shared memory layout
+>
+class FragmentIteratorWmmaTensorOp;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization targeting row-major shared memory
+template <
+  typename WarpShape_,         ///< shape of the warp-level GEMM tile
+  typename OperatorShape_,     ///< matrix multiply operation shape (concept: gemm::GemmShape)
+  typename OperatorElementC_,  ///< matrix multiply operation data type (concept: data type)
+  typename OperatorFragmentC_  ///< matrix multiply operation fragment (concept: nvcuda::wmma::fragment)
+>
+class FragmentIteratorWmmaTensorOp<WarpShape_, OperatorShape_, OperatorElementC_, OperatorFragmentC_, layout::RowMajor> {
+public:
+
+  using WarpShape = WarpShape_;
+  using OperatorShape = OperatorShape_;
+  using OperatorElementC = OperatorElementC_;
+  using OperatorFragmentC = OperatorFragmentC_;
+  using Layout = layout::RowMajor;
+
+  using Policy = WmmaTensorOpPolicy<WarpShape, OperatorShape, Layout>;
+
+  /// Fragment handed out by a single load(): OperatorCount::kColumn WMMA fragments.
+  using Fragment = WmmaFragmentArray<OperatorFragmentC, Policy::OperatorCount::kColumn>;
+
+  /// Entire warp-level accumulator tile: OperatorCount::kCount WMMA fragments.
+  using AccumulatorTile = WmmaFragmentArray<OperatorFragmentC, Policy::OperatorCount::kCount>;
+
+  using OutputAccumulatorTile = AccumulatorTile;
+
+private:
+
+  /// Unit in which the accumulator tile is read
+  using AccessType = WmmaFragmentArray<OperatorFragmentC, Policy::kWmmaFragmentsPerAccess>;
+
+private:
+
+  //
+  // State
+  //
+
+  /// Accumulator tile viewed as an array of AccessType
+  AccessType const *accumulators_;
+
+  /// Position of the iterator, moved by operator++ / operator--
+  int index_;
+
+public:
+
+  /// Binds the iterator to an accumulator tile and starts at position zero
+  CUTLASS_HOST_DEVICE
+  FragmentIteratorWmmaTensorOp(AccumulatorTile const &accum):
+    accumulators_(reinterpret_cast<AccessType const *>(&accum)),
+    index_(0) {
+  }
+
+  /// Pre-increment: moves to the next fragment position
+  CUTLASS_HOST_DEVICE
+  FragmentIteratorWmmaTensorOp &operator++() {
+    index_ += 1;
+    return *this;
+  }
+
+  /// Pre-decrement: moves back to the previous fragment position
+  CUTLASS_HOST_DEVICE
+  FragmentIteratorWmmaTensorOp &operator--() {
+    index_ -= 1;
+    return *this;
+  }
+
+  /// Copies the accesses addressed by the current position into frag (index_offset is not read)
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag, int index_offset = 0) const {
+    AccessType *dst = reinterpret_cast<AccessType *>(&frag);
+
+    // Offset of the first access in the group selected by index_
+    int const group_base = index_ * Policy::OperatorCount::kColumn;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int col = 0; col < Policy::OperatorCount::kColumn; ++col) {
+      dst[col] = accumulators_[group_base + col];
+    }
+  }
+};
+
+
+} // namespace warp
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
+
+#else
+#error (defined(__clang__) && defined(__CUDA__))
+#endif // !defined(__clang__)
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/simt_policy.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/simt_policy.h
new file mode 100755
index 000000000..b30bf19d6
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/simt_policy.h
@@ -0,0 +1,107 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines basic structures needed for implementing the warp-scoped phase of the epilogue.
+          These quantities assume a 'column-major' arrangement of SimtOp instructions, of which
+          a row-oriented slice is visible per iteration.
+*/
+
+#pragma once
+
+#include "cutlass/matrix_shape.h"
+#include "cutlass/layout/matrix.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace warp {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Policy for the warp-scoped epilogue phase of SIMT GEMMs (primary template, declared only).
+template <
+  typename WarpShape,            ///< shape of warp-level GEMM (concept: GemmShape)
+  typename Operator,             ///< matrix multiply operation (concept: arch::Mma)
+  typename Layout,               ///< destination layout in shared memory
+  typename MmaSimtPolicy         ///< policy defining lane arrangement (concept: MmaSimtPolicy)
+>
+struct SimtPolicy;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization of SimtPolicy for a row-major destination layout
+template <
+  typename WarpShape_,           ///< shape of warp-level GEMM (concept: GemmShape)
+  typename Operator_,            ///< matrix multiply operation (concept: arch::Mma)
+  typename MmaSimtPolicy_        ///< policy defining lane arrangement (concept: MmaSimtPolicy)
+>
+struct SimtPolicy<WarpShape_, Operator_, layout::RowMajor, MmaSimtPolicy_> {
+
+  using WarpShape = WarpShape_;
+  using Operator = Operator_;
+  using MmaSimtPolicy = MmaSimtPolicy_;
+
+  static_assert((WarpShape::kM % MmaSimtPolicy::WarpShape::kRow) == 0, "Divisibility");
+  static_assert((WarpShape::kN % MmaSimtPolicy::WarpShape::kColumn) == 0, "Divisibility");
+
+  /// Rows handled by one epilogue iteration
+  static const int kRowsPerIteration = MmaSimtPolicy::WarpShape::kRow;
+
+  /// Iteration count: warp tile rows divided by the rows handled per iteration
+  static const int kIterations = WarpShape::kM / kRowsPerIteration;
+
+  /// Consecutive elements covered by a single access
+  static const int kElementsPerAccess = MmaSimtPolicy::LaneMmaShape::kN;
+
+  /// Accumulators written in each iteration
+  static const int kElementsPerIteration = WarpShape::kN / MmaSimtPolicy::WarpShape::kColumn;
+
+  /// Accesses issued in each iteration
+  static const int kAccessesPerIteration = kElementsPerIteration / kElementsPerAccess;
+
+  /// Accumulators across all iterations
+  static const int kAccumulatorElementCount = kIterations * kElementsPerIteration;
+
+  /// Distance, in elements, between successive accumulator chunks of
+  /// (LaneMmaShape::kM x LaneMmaShape::kN)
+  using Delta = MatrixShape<
+    MmaSimtPolicy::LaneMmaShape::kM * MmaSimtPolicy::WarpShape::kRow,
+    MmaSimtPolicy::LaneMmaShape::kN * MmaSimtPolicy::WarpShape::kColumn
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tensor_op_policy.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tensor_op_policy.h
new file mode 100755
index 000000000..b3f3a4f59
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tensor_op_policy.h
@@ -0,0 +1,189 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines basic structures needed for implementing the warp-scoped phase of the epilogue.
+          These quantities assume a 'column-major' arrangement of TensorOp instructions, of which
+          a row-oriented slice is visible per iteration.
+*/
+
+#pragma once
+
+#include "cutlass/matrix_shape.h"
+#include "cutlass/layout/matrix.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace warp {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Policy details related to the epilogue (primary template, declared only)
+template <
+  typename WarpShape,     ///< shape of warp-level GEMM (concept: gemm::GemmShape)
+  typename OperatorShape, ///< matrix multiply operation shape (concept: gemm::GemmShape)
+  typename Layout         ///< target shared memory layout
+>
+struct TensorOpPolicy; 
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization of TensorOpPolicy for row-major shared memory
+template <
+  typename WarpShape,           ///< shape of warp-level GEMM (concept: gemm::GemmShape)
+  typename OperatorShape        ///< matrix multiply operation shape (concept: gemm::GemmShape)
+>
+struct TensorOpPolicy<WarpShape, OperatorShape, layout::RowMajor> {
+
+  /// Operator count along each dimension, rounded up to cover the whole warp tile
+  using OperatorCount = MatrixShape<
+    (WarpShape::kM - 1 + OperatorShape::kM) / OperatorShape::kM,
+    (WarpShape::kN - 1 + OperatorShape::kN) / OperatorShape::kN
+  >;
+
+  //
+  // Fixed constants describing Tensor Operations
+  //
+
+  static const int kElementsPerAccess = 2;
+  static const int kRowsPerIteration = 8;
+  static const bool kDivisible =
+    (WarpShape::kM % OperatorShape::kM == 0) && (WarpShape::kN % OperatorShape::kN == 0);
+
+  //
+  // Quantities derived from the constants above
+  //
+
+  // 'Externally visible' iterations contributed by each instruction
+  static const int kIterationsPerInstruction = OperatorShape::kM / kRowsPerIteration;
+
+  // Total externally visible iterations over all operator rows
+  static const int kIterations = OperatorCount::kRow * kIterationsPerInstruction;
+
+  using TileIterations = MatrixShape<kIterations, 1>;
+
+  static const int kAccumulatorRowStride = kElementsPerAccess;
+  static const int kAccumulatorColumnStride = kElementsPerAccess * kIterations;
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for column-major
+template <
+  typename WarpShape,           ///< shape of warp-level GEMM (concept: gemm::GemmShape)
+  typename OperatorShape        ///< matrix multiply operation shape (concept: gemm::GemmShape)
+>
+struct TensorOpPolicy<WarpShape, OperatorShape, layout::ColumnMajor> {
+
+  /// Number of operations along each dimension, rounded up
+  using OperatorCount = MatrixShape<
+    (WarpShape::kM + OperatorShape::kM - 1) / OperatorShape::kM,
+    (WarpShape::kN + OperatorShape::kN - 1) / OperatorShape::kN
+  >;
+
+  //
+  // Hard-coded constants regarding Tensor Operations
+  //
+
+  static int const kElementsPerAccess = 1;
+  static int const kColumnsPerIteration = 8;
+  static bool const kDivisible = 
+    !(WarpShape::kM % OperatorShape::kM) && !(WarpShape::kN % OperatorShape::kN);
+
+  //
+  // Derived quantities
+  //
+
+  // Number of 'externally visible' iterations per actual instruction (counted along N here)
+  static int const kIterationsPerInstruction = OperatorShape::kN / kColumnsPerIteration;
+
+  // Number of externally visible iterations across all operator columns
+  static int const kIterations = OperatorCount::kColumn * kIterationsPerInstruction;
+
+  using TileIterations = MatrixShape<kIterations, 1>;
+
+  // Strides hard-coded for a 16x8 operator shape; they are not derived from OperatorShape
+  static int const kAccumulatorRowStride = 2;
+  static int const kAccumulatorColumnStride = 4 * OperatorCount::kRow;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for column-major-interleaved
+template <
+    typename WarpShape,  ///< shape of warp-level GEMM (concept: gemm::GemmShape)
+    typename OperatorShape,   ///< matrix multiply operation shape (concept: gemm::GemmShape)
+    int InterleavedK     ///< number of interleaved k
+    >
+struct TensorOpPolicy<WarpShape, OperatorShape,
+                      layout::ColumnMajorInterleaved<InterleavedK> > {
+  /// Number of operations along each dimension (truncating division, no round-up)
+  using OperatorCount = MatrixShape<WarpShape::kM / OperatorShape::kM,
+                                    WarpShape::kN / OperatorShape::kN>;
+
+  //
+  // Hard-coded constants regarding Tensor Operations
+  //
+
+  static int const kElementsPerAccess = 2;
+  static int const kRowsPerIteration = 8;
+
+  //
+  // Derived quantities
+  //
+
+  // Number of 'externally visible' iterations per actual instruction
+  static int const kIterationsPerInstruction =
+      OperatorShape::kM / kRowsPerIteration;
+
+  // Number of externally visible iterations, counted over every InterleavedK-wide slice of N
+  static int const kIterations = WarpShape::kN / InterleavedK *
+                                 OperatorCount::kRow *
+                                 kIterationsPerInstruction;
+
+  static int const kElementsPerIteration = InterleavedK / OperatorShape::kN * kElementsPerAccess;
+
+  static int const kAccessPerIteration = kElementsPerIteration / kElementsPerAccess;
+
+  // Tile iterations are 1 x (WarpShape::kN / InterleavedK); unlike kIterations above, the row
+  // extent does not include OperatorCount::kRow * kIterationsPerInstruction.
+  using TileIterations = MatrixShape<1, WarpShape::kN / InterleavedK>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_simt.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_simt.h
new file mode 100755
index 000000000..0f470ff76
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_simt.h
@@ -0,0 +1,785 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
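+    \brief Defines warp-scoped tile iterators that SIMT epilogues use to read and write accumulator tiles in shared memory.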
+*/
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/pitch_linear.h"
+
+#include "cutlass/epilogue/warp/simt_policy.h"
+
+#define CUTLASS_SIMT_EPILOGUE_USE_SCALAR_STORES 1
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace warp {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Template for reading and writing tiles of accumulators to shared memory (primary template, declared only)
+template <
+  typename WarpShape,     ///< shape of warp-level GEMM (concept: GemmShape)
+  typename Operator,      ///< matrix multiply operation (concept: arch::Mma)
+  typename Element,       ///< data type of element to be written
+  typename Layout,        ///< target shared memory layout
+  typename MmaSimtPolicy          ///< policy defining lane arrangement (concept: MmaSimtPolicy)
+>
+class TileIteratorSimt;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization of TileIteratorSimt for row-major shared memory
+template <
+  typename WarpShape_,     ///< shape of warp-level GEMM (concept: GemmShape)
+  typename Operator_,      ///< matrix multiply operation (concept: arch::Mma)
+  typename Element_,       ///< data type of element to be written
+  typename MmaSimtPolicy_         ///< policy defining lane arrangement (concept: MmaSimtPolicy)
+>
+class TileIteratorSimt<WarpShape_, Operator_, Element_, layout::RowMajor, MmaSimtPolicy_> {
+public:
+
+  using WarpShape = WarpShape_;
+  using Operator = Operator_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+
+  using TensorRef = TensorRef<Element, Layout>;         ///< Tensor Reference object
+  using TensorCoord = MatrixCoord;                      ///< Logical coordinate in referenced tensor
+  using Index = typename TensorRef::Index;
+  using LongIndex = typename TensorRef::LongIndex;
+
+  using Policy = SimtPolicy<WarpShape, Operator, Layout, MmaSimtPolicy_>;
+
+  /// Shape of the tile in memory: Policy::kRowsPerIteration rows by WarpShape::kN columns
+  using Shape = MatrixShape<
+    Policy::kRowsPerIteration,
+    WarpShape::kN
+  >;
+
+  /// This is the fragment size produced by one access of the iterator.
+  using Fragment = Array<
+    typename Operator::ElementC, 
+    Policy::kElementsPerIteration>;
+
+  /// This is the complete warp-level accumulator tile.
+  using AccumulatorTile = Array<
+    typename Operator::ElementC, 
+    Policy::kAccumulatorElementCount>;
+
+  /// Number of times this iterator can be incremented
+  static int const kIterations = Policy::kIterations;
+
+  /// Padding quantity: no rows; 4 * kElementsPerAccess columns, plus one when scalar stores are enabled
+  using Padding = MatrixShape<
+    0,
+    4 * Policy::kElementsPerAccess
+#if CUTLASS_SIMT_EPILOGUE_USE_SCALAR_STORES
+    + 1
+#endif
+  >;
+
+private:
+
+#if CUTLASS_SIMT_EPILOGUE_USE_SCALAR_STORES
+  /// Storage type for accessing memory (one element per access when scalar stores are enabled)
+  using AccessType = AlignedArray<
+    Element, 
+    1
+  >;
+
+#else
+  /// Storage type for accessing memory (Policy::kElementsPerAccess elements per access)
+  using AccessType = AlignedArray<
+    Element, 
+    Policy::kElementsPerAccess
+  >;
+#endif
+
+  //
+  // Data members
+  //
+
+  /// Internal pointer to memory
+  AccessType *pointer_;
+
+  /// Internal layout object
+  Layout layout_;
+
+public:
+
+  /// Default constructor; leaves the pointer null
+  CUTLASS_HOST_DEVICE
+  TileIteratorSimt(): pointer_(nullptr) { }
+
+  /// Constructor from TensorRef; offsets the pointer by this lane's coordinate in the lane layout
+  CUTLASS_HOST_DEVICE
+  TileIteratorSimt(
+    TensorRef const &ref,
+    unsigned lane_id
+  ):
+    pointer_(reinterpret_cast<AccessType *>(ref.data())),
+    layout_(ref.stride()[0] / AccessType::kElements) { 
+
+    auto lane_layout = Policy::MmaSimtPolicy::get_lane_layout();
+    MatrixCoord lane_offset = lane_layout.inverse(lane_id);
+
+    pointer_ += layout_({
+      lane_offset.row(),
+      lane_offset.column() * Policy::kElementsPerAccess / int(AccessType::kElements)
+    });
+  }
+
+  /// Adds a pointer offset, converting it to AccessType units by dividing by AccessType::kElements
+  CUTLASS_HOST_DEVICE
+  TileIteratorSimt & add_pointer_offset(Index pointer_offset) {
+    pointer_ += pointer_offset / AccessType::kElements;
+    return *this;
+  }
+
+  ///< advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_HOST_DEVICE
+  TileIteratorSimt & add_tile_offset(TensorCoord const &tile_offset) {
+
+    pointer_ += layout_({
+      tile_offset.row() * Shape::kRow, 
+      (tile_offset.column() * Shape::kColumn / int(AccessType::kElements))
+    });
+
+    return *this;
+  }
+
+  ///< advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_HOST_DEVICE
+  TileIteratorSimt & operator+=(TensorCoord const &tile_offset) {
+
+    add_tile_offset(tile_offset);
+    
+    return *this;
+  }
+
+  /// Store the fragment at the current position plus pointer_offset
+  CUTLASS_HOST_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+#if CUTLASS_SIMT_EPILOGUE_USE_SCALAR_STORES
+      // de-vectorized stores: each access is written as kElementsPerAccess single-element stores
+      using ScalarAccessType = AlignedArray<Element, 1>;
+      ScalarAccessType const *scalarFragPtr = reinterpret_cast<ScalarAccessType const *>(&frag);
+      ScalarAccessType *scalarPointer = reinterpret_cast<ScalarAccessType *>(pointer_) + pointer_offset;
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = 0; n < Policy::kAccessesPerIteration; ++n) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int s = 0; s < Policy::kElementsPerAccess; s++) {
+          scalarPointer[n * Policy::MmaSimtPolicy::WarpShape::kColumn * Policy::kElementsPerAccess + s] = scalarFragPtr[n * Policy::kElementsPerAccess + s];
+        }
+      }
+#else
+    // original vector stores: one AccessType per access
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
+    CUTLASS_PRAGMA_UNROLL
+    for (int n = 0; n < Policy::kAccessesPerIteration; ++n) {
+      pointer_[n * Policy::MmaSimtPolicy::WarpShape::kColumn + pointer_offset / int(AccessType::kElements)] = frag_ptr[n];
+    }
+#endif
+  }
+
+  /// Store the fragment at the current position
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+
+  /// Load a fragment from the current position plus pointer_offset
+  CUTLASS_HOST_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
+    // NOTE(review): copies one AccessType per n and, unlike the scalar store path, has no per-element loop - confirm.
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int n = 0; n < Policy::kAccessesPerIteration; ++n) {
+      frag_ptr[n] = pointer_[n * Policy::MmaSimtPolicy::WarpShape::kColumn + pointer_offset / int(AccessType::kElements)];
+    }
+  }
+
+  /// Load a fragment from the current position
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Set smem base address (no-op in this iterator; the argument is ignored)
+  CUTLASS_HOST_DEVICE
+  void set_smem_base_address(Index address) {
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Template for reading and writing tiles of accumulators to shared memory
+template <typename WarpShape_,     ///< shape of warp-level GEMM (concept: GemmShape)
+          typename Operator_,      ///< matrix multiply operation (concept: arch::Mma)
+          typename Element_,       ///< data type of element to be written
+          typename Layout_,         ///< target shared memory layout
+          typename MmaSimtPolicy_  ///< policy defining lane arrangement (concept: MmaSimtPolicy)
+          >
+class TileIteratorSimtDirectConv {
+ public:
+
+  using WarpShape = WarpShape_;
+  using Operator = Operator_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+
+  using TensorRef = TensorRef<Element, Layout>;  ///< Tensor Reference object
+  using TensorCoord = MatrixCoord;               ///< Logical coordinate in referenced tensor
+  using Index = typename TensorRef::Index;
+  using LongIndex = typename TensorRef::LongIndex;
+
+  using Policy = SimtPolicy<WarpShape, Operator, Layout, MmaSimtPolicy_>;
+
+  /// Shape of the tile in memory
+  using Shape = MatrixShape<Policy::kRowsPerIteration, WarpShape::kN>;
+
+  /// This is the fragment size produced by one access of the iterator.
+  using Fragment = Array<typename Operator::ElementC, Policy::kElementsPerIteration>;
+
+  /// This is the complete warp-level accumulator tile.
+  using AccumulatorTile = Array<typename Operator::ElementC, Policy::kAccumulatorElementCount>;
+
+  /// Number of times this iterator can be incremented
+  static int const kIterations = Policy::kIterations;
+
+  /// Padding quantity
+  using Padding = MatrixShape<0,
+                              0
+                              >;
+
+private:
+  /// Storage type for accessing memory
+  using AccessType = AlignedArray<
+    Element, 
+    Policy::kElementsPerAccess
+  >;
+
+  //
+  // Data members
+  //
+
+  /// Internal pointer to memory
+  AccessType *pointer_;
+
+  /// Internal layout object
+  Layout layout_;
+
+  /// Base smem offset;
+  Index base_smem_address_;
+
+ public:
+  /// Default constructor
+  CUTLASS_HOST_DEVICE
+  TileIteratorSimtDirectConv() : pointer_(nullptr) {}
+
+  /// Constructor from TensorRef
+  CUTLASS_HOST_DEVICE
+  TileIteratorSimtDirectConv(
+    TensorRef const &ref,
+    unsigned lane_id
+  ):
+    pointer_(reinterpret_cast<AccessType *>(ref.data())),
+    layout_(ref.stride()[0] / AccessType::kElements) {
+
+    auto lane_layout = Policy::MmaSimtPolicy::get_lane_layout();
+    MatrixCoord lane_offset = lane_layout.inverse(lane_id);
+
+    pointer_ += layout_({
+      lane_offset.row(),
+      lane_offset.column() * Policy::kElementsPerAccess / int(AccessType::kElements)
+    });
+  }
+
+  /// Adds a pointer offset
+  CUTLASS_HOST_DEVICE
+  TileIteratorSimtDirectConv & add_pointer_offset(Index pointer_offset) {
+    pointer_ += pointer_offset / AccessType::kElements;
+    return *this;
+  }
+
+  ///< advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_HOST_DEVICE
+  TileIteratorSimtDirectConv & add_tile_offset(TensorCoord const &tile_offset) {
+
+    pointer_ += layout_({
+      tile_offset.row() * Shape::kRow, 
+      (tile_offset.column() * Shape::kColumn / int(AccessType::kElements))
+    });
+
+    return *this;
+  }
+
+  /// Advances in units of whole tiles along the logical coordinate space of the tensor
+  /// (operator spelling of add_tile_offset)
+  CUTLASS_HOST_DEVICE
+  TileIteratorSimtDirectConv & operator+=(TensorCoord const &tile_offset) {
+
+    // add_tile_offset() returns *this, so its result can be forwarded directly
+    return add_tile_offset(tile_offset);
+  }
+
+  /// Stores a fragment; pointer_offset is in elements, base_smem_address_ is applied as a byte offset
+  CUTLASS_HOST_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    // View the fragment as vector accesses and rebase the destination by the byte offset.
+    AccessType const *src = reinterpret_cast<AccessType const *>(&frag);
+    uint8_t *dst_bytes = reinterpret_cast<uint8_t *>(pointer_) + base_smem_address_;
+    AccessType *dst = reinterpret_cast<AccessType *>(dst_bytes);
+    CUTLASS_PRAGMA_UNROLL
+    for (int access = 0; access < Policy::kAccessesPerIteration; ++access) {
+      dst[access * Policy::MmaSimtPolicy::WarpShape::kColumn + pointer_offset / int(AccessType::kElements)] = src[access];
+    }
+  }
+
+  /// Stores a fragment at the iterator's current position (pointer offset of zero)
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag) {
+    this->store_with_pointer_offset(frag, Index(0));
+  }
+
+  /// Loads a fragment; pointer_offset is in elements. NOTE(review): unlike the store path,
+  /// base_smem_address_ is not applied here -- confirm that this asymmetry is intended.
+  CUTLASS_HOST_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
+    AccessType *dst = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int access = 0; access < Policy::kAccessesPerIteration; ++access) {
+      dst[access] = pointer_[access * Policy::MmaSimtPolicy::WarpShape::kColumn + pointer_offset / int(AccessType::kElements)];
+    }
+  }
+
+  /// Loads a fragment from the iterator's current position (pointer offset of zero)
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    this->load_with_pointer_offset(frag, Index(0));
+  }
+
+  /// Sets the byte offset that store_with_pointer_offset() adds to the destination pointer
+  CUTLASS_HOST_DEVICE
+  void set_smem_base_address(Index address){
+    this->base_smem_address_ = address;
+  }
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Template for writing tiles of accumulators to shared memory (only store operations are provided)
+template <typename WarpShape_,               ///< shape of warp-level GEMM (concept: GemmShape)
+          typename ThreadOutputShape_,       ///< output tile handled by one thread (concept: TensorNHWC)
+          typename ThreadBlockOutputShape_,  ///< output tile handled by the threadblock (concept: TensorNHWC)
+          typename Operator_,                ///< matrix multiply operation (concept: arch::Mma)
+          typename Element_,                 ///< data type of element to be written
+          typename Layout_,                  ///< target shared memory layout (not consulted: Layout is RowMajor)
+          typename MmaSimtPolicy_            ///< policy defining lane arrangement (concept: MmaSimtPolicy)
+          >
+class TileIteratorSimtDirect2dConv {
+ public:
+  using WarpShape = WarpShape_;
+  using ThreadOutputShape = ThreadOutputShape_;
+  using ThreadBlockOutputShape = ThreadBlockOutputShape_;
+  using Operator = Operator_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;  // fixed; the Layout_ template argument is ignored
+  using MmaSimtPolicy = MmaSimtPolicy_;
+
+  using TensorRef = TensorRef<Element, Layout>;  ///< Tensor Reference object
+  using TensorCoord = MatrixCoord;               ///< Logical coordinate in referenced tensor
+  using Index = typename TensorRef::Index;
+  using LongIndex = typename TensorRef::LongIndex;
+
+  // Thread-level shape of a fragment: kNHW rows by kC columns
+  using ThreadShape = MatrixShape<ThreadOutputShape::kNHW, ThreadOutputShape::kC>;
+
+  static_assert(!(ThreadShape::kColumn % MmaSimtPolicy::LaneMmaShape::kN),
+                "Thread-level GEMM must be divisible by Policy::LaneMmaShape.");
+
+  using ThreadTileCount = MatrixShape<ThreadBlockOutputShape::kH / ThreadOutputShape::kH,
+                                      ThreadBlockOutputShape::kW / ThreadOutputShape::kW>;
+
+  using Iterations =
+      MatrixShape<ThreadShape::kRow, ThreadShape::kColumn / MmaSimtPolicy::LaneMmaShape::kN>;
+
+  /// This is the complete warp-level accumulator tile.
+  using AccumulatorTile = typename Operator::FragmentC;
+
+  /// The fragment stored by one call is the whole accumulator tile.
+  using Fragment = AccumulatorTile;
+
+  /// Padding quantity
+  using Padding = MatrixShape<0, 0>;
+
+ private:
+  // Storage type for accessing memory: one vector of LaneMmaShape::kN elements
+  using AccessType = AlignedArray<Element, MmaSimtPolicy::LaneMmaShape::kN>;
+  //
+  // Data members
+  //
+
+  /// Internal pointer to memory
+  AccessType *pointer_;
+
+  /// Internal layout object
+  Layout layout_;
+
+  /// Base smem offset in bytes (added to pointer_ through a uint8_t cast when storing)
+  Index base_smem_address_;
+
+ public:
+  /// Default constructor: null pointer and zero base offset
+  CUTLASS_HOST_DEVICE
+  TileIteratorSimtDirect2dConv() : pointer_(nullptr), base_smem_address_(0) {}
+
+  /// Constructor from TensorRef; positions the iterator at the calling thread's first access
+  CUTLASS_HOST_DEVICE
+  TileIteratorSimtDirect2dConv(TensorRef const &ref, unsigned thread_id, unsigned lane_id)
+      : pointer_(reinterpret_cast<AccessType *>(ref.data())),
+        layout_(ref.stride()[0] / AccessType::kElements),
+        base_smem_address_(0) {  // was left indeterminate although store_with_pointer_offset() reads it
+    auto lane_layout = MmaSimtPolicy::get_lane_layout();
+
+    MatrixCoord lane_offset = lane_layout.inverse(lane_id);
+
+    // Get base HW offset of current threads
+    const int threadgroup = thread_id / (ThreadBlockOutputShape::kC / ThreadOutputShape::kC);
+    const int base_p = (threadgroup / (ThreadTileCount::kColumn)) * ThreadOutputShape::kH;
+    const int base_q = (threadgroup % (ThreadTileCount::kColumn)) * ThreadOutputShape::kW;
+
+    const int row_offset = base_p * ThreadBlockOutputShape::kW + base_q;
+
+    pointer_ += layout_(
+        {row_offset,
+         lane_offset.column() * MmaSimtPolicy::LaneMmaShape::kN / int(AccessType::kElements)});
+  }
+
+  /// Adds a pointer offset given in elements (it is converted to whole AccessType steps)
+  CUTLASS_HOST_DEVICE
+  TileIteratorSimtDirect2dConv &add_pointer_offset(Index pointer_offset) {
+    pointer_ += pointer_offset / int(AccessType::kElements);  // int cast as in the constructor and store path
+    return *this;
+  }
+
+  /// Stores the fragment over the thread's kH x kW output positions; pointer_offset is in elements
+  CUTLASS_HOST_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    AccessType *storer_pointer_ =
+        reinterpret_cast<AccessType *>(reinterpret_cast<uint8_t *>(pointer_) + base_smem_address_);
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int h = 0; h < ThreadOutputShape::kH; ++h) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int w = 0; w < ThreadOutputShape::kW; ++w) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int col = 0; col < Iterations::kColumn; ++col) {
+          int offset = (w + h * ThreadBlockOutputShape::kW) *
+                           (ThreadBlockOutputShape::kC / AccessType::kElements) +
+                       col;  // destination index in AccessType units
+          storer_pointer_[offset + pointer_offset / int(AccessType::kElements)] =
+              frag_ptr[w + h * ThreadOutputShape::kW + col];  // NOTE(review): (w, col) pairs alias if Iterations::kColumn > 1 -- confirm
+        }
+      }
+    }
+  }
+
+  /// Stores the fragment with a pointer offset of zero
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
+
+  /// Set smem base address (the byte offset applied by store_with_pointer_offset)
+  CUTLASS_HOST_DEVICE
+  void set_smem_base_address(Index address) { base_smem_address_ = address; }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Template for reading and writing tiles of accumulators to shared memory, one element per access, with optional bounds guards
+template <
+  typename WarpShape_,        ///< shape of warp-level GEMM (concept: GemmShape)
+  typename Operator_,         ///< matrix multiply operation (concept: arch::Mma)
+  typename Element_,          ///< data type of element to be written
+  typename Layout_,           ///< target shared memory layout
+  typename MmaSimtPolicy_     ///< policy defining lane arrangement (concept: MmaSimtPolicy)
+>
+class TileIteratorSimtCanonical {
+public:
+
+  using WarpShape = WarpShape_;
+  using Operator = Operator_;
+  using Element = Element_;
+  using Layout = Layout_;
+
+  using TensorRef = TensorRef<Element, Layout>;         ///< Tensor Reference object
+  using TensorCoord = MatrixCoord;                      ///< Logical coordinate in referenced tensor
+  using Index = typename TensorRef::Index;
+  using LongIndex = typename TensorRef::LongIndex;
+
+  using Policy = SimtPolicy<WarpShape, Operator, Layout, MmaSimtPolicy_>;
+
+  /// Shape of the tile in memory
+  using Shape = MatrixShape<
+    Policy::kRowsPerIteration,
+    WarpShape::kN
+  >;
+
+  /// This is the fragment size produced by one access of the iterator.
+  using Fragment = Array<
+    typename Operator::ElementC,
+    Policy::kElementsPerIteration>;
+
+  /// This is the complete warp-level accumulator tile.
+  using AccumulatorTile = Array<
+    typename Operator::ElementC,
+    Policy::kAccumulatorElementCount>;
+
+  /// Number of times this iterator can be incremented
+  static int const kIterations = Policy::kIterations;
+
+  /// Padding quantity
+  using Padding = MatrixShape<
+    0,
+    4 * Policy::kElementsPerAccess + 1
+  >;
+
+private:
+
+  /// Storage type for accessing memory (a single element per access)
+  using AccessType = AlignedArray<
+    Element,
+    1
+  >;
+
+  //
+  // Data members
+  //
+
+  /// Internal pointer to memory
+  AccessType *pointer_;
+
+  /// Internal layout object
+  Layout layout_;
+
+  /// Guard to indicate whether the shape is divisible (true skips the extent checks)
+  bool divisible_;
+
+  /// Extent of the output tensor
+  MatrixCoord extent_;
+
+  /// Thread offset
+  MatrixCoord thread_offset_;
+
+public:
+
+  /// Default constructor. divisible_ is set so it is never read while indeterminate; false keeps the extent guards active.
+  CUTLASS_HOST_DEVICE
+  TileIteratorSimtCanonical(): pointer_(nullptr), divisible_(false) { }
+
+  /// Constructor from TensorRef; the extent is the full warp tile and accesses are unguarded
+  CUTLASS_HOST_DEVICE
+  TileIteratorSimtCanonical(
+    TensorRef const &ref,
+    unsigned lane_id
+  ):
+    pointer_(reinterpret_cast<AccessType *>(ref.data())),
+    layout_(ref.stride()[0] / AccessType::kElements),
+    divisible_(true),
+    extent_(WarpShape::kM, WarpShape::kN) {
+
+    auto lane_layout = Policy::MmaSimtPolicy::get_lane_layout();
+    MatrixCoord lane_offset = lane_layout.inverse(lane_id);
+
+    thread_offset_ = {
+      lane_offset.row() * Shape::kRow,
+      lane_offset.column() * Policy::kElementsPerAccess
+    };
+
+    pointer_ += layout_({
+      lane_offset.row() * Shape::kRow,
+      lane_offset.column() * Policy::kElementsPerAccess / int(AccessType::kElements)
+    });
+  }
+
+  /// Constructor from TensorRef and an explicit extent; every access is checked against that extent
+  CUTLASS_HOST_DEVICE
+  TileIteratorSimtCanonical(
+    TensorRef const &ref,
+    TensorCoord const &extent,
+    unsigned lane_id
+  ):
+    pointer_(reinterpret_cast<AccessType *>(ref.data())),
+    layout_(ref.stride()[0] / AccessType::kElements),
+    divisible_(false),
+    extent_(extent) {
+
+    auto lane_layout = Policy::MmaSimtPolicy::get_lane_layout();
+    MatrixCoord lane_offset = lane_layout.inverse(lane_id);
+
+    thread_offset_ = {
+      lane_offset.row() * Shape::kRow,
+      lane_offset.column() * Policy::kElementsPerAccess
+    };
+
+    pointer_ += layout_({
+      lane_offset.row() * Shape::kRow,
+      lane_offset.column() * Policy::kElementsPerAccess / int(AccessType::kElements)
+    });
+  }
+
+  /// Adds a pointer offset given in elements
+  CUTLASS_HOST_DEVICE
+  TileIteratorSimtCanonical & add_pointer_offset(Index pointer_offset) {
+    pointer_ += pointer_offset / int(AccessType::kElements);  // int cast as in the constructors: keeps the division signed
+    return *this;
+  }
+
+  /// Advances along the logical coordinate space of the tensor
+  CUTLASS_HOST_DEVICE
+  TileIteratorSimtCanonical & add_tile_offset(TensorCoord const &tile_offset) {
+    // Rows advance by tile_offset.row() as given; only the column is scaled by Shape::kColumn.
+    MatrixCoord coord_offset(
+      tile_offset.row(),
+      tile_offset.column() * Shape::kColumn
+    );
+
+    thread_offset_ += coord_offset;
+
+    pointer_ += layout_({
+      coord_offset.row(),
+      coord_offset.column()
+    });
+
+    return *this;
+  }
+
+  /// Advances along the logical coordinate space of the tensor (operator spelling of add_tile_offset)
+  CUTLASS_HOST_DEVICE
+  TileIteratorSimtCanonical & operator+=(TensorCoord const &tile_offset) {
+
+    add_tile_offset(tile_offset);
+
+    return *this;
+  }
+
+  /// Stores a fragment element by element; pointer_offset is in elements
+  CUTLASS_HOST_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+
+    // de-vectorized stores
+    using ScalarAccessType = AlignedArray<Element, 1>;
+    ScalarAccessType const *scalarFragPtr = reinterpret_cast<ScalarAccessType const *>(&frag);
+    ScalarAccessType *scalarPointer = reinterpret_cast<ScalarAccessType *>(pointer_) + pointer_offset;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int n = 0; n < Policy::kAccessesPerIteration; ++n) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int s = 0; s < Policy::kElementsPerAccess; s++) {
+        // ptr_idx strides over the lanes of a row; frag_idx is dense within the fragment
+        int ptr_idx = n * Policy::MmaSimtPolicy::WarpShape::kColumn * Policy::kElementsPerAccess + s;
+        int frag_idx = n * Policy::kElementsPerAccess + s;
+
+        int col = thread_offset_.column() + ptr_idx;
+        // Skip elements outside extent_ unless the tile is known to be divisible
+        if (divisible_ || (thread_offset_.row() < extent_.row() && col < extent_.column())) {
+          scalarPointer[ptr_idx] = scalarFragPtr[frag_idx];
+        }
+      }
+    }
+  }
+
+  /// Stores a fragment with a pointer offset of zero
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+
+  /// Loads a fragment element by element; guarded-out elements of frag are left untouched
+  CUTLASS_HOST_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
+
+      // de-vectorized loads
+      using ScalarAccessType = AlignedArray<Element, 1>;
+      ScalarAccessType *scalarFragPtr = reinterpret_cast<ScalarAccessType *>(&frag);
+      ScalarAccessType const *scalarPointer = reinterpret_cast<ScalarAccessType const*>(pointer_) + pointer_offset;
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = 0; n < Policy::kAccessesPerIteration; ++n) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int s = 0; s < Policy::kElementsPerAccess; s++) {
+          // Same index mapping as store_with_pointer_offset()
+          int ptr_idx = n * Policy::MmaSimtPolicy::WarpShape::kColumn * Policy::kElementsPerAccess + s;
+          int frag_idx = n * Policy::kElementsPerAccess + s;
+
+          int col = thread_offset_.column() + ptr_idx;
+
+          if (divisible_ || (thread_offset_.row() < extent_.row() && col < extent_.column())) {
+            scalarFragPtr[frag_idx] = scalarPointer[ptr_idx];
+          }
+        }
+      }
+  }
+
+  /// Loads a fragment with a pointer offset of zero
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  CUTLASS_HOST_DEVICE
+  TileIteratorSimtCanonical & operator++() {
+    return add_tile_offset({1, 0});  // advance by one step in the row dimension
+  }
+
+  /// Set smem base address (no-op: this iterator keeps no base address)
+  CUTLASS_HOST_DEVICE
+  void set_smem_base_address(Index address) {
+  }
+};
+
+
+} // namespace warp
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_tensor_op.h
new file mode 100755
index 000000000..0bef03106
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_tensor_op.h
@@ -0,0 +1,671 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Warp-level tile iterators for reading and writing tiles of accumulators to shared memory.
+*/
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/pitch_linear.h"
+
+#include "cutlass/epilogue/warp/tensor_op_policy.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace warp {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Primary template (declaration only) for reading and writing tiles of accumulators to shared memory; specialized per Layout below
+template <
+  typename WarpShape,     ///< shape of warp-level GEMM (concept: GemmShape; the specializations read kM / kN)
+  typename OperatorShape, ///< matrix multiply operation shape (concept: gemm::GemmShape)
+  typename Element,       ///< data type of element to be written
+  typename Layout         ///< target shared memory layout
+>
+class TileIteratorTensorOp;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Row-major specialization: reads and writes tiles of accumulators in shared memory using vector accesses
+template <
+  typename WarpShape_,     ///< shape of warp-level GEMM (concept: GemmShape)
+  typename OperatorShape_, ///< matrix multiply operation shape (concept: gemm::GemmShape)
+  typename Element_        ///< data type of element to be written
+>
+class TileIteratorTensorOp<WarpShape_, OperatorShape_, Element_, layout::RowMajor> {
+public:
+
+  using WarpShape = WarpShape_;
+  using OperatorShape = OperatorShape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+
+  using TensorLayout = Layout;
+  using TensorRef = TensorRef<Element, Layout>;         ///< Tensor Reference object
+  using TensorCoord = MatrixCoord;                      ///< Logical coordinate in referenced tensor
+  using Index = typename TensorRef::Index;
+  using LongIndex = typename TensorRef::LongIndex;
+
+  using Policy = TensorOpPolicy<WarpShape, OperatorShape, Layout>;
+
+  /// Shape of the tile in memory:
+  ///   rows    - Policy::kRowsPerIteration
+  ///   columns - WarpShape::kN
+  using Shape = MatrixShape<Policy::kRowsPerIteration, WarpShape::kN>;
+
+
+  /// This is the fragment size produced by one access of the iterator:
+  /// one vector of Policy::kElementsPerAccess elements per operator column.
+  using Fragment = Array<Element, Policy::OperatorCount::kColumn * Policy::kElementsPerAccess>;
+
+
+  // No AccumulatorTile alias is defined here: this specialization has no
+  // Operator template parameter from which to take a FragmentC type.
+
+  /// Number of times this iterator can be incremented
+  static int const kIterations = Policy::kIterations;
+
+  /// Tile iteration counts, taken from the policy
+  using TileIterations = typename Policy::TileIterations;
+
+  // Internal constants
+  struct Detail {
+    static int const kLanesInQuad = 4;
+  };
+
+  /// Padding quantity: no extra rows, one quad's worth of accesses in columns
+  using Padding = MatrixShape<
+    0,
+    Detail::kLanesInQuad * Policy::kElementsPerAccess>;
+
+private:
+
+  /// Storage type for accessing memory
+  using AccessType = AlignedArray<Element, Policy::kElementsPerAccess>;
+
+  //
+  // Data members
+  //
+
+  /// Internal pointer to memory
+  AccessType *pointer_;
+
+  /// Internal layout object (constructed with the stride expressed in accesses)
+  Layout layout_;
+
+  /// Thread offset (row, column) in elements
+  MatrixCoord thread_offset_;
+
+public:
+
+  /// Default constructor: the iterator refers to no memory
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOp(): pointer_(nullptr) { }
+
+  /// Constructs the iterator over `ref` and moves it to the calling lane's first access
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOp(
+    TensorRef const &ref,
+    unsigned lane_id
+  ):
+    pointer_(reinterpret_cast<AccessType *>(ref.data())),
+    layout_(ref.stride()[0] / Policy::kElementsPerAccess) {
+
+    // Row is the lane's quad index; the column comes from the lane's slot within that quad.
+    int row = (lane_id / Detail::kLanesInQuad);
+    int slot = (lane_id % Detail::kLanesInQuad);
+
+    thread_offset_ = {row, slot * Policy::kElementsPerAccess};
+
+    // layout_ works in accesses, so the element column is scaled back down.
+    pointer_ += layout_({thread_offset_.row(), thread_offset_.column() / Policy::kElementsPerAccess});
+  }
+
+  /// Adds a pointer offset given in elements (converted to whole accesses)
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOp & add_pointer_offset(Index pointer_offset) {
+    pointer_ += pointer_offset / Policy::kElementsPerAccess;
+    return *this;
+  }
+
+  /// Advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOp & add_tile_offset(TensorCoord const &tile_offset) {
+
+    // Tile counts converted to an element-space displacement.
+    MatrixCoord delta(
+      tile_offset.row() * Shape::kRow,
+      tile_offset.column() * Shape::kColumn
+    );
+
+    thread_offset_ += delta;
+
+    // The pointer moves in accesses, hence the column division.
+    pointer_ += layout_({delta.row(), delta.column() / Policy::kElementsPerAccess});
+
+
+    return *this;
+  }
+
+  /// Advances in units of whole tiles (operator spelling of add_tile_offset)
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOp & operator+=(TensorCoord const &tile_offset) {
+    // add_tile_offset() returns *this
+    return add_tile_offset(tile_offset);
+  }
+
+  /// Stores a fragment; pointer_offset is in elements and is converted to whole accesses
+  CUTLASS_HOST_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    AccessType const *src = reinterpret_cast<AccessType const *>(&frag);
+    auto const access_offset = pointer_offset / Policy::kElementsPerAccess;  // loop-invariant
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int op = 0; op < Policy::OperatorCount::kColumn; ++op) {
+      pointer_[op * Detail::kLanesInQuad + access_offset] = src[op];
+    }
+  }
+
+  /// Stores a fragment with a pointer offset of zero
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag) {
+    this->store_with_pointer_offset(frag, 0);
+  }
+
+  /// Loads a fragment; pointer_offset is in elements and is converted to whole accesses
+  CUTLASS_HOST_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
+    AccessType *dst = reinterpret_cast<AccessType *>(&frag);
+    auto const access_offset = pointer_offset / Policy::kElementsPerAccess;  // loop-invariant
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int op = 0; op < Policy::OperatorCount::kColumn; ++op) {
+      dst[op] = pointer_[op * Detail::kLanesInQuad + access_offset];
+    }
+  }
+
+  /// Loads a fragment with a pointer offset of zero
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    this->load_with_pointer_offset(frag, 0);
+  }
+
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOp & operator++() {
+    return add_tile_offset({1, 0});  // one tile further along the row dimension
+  }
+
+  /// Set smem base address (no-op: this iterator keeps no base address)
+  CUTLASS_HOST_DEVICE
+  void set_smem_base_address(Index address) {
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Column-major-interleaved specialization: reads and writes tiles of accumulators in shared memory
+template <
+  typename WarpShape_,     ///< shape of warp-level GEMM (concept: GemmShape)
+  typename OperatorShape_, ///< matrix multiply operation shape (concept: gemm::GemmShape)
+  typename Element_,       ///< data type of element to be written
+  int InterleavedK         ///< number of interleaved k
+>
+class TileIteratorTensorOp<WarpShape_, OperatorShape_, Element_,
+                            layout::ColumnMajorInterleaved<InterleavedK> > {
+public:
+
+  using WarpShape = WarpShape_;
+  using OperatorShape = OperatorShape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajorInterleaved<InterleavedK>;
+  using TensorLayout = Layout;                ///< shared memory tensor ref layout
+
+  using TensorRef = TensorRef<Element, TensorLayout>;         ///< Tensor Reference object
+  using TensorCoord = MatrixCoord;                      ///< Logical coordinate in referenced tensor
+  using Index = typename TensorRef::Index;
+  using LongIndex = typename TensorRef::LongIndex;
+
+  using Policy = TensorOpPolicy<WarpShape, OperatorShape, Layout>;
+
+  /// Shape of the tile in memory
+  using Shape = MatrixShape<
+    // rows span the whole warp tile (WarpShape::kM), not Policy::kRowsPerIteration
+    WarpShape::kM,
+    InterleavedK
+  >;
+
+  /// This is the fragment size produced by one tile
+  using Fragment = Array<
+    Element,
+    Policy::OperatorCount::kRow * Policy::kIterationsPerInstruction
+        * Policy::kElementsPerIteration>;
+
+  // Note: the fragment above covers every row-iteration of a tile,
+  // i.e. (OperatorCount::kRow * kIterationsPerInstruction) groups of
+  // Policy::kElementsPerIteration elements, not a single iteration.
+
+  // No AccumulatorTile alias is defined here: this specialization has no
+  // Operator template parameter from which to take a FragmentC type.
+
+  /// Tile iteration counts, taken from the policy
+  using TileIterations = typename Policy::TileIterations;
+
+  // Internal constants
+  struct Detail {
+    static int const kLanesInQuad = 4;
+  };
+
+  /// Padding quantity: no extra rows, one quad's worth of iterations in columns
+  using Padding = MatrixShape<
+    0,
+    Detail::kLanesInQuad * Policy::kElementsPerIteration>;
+
+private:
+
+  /// Storage type for accessing memory
+  using AccessType = AlignedArray<Element, Policy::kElementsPerAccess>;
+
+  //
+  // Data members
+  //
+
+  /// Internal pointer to memory
+  AccessType *pointer_;
+
+  /// Internal layout object (results are divided by kElementsPerAccess before use)
+  TensorLayout layout_;
+
+  /// Thread offset (row, column) in elements
+  MatrixCoord thread_offset_;
+
+public:
+
+  /// Default constructor: the iterator refers to no memory
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOp(): pointer_(nullptr) { }
+
+  /// Constructs the iterator over `ref` and moves it to the calling lane's first access
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOp(
+    TensorRef const &ref,
+    unsigned lane_id
+  ):
+    pointer_(reinterpret_cast<AccessType *>(ref.data())),
+    layout_(ref.stride()[0]) {
+
+    // Row is the lane's quad index; the column comes from the lane's slot within that quad.
+    int row = (lane_id / Detail::kLanesInQuad);
+    int slot = (lane_id % Detail::kLanesInQuad);
+
+    thread_offset_ = {row, slot * Policy::kElementsPerIteration};
+
+    // The layout result is divided down to whole accesses before moving the pointer.
+    pointer_ += (layout_({thread_offset_.row(), thread_offset_.column()}) / Policy::kElementsPerAccess);
+  }
+
+  /// Adds a pointer offset given in elements (converted to whole accesses)
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOp & add_pointer_offset(Index pointer_offset) {
+    pointer_ += pointer_offset / Policy::kElementsPerAccess;
+    return *this;
+  }
+
+  /// Advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOp & add_tile_offset(TensorCoord const &tile_offset) {
+
+    // Tile counts converted to an element-space displacement.
+    MatrixCoord delta(
+      tile_offset.row() * Shape::kRow,
+      tile_offset.column() * Shape::kColumn
+    );
+
+    thread_offset_ += delta;
+
+    // The layout result is divided down to whole accesses before moving the pointer.
+    pointer_ += (layout_({delta.row(), delta.column()}) / Policy::kElementsPerAccess);
+
+
+    return *this;
+  }
+
+  /// Advances in units of whole tiles (operator spelling of add_tile_offset)
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOp & operator+=(TensorCoord const &tile_offset) {
+    // add_tile_offset() returns *this
+    return add_tile_offset(tile_offset);
+  }
+
+  /// Stores a fragment one row-iteration at a time; pointer_offset is in elements
+  CUTLASS_HOST_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+
+    AccessType const *src = reinterpret_cast<AccessType const *>(&frag);
+    // Loop-invariant: the extra offset expressed in whole accesses.
+    auto const access_offset = pointer_offset / Policy::kElementsPerAccess;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int iter = 0; iter < Policy::OperatorCount::kRow * Policy::kIterationsPerInstruction; iter++ ) {
+
+      // Start of the rows handled by this iteration, in units of AccessType.
+      AccessType *row_base = pointer_ + layout_({iter * Policy::kRowsPerIteration, 0}) / Policy::kElementsPerAccess;
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int acc = 0; acc < Policy::kAccessPerIteration; ++acc) {
+        row_base[acc + access_offset] = src[iter * Policy::kAccessPerIteration + acc];
+      }
+    }
+  }
+
+  /// Stores a fragment with a pointer offset of zero
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag) {
+    this->store_with_pointer_offset(frag, 0);
+  }
+
+  /// Loads a fragment one row-iteration at a time; pointer_offset is in elements
+  CUTLASS_HOST_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
+    AccessType *dst = reinterpret_cast<AccessType *>(&frag);
+    auto const access_offset = pointer_offset / Policy::kElementsPerAccess;  // loop-invariant
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int iter = 0; iter < Policy::OperatorCount::kRow * Policy::kIterationsPerInstruction; iter++ ) {
+      // Start of the rows handled by this iteration, in units of AccessType.
+      AccessType *row_base = pointer_ + layout_({iter * Policy::kRowsPerIteration, 0}) / Policy::kElementsPerAccess;
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int acc = 0; acc < Policy::kAccessPerIteration; ++acc) {
+        dst[iter * Policy::kAccessPerIteration + acc] = row_base[acc + access_offset];
+      }
+    }
+  }
+
+  /// Loads a fragment with a pointer offset of zero
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    this->load_with_pointer_offset(frag, 0);
+  }
+
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOp & operator++() {
+    return add_tile_offset({0, 1});  // one tile further along the column dimension
+  }
+
+  /// Set smem base address (no-op: this iterator keeps no base address)
+  CUTLASS_HOST_DEVICE
+  void set_smem_base_address(Index address) {
+  }
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Template for reading and writing tiles of accumulators to shared memory, one element per access, with optional bounds guards
+template <
+  typename WarpShape_,     ///< shape of warp-level GEMM (concept: GemmShape)
+  typename OperatorShape_, ///< matrix multiply operation shape (concept: gemm::GemmShape)
+  typename Element_,       ///< data type of element to be written
+  typename Layout_         ///< target shared memory layout
+>
+class TileIteratorTensorOpCanonical {
+public:
+
+  using WarpShape = WarpShape_;
+  using OperatorShape = OperatorShape_;
+  using Element = Element_;
+  using Layout = Layout_;
+
+  using TensorRef = TensorRef<Element, Layout>;         ///< Tensor Reference object
+  using TensorCoord = MatrixCoord;                      ///< Logical coordinate in referenced tensor
+  using Index = typename TensorRef::Index;
+  using LongIndex = typename TensorRef::LongIndex;
+
+  using Policy = TensorOpPolicy<WarpShape, OperatorShape, Layout>;
+
+  static int const kAccessSize = 1;                                          ///< elements per memory access
+  static int const kAccessCount = Policy::kElementsPerAccess / kAccessSize;  ///< scalar accesses per policy access
+
+  /// Shape of the tile in memory
+  using Shape = MatrixShape<
+    Policy::kRowsPerIteration,
+    WarpShape::kN
+  >;
+
+  /// This is the fragment size produced by one access of the iterator.
+  using Fragment = Array<
+    Element,
+    Policy::OperatorCount::kColumn * Policy::kElementsPerAccess>;
+
+  // No AccumulatorTile alias is defined here: the class has no Operator
+  // template parameter from which to take a FragmentC type.
+
+  /// Number of times this iterator can be incremented
+  static int const kIterations = Policy::kIterations;
+
+  // Internal constants
+  struct Detail {
+    static int const kLanesInQuad = 4;
+  };
+
+  /// Padding quantity
+  using Padding = MatrixShape<
+    0,
+    Detail::kLanesInQuad * Policy::kElementsPerAccess>;
+
+private:
+
+  /// Storage type for accessing memory (a single element per access)
+  using AccessType = AlignedArray<Element, kAccessSize>;
+
+  //
+  // Data members
+  //
+
+  /// Internal pointer to memory
+  AccessType *pointer_;
+
+  /// Internal layout object
+  Layout layout_;
+
+  /// Guard to indicate whether the shape is divisible (true skips the extent checks)
+  bool divisible_;
+
+  /// Extent of the output tensor
+  MatrixCoord extent_;
+
+  /// Thread offset
+  MatrixCoord thread_offset_;
+
+public:
+
+  /// Default constructor. divisible_ is set so it is never read while indeterminate; false keeps the extent guards active.
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOpCanonical(): pointer_(nullptr), divisible_(false) { }
+
+  /// Constructor from TensorRef; the extent is the full warp tile and accesses are unguarded
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOpCanonical(
+    TensorRef const &ref,
+    unsigned lane_id
+  ):
+    pointer_(reinterpret_cast<AccessType *>(ref.data())),
+    layout_(ref.stride()[0]),
+    divisible_(true),
+    extent_(WarpShape::kM, WarpShape::kN) {
+
+    int quad_id = (lane_id / Detail::kLanesInQuad);
+    int lane_in_quad = (lane_id % Detail::kLanesInQuad);
+
+    thread_offset_ = {
+      quad_id, lane_in_quad * Policy::kElementsPerAccess
+    };
+
+    pointer_ += layout_({thread_offset_.row(), thread_offset_.column()});
+  }
+
+  /// Constructor from TensorRef and an explicit extent; every access is checked against that extent
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOpCanonical(
+    TensorRef const &ref,
+    TensorCoord const &extent,
+    unsigned lane_id
+  ):
+    pointer_(reinterpret_cast<AccessType *>(ref.data())),
+    layout_(ref.stride()[0]),
+    divisible_(false),
+    extent_(extent) {
+
+    int quad_id = (lane_id / Detail::kLanesInQuad);
+    int lane_in_quad = (lane_id % Detail::kLanesInQuad);
+
+    thread_offset_ = {
+      quad_id, lane_in_quad * Policy::kElementsPerAccess
+    };
+
+    pointer_ += layout_({thread_offset_.row(), thread_offset_.column()});
+  }
+
+  /// Adds a pointer offset given in elements (AccessType holds exactly one element)
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOpCanonical & add_pointer_offset(Index pointer_offset) {
+    pointer_ += pointer_offset;
+    return *this;
+  }
+
+  /// Advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOpCanonical & add_tile_offset(TensorCoord const &tile_offset) {
+    // Tile counts converted to an element-space displacement.
+    MatrixCoord coord_offset(
+      tile_offset.row() * Shape::kRow,
+      tile_offset.column() * Shape::kColumn
+    );
+
+    thread_offset_ += coord_offset;
+
+    pointer_ += layout_({
+      coord_offset.row(),
+      coord_offset.column()
+    });
+
+    return *this;
+  }
+
+  /// Advances in units of whole tiles (operator spelling of add_tile_offset)
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOpCanonical & operator+=(TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  /// Stores a fragment element by element; pointer_offset is in elements
+  CUTLASS_HOST_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int n = 0; n < Policy::OperatorCount::kColumn; ++n) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int a = 0; a < kAccessCount; ++a) {
+        // ptr_idx strides over the lanes of a quad; frag_idx is dense within the fragment
+        int ptr_idx = n * Detail::kLanesInQuad * kAccessCount + pointer_offset + a;
+        int frag_idx = n * kAccessCount + a;
+
+        int col = thread_offset_.column() + n * Detail::kLanesInQuad * Policy::kElementsPerAccess + a;
+        // Skip elements outside extent_ unless the tile is known to be divisible
+        if (divisible_ || (thread_offset_.row() < extent_.row() && col < extent_.column())) {
+          pointer_[ptr_idx] = frag_ptr[frag_idx];
+        }
+      }
+    }
+  }
+
+  /// Stores a fragment with a pointer offset of zero
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+
+  /// Loads a fragment element by element; guarded-out elements of frag are left untouched
+  CUTLASS_HOST_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
+
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int n = 0; n < Policy::OperatorCount::kColumn; ++n) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int a = 0; a < kAccessCount; ++a) {
+        // Same index mapping as store_with_pointer_offset()
+        int ptr_idx = n * Detail::kLanesInQuad * kAccessCount + pointer_offset + a;
+        int frag_idx = n * kAccessCount + a;
+
+        int col = thread_offset_.column() + n * Detail::kLanesInQuad * Policy::kElementsPerAccess + a;
+
+        if (divisible_ || (thread_offset_.row() < extent_.row() && col < extent_.column())) {
+          frag_ptr[frag_idx] = pointer_[ptr_idx];
+        }
+      }
+    }
+  }
+
+  /// Loads a fragment with a pointer offset of zero
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOpCanonical & operator++() {
+    return add_tile_offset({1, 0});  // one tile further along the row dimension
+  }
+
+  /// Set smem base address (no-op: this iterator keeps no base address)
+  CUTLASS_HOST_DEVICE
+  void set_smem_base_address(Index address) {
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h
new file mode 100755
index 000000000..c512dd873
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h
@@ -0,0 +1,1081 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
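+    \brief Warp-level tile iterators for reading and writing 32b accumulator tiles to shared memory in mixed-precision epilogues.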
+*/
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/pitch_linear.h"
+
+#include "cutlass/arch/memory_sm75.h"
+#include "cutlass/epilogue/warp/tensor_op_policy.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// This is an optimization available on CUDA 11.2 and beyond that eliminates branches in the epilogue.
+#define CUTLASS_EPILOGUE_WARP_TILE_ITERATOR_TENSOR_OP_MIXED_OPTIMIZATION_ENABLED ((__CUDACC_VER_MAJOR__ * 10 + __CUDACC_VER_MINOR__) >= 112)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace warp {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Template for reading and writing tiles of accumulators to shared memory. This is optimized
+/// for mixed-precision epilogues in which the accumulators are 32b in width, but the output
+/// data type is smaller. 
+template <
+  typename WarpShape_,            ///< shape of warp-level GEMM (concept: GemmShape)
+  typename OperatorShape_,        ///< matrix multiply operation shape (concept: gemm::GemmShape)
+  typename Element_,              ///< data type of accumulator element
+  int ElementSizeBits,            ///< Size of accumulator element in bits
+  int OutputSizeBits,             ///< Size of output element in bits
+  int OutputElementCount,         ///< number of elements in output vector
+  int ContiguousLanes,            ///< Number of consecutive lanes writing to contiguous memory
+  bool EightBitsOutputOrLess = (OutputSizeBits <= 8)
+>
+class TileIteratorTensorOpMixed {
+public:
+
+  using WarpShape = WarpShape_;
+  using OperatorShape = OperatorShape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+  static int const kOutputElementCount = OutputElementCount;
+
+  using TensorRef = TensorRef<Element, Layout>;         ///< Tensor Reference object
+  using TensorCoord = MatrixCoord;                      ///< Logical coordinate in referenced tensor
+  using Index = typename TensorRef::Index;
+  using LongIndex = typename TensorRef::LongIndex;
+
+  using Policy = TensorOpPolicy<WarpShape, OperatorShape, Layout>;
+
+  /// Shape of the tile in memory
+  using Shape = MatrixShape<
+    Policy::kRowsPerIteration,
+    WarpShape::kN
+  >;
+
+  /// This is the fragment size produced by one access of the iterator.
+  using Fragment = Array<
+    Element, 
+    Policy::OperatorCount::kColumn * Policy::kElementsPerAccess>;
+
+  /// This is the complete warp-level accumulator tile.
+  //using AccumulatorTile = typename Operator::FragmentC;
+
+  /// Number of times this iterator can be incremented
+  static int const kIterations = Policy::kIterations;
+
+  // Internal constants
+  struct Detail {
+    static int const kLanesInQuad = 4;
+
+    /// Number of pointers needed to write accumulators
+    static int const kPointerCount = 
+      (OutputElementCount * sizeof_bits<Element>::value) / (const_min(128, OutputElementCount * sizeof_bits<Element>::value));
+
+    // Currently support max 4 ptr
+    static constexpr int kMaxPointerCount{4};
+
+    static_assert(kPointerCount <= kMaxPointerCount, "Can only accommodate four pointers at present.");
+    static_assert(sizeof(Element) == 4, "This can only be used with 32b accumulator data types (f32, s32).");
+  };
+
+  /// Padding quantity
+  using Padding = MatrixShape<
+    0,
+    Detail::kLanesInQuad * Policy::kElementsPerAccess>;
+
+private:
+
+  /// Storage type for accessing memory
+  using AccessType = AlignedArray<Element, Policy::kElementsPerAccess>;
+
+  //
+  // Data members
+  //
+
+  /// Internal pointer to memory
+  AccessType *pointers_[Detail::kPointerCount] = {nullptr};
+
+  /// Stride in units of AccessType
+  int stride_{0};
+
+  /// Logical column in which warp tile is aligned
+  int warp_column_{0};
+
+public:
+
+  /// Default constructor
+  TileIteratorTensorOpMixed() = default;
+
+  /// Builds the iterator for one lane: each of the kPointerCount pointers starts at this lane's
+  /// quad row of `ref` and at a per-pointer rotated column (in units of AccessType).
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOpMixed(TensorRef const &ref, unsigned lane_id):
+    stride_(ref.stride()[0] / Policy::kElementsPerAccess),
+    warp_column_(0) {
+
+    int const quad_row = (lane_id / Detail::kLanesInQuad);
+    int const quad_lane = (lane_id % Detail::kLanesInQuad);
+
+    // Every pointer shares the same row origin; only the column differs.
+    AccessType *const row_origin = reinterpret_cast<AccessType *>(ref.data()) + quad_row * stride_;
+    int const low_bit = quad_lane % 2;
+    int const pair_idx = quad_lane / 2;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int64_t p = 0; p < Detail::kPointerCount; ++p) {
+      // Rotate the lane's pair index by `p` so the pointers cover distinct column pairs.
+      int column = low_bit + ((pair_idx + p) % Detail::kPointerCount) * 2;
+      pointers_[p] = row_origin + column;
+    }
+  }
+
+  /// Advances every internal pointer by `pointer_offset` elements, converted to AccessType units
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOpMixed & add_pointer_offset(Index pointer_offset) {
+    // Integer division: any remainder smaller than one access is dropped.
+    auto const access_delta = pointer_offset / Policy::kElementsPerAccess;
+    CUTLASS_PRAGMA_UNROLL
+    for (int64_t p = 0; p < Detail::kPointerCount; ++p) {
+      pointers_[p] += access_delta;
+    }
+    return *this;
+  }
+
+  /// Advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOpMixed & add_tile_offset(TensorCoord const &tile_offset) {
+    // Column displacement in elements; also accumulated in warp_column_ for pointer selection.
+    auto const column_elements = tile_offset.column() * Shape::kColumn;
+    auto const access_delta = tile_offset.row() * Shape::kRow * stride_ + column_elements / Policy::kElementsPerAccess;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int64_t p = 0; p < Detail::kPointerCount; ++p) {
+      pointers_[p] += access_delta;
+    }
+    warp_column_ += column_elements;
+    return *this;
+  }
+
+  /// Advances in units of whole tiles along the logical coordinate space of the tensor; forwards to add_tile_offset()
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOpMixed & operator+=(TensorCoord const &tile_offset) {
+    return add_tile_offset(tile_offset);
+  }
+
+  /// Store: writes one AccessType per operator column of `frag` through the selected pointer
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    // `pointer_offset` is in elements; it is converted to whole AccessType units below.
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
+
+    AccessType *ptr = pointers_[0];
+
+#if CUTLASS_EPILOGUE_WARP_TILE_ITERATOR_TENSOR_OP_MIXED_OPTIMIZATION_ENABLED
+
+    // When the optimization is enabled, small tiles require separate logic.
+    bool const kN32_optimization = (WarpShape::kN * Detail::kLanesInQuad * Policy::kElementsPerAccess * sizeof_bits<Element>::value) % 1024 == 0;
+    if (kN32_optimization) {
+      int ptr_idx = ((warp_column_ * sizeof_bits<Element>::value) / 1024) % Detail::kPointerCount;
+      if (ptr_idx == 0) {
+        ptr = pointers_[0];
+      } else if (ptr_idx == 1) {
+        ptr = pointers_[1 % Detail::kPointerCount];
+      } else if (ptr_idx == 2) {
+        ptr = pointers_[2 % Detail::kPointerCount];
+      } else if (ptr_idx == 3) {
+        ptr = pointers_[3 % Detail::kPointerCount];
+      }
+    }
+
+#endif
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int64_t n = 0; n < Policy::OperatorCount::kColumn; ++n) {
+      
+#if CUTLASS_EPILOGUE_WARP_TILE_ITERATOR_TENSOR_OP_MIXED_OPTIMIZATION_ENABLED
+
+      //
+      // When the optimization is enabled, this expression suffices to obtain the SMEM pointer.
+      // The index is reduced modulo kPointerCount so it always stays inside pointers_.
+      if (WarpShape::kN == 64) {
+        ptr = pointers_[(n / 4) % Detail::kPointerCount];
+      }
+      else if (!kN32_optimization)
+#endif
+      {
+        // This is the reference implementation
+        int column_idx = warp_column_ + n * Detail::kLanesInQuad * Policy::kElementsPerAccess;
+        int ptr_idx = ((column_idx * sizeof_bits<Element>::value) / 1024) % Detail::kPointerCount;
+  
+        if (ptr_idx == 0) {
+          ptr = pointers_[0 % Detail::kPointerCount];
+        }
+        else if (ptr_idx == 1) {
+          ptr = pointers_[1 % Detail::kPointerCount];
+        }
+        else if (ptr_idx == 2) {
+          ptr = pointers_[2 % Detail::kPointerCount];
+        }
+        else if (ptr_idx == 3) {
+          ptr = pointers_[3 % Detail::kPointerCount];
+        }
+      }
+
+      int offset = n * Detail::kLanesInQuad + pointer_offset / Policy::kElementsPerAccess;
+      ptr[offset] = frag_ptr[n];
+    }
+  }
+
+  /// Stores the fragment through store_with_pointer_offset() with a pointer offset of zero
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+
+  /// Loads one access per operator column from shared memory into `frag`. The pointer used for
+  /// each column is chosen from its logical column index, in 1024-bit steps modulo kPointerCount.
+  CUTLASS_HOST_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
+    AccessType *dst = reinterpret_cast<AccessType *>(&frag);
+    // Displacement shared by every access, in units of AccessType.
+    auto const access_offset = pointer_offset / Policy::kElementsPerAccess;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int64_t op = 0; op < Policy::OperatorCount::kColumn; ++op) {
+      int logical_column = warp_column_ + op * Detail::kLanesInQuad * Policy::kElementsPerAccess;
+      int which = ((logical_column * sizeof_bits<Element>::value) / 1024) % Detail::kPointerCount;
+      AccessType const *src = pointers_[which];
+      dst[op] = src[op * Detail::kLanesInQuad + access_offset];
+    }
+  }
+
+  /// Loads the fragment through load_with_pointer_offset() with a pointer offset of zero
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    load_with_pointer_offset(frag, 0);
+  }
+  
+  /// Set smem base address (no-op for this iterator: `address` is ignored)
+  CUTLASS_HOST_DEVICE
+  void set_smem_base_address(Index address) {
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for int32_t x 16 => int8_t/int4b_t x 16
+template <
+  typename WarpShape_,            ///< shape of warp-level GEMM (concept: GemmShape)
+  typename OperatorShape_,        ///< matrix multiply operation shape (concept: gemm::GemmShape),
+  int OutputSizeBits              ///< Size of output element in bits
+>
+class TileIteratorTensorOpMixed<WarpShape_, OperatorShape_, int32_t, 32, OutputSizeBits, 16, 8, true> {
+public:
+
+  using WarpShape = WarpShape_;
+  using OperatorShape = OperatorShape_;
+  using Element = int32_t;
+  using Layout = layout::RowMajor;
+  static int const kOutputElementCount = 16;
+
+  using TensorRef = TensorRef<Element, Layout>;         ///< Tensor Reference object
+  using TensorCoord = MatrixCoord;                      ///< Logical coordinate in referenced tensor
+  using Index = typename TensorRef::Index;
+  using LongIndex = typename TensorRef::LongIndex;
+
+  using Policy = TensorOpPolicy<WarpShape, OperatorShape, Layout>;
+
+  /// Shape of the tile in memory
+  using Shape = MatrixShape<
+    Policy::kRowsPerIteration,
+    WarpShape::kN
+  >;
+
+  /// This is the fragment size produced by one access of the iterator.
+  using Fragment = Array<
+    Element, 
+    Policy::OperatorCount::kColumn * Policy::kElementsPerAccess>;
+
+  /// This is the complete warp-level accumulator tile.
+  //using AccumulatorTile = typename Operator::FragmentC;
+
+  /// Number of times this iterator can be incremented
+  static int const kIterations = Policy::kIterations;
+
+  // Internal constants
+  struct Detail {
+    static int const kLanesInQuad = 4;
+
+    /// Number of pointers needed to write accumulators
+    static int const kPointerCount = 2;
+
+    /// Offsets added 
+    static int const kOffsetCount = 4;
+
+    static_assert(sizeof(Element) == 4, "This can only be used with 32b accumulator data types (f32, s32).");
+  };
+
+  /// Padding quantity
+  using Padding = MatrixShape<0, Detail::kLanesInQuad * 2>;
+
+private:
+
+  /// Storage type for accessing memory
+  using AccessType = AlignedArray<Element, 2>;
+
+  //
+  // Data members
+  //
+
+  /// Internal pointer to memory
+  AccessType *pointers_[Detail::kPointerCount] = {nullptr};
+
+  /// Stride in units of AccessType
+  int stride_{0};
+
+  /// Uniform offset in bytes added to warp tile iterator
+  int uniform_offset_[Detail::kOffsetCount] = {0};
+
+public:
+
+  /// Default constructor
+  TileIteratorTensorOpMixed() = default;
+
+  /// Builds the iterator for one lane: both pointers start at this lane's quad row of `ref`,
+  /// at the lane's column and at that column XOR 2; the byte offsets start un-permuted.
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOpMixed(TensorRef const &ref, unsigned lane_id):
+    stride_(ref.stride()[0] / AccessType::kElements) {
+
+    int const quad_row = (lane_id / Detail::kLanesInQuad);
+    int const quad_lane = (lane_id % Detail::kLanesInQuad);
+
+    // Row origin shared by both pointers, in units of AccessType.
+    AccessType *const row_origin = reinterpret_cast<AccessType *>(ref.data()) + quad_row * stride_;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int p = 0; p < Detail::kPointerCount; ++p) {
+      // kPointerCount is 2, so p is 0 or 1 and the XOR mask is 0 or 2.
+      AccessType *lane_ptr = row_origin + (quad_lane ^ (p * 2));
+
+      if (p == 0) {
+        pointers_[0] = lane_ptr;
+      }
+      else if (p == 1) {
+        pointers_[1] = lane_ptr;
+      }
+    }
+
+    // Same values add_tile_offset() produces for a column offset of zero.
+    CUTLASS_PRAGMA_UNROLL
+    for (int k = 0; k < Detail::kOffsetCount; ++k) {
+      uniform_offset_[k] = k * 4 * sizeof(AccessType);
+    }
+  }
+
+  /// Advances both internal pointers by `pointer_offset` elements, converted to AccessType units
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOpMixed & add_pointer_offset(Index pointer_offset) {
+    // Integer division: any remainder smaller than one access is dropped.
+    auto const access_delta = pointer_offset / AccessType::kElements;
+    CUTLASS_PRAGMA_UNROLL
+    for (int64_t p = 0; p < Detail::kPointerCount; ++p) {
+      pointers_[p] += access_delta;
+    }
+    return *this;
+  }
+
+  /// Advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOpMixed & add_tile_offset(TensorCoord const &tile_offset) {
+    auto const tile_column = tile_offset.column();
+    // Pointer displacement in units of AccessType: whole rows of tiles plus the column tiles.
+    int access_delta = tile_offset.row() * Shape::kRow * stride_ +
+      tile_column * Shape::kColumn / AccessType::kElements;
+    pointers_[0] += access_delta;
+    pointers_[1] += access_delta;
+
+    // NOTE(review): offsets are rebuilt from this call's column only, not accumulated - confirm.
+    CUTLASS_PRAGMA_UNROLL
+    for (int k = 0; k < Detail::kOffsetCount; ++k) {
+      uniform_offset_[k] = (k ^ tile_column) * 4 * sizeof(AccessType);
+    }
+    return *this;
+  }
+
+  /// Advances in units of whole tiles along the logical coordinate space of the tensor; forwards to add_tile_offset()
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOpMixed & operator+=(TensorCoord const &tile_offset) {
+    return add_tile_offset(tile_offset);
+  }
+
+  /// Store: writes one 64-bit AccessType per operator column of `frag` to shared memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    // `pointer_offset` is in elements; it is converted to whole AccessType units below.
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int n = 0; n < Policy::OperatorCount::kColumn; ++n) {
+      // Groups of four operator columns alternate between the two pointers.
+      int ptr_idx = (n / 4) % Detail::kPointerCount;
+      int offset_idx = (n % 4);
+      // NOTE(review): beyond eight columns the pointers wrap (was uninitialized) - confirm layout.
+      AccessType *ptr;
+      if (ptr_idx == 0) {
+        ptr = pointers_[0];
+      }
+      else {
+        ptr = pointers_[1];
+      }
+
+      int offset = (n / 4) * 16 + pointer_offset / AccessType::kElements;
+
+#if 0
+      //
+      // Disabled reference path: plain store through the generic pointer
+      //
+      AccessType *smem_ptr = pointers_[ptr_idx];
+      smem_ptr[offset] = frag_ptr[n];
+#else  // Using inline PTX to avoid generic memory
+      uint32_t smem_addr = arch::cutlass_get_smem_pointer(ptr);
+      uint32_t const *data = reinterpret_cast<uint32_t const *>(frag_ptr + n);
+      uint32_t offset_in_bytes = offset * sizeof(AccessType) + uniform_offset_[offset_idx];
+
+      asm volatile(
+        "{ .reg .u32 smem_ptr; add.u32 smem_ptr, %0, %1; st.shared.v2.u32 [smem_ptr], {%2, %3}; }\n"
+        : : "r"(smem_addr), "r"(offset_in_bytes), "r"(data[0]), "r"(data[1])
+      );
+#endif
+    }
+  }
+
+  /// Stores the fragment through store_with_pointer_offset() with a pointer offset of zero
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+
+  /// Set smem base address (no-op for this iterator: `address` is ignored)
+  CUTLASS_HOST_DEVICE
+  void set_smem_base_address(Index address) {
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for int32_t x 8 => int8_t/int4b_t x 8
+template <
+  typename WarpShape_,            ///< shape of warp-level GEMM (concept: GemmShape)
+  typename OperatorShape_,        ///< matrix multiply operation shape (concept: gemm::GemmShape)
+  int OutputSizeBits              ///< Size of output element in bits
+>
+class TileIteratorTensorOpMixed<WarpShape_, OperatorShape_, int32_t, 32, OutputSizeBits, 8, 8, true> {
+public:
+
+  using WarpShape = WarpShape_;
+  using OperatorShape = OperatorShape_;
+  using Element = int32_t;
+  using Layout = layout::RowMajor;
+  static int const kOutputElementCount = 8;
+
+  using TensorRef = TensorRef<Element, Layout>;         ///< Tensor Reference object
+  using TensorCoord = MatrixCoord;                      ///< Logical coordinate in referenced tensor
+  using Index = typename TensorRef::Index;
+  using LongIndex = typename TensorRef::LongIndex;
+
+  using Policy = TensorOpPolicy<WarpShape, OperatorShape, Layout>;
+
+  /// Shape of the tile in memory
+  using Shape = MatrixShape<
+    Policy::kRowsPerIteration,
+    WarpShape::kN
+  >;
+
+  /// This is the fragment size produced by one access of the iterator.
+  using Fragment = Array<
+    Element, 
+    Policy::OperatorCount::kColumn * Policy::kElementsPerAccess>;
+
+  /// This is the complete warp-level accumulator tile.
+  //using AccumulatorTile = typename Operator::FragmentC;
+
+  /// Number of times this iterator can be incremented
+  static int const kIterations = Policy::kIterations;
+
+  // Internal constants
+  struct Detail {
+    static int const kLanesInQuad = 4;
+
+    /// Number of pointers needed to write accumulators
+    static int const kPointerCount = 2;
+
+    static_assert(sizeof(Element) == 4, "This can only be used with 32b accumulator data types (f32, s32).");
+  };
+
+  /// Padding quantity
+  using Padding = MatrixShape<0, Detail::kLanesInQuad * 2>;
+
+private:
+
+  /// Storage type for accessing memory
+  using AccessType = AlignedArray<Element, 2>;
+
+  //
+  // Data members
+  //
+
+  /// Internal pointer to memory
+  AccessType *pointers_[Detail::kPointerCount] = {nullptr};
+
+  /// Stride in units of AccessType
+  int stride_{0};
+
+public:
+
+  /// Default constructor
+  TileIteratorTensorOpMixed() = default;
+
+  /// Builds the iterator for one lane: both pointers start at this lane's quad row of `ref`,
+  /// at the lane's column and at that column XOR 2 (both in units of AccessType).
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOpMixed(TensorRef const &ref, unsigned lane_id):
+    stride_(ref.stride()[0] / AccessType::kElements) {
+
+    int const quad_row = (lane_id / Detail::kLanesInQuad);
+    int const quad_lane = (lane_id % Detail::kLanesInQuad);
+
+    // Row origin shared by both pointers, in units of AccessType.
+    AccessType *const row_origin = reinterpret_cast<AccessType *>(ref.data()) + quad_row * stride_;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int p = 0; p < Detail::kPointerCount; ++p) {
+      // kPointerCount is 2, so p is 0 or 1 and the XOR mask is 0 or 2.
+      int const xor_column = quad_lane ^ (p * 2);
+      AccessType *lane_ptr = row_origin + xor_column;
+
+      if (p == 0) {
+        pointers_[0] = lane_ptr;
+      }
+      else if (p == 1) {
+        pointers_[1] = lane_ptr;
+      }
+    }
+  }
+
+  /// Advances both internal pointers by `pointer_offset` elements, converted to AccessType units
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOpMixed & add_pointer_offset(Index pointer_offset) {
+    // Integer division: any remainder smaller than one access is dropped.
+    auto const access_delta = pointer_offset / AccessType::kElements;
+    CUTLASS_PRAGMA_UNROLL
+    for (int64_t p = 0; p < Detail::kPointerCount; ++p) {
+      pointers_[p] += access_delta;
+    }
+    return *this;
+  }
+
+  /// Advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOpMixed & add_tile_offset(TensorCoord const &tile_offset) {
+    auto const tile_column = tile_offset.column();
+
+    // Pointer displacement in units of AccessType: whole rows of tiles plus the column tiles.
+    int access_delta = tile_offset.row() * Shape::kRow * stride_ +
+      tile_column * Shape::kColumn / AccessType::kElements;
+
+    // An odd column offset exchanges the roles of the two pointers while both are advanced.
+    bool const exchange = (tile_column % 2) != 0;
+    AccessType *const first = pointers_[0] + access_delta;
+    AccessType *const second = pointers_[1] + access_delta;
+
+    pointers_[0] = exchange ? second : first;
+    pointers_[1] = exchange ? first : second;
+    return *this;
+  }
+
+  /// Advances in units of whole tiles along the logical coordinate space of the tensor; forwards to add_tile_offset()
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOpMixed & operator+=(TensorCoord const &tile_offset) {
+    return add_tile_offset(tile_offset);
+  }
+
+  /// Store: writes one 64-bit AccessType per operator column of `frag` to shared memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    // `pointer_offset` is in elements; it is converted to whole AccessType units below.
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int n = 0; n < Policy::OperatorCount::kColumn; ++n) {
+      // Groups of four operator columns alternate between the two pointers.
+      int ptr_idx = (n / 4) % Detail::kPointerCount;
+      // NOTE(review): beyond eight columns the pointers wrap (was uninitialized) - confirm layout.
+      AccessType *ptr;
+      if (ptr_idx == 0) {
+        ptr = pointers_[0];
+      }
+      else {
+        ptr = pointers_[1];
+      }
+
+      int offset = (n / 4) * 16 + pointer_offset / AccessType::kElements + (n % 4) * 4;
+
+#if 0
+      //
+      // Disabled reference path: plain store through the generic pointer
+      //
+      AccessType *smem_ptr = pointers_[ptr_idx];
+      smem_ptr[offset] = frag_ptr[n];
+#else  // Using inline PTX to avoid generic memory
+      uint32_t smem_addr = arch::cutlass_get_smem_pointer(ptr);
+      uint32_t const *data = reinterpret_cast<uint32_t const *>(frag_ptr + n);
+      uint32_t offset_in_bytes = offset * sizeof(AccessType);
+
+      asm volatile(
+        "{ .reg .u32 smem_ptr; add.u32 smem_ptr, %0, %1; st.shared.v2.u32 [smem_ptr], {%2, %3}; }\n"
+        : : "r"(smem_addr), "r"(offset_in_bytes), "r"(data[0]), "r"(data[1])
+      );
+#endif
+    }
+  }
+
+  /// Stores the fragment through store_with_pointer_offset() with a pointer offset of zero
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+
+  /// Set smem base address (no-op for this iterator: `address` is ignored)
+  CUTLASS_HOST_DEVICE
+  void set_smem_base_address(Index address) {
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for float x 16 => float_e4m3_t/float_e5m2_t x 16
+template <
+  typename WarpShape_,            ///< shape of warp-level GEMM (concept: GemmShape)
+  typename OperatorShape_         ///< matrix multiply operation shape (concept: gemm::GemmShape),
+>
+class TileIteratorTensorOpMixed<WarpShape_, OperatorShape_, float, 32, 8, 16, 8> {
+public:
+
+  using WarpShape = WarpShape_;
+  using OperatorShape = OperatorShape_;
+  using Element = float;
+  using Layout = layout::RowMajor;
+  static int const kOutputElementCount = 16;
+
+  using TensorRef = TensorRef<Element, Layout>;         ///< Tensor Reference object
+  using TensorCoord = MatrixCoord;                      ///< Logical coordinate in referenced tensor
+  using Index = typename TensorRef::Index;
+  using LongIndex = typename TensorRef::LongIndex;
+
+  using Policy = TensorOpPolicy<WarpShape, OperatorShape, Layout>;
+
+  /// Shape of the tile in memory
+  using Shape = MatrixShape<
+    Policy::kRowsPerIteration,
+    WarpShape::kN
+  >;
+
+  /// This is the fragment size produced by one access of the iterator.
+  using Fragment = Array<
+    Element,
+    Policy::OperatorCount::kColumn * Policy::kElementsPerAccess>;
+
+  /// This is the complete warp-level accumulator tile.
+  //using AccumulatorTile = typename Operator::FragmentC;
+
+  /// Number of times this iterator can be incremented
+  static int const kIterations = Policy::kIterations;
+
+  // Internal constants
+  struct Detail {
+    static int const kLanesInQuad = 4;
+
+    /// Number of pointers needed to write accumulators
+    static int const kPointerCount = 2;
+
+    /// Offsets added
+    static int const kOffsetCount = 4;
+
+    static_assert(sizeof(Element) == 4, "This can only be used with 32b accumulator data types (f32, s32).");
+  };
+
+  /// Padding quantity
+  using Padding = MatrixShape<0, Detail::kLanesInQuad * 2>;
+
+private:
+
+  /// Storage type for accessing memory
+  using AccessType = AlignedArray<Element, 2>;
+
+  //
+  // Data members
+  //
+
+  /// Internal pointer to memory
+  AccessType *pointers_[Detail::kPointerCount] = {nullptr};
+
+  /// Stride in units of AccessType
+  int stride_{0};
+
+  /// Uniform offset in bytes added to warp tile iterator
+  int uniform_offset_[Detail::kOffsetCount] = {0};
+
+public:
+
+  /// Default constructor
+  TileIteratorTensorOpMixed() = default;
+
+  /// Builds the iterator for one lane: both pointers start at this lane's quad row of `ref`,
+  /// at the lane's column and at that column XOR 2; the byte offsets start un-permuted.
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOpMixed(TensorRef const &ref, unsigned lane_id):
+    stride_(ref.stride()[0] / AccessType::kElements) {
+
+    int const quad_row = (lane_id / Detail::kLanesInQuad);
+    int const quad_lane = (lane_id % Detail::kLanesInQuad);
+
+    // Row origin shared by both pointers, in units of AccessType.
+    AccessType *const row_origin = reinterpret_cast<AccessType *>(ref.data()) + quad_row * stride_;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int p = 0; p < Detail::kPointerCount; ++p) {
+      // kPointerCount is 2, so p is 0 or 1 and the XOR mask is 0 or 2.
+      AccessType *lane_ptr = row_origin + (quad_lane ^ (p * 2));
+
+      if (p == 0) {
+        pointers_[0] = lane_ptr;
+      }
+      else if (p == 1) {
+        pointers_[1] = lane_ptr;
+      }
+    }
+
+    // Same values add_tile_offset() produces for a column offset of zero.
+    CUTLASS_PRAGMA_UNROLL
+    for (int k = 0; k < Detail::kOffsetCount; ++k) {
+      uniform_offset_[k] = k * 4 * sizeof(AccessType);
+    }
+  }
+
+  /// Advances both internal pointers by `pointer_offset` elements, converted to AccessType units
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOpMixed & add_pointer_offset(Index pointer_offset) {
+    // Integer division: any remainder smaller than one access is dropped.
+    auto const access_delta = pointer_offset / AccessType::kElements;
+    CUTLASS_PRAGMA_UNROLL
+    for (int64_t p = 0; p < Detail::kPointerCount; ++p) {
+      pointers_[p] += access_delta;
+    }
+    return *this;
+  }
+
+  /// Advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOpMixed & add_tile_offset(TensorCoord const &tile_offset) {
+    auto const tile_column = tile_offset.column();
+    // Pointer displacement in units of AccessType: whole rows of tiles plus the column tiles.
+    int access_delta = tile_offset.row() * Shape::kRow * stride_ +
+      tile_column * Shape::kColumn / AccessType::kElements;
+    pointers_[0] += access_delta;
+    pointers_[1] += access_delta;
+
+    // NOTE(review): offsets are rebuilt from this call's column only, not accumulated - confirm.
+    CUTLASS_PRAGMA_UNROLL
+    for (int k = 0; k < Detail::kOffsetCount; ++k) {
+      uniform_offset_[k] = (k ^ tile_column) * 4 * sizeof(AccessType);
+    }
+    return *this;
+  }
+
+  /// Advances in units of whole tiles along the logical coordinate space of the tensor; forwards to add_tile_offset()
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOpMixed & operator+=(TensorCoord const &tile_offset) {
+    return add_tile_offset(tile_offset);
+  }
+
+  /// Store: writes one 64-bit AccessType per operator column of `frag` to shared memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    // `pointer_offset` is in elements; it is converted to whole AccessType units below.
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int n = 0; n < Policy::OperatorCount::kColumn; ++n) {
+      // Groups of four operator columns alternate between the two pointers.
+      int ptr_idx = (n / 4) % Detail::kPointerCount;
+      int offset_idx = (n % 4);
+      // NOTE(review): beyond eight columns the pointers wrap (was uninitialized) - confirm layout.
+      AccessType *ptr;
+      if (ptr_idx == 0) {
+        ptr = pointers_[0];
+      }
+      else {
+        ptr = pointers_[1];
+      }
+
+      int offset = (n / 4) * 16 + pointer_offset / AccessType::kElements;
+
+#if 0
+      //
+      // Disabled reference path: plain store through the generic pointer
+      //
+      AccessType *smem_ptr = pointers_[ptr_idx];
+      smem_ptr[offset] = frag_ptr[n];
+#else  // Using inline PTX to avoid generic memory
+      uint32_t smem_addr = arch::cutlass_get_smem_pointer(ptr);
+      uint32_t const *data = reinterpret_cast<uint32_t const *>(frag_ptr + n);
+      uint32_t offset_in_bytes = offset * sizeof(AccessType) + uniform_offset_[offset_idx];
+
+      asm volatile(
+        "{ .reg .u32 smem_ptr; add.u32 smem_ptr, %0, %1; st.shared.v2.u32 [smem_ptr], {%2, %3}; }\n"
+        : : "r"(smem_addr), "r"(offset_in_bytes), "r"(data[0]), "r"(data[1])
+      );
+#endif
+    }
+  }
+
+  /// Stores the fragment through store_with_pointer_offset() with a pointer offset of zero
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for float x 8 => float_e4m3_t/float_e5m2_t x 8
+template <
+  typename WarpShape_,            ///< shape of warp-level GEMM (concept: GemmShape)
+  typename OperatorShape_         ///< matrix multiply operation shape (concept: gemm::GemmShape)
+>
+class TileIteratorTensorOpMixed<WarpShape_, OperatorShape_, float, 32, 8, 8, 8> {
+public:
+
+  using WarpShape = WarpShape_;
+  using OperatorShape = OperatorShape_;
+  using Element = float;
+  using Layout = layout::RowMajor;
+  static int const kOutputElementCount = 8;
+
+  using TensorRef = TensorRef<Element, Layout>;         ///< Tensor Reference object
+  using TensorCoord = MatrixCoord;                      ///< Logical coordinate in referenced tensor
+  using Index = typename TensorRef::Index;
+  using LongIndex = typename TensorRef::LongIndex;
+
+  using Policy = TensorOpPolicy<WarpShape, OperatorShape, Layout>;
+
+  /// Shape of the tile in memory
+  using Shape = MatrixShape<
+    Policy::kRowsPerIteration,
+    WarpShape::kN
+  >;
+
+  /// This is the fragment size produced by one access of the iterator.
+  using Fragment = Array<
+    Element,
+    Policy::OperatorCount::kColumn * Policy::kElementsPerAccess>;
+
+  /// This is the complete warp-level accumulator tile.
+  //using AccumulatorTile = typename Operator::FragmentC;
+
+  /// Number of times this iterator can be incremented
+  static int const kIterations = Policy::kIterations;
+
+  // Internal constants
+  struct Detail {
+    static int const kLanesInQuad = 4;
+
+    /// Number of pointers needed to write accumulators
+    static int const kPointerCount = 2;
+
+    static_assert(sizeof(Element) == 4, "This can only be used with 32b accumulator data types (f32, s32).");
+  };
+
+  /// Padding quantity
+  using Padding = MatrixShape<0, Detail::kLanesInQuad * 2>;
+
+private:
+
+  /// Storage type for accessing memory
+  using AccessType = AlignedArray<Element, 2>;
+
+  //
+  // Data members
+  //
+
+  /// Internal pointer to memory
+  AccessType *pointers_[Detail::kPointerCount] = {nullptr};
+
+  /// Stride in units of AccessType
+  int stride_{0};
+
+public:
+
+  /// Default constructor
+  TileIteratorTensorOpMixed() = default;
+
+  /// Builds the iterator for one lane: both pointers start at this lane's quad row of `ref`,
+  /// at the lane's column and at that column XOR 2 (both in units of AccessType).
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOpMixed(TensorRef const &ref, unsigned lane_id):
+    stride_(ref.stride()[0] / AccessType::kElements) {
+
+    int const quad_row = (lane_id / Detail::kLanesInQuad);
+    int const quad_lane = (lane_id % Detail::kLanesInQuad);
+
+    // Row origin shared by both pointers, in units of AccessType.
+    AccessType *const row_origin = reinterpret_cast<AccessType *>(ref.data()) + quad_row * stride_;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int p = 0; p < Detail::kPointerCount; ++p) {
+      // kPointerCount is 2, so p is 0 or 1 and the XOR mask is 0 or 2.
+      int const xor_column = quad_lane ^ (p * 2);
+      AccessType *lane_ptr = row_origin + xor_column;
+
+      if (p == 0) {
+        pointers_[0] = lane_ptr;
+      }
+      else if (p == 1) {
+        pointers_[1] = lane_ptr;
+      }
+    }
+  }
+
+  /// Advances both internal pointers by `pointer_offset` elements, converted to AccessType units
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOpMixed & add_pointer_offset(Index pointer_offset) {
+    // Integer division: any remainder smaller than one access is dropped.
+    auto const access_delta = pointer_offset / AccessType::kElements;
+    CUTLASS_PRAGMA_UNROLL
+    for (int64_t p = 0; p < Detail::kPointerCount; ++p) {
+      pointers_[p] += access_delta;
+    }
+    return *this;
+  }
+
+  /// Advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOpMixed & add_tile_offset(TensorCoord const &tile_offset) {
+    auto const tile_column = tile_offset.column();
+
+    // Pointer displacement in units of AccessType: whole rows of tiles plus the column tiles.
+    int access_delta = tile_offset.row() * Shape::kRow * stride_ +
+      tile_column * Shape::kColumn / AccessType::kElements;
+
+    // An odd column offset exchanges the roles of the two pointers while both are advanced.
+    bool const exchange = (tile_column % 2) != 0;
+    AccessType *const first = pointers_[0] + access_delta;
+    AccessType *const second = pointers_[1] + access_delta;
+
+    pointers_[0] = exchange ? second : first;
+    pointers_[1] = exchange ? first : second;
+    return *this;
+  }
+
+  /// Advances in units of whole tiles along the logical coordinate space of the tensor; forwards to add_tile_offset()
+  CUTLASS_HOST_DEVICE
+  TileIteratorTensorOpMixed & operator+=(TensorCoord const &tile_offset) {
+    return add_tile_offset(tile_offset);
+  }
+
+  /// Store: writes one 64-bit AccessType per operator column of `frag` to shared memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    // `pointer_offset` is in elements; it is converted to whole AccessType units below.
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int n = 0; n < Policy::OperatorCount::kColumn; ++n) {
+      // Groups of four operator columns alternate between the two pointers.
+      int ptr_idx = (n / 4) % Detail::kPointerCount;
+      // NOTE(review): beyond eight columns the pointers wrap (was uninitialized) - confirm layout.
+      AccessType *ptr;
+      if (ptr_idx == 0) {
+        ptr = pointers_[0];
+      }
+      else {
+        ptr = pointers_[1];
+      }
+
+      int offset = (n / 4) * 16 + pointer_offset / AccessType::kElements + (n % 4) * 4;
+
+#if 0
+      //
+      // Disabled reference path: plain store through the generic pointer
+      //
+      AccessType *smem_ptr = pointers_[ptr_idx];
+      smem_ptr[offset] = frag_ptr[n];
+#else  // Using inline PTX to avoid generic memory
+      uint32_t smem_addr = arch::cutlass_get_smem_pointer(ptr);
+      uint32_t const *data = reinterpret_cast<uint32_t const *>(frag_ptr + n);
+      uint32_t offset_in_bytes = offset * sizeof(AccessType);
+
+      asm volatile(
+        "{ .reg .u32 smem_ptr; add.u32 smem_ptr, %0, %1; st.shared.v2.u32 [smem_ptr], {%2, %3}; }\n"
+        : : "r"(smem_addr), "r"(offset_in_bytes), "r"(data[0]), "r"(data[1])
+      );
+#endif
+    }
+  }
+
+  /// Stores the fragment through store_with_pointer_offset() with a pointer offset of zero
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#undef CUTLASS_EPILOGUE_WARP_TILE_ITERATOR_TENSOR_OP_MIXED_OPTIMIZATION_ENABLED
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_volta_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_volta_tensor_op.h
new file mode 100755
index 000000000..8ce4750c3
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_volta_tensor_op.h
@@ -0,0 +1,440 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Warp-level tile iterators that store Volta TensorOp accumulators (half_t, float) to shared memory.
+*/
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/pitch_linear.h"
+
+#include "cutlass/epilogue/warp/tensor_op_policy.h"
+#include "cutlass/epilogue/warp/volta_tensor_op_policy.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace warp {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Primary template for reading and writing tiles of accumulators to shared memory (declaration only)
+template <
+  typename WarpShape,             ///< shape of warp-level GEMM (concept: gemm::GemmShape)
+  typename InterleavedTileShape,  ///< shape of indivisible instruction-level arrangement (concept: GemmShape)
+  typename ElementC,              ///< Accumulator element type
+  typename Layout                 ///< target shared memory layout
+>
+struct TileIteratorVoltaTensorOp; 
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for half_t accumulators: reads and writes tiles of accumulators to shared memory
+template <
+  typename WarpShape_         ///< shape of warp-level GEMM (concept: gemm::GemmShape)
+>
+struct TileIteratorVoltaTensorOp<WarpShape_, gemm::GemmShape<32, 32, 4>, half_t, layout::RowMajor> {
+public:
+
+  using WarpShape = WarpShape_;
+  using InterleavedTileShape = gemm::GemmShape<32, 32, 4>;
+  using Element = half_t;
+  using Layout = layout::RowMajor;
+
+  using TensorRef = TensorRef<Element, Layout>;         ///< Tensor Reference object
+  using TensorCoord = MatrixCoord;                      ///< Logical coordinate in referenced tensor
+  using Index = typename TensorRef::Index;
+  using LongIndex = typename TensorRef::LongIndex;
+
+  using Policy = VoltaTensorOpPolicy<WarpShape, InterleavedTileShape, Element, Layout>;
+
+  /// Shape of the tile in memory
+  using Shape = MatrixShape<
+    Policy::kRowsPerIteration,
+    WarpShape::kN
+  >;
+
+  /// Array type for aligned memory accesses
+  using AccessType = typename Policy::AccessType;
+
+  /// This is the fragment size produced by one access of the iterator.
+  using Fragment = typename Policy::Fragment;
+
+  /// This is the complete warp-level accumulator tile.
+  using AccumulatorTile = typename Policy::AccumulatorTile;
+
+  /// Number of times this iterator can be incremented
+  static int const kIterations = Policy::kIterations;
+
+  /// Number of elements per access
+  static int const kElementsPerAccess = Policy::kElementsPerAccess;
+
+  // Internal constants
+  struct Detail {
+    static int const kLanesInQuad = 4;
+    static int const kRowsPerQuad = 4;
+    static int const kColumnsPerQuad = 8;
+    static int const kAccessesPerQuad = kColumnsPerQuad / Policy::kElementsPerAccess;
+    static int const kAccessQuadDelta = 16;
+  };
+
+  /// Padding quantity
+  using Padding = MatrixShape<
+    0,
+    Policy::kElementsPerAccess>;
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Internal pointer to memory
+  AccessType *pointer_;
+
+  /// Internal layout object
+  Layout layout_;
+
+public:
+
+  /// Default constructor
+  CUTLASS_HOST_DEVICE
+  TileIteratorVoltaTensorOp(): pointer_(nullptr) { }
+
+  /// Constructor from TensorRef
+  CUTLASS_DEVICE
+  TileIteratorVoltaTensorOp(
+    TensorRef const &ref,
+    unsigned lane_id
+  ):
+    pointer_(reinterpret_cast<AccessType *>(ref.data())),
+    layout_(ref.stride()[0] / Policy::kElementsPerAccess) {
+    // A quad is kLanesInQuad consecutive lanes; locate this lane's quad and its slot within it.
+    int quad_id = lane_id / Detail::kLanesInQuad;
+    int lane_in_quad = (lane_id % Detail::kLanesInQuad);
+    // Bits 2 and 0 of quad_id form the quad's row index; bit 1 is its column index.
+    int quad_row_idx = ((quad_id & 4) >> 1) + (quad_id & 1);
+    int quad_col_idx = ((quad_id & 2) >> 1);
+
+    int row = quad_row_idx * Detail::kRowsPerQuad + lane_in_quad;
+    int column = quad_col_idx * Detail::kColumnsPerQuad;
+    // layout_ is strided in AccessType units, so the element column is converted before use.
+    pointer_ += layout_({row, column / kElementsPerAccess});
+  }
+
+  /// Adds a pointer offset (given in elements; applied in units of AccessType)
+  CUTLASS_HOST_DEVICE
+  TileIteratorVoltaTensorOp & add_pointer_offset(Index pointer_offset) {
+    pointer_ += pointer_offset / Policy::kElementsPerAccess;
+    return *this;
+  }
+
+  /// Advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_HOST_DEVICE
+  TileIteratorVoltaTensorOp & add_tile_offset(TensorCoord const &tile_offset) {
+
+    pointer_ += layout_({
+      tile_offset.row() * Shape::kRow,
+      tile_offset.column() * Shape::kColumn / Policy::kElementsPerAccess});
+
+    return *this;
+  }
+
+  /// Advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_HOST_DEVICE
+  TileIteratorVoltaTensorOp & operator+=(TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  /// Store: writes every access of the fragment at its per-tile / per-quad offset from pointer_
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int tile_idx = 0; tile_idx < Policy::TileIterations::kColumn; ++tile_idx) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int access_idx = 0; access_idx < Policy::kAccessesPerInterleavedTile; ++access_idx) {
+
+        int access_quad = access_idx / 2;
+        int access = access_idx % 2;
+        // All terms are converted from elements to AccessType units before indexing pointer_.
+        int ptr_offset = tile_idx * InterleavedTileShape::kN / Policy::kElementsPerAccess +
+          access_quad * Detail::kAccessQuadDelta / Policy::kElementsPerAccess +
+          access + pointer_offset / Policy::kElementsPerAccess;
+
+        int frag_idx = tile_idx * Policy::kAccessesPerInterleavedTile + access_idx;
+
+        AccessType access_vector = frag_ptr[frag_idx];
+
+        pointer_[ptr_offset] = access_vector;
+      }
+    }
+  }
+
+  /// Store with no additional pointer offset
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+
+  /// Load: reads back the accesses in the arrangement written by store_with_pointer_offset()
+  CUTLASS_HOST_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
+    // frag is written to, so it is taken by mutable reference (a const fragment cannot be filled).
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int tile_idx = 0; tile_idx < Policy::TileIterations::kColumn; ++tile_idx) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int access_idx = 0; access_idx < Policy::kAccessesPerInterleavedTile; ++access_idx) {
+
+        int access_quad = access_idx / 2;
+        int access = access_idx % 2;
+        // Same offset expression as store_with_pointer_offset(), in units of AccessType.
+        int ptr_offset = tile_idx * InterleavedTileShape::kN / Policy::kElementsPerAccess +
+          access_quad * Detail::kAccessQuadDelta / Policy::kElementsPerAccess +
+          access + pointer_offset / Policy::kElementsPerAccess;
+
+        int frag_idx = tile_idx * Policy::kAccessesPerInterleavedTile + access_idx;
+        frag_ptr[frag_idx] = pointer_[ptr_offset];
+      }
+    }
+  }
+
+  /// Load with no additional pointer offset
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Set smem base address (no-op for this iterator)
+  CUTLASS_HOST_DEVICE
+  void set_smem_base_address(Index address) {
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for float accumulators: reads and writes tiles of accumulators to shared memory
+template <
+  typename WarpShape_         ///< shape of warp-level GEMM (concept: gemm::GemmShape)
+>
+struct TileIteratorVoltaTensorOp<WarpShape_, gemm::GemmShape<32, 32, 4>, float, layout::RowMajor> {
+public:
+
+  using WarpShape = WarpShape_;
+  using InterleavedTileShape = gemm::GemmShape<32, 32, 4>;
+  using Element = float;
+  using Layout = layout::RowMajor;
+
+  using TensorRef = TensorRef<Element, Layout>;         ///< Tensor Reference object
+  using TensorCoord = MatrixCoord;                      ///< Logical coordinate in referenced tensor
+  using Index = typename TensorRef::Index;
+  using LongIndex = typename TensorRef::LongIndex;
+
+  using Policy = VoltaTensorOpPolicy<WarpShape, InterleavedTileShape, Element, Layout>;
+
+  /// Shape of the tile in memory
+  using Shape = MatrixShape<
+    Policy::kRowsPerIteration,
+    WarpShape::kN
+  >;
+
+  /// Array type for aligned memory accesses
+  using AccessType = typename Policy::AccessType;
+
+  /// This is the fragment size produced by one access of the iterator.
+  using Fragment = typename Policy::Fragment;
+
+  /// This is the complete warp-level accumulator tile.
+  using AccumulatorTile = typename Policy::AccumulatorTile;
+
+  /// Number of times this iterator can be incremented
+  static int const kIterations = Policy::kIterations;
+
+  /// Number of elements per access
+  static int const kElementsPerAccess = Policy::kElementsPerAccess;
+
+  // Internal constants
+  struct Detail {
+    static int const kLanesInQuad = 4;
+    static int const kRowsPerQuad = 4;
+    static int const kColumnsPerQuad = 8;
+    static int const kAccessesPerQuad = kColumnsPerQuad / Policy::kElementsPerAccess;
+    static int const kAccessQuadDelta = 16;
+  };
+
+  /// Padding quantity
+  using Padding = MatrixShape<
+    0,
+    Policy::kElementsPerAccess>;
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Internal pointer to memory
+  AccessType *pointer_;
+
+  /// Internal layout object
+  Layout layout_;
+
+public:
+
+  /// Default constructor
+  CUTLASS_HOST_DEVICE
+  TileIteratorVoltaTensorOp(): pointer_(nullptr) { }
+
+  /// Constructor from TensorRef
+  CUTLASS_DEVICE
+  TileIteratorVoltaTensorOp(
+    TensorRef const &ref,
+    unsigned lane_id
+  ):
+    pointer_(reinterpret_cast<AccessType *>(ref.data())),
+    layout_(ref.stride()[0] / Policy::kElementsPerAccess) {
+    // A quad is kLanesInQuad consecutive lanes; locate this lane's quad and its slot within it.
+    int quad_id = lane_id / Detail::kLanesInQuad;
+    int lane_in_quad = (lane_id % Detail::kLanesInQuad);
+
+    int const kQuadRowDelta = 4;
+    int const kQuadColumnDelta = 2 * Policy::MmaIterations::kColumn;
+    // Bits 2 and 0 of quad_id select the quad's row offset; bit 1 selects its column offset.
+    int quad_row_offset = ((quad_id & 4) / 2 + (quad_id & 1)) * kQuadRowDelta;
+    int quad_column_offset = (quad_id & 2) / 2 * kQuadColumnDelta;
+    // Bit 0 of lane_in_quad selects the thread's row; bit 1 selects its column.
+    int thread_row_offset = (lane_in_quad & 1);
+    int thread_column_offset = (lane_in_quad & 2) / 2;
+
+    int row = quad_row_offset + thread_row_offset;
+    int column = quad_column_offset + thread_column_offset;
+    // row/column index the AccessType-strided layout directly (no element-to-access conversion).
+    pointer_ += layout_({row, column});
+  }
+
+  /// Adds a pointer offset (given in elements; applied in units of AccessType)
+  CUTLASS_HOST_DEVICE
+  TileIteratorVoltaTensorOp & add_pointer_offset(Index pointer_offset) {
+    pointer_ += pointer_offset / Policy::kElementsPerAccess;
+    return *this;
+  }
+
+  /// Advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_HOST_DEVICE
+  TileIteratorVoltaTensorOp & add_tile_offset(TensorCoord const &tile_offset) {
+
+    pointer_ += layout_({
+      tile_offset.row() * Shape::kRow,
+      tile_offset.column() * Shape::kColumn / Policy::kElementsPerAccess});
+
+    return *this;
+  }
+
+  /// Advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_HOST_DEVICE
+  TileIteratorVoltaTensorOp & operator+=(TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  /// Store: writes kRowsPerMmaTile rows of kAccessesPerRow accesses each from the fragment
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
+
+    int const kAccessesPerRow = Policy::TileIterations::kColumn * Policy::MmaIterations::kColumn * 2;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int row_idx = 0; row_idx < Policy::kRowsPerMmaTile; ++row_idx) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int access_idx = 0; access_idx < kAccessesPerRow; ++access_idx) {
+
+        int frag_idx = row_idx * kAccessesPerRow + access_idx;
+        // Each of the low three bits of access_idx contributes its own column displacement.
+        int ptr_column_offset = (access_idx & 1) * 2 +
+          (access_idx & 2) * Policy::MmaIterations::kColumn * 2 +
+          (access_idx & 4) * Policy::MmaIterations::kColumn * 2;
+
+        int ptr_row_offset = row_idx * 2;
+
+        int ptr_offset = layout_({ptr_row_offset, ptr_column_offset}) + pointer_offset / Policy::kElementsPerAccess;
+
+        pointer_[ptr_offset] = frag_ptr[frag_idx];
+      }
+    }
+  }
+
+  /// Store with no additional pointer offset
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+
+  /// Load (not implemented for float accumulators: always asserts)
+  CUTLASS_HOST_DEVICE
+  void load_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    // Unimplemented stub. The former reinterpret_cast<AccessType *>(&frag) was removed:
+    // it cast away const (ill-formed once instantiated) and its result was never used.
+    (void)frag; (void)pointer_offset;
+    assert(0);
+  }
+
+  /// Load (forwards to the unimplemented load_with_pointer_offset)
+  CUTLASS_HOST_DEVICE
+  void load(Fragment const &frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Set smem base address (no-op for this iterator)
+  CUTLASS_HOST_DEVICE
+  void set_smem_base_address(Index address) {
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_wmma_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_wmma_tensor_op.h
new file mode 100755
index 000000000..951833d4e
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_wmma_tensor_op.h
@@ -0,0 +1,227 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
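+    \brief Warp-level tile iterator that stores and loads WMMA accumulator fragments to and from shared memory.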
+*/
+
+#pragma once
+
+#if !(defined(__clang__) && defined(__CUDA__))
+
+#include "cutlass/cutlass.h"
+#include "cutlass/wmma_array.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/tensor_ref.h"
+
+#include "cutlass/epilogue/warp/wmma_tensor_op_policy.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace warp {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Primary template for reading and writing tiles of accumulators to shared memory (declaration only)
+template <
+  typename WarpShape,           ///< shape of warp-level GEMM (concept: gemm::GemmShape)
+  typename OperatorShape,       ///< matrix multiply operation shape (concept: gemm::GemmShape)
+  typename OperatorFragment,    ///< wmma fragment to be written (concept: nvcuda::wmma::fragment)
+  typename Layout               ///< target shared memory layout
+>
+class TileIteratorWmmaTensorOp;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Row-major specialization: stores and loads WMMA accumulator fragments to and from shared memory
+template <
+  typename WarpShape_,          ///< shape of warp-level GEMM (concept: gemm::GemmShape)
+  typename OperatorShape_,      ///< matrix multiply operation shape (concept: gemm::GemmShape)
+  typename OperatorFragment_    ///< wmma fragment to be written (concept: nvcuda::wmma::fragment)
+>
+class TileIteratorWmmaTensorOp<WarpShape_, OperatorShape_, OperatorFragment_, layout::RowMajor> {
+public:
+
+  using WarpShape = WarpShape_;
+  using OperatorShape = OperatorShape_;
+  using OperatorFragment = OperatorFragment_;
+  using Layout = layout::RowMajor;
+
+  //
+  // Derived types
+  //
+  using WmmaDataType = typename OperatorFragment::element_type;
+  using Element = typename cutlass::arch::WmmaToCutlassDataType<WmmaDataType>::Type; ///< Data type of element stored in nvcuda::wmma::fragment
+  using TensorRef = TensorRef<Element, Layout>;                                      ///< Tensor Reference object
+  using TensorCoord = MatrixCoord;                                                   ///< Logical coordinate in referenced tensor
+  using Index = typename TensorRef::Index;
+  using LongIndex = typename TensorRef::LongIndex;
+
+  using Policy = WmmaTensorOpPolicy<WarpShape, OperatorShape, Layout>;
+
+  /// Shape of the tile in memory
+  using Shape = MatrixShape<
+    Policy::kRowsPerIteration,
+    WarpShape::kN
+  >;
+
+  /// This is the fragment size produced by one access of the iterator.
+  using Fragment = WmmaFragmentArray<OperatorFragment, Policy::OperatorCount::kColumn * Policy::kWmmaFragmentsPerAccess>;
+
+
+  /// This is the complete warp-level accumulator tile.
+  //using AccumulatorTile = typename Operator::FragmentC;
+
+
+  /// Padding quantity
+  // (Epilogue shared memory padding for WMMA Gemm kernel is set to run optimally on Turing)
+  using Padding = MatrixShape<
+    0,
+    4 * Policy::kElementsPerAccess
+  >;
+
+private:
+
+  /// Storage type for accessing memory
+  //using AccessType = AlignedArray<Element, Policy::kElementsPerAccess>;
+
+  //
+  // Data members
+  //
+
+  /// Internal pointer to shared memory
+  TensorRef ref_;
+
+
+public:
+
+  /// Default constructor: the internal reference starts out null
+  CUTLASS_HOST_DEVICE
+  TileIteratorWmmaTensorOp():
+    ref_(nullptr) {
+  }
+
+  /// Constructor from TensorRef (lane_id is accepted but not used by this iterator)
+  CUTLASS_HOST_DEVICE
+  TileIteratorWmmaTensorOp(
+    TensorRef const &ref,
+    unsigned lane_id
+  ): ref_(ref) {
+  }
+
+  /// Adds a pointer offset to the internal reference
+  CUTLASS_HOST_DEVICE
+  TileIteratorWmmaTensorOp & add_pointer_offset(Index pointer_offset) {
+    ref_.add_pointer_offset(pointer_offset);
+    return *this;
+  }
+
+  /// Advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_HOST_DEVICE
+  TileIteratorWmmaTensorOp & add_tile_offset(TensorCoord const &tile_offset) {
+    ref_.add_coord_offset({tile_offset.row() * OperatorShape::kM, tile_offset.column() * WarpShape::kN});
+    return *this;
+  }
+
+  /// Advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_HOST_DEVICE
+  TileIteratorWmmaTensorOp & operator+=(TensorCoord const &tile_offset) {
+    // add_tile_offset() already returns *this, so its result is forwarded directly.
+    return add_tile_offset(tile_offset);
+  }
+
+  /// Store: writes one WMMA fragment per operator column
+  CUTLASS_HOST_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    // Walk the operator columns held by the fragment array.
+    for (int col = 0; col < Policy::OperatorCount::kColumn; ++col) {
+
+      // Start of this operator's column block, displaced by the caller's pointer offset.
+      auto tile_ptr = ref_.data() + ref_.offset({0, col * OperatorShape::kN}) + pointer_offset;
+
+      nvcuda::wmma::store_matrix_sync(
+        reinterpret_cast<WmmaDataType*>(tile_ptr),
+        frag[col],
+        ref_.stride()[0],
+        nvcuda::wmma::layout_t::mem_row_major
+      );
+    }
+  }
+
+  /// Store with no additional pointer offset
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+
+  /// Load: reads one WMMA fragment per operator column
+  CUTLASS_HOST_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
+    // Walk the operator columns held by the fragment array.
+    for (int col = 0; col < Policy::OperatorCount::kColumn; ++col) {
+
+      // Start of this operator's column block, displaced by the caller's pointer offset.
+      auto tile_ptr = ref_.data() + ref_.offset({0, col * OperatorShape::kN}) + pointer_offset;
+
+      nvcuda::wmma::load_matrix_sync(
+        frag[col],
+        reinterpret_cast<WmmaDataType*>(tile_ptr),
+        ref_.stride()[0],
+        nvcuda::wmma::layout_t::mem_row_major
+      );
+    }
+  }
+
+  /// Load with no additional pointer offset
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    load_with_pointer_offset(frag, 0);
+  }
+
+
+  /// Set smem base address (no-op for this iterator)
+  CUTLASS_HOST_DEVICE
+  void set_smem_base_address(Index address) {
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // !(defined(__clang__) && defined(__CUDA__))
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/volta_tensor_op_policy.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/volta_tensor_op_policy.h
new file mode 100755
index 000000000..f6df868e3
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/volta_tensor_op_policy.h
@@ -0,0 +1,195 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines basic structures needed for implementing the warp-scoped phase of the epilogue.
+          These quantities assume a 'column-major' arrangement of TensorOp instructions, of which
+          a row-oriented slice is visible per iteration.
+*/
+
+#pragma once
+
+#include "cutlass/matrix_shape.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/gemm/gemm.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace warp {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Policy details related to the epilogue (primary template, declaration only)
+template <
+  typename WarpShape,             ///< shape of warp-level GEMM (concept: gemm::GemmShape)
+  typename InterleavedTileShape,  ///< shape of indivisible instruction-level arrangement (concept: GemmShape)
+  typename ElementC,              ///< Accumulator element type
+  typename Layout                 ///< target shared memory layout
+>
+struct VoltaTensorOpPolicy; 
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for half_t accumulators and row-major layout
+template <
+  typename WarpShape_          ///< shape of warp-level GEMM (concept: GemmShape)
+>
+struct VoltaTensorOpPolicy<WarpShape_, gemm::GemmShape<32, 32, 4>, half_t, layout::RowMajor> {
+
+  using WarpShape = WarpShape_;
+  using InterleavedTileShape = gemm::GemmShape<32, 32, 4>;
+  using ElementC = half_t;
+  using Layout = layout::RowMajor;
+
+  /// Shape of one warp-level instruction
+  using InstructionShape = gemm::GemmShape<16, 16, 4>;
+
+  /// Number of mma operations performed for one 32x32x4 interleaved tile
+  using MmaIterations = MatrixShape<
+    InterleavedTileShape::kM / InstructionShape::kM,
+    InterleavedTileShape::kN / InstructionShape::kN
+  >;
+
+  /// Number of 32x32x4 interleaved tiles performed to cover the warp-level GEMM shape
+  using TileIterations = MatrixShape<
+    WarpShape::kM / InterleavedTileShape::kM,
+    WarpShape::kN / InterleavedTileShape::kN
+  >;
+
+  /// Number of accumulator elements owned by each thread per Mma
+  static int const kElementsPerMma = 8;
+  static int const kRowsPerIteration = 16;  ///< Number of rows per iteration
+
+  //
+  // Hard-coded constants regarding Tensor Operations
+  //
+
+  /// Number of accumulator elements stored per memory instruction to shared memory
+  static int const kElementsPerAccess = 4;
+  
+  /// Number of accesses performed per interleaved tile
+  static int const kAccessesPerInterleavedTile = 4;
+
+  /// Total number of iterations needed to cover the entire tile
+  static int const kIterations = TileIterations::kRow * 2;
+
+  //
+  // Derived types
+  //
+
+  /// Array type for aligned memory accesses
+  using AccessType = AlignedArray<ElementC, kElementsPerAccess>;
+
+  /// This is the fragment size produced by one access of the iterator.
+  using Fragment = Array<
+    ElementC, 
+    kElementsPerAccess * kAccessesPerInterleavedTile * TileIterations::kColumn>;
+
+  /// This is the complete warp-level accumulator tile.
+  using AccumulatorTile = Array<
+    ElementC, 
+    TileIterations::kCount * MmaIterations::kCount * kElementsPerMma>;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for float accumulators and row-major layout
+template <
+  typename WarpShape_          ///< shape of warp-level GEMM (concept: gemm::GemmShape)
+>
+struct VoltaTensorOpPolicy<WarpShape_, gemm::GemmShape<32, 32, 4>, float, layout::RowMajor> {
+
+  using WarpShape = WarpShape_;
+  using InterleavedTileShape = gemm::GemmShape<32, 32, 4>;
+  using ElementC = float;
+  using Layout = layout::RowMajor;
+
+  /// Shape of one warp-level instruction
+  using InstructionShape = gemm::GemmShape<16, 16, 4>;
+
+  /// Number of mma operations performed for one 32x32x4 interleaved tile
+  using MmaIterations = MatrixShape<
+    InterleavedTileShape::kM / InstructionShape::kM,
+    InterleavedTileShape::kN / InstructionShape::kN
+  >;
+
+  /// Number of 32x32x4 interleaved tiles performed to cover the warp-level GEMM shape
+  using TileIterations = MatrixShape<
+    WarpShape::kM / InterleavedTileShape::kM,
+    WarpShape::kN / InterleavedTileShape::kN
+  >;
+
+  /// Number of accumulator elements owned by each thread per Mma
+  static int const kElementsPerMma = 8;
+  static int const kRowsPerIteration = 16;  ///< Number of rows per iteration
+
+  //
+  // Hard-coded constants regarding Tensor Operations
+  //
+
+  /// Number of accumulator elements stored per memory instruction to shared memory
+  static int const kElementsPerAccess = 2;
+  
+  /// Number of accesses performed per interleaved tile
+  static int const kAccessesPerInterleavedTile = 8;
+
+  /// Number of rows per interleaved tile
+  static int const kRowsPerMmaTile = 2;
+
+  /// Total number of iterations needed to cover the entire tile
+  static int const kIterations = TileIterations::kRow * MmaIterations::kRow;
+
+  //
+  // Derived types
+  //
+  
+  /// Array type for aligned memory accesses
+  using AccessType = AlignedArray<ElementC, kElementsPerAccess>;
+
+  /// This is the fragment size produced by one access of the iterator.
+  using Fragment = Array<
+    ElementC, 
+    kElementsPerAccess * kAccessesPerInterleavedTile * TileIterations::kColumn>;
+
+  /// This is the complete warp-level accumulator tile.
+  using AccumulatorTile = Array<
+    ElementC, 
+    TileIterations::kCount * MmaIterations::kCount * kElementsPerMma>;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/wmma_tensor_op_policy.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/wmma_tensor_op_policy.h
new file mode 100755
index 000000000..a09c1f792
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/wmma_tensor_op_policy.h
@@ -0,0 +1,101 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines basic structures needed for implementing the warp-scoped phase of the epilogue.
+          These quantities assume a 'column-major' arrangement of TensorOp instructions, of which
+          a row-oriented slice is visible per iteration.
+*/
+
+#pragma once
+
+#include "cutlass/arch/wmma.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/layout/matrix.h"
+
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace warp {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Policy details related to the epilogue (declaration only; defined through partial specialization on Layout)
+template <
+  typename WarpShape,     ///< shape of warp-level GEMM (concept: MatrixShape)
+  typename OperatorShape, ///< matrix multiply operation shape (concept: gemm::GemmShape)
+  typename Layout         ///< target shared memory layout
+>
+struct WmmaTensorOpPolicy; 
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for a row-major target layout
+template <
+  typename WarpShape,           ///< shape of warp-level GEMM (must expose kM and kN, which are read below)
+  typename OperatorShape        ///< matrix multiply operation shape (concept: gemm::GemmShape)
+>
+struct WmmaTensorOpPolicy<WarpShape, OperatorShape, layout::RowMajor> {
+
+  /// Number of operations: warp extent divided by operator extent along M and N (integer division)
+  using OperatorCount = MatrixShape<
+    WarpShape::kM / OperatorShape::kM,
+    WarpShape::kN / OperatorShape::kN
+  >;
+
+  //
+  // Hard-coded constants regarding Tensor Operations
+  //
+  static int const kElementsPerAccess = 2;
+  static int const kRowsPerIteration = OperatorShape::kM;  // one operator's M extent per iteration
+  static int const kWmmaFragmentsPerAccess = 1;
+
+  //
+  // Derived quantities
+  //
+
+  // Number of externally visible iterations
+  static int const kIterations = OperatorCount::kRow;
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
+
+#endif
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/fast_math.h b/lightllm-kernel/cutlass/include/cutlass/fast_math.h
new file mode 100755
index 000000000..fa3873c5e
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/fast_math.h
@@ -0,0 +1,1067 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/cstdint>
+#else
+#include <cstdint>
+#include <cmath>
+#include <type_traits>
+#endif
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/uint128.h"
+#include "cutlass/coord.h"
+#include "cutlass/half.h"
+
+/**
+ * \file
+ * \brief Math utilities
+ */
+
+namespace cutlass {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+CUTLASS_HOST_DEVICE void swap(T &lhs, T &rhs) {
+  T tmp = lhs;
+  lhs = rhs;
+  rhs = tmp;
+}
+
+/******************************************************************************
+ * Static math utilities
+ ******************************************************************************/
+
+/// Mixed precision dot product: adds LongIndex(coord[n]) * stride[n] for each n onto acc and returns the total
+template <typename Index, typename LongIndex, int N>
+CUTLASS_HOST_DEVICE LongIndex dot(
+  Coord<N, Index> const &coord,
+  Coord<N, LongIndex> const &stride,
+  LongIndex acc = LongIndex()) {
+
+  CUTLASS_PRAGMA_UNROLL
+  for (int n = 0; n < N; ++n) {
+    acc += LongIndex(coord[n]) * stride[n];
+  }
+  return acc;
+}
+
+/**
+ * Statically determine if N is a power-of-two (the bit test also yields true for N == 0)
+ */
+template <int N>
+struct is_pow2 {
+  static bool const value = ((N & (N - 1)) == 0);
+};
+
+/**
+ * Statically determine log2(N), rounded down (N must be >= 1 so the recursion reaches CurrentVal == 1)
+ */
+template <int N, int CurrentVal = N, int Count = 0>
+struct log2_down {
+  /// Static logarithm value: recurse with CurrentVal halved and Count incremented
+  enum { value = log2_down<N, (CurrentVal >> 1), Count + 1>::value };
+};
+
+// Base case: CurrentVal has been shifted down to 1, so Count is the result
+template <int N, int Count>
+struct log2_down<N, 1, Count> {
+  enum { value = Count };
+};
+
+/**
+ * Statically determine log2(N), rounded up (N must be >= 1 so the recursion reaches CurrentVal == 1)
+ */
+template <int N, int CurrentVal = N, int Count = 0>
+struct log2_up {
+  /// Static logarithm value: recurse with CurrentVal halved and Count incremented
+  enum { value = log2_up<N, (CurrentVal >> 1), Count + 1>::value };
+};
+
+// Base case: Count shifts were needed to reach 1; add one more when 2^Count is still below N
+template <int N, int Count>
+struct log2_up<N, 1, Count> {
+  enum { value = ((1 << Count) < N) ? Count + 1 : Count };
+};
+
+/**
+ * Statically estimate sqrt(N) to the nearest power-of-two
+ */
+template <int N>
+struct sqrt_est {
+  enum { value = 1 << (log2_up<N>::value / 2) };
+};
+
+/**
+ * For performing a constant-division with a compile-time assertion that the
+ * Divisor evenly-divides the Dividend.
+ */
+template <int Dividend, int Divisor>
+struct divide_assert {
+  enum { value = Dividend / Divisor };
+
+  static_assert((Dividend % Divisor == 0), "Not an even multiple");
+};
+
+/******************************************************************************
+ * Rounding
+ ******************************************************************************/
+
+/**
+ * Round dividend up to the next multiple of divisor (holds for dividend >= 0 and divisor > 0)
+ */
+template <typename dividend_t, typename divisor_t>
+CUTLASS_HOST_DEVICE
+CUTLASS_CONSTEXPR_IF_CXX17
+dividend_t round_nearest(dividend_t dividend, divisor_t divisor) {
+  return ((dividend + divisor - 1) / divisor) * divisor;
+}
+
+template <typename value_t>
+CUTLASS_HOST_DEVICE
+CUTLASS_CONSTEXPR_IF_CXX17
+value_t abs_for_integer(value_t a) {
+  return ((a > 0) ? a : -a);
+}
+/**
+ * Greatest common divisor (iterative remainder loop; the result is passed through abs_for_integer)
+ */
+template <typename value_t>
+CUTLASS_HOST_DEVICE
+CUTLASS_CONSTEXPR_IF_CXX17
+value_t gcd(value_t a, value_t b) {
+  for (;;) {
+    if (a == 0) return cutlass::abs_for_integer(b);
+    b %= a;
+    if (b == 0) return cutlass::abs_for_integer(a);
+    a %= b;
+  }
+}
+
+/**
+ * Least common multiple: |a| / gcd(a, b) * |b|, or value_t{} when gcd(a, b) is 0
+ */
+template <typename value_t>
+CUTLASS_HOST_DEVICE
+CUTLASS_CONSTEXPR_IF_CXX17
+value_t lcm(value_t a, value_t b) {
+  value_t temp = cutlass::gcd(a, b);
+  return (temp != 0) ? value_t(cutlass::abs_for_integer(a) / temp * cutlass::abs_for_integer(b)) : value_t{};
+}
+
+/**
+ * Greatest common divisor, written as a single recursive return expression
+ */
+template <typename value_t>
+CUTLASS_HOST_DEVICE
+CUTLASS_CONSTEXPR_IF_CXX17
+value_t gcd_cxx11(value_t a, value_t b) {
+  return (a == 0 || b == 0) ? cutlass::abs_for_integer(a | b) : cutlass::gcd_cxx11(b, a % b);
+}
+
+/**
+ * Least common multiple as a single return expression; value_t{} when gcd_cxx11(a, b) is 0
+ */
+template <typename value_t>
+CUTLASS_HOST_DEVICE
+CUTLASS_CONSTEXPR_IF_CXX17
+value_t lcm_cxx11(value_t a, value_t b) {
+  return cutlass::gcd_cxx11(a, b) ? (cutlass::abs_for_integer(a) / cutlass::gcd_cxx11(a, b) *
+                                    cutlass::abs_for_integer(b))
+                                  : value_t{};
+}
+
+/// Returns the smallest value in the half-open range [a, a+b) that is a multiple of b
+CUTLASS_HOST_DEVICE
+CUTLASS_CONSTEXPR_IF_CXX17
+int round_up(int a, int b) {
+  return ((a + b - 1) / b) * b;
+}
+
+/// Returns the ceiling of (a / b)
+CUTLASS_HOST_DEVICE
+CUTLASS_CONSTEXPR_IF_CXX17
+int ceil_div(int a, int b) {
+  return (a + b - 1) / b;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+ * Runtime counterparts of log2_up/log2_down: those take N as a template argument,
+ * while clz and find_log2 take an ordinary function argument. clz scans bit positions
+ * 31 down to 0 and returns 31 - i for the highest set one, or 32 when none is set.
+ */
+template <typename value_t>
+CUTLASS_HOST_DEVICE
+CUTLASS_CONSTEXPR_IF_CXX17
+value_t clz(value_t x) {
+  for (int i = 31; i >= 0; --i) {
+    if ((1 << i) & x)
+      return value_t(31 - i);
+  }
+  return value_t(32);
+}
+
+template <typename value_t>
+CUTLASS_HOST_DEVICE
+CUTLASS_CONSTEXPR_IF_CXX17
+value_t find_log2(value_t x) {
+  int a = int(31 - clz(x));  // bit index of the highest set bit (-1 when clz returns 32)
+  a += (x & (x - 1)) != 0;  // Round up, add 1 if not a power of 2.
+  return a;
+}
+
+
+/**
+ * Compute the multiplier (mul) and shift (shr) for division by denom via find_log2; denom == 1 yields 0 and 0
+ */
+CUTLASS_HOST_DEVICE
+CUTLASS_CONSTEXPR_IF_CXX17
+void find_divisor(unsigned int& mul, unsigned int& shr, unsigned int denom) {
+  if (denom == 1) {
+    mul = 0;
+    shr = 0;
+  } else {
+    unsigned int p = 31 + find_log2(denom);
+    unsigned m = unsigned(((1ull << p) + unsigned(denom) - 1) / unsigned(denom));
+
+    mul = m;
+    shr = p - 32;
+  }
+}
+
+/**
+ * Find quotient and remainder from a precomputed mul/shr pair (device: __umulhi; host: 64-bit multiply)
+ */
+CUTLASS_HOST_DEVICE
+CUTLASS_CONSTEXPR_IF_CXX17
+void fast_divmod(int& quo, int& rem, int src, int div, unsigned int mul, unsigned int shr) {
+
+  #if defined(__CUDA_ARCH__)
+  // Use IMUL.HI if div != 1, else simply copy the source.
+  quo = (div != 1) ? __umulhi(src, mul) >> shr : src;
+  #else
+  quo = int((div != 1) ? int(((int64_t)src * mul) >> 32) >> shr : src);
+  #endif
+
+  // The remainder.
+  rem = src - (quo * div);
+}
+
+// Overload for a 64-bit source and remainder; the quotient is still written to an int
+CUTLASS_HOST_DEVICE
+CUTLASS_CONSTEXPR_IF_CXX17
+void fast_divmod(int& quo, int64_t& rem, int64_t src, int div, unsigned int mul, unsigned int shr) {
+
+  #if defined(__CUDA_ARCH__)
+  // Use IMUL.HI if div != 1, else simply copy the source.
+  quo = (div != 1) ? __umulhi(src, mul) >> shr : src;
+  #else
+  quo = int((div != 1) ? ((src * mul) >> 32) >> shr : src);
+  #endif
+  // The remainder.
+  rem = src - (quo * div);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Object to encapsulate the fast division+modulus operation.
+///
+/// This object precomputes two values used to accelerate the computation and is best used
+/// when the divisor is a grid-invariant. In this case, it may be computed in host code and
+/// marshalled along other kernel arguments using the 'Params' pattern.
+///
+/// Example:
+///
+///
+///   int quotient, remainder, dividend, divisor;
+///
+///   FastDivmod divmod(divisor);
+///
+///   divmod(quotient, remainder, dividend);
+///
+///   // quotient = (dividend / divisor)
+///   // remainder = (dividend % divisor)
+///
+struct FastDivmod {
+  using value_div_type = int;
+  using value_mod_type = int64_t;
+  int32_t divisor = 1;
+  uint32_t multiplier = 0u;
+  uint32_t shift_right = 0u;
+
+  // Find quotient and remainder from the stored multiplier/shift (device: __umulhi; host: 64-bit multiply)
+  CUTLASS_HOST_DEVICE
+  void fast_divmod(int& quotient, int& remainder, int dividend) const {
+
+#if defined(__CUDA_ARCH__)
+    // Use IMUL.HI if divisor != 1, else simply copy the source.
+    quotient = (divisor != 1) ? __umulhi(dividend, multiplier) >> shift_right : dividend;
+#else
+    quotient = int((divisor != 1) ? int(((int64_t)dividend * multiplier) >> 32) >> shift_right : dividend);
+#endif
+
+    // The remainder.
+    remainder = dividend - (quotient * divisor);
+  }
+
+  /// Overload for a 64-bit dividend and remainder; the quotient is still written to an int
+  CUTLASS_HOST_DEVICE
+  void fast_divmod(int& quotient, int64_t& remainder, int64_t dividend) const {
+
+#if defined(__CUDA_ARCH__)
+    // Use IMUL.HI if divisor != 1, else simply copy the source.
+    quotient = (divisor != 1) ? __umulhi(dividend, multiplier) >> shift_right : dividend;
+#else
+    quotient = int((divisor != 1) ? ((dividend * multiplier) >> 32) >> shift_right : dividend);
+#endif
+    // The remainder.
+    remainder = dividend - (quotient * divisor);
+  }
+
+
+  /// Default constructor: keeps the member initializers (divisor = 1, multiplier = 0, shift_right = 0).
+  constexpr FastDivmod() = default;
+
+  /// Construct the FastDivmod object, in host code ideally.
+  ///
+  /// This precomputes some values based on the divisor and is computationally expensive.
+  CUTLASS_HOST_DEVICE
+  FastDivmod(int divisor_): divisor(divisor_) {
+    assert(divisor_ >= 0);  // NOTE(review): 0 passes this check, but the division below then divides by zero
+    if (divisor != 1) {
+      unsigned int p = 31 + find_log2(divisor);
+      unsigned m = unsigned(((1ull << p) + unsigned(divisor) - 1) / unsigned(divisor));
+
+      multiplier = m;
+      shift_right = p - 32;
+    }
+  }
+
+  /// Computes integer division and modulus using precomputed values. This is computationally
+  /// inexpensive.
+  CUTLASS_HOST_DEVICE
+  void operator()(int &quotient, int &remainder, int dividend) const {
+    fast_divmod(quotient, remainder, dividend);
+  }
+
+  /// Computes integer division using precomputed values. This is computationally
+  /// inexpensive.
+  CUTLASS_HOST_DEVICE
+  int div(int dividend) const {
+    int quotient, remainder;
+    fast_divmod(quotient, remainder, dividend);
+    return quotient;
+  }
+
+  /// Alias for `div` to match the interface of FastDivmodU64
+  CUTLASS_HOST_DEVICE
+  int divide(int dividend) const {
+    return div(dividend);
+  }
+
+  /// Computes integer division and modulus using precomputed values. This is computationally
+  /// inexpensive.
+  ///
+  /// Returns the quotient; the remainder is written to the reference argument
+  CUTLASS_HOST_DEVICE
+  int divmod(int &remainder, int dividend) const {
+    int quotient;
+    fast_divmod(quotient, remainder, dividend);
+    return quotient;
+  }
+
+  /// Computes integer division and modulus using precomputed values. This is computationally
+  /// inexpensive. (Overload for a 64-bit dividend and remainder.)
+  CUTLASS_HOST_DEVICE
+  void operator()(int &quotient, int64_t &remainder, int64_t dividend) const {
+    fast_divmod(quotient, remainder, dividend);
+  }
+
+  /// Computes integer division and modulus using precomputed values. This is computationally
+  /// inexpensive. (Overload for a 64-bit dividend and remainder; returns the quotient.)
+  CUTLASS_HOST_DEVICE
+  int divmod(int64_t &remainder, int64_t dividend) const {
+    int quotient;
+    fast_divmod(quotient, remainder, dividend);
+    return quotient;
+  }
+
+  /// Returns the divisor when cast to integer
+  CUTLASS_HOST_DEVICE
+  operator int() const { return divisor; }
+
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Object to encapsulate the fast division+modulus operation for 64b integer division.
+///
+/// This object precomputes two values used to accelerate the computation and is best used
+/// when the divisor is a grid-invariant. In this case, it may be computed in host code and
+/// marshalled along other kernel arguments using the 'Params' pattern.
+///
+/// Example:
+///
+///
+///   uint64_t quotient, remainder, dividend, divisor;
+///
+///   FastDivmodU64 divmod(divisor);
+///
+///   divmod(quotient, remainder, dividend);
+///
+///   // quotient = (dividend / divisor)
+///   // remainder = (dividend % divisor)
+///
+struct FastDivmodU64 {
+
+  uint64_t divisor;
+  uint64_t multiplier;
+  unsigned int shift_right;
+  unsigned int round_up;
+
+  //
+  // Static methods
+  //
+
+  /// Computes b, where 2^b is the greatest power of two that is less than or equal to x
+  CUTLASS_HOST_DEVICE
+  static uint32_t integer_log2(uint64_t x) {
+    uint32_t n = 0;
+    while (x >>= 1) {
+      ++n;
+    }
+    return n;
+  }
+
+  /// Default ctor: all members zero (including divisor)
+  CUTLASS_HOST_DEVICE
+  FastDivmodU64(): divisor(0), multiplier(0), shift_right(0), round_up(0) { }
+
+  /// Construct the FastDivmodU64 object, in host code ideally.
+  ///
+  /// This precomputes some values based on the divisor and is computationally expensive.
+  CUTLASS_HOST_DEVICE
+  FastDivmodU64(uint64_t divisor_): divisor(divisor_), multiplier(1), shift_right(0), round_up(0) {
+
+    if (divisor) {
+      shift_right = integer_log2(divisor);
+
+      if ((divisor & (divisor - 1)) == 0) {  // power of two: multiplier 0 makes divide() shift only
+        multiplier = 0;
+      }
+      else {
+        uint64_t power_of_two = (uint64_t(1) << shift_right);
+        uint64_t multiplier_lo = uint128_t(0, power_of_two) / divisor;
+        multiplier = uint128_t(power_of_two, power_of_two) / divisor;
+        round_up = (multiplier_lo == multiplier ? 1 : 0);
+      }
+    }
+  }
+
+  /// Returns the quotient of floor(dividend / divisor)
+  CUTLASS_HOST_DEVICE
+  uint64_t divide(uint64_t dividend) const {
+    uint64_t quotient = 0;
+
+    #ifdef __CUDA_ARCH__
+      uint64_t x = dividend;
+      if (multiplier) {
+        x = __umul64hi(dividend + round_up, multiplier);
+      }
+      quotient = (x >> shift_right);
+    #else
+      quotient = dividend / divisor;  // host path: plain 64-bit division, so divisor must be non-zero
+    #endif
+
+    return quotient;
+  }
+
+  /// Computes the remainder given a computed quotient and dividend
+  CUTLASS_HOST_DEVICE
+  uint64_t modulus(uint64_t quotient, uint64_t dividend) const {
+    return dividend - quotient * divisor;
+  }
+
+  /// Returns the quotient of floor(dividend / divisor) and computes the remainder
+  CUTLASS_HOST_DEVICE
+  uint64_t divmod(uint64_t &remainder, uint64_t dividend) const {
+    uint64_t quotient = divide(dividend);
+    remainder = modulus(quotient, dividend);
+    return quotient;
+  }
+
+  /// Computes integer division and modulus using precomputed values. This is computationally
+  /// inexpensive.
+  CUTLASS_HOST_DEVICE
+  void operator()(uint64_t &quotient, uint64_t &remainder, uint64_t dividend) const {
+    quotient = divmod(remainder, dividend);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Object to encapsulate the fast division+modulus operation for 64b integer division
+/// in which the divisor is a power of two.
+struct FastDivmodU64Pow2 {
+
+  uint64_t divisor;
+  unsigned int shift_right;
+
+  /// Default ctor: divisor and shift_right are both zero
+  CUTLASS_HOST_DEVICE
+  FastDivmodU64Pow2(): divisor(0), shift_right(0) { }
+
+  /// Construct the FastDivmodU64Pow2 object.
+  ///
+  /// Stores the divisor and precomputes shift_right = FastDivmodU64::integer_log2(divisor_).
+  CUTLASS_HOST_DEVICE
+  FastDivmodU64Pow2(uint64_t divisor_): divisor(divisor_), shift_right(FastDivmodU64::integer_log2(divisor_)) { }
+
+  /// Returns the quotient of floor(dividend / divisor), computed as a right shift
+  CUTLASS_HOST_DEVICE
+  uint64_t divide(uint64_t dividend) const {
+    return dividend >> shift_right;
+  }
+
+  /// Computes the remainder of the dividend by masking with (divisor - 1)
+  CUTLASS_HOST_DEVICE
+  uint64_t modulus(uint64_t dividend) const {
+    // See https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#division-modulo-operations
+    return dividend & (divisor - 1);
+  }
+
+  /// Returns the quotient of floor(dividend / divisor) and computes the remainder
+  CUTLASS_HOST_DEVICE
+  uint64_t divmod(uint64_t &remainder, uint64_t dividend) const {
+    uint64_t quotient = divide(dividend);
+    remainder = modulus(dividend);
+    return quotient;
+  }
+
+  /// Computes integer division and modulus using precomputed values. This is computationally
+  /// inexpensive.
+  CUTLASS_HOST_DEVICE
+  void operator()(uint64_t &quotient, uint64_t &remainder, uint64_t dividend) const {
+    quotient = divmod(remainder, dividend);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Computes the coordinate decomposition from a linear index (64-bit linear index => coord<int32_t>)
+///
+/// This decomposition is accelerated by the FastDivmodU64 object. It is assumed that
+/// a coordinate of <Rank> indices can be decomposed by <Rank - 1> div/mod operations.
+/// Note: it is assumed that element divmod[0] divides by extent[1].
+///
+/// For example, assume 4-D coordinate (n, p, q, c) is mapped to a linear index `npqc`. This
+/// can be decomposed via three divide and modulus operations:
+///
+///      c = npqc % C;         |  divmod[2] = FastDivmodU64(C)
+///    npq = npqc / C;         |   coord[3] = c
+///
+///      q =  npq % Q;         |  divmod[1] = FastDivmodU64(Q)
+///     np =  npq / Q;         |   coord[2] = q
+///
+///      p =   np % P;         |  divmod[0] = FastDivmodU64(P)
+///      n =   np / P;         |   coord[1] = p
+///
+///                            |   coord[0] = n
+///
+template <int Rank>
+CUTLASS_HOST_DEVICE Coord<Rank> CoordinateDecomposition(
+  uint64_t linear_idx,                    ///< Linear index to decompose
+  FastDivmodU64 const *divmod) {          ///< Pointer to array of Rank-1 FastDivmodU64 objects
+
+  static_assert(Rank > 0, "CoordinateDecomposition requires Rank=1 or greater.");
+
+  Coord<Rank> coord;
+
+  CUTLASS_PRAGMA_UNROLL
+  for (int i = Rank; i > 1; --i) {
+    uint64_t remainder;
+    linear_idx = divmod[i - 2].divmod(remainder, linear_idx);
+    coord[i - 1] = int(remainder);
+  }
+
+  coord[0] = int(linear_idx);
+
+  return coord;
+}
+
+/// Computes the coordinate decomposition from a linear index (32-bit linear index => coord<int32_t>)
+template <int Rank>
+CUTLASS_HOST_DEVICE Coord<Rank> CoordinateDecomposition(
+  int linear_idx,                    ///< Linear index to decompose
+  FastDivmod const *divmod) {          ///< Pointer to array of Rank-1 FastDivmod objects
+
+  static_assert(Rank > 0, "CoordinateDecomposition requires Rank=1 or greater.");
+
+  Coord<Rank> coord;
+
+  CUTLASS_PRAGMA_UNROLL
+  for (int i = Rank; i > 1; --i) {
+    int remainder;
+    linear_idx = divmod[i - 2].divmod(remainder, linear_idx);
+    coord[i - 1] = int(remainder);
+  }
+
+  coord[0] = int(linear_idx);
+
+  return coord;
+}
+
+template <int Rank>
+CUTLASS_HOST_DEVICE Coord<Rank> CoordinateDecompositionLittleEndian(
+  uint64_t linear_idx,                    ///< Linear index to decompose
+  FastDivmodU64 const *divmod) {          ///< Pointer to array of Rank-1 FastDivmodU64 objects
+
+  static_assert(Rank > 0, "CoordinateDecomposition requires Rank=1 or greater.");
+
+  Coord<Rank> coord;
+
+  CUTLASS_PRAGMA_UNROLL
+  for (int i = 0; i < Rank - 1; ++i) {  // coord[0] is extracted first, using divmod[0]
+    uint64_t remainder;
+    linear_idx = divmod[i].divmod(remainder, linear_idx);
+    coord[i] = int(remainder);
+  }
+
+  coord[Rank - 1] = int(linear_idx);  // what is left after Rank - 1 divisions
+
+  return coord;
+}
+
+/// Computes the coordinate decomposition from a linear index, filling coord[0] first (32-bit linear index => coord<int32_t>)
+template <int Rank>
+CUTLASS_HOST_DEVICE Coord<Rank> CoordinateDecompositionLittleEndian(
+  int linear_idx,                    ///< Linear index to decompose
+  FastDivmod const *divmod) {          ///< Pointer to array of Rank-1 FastDivmod objects
+
+  static_assert(Rank > 0, "CoordinateDecomposition requires Rank=1 or greater.");
+
+  Coord<Rank> coord;
+
+  CUTLASS_PRAGMA_UNROLL
+  for (int i = 0; i < Rank - 1; ++i) {
+    int remainder;
+    linear_idx = divmod[i].divmod(remainder, linear_idx);
+    coord[i] = int(remainder);
+  }
+
+  coord[Rank - 1] = int(linear_idx);
+
+  return coord;
+}
+
+/// Safely computes the offset of a linear index in bytes for all types (sub-byte types: index / elements-per-byte)
+template <typename Element>
+CUTLASS_HOST_DEVICE int64_t OffsetBytes(int64_t index) {
+
+  static_assert(
+    (sizeof_bits<Element>::value >= 8 && !(sizeof_bits<Element>::value % 8)) ||
+    (sizeof_bits<Element>::value <  8 && !(8 % sizeof_bits<Element>::value)),
+    "Size of numeric type in bits must either be divisible by 8 bits, or 8 bits must be divisible by the size.");
+
+  if (sizeof_bits<Element>::value >= 8) {
+    return index * (sizeof_bits<Element>::value / 8);
+  }
+  else {
+    int const kElementsPerByte = ((8 / sizeof_bits<Element>::value) + ((sizeof_bits<Element>::value >= 8) ? 1 : 0));
+    return index / kElementsPerByte;
+  }
+}
+
+CUTLASS_HOST_DEVICE int64_t OffsetBytes(int64_t index, int64_t element_sizeof_bits) {
+  if (element_sizeof_bits >= 8) {  // element size given at run time; this branch assumes whole bytes
+    return index * (element_sizeof_bits / 8);
+  }
+  else {  // sub-byte sizes; element_sizeof_bits == 0 would divide by zero below
+    int64_t const kElementsPerByte = ((8 / element_sizeof_bits) + ((element_sizeof_bits >= 8) ? 1 : 0));
+    return index / kElementsPerByte;
+  }
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Min/Max
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <int A, int B>
+struct Min {
+  static int const kValue = (A < B) ? A : B;
+};
+
+template <int A, int B>
+struct Max {
+  static int const kValue = (A > B) ? A : B;
+};
+
+CUTLASS_HOST_DEVICE
+CUTLASS_CONSTEXPR_IF_CXX17 int const_min(int a, int b) {
+    return (b < a ? b : a);
+}
+
+CUTLASS_HOST_DEVICE
+CUTLASS_CONSTEXPR_IF_CXX17 int const_max(int a, int b) {
+    return (b > a ? b : a);
+}
+
+template <typename T>
+CUTLASS_HOST_DEVICE
+T fast_min(T a, T b) {
+  return (b < a ? b : a);
+}
+
+template <>
+CUTLASS_HOST_DEVICE
+float fast_min(float a, float b) {
+  return fminf(a, b);
+}
+
+template <typename T>
+CUTLASS_HOST_DEVICE
+T fast_max(T a, T b) {
+  return (a < b ? b : a);
+}
+
+template <>
+CUTLASS_HOST_DEVICE
+float fast_max(float a, float b) {
+  return fmaxf(a, b);
+}
+
+CUTLASS_HOST_DEVICE
+float fast_cos(float theta) {
+  #if defined(__CUDA_ARCH__)
+  return ::cosf(theta);
+  #else
+  return std::cos(theta);
+  #endif
+}
+
+CUTLASS_HOST_DEVICE
+double fast_cos(double theta) {
+  #if defined(__CUDA_ARCH__)
+  return ::cos(theta);
+  #else
+  return std::cos(theta);
+  #endif
+}
+
+CUTLASS_HOST_DEVICE
+float fast_sin(float theta) {
+  #if defined(__CUDA_ARCH__)
+  return ::sinf(theta);
+  #else
+  return std::sin(theta);
+  #endif
+}
+
+CUTLASS_HOST_DEVICE
+double fast_sin(double theta) {
+  #if defined(__CUDA_ARCH__)
+  return ::sin(theta);
+  #else
+  return std::sin(theta);
+  #endif
+}
+
+CUTLASS_HOST_DEVICE
+float fast_acos(float theta) {
+  #if defined(__CUDA_ARCH__)
+  return ::acosf(theta);
+  #else
+  return std::acos(theta);
+  #endif
+}
+
+CUTLASS_HOST_DEVICE
+double fast_acos(double theta) {
+  #if defined(__CUDA_ARCH__)
+  return ::acos(theta);
+  #else
+  return std::acos(theta);
+  #endif
+}
+
+CUTLASS_HOST_DEVICE
+float fast_asin(float theta) {
+  #if defined(__CUDA_ARCH__)
+  return ::asinf(theta);
+  #else
+  return std::asin(theta);
+  #endif
+}
+
+CUTLASS_HOST_DEVICE
+double fast_asin(double theta) {
+  #if defined(__CUDA_ARCH__)
+  return ::asin(theta);
+  #else
+  return std::asin(theta);
+  #endif
+}
+
+CUTLASS_HOST_DEVICE
+float fast_sqrt(float theta) {
+  #if defined(__CUDA_ARCH__)
+  return ::sqrtf(theta);
+  #else
+  return std::sqrt(theta);
+  #endif
+}
+
+CUTLASS_HOST_DEVICE
+double fast_sqrt(double theta) {
+  #if defined(__CUDA_ARCH__)
+  return ::sqrt(theta);
+  #else
+  return std::sqrt(theta);
+  #endif
+}
+
+CUTLASS_HOST_DEVICE
+float fast_exp(float x) {
+  #if defined(__CUDA_ARCH__)
+  return ::expf(x);
+  #else
+  return std::exp(x);
+  #endif
+}
+
+CUTLASS_HOST_DEVICE
+double fast_exp(double x) {
+  #if defined(__CUDA_ARCH__)
+  return ::exp(x);
+  #else
+  return std::exp(x);
+  #endif
+}
+
+CUTLASS_HOST_DEVICE
+half_t fast_exp(half_t x) {
+  #if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 10) && (__CUDA_ARCH__ >= 750)
+      return (half_t)(::hexp(x.to_half()));
+  #else
+      return (half_t)(fast_exp(float(x)));
+  #endif
+}
+
+CUTLASS_HOST_DEVICE
+float fast_log(float x) {
+  #if defined(__CUDA_ARCH__)
+  return ::logf(x);
+  #else
+  return std::log(x);
+  #endif
+}
+
+CUTLASS_HOST_DEVICE
+double fast_log(double x) {
+  #if defined(__CUDA_ARCH__)
+  return ::log(x);
+  #else
+  return std::log(x);
+  #endif
+}
+
+CUTLASS_HOST_DEVICE
+float fast_tanh(float x) {
+  #if defined(__CUDA_ARCH__)
+    #if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDA_ARCH__ >= 750)
+      float y;
+      asm volatile ( "tanh.approx.f32 %0, %1; " : "=f"(y) : "f"(x));
+      return y;
+    #else
+      return ::tanhf(x);
+    #endif
+  #else
+  return std::tanh(x);
+  #endif
+}
+
+CUTLASS_HOST_DEVICE
+double fast_tanh(double x) {
+  #if defined(__CUDA_ARCH__)
+  return ::tanh(x);
+  #else
+  return std::tanh(x);
+  #endif
+}
+
+CUTLASS_HOST_DEVICE
+half_t fast_tanh(half_t x) {
+  #if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 11) && (__CUDA_ARCH__ >= 750)
+
+  asm volatile ( "tanh.approx.f16 %0, %1;" : "=h"(x.raw()) : "h"(x.raw()));
+  return x;
+
+  #else
+  return half_t(fast_tanh(float(x)));
+  #endif
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct fast_exp_op {
+  CUTLASS_HOST_DEVICE
+  T operator()(T const &rhs) const {
+    return fast_exp(rhs);
+  }
+};
+
+#if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 10) && (__CUDA_ARCH__ >= 750)
+template <int N>
+struct fast_exp_op<Array<half_t, N>> {
+  CUTLASS_DEVICE
+  Array<half_t, N> operator()(Array<half_t, N> const &rhs) const {
+
+    Array<half_t, N> result;
+
+    // process elements two at a time by viewing the arrays as __half2 and calling h2exp
+    __half2 const *in  = reinterpret_cast<__half2 const *>(&rhs);
+    __half2 *out = reinterpret_cast<__half2 *>(&result);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 2; ++i) {
+      out[i] = ::h2exp(in[i]);
+    }
+
+    // residual: for odd N the last element is handled on its own with hexp
+    if (N % 2) {
+      half_t last = rhs[N - 1];
+      result[N - 1] = half_t(::hexp(last.to_half()));
+    }
+
+    return result;
+  }
+};
+#endif // #if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 10) && (__CUDA_ARCH__ >= 750)
+
+template <typename T, int N>
+struct fast_exp_op<Array<T, N>> {
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &rhs) const {
+
+    fast_exp_op<T> fast_op;
+    Array<T, N> y;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      y[i] = fast_op(rhs[i]);
+    }
+
+    return y;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct fast_tanh_op {
+  CUTLASS_HOST_DEVICE
+  T operator()(T const &rhs) const {
+    return fast_tanh(rhs);
+  }
+};
+
+#if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 11) && (__CUDA_ARCH__ >= 750)
+template <int N>
+struct fast_tanh_op<Array<half_t, N>> {
+  CUTLASS_DEVICE
+  Array<half_t, N> operator()(Array<half_t, N> const &rhs) const {
+
+    Array<half_t, N> result;
+
+    // process elements two at a time: each 32-bit word is passed to tanh.approx.f16x2
+    uint32_t const *in  = reinterpret_cast<uint32_t const *>(&rhs);
+    uint32_t *out = reinterpret_cast<uint32_t *>(&result);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 2; ++i) {
+      asm volatile ("tanh.approx.f16x2 %0, %1;" : "=r"(out[i]) : "r"(in[i]));
+    }
+
+    // residual: for odd N the last 16-bit element is handled on its own with tanh.approx.f16
+    if (N % 2) {
+      uint16_t const *in = reinterpret_cast<uint16_t const *>(&rhs);
+      uint16_t *out = reinterpret_cast<uint16_t *>(&result);
+      asm volatile ("tanh.approx.f16 %0, %1;" : "=h"(out[N - 1]) : "h"(in[N - 1]));
+    }
+
+    return result;
+  }
+};
+#endif // #if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 11) && (__CUDA_ARCH__ >= 750)
+
+template <typename T, int N>
+struct fast_tanh_op<Array<T, N>> {
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &rhs) const {
+
+    fast_tanh_op<T> fast_op;
+    Array<T, N> y;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      y[i] = fast_op(rhs[i]);
+    }
+
+    return y;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Absolute value function
+template <typename T>
+CUTLASS_HOST_DEVICE
+T absolute_value(T x) {
+  if (x < T()) {
+    return -x;
+  }
+  return x;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/float8.h b/lightllm-kernel/cutlass/include/cutlass/float8.h
new file mode 100755
index 000000000..38ea4008c
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/float8.h
@@ -0,0 +1,1284 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*!
+    \file
+    \brief Defines classes for using 8-bit floating-point types (FP8 E4M3 and E5M2) in host or
+      device code.
+*/
+
+#pragma once
+
+// FP8 types are available starting CUDA 11.8+
+#if (__CUDACC_VER_MAJOR__ >= 12) || ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 8))
+#define CUDA_FP8_ENABLED 1
+#endif
+
+#if defined(__CUDA_ARCH__)
+#  if (__CUDA_ARCH__ >= 900)
+#    if (__CUDACC_VER_MAJOR__ >= 12) || ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 8))
+#      define CUDA_PTX_FP8_CVT_ENABLED 1
+#    endif // (__CUDACC_VER_MAJOR__ >= 12) || ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 8))
+#  elif (__CUDA_ARCH__ == 890)
+#    if (__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 1))
+#      define CUDA_PTX_FP8_CVT_ENABLED 1
+#    endif // (__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 1))
+#  endif // (__CUDA_ARCH__ >= 900)
+#endif // defined(__CUDA_ARCH__)
+
+#ifdef __GNUC__
+// Ignore checks on reinterpret-casts that are being used for bitcasts.
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#endif
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__CUDACC_RTC__)
+
+#include "cutlass/floating_point_nvrtc.h"
+
+#else
+//
+// Standard Library headers belong here to avoid conflicts with NVRTC.
+//
+#include <cmath>
+#include <limits>
+#include <cstdint>
+#include <cstring>
+#endif
+
+#ifdef CUDA_FP8_ENABLED
+#include <cuda_fp8.h>
+#endif
+#include <cuda_fp16.h>
+
+#include "cutlass/cutlass.h"
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  FP8 has two possible encodings, E4M3 and E5M2 (bit layout below: sign | exponent | mantissa)
+//
+//  E4M3 : 7  |  6 5 4 3  |  2 1 0
+//  E5M2 : 7  |  6 5 4 3 2  |  1 0
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+enum class FloatEncoding {
+    E4M3,  // 1 sign bit, 4 exponent bits, 3 mantissa bits
+    E5M2   // 1 sign bit, 5 exponent bits, 2 mantissa bits
+};
+
+/// Storage byte, format constants and software FP32 <-> FP8 conversions shared by both FP8 encodings.
+template<FloatEncoding T>
+struct alignas(1) float8_base {
+    static constexpr bool IS_E4M3 = (T == FloatEncoding::E4M3);
+    static constexpr bool IS_E5M2 = (T == FloatEncoding::E5M2);
+
+    // Bit widths, special bit patterns, exponent ranges and biases of the FP32, FP16 and FP8 formats
+    static constexpr int FP32_NUM_BITS = 32;
+    static constexpr int FP32_NUM_EXPONENT_BITS = 8;
+    static constexpr int FP32_NUM_MANTISSA_BITS = 23;
+    static constexpr uint32_t FP32_NAN = 0x7fffffff;
+    static constexpr uint32_t FP32_INFINITY_MASK = 0x7f800000;
+    static constexpr int FP32_MAX_EXPONENT  =  127;
+    static constexpr int FP32_MIN_EXPONENT  = -126;
+    static constexpr int FP32_EXPONENT_BIAS =  127;
+
+    static constexpr int FP16_NUM_BITS = 16;
+    static constexpr int FP16_NUM_EXPONENT_BITS = 5;
+    static constexpr int FP16_NUM_MANTISSA_BITS = 10;
+    static constexpr uint16_t FP16_NAN = 0x7fff;
+    static constexpr uint16_t FP16_INFINITY_MASK = 0x7c00;
+    static constexpr int FP16_MAX_EXPONENT  = 15;
+    static constexpr int FP16_MIN_EXPONENT  = -14;
+    static constexpr int FP16_EXPONENT_BIAS = 15;
+
+    static constexpr int FP8_NUM_BITS = 8;
+    static constexpr int FP8_NUM_EXPONENT_BITS = IS_E4M3 ? 4 : 5;
+    static constexpr int FP8_NUM_MANTISSA_BITS = IS_E4M3 ? 3 : 2;
+    static constexpr uint8_t  FP8_NAN = 0x7f; // All exponent and mantissa bits set
+    static constexpr uint8_t  FP8_INFINITY_MASK = IS_E4M3 ? 0x78 : 0x7c; // Exponent field all ones, mantissa zero
+    static constexpr int FP8_MAX_EXPONENT  = IS_E4M3 ?  7 :  15;
+    static constexpr int FP8_MIN_EXPONENT  = IS_E4M3 ? -6 : -14;
+    static constexpr int FP8_EXPONENT_BIAS = IS_E4M3 ?  7 :  15;
+
+    static constexpr uint8_t  FP8_EXPONENT_MASK = (1 << FP8_NUM_EXPONENT_BITS) - 1;
+    static constexpr uint8_t  FP8_MANTISSA_MASK = (1 << FP8_NUM_MANTISSA_BITS) - 1;
+
+    static constexpr uint8_t FP8_MAX_FLT = (IS_E4M3 ? 0x7e : 0x7b); // Value returned on saturation by convert_float_to_fp8
+
+    // 256 in float
+    static constexpr uint32_t FP8_SAT_VAL_FP32 = 0x43800000;
+
+    //
+    // Data members
+    //
+
+    /// Raw 8-bit encoding
+    uint8_t storage;
+
+    /// Default constructor: zero-initializes the storage
+    CUTLASS_HOST_DEVICE
+    float8_base() : storage(0) { }
+
+    /// Returns true when flt is neither infinite nor NaN (FP32 exponent field not all ones)
+    CUTLASS_HOST_DEVICE
+    static bool isfinite(float flt) {
+        uint32_t s;
+
+        #if defined(__CUDA_ARCH__)
+        s = reinterpret_cast<uint32_t const &>(flt);
+        #else
+        std::memcpy(&s, &flt, sizeof(s));
+        #endif
+
+        return (s & 0x7f800000) < 0x7f800000;
+    }
+
+    /// Returns true when flt is NaN (magnitude bits greater than the infinity pattern)
+    CUTLASS_HOST_DEVICE
+    static bool isnan(float flt) {
+        uint32_t s;
+
+        #if defined(__CUDA_ARCH__)
+        s = reinterpret_cast<uint32_t const &>(flt);
+        #else
+        std::memcpy(&s, &flt, sizeof(s));
+        #endif
+
+        return (s & 0x7fffffff) > 0x7f800000;
+    }
+
+    /// Returns true when flt is +infinity or -infinity
+    CUTLASS_HOST_DEVICE
+    static bool isinf(float flt) {
+        uint32_t s;
+
+        #if defined(__CUDA_ARCH__)
+        s = reinterpret_cast<uint32_t const &>(flt);
+        #else
+        std::memcpy(&s, &flt, sizeof(s));
+        #endif
+
+        // Sign = 0 for +inf, 1 for -inf
+        // Exponent = all ones
+        // Mantissa = all zeros
+        return (s == 0x7f800000) || (s == 0xff800000);
+    }
+
+    /// FP32 -> FP8 conversion in software - rounds to nearest even, saturates to FP8_MAX_FLT, NaN => 0x7f
+    CUTLASS_HOST_DEVICE
+    static uint8_t convert_float_to_fp8(float const& flt) {
+
+        // Reinterpret the float as its raw 32-bit pattern
+        uint32_t s;
+
+        #if defined(__CUDA_ARCH__)
+        s = reinterpret_cast<uint32_t const &>(flt);
+        #else
+        std::memcpy(&s, &flt, sizeof(s));
+        #endif
+
+        // Extract the bits in the FP32 type (the sign is moved directly to bit 7 of the FP8 byte)
+        uint8_t sign = uint8_t((s >> 24 & 0x80));
+        int32_t exp = int32_t((s >> FP32_NUM_MANTISSA_BITS) & 0xff) - FP32_EXPONENT_BIAS;
+        int mantissa = s & 0x7fffff;
+        uint8_t u = 0;
+
+        uint8_t const kF8_NaN = 0x7f;
+
+        // NaN => NaN
+        if (isnan(flt)) {
+            return kF8_NaN;
+        }
+
+        // Inf => MAX_FLT (satfinite)
+        if (isinf(flt)) {
+            return sign | FP8_MAX_FLT;
+        }
+
+        // Defensive check kept from the original implementation
+        if (exp == -128) {
+            // exp is an int32_t in [-127, 128] at this point, so this value does not occur;
+            // an all-ones exponent field (Inf / NaN) has already been handled above.
+
+            // satfinite
+            return (sign | FP8_MAX_FLT);
+        }
+
+        int sticky_bit = 0;
+
+        bool skip_sign = false;
+        bool may_be_nan = false;
+
+        if ( (exp >= FP8_MIN_EXPONENT) && (exp <= FP8_MAX_EXPONENT) ) {
+            // normal fp32 to normal fp8
+            exp = exp + FP8_EXPONENT_BIAS;
+            u = uint8_t((uint32_t(exp) & FP8_EXPONENT_MASK) << FP8_NUM_MANTISSA_BITS);
+            u = uint8_t(u | (mantissa >> (FP32_NUM_MANTISSA_BITS - FP8_NUM_MANTISSA_BITS)));
+        } else if(exp < FP8_MIN_EXPONENT) {
+            // normal single-precision to subnormal float8-precision representation
+            int rshift = (FP8_MIN_EXPONENT - exp);
+            if (rshift < FP32_NUM_BITS) {
+                mantissa |= (1 << FP32_NUM_MANTISSA_BITS);
+                // Unsigned mask: rshift can reach 31, where (1 << 31) - 1 would overflow a signed int
+                sticky_bit = ((uint32_t(mantissa) & ((1u << rshift) - 1u)) != 0);
+
+                mantissa = (mantissa >> rshift);
+                u = (uint8_t(mantissa >> (FP32_NUM_MANTISSA_BITS - FP8_NUM_MANTISSA_BITS)) & FP8_MANTISSA_MASK);
+            } else {
+                mantissa = 0;
+                u = 0;
+            }
+        // Exponent > FP8_MAX_EXPONENT - this is a special case done to match HW
+        // 0x4380_0000 to 0x43e0_0000 - maps from 256 to 448, and does not saturate / inf.
+        } else {
+            if( exp == (FP8_MAX_EXPONENT + 1) ) {
+                uint8_t mantissa_tmp = uint8_t(mantissa >> (FP32_NUM_MANTISSA_BITS - FP8_NUM_MANTISSA_BITS));
+                if( mantissa_tmp < FP8_MANTISSA_MASK) {
+                    exp = exp + FP8_EXPONENT_BIAS;
+                    u = uint8_t(uint32_t(exp) << FP8_NUM_MANTISSA_BITS) | mantissa_tmp;
+                    may_be_nan =  (mantissa_tmp == (FP8_MANTISSA_MASK-1));
+                } else {
+                    // satfinite
+                    return (sign | FP8_MAX_FLT);
+                }
+            } else{
+                // satfinite
+                return (sign | FP8_MAX_FLT);
+            }
+        }
+
+        // round to nearest even
+        int NUM_BITS_SHIFT = FP32_NUM_MANTISSA_BITS - (FP8_NUM_MANTISSA_BITS + 1);
+        int round_bit = ((mantissa >> NUM_BITS_SHIFT) & 1);
+        sticky_bit |= ((mantissa & ((1 << NUM_BITS_SHIFT) - 1)) != 0);
+
+        if ((round_bit && sticky_bit) || (round_bit && (u & 1))) {
+            u = uint8_t(u + 1);
+            if( may_be_nan ) {
+                skip_sign = true;
+            }
+        }
+
+        if (u > FP8_MAX_FLT) {
+            // satfinite
+            u = (sign | FP8_MAX_FLT);
+        }
+
+        if( ! skip_sign ) {
+            u |= sign;
+        }
+
+        return u;
+    }
+
+
+    /// Converts a fp8 value stored as a uint8_t to a float (every FP8 value is exactly representable)
+    CUTLASS_HOST_DEVICE
+    static float convert_fp8_to_float(uint8_t const& x) {
+
+        uint32_t constexpr kF32_NaN = 0x7fffffff;
+
+        uint8_t const &f8 = x;
+        uint32_t sign = (f8 >> (FP8_NUM_BITS - 1)) & 1;
+        uint32_t exp = (f8 >> FP8_NUM_MANTISSA_BITS) & FP8_EXPONENT_MASK;
+        uint32_t mantissa = f8 & FP8_MANTISSA_MASK;
+        unsigned f = (sign << (FP32_NUM_BITS-1));
+
+        if (IS_E4M3 && exp == 15 && mantissa == 0x7) {
+            f = kF32_NaN;
+        }
+        else if (exp > 0 && (IS_E4M3 || exp < (FP8_MAX_EXPONENT + FP8_EXPONENT_BIAS + 1))) {
+            // normal
+            exp += (FP32_EXPONENT_BIAS - FP8_EXPONENT_BIAS);
+            f = f |
+                (exp << FP32_NUM_MANTISSA_BITS) |
+                (mantissa << (FP32_NUM_MANTISSA_BITS-FP8_NUM_MANTISSA_BITS));
+        } else if (exp == 0) {
+            if (mantissa) {
+                // subnormal
+                exp += (FP32_EXPONENT_BIAS - FP8_EXPONENT_BIAS) + 1;
+                while ((mantissa & (1 << FP8_NUM_MANTISSA_BITS)) == 0) {
+                    mantissa <<= 1;
+                    exp--;
+                }
+                mantissa &= FP8_MANTISSA_MASK;
+                f = f |
+                    (exp << FP32_NUM_MANTISSA_BITS) |
+                    (mantissa << (FP32_NUM_MANTISSA_BITS-FP8_NUM_MANTISSA_BITS));
+            } else {
+                // sign-preserving zero
+            }
+        } else {
+            if(mantissa == 0){
+                // Sign-preserving infinity
+                f = (f | 0x7f800000);
+            } else {
+                // Canonical NaN
+                f = kF32_NaN;
+            }
+        }
+
+        #if defined(__CUDA_ARCH__)
+        return reinterpret_cast<float const&>(f);
+        #else
+        float flt;
+        std::memcpy(&flt, &f, sizeof(flt));
+        return flt;
+        #endif
+    }
+};
+
+
+// Forward declaration of float_e5m2_t to define float_e4m3_t <=> float_e5m2_t
+// conversions in class float_e4m3_t
+struct float_e5m2_t;
+
+
+///////////////////////////////////////////////////////////////
+///
+/// floating-point 8 type : E4M3
+///
+///////////////////////////////////////////////////////////////
+struct alignas(1) float_e4m3_t : float8_base<FloatEncoding::E4M3> {
+
+    using Base = float8_base<FloatEncoding::E4M3>;
+
+    static constexpr int MAX_EXPONENT = Base::FP8_MAX_EXPONENT;
+
+    //
+    // Static conversion operators
+    //
+
+    /// Constructs from an uint8_t holding the raw E4M3 bit pattern (no numeric conversion)
+    CUTLASS_HOST_DEVICE
+    static float_e4m3_t bitcast(uint8_t x) {
+        float_e4m3_t f;
+        f.storage = x;
+        return f;
+    }
+
+    /// FP32 -> E4M3 conversion - rounds to nearest even
+    CUTLASS_HOST_DEVICE
+    static float_e4m3_t from_float(float const& flt) {
+    #if defined(CUDA_PTX_FP8_CVT_ENABLED)
+        uint16_t tmp;
+        float y = float();
+        asm volatile("cvt.rn.satfinite.e4m3x2.f32 %0, %1, %2;" : "=h"(tmp) : "f"(y), "f"(flt));
+
+        return *reinterpret_cast<float_e4m3_t *>(&tmp);
+    #else
+        return bitcast(Base::convert_float_to_fp8(flt));
+    #endif
+    }
+
+    /// FP16 -> E4M3 conversion - rounds to nearest even
+    CUTLASS_HOST_DEVICE
+    static float_e4m3_t from_half(half const& flt) {
+    #if defined(CUDA_PTX_FP8_CVT_ENABLED)
+        uint16_t tmp = 0;
+        uint32_t bits = reinterpret_cast<uint16_t const &>(flt);
+        asm volatile("cvt.rn.satfinite.e4m3x2.f16x2 %0, %1;" : "=h"(tmp) : "r"(bits));
+
+        return *reinterpret_cast<float_e4m3_t *>(&tmp);
+    #else
+        return bitcast(Base::convert_float_to_fp8(__half2float(flt)));
+    #endif
+    }
+
+    // E4M3 -> half
+    CUTLASS_HOST_DEVICE
+    static half to_half(float_e4m3_t const& x) {
+    #if defined(CUDA_PTX_FP8_CVT_ENABLED)
+        uint16_t bits = x.storage;
+        uint32_t packed;
+        asm volatile("cvt.rn.f16x2.e4m3x2 %0, %1;\n" : "=r"(packed) : "h"(bits));
+
+        return reinterpret_cast<half2 const &>(packed).x;
+    #else
+        return __float2half(Base::convert_fp8_to_float(x.storage));
+    #endif
+    }
+
+    // E4M3 -> Float (the PTX path converts to half first and then widens to float)
+    CUTLASS_HOST_DEVICE
+    static float to_float(float_e4m3_t const& x) {
+    #if defined(CUDA_PTX_FP8_CVT_ENABLED)
+        uint16_t bits = x.storage;
+        uint32_t packed;
+        asm volatile("cvt.rn.f16x2.e4m3x2 %0, %1;\n" : "=r"(packed) : "h"(bits));
+
+        return __half2float(reinterpret_cast<half2 const &>(packed).x);
+    #else
+        return Base::convert_fp8_to_float(x.storage);
+    #endif
+    }
+
+    //
+    // Methods
+    //
+
+    /// Constructor inheritance
+    using Base::Base;
+
+    /// Default constructor
+    float_e4m3_t() = default;
+
+#ifdef CUDA_FP8_ENABLED
+    /// Conversion from CUDA's FP8 type (copies the raw byte)
+    CUTLASS_HOST_DEVICE
+    explicit float_e4m3_t(__nv_fp8_e4m3 x) {
+        storage = x.__x;
+    }
+#endif
+
+    /// Floating point conversion
+    CUTLASS_HOST_DEVICE
+    explicit float_e4m3_t(float x) {
+        storage = from_float(x).storage;
+    }
+
+    CUTLASS_HOST_DEVICE
+    explicit float_e4m3_t(half x) {
+        storage = from_half(x).storage;
+    }
+
+    /// Floating point conversion (narrows to float first)
+    CUTLASS_HOST_DEVICE
+    explicit float_e4m3_t(double x): float_e4m3_t(float(x)) {
+    }
+
+    /// Integer conversion (goes through float)
+    CUTLASS_HOST_DEVICE
+    explicit float_e4m3_t(int x): float_e4m3_t(float(x)) {
+    }
+
+    CUTLASS_HOST_DEVICE
+    explicit float_e4m3_t(unsigned x): float_e4m3_t(float(x)) {
+    }
+
+    /// E5M2 conversion. Defined after float_e5m2_t is defined.
+    CUTLASS_HOST_DEVICE
+    explicit float_e4m3_t(float_e5m2_t x);
+
+#ifdef CUDA_FP8_ENABLED
+    /// Assignment from CUDA's FP8 type (copies the raw byte)
+    CUTLASS_HOST_DEVICE
+    float_e4m3_t & operator=(__nv_fp8_e4m3 x) {
+        storage = x.__x;
+        return *this;
+    }
+#endif
+
+    /// Converts to float
+    CUTLASS_HOST_DEVICE
+    operator float() const {
+        return to_float(*this);
+    }
+
+    /// Converts to half
+    CUTLASS_HOST_DEVICE
+    operator half() const {
+        return to_half(*this);
+    }
+
+    /// Converts to double (by widening the float value)
+    CUTLASS_HOST_DEVICE
+    explicit operator double() const {
+        return double(to_float(*this));
+    }
+
+    /// Converts to int; note the device path uses __half2int_rn while the host path uses int(float)
+    CUTLASS_HOST_DEVICE
+    explicit operator int() const {
+    #if defined(__CUDA_ARCH__)
+        return __half2int_rn(to_half(*this));
+    #else
+        return int(to_float(*this));
+    #endif
+    }
+
+    /// Casts to bool: true when the integer conversion of the value is non-zero
+    CUTLASS_HOST_DEVICE
+    explicit operator bool() const {
+    #if defined(__CUDA_ARCH__)
+        return bool(__half2int_rn(to_half(*this)));
+    #else
+        return bool(int(to_float(*this)));
+    #endif
+    }
+
+    /// Accesses raw internal state
+    CUTLASS_HOST_DEVICE
+    uint8_t& raw() {
+        return storage;
+    }
+
+    /// Accesses raw internal state
+    CUTLASS_HOST_DEVICE
+    uint8_t raw() const {
+        return storage;
+    }
+
+    /// Returns the sign bit
+    CUTLASS_HOST_DEVICE
+    bool signbit() const {
+        return ((storage & (1 << (Base::FP8_NUM_BITS - 1))) != 0);
+    }
+
+    /// Returns the biased exponent
+    CUTLASS_HOST_DEVICE
+    int exponent_biased() const {
+        return int((storage >> Base::FP8_NUM_MANTISSA_BITS) & Base::FP8_EXPONENT_MASK);
+    }
+
+    /// Returns the unbiased exponent (biased exponent minus the E4M3 bias, which is 7, not 15)
+    CUTLASS_HOST_DEVICE
+    int exponent() const {
+        return exponent_biased() - Base::FP8_EXPONENT_BIAS;
+    }
+
+    /// Returns the mantissa
+    CUTLASS_HOST_DEVICE
+    int mantissa() const {
+        return int(storage & Base::FP8_MANTISSA_MASK);
+    }
+
+    /// Returns true for both NaN encodings (0x7f and 0xff), matching convert_fp8_to_float
+    CUTLASS_HOST_DEVICE
+    friend bool isnan(float_e4m3_t const& x) {
+      return (x.storage & uint8_t(0x7f)) == uint8_t(0x7f);
+    }
+};
+///////////////////////////////////////////////////////////////
+///
+/// floating-point 8 type : E5M2
+///
+///////////////////////////////////////////////////////////////
+struct alignas(1) float_e5m2_t : float8_base<FloatEncoding::E5M2> {
+
+    using Base = float8_base<FloatEncoding::E5M2>;
+
+    static constexpr int MAX_EXPONENT = Base::FP8_MAX_EXPONENT;
+
+    //
+    // Static conversion operators
+    //
+
+    /// Constructs from an uint8_t holding the raw E5M2 bit pattern (no numeric conversion)
+    CUTLASS_HOST_DEVICE
+    static float_e5m2_t bitcast(uint8_t x) {
+        float_e5m2_t f;
+        f.storage = x;
+        return f;
+    }
+
+    /// FP32 -> E5M2 conversion - rounds to nearest even
+    CUTLASS_HOST_DEVICE
+    static float_e5m2_t from_float(float const& flt) {
+    #if defined(CUDA_PTX_FP8_CVT_ENABLED)
+        uint16_t tmp;
+        float y = float();
+        asm volatile("cvt.rn.satfinite.e5m2x2.f32 %0, %1, %2;" : "=h"(tmp) : "f"(y), "f"(flt));
+
+        return *reinterpret_cast<float_e5m2_t *>(&tmp);
+    #else
+        return bitcast(Base::convert_float_to_fp8(flt));
+    #endif
+    }
+
+    /// FP16 -> E5M2 conversion - rounds to nearest even
+    CUTLASS_HOST_DEVICE
+    static float_e5m2_t from_half(half const& flt) {
+    #if defined(CUDA_PTX_FP8_CVT_ENABLED)
+        uint16_t tmp = 0;
+        uint32_t bits = reinterpret_cast<uint16_t const &>(flt);
+        asm volatile("cvt.rn.satfinite.e5m2x2.f16x2 %0, %1;" : "=h"(tmp) : "r"(bits));
+
+        return *reinterpret_cast<float_e5m2_t *>(&tmp);
+    #else
+        return bitcast(Base::convert_float_to_fp8(__half2float(flt)));
+    #endif
+    }
+
+    // E5M2 -> half
+    CUTLASS_HOST_DEVICE
+    static half to_half(float_e5m2_t const& x) {
+    #if defined(CUDA_PTX_FP8_CVT_ENABLED)
+        uint16_t bits = x.storage;
+        uint32_t packed;
+        asm volatile("cvt.rn.f16x2.e5m2x2 %0, %1;\n" : "=r"(packed) : "h"(bits));
+
+        return reinterpret_cast<half2 const &>(packed).x;
+    #else
+        return __float2half(Base::convert_fp8_to_float(x.storage));
+    #endif
+    }
+
+    // E5M2 -> Float (the PTX path converts to half first and then widens to float)
+    CUTLASS_HOST_DEVICE
+    static float to_float(float_e5m2_t const& x) {
+    #if defined(CUDA_PTX_FP8_CVT_ENABLED)
+        uint16_t bits = x.storage;
+        uint32_t packed;
+        asm volatile("cvt.rn.f16x2.e5m2x2 %0, %1;\n" : "=r"(packed) : "h"(bits));
+
+        return __half2float(reinterpret_cast<half2 const &>(packed).x);
+    #else
+        return Base::convert_fp8_to_float(x.storage);
+    #endif
+    }
+
+    //
+    // Methods
+    //
+
+    /// Constructor inheritance
+    using Base::Base;
+
+    /// Default constructor
+    float_e5m2_t() = default;
+
+#ifdef CUDA_FP8_ENABLED
+    /// Conversion from CUDA's FP8 type (copies the raw byte)
+    CUTLASS_HOST_DEVICE
+    explicit float_e5m2_t(__nv_fp8_e5m2 x) {
+        storage = x.__x;
+    }
+#endif
+
+    /// Floating point conversion
+    CUTLASS_HOST_DEVICE
+    explicit float_e5m2_t(float x) {
+        storage = from_float(x).storage;
+    }
+
+    CUTLASS_HOST_DEVICE
+    explicit float_e5m2_t(half x) {
+        storage = from_half(x).storage;
+    }
+
+    /// Floating point conversion (narrows to float first)
+    CUTLASS_HOST_DEVICE
+    explicit float_e5m2_t(double x): float_e5m2_t(float(x)) {
+    }
+
+    /// Integer conversion (goes through float)
+    CUTLASS_HOST_DEVICE
+    explicit float_e5m2_t(int x): float_e5m2_t(float(x)) {
+    }
+
+    CUTLASS_HOST_DEVICE
+    explicit float_e5m2_t(unsigned x): float_e5m2_t(float(x)) {
+    }
+
+    /// E4M3 conversion. Defined out of line after both FP8 classes.
+    CUTLASS_HOST_DEVICE
+    explicit float_e5m2_t(float_e4m3_t x);
+
+#ifdef CUDA_FP8_ENABLED
+    /// Assignment from CUDA's FP8 type (copies the raw byte)
+    CUTLASS_HOST_DEVICE
+    float_e5m2_t & operator=(__nv_fp8_e5m2 x) {
+        storage = x.__x;
+        return *this;
+    }
+#endif
+
+    /// Converts to float
+    CUTLASS_HOST_DEVICE
+    operator float() const {
+        return to_float(*this);
+    }
+
+    /// Converts to half
+    CUTLASS_HOST_DEVICE
+    operator half() const {
+        return to_half(*this);
+    }
+
+    /// Converts to double (by widening the float value)
+    CUTLASS_HOST_DEVICE
+    explicit operator double() const {
+        return double(to_float(*this));
+    }
+
+    /// Converts to int; note the device path uses __half2int_rn while the host path uses int(float)
+    CUTLASS_HOST_DEVICE
+    explicit operator int() const {
+    #if defined(__CUDA_ARCH__)
+        return __half2int_rn(to_half(*this));
+    #else
+        return int(to_float(*this));
+    #endif
+    }
+
+    /// Casts to bool: true when the integer conversion of the value is non-zero
+    CUTLASS_HOST_DEVICE
+    explicit operator bool() const {
+    #if defined(__CUDA_ARCH__)
+        return bool(__half2int_rn(to_half(*this)));
+    #else
+        return bool(int(to_float(*this)));
+    #endif
+    }
+
+    /// Accesses raw internal state
+    CUTLASS_HOST_DEVICE
+    uint8_t& raw() {
+        return storage;
+    }
+
+    /// Accesses raw internal state
+    CUTLASS_HOST_DEVICE
+    uint8_t raw() const {
+        return storage;
+    }
+
+    /// Returns the sign bit
+    CUTLASS_HOST_DEVICE
+    bool signbit() const {
+        return ((storage & (1 << (Base::FP8_NUM_BITS - 1))) != 0);
+    }
+
+    /// Returns the biased exponent
+    CUTLASS_HOST_DEVICE
+    int exponent_biased() const {
+        return int((storage >> Base::FP8_NUM_MANTISSA_BITS) & Base::FP8_EXPONENT_MASK);
+    }
+
+    /// Returns the unbiased exponent (biased exponent minus the E5M2 bias of 15)
+    CUTLASS_HOST_DEVICE
+    int exponent() const {
+        return exponent_biased() - Base::FP8_EXPONENT_BIAS;
+    }
+
+    /// Returns the mantissa
+    CUTLASS_HOST_DEVICE
+    int mantissa() const {
+        return int(storage & Base::FP8_MANTISSA_MASK);
+    }
+
+    /// Returns true for every NaN encoding (exponent all ones, mantissa non-zero, either sign)
+    CUTLASS_HOST_DEVICE
+    friend bool isnan(float_e5m2_t const& x) {
+      return (x.storage & uint8_t(0x7f)) > Base::FP8_INFINITY_MASK;
+    }
+};
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Arithmetic operators
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Equality of the two operands, evaluated on their FP32 values.
+CUTLASS_HOST_DEVICE bool operator==(float_e4m3_t const& lhs, float_e4m3_t const& rhs) {
+    return static_cast<float>(lhs) == static_cast<float>(rhs);
+}
+
+/// Inequality of the two operands, evaluated on their FP32 values.
+CUTLASS_HOST_DEVICE bool operator!=(float_e4m3_t const& lhs, float_e4m3_t const& rhs) {
+    return static_cast<float>(lhs) != static_cast<float>(rhs);
+}
+
+/// Less-than, evaluated on the FP32 values (written with the operands mirrored).
+CUTLASS_HOST_DEVICE bool operator<(float_e4m3_t const& lhs, float_e4m3_t const& rhs) {
+    return static_cast<float>(rhs) > static_cast<float>(lhs);
+}
+
+/// Less-than-or-equal, evaluated on the FP32 values (written with the operands mirrored).
+CUTLASS_HOST_DEVICE bool operator<=(float_e4m3_t const& lhs, float_e4m3_t const& rhs) {
+    return static_cast<float>(rhs) >= static_cast<float>(lhs);
+}
+
+/// Greater-than, evaluated on the FP32 values (written with the operands mirrored).
+CUTLASS_HOST_DEVICE bool operator>(float_e4m3_t const& lhs, float_e4m3_t const& rhs) {
+    return static_cast<float>(rhs) < static_cast<float>(lhs);
+}
+
+/// Greater-than-or-equal, evaluated on the FP32 values (written with the operands mirrored).
+CUTLASS_HOST_DEVICE bool operator>=(float_e4m3_t const& lhs, float_e4m3_t const& rhs) {
+    return static_cast<float>(rhs) <= static_cast<float>(lhs);
+}
+
+/// Sum, computed in FP32 and converted back to E4M3.
+CUTLASS_HOST_DEVICE float_e4m3_t operator+(float_e4m3_t const& lhs, float_e4m3_t const& rhs) {
+    return float_e4m3_t(static_cast<float>(lhs) + static_cast<float>(rhs));
+}
+
+/// Negation, computed in FP32 and converted back to E4M3.
+CUTLASS_HOST_DEVICE float_e4m3_t operator-(float_e4m3_t const& lhs) {
+    return float_e4m3_t(-static_cast<float>(lhs));
+}
+
+/// Difference, computed in FP32 and converted back to E4M3.
+CUTLASS_HOST_DEVICE float_e4m3_t operator-(float_e4m3_t const& lhs, float_e4m3_t const& rhs) {
+    return float_e4m3_t(static_cast<float>(lhs) - static_cast<float>(rhs));
+}
+
+/// Product, computed in FP32 and converted back to E4M3.
+CUTLASS_HOST_DEVICE float_e4m3_t operator*(float_e4m3_t const& lhs, float_e4m3_t const& rhs) {
+    return float_e4m3_t(static_cast<float>(lhs) * static_cast<float>(rhs));
+}
+
+/// Quotient, computed in FP32 and converted back to E4M3.
+CUTLASS_HOST_DEVICE float_e4m3_t operator/(float_e4m3_t const& lhs, float_e4m3_t const& rhs) {
+    return float_e4m3_t(static_cast<float>(lhs) / static_cast<float>(rhs));
+}
+
+/// Adds rhs to lhs in place (FP32 arithmetic, converted back to E4M3) and returns lhs.
+CUTLASS_HOST_DEVICE float_e4m3_t& operator+=(float_e4m3_t & lhs, float_e4m3_t const& rhs) {
+    float const sum = static_cast<float>(lhs) + static_cast<float>(rhs);
+    return lhs = float_e4m3_t(sum);
+}
+
+/// Subtracts rhs from lhs in place (FP32 arithmetic, converted back to E4M3) and returns lhs.
+CUTLASS_HOST_DEVICE float_e4m3_t& operator-=(float_e4m3_t & lhs, float_e4m3_t const& rhs) {
+    float const difference = static_cast<float>(lhs) - static_cast<float>(rhs);
+    return lhs = float_e4m3_t(difference);
+}
+
+/// Multiplies lhs by rhs in place (FP32 arithmetic, converted back to E4M3) and returns lhs.
+CUTLASS_HOST_DEVICE float_e4m3_t& operator*=(float_e4m3_t & lhs, float_e4m3_t const& rhs) {
+    float const product = static_cast<float>(lhs) * static_cast<float>(rhs);
+    return lhs = float_e4m3_t(product);
+}
+
+/// Divides lhs by rhs in place (FP32 arithmetic, converted back to E4M3) and returns lhs.
+CUTLASS_HOST_DEVICE float_e4m3_t& operator/=(float_e4m3_t & lhs, float_e4m3_t const& rhs) {
+    float const quotient = static_cast<float>(lhs) / static_cast<float>(rhs);
+    return lhs = float_e4m3_t(quotient);
+}
+
+/// Pre-increment: adds one in FP32, stores the E4M3 result in lhs and returns lhs.
+CUTLASS_HOST_DEVICE
+float_e4m3_t& operator++(float_e4m3_t & lhs) {
+    float const bumped = static_cast<float>(lhs) + 1.0f;
+    lhs = float_e4m3_t(bumped);
+    return lhs;
+}
+
+/// Pre-decrement: subtracts one in FP32, stores the E4M3 result in lhs and returns lhs.
+CUTLASS_HOST_DEVICE
+float_e4m3_t& operator--(float_e4m3_t & lhs) {
+    float const lowered = static_cast<float>(lhs) - 1.0f;
+    lhs = float_e4m3_t(lowered);
+    return lhs;
+}
+
+/// Post-increment: updates lhs like pre-increment but returns the value it held before.
+CUTLASS_HOST_DEVICE
+float_e4m3_t operator++(float_e4m3_t & lhs, int) {
+    float_e4m3_t const previous(lhs);
+    float const bumped = static_cast<float>(lhs) + 1.0f;
+    lhs = float_e4m3_t(bumped);
+    return previous;
+}
+
+/// Post-decrement: updates lhs like pre-decrement but returns the value it held before.
+CUTLASS_HOST_DEVICE
+float_e4m3_t operator--(float_e4m3_t & lhs, int) {
+    float_e4m3_t const previous(lhs);
+    float const lowered = static_cast<float>(lhs) - 1.0f;
+    lhs = float_e4m3_t(lowered);
+    return previous;
+}
+
+/// Equality of the two operands, evaluated on their FP32 values.
+CUTLASS_HOST_DEVICE bool operator==(float_e5m2_t const& lhs, float_e5m2_t const& rhs) {
+    return static_cast<float>(lhs) == static_cast<float>(rhs);
+}
+
+/// Inequality of the two operands, evaluated on their FP32 values.
+CUTLASS_HOST_DEVICE bool operator!=(float_e5m2_t const& lhs, float_e5m2_t const& rhs) {
+    return static_cast<float>(lhs) != static_cast<float>(rhs);
+}
+
+/// Less-than, evaluated on the FP32 values (written with the operands mirrored).
+CUTLASS_HOST_DEVICE bool operator<(float_e5m2_t const& lhs, float_e5m2_t const& rhs) {
+    return static_cast<float>(rhs) > static_cast<float>(lhs);
+}
+
+/// Less-than-or-equal, evaluated on the FP32 values (written with the operands mirrored).
+CUTLASS_HOST_DEVICE bool operator<=(float_e5m2_t const& lhs, float_e5m2_t const& rhs) {
+    return static_cast<float>(rhs) >= static_cast<float>(lhs);
+}
+
+/// Greater-than, evaluated on the FP32 values (written with the operands mirrored).
+CUTLASS_HOST_DEVICE bool operator>(float_e5m2_t const& lhs, float_e5m2_t const& rhs) {
+    return static_cast<float>(rhs) < static_cast<float>(lhs);
+}
+
+/// Greater-than-or-equal, evaluated on the FP32 values (written with the operands mirrored).
+CUTLASS_HOST_DEVICE bool operator>=(float_e5m2_t const& lhs, float_e5m2_t const& rhs) {
+    return static_cast<float>(rhs) <= static_cast<float>(lhs);
+}
+
+/// Sum, computed in FP32 and converted back to E5M2.
+CUTLASS_HOST_DEVICE float_e5m2_t operator+(float_e5m2_t const& lhs, float_e5m2_t const& rhs) {
+    return float_e5m2_t(static_cast<float>(lhs) + static_cast<float>(rhs));
+}
+
+/// Negation, computed in FP32 and converted back to E5M2.
+CUTLASS_HOST_DEVICE float_e5m2_t operator-(float_e5m2_t const& lhs) {
+    return float_e5m2_t(-static_cast<float>(lhs));
+}
+
+/// Difference, computed in FP32 and converted back to E5M2.
+CUTLASS_HOST_DEVICE float_e5m2_t operator-(float_e5m2_t const& lhs, float_e5m2_t const& rhs) {
+    return float_e5m2_t(static_cast<float>(lhs) - static_cast<float>(rhs));
+}
+
+/// Product, computed in FP32 and converted back to E5M2.
+CUTLASS_HOST_DEVICE float_e5m2_t operator*(float_e5m2_t const& lhs, float_e5m2_t const& rhs) {
+    return float_e5m2_t(static_cast<float>(lhs) * static_cast<float>(rhs));
+}
+
+/// Quotient, computed in FP32 and converted back to E5M2.
+CUTLASS_HOST_DEVICE float_e5m2_t operator/(float_e5m2_t const& lhs, float_e5m2_t const& rhs) {
+    return float_e5m2_t(static_cast<float>(lhs) / static_cast<float>(rhs));
+}
+
+/// Adds rhs to lhs in place (FP32 arithmetic, converted back to E5M2) and returns lhs.
+CUTLASS_HOST_DEVICE float_e5m2_t& operator+=(float_e5m2_t & lhs, float_e5m2_t const& rhs) {
+    float const sum = static_cast<float>(lhs) + static_cast<float>(rhs);
+    return lhs = float_e5m2_t(sum);
+}
+
+/// Subtracts rhs from lhs in place (FP32 arithmetic, converted back to E5M2) and returns lhs.
+CUTLASS_HOST_DEVICE float_e5m2_t& operator-=(float_e5m2_t & lhs, float_e5m2_t const& rhs) {
+    float const difference = static_cast<float>(lhs) - static_cast<float>(rhs);
+    return lhs = float_e5m2_t(difference);
+}
+
+/// Multiplies lhs by rhs in place (FP32 arithmetic, converted back to E5M2) and returns lhs.
+CUTLASS_HOST_DEVICE float_e5m2_t& operator*=(float_e5m2_t & lhs, float_e5m2_t const& rhs) {
+    float const product = static_cast<float>(lhs) * static_cast<float>(rhs);
+    return lhs = float_e5m2_t(product);
+}
+
+/// Divides lhs by rhs in place (FP32 arithmetic, converted back to E5M2) and returns lhs.
+CUTLASS_HOST_DEVICE float_e5m2_t& operator/=(float_e5m2_t & lhs, float_e5m2_t const& rhs) {
+    float const quotient = static_cast<float>(lhs) / static_cast<float>(rhs);
+    return lhs = float_e5m2_t(quotient);
+}
+
+/// Pre-increment: adds one in FP32, stores the E5M2 result in lhs and returns lhs.
+CUTLASS_HOST_DEVICE
+float_e5m2_t& operator++(float_e5m2_t & lhs) {
+    float const bumped = static_cast<float>(lhs) + 1.0f;
+    lhs = float_e5m2_t(bumped);
+    return lhs;
+}
+
+/// Pre-decrement: subtracts one in FP32, stores the E5M2 result in lhs and returns lhs.
+CUTLASS_HOST_DEVICE
+float_e5m2_t& operator--(float_e5m2_t & lhs) {
+    float const lowered = static_cast<float>(lhs) - 1.0f;
+    lhs = float_e5m2_t(lowered);
+    return lhs;
+}
+
+/// Post-increment: updates lhs like pre-increment but returns the value it held before.
+CUTLASS_HOST_DEVICE
+float_e5m2_t operator++(float_e5m2_t & lhs, int) {
+    float_e5m2_t const previous(lhs);
+    float const bumped = static_cast<float>(lhs) + 1.0f;
+    lhs = float_e5m2_t(bumped);
+    return previous;
+}
+
+/// Post-decrement: updates lhs like pre-decrement but returns the value it held before.
+CUTLASS_HOST_DEVICE
+float_e5m2_t operator--(float_e5m2_t & lhs, int) {
+    float_e5m2_t const previous(lhs);
+    float const lowered = static_cast<float>(lhs) - 1.0f;
+    lhs = float_e5m2_t(lowered);
+    return previous;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// float_e4m3_t <=> float_e5m2_t conversions
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// E5M2 -> E4M3 converting constructor: widens the source to FP32, then converts that to E4M3.
+CUTLASS_HOST_DEVICE float_e4m3_t::float_e4m3_t(float_e5m2_t x) {
+    float const widened = float_e5m2_t::to_float(x);
+    storage = from_float(widened).storage;
+}
+
+/// E4M3 -> E5M2 converting constructor: widens the source to FP32, then converts that to E5M2.
+CUTLASS_HOST_DEVICE float_e5m2_t::float_e5m2_t(float_e4m3_t x) {
+    float const widened = float_e4m3_t::to_float(x);
+    storage = from_float(widened).storage;
+}
+
+///////////////////////////////////////////////////////////////
+///
+/// Umbrella floating-point 8-bit data type : type_erased_dynamic_float8_t
+/// This umbrella datatype can be enabled when a user provides a specific
+/// datatype in runtime argument list.
+///
+/// Currently supported runtime datatypes compatible with type_erased_dynamic_float8_t:
+///   QMMAFormat::E5M2
+///   QMMAFormat::E4M3
+///
+///////////////////////////////////////////////////////////////
+
+union type_erased_dynamic_float8_t {
+  // The same storage viewed as a raw byte, as E5M2, or as E4M3.
+  uint8_t data;
+  cutlass::float_e5m2_t e5m2;
+  cutlass::float_e4m3_t e4m3;
+
+  /// Returns the payload as seen through the E5M2 member.
+  CUTLASS_HOST_DEVICE
+  explicit operator cutlass::float_e5m2_t() const { return e5m2; }
+
+  /// Returns the payload as seen through the E4M3 member.
+  CUTLASS_HOST_DEVICE
+  explicit operator cutlass::float_e4m3_t() const { return e4m3; }
+
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Standard Library operations and definitions
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if !defined(__CUDACC_RTC__)
+namespace std {
+
+/// Numeric limits common to all float8 types; T supplies the FP8_* constants and bitcast()
+template <typename T>
+struct float8_base_numeric_limits {  // NOTE(review): a new template inside namespace std - confirm this is intended
+private:
+  using F8Type = T;
+public:
+  static bool const is_specialized = true;
+  static bool const is_signed = true;
+  static bool const is_integer = false;
+  static bool const is_exact = false;
+  static bool const has_quiet_NaN = true;
+  static bool const has_signaling_NaN = false;
+  static std::float_denorm_style const has_denorm = std::denorm_present;
+  static bool const has_denorm_loss = true;
+  static std::float_round_style const round_style = std::round_to_nearest;
+  static bool const is_iec559 = false;
+  static bool const is_bounded = true;
+  static bool const is_modulo = false;
+  static int const digits = F8Type::FP8_NUM_MANTISSA_BITS;
+
+  /// Least positive value (bit pattern 0x01, the same encoding denorm_min() returns)
+  CUTLASS_HOST_DEVICE
+  static F8Type min() { return F8Type::bitcast(0x01); }
+
+  /// Maximum finite value (bit pattern F8Type::FP8_MAX_FLT)
+  CUTLASS_HOST_DEVICE
+  static F8Type max() { return F8Type::bitcast(F8Type::FP8_MAX_FLT); }
+
+  /// Returns maximum rounding error (0.5 converted to F8Type)
+  CUTLASS_HOST_DEVICE
+  static F8Type round_error() { return F8Type(0.5f); }
+
+  /// Returns the FP8_INFINITY_MASK bit pattern; derived classes declare has_infinity
+  CUTLASS_HOST_DEVICE
+  static F8Type infinity() { return F8Type::bitcast(F8Type::FP8_INFINITY_MASK); }
+
+  /// Returns quiet NaN value (bit pattern F8Type::FP8_NAN)
+  CUTLASS_HOST_DEVICE
+  static F8Type quiet_NaN() { return F8Type::bitcast(F8Type::FP8_NAN); }
+
+  /// Returns the same FP8_NAN encoding as quiet_NaN() (has_signaling_NaN is false)
+  CUTLASS_HOST_DEVICE
+  static F8Type signaling_NaN() { return F8Type::bitcast(F8Type::FP8_NAN); }
+
+  /// Returns smallest positive subnormal value (bit pattern 0x01)
+  CUTLASS_HOST_DEVICE
+  static F8Type denorm_min() { return F8Type::bitcast(0x01); }
+};
+
+/// Numeric limits for float_e4m3_t
+template <>
+struct numeric_limits<cutlass::float_e4m3_t> :
+    public float8_base_numeric_limits<cutlass::float_e4m3_t> {
+  static bool const has_infinity = false;
+
+  /// Minimum finite value (bit pattern 0xfe); host/device like the inherited accessors
+  CUTLASS_HOST_DEVICE static cutlass::float_e4m3_t lowest() { return cutlass::float_e4m3_t::bitcast(0xfe); }
+
+  /// Machine epsilon, that is, the difference between 1.0 and the next representable value
+  CUTLASS_HOST_DEVICE static cutlass::float_e4m3_t epsilon() { return cutlass::float_e4m3_t::bitcast(0x20); }
+};
+
+/// Numeric limits for float_e5m2_t
+template <>
+struct numeric_limits<cutlass::float_e5m2_t>  :
+    public float8_base_numeric_limits<cutlass::float_e5m2_t> {
+  static bool const has_infinity = true;
+
+  /// Minimum finite value (bit pattern 0xfb); host/device like the inherited accessors
+  CUTLASS_HOST_DEVICE static cutlass::float_e5m2_t lowest() { return cutlass::float_e5m2_t::bitcast(0xfb); }
+
+  /// Machine epsilon, that is, the difference between 1.0 and the next representable value
+  CUTLASS_HOST_DEVICE static cutlass::float_e5m2_t epsilon() { return cutlass::float_e5m2_t::bitcast(0x34); }
+};
+
+}  // namespace std
+#endif
+
+namespace cutlass {
+namespace platform {
+
+/// Numeric limits common to all float8 types; T supplies the FP8_* constants and bitcast()
+template <typename T>
+struct float8_base_numeric_limits {
+private:
+  using F8Type = T;
+public:
+  static bool const is_specialized = true;
+  static bool const is_signed = true;
+  static bool const is_integer = false;
+  static bool const is_exact = false;
+  static bool const has_quiet_NaN = true;
+  static bool const has_signaling_NaN = false;
+#if !defined(__CUDACC_RTC__)  // member omitted under NVRTC
+  static std::float_denorm_style const has_denorm = std::denorm_present;
+#endif
+  static bool const has_denorm_loss = true;
+#if !defined(__CUDACC_RTC__)  // member omitted under NVRTC
+  static std::float_round_style const round_style = std::round_to_nearest;
+#endif
+  static bool const is_iec559 = false;
+  static bool const is_bounded = true;
+  static bool const is_modulo = false;
+  static int const digits = F8Type::FP8_NUM_MANTISSA_BITS;
+
+  /// Least positive value (bit pattern 0x01, the same encoding denorm_min() returns)
+  CUTLASS_HOST_DEVICE
+  static F8Type min() { return F8Type::bitcast(0x01); }
+
+  /// Maximum finite value (bit pattern F8Type::FP8_MAX_FLT)
+  CUTLASS_HOST_DEVICE
+  static F8Type max() { return F8Type::bitcast(F8Type::FP8_MAX_FLT); }
+
+  /// Returns maximum rounding error (0.5 converted to F8Type)
+  CUTLASS_HOST_DEVICE
+  static F8Type round_error() { return F8Type(0.5f); }
+
+  /// Returns the FP8_INFINITY_MASK bit pattern; derived classes declare has_infinity
+  CUTLASS_HOST_DEVICE
+  static F8Type infinity() { return F8Type::bitcast(F8Type::FP8_INFINITY_MASK); }
+
+  /// Returns quiet NaN value (bit pattern F8Type::FP8_NAN)
+  CUTLASS_HOST_DEVICE
+  static F8Type quiet_NaN() { return F8Type::bitcast(F8Type::FP8_NAN); }
+
+  /// Returns the same FP8_NAN encoding as quiet_NaN() (has_signaling_NaN is false)
+  CUTLASS_HOST_DEVICE
+  static F8Type signaling_NaN() { return F8Type::bitcast(F8Type::FP8_NAN); }
+
+  /// Returns smallest positive subnormal value (bit pattern 0x01)
+  CUTLASS_HOST_DEVICE
+  static F8Type denorm_min() { return F8Type::bitcast(0x01); }
+};
+
+/// Forward declaration of the primary template (specialized below for the float8 types)
+template <class T>
+struct numeric_limits;
+
+/// Numeric limits for float_e4m3_t
+template <>
+struct numeric_limits<cutlass::float_e4m3_t> :
+    public float8_base_numeric_limits<cutlass::float_e4m3_t> {
+  static bool const has_infinity = false;
+
+  /// Minimum finite value (bit pattern 0xfe); host/device like the inherited accessors
+  CUTLASS_HOST_DEVICE static cutlass::float_e4m3_t lowest() { return cutlass::float_e4m3_t::bitcast(0xfe); }
+
+  /// Machine epsilon, that is, the difference between 1.0 and the next representable value
+  CUTLASS_HOST_DEVICE static cutlass::float_e4m3_t epsilon() { return cutlass::float_e4m3_t::bitcast(0x20); }
+};
+
+/// Numeric limits for float_e5m2_t
+template <>
+struct numeric_limits<cutlass::float_e5m2_t>  :
+    public float8_base_numeric_limits<cutlass::float_e5m2_t> {
+  static bool const has_infinity = true;
+
+  /// Minimum finite value (bit pattern 0xfb); host/device like the inherited accessors
+  CUTLASS_HOST_DEVICE static cutlass::float_e5m2_t lowest() { return cutlass::float_e5m2_t::bitcast(0xfb); }
+
+  /// Machine epsilon, that is, the difference between 1.0 and the next representable value
+  CUTLASS_HOST_DEVICE static cutlass::float_e5m2_t epsilon() { return cutlass::float_e5m2_t::bitcast(0x34); }
+};
+
+}  // namespace platform
+
+}  // namespace cutlass
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+//
+// User-defined literals
+//
+
+CUTLASS_HOST_DEVICE
+cutlass::float_e4m3_t operator""_fe4m3(long double x) {  // no space after "": the spaced form is deprecated
+  return cutlass::float_e4m3_t(float(x));  // floating literal: narrowed to float, then encoded as E4M3
+}
+
+CUTLASS_HOST_DEVICE
+cutlass::float_e4m3_t operator""_fe4m3(unsigned long long int x) {  // no space after "": the spaced form is deprecated
+  return cutlass::float_e4m3_t(int(x));  // integer literal: narrowed to int, then encoded as E4M3
+}
+
+CUTLASS_HOST_DEVICE
+cutlass::float_e5m2_t operator""_fe5m2(long double x) {  // no space after "": the spaced form is deprecated
+  return cutlass::float_e5m2_t(float(x));  // floating literal: narrowed to float, then encoded as E5M2
+}
+
+CUTLASS_HOST_DEVICE
+cutlass::float_e5m2_t operator""_fe5m2(unsigned long long int x) {  // no space after "": the spaced form is deprecated
+  return cutlass::float_e5m2_t(int(x));  // integer literal: narrowed to int, then encoded as E5M2
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/floating_point_nvrtc.h b/lightllm-kernel/cutlass/include/cutlass/floating_point_nvrtc.h
new file mode 100755
index 000000000..fdbd80fcd
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/floating_point_nvrtc.h
@@ -0,0 +1,98 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*!
+    \file
+    \brief Defines categories for floating point numbers for use in NVRTC-compiled code
+*/
+
+#pragma once
+
+namespace cutlass {
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Floating-point categories; each enumerator is paired with a same-named macro holding its value.
+enum  {
+    FP_NAN =
+# define FP_NAN 0
+      FP_NAN,
+    FP_INFINITE =
+# define FP_INFINITE 1
+      FP_INFINITE,
+    FP_ZERO =
+# define FP_ZERO 2
+      FP_ZERO,
+    FP_SUBNORMAL =
+# define FP_SUBNORMAL 3
+      FP_SUBNORMAL,
+    FP_NORMAL =
+# define FP_NORMAL 4
+      FP_NORMAL
+};
+
+CUTLASS_HOST_DEVICE
+int fpclassify(float const& f) {
+
+  // Fetch the raw 32-bit pattern of f.
+  uint32_t bits;
+  #if defined(__CUDA_ARCH__)
+  bits = reinterpret_cast<uint32_t const &>(f);
+  #else
+  std::memcpy(&bits, &f, sizeof(bits));
+  #endif
+
+  // Exponent and fraction field masks; the sign bit plays no role in the classification.
+  uint32_t const kExponentMask = 0x7f800000;
+  uint32_t const kFractionMask = 0x007fffff;
+
+  uint32_t const exponent_field = bits & kExponentMask;
+  bool const fraction_is_zero = (bits & kFractionMask) == 0;
+
+  // All-ones exponent: infinity when the fraction is zero, NaN otherwise.
+  if (exponent_field == kExponentMask) {
+    return fraction_is_zero ? FP_INFINITE : FP_NAN;
+  }
+
+  // All-zeros exponent: zero when the fraction is zero, subnormal otherwise.
+  if (exponent_field == 0) {
+    return fraction_is_zero ? FP_ZERO : FP_SUBNORMAL;
+  }
+
+  // Every remaining encoding is a normal number.
+  return FP_NORMAL;
+
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/functional.h b/lightllm-kernel/cutlass/include/cutlass/functional.h
new file mode 100755
index 000000000..5b2bc3c67
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/functional.h
@@ -0,0 +1,930 @@
+  /***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Define basic numeric operators
+
+    This is inspired by the Standard Library's <functional> header.
+*/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/platform/platform.h"
+#if defined(__CUDACC_RTC__)
+#include "cutlass/floating_point_nvrtc.h"
+#endif
+
+#include <cuda_runtime.h>
+
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+#include <mma.h>
+#endif // defined(CUTLASS_ARCH_WMMA_ENABLED)
+
+#ifdef _MSC_VER
+// Provides support for alternate operators such as 'and', 'or', ...
+#include <iso646.h>
+#endif // _MSC_VER
+
+namespace cutlass {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct absolute_value_op {
+  CUTLASS_HOST_DEVICE
+  T operator()(T lhs) const {
+    return abs(lhs);  // unqualified call: the abs overload is chosen by ordinary lookup plus ADL on T
+  }
+};
+
+template <>
+struct absolute_value_op<float> {
+  CUTLASS_HOST_DEVICE
+  float operator()(float lhs) const { return fabs(lhs); }  // float specialization calls fabs instead of abs
+};
+
+template <typename T>
+struct plus {
+  CUTLASS_HOST_DEVICE
+  T operator()(T lhs, T const &rhs) const {
+    lhs += rhs;  // lhs is a by-value copy, so the caller's operand is left untouched
+    return lhs;
+  }
+};
+
+template <typename T>
+struct minus {
+  CUTLASS_HOST_DEVICE
+  T operator()(T lhs, T const &rhs) const {
+    lhs -= rhs;  // lhs is a by-value copy, so the caller's operand is left untouched
+    return lhs;
+  }
+};
+
+template <typename T>
+struct multiplies {
+  CUTLASS_HOST_DEVICE
+  T operator()(T lhs, T const &rhs) const {
+    lhs *= rhs;  // lhs is a by-value copy, so the caller's operand is left untouched
+    return lhs;
+  }
+};
+
+template <typename T>
+struct scale {
+  T const scaling_factor_;  // factor applied by operator(); converted from the float argument once
+
+  CUTLASS_HOST_DEVICE
+  scale(float scaling_factor) : scaling_factor_(scaling_factor) {}
+
+  /// Returns rhs * scaling_factor_. Annotated host/device (as the constructor is) so kernels can call it.
+  CUTLASS_HOST_DEVICE
+  T operator()(T const &rhs) const {
+    return rhs * scaling_factor_;
+  }
+};
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+/// Explicit specializations needed when __CUDA_NO_HALF2_OPERATORS__ is set
+template<>
+struct plus<__half2> {
+  CUTLASS_HOST_DEVICE
+  __half2 operator()(__half2 lhs, __half2 const &rhs) const {
+    return __hadd2(lhs, rhs);
+  }
+};
+
+template<>
+struct minus<__half2> {
+  CUTLASS_HOST_DEVICE
+  __half2 operator()(__half2 lhs, __half2 const &rhs) const {
+    return __hsub2(lhs, rhs);
+  }
+};
+
+template<>
+struct multiplies<__half2> {
+  CUTLASS_HOST_DEVICE
+  __half2 operator()(__half2 lhs, __half2 const &rhs) const {
+    return __hmul2(lhs, rhs);
+  }
+};
+
+/// Explicit specializations needed when __CUDA_NO_HALF_OPERATORS__ is set
+template<>
+struct plus<__half> {
+  CUTLASS_HOST_DEVICE
+  __half operator()(__half lhs, __half const &rhs) const {
+    return __hadd(lhs, rhs);
+  }
+};
+
+template<>
+struct minus<__half> {
+  CUTLASS_HOST_DEVICE
+  __half operator()(__half lhs, __half const &rhs) const {
+    return __hsub(lhs, rhs);
+  }
+};
+
+template<>
+struct multiplies<__half> {
+  CUTLASS_HOST_DEVICE
+  __half operator()(__half lhs, __half const &rhs) const {
+    return __hmul(lhs, rhs);
+  }
+};
+#endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+
+
+/// Squares its argument, converting it to Output before multiplying
+template <typename T, typename Output = T>
+struct square {
+  CUTLASS_HOST_DEVICE
+  Output operator()(T lhs) const {
+    // Convert first so the product is formed in Output.
+    Output const converted = Output(lhs);
+
+    return multiplies<Output>{}(converted, converted);
+  }
+};
+
+/// Magnitude squared of an element: the element, converted to Output, times itself
+template <typename T, typename Output = T>
+struct magnitude_squared {
+  CUTLASS_HOST_DEVICE
+  Output operator()(T lhs) const {
+    // Convert first so the product is formed in Output.
+    Output const element = Output(lhs);
+
+    return multiplies<Output>{}(element, element);
+  }
+};
+
+/// Square of (lhs - rhs), with both operands converted to Output before subtracting
+template <typename T, typename Output = T>
+struct square_difference {
+  CUTLASS_HOST_DEVICE
+  Output operator()(T lhs, T rhs) const {
+    // Subtract after conversion so the difference is formed in Output, then square it.
+    Output const delta = Output(lhs) - Output(rhs);
+
+    return multiplies<Output>{}(delta, delta);
+  }
+};
+
+/// Magnitude squared of (lhs - rhs), with both operands converted to Output before subtracting
+template <typename T, typename Output = T>
+struct magnitude_squared_difference {
+  CUTLASS_HOST_DEVICE
+  Output operator()(T lhs, T rhs) const {
+    // Subtract after conversion so the difference is formed in Output, then square it.
+    Output const delta = Output(lhs) - Output(rhs);
+
+    return multiplies<Output>{}(delta, delta);
+  }
+};
+
+// Computes the reciprocal square root (primary template is declared only; see the specializations)
+template <typename T>
+struct inverse_square_root;
+
+template <>
+struct inverse_square_root<float> {
+  CUTLASS_HOST_DEVICE
+  float operator()(float const &lhs) const {
+#if defined(__CUDA_ARCH__)
+    return rsqrtf(lhs);  // device compilation pass
+#else
+    return 1.f / std::sqrt(lhs);  // host compilation pass
+#endif
+  }
+};
+
+template <>
+struct inverse_square_root<half_t> {
+  CUTLASS_HOST_DEVICE
+  half_t operator()(half_t const &lhs) const {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 520)
+    auto result = hrsqrt(reinterpret_cast<__half const &>(lhs));  // half_t object reinterpreted as __half
+    return reinterpret_cast<half_t const &>(result);
+#else
+    return half_t(1.f / std::sqrt(half_t::convert(lhs)));  // path for host and __CUDA_ARCH__ <= 520
+#endif
+  }
+};
+
+/// Divides: returns lhs / rhs, computed with T's operator/=
+template <typename T>
+struct divides {
+  CUTLASS_HOST_DEVICE
+  T operator()(T lhs, T const &rhs) const {
+    lhs /= rhs;  // lhs is a by-value copy, so the caller's operand is left untouched
+    return lhs;
+  }
+};
+
+/// reciprocal_approximate: the generic version is the ordinary quotient T(1) / lhs
+template <typename T>
+struct reciprocal_approximate {
+  CUTLASS_HOST_DEVICE
+  T operator()(T lhs) const {
+    return divides<T>{}(T(1), lhs);
+  }
+};
+
+template <>
+struct reciprocal_approximate <float> {
+  CUTLASS_HOST_DEVICE
+  float operator()(float lhs) const {
+    float ret;
+    #if defined(__CUDA_ARCH__)  // device pass: PTX rcp.approx.f32
+      asm volatile ("rcp.approx.f32 %0, %1;\n" : "=f"(ret) : "f"(lhs));
+    #else  // host pass: ordinary division
+      ret = 1.0f / lhs;
+    #endif
+    return ret;
+  }
+};
+
+/// reciprocal_approximate with ftz; the generic version simply inherits reciprocal_approximate<T>
+template<typename T>
+struct reciprocal_approximate_ftz :  reciprocal_approximate<T>
+{};
+
+template <>
+struct reciprocal_approximate_ftz <float> {
+  CUTLASS_HOST_DEVICE
+  float operator()(float lhs) const {
+    float ret;
+    #if defined(__CUDA_ARCH__)  // device pass: PTX rcp.approx.ftz.f32
+      asm volatile ("rcp.approx.ftz.f32 %0, %1;\n" : "=f"(ret) : "f"(lhs));
+    #else  // host pass: emulate flush-to-zero around an ordinary division
+      if (std::fpclassify(lhs) == FP_SUBNORMAL) {
+        lhs = std::copysign(0.0f, lhs);  // flush input to a zero of the same sign (negative -> -0 -> -inf)
+      }
+      ret = 1.0f / lhs;
+      if (std::fpclassify(ret) == FP_SUBNORMAL) {
+        ret = std::copysign(0.0f, ret);  // flush result to a zero of the same sign
+      }
+    #endif
+    return ret;
+  }
+};
+
+/// Negate: unary functor returning -lhs
+template <typename T>
+struct negate {
+  CUTLASS_HOST_DEVICE
+  T operator()(T lhs) const {
+    return -lhs;
+  }
+};
+
+/// Greater equal: binary predicate returning lhs >= rhs
+template <typename T>
+struct greater_equal {
+  CUTLASS_HOST_DEVICE
+  bool operator()(T const &lhs, T const &rhs) const {
+    return (lhs >= rhs);
+  }
+};
+
+/// Greater: binary predicate returning lhs > rhs
+template <typename T>
+struct greater {
+  CUTLASS_HOST_DEVICE
+  bool operator()(T const &lhs, T const &rhs) const {
+    return (lhs > rhs);
+  }
+};
+
+/// Less equal: binary predicate returning lhs <= rhs
+template <typename T>
+struct less_equal {
+  CUTLASS_HOST_DEVICE
+  bool operator()(T const &lhs, T const &rhs) const {
+    return (lhs <= rhs);
+  }
+};
+
+/// Less: binary predicate returning lhs < rhs
+template <typename T>
+struct less {
+  CUTLASS_HOST_DEVICE
+  bool operator()(T const &lhs, T const &rhs) const {
+    return (lhs < rhs);
+  }
+};
+
+template <typename T, bool PropagateNaN = false>
+struct maximum {
+  CUTLASS_HOST_DEVICE
+  T operator()(T const &lhs, T const &rhs) const {
+    if constexpr (PropagateNaN && cutlass::platform::is_floating_point<T>::value) {
+      using CUTLASS_CMATH_NAMESPACE :: isnan;
+
+      // Call isnan unqualified, so argument-dependent lookup (ADL)
+      // will find overloads such as cutlass::isnan(half_t).
+      // Calling ::isnan or std::isnan directly would force
+      // implicit conversions to float of custom number types
+      // in the cutlass namespace (e.g., cutlass::half_t).
+      return lhs > rhs || isnan(lhs) ? lhs : rhs;  // rhs is returned when lhs is neither greater nor NaN
+    }
+    else {
+      return (lhs < rhs ? rhs : lhs);  // lhs is returned unless lhs < rhs holds
+    }
+  }
+};
+
+// Single-parameter form of maximum<T> (PropagateNaN left at its default, false).
+// It is a subclass rather than an alias to work around a known Clang issue,
+// where a template template parameter with one template parameter
+// does not match classes that take multiple template parameters
+// but have defaults for all but the first.
+template<typename T>
+struct maximum_with_default_nan_propagation : public maximum<T>
+{};
+
+template <>
+struct maximum<float, false> {
+  CUTLASS_HOST_DEVICE
+  float operator()(float const &lhs, float const &rhs) const {
+    return fmaxf(lhs, rhs);  // float without NaN propagation is delegated to fmaxf
+  }
+};
+
+template <>
+struct maximum<float, true> {
+  CUTLASS_HOST_DEVICE
+  float operator()(float lhs, float rhs) const {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+    float nan_aware_max;
+    asm volatile("max.NaN.f32 %0, %1, %2;\n" : "=f"(nan_aware_max) : "f"(lhs), "f"(rhs));
+    return nan_aware_max;
+#else
+    // Plain float needs no ADL, so isnan is named with its namespace qualifier.
+    bool const keep_lhs = lhs > rhs || (CUTLASS_CMATH_NAMESPACE :: isnan(lhs));
+    return keep_lhs ? lhs : rhs;
+#endif
+  }
+};
+
+// Single-parameter form of maximum<T, true> (NaN propagation enabled).
+// It is a subclass rather than an alias to work around a known Clang issue,
+// where a template template parameter with one template parameter
+// does not match classes that take multiple template parameters
+// but have defaults for all but the first.
+template <typename T>
+struct maximum_with_nan_propagation : maximum<T, true>
+{};
+
+// This alias (note the misspelling "propogation") exists for backwards compatibility only.
+// Please use the correctly spelled class template above.
+template <typename T>
+using maximum_with_nan_propogation = maximum_with_nan_propagation<T>;
+
+template <typename T, bool PropagateNaN = false>
+struct minimum {
+  CUTLASS_HOST_DEVICE
+  T operator()(T const &lhs, T const &rhs) const {
+    if constexpr (PropagateNaN && cutlass::platform::is_floating_point<T>::value) {
+      using CUTLASS_CMATH_NAMESPACE :: isnan;
+      // isnan is called unqualified: the using-declaration above plus ADL select the overload.
+      return lhs < rhs || isnan(lhs) ? lhs : rhs;  // rhs is returned when lhs is neither smaller nor NaN
+    }
+    else {
+      return (rhs < lhs ? rhs : lhs);  // lhs is returned unless rhs < lhs holds
+    }
+  }
+};
+
+template <>
+struct minimum<float, false> {
+  CUTLASS_HOST_DEVICE
+  float operator()(float const &lhs, float const &rhs) const {
+    return fminf(lhs, rhs);  // float without NaN propagation is delegated to fminf
+  }
+};
+
+template <>
+struct minimum<float, true> {
+  CUTLASS_HOST_DEVICE
+  float operator()(float lhs, float rhs) const {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+    float nan_aware_min;
+    asm volatile("min.NaN.f32 %0, %1, %2;\n" : "=f"(nan_aware_min) : "f"(lhs), "f"(rhs));
+    return nan_aware_min;
+#else
+    // Plain float needs no ADL, so isnan is named with its namespace qualifier.
+    return (lhs < rhs || CUTLASS_CMATH_NAMESPACE :: isnan(lhs)) ? lhs : rhs;
+#endif
+  }
+};
+
+template <typename T>
+struct minimum_with_nan_propagation : minimum<T, true>
+{};  // single-parameter form of minimum<T, true>
+
+template <typename T, bool PropagateNaN = false>
+struct maximum_absolute_value {
+  CUTLASS_HOST_DEVICE
+  float operator()(T const &lhs, T const &rhs) const {
+    // |lhs| and |rhs| are compared as T; the winner is converted to the float return type.
+    absolute_value_op<T> magnitude;
+    T const lhs_magnitude = magnitude(lhs);
+    return maximum<T, PropagateNaN>{}(lhs_magnitude, magnitude(rhs));
+  }
+};
+
+// Reduction step for a running maximum of magnitudes: lhs is assumed to be an absolute value already
+template <typename T, bool PropagateNaN = false>
+struct maximum_absolute_value_reduction {
+  CUTLASS_HOST_DEVICE
+  float operator()(T const &lhs, T const &rhs) const {
+    // Only rhs is passed through abs; the comparison happens in T, the result is returned as float.
+    T const rhs_magnitude = absolute_value_op<T>{}(rhs);
+
+    return maximum<T, PropagateNaN>{}(lhs, rhs_magnitude);
+  }
+};
+
+/// Multiply-add: C(a) * C(b) + c; a and b are converted to C before the product is formed
+template <typename A, typename B = A, typename C = A>
+struct multiply_add {
+  CUTLASS_HOST_DEVICE
+  C operator()(A const &a, B const &b, C const &c) const {
+    return C(a) * C(b) + c;
+  }
+};
+
+template <typename T>
+struct square_and_plus {
+  CUTLASS_HOST_DEVICE
+  T operator()(T lhs, T const &rhs) const {
+    // Adds the square of rhs onto lhs: rhs * rhs + lhs.
+    return multiply_add<T>{}(rhs, rhs, lhs);
+  }
+};
+
+// multiply_add<A, A, A> exposed as a template taking exactly one template parameter.
+// This is useful for working around a known Clang issue,
+// where a template template parameter with one template parameter
+// does not match classes that take multiple template parameters
+// but have defaults for all but the first.
+template <typename A>
+struct homogeneous_multiply_add : public multiply_add<A, A, A>
+{};
+
+/// Multiply-add followed by a clamp at zero: max(C(a) * C(b) + c, C(0))
+template <typename A, typename B = A, typename C = A>
+struct multiply_add_relu0 {
+  CUTLASS_HOST_DEVICE
+  C operator()(A const &a, B const &b, C const &c) const {
+    C const pre_activation = C(a) * C(b) + c;
+    return maximum<C>{}(pre_activation, C(0));
+  }
+};
+
+/// Guarded-multiply-add: C(a) * C(b) + c, except that a NaN in a or b yields C(0)
+template <typename A, typename B = A, typename C = A>
+struct guarded_multiply_add {
+  CUTLASS_HOST_DEVICE
+  C operator()(A const &a, B const &b, C const &c) const {
+    using CUTLASS_CMATH_NAMESPACE :: isnan;
+    // isnan is called unqualified: the using-declaration above plus ADL select the overload.
+    if (isnan(a) || isnan(b)) {
+      return C(0);  // c is not added in this case
+    }
+    return C(a) * C(b) + c;
+  }
+};
+
+/// Guarded-multiply-add, half_t specialization (single PTX instruction when __CUDA_ARCH__ >= 900)
+template <>
+struct guarded_multiply_add<half_t, half_t, half_t> {
+  CUTLASS_HOST_DEVICE
+  half_t operator()(half_t const &a, half_t const &b, half_t const &c) const {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
+    half_t result;  // operands and result are passed to the asm as 16-bit register values
+    asm ("fma.rn.oob.f16 %0, %1, %2, %3;\n"
+      : "=h"(*reinterpret_cast<uint16_t*>(&result))
+      : "h"(*reinterpret_cast<uint16_t const*>(&a)), "h"(*reinterpret_cast<uint16_t const*>(&b)), "h"(*reinterpret_cast<uint16_t const*>(&c)));
+    return result;
+#else
+    // Namespace-qualifying isnan as cutlass::isnan saves the compiler
+    // the trouble of argument-dependent lookup.  Calling std::isnan or
+    // ::isnan here would result in unwanted implicit conversion to float.
+    if (cutlass::isnan(a) || cutlass::isnan(b)) {
+      return half_t(0);  // c is not added in this case
+    }
+    return a * b + c;
+#endif
+  }
+};
+
+/// Guarded-multiply-add-relu0: max(C(a) * C(b) + c, C(0)), except that a NaN in a or b yields C(0)
+template <typename A, typename B = A, typename C = A>
+struct guarded_multiply_add_relu0 {
+  CUTLASS_HOST_DEVICE
+  C operator()(A const &a, B const &b, C const &c) const {
+    using CUTLASS_CMATH_NAMESPACE :: isnan;
+    // isnan is called unqualified: the using-declaration above plus ADL select the overload.
+    if (isnan(a) || isnan(b)) {
+      return C(0);  // c is not consulted in this case
+    }
+    maximum<C> mx;
+    return mx(C(a) * C(b) + c, C(0));
+  }
+};
+
+template <>
+struct guarded_multiply_add_relu0<half_t, half_t, half_t> {
+  CUTLASS_HOST_DEVICE
+  half_t operator()(half_t const &a, half_t const &b, half_t const &c) const {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
+    half_t result;  // operands and result are passed to the asm as 16-bit register values
+    asm ("fma.rn.oob.relu.f16 %0, %1, %2, %3;\n"
+      : "=h"(*reinterpret_cast<uint16_t*>(&result))
+      : "h"(*reinterpret_cast<uint16_t const*>(&a)), "h"(*reinterpret_cast<uint16_t const*>(&b)), "h"(*reinterpret_cast<uint16_t const*>(&c)));
+    return result;
+#else
+    if (cutlass::isnan(a) || cutlass::isnan(b)) {  // cutlass::isnan named explicitly for half_t
+      return half_t(0);  // c is not consulted in this case
+    }
+    maximum<half_t> mx;
+    return mx(a * b + c, half_t(0));  // clamp the multiply-add result at zero
+#endif
+  }
+};
+
+/// Bitwise AND followed by add: (a & b) + c
+template <typename T>
+struct and_add {
+  CUTLASS_HOST_DEVICE
+  T operator()(T const &a, T const &b, T const &c) const {
+    return ((a & b) + c);
+  }
+};
+
+
+/// Bitwise XOR followed by add: (a ^ b) + c
+template <typename T>
+struct xor_add {
+  CUTLASS_HOST_DEVICE
+  T operator()(T const &a, T const &b, T const &c) const {
+    return ((a ^ b) + c);
+  }
+};
+
+namespace detail {
+
+// Whether namespace-unqualified conj(t) for t of type T is
+// well-formed.  This says whether the compiler can find
+// namespace-unqualified conj(T) via argument-dependent lookup.
+// If so, then CUTLASS assumes that conj(t) returns
+// the complex conjugate of t.
+template <typename T, typename Enable = void>
+struct has_unqualified_conj : cutlass::platform::false_type
+{};
+
+template<typename T>
+struct has_unqualified_conj<
+    T,
+    decltype(conj(cutlass::platform::declval<T>()), void())  // type is void when conj(T) is well-formed
+  > : cutlass::platform::true_type
+{};
+
+template <typename T>
+constexpr bool has_unqualified_conj_v = has_unqualified_conj<T>::value;  // variable-template shorthand
+  
+} // namespace detail
+
+// Forward declaration of the generic cutlass::conj (needed by has_cutlass_conj and conjugate below)
+template<class T>
+CUTLASS_HOST_DEVICE T conj(T const& z);
+
+namespace detail {
+
+// Whether cutlass::conj(t) for t of type T is well-formed.
+// If so, then CUTLASS assumes that cutlass::conj(t)
+// returns the complex conjugate of t.
+template <typename T, typename Enable = void>
+struct has_cutlass_conj : cutlass::platform::false_type
+{};
+
+template<typename T>
+struct has_cutlass_conj<
+    T,
+    decltype(cutlass::conj(cutlass::platform::declval<T>()), void())  // type is void when the call is well-formed
+  > : cutlass::platform::true_type
+{};
+
+template <typename T>
+constexpr bool has_cutlass_conj_v = has_cutlass_conj<T>::value;  // variable-template shorthand
+
+} // namespace detail
+  
+// Return the complex conjugate of the input.
+//
+// If the struct hasn't already been specialized for type T, then
+//
+// 1. for arithmetic types, return z;
+//
+// 2. for types where either (namespace-unqualified) conj(z) or
+//    cutlass::conj(z) is well formed, declare "using cutlass::conj;"
+//    and return conj(z); and
+//
+// 3. for everything else, return z.
+//
+// Regarding (1), the C++ Standard Library makes std::conj always
+// return std::complex, even for (noncomplex) arithmetic types.
+// cutlass::conj(T t) needs to return type T.  This follows the
+// convention of linear algebra software like the BLAS, where
+// "conjugate transpose" means the same thing as "transpose" for a
+// matrix of noncomplex numbers.
+//
+// Case (2) covers std::complex, cuda::std::complex, and non-Standard
+// (including user-defined) complex number types (for which "conj(z)"
+// is findable via argument-dependent lookup).  cutlass::conj has a
+// totally generic overload, but a more type-specific overload in any
+// namespace will take precedence.
+//
+// Case (3) covers non-Standard non-complex number types.
+//
+// Users should not generally need to specialize this struct for their
+// own custom complex or noncomplex types.  The idiomatic way to
+// identify a type T as "complex" is to make namespace-unqualified
+// calls to conj(T) findable via argument-dependent lookup.
+template <typename T>
+struct conjugate {
+  CUTLASS_HOST_DEVICE
+  T operator()(T const& z) const {
+    if constexpr (cutlass::platform::is_arithmetic_v<T>) {
+      return z;  // case 1: arithmetic types are returned unchanged
+    }
+    else if constexpr (detail::has_unqualified_conj_v<T> || detail::has_cutlass_conj_v<T>) {
+      using cutlass::conj;  // makes cutlass::conj a candidate next to overloads found by ADL
+      return conj(z);  // case 2: unqualified call
+    }
+    else {
+      return z;  // case 3: no usable conj for T, so z is returned unchanged
+    }
+  }
+};
+
+template <typename T>
+struct first {
+  CUTLASS_HOST_DEVICE
+  T operator()(T const & first, T const &...) const {  // NOTE(review): T is not a pack, so "..." looks like a C-style ellipsis - confirm
+    return first;  // every argument after the first is ignored
+  }
+  CUTLASS_HOST_DEVICE
+  T operator()(T const & first) const {
+    return first;  // single-argument overload: identity
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct logical_and {
+  CUTLASS_HOST_DEVICE
+  T operator()(T const &a, T const &b) const {
+    return ((static_cast<bool>(a) && static_cast<bool>(b)) ? T(1) : T());  // T(1) for true, T() for false
+  }
+};
+
+template <typename T>
+struct logical_or {
+  CUTLASS_HOST_DEVICE
+  T operator()(T const &a, T const &b) const {
+    return ((static_cast<bool>(a) || static_cast<bool>(b)) ? T(1) : T());  // T(1) for true, T() for false
+  }
+};
+
+template <typename T>
+struct logical_not {
+  CUTLASS_HOST_DEVICE
+  T operator()(T const &a) const {
+    return T(!(a));  // the result of !a is converted back to T
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct bit_and {
+  CUTLASS_HOST_DEVICE
+  T operator()(T const &a, T const &b) const {
+    return a & b;  // bitwise AND via T's operator&
+  }
+};
+
+template <typename T>
+struct bit_or {
+  CUTLASS_HOST_DEVICE
+  T operator()(T const &a, T const &b) const {
+    return a | b;  // bitwise OR via T's operator|
+  }
+};
+
+template <typename T>
+struct bit_not {
+  CUTLASS_HOST_DEVICE
+  T operator()(T const &a) const {
+    return ~a;  // bitwise complement via T's operator~
+  }
+};
+
+template <typename T>
+struct bit_xor {
+  CUTLASS_HOST_DEVICE
+  T operator()(T const &a, T const &b) const {
+    return a ^ b;  // bitwise XOR via T's operator^
+  }
+};
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+/// Atomic reductions
+
+template <typename T>
+struct atomic_add
+{
+  CUTLASS_DEVICE
+  void operator()(T *ptr, const T &data)
+  {
+#if defined(__CUDA_ARCH__)
+    atomicAdd(ptr, data);  // the value returned by atomicAdd is discarded
+#else
+    CUTLASS_UNUSED(ptr);  // non-device compilation pass: no implementation is provided
+    CUTLASS_UNUSED(data);
+    CUTLASS_NOT_IMPLEMENTED();
+#endif
+  }
+};
+/// atomic_add specialization for double: atomicAdd when __CUDA_ARCH__ >= 600, an atomicCAS loop below that.
+template<>
+struct atomic_add<double>
+{
+  CUTLASS_DEVICE
+  void operator()(double *ptr, const double &data)
+  {
+#if !defined(__CUDA_ARCH__)
+    CUTLASS_UNUSED(ptr);
+    CUTLASS_UNUSED(data);
+    CUTLASS_NOT_IMPLEMENTED();  // no fallback is provided outside device compilation
+#elif (__CUDA_ARCH__ >= 600)
+    atomicAdd(ptr, data);
+#else
+    // Use CAS loop: operate on the 64-bit integer view of *ptr and recompute the sum from the last observed bits
+    unsigned long long int* ptr_int = reinterpret_cast<unsigned long long int*>(ptr);
+    unsigned long long int old_int = *ptr_int;
+    unsigned long long int assumed_int;
+
+    do {
+      double update = data + __longlong_as_double(old_int);
+      assumed_int = old_int;
+      old_int = atomicCAS(ptr_int, assumed_int, __double_as_longlong(update));
+    } while (assumed_int != old_int);  // retry while atomicCAS reports bits other than the ones assumed
+#endif // (__CUDA_ARCH__ >= 600)
+  }
+};
+/// atomic_add specialization for half2: a single f16x2 `red` PTX instruction; not implemented on host or for __CUDA_ARCH__ < 600.
+template<>
+struct atomic_add<half2>
+{
+  CUTLASS_DEVICE
+  void operator()(half2 *ptr, const half2 &data)
+  {
+#if !defined(__CUDA_ARCH__) || (defined(__CUDA_ARCH__)  && (__CUDA_ARCH__ < 600))
+      CUTLASS_UNUSED(ptr);
+      CUTLASS_UNUSED(data);
+      CUTLASS_NOT_IMPLEMENTED();
+#else
+    // Vector-2 atomic reduction requires .target sm_60 or higher
+    uint32_t word = reinterpret_cast<const uint32_t&>(data);  // hand the half2 to the asm as one 32-bit register operand
+    asm volatile ("red.gpu.global.add.noftz.f16x2 [%0], %1;\n" : : "l"(ptr), "r"(word));
+#endif // (__CUDA_ARCH__ >= 600)
+  }
+};
+/// Deprecated spelling of atomic_add<T>; the attribute message directs users to atomic_add.
+template <typename T>
+using red [[deprecated("use atomic_add instead")]] = atomic_add<T>;
+/// Calls atomicMax(ptr, value) and returns its result; outside device compilation it is not implemented.
+template <typename T>
+struct atomic_maximum {
+  CUTLASS_DEVICE
+  T operator()(T *ptr, T value) const {
+#if defined(__CUDA_ARCH__)
+    return atomicMax(ptr, value);
+#else
+    CUTLASS_UNUSED(ptr);
+    CUTLASS_UNUSED(value);
+    CUTLASS_NOT_IMPLEMENTED();
+    return 0;  // gives the non-void function a return statement on this path
+#endif
+  }
+};
+/// atomic_maximum specialization for float, built from integer atomics applied to the value's bit pattern.
+template <>
+struct atomic_maximum<float> {
+  CUTLASS_DEVICE
+  float operator()(float *ptr, float value) const {
+#if defined(__CUDA_ARCH__)
+    // In device code, make sure that we do NOT try to use
+    // std::signbit, as that won't work if building with NVRTC.
+    // Instead, prefix "::" to call signbit from the global namespace,
+    // which CUDA guarantees to work in device code without including
+    // any headers.
+    // Sign bit clear: signed atomicMax on the bits. Sign bit set: unsigned atomicMin on the bits.
+    return ! ::signbit(value) ?
+      __int_as_float(atomicMax((int*)ptr, __float_as_int(value))) :
+      __uint_as_float(atomicMin((unsigned int*)ptr, __float_as_uint(value)));
+#else
+    CUTLASS_UNUSED(ptr);
+    CUTLASS_UNUSED(value);
+    CUTLASS_NOT_IMPLEMENTED();
+    return 0;  // gives the non-void function a return statement on this path
+#endif
+  }
+};
+
+// is_atomic: trait that is true_type for atomic_add<T> and atomic_maximum<T>, false_type for every other functor type
+template <class Fn>
+struct is_atomic : platform::false_type {};
+template <class T>
+struct is_atomic<atomic_add<T>> : platform::true_type {};
+template <class T>
+struct is_atomic<atomic_maximum<T>> : platform::true_type {};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Partial specializations for nvcuda::wmma::fragment<Use, m, n, k, T, Layout>
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+/// Element-wise sum of two nvcuda::wmma::fragment objects of the same type, computed with plus<ElementType>.
+template<typename Use, int m, int n, int k, typename T, typename Layout>
+struct plus<nvcuda::wmma::fragment<Use, m, n, k, T, Layout>>
+{
+  using Fragment = nvcuda::wmma::fragment<Use, m, n, k, T, Layout>;
+  using ElementType = typename Fragment::element_type;
+
+  CUTLASS_HOST_DEVICE
+  Fragment operator()(Fragment const &lhs, Fragment const &rhs) const
+  {
+    // Address each fragment as a flat run of Fragment::num_elements scalars.
+    Fragment sum;
+    ElementType *out = reinterpret_cast<ElementType*>(&sum);
+    ElementType const *a = reinterpret_cast<ElementType const*>(&lhs);
+    ElementType const *b = reinterpret_cast<ElementType const*>(&rhs);
+
+    plus<ElementType> add_scalar;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < Fragment::num_elements; ++idx) {
+      out[idx] = add_scalar(a[idx], b[idx]);
+    }
+    return sum;
+  }
+};
+
+#endif // defined(CUTLASS_ARCH_WMMA_ENABLED)
+
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/builders/sm90_common.inl b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/builders/sm90_common.inl
new file mode 100755
index 000000000..8d95967f9
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/builders/sm90_common.inl
@@ -0,0 +1,419 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/arch/mma.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/detail/layout.hpp"
+#include "cutlass/detail/collective.hpp"
+#include "cutlass/detail/dependent_false.hpp"
+
+#include "cute/atom/mma_traits_sm90_gmma.hpp"
+#include "cute/atom/mma_traits_sm90_gmma_sparse.hpp"
+#include "cute/atom/copy_traits_sm90_tma.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+//
+// Some named constants (byte quantities, as their names state)
+//
+constexpr int tma_alignment_bytes = 16;
+constexpr int cp_async_min_alignment_bytes = 4;
+constexpr int sm90_smem_capacity_bytes = 232448;  // = 227 * 1024
+
+// Maps 2.x A matrix layout tag to respective GMMA major mode enum
+template <class ElementA, class LayoutA>
+constexpr cute::GMMA::Major
+gmma_ss_tag_to_major_A() {
+  // MN major mode is only valid for non-TF32, non-int and non-fp8 MMAs:
+  // the layout must be MN-major and the element neither tfloat32_t nor one byte wide.
+  constexpr bool is_tf32      = cute::is_same_v<ElementA, tfloat32_t>;
+  constexpr bool is_one_byte  = (sizeof(ElementA) == 1);
+  constexpr bool layout_is_mn = cutlass::gemm::detail::is_mn_major_A<LayoutA>();
+  constexpr bool use_mn_major = layout_is_mn && !is_tf32 && !is_one_byte;
+
+  // Every other combination maps to K-major.
+  return use_mn_major ? cute::GMMA::Major::MN : cute::GMMA::Major::K;
+}
+
+// Maps 2.x B matrix layout tag to respective GMMA major mode enum
+template <class ElementB, class LayoutB>
+constexpr cute::GMMA::Major
+gmma_ss_tag_to_major_B() {
+  // MN major mode is only valid for non-TF32, non-int and non-fp8 MMAs:
+  // the layout must be MN-major and the element neither tfloat32_t nor one byte wide.
+  constexpr bool is_tf32      = cute::is_same_v<ElementB, tfloat32_t>;
+  constexpr bool is_one_byte  = (sizeof(ElementB) == 1);
+  constexpr bool layout_is_mn = cutlass::gemm::detail::is_mn_major_B<LayoutB>();
+  constexpr bool use_mn_major = layout_is_mn && !is_tf32 && !is_one_byte;
+
+  // Every other combination maps to K-major.
+  return use_mn_major ? cute::GMMA::Major::MN : cute::GMMA::Major::K;
+}
+
+template <class LayoutA>
+constexpr cute::GMMA::Major
+gmma_rs_tag_to_major_A() {
+  // MN major mode is only valid for non-TF32 and non-int MMAs
+  // NOTE(review): only the layout tag is inspected here; the element type is not a template
+  // parameter of this helper, so that restriction is not checked at this point.
+  constexpr bool layout_is_mn_major = cutlass::gemm::detail::is_mn_major_A<LayoutA>();
+
+  // MN-major layout tags map to Major::MN, everything else to Major::K.
+  return layout_is_mn_major ? cute::GMMA::Major::MN : cute::GMMA::Major::K;
+}
+
+template <class LayoutB>
+constexpr cute::GMMA::Major
+gmma_rs_tag_to_major_B() {
+  // MN major mode is only valid for non-TF32 and non-int MMAs
+  // NOTE(review): only the layout tag is inspected here; the element type is not a template
+  // parameter of this helper, so that restriction is not checked at this point.
+  constexpr bool layout_is_mn_major = cutlass::gemm::detail::is_mn_major_B<LayoutB>();
+
+  // MN-major layout tags map to Major::MN, everything else to Major::K.
+  return layout_is_mn_major ? cute::GMMA::Major::MN : cute::GMMA::Major::K;
+}
+// Maps a rank-1 cute::Shape<> cluster shape to its TMA atom: size 1 gives SM90_TMA_LOAD, any other size SM90_TMA_LOAD_MULTICAST
+template <class UnimodalClusterShape>
+constexpr auto
+sm90_cluster_shape_to_tma_atom(UnimodalClusterShape) {
+  static_assert(cute::rank(UnimodalClusterShape{}) == 1,
+    "Use this function to figure out TMA for each mode individually.");
+
+  if constexpr (cute::size(UnimodalClusterShape{}) != 1) {
+    return cute::SM90_TMA_LOAD_MULTICAST{};
+  }
+  else {
+    return cute::SM90_TMA_LOAD{};
+  }
+}
+
+// Builds a TiledCopy from a simt copy atom (e.g. cp.async): ThreadCount threads are arranged so that as many as possible run along the gmem major mode given by StrideType.
+template<class CopyAtom, int ThreadCount, int Alignment, class StrideType, class TileMN, class TileK>
+constexpr auto
+make_simt_gmem_tiled_copy() {
+  using namespace cute;
+  // Total element counts of the MN and K tile modes.
+  constexpr int TileSizeMN  = cute::size(TileMN{});
+  constexpr int TileSizeK   = cute::size(TileK{});
+
+  // Maximize the number of threads along the gmem major mode to promote coalesced reads
+  // While making sure our thread layout tiles the threadblock tile evenly
+
+  if constexpr (cutlass::gemm::detail::is_k_major<StrideType>()) {
+    // K major thread layout for K major gmem: at most TileSizeK / Alignment threads along K, the remainder along MN
+    constexpr int threads_major = (ThreadCount >= TileSizeK / Alignment) ? (TileSizeK  / Alignment) : ThreadCount;
+    constexpr int threads_minor = ThreadCount / threads_major;
+    static_assert(threads_major > 0);
+    static_assert(ThreadCount % threads_major == 0);
+    static_assert(threads_minor == 0 || (TileSizeMN % threads_minor == 0));
+    return make_tiled_copy(
+      CopyAtom{},
+      Layout<Shape <Int<threads_minor>,Int<threads_major>>,
+             Stride<Int<threads_major>,                _1>>{},
+      Layout<Shape<_1,Int<Alignment>>>{});  // presumably the per-thread value layout: Alignment elements along K (confirm against make_tiled_copy)
+  }
+  else if constexpr (cutlass::gemm::detail::is_mn_major<StrideType>()) {
+    // MN major thread layout for MN major gmem: at most TileSizeMN / Alignment threads along MN, the remainder along K
+    constexpr int threads_major = (ThreadCount >= TileSizeMN / Alignment) ? (TileSizeMN  / Alignment) : ThreadCount;
+    constexpr int threads_minor = ThreadCount / threads_major;
+    static_assert(threads_major > 0);
+    static_assert(ThreadCount % threads_major == 0);
+    static_assert(threads_minor == 0 || (TileSizeK % threads_minor == 0));
+    return make_tiled_copy(
+      CopyAtom{},
+      Layout<Shape <Int<threads_major>,Int<threads_minor>>,
+             Stride<                _1,Int<threads_major>>>{},
+      Layout<Shape<Int<Alignment>,_1>>{});  // presumably the per-thread value layout: Alignment elements along MN (confirm against make_tiled_copy)
+  } else {
+    static_assert(cute::is_void_v<CopyAtom>, "Unsupported gmem layout for automatic gmem tiled copy builder.");  // template-dependent condition, so it only fires if this branch is instantiated
+  }
+}
+
+// Helper for RS GMMA smem selection that considers a tensor TileShape:
+//   (BLK_MN, BLK_K)
+//   or hierarchically
+//   ((BLK_MN0,BLK_MN1,...),(BLK_K0,BLK_K1,...))
+//   and returns the optimal GMMA::Layout that fits BLK_MN0 and BLK_K0 (is_ws_transposed_B narrows the MN-major candidates)
+template <cute::GMMA::Major major, class ElementType, class BLK_MN, class BLK_K, const bool is_ws_transposed_B = false>
+constexpr auto
+rs_smem_selector() {
+  using namespace cute;
+
+  auto BLK_MN0 = size<0>(BLK_MN{});
+  auto BLK_K0  = size<0>(BLK_K{});
+
+  static_assert(BLK_MN0 % 8 == 0, "BLK_MN0 must be a multiple of 8.");
+  static_assert(BLK_K0 % 8 == 0,  "BLK_K0 must be a multiple of 8.");
+  if constexpr (major == GMMA::Major::MN) {
+    if constexpr (sizeof(ElementType) == 4){
+      if constexpr (is_ws_transposed_B) {
+        // Only the optimized transposition-B layouts (SW32 and SW128 for tf32) can be used; prefer SW32 because it is bank-conflict free
+        if constexpr (BLK_MN0 % size<0>(GMMA::Layout_MN_SW32_Atom<ElementType>{}) == 0) {
+          return GMMA::Layout_MN_SW32_Atom<ElementType>{};
+        }
+        else {
+          static_assert(BLK_MN0 % size<0>(GMMA::Layout_MN_SW32_Atom<ElementType>{}) == 0,
+                       "BLK_MN0 must be a multiple of size<0>(GMMA::Layout_MN_SW32_Atom<ElementType>{})");
+        }
+      }
+      else {
+        // Fall back to SW32 because it is bank-conflict free
+        if constexpr (BLK_MN0 % size<0>(GMMA::Layout_MN_SW32_Atom<ElementType>{}) == 0) {
+          return GMMA::Layout_MN_SW32_Atom<ElementType>{};
+        }
+        else if constexpr (BLK_MN0 % size<0>(GMMA::Layout_MN_INTER_Atom<ElementType>{}) == 0) {
+          return GMMA::Layout_MN_INTER_Atom<ElementType>{};
+        }
+        else {
+          static_assert(BLK_MN0 % size<0>(GMMA::Layout_MN_INTER_Atom<ElementType>{}) == 0,
+                       "BLK_MN0 must be a multiple of size<0>(GMMA::Layout_MN_INTER_Atom<ElementType>{})");
+        }
+      }
+    }
+    // Used for int8, fp8, fp16 and bf16 I/O kernels
+    else if constexpr (sizeof(ElementType) == 1 || sizeof(ElementType) == 2) {
+      if constexpr (sizeof(ElementType) == 1 && is_ws_transposed_B) {
+        // Only optimized transpositionB (SW128 for int8 and fp8) can be used
+        if constexpr (BLK_MN0 % size<0>(GMMA::Layout_MN_SW128_Atom<ElementType>{}) == 0) {
+          return GMMA::Layout_MN_SW128_Atom<ElementType>{};
+        }
+        else {
+          static_assert(BLK_MN0 % size<0>(GMMA::Layout_MN_SW128_Atom<ElementType>{}) == 0,
+                       "BLK_MN0 must be a multiple of size<0>(GMMA::Layout_MN_SW128_Atom<ElementType>{})");
+        }
+      }
+      else {
+        if constexpr (BLK_MN0 % size<0>(GMMA::Layout_MN_SW128_Atom<ElementType>{}) == 0) {
+          return GMMA::Layout_MN_SW128_Atom<ElementType>{};
+        }
+        else if constexpr (BLK_MN0 % size<0>(GMMA::Layout_MN_SW64_Atom<ElementType>{}) == 0) {
+          return GMMA::Layout_MN_SW64_Atom<ElementType>{};
+        }
+        else if constexpr (BLK_MN0 % size<0>(GMMA::Layout_MN_SW32_Atom<ElementType>{}) == 0) {
+          return GMMA::Layout_MN_SW32_Atom<ElementType>{};
+        }
+        else if constexpr (BLK_MN0 % size<0>(GMMA::Layout_MN_INTER_Atom<ElementType>{}) == 0) {
+          return GMMA::Layout_MN_INTER_Atom<ElementType>{};
+        }
+        else {
+          static_assert(BLK_MN0 % size<0>(GMMA::Layout_MN_INTER_Atom<ElementType>{}) == 0,
+                       "BLK_MN0 must be a multiple of size<0>(GMMA::Layout_MN_INTER_Atom<ElementType>{})");
+        }
+      }
+    }
+    else {
+      static_assert(cutlass::detail::dependent_false<ElementType>, "Smem selector does not support this element type");
+    }
+  }
+  else if constexpr (major == GMMA::Major::K) {
+    if constexpr (BLK_K0 % size<1>(GMMA::Layout_K_SW128_Atom<ElementType>{}) == 0) {
+      return GMMA::Layout_K_SW128_Atom<ElementType>{};
+    }
+    else if constexpr (BLK_K0 % size<1>(GMMA::Layout_K_SW64_Atom<ElementType>{}) == 0) {
+      return GMMA::Layout_K_SW64_Atom<ElementType>{};
+    }
+    else if constexpr (BLK_K0 % size<1>(GMMA::Layout_K_SW32_Atom<ElementType>{}) == 0) {
+      return GMMA::Layout_K_SW32_Atom<ElementType>{};
+    }
+    else if constexpr (BLK_K0 % size<1>(GMMA::Layout_K_INTER_Atom<ElementType>{}) == 0) {
+      return GMMA::Layout_K_INTER_Atom<ElementType>{};
+    }
+    else {
+      static_assert(BLK_K0 % size<1>(GMMA::Layout_K_INTER_Atom<ElementType>{}) == 0,
+                    "BLK_K0 must be a multiple of size<1>(GMMA::Layout_K_INTER_Atom<ElementType>{})");
+    }
+  }
+}
+
+// Helper for SS GMMA smem selection that considers a tensor TileShape:
+//   (BLK_MN, BLK_K)
+//   or hierarchically
+//   ((BLK_MN0,BLK_MN1,...),(BLK_K0,BLK_K1,...))
+//   and returns the largest GMMA::Layout that fits BLK_MN0 and BLK_K0: the first of the SW128, SW64, SW32, INTER atoms (tried in that order) whose extent divides the block extent
+template <cute::GMMA::Major major, class ElementType, class BLK_MN, class BLK_K>
+CUTE_HOST_DEVICE constexpr
+auto
+ss_smem_selector()
+{
+  using namespace cute;
+  // Only the leading sub-mode (size<0>) of each tile mode is examined.
+  auto BLK_MN0 = size<0>(BLK_MN{});
+  auto BLK_K0  = size<0>(BLK_K{});
+
+  static_assert(BLK_MN0 % 8 == 0, "BLK_MN0 must be a multiple of 8.");
+  static_assert(BLK_K0 % 8 == 0,  "BLK_K0 must be a multiple of 8.");
+
+  if constexpr (major == GMMA::Major::MN) {  // MN-major: BLK_MN0 is tested against mode 0 of each atom
+    if constexpr (BLK_MN0 % size<0>(GMMA::Layout_MN_SW128_Atom<ElementType>{}) == 0) {
+      return GMMA::Layout_MN_SW128_Atom<ElementType>{};
+    }
+    else if constexpr (BLK_MN0 % size<0>(GMMA::Layout_MN_SW64_Atom<ElementType>{}) == 0) {
+      return GMMA::Layout_MN_SW64_Atom<ElementType>{};
+    }
+    else if constexpr (BLK_MN0 % size<0>(GMMA::Layout_MN_SW32_Atom<ElementType>{}) == 0) {
+      return GMMA::Layout_MN_SW32_Atom<ElementType>{};
+    }
+    else if constexpr (BLK_MN0 % size<0>(GMMA::Layout_MN_INTER_Atom<ElementType>{}) == 0) {
+      return GMMA::Layout_MN_INTER_Atom<ElementType>{};
+    }
+    else {
+      // No atom divides BLK_MN0: restate the last failed condition so the compiler prints the message below.
+      static_assert(BLK_MN0 % size<0>(GMMA::Layout_MN_INTER_Atom<ElementType>{}) == 0,
+                    "BLK_MN0 must be a multiple of size<0>(GMMA::Layout_MN_INTER_Atom<ElementType>{})");
+    }
+  }
+  else if constexpr (major == GMMA::Major::K) {  // K-major: BLK_K0 is tested against mode 1 of each atom
+    if constexpr (BLK_K0 % size<1>(GMMA::Layout_K_SW128_Atom<ElementType>{}) == 0) {
+      return GMMA::Layout_K_SW128_Atom<ElementType>{};
+    }
+    else if constexpr (BLK_K0 % size<1>(GMMA::Layout_K_SW64_Atom<ElementType>{}) == 0) {
+      return GMMA::Layout_K_SW64_Atom<ElementType>{};
+    }
+    else if constexpr (BLK_K0 % size<1>(GMMA::Layout_K_SW32_Atom<ElementType>{}) == 0) {
+      return GMMA::Layout_K_SW32_Atom<ElementType>{};
+    }
+    else if constexpr (BLK_K0 % size<1>(GMMA::Layout_K_INTER_Atom<ElementType>{}) == 0) {
+      return GMMA::Layout_K_INTER_Atom<ElementType>{};
+    }
+    else {
+      static_assert(BLK_K0 % size<1>(GMMA::Layout_K_INTER_Atom<ElementType>{}) == 0,
+                    "BLK_K0 must be a multiple of size<1>(GMMA::Layout_K_INTER_Atom<ElementType>{})");
+    }
+  }
+}
+
+// Helper for sparse SS GMMA smem selection that considers a tensor TileShape:
+//   (BLK_MN, BLK_K)
+//   or hierarchically
+//   ((BLK_MN0,BLK_MN1,...),(BLK_K0,BLK_K1,...))
+//   and returns the largest GMMA sparse layout atom (..._SpAtom<ElementType, Sparsity{}>) that fits BLK_MN0 and BLK_K0
+template <cute::GMMA::Major major, class ElementType, class BLK_MN, class BLK_K, class Sparsity>
+CUTE_HOST_DEVICE constexpr
+auto
+ss_smem_selector_sparse()
+{
+  using namespace cute;
+
+  auto BLK_MN0 = size<0>(BLK_MN{});
+  auto BLK_K0  = size<0>(BLK_K{});
+
+  static_assert(BLK_MN0 % 8 == 0, "BLK_MN0 must be a multiple of 8.");
+  static_assert(BLK_K0 % 8 == 0,  "BLK_K0 must be a multiple of 8.");
+
+  if constexpr (major == GMMA::Major::MN) {
+    if constexpr (BLK_MN0 % size<0>(GMMA::Layout_MN_SW128_SpAtom<ElementType, Sparsity{}>{}) == 0) {
+      return GMMA::Layout_MN_SW128_SpAtom<ElementType, Sparsity{}>{};
+    }
+    else if constexpr (BLK_MN0 % size<0>(GMMA::Layout_MN_SW64_SpAtom<ElementType, Sparsity{}>{}) == 0) {
+      return GMMA::Layout_MN_SW64_SpAtom<ElementType, Sparsity{}>{};
+    }
+    else if constexpr (BLK_MN0 % size<0>(GMMA::Layout_MN_SW32_SpAtom<ElementType, Sparsity{}>{}) == 0) {
+      return GMMA::Layout_MN_SW32_SpAtom<ElementType, Sparsity{}>{};
+    }
+    else if constexpr (BLK_MN0 % size<0>(GMMA::Layout_MN_INTER_SpAtom<ElementType, Sparsity{}>{}) == 0) {
+      return GMMA::Layout_MN_INTER_SpAtom<ElementType, Sparsity{}>{};
+    }
+    else {
+      static_assert(BLK_MN0 % size<0>(GMMA::Layout_MN_INTER_SpAtom<ElementType, Sparsity{}>{}) == 0,
+                    "BLK_MN0 must be a multiple of size<0>(GMMA::Layout_MN_INTER_SpAtom<ElementType, Sparsity{}>{})");
+    }
+  }
+  else if constexpr (major == GMMA::Major::K) {
+    if constexpr (BLK_K0 % size<1>(GMMA::Layout_K_SW128_SpAtom<ElementType, Sparsity{}>{}) == 0) {
+      return GMMA::Layout_K_SW128_SpAtom<ElementType, Sparsity{}>{};
+    }
+    else if constexpr (BLK_K0 % size<1>(GMMA::Layout_K_SW64_SpAtom<ElementType, Sparsity{}>{}) == 0) {
+      return GMMA::Layout_K_SW64_SpAtom<ElementType, Sparsity{}>{};
+    }
+    else if constexpr (BLK_K0 % size<1>(GMMA::Layout_K_SW32_SpAtom<ElementType, Sparsity{}>{}) == 0) {
+      return GMMA::Layout_K_SW32_SpAtom<ElementType, Sparsity{}>{};
+    }
+    else if constexpr (BLK_K0 % size<1>(GMMA::Layout_K_INTER_SpAtom<ElementType, Sparsity{}>{}) == 0) {
+      return GMMA::Layout_K_INTER_SpAtom<ElementType, Sparsity{}>{};
+    }
+    else {
+      static_assert(BLK_K0 % size<1>(GMMA::Layout_K_INTER_SpAtom<ElementType, Sparsity{}>{}) == 0,
+                    "BLK_K0 must be a multiple of size<1>(GMMA::Layout_K_INTER_SpAtom<ElementType, Sparsity{}>{})");
+    }
+  }
+}
+// True when sizeof(ElementA) and sizeof(ElementB) are both exactly 2.
+template <class ElementA, class ElementB>
+constexpr bool
+is_input_size_two_bytes() {
+  return !(sizeof(ElementA) != 2 || sizeof(ElementB) != 2);
+}
+// True when ElementA and ElementB are each float_e4m3_t or float_e5m2_t (in any combination).
+template <class ElementA, class ElementB>
+constexpr bool is_input_fp8() {
+  constexpr bool a_is_fp8 = cute::is_same_v<ElementA, float_e4m3_t> || cute::is_same_v<ElementA, float_e5m2_t>;
+  constexpr bool b_is_fp8 = cute::is_same_v<ElementB, float_e4m3_t> || cute::is_same_v<ElementB, float_e5m2_t>;
+  return a_is_fp8 && b_is_fp8;
+}
+
+// Tuple element types must be accepted here: this predicate is used in SFINAE dispatch in the
+// CollectiveBuilder, where the tuples are not yet guaranteed to have been split into their parts.
+template <class MaybeTupleElementA, class LayoutA, class MaybeTupleElementB, class LayoutB>
+constexpr bool
+is_use_rmem_A() {
+  using ElementA = detail::deduce_mixed_width_dtype_t<0, MaybeTupleElementA>;
+  using ElementB = detail::deduce_mixed_width_dtype_t<0, MaybeTupleElementB>;
+
+  constexpr bool both_k_major  = cutlass::gemm::detail::is_k_major_A<LayoutA>() &&
+                                 cutlass::gemm::detail::is_k_major_B<LayoutB>();
+  constexpr bool both_two_byte = is_input_size_two_bytes<ElementA, ElementB>();
+  constexpr bool widths_differ = cute::sizeof_bits_v<ElementA> != cute::sizeof_bits_v<ElementB>;
+  constexpr bool one_is_tuple  = cute::is_tuple<MaybeTupleElementA>::value ^ cute::is_tuple<MaybeTupleElementB>::value;
+
+  // De Morgan form of (!both_two_byte && !both_k_major), OR-ed with the two mixed-input conditions.
+  return !(both_two_byte || both_k_major) || widths_differ || one_is_tuple;
+}
+// True when AlignmentA elements of A and AlignmentB elements of B each occupy a whole multiple of RequiredAlignment bytes.
+template <class ElementA, int AlignmentA, class ElementB, int AlignmentB, int RequiredAlignment>
+constexpr bool
+is_aligned() {
+  constexpr auto bytes_a = sizeof(ElementA) * AlignmentA, bytes_b = sizeof(ElementB) * AlignmentB;
+  return (bytes_a % RequiredAlignment == 0) && (bytes_b % RequiredAlignment == 0);
+}
+
+} // namespace detail
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::gemm::collective
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/builders/sm90_gmma_builder.inl b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/builders/sm90_gmma_builder.inl
new file mode 100755
index 000000000..8657aad2b
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/builders/sm90_gmma_builder.inl
@@ -0,0 +1,1048 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/gemm/collective/builders/sm90_common.inl"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/pipeline/sm90_pipeline.hpp"
+#include "cutlass/gemm/collective/collective_mma_decl.hpp"
+#include "cutlass/gemm/collective/collective_builder_decl.hpp"
+
+// SM90 Collective Builders should be used only starting CUDA 12.0
+#if (__CUDACC_VER_MAJOR__ >= 12)
+#define CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
+#endif
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+// Manual override: the stage count is taken verbatim from StageCount<stages>; capacity and tile shape are not consulted.
+template<int CapacityBytes, class ElementA, class ElementB, class TileShapeMNK, int stages>
+constexpr int
+compute_stage_count_or_override(StageCount<stages> /*stage_count*/) {
+  return stages;
+}
+
+// Manual override given as a compile-time integer: the value of cute::Int<stages> is returned as the stage count.
+template<int CapacityBytes, class ElementA, class ElementB, class TileShapeMNK, int stages>
+constexpr int
+compute_stage_count_or_override(cute::Int<stages> /*stage_count*/) {
+  return stages;
+}
+
+// Automatic stage count: how many whole pipeline stages fit in CapacityBytes after carveout_bytes are set aside.
+template<int CapacityBytes, class ElementA, class ElementB, class TileShapeMNK, int carveout_bytes>
+constexpr int
+compute_stage_count_or_override(StageCountAutoCarveout<carveout_bytes> /*stage_count*/) {
+  // One stage holds an A tile (modes 0 and 2 of the tile shape), a B tile (modes 1 and 2) and one pipeline SharedStorage.
+  constexpr auto tile_bits_a = cute::sizeof_bits_v<ElementA> * size<0>(TileShapeMNK{}) * size<2>(TileShapeMNK{});
+  constexpr auto tile_bits_b = cute::sizeof_bits_v<ElementB> * size<1>(TileShapeMNK{}) * size<2>(TileShapeMNK{});
+  constexpr auto pipeline_storage_bytes = sizeof(typename cutlass::PipelineTmaAsync<1>::SharedStorage);
+  constexpr int bytes_per_stage =
+    cutlass::bits_to_bytes(tile_bits_a) + cutlass::bits_to_bytes(tile_bits_b) + static_cast<int>(pipeline_storage_bytes);
+  constexpr int usable_bytes = CapacityBytes - carveout_bytes;
+
+  return usable_bytes / bytes_per_stage;
+}
+
+// Manual override for the scaled-input variant: StageCount<stages> is returned as-is and no smem sizing is performed.
+template<int CapacityBytes, class ElementA, class ElementB, class ElementScale, class ElementZero, class TileShapeMNK, int stages>
+constexpr int
+compute_stage_count_or_override_single_affine_transformed_input(StageCount<stages> /*stage_count*/) {
+  return stages;
+}
+
+// Bit width of Element; a void Element counts as zero bits.
+template <class Element>
+constexpr int get_bits_for_possibly_void_element() {
+  if constexpr (not cute::is_same_v<Element, void>) {
+    return sizeof_bits<Element>::value;
+  } else {
+    return 0;
+  }
+}
+
+// Automatic stage count when one input carries optional scale/zero-point data: those tiles are added to each stage's footprint.
+template<int CapacityBytes, class ElementA, class ElementB, class ElementScale, class ElementZero, class TileShapeMNK, int carveout_bytes>
+constexpr int
+compute_stage_count_or_override_single_affine_transformed_input(StageCountAutoCarveout<carveout_bytes> /*stage_count*/) {
+
+  // Per-stage pipeline bookkeeping is sized from the pipeline's own SharedStorage type.
+  constexpr auto pipeline_storage_bytes = sizeof(typename cutlass::PipelineTmaAsync<1>::SharedStorage);
+  constexpr int scale_zero_k_tile = 1;
+  constexpr auto bits_a = cute::sizeof_bits_v<ElementA>;
+  constexpr auto bits_b = cute::sizeof_bits_v<ElementB>;
+  constexpr auto bits_scale = get_bits_for_possibly_void_element<ElementScale>();
+  constexpr auto bits_zero  = get_bits_for_possibly_void_element<ElementZero>();
+
+  constexpr auto scale_bytes = cutlass::bits_to_bytes(bits_scale * size<0>(TileShapeMNK{}) * scale_zero_k_tile);
+  constexpr auto zero_bytes  = cutlass::bits_to_bytes(bits_zero * size<0>(TileShapeMNK{}) * scale_zero_k_tile);
+  static_assert(scale_bytes % 128 == 0, "Scale bytes must be a multiple of 128");
+  static_assert(zero_bytes  % 128 == 0, "Zero bytes must be a multiple of 128");
+
+  // A void ElementScale/ElementZero has zero bits, so it adds no bytes to the stage.
+  constexpr auto tile_bytes_a = cutlass::bits_to_bytes(bits_a * size<0>(TileShapeMNK{}) * size<2>(TileShapeMNK{}));
+  constexpr auto tile_bytes_b = cutlass::bits_to_bytes(bits_b * size<1>(TileShapeMNK{}) * size<2>(TileShapeMNK{}));
+  constexpr int stage_bytes = tile_bytes_a + tile_bytes_b +
+    static_cast<int>(scale_bytes + zero_bytes + pipeline_storage_bytes);
+
+  return (CapacityBytes - carveout_bytes) / stage_bytes;
+}
+
+template <class ElementA, class LayoutA, class ElementB, class LayoutB>
+constexpr bool
+is_swapAB() {
+  // Swap is requested only when A is K-major, B is MN-major and the inputs are not two-byte types.
+  constexpr bool AIsKMajor  = cutlass::gemm::detail::is_k_major_A<LayoutA>();
+  constexpr bool BIsMnMajor = cutlass::gemm::detail::is_mn_major_B<LayoutB>();
+  constexpr bool TwoByteInputs = is_input_size_two_bytes<ElementA, ElementB>();
+  return AIsKMajor && BIsMnMajor && !TwoByteInputs;
+}
+
+template <class ElementA, class LayoutA, class ElementB, class LayoutB, class KernelScheduleType>
+constexpr bool
+is_warpspecialized_transpose_B() {
+  // Holds for non-two-byte inputs with A and B both MN-major under a TMA or CpAsync warp-specialized schedule.
+  constexpr bool TwoByteInputs = is_input_size_two_bytes<ElementA, ElementB>();
+  constexpr bool BothMnMajor = cutlass::gemm::detail::is_mn_major_A<LayoutA>() && cutlass::gemm::detail::is_mn_major_B<LayoutB>();
+  constexpr bool TmaWs = cute::is_base_of_v<KernelTmaWarpSpecialized, KernelScheduleType>            ||
+                         cute::is_base_of_v<KernelTmaWarpSpecializedPingpong, KernelScheduleType>    ||
+                         cute::is_base_of_v<KernelTmaWarpSpecializedCooperative, KernelScheduleType>;
+  constexpr bool CpAsyncWs = cute::is_base_of_v<KernelCpAsyncWarpSpecialized, KernelScheduleType>            ||
+                             cute::is_base_of_v<KernelCpAsyncWarpSpecializedPingpong, KernelScheduleType>    ||
+                             cute::is_base_of_v<KernelCpAsyncWarpSpecializedCooperative, KernelScheduleType>;
+  return !TwoByteInputs && BothMnMajor && (TmaWs || CpAsyncWs);
+}
+
+} // namespace detail
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA_TMA_WS_SS: TMA warp-specialized builder using the GMMA ss_op_selector; enabled only when is_use_rmem_A() is false.
+template <
+  class ElementA,
+  class GmemLayoutATag,
+  int AlignmentA,
+  class ElementB,
+  class GmemLayoutBTag,
+  int AlignmentB,
+  class ElementAccumulator,
+  class TileShape_MNK,
+  class ClusterShape_MNK,
+  class StageCountType,
+  class KernelScheduleType
+>
+struct CollectiveBuilder<
+    arch::Sm90,
+    arch::OpClassTensorOp,
+    ElementA,
+    GmemLayoutATag,
+    AlignmentA,
+    ElementB,
+    GmemLayoutBTag,
+    AlignmentB,
+    ElementAccumulator,
+    TileShape_MNK,
+    ClusterShape_MNK,
+    StageCountType,
+    KernelScheduleType,
+    cute::enable_if_t<
+      (cute::is_any_of_v<KernelScheduleType,
+                         KernelTmaWarpSpecialized,
+                         KernelTmaWarpSpecializedCooperative,
+                         KernelTmaWarpSpecializedPingpong,
+                         KernelPtrArrayTmaWarpSpecializedCooperative,
+                         KernelPtrArrayTmaWarpSpecializedPingpong>) &&
+       not detail::is_use_rmem_A<ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag>()>
+> {
+  static_assert(is_static<TileShape_MNK>::value);
+  static_assert(is_static<ClusterShape_MNK>::value);
+#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
+  static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n");
+#endif
+  static_assert(detail::is_aligned<ElementA, AlignmentA, ElementB, AlignmentB, detail::tma_alignment_bytes>(),
+                "Should meet TMA alignment requirement\n");
+  // Ptr-array schedules are detected here; the static_assert below rejects them when the inputs are FP8.
+  static constexpr bool IsArrayOfPointersGemm = (cute::is_any_of_v<KernelScheduleType,
+                                                                   KernelPtrArrayTmaWarpSpecializedCooperative,
+                                                                   KernelPtrArrayTmaWarpSpecializedPingpong>);
+  static constexpr bool IsFP8Input = detail::is_input_fp8<ElementA, ElementB>();
+  static_assert(!IsFP8Input || (IsFP8Input && !IsArrayOfPointersGemm),
+                "KernelPtrArrayTmaWarpSpecialized[Cooperative|Pingpong] is only compatible with FP8 FastAccum version right now.");
+
+  // For fp32 types, map to tf32 MMA value type
+  using ElementAMma = cute::conditional_t<cute::is_same_v<ElementA, float>, tfloat32_t, ElementA>;
+  using ElementBMma = cute::conditional_t<cute::is_same_v<ElementB, float>, tfloat32_t, ElementB>;
+
+  static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_ss_tag_to_major_A<ElementAMma, GmemLayoutATag>();
+  static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_ss_tag_to_major_B<ElementBMma, GmemLayoutBTag>();
+
+  static constexpr bool IsCooperative = cute::is_any_of_v<KernelScheduleType,
+                                                          KernelTmaWarpSpecializedCooperative,
+                                                          KernelPtrArrayTmaWarpSpecializedCooperative>;
+  using AtomLayoutMNK = cute::conditional_t<IsCooperative,
+      Layout<Shape<_2,_1,_1>>, Layout<Shape<_1,_1,_1>>>;
+
+  using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector<
+      ElementAMma, ElementBMma, ElementAccumulator, TileShape_MNK, GmmaMajorA, GmmaMajorB>(), AtomLayoutMNK{}));
+  // The TMA atom for A is chosen from the cluster's N extent, and the one for B from its M extent.
+  using GmemTiledCopyA = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{})));
+  using GmemTiledCopyB = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{})));
+
+  using SmemLayoutAtomA = decltype(detail::ss_smem_selector<
+      GmmaMajorA, ElementAMma, decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
+  using SmemLayoutAtomB = decltype(detail::ss_smem_selector<
+      GmmaMajorB, ElementBMma, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
+  // Ptr-array schedules set aside room for two TMA descriptors; that carveout is subtracted from the stage budget.
+  static constexpr size_t TensorMapStorage = IsArrayOfPointersGemm ? sizeof(cute::TmaDescriptor) * 2 /* for A and B */ : 0;
+  static constexpr int KernelSmemCarveout = static_cast<int>(TensorMapStorage);
+
+  static constexpr int PipelineStages = detail::compute_stage_count_or_override<detail::sm90_smem_capacity_bytes - KernelSmemCarveout,
+      ElementAMma, ElementBMma, TileShape_MNK>(StageCountType{});
+  using DispatchPolicy = cute::conditional_t<IsArrayOfPointersGemm,
+      MainloopSm90ArrayTmaGmmaWarpSpecialized<PipelineStages, ClusterShape_MNK, KernelScheduleType>,
+      /* For FP8 use a separate mainloop compared to other datatypes */
+      cute::conditional_t<IsFP8Input,
+          MainloopSm90TmaGmmaWarpSpecializedFP8<PipelineStages, ClusterShape_MNK, KernelScheduleType>,
+          MainloopSm90TmaGmmaWarpSpecialized<PipelineStages, ClusterShape_MNK, KernelScheduleType>>>;
+  // No smem copy atoms are configured for either operand in this builder.
+  using SmemCopyAtomA = void;
+  using SmemCopyAtomB = void;
+
+  using CollectiveOp = CollectiveMma<
+      DispatchPolicy,
+      TileShape_MNK,
+      ElementA,
+      TagToStrideA_t<GmemLayoutATag>,
+      ElementB,
+      TagToStrideB_t<GmemLayoutBTag>,
+      TiledMma,
+      GmemTiledCopyA,
+      SmemLayoutAtomA,
+      SmemCopyAtomA,
+      cute::identity,
+      GmemTiledCopyB,
+      SmemLayoutAtomB,
+      SmemCopyAtomB,
+      cute::identity
+    >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA_TMA_WS_RS: TMA warp-specialized builder using the GMMA rs_op_selector; enabled only when is_use_rmem_A() is true.
+template <
+  class ElementA,
+  class GmemLayoutATag,
+  int AlignmentA,
+  class ElementB,
+  class GmemLayoutBTag,
+  int AlignmentB,
+  class ElementAccumulator,
+  class TileShape_MNK,
+  class ClusterShape_MNK,
+  class StageCountType,
+  class KernelScheduleType
+>
+struct CollectiveBuilder<
+    arch::Sm90,
+    arch::OpClassTensorOp,
+    ElementA,
+    GmemLayoutATag,
+    AlignmentA,
+    ElementB,
+    GmemLayoutBTag,
+    AlignmentB,
+    ElementAccumulator,
+    TileShape_MNK,
+    ClusterShape_MNK,
+    StageCountType,
+    KernelScheduleType,
+    cute::enable_if_t<
+      (cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecialized> ||
+       cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedPingpong> ||
+       cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedCooperative>) &&
+      detail::is_use_rmem_A<ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag>()>
+> {
+  static_assert(is_static<TileShape_MNK>::value);
+  static_assert(is_static<ClusterShape_MNK>::value);
+  static_assert(detail::is_aligned<ElementA, AlignmentA, ElementB, AlignmentB, detail::tma_alignment_bytes>(),
+                "Should meet TMA alignment requirement\n");
+#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
+  static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n");
+#endif
+  static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_rs_tag_to_major_A<GmemLayoutATag>();
+  static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_rs_tag_to_major_B<GmemLayoutBTag>();
+  static constexpr bool SwapAB = detail::is_swapAB<ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag>();
+  static constexpr bool IsWarpSpecializedTransposeB = detail::is_warpspecialized_transpose_B<
+      ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag, KernelScheduleType>();
+
+  // For fp32 types, map to tf32 MMA value type
+  using ElementAMma = cute::conditional_t<cute::is_same_v<ElementA, float>, tfloat32_t, ElementA>;
+  using ElementBMma = cute::conditional_t<cute::is_same_v<ElementB, float>, tfloat32_t, ElementB>;
+
+  using AtomLayoutMNK = cute::conditional_t<cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedCooperative>,
+      Layout<Shape<_2,_1,_1>>, Layout<Shape<_1,_1,_1>>>;
+  // Note: rs_op_selector is instantiated with K-major for both operands, independent of GmmaMajorA/GmmaMajorB.
+  using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::rs_op_selector<
+      ElementAMma, ElementBMma, ElementAccumulator, TileShape_MNK, GMMA::Major::K, GMMA::Major::K>(), AtomLayoutMNK{}));
+
+  using GmemTiledCopyA = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{})));
+  using GmemTiledCopyB = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{})));
+
+  using SmemLayoutAtomA = decltype(detail::rs_smem_selector<GmmaMajorA, ElementAMma,
+      decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{})), IsWarpSpecializedTransposeB>());
+  using SmemLayoutAtomB = decltype(detail::rs_smem_selector<GmmaMajorB, ElementBMma,
+      decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{})), IsWarpSpecializedTransposeB>());
+
+  static constexpr int PipelineStages = detail::compute_stage_count_or_override<detail::sm90_smem_capacity_bytes,
+      ElementAMma, ElementBMma, TileShape_MNK>(StageCountType{});
+
+  using DispatchPolicy = MainloopSm90TmaGmmaRmemAWarpSpecialized<
+      PipelineStages, ClusterShape_MNK, KernelScheduleType>;
+  // Only one operand gets an smem copy atom: B when SwapAB is true, otherwise A.
+  using SmemCopyAtomA = cute::conditional_t<SwapAB, void, Copy_Atom<cute::AutoVectorizingCopy, ElementA>>;
+  using SmemCopyAtomB = cute::conditional_t<SwapAB, Copy_Atom<cute::AutoVectorizingCopy, ElementB>, void>;
+
+  using CollectiveOp = CollectiveMma<
+      DispatchPolicy,
+      TileShape_MNK,
+      ElementA,
+      TagToStrideA_t<GmemLayoutATag>,
+      ElementB,
+      TagToStrideB_t<GmemLayoutBTag>,
+      TiledMma,
+      GmemTiledCopyA,
+      SmemLayoutAtomA,
+      SmemCopyAtomA,
+      cute::identity,
+      GmemTiledCopyB,
+      SmemLayoutAtomB,
+      SmemCopyAtomB,
+      cute::identity
+    >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA_TMA_WS_RS Mixed Scaled GEMM: enabled for the KernelTmaWarpSpecialized[Pingpong|Cooperative]MixedInput schedules.
+template <
+  class ElementPairA_,
+  class GmemLayoutATag_,
+  int AlignmentA,
+  class ElementPairB_,
+  class GmemLayoutBTag_,
+  int AlignmentB,
+  class ElementAccumulator,
+  class TileShape_MNK,
+  class ClusterShape_MNK,
+  class StageCountType,
+  class KernelScheduleType
+>
+struct CollectiveBuilder<
+    arch::Sm90,
+    arch::OpClassTensorOp,
+    ElementPairA_,
+    GmemLayoutATag_,
+    AlignmentA,
+    ElementPairB_,
+    GmemLayoutBTag_,
+    AlignmentB,
+    ElementAccumulator,
+    TileShape_MNK,
+    ClusterShape_MNK,
+    StageCountType,
+    KernelScheduleType,
+    cute::enable_if_t<
+      (cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedMixedInput> ||
+       cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedPingpongMixedInput> ||
+       cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedCooperativeMixedInput>)>
+> {
+  // Slots 0/1/2 of an element pair are read as the element, scale and zero-point types respectively.
+private:
+  using ScaleA = detail::deduce_mixed_width_dtype_t<1, ElementPairA_>;
+  using ScaleB = detail::deduce_mixed_width_dtype_t<1, ElementPairB_>;
+  using ZeroA = detail::deduce_mixed_width_dtype_t<2, ElementPairA_>;
+  using ZeroB = detail::deduce_mixed_width_dtype_t<2, ElementPairB_>;
+  static constexpr bool NeitherIsTuple = !cute::is_tuple<ElementPairA_>::value && !cute::is_tuple<ElementPairB_>::value;
+
+public:
+  using ElementA = detail::deduce_mixed_width_dtype_t<0, ElementPairA_>;
+  using ElementB = detail::deduce_mixed_width_dtype_t<0, ElementPairB_>;
+  static_assert(cute::is_tuple<ElementPairA_>::value ^ cute::is_tuple<ElementPairB_>::value ||
+               (NeitherIsTuple && (sizeof_bits<ElementA>::value != sizeof_bits<ElementB>::value)),
+    "Either A OR B must be a tuple or the widths of A and B must be different.");
+
+  static constexpr bool IsANarrow = sizeof_bits<ElementA>::value < sizeof_bits<ElementB>::value;
+  // Returns its argument unchanged unless it is a cute layout, in which case the layout's stride is returned.
+  template<class T>
+  static auto get_stride(T const& t) {
+    if constexpr (not cute::is_layout<T>::value) {
+      return t;
+    }
+    else {
+      return cute::stride(t);
+    }
+  }
+
+  using GmemLayoutATag = decltype(get_stride(GmemLayoutATag_{}));
+  using GmemLayoutBTag = decltype(get_stride(GmemLayoutBTag_{}));
+  // When neither input is a tuple, the narrower element type is wrapped in a tuple, marking it as the transformed operand.
+  using ElementPairA = cute::conditional_t<IsANarrow && NeitherIsTuple, cute::tuple<ElementA>, ElementPairA_>;
+  using ElementPairB = cute::conditional_t<!IsANarrow && NeitherIsTuple, cute::tuple<ElementB>, ElementPairB_>;
+
+  static constexpr bool IsATransformed = cute::is_tuple<ElementPairA>::value;
+  using ElementScale = cute::conditional_t<IsATransformed, ScaleA, ScaleB>;
+  using ElementZero = cute::conditional_t<IsATransformed, ZeroA, ZeroB>;
+
+  static_assert(is_static<TileShape_MNK>::value);
+  static_assert(is_static<ClusterShape_MNK>::value);
+  static_assert(detail::is_aligned<ElementA, AlignmentA, ElementB, AlignmentB, detail::tma_alignment_bytes>(),
+                "Should meet TMA alignment requirement\n");
+#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
+  static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n");
+#endif
+  static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_rs_tag_to_major_A<GmemLayoutATag>();
+  static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_rs_tag_to_major_B<GmemLayoutBTag>();
+  static constexpr bool IsWarpSpecializedTransposeB = detail::is_warpspecialized_transpose_B<
+      ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag, KernelScheduleType>();
+  static_assert(!IsWarpSpecializedTransposeB, "Mixed input GEMM does not support WS transpose B.");
+
+  // If A is scaled, then we don't need to swap. Otherwise, we must ensure B goes to RF and we must swap the operands.
+  static constexpr bool SwapAB = !IsATransformed;
+
+  // When we relax the above assertion, we must handle setting the tile mma GmmaMajorB correctly.
+  static constexpr cute::GMMA::Major TiledMmaGmmaMajorB = SwapAB ? GmmaMajorA : GmmaMajorB;
+
+  using ElementMma = cute::conditional_t<IsATransformed, ElementB, ElementA>;
+  using AtomLayoutMNK = cute::conditional_t<cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedCooperativeMixedInput>,
+      Layout<Shape<_2,_1,_1>>, Layout<Shape<_1,_1,_1>>>;
+
+  using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::rs_op_selector<
+      ElementMma, ElementMma, ElementAccumulator, TileShape_MNK, GMMA::Major::K, TiledMmaGmmaMajorB>(), AtomLayoutMNK{}));
+
+  using GmemTiledCopyA = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{})));
+  using GmemTiledCopyB = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{})));
+
+  using SmemLayoutAtomA = decltype(detail::rs_smem_selector<GmmaMajorA, ElementA,
+      decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{})), IsWarpSpecializedTransposeB>());
+  using SmemLayoutAtomB = decltype(detail::rs_smem_selector<GmmaMajorB, ElementB,
+      decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{})), IsWarpSpecializedTransposeB>());
+  // Stage sizing is computed on the operand order after the optional A/B swap.
+  using RealElementA = cute::conditional_t<SwapAB, ElementB, ElementA>;
+  using RealElementB = cute::conditional_t<SwapAB, ElementA, ElementB>;
+  static constexpr int PipelineStages = detail::compute_stage_count_or_override_single_affine_transformed_input<detail::sm90_smem_capacity_bytes,
+      RealElementA, RealElementB, ElementScale, ElementZero, TileShape_MNK>(StageCountType{});
+
+  using SmemCopyAtomA = cute::conditional_t<SwapAB, void, Copy_Atom<cute::AutoVectorizingCopy, ElementA>>;
+  using SmemCopyAtomB = cute::conditional_t<SwapAB, Copy_Atom<cute::AutoVectorizingCopy, ElementB>, void>;
+
+  using DispatchPolicy = MainloopSm90TmaGmmaRmemAWarpSpecializedMixedInput<PipelineStages, ClusterShape_MNK, KernelScheduleType>;
+
+  // We pack the scale data with the operand that will be optionally scaled and converted before MMA.
+  using StrideA = cute::conditional_t<cute::is_layout<GmemLayoutATag_>::value, GmemLayoutATag_, TagToStrideA_t<GmemLayoutATag>>;
+  using StrideB = cute::conditional_t<cute::is_layout<GmemLayoutBTag_>::value, GmemLayoutBTag_, TagToStrideB_t<GmemLayoutBTag>>;
+
+  using CollectiveOp = CollectiveMma<
+      DispatchPolicy,
+      TileShape_MNK,
+      ElementPairA,
+      StrideA,
+      ElementPairB,
+      StrideB,
+      TiledMma,
+      GmemTiledCopyA,
+      SmemLayoutAtomA,
+      SmemCopyAtomA,
+      cute::identity,
+      GmemTiledCopyB,
+      SmemLayoutAtomB,
+      SmemCopyAtomB,
+      cute::identity
+    >;
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA_TMA_WS_FP8_FAST_ACCUM_SS: builder for the *FP8FastAccum TMA warp-specialized schedules; FP8 inputs are required.
+template <
+  class ElementA,
+  class GmemLayoutATag,
+  int AlignmentA,
+  class ElementB,
+  class GmemLayoutBTag,
+  int AlignmentB,
+  class ElementAccumulator,
+  class TileShape_MNK,
+  class ClusterShape_MNK,
+  class StageCountType,
+  class KernelScheduleType
+>
+struct CollectiveBuilder<
+    arch::Sm90,
+    arch::OpClassTensorOp,
+    ElementA,
+    GmemLayoutATag,
+    AlignmentA,
+    ElementB,
+    GmemLayoutBTag,
+    AlignmentB,
+    ElementAccumulator,
+    TileShape_MNK,
+    ClusterShape_MNK,
+    StageCountType,
+    KernelScheduleType,
+    cute::enable_if_t<
+      cute::is_any_of_v<KernelScheduleType,
+                        KernelTmaWarpSpecializedFP8FastAccum,
+                        KernelTmaWarpSpecializedPingpongFP8FastAccum,
+                        KernelTmaWarpSpecializedCooperativeFP8FastAccum,
+                        KernelPtrArrayTmaWarpSpecializedCooperativeFP8FastAccum,
+                        KernelPtrArrayTmaWarpSpecializedPingpongFP8FastAccum>>
+> {
+  static_assert(is_static<TileShape_MNK>::value);
+  static_assert(is_static<ClusterShape_MNK>::value);
+  static_assert(detail::is_aligned<ElementA, AlignmentA, ElementB, AlignmentB, detail::tma_alignment_bytes>(),
+                "Not meet TMA alignment requirement yet\n");
+  static_assert(detail::is_input_fp8<ElementA, ElementB>(),
+                "Only FP8 datatypes are compatible with these kernel schedules\n");
+  // Dispatch TN fp8 kernels only to TMA warp specialized FP8 builder
+  static_assert(!detail::is_use_rmem_A<ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag>(),
+                 "Not supported for fp8 non-TN warp specialized kernels yet\n");
+#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
+  static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n");
+#endif
+
+  static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_ss_tag_to_major_A<ElementA, GmemLayoutATag>();
+  static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_ss_tag_to_major_B<ElementB, GmemLayoutBTag>();
+
+  static constexpr bool IsArrayOfPointersGemm = cute::is_any_of_v<KernelScheduleType,
+                                                                   KernelPtrArrayTmaWarpSpecializedCooperativeFP8FastAccum,
+                                                                   KernelPtrArrayTmaWarpSpecializedPingpongFP8FastAccum>;
+
+  static constexpr bool IsCooperative = cute::is_any_of_v<KernelScheduleType,
+                                                          KernelTmaWarpSpecializedCooperativeFP8FastAccum,
+                                                          KernelPtrArrayTmaWarpSpecializedCooperativeFP8FastAccum>;
+
+  using AtomLayoutMNK = cute::conditional_t<IsCooperative, Layout<Shape<_2,_1,_1>>, Layout<Shape<_1,_1,_1>>>;
+
+  using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector<
+      ElementA, ElementB, ElementAccumulator, TileShape_MNK, GmmaMajorA, GmmaMajorB>(), AtomLayoutMNK{}));
+
+  using GmemTiledCopyA = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{})));
+  using GmemTiledCopyB = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{})));
+
+  using SmemLayoutAtomA = decltype(detail::ss_smem_selector<
+      GmmaMajorA, ElementA, decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
+  using SmemLayoutAtomB = decltype(detail::ss_smem_selector<
+      GmmaMajorB, ElementB, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
+  // Ptr-array schedules set aside room for two TMA descriptors; the stage count is computed from the reduced capacity.
+  static constexpr size_t TensorMapStorage = IsArrayOfPointersGemm ? sizeof(cute::TmaDescriptor) * 2 /* for A and B */ : 0;
+  static constexpr int KernelSmemCarveout = static_cast<int>(TensorMapStorage);
+  static constexpr int Sm90ReducedSmemCapacityBytes = detail::sm90_smem_capacity_bytes - KernelSmemCarveout;
+
+  static constexpr int PipelineStages = detail::compute_stage_count_or_override<Sm90ReducedSmemCapacityBytes,
+      ElementA, ElementB, TileShape_MNK>(StageCountType{});
+  using DispatchPolicy = cute::conditional_t<IsArrayOfPointersGemm,
+      MainloopSm90ArrayTmaGmmaWarpSpecialized<PipelineStages, ClusterShape_MNK, KernelScheduleType>,
+      MainloopSm90TmaGmmaWarpSpecialized<PipelineStages, ClusterShape_MNK, KernelScheduleType>>;
+  // No smem copy atoms are configured for either operand in this builder.
+  using SmemCopyAtomA = void;
+  using SmemCopyAtomB = void;
+
+  using CollectiveOp = CollectiveMma<
+      DispatchPolicy,
+      TileShape_MNK,
+      ElementA,
+      TagToStrideA_t<GmemLayoutATag>,
+      ElementB,
+      TagToStrideB_t<GmemLayoutBTag>,
+      TiledMma,
+      GmemTiledCopyA,
+      SmemLayoutAtomA,
+      SmemCopyAtomA,
+      cute::identity,
+      GmemTiledCopyB,
+      SmemLayoutAtomB,
+      SmemCopyAtomB,
+      cute::identity
+    >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA_TMA_SS: builder for the KernelTma schedule using the GMMA ss_op_selector; enabled only when is_use_rmem_A() is false.
+template <
+  class ElementA,
+  class GmemLayoutATag,
+  int AlignmentA,
+  class ElementB,
+  class GmemLayoutBTag,
+  int AlignmentB,
+  class ElementAccumulator,
+  class TileShape_MNK,
+  class ClusterShape_MNK,
+  class StageCountType,
+  class KernelScheduleType
+>
+struct CollectiveBuilder<
+    arch::Sm90,
+    arch::OpClassTensorOp,
+    ElementA,
+    GmemLayoutATag,
+    AlignmentA,
+    ElementB,
+    GmemLayoutBTag,
+    AlignmentB,
+    ElementAccumulator,
+    TileShape_MNK,
+    ClusterShape_MNK,
+    StageCountType,
+    KernelScheduleType,
+    cute::enable_if_t<cute::is_same_v<KernelScheduleType, KernelTma> &&
+                     not detail::is_use_rmem_A<ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag>()>
+> {
+  static_assert(is_static<TileShape_MNK>::value);
+  static_assert(is_static<ClusterShape_MNK>::value);
+  static_assert(detail::is_aligned<ElementA, AlignmentA, ElementB, AlignmentB, detail::tma_alignment_bytes>(),
+                "Should meet TMA alignment requirement\n");
+#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
+  static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n");
+#endif
+
+  // For fp32 types, map to tf32 MMA value type
+  using ElementAMma = cute::conditional_t<cute::is_same_v<ElementA, float>, tfloat32_t, ElementA>;
+  using ElementBMma = cute::conditional_t<cute::is_same_v<ElementB, float>, tfloat32_t, ElementB>;
+
+  static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_ss_tag_to_major_A<ElementAMma, GmemLayoutATag>();
+  static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_ss_tag_to_major_B<ElementBMma, GmemLayoutBTag>();
+  // make_tiled_mma is called with the MMA op only; no explicit atom layout is passed in this builder.
+  using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector<
+      ElementAMma, ElementBMma, ElementAccumulator, TileShape_MNK, GmmaMajorA, GmmaMajorB>()));
+
+  using GmemTiledCopyA = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{})));
+  using GmemTiledCopyB = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{})));
+
+  using SmemLayoutAtomA = decltype(detail::ss_smem_selector<
+      GmmaMajorA, ElementAMma, decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
+  using SmemLayoutAtomB = decltype(detail::ss_smem_selector<
+      GmmaMajorB, ElementBMma, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
+
+  static constexpr int PipelineStages = detail::compute_stage_count_or_override<detail::sm90_smem_capacity_bytes,
+      ElementAMma, ElementBMma, TileShape_MNK>(StageCountType{});
+  using DispatchPolicy = MainloopSm90TmaGmma<PipelineStages, ClusterShape_MNK>;
+
+  using SmemCopyAtomA = void;
+  using SmemCopyAtomB = void;
+
+  using CollectiveOp = CollectiveMma<
+      DispatchPolicy,
+      TileShape_MNK,
+      ElementA,
+      TagToStrideA_t<GmemLayoutATag>,
+      ElementB,
+      TagToStrideB_t<GmemLayoutBTag>,
+      TiledMma,
+      GmemTiledCopyA,
+      SmemLayoutAtomA,
+      SmemCopyAtomA,
+      cute::identity,
+      GmemTiledCopyB,
+      SmemLayoutAtomB,
+      SmemCopyAtomB,
+      cute::identity
+    >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA_CpAsync: deprecated SM90 TensorOp builder, selected when KernelScheduleType is KernelMultistage.
+template <
+  class ElementA,
+  class GmemLayoutATag,
+  int AlignmentA,
+  class ElementB,
+  class GmemLayoutBTag,
+  int AlignmentB,
+  class ElementAccumulator,
+  class TileShape_MNK,
+  class ClusterShape_MNK,
+  class StageCountType,
+  class KernelScheduleType
+>
+struct [[deprecated("Use one of KernelCpAsyncWarpSpecialized schedules instead")]]
+CollectiveBuilder<
+    arch::Sm90,
+    arch::OpClassTensorOp,
+    ElementA,
+    GmemLayoutATag,
+    AlignmentA,
+    ElementB,
+    GmemLayoutBTag,
+    AlignmentB,
+    ElementAccumulator,
+    TileShape_MNK,
+    ClusterShape_MNK,
+    StageCountType,
+    KernelScheduleType,
+    cute::enable_if_t<
+      cute::is_same_v<KernelScheduleType, KernelMultistage>>
+> {
+  // Map to warp-specialized kernels for better performance: forward all arguments, replacing only the schedule.
+  using CollectiveOp = typename CollectiveBuilder<
+    arch::Sm90,
+    arch::OpClassTensorOp,
+    ElementA,
+    GmemLayoutATag,
+    AlignmentA,
+    ElementB,
+    GmemLayoutBTag,
+    AlignmentB,
+    ElementAccumulator,
+    TileShape_MNK,
+    ClusterShape_MNK,
+    StageCountType,
+    KernelCpAsyncWarpSpecialized
+  >::CollectiveOp;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA_CpAsync_WS_SS: cp.async warp-specialized builder used when detail::is_use_rmem_A<...>() is false.
+template <
+  class ElementA,
+  class GmemLayoutATag,
+  int   AlignmentA,
+  class ElementB,
+  class GmemLayoutBTag,
+  int   AlignmentB,
+  class ElementAccumulator,
+  class TileShape_MNK,
+  class ClusterShape_MNK,
+  class StageCountType,
+  class KernelScheduleType
+>
+struct CollectiveBuilder<
+    arch::Sm90,
+    arch::OpClassTensorOp,
+    ElementA,
+    GmemLayoutATag,
+    AlignmentA,
+    ElementB,
+    GmemLayoutBTag,
+    AlignmentB,
+    ElementAccumulator,
+    TileShape_MNK,
+    ClusterShape_MNK,
+    StageCountType,
+    KernelScheduleType,
+    cute::enable_if_t<
+      (cute::is_same_v<KernelScheduleType, KernelCpAsyncWarpSpecialized> ||
+       cute::is_same_v<KernelScheduleType, KernelCpAsyncWarpSpecializedCooperative> ||
+       cute::is_same_v<KernelScheduleType, KernelCpAsyncWarpSpecializedPingpong>) &&
+      not detail::is_use_rmem_A<ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag>()
+    >
+> {
+  static_assert(is_static<TileShape_MNK>::value);
+  static_assert(is_static<ClusterShape_MNK>::value);
+#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
+  static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n");
+#endif
+
+  // For fp32 types, map to tf32 MMA value type
+  using ElementAMma = cute::conditional_t<cute::is_same_v<ElementA, float>, tfloat32_t, ElementA>;
+  using ElementBMma = cute::conditional_t<cute::is_same_v<ElementB, float>, tfloat32_t, ElementB>;
+
+  static_assert(detail::is_aligned<ElementA, AlignmentA, ElementB, AlignmentB, detail::cp_async_min_alignment_bytes>(),
+                "Minimum alignment required for cp.async is 4B.");
+  // NOTE(review): the KernelTma builder in this file passes ElementAMma/ElementBMma to these helpers; confirm ElementA/ElementB is intended here.
+  static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_ss_tag_to_major_A<ElementA, GmemLayoutATag>();
+  static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_ss_tag_to_major_B<ElementB, GmemLayoutBTag>();
+  // Cooperative schedule: 2 MMA atoms along M when tile M >= 128, else 1. All other schedules: a single 1x1x1 atom.
+  using AtomLayoutMNK = cute::conditional_t<cute::is_same_v<KernelScheduleType, KernelCpAsyncWarpSpecializedCooperative>,
+      Layout<Shape<cute::Int<(size<0>(TileShape_MNK{}) < 128) ? 1 : 2>,_1,_1>>, Layout<Shape<_1,_1,_1>>>;
+
+  using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector<
+      ElementAMma, ElementBMma, ElementAccumulator, TileShape_MNK, GmmaMajorA, GmmaMajorB>(), AtomLayoutMNK{}));
+  // Two load warp groups for KernelCpAsyncWarpSpecialized, one for the Cooperative and Pingpong schedules.
+  static constexpr int NumLoadWarpGroups = cute::is_same_v<KernelScheduleType, KernelCpAsyncWarpSpecialized> ? 2 : 1;
+
+  using AlignmentTypeA = cute::uint_byte_t<static_cast<int>(sizeof(ElementA)) * AlignmentA>;
+  using GmemCopyAtomA = cute::Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<AlignmentTypeA>, ElementA>;
+  using GmemTiledCopyA = decltype(detail::make_simt_gmem_tiled_copy<
+      GmemCopyAtomA, NumThreadsPerWarpGroup * NumLoadWarpGroups, AlignmentA, TagToStrideA_t<GmemLayoutATag>,
+      decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
+
+  using AlignmentTypeB = cute::uint_byte_t<static_cast<int>(sizeof(ElementB)) * AlignmentB>;
+  using GmemCopyAtomB = cute::Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<AlignmentTypeB>, ElementB>;
+  using GmemTiledCopyB = decltype(detail::make_simt_gmem_tiled_copy<
+      GmemCopyAtomB, NumThreadsPerWarpGroup * NumLoadWarpGroups, AlignmentB, TagToStrideB_t<GmemLayoutBTag>,
+      decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
+
+  using SmemLayoutAtomA = decltype(detail::ss_smem_selector<
+      GmmaMajorA, ElementAMma, decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
+  using SmemLayoutAtomB = decltype(detail::ss_smem_selector<
+      GmmaMajorB, ElementBMma, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
+
+  static constexpr int PipelineStages = detail::compute_stage_count_or_override<
+      detail::sm90_smem_capacity_bytes, ElementAMma, ElementBMma, TileShape_MNK>(StageCountType{});
+
+  using DispatchPolicy = MainloopSm90CpAsyncGmmaWarpSpecialized<
+      PipelineStages, ClusterShape_MNK, KernelScheduleType>;
+
+  using CollectiveOp = CollectiveMma<
+      DispatchPolicy,
+      TileShape_MNK,
+      ElementA,
+      TagToStrideA_t<GmemLayoutATag>,
+      ElementB,
+      TagToStrideB_t<GmemLayoutBTag>,
+      TiledMma,
+      GmemTiledCopyA,
+      SmemLayoutAtomA,
+      void,
+      cute::identity,
+      GmemTiledCopyB,
+      SmemLayoutAtomB,
+      void,
+      cute::identity
+    >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA_CpAsync_WS_RS: cp.async warp-specialized builder used when detail::is_use_rmem_A<...>() is true.
+template <
+  class ElementA,
+  class GmemLayoutATag,
+  int   AlignmentA,
+  class ElementB,
+  class GmemLayoutBTag,
+  int   AlignmentB,
+  class ElementAccumulator,
+  class TileShape_MNK,
+  class ClusterShape_MNK,
+  class StageCountType,
+  class KernelScheduleType
+>
+struct CollectiveBuilder<
+    arch::Sm90,
+    arch::OpClassTensorOp,
+    ElementA,
+    GmemLayoutATag,
+    AlignmentA,
+    ElementB,
+    GmemLayoutBTag,
+    AlignmentB,
+    ElementAccumulator,
+    TileShape_MNK,
+    ClusterShape_MNK,
+    StageCountType,
+    KernelScheduleType,
+    cute::enable_if_t<
+      (cute::is_same_v<KernelScheduleType, KernelCpAsyncWarpSpecialized> ||
+       cute::is_same_v<KernelScheduleType, KernelCpAsyncWarpSpecializedCooperative> ||
+       cute::is_same_v<KernelScheduleType, KernelCpAsyncWarpSpecializedPingpong>) &&
+      detail::is_use_rmem_A<ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag>()
+    >
+> {
+  static_assert(is_static<TileShape_MNK>::value);
+  static_assert(is_static<ClusterShape_MNK>::value);
+#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
+  static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n");
+#endif
+
+  // For fp32 types, map to tf32 MMA value type
+  using ElementAMma = cute::conditional_t<cute::is_same_v<ElementA, float>, tfloat32_t, ElementA>;
+  using ElementBMma = cute::conditional_t<cute::is_same_v<ElementB, float>, tfloat32_t, ElementB>;
+
+  static_assert(detail::is_aligned<ElementA, AlignmentA, ElementB, AlignmentB, detail::cp_async_min_alignment_bytes>(),
+                "Minimum alignment required for cp.async is 4B.");
+
+  static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_rs_tag_to_major_A<GmemLayoutATag>();
+  static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_rs_tag_to_major_B<GmemLayoutBTag>();
+  static constexpr bool SwapAB = detail::is_swapAB<ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag>();
+  static constexpr bool IsWarpSpecializedTransposeB = detail::is_warpspecialized_transpose_B<
+      ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag, KernelScheduleType>();
+  // Cooperative schedule: 2 MMA atoms along M when tile M >= 128, else 1. All other schedules: a single 1x1x1 atom.
+  using AtomLayoutMNK = cute::conditional_t<cute::is_same_v<KernelScheduleType, KernelCpAsyncWarpSpecializedCooperative>,
+      Layout<Shape<cute::Int<(size<0>(TileShape_MNK{}) < 128) ? 1 : 2>,_1,_1>>, Layout<Shape<_1,_1,_1>>>;
+  // The MMA op is always selected with K-major A and B; GmmaMajorA/GmmaMajorB only feed the smem layout selectors below.
+  using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::rs_op_selector<
+      ElementAMma, ElementBMma, ElementAccumulator, TileShape_MNK, GMMA::Major::K, GMMA::Major::K>(), AtomLayoutMNK{}));
+
+  static constexpr int NumLoadWarpGroups = 1;
+
+  using AlignmentTypeA = cute::uint_byte_t<static_cast<int>(sizeof(ElementA)) * AlignmentA>;
+  using GmemCopyAtomA = cute::Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<AlignmentTypeA>, ElementA>;
+  using GmemTiledCopyA = decltype(detail::make_simt_gmem_tiled_copy<
+      GmemCopyAtomA, NumThreadsPerWarpGroup * NumLoadWarpGroups, AlignmentA, TagToStrideA_t<GmemLayoutATag>,
+      decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
+
+  using AlignmentTypeB = cute::uint_byte_t<static_cast<int>(sizeof(ElementB)) * AlignmentB>;
+  using GmemCopyAtomB = cute::Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<AlignmentTypeB>, ElementB>;  
+  using GmemTiledCopyB = decltype(detail::make_simt_gmem_tiled_copy<
+      GmemCopyAtomB, NumThreadsPerWarpGroup * NumLoadWarpGroups, AlignmentB, TagToStrideB_t<GmemLayoutBTag>,
+      decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
+
+  using SmemLayoutAtomA = decltype(detail::rs_smem_selector<GmmaMajorA, ElementAMma,
+      decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{})), IsWarpSpecializedTransposeB>());
+  using SmemLayoutAtomB = decltype(detail::rs_smem_selector<GmmaMajorB, ElementBMma,
+      decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{})), IsWarpSpecializedTransposeB>());
+
+  static constexpr int PipelineStages = detail::compute_stage_count_or_override<
+      detail::sm90_smem_capacity_bytes, ElementAMma, ElementBMma, TileShape_MNK>(StageCountType{});
+
+  using DispatchPolicy = MainloopSm90CpAsyncGmmaRmemAWarpSpecialized<
+      PipelineStages, ClusterShape_MNK, KernelScheduleType>;
+  // Exactly one operand gets an AutoVectorizingCopy smem copy atom: B when SwapAB is true, otherwise A; the other is void.
+  using SmemCopyAtomA = cute::conditional_t<SwapAB, void, Copy_Atom<cute::AutoVectorizingCopy, ElementA>>;
+  using SmemCopyAtomB = cute::conditional_t<SwapAB, Copy_Atom<cute::AutoVectorizingCopy, ElementB>, void>;
+
+  using CollectiveOp = CollectiveMma<
+      DispatchPolicy,
+      TileShape_MNK,
+      ElementA,
+      TagToStrideA_t<GmemLayoutATag>,
+      ElementB,
+      TagToStrideB_t<GmemLayoutBTag>,
+      TiledMma,
+      GmemTiledCopyA,
+      SmemLayoutAtomA,
+      SmemCopyAtomA,
+      cute::identity,
+      GmemTiledCopyB,
+      SmemLayoutAtomB,
+      SmemCopyAtomB,
+      cute::identity
+    >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA auto kernel schedule: resolves KernelScheduleAuto to a concrete schedule and forwards to that builder.
+template <
+  class ElementA,
+  class GmemLayoutATag,
+  int AlignmentA,
+  class ElementB,
+  class GmemLayoutBTag,
+  int AlignmentB,
+  class ElementAccumulator,
+  class TileShape_MNK,
+  class ClusterShape_MNK,
+  class StageCountType,
+  class KernelScheduleType
+>
+struct CollectiveBuilder<
+    arch::Sm90,
+    arch::OpClassTensorOp,
+    ElementA,
+    GmemLayoutATag,
+    AlignmentA,
+    ElementB,
+    GmemLayoutBTag,
+    AlignmentB,
+    ElementAccumulator,
+    TileShape_MNK,
+    ClusterShape_MNK,
+    StageCountType,
+    KernelScheduleType,
+    cute::enable_if_t<cute::is_same_v<KernelScheduleType, KernelScheduleAuto>>
+> {
+  static_assert(is_static<TileShape_MNK>::value);
+  static_assert(is_static<ClusterShape_MNK>::value);
+#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
+  static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n");
+#endif
+
+using ExtractedElementA = detail::deduce_mixed_width_dtype_t<0, ElementA>;
+using ExtractedElementB = detail::deduce_mixed_width_dtype_t<0, ElementB>;
+// A TMA schedule is chosen below only when both operands satisfy detail::tma_alignment_bytes; otherwise cp.async is used.
+static constexpr bool IsTmaCompatible = detail::is_aligned<
+    ExtractedElementA, AlignmentA, ExtractedElementB, AlignmentB, detail::tma_alignment_bytes>();
+
+// Users opt into scales via the builder by passing a tuple of Elements for the input that will be scaled. We detect
+// scale support if ONLY one of the inputs have tuples to describe them.
+static constexpr bool OnlyOneIsTuple = cute::is_tuple<ElementA>::value ^ cute::is_tuple<ElementB>::value;
+static constexpr bool IsDifferentWidth = sizeof_bits<ExtractedElementA>::value != sizeof_bits<ExtractedElementB>::value;
+static constexpr bool IsMixedWidthInput = IsDifferentWidth || (IsDifferentWidth && OnlyOneIsTuple);
+// NOTE(review): A || (A && B) reduces to A, so IsMixedWidthInput == IsDifferentWidth and OnlyOneIsTuple has no effect.
+#if ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 1)))
+  // Persistent schedules perform best for CUDA Toolkits with version >= 12.1
+  // KernelTmaWarpSpecializedCooperative requires TileShape_M to be at least 128
+  using KernelTmaWarpSpecializedScheduleSameInput = cute::conditional_t<size<0>(TileShape_MNK{}) == Int<64>{},
+      KernelTmaWarpSpecializedPingpong, KernelTmaWarpSpecializedCooperative>;
+
+  using KernelTmaWarpSpecializedScheduleMixedInput = cute::conditional_t<size<0>(TileShape_MNK{}) == Int<64>{},
+      KernelTmaWarpSpecializedPingpongMixedInput, KernelTmaWarpSpecializedCooperativeMixedInput>;
+
+  using KernelTmaWarpSpecializedSchedule = cute::conditional_t<IsMixedWidthInput, KernelTmaWarpSpecializedScheduleMixedInput, KernelTmaWarpSpecializedScheduleSameInput>;
+#else
+  using KernelTmaWarpSpecializedSchedule = cute::conditional_t<IsMixedWidthInput, KernelTmaWarpSpecializedMixedInput, KernelTmaWarpSpecialized>;
+#endif
+
+  // Non-persistent schedule is a safer choice for CpAsync kernels due to register pressure
+  using KernelCpAsyncWarpSpecializedSchedule = KernelCpAsyncWarpSpecialized;
+  using KernelSchedule = cute::conditional_t<IsTmaCompatible, KernelTmaWarpSpecializedSchedule, KernelCpAsyncWarpSpecializedSchedule>;
+  static_assert((cute::is_same_v<KernelSchedule, KernelTmaWarpSpecializedSchedule> && IsMixedWidthInput) || !IsMixedWidthInput, "Only TMA warp specialized kernels are supported for mixed width input.");
+  using CollectiveOp = typename CollectiveBuilder<
+      arch::Sm90,
+      arch::OpClassTensorOp,
+      ElementA,
+      GmemLayoutATag,
+      AlignmentA,
+      ElementB,
+      GmemLayoutBTag,
+      AlignmentB,
+      ElementAccumulator,
+      TileShape_MNK,
+      ClusterShape_MNK,
+      StageCountType,
+      KernelSchedule
+    >::CollectiveOp;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::gemm::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/builders/sm90_sparse_config.inl b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/builders/sm90_sparse_config.inl
new file mode 100755
index 000000000..f9aa7bab2
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/builders/sm90_sparse_config.inl
@@ -0,0 +1,268 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Sparse configs specific for SM90 structured sparse kernels
+*/
+
+
+#pragma once
+
+#include "cute/atom/mma_traits_sm90_gmma.hpp"  // cute::GMMA::Major
+#include "cute/layout.hpp"                     // cute::Layout, cute::Shape, cute::Stride
+#include "cute/numeric/integral_constant.hpp"  // cute::Int
+#include "cute/numeric/numeric_types.hpp"      // cute::sizeof_bits_v
+#include "cute/pointer_sparse.hpp"             // cute::is_sparse
+#include "cute/util/type_traits.hpp"           // cute::is_same_v, cute::conditional_t
+#include "cutlass/fast_math.h"                 // cutlass::round_up
+#include "cutlass/layout/matrix.h"             // cutlass::RowMajor, cutlass::ColumnMajor
+
+namespace cutlass {
+
+using namespace cute;
+
+template<
+  class ElementAMma_,
+  GMMA::Major GmmaMajorA,
+  class ElementEMma_,
+  class MinTileShapeK = Int<32>
+>
+struct Sm90GemmSparseConfig {
+  // Compile-time configuration (element types, layouts, alignment requirements) for sparse tensor A and metadata tensor E.
+  static_assert(cute::is_sparse<ElementAMma_>::value, "ElementAMma MUST be sparse elem");
+  static_assert(cute::is_sparse<ElementEMma_>::value, "ElementEMma MUST be sparse elem");
+
+  // A
+  using ElementAMma         = ElementAMma_;
+  using ElementAMmaRaw      = typename ElementAMma::raw_type;
+  using ElementAMmaSparsity = Int<ElementAMma::sparsity>;
+
+  // Metadata (E)
+  using ElementEMma         = ElementEMma_;
+  using ElementEMmaRaw      = typename ElementEMma::raw_type;
+  using ElementEMmaSparsity = Int<ElementEMma::sparsity>;
+
+  // MMA type
+  static constexpr bool IsQmma = cute::is_same_v<ElementAMmaRaw, float_e4m3_t> && ElementAMmaSparsity{} == _2{} ||
+                                  cute::is_same_v<ElementAMmaRaw, float_e5m2_t> && ElementAMmaSparsity{} == _2{};
+  static constexpr bool IsImma = cute::is_same_v<ElementAMmaRaw, int8_t> && ElementAMmaSparsity{} == _2{} ||
+                                 cute::is_same_v<ElementAMmaRaw, uint8_t> && ElementAMmaSparsity{} == _2{};
+  static constexpr bool IsHmma = cute::is_same_v<ElementAMmaRaw, half_t> && ElementAMmaSparsity{} == _2{} ||
+                                 cute::is_same_v<ElementAMmaRaw, bfloat16_t> && ElementAMmaSparsity{} == _2{};
+  static constexpr bool IsTfmma = cute::is_same_v<ElementAMmaRaw, tfloat32_t> && ElementAMmaSparsity{} == _2{} ||
+                                  cute::is_same_v<ElementAMmaRaw, float> && ElementAMmaSparsity{} == _2{};
+  static_assert(int(IsQmma) + int(IsImma) + int(IsHmma) + int(IsTfmma) == 1, "Ambigious Input Type Config (failed to choose MMA type)");
+
+  // Number of ElementARaw stored in ElementAMmaRaw. For Hopper this is always 1.
+  using ElemsARawPerElementAMmaRaw = _1;
+
+  // ElementA Sparsity Ratio
+  using ElementASparsity = ElementAMmaSparsity;
+  static_assert(ElementASparsity{} == _2{}, "ElementASparsity must be 2 for Hopper Sparse Gemm");
+
+  // Logical/Physical ElementA per Chunk
+  using LogicalElemsAPerChunk = conditional_t<IsTfmma, _2, _4>;
+  using PhysicalElemsAPerChunk = Int<LogicalElemsAPerChunk{} / ElementASparsity{}>;
+
+  // Metadata Bits
+  using ElementEBitsPerChunk = _4;
+  using ElementEBitsPerElementAMma = cute::conditional_t<IsTfmma, _4, _2>;
+
+  // Metadata Layout. Units are the corresponding logical elements.
+  // Basic metadata block is (16,64) for 8-bit, (16,32) for 16-bit, (16,16) for 32-bit data types.
+  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#sparse-wgmma-metadata-64n32-f16bf16
+  // Tensor E layout atom stacks 4 basic blocks along M mode to align with WGMMA instruction shape and
+  // stacks 1-4 blocks along K mode and reorders memory layout to allow for vectorized loads from smem.
+  using BlockK = Int<512 / sizeof_bits_v<ElementAMmaRaw>>;
+  static_assert(MinTileShapeK{} % BlockK{} == 0, "MinTileShapeK must be a multiple of BlockK");
+  using NumK = decltype(MinTileShapeK{} / BlockK{});
+
+  using TensorEAtom_32bit = decltype(make_ordered_layout(Shape<Shape<_8,_2,_4>, Shape<_8,_2,NumK>>{},
+                                                         Step <Step <_3,_1,_5>, Step <_0,_4,  _2>>{}));
+
+  using TensorEAtom_16bit = decltype(make_ordered_layout(Shape<Shape<_8,_2,_4>, Shape<_16,_2,NumK>>{},
+                                                         Step <Step <_3,_1,_5>, Step < _0,_4,  _2>>{}));
+
+  using TensorEAtom_8bit  = decltype(make_ordered_layout(Shape<_64,MinTileShapeK>{},
+                                                         Step < _1,           _0>{}));
+
+  using TensorEAtom = cute::conditional_t<(IsQmma || IsImma),  TensorEAtom_8bit,
+                      cute::conditional_t<IsTfmma, TensorEAtom_32bit,
+                      TensorEAtom_16bit>>;
+
+  // Logical elems that construct the atomK for tensorE/A.
+  using TensorEAtomK = Int<size<1>(TensorEAtom{})>;
+  using TensorEAtomM = Int<size<0>(TensorEAtom{})>;
+
+  // Tensor E alignment requirements
+  using TensorEAlignmentM = TensorEAtomM;
+  using TensorEAlignmentK = TensorEAtomK;
+
+  // Tensor A alignment requirements
+  // When A is MN major, TensorAAlignmentK needs to be a multiple of the chunk size
+  // When A is K major, TensorAAlignmentK needs to be a multiple of the TMA requirement times tensorA sparsity
+  //   this is because TensorACompressed needs to satisfy TMA requirements
+  using TensorAAlignmentK = cute::conditional_t<GmmaMajorA == GMMA::Major::MN,
+                                                LogicalElemsAPerChunk,
+                                                Int<128 / cute::sizeof_bits_v<ElementAMma>>>;
+
+  // When A is MN Major, TensorAAlignmentM needs to be a multiple of the TMA requirement
+  // When A is K Major, no requirements on TensorAAlignmentM.
+  using TensorAAlignmentM = cute::conditional_t<GmmaMajorA == GMMA::Major::MN,
+                                                Int<128 / cute::sizeof_bits_v<ElementAMmaRaw> * ElemsARawPerElementAMmaRaw{}>,
+                                                _1>;
+
+  // The following two functions let users determine the layout types (static parts fixed, dynamic extents unset)
+  CUTE_HOST_DEVICE
+  static constexpr auto
+  deduce_layoutA() {
+    using LayoutMMajor = Layout<Shape <int32_t,
+                                       Shape<ElementASparsity, int32_t>,
+                                       int32_t>,
+                                Stride<ElementASparsity,
+                                       Stride<_1, int64_t>,
+                                       int64_t>>;
+
+    using LayoutKMajor = Layout<Shape <int32_t,
+                                       Shape<ElementASparsity, int32_t>,
+                                       int32_t>,
+                                Stride<int64_t,
+                                       Stride<_1, ElementASparsity>,
+                                       int64_t>>;
+
+    if constexpr (GmmaMajorA == GMMA::Major::MN) {
+      return LayoutMMajor{};
+    }
+    else {
+      return LayoutKMajor{};
+    }
+  }
+
+  CUTE_HOST_DEVICE
+  static constexpr auto
+  deduce_layoutE() {
+    return make_layout(
+      make_shape(make_shape(shape<0>(TensorEAtom{}), int32_t(0)),
+                 make_shape(shape<1>(TensorEAtom{}), int32_t(0)),
+                 int32_t(0)),
+      make_stride(make_stride(stride<0>(TensorEAtom{}), cute::Int<cute::cosize(TensorEAtom{})>{}),
+                  make_stride(stride<1>(TensorEAtom{}), int64_t(0)),
+                  int64_t(0))
+    );
+  }
+
+  // This function converts a CuTe sparse tensorA layout back to a Cutlass layout tag (RowMajor/ColumnMajor)
+  template <class ShapeA, class StrideA>
+  CUTE_HOST_DEVICE
+  static constexpr auto
+  deduce_layoutA_tag(Layout<ShapeA, StrideA> layout_a) {
+    /*
+      (m, (2, k/2), l) : (2, (1, m*2), m*k) M-major
+      (m, (2, k/2), l) : (k, (1, 2), m*k) K-major
+    */
+    // Check if the given layout_a is possibly a sparse tensorA layout.
+    static_assert(rank_v<ShapeA> == 3 && depth_v<ShapeA> == 2, "Rank and depth mismatch with the sparse tensorA's layout.");
+    static_assert(rank(get<1>(ShapeA{})) == 2 && rank(flatten(ShapeA{})) == 4,
+                  "Not likely to be a sparse tensorA's layout.");
+    static_assert(get<1,0>(StrideA{}) == 1 && get<1,0>(ShapeA{}) == ElementASparsity{},
+                  "Not likely to be a sparse tensorA's layout.");
+    static_assert(get<0>(StrideA{}) == ElementASparsity{} || get<1,1>(StrideA{}) == ElementASparsity{},
+                  "Not likely to be a sparse tensorA's layout.");
+
+    if constexpr (get<0>(StrideA{}) == ElementASparsity{}) {
+      return cutlass::layout::ColumnMajor{};
+    }
+    else {
+      return  cutlass::layout::RowMajor{};
+    }
+  }
+
+  // Fill tensor A layout from dynamic problem shape
+  template <class ProblemShape>
+  CUTE_HOST_DEVICE
+  static constexpr auto
+  fill_layoutA(ProblemShape problem_shape) {
+
+    const auto [M, N, K, L] = problem_shape;
+
+    // Round up to satisfy TensorA Alignment requirement
+    const auto M_AlignedAC = cutlass::round_up(M, TensorAAlignmentM{});
+    const auto K_AlignedAC = cutlass::round_up(K, TensorAAlignmentK{});
+    // Batch stride: widen to int64_t BEFORE multiplying so M*K is computed in 64 bits even for 32-bit extents.
+    if constexpr (GmmaMajorA == GMMA::Major::MN) {
+      return make_layout(
+        make_shape(int32_t(M_AlignedAC),
+                   make_shape(ElementASparsity{}, int32_t(K_AlignedAC) / ElementASparsity{}),
+                   int32_t(L)),
+        make_stride(ElementASparsity{},
+                    make_stride(_1{}, int64_t(M_AlignedAC) * ElementASparsity{}),
+                    (L == 1) ? int64_t(0) : int64_t(M_AlignedAC) * int64_t(K_AlignedAC))
+      );
+    }
+    else {
+      return make_layout(
+        make_shape(int32_t(M_AlignedAC),
+                   make_shape(ElementASparsity{}, int32_t(K_AlignedAC / ElementASparsity{})),
+                   int32_t(L)),
+        make_stride(int64_t(K_AlignedAC),
+                    make_stride(_1{}, ElementASparsity{}),
+                    (L == 1) ? int64_t(0) : int64_t(M_AlignedAC) * int64_t(K_AlignedAC))
+      );
+    }
+  }
+
+  // Fill tensor E layout from dynamic problem shape
+  template <class ProblemShape>
+  CUTE_HOST_DEVICE
+  static constexpr auto
+  fill_layoutE(ProblemShape problem_shape) {
+    const auto [M, N, K, L] = problem_shape;
+
+    // Round up to satisfy TensorEAlignment requirement
+    const auto M_AlignedE = cutlass::round_up(M, TensorEAlignmentM{});
+    const auto K_AlignedE = cutlass::round_up(K, TensorEAlignmentK{});
+
+    // TensorEAtom first along m-dim, then along k-dim, then along batch
+    static_assert(TensorEAlignmentM{} == TensorEAtomM{}, "Shape below assumes TensorEAlignmentM == TensorEAtomM");
+    static_assert(TensorEAlignmentK{} == TensorEAtomK{}, "Shape below assumes TensorEAlignmentK == TensorEAtomK");
+    // K-block and batch strides: widen to int64_t BEFORE multiplying so the products are computed in 64 bits.
+    return make_layout(
+      make_shape(make_shape(shape<0>(TensorEAtom{}), int32_t(M_AlignedE / TensorEAtomM{})),
+                 make_shape(shape<1>(TensorEAtom{}), int32_t(K_AlignedE / TensorEAtomK{})),
+                 int32_t(L)),
+      make_stride(make_stride(stride<0>(TensorEAtom{}), cute::Int<cute::cosize(TensorEAtom{})>{}),
+                  make_stride(stride<1>(TensorEAtom{}), int64_t(M_AlignedE) * TensorEAtomK{}),
+                  (L == 1) ? int64_t(0) : int64_t(M_AlignedE) * int64_t(K_AlignedE))
+    );
+  }
+};
+
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/builders/sm90_sparse_gmma_builder.inl b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/builders/sm90_sparse_gmma_builder.inl
new file mode 100755
index 000000000..9b608fe02
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/builders/sm90_sparse_gmma_builder.inl
@@ -0,0 +1,388 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/gemm/collective/builders/sm90_common.inl"
+#include "cutlass/gemm/collective/builders/sm90_sparse_config.inl"
+
+// SM90 collective builders require CUDA 12.0 or newer; builders in this file static_assert when this macro is absent
+#if (__CUDACC_VER_MAJOR__ >= 12)
+#define CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
+#endif
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+// Manual override through StageCount<stages>: the requested count is returned as-is; smem capacity is not consulted.
+template<int CapacityBytes, class ElementAMma, class ElementB, class ElementEMma, class TileShapeMNK, int stages>
+constexpr int
+compute_stage_count_or_override_sparse(StageCount<stages> /*tag only*/) {
+  return stages;
+}
+
+// Manual override through cute::Int<stages>: the requested count is returned as-is; smem capacity is not consulted.
+template<int CapacityBytes, class ElementAMma, class ElementB, class ElementEMma, class TileShapeMNK, int stages>
+constexpr int
+compute_stage_count_or_override_sparse(cute::Int<stages> /*tag only*/) {
+  return stages;
+}
+
+// Automatic stage count: how many per-stage smem footprints fit into CapacityBytes once carveout_bytes is set aside.
+template<int CapacityBytes, class ElementAMma, class ElementB, class ElementEMma, class TileShapeMNK, int carveout_bytes>
+constexpr int
+compute_stage_count_or_override_sparse(StageCountAutoCarveout<carveout_bytes> /*tag only*/) {
+  constexpr auto tile_m = size<0>(TileShapeMNK{});
+  constexpr auto tile_n = size<1>(TileShapeMNK{});
+  constexpr auto tile_k = size<2>(TileShapeMNK{});
+  // One stage holds an A tile (M x K), a B tile (N x K), an E tile (M x K) and one stage of mainloop pipeline storage
+  constexpr int bytes_a = cutlass::bits_to_bytes(cute::sizeof_bits_v<ElementAMma> * tile_m * tile_k);
+  constexpr int bytes_b = cutlass::bits_to_bytes(cute::sizeof_bits_v<ElementB> * tile_n * tile_k);
+  constexpr int bytes_e = cutlass::bits_to_bytes(cute::sizeof_bits_v<ElementEMma> * tile_m * tile_k);
+  constexpr int bytes_pipeline = static_cast<int>(sizeof(typename cutlass::PipelineTmaAsync<1>::SharedStorage));
+  constexpr int stage_bytes = bytes_a + bytes_b + bytes_e + bytes_pipeline;
+
+  return (CapacityBytes - carveout_bytes) / stage_bytes;
+}
+
+} // namespace detail
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA_TMA_WS_SS_SPARSE: sparse builder for the KernelTmaWarpSpecialized{,Pingpong,Cooperative} schedules, enabled when detail::is_use_rmem_A<...>() is false
+template <
+  class ElementA,
+  class GmemLayoutATag,
+  int AlignmentA,
+  class ElementB,
+  class GmemLayoutBTag,
+  int AlignmentB,
+  class ElementAccumulator,
+  class TileShape_MNK,
+  class ClusterShape_MNK,
+  class StageCountType,
+  class KernelScheduleType
+>
+struct CollectiveBuilder<
+    arch::Sm90,
+    arch::OpClassSparseTensorOp,
+    ElementA,
+    GmemLayoutATag,
+    AlignmentA,
+    ElementB,
+    GmemLayoutBTag,
+    AlignmentB,
+    ElementAccumulator,
+    TileShape_MNK,
+    ClusterShape_MNK,
+    StageCountType,
+    KernelScheduleType,
+    cute::enable_if_t<
+      (cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecialized> ||
+       cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedPingpong> ||
+       cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedCooperative>) &&
+       not detail::is_use_rmem_A<ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag>()>
+> {
+  static_assert(is_static<TileShape_MNK>::value);
+  static_assert(is_static<ClusterShape_MNK>::value);
+#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
+  static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n");
+#endif
+  static_assert(detail::is_aligned<ElementA, AlignmentA, ElementB, AlignmentB, detail::tma_alignment_bytes>(),
+                "Should meet TMA alignment requirement\n");
+
+  static constexpr bool IsFP8Input = detail::is_input_fp8<ElementA, ElementB>();
+  static_assert(!IsFP8Input, "FP8 sparse collective currently only supports FastAccum schedules");
+
+  // For fp32 types, map to tf32 MMA value type
+  using ElementAMmaRaw = cute::conditional_t<cute::is_same_v<ElementA, float>, tfloat32_t, ElementA>;
+  using ElementBMma    = cute::conditional_t<cute::is_same_v<ElementB, float>, tfloat32_t, ElementB>;
+
+  static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_ss_tag_to_major_A<ElementAMmaRaw, GmemLayoutATag>();
+  static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_ss_tag_to_major_B<ElementBMma, GmemLayoutBTag>();
+  // The cooperative schedule uses a 2x1x1 MMA atom layout; the other schedules use 1x1x1
+  using AtomLayoutMNK = cute::conditional_t<
+      cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedCooperative>,
+      Layout<Shape<_2,_1,_1>>, Layout<Shape<_1,_1,_1>>>;
+
+  using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector_sparse<
+      ElementAMmaRaw, ElementBMma, ElementAccumulator, TileShape_MNK, GmmaMajorA, GmmaMajorB>(), AtomLayoutMNK{}));
+  // A/E value types and A's sparsity come from the selected tiled MMA; the sparse config's last argument is min(tile K, 128)
+  using ElementAMma = typename TiledMma::ValTypeA;
+  using ElementAMmaSparsity = Int<ElementAMma::sparsity>;
+  using ElementEMma = typename TiledMma::ValTypeE;
+  using SparseConfig = cutlass::Sm90GemmSparseConfig<ElementAMma, GmmaMajorA, ElementEMma, 
+                                                     decltype(cute::min(size<2>(TileShape_MNK{}),_128{}))>;
+  // Layouts of A and E are deduced by the sparse config and passed to CollectiveMma as a pair in its StrideA position
+  using LayoutA = decltype(SparseConfig::deduce_layoutA());
+  using LayoutE = decltype(SparseConfig::deduce_layoutE());
+  using LayoutPairAE = decltype(cute::make_tuple(LayoutA{}, LayoutE{}));
+  // The TMA atom for A is selected from cluster mode 1 (N extent), the one for B from cluster mode 0 (M extent)
+  using GmemTiledCopyA = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{})));
+  using GmemTiledCopyB = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{})));
+
+  using SmemLayoutAtomA = decltype(detail::ss_smem_selector_sparse<
+      GmmaMajorA, ElementAMmaRaw, decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{})), ElementAMmaSparsity>());
+  using SmemLayoutAtomB = decltype(detail::ss_smem_selector<
+      GmmaMajorB, ElementBMma, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
+
+  static constexpr int PipelineStages = detail::compute_stage_count_or_override_sparse<detail::sm90_smem_capacity_bytes,
+      ElementAMma, ElementBMma, ElementEMma, TileShape_MNK>(StageCountType{});
+  using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedSparse<PipelineStages, ClusterShape_MNK, KernelScheduleType>;
+
+  using SmemCopyAtomA = void; 
+  using SmemCopyAtomB = void; 
+
+  using CollectiveOp = CollectiveMma<
+      DispatchPolicy,
+      TileShape_MNK,
+      ElementA,
+      LayoutPairAE,
+      ElementB,
+      TagToStrideB_t<GmemLayoutBTag>,
+      TiledMma,
+      GmemTiledCopyA,
+      SmemLayoutAtomA,
+      SmemCopyAtomA,
+      cute::identity,
+      GmemTiledCopyB,
+      SmemLayoutAtomB,
+      SmemCopyAtomB,
+      cute::identity
+    >;
+};
+
+// GMMA_TMA_WS_SS_FP8_FAST_ACCUM_SPARSE: sparse builder for the KernelTmaWarpSpecialized{,Pingpong,Cooperative}FP8FastAccum schedules; FP8 inputs are enforced by static_assert
+template <
+  class ElementA,
+  class GmemLayoutATag,
+  int AlignmentA,
+  class ElementB,
+  class GmemLayoutBTag,
+  int AlignmentB,
+  class ElementAccumulator,
+  class TileShape_MNK,
+  class ClusterShape_MNK,
+  class StageCountType,
+  class KernelScheduleType
+>
+struct CollectiveBuilder<
+    arch::Sm90,
+    arch::OpClassSparseTensorOp,
+    ElementA,
+    GmemLayoutATag,
+    AlignmentA,
+    ElementB,
+    GmemLayoutBTag,
+    AlignmentB,
+    ElementAccumulator,
+    TileShape_MNK,
+    ClusterShape_MNK,
+    StageCountType,
+    KernelScheduleType,
+    cute::enable_if_t<
+      (cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedFP8FastAccum> ||
+       cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedPingpongFP8FastAccum> ||
+       cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedCooperativeFP8FastAccum>)>
+> {
+  static_assert(is_static<TileShape_MNK>::value);
+  static_assert(is_static<ClusterShape_MNK>::value);
+  static_assert(detail::is_aligned<ElementA, AlignmentA, ElementB, AlignmentB, detail::tma_alignment_bytes>(),
+                "Should meet TMA alignment requirement\n");
+  static_assert(detail::is_input_fp8<ElementA, ElementB>(),
+                "Only FP8 datatypes are compatible with these kernel schedules\n");
+#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
+  static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n");
+#endif
+  // ElementA/ElementB are used directly here; this builder performs no float -> tfloat32_t remapping
+  static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_ss_tag_to_major_A<ElementA, GmemLayoutATag>();
+  static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_ss_tag_to_major_B<ElementB, GmemLayoutBTag>();
+  // The cooperative FastAccum schedule uses a 2x1x1 MMA atom layout; the other schedules use 1x1x1
+  using AtomLayoutMNK = cute::conditional_t<
+      cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedCooperativeFP8FastAccum>,
+      Layout<Shape<_2,_1,_1>>, Layout<Shape<_1,_1,_1>>>;
+
+  using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector_sparse<
+      ElementA, ElementB, ElementAccumulator, TileShape_MNK, GmmaMajorA, GmmaMajorB>(), AtomLayoutMNK{}));
+  // A/E value types and A's sparsity come from the selected tiled MMA; the sparse config's last argument is min(tile K, 128)
+  using ElementAMma = typename TiledMma::ValTypeA;
+  using ElementAMmaSparsity = Int<ElementAMma::sparsity>;
+  using ElementEMma = typename TiledMma::ValTypeE;
+  using SparseConfig = cutlass::Sm90GemmSparseConfig<ElementAMma, GmmaMajorA, ElementEMma, 
+                                                     decltype(cute::min(size<2>(TileShape_MNK{}),_128{}))>;
+  // Layouts of A and E are deduced by the sparse config and passed to CollectiveMma as a pair in its StrideA position
+  using LayoutA = decltype(SparseConfig::deduce_layoutA());
+  using LayoutE = decltype(SparseConfig::deduce_layoutE());
+  using LayoutPairAE = decltype(cute::make_tuple(LayoutA{}, LayoutE{}));
+  // The TMA atom for A is selected from cluster mode 1 (N extent), the one for B from cluster mode 0 (M extent)
+  using GmemTiledCopyA = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{})));
+  using GmemTiledCopyB = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{})));
+
+  using SmemLayoutAtomA = decltype(detail::ss_smem_selector_sparse<
+      GmmaMajorA, ElementA, decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{})), ElementAMmaSparsity>());
+  using SmemLayoutAtomB = decltype(detail::ss_smem_selector<
+      GmmaMajorB, ElementB, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
+
+  static constexpr int PipelineStages = detail::compute_stage_count_or_override_sparse<detail::sm90_smem_capacity_bytes,
+      ElementAMma, ElementB, ElementEMma, TileShape_MNK>(StageCountType{});
+  using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedSparse<PipelineStages, ClusterShape_MNK, KernelScheduleType>;
+
+  using SmemCopyAtomA = void; 
+  using SmemCopyAtomB = void; 
+
+  using CollectiveOp = CollectiveMma<
+      DispatchPolicy,
+      TileShape_MNK,
+      ElementA,
+      LayoutPairAE,
+      ElementB,
+      TagToStrideB_t<GmemLayoutBTag>,
+      TiledMma,
+      GmemTiledCopyA,
+      SmemLayoutAtomA,
+      SmemCopyAtomA,
+      cute::identity,
+      GmemTiledCopyB,
+      SmemLayoutAtomB,
+      SmemCopyAtomB,
+      cute::identity
+    >;
+};
+
+// GMMA_TMA_WS_RS_SPARSE: same three schedules as the SS sparse builder, but enabled when detail::is_use_rmem_A<...>() is true; not implemented, so instantiating it hits the static_assert below
+template <
+  class ElementA,
+  class GmemLayoutATag,
+  int AlignmentA,
+  class ElementB,
+  class GmemLayoutBTag,
+  int AlignmentB,
+  class ElementAccumulator,
+  class TileShape_MNK,
+  class ClusterShape_MNK,
+  class StageCountType,
+  class KernelScheduleType
+>
+struct CollectiveBuilder<
+    arch::Sm90,
+    arch::OpClassSparseTensorOp,
+    ElementA,
+    GmemLayoutATag,
+    AlignmentA,
+    ElementB,
+    GmemLayoutBTag,
+    AlignmentB,
+    ElementAccumulator,
+    TileShape_MNK,
+    ClusterShape_MNK,
+    StageCountType,
+    KernelScheduleType,
+    cute::enable_if_t<
+      (cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecialized> ||
+       cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedPingpong> ||
+       cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedCooperative>) &&
+       detail::is_use_rmem_A<ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag>()>
+> {
+  static_assert(cutlass::detail::dependent_false<ElementA>, "Mainloop with sparse A sourced from RF is not implemented.");
+};
+
+// Sparse GMMA auto kernel schedule: resolves KernelScheduleAuto to a concrete schedule and reuses that schedule's builder
+template <
+  class ElementA,
+  class GmemLayoutATag,
+  int AlignmentA,
+  class ElementB,
+  class GmemLayoutBTag,
+  int AlignmentB,
+  class ElementAccumulator,
+  class TileShape_MNK,
+  class ClusterShape_MNK,
+  class StageCountType,
+  class KernelScheduleType
+>
+struct CollectiveBuilder<
+    arch::Sm90,
+    arch::OpClassSparseTensorOp,
+    ElementA,
+    GmemLayoutATag,
+    AlignmentA,
+    ElementB,
+    GmemLayoutBTag,
+    AlignmentB,
+    ElementAccumulator,
+    TileShape_MNK,
+    ClusterShape_MNK,
+    StageCountType,
+    KernelScheduleType,
+    cute::enable_if_t<cute::is_same_v<KernelScheduleType, KernelScheduleAuto>>
+> {
+  static_assert(is_static<TileShape_MNK>::value);
+  static_assert(is_static<ClusterShape_MNK>::value);
+#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
+  static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n");
+#endif
+
+  static constexpr bool IsFP8Input = detail::is_input_fp8<ElementA, ElementB>();
+  // Tile M == 64 selects a pingpong schedule, any other tile M a cooperative one; FP8 inputs pick the FP8FastAccum variant
+  using KernelSchedule = cute::conditional_t<size<0>(TileShape_MNK{}) == Int<64>{},
+                                             cute::conditional_t<IsFP8Input,
+                                                                 KernelTmaWarpSpecializedPingpongFP8FastAccum,
+                                                                 KernelTmaWarpSpecializedPingpong>,
+                                             cute::conditional_t<IsFP8Input,
+                                                                 KernelTmaWarpSpecializedCooperativeFP8FastAccum,
+                                                                 KernelTmaWarpSpecializedCooperative>>;
+  // Delegate to the builder specialization for the chosen schedule, forwarding every other parameter unchanged
+  using CollectiveOp = typename CollectiveBuilder<
+      arch::Sm90,
+      arch::OpClassSparseTensorOp,
+      ElementA,
+      GmemLayoutATag,
+      AlignmentA,
+      ElementB,
+      GmemLayoutBTag,
+      AlignmentB,
+      ElementAccumulator,
+      TileShape_MNK,
+      ClusterShape_MNK,
+      StageCountType,
+      KernelSchedule
+    >::CollectiveOp;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::gemm::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/collective_builder.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/collective_builder.hpp
new file mode 100755
index 000000000..ccd8d8b3c
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/collective_builder.hpp
@@ -0,0 +1,42 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+#include "cutlass/gemm/collective/collective_mma_decl.hpp"
+#include "cutlass/gemm/collective/collective_mma.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "cutlass/gemm/collective/collective_builder_decl.hpp"
+#include "cutlass/gemm/collective/builders/sm90_gmma_builder.inl"
+#include "cutlass/gemm/collective/builders/sm90_sparse_gmma_builder.inl"
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/collective_builder_decl.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/collective_builder_decl.hpp
new file mode 100755
index 000000000..c0570d37a
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/collective_builder_decl.hpp
@@ -0,0 +1,88 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/numeric/integral_constant.hpp>
+#include <cutlass/detail/dependent_false.hpp>
+
+namespace cutlass::gemm::collective {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Stage-count tags: StageCount<N> fixes the stage count at N; StageCountAutoCarveout (below) requests automatic computation
+template<int num_stages>
+struct StageCount {
+  StageCount() = default;
+  explicit StageCount(cute::Int<num_stages>) {}
+
+  static constexpr int value = num_stages;
+};
+
+template<int carveout_bytes>
+struct StageCountAutoCarveout {  // Tag type requesting an automatically computed stage count
+  StageCountAutoCarveout() = default;
+  explicit StageCountAutoCarveout(cute::Int<carveout_bytes>) {}
+  // Number of bytes carried by this tag as the carveout amount
+  static constexpr int bytes = carveout_bytes;
+};
+
+using StageCountAuto = StageCountAutoCarveout<0>;  // automatic stage count with a zero-byte carveout
+
+// Tag type that lets the builder pick the kernel schedule automatically.
+// To choose a schedule explicitly, pass one of the kernel schedule tags from cutlass/gemm/dispatch_policy.hpp instead.
+struct KernelScheduleAuto final {};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  class ArchTag,
+  class OpClass,
+  class ElementA,
+  class GmemLayoutA,
+  int AlignmentA,
+  class ElementB,
+  class GmemLayoutB,
+  int AlignmentB,
+  class ElementAccumulator,
+  class TileShape_MNK,
+  class ClusterShape_MNK,
+  class StageCountType,
+  class KernelScheduleType,
+  class Enable = void  // constraint slot: partial specializations place their cute::enable_if_t condition here
+>
+struct CollectiveBuilder {  // Primary template: chosen only when no partial specialization matches the given parameters
+  static_assert(cutlass::detail::dependent_false<ElementA>, "Could not build a collective for given parameters.");  // unlike sizeof(ElementA) == 0, stays well-formed for void/incomplete ElementA
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::gemm::collective
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/collective_mma.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/collective_mma.hpp
new file mode 100755
index 000000000..103da9af7
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/collective_mma.hpp
@@ -0,0 +1,49 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/gemm/collective/collective_mma_decl.hpp"
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "cutlass/gemm/collective/sm70_mma_twostage.hpp"
+#include "cutlass/gemm/collective/sm80_mma_multistage.hpp"
+#include "cutlass/gemm/collective/sm90_mma_multistage_gmma_ss_warpspecialized.hpp"
+#include "cutlass/gemm/collective/sm90_mma_multistage_gmma_rs_warpspecialized.hpp"
+#include "cutlass/gemm/collective/sm90_mma_tma_gmma_ss.hpp"
+#include "cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized.hpp"
+#include "cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized_mixed_input.hpp"
+#include "cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp"
+#include "cutlass/gemm/collective/sm90_sparse_mma_tma_gmma_ss_warpspecialized.hpp"
+#include "cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized.hpp"
+#include "cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8.hpp"
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/collective_mma_decl.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/collective_mma_decl.hpp
new file mode 100755
index 000000000..feef54962
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/collective_mma_decl.hpp
@@ -0,0 +1,64 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cute/numeric/integral_constant.hpp>
+#include <cutlass/detail/dependent_false.hpp>
+
+namespace cutlass::gemm::collective {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  class DispatchPolicy,
+  class TileShape,
+  class ElementA,
+  class StrideA,
+  class ElementB,
+  class StrideB,
+  class TiledMma,
+  class GmemTiledCopyA,
+  class SmemLayoutAtomA,
+  class SmemCopyAtomA,
+  class TransformA,
+  class GmemTiledCopyB,
+  class SmemLayoutAtomB,
+  class SmemCopyAtomB,
+  class TransformB
+>
+struct CollectiveMma {  // Primary template only: mainloop implementations are expected to be supplied as specializations
+  static_assert(cutlass::detail::dependent_false<ElementA>, "Could not find a mainloop specialization.");
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::gemm::collective
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/fp8_accumulation.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/fp8_accumulation.hpp
new file mode 100755
index 000000000..374fffafc
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/fp8_accumulation.hpp
@@ -0,0 +1,121 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include "cute/algorithm/clear.hpp"
+#include "cute/tensor.hpp"
+
+//////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////FP8 Accumulation///////////////////////////
+//////////////////////////////////////////////////////////////////////////////
+/// Promotes (adds) the results from the tensor core accumulators to the main
+/// accumulators once the number of executed MMAs reaches the promotion
+/// interval specified by the user; after that the tensor core accumulators
+/// are zeroed.
+//////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective {
+
+template <
+    class EngineAccum,
+    class LayoutAccum>
+struct GmmaFP8Accumulation {  // two-level accumulator: operator()() exposes accum_temp_, which is periodically added into accum_
+ using TensorAccum = cute::Tensor<EngineAccum, LayoutAccum>;
+
+  static_assert(is_static<LayoutAccum>::value, "Accumulator Layout should be static");
+  static_assert(is_rmem<TensorAccum>::value , "Accumulator tensor must be rmem resident.");
+
+private:
+  TensorAccum& accum_;      // main accumulator; a reference to the tensor passed to the constructor
+  TensorAccum accum_temp_;  // temporary accumulator returned by operator()()
+
+  uint32_t accum_promotion_interval_;         // MMA count at which accum_temp_ is promoted into accum_ (tested with == in promote_if_needed).
+  uint32_t mma_count_per_mainloop_iteration_; // num of MMAs per k_tile of mainloop
+  uint32_t mma_count_;                        // MMAs counted since the last promotion
+  uint32_t reset_accum_flag_;                 // nonzero when the latest promote_if_needed() promoted, i.e. accum needs to be zeroed.
+
+  CUTLASS_DEVICE
+  void promote_core() {  // adds accum_temp_ into accum_ element by element
+    warpgroup_wait<0>();  // presumably waits for in-flight warpgroup MMAs so accum_temp_ is final -- confirm against cute's definition
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < size(accum_); ++i) {
+      accum_(i) += accum_temp_(i);
+    }
+  }
+
+public:
+  CUTLASS_DEVICE
+  GmmaFP8Accumulation(
+      TensorAccum &accum,
+      uint32_t accum_promotion_interval,
+      uint32_t mma_count_per_mainloop_iteration)
+      : accum_(accum), 
+        accum_promotion_interval_(accum_promotion_interval),
+        mma_count_per_mainloop_iteration_(mma_count_per_mainloop_iteration),
+        mma_count_(0), 
+        reset_accum_flag_(0) 
+  {
+    accum_temp_ = cute::make_fragment_like(accum);  // fragment modeled on accum; it is not cleared here
+  }
+
+  CUTLASS_DEVICE 
+  TensorAccum& operator()() {  // returns the temporary accumulator, not the main one
+    return accum_temp_;
+  }
+
+  /// Returns the flag set by the latest promote_if_needed() call; this function itself modifies no accumulator.
+  CUTLASS_DEVICE
+  bool prepare_if_needed() { 
+    return reset_accum_flag_;
+  }
+
+  /// Counts one mainloop iteration; when the count equals the promotion interval exactly, adds accum_temp_ into accum_ and restarts the count.
+  CUTLASS_DEVICE
+  void promote_if_needed() {
+    mma_count_ += mma_count_per_mainloop_iteration_;
+    reset_accum_flag_ = __shfl_sync(0xffffffff, mma_count_ == accum_promotion_interval_, 0);  // every lane adopts lane 0's comparison result
+    if (reset_accum_flag_) {
+      promote_core();
+      mma_count_ = 0;
+    }
+  }
+
+  /// Adds accum_temp_ into accum_ if any MMAs were counted since the last promotion (lane 0's count decides for the warp).
+  CUTLASS_DEVICE
+  void promote_residue_if_needed() {
+    if (__shfl_sync(0xffffffff, mma_count_ > 0, 0)) {
+      promote_core();
+    }
+  }
+};
+
+} // namespace cutlass::gemm::collective
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm70_mma_twostage.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm70_mma_twostage.hpp
new file mode 100755
index 000000000..3d9e03edf
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm70_mma_twostage.hpp
@@ -0,0 +1,597 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+
+#include "cute/algorithm/functional.hpp"
+#include "cute/atom/mma_atom.hpp"
+#include "cute/algorithm/gemm.hpp"
+#include "cute/atom/mma_atom.hpp"
+#include "cute/tensor_predicate.hpp"
+#include "cutlass/gemm/collective/collective_mma_decl.hpp"
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective {
+using namespace cute;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  class TileShape_,
+  class ElementA_,
+  class StrideA_,
+  class ElementB_,
+  class StrideB_,
+  class TiledMma_,
+  class GmemTiledCopyA_,
+  class SmemLayoutAtomA_,
+  class SmemCopyAtomA_,
+  class TransformA_,
+  class GmemTiledCopyB_,
+  class SmemLayoutAtomB_,
+  class SmemCopyAtomB_,
+  class TransformB_>
+struct CollectiveMma<
+    MainloopSm70TwoStageUnpredicated,
+    TileShape_,
+    ElementA_,
+    StrideA_,
+    ElementB_,
+    StrideB_,
+    TiledMma_,
+    GmemTiledCopyA_,
+    SmemLayoutAtomA_,
+    SmemCopyAtomA_,
+    TransformA_,
+    GmemTiledCopyB_,
+    SmemLayoutAtomB_,
+    SmemCopyAtomB_,
+    TransformB_>
+{
+  // Two-stage SM70 mainloop without bounds predication: operator() discards residue_mnk and
+  // moves every tile with plain copy() (gmem -> rmem -> smem -> rmem) before the register gemm.
+  // Type Aliases
+  using DispatchPolicy = MainloopSm70TwoStageUnpredicated;
+  using TileShape = TileShape_;
+  using ElementA = ElementA_;
+  using StrideA = StrideA_;
+  using ElementB = ElementB_;
+  using StrideB = StrideB_;
+  using TiledMma = TiledMma_;
+  using ElementAccumulator = typename TiledMma::ValTypeC;
+  using GmemTiledCopyA = GmemTiledCopyA_;
+  using GmemTiledCopyB = GmemTiledCopyB_;
+  using SmemLayoutAtomA = SmemLayoutAtomA_;
+  using SmemLayoutAtomB = SmemLayoutAtomB_;
+  using SmemCopyAtomA = SmemCopyAtomA_;
+  using SmemCopyAtomB = SmemCopyAtomB_;
+  using TransformA = TransformA_;
+  using TransformB = TransformB_;
+  using ArchTag = typename DispatchPolicy::ArchTag;
+
+  static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+  static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+  using SmemLayoutA = decltype(tile_to_shape(
+      SmemLayoutAtomA{},
+      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}))));
+  using SmemLayoutB = decltype(tile_to_shape(
+      SmemLayoutAtomB{},
+      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}))));
+
+  struct SharedStorage
+  {
+    cute::array_aligned<ElementA, cute::cosize_v<SmemLayoutA>> smem_a;
+    cute::array_aligned<ElementB, cute::cosize_v<SmemLayoutB>> smem_b;
+  };
+
+  // Host side kernel arguments
+  struct Arguments {
+    ElementA const* ptr_A;
+    StrideA dA;
+    ElementB const* ptr_B;
+    StrideB dB;
+  };
+
+  // Device side kernel params (the same type as the host-side Arguments)
+  using Params = Arguments;
+
+  //
+  // Methods
+  //
+
+  CollectiveMma() = default;
+
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& _, Arguments const& args, void* workspace) {
+    (void) workspace;  // unused: args is returned unchanged because Params is Arguments
+    return args;
+  }
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate
+  template <
+    class FrgTensorD,
+    class TensorA,
+    class TensorB,
+    class FrgTensorC,
+    class KTileIterator,
+    class ResidueMNK
+  >
+  CUTLASS_DEVICE void
+  operator() (
+      FrgTensorD &accum,                            // register tensor passed as the gemm's D operand
+      TensorA gA,                                   // global-memory A; its last mode is indexed by k-tile
+      TensorB gB,                                   // global-memory B; its last mode is indexed by k-tile
+      FrgTensorC const &src_accum,                  // register tensor passed as the gemm's C operand
+      KTileIterator k_tile_iter, int k_tile_count,  // first k-tile and number of k-tiles to process
+      ResidueMNK residue_mnk,                       // ignored by this unpredicated policy
+      int thread_idx,
+      char *smem_buf)                               // reinterpreted below as SharedStorage
+  {
+    using namespace cute;
+
+    (void)residue_mnk;
+
+    static_assert(is_rmem<FrgTensorD>::value, "D tensor must be rmem resident.");
+    static_assert(is_gmem<TensorA>::value, "A tensor must be gmem resident.");
+    static_assert(is_gmem<TensorB>::value, "B tensor must be gmem resident.");
+    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
+    static_assert(cute::rank(SmemLayoutA{}) == 2,
+      "MainloopTwoStage must not have a smem shape with a pipeline mode.");
+    static_assert(cute::rank(SmemLayoutB{}) == 2,
+      "MainloopTwoStage must not have a smem shape with a pipeline mode.");
+
+    // Construct shared memory tiles
+    SharedStorage& storage = *reinterpret_cast<SharedStorage*>(smem_buf);
+    Tensor sA = make_tensor(make_smem_ptr(storage.smem_a.data()), SmemLayoutA{}); // (BLK_M,BLK_K)
+    Tensor sB = make_tensor(make_smem_ptr(storage.smem_b.data()), SmemLayoutB{}); // (BLK_N,BLK_K)
+
+    // Partition the copying of A and B tiles across the threads
+    GmemTiledCopyA gmem_tiled_copy_a;
+    GmemTiledCopyB gmem_tiled_copy_b;
+    auto copy_a_thr = gmem_tiled_copy_a.get_slice(thread_idx);
+    auto copy_b_thr = gmem_tiled_copy_b.get_slice(thread_idx);
+
+    Tensor tAgA = copy_a_thr.partition_S(gA);                                  // (ACPY,ACPY_M,ACPY_K,k)
+    Tensor tAsA = copy_a_thr.partition_D(sA);                                  // (ACPY,ACPY_M,ACPY_K)
+    Tensor tBgB = copy_b_thr.partition_S(gB);                                  // (BCPY,BCPY_N,BCPY_K,k)
+    Tensor tBsB = copy_b_thr.partition_D(sB);                                  // (BCPY,BCPY_N,BCPY_K)
+
+    // Allocate the register tiles for double buffering -- same shape as partitioned data
+    Tensor tArA = make_fragment_like(tAsA);                                    // (ACPY,ACPY_M,ACPY_K)
+    Tensor tBrB = make_fragment_like(tBsB);                                    // (BCPY,BCPY_N,BCPY_K)
+
+    // Tile MMA compute thread partitions and allocate accumulators
+    TiledMma tiled_mma;
+    auto thr_mma = tiled_mma.get_thread_slice(thread_idx);
+    Tensor tCrA  = thr_mma.partition_fragment_A(sA);                           // (MMA,MMA_M,MMA_K)
+    Tensor tCrB  = thr_mma.partition_fragment_B(sB);                           // (MMA,MMA_N,MMA_K)
+
+    CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(accum));                     // MMA_M
+    CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(src_accum));                 // MMA_M
+    CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(accum));                     // MMA_N
+    CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(src_accum));                 // MMA_N
+    CUTE_STATIC_ASSERT_V(size<2>(tCrA) == size<2>(tCrB));                      // MMA_K
+
+    //
+    // Copy Atom retiling
+    //
+
+    auto thr_copy_A       = make_tiled_copy_A(SmemCopyAtomA{}, tiled_mma).get_thread_slice(thread_idx);
+    Tensor tCsA           = thr_copy_A.partition_S(sA);
+    Tensor tCrA_copy_view = thr_copy_A.retile_D(tCrA);
+    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(tCrA_copy_view));            // M
+
+    auto thr_copy_B       = make_tiled_copy_B(SmemCopyAtomB{}, tiled_mma).get_thread_slice(thread_idx);
+    Tensor tCsB           = thr_copy_B.partition_S(sB);
+    Tensor tCrB_copy_view = thr_copy_B.retile_D(tCrB);
+    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<1>(tCrB_copy_view));            // N
+
+    //
+    // Prologue
+    //
+
+    // Copy gmem to rmem for the first k_tile
+    copy(gmem_tiled_copy_a, tAgA(_,_,_,*k_tile_iter), tArA);
+    copy(gmem_tiled_copy_b, tBgB(_,_,_,*k_tile_iter), tBrB);
+    if (--k_tile_count > 0) ++k_tile_iter;
+    // Copy rmem to smem
+    copy(tArA, tAsA);
+    copy(tBrB, tBsB);
+    // Barrier so every thread's smem writes above are complete (no accumulator is cleared here)
+    __syncthreads();
+
+    // Load A, B smem->rmem for k=0
+    copy(tCsA(_,_,0), tCrA_copy_view(_,_,0));
+    copy(tCsB(_,_,0), tCrB_copy_view(_,_,0));
+    //
+    // Mainloop
+    //
+
+    // Size of the k-tile's outer product mode (k)
+    auto K_BLOCK_MAX = size<2>(tCrA);
+
+    CUTLASS_PRAGMA_NO_UNROLL
+    while (k_tile_count > -1)  // one pass per k-tile: the prologue already decremented k_tile_count once
+    {
+      // Pipeline the outer products with a static for loop
+      for_each(make_int_sequence<K_BLOCK_MAX>{}, [&] (auto k_block)
+      {
+        if (k_block == K_BLOCK_MAX - 1)
+        {
+          __syncthreads();  // all smem->rmem reads of the current tile are issued before smem is overwritten
+
+          // Copy rmem to smem
+          copy(tArA, tAsA);
+          copy(tBrB, tBsB);
+          __syncthreads();  // the tile just stored must be complete before the smem->rmem load below
+        }
+
+        // Load A, B smem->rmem for k+1 (wraps to block 0, read from the tile just stored, on the last k_block)
+        int k_block_next = (k_block + Int<1>{}) % K_BLOCK_MAX;     // static
+        copy(tCsA(_,_,k_block_next), tCrA_copy_view(_,_,k_block_next));
+        copy(tCsB(_,_,k_block_next), tCrB_copy_view(_,_,k_block_next));
+        if (k_block == 0)
+        {
+          // Copy gmem to rmem for the next k-tile (the iterator stops advancing at the last tile, which is then re-read)
+          copy(gmem_tiled_copy_a, tAgA(_,_,_,*k_tile_iter), tArA);
+          copy(gmem_tiled_copy_b, tBgB(_,_,_,*k_tile_iter), tBrB);
+          if (--k_tile_count > 0) ++k_tile_iter;
+        }
+
+        // transform before compute
+        cute::transform(tCrA(_,_,k_block), TransformA{});
+        cute::transform(tCrB(_,_,k_block), TransformB{});
+
+        // Thread-level register gemm for k
+        // disambiguate gemm (shared with the namespace name)
+        cute::gemm(tiled_mma, accum, tCrA(_,_,k_block), tCrB(_,_,k_block), src_accum);
+      });
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  class TileShape_,
+  class ElementA_,
+  class StrideA_,
+  class ElementB_,
+  class StrideB_,
+  class TiledMma_,
+  class GmemTiledCopyA_,
+  class SmemLayoutAtomA_,
+  class SmemCopyAtomA_,
+  class TransformA_,
+  class GmemTiledCopyB_,
+  class SmemLayoutAtomB_,
+  class SmemCopyAtomB_,
+  class TransformB_>
+struct CollectiveMma<
+    MainloopSm70TwoStage,
+    TileShape_,
+    ElementA_,
+    StrideA_,
+    ElementB_,
+    StrideB_,
+    TiledMma_,
+    GmemTiledCopyA_,
+    SmemLayoutAtomA_,
+    SmemCopyAtomA_,
+    TransformA_,
+    GmemTiledCopyB_,
+    SmemLayoutAtomB_,
+    SmemCopyAtomB_,
+    TransformB_>
+{
+  // Two-stage SM70 mainloop with bounds predication: gmem loads go through copy_if, guarded by
+  // per-row M/N predicates and a k-residue check on the first k-tile (see operator()).
+  // Type Aliases
+  using DispatchPolicy = MainloopSm70TwoStage;
+  using TileShape = TileShape_;
+  using ElementA = ElementA_;
+  using StrideA = StrideA_;
+  using ElementB = ElementB_;
+  using StrideB = StrideB_;
+  using TiledMma = TiledMma_;
+  using ElementAccumulator = typename TiledMma::ValTypeC;
+  using GmemTiledCopyA = GmemTiledCopyA_;
+  using GmemTiledCopyB = GmemTiledCopyB_;
+  using SmemLayoutAtomA = SmemLayoutAtomA_;
+  using SmemLayoutAtomB = SmemLayoutAtomB_;
+  using SmemCopyAtomA = SmemCopyAtomA_;
+  using SmemCopyAtomB = SmemCopyAtomB_;
+  using TransformA = TransformA_;
+  using TransformB = TransformB_;
+  using ArchTag = typename DispatchPolicy::ArchTag;
+
+  static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+  static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+  using SmemLayoutA = decltype(tile_to_shape(
+      SmemLayoutAtomA{},
+      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}))));
+  using SmemLayoutB = decltype(tile_to_shape(
+      SmemLayoutAtomB{},
+      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}))));
+
+  struct SharedStorage
+  {
+    cute::array_aligned<ElementA, cute::cosize_v<SmemLayoutA>> smem_a;
+    cute::array_aligned<ElementB, cute::cosize_v<SmemLayoutB>> smem_b;
+  };
+
+  // Host side kernel arguments
+  struct Arguments {
+    ElementA const* ptr_A;
+    StrideA dA;
+    ElementB const* ptr_B;
+    StrideB dB;
+  };
+
+  // Device side kernel params (the same type as the host-side Arguments)
+  using Params = Arguments;
+
+  //
+  // Methods
+  //
+
+  CollectiveMma() = default;
+
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& _, Arguments const& args, void* workspace) {
+    (void) workspace;  // unused: args is returned unchanged because Params is Arguments
+    return args;
+  }
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate
+  template <
+    class FrgTensorD,
+    class TensorA,
+    class TensorB,
+    class FrgTensorC,
+    class KTileIterator,
+    class ResidueMNK
+  >
+  CUTLASS_DEVICE void
+  operator() (
+      FrgTensorD &accum,                            // register tensor passed as the gemm's D operand
+      TensorA gA,                                   // global-memory A; its last mode is indexed by k-tile
+      TensorB gB,                                   // global-memory B; its last mode is indexed by k-tile
+      FrgTensorC const &src_accum,                  // register tensor passed as the gemm's C operand
+      KTileIterator k_tile_iter, int k_tile_count,  // first k-tile and number of k-tiles to process
+      ResidueMNK residue_mnk,                       // (m, n, k) bounds used to build the load predicates
+      int thread_idx,
+      char *smem_buf)                               // reinterpreted below as SharedStorage
+  {
+    using namespace cute;
+
+    static_assert(is_rmem<FrgTensorD>::value, "D tensor must be rmem resident.");
+    static_assert(is_gmem<TensorA>::value, "A tensor must be gmem resident.");
+    static_assert(is_gmem<TensorB>::value, "B tensor must be gmem resident.");
+    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
+    static_assert(cute::rank(SmemLayoutA{}) == 2,
+      "MainloopTwoStage must not have a smem shape with a pipeline mode.");
+    static_assert(cute::rank(SmemLayoutB{}) == 2,
+      "MainloopTwoStage must not have a smem shape with a pipeline mode.");
+
+    // Construct shared memory tiles
+    SharedStorage& storage = *reinterpret_cast<SharedStorage*>(smem_buf);
+    Tensor sA = make_tensor(make_smem_ptr(storage.smem_a.data()), SmemLayoutA{}); // (BLK_M,BLK_K)
+    Tensor sB = make_tensor(make_smem_ptr(storage.smem_b.data()), SmemLayoutB{}); // (BLK_N,BLK_K)
+
+    // Shift tensor so residue_k is at origin (Can't read any k_coord < residue_k)
+    // This aligns the tensor with BLK_K for all but the 0th k_tile
+    gA.data() = &gA(0, get<2>(residue_mnk), 0);
+    gB.data() = &gB(0, get<2>(residue_mnk), 0);
+
+    // Partition the copying of A and B tiles across the threads
+    GmemTiledCopyA gmem_tiled_copy_a;
+    GmemTiledCopyB gmem_tiled_copy_b;
+    auto gmem_thr_copy_a = gmem_tiled_copy_a.get_slice(thread_idx);
+    auto gmem_thr_copy_b = gmem_tiled_copy_b.get_slice(thread_idx);
+
+    Tensor tAgA = gmem_thr_copy_a.partition_S(gA);                             // (ACPY,ACPY_M,ACPY_K,k)
+    Tensor tAsA = gmem_thr_copy_a.partition_D(sA);                             // (ACPY,ACPY_M,ACPY_K)
+    Tensor tBgB = gmem_thr_copy_b.partition_S(gB);                             // (BCPY,BCPY_N,BCPY_K,k)
+    Tensor tBsB = gmem_thr_copy_b.partition_D(sB);                             // (BCPY,BCPY_N,BCPY_K)
+
+    // Allocate the register tiles for double buffering -- same shape as partitioned data
+    Tensor tArA = make_fragment_like(tAsA);                                    // (ACPY,ACPY_M,ACPY_K)
+    Tensor tBrB = make_fragment_like(tBsB);                                    // (BCPY,BCPY_N,BCPY_K)
+
+    //
+    // PREDICATES
+    //
+
+    // Allocate predicate tensors for m and n (stride 0 along k: one flag per row, shared by every k)
+    Tensor tApA = make_tensor<bool>(make_shape(size<1>(tAsA), size<2>(tAsA)), Stride<_1,_0>{});
+    Tensor tBpB = make_tensor<bool>(make_shape(size<1>(tBsB), size<2>(tBsB)), Stride<_1,_0>{});
+
+    // Construct identity layout for sA and sB
+    Tensor cA = make_identity_tensor(make_shape(size<0>(sA), size<1>(sA)));    // (BLK_M,BLK_K) -> (blk_m,blk_k)
+    Tensor cB = make_identity_tensor(make_shape(size<0>(sB), size<1>(sB)));    // (BLK_N,BLK_K) -> (blk_n,blk_k)
+
+    // Repeat the partitioning with identity layouts
+    Tensor tAcA = gmem_thr_copy_a.partition_S(cA);                             // (ACPY,ACPY_M,ACPY_K) -> (blk_m,blk_k)
+    Tensor tBcB = gmem_thr_copy_b.partition_S(cB);                             // (BCPY,BCPY_N,BCPY_K) -> (blk_n,blk_k)
+
+    // Set predicates for m bounds
+    CUTLASS_PRAGMA_UNROLL
+    for (int m = 0; m < size<0>(tApA); ++m) {
+      tApA(m,0) = get<0>(tAcA(0,m,0)) < get<0>(residue_mnk);  // blk_m coord < residue_m
+    }
+    // Set predicates for n bounds
+    CUTLASS_PRAGMA_UNROLL
+    for (int n = 0; n < size<0>(tBpB); ++n) {
+      tBpB(n,0) = get<0>(tBcB(0,n,0)) < get<1>(residue_mnk);  // blk_n coord < residue_n
+    }
+
+    //
+    // PREFETCH
+    //
+
+    // Clear the rmem tiles to account for predicated off loads
+    clear(tArA);
+    clear(tBrB);
+
+    // Load the 0th k-tile gmem->rmem; this is the only tile where the k residue is checked
+    {
+      Tensor tAgAk = tAgA(_,_,_,*k_tile_iter);
+      CUTLASS_PRAGMA_UNROLL
+      for (int k = 0; k < size<2>(tArA); ++k) {
+        if (get<1>(tAcA(0,0,k)) >= -get<2>(residue_mnk)) {      // blk_k coord >= -residue_k (gA shifted); presumably residue_k <= 0 -- confirm with caller
+          copy_if(gmem_tiled_copy_a, tApA(_,k), tAgAk(_,_,k), tArA(_,_,k));
+        }
+      }
+      Tensor tBgBk = tBgB(_,_,_,*k_tile_iter);
+      CUTLASS_PRAGMA_UNROLL
+      for (int k = 0; k < size<2>(tBrB); ++k) {
+        if (get<1>(tBcB(0,0,k)) >= -get<2>(residue_mnk)) {      // blk_k coord >= -residue_k (gB shifted); presumably residue_k <= 0 -- confirm with caller
+          copy_if(gmem_tiled_copy_b, tBpB(_,k), tBgBk(_,_,k), tBrB(_,_,k));
+        }
+      }
+      ++k_tile_iter;
+      --k_tile_count;
+    }
+
+    // Tile MMA compute thread partitions and allocate accumulators
+    TiledMma tiled_mma;
+    auto thr_mma = tiled_mma.get_thread_slice(thread_idx);
+    Tensor tCrA  = thr_mma.make_fragment_A(thr_mma.partition_A(sA));           // (MMA,MMA_M,MMA_K)
+    Tensor tCrB  = thr_mma.make_fragment_B(thr_mma.partition_B(sB));           // (MMA,MMA_N,MMA_K)
+
+    CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(accum));                     // MMA_M
+    CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(src_accum));                 // MMA_M
+    CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(accum));                     // MMA_N
+    CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(src_accum));                 // MMA_N
+    CUTE_STATIC_ASSERT_V(size<2>(tCrA) == size<2>(tCrB));                      // MMA_K
+
+    //
+    // Copy Atom retiling
+    //
+
+    auto thr_copy_A       = make_tiled_copy_A(SmemCopyAtomA{}, tiled_mma).get_thread_slice(thread_idx);
+    Tensor tCsA           = thr_copy_A.partition_S(sA);
+    Tensor tCrA_copy_view = thr_copy_A.retile_D(tCrA);
+    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(tCrA_copy_view));            // M
+
+    auto thr_copy_B       = make_tiled_copy_B(SmemCopyAtomB{}, tiled_mma).get_thread_slice(thread_idx);
+    Tensor tCsB           = thr_copy_B.partition_S(sB);
+    Tensor tCrB_copy_view = thr_copy_B.retile_D(tCrB);
+    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<1>(tCrB_copy_view));            // N
+
+    //
+    // Prologue
+    //
+
+    // Copy rmem to smem
+    copy(tArA, tAsA);
+    copy(tBrB, tBsB);
+    // Barrier so every thread's smem writes above are complete (no accumulator is cleared here)
+    __syncthreads();
+
+    // Load A, B smem->rmem for k=0
+    copy(tCsA(_,_,0), tCrA_copy_view(_,_,0));
+    copy(tCsB(_,_,0), tCrB_copy_view(_,_,0));
+    //
+    // Mainloop
+    //
+
+    // Size of the k-tile's outer product mode (k)
+    auto K_BLOCK_MAX = size<2>(tCrA);
+
+    CUTLASS_PRAGMA_NO_UNROLL
+    while (k_tile_count > -1)  // one pass per k-tile: the prefetch block already decremented k_tile_count once
+    {
+      // Pipeline the outer products with a static for loop
+      for_each(make_int_sequence<K_BLOCK_MAX>{}, [&] (auto k_block)
+      {
+        if (k_block == K_BLOCK_MAX - 1)
+        {
+          __syncthreads();  // all smem->rmem reads of the current tile are issued before smem is overwritten
+
+          // Copy rmem to smem
+          copy(tArA, tAsA);
+          copy(tBrB, tBsB);
+          __syncthreads();  // the tile just stored must be complete before the smem->rmem load below
+        }
+
+        // Load A, B smem->rmem for k+1 (wraps to block 0, read from the tile just stored, on the last k_block)
+        int k_block_next = (k_block + Int<1>{}) % K_BLOCK_MAX;    // static
+        copy(tCsA(_,_,k_block_next), tCrA_copy_view(_,_,k_block_next));
+        copy(tCsB(_,_,k_block_next), tCrB_copy_view(_,_,k_block_next));
+        if (k_block == 0)
+        {
+          if (k_tile_count <= 0) {  // no k-tiles left: switch every predicate off so the copy_if calls below are fully masked
+            clear(tApA);
+            clear(tBpB);
+          }
+          copy_if(gmem_tiled_copy_a, tApA, tAgA(_,_,_,*k_tile_iter), tArA);
+          copy_if(gmem_tiled_copy_b, tBpB, tBgB(_,_,_,*k_tile_iter), tBrB);
+          ++k_tile_iter;
+          --k_tile_count;
+        }
+
+        // transform before compute
+        cute::transform(tCrA(_,_,k_block), TransformA{});
+        cute::transform(tCrB(_,_,k_block), TransformB{});
+
+        // Thread-level register gemm for k
+        // disambiguate gemm (shared with the namespace name)
+        cute::gemm(tiled_mma, accum, tCrA(_,_,k_block), tCrB(_,_,k_block), src_accum);
+      });
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::gemm::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm80_mma_multistage.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm80_mma_multistage.hpp
new file mode 100755
index 000000000..a129b56e3
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm80_mma_multistage.hpp
@@ -0,0 +1,707 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+
+#include "cute/algorithm/functional.hpp"
+#include "cute/atom/mma_atom.hpp"
+#include "cute/algorithm/gemm.hpp"
+#include "cute/tensor_predicate.hpp"
+#include "cute/numeric/arithmetic_tuple.hpp"
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective {
+using namespace cute;
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  int Stages,
+  class TileShape_,
+  class ElementA_,
+  class StrideA_,
+  class ElementB_,
+  class StrideB_,
+  class TiledMma_,
+  class GmemTiledCopyA_,
+  class SmemLayoutAtomA_,
+  class SmemCopyAtomA_,
+  class TransformA_,
+  class GmemTiledCopyB_,
+  class SmemLayoutAtomB_,
+  class SmemCopyAtomB_,
+  class TransformB_>
+struct CollectiveMma<
+    MainloopSm80CpAsyncUnpredicated<Stages>,
+    TileShape_,
+    ElementA_,
+    StrideA_,
+    ElementB_,
+    StrideB_,
+    TiledMma_,
+    GmemTiledCopyA_,
+    SmemLayoutAtomA_,
+    SmemCopyAtomA_,
+    TransformA_,
+    GmemTiledCopyB_,
+    SmemLayoutAtomB_,
+    SmemCopyAtomB_,
+    TransformB_
+  >
+{
+  // Unpredicated multistage mainloop: every gmem->smem copy is issued without bounds
+  // checks (residue_mnk is ignored), through a Stages-deep shared-memory pipeline.
+  // Type Aliases
+  using DispatchPolicy = MainloopSm80CpAsyncUnpredicated<Stages>;
+  using TileShape = TileShape_;
+  using ElementA = ElementA_;
+  using StrideA = StrideA_;
+  using ElementB = ElementB_;
+  using StrideB = StrideB_;
+  using TiledMma = TiledMma_;
+  using ElementAccumulator = typename TiledMma::ValTypeC;
+  using GmemTiledCopyA = GmemTiledCopyA_;
+  using GmemTiledCopyB = GmemTiledCopyB_;
+  using SmemLayoutAtomA = SmemLayoutAtomA_;
+  using SmemLayoutAtomB = SmemLayoutAtomB_;
+  using SmemCopyAtomA = SmemCopyAtomA_;
+  using SmemCopyAtomB = SmemCopyAtomB_;
+  using TransformA = TransformA_;
+  using TransformB = TransformB_;
+  using ArchTag = typename DispatchPolicy::ArchTag;
+  // Follow the change in TestSmall: TileShape is being switched to CtaShape.
+  // For the sm80 arch, CtaShape should equal TileShape.
+  using CtaShape_MNK = TileShape;
+
+  static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+  static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  // Tile the smem layout atoms to (BLK_M or BLK_N, BLK_K, Stages); the last mode selects the pipeline stage.
+  using SmemLayoutA = decltype(tile_to_shape(
+      SmemLayoutAtomA{},
+      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{})));
+  using SmemLayoutB = decltype(tile_to_shape(
+      SmemLayoutAtomB{},
+      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{})));
+
+  static_assert(DispatchPolicy::Stages >= 2, "CpAsync mainloop must have at least 2 stages in the pipeline.");
+  // Backing storage for all pipeline stages of the A and B tiles (sized by the cosize of each smem layout).
+  struct SharedStorage
+  {
+    cute::array_aligned<ElementA, cute::cosize_v<SmemLayoutA>> smem_a;
+    cute::array_aligned<ElementB, cute::cosize_v<SmemLayoutB>> smem_b;
+  };
+
+  // Host-side kernel arguments: base pointer and stride for each of the A and B operands
+  struct Arguments {
+    ElementA const* ptr_A;
+    StrideA dA;
+    ElementB const* ptr_B;
+    StrideB dB;
+  };
+
+  // Device-side kernel params; an alias of Arguments, so no conversion is performed
+  using Params = Arguments;
+
+  //
+  // Methods
+  //
+
+  CollectiveMma() = default;
+  // Returns args unchanged (Params aliases Arguments); the problem shape and workspace are not used.
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& _, Arguments const& args, void* workspace) {
+    (void) workspace;  // unused
+    return args;
+  }
+
+  /// Perform a collective-scoped matrix multiply-accumulate
+  template <
+    class FrgTensorD,
+    class TensorA,
+    class TensorB,
+    class FrgTensorC,
+    class KTileIterator,
+    class ResidueMNK
+  >
+  CUTLASS_DEVICE void
+  operator() (
+      FrgTensorD &accum,  // rmem fragment passed as the destination of every cute::gemm
+      TensorA gA,  // (BLK_M, BLK_K, K_TILES), gmem
+      TensorB gB,  // (BLK_N, BLK_K, K_TILES), gmem
+      FrgTensorC const &src_accum,  // rmem fragment passed as the C operand of every cute::gemm
+      KTileIterator k_tile_iter, int k_tile_count,  // iterator over k-tile indices and number of k-tiles to process
+      ResidueMNK residue_mnk,  // ignored: this mainloop builds no predicates
+      int thread_idx,  // selects this thread's slice of the tiled copies and the tiled MMA
+      char *smem_buf)  // reinterpreted as SharedStorage
+  {
+    using namespace cute;
+
+    static_assert(is_rmem<FrgTensorD>::value, "D tensor must be rmem resident.");
+    static_assert(is_gmem<TensorA>::value,    "A tensor must be gmem resident.");
+    static_assert(is_gmem<TensorB>::value,    "B tensor must be gmem resident.");
+    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
+    static_assert(cute::rank(SmemLayoutA{}) == 3,
+      "MainloopSm80CpAsync must have a pipeline mode in the smem layout.");
+    static_assert(cute::rank(SmemLayoutB{}) == 3,
+      "MainloopSm80CpAsync must have a pipeline mode in the smem layout.");
+
+    // Construct shared memory tiles
+    SharedStorage& storage = *reinterpret_cast<SharedStorage*>(smem_buf);
+    Tensor sA = make_tensor(make_smem_ptr(storage.smem_a.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE)
+    Tensor sB = make_tensor(make_smem_ptr(storage.smem_b.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE)
+
+    CUTE_STATIC_ASSERT_V(size<0>(gA) == size<0>(sA));                          // BLK_M
+    CUTE_STATIC_ASSERT_V(size<1>(gA) == size<1>(sA));                          // BLK_K
+    CUTE_STATIC_ASSERT_V(size<0>(gB) == size<0>(sB));                          // BLK_N
+    CUTE_STATIC_ASSERT_V(size<1>(gB) == size<1>(sB));                          // BLK_K
+    CUTE_STATIC_ASSERT_V(size<1>(sA) == size<1>(sB));                          // BLK_K
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));        // PIPE
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));        // PIPE
+
+    // Partition the copying of A and B tiles across the threads
+    GmemTiledCopyA gmem_tiled_copy_A;
+    GmemTiledCopyB gmem_tiled_copy_B;
+    auto gmem_thr_copy_A = gmem_tiled_copy_A.get_slice(thread_idx);
+    auto gmem_thr_copy_B = gmem_tiled_copy_B.get_slice(thread_idx);
+
+    Tensor tAgA = gmem_thr_copy_A.partition_S(gA);                             // (ACPY,ACPY_M,ACPY_K,k)
+    Tensor tAsA = gmem_thr_copy_A.partition_D(sA);                             // (ACPY,ACPY_M,ACPY_K,PIPE)
+    Tensor tBgB = gmem_thr_copy_B.partition_S(gB);                             // (BCPY,BCPY_N,BCPY_K,k)
+    Tensor tBsB = gmem_thr_copy_B.partition_D(sB);                             // (BCPY,BCPY_N,BCPY_K,PIPE)
+
+    //
+    // PREDICATES (none are built: this specialization copies without bounds checks)
+    //
+
+    (void) residue_mnk;
+    // NOTE(review): presumably callers must pass a zero residue; disabled check: assert(residue_mnk == make_tuple(0,0,0));
+
+    //
+    // PREFETCH
+    //
+
+    // Start async loads for all pipes but the last
+    CUTLASS_PRAGMA_UNROLL
+    for (int k_pipe = 0; k_pipe < DispatchPolicy::Stages-1; ++k_pipe) {
+      copy(gmem_tiled_copy_A, tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,k_pipe));
+      copy(gmem_tiled_copy_B, tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,k_pipe));
+      cp_async_fence();
+      --k_tile_count;
+      if (k_tile_count > 0) { ++k_tile_iter; }  // stop advancing once no k-tiles remain, so later loads re-read the last tile
+    }
+
+    //
+    // MMA Atom partitioning
+    //
+
+    // Tile MMA compute thread partitions and allocate accumulators
+    TiledMma tiled_mma;
+    auto thr_mma = tiled_mma.get_thread_slice(thread_idx);
+    Tensor tCrA = thr_mma.partition_fragment_A(sA(_,_,0));                     // (MMA,MMA_M,MMA_K)
+    Tensor tCrB = thr_mma.partition_fragment_B(sB(_,_,0));                     // (MMA,MMA_N,MMA_K)
+
+    CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(accum));                     // MMA_M
+    CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(src_accum));                 // MMA_M
+    CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(accum));                     // MMA_N
+    CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(src_accum));                 // MMA_N
+    CUTE_STATIC_ASSERT_V(size<2>(tCrA) == size<2>(tCrB));                      // MMA_K
+    CUTE_STATIC_ASSERT_V(size(gmem_tiled_copy_A) == size(tiled_mma));
+    CUTE_STATIC_ASSERT_V(size(gmem_tiled_copy_B) == size(tiled_mma));
+
+    //
+    // Copy Atom retiling
+    //
+
+    auto smem_tiled_copy_A = make_tiled_copy_A(SmemCopyAtomA{}, tiled_mma);
+    auto smem_thr_copy_A   = smem_tiled_copy_A.get_thread_slice(thread_idx);
+    Tensor tCsA            = smem_thr_copy_A.partition_S(sA);                  // (CPY,CPY_M,CPY_K,PIPE)
+    Tensor tCrA_copy_view  = smem_thr_copy_A.retile_D(tCrA);                   // (CPY,CPY_M,CPY_K)
+    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(tCrA_copy_view));            // CPY_M
+    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCrA_copy_view));            // CPY_K
+
+    auto smem_tiled_copy_B = make_tiled_copy_B(SmemCopyAtomB{}, tiled_mma);
+    auto smem_thr_copy_B   = smem_tiled_copy_B.get_thread_slice(thread_idx);
+    Tensor tCsB            = smem_thr_copy_B.partition_S(sB);                  // (CPY,CPY_N,CPY_K,PIPE)
+    Tensor tCrB_copy_view  = smem_thr_copy_B.retile_D(tCrB);                   // (CPY,CPY_N,CPY_K)
+    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<1>(tCrB_copy_view));            // CPY_N
+    CUTE_STATIC_ASSERT_V(size<2>(tCsB) == size<2>(tCrB_copy_view));            // CPY_K
+
+    //
+    // PIPELINED MAIN LOOP
+    //
+
+    // Current pipe index in smem to read from
+    int smem_pipe_read  = 0;
+    // Current pipe index in smem to write to
+    int smem_pipe_write = DispatchPolicy::Stages-1;
+
+    Tensor tCsA_p = tCsA(_,_,_,smem_pipe_read);
+    Tensor tCsB_p = tCsB(_,_,_,smem_pipe_read);
+
+    // Size of the register pipeline
+    auto K_BLOCK_MAX = size<2>(tCrA);
+
+    // PREFETCH register pipeline
+    if (K_BLOCK_MAX > 1) {
+      // Wait until our first prefetched tile is loaded in
+      cp_async_wait<DispatchPolicy::Stages-2>();
+      __syncthreads();
+
+      // Prefetch the first rmem from the first k-tile
+      copy(smem_tiled_copy_A, tCsA_p(_,_,Int<0>{}), tCrA_copy_view(_,_,Int<0>{}));
+      copy(smem_tiled_copy_B, tCsB_p(_,_,Int<0>{}), tCrB_copy_view(_,_,Int<0>{}));
+    }
+    // The prefetch loop already took Stages-1 tiles off k_tile_count, so iterate until it reaches -(Stages-1).
+    CUTLASS_PRAGMA_NO_UNROLL
+    while (k_tile_count > -(DispatchPolicy::Stages-1))
+    {
+      // Pipeline the outer products with a static for loop.
+      //
+      // Note, the for_each() function is required here to ensure `k_block` is of type Int<x>.
+      for_each(make_int_sequence<K_BLOCK_MAX>{}, [&] (auto k_block)
+      {
+        if (k_block == K_BLOCK_MAX - 1)
+        {
+          // Slice the smem_pipe_read smem
+          tCsA_p = tCsA(_,_,_,smem_pipe_read);
+          tCsB_p = tCsB(_,_,_,smem_pipe_read);
+
+          // Wait for the async loads into smem_pipe_read to land, then sync the block before reading it
+          cp_async_wait<DispatchPolicy::Stages-2>();
+          __syncthreads();
+        }
+
+        // Load A, B shmem->regs for k_block+1
+        auto k_block_next = (k_block + Int<1>{}) % K_BLOCK_MAX;  // static
+        copy(smem_tiled_copy_A, tCsA_p(_,_,k_block_next), tCrA_copy_view(_,_,k_block_next));
+        copy(smem_tiled_copy_B, tCsB_p(_,_,k_block_next), tCrB_copy_view(_,_,k_block_next));
+        // Copy gmem to smem before computing gemm on each k-pipe
+        if (k_block == 0)
+        {
+          copy(gmem_tiled_copy_A, tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,smem_pipe_write));
+          copy(gmem_tiled_copy_B, tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,smem_pipe_write));
+          cp_async_fence();
+          
+          // Advance the tile (the iterator stops once no k-tiles remain)
+          --k_tile_count;
+          if (k_tile_count > 0) { ++k_tile_iter; }
+
+          // Advance the pipe -- Doing it here accounts for K_BLOCK_MAX = 1 (no rmem pipe)
+          smem_pipe_write = smem_pipe_read;
+          ++smem_pipe_read;
+          smem_pipe_read = (smem_pipe_read == DispatchPolicy::Stages) ? 0 : smem_pipe_read;
+        }
+
+        // Transform before compute
+        cute::transform(tCrA(_,_,k_block), TransformA{});
+        cute::transform(tCrB(_,_,k_block), TransformB{});
+        // Thread-level register gemm for k_block
+        cute::gemm(tiled_mma, accum, tCrA(_,_,k_block), tCrB(_,_,k_block), src_accum);
+      });
+
+    }
+
+    cp_async_wait<0>();  // wait for every outstanding async copy before returning
+    __syncthreads();
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  int Stages,
+  class ClusterShape_,
+  class TileShape_,
+  class ElementA_,
+  class StrideA_,
+  class ElementB_,
+  class StrideB_,
+  class TiledMma_,
+  class GmemTiledCopyA_,
+  class SmemLayoutAtomA_,
+  class SmemCopyAtomA_,
+  class TransformA_,
+  class GmemTiledCopyB_,
+  class SmemLayoutAtomB_,
+  class SmemCopyAtomB_,
+  class TransformB_
+>
+struct CollectiveMma<
+    MainloopSm80CpAsync<
+      Stages,
+      ClusterShape_>,
+    TileShape_,
+    ElementA_,
+    StrideA_,
+    ElementB_,
+    StrideB_,
+    TiledMma_,
+    GmemTiledCopyA_,
+    SmemLayoutAtomA_,
+    SmemCopyAtomA_,
+    TransformA_,
+    GmemTiledCopyB_,
+    SmemLayoutAtomB_,
+    SmemCopyAtomB_,
+    TransformB_
+   >
+{
+  // Predicated multistage mainloop: gmem->smem copies are guarded by m/n predicates and
+  // the 0th k-tile by a k-residue check, through a Stages-deep shared-memory pipeline.
+  // Type Aliases
+  using DispatchPolicy = MainloopSm80CpAsync<
+                          Stages,
+                          ClusterShape_>;
+  using TileShape = TileShape_;
+  // Follow the change in TestSmall: TileShape is being switched to CtaShape.
+  // In legacy archs, the two should be the same.
+  using CtaShape_MNK = TileShape;
+  using ElementA = ElementA_;
+  using StrideA = StrideA_;
+  using ElementB = ElementB_;
+  using StrideB = StrideB_;
+  using TiledMma = TiledMma_;
+  using ElementAccumulator = typename TiledMma::ValTypeC;  using GmemTiledCopyA = GmemTiledCopyA_;
+  using GmemTiledCopyB = GmemTiledCopyB_;
+  using SmemLayoutAtomA = SmemLayoutAtomA_;
+  using SmemLayoutAtomB = SmemLayoutAtomB_;
+  using SmemCopyAtomA = SmemCopyAtomA_;
+  using SmemCopyAtomB = SmemCopyAtomB_;
+  using TransformA = TransformA_;
+  using TransformB = TransformB_;
+  using ArchTag = typename DispatchPolicy::ArchTag;
+  static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+  static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  // Tile the smem layout atoms to (BLK_M or BLK_N, BLK_K, Stages); the last mode selects the pipeline stage.
+  using SmemLayoutA = decltype(tile_to_shape(
+      SmemLayoutAtomA{},
+      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{})));
+  using SmemLayoutB = decltype(tile_to_shape(
+      SmemLayoutAtomB{},
+      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{})));
+
+  static_assert(DispatchPolicy::Stages >= 2, "CpAsync mainloop must have at least 2 stages in the pipeline.");
+  // Backing storage for all pipeline stages of the A and B tiles (sized by the cosize of each smem layout).
+  struct SharedStorage
+  {
+    cute::array_aligned<ElementA, cute::cosize_v<SmemLayoutA>> smem_a;
+    cute::array_aligned<ElementB, cute::cosize_v<SmemLayoutB>> smem_b;
+  };
+
+  // Host-side kernel arguments: base pointer and stride for each of the A and B operands
+  struct Arguments {
+    ElementA const* ptr_A;
+    StrideA dA;
+    ElementB const* ptr_B;
+    StrideB dB;
+  };
+
+  // Device-side kernel params; an alias of Arguments, so no conversion is performed
+  using Params = Arguments;
+
+  //
+  // Methods
+  //
+
+  CollectiveMma() = default;
+  // Returns args unchanged (Params aliases Arguments); the problem shape and workspace are not used.
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& _, Arguments const& args, void* workspace) {
+    (void) workspace;  // unused
+    return args;
+  }
+
+  /// Perform a collective-scoped matrix multiply-accumulate
+  template <
+    class FrgTensorD,
+    class TensorA,
+    class TensorB,
+    class FrgTensorC,
+    class KTileIterator,
+    class ResidueMNK
+  >
+  CUTLASS_DEVICE void
+  operator() (
+      FrgTensorD &accum,  // rmem fragment passed as the destination of every cute::gemm
+      TensorA gA,                   // (BLK_M, BLK_K, K_TILES)
+      TensorB gB,                   // (BLK_N, BLK_K, K_TILES)
+      FrgTensorC const &src_accum,  // rmem fragment passed as the C operand of every cute::gemm
+      KTileIterator k_tile_iter, int k_tile_count,  // iterator over k-tile indices and number of k-tiles to process
+      ResidueMNK residue_mnk,  // m/n bounds for the predicates (modes 0,1) and k offset of the 0th k-tile (mode 2)
+      int thread_idx,  // selects this thread's slice of the tiled copies and the tiled MMA
+      char *smem_buf)  // reinterpreted as SharedStorage
+  {
+    using namespace cute;
+
+    static_assert(is_rmem<FrgTensorD>::value, "D tensor must be rmem resident.");
+    static_assert(is_gmem<TensorA>::value,    "A tensor must be gmem resident.");
+    static_assert(is_gmem<TensorB>::value,    "B tensor must be gmem resident.");
+    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
+    static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3.");
+    static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3.");
+
+    // Construct shared memory tiles
+    SharedStorage& storage = *reinterpret_cast<SharedStorage*>(smem_buf);
+    Tensor sA = make_tensor(make_smem_ptr(storage.smem_a.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE)
+    Tensor sB = make_tensor(make_smem_ptr(storage.smem_b.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE)
+
+    CUTE_STATIC_ASSERT_V(size<0>(gA) == size<0>(sA));                          // BLK_M
+    CUTE_STATIC_ASSERT_V(size<1>(gA) == size<1>(sA));                          // BLK_K
+    CUTE_STATIC_ASSERT_V(size<0>(gB) == size<0>(sB));                          // BLK_N
+    CUTE_STATIC_ASSERT_V(size<1>(gB) == size<1>(sB));                          // BLK_K
+    CUTE_STATIC_ASSERT_V(size<1>(sA) == size<1>(sB));                          // BLK_K
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));        // PIPE
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));        // PIPE
+
+    // Shift tensor so residue_k is at origin (Can't read any k_coord < residue_k)
+    // This aligns the tensor with BLK_K for all but the 0th k_tile
+    gA = cute::domain_offset(make_coord(0, get<2>(residue_mnk), 0), gA);
+    gB = cute::domain_offset(make_coord(0, get<2>(residue_mnk), 0), gB);
+
+    // Partition the copying of A and B tiles across the threads
+    GmemTiledCopyA gmem_tiled_copy_A;
+    GmemTiledCopyB gmem_tiled_copy_B;
+    auto gmem_thr_copy_A = gmem_tiled_copy_A.get_slice(thread_idx);
+    auto gmem_thr_copy_B = gmem_tiled_copy_B.get_slice(thread_idx);
+
+    Tensor tAgA = gmem_thr_copy_A.partition_S(gA);                             // (ACPY,ACPY_M,ACPY_K,k)
+    Tensor tAsA = gmem_thr_copy_A.partition_D(sA);                             // (ACPY,ACPY_M,ACPY_K,PIPE)
+    Tensor tBgB = gmem_thr_copy_B.partition_S(gB);                             // (BCPY,BCPY_N,BCPY_K,k)
+    Tensor tBsB = gmem_thr_copy_B.partition_D(sB);                             // (BCPY,BCPY_N,BCPY_K,PIPE)
+
+    //
+    // PREDICATES
+    //
+
+    // Allocate predicate tensors for m and n (stride 0 in the k mode: one predicate per m/n index is shared across k)
+    Tensor tApA = make_tensor<bool>(make_shape(size<1>(tAsA), size<2>(tAsA)), Stride<_1,_0>{});
+    Tensor tBpB = make_tensor<bool>(make_shape(size<1>(tBsB), size<2>(tBsB)), Stride<_1,_0>{});
+
+    // Construct identity layout for sA and sB
+    Tensor cA = make_identity_tensor(make_shape(size<0>(sA), size<1>(sA)));    // (BLK_M,BLK_K) -> (blk_m,blk_k)
+    Tensor cB = make_identity_tensor(make_shape(size<0>(sB), size<1>(sB)));    // (BLK_N,BLK_K) -> (blk_n,blk_k)
+
+    // Repeat the partitioning with identity layouts
+    Tensor tAcA = gmem_thr_copy_A.partition_S(cA);                             // (ACPY,ACPY_M,ACPY_K) -> (blk_m,blk_k)
+    Tensor tBcB = gmem_thr_copy_B.partition_S(cB);                             // (BCPY,BCPY_N,BCPY_K) -> (blk_n,blk_k)
+
+    // Set predicates for m bounds
+    CUTLASS_PRAGMA_UNROLL
+    for (int m = 0; m < size<0>(tApA); ++m) {
+      tApA(m,0) = get<0>(tAcA(0,m,0)) < get<0>(residue_mnk);  // blk_m coord < residue_m
+    }
+    // Set predicates for n bounds
+    CUTLASS_PRAGMA_UNROLL
+    for (int n = 0; n < size<0>(tBpB); ++n) {
+      tBpB(n,0) = get<0>(tBcB(0,n,0)) < get<1>(residue_mnk);  // blk_n coord < residue_n
+    }
+
+    //
+    // PREFETCH
+    //
+
+    // Clear the smem tiles to account for predicated off loads
+    clear(tAsA);
+    clear(tBsB);
+
+    // Start async loads for 0th k-tile, where we take care of the k residue
+    {
+      constexpr int k_pipe = 0;
+
+      Tensor tAgAk = tAgA(_,_,_,*k_tile_iter);
+      CUTLASS_PRAGMA_UNROLL
+      for (int k = 0; k < size<2>(tAsA); ++k) {
+        if (get<1>(tAcA(0,0,k)) >= -get<2>(residue_mnk)) {      // load only blk_k coords >= -residue_k (gA shifted)
+          copy_if(gmem_tiled_copy_A, tApA(_,k), tAgAk(_,_,k), tAsA(_,_,k,k_pipe));
+        }
+      }
+      Tensor tBgBk = tBgB(_,_,_,*k_tile_iter);
+      CUTLASS_PRAGMA_UNROLL
+      for (int k = 0; k < size<2>(tBsB); ++k) {
+        if (get<1>(tBcB(0,0,k)) >= -get<2>(residue_mnk)) {      // load only blk_k coords >= -residue_k (gB shifted)
+          copy_if(gmem_tiled_copy_B, tBpB(_,k), tBgBk(_,_,k), tBsB(_,_,k,k_pipe));
+        }
+      }
+      cp_async_fence();
+      ++k_tile_iter;
+      --k_tile_count;
+    }
+
+    // Start async loads for 1st k-tile onwards, no k-residue handling needed
+    CUTLASS_PRAGMA_UNROLL
+    for (int k_pipe = 1; k_pipe < DispatchPolicy::Stages-1; ++k_pipe) {
+      if (k_tile_count <= 0) {  // no k-tiles left: turn every predicate off for the remaining loads
+        clear(tApA);
+        clear(tBpB);
+      }
+      copy_if(gmem_tiled_copy_A, tApA, tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,k_pipe));  // CpAsync
+      copy_if(gmem_tiled_copy_B, tBpB, tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,k_pipe));  // CpAsync
+      cp_async_fence();
+      ++k_tile_iter;
+      --k_tile_count;
+    }
+
+    //
+    // MMA Atom partitioning
+    //
+
+    // Tile MMA compute thread partitions and allocate accumulators
+    TiledMma tiled_mma;
+    auto thr_mma = tiled_mma.get_thread_slice(thread_idx);
+    Tensor tCrA  = thr_mma.partition_fragment_A(sA(_,_,0));                    // (MMA,MMA_M,MMA_K)
+    Tensor tCrB  = thr_mma.partition_fragment_B(sB(_,_,0));                    // (MMA,MMA_N,MMA_K)
+
+    CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(accum));                     // MMA_M
+    CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(src_accum));                 // MMA_M
+    CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(accum));                     // MMA_N
+    CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(src_accum));                 // MMA_N
+    CUTE_STATIC_ASSERT_V(size<2>(tCrA) == size<2>(tCrB));                      // MMA_K
+
+    //
+    // Copy Atom retiling
+    //
+
+    auto smem_tiled_copy_A   = make_tiled_copy_A(SmemCopyAtomA{}, tiled_mma);
+    auto smem_thr_copy_A     = smem_tiled_copy_A.get_thread_slice(thread_idx);
+    Tensor tCsA           = smem_thr_copy_A.partition_S(sA);                   // (CPY,CPY_M,CPY_K,PIPE)
+    Tensor tCrA_copy_view = smem_thr_copy_A.retile_D(tCrA);                    // (CPY,CPY_M,CPY_K)
+    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(tCrA_copy_view));            // CPY_M
+    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCrA_copy_view));            // CPY_K
+
+    auto smem_tiled_copy_B = make_tiled_copy_B(SmemCopyAtomB{}, tiled_mma);
+    auto smem_thr_copy_B   = smem_tiled_copy_B.get_thread_slice(thread_idx);
+    Tensor tCsB              = smem_thr_copy_B.partition_S(sB);                // (CPY,CPY_N,CPY_K,PIPE)
+    Tensor tCrB_copy_view    = smem_thr_copy_B.retile_D(tCrB);                 // (CPY,CPY_N,CPY_K)
+    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<1>(tCrB_copy_view));            // CPY_N
+    CUTE_STATIC_ASSERT_V(size<2>(tCsB) == size<2>(tCrB_copy_view));            // CPY_K
+
+    //
+    // PIPELINED MAIN LOOP
+    //
+
+    // Current pipe index in smem to read from
+    int smem_pipe_read  = 0;
+    // Current pipe index in smem to write to
+    int smem_pipe_write = DispatchPolicy::Stages-1;
+
+    Tensor tCsA_p = tCsA(_,_,_,smem_pipe_read);
+    Tensor tCsB_p = tCsB(_,_,_,smem_pipe_read);
+
+    // Size of the register pipeline
+    auto K_BLOCK_MAX = size<2>(tCrA);
+
+    // PREFETCH register pipeline
+    if (K_BLOCK_MAX > 1) {
+      // Wait until our first prefetched tile is loaded in
+      cp_async_wait<DispatchPolicy::Stages-2>();
+      __syncthreads();
+
+      // Prefetch the first rmem from the first k-tile
+      copy(smem_tiled_copy_A, tCsA_p(_,_,Int<0>{}), tCrA_copy_view(_,_,Int<0>{}));
+      copy(smem_tiled_copy_B, tCsB_p(_,_,Int<0>{}), tCrB_copy_view(_,_,Int<0>{}));
+    }
+    // The prefetch stage already took Stages-1 tiles off k_tile_count, so iterate until it reaches -(Stages-1).
+    CUTLASS_PRAGMA_NO_UNROLL
+    for ( ; k_tile_count > -(DispatchPolicy::Stages-1); --k_tile_count)
+    {
+      // Pipeline the outer products with a static for loop.
+      //
+      // Note, the for_each() function is required here to ensure `k_block` is of type Int<N>.
+      for_each(make_int_sequence<K_BLOCK_MAX>{}, [&] (auto k_block)
+      {
+        if (k_block == K_BLOCK_MAX - 1)
+        {
+          // Slice the smem_pipe_read smem
+          tCsA_p = tCsA(_,_,_,smem_pipe_read);
+          tCsB_p = tCsB(_,_,_,smem_pipe_read);
+
+          // Wait for the async loads into smem_pipe_read to land, then sync the block before reading it
+          cp_async_wait<DispatchPolicy::Stages-2>();
+          __syncthreads();
+        }
+
+        // Load A, B shmem->regs for k_block+1
+        auto k_block_next = (k_block + Int<1>{}) % K_BLOCK_MAX;  // static
+        copy(smem_tiled_copy_A, tCsA_p(_,_,k_block_next), tCrA_copy_view(_,_,k_block_next));
+        copy(smem_tiled_copy_B, tCsB_p(_,_,k_block_next), tCrB_copy_view(_,_,k_block_next));
+        // Copy gmem to smem before computing gemm on each k-pipe
+        if (k_block == 0)
+        {
+          // Set all predicates to false if we are going to overshoot bounds
+          if (k_tile_count <= 0) {
+            clear(tApA);
+            clear(tBpB);
+          }
+          copy_if(gmem_tiled_copy_A, tApA, tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,smem_pipe_write));
+          copy_if(gmem_tiled_copy_B, tBpB, tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,smem_pipe_write));
+          cp_async_fence();
+          ++k_tile_iter;
+
+          // Advance the pipe -- Doing it here accounts for K_BLOCK_MAX = 1 (no rmem pipe)
+          smem_pipe_write = smem_pipe_read;
+          ++smem_pipe_read;
+          smem_pipe_read = (smem_pipe_read == DispatchPolicy::Stages) ? 0 : smem_pipe_read;
+        }
+
+        // Transform before compute
+        cute::transform(tCrA(_,_,k_block), TransformA{});
+        cute::transform(tCrB(_,_,k_block), TransformB{});
+        // Thread-level register gemm for k_block
+        cute::gemm(tiled_mma, accum, tCrA(_,_,k_block), tCrB(_,_,k_block), src_accum);
+      });
+
+    }
+
+    cp_async_wait<0>();  // wait for every outstanding async copy before returning
+    __syncthreads();
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::gemm::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized.hpp
new file mode 100755
index 000000000..628750fc3
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized.hpp
@@ -0,0 +1,759 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/numeric_types.h"
+#include "cutlass/pipeline/pipeline.hpp"
+#include "cutlass/trace.h"
+#include "cutlass/cuda_host_adapter.hpp"
+
+#include "cute/arch/cluster_sm90.hpp"
+#include "cute/arch/copy_sm90.hpp"
+#include "cute/algorithm/functional.hpp"
+#include "cute/atom/mma_atom.hpp"
+#include "cute/algorithm/gemm.hpp"
+#include "cute/tensor_predicate.hpp"
+#include "cute/numeric/arithmetic_tuple.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective {
+using namespace cute;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// WarpSpecialized Mainloop
+template <
+  int Stages,
+  class ClusterShape,
+  class KernelSchedule,
+  class TileShape_,
+  class ElementA_,
+  class StrideA_,
+  class ElementB_,
+  class StrideB_,
+  class TiledMma_,
+  class GmemTiledCopyA_,
+  class SmemLayoutAtomA_,
+  class SmemCopyAtomA_,
+  class TransformA_,
+  class GmemTiledCopyB_,
+  class SmemLayoutAtomB_,
+  class SmemCopyAtomB_,
+  class TransformB_>
+struct CollectiveMma<
+    MainloopSm90ArrayTmaGmmaWarpSpecialized<Stages, ClusterShape, KernelSchedule>,
+    TileShape_,
+    ElementA_,
+    StrideA_,
+    ElementB_,
+    StrideB_,
+    TiledMma_,
+    GmemTiledCopyA_,
+    SmemLayoutAtomA_,
+    SmemCopyAtomA_,
+    TransformA_,
+    GmemTiledCopyB_,
+    SmemLayoutAtomB_,
+    SmemCopyAtomB_,
+    TransformB_>
+{
+  //
+  // Type Aliases: re-export the template parameters under their public names.
+  //
+  using DispatchPolicy = MainloopSm90ArrayTmaGmmaWarpSpecialized<Stages, ClusterShape, KernelSchedule>;
+  using TileShape = TileShape_;
+  using ElementA = ElementA_;
+  using StrideA = StrideA_;
+  using InternalStrideA = cute::remove_pointer_t<StrideA>;  // StrideA with the pointer removed, if it is a pointer type
+  using ElementB = ElementB_;
+  using StrideB = StrideB_;
+  using InternalStrideB = cute::remove_pointer_t<StrideB>;  // StrideB with the pointer removed, if it is a pointer type
+  using TiledMma = TiledMma_;
+  using ElementAccumulator = typename TiledMma::ValTypeC;
+  using GmemTiledCopyA = GmemTiledCopyA_;
+  using GmemTiledCopyB = GmemTiledCopyB_;
+  using SmemLayoutAtomA = SmemLayoutAtomA_;
+  using SmemLayoutAtomB = SmemLayoutAtomB_;
+  using SmemCopyAtomA = SmemCopyAtomA_;
+  using SmemCopyAtomB = SmemCopyAtomB_;
+  using TransformA = TransformA_;
+  using TransformB = TransformB_;
+  using ArchTag = typename DispatchPolicy::ArchTag;
+  // Pipeline and pipeline-state types, both parameterized on the number of stages.
+  using MainloopPipeline = cutlass::PipelineTmaAsync<DispatchPolicy::Stages>;
+  using PipelineState = cutlass::PipelineState<DispatchPolicy::Stages>;
+
+  using PipelineParams = typename MainloopPipeline::Params;
+  using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{}));
+  static_assert(rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  // Same rank/divisibility requirements for operand B, checked against tile modes 1 and 2.
+  static_assert(rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+  // Tile along modes in a way that maximizes the TMA box size. The third mode of each layout is the stage count.
+  using SmemLayoutA = decltype(tile_to_shape(
+      SmemLayoutAtomA{},
+      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
+      cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideA>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
+  using SmemLayoutB = decltype(tile_to_shape(
+      SmemLayoutAtomB{},
+      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
+      cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideB>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
+  // Compile-time preconditions of this specialization: stage count, smem-descriptor operands, TMA copy atoms.
+  static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 2 or more.");
+  static_assert(cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeA>::value &&
+                cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeB>::value,
+                "MMA atom must source both A and B operand from smem_desc for this mainloop.");
+  static_assert(cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>,
+      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
+  static_assert(cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>,
+      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
+
+  // TMA converts f32 input to tf32 when copying from GMEM to SMEM
+  // For all other types, cast to size equivalent uint type to avoid any rounding by TMA.
+  static constexpr bool ConvertF32toTF32A = cute::is_same_v<float, ElementA>;
+  static constexpr bool ConvertF32toTF32B = cute::is_same_v<float, ElementB>;
+  using InternalElementA = cute::conditional_t<ConvertF32toTF32A, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementA>>>;
+  using InternalElementB = cute::conditional_t<ConvertF32toTF32B, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementB>>>;
+  // The TMA copy types below are deduced (decltype) from placeholder tensors: null pointer, zero-filled shape.
+  // Assumption: StrideA is congruent with Problem_MK
+  using TMA_A = decltype(make_tma_copy(
+      GmemTiledCopyA{},
+      make_tensor(static_cast<InternalElementA const*>(nullptr), repeat_like(InternalStrideA{}, int32_t(0)), InternalStrideA{}),
+      SmemLayoutA{}(_,_,cute::Int<0>{}),
+      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})),
+      size<1>(ClusterShape{})));  // mcast along N mode for this M load, if any
+  // Assumption: StrideB is congruent with Problem_NK
+  using TMA_B = decltype(make_tma_copy(
+      GmemTiledCopyB{},
+      make_tensor(static_cast<InternalElementB const*>(nullptr), repeat_like(InternalStrideB{}, int32_t(0)), InternalStrideB{}),
+      SmemLayoutB{}(_,_,cute::Int<0>{}),
+      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{})),
+      size<0>(ClusterShape{}))); // mcast along M mode for this N load, if any
+
+  struct SharedStorage {  // storage used by this collective: operand tiles, tensormap copies, pipeline state
+    struct TensorStorage : cute::aligned_struct<128, _0> {
+      cute::array_aligned<typename TiledMma::ValTypeA, cute::cosize_v<SmemLayoutA>> smem_A;  // sized by SmemLayoutA (includes the stage mode)
+      cute::array_aligned<typename TiledMma::ValTypeB, cute::cosize_v<SmemLayoutB>> smem_B;  // sized by SmemLayoutB (includes the stage mode)
+    } tensors;
+
+    struct TensorMapStorage : cute::aligned_struct<128, _0> {
+      cute::TmaDescriptor smem_tensormap_A;  // working copy of the A descriptor, edited by the tensormaps_* methods
+      cute::TmaDescriptor smem_tensormap_B;  // working copy of the B descriptor, edited by the tensormaps_* methods
+    } tensormaps;
+
+    using PipelineStorage = typename MainloopPipeline::SharedStorage;
+    PipelineStorage pipeline;
+  };
+  using TensorStorage = typename SharedStorage::TensorStorage;
+  using TensorMapStorage = typename SharedStorage::TensorMapStorage;
+  using PipelineStorage = typename SharedStorage::PipelineStorage;
+  // True when StrideA is a pointer type (it then differs from its pointer-stripped form InternalStrideA).
+  static constexpr bool IsGroupedGemmKernel = !cute::is_same_v<InternalStrideA, StrideA>;
+
+  // Host side kernel arguments
+  struct Arguments {
+    ElementA const** ptr_A;  // array of A base pointers; indexed per batch/group via Params::ptr_A[next_batch]
+    StrideA dA;              // stride of A; indexed per group (dA[next_group]) in the grouped-GEMM path
+    ElementB const** ptr_B;  // array of B base pointers; indexed per batch/group via Params::ptr_B[next_batch]
+    StrideB dB;              // stride of B; indexed per group (dB[next_group]) in the grouped-GEMM path
+  };
+
+  // Device side kernel params (produced by to_underlying_arguments)
+  struct Params {
+    TMA_A tma_load_a;  // TMA copy object for A, built from placeholder shape/pointer
+    TMA_B tma_load_b;  // TMA copy object for B, built from placeholder shape/pointer
+    uint32_t tma_transaction_bytes = TmaTransactionBytes;
+    void* tensormaps;  // workspace holding the per-SM tensormap copies (see get_workspace_size)
+    InternalElementA const** ptr_A;  // Arguments::ptr_A reinterpreted to the internal element type
+    StrideA dA;
+    InternalElementB const** ptr_B;  // Arguments::ptr_B reinterpreted to the internal element type
+    StrideB dB;
+  };
+
+  //
+  // Methods
+  //
+
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(
+      ProblemShape problem_shapes,
+      Arguments const& args,
+      void* workspace) {
+    // Builds Params from host Arguments. The shapes (grouped gemm only) and pointers used here merely seed the
+    // tensormap/tma descriptors; they will be replaced with correct values before the initial tma load.
+    auto init_shape = repeat_like(typename ProblemShape::UnderlyingProblemShape{}, int32_t(1));
+    auto init_M = get<0>(init_shape);
+    auto init_N = get<1>(init_shape);
+    auto init_K = get<2>(init_shape);
+    // Batches/Groups are managed by using appropriate pointers to input matrices
+    const uint32_t mock_L = 1;
+    InternalElementA const* ptr_A_first_batch = reinterpret_cast<InternalElementA const*>(args.ptr_A);
+    InternalElementB const* ptr_B_first_batch = reinterpret_cast<InternalElementB const*>(args.ptr_B);
+    // NOTE: the casts above reinterpret the pointer-array base itself; it only serves as a placeholder address.
+    InternalStrideA stride_a;
+    InternalStrideB stride_b;
+    if constexpr (IsGroupedGemmKernel) {
+      // Strides for Grouped Gemm will be replaced prior to the first access regardless.
+      stride_a = InternalStrideA{};
+      stride_b = InternalStrideB{};
+    }
+    else {
+      // Tensor shapes for Ptr-Array are initialized correctly only here (taken from host problem shape 0).
+      auto problem_shape_MNK = problem_shapes.get_host_problem_shape(0);
+      init_M = get<0>(problem_shape_MNK);
+      init_N = get<1>(problem_shape_MNK);
+      init_K = get<2>(problem_shape_MNK);
+
+      stride_a = args.dA;
+      stride_b = args.dB;
+    }
+    Tensor tensor_a = make_tensor(ptr_A_first_batch, make_layout(make_shape(init_M,init_K,mock_L), stride_a));
+    Tensor tensor_b = make_tensor(ptr_B_first_batch, make_layout(make_shape(init_N,init_K,mock_L), stride_b));
+    TMA_A tma_load_a = make_tma_copy(
+        GmemTiledCopyA{},
+        tensor_a,
+        SmemLayoutA{}(_,_,cute::Int<0>{}),
+        make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})),
+        size<1>(ClusterShape{})); // mcast along N mode for this M load, if any
+    TMA_B tma_load_b = make_tma_copy(
+        GmemTiledCopyB{},
+        tensor_b,
+        SmemLayoutB{}(_,_,cute::Int<0>{}),
+        make_shape(shape<1>(TileShape{}), shape<2>(TileShape{})),
+        size<0>(ClusterShape{})); // mcast along M mode for this N load, if any
+    // The workspace pointer is forwarded unchanged as the tensormap storage (see get_workspace_size for its size).
+    void* tensormaps = workspace;
+
+    return {
+      tma_load_a,
+      tma_load_b,
+      TmaTransactionBytes,
+      tensormaps,
+      reinterpret_cast<InternalElementA const**>(args.ptr_A),
+      args.dA,
+      reinterpret_cast<InternalElementB const**>(args.ptr_B),
+      args.dB
+    };
+  }
+
+  template <class ProblemShape>
+  static size_t
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args, int sm_count) {
+    constexpr uint32_t NumInputTensors = 2;  // A and B
+    constexpr size_t SizeOfCuTensorMap = sizeof(cute::TmaDescriptor);
+    // Bytes of gmem for one tensormap per input per SM: A copies followed by B copies (problem_shape/args are not read)
+    return (NumInputTensors * SizeOfCuTensorMap * sm_count);
+  }
+
+  template <class ProblemShape>
+  static cutlass::Status
+  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, CudaHostAdapter* cuda_adapter = nullptr) {
+    return cutlass::Status::kSuccess;  // nothing is written to the workspace here; all parameters are unused
+  }
+
+  template<class ProblemShape>
+  static bool
+  can_implement(
+      ProblemShape problem_shapes,
+      Arguments const& args) {
+    constexpr int tma_alignment_bits = 128;  // alignment requirement, in bits, converted to element counts below
+    constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
+    constexpr int min_tma_aligned_elements_B = tma_alignment_bits / cutlass::sizeof_bits<ElementB>::value;
+    // Stays true when host problem shapes are unavailable: no check is performed in that case.
+    bool implementable = true;
+    if (problem_shapes.is_host_problem_shape_available()) {
+      // Check alignment for every group's problem shape (extended to rank 4 with L = 1)
+      for (int i = 0; i < problem_shapes.groups(); i++) {
+        auto problem_shape_MNKL = append<4>(problem_shapes.get_host_problem_shape(i), 1);
+        auto [M,N,K,L] = problem_shape_MNKL;
+        implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_A>(cute::make_shape(M,K,L), InternalStrideA{});
+        implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_B>(cute::make_shape(N,K,L), InternalStrideB{});
+      }
+    }
+    // NOTE(review): the checks pass default-constructed InternalStrideA/B, not args.dA/dB; args is unused here.
+    if (!implementable) {
+      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
+    }
+    return implementable;
+  }
+
+  static constexpr int K_PIPE_MAX = DispatchPolicy::Stages;  // number of smem pipeline stages
+  static constexpr int K_PIPE_MMAS = 1;                      // MMA batches allowed to remain outstanding in mma()
+  static constexpr uint32_t TmaTransactionBytes =            // bytes of the first two modes of SmemLayoutA plus those of SmemLayoutB
+        cutlass::bits_to_bytes(size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast<uint32_t>(sizeof_bits<ElementA>::value))+
+        cutlass::bits_to_bytes(size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast<uint32_t>(sizeof_bits<ElementB>::value));
+
+  // Set up the data needed by this collective for load and mma.
+  // Returns a tuple of tensors. The collective and the kernel layer have the contract that the
+  // returned tuple must contain at least two elements, with the first two elements being:
+  // gA_mkl - The tma tensor, A after a local tile so it has shape  (BLK_M,BLK_K,m,k,l)
+  // gB_nkl - The tma tensor, B after a local tile so it has shape  (BLK_N,BLK_K,n,k,l)
+  // This collective returns exactly those two tensors.
+  template <class ProblemShape_MNKL>
+  CUTLASS_DEVICE auto
+  load_init(ProblemShape_MNKL const& problem_shape_MNKL, Params const& mainloop_params) const {
+    using X = Underscore;
+    // Separate out problem shape for convenience
+    auto [M,N,K,L] = problem_shape_MNKL;  // L is not used below; mock_L takes its place
+    const int32_t mock_L = 1;
+
+    // TMA requires special handling of strides to deal with coord codomain mapping
+    // Represent the full tensors -- get these from TMA
+    Tensor mA_mkl = mainloop_params.tma_load_a.get_tma_tensor(make_shape(M,K,mock_L));                            // (m,k,l)
+    Tensor mB_nkl = mainloop_params.tma_load_b.get_tma_tensor(make_shape(N,K,mock_L));                            // (n,k,l)
+
+    // Make tiled views, defer the slice (the Step argument selects which TileShape modes tile each tensor)
+    Tensor gA_mkl = local_tile(mA_mkl, TileShape{}, make_coord(_,_,_), Step<_1, X,_1>{});  // (BLK_M,BLK_K,m,k,l)
+    Tensor gB_nkl = local_tile(mB_nkl, TileShape{}, make_coord(_,_,_), Step< X,_1,_1>{});  // (BLK_N,BLK_K,n,k,l)
+
+    return cute::make_tuple(gA_mkl, gB_nkl);
+  }
+
+  // Producer side of the collective mainloop: a single elected lane issues the TMA loads of the A and B
+  // tiles for k_tile_count k-tiles, one smem stage per k-tile, acquiring each stage from the pipeline first.
+  template <
+    class TensorA, class TensorB,
+    class TensorMapA, class TensorMapB,
+    class KTileIterator, class BlockCoord
+  >
+  CUTLASS_DEVICE void
+  load(
+      Params const& mainloop_params,
+      MainloopPipeline pipeline, 
+      PipelineState smem_pipe_write,
+      cute::tuple<TensorA, TensorB> const& load_inputs,
+      cute::tuple<TensorMapA, TensorMapB> const& input_tensormaps,
+      BlockCoord const& blk_coord,
+      KTileIterator k_tile_iter, int k_tile_count,
+      int thread_idx,
+      uint32_t block_rank_in_cluster,
+      TensorStorage& shared_tensors) {
+    int lane_predicate = cute::elect_one_sync();
+    // Everything below runs only on the elected lane; thread_idx is not read in this function.
+    if (lane_predicate) {
+      Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});        // (BLK_M,BLK_K,PIPE)
+      Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});        // (BLK_N,BLK_K,PIPE)
+
+      //
+      // Prepare the TMA loads for A and B
+      //
+
+      constexpr uint32_t cluster_shape_x = get<0>(DispatchPolicy::ClusterShape());
+      uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x};
+      // x = rank % cluster_shape_x, y = rank / cluster_shape_x: coordinates of this block inside its cluster.
+      Tensor gA_mkl = get<0>(load_inputs);
+      Tensor gB_nkl = get<1>(load_inputs);
+
+      auto block_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y);
+      auto block_tma_b = mainloop_params.tma_load_b.get_slice(cluster_local_block_id.x);
+
+      // Partition the inputs based on the current block coordinates (k_coord is not used here).
+      auto [m_coord, n_coord, k_coord, l_coord] = blk_coord;
+      Tensor gA = gA_mkl(_,_,m_coord,_,l_coord);                                                     // (BLK_M,BLK_K,k)
+      Tensor gB = gB_nkl(_,_,n_coord,_,l_coord);                                                     // (BLK_N,BLK_K,k)
+
+      // Applies the mapping from block_tma_a
+      Tensor tAgA = block_tma_a.partition_S(gA);                                                 // (TMA,TMA_M,TMA_K,k)
+      Tensor tAsA = block_tma_a.partition_D(sA);                                              // (TMA,TMA_M,TMA_K,PIPE)
+
+      Tensor tBgB = block_tma_b.partition_S(gB);                                                 // (TMA,TMA_N,TMA_K,k)
+      Tensor tBsB = block_tma_b.partition_D(sB);                                              // (TMA,TMA_N,TMA_K,PIPE)
+
+      uint16_t mcast_mask_a = 0;  // stays 0 unless GmemTiledCopyA is SM90_TMA_LOAD_MULTICAST
+      uint16_t mcast_mask_b = 0;  // stays 0 unless GmemTiledCopyB is SM90_TMA_LOAD_MULTICAST
+
+      // Build the multicast masks: one bit per destination block id
+      // Maps the tile -> block, value
+      if constexpr (cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>) {
+        auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{}; // (m,n) -> block_id
+        for (int n = 0; n < size<1>(block_layout); ++n) {
+          mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{}));
+        }
+      }
+
+      if constexpr (cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>) {
+        auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{}; // (m,n) -> block_id
+        for (int m = 0; m < size<0>(block_layout); ++m) {
+          mcast_mask_b |= (uint16_t(1) << block_layout(m,cluster_local_block_id.y,Int<0>{}));
+        }
+      }
+
+      // Mainloop: one iteration per k-tile
+      CUTLASS_PRAGMA_NO_UNROLL
+      for ( ; k_tile_count > 0; --k_tile_count)
+      {
+        // LOCK smem_pipe_write for _writing_
+        pipeline.producer_acquire(smem_pipe_write);
+
+        //
+        // Copy gmem to smem for *k_tile_iter
+        //
+
+        using BarrierType = typename MainloopPipeline::ProducerBarrierType;
+        BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write);
+
+        int write_stage = smem_pipe_write.index();
+        copy(mainloop_params.tma_load_a.with(get<0>(input_tensormaps), *tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage));
+        copy(mainloop_params.tma_load_b.with(get<1>(input_tensormaps), *tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage));
+        ++k_tile_iter;
+
+        // Advance smem_pipe_write
+        ++smem_pipe_write;
+      }
+    }
+  }
+
+  // Perform a Producer Epilogue to prevent early exit of blocks in a Cluster
+  CUTLASS_DEVICE void
+  load_tail(MainloopPipeline pipeline, PipelineState smem_pipe_write) {
+    int lane_predicate = cute::elect_one_sync();
+
+    // Issue the epilogue waits (elected lane only)
+    if (lane_predicate) {
+      // This helps avoid early exit of blocks in Cluster.
+      // Waits for all stages to either be released (all
+      // Consumer UNLOCKs), or if the stage was never used
+      // then it would just be acquired since the phase was
+      // still inverted from make_producer_start_state.
+      pipeline.producer_tail(smem_pipe_write);
+    }
+  }
+
+  /// Consumer side of the collective mainloop: waits for filled smem stages and accumulates GMMA products into accum.
+  /// Stage release (smem_pipe_release) trails the reads (smem_pipe_read) by the prologue count.
+  template <
+    class FrgTensorC
+  >
+  CUTLASS_DEVICE void
+  mma(MainloopPipeline pipeline,
+      PipelineState smem_pipe_read,
+      FrgTensorC& accum,
+      int k_tile_count,
+      int thread_idx,
+      TensorStorage& shared_tensors,
+      Params const& mainloop_params) {
+    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
+    static_assert(rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3.");
+    static_assert(rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3.");
+    static_assert(cute::is_void_v<SmemCopyAtomA>,
+      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
+    static_assert(cute::is_void_v<SmemCopyAtomB>,
+      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
+
+    Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});          // (BLK_M,BLK_K,PIPE)
+    Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});          // (BLK_N,BLK_K,PIPE)
+
+    //
+    // Define C accumulators and A/B partitioning
+    //
+
+    // Layout of warp group to thread mapping
+
+    static_assert(stride<0>(typename TiledMma::ALayout{}) == 0 and 
+                  stride<0>(typename TiledMma::BLayout{}) == 0 and
+                  size<0>(typename TiledMma::ALayout{}) == NumThreadsPerWarpGroup and
+                  size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup, 
+                  "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
+
+    constexpr int MmaWarpGroups = size(TiledMma{}) / NumThreadsPerWarpGroup;
+    Layout warp_group_thread_layout = make_layout(Int<MmaWarpGroups>{},
+                                                  Int<NumThreadsPerWarpGroup>{});
+    // Take the warp-group index from lane 0 so every lane of the warp holds the same value.
+    int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / NumThreadsPerWarpGroup, 0);
+
+    TiledMma tiled_mma;
+    auto thread_mma = tiled_mma.get_slice(warp_group_thread_layout(warp_group_idx));
+
+    Tensor tCsA = thread_mma.partition_A(sA);                                                 // (MMA,MMA_M,MMA_K,PIPE)
+    Tensor tCsB = thread_mma.partition_B(sB);                                                 // (MMA,MMA_N,MMA_K,PIPE)
+
+    // Allocate "fragments/descriptors"
+    Tensor tCrA = thread_mma.make_fragment_A(tCsA);                                           // (MMA,MMA_M,MMA_K,PIPE)
+    Tensor tCrB = thread_mma.make_fragment_B(tCsB);                                           // (MMA,MMA_N,MMA_K,PIPE)
+
+    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum));                                                         // M
+    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum));                                                         // N
+    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB));                                                          // K
+    CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB));                                                       // PIPE
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));                                         // PIPE
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));                                         // PIPE
+
+    //
+    // PIPELINED MAIN LOOP
+    //
+    static_assert((0 <= K_PIPE_MMAS) && (K_PIPE_MMAS <  K_PIPE_MAX),
+        "ERROR : Incorrect number of MMAs in flight");
+
+    // We release buffers to producer warps(dma load) with some mmas in flight
+    PipelineState smem_pipe_release = smem_pipe_read;
+
+    // Prologue GMMAs
+    int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
+    assert(k_tile_count >= 1);  // the block below unconditionally consumes one k-tile
+    tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;  // presumably makes the first MMA ignore accum's prior contents
+    warpgroup_fence_operand(accum);
+    {
+      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
+      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+      pipeline.consumer_wait(smem_pipe_read, barrier_token);
+
+      int read_stage = smem_pipe_read.index();
+      warpgroup_arrive();
+      // Unroll the K mode manually to set scale D to 1 after the first k_block
+      CUTLASS_PRAGMA_UNROLL
+      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
+        // (V,M,K) x (V,N,K) => (V,M,N)
+        cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accum);
+        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+      }
+
+      warpgroup_commit_batch();
+
+      ++smem_pipe_read;
+    }
+
+    warpgroup_fence_operand(accum);
+    CUTLASS_PRAGMA_UNROLL
+    for (int k_tile_prologue = prologue_mma_count - 1; k_tile_prologue > 0; --k_tile_prologue)  // zero iterations while K_PIPE_MMAS == 1
+    {
+      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
+      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+      pipeline.consumer_wait(smem_pipe_read, barrier_token);
+
+      int read_stage = smem_pipe_read.index();
+      warpgroup_arrive();
+      cute::gemm(tiled_mma, tCrA(_,_,_,read_stage), tCrB(_,_,_,read_stage), accum); // (V,M,K) x (V,N,K) => (V,M,N)
+      warpgroup_commit_batch();
+
+      ++smem_pipe_read;
+    }
+
+    warpgroup_fence_operand(accum);
+    // Mainloop GMMAs: the prologue already consumed prologue_mma_count k-tiles
+    k_tile_count -= prologue_mma_count;
+
+    CUTLASS_PRAGMA_NO_UNROLL
+    for ( ; k_tile_count > 0; --k_tile_count)
+    {
+      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
+      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+      pipeline.consumer_wait(smem_pipe_read, barrier_token);
+
+      //
+      // Compute on k_tile
+      //
+
+      int read_stage = smem_pipe_read.index();
+      warpgroup_fence_operand(accum);
+      warpgroup_arrive();
+      cute::gemm(tiled_mma, tCrA(_,_,_,read_stage), tCrB(_,_,_,read_stage), accum); // (V,M,K) x (V,N,K) => (V,M,N)
+      warpgroup_commit_batch();
+
+      /// Wait until at most K_PIPE_MMAS GMMA batches are outstanding (presumably so the stage at smem_pipe_release is consumed)
+      warpgroup_wait<K_PIPE_MMAS>();
+      warpgroup_fence_operand(accum);
+
+      // UNLOCK smem_pipe_release, done _computing_ on it
+      pipeline.consumer_release(smem_pipe_release);
+
+      // Advance smem_pipe_read and smem_pipe_release
+      ++smem_pipe_read;
+      ++smem_pipe_release;
+    }
+
+    warpgroup_fence_operand(accum);
+  }
+
+  /// Perform a Consumer Epilogue to release all buffers that mma() left unreleased
+  CUTLASS_DEVICE void
+  mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) {
+    // Same prologue count as computed in mma()
+    int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
+    k_tile_count -= prologue_mma_count;
+    // Skip the stages the mma() mainloop already released (one per mainloop iteration).
+    smem_pipe_release.advance(k_tile_count);
+    // The remaining prologue_mma_count stages are released below.
+    // Wait on all GMMAs to complete
+    warpgroup_wait<0>();
+
+    for (int count = 0; count < prologue_mma_count; ++count) {
+      pipeline.consumer_release(smem_pipe_release);                 // UNLOCK smem_pipe_release, done _computing_ on it
+      ++smem_pipe_release;
+    }
+  }
+
+  //
+  // Methods to perform different parts of TMA/Tensormap modifications
+  //
+
+  CUTLASS_DEVICE auto
+  tensormaps_init(
+      Params const& mainloop_params,
+      TensorMapStorage& shared_tensormaps,
+      int32_t sm_count,
+      int32_t sm_idx) {
+    cute::TmaDescriptor* gmem_tensormap = reinterpret_cast<cute::TmaDescriptor*>(mainloop_params.tensormaps);
+    // Workspace slots: A descriptor at [sm_idx], B descriptor at [sm_idx + sm_count] (layout from get_workspace_size).
+    cute::TmaDescriptor* tma_desc_a = &gmem_tensormap[sm_idx];
+    cute::TmaDescriptor* tma_desc_b = &gmem_tensormap[sm_idx + sm_count];
+
+    if (cute::elect_one_sync()) {
+      // Bringing tensormaps from params to smem for modification later
+      Tensor pA_tensormap = make_tensor(mainloop_params.tma_load_a.get_tma_descriptor(), Int<1>{}, Int<1>{});
+      Tensor sA_tensormap = make_tensor(make_smem_ptr(&shared_tensormaps.smem_tensormap_A), Int<1>{}, Int<1>{});
+      Tensor pB_tensormap = make_tensor(mainloop_params.tma_load_b.get_tma_descriptor(), Int<1>{}, Int<1>{});
+      Tensor sB_tensormap = make_tensor(make_smem_ptr(&shared_tensormaps.smem_tensormap_B), Int<1>{}, Int<1>{});
+      // Each descriptor is copied as uint128_t elements.
+      copy(recast<uint128_t>(pA_tensormap), recast<uint128_t>(sA_tensormap));
+      copy(recast<uint128_t>(pB_tensormap), recast<uint128_t>(sB_tensormap));
+    }
+    __syncwarp();
+    // Returns the workspace descriptor pointers for A and B, not the shared-memory copies.
+    return cute::make_tuple(tma_desc_a, tma_desc_b);
+  }
+
+  // Replace address for the global tensor in the shared-memory descriptors (to be done by single thread)
+  CUTLASS_DEVICE
+  void
+  tensormaps_replace_global_address(
+      TensorMapStorage& shared_tensormaps,
+      Params const& mainloop_params,
+      int32_t next_batch) {
+    // Replacing global_address for the next batch: ptr_A/ptr_B are indexed by next_batch
+    cute::tma_descriptor_replace_addr_in_shared_mem(shared_tensormaps.smem_tensormap_A,
+                                                    mainloop_params.ptr_A[next_batch]);
+    cute::tma_descriptor_replace_addr_in_shared_mem(shared_tensormaps.smem_tensormap_B,
+                                                    mainloop_params.ptr_B[next_batch]);
+  }
+
+  // Replace dim and strides for the global tensor - used only for Grouped GEMM (to be done by single thread)
+  template <class ProblemShape_MNKL>
+  CUTLASS_DEVICE
+  void
+  tensormaps_replace_global_tensor_properties(
+      TensorMapStorage& shared_tensormaps,
+      Params const& mainloop_params,
+      int32_t next_group,
+      ProblemShape_MNKL problem_shape_mnkl) {
+    const uint32_t M = get<0>(problem_shape_mnkl);
+    const uint32_t N = get<1>(problem_shape_mnkl);
+    const uint32_t K = get<2>(problem_shape_mnkl);
+    // Replace all dims for consistency; arrays start as shape 1 / stride 0 in every slot
+    constexpr int MaxTensorRank = 5;
+    cute::array<uint32_t, MaxTensorRank> prob_shape_A  = {1,1,1,1,1};
+    cute::array<uint64_t, MaxTensorRank> prob_stride_A = {0,0,0,0,0};
+    cute::array<uint32_t, MaxTensorRank> prob_shape_B  = {1,1,1,1,1};
+    cute::array<uint64_t, MaxTensorRank> prob_stride_B = {0,0,0,0,0};
+    // Null-pointer tensors carry only shape and stride; the per-group stride is read from dA/dB[next_group].
+    InternalElementA const* ptr_A = nullptr;
+    Tensor tensor_a = make_tensor(ptr_A, make_shape(M,K,Int<1>{}), mainloop_params.dA[next_group]);
+
+    InternalElementB const* ptr_B = nullptr;
+    Tensor tensor_b = make_tensor(ptr_B, make_shape(N,K,Int<1>{}), mainloop_params.dB[next_group]);
+
+    cute::detail::fill_tma_gmem_shape_stride(mainloop_params.tma_load_a, tensor_a, 
+                                             prob_shape_A, prob_stride_A);
+    cute::detail::fill_tma_gmem_shape_stride(mainloop_params.tma_load_b, tensor_b, 
+                                             prob_shape_B, prob_stride_B);
+
+    // Convert strides to byte strides (value * element bits / 8)
+    for (uint64_t& stride : prob_stride_A) {
+      stride = (stride * sizeof_bits_v<InternalElementA>) / 8;
+    }
+    for (uint64_t& stride : prob_stride_B) {
+      stride = (stride * sizeof_bits_v<InternalElementB>) / 8;
+    }
+    // Write the new dims and byte strides into the shared-memory descriptors.
+    cute::tma_descriptor_replace_dims_strides_in_shared_mem(shared_tensormaps.smem_tensormap_A,
+                                                            prob_shape_A,
+                                                            prob_stride_A);
+    cute::tma_descriptor_replace_dims_strides_in_shared_mem(shared_tensormaps.smem_tensormap_B,
+                                                            prob_shape_B,
+                                                            prob_stride_B);
+  }
+
+  template <class TensorMapA, class TensorMapB, class ProblemShape_MNKL>
+  CUTLASS_DEVICE
+  void
+  tensormaps_perform_update(
+      TensorMapStorage& shared_tensormaps,
+      Params const& mainloop_params,
+      cute::tuple<TensorMapA, TensorMapB> const& input_tensormaps,
+      ProblemShape_MNKL problem_shape_mnkl,
+      int32_t next_batch) {
+    if (cute::elect_one_sync()) {  // a single elected lane edits the shared-memory descriptors; input_tensormaps is not read here
+      // Replacing global_address for the next batch
+      tensormaps_replace_global_address(shared_tensormaps, mainloop_params, next_batch);
+
+      if constexpr (IsGroupedGemmKernel) {
+        // Replacing global dims and strides for the next batch (grouped GEMM only; next_batch is passed as the group index)
+        tensormaps_replace_global_tensor_properties(shared_tensormaps,
+          mainloop_params, next_batch, problem_shape_mnkl);
+      }
+    }
+  }
+
+  template <class TensorMapA, class TensorMapB>
+  CUTLASS_DEVICE
+  void
+  tensormaps_cp_fence_release (
+      TensorMapStorage& shared_tensormaps,
+      cute::tuple<TensorMapA, TensorMapB> const& input_tensormaps) {
+    // Entire warp must do this (i.e. it's aligned); arguments are (input_tensormaps element, shared-memory descriptor)
+    tma_descriptor_cp_fence_release(get<0>(input_tensormaps), shared_tensormaps.smem_tensormap_A);
+    tma_descriptor_cp_fence_release(get<1>(input_tensormaps), shared_tensormaps.smem_tensormap_B);
+  }
+
+  // The entire warp must call this function collectively (that is, the instructions are aligned).
+  template <class TensorMapA, class TensorMapB>
+  CUTLASS_DEVICE
+  void
+  tensormaps_fence_acquire(cute::tuple<TensorMapA, TensorMapB> const& input_tensormaps) {
+    cute::tma_descriptor_fence_acquire(get<0>(input_tensormaps));  // A descriptor
+    cute::tma_descriptor_fence_acquire(get<1>(input_tensormaps));  // B descriptor
+  }
+
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::gemm::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_rs_warpspecialized.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_rs_warpspecialized.hpp
new file mode 100755
index 000000000..69b31fdab
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_rs_warpspecialized.hpp
@@ -0,0 +1,677 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/numeric_types.h"
+#include "cutlass/pipeline/pipeline.hpp"
+#include "cutlass/transform/collective/sm90_wgmma_transpose.hpp"
+#include "cutlass/trace.h"
+
+#include "cute/arch/cluster_sm90.hpp"
+#include "cute/arch/copy_sm90.hpp"
+#include "cute/algorithm/functional.hpp"
+#include "cute/atom/mma_atom.hpp"
+#include "cute/algorithm/gemm.hpp"
+#include "cute/tensor_predicate.hpp"
+#include "cute/numeric/arithmetic_tuple.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective {
+using namespace cute;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// WarpSpecialized Mainloop
+template <
+  int Stages,
+  class ClusterShape_,
+  class TileShape_,
+  class KernelSchedule,
+  class ElementA_,
+  class StrideA_,
+  class ElementB_,
+  class StrideB_,
+  class TiledMma_,
+  class GmemTiledCopyA_,
+  class SmemLayoutAtomA_,
+  class SmemCopyAtomA_,
+  class TransformA_,
+  class GmemTiledCopyB_,
+  class SmemLayoutAtomB_,
+  class SmemCopyAtomB_,
+  class TransformB_>
+struct CollectiveMma<
+    MainloopSm90CpAsyncGmmaRmemAWarpSpecialized<Stages,ClusterShape_,KernelSchedule>,
+    TileShape_,
+    ElementA_,
+    StrideA_,
+    ElementB_,
+    StrideB_,
+    TiledMma_,
+    GmemTiledCopyA_,
+    SmemLayoutAtomA_,
+    SmemCopyAtomA_,
+    TransformA_,
+    GmemTiledCopyB_,
+    SmemLayoutAtomB_,
+    SmemCopyAtomB_,
+    TransformB_>
+{
+  //
+  // Type Aliases
+  //
+  using DispatchPolicy = MainloopSm90CpAsyncGmmaRmemAWarpSpecialized<Stages,ClusterShape_,KernelSchedule>;
+  using TileShape = TileShape_;
+  using ClusterShape = ClusterShape_;
+  using ElementA = ElementA_;
+  using StrideA = StrideA_;
+  using ElementB = ElementB_;
+  using StrideB = StrideB_;
+  using TiledMma = TiledMma_;
+  using ElementAccumulator = typename TiledMma::ValTypeC;
+  using GmemTiledCopyA = GmemTiledCopyA_;
+  using GmemTiledCopyB = GmemTiledCopyB_;
+  using SmemLayoutAtomA = SmemLayoutAtomA_;
+  using SmemLayoutAtomB = SmemLayoutAtomB_;
+  using SmemCopyAtomA = SmemCopyAtomA_;
+  using SmemCopyAtomB = SmemCopyAtomB_;
+
+  using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{}));
+  // Swap and transpose A/B for A k-major layout and B mn-major layout since WGMMA is k-major only (e.g. tf32, Fp32, Int8, Fp8 WGMMA)
+  static constexpr bool IsLayoutAkBmn =
+    cute::is_same_v<gemm::detail::StrideToLayoutTagA_t<StrideA>, layout::RowMajor> &&
+    cute::is_same_v<gemm::detail::StrideToLayoutTagB_t<StrideB>, layout::RowMajor>;
+
+  static constexpr bool IsInputSizeTwoBytes = sizeof(ElementA) == 2 && sizeof(ElementB) == 2;
+  static constexpr bool SwapAB =  !IsInputSizeTwoBytes && IsLayoutAkBmn;
+  using InternalGmemTiledCopyA = cute::conditional_t<!SwapAB, GmemTiledCopyA, GmemTiledCopyB>;
+  using InternalGmemTiledCopyB = cute::conditional_t<!SwapAB, GmemTiledCopyB, GmemTiledCopyA>;
+  using InternalSmemLayoutAtomA = cute::conditional_t<!SwapAB, SmemLayoutAtomA, SmemLayoutAtomB>;
+  using InternalSmemLayoutAtomB = cute::conditional_t<!SwapAB, SmemLayoutAtomB, SmemLayoutAtomA>;
+  using InternalSmemCopyAtomA   = cute::conditional_t<!SwapAB, SmemCopyAtomA, SmemCopyAtomB>;
+  using InternalSmemCopyAtomB   = cute::conditional_t<!SwapAB, SmemCopyAtomB, SmemCopyAtomA>;
+  // f32 operands are reinterpreted as tfloat32_t; every other element type is mapped to the
+  // unsigned integer type of the same bit width. (Loads in this mainloop use gmem tiled copies, not TMA.)
+  static constexpr bool ConvertF32toTF32A = cute::is_same_v<float, ElementA>;
+  static constexpr bool ConvertF32toTF32B = cute::is_same_v<float, ElementB>;
+  using ConvertedElementA = cute::conditional_t<ConvertF32toTF32A, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementA>>>;
+  using ConvertedElementB = cute::conditional_t<ConvertF32toTF32B, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementB>>>;
+  using InternalElementA = cute::conditional_t<!SwapAB, ConvertedElementA, ConvertedElementB>;
+  using InternalElementB = cute::conditional_t<!SwapAB, ConvertedElementB, ConvertedElementA>;
+  using InternalStrideA  = cute::conditional_t<!SwapAB, StrideA, StrideB>;
+  using InternalStrideB  = cute::conditional_t<!SwapAB, StrideB, StrideA>;
+
+  using TransformA = TransformA_;
+  using TransformB = TransformB_;
+  using ArchTag = typename DispatchPolicy::ArchTag;
+
+  using MainloopPipeline = cutlass::PipelineAsync<DispatchPolicy::Stages>;
+  using PipelineState    = typename MainloopPipeline::PipelineState;
+  using PipelineParams   = typename MainloopPipeline::Params;
+
+  static_assert(cute::rank(InternalSmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<0>(TileShape{}) % size<0>(InternalSmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(InternalSmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+  static_assert(cute::rank(InternalSmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<1>(TileShape{}) % size<0>(InternalSmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(InternalSmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+  using SmemLayoutA = decltype(tile_to_shape(
+      InternalSmemLayoutAtomA{},
+      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{})));
+  using SmemLayoutB = decltype(tile_to_shape(
+      InternalSmemLayoutAtomB{},
+      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{})));
+
+  // If A mn-layout and B mn-layout, transposing B matrix since WGMMA is k-major only (e.g. tf32, fp32, fp8, int8).
+  static constexpr bool IsLayoutAmnBmn =
+    cute::is_same_v<gemm::detail::StrideToLayoutTagA_t<StrideA>, layout::ColumnMajor> &&
+    cute::is_same_v<gemm::detail::StrideToLayoutTagB_t<StrideB>, layout::RowMajor>;
+  static constexpr bool TransposeB = !IsInputSizeTwoBytes && IsLayoutAmnBmn;
+  using TransposeOperandB = decltype(cutlass::transform::collective::detail::make_transpose_operand_b(
+                                      0, 0, TiledMma{}, SmemLayoutB{}, InternalSmemLayoutAtomB{},
+                                      InternalElementB{}, cute::bool_constant<TransposeB>{})); 
+
+  static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 2 or more.");
+  static_assert(not cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeA>::value &&
+                    cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeB>::value,
+                "MMA atom must source A from rmem and B operand from smem_desc for this mainloop.");
+
+  using GmmaSmemLayoutAtomB = decltype(transform::collective::detail::gmma_smem_transpose_or_passthrough<
+      TransposeB, InternalSmemLayoutAtomB, InternalElementB>());
+
+  // With TransposeB, the smem layout GMMA reads B through differs from the SmemLayoutB that load() writes
+  using GmmaSmemLayoutB = decltype(tile_to_shape(
+      GmmaSmemLayoutAtomB{},
+      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{})));
+
+  static_assert(!SwapAB || !TransposeB, "Cannot SwapAB and TransposeB at the same time.");
+  static_assert(TransposeB xor (cute::is_same_v<SmemLayoutB, GmmaSmemLayoutB>),
+    "Should be same layout if not TransposeB.");
+  static_assert(!TransposeB || (cutlass::bits_to_bytes(size<1>(SmemLayoutB{}) * sizeof_bits<InternalElementB>::value)) == 128,
+    "SmemLayoutB K must be 128bytes to be transposed.");
+  static_assert(!transform::collective::detail::use_universal_transposition<InternalSmemLayoutAtomB, InternalElementB>(),
+    "Warp specialized ARF kernels have not supported universal B transposition yet.");
+
+  struct SharedStorage
+  {
+    struct TensorStorage : cute::aligned_struct<256, _0> { // operand buffers, 256-aligned
+      cute::array_aligned<typename TiledMma::ValTypeA, cute::cosize_v<SmemLayoutA>, 256> smem_A;   // sized by cosize of SmemLayoutA (all stages)
+      cute::array_aligned<typename TiledMma::ValTypeB, cute::cosize_v<SmemLayoutB>, 256> smem_B;   // sized by cosize of SmemLayoutB (all stages)
+    } tensors;
+    // Storage required by the MainloopPipeline itself lives next to the operand buffers
+    using PipelineStorage = typename MainloopPipeline::SharedStorage;
+    PipelineStorage pipeline;
+  };
+  using TensorStorage = typename SharedStorage::TensorStorage;       // re-exported: load() and mma() take this type
+  using PipelineStorage = typename SharedStorage::PipelineStorage;   // re-exported alias of the pipeline storage type
+
+  // Host side kernel arguments, expressed in the user-facing element and stride types
+  struct Arguments {
+    ElementA const* ptr_A = nullptr;         // pointer to the A operand
+    StrideA dA{};                            // stride of A
+    ElementB const* ptr_B = nullptr;         // pointer to the B operand
+    StrideB dB{};                            // stride of B
+    uint32_t mma_promotion_interval = 4;     // defaults to 4
+  };
+
+  // Device side kernel params, expressed in the Internal* types (A and B exchanged when SwapAB is set)
+  struct Params {
+    InternalElementA const* ptr_A = nullptr;   // pointer to the internal A operand
+    InternalStrideA dA{};                      // stride of the internal A operand
+    InternalElementB const* ptr_B = nullptr;   // pointer to the internal B operand
+    InternalStrideB dB{};                      // stride of the internal B operand
+    uint32_t mma_promotion_interval = 4;       // defaults to 4; mma() in this specialization does not read it
+  };
+
+  //
+  // Methods
+  //
+
+  /// Converts host Arguments into device Params. With SwapAB the A and B operands (pointer and
+  /// stride) trade places; mma_promotion_interval is forwarded unchanged instead of being dropped.
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(
+    [[maybe_unused]] ProblemShape const& problem_shape,
+    Arguments const& args,
+    [[maybe_unused]] void* workspace) {
+    if constexpr (not SwapAB) {
+      return {
+        reinterpret_cast<InternalElementA const*>(args.ptr_A), args.dA,
+        reinterpret_cast<InternalElementB const*>(args.ptr_B), args.dB,
+        args.mma_promotion_interval
+      };
+    }
+    else {
+      return {
+        reinterpret_cast<InternalElementA const*>(args.ptr_B), args.dB,
+        reinterpret_cast<InternalElementB const*>(args.ptr_A), args.dA,
+        args.mma_promotion_interval
+      };
+    }
+  }
+
+  /// Host-side check that A, viewed as (M,K,L), and B, viewed as (N,K,L), each meet the
+  /// alignment implied by NumValSrc of their gmem tiled copy. Returns false (and traces) otherwise.
+  template<class ProblemShape>
+  static bool
+  can_implement(
+      ProblemShape const& problem_shape,
+      [[maybe_unused]] Arguments const& args) {
+    auto [M,N,K,L] = append<4>(problem_shape, 1);   // a problem shape without an L mode gets L = 1
+    bool implementable = true;
+    implementable = implementable && cutlass::detail::check_alignment<GmemTiledCopyA::NumValSrc>(cute::make_shape(M,K,L), StrideA{});
+    implementable = implementable && cutlass::detail::check_alignment<GmemTiledCopyB::NumValSrc>(cute::make_shape(N,K,L), StrideB{});
+    // load() in this mainloop goes through gmem tiled copies, so the trace must not blame TMA
+    if (!implementable) {
+      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for the gmem tiled copies.\n");
+    }
+    return implementable;
+  }
+
+  static constexpr int K_PIPE_MAX = DispatchPolicy::Stages;
+  static constexpr int K_PIPE_MMAS = 1;
+  
+  /// Producer side of the collective mainloop: copies the A and B k-tiles from
+  /// gmem into the smem pipeline stages, one pipeline stage per k-tile
+  template <
+    class TensorA,
+    class TensorB,
+    class KTileIterator,
+    class ResidueMNK
+  >
+  CUTLASS_DEVICE void
+  load(
+      MainloopPipeline pipeline, 
+      PipelineState smem_pipe_write,
+      TensorA const& gA_in,
+      TensorB const& gB_in,
+      KTileIterator k_tile_iter, int k_tile_count,
+      ResidueMNK residue_mnk,
+      int thread_idx,
+      TensorStorage& shared_tensors)
+  {
+    using namespace cute;
+
+    static_assert(is_gmem<TensorA>::value, "A tensor must be gmem resident.");
+    static_assert(is_gmem<TensorB>::value, "B tensor must be gmem resident.");
+
+    Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});        // (BLK_M,BLK_K,PIPE)
+    Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});        // (BLK_N,BLK_K,PIPE)
+
+    // Shift tensor so residue_k is at origin (Can't read any k_coord < residue_k)
+    // This aligns the tensor with BLK_K for all but the 0th k_tile
+    Tensor gA = domain_offset(make_coord(0, get<2>(residue_mnk), 0), gA_in);
+    Tensor gB = domain_offset(make_coord(0, get<2>(residue_mnk), 0), gB_in);
+
+    // Partition the copying of A and B tiles across the threads
+    InternalGmemTiledCopyA gmem_tiled_copy_a;
+    InternalGmemTiledCopyB gmem_tiled_copy_b;
+    auto gmem_thr_copy_a = gmem_tiled_copy_a.get_slice(thread_idx);
+    auto gmem_thr_copy_b = gmem_tiled_copy_b.get_slice(thread_idx);
+
+    Tensor tAgA = gmem_thr_copy_a.partition_S(gA);                        // (ACPY,ACPY_M,ACPY_K,k)
+    Tensor tAsA = gmem_thr_copy_a.partition_D(sA);                        // (ACPY,ACPY_M,ACPY_K,PIPE)
+    Tensor tBgB = gmem_thr_copy_b.partition_S(gB);                        // (BCPY,BCPY_N,BCPY_K,k)
+    Tensor tBsB = gmem_thr_copy_b.partition_D(sB);                        // (BCPY,BCPY_N,BCPY_K,PIPE)
+
+    // Allocate predicate tensors for m and n (stride 0 in the k mode: one predicate per m/n, shared by every k)
+    Tensor tApA = make_tensor<bool>(make_shape(size<1>(tAsA), size<2>(tAsA)), Stride<_1,_0>{});
+    Tensor tBpB = make_tensor<bool>(make_shape(size<1>(tBsB), size<2>(tBsB)), Stride<_1,_0>{});
+
+    // Construct identity layout for sA and sB
+    Tensor cA = make_identity_tensor(make_shape(size<0>(sA), size<1>(sA)));    // (BLK_M,BLK_K) -> (blk_m,blk_k)
+    Tensor cB = make_identity_tensor(make_shape(size<0>(sB), size<1>(sB)));    // (BLK_N,BLK_K) -> (blk_n,blk_k)
+
+    // Repeat the partitioning with identity layouts
+    Tensor tAcA = gmem_thr_copy_a.partition_S(cA);                             // (ACPY,ACPY_M,ACPY_K) -> (blk_m,blk_k)
+    Tensor tBcB = gmem_thr_copy_b.partition_S(cB);                             // (BCPY,BCPY_N,BCPY_K) -> (blk_n,blk_k)
+
+    // Set predicates for m bounds
+    CUTLASS_PRAGMA_UNROLL
+    for (int m = 0; m < size<0>(tApA); ++m) {
+      tApA(m,0) = get<0>(tAcA(0,m,0)) < get<0>(residue_mnk);  // blk_m coord < residue_m
+    }
+    // Set predicates for n bounds
+    CUTLASS_PRAGMA_UNROLL
+    for (int n = 0; n < size<0>(tBpB); ++n) {
+      tBpB(n,0) = get<0>(tBcB(0,n,0)) < get<1>(residue_mnk);  // blk_n coord < residue_n
+    }
+
+    // 0-th stage with predication on k to account for residue
+    {
+      // LOCK smem_pipe_write for _writing_
+      pipeline.producer_acquire(smem_pipe_write);
+      int write_stage = smem_pipe_write.index();
+
+      // Copy gmem to smem for *k_tile_iter, predicating for k residue
+      Tensor tAgAk = tAgA(_,_,_,*k_tile_iter);
+      CUTLASS_PRAGMA_UNROLL
+      for (int k = 0; k < size<2>(tAsA); ++k) {
+        if (get<1>(tAcA(0,0,k)) >= -get<2>(residue_mnk)) {      // in bounds: blk_k coord >= -residue_k (gA shifted)
+          copy_if(gmem_tiled_copy_a, tApA(_,k), tAgAk(_,_,k), tAsA(_,_,k,write_stage));
+        }
+        else {
+          clear(tAsA(_,_,k,write_stage));                       // k slice not loaded: clear the smem slice instead
+        }
+      }
+      Tensor tBgBk = tBgB(_,_,_,*k_tile_iter);
+      CUTLASS_PRAGMA_UNROLL
+      for (int k = 0; k < size<2>(tBsB); ++k) {
+        if (get<1>(tBcB(0,0,k)) >= -get<2>(residue_mnk)) {      // in bounds: blk_k coord >= -residue_k (gB shifted)
+          copy_if(gmem_tiled_copy_b, tBpB(_,k), tBgBk(_,_,k), tBsB(_,_,k,write_stage));
+        }
+        else {
+          clear(tBsB(_,_,k,write_stage));                       // k slice not loaded: clear the smem slice instead
+        }
+      }
+      
+      ++k_tile_iter;
+      --k_tile_count;
+
+      // UNLOCK smem_pipe_write
+      pipeline.producer_commit(smem_pipe_write, cutlass::arch::cpasync_barrier_arrive);
+
+      // Advance smem_pipe_write
+      ++smem_pipe_write;
+    }
+
+    // Mainloop: remaining k-tiles need only the m/n predicates
+    CUTLASS_PRAGMA_NO_UNROLL
+    for ( ; k_tile_count > 0; --k_tile_count) {
+      // LOCK smem_pipe_write for _writing_
+      pipeline.producer_acquire(smem_pipe_write);
+      int write_stage = smem_pipe_write.index();
+
+      // Copy gmem to smem for *k_tile_iter
+      copy_if(gmem_tiled_copy_a, tApA, tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage));
+      copy_if(gmem_tiled_copy_b, tBpB, tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage));
+      ++k_tile_iter;
+
+      // UNLOCK smem_pipe_write
+      pipeline.producer_commit(smem_pipe_write, cutlass::arch::cpasync_barrier_arrive);
+
+      // Advance smem_pipe_write
+      ++smem_pipe_write;
+    }
+  }
+
+  /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster
+  CUTLASS_DEVICE void
+  load_tail(
+      MainloopPipeline pipeline, 
+      PipelineState smem_pipe_write) {
+    // Issue the epilogue waits
+    /* Keeps blocks in a Cluster from exiting early.
+     * Each stage is waited on until it has been released
+     * (all Consumer UNLOCKs); a stage that was never used
+     * is simply acquired, because its phase is still
+     * inverted from make_producer_start_state.
+     */
+    pipeline.producer_tail(smem_pipe_write);
+  }
+
+  /// Consumer side of the collective mainloop: multiply-accumulates k_tile_count k-tiles
+  /// into accum, with A copied smem->rmem per k-block and B read from smem by GMMA
+  template <
+    class FrgTensorC
+  >
+  CUTLASS_DEVICE void
+  mma(MainloopPipeline pipeline,
+      PipelineState smem_pipe_read,
+      FrgTensorC& accum,
+      int k_tile_count,
+      int thread_idx,
+      TensorStorage& shared_tensors,
+      Params const& mainloop_params)
+  {
+    using namespace cute;
+    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
+    static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3.");
+    static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3.");
+    static_assert(cute::rank(InternalSmemLayoutAtomA{}) == 2, "InternalSmemLayoutAtomA must be rank 2.");
+    static_assert(cute::rank(InternalSmemLayoutAtomB{}) == 2, "InternalSmemLayoutAtomB must be rank 2.");
+    static_assert(!cute::is_void_v<InternalSmemCopyAtomA>,
+      "SM90 GMMA mainloops must specify a non-void copy atom for smem sourced instructions.");
+    static_assert(cute::is_void_v<InternalSmemCopyAtomB>,
+      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
+
+    // Obtain warp index
+    int warp_idx = canonical_warp_idx_sync();
+    [[maybe_unused]] int warp_group_thread_idx = thread_idx % 128;
+    
+    Tensor sA_ = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});         // (BLK_M,BLK_K,PIPE)
+    Tensor sA  = as_position_independent_swizzle_tensor(sA_);                                     // (BLK_M,BLK_K,PIPE)
+    Tensor sB_ = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});         // (BLK_N,BLK_K,PIPE)
+    Tensor sB  = as_position_independent_swizzle_tensor(sB_);                                     // (BLK_N,BLK_K,PIPE)
+
+    // If TransposeB, GMMA will read from transposed B layout SMEM
+    Tensor gmma_sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), GmmaSmemLayoutB{}); // (BLK_N,BLK_K,PIPE)
+
+    //
+    // Define C accumulators and A/B partitioning
+    //
+
+    // Layout of warp group to thread mapping
+
+    static_assert(stride<0>(typename TiledMma::BLayout{}) == 0 and
+                  size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup, 
+                  "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
+
+    constexpr int MmaWarpGroups = size(TiledMma{}) / NumThreadsPerWarpGroup;
+    Layout warp_group_thread_layout = make_layout(Int<MmaWarpGroups>{}, 
+                                                  Int<NumThreadsPerWarpGroup>{});
+
+    int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / NumThreadsPerWarpGroup, 0);     // lane 0's value, broadcast to the warp
+
+    TiledMma tiled_mma;
+    auto mma_thread_slice = tiled_mma.get_thread_slice(thread_idx);                           // per-thread slice, used for A
+    auto mma_warpgroup_slice = tiled_mma.get_slice(warp_group_thread_layout(warp_group_idx)); // per-warp-group slice, used for B
+
+    // Allocate fragments and descriptors
+    Tensor tCsA = mma_thread_slice.partition_A(sA);                                           // (MMA,MMA_M,MMA_K,PIPE)
+    Tensor tCrA = mma_thread_slice.partition_fragment_A(sA(_,_,Int<0>{}));                    // (MMA,MMA_M,MMA_K)
+    Tensor tCsB = mma_warpgroup_slice.partition_B(gmma_sB);                                   // (MMA,MMA_N,MMA_K,PIPE)
+    Tensor tCrB = mma_warpgroup_slice.make_fragment_B(tCsB);                                  // (MMA,MMA_N,MMA_K,PIPE)
+
+    //
+    // Copy Atom A retiling
+    //
+
+
+    auto smem_tiled_copy_A = make_tiled_copy_A(InternalSmemCopyAtomA{}, tiled_mma);
+
+    auto smem_thr_copy_A   = smem_tiled_copy_A.get_thread_slice(thread_idx);
+
+    Tensor tCrA_copy_view  = smem_thr_copy_A.retile_D(tCrA);                                       // (CPY,CPY_M,CPY_K)
+
+    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(tCrA_copy_view));                                            // CPY_M
+    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCrA_copy_view));                                            // CPY_K
+    CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(accum));                                                     // MMA_M
+    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum));                                                         // N
+    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB));                                                          // K
+    CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB));                                                       // PIPE
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));                                         // PIPE
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));                                         // PIPE
+
+    //
+    // PIPELINED MAIN LOOP
+    //
+    static_assert((0 <= K_PIPE_MMAS) && (K_PIPE_MMAS <  K_PIPE_MAX),
+        "ERROR : Incorrect number of MMAs in flight");
+
+    // We release buffers to producer warps(dma load) with some mmas in flight
+    PipelineState smem_pipe_release = smem_pipe_read;
+
+    tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
+
+    TransposeOperandB transpose = cutlass::transform::collective::detail::make_transpose_operand_b(
+                                    warp_idx, warp_group_thread_idx, tiled_mma, SmemLayoutB{}, 
+                                    InternalSmemLayoutAtomB{}, InternalElementB{}, 
+                                    cute::bool_constant<TransposeB>{});
+
+    warpgroup_fence_operand(accum);
+    // first k tile
+    {
+      pipeline.consumer_wait(smem_pipe_read);
+
+      int read_stage = smem_pipe_read.index();
+
+      ++smem_pipe_read;
+
+      bool skip_wait = (pipeline.consumer_try_wait(smem_pipe_read) == BarrierStatus::WaitDone);
+
+      // copy smem->rmem for A operand
+      copy(smem_tiled_copy_A, tCsA(_,_,0,read_stage), tCrA_copy_view(_,_,0));
+      // transpose B operand in SMEM
+      transpose(sB, gmma_sB, read_stage, 0);
+
+      // Unroll the K mode manually to set scale D to 1
+      CUTLASS_PRAGMA_UNROLL
+      for (int k_block = 0; k_block < size<2>(tCrA) - 1; ++k_block) {
+        copy(smem_tiled_copy_A, tCsA(_,_,k_block + 1,read_stage), tCrA_copy_view(_,_,k_block + 1));   // fetch A for the next k_block
+        if (k_block == 0) {
+          transpose(sB, gmma_sB, read_stage, 1);
+          transpose.synchronize();
+        }
+        warpgroup_arrive();
+        // (V,M) x (V,N) => (V,M,N)
+        cute::gemm(tiled_mma, tCrA(_,_,k_block), tCrB(_,_,k_block,read_stage), accum);
+        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+        warpgroup_commit_batch();
+      }
+
+      warpgroup_wait<2>();
+      
+      
+      if (k_tile_count - 1 > 0) {                                   // another k-tile follows: stage its first A k-block and B transpose
+        if (!skip_wait) {
+          pipeline.consumer_wait(smem_pipe_read);
+        }
+        copy(smem_tiled_copy_A, tCsA(_,_,0,smem_pipe_read.index()), tCrA_copy_view(_,_,0));
+        transpose(sB, gmma_sB, smem_pipe_read.index(), 0);
+      }
+
+      warpgroup_arrive();
+      // (V,M) x (V,N) => (V,M,N)  -- last k_block of the first k-tile
+      cute::gemm(tiled_mma, tCrA(_,_,size<2>(tCrA) - 1), tCrB(_,_,size<2>(tCrA) - 1,read_stage), accum);
+      tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+      warpgroup_commit_batch();
+      warpgroup_wait<2>();
+    }
+
+    warpgroup_fence_operand(accum);
+    // Mainloop GMMAs: every remaining k-tile except the last one, which is handled after the loop
+    --k_tile_count;
+    CUTLASS_PRAGMA_NO_UNROLL
+    for ( ; k_tile_count > 1; --k_tile_count) {
+
+      //
+      // Compute on k_tile
+      //
+
+      int read_stage = smem_pipe_read.index();
+
+      ++smem_pipe_read;
+      bool skip_wait = (pipeline.consumer_try_wait(smem_pipe_read) == BarrierStatus::WaitDone);
+
+      warpgroup_fence_operand(accum);
+      // Unroll the K mode manually to set scale D to 1
+      CUTLASS_PRAGMA_UNROLL
+      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
+        if (k_block == size<2>(tCrA) - 1) {
+          // last k_block of this k-tile: stage the first A k-block of the next k-tile
+          if (!skip_wait) {
+            pipeline.consumer_wait(smem_pipe_read);
+          }
+          copy(smem_tiled_copy_A, tCsA(_,_,0,smem_pipe_read.index()), tCrA_copy_view(_,_,0));
+          // transpose B operand in SMEM
+          transpose(sB, gmma_sB, smem_pipe_read.index(), 0);
+        } else {
+          copy(smem_tiled_copy_A, tCsA(_,_,k_block + 1,read_stage), tCrA_copy_view(_,_,k_block + 1));
+          // transpose B operand in SMEM
+          if (k_block < 2) {
+            transpose.synchronize(k_block);                                      // make transpose of k_block available
+          }
+          if (k_block == 0) {
+            transpose(sB, gmma_sB, read_stage, 1);
+          }
+        }
+        
+        warpgroup_arrive();
+        // (V,M) x (V,N) => (V,M,N)
+        cute::gemm(tiled_mma, tCrA(_,_,k_block), tCrB(_,_,k_block,read_stage), accum);
+        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+        warpgroup_commit_batch();
+        warpgroup_wait<2>();
+        if (k_block == 1) {
+          // release prior barrier
+          pipeline.consumer_release(smem_pipe_release);             // UNLOCK smem_pipe_release, done _computing_ on it
+          ++smem_pipe_release;
+        }
+      }
+      warpgroup_fence_operand(accum);
+
+    }
+
+    warpgroup_fence_operand(accum);
+
+    if (k_tile_count > 0) {
+      //
+      // Compute on the final k_tile (no further stage is waited on or staged here)
+      //
+
+      int read_stage = smem_pipe_read.index();
+
+      warpgroup_fence_operand(accum);
+      // Unroll the K mode manually to set scale D to 1
+      CUTLASS_PRAGMA_UNROLL
+      for (int k_block = 0; k_block < size<2>(tCrA) - 1; ++k_block) {
+        copy(smem_tiled_copy_A, tCsA(_,_,k_block + 1,read_stage), tCrA_copy_view(_,_,k_block + 1));
+        if (k_block < 2) {
+          transpose.synchronize(k_block);                                           // make k_block transpose available
+        }
+        if (k_block == 0) {
+          transpose(sB, gmma_sB, read_stage, 1);
+        }
+        warpgroup_arrive();
+        // (V,M) x (V,N) => (V,M,N)
+        cute::gemm(tiled_mma, tCrA(_,_,k_block), tCrB(_,_,k_block,read_stage), accum);
+        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+        warpgroup_commit_batch();
+        warpgroup_wait<2>();
+        if (k_block == 1) {
+          // release prior barrier
+          pipeline.consumer_release(smem_pipe_release);             // UNLOCK smem_pipe_release, done _computing_ on it
+          ++smem_pipe_release;
+        }
+      }
+      
+      warpgroup_arrive();
+      // (V,M) x (V,N) => (V,M,N)  -- last k_block of the final k-tile
+      cute::gemm(tiled_mma, tCrA(_,_,size<2>(tCrA) - 1), tCrB(_,_,size<2>(tCrA) - 1,read_stage), accum);
+      tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+      warpgroup_commit_batch();
+      warpgroup_wait<2>();
+      warpgroup_fence_operand(accum);
+    }
+
+    warpgroup_fence_operand(accum);
+  }
+
+  /// Perform a Consumer Epilogue: wait for all GMMAs, then release the last min(K_PIPE_MMAS, k_tile_count) stages
+  CUTLASS_DEVICE void
+  mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) {
+    // Number of stages this function releases (at most K_PIPE_MMAS)
+    int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
+    k_tile_count -= prologue_mma_count;
+
+    smem_pipe_release.advance(k_tile_count);                        // step over the other stages (presumably released inside mma() -- confirm)
+    
+    // Wait on all GMMAs to complete
+    warpgroup_wait<0>();
+
+    for (int count = 0; count < prologue_mma_count; ++count) {
+      pipeline.consumer_release(smem_pipe_release);                 // UNLOCK smem_pipe_release, done _computing_ on it
+      ++smem_pipe_release;
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::gemm::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_ss_warpspecialized.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_ss_warpspecialized.hpp
new file mode 100755
index 000000000..e336bd475
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_ss_warpspecialized.hpp
@@ -0,0 +1,509 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cute/arch/cluster_sm90.hpp"
+#include "cute/arch/copy_sm90.hpp"
+#include "cutlass/gemm/dispatch_policy.hpp"
+
+#include "cute/algorithm/functional.hpp"
+#include "cute/atom/mma_atom.hpp"
+#include "cute/algorithm/gemm.hpp"
+#include "cute/tensor_predicate.hpp"
+#include "cute/numeric/arithmetic_tuple.hpp"
+#include "cutlass/pipeline/pipeline.hpp"
+#include "cutlass/trace.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective {
+using namespace cute;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// WarpSpecialized Mainloop
+template <
+  int Stages,
+  class ClusterShape_,
+  class TileShape_,
+  class KernelSchedule,
+  class ElementA_,
+  class StrideA_,
+  class ElementB_,
+  class StrideB_,
+  class TiledMma_,
+  class GmemTiledCopyA_,
+  class SmemLayoutAtomA_,
+  class SmemCopyAtomA_,
+  class TransformA_,
+  class GmemTiledCopyB_,
+  class SmemLayoutAtomB_,
+  class SmemCopyAtomB_,
+  class TransformB_>
+struct CollectiveMma<
+    MainloopSm90CpAsyncGmmaWarpSpecialized<Stages,ClusterShape_,KernelSchedule>,
+    TileShape_,
+    ElementA_,
+    StrideA_,
+    ElementB_,
+    StrideB_,
+    TiledMma_,
+    GmemTiledCopyA_,
+    SmemLayoutAtomA_,
+    SmemCopyAtomA_,
+    TransformA_,
+    GmemTiledCopyB_,
+    SmemLayoutAtomB_,
+    SmemCopyAtomB_,
+    TransformB_>
+{
+  //
+  // Type Aliases (re-exported template parameters plus the derived pipeline and smem layout types)
+  //
+  using DispatchPolicy = MainloopSm90CpAsyncGmmaWarpSpecialized<Stages,ClusterShape_,KernelSchedule>;
+  using TileShape = TileShape_;
+  using ClusterShape = ClusterShape_;
+  using ElementA = ElementA_;
+  using StrideA = StrideA_;
+  using ElementB = ElementB_;
+  using StrideB = StrideB_;
+  using TiledMma = TiledMma_;
+  using ElementAccumulator = typename TiledMma::ValTypeC;                      // accumulator type comes from the tiled MMA
+  using GmemTiledCopyA = GmemTiledCopyA_;
+  using GmemTiledCopyB = GmemTiledCopyB_;
+  using SmemLayoutAtomA = SmemLayoutAtomA_;
+  using SmemLayoutAtomB = SmemLayoutAtomB_;
+  using SmemCopyAtomA = SmemCopyAtomA_;                                        // must be void, see the static_asserts in mma()
+  using SmemCopyAtomB = SmemCopyAtomB_;                                        // must be void, see the static_asserts in mma()
+  using TransformA = TransformA_;
+  using TransformB = TransformB_;
+  using ArchTag = typename DispatchPolicy::ArchTag;
+
+  using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{}));       // TileShape divided by ClusterShape
+  using MainloopPipeline = cutlass::PipelineAsync<DispatchPolicy::Stages>;     // pipeline depth matches the smem stage count
+  using PipelineState    = typename MainloopPipeline::PipelineState;
+  using PipelineParams   = typename MainloopPipeline::Params;
+
+  // A operand: the layout atom is 2D and must tile the (M,K) extents of TileShape exactly
+  static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+  // B operand: the layout atom is 2D and must tile the (N,K) extents of TileShape exactly
+  static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+  using SmemLayoutA = decltype(tile_to_shape(                                  // (BLK_M,BLK_K,PIPE)
+      SmemLayoutAtomA{},
+      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{})));
+  using SmemLayoutB = decltype(tile_to_shape(                                  // (BLK_N,BLK_K,PIPE)
+      SmemLayoutAtomB{},
+      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{})));
+
+  static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 2 or more.");
+  static_assert(cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeA>::value &&
+                cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeB>::value,
+                "MMA atom must source both A and B operand from smem_desc for this mainloop.");
+
+  struct SharedStorage                                             // A/B stage buffers plus the mainloop pipeline storage
+  {
+    struct TensorStorage : cute::aligned_struct<128, _0> {         // operand buffers, read through make_smem_ptr in load()/mma()
+      cute::array_aligned<typename TiledMma::ValTypeA, cute::cosize_v<SmemLayoutA>> smem_A;   // sized by cosize of SmemLayoutA (all stages)
+      cute::array_aligned<typename TiledMma::ValTypeB, cute::cosize_v<SmemLayoutB>> smem_B;   // sized by cosize of SmemLayoutB (all stages)
+    } tensors;
+
+    using PipelineStorage = typename MainloopPipeline::SharedStorage;
+    PipelineStorage pipeline;                                      // storage type is defined by MainloopPipeline
+  };
+  using TensorStorage = typename SharedStorage::TensorStorage;     // re-exported at collective scope
+  using PipelineStorage = typename SharedStorage::PipelineStorage; // re-exported at collective scope
+
+  // Host side kernel arguments. None of these fields is read by a method of this specialization;
+  struct Arguments {                          // presumably the enclosing kernel consumes them - confirm against the kernel layer
+    ElementA const* ptr_A = nullptr;          // pointer to the A operand elements
+    StrideA dA{};                             // stride of A
+    ElementB const* ptr_B = nullptr;          // pointer to the B operand elements
+    StrideB dB{};                             // stride of B
+    uint32_t mma_promotion_interval = 4;      // defaults to 4; unused in this mainloop
+  };
+
+  // Device side kernel params: identical to Arguments, no host-to-device translation is needed
+  using Params = Arguments;
+
+  //
+  // Methods
+  //
+
+  template <class ProblemShape>                                    // Converts host Arguments to device Params (an identity here)
+  static constexpr Params
+  to_underlying_arguments(
+    [[maybe_unused]] ProblemShape const& problem_shape,            // ignored
+    Arguments const& args,                                         // copied into the result unchanged
+    [[maybe_unused]] void* workspace) {                            // ignored
+    return Params(args);                                           // Params is an alias of Arguments, so this is a plain copy
+  }
+
+  template<class ProblemShape>                                     // Host-side check of the A/B global-memory alignment
+  static bool                                                      // returns true only when both operands pass check_alignment
+  can_implement(
+      ProblemShape const& problem_shape,                           // problem extents; padded to rank 4 with 1s below
+      [[maybe_unused]] Arguments const& args) {                    // not inspected: strides are default-constructed for the check
+    auto problem_shape_MNKL = append<4>(problem_shape, 1);
+    auto [M,N,K,L] = problem_shape_MNKL;
+
+    bool implementable = true;                                     // alignment is counted in elements: GmemTiledCopy{A,B}::NumValSrc
+    implementable = implementable && cutlass::detail::check_alignment<GmemTiledCopyA::NumValSrc>(cute::make_shape(M,K,L), StrideA{});
+    implementable = implementable && cutlass::detail::check_alignment<GmemTiledCopyB::NumValSrc>(cute::make_shape(N,K,L), StrideB{});
+
+    if (!implementable) {                                          // this mainloop loads with cp.async, not TMA, so say so
+      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for cp.async.\n");
+    }
+    return implementable;
+  }
+
+  static constexpr int K_PIPE_MAX = DispatchPolicy::Stages;        // number of stages; mma() asserts K_PIPE_MMAS < K_PIPE_MAX
+  static constexpr int K_PIPE_MMAS = 1;                            // GMMA batches allowed to stay in flight (warpgroup_wait<K_PIPE_MMAS>)
+  
+  /// Producer side of the collective mainloop: copies k-tiles of A and B from gmem into the smem pipeline stages.
+  /// The first k-tile is always issued and is predicated on m/n and on the k residue; later k-tiles are predicated on m/n only.
+  template <
+    class TensorA,
+    class TensorB,
+    class KTileIterator,
+    class ResidueMNK
+  >
+  CUTLASS_DEVICE void
+  load(
+      MainloopPipeline pipeline,                  // pipeline used to acquire and commit stages
+      PipelineState smem_pipe_write,              // first stage to write; advanced locally (passed by value)
+      TensorA const& gA_in,                       // gmem tensor for A (checked by static_assert below)
+      TensorB const& gB_in,                       // gmem tensor for B (checked by static_assert below)
+      KTileIterator k_tile_iter, int k_tile_count, // k-tiles to load; one tile is loaded even before k_tile_count is tested
+      ResidueMNK residue_mnk,                     // bounds used for the m/n predicates and for the k shift of the first tile
+      int thread_idx,                             // selects this thread's slice of the tiled copies
+      TensorStorage& shared_tensors)              // destination smem buffers
+  {
+    using namespace cute;
+
+    static_assert(is_gmem<TensorA>::value, "A tensor must be gmem resident.");
+    static_assert(is_gmem<TensorB>::value, "B tensor must be gmem resident.");
+
+    Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});        // (BLK_M,BLK_K,PIPE)
+    Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});        // (BLK_N,BLK_K,PIPE)
+
+    // Shift tensor so residue_k is at origin (Can't read any k_coord < residue_k)
+    // This aligns the tensor with BLK_K for all but the 0th k_tile
+    Tensor gA = domain_offset(make_coord(0, get<2>(residue_mnk), 0), gA_in);
+    Tensor gB = domain_offset(make_coord(0, get<2>(residue_mnk), 0), gB_in);
+
+    // Partition the copying of A and B tiles across the threads
+    GmemTiledCopyA gmem_tiled_copy_a;
+    GmemTiledCopyB gmem_tiled_copy_b;
+    auto gmem_thr_copy_a = gmem_tiled_copy_a.get_slice(thread_idx);
+    auto gmem_thr_copy_b = gmem_tiled_copy_b.get_slice(thread_idx);
+
+    Tensor tAgA = gmem_thr_copy_a.partition_S(gA);                        // (ACPY,ACPY_M,ACPY_K,k)
+    Tensor tAsA = gmem_thr_copy_a.partition_D(sA);                        // (ACPY,ACPY_M,ACPY_K,PIPE)
+    Tensor tBgB = gmem_thr_copy_b.partition_S(gB);                        // (BCPY,BCPY_N,BCPY_K,k)
+    Tensor tBsB = gmem_thr_copy_b.partition_D(sB);                        // (BCPY,BCPY_N,BCPY_K,PIPE)
+
+    // Allocate predicate tensors for m and n; the k mode has stride 0, so one value per m (or n) covers every k
+    Tensor tApA = make_tensor<bool>(make_shape(size<1>(tAsA), size<2>(tAsA)), Stride<_1,_0>{});
+    Tensor tBpB = make_tensor<bool>(make_shape(size<1>(tBsB), size<2>(tBsB)), Stride<_1,_0>{});
+
+    // Construct identity layout for sA and sB
+    Tensor cA = make_identity_tensor(make_shape(size<0>(sA), size<1>(sA)));    // (BLK_M,BLK_K) -> (blk_m,blk_k)
+    Tensor cB = make_identity_tensor(make_shape(size<0>(sB), size<1>(sB)));    // (BLK_N,BLK_K) -> (blk_n,blk_k)
+
+    // Repeat the partitioning with identity layouts
+    Tensor tAcA = gmem_thr_copy_a.partition_S(cA);                             // (ACPY,ACPY_M,ACPY_K) -> (blk_m,blk_k)
+    Tensor tBcB = gmem_thr_copy_b.partition_S(cB);                             // (BCPY,BCPY_N,BCPY_K) -> (blk_n,blk_k)
+
+    // Set predicates for m bounds
+    CUTLASS_PRAGMA_UNROLL
+    for (int m = 0; m < size<0>(tApA); ++m) {
+      tApA(m,0) = get<0>(tAcA(0,m,0)) < get<0>(residue_mnk);  // blk_m coord < residue_m
+    }
+    // Set predicates for n bounds
+    CUTLASS_PRAGMA_UNROLL
+    for (int n = 0; n < size<0>(tBpB); ++n) {
+      tBpB(n,0) = get<0>(tBcB(0,n,0)) < get<1>(residue_mnk);  // blk_n coord < residue_n
+    }
+
+    // 0-th stage with predication on k to account for residue
+    {
+      // LOCK smem_pipe_write for _writing_
+      pipeline.producer_acquire(smem_pipe_write);
+      int write_stage = smem_pipe_write.index();
+
+      // Copy gmem to smem for *k_tile_iter, predicating for k residue
+      Tensor tAgAk = tAgA(_,_,_,*k_tile_iter);
+      CUTLASS_PRAGMA_UNROLL
+      for (int k = 0; k < size<2>(tAsA); ++k) {
+        if (get<1>(tAcA(0,0,k)) >= -get<2>(residue_mnk)) {      // blk_k coord >= -residue_k (gA shifted): copy it
+          copy_if(gmem_tiled_copy_a, tApA(_,k), tAgAk(_,_,k), tAsA(_,_,k,write_stage));
+        }
+        else {                                                  // k coord before the shifted origin: clear smem instead
+          clear(tAsA(_,_,k,write_stage));
+        }
+      }
+      Tensor tBgBk = tBgB(_,_,_,*k_tile_iter);
+      CUTLASS_PRAGMA_UNROLL
+      for (int k = 0; k < size<2>(tBsB); ++k) {
+        if (get<1>(tBcB(0,0,k)) >= -get<2>(residue_mnk)) {      // blk_k coord >= -residue_k (gB shifted): copy it
+          copy_if(gmem_tiled_copy_b, tBpB(_,k), tBgBk(_,_,k), tBsB(_,_,k,write_stage));
+        }
+        else {                                                  // k coord before the shifted origin: clear smem instead
+          clear(tBsB(_,_,k,write_stage));
+        }
+      }
+      ++k_tile_iter;
+      --k_tile_count;
+
+      // UNLOCK smem_pipe_write
+      pipeline.producer_commit(smem_pipe_write, cutlass::arch::cpasync_barrier_arrive);
+
+      // Advance smem_pipe_write
+      ++smem_pipe_write;
+    }
+
+    // Mainloop: remaining k-tiles, predicated on m/n only
+    CUTLASS_PRAGMA_NO_UNROLL
+    for ( ; k_tile_count > 0; --k_tile_count) {
+      // LOCK smem_pipe_write for _writing_
+      pipeline.producer_acquire(smem_pipe_write);
+      int write_stage = smem_pipe_write.index();
+
+      // Copy gmem to smem for *k_tile_iter
+      copy_if(gmem_tiled_copy_a, tApA, tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage));
+      copy_if(gmem_tiled_copy_b, tBpB, tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage));
+      ++k_tile_iter;
+
+      // UNLOCK smem_pipe_write
+      pipeline.producer_commit(smem_pipe_write, cutlass::arch::cpasync_barrier_arrive);
+
+      // Advance smem_pipe_write
+      ++smem_pipe_write;
+    }
+  }
+
+  /// Producer epilogue: waits on the pipeline stages so that a block in a cluster does not exit early.
+  CUTLASS_DEVICE void
+  load_tail(
+      MainloopPipeline pipeline, 
+      PipelineState smem_pipe_write) {
+    // Issue the epilogue waits
+    /* This helps avoid an early exit of blocks in a cluster.
+     * It waits for every stage to be released (all consumer
+     * UNLOCKs). A stage that was never used is simply
+     * acquired, because its phase is still inverted from
+     * make_producer_start_state.
+     */
+    pipeline.producer_tail(smem_pipe_write);
+  }
+
+  /// Consumer side of the collective mainloop: issues GMMAs over k_tile_count k-tiles read from the smem stages into accum.
+  /// The stage(s) consumed by the prologue are not released here; mma_tail() releases them after waiting on all GMMAs.
+  template <
+    class FrgTensorC
+  >
+  CUTLASS_DEVICE void
+  mma(MainloopPipeline pipeline,                  // pipeline used to wait on and release stages
+      PipelineState smem_pipe_read,               // first stage to read; advanced locally (passed by value)
+      FrgTensorC& accum,                          // rmem accumulator; the first GMMA is issued with ScaleOut::Zero
+      int k_tile_count,                           // number of k-tiles to consume; asserted to be >= 1
+      int thread_idx,                             // used only to derive the warp group index
+      TensorStorage& shared_tensors,              // smem buffers filled by load()
+      Params const& mainloop_params)              // not referenced in this body
+  {
+    using namespace cute;
+
+    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
+    static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3.");
+    static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3.");
+    static_assert(cute::is_void_v<SmemCopyAtomA>,
+      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
+    static_assert(cute::is_void_v<SmemCopyAtomB>,
+      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
+
+    Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});          // (BLK_M,BLK_K,PIPE)
+    Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});          // (BLK_N,BLK_K,PIPE)
+
+    //
+    // Define C accumulators and A/B partitioning
+    //
+
+    // Layout of warp group to thread mapping
+
+    static_assert(stride<0>(typename TiledMma::ALayout{}) == 0 and 
+                  stride<0>(typename TiledMma::BLayout{}) == 0 and
+                  size<0>(typename TiledMma::ALayout{}) == NumThreadsPerWarpGroup and
+                  size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup, 
+                  "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
+
+    constexpr int MmaWarpGroups = size(TiledMma{}) / NumThreadsPerWarpGroup;
+    Layout warp_group_thread_layout = make_layout(Int<MmaWarpGroups>{}, 
+                                                  Int<NumThreadsPerWarpGroup>{});
+
+    int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / NumThreadsPerWarpGroup, 0);   // take lane 0's value for the whole warp
+
+    TiledMma tiled_mma;
+    auto thread_mma = tiled_mma.get_slice(warp_group_thread_layout(warp_group_idx));
+
+    Tensor tCsA = thread_mma.partition_A(sA);                                                 // (MMA,MMA_M,MMA_K,PIPE)
+    Tensor tCsB = thread_mma.partition_B(sB);                                                 // (MMA,MMA_N,MMA_K,PIPE)
+
+    // Allocate "fragments/descriptors"
+    Tensor tCrA = thread_mma.make_fragment_A(tCsA);                                           // (MMA,MMA_M,MMA_K,PIPE)
+    Tensor tCrB = thread_mma.make_fragment_B(tCsB);                                           // (MMA,MMA_N,MMA_K,PIPE)
+
+    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum));                                                         // M
+    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum));                                                         // N
+    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB));                                                          // K
+    CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB));                                                       // PIPE
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));                                         // PIPE
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));                                         // PIPE
+
+    //
+    // PIPELINED MAIN LOOP
+    //
+    static_assert((0 <= K_PIPE_MMAS) && (K_PIPE_MMAS <  K_PIPE_MAX),
+        "ERROR : Incorrect number of MMAs in flight");
+
+    // We release buffers to producer warps(dma load) with some mmas in flight,
+    // so the release state trails the read state by the prologue's tile count
+    PipelineState smem_pipe_release = smem_pipe_read;
+
+    // Prologue GMMAs
+    int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
+    assert(k_tile_count >= 1);
+    tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;                 // first GMMA does not add the prior accum contents
+    warpgroup_fence_operand(accum);
+    {
+      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
+      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+      pipeline.consumer_wait(smem_pipe_read, barrier_token);
+
+      int read_stage = smem_pipe_read.index();
+
+      warpgroup_arrive();
+
+      // Unroll the K mode manually to set scale D to 1
+      CUTLASS_PRAGMA_UNROLL
+      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
+        // (V,M,K) x (V,N,K) => (V,M,N)
+        cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accum);
+        tiled_mma.accumulate_ = GMMA::ScaleOut::One;              // every later GMMA accumulates into accum
+      }
+
+      warpgroup_commit_batch();
+
+      ++smem_pipe_read;
+    }
+
+    warpgroup_fence_operand(accum);
+    CUTLASS_PRAGMA_UNROLL
+    for (int k_tile_prologue = prologue_mma_count - 1; k_tile_prologue > 0; --k_tile_prologue) {   // no iterations while K_PIPE_MMAS == 1
+
+      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
+      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+      pipeline.consumer_wait(smem_pipe_read, barrier_token);
+
+      int read_stage = smem_pipe_read.index();
+
+      warpgroup_arrive();
+
+      // (V,M,K) x (V,N,K) => (V,M,N)
+      cute::gemm(tiled_mma, tCrA(_,_,_,read_stage), tCrB(_,_,_,read_stage), accum);
+
+      warpgroup_commit_batch();
+
+      ++smem_pipe_read;
+    }
+
+    warpgroup_fence_operand(accum);
+
+    // Mainloop GMMAs
+    k_tile_count -= prologue_mma_count;
+
+    CUTLASS_PRAGMA_NO_UNROLL
+    for ( ; k_tile_count > 0; --k_tile_count) {
+
+      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
+      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+      pipeline.consumer_wait(smem_pipe_read, barrier_token);
+
+      int read_stage = smem_pipe_read.index();
+      
+      warpgroup_fence_operand(accum);
+      warpgroup_arrive();
+      // (V,M,K) x (V,N,K) => (V,M,N)
+      cute::gemm(tiled_mma, tCrA(_,_,_,read_stage), tCrB(_,_,_,read_stage), accum);
+      warpgroup_commit_batch();
+
+      /// Wait until at most K_PIPE_MMAS GMMA batches are outstanding, so the stage at smem_pipe_release is no longer in use
+      warpgroup_wait<K_PIPE_MMAS>();
+      warpgroup_fence_operand(accum);
+
+      // UNLOCK smem_pipe_release, done _computing_ on it
+      pipeline.consumer_release(smem_pipe_release);
+
+      // Advance smem_pipe_read and smem_pipe_release
+      ++smem_pipe_read;
+      ++smem_pipe_release;
+    }
+
+    warpgroup_fence_operand(accum);
+  }
+
+  /// Consumer epilogue: waits for every outstanding GMMA, then releases the last min(K_PIPE_MMAS, k_tile_count) stages.
+  CUTLASS_DEVICE void
+  mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) {
+    // Number of stages still to be released here (mma() releases one stage per mainloop iteration, none in its prologue)
+    int const stages_to_release = min(K_PIPE_MMAS, k_tile_count);
+
+    // Step the release state over the k-tiles that are not released by this function
+    smem_pipe_release.advance(k_tile_count - stages_to_release);
+
+    // Wait on all GMMAs to complete
+    warpgroup_wait<0>();
+
+    for (int remaining = stages_to_release; remaining > 0; --remaining) {
+      pipeline.consumer_release(smem_pipe_release);                 // UNLOCK smem_pipe_release, done _computing_ on it
+      ++smem_pipe_release;
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::gemm::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized.hpp
new file mode 100755
index 000000000..b30fed1c8
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized.hpp
@@ -0,0 +1,752 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/detail/dependent_false.hpp"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/detail/layout.hpp"
+#include "cutlass/numeric_types.h"
+#include "cutlass/pipeline/pipeline.hpp"
+#include "cutlass/transform/collective/sm90_wgmma_transpose.hpp"
+#include "cutlass/trace.h"
+
+#include "cute/arch/cluster_sm90.hpp"
+#include "cute/arch/copy_sm90.hpp"
+#include "cute/algorithm/functional.hpp"
+#include "cute/atom/mma_atom.hpp"
+#include "cute/algorithm/gemm.hpp"
+#include "cute/tensor_predicate.hpp"
+#include "cute/numeric/arithmetic_tuple.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective {
+using namespace cute;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// WarpSpecialized Mainloop that sources the A operand from registers
+template <
+  int Stages,
+  class ClusterShape,
+  class KernelSchedule,
+  class TileShape_,
+  class ElementA_,
+  class StrideA_,
+  class ElementB_,
+  class StrideB_,
+  class TiledMma_,
+  class GmemTiledCopyA_,
+  class SmemLayoutAtomA_,
+  class SmemCopyAtomA_,
+  class TransformA_,
+  class GmemTiledCopyB_,
+  class SmemLayoutAtomB_,
+  class SmemCopyAtomB_,
+  class TransformB_>
+struct CollectiveMma<
+    MainloopSm90TmaGmmaRmemAWarpSpecialized<Stages, ClusterShape, KernelSchedule>,
+    TileShape_,
+    ElementA_,
+    StrideA_,
+    ElementB_,
+    StrideB_,
+    TiledMma_,
+    GmemTiledCopyA_,
+    SmemLayoutAtomA_,
+    SmemCopyAtomA_,
+    TransformA_,
+    GmemTiledCopyB_,
+    SmemLayoutAtomB_,
+    SmemCopyAtomB_,
+    TransformB_>
+{
+  //
+  // Type Aliases
+  //
+  using DispatchPolicy = MainloopSm90TmaGmmaRmemAWarpSpecialized<Stages, ClusterShape, KernelSchedule>;
+  using TileShape = TileShape_;
+  using ElementA = ElementA_;
+  using StrideA = StrideA_;
+  using ElementB = ElementB_;
+  using StrideB = StrideB_;
+  using TiledMma = TiledMma_;
+  using ElementAccumulator = typename TiledMma::ValTypeC;
+  using GmemTiledCopyA = GmemTiledCopyA_;
+  using GmemTiledCopyB = GmemTiledCopyB_;
+  using SmemLayoutAtomA = SmemLayoutAtomA_;
+  using SmemLayoutAtomB = SmemLayoutAtomB_;
+  using SmemCopyAtomA = SmemCopyAtomA_;
+  using SmemCopyAtomB = SmemCopyAtomB_;
+
+  using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{}));
+  // Swap and transpose A/B for A k-major layout and B mn-major layout since WGMMA is k-major only
+  // (e.g. tf32, Fp32, Int8, Fp8 WGMMA)
+  static constexpr bool IsLayoutAkBmn =
+    cute::is_same_v<gemm::detail::StrideToLayoutTagA_t<StrideA>, layout::RowMajor> &&
+    cute::is_same_v<gemm::detail::StrideToLayoutTagB_t<StrideB>, layout::RowMajor>;
+
+  static constexpr bool IsInputSizeTwoBytes = sizeof(ElementA) == 2 && sizeof(ElementB) == 2;
+  static constexpr bool SwapAB =  !IsInputSizeTwoBytes && IsLayoutAkBmn;
+  using InternalSmemLayoutAtomA = cute::conditional_t<!SwapAB, SmemLayoutAtomA, SmemLayoutAtomB>;
+  using InternalSmemLayoutAtomB = cute::conditional_t<!SwapAB, SmemLayoutAtomB, SmemLayoutAtomA>;
+  using InternalSmemCopyAtomA   = cute::conditional_t<!SwapAB, SmemCopyAtomA, SmemCopyAtomB>;
+  using InternalSmemCopyAtomB   = cute::conditional_t<!SwapAB, SmemCopyAtomB, SmemCopyAtomA>;
+  // TMA converts f32 input to tf32 when copying from GMEM to SMEM
+  // For all other types, cast to size equivalent uint type to avoid any rounding by TMA.
+  static constexpr bool ConvertF32toTF32A = cute::is_same_v<float, ElementA>;
+  static constexpr bool ConvertF32toTF32B = cute::is_same_v<float, ElementB>;
+  using ConvertedElementA = cute::conditional_t<ConvertF32toTF32A, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementA>>>;
+  using ConvertedElementB = cute::conditional_t<ConvertF32toTF32B, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementB>>>;
+  using InternalElementA = cute::conditional_t<!SwapAB, ConvertedElementA, ConvertedElementB>;
+  using InternalElementB = cute::conditional_t<!SwapAB, ConvertedElementB, ConvertedElementA>;
+  using InternalStrideA  = cute::conditional_t<!SwapAB, StrideA, StrideB>;
+  using InternalStrideB  = cute::conditional_t<!SwapAB, StrideB, StrideA>;
+
+  using TransformA = TransformA_;
+  using TransformB = TransformB_;
+  using ArchTag = typename DispatchPolicy::ArchTag;
+
+  using MainloopPipeline = cutlass::PipelineTmaAsync<DispatchPolicy::Stages>;
+  using PipelineState = cutlass::PipelineState<DispatchPolicy::Stages>;
+
+  using PipelineParams = typename MainloopPipeline::Params;
+
+  static_assert(cute::rank(InternalSmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<0>(TileShape{}) % size<0>(InternalSmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(InternalSmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+  static_assert(cute::rank(InternalSmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<1>(TileShape{}) % size<0>(InternalSmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(InternalSmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+  // Tile along modes in a way that maximizes the TMA box size.
+  using SmemLayoutA = decltype(tile_to_shape(
+      InternalSmemLayoutAtomA{},
+      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
+      cute::conditional_t< ::cutlass::gemm::detail::is_major<0,InternalStrideA>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
+  using SmemLayoutB = decltype(tile_to_shape(
+      InternalSmemLayoutAtomB{},
+      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
+      cute::conditional_t< ::cutlass::gemm::detail::is_major<0,InternalStrideB>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
+
+  // If A mn-layout and B mn-layout, transposing B matrix since WGMMA is k-major only (e.g. tf32, fp32, fp8, int8).
+  static constexpr bool IsLayoutAmnBmn =
+    cute::is_same_v<gemm::detail::StrideToLayoutTagA_t<StrideA>, layout::ColumnMajor> &&
+    cute::is_same_v<gemm::detail::StrideToLayoutTagB_t<StrideB>, layout::RowMajor>;
+  static constexpr bool TransposeB = !IsInputSizeTwoBytes && IsLayoutAmnBmn;
+  using TransposeOperandB = decltype(cutlass::transform::collective::detail::make_transpose_operand_b(
+                                      0, 0, TiledMma{}, SmemLayoutB{}, InternalSmemLayoutAtomB{},
+                                      InternalElementB{}, cute::bool_constant<TransposeB>{})); 
+
+  static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 2 or more.");
+  static_assert(not cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeA>::value &&
+                    cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeB>::value,
+                "MMA atom must source A from rmem and B operand from smem_desc for this mainloop.");
+  static_assert(cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>,
+      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
+  static_assert(cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>,
+      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
+
+  using GmmaSmemLayoutAtomB = decltype(transform::collective::detail::gmma_smem_transpose_or_passthrough<
+      TransposeB, InternalSmemLayoutAtomB, InternalElementB>());
+
+  // SmemLayoutB for GMMA is different from SmemLayoutB for TMA if TransposeB
+  using GmmaSmemLayoutB = decltype(tile_to_shape(
+      GmmaSmemLayoutAtomB{},
+      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
+      cute::conditional_t< ::cutlass::gemm::detail::is_major<0,InternalStrideB>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
+
+  static_assert(!SwapAB || !TransposeB, "Cannot SwapAB and TransposeB at the same time.");
+  static_assert(TransposeB xor (cute::is_same_v<SmemLayoutB, GmmaSmemLayoutB>),
+    "Should be same layout if not TransposeB.");
+  static_assert(!TransposeB || (cutlass::bits_to_bytes((size<1>(SmemLayoutB{}) * sizeof_bits<InternalElementB>::value))) == 128,
+    "SmemLayoutB K must be 128bytes to be transposed.");
+
+  // True only when B is transposed AND use_universal_transposition<> reports true for this B layout atom / element type.
+  static constexpr bool uses_universal_transposition() {
+    if constexpr (not TransposeB) {
+      return false;                       // B is never transposed, so the helper is not consulted at all
+    } else {
+      return transform::collective::detail::use_universal_transposition<InternalSmemLayoutAtomB, InternalElementB>();
+    }
+  }
+
+  static_assert(!uses_universal_transposition(),   // this specialization rejects the universal B transposition path at compile time
+    "Warp specialized ARF kernels have not supported universal B transposition yet.");
+  // Alignment of each operand's shared-memory buffer, as computed by alignment_for_swizzle from its smem layout.
+  static constexpr size_t SmemAlignmentA = cutlass::detail::alignment_for_swizzle(SmemLayoutA{}); 
+
+  static constexpr size_t SmemAlignmentB = cutlass::detail::alignment_for_swizzle(SmemLayoutB{});
+  // Both operand buffers must be aligned to at least 128 bytes.
+  static_assert(SmemAlignmentA >= 128 and SmemAlignmentB >= 128, "Require at least 128B alignment");
+
+  struct SharedStorage   // shared-memory footprint of this collective: the operand buffers plus the pipeline's own storage
+  {
+    struct TensorStorage : cute::aligned_struct<cute::max(SmemAlignmentA, SmemAlignmentB), _0> {   // aligned to the stricter of the two operand alignments
+      cute::array_aligned<typename TiledMma::ValTypeA, cute::cosize_v<SmemLayoutA>, SmemAlignmentA> smem_A;   // A buffer; SmemLayoutA is rank-3 with Stages as its last mode (asserted in mma)
+      cute::array_aligned<typename TiledMma::ValTypeB, cute::cosize_v<SmemLayoutB>, SmemAlignmentB> smem_B;   // B buffer; also viewed through GmmaSmemLayoutB in mma
+    } tensors;
+
+    using PipelineStorage = typename MainloopPipeline::SharedStorage;
+    PipelineStorage pipeline;   // storage owned by MainloopPipeline
+  };
+  using TensorStorage = typename SharedStorage::TensorStorage;       // re-exported at collective scope
+  using PipelineStorage = typename SharedStorage::PipelineStorage;   // re-exported at collective scope
+
+  // Host side kernel arguments; converted to device-side Params by to_underlying_arguments()
+  struct Arguments {
+    ElementA const* ptr_A = nullptr;       // pointer to the caller's A operand
+    StrideA dA{};                          // stride of A; paired with shape (M,K,L) when not SwapAB
+    ElementB const* ptr_B = nullptr;       // pointer to the caller's B operand
+    StrideB dB{};                          // stride of B; paired with shape (N,K,L) when not SwapAB
+    uint32_t mma_promotion_interval = 4;   // NOTE(review): not read by any code visible in this part of the collective
+  };
+
+  // Device side kernel params; produced by to_underlying_arguments(), which aggregate-initialises the members in declaration order
+  struct Params {
+    // Assumption: StrideA is congruent with Problem_MK
+    using TMA_A = decltype(make_tma_copy_A_sm90(   // type only: deduced from a null-pointer tensor with all-zero int32 extents
+        GmemTiledCopyA{},
+        make_tensor(static_cast<InternalElementA const*>(nullptr), repeat_like(InternalStrideA{}, int32_t(0)), InternalStrideA{}),
+        SmemLayoutA{}(_,_,cute::Int<0>{}),         // one pipeline stage of the A smem layout
+        TileShape{},
+        ClusterShape{}));
+    // Assumption: StrideB is congruent with Problem_NK
+    using TMA_B = decltype(make_tma_copy_B_sm90(   // type only: deduced from a null-pointer tensor with all-zero int32 extents
+        GmemTiledCopyB{},
+        make_tensor(static_cast<InternalElementB const*>(nullptr), repeat_like(InternalStrideB{}, int32_t(0)), InternalStrideB{}),
+        SmemLayoutB{}(_,_,cute::Int<0>{}),         // one pipeline stage of the B smem layout
+        TileShape{},
+        ClusterShape{}));
+    TMA_A tma_load_a;                                             // TMA copy object for A
+    TMA_B tma_load_b;                                             // TMA copy object for B
+    uint32_t tma_transaction_bytes = TmaTransactionBytes;         // A + B bytes of one pipeline stage
+    uint32_t tma_transaction_bytes_mk = TmaTransactionBytesMK;    // A bytes of one pipeline stage
+    uint32_t tma_transaction_bytes_nk = TmaTransactionBytesNK;    // B bytes of one pipeline stage
+  };
+
+  //
+  // Methods
+  //
+
+  /// Convert host-side Arguments into device-side Params: builds the TMA copy objects for A and B and
+  /// records the per-stage TMA transaction byte counts. workspace is ignored.
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    (void) workspace;
+
+    // A rank-3 (MNK) problem shape is padded with a trailing 1 so that it is always rank-4 (MNKL)
+    auto problem_shape_MNKL = append<4>(problem_shape, 1);
+    auto [M,N,K,L] = problem_shape_MNKL;
+
+    InternalElementA const* ptr_A;
+    InternalStrideA dA;
+    InternalElementB const* ptr_B;
+    InternalStrideB dB;
+
+    if constexpr (SwapAB) {
+      // The internal A operand is the caller's B and vice versa, so the M and N extents trade places too
+      M = get<1>(problem_shape_MNKL);
+      N = get<0>(problem_shape_MNKL);
+      ptr_A = reinterpret_cast<InternalElementA const*>(args.ptr_B);
+      ptr_B = reinterpret_cast<InternalElementB const*>(args.ptr_A);
+      dA = args.dB;
+      dB = args.dA;
+    }
+    else {
+      ptr_A = reinterpret_cast<InternalElementA const*>(args.ptr_A);
+      ptr_B = reinterpret_cast<InternalElementB const*>(args.ptr_B);
+      dA = args.dA;
+      dB = args.dB;
+    }
+
+    // Global-memory views of the two operands that the TMA copy objects are built from
+    Tensor gmem_A = make_tensor(ptr_A, make_layout(make_shape(M,K,L), dA));
+    Tensor gmem_B = make_tensor(ptr_B, make_layout(make_shape(N,K,L), dB));
+
+    // Each TMA copy object is built against a single pipeline stage (index 0) of the operand's smem layout
+    typename Params::TMA_A tma_load_a = make_tma_copy_A_sm90(
+        GmemTiledCopyA{},
+        gmem_A,
+        SmemLayoutA{}(_,_,cute::Int<0>{}),
+        TileShape{},
+        ClusterShape{});
+    typename Params::TMA_B tma_load_b = make_tma_copy_B_sm90(
+        GmemTiledCopyB{},
+        gmem_B,
+        SmemLayoutB{}(_,_,cute::Int<0>{}),
+        TileShape{},
+        ClusterShape{});
+
+    uint32_t const bytes_mk = TmaTransactionBytesMK;
+    uint32_t const bytes_nk = TmaTransactionBytesNK;
+    uint32_t const bytes_total = bytes_mk + bytes_nk;
+    return {
+      tma_load_a, tma_load_b,
+      bytes_total, bytes_mk, bytes_nk
+    };
+  }
+
+  /// Host-side check that A with shape (M,K,L) and B with shape (N,K,L) satisfy the 128-bit TMA alignment; args is not inspected.
+  template<class ProblemShape>
+  static bool
+  can_implement(
+      ProblemShape const& problem_shape,
+      [[maybe_unused]] Arguments const& args) {
+    constexpr int tma_alignment_bits = 128;
+    constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;   // elements of A per 128 bits
+    constexpr int min_tma_aligned_elements_B = tma_alignment_bits / cutlass::sizeof_bits<ElementB>::value;   // elements of B per 128 bits
+
+    auto [M,N,K,L] = append<4>(problem_shape, 1);                  // rank-3 (MNK) shapes gain a trailing L = 1
+    bool const meets_alignment =                                   // B is only checked when A already passed
+        cutlass::detail::check_alignment<min_tma_aligned_elements_A>(cute::make_shape(M,K,L), StrideA{}) &&
+        cutlass::detail::check_alignment<min_tma_aligned_elements_B>(cute::make_shape(N,K,L), StrideB{});
+    if (meets_alignment) {
+      return true;
+    }
+    CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
+    return false;
+  }
+
+  static constexpr int K_PIPE_MAX = DispatchPolicy::Stages;        // number of pipeline stages
+  static constexpr uint32_t TmaTransactionBytesMK =                // bytes of one A stage: size<0> * size<1> of SmemLayoutA times the element width
+        cutlass::bits_to_bytes(size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast<uint32_t>(sizeof_bits<InternalElementA>::value));
+  static constexpr uint32_t TmaTransactionBytesNK =                // bytes of one B stage: size<0> * size<1> of SmemLayoutB times the element width
+        cutlass::bits_to_bytes(size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast<uint32_t>(sizeof_bits<InternalElementB>::value)) ;
+  static constexpr uint32_t TmaTransactionBytes = TmaTransactionBytesMK + TmaTransactionBytesNK;   // A + B bytes of one stage
+
+  /// Issue a TMA descriptor prefetch for A, then for B -- ideally from a single thread for best performance
+  CUTLASS_DEVICE
+  static void prefetch_tma_descriptors(Params const& mainloop_params) {
+    cute::prefetch_tma_descriptor(mainloop_params.tma_load_a.get_tma_descriptor());   // descriptor held by the A TMA copy object
+    cute::prefetch_tma_descriptor(mainloop_params.tma_load_b.get_tma_descriptor());   // descriptor held by the B TMA copy object
+  }
+
+  /// Build the tiled tensors that load() later slices per block.
+  /// Contract with the kernel layer: the returned tuple holds at least two tensors, and the first two are
+  ///   [0] the A TMA tensor after a local tile, shape (BLK_M,BLK_K,m,k,l)
+  ///   [1] the B TMA tensor after a local tile, shape (BLK_N,BLK_K,n,k,l)
+  /// Further tensors may follow if a collective needs them; this one returns exactly these two.
+  /// problem_shape_MNKL is unpacked into four extents (M,N,K,L).
+  template <class ProblemShape_MNKL>
+  CUTLASS_DEVICE auto
+  load_init(ProblemShape_MNKL const& problem_shape_MNKL, Params const& mainloop_params) const {
+    // Unpack the four problem extents
+    auto [M,N,K,L] = problem_shape_MNKL;
+
+    // TMA requires special handling of strides (coord codomain mapping), so the
+    // full-problem tensors are obtained from the TMA copy objects themselves
+    Tensor full_A = mainloop_params.tma_load_a.get_tma_tensor(make_shape(M,K,L));                       // (m,k,l)
+    Tensor full_B = mainloop_params.tma_load_b.get_tma_tensor(make_shape(N,K,L));                       // (n,k,l)
+
+    // Tile by TileShape with every tile coordinate left free; the slice is deferred to load()
+    Tensor tiled_A = local_tile(full_A, TileShape{}, make_coord(_,_,_), Step<_1, Underscore,_1>{});     // (BLK_M,BLK_K,m,k,l)
+    Tensor tiled_B = local_tile(full_B, TileShape{}, make_coord(_,_,_), Step< Underscore,_1,_1>{});     // (BLK_N,BLK_K,n,k,l)
+
+    // A first, B second, as the kernel layer expects
+    return cute::make_tuple(tiled_A, tiled_B);
+  }
+
+  /// Producer side of the mainloop: the elected lane issues the TMA loads of the A and B tiles for k_tile_count
+  /// k-tiles, one pipeline stage per k-tile starting at smem_pipe_write. thread_idx is accepted but not used here.
+  template <
+    class TensorA, class TensorB,
+    class KTileIterator, class BlockCoord
+  >
+  CUTLASS_DEVICE void
+  load(
+      Params const& mainloop_params,
+      MainloopPipeline pipeline,
+      PipelineState smem_pipe_write,
+      cute::tuple<TensorA, TensorB> const& load_inputs,
+      BlockCoord const& blk_coord,
+      KTileIterator k_tile_iter, int k_tile_count,
+      int thread_idx,
+      uint32_t block_rank_in_cluster,
+      TensorStorage& shared_tensors) {
+    int lane_predicate = cute::elect_one_sync();                                                  // non-zero only in the lane picked by elect_one_sync()
+
+    if (lane_predicate) {
+      Tensor sA_ = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});       // (BLK_M,BLK_K,PIPE)
+      Tensor sB_ = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});       // (BLK_N,BLK_K,PIPE)
+      Tensor sA  = as_position_independent_swizzle_tensor(sA_);                                   // (BLK_M,BLK_K,PIPE)
+      Tensor sB  = as_position_independent_swizzle_tensor(sB_);                                   // (BLK_N,BLK_K,PIPE)
+
+      //
+      // Prepare the TMA loads for A and B
+      //
+      // Split the block's rank within the cluster into (x, y) using the cluster's mode-0 extent
+      constexpr uint32_t cluster_shape_x = get<0>(ClusterShape());
+      uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x};
+
+      Tensor gA_mkl = get<0>(load_inputs);                                                        // tiled A as returned by load_init
+      Tensor gB_nkl = get<1>(load_inputs);                                                        // tiled B as returned by load_init
+      // A's TMA slice is selected by the block's y index, B's by its x index
+      auto block_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y);
+      auto block_tma_b = mainloop_params.tma_load_b.get_slice(cluster_local_block_id.x);
+
+      // Partition the inputs based on the current block coordinates.
+      auto [m_coord, n_coord, k_coord, l_coord] = blk_coord;
+      Tensor gA = gA_mkl(_,_,m_coord,_,l_coord);                                                     // (BLK_M,BLK_K,k)
+      Tensor gB = gB_nkl(_,_,n_coord,_,l_coord);                                                     // (BLK_N,BLK_K,k)
+
+      // Applies the mapping from block_tma_a
+      Tensor tAgA = block_tma_a.partition_S(gA);                                                 // (TMA,TMA_M,TMA_K,k)
+      Tensor tAsA = block_tma_a.partition_D(sA);                                              // (TMA,TMA_M,TMA_K,PIPE)
+      // Applies the mapping from block_tma_b
+      Tensor tBgB = block_tma_b.partition_S(gB);                                                 // (TMA,TMA_N,TMA_K,k)
+      Tensor tBsB = block_tma_b.partition_D(sB);                                              // (TMA,TMA_N,TMA_K,PIPE)
+      // Multicast masks stay 0 unless the corresponding copy atom is SM90_TMA_LOAD_MULTICAST
+      uint16_t mcast_mask_a = 0;
+      uint16_t mcast_mask_b = 0;
+
+      // Build the multicast masks
+      // Maps the tile -> block, value
+      if constexpr (cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>) {
+        auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};                       // (m,n) -> block_id
+        for (int n = 0; n < size<1>(block_layout); ++n) {                                          // one bit per block sharing this block's x index
+          mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{}));
+        }
+      }
+
+      if constexpr (cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>) {
+        auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};                       // (m,n) -> block_id
+        for (int m = 0; m < size<0>(block_layout); ++m) {                                          // one bit per block sharing this block's y index
+          mcast_mask_b |= (uint16_t(1) << block_layout(m,cluster_local_block_id.y,Int<0>{}));
+        }
+      }
+
+      // Mainloop: one pipeline stage is filled per k-tile
+      CUTLASS_PRAGMA_NO_UNROLL
+      for ( ; k_tile_count > 0; --k_tile_count) {
+        // LOCK smem_pipe_write for _writing_
+        pipeline.producer_acquire(smem_pipe_write);
+
+        //
+        // Copy gmem to smem for *k_tile_iter
+        //
+        // Both copies below are issued against the barrier of the stage being written
+        using BarrierType = typename MainloopPipeline::ProducerBarrierType;
+        BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write);
+
+        int write_stage = smem_pipe_write.index();
+        copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage));
+        copy(mainloop_params.tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage));
+        ++k_tile_iter;
+
+        // Advance smem_pipe_write
+        ++smem_pipe_write;
+      }
+    }
+  }
+
+  /// Producer epilogue: keeps a block from exiting early while it is part of a Cluster.
+  CUTLASS_DEVICE void
+  load_tail(MainloopPipeline pipeline, PipelineState smem_pipe_write) {
+    // Only the lane picked by elect_one_sync() issues the epilogue waits
+    int const elected = cute::elect_one_sync();
+    if (!elected) {
+      return;
+    }
+
+    /* This helps avoid early exit of blocks in Cluster.
+     * producer_tail waits for every stage: a used stage must first be released
+     * (all Consumer UNLOCKs); a stage that was never used is simply acquired,
+     * since its phase is still inverted from make_producer_start_state.
+     */
+    pipeline.producer_tail(smem_pipe_write);
+  }
+
+  /// Consumer side of the mainloop: for each of k_tile_count k-tiles, copy A from smem to registers, run the B
+  /// transpose helper on smem, and issue GMMAs into accum (zeroed by the first GMMA). mainloop_params is not read here.
+  template <
+    class FrgTensorC
+  >
+  CUTLASS_DEVICE void
+  mma(MainloopPipeline pipeline,
+      PipelineState smem_pipe_read,
+      FrgTensorC& accum,
+      int k_tile_count,
+      int thread_idx,
+      TensorStorage& shared_tensors,
+      Params const& mainloop_params) {
+    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
+    static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3.");
+    static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3.");
+    static_assert(cute::rank(InternalSmemLayoutAtomA{}) == 2, "InternalSmemLayoutAtomA must be rank 2.");
+    static_assert(cute::rank(InternalSmemLayoutAtomB{}) == 2, "InternalSmemLayoutAtomB must be rank 2.");
+    static_assert(!cute::is_void_v<InternalSmemCopyAtomA>,                                        // A needs an smem->rmem copy atom
+      "SM90 GMMA mainloops must specify a non-void copy atom for smem sourced instructions.");
+    static_assert(cute::is_void_v<InternalSmemCopyAtomB>,                                         // B must not have one
+      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
+
+    // Obtain warp index
+    int warp_idx = canonical_warp_idx_sync();
+    [[maybe_unused]] int warp_group_thread_idx = thread_idx % 128;
+    
+    Tensor sA_ = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});         // (BLK_M,BLK_K,PIPE)
+    Tensor sA = as_position_independent_swizzle_tensor(sA_);                                      // (BLK_M,BLK_K,PIPE)
+    
+    Tensor sB_ = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});         // (BLK_N,BLK_K,PIPE)
+    Tensor sB  = as_position_independent_swizzle_tensor(sB_);                                     // (BLK_N,BLK_K,PIPE)
+
+    // If TransposeB, GMMA will read from transposed B layout SMEM (same smem_B buffer, viewed through GmmaSmemLayoutB)
+    Tensor gmma_sB_position_dependent = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), 
+                                          GmmaSmemLayoutB{});                                     // (BLK_N,BLK_K,PIPE)
+    Tensor gmma_sB = as_position_independent_swizzle_tensor(gmma_sB_position_dependent);          // (BLK_N,BLK_K,PIPE)
+
+    //
+    // Define C accumulators and A/B partitioning
+    //
+
+    // Layout of warp group to thread mapping
+
+    static_assert(stride<0>(typename TiledMma::BLayout{}) == 0 and
+                  size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup, 
+                  "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
+
+    constexpr int MmaWarpGroups = size(TiledMma{}) / NumThreadsPerWarpGroup;
+    Layout warp_group_thread_layout = make_layout(Int<MmaWarpGroups>{}, 
+                                                  Int<NumThreadsPerWarpGroup>{});
+    // Every lane takes the warp-group index computed by lane 0 of its warp
+    int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / NumThreadsPerWarpGroup, 0);
+
+    TiledMma tiled_mma;
+    auto mma_thread_slice = tiled_mma.get_thread_slice(thread_idx);                             // per-thread slice, used for A
+    auto mma_warpgroup_slice = tiled_mma.get_slice(warp_group_thread_layout(warp_group_idx));   // per-warp-group slice, used for B
+
+    // Allocate fragments and descriptors
+    Tensor tCsA = mma_thread_slice.partition_A(sA);                                           // (MMA,MMA_M,MMA_K,PIPE)
+    Tensor tCrA = mma_thread_slice.partition_fragment_A(sA(_,_,Int<0>{}));                    // (MMA,MMA_M,MMA_K)
+    Tensor tCsB = mma_warpgroup_slice.partition_B(gmma_sB_position_dependent);                // (MMA,MMA_N,MMA_K,PIPE)
+    Tensor tCrB = mma_warpgroup_slice.make_fragment_B(tCsB);                                  // (MMA,MMA_N,MMA_K,PIPE)
+
+    //
+    // Copy Atom A retiling
+    //
+
+    // Tiled copy that moves A from smem into the register fragment tCrA
+    auto smem_tiled_copy_A = make_tiled_copy_A(InternalSmemCopyAtomA{}, tiled_mma);
+
+    auto smem_thr_copy_A   = smem_tiled_copy_A.get_thread_slice(thread_idx);
+
+    Tensor tCrA_copy_view  = smem_thr_copy_A.retile_D(tCrA);                                       // (CPY,CPY_M,CPY_K)
+    Tensor tCsA_copy_view  = smem_thr_copy_A.partition_S(sA);                                      // (CPY,CPY_M,CPY_K,PIPE)
+
+    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(tCrA_copy_view));                                            // CPY_M
+    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCrA_copy_view));                                            // CPY_K
+    CUTE_STATIC_ASSERT_V(size<1>(tCsA_copy_view) == size<1>(tCrA_copy_view));                                  // CPY_M
+    CUTE_STATIC_ASSERT_V(size<2>(tCsA_copy_view) == size<2>(tCrA_copy_view));                                  // CPY_K
+    CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(accum));                                                     // MMA_M
+    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum));                                                         // N
+    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB));                                                          // K
+    CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB));                                                       // PIPE
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));                                         // PIPE
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));                                         // PIPE
+    CUTE_STATIC_ASSERT_V(size<2>(tCrA) > _2{}, "RS loops require more than 2 MMA k-iterations for correctness.");
+
+    //
+    // PIPELINED MAIN LOOP
+    //
+
+    // We release buffers to producer warps(dma load) with some mmas in flight
+    PipelineState smem_pipe_release = smem_pipe_read;
+    // The very first GMMA overwrites accum; it is switched to ScaleOut::One right after that GMMA is issued
+    tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
+
+    TransposeOperandB transpose = cutlass::transform::collective::detail::make_transpose_operand_b(
+                                    warp_idx, warp_group_thread_idx, tiled_mma, SmemLayoutB{}, 
+                                    InternalSmemLayoutAtomB{}, InternalElementB{}, 
+                                    cute::bool_constant<TransposeB>{});
+
+    warpgroup_fence_operand(accum);
+    
+    ConsumerToken barrier_token = {BarrierStatus::WaitAgain};
+    // first k tile
+    {
+      barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+      pipeline.consumer_wait(smem_pipe_read, barrier_token);
+
+      int read_stage = smem_pipe_read.index();
+      // Advance to the next stage and start polling for it while this tile is being computed
+      ++smem_pipe_read;
+      barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+
+      // copy smem->rmem for A operand
+      copy(smem_tiled_copy_A, tCsA_copy_view(_,_,0,read_stage), tCrA_copy_view(_,_,0));
+      // transpose B operand in SMEM
+      transpose(sB, gmma_sB, read_stage, 0);
+      
+      // Unroll the K mode manually to set scale D to 1
+      CUTLASS_PRAGMA_UNROLL
+      for (int k_block = 0; k_block < size<2>(tCrA) - 1; ++k_block) {
+        copy(smem_tiled_copy_A, tCsA_copy_view(_,_,k_block + 1,read_stage), tCrA_copy_view(_,_,k_block + 1));   // fetch A for the next k_block ahead of this GMMA
+        transpose.synchronize(k_block);
+        transpose(sB, gmma_sB, read_stage, k_block + 1);
+        warpgroup_arrive();
+        // (V,M) x (V,N) => (V,M,N)
+        cute::gemm(tiled_mma, tCrA(_,_,k_block), tCrB(_,_,k_block,read_stage), accum);
+        if(k_block == 0) {
+          tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+        }
+        warpgroup_commit_batch();
+      }
+
+      warpgroup_wait<2>();
+      // Last k_block of the first tile: its A fragment was already copied by the loop above
+      warpgroup_arrive();
+      // (V,M) x (V,N) => (V,M,N)
+      cute::gemm(tiled_mma, tCrA(_,_,size<2>(tCrA) - 1), tCrB(_,_,size<2>(tCrA) - 1,read_stage), accum);
+      warpgroup_commit_batch();
+      --k_tile_count;
+      if(k_tile_count == 0) {                                       // the problem had a single k-tile: nothing left to prefetch
+        return;
+      }
+      pipeline.consumer_wait(smem_pipe_read, barrier_token);        // wait for the second k-tile, then stage its k_block 0
+      copy(smem_tiled_copy_A, tCsA_copy_view(_,_,0,smem_pipe_read.index()), tCrA_copy_view(_,_,0));
+      transpose(sB, gmma_sB, smem_pipe_read.index(), 0);
+      warpgroup_wait<2>();
+    }
+
+    warpgroup_fence_operand(accum);
+    // Mainloop GMMAs: every remaining k-tile except the last, which the block after this loop handles
+    CUTLASS_PRAGMA_NO_UNROLL
+    for ( ; k_tile_count > 1; --k_tile_count) {
+
+      //
+      // Compute on k_tile
+      //
+
+      int read_stage = smem_pipe_read.index();
+      ++smem_pipe_read;
+
+      warpgroup_fence_operand(accum);
+      // Unroll the K mode manually to set scale D to 1
+      CUTLASS_PRAGMA_UNROLL
+      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
+        if (k_block == 0) {
+          barrier_token = pipeline.consumer_try_wait(smem_pipe_read);   // start polling for the next k-tile early
+        }
+        if (k_block == size<2>(tCrA) - 1) {
+          pipeline.consumer_wait(smem_pipe_read, barrier_token);        // last k_block: stage k_block 0 of the next k-tile instead
+          copy(smem_tiled_copy_A, tCsA_copy_view(_,_,0,smem_pipe_read.index()), tCrA_copy_view(_,_,0));
+          // transpose B operand in SMEM
+          transpose(sB, gmma_sB, smem_pipe_read.index(), 0);
+        } 
+        else {
+          copy(smem_tiled_copy_A, tCsA_copy_view(_,_,k_block + 1,read_stage), tCrA_copy_view(_,_,k_block + 1));
+          // transpose B operand in SMEM
+          transpose.synchronize(k_block);                                      // make transpose of k_block available
+          transpose(sB, gmma_sB, read_stage, k_block + 1);
+        }
+        
+        warpgroup_arrive();
+        // (V,M) x (V,N) => (V,M,N)
+        cute::gemm(tiled_mma, tCrA(_,_,k_block), tCrB(_,_,k_block,read_stage), accum);
+        warpgroup_commit_batch();
+        warpgroup_wait<2>();
+        if (k_block == 1) {
+          // release prior barrier
+          pipeline.consumer_release(smem_pipe_release);             // UNLOCK smem_pipe_release, done _computing_ on it
+          ++smem_pipe_release;
+        }
+      }
+      warpgroup_fence_operand(accum);
+
+    }
+
+    warpgroup_fence_operand(accum);
+
+    {
+      //
+      // Compute on the last k_tile (no further k-tile is waited on or staged here)
+      //
+
+      int read_stage = smem_pipe_read.index();
+
+      warpgroup_fence_operand(accum);
+      
+      // Unroll the K mode manually to set scale D to 1
+      CUTLASS_PRAGMA_UNROLL
+      for (int k_block = 0; k_block < size<2>(tCrA) - 1; ++k_block) {
+        copy(smem_tiled_copy_A, tCsA_copy_view(_,_,k_block + 1,read_stage), tCrA_copy_view(_,_,k_block + 1));
+        transpose.synchronize(k_block);                                           // make k_block transpose available
+        transpose(sB, gmma_sB, read_stage, k_block + 1);
+        warpgroup_arrive();
+        // (V,M) x (V,N) => (V,M,N)
+        cute::gemm(tiled_mma, tCrA(_,_,k_block), tCrB(_,_,k_block,read_stage), accum);
+        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+        warpgroup_commit_batch();
+        warpgroup_wait<2>();
+        if (k_block == 1) {
+          // release prior barrier
+          pipeline.consumer_release(smem_pipe_release);             // UNLOCK smem_pipe_release, done _computing_ on it
+          ++smem_pipe_release;
+        }
+      }
+      
+      warpgroup_arrive();
+      // (V,M) x (V,N) => (V,M,N)
+      cute::gemm(tiled_mma, tCrA(_,_,size<2>(tCrA) - 1), tCrB(_,_,size<2>(tCrA) - 1,read_stage), accum);
+      warpgroup_commit_batch();
+    }
+
+    warpgroup_fence_operand(accum);
+  }
+  
+  /// Consumer epilogue: once every GMMA has completed, release the buffer(s) that are still held.
+  CUTLASS_DEVICE void
+  mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) {
+    // Prologue GMMAs: one k-tile's buffer is still unreleased at this point
+    int const held_tile_count = 1;
+    k_tile_count -= held_tile_count;
+    smem_pipe_release.advance(k_tile_count);                        // move past the other k-tiles
+
+    // Wait on all GMMAs to complete
+    warpgroup_wait<0>();
+
+    // Hand back each buffer that is still held
+    for (int remaining = held_tile_count; remaining > 0; --remaining) {
+      pipeline.consumer_release(smem_pipe_release);                 // UNLOCK smem_pipe_release, done _computing_ on it
+      ++smem_pipe_release;
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::gemm::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized_mixed_input.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized_mixed_input.hpp
new file mode 100755
index 000000000..a3efc67e8
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized_mixed_input.hpp
@@ -0,0 +1,1560 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/detail/dependent_false.hpp"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/numeric_types.h"
+#include "cutlass/detail/layout.hpp"
+#include "cutlass/pipeline/pipeline.hpp"
+#include "cutlass/transform/collective/sm90_wgmma_transpose.hpp"
+#include "cutlass/pipeline/pipeline.hpp"
+#include "cutlass/trace.h"
+#include "cutlass/detail/collective.hpp"
+
+#include "cute/arch/cluster_sm90.hpp"
+#include "cute/arch/copy_sm90.hpp"
+#include "cute/algorithm/functional.hpp"
+#include "cute/atom/mma_atom.hpp"
+#include "cute/atom/copy_traits_sm90_tma.hpp"
+#include "cute/algorithm/gemm.hpp"
+#include "cute/tensor_predicate.hpp"
+#include "cute/numeric/arithmetic_tuple.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective {
+using namespace cute;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// WarpSpecialized mainloop that sources the A operand from registers
+template <
+  int Stages,
+  class ClusterShape,
+  class KernelSchedule,
+  class TileShape_,
+  class ElementAOptionalTuple,
+  class StrideA_,
+  class ElementBOptionalTuple,
+  class StrideB_,
+  class TiledMma_,
+  class GmemTiledCopyA_,
+  class SmemLayoutAtomA_,
+  class SmemCopyAtomA_,
+  class TransformA_,
+  class GmemTiledCopyB_,
+  class SmemLayoutAtomB_,
+  class SmemCopyAtomB_,
+  class TransformB_>
+struct CollectiveMma<
+    MainloopSm90TmaGmmaRmemAWarpSpecializedMixedInput<Stages, ClusterShape, KernelSchedule>,
+    TileShape_,
+    ElementAOptionalTuple,
+    StrideA_,
+    ElementBOptionalTuple,
+    StrideB_,
+    TiledMma_,
+    GmemTiledCopyA_,
+    SmemLayoutAtomA_,
+    SmemCopyAtomA_,
+    TransformA_,
+    GmemTiledCopyB_,
+    SmemLayoutAtomB_,
+    SmemCopyAtomB_,
+    TransformB_>
+{
+private:
+  // Returns `ptr` unchanged for element types of 8 bits or more, otherwise wraps it in a subbyte_iterator.
+  template <class PointerType>
+  static constexpr auto get_logical_ptr(PointerType const* ptr) {
+    if constexpr (cute::sizeof_bits_v<PointerType> >= 8) {
+      return ptr;                                        // whole-byte elements: use the raw pointer
+    }
+    else {
+      return subbyte_iterator<PointerType const>(ptr);   // elements narrower than 8 bits
+    }
+  }
+
+  enum class ConversionMode {   // how the mixed-input operand is transformed; chosen by get_conversion_mode()
+    DirectConvert,              // ElementScale is void
+    ConvertAndScale,            // ElementScale is given, ElementZero is void
+    ConvertAndScaleWithZero     // both ElementScale and ElementZero are given
+  };
+
+  using ScaleA = detail::deduce_mixed_width_dtype_t<1, ElementAOptionalTuple>;
+  using ScaleB = detail::deduce_mixed_width_dtype_t<1, ElementBOptionalTuple>;
+  using ZeroA = detail::deduce_mixed_width_dtype_t<2, ElementAOptionalTuple>;
+  using ZeroB = detail::deduce_mixed_width_dtype_t<2, ElementBOptionalTuple>;
+
+public:
+  //
+  // Type Aliases
+  //
+  using DispatchPolicy = MainloopSm90TmaGmmaRmemAWarpSpecializedMixedInput<Stages, ClusterShape, KernelSchedule>;
+  using TileShape = TileShape_;
+
+  static_assert(cute::is_tuple<ElementAOptionalTuple>::value ^ cute::is_tuple<ElementBOptionalTuple>::value,  // exactly one operand is the tuple
+    "Either A OR B must be a tuple. It must take the form {ElementOperand, [ElementScale], "
+    "[ElementZero]}. Inputs in [] are optional.");
+
+  using ElementA = detail::deduce_mixed_width_dtype_t<0, ElementAOptionalTuple>;
+  using ElementB = detail::deduce_mixed_width_dtype_t<0, ElementBOptionalTuple>;
+  static constexpr bool IsATransformed = cute::is_tuple<ElementAOptionalTuple>::value;
+  using ElementScale = cute::conditional_t<IsATransformed, ScaleA, ScaleB>;
+  using ElementZero = cute::conditional_t<IsATransformed, ZeroA, ZeroB>;
+  // For cases where we can't have a void type, we can use this to allow the code to compile when the scale / zero is void.
+  using NonVoidElementScale = cute::conditional_t<cute::is_void_v<ElementScale>, float, ElementScale>;
+  using NonVoidElementZero = cute::conditional_t<cute::is_void_v<ElementZero>, float, ElementZero>;
+
+  using StrideA = StrideA_;
+  using StrideB = StrideB_;
+  // These are always MN major
+  using StrideScale = cute::Stride<cute::Int<1>, int64_t, int64_t>;
+  // For cases where we can't have a void scale, we can use this to allow the code to compile when the scale is void.
+  using NonVoidStrideScale = cute::conditional_t<
+      cute::is_void_v<StrideScale>, cute::Stride<_1, int64_t, int64_t>, StrideScale>;
+
+  static_assert((IsATransformed && cutlass::gemm::detail::is_k_major<StrideA>()) || 
+                (!IsATransformed && cutlass::gemm::detail::is_k_major<StrideB>()),
+                "The transformed type must be K-major.");
+
+  static_assert(( IsATransformed && (sizeof(ElementB) == 2)) ||
+                (!IsATransformed && (sizeof(ElementA) == 2)) ||
+                (cutlass::gemm::detail::is_k_major<StrideA>() && 
+                 cutlass::gemm::detail::is_k_major<StrideB>()), 
+                "The unscaled element must be 2 bytes OR both inputs must be K-major");
+
+  static_assert(cutlass::gemm::detail::is_mn_major<NonVoidStrideScale>(), 
+    "Scale must be MN major [Col Major if A is scaled, Row Major if B is scaled].");
+
+  using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{}));
+
+  using TiledMma = TiledMma_;
+  using ElementAccumulator = typename TiledMma::ValTypeC;
+
+  using GmemTiledCopyA = GmemTiledCopyA_;
+  using GmemTiledCopyB = GmemTiledCopyB_;
+  using GmemTiledCopyScale = cute::SM90_TMA_LOAD;
+
+  using SmemLayoutAtomA = SmemLayoutAtomA_;
+  using SmemLayoutAtomB = SmemLayoutAtomB_;
+  // Scale layout atom set after swapping.
+
+  using SmemCopyAtomA = SmemCopyAtomA_;
+  using SmemCopyAtomB = SmemCopyAtomB_;
+  using SmemCopyAtomScale = Copy_Atom<cute::AutoVectorizingCopy, NonVoidElementScale>;
+
+  // We must ensure the type to be scaled goes to RF
+  static constexpr bool SwapAB = !IsATransformed;
+  using InternalSmemLayoutAtomA = cute::conditional_t<!SwapAB, SmemLayoutAtomA, SmemLayoutAtomB>;
+  using InternalSmemLayoutAtomB = cute::conditional_t<!SwapAB, SmemLayoutAtomB, SmemLayoutAtomA>;
+  using InternalSmemCopyAtomA   = cute::conditional_t<!SwapAB, SmemCopyAtomA, SmemCopyAtomB>;
+  using InternalSmemCopyAtomB   = cute::conditional_t<!SwapAB, SmemCopyAtomB, SmemCopyAtomA>;
+  
+  // TMA converts f32 input to tf32 when copying from GMEM to SMEM
+  // For all other types, cast to size equivalent uint type to avoid any rounding by TMA.
+  static constexpr bool ConvertF32toTF32A = cute::is_same_v<float, ElementA>;
+  static constexpr bool ConvertF32toTF32B = cute::is_same_v<float, ElementB>;
+  using ConvertedElementA = cute::conditional_t<ConvertF32toTF32A, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementA>>>;
+  using ConvertedElementB = cute::conditional_t<ConvertF32toTF32B, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementB>>>;
+  using RealInternalElementA = cute::conditional_t<!SwapAB, ElementA, ElementB>;
+  using RealInternalElementB = cute::conditional_t<!SwapAB, ElementB, ElementA>;
+  using InternalElementA = cute::conditional_t<!SwapAB, ConvertedElementA, ConvertedElementB>;
+  using InternalElementB = cute::conditional_t<!SwapAB, ConvertedElementB, ConvertedElementA>;
+  using InternalStrideA  = cute::conditional_t<!SwapAB, StrideA, StrideB>;
+  using InternalStrideB  = cute::conditional_t<!SwapAB, StrideB, StrideA>;
+
+  using TransformA = TransformA_;
+  using TransformB = TransformB_;
+  using InternalTransformA  = cute::conditional_t<!SwapAB, TransformA, TransformB>;
+  using InternalTransformB  = cute::conditional_t<!SwapAB, TransformB, TransformA>;
+
+  static constexpr int IsSubbyteA = cute::sizeof_bits_v<InternalElementA> < 8;
+  using TmaElementA = cute::conditional_t<IsSubbyteA, uint8_t, InternalElementA>;
+  using TmaElementScale = uint_bit_t<sizeof_bits_v<NonVoidElementScale> >; // in case we have array. translating to uint to satisfy tma descriptor's specialization
+
+  using ArchTag = typename DispatchPolicy::ArchTag;
+
+  using MainloopPipeline = cutlass::PipelineTmaAsync<
+                             DispatchPolicy::Stages>;
+  using PipelineState = cutlass::PipelineState<DispatchPolicy::Stages>;
+
+  using PipelineParams = typename MainloopPipeline::Params;
+
+  using SmemLayoutAtomScale = Layout<Shape<decltype(cute::shape<0>(InternalSmemLayoutAtomA{})), cute::Int<1>>>;
+  using ScaleTileShape = decltype(make_shape(shape<0>(TileShape{}), shape<1>(SmemLayoutAtomScale{})));
+
+  static_assert(cute::rank(InternalSmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<0>(TileShape{}) % size<0>(InternalSmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(InternalSmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+  static_assert(cute::rank(InternalSmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<1>(TileShape{}) % size<0>(InternalSmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(InternalSmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+  static_assert(rank(SmemLayoutAtomScale{}) == 2, "SmemLayoutAtomScale must be rank 2");
+  static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomScale{})) == 0, "SmemLayoutAtomScale must equal the tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomScale{})) == 0, "SmemLayoutAtomScale must evenly divide tile k shape.");
+
+  // Tile along modes in a way that maximizes the TMA box size.
+  // Returns the shared-memory layout for one operand: the tile modes plus a trailing DispatchPolicy::Stages mode.
+  template<class LayoutAtom, class TileShape, class Stride>
+  static constexpr
+  CUTLASS_HOST_DEVICE
+  auto get_smem_layout(LayoutAtom layout_atom, TileShape const& tile_shape, Stride const& stride) {
+    if constexpr (not cute::is_layout<Stride>::value) {  // `stride` is not a cute layout: tile the atom over (tile, stages)
+      return tile_to_shape(
+        layout_atom,
+        append(tile_shape, Int<DispatchPolicy::Stages>{}),
+        cute::conditional_t< ::cutlass::gemm::detail::is_major<0,Stride>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{});  // tiling order depends on whether mode 0 is major
+    }
+    else {  // `stride` is itself a cute layout: derive the smem layout from it
+      auto gmem_tile = composition(stride, tile_shape);
+      return make_layout_like(append(gmem_tile, make_layout(Int<DispatchPolicy::Stages>{}, 0)));
+    }
+  }
+
+  using SmemLayoutA = decltype(get_smem_layout(InternalSmemLayoutAtomA{}, select<0,2>(TileShape{}), InternalStrideA{}));
+  using SmemLayoutB = decltype(get_smem_layout(InternalSmemLayoutAtomB{}, select<1,2>(TileShape{}), InternalStrideB{}));
+    
+  // It is assumed that the scales and zero-points share the same smem layout
+  using SmemLayoutScale = decltype(tile_to_shape(
+    SmemLayoutAtomScale{}, 
+    make_shape(shape<0>(ScaleTileShape{}), shape<1>(ScaleTileShape{}), Int<Stages>{}),
+    cute::conditional_t< ::cutlass::gemm::detail::is_major<0,NonVoidStrideScale>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
+
+  static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 2 or more.");
+  static_assert(not cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeA>::value &&
+                    cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeB>::value,
+                "MMA atom must source A from rmem and B operand from smem_desc for this mainloop.");
+  static_assert(cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>,
+      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
+  static_assert(cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>,
+      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
+
+  // To relax them, we need to handle loading more than 1 row of scales for every main loop iteration.
+  // We must also handle updating the pipeline transaction bytes on the fly.
+  // NOTE: Deleting this assertion without required changes will cause the code to hang.
+  static_assert(size<1>(SmemLayoutAtomScale{}) == 1, "size<1>(SmemLayoutAtomScale) must be 1.");
+
+private:
+  // Selects the conversion mode from which optional tuple entries (scale, zero) are non-void.
+  static constexpr ConversionMode get_conversion_mode() {
+    constexpr bool has_scale = not cute::is_void_v<ElementScale>;
+    constexpr bool has_zero  = not cute::is_void_v<ElementZero>;
+    if constexpr (has_scale && has_zero) {
+      return ConversionMode::ConvertAndScaleWithZero;
+    } else if constexpr (has_scale) {
+      return ConversionMode::ConvertAndScale;
+    } else {
+      return ConversionMode::DirectConvert;   // no scale type; a zero type alone does not change the mode
+    }
+  }
+
+  static constexpr ConversionMode KernelConversionMode = get_conversion_mode();
+  static constexpr bool ModeHasScales = KernelConversionMode == ConversionMode::ConvertAndScale ||
+                                        KernelConversionMode == ConversionMode::ConvertAndScaleWithZero;
+  static constexpr bool UseScaleLookupTable = KernelConversionMode == ConversionMode::ConvertAndScale &&
+                                              cutlass::detail::is_Array_v<ElementScale>;
+
+  // Number of scale elements held in shared memory (cosize of the staged scale layout); 0 when no scales are used.
+  static constexpr auto elements_per_smem_scale() {
+    if constexpr (ModeHasScales) {
+      return cute::cosize_v<SmemLayoutScale>;
+    }
+    else if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      return 0;
+    }
+    else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Type not handled in scale smem allocation.");
+    }
+  }
+
+  // Number of zero-point elements held in shared memory; zeros reuse the scale layout, 0 when no zeros are used.
+  static constexpr auto elements_per_smem_zero() {
+    constexpr bool has_zero_points = KernelConversionMode == ConversionMode::ConvertAndScaleWithZero;
+    constexpr bool no_zero_points  = KernelConversionMode == ConversionMode::DirectConvert ||
+                                     KernelConversionMode == ConversionMode::ConvertAndScale;
+    if constexpr (has_zero_points) {
+      return cute::cosize_v<SmemLayoutScale>;
+    } else if constexpr (no_zero_points) {
+      return 0;
+    } else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Type not handled in scale smem allocation.");
+    }
+  }
+
+  // These methods use some of the public members of the class. For that reason, we define them after the public section.
+  static constexpr uint32_t compute_tma_transaction_bytes_mk() {  // bytes of one stage of the (internal) A tile
+    constexpr auto stage_bits = size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast<uint32_t>(cute::sizeof_bits_v<InternalElementA>);
+    return cutlass::bits_to_bytes(stage_bits);
+  }
+
+  static constexpr uint32_t compute_tma_transaction_bytes_nk() {  // bytes of one stage of the (internal) B tile
+    constexpr auto stage_bits = size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast<uint32_t>(cute::sizeof_bits_v<InternalElementB>);
+    return cutlass::bits_to_bytes(stage_bits);
+  }
+
+  static constexpr uint32_t
+  compute_tma_transaction_bytes_extra() {  // bytes of one stage of the scale (and zero) tiles; 0 without scales
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      return 0;
+    }
+    else if constexpr (ModeHasScales) {
+      constexpr uint32_t scale_tx_bytes = cutlass::bits_to_bytes(size<0>(SmemLayoutScale{}) * size<1>(SmemLayoutScale{}) * static_cast<uint32_t>(cute::sizeof_bits_v<ElementScale>));
+      static_assert(scale_tx_bytes % 128 == 0, "Each scale stage must be 128B aligned."); // required by TMA
+      if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+        return scale_tx_bytes;  // only the scale tile is loaded in addition to A and B
+      }
+      else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+        // Scale and zero share smem layout
+        constexpr uint32_t zero_tx_bytes = cutlass::bits_to_bytes(size<0>(SmemLayoutScale{}) * size<1>(SmemLayoutScale{}) * static_cast<uint32_t>(cute::sizeof_bits_v<ElementZero>));
+        static_assert(zero_tx_bytes % 128 == 0, "Each zero stage must be 128B aligned."); // required by TMA
+        return scale_tx_bytes + zero_tx_bytes;
+      }
+      else {
+        static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Type not handled in tma transaction bytes computation.");
+      }
+    }
+    else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Type not handled in tma transaction bytes computation.");
+    }
+  }
+
+public:
+  static constexpr size_t SmemAlignmentA = cutlass::detail::alignment_for_swizzle(SmemLayoutA{}); 
+
+  static constexpr size_t SmemAlignmentB = cutlass::detail::alignment_for_swizzle(SmemLayoutB{});
+
+  // Just pick the max alignment of A and B since it is required to be at least 128B
+  static constexpr size_t SmemAlignmentScale = cute::max(SmemAlignmentA, SmemAlignmentB);
+
+  static_assert(SmemAlignmentA >= 128 and SmemAlignmentB >= 128, "Require at least 128B alignment");
+
+  struct SharedStorage  // shared-memory footprint of this mainloop: operand/scale/zero buffers plus pipeline state
+  {
+    static constexpr int scale_elements = elements_per_smem_scale();  // 0 in DirectConvert mode
+    static constexpr int zero_elements = elements_per_smem_zero();    // non-zero only in ConvertAndScaleWithZero mode
+    struct TensorStorage : cute::aligned_struct<cute::max(SmemAlignmentA, SmemAlignmentB), _0> {
+      cute::ArrayEngine<RealInternalElementA, cute::cosize_v<SmemLayoutA>> smem_A;  // internal A (post-swap), all stages
+      cute::ArrayEngine<typename TiledMma::ValTypeB, cute::cosize_v<SmemLayoutB>> smem_B;  // internal B, all stages
+      cute::ArrayEngine<NonVoidElementScale, scale_elements> smem_scale;
+      cute::ArrayEngine<NonVoidElementZero, zero_elements> smem_zero;
+    } tensors;
+
+    using PipelineStorage = typename MainloopPipeline::SharedStorage;
+    PipelineStorage pipeline;
+  };
+  using TensorStorage = typename SharedStorage::TensorStorage;
+  using PipelineStorage = typename SharedStorage::PipelineStorage;
+
+  // Host side kernel arguments (A/B in the caller's order; any A/B swap happens in to_underlying_arguments)
+  struct Arguments {
+    ElementA const* ptr_A = nullptr;
+    StrideA dA{};
+    ElementB const* ptr_B = nullptr;
+    StrideB dB{};
+    ElementScale const* ptr_S = nullptr;   // scales: can_implement requires null in DirectConvert mode, non-null otherwise
+    NonVoidStrideScale dS{};               // stride used for both the scale and the zero tensors
+    int group_size = 0;                    // K elements per scale group; must be non-zero when scales are used
+    ElementZero const* ptr_Z = nullptr;    // zero-points: can_implement requires non-null only in ConvertAndScaleWithZero mode
+    uint32_t mma_promotion_interval = 4;   // NOTE(review): not read by the code visible in this section -- confirm its use
+  };
+
+  // Global-memory layout for an operand: `stride` itself if it is already a cute layout, else (shape, stride).
+  template<class Shape, class Stride>
+  static constexpr CUTLASS_HOST_DEVICE
+  auto get_gmem_layout(Shape const& shape, Stride const& stride) {
+    if constexpr (cute::is_layout<Stride>::value) {
+      return stride;
+    }
+    else {
+      return make_layout(shape, stride);
+    }
+  }
+
+  // Device side kernel params: TMA descriptors and strides in internal (post A/B swap) order, plus scale grouping info
+  struct Params {
+  private:
+    using Outer = CollectiveMma<DispatchPolicy, TileShape_, 
+                                ElementAOptionalTuple, StrideA_, 
+                                ElementBOptionalTuple, StrideB_,
+                                TiledMma_, 
+                                GmemTiledCopyA_, SmemLayoutAtomA_, SmemCopyAtomA_,
+                                TransformA_,
+                                GmemTiledCopyB_, SmemLayoutAtomB_, SmemCopyAtomB_,
+                                TransformB_>;
+
+  public:
+
+    // Assumption: StrideA is congruent with Problem_MK
+    using LayoutA = decltype(get_gmem_layout(repeat_like(InternalStrideA{}, int32_t(0)), InternalStrideA{}));
+    using LayoutB = decltype(get_gmem_layout(repeat_like(InternalStrideB{}, int32_t(0)), InternalStrideB{}));
+
+    using TMA_A = decltype(make_tma_copy_A_sm90<TmaElementA>(
+        GmemTiledCopyA{},
+        make_tensor(Outer::get_logical_ptr(static_cast<InternalElementA const*>(nullptr)), LayoutA{}),
+        SmemLayoutA{}(_,_,cute::Int<0>{}),
+        TileShape{},
+        ClusterShape{}));  // mcast along N mode for this M load, if any
+
+   using TMA_Scale = decltype(make_tma_copy<TmaElementScale>(
+        GmemTiledCopyScale{},
+        make_tensor(Outer::get_logical_ptr(static_cast<NonVoidElementScale const*>(nullptr)), repeat_like(NonVoidStrideScale{}, int32_t(0)), NonVoidStrideScale{}),
+        SmemLayoutScale{}(_,_,cute::Int<0>{}),
+        ScaleTileShape{},
+        _1{}));  // mcast along N mode for this M load, if any. Scale is ALWAYS loaded with A for RF kernel
+
+   using TMA_Zero = decltype(make_tma_copy(
+        GmemTiledCopyScale{},
+        make_tensor(Outer::get_logical_ptr(static_cast<NonVoidElementZero const*>(nullptr)), repeat_like(NonVoidStrideScale{}, int32_t(0)), NonVoidStrideScale{}),
+        SmemLayoutScale{}(_,_,cute::Int<0>{}),
+        ScaleTileShape{},
+        _1{}));  // mcast along N mode for this M load, if any. Scale is ALWAYS loaded with A for RF kernel
+
+    // Assumption: StrideB is congruent with Problem_NK
+    using TMA_B = decltype(make_tma_copy_B_sm90(
+        GmemTiledCopyB{},
+        make_tensor(Outer::get_logical_ptr(static_cast<InternalElementB const*>(nullptr)), LayoutB{}),
+        SmemLayoutB{}(_,_,cute::Int<0>{}),
+        TileShape{},
+        ClusterShape{})); // mcast along M mode for this N load, if any
+    TMA_A tma_load_a;
+    TMA_B tma_load_b;
+    TMA_Scale tma_load_scale;  // left default-constructed in DirectConvert mode
+    TMA_Zero tma_load_zero;    // only built in ConvertAndScaleWithZero mode
+    int64_t scale_k;           // ceil(K / group_size) as set by to_underlying_arguments; 0 in DirectConvert mode
+    int group_size;            // copy of Arguments::group_size; 0 in DirectConvert mode
+    uint32_t tma_transaction_bytes = TmaTransactionBytes;  // sum of the per-stage byte counts of all loaded tiles
+    int reload_factor = (group_size + size<2>(TileShape{}) - 1) / size<2>(TileShape{});  // ceil(group_size / tile K)
+    InternalStrideA dA;
+    InternalStrideB dB;
+  };
+
+  //
+  // Methods
+  //
+
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    (void) workspace;  // unused by this mainloop
+    // Converts host Arguments into device Params: applies the A/B swap, builds TMA descriptors, records scale info.
+    // Optionally append 1s until problem shape is rank-4 (MNKL), in case it is only rank-3 (MNK)
+    auto problem_shape_MNKL = append<4>(problem_shape, 1);
+    auto [M,N,K,L] = problem_shape_MNKL;
+
+    if constexpr (SwapAB) {  // internal A is the caller's B, so the M and N extents trade places
+      M = get<1>(problem_shape_MNKL);
+      N = get<0>(problem_shape_MNKL);
+    }
+
+    InternalElementA const* ptr_A;
+    InternalStrideA dA;
+    InternalElementB const* ptr_B;
+    InternalStrideB dB;
+
+    if constexpr (not SwapAB) {
+      ptr_A = reinterpret_cast<InternalElementA const*>(args.ptr_A);
+      ptr_B = reinterpret_cast<InternalElementB const*>(args.ptr_B);
+      dA = args.dA;
+      dB = args.dB;
+    }
+    else {
+      ptr_A = reinterpret_cast<InternalElementA const*>(args.ptr_B);
+      ptr_B = reinterpret_cast<InternalElementB const*>(args.ptr_A);
+      dA = args.dB;
+      dB = args.dA;
+    }
+
+    Tensor tensor_a = make_tensor(get_logical_ptr(ptr_A), get_gmem_layout(make_shape(M,K,L), dA));
+    Tensor tensor_b = make_tensor(get_logical_ptr(ptr_B), get_gmem_layout(make_shape(N,K,L), dB));
+    typename Params::TMA_A tma_load_a = make_tma_copy_A_sm90<TmaElementA>(
+        GmemTiledCopyA{},
+        tensor_a,
+        SmemLayoutA{}(_,_,cute::Int<0>{}),
+        TileShape{},
+        ClusterShape{}); // mcast along N mode for this M load, if any
+
+    typename Params::TMA_B tma_load_b = make_tma_copy_B_sm90(
+        GmemTiledCopyB{},
+        tensor_b,
+        SmemLayoutB{}(_,_,cute::Int<0>{}),
+        TileShape{},
+        ClusterShape{}); // mcast along M mode for this N load, if any
+
+    typename Params::TMA_Scale tma_load_scale{};  // stays default-constructed unless the mode has scales
+    typename Params::TMA_Zero tma_load_zero{};    // stays default-constructed unless the mode has zero-points
+
+    uint32_t tma_transaction_bytes = TmaTransactionBytesMK + TmaTransactionBytesNK;
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      return { tma_load_a, tma_load_b, tma_load_scale, tma_load_zero, 0, 0, tma_transaction_bytes, 1, dA, dB };
+    } 
+    else if constexpr (ModeHasScales) {
+      auto scale_k = (K + args.group_size - 1) / args.group_size;  // ceil(K / group_size); caller must ensure group_size != 0
+      ElementScale const* ptr_S = args.ptr_S;
+      StrideScale dS = args.dS;
+      Tensor tensor_scale = make_tensor(get_logical_ptr(ptr_S), make_layout(make_shape(M,scale_k,L), dS));
+      tma_load_scale = make_tma_copy<TmaElementScale>(
+          GmemTiledCopyScale{},
+          tensor_scale,
+          SmemLayoutScale{}(_,_,cute::Int<0>{}),
+          ScaleTileShape{},
+          _1{}); // mcast along N mode for this M load, if any
+
+      if constexpr(KernelConversionMode == ConversionMode::ConvertAndScale) {
+        return { tma_load_a, tma_load_b, tma_load_scale, tma_load_zero, scale_k, args.group_size, tma_transaction_bytes + TmaTransactionBytesExtra, (args.group_size + size<2>(TileShape{}) - 1) / size<2>(TileShape{}), dA, dB };
+      }
+      else if constexpr(KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+        Tensor tensor_zero = make_tensor(get_logical_ptr(args.ptr_Z), make_layout(make_shape(M,scale_k,L), dS));  // zeros use the scale shape and stride
+        tma_load_zero = make_tma_copy(
+            GmemTiledCopyScale{},
+            tensor_zero,
+            SmemLayoutScale{}(_,_,cute::Int<0>{}),
+            ScaleTileShape{},
+            _1{}); // mcast along N mode for this M load, if any
+        return { tma_load_a, tma_load_b, tma_load_scale, tma_load_zero, scale_k, args.group_size, tma_transaction_bytes + TmaTransactionBytesExtra, (args.group_size + size<2>(TileShape{}) - 1) / size<2>(TileShape{}), dA, dB };
+      } else {
+        static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in to_underlying_arguments.");
+      }
+    } 
+    else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in to_underlying_arguments.");
+    }
+  }
+
+  /// Host-side validation. Returns true iff A/B (and scale/zero, when used) satisfy the 128-bit TMA alignment
+  /// requirement and the scale/zero pointers and group_size in `args` match the conversion mode this mainloop
+  /// was compiled for. Every failed check is reported through CUTLASS_TRACE_HOST.
+  template<class ProblemShape>
+  static bool can_implement(ProblemShape const& problem_shape, [[maybe_unused]] Arguments const& args) {
+    constexpr int tma_alignment_bits = 128;
+    auto problem_shape_MNKL = append<4>(problem_shape, 1);
+    auto [M,N,K,L] = problem_shape_MNKL;
+
+    constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
+    bool check_aligned_A = cutlass::detail::check_alignment<min_tma_aligned_elements_A>(get_gmem_layout(cute::make_shape(M,K,L), args.dA));
+
+    constexpr int min_tma_aligned_elements_B = tma_alignment_bits / cutlass::sizeof_bits<ElementB>::value;
+    bool check_aligned_B = cutlass::detail::check_alignment<min_tma_aligned_elements_B>(get_gmem_layout(cute::make_shape(N,K,L), args.dB));
+
+    bool check_aligned_S = true;
+    bool check_aligned_Z = true;
+    bool check_mode_args = true;
+
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      check_mode_args = check_mode_args && (args.ptr_S == nullptr);
+      check_mode_args = check_mode_args && (args.ptr_Z == nullptr);
+    }
+    else if constexpr (ModeHasScales) {
+      const int scale_mn = SwapAB ? N : M;  // scales follow the transformed operand
+      // Arguments defaults group_size to 0: validate it BEFORE dividing by it (a single group is assumed otherwise).
+      const bool group_size_valid = args.group_size > 0;
+      const int scale_k = group_size_valid ? (K + args.group_size - 1) / args.group_size : 1;
+      constexpr int min_tma_aligned_elements_scale = tma_alignment_bits / cutlass::sizeof_bits<ElementScale>::value;
+      check_aligned_S = cutlass::detail::check_alignment<min_tma_aligned_elements_scale>(cute::make_shape(scale_mn,scale_k,L), args.dS);
+      check_mode_args = check_mode_args && group_size_valid && (args.group_size == K || ((args.group_size % size<2>(TileShape{})) == 0));
+      check_mode_args = check_mode_args && (args.ptr_S != nullptr);
+      if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+        check_mode_args = check_mode_args && (args.ptr_Z == nullptr);
+      }
+      else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+        constexpr int min_tma_aligned_elements_zero = tma_alignment_bits / cutlass::sizeof_bits<ElementZero>::value;
+        check_aligned_Z = cutlass::detail::check_alignment<min_tma_aligned_elements_zero>(cute::make_shape(scale_mn,scale_k,L), args.dS);
+        check_mode_args = check_mode_args && (args.ptr_Z != nullptr);
+      }
+      else {
+        static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in can_implement.");
+      }
+    }
+    else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in can_implement.");
+    }
+
+    if (!check_mode_args) {
+      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Invalid arguments for the selected conversion mode.\n");
+    }
+    if (!check_aligned_A) {
+      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Tensor A doesn't meet the minimum alignment requirements for TMA.\n");
+    }
+    if (!check_aligned_B) {
+      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Tensor B doesn't meet the minimum alignment requirements for TMA.\n");
+    }
+    if (!check_aligned_S) {
+      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Tensor S (scale) doesn't meet the minimum alignment requirements for TMA.\n");
+    }
+    if (!check_aligned_Z) {
+      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Tensor Z (zeros) doesn't meet the minimum alignment requirements for TMA.\n");
+    }
+
+    return check_mode_args && check_aligned_A && check_aligned_B && check_aligned_S && check_aligned_Z;
+  }
+
+  static constexpr int K_PIPE_MAX = DispatchPolicy::Stages;
+  static constexpr uint32_t TmaTransactionBytesMK = compute_tma_transaction_bytes_mk();
+  static constexpr uint32_t TmaTransactionBytesNK = compute_tma_transaction_bytes_nk();
+  static constexpr uint32_t TmaTransactionBytesExtra = compute_tma_transaction_bytes_extra();
+  static constexpr uint32_t TmaTransactionBytes = TmaTransactionBytesMK + TmaTransactionBytesNK + TmaTransactionBytesExtra;
+
+  /// Prefetches the TMA descriptors of every tensor this mainloop loads: A and B always, scale and zero
+  /// when the conversion mode uses them. Ideally issued from a single thread for best performance.
+  CUTLASS_DEVICE
+  static void prefetch_tma_descriptors(Params const& mainloop_params) {
+    cute::prefetch_tma_descriptor(mainloop_params.tma_load_a.get_tma_descriptor());
+    cute::prefetch_tma_descriptor(mainloop_params.tma_load_b.get_tma_descriptor());
+
+    if constexpr (ModeHasScales) {
+      // Both scaled modes load the scale tensor; only the with-zero mode also loads zero-points.
+      cute::prefetch_tma_descriptor(mainloop_params.tma_load_scale.get_tma_descriptor());
+      if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+        cute::prefetch_tma_descriptor(mainloop_params.tma_load_zero.get_tma_descriptor());
+      }
+    }
+    else if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      // Nothing extra to do
+    }
+    else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in TMA prefetch.");
+    }
+  }
+
+  /// Set up the data needed by this collective for load and mma.
+  /// Returns a tuple of tensors. Contract between the collective and the kernel layer:
+  /// the returned tuple must contain at least two elements, with the first two elements being:
+  /// gA_mkl - The tma tensor, A after a local tile so it has shape  (BLK_M,BLK_K,m,k,l)
+  /// gB_nkl - The tma tensor, B after a local tile so it has shape  (BLK_N,BLK_K,n,k,l)
+  /// The rest is mode-specific: scale modes append gS_mkl, and ConvertAndScaleWithZero also appends gZ_mkl.
+  template <class ProblemShape_MNKL>
+  CUTLASS_DEVICE auto
+  load_init(ProblemShape_MNKL const& problem_shape_MNKL, Params const& mainloop_params) const {
+    using X = Underscore;
+    // Separate out problem shape for convenience
+    auto [M,N,K,L] = problem_shape_MNKL;
+
+    // TMA requires special handling of strides to deal with coord codomain mapping
+    // Represent the full tensors -- get these from TMA
+    Tensor mA_mkl = mainloop_params.tma_load_a.get_tma_tensor(shape(get_gmem_layout(make_shape(M,K,L), mainloop_params.dA))); // (m,k,l)
+    Tensor mB_nkl = mainloop_params.tma_load_b.get_tma_tensor(shape(get_gmem_layout(make_shape(N,K,L), mainloop_params.dB))); // (n,k,l)
+
+    // Make tiled views, defer the slice
+    Tensor gA_mkl = local_tile(mA_mkl, TileShape{}, make_coord(_,_,_), Step<_1, X,_1>{});        // (BLK_M,BLK_K,m,k,l)
+    Tensor gB_nkl = local_tile(mB_nkl, TileShape{}, make_coord(_,_,_), Step< X,_1,_1>{});        // (BLK_N,BLK_K,n,k,l)
+    // The tuple arity below depends on the conversion mode; load() static_asserts the matching size.
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      return cute::make_tuple(gA_mkl, gB_nkl);
+    } 
+    else if constexpr (ModeHasScales) {
+      auto scale_k = mainloop_params.scale_k; // K extent of the scale (and zero) tensors, see make_shape below
+      Tensor mS_mkl = mainloop_params.tma_load_scale.get_tma_tensor(make_shape(M,scale_k,L));          // (m,scale_k,l)
+      Tensor gS_mkl = local_tile(mS_mkl, ScaleTileShape{}, make_coord(_,_));         // (BLK_M,BLK_Scale_K,m,scale_k,l)
+      if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+        return cute::make_tuple(gA_mkl, gB_nkl, gS_mkl);
+      }
+      else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+        Tensor mZ_mkl = mainloop_params.tma_load_zero.get_tma_tensor(make_shape(M,scale_k,L));         // (m,scale_k,l)
+        Tensor gZ_mkl = local_tile(mZ_mkl, ScaleTileShape{}, make_coord(_,_));       // (BLK_M,BLK_Scale_K,m,scale_k,l)
+        return cute::make_tuple(gA_mkl, gB_nkl, gS_mkl, gZ_mkl);
+      }
+      else {
+        static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in load_init.");
+      }
+    } 
+    else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in load_init.");
+    }
+  }  
+
+  /// Perform a collective-scoped matrix multiply-accumulate
+  /// Producer Perspective: for each k tile, acquire a pipeline stage and issue the TMA copies (gmem -> smem) that fill it.
+  /// load_inputs is the tuple built by load_init(): 2 tensors for DirectConvert, 3 with scales, 4 with scales and zeros.
+  template <
+    class... Ts,
+    class KTileIterator, class BlockCoord
+  >
+  CUTLASS_DEVICE void
+  load(
+      Params const& mainloop_params,
+      MainloopPipeline pipeline, 
+      PipelineState smem_pipe_write,
+      cute::tuple<Ts...> const& load_inputs,
+      BlockCoord const& blk_coord,
+      KTileIterator k_tile_iter, int k_tile_count,
+      int thread_idx, // not referenced in this function body
+      uint32_t block_rank_in_cluster,
+      TensorStorage& shared_tensors) {
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      static_assert(sizeof... (Ts) == 2, "Direct convert needs two inputs");
+    } 
+    else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+      static_assert(sizeof... (Ts) == 3, "Scaled convert needs three inputs");
+    } 
+    else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+      static_assert(sizeof... (Ts) == 4, "Scaled and zero convert needs four inputs");
+    } 
+    else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in TMA load.");
+    }
+
+    Tensor sA_ = make_tensor(make_smem_ptr(shared_tensors.smem_A.begin()), SmemLayoutA{});      // (BLK_M,BLK_K,PIPE)
+    Tensor sB_ = make_tensor(make_smem_ptr(shared_tensors.smem_B.begin()), SmemLayoutB{});      // (BLK_N,BLK_K,PIPE)
+    Tensor sA  = as_position_independent_swizzle_tensor(sA_);                                   // (BLK_M,BLK_K,PIPE)
+    Tensor sB  = as_position_independent_swizzle_tensor(sB_);                                   // (BLK_N,BLK_K,PIPE)
+
+    //
+    // Prepare the TMA loads for A, B and Scales
+    //
+    
+    constexpr uint32_t cluster_shape_x = get<0>(ClusterShape());
+    uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x}; // (x, y) position of this block inside the cluster
+
+    Tensor gA_mkl = get<0>(load_inputs);
+    Tensor gB_nkl = get<1>(load_inputs);
+
+    auto block_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y);
+    auto block_tma_b = mainloop_params.tma_load_b.get_slice(cluster_local_block_id.x);
+
+    // Partition the inputs based on the current block coordinates.
+    auto [m_coord, n_coord, k_coord, l_coord] = blk_coord;
+    Tensor gA = gA_mkl(_,_,m_coord,_,l_coord);                                                     // (BLK_M,BLK_K,k)
+    Tensor gB = gB_nkl(_,_,n_coord,_,l_coord);                                                     // (BLK_N,BLK_K,k)
+
+    // Applies the mapping from block_tma_a
+    Tensor tAgA = block_tma_a.partition_S(gA);                                                 // (TMA,TMA_M,TMA_K,k)
+    Tensor tAsA = block_tma_a.partition_D(sA);                                              // (TMA,TMA_M,TMA_K,PIPE)
+
+    Tensor tBgB = block_tma_b.partition_S(gB);                                                 // (TMA,TMA_N,TMA_K,k)
+    Tensor tBsB = block_tma_b.partition_D(sB);                                              // (TMA,TMA_N,TMA_K,PIPE)
+
+    uint16_t mcast_mask_a = 0;
+    uint16_t mcast_mask_b = 0;
+    uint16_t mcast_mask_s = 0; // NOTE(review): never modified below, so scale / zero copies always carry an empty multicast mask -- confirm intended
+
+    // Issue TmaLoads
+    // Maps the tile -> block, value
+    if constexpr (cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>) {
+      auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};                       // (m,n) -> block_id
+      for (int n = 0; n < size<1>(block_layout); ++n) { // one mask bit per block sharing this block's cluster x coordinate
+        mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{}));
+      }
+    }
+
+    if constexpr (cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>) {
+      auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};                       // (m,n) -> block_id
+      for (int m = 0; m < size<0>(block_layout); ++m) { // one mask bit per block sharing this block's cluster y coordinate
+        mcast_mask_b |= (uint16_t(1) << block_layout(m,cluster_local_block_id.y,Int<0>{}));
+      }
+    }
+
+    auto extra_input_partitions = partition_extra_tma_inputs(mainloop_params, load_inputs, shared_tensors, cluster_local_block_id, m_coord, l_coord);
+
+    // Mainloop
+    CUTLASS_PRAGMA_NO_UNROLL
+    for ( ; k_tile_count > 0; --k_tile_count) {
+      // LOCK smem_pipe_write for _writing_
+      pipeline.producer_acquire(smem_pipe_write);
+
+      //
+      // Copy gmem to smem for *k_tile_iter
+      //
+
+      using BarrierType = typename MainloopPipeline::ProducerBarrierType;
+      BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write);
+
+      int write_stage = smem_pipe_write.index();
+      if (cute::elect_one_sync()) {
+        copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage));
+        copy(mainloop_params.tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage));
+      }
+
+      if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+        // Nothing extra to do.
+      }
+      else if constexpr (ModeHasScales) {
+        auto tSgS = get<0>(extra_input_partitions);
+        auto tSsS = get<1>(extra_input_partitions);
+
+        // Index of the scale k tile to fetch for this k tile. The scale copy below is issued on every iteration
+        // (rather than only when the index changes) so we don't modify tma transaction bytes on the fly.
+        // NOTE(review): this is a plain (floor) integer division; the ceiling divide needed to handle group_size == K
+        // (where K need not be a multiple of the threadblock tile K) presumably happens where reload_factor is computed -- confirm.
+        int const scale_load_k = *k_tile_iter / mainloop_params.reload_factor; // This will always be 0 when group_size == K.
+        if (cute::elect_one_sync()) copy(mainloop_params.tma_load_scale.with(*tma_barrier, mcast_mask_s), tSgS(_,_,_,scale_load_k), tSsS(_,_,_,write_stage));
+
+        if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+          // Nothing extra to do
+        } 
+        else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+          auto tZgZ = get<2>(extra_input_partitions);
+          auto tZsZ = get<3>(extra_input_partitions);
+          if (cute::elect_one_sync()) copy(mainloop_params.tma_load_zero.with(*tma_barrier, mcast_mask_s), tZgZ(_,_,_,scale_load_k), tZsZ(_,_,_,write_stage));
+        }
+        else {
+          static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled for TMA copy op.");
+        } 
+      } 
+      else {
+        static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled for TMA copy op.");
+      }
+
+      ++k_tile_iter;
+
+      // Advance smem_pipe_write
+      ++smem_pipe_write;
+    }
+  }
+
+  /// Producer epilogue: prevents early exit of blocks in a Cluster.
+  CUTLASS_DEVICE void
+  load_tail(MainloopPipeline pipeline, PipelineState smem_pipe_write) {
+    // Only the thread picked by elect_one_sync issues the epilogue waits.
+    if (!cute::elect_one_sync()) {
+      return;
+    }
+    // This helps avoid early exit of blocks in the Cluster: it waits for
+    // all stages to either be released (all Consumer UNLOCKs), or, if a
+    // stage was never used, it is simply acquired since its phase is
+    // still inverted from make_producer_start_state.
+    pipeline.producer_tail(smem_pipe_write);
+    // Nothing further to do for the producer.
+  }
+
+  /// Perform a collective-scoped matrix multiply-accumulate into accum over k_tile_count k tiles.
+  /// Consumer Perspective: A is copied smem -> registers and transformed per k block, B is read by the GMMA from smem.
+  template <
+    class FrgTensorC
+  >
+  CUTLASS_DEVICE void
+  mma(MainloopPipeline pipeline,
+      PipelineState smem_pipe_read,
+      FrgTensorC& accum,
+      int k_tile_count,
+      int thread_idx,
+      TensorStorage& shared_tensors,
+      Params const& mainloop_params) {
+    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
+    static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3.");
+    static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3.");
+    static_assert(cute::rank(InternalSmemLayoutAtomA{}) == 2, "InternalSmemLayoutAtomA must be rank 2.");
+    static_assert(cute::rank(InternalSmemLayoutAtomB{}) == 2, "InternalSmemLayoutAtomB must be rank 2.");
+    static_assert(!cute::is_void_v<InternalSmemCopyAtomA>,
+      "SM90 GMMA mainloops must specify a non-void copy atom for RF sourced instructions.");
+    static_assert(cute::is_void_v<InternalSmemCopyAtomB>,
+      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
+
+    // Obtain warp index
+    int warp_idx = canonical_warp_idx_sync(); // NOTE(review): not referenced again in this function
+    [[maybe_unused]] int warp_group_thread_idx = thread_idx % 128;
+    
+    Tensor sA_ = make_tensor(make_smem_ptr(shared_tensors.smem_A.begin()), SmemLayoutA{});        // (BLK_M,BLK_K,PIPE)
+    Tensor sA = as_position_independent_swizzle_tensor(sA_);                                      // (BLK_M,BLK_K,PIPE)
+    
+    Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.begin()), SmemLayoutB{});         // (BLK_N,BLK_K,PIPE)
+
+    //
+    // Define C accumulators and A/B partitioning
+    //
+
+    // Layout of warp group to thread mapping
+
+    static_assert(stride<0>(typename TiledMma::BLayout{}) == 0 and
+                  size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup, 
+                  "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
+
+    constexpr int MmaWarpGroups = size(TiledMma{}) / NumThreadsPerWarpGroup;
+    Layout warp_group_thread_layout = make_layout(Int<MmaWarpGroups>{}, 
+                                                  Int<NumThreadsPerWarpGroup>{});
+
+    int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / NumThreadsPerWarpGroup, 0); // broadcast lane 0's value to the whole warp
+
+    TiledMma tiled_mma;
+    auto mma_thread_slice = tiled_mma.get_thread_slice(thread_idx);
+    Tensor tCsA = mma_thread_slice.partition_A(sA);
+    auto mma_warpgroup_slice = tiled_mma.get_slice(warp_group_thread_layout(warp_group_idx));
+
+    // Allocate fragments and descriptors
+    Tensor tCrA_mma = mma_thread_slice.partition_fragment_A(sA(_,_,Int<0>{}));                // (MMA,MMA_M,MMA_K)
+    Tensor tCrA_load = make_fragment_like<RealInternalElementA>(tCrA_mma);
+    
+    Tensor tCsB = mma_warpgroup_slice.partition_B(sB);                                        // (MMA,MMA_N,MMA_K,PIPE)
+    Tensor tCrB = mma_warpgroup_slice.make_fragment_B(tCsB);                                  // (MMA,MMA_N,MMA_K,PIPE)
+
+    //
+    // Copy Atom A retiling
+    //
+    auto smem_tiled_copy_A = make_tiled_copy_A(InternalSmemCopyAtomA{}, tiled_mma);
+    auto smem_thr_copy_A   = smem_tiled_copy_A.get_thread_slice(warp_group_thread_idx);
+
+    Tensor tCrA_copy_view  = smem_thr_copy_A.retile_D(tCrA_load);                                  // (CPY,CPY_M,CPY_K)
+
+    // Partition of thread -> shared and thread -> RF
+    auto partitioned_extra_info = partition_extra_mma_info(mma_thread_slice, shared_tensors);
+    auto copy_partitions_extra_info = retile_extra_mma_info(tiled_mma, partitioned_extra_info, warp_group_thread_idx);
+
+    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(tCrA_copy_view));                                            // CPY_M
+    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCrA_copy_view));                                            // CPY_K
+    CUTE_STATIC_ASSERT_V(size<1>(tCrA_mma) == size<1>(accum));                                                 // MMA_M
+    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum));                                                         // N
+    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB));                                                          // K
+    CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB));                                                       // PIPE
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));                                         // PIPE
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));                                         // PIPE
+
+    //
+    // PIPELINED MAIN LOOP
+    //
+
+    // We release buffers to producer warps(dma load) with some mmas in flight
+    PipelineState smem_pipe_release = smem_pipe_read;
+
+    tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
+
+    warpgroup_fence_operand(accum);
+
+    constexpr int K_BLOCK_MAX = size<2>(tCrA_load);
+    
+    ConsumerToken barrier_token = {BarrierStatus::WaitAgain};
+    // First k tile (prologue): issues its GMMAs without releasing any stage, then stages A for the next tile if there is one.
+    {
+      barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+      pipeline.consumer_wait(smem_pipe_read, barrier_token);
+
+      int read_stage = smem_pipe_read.index();
+
+      ++smem_pipe_read;
+      barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+
+      // copy smem->rmem for A operand
+      copy_A_and_extra_info(smem_tiled_copy_A, tCsA, tCrA_copy_view, 
+        partitioned_extra_info, copy_partitions_extra_info, 0, read_stage);
+      if (K_BLOCK_MAX > 1) { // prefetch next block
+        copy_A_and_extra_info(smem_tiled_copy_A, tCsA, tCrA_copy_view, 
+          partitioned_extra_info, copy_partitions_extra_info, 1, read_stage);
+      }
+      transform_A_kblock(tCrA_load, tCrA_mma, partitioned_extra_info, 0);
+      
+      // Unroll the K mode manually to set scale D to 1
+      CUTLASS_PRAGMA_UNROLL
+      for (int k_block = 0; k_block < K_BLOCK_MAX; ++k_block) {
+        warpgroup_arrive();
+        // (V,M) x (V,N) => (V,M,N)
+        cute::gemm(tiled_mma, tCrA_mma(_,_,k_block), tCrB(_,_,k_block,read_stage), accum);
+        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+        warpgroup_commit_batch();
+
+        if (k_block < K_BLOCK_MAX - 2) { // prefetch next block
+          copy_A_and_extra_info(smem_tiled_copy_A, tCsA, tCrA_copy_view, 
+            partitioned_extra_info, copy_partitions_extra_info, k_block + 2, read_stage);
+        }
+        if (k_block < K_BLOCK_MAX - 1) {
+          transform_A_kblock(tCrA_load, tCrA_mma, partitioned_extra_info, k_block + 1);
+        }
+      }     
+
+      --k_tile_count;
+      if (k_tile_count > 0) {
+        // Wait for the next k tile's stage to be filled, then start copying its A operand (and scales) into registers.
+        pipeline.consumer_wait(smem_pipe_read, barrier_token);
+        copy_A_and_extra_info(smem_tiled_copy_A, tCsA, tCrA_copy_view, 
+          partitioned_extra_info, copy_partitions_extra_info, 0, smem_pipe_read.index());
+        if (K_BLOCK_MAX > 1) { // prefetch next block
+          copy_A_and_extra_info(smem_tiled_copy_A, tCsA, tCrA_copy_view, 
+            partitioned_extra_info, copy_partitions_extra_info, 1, smem_pipe_read.index());
+        }
+        warpgroup_wait<K_BLOCK_MAX - 1>(); // Wait for K_BLOCK_MAX - 1 to be in flight to ensure that it is safe to overwrite the A registers for the first mma.
+        transform_A_kblock(tCrA_load, tCrA_mma, partitioned_extra_info, 0);
+      }
+    }
+
+    if (k_tile_count == 0) {
+      return;
+    }
+
+    warpgroup_fence_operand(accum);
+    // Mainloop GMMAs: every k tile except the first (handled above) and the last (handled below)
+    CUTLASS_PRAGMA_NO_UNROLL
+    for ( ; k_tile_count > 1; --k_tile_count) {
+
+      //
+      // Compute on k_tile
+      //
+
+      int read_stage = smem_pipe_read.index();
+      ++smem_pipe_read;
+
+      warpgroup_fence_operand(accum);
+      // Unroll the K mode manually to set scale D to 1
+      CUTLASS_PRAGMA_UNROLL
+      for (int k_block = 0; k_block < K_BLOCK_MAX; ++k_block) {
+        
+        warpgroup_arrive();
+        // (V,M) x (V,N) => (V,M,N)
+        cute::gemm(tiled_mma, tCrA_mma(_,_,k_block), tCrB(_,_,k_block,read_stage), accum);
+        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+        warpgroup_commit_batch();
+
+        warpgroup_wait<K_BLOCK_MAX - 1>(); // We have K_BLOCK_MAX - 1 GMMA instructions pending for this stage, so we can release prior barrier
+        if (k_block == K_BLOCK_MAX - 1) {
+          pipeline.consumer_release(smem_pipe_release);             // UNLOCK smem_pipe_release, done _computing_ on it
+          ++smem_pipe_release;
+        }
+
+        if (k_block == 0) {
+          barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+        }
+
+        if (k_block == K_BLOCK_MAX - 1) { 
+          pipeline.consumer_wait(smem_pipe_read, barrier_token);
+          copy_A_and_extra_info(smem_tiled_copy_A, tCsA, tCrA_copy_view, 
+            partitioned_extra_info, copy_partitions_extra_info, 0, smem_pipe_read.index());
+          if (K_BLOCK_MAX > 1) { // prefetch next block
+            copy_A_and_extra_info(smem_tiled_copy_A, tCsA, tCrA_copy_view, 
+              partitioned_extra_info, copy_partitions_extra_info, 1, smem_pipe_read.index());
+          }
+          transform_A_kblock(tCrA_load, tCrA_mma, partitioned_extra_info, 0);
+        } 
+        else {
+          if (k_block < K_BLOCK_MAX - 2) { // prefetch next block
+            copy_A_and_extra_info(smem_tiled_copy_A, tCsA, tCrA_copy_view, 
+              partitioned_extra_info, copy_partitions_extra_info, k_block + 2, read_stage);
+          }
+          transform_A_kblock(tCrA_load, tCrA_mma, partitioned_extra_info, k_block + 1);
+        }
+      }
+      warpgroup_fence_operand(accum);
+
+    }
+
+    warpgroup_fence_operand(accum);
+
+    {
+      //
+      // Compute on the last k_tile: there is no following tile to wait for or prefetch
+      //
+
+      int read_stage = smem_pipe_read.index();
+
+      warpgroup_fence_operand(accum);
+      
+      // Unroll the K mode manually to set scale D to 1
+      CUTLASS_PRAGMA_UNROLL
+      for (int k_block = 0; k_block < K_BLOCK_MAX; ++k_block) {
+
+        warpgroup_arrive();
+        // (V,M) x (V,N) => (V,M,N)
+        cute::gemm(tiled_mma, tCrA_mma(_,_,k_block), tCrB(_,_,k_block,read_stage), accum);
+        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+        warpgroup_commit_batch();
+
+        warpgroup_wait<K_BLOCK_MAX - 1>();
+        if (k_block == K_BLOCK_MAX - 1) { // release prior barrier
+          pipeline.consumer_release(smem_pipe_release);             // UNLOCK smem_pipe_release, done _computing_ on it
+          ++smem_pipe_release;
+        }
+
+        if (k_block < K_BLOCK_MAX - 2) { // prefetch next block
+          copy_A_and_extra_info(smem_tiled_copy_A, tCsA, tCrA_copy_view, 
+            partitioned_extra_info, copy_partitions_extra_info, k_block + 2, read_stage);
+        }
+        if (k_block < K_BLOCK_MAX - 1) {
+          copy_A_and_extra_info(smem_tiled_copy_A, tCsA, tCrA_copy_view, // NOTE(review): block k_block + 1 appears to have been copied already by an earlier prefetch; this re-copy looks redundant -- confirm
+            partitioned_extra_info, copy_partitions_extra_info, k_block + 1, read_stage);
+          transform_A_kblock(tCrA_load, tCrA_mma, partitioned_extra_info, k_block + 1);
+        }
+      }
+    }
+
+    warpgroup_fence_operand(accum);
+  }
+  
+  /// Consumer epilogue: releases the pipeline stage(s) that mma() did not release itself.
+  CUTLASS_DEVICE void
+  mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) {
+    // Prologue GMMAs: number of stages still held once mma() has returned.
+    constexpr int PrologueMmaCount = 1;
+
+    // Skip over the stages that were already released inside mma().
+    smem_pipe_release.advance(k_tile_count - PrologueMmaCount);
+
+    // Wait on all GMMAs to complete
+    warpgroup_wait<0>();
+
+    for (int pending = PrologueMmaCount; pending > 0; --pending) {
+      pipeline.consumer_release(smem_pipe_release);                 // UNLOCK smem_pipe_release, done _computing_ on it
+      ++smem_pipe_release;
+    }
+  }
+
+private:
+  /// Partitions the additional TMA inputs (scale, and zero when present) used by load(); A and B are handled by load() itself.
+  template <class... Ts>
+  CUTLASS_DEVICE
+  auto partition_extra_tma_inputs(
+    Params const& mainloop_params,
+    cute::tuple<Ts...> const& load_inputs,
+    TensorStorage& shared_tensors,
+    uint2 const& cluster_local_block_id,
+    int const m_coord, 
+    int const l_coord) {
+    // Returns () for DirectConvert, (tSgS, tSsS) for ConvertAndScale, (tSgS, tSsS, tZgZ, tZsZ) for ConvertAndScaleWithZero.
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      return cute::make_tuple();
+    } 
+    else if constexpr (ModeHasScales) {
+      Tensor sS  = make_tensor(make_smem_ptr(shared_tensors.smem_scale.begin()), SmemLayoutScale{}); // (BLK_M,BLK_K,PIPE)
+      Tensor gS_mkl = get<2>(load_inputs); // third element of the load_init() tuple
+      auto block_tma_s = mainloop_params.tma_load_scale.get_slice(cluster_local_block_id.y); // same slice coordinate load() uses for A
+      Tensor gS = gS_mkl(_,_,m_coord,_,l_coord);                                                  // (BLK_M,BLK_K,k)
+
+      Tensor tSgS = block_tma_s.partition_S(gS);                                              // (TMA,TMA_M,TMA_K,k)
+      Tensor tSsS = block_tma_s.partition_D(sS);                                              // (TMA,TMA_M,TMA_K,PIPE)
+      if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+        return cute::make_tuple(tSgS, tSsS);
+      } 
+      else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+        Tensor sZ  = make_tensor(make_smem_ptr(shared_tensors.smem_zero.begin()), SmemLayoutScale{}); // (BLK_M,BLK_K,PIPE)
+        Tensor gZ_mkl = get<3>(load_inputs); // fourth element of the load_init() tuple
+        auto block_tma_z = mainloop_params.tma_load_zero.get_slice(cluster_local_block_id.y);
+        Tensor gZ = gZ_mkl(_,_,m_coord,_,l_coord);                                            // (BLK_M,BLK_K,k)
+
+        Tensor tZgZ = block_tma_z.partition_S(gZ);                                            // (TMA,TMA_M,TMA_K,k)
+        Tensor tZsZ = block_tma_z.partition_D(sZ);                                            // (TMA,TMA_M,TMA_K,PIPE)
+        return cute::make_tuple(tSgS, tSsS, tZgZ, tZsZ);          
+      }
+      else {
+        static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled for input partitioning.");      
+      }
+    }
+    else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled for input partitioning.");      
+    }
+  }
+
+  /// Partitions the extra inputs (scales / zeros) for loading from smem in the mainloop; returns smem views and register fragments.
+  template <class ThreadMma>
+  CUTLASS_DEVICE 
+  auto partition_extra_mma_info(
+    ThreadMma const& mma_thread_slice,
+    TensorStorage& shared_tensors) {
+    // Returns () | (tCsS, tCrS_neg, tCrS_pos) | (tCsS, tCrS) | (tCsS, tCrS, tCsZ, tCrZ) depending on the mode handled below.
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      // nothing to do
+      return cute::make_tuple();
+    }
+    else if constexpr (UseScaleLookupTable) {
+      // Only ConvertAndScale is handled on the lookup-table path; fail loudly instead of silently deducing a void return.
+      static_assert(KernelConversionMode == ConversionMode::ConvertAndScale, "Scale lookup table requires the ConvertAndScale conversion mode.");
+      Tensor sS = make_tensor(make_smem_ptr(shared_tensors.smem_scale.begin()), SmemLayoutScale{});// (BLK_M,BLK_SCALE_K,PIPE)
+      Tensor tCsS = mma_thread_slice.partition_A(sS);
+      Tensor tCrS_neg = make_tensor<ElementScale>(mma_thread_slice.partition_fragment_A(sS(_,_,Int<0>{})).layout()); 
+      Tensor tCrS_pos = make_tensor<ElementScale>(mma_thread_slice.partition_fragment_A(sS(_,_,Int<0>{})).layout()); 
+
+      return cute::make_tuple(tCsS, tCrS_neg, tCrS_pos);
+    }
+    else if constexpr (ModeHasScales) {
+      Tensor sS = make_tensor(make_smem_ptr(shared_tensors.smem_scale.begin()), SmemLayoutScale{});// (BLK_M,BLK_SCALE_K,PIPE)
+      Tensor tCsS = mma_thread_slice.partition_A(sS);
+      Tensor tCrS = make_tensor<ElementScale>(mma_thread_slice.partition_fragment_A(sS(_,_,Int<0>{})).layout()); 
+
+      if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+        return cute::make_tuple(tCsS, tCrS);
+      }
+      else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+        Tensor sZ = make_tensor(make_smem_ptr(shared_tensors.smem_zero.begin()), SmemLayoutScale{});// (BLK_M,BLK_SCALE_K,PIPE)
+        Tensor tCsZ = mma_thread_slice.partition_A(sZ);
+        Tensor tCrZ = make_tensor<ElementZero>(mma_thread_slice.partition_fragment_A(sZ(_,_,Int<0>{})).layout()); 
+        return cute::make_tuple(tCsS, tCrS, tCsZ, tCrZ);
+      }
+      else {
+        static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in A -> RF path.");
+      }
+    } 
+    else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in A -> RF path.");
+    }
+  }
+
+  /// Returns the tiled copy and the copy views for the extra inputs: () | (tiled_copy_S, tCrS_copy_view) | (tiled_copy_S, tCrS_copy_view, tCrZ_copy_view).
+  template <class TiledMma, class... Ts>
+  CUTLASS_DEVICE
+  auto retile_extra_mma_info(
+    TiledMma const& tiled_mma,
+    cute::tuple<Ts...>& partitioned_extra_info,
+    int const warp_group_thread_idx) {
+    // partitioned_extra_info is the tuple produced by partition_extra_mma_info(); elements 1 and 3 are its register fragments.
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      // nothing to do
+      return cute::make_tuple();
+    }
+    else if constexpr (ModeHasScales) {
+      auto smem_tiled_copy_S = make_tiled_copy_A(SmemCopyAtomScale{}, tiled_mma);
+      auto smem_thr_copy_S   = smem_tiled_copy_S.get_thread_slice(warp_group_thread_idx);
+      Tensor tCrS_copy_view  = smem_thr_copy_S.retile_D(cute::get<1>(partitioned_extra_info));        // (CPY,CPY_M,CPY_K)
+      // Element 1 is tCrS, or tCrS_neg when partition_extra_mma_info() took its lookup-table branch.
+      if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+        return cute::make_tuple(smem_tiled_copy_S, tCrS_copy_view);
+      } 
+      else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+        Tensor tCrZ_copy_view  = smem_thr_copy_S.retile_D(cute::get<3>(partitioned_extra_info));      // (CPY,CPY_M,CPY_K)
+        return cute::make_tuple(smem_tiled_copy_S, tCrS_copy_view, tCrZ_copy_view);
+      } 
+      else {
+        static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in A -> RF path.");
+      }
+    } 
+    else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in A -> RF path.");
+    }
+  }
+
+  /// Copies k block `k_block` of A from smem stage `read_stage` to RF; when k_block == 0 it also copies the scale (and zero).
+  template <class SmemTiledCopyA,
+            class TensorASmemView,
+            class TensorACopyView,
+            class... Ts,
+            class... Us
+            >
+  CUTLASS_DEVICE
+  void copy_A_and_extra_info(
+    SmemTiledCopyA const& smem_tiled_copy_A,
+    TensorASmemView const& tCsA,
+    TensorACopyView& tCrA_copy_view,
+    cute::tuple<Ts...> const& partitioned_mma_extra_info,
+    cute::tuple<Us...> const& tiled_copy_and_views,
+    int k_block,
+    int read_stage) {
+    // partitioned_mma_extra_info / tiled_copy_and_views are indexed like the tuples built by partition_extra_mma_info() / retile_extra_mma_info().
+    copy(smem_tiled_copy_A, tCsA(_,_,k_block,read_stage), tCrA_copy_view(_,_,k_block));
+
+    if (k_block == 0) {
+      // We are starting a new k-tile so copy the scale
+      if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+        // nothing to do
+      } 
+      else if constexpr (ModeHasScales) {
+        auto smem_tiled_copy_S = cute::get<0>(tiled_copy_and_views);
+        auto tCrS_copy_view    = cute::get<1>(tiled_copy_and_views);
+        auto tCsS              = cute::get<0>(partitioned_mma_extra_info);
+        copy(smem_tiled_copy_S, tCsS(_,_,k_block,read_stage), tCrS_copy_view(_,_,k_block)); // k_block is 0 here
+        if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+          // Nothing extra to do
+        } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+          auto tCsZ              = cute::get<2>(partitioned_mma_extra_info);
+          auto tCrZ_copy_view    = cute::get<2>(tiled_copy_and_views);
+          copy(smem_tiled_copy_S, tCsZ(_,_,k_block,read_stage), tCrZ_copy_view(_,_,k_block)); // zeros reuse the scale tiled copy
+        } else {
+          static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in A -> RF path.");         
+        }
+      } 
+      else {
+        static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in A -> RF path.");
+      }
+    }
+  }
+  
+  // Naive packing policy for conversion: value() is Int<P>, where P is the gcd of Cosize and
+  // the number of elements of the narrower of SrcType / DstType that fit in 32 bits.
+  template <class SrcType, class DstType, int Cosize>
+  struct select_packing {
+    static constexpr auto value() {
+      constexpr auto NarrowestBits = cute::min(sizeof_bits_v<SrcType>, sizeof_bits_v<DstType>);
+      return Int<cute::gcd(Cosize, 32 / NarrowestBits)>{};
+    }
+  };
+
+  CUTLASS_DEVICE  // 4 x int4: reads the array's storage as 16 bits and zero-extends it to a 32-bit register value
+  static uint32_t to_reg(Array<cutlass::int4b_t, 4> const& source) {
+    uint16_t const& packed = reinterpret_cast<uint16_t const&>(source);
+    return static_cast<uint32_t>(packed);
+  }
+  CUTLASS_DEVICE  // 8 x int4: reads the array's storage directly as a 32-bit register value
+  static uint32_t to_reg(Array<cutlass::int4b_t, 8> const& source) {
+    return reinterpret_cast<uint32_t const&>(source);
+  }
+  // The core converter: uses lookup tables to convert N int4 values into N 8-bit values (returned as RealInternalElementB).
+  template <class TensorNeg,
+            class TensorPos,
+            int N>
+  CUTLASS_DEVICE
+  static Array<RealInternalElementB, N> lookup_table_convert(
+    cute::Int<N> _,                             // tag that carries N; its value is never read
+    Array<cutlass::int4b_t, N> const& source,   // N packed int4 inputs
+    TensorNeg const& scale_neg,                 // table used for inputs whose sign bit (bit 3) is set
+    TensorPos const& scale_pos,                 // table used for inputs whose sign bit is clear
+    int scale_idx) {                            // element index of the first 8-byte table inside scale_neg / scale_pos
+    // Each group of 4 inputs is handled by one loop iteration below, so N must be a multiple of 4 that fits in one register.
+    static_assert(N == 4 || N == 8, "lookup_table_convert handles 4 or 8 int4 values per call.");
+    uint32_t res[N / 4];
+
+    // View the input as reg
+    uint32_t reg = to_reg(source);
+
+    // Determines if to get from the negative or positive candidates
+    static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa; // lop3 truth table for (a & b) | c
+    uint32_t sign; // ((reg & 0x88888888) | 0x64206420) >> 1 
+    asm volatile(
+      "{\n"
+      "  lop3.b32 %0, %1, %2, %3, %4;\n" \
+      "}\n"
+      : "=r"(sign)
+      : "r"(reg), "n"(0x88888888), "n"(0x64206420), "n"(immLut)
+    );
+    sign = sign >> 1;
+    // After the shift, nibble j of each 16-bit half equals j + 4 * (sign bit of input j): a byte selector for the final prmt.
+    // Ignore sign bit when indexing into LUT
+    uint32_t lut_idx = reg & 0x77777777;
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 4; ++i, lut_idx >>=16, sign >>=16) {
+      Array<uint32_t, 2> const& _scale_neg = reinterpret_cast<Array<uint32_t, 2> const&>(scale_neg[scale_idx + i * 4]);
+      Array<uint32_t, 2> const& _scale_pos = reinterpret_cast<Array<uint32_t, 2> const&>(scale_pos[scale_idx + i * 4]);
+      asm volatile(
+        "{\n"
+        "  .reg .b32 pos, neg                    ;\n" \
+        "  prmt .b32 neg, %3, %4, %1             ;\n" \
+        "  prmt .b32 pos, %5, %6, %1             ;\n" \
+        "  prmt .b32 %0, pos, neg, %2            ;\n" \
+        "}\n"
+        : "=r"(res[i])
+        : "r"(lut_idx), "r"(sign), "r"(_scale_neg[0]), "r"(_scale_neg[1]), "r"(_scale_pos[0]), "r"(_scale_pos[1])
+      );
+    }
+    return reinterpret_cast<Array<RealInternalElementB, N>&>(res);
+  }
+
+  // Compile-time check that mode 0 of a (flattened) scale layout has stride 0 and extent of at least 4.
+  template <class Layout>
+  CUTLASS_DEVICE static void static_check_scale(Layout const& tensor) {
+    static_assert(stride<0>(Layout{}) == 0 && shape<0>(Layout{}) >= 4, "At least 4 adjacent weights in a thread must share the same scale.");
+  }
+  // Tensor overload: flattens the tensor's layout type and forwards it to the layout-based check.
+  template <class Engine, class Layout>
+  CUTLASS_DEVICE static void static_check_scale(Tensor<Engine, Layout> const& tensor) {
+    auto const flat_layout = flatten(Layout{});
+    static_check_scale(flat_layout);
+  }
+
+  /// Converts the k_block slice of A from tCrA_load into tCrA_mma, applying the scale / zero handling selected by KernelConversionMode.
+  template <class EngineIn,
+            class EngineOut, 
+            class LayoutIn,
+            class LayoutOut,
+            class... Ts>
+  CUTLASS_DEVICE
+  void transform_A_kblock(
+    Tensor<EngineIn, LayoutIn> const& tCrA_load, 
+    Tensor<EngineOut, LayoutOut>& tCrA_mma,
+    cute::tuple<Ts...> const& partitioned_extra_info,
+    int const k_block) {
+    // Both tensors must live in registers, be compact (size == cosize) and have equal cosize.
+    static_assert(is_rmem<EngineIn>::value, "Input tensor for A conversion must come from registers");
+    static_assert(is_rmem<EngineOut>::value, "Output tensor for A conversion must come from registers");
+    static_assert(cosize_v<LayoutIn> == cosize_v<LayoutOut>);
+    static_assert(size_v<LayoutIn> == cosize_v<LayoutIn>);
+    static_assert(size_v<LayoutOut> == cosize_v<LayoutOut>);
+    using SrcType = typename EngineIn::value_type;
+    using DstType = typename EngineOut::value_type;
+
+    auto const& src = tCrA_load(_, _, k_block);
+    auto const& dst = tCrA_mma(_, _, k_block);
+    auto pSrc = raw_pointer_cast(src.data());
+    auto pDst = const_cast<DstType*>(raw_pointer_cast(dst.data()));
+    constexpr int num_elements = decltype(size(src))::value;
+    // DirectConvert: element-type conversion only, `pack` elements per converter call.
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      constexpr int pack = decltype(select_packing<SrcType, DstType, num_elements>::value())::value;
+      using Converter = cutlass::NumericArrayConverter<DstType, SrcType, pack, cutlass::FloatRoundStyle::round_to_nearest>;
+      using SrcArray = cutlass::Array<SrcType, pack>;
+      using DstArray = cutlass::Array<DstType, pack>;
+      constexpr int iters = num_elements / pack;
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < iters; ++i) {
+        SrcArray const* pSrcArr = reinterpret_cast<SrcArray const*>(pSrc) + i;
+        DstArray* pDstArr = reinterpret_cast<DstArray*>(pDst) + i;
+        *pDstArr = Converter::convert(*pSrcArr);
+      }
+    } 
+    else if constexpr (UseScaleLookupTable) {
+      static_assert(is_same_v<RealInternalElementA, cutlass::int4b_t>, "Lookup table only supports int4 being the quant type now.");
+      static_assert(sizeof_bits_v<ElementScale> == 64, "Lookup table only supports 8 8bit scale values now.");
+      static_assert(num_elements % 4 == 0 && num_elements >= 4, "Lookup table requires a vector size of 4x when converting.");
+      constexpr int pack = num_elements % 8 == 0? 8 : 4;
+      constexpr int iters = num_elements / pack;
+      using SrcArray = cutlass::Array<SrcType, pack>;
+      using DstArray = cutlass::Array<DstType, pack>;
+      // Tuple slots 1 and 2 hold the negative / positive scale tables (slot 2 is written below when k_block == 0).
+      auto const& tCrS_neg = cute::get<1>(partitioned_extra_info);
+      auto const& tCrS_pos = cute::get<2>(partitioned_extra_info);
+      auto const& scale_neg = tCrS_neg(_, _, k_block);
+      auto const& scale_pos = tCrS_pos(_, _, k_block);
+      CUTE_STATIC_ASSERT_V(size(src) == size(scale_neg));
+      // Compile-time requirement: at least 4 adjacent weights in a thread share the same scale entry.
+      static_check_scale(scale_neg);
+      static_check_scale(scale_pos);
+      if (k_block == 0) {  // on the first k-block, derive the whole positive table from the negative one
+        auto pNeg = raw_pointer_cast(tCrS_neg.data());
+        auto pPos = const_cast<ElementScale*>(raw_pointer_cast(tCrS_pos.data()));
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < cosize(tCrS_neg.layout()); ++i)
+        {
+          // pPos[i] = pNeg[i] & 0x7F7F7F7F7F7F7F00; (performed below as two 32-bit ANDs on the low and high words)
+          cutlass::Array<uint32_t, 2> const& _scale_neg = reinterpret_cast<cutlass::Array<uint32_t, 2> const&>(pNeg[i]);
+          cutlass::Array<uint32_t, 2> & _scale_pos = reinterpret_cast<cutlass::Array<uint32_t, 2> &>(pPos[i]);
+          asm volatile(
+              "{\n"
+              "  and  .b32 %0, %2, %4             ;\n" \
+              "  and  .b32 %1, %3, %5             ;\n" \
+              "}\n"
+              : "=r"(_scale_pos[0]), "=r"(_scale_pos[1])
+              : "r"(_scale_neg[0]), "r"(_scale_neg[1]), "n"(0x7F7F7F00), "n"(0x7F7F7F7F)
+              );
+        }
+      }
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < iters; i ++) {
+        SrcArray const* pSrcArr = reinterpret_cast<SrcArray const*>(raw_pointer_cast(src.data())) + i;
+        DstArray* pDstArr = reinterpret_cast<DstArray*>(raw_pointer_cast(dst.data())) + i;
+        // Convert `pack` values at once; i * pack is passed as the starting scale index.
+        *pDstArr = lookup_table_convert(Int<pack>{}, *pSrcArr, scale_neg, scale_pos, i * pack);
+      }
+    }
+    else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+      auto const& scales = cute::get<1>(partitioned_extra_info)(_, _, k_block);
+      CUTE_STATIC_ASSERT_V(size(src) == size(scales));
+      // If DstType equals ElementScale, scale in place after converting; otherwise stage through ElementScale and convert again.
+      if constexpr (is_same_v<DstType, ElementScale>) {
+        constexpr int pack = decltype(select_packing<SrcType, DstType, num_elements>::value())::value;
+        using Converter = cutlass::NumericArrayConverter<DstType, SrcType, pack, cutlass::FloatRoundStyle::round_to_nearest>;
+        using SrcArray = cutlass::Array<SrcType, pack>;
+        using DstArray = cutlass::Array<DstType, pack>;
+        constexpr int iters = num_elements / pack;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < iters; ++i) {
+          SrcArray const* pSrcArr = reinterpret_cast<SrcArray const*>(pSrc) + i;
+          DstArray* pDstArr = reinterpret_cast<DstArray*>(pDst) + i;
+          *pDstArr = Converter::convert(*pSrcArr);
+          CUTLASS_PRAGMA_UNROLL
+          for (int j = 0; j < pack; ++j) {
+            (*pDstArr)[j] = (*pDstArr)[j] * scales[i*pack + j];
+          }
+        }
+      }
+      else {
+        constexpr int pack1 = decltype(select_packing<SrcType, ElementScale, num_elements>::value())::value;
+        constexpr int pack2 = decltype(select_packing<ElementScale, DstType, num_elements>::value())::value;
+        constexpr int pack = cute::gcd(pack1, pack2);
+        using Converter1 = cutlass::NumericArrayConverter<ElementScale, SrcType, pack, cutlass::FloatRoundStyle::round_to_nearest>;
+        using Converter2 = cutlass::NumericArrayConverter<DstType, ElementScale, pack, cutlass::FloatRoundStyle::round_to_nearest>;
+        using SrcArray = cutlass::Array<SrcType, pack>;
+        using DstArray = cutlass::Array<DstType, pack>;
+        using StageArray = cutlass::Array<ElementScale, pack>;
+        constexpr int iters = num_elements / pack;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < iters; ++i) {
+          SrcArray const* pSrcArr = reinterpret_cast<SrcArray const*>(pSrc) + i;
+          DstArray* pDstArr = reinterpret_cast<DstArray*>(pDst) + i;
+          StageArray stageArr;
+          stageArr = Converter1::convert(*pSrcArr);
+          CUTLASS_PRAGMA_UNROLL
+          for (int j = 0; j < pack; ++j) {
+            stageArr[j] = stageArr[j] *  scales[i*pack + j];
+          }
+          *pDstArr = Converter2::convert(stageArr);
+        }
+      }
+    } 
+    else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+      static_assert(is_same_v<ElementScale, ElementZero>, "ElementScale and ElementZero must be the same.");
+      auto const& scales = cute::get<1>(partitioned_extra_info)(_, _, k_block);
+      auto const& zeros = cute::get<3>(partitioned_extra_info)(_, _, k_block);
+      CUTE_STATIC_ASSERT_V(size(src) == size(scales));
+      CUTE_STATIC_ASSERT_V(size(src) == size(zeros));
+      // Same two paths as the scale-only mode, with the zero term added after the multiplication by the scale.
+      if constexpr (is_same_v<DstType, ElementScale>) {
+        constexpr int pack = decltype(select_packing<SrcType, DstType, num_elements>::value())::value;
+        using Converter = cutlass::NumericArrayConverter<DstType, SrcType, pack, cutlass::FloatRoundStyle::round_to_nearest>;
+        using SrcArray = cutlass::Array<SrcType, pack>;
+        using DstArray = cutlass::Array<DstType, pack>;
+        constexpr int iters = num_elements / pack;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < iters; ++i) {
+          SrcArray const* pSrcArr = reinterpret_cast<SrcArray const*>(pSrc) + i;
+          DstArray* pDstArr = reinterpret_cast<DstArray*>(pDst) + i;
+          *pDstArr = Converter::convert(*pSrcArr);
+          CUTLASS_PRAGMA_UNROLL
+          for (int j = 0; j < pack; ++j) {
+            (*pDstArr)[j] = (*pDstArr)[j] * scales[i*pack + j] + zeros[i*pack + j];
+          }
+        }
+      }
+      else {
+        constexpr int pack1 = decltype(select_packing<SrcType, ElementScale, num_elements>::value())::value;
+        constexpr int pack2 = decltype(select_packing<ElementScale, DstType, num_elements>::value())::value;
+        constexpr int pack = cute::gcd(pack1, pack2);
+        using Converter1 = cutlass::NumericArrayConverter<ElementScale, SrcType, pack, cutlass::FloatRoundStyle::round_to_nearest>;
+        using Converter2 = cutlass::NumericArrayConverter<DstType, ElementScale, pack, cutlass::FloatRoundStyle::round_to_nearest>;
+        using SrcArray = cutlass::Array<SrcType, pack>;
+        using DstArray = cutlass::Array<DstType, pack>;
+        using StageArray = cutlass::Array<ElementScale, pack>;
+        constexpr int iters = num_elements / pack;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < iters; ++i) {
+          SrcArray const* pSrcArr = reinterpret_cast<SrcArray const*>(pSrc) + i;
+          DstArray* pDstArr = reinterpret_cast<DstArray*>(pDst) + i;
+          StageArray stageArr;
+          stageArr = Converter1::convert(*pSrcArr);
+          CUTLASS_PRAGMA_UNROLL
+          for (int j = 0; j < pack; ++j) {
+            stageArr[j] = stageArr[j] *  scales[i*pack + j] + zeros[i*pack + j];
+          }
+          *pDstArr = Converter2::convert(stageArr);
+        }
+      }
+      return;  // nothing follows the if-constexpr chain, so this matches falling off the end
+    }
+    else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>, "No A data is loaded.");
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::gemm::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss.hpp
new file mode 100755
index 000000000..daaed6210
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss.hpp
@@ -0,0 +1,539 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/numeric_types.h"
+#include "cutlass/pipeline/pipeline.hpp"
+#include "cutlass/trace.h"
+
+#include "cute/arch/cluster_sm90.hpp"
+#include "cute/arch/copy_sm90.hpp"
+#include "cute/algorithm/functional.hpp"
+#include "cute/atom/mma_atom.hpp"
+#include "cute/algorithm/gemm.hpp"
+#include "cute/tensor_predicate.hpp"
+#include "cute/numeric/arithmetic_tuple.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective {
+using namespace cute;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  int Stages,
+  class ClusterShape,
+  int PipelineAsyncMmaStages,
+  class TileShape_,
+  class ElementA_,
+  class StrideA_,
+  class ElementB_,
+  class StrideB_,
+  class TiledMma_,
+  class GmemTiledCopyA_,
+  class SmemLayoutAtomA_,
+  class SmemCopyAtomA_,
+  class TransformA_,
+  class GmemTiledCopyB_,
+  class SmemLayoutAtomB_,
+  class SmemCopyAtomB_,
+  class TransformB_>
+struct CollectiveMma<
+    MainloopSm90TmaGmma<Stages, ClusterShape, PipelineAsyncMmaStages>,
+    TileShape_,
+    ElementA_,
+    StrideA_,
+    ElementB_,
+    StrideB_,
+    TiledMma_,
+    GmemTiledCopyA_,
+    SmemLayoutAtomA_,
+    SmemCopyAtomA_,
+    TransformA_,
+    GmemTiledCopyB_,
+    SmemLayoutAtomB_,
+    SmemCopyAtomB_,
+    TransformB_>
+{
+  //
+  // Type Aliases
+  //
+  using DispatchPolicy = MainloopSm90TmaGmma<Stages, ClusterShape, PipelineAsyncMmaStages>;
+  using TileShape = TileShape_;
+  using ElementA = ElementA_;
+  using StrideA = StrideA_;
+  using ElementB = ElementB_;
+  using StrideB = StrideB_;
+  using TiledMma = TiledMma_;
+  using ElementAccumulator = typename TiledMma::ValTypeC;
+  using GmemTiledCopyA = GmemTiledCopyA_;
+  using GmemTiledCopyB = GmemTiledCopyB_;
+  using SmemLayoutAtomA = SmemLayoutAtomA_;
+  using SmemLayoutAtomB = SmemLayoutAtomB_;
+  using SmemCopyAtomA = SmemCopyAtomA_;
+  using SmemCopyAtomB = SmemCopyAtomB_;
+  using TransformA = TransformA_;
+  using TransformB = TransformB_;
+  using ArchTag = typename DispatchPolicy::ArchTag;
+
+  using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{}));
+  using MainloopPipeline = cutlass::PipelineTmaAsync<DispatchPolicy::Stages>;
+
+  using PipelineParams = typename MainloopPipeline::Params;
+  using PipelineState  = typename cutlass::PipelineState<DispatchPolicy::Stages>;
+
+  static constexpr int ThreadCount = CUTE_STATIC_V(size(TiledMma{}));
+
+  static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+  static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+  // Tile along modes in a way that maximizes the TMA box size.
+  using SmemLayoutA = decltype(tile_to_shape(
+      SmemLayoutAtomA{},
+      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
+      cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideA>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
+  using SmemLayoutB = decltype(tile_to_shape(
+      SmemLayoutAtomB{},
+      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
+      cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideB>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
+
+  static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 2 or more.");  // message now states the bound that is actually enforced
+  static_assert(cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeA>::value &&
+                cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeB>::value,
+                "MMA atom must source both A and B operand from smem_desc for this mainloop.");
+  static_assert(cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>,
+      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
+  static_assert(cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>,
+      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
+
+  // TMA converts f32 input to tf32 when copying from GMEM to SMEM
+  // For all other types, cast to size equivalent uint type to avoid any rounding by TMA.
+  static constexpr bool ConvertF32toTF32A = cute::is_same_v<float, ElementA>;
+  static constexpr bool ConvertF32toTF32B = cute::is_same_v<float, ElementB>;
+  using InternalElementA = cute::conditional_t<ConvertF32toTF32A, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementA>>>;
+  using InternalElementB = cute::conditional_t<ConvertF32toTF32B, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementB>>>;
+
+  struct SharedStorage {  // shared-memory footprint of this mainloop; operator() reinterprets its shared_memory pointer as this type
+    cute::array_aligned<typename TiledMma::ValTypeA, cute::cosize_v<SmemLayoutA>> smem_A;  // A staging buffer, cosize of SmemLayoutA elements
+    cute::array_aligned<typename TiledMma::ValTypeB, cute::cosize_v<SmemLayoutB>> smem_B;  // B staging buffer, cosize of SmemLayoutB elements
+    // Storage owned by the mainloop pipeline (type defined by MainloopPipeline), kept 16-byte aligned.
+    using PipelineStorage = typename MainloopPipeline::SharedStorage;
+    alignas(16) PipelineStorage pipeline_storage;
+  };
+
+  // Host side kernel arguments; converted into Params by to_underlying_arguments
+  struct Arguments {
+    ElementA const* ptr_A;  // base pointer of A; combined with the (M,K,L) shape and dA
+    StrideA dA;             // strides of A
+    ElementB const* ptr_B;  // base pointer of B; combined with the (N,K,L) shape and dB
+    StrideB dB;             // strides of B
+    uint32_t mma_promotion_interval = 4;  // not read by to_underlying_arguments in this specialization
+  };
+
+  // Device side kernel params: the TMA copy objects for A and B produced by to_underlying_arguments
+  struct Params {
+    // Assumption: StrideA is congruent with Problem_MK
+    using TMA_A = decltype(make_tma_copy(
+        GmemTiledCopyA{},
+        make_tensor(static_cast<InternalElementA const*>(nullptr), repeat_like(StrideA{}, int32_t(0)), StrideA{}),
+        SmemLayoutA{}(_,_,0),
+        make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})),
+        size<1>(ClusterShape{})));  // mcast along N mode for this M load, if any
+    // Assumption: StrideB is congruent with Problem_NK
+    using TMA_B = decltype(make_tma_copy(
+        GmemTiledCopyB{},
+        make_tensor(static_cast<InternalElementB const*>(nullptr), repeat_like(StrideB{}, int32_t(0)), StrideB{}),
+        SmemLayoutB{}(_,_,0),
+        make_shape(shape<1>(TileShape{}), shape<2>(TileShape{})),
+        size<0>(ClusterShape{}))); // mcast along M mode for this N load, if any
+    TMA_A tma_load_a;  // TMA copy object for operand A
+    TMA_B tma_load_b;  // TMA copy object for operand B
+  };
+
+  //
+  // Methods
+  //
+
+  /// Builds the device-side Params (TMA copy objects for A and B) from the host Arguments; `workspace` is unused.
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    (void) workspace;
+
+    // Pad a rank-3 (MNK) problem shape with trailing 1s so it can always be unpacked as rank-4 (MNKL).
+    auto [M,N,K,L] = append<4>(problem_shape, 1);
+
+    // Operand A: reinterpret the pointer as the internal element type, wrap it as an (M,K,L) tensor, build the TMA copy.
+    auto gmem_ptr_A = reinterpret_cast<InternalElementA const*>(args.ptr_A);
+    Tensor gmem_A = make_tensor(gmem_ptr_A, make_layout(make_shape(M,K,L), args.dA));
+    typename Params::TMA_A tma_desc_a = make_tma_copy(
+        GmemTiledCopyA{},
+        gmem_A,
+        SmemLayoutA{}(_,_,cute::Int<0>{}),
+        make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})),
+        size<1>(ClusterShape{})); // mcast along N mode for this M load, if any
+
+    // Operand B: same steps with the (N,K,L) shape.
+    auto gmem_ptr_B = reinterpret_cast<InternalElementB const*>(args.ptr_B);
+    Tensor gmem_B = make_tensor(gmem_ptr_B, make_layout(make_shape(N,K,L), args.dB));
+    typename Params::TMA_B tma_desc_b = make_tma_copy(
+        GmemTiledCopyB{},
+        gmem_B,
+        SmemLayoutB{}(_,_,cute::Int<0>{}),
+        make_shape(shape<1>(TileShape{}), shape<2>(TileShape{})),
+        size<0>(ClusterShape{})); // mcast along M mode for this N load, if any
+
+    return {tma_desc_a, tma_desc_b};
+  }
+
+  /// Host-side check: runs cutlass::detail::check_alignment on the (M,K,L) and (N,K,L) shapes using a 128-bit alignment unit.
+  template<class ProblemShape>
+  static bool
+  can_implement(ProblemShape const& problem_shape, [[maybe_unused]] Arguments const& args) {
+    constexpr int tma_alignment_bits = 128;
+    // Number of elements of each operand type that fit in tma_alignment_bits.
+    constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
+    constexpr int min_tma_aligned_elements_B = tma_alignment_bits / cutlass::sizeof_bits<ElementB>::value;
+    auto [M,N,K,L] = append<4>(problem_shape, 1);
+
+    // B is only checked when A already passed (short-circuit evaluation).
+    bool const a_aligned = cutlass::detail::check_alignment<min_tma_aligned_elements_A>(cute::make_shape(M,K,L), StrideA{});
+    bool const implementable = a_aligned && cutlass::detail::check_alignment<min_tma_aligned_elements_B>(cute::make_shape(N,K,L), StrideB{});
+
+    if (implementable) {
+      return true;
+    }
+    CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
+    return false;
+  }
+
+  /// Prefetches the TMA descriptors of the A and B loads held in mainloop_params -- ideally from a single thread for best performance
+  CUTLASS_DEVICE
+  static void prefetch_tma_descriptors(Params const& mainloop_params) {
+    cute::prefetch_tma_descriptor(mainloop_params.tma_load_a.get_tma_descriptor());
+    cute::prefetch_tma_descriptor(mainloop_params.tma_load_b.get_tma_descriptor());
+  }
+
+  /// Perform a collective-scoped matrix multiply-accumulate
+  /// Every thread consumes (issues GMMAs); one elected lane of warp 0 additionally produces (issues the TMA loads).
+  template <
+    class TensorA, class TMA_LOAD_A,
+    class TensorB, class TMA_LOAD_B,
+    class FrgTensorC,
+    class KTileIterator
+  >
+  CUTLASS_DEVICE void
+  operator() (
+      TensorA const& gA, TMA_LOAD_A& tma_load_a,
+      TensorB const& gB, TMA_LOAD_B& tma_load_b,
+      FrgTensorC& accum,
+      KTileIterator k_tile_iter, int k_tile_count,
+      int thread_idx,
+      uint32_t block_rank_in_cluster,
+      char* shared_memory,
+      Params const& mainloop_params)
+  {
+    using namespace cute;
+
+    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
+    static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2.");
+    static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2.");
+    static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3.");
+    static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3.");
+    static_assert(cute::is_void_v<SmemCopyAtomA>,
+      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
+    static_assert(cute::is_void_v<SmemCopyAtomB>,
+      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
+    // View the raw shared-memory pointer as SharedStorage and wrap the A/B buffers as staged smem tensors.
+    SharedStorage& storage = *reinterpret_cast<SharedStorage*>(shared_memory);
+    Tensor sA = make_tensor(make_smem_ptr(storage.smem_A.data()), SmemLayoutA{});                 // (BLK_M,BLK_K,PIPE)
+    Tensor sB = make_tensor(make_smem_ptr(storage.smem_B.data()), SmemLayoutB{});                 // (BLK_N,BLK_K,PIPE)
+
+    //
+    // Prepare the TMA loads for A and B
+    //
+
+    constexpr uint32_t cluster_shape_x = get<0>(ClusterShape());
+    uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x};  // (x, y) of this block in the cluster
+
+    auto block_tma_a = tma_load_a.get_slice(cluster_local_block_id.y);
+    auto block_tma_b = tma_load_b.get_slice(cluster_local_block_id.x);
+
+    // Applies the mapping from block_tma_a
+    Tensor tAgA = block_tma_a.partition_S(gA);                                                // (TMA,TMA_M,TMA_K,k)
+    Tensor tAsA = block_tma_a.partition_D(sA);                                                // (TMA,TMA_M,TMA_K,PIPE)
+
+    Tensor tBgB = block_tma_b.partition_S(gB);                                                // (TMA,TMA_N,TMA_K,k)
+    Tensor tBsB = block_tma_b.partition_D(sB);                                                // (TMA,TMA_N,TMA_K,PIPE)
+
+    //
+    // Prepare TMA membars and PREFETCH
+    //
+
+    // Number of pipelined k-tiles in smem
+    constexpr int K_PIPE_MAX = DispatchPolicy::Stages;
+
+    // NOTE: Another parameter: Partition the pipeline between active MMAs and active TMAs
+    // Tunable via the dispatch policy to tolerate latencies evenly across the math and compute stages
+    // K_PIPE_MMAS: The max number of active MMA pipes at beginning of every loop
+    // K_PIPE_TMAS: The max number of active TMA pipes at beginning of every loop (geq 1)
+    constexpr int K_PIPE_MMAS = DispatchPolicy::PipelineAsyncMmaStages;
+    constexpr int K_PIPE_TMAS = K_PIPE_MAX - K_PIPE_MMAS;
+    static_assert(0 <= K_PIPE_MMAS && K_PIPE_MMAS <  K_PIPE_MAX);
+    static_assert(0 <  K_PIPE_TMAS && K_PIPE_TMAS <= K_PIPE_MAX);
+
+    static_assert(K_PIPE_MMAS < K_PIPE_MAX - 1);  // equivalently K_PIPE_TMAS >= 2
+
+    // Set the bytes transferred in this TMA transaction (may involve multiple issues)
+    constexpr uint32_t TmaTransactionBytes = static_cast<uint32_t>(
+        cutlass::bits_to_bytes(size<0>(sA) * size<1>(sA) * sizeof_bits<InternalElementA>::value) +
+        cutlass::bits_to_bytes(size<0>(sB) * size<1>(sB) * sizeof_bits<InternalElementB>::value));
+
+    // Obtain warp index
+    int warp_idx = canonical_warp_idx_sync();
+    int warp_group_thread_idx = thread_idx % NumThreadsPerWarpGroup;
+
+    PipelineParams params;
+    params.transaction_bytes = TmaTransactionBytes;
+    params.role = MainloopPipeline::ThreadCategory::ProducerConsumer;
+    params.is_leader = warp_group_thread_idx == 0;
+    params.num_consumers = NumThreadsPerWarpGroup;
+
+    MainloopPipeline pipeline(storage.pipeline_storage, params, ClusterShape{});
+
+    // State variables used for iterating the circular buffer
+    // smem_pipe_read / release is used by the consumer of SMEM data - i.e MMA
+    // smem_pipe_write is used by the producer of SMEM data - i.e TMA
+    PipelineState smem_pipe_read;
+    PipelineState smem_pipe_release;
+    PipelineState smem_pipe_write = cutlass::make_producer_start_state<MainloopPipeline>();
+
+    // We need this to guarantee that the Pipeline init is visible
+    // To all producers and consumer blocks in the Cluster
+    if constexpr (size(ClusterShape{}) > 1) {
+      cute::cluster_arrive_relaxed();
+      cute::cluster_wait();
+    }
+    else {
+      __syncthreads();
+    }
+
+    // Set predicate for the lowest lane_id in the warp
+    int lane_predicate = cute::elect_one_sync();
+
+    uint16_t mcast_mask_a = 0;
+    uint16_t mcast_mask_b = 0;
+    // Keep a copy to know when to stop issuing loads
+    int k_tile_count_tma = k_tile_count;
+
+    // Issue TmaLoads (Prologue fetches)
+    if (warp_idx == 0 && lane_predicate == 1) {
+      // Maps the tile -> block, value
+      if constexpr (cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>) {
+        auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{}; // (m,n) -> block_id
+        for (int n = 0; n < size<1>(block_layout); ++n) {
+          mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{}));
+        }
+      }
+
+      if constexpr (cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>) {
+        auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{}; // (m,n) -> block_id
+        for (int m = 0; m < size<0>(block_layout); ++m) {
+          mcast_mask_b |= (uint16_t(1) << block_layout(m,cluster_local_block_id.y,Int<0>{}));
+        }
+      }
+
+      // Issue the prologue loads
+      int prologue_tma_count = min(K_PIPE_MAX, k_tile_count);
+      CUTLASS_PRAGMA_UNROLL
+      for (int stage = 0; stage < prologue_tma_count; ++stage) {
+        pipeline.producer_acquire(smem_pipe_write);
+        using BarrierType = typename MainloopPipeline::ProducerBarrierType;
+        BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write);
+
+        copy(tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,stage));
+        copy(tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,stage));
+        ++k_tile_iter;
+        ++smem_pipe_write;
+      }
+      k_tile_count_tma -= prologue_tma_count;
+    }
+
+    //
+    // Define C accumulators and A/B partitioning
+    //
+
+    // Layout of warp group to thread mapping
+
+    static_assert(stride<0>(typename TiledMma::ALayout{}) == 0 and 
+                  stride<0>(typename TiledMma::BLayout{}) == 0 and
+                  size<0>(typename TiledMma::ALayout{}) == NumThreadsPerWarpGroup and
+                  size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup, 
+                  "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
+
+    constexpr int MmaWarpGroups = size(TiledMma{}) / NumThreadsPerWarpGroup;
+    Layout warp_group_thread_layout = make_layout(Int<MmaWarpGroups>{}, 
+                                                  Int<NumThreadsPerWarpGroup>{});
+
+    int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / NumThreadsPerWarpGroup, 0);
+
+    TiledMma tiled_mma;
+    auto thread_mma = tiled_mma.get_slice(warp_group_thread_layout(warp_group_idx));
+
+    Tensor tCsA = thread_mma.partition_A(sA);                                  // (MMA,MMA_M,MMA_K,PIPE)
+    Tensor tCsB = thread_mma.partition_B(sB);                                  // (MMA,MMA_N,MMA_K,PIPE)
+
+    // Allocate "fragments/descriptors"
+    Tensor tCrA = thread_mma.make_fragment_A(tCsA);                            // (MMA,MMA_M,MMA_K,PIPE)
+    Tensor tCrB = thread_mma.make_fragment_B(tCsB);                            // (MMA,MMA_N,MMA_K,PIPE)
+
+    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum));                     // M
+    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum));                     // N
+    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB));                      // K
+    CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB));                      // PIPE
+    CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tAsA));                      // PIPE
+    CUTE_STATIC_ASSERT_V(size<3>(tCsB) == size<3>(tBsB));                      // PIPE
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));        // PIPE
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));        // PIPE
+
+    __syncthreads();
+
+    tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;  // switched to ScaleOut::One after the first k_block of the prologue below
+
+    warpgroup_fence_operand(accum);
+    // Prologue MMAs
+    assert(k_tile_count >= 1);
+    {
+      // WAIT on smem_pipe_read until its data is available
+      pipeline.consumer_wait(smem_pipe_read);
+      warpgroup_arrive();
+      // Unroll the K mode manually to set scale D to 1
+      CUTLASS_PRAGMA_UNROLL
+      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
+        // (V,M,K) x (V,N,K) => (V,M,N)
+        cute::gemm(tiled_mma, tCrA(_,_,k_block,smem_pipe_read.index()), tCrB(_,_,k_block,smem_pipe_read.index()), accum);
+        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+      }
+
+      warpgroup_commit_batch();
+      ++smem_pipe_read;
+      --k_tile_count;
+    }
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count) - 1; 
+        prologue_mma_count > 0; --prologue_mma_count)
+    {
+      // WAIT on smem_pipe_read until its data is available
+      pipeline.consumer_wait(smem_pipe_read);
+      warpgroup_arrive();
+      // (V,M,K) x (V,N,K) => (V,M,N)
+      cute::gemm(tiled_mma, tCrA(_,_,_,smem_pipe_read.index()), tCrB(_,_,_,smem_pipe_read.index()), accum);
+      warpgroup_commit_batch();
+      ++smem_pipe_read;
+      --k_tile_count;
+    }
+    warpgroup_fence_operand(accum);
+
+    //
+    // PIPELINED MAIN LOOP
+    //
+
+    CUTLASS_PRAGMA_NO_UNROLL
+    for ( ; k_tile_count > 0; --k_tile_count)
+    {
+      // WAIT on smem_pipe_read until data is available
+      pipeline.consumer_wait(smem_pipe_read);
+
+      //
+      // Compute on k_tile
+      //
+
+      warpgroup_fence_operand(accum);
+      warpgroup_arrive();
+      // (V,M,K) x (V,N,K) => (V,M,N)
+      cute::gemm(tiled_mma, tCrA(_,_,_,smem_pipe_read.index()), tCrB(_,_,_,smem_pipe_read.index()), accum);
+      warpgroup_commit_batch();
+
+      /// Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure smem_pipe_write is consumed
+      warpgroup_wait<K_PIPE_MMAS>();
+      warpgroup_fence_operand(accum);
+
+      pipeline.consumer_release(smem_pipe_release);  // UNLOCK wr stage, done _computing_ on it
+
+      //
+      // Copy gmem to smem for *k_tile_iter
+      //
+
+      // Do Acquire & Load only if needed - helps with both performance and also corner case illegal barrier-ops
+      if (warp_idx == 0 && lane_predicate == 1 && (k_tile_count_tma > 0) ) {
+        pipeline.producer_acquire(smem_pipe_write);  // LOCK wr stage, for _writing_
+
+        using BarrierType = typename MainloopPipeline::ProducerBarrierType;
+        BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write);
+
+        copy(tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,smem_pipe_write.index()));
+        copy(tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,smem_pipe_write.index()));
+        ++smem_pipe_write;
+        ++k_tile_iter;
+        --k_tile_count_tma;
+      }
+
+      // Advance consumer pipeline
+      ++smem_pipe_read;
+      ++smem_pipe_release;
+    }
+
+    // Wait on all GMMAs
+    warpgroup_wait<0>();
+    warpgroup_fence_operand(accum);
+
+    // Workaround for ensuring Smem destruction doesn't happen accidentally
+    if constexpr (size(typename DispatchPolicy::ClusterShape{}) > 1) {
+      cute::cluster_arrive();
+      cute::cluster_wait();
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::gemm::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp
new file mode 100755
index 000000000..b370dc70b
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp
@@ -0,0 +1,582 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/numeric_types.h"
+#include "cutlass/pipeline/pipeline.hpp"
+#include "cutlass/trace.h"
+
+#include "cute/arch/cluster_sm90.hpp"
+#include "cute/arch/copy_sm90.hpp"
+#include "cute/algorithm/functional.hpp"
+#include "cute/atom/mma_atom.hpp"
+#include "cute/algorithm/gemm.hpp"
+#include "cute/tensor_predicate.hpp"
+#include "cute/numeric/arithmetic_tuple.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective {
+using namespace cute;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// WarpSpecialized Mainloop
+template <
+  int Stages,
+  class ClusterShape,
+  class KernelSchedule,
+  class TileShape_,
+  class ElementA_,
+  class StrideA_,
+  class ElementB_,
+  class StrideB_,
+  class TiledMma_,
+  class GmemTiledCopyA_,
+  class SmemLayoutAtomA_,
+  class SmemCopyAtomA_,
+  class TransformA_,
+  class GmemTiledCopyB_,
+  class SmemLayoutAtomB_,
+  class SmemCopyAtomB_,
+  class TransformB_>
+struct CollectiveMma<
+    MainloopSm90TmaGmmaWarpSpecialized<Stages, ClusterShape, KernelSchedule>,
+    TileShape_,
+    ElementA_,
+    StrideA_,
+    ElementB_,
+    StrideB_,
+    TiledMma_,
+    GmemTiledCopyA_,
+    SmemLayoutAtomA_,
+    SmemCopyAtomA_,
+    TransformA_,
+    GmemTiledCopyB_,
+    SmemLayoutAtomB_,
+    SmemCopyAtomB_,
+    TransformB_>
+{
+  //
+  // Type Aliases
+  //
+  using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecialized<Stages, ClusterShape, KernelSchedule>;
+  using TileShape = TileShape_;
+  using ElementA = ElementA_;
+  using StrideA = StrideA_;
+  using ElementB = ElementB_;
+  using StrideB = StrideB_;
+  using TiledMma = TiledMma_;
+  using ElementAccumulator = typename TiledMma::ValTypeC;
+  using GmemTiledCopyA = GmemTiledCopyA_;
+  using GmemTiledCopyB = GmemTiledCopyB_;
+  using SmemLayoutAtomA = SmemLayoutAtomA_;
+  using SmemLayoutAtomB = SmemLayoutAtomB_;
+  using SmemCopyAtomA = SmemCopyAtomA_;
+  using SmemCopyAtomB = SmemCopyAtomB_;
+  using TransformA = TransformA_;
+  using TransformB = TransformB_;
+  using ArchTag = typename DispatchPolicy::ArchTag;
+
+  using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{}));
+  using MainloopPipeline = cutlass::PipelineTmaAsync<DispatchPolicy::Stages>;
+  using PipelineState = cutlass::PipelineState<DispatchPolicy::Stages>;
+
+  using PipelineParams = typename MainloopPipeline::Params;
+
+  static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+  static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+  // Tile along modes in a way that maximizes the TMA box size.
+  using SmemLayoutA = decltype(tile_to_shape(
+      SmemLayoutAtomA{},
+      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
+      cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideA>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
+  using SmemLayoutB = decltype(tile_to_shape(
+      SmemLayoutAtomB{},
+      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
+      cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideB>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
+
+  static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 2 or more.");
+  static_assert(cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeA>::value &&
+                cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeB>::value,
+                "MMA atom must source both A and B operand from smem_desc for this mainloop.");
+  static_assert(cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>,
+      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
+  static_assert(cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>,
+      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
+
+  // TMA converts f32 input to tf32 when copying from GMEM to SMEM
+  // For all other types, cast to size equivalent uint type to avoid any rounding by TMA.
+  static constexpr bool ConvertF32toTF32A = cute::is_same_v<float, ElementA>;
+  static constexpr bool ConvertF32toTF32B = cute::is_same_v<float, ElementB>;
+  using InternalElementA = cute::conditional_t<ConvertF32toTF32A, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementA>>>;
+  using InternalElementB = cute::conditional_t<ConvertF32toTF32B, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementB>>>;
+
+  struct SharedStorage  // Storage needed by this collective: operand staging buffers plus the pipeline's own storage.
+  {
+    struct TensorStorage : cute::aligned_struct<128, _0> {
+      cute::array_aligned<typename TiledMma::ValTypeA, cute::cosize_v<SmemLayoutA>> smem_A;  // sized by SmemLayoutA, whose last mode is the stage count
+      cute::array_aligned<typename TiledMma::ValTypeB, cute::cosize_v<SmemLayoutB>> smem_B;  // sized by SmemLayoutB, whose last mode is the stage count
+    } tensors;
+    // The pipeline's storage type is dictated by MainloopPipeline (PipelineTmaAsync).
+    using PipelineStorage = typename MainloopPipeline::SharedStorage;
+    PipelineStorage pipeline;
+  };
+  using TensorStorage = typename SharedStorage::TensorStorage;      // re-exported; load() and mma() take a TensorStorage&
+  using PipelineStorage = typename SharedStorage::PipelineStorage;  // re-exported at collective scope
+
+  // Host side kernel arguments; converted to device-side Params by to_underlying_arguments().
+  struct Arguments {
+    ElementA const* ptr_A;                 // base pointer of operand A
+    StrideA dA;                            // paired with an (M,K,L) shape in to_underlying_arguments()
+    ElementB const* ptr_B;                 // base pointer of operand B
+    StrideB dB;                            // paired with an (N,K,L) shape in to_underlying_arguments()
+    uint32_t mma_promotion_interval = 4;   // not read by to_underlying_arguments() in this specialization
+  };
+
+  // Device side kernel params; built on the host by to_underlying_arguments() via positional brace-init.
+  struct Params {
+    // Assumption: StrideA is congruent with Problem_MK. Only the resulting type is needed (decltype).
+    using TMA_A = decltype(make_tma_copy_A_sm90(
+        GmemTiledCopyA{},
+        make_tensor(static_cast<InternalElementA const*>(nullptr), repeat_like(StrideA{}, int32_t(0)), StrideA{}),  // placeholder: null pointer, all-zero shape
+        SmemLayoutA{}(_,_,cute::Int<0>{}),  // smem layout of a single pipeline stage (index 0)
+        TileShape{},
+        ClusterShape{}));
+    // Assumption: StrideB is congruent with Problem_NK. Only the resulting type is needed (decltype).
+    using TMA_B = decltype(make_tma_copy_B_sm90(
+        GmemTiledCopyB{},
+        make_tensor(static_cast<InternalElementB const*>(nullptr), repeat_like(StrideB{}, int32_t(0)), StrideB{}),  // placeholder: null pointer, all-zero shape
+        SmemLayoutB{}(_,_,cute::Int<0>{}),  // smem layout of a single pipeline stage (index 0)
+        TileShape{},
+        ClusterShape{}));
+    TMA_A tma_load_a;                                           // TMA copy object for operand A
+    TMA_B tma_load_b;                                           // TMA copy object for operand B
+    uint32_t tma_transaction_bytes = TmaTransactionBytes;       // A + B bytes (sum of the two below)
+    uint32_t tma_transaction_bytes_mk = TmaTransactionBytesMK;  // A contribution only
+    uint32_t tma_transaction_bytes_nk = TmaTransactionBytesNK;  // B contribution only
+  };
+
+  //
+  // Methods
+  //
+
+  /// Builds the device-side Params (TMA copy objects plus transaction byte counts) from host-side Arguments.
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    (void) workspace;  // unused by this collective
+
+    // A rank-3 (MNK) problem shape is extended with L = 1 so it can be unpacked as rank-4 (MNKL).
+    auto problem_shape_MNKL = append<4>(problem_shape, 1);
+    auto [M,N,K,L] = problem_shape_MNKL;
+
+    // View the operand pointers through the internal element types and wrap them as (M,K,L) / (N,K,L) tensors.
+    Tensor gmem_a = make_tensor(
+        reinterpret_cast<InternalElementA const*>(args.ptr_A),
+        make_layout(make_shape(M,K,L), args.dA));
+    Tensor gmem_b = make_tensor(
+        reinterpret_cast<InternalElementB const*>(args.ptr_B),
+        make_layout(make_shape(N,K,L), args.dB));
+
+    // Each TMA copy is described against a single pipeline stage (index 0) of the smem layout.
+    typename Params::TMA_A tma_a = make_tma_copy_A_sm90(
+        GmemTiledCopyA{},
+        gmem_a,
+        SmemLayoutA{}(_,_,cute::Int<0>{}),
+        TileShape{},
+        ClusterShape{});
+    typename Params::TMA_B tma_b = make_tma_copy_B_sm90(
+        GmemTiledCopyB{},
+        gmem_b,
+        SmemLayoutB{}(_,_,cute::Int<0>{}),
+        TileShape{},
+        ClusterShape{});
+
+    // Positional init in Params field order: tma_load_a, tma_load_b, total / MK (A) / NK (B) transaction bytes.
+    return {
+      tma_a,
+      tma_b,
+      TmaTransactionBytes, TmaTransactionBytesMK, TmaTransactionBytesNK
+    };
+  }
+
+  /// Host-side check that the problem shape passes the alignment test used for TMA; traces a message when it does not.
+  template<class ProblemShape>
+  static bool
+  can_implement(
+      ProblemShape const& problem_shape,
+      [[maybe_unused]] Arguments const& args) {
+    constexpr int tma_alignment_bits = 128;  // converted below to an element count per operand type
+    constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
+    constexpr int min_tma_aligned_elements_B = tma_alignment_bits / cutlass::sizeof_bits<ElementB>::value;
+
+    // A rank-3 (MNK) problem shape is extended with L = 1 so it can be unpacked as rank-4 (MNKL).
+    auto problem_shape_MNKL = append<4>(problem_shape, 1);
+    auto [M,N,K,L] = problem_shape_MNKL;
+    bool const a_is_aligned = cutlass::detail::check_alignment<min_tma_aligned_elements_A>(cute::make_shape(M,K,L), StrideA{});
+    bool const implementable = a_is_aligned && cutlass::detail::check_alignment<min_tma_aligned_elements_B>(cute::make_shape(N,K,L), StrideB{});  // B is examined only once A has passed
+    if (!implementable) {
+      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
+    }
+    return implementable;
+  }
+
+  static constexpr int K_PIPE_MAX = DispatchPolicy::Stages;  // number of smem pipeline stages
+  static constexpr int K_PIPE_MMAS = 1;                      // stages mma() leaves unreleased for mma_tail(); also the warpgroup_wait<> depth in mma()
+  static constexpr uint32_t TmaTransactionBytesMK =          // bytes of one stage of A: modes 0 and 1 of SmemLayoutA times the element width
+        cutlass::bits_to_bytes(size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast<uint32_t>(sizeof_bits<ElementA>::value));
+  static constexpr uint32_t TmaTransactionBytesNK =          // bytes of one stage of B: modes 0 and 1 of SmemLayoutB times the element width
+        cutlass::bits_to_bytes(size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast<uint32_t>(sizeof_bits<ElementB>::value));
+  static constexpr uint32_t TmaTransactionBytes = TmaTransactionBytesMK + TmaTransactionBytesNK;  // A + B for one stage
+
+  /// Prefetches the TMA descriptors of both operand loads (A, then B) -- ideally called from a single thread for best performance.
+  CUTLASS_DEVICE
+  static void prefetch_tma_descriptors(Params const& mainloop_params) {
+    cute::prefetch_tma_descriptor(mainloop_params.tma_load_a.get_tma_descriptor());  // descriptor used for operand A loads
+    cute::prefetch_tma_descriptor(mainloop_params.tma_load_b.get_tma_descriptor());  // descriptor used for operand B loads
+  }
+
+  /// Set up the data needed by this collective for load and mma.
+  /// Contract between the collective and the kernel layer: the returned tuple must contain at least
+  /// two elements, and the first two must be:
+  /// gA_mkl - The tma tensor, A after a local tile so it has shape  (BLK_M,BLK_K,m,k,l)
+  /// gB_nkl - The tma tensor, B after a local tile so it has shape  (BLK_N,BLK_K,n,k,l)
+  /// This specialization returns exactly those two tensors.
+  template <class ProblemShape_MNKL>
+  CUTLASS_DEVICE auto
+  load_init(ProblemShape_MNKL const& problem_shape_MNKL, Params const& mainloop_params) const {
+    using X = Underscore;
+    auto [M,N,K,L] = problem_shape_MNKL;
+
+    // The full-problem tensors are obtained from the TMA objects themselves, because TMA needs
+    // special handling of strides to deal with coord codomain mapping.
+    auto const& tma_a = mainloop_params.tma_load_a;
+    auto const& tma_b = mainloop_params.tma_load_b;
+    Tensor full_a = tma_a.get_tma_tensor(make_shape(M,K,L));                                   // (m,k,l)
+    Tensor full_b = tma_b.get_tma_tensor(make_shape(N,K,L));                                   // (n,k,l)
+
+    // Tile with TileShape but leave every tile coordinate open; load() does the slicing later.
+    return cute::make_tuple(
+        local_tile(full_a, TileShape{}, make_coord(_,_,_), Step<_1, X,_1>{}),                  // (BLK_M,BLK_K,m,k,l)
+        local_tile(full_b, TileShape{}, make_coord(_,_,_), Step< X,_1,_1>{}));                 // (BLK_N,BLK_K,n,k,l)
+  }
+
+  /// Producer side of the collective mainloop: issues the TMA loads that fill the smem pipeline stages.
+  /// Only one elected lane of the calling warp does any work; the whole body sits under that predicate.
+  template <
+    class TensorA, class TensorB,
+    class KTileIterator, class BlockCoord
+  >
+  CUTLASS_DEVICE void
+  load(
+      Params const& mainloop_params,                      // source of the TMA copy objects tma_load_a / tma_load_b
+      MainloopPipeline pipeline,
+      PipelineState smem_pipe_write,                      // first stage to fill; advanced once per loaded k-tile
+      cute::tuple<TensorA, TensorB> const& load_inputs,   // element 0 is used as gA_mkl, element 1 as gB_nkl
+      BlockCoord const& blk_coord,                        // unpacked as (m, n, k, l); k_coord is not used below
+      KTileIterator k_tile_iter, int k_tile_count,        // k-tile indices to load, and how many of them
+      int thread_idx,                                     // not referenced in this function body
+      uint32_t block_rank_in_cluster,
+      TensorStorage& shared_tensors) {
+    int lane_predicate = cute::elect_one_sync();
+
+    if (lane_predicate) {
+      Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});        // (BLK_M,BLK_K,PIPE)
+      Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});        // (BLK_N,BLK_K,PIPE)
+
+      //
+      // Prepare the TMA loads for A and B
+      //
+
+      constexpr uint32_t cluster_shape_x = get<0>(typename DispatchPolicy::ClusterShape());
+      uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x};
+
+      Tensor gA_mkl = get<0>(load_inputs);
+      Tensor gB_nkl = get<1>(load_inputs);
+      // A's TMA is sliced by the cluster-local y and B's by x; the multicast masks below use the opposite coordinate.
+      auto block_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y);
+      auto block_tma_b = mainloop_params.tma_load_b.get_slice(cluster_local_block_id.x);
+
+      // Partition the inputs based on the current block coordinates.
+      auto [m_coord, n_coord, k_coord, l_coord] = blk_coord;
+      Tensor gA = gA_mkl(_,_,m_coord,_,l_coord);                                                     // (BLK_M,BLK_K,k)
+      Tensor gB = gB_nkl(_,_,n_coord,_,l_coord);                                                     // (BLK_N,BLK_K,k)
+
+      // Applies the mapping from block_tma_a
+      Tensor tAgA = block_tma_a.partition_S(gA);                                                 // (TMA,TMA_M,TMA_K,k)
+      Tensor tAsA = block_tma_a.partition_D(sA);                                              // (TMA,TMA_M,TMA_K,PIPE)
+      // Applies the mapping from block_tma_b
+      Tensor tBgB = block_tma_b.partition_S(gB);                                                 // (TMA,TMA_N,TMA_K,k)
+      Tensor tBsB = block_tma_b.partition_D(sB);                                              // (TMA,TMA_N,TMA_K,PIPE)
+
+      uint16_t mcast_mask_a = 0;  // stays 0 unless the A copy atom is SM90_TMA_LOAD_MULTICAST
+      uint16_t mcast_mask_b = 0;  // stays 0 unless the B copy atom is SM90_TMA_LOAD_MULTICAST
+
+      // Build the multicast masks; each is populated only for the multicast variant of its copy atom.
+      // mcast_mask_a: one bit per block id that shares this block's cluster-local x, over every n.
+      if constexpr (cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>) {
+        auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{}; // (m,n) -> block_id
+        for (int n = 0; n < size<1>(block_layout); ++n) {
+          mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{}));
+        }
+      }
+      // mcast_mask_b: one bit per block id that shares this block's cluster-local y, over every m.
+      if constexpr (cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>) {
+        auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{}; // (m,n) -> block_id
+        for (int m = 0; m < size<0>(block_layout); ++m) {
+          mcast_mask_b |= (uint16_t(1) << block_layout(m,cluster_local_block_id.y,Int<0>{}));
+        }
+      }
+
+      // Mainloop: per k-tile, one producer_acquire followed by one A copy and one B copy
+      CUTLASS_PRAGMA_NO_UNROLL
+      for ( ; k_tile_count > 0; --k_tile_count) {
+        // LOCK smem_pipe_write for _writing_
+        pipeline.producer_acquire(smem_pipe_write);
+
+        //
+        // Copy gmem to smem for *k_tile_iter
+        //
+
+        using BarrierType = typename MainloopPipeline::ProducerBarrierType;
+        BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write);
+        // Both copies target the same stage and are issued with the same barrier.
+        int write_stage = smem_pipe_write.index();
+        copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage));
+        copy(mainloop_params.tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage));
+        ++k_tile_iter;
+
+        // Advance smem_pipe_write
+        ++smem_pipe_write;
+      }
+    }
+  }
+
+  /// Producer epilogue that prevents early exit of blocks in a Cluster.
+  /// Issued by one elected lane of the calling warp; all other lanes return immediately.
+  CUTLASS_DEVICE void
+  load_tail(MainloopPipeline pipeline, PipelineState smem_pipe_write) {
+    bool const is_elected_lane = (cute::elect_one_sync() != 0);
+    if (!is_elected_lane) {
+      return;
+    }
+
+    // Issue the epilogue waits. Per the upstream note, this waits for every stage
+    // to either be released (all Consumer UNLOCKs) or, if the stage was never
+    // used, to simply be acquired, since its phase is still inverted from
+    // make_producer_start_state. That is what keeps blocks in the Cluster
+    // from exiting early.
+    pipeline.producer_tail(smem_pipe_write);
+  }
+
+  /// Consumer side of the collective mainloop: waits for filled smem stages and accumulates their GMMA products into accum.
+  /// The first K_PIPE_MMAS stage(s) consumed are never released here, so that many stay locked on return; mma_tail() releases them.
+  template <
+    class FrgTensorC
+  >
+  CUTLASS_DEVICE void
+  mma(MainloopPipeline pipeline,
+      PipelineState smem_pipe_read,          // first stage to consume; advanced once per k-tile
+      FrgTensorC& accum,                     // register-resident accumulator (enforced by the is_rmem static_assert)
+      int k_tile_count,                      // number of k-tiles to consume; asserted to be >= 1
+      int thread_idx,                        // only used to derive the warp-group index
+      TensorStorage& shared_tensors,
+      Params const& mainloop_params) {       // not referenced in this function body
+    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
+    static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3.");
+    static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3.");
+    static_assert(cute::is_void_v<SmemCopyAtomA>,
+      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
+    static_assert(cute::is_void_v<SmemCopyAtomB>,
+      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
+
+    Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});          // (BLK_M,BLK_K,PIPE)
+    Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});          // (BLK_N,BLK_K,PIPE)
+
+    //
+    // Define C accumulators and A/B partitioning
+    //
+
+    // Layout of warp group to thread mapping
+
+    static_assert(stride<0>(typename TiledMma::ALayout{}) == 0 and 
+                  stride<0>(typename TiledMma::BLayout{}) == 0 and
+                  size<0>(typename TiledMma::ALayout{}) == NumThreadsPerWarpGroup and
+                  size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup, 
+                  "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
+
+    constexpr int MmaWarpGroups = size(TiledMma{}) / NumThreadsPerWarpGroup;
+    Layout warp_group_thread_layout = make_layout(Int<MmaWarpGroups>{}, 
+                                                  Int<NumThreadsPerWarpGroup>{});
+    // Lane 0's warp-group index is broadcast to the whole warp.
+    int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / NumThreadsPerWarpGroup, 0);
+
+    TiledMma tiled_mma;
+    auto thread_mma = tiled_mma.get_slice(warp_group_thread_layout(warp_group_idx));
+
+    Tensor tCsA = thread_mma.partition_A(sA);                                                 // (MMA,MMA_M,MMA_K,PIPE)
+    Tensor tCsB = thread_mma.partition_B(sB);                                                 // (MMA,MMA_N,MMA_K,PIPE)
+
+    // Allocate "fragments/descriptors"
+    Tensor tCrA = thread_mma.make_fragment_A(tCsA);                                           // (MMA,MMA_M,MMA_K,PIPE)
+    Tensor tCrB = thread_mma.make_fragment_B(tCsB);                                           // (MMA,MMA_N,MMA_K,PIPE)
+
+    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum));                                                         // M
+    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum));                                                         // N
+    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB));                                                          // K
+    CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB));                                                       // PIPE
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));                                         // PIPE
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));                                         // PIPE
+
+    //
+    // PIPELINED MAIN LOOP
+    //
+    static_assert((0 <= K_PIPE_MMAS) && (K_PIPE_MMAS <  K_PIPE_MAX),
+        "ERROR : Incorrect number of MMAs in flight");
+
+    // We release buffers to producer warps(dma load) with some mmas in flight:
+    PipelineState smem_pipe_release = smem_pipe_read;  // trails smem_pipe_read by the number of prologue tiles
+
+    // Prologue GMMAs (consumed without releasing their stages)
+    int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
+    assert(k_tile_count >= 1);  // the first-tile block below runs unconditionally
+    tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
+    warpgroup_fence_operand(accum);
+    {
+      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
+      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+      pipeline.consumer_wait(smem_pipe_read, barrier_token);
+
+      int read_stage = smem_pipe_read.index();
+      warpgroup_arrive();
+      tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;  // same value as already set before this block
+      // Unroll the K mode manually to set scale D to 1: only the first k_block is issued with ScaleOut::Zero
+      CUTLASS_PRAGMA_UNROLL
+      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
+        // (V,M,K) x (V,N,K) => (V,M,N)
+        cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accum);
+        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+      }
+
+      warpgroup_commit_batch();
+
+      ++smem_pipe_read;
+    }
+
+    tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+    // Remaining prologue tiles. With K_PIPE_MMAS == 1 the loop below starts at (at most) 0 and never iterates.
+    warpgroup_fence_operand(accum);
+    CUTLASS_PRAGMA_UNROLL
+    for (int k_tile_prologue = prologue_mma_count - 1; k_tile_prologue > 0; --k_tile_prologue)
+    {
+      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
+      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+      pipeline.consumer_wait(smem_pipe_read, barrier_token);
+
+      int read_stage = smem_pipe_read.index();
+      warpgroup_arrive();
+      // (V,M,K) x (V,N,K) => (V,M,N)
+      cute::gemm(tiled_mma, tCrA(_,_,_,read_stage), tCrB(_,_,_,read_stage), accum);
+      warpgroup_commit_batch();
+
+      ++smem_pipe_read;
+    }
+
+    warpgroup_fence_operand(accum);
+    // Mainloop GMMAs: the prologue tiles have already been consumed
+    k_tile_count -= prologue_mma_count;
+
+    CUTLASS_PRAGMA_NO_UNROLL
+    for ( ; k_tile_count > 0; --k_tile_count)
+    {
+      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
+      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+      pipeline.consumer_wait(smem_pipe_read, barrier_token);
+
+      //
+      // Compute on k_tile
+      //
+
+      int read_stage = smem_pipe_read.index();
+      warpgroup_fence_operand(accum);
+      warpgroup_arrive();
+      // (V,M,K) x (V,N,K) => (V,M,N)
+      cute::gemm(tiled_mma, tCrA(_,_,_,read_stage), tCrB(_,_,_,read_stage), accum);
+      warpgroup_commit_batch();
+
+      /// Wait until at most K_PIPE_MMAS GMMA batches are outstanding, so the stage released below is presumably no longer being read
+      warpgroup_wait<K_PIPE_MMAS>();
+      warpgroup_fence_operand(accum);
+
+      // UNLOCK smem_pipe_release, done _computing_ on it
+      pipeline.consumer_release(smem_pipe_release);
+
+      // Advance smem_pipe_read and smem_pipe_release
+      ++smem_pipe_read;
+      ++smem_pipe_release;
+    }
+    // No warpgroup_wait<0>() here: GMMAs may still be in flight on return; mma_tail() performs the full wait.
+    warpgroup_fence_operand(accum);
+  }
+
+  /// Consumer epilogue: releases the pipeline stages that mma() consumed but did not release.
+  CUTLASS_DEVICE void
+  mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) {
+    // Number of stages consumed by the prologue GMMAs, i.e. still locked.
+    int const pending_releases = min(K_PIPE_MMAS, k_tile_count);
+    // Step past the stages that were already released while the mainloop ran.
+    smem_pipe_release.advance(k_tile_count - pending_releases);
+
+    // Wait on all GMMAs to complete before handing any remaining stage back.
+    warpgroup_wait<0>();
+
+    // UNLOCK each remaining stage in order, done _computing_ on it.
+    for (int remaining = pending_releases; remaining > 0; --remaining) {
+      pipeline.consumer_release(smem_pipe_release);
+      ++smem_pipe_release;
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::gemm::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8.hpp
new file mode 100755
index 000000000..da5274469
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8.hpp
@@ -0,0 +1,584 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/collective/fp8_accumulation.hpp"
+#include "cutlass/trace.h"
+#include "cutlass/numeric_types.h"
+
+#include "cute/arch/cluster_sm90.hpp"
+#include "cute/arch/copy_sm90.hpp"
+#include "cute/algorithm/functional.hpp"
+#include "cute/atom/mma_atom.hpp"
+#include "cute/algorithm/gemm.hpp"
+#include "cute/tensor_predicate.hpp"
+#include "cute/numeric/arithmetic_tuple.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective {
+using namespace cute;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// WarpSpecialized Mainloop
+template <
+  int Stages,
+  class ClusterShape,
+  class KernelSchedule,
+  class TileShape_,
+  class ElementA_,
+  class StrideA_,
+  class ElementB_,
+  class StrideB_,
+  class TiledMma_,
+  class GmemTiledCopyA_,
+  class SmemLayoutAtomA_,
+  class SmemCopyAtomA_,
+  class TransformA_,
+  class GmemTiledCopyB_,
+  class SmemLayoutAtomB_,
+  class SmemCopyAtomB_,
+  class TransformB_>
+struct CollectiveMma<
+    MainloopSm90TmaGmmaWarpSpecializedFP8<Stages, ClusterShape, KernelSchedule>,
+    TileShape_,
+    ElementA_,
+    StrideA_,
+    ElementB_,
+    StrideB_,
+    TiledMma_,
+    GmemTiledCopyA_,
+    SmemLayoutAtomA_,
+    SmemCopyAtomA_,
+    TransformA_,
+    GmemTiledCopyB_,
+    SmemLayoutAtomB_,
+    SmemCopyAtomB_,
+    TransformB_>
+{
+  //
+  // Type Aliases (template parameters re-exported under their public names, plus derived types)
+  //
+  using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedFP8<Stages, ClusterShape, KernelSchedule>;
+  using TileShape = TileShape_;
+  using ElementA = ElementA_;
+  using StrideA = StrideA_;
+  using ElementB = ElementB_;
+  using StrideB = StrideB_;
+  using TiledMma = TiledMma_;
+  using ElementAccumulator = typename TiledMma::ValTypeC;  // accumulator element type, taken from the MMA's C operand
+  using GmemTiledCopyA = GmemTiledCopyA_;                  // must be one of the SM90 TMA load atoms (asserted below the smem layouts)
+  using GmemTiledCopyB = GmemTiledCopyB_;
+  using SmemLayoutAtomA = SmemLayoutAtomA_;
+  using SmemLayoutAtomB = SmemLayoutAtomB_;
+  using SmemCopyAtomA = SmemCopyAtomA_;                    // must be void for this mainloop (asserted in mma())
+  using SmemCopyAtomB = SmemCopyAtomB_;                    // must be void for this mainloop (asserted in mma())
+  using TransformA = TransformA_;
+  using TransformB = TransformB_;
+  using ArchTag = typename DispatchPolicy::ArchTag;
+
+  using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{}));       // TileShape divided by ClusterShape via shape_div
+  using MainloopPipeline = cutlass::PipelineTmaAsync<DispatchPolicy::Stages>;  // pipeline type, parameterized by the stage count
+  using PipelineState = cutlass::PipelineState<DispatchPolicy::Stages>;        // read/write/release cursor type used by load() and mma()
+
+  using PipelineParams = typename MainloopPipeline::Params;
+  // A atom: rank 2; mode 0 must divide TileShape mode 0 (BLK_M) and mode 1 must divide TileShape mode 2 (BLK_K)
+  static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  // B atom: rank 2; mode 0 must divide TileShape mode 1 (BLK_N) and mode 1 must divide TileShape mode 2 (BLK_K)
+  static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+  // Staged smem layouts for A and B. Tile along modes in a way that maximizes the TMA box size.
+  using SmemLayoutA = decltype(tile_to_shape(
+      SmemLayoutAtomA{},
+      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),                        // (BLK_M,BLK_K,PIPE)
+      cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideA>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));  // order of the first two modes flips when is_major<0,StrideA>() holds
+  using SmemLayoutB = decltype(tile_to_shape(
+      SmemLayoutAtomB{},
+      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),                        // (BLK_N,BLK_K,PIPE)
+      cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideB>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));  // same selection, driven by StrideB
+
+  static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 2 or more.");  // mma() keeps K_PIPE_MMAS (= 1) GMMA in flight and asserts K_PIPE_MMAS < Stages
+  static_assert(cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeA>::value &&
+                cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeB>::value,
+                "MMA atom must source both A and B operand from smem_desc for this mainloop.");
+  static_assert(cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>,
+      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");  // A must be loaded with a TMA load atom, plain or multicast
+  static_assert(cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>,
+      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");  // B must be loaded with a TMA load atom, plain or multicast
+
+  struct SharedStorage  // storage used by this collective: staged operand buffers plus the pipeline's own storage
+  {
+    struct TensorStorage : cute::aligned_struct<128, _0> {
+      cute::array_aligned<typename TiledMma::ValTypeA, cute::cosize_v<SmemLayoutA>> smem_A;  // sized by the cosize of SmemLayoutA; viewed as (BLK_M,BLK_K,PIPE) via make_smem_ptr in load()/mma()
+      cute::array_aligned<typename TiledMma::ValTypeB, cute::cosize_v<SmemLayoutB>> smem_B;  // sized by the cosize of SmemLayoutB; viewed as (BLK_N,BLK_K,PIPE) via make_smem_ptr in load()/mma()
+    } tensors;
+
+    using PipelineStorage = typename MainloopPipeline::SharedStorage;
+    PipelineStorage pipeline;  // storage type declared by MainloopPipeline
+  };
+  using TensorStorage = typename SharedStorage::TensorStorage;      // re-exported; load() and mma() take a TensorStorage&
+  using PipelineStorage = typename SharedStorage::PipelineStorage;  // re-exported at collective scope
+
+  // Host side kernel arguments; converted to Params by to_underlying_arguments()
+  struct Arguments {
+    ElementA const* ptr_A;  // A operand; wrapped as a tensor of shape (M,K,L) with strides dA
+    StrideA dA;
+    ElementB const* ptr_B;  // B operand; wrapped as a tensor of shape (N,K,L) with strides dB
+    StrideB dB;
+    uint32_t mma_promotion_interval = 4;  // handed to GmmaFP8Accumulation in mma(); can_implement() requires a multiple of the MMAs issued per k-tile
+  };
+
+  // Device side kernel params (member order must match the braced return in to_underlying_arguments)
+  struct Params {
+    // Assumption: StrideA is congruent with Problem_MK
+    using TMA_A = decltype(make_tma_copy_A_sm90(
+        GmemTiledCopyA{},
+        make_tensor(static_cast<ElementA const*>(nullptr), repeat_like(StrideA{}, int32_t(0)), StrideA{}),  // placeholder tensor, used only inside decltype
+        SmemLayoutA{}(_,_,0),                                                                               // smem layout of stage 0
+        TileShape{},
+        ClusterShape{}));
+    // Assumption: StrideB is congruent with Problem_NK
+    using TMA_B = decltype(make_tma_copy_B_sm90(
+        GmemTiledCopyB{},
+        make_tensor(static_cast<ElementB const*>(nullptr), repeat_like(StrideB{}, int32_t(0)), StrideB{}),  // placeholder tensor, used only inside decltype
+        SmemLayoutB{}(_,_,0),                                                                               // smem layout of stage 0
+        TileShape{},
+        ClusterShape{}));
+    TMA_A tma_load_a;  // TMA copy object for A, built in to_underlying_arguments()
+    TMA_B tma_load_b;  // TMA copy object for B, built in to_underlying_arguments()
+    uint32_t tma_transaction_bytes = TmaTransactionBytes;       // MK bytes + NK bytes
+    uint32_t tma_transaction_bytes_mk = TmaTransactionBytesMK;  // bytes of one stage of A
+    uint32_t tma_transaction_bytes_nk = TmaTransactionBytesNK;  // bytes of one stage of B
+    uint32_t mma_promotion_interval = 4;                        // copied from Arguments::mma_promotion_interval
+  };
+
+  //
+  // Methods
+  //
+
+  /// Build device-side Params from host-side Arguments (workspace is not used).
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    (void) workspace;
+    // A rank-3 (MNK) problem shape is padded with a trailing 1 so that it is always rank-4 (MNKL)
+    auto [M,N,K,L] = append<4>(problem_shape, 1);
+
+    // Views of the operands over the whole problem: A is (M,K,L), B is (N,K,L)
+    Tensor gmem_a = make_tensor(reinterpret_cast<ElementA const*>(args.ptr_A), make_layout(make_shape(M,K,L), args.dA));
+    Tensor gmem_b = make_tensor(reinterpret_cast<ElementB const*>(args.ptr_B), make_layout(make_shape(N,K,L), args.dB));
+
+    // One TMA copy object per operand, each built against stage 0 of the staged smem layout
+    typename Params::TMA_A tma_a = make_tma_copy_A_sm90(
+        GmemTiledCopyA{},
+        gmem_a,
+        SmemLayoutA{}(_,_,cute::Int<0>{}),
+        TileShape{},
+        ClusterShape{});
+    typename Params::TMA_B tma_b = make_tma_copy_B_sm90(
+        GmemTiledCopyB{},
+        gmem_b,
+        SmemLayoutB{}(_,_,cute::Int<0>{}),
+        TileShape{},
+        ClusterShape{});
+
+    // Transaction bytes for the A tile and the B tile; their sum is the per-stage total
+    uint32_t const bytes_mk = TmaTransactionBytesMK;
+    uint32_t const bytes_nk = TmaTransactionBytesNK;
+
+    // Aggregate-initialize Params in member declaration order
+    return {
+      tma_a,
+      tma_b,
+      bytes_mk + bytes_nk,
+      bytes_mk,
+      bytes_nk,
+      args.mma_promotion_interval
+    };
+  }
+
+  /// Host-side feasibility check: 128-bit TMA alignment of A and B, plus a valid mma_promotion_interval.
+  template<class ProblemShape>
+  static bool
+  can_implement(
+      ProblemShape const& problem_shape,
+      [[maybe_unused]] Arguments const& args) {
+    constexpr int tma_alignment_bits = 128;
+    auto [M,N,K,L] = append<4>(problem_shape, 1);  // rank-3 (MNK) shapes get a trailing L = 1
+    constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
+    constexpr int min_tma_aligned_elements_B = tma_alignment_bits / cutlass::sizeof_bits<ElementB>::value;
+    bool const aligned = cutlass::detail::check_alignment<min_tma_aligned_elements_A>(cute::make_shape(M,K,L), StrideA{}) &&
+                         cutlass::detail::check_alignment<min_tma_aligned_elements_B>(cute::make_shape(N,K,L), StrideB{});
+    if (!aligned) {
+      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
+    }
+    /* MMA promotion interval should be a multiple of the number of MMA instructions issued by each mainloop iteration. */
+    bool const interval_ok = (args.mma_promotion_interval % (size<2>(TileShape{})() / TiledMma().template tile_size_mnk<2>()()) == 0);
+    if (!interval_ok) {
+      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: mma_promotion_interval is not a multiple of the MMA instructions issued per mainloop iteration.\n");
+    }
+    return aligned && interval_ok;  // same result as before; only the diagnostics are now specific to the failing check
+  }
+
+  static constexpr int K_PIPE_MAX = DispatchPolicy::Stages;  // number of pipeline stages
+  static constexpr int K_PIPE_MMAS = 1;                      // bounds the prologue length in mma()/mma_tail() and is the warpgroup_wait<> depth in the mainloop
+  static constexpr uint32_t TmaTransactionBytesMK =          // bytes of one stage of A: modes 0 and 1 of SmemLayoutA times the bit width of ElementA
+        cutlass::bits_to_bytes(size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast<uint32_t>(sizeof_bits<ElementA>::value));
+  static constexpr uint32_t TmaTransactionBytesNK =          // bytes of one stage of B: modes 0 and 1 of SmemLayoutB times the bit width of ElementB
+        cutlass::bits_to_bytes(size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast<uint32_t>(sizeof_bits<ElementB>::value));
+  static constexpr uint32_t TmaTransactionBytes = TmaTransactionBytesMK + TmaTransactionBytesNK;  // per-stage total for A and B
+
+  /// Issue Tma Descriptor Prefetch for operands A and B -- ideally from a single thread for best performance
+  CUTLASS_DEVICE static void prefetch_tma_descriptors(Params const& mainloop_params) {
+    auto const& tma_a = mainloop_params.tma_load_a;
+    auto const& tma_b = mainloop_params.tma_load_b;
+    cute::prefetch_tma_descriptor(tma_a.get_tma_descriptor());
+    cute::prefetch_tma_descriptor(tma_b.get_tma_descriptor());
+  }
+
+  /// Set up the data needed by this collective for load and mma.
+  /// Contract between the collective and the kernel layer: the returned tuple contains at least
+  /// two elements, and the first two are
+  ///   gA_mkl - the TMA tensor for A after a local tile, shape (BLK_M,BLK_K,m,k,l)
+  ///   gB_nkl - the TMA tensor for B after a local tile, shape (BLK_N,BLK_K,n,k,l)
+  template <class ProblemShape_MNKL>
+  CUTLASS_DEVICE auto
+  load_init(ProblemShape_MNKL const& problem_shape_MNKL, Params const& mainloop_params) const {
+    auto [M,N,K,L] = problem_shape_MNKL;
+
+    // The full tensors are obtained from the TMA objects, because TMA requires special
+    // handling of strides to deal with the coord codomain mapping
+    Tensor mA_mkl = mainloop_params.tma_load_a.get_tma_tensor(make_shape(M,K,L));                            // (m,k,l)
+    Tensor mB_nkl = mainloop_params.tma_load_b.get_tma_tensor(make_shape(N,K,L));                            // (n,k,l)
+
+    // Tiled views over every tile coordinate; the slice for one block is deferred.
+    // The Step projections select the (M,K) modes of TileShape for A and the (N,K) modes for B.
+    auto every_tile = make_coord(_,_,_);
+    Tensor gA_mkl = local_tile(mA_mkl, TileShape{}, every_tile, Step<_1,Underscore,_1>{});       // (BLK_M,BLK_K,m,k,l)
+    Tensor gB_nkl = local_tile(mB_nkl, TileShape{}, every_tile, Step<Underscore,_1,_1>{});       // (BLK_N,BLK_K,n,k,l)
+
+    return cute::make_tuple(gA_mkl, gB_nkl);
+  }
+
+  /// Producer side of the collective mainloop: a lane chosen by cute::elect_one_sync() issues the TMA loads
+  /// of A and B, one pipeline stage per k-tile, for k_tile_count k-tiles. thread_idx is not read by this function.
+  template <
+    class TensorA, class TensorB,
+    class KTileIterator, class BlockCoord
+  >
+  CUTLASS_DEVICE void
+  load(
+      Params const& mainloop_params,
+      MainloopPipeline pipeline,
+      PipelineState smem_pipe_write,
+      cute::tuple<TensorA, TensorB> const& load_inputs,
+      BlockCoord const& blk_coord,
+      KTileIterator k_tile_iter, int k_tile_count,
+      int thread_idx,
+      uint32_t block_rank_in_cluster,
+      TensorStorage& shared_tensors) {
+    int lane_predicate = cute::elect_one_sync();
+
+    if (lane_predicate) {
+      Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});        // (BLK_M,BLK_K,PIPE)
+      Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});        // (BLK_N,BLK_K,PIPE)
+
+      //
+      // Prepare the TMA loads for A and B
+      //
+
+      constexpr uint32_t cluster_shape_x = get<0>(ClusterShape());
+      uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x};  // (x,y) split of the block rank by cluster_shape_x
+
+      Tensor gA_mkl = get<0>(load_inputs);
+      Tensor gB_nkl = get<1>(load_inputs);
+
+      auto block_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y);  // A's TMA is sliced by the block's y id
+      auto block_tma_b = mainloop_params.tma_load_b.get_slice(cluster_local_block_id.x);  // B's TMA is sliced by the block's x id
+
+      // Partition the inputs based on the current block coordinates.
+      auto [m_coord, n_coord, k_coord, l_coord] = blk_coord;
+      Tensor gA = gA_mkl(_,_,m_coord,_,l_coord);                                                     // (BLK_M,BLK_K,k)
+      Tensor gB = gB_nkl(_,_,n_coord,_,l_coord);                                                     // (BLK_N,BLK_K,k)
+
+      // Applies the mapping from block_tma_a
+      Tensor tAgA = block_tma_a.partition_S(gA);                                                 // (TMA,TMA_M,TMA_K,k)
+      Tensor tAsA = block_tma_a.partition_D(sA);                                              // (TMA,TMA_M,TMA_K,PIPE)
+
+      Tensor tBgB = block_tma_b.partition_S(gB);                                                 // (TMA,TMA_N,TMA_K,k)
+      Tensor tBsB = block_tma_b.partition_D(sB);                                              // (TMA,TMA_N,TMA_K,PIPE)
+
+      uint16_t mcast_mask_a = 0;
+      uint16_t mcast_mask_b = 0;
+
+      // Build the multicast masks; they stay 0 unless the operand uses SM90_TMA_LOAD_MULTICAST
+      // Maps the tile -> block, value
+      if constexpr (cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>) {
+        auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};                       // (m,n) -> block_id
+        for (int n = 0; n < size<1>(block_layout); ++n) {
+          mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{}));  // one bit per block id (x, n), over every n
+        }
+      }
+
+      if constexpr (cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>) {
+        auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};                       // (m,n) -> block_id
+        for (int m = 0; m < size<0>(block_layout); ++m) {
+          mcast_mask_b |= (uint16_t(1) << block_layout(m,cluster_local_block_id.y,Int<0>{}));  // one bit per block id (m, y), over every m
+        }
+      }
+
+      // Mainloop
+      CUTLASS_PRAGMA_NO_UNROLL
+      for ( ; k_tile_count > 0; --k_tile_count) {
+        // LOCK smem_pipe_write for _writing_
+        pipeline.producer_acquire(smem_pipe_write);
+
+        //
+        // Copy gmem to smem for *k_tile_iter
+        //
+
+        using BarrierType = typename MainloopPipeline::ProducerBarrierType;
+        BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write);  // barrier handed to both TMA copies of this stage
+
+        int write_stage = smem_pipe_write.index();
+        copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage));
+        copy(mainloop_params.tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage));
+        ++k_tile_iter;
+
+        // Advance smem_pipe_write
+        ++smem_pipe_write;
+      }
+    }
+  }
+
+  /// Producer epilogue: prevents early exit of blocks in a Cluster
+  CUTLASS_DEVICE void
+  load_tail(
+      MainloopPipeline pipeline,
+      PipelineState smem_pipe_write) {
+
+    /* Only the elected lane issues the epilogue waits.
+     *
+     * This helps avoid early exit of blocks in Cluster:
+     * it waits for all stages to either be released (all
+     * Consumer UNLOCKs), or, if a stage was never used,
+     * it would just be acquired since the phase was still
+     * inverted from make_producer_start_state.
+     */
+    if (cute::elect_one_sync()) {
+      pipeline.producer_tail(smem_pipe_write);
+    }
+  }
+
+  /// Consumer side of the collective mainloop: issues the GMMAs for k_tile_count k-tiles and accumulates into
+  /// accum through GmmaFP8Accumulation. min(K_PIPE_MMAS, k_tile_count) stages are not released here (see mma_tail).
+  template <
+    class FrgTensorC
+  >
+  CUTLASS_DEVICE void
+  mma(MainloopPipeline pipeline,
+      PipelineState smem_pipe_read,
+      FrgTensorC& accum,
+      int k_tile_count,
+      int thread_idx,
+      TensorStorage& shared_tensors,
+      Params const& mainloop_params) {
+
+    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
+    static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3.");
+    static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3.");
+    static_assert(cute::is_void_v<SmemCopyAtomA>,
+      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
+    static_assert(cute::is_void_v<SmemCopyAtomB>,
+      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
+
+    Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});          // (BLK_M,BLK_K,PIPE)
+    Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});          // (BLK_N,BLK_K,PIPE)
+
+    //
+    // Define C accumulators and A/B partitioning
+    //
+    
+    // Layout of warp group to thread mapping
+
+    static_assert(stride<0>(typename TiledMma::ALayout{}) == 0 and 
+                  stride<0>(typename TiledMma::BLayout{}) == 0 and
+                  size<0>(typename TiledMma::ALayout{}) == NumThreadsPerWarpGroup and
+                  size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup, 
+                  "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
+
+    constexpr int MmaWarpGroups = size(TiledMma{}) / NumThreadsPerWarpGroup;
+    Layout warp_group_thread_layout = make_layout(Int<MmaWarpGroups>{}, 
+                                                  Int<NumThreadsPerWarpGroup>{});
+
+    int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / NumThreadsPerWarpGroup, 0);  // every lane takes lane 0's warp-group index
+
+    TiledMma tiled_mma;
+    auto thread_mma = tiled_mma.get_slice(warp_group_thread_layout(warp_group_idx));  // slice selected per warp group, not per thread
+
+    Tensor tCsA = thread_mma.partition_A(sA);                                                 // (MMA,MMA_M,MMA_K,PIPE)
+    Tensor tCsB = thread_mma.partition_B(sB);                                                 // (MMA,MMA_N,MMA_K,PIPE)
+
+    // Allocate "fragments/descriptors"
+    Tensor tCrA = thread_mma.make_fragment_A(tCsA);                                           // (MMA,MMA_M,MMA_K,PIPE)
+    Tensor tCrB = thread_mma.make_fragment_B(tCsB);                                           // (MMA,MMA_N,MMA_K,PIPE)
+
+    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum));                                                         // M
+    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum));                                                         // N
+    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB));                                                          // K
+    CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB));                                                       // PIPE
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));                                         // PIPE
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));                                         // PIPE
+
+    //
+    // PIPELINED MAIN LOOP
+    //
+    static_assert((0 <= K_PIPE_MMAS) && (K_PIPE_MMAS <  K_PIPE_MAX),
+        "ERROR : Incorrect number of MMAs in flight");
+
+    // We release buffers to producer warps(dma load) with some mmas in flight
+    PipelineState smem_pipe_release = smem_pipe_read;
+
+    // Prologue GMMAs: committed without a warpgroup_wait or consumer_release in the prologue loop
+    int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
+
+    tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;  // presumably makes the next GMMA overwrite rather than accumulate -- confirm
+
+    GmmaFP8Accumulation accumulation(accum, mainloop_params.mma_promotion_interval, size<2>(tCrA));  // wraps accum; the GMMA target is accumulation()
+    warpgroup_fence_operand(accumulation());
+    CUTLASS_PRAGMA_UNROLL
+    for (int k_tile_prologue = prologue_mma_count; k_tile_prologue > 0; --k_tile_prologue)
+    {
+      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
+      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+      pipeline.consumer_wait(smem_pipe_read, barrier_token);
+
+      if (accumulation.prepare_if_needed()) {
+        tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
+      }
+
+      int read_stage = smem_pipe_read.index();
+      warpgroup_arrive();
+      // Unroll the K mode manually to set scale D to 1
+      CUTLASS_PRAGMA_UNROLL
+      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
+        // (V,M,K) x (V,N,K) => (V,M,N)
+        cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulation());
+        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+      }
+      warpgroup_commit_batch();
+
+      accumulation.promote_if_needed();  // NOTE(review): no warpgroup_wait between the commit above and this call -- confirm promotion cannot trigger here
+
+      ++smem_pipe_read;
+    }
+
+    warpgroup_fence_operand(accumulation());
+    // Mainloop GMMAs
+    k_tile_count -= prologue_mma_count;
+
+    CUTLASS_PRAGMA_NO_UNROLL
+    for ( ; k_tile_count > 0; --k_tile_count)
+    {
+      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
+      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+      pipeline.consumer_wait(smem_pipe_read, barrier_token);
+
+      //
+      // Compute on k_tile
+      //
+
+      int read_stage = smem_pipe_read.index();
+
+      if (accumulation.prepare_if_needed()) {
+        tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
+      }
+
+      warpgroup_fence_operand(accumulation());
+      warpgroup_arrive();
+      // Unroll the K mode manually to set scale D to 1
+      CUTLASS_PRAGMA_UNROLL
+      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
+        // (V,M,K) x (V,N,K) => (V,M,N)
+        cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulation());
+        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+      }
+      warpgroup_commit_batch();
+
+      /// Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure smem_pipe_write is consumed
+      warpgroup_wait<K_PIPE_MMAS>();
+      warpgroup_fence_operand(accumulation());
+
+      accumulation.promote_if_needed();
+
+      pipeline.consumer_release(smem_pipe_release);                 // UNLOCK smem_pipe_release, done _computing_ on it
+
+      // Advance smem_pipe_read and smem_pipe_release
+      ++smem_pipe_read;
+      ++smem_pipe_release;
+    }
+
+    accumulation.promote_residue_if_needed();
+
+    warpgroup_fence_operand(accumulation());
+  }
+
+  /// Consumer epilogue: release the pipeline stages that mma() did not release
+  CUTLASS_DEVICE void
+  mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) {
+    // mma() releases one stage per mainloop k-tile; the prologue's share (at most K_PIPE_MMAS) is released here
+    int const held_stages = min(K_PIPE_MMAS, k_tile_count);
+
+    // Move the release cursor past the stages the mainloop already released
+    smem_pipe_release.advance(k_tile_count - held_stages);
+
+    // Every outstanding GMMA must complete before its buffer is handed back
+    warpgroup_wait<0>();
+
+    for (int left = held_stages; left > 0; --left) {
+      pipeline.consumer_release(smem_pipe_release);                 // UNLOCK smem_pipe_release, done _computing_ on it
+      ++smem_pipe_release;
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::gemm::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_sparse_mma_tma_gmma_ss_warpspecialized.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_sparse_mma_tma_gmma_ss_warpspecialized.hpp
new file mode 100755
index 000000000..01e83bdf5
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_sparse_mma_tma_gmma_ss_warpspecialized.hpp
@@ -0,0 +1,724 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/collective/builders/sm90_sparse_config.inl"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/numeric_types.h"
+#include "cutlass/pipeline/pipeline.hpp"
+#include "cutlass/trace.h"
+
+#include "cute/arch/cluster_sm90.hpp"
+#include "cute/arch/copy_sm90.hpp"
+#include "cute/algorithm/functional.hpp"
+#include "cute/atom/mma_atom.hpp"
+#include "cute/algorithm/gemm.hpp"
+#include "cute/tensor_predicate.hpp"
+#include "cute/numeric/arithmetic_tuple.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective {
+using namespace cute;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// WarpSpecialized Mainloop
+template <
+  int Stages,
+  class ClusterShape,
+  class KernelSchedule,
+  class TileShape_,
+  class ElementA_,
+  class LayoutPairAE_,
+  class ElementB_,
+  class StrideB_,
+  class TiledMma_,
+  class GmemTiledCopyA_,
+  class SmemLayoutAtomA_,
+  class SmemCopyAtomA_,
+  class TransformA_,
+  class GmemTiledCopyB_,
+  class SmemLayoutAtomB_,
+  class SmemCopyAtomB_,
+  class TransformB_>
+struct CollectiveMma<
+    MainloopSm90TmaGmmaWarpSpecializedSparse<Stages, ClusterShape, KernelSchedule>,
+    TileShape_,
+    ElementA_,
+    LayoutPairAE_,
+    ElementB_,
+    StrideB_,
+    TiledMma_,
+    GmemTiledCopyA_,
+    SmemLayoutAtomA_,
+    SmemCopyAtomA_,
+    TransformA_,
+    GmemTiledCopyB_,
+    SmemLayoutAtomB_,
+    SmemCopyAtomB_,
+    TransformB_>
+{
+  //
+  // Type Aliases
+  //
+  using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedSparse<Stages, ClusterShape, KernelSchedule>;
+  using TileShape = TileShape_;
+  using TiledMma = TiledMma_;
+  using ElementA = ElementA_;
+  using ElementAMma = typename TiledMma::ValTypeA;
+  using ElementAMmaRaw = typename ElementAMma::raw_type;
+  using LayoutPairAE = LayoutPairAE_;
+  using LayoutA = remove_cvref_t<decltype(get<0>(LayoutPairAE{}))>;
+  using LayoutE = remove_cvref_t<decltype(get<1>(LayoutPairAE{}))>;
+  using StrideA = decltype(cute::stride(LayoutA{}));
+  using ElementB = ElementB_;
+  using ElementBMma = typename TiledMma::ValTypeB;
+  using StrideB = StrideB_;
+  using ElementEMma = typename TiledMma::ValTypeE;
+  using ElementE = typename ElementEMma::raw_type;
+  using ElementAccumulator = typename TiledMma::ValTypeC;
+  using GmemTiledCopyA = GmemTiledCopyA_;
+  using GmemTiledCopyB = GmemTiledCopyB_;
+  using SmemLayoutAtomA = SmemLayoutAtomA_;
+  using SmemLayoutAtomB = SmemLayoutAtomB_;
+  using SmemCopyAtomA = SmemCopyAtomA_;
+  using SmemCopyAtomB = SmemCopyAtomB_;
+  using TransformA = TransformA_;
+  using TransformB = TransformB_;
+  using ArchTag = typename DispatchPolicy::ArchTag;
+  using ArrayElementA = ElementA;
+  using ArrayElementB = ElementB;
+  // ElementAMma (the MMA operand type) must be sparse-wrapped, while the user-facing ElementA must be a plain type.
+  static_assert(is_sparse<ElementAMma>::value, "ElementAMma must be a sparse element type.");
+  static_assert(!is_sparse<ElementA>::value, "ElementA must not be a sparse element type.");
+
+  static constexpr int ElementAMmaSparsity = ElementAMma::sparsity;
+  static constexpr int ElementEMmaSparsity = ElementEMma::sparsity;
+
+  // LayoutA is nested in the stride due to the sparsity.
+  static constexpr bool is_A_mn_major = cute::is_same_v<decltype(get<0>(LayoutA{}.stride())), Int<ElementAMmaSparsity>>;
+  static constexpr bool is_B_mn_major = cutlass::gemm::detail::is_major<0,StrideB>();
+
+  using SparseConfig = cutlass::Sm90GemmSparseConfig<ElementAMma,
+                                                     (is_A_mn_major ? GMMA::Major::MN : GMMA::Major::K),
+                                                     ElementEMma,
+                                                     decltype(cute::min(size<2>(TileShape{}),_128{}))>;
+
+  // The offline permutation for the metadata.
+  using SmemLayoutAtomE_ = typename SparseConfig::TensorEAtom;
+  using SmemLayoutAtomE  = ComposedLayout<Swizzle<0,4,3>,
+                                          smem_sparse_ptr_flag_bits<ElementEMmaSparsity, sizeof_bits_v<ElementE>>,
+                                          SmemLayoutAtomE_>;
+
+  // Metadata pathways
+  using SmemCopyAtomE = AutoVectorizingCopy;
+  using GmemCopyAtomE = GmemTiledCopyA;
+
+  using CtaShape_MNK = TileShape;
+  using MainloopPipeline = cutlass::PipelineTmaAsync<DispatchPolicy::Stages>;
+  using PipelineState = cutlass::PipelineState<DispatchPolicy::Stages>;
+
+  using PipelineParams = typename MainloopPipeline::Params;
+
+  static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M,K)");
+  static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+  static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (N,K)");
+  static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+  // Tile along modes in a way that maximizes the TMA box size.
+  using SmemLayoutA = decltype(tile_to_shape(
+      SmemLayoutAtomA{},
+      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
+      cute::conditional_t<is_A_mn_major, Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
+  using SmemLayoutE = decltype(tile_to_shape(
+      SmemLayoutAtomE{},
+      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{})));
+  using SmemLayoutB = decltype(tile_to_shape(
+      SmemLayoutAtomB{},
+      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
+      cute::conditional_t<is_B_mn_major, Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
+
+  static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 2 or more.");
+  static_assert(cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeA>::value &&
+                cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeB>::value,
+                "MMA atom must source both A and B operand from smem_desc for this mainloop.");
+  static_assert(cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>,
+      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
+  static_assert(cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>,
+      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
+
+  static_assert(cute::is_void_v<SmemCopyAtomA>,
+    "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
+  static_assert(cute::is_void_v<SmemCopyAtomB>,
+    "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
+
+  // TMA converts f32 input to tf32 when copying from GMEM to SMEM
+  // For all other types, cast to size equivalent uint type to avoid any rounding by TMA.
+  using TmaInternalElementA = cute::sparse_elem<ElementAMmaSparsity,
+                                                cute::conditional_t<cute::is_same_v<ElementA, float>,
+                                                                    cutlass::tfloat32_t,
+                                                                    uint_bit_t<sizeof_bits_v<ElementAMmaRaw>>>>;
+  using TmaInternalElementB = cute::conditional_t<cute::is_same_v<float, ElementB>, 
+                                                  tfloat32_t,
+                                                  uint_bit_t<sizeof_bits_v<ElementBMma>>>;
+
+  struct SharedStorage  // shared-memory footprint of this collective: staging buffers plus pipeline state
+  {
+    struct TensorStorage {
+      alignas(128) cute::ArrayEngine<ElementAMma, cute::cosize_v<SmemLayoutA>> smem_A;  // A tiles, sized by SmemLayoutA (includes the stage mode)
+      alignas(128) cute::ArrayEngine<ElementBMma, cute::cosize_v<SmemLayoutB>> smem_B;  // B tiles, sized by SmemLayoutB (includes the stage mode)
+      alignas(128) cute::ArrayEngine<ElementEMma, cute::cosize_v<SmemLayoutE>> smem_E;  // metadata E tiles, sized by SmemLayoutE (includes the stage mode)
+    } tensors;
+
+    using PipelineStorage = typename MainloopPipeline::SharedStorage;
+    PipelineStorage pipeline;  // storage owned by MainloopPipeline
+  };
+  using TensorStorage = typename SharedStorage::TensorStorage;
+  using PipelineStorage = typename SharedStorage::PipelineStorage;
+
+  static constexpr int K_PIPE_MAX = DispatchPolicy::Stages;
+  static constexpr int K_PIPE_MMAS = 0;
+
+  static constexpr uint32_t TmaTransactionBytes =
+        cutlass::bits_to_bytes(cosize(take<0,2>(SmemLayoutA{})) * cute::sizeof_bits_v<ElementAMma>) +
+        cutlass::bits_to_bytes(cosize(take<0,2>(SmemLayoutE{})) * cute::sizeof_bits_v<ElementEMma>) +
+        cutlass::bits_to_bytes(cosize(take<0,2>(SmemLayoutB{})) * cute::sizeof_bits_v<ElementBMma>);
+
+  // Host side kernel arguments
+  struct Arguments {
+    ElementA const* ptr_A{};   // A values, described by layout_a
+    LayoutA layout_a{};        // can_implement requires this to equal SparseConfig::fill_layoutA(problem_shape)
+    ElementB const* ptr_B{};   // B values
+    StrideB dB{};              // strides of B over the (N,K,L) shape
+    ElementE const* ptr_E{};   // metadata E, described by layout_e
+    LayoutE layout_e{};        // can_implement requires this to equal SparseConfig::fill_layoutE(problem_shape)
+  };
+
+  // Device side kernel params
+  struct Params {
+    // TMA_A / TMA_E / TMA_B name the make_tma_copy return types; the null-pointer tensors exist only inside decltype.
+    using TMA_A = decltype(make_tma_copy<typename TmaInternalElementA::raw_type>(
+        GmemTiledCopyA{},
+        make_tensor(recast_ptr<TmaInternalElementA>(nullptr), LayoutA{}),
+        SmemLayoutA{}(_,_,cute::Int<0>{}),
+        make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})),
+        size<1>(ClusterShape{})));  // mcast along N mode for this M load, if any
+
+    using TMA_E = decltype(make_tma_copy<uint64_t>( // use uint64_t to get the largest loading box.
+        GmemCopyAtomE{},
+        make_tensor(recast_ptr<sparse_elem<ElementEMmaSparsity, ElementE>>(nullptr), LayoutE{}),
+        SmemLayoutE{}(_,_,cute::Int<0>{}),
+        make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})),
+        size<1>(ClusterShape{})));  // mcast along N mode for this M load, if any
+
+    using TMA_B = decltype(make_tma_copy<TmaInternalElementB>(
+        GmemTiledCopyB{},
+        make_tensor(static_cast<TmaInternalElementB const*>(nullptr), repeat_like(StrideB{}, int32_t(0)), StrideB{}),
+        SmemLayoutB{}(_,_,cute::Int<0>{}),
+        make_shape(shape<1>(TileShape{}), shape<2>(TileShape{})),
+        size<0>(ClusterShape{}))); // mcast along M mode for this N load, if any
+
+    TMA_A tma_load_a;   // TMA copy object for A
+    TMA_E tma_load_e;   // TMA copy object for metadata E
+    TMA_B tma_load_b;   // TMA copy object for B
+    LayoutA layout_a;   // copied from Arguments; load_init passes its shape to get_tma_tensor
+    LayoutE layout_e;   // copied from Arguments; load_init passes its shape to get_tma_tensor
+    uint32_t tma_transaction_bytes = TmaTransactionBytes;  // bytes of A + E + B for one stage (first two smem modes only)
+  };
+
+  //
+  // Methods
+  //
+  /// Build the device-side Params (TMA copies for A, E and B plus the A/E layouts) from the host-side Arguments.
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    (void) workspace;  // workspace is not used by this function
+
+    // Optionally append 1s until problem shape is rank-4 (MNKL), in case it is only rank-3 (MNK)
+    auto problem_shape_MNKL = append<4>(problem_shape, 1);
+    auto [M,N,K,L] = problem_shape_MNKL;  // only N, K and L are read below, to shape the B tensor
+
+    auto ptr_A = recast_ptr<TmaInternalElementA>(args.ptr_A);
+    auto ptr_B = recast_ptr<TmaInternalElementB>(args.ptr_B);
+    auto ptr_E = recast_ptr<sparse_elem<ElementEMmaSparsity, ElementE>>(args.ptr_E);
+    // A and E take their complete layouts from the arguments; B is described by (N,K,L) and its strides dB
+    Tensor tensor_a = make_tensor(ptr_A, args.layout_a);
+    Tensor tensor_b = make_tensor(ptr_B, make_layout(make_shape(N,K,L), args.dB));
+    Tensor tensor_e = make_tensor(ptr_E, args.layout_e);
+
+    typename Params::TMA_A tma_load_a = make_tma_copy<typename TmaInternalElementA::raw_type>(
+        GmemTiledCopyA{},
+        tensor_a,
+        SmemLayoutA{}(_,_,cute::Int<0>{}),
+        make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})),
+        size<1>(ClusterShape{})); // mcast along N mode for this M load, if any
+
+    typename Params::TMA_E tma_load_e = make_tma_copy<uint64_t>( // use uint64_t to get the largest loading box.
+        GmemCopyAtomE{},
+        tensor_e,
+        SmemLayoutE{}(_,_,cute::Int<0>{}),
+        make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})),
+        size<1>(ClusterShape{})); // mcast along N mode for this M load, if any
+
+    typename Params::TMA_B tma_load_b = make_tma_copy<TmaInternalElementB>(
+        GmemTiledCopyB{},
+        tensor_b,
+        SmemLayoutB{}(_,_,cute::Int<0>{}),
+        make_shape(shape<1>(TileShape{}), shape<2>(TileShape{})),
+        size<0>(ClusterShape{})); // mcast along M mode for this N load, if any
+
+    return {
+      tma_load_a,
+      tma_load_e,
+      tma_load_b,
+      args.layout_a,
+      args.layout_e  // tma_transaction_bytes is left at its default member initializer
+    };
+  }
+  /// Returns true when A and B pass the 128-bit TMA alignment check and args.layout_a / args.layout_e match SparseConfig.
+  template<class ProblemShape>
+  CUTLASS_HOST_DEVICE static bool
+  can_implement(
+      ProblemShape const& problem_shape,
+      [[maybe_unused]] Arguments const& args) {
+    constexpr int tma_alignment_bits = 128;
+    constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
+    constexpr int min_tma_aligned_elements_B = tma_alignment_bits / cutlass::sizeof_bits<ElementB>::value;
+    auto problem_shape_MNKL = append<4>(problem_shape, 1);
+    auto [M,N,K,L] = problem_shape_MNKL;
+    // A is checked with its K extent divided by the sparsity factor declared by ElementAMma
+    bool size_check = true;
+    // Check Alignment A
+    if constexpr (is_A_mn_major) {
+      size_check = size_check && cutlass::detail::check_alignment<min_tma_aligned_elements_A>(cute::make_shape(M,K/ElementAMmaSparsity,L), cute::make_stride(_1{}, M, M*K/ElementAMmaSparsity));
+    }
+    else { // If A is K-major
+      size_check = size_check && cutlass::detail::check_alignment<min_tma_aligned_elements_A>(cute::make_shape(M,K/ElementAMmaSparsity,L), cute::make_stride(K/ElementAMmaSparsity, _1{}, M*K/ElementAMmaSparsity));
+    }
+    size_check = size_check && cutlass::detail::check_alignment<min_tma_aligned_elements_B>(cute::make_shape(N,K,L), StrideB{});
+
+    if (!size_check) {
+      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
+    }
+
+    // Check that layout_a and layout_e equal the layouts SparseConfig derives from the problem shape
+    auto layout_a_ref = SparseConfig::fill_layoutA(problem_shape_MNKL);
+    auto layout_e_ref = SparseConfig::fill_layoutE(problem_shape_MNKL);
+    bool layout_check = true;
+    layout_check = layout_check && (layout_a_ref == args.layout_a);
+    layout_check = layout_check && (layout_e_ref == args.layout_e);
+
+    if (!layout_check) {
+      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Layout_a/e mismatch.\n");
+    }
+
+    return size_check && layout_check;
+  }
+
+  /// Prefetch the TMA descriptors of A, E and B (in that order) -- ideally from a single thread for best performance
+  CUTLASS_DEVICE static void prefetch_tma_descriptors(Params const& mainloop_params) {
+    auto prefetch_one = [](auto const& tma_load) { cute::prefetch_tma_descriptor(tma_load.get_tma_descriptor()); };
+    prefetch_one(mainloop_params.tma_load_a);
+    prefetch_one(mainloop_params.tma_load_e);
+    prefetch_one(mainloop_params.tma_load_b);
+  }
+
+  /// Set up the data needed by this collective for load and mma.
+  /// Returns a tuple of tensors. The contract between the collective and the kernel layer is that
+  /// the returned tuple contains at least two elements, the first two being:
+  /// gA_mkl - The tma tensor, A after a local tile so it has shape  (BLK_M,BLK_K,m,k,l)
+  /// gB_nkl - The tma tensor, B after a local tile so it has shape  (BLK_N,BLK_K,n,k,l)
+  /// Any further elements are collective-specific; here the third is gE_mkl, the tiled metadata tensor E.
+  template <class ProblemShape_MNKL>
+  CUTLASS_DEVICE auto
+  load_init(ProblemShape_MNKL const& problem_shape_MNKL, Params const& mainloop_params) const {
+    using X = Underscore;  // used in the Step<> projections below for the TileShape mode that is left out
+    // Separate out problem shape for convenience
+    auto [M,N,K,L] = problem_shape_MNKL;  // M is not used below; A and E take their shapes from the stored layouts
+
+    // TMA requires special handling of strides to deal with coord codomain mapping
+    // Represent the full tensors -- get these from TMA
+    Tensor mA_mkl = mainloop_params.tma_load_a.get_tma_tensor(mainloop_params.layout_a.shape());                      // (m,k,l)
+    Tensor mE_mkl = mainloop_params.tma_load_e.get_tma_tensor(mainloop_params.layout_e.shape());                      // (m,k,l)
+    Tensor mB_nkl = mainloop_params.tma_load_b.get_tma_tensor(make_shape(N,K,L));                            // (n,k,l)
+
+    // Make tiled views, defer the slice
+    Tensor gA_mkl = local_tile(mA_mkl, TileShape{}, make_coord(_,_,_), Step<_1, X,_1>{});        // (BLK_M,BLK_K,m,k,l)
+    Tensor gE_mkl = local_tile(mE_mkl, TileShape{}, make_coord(_,_,_), Step<_1, X,_1>{});        // (BLK_M,BLK_K,m,k,l)
+    Tensor gB_nkl = local_tile(mB_nkl, TileShape{}, make_coord(_,_,_), Step< X,_1,_1>{});        // (BLK_N,BLK_K,n,k,l)
+
+    return cute::make_tuple(gA_mkl, gB_nkl, gE_mkl);
+  }
+
+  /// Producer side of the collective mainloop: issue the TMA loads of A, E and B for k_tile_count K tiles
+  /// Producer Perspective (only the lane selected by elect_one_sync issues the copies)
+  template <
+    class TensorA, class TensorB, class TensorE,
+    class KTileIterator, class BlockCoord
+  >
+  CUTLASS_DEVICE void
+  load(
+      Params const& mainloop_params,
+      MainloopPipeline pipeline, 
+      PipelineState smem_pipe_write,
+      cute::tuple<TensorA, TensorB, TensorE> const& load_inputs,  // unpacked below as (gA_mkl, gB_nkl, gE_mkl)
+      BlockCoord const& blk_coord,
+      KTileIterator k_tile_iter, int k_tile_count,
+      int thread_idx,  // not referenced in this function
+      uint32_t block_rank_in_cluster,
+      TensorStorage& shared_tensors) {
+    int lane_predicate = cute::elect_one_sync();
+
+    if (lane_predicate) {
+      Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.begin()), SmemLayoutA{});        // (BLK_M,BLK_K,PIPE)
+      Tensor sE = make_tensor(make_smem_ptr(shared_tensors.smem_E.begin()), SmemLayoutE{});        // (BLK_M,BLK_K,PIPE)
+      Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.begin()), SmemLayoutB{});        // (BLK_N,BLK_K,PIPE)
+
+      auto [gA_mkl, gB_nkl, gE_mkl] = load_inputs;
+
+      // Define the CTA-in-cluster Layout and Coord
+      Layout cta_layout_mnk = make_layout(ClusterShape{});
+      auto cta_coord_mnk = cta_layout_mnk.get_flat_coord(block_rank_in_cluster);
+
+      // TMA Multicast Masks (A and E use mode 1 of the cluster layout, B uses mode 0)
+      uint16_t mcast_mask_a = create_tma_multicast_mask<1>(cta_layout_mnk, cta_coord_mnk);
+      uint16_t mcast_mask_e = create_tma_multicast_mask<1>(cta_layout_mnk, cta_coord_mnk);
+      uint16_t mcast_mask_b = create_tma_multicast_mask<0>(cta_layout_mnk, cta_coord_mnk);
+      // A and E are sliced by element 1 of this CTA's cluster coordinate, B by element 0
+      auto block_tma_a = mainloop_params.tma_load_a.get_slice(get<1>(cta_coord_mnk));
+      auto block_tma_e = mainloop_params.tma_load_e.get_slice(get<1>(cta_coord_mnk));
+      auto block_tma_b = mainloop_params.tma_load_b.get_slice(get<0>(cta_coord_mnk));
+
+      // Partition the inputs based on the current block coordinates.
+      auto [m_coord, n_coord, k_coord, l_coord] = blk_coord;  // k_coord is unused; the k mode is kept and indexed by *k_tile_iter
+      Tensor gA = gA_mkl(_,_,m_coord,_,l_coord);                                                     // (BLK_M,BLK_K,k)
+      Tensor gE = gE_mkl(_,_,m_coord,_,l_coord);                                                     // (BLK_M,BLK_K,k)
+      Tensor gB = gB_nkl(_,_,n_coord,_,l_coord);                                                     // (BLK_N,BLK_K,k)
+
+      // Apply the mapping from block_tma_a / block_tma_e / block_tma_b to the source (g*) and destination (s*) tensors
+      Tensor tAgA = block_tma_a.partition_S(gA);                                                 // (TMA,TMA_M,TMA_K,k)
+      Tensor tAsA = block_tma_a.partition_D(sA);                                              // (TMA,TMA_M,TMA_K,PIPE)
+
+      Tensor tEgE = block_tma_e.partition_S(gE);                                                 // (TMA,TMA_M,TMA_K,k)
+      Tensor tEsE = block_tma_e.partition_D(sE);                                              // (TMA,TMA_M,TMA_K,PIPE)
+
+      Tensor tBgB = block_tma_b.partition_S(gB);                                                 // (TMA,TMA_N,TMA_K,k)
+      Tensor tBsB = block_tma_b.partition_D(sB);                                              // (TMA,TMA_N,TMA_K,PIPE)
+
+      // Mainloop: one iteration per K tile, each filling one pipeline stage
+      CUTLASS_PRAGMA_NO_UNROLL
+      for ( ; k_tile_count > 0; --k_tile_count)
+      {
+        // LOCK smem_pipe_write for _writing_
+        pipeline.producer_acquire(smem_pipe_write);
+
+        //
+        // Copy gmem to smem for *k_tile_iter
+        //
+
+        using BarrierType = typename MainloopPipeline::ProducerBarrierType;
+        BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write);
+        // The A, E and B copies of this stage are all issued with the same barrier
+        int write_stage = smem_pipe_write.index();
+        copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage));
+        copy(mainloop_params.tma_load_e.with(*tma_barrier, mcast_mask_e), tEgE(_,_,_,*k_tile_iter), tEsE(_,_,_,write_stage));
+        copy(mainloop_params.tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage));
+        ++k_tile_iter;
+
+        // Advance smem_pipe_write
+        ++smem_pipe_write;
+      }
+    }
+  }
+
+  /// Producer epilogue: wait for the pipeline to drain so that
+  /// blocks in a Cluster do not exit early.
+  CUTLASS_DEVICE void
+  load_tail(MainloopPipeline pipeline, PipelineState smem_pipe_write) {
+    bool const issues_waits = cute::elect_one_sync();   // only the elected lane issues the epilogue waits
+
+    if (issues_waits) {
+      /* This helps avoid early exit of blocks in Cluster.
+       * Waits for every stage to be released (all Consumer
+       * UNLOCKs). A stage that was never used is simply
+       * acquired, because its phase is still inverted
+       * from make_producer_start_state.
+       */
+      pipeline.producer_tail(smem_pipe_write);
+    }
+  }
+
+  /// Perform a collective-scoped matrix multiply-accumulate over k_tile_count K tiles into accum
+  /// Consumer Perspective: wait for each filled stage, load its metadata, issue the GMMAs, release the stage
+  template <
+    class FrgTensorC
+  >
+  CUTLASS_DEVICE void
+  mma(MainloopPipeline pipeline,
+      PipelineState smem_pipe_read,
+      FrgTensorC& accum,
+      int k_tile_count,
+      int thread_idx,  // selects this thread's slice of tiled_mma and of the metadata copy
+      TensorStorage& shared_tensors,
+      Params const& mainloop_params) {  // mainloop_params is not referenced in this function
+    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
+    static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3.");
+    static_assert(cute::rank(SmemLayoutE{}) == 3, "Smem layout must be rank 3.");
+    static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3.");
+
+    Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.begin()), SmemLayoutA{});          // (BLK_M,BLK_K,PIPE)
+    Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.begin()), SmemLayoutB{});          // (BLK_N,BLK_K,PIPE)
+
+    Tensor sE_ = make_tensor(make_smem_ptr(shared_tensors.smem_E.begin()), SmemLayoutE{});         // (BLK_M,BLK_K,PIPE)
+    Tensor sE = as_position_independent_swizzle_tensor(sE_);
+
+    //
+    // Define C accumulators and A/B partitioning
+    //
+
+    TiledMma tiled_mma;
+    auto thread_mma = tiled_mma.get_thread_slice(thread_idx);
+
+    Tensor tCsA = thread_mma.partition_A(sA);                                                 // (MMA,MMA_M,MMA_K,PIPE)
+    Tensor tCsB = thread_mma.partition_B(sB);                                                 // (MMA,MMA_N,MMA_K,PIPE)
+
+    // Allocate "fragments/descriptors"
+    Tensor tCrA = thread_mma.make_fragment_A(tCsA);                                           // (MMA,MMA_M,MMA_K,PIPE)
+    Tensor tCrB = thread_mma.make_fragment_B(tCsB);                                           // (MMA,MMA_N,MMA_K,PIPE)
+
+    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum));                                                         // M
+    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum));                                                         // N
+    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB));                                                          // K
+    CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB));                                                       // PIPE
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));                                         // PIPE
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));                                         // PIPE
+    // Metadata E is copied smem -> registers once per stage, using a tiled copy derived from tiled_mma
+    auto copy_atom_E = Copy_Atom<SmemCopyAtomE, uint32_t>{};
+
+    Tensor tCsE = partition_E(thread_mma, sE(_,_,Int<0>{}));            // (MMA,MMA_M,MMA_K); only used to shape tCrE
+    Tensor tCrE = make_fragment_like<ElementEMma>(tCsE);                // (MMA,MMA_M,MMA_K)
+
+    auto smem_tiled_copy_E = make_tiled_copy_E(copy_atom_E, tiled_mma);
+    auto smem_thr_copy_E   = smem_tiled_copy_E.get_thread_slice(thread_idx);
+
+    Tensor tEsE  = smem_thr_copy_E.partition_S(sE);                     // (ECPY,ECPY_M,ECPY_K,PIPE)
+    Tensor tErE  = smem_thr_copy_E.retile_D(tCrE);                      // (ECPY,ECPY_M,ECPY_K)
+
+    //
+    // PIPELINED MAIN LOOP
+    //
+    static_assert((0 <= K_PIPE_MMAS) && (K_PIPE_MMAS <  K_PIPE_MAX),
+        "ERROR : Incorrect number of MMAs in flight");
+
+    // We release buffers to producer warps(dma load) with some mmas in flight
+    PipelineState smem_pipe_release = smem_pipe_read;
+
+    // Prologue GMMAs
+    int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);  // K_PIPE_MMAS is 0 in this collective, so the prologue loop below never runs
+
+    tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;  // used by the first gemm call; set to One after every gemm call below
+
+    warpgroup_fence_operand(accum);
+    CUTLASS_PRAGMA_UNROLL
+    for (int k_tile_prologue = prologue_mma_count; k_tile_prologue > 0; --k_tile_prologue)
+    {
+      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
+      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+      pipeline.consumer_wait(smem_pipe_read, barrier_token);
+      int read_stage = smem_pipe_read.index();
+
+      // Load metadata smem->rmem for one stage
+      copy(smem_tiled_copy_E, tEsE(_,_,_,read_stage), tErE);
+
+      warpgroup_arrive();
+      // Unroll the K mode manually to set scale D to 1
+      CUTLASS_PRAGMA_UNROLL
+      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
+        cute::gemm(tiled_mma, make_zip_tensor(tCrA(_,_,k_block,read_stage), tErE(_,_,k_block)), tCrB(_,_,k_block,read_stage), accum);
+        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+      }
+
+      warpgroup_commit_batch();
+
+      ++smem_pipe_read;
+    }
+
+    warpgroup_fence_operand(accum);
+    // Mainloop GMMAs
+    k_tile_count -= prologue_mma_count;
+
+    CUTLASS_PRAGMA_NO_UNROLL
+    for ( ; k_tile_count > 0; --k_tile_count)
+    {
+      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
+      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+      pipeline.consumer_wait(smem_pipe_read, barrier_token);
+      int read_stage = smem_pipe_read.index();
+
+      // Load metadata smem->rmem for one stage
+      copy(smem_tiled_copy_E, tEsE(_,_,_,read_stage), tErE);
+
+      warpgroup_fence_operand(accum);
+      warpgroup_arrive();
+      // Unroll the K mode manually to set scale D to 1
+      CUTLASS_PRAGMA_UNROLL
+      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
+        cute::gemm(tiled_mma, make_zip_tensor(tCrA(_,_,k_block,read_stage), tErE(_,_,k_block)), tCrB(_,_,k_block,read_stage), accum);
+        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+      }
+      warpgroup_commit_batch();
+
+      /// Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure smem_pipe_write is consumed
+      warpgroup_wait<K_PIPE_MMAS>();
+      warpgroup_fence_operand(accum);
+
+      // UNLOCK smem_pipe_release, done _computing_ on it
+      pipeline.consumer_release(smem_pipe_release);
+
+      // Advance smem_pipe_read and smem_pipe_release
+      ++smem_pipe_read;
+      ++smem_pipe_release;
+    }
+
+    warpgroup_fence_operand(accum);
+  }
+
+  /// Consumer epilogue: drain all outstanding GMMAs and release any stage mma() has not released yet
+  CUTLASS_DEVICE void
+  mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) {
+    // Stages left for the tail to release; K_PIPE_MMAS is 0 in this collective, so this is never positive
+    int const unreleased_stages = min(K_PIPE_MMAS, k_tile_count);
+
+    // Move the release state past the stages already released by the mainloop in mma()
+    smem_pipe_release.advance(k_tile_count - unreleased_stages);
+
+    // Wait on all GMMAs to complete
+    warpgroup_wait<0>();
+
+    for (int stage = unreleased_stages; stage > 0; --stage) {
+      pipeline.consumer_release(smem_pipe_release);                 // UNLOCK smem_pipe_release, done _computing_ on it
+      ++smem_pipe_release;
+    }
+  }
+
+private:
+  /// Tile the (M,K) modes of etensor by the TiledMMA; result modes are ((ThrV,(ThrM,ThrK)),(FrgV,(RestM,RestK))).
+  template <class MMA_Atom,
+            class AtomLayoutMNK,
+            class PermutationMNK,
+            class ETensor>
+  CUTE_HOST_DEVICE static constexpr
+  auto
+  thrfrg_E(TiledMMA<MMA_Atom, AtomLayoutMNK, PermutationMNK> const& mma, ETensor&& etensor)
+  {
+    using TiledMma = TiledMMA<MMA_Atom, AtomLayoutMNK, PermutationMNK>;
+
+    CUTE_STATIC_ASSERT_V(rank(etensor) >= Int<2>{});  // needs at least the (M,K) modes; both callers in this class pass a layout
+
+    // Reorder the tensor for the TiledAtom (uses the M and K entries of PermutationMNK)
+    auto t_tile = make_tile(get<0>(PermutationMNK{}),
+                            get<2>(PermutationMNK{}));
+    auto t_tensor = logical_divide(etensor, t_tile);                 // (PermM,PermK)
+
+    // Tile the tensor for the Atom (M and K extents of AtomShape_MNK)
+    auto e_tile = make_tile(make_layout(size<0>(typename TiledMma::AtomShape_MNK{})),
+                            make_layout(size<2>(typename TiledMma::AtomShape_MNK{})));
+    auto e_tensor = zipped_divide(t_tensor, e_tile);                 // ((AtomM,AtomK),(RestM,RestK))
+
+    // Transform the Atom mode from (M,K) to (Thr,Val)
+    using AtomLayoutE_TV = typename TiledMma::Atom::Traits::ELayout;
+    auto tv_tensor = e_tensor.compose(AtomLayoutE_TV{},_);           // ((ThrV,FrgV),(RestM,RestK))
+
+    // Tile the tensor for the Thread (M and K extents of thr_layout_vmnk_)
+    auto thr_tile = make_tile(_,
+                              make_tile(make_layout(size<1>(mma.thr_layout_vmnk_)),
+                                        make_layout(size<3>(mma.thr_layout_vmnk_))));
+    auto thr_tensor = zipped_divide(tv_tensor, thr_tile);            // ((ThrV,(ThrM,ThrK)),(FrgV,(RestM,RestK)))
+
+    return thr_tensor;
+  }
+  /// Return the (thr_idx,val) -> (M,K) layout of metadata E over one tile of the given TiledMMA.
+  template<class... MArgs>
+  CUTE_HOST_DEVICE static constexpr
+  auto
+  get_layoutE_TV(TiledMMA<MArgs...> const& mma)
+  {
+    // (M,K) -> (M,K)
+    auto ref_E = make_layout(make_shape(tile_size<0>(mma), tile_size<2>(mma)));
+    // (ethrid,val) -> (M,K)
+    auto layoutE_TV = thrfrg_E(mma, ref_E);
+
+    // (ThrV,(ThrM,ThrK)) -> (ThrV,(ThrM,ThrN,ThrK)); ThrN gets stride 0, so it does not change which E elements are addressed
+    auto etile = make_tile(_,
+                            make_tile(make_layout(make_shape (size<1>(mma.thr_layout_vmnk_), size<2>(mma.thr_layout_vmnk_)),
+                                                  make_stride(               Int<1>{} ,                Int<0>{} )),
+                                      _));
+
+    // thr_idx -> (ThrV,ThrM,ThrN,ThrK)
+    auto thridx_2_thrid = right_inverse(mma.thr_layout_vmnk_);
+
+    // (thr_idx,val) -> (M,K)
+    return layoutE_TV.compose(etile, _).compose(thridx_2_thrid, _);
+  }
+  /// Return the slice of etensor that belongs to the thread described by thr_mma, using the thrfrg_E tiling.
+  template <class... MArgs, class ETensor>
+  CUTE_HOST_DEVICE static constexpr
+  auto
+  partition_E(ThrMMA<MArgs...> const& thr_mma, ETensor&& etensor)
+  {
+    auto thr_tensor = make_tensor(static_cast<ETensor&&>(etensor).data(), thrfrg_E(thr_mma, etensor.layout()));
+    // Index the thread mode with this thread's (V,(M,K)) coordinate and keep every value mode
+    auto thr_vmk = make_coord(get<0>(thr_mma.thr_vmnk_), make_coord(get<1>(thr_mma.thr_vmnk_), get<3>(thr_mma.thr_vmnk_)));
+    return thr_tensor(thr_vmk, make_coord(_, repeat<rank<1,1>(thr_tensor)>(_)));
+  }
+  /// Build the tiled copy for metadata E whose thread/value layout is get_layoutE_TV(mma)
+  template <class... CArgs, class... MArgs>
+  CUTE_HOST_DEVICE static constexpr auto
+  make_tiled_copy_E(Copy_Atom<CArgs...> const& copy_atom, TiledMMA<MArgs...> const& mma)
+  {
+    auto layout_tv = get_layoutE_TV(mma);                                  // (thr_idx,val) -> (M,K)
+    auto tiler_mk  = make_shape(tile_size<0>(mma),tile_size<2>(mma));      // (M,K) extents of one MMA tile
+    return make_tiled_copy_impl(copy_atom, layout_tv, tiler_mk);
+  }
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::gemm::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/base_grouped.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/base_grouped.h
new file mode 100755
index 000000000..eec61981f
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/device/base_grouped.h
@@ -0,0 +1,478 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*!
+  \file
+  \brief Base device-level grouped kernel.
+*/
+
+#pragma once
+
+#include <limits>
+#include <numeric>
+#include <vector>
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/device_kernel.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/gemm/kernel/gemm_universal.h"
+
+#include "cutlass/gemm/kernel/default_gemm_universal.h"
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+
+#include "cutlass/trace.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// GEMM Grouped
+template <typename BaseKernel_>
+class BaseGrouped {
+public:
+
+  using BaseKernel = BaseKernel_;
+
+  using ElementA = typename BaseKernel::ElementA;
+  using LayoutA = typename BaseKernel::LayoutA;
+  using TensorRefA = TensorRef<ElementA const, LayoutA>;
+  static ComplexTransform const kTransformA = BaseKernel::kTransformA;
+  static int const kAlignmentA = BaseKernel::kAlignmentA;
+
+  using ElementB = typename BaseKernel::ElementB;
+  using LayoutB = typename BaseKernel::LayoutB;
+  using TensorRefB = TensorRef<ElementB const, LayoutB>;
+  static ComplexTransform const kTransformB = BaseKernel::kTransformB;
+  static int const kAlignmentB = BaseKernel::kAlignmentB;
+
+  using ElementC = typename BaseKernel::ElementC;
+  using LayoutC = typename BaseKernel::LayoutC;
+  using TensorRefC = TensorRef<ElementC const, LayoutC>;
+  using TensorRefD = TensorRef<ElementC, LayoutC>;
+  static int const kAlignmentC = BaseKernel::kAlignmentC;
+
+  using ElementAccumulator = typename BaseKernel::Mma::Policy::Operator::ElementC;
+
+  using EpilogueOutputOp = typename BaseKernel::EpilogueOutputOp;
+  using ThreadblockSwizzle = typename BaseKernel::ThreadblockSwizzle;
+
+  using Operator = typename BaseKernel::Operator;
+  using WarpMmaOperator = typename BaseKernel::Mma::Policy::Operator;
+
+  using ArchMmaOperator = typename WarpMmaOperator::ArchMmaOperator;
+  using MathOperator = typename WarpMmaOperator::MathOperator;
+  using OperatorClass = typename WarpMmaOperator::OperatorClass;
+  using ArchTag = typename WarpMmaOperator::ArchTag;
+  using ThreadblockShape = typename BaseKernel::Mma::Shape;
+  using WarpShape = typename BaseKernel::WarpShape;
+  using InstructionShape = typename BaseKernel::InstructionShape;
+  static int const kStages = BaseKernel::Mma::kStages;
+
+  /// Argument structure
+  using Arguments = typename BaseKernel::Arguments;
+
+  using ProblemInfo = typename BaseKernel::ProblemVisitor::ProblemInfo;
+
+protected:
+
+  /// Kernel parameters object
+  typename BaseKernel::Params params_;
+
+private:
+
+  /// Get the number of tiles across all problems in a group
+  static int32_t group_tile_count(const cutlass::gemm::GemmCoord* problem_sizes_ptr, int problem_count) {
+    int32_t tiles = 0;
+    for (int32_t i = 0; i < problem_count; ++i) {
+      cutlass::gemm::GemmCoord problem = problem_sizes_ptr[i];
+      BaseKernel::ProblemVisitor::possibly_transpose_problem(problem);
+      tiles += problem_tile_count(problem);
+    }
+    return tiles;
+  }
+
+  /// Copy from `data` to `workspace`
+  Status copy_to_workspace(void* workspace, void* data, size_t bytes) {
+    cudaError_t cuda_error = cudaMemcpy(workspace, data, bytes, cudaMemcpyHostToDevice);
+    if (cuda_error != cudaSuccess) {
+      // Call cudaGetLastError() to clear the error bit
+      cuda_error = cudaGetLastError();
+      CUTLASS_TRACE_HOST(
+          "  cudaMemcpy() returned error "
+          << cudaGetErrorString(cuda_error));
+      return Status::kErrorInternal;
+    }
+
+    return Status::kSuccess;
+  }
+
+  /// Precomputes scheduling information for the grouped GEMM in a host buffer, then copies it to `workspace`
+  Status precompute(Arguments const &args, int32_t tile_count, void* workspace) {  // NOTE: `tile_count` is not read below
+    size_t workspace_bytes = get_workspace_size(args);
+    std::vector<uint8_t> host_workspace(workspace_bytes);
+    BaseKernel::ProblemVisitor::host_precompute(args.host_problem_sizes,
+                                                args.problem_count,
+                                                args.threadblock_count,
+                                                (void*)host_workspace.data());
+    return copy_to_workspace(workspace, host_workspace.data(), workspace_bytes);
+  }
+
+  /// Reorder `data` in place according to `indices`: afterwards data[i] holds the old data[indices[i]]
+  template <typename T>
+  static void reorder_array(T* data, const std::vector<size_t>& indices) {
+    // Gather into a temporary first so that no element is overwritten before it has been read.
+    std::vector<T> copy(indices.size());
+    for (size_t i = 0; i < indices.size(); ++i) {
+      copy.at(i) = data[indices[i]];
+    }
+    // Bytewise copy back. NOTE(review): this assumes T is trivially copyable -- confirm for every caller.
+    memcpy(data, copy.data(), indices.size() * sizeof(T));
+  }
+
+public:
+
+  /// Constructs the GEMM.
+  BaseGrouped() { }
+
+  /// Determines whether the GEMM can execute the given problem.
+  static Status can_implement(Arguments const &args) {
+
+    return BaseKernel::can_implement(args);
+  }
+
+  /// Get the number of tiles in a problem
+  static int32_t problem_tile_count(cutlass::gemm::GemmCoord const &problem) {
+    auto grid = BaseKernel::ProblemVisitor::grid_shape(problem);
+    return BaseKernel::ProblemVisitor::tile_count(grid);
+  }
+
+  /// Total number of tiles across all problems in `args`; returns -1 if `args.host_problem_sizes` is null.
+  static int32_t group_tile_count(Arguments const &args) {
+    if (args.host_problem_sizes == nullptr) {
+      CUTLASS_TRACE_HOST("Received nullptr for `args.host_problem_sizes`");
+      return -1;
+    }
+    // Delegate to the pointer/count overload, which sums the per-problem tile counts.
+    return group_tile_count(args.host_problem_sizes, args.problem_count);
+  }
+
+  /// Gets the workspace size
+  static size_t get_workspace_size(Arguments const &args) {
+    if (BaseKernel::ProblemVisitor::kRequiresPrecomputation) {
+      return BaseKernel::ProblemVisitor::get_workspace_size(args.host_problem_sizes,
+                                                            args.problem_count,
+                                                            args.threadblock_count);
+    } else {
+      return 0;
+    }
+  }
+
+  /// Computes the grid shape
+  static dim3 get_grid_shape(Arguments const &args) {
+
+    return dim3(args.threadblock_count, 1, 1);
+  }
+
+  /// Computes the maximum number of active blocks per multiprocessor
+  static int maximum_active_blocks(int smem_capacity = -1) {
+
+    CUTLASS_TRACE_HOST("BaseGrouped::maximum_active_blocks()");
+
+    int smem_size = int(sizeof(typename BaseKernel::SharedStorage));
+
+    CUTLASS_TRACE_HOST("  smem_size: " << smem_size << " bytes");
+
+    cudaError_t result;
+    if (smem_size > (48 << 10)) {
+      result = cudaFuncSetAttribute(Kernel<BaseKernel>,
+                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                    smem_size);
+
+      if (result != cudaSuccess) {
+        // Call cudaGetLastError() to clear the error bit
+        result = cudaGetLastError();
+        CUTLASS_TRACE_HOST(
+          "  cudaFuncSetAttribute() returned error "
+          << cudaGetErrorString(result));
+        return -1;
+      }
+    }
+
+    int max_active_blocks = -1;
+    result = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &max_active_blocks,
+        Kernel<BaseKernel>,
+        BaseKernel::kThreadCount,
+        smem_size);
+
+    if (result != cudaSuccess) {
+      // Call cudaGetLastError() to clear the error bit
+      result = cudaGetLastError();
+      CUTLASS_TRACE_HOST(
+        "  cudaOccupancyMaxActiveBlocksPerMultiprocessor() returned error "
+        << cudaGetErrorString(result));
+      return -1;
+    }
+
+    CUTLASS_TRACE_HOST("  max_active_blocks: " << max_active_blocks);
+    return max_active_blocks;
+  }
+
+  /// Sorts each pointer passed in according to the indices that sort
+  /// `problem_sizes_ptr` in descending order of problem-K dimension.
+  static void sort_problems(int problem_count,
+                            cutlass::gemm::GemmCoord* problem_sizes_ptr,
+                            int64_t* lda_host_ptr,
+                            int64_t* ldb_host_ptr,
+                            int64_t* ldc_host_ptr,
+                            int64_t* ldd_host_ptr,
+                            int64_t* offset_A_ptr,
+                            int64_t* offset_B_ptr,
+                            int64_t* offset_C_ptr,
+                            int64_t* offset_D_ptr)
+  {
+    std::vector<size_t> indices(problem_count);
+    std::iota(indices.begin(), indices.end(), 0);
+    std::stable_sort(indices.begin(), indices.end(),
+      [&problem_sizes_ptr](size_t i, size_t j) {
+        return problem_sizes_ptr[i].k() > problem_sizes_ptr[j].k();
+      });
+
+    reorder_array(problem_sizes_ptr, indices);
+    reorder_array(lda_host_ptr, indices);
+    reorder_array(ldb_host_ptr, indices);
+    reorder_array(ldc_host_ptr, indices);
+    reorder_array(ldd_host_ptr, indices);
+    reorder_array(offset_A_ptr, indices);
+    reorder_array(offset_B_ptr, indices);
+    reorder_array(offset_C_ptr, indices);
+    reorder_array(offset_D_ptr, indices);
+  }
+
+  /// Computes the number of threadblocks to launch for the grouped kernel
+  static int sufficient(const cutlass::gemm::GemmCoord* problem_sizes_ptr=nullptr,
+                        int problem_count=0,
+                        int available_sm_count=-1) {
+    // Determine the number of blocks that would be launched to fill up a single
+    // wave on the GPU with each SM having maximum occupancy.
+    int device_idx;
+    cudaError_t result = cudaGetDevice(&device_idx);
+    if (result != cudaSuccess) {
+      // Call cudaGetLastError() to clear the error bit
+      result = cudaGetLastError();
+      CUTLASS_TRACE_HOST("  cudaGetDevice() returned error "
+          << cudaGetErrorString(result));
+      return 0;
+    }
+
+    int multiprocessor_count;
+    result = cudaDeviceGetAttribute(&multiprocessor_count,
+      cudaDevAttrMultiProcessorCount, device_idx);
+    if (result != cudaSuccess) {
+      CUTLASS_TRACE_HOST(
+        "  cudaDeviceGetAttribute() returned error "
+        << cudaGetErrorString(result));
+      return 0;
+    }
+
+    bool override_sm_count = (available_sm_count < 0 || available_sm_count > multiprocessor_count);
+    if (override_sm_count) {
+      available_sm_count = multiprocessor_count;
+    }
+
+    int max_active_blocks = maximum_active_blocks();
+    if (max_active_blocks <= 0) {
+      return 0;
+    }
+
+    int occupancy_based_block_count = available_sm_count * max_active_blocks;
+
+    if (problem_sizes_ptr == nullptr || problem_count == 0) {
+      return occupancy_based_block_count;
+    }
+
+    int total_tiles = group_tile_count(problem_sizes_ptr, problem_count);
+
+    // If the group contains a single problem, launching the exact number of
+    // threadblocks needed to cover the problem minimizes the work performed
+    // per threadblock in finding the next tile to compute. We return total_tiles
+    // unless the user has provided the SM count.
+    if (problem_count == 1 && override_sm_count) {
+      return total_tiles;
+    }
+
+    // Choose between the full wave of threadblocks and the tile count. If there
+    // are fewer tiles in the group than threadblocks in the full wave, only
+    // some threadblocks will be assigned tiles. Those threadblocks
+    // which are not assigned tiles still need to perform the work of iterating through
+    // problem sizes to determine that they have no work to do. This competes for cycles
+    // with those threadblocks that are assigned tiles to compute.
+    return std::min(total_tiles, occupancy_based_block_count);
+  }
+
+
+  /// Initializes GEMM state from arguments.
+  /// Returns kErrorInvalidProblem if precomputation needs host problem sizes that are null for a non-empty group.
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+    CUTLASS_TRACE_HOST("BaseGrouped::initialize() - workspace "
+      << workspace << ", stream: " << (stream ? "non-null" : "null"));
+
+    // group_tile_count() returns -1 for a null host problem-size array; stop before any helper reads it.
+    int32_t tile_count = 0;
+    if (BaseKernel::ProblemVisitor::kRequiresPrecomputation) {
+      tile_count = group_tile_count(args);
+      if (tile_count < 0 && args.problem_count > 0) { return Status::kErrorInvalidProblem; }
+    }
+
+    size_t workspace_bytes = get_workspace_size(args);
+    if (workspace_bytes && !workspace) {
+      return Status::kErrorWorkspaceNull;
+    }
+
+    if (BaseKernel::ProblemVisitor::kRequiresPrecomputation) {
+      Status status = precompute(args, tile_count, workspace);
+      if (status != Status::kSuccess) {
+        return status;
+      }
+      params_ = typename BaseKernel::Params(args, workspace, tile_count);
+    } else {
+      params_ = typename BaseKernel::Params(args, workspace);
+    }
+
+    // Specify shared memory capacity for kernel.
+    int smem_size = int(sizeof(typename BaseKernel::SharedStorage));
+    if (smem_size >= (48 << 10)) {
+      cudaError_t result = cudaFuncSetAttribute(Kernel<BaseKernel>,
+                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                    smem_size);
+      if (result != cudaSuccess) {
+        return Status::kErrorInternal;
+      }
+    }
+    return Status::kSuccess;
+  }
+
+  /// Lightweight update given a subset of arguments; kErrorInvalidProblem if required host problem sizes are null.
+  Status update(Arguments const &args, void *workspace = nullptr) {
+    int32_t tile_count = 0;
+    if (BaseKernel::ProblemVisitor::kRequiresPrecomputation) {
+      tile_count = group_tile_count(args);  // -1 when args.host_problem_sizes is null
+      if (tile_count < 0 && args.problem_count > 0) { return Status::kErrorInvalidProblem; }
+    }
+
+    size_t workspace_bytes = get_workspace_size(args);
+    if (workspace_bytes && !workspace) {
+      return Status::kErrorWorkspaceNull;
+    }
+    if (BaseKernel::ProblemVisitor::kRequiresPrecomputation) {
+      Status status = precompute(args, tile_count, workspace);
+      if (status != Status::kSuccess) {
+        return status;
+      }
+      params_.update(args, workspace, tile_count);
+    } else {
+      params_.update(args, workspace);
+    }
+    return Status::kSuccess;
+  }
+
+  /// Runs the kernel using initialized state.
+  Status run(cudaStream_t stream = nullptr) {
+
+    //
+    // Configure grid and block dimensions
+    //
+
+    if (!params_.problem_visitor.problem_count) {
+      return Status::kSuccess;
+    }
+
+    dim3 grid(params_.threadblock_count, 1, 1);
+    dim3 block(BaseKernel::kThreadCount, 1, 1);
+
+    int smem_size = int(sizeof(typename BaseKernel::SharedStorage));
+
+    //
+    // Launch kernel
+    //
+
+    // Launch
+    cutlass::arch::synclog_setup();
+    cutlass::Kernel<BaseKernel><<<grid, block, smem_size, stream>>>(params_);
+
+    //
+    // Query for errors
+    //
+    cudaError_t result = cudaGetLastError();
+
+    if (result != cudaSuccess) {
+      CUTLASS_TRACE_HOST("  grid launch failed with error " << cudaGetErrorString(result));
+      return Status::kErrorInternal;
+    }
+
+    return Status::kSuccess;
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Initializes and runs the kernel.
+  Status operator()(
+    Arguments const &args,
+    void *workspace,
+    cudaStream_t stream = nullptr) {
+
+    Status status = initialize(args, workspace, stream);
+
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+
+    return status;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/default_gemm_configuration.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/default_gemm_configuration.h
new file mode 100755
index 000000000..e7ed2da94
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/device/default_gemm_configuration.h
@@ -0,0 +1,955 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Definitions for GEMM structures
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/arch/mma.h"
+#include "cutlass/arch/wmma.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/epilogue/thread/linear_combination_clamp.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace device {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename OperatorClass,
+  typename ArchTag,
+  typename ElementA, 
+  typename ElementB, 
+  typename ElementC,
+  typename ElementAccumulator
+>
+struct DefaultGemmConfiguration;
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename ArchTag,
+  typename ElementA, 
+  typename ElementB, 
+  typename ElementC, 
+  typename ElementAccumulator>
+struct DefaultGemmConfiguration<
+  arch::OpClassSimt, 
+  ArchTag,
+  ElementA, 
+  ElementB, 
+  ElementC, 
+  ElementAccumulator> {
+  
+  static int const kAlignmentA = 1;
+  static int const kAlignmentB = 1;
+  using ThreadblockShape = GemmShape<128, 128, 8>;
+  using WarpShape = GemmShape<32, 64, 8>;
+  using InstructionShape = GemmShape<1, 1, 1>;
+  static int const kStages = 2;
+
+  using EpilogueOutputOp = epilogue::thread::LinearCombination<
+    ElementC,
+    1,
+    ElementAccumulator,
+    ElementAccumulator
+  >;
+
+  using Operator = arch::OpMultiplyAdd;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+template < 
+  typename ArchTag,
+  typename ElementC>
+struct DefaultGemmConfiguration<arch::OpClassSimt, ArchTag, int8_t, int8_t, ElementC, int32_t> {
+  
+  static int const kAlignmentA = 4;
+  static int const kAlignmentB = 4;
+  using ThreadblockShape = GemmShape<128, 128, 32>;
+  using WarpShape = GemmShape<32, 64, 32>;
+  using InstructionShape = GemmShape<1, 1, 4>;
+  static int const kStages = 2;
+
+  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
+    ElementC,
+    1,
+    int32_t,
+    float
+  >;
+
+  using Operator = arch::OpMultiplyAdd;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename ArchTag,
+  typename ElementA, 
+  typename ElementB, 
+  typename ElementC, 
+  typename ElementAccumulator>
+struct DefaultGemmConfiguration<
+  arch::OpClassWmmaTensorOp, 
+  ArchTag,
+  ElementA, 
+  ElementB, 
+  ElementC, 
+  ElementAccumulator> {
+  
+  static int const kAlignmentA = 128 / sizeof_bits<ElementA>::value;
+  static int const kAlignmentB = 128 / sizeof_bits<ElementB>::value;
+
+  static int const kStages = 2;
+  
+  using EpilogueOutputOp = epilogue::thread::LinearCombination<
+    ElementC,
+    128 / sizeof_bits<ElementC>::value,
+    ElementAccumulator,
+    ElementAccumulator
+  >;
+
+  using Operator = arch::OpMultiplyAdd;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename ElementA, 
+  typename ElementB, 
+  typename ElementC, 
+  typename ElementAccumulator>
+struct DefaultGemmConfiguration<
+  arch::OpClassTensorOp, 
+  arch::Sm70,
+  ElementA, 
+  ElementB, 
+  ElementC, 
+  ElementAccumulator> {
+  
+  static int const kAlignmentA = 128 / sizeof_bits<ElementA>::value;
+  static int const kAlignmentB = 128 / sizeof_bits<ElementB>::value;
+
+  using ThreadblockShape = GemmShape<128, 256, 32>;
+  using WarpShape = GemmShape<64, 64, 32>;
+  using InstructionShape = GemmShape<8, 8, 4>;
+  static int const kStages = 2;
+  
+  using EpilogueOutputOp = epilogue::thread::LinearCombination<
+    ElementC,
+    128 / sizeof_bits<ElementC>::value,
+    ElementAccumulator,
+    ElementAccumulator
+  >;
+
+  using Operator = arch::OpMultiplyAdd;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename ElementA,
+  typename ElementB,
+  typename ElementC,
+  typename ElementAccumulator>
+struct DefaultGemmConfiguration<
+  arch::OpClassTensorOp,
+  arch::Sm75,
+  ElementA,
+  ElementB,
+  ElementC,
+  ElementAccumulator> {
+  // Operand alignments in elements: 128 bits' worth of each operand's own element type.
+  static int const kAlignmentA = 128 / sizeof_bits<ElementA>::value;
+  static int const kAlignmentB = 128 / sizeof_bits<ElementB>::value;
+  using ThreadblockShape = GemmShape<128, 256, 32>;
+  using WarpShape = GemmShape<64, 64, 32>;
+  using InstructionShape = GemmShape<16, 8, 8>;
+  static int const kStages = 2;
+
+  using EpilogueOutputOp = epilogue::thread::LinearCombination<
+    ElementC,
+    128 / sizeof_bits<ElementC>::value,
+    ElementAccumulator,
+    ElementAccumulator
+  >;
+  // 8-bit and 4-bit integer A operands select the saturating multiply-add; all other types use plain multiply-add.
+  using Operator = typename platform::conditional<
+      (platform::is_same<ElementA, int8_t>::value ||
+       platform::is_same<ElementA, int4b_t>::value ||
+       platform::is_same<ElementA, uint8_t>::value ||
+       platform::is_same<ElementA, uint4b_t>::value),
+      arch::OpMultiplyAddSaturate, arch::OpMultiplyAdd>::type;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+template < 
+  typename ElementC>
+struct DefaultGemmConfiguration<
+  arch::OpClassTensorOp, 
+  arch::Sm75, 
+  int8_t, 
+  int8_t, 
+  ElementC, 
+  int32_t> {
+  
+  static int const kAlignmentA = 128 / sizeof_bits<int8_t>::value;
+  static int const kAlignmentB = 128 / sizeof_bits<int8_t>::value;
+
+  using ThreadblockShape = GemmShape<128, 256, 64>;
+  using WarpShape = GemmShape<64, 64, 64>;
+  using InstructionShape = GemmShape<8, 8, 16>;
+  static int const kStages = 2;
+
+  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
+      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
+
+  using Operator = arch::OpMultiplyAddSaturate;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+template < 
+  typename ElementC>
+struct DefaultGemmConfiguration<
+  arch::OpClassTensorOp, 
+  arch::Sm75, 
+  int8_t, 
+  uint8_t, 
+  ElementC, 
+  int32_t> {
+  
+  static int const kAlignmentA = 128 / sizeof_bits<int8_t>::value;
+  static int const kAlignmentB = 128 / sizeof_bits<uint8_t>::value;
+ 
+  using ThreadblockShape = GemmShape<128, 256, 64>;
+  using WarpShape = GemmShape<64, 64, 64>;
+  using InstructionShape = GemmShape<8, 8, 16>;
+  static int const kStages = 2;
+
+  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
+      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
+
+  using Operator = arch::OpMultiplyAddSaturate;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+template < 
+  typename ElementC>
+struct DefaultGemmConfiguration<
+  arch::OpClassTensorOp, 
+  arch::Sm75, 
+  uint8_t, 
+  int8_t, 
+  ElementC, 
+  int32_t> {
+  
+  static int const kAlignmentA = 128 / sizeof_bits<uint8_t>::value;
+  static int const kAlignmentB = 128 / sizeof_bits<int8_t>::value;
+ 
+  using ThreadblockShape = GemmShape<128, 256, 64>;
+  using WarpShape = GemmShape<64, 64, 64>;
+  using InstructionShape = GemmShape<8, 8, 16>;
+  static int const kStages = 2;
+
+  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
+      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
+
+  using Operator = arch::OpMultiplyAddSaturate;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+template < 
+  typename ElementC>
+struct DefaultGemmConfiguration<
+  arch::OpClassTensorOp, 
+  arch::Sm75, 
+  uint8_t, 
+  uint8_t, 
+  ElementC, 
+  int32_t> {
+  
+  static int const kAlignmentA = 128 / sizeof_bits<uint8_t>::value;
+  static int const kAlignmentB = 128 / sizeof_bits<uint8_t>::value;
+ 
+  using ThreadblockShape = GemmShape<128, 256, 64>;
+  using WarpShape = GemmShape<64, 64, 64>;
+  using InstructionShape = GemmShape<8, 8, 16>;
+  static int const kStages = 2;
+
+  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
+      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
+
+  using Operator = arch::OpMultiplyAddSaturate;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+template < 
+  typename ElementC>
+struct DefaultGemmConfiguration<
+  arch::OpClassTensorOp, 
+  arch::Sm75, 
+  int4b_t, 
+  int4b_t, 
+  ElementC, 
+  int32_t> {
+   
+  static int const kAlignmentA = 128 / sizeof_bits<int4b_t>::value;
+  static int const kAlignmentB = 128 / sizeof_bits<int4b_t>::value;
+ 
+  using ThreadblockShape = GemmShape<128, 256, 128>;
+  using WarpShape = GemmShape<64, 64, 128>;
+  using InstructionShape = GemmShape<8, 8, 32>;
+  static int const kStages = 2;
+
+  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
+      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
+
+  using Operator = arch::OpMultiplyAddSaturate;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+template < 
+  typename ElementC>
+struct DefaultGemmConfiguration<
+  arch::OpClassTensorOp, 
+  arch::Sm75, 
+  int4b_t, 
+  uint4b_t, 
+  ElementC, 
+  int32_t> {
+    
+  static int const kAlignmentA = 128 / sizeof_bits<int4b_t>::value;
+  static int const kAlignmentB = 128 / sizeof_bits<uint4b_t>::value;
+ 
+  using ThreadblockShape = GemmShape<128, 256, 128>;
+  using WarpShape = GemmShape<64, 64, 128>;
+  using InstructionShape = GemmShape<8, 8, 32>;
+  static int const kStages = 2;
+
+  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
+      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
+
+  using Operator = arch::OpMultiplyAddSaturate;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+template < 
+  typename ElementC>
+struct DefaultGemmConfiguration<
+  arch::OpClassTensorOp, 
+  arch::Sm75, 
+  uint4b_t, 
+  int4b_t, 
+  ElementC, 
+  int32_t> {
+  
+  static int const kAlignmentA = 128 / sizeof_bits<uint4b_t>::value;
+  static int const kAlignmentB = 128 / sizeof_bits<int4b_t>::value;
+
+  using ThreadblockShape = GemmShape<128, 256, 128>;
+  using WarpShape = GemmShape<64, 64, 128>;
+  using InstructionShape = GemmShape<8, 8, 32>;
+  static int const kStages = 2;
+
+  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
+      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
+
+  using Operator = arch::OpMultiplyAddSaturate;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+template < 
+  typename ElementC>
+struct DefaultGemmConfiguration<
+  arch::OpClassTensorOp, 
+  arch::Sm75, 
+  uint4b_t, 
+  uint4b_t, 
+  ElementC, 
+  int32_t> {
+   
+  static int const kAlignmentA = 128 / sizeof_bits<uint4b_t>::value;
+  static int const kAlignmentB = 128 / sizeof_bits<uint4b_t>::value;
+ 
+  using ThreadblockShape = GemmShape<128, 256, 128>;
+  using WarpShape = GemmShape<64, 64, 128>;
+  using InstructionShape = GemmShape<8, 8, 32>;
+  static int const kStages = 2;
+
+  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
+      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
+
+  using Operator = arch::OpMultiplyAddSaturate;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+template < 
+  typename ElementC>
+struct DefaultGemmConfiguration<
+  arch::OpClassTensorOp, 
+  arch::Sm75, 
+  uint1b_t, 
+  uint1b_t, 
+  ElementC, 
+  int32_t> {
+    
+  static int const kAlignmentA = 128 / sizeof_bits<uint1b_t>::value;
+  static int const kAlignmentB = 128 / sizeof_bits<uint1b_t>::value;
+ 
+  using ThreadblockShape = GemmShape<128, 256, 512>;
+  using WarpShape = GemmShape<64, 64, 512>;
+  using InstructionShape = GemmShape<8, 8, 128>;
+  static int const kStages = 2;
+
+  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
+      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
+
+  using Operator = arch::OpXorPopc;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <typename ElementA, typename ElementB, typename ElementC,
+          typename ElementAccumulator>
+struct DefaultGemmConfiguration<arch::OpClassTensorOp, arch::Sm80, ElementA,
+                                ElementB, ElementC, ElementAccumulator> {
+  // Operand alignments in elements: 128 bits' worth of each operand's own element type.
+  static int const kAlignmentA = 128 / sizeof_bits<ElementA>::value;
+  static int const kAlignmentB = 128 / sizeof_bits<ElementB>::value;
+
+  using ThreadblockShape = GemmShape<128, 256, 64>;
+  using WarpShape = GemmShape<64, 64, 64>;
+  using InstructionShape = GemmShape<16, 8, 16>;
+  static int const kStages = 3;
+
+  using EpilogueOutputOp = epilogue::thread::LinearCombination<
+      ElementC, 128 / sizeof_bits<ElementC>::value, ElementAccumulator,
+      ElementAccumulator>;
+  // 8-bit and 4-bit integer A operands select the saturating multiply-add; all other types use plain multiply-add.
+  using Operator = typename platform::conditional<
+      (platform::is_same<ElementA, int8_t>::value ||
+       platform::is_same<ElementA, int4b_t>::value ||
+       platform::is_same<ElementA, uint8_t>::value ||
+       platform::is_same<ElementA, uint4b_t>::value),
+      arch::OpMultiplyAddSaturate, arch::OpMultiplyAdd>::type;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+/// Default tensor-op GEMM configuration for Sm80 with double A and B operands.
+template <typename ElementC, typename ElementAccumulator>
+struct DefaultGemmConfiguration<arch::OpClassTensorOp, arch::Sm80, double,
+                                double, ElementC, ElementAccumulator> {
+  // Tile shapes: threadblock, warp, then instruction.
+  using ThreadblockShape = GemmShape<128, 128, 16>;
+  using WarpShape = GemmShape<32, 64, 16>;
+  using InstructionShape = GemmShape<8, 8, 4>;
+  static const int kStages = 3;
+
+  // Both operands use an alignment of a single element.
+  static const int kAlignmentA = 1;
+  static const int kAlignmentB = 1;
+
+  using EpilogueOutputOp = epilogue::thread::LinearCombination<
+      ElementC, 1, ElementAccumulator, ElementAccumulator>;
+
+  using Operator = arch::OpMultiplyAdd;
+};
+
+
+/// Full specialization: Sm80 tensor-op defaults used when A, B, C and the
+/// accumulator are all complex<double>.
+template <>
+struct DefaultGemmConfiguration<arch::OpClassTensorOp, arch::Sm80,
+                                complex<double>, complex<double>,
+                                complex<double>, complex<double>> {
+  // Tile shapes: threadblock, warp, then instruction.
+  using ThreadblockShape = GemmShape<64, 64, 16>;
+  using WarpShape = GemmShape<32, 32, 16>;
+  using InstructionShape = GemmShape<8, 8, 4>;
+
+  // Default stage count for this configuration.
+  static const int kStages = 3;
+
+  // Both operands use an alignment of a single element.
+  static const int kAlignmentA = 1;
+  static const int kAlignmentB = 1;
+
+  using EpilogueOutputOp = epilogue::thread::LinearCombination<
+      complex<double>, 1, complex<double>, complex<double>>;
+
+  // Complex-valued operator tag.
+  using Operator = arch::OpMultiplyAddComplex;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Default tensor-op GEMM configuration for Sm80 with int8_t A and B operands
+/// and int32_t accumulation; ElementC is left as a free parameter.
+template <typename ElementC>
+struct DefaultGemmConfiguration<arch::OpClassTensorOp, arch::Sm80, int8_t,
+                                int8_t, ElementC, int32_t> {
+  // Tile shapes: threadblock, warp, then instruction.
+  using ThreadblockShape = GemmShape<128, 256, 64>;
+  using WarpShape = GemmShape<64, 64, 64>;
+  using InstructionShape = GemmShape<16, 8, 32>;
+
+  // Default stage count for this configuration.
+  static const int kStages = 3;
+
+  // Alignment in elements: as many elements as fit in 128 bits.
+  static const int kAlignmentA = 128 / sizeof_bits<int8_t>::value;
+  static const int kAlignmentB = 128 / sizeof_bits<int8_t>::value;
+
+  // Epilogue sized to 128 bits' worth of ElementC values.
+  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
+      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
+
+  using Operator = arch::OpMultiplyAddSaturate;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Default tensor-op GEMM configuration for Sm80 with an int8_t A operand, a
+/// uint8_t B operand and int32_t accumulation; ElementC is a free parameter.
+template <typename ElementC>
+struct DefaultGemmConfiguration<arch::OpClassTensorOp, arch::Sm80, int8_t,
+                                uint8_t, ElementC, int32_t> {
+  // Tile shapes: threadblock, warp, then instruction.
+  using ThreadblockShape = GemmShape<128, 256, 64>;
+  using WarpShape = GemmShape<64, 64, 64>;
+  using InstructionShape = GemmShape<16, 8, 32>;
+
+  // Default stage count for this configuration.
+  static const int kStages = 3;
+
+  // Alignment in elements: as many elements as fit in 128 bits.
+  static const int kAlignmentA = 128 / sizeof_bits<int8_t>::value;
+  static const int kAlignmentB = 128 / sizeof_bits<uint8_t>::value;
+
+  // Epilogue sized to 128 bits' worth of ElementC values.
+  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
+      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
+
+  using Operator = arch::OpMultiplyAddSaturate;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Default tensor-op GEMM configuration for Sm80 with a uint8_t A operand, an
+/// int8_t B operand and int32_t accumulation; ElementC is a free parameter.
+template <typename ElementC>
+struct DefaultGemmConfiguration<arch::OpClassTensorOp, arch::Sm80, uint8_t,
+                                int8_t, ElementC, int32_t> {
+  // Tile shapes: threadblock, warp, then instruction.
+  using ThreadblockShape = GemmShape<128, 256, 64>;
+  using WarpShape = GemmShape<64, 64, 64>;
+  using InstructionShape = GemmShape<16, 8, 32>;
+
+  // Default stage count for this configuration.
+  static const int kStages = 3;
+
+  // Alignment in elements: as many elements as fit in 128 bits.
+  static const int kAlignmentA = 128 / sizeof_bits<uint8_t>::value;
+  static const int kAlignmentB = 128 / sizeof_bits<int8_t>::value;
+
+  // Epilogue sized to 128 bits' worth of ElementC values.
+  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
+      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
+
+  using Operator = arch::OpMultiplyAddSaturate;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Default tensor-op GEMM configuration for Sm80 with uint8_t A and B operands
+/// and int32_t accumulation; ElementC is left as a free parameter.
+template <typename ElementC>
+struct DefaultGemmConfiguration<arch::OpClassTensorOp, arch::Sm80, uint8_t,
+                                uint8_t, ElementC, int32_t> {
+  // Tile shapes: threadblock, warp, then instruction.
+  using ThreadblockShape = GemmShape<128, 256, 64>;
+  using WarpShape = GemmShape<64, 64, 64>;
+  using InstructionShape = GemmShape<16, 8, 32>;
+
+  // Default stage count for this configuration.
+  static const int kStages = 3;
+
+  // Alignment in elements: as many elements as fit in 128 bits.
+  static const int kAlignmentA = 128 / sizeof_bits<uint8_t>::value;
+  static const int kAlignmentB = 128 / sizeof_bits<uint8_t>::value;
+
+  // Epilogue sized to 128 bits' worth of ElementC values.
+  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
+      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
+
+  using Operator = arch::OpMultiplyAddSaturate;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Default tensor-op GEMM configuration for Sm80 with int4b_t A and B operands
+/// and int32_t accumulation; ElementC is left as a free parameter.
+template <typename ElementC>
+struct DefaultGemmConfiguration<arch::OpClassTensorOp, arch::Sm80, int4b_t,
+                                int4b_t, ElementC, int32_t> {
+  // Tile shapes: threadblock, warp, then instruction.
+  using ThreadblockShape = GemmShape<128, 256, 128>;
+  using WarpShape = GemmShape<64, 64, 128>;
+  using InstructionShape = GemmShape<16, 8, 64>;
+
+  // Default stage count for this configuration.
+  static const int kStages = 3;
+
+  // Alignment in elements: as many elements as fit in 128 bits.
+  static const int kAlignmentA = 128 / sizeof_bits<int4b_t>::value;
+  static const int kAlignmentB = 128 / sizeof_bits<int4b_t>::value;
+
+  // Epilogue sized to 128 bits' worth of ElementC values.
+  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
+      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
+
+  using Operator = arch::OpMultiplyAddSaturate;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Default tensor-op GEMM configuration for Sm80 with an int4b_t A operand, a
+/// uint4b_t B operand and int32_t accumulation; ElementC is a free parameter.
+template <typename ElementC>
+struct DefaultGemmConfiguration<arch::OpClassTensorOp, arch::Sm80, int4b_t,
+                                uint4b_t, ElementC, int32_t> {
+  // Tile shapes: threadblock, warp, then instruction.
+  using ThreadblockShape = GemmShape<128, 256, 128>;
+  using WarpShape = GemmShape<64, 64, 128>;
+  using InstructionShape = GemmShape<16, 8, 64>;
+
+  // Default stage count for this configuration.
+  static const int kStages = 3;
+
+  // Alignment in elements: as many elements as fit in 128 bits.
+  static const int kAlignmentA = 128 / sizeof_bits<int4b_t>::value;
+  static const int kAlignmentB = 128 / sizeof_bits<uint4b_t>::value;
+
+  // Epilogue sized to 128 bits' worth of ElementC values.
+  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
+      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
+
+  using Operator = arch::OpMultiplyAddSaturate;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Default tensor-op GEMM configuration for Sm80 with a uint4b_t A operand, an
+/// int4b_t B operand and int32_t accumulation; ElementC is a free parameter.
+template <typename ElementC>
+struct DefaultGemmConfiguration<arch::OpClassTensorOp, arch::Sm80, uint4b_t,
+                                int4b_t, ElementC, int32_t> {
+  // Tile shapes: threadblock, warp, then instruction.
+  using ThreadblockShape = GemmShape<128, 256, 128>;
+  using WarpShape = GemmShape<64, 64, 128>;
+  using InstructionShape = GemmShape<16, 8, 64>;
+
+  // Default stage count for this configuration.
+  static const int kStages = 3;
+
+  // Alignment in elements: as many elements as fit in 128 bits.
+  static const int kAlignmentA = 128 / sizeof_bits<uint4b_t>::value;
+  static const int kAlignmentB = 128 / sizeof_bits<int4b_t>::value;
+
+  // Epilogue sized to 128 bits' worth of ElementC values.
+  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
+      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
+
+  using Operator = arch::OpMultiplyAddSaturate;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Default tensor-op GEMM configuration for Sm80 with uint4b_t A and B operands
+/// and int32_t accumulation; ElementC is left as a free parameter.
+template <typename ElementC>
+struct DefaultGemmConfiguration<arch::OpClassTensorOp, arch::Sm80, uint4b_t,
+                                uint4b_t, ElementC, int32_t> {
+  // Tile shapes: threadblock, warp, then instruction.
+  using ThreadblockShape = GemmShape<128, 256, 128>;
+  using WarpShape = GemmShape<64, 64, 128>;
+  using InstructionShape = GemmShape<16, 8, 64>;
+
+  // Default stage count for this configuration.
+  static const int kStages = 3;
+
+  // Alignment in elements: as many elements as fit in 128 bits.
+  static const int kAlignmentA = 128 / sizeof_bits<uint4b_t>::value;
+  static const int kAlignmentB = 128 / sizeof_bits<uint4b_t>::value;
+
+  // Epilogue sized to 128 bits' worth of ElementC values.
+  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
+      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
+
+  using Operator = arch::OpMultiplyAddSaturate;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Default tensor-op GEMM configuration for Sm80 with uint1b_t A and B operands
+/// and int32_t accumulation; ElementC is left as a free parameter.
+template <typename ElementC>
+struct DefaultGemmConfiguration<arch::OpClassTensorOp, arch::Sm80, uint1b_t,
+                                uint1b_t, ElementC, int32_t> {
+  // Tile shapes: threadblock, warp, then instruction.
+  using ThreadblockShape = GemmShape<128, 256, 512>;
+  using WarpShape = GemmShape<64, 64, 512>;
+  using InstructionShape = GemmShape<16, 8, 256>;
+
+  // Default stage count for this configuration.
+  static const int kStages = 3;
+
+  // Alignment in elements: as many elements as fit in 128 bits.
+  static const int kAlignmentA = 128 / sizeof_bits<uint1b_t>::value;
+  static const int kAlignmentB = 128 / sizeof_bits<uint1b_t>::value;
+
+  // Epilogue sized to 128 bits' worth of ElementC values.
+  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
+      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
+
+  using Operator = arch::OpMultiplyAdd;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Default tensor-op GEMM configuration for Sm80 with an int4b_t A operand, an
+/// int8_t B operand and int32_t accumulation; ElementC is a free parameter.
+template <typename ElementC>
+struct DefaultGemmConfiguration<arch::OpClassTensorOp, arch::Sm80, int4b_t,
+                                int8_t, ElementC, int32_t> {
+  // Tile shapes: threadblock, warp, then instruction.
+  using ThreadblockShape = GemmShape<128, 256, 64>;
+  using WarpShape = GemmShape<64, 64, 64>;
+  using InstructionShape = GemmShape<16, 8, 32>;
+
+  // Default stage count for this configuration.
+  static const int kStages = 3;
+
+  // Alignment in elements: as many elements as fit in 128 bits.
+  static const int kAlignmentA = 128 / sizeof_bits<int4b_t>::value;
+  static const int kAlignmentB = 128 / sizeof_bits<int8_t>::value;
+
+  // Epilogue sized to 128 bits' worth of ElementC values.
+  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
+      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
+
+  using Operator = arch::OpMultiplyAddSaturate;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Default tensor-op GEMM configuration for Sm80 with an int8_t A operand, an
+/// int4b_t B operand and int32_t accumulation; ElementC is a free parameter.
+template <typename ElementC>
+struct DefaultGemmConfiguration<arch::OpClassTensorOp, arch::Sm80, int8_t,
+                                int4b_t, ElementC, int32_t> {
+  // Tile shapes: threadblock, warp, then instruction.
+  using ThreadblockShape = GemmShape<128, 256, 64>;
+  using WarpShape = GemmShape<64, 64, 64>;
+  using InstructionShape = GemmShape<16, 8, 32>;
+
+  // Default stage count for this configuration.
+  static const int kStages = 3;
+
+  // Alignment in elements: as many elements as fit in 128 bits.
+  static const int kAlignmentA = 128 / sizeof_bits<int8_t>::value;
+  static const int kAlignmentB = 128 / sizeof_bits<int4b_t>::value;
+
+  // Epilogue sized to 128 bits' worth of ElementC values.
+  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
+      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
+
+  using Operator = arch::OpMultiplyAddSaturate;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Shared defaults for all {float_e4m3_t, float_e5m2_t} x {float_e4m3_t, float_e5m2_t}
+/// operand pairings on SM89.
+template <typename ElementA, typename ElementB, typename ElementC,
+          typename ElementAccumulator>
+struct DefaultGemmConfigurationSm89F8 {
+  static_assert((platform::is_same<ElementA, cutlass::float_e4m3_t>::value ||
+                 platform::is_same<ElementA, cutlass::float_e5m2_t>::value),
+                "ElementA must be of type float_e4m3_t or float_e5m2_t");
+  static_assert((platform::is_same<ElementB, cutlass::float_e4m3_t>::value ||
+                 platform::is_same<ElementB, cutlass::float_e5m2_t>::value),
+                "ElementB must be of type float_e4m3_t or float_e5m2_t");
+
+  // Tile shapes: threadblock, warp, then instruction.
+  using ThreadblockShape = GemmShape<128, 256, 64>;
+  using WarpShape = GemmShape<64, 64, 64>;
+  using InstructionShape = GemmShape<16, 8, 32>;
+  static const int kStages = 3;
+
+  // Alignment in elements: as many elements as fit in 128 bits.
+  static const int kAlignmentA = 128 / sizeof_bits<ElementA>::value;
+  static const int kAlignmentB = 128 / sizeof_bits<ElementB>::value;
+
+  using EpilogueOutputOp = epilogue::thread::LinearCombination<
+      ElementC, 128 / sizeof_bits<ElementC>::value, ElementAccumulator,
+      ElementAccumulator>;
+
+  using Operator = arch::OpMultiplyAdd;
+};
+
+/// SM89 defaults for float_e4m3_t (A) x float_e4m3_t (B): every member is
+/// inherited unchanged from DefaultGemmConfigurationSm89F8.
+template <typename ElementC, typename ElementAccumulator>
+struct DefaultGemmConfiguration<arch::OpClassTensorOp,
+                                arch::Sm89,
+                                cutlass::float_e4m3_t,
+                                cutlass::float_e4m3_t,
+                                ElementC,
+                                ElementAccumulator>
+    : DefaultGemmConfigurationSm89F8<cutlass::float_e4m3_t,
+                                     cutlass::float_e4m3_t,
+                                     ElementC,
+                                     ElementAccumulator> {};
+
+/// SM89 defaults for float_e4m3_t (A) x float_e5m2_t (B): every member is
+/// inherited unchanged from DefaultGemmConfigurationSm89F8.
+template <typename ElementC, typename ElementAccumulator>
+struct DefaultGemmConfiguration<arch::OpClassTensorOp,
+                                arch::Sm89,
+                                cutlass::float_e4m3_t,
+                                cutlass::float_e5m2_t,
+                                ElementC,
+                                ElementAccumulator>
+    : DefaultGemmConfigurationSm89F8<cutlass::float_e4m3_t,
+                                     cutlass::float_e5m2_t,
+                                     ElementC,
+                                     ElementAccumulator> {};
+
+/// SM89 defaults for float_e5m2_t (A) x float_e4m3_t (B): every member is
+/// inherited unchanged from DefaultGemmConfigurationSm89F8.
+template <typename ElementC, typename ElementAccumulator>
+struct DefaultGemmConfiguration<arch::OpClassTensorOp,
+                                arch::Sm89,
+                                cutlass::float_e5m2_t,
+                                cutlass::float_e4m3_t,
+                                ElementC,
+                                ElementAccumulator>
+    : DefaultGemmConfigurationSm89F8<cutlass::float_e5m2_t,
+                                     cutlass::float_e4m3_t,
+                                     ElementC,
+                                     ElementAccumulator> {};
+
+/// SM89 defaults for float_e5m2_t (A) x float_e5m2_t (B): every member is
+/// inherited unchanged from DefaultGemmConfigurationSm89F8.
+template <typename ElementC, typename ElementAccumulator>
+struct DefaultGemmConfiguration<arch::OpClassTensorOp,
+                                arch::Sm89,
+                                cutlass::float_e5m2_t,
+                                cutlass::float_e5m2_t,
+                                ElementC,
+                                ElementAccumulator>
+    : DefaultGemmConfigurationSm89F8<cutlass::float_e5m2_t,
+                                     cutlass::float_e5m2_t,
+                                     ElementC,
+                                     ElementAccumulator> {};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Default tensor-op GEMM configuration for Sm90 with double A and B operands.
+template <typename ElementC, typename ElementAccumulator>
+struct DefaultGemmConfiguration<arch::OpClassTensorOp, arch::Sm90, double,
+                                double, ElementC, ElementAccumulator> {
+  // Tile shapes: threadblock, warp, then instruction.
+  using ThreadblockShape = GemmShape<128, 256, 64>;
+  using WarpShape = GemmShape<64, 64, 64>;
+  using InstructionShape = GemmShape<16, 8, 4>;
+  static const int kStages = 3;
+
+  // Both operands use an alignment of a single element.
+  static const int kAlignmentA = 1;
+  static const int kAlignmentB = 1;
+
+  using EpilogueOutputOp = epilogue::thread::LinearCombination<
+      ElementC, 1, ElementAccumulator, ElementAccumulator>;
+
+  using Operator = arch::OpMultiplyAdd;
+};
+
+/// Full specialization: Sm90 tensor-op defaults used when A, B, C and the
+/// accumulator are all complex<double>.
+template <>
+struct DefaultGemmConfiguration<arch::OpClassTensorOp, arch::Sm90,
+                                complex<double>, complex<double>,
+                                complex<double>, complex<double>> {
+  // Tile shapes: threadblock, warp, then instruction.
+  using ThreadblockShape = GemmShape<64, 64, 16>;
+  using WarpShape = GemmShape<32, 32, 16>;
+  using InstructionShape = GemmShape<16, 8, 4>;
+
+  // Default stage count for this configuration.
+  static const int kStages = 3;
+
+  // Both operands use an alignment of a single element.
+  static const int kAlignmentA = 1;
+  static const int kAlignmentB = 1;
+
+  using EpilogueOutputOp = epilogue::thread::LinearCombination<
+      complex<double>, 1, complex<double>, complex<double>>;
+
+  // Complex-valued operator tag.
+  using Operator = arch::OpMultiplyAddComplex;
+};
+
+} // namespace device
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/ell_gemm.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/ell_gemm.h
new file mode 100755
index 000000000..54ddab400
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/device/ell_gemm.h
@@ -0,0 +1,849 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a Block-Ell sparse gemm kernel.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/device_kernel.h"
+
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/gemm/kernel/ell_gemm.h"
+
+#include "cutlass/gemm/kernel/default_ell_gemm.h"
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/*! Blocked-Ell sparse gemm device-level operator. This is an interface to efficient CUTLASS
+  Blocked-Ell kernels that may be invoked from host code.
+
+  The contributions of this class are:
+    
+    1. At compile time, it maps data types and high-level structural parameters onto 
+       specific CUTLASS components.
+
+    2. At runtime, it maps logical arguments to Blocked-Ell problems to kernel parameters.
+
+    3. At runtime, it launches kernels on the device.
+
+  Example of a CUTLASS EllGemm operator is as follows:
+
+    //
+    // Instantiate the CUTLASS EllGemm operator.
+    //
+
+    cutlass::gemm::device::EllGemm<
+      cutlass::half_t,
+      cutlass::layout::RowMajor,
+      cutlass::half_t,
+      cutlass::layout::ColumnMajor,
+      cutlass::half_t,
+      cutlass::layout::ColumnMajor,
+      float, 
+      cutlass::arch::OpClassTensorOp, 
+      cutlass::arch::Sm80,
+      cutlass::gemm::GemmShape<128, 128, 32>,
+      cutlass::gemm::GemmShape<64, 64, 32>, 
+      cutlass::gemm::GemmShape<16, 8, 16>,
+      cutlass::epilogue::thread::LinearCombination<
+          cutlass::half_t, 128 / cutlass::sizeof_bits<cutlass::half_t>::value,
+          float, float>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>, 
+      4, // Stages
+      128 / cutlass::sizeof_bits<cutlass::half_t>::value, // Alignment A
+      128 / cutlass::sizeof_bits<cutlass::half_t>::value  // Alignment B
+    > ellgemm_op;
+
+    //
+    // Launch the EllGemm operation on the device
+    //
+
+    Description of parameters and tensors used to represent the Blocked-Ellpack (ELL) format:
+      a_rows              - Rows in the sparse matrix.
+      a_cols              - Columns in the sparse matrix.
+      BlockedEllA         - Packed matrix (ellValue matrix) that stores non-zero values in 
+                            consecutive blocks, whose size is (a_rows * a_ell_num_columns)
+      ell_idx             - Blocked-ELL Column indices (ellColInd) matrix, whose size is
+                            (a_rows / a_ell_blocksize) * (a_ell_num_columns / a_ell_blocksize)
+      a_ell_blocksize     - Size of the ELL-Blocks.
+      a_ell_num_columns   - Number of columns in the Blocked-Ellpack format (ellValue columns)
+      B                   - Input dense matrix whose size is (a_cols * n)
+      C/D                 - Output dense matrix whose size is (a_rows * n)
+
+    cutlass::Status status = ellgemm_op({
+      {a_rows, n, a_cols},  // GemmCoord problem_size
+      {BlockedEllA, lda},   // TensorRef<cutlass::half_t, layout::RowMajor> ref_BlockedEllA
+      {B, ldb},             // TensorRef<cutlass::half_t, layout::ColumnMajor> ref_B,
+      {C, ldc},             // TensorRef<cutlass::half_t, layout::ColumnMajor> ref_C,
+      {D, ldd},             // TensorRef<cutlass::half_t, layout::ColumnMajor> ref_D,
+      ell_idx,              // Blocked-ELL Column indices or ellColInd matrix (const int*)
+      a_ell_num_columns,    // Columns in the Blocked-Ellpack (ellValue) matrix (int)
+      a_ell_blocksize,      // Size of the ELL-Blocks (int)
+      a_ell_base,           // Base index of ellColInd (int) - Zero or One
+      {alpha, beta}         // EpilogueOutputOp::Params epilogue_op_params
+    });
+
+  A simplified view of the template is listed below.
+
+    template <
+      /// Element type for A matrix operand
+      typename ElementA,
+      
+      /// Layout type for A matrix operand
+      typename LayoutA,
+      
+      /// Element type for B matrix operand
+      typename ElementB,
+      
+      /// Layout type for B matrix operand
+      typename LayoutB,
+      
+      /// Element type for C and D matrix operands
+      typename ElementC,
+      
+      /// Layout type for C and D matrix operands
+      typename LayoutC,
+      
+      /// Element type for internal accumulation
+      typename ElementAccumulator,
+
+      /// Operator class tag
+      typename OperatorClass,
+      
+      /// Tag indicating architecture to tune for.  This is the minimum SM that
+      /// supports the intended feature. The device kernel can be built
+      /// targeting any SM larger than this number.
+      typename ArchTag,
+      
+      /// Threadblock-level tile size (concept: GemmShape)
+      typename ThreadblockShape,
+      
+      /// Warp-level tile size (concept: GemmShape)
+      typename WarpShape,
+      
+      /// Instruction-level tile size (concept: GemmShape)
+      typename InstructionShape,
+      
+      /// Epilogue output operator
+      typename EpilogueOutputOp,
+      
+      /// Threadblock-level swizzling operator
+      typename ThreadblockSwizzle,
+      
+      /// Number of stages used in the pipelined mainloop
+      int Stages,
+
+      /// Access granularity of A matrix in units of elements
+      int AlignmentA,
+
+      /// Access granularity of B matrix in units of elements
+      int AlignmentB,
+
+      /// Supports split-K with serial reduction
+      bool SplitKSerial,
+
+      /// Operation performed by GEMM
+      typename Operator,
+
+      /// Sparse matrix is A or not
+      bool IsASparse
+    >
+    class EllGemm;
+*/
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_ = ElementC_,
+    /// Operator class tag
+    typename OperatorClass_ = arch::OpClassTensorOp,
+    /// Tag indicating architecture to tune for
+    typename ArchTag_ = arch::Sm80,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_ =
+        typename threadblock::GemmIdentityThreadblockSwizzle<>,
+    /// Number of stages used in the pipelined mainloop
+    int Stages =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kStages,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentA,
+    /// Access granularity of B matrix in units of elements
+    int AlignmentB =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentB,
+    /// If true, kernel supports split-K with serial reduction
+    bool SplitKSerial = false,
+    /// Operation performed by GEMM
+    typename Operator_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::Operator,
+    /// Sparse matrix is A or not
+    bool IsASparse = true
+    >
+class EllGemm {
+ public:
+
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  using TensorRefA = TensorRef<ElementA const, LayoutA>;
+  using ElementB = ElementB_;
+  using LayoutB = LayoutB_;
+  using TensorRefB = TensorRef<ElementB const, LayoutB>;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using TensorRefC = TensorRef<ElementC const, LayoutC>;  // const-element view used for C
+  using TensorRefD = TensorRef<ElementC, LayoutC>;        // mutable view used for D
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentB = AlignmentB;
+  static int const kAlignmentC = EpilogueOutputOp::kCount;  // taken from the epilogue functor
+  static bool const kSplitKSerial = SplitKSerial;
+  static ComplexTransform const kTransformA = ComplexTransform::kNone;  // fixed to kNone for A
+  static ComplexTransform const kTransformB = ComplexTransform::kNone;  // fixed to kNone for B
+  static bool const kIsASparse = IsASparse;
+
+  /// Kernel type selected by kernel::DefaultEllGemm from the parameters above
+  using GemmKernel = typename kernel::DefaultEllGemm<
+    ElementA,
+    LayoutA,
+    kAlignmentA,
+    ElementB,
+    LayoutB,
+    kAlignmentB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    kStages,
+    kSplitKSerial,
+    Operator,
+    kIsASparse
+  >::GemmKernel;
+
+  /// Host-side description of one Blocked-Ell GEMM problem.
+  struct Arguments {
+
+    //
+    // Data members
+    //
+
+    GemmCoord problem_size;
+    TensorRef<ElementA const, LayoutA> ref_A;
+    TensorRef<ElementB const, LayoutB> ref_B;
+    TensorRef<ElementC const, LayoutC> ref_C;
+    TensorRef<ElementC, LayoutC> ref_D;
+    const int* ell_idx;        // Blocked-ELL column indices (ellColInd matrix)
+    int ell_ncol;              // columns in the Blocked-Ellpack (ellValue) matrix
+    int ell_blocksize;         // size of the ELL blocks
+    int ell_base_idx;          // base index of ellColInd (zero or one)
+    typename EpilogueOutputOp::Params epilogue;
+    int split_k_slices;
+
+    //
+    // Methods
+    //
+
+    /// Default ctor: empty problem, no Blocked-Ell metadata, one K slice.
+    CUTLASS_HOST_DEVICE
+    Arguments(): problem_size(0, 0, 0), ell_idx(nullptr), ell_ncol(0),
+                 ell_blocksize(0), ell_base_idx(0), split_k_slices(1) {
+    }
+
+    /// Constructs an Arguments structure from explicit values for every field.
+    CUTLASS_HOST_DEVICE
+    Arguments(
+      GemmCoord problem_size_,
+      TensorRef<ElementA const, LayoutA> ref_A_,
+      TensorRef<ElementB const, LayoutB> ref_B_,
+      TensorRef<ElementC const, LayoutC> ref_C_,
+      TensorRef<ElementC, LayoutC> ref_D_,
+      const int* ell_idx_,
+      int ell_ncol_,
+      int ell_blocksize_,
+      int ell_base_idx_,
+      typename EpilogueOutputOp::Params epilogue_ = 
+        typename EpilogueOutputOp::Params(),
+      int split_k_slices = 1
+    ):
+      problem_size(problem_size_),
+      ref_A(ref_A_),
+      ref_B(ref_B_),
+      ref_C(ref_C_),
+      ref_D(ref_D_),
+      ell_idx(ell_idx_),
+      ell_ncol(ell_ncol_),
+      ell_blocksize(ell_blocksize_),
+      ell_base_idx(ell_base_idx_),
+      epilogue(epilogue_),
+      split_k_slices(split_k_slices) {
+
+    }
+  };
+
+private:
+
+  /// Kernel parameters object
+  typename GemmKernel::Params params_{};
+
+public:
+
+  /// Constructs the GEMM.
+  EllGemm() { }
+
+  /// Checks whether this operator can run the problem described by `args`.
+  ///
+  /// Returns Status::kErrorInvalidProblem when more than one split-K slice is
+  /// requested but the operator was not built with SplitKSerial; otherwise
+  /// returns whatever GemmKernel::can_implement reports.
+  static Status can_implement(Arguments const &args) {
+
+    // Split-K requires the serial-reduction variant of the kernel.
+    bool const wants_split_k = args.split_k_slices > 1;
+    if (wants_split_k && !kSplitKSerial) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    // The kernel-level check decides the final status.
+    return GemmKernel::can_implement(
+      args.problem_size,
+      args.ref_A.non_const_ref(),
+      args.ref_B.non_const_ref(),
+      args.ref_C.non_const_ref(),
+      args.ref_D);
+  }
+
+  /// Returns the number of workspace bytes needed for `args`: one int per
+  /// (m, n) tile of the grid when serial split-K is active, otherwise zero.
+  static size_t get_workspace_size(Arguments const &args) {
+
+    // Tile the problem with the ELL block size as the M extent of the tile.
+    ThreadblockSwizzle swizzle;
+    cutlass::gemm::GemmCoord grid = swizzle.get_tiled_shape(
+      args.problem_size,
+      {args.ell_blocksize, ThreadblockShape::kN, ThreadblockShape::kK},
+      args.split_k_slices);
+
+    // Scale M by the number of threadblock tiles needed to cover one ELL block.
+    int const tiles_per_block =
+      (args.ell_blocksize + ThreadblockShape::kM - 1) / ThreadblockShape::kM;
+    grid.m() *= tiles_per_block;
+
+    bool const serial_split_k = kSplitKSerial && args.split_k_slices > 1;
+    if (!serial_split_k) {
+      return 0;
+    }
+
+    return sizeof(int) * size_t(grid.m()) * size_t(grid.n());
+  }
+
+  /// Builds the kernel Params object from the host-side arguments.
+  /// `grid_shape` is the tiled launch grid; `workspace` may be null.
+  Status set(Arguments const &args, cutlass::gemm::GemmCoord const &grid_shape, void *workspace){
+    // The workspace is handed to the kernel as an int pointer.
+    int *workspace_ints = static_cast<int *>(workspace);
+
+    params_ = typename GemmKernel::Params{
+      args.problem_size, grid_shape,
+      args.ref_A.non_const_ref(), args.ref_B.non_const_ref(),
+      args.ref_C.non_const_ref(), args.ref_D,
+      args.ell_idx, args.ell_ncol,
+      args.ell_blocksize, args.ell_base_idx,
+      args.epilogue,
+      workspace_ints
+    };
+
+    return Status::kSuccess;
+  }
+
+  /// Prepares the operator for launch: computes the grid, validates the split-K
+  /// request, zeroes the workspace when serial split-K is used, and stores the
+  /// kernel parameters.
+  ///
+  /// Returns kErrorInvalidProblem for split_k_slices > 1 without SplitKSerial,
+  /// kErrorWorkspaceNull when serial split-K has no workspace, kErrorInternal if
+  /// cudaMemsetAsync fails, and otherwise the result of set().
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+
+    // Tile the problem with the ELL block size as the M extent of the tile.
+    ThreadblockSwizzle swizzle;
+    cutlass::gemm::GemmCoord grid_shape = swizzle.get_tiled_shape(
+      args.problem_size,
+      {args.ell_blocksize, ThreadblockShape::kN, ThreadblockShape::kK},
+      args.split_k_slices);
+
+    // Scale M by the number of threadblock tiles needed to cover one ELL block.
+    grid_shape.m() *= (args.ell_blocksize + ThreadblockShape::kM - 1) / ThreadblockShape::kM;
+
+    if (args.split_k_slices > 1) {
+      // Split-K is only available in the serial-reduction build of the kernel.
+      if (!kSplitKSerial) {
+        return Status::kErrorInvalidProblem;
+      }
+
+      if (!workspace) {
+        return Status::kErrorWorkspaceNull;
+      }
+
+      // Zero the workspace asynchronously on `stream`.
+      if (cudaMemsetAsync(workspace, 0, get_workspace_size(args), stream) != cudaSuccess) {
+        return Status::kErrorInternal;
+      }
+    }
+
+    return set(args, grid_shape, workspace);
+  }
+
+  /// Lightweight update: swaps in new tensor pointers, epilogue params and workspace pointer.
+  Status update(Arguments const &args, void *workspace = nullptr) {
+
+    // A workspace is mandatory whenever serial split-K is actually in use.
+    bool const workspace_required = kSplitKSerial && args.split_k_slices > 1;
+    if (workspace_required && !workspace) {
+      return Status::kErrorWorkspaceNull;
+    }
+
+    // Only these fields of params_ are refreshed; everything else is kept.
+    params_.ref_A.reset(args.ref_A.non_const_ref().data());
+    params_.ref_B.reset(args.ref_B.non_const_ref().data());
+    params_.ref_C.reset(args.ref_C.non_const_ref().data());
+    params_.ref_D.reset(args.ref_D.data());
+    params_.output_op = args.epilogue;
+    params_.semaphore = static_cast<int *>(workspace);
+    return Status::kSuccess;
+  }
+
+  /// Launches the kernel on `stream` using the state stored in params_.
+  Status run(cudaStream_t stream = nullptr) {
+
+    // Launch configuration derived from the tiled grid stored in params_.
+    ThreadblockSwizzle swizzle;
+    dim3 const grid = swizzle.get_grid_shape(params_.grid_tiled_shape);
+    dim3 const block(GemmKernel::kThreadCount, 1, 1);
+
+    int const smem_size = int(sizeof(typename GemmKernel::SharedStorage));
+
+    // From 48 KiB upwards, raise the kernel's dynamic shared memory limit first.
+    if (smem_size >= (48 << 10)) {
+
+      cudaError_t const attr_result = cudaFuncSetAttribute(
+        Kernel<GemmKernel>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize,
+        smem_size);
+
+      if (attr_result != cudaSuccess) {
+        return Status::kErrorInternal;
+      }
+    }
+
+    cutlass::arch::synclog_setup();
+    cutlass::Kernel<GemmKernel><<<grid, block, smem_size, stream>>>(params_);
+
+    // Map whatever error the launch recorded onto a CUTLASS status.
+    return cudaGetLastError() == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
+  }
+
+  /// Function-call shorthand for run(): launches the kernel on `stream` using the stored params_.
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Initializes from `args` and, if that succeeds, launches the kernel; both steps use `stream`.
+  Status operator()(
+    Arguments const &args,
+    void *workspace = nullptr,
+    cudaStream_t stream = nullptr) {
+    // Forward `stream` so the workspace memset is enqueued on the same stream as the launch.
+    Status status = initialize(args, workspace, stream);
+
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+
+    return status;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for column-major output: swaps the A/B operands and the M/N extents, then delegates to a row-major EllGemm.
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_,
+    /// Operator class tag
+    typename OperatorClass_,
+    /// Tag indicating architecture to tune for
+    typename ArchTag_,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_,
+    /// Epilogue output operator
+    typename EpilogueOutputOp_,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA,
+    /// Access granularity of B matrix in units of elements
+    int AlignmentB,
+    /// If true, kernel supports split-K as a serial reduction
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator_,
+    /// Sparse matrix is A or not
+    bool IsASparse>
+class EllGemm<ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_,
+           layout::ColumnMajor,  // partially specialized on LayoutC
+           ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_,
+           WarpShape_, InstructionShape_, EpilogueOutputOp_,
+           ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB,
+           SplitKSerial, Operator_, IsASparse> {
+ public:
+
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  using TensorRefA = TensorRef<ElementA const, LayoutA>;
+  using ElementB = ElementB_;
+  using LayoutB = LayoutB_;
+  using TensorRefB = TensorRef<ElementB const, LayoutB>;
+  using ElementC = ElementC_;
+  using LayoutC = layout::ColumnMajor;
+  using TensorRefC = TensorRef<ElementC const, LayoutC>;
+  using TensorRefD = TensorRef<ElementC, LayoutC>;
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentB = AlignmentB;
+  static ComplexTransform const kTransformA = ComplexTransform::kNone;
+  static ComplexTransform const kTransformB = ComplexTransform::kNone;
+  static bool const kSplitKSerial = SplitKSerial;
+  static bool const kIsASparse = false;
+  // NOTE(review): the IsASparse template argument is not forwarded below; confirm this is intended.
+  using UnderlyingOperator = EllGemm< 
+    ElementB,
+    typename layout::LayoutTranspose<LayoutB>::type,
+    ElementA,
+    typename layout::LayoutTranspose<LayoutA>::type,
+    ElementC,
+    layout::RowMajor,    
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    kAlignmentB,
+    kAlignmentA,
+    SplitKSerial,
+    Operator,
+    kIsASparse
+  >;
+
+  using UnderlyingArguments = typename UnderlyingOperator::Arguments;
+  using GemmKernel = typename UnderlyingOperator::GemmKernel;
+  static int const kAlignmentC = UnderlyingOperator::kAlignmentC;
+
+  /// Argument structure (expressed in terms of the caller's column-major problem)
+  struct Arguments {
+
+    //
+    // Data members
+    //
+
+    GemmCoord problem_size;
+    TensorRef<ElementA const, LayoutA> ref_A;
+    TensorRef<ElementB const, LayoutB> ref_B;
+    TensorRef<ElementC const, LayoutC> ref_C;
+    TensorRef<ElementC, LayoutC> ref_D;
+    const int* ell_idx;
+    int ell_ncol;
+    int ell_blocksize;
+    int ell_base_idx;
+    typename EpilogueOutputOp::Params epilogue;
+    int split_k_slices;
+
+    //
+    // Methods
+    //
+
+    /// Default ctor: gives the scalar and pointer members determinate values (one K slice, no ELL index)
+    CUTLASS_HOST_DEVICE
+    Arguments(): problem_size(0, 0, 0), ell_idx(nullptr), ell_ncol(0), ell_blocksize(0),
+                 ell_base_idx(0), split_k_slices(1) { }
+    /// Constructs an Arguments structure 
+    CUTLASS_HOST_DEVICE
+    Arguments(
+      GemmCoord problem_size_,
+      TensorRef<ElementA const, LayoutA> ref_A_,
+      TensorRef<ElementB const, LayoutB> ref_B_,
+      TensorRef<ElementC const, LayoutC> ref_C_,
+      TensorRef<ElementC, LayoutC> ref_D_,
+      const int* ell_idx_,
+      int ell_ncol_,
+      int ell_blocksize_,
+      int ell_base_idx_,
+      typename EpilogueOutputOp::Params epilogue_ = 
+        typename EpilogueOutputOp::Params(),
+      int split_k_slices = 1
+    ):
+      problem_size(problem_size_),
+      ref_A(ref_A_),
+      ref_B(ref_B_),
+      ref_C(ref_C_),
+      ref_D(ref_D_),
+      ell_idx(ell_idx_),
+      ell_ncol(ell_ncol_),
+      ell_blocksize(ell_blocksize_),
+      ell_base_idx(ell_base_idx_),
+      epilogue(epilogue_),
+      split_k_slices(split_k_slices) { }
+  };
+
+private:
+
+  UnderlyingOperator underlying_operator_;
+
+public:
+
+  /// Constructs the GEMM.
+  EllGemm() { }
+
+  /// Helper to construct a transposed equivalent for the underlying GEMM operator
+  static UnderlyingArguments to_underlying_arguments(Arguments const &args) {
+    return UnderlyingArguments(
+      {args.problem_size.n(), args.problem_size.m(), args.problem_size.k()},
+      {args.ref_B.data(), args.ref_B.stride(0)},
+      {args.ref_A.data(), args.ref_A.stride(0)},
+      {args.ref_C.data(), args.ref_C.stride(0)},
+      {args.ref_D.data(), args.ref_D.stride(0)},
+      args.ell_idx,
+      args.ell_ncol,
+      args.ell_blocksize,
+      args.ell_base_idx,
+      args.epilogue,
+      args.split_k_slices
+    );
+  }
+
+  /// Determines whether the GEMM can execute the given problem.
+  static Status can_implement(Arguments const &args) {
+
+    return UnderlyingOperator::can_implement(to_underlying_arguments(args));
+  }
+
+  /// Gets the workspace size in bytes (nonzero only for serial split-K with more than one slice)
+  static size_t get_workspace_size(Arguments const &args) {
+
+    size_t bytes = 0;
+
+    // Determine grid shape from the transposed problem, exactly as initialize() does
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord tiled_shape = threadblock_swizzle.get_tiled_shape(
+      {args.problem_size.n(), args.problem_size.m(), args.problem_size.k()},
+      {ThreadblockShape::kM, args.ell_blocksize, ThreadblockShape::kK},
+      args.split_k_slices);
+
+    tiled_shape.n() *= (args.ell_blocksize + ThreadblockShape::kN - 1 ) / ThreadblockShape::kN;
+
+    if (kSplitKSerial && args.split_k_slices > 1) {
+
+      bytes += sizeof(int) * size_t(tiled_shape.m()) * size_t(tiled_shape.n());
+    }
+
+    return bytes;
+  }
+
+  Status set(Arguments const &args, cutlass::gemm::GemmCoord const &grid_shape, void *workspace){
+    // Initialize the Params structure of the underlying operator from the transposed arguments
+    return underlying_operator_.set(to_underlying_arguments(args), grid_shape, workspace);
+  }
+
+  /// Initializes GEMM state from arguments.
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+
+    // Determine grid shape
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord grid_shape = threadblock_swizzle.get_tiled_shape(
+      {args.problem_size.n(), args.problem_size.m(), args.problem_size.k()}, 
+      {ThreadblockShape::kM, args.ell_blocksize, ThreadblockShape::kK},
+      args.split_k_slices);
+
+    grid_shape.n() *= (args.ell_blocksize + ThreadblockShape::kN - 1 ) / ThreadblockShape::kN;
+
+    if (kSplitKSerial) {
+      if (args.split_k_slices > 1) {
+        if (!workspace) {
+          return Status::kErrorWorkspaceNull;
+        }
+
+        size_t bytes = get_workspace_size(args);
+
+        cudaError_t result = cudaMemsetAsync(workspace, 0, bytes, stream);
+
+        if (result != cudaSuccess) {
+          return Status::kErrorInternal;
+        }
+      }
+    }
+    else {
+
+      if (args.split_k_slices > 1) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+
+    // Initialize the Params structure. The status reported by the underlying
+    // operator is returned directly rather than being dropped and replaced
+    // by an unconditional kSuccess.
+    return set(args, grid_shape, workspace);
+  }
+
+  /// Lightweight update given a subset of arguments
+  Status update(Arguments const &args, void *workspace = nullptr) {
+
+    return underlying_operator_.update(to_underlying_arguments(args), workspace);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status run(cudaStream_t stream = nullptr) {
+
+    return underlying_operator_.run(stream);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Initializes from `args` and, on success, runs the kernel; both steps use `stream`.
+  Status operator()(
+    Arguments const &args, 
+    void *workspace = nullptr, 
+    cudaStream_t stream = nullptr) {
+
+    Status status = initialize(args, workspace, stream);
+
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+
+    return status;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm.h
new file mode 100755
index 000000000..c6f488b14
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm.h
@@ -0,0 +1,772 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a pipelined GEMM kernel. Does not compute batching; split-K is supported only as a serial reduction (SplitKSerial).
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/device_kernel.h"
+
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/gemm/kernel/gemm.h"
+
+#include "cutlass/gemm/kernel/default_gemm.h"
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+
+#include "cutlass/layout/permute.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/*! Gemm device-level operator. This is an interface to efficient CUTLASS GEMM kernels that may
+  be invoked from host code.
+
+  The contributions of this class are:
+    
+    1. At compile time, it maps data types and high-level structural parameters onto 
+       specific CUTLASS components.
+
+    2. At runtime, it maps logical arguments to GEMM problems to kernel parameters.
+
+    3. At runtime, it launches kernels on the device.
+
+  The intent is to provide a convenient mechanism for interacting with most plausible GEMM
+  configurations for each supported architecture. Consequently, not all parameters are exposed
+  to the top-level interface. Rather, sensible defaults at each level of the CUTLASS hierarchy
+  are selected to tradeoff simplicity of the interface with flexibility. We expect 
+  most configurations to be specified at this level. Applications with more exotic requirements 
+  may construct their kernels of interest using CUTLASS components at the threadblock, warp, 
+  and thread levels of abstraction.
+
+  CUTLASS exposes computations using the functor design pattern in which objects compose some
+  internal state with an overloaded function call operator. This enables decoupling of
+  initialization from execution, possibly reducing overhead during steady state phases of
+  application execution.
+
+  CUTLASS device-level operators expose an Arguments structure encompassing each logical
+  input to the computation. This is distinct from the kernel-level Params structure pattern
+  which contains application-specific precomputed state needed by the device code.
+
+  Example of a CUTLASS GEMM operator implementing the functionality of cuBLAS's SGEMM NN
+  is as follows:
+
+    //
+    // Instantiate the CUTLASS GEMM operator.
+    //
+
+    cutlass::gemm::device::Gemm<
+      float,
+      cutlass::layout::ColumnMajor,
+      float,
+      cutlass::layout::ColumnMajor,
+      float,
+      cutlass::layout::ColumnMajor
+    > gemm_op;
+
+    //
+    // Launch the GEMM operation on the device
+    //
+
+    cutlass::Status status = gemm_op({
+      {m, n, k},                          // GemmCoord problem_size,
+      {A, lda},                           // TensorRef<float, layout::ColumnMajor> ref_A,
+      {B, ldb},                           // TensorRef<float, layout::ColumnMajor> ref_B,
+      {C, ldc},                           // TensorRef<float, layout::ColumnMajor> ref_C,
+      {D, ldd},                           // TensorRef<float, layout::ColumnMajor> ref_D,
+      {alpha, beta}                       // EpilogueOutputOp::Params epilogue_op_params
+    });
+
+
+  A simplified view of the template is listed below.
+
+    template <
+      /// Element type for A matrix operand
+      typename ElementA,
+      
+      /// Layout type for A matrix operand
+      typename LayoutA,
+      
+      /// Element type for B matrix operand
+      typename ElementB,
+      
+      /// Layout type for B matrix operand
+      typename LayoutB,
+      
+      /// Element type for C and D matrix operands
+      typename ElementC,
+      
+      /// Layout type for C and D matrix operands
+      typename LayoutC,
+      
+      /// Element type for internal accumulation
+      typename ElementAccumulator,
+
+      /// Operator class tag
+      typename OperatorClass,
+      
+      /// Tag indicating architecture to tune for.  This is the minimum SM that
+      /// supports the intended feature. The device kernel can be built
+      /// targeting any SM larger than this number.
+      typename ArchTag,
+      
+      /// Threadblock-level tile size (concept: GemmShape)
+      typename ThreadblockShape,
+      
+      /// Warp-level tile size (concept: GemmShape)
+      typename WarpShape,
+      
+      /// Instruction-level tile size (concept: GemmShape)
+      typename InstructionShape,
+      
+      /// Epilogue output operator
+      typename EpilogueOutputOp,
+      
+      /// Threadblock-level swizzling operator
+      typename ThreadblockSwizzle,
+      
+      /// Number of stages used in the pipelined mainloop
+      int Stages
+    >
+    class Gemm;
+*/
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_ = ElementC_,
+    /// Operator class tag
+    typename OperatorClass_ = arch::OpClassSimt,
+    /// Tag indicating architecture to tune for
+    typename ArchTag_ = arch::Sm70,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_ =
+        typename threadblock::GemmIdentityThreadblockSwizzle<>,
+    /// Number of stages used in the pipelined mainloop
+    int Stages =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kStages,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentA,
+    /// Access granularity of B matrix in units of elements
+    int AlignmentB =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentB,
+    /// If true, kernel supports split-K with serial reduction
+    bool SplitKSerial = false,
+    /// Operation performed by GEMM
+    typename Operator_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::Operator,
+    /// Gather operand A by using an index array
+    bool GatherA = false,
+    /// Gather operand B by using an index array
+    bool GatherB = false,
+    /// Scatter result D by using an index array
+    bool ScatterD = false,
+    /// Permute result D
+    typename PermuteDLayout = layout::NoPermute>
+class Gemm {
+ public:
+
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  using TensorRefA = TensorRef<ElementA const, LayoutA>;
+  using ElementB = ElementB_;
+  using LayoutB = LayoutB_;
+  using TensorRefB = TensorRef<ElementB const, LayoutB>;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using TensorRefC = TensorRef<ElementC const, LayoutC>;
+  using TensorRefD = TensorRef<ElementC, LayoutC>;
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentB = AlignmentB;
+  static int const kAlignmentC = EpilogueOutputOp::kCount;
+  static bool const kSplitKSerial = SplitKSerial;
+  static ComplexTransform const kTransformA = ComplexTransform::kNone;
+  static ComplexTransform const kTransformB = ComplexTransform::kNone;
+
+  /// Define the kernel
+  using GemmKernel = typename kernel::DefaultGemm<
+    ElementA,
+    LayoutA,
+    kAlignmentA,
+    ElementB,
+    LayoutB,
+    kAlignmentB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    kStages,
+    kSplitKSerial,
+    Operator,
+    SharedMemoryClearOption::kNone,
+    GatherA,
+    GatherB,
+    ScatterD,
+    PermuteDLayout
+  >::GemmKernel;
+
+  /// Argument structure
+  struct Arguments {
+
+    //
+    // Data members
+    //
+
+    GemmCoord problem_size;
+    TensorRef<ElementA const, LayoutA> ref_A;
+    TensorRef<ElementB const, LayoutB> ref_B;
+    TensorRef<ElementC const, LayoutC> ref_C;
+    TensorRef<ElementC, LayoutC> ref_D;
+    typename EpilogueOutputOp::Params epilogue;
+    int split_k_slices;
+    // For gather+scatter operations
+    int const *gather_A_indices;
+    int const *gather_B_indices;
+    int const *scatter_D_indices;
+
+    //
+    // Methods
+    //
+
+    /// Default ctor: empty problem, one K slice, and null gather/scatter index arrays
+    CUTLASS_HOST_DEVICE
+    Arguments(): problem_size(0, 0, 0), split_k_slices(1),
+                 gather_A_indices(nullptr), gather_B_indices(nullptr),
+                 scatter_D_indices(nullptr) { }
+
+    /// Constructs an Arguments structure 
+    CUTLASS_HOST_DEVICE
+    Arguments(
+      GemmCoord problem_size_,
+      TensorRef<ElementA const, LayoutA> ref_A_,
+      TensorRef<ElementB const, LayoutB> ref_B_,
+      TensorRef<ElementC const, LayoutC> ref_C_,
+      TensorRef<ElementC, LayoutC> ref_D_,
+      typename EpilogueOutputOp::Params epilogue_ = 
+        typename EpilogueOutputOp::Params(),
+      int split_k_slices = 1,
+      int const *gather_A_indices_ = nullptr,
+      int const *gather_B_indices_ = nullptr,
+      int const *scatter_D_indices_ = nullptr
+    ):
+      problem_size(problem_size_),
+      ref_A(ref_A_),
+      ref_B(ref_B_),
+      ref_C(ref_C_),
+      ref_D(ref_D_),
+      epilogue(epilogue_),
+      split_k_slices(split_k_slices),
+      gather_A_indices(gather_A_indices_),
+      gather_B_indices(gather_B_indices_),
+      scatter_D_indices(scatter_D_indices_) {
+
+    }
+  };
+
+private:
+
+  /// Kernel parameters object
+  typename GemmKernel::Params params_;
+
+public:
+
+  /// Constructs the GEMM.
+  Gemm() { }
+
+  /// Determines whether the GEMM can execute the given problem.
+  static Status can_implement(Arguments const &args) {
+
+    if (!kSplitKSerial && args.split_k_slices > 1) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    Status status = GemmKernel::can_implement(
+      args.problem_size,
+      args.ref_A.non_const_ref(),
+      args.ref_B.non_const_ref(),
+      args.ref_C.non_const_ref(),
+      args.ref_D
+    );
+
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    return Status::kSuccess;
+  }
+
+  /// Gets the workspace size in bytes (nonzero only for serial split-K with more than one slice)
+  static size_t get_workspace_size(Arguments const &args) {
+
+    size_t bytes = 0;
+
+    // Determine grid shape
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord tiled_shape = threadblock_swizzle.get_tiled_shape(
+      args.problem_size, 
+      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+      args.split_k_slices);
+
+    if (kSplitKSerial && args.split_k_slices > 1) {
+      // One int per (m, n) tile of the grid
+      bytes += sizeof(int) * size_t(tiled_shape.m()) * size_t(tiled_shape.n());
+    }
+
+    return bytes;
+  }
+
+  /// Initializes GEMM state from arguments.
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+
+    // Determine grid shape
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord grid_shape = threadblock_swizzle.get_tiled_shape(
+      args.problem_size, 
+      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+      args.split_k_slices);
+
+    if (kSplitKSerial) {
+      if (args.split_k_slices > 1) {
+        if (!workspace) {
+          return Status::kErrorWorkspaceNull;
+        }
+
+        size_t bytes = get_workspace_size(args);
+        // Zero-fill the workspace on the caller's stream
+        cudaError_t result = cudaMemsetAsync(workspace, 0, bytes, stream);
+
+        if (result != cudaSuccess) {
+          return Status::kErrorInternal;
+        }
+      }
+    }
+    else {
+      // More than one K slice is rejected when serial split-K is not compiled in
+      if (args.split_k_slices > 1) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+
+    // Initialize the Params structure
+    params_ = typename GemmKernel::Params{
+      args.problem_size,
+      grid_shape,
+      args.ref_A.non_const_ref(),
+      args.ref_B.non_const_ref(),
+      args.ref_C.non_const_ref(),
+      args.ref_D,
+      args.epilogue,
+      static_cast<int *>(workspace),
+      args.gather_A_indices,
+      args.gather_B_indices,
+      args.scatter_D_indices
+    };
+
+    return Status::kSuccess;
+  }
+
+  /// Lightweight update: refreshes tensor pointers, epilogue params and workspace pointer only
+  Status update(Arguments const &args, void *workspace = nullptr) {
+
+    if (kSplitKSerial && args.split_k_slices > 1) {  
+      if (!workspace) {
+        return Status::kErrorWorkspaceNull;
+      }
+    }
+
+    params_.ref_A.reset(args.ref_A.non_const_ref().data());
+    params_.ref_B.reset(args.ref_B.non_const_ref().data());
+    params_.ref_C.reset(args.ref_C.non_const_ref().data());
+    params_.ref_D.reset(args.ref_D.data());
+    params_.output_op = args.epilogue;
+    params_.semaphore = static_cast<int *>(workspace);
+
+    return Status::kSuccess;
+  }
+
+  /// Runs the kernel using initialized state.
+  Status run(cudaStream_t stream = nullptr) {
+
+    ThreadblockSwizzle threadblock_swizzle;
+
+    dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
+    dim3 block(GemmKernel::kThreadCount, 1, 1);
+
+    cudaError_t result;
+
+    int smem_size = int(sizeof(typename GemmKernel::SharedStorage));
+    // From 48 KiB upwards, raise the kernel's dynamic shared memory limit before launching
+    if (smem_size >= (48 << 10)) {
+      result = cudaFuncSetAttribute(Kernel<GemmKernel>,
+                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                    smem_size);
+
+      if (result != cudaSuccess) {
+        return Status::kErrorInternal;
+      }
+    }
+
+    cutlass::arch::synclog_setup();
+    cutlass::Kernel<GemmKernel><<<grid, block, smem_size, stream>>>(params_);
+
+    result = cudaGetLastError();
+
+    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Initializes from `args` and, on success, runs the kernel; both steps use `stream`.
+  Status operator()(
+    Arguments const &args, 
+    void *workspace = nullptr, 
+    cudaStream_t stream = nullptr) {
+
+    Status status = initialize(args, workspace, stream);
+
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+
+    return status;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for column-major output exchanges problem size and operand.
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_,
+    /// Operator class tag
+    typename OperatorClass_,
+    /// Tag indicating architecture to tune for
+    typename ArchTag_,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_,
+    /// Epilogue output operator
+    typename EpilogueOutputOp_,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA,
+    /// Access granularity of B matrix in units of elements
+    int AlignmentB,
+    /// If true, kernel supports split-K as a serial reduction
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator_,
+    /// Gather operand A by using an index array
+    bool GatherA,
+    /// Gather operand B by using an index array
+    bool GatherB,
+    /// Scatter result D by using an index array
+    bool ScatterD,
+    /// Permute result D
+    typename PermuteDLayout
+>
+class Gemm<ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_,
+           layout::ColumnMajor,  // partially specialized on LayoutC
+           ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_,
+           WarpShape_, InstructionShape_, EpilogueOutputOp_,
+           ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial,
+           Operator_, GatherA, GatherB, ScatterD, PermuteDLayout> {
+ public:
+
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  using TensorRefA = TensorRef<ElementA const, LayoutA>;
+  using ElementB = ElementB_;
+  using LayoutB = LayoutB_;
+  using TensorRefB = TensorRef<ElementB const, LayoutB>;
+  using ElementC = ElementC_;
+  using LayoutC = layout::ColumnMajor;
+  using TensorRefC = TensorRef<ElementC const, LayoutC>;
+  using TensorRefD = TensorRef<ElementC, LayoutC>;
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentB = AlignmentB;
+  static ComplexTransform const kTransformA = ComplexTransform::kNone;
+  static ComplexTransform const kTransformB = ComplexTransform::kNone;
+  static bool const kSplitKSerial = SplitKSerial;
+
+  using UnderlyingOperator = Gemm< 
+    ElementB,
+    typename layout::LayoutTranspose<LayoutB>::type,
+    ElementA,
+    typename layout::LayoutTranspose<LayoutA>::type,
+    ElementC,
+    layout::RowMajor,    
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    kAlignmentB,
+    kAlignmentA,
+    SplitKSerial,
+    Operator,
+    GatherB,
+    GatherA,
+    ScatterD,
+    PermuteDLayout
+  >;
+
+  using UnderlyingArguments = typename UnderlyingOperator::Arguments;
+  using GemmKernel = typename UnderlyingOperator::GemmKernel;
+  static int const kAlignmentC = UnderlyingOperator::kAlignmentC;
+
+  /// Argument structure
+  struct Arguments {
+
+    //
+    // Data members
+    //
+
+    GemmCoord problem_size;
+    TensorRef<ElementA const, LayoutA> ref_A;
+    TensorRef<ElementB const, LayoutB> ref_B;
+    TensorRef<ElementC const, LayoutC> ref_C;
+    TensorRef<ElementC, LayoutC> ref_D;
+    typename EpilogueOutputOp::Params epilogue;
+    int split_k_slices;
+    // For gather+scatter operations
+    int *gather_A_indices;
+    int *gather_B_indices;
+    int *scatter_D_indices;
+
+    //
+    // Methods
+    //
+
+    /// Default ctor: mirrors the defaults of the parameterized ctor (one k-slice, no gather/scatter index arrays)
+    CUTLASS_HOST_DEVICE
+    Arguments(): split_k_slices(1), gather_A_indices(nullptr), gather_B_indices(nullptr), scatter_D_indices(nullptr) { }
+
+    /// Constructs an Arguments structure 
+    CUTLASS_HOST_DEVICE
+    Arguments(
+      GemmCoord problem_size_,
+      TensorRef<ElementA const, LayoutA> ref_A_,
+      TensorRef<ElementB const, LayoutB> ref_B_,
+      TensorRef<ElementC const, LayoutC> ref_C_,
+      TensorRef<ElementC, LayoutC> ref_D_,
+      typename EpilogueOutputOp::Params epilogue_ = 
+        typename EpilogueOutputOp::Params(),
+      int split_k_slices = 1,
+      int *gather_A_indices_ = nullptr,
+      int *gather_B_indices_ = nullptr,
+      int *scatter_D_indices_ = nullptr
+    ):
+      problem_size(problem_size_),
+      ref_A(ref_A_),
+      ref_B(ref_B_),
+      ref_C(ref_C_),
+      ref_D(ref_D_),
+      epilogue(epilogue_),
+      split_k_slices(split_k_slices),
+      gather_A_indices(gather_A_indices_),
+      gather_B_indices(gather_B_indices_),
+      scatter_D_indices(scatter_D_indices_) { }
+  };
+
+private:
+
+  UnderlyingOperator underlying_operator_;
+
+public:
+
+  /// Constructs the GEMM.
+  Gemm() { }
+
+  /// Helper to construct a transposed equivalent for the underlying GEMM operator: swaps m with n and exchanges the A/B operands and their gather indices
+  static UnderlyingArguments to_underlying_arguments(Arguments const &args) {
+    return UnderlyingArguments(
+      {args.problem_size.n(), args.problem_size.m(), args.problem_size.k()},
+      {args.ref_B.data(), args.ref_B.stride(0)},
+      {args.ref_A.data(), args.ref_A.stride(0)},
+      {args.ref_C.data(), args.ref_C.stride(0)},
+      {args.ref_D.data(), args.ref_D.stride(0)},
+      args.epilogue,
+      args.split_k_slices,
+      args.gather_B_indices,
+      args.gather_A_indices,
+      args.scatter_D_indices
+    );
+  }
+
+  /// Determines whether the GEMM can execute the given problem by checking its transposed form against the underlying operator.
+  static Status can_implement(Arguments const &args) {
+
+    return UnderlyingOperator::can_implement(to_underlying_arguments(args));
+  }
+
+  /// Gets the workspace size, as reported by the underlying operator for the transposed arguments
+  static size_t get_workspace_size(Arguments const &args) {
+    
+    return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args));
+  }
+
+  /// Initializes GEMM state from arguments; `workspace` and `stream` are both forwarded to the underlying operator.
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+
+    return underlying_operator_.initialize(to_underlying_arguments(args), workspace, stream);
+  }
+
+  /// Lightweight update given a subset of arguments; delegates to the underlying operator with the transposed arguments
+  Status update(Arguments const &args, void *workspace = nullptr) {
+
+    return underlying_operator_.update(to_underlying_arguments(args), workspace);
+  }
+
+  /// Runs the kernel using initialized state by delegating to the underlying operator's run() on `stream`.
+  Status run(cudaStream_t stream = nullptr) {
+
+    return underlying_operator_.run(stream);
+  }
+
+  /// Function-call shorthand for run(): runs the kernel on `stream` using initialized state.
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Initializes state from `args` and, only if that succeeds, runs the kernel on `stream`; returns the first non-success status.
+  Status operator()(
+    Arguments const &args, 
+    void *workspace = nullptr, 
+    cudaStream_t stream = nullptr) {
+    
+    Status status = initialize(args, workspace, stream);
+    
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+
+    return status;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_array.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_array.h
new file mode 100755
index 000000000..1ae2db467
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_array.h
@@ -0,0 +1,738 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a pipelined GEMM kernel operating on arrays of per-batch operand pointers. Does not support split-K.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/device_kernel.h"
+
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/gemm/kernel/gemm_array.h"
+
+#include "cutlass/gemm/kernel/default_gemm.h"
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace device {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/*! Gemm device-level operator. This is an interface to efficient CUTLASS GEMM kernels that may
+  be invoked from host code.
+
+  The contributions of this class are:
+    
+    1. At compile time, it maps data types and high-level structural parameters onto 
+       specific CUTLASS components.
+
+    2. At runtime, it maps logical arguments to GEMM problems to kernel parameters.
+
+    3. At runtime, it launches kernels on the device.
+
+  The intent is to provide a convenient mechanism for interacting with most plausible GEMM
+  configurations for each supported architecture. Consequently, not all parameters are exposed
+  to the top-level interface. Rather, sensible defaults at each level of the CUTLASS hierarchy
+  are selected to tradeoff simplicity of the interface with flexibility. We expect 
+  most configurations to be specified at this level. Applications with more exotic requirements 
+  may construct their kernels of interest using CUTLASS components at the threadblock, warp, 
+  and thread levels of abstraction.
+
+  CUTLASS exposes computations using the functor design pattern in which objects compose some
+  internal state with an overloaded function call operator. This enables decoupling of
+  initialization from execution, possibly reducing overhead during steady state phases of
+  application execution.
+
+  CUTLASS device-level operators expose an Arguments structure encompassing each logical
+  input to the computation. This is distinct from the kernel-level Params structure pattern
+  which contains application-specific precomputed state needed by the device code.
+
+  Example of a CUTLASS GEMM operator implementing the functionality of cuBLAS's SGEMM NN
+  is as follows:
+
+    //
+    // Instantiate the CUTLASS GEMM operator.
+    //
+
+    cutlass::gemm::device::Gemm<
+      float,
+      cutlass::layout::ColumnMajor,
+      float,
+      cutlass::layout::ColumnMajor,
+      float,
+      cutlass::layout::ColumnMajor
+    > gemm_op;
+
+    //
+    // Launch the GEMM operation on the device
+    //
+
+    cutlass::Status status = gemm_op({
+      {m, n, k},                          // GemmCoord problem_size,
+      {A, lda},                           // TensorRef<float, layout::ColumnMajor> ref_A,
+      {B, ldb},                           // TensorRef<float, layout::ColumnMajor> ref_B,
+      {C, ldc},                           // TensorRef<float, layout::ColumnMajor> ref_C,
+      {D, ldd},                           // TensorRef<float, layout::ColumnMajor> ref_D,
+      {alpha, beta}                       // EpilogueOutputOp::Params epilogue_op_params
+    });
+
+
+  A simplified view of the template is listed below.
+
+    template <
+      /// Element type for A matrix operand
+      typename ElementA,
+      
+      /// Layout type for A matrix operand
+      typename LayoutA,
+      
+      /// Element type for B matrix operand
+      typename ElementB,
+      
+      /// Layout type for B matrix operand
+      typename LayoutB,
+      
+      /// Element type for C and D matrix operands
+      typename ElementC,
+      
+      /// Layout type for C and D matrix operands
+      typename LayoutC,
+      
+      /// Element type for internal accumulation
+      typename ElementAccumulator,
+
+      /// Operator class tag
+      typename OperatorClass,
+      
+      /// Tag indicating architecture to tune for.  This is the minimum SM that
+      /// supports the intended feature. The device kernel can be built
+      /// targeting any SM larger than this number.
+      typename ArchTag,
+      
+      /// Threadblock-level tile size (concept: GemmShape)
+      typename ThreadblockShape,
+      
+      /// Warp-level tile size (concept: GemmShape)
+      typename WarpShape,
+      
+      /// Instruction-level tile size (concept: GemmShape)
+      typename InstructionShape,
+      
+      /// Epilogue output operator
+      typename EpilogueOutputOp,
+      
+      /// Threadblock-level swizzling operator
+      typename ThreadblockSwizzle,
+      
+      /// Number of stages used in the pipelined mainloop
+      int Stages
+    >
+    class Gemm;
+*/
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_ = ElementC_,
+    /// Operator class tag
+    typename OperatorClass_ = arch::OpClassSimt,
+    /// Tag indicating architecture to tune for
+    typename ArchTag_ = arch::Sm70,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_ = threadblock::GemmBatchedIdentityThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kStages,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentA,
+    /// Access granularity of B matrix in units of elements
+    int AlignmentB =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentB,
+    /// Operation performed by GEMM
+    typename Operator_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::Operator
+>
+class GemmArray {
+ public:
+
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  using TensorRefA = TensorRef<ElementA const, LayoutA>;
+  using ElementB = ElementB_;
+  using LayoutB = LayoutB_;
+  using TensorRefB = TensorRef<ElementB const, LayoutB>;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using TensorRefC = TensorRef<ElementC const, LayoutC>;
+  using TensorRefD = TensorRef<ElementC, LayoutC>;
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentB = AlignmentB;
+  static int const kAlignmentC = EpilogueOutputOp::kCount;
+  using Operator = Operator_;
+
+  /// Define the kernel
+  using DefaultGemmKernel = typename kernel::DefaultGemm<
+    ElementA,
+    LayoutA,
+    kAlignmentA,
+    ElementB,
+    LayoutB,
+    kAlignmentB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    kStages,
+    false,
+    Operator
+  >::GemmKernel;
+
+  using GemmKernel = kernel::GemmArray<typename DefaultGemmKernel::Mma, typename DefaultGemmKernel::Epilogue, ThreadblockSwizzle>;
+
+  /// Argument structure
+  struct Arguments {
+
+    //
+    // Data members
+    //
+
+    GemmCoord problem_size;
+
+    ElementA const * const *ptr_A;
+    LayoutA layout_A;
+
+    ElementB const * const *ptr_B;
+    LayoutB layout_B;
+
+    ElementC const * const *ptr_C;
+    LayoutC layout_C;
+
+    ElementC * const * ptr_D;
+    LayoutC layout_D;
+    
+    typename EpilogueOutputOp::Params epilogue;
+    int batch_count;
+
+    //
+    // Methods
+    //
+
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Arguments() { }
+
+    /// Constructs an Arguments structure 
+    CUTLASS_HOST_DEVICE
+    Arguments(
+      GemmCoord problem_size_,
+      ElementA const * const *ptr_A_,
+      LayoutA layout_A_,
+      ElementB const * const *ptr_B_,
+      LayoutB layout_B_,
+      ElementC const * const *ptr_C_,
+      LayoutC layout_C_,
+      ElementC * const * ptr_D_,
+      LayoutC layout_D_,
+      typename EpilogueOutputOp::Params epilogue_,
+      int batch_count_
+    ):
+      problem_size(problem_size_),
+      ptr_A(ptr_A_),
+      layout_A(layout_A_),
+      ptr_B(ptr_B_),
+      layout_B(layout_B_),
+      ptr_C(ptr_C_),
+      layout_C(layout_C_),
+      ptr_D(ptr_D_),
+      layout_D(layout_D_),
+      epilogue(epilogue_),
+      batch_count(batch_count_) { }
+  };
+
+private:
+
+  /// Kernel parameters object
+  typename GemmKernel::Params params_;
+
+public:
+
+  /// Constructs the GEMM.
+  GemmArray() { }
+
+  /// Determines whether the GEMM can execute the given problem: stride(0) of the A, B, C and D layouts must each be a multiple of the matching alignment (C and D both use kAlignmentC).
+  static Status can_implement(Arguments const &args) {
+
+    if (args.layout_A.stride(0) % kAlignmentA) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (args.layout_B.stride(0) % kAlignmentB) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (args.layout_C.stride(0) % kAlignmentC) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (args.layout_D.stride(0) % kAlignmentC) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    return Status::kSuccess;
+  }
+
+  /// Gets the workspace size; this operator always returns 0 regardless of `args`
+  static size_t get_workspace_size(Arguments const &args) {
+    return 0;
+  }
+
+  /// Initializes GEMM state from arguments: computes the tiled grid shape and rebuilds params_; `workspace` and `stream` are not used here and the result is always kSuccess.
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+
+    // Determine grid shape
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord grid_shape = threadblock_swizzle.get_tiled_shape(
+      args.problem_size,
+      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+      args.batch_count);
+
+    // Initialize the Params structure
+    params_ = typename GemmKernel::Params{
+      args.problem_size,
+      grid_shape,
+      args.ptr_A,
+      args.layout_A,
+      args.ptr_B,
+      args.layout_B,
+      args.ptr_C,
+      args.layout_C,
+      args.ptr_D,
+      args.layout_D,
+      args.epilogue,
+      args.batch_count
+    };
+
+    return Status::kSuccess;
+  }
+
+  /// Lightweight update given a subset of arguments: recomputes the tiled grid shape and rebuilds params_ (`workspace` is unused); always returns kSuccess.
+  Status update(Arguments const &args, void *workspace = nullptr) {
+    // Determine grid shape (tile shape precedes batch count, exactly as in initialize())
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord grid_shape = threadblock_swizzle.get_tiled_shape(
+      args.problem_size,
+      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+      args.batch_count);
+
+    params_ = typename GemmKernel::Params{
+      args.problem_size,
+      grid_shape,
+      args.ptr_A,
+      args.layout_A,
+      args.ptr_B,
+      args.layout_B,
+      args.ptr_C,
+      args.layout_C,
+      args.ptr_D,
+      args.layout_D,
+      args.epilogue,
+      args.batch_count
+    };
+
+    return Status::kSuccess;
+  }
+
+  /// Runs the kernel using initialized state: launches it on `stream`, first setting the max dynamic shared-memory attribute when SharedStorage is 48 KiB or larger; any CUDA error maps to kErrorInternal.
+  Status run(cudaStream_t stream = nullptr) {
+
+    ThreadblockSwizzle threadblock_swizzle;
+
+    dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
+    dim3 block(GemmKernel::kThreadCount, 1, 1);
+
+    cudaError_t result;
+
+    int smem_size = int(sizeof(typename GemmKernel::SharedStorage));
+    if (smem_size >= (48 << 10)) {
+      result = cudaFuncSetAttribute(Kernel<GemmKernel>,
+                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                    smem_size);
+
+      if (result != cudaSuccess) {
+        return Status::kErrorInternal;
+      }
+    }
+
+    cutlass::arch::synclog_setup();
+    cutlass::Kernel<GemmKernel><<<grid, block, smem_size, stream>>>(params_);
+
+    result = cudaGetLastError();
+
+    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
+  }
+
+  /// Function-call shorthand for run(): runs the kernel on `stream` using initialized state.
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Initializes state from `args` (forwarding `workspace` and `stream`) and, only if that succeeds, runs the kernel on `stream`.
+  Status operator()(
+    Arguments const &args, 
+    void *workspace = nullptr, 
+    cudaStream_t stream = nullptr) {
+    
+    Status status = initialize(args, workspace, stream);
+    
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+
+    return status;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for column-major output exchanges problem size and operand.
+template <
+  /// Element type for A matrix operand
+  typename ElementA_,
+  /// Layout type for A matrix operand
+  typename LayoutA_,
+  /// Element type for B matrix operand
+  typename ElementB_,
+  /// Layout type for B matrix operand
+  typename LayoutB_,
+  /// Element type for C and D matrix operands
+  typename ElementC_,
+  /// Element type for internal accumulation
+  typename ElementAccumulator_,
+  /// Operator class tag
+  typename OperatorClass_,
+  /// Tag indicating architecture to tune for
+  typename ArchTag_,
+  /// Threadblock-level tile size (concept: GemmShape)
+  typename ThreadblockShape_,
+  /// Warp-level tile size (concept: GemmShape)
+  typename WarpShape_,
+  /// Instruction-level tile size (concept: GemmShape)
+  typename InstructionShape_,
+  /// Epilogue output operator
+  typename EpilogueOutputOp_,
+  /// Threadblock-level swizzling operator
+  typename ThreadblockSwizzle_,
+  /// Number of stages used in the pipelined mainloop
+  int Stages,
+  /// Access granularity of A matrix in units of elements
+  int AlignmentA,
+  /// Access granularity of B matrix in units of elements
+  int AlignmentB,
+  typename Operator_
+>
+class GemmArray<
+  ElementA_,
+  LayoutA_,
+  ElementB_,
+  LayoutB_,
+  ElementC_,
+  layout::ColumnMajor,
+  ElementAccumulator_,
+  OperatorClass_,
+  ArchTag_,
+  ThreadblockShape_,
+  WarpShape_,
+  InstructionShape_,
+  EpilogueOutputOp_,
+  ThreadblockSwizzle_,
+  Stages,
+  AlignmentA,
+  AlignmentB,
+  Operator_
+> {
+public:
+
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  using TensorRefA = TensorRef<ElementA const, LayoutA>;
+  using ElementB = ElementB_;
+  using LayoutB = LayoutB_;
+  using TensorRefB = TensorRef<ElementB const, LayoutB>;
+  using ElementC = ElementC_;
+  using LayoutC = layout::ColumnMajor;
+  using TensorRefC = TensorRef<ElementC const, LayoutC>;
+  using TensorRefD = TensorRef<ElementC, LayoutC>;
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  static int const kStages = Stages;
+
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentB = AlignmentB;
+  static int const kAlignmentC = EpilogueOutputOp::kCount;
+  static bool const kSplitKSerial = false;
+
+  // Row-major operator for the transposed problem: A/B (and their alignments) are exchanged, and Operator_ is forwarded unchanged.
+  using UnderlyingOperator = GemmArray< 
+    ElementB,
+    typename layout::LayoutTranspose<LayoutB>::type,
+    ElementA,
+    typename layout::LayoutTranspose<LayoutA>::type,
+    ElementC,
+    layout::RowMajor,    
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    kAlignmentB, kAlignmentA,
+    Operator_
+  >;
+
+  using UnderlyingArguments = typename UnderlyingOperator::Arguments;
+  using GemmKernel = typename UnderlyingOperator::GemmKernel;
+
+  /// Argument structure
+  struct Arguments {
+
+    //
+    // Data members
+    //
+
+    GemmCoord problem_size;
+
+    ElementA const * const *ptr_A;
+    LayoutA layout_A;
+
+    ElementB const * const *ptr_B;
+    LayoutB layout_B;
+
+    ElementC const * const *ptr_C;
+    LayoutC layout_C;
+
+    ElementC * const * ptr_D;
+    LayoutC layout_D;
+    
+    typename EpilogueOutputOp::Params epilogue;
+    int batch_count;
+
+    //
+    // Methods
+    //
+
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Arguments() { }
+
+    /// Constructs an Arguments structure 
+    CUTLASS_HOST_DEVICE
+    Arguments(
+      GemmCoord problem_size_,
+      ElementA const * const *ptr_A_,
+      LayoutA layout_A_,
+      ElementB const * const *ptr_B_,
+      LayoutB layout_B_,
+      ElementC const * const *ptr_C_,
+      LayoutC layout_C_,
+      ElementC * const * ptr_D_,
+      LayoutC layout_D_,
+      typename EpilogueOutputOp::Params epilogue_,
+      int batch_count_
+    ):
+      problem_size(problem_size_),
+      ptr_A(ptr_A_),
+      layout_A(layout_A_),
+      ptr_B(ptr_B_),
+      layout_B(layout_B_),
+      ptr_C(ptr_C_),
+      layout_C(layout_C_),
+      ptr_D(ptr_D_),
+      layout_D(layout_D_),
+      epilogue(epilogue_),
+      batch_count(batch_count_) { }
+  };
+
+private:
+
+  UnderlyingOperator underlying_operator_;
+
+public:
+
+  /// Constructs the GEMM.
+  GemmArray() { }
+
+  /// Helper to construct a transposed equivalent for the underlying GEMM operator: swaps m with n and exchanges the A/B pointer arrays and strides
+  static UnderlyingArguments to_underlying_arguments(Arguments const &args) {
+
+    GemmCoord problem_size{
+      args.problem_size.n(), 
+      args.problem_size.m(), 
+      args.problem_size.k()
+    };
+
+    return UnderlyingArguments(
+      problem_size,
+      args.ptr_B,
+      args.layout_B.stride(),
+      args.ptr_A,
+      args.layout_A.stride(),
+      args.ptr_C,
+      args.layout_C.stride(),
+      args.ptr_D,
+      args.layout_D.stride(),
+      args.epilogue,
+      args.batch_count
+    );
+  }
+
+  /// Determines whether the GEMM can execute the given problem by checking its transposed form against the underlying operator.
+  static Status can_implement(Arguments const &args) {
+
+    return UnderlyingOperator::can_implement(to_underlying_arguments(args));
+  }
+
+  /// Gets the workspace size, as reported by the underlying operator for the transposed arguments
+  static size_t get_workspace_size(Arguments const &args) {
+    
+    return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args));
+  }
+
+  /// Initializes GEMM state from arguments; `workspace` and `stream` are both forwarded to the underlying operator.
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+
+    return underlying_operator_.initialize(to_underlying_arguments(args), workspace, stream);
+  }
+
+  /// Lightweight update given a subset of arguments; delegates to the underlying operator with the transposed arguments
+  Status update(Arguments const &args, void *workspace = nullptr) {
+
+    return underlying_operator_.update(to_underlying_arguments(args), workspace);
+  }
+
+  /// Runs the kernel using initialized state by delegating to the underlying operator's run() on `stream`.
+  Status run(cudaStream_t stream = nullptr) {
+
+    return underlying_operator_.run(stream);
+  }
+
+  /// Function-call shorthand for run(): runs the kernel on `stream` using initialized state.
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Initializes state from `args` and, only if that succeeds, runs the kernel on `stream`; returns the first non-success status.
+  Status operator()(
+    Arguments const &args, 
+    void *workspace = nullptr, 
+    cudaStream_t stream = nullptr) {
+    
+    Status status = initialize(args, workspace, stream);
+    
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+
+    return status;
+  }
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_batched.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_batched.h
new file mode 100755
index 000000000..5981457c7
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_batched.h
@@ -0,0 +1,704 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a pipelined batch GEMM kernel.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/device_kernel.h"
+
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/gemm/kernel/gemm_batched.h"
+
+#include "cutlass/gemm/kernel/default_gemm.h"
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace device {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/*! Gemm device-level operator. This is an interface to efficient CUTLASS GEMM kernels that may
+  be invoked from host code.
+
+  The contributions of this class are:
+    
+    1. At compile time, it maps data types and high-level structural parameters onto 
+       specific CUTLASS components.
+
+    2. At runtime, it maps logical arguments to GEMM problems to kernel parameters.
+
+    3. At runtime, it launches kernels on the device.
+
+  The intent is to provide a convenient mechanism for interacting with most plausible GEMM
+  configurations for each supported architecture. Consequently, not all parameters are exposed
+  to the top-level interface. Rather, sensible defaults at each level of the CUTLASS hierarchy
+  are selected to tradeoff simplicity of the interface with flexibility. We expect 
+  most configurations to be specified at this level. Applications with more exotic requirements 
+  may construct their kernels of interest using CUTLASS components at the threadblock, warp, 
+  and thread levels of abstraction.
+
+  CUTLASS exposes computations using the functor design pattern in which objects compose some
+  internal state with an overloaded function call operator. This enables decoupling of
+  initialization from execution, possibly reducing overhead during steady state phases of
+  application execution.
+
+  CUTLASS device-level operators expose an Arguments structure encompassing each logical
+  input to the computation. This is distinct from the kernel-level Params structure pattern
+  which contains application-specific precomputed state needed by the device code.
+
+  Example of a CUTLASS GEMM operator implementing the functionality of cuBLAS's SGEMM NN
+  is as follows:
+
+    //
+    // Instantiate the CUTLASS GEMM operator.
+    //
+
+    cutlass::gemm::device::Gemm<
+      float,
+      cutlass::layout::ColumnMajor,
+      float,
+      cutlass::layout::ColumnMajor,
+      float,
+      cutlass::layout::ColumnMajor
+    > gemm_op;
+
+    //
+    // Launch the GEMM operation on the device
+    //
+
+    cutlass::Status status = gemm_op({
+      {m, n, k},                          // GemmCoord problem_size,
+      {A, lda},                           // TensorRef<float, layout::ColumnMajor> ref_A,
+      {B, ldb},                           // TensorRef<float, layout::ColumnMajor> ref_B,
+      {C, ldc},                           // TensorRef<float, layout::ColumnMajor> ref_C,
+      {D, ldd},                           // TensorRef<float, layout::ColumnMajor> ref_D,
+      {alpha, beta}                       // EpilogueOutputOp::Params epilogue_op_params
+    });
+
+
+  A simplified view of the template is listed below.
+
+    template <
+      /// Element type for A matrix operand
+      typename ElementA,
+      
+      /// Layout type for A matrix operand
+      typename LayoutA,
+      
+      /// Element type for B matrix operand
+      typename ElementB,
+      
+      /// Layout type for B matrix operand
+      typename LayoutB,
+      
+      /// Element type for C and D matrix operands
+      typename ElementC,
+      
+      /// Layout type for C and D matrix operands
+      typename LayoutC,
+      
+      /// Element type for internal accumulation
+      typename ElementAccumulator,
+
+      /// Operator class tag
+      typename OperatorClass,
+      
+      /// Tag indicating architecture to tune for.  This is the minimum SM that
+      /// supports the intended feature. The device kernel can be built
+      /// targeting any SM larger than this number.
+      typename ArchTag,
+      
+      /// Threadblock-level tile size (concept: GemmShape)
+      typename ThreadblockShape,
+      
+      /// Warp-level tile size (concept: GemmShape)
+      typename WarpShape,
+      
+      /// Instruction-level tile size (concept: GemmShape)
+      typename InstructionShape,
+      
+      /// Epilogue output operator
+      typename EpilogueOutputOp,
+      
+      /// Threadblock-level swizzling operator
+      typename ThreadblockSwizzle,
+      
+      /// Number of stages used in the pipelined mainloop
+      int Stages
+    >
+    class Gemm;
+*/
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_ = ElementC_,
+    /// Operator class tag
+    typename OperatorClass_ = arch::OpClassSimt,
+    /// Tag indicating architecture to tune for
+    typename ArchTag_ = arch::Sm70,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_ = threadblock::GemmBatchedIdentityThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kStages,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentA,
+    /// Access granularity of B matrix in units of elements
+    int AlignmentB =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentB,
+    /// Operation performed by GEMM
+    typename Operator_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::Operator
+>
+class GemmBatched {
+ public:
+
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  using TensorRefA = TensorRef<ElementA const, LayoutA>;
+  using ElementB = ElementB_;
+  using LayoutB = LayoutB_;
+  using TensorRefB = TensorRef<ElementB const, LayoutB>;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using TensorRefC = TensorRef<ElementC const, LayoutC>;
+  using TensorRefD = TensorRef<ElementC, LayoutC>;
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentB = AlignmentB;
+  static int const kAlignmentC = EpilogueOutputOp::kCount;
+  using Operator = Operator_;
+
+  /// Default GEMM kernel for these parameters; only its Mma and Epilogue types are reused below.
+  using DefaultGemmKernel = typename kernel::DefaultGemm<
+    ElementA,
+    LayoutA,
+    kAlignmentA,
+    ElementB,
+    LayoutB,
+    kAlignmentB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    kStages,
+    false,
+    Operator
+  >::GemmKernel;
+  /// Batched kernel assembled from the default kernel's Mma and Epilogue.
+  using GemmKernel = kernel::GemmBatched<typename DefaultGemmKernel::Mma, typename DefaultGemmKernel::Epilogue, ThreadblockSwizzle>;
+
+  /// Host-side arguments: problem size, operand references with a stride each, epilogue params, batch count.
+  struct Arguments {
+
+    //
+    // Data members
+    //
+
+    GemmCoord problem_size;
+    TensorRef<ElementA const, LayoutA> ref_A;
+    int64_t stride_A;
+    TensorRef<ElementB const, LayoutB> ref_B;
+    int64_t stride_B;
+    TensorRef<ElementC const, LayoutC> ref_C;
+    int64_t stride_C;
+    TensorRef<ElementC, LayoutC> ref_D;
+    int64_t stride_D;
+    typename EpilogueOutputOp::Params epilogue;
+    int batch_count;
+
+    //
+    // Methods
+    //
+
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Arguments() { }
+
+    /// Constructs an Arguments structure from one value per data member
+    CUTLASS_HOST_DEVICE
+    Arguments(
+      GemmCoord problem_size_,
+      TensorRef<ElementA const, LayoutA> ref_A_,
+      int64_t stride_A_,
+      TensorRef<ElementB const, LayoutB> ref_B_,
+      int64_t stride_B_,
+      TensorRef<ElementC const, LayoutC> ref_C_,
+      int64_t stride_C_,
+      TensorRef<ElementC, LayoutC> ref_D_,
+      int64_t stride_D_,
+      typename EpilogueOutputOp::Params epilogue_,
+      int batch_count_
+    ):
+      problem_size(problem_size_),
+      ref_A(ref_A_),
+      stride_A(stride_A_),
+      ref_B(ref_B_),
+      stride_B(stride_B_),
+      ref_C(ref_C_),
+      stride_C(stride_C_),
+      ref_D(ref_D_),
+      stride_D(stride_D_),
+      epilogue(epilogue_),
+      batch_count(batch_count_) { }
+  };
+
+private:
+
+  /// Kernel parameters object
+  typename GemmKernel::Params params_;
+
+public:
+
+  /// Constructs the GEMM.
+  GemmBatched() { }
+
+  /// Determines whether the GEMM can execute the given problem.
+  static Status can_implement(Arguments const &args) {
+    // Each operand's TensorRef and its stride must satisfy that operand's alignment.
+    if (!TensorRef_aligned(args.ref_A, kAlignmentA) || (args.stride_A % kAlignmentA)) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (!TensorRef_aligned(args.ref_B, kAlignmentB) || (args.stride_B % kAlignmentB)) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (!TensorRef_aligned(args.ref_C, kAlignmentC) || (args.stride_C % kAlignmentC)) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (!TensorRef_aligned(args.ref_D, kAlignmentC) || (args.stride_D % kAlignmentC)) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    return Status::kSuccess;
+  }
+
+  /// Gets the workspace size (always zero for this operator)
+  static size_t get_workspace_size(Arguments const &args) {
+    return 0;
+  }
+
+  /// Initializes GEMM state from arguments.
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+    // Note: `workspace` and `stream` are not used by this function.
+    // Determine grid shape
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord grid_shape = threadblock_swizzle.get_tiled_shape(
+      args.problem_size,
+      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+      args.batch_count);
+
+    // Initialize the Params structure
+    params_ = typename GemmKernel::Params{
+      args.problem_size,
+      grid_shape,
+      args.ref_A.non_const_ref(),
+      args.stride_A,
+      args.ref_B.non_const_ref(),
+      args.stride_B,
+      args.ref_C.non_const_ref(),
+      args.stride_C,
+      args.ref_D,
+      args.stride_D,
+      args.epilogue,
+      args.batch_count
+    };
+
+    return Status::kSuccess;
+  }
+
+  /// Lightweight update given a subset of arguments
+  Status update(Arguments const &args, void *workspace = nullptr) {
+    // Only the A, B, C and D data pointers are taken from `args`; nothing else is read.
+    params_.ref_A.reset(args.ref_A.non_const_ref().data());
+    params_.ref_B.reset(args.ref_B.non_const_ref().data());
+    params_.ref_C.reset(args.ref_C.non_const_ref().data());
+    params_.ref_D.reset(args.ref_D.data());
+
+    return Status::kSuccess;
+  }
+
+  /// Runs the kernel using initialized state.
+  Status run(cudaStream_t stream = nullptr) {
+
+    ThreadblockSwizzle threadblock_swizzle;
+
+    dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
+    dim3 block(GemmKernel::kThreadCount, 1, 1);
+
+    cudaError_t result;
+    // If the shared storage is 48 KiB or larger, raise the kernel's dynamic shared-memory limit to match.
+    int smem_size = int(sizeof(typename GemmKernel::SharedStorage));
+    if (smem_size >= (48 << 10)) {
+      result = cudaFuncSetAttribute(Kernel<GemmKernel>,
+                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                    smem_size);
+
+      if (result != cudaSuccess) {
+        return Status::kErrorInternal;
+      }
+    }
+
+    cutlass::arch::synclog_setup();
+    cutlass::Kernel<GemmKernel><<<grid, block, smem_size, stream>>>(params_);
+    // Any CUDA error pending after the launch is reported as an internal error.
+    result = cudaGetLastError();
+
+    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Initializes the kernel state from `args`, then runs it on `stream`.
+  Status operator()(
+    Arguments const &args,
+    void *workspace = nullptr,
+    cudaStream_t stream = nullptr) {
+    // Pass `stream` along as well, matching the column-major specialization of this class.
+    Status status = initialize(args, workspace, stream);
+
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+
+    return status;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for column-major output: swaps A/B and M/N, then runs the row-major operator.
+template <
+  /// Element type for A matrix operand
+  typename ElementA_,
+  /// Layout type for A matrix operand
+  typename LayoutA_,
+  /// Element type for B matrix operand
+  typename ElementB_,
+  /// Layout type for B matrix operand
+  typename LayoutB_,
+  /// Element type for C and D matrix operands
+  typename ElementC_,
+  /// Element type for internal accumulation
+  typename ElementAccumulator_,
+  /// Operator class tag
+  typename OperatorClass_,
+  /// Tag indicating architecture to tune for
+  typename ArchTag_,
+  /// Threadblock-level tile size (concept: GemmShape)
+  typename ThreadblockShape_,
+  /// Warp-level tile size (concept: GemmShape)
+  typename WarpShape_,
+  /// Instruction-level tile size (concept: GemmShape)
+  typename InstructionShape_,
+  /// Epilogue output operator
+  typename EpilogueOutputOp_,
+  /// Threadblock-level swizzling operator
+  typename ThreadblockSwizzle_,
+  /// Number of stages used in the pipelined mainloop
+  int Stages,
+  /// Access granularity of A matrix in units of elements
+  int AlignmentA,
+  /// Access granularity of B matrix in units of elements
+  int AlignmentB,
+  typename Operator_
+>
+class GemmBatched<
+  ElementA_,
+  LayoutA_,
+  ElementB_,
+  LayoutB_,
+  ElementC_,
+  layout::ColumnMajor,
+  ElementAccumulator_,
+  OperatorClass_,
+  ArchTag_,
+  ThreadblockShape_,
+  WarpShape_,
+  InstructionShape_,
+  EpilogueOutputOp_,
+  ThreadblockSwizzle_,
+  Stages,
+  AlignmentA,
+  AlignmentB,
+  Operator_
+> {
+public:
+
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  using TensorRefA = TensorRef<ElementA const, LayoutA>;
+  using ElementB = ElementB_;
+  using LayoutB = LayoutB_;
+  using TensorRefB = TensorRef<ElementB const, LayoutB>;
+  using ElementC = ElementC_;
+  using LayoutC = layout::ColumnMajor;
+  using TensorRefC = TensorRef<ElementC const, LayoutC>;
+  using TensorRefD = TensorRef<ElementC, LayoutC>;
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentB = AlignmentB;
+  static int const kAlignmentC = EpilogueOutputOp::kCount;
+  static bool const kSplitKSerial = false;
+  using Operator = Operator_;
+
+  /// Row-major operator on the transposed problem: A and B (and their alignments) trade places.
+  using UnderlyingOperator = GemmBatched<
+    ElementB,
+    typename layout::LayoutTranspose<LayoutB>::type,
+    ElementA,
+    typename layout::LayoutTranspose<LayoutA>::type,
+    ElementC,
+    layout::RowMajor,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    kAlignmentB,
+    kAlignmentA,
+    Operator
+  >;
+
+  using UnderlyingArguments = typename UnderlyingOperator::Arguments;
+  using GemmKernel = typename UnderlyingOperator::GemmKernel;
+
+  /// Argument structure
+  struct Arguments {
+
+    //
+    // Data members
+    //
+
+    GemmCoord problem_size;
+    TensorRef<ElementA const, LayoutA> ref_A;
+    int64_t stride_A;
+    TensorRef<ElementB const, LayoutB> ref_B;
+    int64_t stride_B;
+    TensorRef<ElementC const, LayoutC> ref_C;
+    int64_t stride_C;
+    TensorRef<ElementC, LayoutC> ref_D;
+    int64_t stride_D;
+    typename EpilogueOutputOp::Params epilogue;
+    int batch_count;
+
+    //
+    // Methods
+    //
+
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Arguments() { }
+
+    /// Constructs an Arguments structure from one value per data member
+    CUTLASS_HOST_DEVICE
+    Arguments(
+      GemmCoord problem_size_,
+      TensorRef<ElementA const, LayoutA> ref_A_,
+      int64_t stride_A_,
+      TensorRef<ElementB const, LayoutB> ref_B_,
+      int64_t stride_B_,
+      TensorRef<ElementC const, LayoutC> ref_C_,
+      int64_t stride_C_,
+      TensorRef<ElementC, LayoutC> ref_D_,
+      int64_t stride_D_,
+      typename EpilogueOutputOp::Params epilogue_,
+      int batch_count_
+    ):
+      problem_size(problem_size_),
+      ref_A(ref_A_),
+      stride_A(stride_A_),
+      ref_B(ref_B_),
+      stride_B(stride_B_),
+      ref_C(ref_C_),
+      stride_C(stride_C_),
+      ref_D(ref_D_),
+      stride_D(stride_D_),
+      epilogue(epilogue_),
+      batch_count(batch_count_) { }
+  };
+
+private:
+
+  UnderlyingOperator underlying_operator_;
+
+public:
+
+  /// Constructs the GEMM.
+  GemmBatched() { }
+
+  /// Builds the underlying operator's arguments: M and N are exchanged and B is passed ahead of A.
+  static UnderlyingArguments to_underlying_arguments(Arguments const &args) {
+    return UnderlyingArguments(
+      {args.problem_size.n(), args.problem_size.m(), args.problem_size.k()},
+      {args.ref_B.data(), args.ref_B.stride(0)},
+      args.stride_B,
+      {args.ref_A.data(), args.ref_A.stride(0)},
+      args.stride_A,
+      {args.ref_C.data(), args.ref_C.stride(0)},
+      args.stride_C,
+      {args.ref_D.data(), args.ref_D.stride(0)},
+      args.stride_D,
+      args.epilogue,
+      args.batch_count
+    );
+  }
+
+  /// Determines whether the GEMM can execute the given problem.
+  static Status can_implement(Arguments const &args) {
+    return UnderlyingOperator::can_implement(to_underlying_arguments(args));
+  }
+
+  /// Gets the workspace size
+  static size_t get_workspace_size(Arguments const &args) {
+
+    return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args));
+  }
+
+  /// Initializes GEMM state from arguments.
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+    // Hand every argument, including `stream`, to the underlying operator.
+    return underlying_operator_.initialize(to_underlying_arguments(args), workspace, stream);
+  }
+
+  /// Lightweight update given a subset of arguments
+  Status update(Arguments const &args, void *workspace = nullptr) {
+
+    return underlying_operator_.update(to_underlying_arguments(args), workspace);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status run(cudaStream_t stream = nullptr) {
+
+    return underlying_operator_.run(stream);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Initializes the kernel state from `args`, then runs it on `stream`.
+  Status operator()(
+    Arguments const &args,
+    void *workspace = nullptr,
+    cudaStream_t stream = nullptr) {
+
+    Status status = initialize(args, workspace, stream);
+
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+
+    return status;
+  }
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_complex.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_complex.h
new file mode 100755
index 000000000..e36c69cef
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_complex.h
@@ -0,0 +1,718 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/device_kernel.h"
+
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/gemm/kernel/gemm.h"
+
+#include "cutlass/gemm/kernel/default_gemm_complex.h"
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/*! Gemm device-level operator. This is an interface to efficient CUTLASS GEMM
+  kernels that may be invoked from host code.
+
+  The contributions of this class are:
+
+    1. At compile time, it maps data types and high-level structural parameters
+  onto specific CUTLASS components.
+
+    2. At runtime, it maps logical arguments to GEMM problems to kernel
+  parameters.
+
+    3. At runtime, it launches kernels on the device.
+
+  The intent is to provide a convenient mechanism for interacting with most
+  plausible GEMM configurations for each supported architecture. Consequently,
+  not all parameters are exposed to the top-level interface. Rather, sensible
+  defaults at each level of the CUTLASS hierarchy are selected to trade off
+  simplicity of the interface with flexibility. We expect most configurations to
+  be specified at this level. Applications with more exotic requirements may
+  construct their kernels of interest using CUTLASS components at the
+  threadblock, warp, and thread levels of abstraction.
+
+  CUTLASS exposes computations using the functor design pattern in which objects
+  compose some internal state with an overloaded function call operator. This
+  enables decoupling of initialization from execution, possibly reducing
+  overhead during steady state phases of application execution.
+
+  CUTLASS device-level operators expose an Arguments structure encompassing each
+  logical input to the computation. This is distinct from the kernel-level
+  Params structure pattern which contains application-specific precomputed state
+  needed by the device code.
+
+  Example of a CUTLASS GEMM operator implementing the functionality of cuBLAS's
+  SGEMM NN is as follows:
+
+    //
+    // Instantiate the CUTLASS GEMM operator.
+    //
+
+    cutlass::gemm::device::Gemm<
+      float,
+      cutlass::layout::ColumnMajor,
+      float,
+      cutlass::layout::ColumnMajor,
+      float,
+      cutlass::layout::ColumnMajor
+    > gemm_op;
+
+    //
+    // Launch the GEMM operation on the device
+    //
+
+    cutlass::Status status = gemm_op({
+      {m, n, k},                          // GemmCoord problem_size,
+      {A, lda},                           // TensorRef<float, layout::ColumnMajor> ref_A,
+      {B, ldb},                           // TensorRef<float, layout::ColumnMajor> ref_B,
+      {C, ldc},                           // TensorRef<float, layout::ColumnMajor> ref_C,
+      {D, ldd},                           // TensorRef<float, layout::ColumnMajor> ref_D,
+      {alpha, beta}                       // EpilogueOutputOp::Params epilogue_op_params
+    });
+
+
+  A simplified view of the template is listed below.
+
+    template <
+      /// Element type for A matrix operand
+      typename ElementA,
+
+      /// Layout type for A matrix operand
+      typename LayoutA,
+
+      /// Element type for B matrix operand
+      typename ElementB,
+
+      /// Layout type for B matrix operand
+      typename LayoutB,
+
+      /// Element type for C and D matrix operands
+      typename ElementC,
+
+      /// Layout type for C and D matrix operands
+      typename LayoutC,
+
+      /// Element type for internal accumulation
+      typename ElementAccumulator,
+
+      /// Operator class tag
+      typename OperatorClass,
+
+      /// Tag indicating architecture to tune for.  This is the minimum SM that
+      /// supports the intended feature. The device kernel can be built
+      /// targeting any SM larger than this number.
+      typename ArchTag,
+
+      /// Threadblock-level tile size (concept: GemmShape)
+      typename ThreadblockShape,
+
+      /// Warp-level tile size (concept: GemmShape)
+      typename WarpShape,
+
+      /// Instruction-level tile size (concept: GemmShape)
+      typename InstructionShape,
+
+      /// Epilogue output operator
+      typename EpilogueOutputOp,
+
+      /// Threadblock-level swizzling operator
+      typename ThreadblockSwizzle,
+
+      /// Number of stages used in the pipelined mainloop
+      int Stages
+    >
+    class Gemm;
+*/
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_ = ElementC_,
+    /// Operator class tag
+    typename OperatorClass_ = arch::OpClassSimt,
+    /// Tag indicating architecture to tune for.
+    typename ArchTag_ = arch::Sm70,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_ =
+        threadblock::GemmIdentityThreadblockSwizzle<>,
+    /// Number of stages used in the pipelined mainloop
+    int Stages =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kStages,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA = ComplexTransform::kNone,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB = ComplexTransform::kNone,
+    /// Multiply-add operator
+    // (selects complex or gaussian complex)
+    typename Operator_ = arch::OpMultiplyAddComplex,
+    /// If true, kernel supports split-K with serial reduction
+    bool SplitKSerial = false>
+class GemmComplex {
+ public:
+
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  using TensorRefA = TensorRef<ElementA const, LayoutA>;
+  using ElementB = ElementB_;
+  using LayoutB = LayoutB_;
+  using TensorRefB = TensorRef<ElementB const, LayoutB>;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using TensorRefC = TensorRef<ElementC const, LayoutC>;
+  using TensorRefD = TensorRef<ElementC, LayoutC>;
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  static int const kStages = Stages;
+  static ComplexTransform const kTransformA = TransformA;
+  static ComplexTransform const kTransformB = TransformB;
+  using Operator = Operator_;
+  static bool const kSplitKSerial = SplitKSerial;
+  static int const kAlignmentA = 1;
+  static int const kAlignmentB = 1;
+  static int const kAlignmentC = EpilogueOutputOp::kCount;
+
+  /// Define the kernel
+  using GemmKernel = typename kernel::DefaultGemmComplex<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    kStages,
+    kTransformA,
+    kTransformB,
+    Operator,
+    kSplitKSerial
+  >::GemmKernel;
+
+  /// Argument structure
+  struct Arguments {
+
+    //
+    // Data members
+    //
+
+    GemmCoord problem_size;
+    TensorRef<ElementA const, LayoutA> ref_A;
+    TensorRef<ElementB const, LayoutB> ref_B;
+    TensorRef<ElementC const, LayoutC> ref_C;
+    TensorRef<ElementC, LayoutC> ref_D;
+    typename EpilogueOutputOp::Params epilogue;
+    int split_k_slices;
+
+    //
+    // Methods
+    //
+
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Arguments(): problem_size(0, 0, 0), split_k_slices(1) {
+
+    }
+
+    /// Constructs an Arguments structure 
+    CUTLASS_HOST_DEVICE
+    Arguments(
+      GemmCoord problem_size_,
+      TensorRef<ElementA const, LayoutA> ref_A_,
+      TensorRef<ElementB const, LayoutB> ref_B_,
+      TensorRef<ElementC const, LayoutC> ref_C_,
+      TensorRef<ElementC, LayoutC> ref_D_,
+      typename EpilogueOutputOp::Params epilogue_ = 
+        typename EpilogueOutputOp::Params(),
+      int split_k_slices = 1
+    ):
+      problem_size(problem_size_),
+      ref_A(ref_A_),
+      ref_B(ref_B_),
+      ref_C(ref_C_),
+      ref_D(ref_D_),
+      epilogue(epilogue_),
+      split_k_slices(split_k_slices) {
+
+    }
+  };
+
+private:
+
+  /// Kernel parameters object
+  typename GemmKernel::Params params_;
+
+public:
+
+  /// Constructs the GEMM.
+  GemmComplex() { }
+
+  /// Determines whether the GEMM can execute the given problem: fails only when split-K is requested without kSplitKSerial.
+  static Status can_implement(Arguments const &args) {
+
+    if (!kSplitKSerial && args.split_k_slices > 1) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    return Status::kSuccess;
+  }
+
+  /// Gets the workspace size in bytes: one int per (m, n) tile when serial split-K is active, otherwise 0.
+  static size_t get_workspace_size(Arguments const &args) {
+
+    if (kSplitKSerial && args.split_k_slices > 1) {
+
+      // Tile the problem using the threadblock shape and the requested number of K slices
+      ThreadblockSwizzle threadblock_swizzle;
+
+      cutlass::gemm::GemmCoord tiled_shape = threadblock_swizzle.get_tiled_shape(
+        args.problem_size, 
+        {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+        args.split_k_slices);
+
+      return sizeof(int) * size_t(tiled_shape.m()) * size_t(tiled_shape.n());
+    }
+
+    return 0;
+  }
+
+  /// Initializes GEMM state from arguments; with serial split-K (> 1 slice) the workspace is zero-filled on `stream`.
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+
+    // Determine grid shape
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord grid_shape = threadblock_swizzle.get_tiled_shape(
+      args.problem_size, 
+      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+      args.split_k_slices);
+
+    if (kSplitKSerial) {
+      if (args.split_k_slices > 1) {
+        if (!workspace) {
+          return Status::kErrorWorkspaceNull;
+        }
+
+        size_t bytes = get_workspace_size(args);
+        // Asynchronous zero-fill: only ordered with respect to work enqueued on `stream`.
+        cudaError_t result = cudaMemsetAsync(workspace, 0, bytes, stream);
+
+        if (result != cudaSuccess) {
+          return Status::kErrorInternal;
+        }
+      }
+    }
+    else {
+
+      if (args.split_k_slices > 1) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+
+    // Initialize the Params structure; the workspace pointer is passed as the last (int *) member
+    params_ = typename GemmKernel::Params{
+      args.problem_size,
+      grid_shape,
+      args.ref_A.non_const_ref(),
+      args.ref_B.non_const_ref(),
+      args.ref_C.non_const_ref(),
+      args.ref_D,
+      args.epilogue,
+      static_cast<int *>(workspace)
+    };
+
+    return Status::kSuccess;
+  }
+
+  /// Lightweight update: only the A/B/C/D data pointers and the workspace pointer are taken from the arguments.
+  Status update(Arguments const &args, void *workspace = nullptr) {
+    // Serial split-K with more than one slice cannot proceed without a workspace.
+    if (kSplitKSerial && args.split_k_slices > 1) {  
+      if (!workspace) {
+        return Status::kErrorWorkspaceNull;
+      }
+    }
+
+    params_.ref_A.reset(args.ref_A.non_const_ref().data());
+    params_.ref_B.reset(args.ref_B.non_const_ref().data());
+    params_.ref_C.reset(args.ref_C.non_const_ref().data());
+    params_.ref_D.reset(args.ref_D.data());
+    params_.semaphore = static_cast<int *>(workspace);
+
+    return Status::kSuccess;
+  }
+
+  /// Launches the kernel on `stream` with the stored params_; returns kErrorInternal if any CUDA call reports an error.
+  Status run(cudaStream_t stream = nullptr) {
+
+    ThreadblockSwizzle threadblock_swizzle;
+
+    dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
+    dim3 block(GemmKernel::kThreadCount, 1, 1);
+
+    cudaError_t result;
+    // SharedStorage of 48 KB (48 << 10 bytes) or more needs an explicit per-kernel attribute before launch.
+    int smem_size = int(sizeof(typename GemmKernel::SharedStorage));
+    if (smem_size >= (48 << 10)) {
+      result = cudaFuncSetAttribute(Kernel<GemmKernel>,
+                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                    smem_size);
+
+      if (result != cudaSuccess) {
+        return Status::kErrorInternal;
+      }
+    }
+
+    cutlass::arch::synclog_setup();
+    cutlass::Kernel<GemmKernel><<<grid, block, smem_size, stream>>>(params_);
+
+    result = cudaGetLastError();
+
+    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
+  }
+
+  /// Runs the kernel using initialized state; equivalent to run(stream).
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Initializes from `args` (any workspace clearing is enqueued on `stream`), then launches the kernel on `stream`.
+  Status operator()(
+    Arguments const &args,
+    void *workspace = nullptr,
+    cudaStream_t stream = nullptr) {
+
+    // Forward the stream so the split-K workspace memset is ordered before the kernel launch.
+    Status status = initialize(args, workspace, stream);
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+
+    return status;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for column-major output exchanges problem size and operand.
+template <
+  /// Element type for A matrix operand
+  typename ElementA_,
+  /// Layout type for A matrix operand
+  typename LayoutA_,
+  /// Element type for B matrix operand
+  typename ElementB_,
+  /// Layout type for B matrix operand
+  typename LayoutB_,
+  /// Element type for C and D matrix operands
+  typename ElementC_,
+  /// Element type for internal accumulation
+  typename ElementAccumulator_,
+  /// Operator class tag
+  typename OperatorClass_,
+  /// Tag indicating architecture to tune for
+  typename ArchTag_,
+  /// Threadblock-level tile size (concept: GemmShape)
+  typename ThreadblockShape_,
+  /// Warp-level tile size (concept: GemmShape)
+  typename WarpShape_,
+  /// Instruction-level tile size (concept: GemmShape)
+  typename InstructionShape_,
+  /// Epilogue output operator
+  typename EpilogueOutputOp_,
+  /// Threadblock-level swizzling operator
+  typename ThreadblockSwizzle_,
+  /// Number of stages used in the pipelined mainloop
+  int Stages,
+  /// Complex elementwise transformation on A operand
+  ComplexTransform TransformA,
+  /// Complex elementwise transformation on B operand
+  ComplexTransform TransformB,
+  /// Multiply-add operator 
+  // (selects complex or gaussian complex)
+  typename Operator_,
+  /// If true, kernel supports split-K as a serial reduction
+  bool SplitKSerial
+>
+class GemmComplex<
+  ElementA_,
+  LayoutA_,
+  ElementB_,
+  LayoutB_,
+  ElementC_,
+  layout::ColumnMajor,    // partially specialized on LayoutC
+  ElementAccumulator_,
+  OperatorClass_,
+  ArchTag_,
+  ThreadblockShape_,
+  WarpShape_,
+  InstructionShape_,
+  EpilogueOutputOp_,
+  ThreadblockSwizzle_,
+  Stages,
+  TransformA,
+  TransformB,
+  Operator_,
+  SplitKSerial
+> {
+public:
+
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  using TensorRefA = TensorRef<ElementA const, LayoutA>;
+  using ElementB = ElementB_;
+  using LayoutB = LayoutB_;
+  using TensorRefB = TensorRef<ElementB const, LayoutB>;
+  using ElementC = ElementC_;
+  using LayoutC = layout::ColumnMajor;
+  using TensorRefC = TensorRef<ElementC const, LayoutC>;
+  using TensorRefD = TensorRef<ElementC, LayoutC>;
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  static int const kStages = Stages;
+  using Operator = Operator_;
+  static bool const kSplitKSerial = SplitKSerial;
+
+  using UnderlyingOperator = GemmComplex< 
+    ElementB,
+    typename layout::LayoutTranspose<LayoutB>::type,
+    ElementA,
+    typename layout::LayoutTranspose<LayoutA>::type,
+    ElementC,
+    layout::RowMajor,    
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    TransformB,
+    TransformA,
+    Operator,
+    SplitKSerial
+  >;
+  
+  static int const kAlignmentA = UnderlyingOperator::kAlignmentB;
+  static int const kAlignmentB = UnderlyingOperator::kAlignmentA;
+  static int const kAlignmentC = UnderlyingOperator::kAlignmentC;
+  static ComplexTransform const kTransformA = UnderlyingOperator::kTransformB;
+  static ComplexTransform const kTransformB = UnderlyingOperator::kTransformA;
+
+  using UnderlyingArguments = typename UnderlyingOperator::Arguments;
+  using GemmKernel = typename UnderlyingOperator::GemmKernel;
+
+  /// Argument structure
+  struct Arguments {
+
+    //
+    // Data members
+    //
+
+    GemmCoord problem_size;
+    TensorRef<ElementA const, LayoutA> ref_A;
+    TensorRef<ElementB const, LayoutB> ref_B;
+    TensorRef<ElementC const, LayoutC> ref_C;
+    TensorRef<ElementC, LayoutC> ref_D;
+    typename EpilogueOutputOp::Params epilogue;
+    int split_k_slices;
+
+    //
+    // Methods
+    //
+
+    /// Default ctor: empty problem with a single K slice, so split_k_slices is never read uninitialized
+    CUTLASS_HOST_DEVICE
+    Arguments(): problem_size(0, 0, 0), split_k_slices(1) { }
+
+    /// Constructs an Arguments structure 
+    CUTLASS_HOST_DEVICE
+    Arguments(
+      GemmCoord problem_size_,
+      TensorRef<ElementA const, LayoutA> ref_A_,
+      TensorRef<ElementB const, LayoutB> ref_B_,
+      TensorRef<ElementC const, LayoutC> ref_C_,
+      TensorRef<ElementC, LayoutC> ref_D_,
+      typename EpilogueOutputOp::Params epilogue_ = 
+        typename EpilogueOutputOp::Params(),
+      int split_k_slices = 1
+    ):
+      problem_size(problem_size_),
+      ref_A(ref_A_),
+      ref_B(ref_B_),
+      ref_C(ref_C_),
+      ref_D(ref_D_),
+      epilogue(epilogue_),
+      split_k_slices(split_k_slices) { }
+  };
+
+private:
+
+  UnderlyingOperator underlying_operator_;
+
+public:
+
+  /// Constructs the GEMM.
+  GemmComplex() { }
+
+  /// Helper to construct a transposed equivalent for the underlying GEMM operator
+  static UnderlyingArguments to_underlying_arguments(Arguments const &args) {
+    return UnderlyingArguments(
+      {args.problem_size.n(), args.problem_size.m(), args.problem_size.k()},
+      {args.ref_B.data(), args.ref_B.stride(0)},
+      {args.ref_A.data(), args.ref_A.stride(0)},
+      {args.ref_C.data(), args.ref_C.stride(0)},
+      {args.ref_D.data(), args.ref_D.stride(0)},
+      args.epilogue,
+      args.split_k_slices
+    );
+  }
+
+  /// Determines whether the GEMM can execute the given problem.
+  static Status can_implement(Arguments const &args) {
+
+    return UnderlyingOperator::can_implement(to_underlying_arguments(args));
+  }
+
+  /// Gets the workspace size
+  static size_t get_workspace_size(Arguments const &args) {
+    
+    return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args));
+  }
+
+  /// Initializes GEMM state from arguments; `stream` is forwarded so workspace clearing is enqueued on it.
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+
+    return underlying_operator_.initialize(to_underlying_arguments(args), workspace, stream);
+  }
+
+  /// Lightweight update given a subset of arguments
+  Status update(Arguments const &args, void *workspace = nullptr) {
+
+    return underlying_operator_.update(to_underlying_arguments(args), workspace);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status run(cudaStream_t stream = nullptr) {
+
+    return underlying_operator_.run(stream);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Initializes the GEMM from `args` and, if that succeeds, runs the kernel on `stream`.
+  Status operator()(
+    Arguments const &args, 
+    void *workspace = nullptr, 
+    cudaStream_t stream = nullptr) {
+    
+    Status status = initialize(args, workspace, stream);
+    
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+
+    return status;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_grouped.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_grouped.h
new file mode 100755
index 000000000..877375e94
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_grouped.h
@@ -0,0 +1,61 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*!
+  \file
+  \brief Device-level grouped GEMM.
+*/
+
+#pragma once
+
+#include "cutlass/gemm/device/base_grouped.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Device-level grouped GEMM: derives from BaseGrouped<GemmKernel_> and exposes the kernel type as GemmKernel.
+template <typename GemmKernel_>
+class GemmGrouped : public BaseGrouped<GemmKernel_> {
+public:
+  using GemmKernel = GemmKernel_;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_layernorm_mainloop_fusion.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_layernorm_mainloop_fusion.h
new file mode 100755
index 000000000..3de3cecbf
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_layernorm_mainloop_fusion.h
@@ -0,0 +1,385 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Device-level GEMM with layernorm elementwise operations fused in mainloop
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/device_kernel.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/gemm/kernel/gemm_universal.h"
+
+#include "cutlass/gemm/kernel/default_gemm_layernorm_mainloop_fusion.h"
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+#include "cutlass/gemm/device/gemm_universal_base.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/*! 
+  The universal GEMM accommodates serial reductions, parallel reductions, batched strided, and 
+  batched array variants.
+*/
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for Scale/Bias vectors
+    typename ElementScaleBias_,
+    /// Layout type for Scale/Bias vectors
+    typename LayoutScaleBias_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_ = ElementC_,
+    /// Operator class tag
+    typename OperatorClass_ = arch::OpClassSimt,
+    /// Tag indicating architecture to tune for.  This is the minimum SM that
+    /// supports the intended feature. The device kernel can be built
+    /// targeting any SM larger than this number.
+    typename ArchTag_ = arch::Sm70,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle<>,
+    /// Number of stages used in the pipelined mainloop
+    int Stages =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kStages,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentA,
+    /// Access granularity of B matrix in units of elements
+    int AlignmentB =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentB,
+    /// Operation performed by GEMM
+    typename Operator_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::Operator
+>
+class GemmLayernormMainloopFusion : 
+  public GemmUniversalBase<
+    typename kernel::DefaultGemmLayernormMainloopFusion<
+      ElementA_,
+      LayoutA_,
+      AlignmentA,
+      ElementB_,
+      LayoutB_,
+      AlignmentB,
+      ElementScaleBias_,
+      LayoutScaleBias_,
+      ElementC_,
+      LayoutC_,
+      ElementAccumulator_,
+      OperatorClass_,
+      ArchTag_,
+      ThreadblockShape_,
+      WarpShape_,
+      InstructionShape_,
+      EpilogueOutputOp_,
+      ThreadblockSwizzle_,
+      Stages,
+      Operator_,
+      SharedMemoryClearOption::kNone
+    >::GemmKernel
+  > {
+
+ public:
+
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentB = AlignmentB;
+  static int const kAlignmentC = EpilogueOutputOp::kCount;
+
+  using Base = GemmUniversalBase<
+    typename kernel::DefaultGemmLayernormMainloopFusion<
+      ElementA_,
+      LayoutA_,
+      AlignmentA,
+      ElementB_,
+      LayoutB_,
+      AlignmentB,
+      ElementScaleBias_,
+      LayoutScaleBias_,
+      ElementC_,
+      LayoutC_,
+      ElementAccumulator_,
+      OperatorClass_,
+      ArchTag_,
+      ThreadblockShape_,
+      WarpShape_,
+      InstructionShape_,
+      EpilogueOutputOp_,
+      ThreadblockSwizzle_,
+      Stages,
+      Operator_,
+      SharedMemoryClearOption::kNone
+    >::GemmKernel
+  >;
+
+  using Arguments = typename Base::Arguments;
+  using GemmKernel = typename Base::GemmKernel;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for column-major output exchanges problem size and operand.
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for Scale/Bias vectors
+    typename ElementScaleBias_,
+    /// Layout type for Scale/Bias vectors
+    typename LayoutScaleBias_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_,
+    /// Operator class tag
+    typename OperatorClass_,
+    /// Tag indicating architecture to tune for.  This is the minimum SM that
+    /// supports the intended feature. The device kernel can be built
+    /// targeting any SM larger than this number.
+    typename ArchTag_,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_,
+    /// Epilogue output operator
+    typename EpilogueOutputOp_,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA,
+    /// Access granularity of B matrix in units of elements
+    int AlignmentB,
+    /// Operation performed by GEMM
+    typename Operator_
+>
+class GemmLayernormMainloopFusion<ElementA_, LayoutA_, ElementB_, LayoutB_, 
+           ElementScaleBias_, LayoutScaleBias_,
+           ElementC_,
+           layout::ColumnMajor,  // partially specialized on LayoutC
+           ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_,
+           WarpShape_, InstructionShape_, EpilogueOutputOp_,
+           ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB,
+           Operator_> {
+ public:
+
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  using TensorRefA = TensorRef<ElementA const, LayoutA>;
+  using ElementB = ElementB_;
+  using LayoutB = LayoutB_;
+  using TensorRefB = TensorRef<ElementB const, LayoutB>;
+  using ElementScaleBias = ElementScaleBias_;
+  using LayoutScaleBias = LayoutScaleBias_;
+  using ElementC = ElementC_;
+  using LayoutC = layout::ColumnMajor;
+  using TensorRefC = TensorRef<ElementC const, LayoutC>;
+  using TensorRefD = TensorRef<ElementC, LayoutC>;
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentB = AlignmentB;
+
+  using UnderlyingOperator = typename GemmLayernormMainloopFusion< 
+    ElementB,
+    typename layout::LayoutTranspose<LayoutB>::type,
+    ElementA,
+    typename layout::LayoutTranspose<LayoutA>::type,
+    ElementScaleBias,
+    LayoutScaleBias, 
+    ElementC,
+    layout::RowMajor,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    kAlignmentB,
+    kAlignmentA,
+    Operator
+  >::Base;
+
+  using GemmKernel = typename UnderlyingOperator::GemmKernel;
+  static int const kAlignmentC = EpilogueOutputOp::kCount;
+
+  /// Argument structure
+  using Arguments = typename UnderlyingOperator::Arguments;
+
+private:
+
+  UnderlyingOperator underlying_operator_;
+
+public:
+
+  /// Constructs the GEMM.
+  GemmLayernormMainloopFusion() { }
+
+  /// Helper to construct a transposed equivalent for the underlying GEMM operator
+  static Arguments to_underlying_arguments(Arguments const &args) {
+    return args.transposed_problem();
+  }
+
+  /// Determines whether the GEMM can execute the given problem.
+  static Status can_implement(Arguments const &args) {
+
+    return UnderlyingOperator::can_implement(to_underlying_arguments(args));
+  }
+
+  /// Gets the workspace size
+  static size_t get_workspace_size(Arguments const &args) {
+    
+    return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args));
+  }
+
+  /// Computes the grid shape
+  static dim3 get_grid_shape(Arguments const &args) { 
+    return UnderlyingOperator::get_grid_shape(to_underlying_arguments(args));
+  }
+
+  /// Computes the maximum number of active blocks per multiprocessor
+  static int maximum_active_blocks(int smem_capacity = -1) {
+    return UnderlyingOperator::maximum_active_blocks(smem_capacity);
+  }
+
+  /// Initializes GEMM state from arguments.
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+
+    return underlying_operator_.initialize(to_underlying_arguments(args), workspace, stream);
+  }
+
+  /// Lightweight update given a subset of arguments
+  Status update(Arguments const &args, void *workspace = nullptr) {
+
+    return underlying_operator_.update(to_underlying_arguments(args), workspace);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status run(cudaStream_t stream = nullptr) {
+
+    return underlying_operator_.run(stream);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Initializes the GEMM from `args` and, if that succeeds, runs the kernel on `stream`.
+  Status operator()(
+    Arguments const &args, 
+    void *workspace = nullptr, 
+    cudaStream_t stream = nullptr) {
+    
+    Status status = initialize(args, workspace, stream);
+    
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+
+    return status;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse.h
new file mode 100755
index 000000000..ac453c63b
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse.h
@@ -0,0 +1,515 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a pipelined sparse GEMM kernel. Does not compute batching; split-K is available only as a serial reduction.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/device_kernel.h"
+
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/gemm/kernel/sparse_gemm.h"
+
+#include "cutlass/gemm/kernel/default_gemm_sparse.h"
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/*! Gemm device-level operator. This is an interface to efficient CUTLASS GEMM kernels that may
+  be invoked from host code.
+
+  The contributions of this class are:
+    
+    1. At compile time, it maps data types and high-level structural parameters onto 
+       specific CUTLASS components.
+
+    2. At runtime, it maps logical arguments to GEMM problems to kernel parameters.
+
+    3. At runtime, it launches kernels on the device.
+
+  The intent is to provide a convenient mechanism for interacting with most plausible GEMM
+  configurations for each supported architecture. Consequently, not all parameters are exposed
+  to the top-level interface. Rather, sensible defaults at each level of the CUTLASS hierarchy
+  are selected to tradeoff simplicity of the interface with flexibility. We expect 
+  most configurations to be specified at this level. Applications with more exotic requirements 
+  may construct their kernels of interest using CUTLASS components at the threadblock, warp, 
+  and thread levels of abstraction.
+
+  CUTLASS exposes computations using the functor design pattern in which objects compose some
+  internal state with an overloaded function call operator. This enables decoupling of
+  initialization from execution, possibly reducing overhead during steady state phases of
+  application execution.
+
+  CUTLASS device-level operators expose an Arguments structure encompassing each logical
+  input to the computation. This is distinct from the kernel-level Params structure pattern
+  which contains application-specific precomputed state needed by the device code.
+
+  Example of a CUTLASS GEMM operator implementing the functionality of cuBLAS's SGEMM NN
+  is as follows:
+
+    //
+    // Instantiate the CUTLASS GEMM operator.
+    //
+
+    cutlass::gemm::device::Gemm<
+      float,
+      cutlass::layout::ColumnMajor,
+      float,
+      cutlass::layout::ColumnMajor,
+      float,
+      cutlass::layout::ColumnMajor
+    > gemm_op;
+
+    //
+    // Launch the GEMM operation on the device
+    //
+
+    cutlass::Status status = gemm_op({
+      {m, n, k},                          // GemmCoord problem_size,
+      {A, lda},                           // TensorRef<float, layout::ColumnMajor> ref_A,
+      {B, ldb},                           // TensorRef<float, layout::ColumnMajor> ref_B,
+      {C, ldc},                           // TensorRef<float, layout::ColumnMajor> ref_C,
+      {D, ldd},                           // TensorRef<float, layout::ColumnMajor> ref_D,
+      {alpha, beta}                       // EpilogueOutputOp::Params epilogue_op_params
+    });
+
+
+  A simplified view of the template is listed below.
+
+    template <
+      /// Element type for A matrix operand
+      typename ElementA,
+      
+      /// Layout type for A matrix operand
+      typename LayoutA,
+      
+      /// Element type for B matrix operand
+      typename ElementB,
+      
+      /// Layout type for B matrix operand
+      typename LayoutB,
+      
+      /// Element type for C and D matrix operands
+      typename ElementC,
+      
+      /// Layout type for C and D matrix operands
+      typename LayoutC,
+      
+      /// Element type for internal accumulation
+      typename ElementAccumulator,
+
+      /// Operator class tag
+      typename OperatorClass,
+      
+      /// Tag indicating architecture to tune for.  This is the minimum SM that
+      /// supports the intended feature. The device kernel can be built
+      /// targeting any SM larger than this number.
+      typename ArchTag,
+      
+      /// Threadblock-level tile size (concept: GemmShape)
+      typename ThreadblockShape,
+      
+      /// Warp-level tile size (concept: GemmShape)
+      typename WarpShape,
+      
+      /// Instruction-level tile size (concept: GemmShape)
+      typename InstructionShape,
+      
+      /// Epilogue output operator
+      typename EpilogueOutputOp,
+      
+      /// Threadblock-level swizzling operator
+      typename ThreadblockSwizzle,
+      
+      /// Number of stages used in the pipelined mainloop
+      int Stages
+    >
+    class Gemm;
+*/
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_ = ElementC_,
+    /// Operator class tag
+    typename OperatorClass_ = arch::OpClassSimt,
+    /// Tag indicating architecture to tune for
+    typename ArchTag_ = arch::Sm70,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_ =
+        typename threadblock::GemmIdentityThreadblockSwizzle<>,
+    /// Number of stages used in the pipelined mainloop
+    int Stages =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kStages,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentA,
+    /// Access granularity of B matrix in units of elements
+    int AlignmentB =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentB,
+    /// If true, kernel supports split-K with serial reduction
+    bool SplitKSerial = false,
+    /// Operation performed by GEMM
+    typename Operator_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::Operator>
+class SparseGemm {
+ public:
+
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  using TensorRefA = TensorRef<ElementA const, LayoutA>;
+  using ElementB = ElementB_;
+  using LayoutB = LayoutB_;
+  using TensorRefB = TensorRef<ElementB const, LayoutB>;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using TensorRefC = TensorRef<ElementC const, LayoutC>;
+  using TensorRefD = TensorRef<ElementC, LayoutC>;
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  using MathOperator = Operator;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentB = AlignmentB;
+  static int const kAlignmentC = EpilogueOutputOp::kCount;
+  static bool const kSplitKSerial = SplitKSerial;
+  static ComplexTransform const kTransformA = ComplexTransform::kNone;
+  static ComplexTransform const kTransformB = ComplexTransform::kNone;
+
+  /// Define the kernel
+  using GemmKernel = typename kernel::DefaultSparseGemm<
+    ElementA,
+    LayoutA,
+    kAlignmentA,
+    ElementB,
+    LayoutB,
+    kAlignmentB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    kStages,
+    kSplitKSerial,
+    Operator
+  >::GemmKernel;
+
+  using ElementE = typename GemmKernel::ElementE;
+
+  using LayoutE = typename GemmKernel::LayoutE;
+
+  static int const kAlignmentE = 128 / sizeof_bits<ElementE>::value;
+
+  static int const kSparse = GemmKernel::kSparse;
+  static int const kMetaSizeInBits = GemmKernel::kMetaSizeInBits;
+  static int const kElementsPerElementE = GemmKernel::kElementsPerElementE;
+
+  /// Argument structure: problem size, A/B/C/D/E tensor refs, epilogue params and split-K slice count
+  struct Arguments {
+
+    //
+    // Data members
+    //
+
+    GemmCoord problem_size;
+    TensorRef<ElementA const, LayoutA> ref_A;
+    TensorRef<ElementB const, LayoutB> ref_B;
+    TensorRef<ElementC const, LayoutC> ref_C;
+    TensorRef<ElementC, LayoutC> ref_D;
+    TensorRef<ElementE const, LayoutE> ref_E;
+    typename EpilogueOutputOp::Params epilogue;
+    int split_k_slices;
+
+    //
+    // Methods
+    //
+
+    /// Default ctor: zero-sized problem and a single split-K slice
+    CUTLASS_HOST_DEVICE
+    Arguments(): problem_size(0, 0, 0), split_k_slices(1) {
+
+    }
+
+    /// Constructs an Arguments structure; epilogue_ and split_k_slices are optional (default params, 1 slice)
+    CUTLASS_HOST_DEVICE
+    Arguments(
+      GemmCoord problem_size_,
+      TensorRef<ElementA const, LayoutA> ref_A_,
+      TensorRef<ElementB const, LayoutB> ref_B_,
+      TensorRef<ElementC const, LayoutC> ref_C_,
+      TensorRef<ElementC, LayoutC> ref_D_,
+      TensorRef<ElementE, LayoutE> ref_E_,
+      typename EpilogueOutputOp::Params epilogue_ = 
+        typename EpilogueOutputOp::Params(),
+      int split_k_slices = 1
+    ):
+      problem_size(problem_size_),
+      ref_A(ref_A_),
+      ref_B(ref_B_),
+      ref_C(ref_C_),
+      ref_D(ref_D_),
+      ref_E(ref_E_),
+      epilogue(epilogue_),
+      split_k_slices(split_k_slices) {
+
+    }
+  };
+
+private:
+
+  /// Kernel parameters object
+  typename GemmKernel::Params params_;
+
+public:
+
+  /// Constructs the GEMM.
+  SparseGemm() { }
+
+  /// Determines whether the GEMM can execute the given problem.
+  static Status can_implement(Arguments const &args) {
+    // More than one split-K slice is only valid when built with kSplitKSerial.
+    if (!kSplitKSerial && args.split_k_slices > 1) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    Status status = GemmKernel::can_implement(
+      args.problem_size,
+      args.ref_A.non_const_ref(),
+      args.ref_B.non_const_ref(),
+      args.ref_C.non_const_ref(),
+      args.ref_D,
+      args.ref_E.non_const_ref()
+    );
+
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    return Status::kSuccess;
+  }
+
+  /// Gets the workspace size in bytes (non-zero only for serial split-K with more than one slice)
+  static size_t get_workspace_size(Arguments const &args) {
+    
+    size_t bytes = 0;
+
+    // Determine grid shape
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord tiled_shape = threadblock_swizzle.get_tiled_shape(
+      args.problem_size, 
+      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+      args.split_k_slices);
+    
+    if (kSplitKSerial && args.split_k_slices > 1) {
+      // One int per (m, n) tile of the tiled shape.
+      bytes += sizeof(int) * size_t(tiled_shape.m()) * size_t(tiled_shape.n());
+    }
+
+    return bytes;
+  }
+
+  /// Initializes GEMM state (params_) from arguments; clears the split-K workspace when one is needed.
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+
+    // Determine grid shape
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord grid_shape = threadblock_swizzle.get_tiled_shape(
+      args.problem_size, 
+      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+      args.split_k_slices);
+
+    if (kSplitKSerial) {
+      if (args.split_k_slices > 1) {
+        if (!workspace) {
+          return Status::kErrorWorkspaceNull;
+        }
+
+        size_t bytes = get_workspace_size(args);
+        // Zero the workspace asynchronously on `stream`.
+        cudaError_t result = cudaMemsetAsync(workspace, 0, bytes, stream);
+
+        if (result != cudaSuccess) {
+          return Status::kErrorInternal;
+        }
+      }
+    }
+    else {
+      // Without serial split-K support, more than one slice is rejected.
+      if (args.split_k_slices > 1) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+
+    // Initialize the Params structure
+    params_ = typename GemmKernel::Params{
+      args.problem_size,
+      grid_shape,
+      args.ref_A.non_const_ref(),
+      args.ref_B.non_const_ref(),
+      args.ref_C.non_const_ref(),
+      args.ref_D,
+      args.ref_E.non_const_ref(),
+      args.epilogue,
+      static_cast<int *>(workspace)
+    };
+    // Raise the dynamic shared-memory limit when SharedStorage needs 48 KiB or more.
+    int smem_size = int(sizeof(typename GemmKernel::SharedStorage));
+    if (smem_size >= (48 << 10)) {
+      cudaError_t result = cudaFuncSetAttribute(Kernel<GemmKernel>,
+                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                    smem_size);
+
+      if (result != cudaSuccess) {
+        return Status::kErrorInternal;
+      }
+    }
+
+    return Status::kSuccess;
+  }
+
+  /// Lightweight update: replaces the A/B/C/D/E data pointers, epilogue params and semaphore workspace only
+  Status update(Arguments const &args, void *workspace = nullptr) {
+    
+    if (kSplitKSerial && args.split_k_slices > 1) {  
+      if (!workspace) {
+        return Status::kErrorWorkspaceNull;
+      }
+    }
+    // Problem size and tiled grid shape stored in params_ are not modified here.
+    params_.ref_A.reset(args.ref_A.non_const_ref().data());
+    params_.ref_B.reset(args.ref_B.non_const_ref().data());
+    params_.ref_C.reset(args.ref_C.non_const_ref().data());
+    params_.ref_D.reset(args.ref_D.data());
+    params_.ref_E.reset(args.ref_E.non_const_ref().data());
+    params_.output_op = args.epilogue;
+    params_.semaphore = static_cast<int *>(workspace);
+
+    return Status::kSuccess;
+  }
+
+  /// Launches the kernel on `stream` using the state stored in params_.
+  Status run(cudaStream_t stream = nullptr) {
+    // Launch configuration: grid from the tiled shape, a 1-D block of kThreadCount threads.
+    ThreadblockSwizzle threadblock_swizzle;
+
+    dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
+    dim3 block(GemmKernel::kThreadCount, 1, 1);
+
+    int smem_size = int(sizeof(typename GemmKernel::SharedStorage));
+
+    cutlass::arch::synclog_setup();
+    cutlass::Kernel<GemmKernel><<<grid, block, smem_size, stream>>>(params_);
+    // Any error reported by cudaGetLastError() is returned as kErrorInternal.
+    cudaError_t result = cudaGetLastError();
+
+    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
+  }
+
+  /// Runs the kernel using initialized state; equivalent to run(stream).
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Initializes GEMM state from arguments, then runs the kernel if initialization succeeded.
+  Status operator()(
+    Arguments const &args, 
+    void *workspace = nullptr, 
+    cudaStream_t stream = nullptr) {
+    
+    Status status = initialize(args, workspace, stream);
+    
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+
+    return status;
+  }
+};
+
+} // namespace device
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse_universal.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse_universal.h
new file mode 100755
index 000000000..b7d8cecfa
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse_universal.h
@@ -0,0 +1,211 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Device-level sparse universal GEMM operator built on kernel::DefaultGemmSparseUniversal.
+*/
+
+#pragma once
+
+#include "cutlass/arch/mma.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/device_kernel.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/gemm/kernel/gemm_sparse_universal.h"
+
+#include "cutlass/gemm/kernel/default_gemm_sparse_universal.h"
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+#include "cutlass/gemm/device/gemm_universal_base.h"
+
+#include "cutlass/layout/permute.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/*! 
+  GemmSparseUniversal is a stateful, reusable Sparse GEMM handle.  Once initialized for a given GEMM computation
+  (problem geometry and data references), it can be reused across different GEMM problems having the
+  same geometry.  (Once initialized, details regarding problem geometry and references to workspace memory
+  cannot be updated.)
+
+  The universal GEMM accommodates serial reductions, parallel reductions, batched strided, and 
+  batched array variants.
+*/
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_ = ElementC_,
+    /// Operator class tag
+    typename OperatorClass_ = arch::OpClassTensorOp,
+    /// Tag indicating architecture to tune for.  This is the minimum SM that
+    /// supports the intended feature. The device kernel can be built
+    /// targeting any SM larger than this number.
+    typename ArchTag_ = arch::Sm80,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle<>,
+    /// Number of stages used in the pipelined mainloop
+    int Stages =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kStages,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentA,
+    /// Access granularity of B matrix in units of elements
+    int AlignmentB =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentB,
+    /// Operation performed by GEMM
+    typename Operator_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::Operator>
+class GemmSparseUniversal : 
+  public GemmUniversalBase<
+    typename kernel::DefaultGemmSparseUniversal<
+      ElementA_,
+      LayoutA_,
+      AlignmentA,
+      ElementB_,
+      LayoutB_,
+      AlignmentB,
+      ElementC_,
+      LayoutC_,
+      ElementAccumulator_,
+      OperatorClass_,
+      ArchTag_,
+      ThreadblockShape_,
+      WarpShape_,
+      InstructionShape_,
+      EpilogueOutputOp_,
+      ThreadblockSwizzle_,
+      Stages,
+      Operator_
+    >::GemmKernel
+  > {
+
+ public:
+
+  static_assert((platform::is_same<LayoutC_, layout::RowMajor>::value),
+             "Epilogue of Ampere sparse GEMM must be row major for now.");
+
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentB = AlignmentB;
+  static int const kAlignmentC = EpilogueOutputOp::kCount;
+  /// Base class type; repeats the instantiation used in the inheritance list above.
+  using Base = GemmUniversalBase<
+    typename kernel::DefaultGemmSparseUniversal<
+      ElementA_,
+      LayoutA_,
+      AlignmentA,
+      ElementB_,
+      LayoutB_,
+      AlignmentB,
+      ElementC_,
+      LayoutC_,
+      ElementAccumulator_,
+      OperatorClass_,
+      ArchTag_,
+      ThreadblockShape_,
+      WarpShape_,
+      InstructionShape_,
+      EpilogueOutputOp_,
+      ThreadblockSwizzle_,
+      Stages,
+      Operator_
+    >::GemmKernel
+  >;
+
+  using Arguments = typename Base::Arguments;
+  using GemmKernel = typename Base::GemmKernel;
+  /// Element and layout types of operand E, taken from the kernel.
+  using ElementE = typename GemmKernel::ElementE;
+
+  using LayoutE = typename GemmKernel::LayoutE;
+  /// Alignment for E in elements: 128 bits divided by the bit width of ElementE.
+  static int const kAlignmentE = 128 / sizeof_bits<ElementE>::value;
+
+  static int const kSparse = GemmKernel::kSparse;
+  static int const kMetaSizeInBits = GemmKernel::kMetaSizeInBits;
+  static int const kElementsPerElementE = GemmKernel::kElementsPerElementE;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse_universal_with_absmax.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse_universal_with_absmax.h
new file mode 100755
index 000000000..a313ddc90
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse_universal_with_absmax.h
@@ -0,0 +1,202 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Device-level sparse universal GEMM operator built on kernel::DefaultGemmSparseUniversalWithAbsmax.
+*/
+
+#pragma once
+
+#include "cutlass/arch/mma.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/device_kernel.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/gemm/kernel/gemm_sparse_universal.h"
+
+#include "cutlass/gemm/kernel/default_gemm_sparse_universal_with_absmax.h"
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+#include "cutlass/gemm/device/gemm_universal_base.h"
+
+#include "cutlass/layout/permute.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_ = ElementC_,
+    /// Operator class tag
+    typename OperatorClass_ = arch::OpClassTensorOp,
+    /// Tag indicating architecture to tune for.  This is the minimum SM that
+    /// supports the intended feature. The device kernel can be built
+    /// targeting any SM larger than this number.
+    typename ArchTag_ = arch::Sm80,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle<>,
+    /// Number of stages used in the pipelined mainloop
+    int Stages =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kStages,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentA,
+    /// Access granularity of B matrix in units of elements
+    int AlignmentB =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentB,
+    /// Operation performed by GEMM
+    typename Operator_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::Operator>
+class GemmSparseUniversalWithAbsmax :
+  public GemmUniversalBase<
+    typename kernel::DefaultGemmSparseUniversalWithAbsmax<
+      ElementA_,
+      LayoutA_,
+      AlignmentA,
+      ElementB_,
+      LayoutB_,
+      AlignmentB,
+      ElementC_,
+      LayoutC_,
+      ElementAccumulator_,
+      OperatorClass_,
+      ArchTag_,
+      ThreadblockShape_,
+      WarpShape_,
+      InstructionShape_,
+      EpilogueOutputOp_,
+      ThreadblockSwizzle_,
+      Stages,
+      Operator_
+    >::GemmKernel
+  > {
+
+ public:
+
+  static_assert((platform::is_same<LayoutC_, layout::RowMajor>::value),
+             "Epilogue of Ada sparse GEMM must be row major for now.");
+
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentB = AlignmentB;
+  static int const kAlignmentC = EpilogueOutputOp::kCount;
+  /// Base class type; repeats the instantiation used in the inheritance list above.
+  using Base = GemmUniversalBase<
+    typename kernel::DefaultGemmSparseUniversalWithAbsmax<
+      ElementA_,
+      LayoutA_,
+      AlignmentA,
+      ElementB_,
+      LayoutB_,
+      AlignmentB,
+      ElementC_,
+      LayoutC_,
+      ElementAccumulator_,
+      OperatorClass_,
+      ArchTag_,
+      ThreadblockShape_,
+      WarpShape_,
+      InstructionShape_,
+      EpilogueOutputOp_,
+      ThreadblockSwizzle_,
+      Stages,
+      Operator_
+    >::GemmKernel
+  >;
+
+  using Arguments = typename Base::Arguments;
+  using GemmKernel = typename Base::GemmKernel;
+  /// Element and layout types of operand E, taken from the kernel.
+  using ElementE = typename GemmKernel::ElementE;
+
+  using LayoutE = typename GemmKernel::LayoutE;
+  /// Alignment for E in elements: 128 bits divided by the bit width of ElementE.
+  static int const kAlignmentE = 128 / sizeof_bits<ElementE>::value;
+
+  static int const kSparse = GemmKernel::kSparse;
+  static int const kMetaSizeInBits = GemmKernel::kMetaSizeInBits;
+  static int const kElementsPerElementE = GemmKernel::kElementsPerElementE;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse_with_absmax.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse_with_absmax.h
new file mode 100755
index 000000000..e599217a1
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse_with_absmax.h
@@ -0,0 +1,360 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a sparse GEMM kernel that computes the absolute maximum of the output tensor
+    and applies additional scaling factors to operands.
+*/
+
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/device_kernel.h"
+
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/gemm/kernel/sparse_gemm.h"
+
+#include "cutlass/gemm/kernel/default_gemm_sparse_with_absmax.h"
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_ = ElementC_,
+    /// Operator class tag
+    typename OperatorClass_ = arch::OpClassSimt,
+    /// Tag indicating architecture to tune for
+    typename ArchTag_ = arch::Sm70,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_ =
+        typename threadblock::GemmIdentityThreadblockSwizzle<>,
+    /// Number of stages used in the pipelined mainloop
+    int Stages =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kStages,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentA,
+    /// Access granularity of B matrix in units of elements
+    int AlignmentB =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentB,
+    /// If true, kernel supports split-K with serial reduction
+    bool SplitKSerial = false,
+    /// Operation performed by GEMM
+    typename Operator_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::Operator>
+class SparseGemmWithAbsmax {
+ public:
+
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  using TensorRefA = TensorRef<ElementA const, LayoutA>;
+  using ElementB = ElementB_;
+  using LayoutB = LayoutB_;
+  using TensorRefB = TensorRef<ElementB const, LayoutB>;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using TensorRefC = TensorRef<ElementC const, LayoutC>;
+  using TensorRefD = TensorRef<ElementC, LayoutC>;
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  using MathOperator = Operator;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentB = AlignmentB;
+  static int const kAlignmentC = EpilogueOutputOp::kCount;
+  static bool const kSplitKSerial = SplitKSerial;
+  static ComplexTransform const kTransformA = ComplexTransform::kNone;
+  static ComplexTransform const kTransformB = ComplexTransform::kNone;
+
+  /// Define the kernel
+  using GemmKernel = typename kernel::DefaultSparseGemmWithAbsmax<
+    ElementA,
+    LayoutA,
+    kAlignmentA,
+    ElementB,
+    LayoutB,
+    kAlignmentB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    kStages,
+    kSplitKSerial,
+    Operator
+  >::GemmKernel;
+
+  using ElementE = typename GemmKernel::ElementE;
+
+  using LayoutE = typename GemmKernel::LayoutE;
+
+  static int const kAlignmentE = 128 / sizeof_bits<ElementE>::value;
+
+  static int const kSparse = GemmKernel::kSparse;
+  static int const kMetaSizeInBits = GemmKernel::kMetaSizeInBits;
+  static int const kElementsPerElementE = GemmKernel::kElementsPerElementE;
+
+  using Arguments = typename GemmKernel::Arguments;
+
+private:
+
+  /// Kernel parameters object
+  typename GemmKernel::Params params_;
+
+public:
+
+  /// Constructs the GEMM. The body is empty; params_ is populated later by initialize().
+  SparseGemmWithAbsmax() { }
+
+  /// Reports whether this GEMM instantiation can execute the problem in `args`.
+  ///
+  /// Returns Status::kErrorInvalidProblem when more than one split-K slice is
+  /// requested but the class was instantiated with SplitKSerial == false;
+  /// otherwise returns whatever GemmKernel::can_implement() reports.
+  static Status can_implement(Arguments const &args) {
+
+    bool const wants_split_k = args.split_k_slices > 1;
+    if (wants_split_k && !kSplitKSerial) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    // Defer to the kernel-level check and hand its status back unchanged.
+    return GemmKernel::can_implement(
+      args.problem_size,
+      args.ref_A.non_const_ref(),
+      args.ref_B.non_const_ref(),
+      args.ref_C.non_const_ref(),
+      args.ref_D,
+      args.ref_E.non_const_ref()
+    );
+  }
+
+  /// Returns the number of workspace bytes needed for the problem in `args`.
+  ///
+  /// The result is non-zero only for serial split-K with more than one slice:
+  /// one `int` per (m, n) entry of the tiled problem shape.
+  static size_t get_workspace_size(Arguments const &args) {
+
+    // Tile the problem with the same swizzle and split-K count as initialize().
+    ThreadblockSwizzle swizzle;
+
+    cutlass::gemm::GemmCoord tiles = swizzle.get_tiled_shape(
+      args.problem_size,
+      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+      args.split_k_slices);
+
+    bool const serial_split_k = kSplitKSerial && args.split_k_slices > 1;
+
+    return serial_split_k
+      ? sizeof(int) * size_t(tiles.m()) * size_t(tiles.n())
+      : size_t(0);
+  }
+
+  /// Initializes GEMM state from arguments (split-K validation, workspace clear, Params construction, shared-memory opt-in).
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+
+    // Determine grid shape
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord grid_shape = threadblock_swizzle.get_tiled_shape(
+      args.problem_size, 
+      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+      args.split_k_slices);
+
+    if (kSplitKSerial) {
+      if (args.split_k_slices > 1) {
+        if (!workspace) {
+          return Status::kErrorWorkspaceNull;  // serial split-K needs a caller-provided workspace
+        }
+        // Zero the workspace on `stream` before it is handed to the kernel as an `int *` below.
+        size_t bytes = get_workspace_size(args);
+      
+        cudaError_t result = cudaMemsetAsync(workspace, 0, bytes, stream);
+
+        if (result != cudaSuccess) {
+          return Status::kErrorInternal;
+        }
+      }
+    }
+    else {
+      // Without serial split-K support, any request for more than one slice is rejected.
+      if (args.split_k_slices > 1) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+
+    // Initialize the Params structure
+    params_ = typename GemmKernel::Params{
+      args.problem_size,
+      grid_shape,
+      args.ref_A.non_const_ref(),
+      args.ref_B.non_const_ref(),
+      args.ref_C.non_const_ref(),
+      args.ref_D,
+      args.ref_E.non_const_ref(),
+      args.ref_Aux,
+      args.ptr_Vector,
+      args.ldr,
+      args.epilogue,
+      static_cast<int *>(workspace)  // nullptr when the caller passed no workspace
+    };
+    // SharedStorage of 48 KB (48 << 10 bytes) or more needs the dynamic shared memory limit raised.
+    int smem_size = int(sizeof(typename GemmKernel::SharedStorage));
+    if (smem_size >= (48 << 10)) {
+      cudaError_t result = cudaFuncSetAttribute(Kernel<GemmKernel>,
+                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                    smem_size);
+
+      if (result != cudaSuccess) {
+        return Status::kErrorInternal;
+      }
+    }
+
+    return Status::kSuccess;
+  }
+
+  /// Lightweight update: rebinds the stored Params to the operand pointers in `args`.
+  ///
+  /// NOTE(review): ref_Aux, ptr_Vector and ldr, which initialize() also passes to
+  /// Params, are not refreshed here - confirm callers never need to change them.
+  Status update(Arguments const &args, void *workspace = nullptr) {
+    bool const needs_workspace = kSplitKSerial && args.split_k_slices > 1;
+    if (needs_workspace && !workspace) {
+      return Status::kErrorWorkspaceNull;
+    }
+
+    params_.ref_A.reset(args.ref_A.non_const_ref().data());
+    params_.ref_B.reset(args.ref_B.non_const_ref().data());
+    params_.ref_C.reset(args.ref_C.non_const_ref().data());
+    params_.ref_D.reset(args.ref_D.data());
+    params_.ref_E.reset(args.ref_E.non_const_ref().data());
+    params_.output_op = args.epilogue;
+    params_.semaphore = static_cast<int *>(workspace);
+    return Status::kSuccess;
+  }
+
+  /// Launches the kernel on `stream` with the Params stored by initialize()/update().
+  /// Returns Status::kErrorInternal if cudaGetLastError() reports an error afterwards.
+  Status run(cudaStream_t stream = nullptr) {
+    ThreadblockSwizzle swizzle;
+    // Grid comes from the stored tiled shape; blocks are 1-D with kThreadCount threads.
+    dim3 const grid = swizzle.get_grid_shape(params_.grid_tiled_shape);
+    dim3 const block(GemmKernel::kThreadCount, 1, 1);
+    int const smem_bytes = int(sizeof(typename GemmKernel::SharedStorage));
+
+    cutlass::arch::synclog_setup();
+    cutlass::Kernel<GemmKernel><<<grid, block, smem_bytes, stream>>>(params_);
+
+    if (cudaGetLastError() != cudaSuccess) {
+      return Status::kErrorInternal;
+    }
+    return Status::kSuccess;
+  }
+
+  /// Runs the kernel using initialized state; shorthand for run(stream).
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Initializes state from `args` (see initialize()) and, only if that succeeds, runs the kernel.
+  Status operator()(
+    Arguments const &args,
+    void *workspace = nullptr,
+    cudaStream_t stream = nullptr) {
+
+    Status const init_status = initialize(args, workspace, stream);
+    if (init_status != Status::kSuccess) {
+      // Initialization failed: report that status and skip the launch.
+      return init_status;
+    }
+
+    return run(stream);
+  }
+};
+
+} // namespace device
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse_with_visitor.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse_with_visitor.h
new file mode 100755
index 000000000..73edfa35d
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse_with_visitor.h
@@ -0,0 +1,342 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a device-level sparse GEMM whose epilogue is driven by visitor-style fusion callbacks. Does not compute batching or support split-K.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/device_kernel.h"
+
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/gemm/kernel/sparse_gemm.h"
+
+#include "cutlass/gemm/kernel/default_gemm_sparse_with_visitor.h"
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+
+#include "cutlass/epilogue/threadblock/fusion/visitor_2x.hpp"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/*! Sparse GEMM with visitor
+ */
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_ = ElementC_,
+    /// Operator class tag
+    typename OperatorClass_ = arch::OpClassSimt,
+    /// Tag indicating architecture to tune for
+    typename ArchTag_ = arch::Sm80,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::InstructionShape,
+    /// Epilogue fusion callbacks (visitor) type
+    typename FusionCallbacks_ =
+        typename cutlass::epilogue::threadblock::detail::EmptyCallbacks,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_ =
+        typename threadblock::GemmIdentityThreadblockSwizzle<>,
+    /// Number of stages used in the pipelined mainloop
+    int Stages =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kStages,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentA,
+    /// Access granularity of B matrix in units of elements
+    int AlignmentB =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentB,
+    /// Operation performed by GEMM
+    typename Operator_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::Operator,
+    /// Number of stages used in the pipelined epilogue
+    int EpilogueStages = 1>
+class SparseGemmWithVisitor {
+ public:
+
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  using TensorRefA = TensorRef<ElementA const, LayoutA>;
+  using ElementB = ElementB_;
+  using LayoutB = LayoutB_;
+  using TensorRefB = TensorRef<ElementB const, LayoutB>;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using FusionCallbacks = FusionCallbacks_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  using MathOperator = Operator;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentB = AlignmentB;
+
+  /// Define the kernel
+  using GemmKernel = typename kernel::DefaultSparseGemmWithVisitor<
+    ElementA,
+    LayoutA,
+    kAlignmentA,
+    ElementB,
+    LayoutB,
+    kAlignmentB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    FusionCallbacks,
+    ThreadblockSwizzle,
+    kStages,
+    Operator,
+    EpilogueStages
+  >::GemmKernel;
+
+  using ElementE = typename GemmKernel::ElementE;
+
+  using LayoutE = typename GemmKernel::LayoutE;
+
+  static int const kAlignmentE = 128 / sizeof_bits<ElementE>::value;
+
+  static int const kSparse = GemmKernel::kSparse;
+  static int const kMetaSizeInBits = GemmKernel::kMetaSizeInBits;
+  static int const kElementsPerElementE = GemmKernel::kElementsPerElementE;
+
+  /// Argument structure. It has no C/D tensor references, only the FusionCallbacks arguments in `epilogue`.
+  struct Arguments {
+
+    //
+    // Data members
+    //
+
+    GemmCoord problem_size;                        ///< GEMM problem size
+    TensorRef<ElementA const, LayoutA> ref_A;      ///< Read-only reference to operand A
+    TensorRef<ElementB const, LayoutB> ref_B;      ///< Read-only reference to operand B
+    TensorRef<ElementE const, LayoutE> ref_E;      ///< Read-only reference to operand E (presumably the sparsity metadata - TODO confirm)
+    typename FusionCallbacks::Arguments epilogue;  ///< Arguments for the epilogue fusion callbacks
+
+    //
+    // Methods
+    //
+
+    /// Default ctor: problem size (0, 0, 0); the remaining members are default-constructed
+    CUTLASS_HOST_DEVICE
+    Arguments(): problem_size(0, 0, 0) {
+
+    }
+
+    /// Constructs an Arguments structure; `ref_E_` is taken as a non-const TensorRef and stored as a const one
+    CUTLASS_HOST_DEVICE
+    Arguments(
+      GemmCoord problem_size_,
+      TensorRef<ElementA const, LayoutA> ref_A_,
+      TensorRef<ElementB const, LayoutB> ref_B_,
+      TensorRef<ElementE, LayoutE> ref_E_,
+      typename FusionCallbacks::Arguments epilogue_ = 
+        typename FusionCallbacks::Arguments()
+    ):
+      problem_size(problem_size_),
+      ref_A(ref_A_),
+      ref_B(ref_B_),
+      ref_E(ref_E_),
+      epilogue(epilogue_) {
+
+    }
+  };
+
+private:
+
+  /// Kernel parameters object
+  typename GemmKernel::Params params_;
+
+public:
+
+  /// Constructs the GEMM. The body is empty; params_ is populated later by initialize().
+  SparseGemmWithVisitor() { }
+
+  /// Reports whether the kernel can execute the problem described by `args`.
+  ///
+  /// Forwards to GemmKernel::can_implement() and returns its status unchanged.
+  static Status can_implement(Arguments const &args) {
+
+    // Arguments carries no C or D reference, so default-constructed (empty)
+    // TensorRefs are passed in those two positions.
+    using EmptyRef = cutlass::TensorRef<ElementC, LayoutC>;
+
+    return GemmKernel::can_implement(
+      args.problem_size,
+      args.ref_A.non_const_ref(),
+      args.ref_B.non_const_ref(),
+      EmptyRef(),
+      EmptyRef(),
+      args.ref_E.non_const_ref()
+    );
+  }
+
+  /// Gets the workspace size.
+  ///
+  /// Always zero: initialize() tiles the problem with a single K slice and
+  /// does not read its `workspace` argument, so `args` is unused here.
+  static size_t get_workspace_size(Arguments const &args) {
+    return size_t(0);
+  }
+
+  /// Builds the kernel Params from `args` and prepares the kernel for launch.
+  ///
+  /// `workspace` and `stream` are not used by this implementation. Returns
+  /// Status::kErrorInternal if cudaFuncSetAttribute() fails.
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+    // The problem is always tiled with a single K slice.
+    constexpr int kSlices = 1;
+
+    ThreadblockSwizzle swizzle;
+    cutlass::gemm::GemmCoord tiled_shape = swizzle.get_tiled_shape(
+      args.problem_size,
+      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+      kSlices);
+
+    params_ = typename GemmKernel::Params{
+      args.problem_size,
+      tiled_shape,
+      args.ref_A.non_const_ref(),
+      args.ref_B.non_const_ref(),
+      args.ref_E.non_const_ref(),
+      args.epilogue
+    };
+
+    // SharedStorage of 48 KB (48 << 10 bytes) or more requires raising the
+    // kernel's dynamic shared memory limit before it can be launched.
+    int const smem_bytes = int(sizeof(typename GemmKernel::SharedStorage));
+    bool const needs_opt_in = smem_bytes >= (48 << 10);
+    if (needs_opt_in &&
+        cudaFuncSetAttribute(Kernel<GemmKernel>,
+                             cudaFuncAttributeMaxDynamicSharedMemorySize,
+                             smem_bytes) != cudaSuccess) {
+      return Status::kErrorInternal;
+    }
+
+    return Status::kSuccess;
+  }
+
+  /// Lightweight update: rebinds the A, B and E data pointers and replaces the epilogue arguments.
+  Status update(Arguments const &args, void *workspace = nullptr) {
+    // `workspace` is not used; every other Params field keeps the value set by initialize().
+    params_.ref_A.reset(args.ref_A.non_const_ref().data());
+    params_.ref_B.reset(args.ref_B.non_const_ref().data());
+    params_.ref_E.reset(args.ref_E.non_const_ref().data());
+    params_.output_op = args.epilogue;  // NOTE(review): initialize() hands this to the Params constructor instead - confirm direct assignment is valid
+
+    return Status::kSuccess;
+  }
+
+  /// Launches the kernel on `stream` using the Params stored by initialize()/update().
+  /// Returns Status::kErrorInternal if cudaGetLastError() reports an error afterwards.
+  Status run(cudaStream_t stream = nullptr) {
+    ThreadblockSwizzle threadblock_swizzle;
+
+    dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
+    dim3 block(GemmKernel::kThreadCount, 1, 1);
+    int smem_size = int(sizeof(typename GemmKernel::SharedStorage));
+
+    // Set up synclog before the launch, matching SparseGemmWithAbsmax::run().
+    cutlass::arch::synclog_setup();
+    cutlass::Kernel<GemmKernel><<<grid, block, smem_size, stream>>>(params_);
+
+    cudaError_t result = cudaGetLastError();
+    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
+  }
+
+  /// Runs the kernel using initialized state; shorthand for run(stream).
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Initializes state from `args` (see initialize()) and, only if that succeeds, runs the kernel.
+  Status operator()(
+    Arguments const &args,
+    void *workspace = nullptr,
+    cudaStream_t stream = nullptr) {
+
+    Status const init_status = initialize(args, workspace, stream);
+    if (init_status != Status::kSuccess) {
+      // Initialization failed: report that status and skip the launch.
+      return init_status;
+    }
+
+    return run(stream);
+  }
+};
+
+} // namespace device
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_splitk_parallel.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_splitk_parallel.h
new file mode 100755
index 000000000..f78c5a216
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_splitk_parallel.h
@@ -0,0 +1,636 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for GEMM performing a reduction over K partitions in parallel.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/device_kernel.h"
+
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/gemm/kernel/gemm.h"
+
+#include "cutlass/gemm/kernel/default_gemm_splitk_parallel.h"
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+
+#include "cutlass/epilogue/thread/conversion_op.h"
+#include "cutlass/reduction/kernel/reduce_split_k.h"
+#include "cutlass/reduction/thread/reduction_operators.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace device {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/*! 
+  Gemm device-level operator performing parallel reduction over the K partition.
+
+*/
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_ = ElementC_,
+    /// Operator class tag
+    typename OperatorClass_ = arch::OpClassSimt,
+    /// Tag indicating architecture to tune for.  This is the minimum SM that
+      /// supports the intended feature. The device kernel can be built
+      /// targeting any SM larger than this number.
+    typename ArchTag_ = arch::Sm70,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::EpilogueOutputOp,
+    /// Conversion operator passed to the split-K GEMM kernel (EpilogueOutputOp_ is used by the reduction)
+    typename ConvertScaledOp_ = cutlass::epilogue::thread::Convert<
+        ElementAccumulator_,
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementAccumulator_,
+                                 ElementAccumulator_>::EpilogueOutputOp::kCount,
+        ElementAccumulator_>,
+    /// Reduction operator
+    typename ReductionOp_ = cutlass::reduction::thread::ReduceAdd<
+        ElementAccumulator_, typename EpilogueOutputOp_::ElementAccumulator,
+        EpilogueOutputOp_::kCount>,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_ =
+        threadblock::GemmSplitKHorizontalThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kStages,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentA,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentB,
+    /// Operation performed by GEMM
+    typename Operator_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::Operator>
+class GemmSplitKParallel {
+ public:
+
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  using ElementB = ElementB_;
+  using LayoutB = LayoutB_;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ConvertScaledOp = ConvertScaledOp_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ReductionOp = ReductionOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  static int const kStages = Stages;
+
+  /// GEMM kernel 
+  using GemmKernel = typename kernel::DefaultGemmSplitKParallel<
+    ElementA,
+    LayoutA,
+    kAlignmentA,
+    ElementB,
+    LayoutB,
+    kAlignmentB,
+    ElementAccumulator,
+    LayoutC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    ConvertScaledOp,
+    ThreadblockSwizzle,
+    kStages,
+    Operator
+  >::GemmKernel;
+
+  /// Reduction kernel
+  using ReductionKernel = cutlass::reduction::kernel::ReduceSplitK<
+    cutlass::MatrixShape<4, 32 * EpilogueOutputOp::kCount>,
+    EpilogueOutputOp,
+    ReductionOp
+  >;
+
+  //
+  //
+  //
+
+  /// Host-side argument bundle describing one split-K parallel GEMM problem.
+  struct Arguments {
+
+    //
+    // Data members
+    //
+
+    GemmCoord problem_size;                       ///< GEMM extents (m, n, k)
+    TensorRef<ElementA const, LayoutA> ref_A;     ///< A operand (read-only)
+    TensorRef<ElementB const, LayoutB> ref_B;     ///< B operand (read-only)
+    TensorRef<ElementC const, LayoutC> ref_C;     ///< C operand (read-only)
+    TensorRef<ElementC, LayoutC> ref_D;           ///< D operand (mutable)
+    typename EpilogueOutputOp::Params epilogue;   ///< Parameters of the epilogue output operator
+    int split_k_slices;                           ///< Requested number of K partitions
+    typename ConvertScaledOp::Params convert;     ///< Parameters of the conversion operator
+    typename ReductionOp::Params reduction;       ///< Parameters of the reduction operator
+
+    //
+    // Methods
+    //
+
+    /// Default ctor: one K slice (same default as the full constructor) instead of an indeterminate int.
+    CUTLASS_HOST_DEVICE
+    Arguments(): split_k_slices(1) { }
+
+    /// Constructs an Arguments structure from the problem size, operand references and operator parameters.
+    CUTLASS_HOST_DEVICE
+    Arguments(
+      GemmCoord problem_size_,
+      TensorRef<ElementA const, LayoutA> ref_A_,
+      TensorRef<ElementB const, LayoutB> ref_B_,
+      TensorRef<ElementC const, LayoutC> ref_C_,
+      TensorRef<ElementC, LayoutC> ref_D_,
+      typename EpilogueOutputOp::Params epilogue_ =
+        typename EpilogueOutputOp::Params(),
+      int split_k_slices_ = 1,
+      typename ConvertScaledOp::Params convert_ =
+        typename ConvertScaledOp::Params(),
+      typename ReductionOp::Params reduction_ =
+        typename ReductionOp::Params()
+    ):
+      problem_size(problem_size_),
+      ref_A(ref_A_),
+      ref_B(ref_B_),
+      ref_C(ref_C_),
+      ref_D(ref_D_),
+      epilogue(epilogue_),
+      split_k_slices(split_k_slices_),
+      convert(convert_),
+      reduction(reduction_) { }
+  };
+
+private:
+
+  /// Kernel parameters object
+  typename GemmKernel::Params gemm_params_;
+
+  /// Reduction kernel parameters object
+  typename ReductionKernel::Params reduction_params_;
+
+public:
+
+  /// Constructs the GEMM.
+  GemmSplitKParallel() { }
+
+  /// Reports whether the GEMM can execute the given problem; performs no checks on `args` and always returns kSuccess.
+  static Status can_implement(Arguments const &args) {
+    return Status::kSuccess;
+  }
+
+  /// Returns the number of workspace bytes needed: one m-by-n tile of ElementAccumulator_
+  /// for each unit of the tiled grid's K extent.
+  static size_t get_workspace_size(Arguments const &args) {
+    // Only the K extent of the tiled shape is needed below.
+    ThreadblockSwizzle swizzle;
+    cutlass::gemm::GemmCoord tiled = swizzle.get_tiled_shape(
+      args.problem_size,
+      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+      args.split_k_slices);
+    size_t const partition_bytes =
+      sizeof(ElementAccumulator_) * size_t(args.problem_size.m()) * size_t(args.problem_size.n());
+    return partition_bytes * tiled.k();
+  }
+
+  /// Builds the GEMM and reduction kernel parameters from `args`, viewing `workspace` as a
+  /// row-major ElementAccumulator_ tensor. Returns kErrorWorkspaceNull if `workspace` is null.
+  Status initialize(Arguments const &args, void *workspace) {
+    // Tiled grid extents; the K extent is later passed to the reduction parameters.
+    ThreadblockSwizzle swizzle;
+    cutlass::gemm::GemmCoord tiled_shape = swizzle.get_tiled_shape(
+      args.problem_size,
+      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+      args.split_k_slices);
+
+    // A null workspace is rejected before either parameter object is assigned.
+    if (!workspace) {
+      return Status::kErrorWorkspaceNull;
+    }
+
+    // View the workspace as row-major accumulators whose leading dimension is n.
+    TensorRef<ElementAccumulator_, layout::RowMajor> workspace_ref(
+      static_cast<ElementAccumulator_ *>(workspace),
+      args.problem_size.n());
+    // Number of elements in one m-by-n partial result.
+    int64_t tile_elements = int64_t(args.problem_size.m()) * int64_t(args.problem_size.n());
+
+    // GEMM kernel parameters: the workspace view is passed where the kernel's output goes.
+    gemm_params_ = typename GemmKernel::Params{
+      args.problem_size,
+      tiled_shape,
+      args.ref_A.non_const_ref(),
+      args.ref_B.non_const_ref(),
+      workspace_ref,
+      args.convert,
+      tile_elements
+    };
+
+    // Reduction kernel parameters: workspace view plus D, C and the epilogue parameters.
+    reduction_params_ = typename ReductionKernel::Params(
+      args.problem_size.mn(),
+      tiled_shape.k(),
+      tile_elements,
+      workspace_ref,
+      args.ref_D,
+      args.ref_C.non_const_ref(),
+      args.epilogue
+    );
+    return Status::kSuccess;
+  }
+
+  /// Lightweight update: rebinds the operand, workspace and output pointers without
+  /// recomputing the grid. Returns Status::kErrorWorkspaceNull when `workspace` is null.
+  Status update(Arguments const &args, void *workspace = nullptr) {
+    if (!workspace) {
+      return Status::kErrorWorkspaceNull;
+    }
+    // Same conversions as initialize(): mutable A/B/C pointers via non_const_ref(), and the
+    // untyped workspace cast to the accumulator type (void * does not convert implicitly).
+    gemm_params_.ref_A.reset(args.ref_A.non_const_ref().data());
+    gemm_params_.ref_B.reset(args.ref_B.non_const_ref().data());
+    gemm_params_.ref_D.reset(static_cast<ElementAccumulator_ *>(workspace));
+    // NOTE(review): the reduction's own reference to the workspace is not rebound here - confirm.
+    reduction_params_.ref_D.reset(args.ref_D.data());
+    reduction_params_.ref_C.reset(args.ref_C.non_const_ref().data());
+    return Status::kSuccess;
+  }
+
+  /// Launches the split-K GEMM kernel followed by the reduction kernel on `stream`, using
+  /// the state prepared by initialize(). Returns Status::kErrorInternal as soon as a CUDA
+  /// call reports an error; the reduction is not launched if the GEMM launch failed.
+  Status run(cudaStream_t stream = nullptr) {
+
+    //
+    // Launch GEMM kernel
+    //
+
+    ThreadblockSwizzle threadblock_swizzle;
+
+    dim3 grid = threadblock_swizzle.get_grid_shape(gemm_params_.grid_tiled_shape);
+    dim3 block(GemmKernel::kThreadCount, 1, 1);
+
+    cudaError_t result;
+
+    // Opt in to large dynamic shared memory once the request reaches 48 KiB.
+    int smem_size = int(sizeof(typename GemmKernel::SharedStorage));
+    if (smem_size >= (48 << 10)) {
+
+      result = cudaFuncSetAttribute(
+        Kernel<GemmKernel>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize,
+        smem_size);
+
+      if (result != cudaSuccess) {
+        return Status::kErrorInternal;
+      }
+    }
+
+    cutlass::arch::synclog_setup();
+    Kernel<GemmKernel><<<grid, block, smem_size, stream>>>(gemm_params_);
+
+    result = cudaGetLastError();
+
+    if (result != cudaSuccess) {
+      return Status::kErrorInternal;
+    }
+
+    //
+    // Launch reduction kernel
+    //
+
+    block = ReductionKernel::block_shape();
+    grid = ReductionKernel::grid_shape(gemm_params_.problem_size.mn());
+
+    Kernel<ReductionKernel><<< grid, block, 0, stream >>>(reduction_params_);
+
+    // Single check for the reduction launch; this method does not synchronize the stream.
+    result = cudaGetLastError();
+
+    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
+  }
+
+  /// Function-call operator: forwards to run(stream) using the already-initialized state.
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Convenience overload: initializes the operator from `args`/`workspace` and, when that
+  /// succeeds, runs both kernels on `stream`; otherwise returns the initialization status.
+  Status operator()(
+    Arguments const &args,
+    void *workspace = nullptr,
+    cudaStream_t stream = nullptr) {
+
+    Status const init_status = initialize(args, workspace);
+    if (init_status != Status::kSuccess) {
+      return init_status;
+    }
+
+    return run(stream);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for column-major output
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_,
+    /// Operator class tag
+    typename OperatorClass_,
+    /// Tag indicating architecture to tune for.  This is the minimum SM that
+      /// supports the intended feature. The device kernel can be built
+      /// targeting any SM larger than this number.
+    typename ArchTag_,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_,
+    /// Epilogue output operator
+    typename EpilogueOutputOp_,
+    /// Conversion operator passed to the split-K GEMM kernel (EpilogueOutputOp_ is used by the reduction)
+    typename ConvertScaledOp_,
+    /// Reduction operator
+    typename ReductionOp_,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_,
+    /// Number of stages used in the pipelined mainloop
+    int Stages, int kAlignmentA, int kAlignmentB,
+    /// Operation performed by GEMM
+    typename Operator_>
+class GemmSplitKParallel<ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_,
+                         layout::ColumnMajor, ElementAccumulator_,
+                         OperatorClass_, ArchTag_, ThreadblockShape_,
+                         WarpShape_, InstructionShape_, EpilogueOutputOp_,
+                         ConvertScaledOp_, ReductionOp_, ThreadblockSwizzle_,
+                         Stages, kAlignmentA, kAlignmentB, Operator_> {
+ public:
+
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  using ElementB = ElementB_;
+  using LayoutB = LayoutB_;
+  using ElementC = ElementC_;
+  using LayoutC = layout::ColumnMajor;
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ConvertScaledOp = ConvertScaledOp_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ReductionOp = ReductionOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  static int const kStages = Stages;
+
+  using UnderlyingOperator = GemmSplitKParallel< 
+    ElementB,
+    typename layout::LayoutTranspose<LayoutB>::type,
+    ElementA,
+    typename layout::LayoutTranspose<LayoutA>::type,
+    ElementC,
+    layout::RowMajor,    
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ConvertScaledOp,
+    ReductionOp,
+    ThreadblockSwizzle,
+    Stages,
+    kAlignmentA,
+    kAlignmentB,
+    Operator
+  >;
+
+  using UnderlyingArguments = typename UnderlyingOperator::Arguments;
+  using GemmKernel = typename UnderlyingOperator::GemmKernel;
+  using ReductionKernel = typename UnderlyingOperator::ReductionKernel;
+
+  /// Host-side argument bundle describing one split-K parallel GEMM problem (column-major C/D).
+  struct Arguments {
+
+    //
+    // Data members
+    //
+
+    GemmCoord problem_size;                       ///< GEMM extents (m, n, k)
+    TensorRef<ElementA const, LayoutA> ref_A;     ///< A operand (read-only)
+    TensorRef<ElementB const, LayoutB> ref_B;     ///< B operand (read-only)
+    TensorRef<ElementC const, LayoutC> ref_C;     ///< C operand (read-only)
+    TensorRef<ElementC, LayoutC> ref_D;           ///< D operand (mutable)
+    typename EpilogueOutputOp::Params epilogue;   ///< Parameters of the epilogue output operator
+    int split_k_slices;                           ///< Requested number of K partitions
+    typename ConvertScaledOp::Params convert;     ///< Parameters of the conversion operator
+    typename ReductionOp::Params reduction;       ///< Parameters of the reduction operator
+
+    //
+    // Methods
+    //
+
+    /// Default ctor: one K slice (same default as the full constructor) instead of an indeterminate int.
+    CUTLASS_HOST_DEVICE
+    Arguments(): split_k_slices(1) { }
+
+    /// Constructs an Arguments structure from the problem size, operand references and operator parameters.
+    CUTLASS_HOST_DEVICE
+    Arguments(
+      GemmCoord problem_size_,
+      TensorRef<ElementA const, LayoutA> ref_A_,
+      TensorRef<ElementB const, LayoutB> ref_B_,
+      TensorRef<ElementC const, LayoutC> ref_C_,
+      TensorRef<ElementC, LayoutC> ref_D_,
+      typename EpilogueOutputOp::Params epilogue_ =
+        typename EpilogueOutputOp::Params(),
+      int split_k_slices_ = 1,
+      typename ConvertScaledOp::Params convert_ =
+        typename ConvertScaledOp::Params(),
+      typename ReductionOp::Params reduction_ =
+        typename ReductionOp::Params()
+    ):
+      problem_size(problem_size_),
+      ref_A(ref_A_),
+      ref_B(ref_B_),
+      ref_C(ref_C_),
+      ref_D(ref_D_),
+      epilogue(epilogue_),
+      split_k_slices(split_k_slices_),
+      convert(convert_),
+      reduction(reduction_) { }
+  };
+
+private:
+
+  /// Kernel parameters object
+  UnderlyingOperator underlying_operator_;
+
+public:
+
+  /// Constructs the GEMM.
+  GemmSplitKParallel() { }
+
+  /// Helper to construct a transposed equivalent for the underlying GEMM operator
+  static UnderlyingArguments to_underlying_arguments(Arguments const &args) {
+    return UnderlyingArguments(
+      {args.problem_size.n(), args.problem_size.m(), args.problem_size.k()},  // m and n exchanged
+      {args.ref_B.data(), args.ref_B.stride(0)},  // B is passed in the A position
+      {args.ref_A.data(), args.ref_A.stride(0)},  // A is passed in the B position
+      {args.ref_C.data(), args.ref_C.stride(0)},  // C and D keep their pointer and stride
+      {args.ref_D.data(), args.ref_D.stride(0)},
+      args.epilogue,
+      args.split_k_slices,
+      args.convert,
+      args.reduction
+    );
+  }
+
+  /// Asks the underlying row-major operator whether the transposed-equivalent problem is supported.
+  static Status can_implement(Arguments const &args) {
+    UnderlyingArguments const transposed = to_underlying_arguments(args);
+    return UnderlyingOperator::can_implement(transposed);
+  }
+
+  /// Returns the workspace size reported by the underlying operator for the transposed-equivalent problem.
+  static size_t get_workspace_size(Arguments const &args) {
+    UnderlyingArguments const transposed = to_underlying_arguments(args);
+    return UnderlyingOperator::get_workspace_size(transposed);
+  }
+
+  /// Initializes the underlying operator with the transposed-equivalent arguments and `workspace`.
+  Status initialize(Arguments const &args, void *workspace) {
+    UnderlyingArguments const transposed = to_underlying_arguments(args);
+    return underlying_operator_.initialize(transposed, workspace);
+  }
+
+  /// Lightweight update: forwards the transposed-equivalent arguments and `workspace` to the underlying operator.
+  Status update(Arguments const &args, void *workspace = nullptr) {
+    UnderlyingArguments const transposed = to_underlying_arguments(args);
+    return underlying_operator_.update(transposed, workspace);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status run(cudaStream_t stream = nullptr) {
+    // Launches with whatever state the underlying row-major operator currently holds.
+    return underlying_operator_.run(stream);
+  }
+
+  /// Function-call operator: forwards to run(stream) using the already-initialized state.
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Convenience overload: initializes from `args`/`workspace`, then runs on `stream` if
+  /// initialization succeeded; otherwise returns the initialization status unchanged.
+  Status operator()(
+    Arguments const &args,
+    void *workspace = nullptr,
+    cudaStream_t stream = nullptr) {
+
+    // initialize() of this class takes (args, workspace) only; the stream is consumed by run().
+    Status status = initialize(args, workspace);
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+    return status;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal.h
new file mode 100755
index 000000000..55413b77a
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal.h
@@ -0,0 +1,442 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief
+*/
+
+#pragma once
+
+#include "cutlass/arch/mma.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/device_kernel.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/gemm/kernel/gemm_universal.h"
+
+#include "cutlass/gemm/kernel/default_gemm_universal.h"
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+#include "cutlass/gemm/device/gemm_universal_base.h"
+
+#include "cutlass/layout/permute.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/*! 
+  GemmUniversal is a stateful, reusable GEMM handle.  Once initialized for a given GEMM computation
+  (problem geometry and data references), it can be reused across different GEMM problems having the
+  same geometry.  (Once initialized, details regarding problem geometry and references to workspace
+  memory cannot be updated.)
+
+  The universal GEMM accommodates serial reductions, parallel reductions, batched strided, and 
+  batched array variants.
+*/
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_ = ElementC_,
+    /// Operator class tag
+    typename OperatorClass_ = arch::OpClassSimt,
+    /// Tag indicating architecture to tune for.  This is the minimum SM that
+    /// supports the intended feature. The device kernel can be built
+    /// targeting any SM larger than this number.
+    typename ArchTag_ = arch::Sm70,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle<>,
+    /// Number of stages used in the pipelined mainloop
+    int Stages =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kStages,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentA,
+    /// Access granularity of B matrix in units of elements
+    int AlignmentB =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentB,
+    /// Operation performed by GEMM
+    typename Operator_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::Operator,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA = ComplexTransform::kNone,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB = ComplexTransform::kNone,
+    /// Gather operand A by using an index array
+    bool GatherA = false,
+    /// Gather operand B by using an index array
+    bool GatherB = false,
+    /// Scatter result D by using an index array
+    bool ScatterD = false,
+    /// Permute result D
+    typename PermuteDLayout_ = layout::NoPermute,
+    /// Permute operand A
+    typename PermuteALayout_ = layout::NoPermute,
+    /// Permute operand B
+    typename PermuteBLayout_ = layout::NoPermute
+>
+class GemmUniversal : 
+  public GemmUniversalBase<
+    typename kernel::DefaultGemmUniversal<
+      ElementA_,
+      LayoutA_,
+      TransformA,
+      AlignmentA,
+      ElementB_,
+      LayoutB_,
+      TransformB,
+      AlignmentB,
+      ElementC_,
+      LayoutC_,
+      ElementAccumulator_,
+      OperatorClass_,
+      ArchTag_,
+      ThreadblockShape_,
+      WarpShape_,
+      InstructionShape_,
+      EpilogueOutputOp_,
+      ThreadblockSwizzle_,
+      Stages,
+      Operator_,
+      SharedMemoryClearOption::kNone,
+      GatherA,
+      GatherB,
+      ScatterD,
+      PermuteDLayout_,
+      PermuteALayout_,
+      PermuteBLayout_
+    >::GemmKernel
+  > {
+
+ public:
+
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  using PermuteDLayout = PermuteDLayout_;
+  using PermuteALayout = PermuteALayout_;
+  using PermuteBLayout = PermuteBLayout_;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentB = AlignmentB;
+  static int const kAlignmentC = EpilogueOutputOp::kCount;
+  static ComplexTransform const kTransformA = TransformA;
+  static ComplexTransform const kTransformB = TransformB;
+
+  using Base = GemmUniversalBase<
+    typename kernel::DefaultGemmUniversal<
+      ElementA_,
+      LayoutA_,
+      TransformA,
+      AlignmentA,
+      ElementB_,
+      LayoutB_,
+      TransformB,
+      AlignmentB,
+      ElementC_,
+      LayoutC_,
+      ElementAccumulator_,
+      OperatorClass_,
+      ArchTag_,
+      ThreadblockShape_,
+      WarpShape_,
+      InstructionShape_,
+      EpilogueOutputOp_,
+      ThreadblockSwizzle_,
+      Stages,
+      Operator_,
+      SharedMemoryClearOption::kNone,
+      GatherA,
+      GatherB,
+      ScatterD,
+      PermuteDLayout_,
+      PermuteALayout_,
+      PermuteBLayout_
+    >::GemmKernel
+  >;
+
+  using Arguments = typename Base::Arguments;
+  using GemmKernel = typename Base::GemmKernel;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for column-major output exchanges problem size and operand.
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_,
+    /// Operator class tag
+    typename OperatorClass_,
+    /// Tag indicating architecture to tune for.  This is the minimum SM that
+    /// supports the intended feature. The device kernel can be built
+    /// targeting any SM larger than this number.
+    typename ArchTag_,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_,
+    /// Epilogue output operator
+    typename EpilogueOutputOp_,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA,
+    /// Access granularity of B matrix in units of elements
+    int AlignmentB,
+    /// Operation performed by GEMM
+    typename Operator_,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB,
+    /// Gather operand A by using an index array
+    bool GatherA,
+    /// Gather operand B by using an index array
+    bool GatherB,
+    /// Scatter result D by using an index array
+    bool ScatterD,
+    /// Permute result D
+    typename PermuteDLayout_,
+    /// Permute operand A
+    typename PermuteALayout_,
+    /// Permute operand B
+    typename PermuteBLayout_
+>
+class GemmUniversal<ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_,
+           layout::ColumnMajor,  // partially specialized on LayoutC
+           ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_,
+           WarpShape_, InstructionShape_, EpilogueOutputOp_,
+           ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB,
+           Operator_, TransformA, TransformB, GatherA, GatherB, ScatterD,
+           PermuteDLayout_, PermuteALayout_, PermuteBLayout_> {
+ public:
+
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  using TensorRefA = TensorRef<ElementA const, LayoutA>;
+  using ElementB = ElementB_;
+  using LayoutB = LayoutB_;
+  using TensorRefB = TensorRef<ElementB const, LayoutB>;
+  using ElementC = ElementC_;
+  using LayoutC = layout::ColumnMajor;
+  using TensorRefC = TensorRef<ElementC const, LayoutC>;
+  using TensorRefD = TensorRef<ElementC, LayoutC>;
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  using PermuteDLayout = PermuteDLayout_;
+  using PermuteALayout = PermuteALayout_;
+  using PermuteBLayout = PermuteBLayout_;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentB = AlignmentB;
+  static ComplexTransform const kTransformA = TransformA;
+  static ComplexTransform const kTransformB = TransformB;
+
+  using UnderlyingOperator = typename GemmUniversal< 
+    ElementB,
+    typename layout::LayoutTranspose<LayoutB>::type,
+    ElementA,
+    typename layout::LayoutTranspose<LayoutA>::type,
+    ElementC,
+    layout::RowMajor,    
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    kAlignmentB,
+    kAlignmentA,
+    Operator,
+    kTransformB,
+    kTransformA,
+    GatherB,
+    GatherA,
+    ScatterD,
+    PermuteDLayout,
+    PermuteBLayout,
+    PermuteALayout
+  >::Base;
+
+  using GemmKernel = typename UnderlyingOperator::GemmKernel;
+  static int const kAlignmentC = EpilogueOutputOp::kCount;
+
+  /// Argument structure
+  using Arguments = typename UnderlyingOperator::Arguments;
+
+private:
+
+  UnderlyingOperator underlying_operator_;
+
+public:
+
+  /// Constructs the GEMM; the wrapped underlying operator member is default-initialized.
+  GemmUniversal() { }
+
+  /// Helper to construct a transposed equivalent for the underlying GEMM operator
+  static Arguments to_underlying_arguments(Arguments const &args) {
+    return args.transposed_problem();
+  }
+
+  /// Determines whether the GEMM can execute the given problem.
+  static Status can_implement(Arguments const &args) {
+    // The check is delegated to the underlying operator, applied to the transposed arguments.
+    return UnderlyingOperator::can_implement(to_underlying_arguments(args));
+  }
+
+  /// Gets the workspace size
+  static size_t get_workspace_size(Arguments const &args) {
+    // Sized by the underlying operator for the transposed arguments, matching what initialize() forwards.
+    return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args));
+  }
+
+  /// Computes the grid shape of the underlying operator for the transposed arguments
+  static dim3 get_grid_shape(Arguments const &args) { 
+    return UnderlyingOperator::get_grid_shape(to_underlying_arguments(args));
+  }
+
+  /// Computes the maximum number of active blocks per multiprocessor (forwarded to the underlying operator)
+  static int maximum_active_blocks(int smem_capacity = -1) {
+    return UnderlyingOperator::maximum_active_blocks(smem_capacity);
+  }
+
+  /// Initializes GEMM state from arguments.
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+    // `args` are transposed first; `workspace` and `stream` are passed through unchanged.
+    return underlying_operator_.initialize(to_underlying_arguments(args), workspace, stream);
+  }
+
+  /// Lightweight update given a subset of arguments
+  Status update(Arguments const &args, void *workspace = nullptr) {
+    // Applies the same transposition as initialize() before updating the underlying operator.
+    return underlying_operator_.update(to_underlying_arguments(args), workspace);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status run(cudaStream_t stream = nullptr) {
+    // No arguments are processed here; this only forwards to the underlying operator's run().
+    return underlying_operator_.run(stream);
+  }
+
+  /// Runs the kernel using initialized state; function-call shorthand for run(stream).
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Initializes state from `args`, then runs the kernel only if initialization succeeded.
+  Status operator()(
+    Arguments const &args, 
+    void *workspace = nullptr, 
+    cudaStream_t stream = nullptr) {
+    
+    Status status = initialize(args, workspace, stream);
+    
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+    // Either the initialize() failure or the run() result.
+    return status;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_adapter.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_adapter.h
new file mode 100755
index 000000000..73564d3c6
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_adapter.h
@@ -0,0 +1,693 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*!
+  \file
+  \brief The universal GEMM accommodates serial reductions, parallel reductions, batched strided, and
+    batched array variants.
+*/
+
+#pragma once
+
+// common
+#include "cutlass/cutlass.h"
+#include "cutlass/device_kernel.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/detail/layout.hpp"
+#include "cutlass/detail/mma.hpp"
+#include "cutlass/cuda_host_adapter.hpp"
+
+#include "cutlass/kernel_launch.h"
+#if !defined(__CUDACC_RTC__)
+#include "cutlass/cluster_launch.hpp"
+#include "cutlass/trace.h"
+#endif // !defined(__CUDACC_RTC__)
+
+// 2.x
+#include "cutlass/gemm/device/gemm_universal_base.h"
+#include "cutlass/gemm/kernel/gemm_transpose_operands.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/epilogue/threadblock/epilogue_with_visitor_callbacks.h"
+
+// 3.x
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::device {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/*!
+  GemmUniversalAdapter is a stateful, reusable GEMM handle built around a kernel
+  of type cutlass::gemm::kernel::Gemm or cutlass::gemm::kernel::GemmUniversal.
+
+  It manages the lifetime of the underlying `kernel::Params` struct, and exposes APIs
+  to create it from the host facing arguments. For power users, new static methods
+  are exposed in 3.x APIs that bypass the stateful methods or args->params lowering.
+
+  It supports kernel types that implement both the 2.x and 3.0 APIs,
+  however, this is done by specializing the implementation of GemmUniversalAdapter
+  on the two kernel API types, and thus, GemmUniversalAdapter's behaviour might
+  differ between the two specializations.
+*/
+template <class GemmKernel_, class Enable = void>
+class GemmUniversalAdapter;
+
+////////////////////////////////////////////////////////////////////////////////
+////////////////////////////// CUTLASS 3.x API /////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+
+template <class GemmKernel_>
+class GemmUniversalAdapter<
+  GemmKernel_,
+  cute::enable_if_t<gemm::detail::IsCutlass3GemmKernel<GemmKernel_>::value>>
+{
+public:
+  using GemmKernel = GemmKernel_;
+  using TileShape = typename GemmKernel::TileShape;
+  using ElementA = typename GemmKernel::ElementA;
+  using ElementB = typename GemmKernel::ElementB;
+  using ElementC = typename GemmKernel::ElementC;
+  using ElementD = typename GemmKernel::ElementD;
+  using ElementAccumulator = typename GemmKernel::ElementAccumulator;
+  using DispatchPolicy = typename GemmKernel::DispatchPolicy;
+  using CollectiveMainloop = typename GemmKernel::CollectiveMainloop;
+  using CollectiveEpilogue = typename GemmKernel::CollectiveEpilogue;
+
+  // Map back to 2.x type as best as possible
+  using LayoutA = gemm::detail::StrideToLayoutTagA_t<typename GemmKernel::StrideA>;
+  using LayoutB = gemm::detail::StrideToLayoutTagB_t<typename GemmKernel::StrideB>;
+  using LayoutC = gemm::detail::StrideToLayoutTagC_t<typename GemmKernel::StrideC>;
+  using LayoutD = gemm::detail::StrideToLayoutTagC_t<typename GemmKernel::StrideD>;
+
+  static bool const kEnableCudaHostAdapter = CUTLASS_ENABLE_CUDA_HOST_ADAPTER;
+
+  static ComplexTransform const kTransformA = cute::is_same_v<typename GemmKernel::CollectiveMainloop::TransformA, cute::conjugate> ?
+                                              ComplexTransform::kConjugate : ComplexTransform::kNone;
+  static ComplexTransform const kTransformB = cute::is_same_v<typename GemmKernel::CollectiveMainloop::TransformB, cute::conjugate> ?
+                                              ComplexTransform::kConjugate : ComplexTransform::kNone;
+
+  // Legacy: Assume MultiplyAdd only since we do not use this tag type in 3.0
+  using MathOperator = cutlass::arch::OpMultiplyAdd;
+
+  using OperatorClass = cutlass::detail::get_operator_class_t<typename CollectiveMainloop::TiledMma>;
+
+  using ArchTag = typename GemmKernel::ArchTag;
+
+  // NOTE: Assume identity swizzle for now
+  using ThreadblockSwizzle = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>;
+
+  // Assume TiledMma's ShapeMNK is the same as 2.x's ThreadblockShape
+  using ThreadblockShape = cutlass::gemm::GemmShape<
+      cute::size<0>(TileShape{}),
+      cute::size<1>(TileShape{}),
+      cute::size<2>(TileShape{})>;
+
+  using ClusterShape = cutlass::gemm::GemmShape<
+      cute::size<0>(typename GemmKernel::DispatchPolicy::ClusterShape{}),
+      cute::size<1>(typename GemmKernel::DispatchPolicy::ClusterShape{}),
+      cute::size<2>(typename GemmKernel::DispatchPolicy::ClusterShape{})>;
+
+  // Instruction shape is easy too, since we get that directly from our TiledMma's atom shape
+  using InstructionShape = cutlass::gemm::GemmShape<
+      cute::size<0>(typename CollectiveMainloop::TiledMma::AtomShape_MNK{}),
+      cute::size<1>(typename CollectiveMainloop::TiledMma::AtomShape_MNK{}),
+      cute::size<2>(typename CollectiveMainloop::TiledMma::AtomShape_MNK{})>;
+
+  // Legacy: provide a correct warp count, but no reliable warp shape
+  static int const kThreadCount = GemmKernel::MaxThreadsPerBlock;
+
+  // Warp shape is not a primary API type in 3.x
+  // But we can best approximate it by inspecting the TiledMma
+  // For this, we make the assumption that we always have 4 warps along M, and rest along N, none along K
+  // We also always round up the warp count to 4 if the tiled mma is smaller than 128 threads
+  static constexpr int WarpsInMma = cute::max(4, CUTE_STATIC_V(cute::size(typename GemmKernel::TiledMma{})) / 32);
+  static constexpr int WarpsInMmaM = 4;
+  static constexpr int WarpsInMmaN = cute::ceil_div(WarpsInMma, WarpsInMmaM);
+  using WarpCount = cutlass::gemm::GemmShape<WarpsInMmaM, WarpsInMmaN, 1>;
+  using WarpShape = cutlass::gemm::GemmShape<
+      CUTE_STATIC_V(cute::tile_size<0>(typename CollectiveMainloop::TiledMma{})) / WarpsInMmaM,
+      CUTE_STATIC_V(cute::tile_size<1>(typename CollectiveMainloop::TiledMma{})) / WarpsInMmaN,
+      CUTE_STATIC_V(cute::tile_size<2>(typename CollectiveMainloop::TiledMma{}))>;
+
+  static int constexpr kStages = CollectiveMainloop::DispatchPolicy::Stages;
+
+  // Inspect TiledCopy for A and B to compute the alignment size
+  static int constexpr kAlignmentA = cutlass::detail::get_alignment_count_from_gmem_tiled_copy<
+      typename CollectiveMainloop::GmemTiledCopyA, ElementA, typename CollectiveMainloop::TiledMma::ValTypeA>();
+  static int constexpr kAlignmentB = cutlass::detail::get_alignment_count_from_gmem_tiled_copy<
+      typename CollectiveMainloop::GmemTiledCopyB, ElementB, typename CollectiveMainloop::TiledMma::ValTypeB>();
+  static int constexpr kAlignmentC = cutlass::detail::get_alignment_count_from_gmem_tiled_copy<
+      typename CollectiveEpilogue::GmemTiledCopyC, ElementC>();
+  static int constexpr kAlignmentD = cutlass::detail::get_alignment_count_from_gmem_tiled_copy<
+      typename CollectiveEpilogue::GmemTiledCopyD, ElementD>();
+
+  using EpilogueOutputOp = typename CollectiveEpilogue::ThreadEpilogueOp;
+
+  // Split-K preserves splits that are 128b aligned
+  static int constexpr kSplitKAlignment = cute::max(
+      128 / sizeof_bits<ElementA>::value, 128 / sizeof_bits<ElementB>::value);
+
+  /// Argument structure: User API
+  using Arguments = typename GemmKernel::Arguments;
+  /// Argument structure: Kernel API
+  using Params = typename GemmKernel::Params;
+
+private:
+
+  /// Kernel API parameters object
+  Params params_;
+
+public:
+
+  /// Read-only access to the stored Params structure (assigned by initialize() and update())
+  Params const& params() const {
+    return params_;
+  }
+
+  /// Determines whether the GEMM can execute the given problem; maps GemmKernel::can_implement() to kSuccess or kInvalid.
+  static Status
+  can_implement(Arguments const& args) {
+    if (GemmKernel::can_implement(args)) {
+      return Status::kSuccess;
+    }
+    else {
+      return Status::kInvalid;
+    }
+  }
+
+  /// Gets the workspace size in bytes: the kernel's own requirement plus an extra term in split-K parallel mode.
+  static size_t
+  get_workspace_size(Arguments const& args) {
+    size_t workspace_bytes = 0;
+    if (args.mode == GemmUniversalMode::kGemmSplitKParallel) {
+      workspace_bytes += sizeof(int) * size_t(cute::size<0>(TileShape{})) * size_t(cute::size<1>(TileShape{}));
+    }
+    // Add whatever the kernel itself requests for these arguments.
+    workspace_bytes += GemmKernel::get_workspace_size(args);
+
+    CUTLASS_TRACE_HOST("  workspace_bytes: " << workspace_bytes);
+
+    return workspace_bytes;
+  }
+
+  /// Computes the grid shape from user arguments by first lowering them to a temporary Params object
+  static dim3
+  get_grid_shape(Arguments const& args, void* workspace = nullptr) {
+    auto tmp_params = GemmKernel::to_underlying_arguments(args, workspace);
+    return GemmKernel::get_grid_shape(tmp_params);
+  }
+
+  /// Computes the grid shape from an already-constructed Params object
+  static dim3
+  get_grid_shape(Params const& params) {
+    return GemmKernel::get_grid_shape(params);
+  }
+
+  /// Computes the maximum number of active blocks per multiprocessor, or returns -1 if a CUDA call fails. The argument is unused.
+  static int maximum_active_blocks(int /* smem_capacity */ = -1) {
+    CUTLASS_TRACE_HOST("GemmUniversal::maximum_active_blocks()");
+    int max_active_blocks = -1;
+    int smem_size = GemmKernel::SharedStorageSize;
+
+    // first, raise the kernel's dynamic smem limit when it needs 48 KiB (48 << 10 bytes) or more
+    cudaError_t result;
+    if (smem_size >= (48 << 10)) {
+      CUTLASS_TRACE_HOST("  Setting smem size to " << smem_size);
+      result = cudaFuncSetAttribute(
+          device_kernel<GemmKernel>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize,
+          smem_size);
+      if (cudaSuccess != result) {
+        result = cudaGetLastError(); // to clear the error bit
+        CUTLASS_TRACE_HOST(
+          "  cudaFuncSetAttribute() returned error: "
+          << cudaGetErrorString(result));
+        return -1;
+      }
+    }
+
+    // query occupancy after setting smem size
+    result = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &max_active_blocks,
+        device_kernel<GemmKernel>,
+        GemmKernel::MaxThreadsPerBlock,
+        smem_size);
+
+    if (cudaSuccess != result) {
+      result = cudaGetLastError(); // to clear the error bit
+      CUTLASS_TRACE_HOST(
+        "  cudaOccupancyMaxActiveBlocksPerMultiprocessor() returned error: "
+        << cudaGetErrorString(result));
+      return -1;
+    }
+
+    CUTLASS_TRACE_HOST("  max_active_blocks: " << max_active_blocks);
+    return max_active_blocks;
+  }
+
+  /// Initializes GEMM state from arguments: initializes the workspace, stores Params, then handles the smem attribute.
+  Status
+  initialize(
+    Arguments const& args,
+    void* workspace = nullptr,
+    cudaStream_t stream = nullptr,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+
+    CUTLASS_TRACE_HOST("GemmUniversal::initialize() - workspace "
+      << workspace << ", stream: " << (stream ? "non-null" : "null"));
+
+    // Initialize the workspace; a failure here is returned before params_ is touched
+    Status status = GemmKernel::initialize_workspace(args, workspace, stream, cuda_adapter);
+    if (status != Status::kSuccess) {
+      return status;
+    }
+    // Initialize the Params structure
+    params_ = GemmKernel::to_underlying_arguments(args, workspace);
+    // Don't set the function attributes - require the CudaHostAdapter to set it.
+    if constexpr (kEnableCudaHostAdapter) {
+      CUTLASS_ASSERT(cuda_adapter);
+      return Status::kSuccess;
+    }
+    else {
+      //
+      // Account for dynamic smem capacity if needed
+      //
+      int smem_size = GemmKernel::SharedStorageSize;
+
+      CUTLASS_ASSERT(cuda_adapter == nullptr);
+
+      if (smem_size >= (48 << 10)) {
+        CUTLASS_TRACE_HOST("  Setting smem size to " << smem_size);
+        cudaError_t result = cudaFuncSetAttribute(
+            device_kernel<GemmKernel>,
+            cudaFuncAttributeMaxDynamicSharedMemorySize,
+            smem_size);
+        if (cudaSuccess != result) {
+          result = cudaGetLastError(); // to clear the error bit
+          CUTLASS_TRACE_HOST("  cudaFuncSetAttribute() returned error: " << cudaGetErrorString(result));
+          return Status::kErrorInternal;
+        }
+      }
+    }
+    return Status::kSuccess;
+  }
+
+  /// Update API is preserved in 3.0, but does not guarantee a lightweight update of params.
+  Status
+  update(Arguments const& args, void* workspace = nullptr) {
+    CUTLASS_TRACE_HOST("GemmUniversal()::update() - workspace: " << workspace);
+
+    size_t workspace_bytes = get_workspace_size(args);
+    if (workspace_bytes > 0 && nullptr == workspace) {
+      return Status::kErrorWorkspaceNull;
+    }
+    // Rebuild Params from the arguments; unlike initialize(), the workspace is not re-initialized here.
+    params_ = GemmKernel::to_underlying_arguments(args, workspace);
+    return Status::kSuccess;
+  }
+
+  /// Primary static run() entry point: launches the kernel from caller-managed params; returns kSuccess only if the launch status and cudaGetLastError() are both clean.
+  /// The supplied params struct must be constructed by calling GemmKernel::to_underlying_arguments().
+  static Status
+  run(Params& params,
+      cudaStream_t stream = nullptr,
+      CudaHostAdapter *cuda_adapter = nullptr,
+      bool launch_with_pdl = false) {
+    CUTLASS_TRACE_HOST("GemmUniversal::run()");
+    dim3 const block = GemmKernel::get_block_shape();
+    dim3 const grid = get_grid_shape(params);
+
+    // configure smem size and carveout
+    int smem_size = GemmKernel::SharedStorageSize;
+    // Pessimistic default: a configuration that reaches no launch branch below must not be reported as success.
+    Status launch_result{ Status::kErrorInternal };
+    // Use extended launch API only for mainloops that use it
+    if constexpr (GemmKernel::ArchTag::kMinComputeCapability >= 90) {
+#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
+      CUTLASS_TRACE_HOST("GemmUniversal::run: Use extended launch API");
+#endif
+      [[maybe_unused]] constexpr bool is_static_1x1x1 =
+        cute::is_static_v<typename GemmKernel::DispatchPolicy::ClusterShape> and
+        cute::size(typename GemmKernel::DispatchPolicy::ClusterShape{}) == 1;
+      dim3 cluster(cute::size<0>(typename GemmKernel::DispatchPolicy::ClusterShape{}),
+                   cute::size<1>(typename GemmKernel::DispatchPolicy::ClusterShape{}),
+                   cute::size<2>(typename GemmKernel::DispatchPolicy::ClusterShape{}));
+      void* kernel_params[] = {&params};
+
+      if constexpr (kEnableCudaHostAdapter) {
+        //
+        // Use the cuda host adapter
+        //
+        CUTLASS_ASSERT(cuda_adapter);
+        if (cuda_adapter) {
+          if (launch_with_pdl) {
+            CUTLASS_TRACE_HOST(
+              "GemmUniversal::run() does not support launching with PDL and a custom cuda adapter.");
+            return Status::kErrorInternal;
+          }
+#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
+          CUTLASS_TRACE_HOST("GemmUniversal::run: Launching kernel with CUDA host adapter");
+#endif
+          launch_result = cuda_adapter->launch(grid,
+                                               cluster,
+                                               block,
+                                               smem_size,
+                                               stream,
+                                               kernel_params,
+                                               0);
+        }
+        else {
+          CUTLASS_TRACE_HOST("GemmUniversal::run: kEnableCudaHostAdapter is true, but CUDA host adapter is null");
+          return Status::kErrorInternal;
+        }
+      }
+      else {
+        CUTLASS_ASSERT(cuda_adapter == nullptr);
+        void const* kernel = (void const*) device_kernel<GemmKernel>;
+        if constexpr (GemmKernel::ArchTag::kMinComputeCapability == 90) {
+          if constexpr (is_static_1x1x1) {
+#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
+            CUTLASS_TRACE_HOST("GemmUniversal::run: Launching static 1x1x1 kernel");
+#endif
+            launch_result = cutlass::kernel_launch<GemmKernel>(
+              grid, block, smem_size, stream, params, launch_with_pdl);
+            if (launch_result != Status::kSuccess) {
+              CUTLASS_TRACE_HOST("GemmUniversal::run: cutlass::kernel_launch reports failure");
+            }
+#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
+            else {
+              CUTLASS_TRACE_HOST("GemmUniversal::run: cutlass::kernel_launch reports success");
+            }
+#endif
+          }
+          else {
+#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
+            CUTLASS_TRACE_HOST("GemmUniversal::run: Launching dynamic cluster kernel");
+#endif
+            launch_result = ClusterLauncher::launch(
+              grid, cluster, block, smem_size, stream, kernel, kernel_params, launch_with_pdl);
+          }
+        }
+      }
+    }
+    else {
+      launch_result = Status::kSuccess;
+      cutlass::arch::synclog_setup();
+
+      if constexpr (kEnableCudaHostAdapter) {
+        CUTLASS_ASSERT(cuda_adapter);
+        if (cuda_adapter) {
+          void* kernel_params[] = {&params};
+#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
+          CUTLASS_TRACE_HOST("GemmUniversal::run: Launching kernel with CUDA host adapter");
+#endif
+          launch_result = cuda_adapter->launch(
+            grid, block, smem_size, stream, kernel_params, 0
+          );
+
+        }
+        else {
+          CUTLASS_TRACE_HOST("GemmUniversal::run: CUDA host adapter is null");
+          return Status::kErrorInternal;
+        }
+      }
+      else {
+        CUTLASS_ASSERT(cuda_adapter == nullptr);
+#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
+        CUTLASS_TRACE_HOST("GemmUniversal::run: Launching kernel with cutlass::kernel_launch");
+#endif
+        launch_result = cutlass::kernel_launch<GemmKernel>(
+          grid, block, smem_size, stream, params, launch_with_pdl);
+        if (launch_result != Status::kSuccess) {
+          CUTLASS_TRACE_HOST("GemmUniversal::run: cutlass::kernel_launch reports failure");
+        }
+#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
+        else {
+          CUTLASS_TRACE_HOST("GemmUniversal::run: cutlass::kernel_launch reports success");
+        }
+#endif
+      }
+    }
+    // Success requires both a clean CUDA error state and a successful launch status.
+    cudaError_t result = cudaGetLastError();
+    if (cudaSuccess == result && Status::kSuccess == launch_result) {
+#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
+      CUTLASS_TRACE_HOST("GemmUniversal::run: cudaGetLastError reports success");
+#endif
+      return Status::kSuccess;
+    }
+    else {
+      CUTLASS_TRACE_HOST("  Kernel launch failed. Reason: " << result);
+      return Status::kErrorInternal;
+    }
+  }
+
+  //
+  // Non-static launch overloads that first create and set the internal params struct of this kernel handle.
+  //
+
+  /// Launches the kernel after first constructing Params internal state from supplied arguments.
+  Status
+  run(
+    Arguments const& args,
+    void* workspace = nullptr,
+    cudaStream_t stream = nullptr,
+    CudaHostAdapter *cuda_adapter = nullptr,
+    bool launch_with_pdl = false
+  ) {
+    Status status = initialize(args, workspace, stream, cuda_adapter);
+    // Launch only when initialize() succeeded; otherwise its status is returned as-is.
+    if (Status::kSuccess == status) {
+      status = run(params_, stream, cuda_adapter, launch_with_pdl);
+    }
+    return status;
+  }
+
+  /// Function-call form of run(args, ...): launches the kernel after first constructing Params internal state from supplied arguments.
+  Status
+  operator()(
+    Arguments const& args,
+    void* workspace = nullptr,
+    cudaStream_t stream = nullptr,
+    CudaHostAdapter *cuda_adapter = nullptr,
+    bool launch_with_pdl = false) {
+    return run(args, workspace, stream, cuda_adapter, launch_with_pdl);
+  }
+
+  /// Overload that allows a user to re-launch the same kernel with the stored params_, without updating them.
+  Status
+  run(
+    cudaStream_t stream = nullptr,
+    CudaHostAdapter *cuda_adapter = nullptr,
+    bool launch_with_pdl = false) {
+    return run(params_, stream, cuda_adapter, launch_with_pdl);
+  }
+
+  /// Function-call overload that re-launches the same kernel with the stored params_, without updating them.
+  Status
+  operator()(cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr, bool launch_with_pdl = false) {
+    return run(params_, stream, cuda_adapter, launch_with_pdl);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+////////////////////////////// CUTLASS 2.x API /////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+
+template <class GemmKernel_>
+class GemmUniversalAdapter<
+  GemmKernel_,
+  cute::enable_if_t<not gemm::detail::IsCutlass3GemmKernel<GemmKernel_>::value>>
+{
+public:
+
+  using GemmKernel = GemmKernel_;
+
+  static bool const kInternalTranspose =
+    !cutlass::epilogue::threadblock::detail::is_2x_evt_v<typename GemmKernel::Epilogue> &&  // 2.x EVT does not require internal transpose
+    cute::is_same<typename GemmKernel::LayoutC, cutlass::layout::RowMajor>::value;
+
+  using ThreadblockShape = typename GemmKernel::Mma::Shape;
+  using WarpShape = typename GemmKernel::WarpShape;
+  using InstructionShape = typename GemmKernel::InstructionShape;
+
+  // warp-level, arch-level (instruction), math operator
+  using WarpMmaOperator = typename GemmKernel::Mma::Policy::Operator;
+  using ArchMmaOperator = typename WarpMmaOperator::ArchMmaOperator;
+  using MathOperator = typename WarpMmaOperator::MathOperator;
+
+  // Operator class and arch tag extract bottom-up
+  // set it for top-level gemm device-level template
+  using OperatorClass = typename WarpMmaOperator::OperatorClass;
+  using ArchTag = typename WarpMmaOperator::ArchTag;
+
+  // Type, layout, and complex transform deliberately exchanged with B
+  using MapArguments = kernel::detail::MapArguments<
+    typename GemmKernel::ElementA,
+    typename GemmKernel::LayoutA,
+    GemmKernel::kTransformA,
+    GemmKernel::kAlignmentA,
+    typename GemmKernel::ElementB,
+    typename GemmKernel::LayoutB,
+    GemmKernel::kTransformB,
+    GemmKernel::kAlignmentB,
+    typename GemmKernel::LayoutC,
+    kInternalTranspose
+  >;
+
+  using ElementA = typename MapArguments::ElementA;
+  using LayoutA = typename MapArguments::LayoutA;
+  static ComplexTransform const kTransformA = MapArguments::kTransformA;
+  static int const kAlignmentA = MapArguments::kAlignmentA;
+
+  using ElementB = typename MapArguments::ElementB;
+  using LayoutB = typename MapArguments::LayoutB;
+  static ComplexTransform const kTransformB = MapArguments::kTransformB;
+  static int const kAlignmentB = MapArguments::kAlignmentB;
+
+  using ElementC = typename GemmKernel::ElementC;
+  using LayoutC = typename MapArguments::LayoutC;
+  static int const kAlignmentC = GemmKernel::kAlignmentC;
+
+  // C and D same type for 2.x kernel
+  using ElementD = ElementC;
+  using LayoutD = LayoutC;
+
+  using TensorRefA = TensorRef<ElementA const, LayoutA>;
+  using TensorRefB = TensorRef<ElementB const, LayoutB>;
+  using TensorRefC = TensorRef<ElementC const, LayoutC>;
+  using TensorRefD = TensorRef<ElementD, LayoutD>;
+
+  static int const kStages = GemmKernel::Mma::kStages;
+
+  using EpilogueOutputOp = typename GemmKernel::EpilogueOutputOp;
+  using ElementAccumulator = typename EpilogueOutputOp::ElementAccumulator;
+  using ThreadblockSwizzle = typename GemmKernel::ThreadblockSwizzle;
+  using UnderlyingOperator = GemmUniversalBase<GemmKernel>;
+  using Arguments = typename UnderlyingOperator::Arguments;
+
+private:
+
+  UnderlyingOperator underlying_operator_;
+
+public:
+
+  /// Constructs the GEMM; the wrapped underlying operator member is default-initialized.
+  GemmUniversalAdapter() { }
+
+  /// Helper to construct a transposed equivalent for the underlying GEMM operator; returns `args` unchanged unless kInternalTranspose is set.
+  static Arguments to_underlying_arguments(Arguments const &args) {
+    if (kInternalTranspose) {
+      return args.transposed_problem();
+    }
+    else {
+      return args;
+    }
+  }
+
+  /// Determines whether the GEMM can execute the given problem.
+  static Status can_implement(Arguments const &args, CudaHostAdapter *cuda_adapter = nullptr) {
+    // The check is delegated to the underlying operator, applied to the (possibly transposed) arguments.
+    return UnderlyingOperator::can_implement(to_underlying_arguments(args), cuda_adapter);
+  }
+
+  /// Gets the workspace size
+  static size_t get_workspace_size(Arguments const &args, CudaHostAdapter *cuda_adapter = nullptr) {
+    // Sized by the underlying operator for the (possibly transposed) arguments, matching what initialize() forwards.
+    return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args), cuda_adapter);
+  }
+
+  /// Computes the grid shape of the underlying operator for the (possibly transposed) arguments
+  static dim3 get_grid_shape(Arguments const &args) {
+    return UnderlyingOperator::get_grid_shape(to_underlying_arguments(args));
+  }
+
+  /// Computes the maximum number of active blocks per multiprocessor (forwarded to the underlying operator)
+  static int maximum_active_blocks(int smem_capacity = -1) {
+    return UnderlyingOperator::maximum_active_blocks(smem_capacity);
+  }
+
+  /// Initializes GEMM state from arguments.
+  Status initialize(
+    Arguments const &args,
+    void *workspace = nullptr,
+    cudaStream_t stream = nullptr,
+    CudaHostAdapter *cuda_adapter = nullptr
+  ) {
+    // Only `args` is mapped (possibly transposed); workspace, stream and adapter are passed through unchanged.
+    return underlying_operator_.initialize(to_underlying_arguments(args), workspace, stream, cuda_adapter);
+  }
+
+  /// Lightweight update given a subset of arguments.
+  Status update(Arguments const &args) {
+    // Applies the same argument mapping as initialize() before updating the underlying operator.
+    return underlying_operator_.update(to_underlying_arguments(args));
+  }
+
+  /// Runs the kernel using initialized state.
+  Status run(
+    cudaStream_t stream = nullptr,
+    CudaHostAdapter *cuda_adapter = nullptr) {
+    // Forwards both the stream and the optional host adapter to the underlying operator.
+    return underlying_operator_.run(stream, cuda_adapter);
+  }
+
+  /// Runs the kernel using initialized state; function-call shorthand for run(stream, cuda_adapter).
+  Status operator()(
+    cudaStream_t stream = nullptr,
+    CudaHostAdapter *cuda_adapter = nullptr) {
+    // Forward the host adapter too; passing only `stream` would make run() fall back to its nullptr default.
+    return run(stream, cuda_adapter);
+  }
+
+  /// Initializes state from `args`, then runs the kernel only if initialization succeeded.
+  Status operator()(
+    Arguments const &args,
+    void *workspace = nullptr,
+    cudaStream_t stream = nullptr,
+    CudaHostAdapter *cuda_adapter = nullptr) {
+
+    Status status = initialize(args, workspace, stream, cuda_adapter);
+
+    if (status == Status::kSuccess) {
+      status = run(stream, cuda_adapter);
+    }
+    // Either the initialize() failure or the run() result.
+    return status;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::gemm::device
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_base.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_base.h
new file mode 100755
index 000000000..e23191eae
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_base.h
@@ -0,0 +1,522 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*!
+  \file
+  \brief The universal GEMM accommodates streamk, batched strided, and batched array variants.
+*/
+
+#pragma once
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/limits>
+#else
+#include <limits>
+#endif
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/device_kernel.h"
+#include "cutlass/cuda_host_adapter.hpp"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/kernel/gemm_universal.h"
+
+#include "cutlass/gemm/kernel/default_gemm_universal.h"
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+
+#include "cutlass/trace.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+template <typename GemmKernel_>
+class GemmUniversalBase {
+public:
+
+  using GemmKernel = GemmKernel_;
+
+  /// Boolean indicating whether the CudaHostAdapter is enabled
+  static bool const kEnableCudaHostAdapter = CUTLASS_ENABLE_CUDA_HOST_ADAPTER;
+
+  using ThreadblockShape = typename GemmKernel::Mma::Shape;
+
+  using ElementA = typename GemmKernel::ElementA;
+  using LayoutA = typename GemmKernel::LayoutA;
+  using TensorRefA = TensorRef<ElementA const, LayoutA>;
+  static ComplexTransform const kTransformA = GemmKernel::kTransformA;
+
+  using ElementB = typename GemmKernel::ElementB;
+  using LayoutB = typename GemmKernel::LayoutB;
+  using TensorRefB = TensorRef<ElementB const, LayoutB>;
+  static ComplexTransform const kTransformB = GemmKernel::kTransformB;
+
+  using ElementC = typename GemmKernel::ElementC;
+  using LayoutC = typename GemmKernel::LayoutC;
+  using TensorRefC = TensorRef<ElementC const, LayoutC>;
+  using TensorRefD = TensorRef<ElementC, LayoutC>;
+
+  /// Numerical accumulation element type
+  using ElementAccumulator = typename GemmKernel::Mma::ElementC;
+
+  using EpilogueOutputOp = typename GemmKernel::EpilogueOutputOp;
+  using ThreadblockSwizzle = typename GemmKernel::ThreadblockSwizzle;
+  using Operator = typename GemmKernel::Operator;
+
+  /// Argument structure
+  using Arguments = typename GemmKernel::Arguments;
+
+
+  /// Index of the GEMM Kernel within the CudaHostAdapter
+  static int32_t const kGemmKernelIndex = 0;
+
+  /// Kernel dynamic shared memory allocation requirement
+  /// Update the kernel function's shared memory configuration for the current device
+  static constexpr size_t kSharedStorageSize = sizeof(typename GemmKernel::SharedStorage);
+
+protected:
+
+  //
+  // Device properties (uniform across all instances of the current thread)
+  //
+
+  // Device ordinal
+  CUTLASS_THREAD_LOCAL static int device_ordinal_;
+
+  /// Device SM count
+  CUTLASS_THREAD_LOCAL static int device_sms_;
+
+  /// Kernel SM occupancy (in thread blocks)
+  CUTLASS_THREAD_LOCAL static int sm_occupancy_;
+
+protected:
+
+  /// Refreshes the thread-local device properties (SM count and kernel occupancy)
+  /// when the calling thread's current CUDA device differs from the cached ordinal.
+  /// Returns Status::kErrorInternal if any CUDA runtime call fails.
+  static Status init_device_props()
+  {
+    CUTLASS_TRACE_HOST("GemmUniversalBase::init_device_props()");
+
+    // Ask the runtime which device is current for this thread
+    int ordinal;
+    cudaError_t err = cudaGetDevice(&ordinal);
+    if (err != cudaSuccess) {
+      CUTLASS_TRACE_HOST("  cudaGetDevice() returned error " << cudaGetErrorString(err));
+      return Status::kErrorInternal;
+    }
+
+    // The cached members already describe this device: nothing to refresh
+    if (device_ordinal_ == ordinal) {
+      return Status::kSuccess;
+    }
+
+    // Refresh the cached SM count
+    err = cudaDeviceGetAttribute(&device_sms_, cudaDevAttrMultiProcessorCount, ordinal);
+    if (err != cudaSuccess) {
+      CUTLASS_TRACE_HOST("  cudaDeviceGetAttribute() returned error " << cudaGetErrorString(err));
+      return Status::kErrorInternal;
+    }
+
+    // Kernels whose shared storage reaches 48KB are configured for extended,
+    // dynamic shared memory
+    if constexpr (kSharedStorageSize >= (48 << 10))
+    {
+      err = cudaFuncSetAttribute(
+        Kernel2<GemmKernel>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize,
+        kSharedStorageSize);
+      if (err != cudaSuccess) {
+        CUTLASS_TRACE_HOST("  cudaFuncSetAttribute() returned error " << cudaGetErrorString(err));
+        return Status::kErrorInternal;
+      }
+    }
+
+    // Refresh the cached kernel occupancy (in thread blocks)
+    err = cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+      &sm_occupancy_,
+      Kernel2<GemmKernel>,
+      GemmKernel::kThreadCount,
+      kSharedStorageSize,
+      cudaOccupancyDisableCachingOverride);
+    if (err != cudaSuccess) {
+      CUTLASS_TRACE_HOST("  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags() returned error " << cudaGetErrorString(err));
+      return Status::kErrorInternal;
+    }
+
+    // Record the ordinal last, so that a failure above leaves the cached ordinal
+    // unchanged and the next call repeats the queries
+    device_ordinal_ = ordinal;
+
+    CUTLASS_TRACE_HOST("  "
+      "device_ordinal: (" << device_ordinal_ << "), "
+      "device_sms: (" << device_sms_ << "), "
+      "sm_occupancy: (" << sm_occupancy_ << ") "
+      "smem_size: (" << kSharedStorageSize << ") "
+      "GemmKernel::kThreadCount: (" << GemmKernel::kThreadCount << ")");
+
+    return Status::kSuccess;
+  }
+
+
+protected:
+
+  //
+  // Instance data members
+  //
+
+  /// Kernel parameters
+  typename GemmKernel::Params params_;
+
+
+  /// Initialize params member from `args` plus the device SM count and kernel occupancy.
+  ///
+  /// With the CudaHostAdapter enabled, both figures are obtained from
+  /// cuda_adapter->query_occupancy(); otherwise they are read from the thread-local
+  /// members refreshed by init_device_props().
+  Status init_params(Arguments const &args, CudaHostAdapter *cuda_adapter = nullptr)
+  {
+    int32_t sm_count = 0;
+    int32_t blocks_per_sm = 0;
+
+    if constexpr (kEnableCudaHostAdapter) {
+      CUTLASS_ASSERT(cuda_adapter);
+
+      // Without an adapter there is nothing to query
+      if (!cuda_adapter) {
+        return Status::kErrorInternal;
+      }
+
+      //
+      // Occupancy query using CudaHostAdapter::query_occupancy().
+      //
+
+      Status const query_status = cuda_adapter->query_occupancy(
+        &sm_count,
+        &blocks_per_sm,
+        kGemmKernelIndex,
+        GemmKernel::kThreadCount,
+        kSharedStorageSize);
+
+      CUTLASS_ASSERT(query_status == Status::kSuccess);
+
+      if (query_status != Status::kSuccess) {
+        return query_status;
+      }
+    }
+    else {
+      CUTLASS_ASSERT(cuda_adapter == nullptr);
+
+      // Initialize static device properties, if necessary
+      Status const props_status = init_device_props();
+
+      if (props_status != Status::kSuccess) {
+        return props_status;
+      }
+
+      // Use the thread-local static members for the occupancy figures; they were
+      // initialized by the call to `init_device_props()` above
+      sm_count      = device_sms_;
+      blocks_per_sm = sm_occupancy_;
+    }
+
+    // Initialize params member
+    params_ = typename GemmKernel::Params(args, sm_count, blocks_per_sm);
+    return Status::kSuccess;
+  }
+
+public:
+
+  //---------------------------------------------------------------------------------------------
+  // Stateless API
+  //---------------------------------------------------------------------------------------------
+
+  /// Determines whether the GEMM can execute the given problem: the launch grid must
+  /// respect the limits checked here and GemmKernel::can_implement() must accept `args`.
+  static Status can_implement(Arguments const &args, CudaHostAdapter *cuda_adapter = nullptr)
+  {
+    CUTLASS_TRACE_HOST("GemmUniversalBase::can_implement()");
+
+    if (kEnableCudaHostAdapter && !cuda_adapter) {
+      // With a null host adapter, a conservative grid shape is computed and required to
+      // conform to CUDA grid dimension limits.
+      int64_t const tiles_m = (int64_t(args.problem_size.m()) + ThreadblockShape::kM - 1) / ThreadblockShape::kM;
+      int64_t const tiles_n = (int64_t(args.problem_size.n()) + ThreadblockShape::kN - 1) / ThreadblockShape::kN;
+      int32_t const batches = args.batch_count;
+
+      bool const exceeds_limits =
+        (tiles_m > int64_t(std::numeric_limits<uint32_t>::max())) ||
+        (tiles_n > int64_t(std::numeric_limits<uint16_t>::max())) ||
+        (batches > int32_t(std::numeric_limits<uint16_t>::max()));
+
+      if (exceeds_limits) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+    else {
+      // Otherwise the grid shape is taken from get_grid_shape()
+      dim3 const grid = get_grid_shape(args, cuda_adapter);
+
+      bool const y_fits = grid.y <= std::numeric_limits<uint16_t>::max();
+      bool const z_fits = grid.z <= std::numeric_limits<uint16_t>::max();
+      if (!(y_fits && z_fits)) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+
+    // Grid limits are satisfied; the kernel decides the rest
+    return GemmKernel::can_implement(args);
+  }
+
+
+  /// Returns the workspace size (in bytes) needed for the problem geometry expressed
+  /// by these arguments, or 0 when the kernel parameters cannot be initialized
+  static size_t get_workspace_size(Arguments const &args, CudaHostAdapter *cuda_adapter = nullptr)
+  {
+    CUTLASS_TRACE_HOST("GemmUniversalBase::get_workspace_size()");
+
+    // A temporary operator instance holds the parameters derived from args
+    GemmUniversalBase scratch;
+    Status const init_status = scratch.init_params(args, cuda_adapter);
+    if (init_status != Status::kSuccess) {
+      return 0;
+    }
+
+    // The parameters object reports the workspace requirement
+    size_t const bytes = scratch.params_.get_workspace_size();
+    CUTLASS_TRACE_HOST("  workspace_bytes: " << bytes);
+    return bytes;
+  }
+
+
+  /// Returns the grid extents in thread blocks to launch, or dim3(0,0,0) when the
+  /// kernel parameters cannot be initialized from `args`
+  static dim3 get_grid_shape(Arguments const &args, CudaHostAdapter *cuda_adapter = nullptr)
+  {
+    CUTLASS_TRACE_HOST("GemmUniversalBase::get_grid_shape()");
+
+    // A temporary operator instance holds the parameters derived from args
+    GemmUniversalBase scratch;
+    Status const init_status = scratch.init_params(args, cuda_adapter);
+    if (init_status != Status::kSuccess) {
+      return dim3(0,0,0);
+    }
+
+    dim3 launch_grid = scratch.params_.get_grid_dims();
+
+    CUTLASS_TRACE_HOST(
+         "  tiled_shape: " << scratch.params_.get_tiled_shape()  << "\n"
+      << "  grid_dims: {" << launch_grid << "}");
+    return launch_grid;
+  }
+
+
+  /// Returns the maximum number of active thread blocks per multiprocessor, or -1 if
+  /// the occupancy query fails (or a required CudaHostAdapter is missing).
+  static int maximum_active_blocks(CudaHostAdapter *cuda_adapter = nullptr)
+  {
+    CUTLASS_TRACE_HOST("GemmUniversalBase::maximum_active_blocks()");
+
+    int32_t device_sms   = 0;
+    int32_t sm_occupancy = 0;
+
+    if constexpr (kEnableCudaHostAdapter) {
+      CUTLASS_ASSERT(cuda_adapter);
+
+      if (cuda_adapter) {
+        Status status = cuda_adapter->query_occupancy(
+          &device_sms,
+          &sm_occupancy,
+          kGemmKernelIndex,
+          GemmKernel::kThreadCount,
+          kSharedStorageSize);
+
+        CUTLASS_ASSERT(status == Status::kSuccess);
+
+        if (status != Status::kSuccess) {
+          return -1;
+        }
+      }
+      else {
+        return -1;
+      }
+    }
+    else {
+      CUTLASS_ASSERT(cuda_adapter == nullptr);
+      // Initialize static device properties, if necessary
+      if (init_device_props() != Status::kSuccess) {
+        return -1;
+      }
+
+      sm_occupancy = sm_occupancy_;
+    }
+
+    // Trace the returned value: sm_occupancy_ is not updated on the adapter path
+    CUTLASS_TRACE_HOST("  max_active_blocks: " << sm_occupancy);
+    return sm_occupancy;
+  }
+
+
+  //---------------------------------------------------------------------------------------------
+  // Stateful API
+  //---------------------------------------------------------------------------------------------
+
+  /// Initializes GEMM state (kernel parameters and, in kGemm mode, workspace) from arguments
+  Status initialize(
+    Arguments const &args,
+    void *workspace = nullptr,
+    cudaStream_t stream = nullptr,
+    CudaHostAdapter *cuda_adapter = nullptr)
+  {
+    CUTLASS_TRACE_HOST("GemmUniversalBase::initialize() - workspace "
+      << workspace << ", stream: " << (stream ? "non-null" : "null"));
+
+    // Derive the kernel parameters from args
+    Status const params_status = init_params(args, cuda_adapter);
+    if (params_status != Status::kSuccess) {
+      return params_status;
+    }
+
+    // Only GemmUniversalMode::kGemm assigns and prepares workspace memory
+    if (args.mode != GemmUniversalMode::kGemm) {
+      return Status::kSuccess;
+    }
+
+    return params_.init_workspace(workspace, stream);
+  }
+
+
+  /// Lightweight update given a subset of arguments: forwards `args` to `params_.update()`.
+  Status update(Arguments const &args)
+  {
+    CUTLASS_TRACE_HOST("GemmUniversalBase::update()");
+    params_.update(args);
+    return Status::kSuccess;
+  }
+
+  /// Runs the kernel using initialized state.
+  ///
+  /// Launches through `cuda_adapter` when the CudaHostAdapter is enabled, otherwise directly.
+  Status run(cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr)
+  {
+    CUTLASS_TRACE_HOST("GemmUniversalBase::run()");
+
+    // Launch configuration: block size from the kernel, grid from the parameters
+    dim3 threads(GemmKernel::kThreadCount, 1, 1);
+    dim3 blocks = params_.get_grid_dims();
+
+    CUTLASS_TRACE_HOST("  "
+      "grid: (" << blocks << "), "
+      "block: (" << threads << "), "
+      "SMEM: (" << kSharedStorageSize << ")");
+
+    cutlass::arch::synclog_setup();
+
+    if constexpr (kEnableCudaHostAdapter) {
+      CUTLASS_ASSERT(cuda_adapter);
+      if (!cuda_adapter) {
+        return Status::kErrorInternal;
+      }
+      // The params structure is the only kernel argument
+      void* kernel_args[] = {&params_};
+      return cuda_adapter->launch(blocks, threads, kSharedStorageSize, stream, kernel_args, 0);
+    }
+    else {
+      CUTLASS_ASSERT(cuda_adapter == nullptr);
+
+      Kernel2<GemmKernel><<<blocks, threads, kSharedStorageSize, stream>>>(params_);
+
+      // Check whether the launch itself reported an error
+      cudaError_t const launch_err = cudaGetLastError();
+      if (launch_err != cudaSuccess) {
+        CUTLASS_TRACE_HOST("  grid launch failed with error " << cudaGetErrorString(launch_err));
+        return Status::kErrorInternal;
+      }
+    }
+
+    return Status::kSuccess;
+  }
+
+
+  /// Runs the kernel using initialized state; forwards both arguments to run().
+  Status operator()(cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr)
+  {
+    return run(stream, cuda_adapter);
+  }
+
+
+  /// Initializes from `args` and workspace, then runs the kernel if initialization succeeded.
+  Status operator()(
+    Arguments const &args,
+    void *workspace = nullptr,
+    cudaStream_t stream = nullptr,
+    CudaHostAdapter *cuda_adapter = nullptr)
+  {
+    Status const init_status = initialize(args, workspace, stream, cuda_adapter);
+    if (init_status != Status::kSuccess) {
+      // Report the initialization failure without launching
+      return init_status;
+    }
+
+    return run(stream, cuda_adapter);
+  }
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Static initializers
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Device ordinal the cached properties belong to (starts at -1; set by init_device_props())
+template <typename GemmKernel_>
+CUTLASS_THREAD_LOCAL int GemmUniversalBase<GemmKernel_>::device_ordinal_ = -1;
+
+/// Device SM count (starts at -1; set by init_device_props())
+template <typename GemmKernel_>
+CUTLASS_THREAD_LOCAL int GemmUniversalBase<GemmKernel_>::device_sms_ = -1;
+
+/// Kernel SM occupancy in thread blocks (starts at -1; set by init_device_props())
+template <typename GemmKernel_>
+CUTLASS_THREAD_LOCAL int GemmUniversalBase<GemmKernel_>::sm_occupancy_ = -1;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_streamk_with_broadcast.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_streamk_with_broadcast.h
new file mode 100755
index 000000000..7ef581ac9
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_streamk_with_broadcast.h
@@ -0,0 +1,386 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Template for a Stream-K GEMM kernel that can broadcast bias vector in the
+           epilogue.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/epilogue/thread/linear_combination_bias_elementwise.h"
+#include "cutlass/device_kernel.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/gemm/kernel/gemm_universal.h"
+
+#include "cutlass/gemm/kernel/default_gemm_universal.h"
+#include "cutlass/gemm/kernel/default_gemm_streamk_with_broadcast.h"
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+#include "cutlass/gemm/device/gemm_universal_base.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/*!
+  The universal Stream-K GEMM with a broadcast epilogue: a bias vector can be
+  broadcast in the epilogue.
+*/
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_ = ElementC_,
+    /// Operator class tag
+    typename OperatorClass_ = arch::OpClassSimt,
+    /// Tag indicating architecture to tune for.  This is the minimum SM that
+    /// supports the intended feature. The device kernel can be built
+    /// targeting any SM larger than this number.
+    typename ArchTag_ = arch::Sm70,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::InstructionShape,
+    /// Epilogue output operator      - must satisfy concept of 'EpilogueWithBroadcastOp'
+    typename EpilogueOutputOp_ = cutlass::epilogue::thread::LinearCombinationBiasElementwise<
+        ElementC_, ElementAccumulator_, ElementAccumulator_,
+        ElementC_, ElementC_, 128 / cutlass::sizeof_bits<ElementC_>::value>,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle<>,
+    /// Number of stages used in the pipelined mainloop
+    int Stages =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kStages,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentA,
+    /// Access granularity of B matrix in units of elements
+    int AlignmentB =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentB,
+    /// Operation performed by GEMM
+    typename Operator_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::Operator,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA = ComplexTransform::kNone,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB = ComplexTransform::kNone
+>
+class GemmUniversalStreamkWithBroadcast :
+  public GemmUniversalBase<
+    typename kernel::DefaultGemmStreamkWithBroadcast<
+      ElementA_,
+      LayoutA_,
+      TransformA,
+      AlignmentA,
+      ElementB_,
+      LayoutB_,
+      TransformB,
+      AlignmentB,
+      ElementC_,
+      LayoutC_,
+      ElementAccumulator_,
+      OperatorClass_,
+      ArchTag_,
+      ThreadblockShape_,
+      WarpShape_,
+      InstructionShape_,
+      EpilogueOutputOp_,
+      ThreadblockSwizzle_,
+      Stages,
+      Operator_
+    >::GemmKernel
+  > {
+
+ public:
+
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentB = AlignmentB;
+  static int const kAlignmentC = EpilogueOutputOp::kCount;
+  static ComplexTransform const kTransformA = TransformA;
+  static ComplexTransform const kTransformB = TransformB;
+
+  using Base = GemmUniversalBase<
+    typename kernel::DefaultGemmStreamkWithBroadcast<
+      ElementA_,
+      LayoutA_,
+      TransformA,
+      AlignmentA,
+      ElementB_,
+      LayoutB_,
+      TransformB,
+      AlignmentB,
+      ElementC_,
+      LayoutC_,
+      ElementAccumulator_,
+      OperatorClass_,
+      ArchTag_,
+      ThreadblockShape_,
+      WarpShape_,
+      InstructionShape_,
+      EpilogueOutputOp_,
+      ThreadblockSwizzle_,
+      Stages,
+      Operator_
+    >::GemmKernel
+  >;
+
+  using Arguments = typename Base::Arguments;
+  using GemmKernel = typename Base::GemmKernel;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for column-major output exchanges problem size and operand.
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_,
+    /// Operator class tag
+    typename OperatorClass_,
+    /// Tag indicating architecture to tune for.  This is the minimum SM that
+    /// supports the intended feature. The device kernel can be built
+    /// targeting any SM larger than this number.
+    typename ArchTag_,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_,
+    /// Epilogue output operator
+    typename EpilogueOutputOp_,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA,
+    /// Access granularity of B matrix in units of elements
+    int AlignmentB,
+    /// Operation performed by GEMM
+    typename Operator_,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB>
+class GemmUniversalStreamkWithBroadcast<ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_,
+           layout::ColumnMajor,  // partially specialized on LayoutC
+           ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_,
+           WarpShape_, InstructionShape_, EpilogueOutputOp_,
+           ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB,
+           Operator_, TransformA, TransformB> {
+ public:
+
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  using TensorRefA = TensorRef<ElementA const, LayoutA>;
+  using ElementB = ElementB_;
+  using LayoutB = LayoutB_;
+  using TensorRefB = TensorRef<ElementB const, LayoutB>;
+  using ElementC = ElementC_;
+  using LayoutC = layout::ColumnMajor;
+  using TensorRefC = TensorRef<ElementC const, LayoutC>;
+  using TensorRefD = TensorRef<ElementC, LayoutC>;
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentB = AlignmentB;
+  static ComplexTransform const kTransformA = TransformA;
+  static ComplexTransform const kTransformB = TransformB;
+
+  using UnderlyingOperator = typename GemmUniversalStreamkWithBroadcast<
+    ElementB,
+    typename layout::LayoutTranspose<LayoutB>::type,
+    ElementA,
+    typename layout::LayoutTranspose<LayoutA>::type,
+    ElementC,
+    layout::RowMajor,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    kAlignmentB,
+    kAlignmentA,
+    Operator,
+    kTransformB,
+    kTransformA
+  >::Base;
+
+  using GemmKernel = typename UnderlyingOperator::GemmKernel;
+  static int const kAlignmentC = EpilogueOutputOp::kCount;
+
+  /// Argument structure
+  using Arguments = typename UnderlyingOperator::Arguments;
+
+private:
+
+  UnderlyingOperator underlying_operator_;
+
+public:
+
+  /// Constructs the GEMM; `underlying_operator_` is default-initialized.
+  GemmUniversalStreamkWithBroadcast() { }
+
+  /// Helper to construct a transposed equivalent for the underlying GEMM operator
+  static Arguments to_underlying_arguments(Arguments const &args) {
+    return args.transposed_problem();
+  }
+
+  /// Determines whether the underlying operator can execute the transposed problem.
+  static Status can_implement(Arguments const &args) {
+    Arguments const transposed = to_underlying_arguments(args);
+    return UnderlyingOperator::can_implement(transposed);
+  }
+
+  /// Gets the workspace size the underlying operator reports for the transposed problem.
+  static size_t get_workspace_size(Arguments const &args) {
+    Arguments const transposed = to_underlying_arguments(args);
+    return UnderlyingOperator::get_workspace_size(transposed);
+  }
+
+  /// Computes the grid shape the underlying operator uses for the transposed problem
+  static dim3 get_grid_shape(Arguments const &args) {
+    return UnderlyingOperator::get_grid_shape(to_underlying_arguments(args));
+  }
+
+  /// Computes the maximum number of active blocks per multiprocessor (`smem_capacity` is unused).
+  static int maximum_active_blocks([[maybe_unused]] int smem_capacity = -1) {
+    return UnderlyingOperator::maximum_active_blocks();  // base overload takes a CudaHostAdapter*, not an int
+  }
+
+  /// Initializes the underlying operator from the transposed form of `args`.
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+    Arguments const transposed = to_underlying_arguments(args);
+    return underlying_operator_.initialize(transposed, workspace, stream);
+  }
+
+  /// Lightweight update given a subset of arguments (`workspace` is accepted but unused).
+  Status update(Arguments const &args, [[maybe_unused]] void *workspace = nullptr) {
+    // GemmUniversalBase::update() takes only the arguments, so workspace is not forwarded
+    return underlying_operator_.update(to_underlying_arguments(args));
+  }
+
+  /// Runs the underlying operator on `stream` using initialized state.
+  Status run(cudaStream_t stream = nullptr) {
+    Status const run_status = underlying_operator_.run(stream);
+    return run_status;
+  }
+
+  /// Runs the kernel using initialized state; same as run(stream).
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Initializes from `args` and, if that succeeds, runs the kernel.
+  /// Returns the status of initialize() when it fails, otherwise the status of run().
+  Status operator()(
+    Arguments const &args,
+    void *workspace = nullptr,
+    cudaStream_t stream = nullptr) {
+
+    Status const init_status = initialize(args, workspace, stream);
+    if (init_status != Status::kSuccess) {
+      return init_status;
+    }
+
+    return run(stream);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_with_absmax.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_with_absmax.h
new file mode 100755
index 000000000..35f7b5416
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_with_absmax.h
@@ -0,0 +1,404 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Template for a GEMM kernel that computes the absolute maximum of the output tensor
+    and applies additional scaling factors to operands.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/epilogue/thread/linear_combination_bias_elementwise.h"
+#include "cutlass/device_kernel.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/gemm/kernel/gemm_universal.h"
+
+#include "cutlass/gemm/kernel/default_gemm_universal.h"
+#include "cutlass/gemm/kernel/default_gemm_with_absmax.h"
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+#include "cutlass/gemm/device/gemm_universal_base.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Universal GEMM with absolute-maximum calculation and scaling. The primary template is only declared; the definitions are the partial specializations for arch::OpClassTensorOp on arch::Sm89.
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_ = ElementC_,
+    /// Operator class tag
+    typename OperatorClass_ = arch::OpClassTensorOp,
+    /// Tag indicating architecture to tune for.  This is the minimum SM that
+    /// supports the intended feature. The device kernel can be built
+    /// targeting any SM larger than this number.
+    typename ArchTag_ = arch::Sm89,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::InstructionShape,
+    /// Epilogue output operator (defaults to LinearCombinationBiasElementwise)
+    typename EpilogueOutputOp_ = cutlass::epilogue::thread::LinearCombinationBiasElementwise<
+        ElementC_, ElementAccumulator_, ElementAccumulator_,
+        ElementC_, ElementC_, 128 / cutlass::sizeof_bits<ElementC_>::value>,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle<>,
+    /// Number of stages used in the pipelined mainloop
+    int Stages =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kStages,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentA,
+    /// Access granularity of B matrix in units of elements
+    int AlignmentB =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentB,
+    /// Operation performed by GEMM
+    typename Operator_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::Operator,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA = ComplexTransform::kNone,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB = ComplexTransform::kNone
+>
+class GemmUniversalWithAbsMax;
+
+// Partial specialization for arch::OpClassTensorOp on arch::Sm89; derives from GemmUniversalBase over kernel::DefaultGemmWithAbsMax<...>::GemmKernel
+template <
+    typename ElementA_,
+    typename LayoutA_,
+    typename ElementB_,
+    typename LayoutB_,
+    typename ElementC_,
+    typename LayoutC_,
+    typename ElementAccumulator_,
+    typename ThreadblockShape_,
+    typename WarpShape_,
+    typename InstructionShape_,
+    typename EpilogueOutputOp_,
+    typename ThreadblockSwizzle_,
+    int Stages,
+    int AlignmentA,
+    int AlignmentB,
+    typename Operator_,
+    ComplexTransform TransformA,
+    ComplexTransform TransformB
+>
+class GemmUniversalWithAbsMax<
+    ElementA_,
+    LayoutA_,
+    ElementB_,
+    LayoutB_,
+    ElementC_,
+    LayoutC_,
+    ElementAccumulator_,
+    arch::OpClassTensorOp,
+    arch::Sm89,
+    ThreadblockShape_,
+    WarpShape_,
+    InstructionShape_,
+    EpilogueOutputOp_,
+    ThreadblockSwizzle_,
+    Stages,
+    AlignmentA,
+    AlignmentB,
+    Operator_,
+    TransformA,
+    TransformB
+> :
+  public GemmUniversalBase<
+    typename kernel::DefaultGemmWithAbsMax<
+      ElementA_,
+      LayoutA_,
+      TransformA,
+      AlignmentA,
+      ElementB_,
+      LayoutB_,
+      TransformB,
+      AlignmentB,
+      ElementC_,
+      LayoutC_,
+      ElementAccumulator_,
+      arch::OpClassTensorOp,
+      arch::Sm89,
+      ThreadblockShape_,
+      WarpShape_,
+      InstructionShape_,
+      EpilogueOutputOp_,
+      ThreadblockSwizzle_,
+      Stages,
+      Operator_
+    >::GemmKernel
+  > {
+
+ public:
+
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = arch::OpClassTensorOp;
+  using ArchTag = arch::Sm89;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentB = AlignmentB;
+  static int const kAlignmentC = EpilogueOutputOp::kCount;
+  static ComplexTransform const kTransformA = TransformA;
+  static ComplexTransform const kTransformB = TransformB;
+
+  using Base = GemmUniversalBase<
+    typename kernel::DefaultGemmWithAbsMax<
+      ElementA_,
+      LayoutA_,
+      TransformA,
+      AlignmentA,
+      ElementB_,
+      LayoutB_,
+      TransformB,
+      AlignmentB,
+      ElementC_,
+      LayoutC_,
+      ElementAccumulator_,
+      OperatorClass,
+      ArchTag,
+      ThreadblockShape_,
+      WarpShape_,
+      InstructionShape_,
+      EpilogueOutputOp_,
+      ThreadblockSwizzle_,
+      Stages,
+      Operator_
+    >::GemmKernel
+  >;
+
+  using Arguments = typename Base::Arguments;
+  using GemmKernel = typename Base::GemmKernel;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for SM89 with column-major output: wraps the row-major operator with the A/B operands exchanged, their layouts transposed, and the problem transposed.
+template <
+    typename ElementA_,
+    typename LayoutA_,
+    typename ElementB_,
+    typename LayoutB_,
+    typename ElementC_,
+    typename ElementAccumulator_,
+    typename ThreadblockShape_,
+    typename WarpShape_,
+    typename InstructionShape_,
+    typename EpilogueOutputOp_,
+    typename ThreadblockSwizzle_,
+    int Stages,
+    int AlignmentA,
+    int AlignmentB,
+    typename Operator_,
+    ComplexTransform TransformA,
+    ComplexTransform TransformB>
+class GemmUniversalWithAbsMax<ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_,
+           layout::ColumnMajor,  // partially specialized on LayoutC
+           ElementAccumulator_, arch::OpClassTensorOp, arch::Sm89, ThreadblockShape_,
+           WarpShape_, InstructionShape_, EpilogueOutputOp_,
+           ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB,
+           Operator_, TransformA, TransformB> {
+ public:
+
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  using TensorRefA = TensorRef<ElementA const, LayoutA>;
+  using ElementB = ElementB_;
+  using LayoutB = LayoutB_;
+  using TensorRefB = TensorRef<ElementB const, LayoutB>;
+  using ElementC = ElementC_;
+  using LayoutC = layout::ColumnMajor;
+  using TensorRefC = TensorRef<ElementC const, LayoutC>;
+  using TensorRefD = TensorRef<ElementC, LayoutC>;
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = arch::OpClassTensorOp;
+  using ArchTag = arch::Sm89;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentB = AlignmentB;
+  static ComplexTransform const kTransformA = TransformA;
+  static ComplexTransform const kTransformB = TransformB;
+
+  using UnderlyingOperator = typename GemmUniversalWithAbsMax<
+    ElementB,
+    typename layout::LayoutTranspose<LayoutB>::type,
+    ElementA,
+    typename layout::LayoutTranspose<LayoutA>::type,
+    ElementC,
+    layout::RowMajor,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    kAlignmentB,
+    kAlignmentA,
+    Operator,
+    kTransformB,
+    kTransformA
+  >::Base;
+
+  using GemmKernel = typename UnderlyingOperator::GemmKernel;
+  static int const kAlignmentC = EpilogueOutputOp::kCount;
+
+  /// Argument structure (the same type as the underlying row-major operator's Arguments)
+  using Arguments = typename UnderlyingOperator::Arguments;
+
+private:
+
+  UnderlyingOperator underlying_operator_;
+
+public:
+
+  /// Constructs the GEMM.
+  GemmUniversalWithAbsMax() { }
+
+  /// Helper to construct a transposed equivalent for the underlying GEMM operator
+  static Arguments to_underlying_arguments(Arguments const &args) {
+    return args.transposed_problem();
+  }
+
+  /// Determines whether the GEMM can execute the given problem (checked on the transposed arguments).
+  static Status can_implement(Arguments const &args) {
+
+    return UnderlyingOperator::can_implement(to_underlying_arguments(args));
+  }
+
+  /// Gets the workspace size reported by the underlying operator for the transposed arguments
+  static size_t get_workspace_size(Arguments const &args) {
+
+    return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args));
+  }
+
+  /// Computes the grid shape reported by the underlying operator for the transposed arguments
+  static dim3 get_grid_shape(Arguments const &args) {
+    return UnderlyingOperator::get_grid_shape(to_underlying_arguments(args));
+  }
+
+  /// Computes the maximum number of active blocks per multiprocessor by forwarding smem_capacity to UnderlyingOperator
+  static int maximum_active_blocks(int smem_capacity = -1) {
+    return UnderlyingOperator::maximum_active_blocks(smem_capacity);
+  }
+
+  /// Initializes GEMM state from arguments, transposing them for the underlying operator.
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+
+    return underlying_operator_.initialize(to_underlying_arguments(args), workspace, stream);
+  }
+
+  /// Lightweight update given a subset of arguments; args are transposed for the underlying operator
+  Status update(Arguments const &args, void *workspace = nullptr) {
+
+    return underlying_operator_.update(to_underlying_arguments(args), workspace);
+  }
+
+  /// Runs the kernel on the given stream using state held by the underlying operator.
+  Status run(cudaStream_t stream = nullptr) {
+
+    return underlying_operator_.run(stream);
+  }
+
+  /// Function-call form of run(): runs the kernel using initialized state.
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Initializes GEMM state from arguments and, only if that succeeds, runs the kernel.
+  Status operator()(
+    Arguments const &args,
+    void *workspace = nullptr,
+    cudaStream_t stream = nullptr) {
+
+    Status status = initialize(args, workspace, stream);
+
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+
+    return status;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_with_broadcast.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_with_broadcast.h
new file mode 100755
index 000000000..809a504a7
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_with_broadcast.h
@@ -0,0 +1,386 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Template for a GEMM kernel that can broadcast bias vector in the
+           epilogue.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/epilogue/thread/linear_combination_bias_elementwise.h"
+#include "cutlass/device_kernel.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/gemm/kernel/gemm_universal.h"
+
+#include "cutlass/gemm/kernel/default_gemm_universal.h"
+#include "cutlass/gemm/kernel/default_gemm_with_broadcast.h"
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+#include "cutlass/gemm/device/gemm_universal_base.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/*!
+  The universal GEMM with a broadcast epilogue.
+  Derives from GemmUniversalBase over kernel::DefaultGemmWithBroadcast<...>::GemmKernel.
+*/
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_ = ElementC_,
+    /// Operator class tag
+    typename OperatorClass_ = arch::OpClassSimt,
+    /// Tag indicating architecture to tune for.  This is the minimum SM that
+    /// supports the intended feature. The device kernel can be built
+    /// targeting any SM larger than this number.
+    typename ArchTag_ = arch::Sm70,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::InstructionShape,
+    /// Epilogue output operator - must satisfy concept of 'EpilogueWithBroadcastOp'
+    typename EpilogueOutputOp_ = cutlass::epilogue::thread::LinearCombinationBiasElementwise<
+        ElementC_, ElementAccumulator_, ElementAccumulator_,
+        ElementC_, ElementC_, 128 / cutlass::sizeof_bits<ElementC_>::value>,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle<>,
+    /// Number of stages used in the pipelined mainloop
+    int Stages =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kStages,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentA,
+    /// Access granularity of B matrix in units of elements
+    int AlignmentB =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentB,
+    /// Operation performed by GEMM
+    typename Operator_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::Operator,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA = ComplexTransform::kNone,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB = ComplexTransform::kNone
+>
+class GemmUniversalWithBroadcast :
+  public GemmUniversalBase<
+    typename kernel::DefaultGemmWithBroadcast<
+      ElementA_,
+      LayoutA_,
+      TransformA,
+      AlignmentA,
+      ElementB_,
+      LayoutB_,
+      TransformB,
+      AlignmentB,
+      ElementC_,
+      LayoutC_,
+      ElementAccumulator_,
+      OperatorClass_,
+      ArchTag_,
+      ThreadblockShape_,
+      WarpShape_,
+      InstructionShape_,
+      EpilogueOutputOp_,
+      ThreadblockSwizzle_,
+      Stages,
+      Operator_
+    >::GemmKernel
+  > {
+
+ public:
+
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentB = AlignmentB;
+  static int const kAlignmentC = EpilogueOutputOp::kCount;
+  static ComplexTransform const kTransformA = TransformA;
+  static ComplexTransform const kTransformB = TransformB;
+
+  using Base = GemmUniversalBase<
+    typename kernel::DefaultGemmWithBroadcast<
+      ElementA_,
+      LayoutA_,
+      TransformA,
+      AlignmentA,
+      ElementB_,
+      LayoutB_,
+      TransformB,
+      AlignmentB,
+      ElementC_,
+      LayoutC_,
+      ElementAccumulator_,
+      OperatorClass_,
+      ArchTag_,
+      ThreadblockShape_,
+      WarpShape_,
+      InstructionShape_,
+      EpilogueOutputOp_,
+      ThreadblockSwizzle_,
+      Stages,
+      Operator_
+    >::GemmKernel
+  >;
+
+  using Arguments = typename Base::Arguments;
+  using GemmKernel = typename Base::GemmKernel;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for column-major output: wraps the row-major operator with the A/B operands exchanged, their layouts transposed, and the problem transposed.
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_,
+    /// Operator class tag
+    typename OperatorClass_,
+    /// Tag indicating architecture to tune for.  This is the minimum SM that
+    /// supports the intended feature. The device kernel can be built
+    /// targeting any SM larger than this number.
+    typename ArchTag_,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_,
+    /// Epilogue output operator
+    typename EpilogueOutputOp_,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA,
+    /// Access granularity of B matrix in units of elements
+    int AlignmentB,
+    /// Operation performed by GEMM
+    typename Operator_,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB>
+class GemmUniversalWithBroadcast<ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_,
+           layout::ColumnMajor,  // partially specialized on LayoutC
+           ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_,
+           WarpShape_, InstructionShape_, EpilogueOutputOp_,
+           ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB,
+           Operator_, TransformA, TransformB> {
+ public:
+
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  using TensorRefA = TensorRef<ElementA const, LayoutA>;
+  using ElementB = ElementB_;
+  using LayoutB = LayoutB_;
+  using TensorRefB = TensorRef<ElementB const, LayoutB>;
+  using ElementC = ElementC_;
+  using LayoutC = layout::ColumnMajor;
+  using TensorRefC = TensorRef<ElementC const, LayoutC>;
+  using TensorRefD = TensorRef<ElementC, LayoutC>;
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentB = AlignmentB;
+  static ComplexTransform const kTransformA = TransformA;
+  static ComplexTransform const kTransformB = TransformB;
+
+  using UnderlyingOperator = typename GemmUniversalWithBroadcast<
+    ElementB,
+    typename layout::LayoutTranspose<LayoutB>::type,
+    ElementA,
+    typename layout::LayoutTranspose<LayoutA>::type,
+    ElementC,
+    layout::RowMajor,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    kAlignmentB,
+    kAlignmentA,
+    Operator,
+    kTransformB,
+    kTransformA
+  >::Base;
+
+  using GemmKernel = typename UnderlyingOperator::GemmKernel;
+  static int const kAlignmentC = EpilogueOutputOp::kCount;
+
+  /// Argument structure (the same type as the underlying row-major operator's Arguments)
+  using Arguments = typename UnderlyingOperator::Arguments;
+
+private:
+
+  UnderlyingOperator underlying_operator_;
+
+public:
+
+  /// Constructs the GEMM.
+  GemmUniversalWithBroadcast() { }
+
+  /// Helper to construct a transposed equivalent for the underlying GEMM operator
+  static Arguments to_underlying_arguments(Arguments const &args) {
+    return args.transposed_problem();
+  }
+
+  /// Determines whether the GEMM can execute the given problem (checked on the transposed arguments).
+  static Status can_implement(Arguments const &args) {
+
+    return UnderlyingOperator::can_implement(to_underlying_arguments(args));
+  }
+
+  /// Gets the workspace size reported by the underlying operator for the transposed arguments
+  static size_t get_workspace_size(Arguments const &args) {
+
+    return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args));
+  }
+
+  /// Computes the grid shape reported by the underlying operator for the transposed arguments
+  static dim3 get_grid_shape(Arguments const &args) {
+    return UnderlyingOperator::get_grid_shape(to_underlying_arguments(args));
+  }
+
+  /// Computes the maximum number of active blocks per multiprocessor by forwarding smem_capacity to UnderlyingOperator
+  static int maximum_active_blocks(int smem_capacity = -1) {
+    return UnderlyingOperator::maximum_active_blocks(smem_capacity);
+  }
+
+  /// Initializes GEMM state from arguments, transposing them for the underlying operator.
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+
+    return underlying_operator_.initialize(to_underlying_arguments(args), workspace, stream);
+  }
+
+  /// Lightweight update given a subset of arguments; args are transposed for the underlying operator
+  Status update(Arguments const &args, void *workspace = nullptr) {
+
+    return underlying_operator_.update(to_underlying_arguments(args), workspace);
+  }
+
+  /// Runs the kernel on the given stream using state held by the underlying operator.
+  Status run(cudaStream_t stream = nullptr) {
+
+    return underlying_operator_.run(stream);
+  }
+
+  /// Function-call form of run(): runs the kernel using initialized state.
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Initializes GEMM state from arguments and, only if that succeeds, runs the kernel.
+  Status operator()(
+    Arguments const &args,
+    void *workspace = nullptr,
+    cudaStream_t stream = nullptr) {
+
+    Status status = initialize(args, workspace, stream);
+
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+
+    return status;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_with_k_reduction.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_with_k_reduction.h
new file mode 100755
index 000000000..b25ae6a36
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_with_k_reduction.h
@@ -0,0 +1,415 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a GEMM kernel that can reduce one of the input matrix
+    into a vector along the K dimension.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/device_kernel.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/gemm/kernel/gemm_with_k_reduction.h"
+
+#include "cutlass/gemm/kernel/default_gemm_with_k_reduction.h"
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+#include "cutlass/gemm/device/gemm_universal_base.h"
+
+#include "cutlass/layout/permute.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/*!
+  Device-level GEMM whose kernel can also reduce one input operand (A or B, chosen by
+  ReduceKForA_) along K. It derives from GemmUniversalBase of the kernel selected below.
+*/
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_ = ElementC_,
+    /// Operator class tag
+    typename OperatorClass_ = arch::OpClassSimt,
+    /// Reduce A or B operand along the K dimension
+    bool ReduceKForA_ = true,
+    /// Tag indicating architecture to tune for.  This is the minimum SM that
+    /// supports the intended feature. The device kernel can be built
+    /// targeting any SM larger than this number.
+    typename ArchTag_ = arch::Sm70,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle<>,
+    /// Number of stages used in the pipelined mainloop
+    int Stages =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kStages,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentA,
+    /// Access granularity of B matrix in units of elements
+    int AlignmentB =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentB,
+    /// Operation performed by GEMM
+    typename Operator_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::Operator,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA = ComplexTransform::kNone,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB = ComplexTransform::kNone,
+    /// Gather operand A by using an index array (accepted, but not forwarded to the kernel selected below)
+    bool GatherA = false,
+    /// Gather operand B by using an index array (accepted, but not forwarded to the kernel selected below)
+    bool GatherB = false,
+    /// Scatter result D by using an index array (accepted, but not forwarded to the kernel selected below)
+    bool ScatterD = false,
+    /// Permute result D (accepted, but not forwarded to the kernel selected below)
+    typename PermuteDLayout = layout::NoPermute
+>
+class GemmWithKReduction : 
+  public GemmUniversalBase<
+    typename kernel::DefaultGemmWithKReduction<
+      ElementA_,
+      LayoutA_,
+      TransformA,
+      AlignmentA,
+      ElementB_,
+      LayoutB_,
+      TransformB,
+      AlignmentB,
+      ElementC_,
+      LayoutC_,
+      ElementAccumulator_,
+      OperatorClass_,
+      ReduceKForA_,
+      ArchTag_,
+      ThreadblockShape_,
+      WarpShape_,
+      InstructionShape_,
+      EpilogueOutputOp_,
+      ThreadblockSwizzle_,
+      Stages,
+      Operator_,
+      SharedMemoryClearOption::kNone
+    >::GemmKernel
+  > {
+
+ public:
+  // Member aliases and constants that re-export the template arguments.
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  static constexpr int kStages = Stages;
+  static constexpr int kAlignmentA = AlignmentA;
+  static constexpr int kAlignmentB = AlignmentB;
+  static constexpr int kAlignmentC = EpilogueOutputOp::kCount;
+  static constexpr ComplexTransform kTransformA = TransformA;
+  static constexpr ComplexTransform kTransformB = TransformB;
+  // Same GemmUniversalBase instantiation as in the base-class list, named so that members can refer to it.
+  using Base = GemmUniversalBase<
+    typename kernel::DefaultGemmWithKReduction<
+      ElementA_,
+      LayoutA_,
+      TransformA,
+      AlignmentA,
+      ElementB_,
+      LayoutB_,
+      TransformB,
+      AlignmentB,
+      ElementC_,
+      LayoutC_,
+      ElementAccumulator_,
+      OperatorClass_,
+      ReduceKForA_,
+      ArchTag_,
+      ThreadblockShape_,
+      WarpShape_,
+      InstructionShape_,
+      EpilogueOutputOp_,
+      ThreadblockSwizzle_,
+      Stages,
+      Operator_,
+      SharedMemoryClearOption::kNone
+    >::GemmKernel
+  >;
+
+  using Arguments = typename Base::Arguments;
+  using GemmKernel = typename Base::GemmKernel;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for column-major output: swaps the A and B operands and transposes the problem so a row-major operator does the work.
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_,
+    /// Operator class tag
+    typename OperatorClass_,
+    /// Reduce A or B operand along the K dimension
+    bool ReduceKForA_,
+    /// Tag indicating architecture to tune for.  This is the minimum SM that
+    /// supports the intended feature. The device kernel can be built
+    /// targeting any SM larger than this number.
+    typename ArchTag_,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_,
+    /// Epilogue output operator
+    typename EpilogueOutputOp_,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA,
+    /// Access granularity of B matrix in units of elements
+    int AlignmentB,
+    /// Operation performed by GEMM
+    typename Operator_,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB,
+    /// Gather operand A by using an index array
+    bool GatherA,
+    /// Gather operand B by using an index array
+    bool GatherB,
+    /// Scatter result D by using an index array
+    bool ScatterD,
+    /// Permute result D
+    typename PermuteDLayout
+>
+class GemmWithKReduction<ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_,
+           layout::ColumnMajor,  // partially specialized on LayoutC
+           ElementAccumulator_, OperatorClass_, ReduceKForA_, ArchTag_, ThreadblockShape_,
+           WarpShape_, InstructionShape_, EpilogueOutputOp_,
+           ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB,
+           Operator_, TransformA, TransformB, GatherA, GatherB, ScatterD, PermuteDLayout> {
+ public:
+  // Member aliases and constants that re-export the template arguments; LayoutC is fixed to column-major.
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  using TensorRefA = TensorRef<ElementA const, LayoutA>;
+  using ElementB = ElementB_;
+  using LayoutB = LayoutB_;
+  using TensorRefB = TensorRef<ElementB const, LayoutB>;
+  using ElementC = ElementC_;
+  using LayoutC = layout::ColumnMajor;
+  using TensorRefC = TensorRef<ElementC const, LayoutC>;
+  using TensorRefD = TensorRef<ElementC, LayoutC>;
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentB = AlignmentB;
+  static ComplexTransform const kTransformA = TransformA;
+  static ComplexTransform const kTransformB = TransformB;
+  // Base of the row-major instantiation: A/B elements, transposed layouts, alignments, transforms and gather flags trade places; ReduceKForA_ is negated.
+  using UnderlyingOperator = typename GemmWithKReduction< 
+    ElementB,
+    typename layout::LayoutTranspose<LayoutB>::type,
+    ElementA,
+    typename layout::LayoutTranspose<LayoutA>::type,
+    ElementC,
+    layout::RowMajor,    
+    ElementAccumulator,
+    OperatorClass,
+    !ReduceKForA_,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    kAlignmentB,
+    kAlignmentA,
+    Operator,
+    kTransformB,
+    kTransformA,
+    GatherB,
+    GatherA,
+    ScatterD,
+    PermuteDLayout
+  >::Base;
+
+  using GemmKernel = typename UnderlyingOperator::GemmKernel;
+  static int const kAlignmentC = EpilogueOutputOp::kCount;
+
+  /// Argument structure (the same type the underlying operator uses)
+  using Arguments = typename UnderlyingOperator::Arguments;
+
+private:
+  // The operator that every non-static member function below delegates to.
+  UnderlyingOperator underlying_operator_;
+
+public:
+
+  /// Constructs the GEMM.
+  GemmWithKReduction() = default;
+
+  /// Helper to construct a transposed equivalent for the underlying GEMM operator
+  static Arguments to_underlying_arguments(Arguments const &args) {
+    return args.transposed_problem();
+  }
+
+  /// Determines whether the GEMM can execute the given problem.
+  static Status can_implement(Arguments const &args) {
+    // The check is made on the transposed arguments, i.e. on what will actually be launched.
+    return UnderlyingOperator::can_implement(to_underlying_arguments(args));
+  }
+
+  /// Gets the workspace size
+  static size_t get_workspace_size(Arguments const &args) {
+    // Computed by the underlying operator for the transposed arguments.
+    return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args));
+  }
+
+  /// Computes the grid shape
+  static dim3 get_grid_shape(Arguments const &args) { 
+    return UnderlyingOperator::get_grid_shape(to_underlying_arguments(args));
+  }
+
+  /// Computes the maximum number of active blocks per multiprocessor
+  static int maximum_active_blocks(int smem_capacity = -1) {
+    return UnderlyingOperator::maximum_active_blocks(smem_capacity);
+  }
+
+  /// Initializes GEMM state from arguments.
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+    // Arguments are transposed first; workspace and stream are forwarded unchanged.
+    return underlying_operator_.initialize(to_underlying_arguments(args), workspace, stream);
+  }
+
+  /// Lightweight update given a subset of arguments
+  Status update(Arguments const &args, void *workspace = nullptr) {
+    // Arguments are transposed first; workspace is forwarded unchanged.
+    return underlying_operator_.update(to_underlying_arguments(args), workspace);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status run(cudaStream_t stream = nullptr) {
+
+    return underlying_operator_.run(stream);
+  }
+
+  /// Call-operator shorthand for run(stream).
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Initializes from `args` and, only if that succeeds, runs the kernel on `stream`.
+  Status operator()(
+    Arguments const &args, 
+    void *workspace = nullptr, 
+    cudaStream_t stream = nullptr) {
+    
+    Status status = initialize(args, workspace, stream);
+    
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+
+    return status;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemv.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemv.h
new file mode 100755
index 000000000..5e181743e
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemv.h
@@ -0,0 +1,182 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Device-level GEMV operator: stores kernel parameters and launches the GEMV kernel.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/device_kernel.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/gemm/kernel/gemm_universal.h"
+
+#include "cutlass/gemm/kernel/default_gemm_universal.h"
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+#include "cutlass/gemm/device/gemm_universal_base.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Device-level launcher for a GEMV kernel: it stores the kernel Params, derives the
+/// launch grid and block from them, and launches cutlass::Kernel<GemvKernel>.
+template <typename GemvKernel_>
+class Gemv {
+public:
+
+  using GemvKernel = GemvKernel_;
+  using ElementA = typename GemvKernel::ElementA;
+  using LayoutA  = typename GemvKernel::LayoutA;
+  using ElementB = typename GemvKernel::ElementB;
+  using ElementC = typename GemvKernel::ElementC;
+
+  using ElementAccumulator = typename GemvKernel::ElementAccumulator;
+  using EpilogueOutputOp = typename GemvKernel::EpilogueOutputOp;
+
+  static ComplexTransform const kTransformA = GemvKernel::kTransformA;
+  static ComplexTransform const kTransformB = GemvKernel::kTransformB;
+
+  static int const kThreadCount = GemvKernel::kThreadCount;
+  static int const kThreadsPerRow = GemvKernel::kThreadsPerRow;
+
+  using Arguments = typename GemvKernel::Arguments;
+  using Params = typename GemvKernel::Params;
+
+private:
+  // Kernel parameters written by initialize()/update() and consumed by run().
+  Params params_;
+
+public:
+
+  /// Constructs the Gemv.
+  Gemv() { }
+
+  /// Determines whether the Gemv can execute the given problem.
+  static Status can_implement(Arguments const &args) {
+
+    return GemvKernel::can_implement(args);
+  }
+
+  /// Gets the workspace size, which is always zero for this operator.
+  static size_t get_workspace_size(Arguments const &args) {
+    // `args` is not inspected; initialize() and update() never touch their workspace pointer.
+    return 0;
+  }
+
+  /// Computes the grid shape: x covers the rows in steps of the block's row extent, z covers the batches.
+  static dim3 get_grid_shape(Arguments const &args, dim3 const &block) {
+    // Rows are tiled by block.x for column-major A and by block.y for any other layout.
+    unsigned const rows_per_block = platform::is_same<LayoutA, layout::ColumnMajor>::value ? block.x : block.y;
+    // Clamp grid.z to CUDA's 65535 limit; a plain `batch_count % 65536` would give grid.z == 0 (an invalid launch)
+    // for every multiple of 65536. NOTE(review): batches beyond grid.z rely on the kernel striding by gridDim.z - confirm.
+    int const grid_z = args.batch_count < 65535 ? args.batch_count : 65535;
+    return dim3((args.problem_size.row() + (rows_per_block - 1)) / rows_per_block, 1, grid_z);
+  }
+
+  /// Computes the block shape: kThreadCount x 1 for column-major A, else kThreadsPerRow x (kThreadCount / kThreadsPerRow).
+  static dim3 get_block_shape() { 
+    if(platform::is_same<LayoutA, layout::ColumnMajor>::value) {
+      return dim3(kThreadCount, 1, 1);
+    }
+    else {
+      return dim3(kThreadsPerRow, kThreadCount / kThreadsPerRow, 1);
+    }
+  }
+
+  /// Initializes Gemv state from arguments; `workspace` and `stream` are unused and the call always succeeds.
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+    params_ = Params(args);
+    return Status::kSuccess;
+  }
+
+  /// Lightweight update given a subset of arguments; `workspace` is unused.
+  Status update(Arguments const &args, void *workspace = nullptr) {
+    return params_.update(args);    
+  }
+
+  /// Launches the kernel on `stream` using the state stored by initialize()/update().
+  Status run(cudaStream_t stream = nullptr) {
+    // NOTE(review): params_ (Params) is passed where get_grid_shape() takes Arguments; this needs the two to be compatible - confirm in the kernel.
+    dim3 block = get_block_shape();
+    dim3 grid = get_grid_shape(params_, block);
+
+    int smem_size = int(sizeof(typename GemvKernel::SharedStorage));
+    
+    // Launch
+    cutlass::arch::synclog_setup();
+    cutlass::Kernel<GemvKernel><<<grid, block, smem_size, stream>>>(params_);
+
+    //
+    // Query for errors
+    //
+    cudaError_t result = cudaGetLastError();
+
+    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
+  }
+
+  /// Call-operator shorthand for run(stream).
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Initializes from `args` and, only if that succeeds, runs the kernel on `stream`.
+  Status operator()(
+    Arguments const &args, 
+    void *workspace = nullptr, 
+    cudaStream_t stream = nullptr) {
+    
+    Status status = initialize(args, workspace, stream);
+    
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+
+    return status;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/rank_2k.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/rank_2k.h
new file mode 100755
index 000000000..296f38cad
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/device/rank_2k.h
@@ -0,0 +1,548 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a pipelined Rank2K kernel. Does not compute batching or support split-K.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/device_kernel.h"
+
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/gemm/kernel/rank_2k_universal.h"
+
+#include "cutlass/gemm/kernel/default_rank_2k_universal.h"
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_ = ElementC_,
+    /// Operator class tag
+    typename OperatorClass_ = arch::OpClassTensorOp,
+    /// Tag indicating architecture to tune for
+    typename ArchTag_ = arch::Sm80,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementA_, ElementC_,
+        ElementAccumulator_>::ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementA_, ElementC_,
+        ElementAccumulator_>::WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementA_, ElementC_,
+        ElementAccumulator_>::InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementA_, ElementC_,
+        ElementAccumulator_>::EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_ =
+        typename threadblock::GemmIdentityThreadblockSwizzle<>,
+    /// Number of stages used in the pipelined mainloop
+    int Stages =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementA_,
+                                 ElementC_, ElementAccumulator_>::kStages,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementA_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentA,
+
+    /// Access granularity of B matrix in units of elements
+    int AlignmentB =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementB_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentB,
+    /// If true, kernel supports split-K with serial reduction
+    bool SplitKSerial = false,
+    /// Operation performed by SYRK
+    typename Operator_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::Operator,
+    /// Complex elementwise transformation 
+    ComplexTransform TransformA = ComplexTransform::kNone,
+    /// Complex elementwise transformation 
+    ComplexTransform TransformB = ComplexTransform::kNone,
+    /// Blas3 computation mode (symmetric/hermitian)
+    BlasMode BlasMode_ = BlasMode::kSymmetric>
+class Rank2K {
+ public:
+
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  using ElementB = ElementB_;
+  using LayoutB = LayoutB_;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  static FillMode const kFillModeC = FillModeC;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentB = AlignmentB;
+  static int const kAlignmentC = EpilogueOutputOp::kCount;
+  static bool const kSplitKSerial = SplitKSerial;
+  static ComplexTransform const kTransformA = TransformA;
+  static ComplexTransform const kTransformB = TransformB;
+  static BlasMode const kBlasMode = BlasMode_;
+  static int const kUpdateRank = 2;
+
+  // static asserts for rank 2k update kernel
+  static_assert(platform::is_same<LayoutA, LayoutB>::value,
+    "Rank 2K update operator support same layouts for operandA and B");
+
+  /// Define the kernel
+  using Rank2Kkernel = typename kernel::DefaultRank2KUniversal<
+    ElementA,
+    LayoutA,
+    kTransformA,
+    kAlignmentA,
+    ElementB,
+    LayoutB,
+    kTransformB,
+    kAlignmentB,
+    ElementC,
+    LayoutC,
+    kFillModeC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    kStages,
+    kSplitKSerial,
+    Operator,
+    kBlasMode
+  >::Rank2Kkernel;
+  
+  using Arguments = typename Rank2Kkernel::Arguments;
+
+private:
+
+  /// Kernel parameters object
+  typename Rank2Kkernel::Params params_;
+public:
+
+  /// Constructs the Rank2K operator; the empty body sets up no kernel parameters (initialize() does that).
+  Rank2K() { }
+
+  /// Reports whether this Rank2K configuration can execute the problem in `args`.
+  ///
+  /// Returns kErrorInvalidProblem when batch_count > 1 without serial split-K or
+  /// when FillModeC is neither kLower nor kUpper; otherwise returns the verdict
+  /// of Rank2Kkernel::can_implement().
+  static Status can_implement(Arguments const &args) {
+
+    bool const batched_without_split_k = (args.batch_count > 1) && !kSplitKSerial;
+    if (batched_without_split_k) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    // The kernel is queried before the fill mode is inspected.
+    Status const kernel_status = Rank2Kkernel::can_implement(args);
+
+    bool const fill_mode_ok = (FillModeC == FillMode::kLower) || (FillModeC == FillMode::kUpper);
+
+    return fill_mode_ok ? kernel_status : Status::kErrorInvalidProblem;
+  }
+
+  /// Returns the number of workspace bytes required for `args`.
+  ///
+  /// Serial split-K with batch_count > 1 needs one int per threadblock tile of
+  /// the M x N tiling; every other configuration needs no workspace.
+  static size_t get_workspace_size(Arguments const &args) {
+
+    // Tile the problem with the threadblock shape to learn the grid extent.
+    ThreadblockSwizzle swizzle;
+    cutlass::gemm::GemmCoord tiles = swizzle.get_tiled_shape(
+      args.problem_size,
+      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+      args.batch_count);
+
+    if (!(kSplitKSerial && args.batch_count > 1)) {
+      return 0;
+    }
+
+    size_t const tile_count = size_t(tiles.m()) * size_t(tiles.n());
+    return sizeof(int) * tile_count;
+  }
+
+  /// Prepares the operator for launch: validates the batching mode, zeroes the
+  /// split-K workspace when one is needed, builds the kernel Params and, for
+  /// kernels with >= 48 KiB of shared storage, raises the dynamic shared memory limit.
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+
+    // Determine grid shape
+    ThreadblockSwizzle swizzle;
+
+    cutlass::gemm::GemmCoord tiled_grid = swizzle.get_tiled_shape(
+      args.problem_size,
+      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+      args.batch_count);
+
+    if (args.batch_count > 1) {
+
+      // Without serial split-K only a single batch is supported.
+      if (!kSplitKSerial) {
+        return Status::kErrorInvalidProblem;
+      }
+
+      if (!workspace) {
+        return Status::kErrorWorkspaceNull;
+      }
+
+      // Zero the workspace asynchronously on the caller's stream.
+      size_t const workspace_bytes = get_workspace_size(args);
+
+      if (cudaMemsetAsync(workspace, 0, workspace_bytes, stream) != cudaSuccess) {
+        return Status::kErrorInternal;
+      }
+    }
+
+    // The K extent handed to the kernel is the full problem K.
+    int full_k = args.problem_size.k();
+
+    // Initialize the Params structure
+    params_ = typename Rank2Kkernel::Params{
+      args,
+      tiled_grid,
+      full_k,
+      static_cast<int *>(workspace)
+    };
+
+    int const shared_bytes = int(sizeof(typename Rank2Kkernel::SharedStorage));
+
+    // Only kernels needing 48 KiB or more request a larger dynamic shared memory size.
+    if (shared_bytes < (48 << 10)) {
+      return Status::kSuccess;
+    }
+
+    cudaError_t const attr_result = cudaFuncSetAttribute(
+      Kernel<Rank2Kkernel>,
+      cudaFuncAttributeMaxDynamicSharedMemorySize,
+      shared_bytes);
+
+    return attr_result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
+  }
+
+  /// Lightweight refresh of the stored Params from `args` / `workspace`;
+  /// unlike initialize(), the workspace is not cleared here.
+  ///
+  /// Returns Status::kErrorWorkspaceNull when a workspace is required but
+  /// `workspace` is null, Status::kSuccess otherwise.
+  Status update(Arguments const &args, void *workspace = nullptr) {
+
+    if (!workspace) {
+      // A workspace is mandatory for kSplitKSerial with batch_count > 1, and
+      // whenever get_workspace_size() reports a non-zero requirement.
+      bool const split_k_in_use = kSplitKSerial && args.batch_count > 1;
+      if (split_k_in_use || get_workspace_size(args) != 0) {
+        return Status::kErrorWorkspaceNull;
+      }
+    }
+
+    params_.update(args, workspace);
+    return Status::kSuccess;
+  }
+
+  /// Launches the kernel on `stream` with the Params stored by initialize()
+  /// or update(); returns Status::kErrorInternal if cudaGetLastError() reports
+  /// an error after the launch.
+  Status run(cudaStream_t stream = nullptr) {
+    // Launch geometry: grid from the stored tile shape, 1-D thread block.
+    ThreadblockSwizzle swizzle;
+    dim3 const grid_dims = swizzle.get_grid_shape(params_.grid_tiled_shape);
+    dim3 const block_dims(Rank2Kkernel::kThreadCount, 1, 1);
+    int const smem_bytes = int(sizeof(typename Rank2Kkernel::SharedStorage));
+
+    cutlass::arch::synclog_setup();
+    cutlass::Kernel<Rank2Kkernel><<<grid_dims, block_dims, smem_bytes, stream>>>(params_);
+
+    if (cudaGetLastError() != cudaSuccess) {
+      return Status::kErrorInternal;
+    }
+    return Status::kSuccess;
+  }
+
+  /// Launches the kernel on `stream` using the state stored by a previous
+  /// initialize() / update() call.
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Initializes from `args` / `workspace` and, on success, launches the kernel.
+  /// Both steps are issued on `stream`; a failed initialize() is returned as is.
+  Status operator()(
+    Arguments const &args,
+    void *workspace = nullptr,
+    cudaStream_t stream = nullptr) {
+
+    // Forward `stream` so the workspace cudaMemsetAsync issued by initialize()
+    // is enqueued on the same stream as the kernel launch.
+    Status status = initialize(args, workspace, stream);
+
+    return status == Status::kSuccess ? run(stream) : status;
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for column-major C: wraps the row-major Rank2K with A and B exchanged.
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_,
+    /// Operator class tag
+    typename OperatorClass_,
+    /// Tag indicating architecture to tune for.  This is the minimum SM that
+    /// supports the intended feature. The device kernel can be built
+    /// targeting any SM larger than this number.
+    typename ArchTag_,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_,
+    /// Epilogue output operator
+    typename EpilogueOutputOp_,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA,
+    /// Access granularity of B matrix in units of elements
+    int AlignmentB,
+    /// If true, kernel supports split-K with serial reduction
+    bool SplitKSerial,
+    /// Operation performed by Rank2K update kernel
+    typename Operator_,
+    /// Complex elementwise transformation for operand A
+    ComplexTransform TransformA,
+    /// Complex elementwise transformation for operand B
+    ComplexTransform TransformB,
+    /// Blas3 computation mode (symmetric/hermitian)
+    BlasMode BlasMode_
+    >
+class Rank2K<ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_,
+           layout::ColumnMajor,  // partially specialized on LayoutC
+           FillModeC, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_,
+           WarpShape_, InstructionShape_, EpilogueOutputOp_,
+           ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB,
+           SplitKSerial, Operator_, TransformA, TransformB, BlasMode_> {
+ public:
+
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  using ElementB = ElementB_;
+  using LayoutB = LayoutB_;
+  using ElementC = ElementC_;
+  using LayoutC = layout::ColumnMajor;
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  static FillMode const kFillModeC = FillModeC;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentB = AlignmentB;
+  static int const kAlignmentC = EpilogueOutputOp::kCount;
+  static bool const kSplitKSerial = SplitKSerial;
+  static BlasMode const kBlasMode = BlasMode_;
+  static ComplexTransform const kTransformA = TransformA;
+  static ComplexTransform const kTransformB = TransformB;
+  static int const kUpdateRank = 2;
+  
+  /// Row-major Rank2K that does the work: A/B element, layout and alignment are swapped, fill mode inverted
+  using UnderlyingOperator = typename cutlass::gemm::device::Rank2K<
+    ElementB,
+    LayoutB,
+    ElementA,
+    LayoutA,
+    ElementC,
+    layout::RowMajor,
+    InvertFillMode<FillModeC>::mode,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    kStages,
+    kAlignmentB,
+    kAlignmentA,
+    kSplitKSerial,
+    Operator,
+    kTransformA,
+    kTransformB,
+    kBlasMode
+  >;
+  // NOTE(review): kTransformA / kTransformB are forwarded unswapped, unlike the A/B types above - confirm intended.
+
+  /// Argument structure and kernel type, both taken from the underlying operator
+  using Arguments = typename UnderlyingOperator::Arguments;
+  using Rank2Kkernel = typename UnderlyingOperator::Rank2Kkernel;
+
+private:
+
+  UnderlyingOperator underlying_operator_;
+
+public:
+
+  /// Constructs the Rank2K.
+  Rank2K() { }
+
+  /// Helper to construct a transposed equivalent for the underlying Rank2K operator
+  static Arguments to_underlying_arguments(Arguments const &args) {
+    return args.transposed_problem();
+  }
+
+  /// Determines whether the Rank2K can execute the given problem (checked on the transposed arguments).
+  static Status can_implement(Arguments const &args) {
+
+    return UnderlyingOperator::can_implement(to_underlying_arguments(args));
+  }
+
+  /// Gets the workspace size reported by the underlying operator for the transposed arguments
+  static size_t get_workspace_size(Arguments const &args) {
+    
+    return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args));
+  }
+
+  /// Computes the grid shape (forwards to UnderlyingOperator::get_grid_shape)
+  static dim3 get_grid_shape(Arguments const &args) { 
+    return UnderlyingOperator::get_grid_shape(to_underlying_arguments(args));
+  }
+
+  /// Computes the maximum number of active blocks per multiprocessor
+  static int maximum_active_blocks(int smem_capacity = -1) {
+    return UnderlyingOperator::maximum_active_blocks(smem_capacity);
+  }
+
+  /// Initializes Rank2K state from arguments; `workspace` and `stream` are passed through unchanged.
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+
+    return underlying_operator_.initialize(to_underlying_arguments(args), workspace, stream);
+  }
+
+  /// Lightweight update given a subset of arguments
+  Status update(Arguments const &args, void *workspace = nullptr) {
+
+    return underlying_operator_.update(to_underlying_arguments(args), workspace);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status run(cudaStream_t stream = nullptr) {
+
+    return underlying_operator_.run(stream);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Initializes from `args` / `workspace` on `stream`, then runs the kernel if that succeeded.
+  Status operator()(
+    Arguments const &args, 
+    void *workspace = nullptr, 
+    cudaStream_t stream = nullptr) {
+    
+    Status status = initialize(args, workspace, stream);
+    
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+
+    return status;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/rank_2k_grouped.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/rank_2k_grouped.h
new file mode 100755
index 000000000..6cbebc5d7
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/device/rank_2k_grouped.h
@@ -0,0 +1,63 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*!
+  \file
+  \brief Device-level grouped Rank2K.
+*/
+
+#pragma once
+
+#include "cutlass/gemm/device/base_grouped.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Grouped Rank2K: BaseGrouped plus the kernel alias and its fill / BLAS mode constants.
+template <typename Rank2Kkernel_>
+class Rank2KGrouped : public BaseGrouped<Rank2Kkernel_> {
+public:
+  typedef Rank2Kkernel_ Rank2Kkernel;
+  static cutlass::FillMode const kFillModeC = Rank2Kkernel_::kFillModeC;
+  static cutlass::BlasMode const kBlasMode = Rank2Kkernel_::kBlasMode;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/rank_k.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/rank_k.h
new file mode 100755
index 000000000..ae18a11b8
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/device/rank_k.h
@@ -0,0 +1,510 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a pipelined RankK kernel. Does not compute batching or support split-K.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/device_kernel.h"
+
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/gemm/kernel/rank_k_universal.h"
+
+#include "cutlass/gemm/kernel/default_rank_k_universal.h"
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Device-level rank-k update operator (kUpdateRank = 1). Builds its kernel from
+/// kernel::DefaultRankKUniversal and exposes can_implement / get_workspace_size /
+/// initialize / update / run plus call operators that combine them.
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_ = ElementC_,
+    /// Operator class tag
+    typename OperatorClass_ = arch::OpClassTensorOp,
+    /// Tag indicating architecture to tune for
+    typename ArchTag_ = arch::Sm80,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementA_, ElementC_,
+        ElementAccumulator_>::ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementA_, ElementC_,
+        ElementAccumulator_>::WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementA_, ElementC_,
+        ElementAccumulator_>::InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementA_, ElementC_,
+        ElementAccumulator_>::EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_ =
+        typename threadblock::GemmIdentityThreadblockSwizzle<>,
+    /// Number of stages used in the pipelined mainloop
+    int Stages =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementA_,
+                                 ElementC_, ElementAccumulator_>::kStages,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementA_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentA,
+    /// If true, kernel supports split-K with serial reduction
+    bool SplitKSerial = false,
+    /// Operation performed by the RankK update kernel
+    typename Operator_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementA_, ElementC_,
+        ElementAccumulator_>::Operator,
+    /// Complex elementwise transformation for operand A
+    ComplexTransform TransformA = ComplexTransform::kNone,
+    /// Blas3 computation mode (symmetric/hermitian)
+    BlasMode BlasMode_ = BlasMode::kSymmetric>
+class RankK {
+ public:
+
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  static FillMode const kFillModeC = FillModeC;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentC = EpilogueOutputOp::kCount;
+  static bool const kSplitKSerial = SplitKSerial;
+  static ComplexTransform const kTransformA = TransformA;
+  static BlasMode const kBlasMode = BlasMode_;
+  static int const kUpdateRank = 1;
+
+  /// Define the kernel
+  using RankKkernel = typename kernel::DefaultRankKUniversal<
+    ElementA,
+    LayoutA,
+    kTransformA,
+    kAlignmentA,
+    ElementC,
+    LayoutC,
+    kFillModeC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    kStages,
+    kSplitKSerial,
+    Operator,
+    kBlasMode
+  >::RankKkernel;
+
+  using Arguments = typename RankKkernel::Arguments;
+
+private:
+
+  /// Kernel parameters object
+  typename RankKkernel::Params params_;
+public:
+
+  /// Constructs the RankK.
+  RankK() { }
+
+  /// Determines whether the RankK can execute the given problem.
+  static Status can_implement(Arguments const &args) {
+
+    if (!kSplitKSerial && args.batch_count > 1) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    Status status = RankKkernel::can_implement(args);
+
+    if (FillModeC != FillMode::kLower && FillModeC != FillMode::kUpper) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    return Status::kSuccess;
+  }
+
+  /// Gets the workspace size: non-zero only for kSplitKSerial with batch_count > 1
+  static size_t get_workspace_size(Arguments const &args) {
+    size_t bytes = 0;
+
+    // Determine grid shape
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord tiled_shape = threadblock_swizzle.get_tiled_shape(
+      args.problem_size,
+      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+      args.batch_count);
+
+    if (kSplitKSerial && args.batch_count > 1) {
+      // One int per (m, n) entry of the tiled grid shape
+      bytes += sizeof(int) * size_t(tiled_shape.m()) * size_t(tiled_shape.n());
+    }
+
+    return bytes;
+  }
+
+  /// Initializes RankK state from arguments; any workspace clear is issued on `stream`.
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+    // Determine grid shape
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
+      args.problem_size,
+      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+      args.batch_count);
+
+    if (kSplitKSerial) {
+      if (args.batch_count > 1) {
+        if (!workspace) {
+          return Status::kErrorWorkspaceNull;
+        }
+
+        size_t bytes = get_workspace_size(args);
+
+        cudaError_t result = cudaMemsetAsync(workspace, 0, bytes, stream);
+
+        if (result != cudaSuccess) {
+          return Status::kErrorInternal;
+        }
+      }
+    }
+    else {
+
+      if (args.batch_count > 1) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+
+    int gemm_k_size = args.problem_size.k();
+
+    // Initialize the Params structure
+    params_ = typename RankKkernel::Params{
+      args,
+      grid_tiled_shape,
+      gemm_k_size,
+      static_cast<int *>(workspace)
+    };
+
+    int smem_size = int(sizeof(typename RankKkernel::SharedStorage));
+    // At 48 KiB of shared storage or more, raise the kernel's dynamic shared-memory limit
+    if (smem_size >= (48 << 10)) {
+      cudaError_t result = cudaFuncSetAttribute(Kernel<RankKkernel>,
+                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                    smem_size);
+
+      if (result != cudaSuccess) {
+        return Status::kErrorInternal;
+      }
+    }
+
+    return Status::kSuccess;
+  }
+
+  /// Lightweight update given a subset of arguments (the workspace is not cleared here)
+  Status update(Arguments const &args, void *workspace = nullptr) {
+    if (kSplitKSerial && args.batch_count > 1) {
+      if (!workspace) {
+        return Status::kErrorWorkspaceNull;
+      }
+    }
+
+    size_t workspace_bytes = get_workspace_size(args);
+
+    if (workspace_bytes && !workspace) {
+      return Status::kErrorWorkspaceNull;
+    }
+
+    params_.update(args, workspace);
+
+    return Status::kSuccess;
+  }
+
+  /// Runs the kernel using initialized state.
+  Status run(cudaStream_t stream = nullptr) {
+
+    ThreadblockSwizzle threadblock_swizzle;
+
+    dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
+    dim3 block(RankKkernel::kThreadCount, 1, 1);
+
+    int smem_size = int(sizeof(typename RankKkernel::SharedStorage));
+
+    cutlass::arch::synclog_setup();
+    cutlass::Kernel<RankKkernel><<<grid, block, smem_size, stream>>>(params_);
+
+    cudaError_t result = cudaGetLastError();
+
+    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Initializes from `args` / `workspace`, then runs the kernel if that succeeded.
+  Status operator()(
+    Arguments const &args,
+    void *workspace = nullptr,
+    cudaStream_t stream = nullptr) {
+    // Forward `stream` so initialize() clears the workspace on the launch stream.
+    Status status = initialize(args, workspace, stream);
+
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+
+    return status;
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for column-major C: wraps the row-major RankK with the fill mode inverted.
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_,
+    /// Operator class tag
+    typename OperatorClass_,
+    /// Tag indicating architecture to tune for.  This is the minimum SM that
+    /// supports the intended feature. The device kernel can be built
+    /// targeting any SM larger than this number.
+    typename ArchTag_,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_,
+    /// Epilogue output operator
+    typename EpilogueOutputOp_,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA,
+    /// If true, kernel supports split-K with serial reduction
+    bool SplitKSerial,
+    /// Operation performed by RankK update kernel
+    typename Operator_,
+    /// Complex elementwise transformation for operand A
+    ComplexTransform TransformA,
+    /// Blas3 computation mode (symmetric/hermitian)
+    BlasMode BlasMode_
+    >
+class RankK<ElementA_, LayoutA_, ElementC_,
+           layout::ColumnMajor,  // partially specialized on LayoutC
+           FillModeC, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_,
+           WarpShape_, InstructionShape_, EpilogueOutputOp_,
+           ThreadblockSwizzle_, Stages, AlignmentA,
+           SplitKSerial, Operator_, TransformA, BlasMode_> {
+ public:
+
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  using ElementC = ElementC_;
+  using LayoutC = layout::ColumnMajor;
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  static FillMode const kFillModeC = FillModeC;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentC = EpilogueOutputOp::kCount;
+  static bool const kSplitKSerial = SplitKSerial;
+  static BlasMode const kBlasMode = BlasMode_;
+  static int const kUpdateRank = 1;
+
+  // Complex transform for input A matrices (function on input layout)
+  static ComplexTransform const kTransformA = TransformA;
+  
+  /// Row-major RankK that does the work: same A operand, layout::RowMajor for C, fill mode inverted
+  using UnderlyingOperator = typename cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    layout::RowMajor,
+    InvertFillMode<FillModeC>::mode,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    kStages,
+    kAlignmentA,
+    kSplitKSerial,
+    Operator,
+    kTransformA,
+    kBlasMode
+  >;
+  
+
+  /// Argument structure and kernel type, both taken from the underlying operator
+  using Arguments = typename UnderlyingOperator::Arguments;
+  using RankKkernel = typename UnderlyingOperator::RankKkernel;
+
+private:
+
+  UnderlyingOperator underlying_operator_;
+
+public:
+
+  /// Constructs the RankK.
+  RankK() { }
+
+  /// Maps arguments onto the underlying RankK operator; they are returned unchanged
+  static Arguments to_underlying_arguments(Arguments const &args) {
+    return args;
+  }
+
+  /// Determines whether the RankK can execute the given problem.
+  static Status can_implement(Arguments const &args) {
+
+    return UnderlyingOperator::can_implement(to_underlying_arguments(args));
+  }
+
+  /// Gets the workspace size reported by the underlying operator
+  static size_t get_workspace_size(Arguments const &args) {
+    
+    return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args));
+  }
+
+  /// Computes the grid shape. NOTE(review): the row-major RankK in this file declares no get_grid_shape() - confirm before instantiating.
+  static dim3 get_grid_shape(Arguments const &args) { 
+    return UnderlyingOperator::get_grid_shape(to_underlying_arguments(args));
+  }
+
+  /// Max active blocks per multiprocessor. NOTE(review): the row-major RankK in this file declares no maximum_active_blocks() - confirm.
+  static int maximum_active_blocks(int smem_capacity = -1) {
+    return UnderlyingOperator::maximum_active_blocks(smem_capacity);
+  }
+
+  /// Initializes RankK state from arguments; `workspace` and `stream` are passed through unchanged.
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+
+    return underlying_operator_.initialize(to_underlying_arguments(args), workspace, stream);
+  }
+
+  /// Lightweight update given a subset of arguments
+  Status update(Arguments const &args, void *workspace = nullptr) {
+
+    return underlying_operator_.update(to_underlying_arguments(args), workspace);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status run(cudaStream_t stream = nullptr) {
+
+    return underlying_operator_.run(stream);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Initializes from `args` / `workspace` on `stream`, then runs the kernel if that succeeded.
+  Status operator()(
+    Arguments const &args, 
+    void *workspace = nullptr, 
+    cudaStream_t stream = nullptr) {
+    
+    Status status = initialize(args, workspace, stream);
+    
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+
+    return status;
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/symm.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/symm.h
new file mode 100755
index 000000000..c36ef959b
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/device/symm.h
@@ -0,0 +1,603 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for pipelined SYMM and HEMM kernels. Does not compute batching or support split-K.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/device_kernel.h"
+
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/gemm/kernel/symm_universal.h"
+
+#include "cutlass/gemm/kernel/default_symm_universal.h"
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Side Mode for A (kLeft or kRight)
+    SideMode SideModeA,
+    /// Fill Mode for A (kLower or kUpper)
+    FillMode FillModeA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_ = ElementC_,
+    /// Operator class tag
+    typename OperatorClass_ = arch::OpClassTensorOp,
+    /// Tag indicating architecture to tune for
+    typename ArchTag_ = arch::Sm80,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp_ = epilogue::thread::LinearCombination<
+      ElementC_,
+      128 / sizeof_bits<ElementC_>::value,
+      ElementAccumulator_,
+      ElementAccumulator_,
+      epilogue::thread::ScaleType::OnlyAlphaScaling
+    >,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle<>,
+    /// Number of stages used in the pipelined mainloop
+    int Stages =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kStages,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentA,
+    /// Access granularity of B matrix in units of elements
+    int AlignmentB =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentB,
+    /// If true, kernel supports split-K with serial reduction
+    bool SplitKSerial = false,
+    /// Operation performed by SYMM
+    typename Operator_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::Operator,
+    /// Blas3 computation mode (symmetric/hermitian)
+    BlasMode BlasMode_ = BlasMode::kSymmetric>
+class Symm {
+ public:
+
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  using ElementAKernel = typename platform::conditional<(SideModeA == SideMode::kRight), ElementB_, ElementA_>::type;
+  using LayoutAKernel = typename platform::conditional<(SideModeA == SideMode::kRight), LayoutB_, LayoutA_>::type;
+  using ElementB = ElementB_;
+  using LayoutB = LayoutB_;
+  using ElementBKernel = typename platform::conditional<(SideModeA == SideMode::kRight), ElementA_, ElementB_>::type;
+  using LayoutBKernel = typename platform::conditional<(SideModeA == SideMode::kRight), LayoutA_, LayoutB_>::type;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  static SideMode const kSideModeA = SideModeA;
+  static FillMode const kFillModeA = FillModeA;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentAKernel = (SideModeA == SideMode::kRight) ? AlignmentB : AlignmentA;
+  static int const kAlignmentB = AlignmentB;
+  static int const kAlignmentBKernel = (SideModeA == SideMode::kRight) ? AlignmentA : AlignmentB;
+  static int const kAlignmentC = EpilogueOutputOp::kCount;
+  static bool const kSplitKSerial = SplitKSerial;
+  static BlasMode const kBlasMode = BlasMode_;
+
+  // static asserts for symm update kernel
+  static_assert(platform::is_same<LayoutA, LayoutB>::value,
+    "SYMM update operator support same layouts for operand A and B");
+
+  /// Define the kernel
+  using SymmKernel = typename kernel::DefaultSymmUniversal<
+    ElementAKernel,
+    LayoutAKernel,
+    kSideModeA,
+    kFillModeA,
+    kAlignmentAKernel,
+    ElementBKernel,
+    LayoutBKernel,
+    kAlignmentBKernel,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    kStages,
+    kSplitKSerial,
+    Operator,
+    kBlasMode
+  >::SymmKernel;
+  
+  using Arguments = typename SymmKernel::Arguments;
+
+private:
+
+  /// Kernel parameters object
+  typename SymmKernel::Params params_;
+public:
+
+  /// Constructs the SYMM.
+  Symm() { }
+
+  /// Determines whether the SYMM can execute the given problem.
+  ///
+  /// Returns Status::kErrorInvalidProblem when batch_count > 1 without serial
+  /// split-K, or when the compile-time side/fill mode is not a supported value;
+  /// otherwise returns whatever SymmKernel::can_implement reports.
+  static Status can_implement(Arguments const &args) {
+
+    bool const batched_without_split_k = !kSplitKSerial && args.batch_count > 1;
+    if (batched_without_split_k) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    // Query the kernel before the mode checks; an invalid mode still takes
+    // precedence over a kernel-level failure.
+    Status const kernel_status = SymmKernel::can_implement(args);
+
+    bool const fill_mode_ok = (FillModeA == FillMode::kLower) || (FillModeA == FillMode::kUpper);
+    if (SideModeA == SideMode::kInvalid || !fill_mode_ok) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    return kernel_status;
+  }
+
+  /// Gets the workspace size
+  ///
+  /// Returns sizeof(int) bytes per threadblock tile (tiled m x tiled n) when serial
+  /// split-K is enabled and batch_count > 1; otherwise no workspace is required.
+  static size_t get_workspace_size(Arguments const &args) {
+
+    // Determine grid shape
+    ThreadblockSwizzle swizzle;
+
+    cutlass::gemm::GemmCoord tiles = swizzle.get_tiled_shape(
+      args.problem_size,
+      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+      args.batch_count);
+
+    bool const needs_workspace = kSplitKSerial && args.batch_count > 1;
+
+    return needs_workspace
+      ? sizeof(int) * size_t(tiles.m()) * size_t(tiles.n())
+      : size_t(0);
+  }
+
+  /// Initializes SYMM state from arguments.
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+    
+    // Determine grid shape
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
+      args.problem_size, 
+      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+      args.batch_count);
+    // Serial split-K with batch_count > 1 needs a caller-provided workspace, cleared below.
+    if (kSplitKSerial) {
+      if (args.batch_count > 1) {
+        if (!workspace) {
+          return Status::kErrorWorkspaceNull;
+        }
+
+        size_t bytes = get_workspace_size(args);
+        // Zero the workspace asynchronously on the caller's stream.
+        cudaError_t result = cudaMemsetAsync(workspace, 0, bytes, stream);
+
+        if (result != cudaSuccess) {
+          return Status::kErrorInternal;
+        }
+      }
+    }
+    else {
+      // Without serial split-K, batch_count > 1 is rejected.
+      if (args.batch_count > 1) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+    
+    int gemm_k_size = args.problem_size.k();
+
+    // Swap the A and B operands if A is on the right side (problem size doesn't need to change here).
+    if (kSideModeA == SideMode::kRight) {
+      // Initialize the Params structure
+      params_ = typename SymmKernel::Params{
+        args.swapped_matrices(),
+        grid_tiled_shape,
+        gemm_k_size,
+        static_cast<int *>(workspace)
+      };
+
+      return Status::kSuccess;
+    }
+
+    // Otherwise, initialize the Params structure from the arguments unchanged
+    params_ = typename SymmKernel::Params{
+      args,
+      grid_tiled_shape,
+      gemm_k_size,
+      static_cast<int *>(workspace)
+    };
+    
+    return Status::kSuccess;
+  }
+
+  /// Lightweight update given a subset of arguments
+  Status update(Arguments const &args, void *workspace = nullptr) {
+
+    bool const workspace_missing = !workspace;
+    // Serial split-K with batch_count > 1 cannot proceed without a workspace.
+    if (workspace_missing && kSplitKSerial && args.batch_count > 1) {
+      return Status::kErrorWorkspaceNull;
+    }
+
+    // The same applies whenever a non-zero workspace size is reported.
+    size_t const required_bytes = get_workspace_size(args);
+    if (required_bytes != 0 && workspace_missing) {
+      return Status::kErrorWorkspaceNull;
+    }
+
+    params_.update(args, workspace);
+
+    return Status::kSuccess;
+  }
+
+  /// Launches the SYMM kernel on the given stream using the previously initialized parameters.
+  Status run(cudaStream_t stream = nullptr) {
+    // Derive the launch configuration from the stored grid tiling and the kernel's thread count.
+    ThreadblockSwizzle threadblock_swizzle;
+
+    dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
+    dim3 block(SymmKernel::kThreadCount, 1, 1);
+
+    int smem_size = int(sizeof(typename SymmKernel::SharedStorage));
+    // At 48 KiB or more, raise the kernel's dynamic shared-memory limit before launching.
+    if (smem_size >= (48 << 10)) {
+      cudaError_t result = cudaFuncSetAttribute(Kernel<SymmKernel>,
+                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                    smem_size);
+
+      if (result != cudaSuccess) {
+        return Status::kErrorInternal;
+      }
+    }
+
+    cutlass::arch::synclog_setup();
+    cutlass::Kernel<SymmKernel><<<grid, block, smem_size, stream>>>(params_);
+    // Report any error raised by the launch.
+    cudaError_t result = cudaGetLastError();
+
+    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Initializes state from the arguments, then runs the kernel if initialization succeeded.
+  Status operator()(
+    Arguments const &args,
+    void *workspace = nullptr,
+    cudaStream_t stream = nullptr) {
+
+    // Forward the stream so initialize() clears the workspace on the launch stream.
+    Status status = initialize(args, workspace, stream);
+
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+    return status;
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+
+/********************************************************************************************************
+  SYMM/HEMM has 4 combinations based on Layouts {RowMajor, ColumnMajor} x Side mode {LeftSide, RightSide}
+  In templates and arguments to cutlass kernel, `matrix A` is always symmetric/hermitian, and `matrix B` is rectangular. 
+  (adhering to the cuBLAS convention)
+
+  Although, cuBLAS SYMM/HEMM only supports ColumnMajor layouts for all matrices (A, B, C/D).
+
+  For the mainloop and symm kernel, `A` and `B` points to left-side and right-side matrices, respectively.
+  
+  Thus, for LeftSide mode `A` and `B` points to `matrix A` and `matrix B`, respectively. While for 
+  the RightSide mode `A` and `B` points to `matrix B` and `matrix A`, respectively. 
+  
+  Additionally, CUTLASS GEMM epilogue is always RowMajor, and ColumnMajor output is achieved by 
+  transposing the GEMM problem. Thus, ColumnMajor output layout for SYMM/HEMM requires:
+   - Transposing `matrix A` and `matrix B` layouts
+   - Swapping problem size m and n values
+   - Swapping LeftSide and RightSide mode
+  
+  RowMajor output:    D = matrix A x matrix B
+  ColumnMajor output: D = matrix A x matrix B -> Transpose (D) = Transpose(matrix B) x Transpose(matrix A)
+
+  {RowMajor, ColumnMajor} x Side Mode {LeftSide, RightSide} 4 cases:
+    1.  LeftSide mode and RowMajor output (default template)
+    2.  LeftSide mode and ColumnMajor output 
+    3.  RightSide mode and RowMajor output
+    4.  RightSide mode and ColumnMajor output
+  
+  Mapping ColumnMajor output layout cases 2 and 4 to RowMajor efficient epilogue implementation:
+  
+  Case 2 -> Case 3:
+      D_col = matrix A x matrix B (LeftSide mode) 
+   => Transpose(D_col) = Transpose(matrix B) x Transpose(matrix A) (RightSide mode)
+
+  swap pointers for `A` and `B`, then call GEMM mainloop with RowMajor efficient-epilogue
+
+  Case 4 -> Case 1:
+      D_col = matrix B x matrix A (RightSide mode) 
+   => Transpose(D_col) = Transpose(matrix A) x Transpose(matrix B) (LeftSide mode)
+
+   call GEMM mainloop with RowMajor efficient-epilogue
+********************************************************************************************************/
+
+/// Partial specialization for column-major output exchanges problem size and operand.
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Side Mode for A (kLeft or kRight)
+    SideMode SideModeA,
+    /// Fill Mode for A (kLower or kUpper)
+    FillMode FillModeA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_,
+    /// Operator class tag
+    typename OperatorClass_,
+    /// Tag indicating architecture to tune for.  This is the minimum SM that
+    /// supports the intended feature. The device kernel can be built
+    /// targeting any SM larger than this number.
+    typename ArchTag_,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_,
+    /// Epilogue output operator
+    typename EpilogueOutputOp_,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA,
+    /// Access granularity of B matrix in units of elements
+    int AlignmentB,
+    /// If true, kernel supports split-K with serial reduction
+    bool SplitKSerial,
+    /// Operation performed by Symm update kernel
+    typename Operator_,
+    /// Blas3 computation mode (symmetric/hermitian)
+    BlasMode BlasMode_
+    >
+class Symm<ElementA_, LayoutA_, SideModeA, FillModeA, ElementB_, LayoutB_, ElementC_,
+           layout::ColumnMajor,  // partially specialized on LayoutC
+           ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_,
+           WarpShape_, InstructionShape_, EpilogueOutputOp_,
+           ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB,
+           SplitKSerial, Operator_, BlasMode_> {
+ public:
+
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  using ElementB = ElementB_;
+  using LayoutB = LayoutB_;
+  using ElementC = ElementC_;
+  using LayoutC = layout::ColumnMajor;
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  static SideMode const kSideModeA = SideModeA;
+  static FillMode const kFillModeA = FillModeA;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentB = AlignmentB;
+  static int const kAlignmentC = EpilogueOutputOp::kCount;
+  static bool const kSplitKSerial = SplitKSerial;
+  static BlasMode const kBlasMode = BlasMode_;
+  
+  /// Define the kernel
+  using UnderlyingOperator = typename cutlass::gemm::device::Symm<
+    ElementA,
+    typename layout::LayoutTranspose<LayoutA>::type,
+    InvertSideMode<kSideModeA>::mode,
+    InvertFillMode<kFillModeA>::mode,
+    ElementB,
+    typename layout::LayoutTranspose<LayoutB>::type, 
+    ElementC,
+    layout::RowMajor,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    kStages,
+    kAlignmentA,
+    kAlignmentB,
+    kSplitKSerial,
+    Operator,
+    kBlasMode
+  >;
+  
+
+  /// Argument structure
+  using Arguments = typename UnderlyingOperator::Arguments;
+  using SymmKernel = typename UnderlyingOperator::SymmKernel;
+
+private:
+
+  UnderlyingOperator underlying_operator_;
+
+public:
+
+  /// Constructs the Symm.
+  Symm() { }
+
+  /// Helper to construct a transposed equivalent for the underlying SYMM operator
+  static Arguments to_underlying_arguments(Arguments const &args) {
+    return args.transposed_problem_size();
+  }
+
+  /// Determines whether the Symm can execute the given problem.
+  static Status can_implement(Arguments const &args) {
+
+    return UnderlyingOperator::can_implement(to_underlying_arguments(args));
+  }
+
+  /// Gets the workspace size
+  static size_t get_workspace_size(Arguments const &args) {
+    
+    return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args));
+  }
+
+  /// Computes the grid shape. NOTE(review): the RowMajor Symm in this header defines no get_grid_shape, so this only compiles if never called.
+  static dim3 get_grid_shape(Arguments const &args) { 
+    return UnderlyingOperator::get_grid_shape(to_underlying_arguments(args));
+  }
+
+  /// Computes the maximum number of active blocks per multiprocessor. NOTE(review): the RowMajor Symm in this header defines no maximum_active_blocks, so this only compiles if never called.
+  static int maximum_active_blocks(int smem_capacity = -1) {
+    return UnderlyingOperator::maximum_active_blocks(smem_capacity);
+  }
+
+  /// Initializes Symm state from arguments.
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+
+    return underlying_operator_.initialize(to_underlying_arguments(args), workspace, stream);
+  }
+
+  /// Lightweight update given a subset of arguments
+  Status update(Arguments const &args, void *workspace = nullptr) {
+
+    return underlying_operator_.update(to_underlying_arguments(args), workspace);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status run(cudaStream_t stream = nullptr) {
+
+    return underlying_operator_.run(stream);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Initializes state from the arguments, then runs the kernel if initialization succeeded.
+  Status operator()(
+    Arguments const &args, 
+    void *workspace = nullptr, 
+    cudaStream_t stream = nullptr) {
+    
+    Status status = initialize(args, workspace, stream);
+    
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+
+    return status;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/trmm.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/trmm.h
new file mode 100755
index 000000000..09b9152cb
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/device/trmm.h
@@ -0,0 +1,759 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a TRMM kernel. Does not compute batching or support split-K.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/device_kernel.h"
+
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/gemm/kernel/trmm_universal.h"
+
+#include "cutlass/gemm/kernel/default_trmm_universal.h"
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/*! Trmm device-level operator. This is an interface to efficient CUTLASS TRMM kernels that may
+  be invoked from host code.
+
+  The contributions of this class are:
+    
+    1. At compile time, it maps data types and high-level structural parameters onto 
+       specific CUTLASS components.
+
+    2. At runtime, it maps logical arguments to TRMM problems to kernel parameters.
+
+    3. At runtime, it launches kernels on the device.
+
+  The intent is to provide a convenient mechanism for interacting with most plausible TRMM
+  configurations for each supported architecture. Consequently, not all parameters are exposed
+  to the top-level interface. Rather, sensible defaults at each level of the CUTLASS hierarchy
+  are selected to tradeoff simplicity of the interface with flexibility. We expect 
+  most configurations to be specified at this level. Applications with more exotic requirements 
+  may construct their kernels of interest using CUTLASS components at the threadblock, warp, 
+  and thread levels of abstraction.
+
+  CUTLASS exposes computations using the functor design pattern in which objects compose some
+  internal state with an overloaded function call operator. This enables decoupling of
+  initialization from execution, possibly reducing overhead during steady state phases of
+  application execution.
+
+  CUTLASS device-level operators expose an Arguments structure encompassing each logical
+  input to the computation. This is distinct from the kernel-level Params structure pattern
+  which contains application-specific precomputed state needed by the device code.
+
+  Example of a CUTLASS TRMM operator implementing the functionality of cuBLAS's STRMM NN
+  is as follows:
+
+    //
+    // Instantiate the CUTLASS TRMM operator.
+    //
+
+    cutlass::gemm::device::Trmm<
+      float,
+      cutlass::layout::ColumnMajor,
+      cutlass::SideMode::kLeft,
+      cutlass::FillMode::kLower,
+      cutlass::DiagType::kNonUnit,
+      float,
+      cutlass::layout::ColumnMajor,
+      float,
+      cutlass::layout::ColumnMajor
+    > trmm_op;
+
+    //
+    // Launch the TRMM operation on the device
+    //
+
+    cutlass::Status status = trmm_op({
+      cutlass::gemm::GemmUniversalMode::kGemm,  // Trmm Problem Mode
+      {m, n, m/n},                        // GemmCoord problem_size (k is based on left- or right-side mode)
+      batch_count,
+      {alpha},                            // EpilogueOutputOp::Params epilogue_op_params
+      void const * ptr_A,
+      void const * ptr_B,
+      void const * ptr_C,
+      int64_t batch_stride_A,
+      int64_t batch_stride_B,
+      int64_t batch_stride_C,
+      int lda,
+      int ldb,
+      int ldc
+    });
+
+  A simplified view of the template is listed below.
+
+    template <
+      /// Element type for A matrix operand
+      typename ElementA,
+      
+      /// Layout type for A matrix operand
+      typename LayoutA,
+      
+      /// Side Mode for A (kLeft or kRight)
+      SideMode SideModeA,
+
+      /// Fill Mode for A (kLower or kUpper)
+      FillMode FillModeA,
+
+      /// DiagType for A (kNonUnit or kUnit)
+      DiagType DiagTypeA,
+
+      /// Element type for B matrix operand
+      typename ElementB,
+      
+      /// Layout type for B matrix operand
+      typename LayoutB,
+      
+      /// Element type for C and D matrix operands
+      typename ElementC,
+      
+      /// Layout type for C and D matrix operands
+      typename LayoutC,
+      
+      /// Element type for internal accumulation
+      typename ElementAccumulator,
+
+      /// Operator class tag
+      typename OperatorClass,
+      
+      /// Tag indicating architecture to tune for.  This is the minimum SM that
+      /// supports the intended feature. The device kernel can be built
+      /// targeting any SM larger than this number.
+      typename ArchTag,
+      
+      /// Threadblock-level tile size (concept: GemmShape)
+      typename ThreadblockShape,
+      
+      /// Warp-level tile size (concept: GemmShape)
+      typename WarpShape,
+      
+      /// Instruction-level tile size (concept: GemmShape)
+      typename InstructionShape,
+      
+      /// Epilogue output operator
+      typename EpilogueOutputOp,
+      
+      /// Threadblock-level swizzling operator
+      typename ThreadblockSwizzle,
+      
+      /// Number of stages used in the pipelined mainloop
+      int Stages,
+
+      /// Access granularity of A matrix in units of elements
+      int AlignmentA,
+
+      /// Access granularity of B matrix in units of elements
+      int AlignmentB,
+
+      /// If true, kernel supports split-K with serial reduction
+      bool SplitKSerial,
+
+      /// Operation performed by TRMM
+      typename Operator,
+
+      /// Complex elementwise transformation on A operand
+      ComplexTransform TransformA
+    >
+    class Trmm;
+*/
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Side Mode for A 
+    SideMode SideModeA,
+    /// Fill Mode for A
+    FillMode FillModeA,
+    /// DiagType for A
+    DiagType DiagTypeA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_ = ElementC_,
+    /// Operator class tag
+    typename OperatorClass_ = arch::OpClassTensorOp,
+    /// Tag indicating architecture to tune for
+    typename ArchTag_ = arch::Sm80,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp_ = epilogue::thread::LinearCombination<
+      ElementC_,
+      128 / sizeof_bits<ElementC_>::value,
+      ElementAccumulator_,
+      ElementAccumulator_,
+      epilogue::thread::ScaleType::OnlyAlphaScaling
+    >,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle<>,
+    /// Number of stages used in the pipelined mainloop
+    int Stages =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kStages,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentA,
+    /// Access granularity of B matrix in units of elements
+    int AlignmentB =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentB,
+    /// If true, kernel supports split-K with serial reduction
+    bool SplitKSerial = false,
+    /// Operation performed by TRMM
+    typename Operator_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::Operator,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA = ComplexTransform::kNone>
+class Trmm {
+ public:
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  using TensorRefA = TensorRef<ElementA const, LayoutA>;
+  using ElementAKernel = typename platform::conditional<(SideModeA == SideMode::kRight), ElementB_, ElementA_>::type;
+  using LayoutAKernel = typename platform::conditional<(SideModeA == SideMode::kRight), LayoutB_, LayoutA_>::type;
+  using ElementB = ElementB_;
+  using LayoutB = LayoutB_;
+  using TensorRefB = TensorRef<ElementB const, LayoutB>;
+  using ElementBKernel = typename platform::conditional<(SideModeA == SideMode::kRight), ElementA_, ElementB_>::type;
+  using LayoutBKernel = typename platform::conditional<(SideModeA == SideMode::kRight), LayoutA_, LayoutB_>::type;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using TensorRefC = TensorRef<ElementC const, LayoutC>;
+  using TensorRefD = TensorRef<ElementC, LayoutC>;
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  static SideMode const kSideMode = SideModeA;
+  static FillMode const kFillMode = FillModeA;
+  static DiagType const kDiagType = DiagTypeA;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentAKernel = (SideModeA == SideMode::kRight) ? AlignmentB : AlignmentA;
+  static int const kAlignmentB = AlignmentB;
+  static int const kAlignmentBKernel = (SideModeA == SideMode::kRight) ? AlignmentA : AlignmentB;
+  static int const kAlignmentC = EpilogueOutputOp::kCount;
+  static bool const kSplitKSerial = SplitKSerial;
+  // Complex transform doesn't apply to B
+  static ComplexTransform const kTransformA = TransformA; 
+  static ComplexTransform const kTransformB = ComplexTransform::kNone; 
+  static ComplexTransform const kTransformAKernel = (SideModeA == SideMode::kRight) ? 
+                                              ComplexTransform::kNone : TransformA;
+  static ComplexTransform const kTransformBKernel = (SideModeA == SideMode::kRight) ? 
+                                              TransformA : ComplexTransform::kNone;
+
+  /// Define the kernel
+  using TrmmKernel = typename kernel::DefaultTrmmUniversal<
+    ElementAKernel,
+    LayoutAKernel,
+    kTransformAKernel,
+    kAlignmentAKernel,
+    ElementBKernel,
+    LayoutBKernel,
+    kTransformBKernel,
+    kAlignmentBKernel,
+    kSideMode,
+    kFillMode,
+    kDiagType,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    kStages,
+    kSplitKSerial,
+    Operator
+  >::TrmmKernel;
+  
+  using Arguments = typename TrmmKernel::Arguments;
+
+private:
+
+  /// Kernel parameters object
+  typename TrmmKernel::Params params_;
+public:
+
+  /// Constructs the TRMM; params_ is only assigned later, by initialize().
+  Trmm() { }
+
+  /// Reports whether this TRMM instantiation can execute the problem in `args`.
+  ///
+  /// Returns Status::kErrorInvalidProblem for a batched problem without serial
+  /// split-K or for an invalid side/fill/diag mode; otherwise forwards the
+  /// verdict of TrmmKernel::can_implement().
+  static Status can_implement(Arguments const &args) {
+
+    // More than one batch is only accepted when serial split-K is compiled in.
+    bool const batched_without_split_k = !kSplitKSerial && args.batch_count > 1;
+    if (batched_without_split_k) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    // The kernel is queried before the mode checks, in the same order as before.
+    Status const kernel_status = TrmmKernel::can_implement(args);
+
+    // Compile-time mode parameters: any invalid one rejects the problem outright.
+    bool const has_invalid_mode = (SideModeA == SideMode::kInvalid) ||
+                                  (FillModeA == FillMode::kInvalid) ||
+                                  (DiagTypeA == DiagType::kInvalid);
+    if (has_invalid_mode) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    // kSuccess or the kernel's own error code, whichever the kernel reported.
+    return kernel_status;
+  }
+
+  /// Returns the number of workspace bytes required for `args`.
+  ///
+  /// Non-zero only for serial split-K with more than one batch: one `int` per
+  /// tile of the M x N tiled grid.
+  static size_t get_workspace_size(Arguments const &args) {
+
+    // Tiled grid extents for this problem and threadblock shape.
+    ThreadblockSwizzle swizzle;
+    cutlass::gemm::GemmCoord grid_tiles = swizzle.get_tiled_shape(
+      args.problem_size,
+      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+      args.batch_count);
+
+    bool const needs_workspace = kSplitKSerial && args.batch_count > 1;
+    if (!needs_workspace) {
+      return 0;
+    }
+
+    return sizeof(int) * size_t(grid_tiles.m()) * size_t(grid_tiles.n());
+  }
+
+  /// Initializes TRMM state (params_) from arguments; zero-fills `workspace` on `stream` for batched serial split-K.
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+    // Fails with kErrorWorkspaceNull, kErrorInternal or kErrorInvalidProblem; see the checks below.
+    // Determine grid shape
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
+      args.problem_size, 
+      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+      args.batch_count);
+
+    if (kSplitKSerial) {
+      if (args.batch_count > 1) {
+        if (!workspace) {
+          return Status::kErrorWorkspaceNull;
+        }
+        // Zero the whole workspace asynchronously on the caller's stream.
+        size_t bytes = get_workspace_size(args);
+      
+        cudaError_t result = cudaMemsetAsync(workspace, 0, bytes, stream);
+
+        if (result != cudaSuccess) {
+          return Status::kErrorInternal;
+        }
+      }
+    }
+    else {
+      // Without serial split-K, more than one batch is rejected.
+      if (args.batch_count > 1) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+    
+    int gemm_k_size = args.problem_size.k();
+
+    // Swap the A and B arguments if A is on the right side (problem size doesn't need to change here).
+    if (kSideMode == SideMode::kRight) {
+      // Initialize the Params structure
+      params_ = typename TrmmKernel::Params{
+        args.swapped_matrices(),
+        grid_tiled_shape,
+        gemm_k_size,
+        static_cast<int *>(workspace)
+      };
+
+      return Status::kSuccess;
+    }
+
+    // Initialize the Params structure
+    params_ = typename TrmmKernel::Params{
+      args,
+      grid_tiled_shape,
+      gemm_k_size,
+      static_cast<int *>(workspace)
+    };
+    
+    return Status::kSuccess;
+  }
+
+  /// Lightweight update of params_ from `args` and `workspace`; returns kErrorWorkspaceNull if a required workspace is missing.
+  Status update(Arguments const &args, void *workspace = nullptr) {
+    // A workspace is mandatory for batched serial split-K.
+    if (kSplitKSerial && args.batch_count > 1) {  
+      if (!workspace) {
+        return Status::kErrorWorkspaceNull;
+      }
+    }
+    // Same requirement, expressed through the computed workspace size.
+    size_t workspace_bytes = get_workspace_size(args);
+
+    if (workspace_bytes && !workspace) {
+      return Status::kErrorWorkspaceNull;
+    }
+    // NOTE(review): unlike initialize(), args are not passed through swapped_matrices() when kSideMode is kRight - confirm Params::update() accounts for this.
+    params_.update(args, workspace);
+
+    return Status::kSuccess;
+  }
+
+  /// Launches the TRMM kernel on `stream` with the parameters stored in params_.
+  ///
+  /// Returns Status::kErrorInternal if raising the dynamic shared-memory limit
+  /// or the launch itself reports a CUDA error, Status::kSuccess otherwise.
+  Status run(cudaStream_t stream = nullptr) {
+    // Launch geometry: grid from the stored tiled shape, 1-D thread block.
+    ThreadblockSwizzle swizzle;
+    dim3 grid = swizzle.get_grid_shape(params_.grid_tiled_shape);
+    dim3 block(TrmmKernel::kThreadCount, 1, 1);
+
+    int shared_bytes = int(sizeof(typename TrmmKernel::SharedStorage));
+
+    // From 48 KiB upward, raise the kernel's dynamic shared-memory limit before launching.
+    if (shared_bytes >= (48 << 10)) {
+      if (cudaFuncSetAttribute(Kernel<TrmmKernel>,
+                               cudaFuncAttributeMaxDynamicSharedMemorySize,
+                               shared_bytes) != cudaSuccess) {
+        return Status::kErrorInternal;
+      }
+    }
+
+    cutlass::arch::synclog_setup();
+    cutlass::Kernel<TrmmKernel><<<grid, block, shared_bytes, stream>>>(params_);
+
+    if (cudaGetLastError() != cudaSuccess) { return Status::kErrorInternal; }
+    return Status::kSuccess;
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Initializes from `args` and runs the kernel, both on `stream`.
+  Status operator()(
+    Arguments const &args, 
+    void *workspace = nullptr, 
+    cudaStream_t stream = nullptr) {
+
+    // Forward `stream` so the workspace memset in initialize() is ordered with the launch.
+    Status status = initialize(args, workspace, stream);
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+
+    return status;
+  }
+};
+
+/********************************************************************************************************
+  TRMM has 4 combinations based on Layouts {RowMajor, ColumnMajor} x Side mode {LeftSide, RightSide}
+  In templates and arguments to cutlass kernel, `matrix A` is always triangular, and `matrix B` is rectangular. 
+  (adhering to the cuBLAS convention)
+
+  For the mainloop and trmm kernel, `A` and `B` point to the left-side and right-side matrices, respectively.
+  
+  Thus, for LeftSide mode, `A` and `B` point to `matrix A` and `matrix B`, respectively, while for
+  RightSide mode, `A` and `B` point to `matrix B` and `matrix A`, respectively.
+  
+  Additionally, CUTLASS GEMM epilogue is always RowMajor, and ColumnMajor output is achieved by 
+  transposing the GEMM problem. Thus, ColumnMajor output layout for TRMM requires:
+   - Transposing `matrix A` and `matrix B` layouts
+   - Swapping problem size m and n values
+   - Swapping LeftSide and RightSide mode
+  
+  RowMajor output:    D = matrix A x matrix B
+  ColumnMajor output: D = matrix A x matrix B -> Transpose (D) = Transpose(matrix B) x Transpose(matrix A)
+
+  {RowMajor, ColumnMajor} x Side Mode {LeftSide, RightSide} 4 cases:
+    1.  LeftSide mode and RowMajor output (default template)
+    2.  LeftSide mode and ColumnMajor output 
+    3.  RightSide mode and RowMajor output
+    4.  RightSide mode and ColumnMajor output
+  
+  Mapping ColumnMajor output layout cases 2 and 4 to RowMajor efficient epilogue implementation:
+  
+  Case 2 -> Case 3:
+      D_col = matrix A x matrix B (LeftSide mode) 
+   => Transpose(D_col) = Transpose(matrix B) x Transpose(matrix A) (RightSide mode)
+
+  swap the pointers for `A` and `B`, then call the GEMM mainloop with the RowMajor efficient epilogue
+
+  Case 4 -> Case 1:
+      D_col = matrix B x matrix A (RightSide mode) 
+   => Transpose(D_col) = Transpose(matrix A) x Transpose(matrix B) (LeftSide mode)
+
+   call the GEMM mainloop with the RowMajor efficient epilogue
+********************************************************************************************************/
+
+/// Partial specialization for column-major output: transposes the problem and forwards to the row-major TRMM.
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Side Mode for A
+    SideMode SideModeA,
+    /// Fill Mode for A
+    FillMode FillModeA,
+    /// DiagType for A
+    DiagType DiagTypeA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_,
+    /// Operator class tag
+    typename OperatorClass_,
+    /// Tag indicating architecture to tune for
+    typename ArchTag_,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_,
+    /// Epilogue output operator
+    typename EpilogueOutputOp_,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA,
+    /// Access granularity of B matrix in units of elements
+    int AlignmentB,
+    /// If true, kernel supports split-K as a serial reduction
+    bool SplitKSerial,
+    /// Operation performed by TRMM
+    typename Operator_,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA>
+class Trmm<ElementA_, LayoutA_, SideModeA, FillModeA, DiagTypeA,
+           ElementB_, LayoutB_, ElementC_,
+           layout::ColumnMajor,  // partially specialized on LayoutC
+           ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_,
+           WarpShape_, InstructionShape_, EpilogueOutputOp_,
+           ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial,
+           Operator_, TransformA> {
+ public:
+
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_; 
+  using TensorRefA = TensorRef<ElementA const, LayoutA>;
+  using ElementB = ElementB_;
+  using LayoutB = LayoutB_;
+  using TensorRefB = TensorRef<ElementB const, LayoutB>;
+  using ElementC = ElementC_;
+  using LayoutC = layout::ColumnMajor;
+  using TensorRefC = TensorRef<ElementC const, LayoutC>;
+  using TensorRefD = TensorRef<ElementC, LayoutC>;
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  static SideMode const kSideMode = SideModeA;
+  static FillMode const kFillMode = FillModeA;
+  static DiagType const kDiagType = DiagTypeA;
+  // Swap the side mode because the layout is transposed. NOTE(review): every non-kLeft value, kInvalid included, maps to kLeft - confirm intended.
+  static SideMode const kSideModeT = (SideModeA == SideMode::kLeft) ?
+                                      SideMode::kRight : SideMode::kLeft;
+  // Swap the fill mode because the layout is transposed. NOTE(review): every non-kLower value maps to kLower - confirm intended.
+  static FillMode const kFillModeT = (FillModeA == FillMode::kLower) ? 
+                                      FillMode::kUpper : FillMode::kLower;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentB = AlignmentB;
+  static ComplexTransform const kTransformA = TransformA;
+  // Complex transform doesn't apply to B
+  static ComplexTransform const kTransformB = ComplexTransform::kNone; 
+  static bool const kSplitKSerial = SplitKSerial;
+  // Row-major TRMM over transposed A/B layouts with swapped side/fill modes; every call below is forwarded to it.
+  using UnderlyingOperator = Trmm<
+    ElementA,
+    typename layout::LayoutTranspose<LayoutA>::type,
+    kSideModeT,
+    kFillModeT,
+    kDiagType,
+    ElementB,
+    typename layout::LayoutTranspose<LayoutB>::type, 
+    ElementC,
+    layout::RowMajor,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    kStages,
+    kAlignmentA,
+    kAlignmentB,
+    kSplitKSerial,
+    Operator,
+    TransformA
+  >;
+
+  using Arguments = typename UnderlyingOperator::Arguments;
+  using TrmmKernel = typename UnderlyingOperator::TrmmKernel;
+  static int const kAlignmentC = UnderlyingOperator::kAlignmentC;
+
+private:
+
+  UnderlyingOperator underlying_operator_;
+
+public:
+
+  /// Constructs the TRMM.
+  Trmm() { }
+
+  /// Helper to construct the transposed-problem arguments passed to the underlying TRMM operator
+  static Arguments to_underlying_arguments(Arguments const &args) {
+    return args.transposed_problem_size();
+  }
+
+  /// Determines whether the TRMM can execute the given problem.
+  static Status can_implement(Arguments const &args) {
+
+    return UnderlyingOperator::can_implement(to_underlying_arguments(args));
+  }
+
+  /// Gets the workspace size
+  static size_t get_workspace_size(Arguments const &args) {
+    
+    return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args));
+  }
+
+  /// Initializes TRMM state from arguments.
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+
+    return underlying_operator_.initialize(to_underlying_arguments(args), workspace, stream);
+  }
+
+  /// Lightweight update given a subset of arguments
+  Status update(Arguments const &args, void *workspace = nullptr) {
+
+    return underlying_operator_.update(to_underlying_arguments(args), workspace);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status run(cudaStream_t stream = nullptr) {
+
+    return underlying_operator_.run(stream);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Initializes from `args` and then runs the kernel, passing `stream` to both steps.
+  Status operator()(
+    Arguments const &args, 
+    void *workspace = nullptr, 
+    cudaStream_t stream = nullptr) {
+   
+    Status status = initialize(args, workspace, stream);
+    
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+
+    return status;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/dispatch_policy.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/dispatch_policy.hpp
new file mode 100755
index 000000000..904e6af3c
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/dispatch_policy.hpp
@@ -0,0 +1,324 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/arch/arch.h"
+#include "cutlass/gemm/gemm.h"
+
+#include "cute/layout.hpp"
+#include "cute/numeric/integral_constant.hpp" // cute::false_type
+//////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::detail {
+// is_kernel_tag_of<T, U>: true iff T is a specialization U<ints...> of the tag template U.
+template <class T, template <int...> class U>
+struct is_kernel_tag_of : cute::false_type {};
+
+template <template <int...> class U, int... Args>
+struct is_kernel_tag_of<U<Args...>, U> : cute::true_type {};
+
+template <class T, template <int...> class U>
+constexpr bool is_kernel_tag_of_v = is_kernel_tag_of<T, U>::value;
+
+// is_asymmetric_dma_kernel_tag_of<T, U>: true iff T is a specialization U<int, bool> of U.
+template <class T, template <int, bool> class U>
+struct is_asymmetric_dma_kernel_tag_of : cute::false_type {};
+
+template <template <int, bool> class U, int I0, bool B0>
+struct is_asymmetric_dma_kernel_tag_of<U<I0, B0>, U> : cute::true_type {};
+
+template <class T, template <int, bool> class U>
+constexpr bool is_asymmetric_dma_kernel_tag_of_v = is_asymmetric_dma_kernel_tag_of<T, U>::value;
+
+} // namespace cutlass::detail
+
+//////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm {
+using namespace cute;
+// Dispatch-policy tag types for GEMM; `using namespace cute` above supplies the unqualified Shape, _1 and size() used below.
+//////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+enum class KernelInputTransformType {
+    FastF32,
+    InterleavedComplexTF32
+};
+
+} // namespace detail
+
+//////////////////////////////////////////////////////////////////////////////
+
+namespace kernel::detail {
+
+// Has_SwapAB<T>::value will be true only if:
+//   class T has member SwapAB and T::SwapAB is true
+template <typename T, typename = void>
+struct Has_SwapAB { static constexpr bool value = false; };
+
+template <typename T>
+struct Has_SwapAB <T, CUTE_STL_NAMESPACE::void_t<decltype(T::SwapAB)>>
+{ static constexpr bool value = T::SwapAB; };
+
+template <typename T>
+static constexpr bool Has_SwapAB_v = Has_SwapAB<T>::value;
+
+} // namespace kernel::detail
+
+//////////////////////////////////////////////////////////////////////////////
+
+//
+// Kernel schedule policies (the base class tags, one for each kernel layer file)
+//
+struct KernelMultistage { };
+struct KernelCpAsyncWarpSpecialized { };
+struct KernelCpAsyncWarpSpecializedPingpong { };
+struct KernelCpAsyncWarpSpecializedCooperative { };
+struct KernelTma { };
+struct KernelTmaWarpSpecialized { };
+struct KernelTmaWarpSpecializedPingpong { };
+struct KernelTmaWarpSpecializedCooperative { 
+};
+
+struct KernelPtrArrayTmaWarpSpecializedCooperative { };
+struct KernelPtrArrayTmaWarpSpecializedPingpong { };
+
+//////////////////////////////////////////////////////////////////////////////
+
+//
+// Builder dispatch policies (not a part of the main CUTLASS layers, simply used to opt into
+// specific collective builder dispatches)
+//
+
+// FP8 related policies (including Fast Accumulation)
+struct KernelTmaWarpSpecializedFP8FastAccum : KernelTmaWarpSpecialized { };
+struct KernelTmaWarpSpecializedPingpongFP8FastAccum : KernelTmaWarpSpecializedPingpong { };
+struct KernelTmaWarpSpecializedCooperativeFP8FastAccum: KernelTmaWarpSpecializedCooperative { };
+struct KernelPtrArrayTmaWarpSpecializedCooperativeFP8FastAccum : KernelPtrArrayTmaWarpSpecializedCooperative { };
+struct KernelPtrArrayTmaWarpSpecializedPingpongFP8FastAccum : KernelPtrArrayTmaWarpSpecializedPingpong { };
+
+// Policies to opt into mixed type GEMMs
+struct KernelTmaWarpSpecializedMixedInput : KernelTmaWarpSpecialized { };
+struct KernelTmaWarpSpecializedPingpongMixedInput : KernelTmaWarpSpecializedPingpong { };
+struct KernelTmaWarpSpecializedCooperativeMixedInput: KernelTmaWarpSpecializedCooperative { };
+
+//////////////////////////////////////////////////////////////////////////////
+
+// Policies for dispatch of epilogue
+struct EpilogueDefault { };
+struct EpilogueTransposed { };
+
+//////////////////////////////////////////////////////////////////////////////
+
+//
+// Collective Mainloop Policies
+//
+
+// 2 stage pipeline through 1 stage in smem, 1 in rmem, WITHOUT predicated gmem loads
+struct MainloopSm70TwoStageUnpredicated {
+  constexpr static int Stages = 2;
+  using ArchTag = arch::Sm70;
+  using Schedule = KernelMultistage;
+  using ClusterShape = Shape<_1,_1,_1>;
+};
+
+// 2 stage pipeline through 1 stage in smem, 1 in rmem, with predicated gmem loads
+struct MainloopSm70TwoStage {
+  constexpr static int Stages = 2;
+  using ArchTag = arch::Sm70;
+  using Schedule = KernelMultistage;
+  using ClusterShape = Shape<_1,_1,_1>;
+};
+
+// n-buffer in smem (cp.async), pipelined with registers, WITHOUT predicated gmem loads
+template<int Stages_>
+struct MainloopSm80CpAsyncUnpredicated {
+  constexpr static int Stages = Stages_;
+  using ArchTag = arch::Sm80;
+  using Schedule = KernelMultistage;
+  using ClusterShape = Shape<_1,_1,_1>;
+};
+
+// n-buffer in smem (cp.async), pipelined with registers, with predicated gmem loads
+template<
+  int Stages_,
+  class ClusterShape_ = Shape<_1,_1,_1>
+>
+struct MainloopSm80CpAsync {
+  constexpr static int Stages = Stages_;
+  using ArchTag = cute::conditional_t<(size(ClusterShape_{}) > 1), arch::Sm90, arch::Sm80>;
+  using Schedule = KernelMultistage;
+  using ClusterShape = ClusterShape_;
+};
+
+// n-buffer in smem (cp.async), pipelined with Hopper GMMA, with predicated gmem loads, warp specialized dynamic schedule
+template<
+  int Stages_,
+  class ClusterShape_ = Shape<_1,_1,_1>,
+  class KernelSchedule = KernelCpAsyncWarpSpecialized
+>
+struct MainloopSm90CpAsyncGmmaWarpSpecialized {
+  constexpr static int Stages = Stages_;
+  using ClusterShape = ClusterShape_;
+  using ArchTag = arch::Sm90;
+  using Schedule = KernelSchedule;
+};
+
+// Same as above; presumably with GMMA's A data from registers, going by the RmemA name (TODO confirm)
+template<
+  int Stages_,
+  class ClusterShape_ = Shape<_1,_1,_1>,
+  class KernelSchedule = KernelCpAsyncWarpSpecialized
+>
+struct MainloopSm90CpAsyncGmmaRmemAWarpSpecialized {
+  constexpr static int Stages = Stages_;
+  using ClusterShape = ClusterShape_;
+  using ArchTag = arch::Sm90;
+  using Schedule = KernelSchedule;
+};
+
+// n-buffer in smem (Hopper TMA), pipelined with Hopper GMMA and TMA, static schedule between TMA and GMMA
+template<
+  int Stages_,
+  class ClusterShape_ = Shape<_1,_1,_1>,
+  int PipelineAsyncMmaStages_ = 1
+>
+struct MainloopSm90TmaGmma {
+  constexpr static int Stages = Stages_;
+  using ClusterShape = ClusterShape_;
+  constexpr static int PipelineAsyncMmaStages = PipelineAsyncMmaStages_;
+  using ArchTag = arch::Sm90;
+  using Schedule = KernelTma;
+};
+
+// n-buffer in smem (Hopper TMA), pipelined with Hopper GMMA and TMA, Warp specialized dynamic schedule
+template<
+  int Stages_,
+  class ClusterShape_ = Shape<_1,_1,_1>,
+  class KernelSchedule = KernelTmaWarpSpecializedCooperative
+>
+struct MainloopSm90TmaGmmaWarpSpecialized {
+  constexpr static int Stages = Stages_;
+  using ClusterShape = ClusterShape_;
+  using ArchTag = arch::Sm90;
+  using Schedule = KernelSchedule;
+};
+
+// n-buffer in smem (Hopper TMA), pipelined with Hopper GMMA and TMA, Warp specialized dynamic schedule
+// With GMMA's A data from registers.
+template<
+  int Stages_,
+  class ClusterShape_ = Shape<_1,_1,_1>,
+  class KernelSchedule = KernelTmaWarpSpecialized
+>
+struct MainloopSm90TmaGmmaRmemAWarpSpecialized {
+  constexpr static int Stages = Stages_;
+  using ClusterShape = ClusterShape_;
+  using ArchTag = arch::Sm90;
+  using Schedule = KernelSchedule;
+  static_assert(
+    cute::is_same_v<Schedule, KernelTmaWarpSpecialized> ||
+    cute::is_same_v<Schedule, KernelTmaWarpSpecializedPingpong> ||
+    cute::is_same_v<Schedule, KernelTmaWarpSpecializedCooperative>,
+    "KernelSchedule must be one of the warp specialized policies");
+};
+// Mixed-input counterpart of the policy above: also accepts the three *MixedInput kernel schedules.
+template<
+  int Stages_,
+  class ClusterShape_ = Shape<_1,_1,_1>,
+  class KernelSchedule = KernelTmaWarpSpecialized
+>
+struct MainloopSm90TmaGmmaRmemAWarpSpecializedMixedInput {
+  constexpr static int Stages = Stages_;
+  using ClusterShape = ClusterShape_;
+  using ArchTag = arch::Sm90;
+  using Schedule = KernelSchedule;
+  static_assert(
+    cute::is_same_v<Schedule, KernelTmaWarpSpecialized> ||
+    cute::is_same_v<Schedule, KernelTmaWarpSpecializedMixedInput> ||
+    cute::is_same_v<Schedule, KernelTmaWarpSpecializedPingpong> ||
+    cute::is_same_v<Schedule, KernelTmaWarpSpecializedPingpongMixedInput> ||
+    cute::is_same_v<Schedule, KernelTmaWarpSpecializedCooperative> ||
+    cute::is_same_v<Schedule, KernelTmaWarpSpecializedCooperativeMixedInput>,
+    "KernelSchedule must be one of the warp specialized policies");
+};
+
+// n-buffer in smem (Hopper TMA), pipelined with Hopper GMMA and TMA, Warp specialized dynamic schedule
+// For FP8 kernels
+template<
+  int Stages_,
+  class ClusterShape_ = Shape<_1,_1,_1>,
+  class KernelSchedule = KernelTmaWarpSpecialized
+>
+struct MainloopSm90TmaGmmaWarpSpecializedFP8
+  : MainloopSm90TmaGmmaWarpSpecialized<Stages_, ClusterShape_, KernelSchedule> {
+  static_assert(
+    cute::is_same_v<KernelSchedule, KernelTmaWarpSpecialized> ||
+    cute::is_same_v<KernelSchedule, KernelTmaWarpSpecializedPingpong> ||
+    cute::is_same_v<KernelSchedule, KernelTmaWarpSpecializedCooperative>,
+    "KernelSchedule must be one of the warp specialized policies");
+};
+
+// n-buffer in smem (Hopper TMA), pipelined with Hopper GMMA and TMA, Warp specialized dynamic schedule for Ptr-Array and Grouped Gemm
+template<
+  int Stages_,
+  class ClusterShape_ = Shape<_1,_1,_1>,
+  class KernelSchedule = KernelPtrArrayTmaWarpSpecializedCooperative
+>
+struct MainloopSm90ArrayTmaGmmaWarpSpecialized {
+  constexpr static int Stages = Stages_;
+  using ClusterShape = ClusterShape_;
+  using ArchTag = arch::Sm90;
+  using Schedule = KernelSchedule;
+  static_assert(
+    cute::is_base_of_v<KernelPtrArrayTmaWarpSpecializedCooperative, KernelSchedule> ||
+    cute::is_base_of_v<KernelPtrArrayTmaWarpSpecializedPingpong, KernelSchedule>,
+    "KernelSchedule must be one of the Ptr-Array or Grouped Gemm TMA Warp Specialized Cooperative or Pingpong policies");
+};
+
+// n-buffer in smem (Hopper TMA), pipelined with Hopper sparse GMMA and TMA, Warp specialized dynamic schedule
+template<
+  int Stages_,
+  class ClusterShape_ = Shape<_1,_1,_1>,
+  class KernelSchedule = KernelTmaWarpSpecializedCooperative
+>
+struct MainloopSm90TmaGmmaWarpSpecializedSparse {
+  constexpr static int Stages = Stages_;
+  using ClusterShape = ClusterShape_;
+  using ArchTag = arch::Sm90;
+  using Schedule = KernelSchedule;
+};
+
+//////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::gemm
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/gemm.h b/lightllm-kernel/cutlass/include/cutlass/gemm/gemm.h
new file mode 100755
index 000000000..ac288e3e8
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/gemm.h
@@ -0,0 +1,133 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines common types used for all GEMM-like operators.
+*/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/coord.h"
+#include "cutlass/gemm_coord.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/gemm/gemm_enumerated_types.h"
+#include "cute/layout.hpp"
+#include "cutlass/detail/layout.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+using cutlass::detail::TagToStrideA;
+using cutlass::detail::TagToStrideB;
+using cutlass::detail::TagToStrideC;
+using cutlass::detail::TagToStrideA_t;
+using cutlass::detail::TagToStrideB_t;
+using cutlass::detail::TagToStrideC_t;
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+using cutlass::detail::StrideToLayoutTagA;
+using cutlass::detail::StrideToLayoutTagB;
+using cutlass::detail::StrideToLayoutTagC;
+using cutlass::detail::StrideToLayoutTagA_t;
+using cutlass::detail::StrideToLayoutTagB_t;
+using cutlass::detail::StrideToLayoutTagC_t;
+
+// Thin forwarder to ::cutlass::detail::is_major<ModeIndex>; the unnamed argument only carries the stride type.
+template <int ModeIndex, class StrideT>
+constexpr bool is_major(StrideT = {}) {
+  return ::cutlass::detail::is_major<ModeIndex>(StrideT{});
+}
+
+// Majorness query for mode 0 of the given stride type (delegates to is_major<0, ...>).
+template <class StrideT>
+constexpr bool is_mn_major() {
+  return is_major<0, StrideT>();
+}
+
+// Majorness query for mode 1 of the given stride type
+// (delegates to is_major<1, ...>).
+template <class StrideT>
+constexpr bool is_k_major() {
+  return is_major<1, StrideT>();
+}
+
+// Maps the A layout tag to its stride type via TagToStrideA_t, then asks is_mn_major.
+template <class LayoutTagA>
+constexpr bool is_mn_major_A() {
+  return is_mn_major<TagToStrideA_t<LayoutTagA>>();
+}
+
+// Maps the B layout tag to its stride type via TagToStrideB_t, then asks is_mn_major.
+template <class LayoutTagB>
+constexpr bool is_mn_major_B() {
+  return is_mn_major<TagToStrideB_t<LayoutTagB>>();
+}
+
+// Maps the A layout tag to its stride type via TagToStrideA_t, then asks is_k_major.
+template <class LayoutTagA>
+constexpr bool is_k_major_A() {
+  return is_k_major<TagToStrideA_t<LayoutTagA>>();
+}
+
+// Maps the B layout tag to its stride type via TagToStrideB_t, then asks is_k_major.
+template <class LayoutTagB>
+constexpr bool is_k_major_B() {
+  return is_k_major<TagToStrideB_t<LayoutTagB>>();
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Detection trait: reports whether a `kernel::Gemm` or `kernel::GemmUniversal` implements the CUTLASS 3.x API,
+// decided solely by whether the kernel type exposes a nested `ProblemShape` type alias.
+// Primary template: selected when `typename GemmKernel::ProblemShape` is not a valid type.
+template <class GemmKernel, class Enable = void>
+struct IsCutlass3GemmKernel : cute::false_type {};
+// Partial specialization: selected when `typename GemmKernel::ProblemShape` is well-formed.
+template <class GemmKernel>
+struct IsCutlass3GemmKernel<GemmKernel, cute::void_t<typename GemmKernel::ProblemShape>> : cute::true_type {};
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // namespace detail
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/gemm_enumerated_types.h b/lightllm-kernel/cutlass/include/cutlass/gemm/gemm_enumerated_types.h
new file mode 100755
index 000000000..66aae898d
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/gemm_enumerated_types.h
@@ -0,0 +1,80 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines common types used for all GEMM-like operators.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/coord.h"
+#include "cutlass/gemm_coord.h"
+#include "cutlass/layout/matrix.h"
+
+namespace cutlass {
+namespace gemm {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// GEMM operand enumeration: D = A * B + C
+enum class Operand {
+  kA, ///< A multiplicand
+  kB, ///< B multiplicand
+  kC, ///< Source accumulator
+  kD  ///< Destination accumulator
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+enum class GemmUniversalMode {  // Mode selector for universal GEMM; per-mode semantics are defined by the kernels consuming it (not visible here)
+  kGemm,
+  kGemmSplitKParallel,
+  kBatched,
+  kArray,
+  kGrouped,
+  kInvalid  // presumably a sentinel for "no valid mode" -- TODO confirm against callers
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Options for how shared memory (SMEM) contents are cleared
+enum class SharedMemoryClearOption {
+  kNone,            ///< SMEM is in don't-care state
+  kZfill,           ///< Kernels fill out-of-bounds accesses with zeros
+  kClearLastStage   ///< Last SMEM stage is explicitly cleared. Mainloop uses 'kNone'
+};
+
+/////////////////////////////////////////////////////////////////////////
+
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/group_array_problem_shape.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/group_array_problem_shape.hpp
new file mode 100755
index 000000000..4a90a1d06
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/group_array_problem_shape.hpp
@@ -0,0 +1,123 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief This file contains definitions and utility functions for describing problem shapes 
+           for 3.x Ptr-Array GEMMs and Grouped GEMMs.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/tensor_coord.h"
+
+#include "cute/container/array.hpp"
+
+#if ! defined(__CUDACC_RTC__)
+#include <initializer_list>
+#endif
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <class ProblemShape_>
+struct GroupProblemShape {
+  using UnderlyingProblemShape = ProblemShape_;
+  int32_t num_groups = 1;                                       // group count reported by groups()
+  UnderlyingProblemShape* problem_shapes = nullptr;             // per-group shapes; presumably device-visible -- TODO confirm
+  UnderlyingProblemShape const* host_problem_shapes = nullptr;  // optional host-side shapes; nullptr means "not provided"
+
+  CUTLASS_HOST_DEVICE int32_t groups() const { return num_groups; }
+
+  // Returns problem_shapes[group_idx] by value; no null or bounds checking is performed.
+  CUTLASS_HOST_DEVICE
+  UnderlyingProblemShape const
+  get_problem_shape(int32_t group_idx) const {
+    return problem_shapes[group_idx];
+  }
+
+  // Returns host_problem_shapes[group_idx] by value; dereferences the pointer unconditionally,
+  // so check is_host_problem_shape_available() first.
+  CUTLASS_HOST_DEVICE
+  UnderlyingProblemShape const
+  get_host_problem_shape(int32_t group_idx) const {
+    return host_problem_shapes[group_idx];
+  }
+
+  // True when host_problem_shapes is non-null. const-qualified (read-only) so it also works on const objects.
+  CUTLASS_HOST_DEVICE
+  bool is_host_problem_shape_available() const { return host_problem_shapes != nullptr; }
+};
+
+// Problem-shape wrapper for Ptr-Array GEMM: stores exactly one shape by value and offers GroupProblemShape-like accessors.
+template <class ProblemShape_>
+class ArrayProblemShape {
+public:
+  using UnderlyingProblemShape = ProblemShape_;
+
+  ArrayProblemShape() = default;
+  ArrayProblemShape(UnderlyingProblemShape ps) : problem_shape_(ps) {}
+
+  // Num of groups for Ptr-Array GEMM always remain one, just the number of batches (l) can vary
+  // This is just to maintain uniformity with GroupProblemShape
+  constexpr int32_t groups() const { return 1; }
+
+  // Both pointer accessors alias the single stored shape. They return pointer-to-const because, inside a
+  // const member, &problem_shape_ is a pointer-to-const and cannot convert to a pointer-to-mutable.
+  UnderlyingProblemShape const* problem_shapes() const { return &problem_shape_; }
+  UnderlyingProblemShape const* host_problem_shapes() const { return &problem_shape_; }
+
+  // This is just to maintain uniformity with GroupProblemShape
+  CUTLASS_HOST_DEVICE
+  UnderlyingProblemShape const
+  get_problem_shape(int32_t /* unused */ = 0) const {
+    return problem_shape_;
+  }
+
+  CUTLASS_HOST_DEVICE
+  UnderlyingProblemShape const
+  get_host_problem_shape(int32_t /* unused */ = 0) const {
+    return problem_shape_;
+  }
+
+  // Always true: the shape is held by value in this object. const-qualified so it also works on const objects.
+  CUTLASS_HOST_DEVICE
+  bool is_host_problem_shape_available() const {
+    return true;
+  }
+
+private:
+  UnderlyingProblemShape problem_shape_{};
+};
+
+} // namespace cutlass::gemm 
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_ell_gemm.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_ell_gemm.h
new file mode 100755
index 000000000..49f9eef33
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_ell_gemm.h
@@ -0,0 +1,837 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Default kernel-level Blocked-Ell sparse gemm operators.
+      This operator combines threadblock-scoped ELL MMA
+      with the appropriate threadblock-scoped epilogue.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/wmma.h"
+
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/kernel/gemm.h"
+#include "cutlass/gemm/kernel/gemm_pipelined.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+#include "cutlass/gemm/threadblock/default_mma.h"
+#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+#include "cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h"
+#endif //CUTLASS_ARCH_WMMA_ENABLED
+
+#include "cutlass/gemm/kernel/ell_gemm.h"
+#include "cutlass/gemm/threadblock/default_ell_mma.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Sparse matrix is A or not
+    bool IsASparse>
+struct DefaultEllGemm;  // primary template: declaration only; usable configurations are the partial specializations in this file
+
+////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Ampere Architecture (arch::Sm80, arch::OpClassTensorOp, row-major C/D)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Sparse matrix is A or not
+    bool IsASparse
+>
+struct DefaultEllGemm<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, ElementC,
+                   layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
+                   arch::Sm80, ThreadblockShape, WarpShape, InstructionShape,
+                   EpilogueOutputOp, ThreadblockSwizzle, Stages, SplitKSerial,
+                   Operator, IsASparse> {
+  /// Define the threadblock-scoped matrix multiply-accumulate
+  using Mma = typename cutlass::gemm::threadblock::DefaultEllMma<
+      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80,
+      ThreadblockShape, WarpShape, InstructionShape, Stages,
+      Operator>::ThreadblockMma;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;  // warp-tile count along K within one threadblock tile
+
+  /// Define the epilogue
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
+          ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
+          EpilogueOutputOp::kCount>::Epilogue;
+
+  /// Define the kernel-level GEMM operator.
+  using GemmKernel = kernel::EllGemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial, IsASparse>;
+};
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Turing Architecture (arch::Sm75, arch::OpClassTensorOp, row-major C/D, stage count fixed to 2)
+template <
+  /// Element type for A matrix operand
+  typename ElementA,
+  /// Layout type for A matrix operand
+  typename LayoutA,
+  /// Access granularity of A matrix in units of elements
+  int kAlignmentA,
+  /// Element type for B matrix operand
+  typename ElementB,
+  /// Layout type for B matrix operand
+  typename LayoutB,
+  /// Access granularity of B matrix in units of elements
+  int kAlignmentB,
+  /// Element type for C and D matrix operands
+  typename ElementC,
+  /// Element type for internal accumulation
+  typename ElementAccumulator,
+  /// Threadblock-level tile size (concept: GemmShape)
+  typename ThreadblockShape,
+  /// Warp-level tile size (concept: GemmShape)
+  typename WarpShape,
+  /// Instruction-level tile size (concept: GemmShape)
+  typename InstructionShape,
+  /// Epilogue output operator
+  typename EpilogueOutputOp,
+  /// Threadblock-level swizzling operator
+  typename ThreadblockSwizzle,
+  /// If true, kernel is configured to support serial reduction in the epilogue
+  bool SplitKSerial,
+  /// Operation performed by GEMM
+  typename Operator,
+  /// Sparse matrix is A or not
+  bool IsASparse
+>
+struct DefaultEllGemm<
+  ElementA, LayoutA, kAlignmentA,
+  ElementB, LayoutB, kAlignmentB,
+  ElementC, layout::RowMajor,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  arch::Sm75,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  SplitKSerial,
+  Operator,
+  IsASparse
+> {
+
+  /// Define the threadblock-scoped matrix multiply-accumulate
+  using Mma = typename cutlass::gemm::threadblock::DefaultEllMma<
+    ElementA,
+    LayoutA,
+    kAlignmentA,
+    ElementB,
+    LayoutB,
+    kAlignmentB,
+    ElementAccumulator,
+    layout::RowMajor,
+    arch::OpClassTensorOp,
+    arch::Sm75,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    2,
+    Operator
+  >::ThreadblockMma;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;  // warp-tile count along K within one threadblock tile
+
+  /// Define the epilogue
+  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
+    ThreadblockShape,
+    typename Mma::Operator,
+    kPartitionsK,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  /// Define the kernel-level GEMM operator.
+  using GemmKernel = kernel::EllGemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial, IsASparse>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Ampere Integer Matrix Multiply Interleaved layout (arch::Sm80, int32_t accumulation)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Number of Interleaved k
+    int InterleavedK,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Sparse matrix is A or not
+    bool IsASparse>
+struct DefaultEllGemm<
+    ElementA, layout::ColumnMajorInterleaved<InterleavedK>, kAlignmentA,
+    ElementB, layout::RowMajorInterleaved<InterleavedK>, kAlignmentB, ElementC,
+    layout::ColumnMajorInterleaved<InterleavedK>, int32_t,
+    arch::OpClassTensorOp, arch::Sm80, ThreadblockShape, WarpShape,
+    InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages,
+    SplitKSerial, Operator, IsASparse> {
+  using LayoutA = layout::ColumnMajorInterleaved<InterleavedK>;
+  using LayoutB = layout::RowMajorInterleaved<InterleavedK>;
+  using LayoutC = layout::ColumnMajorInterleaved<InterleavedK>;
+
+  using ElementAccumulator = int32_t;
+
+  /// Define the threadblock-scoped matrix multiply-accumulate
+  using Mma = typename cutlass::gemm::threadblock::DefaultEllMma<
+      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
+      ElementAccumulator, LayoutC, arch::OpClassTensorOp, arch::Sm80,
+      ThreadblockShape, WarpShape, InstructionShape, Stages, Operator,
+      true>::ThreadblockMma;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;  // warp-tile count along K within one threadblock tile
+
+  /// Define the epilogue
+  using Epilogue = typename cutlass::epilogue::threadblock::
+      DefaultInterleavedEpilogueTensorOp<
+          ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
+          64 / sizeof_bits<ElementC>::value, InterleavedK>::Epilogue;
+
+  /// Define the kernel-level GEMM operator.
+  using GemmKernel = kernel::EllGemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial, IsASparse>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Turing Integer Matrix Multiply Interleaved layout (arch::Sm75, int32_t accumulation, stage count fixed to 2)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of Interleaved k
+    int InterleavedK,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Sparse matrix is A or not
+    bool IsASparse>
+struct DefaultEllGemm<ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
+                   kAlignmentA, ElementB,
+                   layout::RowMajorInterleaved<InterleavedK>, kAlignmentB,
+                   ElementC, layout::ColumnMajorInterleaved<InterleavedK>,
+                   int32_t, arch::OpClassTensorOp, arch::Sm75, ThreadblockShape,
+                   WarpShape, InstructionShape, EpilogueOutputOp,
+                   ThreadblockSwizzle, 2, SplitKSerial, Operator, IsASparse> {
+  using LayoutA = layout::ColumnMajorInterleaved<InterleavedK>;
+  using LayoutB = layout::RowMajorInterleaved<InterleavedK>;
+  using LayoutC = layout::ColumnMajorInterleaved<InterleavedK>;
+
+  using ElementAccumulator = int32_t;
+
+  /// Define the threadblock-scoped matrix multiply-accumulate
+  using Mma = typename cutlass::gemm::threadblock::DefaultEllMma<
+      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, ElementAccumulator, LayoutC,
+      arch::OpClassTensorOp, arch::Sm75, ThreadblockShape, WarpShape,
+      InstructionShape, 2, Operator, true>::ThreadblockMma;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;  // warp-tile count along K within one threadblock tile
+
+  /// Define the epilogue
+  using Epilogue = typename cutlass::epilogue::threadblock::
+      DefaultInterleavedEpilogueTensorOp<
+          ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
+          64 / sizeof_bits<ElementC>::value, InterleavedK>::Epilogue;
+
+  /// Define the kernel-level GEMM operator.
+  using GemmKernel = kernel::EllGemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial, IsASparse>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+
+/// Partial specialization for Volta architecture (arch::Sm70, arch::OpClassTensorOp, row-major C/D, instruction shape fixed to GemmShape<8, 8, 4>, stage count fixed to 2)
+template <
+  /// Element type for A matrix operand
+  typename ElementA,
+  /// Layout type for A matrix operand
+  typename LayoutA,
+  /// Access granularity of A matrix in units of elements
+  int kAlignmentA,
+  /// Element type for B matrix operand
+  typename ElementB,
+  /// Layout type for B matrix operand
+  typename LayoutB,
+  /// Access granularity of B matrix in units of elements
+  int kAlignmentB,
+  /// Element type for C and D matrix operands
+  typename ElementC,
+  /// Element type for internal accumulation
+  typename ElementAccumulator,
+  /// Threadblock-level tile size (concept: GemmShape)
+  typename ThreadblockShape,
+  /// Warp-level tile size (concept: GemmShape)
+  typename WarpShape,
+  /// Epilogue output operator
+  typename EpilogueOutputOp,
+  /// Threadblock-level swizzling operator
+  typename ThreadblockSwizzle,
+  /// If true, kernel is configured to support serial reduction in the epilogue
+  bool SplitKSerial,
+  /// Operation performed by GEMM
+  typename Operator,
+  /// Sparse matrix is A or not
+  bool IsASparse
+>
+struct DefaultEllGemm<
+  ElementA, LayoutA, kAlignmentA,
+  ElementB, LayoutB, kAlignmentB,
+  ElementC, layout::RowMajor,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  arch::Sm70,
+  ThreadblockShape,
+  WarpShape,
+  GemmShape<8, 8, 4>,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  SplitKSerial,
+  Operator,
+  IsASparse
+> {
+
+  /// Define the threadblock-scoped matrix multiply-accumulate
+  using Mma = typename cutlass::gemm::threadblock::DefaultEllMma<
+    ElementA,
+    LayoutA,
+    kAlignmentA,
+    ElementB,
+    LayoutB,
+    kAlignmentB,
+    ElementAccumulator,
+    layout::RowMajor,
+    arch::OpClassTensorOp,
+    arch::Sm70,
+    ThreadblockShape,
+    WarpShape,
+    GemmShape<8, 8, 4>,
+    2,
+    Operator
+  >::ThreadblockMma;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;  // warp-tile count along K within one threadblock tile
+
+  /// Define the epilogue
+  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueVoltaTensorOp<
+    ThreadblockShape,
+    typename Mma::Operator,
+    kPartitionsK,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  /// Define the kernel-level GEMM operator.
+  using GemmKernel = kernel::EllGemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial, IsASparse>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for SIMT (arch::OpClassSimt, GemmShape<1, 1, 1>, row-major C, 2 stages)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Tag indicating architecture to tune for (any tag matches this specialization)
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// If true, kernel is configured to support serial reduction in the epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// True if the sparse matrix is operand A
+    bool IsASparse
+  >
+struct DefaultEllGemm<
+    ElementA,
+    LayoutA,
+    kAlignmentA,
+    ElementB,
+    LayoutB,
+    kAlignmentB,
+    ElementC,
+    layout::RowMajor,
+    ElementAccumulator,
+    arch::OpClassSimt,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    GemmShape<1, 1, 1>,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    2,
+    SplitKSerial,
+    Operator,
+    IsASparse> {
+  /// Define the threadblock-scoped matrix multiply-accumulate (always built for arch::Sm50, not ArchTag)
+  using Mma = typename cutlass::gemm::threadblock::DefaultEllMma<
+      ElementA,
+      LayoutA,
+      kAlignmentA,
+      ElementB,
+      LayoutB,
+      kAlignmentB,
+      ElementAccumulator,
+      layout::RowMajor,
+      arch::OpClassSimt,
+      arch::Sm50,
+      ThreadblockShape,
+      WarpShape,
+      GemmShape<1, 1, 1>,
+      2,
+      Operator>::ThreadblockMma;
+
+  static int const kEpilogueElementsPerAccess = EpilogueOutputOp::kCount;
+  static_assert(kEpilogueElementsPerAccess == 1, "simt epilogue must operate on scalars");
+
+  /// Define the epilogue (SIMT epilogue; one element per access, enforced by the static_assert above)
+  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueSimt<
+      ThreadblockShape,
+      typename Mma::Operator,
+      EpilogueOutputOp,
+      kEpilogueElementsPerAccess
+      >::Epilogue;
+
+  /// Define the kernel-level GEMM operator.
+  using GemmKernel = kernel::EllGemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial, IsASparse>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for SIMT on Ampere (arch::OpClassSimt, arch::Sm80, row-major C, variable stage count)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages (forwarded unchanged to DefaultEllMma)
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator, 
+    /// True if the sparse matrix is operand A
+    bool IsASparse
+    >
+struct DefaultEllGemm<ElementA,
+                   LayoutA,
+                   kAlignmentA,
+                   ElementB,
+                   LayoutB,
+                   kAlignmentB,
+                   ElementC,
+                   layout::RowMajor,
+                   ElementAccumulator,
+                   arch::OpClassSimt,
+                   arch::Sm80,
+                   ThreadblockShape,
+                   WarpShape,
+                   GemmShape<1, 1, 1>,
+                   EpilogueOutputOp,
+                   ThreadblockSwizzle,
+                   Stages,
+                   SplitKSerial,
+                   Operator,
+                   IsASparse> {
+
+  /// Define the threadblock-scoped matrix multiply-accumulate (built for arch::Sm80 with the caller's Stages)
+  using Mma = typename cutlass::gemm::threadblock::DefaultEllMma<
+      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
+      ElementAccumulator, layout::RowMajor, arch::OpClassSimt, arch::Sm80,
+      ThreadblockShape, WarpShape, GemmShape<1, 1, 1>, Stages,
+      Operator>::ThreadblockMma;
+
+  static int const kEpilogueElementsPerAccess = EpilogueOutputOp::kCount;
+  static_assert(kEpilogueElementsPerAccess == 1, "simt epilogue must operate on scalars");
+
+  /// Define the epilogue (SIMT epilogue; one element per access, enforced by the static_assert above)
+  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueSimt<
+      ThreadblockShape,
+      typename Mma::Operator,
+      EpilogueOutputOp,
+      kEpilogueElementsPerAccess
+      >::Epilogue;
+
+  /// Define the kernel-level GEMM operator.
+  using GemmKernel = kernel::EllGemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial,IsASparse>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+/// Partial specialization for SIMT DP4A (int8_t A and B, arch::OpClassSimt, GemmShape<1, 1, 4>, 2 stages)
+
+template <
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Layout type for C matrix operand
+    typename LayoutC,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Tag indicating architecture to tune for (any tag matches this specialization)
+    typename ArchTag,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// True if the sparse matrix is operand A
+    bool IsASparse
+    >
+struct DefaultEllGemm<int8_t, LayoutA, kAlignmentA, int8_t, LayoutB, kAlignmentB,
+                   ElementC, LayoutC, ElementAccumulator, arch::OpClassSimt,
+                   ArchTag, ThreadblockShape, WarpShape, GemmShape<1, 1, 4>,
+                   EpilogueOutputOp, ThreadblockSwizzle, 2, SplitKSerial,
+                   Operator, IsASparse> {
+  using InstructionShape = GemmShape<1, 1, 4>;
+  using ElementA = int8_t;
+  using ElementB = int8_t;
+
+  using OperatorClass =  arch::OpClassSimt;
+  /// Define the threadblock-scoped matrix multiply-accumulate (always built for arch::Sm50, not ArchTag)
+  using Mma = typename cutlass::gemm::threadblock::DefaultEllMma<ElementA,
+      LayoutA,
+      kAlignmentA,
+      ElementB,
+      LayoutB,
+      kAlignmentB,
+      ElementAccumulator,
+      LayoutC,
+      arch::OpClassSimt,
+      arch::Sm50,
+      ThreadblockShape,
+      WarpShape,
+      InstructionShape,
+      2,
+      Operator
+      >::ThreadblockMma;
+
+  static int const kEpilogueElementsPerAccess = EpilogueOutputOp::kCount;
+  static_assert(kEpilogueElementsPerAccess == 1, "simt epilogue must operate on scalars");
+
+  /// Define the epilogue (SIMT epilogue; one element per access, enforced by the static_assert above)
+  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueSimt<
+      ThreadblockShape,
+      typename Mma::Operator,
+      EpilogueOutputOp,
+      kEpilogueElementsPerAccess
+      >::Epilogue;
+
+  /// Define the kernel-level GEMM operator.
+  using GemmKernel = kernel::EllGemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial, IsASparse>;
+};
+
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+////////////////////////////////////////////////////////////////////////////////
+/// Partial specialization for Wmma Gemm Kernel (arch::OpClassWmmaTensorOp; only compiled when WMMA is enabled)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Layout type for C and D matrix operands
+    typename LayoutC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// True if the sparse matrix is operand A
+    bool IsASparse
+    > 
+struct DefaultEllGemm<
+  ElementA, LayoutA, kAlignmentA, 
+  ElementB, LayoutB, kAlignmentB, 
+  ElementC, LayoutC, 
+  ElementAccumulator, 
+  arch::OpClassWmmaTensorOp,
+  ArchTag, 
+  ThreadblockShape, WarpShape, InstructionShape,
+  EpilogueOutputOp, 
+  ThreadblockSwizzle, 
+  Stages, 
+  SplitKSerial,
+  Operator,
+  IsASparse> {
+  /// Define the threadblock-scoped matrix multiply-accumulate (ArchTag and Stages are forwarded as given)
+  using Mma = typename cutlass::gemm::threadblock::DefaultEllMma<
+      ElementA, LayoutA, kAlignmentA,
+      ElementB, LayoutB, kAlignmentB,
+      ElementAccumulator, LayoutC, 
+      arch::OpClassWmmaTensorOp, 
+      ArchTag,
+      ThreadblockShape, 
+      WarpShape, 
+      InstructionShape, 
+      Stages,
+      Operator>::ThreadblockMma;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  /// Define the epilogue (kPartitionsK is the threadblock-to-warp ratio along K computed above)
+  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWmmaTensorOp<
+      ThreadblockShape,
+      typename Mma::Operator, 
+      kPartitionsK, 
+      EpilogueOutputOp,
+      EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  /// Define the kernel-level GEMM operator.
+  using GemmKernel = kernel::EllGemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial, IsASparse>;
+};
+////////////////////////////////////////////////////////////////////////////////
+#endif //CUTLASS_ARCH_WMMA_ENABLED
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm.h
new file mode 100755
index 000000000..4678df4af
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm.h
@@ -0,0 +1,1189 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+      Default kernel-level GEMM definitions combine threadblock-scoped matrix multiply-add with
+      the appropriate threadblock-scoped epilogue.
+  
+      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
+      accommodated by exchanging A and B operands and assuming transposed layouts. Partial
+      specializations here choose 'device::GemmTransposed' to implement this functionality.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/wmma.h"
+
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/kernel/gemm.h"
+#include "cutlass/gemm/kernel/gemm_pipelined.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+#include "cutlass/gemm/threadblock/default_mma.h"
+#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+
+#include "cutlass/layout/permute.h"
+
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+#include "cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h"
+#endif //CUTLASS_ARCH_WMMA_ENABLED
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Use zfill or predicate for out-of-bound cp.async
+    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone,
+    /// Gather operand A by using an index array
+    bool GatherA = false,
+    /// Gather operand B by using an index array
+    bool GatherB = false,
+    /// Scatter result D by using an index array
+    bool ScatterD = false,
+    /// Permute result D
+    typename PermuteDLayout = layout::NoPermute,
+    /// Permute operand A
+    typename PermuteALayout = layout::NoPermute,
+    /// Permute operand B
+    typename PermuteBLayout = layout::NoPermute,
+    /// Trailing type parameter defaulting to void (presumably an enable_if-style hook for specializations)
+    typename Enable = void
+>
+struct DefaultGemm;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Hopper Architecture (arch::Sm90, arch::OpClassTensorOp, row-major C)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Use zfill or predicate for out-of-bound cp.async
+    SharedMemoryClearOption SharedMemoryClear,
+    /// Gather operand A by using an index array
+    bool GatherA,
+    /// Gather operand B by using an index array
+    bool GatherB,
+    /// Scatter result D by using an index array
+    bool ScatterD,
+    /// Permute result D
+    typename PermuteDLayout,
+    /// Permute operand A
+    typename PermuteALayout,
+    /// Permute operand B
+    typename PermuteBLayout
+>
+struct DefaultGemm<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, ElementC,
+                   layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
+                   arch::Sm90, ThreadblockShape, WarpShape, InstructionShape,
+                   EpilogueOutputOp, ThreadblockSwizzle, Stages, SplitKSerial,
+                   Operator, SharedMemoryClear, GatherA, GatherB, ScatterD,
+                   PermuteDLayout, PermuteALayout, PermuteBLayout> {
+  /// Define the threadblock-scoped matrix multiply-accumulate (gather and A/B permute options are forwarded)
+  using Mma = typename cutlass::gemm::threadblock::DefaultMma<
+      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm90,
+      ThreadblockShape, WarpShape, InstructionShape, Stages,
+      Operator, false, SharedMemoryClear, GatherA, GatherB, 
+      PermuteALayout, PermuteBLayout>::ThreadblockMma;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  /// Define the epilogue (ScatterD and PermuteDLayout are applied here, on the output side)
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
+          ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
+          EpilogueOutputOp::kCount, ScatterD, PermuteDLayout>::Epilogue;
+
+  /// Define the kernel-level GEMM operator.
+  using GemmKernel = kernel::Gemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Ada Architecture (arch::Sm89, arch::OpClassTensorOp, row-major C)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Use zfill or predicate for out-of-bound cp.async
+    SharedMemoryClearOption SharedMemoryClear,
+    /// Gather operand A by using an index array
+    bool GatherA,
+    /// Gather operand B by using an index array
+    bool GatherB,
+    /// Scatter result D by using an index array
+    bool ScatterD,
+    /// Permute result D
+    typename PermuteDLayout,
+    /// Permute operand A
+    typename PermuteALayout,
+    /// Permute operand B
+    typename PermuteBLayout
+>
+struct DefaultGemm<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, ElementC,
+                   layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
+                   arch::Sm89, ThreadblockShape, WarpShape, InstructionShape,
+                   EpilogueOutputOp, ThreadblockSwizzle, Stages, SplitKSerial,
+                   Operator, SharedMemoryClear, GatherA, GatherB, ScatterD, 
+                   PermuteDLayout, PermuteALayout, PermuteBLayout> {
+  /// Define the threadblock-scoped matrix multiply-accumulate (gather and A/B permute options are forwarded)
+  using Mma = typename cutlass::gemm::threadblock::DefaultMma<
+      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm89,
+      ThreadblockShape, WarpShape, InstructionShape, Stages,
+      Operator, false, SharedMemoryClear, GatherA, GatherB,
+      PermuteALayout, PermuteBLayout>::ThreadblockMma;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  /// Define the epilogue (ScatterD and PermuteDLayout are applied here, on the output side)
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
+          ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
+          EpilogueOutputOp::kCount, ScatterD, PermuteDLayout>::Epilogue;
+
+  /// Define the kernel-level GEMM operator.
+  using GemmKernel = kernel::Gemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Ampere Architecture (arch::Sm80, arch::OpClassTensorOp; C is RowMajor or AffineRankN<2>)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Layout type for C and D matrix operand
+    typename LayoutC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Use zfill or predicate for out-of-bound cp.async
+    SharedMemoryClearOption SharedMemoryClear,
+    /// Gather operand A by using an index array
+    bool GatherA,
+    /// Gather operand B by using an index array
+    bool GatherB,
+    /// Scatter result D by using an index array
+    bool ScatterD,
+    /// Permute result D
+    typename PermuteDLayout,
+    /// Permute operand A
+    typename PermuteALayout,
+    /// Permute operand B
+    typename PermuteBLayout
+>
+struct DefaultGemm<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, ElementC,
+                   LayoutC, ElementAccumulator, arch::OpClassTensorOp,
+                   arch::Sm80, ThreadblockShape, WarpShape, InstructionShape,
+                   EpilogueOutputOp, ThreadblockSwizzle, Stages, SplitKSerial,
+                   Operator, SharedMemoryClear, GatherA, GatherB, ScatterD,
+                   PermuteDLayout, PermuteALayout, PermuteBLayout> {
+
+  static_assert((platform::is_same<LayoutC, layout::RowMajor>::value
+             || platform::is_same<LayoutC, layout::AffineRankN<2>>::value),
+             "Epilogue in the kernel level must be row major");
+
+  /// Define the threadblock-scoped matrix multiply-accumulate (gather and A/B permute options are forwarded)
+  using Mma = typename cutlass::gemm::threadblock::DefaultMma<
+      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
+      ElementAccumulator, LayoutC, arch::OpClassTensorOp, arch::Sm80,
+      ThreadblockShape, WarpShape, InstructionShape, Stages,
+      Operator, false, SharedMemoryClear, GatherA, GatherB,
+      PermuteALayout, PermuteBLayout>::ThreadblockMma;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  /// Define both candidate epilogues; `Epilogue` below picks RegularEpilogue for RowMajor C, else Affine2Epilogue
+  using RegularEpilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
+          ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
+          EpilogueOutputOp::kCount, ScatterD, PermuteDLayout>::Epilogue;
+
+  using Affine2Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOpAffineRankN<
+          2, ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
+          EpilogueOutputOp::kCount>::Epilogue;
+
+  using Epilogue = typename platform::conditional<platform::is_same<LayoutC, layout::RowMajor>::value,
+                                                  RegularEpilogue,
+                                                  Affine2Epilogue>::type;
+
+  /// Define the kernel-level GEMM operator.
+  using GemmKernel = kernel::Gemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Turing Architecture (arch::Sm75, arch::OpClassTensorOp, row-major C, Stages fixed at 2)
+template <
+  /// Element type for A matrix operand
+  typename ElementA,
+  /// Layout type for A matrix operand
+  typename LayoutA,
+  /// Access granularity of A matrix in units of elements
+  int kAlignmentA,
+  /// Element type for B matrix operand
+  typename ElementB,
+  /// Layout type for B matrix operand
+  typename LayoutB,
+  /// Access granularity of B matrix in units of elements
+  int kAlignmentB,
+  /// Element type for C and D matrix operands
+  typename ElementC,
+  /// Element type for internal accumulation
+  typename ElementAccumulator,
+  /// Threadblock-level tile size (concept: GemmShape)
+  typename ThreadblockShape,
+  /// Warp-level tile size (concept: GemmShape)
+  typename WarpShape,
+  /// Instruction-level tile size (concept: GemmShape)
+  typename InstructionShape,
+  /// Epilogue output operator
+  typename EpilogueOutputOp,
+  /// Threadblock-level swizzling operator
+  typename ThreadblockSwizzle,
+  /// If true, kernel is configured to support serial reduction in the epilogue
+  bool SplitKSerial,
+  /// Operation performed by GEMM
+  typename Operator,
+  /// Use zfill or predicate for out-of-bound cp.async
+  SharedMemoryClearOption SharedMemoryClear,
+  /// Gather operand A by using an index array
+  bool GatherA,
+  /// Gather operand B by using an index array
+  bool GatherB,
+  /// Scatter result D by using an index array
+  bool ScatterD,
+  /// Permute result D
+  typename PermuteDLayout,
+  /// Permute operand A
+  typename PermuteALayout,
+  /// Permute operand B
+  typename PermuteBLayout
+>
+struct DefaultGemm<
+  ElementA, LayoutA, kAlignmentA,
+  ElementB, LayoutB, kAlignmentB,
+  ElementC, layout::RowMajor,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  arch::Sm75,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  SplitKSerial,
+  Operator,
+  SharedMemoryClear,
+  GatherA,
+  GatherB,
+  ScatterD,
+  PermuteDLayout,
+  PermuteALayout,
+  PermuteBLayout
+> {
+
+  /// Define the threadblock-scoped matrix multiply-accumulate (two stages; gather and A/B permute options forwarded)
+  using Mma = typename cutlass::gemm::threadblock::DefaultMma<
+    ElementA,
+    LayoutA,
+    kAlignmentA,
+    ElementB,
+    LayoutB,
+    kAlignmentB,
+    ElementAccumulator,
+    layout::RowMajor,
+    arch::OpClassTensorOp,
+    arch::Sm75,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    2,
+    Operator,
+    false,
+    SharedMemoryClear,
+    GatherA,
+    GatherB,
+    PermuteALayout,
+    PermuteBLayout
+  >::ThreadblockMma;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  /// Define the epilogue (ScatterD and PermuteDLayout are applied here, on the output side)
+  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
+    ThreadblockShape,
+    typename Mma::Operator,
+    kPartitionsK,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount,
+    ScatterD,
+    PermuteDLayout
+  >::Epilogue;
+
+  /// Define the kernel-level GEMM operator.
+  using GemmKernel = kernel::Gemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Ampere (arch::Sm80) integer TensorOp GEMM on K-interleaved layouts
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Interleave factor shared by the interleaved A, B and C layouts
+    int InterleavedK,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Use zfill or predicate for out-of-bound cp.async
+    SharedMemoryClearOption SharedMemoryClear>
+struct DefaultGemm<
+    ElementA, layout::ColumnMajorInterleaved<InterleavedK>, kAlignmentA,
+    ElementB, layout::RowMajorInterleaved<InterleavedK>, kAlignmentB, ElementC,
+    layout::ColumnMajorInterleaved<InterleavedK>, int32_t,
+    arch::OpClassTensorOp, arch::Sm80, ThreadblockShape, WarpShape,
+    InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages,
+    SplitKSerial, Operator, SharedMemoryClear, false, false, false> {
+
+  using LayoutA = layout::ColumnMajorInterleaved<InterleavedK>;
+  using LayoutB = layout::RowMajorInterleaved<InterleavedK>;
+  using LayoutC = layout::ColumnMajorInterleaved<InterleavedK>;
+
+  using ElementAccumulator = int32_t;
+
+  /// Threadblock-scoped MMA. NOTE(review): the `true` after Operator is presumably DefaultMma's AccumulatorsInRowMajor flag - confirm
+  using Mma = typename cutlass::gemm::threadblock::DefaultMma<
+      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
+      ElementAccumulator, LayoutC, arch::OpClassTensorOp, arch::Sm80,
+      ThreadblockShape, WarpShape, InstructionShape, Stages, Operator,
+      true, SharedMemoryClear>::ThreadblockMma;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  /// Define the interleaved epilogue; its fifth argument is 64 / (bit width of ElementC)
+  using Epilogue = typename cutlass::epilogue::threadblock::
+      DefaultInterleavedEpilogueTensorOp<
+          ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
+          64 / sizeof_bits<ElementC>::value, InterleavedK>::Epilogue;
+
+  /// Define the kernel-level GEMM operator.
+  using GemmKernel = kernel::Gemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Turing (arch::Sm75) integer TensorOp GEMM on K-interleaved layouts, 2 stages
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Interleave factor shared by the interleaved A, B and C layouts
+    int InterleavedK,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Shared-memory clear option; matched by this specialization but not forwarded to DefaultMma below
+    SharedMemoryClearOption SharedMemoryClear>
+struct DefaultGemm<ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
+                   kAlignmentA, ElementB,
+                   layout::RowMajorInterleaved<InterleavedK>, kAlignmentB,
+                   ElementC, layout::ColumnMajorInterleaved<InterleavedK>,
+                   int32_t, arch::OpClassTensorOp, arch::Sm75, ThreadblockShape,
+                   WarpShape, InstructionShape, EpilogueOutputOp,
+                   ThreadblockSwizzle, 2, SplitKSerial, Operator, SharedMemoryClear,
+                   false, false, false> {
+
+  using LayoutA = layout::ColumnMajorInterleaved<InterleavedK>;
+  using LayoutB = layout::RowMajorInterleaved<InterleavedK>;
+  using LayoutC = layout::ColumnMajorInterleaved<InterleavedK>;
+
+  using ElementAccumulator = int32_t;
+
+  /// Threadblock-scoped MMA. NOTE(review): the trailing `true` is presumably DefaultMma's AccumulatorsInRowMajor flag - confirm
+  using Mma = typename cutlass::gemm::threadblock::DefaultMma<
+      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, ElementAccumulator, LayoutC,
+      arch::OpClassTensorOp, arch::Sm75, ThreadblockShape, WarpShape,
+      InstructionShape, 2, Operator, true>::ThreadblockMma;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  /// Define the interleaved epilogue; its fifth argument is 64 / (bit width of ElementC)
+  using Epilogue = typename cutlass::epilogue::threadblock::
+      DefaultInterleavedEpilogueTensorOp<
+          ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
+          64 / sizeof_bits<ElementC>::value, InterleavedK>::Epilogue;
+
+  /// Define the kernel-level GEMM operator.
+  using GemmKernel = kernel::Gemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Volta (arch::Sm70) TensorOp: GemmShape<8, 8, 4> instruction, 2 stages, row-major C
+template <
+  /// Element type for A matrix operand
+  typename ElementA,
+  /// Layout type for A matrix operand
+  typename LayoutA,
+  /// Access granularity of A matrix in units of elements
+  int kAlignmentA,
+  /// Element type for B matrix operand
+  typename ElementB,
+  /// Layout type for B matrix operand
+  typename LayoutB,
+  /// Access granularity of B matrix in units of elements
+  int kAlignmentB,
+  /// Element type for C and D matrix operands
+  typename ElementC,
+  /// Element type for internal accumulation
+  typename ElementAccumulator,
+  /// Threadblock-level tile size (concept: GemmShape)
+  typename ThreadblockShape,
+  /// Warp-level tile size (concept: GemmShape)
+  typename WarpShape,
+  /// Epilogue output operator
+  typename EpilogueOutputOp,
+  /// Threadblock-level swizzling operator
+  typename ThreadblockSwizzle,
+  /// If true, kernel is configured to support serial reduction in the epilogue
+  bool SplitKSerial,
+  /// Operation performed by GEMM
+  typename Operator,
+  /// Use zfill or predicate for out-of-bound cp.async
+  SharedMemoryClearOption SharedMemoryClear,
+  /// Gather operand A by using an index array
+  bool GatherA,
+  /// Gather operand B by using an index array
+  bool GatherB,
+  /// Scatter result D by using an index array
+  bool ScatterD,
+  /// Permute result D
+  typename PermuteDLayout,
+  /// Permute operand A
+  typename PermuteALayout,
+  /// Permute operand B
+  typename PermuteBLayout
+>
+struct DefaultGemm<
+  ElementA, LayoutA, kAlignmentA,
+  ElementB, LayoutB, kAlignmentB,
+  ElementC, layout::RowMajor,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  arch::Sm70,
+  ThreadblockShape,
+  WarpShape,
+  GemmShape<8, 8, 4>,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  SplitKSerial,
+  Operator,
+  SharedMemoryClear,
+  GatherA,
+  GatherB,
+  ScatterD,
+  PermuteDLayout,
+  PermuteALayout,
+  PermuteBLayout
+> {
+
+  /// Define the threadblock-scoped matrix multiply-accumulate (gather and A/B permute options are forwarded)
+  using Mma = typename cutlass::gemm::threadblock::DefaultMma<
+    ElementA,
+    LayoutA,
+    kAlignmentA,
+    ElementB,
+    LayoutB,
+    kAlignmentB,
+    ElementAccumulator,
+    layout::RowMajor,
+    arch::OpClassTensorOp,
+    arch::Sm70,
+    ThreadblockShape,
+    WarpShape,
+    GemmShape<8, 8, 4>,
+    2,
+    Operator,
+    false,
+    SharedMemoryClear,
+    GatherA,
+    GatherB,
+    PermuteALayout,
+    PermuteBLayout
+  >::ThreadblockMma;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  /// Define the epilogue (Volta TensorOp variant; scatter and D permute options are forwarded)
+  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueVoltaTensorOp<
+    ThreadblockShape,
+    typename Mma::Operator,
+    kPartitionsK,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount,
+    ScatterD,
+    PermuteDLayout
+  >::Epilogue;
+
+  /// Define the kernel-level GEMM operator.
+  using GemmKernel = kernel::Gemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for SIMT (2 stages, any ArchTag except arch::Sm80)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Layout type for C and D matrix operand
+    typename LayoutC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// If true, kernel is configured to support serial reduction in the epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Use zfill or predicate for out-of-bound cp.async
+    SharedMemoryClearOption SharedMemoryClear,
+    /// Gather operand A by using an index array
+    bool GatherA,
+    /// Gather operand B by using an index array
+    bool GatherB,
+    /// Scatter result D by using an index array
+    bool ScatterD,
+    /// Permute result D
+    typename PermuteDLayout,
+    /// Permute operand A
+    typename PermuteALayout,
+    /// Permute operand B
+    typename PermuteBLayout
+  >
+struct DefaultGemm<
+    ElementA,
+    LayoutA,
+    kAlignmentA,
+    ElementB,
+    LayoutB,
+    kAlignmentB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    arch::OpClassSimt,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    GemmShape<1, 1, 1>,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    2,
+    SplitKSerial,
+    Operator,
+    SharedMemoryClear,
+    GatherA,
+    GatherB,
+    ScatterD,
+    PermuteDLayout,
+    PermuteALayout,
+    PermuteBLayout,
+    typename platform::enable_if< ! platform::is_same<ArchTag, arch::Sm80>::value >::type > {
+
+  static_assert((platform::is_same<LayoutC, layout::RowMajor>::value
+             || platform::is_same<LayoutC, layout::AffineRankN<2>>::value),
+             "Epilogue in the kernel level must be row major");
+
+  /// Define the threadblock-scoped matrix multiply-accumulate (built with arch::Sm50 regardless of ArchTag)
+  using Mma = typename cutlass::gemm::threadblock::DefaultMma<
+      ElementA,
+      LayoutA,
+      kAlignmentA,
+      ElementB,
+      LayoutB,
+      kAlignmentB,
+      ElementAccumulator,
+      LayoutC,
+      arch::OpClassSimt,
+      arch::Sm50,
+      ThreadblockShape,
+      WarpShape,
+      GemmShape<1, 1, 1>,
+      2,
+      Operator,
+      false,
+      SharedMemoryClear,
+      GatherA,
+      GatherB,
+      PermuteALayout,
+      PermuteBLayout>::ThreadblockMma;
+
+  static int const kEpilogueElementsPerAccess = EpilogueOutputOp::kCount;
+  static_assert(kEpilogueElementsPerAccess == 1, "simt epilogue must operate on scalars");
+
+  /// Define the epilogue selected when LayoutC is layout::RowMajor; Affine2Epilogue below is chosen otherwise
+  using RegularEpilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueSimt<
+      ThreadblockShape,
+      typename Mma::Operator,
+      EpilogueOutputOp,
+      kEpilogueElementsPerAccess,
+      ScatterD,
+      PermuteDLayout
+      >::Epilogue;
+
+  using Affine2Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueSimtAffineRankN<
+      2,
+      ThreadblockShape,
+      typename Mma::Operator,
+      EpilogueOutputOp,
+      kEpilogueElementsPerAccess
+      >::Epilogue;
+
+  using Epilogue = typename platform::conditional<platform::is_same<LayoutC, layout::RowMajor>::value,
+                                                  RegularEpilogue,
+                                                  Affine2Epilogue>::type;
+
+  /// Define the kernel-level GEMM operator.
+  using GemmKernel = kernel::Gemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for SIMT on Ampere (arch::Sm80) with a configurable stage count
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Layout type for C and D matrix operand
+    typename LayoutC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Use zfill or predicate for out-of-bound cp.async
+    SharedMemoryClearOption SharedMemoryClear,
+    /// Gather operand A by using an index array
+    bool GatherA,
+    /// Gather operand B by using an index array
+    bool GatherB,
+    /// Scatter result D by using an index array
+    bool ScatterD,
+    /// Permute result D
+    typename PermuteDLayout,
+    /// Permute operand A
+    typename PermuteALayout,
+    /// Permute operand B
+    typename PermuteBLayout
+>
+struct DefaultGemm<ElementA,
+                   LayoutA,
+                   kAlignmentA,
+                   ElementB,
+                   LayoutB,
+                   kAlignmentB,
+                   ElementC,
+                   LayoutC,
+                   ElementAccumulator,
+                   arch::OpClassSimt,
+                   arch::Sm80,
+                   ThreadblockShape,
+                   WarpShape,
+                   GemmShape<1, 1, 1>,
+                   EpilogueOutputOp,
+                   ThreadblockSwizzle,
+                   Stages,
+                   SplitKSerial,
+                   Operator,
+                   SharedMemoryClear,
+                   GatherA,
+                   GatherB,
+                   ScatterD,
+                   PermuteDLayout,
+                   PermuteALayout,
+                   PermuteBLayout> {
+
+  static_assert((platform::is_same<LayoutC, layout::RowMajor>::value
+             || platform::is_same<LayoutC, layout::AffineRankN<2>>::value),
+             "Epilogue in the kernel level must be row major");
+
+  /// Define the threadblock-scoped matrix multiply-accumulate (arch::Sm80, Stages forwarded)
+  using Mma = typename cutlass::gemm::threadblock::DefaultMma<
+      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
+      ElementAccumulator, LayoutC, arch::OpClassSimt, arch::Sm80,
+      ThreadblockShape, WarpShape, GemmShape<1, 1, 1>, Stages,
+      Operator, false, SharedMemoryClear, GatherA, GatherB,
+      PermuteALayout, PermuteBLayout>::ThreadblockMma;
+
+  static int const kEpilogueElementsPerAccess = EpilogueOutputOp::kCount;
+  static_assert(kEpilogueElementsPerAccess == 1, "simt epilogue must operate on scalars");
+
+  /// Define the epilogue selected when LayoutC is layout::RowMajor; Affine2Epilogue below is chosen otherwise
+  using RegularEpilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueSimt<
+      ThreadblockShape,
+      typename Mma::Operator,
+      EpilogueOutputOp,
+      kEpilogueElementsPerAccess,
+      ScatterD,
+      PermuteDLayout
+      >::Epilogue;
+
+  using Affine2Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueSimtAffineRankN<
+      2,
+      ThreadblockShape,
+      typename Mma::Operator,
+      EpilogueOutputOp,
+      kEpilogueElementsPerAccess
+      >::Epilogue;
+
+  using Epilogue = typename platform::conditional<platform::is_same<LayoutC, layout::RowMajor>::value,
+                                                  RegularEpilogue,
+                                                  Affine2Epilogue>::type;
+
+  /// Define the kernel-level GEMM operator.
+  using GemmKernel = kernel::Gemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial>; 
+};
+
+////////////////////////////////////////////////////////////////////////////////
+/// Partial specialization for SIMT DP4A (int8_t A and B operands, GemmShape<1, 1, 4> instruction, 2 stages)
+
+template <
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Layout type for C matrix operand
+    typename LayoutC,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Shared-memory clear option; matched by this specialization but not forwarded to DefaultMma below
+    SharedMemoryClearOption SharedMemoryClear
+>
+struct DefaultGemm<int8_t, LayoutA, kAlignmentA, int8_t, LayoutB, kAlignmentB,
+                   ElementC, LayoutC, ElementAccumulator, arch::OpClassSimt,
+                   ArchTag, ThreadblockShape, WarpShape, GemmShape<1, 1, 4>,
+                   EpilogueOutputOp, ThreadblockSwizzle, 2, SplitKSerial,
+                   Operator, SharedMemoryClear, false, false, false,
+                   layout::NoPermute, layout::NoPermute> {
+  using InstructionShape = GemmShape<1, 1, 4>;
+  using ElementA = int8_t;
+  using ElementB = int8_t;
+
+  using OperatorClass =  arch::OpClassSimt;
+  /// Define the threadblock-scoped matrix multiply-accumulate (built with arch::Sm50 regardless of ArchTag)
+  using Mma = typename cutlass::gemm::threadblock::DefaultMma<
+      ElementA,
+      LayoutA,
+      kAlignmentA,
+      ElementB,
+      LayoutB,
+      kAlignmentB,
+      ElementAccumulator,
+      LayoutC,
+      arch::OpClassSimt,
+      arch::Sm50,
+      ThreadblockShape,
+      WarpShape,
+      InstructionShape,
+      2,
+      Operator
+      >::ThreadblockMma;
+
+  static int const kEpilogueElementsPerAccess = EpilogueOutputOp::kCount;
+  static_assert(kEpilogueElementsPerAccess == 1, "simt epilogue must operate on scalars");
+
+  /// Define the epilogue (SIMT epilogue, one element per access as asserted above)
+  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueSimt<
+      ThreadblockShape,
+      typename Mma::Operator,
+      EpilogueOutputOp,
+      kEpilogueElementsPerAccess
+      >::Epilogue;
+
+  /// Define the kernel-level GEMM operator.
+  using GemmKernel = kernel::Gemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
+};
+
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+////////////////////////////////////////////////////////////////////////////////
+/// Partial specialization for Wmma Gemm Kernel (arch::OpClassWmmaTensorOp; gather/scatter off, NoPermute)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Layout type for C and D matrix operands
+    typename LayoutC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Shared-memory clear option; matched by this specialization but not forwarded to DefaultMma below
+    SharedMemoryClearOption SharedMemoryClear
+> 
+struct DefaultGemm<
+  ElementA, LayoutA, kAlignmentA, 
+  ElementB, LayoutB, kAlignmentB, 
+  ElementC, LayoutC, 
+  ElementAccumulator, 
+  arch::OpClassWmmaTensorOp,
+  ArchTag, 
+  ThreadblockShape, WarpShape, InstructionShape,
+  EpilogueOutputOp, 
+  ThreadblockSwizzle, 
+  Stages, 
+  SplitKSerial,
+  Operator,
+  SharedMemoryClear,
+  false,
+  false,
+  false,
+  layout::NoPermute,
+  layout::NoPermute
+> {
+  /// Define the threadblock-scoped matrix multiply-accumulate
+  using Mma = typename cutlass::gemm::threadblock::DefaultMma<
+      ElementA, LayoutA, kAlignmentA,
+      ElementB, LayoutB, kAlignmentB,
+      ElementAccumulator, LayoutC, 
+      arch::OpClassWmmaTensorOp, 
+      ArchTag,
+      ThreadblockShape, 
+      WarpShape, 
+      InstructionShape, 
+      Stages,
+      Operator>::ThreadblockMma;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  /// Define the epilogue (Wmma TensorOp variant)
+  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWmmaTensorOp<
+      ThreadblockShape,
+      typename Mma::Operator, 
+      kPartitionsK, 
+      EpilogueOutputOp,
+      EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  /// Define the kernel-level GEMM operator.
+  using GemmKernel = kernel::Gemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
+};
+////////////////////////////////////////////////////////////////////////////////
+
+#endif //CUTLASS_ARCH_WMMA_ENABLED
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_complex.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_complex.h
new file mode 100755
index 000000000..7ef46c6cf
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_complex.h
@@ -0,0 +1,404 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+      Default kernel-level GEMM definitions combine threadblock-scoped matrix multiply-add with
+      the appropriate threadblock-scoped epilogue.
+  
+      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
+      accommodated by exchanging A and B operands and assuming transposed layouts. Partial
+      specializations here choose 'device::GemmTransposed' to implement this functionality.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/numeric_types.h"
+
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/kernel/gemm.h"
+#include "cutlass/gemm/kernel/gemm_pipelined.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
+#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
+#include "cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h"
+#include "cutlass/gemm/threadblock/default_mma.h"
+#include "cutlass/gemm/threadblock/default_multistage_mma_complex.h"
+#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
+
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+  /// Element type for A matrix operand
+  typename ElementA_,
+  /// Layout type for A matrix operand
+  typename LayoutA_,
+  /// Element type for B matrix operand
+  typename ElementB_,
+  /// Layout type for B matrix operand
+  typename LayoutB_,
+  /// Element type for C and D matrix operands
+  typename ElementC_,
+  /// Layout type for C and D matrix operands
+  typename LayoutC_,
+  /// Element type for internal accumulation
+  typename ElementAccumulator,
+  /// Operator class tag
+  typename OperatorClass,
+  /// Tag indicating architecture to tune for
+  typename ArchTag,
+  /// Threadblock-level tile size (concept: GemmShape)
+  typename ThreadblockShape,
+  /// Warp-level tile size (concept: GemmShape)
+  typename WarpShape,
+  /// Instruction-level tile size (concept: GemmShape)
+  typename InstructionShape,
+  /// Epilogue output operator
+  typename EpilogueOutputOp,
+  /// Threadblock-level swizzling operator
+  typename ThreadblockSwizzle,
+  /// Number of stages used in the pipelined mainloop
+  int Stages,
+  /// Complex elementwise transformation on A operand
+  ComplexTransform TransformA,
+  /// Complex elementwise transformation on B operand
+  ComplexTransform TransformB,
+  /// Multiply-add operator
+  /// (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
+  typename Operator,
+  /// If true, kernel is configured to support serial reduction in the epilogue
+  bool SplitKSerial
+>
+struct DefaultGemmComplex;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Hopper Architecture (arch::Sm90, TensorOp, row-major C)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB,
+    /// Multiply-add operator
+    /// (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
+    typename Operator,
+    /// If true, kernel is configured to support serial reduction in the epilogue
+    bool SplitKSerial
+  >
+struct DefaultGemmComplex<
+  ElementA, LayoutA, ElementB, LayoutB, ElementC,
+  layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
+  arch::Sm90, ThreadblockShape, WarpShape, InstructionShape,
+  EpilogueOutputOp, ThreadblockSwizzle, Stages, TransformA, TransformB, Operator, SplitKSerial> {
+
+  /// Define the threadblock-scoped matrix multiply-accumulate (multistage complex MMA)
+  using Mma = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex<
+      ElementA, LayoutA, ElementB, LayoutB, ElementAccumulator,
+      layout::RowMajor, arch::OpClassTensorOp, arch::Sm90, ThreadblockShape,
+      WarpShape, InstructionShape, Stages, TransformA, TransformB, Operator>::ThreadblockMma;
+
+  /// Define the epilogue. NOTE(review): the literal 1 is presumably the partitions-K argument - confirm
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOp<
+          ThreadblockShape, typename Mma::Operator, 1, EpilogueOutputOp,
+          EpilogueOutputOp::kCount, Operator>::Epilogue;
+
+  /// Define the kernel-level GEMM operator.
+  using GemmKernel = kernel::Gemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for SIMT (arch::OpClassSimt, arch::Sm50)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB,
+    /// Multiply-add operator 
+    // (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
+    typename Operator,
+    /// If true, kernel is configured to support serial reduction in the epilogue
+    bool SplitKSerial
+  >
+struct DefaultGemmComplex<
+  ElementA, LayoutA, ElementB, LayoutB, ElementC,
+  layout::RowMajor, ElementAccumulator, arch::OpClassSimt,
+  arch::Sm50, ThreadblockShape, WarpShape, InstructionShape,
+  EpilogueOutputOp, ThreadblockSwizzle, Stages, TransformA, TransformB, Operator, SplitKSerial> {
+
+  /// Define the threadblock-scoped matrix multiply-accumulate
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+    ThreadblockShape,
+    WarpShape, 
+    InstructionShape, 
+    ElementA, LayoutA, 
+    ElementB, LayoutB, 
+    ElementAccumulator, layout::RowMajor, 
+    arch::OpClassSimt,
+    Stages,
+    Operator,
+    false,
+    cutlass::arch::CacheOperation::Global,
+    cutlass::arch::CacheOperation::Global,
+    TransformA, 
+    TransformB
+  >;
+
+  // Define iterators over tiles from the A operand
+  using IteratorA =
+      cutlass::transform::threadblock::PredicatedTileIterator<
+          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+          ElementA, LayoutA, 1, 
+          typename MmaCore::IteratorThreadMapA>;
+
+  // Define iterators over tiles from the B operand
+  using IteratorB =
+      cutlass::transform::threadblock::PredicatedTileIterator<
+          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+          ElementB, LayoutB, 0, 
+          typename MmaCore::IteratorThreadMapB>;
+
+  // Define the threadblock-scoped pipelined matrix multiply
+  using Mma = cutlass::gemm::threadblock::MmaPipelined<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      IteratorB, typename MmaCore::SmemIteratorB, ElementAccumulator,
+      layout::RowMajor, typename MmaCore::MmaPolicy>;
+
+  /// Define the epilogue
+  using Epilogue =
+    typename cutlass::epilogue::threadblock::DefaultEpilogueSimt<
+        ThreadblockShape, 
+        typename Mma::Operator, 
+        EpilogueOutputOp,
+        EpilogueOutputOp::kCount
+      >::Epilogue;
+
+  /// Define the kernel-level GEMM operator.
+  using GemmKernel = kernel::Gemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Ampere Architecture (arch::Sm80), tensor-op kernels, row-major output
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB,
+    /// Multiply-add operator
+    // (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
+    typename Operator,
+    /// If true, kernel is configured to support serial reduction in the epilogue
+    bool SplitKSerial
+  >
+struct DefaultGemmComplex<
+  ElementA, LayoutA, ElementB, LayoutB, ElementC,
+  layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
+  arch::Sm80, ThreadblockShape, WarpShape, InstructionShape,
+  EpilogueOutputOp, ThreadblockSwizzle, Stages, TransformA, TransformB, Operator, SplitKSerial> {
+
+  /// Define the threadblock-scoped matrix multiply-accumulate (multistage complex MMA)
+  using Mma = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex<
+      ElementA, LayoutA, ElementB, LayoutB, ElementAccumulator,
+      layout::RowMajor, arch::OpClassTensorOp, arch::Sm80, ThreadblockShape,
+      WarpShape, InstructionShape, Stages, TransformA, TransformB, Operator>::ThreadblockMma;
+
+  /// Define the epilogue (complex tensor-op variant, selected by the same Operator tag)
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOp<
+          ThreadblockShape, typename Mma::Operator, 1, EpilogueOutputOp,
+          EpilogueOutputOp::kCount, Operator>::Epilogue;
+
+  /// Define the kernel-level GEMM operator from the Mma and Epilogue defined above.
+  using GemmKernel = kernel::Gemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Ampere Architecture (arch::Sm80), SIMT kernels, row-major output
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB,
+    /// Multiply-add operator
+    // (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
+    typename Operator,
+    /// If true, kernel is configured to support serial reduction in the epilogue
+    bool SplitKSerial
+  >
+struct DefaultGemmComplex<
+  ElementA, LayoutA, ElementB, LayoutB, ElementC,
+  layout::RowMajor, ElementAccumulator, arch::OpClassSimt,
+  arch::Sm80, ThreadblockShape, WarpShape, InstructionShape,
+  EpilogueOutputOp, ThreadblockSwizzle, Stages, TransformA, TransformB, Operator, SplitKSerial> {
+
+  /// Define the threadblock-scoped matrix multiply-accumulate (multistage complex MMA, SIMT)
+  using Mma = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex<
+      ElementA, LayoutA, ElementB, LayoutB, ElementAccumulator,
+      layout::RowMajor, arch::OpClassSimt, arch::Sm80, ThreadblockShape,
+      WarpShape, InstructionShape, Stages, TransformA, TransformB, Operator>::ThreadblockMma;
+
+  /// Define the epilogue (SIMT variant, matching arch::OpClassSimt)
+  using Epilogue =
+    typename cutlass::epilogue::threadblock::DefaultEpilogueSimt<
+        ThreadblockShape, 
+        typename Mma::Operator, 
+        EpilogueOutputOp,
+        EpilogueOutputOp::kCount
+      >::Epilogue;
+
+  /// Define the kernel-level GEMM operator from the Mma and Epilogue defined above.
+  using GemmKernel = kernel::Gemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_grouped.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_grouped.h
new file mode 100755
index 000000000..f9163874c
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_grouped.h
@@ -0,0 +1,384 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+      Default kernel-level GEMM definitions combine threadblock-scoped matrix multiply-add with
+      the appropriate threadblock-scoped epilogue.
+  
+      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
+      accommodated by exchanging A and B operands and assuming transposed layouts. Partial
+      specializations here choose 'device::GemmTransposed' to implement this functionality.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/complex.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/numeric_types.h"
+
+#include "cutlass/gemm/kernel/gemm_grouped.h"
+#include "cutlass/gemm/kernel/gemm_transpose_operands.h"
+#include "cutlass/gemm/kernel/default_gemm.h"
+#include "cutlass/gemm/kernel/default_gemm_complex.h"
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+
+#include "cutlass/layout/permute.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Primary template (declaration only); partial specializations provide the GemmKernel definition.
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Whether the schedule of problems to visit has been precomputed
+    GroupScheduleMode GroupScheduleMode_ = GroupScheduleMode::kDeviceOnly,
+    /// Operation performed by GEMM (defaults to the Operator of device::DefaultGemmConfiguration)
+    typename Operator = typename device::DefaultGemmConfiguration<
+        OperatorClass, ArchTag, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator>::Operator,
+    /// Use zfill or predicate for out-of-bound cp.async
+    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone,
+    /// Permute result D
+    typename PermuteDLayout = layout::NoPermute,
+    /// SFINAE hook: specializations constrain it with platform::enable_if on is_complex<ElementAccumulator>
+    typename Enable = void
+    >
+struct DefaultGemmGrouped;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Real-valued GEMM kernels
+//
+/// Partial specialization: non-complex ElementAccumulator, no transform on A or B; wraps kernel::DefaultGemm.
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Layout type for C and D matrix operands
+    typename LayoutC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Whether the schedule of problems to visit has been precomputed
+    GroupScheduleMode GroupScheduleMode_,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Use zfill or predicate for out-of-bound cp.async
+    SharedMemoryClearOption SharedMemoryClear,
+    /// Permute result D
+    typename PermuteDLayout
+>
+struct DefaultGemmGrouped<
+  ElementA,
+  LayoutA,
+  ComplexTransform::kNone,   // transform A
+  kAlignmentA,
+  ElementB,
+  LayoutB,
+  ComplexTransform::kNone,   // transform B
+  kAlignmentB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  OperatorClass,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  GroupScheduleMode_,
+  Operator,
+  SharedMemoryClear,
+  PermuteDLayout,
+  typename platform::enable_if< ! cutlass::is_complex<ElementAccumulator>::value>::type
+> {
+
+  // True when LayoutC is column-major; then a 'transposed-and-exchanged' Mma operator is constructed.
+  static bool const kInternalTranspose = platform::is_same<LayoutC, layout::ColumnMajor>::value;
+
+  using MapArguments = kernel::detail::MapArguments<
+    ElementA,
+    LayoutA,
+    ComplexTransform::kNone,
+    kAlignmentA,
+    ElementB,
+    LayoutB,
+    ComplexTransform::kNone,
+    kAlignmentB,
+    LayoutC,
+    kInternalTranspose
+  >;
+
+  // Define the default GEMM kernel using the operand types/layouts/alignments mapped by MapArguments
+  using DefaultGemmKernel = typename kernel::DefaultGemm<
+    typename MapArguments::ElementA,
+    typename MapArguments::LayoutA,
+    MapArguments::kAlignmentA,
+    typename MapArguments::ElementB,
+    typename MapArguments::LayoutB,
+    MapArguments::kAlignmentB,
+    ElementC,
+    typename MapArguments::LayoutC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    true, /*presumably SplitKSerial -- TODO confirm against kernel::DefaultGemm's parameter order*/
+    Operator,
+    SharedMemoryClear,
+    false, /*GatherA*/
+    false, /*GatherB*/
+    false, /*ScatterD*/
+    PermuteDLayout
+  >::GemmKernel;
+
+  /// Define the grouped kernel by reusing only the Mma and Epilogue of the default kernel
+  using GemmKernel = kernel::GemmGrouped<
+    typename DefaultGemmKernel::Mma,
+    typename DefaultGemmKernel::Epilogue,
+    ThreadblockSwizzle,
+    GroupScheduleMode_,
+    kInternalTranspose
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+//
+// Complex-valued GEMM kernels
+//
+/// Partial specialization: complex ElementAccumulator, layout::NoPermute only; wraps kernel::DefaultGemmComplex.
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Layout type for C and D matrix operands
+    typename LayoutC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Whether the schedule of problems to visit has been precomputed
+    GroupScheduleMode GroupScheduleMode_,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Use zfill or predicate for out-of-bound cp.async (not forwarded to DefaultGemmComplex below)
+    SharedMemoryClearOption SharedMemoryClear
+  >
+struct DefaultGemmGrouped<
+  ElementA,
+  LayoutA,
+  TransformA,
+  kAlignmentA,
+  ElementB,
+  LayoutB,
+  TransformB,
+  kAlignmentB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  OperatorClass,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  GroupScheduleMode_,
+  Operator,
+  SharedMemoryClear,
+  layout::NoPermute, /*PermuteDLayout*/
+  typename platform::enable_if<cutlass::is_complex<ElementAccumulator>::value>::type
+> {
+
+  // True when LayoutC is column-major; then a 'transposed-and-exchanged' Mma operator is constructed.
+  static bool const kInternalTranspose = platform::is_same<LayoutC, layout::ColumnMajor>::value;
+
+  using MapArguments = kernel::detail::MapArguments<
+    ElementA,
+    LayoutA,
+    TransformA,
+    kAlignmentA,
+    ElementB,
+    LayoutB,
+    TransformB,
+    kAlignmentB,
+    LayoutC,
+    kInternalTranspose
+  >;
+
+  using DefaultGemmKernel = typename kernel::DefaultGemmComplex<
+    typename MapArguments::ElementA,
+    typename MapArguments::LayoutA,
+    typename MapArguments::ElementB,
+    typename MapArguments::LayoutB,
+    ElementC,
+    typename MapArguments::LayoutC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    MapArguments::kTransformA,
+    MapArguments::kTransformB,
+    Operator,
+    false /*SplitKSerial*/
+  >::GemmKernel;
+
+  /// Define the grouped kernel by reusing only the Mma and Epilogue of the default kernel
+  using GemmKernel = kernel::GemmGrouped<
+    typename DefaultGemmKernel::Mma,
+    typename DefaultGemmKernel::Epilogue, 
+    ThreadblockSwizzle,
+    GroupScheduleMode_,
+    kInternalTranspose
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_grouped_softmax_mainloop_fusion.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_grouped_softmax_mainloop_fusion.h
new file mode 100755
index 000000000..a031c1a95
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_grouped_softmax_mainloop_fusion.h
@@ -0,0 +1,164 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+      Default kernel-level softmax-grouped-GEMM
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/complex.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/numeric_types.h"
+
+#include "cutlass/gemm/kernel/gemm_grouped_softmax_mainloop_fusion.h"
+#include "cutlass/gemm/kernel/gemm_transpose_operands.h"
+#include "cutlass/gemm/kernel/default_gemm.h"
+#include "cutlass/gemm/kernel/default_gemm_complex.h"
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+#include "cutlass/gemm/threadblock/default_mma_softmax_mainloop_fusion.h"
+
+#include "cutlass/layout/permute.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Default kernel-level grouped GEMM with softmax mainloop fusion; the only public result is GemmKernel.
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Complex elementwise transformation on A operand (not referenced in the body; kNone is used)
+    ComplexTransform TransformA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Complex elementwise transformation on B operand (not referenced in the body; kNone is used)
+    ComplexTransform TransformB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for Scale/Bias vectors
+    typename ElementScaleBias_,
+    /// Layout type for Scale/Bias vectors
+    typename LayoutScaleBias_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Whether the schedule of problems to visit has been precomputed
+    GroupScheduleMode GroupScheduleMode_ = GroupScheduleMode::kDeviceOnly,
+    /// Operation performed by GEMM (defaults to the Operator of device::DefaultGemmConfiguration)
+    typename Operator = typename device::DefaultGemmConfiguration<
+        OperatorClass, ArchTag, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator>::Operator,
+    /// Use zfill or predicate for out-of-bound cp.async
+    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone
+    >
+struct DefaultGemmGroupedSoftmaxMainloopFusion {
+  // True when LayoutC_ is column-major; then a 'transposed-and-exchanged' Mma operator is constructed.
+  static bool const kInternalTranspose = platform::is_same<LayoutC_, layout::ColumnMajor>::value;
+
+  using MapArguments = kernel::detail::MapArguments<
+    ElementA_,
+    LayoutA_,
+    ComplexTransform::kNone,
+    kAlignmentA,
+    ElementB_,
+    LayoutB_,
+    ComplexTransform::kNone,
+    kAlignmentB,
+    LayoutC_,
+    kInternalTranspose
+  >;
+
+private:
+  /// Define the threadblock-scoped matrix multiply-accumulate (LayoutC is fixed to layout::RowMajor here)
+  using Mma = typename cutlass::gemm::threadblock::DefaultMmaSoftmaxMainloopFusion<
+      typename MapArguments::ElementA, typename MapArguments::LayoutA, MapArguments::kAlignmentA,
+      typename MapArguments::ElementB, typename MapArguments::LayoutB, MapArguments::kAlignmentB,
+      ElementScaleBias_, LayoutScaleBias_, ElementAccumulator, layout::RowMajor, OperatorClass, ArchTag,
+      ThreadblockShape, WarpShape, InstructionShape, Stages, kInternalTranspose,
+      Operator, false, SharedMemoryClear>::ThreadblockMma;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  /// Define the epilogue (always the tensor-op variant here; OperatorClass is not consulted)
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
+          ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
+          EpilogueOutputOp::kCount>::Epilogue;
+
+public:
+  using GemmKernel = kernel::GemmGroupedSoftmaxMainloopFusion<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    GroupScheduleMode_,
+    kInternalTranspose
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_layernorm_mainloop_fusion.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_layernorm_mainloop_fusion.h
new file mode 100755
index 000000000..68d739e30
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_layernorm_mainloop_fusion.h
@@ -0,0 +1,137 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+      Default kernel-level GEMM definitions combine threadblock-scoped matrix multiply-add with
+      the appropriate threadblock-scoped epilogue.
+  
+      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
+      accommodated by exchanging A and B operands and assuming transposed layouts. Partial
+      specializations here choose 'device::GemmTransposed' to implement this functionality.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/wmma.h"
+
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/kernel/gemm_layernorm_mainloop_fusion.h"
+#include "cutlass/gemm/threadblock/default_mma_layernorm_mainloop_fusion.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for Scale/Bias vectors
+    typename ElementScaleBias,
+    /// Layout type for Scale/Bias vectors
+    typename LayoutScaleBias,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Layout type for C and D matrix operands
+    typename LayoutC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag (not referenced in this struct: the Mma below hard-codes arch::OpClassTensorOp)
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for (not referenced in this struct: the Mma below hard-codes arch::Sm80)
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Use zfill or predicate for out-of-bound cp.async
+    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone>
+struct DefaultGemmLayernormMainloopFusion {
+
+  /// Threadblock-scoped MMA. NOTE(review): operator class / arch are fixed to TensorOp / Sm80 and C to RowMajor, regardless of the template arguments - confirm intended
+  using Mma = typename cutlass::gemm::threadblock::DefaultMmaLayernormMainloopFusion<
+      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
+      ElementScaleBias, LayoutScaleBias, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80,
+      ThreadblockShape, WarpShape, InstructionShape, Stages,
+      Operator, false, SharedMemoryClear>::ThreadblockMma;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  /// Define the epilogue (tensor-op epilogue, partitioned along K by kPartitionsK)
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
+          ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
+          EpilogueOutputOp::kCount>::Epilogue;
+
+  /// Define the kernel-level GEMM operator.
+  using GemmKernel = kernel::GemmLayernormMainloopFusion<Mma, Epilogue, ThreadblockSwizzle>;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_planar_complex_universal.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_planar_complex_universal.h
new file mode 100755
index 000000000..df74a0749
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_planar_complex_universal.h
@@ -0,0 +1,352 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+      Default kernel-level GEMM definitions combine threadblock-scoped matrix multiply-add with
+      the appropriate threadblock-scoped epilogue.
+  
+      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
+      accommodated by exchanging A and B operands and assuming transposed layouts. Partial
+      specializations here choose 'device::GemmTransposed' to implement this functionality.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/complex.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/numeric_types.h"
+
+#include "cutlass/gemm/kernel/gemm_planar_complex.h"
+#include "cutlass/gemm/kernel/gemm_planar_complex_array.h"
+#include "cutlass/gemm/kernel/default_gemm.h"
+#include "cutlass/gemm/kernel/default_gemm_complex.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_planar_complex.h"
+#include "cutlass/gemm/threadblock/default_mma_planar_complex_pipelined.h"
+#include "cutlass/gemm/threadblock/default_mma_planar_complex_multistage.h" 
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Layout type for C and D matrix operands
+    typename LayoutC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Math operation performed by GEMM (e.g. arch::OpMultiplyAdd)
+    typename Operator,
+    /// enable_if slot: the specializations below select pipelined (Stages <= 2) or multistage (Stages > 2)
+    typename Enable = void
+  >
+struct DefaultGemmPlanarComplexUniversal;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for pipelined mainloop (selected when Stages <= 2)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Layout type for C and D matrix operands
+    typename LayoutC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator
+  >
+struct DefaultGemmPlanarComplexUniversal<
+  ElementA,
+  LayoutA,
+  TransformA,
+  kAlignmentA,
+  ElementB,
+  LayoutB,
+  TransformB,
+  kAlignmentB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  OperatorClass,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  Operator,
+  typename platform::enable_if<(Stages <= 2)>::type 
+> {
+
+  /// Threadblock-scoped planar complex MMA, pipelined variant
+  using Mma = typename gemm::threadblock::DefaultMmaPlanarComplexPipelined<
+    ElementA,
+    LayoutA,
+    kAlignmentA,
+    ElementB,
+    LayoutB,
+    kAlignmentB,
+    ElementAccumulator,
+    LayoutC,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    Stages,
+    TransformA,
+    TransformB,
+    Operator
+  >::ThreadblockMma;
+
+  /// Planar complex epilogue (partitions along K = ThreadblockShape::kK / WarpShape::kK)
+  using Epilogue = typename epilogue::threadblock::DefaultEpiloguePlanarComplex<
+    ThreadblockShape,
+    typename Mma::Policy::Operator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape::kK / WarpShape::kK,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount  
+  >::Epilogue;
+
+  /// Define the kernel in terms of the default kernel
+  using GemmKernel = kernel::GemmPlanarComplex<
+    Mma,
+    Epilogue, 
+    ThreadblockSwizzle
+  >;
+
+  // Array variant: same Mma/Epilogue/swizzle, instantiated on kernel::GemmPlanarComplexArray
+  using GemmArrayKernel = kernel::GemmPlanarComplexArray<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle
+  >;
+};
+  
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for multiple pipeline stages (selected when Stages > 2).
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Layout type for C and D matrix operands
+    typename LayoutC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the multistage mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator
+  >
+struct DefaultGemmPlanarComplexUniversal<
+  ElementA,
+  LayoutA,
+  TransformA,
+  kAlignmentA,
+  ElementB,
+  LayoutB,
+  TransformB,
+  kAlignmentB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  OperatorClass,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  Operator,
+  typename platform::enable_if<(Stages > 2)>::type 
+> {
+
+  /// Threadblock-scoped planar complex MMA, multistage variant
+  using Mma = typename gemm::threadblock::DefaultMmaPlanarComplexMultistage<
+    ElementA,
+    LayoutA,
+    kAlignmentA,
+    ElementB,
+    LayoutB,
+    kAlignmentB,
+    ElementAccumulator,
+    LayoutC,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    Stages,
+    TransformA,
+    TransformB,
+    Operator
+  >::ThreadblockMma;
+
+  /// Planar complex epilogue (partitions along K = ThreadblockShape::kK / WarpShape::kK)
+  using Epilogue = typename epilogue::threadblock::DefaultEpiloguePlanarComplex<
+    ThreadblockShape,
+    typename Mma::Policy::Operator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape::kK / WarpShape::kK,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount  
+  >::Epilogue;
+
+  /// Define the kernel in terms of the default kernel
+  using GemmKernel = kernel::GemmPlanarComplex<
+    Mma,
+    Epilogue, 
+    ThreadblockSwizzle
+  >;
+
+  // Array variant: same Mma/Epilogue/swizzle, instantiated on kernel::GemmPlanarComplexArray
+  using GemmArrayKernel = kernel::GemmPlanarComplexArray<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse.h
new file mode 100755
index 000000000..f1841a377
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse.h
@@ -0,0 +1,252 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief 
+      Default kernel-level GEMM definitions combine threadblock-scoped matrix multiply-add with
+      the appropriate threadblock-scoped epilogue.
+  
+      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
+      accommodated by exchanging A and B operands and assuming transposed layouts. Partial
+      specializations here choose 'device::GemmTransposed' to implement this functionality.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/wmma.h"
+
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/kernel/gemm.h"
+#include "cutlass/gemm/kernel/sparse_gemm.h"
+#include "cutlass/gemm/kernel/gemm_pipelined.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sparse_sm80.h"
+#include "cutlass/gemm/threadblock/default_sparse_mma.h"
+#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+#include "cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h"
+#endif //CUTLASS_ARCH_WMMA_ENABLED
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator>
+struct DefaultSparseGemm;
+
+////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Ampere Architecture (arch::Sm80, arch::OpClassTensorOp, row-major C)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator>
+struct DefaultSparseGemm<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, ElementC,
+                   layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
+                   arch::Sm80, ThreadblockShape, WarpShape, InstructionShape,
+                   EpilogueOutputOp, ThreadblockSwizzle, Stages, SplitKSerial,
+                   Operator> {
+  /// Define the threadblock-scoped sparse matrix multiply-accumulate
+  using Mma = typename cutlass::gemm::threadblock::DefaultSparseMma<
+      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80,
+      ThreadblockShape, WarpShape, InstructionShape, Stages,
+      Operator>::ThreadblockMma;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  /// Define the epilogue
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
+          ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
+          EpilogueOutputOp::kCount>::Epilogue;
+
+  /// Define the kernel-level GEMM operator.
+  using GemmKernel = kernel::SparseGemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Ada Architecture (arch::Sm89, arch::OpClassTensorOp, row-major C)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator>
+struct DefaultSparseGemm<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, ElementC,
+                   layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
+                   arch::Sm89, ThreadblockShape, WarpShape, InstructionShape,
+                   EpilogueOutputOp, ThreadblockSwizzle, Stages, SplitKSerial,
+                   Operator> {
+  /// Define the threadblock-scoped sparse matrix multiply-accumulate
+  using Mma = typename cutlass::gemm::threadblock::DefaultSparseMma<
+      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm89,
+      ThreadblockShape, WarpShape, InstructionShape, Stages,
+      Operator>::ThreadblockMma;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  /// Define the epilogue
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
+          ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
+          EpilogueOutputOp::kCount>::Epilogue;
+
+  /// Define the kernel-level GEMM operator.
+  using GemmKernel = kernel::SparseGemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse_universal.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse_universal.h
new file mode 100755
index 000000000..250a0e7b2
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse_universal.h
@@ -0,0 +1,141 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+      Default kernel-level Sparse GEMM definitions combine threadblock-scoped matrix multiply-add with
+      the appropriate threadblock-scoped epilogue.
+  
+      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
+      accommodated by exchanging A and B operands and assuming transposed layouts. Partial
+      specializations here choose 'device::GemmTransposed' to implement this functionality.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/complex.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/numeric_types.h"
+
+#include "cutlass/gemm/kernel/gemm_sparse_universal.h"
+#include "cutlass/gemm/kernel/default_gemm_sparse.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+//
+// Real-valued GEMM kernels
+//
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Layout type for C and D matrix operands
+    typename LayoutC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator
+>
+struct DefaultGemmSparseUniversal {
+  /// Base sparse GEMM resolved through DefaultSparseGemm; only its Mma and Epilogue are reused below
+  using DefaultGemmKernel = typename kernel::DefaultSparseGemm<
+    ElementA,
+    LayoutA,
+    kAlignmentA,
+    ElementB,
+    LayoutB,
+    kAlignmentB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    true, // SplitKSerial
+    Operator
+  >::GemmKernel;
+
+  /// Kernel-level operator: GemmSparseUniversal over the base kernel's Mma and Epilogue
+  using GemmKernel = kernel::GemmSparseUniversal<
+      typename DefaultGemmKernel::Mma,
+      typename DefaultGemmKernel::Epilogue,
+      ThreadblockSwizzle>;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse_universal_with_absmax.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse_universal_with_absmax.h
new file mode 100755
index 000000000..019390921
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse_universal_with_absmax.h
@@ -0,0 +1,144 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+      Default kernel-level Sparse GEMM definitions combine threadblock-scoped matrix multiply-add with
+      the appropriate threadblock-scoped epilogue.
+  
+      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
+      accommodated by exchanging A and B operands and assuming transposed layouts. Partial
+      specializations here choose 'device::GemmTransposed' to implement this functionality.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/complex.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/numeric_types.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_with_absmax.h"
+#include "cutlass/gemm/kernel/gemm_sparse_universal_with_absmax.h"
+#include "cutlass/gemm/kernel/default_gemm_sparse.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+//
+// Real-valued GEMM kernels
+//
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Layout type for C and D matrix operands
+    typename LayoutC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator
+>
+struct DefaultGemmSparseUniversalWithAbsmax {
+  /// Base sparse GEMM kernel; supplies the Mma and the epilogue traits reused below
+  using GemmBase = typename DefaultSparseGemm<
+    ElementA, LayoutA, kAlignmentA,
+    ElementB, LayoutB, kAlignmentB,
+    ElementC, LayoutC, ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    false, // SplitKSerial
+    Operator
+  >::GemmKernel;
+  /// Absmax epilogue built from the base epilogue's Shape, WarpMmaOperator, kPartitionsK and kElementsPerAccess
+  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWithAbsMax<
+    typename GemmBase::Epilogue::Shape,
+    typename GemmBase::Epilogue::WarpMmaOperator,
+    GemmBase::Epilogue::kPartitionsK,
+    ElementC,
+    typename EpilogueOutputOp::ElementAuxOutput,
+    ElementC,
+    EpilogueOutputOp,
+    GemmBase::Epilogue::kElementsPerAccess
+  >::Epilogue;
+  /// Kernel-level operator: the base Mma combined with the absmax epilogue defined above
+  using GemmKernel = kernel::GemmSparseUniversalWithAbsmax<
+      typename GemmBase::Mma, Epilogue, ThreadblockSwizzle>;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse_with_absmax.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse_with_absmax.h
new file mode 100755
index 000000000..30d063233
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse_with_absmax.h
@@ -0,0 +1,157 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief
+    Default configuration for a sparse GEMM with fused absolute-maximum calculations and scaling
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/wmma.h"
+
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/kernel/gemm.h"
+#include "cutlass/gemm/kernel/sparse_gemm_with_absmax.h"
+#include "cutlass/gemm/kernel/default_gemm_sparse.h"
+#include "cutlass/gemm/kernel/gemm_pipelined.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sparse_sm80.h"
+#include "cutlass/gemm/threadblock/default_sparse_mma.h"
+#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_with_absmax.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+#include "cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h"
+#endif //CUTLASS_ARCH_WMMA_ENABLED
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator>
+struct DefaultSparseGemmWithAbsmax {
+  /// Base sparse GEMM kernel; supplies the Mma and the epilogue traits reused below
+  using GemmBase = typename DefaultSparseGemm<
+    ElementA_, LayoutA_, kAlignmentA,
+    ElementB_, LayoutB_, kAlignmentB,
+    ElementC_, LayoutC_, ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    SplitKSerial,
+    Operator
+  >::GemmKernel;
+
+  // Define epilogue: absmax variant built from the base epilogue's Shape, WarpMmaOperator, kPartitionsK and kElementsPerAccess
+  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWithAbsMax<
+    typename GemmBase::Epilogue::Shape,
+    typename GemmBase::Epilogue::WarpMmaOperator,
+    GemmBase::Epilogue::kPartitionsK,
+    ElementC_,
+    typename EpilogueOutputOp::ElementAuxOutput,
+    ElementC_,
+    EpilogueOutputOp,
+    GemmBase::Epilogue::kElementsPerAccess
+  >::Epilogue;
+
+  /// Define the kernel-level GEMM operator.
+  using GemmKernel = kernel::SparseGemmWithAbsmax<typename GemmBase::Mma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse_with_visitor.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse_with_visitor.h
new file mode 100755
index 000000000..9d7f2c6f7
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse_with_visitor.h
@@ -0,0 +1,197 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Default sparse GEMM with visitor.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/wmma.h"
+
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/kernel/gemm.h"
+#include "cutlass/gemm/kernel/default_gemm_sparse.h"
+#include "cutlass/gemm/kernel/sparse_gemm_with_visitor.h"
+#include "cutlass/gemm/kernel/gemm_pipelined.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sparse_sm80.h"
+#include "cutlass/gemm/threadblock/default_sparse_mma.h"
+#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
+#include "cutlass/epilogue/threadblock/epilogue_with_visitor_callbacks.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+#include "cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h"
+#endif //CUTLASS_ARCH_WMMA_ENABLED
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Layout type for C and D matrix operands
+    typename LayoutC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue fusion callbacks consumed by the visitor epilogue
+    typename FusionCallbacks,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Number of stages used in the pipelined epilogue
+    int EpilogueStages = 1>
+struct DefaultSparseGemmWithVisitor;
+
+////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Ampere Architecture (arch::Sm80 with arch::OpClassTensorOp)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Layout type for C and D matrix operands
+    typename LayoutC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue fusion callbacks consumed by the visitor epilogue
+    typename FusionCallbacks,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Number of stages used in the pipelined epilogue
+    int EpilogueStages>
+struct DefaultSparseGemmWithVisitor<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
+                   ElementC, LayoutC, ElementAccumulator, arch::OpClassTensorOp,
+                   arch::Sm80, ThreadblockShape, WarpShape, InstructionShape,
+                   FusionCallbacks, ThreadblockSwizzle, Stages, Operator,
+                   EpilogueStages> {
+  /// Define the threadblock-scoped matrix multiply-accumulate (C layout is fixed to layout::RowMajor, not LayoutC)
+  using Mma = typename cutlass::gemm::threadblock::DefaultSparseMma<
+      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80,
+      ThreadblockShape, WarpShape, InstructionShape, Stages,
+      Operator>::ThreadblockMma;
+  /// Number of ElementC values that fit in 128 bits; passed as the second argument of LinearCombination below
+  static constexpr int kAlignmentC = 128 / sizeof_bits<ElementC>::value;
+  using ElementEpilogue = ElementAccumulator;
+  /// Ratio of the threadblock tile's K extent to the warp tile's K extent
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+  using EpilogueOutputOp =
+      typename epilogue::thread::LinearCombination<
+          ElementC, kAlignmentC,
+          ElementAccumulator, ElementEpilogue>;
+  using BaseEpilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
+          ThreadblockShape, typename Mma::Operator, kPartitionsK,
+          EpilogueOutputOp, EpilogueOutputOp::kCount>::Epilogue;
+
+  // Define epilogue: BaseEpilogue wrapped with FusionCallbacks and EpilogueStages
+  using Epilogue = cutlass::epilogue::threadblock::EpilogueWithVisitorCallbacks<
+      BaseEpilogue,
+      FusionCallbacks,
+      EpilogueStages>;
+
+  /// Define the kernel-level GEMM operator.
+  using GemmKernel = kernel::SparseGemmWithEpilogueVisitor<Mma, Epilogue, ThreadblockSwizzle>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_splitk_parallel.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_splitk_parallel.h
new file mode 100755
index 000000000..061bb7494
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_splitk_parallel.h
@@ -0,0 +1,136 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+      Default kernel-level GEMM definitions combine threadblock-scoped matrix multiply-add with
+      the appropriate threadblock-scoped epilogue.
+  
+      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
+      accommodated by exchanging A and B operands and assuming transposed layouts. Partial
+      specializations here choose 'device::GemmTransposed' to implement this functionality.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/kernel/default_gemm.h"
+#include "cutlass/gemm/kernel/gemm_splitk_parallel.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+  /// Element type for A matrix operand
+  typename ElementA_,
+  /// Layout type for A matrix operand
+  typename LayoutA_,
+  /// Access granularity of A matrix in units of elements
+  int kAlignmentA,
+  /// Element type for B matrix operand
+  typename ElementB_,
+  /// Layout type for B matrix operand
+  typename LayoutB_,
+  /// Access granularity of B matrix in units of elements
+  int kAlignmentB,
+  /// Element type for C and D matrix operands
+  typename ElementC_,
+  /// Layout type for C and D matrix operands
+  typename LayoutC_,
+  /// Element type for internal accumulation
+  typename ElementAccumulator,
+  /// Operator class tag
+  typename OperatorClass,
+  /// Tag indicating architecture to tune for
+  typename ArchTag,
+  /// Threadblock-level tile size (concept: GemmShape)
+  typename ThreadblockShape,
+  /// Warp-level tile size (concept: GemmShape)
+  typename WarpShape,
+  /// Instruction-level tile size (concept: GemmShape)
+  typename InstructionShape,
+  /// Epilogue output operator
+  typename EpilogueOutputOp,
+  /// Threadblock-level swizzling operator
+  typename ThreadblockSwizzle,
+  /// Number of stages used in the pipelined mainloop
+  int Stages,
+  /// Operation performed by GEMM
+  typename Operator
+>
+struct DefaultGemmSplitKParallel {
+
+  /// Define the threadblock-scoped matrix multiply-accumulate using the basic GEMM's
+  /// mainloop. ElementAccumulator is passed in the slot matching ElementC_; ElementC_ itself is unused here.
+  using Default = DefaultGemm<
+    ElementA_,
+    LayoutA_,
+    kAlignmentA,
+    ElementB_,
+    LayoutB_,
+    kAlignmentB,
+    ElementAccumulator,
+    LayoutC_,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    false, // presumably SplitKSerial -- TODO confirm against DefaultGemm's parameter order
+    Operator
+  >;
+
+  /// Define the matrix multiply operator
+  using Mma = typename Default::Mma;
+
+  /// Define the epilogue
+  using Epilogue = typename Default::Epilogue;
+
+  /// Define the kernel-level GEMM operator.
+  using GemmKernel = kernel::GemmSplitKParallel<Mma, Epilogue, ThreadblockSwizzle>;
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_streamk_with_broadcast.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_streamk_with_broadcast.h
new file mode 100755
index 000000000..c19fdb5e2
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_streamk_with_broadcast.h
@@ -0,0 +1,146 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+  \brief 
+    Defines a Stream-K GEMM that can broadcast a bias vector in the epilogue.
+    Similar structure to DefaultGemmWithBroadcast, but uses its own epilogue 
+    (DefaultStreamkEpilogueWithBroadcastTensorOp) and its own GEMM kernel 
+    (GemmStreamkWithFusedEpilogue).
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/gemm/kernel/gemm_streamk_with_fused_epilogue.h"
+#include "cutlass/gemm/kernel/default_gemm_universal.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h"
+#include "cutlass/epilogue/threadblock/epilogue_with_broadcast.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  /// Element type for A matrix operand
+  typename ElementA_,
+  /// Layout type for A matrix operand
+  typename LayoutA_,
+  /// Complex elementwise transformation on A operand
+  ComplexTransform TransformA,
+  /// Access granularity of A matrix in units of elements
+  int kAlignmentA,
+  /// Element type for B matrix operand
+  typename ElementB_,
+  /// Layout type for B matrix operand
+  typename LayoutB_,
+  /// Complex elementwise transformation on B operand
+  ComplexTransform TransformB,
+  /// Access granularity of B matrix in units of elements
+  int kAlignmentB,
+  /// Element type for C and D matrix operands
+  typename ElementC_,
+  /// Layout type for C and D matrix operands
+  typename LayoutC_,
+  /// Element type for internal accumulation
+  typename ElementAccumulator,
+  /// Operator class tag
+  typename OperatorClass,
+  /// Tag indicating architecture to tune for
+  typename ArchTag,
+  /// Threadblock-level tile size (concept: GemmShape)
+  typename ThreadblockShape,
+  /// Warp-level tile size (concept: GemmShape)
+  typename WarpShape,
+  /// Instruction tile size (concept: GemmShape)
+  typename InstructionShape,
+  /// Epilogue output operator - must satisfy concept of 'EpilogueWithBroadcastOp' (its ElementT and ElementVector are read below)
+  typename EpilogueOutputOp,
+  /// Threadblock-level swizzling operator
+  typename ThreadblockSwizzle,
+  /// Number of stages used in the pipelined mainloop
+  int Stages,
+  /// Operation performed by GEMM
+  typename Operator,
+  /// Defaults to void and is not referenced in this definition (presumably a hook for specializations)
+  typename Enable = void
+>
+struct DefaultGemmStreamkWithBroadcast {
+  // Stream-K GEMM with broadcast epilogue: reuses DefaultGemmUniversal's Mma and rebuilds the epilogue below.
+  using GemmBase = typename DefaultGemmUniversal<
+    ElementA_, LayoutA_, TransformA, kAlignmentA,
+    ElementB_, LayoutB_, TransformB, kAlignmentB,
+    ElementC_, LayoutC_, ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    Operator
+  >::GemmKernel;
+
+  // Replace the base epilogue, carrying over its Shape, WarpMmaOperator, kPartitionsK and kElementsPerAccess
+  using Epilogue = typename cutlass::epilogue::threadblock::DefaultStreamkEpilogueWithBroadcastTensorOp<
+    typename GemmBase::Epilogue::Shape,
+    typename GemmBase::Epilogue::WarpMmaOperator,
+    GemmBase::Epilogue::kPartitionsK,
+    ElementC_,
+    typename EpilogueOutputOp::ElementT,
+    typename EpilogueOutputOp::ElementVector,
+    EpilogueOutputOp,
+    GemmBase::Epilogue::kElementsPerAccess
+  >::Epilogue;
+
+  // Compose the GEMM kernel from the base Mma, the epilogue defined above, and the swizzle
+  using GemmKernel = GemmStreamkWithFusedEpilogue<
+    typename GemmBase::Mma,
+    Epilogue,
+    ThreadblockSwizzle
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_universal.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_universal.h
new file mode 100755
index 000000000..ed7951be5
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_universal.h
@@ -0,0 +1,396 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief
+      Default kernel-level GEMM definitions combine threadblock-scoped matrix multiply-add with
+      the appropriate threadblock-scoped epilogue.
+
+      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
+      accommodated by exchanging A and B operands and assuming transposed layouts. Partial
+      specializations here choose 'device::GemmTransposed' to implement this functionality.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/complex.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/numeric_types.h"
+
+#include "cutlass/gemm/kernel/gemm_universal.h"
+#include "cutlass/gemm/kernel/gemm_universal_streamk.h"
+#include "cutlass/gemm/kernel/default_gemm.h"
+#include "cutlass/gemm/kernel/default_gemm_complex.h"
+
+#include "cutlass/layout/permute.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Use zfill or predicate for out-of-bound cp.async
+    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone,
+    /// Gather operand A by using an index array
+    bool GatherA = false,
+    /// Gather operand B by using an index array
+    bool GatherB = false,
+    /// Scatter result D by using an index array
+    bool ScatterD = false,
+    /// Permute result D
+    typename PermuteDLayout = layout::NoPermute,
+    /// Permute operand A
+    typename PermuteALayout_ = layout::NoPermute,
+    /// Permute operand B
+    typename PermuteBLayout_ = layout::NoPermute,
+    /// SFINAE selector: the partial specializations constrain it with platform::enable_if on is_complex<ElementAccumulator>
+    typename Enable = void
+    >
+struct DefaultGemmUniversal;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Real-valued GEMM kernels
+//
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Layout type for C and D matrix operands
+    typename LayoutC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Use zfill or predicate for out-of-bound cp.async
+    SharedMemoryClearOption SharedMemoryClear,
+    /// Gather operand A by using an index array
+    bool GatherA,
+    /// Gather operand B by using an index array
+    bool GatherB,
+    /// Scatter result D by using an index array
+    bool ScatterD,
+    /// Permute result D
+    typename PermuteDLayout,
+    /// Permute operand A
+    typename PermuteALayout,
+    /// Permute operand B
+    typename PermuteBLayout
+>
+struct DefaultGemmUniversal<
+  ElementA,
+  LayoutA,
+  ComplexTransform::kNone,   // transform A
+  kAlignmentA,
+  ElementB,
+  LayoutB,
+  ComplexTransform::kNone,   // transform B
+  kAlignmentB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  OperatorClass,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  Operator,
+  SharedMemoryClear,
+  GatherA,
+  GatherB,
+  ScatterD,
+  PermuteDLayout,
+  PermuteALayout,
+  PermuteBLayout,
+  typename platform::enable_if< ! cutlass::is_complex<ElementAccumulator>::value>::type
+> {
+  // Real-valued path: matches only when both transforms are kNone and ElementAccumulator is not complex.
+  using DefaultGemmKernel = typename kernel::DefaultGemm<
+    ElementA,
+    LayoutA,
+    kAlignmentA,
+    ElementB,
+    LayoutB,
+    kAlignmentB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    true,   // NOTE(review): presumably DefaultGemm's split-K-serial flag - confirm in default_gemm.h
+    Operator,
+    SharedMemoryClear,
+    GatherA,
+    GatherB,
+    ScatterD,
+    PermuteDLayout,
+    PermuteALayout,
+    PermuteBLayout
+  >::GemmKernel;
+
+  /// Universal kernel without StreamkFeature member type (fallback when the specialization below does not match)
+  template <class SwizzleT, class Enable = void>
+  class SelectBase :
+    public kernel::GemmUniversal<
+      typename DefaultGemmKernel::Mma,
+      typename DefaultGemmKernel::Epilogue,
+      SwizzleT>
+  {};
+
+  /// Universal kernel with StreamkFeature member type (matches only if SwizzleT::StreamkFeature exists and is void)
+  template <class SwizzleT>
+  class SelectBase<SwizzleT, typename SwizzleT::StreamkFeature> :
+    public kernel::GemmUniversalStreamk<
+      typename DefaultGemmKernel::Mma,
+      typename DefaultGemmKernel::Epilogue,
+      SwizzleT>
+  {};
+
+  /// Select kernel by ThreadblockSwizzle's support for StreamkFeature
+  using GemmKernel = SelectBase<ThreadblockSwizzle>;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+//
+// Complex-valued GEMM kernels
+//
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Layout type for C and D matrix operands
+    typename LayoutC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Use zfill or predicate for out-of-bound cp.async
+    SharedMemoryClearOption SharedMemoryClear
+  >
+struct DefaultGemmUniversal<
+  ElementA,
+  LayoutA,
+  TransformA,
+  kAlignmentA,
+  ElementB,
+  LayoutB,
+  TransformB,
+  kAlignmentB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  OperatorClass,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  Operator,
+  SharedMemoryClear,
+  false,
+  false,
+  false,
+  layout::NoPermute,
+  layout::NoPermute,
+  layout::NoPermute,
+  typename platform::enable_if<cutlass::is_complex<ElementAccumulator>::value>::type
+> {
+  // Complex-valued path: matches only when ElementAccumulator is complex and gather/scatter/permute are all off.
+  using DefaultGemmKernel = typename kernel::DefaultGemmComplex<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    TransformA,
+    TransformB,
+    Operator,
+    false   // NOTE(review): presumably DefaultGemmComplex's split-K-serial flag - confirm in default_gemm_complex.h
+  >::GemmKernel;
+
+  /// Universal kernel without StreamkFeature member type (fallback when the specialization below does not match)
+  template <class SwizzleT, class Enable = void>
+  class SelectBase :
+    public kernel::GemmUniversal<
+      typename DefaultGemmKernel::Mma,
+      typename DefaultGemmKernel::Epilogue,
+      SwizzleT>
+  {};
+
+  /// Universal kernel with StreamkFeature member type (matches only if SwizzleT::StreamkFeature exists and is void)
+  template <class SwizzleT>
+  class SelectBase<SwizzleT, typename SwizzleT::StreamkFeature> :
+    public kernel::GemmUniversalStreamk<
+      typename DefaultGemmKernel::Mma,
+      typename DefaultGemmKernel::Epilogue,
+      SwizzleT>
+  {};
+
+  /// Select kernel by ThreadblockSwizzle's support for StreamkFeature
+  using GemmKernel = SelectBase<ThreadblockSwizzle>;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_universal_with_visitor.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_universal_with_visitor.h
new file mode 100755
index 000000000..a3c69f2dc
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_universal_with_visitor.h
@@ -0,0 +1,157 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+  \brief
+    Default configuration for a GEMM with fused epilogue visitor callbacks
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/kernel/default_gemm_universal.h"
+
+#include "cutlass/gemm/kernel/gemm_universal_with_visitor.h"
+#include "cutlass/gemm/kernel/gemm_universal_with_visitor_streamk.h"
+#include "cutlass/epilogue/threadblock/epilogue_with_visitor_callbacks.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  /// Element type for A matrix operand
+  typename ElementA_,
+  /// Layout type for A matrix operand
+  typename LayoutA_,
+  /// Complex elementwise transformation on A operand
+  ComplexTransform TransformA,
+  /// Access granularity of A matrix in units of elements
+  int kAlignmentA,
+  /// Element type for B matrix operand
+  typename ElementB_,
+  /// Layout type for B matrix operand
+  typename LayoutB_,
+  /// Complex elementwise transformation on B operand
+  ComplexTransform TransformB,
+  /// Access granularity of B matrix in units of elements
+  int kAlignmentB,
+  /// Element type for C and D matrix operands
+  typename ElementC_,
+  /// Layout type for C and D matrix operands
+  typename LayoutC_,
+  /// Access granularity of C matrix in unit of elements
+  int kAlignmentC,
+  /// Element type for internal accumulation
+  typename ElementAccumulator,
+  /// Element type for epilogue computation
+  typename ElementEpilogue,
+  /// Operator class tag
+  typename OperatorClass,
+  /// Tag indicating architecture to tune for
+  typename ArchTag,
+  /// Threadblock-level tile size (concept: GemmShape)
+  typename ThreadblockShape,
+  /// Warp-level tile size (concept: GemmShape)
+  typename WarpShape,
+  /// Instruction tile size (concept: GemmShape)
+  typename InstructionShape,
+  /// Fusion callbacks handed to EpilogueWithVisitorCallbacks
+  typename FusionCallbacks,
+  /// Threadblock-level swizzling operator
+  typename ThreadblockSwizzle,
+  /// Number of stages used in the pipelined mainloop
+  int Stages,
+  /// Operation performed by GEMM
+  typename Operator,
+  /// Number of stages used in the pipelined epilogue
+  int EpilogueStages = 1
+>
+struct DefaultGemmWithVisitor {
+  // Base universal GEMM instantiated with a LinearCombination output op; its Mma and Epilogue are reused below.
+  using GemmBase = typename DefaultGemmUniversal<
+    ElementA_, LayoutA_, TransformA, kAlignmentA, 
+    ElementB_, LayoutB_, TransformB, kAlignmentB,
+    ElementC_, LayoutC_, ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    epilogue::thread::LinearCombination<
+        ElementC_, kAlignmentC, 
+        ElementAccumulator, ElementEpilogue 
+    >,
+    ThreadblockSwizzle,
+    Stages,
+    Operator
+  >::GemmKernel;
+
+  // Define epilogue: wrap the base epilogue with the fusion callbacks and the epilogue stage count
+  using Epilogue = cutlass::epilogue::threadblock::EpilogueWithVisitorCallbacks<
+      typename GemmBase::Epilogue,
+      FusionCallbacks,
+      EpilogueStages
+  >;
+
+  /// GemmWithVisitor without StreamkFeature member type (fallback when the specialization below does not match)
+  template <class SwizzleT, class Enable = void>
+  class SelectBase :
+    public GemmWithEpilogueVisitor<
+      typename GemmBase::Mma,
+      Epilogue,
+      SwizzleT>
+  {};
+
+  /// GemmWithVisitor with StreamkFeature member type (matches only if SwizzleT::StreamkFeature exists and is void)
+  template <class SwizzleT>
+  class SelectBase<SwizzleT, typename SwizzleT::StreamkFeature> :
+    public GemmWithEpilogueVisitorStreamk<
+      typename GemmBase::Mma,
+      Epilogue,
+      SwizzleT>
+  {};
+
+  /// Select kernel by ThreadblockSwizzle's support for StreamkFeature
+  using GemmKernel = SelectBase<ThreadblockSwizzle>;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_with_absmax.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_with_absmax.h
new file mode 100755
index 000000000..3fd643e7e
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_with_absmax.h
@@ -0,0 +1,143 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+  \brief
+    Default configuration for a GEMM with fused absolute-maximum calculations and scaling
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/gemm/kernel/gemm_with_absmax.h"
+#include "cutlass/gemm/kernel/default_gemm_universal.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_with_absmax.h"
+#include "cutlass/epilogue/threadblock/epilogue_with_absmax.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  /// Element type for A matrix operand
+  typename ElementA_,
+  /// Layout type for A matrix operand
+  typename LayoutA_,
+  /// Complex elementwise transformation on A operand
+  ComplexTransform TransformA,
+  /// Access granularity of A matrix in units of elements
+  int kAlignmentA,
+  /// Element type for B matrix operand
+  typename ElementB_,
+  /// Layout type for B matrix operand
+  typename LayoutB_,
+  /// Complex elementwise transformation on B operand
+  ComplexTransform TransformB,
+  /// Access granularity of B matrix in units of elements
+  int kAlignmentB,
+  /// Element type for C and D matrix operands
+  typename ElementC_,
+  /// Layout type for C and D matrix operands
+  typename LayoutC_,
+  /// Element type for internal accumulation
+  typename ElementAccumulator,
+  /// Operator class tag
+  typename OperatorClass,
+  /// Tag indicating architecture to tune for
+  typename ArchTag,
+  /// Threadblock-level tile size (concept: GemmShape)
+  typename ThreadblockShape,
+  /// Warp-level tile size (concept: GemmShape)
+  typename WarpShape,
+  /// Instruction tile size (concept: GemmShape)
+  typename InstructionShape,
+  /// Epilogue output operator (its ElementAuxOutput member type is read below)
+  typename EpilogueOutputOp,
+  /// Threadblock-level swizzling operator
+  typename ThreadblockSwizzle,
+  /// Number of stages used in the pipelined mainloop
+  int Stages,
+  /// Operation performed by GEMM
+  typename Operator,
+  /// Defaults to void and is not referenced in this definition (presumably a hook for specializations)
+  typename Enable = void
+>
+struct DefaultGemmWithAbsMax {
+  // GEMM with fused absolute-maximum epilogue: reuses DefaultGemmUniversal's Mma and rebuilds the epilogue below.
+  using GemmBase = typename DefaultGemmUniversal<
+    ElementA_, LayoutA_, TransformA, kAlignmentA,
+    ElementB_, LayoutB_, TransformB, kAlignmentB,
+    ElementC_, LayoutC_, ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    Operator
+  >::GemmKernel;
+
+  // Define epilogue, carrying over the base epilogue's Shape, WarpMmaOperator, kPartitionsK and kElementsPerAccess
+  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWithAbsMax<
+    typename GemmBase::Epilogue::Shape,
+    typename GemmBase::Epilogue::WarpMmaOperator,
+    GemmBase::Epilogue::kPartitionsK,
+    ElementC_,
+    typename EpilogueOutputOp::ElementAuxOutput,
+    ElementC_,
+    EpilogueOutputOp,
+    GemmBase::Epilogue::kElementsPerAccess
+  >::Epilogue;
+
+  // Compose the GEMM kernel from the base Mma, the epilogue defined above, and the swizzle
+  using GemmKernel = GemmWithAbsMax<
+    typename GemmBase::Mma,
+    Epilogue,
+    ThreadblockSwizzle
+  >;
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_with_broadcast.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_with_broadcast.h
new file mode 100755
index 000000000..e95c25610
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_with_broadcast.h
@@ -0,0 +1,243 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+  \brief 
+    Defines a GEMM with Broadcast based on an existing UniversalGemm kernel.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/gemm/kernel/gemm_with_fused_epilogue.h"
+#include "cutlass/gemm/kernel/default_gemm_universal.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h"
+#include "cutlass/epilogue/threadblock/epilogue_with_broadcast.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  /// Element type for A matrix operand
+  typename ElementA_,
+  /// Layout type for A matrix operand
+  typename LayoutA_,
+  /// Complex elementwise transformation on A operand
+  ComplexTransform TransformA,
+  /// Access granularity of A matrix in units of elements
+  int kAlignmentA,
+  /// Element type for B matrix operand
+  typename ElementB_,
+  /// Layout type for B matrix operand
+  typename LayoutB_,
+  /// Complex elementwise transformation on B operand
+  ComplexTransform TransformB,
+  /// Access granularity of B matrix in units of elements
+  int kAlignmentB,
+  /// Element type for C and D matrix operands
+  typename ElementC_,
+  /// Layout type for C and D matrix operands
+  typename LayoutC_,
+  /// Element type for internal accumulation
+  typename ElementAccumulator,
+  /// Operator class tag
+  typename OperatorClass,
+  /// Tag indicating architecture to tune for
+  typename ArchTag,
+  /// Threadblock-level tile size (concept: GemmShape)
+  typename ThreadblockShape,
+  /// Warp-level tile size (concept: GemmShape)
+  typename WarpShape,
+  /// Instruction-level tile size (concept: GemmShape)
+  typename InstructionShape,
+  /// Epilogue output operator - must satisfy concept of 'EpilogueWithBroadcastOp'
+  typename EpilogueOutputOp,
+  /// Threadblock-level swizzling operator
+  typename ThreadblockSwizzle,
+  /// Number of stages used in the pipelined mainloop
+  int Stages,
+  /// Operation performed by GEMM
+  typename Operator,
+  /// Trailing parameter defaulting to void; not referenced inside this primary template
+  typename Enable = void
+>
+struct DefaultGemmWithBroadcast {
+  // Builds GemmKernel by reusing the Mma of a DefaultGemmUniversal kernel and swapping in a broadcast epilogue.
+  using GemmBase = typename DefaultGemmUniversal<
+    ElementA_, LayoutA_, TransformA, kAlignmentA,
+    ElementB_, LayoutB_, TransformB, kAlignmentB,
+    ElementC_, LayoutC_, ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    Operator
+  >::GemmKernel;
+
+  // Define epilogue: shape, warp MMA operator, partitions and access width come from GemmBase; ElementT / ElementVector from EpilogueOutputOp
+  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWithBroadcastTensorOp<
+    typename GemmBase::Epilogue::Shape,
+    typename GemmBase::Epilogue::WarpMmaOperator,
+    GemmBase::Epilogue::kPartitionsK,
+    ElementC_,
+    typename EpilogueOutputOp::ElementT,
+    typename EpilogueOutputOp::ElementVector,
+    EpilogueOutputOp,
+    GemmBase::Epilogue::kElementsPerAccess
+  >::Epilogue;
+
+  // Compose the GEMM kernel from GemmBase's Mma, the epilogue defined above, and the swizzle
+  using GemmKernel = GemmWithFusedEpilogue<
+    typename GemmBase::Mma,
+    Epilogue,
+    ThreadblockSwizzle
+  >;
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization: ArchTag = cutlass::arch::Sm70
+/// Same composition as the primary template, except the epilogue is built with
+/// DefaultEpilogueWithBroadcastVoltaTensorOp instead of DefaultEpilogueWithBroadcastTensorOp.
+template <
+  /// Element type for A matrix operand
+  typename ElementA_,
+  /// Layout type for A matrix operand
+  typename LayoutA_,
+  /// Complex elementwise transformation on A operand
+  ComplexTransform TransformA,
+  /// Access granularity of A matrix in units of elements
+  int kAlignmentA,
+  /// Element type for B matrix operand
+  typename ElementB_,
+  /// Layout type for B matrix operand
+  typename LayoutB_,
+  /// Complex elementwise transformation on B operand
+  ComplexTransform TransformB,
+  /// Access granularity of B matrix in units of elements
+  int kAlignmentB,
+  /// Element type for C and D matrix operands
+  typename ElementC_,
+  /// Layout type for C and D matrix operands
+  typename LayoutC_,
+  /// Element type for internal accumulation
+  typename ElementAccumulator,
+  /// Operator class tag
+  typename OperatorClass,
+  /// Threadblock-level tile size (concept: GemmShape)
+  typename ThreadblockShape,
+  /// Warp-level tile size (concept: GemmShape)
+  typename WarpShape,
+  /// Instruction-level tile size (concept: GemmShape)
+  typename InstructionShape,
+  /// Epilogue output operator - must satisfy concept of 'EpilogueWithBroadcastOp'
+  typename EpilogueOutputOp,
+  /// Threadblock-level swizzling operator
+  typename ThreadblockSwizzle,
+  /// Number of stages used in the pipelined mainloop
+  int Stages,
+  /// Operation performed by GEMM
+  typename Operator,
+  /// Forwarded unchanged into the specialized argument list below
+  typename Enable
+>
+struct DefaultGemmWithBroadcast<
+  ElementA_, LayoutA_, TransformA, kAlignmentA, 
+  ElementB_, LayoutB_, TransformB, kAlignmentB,
+  ElementC_, LayoutC_,
+  ElementAccumulator,
+  OperatorClass,
+  cutlass::arch::Sm70,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  Operator,
+  Enable
+  > {
+  // Base universal GEMM kernel instantiated with the architecture tag fixed to cutlass::arch::Sm70.
+  using GemmBase = typename DefaultGemmUniversal<
+    ElementA_, LayoutA_, TransformA, kAlignmentA,
+    ElementB_, LayoutB_, TransformB, kAlignmentB,
+    ElementC_, LayoutC_, ElementAccumulator,
+    OperatorClass,
+    cutlass::arch::Sm70,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    Operator
+  >::GemmKernel;
+
+  // Define epilogue using the Volta variant; ElementT / ElementVector are taken from EpilogueOutputOp
+  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWithBroadcastVoltaTensorOp<
+    typename GemmBase::Epilogue::Shape,
+    typename GemmBase::Epilogue::WarpMmaOperator,
+    GemmBase::Epilogue::kPartitionsK,
+    ElementC_,
+    typename EpilogueOutputOp::ElementT,
+    typename EpilogueOutputOp::ElementVector,
+    EpilogueOutputOp,
+    GemmBase::Epilogue::kElementsPerAccess
+  >::Epilogue;
+
+  // Compose the GEMM kernel from GemmBase's Mma, the epilogue defined above, and the swizzle
+  using GemmKernel = GemmWithFusedEpilogue<
+    typename GemmBase::Mma,
+    Epilogue,
+    ThreadblockSwizzle
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_with_k_reduction.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_with_k_reduction.h
new file mode 100755
index 000000000..ca4c2cba6
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_with_k_reduction.h
@@ -0,0 +1,150 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+      Default kernel-level GEMM definitions combine threadblock-scoped matrix multiply-add with
+      the appropriate threadblock-scoped epilogue.
+  
+      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
+      accommodated by exchanging A and B operands and assuming transposed layouts. Partial
+      specializations here choose 'device::GemmTransposed' to implement this functionality.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/wmma.h"
+
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/kernel/gemm_with_k_reduction.h"
+#include "cutlass/gemm/threadblock/default_mma_with_reduction.h"
+#include "cutlass/gemm/threadblock/default_mma_core_with_reduction.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
+#include "cutlass/epilogue/threadblock/epilogue_gemm_k_reduction.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Layout type for C and D matrix operands
+    typename LayoutC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Reduce A or B along the K dimension
+    bool ReduceKForA_,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Use zfill or predicate for out-of-bound cp.async
+    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone,
+    /// Trailing parameter defaulting to void; not referenced inside this template
+    typename Enable = void>
+struct DefaultGemmWithKReduction {
+  // ReduceKForA_ is used as given when LayoutC is RowMajor and inverted for any other LayoutC.
+  static const bool kReduceKForA = (platform::is_same<LayoutC, cutlass::layout::RowMajor>::value) ? ReduceKForA_ : !ReduceKForA_;
+
+  /// Define the threadblock-scoped matrix multiply-accumulate. NOTE(review): arch::OpClassTensorOp and arch::Sm80 are hard-coded here; OperatorClass, ArchTag, TransformA and TransformB are not referenced in this body.
+  using Mma = typename cutlass::gemm::threadblock::DefaultMmaWithReduction<
+      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, kReduceKForA, arch::Sm80,
+      ThreadblockShape, WarpShape, InstructionShape, Stages,
+      Operator, false, SharedMemoryClear>::ThreadblockMma;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  /// Define the epilogue for the GEMM output (access width taken from EpilogueOutputOp::kCount)
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
+          ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
+          EpilogueOutputOp::kCount>::Epilogue;
+
+  /// Define the epilogue of the reduction vector
+  using EpilogueGemmKReduction =
+      typename cutlass::epilogue::threadblock::EpilogueGemmKReduction<
+          ElementAccumulator, ElementC, ThreadblockShape, typename Mma::Operator, kReduceKForA>;
+
+  /// Define the kernel-level GEMM operator from the Mma, both epilogues, and the swizzle.
+  using GemmKernel = kernel::GemmWithKReduction<Mma, Epilogue, EpilogueGemmKReduction, ThreadblockSwizzle>;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_with_reduction.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_with_reduction.h
new file mode 100755
index 000000000..1a578f09f
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_with_reduction.h
@@ -0,0 +1,246 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+  \brief 
+    Defines a GEMM with Reduction based on an existing UniversalGemm kernel.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/gemm/kernel/gemm_with_fused_epilogue.h"
+#include "cutlass/gemm/kernel/default_gemm_universal.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_with_reduction.h"
+#include "cutlass/epilogue/threadblock/epilogue_with_reduction.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  /// Element type for A matrix operand
+  typename ElementA_,
+  /// Layout type for A matrix operand
+  typename LayoutA_,
+  /// Complex elementwise transformation on A operand
+  ComplexTransform TransformA,
+  /// Access granularity of A matrix in units of elements
+  int kAlignmentA,
+  /// Element type for B matrix operand
+  typename ElementB_,
+  /// Layout type for B matrix operand
+  typename LayoutB_,
+  /// Complex elementwise transformation on B operand
+  ComplexTransform TransformB,
+  /// Access granularity of B matrix in units of elements
+  int kAlignmentB,
+  /// Element type for C and D matrix operands
+  typename ElementC_,
+  /// Layout type for C and D matrix operands
+  typename LayoutC_,
+  /// Element type for internal accumulation
+  typename ElementAccumulator,
+  /// Operator class tag
+  typename OperatorClass,
+  /// Tag indicating architecture to tune for
+  typename ArchTag,
+  /// Threadblock-level tile size (concept: GemmShape)
+  typename ThreadblockShape,
+  /// Warp-level tile size (concept: GemmShape)
+  typename WarpShape,
+  /// Instruction-level tile size (concept: GemmShape)
+  typename InstructionShape,
+  /// Epilogue output operator
+  typename EpilogueOutputOp,
+  /// Epilogue reduction operator
+  typename EpilogueReductionOp,
+  /// Threadblock-level swizzling operator
+  typename ThreadblockSwizzle,
+  /// Number of stages used in the pipelined mainloop
+  int Stages,
+  /// Operation performed by GEMM
+  typename Operator,
+  /// Trailing parameter defaulting to void; not referenced inside this primary template
+  typename Enable = void
+>
+struct DefaultGemmWithReduction {
+  // Builds GemmKernel by reusing the Mma of a DefaultGemmUniversal kernel (instantiated with kClearLastStage) and swapping in a reduction epilogue.
+  using GemmBase = typename DefaultGemmUniversal<
+    ElementA_, LayoutA_, TransformA, kAlignmentA,
+    ElementB_, LayoutB_, TransformB, kAlignmentB,
+    ElementC_, LayoutC_, ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    Operator,
+    SharedMemoryClearOption::kClearLastStage
+  >::GemmKernel;
+
+  // Define epilogue: shape, warp MMA operator, partitions and access width come from GemmBase's epilogue
+  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWithReductionTensorOp<
+    typename GemmBase::Epilogue::Shape,
+    typename GemmBase::Epilogue::WarpMmaOperator,
+    GemmBase::Epilogue::kPartitionsK,
+    ElementC_,
+    EpilogueOutputOp,
+    EpilogueReductionOp,
+    GemmBase::Epilogue::kElementsPerAccess
+  >::Epilogue;
+
+  // Compose the GEMM kernel from GemmBase's Mma, the epilogue defined above, and the swizzle
+  using GemmKernel = GemmWithFusedEpilogue<
+    typename GemmBase::Mma,
+    Epilogue,
+    ThreadblockSwizzle
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization: ArchTag = cutlass::arch::Sm70
+/// Same composition as the primary template, except the epilogue is built with
+/// DefaultEpilogueWithReductionVoltaTensorOp and GemmBase omits the SharedMemoryClearOption argument.
+template <
+  /// Element type for A matrix operand
+  typename ElementA_,
+  /// Layout type for A matrix operand
+  typename LayoutA_,
+  /// Complex elementwise transformation on A operand
+  ComplexTransform TransformA,
+  /// Access granularity of A matrix in units of elements
+  int kAlignmentA,
+  /// Element type for B matrix operand
+  typename ElementB_,
+  /// Layout type for B matrix operand
+  typename LayoutB_,
+  /// Complex elementwise transformation on B operand
+  ComplexTransform TransformB,
+  /// Access granularity of B matrix in units of elements
+  int kAlignmentB,
+  /// Element type for C and D matrix operands
+  typename ElementC_,
+  /// Layout type for C and D matrix operands
+  typename LayoutC_,
+  /// Element type for internal accumulation
+  typename ElementAccumulator,
+  /// Operator class tag
+  typename OperatorClass,
+  /// Threadblock-level tile size (concept: GemmShape)
+  typename ThreadblockShape,
+  /// Warp-level tile size (concept: GemmShape)
+  typename WarpShape,
+  /// Instruction-level tile size (concept: GemmShape)
+  typename InstructionShape,
+  /// Epilogue output operator
+  typename EpilogueOutputOp,
+  /// Epilogue reduction operator
+  typename EpilogueReductionOp,
+  /// Threadblock-level swizzling operator
+  typename ThreadblockSwizzle,
+  /// Number of stages used in the pipelined mainloop
+  int Stages,
+  /// Operation performed by GEMM
+  typename Operator,
+  /// Forwarded unchanged into the specialized argument list below
+  typename Enable
+>
+struct DefaultGemmWithReduction<
+  ElementA_, LayoutA_, TransformA, kAlignmentA, 
+  ElementB_, LayoutB_, TransformB, kAlignmentB,
+  ElementC_, LayoutC_,
+  ElementAccumulator,
+  OperatorClass,
+  cutlass::arch::Sm70,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  EpilogueReductionOp,
+  ThreadblockSwizzle,
+  Stages,
+  Operator,
+  Enable
+  >  {
+  // Base universal GEMM kernel instantiated with the architecture tag fixed to cutlass::arch::Sm70.
+  using GemmBase = typename DefaultGemmUniversal<
+    ElementA_, LayoutA_, TransformA, kAlignmentA,
+    ElementB_, LayoutB_, TransformB, kAlignmentB,
+    ElementC_, LayoutC_, ElementAccumulator,
+    OperatorClass,
+    cutlass::arch::Sm70,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    Operator
+  >::GemmKernel;
+
+  // Define epilogue using the Volta variant; shape, warp MMA operator, partitions and access width come from GemmBase
+  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWithReductionVoltaTensorOp<
+    typename GemmBase::Epilogue::Shape,
+    typename GemmBase::Epilogue::WarpMmaOperator,
+    GemmBase::Epilogue::kPartitionsK,
+    ElementC_,
+    EpilogueOutputOp,
+    EpilogueReductionOp,
+    GemmBase::Epilogue::kElementsPerAccess
+  >::Epilogue;
+
+  // Compose the GEMM kernel from GemmBase's Mma, the epilogue defined above, and the swizzle
+  using GemmKernel = GemmWithFusedEpilogue<
+    typename GemmBase::Mma,
+    Epilogue,
+    ThreadblockSwizzle
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemv.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemv.h
new file mode 100755
index 000000000..db6306401
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemv.h
@@ -0,0 +1,132 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include "cutlass/gemm/threadblock/gemv.h"
+#include "cutlass/gemm/threadblock/default_gemv_core.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Size of the ThreadBlock tile - concept: gemm::GemmShape<>
+    typename ThreadBlockShape_,
+    /// Size of the per-thread shape - concept: gemm::GemmShape<>
+    typename ThreadShape_,
+    /// Data type of A elements
+    typename ElementA_,
+    /// Layout of A matrix (concept: MatrixLayout)
+    typename LayoutA_,
+    /// Data type of B elements
+    typename ElementB_,
+    /// Layout of B matrix (concept: MatrixLayout)
+    typename LayoutB_,
+    /// Element type of C/D matrix
+    typename ElementCD_,
+    /// Layout of C/D matrix (concept: MatrixLayout)
+    typename LayoutCD_,
+    /// Data type of the accumulator (defaults to the C/D element type)
+    typename ElementAccumulator_ = ElementCD_>
+struct DefaultGemv {
+  // Collects the types for a GEMV kernel: re-exported parameters, core, threadblock gemv, iterators and swizzle.
+  /// Shape of Threadblock-level matrix operation (concept: GemmShape)
+  using ThreadBlockShape = ThreadBlockShape_;
+
+  /// Shape of the per-thread matrix operation (concept: GemmShape)
+  using ThreadShape = ThreadShape_;
+
+  /// Data type of multiplicand A
+  using ElementA = ElementA_;
+
+  /// Layout of multiplicand A
+  using LayoutA = LayoutA_;
+
+  /// Data type of multiplicand B
+  using ElementB = ElementB_;
+
+  /// Layout of multiplicand B
+  using LayoutB = LayoutB_;
+
+  /// Data type of accumulators
+  using ElementAccumulator = ElementAccumulator_;
+
+  /// Layout of accumulators (same as C/D)
+  using LayoutAccumulator = LayoutCD_;
+
+  /// Data type of input/output matrix C/D
+  using ElementCD = ElementCD_;
+
+  /// Layout of input/output matrix C/D
+  using LayoutCD = LayoutCD_;
+
+  // Define the core components
+  using Core = typename cutlass::gemm::threadblock::DefaultGemvCore<
+      ThreadBlockShape, ThreadShape, ElementA, LayoutA, ElementB, LayoutB,
+      ElementAccumulator, LayoutAccumulator>;
+
+  // Define the threadblock-scoped gemv
+  using ThreadBlockGemv = cutlass::gemm::threadblock::Gemv<Core>;
+
+  // Iterator for multiplicand A
+  using IteratorA = typename ThreadBlockGemv::IteratorA;
+
+  // Iterator for multiplicand B
+  using IteratorB = typename ThreadBlockGemv::IteratorB;
+
+  /// Policy for the iterator that reads/writes C/D: ThreadContiguous over an (N, M) shape when LayoutCD is RowMajor, ThreadStrided over (M, N) otherwise
+  using IteratorPolicyCD = typename platform::conditional<
+        platform::is_same<LayoutCD, layout::RowMajor>::value,
+        cutlass::transform::PitchLinearTilePolicyStripminedThreadContiguous<
+          layout::PitchLinearShape<ThreadBlockShape::kN, ThreadBlockShape::kM>, Core::kThreadsPerN, ThreadShape::kN>,
+        cutlass::transform::PitchLinearTilePolicyStripminedThreadStrided<
+          layout::PitchLinearShape<ThreadBlockShape::kM, ThreadBlockShape::kN>, Core::kThreadsPerN, ThreadShape::kM>>::type;
+
+  /// Iterator that reads/writes C/D over an M-by-N threadblock tile using IteratorPolicyCD
+  using IteratorCD = cutlass::transform::threadblock::PredicatedTileIterator<
+   cutlass::MatrixShape<ThreadBlockShape::kM, ThreadBlockShape::kN>, ElementCD, LayoutCD, 0, IteratorPolicyCD>;
+
+  /// Fragment storage for C/D
+  using FragmentCD = typename IteratorCD::Fragment;
+
+  // Define the threadblock swizzle
+  using ThreadBlockSwizzle = cutlass::gemm::threadblock::GemvBatchedStridedThreadblockDefaultSwizzle;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_2k.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_2k.h
new file mode 100755
index 000000000..63400ef40
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_2k.h
@@ -0,0 +1,285 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+      Default kernel-level Rank2K definitions combine threadblock-scoped matrix multiply-add with
+      the appropriate threadblock-scoped epilogue.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/arch/wmma.h"
+
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/kernel/rank_2k_universal.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+#include "cutlass/gemm/threadblock/default_mma.h"
+#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op_blas3.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+#include "cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h"
+#endif //CUTLASS_ARCH_WMMA_ENABLED
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Blas3 computation mode (defaults to BlasMode::kSymmetric when omitted)
+    BlasMode BlasMode_ = BlasMode::kSymmetric>
+struct DefaultRank2K;  // primary template: declared only; see the partial specializations in this file
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Hopper (arch::Sm90): TensorOp, row-major C/D, default (symmetric) BlasMode
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue (not referenced by the body of this specialization)
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator>
+struct DefaultRank2K<
+                    ElementA, LayoutA, kAlignmentA, 
+                    ElementB, LayoutB, kAlignmentB, 
+                    ElementC,layout::RowMajor, FillModeC, 
+                    ElementAccumulator, arch::OpClassTensorOp, arch::Sm90, 
+                    ThreadblockShape, WarpShape, InstructionShape,
+                    EpilogueOutputOp, ThreadblockSwizzle, Stages, SplitKSerial,
+                    Operator> {
+  /// Define the threadblock-scoped matrix multiply-accumulate (A x B^T); B uses the transpose of LayoutB
+  using Mma1 = typename cutlass::gemm::threadblock::DefaultMma<
+      ElementA, LayoutA, 
+      kAlignmentA, 
+      ElementB, typename layout::LayoutTranspose<LayoutB>::type, 
+      kAlignmentB,
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm90,
+      ThreadblockShape, WarpShape, InstructionShape, Stages,
+      Operator>::ThreadblockMma;
+  
+  /// Define the threadblock-scoped matrix multiply-accumulate (B x A^T); A uses the transpose of LayoutA
+  using Mma2 = typename cutlass::gemm::threadblock::DefaultMma<
+      ElementB, LayoutB, 
+      kAlignmentB, 
+      ElementA, typename layout::LayoutTranspose<LayoutA>::type, 
+      kAlignmentA,
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm90,
+      ThreadblockShape, WarpShape, InstructionShape, Stages,
+      Operator>::ThreadblockMma;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;  // threadblock K extent divided by warp K extent
+
+  /// Define the epilogue (built from Mma1's warp-level operator, symmetric mode)
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOpBlas3<
+          ThreadblockShape, typename Mma1::Operator, kPartitionsK, EpilogueOutputOp,
+          EpilogueOutputOp::kCount, BlasMode::kSymmetric>::Epilogue;
+
+  /// Define the kernel-level Rank2K operator from both mainloops, the epilogue, the swizzle and FillModeC.
+  using Rank2Kkernel = kernel::Rank2KUniversal<Mma1, Mma2, Epilogue, ThreadblockSwizzle, FillModeC, BlasMode::kSymmetric>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Ampere (arch::Sm80): TensorOp, row-major C/D, default (symmetric) BlasMode
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue (not referenced by the body of this specialization)
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator>
+struct DefaultRank2K<
+                    ElementA, LayoutA, kAlignmentA, 
+                    ElementB, LayoutB, kAlignmentB, 
+                    ElementC,layout::RowMajor, FillModeC, 
+                    ElementAccumulator, arch::OpClassTensorOp, arch::Sm80, 
+                    ThreadblockShape, WarpShape, InstructionShape,
+                    EpilogueOutputOp, ThreadblockSwizzle, Stages, SplitKSerial,
+                    Operator> {
+  /// Define the threadblock-scoped matrix multiply-accumulate (A x B^T); B uses the transpose of LayoutB
+  using Mma1 = typename cutlass::gemm::threadblock::DefaultMma<
+      ElementA, LayoutA, 
+      kAlignmentA, 
+      ElementB, typename layout::LayoutTranspose<LayoutB>::type, 
+      kAlignmentB,
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80,
+      ThreadblockShape, WarpShape, InstructionShape, Stages,
+      Operator>::ThreadblockMma;
+  
+  /// Define the threadblock-scoped matrix multiply-accumulate (B x A^T); A uses the transpose of LayoutA
+  using Mma2 = typename cutlass::gemm::threadblock::DefaultMma<
+      ElementB, LayoutB, 
+      kAlignmentB, 
+      ElementA, typename layout::LayoutTranspose<LayoutA>::type, 
+      kAlignmentA,
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80,
+      ThreadblockShape, WarpShape, InstructionShape, Stages,
+      Operator>::ThreadblockMma;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;  // threadblock K extent divided by warp K extent
+
+  /// Define the epilogue (built from Mma1's warp-level operator, symmetric mode)
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOpBlas3<
+          ThreadblockShape, typename Mma1::Operator, kPartitionsK, EpilogueOutputOp,
+          EpilogueOutputOp::kCount, BlasMode::kSymmetric>::Epilogue;
+
+  /// Define the kernel-level Rank2K operator from both mainloops, the epilogue, the swizzle and FillModeC.
+  using Rank2Kkernel = kernel::Rank2KUniversal<Mma1, Mma2, Epilogue, ThreadblockSwizzle, FillModeC, BlasMode::kSymmetric>;
+};
+////////////////////////////////////////////////////////////////////////////////
+
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_2k_complex.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_2k_complex.h
new file mode 100755
index 000000000..1a685286c
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_2k_complex.h
@@ -0,0 +1,498 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+      Default kernel-level complex-valued Rank2K definitions combine threadblock-scoped matrix multiply-add with
+      the appropriate threadblock-scoped epilogue.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/arch/wmma.h"
+
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/kernel/rank_2k_universal.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+#include "cutlass/gemm/threadblock/default_mma.h"
+#include "cutlass/gemm/threadblock/default_multistage_mma_complex.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op_blas3.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+#include "cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h"
+#endif //CUTLASS_ARCH_WMMA_ENABLED
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Blas3 computation mode (defaults to BlasMode::kSymmetric when omitted)
+    BlasMode BlasMode_ = BlasMode::kSymmetric>
+struct DefaultRank2KComplex;  // primary template: declared only; see the partial specializations in this file
+
+
+////////////////////////////////////////////////////////////////////////////////
+namespace detail {  // helper trait consumed by the Hermitian DefaultRank2KComplex specializations
+
+template <
+  /// Layout type for A matrix operand
+  typename LayoutA_,
+  /// Layout type for B matrix operand
+  typename LayoutB_,
+  /// Complex elementwise transformation requested for the A operand
+  ComplexTransform TransformA,
+  /// Complex elementwise transformation requested for the B operand
+  ComplexTransform TransformB,
+  /// Blas3 computation mode (symmetric/hermitian)
+  BlasMode BlasMode_
+  > struct Rank2KTransposedComplexTransform {
+  
+  static ComplexTransform const kTransformA = TransformA;  // generic case: forwarded unchanged
+  static ComplexTransform const kTransformB = TransformB;  // generic case: forwarded unchanged
+
+};
+  
+  // explicit (full) specialization for HER2K CUBLAS_OP_N layout (ColumnMajor): conjugate A, leave B untouched
+template <>
+  struct Rank2KTransposedComplexTransform <
+  layout::ColumnMajor, layout::ColumnMajor, 
+  ComplexTransform::kNone, ComplexTransform::kNone,
+  BlasMode::kHermitian> {
+
+  static ComplexTransform const kTransformA = ComplexTransform::kConjugate;
+  static ComplexTransform const kTransformB = ComplexTransform::kNone;
+
+};
+
+  // explicit (full) specialization for HER2K CUBLAS_OP_C layout (RowMajor + complex conjugate): leave A untouched, conjugate B
+template <>
+  struct Rank2KTransposedComplexTransform <
+  layout::RowMajor, layout::RowMajor, 
+  ComplexTransform::kConjugate, ComplexTransform::kConjugate,
+  BlasMode::kHermitian> {
+
+  static ComplexTransform const kTransformA = ComplexTransform::kNone;
+  static ComplexTransform const kTransformB = ComplexTransform::kConjugate;
+
+};
+
+}  // namespace detail
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Hopper (arch::Sm90): TensorOp, row-major C/D, complex data, BlasMode::kSymmetric
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue (not referenced by the body of this specialization)
+    bool SplitKSerial>
+struct DefaultRank2KComplex<
+  ElementA, LayoutA, ElementB, LayoutB, ElementC, 
+  layout::RowMajor, FillModeC, ElementAccumulator, arch::OpClassTensorOp,
+  arch::Sm90, ThreadblockShape, WarpShape, InstructionShape, 
+  EpilogueOutputOp, ThreadblockSwizzle, Stages, 
+  TransformA, TransformB, Operator, SplitKSerial, BlasMode::kSymmetric> {
+
+  static BlasMode const kBlasMode = BlasMode::kSymmetric;
+  
+  /// Define the threadblock-scoped matrix multiply-accumulate (A x B^T); B uses the transpose of LayoutB
+  using Mma1 = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex<
+      ElementA, LayoutA, 
+      ElementB, typename layout::LayoutTranspose<LayoutB>::type, 
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm90, 
+      ThreadblockShape, WarpShape, InstructionShape, Stages, 
+      TransformA, TransformB, Operator>::ThreadblockMma;
+
+  /// Define the threadblock-scoped matrix multiply-accumulate (B x A^T); A uses the transpose of LayoutA
+  using Mma2 = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex<
+      ElementB, LayoutB, 
+      ElementA, typename layout::LayoutTranspose<LayoutA>::type, 
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm90, 
+      ThreadblockShape, WarpShape, InstructionShape, Stages, 
+      TransformA, TransformB, Operator>::ThreadblockMma;
+
+  /// Define the epilogue (built from Mma1's warp-level operator, using kBlasMode)
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOpBlas3<
+          ThreadblockShape, typename Mma1::Operator, 1, EpilogueOutputOp,
+          EpilogueOutputOp::kCount, Operator, kBlasMode>::Epilogue;
+
+  /// Define the kernel-level Rank2K operator from both mainloops, the epilogue, the swizzle and FillModeC.
+  using Rank2Kkernel = kernel::Rank2KUniversal<Mma1, Mma2, Epilogue, ThreadblockSwizzle, FillModeC, kBlasMode>;
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Hopper (arch::Sm90): TensorOp, row-major C/D, complex data, BlasMode::kHermitian
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue (not referenced by the body of this specialization)
+    bool SplitKSerial>
+struct DefaultRank2KComplex<
+  ElementA, LayoutA, ElementB, LayoutB, ElementC, 
+  layout::RowMajor, FillModeC, ElementAccumulator, arch::OpClassTensorOp,
+  arch::Sm90, ThreadblockShape, WarpShape, InstructionShape, 
+  EpilogueOutputOp, ThreadblockSwizzle, Stages, 
+  TransformA, TransformB, Operator, SplitKSerial, BlasMode::kHermitian> {
+
+  static BlasMode const kBlasMode = BlasMode::kHermitian;
+
+  // Complex transforms as requested by the caller for input A and B matrices (function of input layout)
+  static ComplexTransform const kTransformA = TransformA;
+  static ComplexTransform const kTransformB = TransformB;
+
+  using TransposedComplexTransform = detail::Rank2KTransposedComplexTransform<
+                                        LayoutA, LayoutB, 
+                                        TransformA, TransformB,
+                                        kBlasMode>;  // trait that picks the transforms handed to the mainloops
+
+  // Complex transforms actually passed to Mma1/Mma2 below (function of blas3 computation)
+  static ComplexTransform const kTransformOperandA = TransposedComplexTransform::kTransformA;
+  static ComplexTransform const kTransformOperandB = TransposedComplexTransform::kTransformB;
+
+  /// Define the threadblock-scoped matrix multiply-accumulate (A x B^H); B uses the transpose of LayoutB
+  using Mma1 = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex<
+      ElementA, LayoutA, 
+      ElementB, typename layout::LayoutTranspose<LayoutB>::type, 
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm90, 
+      ThreadblockShape, WarpShape, InstructionShape, Stages, 
+      kTransformOperandA, kTransformOperandB, Operator>::ThreadblockMma;
+
+  /// Define the threadblock-scoped matrix multiply-accumulate (B x A^H); A uses the transpose of LayoutA
+  using Mma2 = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex<
+      ElementB, LayoutB, 
+      ElementA, typename layout::LayoutTranspose<LayoutA>::type, 
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm90, 
+      ThreadblockShape, WarpShape, InstructionShape, Stages, 
+      kTransformOperandA, kTransformOperandB, Operator>::ThreadblockMma;
+
+  /// Define the epilogue (built from Mma1's warp-level operator, using kBlasMode)
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOpBlas3<
+          ThreadblockShape, typename Mma1::Operator, 1, EpilogueOutputOp,
+          EpilogueOutputOp::kCount, Operator, kBlasMode>::Epilogue;
+
+  /// Define the kernel-level Rank2K operator from both mainloops, the epilogue, the swizzle and FillModeC.
+  using Rank2Kkernel = kernel::Rank2KUniversal<Mma1, Mma2, Epilogue, ThreadblockSwizzle, FillModeC, kBlasMode>;
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Ampere (arch::Sm80): TensorOp, row-major C/D, complex data, BlasMode::kSymmetric
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue (not referenced by the body of this specialization)
+    bool SplitKSerial>
+struct DefaultRank2KComplex<
+  ElementA, LayoutA, ElementB, LayoutB, ElementC, 
+  layout::RowMajor, FillModeC, ElementAccumulator, arch::OpClassTensorOp,
+  arch::Sm80, ThreadblockShape, WarpShape, InstructionShape, 
+  EpilogueOutputOp, ThreadblockSwizzle, Stages, 
+  TransformA, TransformB, Operator, SplitKSerial, BlasMode::kSymmetric> {
+
+  static BlasMode const kBlasMode = BlasMode::kSymmetric;
+  
+  /// Define the threadblock-scoped matrix multiply-accumulate (A x B^T); B uses the transpose of LayoutB
+  using Mma1 = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex<
+      ElementA, LayoutA, 
+      ElementB, typename layout::LayoutTranspose<LayoutB>::type, 
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80, 
+      ThreadblockShape, WarpShape, InstructionShape, Stages, 
+      TransformA, TransformB, Operator>::ThreadblockMma;
+
+  /// Define the threadblock-scoped matrix multiply-accumulate (B x A^T); A uses the transpose of LayoutA
+  using Mma2 = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex<
+      ElementB, LayoutB, 
+      ElementA, typename layout::LayoutTranspose<LayoutA>::type, 
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80, 
+      ThreadblockShape, WarpShape, InstructionShape, Stages, 
+      TransformA, TransformB, Operator>::ThreadblockMma;
+
+  /// Define the epilogue (built from Mma1's warp-level operator, using kBlasMode)
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOpBlas3<
+          ThreadblockShape, typename Mma1::Operator, 1, EpilogueOutputOp,
+          EpilogueOutputOp::kCount, Operator, kBlasMode>::Epilogue;
+
+  /// Define the kernel-level Rank2K operator from both mainloops, the epilogue, the swizzle and FillModeC.
+  using Rank2Kkernel = kernel::Rank2KUniversal<Mma1, Mma2, Epilogue, ThreadblockSwizzle, FillModeC, kBlasMode>;
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Ampere (arch::Sm80): TensorOp, row-major C/D, complex data, BlasMode::kHermitian
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue (not referenced by the body of this specialization)
+    bool SplitKSerial>
+struct DefaultRank2KComplex<
+  ElementA, LayoutA, ElementB, LayoutB, ElementC, 
+  layout::RowMajor, FillModeC, ElementAccumulator, arch::OpClassTensorOp,
+  arch::Sm80, ThreadblockShape, WarpShape, InstructionShape, 
+  EpilogueOutputOp, ThreadblockSwizzle, Stages, 
+  TransformA, TransformB, Operator, SplitKSerial, BlasMode::kHermitian> {
+
+  static BlasMode const kBlasMode = BlasMode::kHermitian;
+
+  // Complex transforms as requested by the caller for input A and B matrices (function of input layout)
+  static ComplexTransform const kTransformA = TransformA;
+  static ComplexTransform const kTransformB = TransformB;
+
+  using TransposedComplexTransform = detail::Rank2KTransposedComplexTransform<
+                                        LayoutA, LayoutB, 
+                                        TransformA, TransformB,
+                                        kBlasMode>;  // trait that picks the transforms handed to the mainloops
+
+  // Complex transforms actually passed to Mma1/Mma2 below (function of blas3 computation)
+  static ComplexTransform const kTransformOperandA = TransposedComplexTransform::kTransformA;
+  static ComplexTransform const kTransformOperandB = TransposedComplexTransform::kTransformB;
+
+  /// Define the threadblock-scoped matrix multiply-accumulate (A x B^H); B uses the transpose of LayoutB
+  using Mma1 = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex<
+      ElementA, LayoutA, 
+      ElementB, typename layout::LayoutTranspose<LayoutB>::type, 
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80, 
+      ThreadblockShape, WarpShape, InstructionShape, Stages, 
+      kTransformOperandA, kTransformOperandB, Operator>::ThreadblockMma;
+
+  /// Define the threadblock-scoped matrix multiply-accumulate (B x A^H); A uses the transpose of LayoutA
+  using Mma2 = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex<
+      ElementB, LayoutB, 
+      ElementA, typename layout::LayoutTranspose<LayoutA>::type, 
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80, 
+      ThreadblockShape, WarpShape, InstructionShape, Stages, 
+      kTransformOperandA, kTransformOperandB, Operator>::ThreadblockMma;
+
+  /// Define the epilogue (built from Mma1's warp-level operator, using kBlasMode)
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOpBlas3<
+          ThreadblockShape, typename Mma1::Operator, 1, EpilogueOutputOp,
+          EpilogueOutputOp::kCount, Operator, kBlasMode>::Epilogue;
+
+  /// Define the kernel-level Rank2K operator from both mainloops, the epilogue, the swizzle and FillModeC.
+  using Rank2Kkernel = kernel::Rank2KUniversal<Mma1, Mma2, Epilogue, ThreadblockSwizzle, FillModeC, kBlasMode>;
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_2k_grouped.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_2k_grouped.h
new file mode 100755
index 000000000..7c79dd61a
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_2k_grouped.h
@@ -0,0 +1,355 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief
+      Default kernel-level grouped Rank2K.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/complex.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/numeric_types.h"
+
+#include "cutlass/gemm/kernel/rank_2k_transpose_operands.h"
+#include "cutlass/gemm/kernel/default_rank_2k.h"
+#include "cutlass/gemm/kernel/default_rank_2k_complex.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Layout type for C and D matrix operands
+    typename LayoutC,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Blas3 computation mode
+    BlasMode BlasMode_ = BlasMode::kSymmetric,
+    /// Whether the schedule of problems to visit has been precomputed
+    GroupScheduleMode GroupScheduleMode_ = GroupScheduleMode::kDeviceOnly,
+    /// SFINAE hook: specializations select on cutlass::is_complex<ElementAccumulator>
+    typename Enable = void
+    >
+struct DefaultRank2KGrouped;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Real-valued grouped Rank2K
+//
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Layout type for C and D matrix operands
+    typename LayoutC,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Blas3 computation mode
+    BlasMode BlasMode_,
+    /// Whether the schedule of problems to visit has been precomputed
+    GroupScheduleMode GroupScheduleMode_
+    >
+struct DefaultRank2KGrouped<ElementA, LayoutA, TransformA, kAlignmentA,
+          ElementB, LayoutB, TransformB, kAlignmentB,
+          ElementC, LayoutC,
+          FillModeC, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape,
+          WarpShape, InstructionShape, EpilogueOutputOp,
+          ThreadblockSwizzle, Stages, Operator, BlasMode_, GroupScheduleMode_,
+          typename platform::enable_if< ! cutlass::is_complex<ElementAccumulator>::value>::type
+> {
+  // True when LayoutC is column-major; then a 'transposed-and-exchanged' Rank2K operator is constructed.
+  static bool const kInternalTranspose = platform::is_same<LayoutC, layout::ColumnMajor>::value;
+
+  using MapArguments = kernel::detail::Rank2KMapArguments<
+    ElementA,
+    LayoutA,
+    TransformA,
+    kAlignmentA,
+    ElementB,
+    LayoutB,
+    TransformB,
+    kAlignmentB,
+    LayoutC,
+    FillModeC,
+    kInternalTranspose
+  >;
+
+  // Default Rank2K kernel built from the mapped arguments; its Mma1, Mma2 and Epilogue are reused below
+  using DefaultRank2Kkernel = typename kernel::DefaultRank2K<
+    typename MapArguments::ElementA,
+    typename MapArguments::LayoutA,
+    MapArguments::kAlignmentA,
+    typename MapArguments::ElementB,
+    typename MapArguments::LayoutB,
+    MapArguments::kAlignmentB,
+    ElementC,
+    typename MapArguments::LayoutC,
+    MapArguments::kFillModeC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    false,                  // SplitKSerial
+    Operator,
+    BlasMode_
+  >::Rank2Kkernel;
+
+  /// Define the grouped kernel in terms of the default kernel's Mma1, Mma2 and Epilogue
+  using Rank2Kkernel = kernel::Rank2KGrouped<
+    typename DefaultRank2Kkernel::Mma1,
+    typename DefaultRank2Kkernel::Mma2,
+    typename DefaultRank2Kkernel::Epilogue,
+    ThreadblockSwizzle,
+    TransformA,
+    TransformB,
+    DefaultRank2Kkernel::kFillModeC,
+    DefaultRank2Kkernel::kBlasMode,
+    GroupScheduleMode_,
+    kInternalTranspose
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Complex-valued grouped Rank2K
+//
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Layout type for C and D matrix operands
+    typename LayoutC,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Blas3 computation mode
+    BlasMode BlasMode_,
+    /// Whether the schedule of problems to visit has been precomputed
+    GroupScheduleMode GroupScheduleMode_
+    >
+struct DefaultRank2KGrouped<ElementA, LayoutA, TransformA, kAlignmentA,
+          ElementB, LayoutB, TransformB, kAlignmentB,
+          ElementC, LayoutC,
+          FillModeC, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape,
+          WarpShape, InstructionShape, EpilogueOutputOp,
+          ThreadblockSwizzle, Stages, Operator, BlasMode_, GroupScheduleMode_,
+          typename platform::enable_if<cutlass::is_complex<ElementAccumulator>::value>::type
+> {
+  // True when LayoutC is column-major; then a 'transposed-and-exchanged' Rank2K operator is constructed.
+  static bool const kInternalTranspose = platform::is_same<LayoutC, layout::ColumnMajor>::value;
+
+  using MapArguments = kernel::detail::Rank2KMapArguments<
+    ElementA,
+    LayoutA,
+    TransformA,
+    kAlignmentA,
+    ElementB,
+    LayoutB,
+    TransformB,
+    kAlignmentB,
+    LayoutC,
+    FillModeC,
+    kInternalTranspose
+  >;
+
+  // Default complex Rank2K kernel built from the mapped arguments; its Mma1, Mma2 and Epilogue are reused below
+  using DefaultRank2Kkernel = typename kernel::DefaultRank2KComplex<
+    typename MapArguments::ElementA,
+    typename MapArguments::LayoutA,
+    typename MapArguments::ElementB,
+    typename MapArguments::LayoutB,
+    ElementC,
+    typename MapArguments::LayoutC,
+    MapArguments::kFillModeC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    MapArguments::kTransformA,
+    MapArguments::kTransformB,
+    Operator,
+    false,                  // SplitKSerial
+    BlasMode_
+  >::Rank2Kkernel;
+
+  /// Define the grouped kernel in terms of the default kernel.
+  /// Pass through the user-provided TransformA and TransformB so as to
+  /// correctly set public-facing TransformA and TransformB in kernel::Rank2KGrouped.
+  /// This is needed because kernel::DefaultRank2KComplex may change TransformA and
+  /// TransformB that become template arguments to Mma1 and Mma2.
+  using Rank2Kkernel = kernel::Rank2KGrouped<
+    typename DefaultRank2Kkernel::Mma1,
+    typename DefaultRank2Kkernel::Mma2,
+    typename DefaultRank2Kkernel::Epilogue,
+    ThreadblockSwizzle,
+    TransformA,
+    TransformB,
+    DefaultRank2Kkernel::kFillModeC,
+    DefaultRank2Kkernel::kBlasMode,
+    GroupScheduleMode_,
+    kInternalTranspose
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_2k_universal.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_2k_universal.h
new file mode 100755
index 000000000..41e9cc45c
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_2k_universal.h
@@ -0,0 +1,346 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+      Default kernel-level Rank 2k definitions combine threadblock-scoped matrix multiply-add with
+      the appropriate threadblock-scoped epilogue.
+  
+      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
+      accommodated by exchanging A and B operands and assuming transposed layouts.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+
+#include "cutlass/complex.h"
+#include "cutlass/layout/matrix.h"
+
+#include "cutlass/gemm/kernel/rank_2k_universal.h"
+#include "cutlass/gemm/kernel/default_rank_2k.h"
+#include "cutlass/gemm/kernel/default_rank_2k_complex.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by Rank2k
+    typename Operator,
+    /// Blas3 computation mode (symmetric/hermitian)
+    BlasMode BlasMode_ = BlasMode::kSymmetric,
+    /// SFINAE hook: specializations select on cutlass::is_complex<ElementAccumulator>
+    typename Enable = void
+    >
+struct DefaultRank2KUniversal;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Real-valued Rank 2k update kernels
+//
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Layout type for C and D matrix operands
+    typename LayoutC,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by Rank2k
+    typename Operator>
+struct DefaultRank2KUniversal<
+  ElementA,
+  LayoutA,
+  ComplexTransform::kNone,   // transform A
+  kAlignmentA,
+  ElementB,
+  LayoutB,
+  ComplexTransform::kNone,   // transform B
+  kAlignmentB,
+  ElementC,
+  LayoutC,
+  FillModeC,
+  ElementAccumulator,
+  OperatorClass,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  SplitKSerial,
+  Operator,
+  BlasMode::kSymmetric,
+  typename platform::enable_if< ! cutlass::is_complex<ElementAccumulator>::value>::type
+> {
+
+  using DefaultRank2Kkernel = typename kernel::DefaultRank2K<
+    ElementA,
+    LayoutA,
+    kAlignmentA,
+    ElementB,
+    LayoutB,
+    kAlignmentB,
+    ElementC,
+    LayoutC,
+    FillModeC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    SplitKSerial,
+    Operator,
+    BlasMode::kSymmetric
+  >::Rank2Kkernel;
+
+  /// Define the universal kernel in terms of the default kernel's Mma1, Mma2 and Epilogue
+  using Rank2Kkernel = kernel::Rank2KUniversal<
+    typename DefaultRank2Kkernel::Mma1,
+    typename DefaultRank2Kkernel::Mma2,
+    typename DefaultRank2Kkernel::Epilogue, 
+    ThreadblockSwizzle,
+    FillModeC,
+    BlasMode::kSymmetric
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+//
+// Complex-valued Rank 2K update kernels
+//
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Layout type for C and D matrix operands
+    typename LayoutC,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by Rank2k
+    typename Operator,
+    /// Blas3 computation mode (symmetric/hermitian)
+    BlasMode kBlasMode
+  >
+
+struct DefaultRank2KUniversal<
+  ElementA,
+  LayoutA,
+  TransformA,   
+  kAlignmentA,
+  ElementB,
+  LayoutB,
+  TransformB,  
+  kAlignmentB,
+  ElementC,
+  LayoutC,
+  FillModeC,
+  ElementAccumulator,
+  OperatorClass,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  SplitKSerial,
+  Operator,
+  kBlasMode,
+  typename platform::enable_if<cutlass::is_complex<ElementAccumulator>::value>::type
+> {
+
+  using DefaultRank2Kkernel = typename kernel::DefaultRank2KComplex<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    FillModeC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    TransformA,
+    TransformB,
+    Operator,
+    SplitKSerial,
+    kBlasMode
+  >::Rank2Kkernel;
+
+  /// Define the universal kernel in terms of the default kernel's Mma1, Mma2 and Epilogue
+  using Rank2Kkernel = kernel::Rank2KUniversal<
+    typename DefaultRank2Kkernel::Mma1,
+    typename DefaultRank2Kkernel::Mma2,
+    typename DefaultRank2Kkernel::Epilogue, 
+    ThreadblockSwizzle,
+    FillModeC,
+    kBlasMode
+  >;
+};
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_k.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_k.h
new file mode 100755
index 000000000..780b205a4
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_k.h
@@ -0,0 +1,247 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+      Default kernel-level RankK definitions combine threadblock-scoped matrix multiply-add with
+      the appropriate threadblock-scoped epilogue.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/arch/wmma.h"
+
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/kernel/rank_k_universal.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+#include "cutlass/gemm/threadblock/default_mma.h"
+#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op_blas3.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+#include "cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h"
+#endif //CUTLASS_ARCH_WMMA_ENABLED
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Blas3 computation mode
+    BlasMode BlasMode_ = BlasMode::kSymmetric>
+struct DefaultRankK;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Hopper Architecture (arch::Sm90, OpClassTensorOp, row-major C, default BlasMode)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator>
+struct DefaultRankK<
+                    ElementA, LayoutA, kAlignmentA, 
+                    ElementC,layout::RowMajor, FillModeC, 
+                    ElementAccumulator, arch::OpClassTensorOp, arch::Sm90, 
+                    ThreadblockShape, WarpShape, InstructionShape,
+                    EpilogueOutputOp, ThreadblockSwizzle, Stages, SplitKSerial,
+                    Operator> {
+  /// Define the threadblock-scoped matrix multiply-accumulate (A x AT)
+  using Mma = typename cutlass::gemm::threadblock::DefaultMma<
+      ElementA, LayoutA, 
+      kAlignmentA, 
+      ElementA, typename layout::LayoutTranspose<LayoutA>::type, 
+      kAlignmentA,
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm90,
+      ThreadblockShape, WarpShape, InstructionShape, Stages,
+      Operator>::ThreadblockMma;
+  
+  /// Ratio of the threadblock tile's K extent to the warp tile's K extent
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  /// Define the epilogue
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOpBlas3<
+          ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
+          EpilogueOutputOp::kCount, BlasMode::kSymmetric>::Epilogue;
+
+  /// Define the kernel-level RankK operator.
+  using RankKkernel = kernel::RankKUniversal<Mma, Epilogue, ThreadblockSwizzle, FillModeC>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Ampere Architecture (arch::Sm80, OpClassTensorOp, row-major C, default BlasMode)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator>
+struct DefaultRankK<
+                    ElementA, LayoutA, kAlignmentA, 
+                    ElementC,layout::RowMajor, FillModeC, 
+                    ElementAccumulator, arch::OpClassTensorOp, arch::Sm80, 
+                    ThreadblockShape, WarpShape, InstructionShape,
+                    EpilogueOutputOp, ThreadblockSwizzle, Stages, SplitKSerial,
+                    Operator> {
+  /// Define the threadblock-scoped matrix multiply-accumulate (A x AT)
+  using Mma = typename cutlass::gemm::threadblock::DefaultMma<
+      ElementA, LayoutA, 
+      kAlignmentA, 
+      ElementA, typename layout::LayoutTranspose<LayoutA>::type, 
+      kAlignmentA,
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80,
+      ThreadblockShape, WarpShape, InstructionShape, Stages,
+      Operator>::ThreadblockMma;
+  
+  /// Ratio of the threadblock tile's K extent to the warp tile's K extent
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  /// Define the epilogue
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOpBlas3<
+          ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
+          EpilogueOutputOp::kCount, BlasMode::kSymmetric>::Epilogue;
+
+  /// Define the kernel-level RankK operator.
+  using RankKkernel = kernel::RankKUniversal<Mma, Epilogue, ThreadblockSwizzle, FillModeC>;
+};
+////////////////////////////////////////////////////////////////////////////////
+
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_k_complex.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_k_complex.h
new file mode 100755
index 000000000..56d2fcc99
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_k_complex.h
@@ -0,0 +1,429 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+      Default kernel-level RankK definitions combine threadblock-scoped matrix multiply-add with
+      the appropriate threadblock-scoped epilogue.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/arch/wmma.h"
+
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/kernel/rank_k_universal.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+#include "cutlass/gemm/threadblock/default_mma.h"
+#include "cutlass/gemm/threadblock/default_multistage_mma_complex.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op_blas3.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+#include "cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h"
+#endif //CUTLASS_ARCH_WMMA_ENABLED
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Blas3 computation mode (symmetric/hermitian)
+    BlasMode BlasMode_ = BlasMode::kSymmetric>
+struct DefaultRankKComplex;  // declaration only; bodies come from the partial specializations in this file
+
+
+////////////////////////////////////////////////////////////////////////////////
+namespace detail {
+/// Maps (LayoutA, TransformA, BlasMode) to the complex transforms kTransformA / kTransformB.
+template <
+  /// Layout type for A matrix operand
+  typename LayoutA_,
+  /// Complex elementwise transformation 
+  ComplexTransform TransformA,
+  /// Blas3 computation mode (symmetric/hermitian)
+  BlasMode BlasMode_
+  > struct RankKTransposedComplexTransform {
+  // Default case: kTransformA and kTransformB both equal the supplied TransformA.
+  static ComplexTransform const kTransformA = TransformA;
+  static ComplexTransform const kTransformB = TransformA;
+
+};
+  
+  // full specialization for HERK CUBLAS_OP_N layout (ColumnMajor): A is conjugated, B is not
+template <>
+  struct RankKTransposedComplexTransform <
+  layout::ColumnMajor, 
+  ComplexTransform::kNone,
+  BlasMode::kHermitian> {
+
+  static ComplexTransform const kTransformA = ComplexTransform::kConjugate;
+  static ComplexTransform const kTransformB = ComplexTransform::kNone;
+
+};
+
+  // full specialization for HERK CUBLAS_OP_C layout (RowMajor + Complex conjugate): B is conjugated, A is not
+template <>
+  struct RankKTransposedComplexTransform <
+  layout::RowMajor, 
+  ComplexTransform::kConjugate,
+  BlasMode::kHermitian> {
+
+  static ComplexTransform const kTransformA = ComplexTransform::kNone;
+  static ComplexTransform const kTransformB = ComplexTransform::kConjugate;
+
+};
+
+}  // namespace detail
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Hopper Architecture (arch::Sm90) complex datatype, BlasMode::kSymmetric, RowMajor C
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial>
+struct DefaultRankKComplex<
+  ElementA, LayoutA, ElementC, 
+  layout::RowMajor, FillModeC, ElementAccumulator, arch::OpClassTensorOp,
+  arch::Sm90, ThreadblockShape, WarpShape, InstructionShape, 
+  EpilogueOutputOp, ThreadblockSwizzle, Stages, 
+  TransformA, Operator, SplitKSerial, BlasMode::kSymmetric> {
+
+  static BlasMode const kBlasMode = BlasMode::kSymmetric;
+  
+  /// Define the threadblock-scoped matrix multiply-accumulate (A x A^T); TransformA is applied to both operands
+  using Mma = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex<
+      ElementA, LayoutA, 
+      ElementA, typename layout::LayoutTranspose<LayoutA>::type, 
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm90, 
+      ThreadblockShape, WarpShape, InstructionShape, Stages, 
+      TransformA, TransformA, Operator>::ThreadblockMma;
+
+  /// Define the epilogue
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOpBlas3<
+          ThreadblockShape, typename Mma::Operator, 1, EpilogueOutputOp,
+          EpilogueOutputOp::kCount, Operator, kBlasMode>::Epilogue;
+
+  /// Define the kernel-level RankK operator.
+  using RankKkernel = kernel::RankKUniversal<Mma, Epilogue, ThreadblockSwizzle, FillModeC>;
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Hopper Architecture (arch::Sm90) complex datatype, BlasMode::kHermitian, RowMajor C
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial>
+struct DefaultRankKComplex<
+  ElementA, LayoutA, ElementC, 
+  layout::RowMajor, FillModeC, ElementAccumulator, arch::OpClassTensorOp,
+  arch::Sm90, ThreadblockShape, WarpShape, InstructionShape, 
+  EpilogueOutputOp, ThreadblockSwizzle, Stages, 
+  TransformA, Operator, SplitKSerial, BlasMode::kHermitian> {
+
+  static BlasMode const kBlasMode = BlasMode::kHermitian;
+
+  // Complex transform requested by the caller for input A, re-exported unchanged
+  static ComplexTransform const kTransformA = TransformA;
+
+  using TransposedComplexTransform = detail::RankKTransposedComplexTransform<
+                                        LayoutA, 
+                                        TransformA,
+                                        kBlasMode>;
+
+  // Complex transforms actually applied to operandA and operandB (selected from LayoutA, TransformA and kBlasMode)
+  static ComplexTransform const kTransformOperandA = TransposedComplexTransform::kTransformA;
+  static ComplexTransform const kTransformOperandB = TransposedComplexTransform::kTransformB;
+
+  /// Define the threadblock-scoped matrix multiply-accumulate (A x A^H)
+  using Mma = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex<
+      ElementA, LayoutA, 
+      ElementA, typename layout::LayoutTranspose<LayoutA>::type, 
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm90, 
+      ThreadblockShape, WarpShape, InstructionShape, Stages, 
+      kTransformOperandA, kTransformOperandB, Operator>::ThreadblockMma;
+
+  /// Define the epilogue
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOpBlas3<
+          ThreadblockShape, typename Mma::Operator, 1, EpilogueOutputOp,
+          EpilogueOutputOp::kCount, Operator, kBlasMode>::Epilogue;
+
+  /// Define the kernel-level RankK operator.
+  using RankKkernel = kernel::RankKUniversal<Mma, Epilogue, ThreadblockSwizzle, FillModeC>;
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Ampere Architecture (arch::Sm80) complex datatype, BlasMode::kSymmetric, RowMajor C
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial>
+struct DefaultRankKComplex<
+  ElementA, LayoutA, ElementC, 
+  layout::RowMajor, FillModeC, ElementAccumulator, arch::OpClassTensorOp,
+  arch::Sm80, ThreadblockShape, WarpShape, InstructionShape, 
+  EpilogueOutputOp, ThreadblockSwizzle, Stages, 
+  TransformA, Operator, SplitKSerial, BlasMode::kSymmetric> {
+
+  static BlasMode const kBlasMode = BlasMode::kSymmetric;
+  
+  /// Define the threadblock-scoped matrix multiply-accumulate (A x A^T); TransformA is applied to both operands
+  using Mma = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex<
+      ElementA, LayoutA, 
+      ElementA, typename layout::LayoutTranspose<LayoutA>::type, 
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80, 
+      ThreadblockShape, WarpShape, InstructionShape, Stages, 
+      TransformA, TransformA, Operator>::ThreadblockMma;
+
+  /// Define the epilogue
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOpBlas3<
+          ThreadblockShape, typename Mma::Operator, 1, EpilogueOutputOp,
+          EpilogueOutputOp::kCount, Operator, kBlasMode>::Epilogue;
+
+  /// Define the kernel-level RankK operator.
+  using RankKkernel = kernel::RankKUniversal<Mma, Epilogue, ThreadblockSwizzle, FillModeC>;
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Ampere Architecture (arch::Sm80) complex datatype, BlasMode::kHermitian, RowMajor C
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial>
+struct DefaultRankKComplex<
+  ElementA, LayoutA, ElementC, 
+  layout::RowMajor, FillModeC, ElementAccumulator, arch::OpClassTensorOp,
+  arch::Sm80, ThreadblockShape, WarpShape, InstructionShape, 
+  EpilogueOutputOp, ThreadblockSwizzle, Stages, 
+  TransformA, Operator, SplitKSerial, BlasMode::kHermitian> {
+
+  static BlasMode const kBlasMode = BlasMode::kHermitian;
+
+  // Complex transform requested by the caller for input A, re-exported unchanged
+  static ComplexTransform const kTransformA = TransformA;
+
+  using TransposedComplexTransform = detail::RankKTransposedComplexTransform<
+                                        LayoutA, 
+                                        TransformA,
+                                        kBlasMode>;
+
+  // Complex transforms actually applied to operandA and operandB (selected from LayoutA, TransformA and kBlasMode)
+  static ComplexTransform const kTransformOperandA = TransposedComplexTransform::kTransformA;
+  static ComplexTransform const kTransformOperandB = TransposedComplexTransform::kTransformB;
+
+  /// Define the threadblock-scoped matrix multiply-accumulate (A x A^H)
+  using Mma = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex<
+      ElementA, LayoutA, 
+      ElementA, typename layout::LayoutTranspose<LayoutA>::type, 
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80, 
+      ThreadblockShape, WarpShape, InstructionShape, Stages, 
+      kTransformOperandA, kTransformOperandB, Operator>::ThreadblockMma;
+
+  /// Define the epilogue
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOpBlas3<
+          ThreadblockShape, typename Mma::Operator, 1, EpilogueOutputOp,
+          EpilogueOutputOp::kCount, Operator, kBlasMode>::Epilogue;
+
+  /// Define the kernel-level RankK operator.
+  using RankKkernel = kernel::RankKUniversal<Mma, Epilogue, ThreadblockSwizzle, FillModeC>;
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_k_universal.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_k_universal.h
new file mode 100755
index 000000000..309ea4642
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_k_universal.h
@@ -0,0 +1,305 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+      Default kernel-level Rank k definitions combine threadblock-scoped matrix multiply-add with
+      the appropriate threadblock-scoped epilogue.
+  
+      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
+      accommodated by exchanging A and B operands and assuming transposed layouts.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+
+#include "cutlass/complex.h"
+#include "cutlass/layout/matrix.h"
+
+#include "cutlass/gemm/kernel/rank_k_universal.h"
+#include "cutlass/gemm/kernel/default_rank_k.h"
+#include "cutlass/gemm/kernel/default_rank_k_complex.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by SYRK
+    typename Operator,
+    /// Blas3 computation mode (symmetric/hermitian)
+    BlasMode BlasMode_ = BlasMode::kSymmetric,
+    /// SFINAE slot: the specializations fill it with enable_if on is_complex<ElementAccumulator>
+    typename Enable = void
+    >
+struct DefaultRankKUniversal;  // declaration only; bodies come from the real/complex specializations in this file
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Real-valued Rank k update kernels
+//
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Layout type for C and D matrix operands
+    typename LayoutC,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by the rank-k update (SYRK)
+    typename Operator>
+struct DefaultRankKUniversal<
+  ElementA,
+  LayoutA,
+  ComplexTransform::kNone,   // transform A
+  kAlignmentA,
+  ElementC,
+  LayoutC,
+  FillModeC,
+  ElementAccumulator,
+  OperatorClass,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  SplitKSerial,
+  Operator,
+  BlasMode::kSymmetric,
+  typename platform::enable_if< ! cutlass::is_complex<ElementAccumulator>::value>::type  // selected only for non-complex accumulators
+> {
+
+  using DefaultRankKkernel = typename kernel::DefaultRankK<
+    ElementA,
+    LayoutA,
+    kAlignmentA,
+    ElementC,
+    LayoutC,
+    FillModeC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    SplitKSerial,
+    Operator,
+    BlasMode::kSymmetric
+  >::RankKkernel;
+
+  /// Define the kernel in terms of the default kernel's Mma and Epilogue
+  using RankKkernel = kernel::RankKUniversal<
+    typename DefaultRankKkernel::Mma,
+    typename DefaultRankKkernel::Epilogue, 
+    ThreadblockSwizzle,
+    FillModeC
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+//
+// Complex-valued Rank K update kernels
+//
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Layout type for C and D matrix operands
+    typename LayoutC,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by SYRK
+    typename Operator,
+    /// Blas3 computation mode (symmetric/hermitian)
+    BlasMode kBlasMode
+  >
+
+struct DefaultRankKUniversal<
+  ElementA,
+  LayoutA,
+  TransformA,   
+  kAlignmentA,
+  ElementC,
+  LayoutC,
+  FillModeC,
+  ElementAccumulator,
+  OperatorClass,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  SplitKSerial,
+  Operator,
+  kBlasMode,
+  typename platform::enable_if<cutlass::is_complex<ElementAccumulator>::value>::type  // selected only for complex accumulators
+> {
+
+  using DefaultRankKkernel = typename kernel::DefaultRankKComplex<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    FillModeC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    TransformA,
+    Operator,
+    SplitKSerial,
+    kBlasMode
+  >::RankKkernel;
+
+  /// Define the kernel in terms of the default kernel's Mma and Epilogue
+  using RankKkernel = kernel::RankKUniversal<
+    typename DefaultRankKkernel::Mma,
+    typename DefaultRankKkernel::Epilogue, 
+    ThreadblockSwizzle,
+    FillModeC
+  >;
+};
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_symm.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_symm.h
new file mode 100755
index 000000000..8f0ff4255
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_symm.h
@@ -0,0 +1,321 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+      Default kernel-level SYMM/HEMM definitions combine threadblock-scoped matrix multiply-add with
+      the appropriate threadblock-scoped epilogue.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/arch/wmma.h"
+
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/kernel/symm_universal.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+#include "cutlass/gemm/threadblock/default_trmm.h"
+#include "cutlass/gemm/threadblock/default_mma.h"
+#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+#include "cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h"
+#endif //CUTLASS_ARCH_WMMA_ENABLED
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Side Mode for A (kLeft or kRight)
+    SideMode kSideModeA,
+    /// Fill Mode for A (kLower or kUpper)
+    FillMode kFillModeA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Blas3 computation mode (defaults to BlasMode::kSymmetric)
+    BlasMode BlasMode_ = BlasMode::kSymmetric>
+struct DefaultSymm;  // Declaration only; definitions come from the partial specializations below.
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Hopper Architecture (arch::Sm90, OpClassTensorOp, row-major C/D)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Side Mode for A (kLeft or kRight)
+    SideMode kSideModeA,
+    /// Fill Mode for A (kLower or kUpper)
+    FillMode kFillModeA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue (not referenced inside this specialization's body)
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator>
+struct DefaultSymm<
+                    ElementA, LayoutA, kSideModeA, kFillModeA, kAlignmentA, 
+                    ElementB, LayoutB, kAlignmentB, 
+                    ElementC,layout::RowMajor, 
+                    ElementAccumulator, arch::OpClassTensorOp, arch::Sm90, 
+                    ThreadblockShape, WarpShape, InstructionShape,
+                    EpilogueOutputOp, ThreadblockSwizzle, Stages, SplitKSerial,
+                    Operator> {
+  // Composes the kernel from two TRMM mainloops (Mma1, Mma2) plus one epilogue built from Mma1's warp operator.
+  /// Define the threadblock-scoped triangular matrix multiply-accumulate
+  /// TRMM - with diagonal: alpha * A * B or alpha * B * A
+	static const DiagType kDiagTypeMma1 = DiagType::kNonUnit;
+  using Mma1 = typename cutlass::gemm::threadblock::DefaultTrmm<
+      ElementA, LayoutA, kAlignmentA, 
+      ElementB, LayoutB, kAlignmentB,
+      kSideModeA, kFillModeA, kDiagTypeMma1, 
+      ElementAccumulator, layout::RowMajor, 
+      arch::OpClassTensorOp, arch::Sm90,
+      ThreadblockShape, WarpShape, InstructionShape,
+      Stages, Operator>::ThreadblockMma;
+
+  /// Define the threadblock-scoped triangular matrix multiply-accumulate
+  /// TRMM - withOUT diagonal: alpha * AT * B or alpha * B * AT (A transposed for kLeft, B transposed otherwise)
+	static const DiagType kDiagTypeMma2 = DiagType::kZero;
+  using LayoutAMma2 = typename platform::conditional<
+                                (kSideModeA == SideMode::kLeft), 
+                                typename layout::LayoutTranspose<LayoutA>::type, 
+                                LayoutA
+                              >::type;
+  using LayoutBMma2 = typename platform::conditional<
+                                (kSideModeA == SideMode::kLeft), 
+                                LayoutB, 
+                                typename layout::LayoutTranspose<LayoutB>::type
+                              >::type; 
+	using Mma2 = typename cutlass::gemm::threadblock::DefaultTrmm<
+			ElementA, LayoutAMma2, kAlignmentA, 
+			ElementB, LayoutBMma2, kAlignmentB,
+			kSideModeA, InvertFillMode<kFillModeA>::mode, kDiagTypeMma2, 
+			ElementAccumulator, layout::RowMajor, 
+			arch::OpClassTensorOp, arch::Sm90,
+			ThreadblockShape, WarpShape, InstructionShape,
+			Stages, Operator>::ThreadblockMma;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  /// Define the epilogue (built from Mma1's warp-level operator and kPartitionsK)
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
+          ThreadblockShape, typename Mma1::Operator, kPartitionsK, EpilogueOutputOp,
+          EpilogueOutputOp::kCount>::Epilogue;
+
+  /// Define the kernel-level SYMM/HEMM operator.
+  using SymmKernel = kernel::SymmUniversal<Mma1, Mma2, Epilogue, ThreadblockSwizzle, kSideModeA, kFillModeA>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Ampere Architecture (arch::Sm80, OpClassTensorOp, row-major C/D)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Side Mode for A (kLeft or kRight)
+    SideMode kSideModeA,
+    /// Fill Mode for A (kLower or kUpper)
+    FillMode kFillModeA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue (not referenced inside this specialization's body)
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator>
+struct DefaultSymm<
+                    ElementA, LayoutA, kSideModeA, kFillModeA, kAlignmentA, 
+                    ElementB, LayoutB, kAlignmentB, 
+                    ElementC,layout::RowMajor, 
+                    ElementAccumulator, arch::OpClassTensorOp, arch::Sm80, 
+                    ThreadblockShape, WarpShape, InstructionShape,
+                    EpilogueOutputOp, ThreadblockSwizzle, Stages, SplitKSerial,
+                    Operator> {
+  // Composes the kernel from two TRMM mainloops (Mma1, Mma2) plus one epilogue built from Mma1's warp operator.
+  /// Define the threadblock-scoped triangular matrix multiply-accumulate
+  /// TRMM - with diagonal: alpha * A * B or alpha * B * A
+	static const DiagType kDiagTypeMma1 = DiagType::kNonUnit;
+  using Mma1 = typename cutlass::gemm::threadblock::DefaultTrmm<
+      ElementA, LayoutA, kAlignmentA, 
+      ElementB, LayoutB, kAlignmentB,
+      kSideModeA, kFillModeA, kDiagTypeMma1, 
+      ElementAccumulator, layout::RowMajor, 
+      arch::OpClassTensorOp, arch::Sm80,
+      ThreadblockShape, WarpShape, InstructionShape,
+      Stages, Operator>::ThreadblockMma;
+
+  /// Define the threadblock-scoped triangular matrix multiply-accumulate
+  /// TRMM - withOUT diagonal: alpha * AT * B or alpha * B * AT (A transposed for kLeft, B transposed otherwise)
+	static const DiagType kDiagTypeMma2 = DiagType::kZero;
+  using LayoutAMma2 = typename platform::conditional<
+                                (kSideModeA == SideMode::kLeft), 
+                                typename layout::LayoutTranspose<LayoutA>::type, 
+                                LayoutA
+                              >::type;
+  using LayoutBMma2 = typename platform::conditional<
+                                (kSideModeA == SideMode::kLeft), 
+                                LayoutB, 
+                                typename layout::LayoutTranspose<LayoutB>::type
+                              >::type; 
+	using Mma2 = typename cutlass::gemm::threadblock::DefaultTrmm<
+			ElementA, LayoutAMma2, kAlignmentA, 
+			ElementB, LayoutBMma2, kAlignmentB,
+			kSideModeA, InvertFillMode<kFillModeA>::mode, kDiagTypeMma2, 
+			ElementAccumulator, layout::RowMajor, 
+			arch::OpClassTensorOp, arch::Sm80,
+			ThreadblockShape, WarpShape, InstructionShape,
+			Stages, Operator>::ThreadblockMma;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  /// Define the epilogue (built from Mma1's warp-level operator and kPartitionsK)
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
+          ThreadblockShape, typename Mma1::Operator, kPartitionsK, EpilogueOutputOp,
+          EpilogueOutputOp::kCount>::Epilogue;
+
+  /// Define the kernel-level SYMM/HEMM operator.
+  using SymmKernel = kernel::SymmUniversal<Mma1, Mma2, Epilogue, ThreadblockSwizzle, kSideModeA, kFillModeA>;
+};
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_symm_complex.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_symm_complex.h
new file mode 100755
index 000000000..c2f803100
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_symm_complex.h
@@ -0,0 +1,508 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+      Default kernel-level SYMM/HEMM definitions combine threadblock-scoped matrix multiply-add with
+      the appropriate threadblock-scoped epilogue.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/arch/wmma.h"
+
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/kernel/symm_universal.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+#include "cutlass/gemm/threadblock/default_mma.h"
+#include "cutlass/gemm/threadblock/default_multistage_trmm_complex.h"
+#include "cutlass/gemm/threadblock/default_multistage_mma_complex.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+#include "cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h"
+#endif //CUTLASS_ARCH_WMMA_ENABLED
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Side Mode for A (kLeft or kRight)
+    SideMode kSideModeA,
+    /// Fill Mode for A (kLower or kUpper)
+    FillMode kFillModeA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Blas3 computation mode (defaults to BlasMode::kSymmetric)
+    BlasMode BlasMode_ = BlasMode::kSymmetric>
+struct DefaultSymmComplex;  // Declaration only; definitions come from the partial specializations below.
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Hopper Architecture complex datatype (symmetric)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Side Mode for A (kLeft or kRight)
+    SideMode kSideModeA,
+    /// Fill Mode for A (kLower or kUpper)
+    FillMode kFillModeA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue (not referenced inside this specialization's body)
+    bool SplitKSerial>
+struct DefaultSymmComplex<
+  ElementA, LayoutA, kSideModeA, kFillModeA, ElementB, LayoutB, ElementC, 
+  layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
+  arch::Sm90, ThreadblockShape, WarpShape, InstructionShape, 
+  EpilogueOutputOp, ThreadblockSwizzle, Stages, 
+  Operator, SplitKSerial, BlasMode::kSymmetric> {
+  // Composes the kernel from two complex TRMM mainloops (Mma1, Mma2) plus one epilogue built from Mma1's warp operator.
+  static BlasMode const kBlasMode = BlasMode::kSymmetric;
+  // Complex transforms do not apply to A or B for SYMM
+  static ComplexTransform const TransformA = ComplexTransform::kNone; 
+  static ComplexTransform const TransformB = ComplexTransform::kNone; 
+
+  /// Define the threadblock-scoped triangular matrix multiply-accumulate
+  /// TRMM - with diagonal: alpha * A * B or alpha * B * A
+	static const DiagType kDiagTypeMma1 = DiagType::kNonUnit;
+  using Mma1 = typename cutlass::gemm::threadblock::DefaultMultistageTrmmComplex<
+      ElementA, LayoutA, 
+      ElementB, LayoutB, 
+      kSideModeA, kFillModeA, kDiagTypeMma1, 
+      ElementAccumulator, layout::RowMajor, 
+      arch::OpClassTensorOp, arch::Sm90,
+      ThreadblockShape, WarpShape, InstructionShape,
+      Stages, TransformA, TransformB, Operator>::ThreadblockMma;
+
+  /// Define the threadblock-scoped triangular matrix multiply-accumulate
+  /// TRMM - withOUT diagonal: alpha * AT * B or alpha * B * AT (A transposed for kLeft, B transposed otherwise)
+	static const DiagType kDiagTypeMma2 = DiagType::kZero;
+  using LayoutAMma2 = typename platform::conditional<
+                                (kSideModeA == SideMode::kLeft), 
+                                typename layout::LayoutTranspose<LayoutA>::type, 
+                                LayoutA
+                              >::type;
+  using LayoutBMma2 = typename platform::conditional<
+                                (kSideModeA == SideMode::kLeft), 
+                                LayoutB, 
+                                typename layout::LayoutTranspose<LayoutB>::type
+                              >::type; 
+	using Mma2 = typename cutlass::gemm::threadblock::DefaultMultistageTrmmComplex<
+			ElementA, LayoutAMma2, 
+			ElementB, LayoutBMma2, 
+			kSideModeA, InvertFillMode<kFillModeA>::mode, kDiagTypeMma2, 
+			ElementAccumulator, layout::RowMajor, 
+			arch::OpClassTensorOp, arch::Sm90,
+			ThreadblockShape, WarpShape, InstructionShape,
+			Stages, TransformA, TransformB, Operator>::ThreadblockMma;
+
+  /// Define the epilogue (built from Mma1's warp-level operator)
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOp<
+          ThreadblockShape, typename Mma1::Operator, 1, EpilogueOutputOp,
+          EpilogueOutputOp::kCount, Operator>::Epilogue;
+
+  /// Define the kernel-level Symm operator.
+  using SymmKernel = kernel::SymmUniversal<Mma1, Mma2, Epilogue, ThreadblockSwizzle, kSideModeA, kFillModeA>;
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Hopper Architecture complex datatype (hermitian)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Side Mode for A (kLeft or kRight)
+    SideMode kSideModeA,
+    /// Fill Mode for A (kLower or kUpper)
+    FillMode kFillModeA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue (not referenced inside this specialization's body)
+    bool SplitKSerial>
+struct DefaultSymmComplex<
+  ElementA, LayoutA, kSideModeA, kFillModeA, ElementB, LayoutB, ElementC, 
+  layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
+  arch::Sm90, ThreadblockShape, WarpShape, InstructionShape, 
+  EpilogueOutputOp, ThreadblockSwizzle, Stages, 
+  Operator, SplitKSerial, BlasMode::kHermitian> {
+  // Composes the kernel from two complex TRMM mainloops (Mma1, Mma2) plus one epilogue built from Mma1's warp operator.
+  static BlasMode const kBlasMode = BlasMode::kHermitian;
+
+  // Only Mma1 passes BlasMode::kHermitian explicitly; Mma2 omits that trailing template argument.
+  /// Define the threadblock-scoped triangular matrix multiply-accumulate
+  /// TRMM - with diagonal: alpha * A * B or alpha * B * A
+	static const DiagType kDiagTypeMma1 = DiagType::kNonUnit;
+  static ComplexTransform const TransformAMma1 = ComplexTransform::kNone; 
+  static ComplexTransform const TransformBMma1 = ComplexTransform::kNone; 
+  using Mma1 = typename cutlass::gemm::threadblock::DefaultMultistageTrmmComplex<
+      ElementA, LayoutA, 
+      ElementB, LayoutB, 
+      kSideModeA, kFillModeA, kDiagTypeMma1, 
+      ElementAccumulator, layout::RowMajor, 
+      arch::OpClassTensorOp, arch::Sm90,
+      ThreadblockShape, WarpShape, InstructionShape,
+      Stages, TransformAMma1, TransformBMma1, Operator, BlasMode::kHermitian>::ThreadblockMma;
+
+  /// Define the threadblock-scoped triangular matrix multiply-accumulate
+  /// TRMM - withOUT diagonal - with conjugate transpose: alpha * AT * B or alpha * B * AT
+	static const DiagType kDiagTypeMma2 = DiagType::kZero;
+  using LayoutAMma2 = typename platform::conditional<
+                                (kSideModeA == SideMode::kLeft), 
+                                typename layout::LayoutTranspose<LayoutA>::type, 
+                                LayoutA
+                              >::type;
+  using LayoutBMma2 = typename platform::conditional<
+                                (kSideModeA == SideMode::kLeft), 
+                                LayoutB, 
+                                typename layout::LayoutTranspose<LayoutB>::type
+                              >::type;
+  static ComplexTransform const TransformAMma2 = (kSideModeA == SideMode::kLeft) ? 
+                                              ComplexTransform::kConjugate : ComplexTransform::kNone;
+  static ComplexTransform const TransformBMma2 = (kSideModeA == SideMode::kLeft) ? 
+                                              ComplexTransform::kNone : ComplexTransform::kConjugate;
+
+	using Mma2 = typename cutlass::gemm::threadblock::DefaultMultistageTrmmComplex<
+			ElementA, LayoutAMma2, 
+			ElementB, LayoutBMma2, 
+			kSideModeA, InvertFillMode<kFillModeA>::mode, kDiagTypeMma2, 
+			ElementAccumulator, layout::RowMajor, 
+			arch::OpClassTensorOp, arch::Sm90,
+			ThreadblockShape, WarpShape, InstructionShape,
+			Stages, TransformAMma2, TransformBMma2, Operator>::ThreadblockMma;
+
+  /// Define the epilogue (built from Mma1's warp-level operator)
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOp<
+          ThreadblockShape, typename Mma1::Operator, 1, EpilogueOutputOp,
+          EpilogueOutputOp::kCount, Operator>::Epilogue;
+
+  /// Define the kernel-level Symm operator.
+  using SymmKernel = kernel::SymmUniversal<Mma1, Mma2, Epilogue, ThreadblockSwizzle, kSideModeA, kFillModeA>;
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Ampere Architecture complex datatype (symmetric)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Side Mode for A (kLeft or kRight)
+    SideMode kSideModeA,
+    /// Fill Mode for A (kLower or kUpper)
+    FillMode kFillModeA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue (not referenced inside this specialization's body)
+    bool SplitKSerial>
+struct DefaultSymmComplex<
+  ElementA, LayoutA, kSideModeA, kFillModeA, ElementB, LayoutB, ElementC, 
+  layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
+  arch::Sm80, ThreadblockShape, WarpShape, InstructionShape, 
+  EpilogueOutputOp, ThreadblockSwizzle, Stages, 
+  Operator, SplitKSerial, BlasMode::kSymmetric> {
+  // Composes the kernel from two complex TRMM mainloops (Mma1, Mma2) plus one epilogue built from Mma1's warp operator.
+  static BlasMode const kBlasMode = BlasMode::kSymmetric;
+  // Complex transforms do not apply to A or B for SYMM
+  static ComplexTransform const TransformA = ComplexTransform::kNone; 
+  static ComplexTransform const TransformB = ComplexTransform::kNone; 
+
+  /// Define the threadblock-scoped triangular matrix multiply-accumulate
+  /// TRMM - with diagonal: alpha * A * B or alpha * B * A
+	static const DiagType kDiagTypeMma1 = DiagType::kNonUnit;
+  using Mma1 = typename cutlass::gemm::threadblock::DefaultMultistageTrmmComplex<
+      ElementA, LayoutA, 
+      ElementB, LayoutB, 
+      kSideModeA, kFillModeA, kDiagTypeMma1, 
+      ElementAccumulator, layout::RowMajor, 
+      arch::OpClassTensorOp, arch::Sm80,
+      ThreadblockShape, WarpShape, InstructionShape,
+      Stages, TransformA, TransformB, Operator>::ThreadblockMma;
+
+  /// Define the threadblock-scoped triangular matrix multiply-accumulate
+  /// TRMM - withOUT diagonal: alpha * AT * B or alpha * B * AT (A transposed for kLeft, B transposed otherwise)
+	static const DiagType kDiagTypeMma2 = DiagType::kZero;
+  using LayoutAMma2 = typename platform::conditional<
+                                (kSideModeA == SideMode::kLeft), 
+                                typename layout::LayoutTranspose<LayoutA>::type, 
+                                LayoutA
+                              >::type;
+  using LayoutBMma2 = typename platform::conditional<
+                                (kSideModeA == SideMode::kLeft), 
+                                LayoutB, 
+                                typename layout::LayoutTranspose<LayoutB>::type
+                              >::type; 
+	using Mma2 = typename cutlass::gemm::threadblock::DefaultMultistageTrmmComplex<
+			ElementA, LayoutAMma2, 
+			ElementB, LayoutBMma2, 
+			kSideModeA, InvertFillMode<kFillModeA>::mode, kDiagTypeMma2, 
+			ElementAccumulator, layout::RowMajor, 
+			arch::OpClassTensorOp, arch::Sm80,
+			ThreadblockShape, WarpShape, InstructionShape,
+			Stages, TransformA, TransformB, Operator>::ThreadblockMma;
+
+  /// Define the epilogue (built from Mma1's warp-level operator)
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOp<
+          ThreadblockShape, typename Mma1::Operator, 1, EpilogueOutputOp,
+          EpilogueOutputOp::kCount, Operator>::Epilogue;
+
+  /// Define the kernel-level Symm operator.
+  using SymmKernel = kernel::SymmUniversal<Mma1, Mma2, Epilogue, ThreadblockSwizzle, kSideModeA, kFillModeA>;
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Ampere (arch::Sm80) tensor-op, row-major C, complex datatype (hermitian)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Side Mode for A (kLeft or kRight)
+    SideMode kSideModeA,
+    /// Fill Mode for A (kLower or kUpper)
+    FillMode kFillModeA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial>
+struct DefaultSymmComplex<
+  ElementA, LayoutA, kSideModeA, kFillModeA, ElementB, LayoutB, ElementC, 
+  layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
+  arch::Sm80, ThreadblockShape, WarpShape, InstructionShape, 
+  EpilogueOutputOp, ThreadblockSwizzle, Stages, 
+  Operator, SplitKSerial, BlasMode::kHermitian> {
+
+  static BlasMode const kBlasMode = BlasMode::kHermitian;
+
+
+  /// Define the threadblock-scoped triangular matrix multiply-accumulate
+  /// TRMM - with diagonal: alpha * A * B or alpha * B * A
+	static const DiagType kDiagTypeMma1 = DiagType::kNonUnit;
+  static ComplexTransform const TransformAMma1 = ComplexTransform::kNone; 
+  static ComplexTransform const TransformBMma1 = ComplexTransform::kNone; 
+  using Mma1 = typename cutlass::gemm::threadblock::DefaultMultistageTrmmComplex<
+      ElementA, LayoutA, 
+      ElementB, LayoutB, 
+      kSideModeA, kFillModeA, kDiagTypeMma1, 
+      ElementAccumulator, layout::RowMajor, 
+      arch::OpClassTensorOp, arch::Sm80,
+      ThreadblockShape, WarpShape, InstructionShape,
+      Stages, TransformAMma1, TransformBMma1, Operator, BlasMode::kHermitian>::ThreadblockMma;
+
+  /// Define the threadblock-scoped triangular matrix multiply-accumulate
+  /// TRMM - withOUT diagonal - with conjugate transpose: alpha * AT * B or alpha * B * AT
+	static const DiagType kDiagTypeMma2 = DiagType::kZero;
+  using LayoutAMma2 = typename platform::conditional<
+                                (kSideModeA == SideMode::kLeft), 
+                                typename layout::LayoutTranspose<LayoutA>::type, 
+                                LayoutA
+                              >::type;
+  using LayoutBMma2 = typename platform::conditional<
+                                (kSideModeA == SideMode::kLeft), 
+                                LayoutB, 
+                                typename layout::LayoutTranspose<LayoutB>::type
+                              >::type;
+  static ComplexTransform const TransformAMma2 = (kSideModeA == SideMode::kLeft) ? 
+                                              ComplexTransform::kConjugate : ComplexTransform::kNone;
+  static ComplexTransform const TransformBMma2 = (kSideModeA == SideMode::kLeft) ? 
+                                              ComplexTransform::kNone : ComplexTransform::kConjugate;
+
+	using Mma2 = typename cutlass::gemm::threadblock::DefaultMultistageTrmmComplex<
+			ElementA, LayoutAMma2, 
+			ElementB, LayoutBMma2, 
+			kSideModeA, InvertFillMode<kFillModeA>::mode, kDiagTypeMma2, 
+			ElementAccumulator, layout::RowMajor, 
+			arch::OpClassTensorOp, arch::Sm80,
+			ThreadblockShape, WarpShape, InstructionShape,
+			Stages, TransformAMma2, TransformBMma2, Operator>::ThreadblockMma;
+
+  /// Define the epilogue
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOp<
+          ThreadblockShape, typename Mma1::Operator, 1, EpilogueOutputOp,
+          EpilogueOutputOp::kCount, Operator>::Epilogue;
+
+  /// Define the kernel-level Symm operator.
+  using SymmKernel = kernel::SymmUniversal<Mma1, Mma2, Epilogue, ThreadblockSwizzle, kSideModeA, kFillModeA>;
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_symm_universal.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_symm_universal.h
new file mode 100755
index 000000000..ac0da25d1
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_symm_universal.h
@@ -0,0 +1,342 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+      Default kernel-level SYMM/HEMM definitions combine threadblock-scoped matrix multiply-add with
+      the appropriate threadblock-scoped epilogue.
+  
+      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
+      accommodated by exchanging A and B operands and assuming transposed layouts.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+
+#include "cutlass/complex.h"
+#include "cutlass/layout/matrix.h"
+
+#include "cutlass/gemm/kernel/symm_universal.h"
+#include "cutlass/gemm/kernel/default_symm.h"
+#include "cutlass/gemm/kernel/default_symm_complex.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Primary template, declared but not defined; the partial specializations below select on whether ElementAccumulator is complex.
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Side Mode for A (kLeft or kRight)
+    SideMode SideModeA,
+    /// Fill Mode for A (kLower or kUpper)
+    FillMode FillModeA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by SYMM/HEMM
+    typename Operator,
+    /// Blas3 computation mode (symmetric/hermitian)
+    BlasMode BlasMode_ = BlasMode::kSymmetric,
+    /// SFINAE slot: the specializations fill it with platform::enable_if on is_complex<ElementAccumulator>
+    typename Enable = void
+    >
+struct DefaultSymmUniversal;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Real-valued SYMM/HEMM update kernels
+//
+/// Partial specialization for non-complex ElementAccumulator with BlasMode::kSymmetric; delegates to kernel::DefaultSymm.
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Side Mode for A (kLeft or kRight)
+    SideMode SideModeA,
+    /// Fill Mode for A (kLower or kUpper)
+    FillMode FillModeA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Layout type for C and D matrix operands
+    typename LayoutC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by SYMM/HEMM
+    typename Operator>
+struct DefaultSymmUniversal<
+  ElementA,
+  LayoutA,
+  SideModeA,
+  FillModeA,
+  kAlignmentA,
+  ElementB,
+  LayoutB,
+  kAlignmentB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  OperatorClass,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  SplitKSerial,
+  Operator,
+  BlasMode::kSymmetric,
+  typename platform::enable_if< ! cutlass::is_complex<ElementAccumulator>::value>::type
+> {
+
+  using DefaultSymmkernel = typename kernel::DefaultSymm<
+    ElementA,
+    LayoutA,
+    SideModeA,
+    FillModeA,
+    kAlignmentA,
+    ElementB,
+    LayoutB,
+    kAlignmentB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    SplitKSerial,
+    Operator,
+    BlasMode::kSymmetric
+  >::SymmKernel;
+
+    /// Define the universal kernel from the Mma1/Mma2/Epilogue types of the default kernel
+  using SymmKernel = kernel::SymmUniversal<
+    typename DefaultSymmkernel::Mma1,
+    typename DefaultSymmkernel::Mma2,
+    typename DefaultSymmkernel::Epilogue, 
+    ThreadblockSwizzle,
+    SideModeA,
+    FillModeA
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+//
+// Complex-valued SYMM/HEMM update kernels
+//
+/// Partial specialization for complex ElementAccumulator (any BlasMode); delegates to kernel::DefaultSymmComplex.
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Side Mode for A (kLeft or kRight)
+    SideMode SideModeA,
+    /// Fill Mode for A (kLower or kUpper)
+    FillMode FillModeA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Layout type for C and D matrix operands
+    typename LayoutC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by SYMM/HEMM
+    typename Operator,
+    /// Blas3 computation mode (symmetric/hermitian)
+    BlasMode kBlasMode
+  >
+
+struct DefaultSymmUniversal<
+  ElementA,
+  LayoutA,
+  SideModeA,
+  FillModeA, 
+  kAlignmentA,
+  ElementB,
+  LayoutB,
+  kAlignmentB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  OperatorClass,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  SplitKSerial,
+  Operator,
+  kBlasMode,
+  typename platform::enable_if<cutlass::is_complex<ElementAccumulator>::value>::type
+> {
+
+  using DefaultSymmkernel = typename kernel::DefaultSymmComplex<
+    ElementA,
+    LayoutA,
+    SideModeA,
+    FillModeA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    Operator,
+    SplitKSerial,
+    kBlasMode
+  >::SymmKernel;
+
+    /// Define the universal kernel from the Mma1/Mma2/Epilogue types of the default kernel
+  using SymmKernel = kernel::SymmUniversal<
+    typename DefaultSymmkernel::Mma1,
+    typename DefaultSymmkernel::Mma2,
+    typename DefaultSymmkernel::Epilogue, 
+    ThreadblockSwizzle,
+    SideModeA,
+    FillModeA
+  >;
+};
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_trmm.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_trmm.h
new file mode 100755
index 000000000..3380eee37
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_trmm.h
@@ -0,0 +1,269 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+// 
+/*! \file
+    \brief 
+      Default kernel-level TRMM definitions combine threadblock-scoped matrix multiply-add with
+      the appropriate threadblock-scoped epilogue.
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/arch/wmma.h"
+
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/kernel/trmm_universal.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+#include "cutlass/gemm/threadblock/default_mma.h"
+#include "cutlass/gemm/threadblock/default_trmm.h"
+#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+#include "cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h"
+#endif //CUTLASS_ARCH_WMMA_ENABLED
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+////////////////////////////////////////////////////////////////////////////////
+/// Primary template, declared but not defined; the partial specializations below target arch::Sm90 and arch::Sm80.
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Side Mode for the kernel
+    SideMode SideMode_,
+    /// Fill Mode for the triangular matrix
+    FillMode FillMode_,
+    /// Diag Type for the triangular matrix
+    DiagType DiagType_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator>
+struct DefaultTrmm;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Hopper Architecture (arch::Sm90, tensor-op, row-major C)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Side Mode for the kernel
+    SideMode kSideMode,
+    /// Fill Mode for the triangular matrix
+    FillMode kFillMode,
+    /// Diag Type for the triangular matrix
+    DiagType kDiagType,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator>
+struct DefaultTrmm<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
+                   kSideMode, kFillMode, kDiagType, ElementC,
+                   layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
+                   arch::Sm90, ThreadblockShape, WarpShape, InstructionShape,
+                   EpilogueOutputOp, ThreadblockSwizzle, Stages, SplitKSerial,
+                   Operator> {
+                    
+  /// Define the threadblock-scoped triangular matrix multiply-accumulate
+  using Mma = typename cutlass::gemm::threadblock::DefaultTrmm<
+      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
+      kSideMode, kFillMode, kDiagType, 
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm90,
+      ThreadblockShape, WarpShape, InstructionShape, Stages,
+      Operator>::ThreadblockMma;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  /// Define the epilogue
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
+          ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
+          EpilogueOutputOp::kCount>::Epilogue;
+
+  /// Define the kernel-level TRMM operator.
+  using TrmmKernel = kernel::TrmmUniversal<Mma, Epilogue, ThreadblockSwizzle, kSideMode, kFillMode, kDiagType>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Ampere Architecture (arch::Sm80, tensor-op, row-major C)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Side Mode for the kernel
+    SideMode kSideMode,
+    /// Fill Mode for the triangular matrix
+    FillMode kFillMode,
+    /// Diag Type for the triangular matrix
+    DiagType kDiagType,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator>
+struct DefaultTrmm<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
+                   kSideMode, kFillMode, kDiagType, ElementC,
+                   layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
+                   arch::Sm80, ThreadblockShape, WarpShape, InstructionShape,
+                   EpilogueOutputOp, ThreadblockSwizzle, Stages, SplitKSerial,
+                   Operator> {
+                    
+  /// Define the threadblock-scoped triangular matrix multiply-accumulate
+  using Mma = typename cutlass::gemm::threadblock::DefaultTrmm<
+      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
+      kSideMode, kFillMode, kDiagType, 
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80,
+      ThreadblockShape, WarpShape, InstructionShape, Stages,
+      Operator>::ThreadblockMma;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  /// Define the epilogue
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
+          ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
+          EpilogueOutputOp::kCount>::Epilogue;
+
+  /// Define the kernel-level TRMM operator.
+  using TrmmKernel = kernel::TrmmUniversal<Mma, Epilogue, ThreadblockSwizzle, kSideMode, kFillMode, kDiagType>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_trmm_complex.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_trmm_complex.h
new file mode 100755
index 000000000..c5cba8fb4
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_trmm_complex.h
@@ -0,0 +1,265 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+      Default kernel-level TRMM definitions combine threadblock-scoped matrix multiply-add with
+      the appropriate threadblock-scoped epilogue.
+  
+      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
+      accommodated by exchanging A and B operands and assuming transposed layouts.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+
+#include "cutlass/layout/matrix.h"
+
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/kernel/trmm_universal.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
+#include "cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h"
+#include "cutlass/gemm/threadblock/default_mma.h"
+#include "cutlass/gemm/threadblock/default_multistage_trmm_complex.h"
+#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
+
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+  /// Element type for A matrix operand
+  typename ElementA_,
+  /// Layout type for A matrix operand
+  typename LayoutA_,
+  /// Element type for B matrix operand
+  typename ElementB_,
+  /// Layout type for B matrix operand
+  typename LayoutB_,
+  /// Side Mode for the kernel
+  SideMode SideMode_,
+  /// Fill Mode for the triangular matrix
+  FillMode FillMode_,
+  /// Diag Type for the triangular matrix
+  DiagType DiagType_,
+  /// Element type for C and D matrix operands
+  typename ElementC_,
+  /// Layout type for C and D matrix operands
+  typename LayoutC_,
+  /// Element type for internal accumulation
+  typename ElementAccumulator,
+  /// Operator class tag
+  typename OperatorClass,
+  /// Tag indicating architecture to tune for
+  typename ArchTag,
+  /// Threadblock-level tile size (concept: GemmShape)
+  typename ThreadblockShape,
+  /// Warp-level tile size (concept: GemmShape)
+  typename WarpShape,
+  /// Instruction-level tile size (concept: GemmShape)
+  typename InstructionShape,
+  /// Epilogue output operator
+  typename EpilogueOutputOp,
+  /// Threadblock-level swizzling operator
+  typename ThreadblockSwizzle,
+  /// Number of stages used in the pipelined mainloop
+  int Stages,
+  /// Complex elementwise transformation on A operand
+  ComplexTransform TransformA,
+  /// Complex elementwise transformation on B operand
+  ComplexTransform TransformB,
+  /// Multiply-add operator
+  /// (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
+  typename Operator,
+  /// If true, kernel is configured to support serial reduction in the epilogue
+  bool SplitKSerial
+>
+struct DefaultTrmmComplex;  // Primary template: declaration only; definitions come from the partial specializations.
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Hopper Architecture (arch::Sm90, arch::OpClassTensorOp, row-major C/D)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Side Mode for the kernel
+    SideMode kSideMode,
+    /// Fill Mode for the triangular matrix
+    FillMode kFillMode,
+    /// Diag Type for the triangular matrix
+    DiagType kDiagType,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB,
+    /// Multiply-add operator
+    /// (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
+    typename Operator,
+    /// Serial split-K reduction flag; matched by the specialization but not referenced in its body
+    bool SplitKSerial
+  >
+struct DefaultTrmmComplex<
+  ElementA, LayoutA, ElementB, LayoutB, 
+  kSideMode, kFillMode, kDiagType,
+  ElementC, layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
+  arch::Sm90, ThreadblockShape, WarpShape, InstructionShape,
+  EpilogueOutputOp, ThreadblockSwizzle, Stages, TransformA, TransformB, Operator, SplitKSerial> {
+
+  /// Threadblock-scoped matrix multiply-accumulate, taken from DefaultMultistageTrmmComplex::ThreadblockMma
+  using Mma = typename cutlass::gemm::threadblock::DefaultMultistageTrmmComplex<
+      ElementA, LayoutA, ElementB, LayoutB, 
+      kSideMode, kFillMode, kDiagType,
+      ElementAccumulator,layout::RowMajor, arch::OpClassTensorOp, arch::Sm90, ThreadblockShape,
+      WarpShape, InstructionShape, Stages, TransformA, TransformB, Operator>::ThreadblockMma;
+
+  /// Epilogue built from the warp-level operator of Mma and the caller's output operator
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOp<
+          ThreadblockShape, typename Mma::Operator, 1, EpilogueOutputOp,  // NOTE(review): the literal 1 is presumably the K-partition count; confirm
+          EpilogueOutputOp::kCount, Operator>::Epilogue;
+
+  /// Define the kernel-level TRMM operator.
+  using TrmmKernel = kernel::TrmmUniversal<Mma, Epilogue, ThreadblockSwizzle, kSideMode, kFillMode, kDiagType>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Ampere Architecture (arch::Sm80, arch::OpClassTensorOp, row-major C/D)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Side Mode for the kernel
+    SideMode kSideMode,
+    /// Fill Mode for the triangular matrix
+    FillMode kFillMode,
+    /// Diag Type for the triangular matrix
+    DiagType kDiagType,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB,
+    /// Multiply-add operator
+    /// (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
+    typename Operator,
+    /// Serial split-K reduction flag; matched by the specialization but not referenced in its body
+    bool SplitKSerial
+  >
+struct DefaultTrmmComplex<
+  ElementA, LayoutA, ElementB, LayoutB, 
+  kSideMode, kFillMode, kDiagType,
+  ElementC, layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
+  arch::Sm80, ThreadblockShape, WarpShape, InstructionShape,
+  EpilogueOutputOp, ThreadblockSwizzle, Stages, TransformA, TransformB, Operator, SplitKSerial> {
+
+  /// Threadblock-scoped matrix multiply-accumulate, taken from DefaultMultistageTrmmComplex::ThreadblockMma
+  using Mma = typename cutlass::gemm::threadblock::DefaultMultistageTrmmComplex<
+      ElementA, LayoutA, ElementB, LayoutB, 
+      kSideMode, kFillMode, kDiagType,
+      ElementAccumulator,layout::RowMajor, arch::OpClassTensorOp, arch::Sm80, ThreadblockShape,
+      WarpShape, InstructionShape, Stages, TransformA, TransformB, Operator>::ThreadblockMma;
+
+  /// Epilogue built from the warp-level operator of Mma and the caller's output operator
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOp<
+          ThreadblockShape, typename Mma::Operator, 1, EpilogueOutputOp,  // NOTE(review): the literal 1 is presumably the K-partition count; confirm
+          EpilogueOutputOp::kCount, Operator>::Epilogue;
+
+  /// Define the kernel-level TRMM operator.
+  using TrmmKernel = kernel::TrmmUniversal<Mma, Epilogue, ThreadblockSwizzle, kSideMode, kFillMode, kDiagType>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_trmm_universal.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_trmm_universal.h
new file mode 100755
index 000000000..e06e15ca3
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_trmm_universal.h
@@ -0,0 +1,359 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+      Default kernel-level TRMM definitions combine threadblock-scoped matrix multiply-add with
+      the appropriate threadblock-scoped epilogue.
+  
+      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
+      accommodated by exchanging A and B operands and assuming transposed layouts.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+
+#include "cutlass/complex.h"
+#include "cutlass/layout/matrix.h"
+
+#include "cutlass/gemm/kernel/trmm_universal.h"
+#include "cutlass/gemm/kernel/default_trmm.h"
+#include "cutlass/gemm/kernel/default_trmm_complex.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Side Mode for the kernel
+    SideMode kSideMode,
+    /// Fill Mode for the triangular matrix
+    FillMode kFillMode,
+    /// Diag Type for the triangular matrix
+    DiagType kDiagType,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by TRMM
+    typename Operator,
+    /// SFINAE slot: the specializations put an enable_if on is_complex<ElementAccumulator> here
+    typename Enable = void
+    >
+struct DefaultTrmmUniversal;  // Primary template: declaration only; the real and complex specializations define it.
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Real-valued TRMM kernels: chosen when ElementAccumulator is not complex and both operand
+// transforms are ComplexTransform::kNone. Mma and Epilogue are taken from kernel::DefaultTrmm.
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Side Mode for the kernel
+    SideMode kSideMode,
+    /// Fill Mode for the triangular matrix
+    FillMode kFillMode,
+    /// Diag Type for the triangular matrix
+    DiagType kDiagType,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Layout type for C and D matrix operands
+    typename LayoutC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by TRMM
+    typename Operator>
+struct DefaultTrmmUniversal<
+  ElementA,
+  LayoutA,
+  ComplexTransform::kNone,   // transform A
+  kAlignmentA,
+  ElementB,
+  LayoutB,
+  ComplexTransform::kNone,   // transform B
+  kAlignmentB,
+  kSideMode,
+  kFillMode,
+  kDiagType,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  OperatorClass,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  SplitKSerial,
+  Operator,
+  typename platform::enable_if< ! cutlass::is_complex<ElementAccumulator>::value>::type  // enabled only for non-complex accumulators
+> {
+  /// Kernel selected by DefaultTrmm; only its Mma and Epilogue member types are used below
+  using DefaultTrmmKernel = typename kernel::DefaultTrmm<
+    ElementA,
+    LayoutA,
+    kAlignmentA,
+    ElementB,
+    LayoutB,
+    kAlignmentB,
+    kSideMode,
+    kFillMode,
+    kDiagType,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    SplitKSerial,
+    Operator
+  >::TrmmKernel;
+
+  /// Define the kernel in terms of the default kernel
+  using TrmmKernel = kernel::TrmmUniversal<
+    typename DefaultTrmmKernel::Mma,
+    typename DefaultTrmmKernel::Epilogue, 
+    ThreadblockSwizzle,
+    kSideMode,
+    kFillMode,
+    kDiagType
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+//
+// Complex-valued TRMM kernels: chosen when ElementAccumulator is complex. Mma and Epilogue
+// are taken from kernel::DefaultTrmmComplex, which also receives TransformA and TransformB.
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Side Mode for the kernel
+    SideMode kSideMode,
+    /// Fill Mode for the triangular matrix
+    FillMode kFillMode,
+    /// Diag Type for the triangular matrix
+    DiagType kDiagType,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Layout type for C and D matrix operands
+    typename LayoutC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by TRMM
+    typename Operator
+  >
+struct DefaultTrmmUniversal<
+  ElementA,
+  LayoutA,
+  TransformA,
+  kAlignmentA,
+  ElementB,
+  LayoutB,
+  TransformB,
+  kAlignmentB,
+  kSideMode,
+  kFillMode,
+  kDiagType,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  OperatorClass,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  SplitKSerial,
+  Operator,
+  typename platform::enable_if<cutlass::is_complex<ElementAccumulator>::value>::type  // enabled only for complex accumulators
+> {
+  /// Kernel selected by DefaultTrmmComplex; kAlignmentA and kAlignmentB are not forwarded to it
+  using DefaultTrmmKernel = typename kernel::DefaultTrmmComplex<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    kSideMode,
+    kFillMode,
+    kDiagType,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    TransformA,
+    TransformB,
+    Operator,
+    SplitKSerial
+  >::TrmmKernel;
+
+  /// Define the kernel in terms of the default kernel
+  using TrmmKernel = kernel::TrmmUniversal<
+    typename DefaultTrmmKernel::Mma,
+    typename DefaultTrmmKernel::Epilogue, 
+    ThreadblockSwizzle,
+    kSideMode,
+    kFillMode,
+    kDiagType
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/ell_gemm.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/ell_gemm.h
new file mode 100755
index 000000000..7cd619802
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/ell_gemm.h
@@ -0,0 +1,824 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Template for a Block-Ell sparse gemm kernel.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/semaphore.h"
+#include "cutlass/arch/arch.h"
+
+#include "cutlass/transform/threadblock/ell_iterator.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate
+  typename Epilogue_,             ///! Epilogue
+  typename ThreadblockSwizzle_,   ///! Threadblock swizzling function
+  bool SplitKSerial,              ///! If true, code supporting split-K via serial reduction is enabled.
+  bool IsASparse                  ///! If true, A is sparse matrix
+>
+struct EllGemm {
+
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using OutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  static bool const kSplitKSerial = SplitKSerial;
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;
+
+  /// Kernel parameters: problem and grid extents, operand references, ELL index data and K-partition sizing
+  struct Params {
+    cutlass::gemm::GemmCoord problem_size{};
+    cutlass::gemm::GemmCoord grid_tiled_shape{};
+    int swizzle_log_tile{0};
+    typename Mma::IteratorA::Params params_A{};
+    typename Mma::IteratorA::TensorRef ref_A{};
+    typename Mma::IteratorB::Params params_B{};
+    typename Mma::IteratorB::TensorRef ref_B{};
+    typename Epilogue::OutputTileIterator::Params params_C{};
+    typename Epilogue::OutputTileIterator::TensorRef ref_C{};
+    typename Epilogue::OutputTileIterator::Params params_D{};
+    typename Epilogue::OutputTileIterator::TensorRef ref_D{};
+    typename OutputOp::Params output_op{};
+    int *semaphore = nullptr;      // set from the constructor's `workspace` argument
+    int gemm_k_iterations{0};      // Mma::Shape::kK-wide K tiles per grid_tiled_shape.k() partition
+    int gemm_k_size{0};            // gemm_k_iterations * Mma::Shape::kK
+    const int* ell_idx = nullptr;
+    int ell_ncol{0};
+    int ell_blocksize{0};
+    int ell_base_idx{0};
+
+    //
+    // Methods
+    //
+    Params() = default;
+    /// Builds the parameters from the problem description, operand references and ELL index data.
+    CUTLASS_HOST_DEVICE
+    Params(
+      cutlass::gemm::GemmCoord const & problem_size,
+      cutlass::gemm::GemmCoord const & grid_tiled_shape,
+      typename Mma::IteratorA::TensorRef ref_A,
+      typename Mma::IteratorB::TensorRef ref_B,
+      typename Epilogue::OutputTileIterator::TensorRef ref_C,
+      typename Epilogue::OutputTileIterator::TensorRef ref_D,
+      const int* ell_idx,
+      int ell_ncol,
+      int ell_blocksize,
+      int ell_base_idx,
+      typename OutputOp::Params output_op = typename OutputOp::Params(),
+      int *workspace = nullptr
+    ):
+      problem_size(problem_size),
+      grid_tiled_shape(grid_tiled_shape),
+      swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)),
+      params_A(ref_A.layout()),
+      ref_A(ref_A),
+      params_B(ref_B.layout()),
+      ref_B(ref_B),
+      params_C(ref_C.layout()),
+      ref_C(ref_C),
+      params_D(ref_D.layout()),
+      ref_D(ref_D),
+      output_op(output_op),
+      ell_idx(ell_idx),
+      ell_ncol(ell_ncol),
+      ell_blocksize(ell_blocksize),
+      ell_base_idx(ell_base_idx)
+    {
+      // Count the Mma::Shape::kK-wide tiles along K, then divide them across the
+      // grid_tiled_shape.k() partitions, rounding up. The result is stored in the member:
+      // declaring a local with the same name would shadow it and leave the member at zero.
+      int total_gemm_k_iterations = (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK;
+      gemm_k_iterations = (total_gemm_k_iterations + grid_tiled_shape.k() - 1) / grid_tiled_shape.k();
+      gemm_k_size = gemm_k_iterations * Mma::Shape::kK;
+      semaphore = workspace;
+    }
+  };
+
+  /// Shared memory storage structure
+  struct SharedStorage {
+    union{  // main_loop and epilogue occupy the same storage
+      typename Mma::SharedStorage main_loop;
+      typename Epilogue::SharedStorage epilogue;
+    };
+    typename cutlass::transform::threadblock::ell::SharedStorage ell;  // outside the union, so it does not alias the two above
+  };
+
+  //
+  // Methods
+  //
+  EllGemm() = default;
+
+  /// Checks that every operand reference and every problem extent meets the alignment the
+  /// iterators require; returns Status::kErrorMisalignedOperand if any check fails.
+  static Status can_implement(
+      cutlass::gemm::GemmCoord const & problem_size,
+      typename Mma::IteratorA::TensorRef ref_A,
+      typename Mma::IteratorB::TensorRef ref_B,
+      typename Epilogue::OutputTileIterator::TensorRef ref_C,
+      typename Epilogue::OutputTileIterator::TensorRef ref_D) {
+
+    using LayoutA = typename Mma::IteratorA::Layout;
+    using LayoutB = typename Mma::IteratorB::Layout;
+
+    // Interleaved layouts use their interleave factor (32 or 64) as the alignment;
+    // every other layout uses the element count of the iterator's access type.
+    static bool const kIsA32 = platform::is_same<LayoutA, layout::ColumnMajorInterleaved<32>>::value;
+    static bool const kIsA64 = platform::is_same<LayoutA, layout::ColumnMajorInterleaved<64>>::value;
+    static bool const kIsB32 = platform::is_same<LayoutB, layout::RowMajorInterleaved<32>>::value;
+    static bool const kIsB64 = platform::is_same<LayoutB, layout::RowMajorInterleaved<64>>::value;
+
+    static int const kAlignmentA =
+        kIsA32 ? 32 : (kIsA64 ? 64 : Mma::IteratorA::AccessType::kElements);
+    static int const kAlignmentB =
+        kIsB32 ? 32 : (kIsB64 ? 64 : Mma::IteratorB::AccessType::kElements);
+    static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+
+    // All four tensor references must pass TensorRef_aligned for their operand's alignment.
+    bool const refs_aligned =
+        TensorRef_aligned(ref_A, kAlignmentA) &&
+        TensorRef_aligned(ref_B, kAlignmentB) &&
+        TensorRef_aligned(ref_C, kAlignmentC) &&
+        TensorRef_aligned(ref_D, kAlignmentC);
+
+    if (!refs_aligned) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    // Each extent must be a multiple of the alignment of every operand it is checked against:
+    // M and K against A, N and K against B, M and N against C/D.
+    bool const extents_aligned =
+        (problem_size.m() % kAlignmentA == 0) && (problem_size.k() % kAlignmentA == 0) &&
+        (problem_size.n() % kAlignmentB == 0) && (problem_size.k() % kAlignmentB == 0) &&
+        (problem_size.m() % kAlignmentC == 0) && (problem_size.n() % kAlignmentC == 0);
+
+    if (!extents_aligned) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    return Status::kSuccess;
+  }
+
+  /// Executes one GEMM: ELL-indexed main loop (skipped when ell_ncol <= 0) followed by the epilogue
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Compute threadblock location
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if CTA is out of range
+    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
+      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
+
+      return;
+    }
+    // Split the M tile index into (ELL block, tile inside that block); the ceil-division counts partial tiles
+    int tile_in_ell_block = (params.ell_blocksize + Mma::Shape::kM - 1 ) / Mma::Shape::kM;
+    int ell_block_offset_m = threadblock_tile_offset.m() / tile_in_ell_block;
+    int tile_offset_m = threadblock_tile_offset.m() % tile_in_ell_block;
+
+    // Compute position within threadblock
+    int thread_idx = threadIdx.x;
+
+    // Broadcast the warp_id computed by lane 0 to ensure dependent code
+    // is compiled as warp-uniform.
+    int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
+    int lane_idx = threadIdx.x % 32;
+
+    typename Mma::FragmentC accumulators;
+
+    accumulators.clear();
+
+    // Skip the main loop when the ELL matrix has no columns; the accumulators then stay cleared
+    if (params.ell_ncol > 0) {
+
+      // Compute initial location in logical coordinates
+      cutlass::MatrixCoord tb_offset_A{
+        ell_block_offset_m * params.ell_blocksize
+        + tile_offset_m * Mma::Shape::kM,
+        threadblock_tile_offset.k() * params.gemm_k_size
+      };
+
+      cutlass::MatrixCoord tb_offset_B{
+        threadblock_tile_offset.k() * params.gemm_k_size,
+        threadblock_tile_offset.n() * Mma::Shape::kN
+      };
+      // Offset into ell_idx for this ELL block row: rows are (ell_ncol / ell_blocksize) entries apart
+      int ell_idx_start =
+        (threadblock_tile_offset.m() / tile_in_ell_block) *
+        (params.ell_ncol / params.ell_blocksize);
+      const int* ell_idx_ptr = &(params.ell_idx[ell_idx_start]);
+
+      // Problem size is a function of threadblock index in the K dimension
+      int problem_size_k = min(
+        params.problem_size.k(),
+        (threadblock_tile_offset.k() + 1) * params.gemm_k_size);
+      problem_size_k = min(problem_size_k, params.ell_ncol);  // K extent is also capped by ell_ncol
+
+      // Number of kK-deep main-loop iterations for this K partition (ceil division)
+      int gemm_k_iterations =
+        (problem_size_k - tb_offset_A.column() + Mma::Shape::kK - 1) / Mma::Shape::kK;
+
+      // Construct iterators to A and B operands
+      typename Mma::IteratorA iterator_A(
+        params.params_A,
+        params.ref_A.data(),
+        {params.problem_size.m(), problem_size_k},
+        thread_idx,
+        tb_offset_A);
+
+      typename Mma::IteratorB iterator_B(
+        params.params_B,
+        params.ref_B.data(),
+        {problem_size_k, params.problem_size.n()},
+        thread_idx,
+        tb_offset_B);
+
+      // Define coef for ELL index depending on LayoutB
+      int ell_stride = iterator_B.get_stride();
+
+      typename cutlass::transform::threadblock::ell::Iterator ell_iterator(
+        shared_storage.ell,
+        ell_idx_ptr,
+        params.ell_blocksize,
+        params.ell_base_idx,
+        Mma::Shape::kK,
+        problem_size_k,
+        ell_stride,
+        thread_idx
+      );
+
+      //
+      // Main loop
+      //
+
+      // Construct thread-scoped matrix multiply
+      Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
+
+      if (!kSplitKSerial || gemm_k_iterations > 0) {
+        // check if index computations can be skipped
+        static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
+        static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
+        static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+        constexpr bool is_double = (sizeof(typename Mma::IteratorA::Element) == 8);  // 'typename' required: dependent type name
+        constexpr bool is_multiple_alignment =
+          (kAlignmentA > 1) && (kAlignmentB > 1) && (kAlignmentC > 1);
+        const bool is_specialized_blocksize =
+          ((params.ell_blocksize) & (params.ell_blocksize-1)) == 0
+          && params.ell_blocksize >= Mma::Shape::kK;
+        // Compute threadblock-scoped matrix multiply-add; the second flag is true only for a power-of-two blocksize >= kK
+        if ((is_double || is_multiple_alignment) && is_specialized_blocksize) {
+          mma.operator()<true, true>(
+              gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators, ell_iterator);
+        }
+        else {
+          mma.operator()<true, false>(
+              gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators, ell_iterator);
+        }
+      }
+    } // if (params.ell_ncol > 0)
+
+    //
+    // Epilogue
+    //
+
+    OutputOp output_op(params.output_op);
+
+    //
+    // Masked tile iterators constructed from members
+    //
+
+    threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    ell_block_offset_m = threadblock_tile_offset.m() / tile_in_ell_block;
+    tile_offset_m = threadblock_tile_offset.m() % tile_in_ell_block;
+
+    // assume identity swizzle
+    MatrixCoord threadblock_offset(
+      ell_block_offset_m * params.ell_blocksize
+      + tile_offset_m * Mma::Shape::kM,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    );
+
+    // Clamp the extent to the problem size and (in M) to the end of this ELL block to avoid out-of-bounds accesses
+    MatrixCoord threadblock_extent(
+      min(params.problem_size.m(),
+         ell_block_offset_m * params.ell_blocksize
+         + min((tile_offset_m + 1) * Mma::Shape::kM, params.ell_blocksize)),
+      min(params.problem_size.n(),
+        (threadblock_tile_offset.n()+1) * Mma::Shape::kN)
+    );
+
+    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
+
+    // Construct the semaphore.
+    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
+
+    // If performing a reduction via split-K, fetch the initial synchronization
+    if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
+
+      // Fetch the synchronization lock initially but do not block.
+      semaphore.fetch();
+
+      // Indicate which position in a serial reduction the output operator is currently updating
+      output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
+    }
+
+    // Tile iterator loading from source tensor.
+    typename Epilogue::OutputTileIterator iterator_C(
+      params.params_C,
+      params.ref_C.data(),
+      threadblock_extent,
+      thread_idx,
+      threadblock_offset
+    );
+
+    // Tile iterator writing to destination tensor.
+    typename Epilogue::OutputTileIterator iterator_D(
+      params.params_D,
+      params.ref_D.data(),
+      threadblock_extent,
+      thread_idx,
+      threadblock_offset
+    );
+
+    Epilogue epilogue(
+      shared_storage.epilogue,
+      thread_idx,
+      warp_idx,
+      lane_idx);
+
+    // Wait on the semaphore - this latency may have been covered by iterator construction
+    if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
+
+      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
+      if (threadblock_tile_offset.k()) {
+        iterator_C = iterator_D;
+      }
+
+      semaphore.wait(threadblock_tile_offset.k());
+    }
+
+    // Execute the epilogue operator to update the destination tensor.
+    epilogue(output_op, iterator_D, accumulators, iterator_C);
+
+    //
+    // Release the semaphore
+    //
+
+    if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
+
+      int lock = 0;
+      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
+
+        // The final threadblock resets the semaphore for subsequent grids.
+        lock = 0;
+      }
+      else {
+        // Otherwise, the semaphore is incremented
+        lock = threadblock_tile_offset.k() + 1;
+      }
+
+      semaphore.release(lock);
+    }
+  }
+};
+
+// Partial specialization for the case where B is sparse (last template argument is false)
+template <
+  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate
+  typename Epilogue_,             ///! Epilogue
+  typename ThreadblockSwizzle_,   ///! Threadblock swizzling function
+  bool SplitKSerial               ///! If true, code supporting split-K via serial reduction is enabled.
+>
+struct EllGemm<Mma_, Epilogue_, ThreadblockSwizzle_, SplitKSerial, false> {
+
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using OutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  static bool const kSplitKSerial = SplitKSerial;
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;
+
+  /// Parameters structure
+  struct Params {
+    cutlass::gemm::GemmCoord problem_size{};
+    cutlass::gemm::GemmCoord grid_tiled_shape{};
+    int swizzle_log_tile{0};
+    typename Mma::IteratorA::Params params_A{};
+    typename Mma::IteratorA::TensorRef ref_A{};
+    typename Mma::IteratorB::Params params_B{};
+    typename Mma::IteratorB::TensorRef ref_B{};
+    typename Epilogue::OutputTileIterator::Params params_C{};
+    typename Epilogue::OutputTileIterator::TensorRef ref_C{};
+    typename Epilogue::OutputTileIterator::Params params_D{};
+    typename Epilogue::OutputTileIterator::TensorRef ref_D{};
+    typename OutputOp::Params output_op{};
+    int *semaphore = nullptr;
+    int gemm_k_iterations{0};  // NOTE(review): never assigned below (a same-named constructor local shadows it) - confirm whether it is used
+    int gemm_k_size{0};
+    const int* ell_idx = nullptr;
+    int ell_ncol{0};
+    int ell_blocksize{0};
+    int ell_base_idx{0};
+
+    //
+    // Methods
+    //
+    Params() = default;
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      cutlass::gemm::GemmCoord const & problem_size,
+      cutlass::gemm::GemmCoord const & grid_tiled_shape,
+      typename Mma::IteratorA::TensorRef ref_A,
+      typename Mma::IteratorB::TensorRef ref_B,
+      typename Epilogue::OutputTileIterator::TensorRef ref_C,
+      typename Epilogue::OutputTileIterator::TensorRef ref_D,
+      const int* ell_idx,
+      int ell_ncol,
+      int ell_blocksize,
+      int ell_base_idx,
+      typename OutputOp::Params output_op = typename OutputOp::Params(),
+      int *workspace = nullptr
+    ):
+      problem_size(problem_size),
+      grid_tiled_shape(grid_tiled_shape),
+      swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)),
+      params_A(ref_A.layout()),
+      ref_A(ref_A),
+      params_B(ref_B.layout()),
+      ref_B(ref_B),
+      params_C(ref_C.layout()),
+      ref_C(ref_C),
+      params_D(ref_D.layout()),
+      ref_D(ref_D),
+      output_op(output_op),
+      ell_idx(ell_idx),
+      ell_ncol(ell_ncol),
+      ell_blocksize(ell_blocksize),
+      ell_base_idx(ell_base_idx)
+    {
+      // Split the kK-deep iterations evenly (rounding up) across the K partitions of the grid
+      int total_gemm_k_iterations = (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK;
+      int gemm_k_iterations = (total_gemm_k_iterations + grid_tiled_shape.k() - 1) / grid_tiled_shape.k();
+
+      gemm_k_size = gemm_k_iterations * Mma::Shape::kK;
+
+      semaphore = workspace;
+    }
+  };
+
+  /// Shared memory storage: main_loop and epilogue overlap in a union; the ELL storage is a separate member
+  struct SharedStorage {
+    union{
+      typename Mma::SharedStorage main_loop;
+      typename Epilogue::SharedStorage epilogue;
+    };
+    typename cutlass::transform::threadblock::ell::SharedStorage ell;
+  };
+
+  //
+  // Methods
+  //
+
+  CUTLASS_HOST_DEVICE
+  EllGemm() { }
+
+  /// Determines whether kernel satisfies alignment; returns kErrorMisalignedOperand if any pointer or extent is misaligned
+  static Status can_implement(
+    cutlass::gemm::GemmCoord const & problem_size,
+    typename Mma::IteratorA::TensorRef ref_A,
+    typename Mma::IteratorB::TensorRef ref_B,
+    typename Epilogue::OutputTileIterator::TensorRef ref_C,
+    typename Epilogue::OutputTileIterator::TensorRef ref_D) {
+    // Interleaved layouts use their interleave factor as alignment; otherwise the iterator access width
+    static int const kAlignmentA = (platform::is_same<typename Mma::IteratorA::Layout,
+                                                      layout::ColumnMajorInterleaved<32>>::value)
+                                   ? 32
+                                   : (platform::is_same<typename Mma::IteratorA::Layout,
+                                                        layout::ColumnMajorInterleaved<64>>::value)
+                                     ? 64
+                                     : Mma::IteratorA::AccessType::kElements;
+    static int const kAlignmentB =  (platform::is_same<typename Mma::IteratorB::Layout,
+                                                       layout::RowMajorInterleaved<32>>::value)
+                                   ? 32
+                                   : (platform::is_same<typename Mma::IteratorB::Layout,
+                                                        layout::RowMajorInterleaved<64>>::value)
+                                     ? 64
+                                     : Mma::IteratorB::AccessType::kElements;
+    static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+
+    if (!TensorRef_aligned(ref_A, kAlignmentA)) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (!TensorRef_aligned(ref_B, kAlignmentB)) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (!TensorRef_aligned(ref_C, kAlignmentC)) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (!TensorRef_aligned(ref_D, kAlignmentC)) {
+      return Status::kErrorMisalignedOperand;
+    }
+    // The problem extents must also be multiples of the corresponding access alignments
+    if ((problem_size.m() % kAlignmentA) || (problem_size.k() % kAlignmentA) ||
+      (problem_size.n() % kAlignmentB) || (problem_size.k() % kAlignmentB) ||
+      (problem_size.m() % kAlignmentC) || (problem_size.n() % kAlignmentC)) {
+
+      return Status::kErrorMisalignedOperand;
+    }
+
+    return Status::kSuccess;
+  }
+
+  /// Executes one GEMM: ELL-indexed main loop (skipped when ell_ncol <= 0) followed by the epilogue
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Compute threadblock location
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if CTA is out of range
+    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
+        params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
+
+      return;
+    }
+    // Split the N tile index into (ELL block, tile inside that block); the ceil-division counts partial tiles
+    int tile_in_ell_block = (params.ell_blocksize + Mma::Shape::kN - 1 ) / Mma::Shape::kN;
+    int ell_block_offset_n = threadblock_tile_offset.n() / tile_in_ell_block;
+    int tile_offset_n = threadblock_tile_offset.n() % tile_in_ell_block;
+
+    // Compute position within threadblock
+    int thread_idx = threadIdx.x;
+
+    // Broadcast the warp_id computed by lane 0 to ensure dependent code
+    // is compiled as warp-uniform.
+    int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
+    int lane_idx = threadIdx.x % 32;
+
+    typename Mma::FragmentC accumulators;
+
+    accumulators.clear();
+
+    // Skip the main loop when the ELL matrix has no columns; the accumulators then stay cleared
+    if (params.ell_ncol > 0) {
+
+      // Compute initial location in logical coordinates
+      cutlass::MatrixCoord tb_offset_A{
+        threadblock_tile_offset.m() * Mma::Shape::kM,
+        threadblock_tile_offset.k() * params.gemm_k_size,
+      };
+
+      cutlass::MatrixCoord tb_offset_B{
+        threadblock_tile_offset.k() * params.gemm_k_size,
+        ell_block_offset_n * params.ell_blocksize
+        + tile_offset_n * Mma::Shape::kN,
+      };
+      // Offset into ell_idx for this ELL block column: columns are (ell_ncol / ell_blocksize) entries apart
+      int ell_idx_start =
+        (threadblock_tile_offset.n() / tile_in_ell_block) *
+        (params.ell_ncol / params.ell_blocksize);
+      const int* ell_idx_ptr = &(params.ell_idx[ell_idx_start]);
+
+      // Problem size is a function of threadblock index in the K dimension
+      int problem_size_k = min(
+        params.problem_size.k(),
+        (threadblock_tile_offset.k() + 1) * params.gemm_k_size);
+      problem_size_k = min(problem_size_k, params.ell_ncol);  // K extent is also capped by ell_ncol
+
+      // Number of kK-deep main-loop iterations for this K partition (ceil division)
+      int gemm_k_iterations =
+        (problem_size_k - tb_offset_A.column() + Mma::Shape::kK - 1) / Mma::Shape::kK;
+
+      // Construct iterators to A and B operands
+      typename Mma::IteratorA iterator_A(
+        params.params_A,
+        params.ref_A.data(),
+        {params.problem_size.m(), problem_size_k},
+        thread_idx,
+        tb_offset_A);
+
+      typename Mma::IteratorB iterator_B(
+        params.params_B,
+        params.ref_B.data(),
+        {problem_size_k, params.problem_size.n()},
+        thread_idx,
+        tb_offset_B);
+
+      // Define coef for ELL index depending on LayoutA
+      int ell_stride = iterator_A.get_stride();
+
+      typename cutlass::transform::threadblock::ell::Iterator ell_iterator(
+        shared_storage.ell,
+        ell_idx_ptr,
+        params.ell_blocksize,
+        params.ell_base_idx,
+        Mma::Shape::kK,
+        problem_size_k,
+        ell_stride,
+        thread_idx
+      );
+
+      //
+      // Main loop
+      //
+
+      // Construct thread-scoped matrix multiply
+      Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
+
+      if (!kSplitKSerial || gemm_k_iterations > 0) {
+        // check if index computations can be skipped
+        static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
+        static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
+        static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+        constexpr bool is_double = (sizeof(typename Mma::IteratorA::Element) == 8);  // 'typename' required: dependent type name
+        constexpr bool is_multiple_alignment =
+          (kAlignmentA > 1) && (kAlignmentB > 1) && (kAlignmentC > 1);
+        const bool is_specialized_blocksize =
+          ((params.ell_blocksize) & (params.ell_blocksize-1)) == 0
+          && params.ell_blocksize >= Mma::Shape::kK;
+        // Compute threadblock-scoped matrix multiply-add; the second flag is true only for a power-of-two blocksize >= kK
+        if ((is_double || is_multiple_alignment) && is_specialized_blocksize) {
+          mma.operator()<false, true>(
+              gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators, ell_iterator);
+        }
+        else {
+          mma.operator()<false, false>(
+              gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators, ell_iterator);
+        }
+      }
+    } // if (params.ell_ncol > 0)
+
+    //
+    // Epilogue
+    //
+
+    OutputOp output_op(params.output_op);
+
+    //
+    // Masked tile iterators constructed from members
+    //
+
+    threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    ell_block_offset_n = threadblock_tile_offset.n() / tile_in_ell_block;
+    tile_offset_n = threadblock_tile_offset.n() % tile_in_ell_block;
+
+    // assume identity swizzle
+    MatrixCoord threadblock_offset(
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      ell_block_offset_n * params.ell_blocksize
+      + tile_offset_n * Mma::Shape::kN
+    );
+
+    // Clamp the extent to the problem size and (in N) to the end of this ELL block to avoid out-of-bounds accesses
+    MatrixCoord threadblock_extent(
+      min(params.problem_size.m(),
+        (threadblock_tile_offset.m()+1) * Mma::Shape::kM),
+      min(params.problem_size.n(),
+         ell_block_offset_n * params.ell_blocksize
+         + min((tile_offset_n + 1) * Mma::Shape::kN, params.ell_blocksize))
+    );
+
+    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
+
+    // Construct the semaphore.
+    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
+
+    // If performing a reduction via split-K, fetch the initial synchronization
+    if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
+
+      // Fetch the synchronization lock initially but do not block.
+      semaphore.fetch();
+
+      // Indicate which position in a serial reduction the output operator is currently updating
+      output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
+    }
+
+    // Tile iterator loading from source tensor.
+    typename Epilogue::OutputTileIterator iterator_C(
+      params.params_C,
+      params.ref_C.data(),
+      threadblock_extent,
+      thread_idx,
+      threadblock_offset
+    );
+
+    // Tile iterator writing to destination tensor.
+    typename Epilogue::OutputTileIterator iterator_D(
+      params.params_D,
+      params.ref_D.data(),
+      threadblock_extent,
+      thread_idx,
+      threadblock_offset
+    );
+
+    Epilogue epilogue(
+      shared_storage.epilogue,
+      thread_idx,
+      warp_idx,
+      lane_idx);
+
+    // Wait on the semaphore - this latency may have been covered by iterator construction
+    if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
+
+      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
+      if (threadblock_tile_offset.k()) {
+        iterator_C = iterator_D;
+      }
+
+      semaphore.wait(threadblock_tile_offset.k());
+    }
+
+    // Execute the epilogue operator to update the destination tensor.
+    epilogue(output_op, iterator_D, accumulators, iterator_C);
+
+    //
+    // Release the semaphore
+    //
+
+    if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
+
+      int lock = 0;
+      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
+
+        // The final threadblock resets the semaphore for subsequent grids.
+        lock = 0;
+      }
+      else {
+        // Otherwise, the semaphore is incremented
+        lock = threadblock_tile_offset.k() + 1;
+      }
+
+      semaphore.release(lock);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm.h
new file mode 100755
index 000000000..354f5ea8a
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm.h
@@ -0,0 +1,380 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Template for a pipelined GEMM kernel. Does not compute batching; split-K is supported only via serial reduction (SplitKSerial).
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/semaphore.h"
+#include "cutlass/arch/arch.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate
+  typename Epilogue_,             ///! Epilogue
+  typename ThreadblockSwizzle_,   ///! Threadblock swizzling function
+  bool SplitKSerial               ///! If true, code supporting split-K via serial reduction is enabled.
+>
+struct Gemm {
+
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using OutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  static bool const kSplitKSerial = SplitKSerial;
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;
+
+  /// Parameters structure
+  struct Params {
+    cutlass::gemm::GemmCoord problem_size;
+    cutlass::gemm::GemmCoord grid_tiled_shape;
+    int swizzle_log_tile;
+    typename Mma::IteratorA::Params params_A;
+    typename Mma::IteratorA::TensorRef ref_A;
+    typename Mma::IteratorB::Params params_B;
+    typename Mma::IteratorB::TensorRef ref_B;
+    typename Epilogue::OutputTileIterator::Params params_C;
+    typename Epilogue::OutputTileIterator::TensorRef ref_C;
+    typename Epilogue::OutputTileIterator::Params params_D;
+    typename Epilogue::OutputTileIterator::TensorRef ref_D;
+    typename OutputOp::Params output_op;
+    int *semaphore;
+    int gemm_k_size;
+    // For gather+scatter operations; defaulted to nullptr so a default-constructed Params holds no indeterminate pointers
+    int const *gather_A_indices = nullptr;
+    int const *gather_B_indices = nullptr;
+    int const *scatter_D_indices = nullptr;
+
+    //
+    // Methods
+    //
+    // Default construction: zero swizzle, no semaphore, zero K partition size, null gather/scatter index pointers
+    CUTLASS_HOST_DEVICE
+    Params(): swizzle_log_tile(0), semaphore(0), gemm_k_size(0) { }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      cutlass::gemm::GemmCoord const & problem_size,
+      cutlass::gemm::GemmCoord const & grid_tiled_shape,
+      typename Mma::IteratorA::TensorRef ref_A,
+      typename Mma::IteratorB::TensorRef ref_B,
+      typename Epilogue::OutputTileIterator::TensorRef ref_C,
+      typename Epilogue::OutputTileIterator::TensorRef ref_D,
+      typename OutputOp::Params output_op = typename OutputOp::Params(),
+      int *workspace = nullptr,
+      int const *gather_A_indices = nullptr,
+      int const *gather_B_indices = nullptr,
+      int const *scatter_D_indices = nullptr
+    ):
+      problem_size(problem_size),
+      grid_tiled_shape(grid_tiled_shape),
+      swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)),
+      params_A(ref_A.layout()),
+      ref_A(ref_A),
+      params_B(ref_B.layout()),
+      ref_B(ref_B),
+      params_C(ref_C.layout()),
+      ref_C(ref_C),
+      params_D(ref_D.layout()),
+      ref_D(ref_D),
+      output_op(output_op),
+      gather_A_indices(gather_A_indices),
+      gather_B_indices(gather_B_indices),
+      scatter_D_indices(scatter_D_indices) {
+      // Split the kK-deep iterations evenly (rounding up) across the K partitions of the grid
+      int total_gemm_k_iterations = (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK;
+      int gemm_k_iterations = (total_gemm_k_iterations + grid_tiled_shape.k() - 1) / grid_tiled_shape.k();
+
+      gemm_k_size = gemm_k_iterations * Mma::Shape::kK;
+
+      semaphore = workspace;
+    }
+  };
+
+  /// Shared memory storage structure: main loop and epilogue storage overlap in a union
+  union SharedStorage {
+    typename Mma::SharedStorage main_loop;
+    typename Epilogue::SharedStorage epilogue;
+  };
+
+  //
+  // Methods
+  //
+
+  CUTLASS_HOST_DEVICE
+  Gemm() { }
+
+  /// Determines whether kernel satisfies alignment; returns kErrorMisalignedOperand if any tensor reference is misaligned
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(
+    cutlass::gemm::GemmCoord const & problem_size,
+    typename Mma::IteratorA::TensorRef ref_A,
+    typename Mma::IteratorB::TensorRef ref_B,
+    typename Epilogue::OutputTileIterator::TensorRef ref_C,
+    typename Epilogue::OutputTileIterator::TensorRef ref_D) {
+    // Interleaved layouts use their interleave factor as alignment; otherwise the iterator access width
+    static int const kAlignmentA = (platform::is_same<typename Mma::IteratorA::Layout,
+                                                      layout::ColumnMajorInterleaved<32>>::value)
+                                   ? 32
+                                   : (platform::is_same<typename Mma::IteratorA::Layout,
+                                                        layout::ColumnMajorInterleaved<64>>::value)
+                                     ? 64
+                                     : Mma::IteratorA::AccessType::kElements;
+    static int const kAlignmentB =  (platform::is_same<typename Mma::IteratorB::Layout,
+                                                       layout::RowMajorInterleaved<32>>::value)
+                                   ? 32
+                                   : (platform::is_same<typename Mma::IteratorB::Layout,
+                                                        layout::RowMajorInterleaved<64>>::value)
+                                     ? 64
+                                     : Mma::IteratorB::AccessType::kElements;
+    static int const kAlignmentC = (platform::is_same<typename Epilogue::OutputTileIterator::Layout,
+                                                      layout::ColumnMajorInterleaved<32>>::value)
+                                   ? 32
+                                   : (platform::is_same<typename Epilogue::OutputTileIterator::Layout,
+                                                        layout::ColumnMajorInterleaved<64>>::value)
+                                     ? 64
+                                     : Epilogue::OutputTileIterator::kElementsPerAccess;
+
+    if (!TensorRef_aligned(ref_A, kAlignmentA)) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (!TensorRef_aligned(ref_B, kAlignmentB)) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (!TensorRef_aligned(ref_C, kAlignmentC)) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (!TensorRef_aligned(ref_D, kAlignmentC)) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    return Status::kSuccess;
+  }
+
+  /// Executes one GEMM: main loop over this threadblock's K partition, then the epilogue (with optional serial split-K)
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Compute threadblock location
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if CTA is out of range
+    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
+      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
+
+      return;
+    }
+
+    // Compute initial location in logical coordinates
+    cutlass::MatrixCoord tb_offset_A{
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      threadblock_tile_offset.k() * params.gemm_k_size,
+    };
+
+    cutlass::MatrixCoord tb_offset_B{
+      threadblock_tile_offset.k() * params.gemm_k_size,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    };
+
+    // Problem size is a function of threadblock index in the K dimension
+    int problem_size_k = min(
+      params.problem_size.k(),
+      (threadblock_tile_offset.k() + 1) * params.gemm_k_size);
+
+    // Number of kK-deep main-loop iterations for this K partition (ceil division)
+    int gemm_k_iterations = (problem_size_k - tb_offset_A.column() + Mma::Shape::kK - 1) / Mma::Shape::kK;
+
+    // Compute position within threadblock
+    int thread_idx = threadIdx.x;
+
+    // Construct iterators to A and B operands
+    typename Mma::IteratorA iterator_A(
+      params.params_A,
+      params.ref_A.data(),
+      {params.problem_size.m(), problem_size_k},
+      thread_idx,
+      tb_offset_A,
+      params.gather_A_indices);
+
+    typename Mma::IteratorB iterator_B(
+      params.params_B,
+      params.ref_B.data(),
+      {problem_size_k, params.problem_size.n()},
+      thread_idx,
+      tb_offset_B,
+      params.gather_B_indices);
+
+    // Broadcast the warp_id computed by lane 0 to ensure dependent code
+    // is compiled as warp-uniform.
+    int warp_idx = canonical_warp_idx_sync();
+    int lane_idx = threadIdx.x % 32;
+
+    //
+    // Main loop
+    //
+
+    // Construct thread-scoped matrix multiply
+    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
+
+    typename Mma::FragmentC accumulators;
+
+    accumulators.clear();
+
+    if (!kSplitKSerial || gemm_k_iterations > 0) {
+      // Compute threadblock-scoped matrix multiply-add
+      mma(gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators);
+    }
+
+    //
+    // Epilogue
+    //
+
+    OutputOp output_op(params.output_op);
+
+    //
+    // Masked tile iterators constructed from members
+    //
+
+    threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // assume identity swizzle
+    MatrixCoord threadblock_offset(
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    );
+
+    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
+
+    // Construct the semaphore.
+    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
+
+    // If performing a reduction via split-K, fetch the initial synchronization
+    if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
+
+      // Fetch the synchronization lock initially but do not block.
+      semaphore.fetch();
+
+      // Indicate which position in a serial reduction the output operator is currently updating
+      output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
+    }
+
+    // Tile iterator loading from source tensor.
+    typename Epilogue::OutputTileIterator iterator_C(
+      params.params_C,
+      params.ref_C.data(),
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset,
+      params.scatter_D_indices
+    );
+
+    // Tile iterator writing to destination tensor.
+    typename Epilogue::OutputTileIterator iterator_D(
+      params.params_D,
+      params.ref_D.data(),
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset,
+      params.scatter_D_indices
+    );
+
+    Epilogue epilogue(
+      shared_storage.epilogue,
+      thread_idx,
+      warp_idx,
+      lane_idx);
+
+    // Wait on the semaphore - this latency may have been covered by iterator construction
+    if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
+
+      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
+      if (threadblock_tile_offset.k()) {
+        iterator_C = iterator_D;
+      }
+
+      semaphore.wait(threadblock_tile_offset.k());
+
+    }
+
+    // Execute the epilogue operator to update the destination tensor.
+    epilogue(output_op, iterator_D, accumulators, iterator_C);
+
+    //
+    // Release the semaphore
+    //
+
+    if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
+
+      int lock = 0;
+      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
+
+        // The final threadblock resets the semaphore for subsequent grids.
+        lock = 0;
+      }
+      else {
+        // Otherwise, the semaphore is incremented
+        lock = threadblock_tile_offset.k() + 1;
+      }
+
+      semaphore.release(lock);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_array.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_array.h
new file mode 100755
index 000000000..bafa5fa8b
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_array.h
@@ -0,0 +1,264 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a pipelined batched GEMM kernel whose per-batch operands are addressed through arrays of pointers. Does not support split-K.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate
+  typename Epilogue_,             ///! Epilogue
+  typename ThreadblockSwizzle_    ///! Threadblock swizzling function
+>
+struct GemmArray {
+  // Batched GEMM kernel in which batch b uses the base pointers ptr_A[b], ptr_B[b], ptr_C[b] and ptr_D[b].
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using OutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;
+
+  /// Parameters structure. Every member has a default initializer so that no field is left indeterminate.
+  struct Params {
+    cutlass::gemm::GemmCoord problem_size{};
+    cutlass::gemm::GemmCoord grid_tiled_shape{};
+    int swizzle_log_tile{0};
+    typename Mma::IteratorA::Params params_A{};
+    typename Mma::IteratorA::Element const * const * ptr_A{nullptr};
+    typename Mma::IteratorB::Params params_B{};
+    typename Mma::IteratorB::Element const * const * ptr_B{nullptr};
+    typename Epilogue::OutputTileIterator::Params params_C{};
+    typename Epilogue::OutputTileIterator::Element const * const * ptr_C{nullptr};
+    typename Epilogue::OutputTileIterator::Params params_D{};
+    typename Epilogue::OutputTileIterator::Element * const * ptr_D{nullptr};
+    int64_t stride_D{0};       // Not set by any constructor and not read by operator(); member kept unchanged.
+    typename OutputOp::Params epilogue{};
+    int batch_count{0};        // Zero batches by default, so the batch loop in operator() does not execute.
+    int gemm_k_iterations{0};
+
+    //
+    // Methods
+    //
+
+    /// Default ctor: all state comes from the default member initializers above.
+    CUTLASS_HOST_DEVICE
+    Params() { }
+    /// Builds parameters from the per-batch pointer arrays, the operand layouts and the batch count.
+    CUTLASS_HOST_DEVICE
+    Params(
+      cutlass::gemm::GemmCoord const & problem_size_,
+      cutlass::gemm::GemmCoord const & grid_tiled_shape_,
+      typename Mma::IteratorA::Element const * const * ptr_A_,
+      typename Mma::IteratorA::Layout layout_A,
+      typename Mma::IteratorB::Element const * const * ptr_B_,
+      typename Mma::IteratorB::Layout layout_B,
+      typename Epilogue::OutputTileIterator::Element const * const * ptr_C_,
+      typename Epilogue::OutputTileIterator::Layout layout_C,
+      typename Epilogue::OutputTileIterator::Element * const * ptr_D_,
+      typename Epilogue::OutputTileIterator::Layout layout_D,
+      typename OutputOp::Params epilogue_,
+      int batch_count_
+    ):
+      problem_size(problem_size_),
+      grid_tiled_shape(grid_tiled_shape_),
+      swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)),
+      params_A(layout_A),
+      ptr_A(ptr_A_),
+      params_B(layout_B),
+      ptr_B(ptr_B_),
+      params_C(layout_C),
+      ptr_C(ptr_C_),
+      params_D(layout_D),
+      ptr_D(ptr_D_),
+      epilogue(epilogue_),
+      batch_count(batch_count_),
+      gemm_k_iterations((problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK) {
+      // stride_D is not a constructor argument; it keeps its default of 0.
+    }
+  };
+
+  /// Shared memory storage structure (a union: main loop and epilogue share the same storage)
+  union SharedStorage {
+    typename Mma::SharedStorage main_loop;
+    typename Epilogue::SharedStorage epilogue;
+  };
+
+  //
+  // Methods
+  //
+
+  CUTLASS_HOST_DEVICE
+  GemmArray() { }
+
+  /// Computes this threadblock's output tile for every batch index assigned to it
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Compute threadblock location
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if CTA is out of range
+    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
+      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
+
+      return;
+    }
+
+
+    // Each CTA handles multiple batch indices to accommodate limited range of CUDA grid's Z dimension
+    for (int batch_idx = threadblock_swizzle.get_batch_idx();
+      batch_idx < params.batch_count;
+      batch_idx += gridDim.z) {
+
+      // Compute initial location in logical coordinates: A at (tile row, 0), B at (0, tile column)
+      cutlass::MatrixCoord tb_offset_A{
+        threadblock_tile_offset.m() * Mma::Shape::kM,
+        0
+      };
+
+      cutlass::MatrixCoord tb_offset_B{
+        0,
+        threadblock_tile_offset.n() * Mma::Shape::kN
+      };
+
+      // Compute position within threadblock
+      int thread_idx = threadIdx.x;
+
+      // Construct iterators to A and B from this batch's pointer-array entries (const_cast drops their const)
+      typename Mma::IteratorA iterator_A(
+        params.params_A,
+        const_cast<typename Mma::IteratorA::Element *>(params.ptr_A[batch_idx]),
+        params.problem_size.mk(),
+        thread_idx,
+        tb_offset_A);
+
+      typename Mma::IteratorB iterator_B(
+        params.params_B,
+        const_cast<typename Mma::IteratorB::Element *>(params.ptr_B[batch_idx]),
+        params.problem_size.kn(),
+        thread_idx,
+        tb_offset_B);
+
+      //
+      // Main loop
+      //
+
+      // Broadcast the warp_id computed by lane 0 to ensure dependent code
+      // is compiled as warp-uniform.
+      int warp_idx = canonical_warp_idx_sync();
+
+      int lane_idx = threadIdx.x % 32;
+
+      Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
+
+      typename Mma::FragmentC accumulators;
+
+      accumulators.clear();
+
+
+      // Compute threadblock-scoped matrix multiply-add; the cleared fragment is passed as both accumulator arguments
+      mma(params.gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators);
+
+      //
+      // Epilogue
+      //
+
+      OutputOp output_op(params.epilogue);
+
+      //
+      // Masked tile iterators constructed from members
+      //
+
+      threadblock_tile_offset =
+          threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+      //assume identity swizzle
+      MatrixCoord threadblock_offset(
+        threadblock_tile_offset.m() * Mma::Shape::kM,
+        threadblock_tile_offset.n() * Mma::Shape::kN
+      );
+
+      // Tile iterator over this batch's C tensor (passed to the epilogue as its source operand)
+      typename Epilogue::OutputTileIterator iterator_C(
+        params.params_C,
+        const_cast<typename Epilogue::OutputTileIterator::Element *>(params.ptr_C[batch_idx]),
+        params.problem_size.mn(),
+        thread_idx,
+        threadblock_offset
+      );
+
+      // Tile iterator writing to this batch's D tensor (the output tile)
+      typename Epilogue::OutputTileIterator iterator_D(
+        params.params_D,
+        params.ptr_D[batch_idx],
+        params.problem_size.mn(),
+        thread_idx,
+        threadblock_offset
+      );
+
+      Epilogue epilogue(
+        shared_storage.epilogue,
+        thread_idx,
+        warp_idx,
+        lane_idx);
+
+      // run efficient epilogue
+      epilogue(output_op, iterator_D, accumulators, iterator_C);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_batched.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_batched.h
new file mode 100755
index 000000000..0c11e997c
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_batched.h
@@ -0,0 +1,273 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a pipelined strided-batched GEMM kernel (one base pointer plus a batch stride per operand). Does not support split-K.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate
+  typename Epilogue_,             ///! Epilogue
+  typename ThreadblockSwizzle_    ///! Threadblock swizzling function
+>
+struct GemmBatched {
+  // Strided-batched GEMM kernel: each operand of batch b is reached by a pointer offset of stride_X * b.
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using OutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+
+  /// Warps per threadblock (concept: GemmShape); kThreadCount is 32 threads for each of them
+  using WarpCount = typename Mma::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;
+
+  /// Kernel parameters: problem/grid extents and, per operand, iterator params, base TensorRef and batch stride
+  struct Params {
+    cutlass::gemm::GemmCoord problem_size{};
+    cutlass::gemm::GemmCoord grid_tiled_shape{};
+    int swizzle_log_tile{0};
+    typename Mma::IteratorA::Params params_A{};
+    typename Mma::IteratorA::TensorRef ref_A{};
+    int64_t stride_A{0};
+    typename Mma::IteratorB::Params params_B{};
+    typename Mma::IteratorB::TensorRef ref_B{};
+    int64_t stride_B{0};
+    typename Epilogue::OutputTileIterator::Params params_C{};
+    typename Epilogue::OutputTileIterator::TensorRef ref_C{};
+    int64_t stride_C{0};
+    typename Epilogue::OutputTileIterator::Params params_D{};
+    typename Epilogue::OutputTileIterator::TensorRef ref_D{};
+    int64_t stride_D{0};
+    typename OutputOp::Params epilogue{};
+    int batch_count{1};
+    int gemm_k_iterations{0};
+
+    //
+    // Construction
+    //
+    Params() = default;
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      cutlass::gemm::GemmCoord const & problem_size_,
+      cutlass::gemm::GemmCoord const & grid_tiled_shape_,
+      typename Mma::IteratorA::TensorRef ref_A_,
+      int64_t stride_A_,
+      typename Mma::IteratorB::TensorRef ref_B_,
+      int64_t stride_B_,
+      typename Epilogue::OutputTileIterator::TensorRef ref_C_,
+      int64_t stride_C_,
+      typename Epilogue::OutputTileIterator::TensorRef ref_D_,
+      int64_t stride_D_,
+      typename OutputOp::Params epilogue_,
+      int batch_count_
+    ):
+      problem_size(problem_size_),
+      grid_tiled_shape(grid_tiled_shape_),
+      swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)),
+      params_A(ref_A_.layout()),
+      ref_A(ref_A_),
+      stride_A(stride_A_),
+      params_B(ref_B_.layout()),
+      ref_B(ref_B_),
+      stride_B(stride_B_),
+      params_C(ref_C_.layout()),
+      ref_C(ref_C_),
+      stride_C(stride_C_),
+      params_D(ref_D_.layout()),
+      ref_D(ref_D_),
+      stride_D(stride_D_),
+      epilogue(epilogue_),
+      batch_count(batch_count_),
+      gemm_k_iterations((problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK) {}
+  };
+
+  /// Shared memory storage: one union, so the main loop and the epilogue reuse the same storage
+  union SharedStorage {
+    typename Mma::SharedStorage main_loop;
+    typename Epilogue::SharedStorage epilogue;
+  };
+
+  //
+  // Kernel entry points
+  //
+  GemmBatched() = default;
+
+  /// Computes this threadblock's output tile for every batch index assigned to it
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Locate this CTA's tile within the tiled grid
+    ThreadblockSwizzle swizzle;
+
+    cutlass::gemm::GemmCoord tile_coord =
+        swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // CTAs whose tile falls outside the tiled grid in M or N have nothing to do
+    bool const tile_in_range =
+      tile_coord.m() < params.grid_tiled_shape.m() &&
+      tile_coord.n() < params.grid_tiled_shape.n();
+    if (!tile_in_range) {
+      return;
+    }
+
+    // Step through batch indices by gridDim.z, since the grid's Z extent may be smaller than batch_count
+    for (int batch = swizzle.get_batch_idx();
+      batch < params.batch_count;
+      batch += gridDim.z) {
+
+      // Tile origins in logical coordinates: A at (tile row, 0), B at (0, tile column)
+      cutlass::MatrixCoord origin_A{
+        tile_coord.m() * Mma::Shape::kM,
+        0
+      };
+
+      cutlass::MatrixCoord origin_B{
+        0,
+        tile_coord.n() * Mma::Shape::kN
+      };
+
+      // Index of the calling thread within the threadblock
+      int tid = threadIdx.x;
+
+      // Operand iterators, each moved to the current batch by a pointer offset of stride * batch
+      typename Mma::IteratorA iter_A(
+        params.params_A,
+        params.ref_A.data(),
+        params.problem_size.mk(),
+        tid,
+        origin_A);
+
+      iter_A.add_pointer_offset(params.stride_A * batch);
+
+      typename Mma::IteratorB iter_B(
+        params.params_B,
+        params.ref_B.data(),
+        params.problem_size.kn(),
+        tid,
+        origin_B);
+
+      iter_B.add_pointer_offset(params.stride_B * batch);
+
+
+      //
+      // Main loop
+      //
+
+      // Warp index from canonical_warp_idx_sync(): the warp_id computed by lane 0 is broadcast
+      // so that dependent code is compiled as warp-uniform.
+      int warp_id = canonical_warp_idx_sync();
+
+      int lane_id = threadIdx.x % 32;
+
+      Mma mainloop(shared_storage.main_loop, tid, warp_id, lane_id);
+
+      typename Mma::FragmentC accum;
+
+      accum.clear();
+
+
+      // Threadblock-scoped multiply-accumulate; the cleared fragment is passed as both accumulator arguments
+      mainloop(params.gemm_k_iterations, accum, iter_A, iter_B, accum);
+
+      //
+      // Epilogue
+      //
+
+      OutputOp epilogue_op(params.epilogue);
+
+      //
+      // Masked tile iterators constructed from members
+      //
+
+      tile_coord =
+          swizzle.get_tile_offset(params.swizzle_log_tile);
+
+      // assume identity swizzle
+      MatrixCoord origin_CD(
+        tile_coord.m() * Mma::Shape::kM,
+        tile_coord.n() * Mma::Shape::kN
+      );
+
+      // Tile iterator over the C tensor, offset to the current batch
+      typename Epilogue::OutputTileIterator iter_C(
+        params.params_C,
+        params.ref_C.data(),
+        params.problem_size.mn(),
+        tid,
+        origin_CD
+      );
+
+      iter_C.add_pointer_offset(params.stride_C * batch);
+
+      // Tile iterator over the D tensor (the output tile), offset to the current batch
+      typename Epilogue::OutputTileIterator iter_D(
+        params.params_D,
+        params.ref_D.data(),
+        params.problem_size.mn(),
+        tid,
+        origin_CD
+      );
+
+      iter_D.add_pointer_offset(params.stride_D * batch);
+
+      Epilogue epilogue_stage(
+        shared_storage.epilogue,
+        tid,
+        warp_id,
+        lane_id);
+
+      // Run the epilogue for this tile
+      epilogue_stage(epilogue_op, iter_D, accum, iter_C);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_grouped.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_grouped.h
new file mode 100755
index 000000000..daa6cbd77
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_grouped.h
@@ -0,0 +1,457 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Grouped GEMM kernel that iterates over the tiles of multiple GEMM problems by means of a problem visitor
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/complex.h"
+#include "cutlass/semaphore.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/trace.h"
+#include "cutlass/gemm/kernel/gemm_transpose_operands.h"
+#include "cutlass/gemm/kernel/gemm_grouped_problem_visitor.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Mma_,                           ///! Threadblock-scoped matrix multiply-accumulate
+  typename Epilogue_,                      ///! Epilogue
+  typename ThreadblockSwizzle_,            ///! Threadblock swizzling function
+  GroupScheduleMode GroupScheduleMode_,    ///! Type of scheduling to perform
+  bool Transposed = false                  ///! If true, operator() exchanges the A/B pointers and leading dimensions
+>
+struct GemmGrouped {
+public:
+  // Grouped GEMM kernel: a persistent loop in which a ProblemVisitor hands each threadblock its next tile.
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using EpilogueOutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  static GroupScheduleMode const kGroupScheduleMode = GroupScheduleMode_;
+  static bool const kTransposed = Transposed;
+
+  // Optional transpose
+  using MapArguments = kernel::detail::MapArguments<
+    typename Mma::IteratorA::Element,
+    typename Mma::IteratorA::Layout,
+    Mma::kTransformA,
+    Mma::IteratorA::AccessType::kElements,
+    typename Mma::IteratorB::Element,
+    typename Mma::IteratorB::Layout,
+    Mma::kTransformB,
+    Mma::IteratorB::AccessType::kElements,
+    typename Mma::LayoutC,
+    kTransposed
+  >;
+
+  // Public-facing type definitions related to operand element type, layout, and complex conjugate
+  // operation. Must interact with the 'kTransposed' notion.
+  using ElementA = typename MapArguments::ElementA;
+  using LayoutA = typename MapArguments::LayoutA;
+  using ElementB = typename MapArguments::ElementB;
+  using LayoutB = typename MapArguments::LayoutB;
+  using ElementC = typename Epilogue::OutputTileIterator::Element;
+  using LayoutC = typename MapArguments::LayoutC;
+
+  static ComplexTransform const kTransformA = MapArguments::kTransformA;
+  static ComplexTransform const kTransformB = MapArguments::kTransformB;
+
+  // Type definitions about the mainloop.
+  using Operator = typename Mma::Operator;
+  using OperatorClass = typename Mma::Operator::OperatorClass;
+  using ThreadblockShape = typename Mma::Shape;
+  using WarpShape = typename Mma::Operator::Shape;
+  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
+  using ArchTag = typename Mma::ArchTag;
+
+  static int const kStages = Mma::kStages;
+  static int const kAlignmentA = MapArguments::kAlignmentA;
+  static int const kAlignmentB = MapArguments::kAlignmentB;
+  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;
+  // Visitor that hands out tiles; kThreadCount is passed for both of its integer template arguments.
+  using ProblemVisitor = GemmGroupedProblemVisitor<
+                            ThreadblockShape,
+                            kGroupScheduleMode,
+                            kThreadCount,
+                            kThreadCount,
+                            kTransposed>;
+
+  //
+  // Structures
+  //
+
+  /// Argument structure: problem sizes plus per-problem arrays of operand pointers and leading dimensions
+  struct Arguments {
+
+    //
+    // Data members
+    //
+
+    GemmCoord *problem_sizes{nullptr};
+    int problem_count{0};
+    int threadblock_count{0};
+
+    typename EpilogueOutputOp::Params output_op{};
+    // Operand pointer arrays; operator() indexes them by problem index.
+    ElementA ** ptr_A{nullptr};
+    ElementB ** ptr_B{nullptr};
+    ElementC ** ptr_C{nullptr};
+    ElementC ** ptr_D{nullptr};
+    // Leading-dimension arrays; operator() indexes them by problem index.
+    typename LayoutA::Stride::LongIndex *lda{nullptr};
+    typename LayoutB::Stride::LongIndex *ldb{nullptr};
+    typename LayoutC::Stride::LongIndex *ldc{nullptr};
+    typename LayoutC::Stride::LongIndex *ldd{nullptr};
+
+    // Only used by device-level operator
+    GemmCoord *host_problem_sizes{nullptr};
+
+
+    //
+    // Methods
+    //
+
+    /// Default ctor
+    Arguments() = default;
+
+    /// Ctor: stores every argument in the member of the same name
+    CUTLASS_HOST_DEVICE
+    Arguments(    
+      GemmCoord *problem_sizes,
+      int problem_count,
+      int threadblock_count,
+      typename EpilogueOutputOp::Params output_op,
+      ElementA ** ptr_A,
+      ElementB ** ptr_B,
+      ElementC ** ptr_C,
+      ElementC ** ptr_D,
+      typename LayoutA::Stride::LongIndex *lda,
+      typename LayoutB::Stride::LongIndex *ldb,
+      typename LayoutC::Stride::LongIndex *ldc,
+      typename LayoutC::Stride::LongIndex *ldd,
+      GemmCoord *host_problem_sizes=nullptr
+    ): 
+      problem_sizes(problem_sizes),
+      problem_count(problem_count),
+      threadblock_count(threadblock_count),
+      output_op(output_op),
+      ptr_A(ptr_A),
+      ptr_B(ptr_B),
+      ptr_C(ptr_C),
+      ptr_D(ptr_D),
+      lda(lda),
+      ldb(ldb),
+      ldc(ldc),
+      ldd(ldd),
+      host_problem_sizes(host_problem_sizes)
+    {
+
+    }
+  };
+
+  //
+  // Structure for precomputing values in host memory and passing to kernels
+  //
+
+  /// Parameters structure: built from Arguments; problem sizes and count are wrapped in ProblemVisitor::Params
+  struct Params {
+
+    typename ProblemVisitor::Params problem_visitor{};
+    int threadblock_count{0};
+
+    typename EpilogueOutputOp::Params output_op{};
+
+    ElementA ** ptr_A{nullptr};
+    ElementB ** ptr_B{nullptr};
+    ElementC ** ptr_C{nullptr};
+    ElementC ** ptr_D{nullptr};
+
+    typename LayoutA::Stride::LongIndex *lda{nullptr};
+    typename LayoutB::Stride::LongIndex *ldb{nullptr};
+    typename LayoutC::Stride::LongIndex *ldc{nullptr};
+    typename LayoutC::Stride::LongIndex *ldd{nullptr};
+
+    //
+    // Methods
+    //
+
+    Params() = default;
+    /// Copies the fields of args; workspace and tile_count are forwarded to ProblemVisitor::Params
+    CUTLASS_HOST_DEVICE
+    Params(Arguments const &args,
+          void *workspace = nullptr,
+          int tile_count = 0):
+      problem_visitor(args.problem_sizes, args.problem_count, workspace, tile_count),
+      threadblock_count(args.threadblock_count),
+      output_op(args.output_op),
+      ptr_A(args.ptr_A),
+      ptr_B(args.ptr_B),
+      ptr_C(args.ptr_C),
+      ptr_D(args.ptr_D),
+      lda(args.lda),
+      ldb(args.ldb),
+      ldc(args.ldc),
+      ldd(args.ldd)
+    { 
+
+    }
+    /// Re-populates every field from args, with the same assignments as the constructor above
+    CUTLASS_HOST_DEVICE
+    void update(
+      Arguments const &args,
+      void *workspace = nullptr,
+      int tile_count = 0) {
+
+      problem_visitor = typename ProblemVisitor::Params(args.problem_sizes, args.problem_count,
+                                                        workspace, tile_count);
+      threadblock_count = args.threadblock_count;
+      output_op = args.output_op;
+      ptr_A = args.ptr_A;
+      ptr_B = args.ptr_B;
+      ptr_C = args.ptr_C;
+      ptr_D = args.ptr_D;
+      lda = args.lda;
+      ldb = args.ldb;
+      ldc = args.ldc;
+      ldd = args.ldd;
+    }
+  };
+
+  /// Shared memory storage structure
+  struct SharedStorage {
+    union {
+      typename Mma::SharedStorage main_loop;
+      typename Epilogue::SharedStorage epilogue;
+    } kernel;
+
+    // ProblemVisitor shared storage can't be overlapped with others
+    typename ProblemVisitor::SharedStorage problem_visitor;
+  };
+
+public:
+
+  //
+  // Methods
+  //
+
+  CUTLASS_DEVICE
+  GemmGrouped() { } 
+
+  /// Determines whether kernel satisfies alignment (no checks are performed here: always returns Status::kSuccess)
+  static Status can_implement(cutlass::gemm::GemmCoord const & problem_size) {
+    return Status::kSuccess;
+  }
+  /// Overload taking the full Arguments; likewise always returns Status::kSuccess
+  static Status can_implement(Arguments const &args) {
+    return Status::kSuccess;
+  }
+ 
+  /// Executes the grouped GEMM: processes tiles until the problem visitor reports that none remain
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    //
+    // These types shadow the type-level definitions and support the ability to implement
+    // a 'transposed' GEMM that computes the transposed problems.
+    //
+    using ElementA = typename Mma::IteratorA::Element;
+    using LayoutA = typename Mma::IteratorA::Layout;
+    using ElementB = typename Mma::IteratorB::Element;
+    using LayoutB = typename Mma::IteratorB::Layout;
+    using ElementC = typename Epilogue::OutputTileIterator::Element;
+    using LayoutC = typename Epilogue::OutputTileIterator::Layout;
+
+    //
+    // Problem visitor.
+    //
+    ProblemVisitor problem_visitor(
+      params.problem_visitor,
+      shared_storage.problem_visitor,
+      blockIdx.x);
+
+    // Outer 'persistent' loop to iterate over tiles
+    while (problem_visitor.next_tile()) {
+
+      GemmCoord problem_size  = problem_visitor.problem_size();
+      int32_t problem_idx     = problem_visitor.problem_index();
+      int32_t threadblock_idx = int32_t(problem_visitor.threadblock_idx());
+
+      GemmCoord grid_shape = problem_visitor.grid_shape(problem_size);
+      // Decompose the linear tile index: idx / grid_n selects the M tile, idx % grid_n the N tile
+      cutlass::gemm::GemmCoord threadblock_offset(
+        int(threadblock_idx / grid_shape.n()) * Mma::Shape::kM,
+        int(threadblock_idx % grid_shape.n()) * Mma::Shape::kN,
+        0);
+
+      // Load element pointers. Exchange pointers and strides if working on the transpose
+      ElementA *ptr_A = reinterpret_cast<ElementA *>((kTransposed ? params.ptr_B[problem_idx] : params.ptr_A[problem_idx]));
+      typename LayoutA::LongIndex ldm_A = (kTransposed ? params.ldb[problem_idx] : params.lda[problem_idx]);
+
+      ElementB *ptr_B = reinterpret_cast<ElementB *>((kTransposed ? params.ptr_A[problem_idx] : params.ptr_B[problem_idx]));
+      typename LayoutB::LongIndex ldm_B = (kTransposed ? params.lda[problem_idx] : params.ldb[problem_idx]);
+
+      // Compute initial location in logical coordinates
+      cutlass::MatrixCoord tb_offset_A{
+        threadblock_offset.m(),
+        0,
+      };
+
+      cutlass::MatrixCoord tb_offset_B{
+        0,
+        threadblock_offset.n()
+      };
+
+      // Compute position within threadblock
+      int thread_idx = threadIdx.x;
+
+      // Construct iterators to A and B operands (layouts are built from this problem's leading dimensions)
+      typename Mma::IteratorA iterator_A(
+        LayoutA(ldm_A),
+        ptr_A,
+        {problem_size.m(), problem_size.k()},
+        thread_idx,
+        tb_offset_A);
+
+      typename Mma::IteratorB iterator_B(
+        LayoutB(ldm_B),
+        ptr_B,
+        {problem_size.k(), problem_size.n()},
+        thread_idx,
+        tb_offset_B);
+
+      typename Mma::FragmentC accumulators;
+
+      accumulators.clear();
+      
+      // Broadcast the warp_id computed by lane 0 to ensure dependent code
+      // is compiled as warp-uniform.
+      int warp_idx = canonical_warp_idx_sync();
+
+      int lane_idx = threadIdx.x % 32;
+
+      //
+      // Matrix multiply phase
+      //
+
+      // Construct thread-scoped matrix multiply
+      Mma mma(shared_storage.kernel.main_loop, thread_idx, warp_idx, lane_idx);
+
+      // Number of main-loop iterations: ceil(K / Mma::Shape::kK)
+      int gemm_k_iterations = (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK;
+      // Main-loop and epilogue shared storage are members of one union (SharedStorage::kernel).
+      // Wait for all threads to finish their epilogue phases from the previous tile.
+      __syncthreads();
+
+      // Compute threadblock-scoped matrix multiply-add
+      mma(
+        gemm_k_iterations, 
+        accumulators, 
+        iterator_A, 
+        iterator_B, 
+        accumulators);
+
+      //
+      // Epilogue
+      //
+
+      EpilogueOutputOp output_op(params.output_op);
+
+      ElementC *ptr_C = params.ptr_C[problem_idx];
+      ElementC *ptr_D = params.ptr_D[problem_idx];
+
+      LayoutC layout_C(params.ldc[problem_idx]);
+      LayoutC layout_D(params.ldd[problem_idx]);
+
+      typename Epilogue::OutputTileIterator::Params params_C(layout_C);
+      typename Epilogue::OutputTileIterator::Params params_D(layout_D);
+
+      // Tile iterator loading from source tensor.
+      typename Epilogue::OutputTileIterator iterator_C(
+        params_C,
+        ptr_C,
+        problem_size.mn(),
+        thread_idx,
+        threadblock_offset.mn()
+      );
+
+      // Tile iterator writing to destination tensor.
+      typename Epilogue::OutputTileIterator iterator_D(
+        params_D,
+        ptr_D,
+        problem_size.mn(),
+        thread_idx,
+        threadblock_offset.mn()
+      );
+
+      Epilogue epilogue(
+        shared_storage.kernel.epilogue, 
+        thread_idx, 
+        warp_idx, 
+        lane_idx);
+
+      // Execute the epilogue operator to update the destination tensor.
+      epilogue(
+        output_op, 
+        iterator_D, 
+        accumulators, 
+        iterator_C); 
+
+      // Next tile: advance the problem visitor by gridDim.x
+      problem_visitor.advance(gridDim.x);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_grouped_problem_visitor.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_grouped_problem_visitor.h
new file mode 100755
index 000000000..304f23e73
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_grouped_problem_visitor.h
@@ -0,0 +1,121 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Scheduler for grouped GEMM
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/gemm/kernel/grouped_problem_visitor.h"
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+/// Static helpers that map a grouped-GEMM problem size onto a grid of threadblock tiles.
+template <
+  typename ThreadblockShape,
+  bool Transposed
+>
+struct GemmGroupedProblemSizeHelper {
+  static bool const kTransposed = Transposed;
+
+  /// Tiles needed along M and N (each rounded up to whole tiles); the K extent is fixed at 1.
+  CUTLASS_HOST_DEVICE
+  static cutlass::gemm::GemmCoord grid_shape(const cutlass::gemm::GemmCoord& problem) {
+    auto const tiles_m = (problem.m() - 1 + ThreadblockShape::kM) / ThreadblockShape::kM;
+    auto const tiles_n = (problem.n() - 1 + ThreadblockShape::kN) / ThreadblockShape::kN;
+    return cutlass::gemm::GemmCoord(tiles_m, tiles_n, 1);
+  }
+
+  /// Exchanges the M and N extents in place when kTransposed is set; otherwise leaves them alone.
+  CUTLASS_HOST_DEVICE
+  static void possibly_transpose_problem(cutlass::gemm::GemmCoord& problem) {
+    if (kTransposed) {
+      swap(problem.m(), problem.n());
+    }
+  }
+
+  /// Number of tiles in a grid: the product of its M and N extents.
+  CUTLASS_HOST_DEVICE
+  static int32_t tile_count(const cutlass::gemm::GemmCoord& grid) {
+    return grid.m() * grid.n();
+  }
+};
+} // namespace detail
+
+/// Tile-iteration visitor for grouped GEMM: binds the GEMM problem-size helper
+/// to the generic GroupedProblemVisitor and forwards construction to it.
+template <
+  typename ThreadblockShape,
+  GroupScheduleMode GroupScheduleMode_,
+  int PrefetchTileCount,
+  int ThreadCount,
+  bool Transposed = false
+>
+struct GemmGroupedProblemVisitor
+  : public GroupedProblemVisitor<
+      detail::GemmGroupedProblemSizeHelper<ThreadblockShape, Transposed>,
+      ThreadblockShape,
+      GroupScheduleMode_,
+      PrefetchTileCount,
+      ThreadCount> {
+
+  static bool const kTransposed = Transposed;
+
+  using ProblemSizeHelper = detail::GemmGroupedProblemSizeHelper<ThreadblockShape, Transposed>;
+  using Base = GroupedProblemVisitor<
+    ProblemSizeHelper, ThreadblockShape, GroupScheduleMode_, PrefetchTileCount, ThreadCount>;
+  using Params = typename Base::Params;
+  using SharedStorage = typename Base::SharedStorage;
+
+  /// Forwards all three arguments unchanged to the base visitor.
+  CUTLASS_DEVICE
+  GemmGroupedProblemVisitor(
+    Params const &params_, SharedStorage &shared_storage_, int32_t block_idx)
+    : Base(params_, shared_storage_, block_idx) {}
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_grouped_softmax_mainloop_fusion.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_grouped_softmax_mainloop_fusion.h
new file mode 100755
index 000000000..3d889469f
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_grouped_softmax_mainloop_fusion.h
@@ -0,0 +1,481 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Grouped GEMM kernel with a softmax fused into the mainloop
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/complex.h"
+#include "cutlass/semaphore.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/trace.h"
+#include "cutlass/gemm/kernel/gemm_transpose_operands.h"
+#include "cutlass/gemm/kernel/gemm_grouped_problem_visitor.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Mma_,                           ///! Threadblock-scoped matrix multiply-accumulate
+  typename Epilogue_,                      ///! Epilogue
+  typename ThreadblockSwizzle_,            ///! Threadblock swizzling function
+  GroupScheduleMode GroupScheduleMode_,    ///! Type of scheduling to perform
+  bool Transposed = false
+>
+struct GemmGroupedSoftmaxMainloopFusion {
+public:
+
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using EpilogueOutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  static GroupScheduleMode const kGroupScheduleMode = GroupScheduleMode_;
+  static bool const kTransposed = Transposed;
+
+  // Optional transpose: MapArguments receives kTransposed and yields the caller-facing operand types
+  using MapArguments = kernel::detail::MapArguments<
+    typename Mma::IteratorA::Element,
+    typename Mma::IteratorA::Layout,
+    Mma::kTransformA,
+    Mma::IteratorA::AccessType::kElements,
+    typename Mma::IteratorB::Element,
+    typename Mma::IteratorB::Layout,
+    Mma::kTransformB,
+    Mma::IteratorB::AccessType::kElements,
+    typename Mma::LayoutC,
+    kTransposed
+  >;
+
+  // Public-facing type definitions related to operand element type, layout, and complex conjugate
+  // operation. Must interact with the 'kTransposed' notion.
+  using ElementA = typename MapArguments::ElementA;
+  using LayoutA = typename MapArguments::LayoutA;
+  using ElementB = typename MapArguments::ElementB;
+  using LayoutB = typename MapArguments::LayoutB;
+  using ElementC = typename Epilogue::OutputTileIterator::Element;
+  using LayoutC = typename MapArguments::LayoutC;
+  // Element type to which operator() casts the per-problem ptr_norm / ptr_sum pointers.
+  using ElementScaleBias = typename Mma::IteratorNormSum::Element;
+
+  static ComplexTransform const kTransformA = MapArguments::kTransformA;
+  static ComplexTransform const kTransformB = MapArguments::kTransformB;
+
+  // Type definitions about the mainloop.
+  using Operator = typename Mma::Operator;
+  using OperatorClass = typename Mma::Operator::OperatorClass;
+  using ThreadblockShape = typename Mma::Shape;
+  using WarpShape = typename Mma::Operator::Shape;
+  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
+  using ArchTag = typename Mma::ArchTag;
+
+  static int const kStages = Mma::kStages;
+  static int const kAlignmentA = MapArguments::kAlignmentA;
+  static int const kAlignmentB = MapArguments::kAlignmentB;
+  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;
+  // kThreadCount is passed twice below: as the visitor's PrefetchTileCount and as its ThreadCount.
+  using ProblemVisitor = GemmGroupedProblemVisitor<
+                            ThreadblockShape,
+                            kGroupScheduleMode,
+                            kThreadCount,
+                            kThreadCount,
+                            kTransposed>;
+
+  //
+  // Structures
+  //
+
+  /// Caller-facing arguments for the whole group of GEMM problems. Every pointer
+  /// member defaults to null and every count to zero.
+  struct Arguments {
+
+    //
+    // Data members
+    //
+
+    // Problem sizes and their count; both are forwarded to the problem visitor's Params.
+    GemmCoord *problem_sizes = nullptr;
+    int problem_count = 0;
+    // Copied into Params; the kernel body in this file does not read it.
+    int threadblock_count = 0;
+
+    // Parameters from which the kernel constructs its epilogue output operator.
+    typename EpilogueOutputOp::Params output_op{};
+
+    // Per-problem operand pointers; the kernel indexes each array by problem index.
+    ElementA **ptr_A = nullptr;
+    ElementB **ptr_B = nullptr;
+    ElementC **ptr_C = nullptr;
+    ElementC **ptr_D = nullptr;
+    // Per-problem softmax norm and sum vectors; the kernel casts them to ElementScaleBias.
+    void **ptr_norm = nullptr;
+    void **ptr_sum = nullptr;
+
+    // Per-problem leading dimensions, indexed the same way as the pointer arrays.
+    typename LayoutA::Stride::LongIndex *lda = nullptr;
+    typename LayoutB::Stride::LongIndex *ldb = nullptr;
+    typename LayoutC::Stride::LongIndex *ldc = nullptr;
+    typename LayoutC::Stride::LongIndex *ldd = nullptr;
+
+    // Only used by device-level operator
+    GemmCoord *host_problem_sizes = nullptr;
+
+    //
+    // Methods
+    //
+
+    /// Leaves every member at its in-class default.
+    Arguments() = default;
+
+    /// Stores each argument in the member of the same name;
+    /// host_problem_sizes may be omitted and then stays null.
+    CUTLASS_HOST_DEVICE
+    Arguments(
+      GemmCoord *problem_sizes,
+      int problem_count,
+      int threadblock_count,
+      typename EpilogueOutputOp::Params output_op,
+      ElementA ** ptr_A,
+      ElementB ** ptr_B,
+      ElementC ** ptr_C,
+      ElementC ** ptr_D,
+      void ** ptr_norm,
+      void ** ptr_sum,
+      typename LayoutA::Stride::LongIndex *lda,
+      typename LayoutB::Stride::LongIndex *ldb,
+      typename LayoutC::Stride::LongIndex *ldc,
+      typename LayoutC::Stride::LongIndex *ldd,
+      GemmCoord *host_problem_sizes=nullptr
+    ):
+      problem_sizes(problem_sizes), problem_count(problem_count), threadblock_count(threadblock_count),
+      output_op(output_op),
+      ptr_A(ptr_A), ptr_B(ptr_B),
+      ptr_C(ptr_C), ptr_D(ptr_D),
+      ptr_norm(ptr_norm), ptr_sum(ptr_sum),
+      lda(lda), ldb(ldb),
+      ldc(ldc), ldd(ldd),
+      host_problem_sizes(host_problem_sizes)
+    {
+    }
+  };
+
+  //
+  // Structure for precomputing values in host memory and passing to kernels
+  //
+
+  /// Kernel-side parameters: the Arguments fields plus the problem visitor's own
+  /// Params, which additionally receive the workspace pointer and tile count.
+  struct Params {
+
+    // State handed to the ProblemVisitor constructed at the top of the kernel.
+    typename ProblemVisitor::Params problem_visitor{};
+    int threadblock_count = 0;
+
+    // Parameters from which the kernel constructs its epilogue output operator.
+    typename EpilogueOutputOp::Params output_op{};
+
+    // Per-problem operand pointers, indexed by problem index inside the kernel.
+    ElementA **ptr_A = nullptr;
+    ElementB **ptr_B = nullptr;
+    ElementC **ptr_C = nullptr;
+    ElementC **ptr_D = nullptr;
+
+    // Per-problem softmax norm and sum vectors.
+    void **ptr_norm = nullptr;
+    void **ptr_sum = nullptr;
+
+    // Per-problem leading dimensions.
+    typename LayoutA::Stride::LongIndex *lda = nullptr;
+    typename LayoutB::Stride::LongIndex *ldb = nullptr;
+    typename LayoutC::Stride::LongIndex *ldc = nullptr;
+    typename LayoutC::Stride::LongIndex *ldd = nullptr;
+
+    //
+    // Methods
+    //
+
+    /// Leaves every member at its in-class default.
+    Params() = default;
+
+    /// Copies the pointer/stride members out of args and builds the visitor
+    /// parameters from args.problem_sizes, args.problem_count, workspace and tile_count.
+    CUTLASS_HOST_DEVICE
+    Params(Arguments const &args,
+          void *workspace = nullptr,
+          int tile_count = 0):
+      problem_visitor(args.problem_sizes, args.problem_count, workspace, tile_count),
+      threadblock_count(args.threadblock_count),
+      output_op(args.output_op),
+      ptr_A(args.ptr_A),
+      ptr_B(args.ptr_B),
+      ptr_C(args.ptr_C),
+      ptr_D(args.ptr_D),
+      ptr_norm(args.ptr_norm),
+      ptr_sum(args.ptr_sum),
+      lda(args.lda),
+      ldb(args.ldb),
+      ldc(args.ldc),
+      ldd(args.ldd)
+    {
+
+    }
+
+    /// Re-initializes every member from a new set of arguments, exactly as the
+    /// converting constructor would.
+    CUTLASS_HOST_DEVICE
+    void update(
+      Arguments const &args,
+      void *workspace = nullptr,
+      int tile_count = 0) {
+
+      // The constructor already performs the full member-by-member copy, so build a
+      // fresh Params and assign it over this one instead of repeating each field.
+      *this = Params(args, workspace, tile_count);
+    }
+  };
+
+  /// Shared memory layout: mainloop and epilogue storage are overlaid in a union.
+  struct SharedStorage {
+    union KernelStorage {
+      typename Mma::SharedStorage main_loop;
+      typename Epilogue::SharedStorage epilogue;
+    };
+    KernelStorage kernel;
+    // ProblemVisitor shared storage can't be overlapped with others
+    typename ProblemVisitor::SharedStorage problem_visitor;
+  };
+
+public:
+
+  //
+  // Methods
+  //
+
+  CUTLASS_DEVICE
+  GemmGroupedSoftmaxMainloopFusion() { }
+
+  /// Always returns Status::kSuccess; the problem size is not examined.
+  static Status can_implement(cutlass::gemm::GemmCoord const & /*problem_size*/) {
+    return Status::kSuccess;
+  }
+  /// Always returns Status::kSuccess; the arguments are not examined.
+  static Status can_implement(Arguments const & /*args*/) {
+    return Status::kSuccess;
+  }
+
+  /// Runs the mainloop and epilogue for every tile the problem visitor hands to this threadblock
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    //
+    // These types shadow the type-level definitions and support the ability to implement
+    // a 'transposed' GEMM that computes the transposed problems.
+    //
+    using ElementA = typename Mma::IteratorA::Element;
+    using LayoutA = typename Mma::IteratorA::Layout;
+    using ElementB = typename Mma::IteratorB::Element;
+    using LayoutB = typename Mma::IteratorB::Layout;
+    using ElementC = typename Epilogue::OutputTileIterator::Element;
+    using LayoutC = typename Epilogue::OutputTileIterator::Layout;
+
+    //
+    // Problem visitor.
+    //
+    ProblemVisitor problem_visitor(
+      params.problem_visitor,
+      shared_storage.problem_visitor,
+      blockIdx.x);
+
+    // Outer 'persistent' loop to iterate over tiles
+    while (problem_visitor.next_tile()) {
+
+      GemmCoord problem_size  = problem_visitor.problem_size();
+      int32_t problem_idx     = problem_visitor.problem_index();
+      int32_t threadblock_idx = int32_t(problem_visitor.threadblock_idx());
+
+      GemmCoord grid_shape = problem_visitor.grid_shape(problem_size);
+      // Split the linear tile index into (row, column) of the tile grid, scaled to element offsets.
+      cutlass::gemm::GemmCoord threadblock_offset(
+        int(threadblock_idx / grid_shape.n()) * Mma::Shape::kM,
+        int(threadblock_idx % grid_shape.n()) * Mma::Shape::kN,
+        0);
+
+      // Load element pointers. Exchange pointers and strides if working on the transpose
+      ElementA *ptr_A = reinterpret_cast<ElementA *>((kTransposed ? params.ptr_B[problem_idx] : params.ptr_A[problem_idx]));
+      typename LayoutA::LongIndex ldm_A = (kTransposed ? params.ldb[problem_idx] : params.lda[problem_idx]);
+
+      ElementB *ptr_B = reinterpret_cast<ElementB *>((kTransposed ? params.ptr_A[problem_idx] : params.ptr_B[problem_idx]));
+      typename LayoutB::LongIndex ldm_B = (kTransposed ? params.lda[problem_idx] : params.ldb[problem_idx]);
+
+      // Compute initial location in logical coordinates
+      cutlass::MatrixCoord tb_offset_A{
+        threadblock_offset.m(),
+        0,
+      };
+
+      cutlass::MatrixCoord tb_offset_B{
+        0,
+        threadblock_offset.n()
+      };
+
+      // Compute position within threadblock
+      int thread_idx = threadIdx.x;
+
+      // Construct iterators to A and B operands
+      typename Mma::IteratorA iterator_A(
+        LayoutA(ldm_A),
+        ptr_A,
+        {problem_size.m(), problem_size.k()},
+        thread_idx,
+        tb_offset_A);
+
+      typename Mma::IteratorB iterator_B(
+        LayoutB(ldm_B),
+        ptr_B,
+        {problem_size.k(), problem_size.n()},
+        thread_idx,
+        tb_offset_B);
+
+      // Construct iterator to the softmax norm/sum vector
+      typename Mma::IteratorNormSum iterator_norm_sum(
+        problem_size.m(),
+        static_cast<ElementScaleBias const *>(params.ptr_norm[problem_idx]),
+        static_cast<ElementScaleBias const *>(params.ptr_sum[problem_idx]),
+        thread_idx,
+        MatrixCoord(0, threadblock_offset.m())
+      );
+
+      typename Mma::FragmentC accumulators;
+
+      accumulators.clear();
+
+      // Broadcast the warp_id computed by lane 0 to ensure dependent code
+      // is compiled as warp-uniform.
+      int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
+
+      int lane_idx = threadIdx.x % 32;
+
+      //
+      // Matrix multiply phase
+      //
+
+      // Construct the threadblock-scoped matrix multiply over the shared main-loop storage
+      Mma mma(shared_storage.kernel.main_loop, thread_idx, warp_idx, lane_idx);
+
+      // Number of mainloop iterations along K, rounded up to whole Mma::Shape::kK tiles
+      int gemm_k_iterations = (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK;
+
+      // Wait for all threads to finish their epilogue phases from the previous tile.
+      __syncthreads();
+
+      // Compute threadblock-scoped matrix multiply-add
+      mma(
+        gemm_k_iterations,
+        accumulators,
+        iterator_A,
+        iterator_B,
+        iterator_norm_sum,
+        accumulators);
+
+      //
+      // Epilogue
+      //
+
+      EpilogueOutputOp output_op(params.output_op);
+
+      ElementC *ptr_C = params.ptr_C[problem_idx];
+      ElementC *ptr_D = params.ptr_D[problem_idx];
+
+      LayoutC layout_C(params.ldc[problem_idx]);
+      LayoutC layout_D(params.ldd[problem_idx]);
+
+      typename Epilogue::OutputTileIterator::Params params_C(layout_C);
+      typename Epilogue::OutputTileIterator::Params params_D(layout_D);
+
+      // Tile iterator loading from source tensor.
+      typename Epilogue::OutputTileIterator iterator_C(
+        params_C,
+        ptr_C,
+        problem_size.mn(),
+        thread_idx,
+        threadblock_offset.mn()
+      );
+
+      // Tile iterator writing to destination tensor.
+      typename Epilogue::OutputTileIterator iterator_D(
+        params_D,
+        ptr_D,
+        problem_size.mn(),
+        thread_idx,
+        threadblock_offset.mn()
+      );
+
+      Epilogue epilogue(
+        shared_storage.kernel.epilogue,
+        thread_idx,
+        warp_idx,
+        lane_idx);
+
+      // Execute the epilogue operator to update the destination tensor.
+      epilogue(
+        output_op,
+        iterator_D,
+        accumulators,
+        iterator_C);
+
+      // Advance the visitor by the number of launched threadblocks (gridDim.x)
+      problem_visitor.advance(gridDim.x);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_layernorm_mainloop_fusion.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_layernorm_mainloop_fusion.h
new file mode 100755
index 000000000..f324d7b30
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_layernorm_mainloop_fusion.h
@@ -0,0 +1,782 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Template for a multistage GEMM kernel with layernorm operations fused in mainloop.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/complex.h"
+#include "cutlass/semaphore.h"
+#include "cutlass/gemm/kernel/params_universal_base.h"
+
+#include "cutlass/layout/matrix.h"
+
+#include "cutlass/trace.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate 
+  typename Epilogue_,             ///! Epilogue
+  typename ThreadblockSwizzle_    ///! Threadblock swizzling function
+>
+struct GemmLayernormMainloopFusion {
+public:
+
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using EpilogueOutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+
+  using ElementA = typename Mma::IteratorA::Element;
+  using LayoutA = typename Mma::IteratorA::Layout;
+  using ElementB = typename Mma::IteratorB::Element;
+  using LayoutB = typename Mma::IteratorB::Layout;
+  using ElementC = typename Epilogue::OutputTileIterator::Element;
+  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
+  // Taken from Mma::IteratorVarMean; LayoutScaleBias provides the var/mean/gamma/beta stride types.
+  using ElementScaleBias = typename Mma::IteratorVarMean::Element;
+  using LayoutScaleBias = typename Mma::IteratorVarMean::Layout;
+
+  static ComplexTransform const kTransformA = Mma::kTransformA;
+  static ComplexTransform const kTransformB = Mma::kTransformB;
+  using Operator = typename Mma::Operator;
+
+  using OperatorClass = typename Mma::Operator::OperatorClass;
+  using ThreadblockShape = typename Mma::Shape;
+  using WarpShape = typename Mma::Operator::Shape;
+  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
+  using ArchTag = typename Mma::ArchTag;
+
+  static int const kStages = Mma::kStages;
+  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
+  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
+  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;
+
+  /// Split-K alignment in elements: the larger of 128 / bits(ElementA) and 128 / bits(ElementB)
+  static int const kSplitKAlignment = const_max(128 / sizeof_bits<ElementA>::value, 128 / sizeof_bits<ElementB>::value);
+
+  //
+  // Structures
+  //
+
+  /// Argument structure
+  struct Arguments : UniversalArgumentsBase
+  {
+    //
+    // Data members
+    //
+
+    typename EpilogueOutputOp::Params epilogue{};
+
+    void const * ptr_A{nullptr};
+    void const * ptr_B{nullptr};
+    void const * ptr_var{nullptr};
+    void const * ptr_mean{nullptr};
+    void const * ptr_gamma{nullptr};
+    void const * ptr_beta{nullptr};
+    void const * ptr_C{nullptr};
+    void * ptr_D{nullptr};
+
+    int64_t batch_stride_A{0};
+    int64_t batch_stride_B{0};
+    int64_t batch_stride_var{0};
+    int64_t batch_stride_mean{0};
+    int64_t batch_stride_gamma{0};
+    int64_t batch_stride_beta{0};
+    int64_t batch_stride_C{0};
+
+    typename LayoutA::Stride stride_a{};
+    typename LayoutB::Stride stride_b{};
+    typename LayoutScaleBias::Stride stride_var{};
+    typename LayoutScaleBias::Stride stride_mean{};
+    typename LayoutScaleBias::Stride stride_gamma{};
+    typename LayoutScaleBias::Stride stride_beta{};
+    typename LayoutC::Stride stride_c{};
+    typename LayoutC::Stride stride_d{};
+
+    typename LayoutA::Stride::LongIndex lda{};
+    typename LayoutB::Stride::LongIndex ldb{};
+    typename LayoutScaleBias::Stride::LongIndex ld_var{};
+    typename LayoutScaleBias::Stride::LongIndex ld_mean{};
+    typename LayoutScaleBias::Stride::LongIndex ld_gamma{};
+    typename LayoutScaleBias::Stride::LongIndex ld_beta{};
+    typename LayoutC::Stride::LongIndex ldc{};
+    typename LayoutC::Stride::LongIndex ldd{};
+
+    int const * ptr_gather_A_indices{nullptr};
+    int const * ptr_gather_B_indices{nullptr};
+    int const * ptr_scatter_D_indices{nullptr};
+
+    //
+    // Methods
+    //
+    
+    Arguments() = default;
+
+    /// constructs an arguments structure
+    Arguments(
+      GemmUniversalMode mode,
+      GemmCoord problem_size,
+      int batch_count,
+      typename EpilogueOutputOp::Params epilogue,
+      void const * ptr_A,
+      void const * ptr_B,
+      void const * ptr_var,
+      void const * ptr_mean,
+      void const * ptr_gamma,
+      void const * ptr_beta,
+      void const * ptr_C,
+      void * ptr_D,
+      int64_t batch_stride_A,
+      int64_t batch_stride_B,
+      int64_t batch_stride_var,
+      int64_t batch_stride_mean,
+      int64_t batch_stride_gamma,
+      int64_t batch_stride_beta,
+      int64_t batch_stride_C,
+      int64_t batch_stride_D,
+      typename LayoutA::Stride stride_a,
+      typename LayoutB::Stride stride_b,
+      typename LayoutScaleBias::Stride stride_var,
+      typename LayoutScaleBias::Stride stride_mean,
+      typename LayoutScaleBias::Stride stride_gamma,
+      typename LayoutScaleBias::Stride stride_beta,
+      typename LayoutC::Stride stride_c,
+      typename LayoutC::Stride stride_d,
+      int const *ptr_gather_A_indices = nullptr,
+      int const *ptr_gather_B_indices = nullptr,
+      int const *ptr_scatter_D_indices = nullptr)
+    :
+      UniversalArgumentsBase(mode, problem_size, batch_count, batch_stride_D),
+      epilogue(epilogue), 
+      ptr_A(ptr_A), ptr_B(ptr_B), ptr_C(ptr_C), ptr_D(ptr_D),
+      ptr_var(ptr_var), ptr_mean(ptr_mean), 
+      ptr_gamma(ptr_gamma), ptr_beta(ptr_beta), 
+      batch_stride_A(batch_stride_A), batch_stride_B(batch_stride_B), batch_stride_C(batch_stride_C),
+      batch_stride_var(batch_stride_var), batch_stride_mean(batch_stride_mean),
+      batch_stride_gamma(batch_stride_gamma), batch_stride_beta(batch_stride_beta),
+      lda(0), ldb(0), ldc(0), ldd(0),
+      ld_var(0), ld_mean(0),
+      ld_gamma(0), ld_beta(0),
+      stride_a(stride_a), stride_b(stride_b), stride_c(stride_c), stride_d(stride_d),
+      stride_var(stride_var), stride_mean(stride_mean),
+      stride_gamma(stride_gamma), stride_beta(stride_beta),
+      ptr_gather_A_indices(ptr_gather_A_indices), ptr_gather_B_indices(ptr_gather_B_indices),
+      ptr_scatter_D_indices(ptr_scatter_D_indices)
+    {
+      CUTLASS_TRACE_HOST("GemmUniversal::Arguments::Arguments() - problem_size: " << problem_size);
+    }
+
+    /// constructs an arguments structure
+    Arguments(
+      GemmUniversalMode mode,
+      GemmCoord problem_size,
+      int batch_count,
+      typename EpilogueOutputOp::Params epilogue,
+      void const * ptr_A,
+      void const * ptr_B,
+      void const * ptr_var,
+      void const * ptr_mean,
+      void const * ptr_gamma,
+      void const * ptr_beta,
+      void const * ptr_C,
+      void * ptr_D,
+      int64_t batch_stride_A,
+      int64_t batch_stride_B,
+      int64_t batch_stride_var,
+      int64_t batch_stride_mean,
+      int64_t batch_stride_gamma,
+      int64_t batch_stride_beta,
+      int64_t batch_stride_C,
+      int64_t batch_stride_D,
+      typename LayoutA::Stride::LongIndex lda,
+      typename LayoutB::Stride::LongIndex ldb,
+      typename LayoutScaleBias::Stride::LongIndex ld_var,
+      typename LayoutScaleBias::Stride::LongIndex ld_mean,
+      typename LayoutScaleBias::Stride::LongIndex ld_gamma,
+      typename LayoutScaleBias::Stride::LongIndex ld_beta,
+      typename LayoutC::Stride::LongIndex ldc,
+      typename LayoutC::Stride::LongIndex ldd,
+      int const *ptr_gather_A_indices = nullptr,
+      int const *ptr_gather_B_indices = nullptr,
+      int const *ptr_scatter_D_indices = nullptr)
+    :
+      UniversalArgumentsBase(mode, problem_size, batch_count, batch_stride_D),
+      epilogue(epilogue), 
+      ptr_A(ptr_A), ptr_B(ptr_B), ptr_C(ptr_C), ptr_D(ptr_D),
+      ptr_var(ptr_var), ptr_mean(ptr_mean), 
+      ptr_gamma(ptr_gamma), ptr_beta(ptr_beta), 
+      batch_stride_A(batch_stride_A), batch_stride_B(batch_stride_B), batch_stride_C(batch_stride_C),
+      batch_stride_var(batch_stride_var), batch_stride_mean(batch_stride_mean),
+      batch_stride_gamma(batch_stride_gamma), batch_stride_beta(batch_stride_beta),
+      lda(lda), ldb(ldb), ldc(ldc), ldd(ldd),
+      ld_var(ld_var), ld_mean(ld_mean),
+      ld_gamma(ld_gamma), ld_beta(ld_beta),
+      ptr_gather_A_indices(ptr_gather_A_indices), ptr_gather_B_indices(ptr_gather_B_indices),
+      ptr_scatter_D_indices(ptr_scatter_D_indices)
+    {
+      stride_a = make_Coord(lda);
+      stride_b = make_Coord(ldb);
+      stride_c = make_Coord(ldc);
+      stride_d = make_Coord(ldd);
+      stride_var = make_Coord(ld_var);
+      stride_mean = make_Coord(ld_mean);
+      stride_gamma = make_Coord(ld_gamma);
+      stride_beta = make_Coord(ld_beta);
+      CUTLASS_TRACE_HOST("GemmUniversal::Arguments::Arguments() - problem_size: " << problem_size);
+    }
+
+    /// Produces a copy of these arguments with the M/N extents and the A/B operand fields exchanged
+    Arguments transposed_problem() const {
+      Arguments swapped(*this);
+      // Operand-side fields first: pointers, gather indices, then the three stride flavours.
+      std::swap(swapped.ptr_A, swapped.ptr_B);
+      std::swap(swapped.ptr_gather_A_indices, swapped.ptr_gather_B_indices);
+      std::swap(swapped.batch_stride_A, swapped.batch_stride_B);
+      std::swap(swapped.stride_a, swapped.stride_b);
+      std::swap(swapped.lda, swapped.ldb);
+      // Finally exchange the M and N extents of the problem itself.
+      std::swap(swapped.problem_size.m(), swapped.problem_size.n());
+      return swapped;
+    }
+  };
+
+
+  //
+  // Structure for precomputing values in host memory and passing to kernels
+  //
+
+  /// Kernel-side parameter block: iterator parameters precomputed on the host together with the
+  /// pointers, batch strides and gather/scatter index arrays copied from an Arguments object.
+  struct Params : UniversalParamsBase<
+    ThreadblockSwizzle, ThreadblockShape,
+    ElementA, ElementB, ElementC,
+    LayoutA, LayoutB>
+  {
+    /// Shorthand for the base class; the constructor forwards its three arguments to it.
+    using ParamsBase = UniversalParamsBase<
+      ThreadblockSwizzle, ThreadblockShape,
+      ElementA, ElementB, ElementC,
+      LayoutA, LayoutB>;
+
+    //
+    // Data members (declared in the order the constructor initializes them)
+    //
+
+    /// Iterator parameter objects; set by the constructor only, update() does not touch them.
+    typename Mma::IteratorA::Params params_A;
+    typename Mma::IteratorB::Params params_B;
+    typename Epilogue::OutputTileIterator::Params params_C;
+    typename Epilogue::OutputTileIterator::Params params_D;
+
+    /// Epilogue functor parameters, copied from Arguments::epilogue.
+    typename EpilogueOutputOp::Params output_op;
+
+    /// Type-erased operand pointers (the constness of the Arguments pointers is cast away).
+    void * ptr_A;
+    void * ptr_B;
+    void * ptr_var;
+    void * ptr_mean;
+    void * ptr_gamma;
+    void * ptr_beta;
+    void * ptr_C;
+    void * ptr_D;
+
+    /// Per-batch strides. The stride for D is not declared here; update() assigns it through this->.
+    int64_t batch_stride_A;
+    int64_t batch_stride_B;
+    int64_t batch_stride_var;
+    int64_t batch_stride_mean;
+    int64_t batch_stride_gamma;
+    int64_t batch_stride_beta;
+    int64_t batch_stride_C;
+
+    /// Index arrays passed to the A/B iterators (gather) and the C/D output iterators (scatter).
+    int * ptr_gather_A_indices;
+    int * ptr_gather_B_indices;
+    int * ptr_scatter_D_indices;
+
+    //
+    // Host dispatch API
+    //
+
+    /// Default constructor
+    Params() = default;
+
+    /// Builds the complete parameter block from the user arguments and the device properties.
+    Params(
+      Arguments const &args,  /// GEMM application arguments
+      int device_sms,         /// Number of SMs on the device
+      int sm_occupancy)       /// Kernel SM occupancy (in thread blocks)
+    :
+      ParamsBase(args, device_sms, sm_occupancy),
+      // A non-zero leading dimension wins; otherwise the explicit stride vector is used.
+      params_A(args.lda ? make_Coord_with_padding<LayoutA::kStrideRank>(args.lda) : args.stride_a),
+      params_B(args.ldb ? make_Coord_with_padding<LayoutB::kStrideRank>(args.ldb) : args.stride_b),
+      params_C(args.ldc ? make_Coord_with_padding<LayoutC::kStrideRank>(args.ldc) : args.stride_c),
+      params_D(args.ldd ? make_Coord_with_padding<LayoutC::kStrideRank>(args.ldd) : args.stride_d),
+      output_op(args.epilogue),
+      ptr_A(const_cast<void *>(args.ptr_A)),
+      ptr_B(const_cast<void *>(args.ptr_B)),
+      ptr_var(const_cast<void *>(args.ptr_var)),
+      ptr_mean(const_cast<void *>(args.ptr_mean)),
+      ptr_gamma(const_cast<void *>(args.ptr_gamma)),
+      ptr_beta(const_cast<void *>(args.ptr_beta)),
+      ptr_C(const_cast<void *>(args.ptr_C)),
+      ptr_D(args.ptr_D),
+      batch_stride_A(args.batch_stride_A),
+      batch_stride_B(args.batch_stride_B),
+      batch_stride_var(args.batch_stride_var),
+      batch_stride_mean(args.batch_stride_mean),
+      batch_stride_gamma(args.batch_stride_gamma),
+      batch_stride_beta(args.batch_stride_beta),
+      batch_stride_C(args.batch_stride_C),
+      ptr_gather_A_indices(const_cast<int *>(args.ptr_gather_A_indices)),
+      ptr_gather_B_indices(const_cast<int *>(args.ptr_gather_B_indices)),
+      ptr_scatter_D_indices(const_cast<int *>(args.ptr_scatter_D_indices))
+    {}
+
+    /// Refreshes the epilogue params, pointers, batch strides and index arrays from `args`.
+    void update(Arguments const &args)
+    {
+      output_op = args.epilogue;
+      ptr_A = const_cast<void *>(args.ptr_A);
+      ptr_B = const_cast<void *>(args.ptr_B);
+      ptr_var = const_cast<void *>(args.ptr_var);
+      ptr_mean = const_cast<void *>(args.ptr_mean);
+      ptr_gamma = const_cast<void *>(args.ptr_gamma);
+      ptr_beta = const_cast<void *>(args.ptr_beta);
+      ptr_C = const_cast<void *>(args.ptr_C);
+      ptr_D = args.ptr_D;
+
+      batch_stride_A = args.batch_stride_A;
+      batch_stride_B = args.batch_stride_B;
+      batch_stride_var = args.batch_stride_var;
+      batch_stride_mean = args.batch_stride_mean;
+      batch_stride_gamma = args.batch_stride_gamma;
+      batch_stride_beta = args.batch_stride_beta;
+      batch_stride_C = args.batch_stride_C;
+      // batch_stride_D is not a member declared in this struct, hence the explicit this->.
+      this->batch_stride_D = args.batch_stride_D;
+
+      ptr_gather_A_indices = const_cast<int *>(args.ptr_gather_A_indices);
+      ptr_gather_B_indices = const_cast<int *>(args.ptr_gather_B_indices);
+      ptr_scatter_D_indices = const_cast<int *>(args.ptr_scatter_D_indices);
+
+      CUTLASS_TRACE_HOST("GemmUniversal::Params::update()");
+    }
+  };
+
+
+  /// Shared memory storage structure; a union, so the main loop and the epilogue share the same bytes
+  union SharedStorage {
+    typename Mma::SharedStorage main_loop;        ///< Storage handed to the Mma constructor
+    typename Epilogue::SharedStorage epilogue;    ///< Storage handed to the Epilogue constructor
+  };
+
+public:
+
+  //
+  // Host dispatch API
+  //
+
+  /// Determines whether the kernel satisfies the alignment requirements for `problem_size`.
+  ///
+  /// For each operand the extent selected by its layout must be a multiple of that operand's
+  /// alignment. Operands are examined in the order A, B, C; the first failure returns
+  /// Status::kErrorMisalignedOperand, and Status::kSuccess is returned when none fails.
+  static Status can_implement(
+    cutlass::gemm::GemmCoord const & problem_size) {
+
+    CUTLASS_TRACE_HOST("GemmUniversal::can_implement()");
+
+    // Interleaved layouts are aligned to their interleave factor; any other layout uses the
+    // number of elements in one iterator access.
+    static int const kAlignmentA =
+      platform::is_same<LayoutA, layout::ColumnMajorInterleaved<32>>::value ? 32 :
+      platform::is_same<LayoutA, layout::ColumnMajorInterleaved<64>>::value ? 64 :
+      Mma::IteratorA::AccessType::kElements;
+
+    static int const kAlignmentB =
+      platform::is_same<LayoutB, layout::RowMajorInterleaved<32>>::value ? 32 :
+      platform::is_same<LayoutB, layout::RowMajorInterleaved<64>>::value ? 64 :
+      Mma::IteratorB::AccessType::kElements;
+
+    static int const kAlignmentC =
+      platform::is_same<LayoutC, layout::ColumnMajorInterleaved<32>>::value ? 32 :
+      platform::is_same<LayoutC, layout::ColumnMajorInterleaved<64>>::value ? 64 :
+      Epilogue::OutputTileIterator::kElementsPerAccess;
+
+    // A: row-major and column-major-interleaved are tested along K, column-major along M.
+    bool const a_uses_k =
+      platform::is_same<LayoutA, layout::RowMajor>::value ||
+      platform::is_same<LayoutA, layout::ColumnMajorInterleaved<32>>::value ||
+      platform::is_same<LayoutA, layout::ColumnMajorInterleaved<64>>::value;
+    bool const a_uses_m = platform::is_same<LayoutA, layout::ColumnMajor>::value;
+
+    // B: row-major is tested along N, column-major and row-major-interleaved along K.
+    bool const b_uses_n = platform::is_same<LayoutB, layout::RowMajor>::value;
+    bool const b_uses_k =
+      platform::is_same<LayoutB, layout::ColumnMajor>::value ||
+      platform::is_same<LayoutB, layout::RowMajorInterleaved<32>>::value ||
+      platform::is_same<LayoutB, layout::RowMajorInterleaved<64>>::value;
+
+    // C: row-major and column-major-interleaved are tested along N, column-major along M.
+    bool const c_uses_n =
+      platform::is_same<LayoutC, layout::RowMajor>::value ||
+      platform::is_same<LayoutC, layout::ColumnMajorInterleaved<32>>::value ||
+      platform::is_same<LayoutC, layout::ColumnMajorInterleaved<64>>::value;
+    bool const c_uses_m = platform::is_same<LayoutC, layout::ColumnMajor>::value;
+
+    // A layout matching none of the cases above is never reported as misaligned.
+    bool const isAMisaligned =
+      (a_uses_k && (problem_size.k() % kAlignmentA != 0)) ||
+      (a_uses_m && (problem_size.m() % kAlignmentA != 0));
+    if (isAMisaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for A operand");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    bool const isBMisaligned =
+      (b_uses_n && (problem_size.n() % kAlignmentB != 0)) ||
+      (b_uses_k && (problem_size.k() % kAlignmentB != 0));
+    if (isBMisaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for B operand");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    bool const isCMisaligned =
+      (c_uses_n && (problem_size.n() % kAlignmentC != 0)) ||
+      (c_uses_m && (problem_size.m() % kAlignmentC != 0));
+    if (isCMisaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for C operand");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    // None of the three operands was flagged.
+    CUTLASS_TRACE_HOST("  returning kSuccess");
+
+    return Status::kSuccess;
+  }
+  /// Overload taking a full Arguments object; only `args.problem_size` is examined.
+  static Status can_implement(Arguments const &args) {
+    return can_implement(args.problem_size);
+  }
+
+public:
+
+  //
+  // Device-only API
+  //
+
+  // Factory invocation: default-constructs a kernel functor and runs it with the given arguments
+  CUTLASS_DEVICE
+  static void invoke(
+    Params const &params,
+    SharedStorage &shared_storage)
+  {
+    GemmLayernormMainloopFusion kernel_functor;
+    kernel_functor(params, shared_storage);
+  }
+ 
+
+  /// Executes one threadblock's part of the GEMM: mode-dependent operand setup, main loop, then epilogue
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Compute threadblock location
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if CTA is out of range
+    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
+      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
+
+      return;
+    }
+    // K range [offset_k, problem_size_k) processed here; only the two GEMM modes below narrow it
+    int offset_k = 0;
+    int problem_size_k = params.problem_size.k();
+
+    ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A); 
+    ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);
+
+    // Fetch pointers based on mode: kGemm and kGemmSplitKParallel carve out a K slice per k-tile
+    // index, kBatched offsets A/B by their batch strides, and kArray reads per-batch pointers
+    // from pointer arrays. In the latter two modes the k-tile index is used as the batch index.
+    if (params.mode == GemmUniversalMode::kGemm || 
+      params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+
+      if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {
+
+        problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size; 
+      }
+
+      offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
+    }
+    else if (params.mode == GemmUniversalMode::kBatched) {
+      ptr_A += threadblock_tile_offset.k() * params.batch_stride_A;
+      ptr_B += threadblock_tile_offset.k() * params.batch_stride_B;
+    }
+    else if (params.mode == GemmUniversalMode::kArray) {
+      ptr_A = static_cast<ElementA * const *>(params.ptr_A)[threadblock_tile_offset.k()];
+      ptr_B = static_cast<ElementB * const *>(params.ptr_B)[threadblock_tile_offset.k()];
+    }
+
+    __syncthreads();
+
+    // Compute initial location in logical coordinates
+    cutlass::MatrixCoord tb_offset_A{
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      offset_k,
+    };
+
+    cutlass::MatrixCoord tb_offset_B{
+      offset_k,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    };
+
+    // Compute position within threadblock
+    int thread_idx = threadIdx.x;
+
+    // Construct iterators to A and B operands
+    typename Mma::IteratorA iterator_A(
+      params.params_A,
+      ptr_A,
+      {params.problem_size.m(), problem_size_k},
+      thread_idx,
+      tb_offset_A,
+      params.ptr_gather_A_indices);
+
+    typename Mma::IteratorB iterator_B(
+      params.params_B,
+      ptr_B,
+      {problem_size_k, params.problem_size.n()},
+      thread_idx,
+      tb_offset_B,
+      params.ptr_gather_B_indices);
+    // NOTE(review): var/mean/gamma/beta pointers are used as-is in every mode; their batch strides are not applied here
+    // Construct iterators to A var/mean vector, positioned at this threadblock's M offset
+    typename Mma::IteratorVarMean iterator_var_mean(
+      params.problem_size.m(),
+      static_cast<ElementScaleBias const *>(params.ptr_var),
+      static_cast<ElementScaleBias const *>(params.ptr_mean),
+      thread_idx,
+      MatrixCoord(0, (threadblock_tile_offset.m() * Mma::Shape::kM))
+    );
+
+    // Construct iterators to A scale/bias (gamma/beta) vector; NOTE(review): offset is k-tile index * kK, not offset_k - confirm
+    typename Mma::IteratorGammaBeta iterator_gamma_beta(
+      problem_size_k,
+      static_cast<ElementScaleBias const *>(params.ptr_gamma),
+      static_cast<ElementScaleBias const *>(params.ptr_beta),
+      thread_idx,
+      MatrixCoord(
+        0, (threadblock_tile_offset.k() * Mma::Shape::kK)
+      )
+    );
+
+    // Broadcast the warp_id computed by lane 0 to ensure dependent code
+    // is compiled as warp-uniform.
+    int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
+
+    int lane_idx = threadIdx.x % 32;
+
+    //
+    // Main loop
+    //
+
+    // Construct thread-scoped matrix multiply
+    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
+
+    typename Mma::FragmentC accumulators;
+
+    accumulators.clear();
+
+    // Main-loop trip count: (problem_size_k - offset_k) divided by Mma::Shape::kK, rounded up
+    int gemm_k_iterations = (problem_size_k - offset_k + Mma::Shape::kK - 1) / Mma::Shape::kK;
+
+    // Compute threadblock-scoped matrix multiply-add
+    mma(
+      gemm_k_iterations, 
+      accumulators, 
+      iterator_A, 
+      iterator_B,
+      iterator_var_mean,
+      iterator_gamma_beta, 
+      accumulators);
+
+    //
+    // Epilogue
+    //
+
+    EpilogueOutputOp output_op(params.output_op);
+
+    //
+    // Masked tile iterators constructed from members
+    //
+
+    threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    //assume identity swizzle
+    MatrixCoord threadblock_offset(
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    );
+
+    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
+
+    ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C); 
+    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
+
+    //
+    // Fetch pointers based on mode.
+    //
+    
+    // Construct the semaphore; block_idx selects one lock slot per (m, n) output tile.
+    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
+
+    if (params.mode == GemmUniversalMode::kGemm) {
+
+      // If performing a reduction via split-K, fetch the initial synchronization
+      if (params.grid_tiled_shape.k() > 1) {
+        
+        // Fetch the synchronization lock initially but do not block.
+        semaphore.fetch();
+
+        // Indicate which position in a serial reduction the output operator is currently updating
+        output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
+      }
+    }
+    else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
+    }
+    else if (params.mode == GemmUniversalMode::kBatched) {
+      ptr_C += threadblock_tile_offset.k() * params.batch_stride_C;
+      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
+    }
+    else if (params.mode == GemmUniversalMode::kArray) {
+      ptr_C = static_cast<ElementC * const *>(params.ptr_C)[threadblock_tile_offset.k()];
+      ptr_D = static_cast<ElementC * const *>(params.ptr_D)[threadblock_tile_offset.k()];
+    }
+
+    // Tile iterator loading from source tensor.
+    typename Epilogue::OutputTileIterator iterator_C(
+      params.params_C,
+      ptr_C,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset,
+      params.ptr_scatter_D_indices
+    );
+
+    // Tile iterator writing to destination tensor.
+    typename Epilogue::OutputTileIterator iterator_D(
+      params.params_D,
+      ptr_D,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset,
+      params.ptr_scatter_D_indices
+    );
+
+    Epilogue epilogue(
+      shared_storage.epilogue, 
+      thread_idx, 
+      warp_idx, 
+      lane_idx);
+
+    // Wait on the semaphore - this latency may have been covered by iterator construction
+    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
+        
+      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
+      if (threadblock_tile_offset.k()) {
+        iterator_C = iterator_D;
+      }
+
+      semaphore.wait(threadblock_tile_offset.k());
+    }
+
+    // Execute the epilogue operator to update the destination tensor.
+    epilogue(
+      output_op, 
+      iterator_D, 
+      accumulators, 
+      iterator_C); 
+    
+    //
+    // Release the semaphore
+    //
+
+    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) { 
+
+      int lock = 0;
+      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
+
+        // The final threadblock resets the semaphore for subsequent grids.
+        lock = 0;
+      }
+      else {
+        // Otherwise, the semaphore is incremented
+        lock = threadblock_tile_offset.k() + 1;
+      }
+      
+      semaphore.release(lock);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_params.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_params.h
new file mode 100755
index 000000000..5a7f29d8f
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_params.h
@@ -0,0 +1,189 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
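+    \brief Defines GemmParams, a non-template structure bundling the problem shape, pointers and strides of a GEMM.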
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/complex.h"
+#include "cutlass/semaphore.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator_params.h"
+#include "cutlass/transform/threadblock/predicated_tile_access_iterator_params.h"
+
+#include "cutlass/trace.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Plain (non-template) parameter block for GEMM kernels: problem and grid shape, operand
+/// pointers, strides, iterator parameters and the semaphore pointer taken from the workspace.
+struct GemmParams {
+  // Type definitions
+  using Index = int32_t;
+  using LongIndex = int64_t;
+
+  using MmaIteratorParams = typename cutlass::transform::threadblock::PredicatedTileAccessIteratorParams;
+  using EpilogueIteratorParams = typename cutlass::epilogue::threadblock::PredicatedTileIteratorParams;
+
+  // Data members, declared in the order in which the constructor initializes them
+  cutlass::gemm::GemmCoord problem_size{};
+  cutlass::gemm::GemmCoord grid_tiled_shape{};
+  int swizzle_log_tile{};
+
+  GemmUniversalMode mode{GemmUniversalMode::kGemm};
+  int batch_count{1};
+  int gemm_k_size{0};
+
+  void * ptr_A{nullptr};
+  void * ptr_B{nullptr};
+  void * ptr_C{nullptr};
+  void * ptr_D{nullptr};
+
+  LongIndex lda{0};
+  LongIndex ldb{0};
+  LongIndex ldc{0};
+  LongIndex ldd{0};
+
+  LongIndex batch_stride_A{0};
+  LongIndex batch_stride_B{0};
+  LongIndex batch_stride_C{0};
+  LongIndex batch_stride_D{0};
+
+  // Iterator parameter objects for the A/B operand iterators and the C/D output tile iterators.
+  // The constructor copies them from its params_itr_* arguments; update() leaves them unchanged.
+  MmaIteratorParams params_itr_a{};
+  MmaIteratorParams params_itr_b{};
+  EpilogueIteratorParams params_itr_c{};
+  EpilogueIteratorParams params_itr_d{};
+
+  // Workspace pointer reinterpreted as int *; nullptr when no workspace is supplied.
+  int *semaphore{nullptr};
+
+  // Methods
+  GemmParams() = default;
+
+  /// Copies each argument into the member of the same name; workspace_ becomes the semaphore pointer.
+  CUTLASS_HOST_DEVICE
+  GemmParams(
+    cutlass::gemm::GemmCoord problem_size_,
+    cutlass::gemm::GemmCoord grid_tiled_shape_,
+    int swizzle_log_tile_,
+    GemmUniversalMode mode_,
+    int batch_count_,
+    int gemm_k_size_,
+    void const * ptr_A_,
+    void const * ptr_B_,
+    void const * ptr_C_,
+    void * ptr_D_,
+    LongIndex lda_,
+    LongIndex ldb_,
+    LongIndex ldc_,
+    LongIndex ldd_,
+    int64_t batch_stride_A_,
+    int64_t batch_stride_B_,
+    int64_t batch_stride_C_,
+    int64_t batch_stride_D_,
+    MmaIteratorParams const & params_itr_a_,
+    MmaIteratorParams const & params_itr_b_,
+    EpilogueIteratorParams const & params_itr_c_,
+    EpilogueIteratorParams const & params_itr_d_,
+    void *workspace_ = nullptr) :
+      problem_size(problem_size_),
+      grid_tiled_shape(grid_tiled_shape_),
+      swizzle_log_tile(swizzle_log_tile_),
+      mode(mode_),
+      batch_count(batch_count_),
+      gemm_k_size(gemm_k_size_),
+      ptr_A(const_cast<void *>(ptr_A_)),
+      ptr_B(const_cast<void *>(ptr_B_)),
+      ptr_C(const_cast<void *>(ptr_C_)),
+      ptr_D(ptr_D_),
+      lda(lda_),
+      ldb(ldb_),
+      ldc(ldc_),
+      ldd(ldd_),
+      batch_stride_A(batch_stride_A_),
+      batch_stride_B(batch_stride_B_),
+      batch_stride_C(batch_stride_C_),
+      batch_stride_D(batch_stride_D_),
+      params_itr_a(params_itr_a_),
+      params_itr_b(params_itr_b_),
+      params_itr_c(params_itr_c_),
+      params_itr_d(params_itr_d_),
+      semaphore(static_cast<int *>(workspace_))
+  { }
+
+  /// Replaces pointers and batch strides only; an omitted workspace_ resets semaphore to nullptr.
+  CUTLASS_HOST_DEVICE
+  void update(
+    void const * ptr_A_,
+    void const * ptr_B_,
+    void const * ptr_C_,
+    void * ptr_D_,
+    int64_t batch_stride_A_,
+    int64_t batch_stride_B_,
+    int64_t batch_stride_C_,
+    int64_t batch_stride_D_,
+    void *workspace_ = nullptr) {
+    ptr_A = const_cast<void *>(ptr_A_);
+    ptr_B = const_cast<void *>(ptr_B_);
+    ptr_C = const_cast<void *>(ptr_C_);
+    ptr_D = ptr_D_;
+
+    batch_stride_A = batch_stride_A_;
+    batch_stride_B = batch_stride_B_;
+    batch_stride_C = batch_stride_C_;
+    batch_stride_D = batch_stride_D_;
+
+    semaphore = static_cast<int *>(workspace_);
+    CUTLASS_TRACE_HOST("GemmParams::update()");
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_pipelined.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_pipelined.h
new file mode 100755
index 000000000..019f93c8f
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_pipelined.h
@@ -0,0 +1,158 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/array.h"
+
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/gemm/gemm.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Pipelined GEMM kernel: each threadblock runs the threadblock-scoped Mma over the problem and
+/// then the epilogue for its output tile. Threadblocks outside the tiled grid return immediately.
+template <typename Mma, typename Epilogue, typename ThreadblockSwizzle>
+CUTLASS_GLOBAL void GemmPipelined(
+  cutlass::gemm::GemmCoord problem_size,
+  cutlass::gemm::GemmCoord grid_tiled_shape,
+  typename Mma::IteratorA::Params params_A,
+  typename Mma::IteratorA::TensorRef ref_A,
+  typename Mma::IteratorB::Params params_B,
+  typename Mma::IteratorB::TensorRef ref_B,
+  typename Epilogue::Params params_epilogue
+  ) {
+
+  // One shared-memory region, viewed either as main-loop storage or as epilogue storage
+  __shared__ union {
+    typename Mma::SharedStorage main_loop;
+    typename Epilogue::SharedStorage epilogue;
+  } smem;
+
+  // Locate this threadblock's tile within the grid
+  ThreadblockSwizzle swizzle;
+  int log_tile = ThreadblockSwizzle().get_log_tile(grid_tiled_shape);
+  cutlass::gemm::GemmCoord tile_coord = swizzle.get_tile_offset(log_tile);
+
+  // Nothing to do for a tile whose M or N index lies outside the tiled grid
+  bool const tile_in_grid =
+    tile_coord.m() < grid_tiled_shape.m() && tile_coord.n() < grid_tiled_shape.n();
+  if (!tile_in_grid) {
+    return;
+  }
+
+  // Logical coordinates at which this threadblock starts reading A and B
+  cutlass::MatrixCoord origin_A{
+    tile_coord.m() * Mma::Shape::kM,
+    tile_coord.k()
+  };
+
+  cutlass::MatrixCoord origin_B{
+    tile_coord.k(),
+    tile_coord.n() * Mma::Shape::kN
+  };
+
+  // Index of this thread within the threadblock
+  int thread_idx = threadIdx.x;
+
+  // Operand iterators spanning the (M x K) extent of A and the (K x N) extent of B
+  typename Mma::IteratorA iterator_A(
+    params_A,
+    ref_A.data(),
+    {problem_size.m(), problem_size.k()},
+    thread_idx,
+    origin_A);
+
+  typename Mma::IteratorB iterator_B(
+    params_B,
+    ref_B.data(),
+    {problem_size.k(), problem_size.n()},
+    thread_idx,
+    origin_B);
+
+  int warp_idx = canonical_warp_idx_sync();
+  int lane_idx = threadIdx.x % 32;
+
+  //
+  // Main loop
+  //
+
+  Mma mainloop(smem.main_loop, thread_idx, warp_idx, lane_idx);
+
+  // The cleared fragment is passed as both the second and the last argument of the main loop
+  typename Mma::FragmentC accum;
+  accum.clear();
+
+  mainloop(problem_size, accum, iterator_A, iterator_B, accum);
+
+  //
+  // Epilogue
+  //
+
+  Epilogue epilogue_op(
+    params_epilogue,
+    smem.epilogue,
+    thread_idx,
+    warp_idx,
+    lane_idx);
+
+  // Query the tile coordinate again before computing the output offset
+  tile_coord = swizzle.get_tile_offset(log_tile);
+
+  //assume identity swizzle
+  MatrixCoord tile_origin(
+    tile_coord.m() * Mma::Shape::kM,
+    tile_coord.n() * Mma::Shape::kN
+  );
+
+  // Run the epilogue over the (M x N) output extent, starting at this tile's origin
+  epilogue_op({problem_size.m(), problem_size.n()}, accum, tile_origin);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_planar_complex.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_planar_complex.h
new file mode 100755
index 000000000..09228ca01
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_planar_complex.h
@@ -0,0 +1,715 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Kernel-level planar-complex GEMM: each operand is given as a real plane and an imaginary plane.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/complex.h"
+#include "cutlass/semaphore.h"
+#include "cutlass/gemm/kernel/params_universal_base.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Mma_,                  ///< Threadblock-scoped planar-complex matrix multiply-accumulate
+  typename Epilogue_,             ///< Epilogue
+  typename ThreadblockSwizzle_    ///< Threadblock swizzling function
+>
+struct GemmPlanarComplex {
+public:
+  // Kernel-level GEMM whose A, B, C and D operands are each given as a real plane and an imaginary plane.
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using EpilogueOutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+
+  using ElementA = typename Mma::IteratorA::Element;
+  using LayoutA = typename Mma::IteratorA::Layout;
+  using ElementB = typename Mma::IteratorB::Element;
+  using LayoutB = typename Mma::IteratorB::Layout;
+  using ElementC = typename Epilogue::OutputTileIterator::Element;
+  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
+  using Operator = typename Mma::Operator;
+  using ArchTag = typename Mma::ArchTag;
+  /// Complex transforms of the A and B operands, as declared by the Mma
+  static ComplexTransform const kTransformA = Mma::kTransformA;
+  static ComplexTransform const kTransformB = Mma::kTransformB;
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;
+
+  /// Split-K preserves splits that are 128b aligned: elements per 128 bits of A or of B, whichever is larger
+  static int const kSplitKAlignment = const_max(
+    128 / sizeof_bits<ElementA>::value, 
+    128 / sizeof_bits<ElementB>::value);
+
+  //
+  // Additional types needed for reflection
+  //
+
+  using ElementAccumulator = typename Mma::Policy::Operator::ElementC;
+  using OperatorClass = typename Mma::Operator::OperatorClass;
+  using ThreadblockShape = typename Mma::Shape;
+  using WarpShape = typename Mma::Operator::Shape;
+  using InstructionShape = typename Mma::Policy::Operator::Shape;
+
+  static int const kStages = Mma::kStages;
+  /// Elements per access of the A, B and C/D iterators
+  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
+  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
+  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+
+  //
+  // Arguments structure
+  //
+
+  /// Argument structure; mode, problem_size, batch_count and the real-plane batch_stride_D are held by UniversalArgumentsBase
+  struct Arguments : UniversalArgumentsBase
+  {
+    //
+    // Data members
+    //
+    /// Parameters of the epilogue output operator
+    typename EpilogueOutputOp::Params epilogue{};
+    /// Real and imaginary planes of A, B, C (const) and D (mutable); the kernel reads them as pointer arrays in kArray mode
+    void const * ptr_A_real{nullptr};
+    void const * ptr_A_imag{nullptr};
+    void const * ptr_B_real{nullptr};
+    void const * ptr_B_imag{nullptr};
+    void const * ptr_C_real{nullptr};
+    void const * ptr_C_imag{nullptr};
+    void * ptr_D_real{nullptr};
+    void * ptr_D_imag{nullptr};
+    /// Strides of each plane; Params builds the corresponding iterator parameters from them
+    typename LayoutA::Stride::Index lda_real{};
+    typename LayoutA::Stride::Index lda_imag{};
+    typename LayoutB::Stride::Index ldb_real{};
+    typename LayoutB::Stride::Index ldb_imag{};
+    typename LayoutC::Stride::Index ldc_real{};
+    typename LayoutC::Stride::Index ldc_imag{};
+    typename LayoutC::Stride::Index ldd_real{};
+    typename LayoutC::Stride::Index ldd_imag{};
+    /// Batch strides, in elements; the real-plane stride of D is the base-class member batch_stride_D
+    int64_t batch_stride_A{0};
+    int64_t batch_stride_A_imag{0};
+    int64_t batch_stride_B{0};
+    int64_t batch_stride_B_imag{0};
+    int64_t batch_stride_C{0};
+    int64_t batch_stride_C_imag{0};
+    int64_t batch_stride_D_imag{0};
+
+    //
+    // Methods
+    //
+
+    Arguments() = default;
+
+    /// Constructs an arguments structure; mode, problem_size, batch_count and batch_stride_D go to the base class
+    Arguments(
+      GemmUniversalMode mode,
+      GemmCoord problem_size,
+      int batch_count,
+      typename EpilogueOutputOp::Params epilogue,
+      void const * ptr_A_real,
+      void const * ptr_A_imag,
+      void const * ptr_B_real,
+      void const * ptr_B_imag,
+      void const * ptr_C_real,
+      void const * ptr_C_imag,
+      void * ptr_D_real,
+      void * ptr_D_imag,
+      typename LayoutA::Stride::Index lda_real,
+      typename LayoutA::Stride::Index lda_imag,
+      typename LayoutB::Stride::Index ldb_real,
+      typename LayoutB::Stride::Index ldb_imag,
+      typename LayoutC::Stride::Index ldc_real,
+      typename LayoutC::Stride::Index ldc_imag,
+      typename LayoutC::Stride::Index ldd_real,
+      typename LayoutC::Stride::Index ldd_imag,
+      int64_t batch_stride_A = 0,
+      int64_t batch_stride_A_imag = 0,
+      int64_t batch_stride_B = 0,
+      int64_t batch_stride_B_imag = 0,
+      int64_t batch_stride_C = 0,
+      int64_t batch_stride_C_imag = 0,
+      int64_t batch_stride_D = 0,
+      int64_t batch_stride_D_imag = 0)
+    :
+      UniversalArgumentsBase(mode, problem_size, batch_count, batch_stride_D),
+      epilogue(epilogue), 
+      ptr_A_real(ptr_A_real), 
+      ptr_A_imag(ptr_A_imag), 
+      ptr_B_real(ptr_B_real),
+      ptr_B_imag(ptr_B_imag),
+      ptr_C_real(ptr_C_real),
+      ptr_C_imag(ptr_C_imag),
+      ptr_D_real(ptr_D_real), 
+      ptr_D_imag(ptr_D_imag), 
+      lda_real(lda_real),
+      lda_imag(lda_imag),
+      ldb_real(ldb_real),
+      ldb_imag(ldb_imag),
+      ldc_real(ldc_real),
+      ldc_imag(ldc_imag),
+      ldd_real(ldd_real),
+      ldd_imag(ldd_imag),
+      batch_stride_A(batch_stride_A),
+      batch_stride_A_imag(batch_stride_A_imag),
+      batch_stride_B(batch_stride_B),
+      batch_stride_B_imag(batch_stride_B_imag),
+      batch_stride_C(batch_stride_C),
+      batch_stride_C_imag(batch_stride_C_imag),
+      batch_stride_D_imag(batch_stride_D_imag)
+    {}
+
+    /// Returns arguments for the transposed problem: M is exchanged with N and operand A with operand B
+    Arguments transposed_problem() const {
+      Arguments args(*this);
+      // Only the extents and the A/B pointers, strides and batch strides are swapped; C and D are copied as-is
+      std::swap(args.problem_size.m(), args.problem_size.n());
+      std::swap(args.ptr_A_real, args.ptr_B_real);
+      std::swap(args.ptr_A_imag, args.ptr_B_imag);
+      std::swap(args.lda_real, args.ldb_real);
+      std::swap(args.lda_imag, args.ldb_imag);
+      std::swap(args.batch_stride_A, args.batch_stride_B);
+      std::swap(args.batch_stride_A_imag, args.batch_stride_B_imag);
+
+      return args;
+    }
+  };
+
+
+  //
+  // Structure for precomputing values in host memory and passing to kernels
+  //
+
+  /// Kernel parameters built from Arguments; members not declared here (e.g. mode, grid_tiled_shape) come from UniversalParamsBase
+  struct Params : UniversalParamsBase<
+    ThreadblockSwizzle,
+    ThreadblockShape,
+    ElementA,
+    ElementB,
+    ElementC,
+    LayoutA,
+    LayoutB>
+  {
+    using ParamsBase = UniversalParamsBase<
+      ThreadblockSwizzle,
+      ThreadblockShape,
+      ElementA,
+      ElementB,
+      ElementC,
+      LayoutA,
+      LayoutB>;
+
+    //
+    // Data members
+    //
+    /// Iterator parameters of each plane, each built from the matching stride in Arguments
+    typename Mma::IteratorA::Params params_A_real{};
+    typename Mma::IteratorA::Params params_A_imag{};
+    typename Mma::IteratorB::Params params_B_real{};
+    typename Mma::IteratorB::Params params_B_imag{};
+    typename Epilogue::OutputTileIterator::Params params_C_real{};
+    typename Epilogue::OutputTileIterator::Params params_C_imag{};
+    typename Epilogue::OutputTileIterator::Params params_D_real{};
+    typename Epilogue::OutputTileIterator::Params params_D_imag{};
+    /// Parameters of the epilogue output operator
+    typename EpilogueOutputOp::Params output_op{};
+    /// Operand planes; the const qualifier of the A, B and C pointers in Arguments is cast away when they are copied
+    void * ptr_A_real{nullptr};
+    void * ptr_A_imag{nullptr};
+    void * ptr_B_real{nullptr};
+    void * ptr_B_imag{nullptr};
+    void * ptr_C_real{nullptr};
+    void * ptr_C_imag{nullptr};
+    void * ptr_D_real{nullptr};
+    void * ptr_D_imag{nullptr};
+    /// Batch strides of the real planes of A, B and C
+    int64_t batch_stride_A{0};
+    int64_t batch_stride_B{0};
+    int64_t batch_stride_C{0};
+    /// Batch strides of the imaginary planes of A, B, C and D
+    int64_t batch_stride_A_imag{0};
+    int64_t batch_stride_B_imag{0};
+    int64_t batch_stride_C_imag{0};
+    int64_t batch_stride_D_imag{0};
+
+    //
+    // Host dispatch API
+    //
+
+    /// Default constructor
+    Params() = default;
+
+    /// Constructor: forwards args to the base class, then copies strides, pointers and batch strides
+    Params(
+      Arguments const &args,  /// GEMM application arguments
+      int device_sms,         /// Number of SMs on the device
+      int sm_occupancy)       /// Kernel SM occupancy (in thread blocks)
+    :
+      ParamsBase(args, device_sms, sm_occupancy),
+      params_A_real(args.lda_real),
+      params_A_imag(args.lda_imag),
+      params_B_real(args.ldb_real),
+      params_B_imag(args.ldb_imag),
+      params_C_real(args.ldc_real),
+      params_C_imag(args.ldc_imag),
+      params_D_real(args.ldd_real),
+      params_D_imag(args.ldd_imag),
+      output_op(args.epilogue),
+      ptr_A_real(const_cast<void *>(args.ptr_A_real)),
+      ptr_A_imag(const_cast<void *>(args.ptr_A_imag)),
+      ptr_B_real(const_cast<void *>(args.ptr_B_real)),
+      ptr_B_imag(const_cast<void *>(args.ptr_B_imag)),
+      ptr_C_real(const_cast<void *>(args.ptr_C_real)),
+      ptr_C_imag(const_cast<void *>(args.ptr_C_imag)),
+      ptr_D_real(args.ptr_D_real),
+      ptr_D_imag(args.ptr_D_imag),
+      batch_stride_A(args.batch_stride_A),
+      batch_stride_B(args.batch_stride_B),
+      batch_stride_C(args.batch_stride_C),
+      batch_stride_A_imag(args.batch_stride_A_imag),
+      batch_stride_B_imag(args.batch_stride_B_imag),
+      batch_stride_C_imag(args.batch_stride_C_imag),
+      batch_stride_D_imag(args.batch_stride_D_imag)
+    {}
+
+    /// Returns the workspace size (in bytes) needed for this problem geometry
+    size_t get_workspace_size() const
+    {
+      size_t workspace_bytes = ParamsBase::get_workspace_size();
+      if (this->mode == GemmUniversalMode::kGemmSplitKParallel)
+      {
+        // Double the size returned by the base class because we need to
+        // accumulate two ElementC components
+        workspace_bytes *= 2;
+      }
+
+      return workspace_bytes;
+    }
+
+    /// Lightweight update: refreshes operand pointers, batch strides and epilogue parameters only.
+    void update(Arguments const &args)
+    {
+      ptr_A_real = const_cast<void *>(args.ptr_A_real);
+      ptr_A_imag = const_cast<void *>(args.ptr_A_imag);
+
+      ptr_B_real = const_cast<void *>(args.ptr_B_real);
+      ptr_B_imag = const_cast<void *>(args.ptr_B_imag);
+
+      ptr_C_real = const_cast<void *>(args.ptr_C_real);
+      ptr_C_imag = const_cast<void *>(args.ptr_C_imag);
+
+      ptr_D_real = const_cast<void *>(args.ptr_D_real);
+      ptr_D_imag = const_cast<void *>(args.ptr_D_imag);
+
+      batch_stride_A = args.batch_stride_A;
+      batch_stride_B = args.batch_stride_B;
+      batch_stride_C = args.batch_stride_C;
+      this->batch_stride_D = args.batch_stride_D;
+      batch_stride_A_imag = args.batch_stride_A_imag;
+      batch_stride_B_imag = args.batch_stride_B_imag;
+      batch_stride_C_imag = args.batch_stride_C_imag;
+      batch_stride_D_imag = args.batch_stride_D_imag;
+      // The iterator parameters (params_*_real / params_*_imag) are not rebuilt here.
+      output_op = args.epilogue;
+    }
+  };
+
+
+  /// Shared memory storage structure; a union, so the main loop and the epilogue storage overlap
+  union SharedStorage {
+    typename Mma::SharedStorage main_loop;
+    typename Epilogue::SharedStorage epilogue;
+  };
+
+public:
+
+  //
+  // Host dispatch API
+  //
+
+  /// Reports whether the problem extents satisfy the access alignment of A, B and C
+  static Status can_implement(Arguments const &args)
+  {
+    // An operand is misaligned when its contiguous extent (columns for row-major,
+    // rows for column-major) is not a multiple of the elements per access.
+    // Any other layout is never reported as misaligned.
+    auto misaligned = [](bool row_major, bool column_major, int rows, int columns, int alignment) -> bool {
+      if (row_major) {
+        return columns % alignment != 0;
+      }
+      if (column_major) {
+        return rows % alignment != 0;
+      }
+      return false;
+    };
+
+    auto const &extent = args.problem_size;
+
+    bool const a_misaligned = misaligned(
+      platform::is_same<LayoutA, layout::RowMajor>::value,
+      platform::is_same<LayoutA, layout::ColumnMajor>::value,
+      extent.m(), extent.k(), Mma::IteratorA::AccessType::kElements);
+
+    bool const b_misaligned = misaligned(
+      platform::is_same<LayoutB, layout::RowMajor>::value,
+      platform::is_same<LayoutB, layout::ColumnMajor>::value,
+      extent.k(), extent.n(), Mma::IteratorB::AccessType::kElements);
+
+    bool const c_misaligned = misaligned(
+      platform::is_same<LayoutC, layout::RowMajor>::value,
+      platform::is_same<LayoutC, layout::ColumnMajor>::value,
+      extent.m(), extent.n(), Epilogue::OutputTileIterator::kElementsPerAccess);
+
+    return (a_misaligned || b_misaligned || c_misaligned) ? Status::kErrorMisalignedOperand : Status::kSuccess;
+  }
+
+public:
+
+  //
+  // Device-only API
+  //
+
+  /// Factory invocation: creates a kernel object and executes one GEMM with it
+  CUTLASS_DEVICE
+  static void invoke(
+    Params const &params,
+    SharedStorage &shared_storage)
+  {
+    // The kernel struct declares no data members, so a temporary instance is enough.
+    GemmPlanarComplex{}(params, shared_storage);
+  }
+
+
+  /// Executes one GEMM: runs the planar-complex main loop for this threadblock's tile, then the epilogue
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Compute threadblock location
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if CTA is out of range
+    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
+      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
+
+      return;
+    }
+    // K range [offset_k, problem_size_k) covered by this threadblock; the whole K extent unless narrowed below
+    int offset_k = 0;
+    int problem_size_k = params.problem_size.k();
+    // Typed base pointers of the A and B planes; adjusted below in batched and array modes
+    ElementA *ptr_A_real = static_cast<ElementA *>(params.ptr_A_real);
+    ElementA *ptr_A_imag = static_cast<ElementA *>(params.ptr_A_imag);
+
+    ElementB *ptr_B_real = static_cast<ElementB *>(params.ptr_B_real);
+    ElementB *ptr_B_imag = static_cast<ElementB *>(params.ptr_B_imag);
+
+    //
+    // Fetch pointers based on mode. Split-K modes narrow the K range; batched and array modes
+    // use the k component of the tile offset as the batch index.
+    if (params.mode == GemmUniversalMode::kGemm || 
+      params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+      // Every K partition but the last ends where the next one begins; the last keeps the full K extent
+      if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {
+
+        problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size; 
+      }
+
+      offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
+    }
+    else if (params.mode == GemmUniversalMode::kBatched) {
+      ptr_A_real += int64_t(threadblock_tile_offset.k()) * params.batch_stride_A;
+      ptr_A_imag += int64_t(threadblock_tile_offset.k()) * params.batch_stride_A_imag;
+      ptr_B_real += int64_t(threadblock_tile_offset.k()) * params.batch_stride_B;
+      ptr_B_imag += int64_t(threadblock_tile_offset.k()) * params.batch_stride_B_imag;
+    }
+    else if (params.mode == GemmUniversalMode::kArray) {
+      ptr_A_real = static_cast<ElementA * const *>(params.ptr_A_real)[threadblock_tile_offset.k()];
+      ptr_A_imag = static_cast<ElementA * const *>(params.ptr_A_imag)[threadblock_tile_offset.k()];
+      ptr_B_real = static_cast<ElementB * const *>(params.ptr_B_real)[threadblock_tile_offset.k()];
+      ptr_B_imag = static_cast<ElementB * const *>(params.ptr_B_imag)[threadblock_tile_offset.k()];
+    }
+
+    __syncthreads();
+
+    // Compute initial location in logical coordinates
+    cutlass::MatrixCoord tb_offset_A{
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      offset_k,
+    };
+
+    cutlass::MatrixCoord tb_offset_B{
+      offset_k,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    };
+
+
+    // Compute position within threadblock
+    int thread_idx = threadIdx.x;
+
+    // Construct iterators to the real and imaginary planes of A and B; both planes share extent and offset
+    typename Mma::IteratorA iterator_A_real(
+      params.params_A_real,
+      ptr_A_real,
+      {params.problem_size.m(), problem_size_k},
+      thread_idx,
+      tb_offset_A);
+
+    typename Mma::IteratorA iterator_A_imag(
+      params.params_A_imag,
+      ptr_A_imag,
+      {params.problem_size.m(), problem_size_k},
+      thread_idx,
+      tb_offset_A);
+
+    typename Mma::IteratorB iterator_B_real(
+      params.params_B_real,
+      ptr_B_real,
+      {problem_size_k, params.problem_size.n()},
+      thread_idx,
+      tb_offset_B);
+
+    typename Mma::IteratorB iterator_B_imag(
+      params.params_B_imag,
+      ptr_B_imag,
+      {problem_size_k, params.problem_size.n()},
+      thread_idx,
+      tb_offset_B);
+
+    // Broadcast the warp_id computed by lane 0 to ensure dependent code
+    // is compiled as warp-uniform.
+    int warp_idx = canonical_warp_idx_sync();
+
+    int lane_idx = threadIdx.x % 32;
+
+    //
+    // Main loop
+    //
+
+    // Construct threadblock-scoped matrix multiply
+    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
+
+    typename Mma::FragmentC accumulators;
+
+    accumulators.clear();
+
+    // Number of main-loop iterations: this threadblock's K extent in units of Mma::Shape::kK, rounded up
+    int gemm_k_iterations = (problem_size_k - offset_k + Mma::Shape::kK - 1) / Mma::Shape::kK;
+
+    // Compute threadblock-scoped matrix multiply-add
+    mma(
+      gemm_k_iterations, 
+      accumulators, 
+      iterator_A_real,
+      iterator_A_imag,
+      iterator_B_real, 
+      iterator_B_imag, 
+      accumulators);
+
+    //
+    // Epilogue
+    //
+
+    EpilogueOutputOp output_op(params.output_op);
+
+    //
+    // Masked tile iterators constructed from members
+    //
+
+    threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // assume identity swizzle
+    MatrixCoord threadblock_offset(
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    );
+    // Linear index of this output tile; it selects the tile's semaphore below
+    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
+
+    ElementC *ptr_C_real = static_cast<ElementC *>(params.ptr_C_real);
+    ElementC *ptr_C_imag = static_cast<ElementC *>(params.ptr_C_imag);
+    ElementC *ptr_D_real = static_cast<ElementC *>(params.ptr_D_real);
+    ElementC *ptr_D_imag = static_cast<ElementC *>(params.ptr_D_imag);
+
+    //
+    // Fetch pointers based on mode.
+    //
+    
+    // Construct the semaphore.
+    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
+
+    if (params.mode == GemmUniversalMode::kGemm) {
+
+      // If performing a reduction via split-K, fetch the initial synchronization
+      if (params.grid_tiled_shape.k() > 1) {
+        
+        // Fetch the synchronization lock initially but do not block.
+        semaphore.fetch();
+
+        // Indicate which position in a serial reduction the output operator is currently updating
+        output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
+      }
+    }
+    else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+      ptr_D_real += threadblock_tile_offset.k() * params.batch_stride_D;
+      ptr_D_imag += threadblock_tile_offset.k() * params.batch_stride_D_imag;
+    }
+    else if (params.mode == GemmUniversalMode::kBatched) {
+      ptr_C_real += int64_t(threadblock_tile_offset.k()) * params.batch_stride_C;
+      ptr_C_imag += int64_t(threadblock_tile_offset.k()) * params.batch_stride_C_imag;
+      ptr_D_real += int64_t(threadblock_tile_offset.k()) * params.batch_stride_D;
+      ptr_D_imag += int64_t(threadblock_tile_offset.k()) * params.batch_stride_D_imag;
+    }
+    else if (params.mode == GemmUniversalMode::kArray) {
+      ptr_C_real = static_cast<ElementC * const *>(params.ptr_C_real)[threadblock_tile_offset.k()];
+      ptr_C_imag = static_cast<ElementC * const *>(params.ptr_C_imag)[threadblock_tile_offset.k()];
+      ptr_D_real = static_cast<ElementC * const *>(params.ptr_D_real)[threadblock_tile_offset.k()];
+      ptr_D_imag = static_cast<ElementC * const *>(params.ptr_D_imag)[threadblock_tile_offset.k()];
+    }
+
+    // Tile iterator loading from source tensor.
+    typename Epilogue::OutputTileIterator iterator_C_real(
+      params.params_C_real,
+      ptr_C_real,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset
+    );
+
+    typename Epilogue::OutputTileIterator iterator_C_imag(
+      params.params_C_imag,
+      ptr_C_imag,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset
+    );
+
+    // Tile iterator writing to destination tensor.
+    typename Epilogue::OutputTileIterator iterator_D_real(
+      params.params_D_real,
+      ptr_D_real,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset
+    );
+
+    typename Epilogue::OutputTileIterator iterator_D_imag(
+      params.params_D_imag,
+      ptr_D_imag,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset
+    );
+
+    //
+    // Construct epilogue
+    //
+
+    Epilogue epilogue(
+      shared_storage.epilogue, 
+      thread_idx, 
+      warp_idx, 
+      lane_idx);
+
+    // Wait on the semaphore - this latency may have been covered by iterator construction
+    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
+        
+      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
+      if (threadblock_tile_offset.k()) {
+        iterator_C_real = iterator_D_real;
+        iterator_C_imag = iterator_D_imag;
+      }
+
+      semaphore.wait(threadblock_tile_offset.k());
+
+      __threadfence();
+    }
+
+
+    // Execute the epilogue operator to update the destination tensor.
+    epilogue(
+      output_op, 
+      iterator_D_real, 
+      iterator_D_imag, 
+      accumulators, 
+      iterator_C_real,
+      iterator_C_imag); 
+    
+    //
+    // Release the semaphore
+    //
+
+    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) { 
+
+      int lock = 0;
+      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
+
+        // The final threadblock resets the semaphore for subsequent grids.
+        lock = 0;
+      }
+      else {
+        // Otherwise, the semaphore is incremented
+        lock = threadblock_tile_offset.k() + 1;
+      }
+      
+      semaphore.release(lock);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_planar_complex_array.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_planar_complex_array.h
new file mode 100755
index 000000000..0c21fb8d8
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_planar_complex_array.h
@@ -0,0 +1,609 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Kernel-level planar-complex GEMM whose operand planes are passed as arrays of pointers.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/complex.h"
+#include "cutlass/semaphore.h"
+#include "cutlass/gemm/kernel/params_universal_base.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate 
+  typename Epilogue_,             ///! Epilogue
+  typename ThreadblockSwizzle_    ///! Threadblock swizzling function
+>
+struct GemmPlanarComplexArray {
+public:
+
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using EpilogueOutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+
+  using ElementA = typename Mma::IteratorA::Element;
+  using LayoutA = typename Mma::IteratorA::Layout;
+  using ElementB = typename Mma::IteratorB::Element;
+  using LayoutB = typename Mma::IteratorB::Layout;
+  using ElementC = typename Epilogue::OutputTileIterator::Element;
+  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
+  using Operator = typename Mma::Operator;
+  using ArchTag = typename Mma::ArchTag;
+
+  static ComplexTransform const kTransformA = Mma::kTransformA;
+  static ComplexTransform const kTransformB = Mma::kTransformB;
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;
+
+  /// Split-K preserves splits that are 128b aligned
+  static int const kSplitKAlignment = const_max(
+    128 / sizeof_bits<ElementA>::value, 
+    128 / sizeof_bits<ElementB>::value);
+
+  //
+  // Additional types needed for reflection
+  //
+
+  using ElementAccumulator = typename Mma::Policy::Operator::ElementC;
+  using OperatorClass = typename Mma::Operator::OperatorClass;
+  using ThreadblockShape = typename Mma::Shape;
+  using WarpShape = typename Mma::Operator::Shape;
+  using InstructionShape = typename Mma::Policy::Operator::Shape;
+
+  static int const kStages = Mma::kStages;
+    
+  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
+  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
+  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+
+  //
+  // Arguments structure
+  //
+
+  /// Argument structure; problem_size and batch_count are held by UniversalArgumentsBase
+  struct Arguments : UniversalArgumentsBase
+  {
+    //
+    // Data members
+    //
+    /// Parameters of the epilogue output operator
+    typename EpilogueOutputOp::Params epilogue{};
+    // NOTE(review): presumably optional per-batch values of M, N and K - confirm against the kernel body
+    int const *ptr_M{nullptr};
+    int const *ptr_N{nullptr};
+    int const *ptr_K{nullptr};
+    /// Arrays of pointers to the real and imaginary planes of A, B, C (const) and D (mutable)
+    void const * const * ptr_A_real{nullptr};
+    void const * const * ptr_A_imag{nullptr};
+
+    void const * const * ptr_B_real{nullptr};
+    void const * const * ptr_B_imag{nullptr};
+
+    void const * const * ptr_C_real{nullptr};
+    void const * const * ptr_C_imag{nullptr};
+
+    void * const * ptr_D_real{nullptr};
+    void * const * ptr_D_imag{nullptr};
+    /// Strides of each plane
+    typename LayoutA::Stride::Index lda_real{};
+    typename LayoutA::Stride::Index lda_imag{};
+    typename LayoutB::Stride::Index ldb_real{};
+    typename LayoutB::Stride::Index ldb_imag{};
+    typename LayoutC::Stride::Index ldc_real{};
+    typename LayoutC::Stride::Index ldc_imag{};
+    typename LayoutC::Stride::Index ldd_real{};
+    typename LayoutC::Stride::Index ldd_imag{};
+
+    //
+    // Methods
+    //
+
+    Arguments() = default;
+    // The base class is constructed first, so its initializer must not read the base's own mode or batch_stride_D.
+    /// Constructs an arguments structure in array mode; the base-class batch_stride_D is set to zero
+    Arguments(
+      GemmCoord problem_size,
+      int batch_count,
+      typename EpilogueOutputOp::Params epilogue,
+      int const *ptr_M,
+      int const *ptr_N,
+      int const *ptr_K,
+      void const * const * ptr_A_real,
+      void const * const * ptr_A_imag,
+      void const * const * ptr_B_real,
+      void const * const * ptr_B_imag,
+      void const * const * ptr_C_real,
+      void const * const * ptr_C_imag,
+      void * const * ptr_D_real,
+      void * const * ptr_D_imag,
+      typename LayoutA::Stride::Index lda_real,
+      typename LayoutA::Stride::Index lda_imag,
+      typename LayoutB::Stride::Index ldb_real,
+      typename LayoutB::Stride::Index ldb_imag,
+      typename LayoutC::Stride::Index ldc_real,
+      typename LayoutC::Stride::Index ldc_imag,
+      typename LayoutC::Stride::Index ldd_real,
+      typename LayoutC::Stride::Index ldd_imag)
+    :
+      UniversalArgumentsBase(GemmUniversalMode::kArray, problem_size, batch_count, 0),
+      epilogue(epilogue),
+      ptr_M(ptr_M),
+      ptr_N(ptr_N),
+      ptr_K(ptr_K),
+      ptr_A_real(ptr_A_real), 
+      ptr_A_imag(ptr_A_imag), 
+      ptr_B_real(ptr_B_real),
+      ptr_B_imag(ptr_B_imag),
+      ptr_C_real(ptr_C_real),
+      ptr_C_imag(ptr_C_imag),
+      ptr_D_real(ptr_D_real), 
+      ptr_D_imag(ptr_D_imag), 
+      lda_real(lda_real),
+      lda_imag(lda_imag),
+      ldb_real(ldb_real),
+      ldb_imag(ldb_imag),
+      ldc_real(ldc_real),
+      ldc_imag(ldc_imag),
+      ldd_real(ldd_real),
+      ldd_imag(ldd_imag)
+    {}
+
+    /// Returns arguments for the transposed problem: M is exchanged with N and operand A with operand B
+    Arguments transposed_problem() const {
+      Arguments args(*this);
+      // Extents, ptr_M/ptr_N and the A/B pointer arrays and strides are swapped; C and D are copied as-is
+      std::swap(args.problem_size.m(), args.problem_size.n());
+      std::swap(args.ptr_M, args.ptr_N);
+      std::swap(args.ptr_A_real, args.ptr_B_real);
+      std::swap(args.ptr_A_imag, args.ptr_B_imag);
+      std::swap(args.lda_real, args.ldb_real);
+      std::swap(args.lda_imag, args.ldb_imag);
+
+      return args;
+    }
+  };
+
+
+  //
+  // Structure for precomputing values in host memory and passing to kernels
+  //
+
+  /// Kernel parameters: values derived on the host from Arguments and read by the device code
+  struct Params : UniversalParamsBase<
+    ThreadblockSwizzle,
+    ThreadblockShape,
+    ElementA,
+    ElementB,
+    ElementC,
+    LayoutA,
+    LayoutB>
+  {
+    using ParamsBase = UniversalParamsBase<
+      ThreadblockSwizzle,
+      ThreadblockShape,
+      ElementA,
+      ElementB,
+      ElementC,
+      LayoutA,
+      LayoutB>;
+
+    //
+    // Data members (each one carries a default initializer)
+    //
+
+    typename Mma::IteratorA::Params params_A_real{};
+    typename Mma::IteratorA::Params params_A_imag{};
+    typename Mma::IteratorB::Params params_B_real{};
+    typename Mma::IteratorB::Params params_B_imag{};
+    typename Epilogue::OutputTileIterator::Params params_C_real{};
+    typename Epilogue::OutputTileIterator::Params params_C_imag{};
+    typename Epilogue::OutputTileIterator::Params params_D_real{};
+    typename Epilogue::OutputTileIterator::Params params_D_imag{};
+
+    typename EpilogueOutputOp::Params output_op{};
+
+    int const *ptr_M{nullptr};
+    int const *ptr_N{nullptr};
+    int const *ptr_K{nullptr};
+
+    void const * const * ptr_A_real{nullptr};
+    void const * const * ptr_A_imag{nullptr};
+    void const * const * ptr_B_real{nullptr};
+    void const * const * ptr_B_imag{nullptr};
+    void const * const * ptr_C_real{nullptr};
+    void const * const * ptr_C_imag{nullptr};
+    void * const * ptr_D_real{nullptr};
+    void * const * ptr_D_imag{nullptr};
+
+    //
+    // Host dispatch API
+    //
+
+    /// Default constructor: every member takes its default initializer
+    Params() = default;
+
+    /// Builds the parameters from host arguments; initializers appear in declaration order
+    Params(
+      Arguments const &args,  /// GEMM application arguments
+      int device_sms,         /// Number of SMs on the device
+      int sm_occupancy)       /// Kernel SM occupancy (in thread blocks)
+    :
+      ParamsBase(args, device_sms, sm_occupancy),
+      params_A_real(args.lda_real),
+      params_A_imag(args.lda_imag),
+      params_B_real(args.ldb_real),
+      params_B_imag(args.ldb_imag),
+      params_C_real(args.ldc_real),
+      params_C_imag(args.ldc_imag),
+      params_D_real(args.ldd_real),
+      params_D_imag(args.ldd_imag),
+      output_op(args.epilogue),
+      ptr_M(args.ptr_M),
+      ptr_N(args.ptr_N),
+      ptr_K(args.ptr_K),
+      ptr_A_real(args.ptr_A_real),
+      ptr_A_imag(args.ptr_A_imag),
+      ptr_B_real(args.ptr_B_real),
+      ptr_B_imag(args.ptr_B_imag),
+      ptr_C_real(args.ptr_C_real),
+      ptr_C_imag(args.ptr_C_imag),
+      ptr_D_real(args.ptr_D_real),
+      ptr_D_imag(args.ptr_D_imag)
+    {}
+
+    /// Refreshes the epilogue parameters and all pointer tables; iterator parameters are kept.
+    void update(Arguments const &args)
+    {
+      output_op = args.epilogue;
+
+      ptr_M = args.ptr_M;
+      ptr_N = args.ptr_N;
+      ptr_K = args.ptr_K;
+
+      ptr_A_real = args.ptr_A_real;
+      ptr_A_imag = args.ptr_A_imag;
+
+      ptr_B_real = args.ptr_B_real;
+      ptr_B_imag = args.ptr_B_imag;
+
+      ptr_C_real = args.ptr_C_real;
+      ptr_C_imag = args.ptr_C_imag;
+
+      ptr_D_real = args.ptr_D_real;
+      ptr_D_imag = args.ptr_D_imag;
+    }
+  };
+
+
+  /// Shared memory storage; being a union, the main loop and the epilogue occupy the same bytes
+  union SharedStorage {
+    typename Mma::SharedStorage main_loop;      // storage type declared by Mma
+    typename Epilogue::SharedStorage epilogue;  // storage type declared by Epilogue
+  };
+
+public:
+
+  //
+  // Host dispatch API
+  //
+
+  /// Determines whether the problem extents satisfy the kernel's alignment requirements.
+  /// Only RowMajor and ColumnMajor layouts are examined; other layouts are never flagged.
+  static Status can_implement(Arguments const &args) {
+    static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
+    static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
+    static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+
+    int const extent_m = args.problem_size.m();
+    int const extent_n = args.problem_size.n();
+    int const extent_k = args.problem_size.k();
+
+    // Remainders stay zero (aligned) for any layout that is not examined below.
+    int remainder_A = 0;
+    int remainder_B = 0;
+    int remainder_C = 0;
+
+    if (platform::is_same<LayoutA, layout::RowMajor>::value) {
+      remainder_A = extent_k % kAlignmentA;
+    } else if (platform::is_same<LayoutA, layout::ColumnMajor>::value) {
+      remainder_A = extent_m % kAlignmentA;
+    }
+    if (platform::is_same<LayoutB, layout::RowMajor>::value) {
+      remainder_B = extent_n % kAlignmentB;
+    } else if (platform::is_same<LayoutB, layout::ColumnMajor>::value) {
+      remainder_B = extent_k % kAlignmentB;
+    }
+    if (platform::is_same<LayoutC, layout::RowMajor>::value) {
+      remainder_C = extent_n % kAlignmentC;
+    } else if (platform::is_same<LayoutC, layout::ColumnMajor>::value) {
+      remainder_C = extent_m % kAlignmentC;
+    }
+
+    bool const all_aligned = (remainder_A == 0) && (remainder_B == 0) && (remainder_C == 0);
+    return all_aligned ? Status::kSuccess : Status::kErrorMisalignedOperand;
+  }
+
+
+public:
+
+  //
+  // Device-only API
+  //
+
+  /// Static entry point: default-constructs a kernel object and forwards both arguments to it.
+  CUTLASS_DEVICE
+  static void invoke(
+    Params const &params,
+    SharedStorage &shared_storage)
+  {
+    GemmPlanarComplexArray kernel;   // used only for its call operator
+    kernel(params, shared_storage);
+  }
+
+
+  /// Executes one GEMM: the threadblock walks output tiles of the batch entry chosen by its k tile offset
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Compute threadblock location
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if CTA is out of range
+    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
+      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
+
+      return;
+    }
+
+    int batch_idx = threadblock_tile_offset.k();
+    // Default extents; replaced further down when per-batch extent tables are supplied
+    int problem_size_m = params.problem_size.m();
+    int problem_size_n = params.problem_size.n();
+    int problem_size_k = params.problem_size.k();
+    // Operand base pointers for this batch entry, read from the pointer tables at batch_idx
+    ElementA *ptr_A_real = static_cast<ElementA *>(const_cast<void *>(params.ptr_A_real[batch_idx]));
+    ElementA *ptr_A_imag = static_cast<ElementA *>(const_cast<void *>(params.ptr_A_imag[batch_idx]));
+
+    ElementB *ptr_B_real = static_cast<ElementB *>(const_cast<void *>(params.ptr_B_real[batch_idx]));
+    ElementB *ptr_B_imag = static_cast<ElementB *>(const_cast<void *>(params.ptr_B_imag[batch_idx]));
+
+    //
+    // If pointers for problem sizes are specified, these are loaded from global memory
+    //
+
+    if (params.ptr_M) {
+      problem_size_m = params.ptr_M[batch_idx];
+    }
+
+    if (params.ptr_N) {
+      problem_size_n = params.ptr_N[batch_idx];
+    }
+
+    if (params.ptr_K) {
+      problem_size_k = params.ptr_K[batch_idx];
+    }
+    // Tile counts along m and n and the iteration count along k, each rounded up
+    int const kBlockCountM = (problem_size_m + Mma::Shape::kM - 1) / Mma::Shape::kM;
+    int const kBlockCountN = (problem_size_n + Mma::Shape::kN - 1) / Mma::Shape::kN;
+        
+    int const kGemmKIterations = (problem_size_k + Mma::Shape::kK - 1) / Mma::Shape::kK;
+
+    //
+    // Each threadblock loops over the logical problem size which the kernel may have discovered
+    // after the grid is launched.
+    //
+
+    CUTLASS_PRAGMA_NO_UNROLL
+    for (int block_m = threadblock_tile_offset.m(); 
+      block_m < kBlockCountM; 
+      block_m += params.grid_tiled_shape.m()) {
+
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (int block_n = threadblock_tile_offset.n(); 
+        block_n < kBlockCountN; 
+        block_n += params.grid_tiled_shape.n()) {
+
+        //
+        // Compute indices within threadblock and warp.
+        //
+        int thread_idx = threadIdx.x;
+
+        // Broadcast the warp_id computed by lane 0 to ensure dependent code
+        // is compiled as warp-uniform.
+        int warp_idx = canonical_warp_idx_sync();
+        int lane_idx = threadIdx.x % 32;
+    
+        //
+        // Proceed with regular GEMM logic.
+        //
+
+        // Compute initial location in logical coordinates
+        cutlass::MatrixCoord tb_offset_A{ block_m * Mma::Shape::kM, 0};
+        cutlass::MatrixCoord tb_offset_B{ 0, block_n * Mma::Shape::kN };
+
+        // Construct iterators to A and B operands
+        typename Mma::IteratorA iterator_A_real(
+          params.params_A_real,
+          ptr_A_real,
+          {problem_size_m, problem_size_k},
+          thread_idx,
+          tb_offset_A);
+
+        typename Mma::IteratorA iterator_A_imag(
+          params.params_A_imag,
+          ptr_A_imag,
+          {problem_size_m, problem_size_k},
+          thread_idx,
+          tb_offset_A);
+
+        typename Mma::IteratorB iterator_B_real(
+          params.params_B_real,
+          ptr_B_real,
+          {problem_size_k, problem_size_n},
+          thread_idx,
+          tb_offset_B);
+  
+        typename Mma::IteratorB iterator_B_imag(
+          params.params_B_imag,
+          ptr_B_imag,
+          {problem_size_k, problem_size_n},
+          thread_idx,
+          tb_offset_B);
+
+        //
+        // Main loop
+        //
+
+        // Construct thread-scoped matrix multiply
+        Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
+
+        typename Mma::FragmentC accumulators;
+        // Accumulators start from zero for every output tile
+        accumulators.clear();
+
+        // Compute threadblock-scoped matrix multiply-add
+        mma(
+          kGemmKIterations, 
+          accumulators, 
+          iterator_A_real,
+          iterator_A_imag,
+          iterator_B_real, 
+          iterator_B_imag, 
+          accumulators);
+
+        //
+        // Epilogue
+        //
+
+        EpilogueOutputOp output_op(params.output_op);
+
+        //
+        // Masked tile iterators constructed from members
+        //
+
+        // assume identity swizzle
+        MatrixCoord threadblock_offset(
+          block_m * Mma::Shape::kM,
+          block_n * Mma::Shape::kN
+        );
+
+        ElementC *ptr_C_real = static_cast<ElementC *>(const_cast<void *>(params.ptr_C_real[batch_idx]));
+        ElementC *ptr_C_imag = static_cast<ElementC *>(const_cast<void *>(params.ptr_C_imag[batch_idx]));
+        ElementC *ptr_D_real = static_cast<ElementC *>(params.ptr_D_real[batch_idx]);
+        ElementC *ptr_D_imag = static_cast<ElementC *>(params.ptr_D_imag[batch_idx]);
+
+        // Tile iterator loading from source tensor.
+        typename Epilogue::OutputTileIterator iterator_C_real(
+          params.params_C_real,
+          ptr_C_real,
+          {problem_size_m, problem_size_n},
+          thread_idx,
+          threadblock_offset
+        );
+
+        typename Epilogue::OutputTileIterator iterator_C_imag(
+          params.params_C_imag,
+          ptr_C_imag,
+          {problem_size_m, problem_size_n},
+          thread_idx,
+          threadblock_offset
+        );
+
+        // Tile iterator writing to destination tensor.
+        typename Epilogue::OutputTileIterator iterator_D_real(
+          params.params_D_real,
+          ptr_D_real,
+          {problem_size_m, problem_size_n},
+          thread_idx,
+          threadblock_offset
+        );
+
+        typename Epilogue::OutputTileIterator iterator_D_imag(
+          params.params_D_imag,
+          ptr_D_imag,
+          {problem_size_m, problem_size_n},
+          thread_idx,
+          threadblock_offset
+        );
+
+        //
+        // Construct epilogue
+        //
+
+        Epilogue epilogue(
+          shared_storage.epilogue, 
+          thread_idx, 
+          warp_idx, 
+          lane_idx);
+
+        // Execute the epilogue operator to update the destination tensor.
+        epilogue(
+          output_op, 
+          iterator_D_real, 
+          iterator_D_imag, 
+          accumulators, 
+          iterator_C_real,
+          iterator_C_imag); 
+        // NOTE(review): main_loop and epilogue alias in the SharedStorage union and no barrier is
+        // visible here before the next tile's main loop; confirm Mma/Epilogue synchronize internally.
+      } // for block_n
+    } // for block_m
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_sparse_universal.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_sparse_universal.h
new file mode 100755
index 000000000..c5420c72d
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_sparse_universal.h
@@ -0,0 +1,804 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Sparse universal GEMM kernel (GemmSparseUniversal) with its argument and parameter structures.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/arch/arch.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/complex.h"
+#include "cutlass/semaphore.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/kernel/params_universal_base.h"
+
+#include "cutlass/trace.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+namespace detail {
+
+/// Host-side arguments shared by sparse universal GEMM kernels: pointers, batch strides and
+/// leading dimensions for operands A, B, C, D and E, on top of UniversalArgumentsBase.
+template <typename LayoutA, typename LayoutB, typename LayoutC, typename LayoutE>
+struct SparseUniversalArgumentsBase : UniversalArgumentsBase {
+  //
+  // Data members
+  //
+
+  void const * ptr_A;
+  void const * ptr_B;
+  void const * ptr_C;
+  void * ptr_D;
+  void const * ptr_E;
+
+  int64_t batch_stride_A;
+  int64_t batch_stride_B;
+  int64_t batch_stride_C;
+  int64_t batch_stride_E;
+
+  typename LayoutA::Stride::LongIndex lda;
+  typename LayoutB::Stride::LongIndex ldb;
+  typename LayoutC::Stride::LongIndex ldc;
+  typename LayoutC::Stride::LongIndex ldd;
+  typename LayoutE::Stride::LongIndex lde;
+
+  //
+  // Methods
+  //
+
+  /// Default constructor: null pointers, zero batch strides and zero leading dimensions
+  SparseUniversalArgumentsBase():
+    ptr_A(nullptr), ptr_B(nullptr), ptr_C(nullptr), ptr_D(nullptr), ptr_E(nullptr),
+    batch_stride_A(0), batch_stride_B(0), batch_stride_C(0), batch_stride_E(0),
+    lda(0), ldb(0), ldc(0), ldd(0), lde(0)
+  {}
+
+  /// constructs an arguments structure
+  SparseUniversalArgumentsBase(
+    GemmUniversalMode mode,
+    GemmCoord problem_size,
+    int batch_count,
+    void const * ptr_A,
+    void const * ptr_B,
+    void const * ptr_C,
+    void * ptr_D,
+    void const * ptr_E,
+    int64_t batch_stride_A,
+    int64_t batch_stride_B,
+    int64_t batch_stride_C,
+    int64_t batch_stride_D,
+    int64_t batch_stride_E,
+    typename LayoutA::Stride::LongIndex lda,
+    typename LayoutB::Stride::LongIndex ldb,
+    typename LayoutC::Stride::LongIndex ldc,
+    typename LayoutC::Stride::LongIndex ldd,
+    typename LayoutE::Stride::LongIndex lde)   // typed after LayoutE, like the lde member
+  :
+    UniversalArgumentsBase(mode, problem_size, batch_count, batch_stride_D),
+    ptr_A(ptr_A), ptr_B(ptr_B), ptr_C(ptr_C), ptr_D(ptr_D), ptr_E(ptr_E),
+    batch_stride_A(batch_stride_A), batch_stride_B(batch_stride_B), batch_stride_C(batch_stride_C),
+    batch_stride_E(batch_stride_E),
+    lda(lda), ldb(ldb), ldc(ldc), ldd(ldd), lde(lde)
+  {
+    CUTLASS_TRACE_HOST("SparseUniversalArgumentsBase::Arguments() - problem_size: " << problem_size);
+  }
+};
+
+/// Kernel parameters shared by sparse universal GEMM kernels: iterator parameters, operand
+/// pointers and batch strides for A, B, C and E (plus the D pointer), built from Arguments.
+///
+/// Every data member has a default initializer, so a default-constructed object holds null
+/// pointers and zero strides rather than indeterminate values.
+/// The batch stride of D is not declared here; update() assigns it through `this->`.
+template <
+  typename Mma, typename Epilogue, typename Arguments,
+  typename ThreadblockSwizzle, typename ThreadblockShape,
+  typename ElementA, typename ElementB, typename ElementC,
+  typename LayoutA, typename LayoutB
+>
+struct SparseUniversalParamsBase : UniversalParamsBase<
+  ThreadblockSwizzle,
+  ThreadblockShape,
+  ElementA,
+  ElementB,
+  ElementC,
+  LayoutA,
+  LayoutB> {
+  using ParamsBase = UniversalParamsBase<
+    ThreadblockSwizzle,
+    ThreadblockShape,
+    ElementA,
+    ElementB,
+    ElementC,
+    LayoutA,
+    LayoutB>;
+
+  //
+  // Data members
+  //
+
+  typename Mma::IteratorA::Params params_A{};
+  typename Mma::IteratorB::Params params_B{};
+  typename Epilogue::OutputTileIterator::Params params_C{};
+  typename Epilogue::OutputTileIterator::Params params_D{};
+  typename Mma::IteratorE::Params params_E{};
+
+  void * ptr_A{nullptr};
+  void * ptr_B{nullptr};
+  void * ptr_C{nullptr};
+  void * ptr_D{nullptr};
+  void * ptr_E{nullptr};
+
+  int64_t batch_stride_A{0};
+  int64_t batch_stride_B{0};
+  int64_t batch_stride_C{0};
+  int64_t batch_stride_E{0};
+
+  //
+  // Host dispatch API
+  //
+
+  /// Default constructor: members take the default initializers declared above
+  SparseUniversalParamsBase() = default;
+
+  /// Constructor: iterator parameters come from the leading dimensions in args
+  SparseUniversalParamsBase(
+    Arguments const &args,  /// GEMM application arguments
+    int device_sms,         /// Number of SMs on the device
+    int sm_occupancy)       /// Kernel SM occupancy (in thread blocks)
+  :
+    ParamsBase(args, device_sms, sm_occupancy),
+    params_A(args.lda),
+    params_B(args.ldb),
+    params_C(args.ldc),
+    params_D(args.ldd),
+    params_E(args.lde),
+    ptr_A(const_cast<void *>(args.ptr_A)),
+    ptr_B(const_cast<void *>(args.ptr_B)),
+    ptr_C(const_cast<void *>(args.ptr_C)),
+    ptr_D(args.ptr_D),
+    ptr_E(const_cast<void *>(args.ptr_E)),
+    batch_stride_A(args.batch_stride_A),
+    batch_stride_B(args.batch_stride_B),
+    batch_stride_C(args.batch_stride_C),
+    batch_stride_E(args.batch_stride_E)
+  {}
+
+  /// Lightweight update: refreshes pointers and batch strides; iterator parameters are kept.
+  void update(Arguments const &args)
+  {
+    CUTLASS_TRACE_HOST("SparseUniversalParamsBase::update()");
+
+    // Update input/output pointers
+    this->ptr_A = const_cast<void *>(args.ptr_A);
+    this->ptr_B = const_cast<void *>(args.ptr_B);
+    this->ptr_C = const_cast<void *>(args.ptr_C);
+    this->ptr_D = args.ptr_D;
+    this->ptr_E = const_cast<void *>(args.ptr_E);
+
+    this->batch_stride_A = args.batch_stride_A;
+    this->batch_stride_B = args.batch_stride_B;
+    this->batch_stride_C = args.batch_stride_C;
+    this->batch_stride_D = args.batch_stride_D;
+    this->batch_stride_E = args.batch_stride_E;
+  }
+};
+
+} // namespace detail
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate
+  typename Epilogue_,             ///! Epilogue
+  typename ThreadblockSwizzle_    ///! Threadblock swizzling function
+>
+class GemmSparseUniversal {
+public:
+
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using EpilogueOutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  // Sparsity constants and operand E types, re-exported from Mma
+  static int const kSparse = Mma::kSparse;
+  static int const kMetaSizeInBits = Mma::kMetaSizeInBits;
+  static int const kMaxID2 = Mma::kMaxID2;
+  static int const kElementsPerElementE = Mma::kElementsPerElementE;
+
+  using ElementE = typename Mma::ElementE;
+  using LayoutE = typename Mma::LayoutE;
+  // Element/layout types: A and B from the Mma iterators, C from the epilogue output iterator
+  using ElementA = typename Mma::IteratorA::Element;
+  using LayoutA = typename Mma::IteratorA::Layout;
+  using ElementB = typename Mma::IteratorB::Element;
+  using LayoutB = typename Mma::IteratorB::Layout;
+  using ElementC = typename Epilogue::OutputTileIterator::Element;
+  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
+
+  static ComplexTransform const kTransformA = Mma::kTransformA;
+  static ComplexTransform const kTransformB = Mma::kTransformB;
+  using Operator = typename Mma::Operator;
+  // Operator class, tile shapes and architecture tag, all forwarded from Mma
+  using OperatorClass = typename Mma::Operator::OperatorClass;
+  using ThreadblockShape = typename Mma::Shape;
+  using WarpShape = typename Mma::Operator::Shape;
+  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
+  using ArchTag = typename Mma::ArchTag;
+
+  static int const kStages = Mma::kStages;
+  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
+  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
+  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;  // 32 threads for each warp counted
+
+  /// Split-K preserves splits that are 128b aligned
+  static int const kSplitKAlignment = const_max(128 / sizeof_bits<ElementA>::value, 128 / sizeof_bits<ElementB>::value);
+
+  //
+  // Structures
+  //
+
+  /// Argument structure: the shared sparse arguments plus the epilogue parameters
+  struct Arguments : detail::SparseUniversalArgumentsBase<
+      LayoutA,
+      LayoutB,
+      LayoutC,
+      LayoutE
+    > {
+    using Base = detail::SparseUniversalArgumentsBase<
+      LayoutA,
+      LayoutB,
+      LayoutC,
+      LayoutE
+    >;
+
+    typename EpilogueOutputOp::Params epilogue;
+
+    Arguments() {}
+
+    /// constructs an arguments structure; lde is typed after LayoutE, matching Base::lde
+    Arguments(
+      GemmUniversalMode mode,
+      GemmCoord problem_size,
+      int batch_count,
+      typename EpilogueOutputOp::Params epilogue,
+      void const * ptr_A,
+      void const * ptr_B,
+      void const * ptr_C,
+      void * ptr_D,
+      void const * ptr_E,
+      int64_t batch_stride_A,
+      int64_t batch_stride_B,
+      int64_t batch_stride_C,
+      int64_t batch_stride_D,
+      int64_t batch_stride_E,
+      typename LayoutA::Stride::LongIndex lda,
+      typename LayoutB::Stride::LongIndex ldb,
+      typename LayoutC::Stride::LongIndex ldc,
+      typename LayoutC::Stride::LongIndex ldd,
+      typename LayoutE::Stride::LongIndex lde)
+    :
+      Base(
+        mode, problem_size, batch_count,
+        ptr_A, ptr_B, ptr_C, ptr_D, ptr_E,
+        batch_stride_A, batch_stride_B, batch_stride_C, batch_stride_D, batch_stride_E,
+        lda, ldb, ldc, ldd, lde
+      ),
+      epilogue(epilogue)
+    {
+      CUTLASS_TRACE_HOST("GemmSparseUniversal::Arguments::Arguments() - problem_size: " << problem_size);
+    }
+  };
+
+
+  //
+  // Structure for precomputing values in host memory and passing to kernels
+  //
+
+  /// Parameters structure: the shared sparse kernel parameters extended with the epilogue
+  /// output-operator parameters. Pointers, batch strides and iterator parameters live in
+  /// the base class.
+  struct Params : detail::SparseUniversalParamsBase<
+    Mma,
+    Epilogue,
+    Arguments,
+    ThreadblockSwizzle,
+    ThreadblockShape,
+    ElementA,
+    ElementB,
+    ElementC,
+    LayoutA,
+    LayoutB>
+  {
+    /// Shorthand for the base class
+    using ParamsBase = detail::SparseUniversalParamsBase<
+      Mma, Epilogue, Arguments,
+      ThreadblockSwizzle, ThreadblockShape,
+      ElementA, ElementB, ElementC,
+      LayoutA, LayoutB>;
+
+    /// Epilogue parameters copied from Arguments::epilogue
+    typename EpilogueOutputOp::Params output_op;
+
+    //
+    // Host dispatch API
+    //
+
+    /// Default constructor
+    Params() = default;
+
+    /// Constructor: the base consumes args, device_sms and sm_occupancy; only the epilogue
+    /// parameters are stored here.
+    Params(
+      Arguments const &args,  /// GEMM application arguments
+      int device_sms,         /// Number of SMs on the device
+      int sm_occupancy)       /// Kernel SM occupancy (in thread blocks)
+    :
+      ParamsBase(args, device_sms, sm_occupancy),
+      output_op(args.epilogue)
+    {}
+
+    /// Lightweight update: refreshes the epilogue parameters, batch strides and pointers.
+    /// Iterator parameters are left as they were.
+    void update(Arguments const &args)
+    {
+      CUTLASS_TRACE_HOST("GemmUniversal::Params::update()");
+
+      // Epilogue parameters first, then batch strides, then operand pointers
+      this->output_op = args.epilogue;
+
+      this->batch_stride_A = args.batch_stride_A;
+      this->batch_stride_B = args.batch_stride_B;
+      this->batch_stride_C = args.batch_stride_C;
+      this->batch_stride_D = args.batch_stride_D;
+      this->batch_stride_E = args.batch_stride_E;
+
+      this->ptr_A = const_cast<void *>(args.ptr_A);
+      this->ptr_B = const_cast<void *>(args.ptr_B);
+      this->ptr_C = const_cast<void *>(args.ptr_C);
+      this->ptr_D = args.ptr_D;
+      this->ptr_E = const_cast<void *>(args.ptr_E);
+    }
+  };
+
+  /// Shared memory storage; being a union, the main loop and the epilogue occupy the same bytes
+  union SharedStorage {
+    typename Mma::SharedStorage main_loop;      // storage type declared by Mma
+    typename Epilogue::SharedStorage epilogue;  // storage type declared by Epilogue
+  };
+
+
+public:
+
+  //
+  // Host dispatch API
+  //
+
+  /// Determines whether kernel satisfies alignment; returns kErrorMisalignedOperand or kSuccess
+  static Status can_implement(
+    cutlass::gemm::GemmCoord const & problem_size,
+    GemmUniversalMode mode,
+    int split_k_count)
+  {
+    CUTLASS_TRACE_HOST("GemmUniversal::can_implement()");
+    // Required alignment: 32 or 64 for the matching interleaved layouts, otherwise the access width
+    static int const kAlignmentA = (cute::is_same<LayoutA,
+                                                      layout::ColumnMajorInterleaved<32>>::value)
+                                   ? 32
+                                   : (cute::is_same<LayoutA,
+                                                        layout::ColumnMajorInterleaved<64>>::value)
+                                     ? 64
+                                     : Mma::IteratorA::AccessType::kElements;
+    static int const kAlignmentB = (cute::is_same<LayoutB,
+                                                      layout::RowMajorInterleaved<32>>::value)
+                                   ? 32
+                                   : (cute::is_same<LayoutB,
+                                                        layout::RowMajorInterleaved<64>>::value)
+                                     ? 64
+                                     : Mma::IteratorB::AccessType::kElements;
+    static int const kAlignmentC = (cute::is_same<LayoutC,
+                                                      layout::ColumnMajorInterleaved<32>>::value)
+                                   ? 32
+                                   : (cute::is_same<LayoutC,
+                                                        layout::ColumnMajorInterleaved<64>>::value)
+                                     ? 64
+                                     : Epilogue::OutputTileIterator::kElementsPerAccess;
+
+    static int const kAlignmentE = Mma::IteratorE::AccessType::kElements;
+
+    bool isAMisaligned = false;
+    bool isBMisaligned = false;
+    bool isCMisaligned = false;
+    bool isEMisaligned = false;
+    // A: the extent tested depends on the layout; k is divided by kSparse before the test
+    if (cute::is_same<LayoutA, layout::RowMajor>::value) {
+      isAMisaligned = (problem_size.k() / kSparse) % kAlignmentA;
+    } else if (cute::is_same<LayoutA, layout::ColumnMajor>::value) {
+      isAMisaligned = problem_size.m() % kAlignmentA;
+    } else if (cute::is_same<LayoutA, layout::ColumnMajorInterleaved<32>>::value
+            || cute::is_same<LayoutA, layout::ColumnMajorInterleaved<64>>::value) {
+      isAMisaligned = (problem_size.k() / kSparse) % kAlignmentA;
+    }
+    // NOTE(review): B is also tested against k / kSparse rather than the full k; confirm intended
+    if (cute::is_same<LayoutB, layout::RowMajor>::value) {
+      isBMisaligned = problem_size.n() % kAlignmentB;
+    } else if (cute::is_same<LayoutB, layout::ColumnMajor>::value) {
+      isBMisaligned = (problem_size.k() / kSparse) % kAlignmentB;
+    } else if (cute::is_same<LayoutB, layout::RowMajorInterleaved<32>>::value
+            || cute::is_same<LayoutB, layout::RowMajorInterleaved<64>>::value) {
+      isBMisaligned = (problem_size.k() / kSparse) % kAlignmentB;
+    }
+
+    if (cute::is_same<LayoutC, layout::RowMajor>::value) {
+      isCMisaligned = problem_size.n() % kAlignmentC;
+    } else if (cute::is_same<LayoutC, layout::ColumnMajor>::value) {
+      isCMisaligned = problem_size.m() % kAlignmentC;
+    } else if (cute::is_same<LayoutC, layout::ColumnMajorInterleaved<32>>::value
+            || cute::is_same<LayoutC, layout::ColumnMajorInterleaved<64>>::value) {
+      isCMisaligned = problem_size.n() % kAlignmentC;
+    }
+    // E is tested against both m and k / kSparse, whatever its layout
+    isEMisaligned = (problem_size.m() % kAlignmentE)
+                  || ((problem_size.k() / kSparse) % kAlignmentE);
+
+    // The k dimension has to be a multiple of the threadblock k because out-of-bound
+    // metadata would be initialized to 0 by async.zfill, but 0 is not valid
+    // metadata.
+    if (problem_size.k() % Mma::Shape::kK) {
+      isEMisaligned = true;
+    }
+    // In kGemm / kGemmSplitKParallel mode, k / split_k_count must also be a multiple of the threadblock k
+    if (mode == GemmUniversalMode::kGemm
+     || mode == GemmUniversalMode::kGemmSplitKParallel) {
+      if ((problem_size.k() / split_k_count) % Mma::Shape::kK) {  // NOTE(review): split_k_count must be nonzero
+        isEMisaligned = true;
+      }
+    }
+
+    // M dimension has to be multiple of 32 (sparse float) or 16 (sparse int) 
+    // because of the row reordering of operand E
+    static int const kAlignmentM = (sizeof(ElementE) == 2) ? 32 : 16;
+
+    if (problem_size.m() % kAlignmentM) {
+      isEMisaligned = true;
+    }
+    // Report the first misaligned operand, checking A, B, C and then E
+    if (isAMisaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for A operand");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (isBMisaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for B operand");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (isCMisaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for C operand");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (isEMisaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for E operand");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    CUTLASS_TRACE_HOST("  returning kSuccess");
+
+    return Status::kSuccess;
+  }
+  /// Convenience overload: forwards args.problem_size, args.mode and args.batch_count (as the split-K count)
+  static Status can_implement(Arguments const &args) {
+    return can_implement(args.problem_size, args.mode, args.batch_count);
+  }
+
+public:
+
+  //
+  // Device-only API
+  //
+
+  /// Static entry point: default-constructs a kernel object and forwards both arguments to it.
+  CUTLASS_DEVICE
+  static void invoke(
+    Params const &params,
+    SharedStorage &shared_storage)
+  {
+    GemmSparseUniversal kernel;   // used only for its call operator
+    kernel(params, shared_storage);
+  }
+
+
+  /// Executes one GEMM with a default-constructed threadblock swizzle
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+    ThreadblockSwizzle default_swizzle;
+    this->run_with_swizzle(params, shared_storage, default_swizzle);
+  }
+
+  /// Executes one GEMM with an externally-provided swizzling function (mainloop, then epilogue; serialized by a semaphore when mode is kGemm and grid_tiled_shape.k() > 1)
+  CUTLASS_DEVICE
+  void run_with_swizzle(Params const &params, SharedStorage &shared_storage, ThreadblockSwizzle& threadblock_swizzle) {
+
+    cutlass::gemm::GemmCoord threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if this CTA's tile offset falls outside the tiled grid in M or N
+    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
+      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
+
+      return;
+    }
+
+    int offset_k = 0;
+    int problem_size_k = params.problem_size.k();
+
+    ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A);
+    ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);
+    ElementE *ptr_E = static_cast<ElementE *>(params.ptr_E);
+
+    //
+    // Select the K range (kGemm / kGemmSplitKParallel) or the per-batch operand pointers (kBatched / kArray).
+    //
+    if (params.mode == GemmUniversalMode::kGemm ||
+      params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+
+      if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {
+        // All but the last K partition end at (k + 1) * gemm_k_size; the last one keeps params.problem_size.k().
+        problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size;
+      }
+
+      offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
+    }
+    else if (params.mode == GemmUniversalMode::kBatched) {
+      ptr_A += threadblock_tile_offset.k() * params.batch_stride_A / kSparse;
+      ptr_B += threadblock_tile_offset.k() * params.batch_stride_B;
+      ptr_E += threadblock_tile_offset.k() * params.batch_stride_E / kSparse;
+    }
+    else if (params.mode == GemmUniversalMode::kArray) {
+      ptr_A = static_cast<ElementA * const *>(params.ptr_A)[threadblock_tile_offset.k()];
+      ptr_B = static_cast<ElementB * const *>(params.ptr_B)[threadblock_tile_offset.k()];
+      ptr_E = static_cast<ElementE * const *>(params.ptr_E)[threadblock_tile_offset.k()];
+    }
+
+    __syncthreads();
+
+    // Compute initial location in logical coordinates (K offsets of A and E are divided by kSparse; E additionally by kElementsPerElementE)
+    cutlass::MatrixCoord tb_offset_A{
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      offset_k / kSparse,
+    };
+
+    cutlass::MatrixCoord tb_offset_B{
+      offset_k,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    };
+
+    cutlass::MatrixCoord tb_offset_E{
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      offset_k / kSparse / kElementsPerElementE,
+    };
+
+    // Compute position within threadblock
+    int thread_idx = threadIdx.x;
+
+    // Construct iterators to the A, B and E operands
+    typename Mma::IteratorA iterator_A(
+      params.params_A,
+      ptr_A,
+      {params.problem_size.m(), problem_size_k / kSparse},
+      thread_idx,
+      tb_offset_A);
+
+    typename Mma::IteratorB iterator_B(
+      params.params_B,
+      ptr_B,
+      {problem_size_k, params.problem_size.n()},
+      thread_idx,
+      tb_offset_B);
+
+    typename Mma::IteratorE iterator_E(
+      params.params_E,
+      ptr_E,
+      {params.problem_size.m(), problem_size_k / kSparse / kElementsPerElementE},
+      thread_idx,
+      tb_offset_E);
+
+    // Broadcast the warp_id computed by lane 0 to ensure dependent code
+    // is compiled as warp-uniform.
+    int warp_idx = canonical_warp_idx_sync();
+
+    int lane_idx = threadIdx.x % 32;
+
+    //
+    // Main loop
+    //
+
+    // Construct threadblock-scoped matrix multiply
+    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
+
+    typename Mma::FragmentC accumulators;
+
+    accumulators.clear();
+
+    // Number of mainloop iterations: ceil((problem_size_k - offset_k) / Mma::Shape::kK)
+    int gemm_k_iterations = (problem_size_k - offset_k + Mma::Shape::kK - 1) / Mma::Shape::kK;
+
+    // Compute threadblock-scoped matrix multiply-add
+    mma(
+      gemm_k_iterations,
+      accumulators,
+      iterator_A,
+      iterator_B,
+      iterator_E,
+      accumulators);
+
+    //
+    // Epilogue
+    //
+
+    EpilogueOutputOp output_op(params.output_op);
+
+    //
+    // Masked tile iterators constructed from members
+    //
+
+    threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Assume identity swizzle
+    MatrixCoord threadblock_offset(
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    );
+    // Linearized (m, n) tile index; used below to select this tile's semaphore.
+    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
+
+    ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C);
+    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
+
+    //
+    // Set up the semaphore, then adjust the C/D pointers based on mode.
+    //
+
+    // Construct the semaphore.
+    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
+
+    if (params.mode == GemmUniversalMode::kGemm) {
+
+      // If performing a reduction via split-K, fetch the initial synchronization
+      if (params.grid_tiled_shape.k() > 1) {
+
+        // Fetch the synchronization lock initially but do not block.
+        semaphore.fetch();
+
+        // Indicate which position in a serial reduction the output operator is currently updating
+        output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
+      }
+    }
+    else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
+    }
+    else if (params.mode == GemmUniversalMode::kBatched) {
+      ptr_C += threadblock_tile_offset.k() * params.batch_stride_C;
+      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
+    }
+    else if (params.mode == GemmUniversalMode::kArray) {
+      ptr_C = static_cast<ElementC * const *>(params.ptr_C)[threadblock_tile_offset.k()];
+      ptr_D = static_cast<ElementC * const *>(params.ptr_D)[threadblock_tile_offset.k()];
+    }
+
+    // Tile iterator loading from source tensor.
+    typename Epilogue::OutputTileIterator iterator_C(
+      params.params_C,
+      ptr_C,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset
+    );
+
+    // Tile iterator writing to destination tensor.
+    typename Epilogue::OutputTileIterator iterator_D(
+      params.params_D,
+      ptr_D,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset
+    );
+
+    Epilogue epilogue(
+      shared_storage.epilogue,
+      thread_idx,
+      warp_idx,
+      lane_idx);
+
+    // Wait on the semaphore - this latency may have been covered by iterator construction
+    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
+
+      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
+      if (threadblock_tile_offset.k()) {
+        iterator_C = iterator_D;
+      }
+
+      semaphore.wait(threadblock_tile_offset.k());
+    }
+
+
+    // Execute the epilogue operator to update the destination tensor.
+    epilogue(
+      output_op,
+      iterator_D,
+      accumulators,
+      iterator_C);
+
+    //
+    // Release the semaphore
+    //
+
+    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
+
+      int lock = 0;
+      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
+
+        // The final threadblock resets the semaphore for subsequent grids.
+        lock = 0;
+      }
+      else {
+        // Otherwise, the semaphore is incremented
+        lock = threadblock_tile_offset.k() + 1;
+      }
+
+      semaphore.release(lock);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_sparse_universal_with_absmax.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_sparse_universal_with_absmax.h
new file mode 100755
index 000000000..47b76a171
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_sparse_universal_with_absmax.h
@@ -0,0 +1,609 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
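+/*! \file
+    \brief Sparse universal GEMM kernel (GemmSparseUniversalWithAbsmax) whose epilogue also receives an auxiliary output tensor and a vector operand.
+*/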
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/arch/arch.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/complex.h"
+#include "cutlass/semaphore.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/kernel/params_universal_base.h"
+#include "cutlass/gemm/kernel/gemm_sparse_universal.h"
+
+#include "cutlass/trace.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate
+  typename Epilogue_,             ///! Epilogue
+  typename ThreadblockSwizzle_    ///! Threadblock swizzling function
+>
+class GemmSparseUniversalWithAbsmax {  // sparse universal GEMM kernel whose epilogue additionally receives an auxiliary output iterator and a vector pointer
+public:
+  using Base = GemmSparseUniversal<Mma_, Epilogue_, ThreadblockSwizzle_>;  // can_implement() is delegated to this kernel
+
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using EpilogueOutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+
+  static int const kSparse = Mma::kSparse;
+  static int const kMetaSizeInBits = Mma::kMetaSizeInBits;
+  static int const kMaxID2 = Mma::kMaxID2;
+  static int const kElementsPerElementE = Mma::kElementsPerElementE;
+
+  using ElementE = typename Mma::ElementE;
+  using LayoutE = typename Mma::LayoutE;
+
+  using ElementA = typename Mma::IteratorA::Element;
+  using LayoutA = typename Mma::IteratorA::Layout;
+  using ElementB = typename Mma::IteratorB::Element;
+  using LayoutB = typename Mma::IteratorB::Layout;
+  using ElementC = typename Epilogue::OutputTileIterator::Element;
+  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
+  using ElementAux = typename Epilogue::AuxOutputTileIterator::Element;
+  using LayoutAux = typename Epilogue::AuxOutputTileIterator::Layout;
+  using ElementVector = typename Epilogue::ElementVector;
+
+  static ComplexTransform const kTransformA = Mma::kTransformA;
+  static ComplexTransform const kTransformB = Mma::kTransformB;
+  using Operator = typename Mma::Operator;
+
+  using OperatorClass = typename Mma::Operator::OperatorClass;
+  using ThreadblockShape = typename Mma::Shape;
+  using WarpShape = typename Mma::Operator::Shape;
+  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
+  using ArchTag = typename Mma::ArchTag;
+
+  static int const kStages = Mma::kStages;
+  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
+  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
+  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;
+
+  /// Split-K preserves splits that are 128b aligned
+  static int const kSplitKAlignment = const_max(128 / sizeof_bits<ElementA>::value, 128 / sizeof_bits<ElementB>::value);
+
+  //
+  // Structures
+  //
+
+  /// Argument structure
+  struct Arguments : detail::SparseUniversalArgumentsBase<
+      LayoutA,
+      LayoutB,
+      LayoutC,
+      LayoutE
+    > {
+    using Base = detail::SparseUniversalArgumentsBase<
+      LayoutA,
+      LayoutB,
+      LayoutC,
+      LayoutE
+    >;
+
+    void const* ptr_Aux;                          // auxiliary tensor; the kernel wraps it in Epilogue::AuxOutputTileIterator
+    void const* ptr_Vector;                       // vector operand handed to the epilogue
+    int64_t batch_stride_Aux;                     // added to ptr_Aux once per batch index in kBatched mode
+    int64_t batch_stride_Vector;                  // added to ptr_Vector once per batch index in kBatched mode
+    typename LayoutAux::Stride::LongIndex ldaux;  // used to construct AuxOutputTileIterator::Params
+    int64_t ldvector;                             // multiplied by threadblock_tile_offset.m() when positioning ptr_Vector
+
+    typename EpilogueOutputOp::Params epilogue;
+
+    Arguments() {}
+
+    /// constructs an arguments structure
+    Arguments(
+      GemmUniversalMode mode,
+      GemmCoord problem_size,
+      int batch_count,
+      typename EpilogueOutputOp::Params epilogue,
+      void const * ptr_A,
+      void const * ptr_B,
+      void const * ptr_C,
+      void * ptr_D,
+      void const * ptr_E,
+      void const * ptr_Aux,
+      void const * ptr_Vector,
+      int64_t batch_stride_A,
+      int64_t batch_stride_B,
+      int64_t batch_stride_C,
+      int64_t batch_stride_D,
+      int64_t batch_stride_E,
+      int64_t batch_stride_Aux,
+      int64_t batch_stride_Vector,
+      typename LayoutA::Stride::LongIndex lda,
+      typename LayoutB::Stride::LongIndex ldb,
+      typename LayoutC::Stride::LongIndex ldc,
+      typename LayoutC::Stride::LongIndex ldd,
+      typename LayoutC::Stride::LongIndex lde,  // NOTE(review): typed via LayoutC although it is the E stride - confirm against SparseUniversalArgumentsBase
+      typename LayoutAux::Stride::LongIndex ldaux,
+      int64_t ldvector
+      )
+    :
+      Base(
+        mode, problem_size, batch_count,
+        ptr_A, ptr_B, ptr_C, ptr_D, ptr_E,
+        batch_stride_A, batch_stride_B, batch_stride_C, batch_stride_D, batch_stride_E,
+        lda, ldb, ldc, ldd, lde
+      ),
+      ptr_Aux(ptr_Aux),
+      ptr_Vector(ptr_Vector),
+      batch_stride_Aux(batch_stride_Aux),
+      batch_stride_Vector(batch_stride_Vector),
+      ldaux(ldaux),
+      ldvector(ldvector),
+      epilogue(epilogue)
+    { }
+  };
+
+
+  //
+  // Structure for precomputing values in host memory and passing to kernels
+  //
+
+  /// Parameters structure
+  struct Params : detail::SparseUniversalParamsBase<
+    Mma,
+    Epilogue,
+    Arguments,
+    ThreadblockSwizzle,
+    ThreadblockShape,
+    ElementA,
+    ElementB,
+    ElementC,
+    LayoutA,
+    LayoutB>
+  {
+    using ParamsBase = detail::SparseUniversalParamsBase<
+      Mma,
+      Epilogue,
+      Arguments,
+      ThreadblockSwizzle,
+      ThreadblockShape,
+      ElementA,
+      ElementB,
+      ElementC,
+      LayoutA,
+      LayoutB>;
+
+    typename Epilogue::AuxOutputTileIterator::Params params_Aux;
+    int64_t ldvector;
+
+    void* ptr_Aux;
+    void* ptr_Vector;
+
+    int64_t batch_stride_Aux;
+    int64_t batch_stride_Vector;
+    typename EpilogueOutputOp::Params output_op;
+
+    //
+    // Host dispatch API
+    //
+
+    /// Default constructor
+    Params() = default;
+
+    /// Constructor
+    Params(
+      Arguments const &args,  /// GEMM application arguments
+      int device_sms,         /// Number of SMs on the device
+      int sm_occupancy)       /// Kernel SM occupancy (in thread blocks)
+    :
+      ParamsBase(args, device_sms, sm_occupancy),
+      params_Aux(args.ldaux),
+      ldvector(args.ldvector),
+      ptr_Aux(const_cast<void *>(args.ptr_Aux)),
+      ptr_Vector(const_cast<void *>(args.ptr_Vector)),
+      batch_stride_Aux(args.batch_stride_Aux),
+      batch_stride_Vector(args.batch_stride_Vector),
+      output_op(args.epilogue)
+    {}
+
+    /// Lightweight update: refreshes only the operand pointers, batch strides and epilogue params from args.
+    void update(Arguments const &args)
+    {
+      CUTLASS_TRACE_HOST("GemmSparseUniversalWithAbsmax::Params::update()");
+
+      // Update input/output pointers
+      this->ptr_A = const_cast<void *>(args.ptr_A);
+      this->ptr_B = const_cast<void *>(args.ptr_B);
+      this->ptr_C = const_cast<void *>(args.ptr_C);
+      this->ptr_D = args.ptr_D;
+      this->ptr_E = const_cast<void *>(args.ptr_E);
+      ptr_Aux = const_cast<void *>(args.ptr_Aux);
+      ptr_Vector = const_cast<void *>(args.ptr_Vector);
+
+      this->batch_stride_A = args.batch_stride_A;
+      this->batch_stride_B = args.batch_stride_B;
+      this->batch_stride_C = args.batch_stride_C;
+      this->batch_stride_D = args.batch_stride_D;
+      this->batch_stride_E = args.batch_stride_E;
+      batch_stride_Aux = args.batch_stride_Aux;
+      batch_stride_Vector = args.batch_stride_Vector;
+
+      output_op = args.epilogue;
+    }
+  };
+
+  /// Shared memory storage structure
+  union SharedStorage {
+    typename Mma::SharedStorage main_loop;
+    typename Epilogue::SharedStorage epilogue;
+  };
+
+
+public:
+
+  //
+  // Host dispatch API
+  //
+
+  /// Determines whether kernel satisfies alignment (delegates to GemmSparseUniversal::can_implement)
+  static Status can_implement(
+    cutlass::gemm::GemmCoord const & problem_size,
+    GemmUniversalMode mode,
+    int split_k_count) {
+    return Base::can_implement(problem_size, mode, split_k_count);
+  }
+
+  static Status can_implement(Arguments const &args) {
+    return can_implement(args.problem_size, args.mode, args.batch_count);
+  }
+
+public:
+
+  //
+  // Device-only API
+  //
+
+  // Factory invocation: default-constructs a kernel object and runs it through operator()
+  CUTLASS_DEVICE
+  static void invoke(
+    Params const &params,
+    SharedStorage &shared_storage)
+  {
+    GemmSparseUniversalWithAbsmax op;
+    op(params, shared_storage);
+  }
+
+
+  /// Executes one GEMM
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+    ThreadblockSwizzle threadblock_swizzle;
+    run_with_swizzle(params, shared_storage, threadblock_swizzle);
+  }
+
+  /// Executes one GEMM with an externally-provided swizzling function (mainloop, then epilogue; serialized by a semaphore when mode is kGemm and grid_tiled_shape.k() > 1)
+  CUTLASS_DEVICE
+  void run_with_swizzle(Params const &params, SharedStorage &shared_storage, ThreadblockSwizzle& threadblock_swizzle) {
+
+    cutlass::gemm::GemmCoord threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if this CTA's tile offset falls outside the tiled grid in M or N
+    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
+      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
+
+      return;
+    }
+
+    int offset_k = 0;
+    int problem_size_k = params.problem_size.k();
+
+    ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A);
+    ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);
+    ElementE *ptr_E = static_cast<ElementE *>(params.ptr_E);
+
+    //
+    // Select the K range (kGemm / kGemmSplitKParallel) or the per-batch operand pointers (kBatched / kArray).
+    //
+    if (params.mode == GemmUniversalMode::kGemm ||
+      params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+
+      if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {
+        // All but the last K partition end at (k + 1) * gemm_k_size; the last one keeps params.problem_size.k().
+        problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size;
+      }
+
+      offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
+    }
+    else if (params.mode == GemmUniversalMode::kBatched) {
+      ptr_A += threadblock_tile_offset.k() * params.batch_stride_A / kSparse;
+      ptr_B += threadblock_tile_offset.k() * params.batch_stride_B;
+      ptr_E += threadblock_tile_offset.k() * params.batch_stride_E / kSparse;
+    }
+    else if (params.mode == GemmUniversalMode::kArray) {
+      ptr_A = static_cast<ElementA * const *>(params.ptr_A)[threadblock_tile_offset.k()];
+      ptr_B = static_cast<ElementB * const *>(params.ptr_B)[threadblock_tile_offset.k()];
+      ptr_E = static_cast<ElementE * const *>(params.ptr_E)[threadblock_tile_offset.k()];
+    }
+
+    __syncthreads();
+
+    // Compute initial location in logical coordinates (K offsets of A and E are divided by kSparse; E additionally by kElementsPerElementE)
+    cutlass::MatrixCoord tb_offset_A{
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      offset_k / kSparse,
+    };
+
+    cutlass::MatrixCoord tb_offset_B{
+      offset_k,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    };
+
+    cutlass::MatrixCoord tb_offset_E{
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      offset_k / kSparse / kElementsPerElementE,
+    };
+
+    // Compute position within threadblock
+    int thread_idx = threadIdx.x;
+
+    // Construct iterators to the A, B and E operands
+    typename Mma::IteratorA iterator_A(
+      params.params_A,
+      ptr_A,
+      {params.problem_size.m(), problem_size_k / kSparse},
+      thread_idx,
+      tb_offset_A);
+
+    typename Mma::IteratorB iterator_B(
+      params.params_B,
+      ptr_B,
+      {problem_size_k, params.problem_size.n()},
+      thread_idx,
+      tb_offset_B);
+
+    typename Mma::IteratorE iterator_E(
+      params.params_E,
+      ptr_E,
+      {params.problem_size.m(), problem_size_k / kSparse / kElementsPerElementE},
+      thread_idx,
+      tb_offset_E);
+
+    // Broadcast the warp_id computed by lane 0 to ensure dependent code
+    // is compiled as warp-uniform.
+    int warp_idx = canonical_warp_idx_sync();
+
+    int lane_idx = threadIdx.x % 32;
+
+    //
+    // Main loop
+    //
+
+    // Construct threadblock-scoped matrix multiply
+    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
+
+    typename Mma::FragmentC accumulators;
+
+    accumulators.clear();
+
+    // Number of mainloop iterations: ceil((problem_size_k - offset_k) / Mma::Shape::kK)
+    int gemm_k_iterations = (problem_size_k - offset_k + Mma::Shape::kK - 1) / Mma::Shape::kK;
+
+    // Compute threadblock-scoped matrix multiply-add
+    mma(
+      gemm_k_iterations,
+      accumulators,
+      iterator_A,
+      iterator_B,
+      iterator_E,
+      accumulators);
+
+    //
+    // Epilogue
+    //
+
+    EpilogueOutputOp output_op(params.output_op);
+
+    //
+    // Masked tile iterators constructed from members
+    //
+
+    threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Assume identity swizzle
+    MatrixCoord threadblock_offset(
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    );
+    // Linearized (m, n) tile index; used below to select this tile's semaphore.
+    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
+
+    ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C);
+    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
+    ElementAux * ptr_Aux = static_cast<ElementAux *>(params.ptr_Aux);
+    ElementVector * ptr_Vector = static_cast<ElementVector *>(params.ptr_Vector);
+
+    //
+    // Set up the semaphore, then adjust the C/D/Aux/Vector pointers based on mode.
+    //
+
+    // Construct the semaphore.
+    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
+
+    if (params.mode == GemmUniversalMode::kGemm) {
+
+      // If performing a reduction via split-K, fetch the initial synchronization
+      if (params.grid_tiled_shape.k() > 1) {
+
+        // Fetch the synchronization lock initially but do not block.
+        semaphore.fetch();
+
+        // Indicate which position in a serial reduction the output operator is currently updating
+        output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
+      }
+    }
+    else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
+    }
+    else if (params.mode == GemmUniversalMode::kBatched) {
+      ptr_C += threadblock_tile_offset.k() * params.batch_stride_C;
+      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
+      if (ptr_Aux) {
+        ptr_Aux += threadblock_tile_offset.k() * params.batch_stride_Aux;
+      }
+      if (ptr_Vector) {
+        ptr_Vector += threadblock_tile_offset.k() * params.batch_stride_Vector;
+      }
+    }
+    else if (params.mode == GemmUniversalMode::kArray) {
+      ptr_C = static_cast<ElementC * const *>(params.ptr_C)[threadblock_tile_offset.k()];
+      ptr_D = static_cast<ElementC * const *>(params.ptr_D)[threadblock_tile_offset.k()];
+      if (ptr_Aux) {
+        ptr_Aux = static_cast<ElementAux * const *>(params.ptr_Aux)[threadblock_tile_offset.k()];
+      }
+      if (ptr_Vector) {
+        ptr_Vector = static_cast<ElementVector * const *>(params.ptr_Vector)[threadblock_tile_offset.k()];
+      }
+    }
+
+    // Move to appropriate location for this output tile
+    if (ptr_Vector) {
+      ptr_Vector += threadblock_offset.column() + threadblock_tile_offset.m() * params.ldvector;
+    }
+
+    // Tile iterator loading from source tensor.
+    typename Epilogue::OutputTileIterator iterator_C(
+      params.params_C,
+      ptr_C,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset
+    );
+
+    // Tile iterator writing to destination tensor.
+    typename Epilogue::OutputTileIterator iterator_D(
+      params.params_D,
+      ptr_D,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset
+    );
+
+    // Tile iterator writing to auxiliary destination tensor.
+    typename Epilogue::AuxOutputTileIterator iterator_Aux(
+      params.params_Aux,
+      // Only the final block writes the auxiliary tensor
+      ((params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) &&
+          (params.grid_tiled_shape.k() != threadblock_tile_offset.k() + 1))
+          ? nullptr
+          : ptr_Aux,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset
+    );
+
+    Epilogue epilogue(
+      shared_storage.epilogue,
+      thread_idx,
+      warp_idx,
+      lane_idx);
+
+    // Wait on the semaphore - this latency may have been covered by iterator construction
+    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
+
+      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
+      if (threadblock_tile_offset.k()) {
+        iterator_C = iterator_D;
+      }
+
+      semaphore.wait(threadblock_tile_offset.k());
+    }
+
+
+    // Execute the epilogue operator to update the destination tensor.
+    epilogue(
+      output_op,
+      // Only the final block uses Vector
+      ((params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) &&
+       (params.grid_tiled_shape.k() != threadblock_tile_offset.k() + 1))
+          ? nullptr
+          : ptr_Vector,
+      iterator_D,
+      accumulators,
+      iterator_C,
+      iterator_Aux,
+      params.problem_size.mn(),
+      threadblock_offset);
+
+    //
+    // Release the semaphore
+    //
+
+    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
+
+      int lock = 0;
+      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
+
+        // The final threadblock resets the semaphore for subsequent grids.
+        lock = 0;
+      }
+      else {
+        // Otherwise, the semaphore is incremented
+        lock = threadblock_tile_offset.k() + 1;
+      }
+
+      semaphore.release(lock);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_splitk_parallel.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_splitk_parallel.h
new file mode 100755
index 000000000..8ab98ff01
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_splitk_parallel.h
@@ -0,0 +1,253 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for GEMM performing a reduction over K partitions in parallel.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate
+  typename Epilogue_,             ///! Epilogue
+  typename ThreadblockSwizzle_    ///! Threadblock swizzling function
+>
+struct GemmSplitKParallel {
+
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using OutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;
+
+  static int const kAlignmentK = Mma::Operator::Shape::kK;
+
+  /// Parameters structure
+  struct Params {
+    cutlass::gemm::GemmCoord problem_size;
+    cutlass::gemm::GemmCoord grid_tiled_shape;
+    int swizzle_log_tile;
+    typename Mma::IteratorA::Params params_A;
+    typename Mma::IteratorA::TensorRef ref_A;
+    typename Mma::IteratorB::Params params_B;
+    typename Mma::IteratorB::TensorRef ref_B;
+    typename Epilogue::OutputTileIterator::Params params_D;
+    typename Epilogue::OutputTileIterator::TensorRef ref_D;
+    typename OutputOp::Params output_op;
+    int64_t splitk_slice_stride;  // pointer offset between consecutive K partitions' output slices (see operator())
+    int gemm_k_size;              // K extent of one partition, a whole multiple of Mma::Shape::kK
+
+    //
+    // Methods
+    //
+
+    CUTLASS_HOST_DEVICE
+    Params(): swizzle_log_tile(0), splitk_slice_stride(0), gemm_k_size(0) { }  // zero the scalar members so a default-constructed Params is never indeterminate
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      cutlass::gemm::GemmCoord const & problem_size,
+      cutlass::gemm::GemmCoord const & grid_tiled_shape,
+      typename Mma::IteratorA::TensorRef ref_A,
+      typename Mma::IteratorB::TensorRef ref_B,
+      typename Epilogue::OutputTileIterator::TensorRef ref_D,
+      typename OutputOp::Params output_op,
+      int64_t splitk_slice_stride
+    ):
+      problem_size(problem_size),
+      grid_tiled_shape(grid_tiled_shape),
+      swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)),
+      params_A(ref_A.layout()),
+      ref_A(ref_A),
+      params_B(ref_B.layout()),
+      ref_B(ref_B),
+      params_D(ref_D.layout()),
+      ref_D(ref_D),
+      output_op(output_op),
+      splitk_slice_stride(splitk_slice_stride) {
+      // Each K partition covers a whole number of Mma::Shape::kK tiles (integer division); the last partition runs to problem_size.k() in operator().
+      int full_gemm_k_iterations = problem_size.k() / Mma::Shape::kK;
+      int gemm_k_iterations = full_gemm_k_iterations / grid_tiled_shape.k();
+
+      gemm_k_size = gemm_k_iterations * Mma::Shape::kK;
+    }
+  };
+
+  /// Shared memory storage structure
+  union SharedStorage {
+    typename Mma::SharedStorage main_loop;
+    typename Epilogue::SharedStorage epilogue;
+  };
+
+  //
+  // Methods
+  //
+
+  CUTLASS_HOST_DEVICE
+  GemmSplitKParallel() { }
+
+  /// Executes one GEMM
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Compute threadblock location
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if this CTA's tile offset falls outside the tiled grid in M or N
+    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
+      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
+
+      return;
+    }
+
+    // Compute initial location in logical coordinates
+    cutlass::MatrixCoord tb_offset_A{
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      threadblock_tile_offset.k() * params.gemm_k_size,
+    };
+
+    cutlass::MatrixCoord tb_offset_B{
+      threadblock_tile_offset.k() * params.gemm_k_size,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    };
+
+    // Problem size is a function of threadblock index in the K dimension
+    int problem_size_k;
+    if (threadblock_tile_offset.k() + 1 == params.grid_tiled_shape.k()) {
+      problem_size_k = params.problem_size.k();
+    }
+    else {
+      problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size;
+    }
+
+    // Number of mainloop iterations for this partition: ceil((problem_size_k - K offset) / Mma::Shape::kK)
+    int gemm_k_iterations = (problem_size_k - tb_offset_A.column() + Mma::Shape::kK - 1) / Mma::Shape::kK;
+
+    // Compute position within threadblock
+    int thread_idx = threadIdx.x;
+
+    // Construct iterators to A and B operands
+    typename Mma::IteratorA iterator_A(
+      params.params_A,
+      params.ref_A.data(),
+      {params.problem_size.m(), problem_size_k},
+      thread_idx,
+      tb_offset_A);
+
+    typename Mma::IteratorB iterator_B(
+      params.params_B,
+      params.ref_B.data(),
+      {problem_size_k, params.problem_size.n()},
+      thread_idx,
+      tb_offset_B);
+
+    int warp_idx = threadIdx.x / 32;
+    int lane_idx = threadIdx.x % 32;
+
+
+    //
+    // Main loop
+    //
+
+    // Construct threadblock-scoped matrix multiply
+    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
+
+    typename Mma::FragmentC accumulators;
+
+    accumulators.clear();
+
+    mma(gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators);
+
+    //
+    // Epilogue
+    //
+
+    OutputOp output_op(params.output_op);
+
+    //
+    // Masked tile iterators constructed from members
+    //
+
+    threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Assume identity swizzle
+    MatrixCoord threadblock_offset(
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    );
+
+    // Tile iterator writing to output tile
+    typename Epilogue::OutputTileIterator iterator_D(
+      params.params_D,
+      params.ref_D.data(),
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset
+    );
+    // Advance the output iterator to this K partition's slice.
+    iterator_D.add_pointer_offset(params.splitk_slice_stride * threadblock_tile_offset.k());
+
+    // Construct the epilogue
+    Epilogue epilogue(
+      shared_storage.epilogue,
+      thread_idx,
+      warp_idx,
+      lane_idx);
+
+    // Run the epilogue; iterator_D is passed as both the destination and the source iterator
+    epilogue(output_op, iterator_D, accumulators, iterator_D);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_streamk_with_fused_epilogue.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_streamk_with_fused_epilogue.h
new file mode 100755
index 000000000..013fb7730
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_streamk_with_fused_epilogue.h
@@ -0,0 +1,2396 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Stream-K Gemm kernel compatible with fused epilogues
+    that broadcast a bias vector over the MMA output.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/layout/layout.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/complex.h"
+#include "cutlass/barrier.h"
+#include "cutlass/block_striped.h"
+#include "cutlass/semaphore.h"
+
+#include "cutlass/trace.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate
+  typename Epilogue_,             ///! Epilogue; its kIsSingleSource constant supplies the default for IsSingleSource
+  typename ThreadblockSwizzle_,   ///! Threadblock swizzling function
+  bool IsSingleSource = Epilogue_::kIsSingleSource   ///! Selects which partial specialization is used
+>
+struct GemmStreamkWithFusedEpilogue;   // Primary template: declaration only, behavior lives in the specializations
+
+// GemmStreamkWithFusedEpilogue with two sources
+template <
+  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate
+  typename Epilogue_,             ///! Epilogue
+  typename ThreadblockSwizzle_    ///! Threadblock swizzling function
+>
+struct GemmStreamkWithFusedEpilogue<Mma_, Epilogue_, ThreadblockSwizzle_, false> {
+  using Mma = Mma_;                                          // Threadblock-scoped MMA (template argument)
+  using Epilogue = Epilogue_;                                // Epilogue (template argument)
+  using EpilogueOutputOp = typename Epilogue::OutputOp;      // Output operator type exposed by the epilogue
+  using ThreadblockSwizzle = ThreadblockSwizzle_;            // Threadblock swizzling function (template argument)
+
+  using ElementA = typename Mma::IteratorA::Element;         // A and B element/layout types are taken from the MMA's iterators
+  using LayoutA = typename Mma::IteratorA::Layout;
+  using ElementB = typename Mma::IteratorB::Element;
+  using LayoutB = typename Mma::IteratorB::Layout;
+  using ElementC = typename Epilogue::OutputTileIterator::Element;   // C/D element/layout types are taken from the epilogue's output tile iterator
+  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
+
+  /// The per-thread tile of raw accumulators
+  using AccumulatorTile = typename Mma::FragmentC;
+
+  static ComplexTransform const kTransformA = Mma::kTransformA;   // Complex transforms are forwarded unchanged from the MMA
+  static ComplexTransform const kTransformB = Mma::kTransformB;
+  using Operator = typename Mma::Operator;
+
+  using OperatorClass = typename Mma::Operator::OperatorClass;
+  using ThreadblockShape = typename Mma::Shape;               // Supplies kM/kN/kK used for tile-to-item coordinate conversion
+  using WarpShape = typename Mma::Operator::Shape;
+  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
+  using ArchTag = typename Mma::ArchTag;
+
+  static int const kStages = Mma::kStages;
+  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;             // Elements per access of the A iterator
+  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;             // Elements per access of the B iterator
+  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;  // Elements per access of the output iterator
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;     // 32 times the number of warps in the threadblock
+
+  /// Workspace bytes per thread block: the larger of one AccumulatorTile per thread and the epilogue's own requirement
+  static size_t const kWorkspaceBytesPerBlock =
+    __NV_STD_MAX(
+      kThreadCount * sizeof(AccumulatorTile),
+      Epilogue::kWorkspaceBytesPerBlock);
+
+  /// Block-striped reduction utility (used to store, reduce and load AccumulatorTile partials in the workspace)
+  using BlockStripedReduceT = BlockStripedReduce<kThreadCount, AccumulatorTile>;
+
+
+
+  //
+  // Structures
+  //
+
+  /// Argument structure
+  struct Arguments {
+
+    //
+    // Data members (each one carries a default, so Arguments() yields a fully initialized object)
+    //
+
+    GemmUniversalMode mode{GemmUniversalMode::kGemm};
+    GemmCoord problem_size{};
+    int batch_count{1};        // Either (mode == GemmUniversalMode::kBatched) the batch count, or (mode == GemmUniversalMode::kGemm) the tile-splitting factor
+
+    typename EpilogueOutputOp::Params epilogue{};
+
+    void const * ptr_A{nullptr};
+    void const * ptr_B{nullptr};
+    void const * ptr_C1{nullptr};       // First source operand (do_epilogue() builds iterator_C1 from it)
+    void const * ptr_C2{nullptr};       // Second source operand (do_epilogue() builds iterator_C2 from it)
+    void * ptr_D{nullptr};
+
+    void * ptr_Vector{nullptr};         // Vector operand handed to the epilogue; do_epilogue() skips its offsets when null
+    void * ptr_Tensor{nullptr};         // Auxiliary tensor handed to the epilogue; do_epilogue() skips its offsets when null
+
+    int64_t batch_stride_A{0};
+    int64_t batch_stride_B{0};
+    int64_t batch_stride_C1{0};
+    int64_t batch_stride_C2{0};
+    int64_t batch_stride_D{0};
+    int64_t batch_stride_Vector{0};
+    int64_t batch_stride_Tensor{0};
+
+    typename LayoutA::Stride::Index lda{};
+    typename LayoutB::Stride::Index ldb{};
+    typename LayoutC::Stride::Index ldc1{};
+    typename LayoutC::Stride::Index ldc2{};
+    typename LayoutC::Stride::Index ldd{};
+    typename LayoutC::Stride::Index ldr{};     // Stride applied to ptr_Vector per threadblock-tile row (see do_epilogue())
+    typename LayoutC::Stride::Index ldt{};     // Leading dimension used to build Params::params_Tensor
+
+    int avail_sms{-1};          /// The number of SMs that StreamK dispatch heuristics will attempt to load-balance across (-1 defaults to device width, 1 implies classic data-parallel scheduling)
+
+    //
+    // Methods
+    //
+
+    /// Default Constructor (relies on the default member initializers above)
+    Arguments() = default;
+
+    /// Constructs an arguments structure; every parameter is copied into the member of the same name (batch_split -> batch_count)
+    Arguments(
+      GemmUniversalMode mode,
+      GemmCoord problem_size,
+      int batch_split,                              /// Either (mode == GemmUniversalMode::kBatched) the batch count, or (mode == GemmUniversalMode::kGemm) the tile-splitting factor (1 defaults to StreamK, >1 emulates Split-K)
+      typename EpilogueOutputOp::Params epilogue,
+      void const * ptr_A,
+      void const * ptr_B,
+      void const * ptr_C1,
+      void const * ptr_C2,
+      void * ptr_D,
+      void * ptr_Vector,
+      void * ptr_Tensor,
+      int64_t batch_stride_A,
+      int64_t batch_stride_B,
+      int64_t batch_stride_C1,
+      int64_t batch_stride_C2,
+      int64_t batch_stride_D,
+      int64_t batch_stride_Vector,
+      int64_t batch_stride_Tensor,
+      typename LayoutA::Stride::Index lda,
+      typename LayoutB::Stride::Index ldb,
+      typename LayoutC::Stride::Index ldc1,
+      typename LayoutC::Stride::Index ldc2,
+      typename LayoutC::Stride::Index ldd,
+      typename LayoutC::Stride::Index ldr,
+      typename LayoutC::Stride::Index ldt,
+      int avail_sms = -1)                           /// The number of SMs that StreamK dispatch heuristics will attempt to load-balance across (-1 defaults to device width, 1 implies classic data-parallel scheduling)
+    :
+      mode(mode),
+      problem_size(problem_size),
+      batch_count(batch_split),
+      epilogue(epilogue),
+      ptr_A(ptr_A), ptr_B(ptr_B), ptr_C1(ptr_C1), ptr_C2(ptr_C2), ptr_D(ptr_D),
+      ptr_Vector(ptr_Vector),
+      ptr_Tensor(ptr_Tensor),
+      batch_stride_A(batch_stride_A),
+      batch_stride_B(batch_stride_B),
+      batch_stride_C1(batch_stride_C1),
+      batch_stride_C2(batch_stride_C2),
+      batch_stride_D(batch_stride_D),            // Must be forwarded: Params copies it and do_epilogue() offsets ptr_D by it in batched mode
+      batch_stride_Vector(batch_stride_Vector),
+      batch_stride_Tensor(batch_stride_Tensor),
+      lda(lda), ldb(ldb), ldc1(ldc1), ldc2(ldc2), ldd(ldd), ldr(ldr), ldt(ldt), avail_sms(avail_sms)
+    {
+      CUTLASS_TRACE_HOST("GemmStreamkWithFusedEpilogue::Arguments::Arguments() - problem_size: " << problem_size);
+      CUTLASS_TRACE_HOST("  ptr_Vector: " << (void *)this->ptr_Vector);
+      CUTLASS_TRACE_HOST("  ptr_Tensor: " << (void *)this->ptr_Tensor);
+      CUTLASS_TRACE_HOST("  ldr: " << this->ldr);
+      CUTLASS_TRACE_HOST("  ldt: " << this->ldt);
+      CUTLASS_TRACE_HOST("  avail_sms: " << this->avail_sms);
+    }
+
+    /// Returns arguments for the transposed problem (swaps m/n, the A/B pointers, lda/ldb and the A/B batch strides; all other members are copied unchanged)
+    Arguments transposed_problem() const {
+      Arguments args(*this);
+
+      std::swap(args.problem_size.m(), args.problem_size.n());
+      std::swap(args.ptr_A, args.ptr_B);
+      std::swap(args.lda, args.ldb);
+      std::swap(args.batch_stride_A, args.batch_stride_B);
+
+      return args;
+    }
+  };
+
+
+  /// Parameters structure
+  struct Params
+  {
+  public:
+
+    //
+    // Data members (the constructor's initializer list follows this declaration order)
+    //
+
+    void * ptr_A{nullptr};
+    void * ptr_B{nullptr};
+
+    typename Mma::IteratorA::Params params_A{};
+    typename Mma::IteratorB::Params params_B{};
+
+    int64_t batch_stride_A{0};
+    int64_t batch_stride_B{0};
+
+    GemmUniversalMode mode{GemmUniversalMode::kGemm};
+
+    ThreadblockSwizzle block_mapping{};       // Assigned in the constructor body, once avail_sms has been resolved
+
+    void *barrier_workspace{nullptr};         // Set by init_workspace(); null until then
+    void *partials_workspace{nullptr};        // Set by init_workspace(); null until then
+
+    typename EpilogueOutputOp::Params output_op{};
+
+    void * ptr_C1{nullptr};
+    void * ptr_C2{nullptr};
+    void * ptr_D{nullptr};
+    void * ptr_Tensor{nullptr};
+    void * ptr_Vector{nullptr};
+
+    typename Epilogue::OutputTileIterator::Params params_C1{};
+    typename Epilogue::OutputTileIterator::Params params_C2{};
+    typename Epilogue::OutputTileIterator::Params params_D{};
+    typename Epilogue::TensorTileIterator::Params params_Tensor{};
+
+    int64_t batch_stride_C1{0};
+    int64_t batch_stride_C2{0};
+    int64_t batch_stride_D{0};
+    int64_t batch_stride_Vector{0};
+    int64_t batch_stride_Tensor{0};
+
+    typename LayoutC::Stride::Index ldr{};
+
+  protected:
+
+    //
+    // Host-only dispatch-utilities
+    //
+
+    /// Pad the given allocation size up to the nearest cache line (a multiple of 128 bytes)
+    static size_t cacheline_align_up(size_t size)
+    {
+      static const int CACHELINE_SIZE = 128;
+      return (size + CACHELINE_SIZE - 1) / CACHELINE_SIZE * CACHELINE_SIZE;
+    }
+
+    /// Get the workspace size needed for barrier (one flag per SK-block or per reduction block, whichever count is larger)
+    size_t get_barrier_workspace_size() const
+    {
+      // For atomic reduction, each SK-block needs a synchronization flag.  For parallel reduction,
+      // each reduction block needs its own synchronization flag.
+      int sk_blocks = block_mapping.sk_regions() * block_mapping.sk_blocks_per_region();
+      int num_flags = fast_max(sk_blocks, block_mapping.reduction_blocks);
+
+      return cacheline_align_up(sizeof(typename Barrier::T) * num_flags);
+    }
+
+    /// Get the workspace size needed for intermediate partial sums (kWorkspaceBytesPerBlock per SK-block)
+    size_t get_partials_workspace_size() const
+    {
+      int sk_blocks = block_mapping.sk_regions() * block_mapping.sk_blocks_per_region();
+      return cacheline_align_up(kWorkspaceBytesPerBlock * sk_blocks);
+    }
+
+
+  public:
+
+    //
+    // Host dispatch API
+    //
+
+    /// Default constructor
+    Params() = default;
+
+    /// Constructor: copies the arguments and builds the block mapping; workspace pointers stay null until init_workspace()
+    Params(
+      Arguments const &args,  /// GEMM application arguments
+      int device_sms,         /// Number of SMs on the device
+      int sm_occupancy)       /// Kernel SM occupancy (in thread blocks)
+    :
+      ptr_A(const_cast<void *>(args.ptr_A)),
+      ptr_B(const_cast<void *>(args.ptr_B)),
+      params_A(args.lda),
+      params_B(args.ldb),
+      batch_stride_A(args.batch_stride_A),
+      batch_stride_B(args.batch_stride_B),
+      mode(args.mode),
+      barrier_workspace(nullptr),
+      partials_workspace(nullptr),
+      output_op(args.epilogue),
+      ptr_C1(const_cast<void *>(args.ptr_C1)),
+      ptr_C2(const_cast<void *>(args.ptr_C2)),
+      ptr_D(args.ptr_D),
+      ptr_Tensor(args.ptr_Tensor),
+      ptr_Vector(args.ptr_Vector),
+      params_C1(args.ldc1),
+      params_C2(args.ldc2),
+      params_D(args.ldd),
+      params_Tensor(args.ldt),
+      batch_stride_C1(args.batch_stride_C1),
+      batch_stride_C2(args.batch_stride_C2),
+      batch_stride_D(args.batch_stride_D),
+      batch_stride_Vector(args.batch_stride_Vector),
+      batch_stride_Tensor(args.batch_stride_Tensor),
+      ldr(args.ldr)
+    {
+      CUTLASS_TRACE_HOST("GemmStreamkWithFusedEpilogue::Params::Params()");
+      CUTLASS_TRACE_HOST("  ptr_Vector: " << (void *)this->ptr_Vector);
+      CUTLASS_TRACE_HOST("  ptr_Tensor: " << (void *)this->ptr_Tensor);
+      CUTLASS_TRACE_HOST("  ldr: " << this->ldr);
+      CUTLASS_TRACE_HOST("  ldt: " << args.ldt);
+
+      // Number of SMs to make available for StreamK decomposition (-1 means "use every SM"; otherwise capped at device_sms)
+      int avail_sms = (args.avail_sms == -1) ?
+                        device_sms :
+                        fast_min(args.avail_sms, device_sms);
+      CUTLASS_TRACE_HOST("  avail_sms: " << avail_sms);
+
+      // Initialize the block mapping structure
+      block_mapping = ThreadblockSwizzle(
+        args.mode,
+        args.problem_size,
+        {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+        args.batch_count,
+        sm_occupancy,
+        device_sms,
+        avail_sms,
+        sizeof(ElementA),
+        sizeof(ElementB),
+        sizeof(ElementC),
+        Epilogue::kAccumulatorFragments);
+    }
+
+    /// Returns the workspace size (in bytes) needed for these parameters
+    size_t get_workspace_size() const
+    {
+      return
+        get_barrier_workspace_size() +
+        get_partials_workspace_size();
+    }
+
+    /// Assign and initialize the specified workspace buffer (partials region first, barrier flags after it).
+    /// Assumes the memory allocated to workspace is at least as large as get_workspace_size().
+    Status init_workspace(
+      void *workspace,
+      cudaStream_t stream = nullptr)
+    {
+      uint8_t *ptr = static_cast<uint8_t*>(workspace);
+
+      // Establish partials workspace
+      partials_workspace = nullptr;
+      size_t partials_workspace_bytes = get_partials_workspace_size();
+      if (partials_workspace_bytes > 0)
+      {
+        if (!workspace) {
+          return Status::kErrorWorkspaceNull;
+        }
+        partials_workspace = ptr;
+        ptr += partials_workspace_bytes;
+      }
+
+      // Establish barrier workspace
+      barrier_workspace = nullptr;
+      size_t barrier_workspace_bytes = get_barrier_workspace_size();
+      if (barrier_workspace_bytes > 0)
+      {
+        if (!workspace) {
+          return Status::kErrorWorkspaceNull;
+        }
+        barrier_workspace = ptr;
+        ptr += barrier_workspace_bytes;
+      }
+
+      // Zero-initialize barrier workspace
+      if (barrier_workspace)
+      {
+        // barrier_workspace_bytes computed above is still in scope, so it is reused here
+        // instead of being recomputed into a second local that would shadow it.
+        CUTLASS_TRACE_HOST("  Initialize " << barrier_workspace_bytes << " barrier bytes");
+
+        cudaError_t result = cudaMemsetAsync(
+          barrier_workspace,
+          0,
+          barrier_workspace_bytes,
+          stream);
+
+        if (result != cudaSuccess) {
+          CUTLASS_TRACE_HOST("  cudaMemsetAsync() returned error " << cudaGetErrorString(result));
+          return Status::kErrorInternal;
+        }
+      }
+
+      return Status::kSuccess;
+    }
+
+
+    /// Returns the GEMM volume in thread block tiles
+    cutlass::gemm::GemmCoord get_tiled_shape() const
+    {
+      return block_mapping.tiled_shape();
+    }
+
+    /// Returns the total number of thread blocks to launch (product of the three grid extents)
+    int get_grid_blocks() const
+    {
+      dim3 grid_dims = get_grid_dims();
+      return grid_dims.x * grid_dims.y * grid_dims.z;
+    }
+
+    /// Returns the grid extents in thread blocks to launch
+    dim3 get_grid_dims() const
+    {
+      return block_mapping.get_grid_dims();
+    }
+
+    /// Lightweight update given a subset of arguments.  Problem geometry is assumed
+    /// to remain the same: iterator params and block_mapping are not rebuilt here.
+    CUTLASS_HOST_DEVICE
+    void update(Arguments const &args)
+    {
+      ptr_A = const_cast<void *>(args.ptr_A);
+      ptr_B = const_cast<void *>(args.ptr_B);
+      ptr_C1 = const_cast<void *>(args.ptr_C1);
+      ptr_C2 = const_cast<void *>(args.ptr_C2);
+      ptr_D = args.ptr_D;
+
+      ptr_Vector = args.ptr_Vector;
+      ldr = args.ldr;
+      ptr_Tensor = args.ptr_Tensor;
+
+      batch_stride_A = args.batch_stride_A;
+      batch_stride_B = args.batch_stride_B;
+      batch_stride_C1 = args.batch_stride_C1;
+      batch_stride_C2 = args.batch_stride_C2;
+      batch_stride_D = args.batch_stride_D;
+      batch_stride_Vector = args.batch_stride_Vector;
+      batch_stride_Tensor = args.batch_stride_Tensor;
+
+      output_op = args.epilogue;
+
+      CUTLASS_TRACE_HOST("GemmStreamkWithFusedEpilogue::Params::update()");
+      CUTLASS_TRACE_HOST("  ptr_Vector: " << (void *)this->ptr_Vector);
+      CUTLASS_TRACE_HOST("  ptr_Tensor: " << (void *)this->ptr_Tensor);
+      CUTLASS_TRACE_HOST("  ldr: " << this->ldr);
+    }
+  };
+
+  /// Tile work descriptor
+  struct TileWorkDesc
+  {
+    /// The linear tile index
+    int tile_idx;
+
+    /// The location of this tile (in threadblock-tile coordinates) in the output matrix
+    cutlass::gemm::GemmCoord tiled_coord;
+
+    /// The first global-scoped MAC-iteration this threadblock will perform for this tile
+    int iter_begin;
+
+    /// The starting index in the k-domain for MAC-iterations this threadblock will perform for this tile
+    int k_begin;
+
+    /// The ending index (one-past) in the k-domain for MAC-iterations this threadblock will perform for this tile
+    int k_end;
+
+    /// The number of remaining MAC-iterations this threadblock will perform for this tile
+    int k_iters_remaining;
+
+    /// Whether this block will perform the first iteration of this tile (k_begin == 0); const because it only reads
+    CUTLASS_DEVICE
+    bool tile_started() const
+    {
+      return (k_begin == 0);
+    }
+
+    /// Whether this block will perform the last iteration of this tile (k_end equals the problem's k extent); const because it only reads
+    CUTLASS_DEVICE
+    bool tile_finished(Params const &params) const
+    {
+      return (k_end == params.block_mapping.problem_size.k());
+    }
+  };
+
+
+  /// Shared memory storage structure
+  union SharedStorage {                          // A union: both members occupy the same storage
+    typename Mma::SharedStorage main_loop;       // Storage type required by the MMA
+    typename Epilogue::SharedStorage epilogue;   // Storage type required by the epilogue
+  };
+
+
+protected:
+
+  //
+  // Data members
+  //
+
+  /// GEMM problem parameters (held by const reference, not copied)
+  Params const &params;
+
+  /// Shared storage reference (the SharedStorage union; held by reference, not copied)
+  SharedStorage &shared_storage;
+
+  /// ID of this thread within the threadblock
+  int thread_idx;
+
+  /// ID of warp
+  int warp_idx;
+
+  /// ID of each thread within a warp
+  int lane_idx;
+
+  /// Threadblock scoped epilogue (the callable invoked at the end of do_epilogue())
+  Epilogue epilogue;
+
+
+public:
+
+  //
+  // Host dispatch API
+  //
+
+  /// Determines whether kernel satisfies alignment
+  static Status can_implement(
+    cutlass::gemm::GemmCoord const & problem_size) {
+
+    CUTLASS_TRACE_HOST("GemmStreamkWithFusedEpilogue::can_implement()");
+
+    // Per-access element counts of the A, B and C iterators; an operand is reported
+    // as misaligned when the extent checked below is not a multiple of its count.
+    static int const kAccessA = Mma::IteratorA::AccessType::kElements;
+    static int const kAccessB = Mma::IteratorB::AccessType::kElements;
+    static int const kAccessC = Epilogue::OutputTileIterator::kElementsPerAccess;
+
+    // Classify each operand's layout once.  Layouts matching none of the three
+    // categories are never reported as misaligned.
+    bool const a_row = platform::is_same<LayoutA, layout::RowMajor>::value;
+    bool const a_col = platform::is_same<LayoutA, layout::ColumnMajor>::value;
+    bool const a_ilv = platform::is_same<LayoutA, layout::ColumnMajorInterleaved<32>>::value
+                    || platform::is_same<LayoutA, layout::ColumnMajorInterleaved<64>>::value;
+
+    bool const b_row = platform::is_same<LayoutB, layout::RowMajor>::value;
+    bool const b_col = platform::is_same<LayoutB, layout::ColumnMajor>::value;
+    bool const b_ilv = platform::is_same<LayoutB, layout::RowMajorInterleaved<32>>::value
+                    || platform::is_same<LayoutB, layout::RowMajorInterleaved<64>>::value;
+
+    bool const c_row = platform::is_same<LayoutC, layout::RowMajor>::value;
+    bool const c_col = platform::is_same<LayoutC, layout::ColumnMajor>::value;
+    bool const c_ilv = platform::is_same<LayoutC, layout::ColumnMajorInterleaved<32>>::value
+                    || platform::is_same<LayoutC, layout::ColumnMajorInterleaved<64>>::value;
+
+    // A: row-major and interleaved layouts are checked along k, column-major along m.
+    bool const a_misaligned = (a_row || a_ilv)
+      ? (problem_size.k() % kAccessA) != 0
+      : (a_col && (problem_size.m() % kAccessA) != 0);
+
+    // B: row-major is checked along n, column-major and interleaved along k.
+    bool const b_misaligned = b_row
+      ? (problem_size.n() % kAccessB) != 0
+      : ((b_col || b_ilv) && (problem_size.k() % kAccessB) != 0);
+
+    // C: row-major and interleaved layouts are checked along n, column-major along m.
+    bool const c_misaligned = (c_row || c_ilv)
+      ? (problem_size.n() % kAccessC) != 0
+      : (c_col && (problem_size.m() % kAccessC) != 0);
+
+    // Report the first misaligned operand, in A, B, C order.
+    if (a_misaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for A operand");
+      return Status::kErrorMisalignedOperand;
+    }
+    if (b_misaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for B operand");
+      return Status::kErrorMisalignedOperand;
+    }
+    if (c_misaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for C operand");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    CUTLASS_TRACE_HOST("  returning kSuccess");
+    return Status::kSuccess;
+  }
+
+  /// Overload taking the full argument structure; only args.problem_size is inspected.
+  static Status can_implement(Arguments const &args) { return can_implement(args.problem_size); }
+
+protected:
+
+  //
+  // Device-only utility methods
+  //
+
+  /// Iterator for fetching tile fragments from A
+  CUTLASS_DEVICE
+  typename Mma::IteratorA init_iterator_A(
+    TileWorkDesc &tile_work,
+    GemmUniversalMode mode)
+  {
+    // Start from the A pointer recorded in the kernel parameters.
+    ElementA *base_A = static_cast<ElementA *>(params.ptr_A);
+
+    if (mode == GemmUniversalMode::kBatched) {
+      // Batched: advance by batch_stride_A for each unit of the tile's k coordinate.
+      base_A += tile_work.tiled_coord.k() * params.batch_stride_A;
+    }
+    else if (mode == GemmUniversalMode::kArray) {
+      // Array: params.ptr_A is read as a table of pointers indexed by the tile's k coordinate.
+      base_A = static_cast<ElementA * const *>(params.ptr_A)[tile_work.tiled_coord.k()];
+    }
+
+    // Extent handed to the iterator is (problem m, tile k_end); its starting offset is (row_begin, tile k_begin).
+    int const row_begin = tile_work.tiled_coord.m() * Mma::Shape::kM;
+    int const row_limit = params.block_mapping.problem_size.m();
+    typename Mma::IteratorA iterator_A(
+        params.params_A, base_A,
+        { row_limit, tile_work.k_end }, threadIdx.x,
+        { row_begin, tile_work.k_begin });
+    return iterator_A;
+  }
+
+
+  /// Iterator for fetching tile fragments from B
+  CUTLASS_DEVICE
+  typename Mma::IteratorB init_iterator_B(
+    TileWorkDesc &tile_work,
+    GemmUniversalMode mode)
+  {
+    // Start from the B pointer recorded in the kernel parameters.
+    ElementB *base_B = static_cast<ElementB *>(params.ptr_B);
+    if (mode == GemmUniversalMode::kBatched) {
+      // Batched: advance by batch_stride_B for each unit of the tile's k coordinate.
+      base_B += tile_work.tiled_coord.k() * params.batch_stride_B;
+    }
+    else if (mode == GemmUniversalMode::kArray) {
+      // Array: params.ptr_B is read as a table of pointers indexed by the tile's k coordinate.
+      base_B = static_cast<ElementB * const *>(params.ptr_B)[tile_work.tiled_coord.k()];
+    }
+
+    // Extent handed to the iterator is (tile k_end, problem n); its starting offset is (tile k_begin, col_begin).
+    int const col_begin = tile_work.tiled_coord.n() * Mma::Shape::kN;
+    int const col_limit = params.block_mapping.problem_size.n();
+    typename Mma::IteratorB iterator_B(
+        params.params_B, base_B,
+        { tile_work.k_end, col_limit }, threadIdx.x,
+        { tile_work.k_begin, col_begin });
+    return iterator_B;
+  }
+
+
+  CUTLASS_DEVICE
+  void init_dp_tile_work(
+      TileWorkDesc &tile_work,
+      int tile_idx)
+  {
+    // Describes tile tile_idx as a whole: all of its MAC-iterations and the full
+    // k extent of the problem are assigned to the calling threadblock
+    // ("dp" presumably stands for data-parallel).
+    ThreadblockSwizzle const &mapping = params.block_mapping;
+    int const iters_per_tile = mapping.iters_per_tile();
+
+    // Identity and output location (threadblock-tile coordinates) of the tile.
+    tile_work.tile_idx = tile_idx;
+    tile_work.tiled_coord = mapping.get_tile_offset(tile_idx);
+
+    // Global MAC-iteration range: starts at the tile's first iteration and covers all of them.
+    tile_work.iter_begin = tile_idx * iters_per_tile;
+    tile_work.k_iters_remaining = iters_per_tile;
+
+    // k-domain range: from 0 up to (one past) the end of the problem's k extent.
+    tile_work.k_begin = 0;
+    tile_work.k_end = mapping.problem_size.k();
+  }
+
+
+  CUTLASS_DEVICE
+  void init_sk_tile_work(
+      TileWorkDesc &tile_work,
+      int tile_idx,
+      int block_iter_begin,
+      int block_iter_end)
+  {
+    // Fills tile_work for the part of tile tile_idx that lies inside this block's
+    // global MAC-iteration range, which ends (one past) at block_iter_end.
+
+    ThreadblockSwizzle const &mapping = params.block_mapping;
+
+    // Identity and output location (threadblock-tile coordinates) of the tile.
+    tile_work.tile_idx = tile_idx;
+    tile_work.tiled_coord = mapping.get_tile_offset(tile_idx);
+
+    // Global index of the tile's first MAC-iteration.
+    int const tile_first_iter = tile_idx * mapping.iters_per_tile();
+
+    // This block starts at its own first iteration or at the tile's first one,
+    // whichever comes later.
+    tile_work.iter_begin = max(block_iter_begin, tile_first_iter);
+
+    // Re-express the block's range relative to the start of the tile:
+    // [local_first, local_end) in tile-scoped MAC-iterations.
+    int const local_first = tile_work.iter_begin - tile_first_iter;
+    int const local_end = block_iter_end - tile_first_iter;
+    tile_work.k_iters_remaining = local_end - local_first;
+
+    // Convert iterations to k indices (Mma::Shape::kK per iteration); the end is
+    // clamped to the problem's k extent.
+    tile_work.k_begin = local_first * Mma::Shape::kK;
+    tile_work.k_end = min(
+        mapping.problem_size.k(),
+        local_end * Mma::Shape::kK);
+  }
+
+
+  /// Share accumulators with peers
+  CUTLASS_DEVICE
+  void share_accumulators(
+    AccumulatorTile const &accumulator_tile,   // This block's accumulators to publish to the partials workspace
+    int block_idx,                              // Index of the calling block
+    int first_block_idx)                        // Index of the first peer; selects both the partials slot and the barrier flag
+  {
+    AccumulatorTile *accum_tile_workspace = reinterpret_cast<AccumulatorTile *>(params.partials_workspace);
+
+    int accum_tile_offset = first_block_idx * kThreadCount;   // Slot starts first_block_idx * kThreadCount tiles into the workspace
+
+    if (block_idx == first_block_idx)
+    {
+      // First peer initializes the workspace partials (plain store, no waiting)
+      BlockStripedReduceT::store(accum_tile_workspace + accum_tile_offset, accumulator_tile, thread_idx);
+    }
+    else
+    {
+      // Subsequent peers accumulate into the workspace partials after waiting on the first peer's flag
+      if (ThreadblockSwizzle::kReductionStrategy == ThreadblockSwizzle::kAtomic)
+      {
+        // Non-deterministic reduction order: wait for the first peer to have initialized the partials before we add to them
+        Barrier::wait_lt(params.barrier_workspace, thread_idx, first_block_idx, 1);
+      }
+      else
+      {
+        // Turnstile reduction order: wait until the previous peer has written (flag equals our distance from the first peer)
+        int wait_count = block_idx - first_block_idx;
+        Barrier::wait_eq(params.barrier_workspace, thread_idx, first_block_idx, wait_count);
+      }
+
+      // Perform reduction in workspace
+      BlockStripedReduceT::reduce(accum_tile_workspace + accum_tile_offset, accumulator_tile, thread_idx);
+    }
+
+    // Signal our arrival on the first peer's flag (done by every peer, including the first)
+    Barrier::arrive_inc(params.barrier_workspace, thread_idx, first_block_idx);
+  }
+
+
+  /// Acquire accumulators from peers
+  CUTLASS_DEVICE
+  void acquire_accumulators(
+    AccumulatorTile &accumulator_tile,     // In/out: the peers' partials are added into this tile
+    int block_idx,                          // Index of the calling block
+    int first_block_idx)                    // Index of the first peer; selects both the barrier flag and the partials slot
+  {
+    AccumulatorTile *accum_tile_workspace = reinterpret_cast<AccumulatorTile *>(params.partials_workspace);
+
+    // Wait for arrival: the flag must equal block_idx - first_block_idx (share_accumulators() increments it once per peer)
+    int num_carry_in = block_idx - first_block_idx;
+    Barrier::wait_eq_reset(params.barrier_workspace, thread_idx, first_block_idx, num_carry_in);   // presumably also resets the flag, per its name -- TODO confirm
+
+    // Load and add peer-partials accumulator tile to local accumulator tile
+    int accum_tile_offset = first_block_idx * kThreadCount;   // Same slot computation as in share_accumulators()
+    BlockStripedReduceT::load_add(accumulator_tile, accum_tile_workspace + accum_tile_offset, thread_idx);
+  }
+
+
+  /// Perform epilogue computations and output
+  CUTLASS_DEVICE
+  void do_epilogue(
+    TileWorkDesc &tile_work,                 // Tile being written; only its tiled_coord (m, n, k) is read here
+    AccumulatorTile &accumulator_tile)       // Accumulators passed to the epilogue for this tile
+  {
+    ElementC *ptr_C1 = static_cast<ElementC *>(params.ptr_C1);
+    ElementC *ptr_C2 = static_cast<ElementC *>(params.ptr_C2);
+    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
+    typename Epilogue::ElementTensor *ptr_Tensor = static_cast<typename Epilogue::ElementTensor *>(params.ptr_Tensor);
+
+    // Define the vector pointer (NOTE(review): called "reduction output" here but "bias vector" in the file header -- confirm)
+    typename Epilogue::ElementVector *ptr_Vector =
+      static_cast<typename Epilogue::ElementVector *>(params.ptr_Vector);
+
+    // Update pointers for batched/array mode(s); the tile's k coordinate is used as the batch index
+    if (params.mode == GemmUniversalMode::kBatched) {   // Batched: advance each pointer by k * its batch stride
+      ptr_C1 += tile_work.tiled_coord.k() * params.batch_stride_C1;
+      if (ptr_C2) {   // C2 is only offset when present
+        ptr_C2 += tile_work.tiled_coord.k() * params.batch_stride_C2;
+      }
+      ptr_D += tile_work.tiled_coord.k() * params.batch_stride_D;
+      if (ptr_Tensor) {   // Tensor is only offset when present
+        ptr_Tensor = ReferenceFactory<typename Epilogue::ElementTensor>::add_pointer_offset(
+          ptr_Tensor,
+          tile_work.tiled_coord.k() * params.batch_stride_Tensor);
+      }
+      if (ptr_Vector) {   // Vector is only offset when present
+        ptr_Vector += tile_work.tiled_coord.k() * params.batch_stride_Vector;
+      }
+    }
+    if (params.mode == GemmUniversalMode::kArray) {   // Array: each params pointer is read as a table of pointers indexed by k
+      ptr_C1 = static_cast<ElementC * const *>(params.ptr_C1)[tile_work.tiled_coord.k()];
+      if (ptr_C2) {
+        ptr_C2 = static_cast<ElementC * const *>(params.ptr_C2)[tile_work.tiled_coord.k()];
+      }
+      ptr_D = static_cast<ElementC * const *>(params.ptr_D)[tile_work.tiled_coord.k()];
+      if (ptr_Tensor) {
+        ptr_Tensor = static_cast<typename Epilogue::ElementTensor * const *>(params.ptr_Tensor)[tile_work.tiled_coord.k()];
+      }
+      if (ptr_Vector) {
+        ptr_Vector = static_cast<typename Epilogue::ElementVector * const *>(params.ptr_Vector)[tile_work.tiled_coord.k()];
+      }
+    }
+
+    // Location of this tile in item-coords (tile coordinates scaled by the threadblock shape kM x kN)
+    MatrixCoord threadblock_item_begin(
+      tile_work.tiled_coord.m() * Mma::Shape::kM,
+      tile_work.tiled_coord.n() * Mma::Shape::kN
+    );
+
+    // Tile iterator loading from residual1.
+    typename Epilogue::OutputTileIterator iterator_C1(
+        params.params_C1,
+        ptr_C1,
+        params.block_mapping.problem_size.mn(),
+        thread_idx,
+        threadblock_item_begin);
+
+    // Tile iterator loading from residual2.
+    typename Epilogue::OutputTileIterator iterator_C2(
+        params.params_C2,
+        ptr_C2,
+        params.block_mapping.problem_size.mn(),
+        thread_idx,
+        threadblock_item_begin);
+
+    // Tile iterator writing to destination tensor.
+    typename Epilogue::OutputTileIterator iterator_D(
+        params.params_D,
+        ptr_D,
+        params.block_mapping.problem_size.mn(),
+        thread_idx,
+        threadblock_item_begin);
+
+    // Additional tensor to load from
+    typename Epilogue::TensorTileIterator tensor_iterator(
+        params.params_Tensor,
+        ptr_Tensor,
+        params.block_mapping.problem_size.mn(),
+        thread_idx,
+        threadblock_item_begin);
+
+    // Move to appropriate location for this output tile: the tile's first column plus (tile row index * ldr)
+    if (ptr_Vector) {
+      ptr_Vector += threadblock_item_begin.column() + tile_work.tiled_coord.m() * params.ldr;
+    }
+
+    // Execute the epilogue operator to update the destination tensor.
+    epilogue(
+        EpilogueOutputOp(params.output_op),
+        ptr_Vector,
+        iterator_D,
+        accumulator_tile,
+        iterator_C1,
+        iterator_C2,
+        tensor_iterator,
+        params.block_mapping.problem_size.mn(),
+        threadblock_item_begin);
+  }
+
+
+  CUTLASS_DEVICE  // Entry point for reduction threadblocks; gemm() passes the block's index relative to the first reduction block
+  void separate_reduction(int reduce_idx)
+  {
+    int peer_idx_begin, peer_idx_last, reduce_tile_idx, reduce_fragment_idx;
+
+    // Reduce by sk-tile (every tile contributed to by one or more blocks): reduce_idx enumerates (tile, accumulator-fragment) pairs
+    reduce_tile_idx = reduce_idx / Epilogue::kAccumulatorFragments;
+    reduce_fragment_idx = reduce_idx % Epilogue::kAccumulatorFragments;
+
+    int iter_tile_first = reduce_tile_idx * params.block_mapping.iters_per_tile();  // first global MAC-iteration of this tile
+    int iter_tile_last = iter_tile_first + params.block_mapping.iters_per_tile() - 1;  // last global MAC-iteration of this tile (inclusive)
+
+    peer_idx_begin = params.block_mapping.get_sk_block_idx(iter_tile_first);  // block index mapped to the tile's first iteration
+    peer_idx_last = params.block_mapping.get_sk_block_idx(iter_tile_last);  // block index mapped to the tile's last iteration
+
+    // Wait for peers to complete: the flag at index reduce_idx is compared with the number of blocks in [peer_idx_begin, peer_idx_end)
+    int peer_idx_end = peer_idx_last + 1;
+    int num_peers = peer_idx_end - peer_idx_begin;
+    Barrier::wait_eq_reset(
+        params.barrier_workspace,
+        thread_idx,
+        (reduce_tile_idx * Epilogue::kAccumulatorFragments) + reduce_fragment_idx,
+        num_peers);
+
+    /// The location of this tile (in threadblock-tile coordinates) in the output matrix
+    GemmCoord tiled_coord = params.block_mapping.get_tile_offset(reduce_tile_idx);
+
+    // Location of this tile in item-coords: tile coordinates scaled by the threadblock tile shape (kM rows, kN columns)
+    MatrixCoord threadblock_item_begin(
+      tiled_coord.m() * Mma::Shape::kM,
+      tiled_coord.n() * Mma::Shape::kN
+    );
+
+    ElementC *ptr_C1 = static_cast<ElementC *>(params.ptr_C1);  // NOTE(review): unlike do_epilogue, no batched/array-mode pointer adjustment is applied in this path
+    ElementC *ptr_C2 = static_cast<ElementC *>(params.ptr_C2);
+    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
+    typename Epilogue::ElementTensor *ptr_Tensor = static_cast<typename Epilogue::ElementTensor *>(params.ptr_Tensor);
+
+    // Base of the auxiliary vector handed to the epilogue; it is offset to this tile further down, and only when non-null
+    typename Epilogue::ElementVector *ptr_Vector =
+      static_cast<typename Epilogue::ElementVector *>(params.ptr_Vector);
+
+    // Tile iterator loading from residual1.
+    typename Epilogue::OutputTileIterator iterator_C1(
+        params.params_C1,
+        ptr_C1,
+        params.block_mapping.problem_size.mn(),
+        thread_idx,
+        threadblock_item_begin);
+
+    // Tile iterator loading from residual2.
+    typename Epilogue::OutputTileIterator iterator_C2(
+        params.params_C2,
+        ptr_C2,
+        params.block_mapping.problem_size.mn(),
+        thread_idx,
+        threadblock_item_begin);
+
+    // Tile iterator writing to destination tensor.
+    typename Epilogue::OutputTileIterator iterator_D(
+        params.params_D,
+        ptr_D,
+        params.block_mapping.problem_size.mn(),
+        thread_idx,
+        threadblock_item_begin);
+
+    // Additional tensor to load from
+    typename Epilogue::TensorTileIterator tensor_iterator(
+        params.params_Tensor,
+        ptr_Tensor,
+        params.block_mapping.problem_size.mn(),
+        thread_idx,
+        threadblock_item_begin);
+
+    // Move to appropriate location for this output tile: the tile's first column plus one ldr stride per tile row
+    if (ptr_Vector) {
+      ptr_Vector += threadblock_item_begin.column() + tiled_coord.m() * params.ldr;
+    }
+
+    // Execute the epilogue operator to update the destination tensor, reducing the partials shared by the peer blocks for this fragment
+    epilogue.reduce(
+        peer_idx_begin,
+        peer_idx_end,
+        reduce_fragment_idx,
+        params.partials_workspace,
+        EpilogueOutputOp(params.output_op),
+        ptr_Vector,
+        iterator_D,
+        iterator_C1,
+        iterator_C2,
+        tensor_iterator,
+        params.block_mapping.problem_size.mn(),
+        threadblock_item_begin);
+  }
+
+
+  CUTLASS_DEVICE  // Runs this block's MAC-iterations for one tile, then either finishes the tile or publishes its partial sums
+  void process_tile(
+    TileWorkDesc tile_work,
+    int block_idx,
+    int dp_start_block_idx,
+    int block_iter_begin)  // not referenced in this body
+  {
+    // Initialize input iterators over A and B for the k-range recorded in tile_work
+    typename Mma::IteratorA iterator_A = init_iterator_A(tile_work, params.mode);
+    typename Mma::IteratorB iterator_B = init_iterator_B(tile_work, params.mode);
+
+    // Initialize accumulators to zero
+    AccumulatorTile accumulator_tile;
+    accumulator_tile.clear();
+
+    // Initialize MMA abstraction on the main-loop member of the shared storage
+    Mma mma(
+      shared_storage.main_loop,
+      thread_idx,
+      warp_idx,
+      lane_idx);
+
+    // Perform this tile's range of multiply-accumulate (MAC) iterations; accumulator_tile is both the source and the destination
+    mma(tile_work.k_iters_remaining, accumulator_tile, iterator_A, iterator_B, accumulator_tile);
+
+    if ((ThreadblockSwizzle::kReductionStrategy == ThreadblockSwizzle::kAtomic) ||
+        (params.block_mapping.reduction_blocks == 0) ||
+        (block_idx >= dp_start_block_idx))
+    {
+      //
+      // Cooperative SK peer reduction or DP block
+      //
+
+      int first_block_idx = params.block_mapping.get_first_block_idx(tile_work.tile_idx, block_idx);
+
+      if (!tile_work.tile_finished(params)) {
+        // Non "finishing" SK blocks must share their partial accumulator sums through global scratch workspace
+        share_accumulators(accumulator_tile, block_idx, first_block_idx);
+      }
+      else
+      {
+        // DP blocks and "finishing" SK blocks must perform epilogue operations and write the output tile
+        if (!tile_work.tile_started())
+        {
+          // A "finishing" SK block must first aggregate its accumulator partial sums with those shared by peer threadblocks
+          acquire_accumulators(accumulator_tile, block_idx, first_block_idx);
+        }
+
+        do_epilogue(tile_work, accumulator_tile);
+      }
+    }
+    else
+    {
+      //
+      // Separate peer reduction (the epilogue for this tile is run later by separate_reduction)
+      //
+
+      // Share accumulator partial sums with peer threadblock(s) through scratch workspace
+      epilogue.share(block_idx, params.partials_workspace, accumulator_tile, tile_work.tile_started());
+
+      // Signal arrival on every per-fragment flag of this tile; separate_reduction waits on these flags
+      Barrier::arrive_range_inc(
+        params.barrier_workspace,
+        thread_idx,
+        tile_work.tile_idx * Epilogue::kAccumulatorFragments,
+        Epilogue::kAccumulatorFragments);
+    }
+  }
+
+
+  /// Executes one GEMM: classifies this threadblock as DP, SK or reduction from its block index, then processes its tiles
+  CUTLASS_DEVICE
+  void gemm()
+  {
+    // Initialize block's iteration range
+    int tile_idx = 0;
+    int block_iter_begin = 0;
+    int block_iters_remaining = 0;
+
+    int block_idx = params.block_mapping.get_block_idx();
+
+    int sk_padding_start_block_idx =  params.block_mapping.sk_regions() * params.block_mapping.sk_blocks_per_region();  // one past the last SK block
+    int dp_start_block_idx = params.block_mapping.sk_waves * params.block_mapping.avail_sms;  // first DP block
+    int reduce_start_block_idx = dp_start_block_idx + params.block_mapping.dp_blocks;  // first reduction block
+    int grid_padding_start_block_idx = reduce_start_block_idx + params.block_mapping.reduction_blocks;  // one past the last reduction block
+
+    // Initialize tile work descriptor
+    TileWorkDesc tile_work;
+
+    bool dp_block = (block_idx >= dp_start_block_idx) && (block_idx < reduce_start_block_idx);
+    bool sk_block = (block_idx < sk_padding_start_block_idx);
+    bool reduce_block = (block_idx >= reduce_start_block_idx) &&
+            (block_idx < grid_padding_start_block_idx) &&
+            (ThreadblockSwizzle::kReductionStrategy == ThreadblockSwizzle::kMixed);
+
+    if (dp_block)
+    {
+      // This is a DP block
+      int dp_block_idx = block_idx - dp_start_block_idx;
+      int first_dp_tile = (params.block_mapping.cohort_raster) ? 0 : params.block_mapping.sk_tiles;
+
+      // Blocks in first DP wave get configured number of tiles
+      tile_idx = first_dp_tile + dp_block_idx;
+      int tile_allottment = params.block_mapping.dp_first_wave_tiles;
+
+      // Blocks in subsequent DP waves get 1 tile, positioned after all tiles consumed by the first wave
+      if (dp_block_idx >= params.block_mapping.avail_sms) {
+          tile_allottment = 1;
+          tile_idx += (params.block_mapping.dp_first_wave_tiles - 1) * params.block_mapping.avail_sms;
+      }
+
+      block_iters_remaining = params.block_mapping.iters_per_tile() * tile_allottment;
+
+      init_dp_tile_work(tile_work, tile_idx);
+
+      // DP blocks exit if out of bounds or overlap an SK tile (only possible during cohort rasterization, where dp_first_wave_tiles must be 1)
+      if ((tile_idx < params.block_mapping.sk_tiles) ||
+          (tile_work.tiled_coord.m() >= params.block_mapping.tiled_shape().m()) ||
+          (tile_work.tiled_coord.n() >= params.block_mapping.tiled_shape().n()))
+      {
+        return;
+      }
+    }
+    else if (sk_block)
+    {
+      // This is a SK block: it owns the global iteration range [block_iter_begin, block_iter_end)
+      int block_iter_end;
+      params.block_mapping.get_iter_extents(block_idx, block_iter_begin, block_iter_end);
+      block_iters_remaining = block_iter_end - block_iter_begin;
+
+      tile_idx = params.block_mapping.get_sk_tile_idx(block_iter_end - 1);  // start with the tile containing the block's last iteration
+      init_sk_tile_work(tile_work, tile_idx, block_iter_begin, block_iter_begin + block_iters_remaining);
+    }
+    else
+    {
+      if (reduce_block)
+      {
+        // This is a reduction threadblock
+        int reduce_block_idx = block_idx - reduce_start_block_idx;
+        separate_reduction(reduce_block_idx);
+      }
+
+      return;  // reduction blocks are done; any other block reaching here has no work
+    }
+
+    // Iteration-processing loop body
+    CUTLASS_PRAGMA_NO_UNROLL
+    while (true)
+    {
+      // Perform this block's share of work for this tile
+      process_tile(
+        tile_work,
+        block_idx,
+        dp_start_block_idx,
+        block_iter_begin);
+
+      block_iters_remaining -= tile_work.k_iters_remaining;
+
+      if (block_iters_remaining == 0)
+      {
+        break;
+      }
+
+      // Continue to next tile
+      __syncthreads();
+
+      if (block_idx >= dp_start_block_idx)
+      {
+        // DP block consume their tiles at stride
+        tile_idx += params.block_mapping.avail_sms;
+        init_dp_tile_work(tile_work, tile_idx);
+      }
+      else
+      {
+        // SK blocks consume their tiles in backwards order; the remaining iteration count shrinks the end of the range
+        tile_idx--;
+        init_sk_tile_work(tile_work, tile_idx, block_iter_begin, block_iter_begin + block_iters_remaining);
+      }
+    }
+
+  }
+
+
+public:
+
+  //
+  // Device-only API
+  //
+
+  /// Factory invocation: binds a kernel object to `params` and `shared_storage`, then runs it once
+  CUTLASS_DEVICE
+  static void invoke(
+    Params const &params,
+    SharedStorage &shared_storage)
+  {
+    GemmStreamkWithFusedEpilogue kernel_instance(params, shared_storage);
+    kernel_instance();
+  }
+
+
+  /// Constructor: keeps references to `params` and `shared_storage`, derives the thread/warp/lane IDs and builds the epilogue
+  CUTLASS_DEVICE
+  GemmStreamkWithFusedEpilogue(
+      Params const &params,
+      SharedStorage &shared_storage)
+    :
+      params(params),
+      shared_storage(shared_storage),
+      thread_idx(threadIdx.x),
+      warp_idx(__shfl_sync(0xffffffff, threadIdx.x / 32, 0)),   // broadcast the warp_id computed by lane 0 so every lane of the warp holds the same value
+      lane_idx(threadIdx.x % 32),
+      epilogue(
+        shared_storage.epilogue,
+        thread_idx,
+        warp_idx,
+        lane_idx)
+  {}
+
+  /// Executes one GEMM by delegating to gemm()
+  CUTLASS_DEVICE
+  void operator()() {
+    // Generic SK code path (the only path implemented here)
+    gemm();
+
+  }
+};
+
+
+// GemmStreamkWithFusedEpilogue with one source
+template <
+  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate
+  typename Epilogue_,             ///! Epilogue
+  typename ThreadblockSwizzle_    ///! Threadblock swizzling function
+>
+struct GemmStreamkWithFusedEpilogue<Mma_, Epilogue_, ThreadblockSwizzle_, true> {
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using EpilogueOutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+
+  using ElementA = typename Mma::IteratorA::Element;
+  using LayoutA = typename Mma::IteratorA::Layout;
+  using ElementB = typename Mma::IteratorB::Element;
+  using LayoutB = typename Mma::IteratorB::Layout;
+  using ElementC = typename Epilogue::OutputTileIterator::Element;
+  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
+
+  /// The per-thread tile of raw accumulators
+  using AccumulatorTile = typename Mma::FragmentC;
+
+  static ComplexTransform const kTransformA = Mma::kTransformA;
+  static ComplexTransform const kTransformB = Mma::kTransformB;
+  using Operator = typename Mma::Operator;
+
+  using OperatorClass = typename Mma::Operator::OperatorClass;
+  using ThreadblockShape = typename Mma::Shape;
+  using WarpShape = typename Mma::Operator::Shape;
+  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
+  using ArchTag = typename Mma::ArchTag;
+
+  static int const kStages = Mma::kStages;
+  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;  // elements per vectorized access of A; checked by can_implement()
+  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;  // elements per vectorized access of B; checked by can_implement()
+  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;  // elements per access of the output tile iterator
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;  // threads per threadblock: 32 per warp
+
+  /// Workspace bytes per thread block: the larger of one AccumulatorTile per thread and the epilogue's own per-block requirement
+  static size_t const kWorkspaceBytesPerBlock =
+    __NV_STD_MAX(
+      kThreadCount * sizeof(AccumulatorTile),
+      Epilogue::kWorkspaceBytesPerBlock);
+
+  /// Block-striped reduction utility
+  using BlockStripedReduceT = BlockStripedReduce<kThreadCount, AccumulatorTile>;
+
+
+
+  //
+  // Structures
+  //
+
+  /// Argument structure
+  struct Arguments
+  {
+
+    //
+    // Data members
+    //
+
+    GemmUniversalMode mode{GemmUniversalMode::kGemm};
+    GemmCoord problem_size{};
+    int batch_count{1};        // Either (mode == GemmUniversalMode::kBatched) the batch count, or (mode == GemmUniversalMode::kGemm) the tile-splitting factor
+
+    typename EpilogueOutputOp::Params epilogue{};  // copied into Params::output_op
+
+    void const * ptr_A{nullptr};  // in kArray mode the kernel reinterprets the operand pointers as arrays of per-batch pointers
+    void const * ptr_B{nullptr};
+    void const * ptr_C{nullptr};
+    void * ptr_D{nullptr};
+
+    void * ptr_Vector{nullptr};  // auxiliary vector pointer, copied verbatim into Params
+    void * ptr_Tensor{nullptr};  // auxiliary tensor pointer, copied verbatim into Params
+
+    int64_t batch_stride_A{0};  // batch strides are multiplied by the batch index in kBatched mode
+    int64_t batch_stride_B{0};
+    int64_t batch_stride_C{0};
+    int64_t batch_stride_D{0};
+    int64_t batch_stride_Vector{0};
+    int64_t batch_stride_Tensor{0};
+
+    typename LayoutA::Stride::Index lda{};  // lda/ldb/ldc/ldd/ldt initialize the matching iterator params in Params
+    typename LayoutB::Stride::Index ldb{};
+    typename LayoutC::Stride::Index ldc{};
+    typename LayoutC::Stride::Index ldd{};
+    typename LayoutC::Stride::Index ldr{};  // stride associated with ptr_Vector; copied verbatim into Params::ldr
+    typename LayoutC::Stride::Index ldt{};
+
+    int avail_sms{-1};          /// The number of SMs that StreamK dispatch heuristics will attempt to load-balance across (-1 defaults to device width, 1 implies classic data-parallel scheduling)
+
+
+    //
+    // Methods
+    //
+
+    /// Default Constructor
+    Arguments() = default;
+
+    /// Constructs an arguments structure: each parameter is stored in the member of the same name (batch_split is stored as batch_count)
+    Arguments(
+      GemmUniversalMode mode,
+      GemmCoord problem_size,
+      int batch_split,                              /// Either (mode == GemmUniversalMode::kBatched) the batch count, or (mode == GemmUniversalMode::kGemm) the tile-splitting factor (1 defaults to StreamK, >1 emulates Split-K)
+      typename EpilogueOutputOp::Params epilogue,
+      void const * ptr_A,
+      void const * ptr_B,
+      void const * ptr_C,
+      void * ptr_D,
+      void * ptr_Vector,
+      void * ptr_Tensor,
+      int64_t batch_stride_A,
+      int64_t batch_stride_B,
+      int64_t batch_stride_C,
+      int64_t batch_stride_D,
+      int64_t batch_stride_Vector,
+      int64_t batch_stride_Tensor,
+      typename LayoutA::Stride::Index lda,
+      typename LayoutB::Stride::Index ldb,
+      typename LayoutC::Stride::Index ldc,
+      typename LayoutC::Stride::Index ldd,
+      typename LayoutC::Stride::Index ldr,
+      typename LayoutC::Stride::Index ldt,
+      int avail_sms = -1)                           /// The number of SMs that StreamK dispatch heuristics will attempt to load-balance across (-1 defaults to device width, 1 implies classic data-parallel scheduling)
+    :
+      mode(mode),
+      problem_size(problem_size),
+      batch_count(batch_split),
+      epilogue(epilogue),
+      ptr_A(ptr_A), ptr_B(ptr_B), ptr_C(ptr_C), ptr_D(ptr_D),
+      ptr_Vector(ptr_Vector), ptr_Tensor(ptr_Tensor),
+      batch_stride_A(batch_stride_A),
+      batch_stride_B(batch_stride_B),
+      batch_stride_C(batch_stride_C),
+      batch_stride_D(batch_stride_D),   // previously omitted, which left the member at 0 regardless of the argument
+      batch_stride_Vector(batch_stride_Vector),
+      batch_stride_Tensor(batch_stride_Tensor),
+      lda(lda), ldb(ldb), ldc(ldc), ldd(ldd), ldr(ldr), ldt(ldt), avail_sms(avail_sms)
+    {
+      CUTLASS_TRACE_HOST("GemmStreamkWithFusedEpilogue::Arguments::Arguments() - problem_size: " << problem_size);
+      CUTLASS_TRACE_HOST("  ptr_Vector: " << (void *)this->ptr_Vector);
+      CUTLASS_TRACE_HOST("  ptr_Tensor: " << (void *)this->ptr_Tensor);
+      CUTLASS_TRACE_HOST("  ldr: " << this->ldr);
+      CUTLASS_TRACE_HOST("  ldt: " << this->ldt);
+      CUTLASS_TRACE_HOST("  avail_sms: " << this->avail_sms);
+    }
+
+    /// Returns a copy of these arguments with the M/N extents and the A/B operand descriptions exchanged
+    Arguments transposed_problem() const {
+      Arguments flipped = *this;
+
+      // A and B trade places: pointer, leading dimension and batch stride
+      std::swap(flipped.ptr_A, flipped.ptr_B);
+      std::swap(flipped.lda, flipped.ldb);
+      std::swap(flipped.batch_stride_A, flipped.batch_stride_B);
+      std::swap(flipped.problem_size.m(), flipped.problem_size.n());
+      return flipped;
+    }
+  };
+
+
+  /// Parameters structure
+  struct Params
+  {
+
+  public:
+
+    //
+    // Data members
+    //
+
+    void * ptr_A{nullptr};  // A operand; the constructor and update() cast away the const of Arguments::ptr_A
+    void * ptr_B{nullptr};  // B operand; same treatment as ptr_A
+
+    typename Mma::IteratorA::Params params_A{};  // built from Arguments::lda
+    typename Mma::IteratorB::Params params_B{};  // built from Arguments::ldb
+
+    int64_t batch_stride_A{0};
+    int64_t batch_stride_B{0};
+
+    GemmUniversalMode mode{GemmUniversalMode::kGemm};
+
+    ThreadblockSwizzle block_mapping{};  // assigned in the constructor body from the problem shape and SM counts
+
+    void *barrier_workspace{nullptr};  // assigned and zero-filled by init_workspace()
+    void *partials_workspace{nullptr};  // assigned by init_workspace()
+
+    typename EpilogueOutputOp::Params output_op{};
+
+    void * ptr_C{nullptr};
+    void * ptr_D{nullptr};
+    void * ptr_Tensor{nullptr};
+    void * ptr_Vector{nullptr};
+
+    typename Epilogue::OutputTileIterator::Params params_C{};  // built from Arguments::ldc
+    typename Epilogue::OutputTileIterator::Params params_D{};  // built from Arguments::ldd
+    typename Epilogue::TensorTileIterator::Params params_Tensor{};  // built from Arguments::ldt
+
+    int64_t batch_stride_C{0};
+    int64_t batch_stride_D{0};
+    int64_t batch_stride_Vector{0};
+    int64_t batch_stride_Tensor{0};
+
+    typename LayoutC::Stride::Index ldr{};  // copied from Arguments::ldr
+
+  protected:
+
+    //
+    // Host-only dispatch-utilities
+    //
+
+    /// Rounds `size` up to the next multiple of the 128-byte cache line (a multiple, including 0, is returned unchanged)
+    static size_t cacheline_align_up(size_t size)
+    {
+      static const int kLineBytes = 128;
+      return ((size + (kLineBytes - 1)) / kLineBytes) * kLineBytes;
+    }
+
+    /// Bytes of barrier workspace: one flag per SK block or per reduction block (whichever count is larger), cache-line padded
+    size_t get_barrier_workspace_size() const
+    {
+      // Atomic reduction needs a synchronization flag per SK block, while parallel reduction
+      // needs one per reduction block, so reserve enough flags for either scheme.
+      int const sk_block_count = block_mapping.sk_regions() * block_mapping.sk_blocks_per_region();
+      int const flag_count = fast_max(sk_block_count, block_mapping.reduction_blocks);
+      size_t const flag_bytes = sizeof(typename Barrier::T) * flag_count;
+      return cacheline_align_up(flag_bytes);
+    }
+
+    /// Bytes of scratch for intermediate partial sums: kWorkspaceBytesPerBlock per SK block, cache-line padded
+    size_t get_partials_workspace_size() const
+    {
+      int const sk_block_count = block_mapping.sk_regions() * block_mapping.sk_blocks_per_region();
+      return cacheline_align_up(sk_block_count * kWorkspaceBytesPerBlock);
+    }
+
+
+  public:
+    //
+    // Host dispatch API
+    //
+
+    /// Default constructor
+    Params() = default;
+
+    /// Constructor: copies the operand description out of `args` and builds the stream-K block mapping (initializers follow member declaration order)
+    Params(
+      Arguments const &args,  /// GEMM application arguments
+      int device_sms,         /// Number of SMs on the device
+      int sm_occupancy)       /// Kernel SM occupancy (in thread blocks)
+    :
+      ptr_A(const_cast<void *>(args.ptr_A)),
+      ptr_B(const_cast<void *>(args.ptr_B)),
+      params_A(args.lda),
+      params_B(args.ldb),
+      batch_stride_A(args.batch_stride_A),
+      batch_stride_B(args.batch_stride_B),
+      mode(args.mode),
+      barrier_workspace(nullptr),
+      partials_workspace(nullptr),
+      output_op(args.epilogue),
+      ptr_C(const_cast<void *>(args.ptr_C)),
+      ptr_D(args.ptr_D),
+      ptr_Tensor(args.ptr_Tensor),
+      ptr_Vector(args.ptr_Vector),
+      params_C(args.ldc),
+      params_D(args.ldd),
+      params_Tensor(args.ldt),
+      batch_stride_C(args.batch_stride_C),
+      batch_stride_D(args.batch_stride_D),
+      batch_stride_Vector(args.batch_stride_Vector),
+      batch_stride_Tensor(args.batch_stride_Tensor),
+      ldr(args.ldr)
+    {
+      CUTLASS_TRACE_HOST("GemmStreamkWithFusedEpilogue::Params::Params()");
+      CUTLASS_TRACE_HOST("  ptr_Vector: " << (void *)this->ptr_Vector);
+      CUTLASS_TRACE_HOST("  ptr_Tensor: " << (void *)this->ptr_Tensor);
+      CUTLASS_TRACE_HOST("  ldr: " << this->ldr);
+      CUTLASS_TRACE_HOST("  ldt: " << args.ldt);
+
+      // SMs offered to the StreamK decomposition: the whole device unless the caller asked for fewer
+      int sms_for_streamk = (args.avail_sms != -1) ?
+                        fast_min(args.avail_sms, device_sms) :
+                        device_sms;
+      CUTLASS_TRACE_HOST("  avail_sms: " << sms_for_streamk);
+
+      // Build the block mapping from the problem shape, threadblock tile shape, SM counts and element sizes
+      block_mapping = ThreadblockSwizzle(
+        args.mode,
+        args.problem_size,
+        {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+        args.batch_count,
+        sm_occupancy,
+        device_sms,
+        sms_for_streamk,
+        sizeof(ElementA),
+        sizeof(ElementB),
+        sizeof(ElementC),
+        Epilogue::kAccumulatorFragments);
+    }
+
+    /// Total workspace bytes needed for these parameters: barrier flags plus partial-sum scratch
+    size_t get_workspace_size() const
+    {
+      size_t const barrier_bytes = get_barrier_workspace_size();
+      size_t const partials_bytes = get_partials_workspace_size();
+      return barrier_bytes + partials_bytes;
+    }
+
+
+    /// Splits the caller-provided buffer into the partial-sum region followed by the barrier region and zeroes the
+    /// barrier region asynchronously on `stream`.  Assumes the buffer is at least get_workspace_size() bytes.
+    Status init_workspace(
+      void *workspace,
+      cudaStream_t stream = nullptr)
+    {
+      uint8_t *cursor = static_cast<uint8_t*>(workspace);
+
+      // Partial-sum scratch sits at the front of the buffer
+      partials_workspace = nullptr;
+      size_t const partials_bytes = get_partials_workspace_size();
+      if (partials_bytes > 0)
+      {
+        if (!workspace) {
+          return Status::kErrorWorkspaceNull;
+        }
+        partials_workspace = cursor;
+        cursor += partials_bytes;
+      }
+
+      // Barrier flags follow immediately after the partials
+      barrier_workspace = nullptr;
+      size_t const barrier_bytes = get_barrier_workspace_size();
+      if (barrier_bytes > 0)
+      {
+        if (!workspace) {
+          return Status::kErrorWorkspaceNull;
+        }
+        barrier_workspace = cursor;
+      }
+
+      // Without a barrier region there is nothing to clear
+      if (!barrier_workspace)
+      {
+        return Status::kSuccess;
+      }
+
+      // Zero-initialize the barrier region
+      CUTLASS_TRACE_HOST("  Initialize " << barrier_bytes << " barrier bytes");
+
+      cudaError_t const memset_status = cudaMemsetAsync(
+        barrier_workspace,
+        0,
+        barrier_bytes,
+        stream);
+
+      if (memset_status != cudaSuccess) {
+        CUTLASS_TRACE_HOST("  cudaMemsetAsync() returned error " << cudaGetErrorString(memset_status));
+        return Status::kErrorInternal;
+      }
+
+      return Status::kSuccess;
+    }
+
+
+    /// Returns the GEMM volume in thread block tiles, as reported by the block mapping
+    cutlass::gemm::GemmCoord get_tiled_shape() const
+    {
+      return block_mapping.tiled_shape();
+    }
+
+
+    /// Total number of thread blocks to launch: the product of the three grid extents
+    int get_grid_blocks() const
+    {
+      dim3 const extents = get_grid_dims();
+      return extents.x * extents.y * extents.z;
+    }
+
+
+    /// Returns the grid extents in thread blocks to launch, as computed by the block mapping
+    dim3 get_grid_dims() const
+    {
+      return block_mapping.get_grid_dims();
+    }
+
+    /// Lightweight refresh of the operand pointers, batch strides, `ldr` and epilogue parameters from `args`.
+    /// Iterator params, mode and block mapping are not touched, so problem geometry is assumed to remain the same.
+    CUTLASS_HOST_DEVICE
+    void update(Arguments const &args)
+    {
+      output_op = args.epilogue;
+      // GEMM operands and their batch strides (const is cast away to fit the void* members)
+      ptr_A = const_cast<void *>(args.ptr_A);
+      ptr_B = const_cast<void *>(args.ptr_B);
+      ptr_C = const_cast<void *>(args.ptr_C);
+      ptr_D = args.ptr_D;
+      batch_stride_A = args.batch_stride_A;
+      batch_stride_B = args.batch_stride_B;
+      batch_stride_C = args.batch_stride_C;
+      batch_stride_D = args.batch_stride_D;
+
+      // Auxiliary vector / tensor operands
+      ptr_Vector = args.ptr_Vector;
+      batch_stride_Vector = args.batch_stride_Vector;
+      ldr = args.ldr;
+      ptr_Tensor = args.ptr_Tensor;
+      batch_stride_Tensor = args.batch_stride_Tensor;
+
+      CUTLASS_TRACE_HOST("GemmStreamkWithFusedEpilogue::Params::update()");
+      CUTLASS_TRACE_HOST("  ptr_Vector: " << (void *)this->ptr_Vector);
+      CUTLASS_TRACE_HOST("  ptr_Tensor: " << (void *)this->ptr_Tensor);
+      CUTLASS_TRACE_HOST("  ldr: " << this->ldr);
+    }
+  };
+
+  /// Describes the slice of one output tile that a threadblock is currently working on
+  struct TileWorkDesc
+  {
+    /// Linear index of the tile
+    int tile_idx;
+
+    /// Position of the tile in the output matrix, in threadblock-tile coordinates
+    cutlass::gemm::GemmCoord tiled_coord;
+
+    /// First global-scoped MAC-iteration this threadblock performs for the tile
+    int iter_begin;
+
+    /// Start of this threadblock's range in the k-domain for the tile
+    int k_begin;
+
+    /// End (one-past) of this threadblock's range in the k-domain for the tile
+    int k_end;
+
+    /// MAC-iterations this threadblock still has to perform for the tile
+    int k_iters_remaining;
+
+    /// True when this block performs the tile's first iteration, i.e. its k-range starts at zero
+    CUTLASS_DEVICE
+    bool tile_started()
+    {
+      return k_begin == 0;
+    }
+
+    /// True when this block performs the tile's last iteration, i.e. its k-range ends at the problem's K extent
+    CUTLASS_DEVICE
+    bool tile_finished(Params const &params)
+    {
+      return params.block_mapping.problem_size.k() == k_end;
+    }
+  };
+
+
+  /// Shared memory storage structure; a union, so the main-loop and epilogue storage occupy the same bytes
+  union SharedStorage {
+    typename Mma::SharedStorage main_loop;
+    typename Epilogue::SharedStorage epilogue;
+  };
+
+
+protected:
+
+  //
+  // Data members
+  //
+
+  /// GEMM problem parameters (held by reference, so the referenced Params must outlive this object)
+  Params const &params;
+
+  /// Shared storage reference (held by reference; not owned by this object)
+  SharedStorage &shared_storage;
+
+  /// ID of this thread within the threadblock
+  int thread_idx;
+
+  /// ID of the warp this thread belongs to
+  int warp_idx;
+
+  /// ID of this thread within its warp
+  int lane_idx;
+
+  /// Threadblock scoped epilogue
+  Epilogue epilogue;
+
+
+public:
+
+  //
+  // Host dispatch API
+  //
+
+  /// Determines whether the problem extents are multiples of each operand's access width for the layouts in use
+  static Status can_implement(
+    cutlass::gemm::GemmCoord const & problem_size) {
+
+    CUTLASS_TRACE_HOST("GemmStreamkWithFusedEpilogue::can_implement()");
+
+    static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
+    static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
+    static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+
+    // Problem extents consulted by the per-operand checks below
+    auto const extent_m = problem_size.m();
+    auto const extent_n = problem_size.n();
+    auto const extent_k = problem_size.k();
+
+    // A: row-major and interleaved column-major are checked along K, column-major along M; other layouts are not checked
+    bool a_misaligned = false;
+    if (platform::is_same<LayoutA, layout::RowMajor>::value
+        || platform::is_same<LayoutA, layout::ColumnMajorInterleaved<32>>::value
+        || platform::is_same<LayoutA, layout::ColumnMajorInterleaved<64>>::value) {
+      a_misaligned = (extent_k % kAlignmentA) != 0;
+    } else if (platform::is_same<LayoutA, layout::ColumnMajor>::value) {
+      a_misaligned = (extent_m % kAlignmentA) != 0;
+    }
+    if (a_misaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for A operand");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    // B: row-major is checked along N; column-major and interleaved row-major along K
+    bool b_misaligned = false;
+    if (platform::is_same<LayoutB, layout::RowMajor>::value) {
+      b_misaligned = (extent_n % kAlignmentB) != 0;
+    } else if (platform::is_same<LayoutB, layout::ColumnMajor>::value
+        || platform::is_same<LayoutB, layout::RowMajorInterleaved<32>>::value
+        || platform::is_same<LayoutB, layout::RowMajorInterleaved<64>>::value) {
+      b_misaligned = (extent_k % kAlignmentB) != 0;
+    }
+    if (b_misaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for B operand");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    // C: row-major and interleaved column-major are checked along N, column-major along M
+    bool c_misaligned = false;
+    if (platform::is_same<LayoutC, layout::RowMajor>::value
+        || platform::is_same<LayoutC, layout::ColumnMajorInterleaved<32>>::value
+        || platform::is_same<LayoutC, layout::ColumnMajorInterleaved<64>>::value) {
+      c_misaligned = (extent_n % kAlignmentC) != 0;
+    } else if (platform::is_same<LayoutC, layout::ColumnMajor>::value) {
+      c_misaligned = (extent_m % kAlignmentC) != 0;
+    }
+    if (c_misaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for C operand");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    CUTLASS_TRACE_HOST("  returning kSuccess");
+    return Status::kSuccess;
+  }
+
+  static Status can_implement(Arguments const &args) {  // convenience overload: only args.problem_size is examined
+    return can_implement(args.problem_size);
+  }
+
+protected:
+
+  //
+  // Device-only utility methods
+  //
+
+  /// Builds the iterator that fetches tile fragments from A for the k-range recorded in `tile_work`
+  CUTLASS_DEVICE
+  typename Mma::IteratorA init_iterator_A(
+    TileWorkDesc &tile_work,
+    GemmUniversalMode mode)
+  {
+    // The k component of the tile coordinate selects the batch (kBatched) or the pointer-array entry (kArray)
+    auto const batch_idx = tile_work.tiled_coord.k();
+    // Start from the base of A, then adjust it for the batched / array modes
+    ElementA *base_A = static_cast<ElementA *>(params.ptr_A);
+    if (mode == GemmUniversalMode::kBatched) {
+      base_A += batch_idx * params.batch_stride_A;
+    }
+    else if (mode == GemmUniversalMode::kArray) {
+      base_A = static_cast<ElementA * const *>(params.ptr_A)[batch_idx];
+    }
+
+    // Rows start at this tile's first row and are bounded by the problem's M extent; K spans [k_begin, k_end)
+    int const row_begin = tile_work.tiled_coord.m() * Mma::Shape::kM;
+    int const row_end = params.block_mapping.problem_size.m();
+    return Mma::IteratorA(
+        params.params_A,
+        base_A,
+        { row_end, tile_work.k_end },
+        threadIdx.x,
+        { row_begin, tile_work.k_begin });
+  }
+
+
+  /// Builds the iterator that fetches tile fragments from B for the k-range recorded in `tile_work`
+  CUTLASS_DEVICE
+  typename Mma::IteratorB init_iterator_B(
+    TileWorkDesc &tile_work,
+    GemmUniversalMode mode)
+  {
+    // The k component of the tile coordinate selects the batch (kBatched) or the pointer-array entry (kArray)
+    auto const batch_idx = tile_work.tiled_coord.k();
+    ElementB *base_B = static_cast<ElementB *>(params.ptr_B);
+    if (mode == GemmUniversalMode::kBatched) {
+      base_B += batch_idx * params.batch_stride_B;
+    }
+    else if (mode == GemmUniversalMode::kArray) {
+      base_B = static_cast<ElementB * const *>(params.ptr_B)[batch_idx];
+    }
+
+    // Columns start at this tile's first column and are bounded by the problem's N extent; K spans [k_begin, k_end)
+    int const col_begin = tile_work.tiled_coord.n() * Mma::Shape::kN;
+    int const col_end = params.block_mapping.problem_size.n();
+    return Mma::IteratorB(
+        params.params_B,
+        base_B,
+        { tile_work.k_end, col_end },
+        threadIdx.x,
+        { tile_work.k_begin, col_begin });
+  }
+
+
+  /// Fills `tile_work` for a data-parallel tile: the block performs every MAC-iteration
+  /// of the tile, so its k-range spans the whole problem K extent
+  CUTLASS_DEVICE
+  void init_dp_tile_work(
+      TileWorkDesc &tile_work,
+      int tile_idx)
+  {
+    // Number of MAC-iterations in one tile, as reported by the block mapping
+    auto const iters_per_tile = params.block_mapping.iters_per_tile();
+
+    // Tile identity: linear index plus threadblock-tile coordinates in the output matrix
+    tile_work.tile_idx = tile_idx;
+    tile_work.tiled_coord = params.block_mapping.get_tile_offset(tile_idx);
+
+    // Iteration bookkeeping: first global-scoped MAC-iteration of the tile and the
+    // number of MAC-iterations to perform (all of them)
+    tile_work.iter_begin = tile_idx * iters_per_tile;
+    tile_work.k_iters_remaining = iters_per_tile;
+
+    // k-domain range covered for this tile: [0, problem K extent)
+    tile_work.k_begin = 0;
+    tile_work.k_end = params.block_mapping.problem_size.k();
+  }
+
+  /// Fills tile_work for a stream-K tile from the block's global iteration range [block_iter_begin, block_iter_end)
+  CUTLASS_DEVICE
+  void init_sk_tile_work(
+      TileWorkDesc &tile_work,
+      int tile_idx,
+      int block_iter_begin,
+      int block_iter_end)
+  {
+    // The linear tile index
+    tile_work.tile_idx = tile_idx;
+
+    // The first global-scoped MAC-iteration for this tile
+    int tile_iter_begin = tile_idx * params.block_mapping.iters_per_tile();
+
+    // The first global-scoped MAC-iteration this threadblock will perform for this tile (clipped to the tile's start)
+    tile_work.iter_begin = max(block_iter_begin, tile_iter_begin);
+
+    // The first tile-scoped MAC-iteration this threadblock will perform for this tile
+    int k_iter_begin = tile_work.iter_begin - tile_iter_begin;
+
+    // The last (one past) tile-scoped MAC-iteration; not clamped to the tile here. NOTE(review): callers appear to pass an end inside this tile
+    int k_iter_end = block_iter_end - tile_iter_begin;
+
+    // The number of MAC-iterations this threadblock will perform for this tile
+    tile_work.k_iters_remaining = k_iter_end - k_iter_begin;
+
+    // The starting index in the k-domain for MAC-iterations this threadblock will perform for this tile
+    tile_work.k_begin = k_iter_begin * Mma::Shape::kK;
+
+    // The ending index (one-past) in the k-domain for MAC-iterations this threadblock will perform for this tile
+    tile_work.k_end = min(
+        params.block_mapping.problem_size.k(),            // extent of k domain
+        (k_iter_end * Mma::Shape::kK));                   // extent of the threadblock's global iteration assignment
+
+    // The location of this tile (in threadblock-tile coordinates) in the output matrix
+    tile_work.tiled_coord = params.block_mapping.get_tile_offset(tile_work.tile_idx);
+  }
+
+
+  /// Publishes this block's partial accumulators into the workspace region keyed by first_block_idx, then signals arrival
+  CUTLASS_DEVICE
+  void share_accumulators(
+    AccumulatorTile const &accumulator_tile,
+    int block_idx,
+    int first_block_idx)
+  {
+    AccumulatorTile *accum_tile_workspace = reinterpret_cast<AccumulatorTile *>(params.partials_workspace);
+
+    int accum_tile_offset = first_block_idx * kThreadCount;   // offset in AccumulatorTile units
+
+    if (block_idx == first_block_idx)
+    {
+      // First peer initializes the workspace partials (plain store, no wait)
+      BlockStripedReduceT::store(accum_tile_workspace + accum_tile_offset, accumulator_tile, thread_idx);
+    }
+    else
+    {
+      // Subsequent peers accumulate into the workspace partials; the wait condition depends on the reduction strategy
+      if (ThreadblockSwizzle::kReductionStrategy == ThreadblockSwizzle::kAtomic)
+      {
+        // Non-deterministic reduction order: wait for the first peer to have initialized the partials before we add to them
+        Barrier::wait_lt(params.barrier_workspace, thread_idx, first_block_idx, 1);
+      }
+      else
+      {
+        // Turnstile reduction order: wait until the previous peer has written
+        int wait_count = block_idx - first_block_idx;
+        Barrier::wait_eq(params.barrier_workspace, thread_idx, first_block_idx, wait_count);
+      }
+
+      // Perform reduction in workspace
+      BlockStripedReduceT::reduce(accum_tile_workspace + accum_tile_offset, accumulator_tile, thread_idx);
+    }
+
+    // Signal our arrival on the flag indexed by first_block_idx (done by the first peer and by later peers alike)
+    Barrier::arrive_inc(params.barrier_workspace, thread_idx, first_block_idx);
+  }
+
+
+  /// Waits on the first_block_idx flag, then adds the workspace partials stored for first_block_idx into accumulator_tile
+  CUTLASS_DEVICE
+  void acquire_accumulators(
+    AccumulatorTile &accumulator_tile,
+    int block_idx,
+    int first_block_idx)
+  {
+    AccumulatorTile *accum_tile_workspace = reinterpret_cast<AccumulatorTile *>(params.partials_workspace);
+
+    // Wait for arrival: the expected count is the number of blocks between first_block_idx and this block
+    int num_carry_in = block_idx - first_block_idx;
+    Barrier::wait_eq_reset(params.barrier_workspace, thread_idx, first_block_idx, num_carry_in);
+
+    // Load and add peer-partials accumulator tile to local accumulator tile (same offset that share_accumulators writes)
+    int accum_tile_offset = first_block_idx * kThreadCount;
+    BlockStripedReduceT::load_add(accumulator_tile, accum_tile_workspace + accum_tile_offset, thread_idx);
+  }
+
+
+  /// Runs the fused epilogue for one output tile: resolves the C/D/Tensor/Vector pointers for params.mode, builds the tile iterators, invokes epilogue
+  CUTLASS_DEVICE
+  void do_epilogue(
+    TileWorkDesc &tile_work,
+    AccumulatorTile &accumulator_tile)
+  {
+    ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C);
+    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
+    typename Epilogue::ElementTensor *ptr_Tensor = static_cast<typename Epilogue::ElementTensor *>(params.ptr_Tensor);
+
+    // Define the reduction output pointer and move to the appropriate place
+    typename Epilogue::ElementVector *ptr_Vector =
+      static_cast<typename Epilogue::ElementVector *>(params.ptr_Vector);
+
+    // Batched mode: offset each pointer by its batch stride times tiled_coord.k(); Tensor/Vector only when non-null
+    if (params.mode == GemmUniversalMode::kBatched) {
+      ptr_C += tile_work.tiled_coord.k() * params.batch_stride_C;
+      ptr_D += tile_work.tiled_coord.k() * params.batch_stride_D;
+      if (ptr_Tensor) {
+        ptr_Tensor = ReferenceFactory<typename Epilogue::ElementTensor>::add_pointer_offset(
+          ptr_Tensor,
+          tile_work.tiled_coord.k() * params.batch_stride_Tensor);
+      }
+      if (ptr_Vector) {
+        ptr_Vector += tile_work.tiled_coord.k() * params.batch_stride_Vector;
+      }
+    }
+    if (params.mode == GemmUniversalMode::kArray) {   // array mode: each params pointer is a table indexed by tiled_coord.k()
+      ptr_C = static_cast<ElementC * const *>(params.ptr_C)[tile_work.tiled_coord.k()];
+      ptr_D = static_cast<ElementC * const *>(params.ptr_D)[tile_work.tiled_coord.k()];
+      if (ptr_Tensor) {
+        ptr_Tensor = static_cast<typename Epilogue::ElementTensor * const *>(params.ptr_Tensor)[tile_work.tiled_coord.k()];
+      }
+      if (ptr_Vector) {
+        ptr_Vector = static_cast<typename Epilogue::ElementVector * const *>(params.ptr_Vector)[tile_work.tiled_coord.k()];
+      }
+    }
+
+    // Location of this tile in item-coords (row, column of the tile's first element)
+    MatrixCoord threadblock_item_begin(
+      tile_work.tiled_coord.m() * Mma::Shape::kM,
+      tile_work.tiled_coord.n() * Mma::Shape::kN
+    );
+
+    // Tile iterator loading from source tensor.
+    typename Epilogue::OutputTileIterator iterator_C(
+        params.params_C,
+        ptr_C,
+        params.block_mapping.problem_size.mn(),
+        thread_idx,
+        threadblock_item_begin);
+
+    // Tile iterator writing to destination tensor.
+    typename Epilogue::OutputTileIterator iterator_D(
+        params.params_D,
+        ptr_D,
+        params.block_mapping.problem_size.mn(),
+        thread_idx,
+        threadblock_item_begin);
+
+    // Additional tensor to load from
+    typename Epilogue::TensorTileIterator tensor_iterator(
+        params.params_Tensor,
+        ptr_Tensor,
+        params.block_mapping.problem_size.mn(),
+        thread_idx,
+        threadblock_item_begin);
+
+    // Move the vector pointer to this tile: column offset plus (tile row index * params.ldr)
+    if (ptr_Vector) {
+      ptr_Vector += threadblock_item_begin.column() + tile_work.tiled_coord.m() * params.ldr;
+    }
+
+    // Execute the epilogue operator to update the destination tensor.
+    epilogue(
+        EpilogueOutputOp(params.output_op),
+        ptr_Vector,
+        iterator_D,
+        accumulator_tile,
+        iterator_C,
+        tensor_iterator,
+        params.block_mapping.problem_size.mn(),
+        threadblock_item_begin);
+  }
+
+  /// Reduction-block path: waits for the tile's stream-K peers, then reduces one accumulator fragment of that tile via epilogue.reduce
+  CUTLASS_DEVICE
+  void separate_reduction(int reduce_idx)
+  {
+    int peer_idx_begin, peer_idx_last, reduce_tile_idx, reduce_fragment_idx;
+
+    // Reduce by sk-tile (every tile contributed to by one or more blocks); reduce_idx encodes (tile, fragment)
+    reduce_tile_idx = reduce_idx / Epilogue::kAccumulatorFragments;
+    reduce_fragment_idx = reduce_idx % Epilogue::kAccumulatorFragments;
+
+    int iter_tile_first = reduce_tile_idx * params.block_mapping.iters_per_tile();
+    int iter_tile_last = iter_tile_first + params.block_mapping.iters_per_tile() - 1;
+
+    peer_idx_begin = params.block_mapping.get_sk_block_idx(iter_tile_first);
+    peer_idx_last = params.block_mapping.get_sk_block_idx(iter_tile_last);
+
+    // Wait for peers to complete: expected count is the number of SK blocks covering this tile's iterations
+    int peer_idx_end = peer_idx_last + 1;
+    int num_peers = peer_idx_end - peer_idx_begin;
+    Barrier::wait_eq_reset(
+        params.barrier_workspace,
+        thread_idx,
+        (reduce_tile_idx * Epilogue::kAccumulatorFragments) + reduce_fragment_idx,   // equals reduce_idx
+        num_peers);
+
+    /// The location of this tile (in threadblock-tile coordinates) in the output matrix
+    GemmCoord tiled_coord = params.block_mapping.get_tile_offset(reduce_tile_idx);
+
+    // Location of this tile in item-coords
+    MatrixCoord threadblock_item_begin(
+      tiled_coord.m() * Mma::Shape::kM,
+      tiled_coord.n() * Mma::Shape::kN
+    );
+
+    ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C);
+    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
+    typename Epilogue::ElementTensor *ptr_Tensor = static_cast<typename Epilogue::ElementTensor *>(params.ptr_Tensor);
+
+    // Define the reduction output pointer and move to the appropriate place
+    typename Epilogue::ElementVector *ptr_Vector =
+      static_cast<typename Epilogue::ElementVector *>(params.ptr_Vector);
+    // NOTE(review): unlike do_epilogue, no kBatched/kArray pointer adjustment is applied here - confirm this is intended
+    // Tile iterator loading from source tensor.
+    typename Epilogue::OutputTileIterator iterator_C(
+        params.params_C,
+        ptr_C,
+        params.block_mapping.problem_size.mn(),
+        thread_idx,
+        threadblock_item_begin);
+
+    // Tile iterator writing to destination tensor.
+    typename Epilogue::OutputTileIterator iterator_D(
+        params.params_D,
+        ptr_D,
+        params.block_mapping.problem_size.mn(),
+        thread_idx,
+        threadblock_item_begin);
+
+    // Additional tensor to load from
+    typename Epilogue::TensorTileIterator tensor_iterator(
+        params.params_Tensor,
+        ptr_Tensor,
+        params.block_mapping.problem_size.mn(),
+        thread_idx,
+        threadblock_item_begin);
+
+    // Move the vector pointer to this tile: column offset plus (tile row index * params.ldr)
+    if (ptr_Vector) {
+      ptr_Vector += threadblock_item_begin.column() + tiled_coord.m() * params.ldr;
+    }
+
+    // Execute the epilogue's reduce over peers [peer_idx_begin, peer_idx_end) for this fragment and write the destination tensor.
+    epilogue.reduce(
+        peer_idx_begin,
+        peer_idx_end,
+        reduce_fragment_idx,
+        params.partials_workspace,
+        EpilogueOutputOp(params.output_op),
+        ptr_Vector,
+        iterator_D,
+        iterator_C,
+        tensor_iterator,
+        params.block_mapping.problem_size.mn(),
+        threadblock_item_begin);
+  }
+
+  /// Runs the MAC loop for one tile, then shares partials, finishes the tile via the epilogue, or hands off to separate reduction
+  CUTLASS_DEVICE
+  void process_tile(
+    TileWorkDesc tile_work,
+    int block_idx,
+    int dp_start_block_idx,
+    int block_iter_begin)   // not referenced in this function body
+  {
+    // Initialize input iterators
+    typename Mma::IteratorA iterator_A = init_iterator_A(tile_work, params.mode);
+    typename Mma::IteratorB iterator_B = init_iterator_B(tile_work, params.mode);
+
+    // Initialize accumulators
+    AccumulatorTile accumulator_tile;
+    accumulator_tile.clear();
+
+    // Initialize MMA abstraction
+    Mma mma(
+      shared_storage.main_loop,
+      thread_idx,
+      warp_idx,
+      lane_idx);
+
+    // Perform this tile's range of multiply-accumulate (MAC) iterations; accumulator_tile is both source and destination
+    mma(tile_work.k_iters_remaining, accumulator_tile, iterator_A, iterator_B, accumulator_tile);
+
+    if ((ThreadblockSwizzle::kReductionStrategy == ThreadblockSwizzle::kAtomic) ||
+        (params.block_mapping.reduction_blocks == 0) ||
+        (block_idx >= dp_start_block_idx))
+    {
+      //
+      // Cooperative SK peer reduction or DP block
+      //
+
+      int first_block_idx = params.block_mapping.get_first_block_idx(tile_work.tile_idx, block_idx);
+
+      if (!tile_work.tile_finished(params)) {
+        // Non "finishing" SK blocks must share their partial accumulator sums through global scratch workspace
+        share_accumulators(accumulator_tile, block_idx, first_block_idx);
+      }
+      else
+      {
+        // DP blocks and "finishing" SK blocks must perform epilogue operations and write the output tile
+        if (!tile_work.tile_started())
+        {
+          // A "finishing" SK block must first aggregate its accumulator partial sums with those shared by peer threadblocks
+          acquire_accumulators(accumulator_tile, block_idx, first_block_idx);
+        }
+
+        do_epilogue(tile_work, accumulator_tile);
+      }
+    }
+    else
+    {
+      //
+      // Separate peer reduction
+      //
+
+      // Share accumulator partial sums with peer threadblock(s) through scratch workspace
+      epilogue.share(block_idx, params.partials_workspace, accumulator_tile, tile_work.tile_started());
+
+      // Signal arrival on each of this tile's kAccumulatorFragments flags (the ones separate_reduction waits on)
+      Barrier::arrive_range_inc(
+        params.barrier_workspace,
+        thread_idx,
+        tile_work.tile_idx * Epilogue::kAccumulatorFragments,
+        Epilogue::kAccumulatorFragments);
+    }
+  }
+
+
+  /// Executes one GEMM: classifies this block as DP, SK, or reduction from its index, then loops over its assigned tiles
+  CUTLASS_DEVICE
+  void gemm()
+  {
+    // Initialize block's iteration range
+    int tile_idx = 0;
+    int block_iter_begin = 0;
+    int block_iters_remaining = 0;
+
+    int block_idx = params.block_mapping.get_block_idx();
+
+    int sk_padding_start_block_idx =  params.block_mapping.sk_regions() * params.block_mapping.sk_blocks_per_region();
+    int dp_start_block_idx = params.block_mapping.sk_waves * params.block_mapping.avail_sms;
+    int reduce_start_block_idx = dp_start_block_idx + params.block_mapping.dp_blocks;
+    int grid_padding_start_block_idx = reduce_start_block_idx + params.block_mapping.reduction_blocks;
+    // Block index layout: [SK blocks | SK padding | DP blocks | reduction blocks | grid padding]
+    // Initialize tile work descriptor
+    TileWorkDesc tile_work;
+
+    bool dp_block = (block_idx >= dp_start_block_idx) && (block_idx < reduce_start_block_idx);
+    bool sk_block = (block_idx < sk_padding_start_block_idx);
+    bool reduce_block = (block_idx >= reduce_start_block_idx) &&
+            (block_idx < grid_padding_start_block_idx) &&
+            (ThreadblockSwizzle::kReductionStrategy == ThreadblockSwizzle::kMixed);
+
+    if (dp_block)
+    {
+      // This is a DP block
+      int dp_block_idx = block_idx - dp_start_block_idx;
+      int first_dp_tile = (params.block_mapping.cohort_raster) ? 0 : params.block_mapping.sk_tiles;
+
+      // Blocks in first DP wave get configured number of tiles
+      tile_idx = first_dp_tile + dp_block_idx;
+      int tile_allottment = params.block_mapping.dp_first_wave_tiles;
+
+      // Blocks in subsequent DP waves get 1 tile, placed after all tiles consumed by the first wave
+      if (dp_block_idx >= params.block_mapping.avail_sms) {
+          tile_allottment = 1;
+          tile_idx += (params.block_mapping.dp_first_wave_tiles - 1) * params.block_mapping.avail_sms;
+      }
+
+      block_iters_remaining = params.block_mapping.iters_per_tile() * tile_allottment;
+
+      init_dp_tile_work(tile_work, tile_idx);
+
+      // DP blocks exit if out of bounds or overlap an SK tile (only possible during cohort rasterization, where dp_first_wave_tiles must be 1)
+      if ((tile_idx < params.block_mapping.sk_tiles) ||
+          (tile_work.tiled_coord.m() >= params.block_mapping.tiled_shape().m()) ||
+          (tile_work.tiled_coord.n() >= params.block_mapping.tiled_shape().n()))
+      {
+        return;
+      }
+    }
+    else if (sk_block)
+    {
+      // This is a SK block: start with the tile containing the block's last iteration
+      int block_iter_end;
+      params.block_mapping.get_iter_extents(block_idx, block_iter_begin, block_iter_end);
+      block_iters_remaining = block_iter_end - block_iter_begin;
+
+      tile_idx = params.block_mapping.get_sk_tile_idx(block_iter_end - 1);
+      init_sk_tile_work(tile_work, tile_idx, block_iter_begin, block_iter_begin + block_iters_remaining);
+    }
+    else
+    {
+      if (reduce_block)
+      {
+        // This is a reduction threadblock
+        int reduce_block_idx = block_idx - reduce_start_block_idx;
+        separate_reduction(reduce_block_idx);
+      }
+
+      return;   // padding blocks (neither DP, SK, nor reduction) do no work
+    }
+
+    // Iteration-processing loop: one pass per tile until the block's iteration budget reaches zero
+    CUTLASS_PRAGMA_NO_UNROLL
+    while (true)
+    {
+      // Perform this block's share of work for this tile
+      process_tile(
+        tile_work,
+        block_idx,
+        dp_start_block_idx,
+        block_iter_begin);
+
+      block_iters_remaining -= tile_work.k_iters_remaining;
+
+      if (block_iters_remaining == 0)
+      {
+        break;
+      }
+
+      // Continue to next tile
+      __syncthreads();
+
+      if (block_idx >= dp_start_block_idx)
+      {
+        // DP blocks consume their tiles at a stride of avail_sms
+        tile_idx += params.block_mapping.avail_sms;
+        init_dp_tile_work(tile_work, tile_idx);
+      }
+      else
+      {
+        // SK blocks consume their tiles in backwards order; the new end is the start of the tile just processed
+        tile_idx--;
+        init_sk_tile_work(tile_work, tile_idx, block_iter_begin, block_iter_begin + block_iters_remaining);
+      }
+    }
+
+  }
+
+
+public:
+
+  //
+  // Device-only API
+  //
+
+  /// Factory invocation: constructs the kernel object from params and shared_storage, then runs it via operator()
+  CUTLASS_DEVICE
+  static void invoke(
+    Params const &params,
+    SharedStorage &shared_storage)
+  {
+    GemmStreamkWithFusedEpilogue op(params, shared_storage);
+    op();
+  }
+
+
+  /// Constructor: stores params/shared_storage, derives thread/warp/lane indices from threadIdx.x, and builds the epilogue
+  CUTLASS_DEVICE
+  GemmStreamkWithFusedEpilogue(
+      Params const &params,
+      SharedStorage &shared_storage)
+    :
+      params(params),
+      shared_storage(shared_storage),
+      thread_idx(threadIdx.x),
+      warp_idx(__shfl_sync(0xffffffff, threadIdx.x / 32, 0)),   // broadcast lane 0's warp id so the value is uniform across the warp
+      lane_idx(threadIdx.x % 32),
+      epilogue(
+        shared_storage.epilogue,
+        thread_idx,
+        warp_idx,
+        lane_idx)
+  {}
+
+  /// Kernel body entry point: executes one GEMM by delegating to gemm()
+  CUTLASS_DEVICE
+  void operator()() {
+    // Generic SK code path (the only path taken here)
+    gemm();
+
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_transpose_operands.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_transpose_operands.h
new file mode 100755
index 000000000..4a2258c41
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_transpose_operands.h
@@ -0,0 +1,124 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! 
+  \file
+  \brief Defines detail::MapArguments, which maps the A/B/C operand types of a GEMM and, when
+    Transpose is true, swaps the A and B operands and transposes the A, B, and C layouts.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Primary template (used when Transpose is not true): re-exports the A/B/C template arguments unchanged
+template <
+  typename ElementA_, 
+  typename LayoutA_, 
+  ComplexTransform TransformA,
+  int AlignmentA,
+  typename ElementB_,
+  typename LayoutB_,
+  ComplexTransform TransformB,
+  int AlignmentB,
+  typename LayoutC_,
+  bool Transpose
+>
+struct MapArguments {
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  static ComplexTransform const kTransformA = TransformA;
+  static int const kAlignmentA = AlignmentA; 
+  using ElementB = ElementB_;
+  using LayoutB = LayoutB_;
+  static ComplexTransform const kTransformB = TransformB;
+  static int const kAlignmentB = AlignmentB; 
+  using LayoutC = LayoutC_;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Partial specialization for Transpose == true: A takes B's arguments and vice versa, and all three layouts are transposed
+template <
+  typename ElementA_, 
+  typename LayoutA_, 
+  ComplexTransform TransformA,
+  int AlignmentA,
+  typename ElementB_,
+  typename LayoutB_,
+  ComplexTransform TransformB,
+  int AlignmentB,
+  typename LayoutC_
+>
+struct MapArguments<
+  ElementA_,
+  LayoutA_,
+  TransformA,
+  AlignmentA, 
+  ElementB_,
+  LayoutB_,
+  TransformB,
+  AlignmentB,
+  LayoutC_,
+  true
+> {
+  using ElementA = ElementB_;
+  using LayoutA = typename layout::LayoutTranspose<LayoutB_>::type;
+  static ComplexTransform const kTransformA = TransformB;
+  static int const kAlignmentA = AlignmentB; 
+  using ElementB = ElementA_;
+  using LayoutB = typename layout::LayoutTranspose<LayoutA_>::type;
+  static ComplexTransform const kTransformB = TransformA;
+  static int const kAlignmentB = AlignmentA; 
+  using LayoutC = typename layout::LayoutTranspose<LayoutC_>::type;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}
+}
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal.h
new file mode 100755
index 000000000..08b30c74c
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal.h
@@ -0,0 +1,702 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief CUTLASS 2.x specialization of the GemmUniversal kernel, with its Arguments and Params structures.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/arch/arch.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/complex.h"
+#include "cutlass/semaphore.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/kernel/params_universal_base.h"
+#include "cutlass/trace.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate
+  typename Epilogue_,             ///! Epilogue
+  typename ThreadblockSwizzle_    ///! Threadblock swizzling function
+>
+class GemmUniversal<
+  Mma_,
+  Epilogue_,
+  ThreadblockSwizzle_,
+  void,
+  // 3.x kernels use the first template argument to define the ProblemShape
+  // We use this invariant to SFINAE dispatch against either the 2.x API or the 3.x API
+  cute::enable_if_t<not (cute::is_tuple<Mma_>::value || IsCutlass3ArrayKernel<Mma_>::value)>
+> {
+public:
+
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using EpilogueOutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+
+  using ElementA = typename Mma::IteratorA::Element;
+  using LayoutA = typename Mma::IteratorA::Layout;
+  using ElementB = typename Mma::IteratorB::Element;
+  using LayoutB = typename Mma::IteratorB::Layout;
+  using ElementC = typename Epilogue::OutputTileIterator::Element;
+  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
+
+  static ComplexTransform const kTransformA = Mma::kTransformA;
+  static ComplexTransform const kTransformB = Mma::kTransformB;
+  using Operator = typename Mma::Operator;
+
+  using OperatorClass = typename Mma::Operator::OperatorClass;
+  using ThreadblockShape = typename Mma::Shape;
+  using WarpShape = typename Mma::Operator::Shape;
+  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
+  using ArchTag = typename Mma::ArchTag;
+
+  static int const kStages = Mma::kStages;
+  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
+  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
+  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;
+
+  /// Split-K preserves splits that are 128b aligned
+  static int const kSplitKAlignment = const_max(128 / sizeof_bits<ElementA>::value, 128 / sizeof_bits<ElementB>::value);
+
+  //
+  // Structures
+  //
+
+  /// Argument structure
+  struct Arguments : UniversalArgumentsBase
+  {
+    //
+    // Data members
+    //
+
+    typename EpilogueOutputOp::Params epilogue;
+
+    void const * ptr_A;
+    void const * ptr_B;
+    void const * ptr_C;
+    void * ptr_D;
+
+    int64_t batch_stride_A;
+    int64_t batch_stride_B;
+    int64_t batch_stride_C;
+
+    typename LayoutA::Stride stride_a;
+    typename LayoutB::Stride stride_b;
+    typename LayoutC::Stride stride_c;
+    typename LayoutC::Stride stride_d;
+
+    typename LayoutA::Stride::LongIndex lda;
+    typename LayoutB::Stride::LongIndex ldb;
+    typename LayoutC::Stride::LongIndex ldc;
+    typename LayoutC::Stride::LongIndex ldd;
+
+    int const * ptr_gather_A_indices;
+    int const * ptr_gather_B_indices;
+    int const * ptr_scatter_D_indices;
+
+    //
+    // Methods
+    //
+
+    Arguments():
+      ptr_A(nullptr), ptr_B(nullptr), ptr_C(nullptr), ptr_D(nullptr),
+      ptr_gather_A_indices(nullptr),
+      ptr_gather_B_indices(nullptr),
+      ptr_scatter_D_indices(nullptr)
+    {}
+
+    /// constructs an arguments structure
+    Arguments(
+      GemmUniversalMode mode,
+      GemmCoord problem_size,
+      int batch_count,
+      typename EpilogueOutputOp::Params epilogue,
+      void const * ptr_A,
+      void const * ptr_B,
+      void const * ptr_C,
+      void * ptr_D,
+      int64_t batch_stride_A,
+      int64_t batch_stride_B,
+      int64_t batch_stride_C,
+      int64_t batch_stride_D,
+      typename LayoutA::Stride stride_a,
+      typename LayoutB::Stride stride_b,
+      typename LayoutC::Stride stride_c,
+      typename LayoutC::Stride stride_d,
+      int const *ptr_gather_A_indices = nullptr,
+      int const *ptr_gather_B_indices = nullptr,
+      int const *ptr_scatter_D_indices = nullptr)
+    :
+      UniversalArgumentsBase(mode, problem_size, batch_count, batch_stride_D),
+      epilogue(epilogue),
+      ptr_A(ptr_A), ptr_B(ptr_B), ptr_C(ptr_C), ptr_D(ptr_D),
+      batch_stride_A(batch_stride_A), batch_stride_B(batch_stride_B), batch_stride_C(batch_stride_C),
+      stride_a(stride_a), stride_b(stride_b), stride_c(stride_c), stride_d(stride_d),
+      ptr_gather_A_indices(ptr_gather_A_indices), ptr_gather_B_indices(ptr_gather_B_indices),
+      ptr_scatter_D_indices(ptr_scatter_D_indices)
+    {
+      lda = 0;
+      ldb = 0;
+      ldc = 0;
+      ldd = 0;
+      CUTLASS_TRACE_HOST("GemmUniversal::Arguments::Arguments() - problem_size: " << problem_size);
+    }
+
+    /// Constructs an arguments structure from scalar leading dimensions (lda, ldb, ldc, ldd)
+    Arguments(
+      GemmUniversalMode mode,
+      GemmCoord problem_size,
+      int batch_count,
+      typename EpilogueOutputOp::Params epilogue,
+      void const * ptr_A,
+      void const * ptr_B,
+      void const * ptr_C,
+      void * ptr_D,
+      int64_t batch_stride_A,
+      int64_t batch_stride_B,
+      int64_t batch_stride_C,
+      int64_t batch_stride_D,
+      typename LayoutA::Stride::LongIndex lda,
+      typename LayoutB::Stride::LongIndex ldb,
+      typename LayoutC::Stride::LongIndex ldc,
+      typename LayoutC::Stride::LongIndex ldd,
+      int const *ptr_gather_A_indices = nullptr,
+      int const *ptr_gather_B_indices = nullptr,
+      int const *ptr_scatter_D_indices = nullptr
+    ):
+      UniversalArgumentsBase(mode, problem_size, batch_count, batch_stride_D),
+      epilogue(epilogue),
+      ptr_A(ptr_A), ptr_B(ptr_B), ptr_C(ptr_C), ptr_D(ptr_D),
+      batch_stride_A(batch_stride_A), batch_stride_B(batch_stride_B), batch_stride_C(batch_stride_C),
+      lda(lda), ldb(ldb), ldc(ldc), ldd(ldd),
+      ptr_gather_A_indices(ptr_gather_A_indices), ptr_gather_B_indices(ptr_gather_B_indices),
+      ptr_scatter_D_indices(ptr_scatter_D_indices)
+    {
+      stride_a = make_Coord(lda);  // each stride_* member is also populated from its leading dimension
+      stride_b = make_Coord(ldb);
+      stride_c = make_Coord(ldc);
+      stride_d = make_Coord(ldd);
+      CUTLASS_TRACE_HOST("GemmUniversal::Arguments::Arguments() - problem_size: " << problem_size);
+    }
+
+    /// Returns a copy of these arguments describing the transposed problem
+    Arguments transposed_problem() const
+    {
+      Arguments transposed(*this);
+
+      // Exchange every A-side field with its B-side counterpart, then swap the m and n extents
+      std::swap(transposed.ptr_A, transposed.ptr_B);
+      std::swap(transposed.lda, transposed.ldb);
+      std::swap(transposed.stride_a, transposed.stride_b);
+      std::swap(transposed.batch_stride_A, transposed.batch_stride_B);
+      std::swap(transposed.ptr_gather_A_indices, transposed.ptr_gather_B_indices);
+      std::swap(transposed.problem_size.m(), transposed.problem_size.n());
+      return transposed;
+    }
+  };
+
+
+  //
+  // Structure for precomputing values in host memory and passing to kernels
+  //
+
+  /// Parameters structure: built on the host from Arguments and passed to the kernel
+  struct Params : UniversalParamsBase<
+    ThreadblockSwizzle,
+    ThreadblockShape,
+    ElementA,
+    ElementB,
+    ElementC,
+    LayoutA,
+    LayoutB>
+  {
+    using ParamsBase = UniversalParamsBase<
+      ThreadblockSwizzle,
+      ThreadblockShape,
+      ElementA,
+      ElementB,
+      ElementC,
+      LayoutA,
+      LayoutB>;
+
+    //
+    // Data members
+    //
+    // Iterator parameters for A, B, C and D (set once by the constructor; update() leaves them alone)
+    typename Mma::IteratorA::Params params_A;
+    typename Mma::IteratorB::Params params_B;
+    typename Epilogue::OutputTileIterator::Params params_C;
+    typename Epilogue::OutputTileIterator::Params params_D;
+    // Epilogue output operator parameters, copied from Arguments::epilogue
+    typename EpilogueOutputOp::Params output_op;
+    // Operand pointers; the const qualifier on A, B and C is cast away when copying from Arguments
+    void * ptr_A;
+    void * ptr_B;
+    void * ptr_C;
+    void * ptr_D;
+    // Batch strides for A, B and C (batch_stride_D is not declared here; update() reaches it via this->)
+    int64_t batch_stride_A;
+    int64_t batch_stride_B;
+    int64_t batch_stride_C;
+    // Gather/scatter index pointers copied from Arguments, where they default to nullptr
+    int * ptr_gather_A_indices;
+    int * ptr_gather_B_indices;
+    int * ptr_scatter_D_indices;
+
+    //
+    // Host dispatch API
+    //
+
+    /// Default constructor
+    Params() = default;
+
+    /// Constructor: each iterator uses the ld* value when it is nonzero, otherwise the stride_* value
+    Params(
+      Arguments const &args,  /// GEMM application arguments
+      int device_sms,         /// Number of SMs on the device
+      int sm_occupancy)       /// Kernel SM occupancy (in thread blocks)
+    :
+      ParamsBase(args, device_sms, sm_occupancy),
+      params_A(args.lda ? make_Coord_with_padding<LayoutA::kStrideRank>(args.lda) : args.stride_a),
+      params_B(args.ldb ? make_Coord_with_padding<LayoutB::kStrideRank>(args.ldb) : args.stride_b),
+      params_C(args.ldc ? make_Coord_with_padding<LayoutC::kStrideRank>(args.ldc) : args.stride_c),
+      params_D(args.ldd ? make_Coord_with_padding<LayoutC::kStrideRank>(args.ldd) : args.stride_d),
+      output_op(args.epilogue),
+      ptr_A(const_cast<void *>(args.ptr_A)),
+      ptr_B(const_cast<void *>(args.ptr_B)),
+      ptr_C(const_cast<void *>(args.ptr_C)),
+      ptr_D(args.ptr_D),
+      batch_stride_A(args.batch_stride_A),
+      batch_stride_B(args.batch_stride_B),
+      batch_stride_C(args.batch_stride_C),
+      ptr_gather_A_indices(const_cast<int *>(args.ptr_gather_A_indices)),
+      ptr_gather_B_indices(const_cast<int *>(args.ptr_gather_B_indices)),
+      ptr_scatter_D_indices(const_cast<int *>(args.ptr_scatter_D_indices))
+    {}
+
+    /// Lightweight update: refreshes pointers, batch strides, gather/scatter indices and epilogue params only.
+    void update(Arguments const &args)
+    {
+      CUTLASS_TRACE_HOST("GemmUniversal::Params::update()");
+
+      // Update input/output pointers
+      ptr_A = const_cast<void *>(args.ptr_A);
+      ptr_B = const_cast<void *>(args.ptr_B);
+      ptr_C = const_cast<void *>(args.ptr_C);
+      ptr_D = args.ptr_D;
+      // Update batch strides
+      batch_stride_A = args.batch_stride_A;
+      batch_stride_B = args.batch_stride_B;
+      batch_stride_C = args.batch_stride_C;
+      this->batch_stride_D = args.batch_stride_D;
+      // Update gather/scatter index pointers
+      ptr_gather_A_indices = const_cast<int *>(args.ptr_gather_A_indices);
+      ptr_gather_B_indices = const_cast<int *>(args.ptr_gather_B_indices);
+      ptr_scatter_D_indices = const_cast<int *>(args.ptr_scatter_D_indices);
+      // Update epilogue output operator parameters
+      output_op = args.epilogue;
+    }
+
+  };
+
+  /// Shared memory storage structure; a union, so the main loop and epilogue storage overlap
+  union SharedStorage {
+    typename Mma::SharedStorage main_loop;
+    typename Epilogue::SharedStorage epilogue;
+  };
+
+
+public:
+
+  //
+  // Host dispatch API
+  //
+
+  /// Determines whether the problem size satisfies the kernel's alignment requirements
+  static Status can_implement(
+    cutlass::gemm::GemmCoord const & problem_size)
+  {
+    CUTLASS_TRACE_HOST("GemmUniversal::can_implement()");
+    // Required alignment: 32 or 64 for the interleaved layouts, otherwise the elements per access
+    static int const kAlignmentA = (cute::is_same<LayoutA,
+                                                      layout::ColumnMajorInterleaved<32>>::value)
+                                   ? 32
+                                   : (cute::is_same<LayoutA,
+                                                        layout::ColumnMajorInterleaved<64>>::value)
+                                     ? 64
+                                     : Mma::IteratorA::AccessType::kElements;
+    static int const kAlignmentB = (cute::is_same<LayoutB,
+                                                      layout::RowMajorInterleaved<32>>::value)
+                                   ? 32
+                                   : (cute::is_same<LayoutB,
+                                                        layout::RowMajorInterleaved<64>>::value)
+                                     ? 64
+                                     : Mma::IteratorB::AccessType::kElements;
+    static int const kAlignmentC = (cute::is_same<LayoutC,
+                                                      layout::ColumnMajorInterleaved<32>>::value)
+                                   ? 32
+                                   : (cute::is_same<LayoutC,
+                                                        layout::ColumnMajorInterleaved<64>>::value)
+                                     ? 64
+                                     : Epilogue::OutputTileIterator::kElementsPerAccess;
+    // Each flag stays false when the operand's layout matches none of the branches below
+    bool isAMisaligned = false;
+    bool isBMisaligned = false;
+    bool isCMisaligned = false;
+    // A: k is checked for RowMajor and the interleaved layouts, m for ColumnMajor
+    if (cute::is_same<LayoutA, layout::RowMajor>::value) {
+      isAMisaligned = problem_size.k() % kAlignmentA;
+    } else if (cute::is_same<LayoutA, layout::ColumnMajor>::value) {
+      isAMisaligned = problem_size.m() % kAlignmentA;
+    } else if (cute::is_same<LayoutA, layout::ColumnMajorInterleaved<32>>::value
+            || cute::is_same<LayoutA, layout::ColumnMajorInterleaved<64>>::value) {
+      isAMisaligned = problem_size.k() % kAlignmentA;
+    }
+    // B: n is checked for RowMajor, k for ColumnMajor and the interleaved layouts
+    if (cute::is_same<LayoutB, layout::RowMajor>::value) {
+      isBMisaligned = problem_size.n() % kAlignmentB;
+    } else if (cute::is_same<LayoutB, layout::ColumnMajor>::value) {
+      isBMisaligned = problem_size.k() % kAlignmentB;
+    } else if (cute::is_same<LayoutB, layout::RowMajorInterleaved<32>>::value
+            || cute::is_same<LayoutB, layout::RowMajorInterleaved<64>>::value) {
+      isBMisaligned = problem_size.k() % kAlignmentB;
+    }
+    // C: n is checked for RowMajor and the interleaved layouts, m for ColumnMajor
+    if (cute::is_same<LayoutC, layout::RowMajor>::value) {
+      isCMisaligned = problem_size.n() % kAlignmentC;
+    } else if (cute::is_same<LayoutC, layout::ColumnMajor>::value) {
+      isCMisaligned = problem_size.m() % kAlignmentC;
+    } else if (cute::is_same<LayoutC, layout::ColumnMajorInterleaved<32>>::value
+            || cute::is_same<LayoutC, layout::ColumnMajorInterleaved<64>>::value) {
+      isCMisaligned = problem_size.n() % kAlignmentC;
+    }
+    // Report the first misaligned operand, checking A, then B, then C
+    if (isAMisaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for A operand");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (isBMisaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for B operand");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (isCMisaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for C operand");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    CUTLASS_TRACE_HOST("  returning kSuccess");
+
+    return Status::kSuccess;
+  }
+
+  /// Overload taking full arguments; forwards only args.problem_size to the check above
+  static Status can_implement(Arguments const &args)
+  { return can_implement(args.problem_size); }
+
+
+public:
+
+  //
+  // Device-only API
+  //
+
+  /// Factory invocation: default-constructs a kernel object and calls it once
+  /// with the supplied parameters and shared storage.
+  CUTLASS_DEVICE
+  static void invoke(Params const &params, SharedStorage &shared_storage)
+  {
+    GemmUniversal kernel;
+
+    kernel(params, shared_storage);
+  }
+
+
+  /// Executes one GEMM using a default-constructed threadblock swizzle
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+    ThreadblockSwizzle default_swizzle;
+    run_with_swizzle(params, shared_storage, default_swizzle);
+  }
+
+  /// Executes one GEMM with an externally-provided swizzling function
+  CUTLASS_DEVICE
+  void run_with_swizzle(Params const &params, SharedStorage &shared_storage, ThreadblockSwizzle& threadblock_swizzle) {
+
+    cutlass::gemm::GemmCoord threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if CTA is out of range
+    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
+      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
+
+      return;
+    }
+    // k range [offset_k, problem_size_k) for this threadblock; narrowed below in the two GEMM split-K modes
+    int offset_k = 0;
+    int problem_size_k = params.problem_size.k();
+
+    ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A);
+    ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);
+
+    //
+    // Fetch pointers based on mode.
+    //
+    if (params.mode == GemmUniversalMode::kGemm ||
+      params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+
+      if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {
+
+        problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size;
+      }
+
+      offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
+    }
+    else if (params.mode == GemmUniversalMode::kBatched) {
+      ptr_A += threadblock_tile_offset.k() * params.batch_stride_A;
+      ptr_B += threadblock_tile_offset.k() * params.batch_stride_B;
+    }
+    else if (params.mode == GemmUniversalMode::kArray) {
+      ptr_A = static_cast<ElementA * const *>(params.ptr_A)[threadblock_tile_offset.k()];
+      ptr_B = static_cast<ElementB * const *>(params.ptr_B)[threadblock_tile_offset.k()];
+    }
+
+    __syncthreads();
+
+    // Compute initial location in logical coordinates
+    cutlass::MatrixCoord tb_offset_A{
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      offset_k,
+    };
+
+    cutlass::MatrixCoord tb_offset_B{
+      offset_k,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    };
+
+    // Compute position within threadblock
+    int thread_idx = threadIdx.x;
+
+    // Construct iterators to A and B operands
+    typename Mma::IteratorA iterator_A(
+      params.params_A,
+      ptr_A,
+      {params.problem_size.m(), problem_size_k},
+      thread_idx,
+      tb_offset_A,
+      params.ptr_gather_A_indices);
+
+    typename Mma::IteratorB iterator_B(
+      params.params_B,
+      ptr_B,
+      {problem_size_k, params.problem_size.n()},
+      thread_idx,
+      tb_offset_B,
+      params.ptr_gather_B_indices);
+
+    // Broadcast the warp_id computed by lane 0 to ensure dependent code
+    // is compiled as warp-uniform.
+    int warp_idx = canonical_warp_idx_sync();
+
+    int lane_idx = threadIdx.x % 32;
+
+    //
+    // Main loop
+    //
+
+    // Construct thread-scoped matrix multiply
+    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
+
+    typename Mma::FragmentC accumulators;
+
+    accumulators.clear();
+
+    // Number of mainloop iterations: this threadblock's k extent divided by Mma::Shape::kK, rounded up
+    int gemm_k_iterations = (problem_size_k - offset_k + Mma::Shape::kK - 1) / Mma::Shape::kK;
+
+    // Compute threadblock-scoped matrix multiply-add
+    mma(
+      gemm_k_iterations,
+      accumulators,
+      iterator_A,
+      iterator_B,
+      accumulators);
+
+    //
+    // Epilogue
+    //
+
+    EpilogueOutputOp output_op(params.output_op);
+
+    //
+    // Masked tile iterators constructed from members
+    //
+
+    threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // assume identity swizzle
+    MatrixCoord threadblock_offset(
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    );
+    // Linear index of this output tile; used below to select the tile's semaphore slot
+    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
+
+    ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C);
+    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
+
+    //
+    // Fetch pointers based on mode.
+    //
+
+    // Construct the semaphore.
+    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
+
+    if (params.mode == GemmUniversalMode::kGemm) {
+
+      // If performing a reduction via split-K, fetch the initial synchronization
+      if (params.grid_tiled_shape.k() > 1) {
+
+        // Fetch the synchronization lock initially but do not block.
+        semaphore.fetch();
+
+        // Indicate which position in a serial reduction the output operator is currently updating
+        output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
+      }
+    }
+    else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
+    }
+    else if (params.mode == GemmUniversalMode::kBatched) {
+      ptr_C += threadblock_tile_offset.k() * params.batch_stride_C;
+      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
+    }
+    else if (params.mode == GemmUniversalMode::kArray) {
+      ptr_C = static_cast<ElementC * const *>(params.ptr_C)[threadblock_tile_offset.k()];
+      ptr_D = static_cast<ElementC * const *>(params.ptr_D)[threadblock_tile_offset.k()];
+    }
+
+    // Tile iterator loading from source tensor.
+    typename Epilogue::OutputTileIterator iterator_C(
+      params.params_C,
+      ptr_C,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset,
+      params.ptr_scatter_D_indices
+    );
+
+    // Tile iterator writing to destination tensor.
+    typename Epilogue::OutputTileIterator iterator_D(
+      params.params_D,
+      ptr_D,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset,
+      params.ptr_scatter_D_indices
+    );
+
+    Epilogue epilogue(
+      shared_storage.epilogue,
+      thread_idx,
+      warp_idx,
+      lane_idx);
+
+    // Wait on the semaphore - this latency may have been covered by iterator construction
+    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
+
+      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
+      if (threadblock_tile_offset.k()) {
+        iterator_C = iterator_D;
+      }
+
+      semaphore.wait(threadblock_tile_offset.k());
+    }
+
+
+    // Execute the epilogue operator to update the destination tensor.
+    epilogue(
+      output_op,
+      iterator_D,
+      accumulators,
+      iterator_C);
+
+    //
+    // Release the semaphore
+    //
+
+    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
+
+      int lock = 0;
+      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
+
+        // The final threadblock resets the semaphore for subsequent grids.
+        lock = 0;
+      }
+      else {
+        // Otherwise, the semaphore is incremented
+        lock = threadblock_tile_offset.k() + 1;
+      }
+
+      semaphore.release(lock);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal.hpp
new file mode 100755
index 000000000..6c7b89a24
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal.hpp
@@ -0,0 +1,66 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/gemm/kernel/gemm_universal_decl.h"
+#include "cutlass/gemm/kernel/tile_scheduler.hpp"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::kernel {
+
+// Detection trait for problem shapes that are not tuples: true_type when
+// ProblemShape declares a nested UnderlyingProblemShape alias, false_type otherwise.
+// Used for dispatching GemmUniversal to 2.x API or 3.x API
+template <typename ProblemShape, typename = void>
+struct IsCutlass3ArrayKernel : cute::false_type {};
+
+template <typename ProblemShape>
+struct IsCutlass3ArrayKernel<ProblemShape, cute::void_t<typename ProblemShape::UnderlyingProblemShape>>
+  : cute::true_type {};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::gemm::kernel
+
+////////////////////////////////////////////////////////////////////////////////
+
+#include "cutlass/gemm/kernel/sm70_gemm.hpp"
+#include "cutlass/gemm/kernel/sm90_gemm_tma.hpp"
+#include "cutlass/gemm/kernel/sm90_gemm_warpspecialized.hpp"
+#include "cutlass/gemm/kernel/sm90_gemm_warpspecialized_pingpong.hpp"
+#include "cutlass/gemm/kernel/sm90_gemm_warpspecialized_cooperative.hpp"
+#include "cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp"
+#include "cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp"
+#include "cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp"
+#include "cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_pingpong.hpp"
+#include "cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_cooperative.hpp"
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal_decl.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal_decl.h
new file mode 100755
index 000000000..73426db5b
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal_decl.h
@@ -0,0 +1,61 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+namespace cutlass::gemm::kernel {
+
+
+/*
+ * Stateless universal device GEMM kernel type that treats GEMM as
+ * a composition of a collective mainloop and a collective epilogue.
+ *
+ * Supports both the 2.x and 3.x APIs based on whether the first type is
+ * a cute::tuple<> or not.
+ * 2.x API implementation: cutlass/gemm/kernel/gemm_universal.h
+ * 3.x API implementation: cutlass/gemm/kernel/gemm_*.hpp
+ *
+ * In the following declaration, the name preceding the 'Or' refers to
+ * 3.x API type argument order, and the name succeeding the 'Or' refers to
+ * 2.x API type argument order. Template arguments without two names
+ * belong to the 3.x API only.
+**/
+template <
+  class ProblemShapeOrThreadblockMma_, // (m, n, k) or (m, n, k, l)
+  class CollectiveMainloopOrEpilogue_,
+  class CollectiveEpilogueOrThreadblockSwizzle_,
+  class TileScheduler_ = void,         // single name: 3.x API only
+  class Enable = void                  // single name: 3.x API only
+>
+class GemmUniversal;                   // declaration only; no definition in this header
+
+
+} // namespace cutlass::gemm::kernel
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal_streamk.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal_streamk.h
new file mode 100755
index 000000000..39a9bfb58
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal_streamk.h
@@ -0,0 +1,1168 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Defines the kernel-level GemmUniversalStreamk GEMM structure.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/complex.h"
+#include "cutlass/barrier.h"
+#include "cutlass/block_striped.h"
+
+#include "cutlass/trace.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate
+  typename Epilogue_,             ///! Epilogue
+  typename ThreadblockSwizzle_    ///! Threadblock mapping function
+>
+struct GemmUniversalStreamk {
+public:
+
+
+  //
+  // Types and constants
+  //
+
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using EpilogueOutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+
+  using ElementA = typename Mma::IteratorA::Element;
+  using LayoutA = typename Mma::IteratorA::Layout;
+  using ElementB = typename Mma::IteratorB::Element;
+  using LayoutB = typename Mma::IteratorB::Layout;
+  using ElementC = typename Epilogue::OutputTileIterator::Element;
+  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
+
+  /// The per-thread tile of raw accumulators
+  using AccumulatorTile = typename Mma::FragmentC;
+
+  static ComplexTransform const kTransformA = Mma::kTransformA;
+  static ComplexTransform const kTransformB = Mma::kTransformB;
+  using Operator = typename Mma::Operator;
+
+  using OperatorClass = typename Mma::Operator::OperatorClass;
+  using ThreadblockShape = typename Mma::Shape;
+  using WarpShape = typename Mma::Operator::Shape;
+  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
+  using ArchTag = typename Mma::ArchTag;
+  /// Stage count and per-operand alignment (elements per access), all taken from Mma / Epilogue
+  static int const kStages = Mma::kStages;
+  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
+  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
+  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;
+
+  /// Workspace bytes per thread block: the larger of one AccumulatorTile per thread and the epilogue's own requirement
+  static size_t const kWorkspaceBytesPerBlock =
+    __NV_STD_MAX(
+      kThreadCount * sizeof(AccumulatorTile),
+      Epilogue::kWorkspaceBytesPerBlock);
+
+  /// Block-striped reduction utility
+  using BlockStripedReduceT = BlockStripedReduce<kThreadCount, AccumulatorTile>;
+
+
+
+  //
+  // Structures
+  //
+
+  /// Argument structure
+  struct Arguments {
+
+    //
+    // Data members
+    //
+
+    GemmUniversalMode mode = GemmUniversalMode::kGemm;
+    GemmCoord problem_size {};
+    int batch_count {1};        // Either (mode == GemmUniversalMode::kBatched) the batch count, or (mode == GemmUniversalMode::kGemm) the tile-splitting factor
+
+    typename EpilogueOutputOp::Params epilogue{};
+
+    void const * ptr_A = nullptr;
+    void const * ptr_B = nullptr;
+    void const * ptr_C = nullptr;
+    void * ptr_D = nullptr;
+
+    int64_t batch_stride_A{0};
+    int64_t batch_stride_B{0};
+    int64_t batch_stride_C{0};
+    int64_t batch_stride_D{0};
+
+    typename LayoutA::Stride stride_a{0};
+    typename LayoutB::Stride stride_b{0};
+    typename LayoutC::Stride stride_c{0};
+    typename LayoutC::Stride stride_d{0};
+
+    typename LayoutA::Stride::LongIndex lda{0};
+    typename LayoutB::Stride::LongIndex ldb{0};
+    typename LayoutC::Stride::LongIndex ldc{0};
+    typename LayoutC::Stride::LongIndex ldd{0};
+
+    int avail_sms{-1};          /// The number of SMs that StreamK dispatch heuristics will attempt to load-balance across (-1 defaults to device width, 1 implies classic data-parallel scheduling)
+
+
+    //
+    // Methods
+    //
+
+    /// Default Constructor
+    Arguments() = default;
+
+    /// Constructor
+    Arguments(
+      GemmUniversalMode mode,
+      GemmCoord problem_size,
+      int batch_split,                              /// Either (mode == GemmUniversalMode::kBatched) the batch count, or (mode == GemmUniversalMode::kGemm) the tile-splitting factor (1 defaults to StreamK, >1 emulates Split-K)
+      typename EpilogueOutputOp::Params epilogue,
+      void const * ptr_A,
+      void const * ptr_B,
+      void const * ptr_C,
+      void * ptr_D,
+      int64_t batch_stride_A,
+      int64_t batch_stride_B,
+      int64_t batch_stride_C,
+      int64_t batch_stride_D,
+      typename LayoutA::Stride stride_a,
+      typename LayoutB::Stride stride_b,
+      typename LayoutC::Stride stride_c,
+      typename LayoutC::Stride stride_d,
+      int avail_sms = -1                            /// The number of SMs that StreamK dispatch heuristics will attempt to load-balance across (-1 defaults to device width, 1 implies classic data-parallel scheduling)
+    ):
+      mode(mode),
+      problem_size(problem_size),
+      batch_count(batch_split),
+      epilogue(epilogue),
+      ptr_A(ptr_A), ptr_B(ptr_B), ptr_C(ptr_C), ptr_D(ptr_D),
+      batch_stride_A(batch_stride_A), batch_stride_B(batch_stride_B), batch_stride_C(batch_stride_C), batch_stride_D(batch_stride_D),
+      stride_a(stride_a), stride_b(stride_b), stride_c(stride_c), stride_d(stride_d), avail_sms(avail_sms)
+    {
+      CUTLASS_TRACE_HOST("GemmUniversalStreamk::Arguments::Arguments() - problem_size: " << problem_size);
+    }
+
+    /// Constructor taking scalar leading dimensions: lda/ldb/ldc/ldd are stored as given and are also wrapped into stride_a/stride_b/stride_c/stride_d via make_Coord()
+    Arguments(
+      GemmUniversalMode mode,
+      GemmCoord problem_size,
+      int batch_split,                              /// Either (mode == GemmUniversalMode::kBatched) the batch count, or (mode == GemmUniversalMode::kGemm) the tile-splitting factor (1 defaults to StreamK, >1 emulates Split-K)
+      typename EpilogueOutputOp::Params epilogue,   /// Stored in the epilogue member (Params later copies it into output_op)
+      void const * ptr_A,
+      void const * ptr_B,
+      void const * ptr_C,
+      void * ptr_D,
+      int64_t batch_stride_A,                       /// Per-batch pointer advance for A (the kernel applies it in kBatched mode)
+      int64_t batch_stride_B,                       /// Per-batch pointer advance for B
+      int64_t batch_stride_C,                       /// Per-batch pointer advance for C
+      int64_t batch_stride_D,                       /// Per-batch pointer advance for D
+      typename LayoutA::Stride::LongIndex lda,
+      typename LayoutB::Stride::LongIndex ldb,
+      typename LayoutC::Stride::LongIndex ldc,
+      typename LayoutC::Stride::LongIndex ldd,
+      int avail_sms = -1                            /// The number of SMs that StreamK dispatch heuristics will attempt to load-balance across (-1 defaults to device width, 1 implies classic data-parallel scheduling)
+    ):
+      mode(mode),
+      problem_size(problem_size),
+      batch_count(batch_split),
+      epilogue(epilogue),
+      ptr_A(ptr_A), ptr_B(ptr_B), ptr_C(ptr_C), ptr_D(ptr_D),
+      batch_stride_A(batch_stride_A), batch_stride_B(batch_stride_B), batch_stride_C(batch_stride_C), batch_stride_D(batch_stride_D),
+      lda(lda), ldb(ldb), ldc(ldc), ldd(ldd), avail_sms(avail_sms)
+    {
+      stride_a = make_Coord(lda);   // mirror each scalar leading dimension into its stride coordinate
+      stride_b = make_Coord(ldb);
+      stride_c = make_Coord(ldc);
+      stride_d = make_Coord(ldd);
+      CUTLASS_TRACE_HOST("GemmUniversalStreamk::Arguments::Arguments() - problem_size: " << problem_size);
+    }
+
+    /// Builds the arguments of the transposed GEMM: a copy of *this in which the M/N extents and every A-side field are exchanged with their N/B-side counterparts
+    Arguments transposed_problem() const
+    {
+      Arguments transposed(*this);
+
+      std::swap(transposed.batch_stride_A, transposed.batch_stride_B);
+      std::swap(transposed.stride_a, transposed.stride_b);
+      std::swap(transposed.lda, transposed.ldb);
+      std::swap(transposed.ptr_A, transposed.ptr_B);
+      std::swap(transposed.problem_size.m(), transposed.problem_size.n());
+
+      return transposed;
+    }
+  };
+
+
+  /// Parameters structure
+  struct Params
+  {
+  public:
+
+    //
+    // Data members (all default-initialized so that Params() = default yields an inert object)
+    //
+
+    void * ptr_A = nullptr;                       // A operand: element pointer, or a table of per-batch pointers in kArray mode
+    void * ptr_B = nullptr;                       // B operand: element pointer, or a table of per-batch pointers in kArray mode
+
+    typename Mma::IteratorA::Params params_A{};   // handed to the Mma::IteratorA constructor
+    typename Mma::IteratorB::Params params_B{};   // handed to the Mma::IteratorB constructor
+
+    int64_t batch_stride_A{0};                    // pointer advance per batch index for A (kBatched mode)
+    int64_t batch_stride_B{0};                    // pointer advance per batch index for B (kBatched mode)
+
+    GemmUniversalMode mode = GemmUniversalMode::kGemm;
+
+    ThreadblockSwizzle block_mapping{};           // StreamK work decomposition; built by the Arguments-taking constructor
+
+    void *barrier_workspace = nullptr;            // synchronization flags; assigned and zero-filled by init_workspace()
+    void *partials_workspace = nullptr;           // scratch for partial accumulator sums; assigned by init_workspace()
+
+    typename EpilogueOutputOp::Params output_op{};  // used to construct the EpilogueOutputOp functor on the device
+
+    void * ptr_D = nullptr;                       // destination tensor (or pointer table in kArray mode)
+    void * ptr_C = nullptr;                       // source tensor (or pointer table in kArray mode)
+
+    typename Epilogue::OutputTileIterator::Params params_D{};
+    typename Epilogue::OutputTileIterator::Params params_C{};
+
+    int64_t batch_stride_D{0};                    // pointer advance per batch index for D (kBatched mode)
+    int64_t batch_stride_C{0};                    // pointer advance per batch index for C (kBatched mode)
+
+
+  protected:
+
+    //
+    // Host-only dispatch-utilities
+    //
+
+    /// Rounds an allocation size up to the next multiple of the 128-byte cache line
+    static size_t cacheline_align_up(size_t size)
+    {
+      size_t const line_mask = 128 - 1;   // the line size is a power of two, so masking equals divide-then-multiply
+      return (size + line_mask) & ~line_mask;
+    }
+
+    /// Bytes of barrier workspace required: one Barrier::T flag per synchronizing block, padded to a cache line
+    size_t get_barrier_workspace_size() const
+    {
+      // Atomic reduction needs one synchronization flag per SK block; parallel reduction needs
+      // one per reduction block.  Provision for whichever count is larger.
+      int sk_block_count = block_mapping.sk_regions() * block_mapping.sk_blocks_per_region();
+      int flag_count = fast_max(sk_block_count, block_mapping.reduction_blocks);
+      size_t flag_bytes = sizeof(typename Barrier::T) * flag_count;
+      return cacheline_align_up(flag_bytes);
+    }
+
+    /// Bytes of scratch required for partial sums: kWorkspaceBytesPerBlock for every SK block, padded to a cache line
+    size_t get_partials_workspace_size() const
+    {
+      int sk_block_count = block_mapping.sk_regions() * block_mapping.sk_blocks_per_region();
+      return cacheline_align_up(kWorkspaceBytesPerBlock * sk_block_count);
+    }
+
+
+  public:
+
+    //
+    // Host dispatch API
+    //
+
+    /// Default constructor
+    Params() = default;
+
+    /// Constructor: copies pointers, batch strides, iterator params and epilogue params out of args, then builds the StreamK block mapping.  Initializers are written in member-declaration order, which is the order in which C++ executes them.
+    Params(
+      Arguments const &args,  /// GEMM application arguments
+      int device_sms,         /// Number of SMs on the device
+      int sm_occupancy)       /// Kernel SM occupancy (in thread blocks)
+    :
+      ptr_A(const_cast<void *>(args.ptr_A)),
+      ptr_B(const_cast<void *>(args.ptr_B)),
+      params_A(args.lda ? make_Coord_with_padding<LayoutA::kStrideRank>(args.lda) : args.stride_a),   // a non-zero scalar leading dimension takes precedence over the stride coordinate
+      params_B(args.ldb ? make_Coord_with_padding<LayoutB::kStrideRank>(args.ldb) : args.stride_b),
+      batch_stride_A(args.batch_stride_A),
+      batch_stride_B(args.batch_stride_B),
+      mode(args.mode),
+      barrier_workspace(nullptr),       // both workspaces are attached later by init_workspace()
+      partials_workspace(nullptr),
+      output_op(args.epilogue),
+      ptr_D(args.ptr_D),
+      ptr_C(const_cast<void *>(args.ptr_C)),
+      params_D(args.ldd ? make_Coord_with_padding<LayoutC::kStrideRank>(args.ldd) : args.stride_d),
+      params_C(args.ldc ? make_Coord_with_padding<LayoutC::kStrideRank>(args.ldc) : args.stride_c),
+      batch_stride_D(args.batch_stride_D),
+      batch_stride_C(args.batch_stride_C)
+    {
+      // Number of SMs to make available for StreamK decomposition (-1 means "all of the device", otherwise capped at device_sms)
+      int avail_sms = (args.avail_sms == -1) ?
+                        device_sms :
+                        fast_min(args.avail_sms, device_sms);
+
+      // Initialize the block mapping structure
+      block_mapping = ThreadblockSwizzle(
+        args.mode,
+        args.problem_size,
+        {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+        args.batch_count,
+        sm_occupancy,
+        device_sms,
+        avail_sms,
+        sizeof(ElementA),
+        sizeof(ElementB),
+        sizeof(ElementC),
+        Epilogue::kAccumulatorFragments);
+    }
+
+
+    /// Total scratch bytes these parameters need: the barrier flags plus the partial-sum storage
+    size_t get_workspace_size() const
+    {
+      size_t barrier_bytes = get_barrier_workspace_size();
+      size_t partials_bytes = get_partials_workspace_size();
+      return barrier_bytes + partials_bytes;
+    }
+
+
+    /// Assign and initialize the specified workspace buffer (partials region first, barrier flags after it).  Assumes the memory allocated to workspace is at least as large as get_workspace_size().
+    /// Returns kErrorWorkspaceNull if scratch is required but workspace is null, kErrorInternal if cudaMemsetAsync() fails, and kSuccess otherwise.  The barrier flags are zeroed asynchronously on stream.
+    Status init_workspace(
+      void *workspace,
+      cudaStream_t stream = nullptr)
+    {
+      uint8_t *ptr = static_cast<uint8_t*>(workspace);
+
+      // Establish partials workspace
+      partials_workspace = nullptr;
+      size_t partials_workspace_bytes = get_partials_workspace_size();
+      if (partials_workspace_bytes > 0)
+      {
+        if (!workspace) {
+          return Status::kErrorWorkspaceNull;
+        }
+        partials_workspace = ptr;
+        ptr += partials_workspace_bytes;
+      }
+
+      // Establish barrier workspace
+      barrier_workspace = nullptr;
+      size_t barrier_workspace_bytes = get_barrier_workspace_size();
+      if (barrier_workspace_bytes > 0)
+      {
+        if (!workspace) {
+          return Status::kErrorWorkspaceNull;
+        }
+        barrier_workspace = ptr;
+        ptr += barrier_workspace_bytes;
+      }
+
+      // Zero-initialize barrier workspace
+      if (barrier_workspace)
+      {
+        // barrier_workspace_bytes computed above is reused here; it is non-zero whenever barrier_workspace is set
+
+        CUTLASS_TRACE_HOST("  Initialize " << barrier_workspace_bytes << " barrier bytes");
+
+        cudaError_t result = cudaMemsetAsync(
+          barrier_workspace,
+          0,
+          barrier_workspace_bytes,
+          stream);
+
+        if (result != cudaSuccess) {
+          CUTLASS_TRACE_HOST("  cudaMemsetAsync() returned error " << cudaGetErrorString(result));
+          return Status::kErrorInternal;
+        }
+      }
+
+      return Status::kSuccess;
+    }
+
+
+    /// Returns the GEMM volume in thread block tiles, as reported by block_mapping.tiled_shape()
+    cutlass::gemm::GemmCoord get_tiled_shape() const
+    {
+      return block_mapping.tiled_shape();
+    }
+
+
+    /// Total thread blocks in the launch grid: the product of the three extents returned by get_grid_dims()
+    int get_grid_blocks() const
+    {
+      dim3 extents = get_grid_dims();
+      return extents.x * extents.y * extents.z;
+    }
+
+
+    /// Returns the grid extents in thread blocks to launch, as computed by block_mapping.get_grid_dims()
+    dim3 get_grid_dims() const
+    {
+      return block_mapping.get_grid_dims();
+    }
+
+
+    /// Refreshes only the tensor pointers, batch strides and epilogue params from args; iterator params, mode, block mapping and workspaces keep their current values
+    void update(Arguments const &args)
+    {
+      CUTLASS_TRACE_HOST("GemmUniversalStreamK::Params::update()");
+
+      // Input operands A and B
+      ptr_A = const_cast<void *>(args.ptr_A);
+      batch_stride_A = args.batch_stride_A;
+      ptr_B = const_cast<void *>(args.ptr_B);
+      batch_stride_B = args.batch_stride_B;
+      // Source C and destination D
+      ptr_C = const_cast<void *>(args.ptr_C);
+      batch_stride_C = args.batch_stride_C;
+      ptr_D = args.ptr_D;
+      batch_stride_D = args.batch_stride_D;
+      // Epilogue functor parameters
+      output_op = args.epilogue;
+    }
+
+  };
+
+  /// Tile work descriptor: the slice of one output tile's MAC-iterations that a threadblock is about to process
+  struct TileWorkDesc
+  {
+    /// The linear tile index
+    int tile_idx;
+
+    /// The location of this tile (in threadblock-tile coordinates) in the output matrix
+    cutlass::gemm::GemmCoord tiled_coord;
+
+    /// The first global-scoped MAC-iteration this threadblock will perform for this tile
+    int iter_begin;
+
+    /// The starting index in the k-domain for MAC-iterations this threadblock will perform for this tile
+    int k_begin;
+
+    /// The ending index (one-past) in the k-domain for MAC-iterations this threadblock will perform for this tile
+    int k_end;
+
+    /// The number of remaining MAC-iterations this threadblock will perform for this tile
+    int k_iters_remaining;
+
+    /// Whether this block will perform the first iteration of this tile (its k-range starts at zero); read-only query
+    CUTLASS_DEVICE
+    bool tile_started() const
+    {
+      return (k_begin == 0);
+    }
+
+    /// Whether this block will perform the last iteration of this tile (its k-range ends at the problem's K extent); read-only query
+    CUTLASS_DEVICE
+    bool tile_finished(Params const &params) const
+    {
+      return (k_end == params.block_mapping.problem_size.k());
+    }
+  };
+
+
+  /// Shared memory storage structure: a union, so the main-loop and epilogue storage overlay the same bytes
+  union SharedStorage
+  {
+    typename Mma::SharedStorage main_loop;        // handed to the Mma object in process_tile()
+    typename Epilogue::SharedStorage epilogue;    // handed to the Epilogue object in the kernel constructor
+  };
+
+
+protected:
+
+  //
+  // Data members
+  //
+
+  /// GEMM problem parameters (held by value: the constructor copies the Params it is given)
+  Params params;
+
+  /// Shared storage reference
+  SharedStorage &shared_storage;
+
+  /// ID within the threadblock (initialized from threadIdx.x)
+  int thread_idx;
+
+  /// ID of warp (threadIdx.x / 32 as computed by lane 0 and broadcast to the warp)
+  int warp_idx;
+
+  /// ID of each thread within a warp (threadIdx.x % 32)
+  int lane_idx;
+
+  /// Threadblock scoped epilogue
+  Epilogue epilogue;
+
+
+public:
+
+  //
+  // Host-only dispatch API
+  //
+
+  /// Host-side check that the problem extents meet the per-operand alignment this kernel requires.
+  /// Operands are examined in the order A, B, C; the first misaligned one yields kErrorMisalignedOperand.
+  static Status can_implement(
+    cutlass::gemm::GemmCoord const & problem_size)
+  {
+    CUTLASS_TRACE_HOST("GemmUniversalStreamk::can_implement()");
+
+    // Required alignment in elements: 32 or 64 for the matching interleaved layout,
+    // otherwise the element count of one iterator access.
+    static int const kRequiredA =
+        platform::is_same<LayoutA, layout::ColumnMajorInterleaved<32>>::value ? 32 :
+        platform::is_same<LayoutA, layout::ColumnMajorInterleaved<64>>::value ? 64 :
+        Mma::IteratorA::AccessType::kElements;
+
+    static int const kRequiredB =
+        platform::is_same<LayoutB, layout::RowMajorInterleaved<32>>::value ? 32 :
+        platform::is_same<LayoutB, layout::RowMajorInterleaved<64>>::value ? 64 :
+        Mma::IteratorB::AccessType::kElements;
+
+    static int const kRequiredC =
+        platform::is_same<LayoutC, layout::ColumnMajorInterleaved<32>>::value ? 32 :
+        platform::is_same<LayoutC, layout::ColumnMajorInterleaved<64>>::value ? 64 :
+        Epilogue::OutputTileIterator::kElementsPerAccess;
+
+    int extent_m = problem_size.m();
+    int extent_n = problem_size.n();
+    int extent_k = problem_size.k();
+
+    // Operand A: row-major and interleaved layouts constrain K, column-major constrains M
+    bool a_misaligned = false;
+    if (platform::is_same<LayoutA, layout::RowMajor>::value
+        || platform::is_same<LayoutA, layout::ColumnMajorInterleaved<32>>::value
+        || platform::is_same<LayoutA, layout::ColumnMajorInterleaved<64>>::value) {
+      a_misaligned = (extent_k % kRequiredA) != 0;
+    }
+    else if (platform::is_same<LayoutA, layout::ColumnMajor>::value) {
+      a_misaligned = (extent_m % kRequiredA) != 0;
+    }
+
+    // Operand B: row-major constrains N, column-major and interleaved layouts constrain K
+    bool b_misaligned = false;
+    if (platform::is_same<LayoutB, layout::RowMajor>::value) {
+      b_misaligned = (extent_n % kRequiredB) != 0;
+    }
+    else if (platform::is_same<LayoutB, layout::ColumnMajor>::value
+        || platform::is_same<LayoutB, layout::RowMajorInterleaved<32>>::value
+        || platform::is_same<LayoutB, layout::RowMajorInterleaved<64>>::value) {
+      b_misaligned = (extent_k % kRequiredB) != 0;
+    }
+
+    // Operand C: row-major and interleaved layouts constrain N, column-major constrains M
+    bool c_misaligned = false;
+    if (platform::is_same<LayoutC, layout::RowMajor>::value
+        || platform::is_same<LayoutC, layout::ColumnMajorInterleaved<32>>::value
+        || platform::is_same<LayoutC, layout::ColumnMajorInterleaved<64>>::value) {
+      c_misaligned = (extent_n % kRequiredC) != 0;
+    }
+    else if (platform::is_same<LayoutC, layout::ColumnMajor>::value) {
+      c_misaligned = (extent_m % kRequiredC) != 0;
+    }
+
+    if (a_misaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for A operand");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (b_misaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for B operand");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (c_misaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for C operand");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    CUTLASS_TRACE_HOST("  returning kSuccess");
+    return Status::kSuccess;
+  }
+
+  /// Determines whether the GEMM problem satisfies this kernel's
+  /// alignment requirements; only args.problem_size is examined (forwards to the overload above)
+  static Status can_implement(Arguments const &args) {
+    return can_implement(args.problem_size);
+  }
+
+protected:
+
+  //
+  // Device-only utility methods
+  //
+
+  /// Builds the A-operand tile iterator for tile_work: picks the base pointer for the GEMM mode, then constructs the iterator from the pair (problem M, k_end) and the starting pair (tile's first row, k_begin)
+  CUTLASS_DEVICE
+  typename Mma::IteratorA init_iterator_A(
+    TileWorkDesc &tile_work,
+    GemmUniversalMode mode)
+  {
+    int batch_idx = tile_work.tiled_coord.k();
+    // Start from the single-problem base pointer, then adjust it for batched/array mode
+    ElementA *operand = static_cast<ElementA *>(params.ptr_A);
+    if (mode == GemmUniversalMode::kBatched) {
+      // Strided batch: advance by batch index * batch stride
+      operand += batch_idx * params.batch_stride_A;
+    }
+    else if (mode == GemmUniversalMode::kArray) {
+      // Pointer array: params.ptr_A is a table of per-batch pointers
+      operand = static_cast<ElementA * const *>(params.ptr_A)[batch_idx];
+    }
+
+    int row_first = tile_work.tiled_coord.m() * Mma::Shape::kM;
+    int row_limit = params.block_mapping.problem_size.m();
+    return typename Mma::IteratorA(
+        params.params_A,
+        operand,
+        { row_limit, tile_work.k_end },
+        threadIdx.x,
+        { row_first, tile_work.k_begin });
+  }
+
+
+  /// Builds the B-operand tile iterator for tile_work: picks the base pointer for the GEMM mode, then constructs the iterator from the pair (k_end, problem N) and the starting pair (k_begin, tile's first column)
+  CUTLASS_DEVICE
+  typename Mma::IteratorB init_iterator_B(
+    TileWorkDesc &tile_work,
+    GemmUniversalMode mode)
+  {
+    int batch_idx = tile_work.tiled_coord.k();
+    ElementB *operand = static_cast<ElementB *>(params.ptr_B);
+    if (mode == GemmUniversalMode::kBatched) {
+      // Strided batch: advance by batch index * batch stride
+      operand += batch_idx * params.batch_stride_B;
+    }
+    else if (mode == GemmUniversalMode::kArray) {
+      // Pointer array: params.ptr_B is a table of per-batch pointers
+      operand = static_cast<ElementB * const *>(params.ptr_B)[batch_idx];
+    }
+
+    int col_first = tile_work.tiled_coord.n() * Mma::Shape::kN;
+    int col_limit = params.block_mapping.problem_size.n();
+    return typename Mma::IteratorB(
+        params.params_B,
+        operand,
+        { tile_work.k_end, col_limit },
+        threadIdx.x,
+        { tile_work.k_begin, col_first });
+  }
+
+
+  /// Fills tile_work for a data-parallel tile: the block performs every MAC-iteration of tile_idx, i.e. the whole k-range [0, K)
+  CUTLASS_DEVICE
+  void init_dp_tile_work(
+      TileWorkDesc &tile_work,
+      int tile_idx)
+  {
+    int iters_per_tile = params.block_mapping.iters_per_tile();
+
+    // Identity of the tile and its location (threadblock-tile coordinates) in the output matrix
+    tile_work.tile_idx = tile_idx;
+    tile_work.tiled_coord = params.block_mapping.get_tile_offset(tile_idx);
+
+    // Global-scoped MAC-iteration at which this tile begins
+    tile_work.iter_begin = tile_idx * iters_per_tile;
+
+    // Every MAC-iteration of the tile is still to be performed by this block
+    tile_work.k_iters_remaining = iters_per_tile;
+
+    // The k-domain range starts at zero ...
+    tile_work.k_begin = 0;
+    // ... and ends (one-past) at the full K extent of the problem
+    tile_work.k_end = params.block_mapping.problem_size.k();
+  }
+
+
+  /// Fills tile_work for a stream-K tile.  The block's global MAC-iteration range
+  /// [block_iter_begin, block_iter_end) is clipped to the start of tile_idx and converted
+  /// into a k-domain range for that tile.
+  CUTLASS_DEVICE
+  void init_sk_tile_work(
+      TileWorkDesc &tile_work,
+      int tile_idx,
+      int block_iter_begin,
+      int block_iter_end)
+  {
+    // Global-scoped MAC-iteration at which this tile begins
+    int tile_first_iter = tile_idx * params.block_mapping.iters_per_tile();
+
+    // Identity of the tile and its location (threadblock-tile coordinates) in the output matrix
+    tile_work.tile_idx = tile_idx;
+    tile_work.tiled_coord = params.block_mapping.get_tile_offset(tile_idx);
+
+    // The block enters the tile at its own first iteration or at the tile's first, whichever is later
+    tile_work.iter_begin = max(block_iter_begin, tile_first_iter);
+
+    // The same range expressed relative to the tile: [local_first, local_end)
+    int local_first = tile_work.iter_begin - tile_first_iter;
+    int local_end = block_iter_end - tile_first_iter;
+
+    // Number of MAC-iterations this block performs for the tile
+    tile_work.k_iters_remaining = local_end - local_first;
+
+    // Start of the k-domain range: each iteration advances by Mma::Shape::kK
+    tile_work.k_begin = local_first * Mma::Shape::kK;
+
+    // End (one-past) of the k-domain range: the block's last assigned iteration,
+    // clamped to the K extent of the problem
+    tile_work.k_end = min(
+        params.block_mapping.problem_size.k(),
+        (local_end * Mma::Shape::kK));
+  }
+
+
+  /// Share accumulators with peers: writes (first peer) or reduces (later peers) this block's partial sums into the workspace slot selected by first_block_idx, then increments that slot's barrier flag
+  CUTLASS_DEVICE
+  void share_accumulators(
+    AccumulatorTile const &accumulator_tile,
+    int block_idx,
+    int first_block_idx)
+  {
+    AccumulatorTile *accum_tile_workspace = reinterpret_cast<AccumulatorTile *>(params.partials_workspace);
+
+    int accum_tile_offset = first_block_idx * kThreadCount;   // slot offset, in AccumulatorTile units
+
+    if (block_idx == first_block_idx)
+    {
+      // First peer initializes the workspace partials
+      BlockStripedReduceT::store(accum_tile_workspace + accum_tile_offset, accumulator_tile, thread_idx);
+    }
+    else
+    {
+      // Subsequent peers atomically accumulate into the workspace partials
+      if (ThreadblockSwizzle::kReductionStrategy == ThreadblockSwizzle::kAtomic)
+      {
+        // Non-deterministic reduction order: wait for the first peer to have initialized the partials before we add to them
+        Barrier::wait_lt(params.barrier_workspace, thread_idx, first_block_idx, 1);
+      }
+      else
+      {
+        // Turnstile reduction order: wait until the previous peer has written
+        int wait_count = block_idx - first_block_idx;   // number of peers that must have arrived before this block
+        Barrier::wait_eq(params.barrier_workspace, thread_idx, first_block_idx, wait_count);
+      }
+
+      // Perform reduction in workspace
+      BlockStripedReduceT::reduce(accum_tile_workspace + accum_tile_offset, accumulator_tile, thread_idx);
+    }
+
+    // Signal our arrival (done by the first peer and by later peers alike)
+    Barrier::arrive_inc(params.barrier_workspace, thread_idx, first_block_idx);
+  }
+
+
+  /// Acquire accumulators from peers: waits on the barrier flag indexed by first_block_idx, then adds the partial sums stored in that workspace slot into accumulator_tile
+  CUTLASS_DEVICE
+  void acquire_accumulators(
+    AccumulatorTile &accumulator_tile,
+    int block_idx,
+    int first_block_idx)
+  {
+    AccumulatorTile *accum_tile_workspace = reinterpret_cast<AccumulatorTile *>(params.partials_workspace);
+
+    // Wait for arrival of the block_idx - first_block_idx peers that precede this block
+    int num_carry_in = block_idx - first_block_idx;
+    Barrier::wait_eq_reset(params.barrier_workspace, thread_idx, first_block_idx, num_carry_in);
+
+    // Load and add peer-partials accumulator tile to local accumulator tile
+    int accum_tile_offset = first_block_idx * kThreadCount;   // same slot offset that share_accumulators() writes to
+    BlockStripedReduceT::load_add(accumulator_tile, accum_tile_workspace + accum_tile_offset, thread_idx);
+  }
+
+
+  /// Runs the epilogue for tile_work's output tile: builds source (C) and destination (D) tile iterators for the current GEMM mode and invokes the epilogue functor on accumulator_tile
+  CUTLASS_DEVICE
+  void do_epilogue(
+    TileWorkDesc &tile_work,
+    AccumulatorTile &accumulator_tile)
+  {
+    int batch_idx = tile_work.tiled_coord.k();
+
+    // Single-problem base pointers of the source (C) and destination (D) tensors
+    ElementC *source = static_cast<ElementC *>(params.ptr_C);
+    ElementC *dest = static_cast<ElementC *>(params.ptr_D);
+
+    if (params.mode == GemmUniversalMode::kBatched) {
+      // Strided batch: advance both pointers by batch index * batch stride
+      source += batch_idx * params.batch_stride_C;
+      dest += batch_idx * params.batch_stride_D;
+    }
+    else if (params.mode == GemmUniversalMode::kArray) {
+      // Pointer array: ptr_C / ptr_D are tables of per-batch pointers
+      source = static_cast<ElementC * const *>(params.ptr_C)[batch_idx];
+      dest = static_cast<ElementC * const *>(params.ptr_D)[batch_idx];
+    }
+
+    // Item coordinates at which this tile begins
+    MatrixCoord tile_origin(
+      tile_work.tiled_coord.m() * Mma::Shape::kM,
+      tile_work.tiled_coord.n() * Mma::Shape::kN
+    );
+
+    // Iterator over the source tensor
+    typename Epilogue::OutputTileIterator iterator_C(
+        params.params_C,
+        source,
+        params.block_mapping.problem_size.mn(),
+        thread_idx,
+        tile_origin);
+
+    // Iterator over the destination tensor
+    typename Epilogue::OutputTileIterator iterator_D(
+        params.params_D,
+        dest,
+        params.block_mapping.problem_size.mn(),
+        thread_idx,
+        tile_origin);
+
+    // Apply the output functor to the accumulators and update the destination tensor
+    epilogue(EpilogueOutputOp(params.output_op), iterator_D, accumulator_tile, iterator_C);
+  }
+
+
+  CUTLASS_DEVICE
+  void separate_reduction(int reduce_idx)   // work of a dedicated reduction block: reduce one accumulator fragment of one SK tile from its peers' partials and write the output
+  {
+    int peer_idx_begin, peer_idx_last, reduce_tile_idx, reduce_fragment_idx;
+
+    // Reduce by sk-tile (every tile contributed to by one or more blocks); reduce_idx encodes (tile, fragment)
+    reduce_tile_idx = reduce_idx / Epilogue::kAccumulatorFragments;
+    reduce_fragment_idx = reduce_idx % Epilogue::kAccumulatorFragments;
+
+    int iter_tile_first = reduce_tile_idx * params.block_mapping.iters_per_tile();
+    int iter_tile_last = iter_tile_first + params.block_mapping.iters_per_tile() - 1;
+
+    peer_idx_begin = params.block_mapping.get_sk_block_idx(iter_tile_first);   // SK block owning the tile's first iteration
+    peer_idx_last = params.block_mapping.get_sk_block_idx(iter_tile_last);     // SK block owning the tile's last iteration
+
+    // Wait for peers to complete
+    int peer_idx_end = peer_idx_last + 1;
+    int num_peers = peer_idx_end - peer_idx_begin;
+    Barrier::wait_eq_reset(
+        params.barrier_workspace,
+        thread_idx,
+        (reduce_tile_idx * Epilogue::kAccumulatorFragments) + reduce_fragment_idx,
+        num_peers);
+
+    /// The location of this tile (in threadblock-tile coordinates) in the output matrix
+    GemmCoord tiled_coord = params.block_mapping.get_tile_offset(reduce_tile_idx);
+
+    // Location of this tile in item-coords
+    MatrixCoord threadblock_item_begin(
+      tiled_coord.m() * Mma::Shape::kM,
+      tiled_coord.n() * Mma::Shape::kN
+    );
+    // NOTE(review): unlike do_epilogue(), no batched/array pointer adjustment is applied to ptr_C / ptr_D below - confirm this is intended
+    ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C);
+    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
+
+    // Tile iterator loading from source tensor.
+    typename Epilogue::OutputTileIterator iterator_C(
+        params.params_C,
+        ptr_C,
+        params.block_mapping.problem_size.mn(),
+        thread_idx,
+        threadblock_item_begin);
+
+    // Tile iterator writing to destination tensor.
+    typename Epilogue::OutputTileIterator iterator_D(
+        params.params_D,
+        ptr_D,
+        params.block_mapping.problem_size.mn(),
+        thread_idx,
+        threadblock_item_begin);
+
+    // Execute the epilogue operator to update the destination tensor, reducing the partials of peers [peer_idx_begin, peer_idx_end)
+    epilogue.reduce(
+        peer_idx_begin,
+        peer_idx_end,
+        reduce_fragment_idx,
+        params.partials_workspace,
+        EpilogueOutputOp(params.output_op),
+        iterator_D,
+        iterator_C);
+  }
+
+
+  CUTLASS_DEVICE
+  void process_tile(   // performs this block's MAC-iterations for one tile, then either shares partials, finishes the tile, or hands off to a reduction block
+    TileWorkDesc tile_work,
+    int block_idx,
+    int dp_start_block_idx,
+    int block_iter_begin)   // not read anywhere in this function body
+  {
+    // Initialize input iterators
+    typename Mma::IteratorA iterator_A = init_iterator_A(tile_work, params.mode);
+    typename Mma::IteratorB iterator_B = init_iterator_B(tile_work, params.mode);
+
+    // Initialize accumulators
+    AccumulatorTile accumulator_tile;
+    accumulator_tile.clear();
+
+    // Initialize MMA abstraction
+    Mma mma(
+      shared_storage.main_loop,
+      thread_idx,
+      warp_idx,
+      lane_idx);
+
+    // Perform this tile's range of multiply-accumulate (MAC) iterations (accumulator_tile is passed as both output and input)
+    mma(tile_work.k_iters_remaining, accumulator_tile, iterator_A, iterator_B, accumulator_tile);
+
+    if ((ThreadblockSwizzle::kReductionStrategy == ThreadblockSwizzle::kAtomic) ||
+        (params.block_mapping.reduction_blocks == 0) ||
+        (block_idx >= dp_start_block_idx))
+    {
+      //
+      // Cooperative SK peer reduction or DP block
+      //
+
+      int first_block_idx = params.block_mapping.get_first_block_idx(tile_work.tile_idx, block_idx);
+
+      if (!tile_work.tile_finished(params)) {
+        // Non "finishing" SK blocks must share their partial accumulator sums through global scratch workspace
+        share_accumulators(accumulator_tile, block_idx, first_block_idx);
+      }
+      else
+      {
+        // DP blocks and "finishing" SK blocks must perform epilogue operations and write the output tile
+        if (!tile_work.tile_started())
+        {
+          // A "finishing" SK block must first aggregate its accumulator partial sums with those shared by peer threadblocks
+          acquire_accumulators(accumulator_tile, block_idx, first_block_idx);
+        }
+
+        do_epilogue(tile_work, accumulator_tile);
+      }
+    }
+    else
+    {
+      //
+      // Separate peer reduction (an SK block when dedicated reduction blocks exist and the strategy is not kAtomic)
+      //
+
+      // Share accumulator partial sums with peer threadblock(s) through scratch workspace
+      epilogue.share(block_idx, params.partials_workspace, accumulator_tile, tile_work.tile_started());
+
+      // Signal arrival on each of this tile's Epilogue::kAccumulatorFragments flags
+      Barrier::arrive_range_inc(
+        params.barrier_workspace,
+        thread_idx,
+        tile_work.tile_idx * Epilogue::kAccumulatorFragments,
+        Epilogue::kAccumulatorFragments);
+    }
+  }
+
+
+  /// Executes one GEMM: classifies this threadblock as a data-parallel (DP), stream-K (SK) or reduction block from its block index, then processes its tiles one after another
+  CUTLASS_DEVICE
+  void gemm()
+  {
+    // Initialize block's iteration range
+    int tile_idx = 0;
+    int block_iter_begin = 0;
+    int block_iters_remaining = 0;
+
+    int block_idx = params.block_mapping.get_block_idx();
+    // Block-index boundaries of the launch grid's regions: SK blocks, DP blocks, reduction blocks, grid padding
+    int sk_padding_start_block_idx =  params.block_mapping.sk_regions() * params.block_mapping.sk_blocks_per_region();
+    int dp_start_block_idx = params.block_mapping.sk_waves * params.block_mapping.avail_sms;
+    int reduce_start_block_idx = dp_start_block_idx + params.block_mapping.dp_blocks;
+    int grid_padding_start_block_idx = reduce_start_block_idx + params.block_mapping.reduction_blocks;
+
+    // Initialize tile work descriptor
+    TileWorkDesc tile_work;
+
+    bool dp_block = (block_idx >= dp_start_block_idx) && (block_idx < reduce_start_block_idx);
+    bool sk_block = (block_idx < sk_padding_start_block_idx);
+    bool reduce_block = (block_idx >= reduce_start_block_idx) &&
+            (block_idx < grid_padding_start_block_idx) &&
+            (ThreadblockSwizzle::kReductionStrategy == ThreadblockSwizzle::kMixed);
+
+    if (dp_block)
+    {
+      // This is a DP block
+      int dp_block_idx = block_idx - dp_start_block_idx;
+      int first_dp_tile = (params.block_mapping.cohort_raster) ? 0 : params.block_mapping.sk_tiles;
+
+      // Blocks in first DP wave get configured number of tiles
+      tile_idx = first_dp_tile + dp_block_idx;
+      int tile_allottment = params.block_mapping.dp_first_wave_tiles;
+
+      // Blocks in subsequent DP waves get 1 tile
+      if (dp_block_idx >= params.block_mapping.avail_sms) {
+          tile_allottment = 1;
+          tile_idx += (params.block_mapping.dp_first_wave_tiles - 1) * params.block_mapping.avail_sms;
+      }
+
+      block_iters_remaining = params.block_mapping.iters_per_tile() * tile_allottment;
+
+      init_dp_tile_work(tile_work, tile_idx);
+
+      // DP blocks exit if out of bounds or overlap an SK tile (only possible during cohort rasterization, where dp_first_wave_tiles must be 1)
+      if ((tile_idx < params.block_mapping.sk_tiles) ||
+          (tile_work.tiled_coord.m() >= params.block_mapping.tiled_shape().m()) ||
+          (tile_work.tiled_coord.n() >= params.block_mapping.tiled_shape().n()))
+      {
+        return;
+      }
+    }
+    else if (sk_block)
+    {
+      // This is a SK block
+      int block_iter_end;
+      params.block_mapping.get_iter_extents(block_idx, block_iter_begin, block_iter_end);
+      block_iters_remaining = block_iter_end - block_iter_begin;
+
+      tile_idx = params.block_mapping.get_sk_tile_idx(block_iter_end - 1);   // start with the tile that contains the block's last iteration
+      init_sk_tile_work(tile_work, tile_idx, block_iter_begin, block_iter_begin + block_iters_remaining);
+    }
+    else
+    {
+      if (reduce_block)
+      {
+        // This is a reduction threadblock
+        int reduce_block_idx = block_idx - reduce_start_block_idx;
+        separate_reduction(reduce_block_idx);
+      }
+
+      return;   // reduction blocks are finished here; any remaining (padding) blocks have no work
+    }
+
+    // Iteration-processing loop body
+    CUTLASS_PRAGMA_NO_UNROLL
+    while (true)
+    {
+      // Perform this block's share of work for this tile
+      process_tile(
+        tile_work,
+        block_idx,
+        dp_start_block_idx,
+        block_iter_begin);
+
+      block_iters_remaining -= tile_work.k_iters_remaining;
+
+      if (block_iters_remaining == 0)
+      {
+        break;
+      }
+
+      // Continue to next tile
+      __syncthreads();
+
+      if (block_idx >= dp_start_block_idx)
+      {
+        // DP blocks consume their tiles at a stride of avail_sms
+        tile_idx += params.block_mapping.avail_sms;
+        init_dp_tile_work(tile_work, tile_idx);
+      }
+      else
+      {
+        // SK blocks consume their tiles in backwards order
+        tile_idx--;
+        init_sk_tile_work(tile_work, tile_idx, block_iter_begin, block_iter_begin + block_iters_remaining);
+      }
+    }
+
+  }
+
+
+public:
+
+  //
+  // Device-only API
+  //
+
+  /// Device entry point: constructs a kernel object over params / shared_storage and runs it once
+  CUTLASS_DEVICE
+  static void invoke(
+    Params const &params,
+    SharedStorage &shared_storage)
+  {
+    GemmUniversalStreamk instance(params, shared_storage);
+    instance.operator()();
+  }
+
+
+  // Constructor: copies params, binds the shared storage, derives the thread/warp/lane indices from threadIdx.x and builds the threadblock-scoped epilogue
+  CUTLASS_DEVICE
+  GemmUniversalStreamk(
+      Params const &params,
+      SharedStorage &shared_storage)
+    :
+      params(params),
+      shared_storage(shared_storage),
+      thread_idx(threadIdx.x),
+      warp_idx(__shfl_sync(0xffffffff, threadIdx.x / 32, 0)),   // broadcast the warp_id computed by lane 0 so that every lane of the warp holds the same (warp-uniform) value
+      lane_idx(threadIdx.x % 32),
+      epilogue(
+        shared_storage.epilogue,
+        thread_idx,
+        warp_idx,
+        lane_idx)
+  {}
+
+
+  /// Executes one GEMM (function-call entry point; simply forwards to gemm())
+  CUTLASS_DEVICE
+  void operator()()
+  {
+    // Generic SK code path
+    gemm();
+
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal_with_visitor.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal_with_visitor.h
new file mode 100755
index 000000000..5ce123a1a
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal_with_visitor.h
@@ -0,0 +1,321 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Gemm kernel with an epilogue defined under the epilogue visitor concept
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/kernel/gemm_universal.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Gemm that compute the epilogue visitor functor
+template <
+  typename Mma,                  ///! Threadblock-scoped matrix multiply-accumulate
+  typename Epilogue,             ///! Epilogue
+  typename ThreadblockSwizzle_   ///! Threadblock swizzling function
+>
+class GemmWithEpilogueVisitor: public GemmUniversal<Mma, Epilogue, ThreadblockSwizzle_> {
+public:
+
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+
+  using Base = GemmUniversal<Mma, Epilogue, ThreadblockSwizzle>;
+  using Base::Base;
+
+  using FusionCallbacks = typename Epilogue::FusionCallbacks;
+
+  using ElementA = typename Base::ElementA;
+  using LayoutA = typename Base::LayoutA;
+  using ElementB = typename Base::ElementB;
+  using LayoutB = typename Base::LayoutB;
+  using ElementC = typename Base::ElementC;
+  using LayoutC = typename Base::LayoutC;
+
+  using ThreadblockShape = typename Mma::Shape;
+
+  //
+  // Structures
+  //
+
+  using SharedStorage = typename Base::SharedStorage;
+  using Arguments = typename Base::Arguments;
+
+  //
+  // Structure for precomputing values in host memory and passing to kernels
+  //
+
+  /// Parameters structure
+  struct Params : UniversalParamsBase<
+    ThreadblockSwizzle,
+    ThreadblockShape,
+    ElementA,
+    ElementB,
+    ElementC,
+    LayoutA,
+    LayoutB>
+  {
+    using ParamsBase = UniversalParamsBase<
+      ThreadblockSwizzle,
+      ThreadblockShape,
+      ElementA,
+      ElementB,
+      ElementC,
+      LayoutA,
+      LayoutB>;
+
+    //
+    // Data members
+    //
+    cute::Shape<int32_t,int32_t,int32_t> problem_shape;
+
+    typename Mma::IteratorA::Params params_A;
+    typename Mma::IteratorB::Params params_B;
+    typename FusionCallbacks::Params output_op;
+
+    void * ptr_A;
+    void * ptr_B;
+
+    int64_t batch_stride_A;
+    int64_t batch_stride_B;
+
+    int * ptr_gather_A_indices;
+    int * ptr_gather_B_indices;
+
+    //
+    // Host dispatch API
+    //
+
+    /// Default constructor
+    Params() = default;
+
+    /// Constructor: precomputes the kernel parameters on the host from the GEMM arguments
+    Params(
+      Arguments const &args,  /// GEMM application arguments
+      int device_sms,         /// Number of SMs on the device
+      int sm_occupancy)       /// Kernel SM occupancy (in thread blocks)
+    :   // initializers are written in member-declaration order, i.e. the order they actually run in
+      ParamsBase(args, device_sms, sm_occupancy),
+      problem_shape({args.problem_size.m(), args.problem_size.n(), args.batch_count}),
+      params_A(args.lda ? make_Coord_with_padding<LayoutA::kStrideRank>(args.lda) : args.stride_a),
+      params_B(args.ldb ? make_Coord_with_padding<LayoutB::kStrideRank>(args.ldb) : args.stride_b),
+      output_op(FusionCallbacks::to_underlying_arguments(args.problem_size, args.epilogue, nullptr /*workspace*/)),
+      ptr_A(const_cast<void *>(args.ptr_A)),
+      ptr_B(const_cast<void *>(args.ptr_B)),
+      batch_stride_A(args.batch_stride_A),
+      batch_stride_B(args.batch_stride_B),
+      ptr_gather_A_indices(const_cast<int *>(args.ptr_gather_A_indices)),
+      ptr_gather_B_indices(const_cast<int *>(args.ptr_gather_B_indices))
+    {
+      // Reject unsupported modes (checked only when assert() is enabled, i.e. NDEBUG is not defined)
+      assert(args.mode != GemmUniversalMode::kGemmSplitKParallel && "Sm80 EVT does not support SplitKParallel.");
+      assert(!(args.mode == GemmUniversalMode::kGemm && this->grid_tiled_shape.k() > 1 )
+        && "Sm80 EVT does not support SplitKSerial.");
+      assert(args.mode != GemmUniversalMode::kArray && "Sm80 EVT does not support Array Gemm.");
+    }
+
+    /// Lightweight update: refreshes operand pointers, batch strides, gather indices, epilogue params and problem_shape from `args`; params_A/params_B are not recomputed
+    void update(Arguments const &args)
+    {
+      CUTLASS_TRACE_HOST("GemmUniversalwithVisitor::Params::update()");
+
+      // Update input pointers
+      ptr_A = const_cast<void *>(args.ptr_A);
+      ptr_B = const_cast<void *>(args.ptr_B);
+
+      batch_stride_A = args.batch_stride_A;
+      batch_stride_B = args.batch_stride_B;
+      this->batch_stride_D = args.batch_stride_D;   // not declared in this struct; reached through this-> (inherited from ParamsBase)
+
+      ptr_gather_A_indices = const_cast<int *>(args.ptr_gather_A_indices);
+      ptr_gather_B_indices = const_cast<int *>(args.ptr_gather_B_indices);
+
+      output_op = FusionCallbacks::to_underlying_arguments(args.problem_size, args.epilogue, nullptr /*workspace*/);
+      problem_shape = make_shape(args.problem_size.m(), args.problem_size.n(), args.batch_count);
+    }
+  };
+
+public:
+
+  //
+  // Device-only API
+  //
+
+  /// Factory invocation: default-constructs the kernel object and runs it on `params` / `shared_storage`
+  CUTLASS_DEVICE
+  static void invoke(
+    Params const &params,
+    SharedStorage &shared_storage)
+  {
+    GemmWithEpilogueVisitor op;
+    op(params, shared_storage);
+  }
+
+
+  /// Executes one GEMM using a default-constructed ThreadblockSwizzle
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+    ThreadblockSwizzle threadblock_swizzle;
+    run_with_swizzle(params, shared_storage, threadblock_swizzle);
+  }
+
+  /// Executes one GEMM with an externally-provided swizzling function: threadblock main loop over k, then the visitor epilogue
+  CUTLASS_DEVICE
+  void run_with_swizzle(Params const &params, SharedStorage &shared_storage, ThreadblockSwizzle& threadblock_swizzle) {
+
+    cutlass::gemm::GemmCoord threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if CTA is out of range
+    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
+      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
+
+      return;
+    }
+
+    int offset_k = 0;
+    int problem_size_k = params.problem_size.k();
+
+    ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A); 
+    ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);
+
+    //
+    // Mode dispatch: kGemm narrows the k-range to this CTA's slice; kBatched offsets A/B by the tile offset's k() index.
+    //
+    if (params.mode == GemmUniversalMode::kGemm) {
+      // Params' argument constructor asserts grid_tiled_shape.k() <= 1 in kGemm mode, so slicing only matters if that check is bypassed
+      if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {
+
+        problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size; 
+      }
+
+      offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
+    }
+    else if (params.mode == GemmUniversalMode::kBatched) {
+      ptr_A += threadblock_tile_offset.k() * params.batch_stride_A;
+      ptr_B += threadblock_tile_offset.k() * params.batch_stride_B;
+    }
+
+    __syncthreads();
+
+    // Compute initial location in logical coordinates
+    cutlass::MatrixCoord tb_offset_A{
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      offset_k,
+    };
+
+    cutlass::MatrixCoord tb_offset_B{
+      offset_k,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    };
+
+    // Compute position within threadblock
+    int thread_idx = threadIdx.x;
+
+    // Construct iterators to A and B operands
+    typename Mma::IteratorA iterator_A(
+      params.params_A,
+      ptr_A,
+      {params.problem_size.m(), problem_size_k},
+      thread_idx,
+      tb_offset_A,
+      params.ptr_gather_A_indices);
+
+    typename Mma::IteratorB iterator_B(
+      params.params_B,
+      ptr_B,
+      {problem_size_k, params.problem_size.n()},
+      thread_idx,
+      tb_offset_B,
+      params.ptr_gather_B_indices);
+
+    // Broadcast the warp_id computed by lane 0 to ensure dependent code
+    // is compiled as warp-uniform.
+    int warp_idx = canonical_warp_idx_sync();
+
+    int lane_idx = threadIdx.x % 32;
+
+    //
+    // Main loop
+    //
+
+    // Construct thread-scoped matrix multiply
+    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
+
+    typename Mma::FragmentC accumulators;
+
+    accumulators.clear();
+
+    // Number of main-loop iterations covering [offset_k, problem_size_k) in steps of Mma::Shape::kK, rounded up
+    int gemm_k_iterations = (problem_size_k - offset_k + Mma::Shape::kK - 1) / Mma::Shape::kK;
+
+    // Compute threadblock-scoped matrix multiply-add
+    mma(
+      gemm_k_iterations, 
+      accumulators, 
+      iterator_A, 
+      iterator_B, 
+      accumulators);   // NOTE(review): same fragment passed twice -- presumably destination and source accumulators; confirm against Mma::operator()
+
+    //
+    // Epilogue
+    //
+    // Re-query the tile offset for the epilogue instead of reusing the value computed before the main loop
+    threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    Epilogue epilogue(
+      params.output_op,
+      shared_storage.epilogue, 
+      thread_idx, 
+      warp_idx, 
+      lane_idx);
+
+    // Execute the epilogue operator to update the destination tensor.
+    epilogue(accumulators, threadblock_tile_offset, params.problem_shape, thread_idx); 
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal_with_visitor_streamk.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal_with_visitor_streamk.h
new file mode 100755
index 000000000..cdb825993
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal_with_visitor_streamk.h
@@ -0,0 +1,895 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Gemm kernel with an epilogue defined under the epilogue visitor concept with streamk.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/complex.h"
+#include "cutlass/barrier.h"
+#include "cutlass/block_striped.h"
+
+#include "cutlass/trace.h"
+#include "cutlass/gemm/kernel/gemm_universal_streamk.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate
+  typename Epilogue_,             ///! Epilogue
+  typename ThreadblockSwizzle_    ///! Threadblock mapping function
+>
+class GemmWithEpilogueVisitorStreamk {
+public:
+
+  using Base = GemmUniversalStreamk<Mma_, Epilogue_, ThreadblockSwizzle_>;
+
+  //
+  // Types and constants
+  //
+
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using FusionCallbacks = typename Epilogue::FusionCallbacks;
+  using EpilogueOutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+
+  using ElementA = typename Mma::IteratorA::Element;
+  using LayoutA = typename Mma::IteratorA::Layout;
+  using ElementB = typename Mma::IteratorB::Element;
+  using LayoutB = typename Mma::IteratorB::Layout;
+  using ElementC = typename Epilogue::OutputTileIterator::Element;
+  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
+
+  /// The per-thread tile of raw accumulators
+  using AccumulatorTile = typename Mma::FragmentC;
+
+  static ComplexTransform const kTransformA = Mma::kTransformA;
+  static ComplexTransform const kTransformB = Mma::kTransformB;
+  using Operator = typename Mma::Operator;
+
+  using OperatorClass = typename Mma::Operator::OperatorClass;
+  using ThreadblockShape = typename Mma::Shape;
+  using WarpShape = typename Mma::Operator::Shape;
+  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
+  using ArchTag = typename Mma::ArchTag;
+
+  static int const kStages = Mma::kStages;
+  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
+  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
+  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;
+
+  /// Workspace bytes per thread block
+  static size_t const kWorkspaceBytesPerBlock =
+    __NV_STD_MAX(
+      kThreadCount * sizeof(AccumulatorTile),
+      Epilogue::kWorkspaceBytesPerBlock);
+
+  /// Block-striped reduction utility
+  using BlockStripedReduceT = BlockStripedReduce<kThreadCount, AccumulatorTile>;
+
+
+
+  //
+  // Structures
+  //
+
+  using Arguments = typename Base::Arguments;
+
+
+  /// Parameters structure
+  struct Params
+  {
+  public:
+
+    //
+    // Data members
+    //
+    cute::Shape<int32_t,int32_t,int32_t> problem_shape{};
+
+    void * ptr_A{nullptr};
+    void * ptr_B{nullptr};
+
+    typename Mma::IteratorA::Params params_A{};
+    typename Mma::IteratorB::Params params_B{};
+
+    int64_t batch_stride_A{0};
+    int64_t batch_stride_B{0};
+
+    GemmUniversalMode mode{GemmUniversalMode::kGemm};
+
+    ThreadblockSwizzle block_mapping{};
+
+    void *barrier_workspace{nullptr};
+    void *partials_workspace{nullptr};
+
+    typename FusionCallbacks::Params output_op{};
+
+
+    void * ptr_D{nullptr};
+    void * ptr_C{nullptr};
+
+    typename Epilogue::OutputTileIterator::Params params_D{};
+    typename Epilogue::OutputTileIterator::Params params_C{};
+
+    int64_t batch_stride_D{0};
+    int64_t batch_stride_C{0};
+
+
+  protected:
+
+    //
+    // Host-only dispatch-utilities
+    //
+
+    /// Round `size` up to the next multiple of the 128-byte cache line
+    static size_t cacheline_align_up(size_t size)
+    {
+      static const int kLineBytes = 128;
+      return ((size + kLineBytes - 1) / kLineBytes) * kLineBytes;
+    }
+
+    /// Bytes of workspace reserved for barrier flags, padded to a whole number of cache lines
+    size_t get_barrier_workspace_size() const
+    {
+      // One flag per SK-block covers atomic reduction; one flag per reduction block covers
+      // parallel reduction.  Reserve enough for whichever count is larger.
+      int const total_sk_blocks = block_mapping.sk_regions() * block_mapping.sk_blocks_per_region();
+      int const flag_count = fast_max(total_sk_blocks, block_mapping.reduction_blocks);
+      size_t const flag_bytes = sizeof(typename Barrier::T) * flag_count;
+      return cacheline_align_up(flag_bytes);
+    }
+
+    /// Bytes of workspace reserved for partial sums: kWorkspaceBytesPerBlock per SK-block, padded to cache lines
+    size_t get_partials_workspace_size() const
+    {
+      int const total_sk_blocks = block_mapping.sk_regions() * block_mapping.sk_blocks_per_region();
+      return cacheline_align_up(kWorkspaceBytesPerBlock * total_sk_blocks);
+    }
+
+
+  public:
+
+    //
+    // Host dispatch API
+    //
+
+    /// Default constructor
+    Params() = default;
+
+
+    /// Constructor: captures pointers, strides and epilogue params from `args`, then builds the stream-K block mapping
+    Params(
+      Arguments const &args,  /// GEMM application arguments
+      int device_sms,         /// Number of SMs on the device
+      int sm_occupancy)       /// Kernel SM occupancy (in thread blocks)
+    :   // initializers are written in member-declaration order, i.e. the order they actually run in
+      problem_shape({args.problem_size.m(), args.problem_size.n(), args.batch_count}),
+      ptr_A(const_cast<void *>(args.ptr_A)),
+      ptr_B(const_cast<void *>(args.ptr_B)),
+      params_A(args.lda ? make_Coord_with_padding<LayoutA::kStrideRank>(args.lda) : args.stride_a),
+      params_B(args.ldb ? make_Coord_with_padding<LayoutB::kStrideRank>(args.ldb) : args.stride_b),
+      batch_stride_A(args.batch_stride_A),
+      batch_stride_B(args.batch_stride_B),
+      mode(args.mode),
+      barrier_workspace(nullptr),
+      partials_workspace(nullptr),
+      output_op(FusionCallbacks::to_underlying_arguments(args.problem_size, args.epilogue, nullptr /*workspace*/)),
+      ptr_D(args.ptr_D),
+      ptr_C(const_cast<void *>(args.ptr_C)),
+      params_D(args.ldd ? make_Coord_with_padding<LayoutC::kStrideRank>(args.ldd) : args.stride_d),
+      params_C(args.ldc ? make_Coord_with_padding<LayoutC::kStrideRank>(args.ldc) : args.stride_c),
+      batch_stride_D(args.batch_stride_D),
+      batch_stride_C(args.batch_stride_C)
+    {
+      // Number of SMs to make available for StreamK decomposition (-1 means "use every SM on the device")
+      int avail_sms = (args.avail_sms == -1) ?
+                        device_sms :
+                        fast_min(args.avail_sms, device_sms);
+
+      // Initialize the block mapping structure (default-constructed above, assigned here because it needs avail_sms)
+      block_mapping = ThreadblockSwizzle(
+        args.mode,
+        args.problem_size,
+        {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+        args.batch_count,
+        sm_occupancy,
+        device_sms,
+        avail_sms,
+        sizeof(ElementA),
+        sizeof(ElementB),
+        sizeof(ElementC),
+        Epilogue::kAccumulatorFragments);
+    }
+
+
+    /// Total workspace bytes required by these parameters: barrier flags plus partial-sum storage
+    size_t get_workspace_size() const
+    {
+      size_t const barrier_bytes = get_barrier_workspace_size();
+      size_t const partials_bytes = get_partials_workspace_size();
+      return barrier_bytes + partials_bytes;
+    }
+
+
+    /// Assign and initialize the specified workspace buffer: partials region first, then the zero-filled barrier flags.
+    /// Assumes workspace holds at least get_workspace_size() bytes.  Returns kErrorWorkspaceNull / kErrorInternal on failure.
+    Status init_workspace(
+      void *workspace,
+      cudaStream_t stream = nullptr)
+    {
+      uint8_t *ptr = static_cast<uint8_t*>(workspace);
+
+      // Establish partials workspace
+      partials_workspace = nullptr;
+      size_t partials_workspace_bytes = get_partials_workspace_size();
+      if (partials_workspace_bytes > 0)
+      {
+        if (!workspace) {
+          return Status::kErrorWorkspaceNull;
+        }
+        partials_workspace = ptr;
+        ptr += partials_workspace_bytes;
+      }
+
+      // Establish barrier workspace
+      barrier_workspace = nullptr;
+      size_t barrier_workspace_bytes = get_barrier_workspace_size();
+      if (barrier_workspace_bytes > 0)
+      {
+        if (!workspace) {
+          return Status::kErrorWorkspaceNull;
+        }
+        barrier_workspace = ptr;
+        ptr += barrier_workspace_bytes;
+      }
+
+      // Zero-initialize barrier workspace
+      if (barrier_workspace)
+      {
+        // barrier_workspace_bytes was computed above and is non-zero whenever barrier_workspace is set
+
+        CUTLASS_TRACE_HOST("  Initialize " << barrier_workspace_bytes << " barrier bytes");
+
+        cudaError_t result = cudaMemsetAsync(
+          barrier_workspace,
+          0,
+          barrier_workspace_bytes,
+          stream);
+
+        if (result != cudaSuccess) {
+          CUTLASS_TRACE_HOST("  cudaMemsetAsync() returned error " << cudaGetErrorString(result));
+          return Status::kErrorInternal;
+        }
+      }
+
+      return Status::kSuccess;
+    }
+
+
+    /// Returns the GEMM volume in thread block tiles, as reported by block_mapping.tiled_shape()
+    cutlass::gemm::GemmCoord get_tiled_shape() const
+    {
+      return block_mapping.tiled_shape();
+    }
+
+
+    /// Total thread blocks to launch: the product of the x, y and z grid extents
+    int get_grid_blocks() const
+    {
+      dim3 const extents = get_grid_dims();
+      return extents.x * extents.y * extents.z;
+    }
+
+
+    /// Returns the grid extents in thread blocks to launch, as reported by block_mapping.get_grid_dims()
+    dim3 get_grid_dims() const
+    {
+      return block_mapping.get_grid_dims();
+    }
+
+
+    /// Lightweight update: refreshes A/B/C/D pointers, batch strides, epilogue params and problem_shape from `args`; block_mapping and workspaces are untouched
+    void update(Arguments const &args)
+    {
+      CUTLASS_TRACE_HOST("GemmWithEpilogueVisitorStreamk::Params::update()");
+
+      // Update input/output pointers
+      ptr_A = const_cast<void *>(args.ptr_A);
+      ptr_B = const_cast<void *>(args.ptr_B);
+      ptr_C = const_cast<void *>(args.ptr_C);
+      ptr_D = args.ptr_D;
+
+      batch_stride_A = args.batch_stride_A;
+      batch_stride_B = args.batch_stride_B;
+      batch_stride_C = args.batch_stride_C;
+      batch_stride_D = args.batch_stride_D;
+
+      output_op = FusionCallbacks::to_underlying_arguments(args.problem_size, args.epilogue, nullptr /*workspace*/);
+      problem_shape = make_shape(args.problem_size.m(), args.problem_size.n(), args.batch_count);
+    }
+
+  };
+  /// Tile work descriptor: extends Base::TileWorkDesc with a k_end field and a tile_finished() taking this kernel's Params
+  struct TileWorkDesc: Base::TileWorkDesc {
+    int k_end;   // one-past-the-end k index of this block's work on the tile; NOTE(review): would hide any same-named base member -- confirm
+    CUTLASS_DEVICE
+    bool tile_finished(Params const &params)
+    {
+      return (k_end == params.block_mapping.problem_size.k());   // true once this block's k-range reaches the full k extent
+    }
+  };
+
+  // using TileWorkDesc = typename Base::TileWorkDesc;
+  using SharedStorage = typename Base::SharedStorage;
+
+protected:
+
+  //
+  // Data members
+  //
+
+  /// GEMM problem parameters
+  Params params;
+
+  /// Shared storage reference
+  SharedStorage &shared_storage;
+
+  /// ID within the threadblock
+  int thread_idx;
+
+  /// ID of warp
+  int warp_idx;
+
+  /// ID of each thread within a warp
+  int lane_idx;
+
+  /// Threadblock scoped epilogue
+  Epilogue epilogue;
+
+
+public:
+
+  //
+  // Host-only dispatch API
+  //
+
+  /// Determines whether the GEMM problem size satisfies this kernel's
+  /// alignment requirements; the check itself is delegated to Base::can_implement()
+  static Status can_implement(
+    cutlass::gemm::GemmCoord const & problem_size)
+  {
+    return Base::can_implement(problem_size);
+  }
+
+  /// Determines whether the GEMM problem satisfies this kernel's
+  /// alignment requirements; only args.problem_size is examined
+  static Status can_implement(Arguments const &args) {
+    return can_implement(args.problem_size);
+  }
+
+protected:
+
+  //
+  // Device-only utility methods
+  //
+
+  /// Builds the A-operand iterator for `tile_work`: extent {M, k_end}, starting at {tile row offset, k_begin}
+  CUTLASS_DEVICE
+  typename Mma::IteratorA init_iterator_A(
+    TileWorkDesc &tile_work,
+    GemmUniversalMode mode)
+  {
+    // The input A matrix
+    ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A);
+
+    // Update input pointers based on batched/array mode (the tile coordinate's k() selects the batch entry)
+    if (mode == GemmUniversalMode::kBatched) {
+      ptr_A += tile_work.tiled_coord.k() * params.batch_stride_A;
+    }
+    if (mode == GemmUniversalMode::kArray) {
+      ptr_A = static_cast<ElementA * const *>(params.ptr_A)[tile_work.tiled_coord.k()];   // params.ptr_A is read as an array of per-batch pointers
+    }
+
+    int m_begin = tile_work.tiled_coord.m() * Mma::Shape::kM;
+    int m_end = params.block_mapping.problem_size.m();
+    return Mma::IteratorA(
+        params.params_A,
+        ptr_A,
+        { m_end, tile_work.k_end },
+        threadIdx.x,
+        { m_begin, tile_work.k_begin });
+
+  }
+
+
+  /// Builds the B-operand iterator for `tile_work`: extent {k_end, N}, starting at {k_begin, tile column offset}
+  CUTLASS_DEVICE
+  typename Mma::IteratorB init_iterator_B(
+    TileWorkDesc &tile_work,
+    GemmUniversalMode mode)
+  {
+    // The input B matrix
+    ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);
+
+    // Update input pointers based on batched/array mode (the tile coordinate's k() selects the batch entry)
+    if (mode == GemmUniversalMode::kBatched) {
+      ptr_B += tile_work.tiled_coord.k() * params.batch_stride_B;
+    }
+    if (mode == GemmUniversalMode::kArray) {
+      ptr_B = static_cast<ElementB * const *>(params.ptr_B)[tile_work.tiled_coord.k()];   // params.ptr_B is read as an array of per-batch pointers
+    }
+
+    int n_begin = tile_work.tiled_coord.n() * Mma::Shape::kN;
+    int n_end = params.block_mapping.problem_size.n();
+    return Mma::IteratorB(
+        params.params_B,
+        ptr_B,
+        { tile_work.k_end, n_end },
+        threadIdx.x,
+        { tile_work.k_begin, n_begin });
+  }
+
+  /// Fills `tile_work` for a data-parallel tile: the block performs all iters_per_tile() iterations over the k-range [0, K)
+  CUTLASS_DEVICE
+  void init_dp_tile_work(
+      TileWorkDesc &tile_work,
+      int tile_idx)
+  {
+    // The linear tile index
+    tile_work.tile_idx = tile_idx;
+
+    // The first global-scoped MAC-iteration this threadblock will perform for this tile
+    tile_work.iter_begin = tile_idx * params.block_mapping.iters_per_tile();
+
+    // The number of MAC-iterations this threadblock will perform for this tile
+    tile_work.k_iters_remaining = params.block_mapping.iters_per_tile();
+
+    // The starting index in the k-domain for MAC-iterations this threadblock will perform for this tile
+    tile_work.k_begin = 0;
+
+    // The ending index (one-past) in the k-domain for MAC-iterations this threadblock will perform for this tile
+    tile_work.k_end = params.block_mapping.problem_size.k();
+
+    // The location of this tile (in threadblock-tile coordinates) in the output matrix
+    tile_work.tiled_coord = params.block_mapping.get_tile_offset(tile_work.tile_idx);
+  }
+
+  /// Fills `tile_work` for a stream-K tile: the block's share runs from max(block_iter_begin, tile start) up to block_iter_end
+  CUTLASS_DEVICE
+  void init_sk_tile_work(
+      TileWorkDesc &tile_work,
+      int tile_idx,
+      int block_iter_begin,
+      int block_iter_end)
+  {
+    // The linear tile index
+    tile_work.tile_idx = tile_idx;
+
+    // The first global-scoped MAC-iteration for this tile
+    int tile_iter_begin = tile_idx * params.block_mapping.iters_per_tile();
+
+    // The first global-scoped MAC-iteration this threadblock will perform for this tile
+    tile_work.iter_begin = max(block_iter_begin, tile_iter_begin);
+
+    // The first tile-scoped MAC-iteration this threadblock will perform for this tile
+    int k_iter_begin = tile_work.iter_begin - tile_iter_begin;
+
+    // The last (one past) tile-scoped MAC-iteration this threadblock will perform for this tile
+    int k_iter_end = block_iter_end - tile_iter_begin;   // not clamped to iters_per_tile() here; only k_end below is capped at K
+
+    // The number of MAC-iterations this threadblock will perform for this tile
+    tile_work.k_iters_remaining = k_iter_end - k_iter_begin;
+
+    // The starting index in the k-domain for MAC-iterations this threadblock will perform for this tile
+    tile_work.k_begin = k_iter_begin * Mma::Shape::kK;
+
+    // The ending index (one-past) in the k-domain for MAC-iterations this threadblock will perform for this tile
+    tile_work.k_end = min(
+        params.block_mapping.problem_size.k(),            // extent of k domain
+        (k_iter_end * Mma::Shape::kK));                   // extent of the threadblock's global iteration assignment
+
+    // The location of this tile (in threadblock-tile coordinates) in the output matrix
+    tile_work.tiled_coord = params.block_mapping.get_tile_offset(tile_work.tile_idx);
+  }
+
+
+  /// Share accumulators with peers: store or reduce this block's tile into the workspace slot of `first_block_idx`, then signal arrival
+  CUTLASS_DEVICE
+  void share_accumulators(
+    AccumulatorTile const &accumulator_tile,
+    int block_idx,
+    int first_block_idx)
+  {
+    AccumulatorTile *accum_tile_workspace = reinterpret_cast<AccumulatorTile *>(params.partials_workspace);
+
+    int accum_tile_offset = first_block_idx * kThreadCount;   // slot offset in AccumulatorTile elements, keyed by the first peer
+
+    if (block_idx == first_block_idx)
+    {
+      // First peer initializes the workspace partials
+      BlockStripedReduceT::store(accum_tile_workspace + accum_tile_offset, accumulator_tile, thread_idx);
+    }
+    else
+    {
+      // Subsequent peers atomically accumulate into the workspace partials
+      if (ThreadblockSwizzle::kReductionStrategy == ThreadblockSwizzle::kAtomic)
+      {
+        // Non-deterministic reduction order: wait for the first peer to have initialized the partials before we add to them
+        Barrier::wait_lt(params.barrier_workspace, thread_idx, first_block_idx, 1);
+      }
+      else
+      {
+        // Turnstile reduction order: wait until the previous peer has written
+        int wait_count = block_idx - first_block_idx;
+        Barrier::wait_eq(params.barrier_workspace, thread_idx, first_block_idx, wait_count);
+      }
+
+      // Perform reduction in workspace
+      BlockStripedReduceT::reduce(accum_tile_workspace + accum_tile_offset, accumulator_tile, thread_idx);
+    }
+
+    // Signal our arrival (on the flag indexed by first_block_idx, the same one the waits above use)
+    Barrier::arrive_inc(params.barrier_workspace, thread_idx, first_block_idx);
+  }
+
+
+  /// Acquire accumulators from peers: wait on the flag of `first_block_idx`, then add the workspace partials into `accumulator_tile`
+  CUTLASS_DEVICE
+  void acquire_accumulators(
+    AccumulatorTile &accumulator_tile,
+    int block_idx,
+    int first_block_idx)
+  {
+    AccumulatorTile *accum_tile_workspace = reinterpret_cast<AccumulatorTile *>(params.partials_workspace);
+
+    // Wait for arrival of the (block_idx - first_block_idx) peers; presumably also resets the flag, per the callee's name -- confirm
+    int num_carry_in = block_idx - first_block_idx;
+    Barrier::wait_eq_reset(params.barrier_workspace, thread_idx, first_block_idx, num_carry_in);
+
+    // Load and add peer-partials accumulator tile to local accumulator tile
+    int accum_tile_offset = first_block_idx * kThreadCount;
+    BlockStripedReduceT::load_add(accumulator_tile, accum_tile_workspace + accum_tile_offset, thread_idx);
+  }
+
+
+  /// Perform epilogue computations and output for the tile described by `tile_work`, using the member `epilogue` object
+  CUTLASS_DEVICE
+  void do_epilogue(
+    TileWorkDesc &tile_work,
+    AccumulatorTile &accumulator_tile)
+  {
+    cutlass::gemm::GemmCoord threadblock_tile_offset{
+      tile_work.tiled_coord.m(),
+      tile_work.tiled_coord.n(),
+      tile_work.tiled_coord.k()
+    };
+
+    // Execute the epilogue operator to update the destination tensor.
+    epilogue(
+      accumulator_tile,
+      threadblock_tile_offset,
+      params.problem_shape,
+      thread_idx);
+  }
+
+  /// Reduces one accumulator fragment of one SK tile across its peer blocks and runs the epilogue on the result
+  CUTLASS_DEVICE
+  void separate_reduction(int reduce_idx)
+  {
+    int peer_idx_begin, peer_idx_last, reduce_tile_idx, reduce_fragment_idx;
+
+    // Reduce by sk-tile (every tile contributed to by one or more blocks)
+    reduce_tile_idx = reduce_idx / Epilogue::kAccumulatorFragments;
+    reduce_fragment_idx = reduce_idx % Epilogue::kAccumulatorFragments;
+
+    int iter_tile_first = reduce_tile_idx * params.block_mapping.iters_per_tile();
+    int iter_tile_last = iter_tile_first + params.block_mapping.iters_per_tile() - 1;
+
+    peer_idx_begin = params.block_mapping.get_sk_block_idx(iter_tile_first);
+    peer_idx_last = params.block_mapping.get_sk_block_idx(iter_tile_last);
+
+    // Wait for peers to complete (the flag index below recombines tile and fragment index, i.e. it equals reduce_idx)
+    int peer_idx_end = peer_idx_last + 1;
+    int num_peers = peer_idx_end - peer_idx_begin;
+    Barrier::wait_eq_reset(
+        params.barrier_workspace,
+        thread_idx,
+        (reduce_tile_idx * Epilogue::kAccumulatorFragments) + reduce_fragment_idx,
+        num_peers);
+
+    /// The location of this tile (in threadblock-tile coordinates) in the output matrix
+    GemmCoord tiled_coord = params.block_mapping.get_tile_offset(reduce_tile_idx);
+
+    // Execute the epilogue operator to update the destination tensor.
+    epilogue.reduce(
+        peer_idx_begin,
+        peer_idx_end,
+        reduce_fragment_idx,
+        params.partials_workspace,
+        tiled_coord,
+        params.problem_shape,
+        thread_idx);
+  }
+
+
+  CUTLASS_DEVICE
+  void process_tile(            // Runs this block's MAC iterations for one tile, then shares, reduces, or writes the result
+    TileWorkDesc tile_work,     // descriptor of the tile portion to process (taken by value)
+    int block_idx,              // index of the calling block
+    int dp_start_block_idx,     // blocks at or above this index always take the first branch below
+    int block_iter_begin)       // not referenced in this body
+  {
+    // Initialize input iterators for the A and B operands of this tile
+    typename Mma::IteratorA iterator_A = init_iterator_A(tile_work, params.mode);
+    typename Mma::IteratorB iterator_B = init_iterator_B(tile_work, params.mode);
+
+    // Initialize accumulators (cleared before the main loop)
+    AccumulatorTile accumulator_tile;
+    accumulator_tile.clear();
+
+    // Initialize MMA abstraction on the main-loop shared storage
+    Mma mma(
+      shared_storage.main_loop,
+      thread_idx,
+      warp_idx,
+      lane_idx);
+
+    // Perform this tile's range of multiply-accumulate (MAC) iterations
+    mma(tile_work.k_iters_remaining, accumulator_tile, iterator_A, iterator_B, accumulator_tile);  // accumulator_tile is passed twice -- presumably as destination and source; confirm against Mma
+
+    if ((ThreadblockSwizzle::kReductionStrategy == ThreadblockSwizzle::kAtomic) ||
+        (params.block_mapping.reduction_blocks == 0) ||
+        (block_idx >= dp_start_block_idx))
+    {
+      // Taken when the strategy is kAtomic, when there are no reduction blocks, or for blocks >= dp_start_block_idx:
+      // Cooperative SK peer reduction or DP block
+      //
+
+      int first_block_idx = params.block_mapping.get_first_block_idx(tile_work.tile_idx, block_idx);
+
+      if (!tile_work.tile_finished(params)) {
+        // Non "finishing" SK blocks must share their partial accumulator sums through global scratch workspace
+        share_accumulators(accumulator_tile, block_idx, first_block_idx);
+      }
+      else
+      {
+        // DP blocks and "finishing" SK blocks must perform epilogue operations and write the output tile
+        if (!tile_work.tile_started())
+        {
+          // A "finishing" SK block must first aggregate its accumulator partial sums with those shared by peer threadblocks
+          acquire_accumulators(accumulator_tile, block_idx, first_block_idx);
+        }
+
+        do_epilogue(tile_work, accumulator_tile);
+      }
+    }
+    else
+    {
+      //
+      // Separate peer reduction: this block only publishes its partials; no epilogue output is written here
+      //
+
+      // Share accumulator partial sums with peer threadblock(s) through scratch workspace
+      epilogue.share(block_idx, params.partials_workspace, accumulator_tile, tile_work.tile_started());
+
+      // Signal arrival on each of this tile's kAccumulatorFragments barrier slots (the slots separate_reduction waits on)
+      Barrier::arrive_range_inc(
+        params.barrier_workspace,
+        thread_idx,
+        tile_work.tile_idx * Epilogue::kAccumulatorFragments,
+        Epilogue::kAccumulatorFragments);
+    }
+  }
+
+
+  /// Executes one GEMM: classifies this block as DP, SK, or reduction by its index, then processes its tiles
+  CUTLASS_DEVICE
+  void gemm()
+  {
+    // Initialize block's iteration range
+    int tile_idx = 0;
+    int block_iter_begin = 0;
+    int block_iters_remaining = 0;
+
+    int block_idx = params.block_mapping.get_block_idx();
+
+    int sk_padding_start_block_idx =  params.block_mapping.sk_regions() * params.block_mapping.sk_blocks_per_region();  // SK blocks: [0, this)
+    int dp_start_block_idx = params.block_mapping.sk_waves * params.block_mapping.avail_sms;                            // DP blocks: [this, reduce_start)
+    int reduce_start_block_idx = dp_start_block_idx + params.block_mapping.dp_blocks;                                   // reduction blocks: [this, grid_padding_start)
+    int grid_padding_start_block_idx = reduce_start_block_idx + params.block_mapping.reduction_blocks;                  // blocks >= this do no work
+
+    // Initialize tile work descriptor
+    TileWorkDesc tile_work;
+
+    bool dp_block = (block_idx >= dp_start_block_idx) && (block_idx < reduce_start_block_idx);
+    bool sk_block = (block_idx < sk_padding_start_block_idx);
+    bool reduce_block = (block_idx >= reduce_start_block_idx) &&
+            (block_idx < grid_padding_start_block_idx) &&
+            (ThreadblockSwizzle::kReductionStrategy == ThreadblockSwizzle::kMixed);  // reduction blocks only exist under the kMixed strategy
+
+    if (dp_block)
+    {
+      // This is a DP block
+      int dp_block_idx = block_idx - dp_start_block_idx;
+      int first_dp_tile = (params.block_mapping.cohort_raster) ? 0 : params.block_mapping.sk_tiles;
+
+      // Blocks in first DP wave get configured number of tiles
+      tile_idx = first_dp_tile + dp_block_idx;
+      int tile_allottment = params.block_mapping.dp_first_wave_tiles;
+
+      // Blocks in subsequent DP waves get 1 tile
+      if (dp_block_idx >= params.block_mapping.avail_sms) {
+          tile_allottment = 1;
+          tile_idx += (params.block_mapping.dp_first_wave_tiles - 1) * params.block_mapping.avail_sms;  // skip the extra tiles taken by first-wave blocks
+      }
+
+      block_iters_remaining = params.block_mapping.iters_per_tile() * tile_allottment;
+
+      init_dp_tile_work(tile_work, tile_idx);
+
+      // DP blocks exit if out of bounds or overlap an SK tile (only possible during cohort rasterization, where dp_first_wave_tiles must be 1)
+      if ((tile_idx < params.block_mapping.sk_tiles) ||
+          (tile_work.tiled_coord.m() >= params.block_mapping.tiled_shape().m()) ||
+          (tile_work.tiled_coord.n() >= params.block_mapping.tiled_shape().n()))
+      {
+        return;
+      }
+    }
+    else if (sk_block)
+    {
+      // This is a SK block
+      int block_iter_end;
+      params.block_mapping.get_iter_extents(block_idx, block_iter_begin, block_iter_end);
+      block_iters_remaining = block_iter_end - block_iter_begin;
+
+      tile_idx = params.block_mapping.get_sk_tile_idx(block_iter_end - 1);  // start from the tile holding the block's last iteration
+      init_sk_tile_work(tile_work, tile_idx, block_iter_begin, block_iter_begin + block_iters_remaining);
+    }
+    else
+    {
+      if (reduce_block)
+      {
+        // This is a reduction threadblock
+        int reduce_block_idx = block_idx - reduce_start_block_idx;
+        separate_reduction(reduce_block_idx);
+      }
+
+      return;  // reduction blocks are done; any other block that is neither DP nor SK has no work
+    }
+
+    // Iteration-processing loop body
+    CUTLASS_PRAGMA_NO_UNROLL
+    while (true)
+    {
+      // Perform this block's share of work for this tile
+      process_tile(
+        tile_work,
+        block_idx,
+        dp_start_block_idx,
+        block_iter_begin);
+
+      block_iters_remaining -= tile_work.k_iters_remaining;
+
+      if (block_iters_remaining == 0)
+      {
+        break;
+      }
+
+      // Continue to next tile
+      __syncthreads();
+
+      if (block_idx >= dp_start_block_idx)
+      {
+        // DP blocks consume their tiles at stride
+        tile_idx += params.block_mapping.avail_sms;
+        init_dp_tile_work(tile_work, tile_idx);
+      }
+      else
+      {
+        // SK blocks consume their tiles in backwards order
+        tile_idx--;
+        init_sk_tile_work(tile_work, tile_idx, block_iter_begin, block_iter_begin + block_iters_remaining);
+      }
+    }
+
+  }
+
+
+public:
+
+  //
+  // Device-only API
+  //
+
+  // Factory invocation: build a kernel object for this threadblock and run it once
+  CUTLASS_DEVICE
+  static void invoke(
+    Params const &params,
+    SharedStorage &shared_storage)
+  {
+    // Nothing else uses the object, so construct it as a temporary and call it directly
+    GemmWithEpilogueVisitorStreamk(params, shared_storage)();
+  }
+
+
+  CUTLASS_DEVICE
+  GemmWithEpilogueVisitorStreamk(   // Binds params/shared storage, derives thread/warp/lane indices, and builds the epilogue
+      Params const &params,
+      SharedStorage &shared_storage)
+    :
+      params(params),
+      shared_storage(shared_storage),
+      thread_idx(threadIdx.x),
+      warp_idx(__shfl_sync(0xffffffff, threadIdx.x / 32, 0)),   // broadcast the warp_id computed by lane 0 to ensure dependent code is compiled as warp-uniform
+      lane_idx(threadIdx.x % 32),
+      epilogue(                     // reads the thread_idx/warp_idx/lane_idx members, so it relies on them being initialized first
+        params.output_op,
+        shared_storage.epilogue,
+        thread_idx,
+        warp_idx,
+        lane_idx)
+  {}
+
+
+  /// Kernel body: runs this threadblock's share of one GEMM
+  CUTLASS_DEVICE
+  void operator()()
+  {
+    // Only the generic SK code path is dispatched from here
+    this->gemm();
+
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_with_absmax.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_with_absmax.h
new file mode 100755
index 000000000..470eaef53
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_with_absmax.h
@@ -0,0 +1,759 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Gemm kernel with an epilogue that computes the absolute maximum value of the output
+    and a pre-activation-function auxiliary output. The auxiliary output is also (optionally)
+    stored to global memory.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/layout/layout.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/complex.h"
+#include "cutlass/semaphore.h"
+#include "cutlass/gemm/kernel/params_universal_base.h"
+
+#include "cutlass/trace.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Gemm that computes the absolute maximum value of the output and a pre-activation-function
+// auxiliary output.
+template <
+  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate
+  typename Epilogue_,             ///! Epilogue
+  typename ThreadblockSwizzle_    ///! Threadblock swizzling function
+>
+struct GemmWithAbsMax {
+public:
+
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using EpilogueOutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+
+  using ElementA = typename Mma::IteratorA::Element;
+  using LayoutA = typename Mma::IteratorA::Layout;
+  using ElementB = typename Mma::IteratorB::Element;
+  using LayoutB = typename Mma::IteratorB::Layout;
+  using ElementC = typename Epilogue::OutputTileIterator::Element;
+  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
+
+  static ComplexTransform const kTransformA = Mma::kTransformA;
+  static ComplexTransform const kTransformB = Mma::kTransformB;
+  using Operator = typename Mma::Operator;
+
+  using OperatorClass = typename Mma::Operator::OperatorClass;
+  using ThreadblockShape = typename Mma::Shape;
+  using WarpShape = typename Mma::Operator::Shape;
+  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
+  using ArchTag = typename Mma::ArchTag;
+
+  static int const kStages = Mma::kStages;
+  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
+  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
+  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;
+
+  /// Split-K preserves splits that are 128b aligned
+  static int const kSplitKAlignment = const_max(
+    128 / sizeof_bits<ElementA>::value,
+    128 / sizeof_bits<ElementB>::value
+  );
+
+  //
+  // Structures
+  //
+
+  /// Argument structure: host-side description of one GEMM-with-absmax problem
+  struct Arguments : UniversalArgumentsBase
+  {
+    //
+    // Data members
+    //
+
+    typename EpilogueOutputOp::Params epilogue;   // parameters for the epilogue output operator
+
+    void const * ptr_A;
+    void const * ptr_B;
+    void const * ptr_C;
+    void * ptr_D;
+    void * ptr_Aux;                               // auxiliary output tensor
+
+    void * ptr_Vector;                            // vector operand; the kernel only offsets it when non-null
+
+    int64_t batch_stride_A;
+    int64_t batch_stride_B;
+    int64_t batch_stride_C;
+    int64_t batch_stride_Vector;
+
+    typename LayoutA::Stride::Index lda;
+    typename LayoutB::Stride::Index ldb;
+    typename LayoutC::Stride::Index ldc;
+    typename LayoutC::Stride::Index ldd;
+    typename LayoutC::Stride::Index ldaux;        // leading dimension of the auxiliary tensor
+    typename LayoutC::Stride::Index ldr;          // stride applied per threadblock row when offsetting ptr_Vector
+
+    //
+    // Methods
+    //
+
+    Arguments():   // default: every pointer null and every stride zero, so no member is left indeterminate
+      ptr_A(nullptr), ptr_B(nullptr), ptr_C(nullptr),
+      ptr_D(nullptr), ptr_Aux(nullptr), ptr_Vector(nullptr),
+      batch_stride_A(0), batch_stride_B(0),
+      batch_stride_C(0), batch_stride_Vector(0),
+      lda(0), ldb(0), ldc(0), ldd(0), ldaux(0), ldr(0)
+    {}
+
+    /// Constructs an arguments structure with ldaux (note: ldr precedes ldaux in the parameter list)
+    Arguments(
+      GemmUniversalMode mode,
+      GemmCoord problem_size,
+      int batch_count,
+      typename EpilogueOutputOp::Params epilogue,
+      void const * ptr_A,
+      void const * ptr_B,
+      void const * ptr_C,
+      void * ptr_D,
+      void * ptr_Aux,
+      void * ptr_Vector,
+      int64_t batch_stride_A,
+      int64_t batch_stride_B,
+      int64_t batch_stride_C,
+      int64_t batch_stride_D,
+      int64_t batch_stride_Vector,
+      typename LayoutA::Stride::Index lda,
+      typename LayoutB::Stride::Index ldb,
+      typename LayoutC::Stride::Index ldc,
+      typename LayoutC::Stride::Index ldd,
+      typename LayoutC::Stride::Index ldr,
+      typename LayoutC::Stride::Index ldaux)
+    :
+      UniversalArgumentsBase(mode, problem_size, batch_count, batch_stride_D),  // batch_stride_D is stored by the base class
+      epilogue(epilogue),
+      ptr_A(ptr_A), ptr_B(ptr_B), ptr_C(ptr_C), ptr_D(ptr_D), ptr_Aux(ptr_Aux),
+      ptr_Vector(ptr_Vector),
+      batch_stride_A(batch_stride_A),
+      batch_stride_B(batch_stride_B),
+      batch_stride_C(batch_stride_C),
+      batch_stride_Vector(batch_stride_Vector),
+      lda(lda), ldb(ldb), ldc(ldc), ldd(ldd), ldaux(ldaux), ldr(ldr)            // listed in member declaration order
+    {
+    }
+
+    /// Constructs an Arguments structure without ldaux.
+    /// The auxiliary leading dimension is set to ldd by delegating to the constructor above.
+    Arguments(
+      GemmUniversalMode mode,
+      GemmCoord problem_size,
+      int batch_count,
+      typename EpilogueOutputOp::Params epilogue,
+      void const * ptr_A,
+      void const * ptr_B,
+      void const * ptr_C,
+      void * ptr_D,
+      void * ptr_Aux,
+      void * ptr_Vector,
+      int64_t batch_stride_A,
+      int64_t batch_stride_B,
+      int64_t batch_stride_C,
+      int64_t batch_stride_D,
+      int64_t batch_stride_Vector,
+      typename LayoutA::Stride::Index lda,
+      typename LayoutB::Stride::Index ldb,
+      typename LayoutC::Stride::Index ldc,
+      typename LayoutC::Stride::Index ldd,
+      typename LayoutC::Stride::Index ldr)
+    : Arguments(mode, problem_size, batch_count, epilogue, ptr_A, ptr_B, ptr_C, ptr_D, ptr_Aux, ptr_Vector,
+               batch_stride_A, batch_stride_B, batch_stride_C, batch_stride_D, batch_stride_Vector,
+               lda, ldb, ldc, ldd, ldr, ldd)   // final argument is ldaux := ldd
+    {
+    }
+
+    /// Returns arguments for the transposed problem (M/N and the A/B operands exchanged; C/D/Aux/Vector fields are copied unchanged)
+    Arguments transposed_problem() const {
+      Arguments args(*this);
+
+      std::swap(args.problem_size.m(), args.problem_size.n());
+      std::swap(args.ptr_A, args.ptr_B);
+      std::swap(args.lda, args.ldb);
+      std::swap(args.batch_stride_A, args.batch_stride_B);
+
+      return args;
+    }
+  };
+
+
+  //
+  // Structure for precomputing values in host memory and passing to kernels
+  //
+
+  /// Parameters structure: kernel-side parameters derived from an Arguments instance
+  struct Params : UniversalParamsBase<
+    ThreadblockSwizzle,
+    ThreadblockShape,
+    ElementA,
+    ElementB,
+    ElementC,
+    LayoutA,
+    LayoutB>
+  {
+    using ParamsBase = UniversalParamsBase<
+      ThreadblockSwizzle,
+      ThreadblockShape,
+      ElementA,
+      ElementB,
+      ElementC,
+      LayoutA,
+      LayoutB>;
+
+    //
+    // Data members
+    //
+
+    typename Mma::IteratorA::Params params_A;                      // built from args.lda
+    typename Mma::IteratorB::Params params_B;                      // built from args.ldb
+    typename Epilogue::OutputTileIterator::Params params_C;        // built from args.ldc
+    typename Epilogue::OutputTileIterator::Params params_D;        // built from args.ldd
+    typename Epilogue::AuxOutputTileIterator::Params params_Aux;   // built from args.ldaux
+
+    typename EpilogueOutputOp::Params output_op;
+
+    void * ptr_A;   // A/B/C are stored non-const here; constness is cast away from Arguments
+    void * ptr_B;
+    void * ptr_C;
+    void * ptr_D;
+    void * ptr_Aux;
+
+    void * ptr_Vector;
+    typename LayoutC::Stride::Index ldr;
+
+    int64_t batch_stride_A;
+    int64_t batch_stride_B;
+    int64_t batch_stride_C;
+    int64_t batch_stride_Vector;   // there is no separate Aux batch stride member
+
+    //
+    // Host dispatch API
+    //
+
+    /// Default constructor
+    Params() = default;
+
+    /// Constructor: copies pointers and strides from args and builds the iterator parameter objects
+    Params(
+      Arguments const &args,  /// GEMM application arguments
+      int device_sms,         /// Number of SMs on the device
+      int sm_occupancy)       /// Kernel SM occupancy (in thread blocks)
+    :
+      ParamsBase(args, device_sms, sm_occupancy),
+      params_A(args.lda),
+      params_B(args.ldb),
+      params_C(args.ldc),
+      params_D(args.ldd),
+      params_Aux(args.ldaux),
+      output_op(args.epilogue),
+      ptr_A(const_cast<void *>(args.ptr_A)),
+      ptr_B(const_cast<void *>(args.ptr_B)),
+      ptr_C(const_cast<void *>(args.ptr_C)),
+      ptr_D(args.ptr_D),
+      ptr_Aux(args.ptr_Aux),
+      ptr_Vector(args.ptr_Vector),
+      ldr(args.ldr),
+      batch_stride_A(args.batch_stride_A),
+      batch_stride_B(args.batch_stride_B),
+      batch_stride_C(args.batch_stride_C),
+      batch_stride_Vector(args.batch_stride_Vector)
+    {
+
+    }
+
+    /// Lightweight update given a subset of arguments: refreshes pointers, ldr, batch strides and output_op only.
+    CUTLASS_HOST_DEVICE
+    void update(Arguments const &args)
+    {
+      ptr_A = const_cast<void *>(args.ptr_A);
+      ptr_B = const_cast<void *>(args.ptr_B);
+      ptr_C = const_cast<void *>(args.ptr_C);
+      ptr_D = args.ptr_D;
+      ptr_Aux = args.ptr_Aux;
+
+      ptr_Vector = args.ptr_Vector;
+      ldr = args.ldr;
+
+      batch_stride_A = args.batch_stride_A;
+      batch_stride_B = args.batch_stride_B;
+      batch_stride_C = args.batch_stride_C;
+      this->batch_stride_D = args.batch_stride_D;   // assigned through this->, as in the original; not a member declared in this struct
+      batch_stride_Vector = args.batch_stride_Vector;
+
+      output_op = args.epilogue;   // params_A/B/C/D/Aux are not reassigned by update()
+    }
+  };
+
+
+  /// Shared memory storage structure
+  union SharedStorage {                           // a union: the two members below share the same storage
+    typename Mma::SharedStorage main_loop;        // passed to the Mma constructor for the main loop
+    typename Epilogue::SharedStorage epilogue;    // passed to the Epilogue constructor
+  };
+
+public:
+
+  //
+  // Host dispatch API
+  //
+
+  /// Checks that the problem extents are multiples of each operand's access alignment
+  static Status can_implement(
+    cutlass::gemm::GemmCoord const & problem_size) {
+
+    // Alignment requirements are the class-level kAlignmentA / kAlignmentB / kAlignmentC.
+    // Which extent is tested depends on the operand's layout; a layout that is not
+    // listed in the branches below is never reported as misaligned.
+
+    bool a_misaligned = false;
+    bool b_misaligned = false;
+    bool c_misaligned = false;
+
+    // A: column-major is checked along M; row-major and column-major-interleaved along K
+    if (platform::is_same<LayoutA, layout::ColumnMajor>::value) {
+      a_misaligned = (problem_size.m() % kAlignmentA) != 0;
+    } else if (platform::is_same<LayoutA, layout::RowMajor>::value
+            || platform::is_same<LayoutA, layout::ColumnMajorInterleaved<32>>::value
+            || platform::is_same<LayoutA, layout::ColumnMajorInterleaved<64>>::value) {
+      a_misaligned = (problem_size.k() % kAlignmentA) != 0;
+    }
+
+    // B: row-major is checked along N; column-major and row-major-interleaved along K
+    if (platform::is_same<LayoutB, layout::RowMajor>::value) {
+      b_misaligned = (problem_size.n() % kAlignmentB) != 0;
+    } else if (platform::is_same<LayoutB, layout::ColumnMajor>::value
+            || platform::is_same<LayoutB, layout::RowMajorInterleaved<32>>::value
+            || platform::is_same<LayoutB, layout::RowMajorInterleaved<64>>::value) {
+      b_misaligned = (problem_size.k() % kAlignmentB) != 0;
+    }
+
+    // C: column-major is checked along M; row-major and column-major-interleaved along N
+    if (platform::is_same<LayoutC, layout::ColumnMajor>::value) {
+      c_misaligned = (problem_size.m() % kAlignmentC) != 0;
+    } else if (platform::is_same<LayoutC, layout::RowMajor>::value
+            || platform::is_same<LayoutC, layout::ColumnMajorInterleaved<32>>::value
+            || platform::is_same<LayoutC, layout::ColumnMajorInterleaved<64>>::value) {
+      c_misaligned = (problem_size.n() % kAlignmentC) != 0;
+    }
+
+    if (a_misaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for A operand");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (b_misaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for B operand");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (c_misaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for C operand");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    CUTLASS_TRACE_HOST("  returning kSuccess");
+
+    return Status::kSuccess;
+  }
+
+  static Status can_implement(Arguments const &args) {
+    return GemmWithAbsMax::can_implement(args.problem_size);   // only the problem size is validated
+  }
+
+public:
+
+  //
+  // Device-only API
+  //
+
+  // Factory invocation: build a kernel object and run it once for this threadblock
+  CUTLASS_DEVICE
+  static void invoke(
+    Params const &params,
+    SharedStorage &shared_storage)
+  {
+    // GemmWithAbsMax declares no non-static data members, so a temporary is enough
+    GemmWithAbsMax{}(params, shared_storage);
+  }
+
+  /// Executes one GEMM: main loop over this threadblock's K range, then the epilogue (serialized by a semaphore under split-K)
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Compute threadblock location
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if CTA is out of range
+    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
+      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
+
+      return;
+    }
+
+    int offset_k = 0;
+    int problem_size_k = params.problem_size.k();
+
+    ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A);
+    ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);
+
+    //
+    // Fetch pointers based on mode. The k() tile coordinate is the K-slice index in the GEMM modes and the batch index otherwise.
+    //
+    if (params.mode == GemmUniversalMode::kGemm ||
+      params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+
+      if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {
+
+        problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size;  // every slice but the last ends at a gemm_k_size boundary
+      }
+
+      offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
+    }
+    else if (params.mode == GemmUniversalMode::kBatched) {
+      ptr_A += threadblock_tile_offset.k() * params.batch_stride_A;
+      ptr_B += threadblock_tile_offset.k() * params.batch_stride_B;
+    }
+    else if (params.mode == GemmUniversalMode::kArray) {
+      ptr_A = static_cast<ElementA * const *>(params.ptr_A)[threadblock_tile_offset.k()];  // ptr_A/ptr_B are arrays of per-batch pointers in this mode
+      ptr_B = static_cast<ElementB * const *>(params.ptr_B)[threadblock_tile_offset.k()];
+    }
+
+    __syncthreads();
+
+    // Compute initial location in logical coordinates
+    cutlass::MatrixCoord tb_offset_A{
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      offset_k,
+    };
+
+    cutlass::MatrixCoord tb_offset_B{
+      offset_k,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    };
+
+    // Compute position within threadblock
+    int thread_idx = threadIdx.x;
+
+    // Construct iterators to A and B operands
+    typename Mma::IteratorA iterator_A(
+      params.params_A,
+      ptr_A,
+      {params.problem_size.m(), problem_size_k},
+      thread_idx,
+      tb_offset_A);
+
+    typename Mma::IteratorB iterator_B(
+      params.params_B,
+      ptr_B,
+      {problem_size_k, params.problem_size.n()},
+      thread_idx,
+      tb_offset_B);
+
+    // Broadcast the warp_id computed by lane 0 to ensure dependent code
+    // is compiled as warp-uniform.
+    int warp_idx = canonical_warp_idx_sync();
+
+    int lane_idx = threadIdx.x % 32;
+
+    //
+    // Main loop
+    //
+
+    // Construct thread-scoped matrix multiply
+    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
+
+    typename Mma::FragmentC accumulators;
+
+    accumulators.clear();
+
+    // Number of main-loop iterations: ceil((problem_size_k - offset_k) / Mma::Shape::kK)
+    int gemm_k_iterations = (problem_size_k - offset_k + Mma::Shape::kK - 1) / Mma::Shape::kK;
+
+    // Compute threadblock-scoped matrix multiply-add
+    mma(
+      gemm_k_iterations,
+      accumulators,
+      iterator_A,
+      iterator_B,
+      accumulators);
+
+    //
+    // Epilogue
+    //
+
+    EpilogueOutputOp output_op(params.output_op);
+
+    //
+    // Masked tile iterators constructed from members
+    //
+
+    threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    //assume identity swizzle
+    MatrixCoord threadblock_offset(
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    );
+
+    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();  // linear (m, n) tile index; selects this tile's semaphore below
+
+    ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C);
+    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
+    typename Epilogue::ElementAuxOutput *ptr_Aux = static_cast<typename Epilogue::ElementAuxOutput *>(params.ptr_Aux);
+    typename Epilogue::ElementVector *ptr_Vector = static_cast<typename Epilogue::ElementVector *>(params.ptr_Vector);
+
+    //
+    // Fetch pointers based on mode.
+    //
+
+    //
+    // Special path when split-K not enabled.
+    //
+
+    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() == 1) {
+
+      // Tile iterators loading from source tensors.
+      typename Epilogue::OutputTileIterator iterator_C(
+        params.params_C,
+        ptr_C,
+        params.problem_size.mn(),
+        thread_idx,
+        threadblock_offset
+      );
+
+      // Tile iterator writing to destination tensor.
+      typename Epilogue::OutputTileIterator iterator_D(
+        params.params_D,
+        ptr_D,
+        params.problem_size.mn(),
+        thread_idx,
+        threadblock_offset
+      );
+
+      // Tile iterator writing to auxiliary tensor.
+      typename Epilogue::AuxOutputTileIterator iterator_Aux(
+        params.params_Aux,
+        ptr_Aux,
+        params.problem_size.mn(),
+        thread_idx,
+        threadblock_offset
+      );
+
+      // Construct the epilogue
+      Epilogue epilogue(
+        shared_storage.epilogue,
+        thread_idx,
+        warp_idx,
+        lane_idx);
+
+      // Move to appropriate location for this output tile
+      if (ptr_Vector) {
+        ptr_Vector += threadblock_offset.column() + threadblock_tile_offset.m() * params.ldr;
+      }
+
+      // Execute the epilogue operator to update the destination tensor.
+      epilogue(output_op,
+               ptr_Vector,
+               iterator_D,
+               accumulators,
+               iterator_C,
+               iterator_Aux,
+               params.problem_size.mn(),
+               threadblock_offset);
+
+      return;
+    }
+
+    //
+    // Slower path when split-K or batching is needed
+    //
+
+    // Construct the semaphore.
+    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
+
+    if (params.mode == GemmUniversalMode::kGemm) {
+
+      // If performing a reduction via split-K, fetch the initial synchronization
+      if (params.grid_tiled_shape.k() > 1) {
+
+        // Fetch the synchronization lock initially but do not block.
+        semaphore.fetch();
+
+        // Indicate which position in a serial reduction the output operator is currently updating
+        output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
+      }
+    }
+    else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;  // each K slice writes its own D partition
+    }
+    else if (params.mode == GemmUniversalMode::kBatched) {
+      ptr_C += threadblock_tile_offset.k() * params.batch_stride_C;
+      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
+      if (ptr_Aux) {
+        ptr_Aux += threadblock_tile_offset.k() * params.batch_stride_D;  // Aux is advanced with D's batch stride; Params has no separate Aux stride
+      }
+      if (ptr_Vector) {
+        ptr_Vector += threadblock_tile_offset.k() * params.batch_stride_Vector;
+      }
+    }
+    else if (params.mode == GemmUniversalMode::kArray) {
+      ptr_C = static_cast<ElementC * const *>(params.ptr_C)[threadblock_tile_offset.k()];
+      ptr_D = static_cast<ElementC * const *>(params.ptr_D)[threadblock_tile_offset.k()];
+      if (ptr_Aux) {
+        ptr_Aux = static_cast<typename Epilogue::ElementAuxOutput * const *>(params.ptr_Aux)[threadblock_tile_offset.k()];
+      }
+      if (ptr_Vector) {
+        ptr_Vector = static_cast<typename Epilogue::ElementVector * const *>(params.ptr_Vector)[threadblock_tile_offset.k()];
+      }
+    }
+
+    // Tile iterators loading from source tensors.
+    typename Epilogue::OutputTileIterator iterator_C(
+      params.params_C,
+      ptr_C,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset
+    );
+
+    // Tile iterator writing to destination tensor.
+    typename Epilogue::OutputTileIterator iterator_D(
+      params.params_D,
+      ptr_D,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset
+    );
+
+    // Tile iterator writing to auxiliary destination tensor.
+    typename Epilogue::AuxOutputTileIterator iterator_Aux(
+      params.params_Aux,
+      // Only the final block writes the auxiliary tensor (serial split-K: every earlier K partition gets nullptr)
+      ((params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) &&
+          (params.grid_tiled_shape.k() != threadblock_tile_offset.k() + 1))
+          ? nullptr
+          : ptr_Aux,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset
+    );
+
+    // Construct the epilogue
+    Epilogue epilogue(
+      shared_storage.epilogue,
+      thread_idx,
+      warp_idx,
+      lane_idx);
+
+    // Wait on the semaphore - this latency may have been covered by iterator construction
+    if ((params.mode == GemmUniversalMode::kGemm) && params.grid_tiled_shape.k() > 1) {
+
+      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
+      if (threadblock_tile_offset.k()) {
+        iterator_C = iterator_D;
+      }
+
+      semaphore.wait(threadblock_tile_offset.k());  // wait value is this block's K-partition index
+
+    }
+
+    // Move to appropriate location for this output tile
+    if (ptr_Vector) {
+      ptr_Vector += threadblock_offset.column() + threadblock_tile_offset.m() * params.ldr;
+    }
+
+    // Execute the epilogue operator to update the destination tensor.
+    epilogue(output_op,
+             // Only the final block uses Vector (same condition as for iterator_Aux above)
+             ((params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) &&
+              (params.grid_tiled_shape.k() != threadblock_tile_offset.k() + 1))
+                 ? nullptr
+                 : ptr_Vector,
+             iterator_D,
+             accumulators,
+             iterator_C,
+             iterator_Aux,
+             params.problem_size.mn(),
+             threadblock_offset);
+
+    //
+    // Release the semaphore
+    //
+
+    if ((params.mode == GemmUniversalMode::kGemm)  && params.grid_tiled_shape.k() > 1) {
+
+      int lock = 0;
+      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
+
+        // The final threadblock resets the semaphore for subsequent grids.
+        lock = 0;
+      }
+      else {
+        // Otherwise, the semaphore is incremented
+        lock = threadblock_tile_offset.k() + 1;  // the value the next K partition passes to semaphore.wait()
+      }
+
+      semaphore.release(lock);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_with_fused_epilogue.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_with_fused_epilogue.h
new file mode 100755
index 000000000..363d109ce
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_with_fused_epilogue.h
@@ -0,0 +1,1512 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Gemm kernel with fused reduction operation.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/layout/layout.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/complex.h"
+#include "cutlass/semaphore.h"
+#include "cutlass/gemm/kernel/params_universal_base.h"
+#include "cutlass/subbyte_reference.h"
+#include "cutlass/trace.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Primary template, declared only; the IsSingleSource = false / true specializations follow.
+template <
+  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate
+  typename Epilogue_,             ///! Epilogue
+  typename ThreadblockSwizzle_,   ///! Threadblock swizzling function
+  bool IsSingleSource = Epilogue_::kIsSingleSource   ///! Selects the one-source (true) or two-source (false) kernel
+>
+struct GemmWithFusedEpilogue;
+
+// GemmWithFusedEpilogue with two sources
+template <
+  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate
+  typename Epilogue_,             ///! Epilogue
+  typename ThreadblockSwizzle_    ///! Threadblock swizzling function
+>
+struct GemmWithFusedEpilogue<Mma_, Epilogue_, ThreadblockSwizzle_, false> {
+public:
+
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using EpilogueOutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+
+  using ElementA = typename Mma::IteratorA::Element;
+  using LayoutA = typename Mma::IteratorA::Layout;
+  using ElementB = typename Mma::IteratorB::Element;
+  using LayoutB = typename Mma::IteratorB::Layout;
+  using ElementC = typename Epilogue::OutputTileIterator::Element;
+  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
+
+  static ComplexTransform const kTransformA = Mma::kTransformA;
+  static ComplexTransform const kTransformB = Mma::kTransformB;
+  using Operator = typename Mma::Operator;
+
+  using OperatorClass = typename Mma::Operator::OperatorClass;
+  using ThreadblockShape = typename Mma::Shape;
+  using WarpShape = typename Mma::Operator::Shape;
+  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
+  using ArchTag = typename Mma::ArchTag;
+
+  static int const kStages = Mma::kStages;
+  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
+  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
+  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;
+
+  /// Split-K preserves splits that are 128b aligned
+  static int const kSplitKAlignment = const_max(
+    128 / sizeof_bits<ElementA>::value,
+    128 / sizeof_bits<ElementB>::value
+  );
+
+  //
+  // Structures
+  //
+
+  /// Host-side argument pack for the two-source (C1, C2) fused-epilogue GEMM
+  struct Arguments : UniversalArgumentsBase {
+
+    //
+    // Data members (pointers are type-erased; the kernel casts them to element types)
+    //
+
+    typename EpilogueOutputOp::Params epilogue;
+
+    void const * ptr_A;
+    void const * ptr_B;
+    void const * ptr_C1;
+    void const * ptr_C2;
+    void * ptr_D;
+    // Extra epilogue operands; the kernel null-checks both before offsetting them
+    void * ptr_Vector;
+    void * ptr_Tensor;
+
+    int64_t batch_stride_A;
+    int64_t batch_stride_B;
+    int64_t batch_stride_C1;
+    int64_t batch_stride_C2;
+    int64_t batch_stride_Vector;
+    int64_t batch_stride_Tensor;
+
+    typename LayoutA::Stride::Index lda;
+    typename LayoutB::Stride::Index ldb;
+    typename LayoutC::Stride::Index ldc1;
+    typename LayoutC::Stride::Index ldc2;
+    typename LayoutC::Stride::Index ldd;
+    typename LayoutC::Stride::Index ldr;
+    typename LayoutC::Stride::Index ldt;
+
+    //
+    // Methods
+    //
+    /// Default constructor: all pointers null, all batch strides and leading dimensions zero
+    Arguments():
+      ptr_A(nullptr), ptr_B(nullptr), ptr_C1(nullptr), ptr_C2(nullptr), ptr_D(nullptr),
+      ptr_Vector(nullptr), ptr_Tensor(nullptr),
+      batch_stride_A(0), batch_stride_B(0), batch_stride_C1(0), batch_stride_C2(0),
+      batch_stride_Vector(0), batch_stride_Tensor(0),
+      lda(0), ldb(0), ldc1(0), ldc2(0), ldd(0), ldr(0), ldt(0)
+    {}
+
+    /// Constructs a fully specified argument pack; batch_stride_D is forwarded to the base class
+    Arguments(
+      GemmUniversalMode mode,
+      GemmCoord problem_size,
+      int batch_count,
+      typename EpilogueOutputOp::Params epilogue,
+      void const * ptr_A,
+      void const * ptr_B,
+      void const * ptr_C1,
+      void const * ptr_C2,
+      void * ptr_D,
+      void * ptr_Vector,
+      void * ptr_Tensor,
+      int64_t batch_stride_A,
+      int64_t batch_stride_B,
+      int64_t batch_stride_C1,
+      int64_t batch_stride_C2,
+      int64_t batch_stride_D,
+      int64_t batch_stride_Vector,
+      int64_t batch_stride_Tensor,
+      typename LayoutA::Stride::Index lda,
+      typename LayoutB::Stride::Index ldb,
+      typename LayoutC::Stride::Index ldc1,
+      typename LayoutC::Stride::Index ldc2,
+      typename LayoutC::Stride::Index ldd,
+      typename LayoutC::Stride::Index ldr,
+      typename LayoutC::Stride::Index ldt)
+    :
+      UniversalArgumentsBase(mode, problem_size, batch_count, batch_stride_D),
+      epilogue(epilogue),
+      ptr_A(ptr_A), ptr_B(ptr_B), ptr_C1(ptr_C1), ptr_C2(ptr_C2), ptr_D(ptr_D),
+      ptr_Vector(ptr_Vector),
+      ptr_Tensor(ptr_Tensor),
+      batch_stride_A(batch_stride_A),
+      batch_stride_B(batch_stride_B),
+      batch_stride_C1(batch_stride_C1),
+      batch_stride_C2(batch_stride_C2),
+      batch_stride_Vector(batch_stride_Vector),
+      batch_stride_Tensor(batch_stride_Tensor),
+      lda(lda), ldb(ldb), ldc1(ldc1), ldc2(ldc2), ldd(ldd), ldr(ldr), ldt(ldt)
+    {
+      CUTLASS_TRACE_HOST("GemmWithFusedEpilogue::Arguments::Arguments() - problem_size: " << problem_size);
+      CUTLASS_TRACE_HOST("  ptr_Vector: " << (void *)this->ptr_Vector);
+      CUTLASS_TRACE_HOST("  ptr_Tensor: " << (void *)this->ptr_Tensor);
+      CUTLASS_TRACE_HOST("  ldr: " << this->ldr);
+      CUTLASS_TRACE_HOST("  ldt: " << this->ldt);
+    }
+
+    /// Returns a copy with M/N, the A/B pointers, lda/ldb and the A/B batch strides swapped
+    Arguments transposed_problem() const {
+      Arguments args(*this);
+
+      std::swap(args.problem_size.m(), args.problem_size.n());
+      std::swap(args.ptr_A, args.ptr_B);
+      std::swap(args.lda, args.ldb);
+      std::swap(args.batch_stride_A, args.batch_stride_B);
+
+      return args;
+    }
+  };
+
+
+  //
+  // Structure for precomputing values in host memory and passing to kernels
+  //
+
+  /// Kernel parameters: the host Arguments precomputed into the form the kernel reads
+  struct Params : UniversalParamsBase<
+    ThreadblockSwizzle,
+    ThreadblockShape,
+    ElementA,
+    ElementB,
+    ElementC,
+    LayoutA,
+    LayoutB>
+  {
+    using ParamsBase = UniversalParamsBase<
+      ThreadblockSwizzle,
+      ThreadblockShape,
+      ElementA,
+      ElementB,
+      ElementC,
+      LayoutA,
+      LayoutB>;
+
+    //
+    // Data members: iterator params are built from the leading dimensions in Arguments
+    //
+
+    typename Mma::IteratorA::Params params_A;
+    typename Mma::IteratorB::Params params_B;
+    typename Epilogue::OutputTileIterator::Params params_C1;
+    typename Epilogue::OutputTileIterator::Params params_C2;
+    typename Epilogue::OutputTileIterator::Params params_D;
+    typename Epilogue::TensorTileIterator::Params params_Tensor;
+    typename EpilogueOutputOp::Params output_op;
+    // Type-erased operand pointers; const is cast away from the A, B, C1 and C2 arguments
+    void * ptr_A;
+    void * ptr_B;
+    void * ptr_C1;
+    void * ptr_C2;
+    void * ptr_D;
+    // Vector operand and the stride (ldr) the kernel applies to it per threadblock tile row
+    void * ptr_Vector;
+    typename LayoutC::Stride::Index ldr;
+
+    void * ptr_Tensor;
+
+    int64_t batch_stride_A;
+    int64_t batch_stride_B;
+    int64_t batch_stride_C1;
+    int64_t batch_stride_C2;
+    int64_t batch_stride_Vector;
+    int64_t batch_stride_Tensor;
+
+    //
+    // Host dispatch API
+    //
+
+    /// Default constructor
+    Params() = default;
+
+    /// Builds kernel parameters from Arguments; device_sms and sm_occupancy go to the base class
+    Params(
+      Arguments const &args,  /// GEMM application arguments
+      int device_sms,         /// Number of SMs on the device
+      int sm_occupancy)       /// Kernel SM occupancy (in thread blocks)
+    :
+      ParamsBase(args, device_sms, sm_occupancy),
+      params_A(args.lda),
+      params_B(args.ldb),
+      params_C1(args.ldc1),
+      params_C2(args.ldc2),
+      params_D(args.ldd),
+      params_Tensor(args.ldt),
+      output_op(args.epilogue),
+      ptr_A(const_cast<void *>(args.ptr_A)),
+      ptr_B(const_cast<void *>(args.ptr_B)),
+      ptr_C1(const_cast<void *>(args.ptr_C1)),
+      ptr_C2(const_cast<void *>(args.ptr_C2)),
+      ptr_D(args.ptr_D),
+      ptr_Vector(args.ptr_Vector),
+      ldr(args.ldr),
+      ptr_Tensor(args.ptr_Tensor),
+      batch_stride_A(args.batch_stride_A),
+      batch_stride_B(args.batch_stride_B),
+      batch_stride_C1(args.batch_stride_C1),
+      batch_stride_C2(args.batch_stride_C2),
+      batch_stride_Vector(args.batch_stride_Vector),
+      batch_stride_Tensor(args.batch_stride_Tensor)
+    {
+      CUTLASS_TRACE_HOST("GemmWithFusedEpilogue::Params::Params()");
+      CUTLASS_TRACE_HOST("  ptr_Vector: " << (void *)this->ptr_Vector);
+      CUTLASS_TRACE_HOST("  ptr_Tensor: " << (void *)this->ptr_Tensor);
+      CUTLASS_TRACE_HOST("  ldr: " << this->ldr);
+      CUTLASS_TRACE_HOST("  ldt: " << args.ldt);
+    }
+
+    /// Lightweight update: refreshes pointers, batch strides, ldr and the epilogue params only.
+    CUTLASS_HOST_DEVICE
+    void update(Arguments const &args)
+    {
+      ptr_A = const_cast<void *>(args.ptr_A);
+      ptr_B = const_cast<void *>(args.ptr_B);
+      ptr_C1 = const_cast<void *>(args.ptr_C1);
+      ptr_C2 = const_cast<void *>(args.ptr_C2);
+      ptr_D = args.ptr_D;
+
+      ptr_Vector = args.ptr_Vector;
+      ldr = args.ldr;
+      ptr_Tensor = args.ptr_Tensor;
+
+      batch_stride_A = args.batch_stride_A;
+      batch_stride_B = args.batch_stride_B;
+      batch_stride_C1 = args.batch_stride_C1;
+      batch_stride_C2 = args.batch_stride_C2;
+      batch_stride_Vector = args.batch_stride_Vector;
+      batch_stride_Tensor = args.batch_stride_Tensor;
+      this->batch_stride_D = args.batch_stride_D;
+
+      output_op = args.epilogue;
+      // Note: params_A/B/C1/C2/D/Tensor are not refreshed by this method.
+      CUTLASS_TRACE_HOST("GemmWithFusedEpilogue::Params::update()");
+      CUTLASS_TRACE_HOST("  ptr_Vector: " << (void *)this->ptr_Vector);
+      CUTLASS_TRACE_HOST("  ptr_Tensor: " << (void *)this->ptr_Tensor);
+      CUTLASS_TRACE_HOST("  ldr: " << this->ldr);
+    }
+  };
+
+
+  /// Shared memory storage; a union, so the main-loop and epilogue storage overlay each other
+  union SharedStorage {
+    typename Mma::SharedStorage main_loop;
+    typename Epilogue::SharedStorage epilogue;
+  };
+
+public:
+
+  //
+  // Host dispatch API
+  //
+
+  /// Checks the problem extents against each operand's elements-per-access alignment.
+  /// Operands are examined in the order A, B, C and the first one whose checked extent
+  /// leaves a remainder yields Status::kErrorMisalignedOperand; otherwise kSuccess.
+  /// Layouts other than the ones tested below are never reported as misaligned.
+  static Status can_implement(
+    cutlass::gemm::GemmCoord const & problem_size) {
+
+    CUTLASS_TRACE_HOST("GemmWithFusedEpilogue::can_implement()");
+
+    // Elements per access for each operand
+    int const align_A = Mma::IteratorA::AccessType::kElements;
+    int const align_B = Mma::IteratorB::AccessType::kElements;
+    int const align_C = Epilogue::OutputTileIterator::kElementsPerAccess;
+
+    // Operand A: M is checked for column-major, K for row-major and interleaved column-major
+    int remainder_A = 0;
+    if (platform::is_same<LayoutA, layout::ColumnMajor>::value) {
+      remainder_A = problem_size.m() % align_A;
+    } else if (platform::is_same<LayoutA, layout::RowMajor>::value
+            || platform::is_same<LayoutA, layout::ColumnMajorInterleaved<32>>::value
+            || platform::is_same<LayoutA, layout::ColumnMajorInterleaved<64>>::value) {
+      remainder_A = problem_size.k() % align_A;
+    }
+    if (remainder_A != 0) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for A operand");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    // Operand B: N is checked for row-major, K for column-major and interleaved row-major
+    int remainder_B = 0;
+    if (platform::is_same<LayoutB, layout::RowMajor>::value) {
+      remainder_B = problem_size.n() % align_B;
+    } else if (platform::is_same<LayoutB, layout::ColumnMajor>::value
+            || platform::is_same<LayoutB, layout::RowMajorInterleaved<32>>::value
+            || platform::is_same<LayoutB, layout::RowMajorInterleaved<64>>::value) {
+      remainder_B = problem_size.k() % align_B;
+    }
+    if (remainder_B != 0) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for B operand");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    // Operand C: M is checked for column-major, N for row-major and interleaved column-major
+    int remainder_C = 0;
+    if (platform::is_same<LayoutC, layout::ColumnMajor>::value) {
+      remainder_C = problem_size.m() % align_C;
+    } else if (platform::is_same<LayoutC, layout::RowMajor>::value
+            || platform::is_same<LayoutC, layout::ColumnMajorInterleaved<32>>::value
+            || platform::is_same<LayoutC, layout::ColumnMajorInterleaved<64>>::value) {
+      remainder_C = problem_size.n() % align_C;
+    }
+    if (remainder_C != 0) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for C operand");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    CUTLASS_TRACE_HOST("  returning kSuccess");
+
+    return Status::kSuccess;
+  }
+  /// Convenience overload: validates only args.problem_size via the GemmCoord overload.
+  static Status can_implement(Arguments const &args) {
+    return can_implement(args.problem_size);
+  }
+
+public:
+
+  //
+  // Device-only API
+  //
+
+  /// Device-side entry point: default-constructs the kernel functor and runs it once.
+  CUTLASS_DEVICE
+  static void invoke(
+    Params const &params,
+    SharedStorage &shared_storage)
+  {
+    GemmWithFusedEpilogue kernel;
+    kernel(params, shared_storage);
+  }
+
+  #define SPLIT_K_ENABLED 1
+
+  /// Executes one threadblock's share of the GEMM: the main loop, then the fused epilogue.
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Compute threadblock location
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if CTA is out of range
+    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
+      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
+
+      return;
+    }
+    // K range [offset_k, problem_size_k) handled by this threadblock; narrowed below for split-K
+    int offset_k = 0;
+    int problem_size_k = params.problem_size.k();
+
+    ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A);
+    ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);
+
+
+    #if SPLIT_K_ENABLED
+    //
+    // Fetch pointers based on mode.
+    //
+    if (params.mode == GemmUniversalMode::kGemm ||
+      params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+
+      if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {
+
+        problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size;
+      }
+
+      offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
+    }
+    else if (params.mode == GemmUniversalMode::kBatched) {
+      ptr_A += threadblock_tile_offset.k() * params.batch_stride_A;
+      ptr_B += threadblock_tile_offset.k() * params.batch_stride_B;
+    }
+    else if (params.mode == GemmUniversalMode::kArray) {
+      ptr_A = static_cast<ElementA * const *>(params.ptr_A)[threadblock_tile_offset.k()];
+      ptr_B = static_cast<ElementB * const *>(params.ptr_B)[threadblock_tile_offset.k()];
+    }
+    #endif
+
+    // Compute initial location in logical coordinates
+    cutlass::MatrixCoord tb_offset_A{
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      offset_k,
+    };
+
+    cutlass::MatrixCoord tb_offset_B{
+      offset_k,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    };
+
+    // Compute position within threadblock
+    int thread_idx = threadIdx.x;
+
+    // Construct iterators to A and B operands
+    typename Mma::IteratorA iterator_A(
+      params.params_A,
+      ptr_A,
+      {params.problem_size.m(), problem_size_k},
+      thread_idx,
+      tb_offset_A);
+
+    typename Mma::IteratorB iterator_B(
+      params.params_B,
+      ptr_B,
+      {problem_size_k, params.problem_size.n()},
+      thread_idx,
+      tb_offset_B);
+
+    // Broadcast the warp_id computed by lane 0 to ensure dependent code
+    // is compiled as warp-uniform.
+    int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
+
+    int lane_idx = threadIdx.x % 32;
+
+    //
+    // Main loop
+    //
+
+    // Construct thread-scoped matrix multiply
+    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
+
+    typename Mma::FragmentC accumulators;
+
+    accumulators.clear();
+
+    // Number of main-loop iterations: this block's K extent divided by Mma::Shape::kK, rounded up
+    int gemm_k_iterations = (problem_size_k - offset_k + Mma::Shape::kK - 1) / Mma::Shape::kK;
+
+    // Compute threadblock-scoped matrix multiply-add
+    mma(
+      gemm_k_iterations,
+      accumulators,
+      iterator_A,
+      iterator_B,
+      accumulators);
+
+    //
+    // Epilogue
+    //
+
+    EpilogueOutputOp output_op(params.output_op);
+
+    //
+    // Masked tile iterators constructed from members
+    //
+
+    threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // NOTE: assumes identity swizzle
+    MatrixCoord threadblock_offset(
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    );
+
+    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
+
+    ElementC *ptr_C1 = static_cast<ElementC *>(params.ptr_C1);
+    ElementC *ptr_C2 = static_cast<ElementC *>(params.ptr_C2);
+    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
+    typename Epilogue::ElementTensor *ptr_Tensor = static_cast<typename Epilogue::ElementTensor *>(params.ptr_Tensor);
+
+    // Define the reduction output pointer and move to the appropriate place
+    typename Epilogue::ElementVector *ptr_Vector =
+      static_cast<typename Epilogue::ElementVector *>(params.ptr_Vector);
+
+    //
+    // Epilogue dispatch: the single-partition kGemm fast path first, then the general path.
+    //
+
+    //
+    // Special path when split-K not enabled.
+    //
+
+    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() == 1) {
+
+      // Tile iterators loading from source tensors.
+      typename Epilogue::OutputTileIterator iterator_C1(
+        params.params_C1,
+        ptr_C1,
+        params.problem_size.mn(),
+        thread_idx,
+        threadblock_offset
+      );
+
+      typename Epilogue::OutputTileIterator iterator_C2(
+        params.params_C2,
+        ptr_C2,
+        params.problem_size.mn(),
+        thread_idx,
+        threadblock_offset
+      );
+
+      // Tile iterator writing to destination tensor.
+      typename Epilogue::OutputTileIterator iterator_D(
+        params.params_D,
+        ptr_D,
+        params.problem_size.mn(),
+        thread_idx,
+        threadblock_offset
+      );
+
+      // Additional tensor to load from
+      typename Epilogue::TensorTileIterator tensor_iterator(
+          params.params_Tensor,
+          // Single K partition on this path, so the tensor pointer is passed unconditionally
+          ptr_Tensor,
+          params.problem_size.mn(),
+          thread_idx,
+          threadblock_offset);
+
+      // Construct the epilogue
+      Epilogue epilogue(
+        shared_storage.epilogue,
+        thread_idx,
+        warp_idx,
+        lane_idx);
+
+      // Move to appropriate location for this output tile
+      if (ptr_Vector) {
+        ptr_Vector += threadblock_offset.column() + threadblock_tile_offset.m() * params.ldr;
+      }
+
+      // Execute the epilogue operator to update the destination tensor.
+      epilogue(output_op,
+               ptr_Vector,
+               iterator_D,
+               accumulators,
+               iterator_C1,
+               iterator_C2,
+               tensor_iterator,
+               params.problem_size.mn(),
+               threadblock_offset);
+
+      return;
+    }
+
+    //
+    // Slower path when split-K or batching is needed
+    //
+
+
+    #if SPLIT_K_ENABLED
+    // Construct the semaphore.
+    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
+
+    if (params.mode == GemmUniversalMode::kGemm) {
+
+      // If performing a reduction via split-K, fetch the initial synchronization
+      if (params.grid_tiled_shape.k() > 1) {
+
+        // Fetch the synchronization lock initially but do not block.
+        semaphore.fetch();
+
+        // Indicate which position in a serial reduction the output operator is currently updating
+        output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
+      }
+    }
+    else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
+    }
+    else if (params.mode == GemmUniversalMode::kBatched) {
+      ptr_C1 += threadblock_tile_offset.k() * params.batch_stride_C1;
+      if (ptr_C2) {
+        ptr_C2 += threadblock_tile_offset.k() * params.batch_stride_C2;
+      }
+      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
+      if (ptr_Tensor) {
+        ptr_Tensor = ReferenceFactory<typename Epilogue::ElementTensor>::add_pointer_offset(
+          ptr_Tensor,
+          threadblock_tile_offset.k() * params.batch_stride_Tensor);
+      }
+      if (ptr_Vector) {
+        ptr_Vector += threadblock_tile_offset.k() * params.batch_stride_Vector;
+      }
+    }
+    else if (params.mode == GemmUniversalMode::kArray) {
+      ptr_C1 = static_cast<ElementC * const *>(params.ptr_C1)[threadblock_tile_offset.k()];
+      if (ptr_C2) {
+        ptr_C2 = static_cast<ElementC * const *>(params.ptr_C2)[threadblock_tile_offset.k()];
+      }
+      ptr_D = static_cast<ElementC * const *>(params.ptr_D)[threadblock_tile_offset.k()];
+      if (ptr_Tensor) {
+        ptr_Tensor = static_cast<typename Epilogue::ElementTensor * const *>(params.ptr_Tensor)[threadblock_tile_offset.k()];
+      }
+      if (ptr_Vector) {
+        ptr_Vector = static_cast<typename Epilogue::ElementVector * const *>(params.ptr_Vector)[threadblock_tile_offset.k()];
+      }
+    }
+    #endif
+
+    // Tile iterators loading from source tensors.
+    typename Epilogue::OutputTileIterator iterator_C1(
+      params.params_C1,
+      ptr_C1,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset
+    );
+
+    typename Epilogue::OutputTileIterator iterator_C2(
+      params.params_C2,
+      ptr_C2,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset
+    );
+
+    // Tile iterator writing to destination tensor.
+    typename Epilogue::OutputTileIterator iterator_D(
+      params.params_D,
+      ptr_D,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset
+    );
+
+    // Additional tensor to load from
+    typename Epilogue::TensorTileIterator tensor_iterator(
+        params.params_Tensor,
+        // Only the final block outputs Tensor
+        ((params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) &&
+         (params.grid_tiled_shape.k() != threadblock_tile_offset.k() + 1))
+            ? nullptr
+            : ptr_Tensor,
+        params.problem_size.mn(),
+        thread_idx,
+        threadblock_offset);
+
+    // Construct the epilogue
+    Epilogue epilogue(
+      shared_storage.epilogue,
+      thread_idx,
+      warp_idx,
+      lane_idx);
+
+    #if SPLIT_K_ENABLED
+    // Wait on the semaphore - this latency may have been covered by iterator construction
+    if ((params.mode == GemmUniversalMode::kGemm) && params.grid_tiled_shape.k() > 1) {
+
+      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
+      if (threadblock_tile_offset.k()) {
+        iterator_C1 = iterator_D;
+      }
+
+      semaphore.wait(threadblock_tile_offset.k());
+
+    }
+    #endif
+
+    // Move to appropriate location for this output tile
+    if (ptr_Vector) {
+      ptr_Vector += threadblock_offset.column() + threadblock_tile_offset.m() * params.ldr;
+    }
+
+    // Execute the epilogue operator to update the destination tensor.
+    epilogue(output_op,
+             // Only the final block uses Vector
+             ((params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) &&
+              (params.grid_tiled_shape.k() != threadblock_tile_offset.k() + 1))
+                 ? nullptr
+                 : ptr_Vector,
+             iterator_D,
+             accumulators,
+             iterator_C1,
+             iterator_C2,
+             tensor_iterator,
+             params.problem_size.mn(),
+             threadblock_offset);
+
+    //
+    // Release the semaphore
+    //
+
+    #if SPLIT_K_ENABLED
+    if ((params.mode == GemmUniversalMode::kGemm)  && params.grid_tiled_shape.k() > 1) {
+
+      int lock = 0;
+      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
+
+        // The final threadblock resets the semaphore for subsequent grids.
+        lock = 0;
+      }
+      else {
+        // Otherwise, the semaphore is incremented
+        lock = threadblock_tile_offset.k() + 1;
+      }
+
+      semaphore.release(lock);
+    }
+    #endif
+  }
+};
+
+// GemmWithFusedEpilogue with one source
+template <
+  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate
+  typename Epilogue_,             ///! Epilogue
+  typename ThreadblockSwizzle_    ///! Threadblock swizzling function
+>
+struct GemmWithFusedEpilogue<Mma_, Epilogue_, ThreadblockSwizzle_, true> {
+public:
+
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using EpilogueOutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+
+  using ElementA = typename Mma::IteratorA::Element;
+  using LayoutA = typename Mma::IteratorA::Layout;
+  using ElementB = typename Mma::IteratorB::Element;
+  using LayoutB = typename Mma::IteratorB::Layout;
+  using ElementC = typename Epilogue::OutputTileIterator::Element;
+  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
+
+  static ComplexTransform const kTransformA = Mma::kTransformA;
+  static ComplexTransform const kTransformB = Mma::kTransformB;
+  using Operator = typename Mma::Operator;
+
+  using OperatorClass = typename Mma::Operator::OperatorClass;
+  using ThreadblockShape = typename Mma::Shape;
+  using WarpShape = typename Mma::Operator::Shape;
+  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
+  using ArchTag = typename Mma::ArchTag;
+
+  static int const kStages = Mma::kStages;
+  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
+  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
+  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;
+
+  /// Split-K preserves splits that are 128b aligned
+  static int const kSplitKAlignment = const_max(
+    128 / sizeof_bits<ElementA>::value,
+    128 / sizeof_bits<ElementB>::value
+  );
+
+  //
+  // Structures
+  //
+
+  /// Host-side argument pack for the single-source (C) fused-epilogue GEMM
+  struct Arguments : UniversalArgumentsBase
+  {
+    //
+    // Data members (all operand pointers are type-erased void pointers)
+    //
+
+    typename EpilogueOutputOp::Params epilogue;
+
+    void const * ptr_A;
+    void const * ptr_B;
+    void const * ptr_C;
+    void * ptr_D;
+
+    void * ptr_Vector;
+    void * ptr_Tensor;
+
+    int64_t batch_stride_A;
+    int64_t batch_stride_B;
+    int64_t batch_stride_C;
+    int64_t batch_stride_Vector;
+    int64_t batch_stride_Tensor;
+
+    typename LayoutA::Stride::Index lda;
+    typename LayoutB::Stride::Index ldb;
+    typename LayoutC::Stride::Index ldc;
+    typename LayoutC::Stride::Index ldd;
+    typename LayoutC::Stride::Index ldr;
+    typename LayoutC::Stride::Index ldt;
+
+    //
+    // Methods
+    //
+    /// Default constructor: all pointers null, all batch strides and leading dimensions zero
+    Arguments():
+      ptr_A(nullptr), ptr_B(nullptr), ptr_C(nullptr), ptr_D(nullptr),
+      ptr_Vector(nullptr), ptr_Tensor(nullptr),
+      batch_stride_A(0), batch_stride_B(0), batch_stride_C(0), batch_stride_Vector(0), batch_stride_Tensor(0),
+      lda(0), ldb(0), ldc(0), ldd(0), ldr(0), ldt(0)
+    {}
+
+    /// Constructs a fully specified argument pack; batch_stride_D is forwarded to the base class
+    Arguments(
+      GemmUniversalMode mode,
+      GemmCoord problem_size,
+      int batch_count,
+      typename EpilogueOutputOp::Params epilogue,
+      void const * ptr_A,
+      void const * ptr_B,
+      void const * ptr_C,
+      void * ptr_D,
+      void * ptr_Vector,
+      void * ptr_Tensor,
+      int64_t batch_stride_A,
+      int64_t batch_stride_B,
+      int64_t batch_stride_C,
+      int64_t batch_stride_D,
+      int64_t batch_stride_Vector,
+      int64_t batch_stride_Tensor,
+      typename LayoutA::Stride::Index lda,
+      typename LayoutB::Stride::Index ldb,
+      typename LayoutC::Stride::Index ldc,
+      typename LayoutC::Stride::Index ldd,
+      typename LayoutC::Stride::Index ldr,
+      typename LayoutC::Stride::Index ldt)
+    :
+      UniversalArgumentsBase(mode, problem_size, batch_count, batch_stride_D),
+      epilogue(epilogue),
+      ptr_A(ptr_A), ptr_B(ptr_B), ptr_C(ptr_C), ptr_D(ptr_D),
+      ptr_Vector(ptr_Vector),
+      ptr_Tensor(ptr_Tensor),
+      batch_stride_A(batch_stride_A),
+      batch_stride_B(batch_stride_B),
+      batch_stride_C(batch_stride_C),
+      batch_stride_Vector(batch_stride_Vector),
+      batch_stride_Tensor(batch_stride_Tensor),
+      lda(lda), ldb(ldb), ldc(ldc), ldd(ldd), ldr(ldr), ldt(ldt)
+    {
+      CUTLASS_TRACE_HOST("GemmWithFusedEpilogue::Arguments::Arguments() - problem_size: " << problem_size);
+      CUTLASS_TRACE_HOST("  ptr_Vector: " << (void *)this->ptr_Vector);
+      CUTLASS_TRACE_HOST("  ptr_Tensor: " << (void *)this->ptr_Tensor);
+      CUTLASS_TRACE_HOST("  ldr: " << this->ldr);
+      CUTLASS_TRACE_HOST("  ldt: " << this->ldt);
+    }
+
+    /// Returns a copy with M/N, the A/B pointers, lda/ldb and the A/B batch strides swapped
+    Arguments transposed_problem() const {
+      Arguments args(*this);
+
+      std::swap(args.problem_size.m(), args.problem_size.n());
+      std::swap(args.ptr_A, args.ptr_B);
+      std::swap(args.lda, args.ldb);
+      std::swap(args.batch_stride_A, args.batch_stride_B);
+
+      return args;
+    }
+  };
+
+
+  //
+  // Structure for precomputing values in host memory and passing to kernels
+  //
+
+  /// Kernel parameters: precomputed on the host from Arguments, then passed to the kernel
+  struct Params : UniversalParamsBase<
+    ThreadblockSwizzle,
+    ThreadblockShape,
+    ElementA,
+    ElementB,
+    ElementC,
+    LayoutA,
+    LayoutB>
+  {
+    using ParamsBase = UniversalParamsBase<
+      ThreadblockSwizzle,
+      ThreadblockShape,
+      ElementA,
+      ElementB,
+      ElementC,
+      LayoutA,
+      LayoutB>;
+
+    //
+    // Data members
+    //
+
+    typename Mma::IteratorA::Params params_A;                       // built from args.lda
+    typename Mma::IteratorB::Params params_B;                       // built from args.ldb
+    typename Epilogue::OutputTileIterator::Params params_C;         // built from args.ldc
+    typename Epilogue::OutputTileIterator::Params params_D;         // built from args.ldd
+    typename Epilogue::TensorTileIterator::Params params_Tensor;    // built from args.ldt
+
+    typename EpilogueOutputOp::Params output_op;                    // copy of args.epilogue
+
+    void * ptr_A;
+    void * ptr_B;
+    void * ptr_C;
+    void * ptr_D;
+
+    void * ptr_Vector;
+    typename LayoutC::Stride::Index ldr;    // multiplied by the tile's m offset to advance ptr_Vector in operator()
+
+    void * ptr_Tensor;
+
+    int64_t batch_stride_A;
+    int64_t batch_stride_B;
+    int64_t batch_stride_C;
+    int64_t batch_stride_Vector;
+    int64_t batch_stride_Tensor;
+
+    //
+    // Host dispatch API
+    //
+
+    /// Default constructor
+    Params() = default;
+
+    /// Constructor: builds the iterator params from the leading dimensions and copies pointers, ldr and batch strides
+    Params(
+      Arguments const &args,  /// GEMM application arguments
+      int device_sms,         /// Number of SMs on the device
+      int sm_occupancy)       /// Kernel SM occupancy (in thread blocks)
+    :
+      ParamsBase(args, device_sms, sm_occupancy),
+      params_A(args.lda),
+      params_B(args.ldb),
+      params_C(args.ldc),
+      params_D(args.ldd),
+      params_Tensor(args.ldt),
+      output_op(args.epilogue),
+      ptr_A(const_cast<void *>(args.ptr_A)),
+      ptr_B(const_cast<void *>(args.ptr_B)),
+      ptr_C(const_cast<void *>(args.ptr_C)),
+      ptr_D(args.ptr_D),
+      ptr_Vector(args.ptr_Vector),
+      ldr(args.ldr),
+      ptr_Tensor(args.ptr_Tensor),
+      batch_stride_A(args.batch_stride_A),
+      batch_stride_B(args.batch_stride_B),
+      batch_stride_C(args.batch_stride_C),
+      batch_stride_Vector(args.batch_stride_Vector),
+      batch_stride_Tensor(args.batch_stride_Tensor)
+    {
+      CUTLASS_TRACE_HOST("GemmWithFusedEpilogue::Params::Params()");
+      CUTLASS_TRACE_HOST("  ptr_Vector: " << (void *)this->ptr_Vector);
+      CUTLASS_TRACE_HOST("  ptr_Tensor: " << (void *)this->ptr_Tensor);
+      CUTLASS_TRACE_HOST("  ldr: " << this->ldr);
+      CUTLASS_TRACE_HOST("  ldt: " << args.ldt);  // ldt is not stored in Params; it only feeds params_Tensor
+    }
+
+    /// Lightweight update: refreshes pointers, ldr, batch strides and output_op; the params_* members are not rebuilt.
+    CUTLASS_HOST_DEVICE
+    void update(Arguments const &args)
+    {
+      ptr_A = const_cast<void *>(args.ptr_A);
+      ptr_B = const_cast<void *>(args.ptr_B);
+      ptr_C = const_cast<void *>(args.ptr_C);
+      ptr_D = args.ptr_D;
+
+      ptr_Vector = args.ptr_Vector;
+      ldr = args.ldr;
+      ptr_Tensor = args.ptr_Tensor;
+
+      batch_stride_A = args.batch_stride_A;
+      batch_stride_B = args.batch_stride_B;
+      batch_stride_C = args.batch_stride_C;
+      batch_stride_Vector = args.batch_stride_Vector;
+      batch_stride_Tensor = args.batch_stride_Tensor;
+      this->batch_stride_D = args.batch_stride_D;  // batch_stride_D is a member of the ParamsBase base class
+
+      output_op = args.epilogue;
+
+      CUTLASS_TRACE_HOST("GemmWithFusedEpilogue::Params::update()");
+      CUTLASS_TRACE_HOST("  ptr_Vector: " << (void *)this->ptr_Vector);
+      CUTLASS_TRACE_HOST("  ptr_Tensor: " << (void *)this->ptr_Tensor);
+      CUTLASS_TRACE_HOST("  ldr: " << this->ldr);
+    }
+  };
+
+
+  /// Shared memory storage: a union, so the main-loop and epilogue storage occupy the same bytes
+  union SharedStorage {
+    typename Mma::SharedStorage main_loop;
+    typename Epilogue::SharedStorage epilogue;
+  };
+
+public:
+
+  //
+  // Host dispatch API
+  //
+
+  /// Reports whether the problem extents are multiples of the A/B/C access widths
+  static Status can_implement(
+    cutlass::gemm::GemmCoord const & problem_size) {
+
+    CUTLASS_TRACE_HOST("GemmWithFusedEpilogue::can_implement()");
+
+    // Elements per access of each operand's iterator.
+    static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
+    static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
+    static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+
+    // Which extent each layout is tested against; layouts not listed are never reported as misaligned.
+    bool const a_tests_k =
+      platform::is_same<LayoutA, layout::RowMajor>::value ||
+      platform::is_same<LayoutA, layout::ColumnMajorInterleaved<32>>::value ||
+      platform::is_same<LayoutA, layout::ColumnMajorInterleaved<64>>::value;
+    bool const a_tests_m = platform::is_same<LayoutA, layout::ColumnMajor>::value;
+
+    bool const b_tests_n = platform::is_same<LayoutB, layout::RowMajor>::value;
+    bool const b_tests_k =
+      platform::is_same<LayoutB, layout::ColumnMajor>::value ||
+      platform::is_same<LayoutB, layout::RowMajorInterleaved<32>>::value ||
+      platform::is_same<LayoutB, layout::RowMajorInterleaved<64>>::value;
+
+    bool const c_tests_n =
+      platform::is_same<LayoutC, layout::RowMajor>::value ||
+      platform::is_same<LayoutC, layout::ColumnMajorInterleaved<32>>::value ||
+      platform::is_same<LayoutC, layout::ColumnMajorInterleaved<64>>::value;
+    bool const c_tests_m = platform::is_same<LayoutC, layout::ColumnMajor>::value;
+
+    bool const a_misaligned =
+      a_tests_k ? (problem_size.k() % kAlignmentA != 0)
+                : (a_tests_m && (problem_size.m() % kAlignmentA != 0));
+
+    bool const b_misaligned =
+      b_tests_n ? (problem_size.n() % kAlignmentB != 0)
+                : (b_tests_k && (problem_size.k() % kAlignmentB != 0));
+
+    bool const c_misaligned =
+      c_tests_n ? (problem_size.n() % kAlignmentC != 0)
+                : (c_tests_m && (problem_size.m() % kAlignmentC != 0));
+
+    // Operands are reported in A, B, C order; the first misaligned one decides the result.
+    if (a_misaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for A operand");
+      return Status::kErrorMisalignedOperand;
+    }
+    if (b_misaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for B operand");
+      return Status::kErrorMisalignedOperand;
+    }
+    if (c_misaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for C operand");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    CUTLASS_TRACE_HOST("  returning kSuccess");
+
+    return Status::kSuccess;
+  }
+
+  static Status can_implement(Arguments const &args) {  // Overload: only args.problem_size is examined
+    return can_implement(args.problem_size);
+  }
+
+public:
+
+  //
+  // Device-only API
+  //
+
+  // Static entry point: default-constructs the kernel functor and runs it once
+  CUTLASS_DEVICE
+  static void invoke(
+    Params const &params,
+    SharedStorage &shared_storage)
+  {
+    GemmWithFusedEpilogue kernel_instance;
+    kernel_instance.operator()(params, shared_storage);
+  }
+
+  #define SPLIT_K_ENABLED 1
+
+  /// Executes one GEMM tile: main loop over this CTA's k-range, then the fused epilogue (serial split-K aware)
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Compute threadblock location
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if CTA is out of range
+    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
+      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
+
+      return;
+    }
+
+    int offset_k = 0;
+    int problem_size_k = params.problem_size.k();
+
+    ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A);
+    ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);
+
+
+    #if SPLIT_K_ENABLED
+    //
+    // Per mode: narrow the k-range (kGemm / kGemmSplitKParallel) or pick this batch's A/B pointers (kBatched / kArray).
+    //
+    if (params.mode == GemmUniversalMode::kGemm ||
+      params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+
+      if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {
+
+        problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size;
+      }
+
+      offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
+    }
+    else if (params.mode == GemmUniversalMode::kBatched) {
+      ptr_A += threadblock_tile_offset.k() * params.batch_stride_A;
+      ptr_B += threadblock_tile_offset.k() * params.batch_stride_B;
+    }
+    else if (params.mode == GemmUniversalMode::kArray) {
+      ptr_A = static_cast<ElementA * const *>(params.ptr_A)[threadblock_tile_offset.k()];
+      ptr_B = static_cast<ElementB * const *>(params.ptr_B)[threadblock_tile_offset.k()];
+    }
+    #endif
+
+    // Compute initial location in logical coordinates
+    cutlass::MatrixCoord tb_offset_A{
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      offset_k,
+    };
+
+    cutlass::MatrixCoord tb_offset_B{
+      offset_k,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    };
+
+    // Compute position within threadblock
+    int thread_idx = threadIdx.x;
+
+    // Construct iterators to A and B operands
+    typename Mma::IteratorA iterator_A(
+      params.params_A,
+      ptr_A,
+      {params.problem_size.m(), problem_size_k},
+      thread_idx,
+      tb_offset_A);
+
+    typename Mma::IteratorB iterator_B(
+      params.params_B,
+      ptr_B,
+      {problem_size_k, params.problem_size.n()},
+      thread_idx,
+      tb_offset_B);
+
+    // Broadcast the warp_id computed by lane 0 to ensure dependent code
+    // is compiled as warp-uniform.
+    int warp_idx = canonical_warp_idx_sync();
+
+    int lane_idx = threadIdx.x % 32;
+
+    //
+    // Main loop
+    //
+
+    // Construct thread-scoped matrix multiply
+    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
+
+    typename Mma::FragmentC accumulators;
+
+    accumulators.clear();
+
+    // Main-loop iteration count: the k-range [offset_k, problem_size_k) divided by Mma::Shape::kK, rounded up
+    int gemm_k_iterations = (problem_size_k - offset_k + Mma::Shape::kK - 1) / Mma::Shape::kK;
+
+    // Compute threadblock-scoped matrix multiply-add
+    mma(
+      gemm_k_iterations,
+      accumulators,
+      iterator_A,
+      iterator_B,
+      accumulators);
+
+    //
+    // Epilogue
+    //
+
+    EpilogueOutputOp output_op(params.output_op);
+
+    //
+    // Masked tile iterators constructed from members
+    //
+
+    threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // NOTE: assumes identity swizzle
+    MatrixCoord threadblock_offset(
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    );
+
+    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();  // indexes params.semaphore below
+
+    ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C);
+    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
+    typename Epilogue::ElementTensor *ptr_Tensor = static_cast<typename Epilogue::ElementTensor *>(params.ptr_Tensor);
+
+    // Define the reduction output pointer and move to the appropriate place
+    typename Epilogue::ElementVector *ptr_Vector =
+      static_cast<typename Epilogue::ElementVector *>(params.ptr_Vector);
+
+    //
+    // Epilogue dispatch: fast path first, general path afterwards.
+    //
+
+    //
+    // Special path: kGemm mode with a single k-partition (no split-K, no batching).
+    //
+
+    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() == 1) {
+
+      // Tile iterators loading from source tensors.
+      typename Epilogue::OutputTileIterator iterator_C(
+        params.params_C,
+        ptr_C,
+        params.problem_size.mn(),
+        thread_idx,
+        threadblock_offset
+      );
+
+      // Tile iterator writing to destination tensor.
+      typename Epilogue::OutputTileIterator iterator_D(
+        params.params_D,
+        ptr_D,
+        params.problem_size.mn(),
+        thread_idx,
+        threadblock_offset
+      );
+
+      // Additional tensor to load from
+      typename Epilogue::TensorTileIterator tensor_iterator(
+          params.params_Tensor,
+          // Single k-partition: ptr_Tensor is passed through unconditionally
+          ptr_Tensor,
+          params.problem_size.mn(),
+          thread_idx,
+          threadblock_offset);
+
+      // Construct the epilogue
+      Epilogue epilogue(
+        shared_storage.epilogue,
+        thread_idx,
+        warp_idx,
+        lane_idx);
+
+      // Move to appropriate location for this output tile
+      if (ptr_Vector) {
+        ptr_Vector += threadblock_offset.column() + threadblock_tile_offset.m() * params.ldr;
+      }
+
+      // Execute the epilogue operator to update the destination tensor.
+      epilogue(output_op,
+               ptr_Vector,
+               iterator_D,
+               accumulators,
+               iterator_C,
+               tensor_iterator,
+               params.problem_size.mn(),
+               threadblock_offset);
+
+      return;
+    }
+
+    //
+    // Slower path when split-K or batching is needed
+    //
+
+
+    #if SPLIT_K_ENABLED
+    // Construct the semaphore.
+    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
+
+    if (params.mode == GemmUniversalMode::kGemm) {
+
+      // If performing a reduction via split-K, fetch the initial synchronization
+      if (params.grid_tiled_shape.k() > 1) {
+
+        // Fetch the synchronization lock initially but do not block.
+        semaphore.fetch();
+
+        // Indicate which position in a serial reduction the output operator is currently updating
+        output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
+      }
+    }
+    else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
+    }
+    else if (params.mode == GemmUniversalMode::kBatched) {
+      ptr_C += threadblock_tile_offset.k() * params.batch_stride_C;
+      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
+      if (ptr_Tensor) {
+        ptr_Tensor = ReferenceFactory<typename Epilogue::ElementTensor>::add_pointer_offset(
+          ptr_Tensor,
+          threadblock_tile_offset.k() * params.batch_stride_Tensor);
+      }
+      if (ptr_Vector) {
+        ptr_Vector += threadblock_tile_offset.k() * params.batch_stride_Vector;
+      }
+    }
+    else if (params.mode == GemmUniversalMode::kArray) {
+      ptr_C = static_cast<ElementC * const *>(params.ptr_C)[threadblock_tile_offset.k()];
+      ptr_D = static_cast<ElementC * const *>(params.ptr_D)[threadblock_tile_offset.k()];
+      if (ptr_Tensor) {
+        ptr_Tensor = static_cast<typename Epilogue::ElementTensor * const *>(params.ptr_Tensor)[threadblock_tile_offset.k()];
+      }
+      if (ptr_Vector) {
+        ptr_Vector = static_cast<typename Epilogue::ElementVector * const *>(params.ptr_Vector)[threadblock_tile_offset.k()];
+      }
+    }
+    #endif
+
+    // Tile iterators loading from source tensors.
+    typename Epilogue::OutputTileIterator iterator_C(
+      params.params_C,
+      ptr_C,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset
+    );
+
+    // Tile iterator writing to destination tensor.
+    typename Epilogue::OutputTileIterator iterator_D(
+      params.params_D,
+      ptr_D,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset
+    );
+
+    // Additional tensor to load from
+    typename Epilogue::TensorTileIterator tensor_iterator(
+        params.params_Tensor,
+        // With kGemm split-K, only the final k-partition receives ptr_Tensor; earlier ones get nullptr
+        ((params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) &&
+         (params.grid_tiled_shape.k() != threadblock_tile_offset.k() + 1))
+            ? nullptr
+            : ptr_Tensor,
+        params.problem_size.mn(),
+        thread_idx,
+        threadblock_offset);
+
+    // Construct the epilogue
+    Epilogue epilogue(
+      shared_storage.epilogue,
+      thread_idx,
+      warp_idx,
+      lane_idx);
+
+    #if SPLIT_K_ENABLED
+    // Wait on the semaphore - this latency may have been covered by iterator construction
+    if ((params.mode == GemmUniversalMode::kGemm) && params.grid_tiled_shape.k() > 1) {
+
+      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
+      if (threadblock_tile_offset.k()) {
+        iterator_C = iterator_D;
+      }
+
+      semaphore.wait(threadblock_tile_offset.k());
+
+    }
+    #endif
+
+    // Move to appropriate location for this output tile
+    if (ptr_Vector) {
+      ptr_Vector += threadblock_offset.column() + threadblock_tile_offset.m() * params.ldr;
+    }
+
+    // Execute the epilogue operator to update the destination tensor.
+    epilogue(output_op,
+             // With kGemm split-K, only the final k-partition receives ptr_Vector; earlier ones get nullptr
+             ((params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) &&
+              (params.grid_tiled_shape.k() != threadblock_tile_offset.k() + 1))
+                 ? nullptr
+                 : ptr_Vector,
+             iterator_D,
+             accumulators,
+             iterator_C,
+             tensor_iterator,
+             params.problem_size.mn(),
+             threadblock_offset);
+
+    //
+    // Release the semaphore
+    //
+
+    #if SPLIT_K_ENABLED
+    if ((params.mode == GemmUniversalMode::kGemm)  && params.grid_tiled_shape.k() > 1) {
+
+      int lock = 0;
+      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
+
+        // The final threadblock resets the semaphore for subsequent grids.
+        lock = 0;
+      }
+      else {
+        // Otherwise, the semaphore is incremented
+        lock = threadblock_tile_offset.k() + 1;
+      }
+
+      semaphore.release(lock);
+    }
+    #endif
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_with_k_reduction.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_with_k_reduction.h
new file mode 100755
index 000000000..49c4b0a1a
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_with_k_reduction.h
@@ -0,0 +1,704 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Universal GEMM kernel (GemmWithKReduction) that carries an additional GEMM-K reduction buffer.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/complex.h"
+#include "cutlass/semaphore.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/gemm/kernel/params_universal_base.h"
+
+#include "cutlass/trace.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate 
+  typename Epilogue_,             ///! Epilogue
+  typename EpilogueGemmKReduction_,             ///! Epilogue type for the GEMM-K reduction (exposed as EpilogueGemmKReduction)
+  typename ThreadblockSwizzle_    ///! Threadblock swizzling function
+>
+struct GemmWithKReduction {
+public:
+
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using EpilogueOutputOp = typename Epilogue::OutputOp;
+  using EpilogueGemmKReduction = EpilogueGemmKReduction_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+
+  using ElementA = typename Mma::IteratorA::Element;
+  using LayoutA = typename Mma::IteratorA::Layout;
+  using ElementB = typename Mma::IteratorB::Element;
+  using LayoutB = typename Mma::IteratorB::Layout;
+  using ElementC = typename Epilogue::OutputTileIterator::Element;
+  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
+  using LayoutGemmKReduction = cutlass::layout::PitchLinear;  // layout whose Stride::Index types ld_gemm_k_reduction
+
+  static ComplexTransform const kTransformA = Mma::kTransformA;
+  static ComplexTransform const kTransformB = Mma::kTransformB;
+  using Operator = typename Mma::Operator;
+
+  using OperatorClass = typename Mma::Operator::OperatorClass;
+  using ThreadblockShape = typename Mma::Shape;
+  using WarpShape = typename Mma::Operator::Shape;
+  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
+  using ArchTag = typename Mma::ArchTag;
+
+  static int const kStages = Mma::kStages;
+  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
+  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
+  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+
+  /// Warp count (concept: GemmShape); kThreadCount is 32 threads for each of those warps
+  using WarpCount = typename Mma::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;
+
+  /// Split-K preserves splits that are 128b aligned (the larger of 128 / bits(ElementA) and 128 / bits(ElementB))
+  static int const kSplitKAlignment = const_max(128 / sizeof_bits<ElementA>::value, 128 / sizeof_bits<ElementB>::value);
+
+  static int const kReduceKForA = Mma::kReduceKForA;  // forwarded from Mma; NOTE(review): presumably picks A vs. B for the K reduction - confirm
+
+  //
+  // Structures
+  //
+
+  /// Host-side arguments for GemmWithKReduction; Params is constructed from this structure
+  struct Arguments : UniversalArgumentsBase
+  {
+    //
+    // Data members
+    //
+
+    typename EpilogueOutputOp::Params epilogue;
+
+    void const * ptr_A;
+    void const * ptr_B;
+    void const * ptr_C;
+    void * ptr_D;
+    void * ptr_gemm_k_reduction;   // NOTE(review): presumably the K-reduction output buffer - confirm
+
+    int64_t batch_stride_A;
+    int64_t batch_stride_B;
+    int64_t batch_stride_C;
+    int64_t batch_stride_gemm_k_reduction;
+
+    typename LayoutA::Stride::Index lda;
+    typename LayoutB::Stride::Index ldb;
+    typename LayoutC::Stride::Index ldc;
+    typename LayoutC::Stride::Index ldd;
+    typename LayoutGemmKReduction::Stride::Index ld_gemm_k_reduction;
+
+    //
+    // Methods
+    //
+    /// Default constructor: null pointers, zero strides and leading dimensions (so copies never read indeterminate values)
+    Arguments() :
+      ptr_A(nullptr), ptr_B(nullptr), ptr_C(nullptr), ptr_D(nullptr),
+      ptr_gemm_k_reduction(nullptr),
+      batch_stride_A(0), batch_stride_B(0), batch_stride_C(0),
+      batch_stride_gemm_k_reduction(0),
+      lda(0), ldb(0), ldc(0), ldd(0), ld_gemm_k_reduction(0)
+    {}
+
+    /// Constructs an arguments structure; mode, problem_size, batch_count and batch_stride_D go to the base class
+    Arguments(
+      GemmUniversalMode mode,
+      GemmCoord problem_size,
+      int batch_count,
+      typename EpilogueOutputOp::Params epilogue,
+      void const * ptr_A,
+      void const * ptr_B,
+      void const * ptr_C,
+      void * ptr_D,
+      void * ptr_gemm_k_reduction,
+      int64_t batch_stride_A,
+      int64_t batch_stride_B,
+      int64_t batch_stride_C,
+      int64_t batch_stride_D,
+      int64_t batch_stride_gemm_k_reduction,
+      typename LayoutA::Stride::Index lda,
+      typename LayoutB::Stride::Index ldb,
+      typename LayoutC::Stride::Index ldc,
+      typename LayoutC::Stride::Index ldd,
+      typename LayoutGemmKReduction::Stride::Index ld_gemm_k_reduction)
+    :
+      UniversalArgumentsBase(mode, problem_size, batch_count, batch_stride_D),
+      epilogue(epilogue),
+      ptr_A(ptr_A), ptr_B(ptr_B), ptr_C(ptr_C), ptr_D(ptr_D), ptr_gemm_k_reduction(ptr_gemm_k_reduction),
+      batch_stride_A(batch_stride_A), batch_stride_B(batch_stride_B), batch_stride_C(batch_stride_C), batch_stride_gemm_k_reduction(batch_stride_gemm_k_reduction),
+      lda(lda), ldb(ldb), ldc(ldc), ldd(ldd), ld_gemm_k_reduction(ld_gemm_k_reduction)
+    {
+      CUTLASS_TRACE_HOST("GemmWithKReduction::Arguments::Arguments() - problem_size: " << problem_size);
+    }
+
+    /// Returns arguments for the transposed problem (M/N and the A/B fields swapped; reduction fields untouched)
+    Arguments transposed_problem() const {
+      Arguments args(*this);
+
+      std::swap(args.problem_size.m(), args.problem_size.n());
+      std::swap(args.ptr_A, args.ptr_B);
+      std::swap(args.lda, args.ldb);
+      std::swap(args.batch_stride_A, args.batch_stride_B);
+
+      return args;
+    }
+  };
+
+
+  //
+  // Structure for precomputing values in host memory and passing to kernels
+  //
+
+  /// Kernel parameters for GemmWithKReduction: built on the host from Arguments, then passed to the kernel
+  struct Params : UniversalParamsBase<
+    ThreadblockSwizzle,
+    ThreadblockShape,
+    ElementA,
+    ElementB,
+    ElementC,
+    LayoutA,
+    LayoutB>
+  {
+    using ParamsBase = UniversalParamsBase<
+      ThreadblockSwizzle,
+      ThreadblockShape,
+      ElementA,
+      ElementB,
+      ElementC,
+      LayoutA,
+      LayoutB>;
+
+    //
+    // Data members
+    //
+
+    typename Mma::IteratorA::Params params_A;                  // built from args.lda
+    typename Mma::IteratorB::Params params_B;                  // built from args.ldb
+    typename Epilogue::OutputTileIterator::Params params_C;    // built from args.ldc
+    typename Epilogue::OutputTileIterator::Params params_D;    // built from args.ldd
+
+    typename EpilogueOutputOp::Params output_op;               // copy of args.epilogue
+
+    void * ptr_A;
+    void * ptr_B;
+    void * ptr_C;
+    void * ptr_D;                   // redirected into the workspace by init_workspace() in kGemmSplitKParallel mode
+    void * ptr_gemm_k_reduction;    // likewise redirected, to just past the D partials
+
+    int64_t batch_stride_A;
+    int64_t batch_stride_B;
+    int64_t batch_stride_C;
+    int64_t batch_stride_gemm_k_reduction;
+
+    //
+    // Host dispatch API
+    //
+
+    /// Default constructor
+    Params() = default;
+
+    /// Constructor; the initializer list follows member declaration order (avoids -Wreorder)
+    Params(
+      Arguments const &args,  /// GEMM application arguments
+      int device_sms,         /// Number of SMs on the device
+      int sm_occupancy)       /// Kernel SM occupancy (in thread blocks)
+    :
+      ParamsBase(args, device_sms, sm_occupancy),
+      params_A(args.lda),
+      params_B(args.ldb),
+      params_C(args.ldc),
+      params_D(args.ldd),
+      output_op(args.epilogue),
+      ptr_A(const_cast<void *>(args.ptr_A)),
+      ptr_B(const_cast<void *>(args.ptr_B)),
+      ptr_C(const_cast<void *>(args.ptr_C)),
+      ptr_D(args.ptr_D),
+      ptr_gemm_k_reduction(args.ptr_gemm_k_reduction),
+      batch_stride_A(args.batch_stride_A),
+      batch_stride_B(args.batch_stride_B),
+      batch_stride_C(args.batch_stride_C),
+      batch_stride_gemm_k_reduction(args.batch_stride_gemm_k_reduction)
+    {}
+
+    /// Assign and initialize the specified workspace buffer.  Assumes
+    /// the memory allocated to workspace is at least as large as get_workspace_size().
+    Status init_workspace(
+      void *workspace,
+      cudaStream_t stream = nullptr)
+    {
+      CUTLASS_TRACE_HOST("GemmWithKReduction::Params::init_workspace() - problem_size: " << this->problem_size);
+
+      if (this->mode == GemmUniversalMode::kGemmSplitKParallel) {
+        ptr_D = workspace;
+        ptr_gemm_k_reduction = static_cast<uint8_t *>(workspace)
+                 + sizeof(ElementC) * size_t(this->batch_stride_D) * size_t(this->grid_tiled_shape.k());
+
+        return Status::kSuccess;
+      }
+      // Every other mode defers to the base-class workspace setup.
+      return ParamsBase::init_workspace(workspace, stream);
+    }
+
+    /// Returns the workspace size (in bytes) needed for this problem geometry
+    size_t get_workspace_size() const
+    {
+      size_t workspace_bytes = ParamsBase::get_workspace_size();
+
+      if (this->mode == GemmUniversalMode::kGemmSplitKParallel)
+      {
+        // Extra room after the base-class bytes: one GEMM-K reduction slice per k-partition
+        workspace_bytes +=
+          sizeof(ElementC) *
+          size_t(batch_stride_gemm_k_reduction) *
+          size_t(this->grid_tiled_shape.k());
+      }
+
+      return workspace_bytes;
+    }
+
+    /// Lightweight update: refreshes pointers, batch strides and output_op; the params_* members are not rebuilt.
+    void update(Arguments const &args)
+    {
+      ptr_A = const_cast<void *>(args.ptr_A);
+      ptr_B = const_cast<void *>(args.ptr_B);
+      ptr_C = const_cast<void *>(args.ptr_C);
+      ptr_D = args.ptr_D;
+      ptr_gemm_k_reduction = args.ptr_gemm_k_reduction;
+
+      batch_stride_A = args.batch_stride_A;
+      batch_stride_B = args.batch_stride_B;
+      batch_stride_C = args.batch_stride_C;
+      batch_stride_gemm_k_reduction = args.batch_stride_gemm_k_reduction;
+      this->batch_stride_D = args.batch_stride_D;
+
+      output_op = args.epilogue;
+
+      CUTLASS_TRACE_HOST("GemmWithKReduction::Params::update()");
+    }
+  };
+
+  /// Shared memory storage: a union, so the main-loop and epilogue storage occupy the same bytes
+  union SharedStorage {
+    typename Mma::SharedStorage main_loop;
+    typename Epilogue::SharedStorage epilogue;
+  };
+
+
+public:
+
+  //
+  // Host dispatch API
+  //
+
+  /// Determines whether the problem extents satisfy the A/B/C alignment requirements of this kernel
+  static Status can_implement(
+    cutlass::gemm::GemmCoord const & problem_size) {
+
+    CUTLASS_TRACE_HOST("GemmWithKReduction::can_implement()");
+    // Function-local alignments shadow the class-level kAlignment* constants: interleaved layouts use 32 / 64.
+    static int const kAlignmentA = (platform::is_same<typename Mma::IteratorA::Layout,
+                                                      layout::ColumnMajorInterleaved<32>>::value)
+                                   ? 32
+                                   : (platform::is_same<typename Mma::IteratorA::Layout,
+                                                        layout::ColumnMajorInterleaved<64>>::value)
+                                     ? 64
+                                     : Mma::IteratorA::AccessType::kElements;
+    static int const kAlignmentB = (platform::is_same<typename Mma::IteratorB::Layout,
+                                                       layout::RowMajorInterleaved<32>>::value)
+                                   ? 32
+                                   : (platform::is_same<typename Mma::IteratorB::Layout,
+                                                        layout::RowMajorInterleaved<64>>::value)
+                                     ? 64
+                                     : Mma::IteratorB::AccessType::kElements;
+    static int const kAlignmentC =  (platform::is_same<LayoutC,
+                                                      layout::ColumnMajorInterleaved<32>>::value)
+                                   ? 32
+                                   : (platform::is_same<LayoutC,
+                                                        layout::ColumnMajorInterleaved<64>>::value)
+                                     ? 64
+                                     : Epilogue::OutputTileIterator::kElementsPerAccess;
+
+    bool isAMisaligned = false;
+    bool isBMisaligned = false;
+    bool isCMisaligned = false;
+    // A: row-major and interleaved layouts test k, column-major tests m; other layouts are never flagged.
+    if (platform::is_same<LayoutA, layout::RowMajor>::value) {
+      isAMisaligned = problem_size.k() % kAlignmentA;
+    } else if (platform::is_same<LayoutA, layout::ColumnMajor>::value) {
+      isAMisaligned = problem_size.m() % kAlignmentA;
+    } else if (platform::is_same<LayoutA, layout::ColumnMajorInterleaved<32>>::value
+            || platform::is_same<LayoutA, layout::ColumnMajorInterleaved<64>>::value) {
+      isAMisaligned = problem_size.k() % kAlignmentA;
+    }
+    // B: row-major tests n, column-major and interleaved layouts test k.
+    if (platform::is_same<LayoutB, layout::RowMajor>::value) {
+      isBMisaligned = problem_size.n() % kAlignmentB;
+    } else if (platform::is_same<LayoutB, layout::ColumnMajor>::value) {
+      isBMisaligned = problem_size.k() % kAlignmentB;
+    } else if (platform::is_same<LayoutB, layout::RowMajorInterleaved<32>>::value
+            || platform::is_same<LayoutB, layout::RowMajorInterleaved<64>>::value) {
+      isBMisaligned = problem_size.k() % kAlignmentB;
+    }
+    // C: row-major and interleaved layouts test n, column-major tests m.
+    if (platform::is_same<LayoutC, layout::RowMajor>::value) {
+      isCMisaligned = problem_size.n() % kAlignmentC;
+    } else if (platform::is_same<LayoutC, layout::ColumnMajor>::value) {
+      isCMisaligned = problem_size.m() % kAlignmentC;
+    } else if (platform::is_same<LayoutC, layout::ColumnMajorInterleaved<32>>::value
+            || platform::is_same<LayoutC, layout::ColumnMajorInterleaved<64>>::value) {
+      isCMisaligned = problem_size.n() % kAlignmentC;
+    }
+    // Report the first misaligned operand, in A, B, C order.
+    if (isAMisaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for operand A");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (isBMisaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for operand B");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (isCMisaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for operand C");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    CUTLASS_TRACE_HOST("  returning kSuccess");
+
+    return Status::kSuccess;
+  }
+
+
+  static Status can_implement(Arguments const &args) {  // Arguments overload of the alignment check
+    return can_implement(args.problem_size);  // only the problem size is inspected; pointers and strides are not examined
+  }
+
+
+public:
+
+  //
+  // Device-only API
+  //
+
+  /// Factory invocation.
+  ///
+  /// Device-side entry point: default-constructs the kernel functor and runs
+  /// it once with the supplied parameters and SharedStorage object.
+  CUTLASS_DEVICE
+  static void invoke(Params const &params, SharedStorage &shared_storage) {
+    GemmWithKReduction kernel;
+    kernel(params, shared_storage);
+  }
+
+
+  /// Executes one threadblock's share of the GEMM: mainloop, output epilogue, then the K-reduction epilogue.
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Compute threadblock location
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if CTA is out of range
+    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
+      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
+
+      return;
+    }
+
+    int offset_k = 0;
+    int problem_size_k = params.problem_size.k();
+
+    ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A); 
+    ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);
+
+    //
+    // Fetch pointers based on mode: kGemm / kGemmSplitKParallel pick a K slice, kBatched / kArray pick a batch entry.
+    //
+    if (params.mode == GemmUniversalMode::kGemm || 
+      params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+
+      if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {
+        // Not the last K partition: clamp the K extent to the end of this partition's slice.
+        problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size; 
+      }
+
+      offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
+    }
+    else if (params.mode == GemmUniversalMode::kBatched) {
+      ptr_A += threadblock_tile_offset.k() * params.batch_stride_A;
+      ptr_B += threadblock_tile_offset.k() * params.batch_stride_B;
+    }
+    else if (params.mode == GemmUniversalMode::kArray) {
+      ptr_A = static_cast<ElementA * const *>(params.ptr_A)[threadblock_tile_offset.k()];
+      ptr_B = static_cast<ElementB * const *>(params.ptr_B)[threadblock_tile_offset.k()];
+    }
+    // NOTE(review): nothing above this barrier touches shared_storage; its purpose is not evident from this function.
+    __syncthreads();
+
+    // Compute initial location in logical coordinates
+    cutlass::MatrixCoord tb_offset_A{
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      offset_k,
+    };
+
+    cutlass::MatrixCoord tb_offset_B{
+      offset_k,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    };
+
+
+    // Compute position within threadblock
+    int thread_idx = threadIdx.x;
+
+    // Construct iterators to A and B operands
+    typename Mma::IteratorA iterator_A(
+      params.params_A,
+      ptr_A,
+      {params.problem_size.m(), problem_size_k},
+      thread_idx,
+      tb_offset_A);
+
+    typename Mma::IteratorB iterator_B(
+      params.params_B,
+      ptr_B,
+      {problem_size_k, params.problem_size.n()},
+      thread_idx,
+      tb_offset_B);
+
+    // Broadcast the warp_id computed by lane 0 to ensure dependent code
+    // is compiled as warp-uniform.
+    int warp_idx = canonical_warp_idx_sync();
+
+    int lane_idx = threadIdx.x % 32;
+
+    //
+    // Main loop
+    //
+
+    // Construct thread-scoped matrix multiply
+    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
+
+    typename Mma::FragmentC accumulators;
+
+    accumulators.clear();
+
+    typename Mma::FragmentReduction gemm_k_accumulators;
+
+    gemm_k_accumulators.clear();
+
+    // Number of mainloop iterations covering [offset_k, problem_size_k): ceiling division by the tile K extent
+    int gemm_k_iterations = (problem_size_k - offset_k + Mma::Shape::kK - 1) / Mma::Shape::kK;
+
+    // Compute threadblock-scoped matrix multiply-add
+    mma(
+      gemm_k_iterations, 
+      accumulators, 
+      iterator_A, 
+      iterator_B, 
+      accumulators,
+      gemm_k_accumulators);
+
+    //
+    // Epilogue
+    //
+
+    EpilogueOutputOp output_op(params.output_op);
+
+    //
+    // Masked tile iterators constructed from members
+    //
+
+    threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // assume identity swizzle
+    MatrixCoord threadblock_offset(
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    );
+    // Linear (m, n) tile index; used below to select this output tile's semaphore slot.
+    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
+
+    ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C); 
+    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
+    ElementC *ptr_gemm_k_reduction = static_cast<ElementC *>(params.ptr_gemm_k_reduction);
+
+    //
+    // Fetch pointers based on mode.
+    //
+    
+    // Construct the semaphore.
+    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
+
+    if (params.mode == GemmUniversalMode::kGemm) {
+
+      // If performing a reduction via split-K, fetch the initial synchronization
+      if (params.grid_tiled_shape.k() > 1) {
+        
+        // Fetch the synchronization lock initially but do not block.
+        semaphore.fetch();
+
+        // Indicate which position in a serial reduction the output operator is currently updating
+        output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
+      }
+    }
+    else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
+      ptr_gemm_k_reduction += threadblock_tile_offset.k() * params.batch_stride_gemm_k_reduction;
+    }
+    else if (params.mode == GemmUniversalMode::kBatched) {
+      ptr_C += threadblock_tile_offset.k() * params.batch_stride_C;
+      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
+    }
+    else if (params.mode == GemmUniversalMode::kArray) {
+      ptr_C = static_cast<ElementC * const *>(params.ptr_C)[threadblock_tile_offset.k()];
+      ptr_D = static_cast<ElementC * const *>(params.ptr_D)[threadblock_tile_offset.k()];
+    }
+
+    // Tile iterator loading from source tensor.
+    typename Epilogue::OutputTileIterator iterator_C(
+      params.params_C,
+      ptr_C,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset
+    );
+
+    // Tile iterator writing to destination tensor.
+    typename Epilogue::OutputTileIterator iterator_D(
+      params.params_D,
+      ptr_D,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset
+    );
+
+    Epilogue epilogue(
+      shared_storage.epilogue, 
+      thread_idx, 
+      warp_idx, 
+      lane_idx);
+
+    // Wait on the semaphore - this latency may have been covered by iterator construction
+    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
+        
+      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
+      if (threadblock_tile_offset.k()) {
+        iterator_C = iterator_D;
+      }
+
+      semaphore.wait(threadblock_tile_offset.k());
+
+    }
+
+    // Execute the epilogue operator to update the destination tensor.
+    epilogue(
+      output_op, 
+      iterator_D, 
+      accumulators, 
+      iterator_C); 
+    // K-reduction epilogue: only tiles with n == 0 (kReduceKForA) or m == 0 (otherwise) take part.
+    if ((kReduceKForA && threadblock_tile_offset.n() == 0)
+     || (!kReduceKForA && threadblock_tile_offset.m() == 0)) {
+
+      int warp_idx_mn = warp_idx % (Mma::Base::WarpCount::kM * Mma::Base::WarpCount::kN);
+      int warp_idx_m = warp_idx_mn % Mma::Base::WarpCount::kM;
+      int warp_idx_n = warp_idx_mn / Mma::Base::WarpCount::kM;
+      // Within such a tile, only warps with warp_idx_n == 0 (kReduceKForA) or warp_idx_m == 0 (otherwise) take part.
+     if ((kReduceKForA && warp_idx_n == 0)
+      || (!kReduceKForA && warp_idx_m == 0)) {
+
+        int reduction_warp_idx = kReduceKForA ? warp_idx_m : warp_idx_n;
+        int reduction_threadblock_offset = kReduceKForA ? threadblock_tile_offset.m() :
+                                                          threadblock_tile_offset.n();
+        int reduction_vector_size = kReduceKForA ? params.problem_size.m()
+                                                 : params.problem_size.n();
+        EpilogueGemmKReduction epilogue_gemm_k_reduction(thread_idx,
+                                                         reduction_warp_idx,
+                                                         lane_idx,
+                                                         reduction_threadblock_offset,
+                                                         ptr_gemm_k_reduction);
+        epilogue_gemm_k_reduction(
+          reduction_vector_size,
+          gemm_k_accumulators,
+          params.mode == GemmUniversalMode::kGemm
+            && (params.grid_tiled_shape.k() > 1)
+            && (threadblock_tile_offset.k() > 0));
+      }
+    }
+   
+    //
+    // Release the semaphore
+    //
+
+    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) { 
+
+      int lock = 0;
+      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
+
+        // The final threadblock resets the semaphore for subsequent grids.
+        lock = 0;
+      }
+      else {
+        // Otherwise, the semaphore is incremented
+        lock = threadblock_tile_offset.k() + 1;
+      }
+      
+      semaphore.release(lock);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemv.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemv.h
new file mode 100755
index 000000000..9ec55e13c
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemv.h
@@ -0,0 +1,638 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Device-side GEMV (matrix-vector product) kernels, specialized for column-major and row-major A.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/complex.h"
+#include "cutlass/tensor_ref.h"
+
+#include "cutlass/arch/memory.h"
+#include "cutlass/arch/cache_operation.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/layout/matrix.h"
+
+#include "cutlass/numeric_conversion.h"
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename ElementA_,
+  typename LayoutA_,
+  typename ElementB_,
+  typename ElementC_,
+  typename ElementAccumulator_,
+  typename EpilogueOutputOp_,
+  int kElementsPerAccess_ = 1,            ///< Number of elements involved in a global access.
+  int kThreadCount_ = 0,                  ///< Number of threads in the thread block.
+                                          ///  It will be calculated automatically if set to 0.
+  int kThreadsPerRow_ = 0                 ///< Number of threads in the k dimension.
+                                          ///  It will be calculated automatically if set to 0.
+>
+struct Gemv;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Specializations
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GEMV for column-major A matrix
+template <
+  typename ElementA_,
+  typename ElementB_,
+  typename ElementC_,
+  typename ElementAccumulator_,
+  typename EpilogueOutputOp_,
+  int kElementsPerAccess_,
+  int kThreadCount_,
+  int kThreadsPerRow_
+>
+struct Gemv <
+  ElementA_,
+  layout::ColumnMajor,
+  ElementB_,
+  ElementC_,
+  ElementAccumulator_,
+  EpilogueOutputOp_,
+  kElementsPerAccess_,
+  kThreadCount_,
+  kThreadsPerRow_
+>{
+public:
+
+  using ElementA = ElementA_;
+  using LayoutA = layout::ColumnMajor;
+  using TensorRefA = TensorRef<ElementA, LayoutA>;
+
+  using ElementB = ElementB_;
+  using ElementC = ElementC_;
+
+  using ElementAccumulator = ElementAccumulator_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+
+  static ComplexTransform const kTransformA = ComplexTransform::kNone;
+  static ComplexTransform const kTransformB = ComplexTransform::kNone;
+
+  // thread block shape (kThreadCount, 1, 1)
+  static int const kThreadCount = (kThreadCount_ <= 0) ? 32 : kThreadCount_;
+  static int const kThreadsPerRow = (kThreadsPerRow_ <= 0) ? 1 : kThreadsPerRow_;
+
+  static int const kStages = 1;
+
+  static int const kAlignmentA = 1;
+  static int const kAlignmentB = 1;
+  static int const kAlignmentC = 1;
+
+  //
+  // Structures
+  //
+
+  /// Argument structure
+  struct Arguments {
+    MatrixCoord     problem_size;
+    int32_t         batch_count;
+    typename EpilogueOutputOp::Params output_op;
+
+    TensorRefA      ref_A;
+
+    ElementB const *ptr_B;
+    ElementC const *ptr_C;
+    ElementC       *ptr_D;
+
+    int64_t         inc_B;
+    int64_t         inc_C;
+    int64_t         inc_D;
+
+    int64_t         batch_stride_A;
+    int64_t         batch_stride_B;
+    int64_t         batch_stride_C;
+    int64_t         batch_stride_D;
+
+    //
+    // Methods
+    //
+
+    Arguments(): batch_count(0) { }  // only batch_count is set; no other member has an initializer here
+
+    Arguments(  // fully-specified form; the two shorter overloads below delegate to it
+      MatrixCoord problem_size,
+      int batch_count,
+      typename EpilogueOutputOp::Params output_op,
+      TensorRefA  ref_A,
+      void const *ptr_B,
+      void const *ptr_C,
+      void       *ptr_D,
+      int64_t     inc_B,  // the kernel advances ptr_B by inc_B per k step
+      int64_t     inc_C,  // the kernel reads C at offset i * inc_C
+      int64_t     inc_D,  // the kernel writes D at offset i * inc_D
+      int64_t     batch_stride_A,  // batch strides are multiplied by the batch index in the kernel
+      int64_t     batch_stride_B,
+      int64_t     batch_stride_C,
+      int64_t     batch_stride_D
+    ): 
+      problem_size(problem_size),
+      batch_count(batch_count),
+      output_op(output_op),
+      ref_A(ref_A),
+      ptr_B(static_cast<ElementB const *>(ptr_B)),  // untyped pointers are cast to their element types here
+      ptr_C(static_cast<ElementC const *>(ptr_C)),
+      ptr_D(static_cast<ElementC       *>(ptr_D)),
+      inc_B(inc_B),
+      inc_C(inc_C),
+      inc_D(inc_D),
+      batch_stride_A(batch_stride_A),
+      batch_stride_B(batch_stride_B),
+      batch_stride_C(batch_stride_C),
+      batch_stride_D(batch_stride_D)
+    { }
+
+    Arguments(  // batched form without increments: delegates with inc_B = inc_C = inc_D = 1
+      MatrixCoord problem_size,
+      int batch_count,
+      typename EpilogueOutputOp::Params output_op,
+      TensorRefA  ref_A,
+      void const *ptr_B,
+      void const *ptr_C,
+      void       *ptr_D,
+      int64_t     batch_stride_A,
+      int64_t     batch_stride_B,
+      int64_t     batch_stride_C,
+      int64_t     batch_stride_D
+    ): 
+      Arguments(
+        problem_size, 
+        batch_count, 
+        output_op, 
+        ref_A, 
+        ptr_B, 
+        ptr_C, 
+        ptr_D,
+        1,  // inc_B
+        1,  // inc_C
+        1,  // inc_D
+        batch_stride_A,
+        batch_stride_B,
+        batch_stride_C,
+        batch_stride_D)
+    { }
+
+    Arguments(  // single-problem form: delegates with batch_count = 1 and all batch strides = 1
+      MatrixCoord problem_size,
+      typename EpilogueOutputOp::Params output_op,
+      TensorRefA  ref_A,
+      void const *ptr_B,
+      void const *ptr_C,
+      void       *ptr_D,
+      int64_t     inc_B,
+      int64_t     inc_C,
+      int64_t     inc_D
+    ): 
+      Arguments(
+        problem_size, 
+        1,  // batch_count
+        output_op, 
+        ref_A, 
+        ptr_B, 
+        ptr_C, 
+        ptr_D,
+        inc_B, 
+        inc_C, 
+        inc_D, 
+        1,  // batch_stride_A
+        1,  // batch_stride_B
+        1,  // batch_stride_C
+        1)  // batch_stride_D
+    { }
+
+    /// Refreshes the epilogue params, the A reference and the B/C/D pointers from `args`; always returns kSuccess.
+    Status update(Arguments const &args) {
+      output_op = args.output_op;
+      ref_A = args.ref_A;  // was `ref_A = ref_A`, a self-assignment that kept the stale A reference
+      ptr_B = args.ptr_B;
+      ptr_C = args.ptr_C;
+      ptr_D = args.ptr_D;
+      return Status::kSuccess;  // problem_size, batch_count, increments and batch strides are not refreshed by this method
+    }
+  };
+
+  using Params = Arguments;
+
+  /// Shared memory storage structure
+  union SharedStorage {
+    // Empty: operator() in this specialization never reads or writes its shared_storage argument.
+  };
+
+public:
+
+  //
+  // Methods
+  //
+
+  CUTLASS_DEVICE
+  Gemv() { }  // nothing to initialise: this specialization declares no non-static data members
+
+  /// Determines whether kernel satisfies alignment; this specialization accepts every problem size.
+  static Status can_implement(cutlass::MatrixCoord const & problem_size) {
+    return Status::kSuccess;  // problem_size is not inspected (kAlignmentA/B/C are all 1)
+  }
+
+  static Status can_implement(Arguments const &args) {  // Arguments overload
+    return can_implement(args.problem_size);  // forwards only the problem size
+  }
+ 
+  /// Executes one GEMV: each thread accumulates the inner product for output element i, then applies the epilogue.
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Loop over batch indices, starting at blockIdx.z and stepping by gridDim.z
+    for (int batch_idx = blockIdx.z; batch_idx < params.batch_count; batch_idx += gridDim.z) {
+      // Row of A (and element of C/D) handled by this thread; it may be >= row(), which is guarded below.
+      int i = blockIdx.x * kThreadCount + threadIdx.x;
+
+      ElementA const *ptr_A = params.ref_A.data() + i;
+      ElementB const *ptr_B = params.ptr_B;
+      // Advance both operands to the current batch entry.
+      ptr_A += batch_idx * params.batch_stride_A;
+      ptr_B += batch_idx * params.batch_stride_B;
+
+      ElementAccumulator accum = ElementAccumulator();
+
+      // Compute inner product
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (int k = 0; k < params.problem_size.column(); ++k) {
+
+        // Fetch from A (a value-initialized ElementA is used when i is out of range)
+        ElementA a = ElementA();
+        if (i < params.problem_size.row()) {
+          a = *ptr_A;
+        }
+        ptr_A += params.ref_A.stride(0);
+
+        // Fetch from B (read by every thread, including those with i out of range)
+        ElementB b = *ptr_B;
+        ptr_B += params.inc_B;
+
+        // Math
+        accum += ElementAccumulator(a) * ElementAccumulator(b);
+      }
+
+      //
+      // Epilogue phase
+      //
+
+      ElementC const *ptr_C = params.ptr_C + i * params.inc_C + batch_idx * params.batch_stride_C;
+      ElementC       *ptr_D = params.ptr_D + i * params.inc_D + batch_idx * params.batch_stride_D;
+
+      EpilogueOutputOp output_op(params.output_op);
+
+      typename EpilogueOutputOp::FragmentAccumulator accum_fragment;
+      typename EpilogueOutputOp::FragmentOutput      source_fragment;
+      typename EpilogueOutputOp::FragmentOutput      output_fragment;
+      // Only element 0 of each fragment is used by this kernel.
+      accum_fragment[0] = accum;
+
+      if (i < params.problem_size.row()) {
+        if (output_op.is_source_needed()) {
+          source_fragment[0] = *ptr_C;
+          output_fragment = output_op(accum_fragment, source_fragment);
+        }
+        else {
+          output_fragment = output_op(accum_fragment);
+        }
+
+        *ptr_D = output_fragment[0];
+      }
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GEMV for row-major A matrix
+template <
+    typename ElementA_,
+    typename ElementB_,
+    typename ElementC_,
+    typename ElementAccumulator_,
+    typename EpilogueOutputOp_,
+    int kElementsPerAccess_,
+    int kThreadCount_,
+    int kThreadsPerRow_ 
+>
+struct Gemv <
+    ElementA_,            
+    layout::RowMajor,
+    ElementB_,            
+    ElementC_,
+    ElementAccumulator_,
+    EpilogueOutputOp_,
+    kElementsPerAccess_,
+    kThreadCount_,
+    kThreadsPerRow_
+>{
+public:
+
+  using ElementA = ElementA_;
+  using LayoutA = layout::RowMajor;
+  using TensorRefA = TensorRef<ElementA, LayoutA>;
+
+  using ElementB = ElementB_;
+  using ElementC = ElementC_;
+
+  using ElementAccumulator = ElementAccumulator_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+
+  static ComplexTransform const kTransformA = ComplexTransform::kNone;
+  static ComplexTransform const kTransformB = ComplexTransform::kNone;
+
+  static FloatRoundStyle const Round = cutlass::FloatRoundStyle::round_to_nearest;
+
+  // number of return elements in a global access
+  static int const kElementsPerAccess = kElementsPerAccess_;
+  
+  using FragmentA = Array<ElementA, kElementsPerAccess>;
+  using FragmentB = Array<ElementB, kElementsPerAccess>;
+  using FragmentCompute = Array<ElementAccumulator, kElementsPerAccess>;
+
+  // thread block shape (kThreadsPerRow, kThreadCount / kThreadsPerRow, 1)
+  static int const kThreadCount = (kThreadCount_ <= 0) ? 128 : kThreadCount_;
+  static int const kThreadsPerRow = (kThreadsPerRow_ <= 0) ?
+                                  std::min(static_cast<int>(kThreadCount / (kElementsPerAccess * sizeof(ElementA))), 16)
+                                  : kThreadsPerRow_;
+
+  //
+  // Structures
+  //
+
+  /// Argument structure
+  struct Arguments {
+    MatrixCoord     problem_size;
+    int32_t         batch_count;
+    typename EpilogueOutputOp::Params output_op;
+
+    TensorRefA      ref_A;
+
+    ElementB const *ptr_B;
+    ElementC const *ptr_C;
+    ElementC       *ptr_D;
+
+    int64_t         batch_stride_A;
+    int64_t         batch_stride_B;
+    int64_t         batch_stride_C;
+    int64_t         batch_stride_D;
+
+    //
+    // Methods
+    //
+
+    Arguments(): batch_count(0) { }  // only batch_count is set; no other member has an initializer here
+
+    Arguments(  // fully-specified form; the shorter overload below delegates to it
+      MatrixCoord problem_size,
+      int32_t     batch_count,
+      typename EpilogueOutputOp::Params output_op,
+      TensorRefA  ref_A,
+      void const *ptr_B,
+      void const *ptr_C,
+      void       *ptr_D,
+      int64_t     batch_stride_A,  // batch strides are multiplied by the batch index in the kernel
+      int64_t     batch_stride_B,
+      int64_t     batch_stride_C,
+      int64_t     batch_stride_D
+    ):
+      problem_size(problem_size),
+      batch_count(batch_count),
+      output_op(output_op),
+      ref_A(ref_A),
+      ptr_B(static_cast<ElementB const *>(ptr_B)),  // untyped pointers are cast to their element types here
+      ptr_C(static_cast<ElementC const *>(ptr_C)),
+      ptr_D(static_cast<ElementC       *>(ptr_D)),
+      batch_stride_A(batch_stride_A),
+      batch_stride_B(batch_stride_B),
+      batch_stride_C(batch_stride_C),
+      batch_stride_D(batch_stride_D)
+    { }
+
+    Arguments(  // single-problem form: delegates with batch_count = 1 and all batch strides = 1
+      MatrixCoord problem_size,
+      typename EpilogueOutputOp::Params output_op,
+      TensorRefA  ref_A,
+      void const *ptr_B,
+      void const *ptr_C,
+      void       *ptr_D
+    ):
+      Arguments(
+        problem_size,
+        1,  // batch_count
+        output_op,
+        ref_A,
+        ptr_B,
+        ptr_C,
+        ptr_D,
+        1,  // batch_stride_A
+        1,  // batch_stride_B
+        1,  // batch_stride_C
+        1)  // batch_stride_D
+    { }
+
+    /// Copies every field of `args` into this parameter block; always returns kSuccess.
+    Status update(Arguments const &args) {
+      problem_size = args.problem_size;
+      batch_count = args.batch_count;
+      output_op = args.output_op;
+      ref_A = args.ref_A;  // was `ref_A = ref_A`, a self-assignment that kept the stale A reference
+      ptr_B = args.ptr_B;
+      ptr_C = args.ptr_C;
+      ptr_D = args.ptr_D;
+      batch_stride_A = args.batch_stride_A;
+      batch_stride_B = args.batch_stride_B;
+      batch_stride_C = args.batch_stride_C;
+      batch_stride_D = args.batch_stride_D;
+      return Status::kSuccess;
+    }
+  };
+
+  using Params = Arguments;
+
+  /// Shared memory storage structure
+  union SharedStorage {
+    // Empty: operator() in this specialization never reads or writes its shared_storage argument.
+  };
+
+public:
+
+  //
+  // Methods
+  //
+
+  CUTLASS_DEVICE
+  Gemv() {}  // nothing to initialise: this specialization declares no non-static data members
+
+  /// Checks that the column count (K) is a whole multiple of kElementsPerAccess.
+  static Status can_implement(cutlass::MatrixCoord const &problem_size) {
+    bool const k_is_access_aligned =
+        (problem_size.column() % kElementsPerAccess) == 0;
+    return k_is_access_aligned ? Status::kSuccess
+                               : Status::kErrorMisalignedOperand;
+  }
+
+  static Status can_implement(Arguments const &args) {  // Arguments overload
+    return can_implement(args.problem_size);  // forwards only the problem size; pointers are not examined
+  }
+
+  /// Executes one GEMV: threads sharing a threadIdx.y split row idx_row_m along K, then combine partial sums by shuffle.
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+    
+    // Loop over batch indices
+    for (int batch_idx = blockIdx.z; batch_idx < params.batch_count; batch_idx += gridDim.z) {
+      int idx_col_k = threadIdx.x;
+      int idx_row_m = blockIdx.x * blockDim.y + threadIdx.y;
+
+      if (idx_row_m < params.problem_size.row()) {
+        // problem_size (row = m, column = k)
+        // matrix A (batch, m, k)
+        // vector B (batch, 1, k)
+        // vector C (batch, m, 1)
+        // vector D (batch, m, 1)
+
+        // move in the batch dimension
+        ElementA const *ptr_A = params.ref_A.data() + batch_idx * params.batch_stride_A;
+        ElementB const *ptr_B = params.ptr_B + batch_idx * params.batch_stride_B;
+
+        ElementC const *ptr_C = params.ptr_C + batch_idx * params.batch_stride_C;
+        ElementC *ptr_D = params.ptr_D + batch_idx * params.batch_stride_D;
+
+        // move in the k dimension
+        ptr_A += idx_col_k * kElementsPerAccess;
+        ptr_B += idx_col_k * kElementsPerAccess;
+
+        // move in the m dimension (row pitch is problem_size.column(); ref_A's stride is not consulted — NOTE(review): assumes packed rows)
+        ptr_A += idx_row_m * params.problem_size.column();
+        ptr_C += idx_row_m;
+        ptr_D += idx_row_m;
+
+        NumericArrayConverter<ElementAccumulator, ElementA, kElementsPerAccess, Round> srcA_converter;
+        NumericArrayConverter<ElementAccumulator, ElementB, kElementsPerAccess, Round> srcB_converter;
+
+        ElementAccumulator accum = 0.f;
+
+        FragmentB fragB;
+        FragmentA fragA;
+
+        int unroll_col_k = 0;
+
+        // number of K elements the loop below advances per iteration (kThreadsPerRow accesses of kElementsPerAccess each)
+        int const tileA_k = kThreadsPerRow * kElementsPerAccess;
+        // Vectorized loop over the largest multiple of tileA_k that does not exceed K.
+        for (; unroll_col_k < params.problem_size.column() / tileA_k * tileA_k; unroll_col_k += tileA_k) {
+
+          // fetch from matrix A
+          arch::global_load<FragmentA,
+                            sizeof(FragmentA),
+                            arch::CacheOperation::LastUse>(fragA, (ptr_A + unroll_col_k), true);
+
+          // fetch from vector B
+          arch::global_load<FragmentB,
+                            sizeof(FragmentB),
+                            arch::CacheOperation::Always>(fragB, (ptr_B + unroll_col_k), true);
+
+          FragmentCompute fragB_Compute = srcB_converter(fragB);
+          FragmentCompute fragA_Compute = srcA_converter(fragA);
+
+          // Math
+          CUTLASS_PRAGMA_UNROLL
+          for (int e = 0; e < kElementsPerAccess; e++) {
+            accum += fragA_Compute.at(e) * fragB_Compute.at(e);
+          }
+        }
+
+        // calculate the rest of K elements
+        // each thread fetch 1 element each time
+        for (int k = unroll_col_k + idx_col_k; k < params.problem_size.column(); k += kThreadsPerRow) {
+          ElementB b = *(ptr_B - idx_col_k * kElementsPerAccess + k);
+          ElementA a = *(ptr_A - idx_col_k * kElementsPerAccess + k);
+
+          accum += ElementAccumulator(a) * ElementAccumulator(b);
+        }
+
+        EpilogueOutputOp output_op(params.output_op);
+        typename EpilogueOutputOp::FragmentOutput source_fragment;
+
+        // prefetch from source matrix C
+        if (output_op.is_source_needed()) {         
+          source_fragment[0] = *(ptr_C);
+        }
+
+        typename EpilogueOutputOp::FragmentAccumulator accum_fragment;
+        typename EpilogueOutputOp::FragmentOutput output_fragment;
+        // XOR-shuffle reduction of the per-thread partial sums, with lane masks kThreadsPerRow/2 down to 1.
+        for (int mask = (kThreadsPerRow >> 1); mask > 0; mask >>= 1) {
+          accum += __shfl_xor_sync(0xFFFFFFFF, accum, mask, 32);  // NOTE(review): full mask used inside the idx_row_m guard — confirm every named lane reaches this call
+        }
+
+        if (idx_col_k == 0) {
+          accum_fragment[0] = accum;
+
+          if (output_op.is_source_needed()) {
+            output_fragment = output_op(accum_fragment, source_fragment);
+          }
+          else {
+            output_fragment = output_op(accum_fragment);
+          }
+
+          *ptr_D = output_fragment[0];
+        }
+      }
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemv_batched_strided.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemv_batched_strided.h
new file mode 100755
index 000000000..673f1995c
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemv_batched_strided.h
@@ -0,0 +1,244 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/array.h"
+
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/gemm/gemm.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+namespace detail
+{
+  /// Epilogue functor: D[i] = alpha * acc[i], plus beta * C[i] unless BetaIsZero.
+  /// Only references to alpha and beta are kept, so the scalars handed to the
+  /// constructor must outlive this object.
+  template<typename ElementAlphaBeta, bool BetaIsZero>
+  struct GemvBatchedStridedEpilogueScaling
+  {
+    ElementAlphaBeta const & alpha;
+    ElementAlphaBeta const & beta;
+    /// Binds (does not copy) the two scaling factors.
+    CUTLASS_DEVICE
+    GemvBatchedStridedEpilogueScaling(ElementAlphaBeta& alpha_, ElementAlphaBeta& beta_) :
+      alpha(alpha_), beta(beta_)
+    { }
+    /// Scales `accumulators` element-wise into `fragment_D`. `fragment_C` is read only when
+    /// BetaIsZero is false, and C[i] is read in the same expression that produces D[i].
+    template<typename FragmentCD, typename FragmentAccumulator>
+    CUTLASS_DEVICE
+    void operator()(FragmentAccumulator& accumulators,
+                    FragmentCD const& fragment_C,
+                    FragmentCD& fragment_D) const
+    {
+      using AccType = typename FragmentAccumulator::value_type;
+      using CDType = typename FragmentCD::value_type;
+
+      static_assert(FragmentCD::kElements == FragmentAccumulator::kElements,
+                    "Mismatch in fragment sizes.");
+
+      for (int i = 0; i < FragmentCD::kElements; ++i) {
+        if (BetaIsZero) {
+          fragment_D[i] = CDType(accumulators[i] * AccType(alpha));
+        } else {
+          fragment_D[i] = CDType(accumulators[i] * AccType(alpha)
+                                 + AccType(fragment_C[i]) * AccType(beta));
+        }
+      }
+    }
+  };
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Device-side body shared by the GemvBatchedStrided kernels: runs ThreadBlockGemv on A (extent 1 x k) and B (extent k x n), scales the accumulators and stores D (extent 1 x n).
+template <typename GemvKernel, typename ElementAlphaBeta, bool BetaIsZero=false>
+CUTLASS_DEVICE void GemvBatchedStridedDevice(
+  cutlass::gemm::BatchedGemmCoord problem_size,
+  ElementAlphaBeta alpha,
+  ElementAlphaBeta beta,
+  typename GemvKernel::IteratorA::TensorRef ref_A,
+  typename GemvKernel::IteratorA::TensorRef::LongIndex lda, 
+  typename GemvKernel::IteratorB::TensorRef ref_B,
+  typename GemvKernel::IteratorB::TensorRef::LongIndex ldb, 
+  typename GemvKernel::IteratorCD::TensorRef ref_C,
+  typename GemvKernel::IteratorCD::TensorRef::LongIndex ldc,
+  typename GemvKernel::IteratorCD::TensorRef ref_D,
+  typename GemvKernel::IteratorCD::TensorRef::LongIndex ldd)
+{
+  using ThreadBlockGemv = typename GemvKernel::ThreadBlockGemv;
+  using ThreadBlockSwizzle = typename GemvKernel::ThreadBlockSwizzle;
+  using EpilogueScale = detail::GemvBatchedStridedEpilogueScaling<ElementAlphaBeta, BetaIsZero>;
+  // NOTE: lda/ldb/ldc/ldd are used below as per-batch pointer offsets (each is multiplied by batch_idx).
+  ThreadBlockSwizzle swizzler;
+
+  // Compute initial location in logical coordinates
+  BatchedGemmCoord tb_offset = swizzler.get_tile_offset();
+  int const batch_idx = swizzler.get_batch_idx();
+
+  // Offset A and B to this batch (lda/ldb act as the distance between consecutive batches)
+  ref_A.add_pointer_offset(batch_idx*lda);
+  ref_B.add_pointer_offset(batch_idx*ldb);
+
+  // Construct iterators to A and B operands; A gets thread id 0 and a zero offset, B gets threadIdx.x and this block's column offset
+  typename GemvKernel::IteratorA::Params params_A(ref_A.layout());
+  typename GemvKernel::IteratorA iterator_A(
+      params_A,
+      ref_A.data(),
+      { 1, problem_size.k() },
+      0,
+      { 0, 0 });
+
+  typename GemvKernel::IteratorB::Params params_B(ref_B.layout());
+  typename GemvKernel::IteratorB iterator_B(
+      params_B,
+      ref_B.data(),
+      { problem_size.k(), problem_size.n() },
+      threadIdx.x,
+      { 0, tb_offset.n()*ThreadBlockGemv::Shape::kN });
+
+  //
+  // Main loop
+  //
+
+  // Construct thread-scoped matrix multiply
+  ThreadBlockGemv mma;
+
+  typename ThreadBlockGemv::FragmentC accumulators;
+  accumulators.clear();
+
+  // Compute threadblock-scoped gemv (the freshly cleared accumulators are passed in twice)
+  mma(problem_size.mnk(), accumulators, iterator_A, iterator_B, accumulators);
+
+  //
+  // Epilogue
+  //
+  typename GemvKernel::FragmentCD fragment_CD;
+
+  // Load C (skipped when BetaIsZero; fragment_CD is then left unloaded and the scaling step does not read it)
+  if (!BetaIsZero)
+  {
+    tb_offset = swizzler.get_tile_offset();
+    ref_C.add_pointer_offset(batch_idx*ldc);
+    typename GemvKernel::IteratorCD::Params params_C(ref_C.layout());
+    typename GemvKernel::IteratorCD iterator_C(
+        params_C,
+        ref_C.data(),
+        { 1, problem_size.n() },
+        threadIdx.x,
+        { 0, tb_offset.n()*ThreadBlockGemv::Shape::kN });
+    iterator_C.load(fragment_CD);
+  }
+
+  // Apply alpha/beta scaling in place: fragment_CD is both the C input and the D output
+  EpilogueScale epilogue_scale(alpha, beta);
+  epilogue_scale(accumulators, fragment_CD, fragment_CD);
+
+  // Store D
+  tb_offset = swizzler.get_tile_offset();
+  ref_D.add_pointer_offset(batch_idx*ldd);
+  typename GemvKernel::IteratorCD::Params params_D(ref_D.layout());
+  typename GemvKernel::IteratorCD iterator_D(
+      params_D,
+      ref_D.data(),
+      { 1, problem_size.n() },
+      threadIdx.x,
+      { 0, tb_offset.n()*ThreadBlockGemv::Shape::kN });
+  iterator_D.store(fragment_CD);
+}
+/// Kernel entry point taking alpha, beta and a source tensor C; forwards every argument unchanged to GemvBatchedStridedDevice.
+template <typename GemvKernel, typename ElementAlphaBeta, bool BetaIsZero>
+CUTLASS_GLOBAL void GemvBatchedStrided(
+  cutlass::gemm::BatchedGemmCoord problem_size,
+  ElementAlphaBeta alpha,
+  ElementAlphaBeta beta,
+  typename GemvKernel::IteratorA::TensorRef ref_A,
+  typename GemvKernel::IteratorA::TensorRef::LongIndex lda, 
+  typename GemvKernel::IteratorB::TensorRef ref_B,
+  typename GemvKernel::IteratorB::TensorRef::LongIndex ldb, 
+  typename GemvKernel::IteratorCD::TensorRef ref_C,
+  typename GemvKernel::IteratorCD::TensorRef::LongIndex ldc,
+  typename GemvKernel::IteratorCD::TensorRef ref_D,
+  typename GemvKernel::IteratorCD::TensorRef::LongIndex ldd)
+{
+  GemvBatchedStridedDevice<GemvKernel, ElementAlphaBeta, BetaIsZero>(
+    problem_size, alpha, beta, ref_A, lda, ref_B, ldb, ref_C, ldc, ref_D, ldd
+  );
+}
+/// Kernel entry point without beta/C: uses BetaIsZero = true and beta = 0, and passes ref_D/ldd in the C slots (C is not loaded when BetaIsZero is true).
+template <typename GemvKernel, typename ElementAlphaBeta>
+CUTLASS_GLOBAL void GemvBatchedStrided(
+  cutlass::gemm::BatchedGemmCoord problem_size,
+  ElementAlphaBeta alpha,
+  typename GemvKernel::IteratorA::TensorRef ref_A,
+  typename GemvKernel::IteratorA::TensorRef::LongIndex lda, 
+  typename GemvKernel::IteratorB::TensorRef ref_B,
+  typename GemvKernel::IteratorB::TensorRef::LongIndex ldb, 
+  typename GemvKernel::IteratorCD::TensorRef ref_D,
+  typename GemvKernel::IteratorCD::TensorRef::LongIndex ldd)
+{
+  GemvBatchedStridedDevice<GemvKernel, ElementAlphaBeta, true>(
+    problem_size, alpha, ElementAlphaBeta(0), ref_A, lda, ref_B, ldb, ref_D, ldd, ref_D, ldd
+  );
+}
+/// Kernel entry point without alpha/beta/C: uses alpha = 1 and beta = 0 (typed as IteratorCD::Element) with BetaIsZero = true; ref_D/ldd fill the unused C slots.
+template <typename GemvKernel>
+CUTLASS_GLOBAL void GemvBatchedStrided(
+  cutlass::gemm::BatchedGemmCoord problem_size,
+  typename GemvKernel::IteratorA::TensorRef ref_A,
+  typename GemvKernel::IteratorA::TensorRef::LongIndex lda, 
+  typename GemvKernel::IteratorB::TensorRef ref_B,
+  typename GemvKernel::IteratorB::TensorRef::LongIndex ldb, 
+  typename GemvKernel::IteratorCD::TensorRef ref_D,
+  typename GemvKernel::IteratorCD::TensorRef::LongIndex ldd)
+{
+  using ElementAlphaBeta = typename GemvKernel::IteratorCD::Element;
+  GemvBatchedStridedDevice<GemvKernel, ElementAlphaBeta, true>(
+    problem_size, ElementAlphaBeta(1), ElementAlphaBeta(0), ref_A, lda, ref_B, ldb, ref_D, ldd, ref_D, ldd
+  );
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/grouped_problem_visitor.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/grouped_problem_visitor.h
new file mode 100755
index 000000000..31787372a
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/grouped_problem_visitor.h
@@ -0,0 +1,463 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Base scheduler for grouped problems
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Enumerated type describing the type of scheduling to perform for the ProblemVisitor (selects a GroupedProblemVisitor specialization)
+enum class GroupScheduleMode {
+  // Perform all scheduling on device (that specialization reports a workspace size of 0)
+  kDeviceOnly,
+  // Precompute on the host the full sequence of problems to access (read back on device from Params::workspace)
+  kHostPrecompute
+};
+
+/// Visitor class to abstract away the algorithm for iterating over tiles (state and helpers shared by every scheduling mode)
+template <typename ProblemSizeHelper,
+          typename ThreadblockShape_>
+struct BaseGroupedProblemVisitor {
+  using ThreadblockShape = ThreadblockShape_;
+
+  /// Schedule entry: a problem index plus the global index of that problem's first tile
+  struct ProblemInfo {
+    static int32_t const kNoPrefetchEntry = -1;
+    int32_t problem_idx;
+    int32_t problem_start;
+
+    CUTLASS_DEVICE
+    ProblemInfo() : problem_idx(kNoPrefetchEntry), problem_start(kNoPrefetchEntry) {}
+
+    // Host-callable too: the kHostPrecompute visitor's host_precompute() builds
+    // ProblemInfo entries on the CPU, which a device-only constructor would not allow.
+    CUTLASS_HOST_DEVICE
+    ProblemInfo(int32_t problem_idx_, int32_t problem_start_) :
+      problem_idx(problem_idx_), problem_start(problem_start_) {}
+  };
+
+  /// Arguments describing the group of problems to visit
+  struct Params {
+    cutlass::gemm::GemmCoord const *problem_sizes;   // indexed by problem index
+    int32_t                         problem_count;   // number of problems in the group
+    void const                     *workspace;       // read as a ProblemInfo array by the kHostPrecompute visitor
+    int32_t                         tile_count;      // tile-index bound checked by the kHostPrecompute visitor
+
+    //
+    // Methods
+    //
+
+    /// Ctor
+    CUTLASS_HOST_DEVICE
+    Params(): problem_sizes(nullptr), problem_count(0), workspace(nullptr), tile_count(0) { }
+
+    /// Ctor
+    CUTLASS_HOST_DEVICE
+    Params(
+      cutlass::gemm::GemmCoord const *problem_sizes,
+      int32_t                         problem_count,
+      void const                     *workspace = nullptr,
+      int32_t                         tile_count = 0
+    ):
+      problem_sizes(problem_sizes),
+      problem_count(problem_count),
+      workspace(workspace),
+      tile_count(tile_count)
+    {}
+  };
+
+  Params params;
+  int32_t tile_idx;            // global index of the tile being visited
+  int32_t problem_tile_start;  // global index of the current problem's first tile
+  int32_t problem_idx;         // index of the current problem in params.problem_sizes
+
+  //
+  // Methods
+  //
+  /// Starts at tile `block_idx`, with the problem index and problem start both zero
+  CUTLASS_DEVICE
+  BaseGroupedProblemVisitor(
+    Params const &params_,
+    int32_t block_idx
+  ):
+  params(params_),
+  tile_idx(block_idx),
+  problem_tile_start(0),
+  problem_idx(0)
+  {}
+
+  /// Get the grid shape
+  CUTLASS_HOST_DEVICE
+  static cutlass::gemm::GemmCoord grid_shape(const cutlass::gemm::GemmCoord& problem) {
+    return ProblemSizeHelper::grid_shape(problem);
+  }
+
+  /// Gets the global tile index
+  CUTLASS_HOST_DEVICE
+  int32_t tile_index() const { return tile_idx; }
+
+  /// Gets the index of the problem
+  CUTLASS_HOST_DEVICE
+  int32_t problem_index() const { return problem_idx; }
+
+  /// Gets the index of the current tile relative to the current problem's first tile
+  CUTLASS_HOST_DEVICE
+  int32_t threadblock_idx() const { return tile_idx - problem_tile_start; }
+
+  /// Moves the global tile index forward by `grid_size`
+  CUTLASS_DEVICE
+  void advance(int32_t grid_size) { tile_idx += grid_size; }
+
+  /// Forwards to ProblemSizeHelper::possibly_transpose_problem (takes `problem` by mutable reference)
+  CUTLASS_HOST_DEVICE
+  static void possibly_transpose_problem(cutlass::gemm::GemmCoord& problem) {
+    ProblemSizeHelper::possibly_transpose_problem(problem);
+  }
+
+  /// Returns the problem size for the current problem
+  CUTLASS_HOST_DEVICE
+  cutlass::gemm::GemmCoord problem_size() const {
+    GemmCoord problem = params.problem_sizes[problem_idx];
+    ProblemSizeHelper::possibly_transpose_problem(problem);
+    return problem;
+  }
+
+  /// Number of tiles in a grid of the given shape, as defined by ProblemSizeHelper
+  CUTLASS_HOST_DEVICE
+  static int32_t tile_count(const cutlass::gemm::GemmCoord& grid) {
+    return ProblemSizeHelper::tile_count(grid);
+  }
+
+  /// Sums tile_count() over all `problem_count` problems (no device annotation: host code only)
+  static int32_t group_tile_count(const cutlass::gemm::GemmCoord* host_problem_sizes_ptr, int32_t problem_count) {
+    int32_t total_tiles = 0;
+    for (int32_t i = 0; i < problem_count; ++i) {
+      auto problem = host_problem_sizes_ptr[i];
+      possibly_transpose_problem(problem);
+      auto grid = grid_shape(problem);
+      total_tiles += tile_count(grid);
+    }
+    return total_tiles;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename ProblemSizeHelper,
+  typename ThreadblockShape,
+  GroupScheduleMode GroupScheduleMode_,   // selects one of the partial specializations that follow
+  int PrefetchTileCount,
+  int ThreadCount
+>
+struct GroupedProblemVisitor;             // primary template: declared only, never defined here
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// ProblemVisitor that performs all scheduling on device
+// (needs no workspace or host precomputation; within a group, warp lane i tracks problem group_start + i)
+template <typename ProblemSizeHelper,
+          typename ThreadblockShape,
+          int PrefetchTileCount,
+          int ThreadCount>
+struct GroupedProblemVisitor<ProblemSizeHelper,
+                             ThreadblockShape,
+                             GroupScheduleMode::kDeviceOnly,
+                             PrefetchTileCount,
+                             ThreadCount>: public BaseGroupedProblemVisitor<ProblemSizeHelper, ThreadblockShape> {
+  using Base = BaseGroupedProblemVisitor<ProblemSizeHelper, ThreadblockShape>;
+  using Params = typename Base::Params;
+  static int const kThreadCount = ThreadCount;
+  static bool const kRequiresPrecomputation = false;
+  static int const kThreadsPerWarp = 32;
+
+  struct SharedStorage {};
+
+  // Exclusive end (as a global tile index) of the problem loaded by this thread.
+  // Each thread will hold a separate value.
+  int32_t problem_ending_tile;
+
+  SharedStorage &shared_storage;
+
+  //
+  // Methods
+  //
+  CUTLASS_DEVICE
+  GroupedProblemVisitor(
+    Params const &params_,
+    SharedStorage &shared_storage_,
+    int32_t block_idx
+  ): Base(params_, block_idx),
+  problem_ending_tile(0),
+  shared_storage(shared_storage_)
+  {
+    this->problem_idx = -1 * kThreadsPerWarp;  // next_tile() adds kThreadsPerWarp before loading, so the first group starts at problem 0
+    this->problem_tile_start = 0;
+  }
+  /// Locates the problem containing tile_idx and updates problem_idx / problem_tile_start; returns false when no problem contains it. Uses warp shuffles with a full 0xffffffff mask.
+  CUTLASS_DEVICE
+  bool next_tile() {
+    // Check whether the tile to compute is within the range of the current problem.
+    int32_t problem_tile_end = __shfl_sync(0xffffffff, problem_ending_tile, this->problem_idx % kThreadsPerWarp);
+    if (this->tile_idx < problem_tile_end) {
+      return true;
+    }
+
+    // Check whether the tile to compute is within the current group of problems fetched by the warp.
+    // The last tile for this group is the final tile of the problem held by the final thread in the warp.
+    int32_t group_tile_end = __shfl_sync(0xffffffff, problem_ending_tile, kThreadsPerWarp-1);
+
+    // Keep the starting problem for this group in `problem_idx`. This is done to reduce
+    // register pressure. The starting problem for this group is simply the first problem
+    // in the group most recently fetched by the warp.
+    int32_t &group_problem_start = this->problem_idx;
+    group_problem_start = (this->problem_idx / kThreadsPerWarp) * kThreadsPerWarp;
+
+    // Keep the starting tile for this group in `problem_tile_start`. This is done to reduce
+    // register pressure.
+    int32_t &group_tile_start = this->problem_tile_start;
+
+    // Each thread in the warp processes a separate problem to advance until
+    // reaching a group of problems whose ending tile is greater than tile_idx.
+    while (group_tile_end <= this->tile_idx) {
+      group_problem_start += kThreadsPerWarp;
+      if (group_problem_start > this->params.problem_count) {
+        return false;
+      }
+      // NOTE(review): when group_problem_start == problem_count the check above passes; no lane then loads a problem, the group adds zero tiles, and the next pass returns false.
+      // Since `group_tile_start` is a reference to `this->problem_tile_start`, this
+      // also sets `this->problem_tile_start`. The fact that `this->problem_tile_start`
+      // is also set here is used later in `next_tile`.
+      group_tile_start = group_tile_end;
+
+      int lane_idx = threadIdx.x % kThreadsPerWarp;
+      int32_t lane_problem = group_problem_start + lane_idx;
+
+      // Compute the number of tiles in the problem assigned to each thread.
+      problem_ending_tile = 0;
+      if (lane_problem < this->params.problem_count) {
+        cutlass::gemm::GemmCoord problem = this->params.problem_sizes[lane_problem];
+        this->possibly_transpose_problem(problem);
+        cutlass::gemm::GemmCoord grid = this->grid_shape(problem);
+        problem_ending_tile = this->tile_count(grid);
+      }
+
+      // Compute a warp-wide inclusive prefix sum to compute the ending tile index of
+      // each thread's problem.
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 1; i < kThreadsPerWarp; i <<= 1) {
+        int32_t val = __shfl_up_sync(0xffffffff, problem_ending_tile, i);
+        if (lane_idx >= i) {
+          problem_ending_tile += val;
+        }
+      }
+
+      // The total tile count for this group is now in the final position of the prefix sum
+      int32_t tiles_in_group = __shfl_sync(0xffffffff, problem_ending_tile, kThreadsPerWarp-1);
+
+      problem_ending_tile += group_tile_start;
+      group_tile_end += tiles_in_group;
+    }
+
+    // The next problem to process is the first one that does not have ending tile position
+    // that is greater than or equal to tile index.
+    int32_t problem_idx_in_group =
+        __popc(__ballot_sync(0xffffffff, problem_ending_tile <= this->tile_idx));
+
+    this->problem_idx = group_problem_start + problem_idx_in_group;
+
+    // The starting tile for this problem is the ending tile of the previous problem. In cases
+    // where `problem_idx_in_group` is the first problem in the group, we do not need to reset
+    // `problem_tile_start`, because it is set to the previous group's ending tile in the while
+    // loop above.
+    if (problem_idx_in_group > 0) {
+      this->problem_tile_start = __shfl_sync(0xffffffff, problem_ending_tile, problem_idx_in_group - 1);
+    }
+
+    return true;
+  }
+  /// Device-only scheduling needs no workspace: always returns 0
+  static size_t get_workspace_size(const cutlass::gemm::GemmCoord* host_problem_sizes_ptr,
+                                   int32_t problem_count,
+                                   int32_t block_count) {
+    return 0;
+  }
+  /// Nothing to precompute in this mode: intentionally empty
+  static void host_precompute(const cutlass::gemm::GemmCoord* host_problem_sizes_ptr,
+                              int32_t problem_count,
+                              int32_t block_count,
+                              void* host_workspace_ptr) {}
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Precomputes schedule on host and prefetches into shared memory
+// (host_precompute() fills a workspace of ProblemInfo entries grouped per block; next_tile() consumes them in order)
+template <typename ProblemSizeHelper,
+          typename ThreadblockShape,
+          int PrefetchTileCount,
+          int ThreadCount>
+struct GroupedProblemVisitor<ProblemSizeHelper,
+                             ThreadblockShape,
+                             GroupScheduleMode::kHostPrecompute,
+                             PrefetchTileCount,
+                             ThreadCount> : public BaseGroupedProblemVisitor<ProblemSizeHelper, ThreadblockShape> {
+  static_assert(PrefetchTileCount > 0,
+                "GroupedProblemVisitor with GroupScheduleMode `kHostPrecompute` currently requires prefetching to shared memory");
+
+  using Base = BaseGroupedProblemVisitor<ProblemSizeHelper, ThreadblockShape>;
+  using Params = typename Base::Params;
+  using ProblemInfo = typename Base::ProblemInfo;
+  static bool const kRequiresPrecomputation = true;
+
+  static int const kPrefetchTileCount = PrefetchTileCount;
+  static int const kThreadCount = ThreadCount;
+
+  struct SharedStorage {
+    // Sequence of problem IDs and starting tiles to compute
+    cutlass::Array<ProblemInfo, kPrefetchTileCount> prefetched_problems;
+  };
+
+  int32_t tiles_computed;        // number of entries this block has consumed through next_tile()
+  int32_t iterations_per_block;  // ceil(params.tile_count / gridDim.x)
+  int32_t block_load_start;      // index of this block's first entry in the workspace
+  SharedStorage &shared_storage;
+  ProblemInfo const *problem_info_ptr;  // params.workspace viewed as ProblemInfo entries
+
+  //
+  // Methods
+  //
+  CUTLASS_DEVICE
+  GroupedProblemVisitor(
+    Params const &params_,
+    SharedStorage &shared_storage_,
+    int32_t block_idx
+  ): Base(params_, block_idx),
+  tiles_computed(0),
+  shared_storage(shared_storage_),
+  problem_info_ptr(reinterpret_cast<ProblemInfo const*>(params_.workspace))
+  {
+    iterations_per_block = (params_.tile_count - 1 + gridDim.x) / gridDim.x;  // NOTE(review): matches host_precompute()'s entries_per_block only if gridDim.x equals the block_count used there — confirm at launch site
+    block_load_start = iterations_per_block * block_idx;
+    // Start prefetching the first set of tiles to compute
+    prefetch_tiles();
+  }
+  /// Takes the next precomputed (problem index, problem start tile) entry; returns false once tile_idx reaches params.tile_count
+  CUTLASS_DEVICE
+  bool next_tile() {
+    if (this->tile_idx >= this->params.tile_count) {
+      return false;
+    }
+
+    int32_t prefetch_idx = (tiles_computed % kPrefetchTileCount);
+    if (prefetch_idx == 0) {
+      // Ensure all previous stores to shared memory have been completed
+      __syncthreads();
+    }
+
+    auto problem_info = shared_storage.prefetched_problems[prefetch_idx];
+    ++tiles_computed;
+
+    if ((tiles_computed % kPrefetchTileCount) == 0) {
+      // Begin prefetching next set of tiles. Synchronize first to ensure that
+      // we don't overwrite the current buffer while someone else is using it.
+      __syncthreads();
+      prefetch_tiles();
+    }
+
+    this->problem_idx = problem_info.problem_idx;
+    this->problem_tile_start = problem_info.problem_start;
+
+    return true;
+  }
+  /// Bytes needed for the schedule: ceil(total_tiles / block_count) ProblemInfo entries for each of block_count blocks
+  static size_t get_workspace_size(const cutlass::gemm::GemmCoord* host_problem_sizes_ptr,
+                                   int32_t problem_count,
+                                   int32_t block_count) {
+    int32_t total_tiles = Base::group_tile_count(host_problem_sizes_ptr, problem_count);
+    int32_t entries_per_block = ((total_tiles - 1 + block_count) / block_count);
+    return sizeof(ProblemInfo) * entries_per_block * block_count;
+  }
+#if !defined(__CUDACC_RTC__)
+  static void host_precompute(const cutlass::gemm::GemmCoord* host_problem_sizes_ptr,
+                              int32_t problem_count,
+                              int32_t block_count,
+                              void* host_workspace_ptr) {
+    ProblemInfo* host_problem_info_ptr = reinterpret_cast<ProblemInfo*>(host_workspace_ptr);
+    int32_t total_tiles = Base::group_tile_count(host_problem_sizes_ptr, problem_count);
+    int32_t entries_per_block = (total_tiles - 1 + block_count) / block_count;
+    // Tile t goes to block (t % block_count) as that block's (t / block_count)-th entry; each block owns entries_per_block consecutive slots.
+    int tile = 0;
+    int start_tile = 0;
+    for (int p_idx = 0; p_idx < problem_count; ++p_idx) {
+      auto problem = host_problem_sizes_ptr[p_idx];
+      Base::possibly_transpose_problem(problem);
+      auto grid = Base::grid_shape(problem);
+      int tiles = Base::tile_count(grid);
+      ProblemInfo problem_info(p_idx, start_tile);
+      for (int i = 0; i < tiles; ++i, ++tile) {
+        host_problem_info_ptr[(entries_per_block * (tile % block_count)) + (tile / block_count)] = problem_info;
+      }
+      start_tile += tiles;
+    }
+  }
+#endif
+private:
+  CUTLASS_DEVICE
+  void prefetch_tiles() {  // copies this block's next workspace entries into shared_storage.prefetched_problems
+    CUTLASS_PRAGMA_UNROLL
+    for (int32_t i = 0; i < kPrefetchTileCount; i += kThreadCount) {
+      int32_t offset = threadIdx.x + i;
+      if (offset < kPrefetchTileCount && (tiles_computed + offset < iterations_per_block)) {  // stay inside the buffer and inside this block's share
+        shared_storage.prefetched_problems[offset] = problem_info_ptr[block_load_start + tiles_computed + offset];
+      }
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/params_sparse_base.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/params_sparse_base.h
new file mode 100755
index 000000000..6080e7994
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/params_sparse_base.h
@@ -0,0 +1,115 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Base functionality for common types of sparse GEMM kernel parameters
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Parameters structure shared by sparse GEMM kernels: problem/grid shapes, A, B and E operand descriptors, and the K partitioning
+template <
+  typename ThreadblockSwizzle,
+  typename ParamsA,
+  typename TensorRefA,
+  typename ParamsB,
+  typename TensorRefB,
+  typename ParamsE,
+  typename TensorRefE>
+struct SparseParamsBase
+{
+  //
+  // Data members (each has a default initializer, so a default-constructed object is fully defined)
+  //
+
+  cutlass::gemm::GemmCoord problem_size{};
+  cutlass::gemm::GemmCoord grid_tiled_shape{};
+  int swizzle_log_tile{0};
+  ParamsA params_A{};
+  TensorRefA ref_A{};
+  ParamsB params_B{};
+  TensorRefB ref_B{};
+  ParamsE params_E{};
+  TensorRefE ref_E{};
+  int gemm_k_iterations{0};   // iterations of mma_shape_k per slice along grid_tiled_shape.k()
+  int gemm_k_size{0};         // gemm_k_iterations * mma_shape_k
+
+  //
+  // Host dispatch API
+  //
+
+  /// Default constructor
+  SparseParamsBase() = default;
+
+  /// Constructor: derives gemm_k_iterations and gemm_k_size from problem_size.k(), grid_tiled_shape.k() and mma_shape_k
+  CUTLASS_HOST_DEVICE
+  SparseParamsBase(
+    cutlass::gemm::GemmCoord const & problem_size,
+    cutlass::gemm::GemmCoord const & grid_tiled_shape,
+    TensorRefA ref_A,
+    TensorRefB ref_B,
+    TensorRefE ref_E,
+    int const mma_shape_k)
+  :
+    problem_size(problem_size),
+    grid_tiled_shape(grid_tiled_shape),
+    swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)),
+    params_A(ref_A.layout()),
+    ref_A(ref_A),
+    params_B(ref_B.layout()),
+    ref_B(ref_B),
+    params_E(ref_E.layout()),
+    ref_E(ref_E)
+  {
+    int total_gemm_k_iterations = (problem_size.k() + mma_shape_k - 1) / mma_shape_k;
+    // Assign the data member; a local of the same name would shadow it and leave it at zero.
+    gemm_k_iterations = (total_gemm_k_iterations + grid_tiled_shape.k() - 1) / grid_tiled_shape.k();
+    gemm_k_size = gemm_k_iterations * mma_shape_k;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/params_universal_base.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/params_universal_base.h
new file mode 100755
index 000000000..86986f2e2
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/params_universal_base.h
@@ -0,0 +1,264 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Base functionality for common types of universal GEMM kernel parameters
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/trace.h"
+#include "cutlass/gemm/gemm.h"
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace util {
+
+/// True when A is row-major with K % alignmentA == 0, or B is column-major with K % alignmentB == 0.
+template <class LayoutA, class LayoutB>
+CUTLASS_HOST_DEVICE static bool
+is_continous_k_aligned(GemmCoord problem_size, size_t alignmentA, size_t alignmentB) {
+  bool const a_is_aligned = platform::is_same<LayoutA, layout::RowMajor>::value && (problem_size.k() % alignmentA) == 0;
+  return a_is_aligned || (platform::is_same<LayoutB, layout::ColumnMajor>::value && (problem_size.k() % alignmentB) == 0);
+}
+
+}  // namespace util
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Mode-independent arguments shared by universal GEMM kernels: operating mode, problem
+/// extents, batch count and the batch stride of D.
+struct UniversalArgumentsBase
+{
+  //
+  // Data members
+  //
+
+  GemmUniversalMode mode{cutlass::gemm::GemmUniversalMode::kGemm};  // defaults to plain GEMM
+  GemmCoord problem_size{};                                         // problem extents
+  int batch_count = 1;                                              // defaults to a single batch
+  int64_t batch_stride_D = 0;                                       // batch stride of D
+
+  //
+  // Methods
+  //
+
+  UniversalArgumentsBase() = default;
+
+  /// Stores the four values verbatim and traces the problem size on the host
+  UniversalArgumentsBase(
+    GemmUniversalMode mode,
+    GemmCoord problem_size,
+    int batch_count,
+    int64_t batch_stride_D)
+  : mode(mode),
+    problem_size(problem_size),
+    batch_count(batch_count),
+    batch_stride_D(batch_stride_D)
+  {
+    CUTLASS_TRACE_HOST("GemmUniversal::Arguments::Arguments() - problem_size: " << problem_size);
+  }
+};
+
+
+/// Kernel parameters shared by universal GEMM kernels, built on the host from UniversalArgumentsBase
+template <
+  typename ThreadblockSwizzle,
+  typename ThreadblockShape,
+  typename ElementA,
+  typename ElementB,
+  typename ElementC,
+  typename LayoutA,
+  typename LayoutB>
+struct UniversalParamsBase
+{
+  //
+  // Data members
+  //
+
+  GemmCoord problem_size{};                 // problem extents copied from the arguments
+  GemmCoord grid_tiled_shape{};             // GEMM volume in threadblock tiles
+  int swizzle_log_tile = 0;                 // log tile reported by ThreadblockSwizzle
+  GemmUniversalMode mode{cutlass::gemm::GemmUniversalMode::kGemm};
+  int batch_count = 0;
+  int gemm_k_size = 0;                      // K extent assigned to each block
+  int64_t batch_stride_D = 0;
+  int *semaphore{nullptr};                  // workspace pointer set by init_workspace()
+
+
+  //
+  // Host dispatch API
+  //
+
+  /// Default constructor: every member keeps its in-class initializer
+  UniversalParamsBase() = default;
+
+  /// Builds the parameters from the application arguments and derives the tiled grid shape.
+  /// device_sms and sm_occupancy are accepted but not read by this base class.
+  UniversalParamsBase(
+    UniversalArgumentsBase const &args,
+    int device_sms,
+    int sm_occupancy)
+  : problem_size(args.problem_size),
+    mode(args.mode),
+    batch_count(args.batch_count),
+    batch_stride_D(args.batch_stride_D),
+    semaphore(nullptr)
+  {
+    init_grid_tiled_shape();
+  }
+
+  /// Computes how many bytes of workspace this problem geometry requires (zero when none is needed)
+  size_t get_workspace_size() const
+  {
+    int const k_partitions = grid_tiled_shape.k();
+
+    // Split-K parallel always requires a temporary workspace: batch_stride_D elements of
+    // ElementC for every partition along K.
+    if (mode == GemmUniversalMode::kGemmSplitKParallel) {
+      return sizeof(ElementC) * size_t(batch_stride_D) * size_t(k_partitions);
+    }
+
+    // Serial split-K only requires a temporary workspace if the number of partitions along
+    // the GEMM K dimension is greater than one: one int per (m, n) tile.
+    if (mode == GemmUniversalMode::kGemm && k_partitions > 1) {
+      size_t const tiles_m = size_t(grid_tiled_shape.m());
+      size_t const tiles_n = size_t(grid_tiled_shape.n());
+      return sizeof(int) * tiles_m * tiles_n;
+    }
+
+    // Every other case runs without a workspace.
+    return 0;
+  }
+
+  /// Adopts `workspace` as the semaphore buffer and asynchronously zero-fills it on `stream`.
+  /// Assumes the memory allocated to workspace is at least as large as get_workspace_size().
+  /// Returns kErrorInternal if cudaMemsetAsync reports an error, kSuccess otherwise (including
+  /// when `workspace` is null, in which case nothing is cleared).
+  Status init_workspace(
+    void *workspace,
+    cudaStream_t stream = nullptr)
+  {
+    semaphore = static_cast<int *>(workspace);
+
+    // Nothing to clear when no buffer was supplied
+    if (semaphore == nullptr) {
+      return Status::kSuccess;
+    }
+
+    // Zero-initialize entire workspace
+    size_t const bytes_to_clear = get_workspace_size();
+    CUTLASS_TRACE_HOST("  Initialize " << bytes_to_clear << " workspace bytes");
+
+    cudaError_t const memset_status = cudaMemsetAsync(
+      semaphore, 0, bytes_to_clear, stream);
+    if (memset_status == cudaSuccess) {
+      return Status::kSuccess;
+    }
+
+    CUTLASS_TRACE_HOST("  cudaMemsetAsync() returned error " << cudaGetErrorString(memset_status));
+    return Status::kErrorInternal;
+  }
+
+
+  /// Accessor for the GEMM volume expressed in thread block tiles
+  GemmCoord get_tiled_shape() const
+  {
+    return grid_tiled_shape;
+  }
+
+
+  /// Total number of thread blocks to launch: the product of the three launch grid extents
+  int get_grid_blocks() const
+  {
+    dim3 const launch_grid = get_grid_dims();
+    return launch_grid.x * launch_grid.y * launch_grid.z;
+  }
+
+
+  /// Launch grid extents (in thread blocks) that the swizzle derives from the tiled shape
+  dim3 get_grid_dims() const
+  {
+    return ThreadblockSwizzle().get_grid_shape(grid_tiled_shape);
+  }
+
+private:
+  /// Derives grid_tiled_shape, swizzle_log_tile and gemm_k_size from problem_size, batch_count and mode.
+  CUTLASS_HOST_DEVICE
+  void init_grid_tiled_shape() {
+    // Get GEMM volume in thread block tiles
+    grid_tiled_shape = ThreadblockSwizzle::get_tiled_shape(
+      problem_size,
+      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+      batch_count);
+    swizzle_log_tile = ThreadblockSwizzle::get_log_tile(grid_tiled_shape);
+
+    // Unless K is partitioned below, each block covers the whole K extent
+    gemm_k_size = problem_size.k();
+
+    bool const splits_k = (mode == GemmUniversalMode::kGemm) || (mode == GemmUniversalMode::kGemmSplitKParallel);
+    if (!splits_k) {
+      return;
+    }
+    static const uint32_t CACHELINE_BYTES = 128;
+    static const size_t element_bytes_a = sizeof(ElementA);
+    static const size_t element_bytes_b = sizeof(ElementB);
+    static const size_t cacheline_elements_a = CACHELINE_BYTES / element_bytes_a;
+    static const size_t cacheline_elements_b = CACHELINE_BYTES / element_bytes_b;
+    const bool cacheline_alignment_needed =
+        util::is_continous_k_aligned<LayoutA, LayoutB>(problem_size, cacheline_elements_a, cacheline_elements_b);
+    int const kAlignK = const_max(
+        const_max(128 / sizeof_bits<ElementA>::value, 128 / sizeof_bits<ElementB>::value),
+        cacheline_alignment_needed ? const_max(cacheline_elements_a, cacheline_elements_b) : 1);
+
+    // K is divided by batch_count and rounded up to kAlignK; the K tile count follows from it
+    gemm_k_size = round_up(ceil_div(problem_size.k(), batch_count), kAlignK);
+    if (gemm_k_size) {
+      grid_tiled_shape.k() = ceil_div(problem_size.k(), gemm_k_size);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_2k_grouped.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_2k_grouped.h
new file mode 100755
index 000000000..6b36db21a
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_2k_grouped.h
@@ -0,0 +1,688 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Grouped Rank2K kernel.
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/complex.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/trace.h"
+#include "cutlass/gemm/kernel/rank_2k_transpose_operands.h"
+#include "cutlass/gemm/kernel/rank_2k_grouped_problem_visitor.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Mma1_,                          ///! Threadblock-scoped matrix multiply-accumulate (A*B^T)
+  typename Mma2_,                          ///! Threadblock-scoped matrix multiply-accumulate (B*A^T)
+  typename Epilogue_,                      ///! Epilogue
+  typename ThreadblockSwizzle_,            ///! Threadblock swizzling function
+  ComplexTransform OriginalTransformA_,    ///! Public-facing transformation on A
+  ComplexTransform OriginalTransformB_,    ///! Public-facing transformation on B
+  FillMode FillModeC_,                     ///! Fill Mode for C (kLower or kUpper)
+  BlasMode BlasMode_,                      ///! Blas3 computation mode
+  GroupScheduleMode GroupScheduleMode_,    ///! Type of scheduling to perform
+  bool Transposed = false
+>
+struct Rank2KGrouped {
+public:
+
+  using Mma1 = Mma1_;
+  using Mma2 = Mma2_;
+  // Both mainloops must use a row-major LayoutC; anything else is rejected at compile time.
+  static_assert(platform::is_same<typename Mma1::LayoutC, cutlass::layout::RowMajor>::value &&
+                platform::is_same<typename Mma2::LayoutC, cutlass::layout::RowMajor>::value,
+                "Kernel-level grouped Rank2K requires that LayoutC be row major.");
+
+  // Define generic Mma for usecases that use Kernel::Mma
+  using Mma = Mma1_;
+
+  using Epilogue = Epilogue_;
+  using EpilogueOutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  static GroupScheduleMode const kGroupScheduleMode = GroupScheduleMode_;  // forwarded to ProblemVisitor below
+  static bool const kTransposed = Transposed;  // when true, operator() swaps the A/B pointers and leading dimensions
+
+  // Public-facing type definitions related to operand element type, layout, and complex conjugate
+  // operation. Must interact with the 'kTransposed' notion to reflect the original layout,
+  // fill mode, etc. passed in.
+  //
+  // Recall that a Rank2K operation performs (A x BT) + (B x AT)
+  // This is performed via:
+  //    Mma1 = (A x BT)
+  //    Mma2 = (B x AT)
+  //
+  // However, if C needs to be transposed, then this is changed to the following:
+  //    Mma1 = (B x AT)
+  //    Mma2 = (A x BT)
+  //
+  // The transformation above is achieved by swapping the Layouts/Elements/Transforms/etc.
+  // of A and B as they are passed into the instantiations of Mma1 and Mma2.
+  //
+  // Now, given access to only Mma1 and Mma2, as well as whether a transposition has occurred,
+  // we wish to retrieve the original Layouts/Elements/etc. for A and B that were passed into
+  // the device-level call.
+  //
+  // The logic to do this (which is made clearer by referencing the above instantiations) is as follows:
+  //   LayoutA = kTransposed ? Mma2::LayoutA : Mma1::LayoutA
+  //   LayoutB = kTransposed ? Mma1::LayoutA : Mma2::LayoutA
+  //
+  // We achieve this swapping by passing Mma1::*A and Mma2::*A to Rank2KMapArguments:
+  using MapArgumentsA = kernel::detail::Rank2KMapArguments<
+    typename Mma1::IteratorA::Element,
+    typename Mma1::IteratorA::Layout,
+    Mma1::kTransformA,
+    Mma1::IteratorA::AccessType::kElements,
+    typename Mma2::IteratorA::Element,
+    typename Mma2::IteratorA::Layout,
+    Mma2::kTransformA,
+    Mma2::IteratorA::AccessType::kElements,
+    typename Mma1::LayoutC,
+    FillModeC_,
+    kTransposed
+  >;
+
+  using ElementA = typename MapArgumentsA::ElementA;
+  using LayoutA = typename MapArgumentsA::LayoutA;
+  static int const kAlignmentA = MapArgumentsA::kAlignmentA;
+  // Same mapping with the Mma1 and Mma2 arguments exchanged; its "A" members give the public-facing B types.
+  using MapArgumentsB = kernel::detail::Rank2KMapArguments<
+    typename Mma2::IteratorA::Element,
+    typename Mma2::IteratorA::Layout,
+    Mma2::kTransformA,
+    Mma2::IteratorA::AccessType::kElements,
+    typename Mma1::IteratorA::Element,
+    typename Mma1::IteratorA::Layout,
+    Mma1::kTransformA,
+    Mma1::IteratorA::AccessType::kElements,
+    typename Mma2::LayoutC,
+    FillModeC_,
+    kTransposed
+  >;
+
+  using ElementB = typename MapArgumentsB::ElementA;
+  using LayoutB = typename MapArgumentsB::LayoutA;
+  static int const kAlignmentB = MapArgumentsB::kAlignmentA;
+
+  // Use the user-provided TransformA and TransformB, rather than those
+  // resulting from MapArguments, because Mma1 and Mma2 may have different
+  // complex transforms than those passed in by the user.
+  // (See kernel/rank_2k_complex.h for an example of this)
+  static cutlass::ComplexTransform const kTransformA = OriginalTransformA_;
+  static cutlass::ComplexTransform const kTransformB = OriginalTransformB_;
+
+  using ElementC = typename Epilogue::OutputTileIterator::Element;
+  using LayoutC = typename MapArgumentsA::LayoutC;
+  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+  static FillMode const kFillModeC = MapArgumentsA::kFillModeC;
+
+  // Common type definitions for Mma1 and Mma2
+  using Operator = typename Mma1::Operator;
+  using OperatorClass = typename Mma1::Operator::OperatorClass;
+  using ThreadblockShape = typename Mma1::Shape;
+  using WarpShape = typename Mma1::Operator::Shape;
+  using InstructionShape = typename Mma1::Policy::Operator::InstructionShape;
+  using ArchTag = typename Mma1::ArchTag;
+
+  static int const kStages = Mma1::kStages;
+  static BlasMode const kBlasMode = BlasMode_;
+  // Fill mode exactly as given in the template argument (kFillModeC above comes from MapArgumentsA instead).
+private:
+  static FillMode const kInternalFillModeC = FillModeC_;
+
+public:
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma1::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;  // 32 threads per warp
+  // NOTE(review): kThreadCount is passed for two consecutive visitor template parameters - confirm their meaning.
+  using ProblemVisitor = Rank2KGroupedProblemVisitor<
+                            ThreadblockShape,
+                            kGroupScheduleMode,
+                            kThreadCount,
+                            kThreadCount,
+                            kInternalFillModeC>;
+
+  //
+  // Structures
+  //
+
+  /// Host-side description of a group of Rank2K problems, consumed by Params
+  struct Arguments {
+
+    //
+    // Data members
+    //
+
+    GemmUniversalMode mode{GemmUniversalMode::kGemm};
+    GemmCoord *problem_sizes{nullptr};          // handed to the problem visitor parameters
+    int problem_count = 0;
+    int threadblock_count = 0;
+
+    typename EpilogueOutputOp::Params epilogue; // copied into Params::output_op
+
+    ElementA ** ptr_A{nullptr};                 // pointer arrays, indexed by problem index in the kernel
+    ElementB ** ptr_B{nullptr};
+    ElementC ** ptr_C{nullptr};
+    ElementC ** ptr_D{nullptr};
+
+    typename LayoutA::Stride::LongIndex *lda{nullptr};  // leading dimensions, indexed by problem index
+    typename LayoutB::Stride::LongIndex *ldb{nullptr};
+    typename LayoutC::Stride::LongIndex *ldc{nullptr};
+    typename LayoutC::Stride::LongIndex *ldd{nullptr};
+
+    // Consumed only by the device-level operator
+    GemmCoord *host_problem_sizes{nullptr};
+
+    bool allow_early_exit{false};               // enables the alpha == 0 / beta == 1 early exit in the kernel
+
+    //
+    // Methods
+    //
+
+    /// Default ctor: members take the initializers above (epilogue is default-initialized)
+    Arguments() = default;
+
+    /// Ctor: stores every argument verbatim; the two trailing parameters are optional
+    CUTLASS_HOST_DEVICE
+    Arguments(
+      GemmUniversalMode mode,
+      GemmCoord *problem_sizes,
+      int problem_count,
+      int threadblock_count,
+      typename EpilogueOutputOp::Params epilogue,
+      ElementA ** ptr_A,
+      ElementB ** ptr_B,
+      ElementC ** ptr_C,
+      ElementC ** ptr_D,
+      typename LayoutA::Stride::LongIndex *lda,
+      typename LayoutB::Stride::LongIndex *ldb,
+      typename LayoutC::Stride::LongIndex *ldc,
+      typename LayoutC::Stride::LongIndex *ldd,
+      GemmCoord *host_problem_sizes = nullptr,
+      bool allow_early_exit = false
+    )
+    : mode(mode),
+      problem_sizes(problem_sizes),
+      problem_count(problem_count),
+      threadblock_count(threadblock_count),
+      epilogue(epilogue),
+      ptr_A(ptr_A),
+      ptr_B(ptr_B),
+      ptr_C(ptr_C),
+      ptr_D(ptr_D),
+      lda(lda),
+      ldb(ldb),
+      ldc(ldc),
+      ldd(ldd),
+      host_problem_sizes(host_problem_sizes),
+      allow_early_exit(allow_early_exit)
+    {
+      // Nothing further to do: every member is set in the initializer list
+    }
+
+  };
+
+  //
+  // Structure for precomputing values in host memory and passing to kernels
+  //
+
+  /// Kernel parameters precomputed on the host from Arguments and passed to the kernel
+  struct Params {
+    typename ProblemVisitor::Params problem_visitor{};
+    int threadblock_count = 0;
+    typename EpilogueOutputOp::Params output_op{};
+
+    // NOTE(review): neither the constructor nor update() assigns mode or batch_count - confirm intended
+    GemmUniversalMode mode = cutlass::gemm::GemmUniversalMode::kGemm;
+    int batch_count = 0;
+
+    ElementA** ptr_A = nullptr;
+    ElementB** ptr_B = nullptr;
+    ElementC** ptr_C = nullptr;
+    ElementC** ptr_D = nullptr;
+
+    typename LayoutA::Stride::LongIndex* lda = nullptr;
+    typename LayoutB::Stride::LongIndex* ldb = nullptr;
+    typename LayoutC::Stride::LongIndex* ldc = nullptr;
+    typename LayoutC::Stride::LongIndex* ldd = nullptr;
+
+    bool allow_early_exit = false;
+
+    Params() = default;
+
+    /// Captures the argument pointers and builds the problem-visitor parameters
+    CUTLASS_HOST_DEVICE
+    Params(Arguments const &args, void *workspace = nullptr, int tile_count = 0):
+      problem_visitor(args.problem_sizes, args.problem_count, workspace, tile_count),
+      threadblock_count(args.threadblock_count),
+      output_op(args.epilogue),
+      ptr_A(args.ptr_A),
+      ptr_B(args.ptr_B),
+      ptr_C(args.ptr_C),
+      ptr_D(args.ptr_D),
+      lda(args.lda),
+      ldb(args.ldb),
+      ldc(args.ldc),
+      ldd(args.ldd),
+      allow_early_exit(args.allow_early_exit)
+    {}
+
+    /// Refreshes the same fields the constructor sets. The epilogue parameters live in
+    /// Arguments::epilogue; the leading dimensions and early-exit flag are refreshed too.
+    CUTLASS_HOST_DEVICE
+    void update(
+      Arguments const &args,
+      void *workspace = nullptr,
+      int tile_count = 0) {
+      problem_visitor = typename ProblemVisitor::Params(args.problem_sizes, args.problem_count, workspace, tile_count);
+      threadblock_count = args.threadblock_count;
+      output_op = args.epilogue;
+      ptr_A = args.ptr_A;
+      ptr_B = args.ptr_B;
+      ptr_C = args.ptr_C;
+      ptr_D = args.ptr_D;
+      lda = args.lda;
+      ldb = args.ldb;
+      ldc = args.ldc;
+      ldd = args.ldd;
+      allow_early_exit = args.allow_early_exit;
+    }
+  };
+
+  /// Shared memory storage structure handed to operator() as `shared_storage`
+  struct SharedStorage {
+    union {  // the two mainloops and the epilogue occupy the same storage
+      typename Mma1::SharedStorage mma1_main_loop;
+      typename Mma2::SharedStorage mma2_main_loop;
+      typename Epilogue::SharedStorage epilogue;
+    } kernel;
+
+    // ProblemVisitor shared storage can't be overlapped with others
+    typename ProblemVisitor::SharedStorage problem_visitor;
+  };
+
+public:
+
+  //
+  // Methods
+  //
+
+  Rank2KGrouped() = default;
+
+  /// Reports whether the kernel can run the given problem size.
+  /// No alignment or shape check is performed here: the answer is always kSuccess.
+  static Status can_implement(cutlass::gemm::GemmCoord const & problem_size) { return Status::kSuccess; }
+
+  /// Reports whether the kernel can run the given arguments.
+  /// The arguments are not inspected: the answer is always kSuccess, so any validation
+  /// of pointers, leading dimensions or problem sizes has to happen elsewhere.
+  static Status can_implement(Arguments const &args) { return Status::kSuccess; }
+
+  /// Executes the grouped Rank2K update. Each threadblock repeatedly asks the problem visitor for
+  /// its next tile, accumulates the two products (Mma1, then Mma2) for that tile and runs the
+  /// epilogue, until next_tile() reports that no work is left.
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Early exit following LAPACK's definition (nothing is computed when alpha == 0 and beta == 1)
+    if (params.allow_early_exit &&
+        (params.output_op.alpha == ElementC(0)) && (params.output_op.beta == ElementC(1))) {
+      return;
+    }
+
+    //
+    // Problem visitor.
+    //
+
+    ProblemVisitor problem_visitor(
+      params.problem_visitor,
+      shared_storage.problem_visitor,
+      blockIdx.x);
+
+    // Outer 'persistent' loop to iterate over tiles
+    while (problem_visitor.next_tile()) {
+
+      GemmCoord problem_size  = problem_visitor.problem_size();
+      int32_t problem_idx     = problem_visitor.problem_index();
+      int32_t threadblock_idx = int32_t(problem_visitor.threadblock_idx());
+
+      GemmCoord grid_shape = problem_visitor.grid_shape(problem_size);
+
+      cutlass::gemm::GemmCoord threadblock_tile_offset = problem_visitor.threadblock_offset(threadblock_idx);
+
+      //
+      // Perform checks to determine whether the results of this threadblock will be needed.
+      // An example of an unneeded threadblock is one that is assigned to compute in the upper
+      // portion of a Rank2K kernel filled with mode kLower.
+      //
+      // TODO: Consider pushing these checks into ProblemVisitor to avoid spuriously
+      // returning from `next_tile()`.
+      //
+
+      // Early exit if threadblock is out of range
+      if (grid_shape.m() <= threadblock_tile_offset.m() ||
+          grid_shape.n() <= threadblock_tile_offset.n()) {
+        // Next tile
+        problem_visitor.advance(gridDim.x);
+        continue;
+      }
+
+      // Skip this tile if Fill Mode is Lower and
+      // if the entire tile is above the main diagonal (bottom-left corner is at or above the diagonal)
+      if (kInternalFillModeC == cutlass::FillMode::kLower &&
+          (threadblock_tile_offset.m() + 1) * Mma1::Shape::kM <= threadblock_tile_offset.n() * Mma1::Shape::kN) {
+        // Next tile
+        problem_visitor.advance(gridDim.x);
+        continue;
+      }
+
+      // Skip this tile if Fill Mode is Upper and
+      // if the entire tile is below the main diagonal (top-right corner is at or below the diagonal)
+      if (kInternalFillModeC == cutlass::FillMode::kUpper &&
+          threadblock_tile_offset.m() * Mma1::Shape::kM >= (threadblock_tile_offset.n() + 1) * Mma1::Shape::kN) {
+        // Next tile
+        problem_visitor.advance(gridDim.x);
+        continue;
+      }
+
+      bool tile_on_diagonal = false;
+      // Mark tiles that are being crossed by the main diagonal
+      // (top-right and bottom-left corners are on either side of the diagonal)
+      if ((threadblock_tile_offset.m() + 1) * Mma1::Shape::kM > threadblock_tile_offset.n() * Mma1::Shape::kN
+          && threadblock_tile_offset.m() * Mma1::Shape::kM < (threadblock_tile_offset.n() + 1) * Mma1::Shape::kN) {
+        tile_on_diagonal = true;
+      }
+
+      int offset_k = 0;
+      int problem_size_k = problem_size.k();
+
+      //
+      // Fetch pointers based on mode.
+      //
+      if (params.mode == GemmUniversalMode::kGemm ||
+          params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+
+        if (threadblock_tile_offset.k() + 1 < grid_shape.k()) {
+          problem_size_k = (threadblock_tile_offset.k() + 1) * problem_size.k();
+        }
+
+        offset_k = threadblock_tile_offset.k() * problem_size.k();
+      }
+
+      ElementA *ptr_A = reinterpret_cast<ElementA *>((kTransposed ? params.ptr_B[problem_idx] : params.ptr_A[problem_idx]));
+      typename LayoutA::Stride::LongIndex ldm_A = (kTransposed ? params.ldb[problem_idx] : params.lda[problem_idx]);
+
+      ElementB *ptr_B = reinterpret_cast<ElementB *>((kTransposed ? params.ptr_A[problem_idx] : params.ptr_B[problem_idx]));
+      typename LayoutB::Stride::LongIndex ldm_B = (kTransposed ? params.lda[problem_idx] : params.ldb[problem_idx]);
+
+      // Compute initial location in logical coordinates
+      cutlass::MatrixCoord tb_offset_MxK{
+        threadblock_tile_offset.m() * Mma1::Shape::kM,
+        offset_k,
+      };
+
+      cutlass::MatrixCoord tb_offset_KxN{
+        offset_k,
+        threadblock_tile_offset.n() * Mma1::Shape::kN
+      };
+
+      // Assume identity swizzle
+      MatrixCoord tb_offset(
+        threadblock_tile_offset.m() * Mma1::Shape::kM,
+        threadblock_tile_offset.n() * Mma1::Shape::kN
+      );
+
+      // Compute position within threadblock
+      int thread_idx = threadIdx.x;
+
+      // Construct iterators to A and B operands for Mma1
+      typename Mma1::IteratorA iterator_A(
+        Mma1::IteratorA::Params(ldm_A),
+        ptr_A,
+        {problem_size.m(), problem_size_k},
+        thread_idx,
+        tb_offset_MxK);
+
+      typename Mma1::IteratorB iterator_BT(
+        Mma1::IteratorB::Params(ldm_B),
+        ptr_B,
+        {problem_size_k, problem_size.n()},
+        thread_idx,
+        tb_offset_KxN);
+
+      // Construct iterators to A and B operands for Mma2
+      typename Mma2::IteratorA iterator_B(
+        Mma2::IteratorA::Params(ldm_B),
+        ptr_B,
+        {problem_size.m(), problem_size_k},
+        thread_idx,
+        tb_offset_MxK);
+
+      typename Mma2::IteratorB iterator_AT(
+        Mma2::IteratorB::Params(ldm_A),
+        ptr_A,
+        {problem_size_k, problem_size.n()},
+        thread_idx,
+        tb_offset_KxN);
+
+      // Broadcast the warp_id computed by lane 0 to ensure dependent code
+      // is compiled as warp-uniform.
+      int warp_idx = canonical_warp_idx_sync();
+
+      int lane_idx = threadIdx.x % 32;
+
+      //
+      // Main loop
+      //
+
+      // Construct thread-scoped matrix multiply for Mma1 (A x BT)
+      Mma1 mma1(shared_storage.kernel.mma1_main_loop, thread_idx, warp_idx, lane_idx);
+
+      // Construct thread-scoped matrix multiply for Mma2 (B x AT)
+      Mma2 mma2(shared_storage.kernel.mma2_main_loop, thread_idx, warp_idx, lane_idx);
+
+      typename Mma1::FragmentC accumulators;
+
+      accumulators.clear();
+
+      // Compute threadblock-scoped matrix multiply-add
+      int gemm_k_iterations = (problem_size_k - offset_k + Mma1::Shape::kK - 1) / Mma1::Shape::kK;
+
+      // Wait for all threads to finish their epilogue phases from the previous tile.
+      __syncthreads();
+
+      // Compute threadblock-scoped matrix multiply-add (A x BT)
+      mma1(
+        gemm_k_iterations,
+        accumulators,
+        iterator_A,
+        iterator_BT,
+        accumulators);
+
+      // HER2K kernel needs Alpha to be complex, and conj(Alpha) is applied to the second HERK.
+      if (kBlasMode == BlasMode::kHermitian) {
+
+        //
+        // Epilogue
+        //
+
+        EpilogueOutputOp output_op(params.output_op);
+
+        ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C[problem_idx]);
+        ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D[problem_idx]);
+
+        // If TB not on diagonal, FillMode doesn't apply.
+        FillMode kFillModeTB = tile_on_diagonal ? kInternalFillModeC : FillMode::kNone;
+
+        // Tile iterator loading from source tensor.
+        typename Epilogue::OutputTileIterator iterator_C(
+          Epilogue::OutputTileIterator::Params(params.ldc[problem_idx]),
+          ptr_C,
+          problem_size.mn(),
+          thread_idx,
+          tb_offset,
+          kFillModeTB
+        );
+
+        // Tile iterator writing to destination tensor.
+        typename Epilogue::OutputTileIterator iterator_D(
+          Epilogue::OutputTileIterator::Params(params.ldd[problem_idx]),
+          ptr_D,
+          problem_size.mn(),
+          thread_idx,
+          tb_offset,
+          kFillModeTB
+        );
+
+        Epilogue epilogue(
+          shared_storage.kernel.epilogue,
+          thread_idx,
+          warp_idx,
+          lane_idx);
+
+        // Execute the epilogue operator to update the destination tensor.
+        epilogue(
+          output_op,
+          iterator_D,
+          accumulators,
+          iterator_C);
+
+        __syncthreads();
+
+        accumulators.clear();
+      }
+
+      // Compute threadblock-scoped matrix multiply-add (B x AT)
+      mma2(
+        gemm_k_iterations,
+        accumulators,
+        iterator_B,
+        iterator_AT,
+        accumulators);
+
+      //
+      // Epilogue
+      //
+
+      EpilogueOutputOp output_op(params.output_op);
+
+      /* Needed for HER2K where the second HERK is multiplied by conj(alpha) */
+      typename EpilogueOutputOp::Params second_her2k_params(conj(params.output_op.alpha), 1);
+      EpilogueOutputOp output_op_her2k(second_her2k_params);
+
+      //
+      // Masked tile iterators constructed from members
+      //
+
+      ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C[problem_idx]);
+      typename LayoutC::Stride::LongIndex ldm_C = params.ldc[problem_idx];
+
+      // HER2K: the second HERK accumulates onto the D written above, so D (with ldd) becomes the source.
+      if (kBlasMode == BlasMode::kHermitian) {
+        ptr_C = static_cast<ElementC *>(params.ptr_D[problem_idx]);
+        ldm_C = params.ldd[problem_idx];
+      }
+
+      ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D[problem_idx]);
+
+      // If TB not on diagonal, FillMode doesn't apply.
+      FillMode kFillModeTB = tile_on_diagonal ? kInternalFillModeC : FillMode::kNone;
+
+      // Tile iterator loading from source tensor.
+      typename Epilogue::OutputTileIterator iterator_C(
+        Epilogue::OutputTileIterator::Params(ldm_C),
+        ptr_C,
+        problem_size.mn(),
+        thread_idx,
+        tb_offset,
+        kFillModeTB
+      );
+
+      // Tile iterator writing to destination tensor.
+      typename Epilogue::OutputTileIterator iterator_D(
+        Epilogue::OutputTileIterator::Params(params.ldd[problem_idx]),
+        ptr_D,
+        problem_size.mn(),
+        thread_idx,
+        tb_offset,
+        kFillModeTB
+      );
+
+      Epilogue epilogue(
+        shared_storage.kernel.epilogue,
+        thread_idx,
+        warp_idx,
+        lane_idx);
+
+      // Execute the epilogue operator to update the destination tensor.
+      if (kBlasMode == BlasMode::kSymmetric) {
+        epilogue(
+          output_op,
+          iterator_D,
+          accumulators,
+          iterator_C);
+      } else {
+        epilogue(
+          output_op_her2k,
+          iterator_D,
+          accumulators,
+          iterator_C);
+      }
+
+      // Next tile
+      problem_visitor.advance(gridDim.x);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_2k_grouped_problem_visitor.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_2k_grouped_problem_visitor.h
new file mode 100755
index 000000000..2e31c7783
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_2k_grouped_problem_visitor.h
@@ -0,0 +1,376 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Problem visitor for grouped Rank2K operations.
+
+    This problem visitor is specialized for Rank2K operations, for which matrix C is upper/lower
+    triangular. Using a problem visitor designed for GEMMs for Rank2K problems is inefficient
+    because threadblocks will be frequently assigned to tiles that exit early (e.g., due to
+    being assigned to a tile in the upper-triangular portion of a lower-triangular problem).
+    This can lead to load imbalance among threadblocks, as the GEMM-based scheduler
+    assigns all threadblocks to nearly the same number of tiles, regardless of whether
+    those tiles exit early.
+
+    Consider an example of a group of four Rank2Ks with matrix C consisting of a grid of 2x2 tiles.
+    Consider a grid of 8 threadblocks. The default GEMM scheduler will assign threadblocks to
+    tiles in the following order:
+        Rank2K 0      Rank2K 1       Rank2K 2      Rank2K 3
+          0  1          4  5           0  1          4  5
+          2  3          6  7           2  3          6  7
+    Assuming that the problems are lower triangular, blocks 1 and 5 are continuously assigned
+    to inactive tiles.
+
+    This problem visitor aims to assign threadblocks to only those tiles which are in the
+    upper/lower triangular portion of a given problem. Using the example above, the resulting
+    assignment would be:
+        Rank2K 0      Rank2K 1       Rank2K 2      Rank2K 3
+          0  -          3  -           6  -          1  -
+          1  2          4  5           7  0          2  3
+
+    Achieving the schedule above requires a mapping from threadblock ID to tile coordinates (i, j).
+    We will illustrate this by mapping on a lower-triangular matrix with a 3x3 grid. We first
+    calculate row and column indices assuming one-indexed rows, tiles, and threadblock IDs, and
+    then subtract one to convert to zero-indexed.
+                      Col 1   Col 2   Col 3
+                     ----------------------
+              Row 1 |   1      -       -
+              Row 2 |   2      3       -
+              Row 3 |   4      5       6
+
+    We next outline this mapping, borrowing from: https://stackoverflow.com/a/40954159
+
+    Calculating row i given threadblock ID t
+    ----------------------------------------
+    For a given row i, all threadblock IDs t in that row satisfy the following:
+          t <= 1 + 2 + 3 + ... + (i-1) + i
+
+    The closed-form equation for the right-hand side is: i(i+1)/2.
+    Using this, we can solve for i given t:
+          t  <= i(i+1)/2
+          2t <= i^2 + i
+          2t <= i^2 + i + 0.25 - 0.25
+          2t + 0.25 <= i^2 + i + 0.25
+          2t + 0.25 <= (i + 0.5)^2
+          sqrt(2t + 0.25) - 0.5 <= i
+
+    To account for fractional values, we set:
+          i = ceil(sqrt(2t + 0.25) - 0.5)
+
+    To turn this into a zero-indexed row and work with zero-indexed t, we perform:
+          i = ceil(sqrt(2(t+1) + 0.25) - 0.5) - 1
+            = ceil(sqrt(2t + 2.25) - 0.5) - 1
+
+    Calculating column j given threadblock ID t and row i
+    -----------------------------------------------------
+    For a given row i, all threadblock IDs t in that row also satisfy the following:
+          t > 1 + 2 + 3 + ... + (i-2) + (i-1)
+      --> t > i(i-1)/2
+
+    Threadblock IDs within a given row are sequential, so the one-indexed column ID
+    for one-indexed threadblock ID t and row i is:
+          j = t - (i(i-1)/2)
+
+    The zero-indexed version becomes:
+          j = (t+1) - (i(i+1)/2) -1
+            = t - (i(i+1)/2)
+
+    Accounting for non-square grids
+    -------------------------------
+    Though the overall output problem size for Rank2K problems is guaranteed to be square, the
+    grids used in computing may not be square due to using non-square threadblock shapes. For
+    example, a threadblock shape of 64x32 operating on a problem of output size 128x128 would
+    result in a grid of 2x4 tiles.
+
+    This case can be handled by noting that the output resembles a square grid of 2x2 "macro tiles"
+    each of which contains 2 "true tiles." We can thus first map a threadblock ID to its "macro tile"
+    using the equations above, and then map it to the "true tile" within its "macro tile." In the example
+    of a 2x4 grid, this mapping would look as follows:
+        "Macro grid"           "True grid"
+       {0, 1}    -            0   1   -   -
+       {2, 3}  {4, 5}         2   3   4   5
+
+    A zero-indexed threadblock ID t is mapped to its "macro tile ID" t_macro as:
+      t_macro = t // r
+    Where r is the ratio of the maximum dimension of the grid to the minimum dimension of the grid
+    (i.e., r = 4 / 2 = 2 in the previous example).
+
+    One uses t_macro and the calculations above to find the row and column in the square matrix to
+    obtain i_macro and j_macro (zero-indexed). The mapping from (i_macro, j_macro) --> (i, j)
+    is simply the following:
+        if (ThreadblockShape::M > ThreadblockShape::N):
+            r = ThreadblockShape::M / ThreadblockShape::N
+            i = i_macro
+            j = (j_macro * r) + (t % r)
+        elif (ThreadblockShape::M < ThreadblockShape::N):
+            r = ThreadblockShape::N / ThreadblockShape::M
+            i = (i_macro * r) + (t % r)
+            j = j_macro
+        else:
+            i = i_macro
+            j = j_macro
+
+    Handling cases with grid dimensions that aren't multiples of each other
+    -----------------------------------------------------------------------
+    Even though threadblock shapes M and N are typically multiples of one another, the grid
+    for a given problem may not have dimensions of the same ratio as that of the threadblock.
+    For example, a problem of size 132x132 using a threadblock of shape 64x32 will result
+    in a grid of 3x5 tiles. In this case, there is not an integer number of "true tiles"
+    per "macro tile."
+
+    When this scenario arises, we simply pad the larger dimension of the grid such that
+    there are an integer number of "true tiles" per "macro tile." Thus, the 3x5 grid in
+    the example above will be treated as a 3x6 grid. Row and column positions for each
+    tile are calculated as above. Any threadblocks that map to tiles that are outside the
+    problem range or upper/lower triangular portion (e.g., (2, 5)) will exit early from
+    this problem and may proceed to the next problem in the group.
+
+    Handling upper-triangular matrices
+    ----------------------------------
+    The only modification needed for upper-triangular matrices is to swap i_macro and j_macro
+    in the calculations above.
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+
+#include "cutlass/gemm/kernel/grouped_problem_visitor.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+namespace detail {
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Helpers for calculating offsets for Rank2K problem visitor. These helpers specifically pertain
+// to the conversion from "macro tiles" to "true tiles" in the description above.
+//
+template <
+  typename ThreadblockShape,
+  typename Enable = void
+>
+struct Rank2KGroupedProblemVisitorOffsetHelper;
+
+// Specialization selected when the threadblock tile is taller than it is wide (kM > kN).
+// Each square "macro tile" then holds kThreadblockSkewRatio true tiles spanning consecutive columns.
+template <typename ThreadblockShape>
+struct Rank2KGroupedProblemVisitorOffsetHelper<
+    ThreadblockShape,
+    typename platform::enable_if< (ThreadblockShape::kM > ThreadblockShape::kN) >::type
+> {
+  static_assert(ThreadblockShape::kM % ThreadblockShape::kN == 0,
+             "Rank2KGroupedProblemVisitor with threadblock shape M > threadblock shape N "
+             "requires that threadblock shape M be a multiple of threadblock shape N.");
+
+  /// Number of true tiles contained in one macro tile.
+  static int32_t const kThreadblockSkewRatio = ThreadblockShape::kM / ThreadblockShape::kN;
+
+  /// Side length of the square macro grid: the tile grid's row count.
+  CUTLASS_HOST_DEVICE
+  static int32_t min_dim(cutlass::gemm::GemmCoord grid) { return grid.m(); }
+
+  /// Rows are not subdivided, so the macro row is returned unchanged.
+  CUTLASS_HOST_DEVICE
+  static int32_t macro_row_to_row(int32_t row, int32_t threadblock_id) { return row; }
+
+  /// Expands a macro column to the true column picked by the threadblock's slot in the macro tile.
+  CUTLASS_HOST_DEVICE
+  static int32_t macro_col_to_col(int32_t col, int32_t threadblock_id) {
+    int32_t const slot = threadblock_id % kThreadblockSkewRatio;
+    return (col * kThreadblockSkewRatio) + slot;
+  }
+};
+
+// Specialization selected when the threadblock tile is wider than it is tall (kM < kN).
+// Each square "macro tile" then holds kThreadblockSkewRatio true tiles spanning consecutive rows.
+template <typename ThreadblockShape>
+struct Rank2KGroupedProblemVisitorOffsetHelper<
+    ThreadblockShape,
+    typename platform::enable_if< (ThreadblockShape::kM < ThreadblockShape::kN) >::type
+> {
+
+  static_assert(ThreadblockShape::kN % ThreadblockShape::kM == 0,
+             "Rank2KGroupedProblemVisitor with threadblock shape M < threadblock shape N "
+             "requires that threadblock shape N be a multiple of threadblock shape M.");
+
+  /// Number of true tiles contained in one macro tile.
+  static int32_t const kThreadblockSkewRatio = ThreadblockShape::kN / ThreadblockShape::kM;
+
+  /// Side length of the square macro grid: the tile grid's column count.
+  CUTLASS_HOST_DEVICE
+  static int32_t min_dim(cutlass::gemm::GemmCoord grid) { return grid.n(); }
+
+  /// Expands a macro row to the true row picked by the threadblock's slot in the macro tile.
+  CUTLASS_HOST_DEVICE
+  static int32_t macro_row_to_row(int32_t row, int32_t threadblock_id) {
+    int32_t const slot = threadblock_id % kThreadblockSkewRatio;
+    return (row * kThreadblockSkewRatio) + slot;
+  }
+
+  /// Columns are not subdivided, so the macro column is returned unchanged.
+  CUTLASS_HOST_DEVICE
+  static int32_t macro_col_to_col(int32_t col, int32_t threadblock_id) { return col; }
+};
+
+// Specialization selected for square threadblock tiles (kM == kN).
+// Macro tiles and true tiles coincide here, so both coordinate conversions
+// below return their input unchanged.
+template <typename ThreadblockShape>
+struct Rank2KGroupedProblemVisitorOffsetHelper<
+    ThreadblockShape,
+    typename platform::enable_if< (ThreadblockShape::kM == ThreadblockShape::kN) >::type
+> {
+
+  /// Exactly one true tile per macro tile, so tile IDs and macro-tile IDs
+  /// are the same thing for this specialization.
+  static int32_t const kThreadblockSkewRatio = 1;
+
+  /// Side length of the square macro grid: the tile grid's row count.
+  CUTLASS_HOST_DEVICE
+  static int32_t min_dim(cutlass::gemm::GemmCoord grid) { return grid.m(); }
+
+  /// Identity conversion: the macro row is the true row.
+  /// `threadblock_id` is accepted for interface parity and is not read.
+  CUTLASS_HOST_DEVICE
+  static int32_t macro_row_to_row(int32_t row, int32_t threadblock_id) { return row; }
+
+  /// Identity conversion: the macro column is the true column.
+  /// `threadblock_id` is accepted for interface parity and is not read.
+  CUTLASS_HOST_DEVICE
+  static int32_t macro_col_to_col(int32_t col, int32_t threadblock_id) { return col; }
+};
+
+
+// Problem-size helper for grouped Rank2K kernels: derives the tile grid of a problem and the
+// number of tiles in that grid that the scheduler should hand out.
+template <typename ThreadblockShape>
+struct Rank2KGroupedProblemSizeHelper {
+  using OffsetHelper = Rank2KGroupedProblemVisitorOffsetHelper<ThreadblockShape>;
+
+  /// Tile-grid extents for `problem`: m / kM and n / kN, each rounded up, with k fixed at 1.
+  CUTLASS_HOST_DEVICE
+  static cutlass::gemm::GemmCoord grid_shape(const cutlass::gemm::GemmCoord& problem) {
+    int32_t const tiles_m = (problem.m() - 1 + ThreadblockShape::kM) / ThreadblockShape::kM;
+    int32_t const tiles_n = (problem.n() - 1 + ThreadblockShape::kN) / ThreadblockShape::kN;
+    return cutlass::gemm::GemmCoord(tiles_m, tiles_n, 1);
+  }
+
+  /// Number of tiles at or below the diagonal (at or above it for mode kUpper).
+  ///
+  /// The count is first taken for a square `dim x dim` grid, where `dim` is
+  /// OffsetHelper::min_dim(grid), and is then scaled by OffsetHelper::kThreadblockSkewRatio
+  /// to account for grids with more tiles in one dimension than the other.
+  CUTLASS_HOST_DEVICE
+  static int32_t tile_count(const cutlass::gemm::GemmCoord& grid) {
+    int32_t const dim = OffsetHelper::min_dim(grid);
+    int32_t const strictly_below = (dim * (dim - 1)) / 2;
+    return (dim + strictly_below) * OffsetHelper::kThreadblockSkewRatio;
+  }
+
+  /// No-op: the problem size is left unmodified.
+  CUTLASS_HOST_DEVICE
+  static void possibly_transpose_problem(cutlass::gemm::GemmCoord& problem) {}
+};
+
+} // namespace detail
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Default problem visitor for fill modes kUpper and kLower.
+//
+template <typename ThreadblockShape,
+          GroupScheduleMode GroupScheduleMode_,
+          int PrefetchTileCount,
+          int ThreadCount,
+          cutlass::FillMode FillModeC>
+struct Rank2KGroupedProblemVisitor : public GroupedProblemVisitor<
+                                              detail::Rank2KGroupedProblemSizeHelper<ThreadblockShape>,
+                                              ThreadblockShape,
+                                              GroupScheduleMode_,
+                                              PrefetchTileCount,
+                                              ThreadCount> {
+  // Triangle of C whose tiles are visited; limited to kLower/kUpper by the static_assert below.
+  static cutlass::FillMode const kFillModeC = FillModeC;
+
+  static_assert(kFillModeC == cutlass::FillMode::kLower || kFillModeC == cutlass::FillMode::kUpper,
+              "Default Rank2KGroupedProblemVisitor requires fill mode of kLower or kUpper.");
+
+  using ProblemSizeHelper = detail::Rank2KGroupedProblemSizeHelper<ThreadblockShape>;
+  using Base = GroupedProblemVisitor<ProblemSizeHelper,
+                                     ThreadblockShape,
+                                     GroupScheduleMode_,
+                                     PrefetchTileCount,
+                                     ThreadCount>;
+  using OffsetHelper = typename ProblemSizeHelper::OffsetHelper;
+  using Params = typename Base::Params;
+  using SharedStorage = typename Base::SharedStorage;
+
+  //
+  // Methods
+  //
+  CUTLASS_DEVICE
+  Rank2KGroupedProblemVisitor(  // Forwards all three arguments to Base unchanged.
+    Params const &params_,
+    SharedStorage &shared_storage_,
+    int32_t block_idx
+  ): Base(params_, shared_storage_, block_idx)
+  {}
+  /// Maps a zero-indexed threadblock ID to (row, col, 0) tile coordinates inside the triangle.
+  CUTLASS_DEVICE
+  cutlass::gemm::GemmCoord threadblock_offset(int32_t threadblock_id) const {
+    int32_t macro_id = threadblock_id / OffsetHelper::kThreadblockSkewRatio;  // t_macro = t // r
+    int32_t macro_row = ceil(cutlass::fast_sqrt((2*macro_id) + 2.25) - 0.5) - 1;  // i = ceil(sqrt(2t + 2.25) - 0.5) - 1
+    int32_t macro_col = macro_id - (((macro_row+1) * macro_row)/2);  // j = t - i(i+1)/2
+
+    if (kFillModeC == cutlass::FillMode::kUpper) {  // upper-triangular: exchange row and column
+      swap(macro_row, macro_col);
+    }
+    // Convert macro-tile coordinates to true-tile coordinates (identity when kM == kN).
+    int32_t row = OffsetHelper::macro_row_to_row(macro_row, threadblock_id);
+    int32_t col = OffsetHelper::macro_col_to_col(macro_col, threadblock_id);
+
+    return cutlass::gemm::GemmCoord(row, col, 0);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_2k_transpose_operands.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_2k_transpose_operands.h
new file mode 100755
index 000000000..11b2a915a
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_2k_transpose_operands.h
@@ -0,0 +1,129 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*!
+  \file
+  \brief Transpositions for Rank2K problems.
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename ElementA_,
+  typename LayoutA_,
+  ComplexTransform TransformA,
+  int AlignmentA,
+  typename ElementB_,
+  typename LayoutB_,
+  ComplexTransform TransformB,
+  int AlignmentB,
+  typename LayoutC_,
+  FillMode FillModeC_,
+  bool Transpose  // `true` is handled by the partial specialization; this primary covers the rest
+>
+struct Rank2KMapArguments {  // Pass-through mapping: each member mirrors its template argument.
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  static ComplexTransform const kTransformA = TransformA;
+  static int const kAlignmentA = AlignmentA;
+  using ElementB = ElementB_;
+  using LayoutB = LayoutB_;
+  static ComplexTransform const kTransformB = TransformB;
+  static int const kAlignmentB = AlignmentB;
+  using LayoutC = LayoutC_;
+  static FillMode const kFillModeC = FillModeC_;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename ElementA_,
+  typename LayoutA_,
+  ComplexTransform TransformA,
+  int AlignmentA,
+  typename ElementB_,
+  typename LayoutB_,
+  ComplexTransform TransformB,
+  int AlignmentB,
+  typename LayoutC_,
+  FillMode FillModeC_
+>
+struct Rank2KMapArguments<  // Specialization for Transpose == true: A and B trade places.
+  ElementA_,
+  LayoutA_,
+  TransformA,
+  AlignmentA,
+  ElementB_,
+  LayoutB_,
+  TransformB,
+  AlignmentB,
+  LayoutC_,
+  FillModeC_,
+  true
+> {
+  using ElementA = ElementB_;                              // A-side members take the B arguments
+  using LayoutA = LayoutB_;
+  static ComplexTransform const kTransformA = TransformB;
+  static int const kAlignmentA = AlignmentB;
+  using ElementB = ElementA_;                              // B-side members take the A arguments
+  using LayoutB = LayoutA_;
+  static ComplexTransform const kTransformB = TransformA;
+  static int const kAlignmentB = AlignmentA;
+  using LayoutC = typename layout::LayoutTranspose<LayoutC_>::type;     // C layout via LayoutTranspose
+  static FillMode const kFillModeC = InvertFillMode<FillModeC_>::mode;  // presumably kLower <-> kUpper; confirm in blas3.h
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}
+}
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_2k_universal.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_2k_universal.h
new file mode 100755
index 000000000..bd7ffb0e3
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_2k_universal.h
@@ -0,0 +1,769 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
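+/*! \file
+    \brief Kernel-level Rank2K update built from two threadblock-scoped MMAs (A*B^T and B*A^T),
+           restricted to the kLower or kUpper fill mode of C.
+*/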
+
+#pragma once
+
+#include "cutlass/blas3.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/complex.h"
+#include "cutlass/semaphore.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Mma1_,                 ///! Threadblock-scoped matrix multiply-accumulate (A*B^T)
+  typename Mma2_,                 ///! Threadblock-scoped matrix multiply-accumulate (B*A^T)
+  typename Epilogue_,             ///! Epilogue
+  typename ThreadblockSwizzle_,   ///! Threadblock swizzling function
+  FillMode FillModeC_,            ///! Fill Mode for C (kLower or kUpper)
+  BlasMode BlasMode_              ///! Blas3 computation mode
+>
+struct Rank2KUniversal {
+public:
+
+  using Mma1 = Mma1_;
+  using Mma2 = Mma2_;
+  using Epilogue = Epilogue_;
+  using EpilogueOutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+
+  using ElementA = typename Mma1::IteratorA::Element;
+  using ElementB = typename Mma1::IteratorB::Element;
+
+  // Mma1 (A x B^T)
+  using LayoutA = typename Mma1::IteratorA::Layout;
+  using LayoutBT = typename Mma1::IteratorB::Layout;
+  static ComplexTransform const kMma1TransformA = Mma1::kTransformA;
+  static ComplexTransform const kMma1TransformB = Mma1::kTransformB;
+
+  // Mma2 (B x A^T)
+  using LayoutB = typename Mma2::IteratorA::Layout;
+  using LayoutAT = typename Mma2::IteratorB::Layout;
+  static ComplexTransform const kMma2TransformA = Mma2::kTransformA;
+  static ComplexTransform const kMma2TransformB = Mma2::kTransformB;
+
+  // Common type definitions for Mma1 and Mma2
+  using Operator = typename Mma1::Operator;
+  using OperatorClass = typename Mma1::Operator::OperatorClass;
+  using ThreadblockShape = typename Mma1::Shape;
+  using WarpShape = typename Mma1::Operator::Shape;
+  using InstructionShape = typename Mma1::Policy::Operator::InstructionShape;
+  using ArchTag = typename Mma1::ArchTag;
+
+  static int const kStages = Mma1::kStages;
+  static int const kAlignmentA = Mma1::IteratorA::AccessType::kElements;
+  static int const kAlignmentB = Mma1::IteratorB::AccessType::kElements;
+
+  // Output related typedefinitions
+  using ElementC = typename Epilogue::OutputTileIterator::Element;
+  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
+  static FillMode const kFillModeC = FillModeC_;
+  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+  static BlasMode const kBlasMode = BlasMode_;
+
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma1::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;
+
+
+  //
+  // Structures
+  //
+
+  /// Argument structure: host-side description of one Rank2K launch (operands, strides, epilogue).
+  struct Arguments {
+
+    //
+    // Data members
+    //
+
+    GemmUniversalMode mode = cutlass::gemm::GemmUniversalMode::kGemm;
+    GemmCoord problem_size {};
+    int batch_count{1};
+
+    typename EpilogueOutputOp::Params epilogue{};
+
+    void const * ptr_A = nullptr;
+    void const * ptr_B = nullptr;
+    void const * ptr_C = nullptr;
+    void * ptr_D = nullptr;
+
+    int64_t batch_stride_A {0};
+    int64_t batch_stride_B {0};
+    int64_t batch_stride_C {0};
+    int64_t batch_stride_D {0};
+
+    typename LayoutA::Stride::Index lda{0};
+    typename LayoutB::Stride::Index ldb{0};
+    typename LayoutC::Stride::Index ldc{0};
+    typename LayoutC::Stride::Index ldd{0};
+
+    bool allow_early_exit{false};  // when true, the kernel returns at once if alpha == 0 and beta == 1
+
+    //
+    // Methods
+    //
+
+    Arguments() = default;
+
+    /// Constructs an arguments structure; each parameter initializes the member of the same name.
+    Arguments(
+      GemmUniversalMode mode,
+      GemmCoord problem_size,
+      int batch_count,
+      typename EpilogueOutputOp::Params epilogue,
+      void const * ptr_A,
+      void const * ptr_B,
+      void const * ptr_C,
+      void * ptr_D,
+      int64_t batch_stride_A,
+      int64_t batch_stride_B,
+      int64_t batch_stride_C,
+      int64_t batch_stride_D,
+      typename LayoutA::Stride::Index lda,
+      typename LayoutB::Stride::Index ldb,
+      typename LayoutC::Stride::Index ldc,
+      typename LayoutC::Stride::Index ldd,
+      bool allow_early_exit = false
+    ):
+      mode(mode),
+      problem_size(problem_size),
+      batch_count(batch_count),
+      epilogue(epilogue),
+      ptr_A(ptr_A), ptr_B(ptr_B), ptr_C(ptr_C), ptr_D(ptr_D),
+      batch_stride_A(batch_stride_A), batch_stride_B(batch_stride_B),
+      batch_stride_C(batch_stride_C), batch_stride_D(batch_stride_D),
+      lda(lda), ldb(ldb), ldc(ldc), ldd(ldd),
+      allow_early_exit(allow_early_exit) {
+        // batch_stride_B was previously hard-wired to 0 here, discarding the caller's value.
+      }
+
+      /// Returns a copy with A and B exchanged (pointers, leading dimensions, and batch strides).
+      Arguments transposed_problem() const {
+        Arguments args(*this);
+
+        std::swap(args.ptr_A, args.ptr_B);
+        std::swap(args.lda, args.ldb);
+        std::swap(args.batch_stride_A, args.batch_stride_B);
+
+        return args;
+      }
+
+  };
+
+  //
+  // Structure for precomputing values in host memory and passing to kernels
+  //
+
+  /// Parameters structure
+  struct Params {
+
+    cutlass::gemm::GemmCoord problem_size{};
+    cutlass::gemm::GemmCoord grid_tiled_shape{};
+    int swizzle_log_tile{0};  // the constructor sets this from ThreadblockSwizzle().get_log_tile()
+
+    // Mma1 Iterator A and B params
+    typename Mma1::IteratorA::Params params_A{};   // constructed from args.lda
+    typename Mma1::IteratorB::Params params_BT{};  // constructed from args.ldb
+
+    // Mma2 Iterator A and B params
+    typename Mma2::IteratorA::Params params_B{};   // constructed from args.ldb
+    typename Mma2::IteratorB::Params params_AT{};  // constructed from args.lda
+
+    typename Epilogue::OutputTileIterator::Params params_C{};
+    typename Epilogue::OutputTileIterator::Params params_D{};
+
+    typename EpilogueOutputOp::Params output_op{};
+
+    GemmUniversalMode mode = cutlass::gemm::GemmUniversalMode::kGemm;
+    int batch_count{0};
+    int gemm_k_size{0};
+
+    void * ptr_A = nullptr;
+    void * ptr_B = nullptr;
+    void * ptr_C = nullptr;
+    void * ptr_D = nullptr;
+
+    int64_t batch_stride_A{0};
+    int64_t batch_stride_B{0};
+    int64_t batch_stride_C{0};
+    int64_t batch_stride_D{0};
+
+    int *semaphore = nullptr;  // the workspace pointer reinterpreted as int*
+
+    bool allow_early_exit {false};
+
+    //
+    // Methods
+    //
+
+    Params() = default;
+    /// Builds the parameters from `args`; constness is cast away from the A/B/C/D pointers.
+    CUTLASS_HOST_DEVICE
+    Params(
+      Arguments const &args,
+      cutlass::gemm::GemmCoord const & grid_tiled_shape,
+      int gemm_k_size,
+      void *workspace = nullptr
+    ):
+      problem_size(args.problem_size),
+      grid_tiled_shape(grid_tiled_shape),
+      swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)),
+      params_A(args.lda),
+      params_BT(args.ldb),
+      params_B(args.ldb),
+      params_AT(args.lda),
+      params_C(args.ldc),
+      params_D(args.ldd),
+      output_op(args.epilogue),
+      mode(args.mode),
+      batch_count(args.batch_count),
+      gemm_k_size(gemm_k_size),
+      ptr_A(const_cast<void *>(args.ptr_A)),
+      ptr_B(const_cast<void *>(args.ptr_B)),
+      ptr_C(const_cast<void *>(args.ptr_C)),
+      ptr_D(const_cast<void *>(args.ptr_D)),
+      batch_stride_A(args.batch_stride_A),
+      batch_stride_B(args.batch_stride_B),
+      batch_stride_C(args.batch_stride_C),
+      batch_stride_D(args.batch_stride_D),
+      semaphore(static_cast<int *>(workspace)),
+      allow_early_exit(args.allow_early_exit) {
+    }
+
+    CUTLASS_HOST_DEVICE
+    void update(
+      Arguments const &args,
+      void *workspace = nullptr) {
+      // Only the operand pointers, epilogue params, and semaphore change; other fields are kept.
+      ptr_A = const_cast<void *>(args.ptr_A);
+      ptr_B = const_cast<void *>(args.ptr_B);
+      ptr_C = const_cast<void *>(args.ptr_C);
+      ptr_D = args.ptr_D;
+
+      output_op = args.epilogue;
+
+      semaphore = static_cast<int *>(workspace);
+    }
+
+  };
+
+  /// Shared memory storage structure
+  union SharedStorage {  // union: the two main loops and the epilogue overlay the same storage
+    typename Mma1::SharedStorage mma1_main_loop;
+    typename Mma2::SharedStorage mma2_main_loop;
+    typename Epilogue::SharedStorage epilogue;
+  };
+
+public:
+
+  //
+  // Methods
+  //
+
+  CUTLASS_DEVICE
+  Rank2KUniversal() { }  // Device-side default constructor with an empty body.
+
+  /// Determines whether kernel satisfies alignment: each problem extent must be a
+  /// multiple of the access width of every operand that spans it.
+  static Status can_implement(
+    cutlass::gemm::GemmCoord const & problem_size) {
+    static int const kAlignmentA = Mma1::IteratorA::AccessType::kElements;
+    static int const kAlignmentB = Mma1::IteratorB::AccessType::kElements;
+    static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+
+    bool const a_aligned = !(problem_size.m() % kAlignmentA) && !(problem_size.k() % kAlignmentA);
+    bool const b_aligned = !(problem_size.n() % kAlignmentB) && !(problem_size.k() % kAlignmentB);
+    bool const c_aligned = !(problem_size.m() % kAlignmentC) && !(problem_size.n() % kAlignmentC);
+
+    if (a_aligned && b_aligned && c_aligned) {
+      return Status::kSuccess;
+    }
+    return Status::kErrorMisalignedOperand;
+  }
+
+  static Status can_implement(Arguments const &args) {
+    return can_implement(args.problem_size);  // only the extents are checked; pointers and strides are not inspected
+  }
+
+  /// Executes one rank-2k update for the output tile owned by this threadblock.
+  ///
+  /// Two threadblock-scoped products are accumulated: Mma1 (A x BT) followed by Mma2 (B x AT).
+  /// Unless kBlasMode is Hermitian, both share one accumulator and a single epilogue. In
+  /// Hermitian mode the first product is written out with the caller's alpha/beta and the
+  /// second is then added onto that output with conj(alpha) and beta = 1. Tiles lying entirely
+  /// outside the triangle selected by kFillModeC exit early.
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Early exit following LAPACK's definition
+    if (params.allow_early_exit &&
+        (params.output_op.alpha == ElementC(0)) && (params.output_op.beta == ElementC(1))) {
+      return;
+    }
+
+    // Compute threadblock location
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if CTA is out of range
+    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
+      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
+      return;
+    }
+
+    // Early exit if Fill Mode is Lower and
+    // if the entire tile is above the main diagonal (bottom-left corner is at or above the diagonal)
+    if (kFillModeC == cutlass::FillMode::kLower &&
+        (threadblock_tile_offset.m() + 1) * Mma1::Shape::kM <= threadblock_tile_offset.n() * Mma1::Shape::kN) {
+      return;
+    }
+
+    // Early exit if Fill Mode is Upper and
+    // if the entire tile is below the main diagonal (top-right corner is at or below the diagonal)
+    if (kFillModeC == cutlass::FillMode::kUpper &&
+        threadblock_tile_offset.m() * Mma1::Shape::kM >= (threadblock_tile_offset.n() + 1) * Mma1::Shape::kN) {
+      return;
+    }
+
+    bool tile_on_diagonal = false;
+    // Mark tiles that are being crossed by the main diagonal
+    // (top-right and bottom-left corners are on either side of the diagonal)
+    if ((threadblock_tile_offset.m() + 1) * Mma1::Shape::kM > threadblock_tile_offset.n() * Mma1::Shape::kN
+        && threadblock_tile_offset.m() * Mma1::Shape::kM < (threadblock_tile_offset.n() + 1) * Mma1::Shape::kN) {
+      tile_on_diagonal = true;
+    }
+
+    int offset_k = 0;
+    int problem_size_k = params.problem_size.k();
+
+    ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A);
+    ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);
+
+    //
+    // Fetch pointers based on mode.
+    //
+    if (params.mode == GemmUniversalMode::kGemm ||
+      params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+      if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {
+        problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size;
+      }
+      offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
+    }
+    else if (params.mode == GemmUniversalMode::kBatched) {
+      ptr_A += threadblock_tile_offset.k() * params.batch_stride_A;
+      ptr_B += threadblock_tile_offset.k() * params.batch_stride_B;
+    }
+    else if (params.mode == GemmUniversalMode::kArray) {
+      ptr_A = static_cast<ElementA * const *>(params.ptr_A)[threadblock_tile_offset.k()];
+      ptr_B = static_cast<ElementB * const *>(params.ptr_B)[threadblock_tile_offset.k()];
+    }
+
+    __syncthreads();
+
+    // Compute initial location in logical coordinates
+    cutlass::MatrixCoord tb_offset_MxK{
+      threadblock_tile_offset.m() * Mma1::Shape::kM,
+      offset_k,
+    };
+
+    cutlass::MatrixCoord tb_offset_KxN{
+      offset_k,
+      threadblock_tile_offset.n() * Mma1::Shape::kN
+    };
+
+    // Compute position within threadblock
+    int thread_idx = threadIdx.x;
+
+    // Construct iterators to A and B operands for Mma1
+    typename Mma1::IteratorA iterator_A(
+      params.params_A,
+      ptr_A,
+      {params.problem_size.m(), problem_size_k},
+      thread_idx,
+      tb_offset_MxK);
+
+    typename Mma1::IteratorB iterator_BT(
+      params.params_BT,
+      ptr_B,
+      {problem_size_k, params.problem_size.n()},
+      thread_idx,
+      tb_offset_KxN);
+
+    // Construct iterators to A and B operands for Mma2
+    typename Mma2::IteratorA iterator_B(
+      params.params_B,
+      ptr_B,
+      {params.problem_size.m(), problem_size_k},
+      thread_idx,
+      tb_offset_MxK);
+
+    typename Mma2::IteratorB iterator_AT(
+      params.params_AT,
+      ptr_A,
+      {problem_size_k, params.problem_size.n()},
+      thread_idx,
+      tb_offset_KxN);
+
+    // Broadcast the warp_id computed by lane 0 to ensure dependent code
+    // is compiled as warp-uniform.
+    int warp_idx = canonical_warp_idx_sync();
+
+    int lane_idx = threadIdx.x % 32;
+
+    //
+    // Main loop
+    //
+
+    // Construct thread-scoped matrix multiply for Mma1 (A x BT)
+    Mma1 mma1(shared_storage.mma1_main_loop, thread_idx, warp_idx, lane_idx);
+
+    // Construct thread-scoped matrix multiply for Mma2 (B x AT)
+    Mma2 mma2(shared_storage.mma2_main_loop, thread_idx, warp_idx, lane_idx);
+
+    typename Mma1::FragmentC accumulators;
+
+    accumulators.clear();
+
+    // Compute threadblock-scoped matrix multiply-add
+    int gemm_k_iterations = (problem_size_k - offset_k + Mma1::Shape::kK - 1) / Mma1::Shape::kK;
+
+    // Compute threadblock-scoped matrix multiply-add (A x BT)
+    mma1(
+      gemm_k_iterations,
+      accumulators,
+      iterator_A,
+      iterator_BT,
+      accumulators);
+
+    // HER2K kernel needs Alpha to be complex, and conj(Alpha) is applied to the second HERK.
+    if (kBlasMode == BlasMode::kHermitian) {
+
+      //
+      // Epilogue
+      //
+
+      EpilogueOutputOp output_op(params.output_op);
+
+      //
+      // Masked tile iterators constructed from members
+      //
+
+      threadblock_tile_offset =
+          threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+      //assume identity swizzle
+      MatrixCoord threadblock_offset(
+        threadblock_tile_offset.m() * Mma1::Shape::kM,
+        threadblock_tile_offset.n() * Mma1::Shape::kN
+      );
+
+      int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
+
+      ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C);
+      ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
+
+      //
+      // Fetch pointers based on mode.
+      //
+      // Construct the semaphore.
+      Semaphore semaphore(params.semaphore + block_idx, thread_idx);
+
+      if (params.mode == GemmUniversalMode::kGemm) {
+
+        // If performing a reduction via split-K, fetch the initial synchronization
+        if (params.grid_tiled_shape.k() > 1) {
+          // Fetch the synchronization lock initially but do not block.
+          semaphore.fetch();
+
+          // Indicate which position in a serial reduction the output operator is currently updating
+          output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
+        }
+      }
+      else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+        ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
+      }
+      else if (params.mode == GemmUniversalMode::kBatched) {
+        ptr_C += threadblock_tile_offset.k() * params.batch_stride_C;
+        ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
+      }
+      else if (params.mode == GemmUniversalMode::kArray) {
+        ptr_C = static_cast<ElementC * const *>(params.ptr_C)[threadblock_tile_offset.k()];
+        ptr_D = static_cast<ElementC * const *>(params.ptr_D)[threadblock_tile_offset.k()];
+      }
+
+      // If CTA not on diagonal, FillMode doesn't apply.
+      FillMode kFillModeCTA = tile_on_diagonal ? kFillModeC : FillMode::kNone;
+
+      // Tile iterator loading from source tensor.
+      typename Epilogue::OutputTileIterator iterator_C(
+        params.params_C,
+        ptr_C,
+        params.problem_size.mn(),
+        thread_idx,
+        threadblock_offset,
+        kFillModeCTA
+      );
+
+      // Tile iterator writing to destination tensor.
+      typename Epilogue::OutputTileIterator iterator_D(
+        params.params_D,
+        ptr_D,
+        params.problem_size.mn(),
+        thread_idx,
+        threadblock_offset,
+        kFillModeCTA
+      );
+
+      Epilogue epilogue(
+        shared_storage.epilogue,
+        thread_idx,
+        warp_idx,
+        lane_idx);
+
+      // Wait on the semaphore - this latency may have been covered by iterator construction
+      if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
+
+        // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
+        if (threadblock_tile_offset.k()) {
+          iterator_C = iterator_D;
+        }
+
+        semaphore.wait(threadblock_tile_offset.k());
+
+        __threadfence();
+      }
+
+      // Execute the epilogue operator to update the destination tensor.
+      epilogue(
+        output_op,
+        iterator_D,
+        accumulators,
+        iterator_C);
+
+      //
+      // Release the semaphore
+      //
+
+      if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
+        int lock = 0;
+        if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
+          // The final threadblock resets the semaphore for subsequent grids.
+          lock = 0;
+        }
+        else {
+          // Otherwise, the semaphore is incremented
+          lock = threadblock_tile_offset.k() + 1;
+        }
+
+        semaphore.release(lock);
+      }
+
+      __syncthreads();
+
+      accumulators.clear();
+    }
+
+    // Compute threadblock-scoped matrix multiply-add (B x AT)
+    mma2(
+      gemm_k_iterations,
+      accumulators,
+      iterator_B,
+      iterator_AT,
+      accumulators);
+
+    //
+    // Epilogue
+    //
+
+    EpilogueOutputOp output_op(params.output_op);
+
+    /* Needed for HER2K where the second HERK is multiplied by conj(alpha) */
+    typename EpilogueOutputOp::Params second_her2k_params(conj(params.output_op.alpha), 1);
+    EpilogueOutputOp output_op_her2k(second_her2k_params);
+
+    //
+    // Masked tile iterators constructed from members
+    //
+
+    threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    //assume identity swizzle
+    MatrixCoord threadblock_offset(
+      threadblock_tile_offset.m() * Mma1::Shape::kM,
+      threadblock_tile_offset.n() * Mma1::Shape::kN
+    );
+
+    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
+
+    ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C);
+    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
+
+    //
+    // Fetch pointers based on mode.
+    //
+    // Construct the semaphore.
+    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
+
+    if (params.mode == GemmUniversalMode::kGemm) {
+
+      // If performing a reduction via split-K, fetch the initial synchronization
+      if (params.grid_tiled_shape.k() > 1) {
+        // Fetch the synchronization lock initially but do not block.
+        semaphore.fetch();
+
+        // Indicate which position in a serial reduction the output operator is currently updating
+        if (kBlasMode == BlasMode::kSymmetric) {
+          output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
+        } else {
+          output_op_her2k.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
+        }
+      }
+    }
+    else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
+    }
+    else if (params.mode == GemmUniversalMode::kBatched) {
+      ptr_C += threadblock_tile_offset.k() * params.batch_stride_C;
+      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
+    }
+    else if (params.mode == GemmUniversalMode::kArray) {
+      ptr_C = static_cast<ElementC * const *>(params.ptr_C)[threadblock_tile_offset.k()];
+      ptr_D = static_cast<ElementC * const *>(params.ptr_D)[threadblock_tile_offset.k()];
+    }
+
+    // If CTA not on diagonal, FillMode doesn't apply.
+    FillMode kFillModeCTA = tile_on_diagonal ? kFillModeC : FillMode::kNone;
+
+    // Tile iterator loading from source tensor.
+    typename Epilogue::OutputTileIterator iterator_C(
+      params.params_C,
+      ptr_C,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset,
+      kFillModeCTA
+    );
+
+    // Tile iterator writing to destination tensor.
+    typename Epilogue::OutputTileIterator iterator_D(
+      params.params_D,
+      ptr_D,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset,
+      kFillModeCTA
+    );
+
+    // HER2K: the second product is added onto the first product's output, so its source is
+    // this CTA's destination tile (same pointer, batch/array selection and layout as D).
+    if (kBlasMode == BlasMode::kHermitian) {
+      iterator_C = iterator_D;
+    }
+
+    Epilogue epilogue(
+      shared_storage.epilogue,
+      thread_idx,
+      warp_idx,
+      lane_idx);
+
+    // Wait on the semaphore - this latency may have been covered by iterator construction
+    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
+
+      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
+      if (threadblock_tile_offset.k()) {
+        iterator_C = iterator_D;
+      }
+
+      semaphore.wait(threadblock_tile_offset.k());
+
+      __threadfence();
+    }
+
+    // Execute the epilogue operator to update the destination tensor.
+    if (kBlasMode == BlasMode::kSymmetric) {
+      epilogue(
+        output_op,
+        iterator_D,
+        accumulators,
+        iterator_C);
+    } else {
+      epilogue(
+        output_op_her2k,
+        iterator_D,
+        accumulators,
+        iterator_C);
+    }
+
+    //
+    // Release the semaphore
+    //
+
+    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
+      int lock = 0;
+      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
+        // The final threadblock resets the semaphore for subsequent grids.
+        lock = 0;
+      }
+      else {
+        // Otherwise, the semaphore is incremented
+        lock = threadblock_tile_offset.k() + 1;
+      }
+
+      semaphore.release(lock);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_k_universal.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_k_universal.h
new file mode 100755
index 000000000..ad418286b
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_k_universal.h
@@ -0,0 +1,556 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
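+/*! \file
+    \brief Kernel-level rank-k update (RankKUniversal): both multiplicands are read from the
+           single operand A, and tiles outside the triangle selected by FillMode are skipped.
+*/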
+
+#pragma once
+
+#include "cutlass/blas3.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/complex.h"
+#include "cutlass/semaphore.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate 
+  typename Epilogue_,             ///! Epilogue
+  typename ThreadblockSwizzle_,   ///! Threadblock swizzling function
+  FillMode FillModeC_             ///! Fill Mode for C (kLower or kUpper)
+>
+struct RankKUniversal {
+public:
+
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using EpilogueOutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+
+  using ElementA = typename Mma::IteratorA::Element;
+  using LayoutA = typename Mma::IteratorA::Layout;
+  using ElementB = typename Mma::IteratorB::Element;
+  using LayoutB = typename Mma::IteratorB::Layout;
+  using ElementC = typename Epilogue::OutputTileIterator::Element;
+  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
+  static FillMode const kFillModeC = FillModeC_;   // triangle of C handled by the kernel; operator() returns early for tiles outside it
+
+  static ComplexTransform const kTransformA = Mma::kTransformA;
+  static ComplexTransform const kTransformB = Mma::kTransformB;
+  using Operator = typename Mma::Operator;
+
+  using OperatorClass = typename Mma::Operator::OperatorClass;
+  using ThreadblockShape = typename Mma::Shape;
+  using WarpShape = typename Mma::Operator::Shape;
+  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
+  using ArchTag = typename Mma::ArchTag;
+
+  static int const kStages = Mma::kStages;
+  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;   // elements per access of the A iterator
+  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;   // elements per access of the B iterator
+  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;   // 32 threads for each warp counted by WarpCount
+
+  /// Split-K preserves splits that are 128b aligned (value is the number of ElementA items in 128 bits)
+  static int const kSplitKAlignment = 128 / sizeof_bits<ElementA>::value;
+
+  //
+  // Structures
+  //
+
+  /// Argument structure: caller-facing description of one problem, from which Params is constructed
+  struct Arguments {
+
+    //
+    // Data members
+    //
+
+    GemmUniversalMode mode{GemmUniversalMode::kGemm};
+    GemmCoord problem_size{};
+    int batch_count{1};
+
+    typename EpilogueOutputOp::Params epilogue{};
+
+    void const * ptr_A{nullptr};   // the only multiplicand; Params also uses it as the B operand
+    void const * ptr_C{nullptr};
+    void * ptr_D{nullptr};
+
+    int64_t batch_stride_A{0};
+    int64_t batch_stride_C{0};
+    int64_t batch_stride_D{0};
+
+    typename LayoutA::Stride::Index lda{};
+    typename LayoutB::Stride::Index ldb{};   // not a constructor parameter (set to 0 there) and not read by Params
+    typename LayoutC::Stride::Index ldc{};
+    typename LayoutC::Stride::Index ldd{};
+
+    bool allow_early_exit{false};   // enables the alpha == 0 && beta == 1 early return in operator()
+
+    //
+    // Methods
+    //
+
+    Arguments() = default;
+
+    /// constructs an arguments structure; there is no B operand to pass, and ldb is fixed to 0
+    Arguments(
+      GemmUniversalMode mode,
+      GemmCoord problem_size,
+      int batch_count,
+      typename EpilogueOutputOp::Params epilogue,
+      void const * ptr_A,
+      void const * ptr_C,
+      void * ptr_D,
+      int64_t batch_stride_A,
+      int64_t batch_stride_C,
+      int64_t batch_stride_D,
+      typename LayoutA::Stride::Index lda,
+      typename LayoutC::Stride::Index ldc,
+      typename LayoutC::Stride::Index ldd,
+      bool allow_early_exit = false
+    ):
+      mode(mode), 
+      problem_size(problem_size), 
+      batch_count(batch_count),
+      epilogue(epilogue), 
+      ptr_A(ptr_A), ptr_C(ptr_C), ptr_D(ptr_D), 
+      batch_stride_A(batch_stride_A), batch_stride_C(batch_stride_C), batch_stride_D(batch_stride_D), 
+      lda(lda), ldb(0),
+      ldc(ldc), ldd(ldd),
+      allow_early_exit(allow_early_exit) {
+
+      }
+
+  };
+
+  //
+  // Structure for precomputing values in host memory and passing to kernels
+  //
+
+  /// Parameters structure: the values operator() reads, derived from Arguments plus grid shape and workspace
+  struct Params {
+
+    cutlass::gemm::GemmCoord problem_size{};
+    cutlass::gemm::GemmCoord grid_tiled_shape{};
+    int swizzle_log_tile{0};
+
+    typename Mma::IteratorA::Params params_A{};
+    typename Mma::IteratorB::Params params_B{};
+    typename Epilogue::OutputTileIterator::Params params_C{};
+    typename Epilogue::OutputTileIterator::Params params_D{};
+    typename EpilogueOutputOp::Params output_op{};
+
+    GemmUniversalMode mode = cutlass::gemm::GemmUniversalMode::kGemm;
+    int batch_count{0};
+    int gemm_k_size{0};   // K extent assigned to each split-K partition (see operator())
+
+    void * ptr_A{nullptr};
+    void * ptr_B{nullptr};   // the constructor and update() set this to the same address as ptr_A
+    void * ptr_C{nullptr};
+    void * ptr_D{nullptr};
+
+    int64_t batch_stride_A{0};
+    int64_t batch_stride_B{0};   // the constructor copies batch_stride_A into this member
+    int64_t batch_stride_C{0};
+    int64_t batch_stride_D{0};
+
+    int *semaphore{nullptr};   // indexed per output tile in operator() for serial split-K
+
+    bool allow_early_exit{false};
+
+    //
+    // Methods
+    //
+    Params() = default;
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      Arguments const &args,
+      cutlass::gemm::GemmCoord const & grid_tiled_shape,
+      int gemm_k_size,
+      void *workspace = nullptr
+    ):
+      problem_size(args.problem_size),
+      grid_tiled_shape(grid_tiled_shape),
+      swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)),
+      params_A(args.lda),
+      params_B(args.lda),   // B is read from A's storage, so it takes A's leading dimension
+      params_C(args.ldc),
+      params_D(args.ldd),
+      output_op(args.epilogue),
+      mode(args.mode),
+      batch_count(args.batch_count),
+      gemm_k_size(gemm_k_size),
+      ptr_A(const_cast<void *>(args.ptr_A)),
+      ptr_B(const_cast<void *>(args.ptr_A)),
+      ptr_C(const_cast<void *>(args.ptr_C)),
+      ptr_D(const_cast<void *>(args.ptr_D)),
+      batch_stride_A(args.batch_stride_A),
+      batch_stride_B(args.batch_stride_A),
+      batch_stride_C(args.batch_stride_C),
+      batch_stride_D(args.batch_stride_D),
+      semaphore(static_cast<int *>(workspace)),
+      allow_early_exit(args.allow_early_exit) {
+    }
+
+    CUTLASS_HOST_DEVICE
+    void update(
+      Arguments const &args,
+      void *workspace = nullptr) {
+      // Only pointers, epilogue parameters and the semaphore are refreshed; sizes and strides are kept.
+      ptr_A = const_cast<void *>(args.ptr_A);
+      ptr_B = const_cast<void *>(args.ptr_A);
+      ptr_C = const_cast<void *>(args.ptr_C);
+      ptr_D = args.ptr_D;
+
+      output_op = args.epilogue;
+
+      semaphore = static_cast<int *>(workspace);
+    }
+
+  };
+
+  /// Shared memory storage structure: a union, so the main loop and the epilogue alias the same bytes
+  union SharedStorage {
+    typename Mma::SharedStorage main_loop;
+    typename Epilogue::SharedStorage epilogue;
+  };
+
+public:
+
+  //
+  // Methods
+  //
+
+  CUTLASS_DEVICE
+  RankKUniversal() { }  // nothing to initialize: operator() receives Params and SharedStorage as arguments
+
+  /// Determines whether kernel satisfies alignment: every problem extent must be a
+  /// multiple of the access width (kAlignmentA/B/C of this kernel) of each operand
+  /// that spans it.
+  static Status can_implement(
+    cutlass::gemm::GemmCoord const & problem_size) {
+
+    int const m = problem_size.m();
+    int const n = problem_size.n();
+    int const k = problem_size.k();
+
+    bool const misaligned =
+      (m % kAlignmentA) || (k % kAlignmentA) ||
+      (n % kAlignmentB) || (k % kAlignmentB) ||
+      (m % kAlignmentC) || (n % kAlignmentC);
+
+    return misaligned ? Status::kErrorMisalignedOperand : Status::kSuccess;
+  }
+
+  static Status can_implement(Arguments const &args) {
+    return can_implement(args.problem_size);  // only the extents are checked; pointers and strides are not inspected
+  }
+
+  /// Executes one rank-k update for the output tile owned by this threadblock
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Compute threadblock location
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit following LAPACK's definition
+    if (params.allow_early_exit &&
+        (params.output_op.alpha == ElementC(0)) && (params.output_op.beta == ElementC(1))) {
+      return;
+    }
+
+    // Early exit if CTA is out of range
+    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
+      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
+      return;
+    }
+
+    // Early exit if Fill Mode is Lower and
+    // if the entire tile is above the main diagonal (bottom-left corner is at or above the diagonal)
+    if (kFillModeC == cutlass::FillMode::kLower &&
+        (threadblock_tile_offset.m() + 1) * Mma::Shape::kM <= threadblock_tile_offset.n() * Mma::Shape::kN) {
+      return;
+    }    
+
+    // Early exit if Fill Mode is Upper and
+    // if the entire tile is below the main diagonal (top-right corner is at or below the diagonal)
+    if (kFillModeC == cutlass::FillMode::kUpper &&
+        threadblock_tile_offset.m() * Mma::Shape::kM >= (threadblock_tile_offset.n() + 1) * Mma::Shape::kN) {
+      return;
+    }    
+
+    bool tile_on_diagonal = false;
+    // Mark tiles that are being crossed by the main diagonal
+    // (top-right and bottom-left corners are on either side of the diagonal)
+    if ((threadblock_tile_offset.m() + 1) * Mma::Shape::kM > threadblock_tile_offset.n() * Mma::Shape::kN
+        && threadblock_tile_offset.m() * Mma::Shape::kM < (threadblock_tile_offset.n() + 1) * Mma::Shape::kN) {
+      tile_on_diagonal = true;
+    }
+
+    int offset_k = 0;
+    int problem_size_k = params.problem_size.k();
+
+    ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A); 
+    ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);
+
+    //
+    // Fetch pointers based on mode: split-K modes narrow this CTA's K range, batched/array modes select operands by k().
+    //
+    if (params.mode == GemmUniversalMode::kGemm || 
+      params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+
+      if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {
+
+        problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size; 
+      }
+
+      offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
+    }
+    else if (params.mode == GemmUniversalMode::kBatched) {
+      ptr_A += threadblock_tile_offset.k() * params.batch_stride_A;
+      ptr_B += threadblock_tile_offset.k() * params.batch_stride_B;
+    }
+    else if (params.mode == GemmUniversalMode::kArray) {
+      ptr_A = static_cast<ElementA * const *>(params.ptr_A)[threadblock_tile_offset.k()];
+      ptr_B = static_cast<ElementB * const *>(params.ptr_B)[threadblock_tile_offset.k()];
+    }
+
+    __syncthreads();
+
+    // Compute initial location in logical coordinates
+    cutlass::MatrixCoord tb_offset_A{
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      offset_k,
+    };
+
+    cutlass::MatrixCoord tb_offset_B{
+      offset_k,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    };
+
+
+    // Compute position within threadblock
+    int thread_idx = threadIdx.x;
+
+    // Construct iterators to A and B operands
+    typename Mma::IteratorA iterator_A(
+      params.params_A,
+      ptr_A,
+      {params.problem_size.m(), problem_size_k},
+      thread_idx,
+      tb_offset_A);
+
+    typename Mma::IteratorB iterator_B(
+      params.params_B,
+      ptr_B,
+      {problem_size_k, params.problem_size.n()},
+      thread_idx,
+      tb_offset_B);
+
+    // Broadcast the warp_id computed by lane 0 to ensure dependent code
+    // is compiled as warp-uniform.
+    int warp_idx = canonical_warp_idx_sync();
+
+    int lane_idx = threadIdx.x % 32;
+
+    //
+    // Main loop
+    //
+
+    // Construct thread-scoped matrix multiply
+    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
+
+    typename Mma::FragmentC accumulators;
+
+    accumulators.clear();
+
+    // Number of main-loop iterations covering this CTA's K range, rounded up to whole kK tiles
+    int gemm_k_iterations = (problem_size_k - offset_k + Mma::Shape::kK - 1) / Mma::Shape::kK;
+
+    // Compute threadblock-scoped matrix multiply-add
+    mma(
+      gemm_k_iterations, 
+      accumulators, 
+      iterator_A, 
+      iterator_B, 
+      accumulators);
+
+    //
+    // Epilogue
+    //
+
+    EpilogueOutputOp output_op(params.output_op);
+
+    //
+    // Masked tile iterators constructed from members
+    //
+
+    threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    //assume identity swizzle
+    MatrixCoord threadblock_offset(
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    );
+
+    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
+
+    ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C); 
+    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
+
+    //
+    // Fetch pointers based on mode.
+    //
+
+    // Construct the semaphore.
+    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
+
+    if (params.mode == GemmUniversalMode::kGemm) {
+
+      // If performing a reduction via split-K, fetch the initial synchronization
+      if (params.grid_tiled_shape.k() > 1) {
+
+        // Fetch the synchronization lock initially but do not block.
+        semaphore.fetch();
+
+        // Indicate which position in a serial reduction the output operator is currently updating
+        output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
+      }
+    }
+    else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
+    }
+    else if (params.mode == GemmUniversalMode::kBatched) {
+      ptr_C += threadblock_tile_offset.k() * params.batch_stride_C;
+      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
+    }
+    else if (params.mode == GemmUniversalMode::kArray) {
+      ptr_C = static_cast<ElementC * const *>(params.ptr_C)[threadblock_tile_offset.k()];
+      ptr_D = static_cast<ElementC * const *>(params.ptr_D)[threadblock_tile_offset.k()];
+    }
+
+
+    // If CTA not on diagonal, FillMode doesn't apply. 
+    FillMode kFillModeCTA = tile_on_diagonal ? kFillModeC : FillMode::kNone;
+
+    // Tile iterator loading from source tensor.
+    typename Epilogue::OutputTileIterator iterator_C(
+      params.params_C,
+      ptr_C,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset,
+      kFillModeCTA
+    );
+
+    // Tile iterator writing to destination tensor.
+    typename Epilogue::OutputTileIterator iterator_D(
+      params.params_D,
+      ptr_D,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset,
+      kFillModeCTA
+    );
+
+    Epilogue epilogue(
+      shared_storage.epilogue, 
+      thread_idx, 
+      warp_idx, 
+      lane_idx);
+
+    // Wait on the semaphore - this latency may have been covered by iterator construction
+    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
+
+      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
+      if (threadblock_tile_offset.k()) {
+        iterator_C = iterator_D;
+      }
+
+      semaphore.wait(threadblock_tile_offset.k());
+
+      __threadfence();
+    }
+
+    // Execute the epilogue operator to update the destination tensor.
+    epilogue(
+      output_op, 
+      iterator_D, 
+      accumulators, 
+      iterator_C); 
+
+    //
+    // Release the semaphore
+    //
+
+    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) { 
+
+      int lock = 0;
+      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
+
+        // The final threadblock resets the semaphore for subsequent grids.
+        lock = 0;
+      }
+      else {
+        // Otherwise, the semaphore is incremented
+        lock = threadblock_tile_offset.k() + 1;
+      }
+
+      semaphore.release(lock);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm70_gemm.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm70_gemm.hpp
new file mode 100755
index 000000000..b6ad7613d
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm70_gemm.hpp
@@ -0,0 +1,270 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/kernel_hardware_info.hpp"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+
+#include "cute/tensor.hpp"
+
+namespace cutlass::gemm::kernel {
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <
+  class ProblemShape_,
+  class CollectiveMainloop_,
+  class CollectiveEpilogue_,
+  class TileScheduler_
+>
+class GemmUniversal<
+  ProblemShape_,
+  CollectiveMainloop_,
+  CollectiveEpilogue_,
+  TileScheduler_,
+  cute::enable_if_t<cute::is_base_of_v<KernelMultistage, typename CollectiveMainloop_::DispatchPolicy::Schedule>>>
+{
+public:
+  //
+  // Type Aliases
+  //
+  using ProblemShape = ProblemShape_;
+  static_assert(rank(ProblemShape{}) == 3 or rank(ProblemShape{}) == 4,
+    "ProblemShape{} should be <M,N,K> or <M,N,K,L>");
+
+  // Mainloop derived types, re-exported from the CollectiveMainloop_ template argument
+  using CollectiveMainloop = CollectiveMainloop_;
+  using TileShape = typename CollectiveMainloop::TileShape;
+  using TiledMma  = typename CollectiveMainloop::TiledMma;
+  using ArchTag   = typename CollectiveMainloop::ArchTag;
+  using ElementA  = typename CollectiveMainloop::ElementA;
+  using StrideA   = typename CollectiveMainloop::StrideA;
+  using ElementB  = typename CollectiveMainloop::ElementB;
+  using StrideB   = typename CollectiveMainloop::StrideB;
+  using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy;
+  using ElementAccumulator = typename CollectiveMainloop::ElementAccumulator;
+  using MainloopArguments = typename CollectiveMainloop::Arguments;
+  using MainloopParams = typename CollectiveMainloop::Params;
+
+  using TileSchedulerTag = TileScheduler_;
+  using TileScheduler = typename detail::TileSchedulerSelector<
+    TileScheduler_, ArchTag, TileShape,
+    cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>>::Scheduler; // fixed 1x1x1 shape; NOTE(review): presumably the cluster shape -- confirm against TileSchedulerSelector
+  using TileSchedulerArguments = typename TileScheduler::Arguments;
+  static constexpr bool IsGdcEnabled = false;
+
+  static constexpr bool is_valid_tile_scheduler = // only the default (void) tag or PersistentScheduler is accepted
+  cute::is_void_v<TileScheduler_> or cute::is_same_v<TileScheduler_, PersistentScheduler>;
+static_assert(is_valid_tile_scheduler, "SM70 kernel does not support specializing the tile scheduler.");
+
+  // Epilogue derived types, re-exported from the CollectiveEpilogue_ template argument
+  using CollectiveEpilogue = CollectiveEpilogue_;
+  using ElementC = typename CollectiveEpilogue::ElementC;
+  using StrideC  = typename CollectiveEpilogue::StrideC;
+  using ElementD = typename CollectiveEpilogue::ElementD;
+  using StrideD  = typename CollectiveEpilogue::StrideD;
+  using EpilogueArguments = typename CollectiveEpilogue::Arguments;
+  using EpilogueParams = typename CollectiveEpilogue::Params;
+  static_assert(cute::is_same_v<ElementAccumulator, typename CollectiveEpilogue::ElementAccumulator>,
+    "Mainloop and epilogue do not agree on accumulator value type.");
+
+  // Sized to the larger of the two collectives' storage (operator() hands the same smem_buf to both). MSVC requires the cast to fix a warning-as-error.
+  static constexpr int SharedStorageSize = static_cast<int>(cute::max(
+      sizeof(typename CollectiveMainloop::SharedStorage),
+      sizeof(typename CollectiveEpilogue::SharedStorage)));
+
+  static constexpr uint32_t MaxThreadsPerBlock = CUTE_STATIC_V(cute::size(TiledMma{})); // thread count of TiledMma; also returned by get_block_shape()
+  static constexpr uint32_t MinBlocksPerMultiprocessor = 1;
+
+  // Arguments as supplied by the caller; converted to Params by to_underlying_arguments().
+  struct Arguments {
+    GemmUniversalMode mode{};             // can_implement() accepts kGemm, or kBatched for rank-4 problem shapes
+    ProblemShape problem_shape{};         // <M,N,K> or <M,N,K,L>
+    MainloopArguments mainloop{};         // converted by CollectiveMainloop::to_underlying_arguments()
+    EpilogueArguments epilogue{};         // converted by CollectiveEpilogue::to_underlying_arguments()
+    KernelHardwareInfo hw_info{};         // not carried into Params by this kernel
+    TileSchedulerArguments scheduler{};   // only consulted by can_implement()
+  };
+
+  // Kernel entry point API: the object operator() receives, produced by to_underlying_arguments().
+  struct Params {
+    GemmUniversalMode mode{};        // copied from Arguments::mode
+    ProblemShape problem_shape{};    // copied from Arguments::problem_shape; also drives get_grid_shape()
+    MainloopParams mainloop{};       // operator() reads ptr_A, ptr_B, dA and dB from here
+    EpilogueParams epilogue{};       // used to construct the CollectiveEpilogue in operator()
+  };
+
+  //
+  // Methods
+  //
+
+  // Convert caller-supplied Arguments into kernel Params. The mode and problem shape are
+  // copied through unchanged; the mainloop and epilogue arguments are converted by their
+  // respective collectives.
+  static
+  Params
+  to_underlying_arguments(Arguments const& args, void* workspace) {
+    // get_workspace_size() reports zero bytes for this kernel, so the pointer is simply
+    // forwarded to both collectives rather than partitioned between them.
+    // args.hw_info and args.scheduler do not contribute to Params.
+    return {
+      args.mode,
+      args.problem_shape,
+      CollectiveMainloop::to_underlying_arguments(args.problem_shape, args.mainloop, workspace),
+      CollectiveEpilogue::to_underlying_arguments(args.problem_shape, args.epilogue, workspace)
+    };
+  }
+
+  static bool
+  can_implement(Arguments const& args) {  // kGemm always qualifies; kBatched only with a rank-4 <M,N,K,L> problem shape
+    bool const is_gemm    = (args.mode == GemmUniversalMode::kGemm);
+    bool const is_batched = (args.mode == GemmUniversalMode::kBatched) && (rank(ProblemShape{}) == 4);
+    return (is_gemm || is_batched) && TileScheduler::can_implement(args.scheduler);
+  }
+
+  static size_t
+  get_workspace_size(Arguments const& args) {
+    // This kernel needs no device workspace, whatever the arguments are.
+    return size_t{0};
+  }
+
+  static
+  cutlass::Status
+  initialize_workspace(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    // Nothing to set up: the workspace size is zero, so every argument is
+    // ignored and success is reported unconditionally.
+    return Status::kSuccess;
+  }
+
+  static dim3
+  get_grid_shape(Params const& params) {
+    // grid.x / grid.y: number of TileShape tiles along modes 0 and 1 of the problem, rounded up.
+    auto const tiles_m = cute::size(cute::ceil_div(cute::shape<0>(params.problem_shape), cute::shape<0>(TileShape{})));
+    auto const tiles_n = cute::size(cute::ceil_div(cute::shape<1>(params.problem_shape), cute::shape<1>(TileShape{})));
+
+    // grid.z: extent of mode 3 for rank-4 problem shapes, otherwise a single slice.
+    int batches = 1;
+    if constexpr (cute::rank(ProblemShape{}) == 4) {
+      batches = cute::size<3>(params.problem_shape);
+    }
+    return dim3(tiles_m, tiles_n, batches);
+  }
+
+  // One-dimensional thread block: MaxThreadsPerBlock threads along x.
+  static dim3 get_block_shape() {
+    return dim3(MaxThreadsPerBlock, 1, 1);
+  }
+
+  CUTLASS_DEVICE
+  void
+  operator()(Params const& params, char* smem_buf) {  // kernel body: one thread block handles one (BLK_M,BLK_N) tile of one batch slice
+    using namespace cute;
+    using X = Underscore;
+
+    // Preconditions: the tile shape must be known at compile time
+    CUTE_STATIC_ASSERT(is_static<TileShape>::value);
+
+    // Separate out problem shape for convenience
+    // Optionally append 1s until problem shape is rank-4 in case it is only rank-3 (MNK)
+    auto problem_shape_MNKL = append<4>(params.problem_shape, Int<1>{});
+    auto [M,N,K,L] = problem_shape_MNKL;
+
+    // Preconditions: every stride type carries an explicit batch (L) mode
+    static_assert(cute::rank(StrideA{}) == 3, "StrideA must be rank-3: [M, K, L]. If batch mode is not needed, set L stride to Int<0>.");
+    static_assert(cute::rank(StrideB{}) == 3, "StrideB must be rank-3: [N, K, L]. If batch mode is not needed, set L stride to Int<0>.");
+    static_assert(cute::rank(StrideC{}) == 3, "StrideC must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
+    static_assert(cute::rank(StrideD{}) == 3, "StrideD must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
+
+    // Get the appropriate blocks for this thread block -- potential for thread block locality
+    int thread_idx = int(threadIdx.x);
+    auto blk_shape = TileShape{};                                                                // (BLK_M,BLK_N,BLK_K)
+    auto [m_coord, n_coord, l_coord] = static_cast<uint3>(blockIdx);  // blockIdx.x/.y/.z, matching the grid built by get_grid_shape()
+    auto blk_coord_mnkl = make_coord(m_coord, n_coord, _, l_coord);                                        // (m,n,k,l)
+
+    // Represent the full tensors
+    Tensor mA_mkl = make_tensor(make_gmem_ptr(params.mainloop.ptr_A), make_shape(M,K,L), params.mainloop.dA); //(m,k,l)
+    Tensor mB_nkl = make_tensor(make_gmem_ptr(params.mainloop.ptr_B), make_shape(N,K,L), params.mainloop.dB); //(n,k,l)
+
+    // Get batch slice
+    Tensor mA_mk = mA_mkl(_,_,l_coord);                                                                        // (m,k)
+    Tensor mB_nk = mB_nkl(_,_,l_coord);                                                                        // (n,k)
+
+    // Slice to get the tiles this thread block is responsible for
+    Tensor gA = local_tile(mA_mk, blk_shape, take<0,3>(blk_coord_mnkl), Step<_1, X,_1>{});           // (BLK_M,BLK_K,k)
+    Tensor gB = local_tile(mB_nk, blk_shape, take<0,3>(blk_coord_mnkl), Step< X,_1,_1>{});           // (BLK_N,BLK_K,k)
+
+    // Compute tile residues for predication
+    auto m_max_coord = M - size<0>(gA) * get<0>(blk_coord_mnkl);                             // M - BLK_M * m_coord
+    auto n_max_coord = N - size<0>(gB) * get<1>(blk_coord_mnkl);                             // N - BLK_N * n_coord
+    auto k_residue   = K - size<1>(gA) * size<2>(gA);                                        // K - BLK_K * k_coord_max
+    auto residue_mnk = make_tuple(m_max_coord, n_max_coord, k_residue);
+
+    // Allocate the tiled_mma and the accumulators for the (M,N) blk_shape
+    TiledMma tiled_mma;
+    Tensor accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape)); // (MMA,MMA_M,MMA_N)
+    clear(accumulators);
+
+    auto k_tile_iter  = cute::make_coord_iterator(shape<2>(gA));
+    int  k_tile_count = size<2>(gA);
+
+    // Perform the collective scoped MMA
+    CollectiveMainloop collective_mma;
+    collective_mma(
+      accumulators,  // NOTE(review): the same fragment is passed in two positions, presumably destination and source -- confirm against the CollectiveMainloop signature
+      gA,
+      gB,
+      accumulators,
+      k_tile_iter, k_tile_count,
+      residue_mnk,
+      thread_idx,
+      smem_buf
+    );
+    // Epilogue and write to gD; it is handed the same smem_buf the mainloop used
+    CollectiveEpilogue epilogue{params.epilogue};
+    epilogue(
+      problem_shape_MNKL,
+      blk_shape,
+      blk_coord_mnkl,
+      accumulators,
+      tiled_mma,
+      residue_mnk,
+      thread_idx,
+      smem_buf
+    );
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::gemm::kernel
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_cooperative.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_cooperative.hpp
new file mode 100755
index 000000000..823e919ed
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_cooperative.hpp
@@ -0,0 +1,881 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/workspace.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/kernel_hardware_info.hpp"
+#include "cute/arch/cluster_sm90.hpp"
+#include "cutlass/arch/reg_reconfig.h"
+#include "cutlass/arch/mma_sm90.h"
+#include "cutlass/epilogue/collective/detail.hpp"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/kernel/gemm_universal_decl.h"
+#include "cutlass/gemm/kernel/tile_scheduler.hpp"
+#include "cutlass/gemm/group_array_problem_shape.hpp"
+#include "cutlass/pipeline/pipeline.hpp"
+#include "cute/tensor.hpp"
+#include "cutlass/trace.h"
+#include "cutlass/gemm/kernel/sm90_tile_scheduler.hpp"
+#include "cutlass/gemm/kernel/sm90_tile_scheduler_group.hpp"
+
+///////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::kernel {
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <
+  class ProblemShape_,
+  class CollectiveMainloop_,
+  class CollectiveEpilogue_,
+  class TileScheduler_
+>
+class GemmUniversal<
+  ProblemShape_,
+  CollectiveMainloop_,
+  CollectiveEpilogue_,
+  TileScheduler_,
+  cute::enable_if_t<cute::is_base_of_v<KernelPtrArrayTmaWarpSpecializedCooperative, typename CollectiveMainloop_::DispatchPolicy::Schedule>>
+>
+{
+public:
+  //
+  // Type Aliases
+  //
+  using ProblemShape = ProblemShape_;
+  static_assert(rank(typename ProblemShape::UnderlyingProblemShape{}) == 3 or rank(typename ProblemShape::UnderlyingProblemShape{}) == 4,
+    "ProblemShape{} should be <M,N,K> or <M,N,K,L>");
+
+  static_assert(cute::is_base_of_v<KernelPtrArrayTmaWarpSpecializedCooperative, typename CollectiveMainloop_::DispatchPolicy::Schedule>);
+
+  static constexpr bool IsGdcEnabled = false;
+
+  // Mainloop derived types, re-exported from the CollectiveMainloop_ template argument
+  using CollectiveMainloop = CollectiveMainloop_;
+  using TileShape = typename CollectiveMainloop::TileShape;
+  using TiledMma  = typename CollectiveMainloop::TiledMma;
+  using ArchTag   = typename CollectiveMainloop::ArchTag;
+  using ElementA  = typename CollectiveMainloop::ElementA;
+  using StrideA   = typename CollectiveMainloop::StrideA;
+  using InternalStrideA = typename CollectiveMainloop::InternalStrideA;
+  using ElementB  = typename CollectiveMainloop::ElementB;
+  using InternalStrideB = typename CollectiveMainloop::InternalStrideB;
+  using StrideB   = typename CollectiveMainloop::StrideB;
+  using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy;
+  using Schedule = typename DispatchPolicy::Schedule;
+  using ElementAccumulator = typename CollectiveMainloop::ElementAccumulator;
+  using ClusterShape = typename DispatchPolicy::ClusterShape;
+  using MainloopArguments = typename CollectiveMainloop::Arguments;
+  using MainloopParams = typename CollectiveMainloop::Params;
+
+  // Epilogue derived types, re-exported from the CollectiveEpilogue_ template argument
+  using CollectiveEpilogue = CollectiveEpilogue_;
+  using ElementC = typename CollectiveEpilogue::ElementC;
+  using StrideC  = typename CollectiveEpilogue::StrideC;
+  using InternalStrideC = typename CollectiveEpilogue::InternalStrideC;
+  using ElementD = typename CollectiveEpilogue::ElementD;
+  using StrideD  = typename CollectiveEpilogue::StrideD;
+  using InternalStrideD = typename CollectiveEpilogue::InternalStrideD;
+  using EpilogueArguments = typename CollectiveEpilogue::Arguments;
+  using EpilogueParams = typename CollectiveEpilogue::Params;
+
+  static_assert(ArchTag::kMinComputeCapability >= 90);
+  static_assert(cute::is_void_v<TileScheduler_>,
+    "Ptr-Array Cooperative and Grouped Gemm Cooperative kernel only supports the default scheduler.");
+
+  static constexpr bool IsGroupedGemmKernel = !cute::is_same_v<InternalStrideA, StrideA>; // true when InternalStrideA and StrideA are different types
+
+  using TileScheduler = cute::conditional_t<IsGroupedGemmKernel, // GroupScheduler-selected scheduler for grouped GEMM, default (void tag) selection otherwise
+    typename detail::TileSchedulerSelector<
+      GroupScheduler, ArchTag,
+      TileShape, ClusterShape,
+      ProblemShape>::Scheduler,
+    typename detail::TileSchedulerSelector<
+    void, ArchTag, TileShape, ClusterShape>::Scheduler>;
+  using TileSchedulerArguments = typename TileScheduler::Arguments;
+  using TileSchedulerParams = typename TileScheduler::Params;
+
+  static constexpr uint32_t NumLoadWarpGroups = 1;
+  static constexpr uint32_t NumMmaThreads = CUTE_STATIC_V(size(TiledMma{}));
+  static constexpr uint32_t NumMmaWarpGroups = NumMmaThreads / NumThreadsPerWarpGroup;
+  static constexpr uint32_t MaxThreadsPerBlock = NumMmaThreads + (NumLoadWarpGroups * NumThreadsPerWarpGroup); // MMA threads plus the load warp group(s)
+  static constexpr uint32_t MinBlocksPerMultiprocessor = 1;
+
+  /// Register requirement for Load and Math WGs
+  static constexpr uint32_t LoadRegisterRequirement = 40;
+  static constexpr uint32_t MmaRegisterRequirement = 232;
+
+  // 1 stage ordered sequence between mainloop and epilogue producer load threads
+  using LoadWarpOrderBarrier = cutlass::OrderedSequenceBarrier<1,2>;
+
+  // Kernel level shared memory storage; operator() reinterprets its smem_buf argument as this struct
+  struct SharedStorage {
+    struct TensorStorage : cute::aligned_struct<128, _1> {  // mainloop and epilogue tensor storage as separate members
+      using MainloopTensorStorage = typename CollectiveMainloop::TensorStorage;
+      using EpilogueTensorStorage = typename CollectiveEpilogue::TensorStorage;
+
+      MainloopTensorStorage mainloop;
+      EpilogueTensorStorage epilogue;
+    } tensors;
+
+    struct PipelineStorage : cute::aligned_struct<16, _1> {  // state for the mainloop pipeline, epilogue-load pipeline and load-order barrier
+      using MainloopPipelineStorage = typename CollectiveMainloop::PipelineStorage;
+      using EpiLoadPipelineStorage = typename CollectiveEpilogue::PipelineStorage;
+
+      alignas(16) MainloopPipelineStorage mainloop;
+      alignas(16) EpiLoadPipelineStorage epi_load;
+      alignas(16) typename LoadWarpOrderBarrier::SharedStorage load_order;
+    } pipelines;
+
+    struct TensorMapStorage : cute::aligned_struct<128, _1> {  // per-collective tensor-map storage, each member 128-byte aligned
+      using MainloopTensorMapStorage = typename CollectiveMainloop::TensorMapStorage;
+      using EpilogueTensorMapStorage = typename CollectiveEpilogue::TensorMapStorage;
+
+      alignas(128) MainloopTensorMapStorage mainloop;
+      alignas(128) EpilogueTensorMapStorage epilogue;
+    } tensormaps;
+  };
+
+  static constexpr int SharedStorageSize = sizeof(SharedStorage);  // size in bytes of the whole SharedStorage struct
+
+  // Arguments as supplied by the caller; converted to Params by to_underlying_arguments().
+  struct Arguments {
+    GemmUniversalMode mode{};             // can_implement() requires kGrouped (grouped GEMM) or kArray (ptr-array GEMM)
+    ProblemShape problem_shape{};         // exposes UnderlyingProblemShape; get_host_problem_shape() is used on the ptr-array path
+    MainloopArguments mainloop{};         // converted by CollectiveMainloop::to_underlying_arguments()
+    EpilogueArguments epilogue{};         // converted by CollectiveEpilogue::to_underlying_arguments()
+    KernelHardwareInfo hw_info{};         // sm_count <= 0 makes to_underlying_arguments()/get_workspace_size() query the device
+    TileSchedulerArguments scheduler{};   // converted by TileScheduler::to_underlying_arguments()
+  };
+
+  // Kernel entry point API: the object operator() receives, produced by to_underlying_arguments().
+  struct Params {
+    GemmUniversalMode mode{};          // copied from Arguments::mode
+    ProblemShape problem_shape{};      // copied from Arguments::problem_shape
+    MainloopParams mainloop{};         // built against the mainloop region of the workspace
+    EpilogueParams epilogue{};         // built against the epilogue region of the workspace
+    KernelHardwareInfo hw_info{};      // device id plus the resolved (positive or queried) SM count
+    TileSchedulerParams scheduler{};   // built against the scheduler region at the start of the workspace
+    void* workspace{nullptr};          // base pointer of the whole workspace allocation
+  };
+
+  //
+  // Methods
+  //
+
+  // Convert caller-supplied Arguments into kernel Params. The workspace is split into scheduler,
+  // epilogue and mainloop regions (in that order, each rounded up to MinWorkspaceAlignment); region
+  // sizes are computed with the same inputs as in get_workspace_size() and initialize_workspace().
+  static
+  Params
+  to_underlying_arguments(Arguments const& args, void* workspace) {
+    CUTLASS_TRACE_HOST("to_underlying_arguments():");
+    ProblemShape problem_shapes = args.problem_shape;
+    constexpr uint32_t NumEpilogueSubTiles = CollectiveEpilogue::get_store_pipe_increment(TileShape{});
+
+    // Get SM count if needed, otherwise use user supplied SM count
+    int sm_count = args.hw_info.sm_count;
+    if (sm_count <= 0) {
+      CUTLASS_TRACE_HOST("  WARNING: Arguments do not include a valid SM count.\n"
+          "  For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count.");
+      sm_count = KernelHardwareInfo::query_device_multiprocessor_count(args.hw_info.device_id);
+    }
+
+    CUTLASS_TRACE_HOST("to_underlying_arguments(): Setting persistent grid SM count to " << sm_count);
+
+    KernelHardwareInfo hw_info{args.hw_info.device_id, sm_count};
+
+    // Calculate workspace pointers
+    uint8_t* workspace_ptr = reinterpret_cast<uint8_t*>(workspace);
+    size_t workspace_offset = 0;
+
+    // Scheduler region: NumEpilogueSubTiles is passed so the size matches the one get_workspace_size() reserves.
+    void* scheduler_workspace = workspace_ptr;
+    workspace_offset += TileScheduler::template get_workspace_size<typename ProblemShape::UnderlyingProblemShape, ElementAccumulator>(
+      args.scheduler, typename ProblemShape::UnderlyingProblemShape{}, args.hw_info, NumMmaWarpGroups, NumEpilogueSubTiles);
+    workspace_offset = round_nearest(workspace_offset,  MinWorkspaceAlignment);
+
+    void* epilogue_workspace = workspace_ptr + workspace_offset;
+    workspace_offset += CollectiveEpilogue::get_workspace_size(problem_shapes, args.epilogue, sm_count);
+    workspace_offset = round_nearest(workspace_offset,  MinWorkspaceAlignment);
+
+    // The mainloop region is last, so no offset bookkeeping is needed after it.
+    void* mainloop_workspace = workspace_ptr + workspace_offset;
+
+    // Grouped GEMM passes the whole problem-shape object to the scheduler; ptr-array GEMM passes the host problem shape.
+    TileSchedulerParams scheduler;
+    if constexpr (IsGroupedGemmKernel) {
+      scheduler = TileScheduler::to_underlying_arguments(
+      problem_shapes, TileShape{}, ClusterShape{}, hw_info, args.scheduler, scheduler_workspace);
+    }
+    else {
+      scheduler = TileScheduler::to_underlying_arguments(
+      problem_shapes.get_host_problem_shape(), TileShape{}, ClusterShape{}, hw_info, args.scheduler, scheduler_workspace);
+    }
+    return {
+      args.mode,
+      problem_shapes,
+      CollectiveMainloop::to_underlying_arguments(problem_shapes, args.mainloop, mainloop_workspace),
+      CollectiveEpilogue::to_underlying_arguments(problem_shapes, args.epilogue, epilogue_workspace),
+      hw_info, scheduler, workspace
+    };
+  }
+
+  static bool
+  can_implement(Arguments const& args) {
+    // Mode / rank gate: grouped GEMM needs kGrouped with a rank-3 underlying problem shape,
+    // ptr-array GEMM needs kArray with a rank-4 underlying problem shape.
+    constexpr GemmUniversalMode required_mode = IsGroupedGemmKernel ? GemmUniversalMode::kGrouped : GemmUniversalMode::kArray;
+    constexpr int required_rank = IsGroupedGemmKernel ? 3 : 4;
+    bool const mode_and_rank_ok = (args.mode == required_mode) && (rank(typename ProblemShape::UnderlyingProblemShape{}) == required_rank);
+    if (!mode_and_rank_ok) {
+      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Arguments or Problem Shape don't meet the requirements for Ptr Array Gemm or Grouped Gemm.\n");
+      return false;
+    }
+
+    // All three checks are always evaluated (no short-circuiting between them).
+    bool const mainloop_ok  = CollectiveMainloop::can_implement(args.problem_shape, args.mainloop);
+    bool const epilogue_ok  = CollectiveEpilogue::can_implement(args.problem_shape, args.epilogue);
+    bool const scheduler_ok = TileScheduler::can_implement(args.scheduler);
+    return mainloop_ok && epilogue_ok && scheduler_ok;
+  }
+
+  static size_t
+  get_workspace_size(Arguments const& args) {
+    // Total bytes = scheduler + epilogue + mainloop regions, each rounded up to MinWorkspaceAlignment.
+    using UnderlyingShape = typename ProblemShape::UnderlyingProblemShape;
+    constexpr uint32_t NumEpilogueSubTiles = CollectiveEpilogue::get_store_pipe_increment(TileShape{});
+
+    // Scheduler region
+    size_t total_bytes = TileScheduler::template get_workspace_size<UnderlyingShape, ElementAccumulator>(
+      args.scheduler, UnderlyingShape{}, args.hw_info, NumMmaWarpGroups, NumEpilogueSubTiles);
+    total_bytes = round_nearest(total_bytes,  MinWorkspaceAlignment);
+
+    // Use the caller's SM count when it is positive; otherwise ask the device.
+    int sm_count = args.hw_info.sm_count;
+    if (sm_count <= 0) {
+      CUTLASS_TRACE_HOST("  WARNING: Arguments do not include a valid SM count.\n"
+          "  For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count.");
+      sm_count = KernelHardwareInfo::query_device_multiprocessor_count(args.hw_info.device_id);
+    }
+
+    // Epilogue region, then mainloop region
+    total_bytes += CollectiveEpilogue::get_workspace_size(args.problem_shape, args.epilogue, sm_count);
+    total_bytes = round_nearest(total_bytes,  MinWorkspaceAlignment);
+    total_bytes += CollectiveMainloop::get_workspace_size(args.problem_shape, args.mainloop, sm_count);
+    return round_nearest(total_bytes,  MinWorkspaceAlignment);
+  }
+
+  // Initializes the scheduler, epilogue and mainloop workspace regions in that order and returns the
+  // first non-success status. Region offsets are computed as in get_workspace_size().
+  static cutlass::Status
+  initialize_workspace(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    Status status = Status::kSuccess;
+    uint8_t* workspace_ptr = reinterpret_cast<uint8_t*>(workspace);
+    size_t workspace_offset = 0;
+    constexpr uint32_t NumEpilogueSubTiles = CollectiveEpilogue::get_store_pipe_increment(TileShape{});
+    static constexpr uint32_t NumAccumulatorMtxs = 1;
+
+    // Resolve the SM count the same way get_workspace_size() and to_underlying_arguments() do, so the
+    // epilogue region size (and hence the mainloop offset) agrees when the caller left it unset.
+    int sm_count = args.hw_info.sm_count;
+    if (sm_count <= 0) { sm_count = KernelHardwareInfo::query_device_multiprocessor_count(args.hw_info.device_id); }
+
+    status = TileScheduler::template initialize_workspace<typename ProblemShape::UnderlyingProblemShape, ElementAccumulator>(
+      args.scheduler, workspace_ptr + workspace_offset, stream, typename ProblemShape::UnderlyingProblemShape{}, args.hw_info, NumMmaWarpGroups, NumEpilogueSubTiles, NumAccumulatorMtxs, cuda_adapter);
+    workspace_offset += TileScheduler::template get_workspace_size<typename ProblemShape::UnderlyingProblemShape, ElementAccumulator>(
+      args.scheduler, typename ProblemShape::UnderlyingProblemShape{}, args.hw_info, NumMmaWarpGroups, NumEpilogueSubTiles);
+    workspace_offset = round_nearest(workspace_offset,  MinWorkspaceAlignment);
+    if (status != Status::kSuccess) { return status; }
+
+    status = CollectiveEpilogue::initialize_workspace(args.problem_shape, args.epilogue, workspace_ptr + workspace_offset, stream, cuda_adapter);
+    workspace_offset += CollectiveEpilogue::get_workspace_size(args.problem_shape, args.epilogue, sm_count);
+    workspace_offset = round_nearest(workspace_offset,  MinWorkspaceAlignment);
+    if (status != Status::kSuccess) { return status; }  // report an epilogue failure instead of letting the mainloop status overwrite it
+
+    // The mainloop region is last, so no offset bookkeeping is needed after it.
+    status = CollectiveMainloop::initialize_workspace(args.problem_shape, args.mainloop, workspace_ptr + workspace_offset, stream, cuda_adapter);
+    return status;
+  }
+
+  // Computes the kernel launch grid shape from runtime parameters by delegating to the tile scheduler.
+  static dim3
+  get_grid_shape(Params const& params) {
+    // Given device SM count, set grid size s.t. we do not launch more thread blocks than we can run concurrently.
+    TileSchedulerArguments scheduler_args{};
+    if constexpr (!std::is_const_v<decltype(scheduler_args.max_swizzle_size)>) {
+      scheduler_args.max_swizzle_size = 1 << params.scheduler.log_swizzle_size_;
+    }
+    bool const raster_along_n = (params.scheduler.raster_order_ == TileScheduler::RasterOrder::AlongN);
+    scheduler_args.raster_order = raster_along_n ? TileScheduler::RasterOrderOptions::AlongN : TileScheduler::RasterOrderOptions::AlongM;
+    // Grouped GEMM hands the scheduler the whole problem-shape object; ptr-array GEMM hands it the host problem shape.
+    if constexpr (IsGroupedGemmKernel) {
+      return TileScheduler::get_grid_shape(params.scheduler, params.problem_shape, TileShape{}, ClusterShape{}, params.hw_info, scheduler_args);
+    }
+    else {
+      return TileScheduler::get_grid_shape(params.scheduler, params.problem_shape.get_host_problem_shape(), TileShape{}, ClusterShape{}, params.hw_info, scheduler_args);
+    }
+  }
+
+  // One-dimensional thread block: MaxThreadsPerBlock (MMA threads plus load warp group) threads along x.
+  static dim3 get_block_shape() {
+    return dim3(MaxThreadsPerBlock, 1, 1);
+  }
+
+  CUTLASS_DEVICE
+  void
+  operator()(Params const& params, char* smem_buf) {
+    using namespace cute;
+    using X = Underscore;
+
+// Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a.
+#if ! defined(__CUDA_ARCH_FEAT_SM90_ALL)
+    printf("ERROR : Arch conditional MMA instruction used without targeting sm90a compute capability. Aborting.\n");
+#else
+
+    // Preconditions
+    static_assert(size(TiledMma{}) == 256, "Cooperative kernel must have TiledMMA operating using 256 threads.");
+    static_assert(size<0>(TileShape{}) >= 128,
+        "Cooperative kernel requires Tile Size to be greater than or equal to 128 along the M-dimension.");
+    static_assert(NumMmaWarpGroups == 2, "Cooperative kernels currently only support NumMmaWarpGroups == 2");
+
+    if constexpr (cutlass::epilogue::collective::detail::sm90_is_ptr_array_tma_dispatch_policy_v<typename CollectiveEpilogue::DispatchPolicy>) {
+      static_assert(NumMmaWarpGroups == CollectiveEpilogue::NumEpilogueWarpGroups,
+                    "Tiled MmA does not match expected warp groups performing the epilogue");
+    }
+
+    static_assert(cute::rank(InternalStrideA{}) == 3, "StrideA must be rank-3: [M, K, L]. If batch mode is not needed, set L stride to Int<0>.");
+    static_assert(cute::rank(InternalStrideB{}) == 3, "StrideB must be rank-3: [N, K, L]. If batch mode is not needed, set L stride to Int<0>.");
+    static_assert(cute::rank(InternalStrideC{}) == 3, "StrideC must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
+    static_assert(cute::rank(InternalStrideD{}) == 3, "StrideD must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
+
+    /* In the Cooperative kernel, Consumer0 and Consumer1 collaborate on the same tile */
+    enum class WarpGroupRole {
+      Producer = 0,
+      Consumer0 = 1,
+      Consumer1 = 2
+    };
+    enum class ProducerWarpRole {
+      Mainloop = 0,
+      Warp1 = 1,
+      Epilogue = 2,
+      Warp3 = 3
+    };
+
+    // Kernel level shared memory storage
+    SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(smem_buf);
+
+    int thread_idx = int(threadIdx.x);
+    int lane_idx = canonical_lane_idx();
+    int warp_idx = canonical_warp_idx_sync();
+    int warp_idx_in_warp_group = warp_idx % NumWarpsPerWarpGroup;
+    int warp_group_thread_idx = thread_idx % NumThreadsPerWarpGroup;
+    int mma_thread_idx = thread_idx % size(TiledMma{});
+    auto warp_group_idx = canonical_warp_group_idx();
+    auto warp_group_role = WarpGroupRole(warp_group_idx);
+    auto producer_warp_role = ProducerWarpRole(warp_idx_in_warp_group);
+    int lane_predicate = cute::elect_one_sync();
+    uint32_t block_rank_in_cluster = cute::block_rank_in_cluster();
+
+    // Note: Tma Descriptor Prefetch (from either const or param) is not applicable here
+
+    // Mainloop Load pipeline
+    using MainloopPipeline = typename CollectiveMainloop::MainloopPipeline;
+    typename MainloopPipeline::Params mainloop_pipeline_params;
+    if (warp_group_role == WarpGroupRole::Producer && producer_warp_role == ProducerWarpRole::Mainloop) {
+      mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Producer;
+    }
+    if (warp_group_role == WarpGroupRole::Consumer0 || warp_group_role == WarpGroupRole::Consumer1) {
+      mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Consumer;
+    }
+    mainloop_pipeline_params.is_leader = warp_group_thread_idx == 0;
+    mainloop_pipeline_params.num_consumers = size(TiledMma{});
+    mainloop_pipeline_params.transaction_bytes = params.mainloop.tma_transaction_bytes;
+    MainloopPipeline mainloop_pipeline(shared_storage.pipelines.mainloop, mainloop_pipeline_params, ClusterShape{});
+
+    // Epilogue Load pipeline
+    using EpiLoadPipeline = typename CollectiveEpilogue::LoadPipeline;
+    typename EpiLoadPipeline::Params epi_load_pipeline_params;
+    if (warp_group_role == WarpGroupRole::Producer && producer_warp_role == ProducerWarpRole::Epilogue) {
+      epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Producer;
+    }
+    if (warp_group_role == WarpGroupRole::Consumer0 || warp_group_role == WarpGroupRole::Consumer1) {
+      epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Consumer;
+    }
+    epi_load_pipeline_params.dst_blockid = cute::block_rank_in_cluster();
+    epi_load_pipeline_params.producer_arv_count = NumThreadsPerWarp;
+    epi_load_pipeline_params.consumer_arv_count = size(TiledMma{});
+    if constexpr (CollectiveEpilogue::RequiresTransactionBytes) {
+      epi_load_pipeline_params.transaction_bytes = params.epilogue.tma_transaction_bytes;
+    }
+    EpiLoadPipeline epi_load_pipeline(shared_storage.pipelines.epi_load, epi_load_pipeline_params);
+
+    // Epilogue Store pipeline
+    using EpiStorePipeline = typename CollectiveEpilogue::StorePipeline;
+    typename EpiStorePipeline::Params epi_store_pipeline_params;
+    epi_store_pipeline_params.always_wait = true;
+    EpiStorePipeline epi_store_pipeline(epi_store_pipeline_params);
+
+    typename LoadWarpOrderBarrier::Params params_load_order_barrier;
+    params_load_order_barrier.group_id = producer_warp_role == ProducerWarpRole::Mainloop ? 0 : 1;
+    params_load_order_barrier.group_size = NumThreadsPerWarp;
+    LoadWarpOrderBarrier load_order_barrier(shared_storage.pipelines.load_order, params_load_order_barrier);
+
+    // Initialize starting pipeline states for the collectives
+    // Epilogue store pipe is producer-only (consumer is TMA unit, waits via scoreboarding)
+    typename CollectiveMainloop::PipelineState mainloop_pipe_consumer_state;
+    typename CollectiveEpilogue::LoadPipelineState epi_load_pipe_consumer_state;
+
+    // For the DMA Load (producer) we start with an opposite phase
+    // i.e., we skip all waits since we know that the buffer is indeed empty
+    PipelineState mainloop_pipe_producer_state = cutlass::make_producer_start_state<MainloopPipeline>();
+    PipelineState epi_load_pipe_producer_state = cutlass::make_producer_start_state<EpiLoadPipeline>();
+    PipelineState epi_store_pipe_producer_state = cutlass::make_producer_start_state<EpiStorePipeline>();
+
+    auto cluster_wait_fn = [] () {
+      // We need this to guarantee that the Pipeline init is visible
+      // To all producers and consumer thread blocks in the Cluster
+      if constexpr (size(ClusterShape{}) > 1) {
+        cute::cluster_arrive_relaxed();
+        return [] () { cute::cluster_wait(); };
+      }
+      else {
+        __syncthreads();
+        return [] () {}; // do nothing
+      }
+    } ();
+
+    // Get the appropriate blocks for this thread block -- potential for thread block locality
+    TiledMma tiled_mma;
+    const auto blk_shape = TileShape{};                                                                // (BLK_M,BLK_N,BLK_K)
+    const auto c_tile_count = CollectiveEpilogue::get_load_pipe_increment(blk_shape);
+    const auto d_tile_count = CollectiveEpilogue::get_store_pipe_increment(blk_shape);
+
+    TileScheduler scheduler{params.scheduler};
+
+    // In a warp specialized kernel, collectives expose data movement and compute operations separately
+    CollectiveMainloop collective_mainloop;
+    CollectiveEpilogue collective_epilogue(params.epilogue, shared_storage.tensors.epilogue);
+
+    // Wait for all thread blocks in the Cluster
+    cluster_wait_fn();
+
+    auto work_tile_info = scheduler.initial_work_tile_info(ClusterShape{});
+    if (not work_tile_info.is_valid()) {
+      // When problem shapes are only on device, the grid launched may be larger than the total number of blocks across groups
+      return;
+    }
+
+    // Optionally append 1s until problem shape is rank-4 in case it is only rank-3 (MNK)
+    auto problem_shape_MNKL = append<4>(params.problem_shape.get_problem_shape(work_tile_info.L_idx), 1);
+
+    // Prepare and partition the input tensors. Expects a tuple of tensors where:
+    // get<0>(load_inputs) is the tma tensor A after local tiling so that it has shape (BLK_M,BLK_K,m,k,l)
+    // get<1>(load_inputs) is the tma tensor B after local tiling so that it has shape (BLK_N,BLK_K,n,k,l)
+    auto load_inputs = collective_mainloop.load_init(problem_shape_MNKL, params.mainloop);
+    static_assert(cute::tuple_size_v<decltype(load_inputs)> >= 2, "Output of load_init must have at least two elements (A, B)");
+
+    // Extract out partitioned A and B.
+    Tensor gA_mkl = get<0>(load_inputs);
+    Tensor gB_nkl = get<1>(load_inputs);
+
+    // Get pipeline stage increments from tensor shapes
+    auto k_tile_count = size<3>(gA_mkl);
+
+    if (warp_group_role == WarpGroupRole::Producer) {
+      cutlass::arch::warpgroup_reg_dealloc<LoadRegisterRequirement>();
+
+      // Mainloop Producer Warp
+      if (producer_warp_role == ProducerWarpRole::Mainloop) {
+        int32_t curr_batch = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl)); // Usually just returns work_tile_info.L_idx;
+        int32_t const mock_l_coord = 0;
+        int32_t const sm_idx = blockIdx.x + (blockIdx.y * gridDim.x);
+        int32_t const sm_count = params.hw_info.sm_count;
+
+        // Fetch a copy of tensormaps for the CTA
+        auto input_tensormaps = collective_mainloop.tensormaps_init(params.mainloop, shared_storage.tensormaps.mainloop, sm_count, sm_idx);
+
+        // Update tensormap for the initial batch for the CTA
+        if (work_tile_info.is_valid()) {
+          collective_mainloop.tensormaps_perform_update(
+            shared_storage.tensormaps.mainloop,
+            params.mainloop,
+            input_tensormaps,
+            problem_shape_MNKL,
+            curr_batch
+          );
+          // Ensure warp is converged before issuing tensormap fence release
+          __syncwarp();
+          // Entire warp must do this (i.e. it's aligned)
+          collective_mainloop.tensormaps_cp_fence_release(shared_storage.tensormaps.mainloop, input_tensormaps);
+        }
+
+        bool do_load_order_arrive = true;
+        bool did_batch_change = true;
+        while (work_tile_info.is_valid()) {
+          if (!TileScheduler::valid_warpgroup_in_work_tile(work_tile_info)) {
+            auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work(work_tile_info);
+            work_tile_info = next_work_tile_info;
+            continue;
+          }
+
+          // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape
+          auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl));
+          auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl));
+          auto blk_coord = make_coord(m_coord, n_coord, _, mock_l_coord);
+
+          // Get the number of K tiles to compute for this work as well as the starting K tile offset of the work.
+          auto work_k_tile_count = TileScheduler::get_work_k_tile_count(work_tile_info, problem_shape_MNKL, blk_shape);
+          auto work_k_tile_start = TileScheduler::get_work_k_tile_start(work_tile_info);
+          auto k_tile_iter = cute::make_coord_iterator(idx2crd(work_k_tile_start, shape<3>(gA_mkl)), shape<3>(gA_mkl));
+
+          if (did_batch_change) {
+            collective_mainloop.tensormaps_fence_acquire(input_tensormaps);
+          }
+
+          collective_mainloop.load(
+            params.mainloop,
+            mainloop_pipeline,
+            mainloop_pipe_producer_state,
+            load_inputs,
+            input_tensormaps,
+            blk_coord,
+            k_tile_iter, work_k_tile_count,
+            lane_idx,
+            block_rank_in_cluster,
+            shared_storage.tensors.mainloop
+          );
+          // Update starting pipeline state for the next tile
+          // Wait for the last TMA stage to complete loading, before issuing tensormap updates
+          mainloop_pipe_producer_state.advance(work_k_tile_count - 1);
+
+          // Signal for the epilogue load warp to begin
+          if (do_load_order_arrive) {
+            load_order_barrier.arrive();
+            do_load_order_arrive = false;
+          }
+
+          // Get next work tile
+          auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work(work_tile_info);
+          work_tile_info = next_work_tile_info;
+          auto next_batch = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl)); // Usually just returns work_tile_info.L_idx
+          did_batch_change = next_batch != curr_batch;
+          if (work_tile_info.is_valid() && did_batch_change) {
+            curr_batch = next_batch;
+            if constexpr (IsGroupedGemmKernel) {
+              problem_shape_MNKL = append<4>(params.problem_shape.get_problem_shape(curr_batch), 1);
+            }
+            // Purpose of this pipeline state is to make sure TMA loads have finished before doing descriptor updates
+            // Since this state is waiting for loads to finish, it must start in the inverted phase.
+            typename CollectiveMainloop::PipelineState mainloop_pipe_tma_consumer_state =
+              {mainloop_pipe_producer_state.index(), !mainloop_pipe_producer_state.phase(), mainloop_pipe_producer_state.count()};
+            mainloop_pipeline.consumer_wait(mainloop_pipe_tma_consumer_state);
+            collective_mainloop.tensormaps_perform_update(
+              shared_storage.tensormaps.mainloop,
+              params.mainloop,
+              input_tensormaps,
+              problem_shape_MNKL,
+              curr_batch
+            );
+            // Ensure warp is converged before issuing tensormap fence release
+            __syncwarp();
+            // Entire warp must do this (i.e. it's aligned)
+            collective_mainloop.tensormaps_cp_fence_release(shared_storage.tensormaps.mainloop, input_tensormaps);
+          }
+          // Advance the producer state for the last remaining stage that was being waited for above
+          mainloop_pipe_producer_state.advance(1);
+        } // Scheduler work fetch loop
+
+        // Make sure all Consumer Warp Groups have been waited upon
+        collective_mainloop.load_tail(mainloop_pipeline, mainloop_pipe_producer_state);
+      } // Mainloop Producer Warp End
+
+      // Epilogue Producer Warp
+      else if (producer_warp_role == ProducerWarpRole::Epilogue && collective_epilogue.is_producer_load_needed()) {
+        int32_t const sm_idx = blockIdx.x + (blockIdx.y * gridDim.x);
+        int32_t const sm_count = params.hw_info.sm_count;
+
+        auto epi_load_tensormap = get<0>(collective_epilogue.load_init(params.epilogue, shared_storage.tensormaps.epilogue, sm_count, sm_idx));
+
+        bool did_batch_change = true;
+        constexpr bool IsEpiLoad = true;
+
+        if (work_tile_info.is_valid()) {
+          collective_epilogue.tensormaps_perform_update<IsEpiLoad>(
+            shared_storage.tensormaps.epilogue,
+            params.epilogue,
+            epi_load_tensormap,
+            problem_shape_MNKL,
+            work_tile_info.L_idx,
+            0
+          );
+
+          // Converge before issuing tensormap fence release since fence is aligned
+          __syncwarp();
+          collective_epilogue.tensormaps_cp_fence_release<IsEpiLoad>(shared_storage.tensormaps.epilogue, epi_load_tensormap, 0);
+        }
+
+        load_order_barrier.wait();
+
+        while (work_tile_info.is_valid()) {
+          int32_t curr_batch = work_tile_info.L_idx;
+
+          // Get next work tile
+          auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work(work_tile_info);
+
+          if (TileScheduler::compute_epilogue(work_tile_info, params.scheduler)) {
+            if constexpr (IsGroupedGemmKernel) {
+              problem_shape_MNKL = append<4>(params.problem_shape.get_problem_shape(work_tile_info.L_idx), 1);
+            }
+
+            // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape
+            auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl));
+            auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl));
+            auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl));
+            auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
+
+            if (did_batch_change) {
+              collective_epilogue.tensormaps_fence_acquire<IsEpiLoad>(epi_load_tensormap);
+            }
+
+            bool wait = work_tile_info.is_valid() && curr_batch != next_work_tile_info.L_idx;
+
+            epi_load_pipe_producer_state = collective_epilogue.load(
+              epi_load_pipeline,
+              epi_load_pipe_producer_state,
+              problem_shape_MNKL,
+              blk_shape,
+              blk_coord,
+              tiled_mma,
+              lane_idx,
+              shared_storage.tensors.epilogue,
+              epi_load_tensormap,
+              work_tile_info.reduction_subtile_idx(),
+              wait
+            );
+          }
+
+          work_tile_info = next_work_tile_info;
+          did_batch_change = curr_batch != work_tile_info.L_idx;
+
+          if (work_tile_info.is_valid() && did_batch_change) {
+            if constexpr (IsGroupedGemmKernel) {
+              problem_shape_MNKL = append<4>(params.problem_shape.get_problem_shape(work_tile_info.L_idx), 1);
+            }
+
+            // tensormap update
+            {
+              collective_epilogue.tensormaps_perform_update<IsEpiLoad>(
+                shared_storage.tensormaps.epilogue,
+                params.epilogue,
+                epi_load_tensormap,
+                problem_shape_MNKL,
+                work_tile_info.L_idx,
+                0
+              );
+
+              // Converge before issuing tensormap fence release since fence is aligned
+              __syncwarp();
+              collective_epilogue.tensormaps_cp_fence_release<IsEpiLoad>(shared_storage.tensormaps.epilogue, epi_load_tensormap, 0);
+            }
+          }
+
+        } // Scheduler work fetch loop
+
+        // Make sure all Consumer Warp Groups have been waited upon
+        collective_epilogue.load_tail(epi_load_pipeline, epi_load_pipe_producer_state);
+      } // Epilogue Producer Warp End
+    } // Producer Warp Group End
+
+    else if (warp_group_role == WarpGroupRole::Consumer0 || warp_group_role == WarpGroupRole::Consumer1) {
+      cutlass::arch::warpgroup_reg_alloc<MmaRegisterRequirement>();
+
+      // Index of warp group within consumer warp groups
+      int consumer_warp_group_idx = warp_group_role == WarpGroupRole::Consumer0 ? 0 : 1;
+
+      int32_t const sm_idx = blockIdx.x + (blockIdx.y * gridDim.x);
+      int32_t const sm_count = params.hw_info.sm_count;
+      // Do we potentially issue tail arrives for TMA stores, if epilogue load is waiting for it
+      bool do_store_tail = false;
+      // Get a copy of tensormaps
+      auto epi_store_tensormap = get<0>(collective_epilogue.store_init(params.epilogue, shared_storage.tensormaps.epilogue, sm_count, sm_idx, consumer_warp_group_idx));
+
+      bool did_batch_change = true;
+      constexpr bool IsEpiLoad = false;
+
+      if (work_tile_info.is_valid()) {
+
+        if (warp_idx_in_warp_group == 0) {
+          collective_epilogue.tensormaps_perform_update<IsEpiLoad>(
+            shared_storage.tensormaps.epilogue,
+            params.epilogue,
+            epi_store_tensormap,
+            problem_shape_MNKL,
+            work_tile_info.L_idx,
+            consumer_warp_group_idx
+          );
+
+          // Converge before issuing tensormap fence release since fence is aligned
+          __syncwarp();
+          collective_epilogue.tensormaps_cp_fence_release<IsEpiLoad>(shared_storage.tensormaps.epilogue, 
+                                                                     epi_store_tensormap,
+                                                                     consumer_warp_group_idx);
+        }
+      }
+
+      while (work_tile_info.is_valid()) {
+        if constexpr (IsGroupedGemmKernel) {
+          problem_shape_MNKL = append<4>(params.problem_shape.get_problem_shape(work_tile_info.L_idx), 1);
+        }
+
+        int32_t curr_batch = work_tile_info.L_idx;
+
+        // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape
+        auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl));
+        auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl));
+        auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl));
+        auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
+        auto work_k_tile_count = TileScheduler::get_work_k_tile_count(work_tile_info, problem_shape_MNKL, blk_shape);
+
+        // Allocate the accumulators for the (M,N) blk_shape
+        //
+        // MSVC CTAD breaks if we say "Tensor" here, so we use "auto" instead.
+        auto accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape));               // (MMA,MMA_M,MMA_N)
+
+        static_assert(cute::is_any_of_v<TileScheduler,
+            detail::PersistentTileSchedulerSm90Group<ProblemShape>,
+            detail::PersistentTileSchedulerSm90>);
+        if (TileScheduler::valid_warpgroup_in_work_tile(work_tile_info)) {
+
+          collective_mainloop.mma(
+            mainloop_pipeline,
+            mainloop_pipe_consumer_state,
+            accumulators,
+            work_k_tile_count,
+            mma_thread_idx,
+            shared_storage.tensors.mainloop,
+            params.mainloop
+          );
+
+          // Make sure the math instructions are done and free buffers before entering the epilogue
+          collective_mainloop.mma_tail(
+            mainloop_pipeline,
+            mainloop_pipe_consumer_state,
+            work_k_tile_count
+          );
+
+          // Update starting mainloop pipeline state for the next tile
+          mainloop_pipe_consumer_state.advance(work_k_tile_count);
+        }
+
+        // Perform reduction across splits, if needed
+        TileScheduler::fixup(
+          params.scheduler, work_tile_info, accumulators, NumMmaWarpGroups, consumer_warp_group_idx);
+
+        if (did_batch_change) {
+          collective_epilogue.tensormaps_fence_acquire<IsEpiLoad>(epi_store_tensormap);
+        }
+
+        if (TileScheduler::compute_epilogue(work_tile_info, params.scheduler)) {
+
+          // Epilogue and write to gD
+          auto [epi_load_pipe_consumer_state_next, epi_store_pipe_producer_state_next] =
+          collective_epilogue.store(
+            epi_load_pipeline,
+            epi_load_pipe_consumer_state,
+            epi_store_pipeline,
+            epi_store_pipe_producer_state,
+            problem_shape_MNKL,
+            blk_shape,
+            blk_coord,
+            accumulators,
+            tiled_mma,
+            mma_thread_idx,
+            shared_storage.tensors.epilogue,
+            epi_store_tensormap,
+            work_tile_info.reduction_subtile_idx()
+          );
+
+          epi_load_pipe_consumer_state = epi_load_pipe_consumer_state_next;
+          epi_store_pipe_producer_state = epi_store_pipe_producer_state_next;
+          do_store_tail = true;
+        }
+
+        // Get next work tile
+        auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work(work_tile_info);
+        work_tile_info = next_work_tile_info;
+
+        did_batch_change = curr_batch != work_tile_info.L_idx;
+        if (work_tile_info.is_valid() && did_batch_change) {
+          if constexpr (IsGroupedGemmKernel) {
+            problem_shape_MNKL = append<4>(params.problem_shape.get_problem_shape(work_tile_info.L_idx), 1);
+          }
+          if (warp_idx_in_warp_group == 0) {
+            collective_epilogue.tensormaps_perform_update<IsEpiLoad>(
+              shared_storage.tensormaps.epilogue,
+              params.epilogue,
+              epi_store_tensormap,
+              problem_shape_MNKL,
+              work_tile_info.L_idx,
+              consumer_warp_group_idx
+            );
+
+            // Converge before issuing tensormap fence release since fence is aligned
+            __syncwarp();
+            collective_epilogue.tensormaps_cp_fence_release<IsEpiLoad>(shared_storage.tensormaps.epilogue,
+                                                                       epi_store_tensormap,
+                                                                       consumer_warp_group_idx);
+          }
+        }
+
+      } // Scheduler work fetch loop
+
+      // Cooperative only needs TMA to complete at the very end of the kernel
+      if (do_store_tail) {
+        collective_epilogue.store_tail(
+          epi_load_pipeline,
+          epi_load_pipe_consumer_state,
+          epi_store_pipeline,
+          epi_store_pipe_producer_state
+        );
+      }
+    } // Consumer Warp Groups End
+#endif
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::gemm::kernel
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_pingpong.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_pingpong.hpp
new file mode 100755
index 000000000..386337641
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_pingpong.hpp
@@ -0,0 +1,946 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/workspace.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/kernel_hardware_info.hpp"
+#include "cute/arch/cluster_sm90.hpp"
+#include "cutlass/arch/reg_reconfig.h"
+#include "cutlass/arch/mma_sm90.h"
+#include "cutlass/epilogue/collective/detail.hpp"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/kernel/gemm_universal_decl.h"
+#include "cutlass/gemm/kernel/tile_scheduler.hpp"
+#include "cutlass/gemm/group_array_problem_shape.hpp"
+#include "cutlass/pipeline/pipeline.hpp"
+#include "cute/tensor.hpp"
+#include "cutlass/trace.h"
+#include "cutlass/gemm/kernel/sm90_tile_scheduler.hpp"
+#include "cutlass/gemm/kernel/sm90_tile_scheduler_group.hpp"
+
+///////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::kernel {
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <
+  class ProblemShape_,
+  class CollectiveMainloop_,
+  class CollectiveEpilogue_,
+  class TileScheduler_
+>
+class GemmUniversal<
+  ProblemShape_,
+  CollectiveMainloop_,
+  CollectiveEpilogue_,
+  TileScheduler_,
+  cute::enable_if_t<cute::is_base_of_v<KernelPtrArrayTmaWarpSpecializedPingpong, typename CollectiveMainloop_::DispatchPolicy::Schedule>>
+>
+{
+public:
+  //
+  // Type Aliases
+  //
+  using ProblemShape = ProblemShape_;
+  static_assert(rank(typename ProblemShape::UnderlyingProblemShape{}) == 3 or rank(typename ProblemShape::UnderlyingProblemShape{}) == 4,
+    "ProblemShape{} should be <M,N,K> or <M,N,K,L>");
+
+  static_assert(cute::is_base_of_v<KernelPtrArrayTmaWarpSpecializedPingpong, typename CollectiveMainloop_::DispatchPolicy::Schedule>);
+
+  static constexpr bool IsGdcEnabled = false;
+
+  // Mainloop derived types
+  using CollectiveMainloop = CollectiveMainloop_;
+  using TileShape = typename CollectiveMainloop::TileShape;
+  using TiledMma  = typename CollectiveMainloop::TiledMma;
+  using ArchTag   = typename CollectiveMainloop::ArchTag;
+  using ElementA  = typename CollectiveMainloop::ElementA;
+  using StrideA   = typename CollectiveMainloop::StrideA;
+  using InternalStrideA = typename CollectiveMainloop::InternalStrideA;
+  using ElementB  = typename CollectiveMainloop::ElementB;
+  using InternalStrideB = typename CollectiveMainloop::InternalStrideB;
+  using StrideB   = typename CollectiveMainloop::StrideB;
+  using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy;
+  using Schedule = typename DispatchPolicy::Schedule;
+  using ElementAccumulator = typename CollectiveMainloop::ElementAccumulator;
+  using ClusterShape = typename DispatchPolicy::ClusterShape;
+  using MainloopArguments = typename CollectiveMainloop::Arguments;
+  using MainloopParams = typename CollectiveMainloop::Params;
+
+  // Epilogue derived types
+  using CollectiveEpilogue = CollectiveEpilogue_;
+  using ElementC = typename CollectiveEpilogue::ElementC;
+  using StrideC  = typename CollectiveEpilogue::StrideC;
+  using InternalStrideC = typename CollectiveEpilogue::InternalStrideC;
+  using ElementD = typename CollectiveEpilogue::ElementD;
+  using StrideD  = typename CollectiveEpilogue::StrideD;
+  using InternalStrideD = typename CollectiveEpilogue::InternalStrideD;
+  using EpilogueArguments = typename CollectiveEpilogue::Arguments;
+  using EpilogueParams = typename CollectiveEpilogue::Params;
+
+  static_assert(ArchTag::kMinComputeCapability >= 90);
+  static_assert(cute::is_void_v<TileScheduler_>,
+    "Ptr-Array Pingpong and Grouped Gemm Pingpong kernel only supports the default scheduler.");
+
+  static constexpr bool IsGroupedGemmKernel = !cute::is_same_v<InternalStrideA, StrideA>; // true when StrideA and InternalStrideA are different types
+
+  using TileScheduler = cute::conditional_t<IsGroupedGemmKernel, // scheduler type is chosen at compile time from IsGroupedGemmKernel
+    typename detail::TileSchedulerSelector<
+      GroupScheduler, ArchTag,
+      TileShape, ClusterShape,
+      ProblemShape>::Scheduler, // grouped case: selected with the GroupScheduler tag and the ProblemShape type
+    typename detail::TileSchedulerSelector<
+    void, ArchTag, TileShape, ClusterShape>::Scheduler>; // otherwise: selected with the void (default) tag
+  using TileSchedulerArguments = typename TileScheduler::Arguments;
+  using TileSchedulerParams = typename TileScheduler::Params;
+
+  static constexpr uint32_t NumLoadWarpGroups = 1;
+  static constexpr uint32_t NumMmaWarpGroups = 2;
+  static constexpr uint32_t MaxThreadsPerBlock = CUTE_STATIC_V(size(TiledMma{})) + (NumMmaWarpGroups * NumThreadsPerWarpGroup); // NOTE(review): NumLoadWarpGroups is not used here; size(TiledMma{}) supplies the first term - confirm intended
+  static constexpr uint32_t MinBlocksPerMultiprocessor = 1;
+
+  /// Register requirements for the load and math (MMA) warp groups
+  static constexpr uint32_t LoadRegisterRequirement = 40;
+  static constexpr uint32_t MmaRegisterRequirement = 232;
+
+  // 1 stage ordered sequence between mainloop and epilogue producer load threads
+  using LoadWarpOrderBarrier = cutlass::OrderedSequenceBarrier<1,2>;
+
+  // Ordered sequence barrier with two stages: one for Mainloop and one for Epilogue
+  static constexpr uint32_t StagesPerMathWarpGroup = 2;
+  using MathWarpGroupOrderBarrier = cutlass::OrderedSequenceBarrier<StagesPerMathWarpGroup, NumMmaWarpGroups>;
+  using MathWarpGroupOrderBarrierSharedStorage = cutlass::PipelineDetail::OrderedSequenceBarrierSharedStorage< // shared storage type matching MathWarpGroupOrderBarrier's depth and length
+      MathWarpGroupOrderBarrier::SequenceDepth,
+      MathWarpGroupOrderBarrier::SequenceLength>;
+
+  // Kernel level shared memory storage: collective tensor storage, pipeline/barrier storage and tensormap storage
+  struct SharedStorage {
+    struct TensorStorage : cute::aligned_struct<128, _1> { // tensor storage of the mainloop and epilogue collectives
+      using MainloopTensorStorage = typename CollectiveMainloop::TensorStorage;
+      using EpilogueTensorStorage = typename CollectiveEpilogue::TensorStorage;
+
+      MainloopTensorStorage mainloop;
+      EpilogueTensorStorage epilogue;
+    } tensors;
+
+    struct PipelineStorage : cute::aligned_struct<16, _1> { // pipeline and ordering-barrier storage; every member is alignas(16)
+      using MainloopPipelineStorage = typename CollectiveMainloop::PipelineStorage;
+      using EpiLoadPipelineStorage = typename CollectiveEpilogue::PipelineStorage;
+      using MathWarpGroupOrderBarrierStorage = MathWarpGroupOrderBarrierSharedStorage;
+
+      alignas(16) MainloopPipelineStorage mainloop;
+      alignas(16) EpiLoadPipelineStorage epi_load;
+      alignas(16) typename LoadWarpOrderBarrier::SharedStorage load_order; // storage for the mainloop / epilogue load ordering barrier
+      alignas(16) MathWarpGroupOrderBarrierStorage math_wg_order; // storage for the math warp group ordering barrier
+    } pipelines;
+
+    struct TensorMapStorage : cute::aligned_struct<128, _1> { // tensormap storage of both collectives; every member is alignas(128)
+      using MainloopTensorMapStorage = typename CollectiveMainloop::TensorMapStorage;
+      using EpilogueTensorMapStorage = typename CollectiveEpilogue::TensorMapStorage;
+
+      alignas(128) MainloopTensorMapStorage mainloop;
+      alignas(128) EpilogueTensorMapStorage epilogue;
+    } tensormaps;
+  };
+
+  static constexpr int SharedStorageSize = sizeof(SharedStorage);
+
+  // Device side arguments: the input that to_underlying_arguments() converts into Params
+  struct Arguments {
+    GemmUniversalMode mode{}; // forwarded unchanged into Params
+    ProblemShape problem_shape{}; // forwarded into Params; also used to size the collective workspaces
+    MainloopArguments mainloop{}; // converted by CollectiveMainloop::to_underlying_arguments()
+    EpilogueArguments epilogue{}; // arguments of the CollectiveEpilogue
+    KernelHardwareInfo hw_info{}; // sm_count <= 0 makes to_underlying_arguments() query the device for the SM count
+    TileSchedulerArguments scheduler{}; // converted by TileScheduler::to_underlying_arguments()
+  };
+
+  // Kernel entry point API: returned by to_underlying_arguments(), which brace-initializes members in declaration order
+  struct Params {
+    GemmUniversalMode mode{}; // copy of Arguments::mode
+    ProblemShape problem_shape{}; // copy of Arguments::problem_shape
+    MainloopParams mainloop{}; // result of CollectiveMainloop::to_underlying_arguments()
+    EpilogueParams epilogue{}; // params of the CollectiveEpilogue
+    KernelHardwareInfo hw_info{}; // hardware info (device id, SM count)
+    TileSchedulerParams scheduler{}; // params of the selected TileScheduler
+    void* workspace{nullptr}; // workspace pointer; null by default
+  };
+
+  //
+  // Methods
+  //
+
+  // Convert Arguments to Params: resolve the SM count, partition the workspace, and convert the scheduler and collective arguments.
+  static
+  Params
+  to_underlying_arguments(Arguments const& args, void* workspace) {
+    CUTLASS_TRACE_HOST("to_underlying_arguments():");
+
+    ProblemShape problem_shapes = args.problem_shape;
+
+    // Get SM count if needed, otherwise use user supplied SM count
+    int sm_count = args.hw_info.sm_count;
+    if (sm_count <= 0) {
+      CUTLASS_TRACE_HOST("  WARNING: Arguments do not include a valid SM count.\n"
+          "  For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count.");
+      sm_count = KernelHardwareInfo::query_device_multiprocessor_count(args.hw_info.device_id);
+    }
+
+    CUTLASS_TRACE_HOST("to_underlying_arguments(): Setting persistent grid SM count to " << sm_count);
+
+    KernelHardwareInfo hw_info{args.hw_info.device_id, sm_count};
+
+    // Calculate workspace pointers
+    uint8_t* workspace_ptr = reinterpret_cast<uint8_t*>(workspace);
+    size_t workspace_offset = 0;
+
+    void* scheduler_workspace = workspace_ptr;
+    workspace_offset += TileScheduler::template get_workspace_size<typename ProblemShape::UnderlyingProblemShape, ElementAccumulator>(
+      args.scheduler, typename ProblemShape::UnderlyingProblemShape{}, args.hw_info, NumMmaWarpGroups);
+    workspace_offset = round_nearest(workspace_offset,  MinWorkspaceAlignment);
+
+    void* epilogue_workspace = workspace_ptr + workspace_offset;
+    workspace_offset += CollectiveEpilogue::get_workspace_size(problem_shapes, args.epilogue, sm_count);
+    workspace_offset = round_nearest(workspace_offset,  MinWorkspaceAlignment);
+
+    void* mainloop_workspace = workspace_ptr + workspace_offset;
+    workspace_offset += CollectiveMainloop::get_workspace_size(problem_shapes, args.mainloop, sm_count);
+    workspace_offset = round_nearest(workspace_offset,  MinWorkspaceAlignment);
+
+    // Precompute the sub tiles numbers in epilogue, pass into tile scheduler.  Therefore it will be used
+    // in separate reduction scheme for streamk case, NumEpilogueSubTiles default value is 1, which means
+    // subtile will not be used, therefore separate reduction will not be enabled.
+    constexpr uint32_t NumEpilogueSubTiles = CollectiveEpilogue::get_store_pipe_increment(TileShape{});
+    TileSchedulerParams scheduler;
+    if constexpr (IsGroupedGemmKernel) {
+      scheduler = TileScheduler::to_underlying_arguments(
+      problem_shapes, TileShape{}, ClusterShape{}, hw_info, args.scheduler, scheduler_workspace, NumEpilogueSubTiles);
+    }
+    else {
+      scheduler = TileScheduler::to_underlying_arguments(
+      problem_shapes.get_host_problem_shape(), TileShape{}, ClusterShape{}, hw_info, args.scheduler, scheduler_workspace, NumEpilogueSubTiles);
+    }
+
+    return {
+      args.mode,
+      problem_shapes,
+      CollectiveMainloop::to_underlying_arguments(problem_shapes, args.mainloop, mainloop_workspace),
+      CollectiveEpilogue::to_underlying_arguments(problem_shapes, args.epilogue, epilogue_workspace),
+      hw_info,
+      scheduler,
+      workspace
+    };
+  }
+
+  static bool
+  can_implement(Arguments const& args) {
+    bool mode_and_rank_ok = false;  // gate: the GEMM mode and the problem-shape rank must match the kernel flavour
+    if constexpr (IsGroupedGemmKernel) {
+      // Group GEMM currently only supports rank-3 problem shapes
+      mode_and_rank_ok = (args.mode == GemmUniversalMode::kGrouped && rank(typename ProblemShape::UnderlyingProblemShape{}) == 3);
+    } else {
+      mode_and_rank_ok = (args.mode == GemmUniversalMode::kArray && rank(typename ProblemShape::UnderlyingProblemShape{}) == 4);
+    }
+    if (!mode_and_rank_ok) {
+      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Arguments or Problem Shape don't meet the requirements for Ptr Array Gemm or Grouped Gemm.\n");
+      return false;
+    }
+    bool const mainloop_ok  = CollectiveMainloop::can_implement(args.problem_shape, args.mainloop);  // all three checks always run
+    bool const epilogue_ok  = CollectiveEpilogue::can_implement(args.problem_shape, args.epilogue);
+    bool const scheduler_ok = TileScheduler::can_implement(args.scheduler);
+    return mainloop_ok && epilogue_ok && scheduler_ok;
+  }
+
+  static size_t
+  get_workspace_size(Arguments const& args) {
+    using UnderlyingProblemShape = typename ProblemShape::UnderlyingProblemShape;
+    constexpr uint32_t NumEpilogueSubTiles = CollectiveEpilogue::get_store_pipe_increment(TileShape{});
+
+    // Running total in bytes; each region (scheduler, epilogue, mainloop) is padded up to MinWorkspaceAlignment.
+    size_t total_bytes = 0;
+    auto append_region = [&total_bytes] (size_t region_bytes) {
+      total_bytes = round_nearest(total_bytes + region_bytes, MinWorkspaceAlignment);
+    };
+    append_region(TileScheduler::template get_workspace_size<UnderlyingProblemShape, ElementAccumulator>(
+      args.scheduler, UnderlyingProblemShape{}, args.hw_info, NumMmaWarpGroups, NumEpilogueSubTiles));
+
+    // Fall back to querying the device when the caller did not supply a usable SM count
+    int sm_count = args.hw_info.sm_count;
+    if (sm_count <= 0) {
+      CUTLASS_TRACE_HOST("  WARNING: Arguments do not include a valid SM count.\n"
+          "  For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count.");
+      sm_count = KernelHardwareInfo::query_device_multiprocessor_count(args.hw_info.device_id);
+    }
+
+    append_region(CollectiveEpilogue::get_workspace_size(args.problem_shape, args.epilogue, sm_count));
+    append_region(CollectiveMainloop::get_workspace_size(args.problem_shape, args.mainloop, sm_count));
+    return total_bytes;
+  }
+
+  static cutlass::Status
+  initialize_workspace(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    // Initializes the scheduler, epilogue and mainloop workspace regions in that order and returns the first failing status.
+    Status status = Status::kSuccess;
+    uint8_t* workspace_ptr = reinterpret_cast<uint8_t*>(workspace);
+    size_t workspace_offset = 0;
+    constexpr uint32_t NumEpilogueSubTiles = CollectiveEpilogue::get_store_pipe_increment(TileShape{});
+    static constexpr uint32_t NumAccumulatorMtxs = 1;
+    // Resolve the SM count the same way get_workspace_size() and to_underlying_arguments() do, so region offsets agree
+    int const sm_count = (args.hw_info.sm_count > 0) ? args.hw_info.sm_count
+                       : KernelHardwareInfo::query_device_multiprocessor_count(args.hw_info.device_id);
+
+    status = TileScheduler::template initialize_workspace<typename ProblemShape::UnderlyingProblemShape, ElementAccumulator>(
+      args.scheduler, workspace_ptr + workspace_offset, stream, typename ProblemShape::UnderlyingProblemShape{}, args.hw_info, NumMmaWarpGroups, NumEpilogueSubTiles, NumAccumulatorMtxs, cuda_adapter);
+    workspace_offset += TileScheduler::template get_workspace_size<typename ProblemShape::UnderlyingProblemShape, ElementAccumulator>(
+      args.scheduler, typename ProblemShape::UnderlyingProblemShape{}, args.hw_info, NumMmaWarpGroups, NumEpilogueSubTiles);
+    workspace_offset = round_nearest(workspace_offset,  MinWorkspaceAlignment);
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    status = CollectiveEpilogue::initialize_workspace(args.problem_shape, args.epilogue, workspace_ptr + workspace_offset, stream, cuda_adapter);
+    workspace_offset += CollectiveEpilogue::get_workspace_size(args.problem_shape, args.epilogue, sm_count);
+    workspace_offset = round_nearest(workspace_offset,  MinWorkspaceAlignment);
+    if (status != Status::kSuccess) {
+      return status;  // do not let the mainloop call below mask an epilogue failure
+    }
+
+    status = CollectiveMainloop::initialize_workspace(args.problem_shape, args.mainloop, workspace_ptr + workspace_offset, stream, cuda_adapter);
+    return status;
+  }
+
+  // Computes the kernel launch grid shape from the already-converted kernel Params
+  static dim3
+  get_grid_shape(Params const& params) {
+    // Given device SM count, set grid size s.t. we do not launch more thread blocks than we can run concurrently
+    TileSchedulerArguments scheduler_args{};
+    if constexpr (!std::is_const_v<decltype(scheduler_args.max_swizzle_size)>) {
+      scheduler_args.max_swizzle_size = 1 << params.scheduler.log_swizzle_size_;
+    }
+    bool const raster_along_n = (params.scheduler.raster_order_ == TileScheduler::RasterOrder::AlongN);
+    scheduler_args.raster_order = raster_along_n ? TileScheduler::RasterOrderOptions::AlongN : TileScheduler::RasterOrderOptions::AlongM;
+    if constexpr (IsGroupedGemmKernel) {
+      return TileScheduler::get_grid_shape(params.scheduler, params.problem_shape, TileShape{}, ClusterShape{}, params.hw_info, scheduler_args);
+    }
+    else {
+      // Non-grouped kernels hand the scheduler the host-side problem shape
+      return TileScheduler::get_grid_shape(params.scheduler, params.problem_shape.get_host_problem_shape(), TileShape{}, ClusterShape{}, params.hw_info, scheduler_args);
+    }
+  }
+
+  static dim3
+  get_block_shape() {
+    return dim3(MaxThreadsPerBlock);  // one-dimensional thread block; dim3's y and z components default to 1
+  }
+
+  CUTLASS_DEVICE
+  void
+  operator()(Params const& params, char* smem_buf) {
+    using namespace cute;
+    using X = Underscore;
+    // Kernel body: warp group 0 acts as the load producer, warp groups 1 and 2 are the two math consumers that alternate work tiles.
+// Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a.
+#if ! defined(__CUDA_ARCH_FEAT_SM90_ALL)
+    printf("ERROR : Arch conditional MMA instruction used without targeting sm90a compute capability. Aborting.\n");
+#else
+
+    // Preconditions
+    static_assert(size(TiledMma{}) == 128, "Pingpong kernel must have TiledMMA operating using 128 threads.");
+    static_assert(NumMmaWarpGroups == 2, "Pingpong kernels currently only support NumMmaWarpGroups == 2");
+
+    if constexpr (cutlass::epilogue::collective::detail::sm90_is_ptr_array_tma_dispatch_policy_v<typename CollectiveEpilogue::DispatchPolicy>) {
+      static_assert(NumMmaWarpGroups == CollectiveEpilogue::NumEpilogueWarpGroups,
+                    "Tiled MmA does not match expected warp groups performing the epilogue");
+    }
+
+    static_assert(cute::rank(InternalStrideA{}) == 3, "StrideA must be rank-3: [M, K, L]. If batch mode is not needed, set L stride to Int<0>.");
+    static_assert(cute::rank(InternalStrideB{}) == 3, "StrideB must be rank-3: [N, K, L]. If batch mode is not needed, set L stride to Int<0>.");
+    static_assert(cute::rank(InternalStrideC{}) == 3, "StrideC must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
+    static_assert(cute::rank(InternalStrideD{}) == 3, "StrideD must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
+
+    enum class WarpGroupRole {
+      Producer = 0,
+      Consumer0 = 1,
+      Consumer1 = 2
+    };
+    enum class ProducerWarpRole {
+      Mainloop = 0,
+      Warp1 = 1,  // not given any work in this kernel body
+      Epilogue = 2,
+      Warp3 = 3  // not given any work in this kernel body
+    };
+
+    // Kernel level shared memory storage
+    SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(smem_buf);
+
+    int thread_idx = int(threadIdx.x);
+    int lane_idx = canonical_lane_idx();
+    int warp_idx = canonical_warp_idx_sync();
+    int warp_idx_in_warp_group = warp_idx % NumWarpsPerWarpGroup;
+    int warp_group_thread_idx = thread_idx % NumThreadsPerWarpGroup;
+    int mma_thread_idx = thread_idx % size(TiledMma{});
+    auto warp_group_idx = canonical_warp_group_idx();
+    auto warp_group_role = WarpGroupRole(warp_group_idx);
+    auto producer_warp_role = ProducerWarpRole(warp_idx_in_warp_group);
+    int lane_predicate = cute::elect_one_sync();  // NOTE(review): value is not read again in this function
+    uint32_t block_rank_in_cluster = cute::block_rank_in_cluster();
+
+    // Note: Tma Descriptor Prefetch (from either const or param) is not applicable here
+
+    // Mainloop Load pipeline
+    using MainloopPipeline = typename CollectiveMainloop::MainloopPipeline;
+    typename MainloopPipeline::Params mainloop_pipeline_params;
+    if (warp_group_role == WarpGroupRole::Producer && producer_warp_role == ProducerWarpRole::Mainloop) {
+      mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Producer;
+    }
+    if (warp_group_role == WarpGroupRole::Consumer0 || warp_group_role == WarpGroupRole::Consumer1) {
+      mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Consumer;
+    }
+    mainloop_pipeline_params.is_leader = warp_group_thread_idx == 0;
+    mainloop_pipeline_params.num_consumers = NumThreadsPerWarpGroup;
+    mainloop_pipeline_params.transaction_bytes = params.mainloop.tma_transaction_bytes;
+    MainloopPipeline mainloop_pipeline(shared_storage.pipelines.mainloop, mainloop_pipeline_params, ClusterShape{});
+
+    // Epilogue Load pipeline
+    using EpiLoadPipeline = typename CollectiveEpilogue::LoadPipeline;
+    typename EpiLoadPipeline::Params epi_load_pipeline_params;
+    if (warp_group_role == WarpGroupRole::Producer && producer_warp_role == ProducerWarpRole::Epilogue) {
+      epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Producer;
+    }
+    if (warp_group_role == WarpGroupRole::Consumer0 || warp_group_role == WarpGroupRole::Consumer1) {
+      epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Consumer;
+    }
+    epi_load_pipeline_params.dst_blockid = cute::block_rank_in_cluster();
+    epi_load_pipeline_params.producer_arv_count = NumThreadsPerWarp;
+    epi_load_pipeline_params.consumer_arv_count = NumThreadsPerWarpGroup;
+    if constexpr (CollectiveEpilogue::RequiresTransactionBytes) {
+      epi_load_pipeline_params.transaction_bytes = params.epilogue.tma_transaction_bytes;
+    }
+    EpiLoadPipeline epi_load_pipeline(shared_storage.pipelines.epi_load, epi_load_pipeline_params);
+
+    // Epilogue Store pipeline
+    using EpiStorePipeline = typename CollectiveEpilogue::StorePipeline;
+    typename EpiStorePipeline::Params epi_store_pipeline_params;
+    epi_store_pipeline_params.always_wait = true;
+    EpiStorePipeline epi_store_pipeline(epi_store_pipeline_params);
+
+    typename LoadWarpOrderBarrier::Params params_load_order_barrier;
+    params_load_order_barrier.group_id = producer_warp_role == ProducerWarpRole::Mainloop ? 0 : 1;
+    params_load_order_barrier.group_size = NumThreadsPerWarp;
+    LoadWarpOrderBarrier load_order_barrier(shared_storage.pipelines.load_order, params_load_order_barrier);
+
+    typename MathWarpGroupOrderBarrier::Params params_math_wg_order_barrier;
+    // DMA Load WG will not participate in these Ordered Barrier syncs
+    params_math_wg_order_barrier.group_id = warp_group_idx - static_cast<int>(WarpGroupRole::Consumer0);
+    params_math_wg_order_barrier.group_size = NumThreadsPerWarpGroup; // Number of threads / participants in a group
+    MathWarpGroupOrderBarrier math_wg_order_barrier(shared_storage.pipelines.math_wg_order, params_math_wg_order_barrier);
+
+    // Initialize starting pipeline states for the collectives
+    // Epilogue store pipe is producer-only (consumer is TMA unit, waits via scoreboarding)
+    typename CollectiveMainloop::PipelineState mainloop_pipe_consumer_state;
+    typename CollectiveEpilogue::LoadPipelineState epi_load_pipe_consumer_state;
+
+    // For the DMA Load (producer) we start with an opposite phase
+    // i.e., we skip all waits since we know that the buffer is indeed empty
+    PipelineState mainloop_pipe_producer_state = cutlass::make_producer_start_state<MainloopPipeline>();
+    PipelineState epi_load_pipe_producer_state = cutlass::make_producer_start_state<EpiLoadPipeline>();
+    PipelineState epi_store_pipe_producer_state = cutlass::make_producer_start_state<EpiStorePipeline>();
+
+    auto cluster_wait_fn = [] () {
+      // We need this to guarantee that the Pipeline init is visible
+      // To all producers and consumer thread blocks in the Cluster
+      if constexpr (size(ClusterShape{}) > 1) {
+        cute::cluster_arrive_relaxed();
+        return [] () { cute::cluster_wait(); };
+      }
+      else {
+        __syncthreads();
+        return [] () {}; // do nothing
+      }
+    } ();
+
+    // Get the appropriate blocks for this thread block -- potential for thread block locality
+    TiledMma tiled_mma;
+    const auto blk_shape = TileShape{};                                                                // (BLK_M,BLK_N,BLK_K)
+    const auto c_tile_count = CollectiveEpilogue::get_load_pipe_increment(blk_shape);
+    const auto d_tile_count = CollectiveEpilogue::get_store_pipe_increment(blk_shape);
+
+    TileScheduler scheduler{params.scheduler};
+
+    // In a warp specialized kernel, collectives expose data movement and compute operations separately
+    CollectiveMainloop collective_mainloop;
+    CollectiveEpilogue collective_epilogue(params.epilogue, shared_storage.tensors.epilogue);
+
+    // Wait for all thread blocks in the Cluster
+    cluster_wait_fn();
+
+    auto work_tile_info = scheduler.initial_work_tile_info(ClusterShape{});
+    if (not work_tile_info.is_valid()) {
+      // When problem shapes are only on device, the grid launched may be larger than the total number of blocks across groups
+      return;
+    }
+
+    // Optionally append 1s until problem shape is rank-4 in case it is only rank-3 (MNK)
+    auto problem_shape_MNKL = append<4>(params.problem_shape.get_problem_shape(work_tile_info.L_idx), 1);
+
+    if (warp_group_role == WarpGroupRole::Consumer1) {
+      // Advance 2nd Math WG to the next work tile for the startup
+      const auto k_tile_count = TileScheduler::get_work_k_tile_count(work_tile_info, problem_shape_MNKL, blk_shape);
+
+      auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work(work_tile_info);
+      work_tile_info = next_work_tile_info;
+      if (!work_tile_info.is_valid()) {
+        return;
+      }
+
+      // Advance 2nd Math WG pipeline states to the end of 1st Math WG
+      mainloop_pipe_consumer_state.advance(k_tile_count);
+      epi_load_pipe_consumer_state.advance(c_tile_count);
+      epi_store_pipe_producer_state.advance(d_tile_count);
+
+      problem_shape_MNKL = append<4>(params.problem_shape.get_problem_shape(work_tile_info.L_idx), 1);
+    }
+
+    // Prepare and partition the input tensors. Expects a tuple of tensors where:
+    // get<0>(load_inputs) is the tma tensor A after local tiling so that it has shape (BLK_M,BLK_K,m,k,l)
+    // get<1>(load_inputs) is the tma tensor B after local tiling so that it has shape (BLK_N,BLK_K,n,k,l)
+    auto load_inputs = collective_mainloop.load_init(problem_shape_MNKL, params.mainloop);
+    static_assert(cute::tuple_size_v<decltype(load_inputs)> >= 2, "Output of load_init must have at least two elements (A, B)");
+
+    // Extract out partitioned A and B.
+    Tensor gA_mkl = get<0>(load_inputs);
+    Tensor gB_nkl = get<1>(load_inputs);
+
+    // Get pipeline stage increments from tensor shapes
+    auto k_tile_count = size<3>(gA_mkl);  // NOTE(review): not read below; per-tile counts come from TileScheduler::get_work_k_tile_count
+
+    if (warp_group_role == WarpGroupRole::Producer) {
+      cutlass::arch::warpgroup_reg_dealloc<LoadRegisterRequirement>();
+
+      // Mainloop Producer Warp
+      if (producer_warp_role == ProducerWarpRole::Mainloop) {
+        int32_t curr_batch = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl)); // Usually just returns work_tile_info.L_idx;
+        int32_t const mock_l_coord = 0;  // NOTE(review): presumably the batch is selected by the per-batch tensormap update rather than this coordinate -- confirm
+        int32_t const sm_idx = blockIdx.x + (blockIdx.y * gridDim.x);
+        int32_t const sm_count = params.hw_info.sm_count;
+
+        // Fetch a copy of tensormaps for the CTA
+        auto input_tensormaps = collective_mainloop.tensormaps_init(params.mainloop, shared_storage.tensormaps.mainloop, sm_count, sm_idx);
+
+        // Update tensormap for the initial batch for the CTA
+        if (work_tile_info.is_valid()) {
+          collective_mainloop.tensormaps_perform_update(
+            shared_storage.tensormaps.mainloop,
+            params.mainloop,
+            input_tensormaps,
+            problem_shape_MNKL,
+            curr_batch
+          );
+          // Ensure warp is converged before issuing tensormap fence release
+          __syncwarp();
+          // Entire warp must do this (i.e. it's aligned)
+          collective_mainloop.tensormaps_cp_fence_release(shared_storage.tensormaps.mainloop, input_tensormaps);
+        }
+
+        bool do_load_order_arrive = true;
+        bool did_batch_change = true;
+        while (work_tile_info.is_valid()) {
+          if (!TileScheduler::valid_warpgroup_in_work_tile(work_tile_info)) {
+            auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work(work_tile_info);
+            work_tile_info = next_work_tile_info;
+            continue;
+          }
+
+          // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape
+          auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl));
+          auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl));
+          auto blk_coord = make_coord(m_coord, n_coord, _, mock_l_coord);
+
+          // Get the number of K tiles to compute for this work as well as the starting K tile offset of the work.
+          auto work_k_tile_count = TileScheduler::get_work_k_tile_count(work_tile_info, problem_shape_MNKL, blk_shape);
+          auto work_k_tile_start = TileScheduler::get_work_k_tile_start(work_tile_info);
+          auto k_tile_iter = cute::make_coord_iterator(idx2crd(work_k_tile_start, shape<3>(gA_mkl)), shape<3>(gA_mkl));
+
+          if (did_batch_change) {
+            collective_mainloop.tensormaps_fence_acquire(input_tensormaps);
+          }
+
+          collective_mainloop.load(
+            params.mainloop,
+            mainloop_pipeline,
+            mainloop_pipe_producer_state,
+            load_inputs,
+            input_tensormaps,
+            blk_coord,
+            k_tile_iter, work_k_tile_count,
+            lane_idx,
+            block_rank_in_cluster,
+            shared_storage.tensors.mainloop
+          );
+          // Update starting pipeline state for the next tile
+          // Wait for the last TMA stage to complete loading, before issuing tensormap updates
+          mainloop_pipe_producer_state.advance(work_k_tile_count - 1);
+
+          // Signal for the epilogue load warp to begin
+          if (do_load_order_arrive) {
+            load_order_barrier.arrive();
+            do_load_order_arrive = false;
+          }
+
+          // Get next work tile
+          auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work(work_tile_info);
+          work_tile_info = next_work_tile_info;
+          auto next_batch = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl)); // Usually just returns work_tile_info.L_idx
+          did_batch_change = next_batch != curr_batch;
+          if (work_tile_info.is_valid() && did_batch_change) {
+            curr_batch = next_batch;
+            if constexpr (IsGroupedGemmKernel) {
+              problem_shape_MNKL = append<4>(params.problem_shape.get_problem_shape(curr_batch), 1);
+            }
+            // Purpose of this pipeline state is to make sure TMA loads have finished before doing descriptor updates
+            // Since this state is waiting for loads to finish, it must start in the inverted phase.
+            typename CollectiveMainloop::PipelineState mainloop_pipe_tma_consumer_state =
+              {mainloop_pipe_producer_state.index(), !mainloop_pipe_producer_state.phase(), mainloop_pipe_producer_state.count()};
+            mainloop_pipeline.consumer_wait(mainloop_pipe_tma_consumer_state);
+            collective_mainloop.tensormaps_perform_update(
+              shared_storage.tensormaps.mainloop,
+              params.mainloop,
+              input_tensormaps,
+              problem_shape_MNKL,
+              curr_batch
+            );
+            // Ensure warp is converged before issuing tensormap fence release
+            __syncwarp();
+            // Entire warp must do this (i.e. it's aligned)
+            collective_mainloop.tensormaps_cp_fence_release(shared_storage.tensormaps.mainloop, input_tensormaps);
+          }
+          // Advance the producer state for the last remaining stage that was being waited for above
+          mainloop_pipe_producer_state.advance(1);
+        } // Scheduler work fetch loop
+
+        // Make sure all Consumer Warp Groups have been waited upon
+        collective_mainloop.load_tail(mainloop_pipeline, mainloop_pipe_producer_state);
+      } // Mainloop Producer Warp End
+
+      // Epilogue Producer Warp
+      else if (producer_warp_role == ProducerWarpRole::Epilogue && collective_epilogue.is_producer_load_needed()) {
+        int32_t const sm_idx = blockIdx.x + (blockIdx.y * gridDim.x);
+        int32_t const sm_count = params.hw_info.sm_count;
+
+        auto epi_load_tensormap = get<0>(collective_epilogue.load_init(params.epilogue, shared_storage.tensormaps.epilogue, sm_count, sm_idx));
+
+        bool did_batch_change = true;
+        constexpr bool IsEpiLoad = true;
+
+        if (work_tile_info.is_valid()) {
+          collective_epilogue.tensormaps_perform_update<IsEpiLoad>(
+            shared_storage.tensormaps.epilogue,
+            params.epilogue,
+            epi_load_tensormap,
+            problem_shape_MNKL,
+            work_tile_info.L_idx,
+            0
+          );
+
+          // Converge before issuing tensormap fence release since fence is aligned
+          __syncwarp();
+          collective_epilogue.tensormaps_cp_fence_release<IsEpiLoad>(shared_storage.tensormaps.epilogue, epi_load_tensormap, 0);
+        }
+
+        load_order_barrier.wait();
+
+        while (work_tile_info.is_valid()) {
+          int32_t curr_batch = work_tile_info.L_idx;
+
+          // Get next work tile
+          auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work(work_tile_info);
+
+          if (TileScheduler::compute_epilogue(work_tile_info, params.scheduler)) {
+            if constexpr (IsGroupedGemmKernel) {
+              problem_shape_MNKL = append<4>(params.problem_shape.get_problem_shape(work_tile_info.L_idx), 1);
+            }
+
+            // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape
+            auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl));
+            auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl));
+            auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl));
+            auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
+
+            if (did_batch_change) {
+              collective_epilogue.tensormaps_fence_acquire<IsEpiLoad>(epi_load_tensormap);
+            }
+
+            bool wait = work_tile_info.is_valid() && curr_batch != next_work_tile_info.L_idx;  // work_tile_info is always valid here, so this reduces to "next tile has a different L_idx"
+
+            epi_load_pipe_producer_state = collective_epilogue.load(
+              epi_load_pipeline,
+              epi_load_pipe_producer_state,
+              problem_shape_MNKL,
+              blk_shape,
+              blk_coord,
+              tiled_mma,
+              lane_idx,
+              shared_storage.tensors.epilogue,
+              epi_load_tensormap,
+              work_tile_info.reduction_subtile_idx(),
+              wait
+            );
+          }
+
+          work_tile_info = next_work_tile_info;
+          did_batch_change = curr_batch != work_tile_info.L_idx;
+
+          if (work_tile_info.is_valid() && did_batch_change) {
+            if constexpr (IsGroupedGemmKernel) {
+              problem_shape_MNKL = append<4>(params.problem_shape.get_problem_shape(work_tile_info.L_idx), 1);
+            }
+
+            // tensormap update
+            {
+              collective_epilogue.tensormaps_perform_update<IsEpiLoad>(
+                shared_storage.tensormaps.epilogue,
+                params.epilogue,
+                epi_load_tensormap,
+                problem_shape_MNKL,
+                work_tile_info.L_idx,
+                0
+              );
+
+              // Converge before issuing tensormap fence release since fence is aligned
+              __syncwarp();
+              collective_epilogue.tensormaps_cp_fence_release<IsEpiLoad>(shared_storage.tensormaps.epilogue, epi_load_tensormap, 0);
+            }
+          }
+
+        } // Scheduler work fetch loop
+
+        // Make sure all Consumer Warp Groups have been waited upon
+        collective_epilogue.load_tail(epi_load_pipeline, epi_load_pipe_producer_state);
+      } // Epilogue Producer Warp End
+    } // Producer Warp Group End
+
+    else if (warp_group_role == WarpGroupRole::Consumer0 || warp_group_role == WarpGroupRole::Consumer1) {
+      cutlass::arch::warpgroup_reg_alloc<MmaRegisterRequirement>();
+
+      // Index of warp group within consumer warp groups
+      int consumer_warp_group_idx = warp_group_role == WarpGroupRole::Consumer0 ? 0 : 1;
+
+      int32_t const sm_idx = blockIdx.x + (blockIdx.y * gridDim.x);
+      int32_t const sm_count = params.hw_info.sm_count;
+      // Do we potentially issue tail arrives for TMA stores, if epilogue load is waiting for it
+      bool do_store_tail = false;  // NOTE(review): set to true below but never read in this function
+      // Get a copy of tensormaps
+      auto epi_store_tensormap = get<0>(collective_epilogue.store_init(params.epilogue, shared_storage.tensormaps.epilogue, sm_count, sm_idx, consumer_warp_group_idx));
+
+      bool did_batch_change = true;
+      constexpr bool IsEpiLoad = false;
+
+      if (work_tile_info.is_valid()) {
+
+        if (warp_idx_in_warp_group == 0) {
+          collective_epilogue.tensormaps_perform_update<IsEpiLoad>(
+            shared_storage.tensormaps.epilogue,
+            params.epilogue,
+            epi_store_tensormap,
+            problem_shape_MNKL,
+            work_tile_info.L_idx,
+            consumer_warp_group_idx
+          );
+
+          // Converge before issuing tensormap fence release since fence is aligned
+          __syncwarp();
+          collective_epilogue.tensormaps_cp_fence_release<IsEpiLoad>(shared_storage.tensormaps.epilogue,
+                                                                     epi_store_tensormap,
+                                                                     consumer_warp_group_idx);
+        }
+      }
+
+      while (work_tile_info.is_valid()) {
+        if constexpr (IsGroupedGemmKernel) {
+          problem_shape_MNKL = append<4>(params.problem_shape.get_problem_shape(work_tile_info.L_idx), 1);
+        }
+
+        int32_t curr_batch = work_tile_info.L_idx;
+
+        // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape
+        auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl));
+        auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl));
+        auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl));
+        auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
+        auto work_k_tile_count = TileScheduler::get_work_k_tile_count(work_tile_info, problem_shape_MNKL, blk_shape);
+
+        // Allocate the accumulators for the (M,N) blk_shape
+        //
+        // MSVC CTAD breaks if we say "Tensor" here, so we use "auto" instead.
+        auto accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape));               // (MMA,MMA_M,MMA_N)
+
+        static_assert(cute::is_any_of_v<TileScheduler,
+            detail::PersistentTileSchedulerSm90Group<ProblemShape>,
+            detail::PersistentTileSchedulerSm90>);
+        if (TileScheduler::valid_warpgroup_in_work_tile(work_tile_info)) {
+
+          math_wg_order_barrier.wait();
+
+          collective_mainloop.mma(
+            mainloop_pipeline,
+            mainloop_pipe_consumer_state,
+            accumulators,
+            work_k_tile_count,
+            mma_thread_idx,
+            shared_storage.tensors.mainloop,
+            params.mainloop
+          );
+
+          math_wg_order_barrier.arrive();
+
+          // Make sure the math instructions are done and free buffers before entering the epilogue
+          collective_mainloop.mma_tail(
+            mainloop_pipeline,
+            mainloop_pipe_consumer_state,
+            work_k_tile_count
+          );
+
+           math_wg_order_barrier.wait();  // second wait of the iteration; paired with the arrive() at the end of the loop body
+
+          // Update starting mainloop pipeline state for the next tile
+          mainloop_pipe_consumer_state.advance(work_k_tile_count);
+        }
+
+        // Perform reduction across splits, if needed
+        TileScheduler::fixup(
+          params.scheduler, work_tile_info, accumulators, NumMmaWarpGroups, consumer_warp_group_idx);
+
+        if (did_batch_change) {
+          collective_epilogue.tensormaps_fence_acquire<IsEpiLoad>(epi_store_tensormap);
+        }
+
+        if (TileScheduler::compute_epilogue(work_tile_info, params.scheduler)) {
+
+          // Epilogue and write to gD
+          auto [epi_load_pipe_consumer_state_next, epi_store_pipe_producer_state_next] =
+          collective_epilogue.store(
+            epi_load_pipeline,
+            epi_load_pipe_consumer_state,
+            epi_store_pipeline,
+            epi_store_pipe_producer_state,
+            problem_shape_MNKL,
+            blk_shape,
+            blk_coord,
+            accumulators,
+            tiled_mma,
+            mma_thread_idx,
+            shared_storage.tensors.epilogue,
+            epi_store_tensormap,
+            work_tile_info.reduction_subtile_idx()
+          );
+
+          epi_load_pipe_consumer_state = epi_load_pipe_consumer_state_next;
+          epi_store_pipe_producer_state = epi_store_pipe_producer_state_next;
+          do_store_tail = true;
+        }
+
+        // Get next work tile
+        auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work(work_tile_info);
+        work_tile_info = next_work_tile_info;
+
+        // Skip a tile for pingpong
+        if (work_tile_info.is_valid()) {
+          if constexpr (IsGroupedGemmKernel) {
+            problem_shape_MNKL = append<4>(params.problem_shape.get_problem_shape(work_tile_info.L_idx), 1);
+          }
+          work_k_tile_count = TileScheduler::get_work_k_tile_count(work_tile_info, problem_shape_MNKL, blk_shape);
+          mainloop_pipe_consumer_state.advance(work_k_tile_count);
+
+          // Go to next tile
+          auto [next_next_work_tile_info, next_increment_pipe] = scheduler.fetch_next_work(work_tile_info);
+
+          work_tile_info = next_next_work_tile_info;
+          increment_pipe = next_increment_pipe;
+        }
+
+        did_batch_change = curr_batch != work_tile_info.L_idx;
+        if (work_tile_info.is_valid() && did_batch_change) {
+          if constexpr (IsGroupedGemmKernel) {
+            problem_shape_MNKL = append<4>(params.problem_shape.get_problem_shape(work_tile_info.L_idx), 1);
+          }
+          if (warp_idx_in_warp_group == 0) {
+            collective_epilogue.tensormaps_perform_update<IsEpiLoad>(
+              shared_storage.tensormaps.epilogue,
+              params.epilogue,
+              epi_store_tensormap,
+              problem_shape_MNKL,
+              work_tile_info.L_idx,
+              consumer_warp_group_idx
+            );
+
+            // Converge before issuing tensormap fence release since fence is aligned
+            __syncwarp();
+            collective_epilogue.tensormaps_cp_fence_release<IsEpiLoad>(shared_storage.tensormaps.epilogue,
+                                                                       epi_store_tensormap,
+                                                                       consumer_warp_group_idx);
+          }
+        }
+
+        // TMA store pipeline wait is only visible to TMA-issuing warp, so for multiple-consumer kernels
+        // we need to wait for all TMA stores to complete before issuing consumer order barrier arrives
+        // to ensure next math consumer doesn't overwrite smem of in-flight TMA stores of current consumer.
+        auto [epi_load_pipe_consumer_state_next_, epi_store_pipe_producer_state_next_] =
+        collective_epilogue.store_tail(
+          epi_load_pipeline,
+          epi_load_pipe_consumer_state,
+          epi_store_pipeline,
+          epi_store_pipe_producer_state
+        );
+
+        // Update starting load/store pipeline states for the next tile
+        // state has already been incremented by 1 tile in collective calls, advance once again for ping pong
+        epi_load_pipe_consumer_state = epi_load_pipe_consumer_state_next_;
+        epi_store_pipe_producer_state = epi_store_pipe_producer_state_next_;
+        epi_load_pipe_consumer_state.advance(c_tile_count);
+        epi_store_pipe_producer_state.advance(d_tile_count);
+
+        // Cue for next Math WG's Epilogue to start
+        math_wg_order_barrier.arrive();
+
+      } // Scheduler work fetch loop
+    } // Consumer Warp Groups End
+#endif
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::gemm::kernel
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma.hpp
new file mode 100755
index 000000000..c7245457e
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma.hpp
@@ -0,0 +1,306 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/kernel_hardware_info.hpp"
+#include "cute/arch/cluster_sm90.hpp"
+#include "cutlass/arch/mma_sm90.h"
+#include "cutlass/epilogue/collective/detail.hpp"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/kernel/gemm_universal_decl.h"
+#include "cutlass/gemm/kernel/sm90_tile_scheduler.hpp"
+#include "cutlass/gemm/kernel/tile_scheduler.hpp"
+#include "cutlass/trace.h"
+#include "cute/tensor.hpp"
+
+///////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::kernel {
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <
+  class ProblemShape_,
+  class CollectiveMainloop_,
+  class CollectiveEpilogue_,
+  class TileScheduler_
+>
+class GemmUniversal<
+  ProblemShape_,
+  CollectiveMainloop_,
+  CollectiveEpilogue_,
+  TileScheduler_,
+  cute::enable_if_t<cute::is_base_of_v<KernelTma, typename CollectiveMainloop_::DispatchPolicy::Schedule>>>
+{
+public:
+  //
+  // Type Aliases
+  //
+  using ProblemShape = ProblemShape_;
+  static_assert(cute::rank(ProblemShape{}) == 3 or cute::rank(ProblemShape{}) == 4,
+    "ProblemShape{} should be <M,N,K> or <M,N,K,L>");
+  static constexpr bool IsGdcEnabled = false;
+
+  // Mainloop derived types
+  using CollectiveMainloop = CollectiveMainloop_;
+  using TileShape = typename CollectiveMainloop::TileShape;
+  using TiledMma  = typename CollectiveMainloop::TiledMma;
+  using ArchTag   = typename CollectiveMainloop::ArchTag;
+  using ElementA  = typename CollectiveMainloop::ElementA;
+  using StrideA   = typename CollectiveMainloop::StrideA;
+  using ElementB  = typename CollectiveMainloop::ElementB;
+  using StrideB   = typename CollectiveMainloop::StrideB;
+  using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy;
+  using ElementAccumulator = typename CollectiveMainloop::ElementAccumulator;
+  using ClusterShape = typename DispatchPolicy::ClusterShape;
+  using MainloopArguments = typename CollectiveMainloop::Arguments;
+  using MainloopParams = typename CollectiveMainloop::Params;
+  static_assert(ArchTag::kMinComputeCapability >= 90);
+
+  // Epilogue derived types
+  using CollectiveEpilogue = CollectiveEpilogue_;
+  using ElementC = typename CollectiveEpilogue::ElementC;
+  using StrideC  = typename CollectiveEpilogue::StrideC;
+  using ElementD = typename CollectiveEpilogue::ElementD;
+  using StrideD  = typename CollectiveEpilogue::StrideD;
+  using EpilogueArguments = typename CollectiveEpilogue::Arguments;
+  using EpilogueParams = typename CollectiveEpilogue::Params;
+  static_assert(cute::is_same_v<ElementAccumulator, typename CollectiveEpilogue::ElementAccumulator>,
+    "Mainloop and epilogue do not agree on accumulator value type.");
+
+  static_assert(cute::is_void_v<TileScheduler_> or cute::is_same_v<TileScheduler_, PersistentScheduler>,
+    "TMA kernel does not support specializing the tile scheduler.");
+  using TileSchedulerTag = TileScheduler_;
+  using TileScheduler = typename detail::TileSchedulerSelector<
+    TileScheduler_, ArchTag, TileShape, ClusterShape>::Scheduler;
+  using TileSchedulerArguments = typename TileScheduler::Arguments;
+
+  static constexpr int SharedStorageSize = static_cast<int>(cute::max(  // operator() hands one smem_buf to both collectives: size it for the larger
+      sizeof(typename CollectiveMainloop::SharedStorage),
+      sizeof(typename CollectiveEpilogue::SharedStorage)));
+
+  static constexpr uint32_t MaxThreadsPerBlock = CollectiveMainloop::ThreadCount;  // block size reported by get_block_shape()
+
+  static constexpr uint32_t MinBlocksPerMultiprocessor = 1;  // presumably a launch-bounds hint for the launcher -- TODO confirm
+
+  // User-facing arguments; to_underlying_arguments() converts them into Params
+  struct Arguments {
+    GemmUniversalMode mode{};            // can_implement() accepts kGemm, or kBatched with a rank-4 problem shape
+    ProblemShape problem_shape{};        // <M,N,K> or <M,N,K,L>
+    MainloopArguments mainloop{};        // forwarded to CollectiveMainloop
+    EpilogueArguments epilogue{};        // forwarded to CollectiveEpilogue
+    KernelHardwareInfo hw_info{};        // not read by any member of this kernel class
+    TileSchedulerArguments scheduler{};  // only checked, via TileScheduler::can_implement()
+  };
+
+  // Kernel entry point API: the bundle read by operator()
+  struct Params {
+    GemmUniversalMode mode{};            // copied from Arguments::mode
+    ProblemShape problem_shape{};        // M and N are exchanged when the mainloop declares SwapAB
+    MainloopParams mainloop{};           // result of CollectiveMainloop::to_underlying_arguments
+    EpilogueParams epilogue{};           // result of CollectiveEpilogue::to_underlying_arguments
+  };
+
+  //
+  // Methods
+  //
+
+  // Translate user Arguments into kernel Params. The stored problem shape has M and N exchanged when
+  // the mainloop declares SwapAB; both collectives still receive the caller's unswapped problem shape.
+  static Params
+  to_underlying_arguments(Arguments const& args, void* workspace) {
+    (void) workspace;
+    auto kernel_problem_shape = args.problem_shape;
+    if constexpr (detail::Has_SwapAB_v<CollectiveMainloop>) {
+      get<0>(kernel_problem_shape) = get<1>(args.problem_shape);
+      get<1>(kernel_problem_shape) = get<0>(args.problem_shape);
+    }
+    Params kernel_params{
+      args.mode,
+      kernel_problem_shape,
+      CollectiveMainloop::to_underlying_arguments(args.problem_shape, args.mainloop, workspace),
+      CollectiveEpilogue::to_underlying_arguments(args.problem_shape, args.epilogue, workspace)
+    };
+    return kernel_params;
+  }
+
+  // True when the mode is supported and the mainloop, epilogue and tile scheduler all accept `args`.
+  static bool
+  can_implement(Arguments const& args) {
+    bool const is_gemm = (args.mode == GemmUniversalMode::kGemm);
+    bool const is_batched = (args.mode == GemmUniversalMode::kBatched) && (cute::rank(ProblemShape{}) == 4);
+    if (!(is_gemm || is_batched)) {
+      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Arguments or Problem Shape don't meet the requirements.\n");
+      return false;
+    }
+    bool const mainloop_ok = CollectiveMainloop::can_implement(args.problem_shape, args.mainloop);
+    bool const epilogue_ok = CollectiveEpilogue::can_implement(args.problem_shape, args.epilogue);
+    bool const scheduler_ok = TileScheduler::can_implement(args.scheduler);
+    return mainloop_ok && epilogue_ok && scheduler_ok;  // all three were already evaluated above
+  }
+
+  // No workspace is ever requested: the reported size is 0 for any arguments.
+  static size_t get_workspace_size(Arguments const& args) {
+    return 0;
+  }
+
+  // Nothing to set up: every parameter is ignored and Status::kSuccess is returned.
+  static cutlass::Status initialize_workspace(
+      Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr, CudaHostAdapter* cuda_adapter = nullptr) {
+    return Status::kSuccess;
+  }
+
+  // Launch grid: the problem shape is extended with 1s up to rank 4, then handed to the tile
+  // scheduler together with default-constructed CTA tile and cluster shapes.
+  static dim3
+  get_grid_shape(Params const& params) {
+    auto shape_mnkl = append<4>(params.problem_shape, Int<1>{});
+    ClusterShape cluster{};
+    TileShape cta_tile{};
+    return TileScheduler::get_tiled_cta_shape_mnl(shape_mnkl, cta_tile, cluster);
+  }
+
+  // Thread-block shape for the launch: MaxThreadsPerBlock threads along x, 1 along y and z.
+  static dim3 get_block_shape() {
+    return dim3(MaxThreadsPerBlock, 1, 1);
+  }
+
+  CUTLASS_DEVICE
+  void
+  operator()(Params const& params, char* smem_buf) {  // mainloop MMA, then epilogue, for the tile selected by blockIdx; both get smem_buf
+    using namespace cute;
+    using X = Underscore;
+
+// Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a; for other targets only the printf below is compiled (no trap is issued).
+#if ! defined(__CUDA_ARCH_FEAT_SM90_ALL)
+    printf("ERROR : Arch conditional MMA instruction used without targeting sm90a compute capability. Aborting.\n");
+#else
+
+    // Preconditions: every stride type must be rank-3, i.e. carry a batch (L) mode
+    static_assert(cute::rank(StrideA{}) == 3, "StrideA must be rank-3: [M, K, L]. If batch mode is not needed, set L stride to Int<0>.");
+    static_assert(cute::rank(StrideB{}) == 3, "StrideB must be rank-3: [N, K, L]. If batch mode is not needed, set L stride to Int<0>.");
+    static_assert(cute::rank(StrideC{}) == 3, "StrideC must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
+    static_assert(cute::rank(StrideD{}) == 3, "StrideD must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
+
+    int thread_idx = int(threadIdx.x);
+    int warp_idx   = canonical_warp_idx_sync();
+    int lane_predicate = cute::elect_one_sync();
+    uint32_t block_rank_in_cluster = cute::block_rank_in_cluster();
+
+    // Issue Tma Descriptor Prefetch from a single thread
+    if ((warp_idx == 0) && lane_predicate) {
+      CollectiveMainloop::prefetch_tma_descriptors(params.mainloop);
+    }
+
+    // Separate out problem shape for convenience
+    // Optionally append 1s until problem shape is rank-4 in case it is only rank-3 (MNK)
+    auto problem_shape_MNKL = append<4>(params.problem_shape, Int<1>{});
+    auto M = get<0>(problem_shape_MNKL);
+    auto N = get<1>(problem_shape_MNKL);
+    auto K = get<2>(problem_shape_MNKL);
+    auto L = get<3>(problem_shape_MNKL);
+
+    // TMA requires special handling of strides to deal with coord codomain mapping
+    // Represent the full tensors -- get these from TMA
+    Tensor mA_mkl = params.mainloop.tma_load_a.get_tma_tensor(make_shape(M,K,L));                            // (m,k,l)
+    Tensor mB_nkl = params.mainloop.tma_load_b.get_tma_tensor(make_shape(N,K,L));                            // (n,k,l)
+
+    // Get the appropriate blocks for this thread block -- potential for thread block locality
+    auto blk_shape = TileShape{};                                                                // (BLK_M,BLK_N,BLK_K)
+    auto blk_coord = make_coord(_,_,_);                                                   // (m,n,k) -- defer the slice
+
+    // Make tiled views
+    Tensor gA_mkl = local_tile(mA_mkl, blk_shape, blk_coord, Step<_1, X,_1>{});                  // (BLK_M,BLK_K,m,k,l)
+    Tensor gB_nkl = local_tile(mB_nkl, blk_shape, blk_coord, Step< X,_1,_1>{});                  // (BLK_N,BLK_K,n,k,l)
+
+    // Compute m_coord, n_coord, and l_coord with their post-tiled shapes
+    auto m_coord = idx2crd(int(blockIdx.x), shape<2>(gA_mkl));
+    auto n_coord = idx2crd(int(blockIdx.y), shape<2>(gB_nkl));
+    auto l_coord = idx2crd(int(blockIdx.z), shape<4>(gB_nkl));
+    auto output_tile_coord = make_coord(m_coord, n_coord, _, l_coord);
+
+    // Slice with m_coord, n_coord and l_coord; the k-tile mode is kept for the mainloop to iterate
+    Tensor gA = gA_mkl(_,_,m_coord,_,l_coord);                                                       // (BLK_M,BLK_K,k)
+    Tensor gB = gB_nkl(_,_,n_coord,_,l_coord);                                                       // (BLK_N,BLK_K,k)
+
+    // Allocate the tiled_mma and the accumulators for the (M,N) blk_shape
+    TiledMma tiled_mma;
+    Tensor accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape));                   // (MMA,MMA_M,MMA_N)
+
+    auto k_tile_iter  = cute::make_coord_iterator(shape<2>(gA));
+    auto k_tile_count = size<2>(gA);
+
+    // Perform the collective scoped MMA
+    CollectiveMainloop collective_mma;
+    collective_mma(
+      gA, params.mainloop.tma_load_a,
+      gB, params.mainloop.tma_load_b,
+      accumulators,
+      k_tile_iter, k_tile_count,
+      thread_idx,
+      block_rank_in_cluster,
+      smem_buf,
+      params.mainloop
+    );
+
+    constexpr int BLK_M_RANK = cute::rank<0>(blk_shape);
+    auto m_max_coord = unwrap(cute::transform(make_seq<BLK_M_RANK>{}, [&](auto i) {
+        return  get<i>(M) - get<0,i>(blk_shape) * get<i>(m_coord);
+      }));
+
+    constexpr int BLK_N_RANK = cute::rank<1>(blk_shape);
+    auto n_max_coord = unwrap(cute::transform(make_seq<BLK_N_RANK>{}, [&](auto i) {
+        return  get<i>(N) - get<1,i>(blk_shape) * get<i>(n_coord);
+      }));
+    auto residue_mnk = make_tuple(m_max_coord, n_max_coord, Int<0>{});  // M/N extent left past this tile's origin; K entry fixed at 0
+
+    // Epilogue and write to gD
+    CollectiveEpilogue epilogue{params.epilogue};
+    epilogue(
+      problem_shape_MNKL,
+      blk_shape,
+      output_tile_coord,
+      accumulators,
+      tiled_mma,
+      residue_mnk,
+      thread_idx,
+      smem_buf
+    );
+#endif
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::gemm::kernel
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp
new file mode 100755
index 000000000..b278f96e9
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp
@@ -0,0 +1,522 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/kernel_hardware_info.hpp"
+#include "cutlass/arch/reg_reconfig.h"
+#include "cutlass/arch/mma_sm90.h"
+#include "cutlass/epilogue/collective/detail.hpp"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/kernel/sm90_tile_scheduler.hpp"
+#include "cutlass/pipeline/pipeline.hpp"
+#include "cutlass/trace.h"
+
+#include "cutlass/conv/detail.hpp"
+
+#include "cute/tensor.hpp"
+#include "cute/arch/cluster_sm90.hpp"
+
+#include "cutlass/arch/grid_dependency_control.h"
+
+
+///////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::kernel {
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <
+  class ProblemShape_,
+  class CollectiveMainloop_,
+  class CollectiveEpilogue_,
+  class TileScheduler_
+>
+class GemmUniversal<
+  ProblemShape_,
+  CollectiveMainloop_,
+  CollectiveEpilogue_,
+  TileScheduler_,
+  cute::enable_if_t<cute::is_base_of_v<cutlass::gemm::KernelTmaWarpSpecialized, typename CollectiveMainloop_::DispatchPolicy::Schedule>>
+>
+{
+public:
+  //
+  // Type Aliases
+  //
+  using ProblemShape = ProblemShape_;
+
+  // Handles the static_assert placed inside the operator()
+  // This is also used to decide whether the load_init inside collective mainloop returns rank 4 tensors or rank 5 tensors
+  static constexpr bool IsConvProblemShape = not (cute::is_tuple_v<ProblemShape>|| IsCutlass3ArrayKernel<ProblemShape>::value);
+  static_assert( IsConvProblemShape || (cute::rank(ProblemShape{}) == 3 || cute::rank(ProblemShape{}) == 4), "ProblemShape{} should be <M,N,K> or <M,N,K,L> for Gemm");
+
+  static constexpr bool IsGdcEnabled = cutlass::arch::IsGdcGloballyEnabled;
+
+  // Mainloop derived types
+  using CollectiveMainloop = CollectiveMainloop_;
+  using TileShape = typename CollectiveMainloop::TileShape;
+  using TiledMma  = typename CollectiveMainloop::TiledMma;
+  using ArchTag   = typename CollectiveMainloop::ArchTag;
+  using ElementA  = typename CollectiveMainloop::ElementA;
+  using StrideA   = typename CollectiveMainloop::StrideA;
+  using ElementB  = typename CollectiveMainloop::ElementB;
+  using StrideB   = typename CollectiveMainloop::StrideB;
+  using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy;
+  using ElementAccumulator = typename CollectiveMainloop::ElementAccumulator;
+  using ClusterShape = typename DispatchPolicy::ClusterShape;
+  using MainloopArguments = typename CollectiveMainloop::Arguments;
+  using MainloopParams = typename CollectiveMainloop::Params;
+  static_assert(ArchTag::kMinComputeCapability >= 90);
+
+  // Epilogue derived types
+  using CollectiveEpilogue = CollectiveEpilogue_;
+  using ElementC = typename CollectiveEpilogue::ElementC;
+  using StrideC  = typename CollectiveEpilogue::StrideC;
+  using ElementD = typename CollectiveEpilogue::ElementD;
+  using StrideD  = typename CollectiveEpilogue::StrideD;
+  using EpilogueArguments = typename CollectiveEpilogue::Arguments;
+  using EpilogueParams = typename CollectiveEpilogue::Params;
+
+  static_assert(cute::is_void_v<TileScheduler_> or cute::is_same_v<TileScheduler_, PersistentScheduler>,
+    "TMA warp-specialized kernel does not support specializing the tile scheduler.");
+  using TileSchedulerTag = TileScheduler_;
+  using TileScheduler = typename detail::TileSchedulerSelector<
+    TileSchedulerTag, ArchTag, TileShape, ClusterShape>::Scheduler;
+
+  using TileSchedulerArguments = typename TileScheduler::Arguments;
+
+  // Kernel level shared memory storage; operator() reinterprets smem_buf as this struct
+  struct SharedStorage {
+    // Mainloop and epilogue don't use smem concurrently since kernel is non-persistent, so we can use a union
+    union TensorStorage {
+      using MainloopTensorStorage = typename CollectiveMainloop::TensorStorage;
+      using EpilogueTensorStorage = typename CollectiveEpilogue::TensorStorage;
+
+      MainloopTensorStorage mainloop;
+      EpilogueTensorStorage epilogue;
+    } tensors;  // mainloop and epilogue tensor storage overlap in the same bytes
+
+    struct PipelineStorage : cute::aligned_struct<16, _1> {
+      using MainloopPipelineStorage = typename CollectiveMainloop::PipelineStorage;
+      using EpiLoadPipelineStorage = typename CollectiveEpilogue::PipelineStorage;
+
+      alignas(16) MainloopPipelineStorage mainloop;
+      alignas(16) EpiLoadPipelineStorage epi_load;
+    } pipelines;  // a struct, not a union: mainloop and epilogue-load pipeline storage coexist
+  };
+
+  static constexpr int SharedStorageSize = sizeof(SharedStorage);
+  static constexpr uint32_t NumLoadWarpGroups = 1;
+  static constexpr uint32_t NumMmaWarpGroups = 1;  // NOTE(review): not used in the MaxThreadsPerBlock formula below
+  static constexpr uint32_t MaxThreadsPerBlock = CUTE_STATIC_V(size(TiledMma{})) + (NumLoadWarpGroups * NumThreadsPerWarpGroup);  // TiledMma threads + load warp group(s)
+  static constexpr uint32_t MinBlocksPerMultiprocessor = 1;  // presumably a launch-bounds hint for the launcher -- TODO confirm
+
+  // User-facing arguments; to_underlying_arguments() converts them into Params
+  struct Arguments {
+    cutlass::gemm::GemmUniversalMode mode{}; // maintained here for backward compatibility; Params has no mode member
+    ProblemShape problem_shape{};
+    MainloopArguments mainloop{};        // forwarded to CollectiveMainloop
+    EpilogueArguments epilogue{};        // forwarded to CollectiveEpilogue
+    KernelHardwareInfo hw_info{};
+    TileSchedulerArguments scheduler{};  // checked by TileScheduler::can_implement()
+
+    // Default constructor: every member keeps its {} initializer
+    Arguments() = default;
+
+    // Constructor with an explicitly specified mode.
+    // It is used for Gemm
+    Arguments(
+        cutlass::gemm::GemmUniversalMode mode_,
+        ProblemShape problem_shape_,
+        MainloopArguments mainloop_,
+        EpilogueArguments epilogue_,
+        KernelHardwareInfo hw_info_ = KernelHardwareInfo(),
+        TileSchedulerArguments scheduler_ = TileSchedulerArguments())
+    : mode(mode_)
+      , problem_shape(problem_shape_)
+      , mainloop(mainloop_)
+      , epilogue(epilogue_)
+      , hw_info(hw_info_)
+      , scheduler(scheduler_) {}
+
+    // Constructor that takes no mode and defaults 'mode' to kGemm.
+    // This allows us to set GemmUniversal mode as kGemm for Conv right away
+    // while keeping the testbeds unchanged.
+    Arguments(
+        ProblemShape problem_shape_,
+        MainloopArguments mainloop_,
+        EpilogueArguments epilogue_,
+        KernelHardwareInfo hw_info_ = KernelHardwareInfo(),
+        TileSchedulerArguments scheduler_ = TileSchedulerArguments())
+    : mode(cutlass::gemm::GemmUniversalMode::kGemm) // Default mode
+      , problem_shape(problem_shape_)
+      , mainloop(mainloop_)
+      , epilogue(epilogue_)
+      , hw_info(hw_info_)
+      , scheduler(scheduler_) {}
+
+  };
+
+  // Kernel entry point API: the bundle read by operator(); Arguments::mode is not carried over
+  struct Params {
+    using ProblemShapeMNKL = decltype(cutlass::conv::detail::get_problem_shape_MNKL_helper<CollectiveMainloop>(ProblemShape{}, cute::conditional_t<IsConvProblemShape, cute::true_type, cute::false_type>{}));
+    ProblemShapeMNKL problem_shape{};  // helper result, with M and N exchanged when the mainloop declares SwapAB
+    MainloopParams mainloop{};         // result of CollectiveMainloop::to_underlying_arguments
+    EpilogueParams epilogue{};         // result of CollectiveEpilogue::to_underlying_arguments
+  };
+
+  //
+  // Methods
+  //
+
+  // Build kernel Params from user Arguments. The stored shape is the MNKL helper's result (M and N exchanged under
+  // SwapAB); the mainloop gets the caller's shape, the epilogue gets get_transformed_problem_shape_MNKL's result.
+  static Params
+  to_underlying_arguments(Arguments const& args, void* workspace) {
+    (void) workspace;
+    using IsConvTag = cute::conditional_t<IsConvProblemShape, cute::true_type, cute::false_type>;
+    auto shape_mnkl = cutlass::conv::detail::get_problem_shape_MNKL_helper<CollectiveMainloop>(args.problem_shape, IsConvTag{});
+    auto epilogue_shape = cutlass::conv::detail::get_transformed_problem_shape_MNKL(args.problem_shape);
+
+    auto kernel_shape = shape_mnkl;
+    if constexpr (detail::Has_SwapAB_v<CollectiveMainloop>) {
+      get<0>(kernel_shape) = get<1>(shape_mnkl);
+      get<1>(kernel_shape) = get<0>(shape_mnkl);
+    }
+    return {
+      kernel_shape,
+      CollectiveMainloop::to_underlying_arguments(args.problem_shape, args.mainloop, workspace),
+      CollectiveEpilogue::to_underlying_arguments(epilogue_shape, args.epilogue, workspace)
+    };
+  }
+
+  // Reports whether the mainloop collective, the epilogue collective and the tile scheduler all
+  // accept `args`. The mainloop is queried with the caller's problem shape, the epilogue with the
+  // shape produced by get_transformed_problem_shape_MNKL.
+  // NOTE(review): `args.mode` is not inspected by this check.
+  static bool
+  can_implement(Arguments const& args) {
+    auto transformed_problem_shape = cutlass::conv::detail::get_transformed_problem_shape_MNKL(args.problem_shape);
+
+    // `&=` does not short-circuit: every component is queried even after an
+    // earlier one has already refused.
+    bool implementable = CollectiveMainloop::can_implement(args.problem_shape, args.mainloop);
+    implementable &= CollectiveEpilogue::can_implement(transformed_problem_shape, args.epilogue);
+    implementable &= TileScheduler::can_implement(args.scheduler);
+
+    return implementable;
+  }
+
+  // No workspace is ever requested: the reported size is 0 for any arguments.
+  static size_t get_workspace_size(Arguments const& args) {
+    return 0;
+  }
+
+  // Nothing to set up: every parameter is ignored and Status::kSuccess is returned.
+  static cutlass::Status initialize_workspace(
+      Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr, CudaHostAdapter* cuda_adapter = nullptr) {
+    return Status::kSuccess;
+  }
+
+  // Launch grid: append<4> is applied to the stored problem shape with Int<1> as filler, and the
+  // result goes to the tile scheduler with default-constructed CTA tile and cluster shapes.
+  static dim3
+  get_grid_shape(Params const& params) {
+    auto shape_mnkl = append<4>(params.problem_shape, Int<1>{});
+    ClusterShape cluster{};
+    TileShape cta_tile{};
+    return TileScheduler::get_tiled_cta_shape_mnl(shape_mnkl, cta_tile, cluster);
+  }
+
+  // Thread-block shape for the launch: MaxThreadsPerBlock threads along x, 1 along y and z.
+  static dim3 get_block_shape() {
+    return dim3(MaxThreadsPerBlock, 1, 1);
+  }
+
+  CUTLASS_DEVICE
+  void
+  operator()(Params const& params, char* smem_buf) {
+    using namespace cute;
+    using X = Underscore;
+
+#if defined(__CUDA_ARCH_FEAT_SM90_ALL)
+#  define ENABLE_SM90_KERNEL_LEVEL 1
+#endif
+// Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a.
+#if ! defined(ENABLE_SM90_KERNEL_LEVEL)
+    printf("ERROR : Arch conditional MMA instruction used without targeting sm90a compute capability. Aborting.\n");
+#else
+
+    enum class WarpGroupRole {
+      Producer = 0,
+      Consumer = 1,
+    };
+    enum class ProducerWarpRole {
+      MainloopEpilogue = 0,
+      Warp1 = 1,
+      Warp2 = 2,
+      Warp3 = 3
+    };
+
+    // Kernel level shared memory storage
+    SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(smem_buf);
+
+    int thread_idx = int(threadIdx.x);
+    int lane_idx = canonical_lane_idx();
+    int warp_idx = canonical_warp_idx_sync();
+    int warp_idx_in_warp_group = warp_idx % NumWarpsPerWarpGroup;
+    int warp_group_thread_idx = thread_idx % NumThreadsPerWarpGroup;
+    auto warp_group_role = WarpGroupRole(canonical_warp_group_idx());
+    auto producer_warp_role = ProducerWarpRole(warp_idx_in_warp_group);
+    int lane_predicate = cute::elect_one_sync();
+    uint32_t block_rank_in_cluster = cute::block_rank_in_cluster();
+
+
+    // Issue Tma Descriptor Prefetch from a single thread
+    if ((warp_idx == 0) && lane_predicate) {
+      CollectiveMainloop::prefetch_tma_descriptors(params.mainloop);
+      CollectiveEpilogue::prefetch_tma_descriptors(params.epilogue);
+    }
+
+    // Mainloop Load pipeline
+    using MainloopPipeline = typename CollectiveMainloop::MainloopPipeline;
+    typename MainloopPipeline::Params mainloop_pipeline_params;
+    if (warp_group_role == WarpGroupRole::Producer && producer_warp_role == ProducerWarpRole::MainloopEpilogue) {
+      mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Producer;
+    }
+    if (warp_group_role == WarpGroupRole::Consumer) {
+      mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Consumer;
+    }
+    mainloop_pipeline_params.is_leader = warp_group_thread_idx == 0;
+    mainloop_pipeline_params.num_consumers = NumThreadsPerWarpGroup;
+    mainloop_pipeline_params.transaction_bytes = params.mainloop.tma_transaction_bytes;
+    MainloopPipeline mainloop_pipeline(shared_storage.pipelines.mainloop, mainloop_pipeline_params, ClusterShape{});
+
+    // Epilogue Load pipeline
+    using EpiLoadPipeline = typename CollectiveEpilogue::LoadPipeline;
+    typename EpiLoadPipeline::Params epi_load_pipeline_params;
+    if (warp_group_role == WarpGroupRole::Producer && producer_warp_role == ProducerWarpRole::MainloopEpilogue) {
+      epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Producer;
+    }
+    if (warp_group_role == WarpGroupRole::Consumer) {
+      epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Consumer;
+    }
+    epi_load_pipeline_params.dst_blockid = cute::block_rank_in_cluster();
+    epi_load_pipeline_params.producer_arv_count = NumThreadsPerWarp;
+    epi_load_pipeline_params.consumer_arv_count = NumThreadsPerWarpGroup;
+    if constexpr (CollectiveEpilogue::RequiresTransactionBytes) {
+      epi_load_pipeline_params.transaction_bytes = params.epilogue.tma_transaction_bytes;
+    }
+    EpiLoadPipeline epi_load_pipeline(shared_storage.pipelines.epi_load, epi_load_pipeline_params);
+
+    // Epilogue Store pipeline
+    using EpiStorePipeline = typename CollectiveEpilogue::StorePipeline;
+    typename EpiStorePipeline::Params epi_store_pipeline_params;
+    epi_store_pipeline_params.always_wait = true;
+    EpiStorePipeline epi_store_pipeline(epi_store_pipeline_params);
+
+    // Initialize starting pipeline states for the collectives
+    // Epilogue store pipe is producer-only (consumer is TMA unit, waits via scoreboarding)
+    typename CollectiveMainloop::PipelineState mainloop_pipe_consumer_state;
+    typename CollectiveEpilogue::LoadPipelineState epi_load_pipe_consumer_state;
+
+    // For the DMA Load (producer) we start with an opposite phase
+    // i.e., we skip all waits since we know that the buffer is indeed empty
+    PipelineState mainloop_pipe_producer_state = cutlass::make_producer_start_state<MainloopPipeline>();
+    PipelineState epi_load_pipe_producer_state = cutlass::make_producer_start_state<EpiLoadPipeline>();
+    PipelineState epi_store_pipe_producer_state = cutlass::make_producer_start_state<EpiStorePipeline>();
+
+    auto cluster_wait_fn = [&] () {
+      // We need this to guarantee that the Pipeline init is visible
+      // To all producers and consumer thread blocks in the Cluster
+      if constexpr (size(ClusterShape{}) > 1) {
+        cute::cluster_arrive_relaxed();
+        return [] () { cute::cluster_wait(); };
+      }
+      else {
+        __syncthreads();
+        return [] () {}; // do nothing
+      }
+    } ();
+  
+    // Preconditions only valid for Gemm
+    static_assert(IsConvProblemShape || cute::rank(StrideA{}) == 3, "StrideA must be rank-3: [M, K, L]. If batch mode is not needed, set L stride to Int<0>.");
+    static_assert(IsConvProblemShape || cute::rank(StrideB{}) == 3, "StrideB must be rank-3: [N, K, L]. If batch mode is not needed, set L stride to Int<0>.");
+    static_assert(IsConvProblemShape || cute::rank(StrideC{}) == 3, "StrideC must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
+    static_assert(IsConvProblemShape || cute::rank(StrideD{}) == 3, "StrideD must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
+
+    // Get the appropriate blocks for this thread block -- potential for thread block locality
+    auto blk_shape = TileShape{}; // (BLK_M,BLK_N,BLK_K)
+    TiledMma tiled_mma;
+
+    // Optionally append 1s until problem shape is rank-4 in case it is only rank-3 (MNK)
+    // Using constexpr if (C++17 and later)
+    auto problem_shape_MNKL = append<4>(params.problem_shape, cute::Int<1>{});
+    
+    // In a warp specialized kernel, collectives expose data movement and compute operations separately
+    CollectiveMainloop collective_mainloop;
+    CollectiveEpilogue collective_epilogue(params.epilogue, shared_storage.tensors.epilogue);
+
+    // Prepare and partition the input tensors. 
+    // Expects a tuple of tensors for conv where:
+    // get<0>(load_inputs) is the tma tensor A after local tiling so that it has shape (BLK_M,BLK_K,m,k)
+    // get<1>(load_inputs) is the tma tensor B after local tiling so that it has shape (BLK_N,BLK_K,n,k)
+    auto load_inputs = collective_mainloop.load_init(problem_shape_MNKL, params.mainloop);
+    static_assert(cute::tuple_size_v<decltype(load_inputs)> >= 2, "Output of load_init must have at least two elements (A, B)");
+    
+    // Extract out partitioned A and B.
+    Tensor gA_mkl = get<0>(load_inputs);
+    Tensor gB_nkl = get<1>(load_inputs);
+
+    // Compute m_coord, n_coord, and l_coord with their post-tiled shapes
+    auto m_coord = idx2crd(int(blockIdx.x), shape<2>(gA_mkl));
+
+    auto n_coord = idx2crd(int(blockIdx.y), shape<2>(gB_nkl), compact_col_major(shape<2>(gB_nkl)));
+
+    // handles the difference between the rank of Tensor returned by load_input in case they do not have a batch mode
+    auto l_coord = [&] (auto const& gB_nkl_) {
+      // gB_nkl needs to be passed into the lambda because C++17
+      // does not permit lambda capture of structured bindings.
+      if constexpr (not IsConvProblemShape) {
+        // This needs to be inside an `if constexpr`,
+        // because shape<4>(gB_nkl) is not well-formed otherwise.
+        return idx2crd(int(blockIdx.z), shape<4>(gB_nkl_));
+      }
+      else {
+        return Int<0>{};
+      }
+    } (gB_nkl);
+
+    auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
+
+    // Get pipeline iterators and increments from tensor shapes
+    auto k_tile_iter  = cute::make_coord_iterator(shape<3>(gA_mkl));
+    auto k_tile_count = size<3>(gA_mkl);
+
+    // Wait for all thread blocks in the Cluster
+    cluster_wait_fn();
+
+    if (warp_group_role == WarpGroupRole::Producer) {
+      if (producer_warp_role == ProducerWarpRole::MainloopEpilogue) {
+        // Ensure that the prefetched kernel does not touch
+        // unflushed global memory prior to this instruction
+        cutlass::arch::wait_on_dependent_grids();
+        collective_mainloop.load(
+          params.mainloop,
+          mainloop_pipeline,
+          mainloop_pipe_producer_state,
+          load_inputs,
+          blk_coord,
+          k_tile_iter, k_tile_count,
+          lane_idx,
+          block_rank_in_cluster,
+          shared_storage.tensors.mainloop
+        );
+        // Update starting mainloop pipeline state for the pipeline drain
+        mainloop_pipe_producer_state.advance(k_tile_count);
+        // Make sure mainloop consumer has been waited upon before issuing epilogue load
+        collective_mainloop.load_tail(mainloop_pipeline, mainloop_pipe_producer_state);
+
+        if (collective_epilogue.is_producer_load_needed()) {
+          // Ensure warp is converged before issuing epilogue loads
+          __syncwarp();
+          epi_load_pipe_producer_state = collective_epilogue.load(
+            epi_load_pipeline,
+            epi_load_pipe_producer_state,
+            problem_shape_MNKL,
+            blk_shape,
+            blk_coord,
+            tiled_mma,
+            lane_idx,
+            shared_storage.tensors.epilogue
+          );
+          collective_epilogue.load_tail(epi_load_pipeline, epi_load_pipe_producer_state);
+        }
+      } 
+    }
+    else if (warp_group_role == WarpGroupRole::Consumer) {
+      Tensor accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape));                 // (MMA,MMA_M,MMA_N)
+
+      collective_mainloop.mma(
+        mainloop_pipeline,
+        mainloop_pipe_consumer_state,
+        accumulators,
+        k_tile_count,
+        warp_group_thread_idx,
+        shared_storage.tensors.mainloop,
+        params.mainloop
+      );
+
+      // Make sure the math instructions are done and free buffers before entering the epilogue
+      collective_mainloop.mma_tail(
+        mainloop_pipeline,
+        mainloop_pipe_consumer_state,
+        k_tile_count
+      );
+
+      // Hint on an early release of global memory resources.
+      // The timing of calling this function only influences performance,
+      // not functional correctness.
+      cutlass::arch::launch_dependent_grids();
+
+      // Epilogue and write to gD
+      auto [epi_load_pipe_consumer_state_next, epi_store_pipe_producer_state_next] =
+      collective_epilogue.store(
+        epi_load_pipeline,
+        epi_load_pipe_consumer_state,
+        epi_store_pipeline,
+        epi_store_pipe_producer_state,
+        problem_shape_MNKL,
+        blk_shape,
+        blk_coord,
+        accumulators,
+        tiled_mma,
+        warp_group_thread_idx,
+        shared_storage.tensors.epilogue
+      );
+
+      collective_epilogue.store_tail(
+        epi_load_pipeline,
+        epi_load_pipe_consumer_state_next,
+        epi_store_pipeline,
+        epi_store_pipe_producer_state_next
+      );
+    }
+#endif
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::gemm::kernel
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp
new file mode 100755
index 000000000..243a9e708
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp
@@ -0,0 +1,671 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/workspace.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/kernel_hardware_info.hpp"
+#include "cute/arch/cluster_sm90.hpp"
+#include "cutlass/arch/reg_reconfig.h"
+#include "cutlass/arch/mma_sm90.h"
+#include "cutlass/epilogue/collective/detail.hpp"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/kernel/tile_scheduler.hpp"
+#include "cutlass/pipeline/pipeline.hpp"
+#include "cute/tensor.hpp"
+#include "cutlass/trace.h"
+#include "cutlass/gemm/kernel/gemm_universal_decl.h"
+#include "cutlass/arch/grid_dependency_control.h"
+
+///////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::kernel {
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <
+  class ProblemShape_,
+  class CollectiveMainloop_,
+  class CollectiveEpilogue_,
+  class TileSchedulerTag_
+>
+class GemmUniversal<
+  ProblemShape_,
+  CollectiveMainloop_,
+  CollectiveEpilogue_,
+  TileSchedulerTag_,
+  cute::enable_if_t<cute::is_base_of_v<KernelTmaWarpSpecializedCooperative, typename CollectiveMainloop_::DispatchPolicy::Schedule>>>
+{
+public:
+  //
+  // Type Aliases
+  //
+  using ProblemShape = ProblemShape_;
+  static_assert(cute::rank(ProblemShape{}) == 3 or cute::rank(ProblemShape{}) == 4,
+    "ProblemShape{} should be <M,N,K> or <M,N,K,L>");
+
+  // Mainloop derived types
+  using CollectiveMainloop = CollectiveMainloop_;
+  using TileShape = typename CollectiveMainloop::TileShape;
+  using TiledMma  = typename CollectiveMainloop::TiledMma;
+  using ArchTag   = typename CollectiveMainloop::ArchTag;
+  using ElementA  = typename CollectiveMainloop::ElementA;
+  using StrideA   = typename CollectiveMainloop::StrideA;
+  using ElementB  = typename CollectiveMainloop::ElementB;
+  using StrideB   = typename CollectiveMainloop::StrideB;
+  using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy;
+  using ElementAccumulator = typename CollectiveMainloop::ElementAccumulator;
+  using ClusterShape = typename DispatchPolicy::ClusterShape;
+  using MainloopArguments = typename CollectiveMainloop::Arguments;
+  using MainloopParams = typename CollectiveMainloop::Params;
+  // Epilogue derived types
+  using CollectiveEpilogue = CollectiveEpilogue_;
+  using ElementC = typename CollectiveEpilogue::ElementC;
+  using StrideC  = typename CollectiveEpilogue::StrideC;
+  using ElementD = typename CollectiveEpilogue::ElementD;
+  using StrideD  = typename CollectiveEpilogue::StrideD;
+  using EpilogueArguments = typename CollectiveEpilogue::Arguments;
+  using EpilogueParams = typename CollectiveEpilogue::Params;
+
+  static_assert(ArchTag::kMinComputeCapability >= 90);
+
+  using TileSchedulerTag = TileSchedulerTag_;
+
+  using TileScheduler = typename detail::TileSchedulerSelector<
+                                          TileSchedulerTag, 
+                                          ArchTag, 
+                                          TileShape,
+                                          ClusterShape
+                                          >::Scheduler;
+
+  using TileSchedulerArguments = typename TileScheduler::Arguments;
+  using TileSchedulerParams = typename TileScheduler::Params;
+  
+  // Warp specialization thread count per threadblock
+  static constexpr uint32_t NumMMAThreads          = size(TiledMma{});       // 8 warps; operator() static_asserts this equals 256
+  static constexpr uint32_t NumMainloopLoadThreads = NumThreadsPerWarp;      // 1 warp
+  static constexpr uint32_t NumEpilogueLoadThreads = NumThreadsPerWarp;      // 1 warp for C; producer_arv_count of the epilogue load pipeline
+
+  static constexpr bool IsGdcEnabled = cutlass::arch::IsGdcGloballyEnabled;  // NOTE(review): presumably "grid dependency control" (see grid_dependency_control.h) - confirm
+
+  static constexpr uint32_t NumLoadWarpGroups = 1;
+  static constexpr uint32_t NumMmaWarpGroups = NumMMAThreads / NumThreadsPerWarpGroup;
+  static constexpr uint32_t MaxThreadsPerBlock = NumMMAThreads + (NumLoadWarpGroups * NumThreadsPerWarpGroup);  // block size returned by get_block_shape()
+  static constexpr uint32_t MinBlocksPerMultiprocessor = 1;
+
+  /// Register requirement for Load and Math WGs; operator() passes these to warpgroup_reg_dealloc (producer) and warpgroup_reg_alloc (consumers)
+  static constexpr uint32_t LoadRegisterRequirement = 40;
+  static constexpr uint32_t MmaRegisterRequirement = 232;
+
+  // 1 stage ordered sequence between mainloop and epilogue producer load threads (Mainloop warp role is group 0, every other role is group 1)
+  using LoadWarpOrderBarrier = cutlass::OrderedSequenceBarrier<1,2>;
+
+  // Kernel level shared memory storage; operator() reinterprets its smem_buf argument as this struct
+  struct SharedStorage {
+    struct PipelineStorage : cute::aligned_struct<16, _1> {
+      using MainloopPipelineStorage = typename CollectiveMainloop::PipelineStorage;
+      using EpiLoadPipelineStorage = typename CollectiveEpilogue::PipelineStorage;
+      // Backing storage handed to the MainloopPipeline, EpiLoadPipeline and LoadWarpOrderBarrier constructors
+      alignas(16) MainloopPipelineStorage mainloop;
+      alignas(16) EpiLoadPipelineStorage epi_load;
+      alignas(16) typename LoadWarpOrderBarrier::SharedStorage load_order;
+    } pipelines;
+
+    struct TensorStorage : cute::aligned_struct<128, _1> {
+      using MainloopTensorStorage = typename CollectiveMainloop::TensorStorage;
+      using EpilogueTensorStorage = typename CollectiveEpilogue::TensorStorage;
+      // Passed by reference to the CollectiveEpilogue constructor and to the collectives' load/mma calls
+      EpilogueTensorStorage epilogue;
+      MainloopTensorStorage mainloop;
+    } tensors;
+  };
+
+  static constexpr int SharedStorageSize = sizeof(SharedStorage);
+
+  // Arguments supplied by the caller; to_underlying_arguments() converts them into Params
+  struct Arguments {
+    GemmUniversalMode mode{};               // can_implement() accepts kGemm, or kBatched with a rank-4 ProblemShape
+    ProblemShape problem_shape{};           // <M,N,K> or <M,N,K,L>
+    MainloopArguments mainloop{};
+    EpilogueArguments epilogue{};
+    KernelHardwareInfo hw_info{};           // sm_count <= 0 makes to_underlying_arguments() query the device
+    TileSchedulerArguments scheduler{};
+  };
+
+  // Kernel entry point API: the struct operator() receives, produced by to_underlying_arguments()
+  struct Params {
+    GemmUniversalMode mode{};
+    ProblemShape problem_shape{};           // M and N are swapped relative to Arguments when the mainloop has SwapAB
+    MainloopParams mainloop{};
+    EpilogueParams epilogue{};
+    KernelHardwareInfo hw_info{};           // carries the resolved SM count
+    TileSchedulerParams scheduler{};
+    void* workspace{nullptr};               // workspace pointer exactly as passed to to_underlying_arguments()
+  };
+
+  //
+  // Methods
+  //
+
+  // Convert Arguments into kernel Params and split `workspace` into scheduler and epilogue regions.
+  static
+  Params
+  to_underlying_arguments(Arguments const& args, void* workspace) {
+    CUTLASS_TRACE_HOST("to_underlying_arguments():");
+
+    auto problem_shape = args.problem_shape;
+    if constexpr (detail::Has_SwapAB_v<CollectiveMainloop>) {
+      // swap M/N
+      get<0>(problem_shape) = get<1>(args.problem_shape);
+      get<1>(problem_shape) = get<0>(args.problem_shape);
+    }
+    auto problem_shape_MNKL = append<4>(problem_shape, 1);
+
+    // Get SM count if needed, otherwise use user supplied SM count
+    int sm_count = args.hw_info.sm_count;
+    if (sm_count <= 0) {
+      CUTLASS_TRACE_HOST("  WARNING: Arguments do not include a valid SM count.\n"
+          "  For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count.");
+      sm_count = KernelHardwareInfo::query_device_multiprocessor_count(args.hw_info.device_id);
+    }
+
+    CUTLASS_TRACE_HOST("to_underlying_arguments(): Setting persistent grid SM count to " << sm_count);
+
+    KernelHardwareInfo hw_info{args.hw_info.device_id, sm_count};
+
+    // Epilogue subtile count for the tile scheduler; 1 means subtiles (and separate reduction) are unused.
+    constexpr uint32_t NumEpilogueSubTiles = CollectiveEpilogue::get_store_pipe_increment(TileShape{});
+
+    // Calculate workspace pointers. The scheduler region is sized with the same arguments as in
+    // get_workspace_size() and initialize_workspace() so all three agree on the epilogue offset.
+    uint8_t* workspace_ptr = reinterpret_cast<uint8_t*>(workspace);
+    size_t workspace_offset = 0;
+
+    void* scheduler_workspace = workspace_ptr;
+    workspace_offset += TileScheduler::template get_workspace_size<ProblemShape, ElementAccumulator>(
+      args.scheduler, args.problem_shape, args.hw_info, NumMmaWarpGroups, NumEpilogueSubTiles);
+    workspace_offset = round_nearest(workspace_offset,  MinWorkspaceAlignment);
+
+    void* epilogue_workspace = workspace_ptr + workspace_offset;
+    workspace_offset += CollectiveEpilogue::get_workspace_size(args.problem_shape, args.epilogue);
+    workspace_offset = round_nearest(workspace_offset,  MinWorkspaceAlignment);
+
+    void* mainloop_workspace = nullptr;  // the mainloop is given no workspace region
+    TileSchedulerParams scheduler = TileScheduler::to_underlying_arguments(
+      problem_shape_MNKL, TileShape{}, ClusterShape{}, hw_info, args.scheduler, scheduler_workspace, NumEpilogueSubTiles);
+
+    return {
+      args.mode,
+      problem_shape,
+      CollectiveMainloop::to_underlying_arguments(args.problem_shape, args.mainloop, mainloop_workspace),
+      CollectiveEpilogue::to_underlying_arguments(args.problem_shape, args.epilogue, epilogue_workspace),
+      hw_info,
+      scheduler,
+      workspace
+    };
+  }
+
+  // True when the mode is supported and the mainloop, epilogue and scheduler all accept the arguments.
+  static bool
+  can_implement(Arguments const& args) {
+    bool const gemm_mode = args.mode == GemmUniversalMode::kGemm;
+    bool const batched_mode = args.mode == GemmUniversalMode::kBatched && cute::rank(ProblemShape{}) == 4;
+    if (not (gemm_mode or batched_mode)) {
+      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Arguments or Problem Shape don't meet the requirements.\n");
+      return false;
+    }
+    bool const mainloop_ok = CollectiveMainloop::can_implement(args.problem_shape, args.mainloop);
+    bool const epilogue_ok = CollectiveEpilogue::can_implement(args.problem_shape, args.epilogue);
+    return TileScheduler::can_implement(args.scheduler) && mainloop_ok && epilogue_ok;
+  }
+
+  // Workspace bytes: scheduler region followed by epilogue region, each rounded to MinWorkspaceAlignment.
+  static size_t
+  get_workspace_size(Arguments const& args) {
+    constexpr uint32_t NumEpilogueSubTiles = CollectiveEpilogue::get_store_pipe_increment(TileShape{});
+
+    size_t const scheduler_bytes = TileScheduler::template get_workspace_size<ProblemShape, ElementAccumulator>(
+      args.scheduler, args.problem_shape, args.hw_info, NumMmaWarpGroups, NumEpilogueSubTiles);
+    size_t total_bytes = round_nearest(scheduler_bytes,  MinWorkspaceAlignment);
+
+    total_bytes += CollectiveEpilogue::get_workspace_size(args.problem_shape, args.epilogue);
+    total_bytes = round_nearest(total_bytes,  MinWorkspaceAlignment);
+
+    return total_bytes;
+  }
+
+  // Initializes the scheduler region, then the epilogue region, of `workspace`; returns at the first failure.
+  static cutlass::Status
+  initialize_workspace(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    constexpr uint32_t NumEpilogueSubTiles = CollectiveEpilogue::get_store_pipe_increment(TileShape{});
+    static constexpr uint32_t NumAccumulatorMtxs = 1;
+    uint8_t* const base_ptr = reinterpret_cast<uint8_t*>(workspace);
+    size_t byte_offset = 0;
+
+    // Scheduler region starts at offset 0.
+    Status result = TileScheduler::template initialize_workspace<ProblemShape, ElementAccumulator>(
+      args.scheduler, base_ptr + byte_offset, stream, args.problem_shape, args.hw_info, NumMmaWarpGroups, NumEpilogueSubTiles, NumAccumulatorMtxs, cuda_adapter);
+    byte_offset += TileScheduler::template get_workspace_size<ProblemShape, ElementAccumulator>(
+      args.scheduler, args.problem_shape, args.hw_info, NumMmaWarpGroups, NumEpilogueSubTiles);
+    byte_offset = round_nearest(byte_offset,  MinWorkspaceAlignment);
+    if (result != Status::kSuccess) {
+      return result;
+    }
+
+    // Epilogue region follows the aligned scheduler region.
+    result = CollectiveEpilogue::initialize_workspace(args.problem_shape, args.epilogue, base_ptr + byte_offset, stream, cuda_adapter);
+    byte_offset += CollectiveEpilogue::get_workspace_size(args.problem_shape, args.epilogue);
+    byte_offset = round_nearest(byte_offset,  MinWorkspaceAlignment);
+
+    // The outcome of the epilogue step, success or failure, is the final status.
+    return result;
+  }
+
+  // Launch grid from runtime params; sized so no more thread blocks are launched than can run concurrently.
+  static dim3
+  get_grid_shape(Params const& params) {
+    TileSchedulerArguments scheduler_args{};
+    if constexpr (!std::is_const_v<decltype(scheduler_args.max_swizzle_size)>) {
+      scheduler_args.max_swizzle_size = 1 << params.scheduler.log_swizzle_size_;
+    }
+    bool const rasterize_along_n = params.scheduler.raster_order_ == TileScheduler::RasterOrder::AlongN;
+    scheduler_args.raster_order = rasterize_along_n ? TileScheduler::RasterOrderOptions::AlongN : TileScheduler::RasterOrderOptions::AlongM;
+    return TileScheduler::get_grid_shape(params.scheduler, params.problem_shape, TileShape{}, ClusterShape{}, params.hw_info, scheduler_args);
+  }
+
+  // One-dimensional thread block holding the MMA threads plus the load warp group.
+  static dim3 get_block_shape() {
+    return dim3(MaxThreadsPerBlock, 1u, 1u);
+  }
+
+  CUTLASS_DEVICE
+  void
+  operator()(Params const& params, char* smem_buf) {
+    using namespace cute;
+    using X = Underscore;
+#if defined(__CUDA_ARCH_FEAT_SM90_ALL)
+#  define ENABLE_SM90_KERNEL_LEVEL 1
+#endif
+// Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a.
+#if ! defined(ENABLE_SM90_KERNEL_LEVEL)
+    printf("ERROR : Arch conditional MMA instruction used without targeting appropriate compute capability. Aborting.\n");
+#else
+
+    // Preconditions
+    static_assert(NumMMAThreads == 256, "Cooperative kernel must have TiledMMA operating using 256 threads.");
+    static_assert(size<0>(TileShape{}) >= 128,
+        "Cooperative kernel requires Tile Size to be greater than or equal to 128 along the M-dimension.");
+
+    static_assert(cute::rank(StrideA{}) == 3, "StrideA must be rank-3: [M, K, L]. If batch mode is not needed, set L stride to Int<0>.");
+    static_assert(cute::rank(StrideB{}) == 3, "StrideB must be rank-3: [N, K, L]. If batch mode is not needed, set L stride to Int<0>.");
+    static_assert(cute::rank(StrideC{}) == 3, "StrideC must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
+    static_assert(cute::rank(StrideD{}) == 3, "StrideD must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
+
+    /* In the Cooperative kernel, Consumer0 and Consumer1 collaborate on the same tile */
+    enum class WarpGroupRole {
+      Producer = 0,
+      Consumer0 = 1,
+      Consumer1 = 2
+    };
+    enum class ProducerWarpRole {
+      Mainloop = 0,
+      Warp1 = 1,
+      Epilogue = 2,
+      Warp3 = 3
+    };
+
+
+
+    // Kernel level shared memory storage
+    SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(smem_buf);
+
+    int thread_idx = int(threadIdx.x);
+    int lane_idx = canonical_lane_idx();
+    int warp_idx = canonical_warp_idx_sync();
+    int warp_idx_in_warp_group = warp_idx % NumWarpsPerWarpGroup;
+    int warp_group_thread_idx = thread_idx % NumThreadsPerWarpGroup;
+    int mma_thread_idx = thread_idx % NumMMAThreads;
+    auto warp_group_role = WarpGroupRole(canonical_warp_group_idx());
+    auto producer_warp_role = ProducerWarpRole(warp_idx_in_warp_group);
+    int lane_predicate = cute::elect_one_sync();
+    uint32_t block_rank_in_cluster = cute::block_rank_in_cluster();
+
+    // Issue Tma Descriptor Prefetch from a single thread
+    if ((warp_idx == 0) && lane_predicate) {
+      CollectiveMainloop::prefetch_tma_descriptors(params.mainloop);
+      CollectiveEpilogue::prefetch_tma_descriptors(params.epilogue);
+    }
+
+    CollectiveEpilogue collective_epilogue(params.epilogue, shared_storage.tensors.epilogue);
+    bool is_epi_load_needed = collective_epilogue.is_producer_load_needed();
+    // Mainloop Load pipeline
+    using MainloopPipeline = typename CollectiveMainloop::MainloopPipeline;
+    typename MainloopPipeline::Params mainloop_pipeline_params;
+    if (warp_group_role == WarpGroupRole::Producer && producer_warp_role == ProducerWarpRole::Mainloop) {
+      mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Producer;
+    }
+    if (warp_group_role == WarpGroupRole::Consumer0 || warp_group_role == WarpGroupRole::Consumer1) {
+      mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Consumer;
+    }
+    mainloop_pipeline_params.is_leader = warp_group_thread_idx == 0;
+    mainloop_pipeline_params.num_consumers = NumMMAThreads;
+    mainloop_pipeline_params.transaction_bytes = params.mainloop.tma_transaction_bytes;
+    MainloopPipeline mainloop_pipeline(shared_storage.pipelines.mainloop, mainloop_pipeline_params, ClusterShape{});
+
+    // Epilogue Load pipeline
+    using EpiLoadPipeline = typename CollectiveEpilogue::LoadPipeline;
+    typename EpiLoadPipeline::Params epi_load_pipeline_params;
+    if (warp_group_role == WarpGroupRole::Producer && producer_warp_role == ProducerWarpRole::Epilogue) {
+      epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Producer;
+    } 
+    if (warp_group_role == WarpGroupRole::Consumer0 || warp_group_role == WarpGroupRole::Consumer1) {
+      epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Consumer;
+    }
+    epi_load_pipeline_params.dst_blockid = cute::block_rank_in_cluster();
+    epi_load_pipeline_params.producer_arv_count = NumEpilogueLoadThreads;
+    epi_load_pipeline_params.consumer_arv_count = NumMMAThreads;
+    if constexpr (CollectiveEpilogue::RequiresTransactionBytes) {
+      epi_load_pipeline_params.transaction_bytes = params.epilogue.tma_transaction_bytes;
+    }
+    EpiLoadPipeline epi_load_pipeline(shared_storage.pipelines.epi_load, epi_load_pipeline_params);
+
+    // Epilogue Store pipeline
+    using EpiStorePipeline = typename CollectiveEpilogue::StorePipeline;
+    typename EpiStorePipeline::Params epi_store_pipeline_params;
+    epi_store_pipeline_params.always_wait = true;
+    EpiStorePipeline epi_store_pipeline(epi_store_pipeline_params);
+
+    typename LoadWarpOrderBarrier::Params params_load_order_barrier;
+    params_load_order_barrier.group_id = producer_warp_role == ProducerWarpRole::Mainloop ? 0 : 1;
+    params_load_order_barrier.group_size = NumThreadsPerWarp;
+    LoadWarpOrderBarrier load_order_barrier(shared_storage.pipelines.load_order, params_load_order_barrier);
+
+    // Initialize starting pipeline states for the collectives
+    // Epilogue store pipe is producer-only (consumer is TMA unit, waits via scoreboarding)
+    typename CollectiveMainloop::PipelineState mainloop_pipe_consumer_state;
+    typename CollectiveEpilogue::LoadPipelineState epi_load_pipe_consumer_state;
+
+    // For the DMA Load (producer) we start with an opposite phase
+    // i.e., we skip all waits since we know that the buffer is indeed empty
+    PipelineState mainloop_pipe_producer_state = cutlass::make_producer_start_state<MainloopPipeline>();
+    PipelineState epi_load_pipe_producer_state = cutlass::make_producer_start_state<EpiLoadPipeline>();
+    PipelineState epi_store_pipe_producer_state = cutlass::make_producer_start_state<EpiStorePipeline>();
+
+
+    auto cluster_wait_fn = [] () {
+      // We need this to guarantee that the Pipeline init is visible
+      // To all producers and consumer thread blocks in the Cluster
+      if constexpr (size(ClusterShape{}) > 1) {
+        cute::cluster_arrive_relaxed();
+        return [] () { cute::cluster_wait(); };
+      }
+      else {
+        __syncthreads();
+        return [] () {}; // do nothing
+      }
+    } ();
+
+    // Optionally append 1s until problem shape is rank-4 in case it is only rank-3 (MNK)
+    auto problem_shape_MNKL = append<4>(params.problem_shape, Int<1>{});
+
+    // Get the appropriate blocks for this thread block -- potential for thread block locality
+    TiledMma tiled_mma;
+    auto blk_shape = TileShape{};                                                                // (BLK_M,BLK_N,BLK_K)
+
+    TileScheduler scheduler{params.scheduler};
+    auto work_tile_info = scheduler.initial_work_tile_info(ClusterShape{});
+    
+    // In a warp specialized kernel, collectives expose data movement and compute operations separately
+    CollectiveMainloop collective_mainloop;
+
+    // Prepare and partition the input tensors. Expects a tuple of tensors where:
+    // get<0>(load_inputs) is the tma tensor A after local tiling so that it has shape (BLK_M,BLK_K,m,k,l)
+    // get<1>(load_inputs) is the tma tensor B after local tiling so that it has shape (BLK_N,BLK_K,n,k,l)
+    auto load_inputs = collective_mainloop.load_init(problem_shape_MNKL, params.mainloop);
+    static_assert(cute::tuple_size_v<decltype(load_inputs)> >= 2, "Output of load_init must have at least two elements (A, B)");
+
+    // Extract out partitioned A and B.
+    Tensor gA_mkl = get<0>(load_inputs);
+    Tensor gB_nkl = get<1>(load_inputs);
+
+    // Wait for all thread blocks in the Cluster
+    cluster_wait_fn();
+
+    if (warp_group_role == WarpGroupRole::Producer) {
+      cutlass::arch::warpgroup_reg_dealloc<LoadRegisterRequirement>();
+
+      // Mainloop Producer Warp
+      if (producer_warp_role == ProducerWarpRole::Mainloop) {
+        // Ensure that the prefetched kernel does not touch
+        // unflushed global memory prior to this instruction
+        cutlass::arch::wait_on_dependent_grids();
+        bool do_load_order_arrive = true;
+        while (work_tile_info.is_valid()) {
+          if (!TileScheduler::valid_warpgroup_in_work_tile(work_tile_info)) {
+            auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work(work_tile_info);
+            work_tile_info = next_work_tile_info;   
+            continue;
+          }
+
+          // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape
+          auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl));
+          auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl));
+          auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl));
+          auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
+
+          // Get the number of K tiles to compute for this work as well as the starting K tile offset of the work.
+          auto work_k_tile_count = TileScheduler::get_work_k_tile_count(work_tile_info, problem_shape_MNKL, blk_shape);
+          auto work_k_tile_start = TileScheduler::get_work_k_tile_start(work_tile_info);
+
+          auto k_tile_iter = cute::make_coord_iterator(idx2crd(work_k_tile_start, shape<3>(gA_mkl)), shape<3>(gA_mkl));
+
+          collective_mainloop.load(
+            params.mainloop,
+            mainloop_pipeline,
+            mainloop_pipe_producer_state,
+            load_inputs,
+            blk_coord,
+            k_tile_iter, work_k_tile_count,
+            lane_idx,
+            block_rank_in_cluster,
+            shared_storage.tensors.mainloop
+          );
+          // Update starting pipeline state for the next tile
+          mainloop_pipe_producer_state.advance(work_k_tile_count);
+
+          // Signal for the epilogue load warp to begin
+          if (do_load_order_arrive) {
+            load_order_barrier.arrive();
+            do_load_order_arrive = false;
+          }
+          // Get next work tile
+          auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work(work_tile_info
+                                                                           );
+
+          work_tile_info = next_work_tile_info;
+        } // Scheduler work fetch loop
+
+        // Make sure all Consumer Warp Groups have been waited upon
+        collective_mainloop.load_tail(mainloop_pipeline, mainloop_pipe_producer_state);
+
+      } // Mainloop Producer Warp End
+
+      // Epilogue Producer Warp
+      else if (producer_warp_role == ProducerWarpRole::Epilogue && is_epi_load_needed) {
+
+        // Ensure that the prefetched kernel does not touch
+        // unflushed global memory prior to this instruction
+        cutlass::arch::wait_on_dependent_grids();
+
+        if (!TileScheduler::requires_separate_reduction(params.scheduler) && work_tile_info.is_valid()) {
+          load_order_barrier.wait();
+        }
+
+        CollectiveEpilogue collective_epilogue(params.epilogue, shared_storage.tensors.epilogue);
+
+        while (work_tile_info.is_valid()) {
+          if (TileScheduler::compute_epilogue(work_tile_info, params.scheduler)) {
+            // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape
+            auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl));
+            auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl));
+            auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl));
+            auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
+            
+            epi_load_pipe_producer_state =
+            collective_epilogue.load(
+              epi_load_pipeline,
+              epi_load_pipe_producer_state,
+              problem_shape_MNKL,
+              blk_shape,
+              blk_coord,
+              tiled_mma,
+              lane_idx,
+              shared_storage.tensors.epilogue,
+              work_tile_info.reduction_subtile_idx()
+            );
+          }
+
+          // Get next work tile
+          auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work(work_tile_info
+                                                                           );
+          work_tile_info = next_work_tile_info;
+        } // Scheduler work fetch loop
+
+        // Make sure all Consumer Warp Groups have been waited upon
+        collective_epilogue.load_tail(epi_load_pipeline, epi_load_pipe_producer_state);
+      } // Epilogue Producer Warp End
+    } // Producer Warp Group End
+
+    else if (warp_group_role == WarpGroupRole::Consumer0 || warp_group_role == WarpGroupRole::Consumer1) {
+      cutlass::arch::warpgroup_reg_alloc<MmaRegisterRequirement>();
+
+      CollectiveEpilogue collective_epilogue(params.epilogue, shared_storage.tensors.epilogue);
+
+      // Do we potentially issue tail arrives for TMA stores, if epilogue load is waiting for it
+      bool do_store_tail = false;
+      while (work_tile_info.is_valid()) {
+        // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape
+        auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl));
+        auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl));
+        auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl));
+        auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
+        auto work_k_tile_count = TileScheduler::get_work_k_tile_count(work_tile_info, problem_shape_MNKL, blk_shape);
+
+        // Allocate the accumulators for the (M,N) blk_shape
+        //
+        // MSVC CTAD breaks if we say "Tensor" here, so we use "auto" instead.
+        auto accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape));               // (MMA,MMA_M,MMA_N)
+        if (TileScheduler::valid_warpgroup_in_work_tile(work_tile_info)) {
+          collective_mainloop.mma(
+            mainloop_pipeline,
+            mainloop_pipe_consumer_state,
+            accumulators,
+            work_k_tile_count,
+            mma_thread_idx,
+            shared_storage.tensors.mainloop,
+            params.mainloop
+          );
+
+          // Make sure the math instructions are done and free buffers before entering the epilogue
+          collective_mainloop.mma_tail(
+            mainloop_pipeline,
+            mainloop_pipe_consumer_state,
+            work_k_tile_count
+          );
+
+          // Update starting mainloop pipeline state for the next tile
+          mainloop_pipe_consumer_state.advance(work_k_tile_count);
+        }
+        #ifdef CUTLASS_ENABLE_GDC_FOR_SM90
+        if (scheduler.is_last_tile(work_tile_info)) {
+          // Hint on an early release of global memory resources.
+          // The timing of calling this function only influences performance,
+          // not functional correctness.
+          cutlass::arch::launch_dependent_grids();
+
+        }
+        #endif
+
+        // Index of warp group within consumer warp groups
+        int consumer_warp_group_idx = canonical_warp_group_idx() - NumLoadWarpGroups;
+
+        // Perform reduction across splits, if needed
+        TileScheduler::fixup(
+          params.scheduler, work_tile_info, accumulators, NumMmaWarpGroups, consumer_warp_group_idx);
+
+        if (TileScheduler::compute_epilogue(work_tile_info, params.scheduler)) {
+          // Epilogue and write to gD
+          auto [epi_load_pipe_consumer_state_next, epi_store_pipe_producer_state_next] =
+          collective_epilogue.store(
+            epi_load_pipeline,
+            epi_load_pipe_consumer_state,
+            epi_store_pipeline,
+            epi_store_pipe_producer_state,
+            problem_shape_MNKL,
+            blk_shape,
+            blk_coord,
+            accumulators,
+            tiled_mma,
+            mma_thread_idx,
+            shared_storage.tensors.epilogue,
+            work_tile_info.reduction_subtile_idx()
+          );
+          epi_load_pipe_consumer_state = epi_load_pipe_consumer_state_next;
+          epi_store_pipe_producer_state = epi_store_pipe_producer_state_next;
+          do_store_tail = true;
+        }
+
+        // Get next work tile
+        auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work(work_tile_info
+                                                                          );
+        work_tile_info = next_work_tile_info;
+      } // Scheduler work fetch loop
+
+      if (do_store_tail) {
+        collective_epilogue.store_tail(
+          epi_load_pipeline,
+          epi_load_pipe_consumer_state,
+          epi_store_pipeline,
+          epi_store_pipe_producer_state
+        );
+      }
+    } // Consumer Warp Groups End
+#endif
+  }
+
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::gemm::kernel
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp
new file mode 100755
index 000000000..cf4a552cb
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp
@@ -0,0 +1,664 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/workspace.h"
+#include "cutlass/kernel_hardware_info.hpp"
+#include "cutlass/fast_math.h"
+#include "cute/arch/cluster_sm90.hpp"
+#include "cutlass/arch/reg_reconfig.h"
+#include "cutlass/arch/mma_sm90.h"
+#include "cutlass/epilogue/collective/detail.hpp"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/kernel/sm90_tile_scheduler.hpp"
+#include "cutlass/gemm/kernel/tile_scheduler.hpp"
+#include "cutlass/gemm/kernel/gemm_universal_decl.h"
+#include "cutlass/pipeline/pipeline.hpp"
+#include "cutlass/trace.h"
+
+#include "cute/tensor.hpp"
+#include "cutlass/arch/grid_dependency_control.h"
+
+///////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::kernel {
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <
+  class ProblemShape_,
+  class CollectiveMainloop_,
+  class CollectiveEpilogue_,
+  class TileScheduler_
+>
+class GemmUniversal<
+  ProblemShape_,
+  CollectiveMainloop_,
+  CollectiveEpilogue_,
+  TileScheduler_,
+  cute::enable_if_t<cute::is_base_of_v<KernelTmaWarpSpecializedPingpong, typename CollectiveMainloop_::DispatchPolicy::Schedule>>>
+{
+public:
+  //
+  // Type Aliases
+  //
+  using ProblemShape = ProblemShape_;
+  static_assert(cute::rank(ProblemShape{}) == 3 or cute::rank(ProblemShape{}) == 4,
+    "ProblemShape{} should be <M,N,K> or <M,N,K,L>");
+  static constexpr bool IsGdcEnabled = cutlass::arch::IsGdcGloballyEnabled; // copy of the library-wide GDC switch
+
+  // Mainloop derived types
+  using CollectiveMainloop = CollectiveMainloop_;
+  using TileShape = typename CollectiveMainloop::TileShape;
+  using TiledMma  = typename CollectiveMainloop::TiledMma;
+  using ArchTag   = typename CollectiveMainloop::ArchTag;
+  using ElementA  = typename CollectiveMainloop::ElementA;
+  using StrideA   = typename CollectiveMainloop::StrideA;
+  using ElementB  = typename CollectiveMainloop::ElementB;
+  using StrideB   = typename CollectiveMainloop::StrideB;
+  using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy;
+  using ElementAccumulator = typename CollectiveMainloop::ElementAccumulator;
+  using ClusterShape = typename DispatchPolicy::ClusterShape;
+  using MainloopArguments = typename CollectiveMainloop::Arguments;
+  using MainloopParams = typename CollectiveMainloop::Params;
+  static_assert(ArchTag::kMinComputeCapability >= 90); // ArchTag must advertise a minimum capability of at least 90
+
+  // Epilogue derived types
+  using CollectiveEpilogue = CollectiveEpilogue_;
+  using ElementC = typename CollectiveEpilogue::ElementC;
+  using StrideC  = typename CollectiveEpilogue::StrideC;
+  using ElementD = typename CollectiveEpilogue::ElementD;
+  using StrideD  = typename CollectiveEpilogue::StrideD;
+  using EpilogueArguments = typename CollectiveEpilogue::Arguments;
+  using EpilogueParams = typename CollectiveEpilogue::Params;
+
+  static_assert(!cute::is_same_v<TileScheduler_, StreamKScheduler>, "Ping-pong kernel does not currently support stream-K scheduler.");
+  using TileSchedulerTag = TileScheduler_;
+  using TileScheduler = typename detail::TileSchedulerSelector<
+    TileScheduler_, ArchTag, TileShape, ClusterShape>::Scheduler;
+  using TileSchedulerArguments = typename TileScheduler::Arguments;
+  using TileSchedulerParams = typename TileScheduler::Params;
+
+  static constexpr uint32_t NumLoadWarpGroups = 1; // WarpGroupRole::Producer in operator()
+  static constexpr uint32_t NumMmaWarpGroups = 2;  // WarpGroupRole::Consumer0 and Consumer1 in operator()
+  static constexpr uint32_t MaxThreadsPerBlock = CUTE_STATIC_V(size(TiledMma{})) + (NumMmaWarpGroups * NumThreadsPerWarpGroup);
+  static constexpr uint32_t MinBlocksPerMultiprocessor = 1;
+
+  /// Register requirement for Load and Math WGs
+  static constexpr uint32_t LoadRegisterRequirement = 40;  // passed to warpgroup_reg_dealloc by the producer warp group
+  static constexpr uint32_t MmaRegisterRequirement = 232;  // passed to warpgroup_reg_alloc by each consumer warp group
+
+  // 1 stage ordered sequence between mainloop and epilogue producer load threads
+  using LoadWarpOrderBarrier = cutlass::OrderedSequenceBarrier<1,2>; // mainloop load warp is group 0; epilogue load warp waits as group 1
+
+  // Order Sequence barrier with two stages: one for Mainloop and one for Epilogue
+  static constexpr uint32_t StagesPerMathWarpGroup = 2;
+  using MathWarpGroupOrderBarrier = cutlass::OrderedSequenceBarrier<
+    StagesPerMathWarpGroup, NumMmaWarpGroups>;
+  using MathWarpGroupOrderBarrierSharedStorage =
+    cutlass::PipelineDetail::OrderedSequenceBarrierSharedStorage<
+      MathWarpGroupOrderBarrier::SequenceDepth,
+      MathWarpGroupOrderBarrier::SequenceLength>;
+
+  // Kernel level shared memory storage; operator() reinterprets its smem_buf argument as this struct
+  struct SharedStorage {
+    struct PipelineStorage : cute::aligned_struct<16, _1> {
+      using MainloopPipelineStorage = typename CollectiveMainloop::PipelineStorage;
+      using EpiLoadPipelineStorage = typename CollectiveEpilogue::PipelineStorage;
+      using MathWarpGroupOrderBarrierStorage = MathWarpGroupOrderBarrierSharedStorage;
+      // Synchronization state; each member is handed to the matching pipeline/barrier constructor in operator()
+      alignas(16) MainloopPipelineStorage mainloop;                        // -> MainloopPipeline
+      alignas(16) EpiLoadPipelineStorage epi_load;                         // -> EpiLoadPipeline
+      alignas(16) MathWarpGroupOrderBarrierStorage math_wg_order;          // -> MathWarpGroupOrderBarrier
+      alignas(16) typename LoadWarpOrderBarrier::SharedStorage load_order; // -> LoadWarpOrderBarrier
+    } pipelines;
+
+    struct TensorStorage : cute::aligned_struct<128, _1> {
+      using MainloopTensorStorage = typename CollectiveMainloop::TensorStorage;
+      using EpilogueTensorStorage = typename CollectiveEpilogue::TensorStorage;
+      // Tensor storage passed to the collective epilogue and collective mainloop calls in operator()
+      EpilogueTensorStorage epilogue;
+      MainloopTensorStorage mainloop;
+    } tensors;
+  };
+
+  static constexpr int SharedStorageSize = sizeof(SharedStorage);
+
+  // User-facing arguments; to_underlying_arguments() converts these into Params
+  struct Arguments {
+    GemmUniversalMode mode{};           // can_implement() accepts kGemm, or kBatched with a rank-4 ProblemShape
+    ProblemShape problem_shape{};       // <M,N,K> or <M,N,K,L>
+    MainloopArguments mainloop{};       // forwarded to CollectiveMainloop::to_underlying_arguments
+    EpilogueArguments epilogue{};       // forwarded to CollectiveEpilogue::to_underlying_arguments
+    KernelHardwareInfo hw_info{};       // device_id and sm_count; sm_count <= 0 makes the device be queried
+    TileSchedulerArguments scheduler{}; // forwarded to TileScheduler::to_underlying_arguments
+  };
+
+  // Kernel entry point API; produced by to_underlying_arguments() and consumed by operator()
+  struct Params {
+    GemmUniversalMode mode{};        // copied from Arguments::mode
+    ProblemShape problem_shape{};    // Arguments::problem_shape, with M and N exchanged when the mainloop declares SwapAB
+    MainloopParams mainloop{};       // result of CollectiveMainloop::to_underlying_arguments
+    EpilogueParams epilogue{};       // result of CollectiveEpilogue::to_underlying_arguments
+    KernelHardwareInfo hw_info{};    // sm_count is filled in from the device when the caller passed a value <= 0
+    TileSchedulerParams scheduler{}; // result of TileScheduler::to_underlying_arguments
+  };
+
+  //
+  // Methods
+  //
+
+  // Translates user-facing Arguments into kernel Params: applies the optional M/N swap to the problem shape,
+  // resolves the SM count, and splits the caller's workspace between the tile scheduler and the epilogue.
+  static Params
+  to_underlying_arguments(Arguments const& args, void* workspace) {
+    CUTLASS_TRACE_HOST("to_underlying_arguments():");
+
+    // Shape stored in Params; M and N are exchanged when the mainloop declares SwapAB
+    auto kernel_shape = args.problem_shape;
+    if constexpr (detail::Has_SwapAB_v<CollectiveMainloop>) {
+      get<0>(kernel_shape) = get<1>(args.problem_shape);
+      get<1>(kernel_shape) = get<0>(args.problem_shape);
+    }
+    // The tile scheduler receives a rank-4 shape; a rank-3 shape gets a trailing extent of 1
+    auto kernel_shape_MNKL = append<4>(kernel_shape, 1);
+
+    // Prefer the caller's SM count; fall back to querying the device when it is not positive
+    int num_sms = args.hw_info.sm_count;
+    if (num_sms <= 0) {
+      CUTLASS_TRACE_HOST("  WARNING: Arguments do not include a valid SM count.\n"
+          "  For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count.");
+      num_sms = KernelHardwareInfo::query_device_multiprocessor_count(args.hw_info.device_id);
+    }
+
+    CUTLASS_TRACE_HOST("to_underlying_arguments(): Setting persistent grid SM count to " << num_sms);
+    KernelHardwareInfo resolved_hw_info{args.hw_info.device_id, num_sms};
+
+    // Workspace layout: [scheduler region | epilogue region], each rounded up to MinWorkspaceAlignment
+    auto* const workspace_base = reinterpret_cast<uint8_t*>(workspace);
+    size_t bytes_used = 0;
+
+    void* scheduler_workspace = workspace_base;
+    bytes_used += TileScheduler::template get_workspace_size<ProblemShape, ElementAccumulator>(
+      args.scheduler, args.problem_shape, args.hw_info, NumMmaWarpGroups);
+    bytes_used = round_nearest(bytes_used, MinWorkspaceAlignment);
+
+    void* epilogue_workspace = workspace_base + bytes_used;
+    bytes_used += CollectiveEpilogue::get_workspace_size(args.problem_shape, args.epilogue);
+    bytes_used = round_nearest(bytes_used, MinWorkspaceAlignment);
+
+    void* mainloop_workspace = nullptr;
+
+    return {
+      args.mode,
+      kernel_shape,
+      CollectiveMainloop::to_underlying_arguments(args.problem_shape, args.mainloop, mainloop_workspace),
+      CollectiveEpilogue::to_underlying_arguments(args.problem_shape, args.epilogue, epilogue_workspace),
+      resolved_hw_info,
+      TileScheduler::to_underlying_arguments(kernel_shape_MNKL, TileShape{}, ClusterShape{}, resolved_hw_info, args.scheduler, scheduler_workspace)
+    };
+  }
+
+  // True when the mode is kGemm (or kBatched with a rank-4 ProblemShape) and every component accepts args.
+  static bool can_implement(Arguments const& args) {
+    bool const is_gemm = args.mode == GemmUniversalMode::kGemm;
+    bool const is_rank4_batched = args.mode == GemmUniversalMode::kBatched && cute::rank(ProblemShape{}) == 4;
+    if (!(is_gemm or is_rank4_batched)) {
+      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Arguments or Problem Shape don't meet the requirements.\n");
+      return false;
+    }
+    // Every component is queried exactly once, without short-circuiting
+    bool const mainloop_ok = CollectiveMainloop::can_implement(args.problem_shape, args.mainloop);
+    bool const epilogue_ok = CollectiveEpilogue::can_implement(args.problem_shape, args.epilogue);
+    bool const scheduler_ok = TileScheduler::can_implement(args.scheduler);
+    return mainloop_ok && epilogue_ok && scheduler_ok;
+  }
+
+  // Workspace bytes required: scheduler region, then epilogue region, each rounded up to MinWorkspaceAlignment.
+  static size_t
+  get_workspace_size(Arguments const& args) {
+    size_t const scheduler_bytes = TileScheduler::template get_workspace_size<ProblemShape, ElementAccumulator>(
+      args.scheduler, args.problem_shape, args.hw_info, NumMmaWarpGroups);
+    size_t const epilogue_offset = round_nearest(scheduler_bytes, MinWorkspaceAlignment);
+
+    size_t const epilogue_bytes = CollectiveEpilogue::get_workspace_size(args.problem_shape, args.epilogue);
+    size_t const total_bytes = round_nearest(epilogue_offset + epilogue_bytes, MinWorkspaceAlignment);
+
+    return total_bytes;
+  }
+
+  // Initializes the scheduler region and then the epilogue region of the workspace, walking the same
+  // layout that get_workspace_size() accounts for. Returns the first non-success status encountered.
+  static cutlass::Status
+  initialize_workspace(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    static constexpr uint32_t NumEpilogueSubTiles = 1;
+    static constexpr uint32_t NumAccumulatorMtxs = 1;
+    auto* const workspace_base = reinterpret_cast<uint8_t*>(workspace);
+    size_t bytes_used = 0;
+
+    // The scheduler region starts at offset 0
+    Status const scheduler_status = TileScheduler::template initialize_workspace<ProblemShape, ElementAccumulator>(
+      args.scheduler, workspace_base + bytes_used, stream, args.problem_shape, args.hw_info, NumMmaWarpGroups, NumEpilogueSubTiles, NumAccumulatorMtxs, cuda_adapter);
+    bytes_used += TileScheduler::template get_workspace_size<ProblemShape, ElementAccumulator>(
+      args.scheduler, args.problem_shape, args.hw_info, NumMmaWarpGroups);
+    bytes_used = round_nearest(bytes_used, MinWorkspaceAlignment);
+    if (scheduler_status != Status::kSuccess) {
+      return scheduler_status;
+    }
+
+    // The epilogue region follows the aligned scheduler region
+    Status const epilogue_status = CollectiveEpilogue::initialize_workspace(args.problem_shape, args.epilogue, workspace_base + bytes_used, stream, cuda_adapter);
+    bytes_used += CollectiveEpilogue::get_workspace_size(args.problem_shape, args.epilogue);
+    bytes_used = round_nearest(bytes_used, MinWorkspaceAlignment);
+
+    return epilogue_status;
+  }
+
+  // Launch grid shape; the scheduler sizes it so no more thread blocks are launched than can run concurrently.
+  static dim3
+  get_grid_shape(Params const& params) {
+    TileSchedulerArguments scheduler_args{}; // rebuilt from the swizzle size and raster order stored in params
+    if constexpr (!std::is_const_v<decltype(scheduler_args.max_swizzle_size)>) {
+      scheduler_args.max_swizzle_size = 1 << params.scheduler.log_swizzle_size_;
+    }
+    bool const raster_along_n = params.scheduler.raster_order_ == TileScheduler::RasterOrder::AlongN;
+    scheduler_args.raster_order = raster_along_n ? TileScheduler::RasterOrderOptions::AlongN : TileScheduler::RasterOrderOptions::AlongM;
+    return TileScheduler::get_grid_shape(params.scheduler, params.problem_shape, TileShape{}, ClusterShape{}, params.hw_info, scheduler_args);
+  }
+
+  static dim3 get_block_shape() {
+    dim3 const threads(MaxThreadsPerBlock, 1, 1);  // one-dimensional block: y and z extents are 1
+    return threads;
+  }
+
+  // Kernel body. `params` is the output of to_underlying_arguments(); `smem_buf` is reinterpreted as SharedStorage.
+  // Warp group 0 is the producer (warp 0 loads mainloop tiles, warp 2 loads epilogue tiles); warp groups 1 and 2
+  // are consumers that take alternating work tiles and are ordered against each other by math_wg_order_barrier.
+  CUTLASS_DEVICE void operator()(Params const& params, char* smem_buf) {
+    using namespace cute;
+    using X = Underscore;
+// Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a.
+#if ! defined(__CUDA_ARCH_FEAT_SM90_ALL)
+    printf("ERROR : Arch conditional MMA instruction used without targeting sm90a compute capability. Aborting.\n");
+#else
+
+    // Preconditions
+    static_assert(cute::rank(StrideA{}) == 3, "StrideA must be rank-3: [M, K, L]. If batch mode is not needed, set L stride to Int<0>.");
+    static_assert(cute::rank(StrideB{}) == 3, "StrideB must be rank-3: [N, K, L]. If batch mode is not needed, set L stride to Int<0>.");
+    static_assert(cute::rank(StrideC{}) == 3, "StrideC must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
+    static_assert(cute::rank(StrideD{}) == 3, "StrideD must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
+
+    enum class WarpGroupRole {
+      Producer = 0,
+      Consumer0 = 1,
+      Consumer1 = 2
+    };
+    enum class ProducerWarpRole {
+      Mainloop = 0,
+      Warp1 = 1,
+      Epilogue = 2,
+      Warp3 = 3
+    };
+
+    // Kernel level shared memory storage
+    SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(smem_buf);
+
+    int thread_idx = int(threadIdx.x);
+    int lane_idx = canonical_lane_idx();
+    int warp_idx = canonical_warp_idx_sync();
+    int warp_idx_in_warp_group = warp_idx % NumWarpsPerWarpGroup;
+    int warp_group_thread_idx = thread_idx % NumThreadsPerWarpGroup;
+    auto warp_group_role = WarpGroupRole(canonical_warp_group_idx());
+    auto producer_warp_role = ProducerWarpRole(warp_idx_in_warp_group);
+    int lane_predicate = cute::elect_one_sync();
+    uint32_t block_rank_in_cluster = cute::block_rank_in_cluster();
+
+    // Issue Tma Descriptor Prefetch from a single thread
+    if ((warp_idx == 0) && lane_predicate) {
+      CollectiveMainloop::prefetch_tma_descriptors(params.mainloop);
+      CollectiveEpilogue::prefetch_tma_descriptors(params.epilogue);
+    }
+
+    // Mainloop Load pipeline
+    using MainloopPipeline = typename CollectiveMainloop::MainloopPipeline;
+    typename MainloopPipeline::Params mainloop_pipeline_params;
+    if (warp_group_role == WarpGroupRole::Producer && producer_warp_role == ProducerWarpRole::Mainloop) {
+      mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Producer;
+    }
+    if (warp_group_role == WarpGroupRole::Consumer0 || warp_group_role == WarpGroupRole::Consumer1) {
+      mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Consumer;
+    }
+    mainloop_pipeline_params.is_leader = warp_group_thread_idx == 0;
+    mainloop_pipeline_params.num_consumers = NumThreadsPerWarpGroup;
+    mainloop_pipeline_params.transaction_bytes = params.mainloop.tma_transaction_bytes;
+    MainloopPipeline mainloop_pipeline(shared_storage.pipelines.mainloop, mainloop_pipeline_params, ClusterShape{});
+
+    // Epilogue Load pipeline
+    using EpiLoadPipeline = typename CollectiveEpilogue::LoadPipeline;
+    typename EpiLoadPipeline::Params epi_load_pipeline_params;
+    if (warp_group_role == WarpGroupRole::Producer && producer_warp_role == ProducerWarpRole::Epilogue) {
+      epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Producer;
+    }
+    if (warp_group_role == WarpGroupRole::Consumer0 || warp_group_role == WarpGroupRole::Consumer1) {
+      epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Consumer;
+    }
+    epi_load_pipeline_params.dst_blockid = cute::block_rank_in_cluster();
+    epi_load_pipeline_params.producer_arv_count = NumThreadsPerWarp;
+    epi_load_pipeline_params.consumer_arv_count = NumThreadsPerWarpGroup;
+    if constexpr (CollectiveEpilogue::RequiresTransactionBytes) {
+      epi_load_pipeline_params.transaction_bytes = params.epilogue.tma_transaction_bytes;
+    }
+    EpiLoadPipeline epi_load_pipeline(shared_storage.pipelines.epi_load, epi_load_pipeline_params);
+
+    // Epilogue Store pipeline
+    using EpiStorePipeline = typename CollectiveEpilogue::StorePipeline;
+    typename EpiStorePipeline::Params epi_store_pipeline_params;
+    epi_store_pipeline_params.always_wait = true;
+    EpiStorePipeline epi_store_pipeline(epi_store_pipeline_params);
+
+    typename LoadWarpOrderBarrier::Params params_load_order_barrier;
+    params_load_order_barrier.group_id = producer_warp_role == ProducerWarpRole::Mainloop ? 0 : 1;
+    params_load_order_barrier.group_size = NumThreadsPerWarp;
+    LoadWarpOrderBarrier load_order_barrier(shared_storage.pipelines.load_order, params_load_order_barrier);
+
+    typename MathWarpGroupOrderBarrier::Params params_math_wg_order_barrier;
+    // DMA Load WG will not participate in these Ordered Barrier syncs
+    params_math_wg_order_barrier.group_id = canonical_warp_group_idx() - static_cast<int>(WarpGroupRole::Consumer0);
+    params_math_wg_order_barrier.group_size = NumThreadsPerWarpGroup; // Number of threads / participants in a group
+    MathWarpGroupOrderBarrier math_wg_order_barrier(shared_storage.pipelines.math_wg_order, params_math_wg_order_barrier);
+
+    // Initialize starting pipeline states for the collectives
+    // Epilogue store pipe is producer-only (consumer is TMA unit, waits via scoreboarding)
+    typename CollectiveMainloop::PipelineState mainloop_pipe_consumer_state;
+    typename CollectiveEpilogue::LoadPipelineState epi_load_pipe_consumer_state;
+
+    // For the DMA Load (producer) we start with an opposite phase
+    // i.e., we skip all waits since we know that the buffer is indeed empty
+    PipelineState mainloop_pipe_producer_state = cutlass::make_producer_start_state<MainloopPipeline>();
+    PipelineState epi_load_pipe_producer_state = cutlass::make_producer_start_state<EpiLoadPipeline>();
+    PipelineState epi_store_pipe_producer_state = cutlass::make_producer_start_state<EpiStorePipeline>();
+
+    auto cluster_wait_fn = [&] () {
+      // We need this to guarantee that the Pipeline init is visible
+      // to all producers and consumer thread blocks in the Cluster
+      if constexpr (size(ClusterShape{}) > 1) {
+        cute::cluster_arrive_relaxed();
+        return [] () { cute::cluster_wait(); };
+      }
+      else {
+        __syncthreads();
+        return [] () {}; // do nothing
+      }
+    } ();
+
+    // Separate out problem shape for convenience
+    // Optionally append 1s until problem shape is rank-4 in case it is only rank-3 (MNK)
+    auto problem_shape_MNKL = append<4>(params.problem_shape, Int<1>{});
+
+    // Get the appropriate blocks for this thread block -- potential for thread block locality
+    TiledMma tiled_mma;
+    auto blk_shape = TileShape{};                                                                // (BLK_M,BLK_N,BLK_K)
+
+    // In a warp specialized kernel, collectives expose data movement and compute operations separately
+    CollectiveMainloop collective_mainloop;
+    CollectiveEpilogue collective_epilogue(params.epilogue, shared_storage.tensors.epilogue);
+
+    // Prepare and partition the input tensors. Expects a tuple of tensors where:
+    // get<0>(load_inputs) is the tma tensor A after local tiling so that it has shape (BLK_M,BLK_K,m,k,l)
+    // get<1>(load_inputs) is the tma tensor B after local tiling so that it has shape (BLK_N,BLK_K,n,k,l)
+    auto load_inputs = collective_mainloop.load_init(problem_shape_MNKL, params.mainloop);
+    static_assert(cute::tuple_size_v<decltype(load_inputs)> >= 2, "Output of load_init must have at least two elements (A, B)");
+
+    // Extract out partitioned A and B.
+    Tensor gA_mkl = get<0>(load_inputs);
+    Tensor gB_nkl = get<1>(load_inputs);
+
+    // Get pipeline stage increments from tensor shapes
+    auto k_tile_count = size<3>(gA_mkl);
+    auto c_tile_count = CollectiveEpilogue::get_load_pipe_increment(blk_shape);
+    auto d_tile_count = CollectiveEpilogue::get_store_pipe_increment(blk_shape);
+
+    TileScheduler scheduler{params.scheduler};
+
+    if (warp_group_role == WarpGroupRole::Consumer1) {
+      // Advance 2nd Math WG to the next work tile for the startup
+      scheduler.advance_to_next_work();
+      // Advance 2nd Math WG pipeline states to the end of 1st Math WG
+      mainloop_pipe_consumer_state.advance(k_tile_count);
+      epi_load_pipe_consumer_state.advance(c_tile_count);
+      epi_store_pipe_producer_state.advance(d_tile_count);
+    }
+    auto work_tile_info = scheduler.initial_work_tile_info(ClusterShape{});
+
+    // Wait for all thread blocks in the Cluster
+    cluster_wait_fn();
+
+    if (warp_group_role == WarpGroupRole::Producer) {
+      cutlass::arch::warpgroup_reg_dealloc<LoadRegisterRequirement>();
+
+      // Mainloop Producer Warp
+      if (producer_warp_role == ProducerWarpRole::Mainloop) {
+        // Ensure that the prefetched kernel does not touch
+        // unflushed global memory prior to this instruction
+        cutlass::arch::wait_on_dependent_grids();
+        bool do_load_order_arrive = true;
+        while (work_tile_info.is_valid()) {
+          // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape
+          auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl));
+          auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl));
+          auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl));
+          auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
+
+          auto k_tile_iter  = cute::make_coord_iterator(shape<3>(gA_mkl));
+
+          collective_mainloop.load(
+            params.mainloop,
+            mainloop_pipeline,
+            mainloop_pipe_producer_state,
+            load_inputs,
+            blk_coord,
+            k_tile_iter, k_tile_count,
+            lane_idx,
+            block_rank_in_cluster,
+            shared_storage.tensors.mainloop
+          );
+          // Update starting pipeline state for the next tile
+          mainloop_pipe_producer_state.advance(k_tile_count);
+
+          // Signal for the epilogue load warp to begin
+          if (do_load_order_arrive) {
+            load_order_barrier.arrive();
+            do_load_order_arrive = false;
+          }
+
+          // Get next work tile
+          scheduler.advance_to_next_work();
+          work_tile_info = scheduler.get_current_work();
+        } // Scheduler work fetch loop
+
+        // Make sure all Consumer Warp Groups have been waited upon
+        collective_mainloop.load_tail(mainloop_pipeline, mainloop_pipe_producer_state);
+
+      } // Mainloop Producer Warp End
+
+      // Epilogue Producer Warp
+      else if (producer_warp_role == ProducerWarpRole::Epilogue && collective_epilogue.is_producer_load_needed()) {
+        // Ensure that the prefetched kernel does not touch
+        // unflushed global memory prior to this instruction
+        cutlass::arch::wait_on_dependent_grids();
+        // The mainloop warp arrives on load_order_barrier only after loading its first tile. With no valid
+        // tile it never arrives, so waiting unconditionally would hang this warp; wait only when work exists.
+        if (work_tile_info.is_valid()) { load_order_barrier.wait(); }
+        while (work_tile_info.is_valid()) {
+          // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape
+          auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl));
+          auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl));
+          auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl));
+          auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
+
+          epi_load_pipe_producer_state =
+          collective_epilogue.load(
+            epi_load_pipeline,
+            epi_load_pipe_producer_state,
+            problem_shape_MNKL,
+            blk_shape,
+            blk_coord,
+            tiled_mma,
+            lane_idx,
+            shared_storage.tensors.epilogue
+          );
+
+          // Get next work tile
+          scheduler.advance_to_next_work();
+          work_tile_info = scheduler.get_current_work();
+        } // Scheduler work fetch loop
+
+        // Make sure all Consumer Warp Groups have been waited upon
+        collective_epilogue.load_tail(epi_load_pipeline, epi_load_pipe_producer_state);
+      } // Epilogue Producer Warp End
+    } // Producer Warp Group End
+
+    else if (warp_group_role == WarpGroupRole::Consumer0 || warp_group_role == WarpGroupRole::Consumer1) {
+      cutlass::arch::warpgroup_reg_alloc<MmaRegisterRequirement>();
+
+      #ifdef CUTLASS_ENABLE_GDC_FOR_SM90
+      // It is possible to have work tiles start off invalid,
+      // so we have to check that first.
+      if (not work_tile_info.is_valid()) {
+        // Hint on an early release of global memory resources.
+        // The timing of calling this function only influences performance,
+        // not functional correctness.
+        cutlass::arch::launch_dependent_grids();
+
+        return;
+      }
+      #endif
+
+      while (work_tile_info.is_valid()) {
+        // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape
+        auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl));
+        auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl));
+        auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl));
+        auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
+
+        // Allocate the accumulators for the (M,N) blk_shape
+        Tensor accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape));               // (MMA,MMA_M,MMA_N)
+
+        // Order two Math WG's MMA one after the other, helps hide Epilogue
+        math_wg_order_barrier.wait();
+
+        collective_mainloop.mma(
+          mainloop_pipeline,
+          mainloop_pipe_consumer_state,
+          accumulators,
+          k_tile_count,
+          warp_group_thread_idx,
+          shared_storage.tensors.mainloop,
+          params.mainloop
+        );
+
+        // Cue for next Math WG's MMA to start
+        math_wg_order_barrier.arrive();
+
+        // Make sure the math instructions are done and free buffers before entering the epilogue
+        collective_mainloop.mma_tail(
+          mainloop_pipeline,
+          mainloop_pipe_consumer_state,
+          k_tile_count
+        );
+        // Update starting mainloop pipeline state for the next tile
+        mainloop_pipe_consumer_state.advance(k_tile_count * NumMmaWarpGroups);
+
+        #ifdef CUTLASS_ENABLE_GDC_FOR_SM90
+        if (scheduler.is_last_tile(work_tile_info, NumMmaWarpGroups)) {
+          // Hint on an early release of global memory resources.
+          // The timing of calling this function only influences performance,
+          // not functional correctness.
+          cutlass::arch::launch_dependent_grids();
+
+        }
+        #endif
+
+        // Order two Math WG's Epilogue one after the other
+        math_wg_order_barrier.wait();
+
+        // Epilogue and write to gD
+        auto [epi_load_pipe_consumer_state_next, epi_store_pipe_producer_state_next] =
+        collective_epilogue.store(
+          epi_load_pipeline,
+          epi_load_pipe_consumer_state,
+          epi_store_pipeline,
+          epi_store_pipe_producer_state,
+          problem_shape_MNKL,
+          blk_shape,
+          blk_coord,
+          accumulators,
+          tiled_mma,
+          warp_group_thread_idx,
+          shared_storage.tensors.epilogue
+        );
+
+        // TMA store pipeline wait is only visible to TMA-issuing warp, so for multiple-consumer kernels
+        // we need to wait for all TMA stores to complete before issuing consumer order barrier arrives
+        // to ensure next math consumer doesn't overwrite smem of in-flight TMA stores of current consumer.
+        auto [epi_load_pipe_consumer_state_next_, epi_store_pipe_producer_state_next_] =
+        collective_epilogue.store_tail(
+          epi_load_pipeline,
+          epi_load_pipe_consumer_state_next,
+          epi_store_pipeline,
+          epi_store_pipe_producer_state_next
+        );
+
+        // Update starting load/store pipeline states for the next tile
+        // state has already been incremented by 1 tile in collective calls, advance once again for ping pong
+        epi_load_pipe_consumer_state = epi_load_pipe_consumer_state_next_;
+        epi_store_pipe_producer_state = epi_store_pipe_producer_state_next_;
+        epi_load_pipe_consumer_state.advance(c_tile_count);
+        epi_store_pipe_producer_state.advance(d_tile_count);
+
+        // Cue for next Math WG's Epilogue to start
+        math_wg_order_barrier.arrive();
+
+        // Get next work tile
+        scheduler.advance_to_next_work(NumMmaWarpGroups);
+        work_tile_info = scheduler.get_current_work();
+      } // Scheduler work fetch loop
+    } // Consumer Warp Groups End
+#endif
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::gemm::kernel
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized.hpp
new file mode 100755
index 000000000..c2a888ae3
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized.hpp
@@ -0,0 +1,417 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/kernel_hardware_info.hpp"
+#include "cute/arch/cluster_sm90.hpp"
+#include "cutlass/arch/reg_reconfig.h"
+#include "cutlass/arch/mma_sm90.h"
+#include "cutlass/epilogue/collective/detail.hpp"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/kernel/sm90_tile_scheduler.hpp"
+#include "cutlass/pipeline/pipeline.hpp"
+#include "cute/tensor.hpp"
+///////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::kernel {
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <  // Partial specialization chosen when the mainloop's Schedule derives from KernelCpAsyncWarpSpecialized (see enable_if below)
+  class ProblemShape_,
+  class CollectiveMainloop_,
+  class CollectiveEpilogue_,
+  class TileScheduler_
+>
+class GemmUniversal<
+  ProblemShape_,
+  CollectiveMainloop_,
+  CollectiveEpilogue_,
+  TileScheduler_,
+  cute::enable_if_t<cute::is_base_of_v<KernelCpAsyncWarpSpecialized, typename CollectiveMainloop_::DispatchPolicy::Schedule>>>
+{
+public:
+  //
+  // Type Aliases
+  //
+  using ProblemShape = ProblemShape_;
+  static_assert(cute::rank(ProblemShape{}) == 3 or cute::rank(ProblemShape{}) == 4,
+    "ProblemShape{} should be <M,N,K> or <M,N,K,L>");
+  static constexpr bool IsGdcEnabled = false;
+
+  // Mainloop derived types
+  using CollectiveMainloop = CollectiveMainloop_;
+  using TileShape = typename CollectiveMainloop::TileShape;
+  using TiledMma  = typename CollectiveMainloop::TiledMma;
+  using ArchTag   = typename CollectiveMainloop::ArchTag;
+  using ElementA  = typename CollectiveMainloop::ElementA;
+  using StrideA   = typename CollectiveMainloop::StrideA;
+  using ElementB  = typename CollectiveMainloop::ElementB;
+  using StrideB   = typename CollectiveMainloop::StrideB;
+  using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy;
+  using ElementAccumulator = typename CollectiveMainloop::ElementAccumulator;
+  using ClusterShape = typename DispatchPolicy::ClusterShape;
+  using MainloopArguments = typename CollectiveMainloop::Arguments;
+  using MainloopParams = typename CollectiveMainloop::Params;
+  static_assert(ArchTag::kMinComputeCapability >= 90);  // the mainloop's arch tag must be compute capability 90 or newer
+
+  // Epilogue derived types
+  using CollectiveEpilogue = CollectiveEpilogue_;
+  using ElementC = typename CollectiveEpilogue::ElementC;
+  using StrideC  = typename CollectiveEpilogue::StrideC;
+  using ElementD = typename CollectiveEpilogue::ElementD;
+  using StrideD  = typename CollectiveEpilogue::StrideD;
+  using EpilogueArguments = typename CollectiveEpilogue::Arguments;
+  using EpilogueParams = typename CollectiveEpilogue::Params;
+
+  static_assert(cute::is_void_v<TileScheduler_> or cute::is_same_v<TileScheduler_, PersistentScheduler>,
+    "Non-persistent warp-specialized kernel does not support specializing the tile scheduler.");
+  using TileSchedulerTag = TileScheduler_;
+  using TileScheduler = typename detail::TileSchedulerSelector<
+    TileScheduler_, ArchTag, TileShape, ClusterShape>::Scheduler;
+  using TileSchedulerArguments = typename TileScheduler::Arguments;
+
+  // Kernel level shared memory storage: mainloop and epilogue tensor buffers overlap in a union, pipeline storage is a separate member
+  struct SharedStorage {
+    union TensorStorage {
+      using MainloopTensorStorage = typename CollectiveMainloop::TensorStorage;
+      using EpilogueTensorStorage = typename CollectiveEpilogue::TensorStorage;
+
+      MainloopTensorStorage mainloop;
+      EpilogueTensorStorage epilogue;
+    } tensors;
+
+    struct PipelineStorage : cute::aligned_struct<16, _1> {
+      using MainloopPipelineStorage = typename CollectiveMainloop::PipelineStorage;
+      using EpiLoadPipelineStorage = typename CollectiveEpilogue::PipelineStorage;
+
+      alignas(16) MainloopPipelineStorage mainloop;
+      alignas(16) EpiLoadPipelineStorage epi_load;
+    } pipelines;
+  };
+
+  static constexpr int SharedStorageSize = sizeof(SharedStorage);
+
+  using GmemTiledCopyA = typename CollectiveMainloop::GmemTiledCopyA;
+  using GmemTiledCopyB = typename CollectiveMainloop::GmemTiledCopyB;
+  static_assert(cute::size(GmemTiledCopyA{}) == cute::size(GmemTiledCopyB{}), "Number of threads in A/B tiled copies must be the same.");
+
+  static constexpr uint32_t NumLoadWarpGroups = cute::size(GmemTiledCopyA{}) / NumThreadsPerWarpGroup;  // threads of the A/B tiled copy, counted in warp groups
+  static constexpr uint32_t NumMmaWarpGroups = cute::size(TiledMma{}) / NumThreadsPerWarpGroup;         // threads of the tiled MMA, counted in warp groups
+  static constexpr uint32_t NumWarpGroups = NumLoadWarpGroups + NumMmaWarpGroups;
+  static_assert(NumWarpGroups == 2 || NumWarpGroups == 3, "Number of warp groups must be 2 or 3 for good performance.");
+
+  static constexpr uint32_t MaxThreadsPerBlock = NumWarpGroups * NumThreadsPerWarpGroup;
+  static constexpr uint32_t MinBlocksPerMultiprocessor = 1;
+
+  // Device side arguments
+  struct Arguments {
+    GemmUniversalMode mode{};
+    ProblemShape problem_shape{};
+    MainloopArguments mainloop{};
+    EpilogueArguments epilogue{};
+    KernelHardwareInfo hw_info{};
+    TileSchedulerArguments scheduler{};
+  };
+
+  // Kernel entry point API
+  struct Params {
+    GemmUniversalMode mode{};
+    ProblemShape problem_shape{};
+    MainloopParams mainloop{};
+    EpilogueParams epilogue{};
+  };
+
+  //
+  // Methods
+  //
+
+  // Convert Arguments to kernel Params. The stored problem shape has M/N swapped for SwapAB mainloops; the collectives still receive args.problem_shape.
+  static
+  Params
+  to_underlying_arguments(Arguments const& args, void* workspace) {
+    (void) workspace;
+    auto problem_shape = args.problem_shape;
+    if constexpr (detail::Has_SwapAB_v<CollectiveMainloop>) {
+      // swap M/N
+      get<0>(problem_shape) = get<1>(args.problem_shape);
+      get<1>(problem_shape) = get<0>(args.problem_shape);
+    }
+    return {
+      args.mode,
+      problem_shape,
+      CollectiveMainloop::to_underlying_arguments(args.problem_shape, args.mainloop, workspace),
+      CollectiveEpilogue::to_underlying_arguments(args.problem_shape, args.epilogue, workspace)
+    };
+  }
+
+  static bool
+  can_implement(Arguments const& args) {
+    bool implementable = (args.mode == GemmUniversalMode::kGemm) or
+        (args.mode == GemmUniversalMode::kBatched && cute::rank(ProblemShape{}) == 4);
+    if (!implementable) {
+      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Arguments or Problem Shape don't meet the requirements.\n");
+      return implementable;
+    }
+    implementable &= CollectiveMainloop::can_implement(args.problem_shape, args.mainloop);
+    implementable &= CollectiveEpilogue::can_implement(args.problem_shape, args.epilogue);
+    implementable &= TileScheduler::can_implement(args.scheduler);
+
+    return implementable;
+  }
+
+  static
+  size_t
+  get_workspace_size(Arguments const& args) {
+    return 0;  // this kernel variant reports a zero-byte workspace requirement
+  }
+
+  static
+  cutlass::Status
+  initialize_workspace(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    return Status::kSuccess;  // nothing is initialized here; all parameters are unused
+  }
+
+  // Computes the kernel launch grid shape based on runtime parameters
+  static dim3
+  get_grid_shape(Params const& params) {
+    auto cluster_shape = Shape<_1,_1,_1>{};  // a unit cluster shape (not ClusterShape) is passed to the scheduler here
+    auto tile_shape = TileShape{};
+    auto problem_shape_MNKL = append<4>(params.problem_shape, Int<1>{});
+    return TileScheduler::get_tiled_cta_shape_mnl(
+        problem_shape_MNKL, tile_shape, cluster_shape);
+  }
+
+  static dim3
+  get_block_shape() {
+    return dim3(MaxThreadsPerBlock, 1, 1);
+  }
+
+  CUTLASS_DEVICE
+  void
+  operator()(Params const& params, char* smem_buf) {
+    using namespace cute;
+    using X = Underscore;
+
+// Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a.
+#if ! defined(__CUDA_ARCH_FEAT_SM90_ALL)
+    printf("ERROR : Arch conditional MMA instruction used without targeting sm90a compute capability. Aborting.\n");  // the rest of the body is compiled out (#else branch)
+#else
+
+    enum class WarpGroupRole {
+      Producer = 0,
+      Consumer = 1,
+    };
+
+    // Kernel level shared memory storage
+    SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(smem_buf);
+
+    int thread_idx = int(threadIdx.x);
+    int warp_group_thread_idx = thread_idx % NumThreadsPerWarpGroup;
+    int warp_group_idx = canonical_warp_group_idx();
+    CUTLASS_ASSERT(warp_group_idx < NumWarpGroups);
+    WarpGroupRole warp_group_role = warp_group_idx < NumLoadWarpGroups ? WarpGroupRole::Producer : WarpGroupRole::Consumer;
+
+    // Mainloop Load pipeline
+    using MainloopPipeline = typename CollectiveMainloop::MainloopPipeline;
+    typename MainloopPipeline::Params mainloop_pipeline_params;
+    if (warp_group_role == WarpGroupRole::Producer) {
+      mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Producer;
+    }
+    if (warp_group_role == WarpGroupRole::Consumer) {
+      mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Consumer;
+    }
+    mainloop_pipeline_params.producer_arv_count = NumLoadWarpGroups * NumThreadsPerWarpGroup;
+    mainloop_pipeline_params.consumer_arv_count = NumMmaWarpGroups * NumThreadsPerWarpGroup;
+    MainloopPipeline mainloop_pipeline(shared_storage.pipelines.mainloop, mainloop_pipeline_params);
+
+    // Epilogue Load pipeline
+    using EpiLoadPipeline = typename CollectiveEpilogue::LoadPipeline;
+    typename EpiLoadPipeline::Params epi_load_pipeline_params;
+    if (warp_group_role == WarpGroupRole::Producer) {
+      epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Producer;
+    }
+    if (warp_group_role == WarpGroupRole::Consumer) {
+      epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Consumer;
+    }
+    epi_load_pipeline_params.producer_arv_count = NumLoadWarpGroups * NumThreadsPerWarpGroup;
+    epi_load_pipeline_params.consumer_arv_count = NumMmaWarpGroups * NumThreadsPerWarpGroup;
+    EpiLoadPipeline epi_load_pipeline(shared_storage.pipelines.epi_load, epi_load_pipeline_params);
+
+    // Epilogue Store pipeline
+    using EpiStorePipeline = typename CollectiveEpilogue::StorePipeline;
+    typename EpiStorePipeline::Params epi_store_pipeline_params;
+    epi_store_pipeline_params.always_wait = true;
+    EpiStorePipeline epi_store_pipeline(epi_store_pipeline_params);
+
+    // Initialize starting pipeline states for the collectives
+    // Epilogue store pipe is producer-only (consumer is TMA unit, waits via scoreboarding)
+    typename CollectiveMainloop::PipelineState mainloop_pipe_consumer_state;
+    typename CollectiveEpilogue::LoadPipelineState epi_load_pipe_consumer_state;
+
+    // For the DMA Load (producer) we start with an opposite phase
+    // i.e., we skip all waits since we know that the buffer is indeed empty
+    PipelineState mainloop_pipe_producer_state = cutlass::make_producer_start_state<MainloopPipeline>();
+    PipelineState epi_load_pipe_producer_state = cutlass::make_producer_start_state<EpiLoadPipeline>();
+    PipelineState epi_store_pipe_producer_state = cutlass::make_producer_start_state<EpiStorePipeline>();
+
+    // Preconditions
+    static_assert(cute::rank(StrideA{}) == 3, "StrideA must be rank-3: [M, K, L]. If batch mode is not needed, set L stride to Int<0>.");
+    static_assert(cute::rank(StrideB{}) == 3, "StrideB must be rank-3: [N, K, L]. If batch mode is not needed, set L stride to Int<0>.");
+    static_assert(cute::rank(StrideC{}) == 3, "StrideC must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
+    static_assert(cute::rank(StrideD{}) == 3, "StrideD must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
+
+    // Separate out problem shape for convenience
+    // Optionally append 1s until problem shape is rank-4 in case it is only rank-3 (MNK)
+    auto problem_shape_MNKL = append<4>(params.problem_shape, Int<1>{});
+    auto M = get<0>(problem_shape_MNKL);
+    auto N = get<1>(problem_shape_MNKL);
+    auto K = get<2>(problem_shape_MNKL);
+    auto L = get<3>(problem_shape_MNKL);
+
+    // Represent the full tensors
+    Tensor mA_mkl = make_tensor(make_gmem_ptr(params.mainloop.ptr_A), make_shape(M,K,L), params.mainloop.dA); //(m,k,l)
+    Tensor mB_nkl = make_tensor(make_gmem_ptr(params.mainloop.ptr_B), make_shape(N,K,L), params.mainloop.dB); //(n,k,l)
+
+    // Get the appropriate blocks for this thread block -- potential for thread block locality
+    auto blk_shape = TileShape{};                                                                // (BLK_M,BLK_N,BLK_K)
+    TiledMma tiled_mma;
+
+    // Make tiled views, defer the slice
+    Tensor gA_mkl = local_tile(mA_mkl, blk_shape, make_coord(_,_,_), Step<_1, X,_1>{});          // (BLK_M,BLK_K,m,k,l)
+    Tensor gB_nkl = local_tile(mB_nkl, blk_shape, make_coord(_,_,_), Step< X,_1,_1>{});          // (BLK_N,BLK_K,n,k,l)
+
+    // Compute m_coord, n_coord, and l_coord with their post-tiled shapes (blockIdx.x -> m tile, blockIdx.y -> n tile, blockIdx.z -> l)
+    auto m_coord = idx2crd(int(blockIdx.x), shape<2>(gA_mkl));
+    auto n_coord = idx2crd(int(blockIdx.y), shape<2>(gB_nkl));
+    auto l_coord = idx2crd(int(blockIdx.z), shape<4>(gB_nkl));
+    auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
+
+    // Slice with m_coord and n_coord
+    Tensor gA = gA_mkl(_,_,m_coord,_,l_coord);                                                       // (BLK_M,BLK_K,k)
+    Tensor gB = gB_nkl(_,_,n_coord,_,l_coord);                                                       // (BLK_N,BLK_K,k)
+
+    // Get pipeline iterators and increments from tensor shapes
+    auto k_tile_iter  = cute::make_coord_iterator(shape<2>(gA));
+    auto k_tile_count = size<2>(gA);
+    auto c_tile_count = CollectiveEpilogue::get_load_pipe_increment(blk_shape);
+    auto d_tile_count = CollectiveEpilogue::get_store_pipe_increment(blk_shape);
+
+    // Wait for all threads in the thread block
+    __syncthreads();
+
+    // In a warp specialized kernel, collectives expose data movement and compute operations separately
+    CollectiveMainloop collective_mainloop;
+    CollectiveEpilogue collective_epilogue{params.epilogue, shared_storage.tensors.epilogue};
+
+    if (warp_group_role == WarpGroupRole::Producer) {  // load warp group(s): mainloop loads, then epilogue loads if the epilogue asks for them
+      // Compute tile residues for predication
+      auto m_max_coord = M - size<0>(gA) * get<0>(blk_coord);                             // M - BLK_M * m_coord
+      auto n_max_coord = N - size<0>(gB) * get<1>(blk_coord);                             // N - BLK_N * n_coord
+      auto k_residue   = K - size<1>(gA) * size<2>(gA);                                   // K - BLK_K * k_coord_max
+      auto residue_mnk = make_tuple(m_max_coord, n_max_coord, k_residue);
+
+      collective_mainloop.load(
+        mainloop_pipeline,
+        mainloop_pipe_producer_state,
+        gA,
+        gB,
+        k_tile_iter, k_tile_count,
+        residue_mnk,
+        thread_idx,
+        shared_storage.tensors.mainloop
+      );
+      // Update starting mainloop pipeline state for the pipeline drain
+      mainloop_pipe_producer_state.advance(k_tile_count);
+      // Make sure mainloop consumer has been waited upon before issuing epilogue load
+      collective_mainloop.load_tail(mainloop_pipeline, mainloop_pipe_producer_state);
+
+      if (collective_epilogue.is_producer_load_needed()) {
+        epi_load_pipe_producer_state =
+        collective_epilogue.load(
+          epi_load_pipeline,
+          epi_load_pipe_producer_state,
+          problem_shape_MNKL,
+          blk_shape,
+          blk_coord,
+          tiled_mma,
+          thread_idx,
+          shared_storage.tensors.epilogue
+        );
+        collective_epilogue.load_tail(epi_load_pipeline, epi_load_pipe_producer_state);
+      }
+    }
+    else if (warp_group_role == WarpGroupRole::Consumer) {  // MMA warp group(s): mainloop math, then the epilogue store
+      Tensor accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape));                 // (MMA,MMA_M,MMA_N)
+
+      collective_mainloop.mma(
+        mainloop_pipeline,
+        mainloop_pipe_consumer_state,
+        accumulators,
+        k_tile_count,
+        warp_group_thread_idx,
+        shared_storage.tensors.mainloop,
+        params.mainloop
+      );
+
+      // Make sure the math instructions are done and free buffers before entering the epilogue
+      collective_mainloop.mma_tail(
+        mainloop_pipeline,
+        mainloop_pipe_consumer_state,
+        k_tile_count
+      );
+
+      // Epilogue and write to gD
+      collective_epilogue.store(
+        epi_load_pipeline,
+        epi_load_pipe_consumer_state,
+        epi_store_pipeline,
+        epi_store_pipe_producer_state,
+        problem_shape_MNKL,
+        blk_shape,
+        blk_coord,
+        accumulators,
+        tiled_mma,
+        warp_group_thread_idx,
+        shared_storage.tensors.epilogue
+      );
+    }
+#endif
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::gemm::kernel
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized_cooperative.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized_cooperative.hpp
new file mode 100755
index 000000000..041745206
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized_cooperative.hpp
@@ -0,0 +1,504 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/kernel_hardware_info.hpp"
+#include "cute/arch/cluster_sm90.hpp"
+#include "cutlass/arch/reg_reconfig.h"
+#include "cutlass/arch/mma_sm90.h"
+#include "cutlass/epilogue/collective/detail.hpp"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/kernel/tile_scheduler.hpp"
+#include "cutlass/pipeline/pipeline.hpp"
+#include "cute/tensor.hpp"
+
+///////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::kernel {
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <
+  class ProblemShape_,
+  class CollectiveMainloop_,
+  class CollectiveEpilogue_,
+  class TileScheduler_
+>
+class GemmUniversal<
+  ProblemShape_,
+  CollectiveMainloop_,
+  CollectiveEpilogue_,
+  TileScheduler_,
+  cute::enable_if_t<cute::is_base_of_v<KernelCpAsyncWarpSpecializedCooperative, typename CollectiveMainloop_::DispatchPolicy::Schedule>>>
+{
+public:
+  //
+  // Type Aliases
+  //
+  using ProblemShape = ProblemShape_;
+  static_assert(cute::rank(ProblemShape{}) == 3 or cute::rank(ProblemShape{}) == 4,
+    "ProblemShape{} should be <M,N,K> or <M,N,K,L>");
+  static constexpr bool IsGdcEnabled = false;
+  // Mainloop derived types
+  using CollectiveMainloop = CollectiveMainloop_;
+  using TileShape = typename CollectiveMainloop::TileShape;
+  using TiledMma  = typename CollectiveMainloop::TiledMma;
+  using ArchTag   = typename CollectiveMainloop::ArchTag;
+  using ElementA  = typename CollectiveMainloop::ElementA;
+  using StrideA   = typename CollectiveMainloop::StrideA;
+  using ElementB  = typename CollectiveMainloop::ElementB;
+  using StrideB   = typename CollectiveMainloop::StrideB;
+  using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy;
+  using ElementAccumulator = typename CollectiveMainloop::ElementAccumulator;
+  using ClusterShape = typename DispatchPolicy::ClusterShape;
+  using MainloopArguments = typename CollectiveMainloop::Arguments;
+  using MainloopParams = typename CollectiveMainloop::Params;
+  static_assert(ArchTag::kMinComputeCapability >= 90);
+
+  // Epilogue derived types
+  using CollectiveEpilogue = CollectiveEpilogue_;
+  using ElementC = typename CollectiveEpilogue::ElementC;
+  using StrideC  = typename CollectiveEpilogue::StrideC;
+  using ElementD = typename CollectiveEpilogue::ElementD;
+  using StrideD  = typename CollectiveEpilogue::StrideD;
+  using EpilogueArguments = typename CollectiveEpilogue::Arguments;
+  using EpilogueParams = typename CollectiveEpilogue::Params;
+
+  using TileSchedulerTag = TileScheduler_;
+  using TileScheduler = typename detail::TileSchedulerSelector<
+    TileScheduler_, ArchTag, TileShape, ClusterShape>::Scheduler;
+  using TileSchedulerArguments = typename TileScheduler::Arguments;
+  using TileSchedulerParams = typename TileScheduler::Params;
+
+  using GmemTiledCopyA = typename CollectiveMainloop::GmemTiledCopyA;
+  using GmemTiledCopyB = typename CollectiveMainloop::GmemTiledCopyB;
+  static_assert(cute::size(GmemTiledCopyA{}) == cute::size(GmemTiledCopyB{}), "Number of threads in A/B tiled copies must be the same");
+
+  static constexpr uint32_t NumLoadWarpGroups = cute::size(GmemTiledCopyA{}) / NumThreadsPerWarpGroup;
+  static constexpr uint32_t NumMmaWarpGroups = cute::size(TiledMma{}) / NumThreadsPerWarpGroup;
+  static constexpr uint32_t NumWarpGroups = NumLoadWarpGroups + NumMmaWarpGroups;
+  static_assert(NumWarpGroups == 2 || NumWarpGroups == 3, "Number of warp groups must be 2 or 3 for good performance.");
+
+  static constexpr uint32_t MaxThreadsPerBlock = NumWarpGroups * NumThreadsPerWarpGroup;
+  static constexpr uint32_t MinBlocksPerMultiprocessor = 1;
+
+  // Kernel level shared memory storage; operator() reinterprets its smem_buf argument as this struct
+  struct SharedStorage {
+    struct TensorStorage : cute::aligned_struct<128, _1> {  // a struct, not a union: mainloop and epilogue buffers are distinct members
+      using MainloopTensorStorage = typename CollectiveMainloop::TensorStorage;
+      using EpilogueTensorStorage = typename CollectiveEpilogue::TensorStorage;
+
+      MainloopTensorStorage mainloop;
+      EpilogueTensorStorage epilogue;
+    } tensors;
+
+    struct PipelineStorage : cute::aligned_struct<16, _1> {
+      using MainloopPipelineStorage = typename CollectiveMainloop::PipelineStorage;
+      using EpiLoadPipelineStorage = typename CollectiveEpilogue::PipelineStorage;
+
+      alignas(16) MainloopPipelineStorage mainloop;  // each pipeline's storage is individually 16-byte aligned
+      alignas(16) EpiLoadPipelineStorage epi_load;
+    } pipelines;
+  };
+
+  static constexpr int SharedStorageSize = sizeof(SharedStorage);
+
+  // Device side arguments (converted into Params by to_underlying_arguments)
+  struct Arguments {
+    GemmUniversalMode mode{};            // can_implement() accepts kGemm, or kBatched with a rank-4 problem shape
+    ProblemShape problem_shape{};        // <M,N,K> or <M,N,K,L>
+    MainloopArguments mainloop{};
+    EpilogueArguments epilogue{};
+    KernelHardwareInfo hw_info{};        // sm_count <= 0 makes to_underlying_arguments() query the device for it
+    TileSchedulerArguments scheduler{};
+  };
+
+  // Kernel entry point API (produced by to_underlying_arguments, consumed by operator())
+  struct Params {
+    GemmUniversalMode mode{};
+    ProblemShape problem_shape{};        // M and N are already swapped when the mainloop declares SwapAB
+    MainloopParams mainloop{};
+    EpilogueParams epilogue{};
+    KernelHardwareInfo hw_info{};        // device_id copied from Arguments; sm_count resolved by to_underlying_arguments()
+    TileSchedulerParams scheduler{};
+  };
+
+  //
+  // Methods
+  //
+
+  // Build kernel Params from Arguments: fix up the problem shape, resolve the SM count, convert scheduler and collective arguments.
+  static Params
+  to_underlying_arguments(Arguments const& args, void* workspace) {
+    CUTLASS_TRACE_HOST("to_underlying_arguments():");
+
+    // Shape recorded in Params: M and N trade places when the mainloop declares SwapAB.
+    auto kernel_shape = args.problem_shape;
+    if constexpr (detail::Has_SwapAB_v<CollectiveMainloop>) {
+      get<0>(kernel_shape) = get<1>(args.problem_shape);
+      get<1>(kernel_shape) = get<0>(args.problem_shape);
+    }
+    auto kernel_shape_MNKL = append<4>(kernel_shape, 1);
+
+    // Prefer the caller-provided SM count; query the device only when it is missing (<= 0).
+    int num_sms = args.hw_info.sm_count;
+    if (num_sms <= 0) {
+      CUTLASS_TRACE_HOST("  WARNING: Arguments do not include a valid SM count.\n"
+          "  For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count.");
+      num_sms = KernelHardwareInfo::query_device_multiprocessor_count(args.hw_info.device_id);
+    }
+
+    CUTLASS_TRACE_HOST("to_underlying_arguments(): Setting persistent grid SM count to " << num_sms);
+
+    KernelHardwareInfo hw_info{args.hw_info.device_id, num_sms};
+    TileSchedulerParams scheduler_params = TileScheduler::to_underlying_arguments(
+      kernel_shape_MNKL, TileShape{}, ClusterShape{}, hw_info, args.scheduler, workspace);
+
+    // Both collectives are handed the caller's original (unswapped) problem shape.
+    return {
+      args.mode,
+      kernel_shape,
+      CollectiveMainloop::to_underlying_arguments(args.problem_shape, args.mainloop, workspace),
+      CollectiveEpilogue::to_underlying_arguments(args.problem_shape, args.epilogue, workspace),
+      hw_info,
+      scheduler_params
+    };
+  }
+
+  static bool can_implement(Arguments const& args) {
+    // Plain GEMM mode is always accepted; batched mode additionally needs a rank-4 <M,N,K,L> problem shape.
+    bool const is_gemm    = args.mode == GemmUniversalMode::kGemm;
+    bool const is_batched = args.mode == GemmUniversalMode::kBatched && cute::rank(ProblemShape{}) == 4;
+    if (not (is_gemm or is_batched)) {
+      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Arguments or Problem Shape don't meet the requirements.\n");
+      return false;
+    }
+    // Every component is queried (no short-circuiting), so each one always runs its own checks.
+    bool const mainloop_ok  = CollectiveMainloop::can_implement(args.problem_shape, args.mainloop);
+    bool const epilogue_ok  = CollectiveEpilogue::can_implement(args.problem_shape, args.epilogue);
+    bool const scheduler_ok = TileScheduler::can_implement(args.scheduler);
+    return mainloop_ok and epilogue_ok and scheduler_ok;
+  }
+
+  static size_t
+  get_workspace_size(Arguments const& args) {
+    // The workspace size is whatever the tile scheduler reports for these arguments and NumMmaWarpGroups.
+    TileScheduler tile_scheduler;
+    return tile_scheduler.template get_workspace_size<ProblemShape, ElementAccumulator>(
+      args.scheduler, args.problem_shape, args.hw_info, NumMmaWarpGroups);
+  }
+
+  static cutlass::Status
+  initialize_workspace(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    // Workspace setup is delegated to the tile scheduler, with one epilogue subtile and one accumulator matrix.
+    static constexpr uint32_t kEpilogueSubTiles = 1;
+    static constexpr uint32_t kAccumulatorMtxs = 1;
+    TileScheduler tile_scheduler;
+    return tile_scheduler.template initialize_workspace<ProblemShape, ElementAccumulator>(
+      args.scheduler, workspace, stream, args.problem_shape, args.hw_info, NumMmaWarpGroups, kEpilogueSubTiles, kAccumulatorMtxs, cuda_adapter);
+  }
+
+  // Computes the launch grid by delegating to the tile scheduler, with scheduler arguments rebuilt from Params.
+  static dim3 get_grid_shape(Params const& params) {
+    TileSchedulerArguments scheduler_args{};
+    // max_swizzle_size is restored as 2^log_swizzle_size_ unless the scheduler declares that member const.
+    if constexpr (not std::is_const_v<decltype(scheduler_args.max_swizzle_size)>) {
+      scheduler_args.max_swizzle_size = 1 << params.scheduler.log_swizzle_size_;
+    }
+    return TileScheduler::get_grid_shape(
+      params.scheduler, params.problem_shape, TileShape{}, ClusterShape{}, params.hw_info, scheduler_args);
+  }
+
+  static dim3 get_block_shape() {
+    dim3 block_dims(MaxThreadsPerBlock, 1, 1);  // 1-D block sized for all load and MMA warp groups
+    return block_dims;
+  }
+
+  CUTLASS_DEVICE
+  void
+  operator()(Params const& params, char* smem_buf) {
+    using namespace cute;
+    using X = Underscore;
+
+// Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a.
+#if ! defined(__CUDA_ARCH_FEAT_SM90_ALL)
+    printf("ERROR : Arch conditional MMA instruction used without targeting sm90a compute capability. Aborting.\n");
+#else
+
+    static_assert(cute::rank(StrideA{}) == 3, "StrideA must be rank-3: [M, K, L]. If batch mode is not needed, set L stride to Int<0>.");
+    static_assert(cute::rank(StrideB{}) == 3, "StrideB must be rank-3: [N, K, L]. If batch mode is not needed, set L stride to Int<0>.");
+    static_assert(cute::rank(StrideC{}) == 3, "StrideC must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
+    static_assert(cute::rank(StrideD{}) == 3, "StrideD must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
+
+    /* In the Cooperative kernel, one or multiple Consumers collaborate on the same tile */
+    enum class WarpGroupRole {
+      Producer = 0,
+      Consumer = 1,
+    };
+
+    // Kernel level shared memory storage
+    SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(smem_buf);
+
+    int thread_idx = int(threadIdx.x);
+    int mma_thread_idx = thread_idx % size(TiledMma{});
+    int warp_group_thread_idx = thread_idx % NumThreadsPerWarpGroup;
+    int warp_group_idx = canonical_warp_group_idx();
+    CUTLASS_ASSERT(warp_group_idx < NumWarpGroups);
+    WarpGroupRole warp_group_role = warp_group_idx < NumLoadWarpGroups ? WarpGroupRole::Producer : WarpGroupRole::Consumer;
+
+    // Mainloop Load pipeline
+    using MainloopPipeline = typename CollectiveMainloop::MainloopPipeline;
+    typename MainloopPipeline::Params mainloop_pipeline_params;
+    if (warp_group_role == WarpGroupRole::Producer) {
+      mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Producer;
+    }
+    if (warp_group_role == WarpGroupRole::Consumer) {
+      mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Consumer;
+    }
+    mainloop_pipeline_params.producer_arv_count = NumLoadWarpGroups * NumThreadsPerWarpGroup;
+    mainloop_pipeline_params.consumer_arv_count = NumMmaWarpGroups * NumThreadsPerWarpGroup;
+    MainloopPipeline mainloop_pipeline(shared_storage.pipelines.mainloop, mainloop_pipeline_params);
+
+    // Epilogue Load pipeline
+    using EpiLoadPipeline = typename CollectiveEpilogue::LoadPipeline;
+    typename EpiLoadPipeline::Params epi_load_pipeline_params;
+    if (warp_group_role == WarpGroupRole::Producer) {
+      epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Producer;
+    }
+    if (warp_group_role == WarpGroupRole::Consumer) {
+      epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Consumer;
+    }
+    epi_load_pipeline_params.producer_arv_count = NumLoadWarpGroups * NumThreadsPerWarpGroup;
+    epi_load_pipeline_params.consumer_arv_count = NumMmaWarpGroups * NumThreadsPerWarpGroup;
+    EpiLoadPipeline epi_load_pipeline(shared_storage.pipelines.epi_load, epi_load_pipeline_params);
+
+    // Epilogue Store pipeline
+    using EpiStorePipeline = typename CollectiveEpilogue::StorePipeline;
+    typename EpiStorePipeline::Params epi_store_pipeline_params;
+    epi_store_pipeline_params.always_wait = true;
+    EpiStorePipeline epi_store_pipeline(epi_store_pipeline_params);
+
+    // Initialize starting pipeline states for the collectives
+    // Epilogue store pipe is producer-only (consumer is TMA unit, waits via scoreboarding)
+    typename CollectiveMainloop::PipelineState mainloop_pipe_consumer_state;
+    typename CollectiveEpilogue::LoadPipelineState epi_load_pipe_consumer_state;
+
+    // For the DMA Load (producer) we start with an opposite phase
+    // i.e., we skip all waits since we know that the buffer is indeed empty
+    PipelineState mainloop_pipe_producer_state = cutlass::make_producer_start_state<MainloopPipeline>();
+    PipelineState epi_load_pipe_producer_state = cutlass::make_producer_start_state<EpiLoadPipeline>();
+    PipelineState epi_store_pipe_producer_state = cutlass::make_producer_start_state<EpiStorePipeline>();
+
+    // Separate out problem shape for convenience
+    // Optionally append 1s until problem shape is rank-4 in case it is only rank-3 (MNK)
+    auto problem_shape_MNKL = append<4>(params.problem_shape, Int<1>{});
+    auto M = get<0>(problem_shape_MNKL);
+    auto N = get<1>(problem_shape_MNKL);
+    auto K = get<2>(problem_shape_MNKL);
+    auto L = get<3>(problem_shape_MNKL);
+
+    // Represent the full tensors
+    Tensor mA_mkl = make_tensor(make_gmem_ptr(params.mainloop.ptr_A), make_shape(M,K,L), params.mainloop.dA); //(m,k,l)
+    Tensor mB_nkl = make_tensor(make_gmem_ptr(params.mainloop.ptr_B), make_shape(N,K,L), params.mainloop.dB); //(n,k,l)
+
+    // Get the appropriate blocks for this thread block -- potential for thread block locality
+    TiledMma tiled_mma;
+    auto blk_shape = TileShape{};                                                                // (BLK_M,BLK_N,BLK_K)
+
+    // Make tiled views, defer the slice
+    Tensor gA_mkl = local_tile(mA_mkl, blk_shape, make_coord(_,_,_), Step<_1, X,_1>{});          // (BLK_M,BLK_K,m,k,l)
+    Tensor gB_nkl = local_tile(mB_nkl, blk_shape, make_coord(_,_,_), Step< X,_1,_1>{});          // (BLK_N,BLK_K,n,k,l)
+
+    TileScheduler scheduler{params.scheduler};
+    auto work_tile_info = scheduler.initial_work_tile_info(ClusterShape{});
+
+    // In a warp specialized kernel, collectives expose data movement and compute operations separately
+    CollectiveMainloop collective_mainloop;
+    CollectiveEpilogue collective_epilogue{params.epilogue, shared_storage.tensors.epilogue};
+
+    // Wait for all threads in the thread block
+    __syncthreads();
+
+    if (warp_group_role == WarpGroupRole::Producer) {
+
+      while (work_tile_info.is_valid()) {
+        // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape
+        auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl));
+        auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl));
+        auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl));
+        auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
+
+        // Slice with our work tile coordinates to construct mainloop tensor views
+        Tensor gA = gA_mkl(_,_,m_coord,_,l_coord);                                                   // (BLK_M,BLK_K,k)
+        Tensor gB = gB_nkl(_,_,n_coord,_,l_coord);                                                   // (BLK_N,BLK_K,k)
+
+        // Get the number of K tiles to compute for this work as well as the starting K tile offset of the work.
+        auto work_k_tile_count = TileScheduler::get_work_k_tile_count(work_tile_info, problem_shape_MNKL, blk_shape);
+        auto work_k_tile_start = TileScheduler::get_work_k_tile_start(work_tile_info);
+        auto k_tile_iter = cute::make_coord_iterator(idx2crd(work_k_tile_start, shape<2>(gA)), shape<2>(gA));
+
+        // Compute tile residues for predication
+        auto m_max_coord = M - size<0>(gA) * get<0>(blk_coord);                             // M - BLK_M * m_coord
+        auto n_max_coord = N - size<0>(gB) * get<1>(blk_coord);                             // N - BLK_N * n_coord
+        auto k_residue   = K - size<1>(gA) * size<2>(gA);                                   // K - BLK_K * k_coord_max
+        auto residue_mnk = make_tuple(m_max_coord, n_max_coord, k_residue);
+
+        collective_mainloop.load(
+          mainloop_pipeline,
+          mainloop_pipe_producer_state,
+          gA,
+          gB,
+          k_tile_iter, work_k_tile_count,
+          residue_mnk,
+          thread_idx,
+          shared_storage.tensors.mainloop
+        );
+        // Update starting pipeline state for the next tile
+        mainloop_pipe_producer_state.advance(work_k_tile_count);
+
+        if (TileScheduler::compute_epilogue(work_tile_info, params.scheduler) &&
+           collective_epilogue.is_producer_load_needed()) {
+          epi_load_pipe_producer_state =
+          collective_epilogue.load(
+            epi_load_pipeline,
+            epi_load_pipe_producer_state,
+            problem_shape_MNKL,
+            blk_shape,
+            blk_coord,
+            tiled_mma,
+            warp_group_thread_idx,
+            shared_storage.tensors.epilogue
+          );
+      }
+
+        // Get next work tile
+        auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work(work_tile_info);
+        work_tile_info = next_work_tile_info;
+      } // Scheduler work fetch loop
+
+      // Make sure all Consumer Warp Groups have been waited upon
+      collective_mainloop.load_tail(mainloop_pipeline, mainloop_pipe_producer_state);
+      
+      if (collective_epilogue.is_producer_load_needed()) {
+        collective_epilogue.load_tail(epi_load_pipeline, epi_load_pipe_producer_state);
+      }
+    } // Producer Warp Group End
+
+    else if (warp_group_role == WarpGroupRole::Consumer) {
+
+      bool do_store_tail = false;
+      while (work_tile_info.is_valid()) {
+        // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape
+        auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl));
+        auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl));
+        auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl));
+        auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
+        auto work_k_tile_count = TileScheduler::get_work_k_tile_count(work_tile_info, problem_shape_MNKL, blk_shape);
+
+        // Allocate the accumulators for the (M,N) blk_shape
+        //
+        // MSVC CTAD breaks if we say "Tensor" here, so we use "auto" instead.
+        auto accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape));               // (MMA,MMA_M,MMA_N)
+
+        collective_mainloop.mma(
+          mainloop_pipeline,
+          mainloop_pipe_consumer_state,
+          accumulators,
+          work_k_tile_count,
+          mma_thread_idx,
+          shared_storage.tensors.mainloop,
+          params.mainloop
+        );
+
+        // Make sure the math instructions are done and free buffers before entering the epilogue
+        collective_mainloop.mma_tail(
+          mainloop_pipeline,
+          mainloop_pipe_consumer_state,
+          work_k_tile_count
+        );
+
+        // Update starting mainloop pipeline state for the next tile
+        mainloop_pipe_consumer_state.advance(work_k_tile_count);
+
+        // Index of warp group within consumer warp groups
+        int consumer_warp_group_idx = canonical_warp_group_idx() - NumLoadWarpGroups;
+
+        // Perform reduction across splits, if needed
+        TileScheduler::fixup(
+          params.scheduler, work_tile_info, accumulators, NumMmaWarpGroups, consumer_warp_group_idx);
+
+        if (TileScheduler::compute_epilogue(work_tile_info, params.scheduler)) {
+          // Epilogue and write to gD
+          auto [epi_load_pipe_consumer_state_next, epi_store_pipe_producer_state_next] =
+          collective_epilogue.store(
+            epi_load_pipeline,
+            epi_load_pipe_consumer_state,
+            epi_store_pipeline,
+            epi_store_pipe_producer_state,
+            problem_shape_MNKL,
+            blk_shape,
+            blk_coord,
+            accumulators,
+            tiled_mma,
+            mma_thread_idx,
+            shared_storage.tensors.epilogue
+          );
+          epi_load_pipe_consumer_state = epi_load_pipe_consumer_state_next;
+          epi_store_pipe_producer_state = epi_store_pipe_producer_state_next;
+          do_store_tail = true;
+        }
+
+        // Get next work tile
+        auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work(work_tile_info);
+        work_tile_info = next_work_tile_info;
+      } // Scheduler work fetch loop
+
+      if (do_store_tail) {
+        collective_epilogue.store_tail(
+          epi_load_pipeline,
+          epi_load_pipe_consumer_state,
+          epi_store_pipeline,
+          epi_store_pipe_producer_state
+        );
+      }
+    } // Consumer Warp Groups End
+#endif
+  }
+
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::gemm::kernel
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized_pingpong.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized_pingpong.hpp
new file mode 100755
index 000000000..142fabd2f
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized_pingpong.hpp
@@ -0,0 +1,516 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/kernel_hardware_info.hpp"
+#include "cutlass/fast_math.h"
+#include "cute/arch/cluster_sm90.hpp"
+#include "cutlass/arch/reg_reconfig.h"
+#include "cutlass/arch/mma_sm90.h"
+#include "cutlass/epilogue/collective/detail.hpp"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/kernel/tile_scheduler.hpp"
+#include "cutlass/gemm/kernel/gemm_universal_decl.h"
+#include "cutlass/pipeline/pipeline.hpp"
+#include "cutlass/trace.h"
+
+#include "cute/tensor.hpp"
+///////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::kernel {
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <
+  class ProblemShape_,
+  class CollectiveMainloop_,
+  class CollectiveEpilogue_,
+  class TileScheduler_
+>
+class GemmUniversal<
+  ProblemShape_,
+  CollectiveMainloop_,
+  CollectiveEpilogue_,
+  TileScheduler_,
+  cute::enable_if_t<cute::is_base_of_v<KernelCpAsyncWarpSpecializedPingpong, typename CollectiveMainloop_::DispatchPolicy::Schedule>>>
+{
+public:
+  // GemmUniversal specialization selected when the mainloop's dispatch-policy schedule derives from
+  // KernelCpAsyncWarpSpecializedPingpong.
+  // Type Aliases (mostly re-exported from the mainloop and epilogue collectives)
+  using ProblemShape = ProblemShape_;
+  static_assert(cute::rank(ProblemShape{}) == 3 or cute::rank(ProblemShape{}) == 4,
+    "ProblemShape{} should be <M,N,K> or <M,N,K,L>");
+  static constexpr bool IsGdcEnabled = false;
+  // Mainloop derived types
+  using CollectiveMainloop = CollectiveMainloop_;
+  using TileShape = typename CollectiveMainloop::TileShape;
+  using TiledMma  = typename CollectiveMainloop::TiledMma;
+  using ArchTag   = typename CollectiveMainloop::ArchTag;
+  using ElementA  = typename CollectiveMainloop::ElementA;
+  using StrideA   = typename CollectiveMainloop::StrideA;
+  using ElementB  = typename CollectiveMainloop::ElementB;
+  using StrideB   = typename CollectiveMainloop::StrideB;
+  using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy;
+  using ElementAccumulator = typename CollectiveMainloop::ElementAccumulator;
+  using ClusterShape = typename DispatchPolicy::ClusterShape;
+  using MainloopArguments = typename CollectiveMainloop::Arguments;
+  using MainloopParams = typename CollectiveMainloop::Params;
+  static_assert(ArchTag::kMinComputeCapability >= 90);  // the mainloop's arch tag must declare compute capability 90 or newer
+
+  // Epilogue derived types
+  using CollectiveEpilogue = CollectiveEpilogue_;
+  using ElementC = typename CollectiveEpilogue::ElementC;
+  using StrideC  = typename CollectiveEpilogue::StrideC;
+  using ElementD = typename CollectiveEpilogue::ElementD;
+  using StrideD  = typename CollectiveEpilogue::StrideD;
+  using EpilogueArguments = typename CollectiveEpilogue::Arguments;
+  using EpilogueParams = typename CollectiveEpilogue::Params;
+
+  static_assert(!cute::is_same_v<TileScheduler_, StreamKScheduler>, "Ping-pong kernel does not currently support stream-K scheduler.");
+  using TileSchedulerTag = TileScheduler_;  // the tag as supplied by the user; the concrete scheduler type is resolved just below
+  using TileScheduler = typename detail::TileSchedulerSelector<
+    TileScheduler_, ArchTag, TileShape, ClusterShape>::Scheduler;
+  using TileSchedulerArguments = typename TileScheduler::Arguments;
+  using TileSchedulerParams = typename TileScheduler::Params;
+
+  using GmemTiledCopyA = typename CollectiveMainloop::GmemTiledCopyA;
+  using GmemTiledCopyB = typename CollectiveMainloop::GmemTiledCopyB;
+  static_assert(cute::size(GmemTiledCopyA{}) == cute::size(GmemTiledCopyB{}), "Number of threads in A/B tiled copies must be the same");
+
+  static constexpr uint32_t NumLoadWarpGroups = cute::size(GmemTiledCopyA{}) / NumThreadsPerWarpGroup;  // producer groups: copy threads / warp-group size
+  static constexpr uint32_t NumMmaWarpGroups = 2 * cute::size(TiledMma{}) / NumThreadsPerWarpGroup;     // TiledMma threads counted twice; asserted == 2 below
+  static constexpr uint32_t NumWarpGroups = NumLoadWarpGroups + NumMmaWarpGroups;
+  static_assert(NumWarpGroups == 2 || NumWarpGroups == 3, "Number of warp groups must be 2 or 3 for good performance.");
+  static_assert(NumMmaWarpGroups == 2, "Pingpong kernel requires 2 MMA warp groups.");
+
+  static constexpr uint32_t MaxThreadsPerBlock = NumWarpGroups * NumThreadsPerWarpGroup;  // also the block size returned by get_block_shape()
+  static constexpr uint32_t MinBlocksPerMultiprocessor = 1;
+
+  // Order Sequence barrier with two stages: one for Mainloop and one for Epilogue
+  static constexpr uint32_t StagesPerMathWarpGroup = 2;
+  using MathWarpGroupOrderBarrier = cutlass::OrderedSequenceBarrier<
+    StagesPerMathWarpGroup, NumMmaWarpGroups>;
+
+  // Kernel level shared memory storage: tensor staging buffers plus pipeline/barrier state
+  struct SharedStorage {
+    struct TensorStorage : cute::aligned_struct<128, _1> {  // operand/result buffers owned by the two collectives
+      using MainloopTensorStorage = typename CollectiveMainloop::TensorStorage;
+      using EpilogueTensorStorage = typename CollectiveEpilogue::TensorStorage;
+
+      MainloopTensorStorage mainloop;
+      EpilogueTensorStorage epilogue;
+    } tensors;
+
+    struct PipelineStorage : cute::aligned_struct<16, _1> {  // synchronization state used by operator()
+      using MainloopPipelineStorage = typename CollectiveMainloop::PipelineStorage;
+      using EpiLoadPipelineStorage = typename CollectiveEpilogue::PipelineStorage;
+      using MathWarpGroupOrderBarrierStorage = typename MathWarpGroupOrderBarrier::SharedStorage;
+
+      alignas(16) MainloopPipelineStorage mainloop;                // backs the mainloop load pipeline
+      alignas(16) EpiLoadPipelineStorage epi_load;                 // backs the epilogue load pipeline
+      alignas(16) MathWarpGroupOrderBarrierStorage math_wg_order;  // backs the ordered barrier between the two Math WGs
+    } pipelines;
+  };
+
+  static constexpr int SharedStorageSize = sizeof(SharedStorage);  // bytes of shared memory occupied by SharedStorage
+
+  // User-facing arguments; converted into Params by to_underlying_arguments()
+  struct Arguments {
+    GemmUniversalMode mode{};            // can_implement() accepts kGemm, or kBatched with a rank-4 problem shape
+    ProblemShape problem_shape{};        // <M,N,K> or <M,N,K,L>
+    MainloopArguments mainloop{};        // forwarded to CollectiveMainloop
+    EpilogueArguments epilogue{};        // forwarded to CollectiveEpilogue
+    KernelHardwareInfo hw_info{};        // device id and SM count; an SM count <= 0 is queried from the device
+    TileSchedulerArguments scheduler{};  // forwarded to TileScheduler
+  };
+
+  // Kernel entry point API: the parameter pack consumed by operator(), built by to_underlying_arguments()
+  struct Params {
+    GemmUniversalMode mode{};
+    ProblemShape problem_shape{};     // M and N are already swapped here when the mainloop requests SwapAB
+    MainloopParams mainloop{};        // result of CollectiveMainloop::to_underlying_arguments
+    EpilogueParams epilogue{};        // result of CollectiveEpilogue::to_underlying_arguments
+    KernelHardwareInfo hw_info{};     // device id plus the resolved SM count
+    TileSchedulerParams scheduler{};  // result of TileScheduler::to_underlying_arguments
+  };
+
+  //
+  // Methods
+  //
+
+  // Convert user Arguments into kernel Params: optionally swaps M/N, resolves the SM count, and converts collective/scheduler arguments.
+  static
+  Params
+  to_underlying_arguments(Arguments const& args, void* workspace) {
+    CUTLASS_TRACE_HOST("to_underlying_arguments():");
+
+    (void) workspace;  // note: workspace is nevertheless forwarded to the callees below
+    auto problem_shape = args.problem_shape;
+    if constexpr (detail::Has_SwapAB_v<CollectiveMainloop>) {
+      // swap M/N in the local copy only; args.problem_shape itself is left untouched
+      get<0>(problem_shape) = get<1>(args.problem_shape);
+      get<1>(problem_shape) = get<0>(args.problem_shape);
+    }
+    auto problem_shape_MNKL = append<4>(problem_shape, 1);  // pad to rank 4 (L = 1) when the shape is only <M,N,K>
+
+    // Get SM count if needed, otherwise use user supplied SM count
+    int sm_count = args.hw_info.sm_count;
+    if (sm_count <= 0) {
+      CUTLASS_TRACE_HOST("  WARNING: Arguments do not include a valid SM count.\n"
+          "  For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count.");
+      sm_count = KernelHardwareInfo::query_device_multiprocessor_count(args.hw_info.device_id);
+    }
+
+    CUTLASS_TRACE_HOST("to_underlying_arguments(): Setting persistent grid SM count to " << sm_count);
+
+    KernelHardwareInfo hw_info{args.hw_info.device_id, sm_count};
+    TileSchedulerParams scheduler = TileScheduler::to_underlying_arguments(
+      problem_shape_MNKL, TileShape{}, ClusterShape{}, hw_info, args.scheduler, workspace);
+
+    return {
+      args.mode,
+      problem_shape,  // the (possibly M/N-swapped) local copy
+      CollectiveMainloop::to_underlying_arguments(args.problem_shape, args.mainloop, workspace),  // NOTE: receives the caller's unswapped shape
+      CollectiveEpilogue::to_underlying_arguments(args.problem_shape, args.epilogue, workspace),  // NOTE: receives the caller's unswapped shape
+      hw_info,
+      scheduler
+    };
+  }
+
+  static bool  // Returns true when this kernel can run the given arguments.
+  can_implement(Arguments const& args) {
+    bool implementable = (args.mode == GemmUniversalMode::kGemm) or
+        (args.mode == GemmUniversalMode::kBatched && cute::rank(ProblemShape{}) == 4);  // batched mode needs an explicit L extent
+    if (!implementable) {
+      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Arguments or Problem Shape don't meet the requirements.\n");
+      return implementable;  // early out: the component checks below are skipped
+    }
+    implementable &= CollectiveMainloop::can_implement(args.problem_shape, args.mainloop);  // all three checks always run (no short-circuit)
+    implementable &= CollectiveEpilogue::can_implement(args.problem_shape, args.epilogue);
+    implementable &= TileScheduler::can_implement(args.scheduler);
+
+    return implementable;
+  }
+
+  static  // Reports the workspace size in bytes required for the given arguments.
+  size_t
+  get_workspace_size(Arguments const& args) {  // args is not inspected
+    return 0;  // this kernel never requests a workspace
+  }
+
+  static  // Workspace initialization hook; a no-op here, and none of the parameters are used.
+  cutlass::Status
+  initialize_workspace(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    return Status::kSuccess;  // unconditionally succeeds
+  }
+
+  // Computes the kernel launch grid shape based on runtime parameters, delegating to TileScheduler::get_grid_shape
+  static dim3
+  get_grid_shape(Params const& params) {
+    // Given device SM count, set grid size s.t. we do not launch more thread blocks than we can run concurrently
+    TileSchedulerArguments args{};  // default-constructed; only max_swizzle_size is (conditionally) filled in
+    if constexpr (!std::is_const_v<decltype(args.max_swizzle_size)>) {  // assign only if the scheduler's field is writable
+      args.max_swizzle_size = 1 << params.scheduler.log_swizzle_size_;  // 2^log_swizzle_size_
+    }
+    return TileScheduler::get_grid_shape(params.scheduler, params.problem_shape, TileShape{}, ClusterShape{}, params.hw_info, args);
+  }
+
+  static dim3  // Returns the launch block shape: a 1-D block of MaxThreadsPerBlock threads.
+  get_block_shape() {
+    return dim3(MaxThreadsPerBlock, 1, 1);  // MaxThreadsPerBlock = NumWarpGroups * NumThreadsPerWarpGroup
+  }
+
+  CUTLASS_DEVICE
+  void
+  operator()(Params const& params, char* smem_buf) {  // kernel body; smem_buf is reinterpreted as SharedStorage below
+    using namespace cute;
+    using X = Underscore;
+
+// Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a; without that feature macro the body only prints an error.
+#if ! defined(__CUDA_ARCH_FEAT_SM90_ALL)
+    printf("ERROR : Arch conditional MMA instruction used without targeting sm90a compute capability. Aborting.\n");
+#else
+
+    // Preconditions: all four stride types must be rank-3 (the third mode is the batch mode L)
+    static_assert(cute::rank(StrideA{}) == 3, "StrideA must be rank-3: [M, K, L]. If batch mode is not needed, set L stride to Int<0>.");
+    static_assert(cute::rank(StrideB{}) == 3, "StrideB must be rank-3: [N, K, L]. If batch mode is not needed, set L stride to Int<0>.");
+    static_assert(cute::rank(StrideC{}) == 3, "StrideC must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
+    static_assert(cute::rank(StrideD{}) == 3, "StrideD must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
+
+    enum class WarpGroupRole {  // role of the calling thread's warp group
+      Producer = 0,
+      Consumer = 1,
+    };
+
+    // Kernel level shared memory storage
+    SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(smem_buf);
+
+    int thread_idx = int(threadIdx.x);
+    int warp_group_thread_idx = thread_idx % NumThreadsPerWarpGroup;  // thread index within its own warp group
+    int warp_group_idx = canonical_warp_group_idx();
+    CUTLASS_ASSERT(warp_group_idx < NumWarpGroups);
+    WarpGroupRole warp_group_role = warp_group_idx < NumLoadWarpGroups ? WarpGroupRole::Producer : WarpGroupRole::Consumer;  // first NumLoadWarpGroups groups load
+    int warp_group_consumer_idx = warp_group_idx - NumLoadWarpGroups;  // index among the consumer groups; only meaningful for consumers
+
+    // Mainloop Load pipeline
+    using MainloopPipeline = typename CollectiveMainloop::MainloopPipeline;
+    typename MainloopPipeline::Params mainloop_pipeline_params;
+    if (warp_group_role == WarpGroupRole::Producer) {
+      mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Producer;
+    }
+    if (warp_group_role == WarpGroupRole::Consumer) {
+      mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Consumer;
+    }
+    mainloop_pipeline_params.producer_arv_count = NumLoadWarpGroups * NumThreadsPerWarpGroup;  // = total number of producer threads
+    mainloop_pipeline_params.consumer_arv_count = NumThreadsPerWarpGroup; // only 1 WG consumes at a time
+    MainloopPipeline mainloop_pipeline(shared_storage.pipelines.mainloop, mainloop_pipeline_params);
+
+    // Epilogue Load pipeline (configured the same way as the mainloop pipeline above)
+    using EpiLoadPipeline = typename CollectiveEpilogue::LoadPipeline;
+    typename EpiLoadPipeline::Params epi_load_pipeline_params;
+    if (warp_group_role == WarpGroupRole::Producer) {
+      epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Producer;
+    }
+    if (warp_group_role == WarpGroupRole::Consumer) {
+      epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Consumer;
+    }
+    epi_load_pipeline_params.producer_arv_count = NumLoadWarpGroups * NumThreadsPerWarpGroup;  // = total number of producer threads
+    epi_load_pipeline_params.consumer_arv_count = NumThreadsPerWarpGroup; // only 1 WG consumes at a time
+    EpiLoadPipeline epi_load_pipeline(shared_storage.pipelines.epi_load, epi_load_pipeline_params);
+
+    // Epilogue Store pipeline (constructed from params only; it takes no shared-storage argument)
+    using EpiStorePipeline = typename CollectiveEpilogue::StorePipeline;
+    typename EpiStorePipeline::Params epi_store_pipeline_params;
+    epi_store_pipeline_params.always_wait = true;
+    EpiStorePipeline epi_store_pipeline(epi_store_pipeline_params);
+
+    typename MathWarpGroupOrderBarrier::Params params_math_wg_order_barrier;
+    // DMA Load WG will not participate in these Ordered Barrier syncs
+    params_math_wg_order_barrier.group_id = warp_group_consumer_idx;
+    params_math_wg_order_barrier.group_size = NumThreadsPerWarpGroup; // Number of threads / participants in a group
+    MathWarpGroupOrderBarrier math_wg_order_barrier(shared_storage.pipelines.math_wg_order, params_math_wg_order_barrier);
+
+    // Initialize starting pipeline states for the collectives
+    // Epilogue store pipe is producer-only (consumer is TMA unit, waits via scoreboarding)
+    typename CollectiveMainloop::PipelineState mainloop_pipe_consumer_state;
+    typename CollectiveEpilogue::LoadPipelineState epi_load_pipe_consumer_state;
+
+    // For the DMA Load (producer) we start with an opposite phase
+    // i.e., we skip all waits since we know that the buffer is indeed empty
+    PipelineState mainloop_pipe_producer_state = cutlass::make_producer_start_state<MainloopPipeline>();
+    PipelineState epi_load_pipe_producer_state = cutlass::make_producer_start_state<EpiLoadPipeline>();
+    PipelineState epi_store_pipe_producer_state = cutlass::make_producer_start_state<EpiStorePipeline>();
+
+    // Separate out problem shape for convenience
+    // Optionally append 1s until problem shape is rank-4 in case it is only rank-3 (MNK)
+    auto problem_shape_MNKL = append<4>(params.problem_shape, Int<1>{});
+    auto M = get<0>(problem_shape_MNKL);
+    auto N = get<1>(problem_shape_MNKL);
+    auto K = get<2>(problem_shape_MNKL);
+    auto L = get<3>(problem_shape_MNKL);
+
+    // Represent the full tensors
+    Tensor mA_mkl = make_tensor(make_gmem_ptr(params.mainloop.ptr_A), make_shape(M,K,L), params.mainloop.dA); //(m,k,l)
+    Tensor mB_nkl = make_tensor(make_gmem_ptr(params.mainloop.ptr_B), make_shape(N,K,L), params.mainloop.dB); //(n,k,l)
+
+    // Get the appropriate blocks for this thread block -- potential for thread block locality
+    TiledMma tiled_mma;
+    auto blk_shape = TileShape{};                                                                // (BLK_M,BLK_N,BLK_K)
+
+    // Make tiled views, defer the slice
+    Tensor gA_mkl = local_tile(mA_mkl, blk_shape, make_coord(_,_,_), Step<_1, X,_1>{});          // (BLK_M,BLK_K,m,k,l)
+    Tensor gB_nkl = local_tile(mB_nkl, blk_shape, make_coord(_,_,_), Step< X,_1,_1>{});          // (BLK_N,BLK_K,n,k,l)
+
+    // Get pipeline stage increments from tensor shapes
+    auto k_tile_count = size<3>(gA_mkl);  // extent of the k mode of the tiled A view
+    auto c_tile_count = CollectiveEpilogue::get_load_pipe_increment(blk_shape);
+    auto d_tile_count = CollectiveEpilogue::get_store_pipe_increment(blk_shape);
+
+    TileScheduler scheduler{params.scheduler};
+
+    if (warp_group_consumer_idx == 1) {
+      // Advance 2nd Math WG to the next work tile for the startup
+      scheduler.advance_to_next_work();
+      // Advance 2nd Math WG pipeline states to the end of 1st Math WG
+      mainloop_pipe_consumer_state.advance(k_tile_count);
+      epi_load_pipe_consumer_state.advance(c_tile_count);
+      epi_store_pipe_producer_state.advance(d_tile_count);
+    }
+    auto work_tile_info = scheduler.initial_work_tile_info(ClusterShape{});
+
+    // In a warp specialized kernel, collectives expose data movement and compute operations separately
+    CollectiveMainloop collective_mainloop;
+    CollectiveEpilogue collective_epilogue{params.epilogue, shared_storage.tensors.epilogue};
+
+    // Wait for all threads in the thread block
+    __syncthreads();
+
+    if (warp_group_role == WarpGroupRole::Producer) {
+
+      while (work_tile_info.is_valid()) {
+        // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape
+        auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl));
+        auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl));
+        auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl));
+        auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
+
+        // Slice with our work tile coordinates to construct mainloop tensor views
+        Tensor gA = gA_mkl(_,_,m_coord,_,l_coord);                                                   // (BLK_M,BLK_K,k)
+        Tensor gB = gB_nkl(_,_,n_coord,_,l_coord);                                                   // (BLK_N,BLK_K,k)
+
+        auto k_tile_iter  = cute::make_coord_iterator(shape<2>(gA));  // coordinate iterator over the k mode of gA
+
+        // Compute tile residues for predication
+        auto m_max_coord = M - size<0>(gA) * get<0>(blk_coord);                             // M - BLK_M * m_coord
+        auto n_max_coord = N - size<0>(gB) * get<1>(blk_coord);                             // N - BLK_N * n_coord
+        auto k_residue   = K - size<1>(gA) * size<2>(gA);                                   // K - BLK_K * k_coord_max
+        auto residue_mnk = make_tuple(m_max_coord, n_max_coord, k_residue);
+
+        collective_mainloop.load(
+          mainloop_pipeline,
+          mainloop_pipe_producer_state,
+          gA,
+          gB,
+          k_tile_iter, k_tile_count,
+          residue_mnk,
+          thread_idx,
+          shared_storage.tensors.mainloop
+        );
+        // Update starting pipeline state for the next tile
+        mainloop_pipe_producer_state.advance(k_tile_count);
+
+        if (collective_epilogue.is_producer_load_needed()) {
+          collective_epilogue.load(
+            epi_load_pipeline,
+            epi_load_pipe_producer_state,
+            problem_shape_MNKL,
+            blk_shape,
+            blk_coord,
+            tiled_mma,
+            warp_group_thread_idx,
+            shared_storage.tensors.epilogue
+          );
+          // Update starting pipeline state for the next tile
+          epi_load_pipe_producer_state.advance(c_tile_count);
+        }
+
+        // Get next work tile (the producer advances one tile at a time)
+        scheduler.advance_to_next_work();
+        work_tile_info = scheduler.get_current_work();
+      } // Scheduler work fetch loop
+
+      // Make sure all Consumer Warp Groups have been waited upon
+      collective_mainloop.load_tail(mainloop_pipeline, mainloop_pipe_producer_state);
+      if (collective_epilogue.is_producer_load_needed()) {
+        collective_epilogue.load_tail(epi_load_pipeline, epi_load_pipe_producer_state);
+      }
+    } // Producer Warp Group End
+
+    else if (warp_group_role == WarpGroupRole::Consumer) {
+
+      while (work_tile_info.is_valid()) {
+        // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape
+        auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl));
+        auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl));
+        auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl));
+        auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
+
+        // Allocate the accumulators for the (M,N) blk_shape
+        Tensor accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape));               // (MMA,MMA_M,MMA_N)
+
+        // Order two Math WG's MMA one after the other, helps hide Epilogue
+        math_wg_order_barrier.wait();
+
+        collective_mainloop.mma(
+          mainloop_pipeline,
+          mainloop_pipe_consumer_state,
+          accumulators,
+          k_tile_count,
+          thread_idx,  // NOTE(review): raw block thread index here, whereas store() below receives warp_group_thread_idx -- confirm intended
+          shared_storage.tensors.mainloop,
+          params.mainloop
+        );
+
+        // Cue for next Math WG's MMA to start
+        math_wg_order_barrier.arrive();
+
+        // Make sure the math instructions are done and free buffers before entering the epilogue
+        collective_mainloop.mma_tail(
+          mainloop_pipeline,
+          mainloop_pipe_consumer_state,
+          k_tile_count
+        );
+        // Update starting mainloop pipeline state for the next tile (advances by NumMmaWarpGroups tiles' worth of k stages)
+        mainloop_pipe_consumer_state.advance(k_tile_count * NumMmaWarpGroups);
+
+        // Order two Math WG's Epilogue one after the other
+        math_wg_order_barrier.wait();
+
+        // Epilogue and write to gD
+        collective_epilogue.store(
+          epi_load_pipeline,
+          epi_load_pipe_consumer_state,
+          epi_store_pipeline,
+          epi_store_pipe_producer_state,
+          problem_shape_MNKL,
+          blk_shape,
+          blk_coord,
+          accumulators,
+          tiled_mma,
+          warp_group_thread_idx,
+          shared_storage.tensors.epilogue
+        );
+        // Update starting load/store pipeline states for the next tile
+        epi_load_pipe_consumer_state.advance(c_tile_count * NumMmaWarpGroups);
+        epi_store_pipe_producer_state.advance(d_tile_count * NumMmaWarpGroups);
+
+        // Wait for all TMA stores to complete
+        epi_store_pipeline.producer_tail(epi_store_pipe_producer_state);
+
+        // Cue for next Math WG's Epilogue to start
+        math_wg_order_barrier.arrive();
+
+        // Get next work tile (each Math WG advances by NumMmaWarpGroups work units per iteration)
+        scheduler.advance_to_next_work(NumMmaWarpGroups);
+        work_tile_info = scheduler.get_current_work();
+      } // Scheduler work fetch loop
+    } // Consumer Warp Groups End
+#endif
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::gemm::kernel
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_tile_scheduler.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_tile_scheduler.hpp
new file mode 100755
index 000000000..5e61e7c99
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_tile_scheduler.hpp
@@ -0,0 +1,139 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+#include "cutlass/gemm/kernel/static_tile_scheduler.hpp"
+
+
+namespace cutlass::gemm::kernel::detail {
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Persistent Thread Block (TB) scheduler; derives (CRTP) from StaticPersistentTileScheduler and adds the swizzled tile lookup
+class PersistentTileSchedulerSm90:
+public StaticPersistentTileScheduler<PersistentTileSchedulerSm90> {
+
+  using BaseScheduler = StaticPersistentTileScheduler<PersistentTileSchedulerSm90>;
+public:
+  using StaticPersistentTileScheduler::StaticPersistentTileScheduler; // inherit the base-class constructors
+  using Params = PersistentTileSchedulerSm90Params;
+  using RasterOrder = typename Params::RasterOrder;
+  using RasterOrderOptions = typename Params::RasterOrderOptions;
+  using Arguments = BaseScheduler::Arguments;
+
+  static constexpr bool IsDynamicPersistent = false;
+
+  // get work_idx_m, work_idx_n from blk_per_grid_dim while applying swizzle; CTA-in-cluster coords come from block_id_in_cluster()
+  static CUTLASS_DEVICE
+  cute::tuple<int32_t, int32_t>
+  get_work_idx_m_and_n(
+      uint64_t blk_per_grid_dim,
+      FastDivmodU64Pow2 const& divmod_cluster_shape_major,
+      FastDivmodU64Pow2 const& divmod_cluster_shape_minor,
+      FastDivmodU64 const& divmod_cluster_blk_major,
+      int32_t log_swizzle_size,
+      RasterOrder raster_order) {
+    auto [cta_m_in_cluster, cta_n_in_cluster, _] = cute::block_id_in_cluster(); // third component is unused
+    return get_work_idx_m_and_n(
+      blk_per_grid_dim,
+      divmod_cluster_shape_major,
+      divmod_cluster_shape_minor,
+      divmod_cluster_blk_major,
+      log_swizzle_size,
+      raster_order,
+      cta_m_in_cluster,
+      cta_n_in_cluster
+    );
+  }
+  // Same mapping with the CTA's position inside its cluster passed in explicitly; returns {work_idx_m, work_idx_n}.
+  static CUTLASS_DEVICE
+  cute::tuple<int32_t, int32_t>
+  get_work_idx_m_and_n(
+      uint64_t blk_per_grid_dim,
+      FastDivmodU64Pow2 const& divmod_cluster_shape_major,
+      FastDivmodU64Pow2 const& divmod_cluster_shape_minor,
+      FastDivmodU64 const& divmod_cluster_blk_major,
+      int32_t log_swizzle_size,
+      RasterOrder raster_order,
+      uint64_t cta_m_in_cluster,
+      uint64_t cta_n_in_cluster) {
+    // Divide blk_per_grid_dim by the major cluster extent; outputs (presumably quotient, remainder) are cluster_id and cluster_major_offset.
+    uint64_t cluster_id, cluster_major_offset = 0, cluster_minor_offset = 0;
+    divmod_cluster_shape_major(cluster_id, cluster_major_offset, blk_per_grid_dim);
+    // The offset along the minor mode is this CTA's coordinate inside its cluster (m for AlongN, n otherwise).
+    if (raster_order == RasterOrder::AlongN) {
+      cluster_minor_offset = cta_m_in_cluster;
+    }
+    else {
+      cluster_minor_offset = cta_n_in_cluster;
+    }
+
+    uint64_t cluster_idx_minor, cluster_idx_major;
+
+    uint64_t cluster_idx_minor_div_swizzle, extra, offset;
+    // Swizzle split: the low log_swizzle_size bits of cluster_id become 'offset', the remaining high bits become 'extra'.
+    offset = cluster_id & ((1 << log_swizzle_size) - 1);
+    extra = cluster_id >> log_swizzle_size;
+
+    divmod_cluster_blk_major(cluster_idx_minor_div_swizzle, cluster_idx_major, extra);
+    // Re-insert the swizzle offset below the minor index produced by the divmod above.
+    cluster_idx_minor = cluster_idx_minor_div_swizzle * (1 << log_swizzle_size) + offset;
+    // Scale cluster indices by the cluster extents and add the in-cluster offsets (result narrowed to int32_t).
+    auto minor_work_idx = static_cast<int32_t>(cluster_idx_minor * divmod_cluster_shape_minor.divisor +
+                                               cluster_minor_offset);
+    auto major_work_idx = static_cast<int32_t>(cluster_idx_major * divmod_cluster_shape_major.divisor +
+                                               cluster_major_offset);
+    // The tuple is always ordered (m, n): with AlongN the minor mode is m, otherwise the minor mode is n.
+    if (raster_order == RasterOrder::AlongN) {
+      return {minor_work_idx, major_work_idx};
+    }
+    else {
+      return {major_work_idx, minor_work_idx};
+    }
+
+  }
+
+  // The basic tile scheduler does not require any additional workspace (ElementAccumulator is not deducible; callers must name it)
+  template <class ProblemShape, class ElementAccumulator>
+  static size_t
+  get_workspace_size(Arguments const&, ProblemShape, KernelHardwareInfo const&, uint32_t, const uint32_t = 1, uint32_t = 1) {
+    return 0;
+  }
+  // Nothing to initialize because no workspace is requested: every argument is ignored and kSuccess is returned.
+  template <class ProblemShape, class ElementAccumulator>
+  static cutlass::Status
+  initialize_workspace(Arguments const&, void*, cudaStream_t, ProblemShape, KernelHardwareInfo const&,
+    uint32_t, const uint32_t = 1, uint32_t = 1, CudaHostAdapter* cuda_adapter = nullptr) {
+    return Status::kSuccess;
+  }
+
+};
+
+}
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_tile_scheduler_group.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_tile_scheduler_group.hpp
new file mode 100755
index 000000000..888be276d
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_tile_scheduler_group.hpp
@@ -0,0 +1,510 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/fast_math.h"
+#include "cutlass/gemm_coord.hpp"
+#include "cutlass/kernel_hardware_info.hpp"
+#include "cutlass/gemm/kernel/tile_scheduler_params.h"
+#include "cute/layout.hpp"
+#include "cute/tensor.hpp"
+#include "cute/arch/cluster_sm90.hpp"
+
+namespace cutlass::gemm::kernel::detail {
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Persistent Thread Block (TB) scheduler
+template <class GroupProblemShape>
+class PersistentTileSchedulerSm90Group {
+  //
+  // Data members
+  //
+
+private:
+  uint64_t current_work_linear_idx_ = 0;
+  uint64_t total_grid_size_ = 0;
+
+  // Tracking current group, its starting linear idx and total tiles (updated in place by get_work_idx_m_and_n)
+  struct GroupInfo {
+    int group_idx = 0;              // index used to select the entry of the problem-shape array
+    uint64_t start_linear_idx = 0;  // linear tile index at which the current group begins
+    uint64_t total_tiles = 0;       // problem_blocks_m * problem_blocks_n of the current group (after round-up)
+  } current_group_info_;
+
+public:
+  struct WorkTileInfo {
+    int32_t M_idx = 0;
+    int32_t N_idx = 0;
+    int32_t L_idx = 0;           // get_work_idx_m_and_n stores the group index here
+    bool is_valid_tile = false;
+    // Reports the stored validity flag.
+    CUTLASS_HOST_DEVICE
+    bool
+    is_valid() const {
+      return is_valid_tile;
+    }
+    // Sentinel value: all three indices are -1 and is_valid_tile is false.
+    CUTLASS_HOST_DEVICE
+    static WorkTileInfo
+    invalid_work_tile() {
+      return {-1, -1, -1, false};
+    }
+    // Always true; the argument is ignored.
+    CUTLASS_HOST_DEVICE
+    bool
+    is_final_split(uint32_t k_tiles_per_output_tile) const {
+      return true;
+    }
+    // Always -1 for this scheduler.
+    CUTLASS_HOST_DEVICE
+    int32_t
+    reduction_subtile_idx() const {
+      return -1;
+    }
+  };
+
+  using ProblemShape = typename GroupProblemShape::UnderlyingProblemShape;
+  using Params = PersistentTileSchedulerSm90GroupParams<ProblemShape>;
+  using RasterOrder = typename Params::RasterOrder;
+  using RasterOrderOptions = typename Params::RasterOrderOptions;
+  static constexpr bool IsDynamicPersistent = false;
+
+  struct Arguments {
+    int max_swizzle_size = 1; // forwarded to Params::initialize and Params::get_grid_shape
+    // Not applying Heuristics for Grouped problems, since largest dimension can change per group
+    RasterOrderOptions raster_order = RasterOrderOptions::AlongM;
+  };
+
+  // Sink scheduler params as a member
+  Params scheduler_params;
+
+  //
+  // Methods
+  //
+
+  template <class TileShape, class ClusterShape>
+  static Params
+  to_underlying_arguments(
+    GroupProblemShape problem_shapes,
+    TileShape tile_shape,
+    ClusterShape cluster_shape,
+    KernelHardwareInfo const& hw_info,
+    Arguments const& arguments,
+    [[maybe_unused]] void* workspace=nullptr,
+    [[maybe_unused]] const uint32_t epilogue_subtile = 1,
+    [[maybe_unused]] uint32_t ktile_start_alignment_count = 1u
+    ) {
+    // Builds the scheduler Params from the host-side arguments; the three [[maybe_unused]] parameters are not read.
+    // We only need the tile and cluster shape during scheduler setup, so let FTAD (function template argument deduction) do the magic
+    static_assert(cute::is_static<TileShape>::value);
+    static_assert(cute::is_static<ClusterShape>::value);
+    // Logical CTA grid covering the tiles of all groups.
+    dim3 problem_blocks = get_tiled_cta_shape_mnl(
+      problem_shapes.groups(),
+      problem_shapes,
+      hw_info,
+      tile_shape, cluster_shape);
+
+    Params params;
+    params.initialize(
+      problem_blocks,
+      problem_shapes.groups(),
+      problem_shapes.problem_shapes,
+      problem_shapes.host_problem_shapes,
+      to_gemm_coord(tile_shape),
+      to_gemm_coord(cluster_shape),
+      hw_info,
+      arguments.max_swizzle_size, // swizzle setting taken from Arguments
+      arguments.raster_order
+    );
+
+    return params;
+  }
+
+  // Given the inputs, computes the physical grid we should launch (params is unused; the grid is derived from the other arguments).
+  template<class TileShape, class ClusterShape>
+  CUTLASS_HOST_DEVICE static
+  dim3
+  get_grid_shape(
+    [[maybe_unused]] Params const& params,
+    GroupProblemShape problem_shapes,
+    TileShape tile_shape,
+    ClusterShape cluster_shape,
+    KernelHardwareInfo hw_info,
+    Arguments arguments,
+    bool truncate_by_problem_size=true) {
+    // NOTE(review): truncate_by_problem_size is accepted but never read; the call below always passes true.
+    dim3 problem_blocks = get_tiled_cta_shape_mnl(
+      problem_shapes.groups(),
+      problem_shapes,
+      hw_info,
+      tile_shape, cluster_shape);
+    // Delegate the actual grid sizing to the Params helper.
+    return Params::get_grid_shape(
+      problem_blocks,
+      to_gemm_coord(cluster_shape),
+      hw_info,
+      arguments.max_swizzle_size,
+      arguments.raster_order,
+      /* truncate_by_problem_size = */true
+    );
+  }
+
+  // Given the inputs, computes the total number of output blocks this problem will compute over
+  // Note that this is only the logical size of our grid, not the physical grid we will actually launch.
+  template<class BlockShape, class ClusterShape>
+  CUTLASS_HOST_DEVICE static
+  dim3
+  get_tiled_cta_shape_mnl(int groups, GroupProblemShape problem_shapes, KernelHardwareInfo hw_info, BlockShape cta_shape, ClusterShape cluster_shape) {
+    uint32_t total_ctas = 0;
+    uint32_t cta_in_N_dim = 1; // We linearize the blocks across all the problems here
+    // hw_info is only read for sm_count, and only in the fallback branch below.
+    // If host problem shapes are not provided.
+    if (!problem_shapes.is_host_problem_shape_available()) {
+      total_ctas = hw_info.sm_count;
+    }
+    // If host problem shapes are provided, make a better decision about possibility to launch smaller grid.
+    else {
+      for (int group = 0; group < groups; group++) {
+        auto ctas_along_m = cute::size(cute::ceil_div(cute::shape<0>(problem_shapes.get_host_problem_shape(group)), cute::shape<0>(cta_shape)));
+        auto ctas_along_n = cute::size(cute::ceil_div(cute::shape<1>(problem_shapes.get_host_problem_shape(group)), cute::shape<1>(cta_shape)));
+        auto problem_blocks_m = round_up(ctas_along_m, cute::get<0>(cluster_shape));
+        auto problem_blocks_n = round_up(ctas_along_n, cute::get<1>(cluster_shape));
+        total_ctas += problem_blocks_m * problem_blocks_n;
+      }
+    }
+    // NOTE(review): total_ctas accumulates in uint32_t and no overflow check is visible here.
+    return Params::get_tiled_cta_shape_mnl(
+      to_gemm_coord(cluster_shape),
+      total_ctas, cta_in_N_dim
+    );
+  }
+
+  static bool
+  can_implement(Arguments const& args) {
+    return true; // args is not inspected: no argument combination is rejected here
+  }
+
+  PersistentTileSchedulerSm90Group() = default;
+
+  CUTLASS_DEVICE explicit PersistentTileSchedulerSm90Group(Params const& params_) : scheduler_params(params_) {
+    // MSVC requires protecting use of CUDA-specific nonstandard syntax,
+    // like blockIdx and gridDim, with __CUDA_ARCH__.
+#if defined(__CUDA_ARCH__)
+    if (scheduler_params.raster_order_ == RasterOrder::AlongN) {
+      current_work_linear_idx_ = uint64_t(blockIdx.x) + uint64_t(blockIdx.y) * uint64_t(gridDim.x); // x varies fastest
+    }
+    else {
+      current_work_linear_idx_ = uint64_t(blockIdx.x) * uint64_t(gridDim.y) + uint64_t(blockIdx.y); // y varies fastest
+    }
+    // Number of CTAs in the launched grid; advance_to_next_work() uses it as the stride.
+    total_grid_size_ = uint64_t(gridDim.x) * uint64_t(gridDim.y) * uint64_t(gridDim.z);
+    // Pre-compute the tile count of group 0 (problem_shapes_[0]) with ceil-div followed by round-up.
+    uint64_t ctas_along_m, ctas_along_n;
+    if (is_tuple<decltype(cute::shape<0>(params_.problem_shapes_[0]))>::value ||
+        is_tuple<decltype(cute::shape<1>(params_.problem_shapes_[0]))>::value) {
+      ctas_along_m = cute::size(cute::ceil_div(cute::shape<0>(params_.problem_shapes_[0]), scheduler_params.cta_shape_.m()));
+      ctas_along_n = cute::size(cute::ceil_div(cute::shape<1>(params_.problem_shapes_[0]), scheduler_params.cta_shape_.n()));
+    }
+    else { // neither mode is a tuple: ceil-div through the stored divmod objects
+      ctas_along_m = scheduler_params.divmod_cta_shape_m_.divide(cute::shape<0>(params_.problem_shapes_[0]) +  scheduler_params.divmod_cta_shape_m_.divisor - 1);
+      ctas_along_n = scheduler_params.divmod_cta_shape_n_.divide(cute::shape<1>(params_.problem_shapes_[0]) +  scheduler_params.divmod_cta_shape_n_.divisor - 1);
+    }
+    auto problem_blocks_m = round_up(ctas_along_m, (1 << params_.log_swizzle_size_) * params_.cluster_shape_.m());
+    auto problem_blocks_n = round_up(ctas_along_n, (1 << params_.log_swizzle_size_) * params_.cluster_shape_.n());
+    current_group_info_.total_tiles = problem_blocks_m * problem_blocks_n;
+#else
+    CUTLASS_ASSERT(false && "This line should never be reached");
+#endif
+  }
+
+  CUTLASS_DEVICE
+  WorkTileInfo
+  get_current_work() {
+    return get_current_work_for_linear_idx(current_work_linear_idx_); // tile for this CTA's current linear index
+  }
+
+  CUTLASS_DEVICE
+  WorkTileInfo
+  get_current_work_for_linear_idx(uint64_t linear_idx) {
+    if (scheduler_params.pre_processed_problem_shapes && linear_idx >= scheduler_params.blocks_across_problem_) {
+      return WorkTileInfo::invalid_work_tile(); // bound is only checked when pre_processed_problem_shapes is set
+    }
+    // Delegate to the static mapping; current_group_info_ is passed by reference and updated while groups are walked.
+    return get_work_idx_m_and_n(linear_idx,
+                                current_group_info_,
+                                scheduler_params.groups_,
+                                scheduler_params.problem_shapes_,
+                                scheduler_params.cta_shape_,
+                                scheduler_params.cluster_shape_,
+                                scheduler_params.divmod_cluster_shape_major_,
+                                scheduler_params.divmod_cluster_shape_minor_,
+                                scheduler_params.divmod_cta_shape_m_,
+                                scheduler_params.divmod_cta_shape_n_,
+                                scheduler_params.log_swizzle_size_, // used as a shift amount by the callee
+                                scheduler_params.raster_order_);
+  }
+
+  CUTLASS_DEVICE
+  void
+  advance_to_next_work(uint32_t advance_count = 1) {
+    current_work_linear_idx_ += total_grid_size_ * uint64_t(advance_count); // step by whole grids of CTAs
+  }
+
+  // get work_idx_m, work_idx_n from linear_idx while applying swizzle; also advances group_info to the group owning linear_idx
+  static CUTLASS_DEVICE
+  WorkTileInfo
+  get_work_idx_m_and_n(
+      uint64_t linear_idx,
+      struct GroupInfo& group_info,
+      int32_t total_problem_groups,
+      ProblemShape* problem_shapes,
+      GemmCoord cta_shape,
+      GemmCoord cluster_shape,
+      FastDivmodU64Pow2 const& divmod_cluster_shape_major,
+      FastDivmodU64Pow2 const& divmod_cluster_shape_minor,
+      FastDivmodU64 const& divmod_cta_shape_m,
+      FastDivmodU64 const& divmod_cta_shape_n,
+      int32_t log_swizzle_size, // used as a shift amount: swizzle width is 1 << log_swizzle_size
+      RasterOrder raster_order) {
+    // Step 1: recompute the tile count of the group currently recorded in group_info.
+    bool valid_tile = true; // never cleared below; exhaustion is reported through invalid_work_tile()
+    uint64_t ctas_along_m, ctas_along_n;
+    if (is_tuple<decltype(cute::shape<0>(problem_shapes[group_info.group_idx]))>::value ||
+        is_tuple<decltype(cute::shape<1>(problem_shapes[group_info.group_idx]))>::value) {
+      ctas_along_m = cute::size(cute::ceil_div(cute::shape<0>(problem_shapes[group_info.group_idx]), cta_shape.m()));
+      ctas_along_n = cute::size(cute::ceil_div(cute::shape<1>(problem_shapes[group_info.group_idx]), cta_shape.n()));
+    }
+    else {
+      ctas_along_m = divmod_cta_shape_m.divide(cute::shape<0>(problem_shapes[group_info.group_idx]) +  divmod_cta_shape_m.divisor - 1);
+      ctas_along_n = divmod_cta_shape_n.divide(cute::shape<1>(problem_shapes[group_info.group_idx]) +  divmod_cta_shape_n.divisor - 1);
+    }
+    auto problem_blocks_m = round_up(ctas_along_m, (1 << log_swizzle_size) * cluster_shape.m());
+    auto problem_blocks_n = round_up(ctas_along_n, (1 << log_swizzle_size) * cluster_shape.n());
+    group_info.total_tiles = problem_blocks_m * problem_blocks_n;
+    // Step 2: walk forward through the groups until the current one contains linear_idx.
+    while (group_info.start_linear_idx + group_info.total_tiles <= linear_idx) {
+      group_info.group_idx++;
+      // Ran past the last group: there is no tile for this linear index.
+      if (group_info.group_idx >= total_problem_groups)
+        return WorkTileInfo::invalid_work_tile();
+      // NOTE(review): on the early return above group_idx stays == total_problem_groups; a later call would index problem_shapes with it.
+      group_info.start_linear_idx += group_info.total_tiles;
+      if (is_tuple<decltype(cute::shape<0>(problem_shapes[group_info.group_idx]))>::value ||
+          is_tuple<decltype(cute::shape<1>(problem_shapes[group_info.group_idx]))>::value) {
+        ctas_along_m = cute::size(cute::ceil_div(cute::shape<0>(problem_shapes[group_info.group_idx]), cta_shape.m()));
+        ctas_along_n = cute::size(cute::ceil_div(cute::shape<1>(problem_shapes[group_info.group_idx]), cta_shape.n()));
+      }
+      else {
+        ctas_along_m = divmod_cta_shape_m.divide(cute::shape<0>(problem_shapes[group_info.group_idx]) +  divmod_cta_shape_m.divisor - 1);
+        ctas_along_n = divmod_cta_shape_n.divide(cute::shape<1>(problem_shapes[group_info.group_idx]) +  divmod_cta_shape_n.divisor - 1);
+      }
+      problem_blocks_m = round_up(ctas_along_m, (1 << log_swizzle_size) * cluster_shape.m());
+      problem_blocks_n = round_up(ctas_along_n, (1 << log_swizzle_size) * cluster_shape.n());
+      group_info.total_tiles = problem_blocks_m * problem_blocks_n;
+    }
+    // Step 3: map the index local to this group (linear_idx - start_linear_idx) onto cluster coordinates.
+    uint64_t cluster_id, cluster_major_offset = 0, cluster_minor_offset = 0;
+    uint64_t blk_per_grid_dim = divmod_cluster_shape_minor.divide(linear_idx - group_info.start_linear_idx);
+    divmod_cluster_shape_major(cluster_id, cluster_major_offset, blk_per_grid_dim);
+
+    auto [cta_m_in_cluster, cta_n_in_cluster, _] = cute::block_id_in_cluster();
+    if (raster_order == RasterOrder::AlongN) {
+      cluster_minor_offset = cta_m_in_cluster;
+    }
+    else {
+      cluster_minor_offset = cta_n_in_cluster;
+    }
+
+    uint64_t cluster_idx_minor, cluster_idx_major;
+    // Temporaries for the swizzle decomposition below.
+    uint64_t cluster_idx_minor_div_swizzle, extra, offset;
+    // The low log_swizzle_size bits of cluster_id become 'offset'; the remaining high bits become 'extra'.
+    offset = cluster_id & ((1 << log_swizzle_size) - 1);
+    extra = cluster_id >> log_swizzle_size;
+    // The divisor for 'extra' is derived from the current group's own rounded-up block count along the major mode.
+    uint64_t curr_group_cluster_blk_major;
+    if (raster_order == RasterOrder::AlongN) {
+      curr_group_cluster_blk_major = divmod_cluster_shape_major.divide(problem_blocks_n);
+    }
+    else {
+      curr_group_cluster_blk_major = divmod_cluster_shape_major.divide(problem_blocks_m);
+    }
+    cluster_idx_minor_div_swizzle = extra / curr_group_cluster_blk_major;
+    cluster_idx_major = extra % curr_group_cluster_blk_major;
+
+    cluster_idx_minor = cluster_idx_minor_div_swizzle * (1 << log_swizzle_size) + offset;
+
+    auto minor_work_idx = static_cast<int32_t>(cluster_idx_minor * divmod_cluster_shape_minor.divisor + // scale by the minor cluster extent
+                                               cluster_minor_offset);
+    auto major_work_idx = static_cast<int32_t>(cluster_idx_major * divmod_cluster_shape_major.divisor + // scale by the major cluster extent
+                                               cluster_major_offset);
+    // The result is always ordered (M_idx, N_idx, L_idx = group index, validity flag).
+    if (raster_order == RasterOrder::AlongN) {
+      return {minor_work_idx, major_work_idx, group_info.group_idx, valid_tile};
+    }
+    else {
+      return {major_work_idx, minor_work_idx, group_info.group_idx, valid_tile}; // here m is the major mode
+    }
+
+  }
+
+  // Returns whether the block assigned this work should compute the epilogue for the corresponding
+  // output tile. For this scheduler, this is always true; both arguments are ignored.
+  CUTLASS_HOST_DEVICE
+  static bool
+  compute_epilogue(WorkTileInfo const&, Params const&) {
+    return true;
+  }
+
+  // Performs the reduction across splits for a given output tile. Since this scheduler does
+  // not split output tiles, no reduction is needed: the body is empty and all arguments are ignored.
+  template <class FrgTensorC>
+  CUTLASS_DEVICE
+  static void
+  fixup(Params const&, WorkTileInfo const&, FrgTensorC&, uint32_t, uint32_t) {}
+
+  // Returns whether the current WorkTileInfo passed in should continue to be used. Since
+  // this scheduler only schedules work in units of single, full output tiles, the WorkTileInfo
+  // passed in should not be used after having been processed, so the result is always false.
+  CUTLASS_DEVICE
+  static bool
+  continue_current_work(WorkTileInfo&) {
+    return false;
+  }
+
+  // This scheduler does not require any additional workspace: returns 0 (ElementAccumulator is not deducible; callers must name it)
+  template <class ProblemShape, class ElementAccumulator>
+  static size_t
+  get_workspace_size(Arguments const&, ProblemShape, KernelHardwareInfo const&, uint32_t, const uint32_t = 1, uint32_t = 1) {
+    return 0;
+  }
+
+  template <class ProblemShape, class ElementAccumulator>
+  static cutlass::Status
+  initialize_workspace(Arguments const&, void*, cudaStream_t, ProblemShape, KernelHardwareInfo const&,
+    uint32_t, const uint32_t = 1, uint32_t = 1, CudaHostAdapter* cuda_adapter = nullptr) {
+    return Status::kSuccess; // no work is done: every argument (including cuda_adapter) is ignored
+  }
+
+  template <class ProblemShape_MNKL, class TileShape>
+  CUTLASS_HOST_DEVICE
+  static int
+  get_work_k_tile_count(WorkTileInfo const& work_tile_info, ProblemShape_MNKL problem_shape, TileShape tile_shape) {
+    // All work units returned by this scheduler cover the entire K iteration
+    // space of the output tile assigned to the work unit, so work_tile_info is not consulted: ceil(K / tile K).
+    return cute::size(cute::ceil_div(cute::get<2>(problem_shape), cute::get<2>(tile_shape)));
+  }
+
+  CUTLASS_HOST_DEVICE
+  static uint32_t
+  get_work_k_tile_start(WorkTileInfo const&) {
+    // All work units returned by this scheduler start from K tile 0 (the argument is ignored)
+    return 0u;
+  }
+
+  CUTLASS_DEVICE
+  static bool
+  need_separate_reduction(Params const& params) {
+    return false; // constant answer; params is not read
+  }
+
+  CUTLASS_DEVICE
+  bool
+  is_work_tile_for_reduction(WorkTileInfo const& work_tile_info, Params const& params) {
+    return false; // constant answer; neither argument is read
+  }
+
+  CUTLASS_DEVICE
+  uint32_t
+  epilgoue_subtile_idx(WorkTileInfo const& work_tile_info, Params const& params) const {
+    return 0; // NOTE(review): method name is misspelled ("epilgoue"); renaming would break callers using this spelling
+  }
+
+  template <class FrgTensorC>
+  CUTLASS_DEVICE
+  void
+  separate_reduction(
+    Params const& params,
+    WorkTileInfo const& work_tile_info,
+    FrgTensorC& accumulators,
+    uint32_t num_barriers,
+    uint32_t barrier_idx) {
+  } // empty body: no operation is performed and no argument is read
+
+  // Interface hook for sharing the accumulator set with peers in the global workspace; empty body here, so nothing is shared
+  template <class FrgTensorC>
+  CUTLASS_DEVICE
+  static void
+  share(
+    Params const& params,
+    WorkTileInfo const& work_tile_info,
+    FrgTensorC& accumulators,
+    uint32_t num_barriers,
+    uint32_t barrier_idx) {
+  }
+
+  CUTLASS_DEVICE
+  static bool
+  valid_warpgroup_in_work_tile(WorkTileInfo const& work_tile_info) {
+    return true; // constant answer; work_tile_info is not read
+  }
+
+  CUTLASS_DEVICE
+  static bool
+  requires_separate_reduction(Params const& params) {
+    return false; // constant answer; params is not read
+  }
+
+  // Kernel helper function to get next work tile; returns a (WorkTileInfo, bool) tuple whose bool is always true here
+  CUTLASS_DEVICE
+  auto
+  fetch_next_work(WorkTileInfo work_tile_info) {
+    if (continue_current_work(work_tile_info)) { // continue_current_work() always returns false in this class
+      return cute::make_tuple(work_tile_info, true);
+    }
+
+    advance_to_next_work();
+    return cute::make_tuple(get_current_work(), true);
+  }
+  
+  // Returns the initial work tile info that will be computed over (the ClusterShape argument is only a tag and is not read)
+  template <class ClusterShape>
+  CUTLASS_DEVICE
+  WorkTileInfo
+  initial_work_tile_info(ClusterShape) {
+    return get_current_work();
+  }
+
+};
+
+} // namespace cutlass::gemm::kernel::detail
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_tile_scheduler_stream_k.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_tile_scheduler_stream_k.hpp
new file mode 100755
index 000000000..80b374ad7
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_tile_scheduler_stream_k.hpp
@@ -0,0 +1,960 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include "cutlass/barrier.h"
+#include "cutlass/block_striped.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/gemm/kernel/sm90_tile_scheduler.hpp"
+#include "cutlass/kernel_hardware_info.hpp"
+#include "cute/layout.hpp"
+#include "cute/tensor.hpp"
+
+namespace cutlass::gemm::kernel::detail {
+
+// Persistent Thread Block (TB) scheduler leveraging stream-K decomposition
+template <
+  class TileShape,
+  class ClusterShape
+>
+class PersistentTileSchedulerSm90StreamK {
+  //
+  // Data members
+  //
+
+private:
+  using UnderlyingScheduler = PersistentTileSchedulerSm90;
+
+private:
+  using UnderlyingArguments = typename UnderlyingScheduler::Arguments;
+  using UnderlyingParams = typename UnderlyingScheduler::Params;
+
+  uint64_t current_work_linear_idx_ = 0;
+
+public:
+
+  using RasterOrder = UnderlyingScheduler::RasterOrder;
+  using RasterOrderOptions = UnderlyingScheduler::RasterOrderOptions;
+  static constexpr bool IsDynamicPersistent = false;
+
+  // Use a dummy barrier manager to simply get the type used to store the barrier
+  using BarrierType = typename NamedBarrierManager<1>::T;
+
+  using Params = PersistentTileSchedulerSm90StreamKParams;
+  using ReductionMode = Params::ReductionMode;
+  using DecompositionMode = Params::DecompositionMode;
+
+  struct WorkTileInfo {
+    int32_t M_idx = 0;
+    int32_t N_idx = 0;
+    int32_t K_idx = 0;
+    int32_t L_idx = 0;
+
+    // Number of k tiles to compute for this unit of work. For stream-K, this
+    // can indicate the number of K tiles across multiple output tiles.
+    uint32_t k_tile_count = 0;
+
+    // Number of k tiles remaining for the work unit as a whole
+    uint32_t k_tile_remaining = 0;
+
+    // Whether this unit of work is a separate-reduction unit (one that only performs reduction and epilogue)
+    bool is_separate_reduction = false;
+
+    CUTLASS_HOST_DEVICE
+    bool
+    is_valid() const {
+      // A work tile that computes no K tiles is invalid unless it is a separate-reduction work tile
+      // (which only performs reduction and epilogue)
+      return k_tile_count > 0 || is_separate_reduction;
+    }
+
+    CUTLASS_HOST_DEVICE
+    bool
+    is_reduction_unit() const {
+      return is_separate_reduction;
+    }
+
+    CUTLASS_HOST_DEVICE
+    int32_t
+    reduction_subtile_idx() const {
+      // For separate reduction units, the K_idx of the work tile is unused.
+      // Therefore, we override it to contain the subtile that the reduction
+      // unit operates on. All other units report -1.
+      return is_reduction_unit() ? K_idx : -1;
+    }
+
+    CUTLASS_HOST_DEVICE
+    void
+    setup_separate_reduction(int32_t epilogue_subtile_idx) {
+      // Set the epilogue subtile in the K_idx, since this is otherwise unused
+      // by separate reduction units.
+      K_idx = epilogue_subtile_idx;
+
+      is_separate_reduction = true;
+      k_tile_count = 0;
+      // Clean up remaining k tiles
+      k_tile_remaining = 0;
+    }
+    // Sentinel tile: M/N/K/L are -1 and k_tile_count is 0; the other members keep their defaults, so is_valid() is false.
+    CUTLASS_HOST_DEVICE
+    static WorkTileInfo
+    invalid_work_tile() {
+      return {-1, -1, -1, -1, 0};
+    }
+    // True when this unit's k range, starting at K_idx and spanning k_tile_count tiles, ends at k_tiles_per_output_tile.
+    CUTLASS_HOST_DEVICE
+    bool
+    is_final_split(uint32_t k_tiles_per_output_tile) const {
+      return (K_idx + k_tile_count) == k_tiles_per_output_tile;
+    }
+  };
+
+  struct Arguments {
+    // Default, copy and move construction are compiler-generated; the assignment operators are spelled out below.
+    Arguments() = default;
+    Arguments(Arguments const&) = default;
+    Arguments(Arguments&&) = default;
+    // Copy assignment: copies all five fields member by member.
+    CUTLASS_HOST_DEVICE
+    Arguments&
+    operator=(Arguments const& args) {
+      splits = args.splits;
+      max_swizzle_size = args.max_swizzle_size;
+      raster_order = args.raster_order;
+      reduction_mode = args.reduction_mode;
+      decomposition_mode = args.decomposition_mode;
+      return *this;
+    }
+    // Move assignment: performs the same member-wise copy as the copy assignment.
+    CUTLASS_HOST_DEVICE
+    Arguments&
+    operator=(Arguments&& args) noexcept {
+      splits = args.splits;
+      max_swizzle_size = args.max_swizzle_size;
+      raster_order = args.raster_order;
+      reduction_mode = args.reduction_mode;
+      decomposition_mode = args.decomposition_mode;
+      return *this;
+    }
+    // Not declared explicit, so an int converts implicitly to Arguments; only `splits` is set.
+    CUTLASS_HOST_DEVICE
+    Arguments(int splits_) : splits(splits_) {}
+    // Sets every field except reduction_mode, which keeps its default member initializer.
+    CUTLASS_HOST_DEVICE
+    Arguments(int splits_, int max_swizzle_size_, RasterOrderOptions raster_order_, DecompositionMode decomposition_mode_) :
+      splits(splits_),
+      max_swizzle_size(max_swizzle_size_),
+      raster_order(raster_order_),
+      decomposition_mode(decomposition_mode_) {}
+
+    // The splitting factor to be used in a split-K decomposition of the problem.
+    // If this is set to a value greater than 1, stream-K decomposition logic
+    // is bypassed in favor of a split-K decomposition.
+    int splits = 1;
+    int max_swizzle_size = 1;
+    RasterOrderOptions raster_order = RasterOrderOptions::Heuristic;
+    ReductionMode reduction_mode = ReductionMode::Deterministic;
+    DecompositionMode decomposition_mode = DecompositionMode::Heuristic;
+  };
+
+  // Sink scheduler params as a member
+  Params scheduler_params;
+
+  //
+  // Methods
+  //
+
+  // Builds the scheduler Params for a problem: derives the tiled CTA shape and
+  // the number of K tiles per output tile, then forwards them together with the
+  // user Arguments, hardware info and workspace pointer to Params::initialize.
+  //
+  // NOTE(review): ktile_start_alignment_count is accepted but not forwarded,
+  // while assign_work() reads params.ktile_start_alignment_count -- confirm that
+  // Params sets that field itself.
+  template <class ProblemShape>
+  static Params
+  to_underlying_arguments(
+      ProblemShape problem_shape,
+      TileShape tile_shape,
+      ClusterShape cluster_shape,
+      KernelHardwareInfo const& hw_info,
+      Arguments const& args,
+      void* workspace,
+      const uint32_t epilogue_subtile = 1,
+      [[maybe_unused]] uint32_t ktile_start_alignment_count = 1u) {
+
+    static_assert(cute::is_static<TileShape>::value);
+    static_assert(cute::is_static<ClusterShape>::value);
+
+    // Extend the problem shape to rank 4 by appending 1s.
+    auto shape_mnkl = cute::append<4>(problem_shape, cute::Int<1>{});
+
+    // K extent of the problem divided (rounding up) by the K extent of a tile.
+    uint32_t k_tiles_per_output_tile = cute::size(cute::ceil_div(cute::shape<2>(shape_mnkl), cute::shape<2>(TileShape{})));
+    dim3 tiled_cta_shape = get_tiled_cta_shape_mnl(shape_mnkl, tile_shape, cluster_shape);
+
+    Params result;
+    result.initialize(
+      tiled_cta_shape,
+      k_tiles_per_output_tile,
+      to_gemm_coord(cluster_shape),
+      hw_info,
+      args.splits,
+      args.max_swizzle_size,
+      args.raster_order,
+      args.reduction_mode,
+      args.decomposition_mode,
+      workspace,
+      epilogue_subtile
+    );
+    return result;
+  }
+
+  // Returns whether `args` describes a supported configuration: a split count
+  // other than 1 is only accepted with the heuristic or split-K decomposition
+  // modes.
+  static bool
+  can_implement(Arguments const& args) {
+    if (args.splits == 1) {
+      // No explicit splitting requested: every decomposition mode is fine.
+      return true;
+    }
+
+    return args.decomposition_mode == DecompositionMode::Heuristic ||
+           args.decomposition_mode == DecompositionMode::SplitK;
+  }
+
+  // Default constructor: performs no explicit initialization in its body.
+  CUTLASS_HOST_DEVICE
+  PersistentTileSchedulerSm90StreamK() {}
+
+  // Stores `params_` and derives this CTA's initial linear work index from its
+  // block coordinates. The raster order selects which of blockIdx.x/blockIdx.y
+  // is the fastest-varying component of the linearization.
+  CUTLASS_HOST_DEVICE
+  PersistentTileSchedulerSm90StreamK(Params const& params_) : scheduler_params(params_) {
+    uint64_t const cta_x = uint64_t(blockIdx.x);
+    uint64_t const cta_y = uint64_t(blockIdx.y);
+    bool const raster_along_n = (params_.raster_order_ == RasterOrder::AlongN);
+
+    current_work_linear_idx_ = raster_along_n
+      ? cta_x + cta_y * uint64_t(gridDim.x)
+      : cta_x * uint64_t(gridDim.y) + cta_y;
+  }
+
+  // Returns the work tile for this scheduler's current linear work index, using
+  // the stored scheduler_params.
+  CUTLASS_DEVICE
+  WorkTileInfo
+  get_current_work() const {
+    return get_current_work_for_linear_idx(current_work_linear_idx_, scheduler_params);
+  }
+
+  // Returns the work tile assigned to `linear_idx`, or the invalid work tile
+  // when `linear_idx` lies beyond the last work unit of the problem.
+  CUTLASS_DEVICE
+  static WorkTileInfo
+  get_current_work_for_linear_idx(uint64_t linear_idx, Params const& params) {
+    // Total number of work units: units_per_problem_ * splits_, plus any
+    // separate-reduction units. The factor splits_ covers split-K, where
+    // units_per_problem_ equals the number of output tiles and every output
+    // tile has splits_ peers. For stream-K splits_ is 1, so the multiplication
+    // has no effect.
+    auto const total_work_units =
+      params.units_per_problem_ * params.divmod_splits_.divisor + params.separate_reduction_units_;
+
+    if (linear_idx < total_work_units) {
+      WorkTileInfo assigned_tile;
+      assign_work(params, linear_idx, assigned_tile);
+      return assigned_tile;
+    }
+
+    // Out of range: there is no work for this index.
+    return WorkTileInfo::invalid_work_tile();
+  }
+
+  // Returns whether the work_tile_info passed in should continue to be used. This
+  // occurs only in the stream-K decomposition with stream-K work units, which can
+  // span multiple output tiles. When it returns true, work_tile_info has been
+  // updated in place to describe the next output tile the unit should cover.
+  // Forwards to continue_current_work_for_linear_idx() with this scheduler's
+  // current linear index and stored params.
+  CUTLASS_DEVICE
+  bool
+  continue_current_work(WorkTileInfo& work_tile_info) const {
+    return continue_current_work_for_linear_idx(
+      current_work_linear_idx_, work_tile_info, scheduler_params);
+  }
+
+  // Accounts for the K tiles just processed by `work_tile_info` and, if the
+  // work unit still has K tiles left, re-assigns `work_tile_info` to the next
+  // portion of that unit. Returns false once the unit is exhausted; otherwise
+  // returns whether the re-assigned work tile is valid.
+  CUTLASS_DEVICE
+  static bool
+  continue_current_work_for_linear_idx(
+    uint64_t linear_idx,
+    WorkTileInfo& work_tile_info,
+    Params const& params) {
+
+    // Retire the K tiles covered by the tile that was just finished.
+    work_tile_info.k_tile_remaining -= work_tile_info.k_tile_count;
+
+    bool const unit_has_work_left = (work_tile_info.k_tile_remaining != 0);
+    if (unit_has_work_left) {
+      assign_work(params, linear_idx, work_tile_info);
+      return work_tile_info.is_valid();
+    }
+    return false;
+  }
+
+  // Moves the current linear work index forward by `advance_count` times the
+  // total number of CTAs in the launched grid.
+  CUTLASS_DEVICE
+  void
+  advance_to_next_work(uint32_t advance_count = 1) {
+    uint64_t const ctas_in_grid = uint64_t(gridDim.x) * uint64_t(gridDim.y) * uint64_t(gridDim.z);
+    current_work_linear_idx_ += ctas_in_grid * uint64_t(advance_count);
+  }
+
+  // Returns whether `work_tile_info` is the last work this CTA will process:
+  // the current work unit must not continue, and the work at the index reached
+  // by advancing `advance_count` steps must be invalid.
+  CUTLASS_DEVICE
+  bool is_last_tile(WorkTileInfo work_tile_info, uint32_t advance_count = 1) const {
+    // work_tile_info is deliberately taken by value: continue_current_work
+    // modifies its argument, and the caller's tile must stay untouched.
+    if (continue_current_work(work_tile_info)) {
+      return false;
+    }
+
+    // Same increment that advance_to_next_work(advance_count) would apply.
+    uint64_t const linear_idx_step =
+      uint64_t(gridDim.x) * uint64_t(gridDim.y) * uint64_t(gridDim.z) * uint64_t(advance_count);
+
+    WorkTileInfo const next_work = get_current_work_for_linear_idx(
+      current_work_linear_idx_ + linear_idx_step, scheduler_params);
+    return !next_work.is_valid();
+  }
+
+  // Given the inputs, computes the total number of output blocks this problem will compute over.
+  // Note that this is only the logical size of our grid, not the physical grid we will actually launch.
+  // The computation is delegated unchanged to UnderlyingScheduler.
+  template <class ProblemShape>
+  CUTLASS_HOST_DEVICE static
+  dim3
+  get_tiled_cta_shape_mnl(ProblemShape problem_shape_mnkl, TileShape cta_shape, ClusterShape cluster_shape) {
+    return UnderlyingScheduler::get_tiled_cta_shape_mnl(problem_shape_mnkl, cta_shape, cluster_shape);
+  }
+
+  // Computes the physical grid to launch for the given problem. The logical
+  // tiled CTA shape is derived first and then handed to Params::get_grid_shape
+  // together with the cluster shape, hardware info, swizzle size and raster
+  // order. `params` is not used.
+  template <class ProblemShape>
+  CUTLASS_HOST_DEVICE static
+  dim3
+  get_grid_shape(
+    [[maybe_unused]] Params const& params,
+    ProblemShape problem_shape,
+    TileShape tile_shape,
+    ClusterShape cluster_shape,
+    KernelHardwareInfo hw_info,
+    Arguments arguments) {
+
+    // Extend the problem shape to rank 4 by appending 1s.
+    auto shape_mnkl = cute::append<4>(problem_shape, cute::Int<1>{});
+    dim3 tiled_cta_shape = get_tiled_cta_shape_mnl(shape_mnkl, tile_shape, cluster_shape);
+
+    return Params::get_grid_shape(
+      tiled_cta_shape,
+      to_gemm_coord(cluster_shape),
+      hw_info,
+      arguments.max_swizzle_size,
+      arguments.raster_order
+    );
+  }
+
+  // Returns whether `work_tile_info` needs fixup (reduction across the units
+  // sharing its output tile).
+  CUTLASS_HOST_DEVICE
+  static bool
+  requires_fixup(Params const& params, WorkTileInfo const& work_tile_info) {
+    // Invalid tiles never need fixup.
+    if (!work_tile_info.is_valid()) {
+      return false;
+    }
+
+    // A tile that covers every K tile of its output tile (data-parallel) has
+    // nothing to reduce; any other K tile count does.
+    return work_tile_info.k_tile_count != params.divmod_tiles_per_output_tile_.divisor;
+  }
+
+  // Returns whether `params` requests separate-reduction units; forwards to
+  // Params::requires_separate_reduction().
+  CUTLASS_HOST_DEVICE
+  static bool
+  requires_separate_reduction(Params const& params) {
+    return params.requires_separate_reduction();
+  }
+
+  // Returns true for every work tile that is not a separate-reduction unit. For a
+  // separate-reduction unit it returns false, signalling that the global loads done by
+  // the producer warpgroup and the math done by the consumer warpgroup must be skipped.
+  CUTLASS_DEVICE
+  static bool
+  valid_warpgroup_in_work_tile(WorkTileInfo const& work_tile_info) {
+    return !work_tile_info.is_reduction_unit();
+  }
+
+  // Performs the reduction across splits for a given output tile. This is
+  // fixup_helper() instantiated with a NamedBarrierManager spanning one warp
+  // group and using the reserved stream-K named barriers.
+  template <class FrgTensorC>
+  CUTLASS_DEVICE
+  static void
+  fixup(
+    Params const& params,
+    WorkTileInfo const& work_tile_info,
+    FrgTensorC& accumulators,
+    uint32_t num_barriers,
+    uint32_t barrier_idx) {
+    // First reserved named barrier used for stream-K, and how many are used.
+    static constexpr uint32_t FirstStreamkBarrier = static_cast<int>(cutlass::arch::ReservedNamedBarriers::StreamkBarrier0);
+    static constexpr uint32_t StreamkBarrierCount = 2;
+    using StreamkBarrierManager = NamedBarrierManager<NumThreadsPerWarpGroup, FirstStreamkBarrier, StreamkBarrierCount>;
+
+    fixup_helper<FrgTensorC, StreamkBarrierManager>(
+      params, work_tile_info, accumulators, num_barriers, barrier_idx);
+  }
+
+  // Helper for performing the reduction across splits for a given output tile.
+  //
+  // Returns immediately when requires_fixup() is false. Otherwise the caller takes one of three roles:
+  //   * separate-reduction unit: waits until `num_peers` arrivals are recorded on the tile's lock,
+  //     loads the first peer's partials and adds the remaining peers' partials into `accumulators`;
+  //   * unit that does not compute the epilogue: stores or reduces its accumulators into the
+  //     workspace, then increments the tile's lock;
+  //   * unit that computes the epilogue: waits on the tile's lock, then adds the partials held in
+  //     the workspace to its accumulators.
+  //
+  // params               -- scheduler params; provides the workspace pointer and decomposition info
+  // work_tile_info       -- work tile being fixed up
+  // accumulators         -- accumulator fragment; read and/or updated in place
+  // num_barriers         -- number of locks per output tile (lock index = tile_idx * num_barriers + barrier_idx)
+  // barrier_idx          -- index of the barrier / lock used by the calling thread group
+  // num_accumulator_mtxs -- scales the per-tile workspace offset and the reduction workspace size
+  template <class FrgTensorC, class BarrierManager>
+  CUTLASS_DEVICE
+  static void
+  fixup_helper(
+    Params const& params,
+    WorkTileInfo const& work_tile_info,
+    FrgTensorC& accumulators,
+    uint32_t num_barriers,
+    uint32_t barrier_idx,
+    uint32_t num_accumulator_mtxs = 1) {
+
+    using ElementAccumulator = typename FrgTensorC::value_type;
+
+    if (!requires_fixup(params, work_tile_info)) {
+      return;
+    }
+    uint64_t tile_idx = output_tile_index(params, work_tile_info);
+
+    // Index of the lock on which to wait
+    uint64_t lock_idx = (tile_idx * num_barriers) + barrier_idx;
+
+    uint64_t reduction_tile_idx = tile_idx;
+    uint64_t num_peers = 0;
+    uint64_t reduction_peer_offset = 0;
+    if (params.requires_separate_reduction()) {
+      // If separate reduction is to be performed, each stream-K unit writes its partials
+      // to a separate portion of the workspace. There are as many of these portions as there
+      // are peers for a given output tile, so we multiply the tile index by the maximum peer count.
+      auto [first_peer_id, my_peer_id, last_peer_id] = tile_peer_range(params, tile_idx, static_cast<uint32_t>(work_tile_info.K_idx));
+      num_peers = last_peer_id - first_peer_id + 1;
+      reduction_tile_idx *= Params::max_peers_per_tile(params.sk_units_, params.sk_tiles_);
+      reduction_peer_offset = my_peer_id * cute::size<0>(TileShape{}) * cute::size<1>(TileShape{});
+    }
+
+    // Reductions use BlockStripedReduce with a width of BarrierManager::ThreadCount under the hood.
+    // Thus, the start of the reduction space is the same across all threads in a warp group.
+    uint64_t reduction_offset =
+      (static_cast<uint64_t>(cute::size<0>(TileShape{})) * static_cast<uint64_t>(cute::size<1>(TileShape{})) * reduction_tile_idx * num_accumulator_mtxs) +
+      reduction_peer_offset +
+      (static_cast<uint64_t>(size(accumulators)) * barrier_idx * BarrierManager::ThreadCount);
+
+    ElementAccumulator* group_reduction_workspace = reinterpret_cast<ElementAccumulator*>(params.reduction_workspace_) + reduction_offset;
+
+    using AccumulatorArrayT = Array<typename FrgTensorC::value_type, size(FrgTensorC{})>;
+    using BlockStripedReduceT = BlockStripedReduce<BarrierManager::ThreadCount, AccumulatorArrayT>;
+
+    AccumulatorArrayT* reduction_workspace_array = reinterpret_cast<AccumulatorArrayT*>(group_reduction_workspace);
+    AccumulatorArrayT* accumulator_array = reinterpret_cast<AccumulatorArrayT*>(accumulators.data());
+
+    // Index of this thread within its barrier group
+    uint32_t barrier_group_thread_idx = threadIdx.x % BarrierManager::ThreadCount;
+
+    // The number of tiles for which reduction is required is either:
+    //   (a) the total number of output tiles (in the case of split-K)
+    //   (b) the number of stream-K tiles (potentially multiplied by peer count if using separate reduction)
+    // To calculate the total number of output tiles in the split-K case, we
+    // note that, in the split-K case, the units_per_problem_ member of Params will be
+    // the total number of output tiles.
+    uint32_t reduction_tiles = 0;
+    if (params.divmod_splits_.divisor > 1) {
+      reduction_tiles = params.units_per_problem_;
+    }
+    else if (params.requires_separate_reduction()) {
+      reduction_tiles = params.sk_tiles_ * Params::max_peers_per_tile(params.sk_units_, params.sk_tiles_);
+    }
+    else {
+      reduction_tiles = params.sk_tiles_;
+    }
+
+    // The locks are located in the same allocation, directly after the reduction workspace.
+    uint64_t reduction_workspace_size = Params::get_reduction_workspace_size(
+      reduction_tiles, to_gemm_coord(TileShape{}), sizeof_bits<ElementAccumulator>::value, num_accumulator_mtxs);
+    BarrierType* lock_workspace = reinterpret_cast<BarrierType*>(
+      reinterpret_cast<uint8_t*>(params.reduction_workspace_) + reduction_workspace_size);
+
+    if (work_tile_info.is_reduction_unit()) {
+      plus<AccumulatorArrayT> add_fragments;
+      uint64_t peer_offset = size(accumulators) * num_barriers * BarrierManager::ThreadCount;
+
+      // Wait until the peers collaborating on this output tile have all written
+      // their accumulators to workspace.
+      BarrierManager::wait_eq(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, num_peers);
+
+      // Load the first peer's data
+      BlockStripedReduceT::load(*accumulator_array, reduction_workspace_array, barrier_group_thread_idx);
+
+      for (uint64_t i = 1; i < num_peers; ++i) {
+        // Load peer fragment
+        AccumulatorArrayT addend_fragment;
+        auto peer_reduction_workspace = reinterpret_cast<AccumulatorArrayT*>(group_reduction_workspace + (i * peer_offset));
+
+        BlockStripedReduceT::load(addend_fragment, peer_reduction_workspace, barrier_group_thread_idx);
+
+        // Add peer fragment
+        *accumulator_array = add_fragments(*accumulator_array, addend_fragment);
+      }
+    }
+    else if (!compute_epilogue(work_tile_info, params)) {
+      if (params.requires_separate_reduction() || work_tile_info.K_idx == 0) {
+        // The first peer initializes the workspace partials in the non-separate-reduction case,
+        // and all peers write to their own location in workspace when using separate reduction
+        BlockStripedReduceT::store(reduction_workspace_array, *accumulator_array, barrier_group_thread_idx);
+      }
+      else {
+        // Wait until the preceding split added its accumulators
+        BarrierManager::wait_eq(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, work_tile_info.K_idx);
+
+        // Perform reduction in workspace
+        BlockStripedReduceT::reduce(reduction_workspace_array, *accumulator_array, barrier_group_thread_idx);
+      }
+
+      // If separate reduction is being performed, each participating stream-K unit increments the barrier
+      // by only 1. Otherwise, increment by the K tile count that this unit has processed.
+      uint32_t increment = params.requires_separate_reduction() ? 1 : work_tile_info.k_tile_count;
+
+      // Signal our arrival
+      BarrierManager::arrive_inc(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, increment);
+    }
+    else {
+      if (params.reduction_mode_ == ReductionMode::Deterministic) {
+        // Wait until the preceding split added its accumulators
+        BarrierManager::wait_eq(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, work_tile_info.K_idx);
+      }
+      else {
+        // Wait until the first split has stored its accumulators
+        BarrierManager::wait_lt(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, 1);
+      }
+
+      // The block computing the final split for the tile adds previously-reduced partials
+      // to its accumulators and computes the epilogue.
+      BlockStripedReduceT::load_add(*accumulator_array, reduction_workspace_array, barrier_group_thread_idx);
+    }
+  }
+
+  // Returns whether the block assigned this work should compute the epilogue for the
+  // corresponding output tile. That is the case when either:
+  //  1. the work tile is a separate-reduction unit, or
+  //  2. separate reduction is not in use and the work tile is valid and covers the final
+  //     K tiles of its output tile (this includes data-parallel tiles, which cover all of them).
+  CUTLASS_HOST_DEVICE
+  static bool
+  compute_epilogue(WorkTileInfo const& work_tile_info, Params const& params) {
+    if (work_tile_info.is_separate_reduction) {
+      return true;
+    }
+
+    return work_tile_info.is_valid() &&
+           work_tile_info.is_final_split(params.divmod_tiles_per_output_tile_.divisor) &&
+           !params.requires_separate_reduction();
+  }
+
+  // Returns the linearized index of the output tile addressed by the M_idx, N_idx and
+  // L_idx of `work_tile_info`: the tile's linear index within its batch plus L_idx times
+  // the number of tiles per batch.
+  CUTLASS_DEVICE
+  static uint64_t
+  output_tile_index(Params const& params, WorkTileInfo const& work_tile_info) {
+    uint64_t const tiles_per_batch = params.divmod_batch_.divisor;
+
+    uint64_t const tile_idx_in_batch = UnderlyingScheduler::get_linear_idx_from_m_and_n(
+      work_tile_info.M_idx, work_tile_info.N_idx,
+      params.divmod_cluster_shape_major_,
+      params.divmod_cluster_shape_minor_,
+      params.divmod_cluster_blk_major_,
+      params.log_swizzle_size_,
+      params.raster_order_
+    );
+
+    return tiles_per_batch * work_tile_info.L_idx + tile_idx_in_batch;
+  }
+
+  // Returns the workspace size reported by Params::get_workspace_size for this problem and
+  // these arguments. `num_accumulator_mtxs` is accepted but not used by this overload.
+  template <class ProblemShape, class ElementAccumulator>
+  static size_t
+  get_workspace_size(
+    Arguments const& args,
+    ProblemShape problem_shape,
+    KernelHardwareInfo const& hw_info,
+    uint32_t mma_warp_groups,
+    const uint32_t epilogue_subtile = 1,
+    [[maybe_unused]] uint32_t num_accumulator_mtxs = 1) {
+
+    ClusterShape cluster_shape;
+    TileShape tile_shape;
+
+    // Extend the problem shape to rank 4 by appending 1s.
+    auto shape_mnkl = cute::append<4>(problem_shape, 1);
+
+    // K extent of the problem divided (rounding up) by the K extent of a tile.
+    uint32_t k_tiles_per_output_tile = cute::size(cute::ceil_div(cute::shape<2>(shape_mnkl), cute::shape<2>(TileShape{})));
+    dim3 tiled_cta_shape = get_tiled_cta_shape_mnl(shape_mnkl, tile_shape, cluster_shape);
+
+    return Params::get_workspace_size(
+      tiled_cta_shape,
+      k_tiles_per_output_tile,
+      to_gemm_coord(tile_shape),
+      to_gemm_coord(cluster_shape),
+      hw_info,
+      args.splits,
+      args.max_swizzle_size,
+      args.raster_order,
+      args.decomposition_mode,
+      mma_warp_groups,
+      sizeof_bits<BarrierType>::value,
+      sizeof_bits<ElementAccumulator>::value,
+      epilogue_subtile
+    );
+  }
+
+  // Initializes `workspace` on `stream` through Params::initialize_workspace and returns
+  // its status. `num_accumulator_mtxs` is accepted but not used by this overload.
+  template <class ProblemShape, class ElementAccumulator>
+  static cutlass::Status
+  initialize_workspace(
+    Arguments const& args,
+    void* workspace,
+    cudaStream_t stream,
+    ProblemShape const& problem_shape,
+    KernelHardwareInfo const& hw_info,
+    uint32_t mma_warp_groups,
+    const uint32_t epilogue_subtile = 1,
+    [[maybe_unused]] uint32_t num_accumulator_mtxs = 1,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+
+    ClusterShape cluster_shape;
+    TileShape tile_shape;
+
+    // Extend the problem shape to rank 4 by appending 1s.
+    auto shape_mnkl = cute::append<4>(problem_shape, 1);
+
+    // K extent of the problem divided (rounding up) by the K extent of a tile.
+    uint32_t k_tiles_per_output_tile = cute::size(cute::ceil_div(cute::shape<2>(shape_mnkl), cute::shape<2>(TileShape{})));
+    dim3 tiled_cta_shape = get_tiled_cta_shape_mnl(shape_mnkl, tile_shape, cluster_shape);
+
+    return Params::initialize_workspace(
+      workspace,
+      stream,
+      tiled_cta_shape,
+      k_tiles_per_output_tile,
+      to_gemm_coord(tile_shape),
+      to_gemm_coord(cluster_shape),
+      hw_info,
+      args.splits,
+      args.max_swizzle_size,
+      args.raster_order,
+      args.decomposition_mode,
+      mma_warp_groups,
+      sizeof_bits<BarrierType>::value,
+      sizeof_bits<ElementAccumulator>::value,
+      epilogue_subtile,
+      // NOTE(review): presumably the accumulator-matrix count expected by
+      // Params::initialize_workspace; it is hard-coded rather than taken from
+      // num_accumulator_mtxs -- confirm this is intended.
+      1,
+      cuda_adapter
+    );
+  }
+
+  // Returns the number of K tiles covered by `work_tile_info`. The problem shape and
+  // tile shape arguments are accepted but unused.
+  template <class ProblemShape>
+  CUTLASS_HOST_DEVICE
+  static uint32_t
+  get_work_k_tile_count(WorkTileInfo const& work_tile_info, ProblemShape, TileShape) {
+    return work_tile_info.k_tile_count;
+  }
+
+  // Returns the K_idx of `work_tile_info`, i.e. the first K tile it covers within its
+  // output tile. For separate-reduction units K_idx holds the epilogue subtile index
+  // instead (see WorkTileInfo::setup_separate_reduction()).
+  CUTLASS_HOST_DEVICE
+  static uint32_t
+  get_work_k_tile_start(WorkTileInfo const& work_tile_info) {
+    return work_tile_info.K_idx;
+  }
+
+  // Kernel helper that yields the next work tile. If the current work unit still has
+  // work, the (updated) `work_tile_info` is returned; otherwise the scheduler advances
+  // and the work tile at the new linear index is returned. The second tuple element is
+  // always true.
+  CUTLASS_DEVICE
+  auto
+  fetch_next_work(WorkTileInfo work_tile_info) {
+    bool const stay_on_current_unit = continue_current_work(work_tile_info);
+    if (!stay_on_current_unit) {
+      advance_to_next_work();
+      work_tile_info = get_current_work();
+    }
+
+    return cute::make_tuple(work_tile_info, true);
+  }
+
+  // Returns the initial work tile info that will be computed over, i.e. the work
+  // at the scheduler's current linear index. The ClusterShape argument is unused.
+  CUTLASS_DEVICE
+  WorkTileInfo
+  initial_work_tile_info(ClusterShape) {
+    return get_current_work();
+  }
+
+private:
+  // Sets the current stream-K work to compute within work_tile_info. If work_tile_info.k_tile_count
+  // is 0, work_tile_info is populated as a new unit of work. Otherwise, state existing in
+  // work_tile_info (e.g., remaining iterations) is used to find the next tile in the current work unit.
+  CUTLASS_DEVICE
+  static void
+  assign_work(
+    Params const& params,
+    uint64_t linear_idx,
+    WorkTileInfo& work_tile_info) {
+
+    auto [cta_m_in_cluster_, cta_n_in_cluster_, _] = cute::block_id_in_cluster();
+    uint64_t cta_m_in_cluster = static_cast<uint64_t>(cta_m_in_cluster_);
+    uint64_t cta_n_in_cluster = static_cast<uint64_t>(cta_n_in_cluster_);
+    uint64_t output_tile_id = linear_idx;
+    if (linear_idx >= params.units_per_problem_ * params.divmod_splits_.divisor) {
+      // Separate-reduction work
+      auto cluster_size = params.get_cluster_size();
+      // Divide up the linearized separate reduction units into clusters
+      uint64_t cluster_linear_reduction_unit_idx = params.div_cluster_size((linear_idx - params.units_per_problem_));
+      uint64_t cluster_tile_idx, epi_subtile_idx;
+      params.divmod_epilogue_subtile_(cluster_tile_idx, epi_subtile_idx, cluster_linear_reduction_unit_idx);
+      // Bring the linearized tile ID back into the space of tiles, rather than clusters
+      output_tile_id = cluster_tile_idx * cluster_size;
+
+      work_tile_info.setup_separate_reduction(epi_subtile_idx);
+    }
+    else if (linear_idx >= params.sk_units_ && params.divmod_splits_.divisor == 1) {
+      // Data-parallel work
+      output_tile_id = linear_idx - params.sk_units_ + params.sk_tiles_;
+      work_tile_info.K_idx = 0;
+      work_tile_info.k_tile_count = params.divmod_tiles_per_output_tile_.divisor;
+      work_tile_info.k_tile_remaining = params.divmod_tiles_per_output_tile_.divisor;
+    }
+    else {
+      // In the CUTLASS 2.x implementation of stream K, stream-K work is assigned to each stream-K
+      // threadblock individually. For the most part, the set of K iterations corresponding to stream-K
+      // work was divided amongst stream-K threadblocks, and a threadblock determined which tile
+      // it would compute a (potentially-partial) output tile for based on the space of k iterations
+      // assigned to it. This often results in stream-K threadblocks processing tiles with different
+      // offsets in the K dimension from one another. This can reduce locality, but is limited to the
+      // (generally few) waves of threadblocks assigned to compute stream-K work.
+      //
+      // With the introduction of threadblock clusters, there is additional benefit to maintaining
+      // locality in the K dimension: shared portions of operands can be multicasted to threadblocks
+      // within a cluster. Thus, we would like to ensure that the assignment of stream-K work to
+      // threadblocks respects the ability to perform multicasting.
+      //
+      // To do so, we divide up the linearized stream-K units into clusters and share the same K
+      // offsets for work within clusters.
+
+      uint64_t cluster_linear_work_idx = params.div_cluster_size(linear_idx);
+
+      uint64_t group_idx;
+      params.divmod_sk_groups_(cluster_linear_work_idx, group_idx, cluster_linear_work_idx);
+
+      // Determine whether we are in a "big group" that will process an additional
+      // stream-K cluster tile.
+      uint64_t sk_cluster_tiles = params.div_cluster_size(params.sk_tiles_);
+      uint64_t sk_cluster_tiles_in_group = params.divmod_sk_groups_.divide(sk_cluster_tiles);
+      if (group_idx < params.big_groups_) {
+        ++sk_cluster_tiles_in_group;
+      }
+
+      // Determine whether we are in a "big unit" within the group, that will process
+      // an additional K chunk in the group.
+      uint64_t sk_tiles_in_group = sk_cluster_tiles_in_group * params.get_cluster_size();
+      uint64_t k_tiles_in_group = sk_tiles_in_group * params.divmod_tiles_per_output_tile_.divisor;
+      uint64_t k_tiles_per_unit_in_group = params.divmod_sk_units_per_group_.divide(k_tiles_in_group);
+      uint64_t big_units_in_group = params.div_cluster_size(
+        k_tiles_in_group - (k_tiles_per_unit_in_group * params.divmod_sk_units_per_group_.divisor));
+
+      uint64_t split;
+      params.divmod_clusters_mnl_(split, cluster_linear_work_idx, cluster_linear_work_idx);
+
+      bool is_split_k = params.divmod_splits_.divisor > 1;
+      uint64_t big_unit_cmp_lhs = is_split_k ? split : cluster_linear_work_idx;
+      uint64_t big_unit_cmp_rhs = is_split_k ? params.big_units_ : big_units_in_group;
+      uint64_t linear_idx_mult = is_split_k ? params.divmod_tiles_per_output_tile_.divisor : k_tiles_per_unit_in_group;
+      uint64_t k_tiles_per_split = is_split_k ? params.divmod_k_tiles_per_sk_unit_.divisor : k_tiles_per_unit_in_group;
+
+      // Determine the starting k iteration computed by this stream-K work unit
+      uint32_t unit_iter_start = (linear_idx_mult * cluster_linear_work_idx) +
+                                 (k_tiles_per_split * split);
+
+      // Adjust the starting position and number of k iterations for "big units," which
+      // compute one extra iteration. If there are any big units, they will be the first
+      // in the linearized ID space.
+      auto k_tiles_in_my_split = k_tiles_per_split;
+      if (big_unit_cmp_lhs < big_unit_cmp_rhs) {
+        // Since the "big units" are the first units in the linearized ID space, each
+        // of the units preceding this big unit computed one extra iteration. Thus,
+        // we must offset our start iteration by the number of units that precede
+        // the current unit in the linearized ID space.
+        unit_iter_start += big_unit_cmp_lhs;
+        ++k_tiles_in_my_split;
+      }
+      else {
+        // Increment by one for each of the big clusters (since all big units precede this unit)
+        unit_iter_start += big_unit_cmp_rhs;
+      }
+
+      if (!is_split_k) {
+        // Adjust the unit starting position and number of tiles to avoid
+        // computing splits of size less than min_iters_per_sk_unit_
+        int unused, start_tile_k_tile;
+        params.divmod_tiles_per_output_tile_(unused, start_tile_k_tile, unit_iter_start);
+        if (start_tile_k_tile < Params::min_iters_per_sk_unit_) {
+          // Starting K tile is in range [0, Params::min_iters_per_sk_unit_), which means that another
+          // stream-K unit will be computing a split with fewer than Params::min_iters_per_sk_unit_ K tiles.
+          // Adjust our work to take over these K tiles.
+          unit_iter_start -= start_tile_k_tile;
+          k_tiles_in_my_split += start_tile_k_tile;
+        }
+        else if (start_tile_k_tile > (params.divmod_tiles_per_output_tile_.divisor - Params::min_iters_per_sk_unit_)) {
+          // Starting K tile is within the final Params::min_iters_per_sk_unit_ K tiles of some output tile,
+          // which means that this unit will compute a split with fewer than Params::min_iters_per_sk_unit_ K tiles.
+          // Adjust our work to shed these K tiles to a neighboring stream-K unit that will compute more consecutive K tiles.
+          auto adjustment_tiles = (params.divmod_tiles_per_output_tile_.divisor - start_tile_k_tile);
+          unit_iter_start += adjustment_tiles;
+          k_tiles_in_my_split -= adjustment_tiles;
+        }
+        else if (params.ktile_start_alignment_count == 2 && start_tile_k_tile % 2 != 0) {
+          // ktile for each SM start from even number
+          // If start from odd number ktile within the output tile
+          //    now start at the ktile one before my initial ktile start (take one ktile from prev sm)
+          // if end on odd number ktile within the output tile
+          //    now end at ktile that one before my ktile end (give one ktile to next sm)
+          unit_iter_start -= 1;
+          k_tiles_in_my_split += 1;
+        }
+      }
+
+      if (work_tile_info.k_tile_count == 0) {
+        // This is a new unit
+
+        if (!is_split_k) {
+          //
+          // Adjust the unit ending position and number of tiles to avoid
+          // computing splits of size less than min_iters_per_sk_unit_
+          //
+
+          // Begin by assuming that no adjustment is needed
+          auto initial_unit_iter_end = unit_iter_start + k_tiles_in_my_split;
+
+          int unused, end_tile_k_tile;
+          params.divmod_tiles_per_output_tile_(unused, end_tile_k_tile, initial_unit_iter_end);
+
+          if (end_tile_k_tile < Params::min_iters_per_sk_unit_) {
+            // Ending K tile is within the first Params::min_iters_per_sk_unit_ K tiles of some output tile,
+            // which means that this unit will compute a split with fewer than Params::min_iters_per_sk_unit_ K tiles.
+            // Adjust our work to shed these K tiles to a neighboring stream-K unit that will compute more consecutive K tiles.
+            k_tiles_in_my_split -= end_tile_k_tile;
+          }
+          else if (end_tile_k_tile > (params.divmod_tiles_per_output_tile_.divisor - Params::min_iters_per_sk_unit_)) {
+            // Ending K tile is within the final Params::min_iters_per_sk_unit_ K tiles of some output tile,
+            // which means that some other unit will compute a split with fewer than Params::min_iters_per_sk_unit_ K tiles.
+            // Adjust our work to take on these K tiles.
+            k_tiles_in_my_split += (params.divmod_tiles_per_output_tile_.divisor - end_tile_k_tile);
+          }
+          else if (params.ktile_start_alignment_count == 2 && end_tile_k_tile % 2 != 0) {
+            // ktile for each SM start from even number
+            // If start from odd number ktile within the output tile
+            //    now start at the ktile one before my initial ktile start (take one ktile from prev sm)
+            // If end on odd number ktile within the output tile,
+            //    now end at ktile that one before my ktile end (give one ktile to next sm)
+            k_tiles_in_my_split -= 1;
+          }
+        }
+
+        work_tile_info.k_tile_remaining = k_tiles_in_my_split;
+      }
+
+      uint32_t unit_iter_end = unit_iter_start + work_tile_info.k_tile_remaining - 1;
+
+      // Find the output tile corresponding to the final k tile covered by this
+      // work unit. Stream-K work units will work backwards in terms of the tiles they
+      // are responsible computing. This is beneficial because the final (partial)
+      // tile computed by a stream-K block is typically the beginning of the output
+      // tile, while the beginning (partial) tile is typically the ending of another
+      // output tile. Since ending portions of an output tile must reduce across
+      // other work units computing portions of that output tile, it is preferable
+      // for them to be computed later, so as to reduce the likelihood of blocking
+      // on other work.
+
+      auto output_tile_id_in_group = params.divmod_tiles_per_output_tile_.divide(unit_iter_end);
+      uint32_t output_tile_iter_start = output_tile_id_in_group * params.divmod_tiles_per_output_tile_.divisor;
+      uint32_t output_tile_iter_end = output_tile_iter_start + params.divmod_tiles_per_output_tile_.divisor;
+
+      // Convert the output tile from the linearized space within each group to the
+      // overall linearized space.
+      output_tile_id = (output_tile_id_in_group * params.divmod_sk_groups_.divisor) + group_idx;
+
+      // Bring the linearized tile ID back into the space of tiles, rather than clusters
+      output_tile_id *= params.get_cluster_size();
+
+      // The final linearized tile ID is in units of the cluster dimension over which we rasterize.
+      if (params.raster_order_ == RasterOrder::AlongN) {
+        output_tile_id += cta_n_in_cluster * params.divmod_cluster_shape_minor_.divisor;
+      }
+      else {
+        output_tile_id += cta_m_in_cluster * params.divmod_cluster_shape_minor_.divisor;
+      }
+
+      // The unit's starting k iteration in the current tile is either the starting
+      // iteration for the tile as a whole, or the starting k iteration for the unit
+      // as a whole (if the latter is greater than the former).
+      uint32_t tile_iter_start = max(output_tile_iter_start, unit_iter_start);
+
+      // Similarly, the unit's ending k iteration (exclusive) is either the end of
+      // the current tile it is assigned, or the ending iteration of the unit as a whole
+      // (if the latter is less than the former).
+      uint32_t tile_iter_end = min(output_tile_iter_end, unit_iter_end + 1);
+
+      // Set the k offset to be the starting k tile for this output tile
+      work_tile_info.K_idx = static_cast<int32_t>(tile_iter_start - output_tile_iter_start);
+      work_tile_info.k_tile_count = tile_iter_end - tile_iter_start;
+    }
+
+    uint64_t work_idx_l, remainder;
+    params.divmod_batch_(work_idx_l, remainder, output_tile_id);
+
+    uint64_t cta_per_grid_dim = params.divmod_cluster_shape_minor_.divide(remainder);
+
+    auto [work_idx_m, work_idx_n] = UnderlyingScheduler::get_work_idx_m_and_n(
+                                          cta_per_grid_dim,
+                                          params.divmod_cluster_shape_major_,
+                                          params.divmod_cluster_shape_minor_,
+                                          params.divmod_cluster_blk_major_,
+                                          params.log_swizzle_size_,
+                                          params.raster_order_
+                                        );
+
+    // Set the M, N, and L block offsets
+    work_tile_info.M_idx = work_idx_m;
+    work_tile_info.N_idx = work_idx_n;
+    work_tile_info.L_idx = static_cast<int32_t>(work_idx_l);
+  }
+
+  // Returns a tuple (start, current, end) of the stream-K unit IDs that compute the first K tile, cur_k_tile, and the last K tile of this output tile
+  CUTLASS_HOST_DEVICE
+  static auto
+  tile_peer_range(Params const& params, uint32_t tile_idx, uint32_t cur_k_tile) {
+    uint32_t tile_idx_in_cluster_path = params.div_cluster_size(tile_idx);
+    uint32_t start_k_tile = params.divmod_tiles_per_output_tile_.divisor * tile_idx_in_cluster_path; // first K tile of this output tile
+    uint32_t end_k_tile = start_k_tile + params.divmod_tiles_per_output_tile_.divisor - 1; // last K tile of this output tile (inclusive)
+    uint32_t big_unit_k_tiles = params.big_units_ * (params.divmod_k_tiles_per_sk_unit_.divisor + 1); // K tiles covered by the "big unit range"
+
+    auto adjust_unit = [&](uint32_t k_tile, uint32_t unit_idx, uint32_t k_tiles_per_unit) { // shifts unit_idx when a too-small share is subsumed by a neighbor
+      uint32_t unit_k_start = unit_idx * k_tiles_per_unit;
+      uint32_t unit_k_end = unit_k_start + k_tiles_per_unit;
+      if (k_tile - start_k_tile < Params::min_iters_per_sk_unit_ &&
+          unit_k_end - start_k_tile < Params::min_iters_per_sk_unit_) {
+        // k_tile is within the first min_iters_per_sk_unit_ K tiles of this output tile,
+        // and the stream-K unit computes fewer than min_iters_per_sk_unit_ K tiles for this
+        // output tile. This work will thus be subsumed by the next stream-K unit.
+        ++unit_idx;
+      }
+
+      if (end_k_tile + 1 - k_tile < Params::min_iters_per_sk_unit_ &&
+          end_k_tile + 1 - unit_k_start < Params::min_iters_per_sk_unit_) {
+        // k_tile is within the last min_iters_per_sk_unit_ K tiles of this output tile,
+        // and the stream-K unit computes fewer than min_iters_per_sk_unit_ K tiles for this
+        // output tile. This work will thus be subsumed by the previous stream-K unit.
+        --unit_idx;
+      }
+
+      return unit_idx;
+    };
+
+    // Lambda to find the ID of the stream-K unit that computes this K tile (after the adjust_unit correction)
+    auto find_unit = [&](uint32_t k_tile) {
+      if (k_tile < big_unit_k_tiles) {
+        // The tile is within the "big unit range"
+        uint32_t unit_idx = params.divmod_k_tiles_per_sk_big_unit_.divide(k_tile);
+        return static_cast<uint64_t>(adjust_unit(k_tile, unit_idx, params.divmod_k_tiles_per_sk_big_unit_.divisor));
+      }
+      else {
+        // The tile is after the "big unit range." Account for this by finding the "normal unit"
+        // that it belongs to, and then offsetting by the number of big units
+        uint32_t unit_idx = params.divmod_k_tiles_per_sk_unit_.divide(k_tile - big_unit_k_tiles) + params.big_units_;
+        return static_cast<uint64_t>(adjust_unit(k_tile, unit_idx, params.divmod_k_tiles_per_sk_unit_.divisor));
+      }
+    };
+
+    return cute::make_tuple(find_unit(start_k_tile), find_unit(cur_k_tile), find_unit(end_k_tile));
+  }
+};
+
+} // namespace cutlass::gemm::kernel::detail
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sparse_gemm.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sparse_gemm.h
new file mode 100755
index 000000000..af274ee09
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sparse_gemm.h
@@ -0,0 +1,394 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a pipelined sparse GEMM kernel. Does not compute batching; split-K is supported only via serial reduction (SplitKSerial).
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/kernel/params_sparse_base.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/semaphore.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Mma_,                  ///< Threadblock-scoped matrix multiply-accumulate
+  typename Epilogue_,             ///< Epilogue
+  typename ThreadblockSwizzle_,   ///< Threadblock swizzling function
+  bool SplitKSerial               ///< If true, code supporting split-K via serial reduction is enabled.
+>
+struct SparseGemm {
+
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using OutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  static bool const kSplitKSerial = SplitKSerial;
+
+  static int const kSparse = Mma::kSparse;
+  static int const kMetaSizeInBits = Mma::kMetaSizeInBits;
+  static int const kMaxID2 = Mma::kMaxID2;
+  static int const kElementsPerElementE = Mma::kElementsPerElementE;
+
+  using ElementE = typename Mma::ElementE;
+  using LayoutE = typename Mma::LayoutE;
+
+  /// Warp count (concept: GemmShape); kThreadCount below is 32 threads per warp times this count
+  using WarpCount = typename Mma::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;
+
+  using ParamsA = typename Mma::IteratorA::Params;
+  using TensorRefA = typename Mma::IteratorA::TensorRef;
+  using ParamsB = typename Mma::IteratorB::Params;
+  using TensorRefB = typename Mma::IteratorB::TensorRef;
+  using ParamsE = typename Mma::IteratorE::Params;
+  using TensorRefE = typename Mma::IteratorE::TensorRef;
+
+  /// Parameters structure: SparseParamsBase state (A, B, E operands) plus C/D epilogue tensors, output op params, and the split-K semaphore
+  struct Params : public SparseParamsBase<
+      ThreadblockSwizzle, ParamsA, TensorRefA, ParamsB, TensorRefB,
+      ParamsE, TensorRefE> {
+
+    using Base = SparseParamsBase<
+        ThreadblockSwizzle, ParamsA, TensorRefA, ParamsB, TensorRefB,
+        ParamsE, TensorRefE>;
+
+    //
+    // Data members
+    //
+
+    typename Epilogue::OutputTileIterator::Params params_C;
+    typename Epilogue::OutputTileIterator::TensorRef ref_C;
+    typename Epilogue::OutputTileIterator::Params params_D;
+    typename Epilogue::OutputTileIterator::TensorRef ref_D;
+    typename OutputOp::Params output_op;
+    int *semaphore; // set from the constructor's `workspace` argument; indexed per output tile in operator()
+
+    //
+    // Methods
+    //
+
+    CUTLASS_HOST_DEVICE
+    Params() { }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      cutlass::gemm::GemmCoord const & problem_size,
+      cutlass::gemm::GemmCoord const & grid_tiled_shape,
+      TensorRefA ref_A,
+      TensorRefB ref_B,
+      typename Epilogue::OutputTileIterator::TensorRef ref_C,
+      typename Epilogue::OutputTileIterator::TensorRef ref_D,
+      TensorRefE ref_E,
+      typename OutputOp::Params output_op = typename OutputOp::Params(),
+      int *workspace = nullptr
+    ):
+      Base(problem_size, grid_tiled_shape, ref_A, ref_B, ref_E, Mma::Shape::kK),
+      params_C(ref_C.layout()),
+      ref_C(ref_C),
+      params_D(ref_D.layout()),
+      ref_D(ref_D),
+      output_op(output_op) {
+    semaphore = workspace;
+    }
+  };
+
+  /// Shared memory storage structure; a union, so the main loop and the epilogue occupy the same storage
+  union SharedStorage {
+    typename Mma::SharedStorage main_loop;
+    typename Epilogue::SharedStorage epilogue;
+  };
+
+  //
+  // Methods
+  //
+
+  CUTLASS_HOST_DEVICE
+  SparseGemm() { } 
+
+  /// Determines whether kernel satisfies alignment; returns kErrorMisalignedOperand on any violation, kSuccess otherwise
+  static Status can_implement(
+      cutlass::gemm::GemmCoord const & problem_size,
+      typename Mma::IteratorA::TensorRef ref_A,
+      typename Mma::IteratorB::TensorRef ref_B,
+      typename Epilogue::OutputTileIterator::TensorRef ref_C,
+      typename Epilogue::OutputTileIterator::TensorRef ref_D,
+      typename Mma::IteratorE::TensorRef ref_E) {
+
+    static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
+    static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
+    static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+    static int const kAlignmentE = Mma::IteratorE::AccessType::kElements;
+
+    if (!TensorRef_aligned(ref_A, kAlignmentA)) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (!TensorRef_aligned(ref_B, kAlignmentB)) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (!TensorRef_aligned(ref_C, kAlignmentC)) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (!TensorRef_aligned(ref_D, kAlignmentC)) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (!TensorRef_aligned(ref_E, kAlignmentE)) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if ((problem_size.m() % kAlignmentA) || ((problem_size.k() / kSparse) % kAlignmentA) ||
+      (problem_size.n() % kAlignmentB) || (problem_size.k() % kAlignmentB) ||
+      (problem_size.m() % kAlignmentC) || (problem_size.n() % kAlignmentC) ||
+      (problem_size.m() % kAlignmentE) || ((problem_size.k() / kSparse) % kAlignmentE)) {
+
+      return Status::kErrorMisalignedOperand;
+    }
+
+    // The k dimension has to be a multiple of the Threadblock k because out
+    // of bound metadata would be initialized to 0 by async.zfill but 0 is not
+    // valid metadata.
+    if (problem_size.k() % Mma::Shape::kK) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    // M dimension has to be a multiple of 32 (sparse float) or 16 (sparse int)
+    // because of the row reordering of operand E; selected by sizeof(ElementE) == 2
+    static int const kAlignmentM = (sizeof(ElementE) == 2) ? 32 : 16;
+
+    if (problem_size.m() % kAlignmentM) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    return Status::kSuccess;
+  }
+
+  /// Executes one GEMM: main loop over K for this threadblock's tile, then the epilogue (serialized via semaphore when split-K is active)
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Compute threadblock location
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if CTA is out of range
+    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
+      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
+
+      return;
+    }
+
+    // Compute initial location in logical coordinates (A and E column offsets are divided by kSparse)
+    cutlass::MatrixCoord tb_offset_A{
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      threadblock_tile_offset.k() * params.gemm_k_size / kSparse,
+    };
+
+    cutlass::MatrixCoord tb_offset_B{
+      threadblock_tile_offset.k() * params.gemm_k_size,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    };
+
+    cutlass::MatrixCoord tb_offset_E{
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      threadblock_tile_offset.k() * params.gemm_k_size / kSparse,
+    };
+
+    // Problem size is a function of threadblock index in the K dimension
+    int problem_size_k = min(
+      params.problem_size.k(), 
+      (threadblock_tile_offset.k() + 1) * params.gemm_k_size);
+
+    // Number of threadblock-tile iterations along K for this partition (rounded up to whole Mma::Shape::kK tiles)
+    int gemm_k_iterations = (problem_size_k - tb_offset_B.row() + Mma::Shape::kK - 1) / Mma::Shape::kK;
+
+    // Compute position within threadblock
+    int thread_idx = threadIdx.x;
+
+    // Construct iterators to A, B, and E operands
+    typename Mma::IteratorA iterator_A(
+      params.params_A,
+      params.ref_A.data(),
+      {params.problem_size.m(), problem_size_k / kSparse},
+      thread_idx,
+      tb_offset_A);
+
+    typename Mma::IteratorB iterator_B(
+      params.params_B,
+      params.ref_B.data(),
+      {problem_size_k, params.problem_size.n()},
+      thread_idx,
+      tb_offset_B);
+
+    typename Mma::IteratorE iterator_E(
+        params.params_E, params.ref_E.data(),
+        {params.problem_size.m(),
+         problem_size_k / kSparse / kElementsPerElementE},
+        thread_idx, tb_offset_E);
+
+    // Broadcast the warp_id computed by lane 0 to ensure dependent code
+    // is compiled as warp-uniform.
+    int warp_idx = canonical_warp_idx_sync();
+    int lane_idx = threadIdx.x % 32;
+
+    //
+    // Main loop
+    //
+
+    // Construct threadblock-scoped matrix multiply
+    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
+
+    typename Mma::FragmentC accumulators;
+
+    accumulators.clear();
+
+    if (!kSplitKSerial || gemm_k_iterations > 0) {
+      // Compute threadblock-scoped matrix multiply-add (skipped only when serial split-K leaves this partition no K iterations)
+      mma(gemm_k_iterations, accumulators, iterator_A, iterator_B, iterator_E, accumulators);
+    }
+
+    //
+    // Epilogue
+    //
+
+    OutputOp output_op(params.output_op);
+
+    //
+    // Masked tile iterators constructed from members
+    //
+
+    threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // assume identity swizzle
+    MatrixCoord threadblock_offset(
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    );
+
+    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
+
+    // Construct the semaphore for this output tile (params.semaphore is indexed by the linearized (m, n) tile index).
+    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
+
+    // If performing a reduction via split-K, fetch the initial synchronization
+    if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
+      
+      // Fetch the synchronization lock initially but do not block.
+      semaphore.fetch();
+
+      // Indicate which position in a serial reduction the output operator is currently updating
+      output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
+    }
+
+    // Tile iterator loading from source tensor.
+    typename Epilogue::OutputTileIterator iterator_C(
+      params.params_C,
+      params.ref_C.data(),
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset
+    );
+
+    // Tile iterator writing to destination tensor.
+    typename Epilogue::OutputTileIterator iterator_D(
+      params.params_D,
+      params.ref_D.data(),
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset
+    );
+
+    Epilogue epilogue(
+      shared_storage.epilogue, 
+      thread_idx, 
+      warp_idx, 
+      lane_idx);
+
+    // Wait on the semaphore - this latency may have been covered by iterator construction
+    if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
+        
+      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
+      if (threadblock_tile_offset.k()) {
+        iterator_C = iterator_D;
+      }
+
+      semaphore.wait(threadblock_tile_offset.k());
+
+      __threadfence();
+    }
+
+    // Execute the epilogue operator to update the destination tensor.
+    epilogue(output_op, iterator_D, accumulators, iterator_C); 
+    
+    //
+    // Release the semaphore
+    //
+
+    if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
+      
+      int lock = 0;
+      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
+
+        // The final threadblock resets the semaphore for subsequent grids.
+        lock = 0;
+      }
+      else {
+        // Otherwise, the semaphore is incremented so the next K partition may proceed
+        lock = threadblock_tile_offset.k() + 1;
+      }
+
+      __threadfence();
+      semaphore.release(lock);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sparse_gemm_with_absmax.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sparse_gemm_with_absmax.h
new file mode 100755
index 000000000..f464e29cc
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sparse_gemm_with_absmax.h
@@ -0,0 +1,509 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Sparse GEMM kernel with an epilogue that computes the absolute maximum value of the output
+    and a pre-activation-function auxiliary output. The auxiliary output is also (optionally)
+    stored to global memory.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/kernel/params_sparse_base.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/semaphore.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate 
+  typename Epilogue_,             ///! Epilogue
+  typename ThreadblockSwizzle_,   ///! Threadblock swizzling function
+  bool SplitKSerial               ///! If true, code supporting split-K via serial reduction is enabled.
+>
+struct SparseGemmWithAbsmax {
+
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using OutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  static bool const kSplitKSerial = SplitKSerial;
+
+  static int const kSparse = Mma::kSparse;
+  static int const kMetaSizeInBits = Mma::kMetaSizeInBits;
+  static int const kMaxID2 = Mma::kMaxID2;
+  static int const kElementsPerElementE = Mma::kElementsPerElementE;
+
+  using ElementE = typename Mma::ElementE;
+  using LayoutE = typename Mma::LayoutE;
+
+  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;
+
+  using ParamsA = typename Mma::IteratorA::Params;
+  using TensorRefA = typename Mma::IteratorA::TensorRef;
+  using ParamsB = typename Mma::IteratorB::Params;
+  using TensorRefB = typename Mma::IteratorB::TensorRef;
+  using ParamsE = typename Mma::IteratorE::Params;
+  using TensorRefE = typename Mma::IteratorE::TensorRef;
+
+  using ParamsC = typename Epilogue::OutputTileIterator::Params;
+  using TensorRefC = typename Epilogue::OutputTileIterator::TensorRef;
+  using ParamsD = typename Epilogue::OutputTileIterator::Params;
+  using TensorRefD = typename Epilogue::OutputTileIterator::TensorRef;
+  using ParamsAux = typename Epilogue::AuxOutputTileIterator::Params;
+  using TensorRefAux = typename Epilogue::AuxOutputTileIterator::TensorRef;
+
+  /// Argument structure
+  struct Arguments {
+
+    //
+    // Data members
+    //
+
+    GemmCoord problem_size;
+    TensorRefA ref_A;
+    TensorRefB ref_B;
+    TensorRefC ref_C;
+    TensorRefD ref_D;
+    TensorRefE ref_E;
+    TensorRefAux ref_Aux;
+    void* ptr_Vector;
+    typename LayoutC::Stride::Index ldr;
+
+    typename Epilogue::OutputOp::Params epilogue;
+    int split_k_slices;
+
+    //
+    // Methods
+    //
+
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Arguments(): problem_size(0, 0, 0), split_k_slices(1) {
+
+    }
+
+    /// Constructs an Arguments structure 
+    CUTLASS_HOST_DEVICE
+    Arguments(
+      GemmCoord problem_size_,
+      TensorRefA ref_A_,
+      TensorRefB ref_B_,
+      TensorRefC ref_C_,
+      TensorRefD ref_D_,
+      TensorRefE ref_E_,
+      TensorRefAux ref_Aux_,
+      void* ptr_Vector_,
+      typename LayoutC::Stride::Index ldr_,
+      typename OutputOp::Params epilogue_ = 
+        typename OutputOp::Params(),
+      int split_k_slices = 1
+    ):
+      problem_size(problem_size_),
+      ref_A(ref_A_),
+      ref_B(ref_B_),
+      ref_C(ref_C_),
+      ref_D(ref_D_),
+      ref_E(ref_E_),
+      ref_Aux(ref_Aux_),
+      ptr_Vector(ptr_Vector_),
+      ldr(ldr_),
+      epilogue(epilogue_),
+      split_k_slices(split_k_slices) {
+
+    }
+  };
+
+  /// Parameters structure
+  struct Params : public SparseParamsBase<
+      ThreadblockSwizzle, ParamsA, TensorRefA, ParamsB, TensorRefB,
+      ParamsE, TensorRefE> {
+
+    using Base = SparseParamsBase<
+        ThreadblockSwizzle, ParamsA, TensorRefA, ParamsB, TensorRefB,
+        ParamsE, TensorRefE>;
+
+    //
+    // Data members
+    //
+
+    ParamsC params_C;
+    TensorRefC ref_C;
+    ParamsD params_D;
+    TensorRefD ref_D;
+    ParamsAux params_Aux;
+    TensorRefAux ref_Aux;
+
+    void* ptr_Vector;
+    typename LayoutC::Stride::Index ldr;
+
+    typename OutputOp::Params output_op;
+    int *semaphore;
+
+    //
+    // Methods
+    //
+
+    CUTLASS_HOST_DEVICE
+    Params() { }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      cutlass::gemm::GemmCoord const & problem_size,
+      cutlass::gemm::GemmCoord const & grid_tiled_shape,
+      TensorRefA ref_A,
+      TensorRefB ref_B,
+      TensorRefC ref_C,
+      TensorRefD ref_D,
+      TensorRefE ref_E,
+      TensorRefAux ref_Aux,
+      void* ptr_Vector,
+      typename LayoutC::Stride::Index ldr,
+      typename OutputOp::Params output_op = typename OutputOp::Params(),
+      int *workspace = nullptr
+    ):
+      Base(problem_size, grid_tiled_shape, ref_A, ref_B, ref_E, Mma::Shape::kK),
+      params_C(ref_C.layout()),
+      ref_C(ref_C),
+      params_D(ref_D.layout()),
+      ref_D(ref_D),
+      output_op(output_op),
+      ref_Aux(ref_Aux),
+      params_Aux(ref_Aux.layout()),
+      ptr_Vector(ptr_Vector),
+      ldr(ldr) {
+    semaphore = workspace;
+    }
+  };
+
+  /// Shared memory storage structure
+  union SharedStorage {
+    typename Mma::SharedStorage main_loop;
+    typename Epilogue::SharedStorage epilogue;
+  };
+
+  //
+  // Methods
+  //
+
+  CUTLASS_HOST_DEVICE
+  SparseGemmWithAbsmax() { } 
+
+  /// Determines whether kernel satisfies alignment
+  static Status can_implement(
+      cutlass::gemm::GemmCoord const & problem_size,
+      typename Mma::IteratorA::TensorRef ref_A,
+      typename Mma::IteratorB::TensorRef ref_B,
+      typename Epilogue::OutputTileIterator::TensorRef ref_C,
+      typename Epilogue::OutputTileIterator::TensorRef ref_D,
+      typename Mma::IteratorE::TensorRef ref_E) {
+
+    static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
+    static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
+    static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+    static int const kAlignmentE = Mma::IteratorE::AccessType::kElements;
+
+    if (!TensorRef_aligned(ref_A, kAlignmentA)) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (!TensorRef_aligned(ref_B, kAlignmentB)) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (!TensorRef_aligned(ref_C, kAlignmentC)) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (!TensorRef_aligned(ref_D, kAlignmentC)) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (!TensorRef_aligned(ref_E, kAlignmentE)) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if ((problem_size.m() % kAlignmentA) || ((problem_size.k() / kSparse) % kAlignmentA) ||
+      (problem_size.n() % kAlignmentB) || (problem_size.k() % kAlignmentB) ||
+      (problem_size.m() % kAlignmentC) || (problem_size.n() % kAlignmentC) ||
+      (problem_size.m() % kAlignmentE) || ((problem_size.k() / kSparse) % kAlignmentE)) {
+
+      return Status::kErrorMisalignedOperand;
+    }
+
+    // The k dimension has to be the multiple of the Threadblock k because out
+    // of bound meta data would be initialized to 0 by acync.zfill but 0 is not
+    // a valid meta data.
+    if (problem_size.k() % Mma::Shape::kK) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    // M dimension has to be multiple of 32 (sparse float) or 16 (sparse int) 
+    // because of the row reordering of operand E
+    static int const kAlignmentM = (sizeof(ElementE) == 2) ? 32 : 16;
+
+    if (problem_size.m() % kAlignmentM) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    return Status::kSuccess;
+  }
+
+  /// Executes one GEMM
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Compute threadblock location
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if CTA is out of range
+    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
+      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
+
+      return;
+    }
+
+    // Compute initial location in logical coordinates
+    cutlass::MatrixCoord tb_offset_A{
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      threadblock_tile_offset.k() * params.gemm_k_size / kSparse,
+    };
+
+    cutlass::MatrixCoord tb_offset_B{
+      threadblock_tile_offset.k() * params.gemm_k_size,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    };
+
+    cutlass::MatrixCoord tb_offset_E{
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      threadblock_tile_offset.k() * params.gemm_k_size / kSparse,
+    };
+
+    // Problem size is a function of threadblock index in the K dimension
+    int problem_size_k = min(
+      params.problem_size.k(), 
+      (threadblock_tile_offset.k() + 1) * params.gemm_k_size);
+
+    // Compute threadblock-scoped matrix multiply-add
+    int gemm_k_iterations = (problem_size_k - tb_offset_B.row() + Mma::Shape::kK - 1) / Mma::Shape::kK;
+
+    // Compute position within threadblock
+    int thread_idx = threadIdx.x;
+
+    // Construct iterators to A, B, and E operands
+    typename Mma::IteratorA iterator_A(
+      params.params_A,
+      params.ref_A.data(),
+      {params.problem_size.m(), problem_size_k / kSparse},
+      thread_idx,
+      tb_offset_A);
+
+    typename Mma::IteratorB iterator_B(
+      params.params_B,
+      params.ref_B.data(),
+      {problem_size_k, params.problem_size.n()},
+      thread_idx,
+      tb_offset_B);
+
+    typename Mma::IteratorE iterator_E(
+        params.params_E, params.ref_E.data(),
+        {params.problem_size.m(),
+         problem_size_k / kSparse / kElementsPerElementE},
+        thread_idx, tb_offset_E);
+
+    // Broadcast the warp_id computed by lane 0 to ensure dependent code
+    // is compiled as warp-uniform.
+    int warp_idx = canonical_warp_idx_sync();
+    int lane_idx = threadIdx.x % 32;
+
+    //
+    // Main loop
+    //
+
+    // Construct thread-scoped matrix multiply
+    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
+
+    typename Mma::FragmentC accumulators;
+
+    accumulators.clear();
+
+    if (!kSplitKSerial || gemm_k_iterations > 0) {
+      // Compute threadblock-scoped matrix multiply-add
+      mma(gemm_k_iterations, accumulators, iterator_A, iterator_B, iterator_E, accumulators);
+    }
+
+    //
+    // Epilogue
+    //
+
+    OutputOp output_op(params.output_op);
+
+    //
+    // Masked tile iterators constructed from members
+    //
+
+    threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    //assume identity swizzle
+    MatrixCoord threadblock_offset(
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    );
+
+    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
+
+    // Construct the semaphore.
+    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
+
+    // If performing a reduction via split-K, fetch the initial synchronization
+    if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
+      
+      // Fetch the synchronization lock initially but do not block.
+      semaphore.fetch();
+
+      // Indicate which position in a serial reduction the output operator is currently updating
+      output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
+    }
+
+    typename Epilogue::ElementVector *ptr_Vector = static_cast<typename Epilogue::ElementVector *>(params.ptr_Vector);
+    // Move to appropriate location for this output tile
+    if (ptr_Vector) {
+      ptr_Vector += threadblock_offset.column() + threadblock_tile_offset.m() * params.ldr;
+    }
+
+    // Tile iterator loading from source tensor.
+    typename Epilogue::OutputTileIterator iterator_C(
+      params.params_C,
+      params.ref_C.data(),
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset
+    );
+
+    // Tile iterator writing to destination tensor.
+    typename Epilogue::OutputTileIterator iterator_D(
+      params.params_D,
+      params.ref_D.data(),
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset
+    );
+
+    // Tile iterator writing to auxiliary destination tensor.
+    typename Epilogue::AuxOutputTileIterator iterator_Aux(
+      params.params_Aux,
+      // Only the final block writes the auxiliary tensor
+      ((kSplitKSerial && params.grid_tiled_shape.k() > 1) &&
+          (params.grid_tiled_shape.k() != threadblock_tile_offset.k() + 1))
+          ? nullptr
+          : params.ref_Aux.data(),
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset
+    );
+
+    Epilogue epilogue(
+      shared_storage.epilogue, 
+      thread_idx, 
+      warp_idx, 
+      lane_idx);
+
+    // Wait on the semaphore - this latency may have been covered by iterator construction
+    if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
+        
+      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
+      if (threadblock_tile_offset.k()) {
+        iterator_C = iterator_D;
+      }
+
+      semaphore.wait(threadblock_tile_offset.k());
+
+      __threadfence();
+    }
+
+    // Execute the epilogue operator to update the destination tensor.
+    epilogue(output_op,
+             // Only the final block uses Vector
+             ((kSplitKSerial && params.grid_tiled_shape.k() > 1) &&
+              (params.grid_tiled_shape.k() != threadblock_tile_offset.k() + 1))
+                 ? nullptr
+                 : ptr_Vector,
+             iterator_D,
+             accumulators,
+             iterator_C,
+             iterator_Aux,
+             params.problem_size.mn(),
+             threadblock_offset);
+    
+    //
+    // Release the semaphore
+    //
+
+    if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
+      
+      int lock = 0;
+      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
+
+        // The final threadblock resets the semaphore for subsequent grids.
+        lock = 0;
+      }
+      else {
+        // Otherwise, the semaphore is incremented
+        lock = threadblock_tile_offset.k() + 1;
+      }
+
+      __threadfence();
+      semaphore.release(lock);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sparse_gemm_with_visitor.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sparse_gemm_with_visitor.h
new file mode 100755
index 000000000..364804086
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sparse_gemm_with_visitor.h
@@ -0,0 +1,238 @@
+
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Sparse GEMM with visitor.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/gemm/kernel/sparse_gemm.h"
+#include "cutlass/gemm/kernel/params_sparse_base.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Sparse GEMM kernel that passes its accumulators to an epilogue constructed from Epilogue::FusionCallbacks parameters.
+template <
+  typename Mma_,                  ///< Threadblock-scoped matrix multiply-accumulate
+  typename Epilogue_,             ///< Epilogue
+  typename ThreadblockSwizzle_    ///< Threadblock swizzling function
+>
+struct SparseGemmWithEpilogueVisitor : public SparseGemm<Mma_, Epilogue_, ThreadblockSwizzle_, false>  {
+
+  using Base = SparseGemm<Mma_, Epilogue_, ThreadblockSwizzle_, false>;
+
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+
+  using FusionCallbacks = typename Epilogue::FusionCallbacks;
+
+  using ParamsA = typename Mma::IteratorA::Params;
+  using TensorRefA = typename Mma::IteratorA::TensorRef;
+  using ParamsB = typename Mma::IteratorB::Params;
+  using TensorRefB = typename Mma::IteratorB::TensorRef;
+  using ParamsE = typename Mma::IteratorE::Params;
+  using TensorRefE = typename Mma::IteratorE::TensorRef;
+
+  static int const kSparse = Base::kSparse;
+  static int const kElementsPerElementE = Base::kElementsPerElementE;
+  using SharedStorage = typename Base::SharedStorage;
+
+  /// Kernel parameters: the SparseParamsBase state plus the fusion-callback parameters and an (M, N, 1) problem shape
+  struct Params : public SparseParamsBase<
+      ThreadblockSwizzle, ParamsA, TensorRefA, ParamsB, TensorRefB,
+      ParamsE, TensorRefE> {
+
+    using Base = SparseParamsBase<
+        ThreadblockSwizzle, ParamsA, TensorRefA, ParamsB, TensorRefB,
+        ParamsE, TensorRefE>;
+
+    //
+    // Data members
+    //
+
+    typename FusionCallbacks::Params output_op;  ///< Produced by FusionCallbacks::to_underlying_arguments()
+    cute::Shape<int32_t,int32_t,int32_t> problem_shape;  ///< (problem_size.m(), problem_size.n(), 1)
+
+    //
+    // Methods
+    //
+
+    CUTLASS_HOST_DEVICE
+    Params() { }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      cutlass::gemm::GemmCoord const & problem_size,
+      cutlass::gemm::GemmCoord const & grid_tiled_shape,
+      typename Mma::IteratorA::TensorRef ref_A,
+      typename Mma::IteratorB::TensorRef ref_B,
+      typename Mma::IteratorE::TensorRef ref_E,
+      typename FusionCallbacks::Arguments output_op = typename FusionCallbacks::Arguments()
+    ):
+      Base(problem_size, grid_tiled_shape, ref_A, ref_B, ref_E, Mma::Shape::kK),
+      output_op(FusionCallbacks::to_underlying_arguments(problem_size, output_op, nullptr /*workspace*/)),
+      problem_shape(problem_size.m(), problem_size.n(), 1) {
+    }
+  };
+
+  //
+  // Methods
+  //
+
+  CUTLASS_HOST_DEVICE
+  SparseGemmWithEpilogueVisitor() { }
+
+  /// Executes one GEMM: runs the threadblock main loop, then invokes the epilogue on the accumulators
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Compute threadblock location
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if CTA is out of range
+    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
+      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
+
+      return;
+    }
+
+    // Compute initial location in logical coordinates (A and E offsets along K are divided by kSparse)
+    cutlass::MatrixCoord tb_offset_A{
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      threadblock_tile_offset.k() * params.gemm_k_size / kSparse,
+    };
+
+    cutlass::MatrixCoord tb_offset_B{
+      threadblock_tile_offset.k() * params.gemm_k_size,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    };
+
+    cutlass::MatrixCoord tb_offset_E{
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      threadblock_tile_offset.k() * params.gemm_k_size / kSparse,
+    };
+
+    // Problem size is a function of threadblock index in the K dimension
+    int problem_size_k = min(
+      params.problem_size.k(), 
+      (threadblock_tile_offset.k() + 1) * params.gemm_k_size);
+
+    // Number of Mma::Shape::kK-wide main-loop iterations covering [tb_offset_B.row(), problem_size_k), rounded up
+    int gemm_k_iterations = (problem_size_k - tb_offset_B.row() + Mma::Shape::kK - 1) / Mma::Shape::kK;
+
+    // Compute position within threadblock
+    int thread_idx = threadIdx.x;
+
+    // Construct iterators to A, B, and E operands
+    typename Mma::IteratorA iterator_A(
+      params.params_A,
+      params.ref_A.data(),
+      {params.problem_size.m(), problem_size_k / kSparse},
+      thread_idx,
+      tb_offset_A);
+
+    typename Mma::IteratorB iterator_B(
+      params.params_B,
+      params.ref_B.data(),
+      {problem_size_k, params.problem_size.n()},
+      thread_idx,
+      tb_offset_B);
+
+    typename Mma::IteratorE iterator_E(
+        params.params_E, params.ref_E.data(),
+        {params.problem_size.m(),
+         problem_size_k / kSparse / kElementsPerElementE},
+        thread_idx, tb_offset_E);
+
+    // Broadcast the warp_id computed by lane 0 to ensure dependent code
+    // is compiled as warp-uniform.
+    int warp_idx = canonical_warp_idx_sync();
+    int lane_idx = threadIdx.x % 32;
+
+    //
+    // Main loop
+    //
+
+    // Construct threadblock-scoped matrix multiply
+    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
+
+    typename Mma::FragmentC accumulators;
+
+    accumulators.clear();
+
+    if (gemm_k_iterations > 0) {
+      // Compute threadblock-scoped matrix multiply-add
+      mma(gemm_k_iterations, accumulators, iterator_A, iterator_B, iterator_E, accumulators);
+    }
+
+    //
+    // Re-query the swizzle for this CTA's tile offset (same call as at kernel entry) before the epilogue
+    //
+
+    threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // No linear block index is computed here: nothing in this kernel consumes one.
+
+    //
+    // Epilogue
+    //
+
+    Epilogue epilogue(
+      params.output_op,
+      shared_storage.epilogue, 
+      thread_idx, 
+      warp_idx, 
+      lane_idx);
+
+    // Execute the epilogue operator to update the destination tensor.
+    epilogue(accumulators, threadblock_tile_offset, params.problem_shape, thread_idx);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/static_tile_scheduler.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/static_tile_scheduler.hpp
new file mode 100755
index 000000000..67d346e3b
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/static_tile_scheduler.hpp
@@ -0,0 +1,502 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/fast_math.h"
+#include "cutlass/gemm_coord.hpp"
+#include "cutlass/kernel_hardware_info.hpp"
+#include "cutlass/gemm/kernel/tile_scheduler_params.h"
+#include "cute/layout.hpp"
+#include "cute/tensor.hpp"
+#include "cute/arch/cluster_sm90.hpp"
+#include "cutlass/pipeline/pipeline.hpp"
+namespace cutlass::gemm::kernel::detail {
+
+///////////////////////////////////////////////////////////////////////////////
+
+// CRTP base class for the actual tile schedulers; users are not supposed to use this class directly.
+// Subclass must provide a static get_work_idx_m_and_n() (called from get_current_work_for_linear_idx).
+template<class Subclass>
+class StaticPersistentTileScheduler {
+
+private:
+  uint64_t current_work_linear_idx_;  // Linear index of the work tile currently assigned to this CTA
+  uint64_t total_grid_size_;          // gridDim.x * gridDim.y * gridDim.z; stride used by advance_to_next_work()
+
+public:
+  struct WorkTileInfo {
+    int32_t M_idx = 0;
+    int32_t N_idx = 0;
+    int32_t L_idx = 0;
+    bool is_valid_tile = false;
+
+    CUTLASS_HOST_DEVICE
+    bool
+    is_valid() const {
+      return is_valid_tile;
+    }
+
+    CUTLASS_HOST_DEVICE
+    static WorkTileInfo
+    invalid_work_tile() {
+      return {-1, -1, -1, false};
+    }
+
+    CUTLASS_HOST_DEVICE
+    bool
+    is_final_split(uint32_t k_tiles_per_output_tile) const {
+      return true;
+    }
+
+    CUTLASS_HOST_DEVICE
+    int32_t
+    reduction_subtile_idx() const {
+      return -1;
+    }
+  };
+
+  using Params = PersistentTileSchedulerSm90Params;
+  using RasterOrder = typename Params::RasterOrder;
+  using RasterOrderOptions = typename Params::RasterOrderOptions;
+  static constexpr bool IsDynamicPersistent = false;
+
+public:
+  struct Arguments {
+    int max_swizzle_size = 1;
+    RasterOrderOptions raster_order = RasterOrderOptions::Heuristic;
+  };
+
+  template <class ProblemShapeMNKL, class TileShape, class ClusterShape>
+  static Params
+  to_underlying_arguments(
+      ProblemShapeMNKL problem_shape_mnkl,
+      TileShape tile_shape,
+      ClusterShape cluster_shape,
+      [[maybe_unused]] KernelHardwareInfo const& hw_info,
+      Arguments const& arguments,
+      [[maybe_unused]] void* workspace=nullptr,
+      [[maybe_unused]] const uint32_t epilogue_subtile = 1,
+      [[maybe_unused]] uint32_t ktile_start_alignment_count = 1u) {
+
+    // We only need the tile and cluster shape during scheduler setup, so let FTAD do the magic
+    static_assert(cute::is_static<TileShape>::value);
+    static_assert(cute::is_static<ClusterShape>::value);
+
+    dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape_mnkl, tile_shape, cluster_shape);
+
+    Params params;
+    params.initialize(
+      problem_blocks,
+      to_gemm_coord(cluster_shape),
+      hw_info,
+      arguments.max_swizzle_size,
+      arguments.raster_order
+    );
+
+    return params;
+  }
+
+  CUTLASS_HOST_DEVICE
+  static bool
+  can_implement(Arguments const& args) {
+    return args.max_swizzle_size >= 1;
+  }
+
+  CUTLASS_HOST_DEVICE
+  StaticPersistentTileScheduler() { }
+
+  CUTLASS_DEVICE explicit StaticPersistentTileScheduler(Params const& params_) : scheduler_params(params_) {
+    // MSVC requires protecting use of CUDA-specific nonstandard syntax,
+    // like blockIdx and gridDim, with __CUDA_ARCH__.
+#if defined(__CUDA_ARCH__)
+    if (params_.raster_order_ == RasterOrder::AlongN) {
+      current_work_linear_idx_ = uint64_t(blockIdx.x) + uint64_t(blockIdx.y) * uint64_t(gridDim.x);
+    }
+    else {
+      current_work_linear_idx_ = uint64_t(blockIdx.x) * uint64_t(gridDim.y) + uint64_t(blockIdx.y);
+    }
+
+    total_grid_size_ = uint64_t(gridDim.x) * uint64_t(gridDim.y) * uint64_t(gridDim.z);
+#else
+    CUTLASS_ASSERT(false && "This line should never be reached");
+#endif
+  }
+
+  // Returns the initial work tile info that will be computed over
+  template <class ClusterShape>
+  CUTLASS_DEVICE
+  WorkTileInfo
+  initial_work_tile_info(ClusterShape cluster_shape) {
+    return get_current_work();
+  }
+
+  CUTLASS_DEVICE
+  WorkTileInfo
+  get_current_work() const {
+    return get_current_work_for_linear_idx(current_work_linear_idx_);
+  }
+
+  CUTLASS_DEVICE
+  WorkTileInfo
+  get_current_work_for_linear_idx(uint64_t linear_idx) const {
+    if (linear_idx >= scheduler_params.blocks_per_problem_) {
+      return WorkTileInfo::invalid_work_tile();
+    }
+
+    // Map worker's linear index into the CTA tiled problem shape to the corresponding MNL indices
+    uint64_t work_idx_l, remainder;
+    scheduler_params.divmod_batch_(work_idx_l, remainder, linear_idx);
+
+    uint64_t blk_per_grid_dim = scheduler_params.divmod_cluster_shape_minor_.divide(remainder);
+
+    auto [work_idx_m, work_idx_n] = Subclass::get_work_idx_m_and_n(blk_per_grid_dim,
+                                                         scheduler_params.divmod_cluster_shape_major_,
+                                                         scheduler_params.divmod_cluster_shape_minor_,
+                                                         scheduler_params.divmod_cluster_blk_major_,
+                                                         scheduler_params.log_swizzle_size_,
+                                                         scheduler_params.raster_order_);
+
+    return {work_idx_m, work_idx_n, static_cast<int32_t>(work_idx_l), true};
+  }
+
+  CUTLASS_DEVICE
+  void
+  advance_to_next_work(uint32_t advance_count = 1) {
+    current_work_linear_idx_ += total_grid_size_ * uint64_t(advance_count);
+  }
+
+  CUTLASS_DEVICE
+  bool is_last_tile(WorkTileInfo& work_tile_info, uint32_t advance_count = 1) const {
+    if (continue_current_work(work_tile_info)) {
+      return false;
+    }
+    return not get_current_work_for_linear_idx(
+        current_work_linear_idx_ + (total_grid_size_ * uint64_t(advance_count))
+    ).is_valid();
+  }
+
+  // Computes the linear index within a batch given M and N tile offsets within the batch.
+  // This essentially inverts the mapping performed in get_work_idx_m_and_n
+  static CUTLASS_DEVICE
+  uint64_t
+  get_linear_idx_from_m_and_n(
+    int32_t tile_m,
+    int32_t tile_n,
+    FastDivmodU64Pow2 const& divmod_cluster_shape_major,
+    FastDivmodU64Pow2 const& divmod_cluster_shape_minor,
+    FastDivmodU64 const& divmod_cluster_blk_major,
+    int32_t log_swizzle_size,
+    RasterOrder raster_order) {
+
+    uint64_t minor_work_idx, major_work_idx, cluster_minor_offset;
+    if (raster_order == RasterOrder::AlongN) {
+      minor_work_idx = static_cast<uint64_t>(tile_m);
+      major_work_idx = static_cast<uint64_t>(tile_n);
+      uint64_t cluster_m = divmod_cluster_shape_minor.divide(tile_m) * divmod_cluster_shape_minor.divisor;
+      cluster_minor_offset = tile_m - cluster_m;
+    }
+    else {
+      major_work_idx = static_cast<uint64_t>(tile_m);
+      minor_work_idx = static_cast<uint64_t>(tile_n);
+      uint64_t cluster_n = divmod_cluster_shape_minor.divide(tile_n) * divmod_cluster_shape_minor.divisor;
+      cluster_minor_offset = tile_n - cluster_n;
+    }
+
+    uint64_t cluster_idx_minor, cluster_idx_major, cluster_major_offset;
+    cluster_idx_minor = divmod_cluster_shape_minor.divide(minor_work_idx - cluster_minor_offset);
+    divmod_cluster_shape_major(cluster_idx_major, cluster_major_offset, major_work_idx);
+
+    uint64_t cluster_idx_minor_div_swizzle = cluster_idx_minor >> log_swizzle_size;
+    uint64_t offset = cluster_idx_minor & ((1 << log_swizzle_size) - 1);
+
+    uint64_t extra = cluster_idx_minor_div_swizzle * divmod_cluster_blk_major.divisor + cluster_idx_major;
+
+    uint64_t cluster_id = (extra << log_swizzle_size) | offset;
+    return (cluster_id * divmod_cluster_shape_major.divisor + cluster_major_offset) * divmod_cluster_shape_minor.divisor + cluster_minor_offset;
+  }
+
+  // Given the inputs, computes the total number of output blocks over which this problem will compute.
+  // Note that this is only the logical size of our grid, not the physical grid we will actually launch.
+  template<class ProblemShapeMNKL, class BlockShape, class ClusterShape>
+  CUTLASS_HOST_DEVICE static
+  dim3
+  get_tiled_cta_shape_mnl(ProblemShapeMNKL problem_shape_mnkl, BlockShape cta_shape, ClusterShape cluster_shape) {
+    auto cta_m = cute::size(cute::ceil_div(cute::shape<0>(problem_shape_mnkl), cute::shape<0>(cta_shape)));
+    auto cta_n = cute::size(cute::ceil_div(cute::shape<1>(problem_shape_mnkl), cute::shape<1>(cta_shape)));
+
+    return Params::get_tiled_cta_shape_mnl(
+      to_gemm_coord(problem_shape_mnkl),
+      to_gemm_coord(cluster_shape),
+      cta_m, cta_n
+    );
+  }
+
+  // Overload that receives the current WorkTileInfo to decide the next work tile.
+  // Kernel helper function to get next work tile; the flag in the returned tuple is always true.
+  CUTLASS_DEVICE
+  auto
+  fetch_next_work(WorkTileInfo work_tile_info) {
+    if (continue_current_work(work_tile_info)) {
+      return cute::make_tuple(work_tile_info, true);
+    }
+
+    advance_to_next_work();
+    return cute::make_tuple(get_current_work(), true);
+  }
+
+  // Given the inputs, computes the total number of output blocks over which this problem will compute.
+  // Note that this is only the logical size of our grid, not the physical grid we will actually launch.
+  template<class ProblemShapeMNKL, class TileShape, class AtomThrShape, class ClusterShape>
+  CUTLASS_HOST_DEVICE static
+  dim3
+  get_tiled_cta_shape_mnl(ProblemShapeMNKL problem_shape_mnkl,
+                          TileShape tile_shape_mnk,
+                          AtomThrShape atom_thr_shape_mnk,
+                          ClusterShape cluster_shape_mnk) {
+    auto [tiles_m, tiles_n, tiles_l] = product_each(ceil_div(select<0,1,3>(problem_shape_mnkl), take<0,2>(tile_shape_mnk)));
+    auto cta_m = round_nearest(tiles_m * size<0>(atom_thr_shape_mnk), size<0>(cluster_shape_mnk));
+    auto cta_n = round_nearest(tiles_n * size<1>(atom_thr_shape_mnk), size<1>(cluster_shape_mnk));
+
+    return Params::get_tiled_cta_shape_mnl(
+      to_gemm_coord(problem_shape_mnkl),
+      to_gemm_coord(cluster_shape_mnk),
+      cta_m, cta_n
+    );
+  }
+
+  CUTLASS_DEVICE
+  static auto
+  work_tile_to_cta_coord(WorkTileInfo work_tile_info) {
+    // Get every cta coord in three dimensions of the cluster
+    auto [cta_m_in_cluster, cta_n_in_cluster, cta_l_in_cluster] = cute::block_id_in_cluster();
+    return make_coord(
+      work_tile_info.M_idx + static_cast<int32_t>(cta_m_in_cluster),
+      work_tile_info.N_idx + static_cast<int32_t>(cta_n_in_cluster),
+      _,
+      work_tile_info.L_idx + static_cast<int32_t>(cta_l_in_cluster)
+    );
+  }
+
+  CUTLASS_DEVICE
+  static auto
+  work_tile_to_cta_coord(WorkTileInfo work_tile_info, dim3 block_id_in_cluster) {
+    // Get every cta coord in three dimensions of the cluster
+    auto [cta_m_in_cluster, cta_n_in_cluster, cta_l_in_cluster] = block_id_in_cluster;
+    return make_coord(
+      work_tile_info.M_idx + static_cast<int32_t>(cta_m_in_cluster),
+      work_tile_info.N_idx + static_cast<int32_t>(cta_n_in_cluster),
+      _,
+      work_tile_info.L_idx + static_cast<int32_t>(cta_l_in_cluster)
+    );
+  }
+
+  // Given the inputs, computes the physical grid we should launch; truncate_by_problem_size is forwarded to Params::get_grid_shape.
+  template<class ProblemShapeMNKL, class BlockShape, class ClusterShape>
+  CUTLASS_HOST_DEVICE static
+  dim3
+  get_grid_shape(
+      [[maybe_unused]] Params const& params,
+      ProblemShapeMNKL problem_shape_mnk,
+      BlockShape cta_shape,
+      ClusterShape cluster_shape,
+      KernelHardwareInfo hw_info,
+      Arguments arguments = Arguments{},
+      bool truncate_by_problem_size=true) {
+
+    auto problem_shape_mnkl = cute::append<4>(problem_shape_mnk, cute::Int<1>{});
+    dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape_mnkl, cta_shape, cluster_shape);
+
+    return Params::get_grid_shape(
+      problem_blocks,
+      to_gemm_coord(cluster_shape),
+      hw_info,
+      arguments.max_swizzle_size,
+      arguments.raster_order,
+      truncate_by_problem_size
+    );
+  }
+
+  // Given the inputs, computes the physical grid we should launch (swizzle size and raster order are taken from params).
+  template<class ProblemShapeMNKL, class TileShape, class AtomThrShape, class ClusterShape>
+  static dim3
+  get_grid_shape(
+      Params const& params,
+      ProblemShapeMNKL problem_shape_mnkl,
+      TileShape tile_shape_mnk,
+      AtomThrShape atom_thr_shape_mnk,
+      ClusterShape cluster_shape_mnk,
+      KernelHardwareInfo hw_info) {
+
+    dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape_mnkl, tile_shape_mnk, atom_thr_shape_mnk, cluster_shape_mnk);
+    Arguments args{};
+    if constexpr (!std::is_const_v<decltype(args.max_swizzle_size)>) {
+      args.max_swizzle_size = 1 << params.log_swizzle_size_;
+    }
+    args.raster_order = params.raster_order_ == RasterOrder::AlongN ? RasterOrderOptions::AlongN : RasterOrderOptions::AlongM;
+
+    return Params::get_grid_shape(
+      problem_blocks,
+      to_gemm_coord(cluster_shape_mnk),
+      hw_info,
+      args.max_swizzle_size,
+      args.raster_order,
+      /* truncate_by_problem_size = */true
+    );
+  }
+
+  // Convert CTA-level work tile info to cluster-level tile coord
+  CUTLASS_DEVICE
+  auto
+  work_tile_to_cluster_coord_mnkl(WorkTileInfo work_tile_info) const {
+    // TileScheduler works at CTA-level, kernel works at cluster-level
+    int m_coord = idx2crd(work_tile_info.M_idx / scheduler_params.cluster_shape_m_,
+                          scheduler_params.problem_tiles_m_);
+    int n_coord = idx2crd(work_tile_info.N_idx / scheduler_params.cluster_shape_n_,
+                          scheduler_params.problem_tiles_n_);
+    int l_coord = idx2crd(work_tile_info.L_idx,
+                          scheduler_params.problem_tiles_l_);
+    return make_coord(m_coord, n_coord, _, l_coord);
+  }
+
+  // Returns whether the block assigned this work should compute the epilogue for the corresponding
+  // output tile. For the basic tile scheduler, this is always true.
+  CUTLASS_HOST_DEVICE
+  static bool
+  compute_epilogue(WorkTileInfo const&, Params const&) {
+    return true;
+  }
+
+  CUTLASS_HOST_DEVICE
+  static bool
+  compute_epilogue(WorkTileInfo const&) {
+    return true;
+  }
+
+  // Performs the reduction across splits for a given output tile. Since this scheduler does
+  // not split output tiles, no reduction is needed.
+  template <class FrgTensorC>
+  CUTLASS_DEVICE
+  static void
+  fixup(Params const&, WorkTileInfo const&, FrgTensorC&, uint32_t, uint32_t) {}
+
+  // Performs the reduction across splits for a given output tile. No fixup is required for
+  // work units returned by this scheduler.
+  template <class FrgTensorC>
+  CUTLASS_DEVICE
+  void
+  fixup(WorkTileInfo const&, FrgTensorC&, uint32_t, uint32_t) const { }
+
+  // Returns whether the current WorkTileInfo passed in should continue to be used. Since
+  // this scheduler only schedules work in units of single, full output tiles, the WorkTileInfo
+  // passed in should not be used after having been processed.
+  CUTLASS_DEVICE
+  static bool
+  continue_current_work(WorkTileInfo&) {
+    return false;
+  }
+
+  template <class ProblemShapeMNKL, class TileShape, class Shape>
+  CUTLASS_DEVICE
+  auto
+  get_k_tile_iterator(WorkTileInfo const& work_tile_info, ProblemShapeMNKL problem_shape_MNKL, TileShape tile_shape, Shape) {
+    auto k_tiles = cute::ceil_div(cute::get<2>(problem_shape_MNKL), cute::get<2>(tile_shape));
+    return cute::make_coord_iterator(k_tiles);
+  }
+
+  template <class ProblemShape, class TileShape>
+  CUTLASS_HOST_DEVICE
+  static int
+  get_work_k_tile_count(WorkTileInfo const& work_tile_info, ProblemShape problem_shape, TileShape tile_shape) {
+    // All work units returned by this scheduler cover the entire K iteration
+    // space of the output tile assigned to the work unit.
+    return cute::size(cute::ceil_div(cute::get<2>(problem_shape), cute::get<2>(tile_shape)));
+  }
+
+  CUTLASS_HOST_DEVICE
+  static uint32_t
+  get_work_k_tile_start(WorkTileInfo const&) {
+    // All work units returned by this scheduler start from K tile 0
+    return 0u;
+  }
+
+  CUTLASS_DEVICE
+  static bool
+  need_separate_reduction(Params const& params) {
+    return false;
+  }
+
+  CUTLASS_DEVICE
+  bool
+  is_work_tile_for_reduction(WorkTileInfo const& work_tile_info, Params const& params) {
+    return false;
+  }
+
+  template <class FrgTensorC>
+  CUTLASS_DEVICE
+  void
+  separate_reduction(
+    Params const& params,
+    WorkTileInfo const& work_tile_info,
+    FrgTensorC& accumulators,
+    uint32_t num_barriers,
+    uint32_t barrier_idx) {
+  }
+
+  // Shares the accumulator set with peers in the global workspace (a no-op in this scheduler)
+  template <class FrgTensorC>
+  CUTLASS_DEVICE
+  static void
+  share(
+    Params const& params,
+    WorkTileInfo const& work_tile_info,
+    FrgTensorC& accumulators,
+    uint32_t num_barriers,
+    uint32_t barrier_idx) {
+  }
+
+  CUTLASS_DEVICE
+  static bool
+  valid_warpgroup_in_work_tile(WorkTileInfo const& work_tile_info) {
+    return true;
+  }
+
+  CUTLASS_DEVICE
+  static bool
+  requires_separate_reduction(Params const& params) {
+    return false;
+  }
+
+public:
+  // Sink scheduler params as a member
+  Params scheduler_params;
+};
+
+} // namespace cutlass::gemm::kernel::detail
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/symm_universal.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/symm_universal.h
new file mode 100755
index 000000000..b51cc6ede
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/symm_universal.h
@@ -0,0 +1,675 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
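+    \brief Kernel-level SYMM: a symmetric matrix product computed as two threadblock-scoped TRMMs followed by a single epilogue.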
+
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/complex.h"
+#include "cutlass/semaphore.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Mma1_,                 ///! Threadblock-scoped triangular matrix multiply-accumulate (A*B or B*A)
+  typename Mma2_,                 ///! Threadblock-scoped triangular matrix multiply-accumulate (AT*B or B*AT)
+  typename Epilogue_,             ///! Epilogue
+  typename ThreadblockSwizzle_,   ///! Threadblock swizzling function
+  SideMode SideMode_,             ///! Side Mode for the kernel (kLeft or kRight)
+  FillMode FillMode_              ///! Fill Mode for triangular matrix (kLower or kUpper)
+>
+struct SymmUniversal {
+public:
+
+  using Mma1 = Mma1_;
+  using Mma2 = Mma2_;
+  using Epilogue = Epilogue_;
+  using EpilogueOutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+
+  using ElementA = typename Mma1::IteratorA::Element;
+  using ElementB = typename Mma1::IteratorB::Element;
+
+  // Mma1 (TRMM - with diagonal: C_tmp = alpha * A * B)
+  using LayoutA = typename Mma1::IteratorA::Layout;
+  using LayoutBT = typename Mma1::IteratorB::Layout;
+  static ComplexTransform const kMma1TransformA = Mma1::kTransformA;
+  static ComplexTransform const kMma1TransformB = Mma1::kTransformB;
+
+  // Mma2 (TRMM - withOUT diagonal: alpha * AT * B)
+  using LayoutB = typename Mma2::IteratorA::Layout;
+  using LayoutAT = typename Mma2::IteratorB::Layout;
+  static ComplexTransform const kMma2TransformA = Mma2::kTransformA;
+  static ComplexTransform const kMma2TransformB = Mma2::kTransformB;
+
+  // Common type definitions for Mma1 and Mma2
+  using Operator = typename Mma1::Operator;
+  using OperatorClass = typename Mma1::Operator::OperatorClass;
+  using ThreadblockShape = typename Mma1::Shape;
+  using WarpShape = typename Mma1::Operator::Shape;
+  using InstructionShape = typename Mma1::Policy::Operator::InstructionShape;
+  using ArchTag = typename Mma1::ArchTag;
+
+  static int const kStages = Mma1::kStages;
+  static int const kAlignmentA = Mma1::IteratorA::AccessType::kElements;
+  static int const kAlignmentB = Mma1::IteratorB::AccessType::kElements;
+
+  // Output-related type definitions
+  using ElementC = typename Epilogue::OutputTileIterator::Element;
+  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
+  static SideMode const kSideModeA = SideMode_;
+  static FillMode const kFillModeA = FillMode_;
+  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma1::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;
+
+
+  //
+  // Structures
+  //
+
+  /// Argument structure: mode, problem size, epilogue params, operand pointers, batch strides, leading dimensions
+  struct Arguments {
+
+    // Data members. The in-class initializers describe an empty kGemm problem:
+    // default problem size, batch_count of 1, null operand pointers, and zero
+    // batch strides and leading dimensions.
+
+    GemmUniversalMode mode = GemmUniversalMode::kGemm;
+    GemmCoord problem_size{};
+    int batch_count{1};
+
+    typename EpilogueOutputOp::Params epilogue{};
+
+    void const * ptr_A{nullptr};
+    void const * ptr_B{nullptr};
+    void const * ptr_C{nullptr};
+    void * ptr_D{nullptr};
+
+    int64_t batch_stride_A{0};
+    int64_t batch_stride_B{0};
+    int64_t batch_stride_C{0};
+    int64_t batch_stride_D{0};
+
+    typename LayoutA::Stride::Index lda{0};
+    typename LayoutB::Stride::Index ldb{0};
+    typename LayoutC::Stride::Index ldc{0};
+    typename LayoutC::Stride::Index ldd{0};
+
+    //
+    // Methods
+    //
+
+    Arguments() = default;
+
+    /// Constructs an arguments structure; every parameter is stored in the member of the same name
+    Arguments(
+      GemmUniversalMode mode,
+      GemmCoord problem_size,
+      int batch_count,
+      typename EpilogueOutputOp::Params epilogue,
+      void const * ptr_A,
+      void const * ptr_B,
+      void const * ptr_C,
+      void * ptr_D,
+      int64_t batch_stride_A,
+      int64_t batch_stride_B,
+      int64_t batch_stride_C,
+      int64_t batch_stride_D,
+      typename LayoutA::Stride::Index lda,
+      typename LayoutB::Stride::Index ldb,
+      typename LayoutC::Stride::Index ldc,
+      typename LayoutC::Stride::Index ldd
+    ):
+      mode(mode),
+      problem_size(problem_size),
+      batch_count(batch_count),
+      epilogue(epilogue),
+      ptr_A(ptr_A), ptr_B(ptr_B), ptr_C(ptr_C), ptr_D(ptr_D),
+      // batch_stride_B is stored as given, so swapped_matrices() exchanges real A/B strides.
+      batch_stride_A(batch_stride_A), batch_stride_B(batch_stride_B),
+      batch_stride_C(batch_stride_C), batch_stride_D(batch_stride_D),
+      lda(lda), ldb(ldb), ldc(ldc), ldd(ldd) {
+      }
+
+    /// Returns a copy of these arguments with problem_size.m() and problem_size.n() exchanged
+    Arguments transposed_problem_size() const {
+      Arguments args(*this);
+
+      std::swap(args.problem_size.m(), args.problem_size.n());
+
+      return args;
+    }
+
+    /// Returns a copy with the A and B operands exchanged (pointers, leading dimensions, batch strides)
+    Arguments swapped_matrices() const {
+      Arguments args(*this);
+
+      std::swap(args.ptr_A, args.ptr_B);
+      std::swap(args.lda, args.ldb);
+      std::swap(args.batch_stride_A, args.batch_stride_B);
+
+      return args;
+    }
+  };
+
+  //
+  // Structure for precomputing values in host memory and passing to kernels.
+  // Built from Arguments plus the tiled grid shape, the K extent per k-slice and an optional workspace.
+
+  /// Parameters structure
+  struct Params {
+
+    cutlass::gemm::GemmCoord problem_size{};
+    cutlass::gemm::GemmCoord grid_tiled_shape{};
+    int swizzle_log_tile{0};
+    
+    // Mma1 Iterator A and B params (constructed from args.lda / args.ldb)
+    typename Mma1::IteratorA::Params params_A_mma1{};
+    typename Mma1::IteratorB::Params params_B_mma1{};
+
+    // Mma2 Iterator A and B params (also constructed from args.lda / args.ldb)
+    typename Mma2::IteratorA::Params params_A_mma2{};
+    typename Mma2::IteratorB::Params params_B_mma2{};
+
+    typename Epilogue::OutputTileIterator::Params params_C{};
+    typename Epilogue::OutputTileIterator::Params params_D{};
+    
+    typename EpilogueOutputOp::Params output_op{};
+
+    GemmUniversalMode mode = cutlass::gemm::GemmUniversalMode::kGemm;
+    int batch_count {0};
+    int gemm_k_size {0};
+
+    void * ptr_A{nullptr};
+    void * ptr_B{nullptr};
+    void * ptr_C{nullptr};
+    void * ptr_D{nullptr};
+
+    int64_t batch_stride_A {0};
+    int64_t batch_stride_B {0};
+    int64_t batch_stride_C {0};
+    int64_t batch_stride_D {0};
+
+    int *semaphore{nullptr};  // the workspace pointer viewed as int*; operator() indexes it by output tile
+
+    // Methods. The converting constructor copies every field it needs out of
+    // Arguments; the const qualifier on the operand pointers is cast away
+    // because the members are stored as plain void *.
+    Params() = default;
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      Arguments const &args,
+      cutlass::gemm::GemmCoord const & grid_tiled_shape,
+      int gemm_k_size,
+      void *workspace = nullptr
+    ):
+      problem_size(args.problem_size),
+      grid_tiled_shape(grid_tiled_shape),
+      swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)),
+      params_A_mma1(args.lda),
+      params_B_mma1(args.ldb),
+      params_A_mma2(args.lda),
+      params_B_mma2(args.ldb),
+      params_C(args.ldc),
+      params_D(args.ldd),
+      output_op(args.epilogue),
+      mode(args.mode),
+      batch_count(args.batch_count),
+      gemm_k_size(gemm_k_size),
+      ptr_A(const_cast<void *>(args.ptr_A)),
+      ptr_B(const_cast<void *>(args.ptr_B)),
+      ptr_C(const_cast<void *>(args.ptr_C)),
+      ptr_D(const_cast<void *>(args.ptr_D)),
+      batch_stride_A(args.batch_stride_A),
+      batch_stride_B(args.batch_stride_B),
+      batch_stride_C(args.batch_stride_C),
+      batch_stride_D(args.batch_stride_D),
+      semaphore(static_cast<int *>(workspace)) {
+    }
+
+    CUTLASS_HOST_DEVICE
+    void update(
+      Arguments const &args,
+      void *workspace = nullptr) {
+      // Refreshes only the operand pointers, epilogue params and semaphore; sizes, strides and mode are kept.
+      ptr_A = const_cast<void *>(args.ptr_A);
+      ptr_B = const_cast<void *>(args.ptr_B);
+      ptr_C = const_cast<void *>(args.ptr_C);
+      ptr_D = args.ptr_D;
+
+      output_op = args.epilogue;
+
+      semaphore = static_cast<int *>(workspace);
+    }
+
+  };
+
+  /// Shared memory storage structure; a union, so the two mainloops and the epilogue overlay the same bytes
+  union SharedStorage {
+    typename Mma1::SharedStorage mma1_main_loop;
+    typename Mma2::SharedStorage mma2_main_loop;
+    typename Epilogue::SharedStorage epilogue;
+  };
+
+public:
+
+  //
+  // Methods
+  //
+
+  CUTLASS_DEVICE
+  SymmUniversal() { } // nothing to initialize: the struct declares no non-static data members
+
+  /// Checks the problem extents against the access widths kAlignmentA / kAlignmentB / kAlignmentC
+  static Status can_implement(cutlass::gemm::GemmCoord const & problem_size) {
+    // m and k must both be multiples of kAlignmentA.
+    if ((problem_size.m() % kAlignmentA) || (problem_size.k() % kAlignmentA)) {
+      return Status::kErrorMisalignedOperand;
+    }
+    // n and k must both be multiples of kAlignmentB.
+    if ((problem_size.n() % kAlignmentB) || (problem_size.k() % kAlignmentB)) {
+      return Status::kErrorMisalignedOperand;
+    }
+    // m and n must both be multiples of kAlignmentC.
+    if ((problem_size.m() % kAlignmentC) || (problem_size.n() % kAlignmentC)) {
+      return Status::kErrorMisalignedOperand;
+    }
+    return Status::kSuccess;
+  }
+
+  /// Overload taking the full argument structure; only args.problem_size is examined
+  static Status can_implement(Arguments const &args) {
+    return can_implement(args.problem_size);
+  }
+
+  /// Executes one SYMM output tile: two threadblock-scoped mainloops (Mma1, then Mma2) into one accumulator, then one epilogue
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Compute threadblock location
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if CTA is out of range
+    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
+      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
+      return;
+    }
+   
+    int offset_k = 0;
+    int problem_size_k = params.problem_size.k();
+
+    ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A); 
+    ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);
+
+    // Select this CTA's K range. For kGemm / kGemmSplitKParallel the k tile index picks a
+    // slice of gemm_k_size elements. NOTE(review): ptr_A / ptr_B are never offset per mode in
+    // this function (unlike ptr_C / ptr_D below) - confirm kBatched / kArray inputs are unsupported.
+    if (params.mode == GemmUniversalMode::kGemm || 
+      params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+
+      if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {
+
+        problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size; 
+      }
+
+      offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
+    }
+
+    __syncthreads();
+
+    // Compute initial location in logical coordinates
+    cutlass::MatrixCoord tb_offset_MxK_mma1{
+      threadblock_tile_offset.m() * Mma1::Shape::kM,
+      offset_k,
+    };
+
+    cutlass::MatrixCoord tb_offset_KxN_mma1{
+      offset_k,
+      threadblock_tile_offset.n() * Mma1::Shape::kN
+    };
+
+    cutlass::MatrixCoord tb_offset_MxK_mma2{
+      threadblock_tile_offset.m() * Mma1::Shape::kM,
+      offset_k,
+    };
+
+    cutlass::MatrixCoord tb_offset_KxN_mma2{
+      offset_k,
+      threadblock_tile_offset.n() * Mma1::Shape::kN
+    };
+
+    // Compute position within threadblock
+    int thread_idx = threadIdx.x;
+
+    // Broadcast the warp_id computed by lane 0 to ensure dependent code
+    // is compiled as warp-uniform.
+    int warp_idx = canonical_warp_idx_sync();
+
+    int lane_idx = threadIdx.x % 32;
+
+    //
+    // Main loop
+    //
+
+    // Construct thread-scoped matrix multiply for Mma1
+    Mma1 mma1(shared_storage.mma1_main_loop, thread_idx, warp_idx, lane_idx);
+
+    // Construct thread-scoped matrix multiply for Mma2
+    Mma2 mma2(shared_storage.mma2_main_loop, thread_idx, warp_idx, lane_idx);
+
+    typename Mma1::FragmentC accumulators;
+
+    accumulators.clear();
+
+    // K iterations for this CTA's K range: ceil((problem_size_k - offset_k) / Mma1::Shape::kK)
+    int gemm_k_iterations = (problem_size_k - offset_k + Mma1::Shape::kK - 1) / Mma1::Shape::kK;
+    int gemm_k_iterations_mma1 = gemm_k_iterations;
+    int gemm_k_iterations_mma2 = gemm_k_iterations;
+
+
+    /******************************************************************************************************
+     * SYMM (Side Mode, Fill Mode) is made of two TRMMs:
+      First TRMM (Mma1: Side Mode, Fill Mode, Non-Unit Diag): (A * B) or (B * A)
+      Second TRMM (Mma2: Side Mode, Inverted Fill Mode, Unit Diag): (AT * B) or (B * AT)
+
+     * For the first TRMM (Mma1) of SYMM, the following method is used to calculate the k-iterations:
+      First two cases: (Left Side, Lower Fill) and (Right Side, Upper Fill) are transpose of each other
+        - (Left Side, Lower Fill): calculate bottom of the CTA tile,  then find the k-iterations 
+                                    needed to process all elements till that coordinate.
+        - (Right Side, Upper Fill): calculate right end of the CTA tile,  then find the k-iterations 
+                                    needed to process all elements till that coordinate.
+
+      Last two cases: (Left Side, Upper Fill) and (Right Side, Lower Fill) are transpose of each other
+        - (Left Side, Upper Fill): calculate the top of the CTA tile, then find k-iterations 
+                                   that can be skipped for all elements of this tile.
+        - (Right Side, Lower Fill): calculate the left start of the CTA tile, then find k-iterations 
+                                    that can be skipped for all elements of this tile.
+
+      * For the second TRMM (Mma2) of SYMM, the k-iterations and threadblock offsets are calculated 
+        the same way as the first TRMM (Mma1) of same side mode but with inverted fill mode. 
+        For example, if the first TRMM is left sided with lower fill, the second TRMM would be 
+        left sided with upper fill.
+    ********************************************************************************************************/
+
+    if (kSideModeA == SideMode::kLeft && kFillModeA == FillMode::kLower) {
+
+      int k_iterations_till_diagonal_mma1 = ((threadblock_tile_offset.m() + 1) * Mma1::Shape::kM + Mma1::Shape::kK - 1) / Mma1::Shape::kK;
+      if (k_iterations_till_diagonal_mma1 < gemm_k_iterations) {
+        gemm_k_iterations_mma1  = k_iterations_till_diagonal_mma1;
+      }
+      
+      int k_iterations_till_diagonal_mma2 = ((threadblock_tile_offset.m()) * Mma1::Shape::kM) / Mma1::Shape::kK;
+      if (k_iterations_till_diagonal_mma2 != 0) {
+        tb_offset_MxK_mma2 += cutlass::MatrixCoord({0, k_iterations_till_diagonal_mma2 * Mma1::Shape::kK});
+        tb_offset_KxN_mma2 += cutlass::MatrixCoord({k_iterations_till_diagonal_mma2 * Mma1::Shape::kK, 0});
+        gemm_k_iterations_mma2 -= k_iterations_till_diagonal_mma2;
+      }
+
+    } else if (kSideModeA == SideMode::kRight && kFillModeA == FillMode::kUpper) {
+
+      int k_iterations_till_diagonal_mma1 = ((threadblock_tile_offset.n() + 1) * Mma1::Shape::kN + Mma1::Shape::kK - 1) / Mma1::Shape::kK;
+      if (k_iterations_till_diagonal_mma1 < gemm_k_iterations) {
+        gemm_k_iterations_mma1  = k_iterations_till_diagonal_mma1;
+      }
+
+      int k_iterations_till_diagonal_mma2 = ((threadblock_tile_offset.n()) * Mma1::Shape::kN) / Mma1::Shape::kK;
+      if (k_iterations_till_diagonal_mma2 != 0) {
+        tb_offset_MxK_mma2 += cutlass::MatrixCoord({0, k_iterations_till_diagonal_mma2 * Mma1::Shape::kK});
+        tb_offset_KxN_mma2 += cutlass::MatrixCoord({k_iterations_till_diagonal_mma2 * Mma1::Shape::kK, 0});
+        gemm_k_iterations_mma2 -= k_iterations_till_diagonal_mma2;
+      }
+
+    } else if (kSideModeA == SideMode::kLeft && kFillModeA == FillMode::kUpper) {
+
+      int k_iterations_till_diagonal_mma1 = ((threadblock_tile_offset.m()) * Mma1::Shape::kM) / Mma1::Shape::kK;
+      if (k_iterations_till_diagonal_mma1 != 0) {
+        tb_offset_MxK_mma1 += cutlass::MatrixCoord({0, k_iterations_till_diagonal_mma1 * Mma1::Shape::kK});
+        tb_offset_KxN_mma1 += cutlass::MatrixCoord({k_iterations_till_diagonal_mma1 * Mma1::Shape::kK, 0});
+        gemm_k_iterations_mma1  -= k_iterations_till_diagonal_mma1;
+      }
+
+      int k_iterations_till_diagonal_mma2 = ((threadblock_tile_offset.m() + 1) * Mma1::Shape::kM + Mma1::Shape::kK - 1) / Mma1::Shape::kK;
+      if (k_iterations_till_diagonal_mma2 < gemm_k_iterations) {
+        gemm_k_iterations_mma2  = k_iterations_till_diagonal_mma2;
+      }      
+
+    } else if (kSideModeA == SideMode::kRight && kFillModeA == FillMode::kLower) {
+
+      int k_iterations_till_diagonal_mma1 = ((threadblock_tile_offset.n()) * Mma1::Shape::kN) / Mma1::Shape::kK;
+
+      if (k_iterations_till_diagonal_mma1 != 0) {
+        tb_offset_MxK_mma1 += cutlass::MatrixCoord({0, k_iterations_till_diagonal_mma1 * Mma1::Shape::kK});
+        tb_offset_KxN_mma1 += cutlass::MatrixCoord({k_iterations_till_diagonal_mma1 * Mma1::Shape::kK, 0});
+        gemm_k_iterations_mma1 -= k_iterations_till_diagonal_mma1;
+      }
+
+      int k_iterations_till_diagonal_mma2 = ((threadblock_tile_offset.n() + 1) * Mma1::Shape::kN + Mma1::Shape::kK - 1) / Mma1::Shape::kK;
+      if (k_iterations_till_diagonal_mma2 < gemm_k_iterations) {
+        gemm_k_iterations_mma2  = k_iterations_till_diagonal_mma2;
+      }
+
+    }
+
+    // Construct iterators to A and B operands for Mma1
+    typename Mma1::IteratorA iterator_A_mma1(
+      params.params_A_mma1,
+      ptr_A,
+      {params.problem_size.m(), problem_size_k},
+      thread_idx,
+      tb_offset_MxK_mma1);
+
+    typename Mma1::IteratorB iterator_B_mma1(
+      params.params_B_mma1,
+      ptr_B,
+      {problem_size_k, params.problem_size.n()},
+      thread_idx,
+      tb_offset_KxN_mma1);
+
+    // Construct iterators to A and B operands for Mma2
+    typename Mma2::IteratorA iterator_A_mma2(
+      params.params_A_mma2,
+      ptr_A,
+      {params.problem_size.m(), problem_size_k},
+      thread_idx,
+      tb_offset_MxK_mma2);
+
+    typename Mma2::IteratorB iterator_B_mma2(
+      params.params_B_mma2,
+      ptr_B,
+      {problem_size_k, params.problem_size.n()},
+      thread_idx,
+      tb_offset_KxN_mma2);
+
+    // Compute threadblock-scoped matrix multiply-add (A x B) or (B x A)
+    mma1(
+      gemm_k_iterations_mma1, 
+      accumulators, 
+      iterator_A_mma1, 
+      iterator_B_mma1, 
+      accumulators);
+
+    // Compute threadblock-scoped matrix multiply-add (AT x B) or (B x AT)
+    mma2(
+      gemm_k_iterations_mma2, 
+      accumulators, 
+      iterator_A_mma2, 
+      iterator_B_mma2, 
+      accumulators);
+
+    //
+    // Epilogue
+    //
+
+    EpilogueOutputOp output_op(params.output_op);
+
+    //
+    // Masked tile iterators constructed from members
+    //
+
+    threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // assume identity swizzle
+    MatrixCoord threadblock_offset(
+      threadblock_tile_offset.m() * Mma1::Shape::kM,
+      threadblock_tile_offset.n() * Mma1::Shape::kN
+    );
+
+    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();  // semaphore slot of this output tile
+
+    ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C); 
+    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
+
+    //
+    // Fetch output pointers based on mode (only ptr_C / ptr_D are adjusted).
+    //
+    
+    // Construct the semaphore on this tile's slot; it is only fetched/waited/released for split-K kGemm.
+    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
+
+    if (params.mode == GemmUniversalMode::kGemm) {
+
+      // If performing a reduction via split-K, fetch the initial synchronization
+      if (params.grid_tiled_shape.k() > 1) {
+        
+        // Fetch the synchronization lock initially but do not block.
+        semaphore.fetch();
+
+        // Indicate which position in a serial reduction the output operator is currently updating
+        output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
+      }
+    }
+    else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
+    }
+    else if (params.mode == GemmUniversalMode::kBatched) {
+      ptr_C += threadblock_tile_offset.k() * params.batch_stride_C;
+      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
+    }
+    else if (params.mode == GemmUniversalMode::kArray) {
+      ptr_C = static_cast<ElementC * const *>(params.ptr_C)[threadblock_tile_offset.k()];
+      ptr_D = static_cast<ElementC * const *>(params.ptr_D)[threadblock_tile_offset.k()];
+    }
+
+    // Tile iterator loading from source tensor.
+    typename Epilogue::OutputTileIterator iterator_C(
+      params.params_C,
+      ptr_C,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset
+    );
+
+    // Tile iterator writing to destination tensor.
+    typename Epilogue::OutputTileIterator iterator_D(
+      params.params_D,
+      ptr_D,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset
+    );
+
+    Epilogue epilogue(
+      shared_storage.epilogue, 
+      thread_idx, 
+      warp_idx, 
+      lane_idx);
+
+    // Wait on the semaphore - this latency may have been covered by iterator construction
+    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
+        
+      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
+      if (threadblock_tile_offset.k()) {
+        iterator_C = iterator_D;
+      }
+
+      semaphore.wait(threadblock_tile_offset.k());
+
+      __threadfence();
+    }
+
+    // Execute the epilogue operator to update the destination tensor.
+    epilogue(
+      output_op, 
+      iterator_D, 
+      accumulators, 
+      iterator_C); 
+    
+    //
+    // Release the semaphore
+    //
+
+    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) { 
+
+      int lock = 0;
+      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
+
+        // The final threadblock resets the semaphore for subsequent grids.
+        lock = 0;
+      }
+      else {
+        // Otherwise, the semaphore is incremented
+        lock = threadblock_tile_offset.k() + 1;
+      }
+      
+      semaphore.release(lock);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/tile_scheduler.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/tile_scheduler.hpp
new file mode 100755
index 000000000..2d9b63ffe
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/tile_scheduler.hpp
@@ -0,0 +1,149 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+/*! \file
+    \brief Utilities for selecting default tile schedulers
+*/
+
+#include "cutlass/arch/arch.h"
+#include "cutlass/detail/dependent_false.hpp"
+#include "cutlass/gemm/kernel/sm90_tile_scheduler.hpp"
+#include "cutlass/gemm/kernel/sm90_tile_scheduler_stream_k.hpp"
+#include "cutlass/gemm/kernel/sm90_tile_scheduler_group.hpp"
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm {
+
+////////////////////////////////////////////////////////////////////////////////
+
+//
+// Tags for specifying tile schedulers. They are empty types; the
+// TileSchedulerSelector specializations in this header map each to a scheduler class.
+
+struct PersistentScheduler { }; // Mapped to PersistentTileSchedulerSm90
+
+struct StreamKScheduler { }; // Mapped to PersistentTileSchedulerSm90StreamK (for arch::Sm90 in this header)
+
+struct GroupScheduler { }; // Only used for Grouped GEMMs
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::gemm
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::kernel::detail {
+
+// Selectors mapping tile scheduler tag and arch tag to a tile scheduler class.
+// The primary template is the fallback: instantiating it trips the static_assert,
+// so only tag/arch combinations with a specialization expose a `Scheduler` type.
+
+template <
+  class TileSchedulerTag,
+  class ArchTag,
+  class TileShape,
+  class ClusterShape
+  , class ProblemShapeType = void
+>
+struct TileSchedulerSelector {
+  static_assert(cutlass::detail::dependent_false<ArchTag>,
+      "Could not select a tile scheduler for given parameters.");
+};
+
+template <  // PersistentScheduler tag with any ArchTag; ProblemShapeType stays at its void default
+  class ArchTag,
+  class TileShape,
+  class ClusterShape
+>
+struct TileSchedulerSelector<
+    PersistentScheduler,
+    ArchTag,
+    TileShape,
+    ClusterShape
+  > {
+  using Scheduler = PersistentTileSchedulerSm90;
+};
+
+// Default (void) tag, for any ArchTag: forwards to whatever the PersistentScheduler tag selects
+template <
+  class ArchTag,
+  class TileShape,
+  class ClusterShape
+>
+struct TileSchedulerSelector<
+    void,
+    ArchTag,
+    TileShape,
+    ClusterShape
+  > {
+  using Scheduler = typename TileSchedulerSelector<
+      PersistentScheduler,
+      ArchTag,
+      TileShape,
+      ClusterShape
+  >::Scheduler;
+};
+
+template <  // StreamKScheduler tag on arch::Sm90; the scheduler is parameterized on TileShape and ClusterShape
+  class TileShape,
+  class ClusterShape
+>
+struct TileSchedulerSelector<
+    StreamKScheduler,
+    arch::Sm90,
+    TileShape,
+    ClusterShape
+  > {
+  using Scheduler = PersistentTileSchedulerSm90StreamK<TileShape, ClusterShape>;
+};
+
+template <  // GroupScheduler tag on arch::Sm90; the fifth template argument supplies the group problem shape
+  class TileShape,
+  class ClusterShape
+  , class GroupProblemShape
+>
+struct TileSchedulerSelector<
+    GroupScheduler,
+    arch::Sm90,
+    TileShape,
+    ClusterShape
+    , GroupProblemShape
+  > {
+  using Scheduler = PersistentTileSchedulerSm90Group<GroupProblemShape>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::gemm::kernel::detail
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/tile_scheduler_params.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/tile_scheduler_params.h
new file mode 100755
index 000000000..0972731c2
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/tile_scheduler_params.h
@@ -0,0 +1,1535 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+/*! \file
+    \brief Parameters structures for persistent tile schedulers
+*/
+
+#include "cutlass/coord.h"
+#include "cutlass/kernel_hardware_info.h"
+#include "cutlass/workspace.h"
+#include "cutlass/platform/platform.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/gemm_coord.h"
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+namespace detail {
+
+////////////////////////////////////////////////////////////////////////////////
+
+//
+// Parameters for SM90 tile schedulers
+//
+
+// Parameters for SM90 persistent tile scheduler
+struct PersistentTileSchedulerSm90Params {
+
+  enum class RasterOrder {
+    AlongM,
+    AlongN
+  };
+
+  enum class RasterOrderOptions {
+    Heuristic,  // AlongM when tiles_n > tiles_m, AlongN otherwise (see get_rasterization_order)
+    AlongM,
+    AlongN
+  };
+
+  FastDivmodU64Pow2 divmod_cluster_shape_major_{};
+  FastDivmodU64Pow2 divmod_cluster_shape_minor_{};
+  FastDivmodU64 divmod_batch_{};
+  FastDivmodU64 divmod_cluster_blk_major_{};
+
+  uint64_t blocks_per_problem_ = 0;  // padded blocks_m * blocks_n * blocks_l, set by initialize()
+  int32_t log_swizzle_size_ = 0;     // value produced by get_log_swizzle_size(), in [0, 3]
+  RasterOrder raster_order_ = RasterOrder::AlongN;
+
+  uint32_t problem_tiles_m_ = 0;
+  uint32_t problem_tiles_n_ = 0;
+  uint32_t problem_tiles_l_ = 0;
+  uint32_t cluster_shape_m_ = 0;
+  uint32_t cluster_shape_n_ = 0;
+
+  // Initializes members. This variant of the method should only be used when
+  // problem_shape and tile_shape contain modes of only rank 1.
+  void
+  initialize(
+    BatchedGemmCoord problem_shape,
+    GemmCoord tile_shape,
+    GemmCoord cluster_shape,
+    KernelHardwareInfo const& hw_info,
+    int max_swizzle_size,
+    RasterOrderOptions raster_order_option
+  ) {
+    dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape, tile_shape, cluster_shape);
+    return initialize(
+      problem_blocks,
+      cluster_shape,
+      hw_info,
+      max_swizzle_size,
+      raster_order_option
+    );
+  }
+
+  // Version of initialize that takes in as input the number of CTAs in the M and N and L dimensions.
+  // This is useful for calculating the tiled shape when a mode of problem and/or CTA shape has rank > 1,
+  // for which using CuTe algebra for calculating tile shapes is easiest.
+  void
+  initialize(
+    dim3 problem_blocks,
+    GemmCoord cluster_shape,
+    KernelHardwareInfo const& hw_info,
+    int max_swizzle_size,
+    RasterOrderOptions raster_order_option
+  ) {
+
+    CUTLASS_UNUSED(hw_info);
+
+    // Round up to nearest multiple of swizzle_size along each mode
+    auto log_swizzle_size = get_log_swizzle_size(problem_blocks.x, problem_blocks.y, max_swizzle_size);
+    auto problem_blocks_m = round_up(problem_blocks.x, (1 << log_swizzle_size) * cluster_shape.m());
+    auto problem_blocks_n = round_up(problem_blocks.y, (1 << log_swizzle_size) * cluster_shape.n());
+
+    problem_tiles_m_ = problem_blocks_m / cluster_shape.m();
+    problem_tiles_n_ = problem_blocks_n / cluster_shape.n();
+    problem_tiles_l_ = problem_blocks.z;
+    cluster_shape_m_ = cluster_shape.m();
+    cluster_shape_n_ = cluster_shape.n();
+
+    RasterOrder raster_order = get_rasterization_order(
+      problem_blocks_m,
+      problem_blocks_n,
+      raster_order_option
+    );
+
+    // Set members.
+    // Block-count products are formed in 64 bits so that they cannot wrap in
+    // narrower arithmetic before being stored in the 64-bit members below.
+
+    blocks_per_problem_ = static_cast<uint64_t>(problem_blocks_m) * problem_blocks_n * problem_blocks.z;
+    log_swizzle_size_ = log_swizzle_size;
+    raster_order_ = raster_order;
+    divmod_batch_ = FastDivmodU64(static_cast<uint64_t>(problem_blocks_m) * problem_blocks_n);
+
+    if (raster_order == RasterOrder::AlongN) {
+      divmod_cluster_shape_major_ = FastDivmodU64Pow2(cluster_shape.n());
+      divmod_cluster_shape_minor_ = FastDivmodU64Pow2(cluster_shape.m());
+      divmod_cluster_blk_major_ = FastDivmodU64(problem_blocks_n / cluster_shape.n());
+    }
+    else {
+      divmod_cluster_shape_major_ = FastDivmodU64Pow2(cluster_shape.m());
+      divmod_cluster_shape_minor_ = FastDivmodU64Pow2(cluster_shape.n());
+      divmod_cluster_blk_major_ = FastDivmodU64(problem_blocks_m / cluster_shape.m());
+    }
+  }
+
+  // Given the inputs, computes the physical grid we should launch.
+  // This variant of the method should only be used when
+  // problem_shape and tile_shape contain modes of only rank 1.
+  CUTLASS_HOST_DEVICE static
+  dim3
+  get_grid_shape(
+    BatchedGemmCoord problem_shape,
+    GemmCoord cta_shape,
+    GemmCoord cluster_shape,
+    KernelHardwareInfo hw_info,
+    int max_swizzle_size,
+    RasterOrderOptions raster_order_option,
+    bool truncate_by_problem_size=true
+    ) {
+
+    dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape, cta_shape, cluster_shape);
+    return get_grid_shape(
+      problem_blocks,
+      cluster_shape,
+      hw_info,
+      max_swizzle_size,
+      raster_order_option,
+      truncate_by_problem_size
+    );
+  }
+
+  // Version of get_grid_shape that takes in as input the number of CTAs in the M and N and L dimensions.
+  // This is useful for calculating the tiled shape when a mode of problem and/or CTA shape has rank > 1,
+  // for which using CuTe algebra for calculating tile shapes is easiest.
+  CUTLASS_HOST_DEVICE static
+  dim3
+  get_grid_shape(
+    dim3 problem_blocks,
+    GemmCoord cluster_shape,
+    KernelHardwareInfo hw_info,
+    int max_swizzle_size,
+    RasterOrderOptions raster_order_option,
+    bool truncate_by_problem_size=true
+    ) {
+    int const sm_count = hw_info.sm_count;
+
+    // Round up to nearest multiple of swizzle_size along each mode
+    auto log_swizzle_size = get_log_swizzle_size(problem_blocks.x, problem_blocks.y, max_swizzle_size);
+    auto problem_blocks_m = round_up(problem_blocks.x, (1 << log_swizzle_size) * cluster_shape.m());
+    auto problem_blocks_n = round_up(problem_blocks.y, (1 << log_swizzle_size) * cluster_shape.n());
+
+    // Form the total in 64 bits and clamp it to int range; it is only used as an upper bound in min() below.
+    uint64_t const problem_blocks_total_u64 = static_cast<uint64_t>(problem_blocks_m) * problem_blocks_n * problem_blocks.z;
+    int problem_blocks_total = static_cast<int>(platform::min(problem_blocks_total_u64, uint64_t(0x7fffffff)));
+
+    RasterOrder raster_order = get_rasterization_order(
+      problem_blocks_m,
+      problem_blocks_n,
+      raster_order_option
+    );
+
+    dim3 launch_grid;
+
+    if (raster_order == RasterOrder::AlongN) {
+      launch_grid = dim3(cluster_shape.m(), 1, 1);
+    }
+    else {
+      launch_grid = dim3(1, cluster_shape.n(), 1);
+    }
+
+    auto possibly_truncate = [&](int x, int y) {
+      if (truncate_by_problem_size) {
+        return platform::min(x, y);
+      }
+      else {
+        return x;
+      }
+    };
+
+    // The else path is generic, however, we can avoid some divs if we know cluster size is 1
+    auto cluster_size = cluster_shape.m() * cluster_shape.n();
+    if (cluster_size == 1) {
+      if (raster_order == RasterOrder::AlongN) {
+        launch_grid.y = possibly_truncate(sm_count, problem_blocks_total);
+      }
+      else {
+        launch_grid.x = possibly_truncate(sm_count, problem_blocks_total);
+      }
+    }
+    else {
+      int cta_per_device = sm_count;
+      /*
+      * Optimal grid size calculation is based on
+      * GH100: 8 GPCs, 72 TPCs (9 TPCs/GPC), 2 SMs/TPC, 144 SMs per full GPU
+      * Hence, maximum SMs per GPC = 18
+      */
+      constexpr int max_sm_per_gpc = 18;
+      // Provided SM count could possibly be less than the assumed maximum SMs per GPC
+      int const min_num_gpc = sm_count < max_sm_per_gpc ? 1 : sm_count / max_sm_per_gpc;
+      int const max_cta_occupancy_per_gpc = max_sm_per_gpc - (max_sm_per_gpc % cluster_size);
+      cta_per_device = min_num_gpc * max_cta_occupancy_per_gpc;
+
+      // The calculation below allows for larger grid size launch for different GPUs.
+      int const num_gpc_residual = sm_count < max_sm_per_gpc ? 0 : sm_count % max_sm_per_gpc;
+      int const max_cta_occupancy_per_residual_gpc = num_gpc_residual - (num_gpc_residual % cluster_size);
+      cta_per_device += max_cta_occupancy_per_residual_gpc;
+
+      if (sm_count < cta_per_device) {
+        cta_per_device = sm_count;
+      }
+      if (raster_order == RasterOrder::AlongN) {
+        launch_grid.y = possibly_truncate(
+            cta_per_device       / cluster_shape.m(),
+            problem_blocks_total / cluster_shape.m());
+      }
+      else {
+        launch_grid.x = possibly_truncate(
+            cta_per_device       / cluster_shape.n(),
+            problem_blocks_total / cluster_shape.n());
+      }
+    }
+    return launch_grid;
+  }
+
+  // Returns the log2 swizzle size (0-3), limited by max_swizzle_size and by the smaller CTA-count dimension.
+  CUTLASS_HOST_DEVICE static int32_t
+  get_log_swizzle_size(int problem_ctas_m, int problem_ctas_n, int max_swizzle_size) {
+    int min_cta_dim = platform::min(problem_ctas_m, problem_ctas_n);
+    if (max_swizzle_size >= 8 && min_cta_dim >= 6) {
+      return 3;
+    }
+    else if (max_swizzle_size >= 4 && min_cta_dim >= 3) {
+      return 2;
+    }
+    else if (max_swizzle_size >= 2 && min_cta_dim >= 2) {
+      return 1;
+    }
+    else {
+      return 0;
+    }
+  }
+
+  // Resolves a RasterOrderOptions value to a concrete RasterOrder; Heuristic compares tiles_n against tiles_m.
+  CUTLASS_HOST_DEVICE static RasterOrder
+  get_rasterization_order(
+    uint32_t tiles_m,
+    uint32_t tiles_n,
+    RasterOrderOptions raster_order_option
+  ) {
+
+    if (raster_order_option == RasterOrderOptions::Heuristic) {
+      if (tiles_n > tiles_m) {
+        return RasterOrder::AlongM;
+      }
+      else {
+        return RasterOrder::AlongN;
+      }
+    }
+    else {
+      switch (raster_order_option) {
+        case RasterOrderOptions::AlongN:
+          return RasterOrder::AlongN;
+          break;
+        default:
+          return RasterOrder::AlongM;
+      }
+    }
+  }
+
+  // Get the number of CTA tiles in this problem. This variant of the method should only be used when
+  // problem_shape and tile_shape contain modes of only rank 1.
+  CUTLASS_HOST_DEVICE
+  static dim3
+  get_tiled_cta_shape_mnl(BatchedGemmCoord problem_shape, GemmCoord cta_shape, GemmCoord cluster_shape) {
+    auto cta_m = (problem_shape.m() + cta_shape.m() - 1) / cta_shape.m();
+    auto cta_n = (problem_shape.n() + cta_shape.n() - 1) / cta_shape.n();
+
+    return get_tiled_cta_shape_mnl(problem_shape, cluster_shape, cta_m, cta_n);
+  }
+
+  // Version of get_tiled_cta_shape_mnl that takes in as input the number of CTAs in the M and N dimensions.
+  // This is useful for calculating the tiled shape when a mode of problem and/or CTA shape has rank > 1,
+  // for which using CuTe algebra for calculating tile shapes is easiest.
+  CUTLASS_HOST_DEVICE
+  static dim3
+  get_tiled_cta_shape_mnl(BatchedGemmCoord problem_shape, GemmCoord cluster_shape, uint32_t cta_m, uint32_t cta_n) {
+
+    // Round up to nearest multiple of cluster dim along each mode
+    auto problem_blocks_m = ((cta_m + cluster_shape.m() - 1) / cluster_shape.m()) * cluster_shape.m();
+    auto problem_blocks_n = ((cta_n + cluster_shape.n() - 1) / cluster_shape.n()) * cluster_shape.n();
+
+    return {
+      static_cast<uint32_t>(problem_blocks_m),
+      static_cast<uint32_t>(problem_blocks_n),
+      static_cast<uint32_t>(problem_shape.batch())
+    };
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Parameters for SM90 persistent stream-K scheduler
+struct PersistentTileSchedulerSm90StreamKParams {
+
+  // Strategies for computing reductions between CTAs computing portions of a given output tile
+  enum class ReductionMode {
+    // Participating CTAs perform reduction in a turnstile fashion in order of the K extent
+    // covered by each CTA. This requires a lock to be held exclusively by the CTA that is
+    // currently accumulating.
+    //
+    // Turnstile accumulation ensures deterministic numeric behavior when using this mode.
+    Deterministic,
+
+    // Participating CTAs perform reduction atomically to the same workspace (mostly) without locking.
+    // Locks are used only to wait for the first CTA to write its partial values (to initialize the
+    // workspace), and for all but the final CTA to have accumulated (so that the final CTA can load
+    // the accumulated value and accumulate it into registers on top of which the epilogue will
+    // be performed).
+    //
+    // Due to the nondeterministic ordering of accumulation, deterministic numeric behavior cannot
+    // be guaranteed with this mode (e.g., floating-point rounding error will depend on the order
+    // of accumulation).
+    Nondeterministic
+  };
+
+  // Strategies for decomposing the problem
+  enum class DecompositionMode {
+    // Let initialize() choose among data-parallel, split-K, and stream-K (split-K is taken when splits > 1)
+    Heuristic,
+    // Force a data-parallel decomposition
+    DataParallel,
+    // Force a split-K decomposition. This should be paired with setting the `splits` parameter
+    SplitK,
+    // Force a stream-K decomposition
+    StreamK
+  };
+
+  using UnderlyingParams = PersistentTileSchedulerSm90Params;
+  using RasterOrder = UnderlyingParams::RasterOrder;
+  using RasterOrderOptions = UnderlyingParams::RasterOrderOptions;
+
+  // Cluster dimensions are typically always a power of 2, so use
+  // the power-of-two variants of FastDivmod for these.
+  FastDivmodU64Pow2 divmod_cluster_shape_major_{};
+  FastDivmodU64Pow2 divmod_cluster_shape_minor_{};
+
+  FastDivmodU64 divmod_batch_{};
+  FastDivmodU64 divmod_cluster_blk_major_{};
+
+  // Total number of cluster-sized output tiles (i.e., not including any
+  // splitting factors). This is primarily used for split-K decompositions,
+  // and may be overridden in other decompositions.
+  FastDivmodU64 divmod_clusters_mnl_{};
+
+  // We divide up the number of stream-K tiles amongst G groups of stream-K units.
+  // The stream-K units within a group collaborate to compute over the `sk_tiles / G`
+  // tiles assigned to that group. Non-unit group sizes can help to preserve L2 locality of
+  // partial chunks computed by stream-K units -- units 0 in each group will compute identical K extents
+  // of tiles that would be assigned in the same wave according to the rasterization order of the
+  // data-parallel formulation of the problem.
+  FastDivmodU64 divmod_sk_groups_{};
+
+  // Number of stream-K units in each group
+  FastDivmodU64 divmod_sk_units_per_group_{};
+
+  uint64_t units_per_problem_ = 0;
+  FastDivmod divmod_tiles_per_output_tile_{};
+  int32_t log_swizzle_size_ = 0;
+  RasterOrder raster_order_ = RasterOrder::AlongN;
+
+  // The splitting factor to be used in a split-K decomposition of the problem.
+  // If this is set to a value greater than 1, stream-K decomposition logic
+  // is bypassed in favor of a split-K decomposition.
+  FastDivmod divmod_splits_{};
+
+  // Number of stream-K or split-K work units that compute an extra k iteration.
+  // This is done to handle residuals in dividing up the k iteration space.
+  // For stream-K, since the actual assignment of work to stream-K units will be done
+  // at the granularity of a cluster, we store only the number of big clusters.
+  uint32_t big_units_ = 0;
+
+  // The number of groups of stream-K units that will process an extra stream-K tile cluster.
+  uint32_t big_groups_ = 0;
+
+  // Workspace for holding partial accumulators to be reduced across stream-K/split-K units
+  void* reduction_workspace_ = nullptr;
+
+  // Number of tiles covered by stream-K work units
+  uint32_t sk_tiles_ = 0;
+
+  // Number of work units computing stream-K tiles
+  uint32_t sk_units_ = 0;
+
+  // Number of tiled k iterations computed by each stream-K work unit. This
+  // can potentially cover more than one output tile.
+  FastDivmod divmod_k_tiles_per_sk_unit_{};
+  // Number of tiled k iterations computed by each "big" stream-K units, which
+  // processes one more K chunk than a "normal" stream-K unit.
+  FastDivmod divmod_k_tiles_per_sk_big_unit_{};
+
+  // Strategy to use when reducing between collaborating CTAs
+  ReductionMode reduction_mode_ = ReductionMode::Deterministic;
+
+  // The number of sub blocks in the kernel epilogue
+  FastDivmodU64 divmod_epilogue_subtile_{};
+
+  // The number of blocks that launched for doing separate reduction
+  uint32_t separate_reduction_units_ = 0;
+
+  // Minimum number of k tiles that can be assigned to a stream-K unit
+  static constexpr uint32_t min_iters_per_sk_unit_ = 8u;
+
+  // Maximum number of groups of stream-K units
+  static constexpr uint32_t max_sk_groups_ = 8u;
+
+  // ktile start from even for each cta
+  uint32_t ktile_start_alignment_count { 1u };
+
+  // Returns dividend divided by the cluster size (major divisor first, then minor divisor)
+  CUTLASS_HOST_DEVICE
+  uint64_t
+  div_cluster_size(uint64_t dividend) const {
+    // Chain the two fast divmods instead of performing a single integer
+    // division by the product major.divisor * minor.divisor.
+    auto const after_major = divmod_cluster_shape_major_.divide(dividend);
+    auto const after_both = divmod_cluster_shape_minor_.divide(after_major);
+    return after_both;
+  }
+
+  // Returns the product of the two cluster-shape divisors
+  CUTLASS_HOST_DEVICE uint64_t
+  get_cluster_size() const {
+    return divmod_cluster_shape_major_.divisor * divmod_cluster_shape_minor_.divisor;
+  }
+
+  // True when a nonzero number of separate reduction units has been configured
+  CUTLASS_HOST_DEVICE
+  bool
+  requires_separate_reduction() const {
+    return separate_reduction_units_ != 0u;
+  }
+
+  // Upper bound on the number of peers that can collaborate on one output tile
+  CUTLASS_HOST_DEVICE
+  static uint32_t
+  max_peers_per_tile(uint64_t sk_units, uint64_t sk_tiles) {
+    // An exact split yields (sk_units / sk_tiles) peers per SK tile. When the
+    // division is inexact, extra SK units may cover the beginning and the end
+    // of a tile, which contributes at most 2 additional peers.
+    uint64_t const even_share = sk_units / sk_tiles;
+    uint64_t const with_overlap = even_share + 2;
+    return static_cast<uint32_t>(with_overlap);
+  }
+
+  // Populates the members from coordinate-style shapes. Use this overload only when
+  // every mode of problem_shape and tile_shape has rank 1.
+  void
+  initialize(
+    BatchedGemmCoord problem_shape,
+    GemmCoord tile_shape,
+    GemmCoord cluster_shape,
+    KernelHardwareInfo hw_info,
+    int splits,
+    int max_swizzle,
+    RasterOrderOptions raster_order_option,
+    ReductionMode reduction_mode,
+    DecompositionMode decomposition_mode,
+    void* workspace,
+    const uint32_t epilogue_subtile = 1
+  ) {
+    // Ceiling division of the K extent by the K tile extent: k tiles per output tile
+    uint32_t k_tile_count = (problem_shape.k() + tile_shape.k() - 1) / tile_shape.k();
+
+    dim3 cta_counts_mnl = UnderlyingParams::get_tiled_cta_shape_mnl(
+      problem_shape, tile_shape, cluster_shape);
+
+    initialize(
+      cta_counts_mnl,
+      k_tile_count,
+      cluster_shape,
+      hw_info,
+      splits,
+      max_swizzle,
+      raster_order_option,
+      reduction_mode,
+      decomposition_mode,
+      workspace,
+      epilogue_subtile
+    );
+  }
+
+  // Version of initialize that takes in as input the number of CTAs in the M and N and L dimensions.
+  // This is useful for calculating the tiled shape when a mode of problem and/or CTA shape has rank > 1,
+  // for which using CuTe algebra for calculating tile shapes is easiest.
+  void
+  initialize(
+    dim3 problem_blocks,
+    uint32_t k_tiles_per_output_tile,
+    GemmCoord cluster_shape,
+    KernelHardwareInfo hw_info,
+    int splits,
+    int max_swizzle,
+    RasterOrderOptions raster_order_option,
+    ReductionMode reduction_mode,
+    DecompositionMode decomposition_mode,
+    void* workspace,
+    const uint32_t epilogue_subtile = 1
+  ) {
+    UnderlyingParams underlying_params;
+    underlying_params.initialize(
+      problem_blocks,
+      cluster_shape,
+      hw_info,
+      max_swizzle,
+      raster_order_option
+    );
+
+    auto problem_blocks_l = problem_blocks.z;
+    // Padded block counts; the product is formed in 64 bits so it cannot wrap in narrower arithmetic.
+    auto problem_blocks_m = round_up(problem_blocks.x, (1 << underlying_params.log_swizzle_size_) * cluster_shape.m());
+    auto problem_blocks_n = round_up(problem_blocks.y, (1 << underlying_params.log_swizzle_size_) * cluster_shape.n());
+    uint64_t output_tiles = static_cast<uint64_t>(problem_blocks_m) * problem_blocks_n * problem_blocks_l;
+
+    // Reduction workspace is at the beginning of the workspace. Lock workspace follows.
+    void* reduction_workspace = workspace;
+
+    if (decomposition_mode == DecompositionMode::SplitK ||
+        (decomposition_mode == DecompositionMode::Heuristic && splits > 1)) {
+      // Short circuit to basic split-K decomposition
+
+      // Don't split by more than the available number of SMs
+      if (splits > hw_info.sm_count) {
+        splits = hw_info.sm_count;
+      }
+
+      // Don't split by more than the K tile iterations
+      //
+      // splits is almost certainly nonnegative here (e.g., hw_info.sm_count,
+      // despite being an int, is a count), so it can safely be converted to unsigned
+      // in the comparison to avoid a signed-unsigned comparison warning-as-error.
+      if (static_cast<decltype(k_tiles_per_output_tile)>(splits) > k_tiles_per_output_tile) {
+        splits = k_tiles_per_output_tile;
+      }
+
+      // If splits == k_tiles_per_output_tile, there will be one k_tile per CTA,
+      //   which violates the k_tile start alignment requirement. Thus we need to
+      //   reduce the number of splits.
+      if (ktile_start_alignment_count > 1u &&
+           static_cast<decltype(k_tiles_per_output_tile)>(splits) == k_tiles_per_output_tile) {
+        splits = k_tiles_per_output_tile / ktile_start_alignment_count;
+      }
+
+      set_params_basic(
+        underlying_params,
+        problem_blocks_m,
+        problem_blocks_n,
+        problem_blocks_l,
+        splits,
+        k_tiles_per_output_tile,
+        reduction_workspace,
+        reduction_mode
+      );
+      return;
+    }
+
+    // Calculate the maximum number of blocks from clusters of shape cluster_shape that we
+    // can fit within sm_count SMs.
+    dim3 grid = get_grid_shape(
+      problem_blocks,
+      cluster_shape,
+      hw_info,
+      max_swizzle,
+      raster_order_option
+    );
+
+    uint64_t ctas_per_wave = grid.x * grid.y;
+    auto cluster_size = cluster_shape.m() * cluster_shape.n();
+    // The number of output tiles to be computed in stream-K and data-parallel fashion, respectively.
+    uint32_t sk_tiles = get_num_sk_tiles(
+      output_tiles,
+      ctas_per_wave,
+      cluster_size,
+      k_tiles_per_output_tile,
+      decomposition_mode
+    );
+    uint64_t dp_tiles = output_tiles - sk_tiles;
+
+    // Calculate the number of work units covering the data-parallel and stream-K tiles.
+    // A "work unit" is a single index in the linearized ID space used by the scheduler.
+    // We distinguish it from a "block," which is typically tied to a hardware unit
+    // (e.g., the callers into this scheduler will be persistent thread blocks).
+    // A work unit can encompass multiple output tiles worth of work (as will be the
+    // case for stream-K blocks).
+    // Since splitting is not required for data-parallel tiles, only one data-parallel unit
+    // is needed per data-parallel tile.
+    uint64_t dp_units = dp_tiles;
+
+    uint64_t ctas_per_sk_wave = ctas_per_wave;
+    uint64_t sk_units = get_num_sk_units(cluster_shape, ctas_per_sk_wave, sk_tiles, k_tiles_per_output_tile);
+
+    if (decomposition_mode == DecompositionMode::DataParallel ||
+        (decomposition_mode == DecompositionMode::Heuristic && sk_tiles == 0) ||
+        sk_units == 0) {
+      // Short circuit to basic data-parallel decomposition
+      set_params_basic(
+        underlying_params,
+        problem_blocks_m,
+        problem_blocks_n,
+        problem_blocks_l,
+        /* splits = */ 1,
+        k_tiles_per_output_tile,
+        reduction_workspace,
+        reduction_mode
+      );
+      return;
+    }
+
+    bool do_separate_reduction = should_perform_separate_reduction(
+      epilogue_subtile, sk_units, sk_tiles, dp_tiles, ctas_per_wave);
+
+    // Determine the number of stream-K groups that will be used. We currently use
+    // max_sk_groups_ unless this extends beyond the extent of the dimension over
+    // which the problem is rasterized. For example, if the tiled problem shape
+    // (in CTA_M x CTA_N representation) when using 1x1 clusters is 4x16,
+    // and we rasterize along the M dimension, we choose 4 groups, rather than 8.
+    // If the cluster shape is 2x1, we choose 2 groups (CTA_M / CLUSTER_M).
+    uint32_t max_groups_problem;
+    if (underlying_params.raster_order_ == RasterOrder::AlongM) {
+      max_groups_problem = problem_blocks_m / cluster_shape.m();
+    }
+    else {
+      max_groups_problem = problem_blocks_n / cluster_shape.n();
+    }
+
+    // Select the number of groups that will be used. We start with the maximum
+    // number of potential groups, and iterate down looking for a group size that
+    // evenly divides the stream-K units and tiles, and for which the resulting
+    // number of K tiles per stream-K unit remains above min_iters_per_sk_unit_
+
+    uint32_t groups = platform::min(max_groups_problem, uint32_t(max_sk_groups_));
+
+    // Grouping is disabled when separate reduction is used
+    if (do_separate_reduction) {
+      groups = 1;
+    }
+
+    uint32_t fallback_groups = 0;
+    auto sk_cluster_tiles = sk_tiles / cluster_size;
+    auto sk_cluster_units = sk_units / cluster_size;
+
+    auto sk_splits_too_small = [&](uint32_t g) {
+      // Check whether the number of K tiles computed per stream-K unit is less
+      // than min_iters_per_sk_unit_
+      auto total_sk_cluster_tiles = (sk_cluster_tiles / g) * cluster_size;
+      auto total_sk_k_tiles = total_sk_cluster_tiles * k_tiles_per_output_tile;
+      auto k_tiles_per_sk_unit = total_sk_k_tiles / (sk_units / g);
+      return k_tiles_per_sk_unit < min_iters_per_sk_unit_;
+    };
+
+    auto is_ideal_grouping = [&](uint32_t g) {
+      // An ideal grouping will evenly divide stream-K clusters, evenly divide
+      // stream-K tiles, and not result in stream-K splits that are too small.
+      return (sk_cluster_units % g == 0) && (sk_cluster_tiles % g == 0) && !sk_splits_too_small(g);
+    };
+
+    auto is_valid_grouping = [&](uint32_t g) {
+      // A grouping is valid, but not ideal, if it evenly divides the
+      // stream-K clusters and does not result in stream-K splits that are
+      // too small. Such a setting can be used as a fallback option in the
+      // case that an ideal grouping is not achievable
+      return sk_cluster_units % g == 0 && !sk_splits_too_small(g);
+    };
+
+    while (groups > 1 && !is_ideal_grouping(groups)) {
+      if (fallback_groups == 0 && is_valid_grouping(groups)) {
+        // Set fallback groups once in preference for a larger number of groups.
+        fallback_groups = groups;
+      }
+      --groups;
+    }
+
+    // If groups == 1, we did not find a group count that satisfies all criteria. If we have
+    // found a fallback group count, use this instead.
+    if (groups == 1 && fallback_groups > 0) {
+      groups = fallback_groups;
+    }
+
+    auto sk_units_per_group = sk_units / groups;
+
+    // sk_tiles is guaranteed to be divisible by cluster_size because it is calculated as:
+    //    sk_tiles = (waves <= 2) ? total_tiles : (sm_count + (total_tiles % sm_count))
+    // Both total_tiles and sm_count are multiples of cluster size due to padding added
+    // prior to kernel launch.
+    uint64_t sk_cluster_tiles_per_group = sk_cluster_tiles / groups;
+    uint64_t sk_tiles_per_group = sk_cluster_tiles_per_group * cluster_size;
+
+    // Groups that will process an extra stream-K tile cluster. These differ from "big_units," which
+    // are stream-K units within a group that process an extra K chunk.
+    uint64_t sk_big_groups = sk_cluster_tiles % groups;
+
+    uint64_t k_tiles_per_group = k_tiles_per_output_tile * sk_tiles_per_group;
+
+    // Number of k tiles computed per stream-K unit
+    uint64_t k_tiles_per_sk_unit = k_tiles_per_group / sk_units_per_group;
+
+    uint32_t reduction_units = 0;
+
+    // Use separate reduction when we have less than one wave of output tiles (dp_tiles == 0)
+    // and when each tile will be operated on by at least two stream-K units (sk_units > 2 * sk_tiles)
+    if (do_separate_reduction) {
+      // Each reduction unit will reduce the partials of an epilogue subtile for
+      // a given output tile and compute the epilogue. Thus, there are as many reduction
+      // units as there are epilogue subtiles.
+      reduction_units = sk_tiles * epilogue_subtile;
+    }
+    else if (decomposition_mode == DecompositionMode::Heuristic && sk_tiles < sk_units && sk_units % sk_tiles == 0) {
+      // If the number of stream-K units is a multiple of the number of stream-K tiles, then
+      // the problem can leverage a basic split-K decomposition for the stream-K tiles.
+      // This case happens when separate reduction is disabled.
+      uint32_t sk_splits = static_cast<uint32_t>(sk_units / sk_tiles);
+      set_params_basic(
+        underlying_params,
+        problem_blocks_m,
+        problem_blocks_n,
+        problem_blocks_l,
+        sk_splits,
+        k_tiles_per_output_tile,
+        reduction_workspace,
+        reduction_mode
+      );
+      return;
+    }
+    divmod_cluster_shape_major_ = underlying_params.divmod_cluster_shape_major_;
+    divmod_cluster_shape_minor_ = underlying_params.divmod_cluster_shape_minor_;
+    divmod_batch_ = underlying_params.divmod_batch_;
+    divmod_tiles_per_output_tile_ = FastDivmod(k_tiles_per_output_tile);
+    divmod_cluster_blk_major_ = underlying_params.divmod_cluster_blk_major_;
+    divmod_sk_groups_ = FastDivmodU64(static_cast<uint64_t>(groups));
+    divmod_sk_units_per_group_ = FastDivmodU64(static_cast<uint64_t>(sk_units / groups));
+
+    // Override divmod_clusters_mnl_ to be the number of cluster-sized stream-K units.
+    // This setting ensures that the use of this divmod for stream-K decompositions
+    // is essentially a no-op.
+    divmod_clusters_mnl_ = FastDivmodU64(sk_units / cluster_size);
+    divmod_splits_ = FastDivmod(1);
+    log_swizzle_size_ = underlying_params.log_swizzle_size_;
+    units_per_problem_ = dp_units + sk_units;  // 64-bit member: keep the full 64-bit sum, no narrowing
+    raster_order_ = underlying_params.raster_order_;
+
+    // Assign big_units_ assuming that group count == 1. This is unused by stream-K
+    // when group count > 1.
+    big_units_ = static_cast<uint32_t>(k_tiles_per_group % k_tiles_per_sk_unit);
+
+    big_groups_ = static_cast<uint32_t>(sk_big_groups);
+    reduction_workspace_ = reduction_workspace;
+    sk_tiles_ = sk_tiles;
+    sk_units_ = static_cast<uint32_t>(sk_units);
+    divmod_k_tiles_per_sk_unit_ = FastDivmod(static_cast<uint32_t>(k_tiles_per_sk_unit));
+    divmod_k_tiles_per_sk_big_unit_ = FastDivmod(static_cast<uint32_t>(k_tiles_per_sk_unit + 1));
+    reduction_mode_ = reduction_mode;
+    divmod_epilogue_subtile_ = FastDivmodU64(epilogue_subtile);
+    separate_reduction_units_ = reduction_units;
+  }
+
+  // Computes the physical grid to launch for a problem given as problem/CTA/cluster shapes.
+  // The shapes are first converted to CTA counts, which are then handed to the dim3 overload.
+  // This variant should only be used when problem_shape and tile_shape contain modes of only rank 1.
+  CUTLASS_HOST_DEVICE
+  static dim3
+  get_grid_shape(
+    BatchedGemmCoord problem_shape,
+    GemmCoord cta_shape,
+    GemmCoord cluster_shape,
+    KernelHardwareInfo hw_info,
+    int max_swizzle_size,
+    RasterOrderOptions raster_order_option
+  ) {
+    // Number of CTAs along the M, N and L dimensions, as computed by UnderlyingParams.
+    dim3 problem_blocks = UnderlyingParams::get_tiled_cta_shape_mnl(problem_shape, cta_shape, cluster_shape);
+
+    return get_grid_shape(
+      problem_blocks,
+      cluster_shape,
+      hw_info,
+      max_swizzle_size,
+      raster_order_option
+    );
+  }
+
+  // Version of get_grid_shape that takes in as input the number of CTAs in the M, N and L dimensions.
+  // This is useful for calculating the tiled shape when a mode of problem and/or CTA shape has rank > 1,
+  // for which using CuTe algebra for calculating tile shapes is easiest.
+  CUTLASS_HOST_DEVICE
+  static dim3
+  get_grid_shape(
+    dim3 problem_blocks,
+    GemmCoord cluster_shape,
+    KernelHardwareInfo hw_info,
+    int max_swizzle_size,
+    RasterOrderOptions raster_order_option
+  ) {
+    // All arguments are forwarded unchanged; only the truncation flag is fixed by this wrapper.
+    // Call into the underlying get_grid_shape method, but do not allow the grid shape returned
+    // to be truncated based on the number of output tiles in the problem.
+    return UnderlyingParams::get_grid_shape(
+      problem_blocks,
+      cluster_shape,
+      hw_info,
+      max_swizzle_size,
+      raster_order_option,
+      /* truncate_by_problem_size = */false
+    );
+  }
+
+  // Returns how many of the `output_tiles` total output tiles are assigned to stream-K work
+  // (rather than data-parallel work) on a device that fits `ctas_per_wave` CTAs in each wave.
+  static uint32_t
+  get_num_sk_tiles(
+    uint64_t output_tiles,
+    uint64_t ctas_per_wave,
+    uint64_t cluster_size,
+    uint32_t k_tiles_per_output_tile,
+    DecompositionMode decomposition_mode
+  ) {
+    // Number of completely filled waves, and number of waves once a partial tail wave is counted.
+    uint32_t const num_full_waves = static_cast<uint32_t>(output_tiles / ctas_per_wave);
+    uint32_t const num_waves = static_cast<uint32_t>((output_tiles + ctas_per_wave - 1) / ctas_per_wave);
+
+    // Data-parallel and basic split-K decompositions never schedule stream-K tiles.
+    switch (decomposition_mode) {
+      case DecompositionMode::DataParallel:
+      case DecompositionMode::SplitK:
+        return 0;
+      default:
+        break;
+    }
+
+    if (decomposition_mode == DecompositionMode::Heuristic) {
+      // Without wave quantization, or without enough k tiles to split, every tile
+      // is computed in a data-parallel fashion.
+      bool const no_quantization = (num_full_waves == num_waves);
+      if (no_quantization || k_tiles_per_output_tile <= min_iters_per_sk_unit_) {
+        return 0;
+      }
+
+      // The last wave is only partially occupied. Rudimentary heuristic: still prefer a
+      // data-parallel decomposition when that tail wave is at least half full.
+      // This is subject to change.
+      uint64_t const tiles_in_tail = output_tiles - (num_full_waves * ctas_per_wave);
+      if (ctas_per_wave <= 2 * tiles_in_tail) {
+        return 0;
+      }
+    }
+
+    // If there is wave quantization, the last two waves' worth of tiles are covered by stream-K
+    // work and the remainder is data-parallel. In that case num_full_waves == num_waves - 1, so
+    // the number of data-parallel waves is num_full_waves - 1 (or 0 when num_full_waves <= 1).
+    uint64_t const data_parallel_waves = (num_full_waves > 1) ? (num_full_waves - 1) : 0;
+    uint64_t const data_parallel_tiles = data_parallel_waves * ctas_per_wave;
+    return static_cast<uint32_t>(output_tiles - data_parallel_tiles);
+  }
+
+  CUTLASS_HOST_DEVICE
+  static uint64_t
+  get_num_sk_units(GemmCoord cluster_shape, uint64_t ctas_per_sk_wave, uint32_t sk_tiles, uint32_t k_tiles_per_output_tile) {
+    // Returns the number of stream-K work units used to cover `sk_tiles` stream-K tiles,
+    // each of which spans `k_tiles_per_output_tile` k iterations.
+    // If there are stream-K tiles to compute and a sufficiently large number of k iterations
+    // across them, they will be covered by a single wave of persistent threadblocks. Thus, there
+    // will be as many work units as there are threadblocks in a single wave (`ctas_per_sk_wave`).
+    //
+    // When the total k iterations across stream-K tiles is too small to justify distributing
+    // across an entire wave of blocks, we instead distribute the iterations over a smaller
+    // set of blocks. The result is 0 when `sk_tiles` is 0.
+
+    // Number of k iterations computed by the stream-K units as a whole. Widen before
+    // multiplying: the product of two uint32_t values can exceed 32 bits.
+    uint64_t k_tiles_sk_total = static_cast<uint64_t>(k_tiles_per_output_tile) * sk_tiles;
+
+    // Calculate the number of stream-K units that would be needed if each stream-K unit
+    // computed the minimum allowable k iterations. Truncate this to be in units of clusters.
+    auto cluster_size = cluster_shape.m() * cluster_shape.n();
+    uint64_t min_sized_sk_units = (k_tiles_sk_total / min_iters_per_sk_unit_);
+    min_sized_sk_units = (min_sized_sk_units / cluster_size) * cluster_size;
+
+    // Never use more units than fit in one stream-K wave.
+    return platform::min(ctas_per_sk_wave, min_sized_sk_units);
+  }
+
+  // Bytes of barrier workspace for `num_tiles` tiles: `barrier_bits` per MMA warp group, L2-line aligned
+  CUTLASS_HOST_DEVICE
+  static size_t
+  get_barrier_workspace_size(uint64_t num_tiles, uint32_t mma_warp_groups, uint32_t barrier_bits) {
+    size_t const bits_per_tile = static_cast<size_t>(mma_warp_groups) * static_cast<size_t>(barrier_bits);
+    return round_up_to_l2_alignment(bits_to_bytes<size_t>(num_tiles * bits_per_tile));
+  }
+
+  // Bytes of scratch space for the partial accumulators of `num_tiles` output tiles, L2-line aligned
+  CUTLASS_HOST_DEVICE
+  static size_t
+  get_reduction_workspace_size(uint64_t num_tiles, GemmCoord tile_shape, uint32_t accumulator_bits, uint32_t num_accumulator_mtxs = 1) {
+    size_t const elements_per_tile = tile_shape.m() * tile_shape.n();
+    size_t const bits_per_tile = accumulator_bits * elements_per_tile * num_accumulator_mtxs;
+    return round_up_to_l2_alignment(bits_to_bytes<size_t>(bits_per_tile * num_tiles));
+  }
+
+  #if !defined(__CUDACC_RTC__)
+  static void
+  get_workspace_component_sizes(
+    dim3 problem_blocks,
+    uint32_t k_tiles_per_output_tile,
+    GemmCoord tile_shape,
+    GemmCoord cluster_shape,
+    size_t& barrier_workspace_size,
+    size_t& reduction_workspace_size,
+    KernelHardwareInfo const& hw_info,
+    int splits,
+    int max_swizzle,
+    RasterOrderOptions raster_order_option,
+    DecompositionMode decomposition_mode,
+    uint32_t mma_warp_groups,
+    uint32_t barrier_bits,
+    uint32_t accumulator_bits,
+    uint32_t epilogue_subtile = 1,
+    uint32_t num_accumulator_mtxs = 1) {
+    // Writes the byte sizes of the barrier and reduction workspaces into the two size_t& output parameters.
+    auto log_swizzle_size = UnderlyingParams::get_log_swizzle_size(problem_blocks.x, problem_blocks.y, max_swizzle);
+    problem_blocks.x = round_up(problem_blocks.x, (1 << log_swizzle_size) * cluster_shape.m());
+    problem_blocks.y = round_up(problem_blocks.y, (1 << log_swizzle_size) * cluster_shape.n());
+
+    // Workspace is needed only for output tiles that will be split. Start from the total number of output
+    // tiles, computed in 64 bits (dim3 members are 32-bit, so their product could otherwise wrap).
+    uint64_t output_tiles = static_cast<uint64_t>(problem_blocks.x) * problem_blocks.y * problem_blocks.z;
+
+    if (decomposition_mode == DecompositionMode::DataParallel) {
+      barrier_workspace_size = 0;
+      reduction_workspace_size = 0;
+    }
+    else if (splits > 1 &&
+             (decomposition_mode == DecompositionMode::SplitK || decomposition_mode == DecompositionMode::Heuristic)) {
+      // Basic split-K variant requires workspace for all output tiles
+      barrier_workspace_size = get_barrier_workspace_size(output_tiles, mma_warp_groups, barrier_bits);
+      reduction_workspace_size = get_reduction_workspace_size(output_tiles, tile_shape, accumulator_bits, num_accumulator_mtxs);
+    }
+    else {
+      KernelHardwareInfo new_hw_info;
+      new_hw_info.device_id = hw_info.device_id;
+      new_hw_info.sm_count = hw_info.sm_count;
+      if (new_hw_info.sm_count <= 0) {
+        CUTLASS_TRACE_HOST("  WARNING: Arguments do not include a valid SM count.\n"
+            "  For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count.");
+        new_hw_info.sm_count = KernelHardwareInfo::query_device_multiprocessor_count(new_hw_info.device_id);
+      }
+
+      dim3 grid = get_grid_shape(
+        problem_blocks,
+        cluster_shape,
+        new_hw_info,
+        max_swizzle,
+        raster_order_option
+      );
+      uint64_t ctas_per_wave = static_cast<uint64_t>(grid.x) * grid.y;
+      uint64_t cluster_size = cluster_shape.m() * cluster_shape.n();
+      uint32_t sk_tiles = get_num_sk_tiles(
+        output_tiles,
+        ctas_per_wave,
+        cluster_size,
+        static_cast<uint32_t>(k_tiles_per_output_tile),
+        decomposition_mode
+      );
+      uint64_t ctas_per_sk_wave = ctas_per_wave;
+      uint64_t sk_units = get_num_sk_units(cluster_shape, ctas_per_sk_wave, sk_tiles, k_tiles_per_output_tile);
+      uint64_t dp_tiles = output_tiles - sk_tiles;
+
+      uint64_t reduction_tiles = sk_tiles;
+      if (should_perform_separate_reduction(epilogue_subtile, sk_units, sk_tiles, dp_tiles, ctas_per_wave)) {
+        // In separate reduction, each peer writes to its own location in scratch space.
+        // Thus, for separate reduction, we need as many reduction tiles per output tile
+        // as there are the maximum number of peers that can collaborate on an output tile.
+        reduction_tiles *= max_peers_per_tile(sk_units, sk_tiles);
+      }
+
+      // Though separate reduction requires a larger reduction workspace, only one barrier
+      // is needed per output tile. Each peer will increment the barrier by one once the peer has
+      // written its accumulator to scratch space. The separate reduction unit will only begin
+      // performing the reduction when the barrier has reached the number of peers for the output tile.
+      barrier_workspace_size = get_barrier_workspace_size(sk_tiles, mma_warp_groups, barrier_bits);
+      reduction_workspace_size = get_reduction_workspace_size(reduction_tiles, tile_shape, accumulator_bits, num_accumulator_mtxs);
+    }
+  }
+  #endif // !defined(__CUDACC_RTC__)
+
+  // Returns whether separate reduction should be used. Called with (epilogue_subtile, sk_units, sk_tiles, dp_tiles, ctas_per_wave).
+  CUTLASS_HOST_DEVICE
+  static bool
+  should_perform_separate_reduction(uint32_t, uint64_t, uint64_t, uint64_t, uint64_t) {
+    // Separate reduction is temporarily disabled, pending fixes, so the (unnamed) arguments are ignored.
+    return false;
+  }
+
+  // Get the amount of scratch workspace needed for the kernel. This variant of the method should only be used when
+  // problem_shape and tile_shape contain modes of only rank 1. The work is delegated to the dim3 overload.
+  static size_t
+  get_workspace_size(
+    BatchedGemmCoord problem_shape,
+    GemmCoord tile_shape,
+    GemmCoord cluster_shape,
+    KernelHardwareInfo const& hw_info,
+    int splits,
+    int max_swizzle,
+    RasterOrderOptions raster_order_option,
+    DecompositionMode decomposition_mode,
+    uint32_t mma_warp_groups,
+    uint32_t barrier_bits,
+    uint32_t element_accumulator_bits,
+    uint32_t epilogue_subtile,
+    uint32_t num_accumulator_mtxs) {
+    // Convert the shapes to CTA counts, and K to a (rounded-up) number of k tiles per output tile.
+    dim3 problem_blocks = UnderlyingParams::get_tiled_cta_shape_mnl(problem_shape, tile_shape, cluster_shape);
+    uint32_t k_tiles_per_output_tile = (problem_shape.k() + tile_shape.k() - 1) / tile_shape.k();
+
+    return get_workspace_size(
+      problem_blocks,
+      k_tiles_per_output_tile,
+      tile_shape,
+      cluster_shape,
+      hw_info,
+      splits,
+      max_swizzle,
+      raster_order_option,
+      decomposition_mode,
+      mma_warp_groups,
+      barrier_bits,
+      element_accumulator_bits,
+      epilogue_subtile,
+      num_accumulator_mtxs
+    );
+  }
+
+  // Version of get_workspace_size that takes in as input the number of CTAs per dimension (`problem_blocks`).
+  // This is useful for calculating the tiled shape when a mode of problem and/or CTA shape has rank > 1,
+  // for which using CuTe algebra for calculating tile shapes is easiest.
+  static size_t
+  get_workspace_size(
+    dim3 problem_blocks,
+    uint32_t k_tiles_per_output_tile,
+    GemmCoord tile_shape,
+    GemmCoord cluster_shape,
+    KernelHardwareInfo const& hw_info,
+    int splits,
+    int max_swizzle,
+    RasterOrderOptions raster_order_option,
+    DecompositionMode decomposition_mode,
+    uint32_t mma_warp_groups,
+    uint32_t barrier_bits,
+    uint32_t element_accumulator_bits,
+    uint32_t epilogue_subtile = 1,
+    uint32_t num_accumulator_mtxs = 1) {
+    // Both sizes stay zero when __CUDACC_RTC__ is defined, because the helper call below is compiled out.
+    size_t barrier_workspace_size = 0;
+    size_t reduction_workspace_size = 0;
+
+    #if !defined(__CUDACC_RTC__)
+      get_workspace_component_sizes(
+        problem_blocks,
+        k_tiles_per_output_tile,
+        tile_shape,
+        cluster_shape,
+        barrier_workspace_size,
+        reduction_workspace_size,
+        hw_info,
+        splits,
+        max_swizzle,
+        raster_order_option,
+        decomposition_mode,
+        mma_warp_groups,
+        barrier_bits,
+        element_accumulator_bits,
+        epilogue_subtile,
+        num_accumulator_mtxs
+      );
+    #endif
+    // The total workspace is the sum of the barrier and reduction components.
+    return barrier_workspace_size + reduction_workspace_size;
+  }
+
+  // Initialize the workspace to be used for the kernel. This variant of the method should only be used when
+  // problem_shape and tile_shape contain modes of only rank 1. The work is delegated to the dim3 overload.
+  static cutlass::Status
+  initialize_workspace(
+    void* workspace,
+    cudaStream_t stream,
+    BatchedGemmCoord problem_shape,
+    GemmCoord tile_shape,
+    GemmCoord cluster_shape,
+    KernelHardwareInfo const& hw_info,
+    int splits,
+    int max_swizzle,
+    RasterOrderOptions raster_order_option,
+    DecompositionMode decomposition_mode,
+    uint32_t mma_warp_groups,
+    uint32_t barrier_bits,
+    uint32_t element_accumulator_bits,
+    uint32_t epilogue_subtile,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    // Convert the shapes to CTA counts, and K to a (rounded-up) number of k tiles per output tile.
+    dim3 problem_blocks = UnderlyingParams::get_tiled_cta_shape_mnl(problem_shape, tile_shape, cluster_shape);
+    uint32_t k_tiles_per_output_tile = (problem_shape.k() + tile_shape.k() - 1) / tile_shape.k();
+
+    return initialize_workspace(
+      workspace,
+      stream,
+      problem_blocks,
+      k_tiles_per_output_tile,
+      tile_shape,
+      cluster_shape,
+      hw_info,
+      splits,
+      max_swizzle,
+      raster_order_option,
+      decomposition_mode,
+      mma_warp_groups,
+      barrier_bits,
+      element_accumulator_bits,
+      epilogue_subtile,
+      1, // num_accumulator_mtxs: this variant has no such parameter and always passes 1
+      cuda_adapter
+    );
+  }
+
+  // Version of initialize_workspace that takes in as input the number of CTAs per dimension (`problem_blocks`),
+  // which is useful when a mode of problem and/or CTA shape has rank > 1. Zeroes the barrier region of `workspace`.
+  // Returns kErrorWorkspaceNull if barriers are needed but `workspace` is null, and kSuccess if none are needed.
+  static cutlass::Status
+  initialize_workspace(
+    void* workspace,
+    cudaStream_t stream,
+    dim3 problem_blocks,
+    uint32_t k_tiles_per_output_tile,
+    GemmCoord tile_shape,
+    GemmCoord cluster_shape,
+    KernelHardwareInfo const& hw_info,
+    int splits,
+    int max_swizzle,
+    RasterOrderOptions raster_order_option,
+    DecompositionMode decomposition_mode,
+    uint32_t mma_warp_groups,
+    uint32_t barrier_bits,
+    uint32_t element_accumulator_bits,
+    uint32_t epilogue_subtile = 1,
+    uint32_t num_accumulator_mtxs = 1,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+
+    #if !defined(__CUDACC_RTC__)
+      size_t barrier_workspace_size = 0;   // size_t (not uint64_t): bound to the size_t& outputs below
+      size_t reduction_workspace_size = 0;
+
+      get_workspace_component_sizes(
+        problem_blocks,
+        k_tiles_per_output_tile,
+        tile_shape,
+        cluster_shape,
+        barrier_workspace_size,
+        reduction_workspace_size,
+        hw_info,
+        splits,
+        max_swizzle,
+        raster_order_option,
+        decomposition_mode,
+        mma_warp_groups,
+        barrier_bits,
+        element_accumulator_bits,
+        epilogue_subtile,
+        num_accumulator_mtxs
+      );
+
+      if (barrier_workspace_size > 0) {
+        if (workspace == nullptr) {
+          return Status::kErrorWorkspaceNull;
+        }
+
+        // Only the barrier workspace needs to be cleared for stream-K.
+        // Barrier workspace follows reduction workspace.
+        uint8_t* barrier_workspace = reinterpret_cast<uint8_t*>(workspace) + reduction_workspace_size;
+        return zero_workspace(static_cast<void*>(barrier_workspace), barrier_workspace_size, stream, cuda_adapter);
+      }
+    #endif // !defined(__CUDACC_RTC__)
+
+    return Status::kSuccess;
+  }
+
+  void
+  set_params_basic(
+    UnderlyingParams const& underlying_params,
+    uint32_t blocks_m,
+    uint32_t blocks_n,
+    uint32_t blocks_l,
+    uint32_t splits,
+    uint32_t k_tiles_per_output_tile,
+    void* reduction_workspace,
+    ReductionMode reduction_mode) {
+    // Configures a "basic" (data-parallel or split-K) decomposition; `splits` is used as a divisor and must be nonzero.
+    divmod_cluster_shape_major_ = underlying_params.divmod_cluster_shape_major_;
+    divmod_cluster_shape_minor_ = underlying_params.divmod_cluster_shape_minor_;
+    divmod_batch_ = FastDivmodU64(static_cast<uint64_t>(blocks_m) * blocks_n);  // widen first: 32-bit product can wrap
+    divmod_tiles_per_output_tile_ = FastDivmod(k_tiles_per_output_tile);
+    divmod_sk_groups_ = FastDivmodU64(1u);
+    auto cluster_size = underlying_params.divmod_cluster_shape_major_.divisor * underlying_params.divmod_cluster_shape_minor_.divisor;
+    divmod_clusters_mnl_ = FastDivmodU64((static_cast<uint64_t>(blocks_m) * blocks_n * blocks_l) / cluster_size);
+    divmod_splits_ = FastDivmod(splits);
+    divmod_cluster_blk_major_ = underlying_params.divmod_cluster_blk_major_;
+    log_swizzle_size_ = underlying_params.log_swizzle_size_;
+    units_per_problem_ = blocks_m * blocks_n * blocks_l;
+    raster_order_ = underlying_params.raster_order_;
+    big_units_ = k_tiles_per_output_tile % splits;
+    reduction_workspace_ = reduction_workspace;
+    reduction_mode_ = reduction_mode;
+    divmod_k_tiles_per_sk_unit_ = FastDivmod(k_tiles_per_output_tile / splits);
+    divmod_k_tiles_per_sk_big_unit_ = FastDivmod(k_tiles_per_output_tile / splits + 1);
+
+    // No stream-K work is performed for "basic" data-parallel and split-K decompositions
+    sk_tiles_ = 0;
+    sk_units_ = 0;
+    divmod_sk_units_per_group_ = FastDivmodU64(1u);
+    separate_reduction_units_ = 0;
+  }
+
+  private:
+  // Rounds `bytes` up to the next multiple of the 128-byte L2 cache line size
+  CUTLASS_HOST_DEVICE
+  static size_t
+  round_up_to_l2_alignment(size_t bytes) {
+    constexpr size_t kLineBytes = 128u;
+    return ((bytes + (kLineBytes - 1)) / kLineBytes) * kLineBytes;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Parameters for SM90 persistent group scheduler (only used for Grouped Gemms)
+template<class ProblemShape>
+struct PersistentTileSchedulerSm90GroupParams {
+  // Concrete raster order, as resolved by get_rasterization_order()
+  enum class RasterOrder {
+    AlongM,
+    AlongN
+  };
+  // Requested raster order; Heuristic lets get_rasterization_order() pick based on the tile counts
+  enum class RasterOrderOptions {
+    Heuristic,
+    AlongM,
+    AlongN
+  };
+
+  FastDivmodU64Pow2 divmod_cluster_shape_major_{};
+  FastDivmodU64Pow2 divmod_cluster_shape_minor_{};
+  FastDivmodU64 divmod_cta_shape_m_{};
+  FastDivmodU64 divmod_cta_shape_n_{};
+
+  uint64_t blocks_across_problem_ = 0;
+  bool pre_processed_problem_shapes = true; // initialize() sets this to whether host problem shapes were supplied
+  int32_t log_swizzle_size_ = 0;
+  RasterOrder raster_order_ = RasterOrder::AlongN;
+
+  int32_t groups_ = 0;
+  ProblemShape* problem_shapes_ = nullptr;
+  GemmCoord cta_shape_;
+  GemmCoord cluster_shape_;
+
+  // Populates the scheduler parameters from the number of CTAs in the M, N and L dimensions
+  // (`problem_blocks`), the group count, the problem shape pointers, and the CTA/cluster shapes.
+  // `hw_info` is currently unused.
+  void
+  initialize(
+    dim3 problem_blocks,
+    int32_t groups,
+    ProblemShape* problem_shapes,
+    ProblemShape const* host_problem_shapes,
+    GemmCoord cta_shape,
+    GemmCoord cluster_shape,
+    KernelHardwareInfo const& hw_info,
+    int max_swizzle_size,
+    RasterOrderOptions raster_order_option
+  ) {
+
+    CUTLASS_UNUSED(hw_info);
+
+    // Round up to nearest multiple of swizzle_size along each mode
+    auto log_swizzle_size = get_log_swizzle_size(problem_blocks.x, problem_blocks.y, max_swizzle_size);
+    auto problem_blocks_m = round_up(problem_blocks.x, (1 << log_swizzle_size) * cluster_shape.m());
+    auto problem_blocks_n = round_up(problem_blocks.y, (1 << log_swizzle_size) * cluster_shape.n());
+
+    RasterOrder raster_order = get_rasterization_order(
+      problem_blocks_m,
+      problem_blocks_n,
+      raster_order_option
+    );
+
+    //
+    // Set members
+    //
+    groups_ = groups;
+    problem_shapes_ = problem_shapes;
+    cta_shape_ = cta_shape;
+    cluster_shape_ = cluster_shape;
+    // dim3 members are 32-bit: widen before multiplying so the block count cannot wrap.
+    blocks_across_problem_ = static_cast<uint64_t>(problem_blocks.x) * problem_blocks.y * problem_blocks.z;
+    pre_processed_problem_shapes = (host_problem_shapes != nullptr);
+    log_swizzle_size_ = log_swizzle_size;
+    raster_order_ = raster_order;
+    // The "major" cluster dimension is the one that matches the raster order.
+    if (raster_order == RasterOrder::AlongN) {
+      divmod_cluster_shape_major_ = FastDivmodU64Pow2(cluster_shape.n());
+      divmod_cluster_shape_minor_ = FastDivmodU64Pow2(cluster_shape.m());
+    }
+    else {
+      divmod_cluster_shape_major_ = FastDivmodU64Pow2(cluster_shape.m());
+      divmod_cluster_shape_minor_ = FastDivmodU64Pow2(cluster_shape.n());
+    }
+
+    divmod_cta_shape_m_ = FastDivmodU64(cta_shape_.m());
+    divmod_cta_shape_n_ = FastDivmodU64(cta_shape_.n());
+  }
+
+  // Version of get_tiled_cta_shape_mnl that takes in as input the number of CTAs in the M and N dimensions.
+  // This is useful for calculating the tiled shape when a mode of problem and/or CTA shape has rank > 1,
+  // for which using CuTe algebra for calculating tile shapes is easiest.
+  CUTLASS_HOST_DEVICE
+  static dim3
+  get_tiled_cta_shape_mnl(GemmCoord cluster_shape, uint32_t cta_m, uint32_t cta_n) {
+    // NOTE: the CTA counts are returned as given; they are not rounded up to a multiple of the
+    // cluster dimensions, so `cluster_shape` does not affect the result.
+    CUTLASS_UNUSED(cluster_shape);
+
+    return {
+      static_cast<uint32_t>(cta_m),
+      static_cast<uint32_t>(cta_n),
+      static_cast<uint32_t>(1) // Only a single batch per group is currently supported
+    };
+  }
+
+  // Version of get_grid_shape that takes in as input the number of CTAs in the M and N and L dimensions.
+  // This is useful for calculating the tiled shape when a mode of problem and/or CTA shape has rank > 1,
+  // for which using CuTe algebra for calculating tile shapes is easiest.
+  CUTLASS_HOST_DEVICE static
+  dim3
+  get_grid_shape(
+    dim3 problem_blocks,
+    GemmCoord cluster_shape,
+    KernelHardwareInfo hw_info,
+    int max_swizzle_size,
+    RasterOrderOptions raster_order_option,
+    bool truncate_by_problem_size=true) {
+
+    int const sm_count = hw_info.sm_count;
+
+    // Round up to nearest multiple of swizzle_size along each mode
+    auto log_swizzle_size = get_log_swizzle_size(problem_blocks.x, problem_blocks.y, max_swizzle_size);
+    auto problem_blocks_m = round_up(problem_blocks.x, (1 << log_swizzle_size) * cluster_shape.m());
+    auto problem_blocks_n = round_up(problem_blocks.y, (1 << log_swizzle_size) * cluster_shape.n());
+
+    int problem_blocks_total = problem_blocks_m * problem_blocks_n * problem_blocks.z;
+
+    RasterOrder raster_order = get_rasterization_order(
+      problem_blocks_m,
+      problem_blocks_n,
+      raster_order_option
+    );
+
+    dim3 launch_grid;
+
+    if (raster_order == RasterOrder::AlongN) {
+      launch_grid = dim3(cluster_shape.m(), 1, 1);
+    }
+    else {
+      launch_grid = dim3(1, cluster_shape.n(), 1);
+    }
+
+    auto possibly_truncate = [&](int x, int y) {
+      if (truncate_by_problem_size) {
+        return platform::min(x, y);
+      }
+      else {
+        return x;
+      }
+    };
+
+    // The else path is generic, however, we can avoid some divs if we know cluster size is 1
+    auto cluster_size = cluster_shape.m() * cluster_shape.n();
+    if (cluster_size == 1) {
+      if (raster_order == RasterOrder::AlongN) {
+        launch_grid.y = possibly_truncate(sm_count, problem_blocks_total);
+      }
+      else {
+        launch_grid.x = possibly_truncate(sm_count, problem_blocks_total);
+      }
+    }
+    else {
+      // Optimal grid size calculation is based on
+      // GH100: 8 GPCs, 72 TPCs (9 TPCs/GPC), 2 SMs/TPC, 144 SMs per full GPU
+      // Hence, maximum SMs per GPC = 18
+      constexpr int max_sm_per_gpc = 18;
+      // Provided SM count could possibly be less than the assumed maximum SMs per GPC
+      int const min_num_gpc = sm_count < max_sm_per_gpc ? 1 : sm_count / max_sm_per_gpc;
+      int const max_cta_occupancy_per_gpc = max_sm_per_gpc - (max_sm_per_gpc % cluster_size);
+      int cta_per_device = min_num_gpc * max_cta_occupancy_per_gpc;
+
+      // The calculation below allows for larger grid size launch for different GPUs.
+      int const num_gpc_residual = sm_count < max_sm_per_gpc ? 0 : sm_count % max_sm_per_gpc;
+      int const max_cta_occupancy_per_residual_gpc = num_gpc_residual - (num_gpc_residual % cluster_size);
+      cta_per_device += max_cta_occupancy_per_residual_gpc;
+
+      // Never launch more CTAs than there are SMs.
+      cta_per_device = sm_count < cta_per_device ? sm_count : cta_per_device;
+
+      if (raster_order == RasterOrder::AlongN) {
+        launch_grid.y = possibly_truncate(
+            cta_per_device       / cluster_shape.m(),
+            problem_blocks_total / cluster_shape.m());
+      }
+      else {
+        launch_grid.x = possibly_truncate(
+            cta_per_device       / cluster_shape.n(),
+            problem_blocks_total / cluster_shape.n());
+      }
+    }
+    return launch_grid;
+  }
+
+  CUTLASS_HOST_DEVICE
+  static int32_t
+  get_log_swizzle_size(int problem_ctas_m, int problem_ctas_n, int max_swizzle_size) {
+    int min_cta_dim = platform::min(problem_ctas_m, problem_ctas_n);
+    if (max_swizzle_size >= 8 && min_cta_dim >= 6) {
+      return 3; // log2(8)
+    }
+    else if (max_swizzle_size >= 4 && min_cta_dim >= 3) {
+      return 2; // log2(4)
+    }
+    else if (max_swizzle_size >= 2 && min_cta_dim >= 2) {
+      return 1; // log2(2)
+    }
+    else {
+      return 0; // no swizzling
+    }
+  }
+
+  CUTLASS_HOST_DEVICE
+  static RasterOrder
+  get_rasterization_order(
+    uint32_t tiles_m,
+    uint32_t tiles_n,
+    RasterOrderOptions raster_order_option
+  ) {
+    // Resolves the requested raster order option into a concrete RasterOrder.
+    if (raster_order_option == RasterOrderOptions::Heuristic) {
+      // Heuristic: AlongM when there are more tiles in N than in M, AlongN otherwise (including ties).
+      if (tiles_n > tiles_m) {
+        return RasterOrder::AlongM;
+      }
+      else {
+        return RasterOrder::AlongN;
+      }
+    }
+    else {
+      switch (raster_order_option) {
+        case RasterOrderOptions::AlongN:
+          return RasterOrder::AlongN;
+        default:
+          return RasterOrder::AlongM;
+      }
+    }
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+} // namespace detail
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/trmm_universal.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/trmm_universal.h
new file mode 100755
index 000000000..50b33eab7
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/trmm_universal.h
@@ -0,0 +1,580 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
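+/*! \file
+    \brief Universal kernel for triangular matrix multiplication (TRMM), with the
+      side mode, fill mode, and diagonal type selected by template parameters.
+*/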
+
+#pragma once
+
+#include "cutlass/blas3.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/complex.h"
+#include "cutlass/semaphore.h"
+#include "cutlass/core_io.h"
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate 
+  typename Epilogue_,             ///! Epilogue
+  typename ThreadblockSwizzle_,   ///! Threadblock swizzling function
+  SideMode SideMode_,             ///! Side Mode for the kernel (kLeft or kRight)
+  FillMode FillMode_,             ///! Fill Mode for triangular matrix (kLower or kUpper)
+  DiagType DiagType_              ///! Diag Type for triangular matrix (kNonUnit or kUnit)
+>
+struct TrmmUniversal {
+public:
+
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using EpilogueOutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+
+  using ElementA = typename Mma::IteratorA::Element;
+  using LayoutA = typename Mma::IteratorA::Layout;
+  using ElementB = typename Mma::IteratorB::Element;
+  using LayoutB = typename Mma::IteratorB::Layout;
+  using ElementC = typename Epilogue::OutputTileIterator::Element;
+  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
+  static SideMode const kSideMode = SideMode_;
+  static FillMode const kFillMode = FillMode_;
+  static DiagType const kDiagType = DiagType_;
+
+  static ComplexTransform const kTransformA = Mma::kTransformA;
+  static ComplexTransform const kTransformB = Mma::kTransformB;
+  using Operator = typename Mma::Operator;
+
+  using OperatorClass = typename Mma::Operator::OperatorClass;
+  using ThreadblockShape = typename Mma::Shape;
+  using WarpShape = typename Mma::Operator::Shape;
+  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
+  using ArchTag = typename Mma::ArchTag;
+
+  static int const kStages = Mma::kStages;
+  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
+  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
+  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;
+
+  /// Split-K preserves splits that are 128b aligned
+  static int const kSplitKAlignment = const_max(128 / sizeof_bits<ElementA>::value, 128 / sizeof_bits<ElementB>::value);
+
+  //
+  // Structures
+  //
+
+  /// Argument structure: mode, problem size, operand pointers, strides, and epilogue parameters
+  struct Arguments {
+
+    //
+    // Data members
+    //
+
+    GemmUniversalMode mode{GemmUniversalMode::kGemm};
+    GemmCoord problem_size{};
+    int batch_count{1};
+    // Parameters handed to the epilogue output operator
+    typename EpilogueOutputOp::Params epilogue{};
+    // Type-erased operand pointers: A and B are const, D is mutable
+    void const * ptr_A{nullptr};
+    void const * ptr_B{nullptr};
+    void * ptr_D{nullptr};
+    // Strides between consecutive batch entries of A, B, and D
+    int64_t batch_stride_A{0};
+    int64_t batch_stride_B{0};
+    int64_t batch_stride_D{0};
+    // Leading-dimension strides used to build the A, B, and D iterator parameters
+    typename LayoutA::Stride::Index lda{0};
+    typename LayoutB::Stride::Index ldb{0};
+    typename LayoutC::Stride::Index ldd{0};
+
+    //
+    // Methods
+    //
+
+    Arguments() = default;
+
+    /// Constructs an arguments structure from an explicit value for every data member
+    Arguments(
+      GemmUniversalMode mode,
+      GemmCoord problem_size,
+      int batch_count,
+      typename EpilogueOutputOp::Params epilogue,
+      void const * ptr_A,
+      void const * ptr_B,
+      void * ptr_D,
+      int64_t batch_stride_A,
+      int64_t batch_stride_B,
+      int64_t batch_stride_D,
+      typename LayoutA::Stride::Index lda,
+      typename LayoutB::Stride::Index ldb,
+      typename LayoutC::Stride::Index ldd
+    ):
+      mode(mode), 
+      problem_size(problem_size),
+      batch_count(batch_count),
+      epilogue(epilogue), 
+      ptr_A(ptr_A), ptr_B(ptr_B), ptr_D(ptr_D), 
+      batch_stride_A(batch_stride_A), batch_stride_B(batch_stride_B), batch_stride_D(batch_stride_D), 
+      lda(lda), ldb(ldb), ldd(ldd) {
+      }
+    
+    /// Returns a copy of these arguments with problem_size.m() and problem_size.n() exchanged
+    Arguments transposed_problem_size() const {
+      Arguments args(*this);
+
+      std::swap(args.problem_size.m(), args.problem_size.n());
+
+      return args;
+    }
+
+    /// Returns a copy with the A and B operands exchanged (pointers, leading dimensions, batch strides)
+    Arguments swapped_matrices() const {
+      Arguments args(*this);
+
+      std::swap(args.ptr_A, args.ptr_B);
+      std::swap(args.lda, args.ldb);
+      std::swap(args.batch_stride_A, args.batch_stride_B);
+
+      return args;
+    }
+  };
+
+  //
+  // Structure for precomputing values in host memory and passing to kernels
+  //
+
+  /// Parameters structure: values precomputed from Arguments and read by the kernel
+  struct Params {
+
+    cutlass::gemm::GemmCoord problem_size{};
+    cutlass::gemm::GemmCoord grid_tiled_shape{};
+    int swizzle_log_tile{0};
+    // Iterator parameters constructed from lda, ldb, and ldd
+    typename Mma::IteratorA::Params params_A{};
+    typename Mma::IteratorB::Params params_B{};
+    typename Epilogue::OutputTileIterator::Params params_D{};
+    // Epilogue output-operator parameters copied from Arguments::epilogue
+    typename EpilogueOutputOp::Params output_op{};
+
+    GemmUniversalMode mode = cutlass::gemm::GemmUniversalMode::kGemm;
+    int batch_count {0};
+    int gemm_k_size {0};
+    // Operand pointers; constness of A and B is cast away when copied from Arguments
+    void * ptr_A{nullptr};
+    void * ptr_B{nullptr};
+    void * ptr_D{nullptr};
+
+    int64_t batch_stride_A {0};
+    int64_t batch_stride_B {0};
+    int64_t batch_stride_D {0};
+    // Semaphore storage, taken from the workspace pointer
+    int *semaphore{nullptr};
+
+    //
+    // Methods
+    //
+    Params() = default;
+    /// Builds the parameters from the arguments, the tiled grid shape, the k extent per split, and an optional workspace
+    CUTLASS_HOST_DEVICE
+    Params(
+      Arguments const &args,
+      cutlass::gemm::GemmCoord const & grid_tiled_shape,
+      int gemm_k_size,
+      void *workspace = nullptr
+    ):
+      problem_size(args.problem_size),
+      grid_tiled_shape(grid_tiled_shape),
+      swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)),
+      params_A(args.lda),
+      params_B(args.ldb),
+      params_D(args.ldd),
+      output_op(args.epilogue),
+      mode(args.mode),
+      batch_count(args.batch_count),
+      gemm_k_size(gemm_k_size),
+      ptr_A(const_cast<void *>(args.ptr_A)),
+      ptr_B(const_cast<void *>(args.ptr_B)),
+      ptr_D(args.ptr_D),
+      batch_stride_A(args.batch_stride_A),
+      batch_stride_B(args.batch_stride_B),
+      batch_stride_D(args.batch_stride_D),
+      semaphore(static_cast<int *>(workspace)) {
+    }
+    /// Refreshes the pointers, batch strides, epilogue parameters, and semaphore; all other members are left unchanged
+    CUTLASS_HOST_DEVICE
+    void update(
+      Arguments const &args,
+      void *workspace = nullptr) {
+
+      ptr_A = const_cast<void *>(args.ptr_A);
+      ptr_B = const_cast<void *>(args.ptr_B);
+      ptr_D = args.ptr_D;
+
+      batch_stride_A = args.batch_stride_A;
+      batch_stride_B = args.batch_stride_B;
+      batch_stride_D = args.batch_stride_D;
+
+      output_op = args.epilogue;
+
+      semaphore = static_cast<int *>(workspace);
+    }
+
+  };
+
+  /// Shared memory storage structure
+  union SharedStorage {
+    typename Mma::SharedStorage main_loop;
+    typename Epilogue::SharedStorage epilogue;
+  };
+
+public:
+
+  //
+  // Methods
+  //
+
+  CUTLASS_DEVICE
+  TrmmUniversal() { } 
+
+  /// Returns kErrorMisalignedOperand unless each extent is a multiple of the alignment of every operand it indexes
+  static Status can_implement(
+    cutlass::gemm::GemmCoord const & problem_size) {
+    int const align_a = Mma::IteratorA::AccessType::kElements;
+    int const align_b = Mma::IteratorB::AccessType::kElements;
+    int const align_c = Epilogue::OutputTileIterator::kElementsPerAccess;
+
+    bool const a_misaligned = (problem_size.m() % align_a) || (problem_size.k() % align_a);
+    bool const b_misaligned = (problem_size.n() % align_b) || (problem_size.k() % align_b);
+    bool const c_misaligned = (problem_size.m() % align_c) || (problem_size.n() % align_c);
+
+    if (a_misaligned || b_misaligned || c_misaligned) {
+      return Status::kErrorMisalignedOperand;
+    }
+    return Status::kSuccess;
+  }
+
+  /// Overload that checks the problem size carried by the arguments
+  static Status can_implement(Arguments const &args) {
+    return can_implement(args.problem_size);
+  }
+
+  /// Executes one threadblock tile of the triangular matrix multiply (TRMM)
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Compute threadblock location
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if CTA is out of range
+    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
+      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
+
+      return;
+    }
+
+    int offset_k = 0;
+    int problem_size_k = params.problem_size.k();
+
+    ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A); 
+    ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);
+
+    //
+    // Fetch pointers based on mode.
+    //
+    if (params.mode == GemmUniversalMode::kGemm || 
+      params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+
+      if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {
+
+        problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size; 
+      }
+
+      offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
+    }
+    else if (params.mode == GemmUniversalMode::kBatched) {
+      ptr_A += threadblock_tile_offset.k() * params.batch_stride_A;
+      ptr_B += threadblock_tile_offset.k() * params.batch_stride_B;
+    }
+    else if (params.mode == GemmUniversalMode::kArray) {
+      ptr_A = static_cast<ElementA * const *>(params.ptr_A)[threadblock_tile_offset.k()];
+      ptr_B = static_cast<ElementB * const *>(params.ptr_B)[threadblock_tile_offset.k()];
+    }
+
+    __syncthreads();
+
+    // Compute initial location in logical coordinates
+    cutlass::MatrixCoord tb_offset_A{
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      offset_k,
+    };
+
+    cutlass::MatrixCoord tb_offset_B{
+      offset_k,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    };
+
+    // Compute position within threadblock
+    int thread_idx = threadIdx.x;
+
+    // Broadcast the warp_id computed by lane 0 to ensure dependent code
+    // is compiled as warp-uniform.
+    int warp_idx = canonical_warp_idx_sync();
+
+    int lane_idx = threadIdx.x % 32;
+
+    //
+    // Main loop
+    //
+
+    // Construct thread-scoped matrix multiply
+    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
+
+    typename Mma::FragmentC accumulators;
+
+    accumulators.clear();
+
+    // Number of k-iterations covering [offset_k, problem_size_k), rounded up to whole Mma::Shape::kK steps
+    int gemm_k_iterations = (problem_size_k - offset_k + Mma::Shape::kK - 1) / Mma::Shape::kK;
+    
+    /******************************************************************************************************
+      First two cases: (Left Side, Lower Fill) and (Right Side, Upper Fill) are transpose of each other
+        - (Left Side, Lower Fill): calculate bottom of the CTA tile,  then find the k-iterations 
+                                    needed to process all elements till that coordinate.
+        - (Right Side, Upper Fill): calculate right end of the CTA tile,  then find the k-iterations 
+                                    needed to process all elements till that coordinate.
+
+      Last two cases: (Left Side, Upper Fill) and (Right Side, Lower Fill) are transpose of each other
+        - (Left Side, Upper Fill): calculate the top of the CTA tile, then find k-iterations 
+                                   that can be skipped for all elements of this tile.
+        - (Right Side, Lower Fill): calculate the left start of the CTA tile, then find k-iterations 
+                                    that can be skipped for all elements of this tile.
+    ********************************************************************************************************/
+ 
+    if (kSideMode == SideMode::kLeft && kFillMode == FillMode::kLower) {
+
+      int k_iterations_till_diagonal = ((threadblock_tile_offset.m() + 1) * Mma::Shape::kM + Mma::Shape::kK - 1) / Mma::Shape::kK;
+      if (k_iterations_till_diagonal < gemm_k_iterations) {
+        gemm_k_iterations = k_iterations_till_diagonal;
+      }
+
+    } else if (kSideMode == SideMode::kRight && kFillMode == FillMode::kUpper) {
+
+      int k_iterations_till_diagonal = ((threadblock_tile_offset.n() + 1) * Mma::Shape::kN + Mma::Shape::kK - 1) / Mma::Shape::kK;
+      if (k_iterations_till_diagonal < gemm_k_iterations) {
+        gemm_k_iterations = k_iterations_till_diagonal;
+      }
+
+    } else if (kSideMode == SideMode::kLeft && kFillMode == FillMode::kUpper) {
+
+      int k_iterations_till_diagonal = ((threadblock_tile_offset.m()) * Mma::Shape::kM) / Mma::Shape::kK;
+
+      if (k_iterations_till_diagonal != 0) {
+        tb_offset_A += cutlass::MatrixCoord({0, k_iterations_till_diagonal * Mma::Shape::kK});
+        tb_offset_B += cutlass::MatrixCoord({k_iterations_till_diagonal * Mma::Shape::kK, 0});
+        gemm_k_iterations -= k_iterations_till_diagonal;
+      }
+
+    } else if (kSideMode == SideMode::kRight && kFillMode == FillMode::kLower) {
+
+      int k_iterations_till_diagonal = ((threadblock_tile_offset.n()) * Mma::Shape::kN) / Mma::Shape::kK;
+
+      if (k_iterations_till_diagonal != 0) {
+        tb_offset_A += cutlass::MatrixCoord({0, k_iterations_till_diagonal * Mma::Shape::kK});
+        tb_offset_B += cutlass::MatrixCoord({k_iterations_till_diagonal * Mma::Shape::kK, 0});
+        gemm_k_iterations -= k_iterations_till_diagonal;
+      }
+
+    }
+
+    // Construct iterators to A and B operands
+    typename Mma::IteratorA iterator_A(
+      params.params_A,
+      ptr_A,
+      {params.problem_size.m(), problem_size_k},
+      thread_idx,
+      tb_offset_A);
+
+    typename Mma::IteratorB iterator_B(
+      params.params_B,
+      ptr_B,
+      {problem_size_k, params.problem_size.n()},
+      thread_idx,
+      tb_offset_B);
+
+    // Compute threadblock-scoped matrix multiply-add
+    mma(
+      gemm_k_iterations, 
+      accumulators, 
+      iterator_A, 
+      iterator_B, 
+      accumulators);
+
+    //
+    // Epilogue
+    //
+
+    EpilogueOutputOp output_op(params.output_op);
+
+    //
+    // Masked tile iterators constructed from members
+    //
+
+    threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    //assume identity swizzle
+    MatrixCoord threadblock_offset(
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    );
+    // Index of this output tile's semaphore slot, with the m tile index varying fastest
+    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
+
+    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
+
+    //
+    // Fetch pointers based on mode.
+    //
+    
+    // Construct the semaphore.
+    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
+
+    if (params.mode == GemmUniversalMode::kGemm) {
+
+      // If performing a reduction via split-K, fetch the initial synchronization
+      if (params.grid_tiled_shape.k() > 1) {
+        
+        // Fetch the synchronization lock initially but do not block.
+        semaphore.fetch();
+
+        // Indicate which position in a serial reduction the output operator is currently updating
+        output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
+      }
+    }
+    else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
+    }
+    else if (params.mode == GemmUniversalMode::kBatched) {
+      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
+    }
+    else if (params.mode == GemmUniversalMode::kArray) {
+      ptr_D = static_cast<ElementC * const *>(params.ptr_D)[threadblock_tile_offset.k()];
+    }
+
+    
+    // Tile iterator loading from source tensor (although irrelevant to this kernel as beta is zero).
+    typename Epilogue::OutputTileIterator iterator_C(
+      params.params_D,
+      ptr_D,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset
+    );
+
+    // Tile iterator writing to destination tensor.
+    typename Epilogue::OutputTileIterator iterator_D(
+      params.params_D,
+      ptr_D,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset
+    );
+
+    Epilogue epilogue(
+      shared_storage.epilogue, 
+      thread_idx, 
+      warp_idx, 
+      lane_idx);
+
+    // Wait on the semaphore - this latency may have been covered by iterator construction
+    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
+        
+      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
+      if (threadblock_tile_offset.k()) {
+        iterator_C = iterator_D;
+      }
+
+      semaphore.wait(threadblock_tile_offset.k());
+
+      __threadfence();
+    }
+
+
+    // Execute the epilogue operator to update the destination tensor.
+    epilogue(
+      output_op, 
+      iterator_D, 
+      accumulators, 
+      iterator_C); 
+    
+    //
+    // Release the semaphore
+    //
+
+    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) { 
+
+      int lock = 0;
+      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
+
+        // The final threadblock resets the semaphore for subsequent grids.
+        lock = 0;
+      }
+      else {
+        // Otherwise, the semaphore is incremented
+        lock = threadblock_tile_offset.k() + 1;
+      }
+      
+      semaphore.release(lock);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/thread/mma.h b/lightllm-kernel/cutlass/include/cutlass/gemm/thread/mma.h
new file mode 100755
index 000000000..2e3798b15
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/thread/mma.h
@@ -0,0 +1,90 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates exposing architecture support for warp-level multiply-add operations
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/arch/mma.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the matrix product
+template <
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape,
+  /// Data type of A elements
+  typename ElementA,
+  /// Layout of A matrix (concept: MatrixLayout)
+  typename LayoutA,
+  /// Data type of B elements
+  typename ElementB,
+  /// Layout of B matrix (concept: MatrixLayout)
+  typename LayoutB,
+  /// Element type of C matrix
+  typename ElementC,
+  /// Layout of C matrix (concept: MatrixLayout)
+  typename LayoutC,
+  /// Concept: arch::OpMultiplyAdd or arch::Mma<>
+  typename Operator = arch::OpMultiplyAdd,
+  /// Used for partial specialization
+  typename Enable = bool
+>
+struct Mma;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+//
+// Overloads specialized for existing architectures
+//
+
+#include "cutlass/gemm/thread/mma_sm50.h"
+#include "cutlass/gemm/thread/mma_sm60.h"
+#include "cutlass/gemm/thread/mma_sm61.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/thread/mma_sm50.h b/lightllm-kernel/cutlass/include/cutlass/gemm/thread/mma_sm50.h
new file mode 100755
index 000000000..c778832bf
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/thread/mma_sm50.h
@@ -0,0 +1,538 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates exposing architecture support for multiply-add operations
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/arch/mma.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/thread/mma.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Template that handles all packed matrix layouts
+template <
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape_,
+  /// Data type of A elements
+  typename ElementA_,
+  /// Layout of A matrix (concept: layout::MapFunc)
+  typename LayoutA_,
+  /// Data type of B elements
+  typename ElementB_,
+  /// Layout of B matrix (concept: layout::MapFunc)
+  typename LayoutB_,
+  /// Element type of C matrix
+  typename ElementC_,
+  /// Layout of C matrix (concept: layout::MapFunc)
+  typename LayoutC_,
+  /// Operator used to compute GEMM
+  typename Operator_
+>
+struct MmaGeneric {
+
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+
+  /// Data type of operand A
+  using ElementA = ElementA_;
+
+  /// Layout of A matrix (concept: layout::MapFunc)
+  using LayoutA = LayoutA_;
+
+  /// Data type of operand B
+  using ElementB = ElementB_;
+
+  /// Layout of B matrix (concept: layout::MapFunc)
+  using LayoutB = LayoutB_;
+
+  /// Element type of operand C
+  using ElementC = ElementC_;
+
+  /// Layout of C matrix (concept: layout::MapFunc)
+  using LayoutC = LayoutC_;
+
+  /// Underlying mathematical operator
+  using Operator = Operator_;
+
+  /// A operand storage
+  using FragmentA = Array<ElementA, Shape::kMK>;
+
+  /// B operand storage
+  using FragmentB = Array<ElementB, Shape::kKN>;
+
+  /// C operand storage
+  using FragmentC = Array<ElementC, Shape::kMN>;
+
+  /// Instruction
+  using MmaOp = arch::Mma<
+    gemm::GemmShape<1,1,1>,
+    1,
+    ElementA, LayoutA,
+    ElementB, LayoutB,
+    ElementC, LayoutC,
+    Operator>;
+
+  static bool const kMultipleOf2 = ((Shape::kM % 2 == 0) && (Shape::kN % 2 == 0));
+
+  static bool const kAllFp32 = platform::is_same<ElementA, float>::value &&
+      platform::is_same<ElementB, float>::value &&
+      platform::is_same<ElementC, float>::value;
+  //
+  // Methods
+  //
+
+  /// Computes a matrix product D = A * B + C, one scalar multiply-add (MmaOp) at a time
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC & D,
+    FragmentA const & A,
+    FragmentB const & B,
+    FragmentC const & C) {
+    // View the fragments as packed M-by-K, K-by-N, and M-by-N tensors.
+    TensorRef<ElementA const, LayoutA> a_ref(
+      reinterpret_cast<ElementA const *>(&A), LayoutA::packed({Shape::kM, Shape::kK}));
+
+    TensorRef<ElementB const, LayoutB> b_ref(
+      reinterpret_cast<ElementB const *>(&B), LayoutB::packed({Shape::kK, Shape::kN}));
+
+    TensorRef<ElementC, LayoutC> d_ref(
+      reinterpret_cast<ElementC *>(&D), LayoutC::packed(make_Coord(Shape::kM, Shape::kN)));
+
+    MmaOp mma_op;
+
+    // Copy accumulators
+    D = C;
+
+    // Compute matrix product
+    CUTLASS_PRAGMA_UNROLL
+    for (int k = 0; k < Shape::kK; ++k) {
+      #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 860)
+      if (kMultipleOf2 && kAllFp32) {
+        // 2x2 zigzag - m and n loops increment by 2; the inner body performs 4 multiply-adds over a 2x2 tile.
+        CUTLASS_PRAGMA_UNROLL
+        for (int n = 0; n < Shape::kN; n+=2) {
+  
+          CUTLASS_PRAGMA_UNROLL
+          for (int m = 0; m < Shape::kM; m+=2) {
+            // Reverse the m traversal whenever n is not a multiple of 4 (every other 2-wide column pair).
+            int m_serpentine = (n % 4) ? (Shape::kM - 2 - m) : m;
+
+            //top-left element in 2x2 tile
+            {
+              MatrixCoord mn(m_serpentine, n);
+              MatrixCoord mk(m_serpentine, k);
+              MatrixCoord kn(k, n);
+              Array<ElementC, 1> d;
+              Array<ElementA, 1> a;
+              Array<ElementB, 1> b;
+              d[0] = d_ref.at(mn);
+              a[0] = a_ref.at(mk);
+              b[0] = b_ref.at(kn);
+              mma_op(d, a, b, d);
+              d_ref.at(mn) = d[0];
+            }
+  
+            //bottom-left element in 2x2 tile
+            {
+              MatrixCoord mn(m_serpentine+1, n);
+              MatrixCoord mk(m_serpentine+1, k);
+              MatrixCoord kn(k, n);
+              Array<ElementC, 1> d;
+              Array<ElementA, 1> a;
+              Array<ElementB, 1> b;
+              d[0] = d_ref.at(mn);
+              a[0] = a_ref.at(mk);
+              b[0] = b_ref.at(kn);
+              mma_op(d, a, b, d);
+              d_ref.at(mn) = d[0];
+            }
+  
+            //bottom-right element in 2x2 tile
+            {
+              MatrixCoord mn(m_serpentine+1, n+1);
+              MatrixCoord mk(m_serpentine+1, k);
+              MatrixCoord kn(k, n+1);
+              Array<ElementC, 1> d;
+              Array<ElementA, 1> a;
+              Array<ElementB, 1> b;
+              d[0] = d_ref.at(mn);
+              a[0] = a_ref.at(mk);
+              b[0] = b_ref.at(kn);
+              mma_op(d, a, b, d);
+              d_ref.at(mn) = d[0];
+            }
+  
+            //top-right element in 2x2 tile
+            {
+              MatrixCoord mn(m_serpentine, n+1);
+              MatrixCoord mk(m_serpentine, k);
+              MatrixCoord kn(k, n+1);
+              Array<ElementC, 1> d;
+              Array<ElementA, 1> a;
+              Array<ElementB, 1> b;
+              d[0] = d_ref.at(mn);
+              a[0] = a_ref.at(mk);
+              b[0] = b_ref.at(kn);
+              mma_op(d, a, b, d);
+              d_ref.at(mn) = d[0];
+            }
+          }
+        }
+      } else 
+      #endif
+      {
+        CUTLASS_PRAGMA_UNROLL
+        for (int n = 0; n < Shape::kN; ++n) {
+  
+          CUTLASS_PRAGMA_UNROLL
+          for (int m = 0; m < Shape::kM; ++m) {
+            // Reverse the m traversal on odd n.
+            int m_serpentine = (n % 2) ? (Shape::kM - 1 - m) : m;
+  
+            MatrixCoord mn(m_serpentine, n);
+            MatrixCoord mk(m_serpentine, k);
+            MatrixCoord kn(k, n);
+  
+            Array<ElementC, 1> d;
+            Array<ElementA, 1> a;
+            Array<ElementB, 1> b;
+  
+            d[0] = d_ref.at(mn);
+            a[0] = a_ref.at(mk);
+            b[0] = b_ref.at(kn);
+  
+            mma_op(d, a, b, d);
+  
+            d_ref.at(mn) = d[0];
+          }
+        }
+      }
+    }
+  }
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+/// Matrix multiply-add operation - assumes operand B is not changing
+struct MmaComplexF32_Column {
+  // 1x1x1 complex<float> multiply-add written as four scalar multiply-adds on the real/imag parts.
+  using Shape = gemm::GemmShape<1, 1, 1>;
+  using ElementC = complex<float>;
+
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    Array<complex<float>, 1> &d,
+    Array<complex<float>, 1> const &a,
+    Array<complex<float>, 1> const &b,
+    Array<complex<float>, 1> const &c
+  ) {
+    // NOTE: the 2nd statement accumulates into d's incoming imag part and the 4th overwrites imag from c, so the a.re*b.im term survives only when c aliases d.
+    d[0].real() =  a[0].real() * b[0].real() + c[0].real();  // re  = a.re * b.re + c.re
+    d[0].imag() =  a[0].real() * b[0].imag() + d[0].imag();  // im += a.re * b.im  (reads d on entry)
+    d[0].real() = -a[0].imag() * b[0].imag() + d[0].real();  // re -= a.im * b.im
+    d[0].imag() =  a[0].imag() * b[0].real() + c[0].imag();  // im  = a.im * b.re + c.im
+  }
+};
+
+/// Matrix multiply-add operation - assumes operand A is not changing
+struct MmaComplexF32_Corner {
+  // Same four scalar multiply-adds as MmaComplexF32_Column, issued in a different order.
+  using Shape = gemm::GemmShape<1, 1, 1>;
+  using ElementC = complex<float>;
+
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    Array<complex<float>, 1> &d,
+    Array<complex<float>, 1> const &a,
+    Array<complex<float>, 1> const &b,
+    Array<complex<float>, 1> const &c
+  ) {
+    // NOTE: the first two statements accumulate into d's incoming value and the last two overwrite from c, so all four terms survive only when c aliases d.
+    d[0].real() = -a[0].imag() * b[0].imag() + d[0].real();  // re -= a.im * b.im  (reads d on entry)
+    d[0].imag() =  a[0].real() * b[0].imag() + d[0].imag();  // im += a.re * b.im  (reads d on entry)
+    d[0].real() =  a[0].real() * b[0].real() + c[0].real();  // re  = a.re * b.re + c.re
+    d[0].imag() =  a[0].imag() * b[0].real() + c[0].imag();  // im  = a.im * b.re + c.im
+  }
+};
+
+} // namespace detail
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Template that handles all packed matrix layouts
+template <
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape_,
+  /// Layout of A matrix (concept: layout::MapFunc)
+  typename LayoutA_,
+  /// Layout of B matrix (concept: layout::MapFunc)
+  typename LayoutB_,
+  /// Layout of C matrix (concept: layout::MapFunc)
+  typename LayoutC_
+>
+struct MmaGeneric<
+  Shape_,
+  complex<float>,
+  LayoutA_,
+  complex<float>,
+  LayoutB_,
+  complex<float>,
+  LayoutC_,
+  arch::OpMultiplyAdd> {
+
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+
+  /// Data type of operand A
+  using ElementA = complex<float>;
+
+  /// Layout of A matrix (concept: layout::MapFunc)
+  using LayoutA = LayoutA_;
+
+  /// Data type of operand B
+  using ElementB = complex<float>;
+
+  /// Layout of B matrix (concept: layout::MapFunc)
+  using LayoutB = LayoutB_;
+
+  /// Element type of operand C
+  using ElementC = complex<float>;
+
+  /// Layout of C matrix (concept: layout::MapFunc)
+  using LayoutC = LayoutC_;
+
+  /// Underlying mathematical operator
+  using Operator = arch::OpMultiplyAdd;
+
+  /// A operand storage
+  using FragmentA = Array<ElementA, Shape::kMK>;
+
+  /// B operand storage
+  using FragmentB = Array<ElementB, Shape::kKN>;
+
+  /// C operand storage
+  using FragmentC = Array<ElementC, Shape::kMN>;
+
+  /// Instruction (exposed as a member type; operator() below uses the detail:: functors instead)
+  using MmaOp = arch::Mma<
+    gemm::GemmShape<1,1,1>,
+    1,
+    ElementA, LayoutA,
+    ElementB, LayoutB,
+    ElementC, LayoutC,
+    Operator>;
+
+  //
+  // Methods
+  //
+
+  /// Computes a matrix product D = A * B + C
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC & D,
+    FragmentA const & A,
+    FragmentB const & B,
+    FragmentC const & C) {
+    // View the fragments as packed (M,K), (K,N) and (M,N) tensors so elements can be addressed by coordinate.
+    TensorRef<ElementA const, LayoutA> a_ref(
+      reinterpret_cast<ElementA const *>(&A), LayoutA::packed({Shape::kM, Shape::kK}));
+
+    TensorRef<ElementB const, LayoutB> b_ref(
+      reinterpret_cast<ElementB const *>(&B), LayoutB::packed({Shape::kK, Shape::kN}));
+
+    TensorRef<ElementC, LayoutC> d_ref(
+      reinterpret_cast<ElementC *>(&D), LayoutC::packed(make_Coord(Shape::kM, Shape::kN)));
+
+    detail::MmaComplexF32_Column mma_column;
+    detail::MmaComplexF32_Corner mma_corner;
+
+    // Copy accumulators
+    D = C;
+
+    // Compute matrix product
+    CUTLASS_PRAGMA_UNROLL
+    for (int k = 0; k < Shape::kK; ++k) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = 0; n < Shape::kN; ++n) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int m = 0; m < Shape::kM; ++m) {
+          // Serpentine traversal: odd columns visit the rows in reverse order.
+          int m_serpentine = (n % 2) ? (Shape::kM - 1 - m) : m;
+
+          MatrixCoord mn(m_serpentine, n);
+          MatrixCoord mk(m_serpentine, k);
+          MatrixCoord kn(k, n);
+
+          Array<ElementC, 1> d;
+          Array<ElementA, 1> a;
+          Array<ElementB, 1> b;
+
+          d[0] = d_ref.at(mn);
+          a[0] = a_ref.at(mk);
+          b[0] = b_ref.at(kn);
+          // The first m iteration of every column except n == 0, and the last m iteration, use the "corner" ordering.
+          if ((m == 0 && n) || m == Shape::kM - 1) {
+            mma_corner(d, a, b, d);
+          }
+          else {
+            mma_column(d, a, b, d);
+          }
+
+          d_ref.at(mn) = d[0];
+        }
+      }
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Template that handles conventional layouts for FFMA and DFMA GEMM
+template <
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape_,
+  /// Data type of A elements
+  typename ElementA_,
+  /// Layout of A matrix (concept: layout::MapFunc)
+  typename LayoutA_,
+  /// Data type of B elements
+  typename ElementB_,
+  /// Layout of B matrix (concept: layout::MapFunc)
+  typename LayoutB_,
+  /// Element type of C matrix
+  typename ElementC_,
+  /// Layout of C matrix (concept: layout::MapFunc)
+  typename LayoutC_
+>
+struct Mma<
+  Shape_,
+  ElementA_,
+  LayoutA_,
+  ElementB_,
+  LayoutB_,
+  ElementC_,
+  LayoutC_,
+  arch::OpMultiplyAdd,
+  bool> {  // NOTE(review): `bool` presumably fills the primary template's trailing enable/selector parameter - confirm against the primary Mma declaration
+
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+
+  /// Data type of operand A
+  using ElementA = ElementA_;
+
+  /// Layout of A matrix (concept: layout::MapFunc)
+  using LayoutA = LayoutA_;
+
+  /// Data type of operand B
+  using ElementB = ElementB_;
+
+  /// Layout of B matrix (concept: layout::MapFunc)
+  using LayoutB = LayoutB_;
+
+  /// Element type of operand C
+  using ElementC = ElementC_;
+
+  /// Layout of C matrix (concept: layout::MapFunc)
+  using LayoutC = LayoutC_;
+
+  /// Underlying mathematical operator
+  using Operator = arch::OpMultiplyAdd;
+
+  /// A operand storage
+  using FragmentA = Array<ElementA, Shape::kMK>;
+
+  /// B operand storage
+  using FragmentB = Array<ElementB, Shape::kKN>;
+
+  /// C operand storage
+  using FragmentC = Array<ElementC, Shape::kMN>;
+
+  /// Underlying matrix multiply operator (concept: arch::Mma), taken from the matching MmaGeneric
+  using ArchMmaOperator = typename MmaGeneric<
+                                    Shape,
+                                    ElementA,
+                                    LayoutA,
+                                    ElementB,
+                                    LayoutB,
+                                    ElementC,
+                                    LayoutC,
+                                    Operator>::MmaOp;
+  //
+  // Methods
+  //
+
+  /// Computes a matrix product D = A * B + C
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC & D,
+    FragmentA const & A,
+    FragmentB const & B,
+    FragmentC const & C) {
+    // Forward to the MmaGeneric implementation selected by these element types and layouts.
+    MmaGeneric<
+      Shape,
+      ElementA,
+      LayoutA,
+      ElementB,
+      LayoutB,
+      ElementC,
+      LayoutC,
+      Operator> mma;
+
+    mma(D, A, B, C);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/thread/mma_sm60.h b/lightllm-kernel/cutlass/include/cutlass/gemm/thread/mma_sm60.h
new file mode 100755
index 000000000..5e2178982
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/thread/mma_sm60.h
@@ -0,0 +1,1161 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates exposing architecture support for multiply-add operations
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/thread/mma.h"
+#include "cutlass/functional.h"
+#include "cutlass/reduction/thread/reduce.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+/// Structure to compute the matrix product for HFMA
+template <
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape,
+
+  /// Layout of A matrix (concept: MatrixLayout)
+  typename LayoutA,
+
+  /// Layout of B matrix (concept: MatrixLayout)
+  typename LayoutB,
+
+  /// Layout of C matrix (concept: MatrixLayout)
+  typename LayoutC,
+
+  /// Selects the algorithm: `true` picks the per-layout HFMA2 specializations, `false` the inner-product ones
+  bool
+>
+struct Mma_HFMA2;
+
+
+/////////////////////////////
+// Specialization for NNN  //
+/////////////////////////////
+
+template <typename Shape_>
+struct Mma_HFMA2 <
+  Shape_,
+  layout::ColumnMajor,
+  layout::ColumnMajor,
+  layout::ColumnMajor,
+  true
+  > {
+
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+
+   /// A operand storage
+  using FragmentA = Array<half_t, Shape::kMK>;
+
+  /// B operand storage
+  using FragmentB = Array<half_t, Shape::kKN>;
+
+  /// C operand storage
+  using FragmentC = Array<half_t, Shape::kMN>;
+
+  /// Underlying mathematical operator
+  using Operator = arch::OpMultiplyAdd;
+
+  static_assert(
+    !(Shape::kM % 2),
+    "Mma_HFMA2 requires the M dimension to be divisible by 2."
+  );
+
+  //
+  // Methods
+  //
+
+  /// Computes a matrix product D = A * B + C
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC & D,
+    FragmentA const & A,
+    FragmentB const & B,
+    FragmentC const & C) {
+
+    /// Initialize output with input
+    D = C;
+
+    /// Use 2x1x1 HFMA2 sequence for bulk of computation
+    using Mma = arch::Mma<
+      gemm::GemmShape<2,1,1>,
+      1,
+      half_t,
+      layout::ColumnMajor,
+      half_t,
+      layout::ColumnMajor,
+      half_t,
+      layout::ColumnMajor,
+      arch::OpMultiplyAdd>;
+    // D and A are viewed as pairs of half_t adjacent along M; B is read one element at a time.
+    Array<half_t, 2> *ptr_D = reinterpret_cast<Array<half_t, 2> *>(&D);
+    Array<half_t, 2> const *ptr_A = reinterpret_cast<Array<half_t, 2> const *>(&A);
+    Array<half_t, 1> const *ptr_B = reinterpret_cast<Array<half_t, 1> const *>(&B);
+
+    Mma mma;
+
+    CUTLASS_PRAGMA_UNROLL
+    for(auto k=0; k <  Shape::kK / Mma::Shape::kK; k++){
+
+      CUTLASS_PRAGMA_UNROLL
+      for(auto m=0; m < Shape::kM / Mma::Shape::kM; m++){
+
+        CUTLASS_PRAGMA_UNROLL
+        for(auto n=0; n < Shape::kN / Mma::Shape::kN; n++){
+            // Accumulator pair for rows (2m, 2m+1) of column n.
+            Array<half_t, 2> tmp { ptr_D[n*Shape::kM/2 + m] };
+
+            mma(
+                tmp,
+                ptr_A[k*Shape::kM/2 + m],
+                ptr_B[n*Shape::kK + k],
+                tmp);
+
+            ptr_D[n*Shape::kM/2 + m] = tmp;
+        }
+      }
+    }
+  }
+};
+
+/////////////////////////////
+// Specialization for NNT  //
+/////////////////////////////
+
+template <typename Shape_>
+struct Mma_HFMA2<
+  Shape_,
+  layout::ColumnMajor,
+  layout::ColumnMajor,
+  layout::RowMajor,
+  true
+  > {
+
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+
+   /// A operand storage
+  using FragmentA = Array<half_t, Shape::kMK>;
+
+  /// B operand storage
+  using FragmentB = Array<half_t, Shape::kKN>;
+
+  /// C operand storage
+  using FragmentC = Array<half_t, Shape::kMN>;
+
+  /// Underlying mathematical operator
+  using Operator = arch::OpMultiplyAdd;
+
+  static_assert(
+    !(Shape::kN % 2),
+    "Mma_HFMA2 requires the N dimension to be divisible by 2."
+  );
+
+  //
+  // Methods
+  //
+
+  /// Computes a matrix product D = A * B + C
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC & D,
+    FragmentA const & A,
+    FragmentB const & B,
+    FragmentC const & C) {
+
+    /// Initialize output with input
+    D = C;
+
+    /// Use 1x2x1 HFMA2 sequence for bulk of computation
+    using Mma = arch::Mma<
+      gemm::GemmShape<1,2,1>,
+      1,
+      half_t,
+      layout::ColumnMajor,
+      half_t,
+      layout::ColumnMajor,
+      half_t,
+      layout::RowMajor,
+      arch::OpMultiplyAdd>;
+    // D is viewed as pairs of half_t adjacent along N; A is read one element at a time.
+    Array<half_t, 2> *ptr_D = reinterpret_cast<Array<half_t, 2> *>(&D);
+    Array<half_t, 1> const *ptr_A = reinterpret_cast<Array<half_t, 1> const *>(&A);
+    Array<half_t, 2> const *ptr_B = reinterpret_cast<Array<half_t, 2> const *>(&B);
+
+    Mma mma;
+
+    CUTLASS_PRAGMA_UNROLL
+    for(auto k=0; k <  Shape::kK / Mma::Shape::kK; k++){
+
+        CUTLASS_PRAGMA_UNROLL
+        for(auto n=0; n < Shape::kN / Mma::Shape::kN; n++){
+
+          CUTLASS_PRAGMA_UNROLL
+          for(auto m=0; m < Shape::kM / Mma::Shape::kM; m++){
+            // Accumulator pair for columns (2n, 2n+1) of row m.
+            Array<half_t, 2> tmp { ptr_D[m*Shape::kN/2 + n] };
+            // Column-major B keeps B(k,2n) and B(k,2n+1) Shape::kK elements apart, so gather them one by one.
+            Array<half_t, 2> tmp_B;
+            tmp_B[0] = ptr_B->at(2*n*Shape::kK + k);  // NOTE(review): at() indexes past the first pair, relative to the start of B - assumes no bounds check
+            tmp_B[1] = ptr_B->at((2*n+1)*Shape::kK + k);
+
+            mma(
+                tmp,
+                ptr_A[k*Shape::kM + m],
+                tmp_B,
+                tmp);
+
+            ptr_D[m*Shape::kN/2 + n] = tmp;
+        }
+      }
+    }
+  }
+};
+
+
+/////////////////////////////
+// Specialization for NTN  //
+/////////////////////////////
+
+template <typename Shape_>
+struct Mma_HFMA2 <
+  Shape_,
+  layout::ColumnMajor,
+  layout::RowMajor,
+  layout::ColumnMajor,
+  true
+  > {
+
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+
+  /// A operand storage
+  using FragmentA = Array<half_t, Shape::kMK>;
+
+  /// B operand storage
+  using FragmentB = Array<half_t, Shape::kKN>;
+
+  /// C operand storage
+  using FragmentC = Array<half_t, Shape::kMN>;
+
+  /// Underlying mathematical operator
+  using Operator = arch::OpMultiplyAdd;
+
+  static_assert(
+    !(Shape::kM % 2),
+    "Mma_HFMA2 requires the GEMM M dimension to be divisible by 2."
+  );
+
+  //
+  // Methods
+  //
+
+  /// Computes a matrix product D = A * B + C
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC & D,
+    FragmentA const & A,
+    FragmentB const & B,
+    FragmentC const & C) {
+
+    /// Initialize output with input
+    D = C;
+    /// Use 2x1x1 HFMA2 sequence for bulk of computation
+    using Mma = arch::Mma<
+      gemm::GemmShape<2,1,1>,
+      1,
+      half_t,
+      layout::ColumnMajor,
+      half_t,
+      layout::RowMajor,
+      half_t,
+      layout::ColumnMajor,
+      arch::OpMultiplyAdd>;
+    // D and A are viewed as pairs of half_t adjacent along M; B is read one element at a time.
+    Array<half_t, 2> *ptr_D = reinterpret_cast<Array<half_t, 2> *>(&D);
+    Array<half_t, 2> const *ptr_A = reinterpret_cast<Array<half_t, 2> const *>(&A);
+    Array<half_t, 1> const *ptr_B = reinterpret_cast<Array<half_t, 1> const *>(&B);
+
+    Mma mma;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int k = 0; k < Shape::kK / Mma::Shape::kK; ++k) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int m = 0; m < Shape::kM / Mma::Shape::kM; ++m) {
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int n = 0; n < Shape::kN / Mma::Shape::kN; ++n) {
+          // Accumulator pair for rows (2m, 2m+1) of column n.
+          Array<half_t, 2> tmp { ptr_D[m + n * Shape::kM/2] };
+
+          mma(
+            tmp,
+            ptr_A[m + k * Shape::kM/2],
+            ptr_B[k * Shape::kN + n],
+            tmp);
+
+          ptr_D[m + n * Shape::kM/2] = tmp;
+        }
+      }
+    }
+  }
+};
+
+/////////////////////////////
+// Specialization for NTT  //
+/////////////////////////////
+
+template <typename Shape_>
+struct Mma_HFMA2<
+  Shape_,
+  layout::ColumnMajor,
+  layout::RowMajor,
+  layout::RowMajor,
+  true
+  > {
+
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+
+  /// A operand storage
+  using FragmentA = Array<half_t, Shape::kMK>;
+
+  /// B operand storage
+  using FragmentB = Array<half_t, Shape::kKN>;
+
+  /// C operand storage
+  using FragmentC = Array<half_t, Shape::kMN>;
+
+  /// Underlying mathematical operator
+  using Operator = arch::OpMultiplyAdd;
+
+  static_assert(
+    !(Shape::kN % 2),
+    "Mma_HFMA2 requires the N dimension to be divisible by 2."
+  );
+
+  //
+  // Methods
+  //
+
+  /// Computes a matrix product D = A * B + C
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC & D,
+    FragmentA const & A,
+    FragmentB const & B,
+    FragmentC const & C) {
+
+    /// Initialize output with input
+    D = C;
+
+    /// Use 1x2x1 HFMA2 sequence for bulk of computation
+    using Mma = arch::Mma<
+      gemm::GemmShape<1,2,1>,
+      1,
+      half_t,
+      layout::ColumnMajor,
+      half_t,
+      layout::RowMajor,
+      half_t,
+      layout::RowMajor,
+      arch::OpMultiplyAdd>;
+    // D and B are viewed as pairs of half_t adjacent along N; A is read one element at a time.
+    Array<half_t, 2> *ptr_D = reinterpret_cast<Array<half_t, 2> *>(&D);
+    Array<half_t, 1> const *ptr_A = reinterpret_cast<Array<half_t, 1> const *>(&A);
+    Array<half_t, 2> const *ptr_B = reinterpret_cast<Array<half_t, 2> const *>(&B);
+
+    Mma mma;
+
+    CUTLASS_PRAGMA_UNROLL
+    for(auto k=0; k <  Shape::kK / Mma::Shape::kK; k++){
+
+        CUTLASS_PRAGMA_UNROLL
+        for(auto n=0; n < Shape::kN / Mma::Shape::kN; n++){
+
+          CUTLASS_PRAGMA_UNROLL
+          for(auto m=0; m < Shape::kM / Mma::Shape::kM; m++){
+            // Accumulator pair for columns (2n, 2n+1) of row m.
+            Array<half_t, 2> tmp { ptr_D[m*Shape::kN/2 + n] };
+
+            mma(
+                tmp,
+                ptr_A[k*Shape::kM + m],
+                ptr_B[k*Shape::kN/2 + n],
+                tmp);
+
+            ptr_D[m*Shape::kN/2 + n] = tmp;
+        }
+      }
+    }
+  }
+};
+
+
+/////////////////////////////
+// Specialization for TNN  //
+/////////////////////////////
+
+template <typename Shape_>
+struct Mma_HFMA2 <
+  Shape_,
+  layout::RowMajor,
+  layout::ColumnMajor,
+  layout::ColumnMajor,
+  true
+  > {
+
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+
+  /// A operand storage
+  using FragmentA = Array<half_t, Shape::kMK>;
+
+  /// B operand storage
+  using FragmentB = Array<half_t, Shape::kKN>;
+
+  /// C operand storage
+  using FragmentC = Array<half_t, Shape::kMN>;
+
+  /// Underlying mathematical operator
+  using Operator = arch::OpMultiplyAdd;
+
+  static_assert(
+    !(Shape::kM % 2),
+    "Mma_HFMA2 requires the M dimension to be divisible by 2."
+  );
+
+  //
+  // Methods
+  //
+
+  /// Computes a matrix product D = A * B + C
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC & D,
+    FragmentA const & A,
+    FragmentB const & B,
+    FragmentC const & C) {
+
+    /// Initialize output with input
+    D = C;
+
+    /// Use 2x1x1 HFMA2 sequence for bulk of computation
+    using Mma = arch::Mma<
+      gemm::GemmShape<2,1,1>,
+      1,
+      half_t,
+      layout::RowMajor,
+      half_t,
+      layout::ColumnMajor,
+      half_t,
+      layout::ColumnMajor,
+      arch::OpMultiplyAdd>;
+    // D is viewed as pairs of half_t adjacent along M; B is read one element at a time.
+    Array<half_t, 2> *ptr_D = reinterpret_cast<Array<half_t, 2> *>(&D);
+    Array<half_t, 2> const *ptr_A = reinterpret_cast<Array<half_t, 2> const *>(&A);
+    Array<half_t, 1> const *ptr_B = reinterpret_cast<Array<half_t, 1> const *>(&B);
+
+    Mma mma;
+
+    CUTLASS_PRAGMA_UNROLL
+    for(auto k=0; k <  Shape::kK / Mma::Shape::kK; k++){
+
+      CUTLASS_PRAGMA_UNROLL
+      for(auto m=0; m < Shape::kM / Mma::Shape::kM; m++){
+
+        CUTLASS_PRAGMA_UNROLL
+        for(auto n=0; n < Shape::kN / Mma::Shape::kN; n++){
+            // Accumulator pair for rows (2m, 2m+1) of column n.
+            Array<half_t, 2> tmp { ptr_D[n*Shape::kM/2 + m] };
+            // Row-major A keeps A(2m,k) and A(2m+1,k) Shape::kK elements apart, so gather them one by one.
+            Array<half_t, 2> tmp_A;
+            tmp_A[0] = ptr_A->at(2*m*Shape::kK + k);  // NOTE(review): at() indexes past the first pair, relative to the start of A - assumes no bounds check
+            tmp_A[1] = ptr_A->at((2*m+1)*Shape::kK + k);
+
+            mma(
+                tmp,
+                tmp_A,
+                ptr_B[n*Shape::kK + k],
+                tmp);
+
+            ptr_D[n*Shape::kM/2 + m] = tmp;
+        }
+      }
+    }
+  }
+};
+
+/////////////////////////////
+// Specialization for TNT  //
+/////////////////////////////
+
+template <typename Shape_>
+struct Mma_HFMA2 <
+  Shape_,
+  layout::RowMajor,
+  layout::ColumnMajor,
+  layout::RowMajor,
+  true
+  > {
+
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+
+   /// A operand storage
+  using FragmentA = Array<half_t, Shape::kMK>;
+
+  /// B operand storage
+  using FragmentB = Array<half_t, Shape::kKN>;
+
+  /// C operand storage
+  using FragmentC = Array<half_t, Shape::kMN>;
+
+  /// Underlying mathematical operator
+  using Operator = arch::OpMultiplyAdd;
+
+  static_assert(
+    !(Shape::kN % 2),
+    "Mma_HFMA2 requires the N dimension to be divisible by 2."
+  );
+
+  //
+  // Methods
+  //
+
+  /// Computes a matrix product D = A * B + C
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC & D,
+    FragmentA const & A,
+    FragmentB const & B,
+    FragmentC const & C) {
+
+    /// Initialize output with input
+    D = C;
+
+    /// Use 1x2x1 HFMA2 sequence for bulk of computation
+    using Mma = arch::Mma<
+      gemm::GemmShape<1,2,1>,
+      1,
+      half_t,
+      layout::RowMajor,
+      half_t,
+      layout::ColumnMajor,
+      half_t,
+      layout::RowMajor,
+      arch::OpMultiplyAdd>;
+    // D is viewed as pairs of half_t adjacent along N; A is read one element at a time.
+    Array<half_t, 2> *ptr_D = reinterpret_cast<Array<half_t, 2> *>(&D);
+    Array<half_t, 1> const *ptr_A = reinterpret_cast<Array<half_t, 1> const *>(&A);
+    Array<half_t, 2> const *ptr_B = reinterpret_cast<Array<half_t, 2> const *>(&B);
+
+    Mma mma;
+
+    CUTLASS_PRAGMA_UNROLL
+    for(auto k=0; k <  Shape::kK / Mma::Shape::kK; k++){
+
+        CUTLASS_PRAGMA_UNROLL
+        for(auto n=0; n < Shape::kN / Mma::Shape::kN; n++){
+
+          CUTLASS_PRAGMA_UNROLL
+          for(auto m=0; m < Shape::kM / Mma::Shape::kM; m++){
+            // Accumulator pair for columns (2n, 2n+1) of row m.
+            Array<half_t, 2> tmp { ptr_D[m*Shape::kN/2 + n] };
+            // Column-major B keeps B(k,2n) and B(k,2n+1) Shape::kK elements apart, so gather them one by one.
+            Array<half_t, 2> tmp_B;
+            tmp_B[0] = ptr_B->at(2*n*Shape::kK + k);  // NOTE(review): at() indexes past the first pair, relative to the start of B - assumes no bounds check
+            tmp_B[1] = ptr_B->at((2*n+1)*Shape::kK + k);
+
+            mma(
+                tmp,
+                ptr_A[m*Shape::kK + k],
+                tmp_B,
+                tmp);
+
+            ptr_D[m*Shape::kN/2 + n] = tmp;
+        }
+      }
+    }
+  }
+};
+
+/////////////////////////////
+// Specialization for TTN  //
+/////////////////////////////
+
+template <typename Shape_>
+struct Mma_HFMA2 <
+  Shape_,
+  layout::RowMajor,
+  layout::RowMajor,
+  layout::ColumnMajor,
+  true
+  > {
+
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+
+   /// A operand storage
+  using FragmentA = Array<half_t, Shape::kMK>;
+
+  /// B operand storage
+  using FragmentB = Array<half_t, Shape::kKN>;
+
+  /// C operand storage
+  using FragmentC = Array<half_t, Shape::kMN>;
+
+  /// Underlying mathematical operator
+  using Operator = arch::OpMultiplyAdd;
+
+  static_assert(
+    !(Shape::kM % 2),
+    "Mma_HFMA2 requires the M dimension to be divisible by 2."
+  );
+
+  //
+  // Methods
+  //
+
+  /// Computes a matrix product D = A * B + C
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC & D,
+    FragmentA const & A,
+    FragmentB const & B,
+    FragmentC const & C) {
+
+    /// Initialize output with input
+    D = C;
+
+    /// Use 2x1x1 HFMA2 sequence for bulk of computation
+    using Mma = arch::Mma<
+      gemm::GemmShape<2,1,1>,
+      1,
+      half_t,
+      layout::RowMajor,
+      half_t,
+      layout::RowMajor,
+      half_t,
+      layout::ColumnMajor,
+      arch::OpMultiplyAdd>;
+    // D is viewed as pairs of half_t adjacent along M; B is read one element at a time.
+    Array<half_t, 2> *ptr_D = reinterpret_cast<Array<half_t, 2> *>(&D);
+    Array<half_t, 2> const *ptr_A = reinterpret_cast<Array<half_t, 2> const *>(&A);
+    Array<half_t, 1> const *ptr_B = reinterpret_cast<Array<half_t, 1> const *>(&B);
+
+    Mma mma;
+
+    CUTLASS_PRAGMA_UNROLL
+    for(auto k=0; k <  Shape::kK / Mma::Shape::kK; k++){
+
+      CUTLASS_PRAGMA_UNROLL
+      for(auto m=0; m < Shape::kM / Mma::Shape::kM; m++){
+
+        CUTLASS_PRAGMA_UNROLL
+        for(auto n=0; n < Shape::kN / Mma::Shape::kN; n++){
+            // Accumulator pair for rows (2m, 2m+1) of column n.
+            Array<half_t, 2> tmp { ptr_D[n*Shape::kM/2 + m] };
+            // Row-major A keeps A(2m,k) and A(2m+1,k) Shape::kK elements apart, so gather them one by one.
+            Array<half_t, 2> tmp_A;
+            tmp_A[0] = ptr_A->at(2*m*Shape::kK + k);  // NOTE(review): at() indexes past the first pair, relative to the start of A - assumes no bounds check
+            tmp_A[1] = ptr_A->at((2*m+1)*Shape::kK + k);
+
+            mma(
+                tmp,
+                tmp_A,
+                ptr_B[k*Shape::kN + n],
+                tmp);
+
+            ptr_D[n*Shape::kM/2 + m] = tmp;
+        }
+      }
+    }
+  }
+};
+
+
+/////////////////////////////
+// Specialization for TTT  //
+/////////////////////////////
+
+template <typename Shape_>
+struct Mma_HFMA2<
+  Shape_,
+  layout::RowMajor,
+  layout::RowMajor,
+  layout::RowMajor,
+  true
+  > {
+
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+
+  /// A operand storage
+  using FragmentA = Array<half_t, Shape::kMK>;
+
+  /// B operand storage
+  using FragmentB = Array<half_t, Shape::kKN>;
+
+  /// C operand storage
+  using FragmentC = Array<half_t, Shape::kMN>;
+
+  /// Underlying mathematical operator
+  using Operator = arch::OpMultiplyAdd;
+
+  static_assert(
+    !(Shape::kN % 2),
+    "Mma_HFMA2 requires the N dimension to be divisible by 2."
+  );
+
+  //
+  // Methods
+  //
+
+  /// Computes a matrix product D = A * B + C
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC & D,
+    FragmentA const & A,
+    FragmentB const & B,
+    FragmentC const & C) {
+
+    /// Initialize output with input
+    D = C;
+
+    /// Use 1x2x1 HFMA2 sequence for bulk of computation
+    using Mma = arch::Mma<
+      gemm::GemmShape<1,2,1>,
+      1,
+      half_t,
+      layout::RowMajor,
+      half_t,
+      layout::RowMajor,
+      half_t,
+      layout::RowMajor,
+      arch::OpMultiplyAdd>;
+    // D and B are viewed as pairs of half_t adjacent along N; A is read one element at a time.
+    Array<half_t, 2> *ptr_D = reinterpret_cast<Array<half_t, 2> *>(&D);
+    Array<half_t, 1> const *ptr_A = reinterpret_cast<Array<half_t, 1> const *>(&A);
+    Array<half_t, 2> const *ptr_B = reinterpret_cast<Array<half_t, 2> const *>(&B);
+
+    Mma mma;
+
+    CUTLASS_PRAGMA_UNROLL
+    for(auto k=0; k <  Shape::kK / Mma::Shape::kK; k++){
+
+        CUTLASS_PRAGMA_UNROLL
+        for(auto n=0; n < Shape::kN / Mma::Shape::kN; n++){
+
+          CUTLASS_PRAGMA_UNROLL
+          for(auto m=0; m < Shape::kM / Mma::Shape::kM; m++){
+            // Accumulator pair for columns (2n, 2n+1) of row m.
+            Array<half_t, 2> tmp { ptr_D[m*Shape::kN/2 + n] };
+
+            mma(
+                tmp,
+                ptr_A[m*Shape::kK + k],
+                ptr_B[k*Shape::kN/2 + n],
+                tmp);
+
+            ptr_D[m*Shape::kN/2 + n] = tmp;
+        }
+      }
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////
+// Specialization for TNT + Inner Product  or 1x1x2K + LayoutC = T //
+/////////////////////////////////////////////////////////////////////
+
+template <typename Shape_, typename LayoutA, typename LayoutB>
+struct Mma_HFMA2<
+  Shape_,
+  LayoutA,
+  LayoutB,
+  layout::RowMajor,
+  false
+  > {
+
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+
+  /// A operand storage
+  using FragmentA = Array<half_t, Shape::kMK>;
+
+  /// B operand storage
+  using FragmentB = Array<half_t, Shape::kKN>;
+
+  /// C operand storage
+  using FragmentC = Array<half_t, Shape::kMN>;
+
+  /// Underlying mathematical operator
+  using Operator = arch::OpMultiplyAdd;
+
+  static_assert(
+    !(Shape::kK % 2),
+    "Mma_HFMA2 requires the K dimension to be divisible by 2."
+  );
+
+  //
+  // Methods
+  //
+
+  /// Computes a matrix product D = A * B + C (inner-product form, row-major C/D)
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC & D,
+    FragmentA const & A,
+    FragmentB const & B,
+    FragmentC const & C) {
+
+    /// Initialize output with input
+    D = C;
+
+    /// Use 1x1x2 HFMA2 sequence for bulk of computation
+    using GemmShape = gemm::GemmShape<1,1,2>;
+    // A and B are indexed as K-contiguous pairs (row-major A, column-major B) whatever LayoutA/LayoutB are.
+    Array<half_t, 1> *ptr_D = reinterpret_cast<Array<half_t, 1> *>(&D);
+    Array<half_t, 2> const *ptr_A = reinterpret_cast<Array<half_t, 2> const *>(&A);
+    Array<half_t, 2> const *ptr_B = reinterpret_cast<Array<half_t, 2> const *>(&B);
+
+    // Inner product is calculated using MACs, followed by final reduction
+    multiply_add<Array<half_t, 2>> mac;
+    cutlass::reduction::thread::Reduce< plus<half_t>, Array<half_t, 2> > reduce;
+
+    CUTLASS_PRAGMA_UNROLL
+    for(auto n=0; n < Shape::kN / GemmShape::kN; n++){ 
+
+      CUTLASS_PRAGMA_UNROLL
+      for(auto m=0; m < Shape::kM / GemmShape::kM; m++){
+        // Seed lane 0 with the accumulator for (m, n) and lane 1 with zero; D is row-major, so read the same element that is written below.
+        Array<half_t, 2> tmp_C;
+        tmp_C.clear();
+        Array<half_t, 1> *ptr_tmp_C = reinterpret_cast<Array<half_t, 1> *>(&tmp_C);
+        ptr_tmp_C[0] = ptr_D[m*Shape::kN + n];
+
+        CUTLASS_PRAGMA_UNROLL
+        for(auto k=0; k <  Shape::kK / GemmShape::kK; k++){ 
+          tmp_C = mac(ptr_A[m*Shape::kK/2 + k], ptr_B[n*Shape::kK/2 + k], tmp_C);
+        }
+        // Sum the two lanes to get the final scalar for (m, n).
+        Array<half_t, 1> res;
+        Array<half_t, 1> *ptr_res = &res;
+        res = reduce(tmp_C);
+
+        ptr_D[m*Shape::kN + n] = ptr_res[0];
+      }
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////
+// Specialization for TNN + Inner Product  or 1x1x2K + LayoutC = N //
+/////////////////////////////////////////////////////////////////////
+
+template <typename Shape_, typename LayoutA, typename LayoutB>
+struct Mma_HFMA2<
+  Shape_,
+  LayoutA,
+  LayoutB,
+  layout::ColumnMajor,
+  false
+  > {
+
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+
+  /// A operand storage
+  using FragmentA = Array<half_t, Shape::kMK>;
+
+  /// B operand storage
+  using FragmentB = Array<half_t, Shape::kKN>;
+
+  /// C operand storage
+  using FragmentC = Array<half_t, Shape::kMN>;
+
+  /// Underlying mathematical operator
+  using Operator = arch::OpMultiplyAdd;
+
+  static_assert(
+    !(Shape::kK % 2),
+    "Mma_HFMA2 requires the K dimension to be divisible by 2."
+  );
+
+  //
+  // Methods
+  //
+
+  /// Computes a matrix product D = A * B + C (inner-product form, column-major C/D)
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC & D,
+    FragmentA const & A,
+    FragmentB const & B,
+    FragmentC const & C) {
+
+    /// Initialize output with input
+    D = C;
+
+    /// Use 1x1x2 HFMA2 sequence for bulk of computation
+    using GemmShape= gemm::GemmShape<1,1,2>;
+    // A and B are indexed as K-contiguous pairs (row-major A, column-major B) whatever LayoutA/LayoutB are.
+    Array<half_t, 1> *ptr_D = reinterpret_cast<Array<half_t, 1> *>(&D);
+    Array<half_t, 2> const *ptr_A = reinterpret_cast<Array<half_t, 2> const *>(&A);
+    Array<half_t, 2> const *ptr_B = reinterpret_cast<Array<half_t, 2> const *>(&B);
+
+    // Inner product is calculated using MACs, followed by final reduction
+    multiply_add<Array<half_t, 2>> mac;
+    cutlass::reduction::thread::Reduce< plus<half_t>, Array<half_t, 2> > reduce;
+
+    CUTLASS_PRAGMA_UNROLL
+    for(auto n=0; n < Shape::kN / GemmShape::kN; n++){ 
+
+      CUTLASS_PRAGMA_UNROLL
+      for(auto m=0; m < Shape::kM / GemmShape::kM; m++){
+        // Seed lane 0 with the column-major accumulator for (m, n) and lane 1 with zero.
+        Array<half_t, 2> tmp_C;
+        tmp_C.clear();
+        Array<half_t, 1> *ptr_tmp_C = reinterpret_cast<Array<half_t, 1> *>(&tmp_C);
+        ptr_tmp_C[0] = ptr_D[n*Shape::kM + m];
+
+        CUTLASS_PRAGMA_UNROLL
+        for(auto k=0; k <  Shape::kK / GemmShape::kK; k++){ 
+
+          tmp_C = mac(ptr_A[m*Shape::kK/2 + k], ptr_B[n*Shape::kK/2 + k], tmp_C);
+
+        }
+        // Sum the two lanes to get the final scalar for (m, n).
+        Array<half_t, 1> res;
+        Array<half_t, 1> *ptr_res = &res;
+        res = reduce(tmp_C);
+
+        ptr_D[n*Shape::kM + m] = ptr_res[0];
+      }
+    }
+  }
+};
+
+} // namespace detail
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Thread-level matrix product for all-half_t operands; picks an HFMA2 kernel or falls back to MmaGeneric
+template <
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape_, typename LayoutA, typename LayoutB, typename LayoutC
+>
+struct Mma<
+  Shape_,
+  half_t,
+  LayoutA,
+  half_t,
+  LayoutB,
+  half_t,
+  LayoutC,
+  arch::OpMultiplyAdd
+  > {
+
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+
+  /// Data type of operand A
+  using ElementA = half_t;
+
+  /// Data type of operand B
+  using ElementB = half_t;
+
+  /// Element type of operand C
+  using ElementC = half_t;
+
+  /// Underlying mathematical operator
+  using Operator = arch::OpMultiplyAdd;
+
+  /// A operand storage
+  using FragmentA = Array<ElementA, Shape::kMK>;
+
+  /// B operand storage
+  using FragmentB = Array<ElementB, Shape::kKN>;
+
+  /// C operand storage
+  using FragmentC = Array<ElementC, Shape::kMN>;
+
+  static bool const a_row_major = platform::is_same< LayoutA, layout::RowMajor>::value;
+  static bool const b_column_major = platform::is_same< LayoutB, layout::ColumnMajor>::value;
+  static bool const c_row_major = platform::is_same< LayoutC, layout::RowMajor>::value;
+  static bool const c_column_major = platform::is_same< LayoutC, layout::ColumnMajor>::value;
+  // True when the corresponding extent is even, i.e. it can be split into pairs of halves.
+  static bool const m_mod2 = !(Shape::kM % 2);
+  static bool const n_mod2 = !(Shape::kN % 2);
+  static bool const k_mod2 = !(Shape::kK % 2);
+
+  // HFMA based MMA optimizations are of 2 types :
+  // 1. Inner product 
+  // 2. Outer product
+  // It is chosen based on LayoutC (for outer product gemm) or
+  // Using LayoutA and LayoutB or shape=1x1x2K (for inner product gemms)
+  // If all fails, we choose the generic MMA
+  static bool const use_outer_prod = (c_column_major && m_mod2) || (c_row_major && n_mod2);
+  static bool const use_inner_prod = (a_row_major && b_column_major && k_mod2) || (Shape::kM==1 && Shape::kN==1 && k_mod2);
+  static bool const use_optimized =  (use_outer_prod || use_inner_prod);
+  // use_outer_prod is forwarded as Mma_HFMA2's last argument, so outer product wins when both flags are true.
+  using ArchMmaOperator = typename platform::conditional< use_optimized, 
+    detail::Mma_HFMA2<Shape, LayoutA, LayoutB, LayoutC, use_outer_prod>, 
+    MmaGeneric <Shape, ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, Operator> 
+  >::type;
+
+  //
+  // Methods
+  //
+
+  /// Computes a matrix product D = A * B + C
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC & D,
+    FragmentA const & A,
+    FragmentB const & B,
+    FragmentC const & C) {
+
+    ArchMmaOperator mma;
+
+    mma(D, A, B, C);
+
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+  /// Trait used to enable the row-major-C thread::Mma<> specialization for half_t operands.
+  /// `value` is true when LayoutA and LayoutB are each either RowMajor or ColumnMajor.
+  template <
+    typename LayoutA,
+    /// Layout of B matrix (concept: MatrixLayout)
+    typename LayoutB>
+  struct EnableMma_Crow_SM60 {
+
+    static bool const kIsConventionalLayout =
+      (platform::is_same<LayoutA, layout::RowMajor>::value ||
+        platform::is_same<LayoutA, layout::ColumnMajor>::value) &&
+      (platform::is_same<LayoutB, layout::RowMajor>::value ||
+        platform::is_same<LayoutB, layout::ColumnMajor>::value);
+
+    static bool const value = kIsConventionalLayout;
+  };
+} // namespace detail
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Computes matrix product when C is row-major by delegating to the transposed, column-major-C problem
+template <
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape_,
+  typename LayoutA_,
+  typename LayoutB_
+>
+struct Mma<
+  Shape_,
+  half_t,
+  LayoutA_,
+  half_t,
+  LayoutB_,
+  half_t,
+  layout::RowMajor,
+  arch::OpMultiplyAdd,
+  typename platform::enable_if<detail::EnableMma_Crow_SM60<
+    LayoutA_,
+    LayoutB_
+    >::value>::type>{
+  // NOTE(review): Enable is enable_if<...>::type here, while TransposeMma passes bool - confirm they agree.
+  using Shape = Shape_;
+  using ElementA = half_t;
+  using LayoutA = LayoutA_;
+  using ElementB = half_t;
+  using LayoutB = LayoutB_;
+  using ElementC = half_t;
+  using LayoutC = layout::RowMajor;
+  using Operator = arch::OpMultiplyAdd;
+  /// Transposed problem: transposed shape, operand layouts transposed and swapped, column-major C
+  using TransposeMma = Mma<
+    GemmShapeTranspose<Shape>,
+    half_t,
+    typename layout::LayoutTranspose<LayoutB>::type,
+    half_t,
+    typename layout::LayoutTranspose<LayoutA>::type,
+    half_t,
+    layout::ColumnMajor,
+    arch::OpMultiplyAdd,
+    bool>;
+
+  using FragmentA = Array<ElementA, Shape::kMK>;
+  using FragmentB = Array<ElementB, Shape::kKN>;
+  using FragmentC = Array<ElementC, Shape::kMN>;
+
+  using ArchMmaOperator = typename TransposeMma::ArchMmaOperator;
+  /// Computes D = A * B + C through the transposed operator
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC & D,
+    FragmentA const & A,
+    FragmentB const & B,
+    FragmentC const & C) {
+
+    TransposeMma mma;
+    // Operands are swapped to match the transposed problem: B fills the A slot and A fills the B slot.
+    mma(D, B, A, C);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/thread/mma_sm61.h b/lightllm-kernel/cutlass/include/cutlass/gemm/thread/mma_sm61.h
new file mode 100755
index 000000000..a1abb05f6
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/thread/mma_sm61.h
@@ -0,0 +1,284 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates exposing architecture support for multiply-add operations
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/thread/mma.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Thread-level int8 x int8 -> int32 MMA for row-major A and column-major B, built on 1x1x4 IDP4A
+template <
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape_,
+  /// Layout of C matrix (concept: MatrixLayout)
+  typename LayoutC_
+>
+struct Mma<
+  Shape_,
+  int8_t,
+  layout::RowMajor,
+  int8_t,
+  layout::ColumnMajor,
+  int32_t,
+  LayoutC_,
+  arch::OpMultiplyAdd,
+  bool> {
+
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+
+  /// Data type of operand A
+  using ElementA = int8_t;
+
+  /// Layout of A matrix (concept: layout::MapFunc)
+  using LayoutA = layout::RowMajor;
+
+  /// Data type of operand B
+  using ElementB = int8_t;
+
+  /// Layout of B matrix (concept: layout::MapFunc)
+  using LayoutB = layout::ColumnMajor;
+
+  /// Element type of operand C
+  using ElementC = int32_t;
+
+  /// Layout of C matrix (concept: layout::MapFunc)
+  using LayoutC = LayoutC_;
+
+  /// Underlying mathematical operator
+  using Operator = arch::OpMultiplyAdd;
+
+  /// A operand storage
+  using FragmentA = Array<ElementA, Shape::kMK>;
+
+  /// B operand storage
+  using FragmentB = Array<ElementB, Shape::kKN>;
+
+  /// C operand storage
+  using FragmentC = Array<ElementC, Shape::kMN>;
+
+  /// Underlying matrix multiply operator (concept: arch::Mma)
+  //  Use 1x1x4 IDP4A sequence for bulk of computation
+  using ArchMmaOperator = arch::Mma<
+      gemm::GemmShape<1,1,4>,
+      1,
+      ElementA,
+      LayoutA,
+      ElementB,
+      LayoutB,
+      ElementC,
+      LayoutC,
+      arch::OpMultiplyAdd>; 
+
+  //
+  // Methods
+  //
+
+  /// Computes a matrix product D = A * B + C
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC & D,
+    FragmentA const & A,
+    FragmentB const & B,
+    FragmentC const & C) {
+
+    TensorRef<ElementC, LayoutC> d(
+      reinterpret_cast<ElementC *>(&D), LayoutC::packed({ Shape::kM, Shape::kN }));
+    
+    // Copy accumulators
+    D = C;
+
+    /// Use 1x1x4 IDP4A sequence for bulk of computation
+    ArchMmaOperator mma;
+
+    // View A and B as groups of four int8 values; loop-invariant, so computed once up front.
+    Array<int8_t, 4> const *ptr_A = reinterpret_cast<Array<int8_t, 4> const *>(&A);
+    Array<int8_t, 4> const *ptr_B = reinterpret_cast<Array<int8_t, 4> const *>(&B);
+
+    // Compute matrix product
+    CUTLASS_PRAGMA_UNROLL
+    for (int k = 0; k < Shape::kK / ArchMmaOperator::Shape::kK; ++k) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = 0; n < Shape::kN; ++n) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int m = 0; m < Shape::kM; ++m) {
+          MatrixCoord mn(m, n);
+
+          Array<int32_t, 1> tmp = reinterpret_cast<Array<int32_t, 1> &>(d.at(mn));
+
+          mma(
+            tmp,
+            ptr_A[m * Shape::kK / ArchMmaOperator::Shape::kK + k],
+            ptr_B[n * Shape::kK / ArchMmaOperator::Shape::kK + k],
+            tmp);
+
+          d.at(mn) = reinterpret_cast<int32_t &>(tmp);
+        }
+      }
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Template that handles column-major A and row-major B int8 operands using 1x1x4 IDP4A
+template <
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape_,
+  /// Layout of C matrix (concept: MatrixLayout)
+  typename LayoutC_
+>
+struct Mma<
+  Shape_,
+  int8_t,
+  layout::ColumnMajor,
+  int8_t,
+  layout::RowMajor,
+  int32_t,
+  LayoutC_,
+  arch::OpMultiplyAdd,
+  int8_t> {
+  // NOTE(review): the last argument is int8_t here but bool in the RowMajor x ColumnMajor sibling - confirm intent.
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+
+  /// Data type of operand A
+  using ElementA = int8_t;
+
+  /// Layout of A matrix (concept: layout::MapFunc)
+  using LayoutA = layout::ColumnMajor;
+
+  /// Data type of operand B
+  using ElementB = int8_t;
+
+  /// Layout of B matrix (concept: layout::MapFunc)
+  using LayoutB = layout::RowMajor;
+
+  /// Element type of operand C
+  using ElementC = int32_t;
+
+  /// Layout of C matrix (concept: layout::MapFunc)
+  using LayoutC = LayoutC_;
+
+  /// Underlying mathematical operator
+  using Operator = arch::OpMultiplyAdd;
+
+  /// A operand storage
+  using FragmentA = Array<ElementA, Shape::kMK>;
+
+  /// B operand storage
+  using FragmentB = Array<ElementB, Shape::kKN>;
+
+  /// C operand storage
+  using FragmentC = Array<ElementC, Shape::kMN>;
+
+  /// Underlying matrix multiply operator (concept: arch::Mma)
+  /// Use 1x1x4 IDP4A sequence for bulk of computation
+  using ArchMmaOperator = arch::Mma<
+      gemm::GemmShape<1,1,4>,
+      1,
+      ElementA,
+      LayoutA,
+      ElementB,
+      LayoutB,
+      ElementC,
+      LayoutC,
+      arch::OpMultiplyAdd>; 
+
+  //
+  // Methods
+  //
+
+  /// Computes a matrix product D = A * B + C
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentC & D,
+    FragmentA const & A,
+    FragmentB const & B,
+    FragmentC const & C) {
+
+    TensorRef<ElementC, LayoutC> d(
+      reinterpret_cast<ElementC *>(&D), LayoutC::packed({ Shape::kM, Shape::kN }));
+    
+    // Copy accumulators
+    D = C;
+
+    /// Underlying matrix multiply operator
+    ArchMmaOperator mma;
+    // View A and B as groups of four int8 values; computed once, outside the loops.
+    Array<int8_t, 4> const *ptr_A = reinterpret_cast<Array<int8_t, 4> const *>(&A);
+    Array<int8_t, 4> const *ptr_B = reinterpret_cast<Array<int8_t, 4> const *>(&B);
+
+    // Compute matrix product
+    CUTLASS_PRAGMA_UNROLL
+    for (int k = 0; k < Shape::kK / ArchMmaOperator::Shape::kK; ++k) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = 0; n < Shape::kN; ++n) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int m = 0; m < Shape::kM; ++m) {
+          MatrixCoord mn(m, n);
+          // Groups are indexed k-major here: m + k*kM for A and n + k*kN for B.
+          Array<int32_t, 1> tmp = reinterpret_cast<Array<int32_t, 1> &>(d.at(mn));
+
+          mma(
+            tmp,
+            ptr_A[m + k * Shape::kM],
+            ptr_B[n + k * Shape::kN],
+            tmp);
+
+          d.at(mn) = reinterpret_cast<int32_t &>(tmp);
+        }
+      }
+    }
+  }
+};
+
+} // namespace thread
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_ell_mma.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_ell_mma.h
new file mode 100755
index 000000000..fba281264
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_ell_mma.h
@@ -0,0 +1,734 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Default template for a Blocked-Ell MMA.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/arch/wmma.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+#include "cutlass/gemm/threadblock/default_mma_core_wmma.h"
+#endif //CUTLASS_ARCH_WMMA_ENABLED
+
+#include "cutlass/gemm/threadblock/ell_mma_pipelined.h"
+#include "cutlass/gemm/threadblock/ell_mma_multistage.h"
+#include "cutlass/transform/threadblock/ell_predicated_tile_iterator.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Operator class tag
+    typename OperatorClass_,
+    /// Tag indicating architecture to tune for
+    typename ArchTag_,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Store the accumulators in row major or column major.  Row major is used
+    /// when output layout is interleaved.
+    bool AccumulatorsInRowMajor = false
+    >
+struct DefaultEllMma;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for row-major output (OperatorClass Simt) with a two-stage pipelined mainloop
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Operation performed by GEMM
+    typename Operator>
+struct DefaultEllMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
+                  kAlignmentB, ElementAccumulator, layout::RowMajor,
+                  arch::OpClassSimt, ArchTag, ThreadblockShape, WarpShape,
+                  InstructionShape, 2, Operator, false> {
+  // Define the MmaCore components (stage count fixed at 2 to match this specialization)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor,
+      arch::OpClassSimt, 2, Operator>;
+
+  // Define iterators over tiles from the A operand (M x K tile, kAlignmentA elements per access)
+  using IteratorA =
+      cutlass::transform::threadblock::EllPredicatedTileIterator<
+          cutlass::MatrixShape<MmaCore::Shape::kM, MmaCore::Shape::kK>,
+          ElementA, LayoutA, 1, typename MmaCore::IteratorThreadMapA, kAlignmentA>;
+
+  // Define iterators over tiles from the B operand (K x N tile, kAlignmentB elements per access)
+  using IteratorB =
+      cutlass::transform::threadblock::EllPredicatedTileIterator<
+          cutlass::MatrixShape<MmaCore::Shape::kK, MmaCore::Shape::kN>,
+          ElementB, LayoutB, 0, typename MmaCore::IteratorThreadMapB, kAlignmentB>;
+
+  // Define the threadblock-scoped pipelined matrix multiply
+  using ThreadblockMma = cutlass::gemm::threadblock::EllMmaPipelined<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      IteratorB, typename MmaCore::SmemIteratorB, ElementAccumulator,
+      layout::RowMajor, typename MmaCore::MmaPolicy>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for row-major output (OperatorClass TensorOp) with a two-stage pipelined mainloop
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Operation performed by GEMM
+    typename Operator
+    >
+struct DefaultEllMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
+                  kAlignmentB, ElementAccumulator, layout::RowMajor,
+                  arch::OpClassTensorOp, ArchTag, ThreadblockShape, WarpShape,
+                  InstructionShape, 2, Operator, false> {
+  // Define the MmaCore components (stage count fixed at 2 to match this specialization)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor,
+      arch::OpClassTensorOp, 2, Operator>;
+
+  // Define iterators over tiles from the A operand (M x K tile, kAlignmentA elements per access)
+  using IteratorA =
+      cutlass::transform::threadblock::EllPredicatedTileIterator<
+          cutlass::MatrixShape<MmaCore::Shape::kM, MmaCore::Shape::kK>,
+          ElementA, LayoutA, 1, typename MmaCore::IteratorThreadMapA, kAlignmentA>;
+
+  // Define iterators over tiles from the B operand (K x N tile, kAlignmentB elements per access)
+  using IteratorB =
+      cutlass::transform::threadblock::EllPredicatedTileIterator<
+          cutlass::MatrixShape<MmaCore::Shape::kK, MmaCore::Shape::kN>,
+          ElementB, LayoutB, 0, typename MmaCore::IteratorThreadMapB, kAlignmentB>;
+
+  // Define the threadblock-scoped pipelined matrix multiply
+  using ThreadblockMma = cutlass::gemm::threadblock::EllMmaPipelined<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      IteratorB, typename MmaCore::SmemIteratorB, ElementAccumulator,
+      layout::RowMajor, typename MmaCore::MmaPolicy>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization for row-major output (OperatorClass TensorOp) with float A, B and accumulator
+template <
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Operation performed by GEMM
+    typename Operator
+    >
+struct DefaultEllMma<float, LayoutA, kAlignmentA, float, LayoutB,
+                  kAlignmentB, float, layout::RowMajor,
+                  arch::OpClassTensorOp, ArchTag, ThreadblockShape, WarpShape,
+                  InstructionShape, 2, Operator, false> {
+  // Define the MmaCore components; arch::OpMultiplyAddFastF16 is passed here instead of Operator
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, float, LayoutA, float,
+      LayoutB, float, layout::RowMajor, arch::OpClassTensorOp, 2,
+      arch::OpMultiplyAddFastF16>;
+
+  // Define iterators over tiles from the A operand
+  using IteratorA =
+      cutlass::transform::threadblock::EllPredicatedTileIterator<
+          cutlass::MatrixShape<MmaCore::Shape::kM, MmaCore::Shape::kK>,
+          float, LayoutA, 1, typename MmaCore::IteratorThreadMapA, kAlignmentA>;
+
+  // Define iterators over tiles from the B operand
+  using IteratorB =
+      cutlass::transform::threadblock::EllPredicatedTileIterator<
+          cutlass::MatrixShape<MmaCore::Shape::kK, MmaCore::Shape::kN>,
+          float, LayoutB, 0, typename MmaCore::IteratorThreadMapB, kAlignmentB>;
+
+  // Define the threadblock-scoped pipelined matrix multiply
+  using ThreadblockMma = cutlass::gemm::threadblock::EllMmaPipelined<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      IteratorB, typename MmaCore::SmemIteratorB, float,
+      layout::RowMajor, typename MmaCore::MmaPolicy>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for column-major-interleaved output (two stages, accumulators in row major)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Number of Interleaved K
+    int InterleavedK>
+struct DefaultEllMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
+                  kAlignmentB, ElementAccumulator,
+                  layout::ColumnMajorInterleaved<InterleavedK>, OperatorClass,
+                  ArchTag, ThreadblockShape, WarpShape, InstructionShape, 2,
+                  Operator, true> {
+  // Define the MmaCore components
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator,
+      layout::ColumnMajorInterleaved<InterleavedK>, OperatorClass, 2, Operator,
+      true>;
+  // kAlignmentA/B are not forwarded to the iterators below; each must equal 128 bits' worth of elements.
+  static_assert(kAlignmentA == 128 / sizeof_bits<ElementA>::value, 
+    "Alignment must match thread data map's vector length");
+
+  static_assert(kAlignmentB ==128 / sizeof_bits<ElementB>::value,
+    "Alignment must match thread data map's vector length");
+
+  // Define iterators over tiles from the A operand
+  using IteratorA = cutlass::transform::threadblock::EllPredicatedTileIterator<
+      cutlass::MatrixShape<MmaCore::Shape::kM, MmaCore::Shape::kK>, ElementA,
+      LayoutA, 1, typename MmaCore::IteratorThreadMapA>;
+
+  // Define iterators over tiles from the B operand
+  using IteratorB = cutlass::transform::threadblock::EllPredicatedTileIterator<
+      cutlass::MatrixShape<MmaCore::Shape::kK, MmaCore::Shape::kN>, ElementB,
+      LayoutB, 0, typename MmaCore::IteratorThreadMapB>;
+
+  // Define the threadblock-scoped pipelined matrix multiply
+  using ThreadblockMma = cutlass::gemm::threadblock::EllMmaPipelined<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      IteratorB, typename MmaCore::SmemIteratorB, ElementAccumulator,
+      layout::ColumnMajorInterleaved<InterleavedK>,
+      typename MmaCore::MmaPolicy>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for row-major output (OperatorClass Simt) using the multistage mainloop
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the multistage mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator
+    >
+struct DefaultEllMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
+                  kAlignmentB, ElementAccumulator, layout::RowMajor,
+                  arch::OpClassSimt, ArchTag, ThreadblockShape, WarpShape,
+                  InstructionShape, Stages, Operator, false> {
+  // Define the MmaCore components
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+      Stages, Operator>;
+
+  // Define iterators over tiles from the A operand; each access is an Array of kAlignmentA elements
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::Array<ElementA, kAlignmentA>;
+  using IteratorA =
+      cutlass::transform::threadblock::EllPredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+          ElementA, LayoutA, 1, ThreadMapA, AccessTypeA>;
+
+  // Define iterators over tiles from the B operand; each access is an Array of kAlignmentB elements
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::Array<ElementB, kAlignmentB>;
+  using IteratorB =
+      cutlass::transform::threadblock::EllPredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+          ElementB, LayoutB, 0, ThreadMapB, AccessTypeB>;
+
+  // Define the threadblock-scoped multistage matrix multiply
+  using ThreadblockMma = cutlass::gemm::threadblock::EllMmaMultistage<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
+      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
+      typename MmaCore::MmaPolicy, Stages>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for row-major output (OperatorClass TensorOp, multistage mainloop)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the multistage mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator
+    >
+struct DefaultEllMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
+                  kAlignmentB, ElementAccumulator, layout::RowMajor,
+                  arch::OpClassTensorOp, ArchTag, ThreadblockShape, WarpShape,
+                  InstructionShape, Stages, Operator, false> {
+  static cutlass::arch::CacheOperation::Kind const CacheOpA =
+      ((sizeof_bits<ElementA>::value * kAlignmentA) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  static cutlass::arch::CacheOperation::Kind const CacheOpB =
+      ((sizeof_bits<ElementB>::value * kAlignmentB) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  // Define the MmaCore components (CacheOpA/CacheOpB above are Global only for 128-bit accesses)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, Operator, false, CacheOpA, CacheOpB>;
+
+  // Define iterators over tiles from the A operand (access type: kAlignmentA elements of ElementA)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::Array<ElementA, kAlignmentA>;
+  using IteratorA =
+      cutlass::transform::threadblock::EllPredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+          ElementA, LayoutA, 1, ThreadMapA, AccessTypeA>;
+
+  // Define iterators over tiles from the B operand (access type: kAlignmentB elements of ElementB)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::Array<ElementB, kAlignmentB>;
+  using IteratorB =
+      cutlass::transform::threadblock::EllPredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+          ElementB, LayoutB, 0, ThreadMapB, AccessTypeB>;
+
+  // Define the threadblock-scoped multistage matrix multiply
+  using ThreadblockMma = cutlass::gemm::threadblock::EllMmaMultistage<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
+      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
+      typename MmaCore::MmaPolicy, Stages>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for column-major-interleaved output (multistage mainloop)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the multistage mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Number of Interleaved K
+    int InterleavedK>
+struct DefaultEllMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
+                  kAlignmentB, ElementAccumulator,
+                  layout::ColumnMajorInterleaved<InterleavedK>, OperatorClass,
+                  ArchTag, ThreadblockShape, WarpShape, InstructionShape,
+                  Stages, Operator, true> {
+  // Define the MmaCore components
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator,
+      layout::ColumnMajorInterleaved<InterleavedK>, OperatorClass, Stages,
+      Operator, true>;
+
+  // Define iterators over tiles from the A operand (access type: kAlignmentA elements of ElementA)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::Array<ElementA, kAlignmentA>;
+  using IteratorA =
+      cutlass::transform::threadblock::EllPredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+          ElementA, LayoutA, 1, ThreadMapA, AccessTypeA>;
+
+  // Define iterators over tiles from the B operand (access type: kAlignmentB elements of ElementB)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::Array<ElementB, kAlignmentB>;
+  using IteratorB =
+      cutlass::transform::threadblock::EllPredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+          ElementB, LayoutB, 0, ThreadMapB, AccessTypeB>;
+
+  // Define the threadblock-scoped multistage matrix multiply (accumulator layout is layout::RowMajor)
+  using ThreadblockMma = cutlass::gemm::threadblock::EllMmaMultistage<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
+      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
+      typename MmaCore::MmaPolicy, Stages>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for SIMT IDP4A Kernels (int8_t A/B operands, GemmShape<1, 1, 4> instruction, 2 stages)
+template <
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape>
+struct DefaultEllMma<int8_t, LayoutA, kAlignmentA, int8_t, LayoutB, kAlignmentB,
+                  ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+                  ArchTag, ThreadblockShape, WarpShape, GemmShape<1, 1, 4>, 2,
+                  Operator, false> {
+  using InstructionShape = GemmShape<1, 1, 4>;
+  using ElementA = int8_t;
+  using ElementB = int8_t;
+  using OperatorClass =  arch::OpClassSimt;
+
+  static const bool transposeA =  cutlass::platform::is_same< LayoutA, layout::ColumnMajor >::value;
+  static const bool transposeB =  cutlass::platform::is_same< LayoutB, layout::RowMajor >::value;
+
+  // Define the MmaCore components
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor,
+      OperatorClass, 2, Operator>;
+
+  // Define iterators over tiles from the A operand (transposeA is set when LayoutA is ColumnMajor)
+  using IteratorA =
+      cutlass::transform::threadblock::PredicatedTileIterator2dThreadTile<
+          cutlass::MatrixShape<MmaCore::Shape::kM, MmaCore::Shape::kK>,
+          ElementA, LayoutA, 1, typename MmaCore::IteratorThreadMapA, transposeA>;
+
+  // Define iterators over tiles from the B operand (transposeB is set when LayoutB is RowMajor)
+  using IteratorB =
+      cutlass::transform::threadblock::PredicatedTileIterator2dThreadTile<
+          cutlass::MatrixShape<MmaCore::Shape::kK, MmaCore::Shape::kN>,
+          ElementB, LayoutB, 0, typename MmaCore::IteratorThreadMapB, transposeB>;
+
+  // Define the threadblock-scoped pipelined matrix multiply
+  using ThreadblockMma = cutlass::gemm::threadblock::EllMmaPipelined<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      IteratorB, typename MmaCore::SmemIteratorB, ElementAccumulator,
+      layout::RowMajor, typename MmaCore::MmaPolicy>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+/// Specialization for Wmma TensorOp operator with 2 staged pipeline
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Layout type for C and D matrix operands
+    typename LayoutC,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Operation performed by GEMM
+    typename Operator>
+struct DefaultEllMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
+                  kAlignmentB, ElementAccumulator, LayoutC,
+                  arch::OpClassWmmaTensorOp, ArchTag, ThreadblockShape, WarpShape,
+                  InstructionShape, 2, Operator, false> {
+  // Define the MmaCore components
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator, LayoutC,
+      arch::OpClassWmmaTensorOp, 2, Operator>;
+
+  // Define iterators over tiles from the A operand
+  using IteratorA =
+      cutlass::transform::threadblock::EllPredicatedTileIterator<
+          cutlass::MatrixShape<MmaCore::Shape::kM, MmaCore::Shape::kK>,
+          ElementA, LayoutA, 1, typename MmaCore::IteratorThreadMapA, kAlignmentA>;
+
+  // Define iterators over tiles from the B operand
+  using IteratorB =
+      cutlass::transform::threadblock::EllPredicatedTileIterator<
+          cutlass::MatrixShape<MmaCore::Shape::kK, MmaCore::Shape::kN>,
+          ElementB, LayoutB, 0, typename MmaCore::IteratorThreadMapB, kAlignmentB>;
+
+  // Define the threadblock-scoped pipelined (two-stage) matrix multiply
+  using ThreadblockMma = cutlass::gemm::threadblock::EllMmaPipelined<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      IteratorB, typename MmaCore::SmemIteratorB, ElementAccumulator,
+      LayoutC, typename MmaCore::MmaPolicy>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for Wmma TensorOp operator with 1 staged pipeline
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Layout type for C and D matrix operands
+    typename LayoutC,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Operation performed by GEMM
+    typename Operator>
+struct DefaultEllMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
+                  kAlignmentB, ElementAccumulator, LayoutC,
+                  arch::OpClassWmmaTensorOp, ArchTag, ThreadblockShape, WarpShape,
+                  InstructionShape, 1, Operator, false> {
+  // Define the MmaCore components
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator, LayoutC,
+      arch::OpClassWmmaTensorOp, 1, Operator>; 
+
+  // Define iterators over tiles from the A operand
+  using IteratorA =
+      cutlass::transform::threadblock::EllPredicatedTileIterator<
+          cutlass::MatrixShape<MmaCore::Shape::kM, MmaCore::Shape::kK>,
+          ElementA, LayoutA, 1, typename MmaCore::IteratorThreadMapA, kAlignmentA>;
+
+  // Define iterators over tiles from the B operand
+  using IteratorB =
+      cutlass::transform::threadblock::EllPredicatedTileIterator<
+          cutlass::MatrixShape<MmaCore::Shape::kK, MmaCore::Shape::kN>,
+          ElementB, LayoutB, 0, typename MmaCore::IteratorThreadMapB, kAlignmentB>;
+
+  // Define the threadblock-scoped singlestage matrix multiply (MmaSingleStage)
+  using ThreadblockMma = cutlass::gemm::threadblock::MmaSingleStage<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      IteratorB, typename MmaCore::SmemIteratorB, ElementAccumulator,
+      LayoutC, typename MmaCore::MmaPolicy>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+#endif //CUTLASS_ARCH_WMMA_ENABLED
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass 
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_gemv_core.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_gemv_core.h
new file mode 100755
index 000000000..404e18919
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_gemv_core.h
@@ -0,0 +1,151 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines basic properties needed by CTA-level batched GEMV assuming expectations about data
+      layout of the global memory fragments, data types, and internal tile sizes.
+
+      Partial specializations for threadblock::Mma operations targeting SIMT instructions.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/layout/matrix.h"
+
+#include "cutlass/platform/platform.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/thread/mma.h"
+
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+#include "cutlass/transform/pitch_linear_thread_map.h"
+
+#include "cutlass/gemm/threadblock/gemv.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+/// Template defining default vector-matrix multiply operators inferred from the threadblock tile
+/// size and the global memory data layout.
+template <
+  typename Shape_,            /// Shape of the threadblock vector-matrix multiply operator
+  typename ThreadShape_,      /// Shape of per-thread vector-matrix multiply operator
+  typename ElementA_,         /// Element data type of A operand
+  typename LayoutA_,          /// Layout of operand A
+  typename ElementB_,         /// Element data type of B operand
+  typename LayoutB_,          /// Layout of operand B
+  typename ElementC_,         /// Data type of accumulator
+  typename LayoutC_           /// Layout of accumulator
+>
+struct DefaultGemvCore {
+
+  using Shape = Shape_;
+  using ThreadShape = ThreadShape_;
+
+  using LayoutA = LayoutA_;
+  using LayoutB = LayoutB_;
+  using LayoutC = LayoutC_;
+  
+  using ElementA = ElementA_;
+  using ElementB = ElementB_;
+  using ElementC = ElementC_;
+
+  static int const kThreadsPerN = Shape::kN / ThreadShape::kN;
+
+  using IteratorPolicyA = typename platform::conditional<
+                            platform::is_same<LayoutA, layout::RowMajor>::value,
+                            cutlass::transform::PitchLinearTilePolicyStripminedThreadContiguous<
+                              layout::PitchLinearShape<Shape::kK, Shape::kM>, 1, ThreadShape::kK>,
+                            cutlass::transform::PitchLinearTilePolicyStripminedThreadStrided<
+                              layout::PitchLinearShape<Shape::kM, Shape::kK>, 1, ThreadShape::kM>>::type;
+
+  using IteratorA = cutlass::transform::threadblock::PredicatedTileIterator<
+                          cutlass::MatrixShape<Shape::kM, Shape::kK>, ElementA, LayoutA, 1, IteratorPolicyA>;
+
+  using IteratorPolicyB = typename platform::conditional<
+                            platform::is_same<LayoutB, layout::RowMajor>::value,
+                            cutlass::transform::PitchLinearTilePolicyStripminedThreadContiguous<
+                              layout::PitchLinearShape<Shape::kN, Shape::kK>, kThreadsPerN, ThreadShape::kN>,
+                            cutlass::transform::PitchLinearTilePolicyStripminedThreadStrided<
+                              layout::PitchLinearShape<Shape::kK, Shape::kN>, kThreadsPerN, ThreadShape::kK>>::type;
+
+  using IteratorB = cutlass::transform::threadblock::PredicatedTileIterator<
+                            cutlass::MatrixShape<Shape::kK, Shape::kN>, ElementB, LayoutB, 0, IteratorPolicyB>;
+
+  using IteratorPolicyC = typename platform::conditional<
+                            platform::is_same<LayoutC, layout::RowMajor>::value,
+                            cutlass::transform::PitchLinearTilePolicyStripminedThreadContiguous<
+                              layout::PitchLinearShape<Shape::kN, Shape::kM>, kThreadsPerN, ThreadShape::kN>,
+                            cutlass::transform::PitchLinearTilePolicyStripminedThreadStrided<
+                              layout::PitchLinearShape<Shape::kM, Shape::kN>, kThreadsPerN, ThreadShape::kM>>::type;
+
+  using IteratorC = cutlass::transform::threadblock::PredicatedTileIterator<
+                             cutlass::MatrixShape<Shape::kM, Shape::kN>, ElementC, LayoutC, 0, IteratorPolicyC>;
+
+  using MmaSimtOp = typename cutlass::gemm::thread::Mma<
+    cutlass::gemm::GemmShape<ThreadShape::kM, ThreadShape::kN, Shape::kK>,
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC>;
+
+  using Operator = MmaSimtOp;
+
+  // Compile-time checks on the tile shapes (M must be 1; K divisibility and allowed values)
+  static_assert((Shape::kM == 1), "M=1 is required for GEMV");
+  
+  static_assert((ThreadShape::kM == 1), "M=1 is required for GEMV");
+
+  static_assert(Shape::kK % ThreadShape::kK == 0, "Shape::K must be a multiple of ThreadShape::K");
+
+  static_assert(((ThreadShape::kK == 1) ||
+                (ThreadShape::kK == 2) || 
+                (ThreadShape::kK == 4) ||
+                (ThreadShape::kK == 8) ||
+                (ThreadShape::kK == 16) ||
+                (ThreadShape::kK == 32)
+               ),
+              "ThreadShape::K must be a 1, 2, 4, 8, 16 or 32");
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma.h
new file mode 100755
index 000000000..8885d1ffc
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma.h
@@ -0,0 +1,823 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/arch/wmma.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/permute.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+#include "cutlass/gemm/threadblock/default_mma_core_wmma.h"
+#endif //CUTLASS_ARCH_WMMA_ENABLED
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Operator class tag
+    typename OperatorClass_,
+    /// Tag indicating architecture to tune for
+    typename ArchTag_,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Store the accumulators in row major or column major.  Row major is used
+    /// when output layout is interleaved.
+    bool AccumulatorsInRowMajor = false,
+    /// Use zfill or predicate for out-of-bound cp.async
+    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone,
+    /// Gather operand A by using an index array
+    bool GatherA = false,
+    /// Gather operand B by using an index array
+    bool GatherB = false,
+    /// Permute operand A
+    typename PermuteALayout = layout::NoPermute,
+    /// Permute operand B
+    typename PermuteBLayout = layout::NoPermute
+    >
+struct DefaultMma;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for row-major output (OperatorClass Simt, 2-stage pipelined mainloop)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Layout type for C and D matrix operands (must be RowMajor or AffineRankN<2>)
+    typename LayoutC,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Gather operand A by using an index array
+    bool GatherA,
+    /// Gather operand B by using an index array
+    bool GatherB,
+    /// Permute operand A
+    typename PermuteALayout,
+    /// Permute operand B
+    typename PermuteBLayout
+    >
+struct DefaultMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
+                  kAlignmentB, ElementAccumulator, LayoutC,
+                  arch::OpClassSimt, ArchTag, ThreadblockShape, WarpShape,
+                  InstructionShape, 2, Operator, false, SharedMemoryClearOption::kNone,
+                  GatherA, GatherB, PermuteALayout, PermuteBLayout> {
+
+  static_assert(platform::is_same<LayoutC, layout::RowMajor>::value
+             || platform::is_same<LayoutC, layout::AffineRankN<2>>::value,
+             "simt epilogue must be row major");
+
+  // Define the MmaCore components
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator, LayoutC,
+      arch::OpClassSimt, 2, Operator>;
+
+  // Define iterators over tiles from the A operand (with optional gather/permute)
+  using IteratorA =
+      cutlass::transform::threadblock::PredicatedTileIterator<
+          cutlass::MatrixShape<MmaCore::Shape::kM, MmaCore::Shape::kK>,
+          ElementA, LayoutA, 1, typename MmaCore::IteratorThreadMapA, kAlignmentA,
+          GatherA, PermuteALayout>;
+
+  // Define iterators over tiles from the B operand (with optional gather/permute)
+  using IteratorB =
+      cutlass::transform::threadblock::PredicatedTileIterator<
+          cutlass::MatrixShape<MmaCore::Shape::kK, MmaCore::Shape::kN>,
+          ElementB, LayoutB, 0, typename MmaCore::IteratorThreadMapB, kAlignmentB,
+          GatherB, PermuteBLayout>;
+
+  // Define the threadblock-scoped pipelined matrix multiply
+  using ThreadblockMma = cutlass::gemm::threadblock::MmaPipelined<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      IteratorB, typename MmaCore::SmemIteratorB, ElementAccumulator,
+      LayoutC, typename MmaCore::MmaPolicy>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for row-major output (OperatorClass TensorOp) with a two-stage pipelined mainloop
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Use zfill or predicate for out-of-bound cp.async
+    SharedMemoryClearOption SharedMemoryClear,
+    /// Gather operand A by using an index array
+    bool GatherA,
+    /// Gather operand B by using an index array
+    bool GatherB,
+    /// Permute operand A
+    typename PermuteALayout,
+    /// Permute operand B
+    typename PermuteBLayout
+    >
+struct DefaultMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
+                  kAlignmentB, ElementAccumulator, layout::RowMajor,
+                  arch::OpClassTensorOp, ArchTag, ThreadblockShape, WarpShape,
+                  InstructionShape, 2, Operator, false, SharedMemoryClear,
+                  GatherA, GatherB, PermuteALayout, PermuteBLayout> {
+  // Define the MmaCore components (stage count is fixed at 2 for this specialization)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor,
+      arch::OpClassTensorOp, 2, Operator>;
+
+  // Define iterators over tiles from the A operand (kM x kK tile; honors kAlignmentA, GatherA and PermuteALayout)
+  using IteratorA =
+      cutlass::transform::threadblock::PredicatedTileIterator<
+          cutlass::MatrixShape<MmaCore::Shape::kM, MmaCore::Shape::kK>,
+          ElementA, LayoutA, 1, typename MmaCore::IteratorThreadMapA, kAlignmentA,
+          GatherA, PermuteALayout>;
+
+  // Define iterators over tiles from the B operand (kK x kN tile; honors kAlignmentB, GatherB and PermuteBLayout)
+  using IteratorB =
+      cutlass::transform::threadblock::PredicatedTileIterator<
+          cutlass::MatrixShape<MmaCore::Shape::kK, MmaCore::Shape::kN>,
+          ElementB, LayoutB, 0, typename MmaCore::IteratorThreadMapB, kAlignmentB,
+          GatherB, PermuteBLayout>;
+
+  // Define the threadblock-scoped pipelined matrix multiply (accumulates into a row-major layout)
+  using ThreadblockMma = cutlass::gemm::threadblock::MmaPipelined<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      IteratorB, typename MmaCore::SmemIteratorB, ElementAccumulator,
+      layout::RowMajor, typename MmaCore::MmaPolicy>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization for float A/B/accumulator with row-major output (OperatorClass TensorOp, 2 stages, SharedMemoryClearOption::kNone)
+template <
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Operation performed by GEMM (matched by the specialization but not forwarded to MmaCore below)
+    typename Operator,
+    /// Gather operand A by using an index array
+    bool GatherA,
+    /// Gather operand B by using an index array
+    bool GatherB,
+    /// Permute operand A
+    typename PermuteALayout,
+    /// Permute operand B
+    typename PermuteBLayout
+    >
+struct DefaultMma<float, LayoutA, kAlignmentA, float, LayoutB,
+                  kAlignmentB, float, layout::RowMajor,
+                  arch::OpClassTensorOp, ArchTag, ThreadblockShape, WarpShape,
+                  InstructionShape, 2, Operator, false, SharedMemoryClearOption::kNone,
+                  GatherA, GatherB, PermuteALayout, PermuteBLayout> {
+  // Define the MmaCore components; the math operator is always arch::OpMultiplyAddFastF16 here
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, float, LayoutA, float,
+      LayoutB, float, layout::RowMajor, arch::OpClassTensorOp, 2,
+      arch::OpMultiplyAddFastF16>;
+
+  // Define iterators over tiles from the A operand (float elements read from global memory)
+  using IteratorA =
+      cutlass::transform::threadblock::PredicatedTileIterator<
+          cutlass::MatrixShape<MmaCore::Shape::kM, MmaCore::Shape::kK>,
+          float, LayoutA, 1, typename MmaCore::IteratorThreadMapA, kAlignmentA,
+          GatherA, PermuteALayout>;
+
+  // Define iterators over tiles from the B operand (float elements read from global memory)
+  using IteratorB =
+      cutlass::transform::threadblock::PredicatedTileIterator<
+          cutlass::MatrixShape<MmaCore::Shape::kK, MmaCore::Shape::kN>,
+          float, LayoutB, 0, typename MmaCore::IteratorThreadMapB, kAlignmentB,
+          GatherB, PermuteBLayout>;
+
+  // Define the threadblock-scoped pipelined matrix multiply (float accumulator, row-major layout)
+  using ThreadblockMma = cutlass::gemm::threadblock::MmaPipelined<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      IteratorB, typename MmaCore::SmemIteratorB, float,
+      layout::RowMajor, typename MmaCore::MmaPolicy>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for column-major-interleaved output (2 stages, no gather, no permute)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Number of Interleaved K
+    int InterleavedK>
+struct DefaultMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
+                  kAlignmentB, ElementAccumulator,
+                  layout::ColumnMajorInterleaved<InterleavedK>, OperatorClass,
+                  ArchTag, ThreadblockShape, WarpShape, InstructionShape, 2,
+                  Operator, true, SharedMemoryClearOption::kNone, false, false,
+                  layout::NoPermute, layout::NoPermute> {
+  // Define the MmaCore components (the trailing 'true' fills DefaultMmaCore's AccumulatorsInRowMajor argument)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator,
+      layout::ColumnMajorInterleaved<InterleavedK>, OperatorClass, 2, Operator,
+      true>;
+
+  static_assert(kAlignmentA == 128 / sizeof_bits<ElementA>::value, 
+    "Alignment must match thread data map's vector length");
+
+  static_assert(kAlignmentB ==128 / sizeof_bits<ElementB>::value,
+    "Alignment must match thread data map's vector length");
+
+  // Define iterators over tiles from the A operand (no alignment argument; the asserts above pin kAlignmentA to 128 bits of elements)
+  using IteratorA = cutlass::transform::threadblock::PredicatedTileIterator<
+      cutlass::MatrixShape<MmaCore::Shape::kM, MmaCore::Shape::kK>, ElementA,
+      LayoutA, 1, typename MmaCore::IteratorThreadMapA>;
+
+  // Define iterators over tiles from the B operand (no alignment argument; the asserts above pin kAlignmentB to 128 bits of elements)
+  using IteratorB = cutlass::transform::threadblock::PredicatedTileIterator<
+      cutlass::MatrixShape<MmaCore::Shape::kK, MmaCore::Shape::kN>, ElementB,
+      LayoutB, 0, typename MmaCore::IteratorThreadMapB>;
+
+  // Define the threadblock-scoped pipelined matrix multiply (accumulator layout is the interleaved output layout)
+  using ThreadblockMma = cutlass::gemm::threadblock::MmaPipelined<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      IteratorB, typename MmaCore::SmemIteratorB, ElementAccumulator,
+      layout::ColumnMajorInterleaved<InterleavedK>,
+      typename MmaCore::MmaPolicy>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for row-major output (OperatorClass Simt) with a multistage mainloop
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Layout type for C and D matrix operand
+    typename LayoutC,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the multistage mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Gather operand A by using an index array
+    bool GatherA,
+    /// Gather operand B by using an index array
+    bool GatherB,
+    /// Permute operand A
+    typename PermuteALayout,
+    /// Permute operand B
+    typename PermuteBLayout
+    >
+struct DefaultMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
+                  kAlignmentB, ElementAccumulator, LayoutC,
+                  arch::OpClassSimt, ArchTag, ThreadblockShape, WarpShape,
+                  InstructionShape, Stages, Operator, false, SharedMemoryClearOption::kNone,
+                  GatherA, GatherB, PermuteALayout, PermuteBLayout> {
+
+  static_assert(platform::is_same<LayoutC, layout::RowMajor>::value
+             || platform::is_same<LayoutC, layout::AffineRankN<2>>::value,
+             "simt epilogue must be row major");
+
+  // Define the MmaCore components (stage count is forwarded from the Stages parameter)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator, LayoutC, arch::OpClassSimt,
+      Stages, Operator>;
+
+  // Define iterators over tiles from the A operand (each access is an Array of kAlignmentA elements)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::Array<ElementA, kAlignmentA>;
+  using IteratorA =
+      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+          ElementA, LayoutA, 1, ThreadMapA, AccessTypeA, GatherA, PermuteALayout>;
+
+  // Define iterators over tiles from the B operand (each access is an Array of kAlignmentB elements)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::Array<ElementB, kAlignmentB>;
+  using IteratorB =
+      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+          ElementB, LayoutB, 0, ThreadMapB, AccessTypeB, GatherB, PermuteBLayout>;
+
+  // Define the threadblock-scoped multistage matrix multiply (cache operations come from MmaCore)
+  using ThreadblockMma = cutlass::gemm::threadblock::MmaMultistage<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
+      MmaCore::kCacheOpB, ElementAccumulator, LayoutC,
+      typename MmaCore::MmaPolicy, Stages>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for row-major output (OperatorClass TensorOp) with a multistage mainloop
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Layout type for C and D matrix operand
+    typename LayoutC,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the multistage mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Use zfill or predicate for out-of-bound cp.async
+    SharedMemoryClearOption SharedMemoryClear,
+    /// Gather operand A by using an index array
+    bool GatherA,
+    /// Gather operand B by using an index array
+    bool GatherB,
+    /// Permute operand A
+    typename PermuteALayout,
+    /// Permute operand B
+    typename PermuteBLayout
+    >
+struct DefaultMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
+                  kAlignmentB, ElementAccumulator, LayoutC,
+                  arch::OpClassTensorOp, ArchTag, ThreadblockShape, WarpShape,
+                  InstructionShape, Stages, Operator, false, SharedMemoryClear,
+                  GatherA, GatherB, PermuteALayout, PermuteBLayout> {
+
+  static_assert(platform::is_same<LayoutC, layout::RowMajor>::value
+             || platform::is_same<LayoutC, layout::AffineRankN<2>>::value,
+             "tensor op epilogue must be row major or affine rank-2");
+
+  static cutlass::arch::CacheOperation::Kind const CacheOpA =  // Global only when one access is exactly 128 bits
+      ((sizeof_bits<ElementA>::value * kAlignmentA) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  static cutlass::arch::CacheOperation::Kind const CacheOpB =  // Global only when one access is exactly 128 bits
+      ((sizeof_bits<ElementB>::value * kAlignmentB) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  // Define the MmaCore components (the 'false' fills AccumulatorsInRowMajor; cache operations chosen above)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator, LayoutC, arch::OpClassTensorOp,
+      Stages, Operator, false, CacheOpA, CacheOpB>;
+
+  // Define iterators over tiles from the A operand (each access is an Array of kAlignmentA elements)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::Array<ElementA, kAlignmentA>;
+  using IteratorA =
+      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+          ElementA, LayoutA, 1, ThreadMapA, AccessTypeA, GatherA, PermuteALayout>;
+
+  // Define iterators over tiles from the B operand (each access is an Array of kAlignmentB elements)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::Array<ElementB, kAlignmentB>;
+  using IteratorB =
+      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+          ElementB, LayoutB, 0, ThreadMapB, AccessTypeB, GatherB, PermuteBLayout>;
+
+  // Define the threadblock-scoped multistage matrix multiply (SharedMemoryClear is forwarded as the last argument)
+  using ThreadblockMma = cutlass::gemm::threadblock::MmaMultistage<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
+      MmaCore::kCacheOpB, ElementAccumulator, LayoutC,
+      typename MmaCore::MmaPolicy, Stages, SharedMemoryClear>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for column-major-interleaved output with a multistage mainloop (no gather, no permute)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the multistage mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Number of Interleaved K
+    int InterleavedK>
+struct DefaultMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
+                  kAlignmentB, ElementAccumulator,
+                  layout::ColumnMajorInterleaved<InterleavedK>, OperatorClass,
+                  ArchTag, ThreadblockShape, WarpShape, InstructionShape,
+                  Stages, Operator, true, SharedMemoryClearOption::kNone, 
+                  false, false, layout::NoPermute, layout::NoPermute> {
+  // Define the MmaCore components (the trailing 'true' fills DefaultMmaCore's AccumulatorsInRowMajor argument)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator,
+      layout::ColumnMajorInterleaved<InterleavedK>, OperatorClass, Stages,
+      Operator, true>;
+
+  // Define iterators over tiles from the A operand (each access is an Array of kAlignmentA elements)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::Array<ElementA, kAlignmentA>;
+  using IteratorA =
+      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+          ElementA, LayoutA, 1, ThreadMapA, AccessTypeA>;
+
+  // Define iterators over tiles from the B operand (each access is an Array of kAlignmentB elements)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::Array<ElementB, kAlignmentB>;
+  using IteratorB =
+      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+          ElementB, LayoutB, 0, ThreadMapB, AccessTypeB>;
+
+  // Define the threadblock-scoped multistage matrix multiply (note: layout::RowMajor is passed here, not the interleaved layout given to MmaCore)
+  using ThreadblockMma = cutlass::gemm::threadblock::MmaMultistage<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
+      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
+      typename MmaCore::MmaPolicy, Stages>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for SIMT IDP4A Kernels (int8_t A/B operands, InstructionShape GemmShape<1, 1, 4>, 2 stages, row-major output)
+template <
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape>
+struct DefaultMma<int8_t, LayoutA, kAlignmentA, int8_t, LayoutB, kAlignmentB,
+                  ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
+                  ArchTag, ThreadblockShape, WarpShape, GemmShape<1, 1, 4>, 2,
+                  Operator, false, SharedMemoryClearOption::kNone,
+                  false, false, layout::NoPermute, layout::NoPermute> {
+  using InstructionShape = GemmShape<1, 1, 4>;
+  using ElementA = int8_t;
+  using ElementB = int8_t;
+  using OperatorClass =  arch::OpClassSimt;
+
+  static const bool transposeA = platform::is_same< LayoutA, layout::ColumnMajor >::value;
+  static const bool transposeB = platform::is_same< LayoutB, layout::RowMajor >::value;
+
+  // Define the MmaCore components (element types, instruction shape and operator class are the fixed aliases above)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor,
+      OperatorClass, 2, Operator>;
+
+  // Define iterators over tiles from the A operand (2d-thread-tile iterator; transposeA is true when LayoutA is ColumnMajor)
+  using IteratorA =
+      cutlass::transform::threadblock::PredicatedTileIterator2dThreadTile<
+          cutlass::MatrixShape<MmaCore::Shape::kM, MmaCore::Shape::kK>,
+          ElementA, LayoutA, 1, typename MmaCore::IteratorThreadMapA, transposeA>;
+
+  // Define iterators over tiles from the B operand (2d-thread-tile iterator; transposeB is true when LayoutB is RowMajor)
+  using IteratorB =
+      cutlass::transform::threadblock::PredicatedTileIterator2dThreadTile<
+          cutlass::MatrixShape<MmaCore::Shape::kK, MmaCore::Shape::kN>,
+          ElementB, LayoutB, 0, typename MmaCore::IteratorThreadMapB, transposeB>;
+
+  // Define the threadblock-scoped pipelined matrix multiply (accumulates into a row-major layout)
+  using ThreadblockMma = cutlass::gemm::threadblock::MmaPipelined<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      IteratorB, typename MmaCore::SmemIteratorB, ElementAccumulator,
+      layout::RowMajor, typename MmaCore::MmaPolicy>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+/// Specialization for Wmma TensorOp operator with a 2-stage pipeline (no gather, no permute)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Layout type for C and D matrix operands
+    typename LayoutC,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Operation performed by GEMM
+    typename Operator>
+struct DefaultMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
+                  kAlignmentB, ElementAccumulator, LayoutC,
+                  arch::OpClassWmmaTensorOp, ArchTag, ThreadblockShape, WarpShape,
+                  InstructionShape, 2, Operator, false, SharedMemoryClearOption::kNone,
+                  false, false, layout::NoPermute, layout::NoPermute> {
+  // Define the MmaCore components (stage count is fixed at 2 for this specialization)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator, LayoutC,
+      arch::OpClassWmmaTensorOp, 2, Operator>;
+
+  // Define iterators over tiles from the A operand (kM x kK tile, access granularity kAlignmentA)
+  using IteratorA =
+      cutlass::transform::threadblock::PredicatedTileIterator<
+          cutlass::MatrixShape<MmaCore::Shape::kM, MmaCore::Shape::kK>,
+          ElementA, LayoutA, 1, typename MmaCore::IteratorThreadMapA, kAlignmentA>;
+
+  // Define iterators over tiles from the B operand (kK x kN tile, access granularity kAlignmentB)
+  using IteratorB =
+      cutlass::transform::threadblock::PredicatedTileIterator<
+          cutlass::MatrixShape<MmaCore::Shape::kK, MmaCore::Shape::kN>,
+          ElementB, LayoutB, 0, typename MmaCore::IteratorThreadMapB, kAlignmentB>;
+
+  // Define the threadblock-scoped pipelined matrix multiply (accumulator layout is LayoutC)
+  using ThreadblockMma = cutlass::gemm::threadblock::MmaPipelined<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      IteratorB, typename MmaCore::SmemIteratorB, ElementAccumulator,
+      LayoutC, typename MmaCore::MmaPolicy>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for Wmma TensorOp operator with a 1-stage (single-stage) pipeline (no gather, no permute)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Layout type for C and D matrix operands
+    typename LayoutC,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Operation performed by GEMM
+    typename Operator>
+struct DefaultMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
+                  kAlignmentB, ElementAccumulator, LayoutC,
+                  arch::OpClassWmmaTensorOp, ArchTag, ThreadblockShape, WarpShape,
+                  InstructionShape, 1, Operator, false, SharedMemoryClearOption::kNone,
+                  false, false, layout::NoPermute, layout::NoPermute> {
+  // Define the MmaCore components (stage count is fixed at 1 for this specialization)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator, LayoutC,
+      arch::OpClassWmmaTensorOp, 1, Operator>; 
+
+  // Define iterators over tiles from the A operand (kM x kK tile, access granularity kAlignmentA)
+  using IteratorA =
+      cutlass::transform::threadblock::PredicatedTileIterator<
+          cutlass::MatrixShape<MmaCore::Shape::kM, MmaCore::Shape::kK>,
+          ElementA, LayoutA, 1, typename MmaCore::IteratorThreadMapA, kAlignmentA>;
+
+  // Define iterators over tiles from the B operand (kK x kN tile, access granularity kAlignmentB)
+  using IteratorB =
+      cutlass::transform::threadblock::PredicatedTileIterator<
+          cutlass::MatrixShape<MmaCore::Shape::kK, MmaCore::Shape::kN>,
+          ElementB, LayoutB, 0, typename MmaCore::IteratorThreadMapB, kAlignmentB>;
+
+  // Define the threadblock-scoped singlestage matrix multiply (MmaSingleStage instead of MmaPipelined)
+  using ThreadblockMma = cutlass::gemm::threadblock::MmaSingleStage<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      IteratorB, typename MmaCore::SmemIteratorB, ElementAccumulator,
+      LayoutC, typename MmaCore::MmaPolicy>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+#endif //CUTLASS_ARCH_WMMA_ENABLED
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass 
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core.h
new file mode 100755
index 000000000..da83982f4
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core.h
@@ -0,0 +1,116 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines basic properties needed by CTA-level GEMMs assuming expectations about data
+      layout of the global memory fragments, data types, and internal tile sizes.
+
+      Partial specializations for threadblock::Mma operations targeting TensorOp instructions.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/gemm/warp/mma.h"
+#include "cutlass/gemm/threadblock/mma_pipelined.h"
+#include "cutlass/gemm/threadblock/mma_singlestage.h"
+#include "cutlass/arch/cache_operation.h" 
+#include "cutlass/arch/mma.h" 
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Template defining default matrix multiply operators inferred from threadblock tile size,
+/// global memory data layout, and target math instruction. Only declared in this header (no primary definition).
+template <
+    /// Shape of threadblock-scoped matrix multiply operator
+    typename Shape,
+    /// Shape of warp-level matrix multiply operator
+    typename WarpShape,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape,
+    /// Element data type of A operand
+    typename ElementA,
+    /// Layout of operand A
+    typename LayoutA,
+    /// Element data type of B operand
+    typename ElementB,
+    /// Layout of operand B
+    typename LayoutB,
+    /// Data type of accumulator
+    typename ElementC,
+    /// Layout of accumulator
+    typename LayoutC,
+    /// Indicates type of math operator (arch::OpClassSimt or arch::OpClassTensorOp)
+    typename OperatorClass,
+    /// Number of stages
+    int Stages = 2,
+    /// Operation performed by MMA (default: OpMultiplyAddSaturate for TensorOp with 8-/4-bit integer ElementA, else OpMultiplyAdd)
+    typename Operator = typename platform::conditional<
+        (platform::is_same<OperatorClass,
+                           cutlass::arch::OpClassTensorOp>::value) &&
+            (platform::is_same<ElementA, int8_t>::value ||
+             platform::is_same<ElementA, int4b_t>::value ||
+             platform::is_same<ElementA, uint8_t>::value ||
+             platform::is_same<ElementA, uint4b_t>::value),
+        cutlass::arch::OpMultiplyAddSaturate,
+        cutlass::arch::OpMultiplyAdd>::type,
+    /// Store the accumulators in row major or column major.  Row major is used
+    /// when output layout is interleaved.
+    bool AccumulatorsInRowMajor = false,
+    /// Cache operation of operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA =
+        cutlass::arch::CacheOperation::Global,
+    /// Cache operation of operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB =
+        cutlass::arch::CacheOperation::Global,
+    /// per-element transformation for elements of A
+    ComplexTransform TransformA = ComplexTransform::kNone,
+    /// per-element transformation for elements of B
+    ComplexTransform TransformB = ComplexTransform::kNone,
+    bool IsComplex = false // NOTE(review): presumably flags complex-valued operands; disabled default was (is_complex<ElementA>::value || is_complex<ElementB>::value)
+>
+struct DefaultMmaCore;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_simt.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_simt.h
new file mode 100755
index 000000000..91f4710ed
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_simt.h
@@ -0,0 +1,1723 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines basic properties needed by CTA-level GEMMs assuming expectations about data
+      layout of the global memory fragments, data types, and internal tile sizes.
+
+      Partial specializations for threadblock::Mma operations targeting simt instructions.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/fast_math.h"
+
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+
+
+#include "cutlass/transform/pitch_linear_thread_map.h"
+#include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h"
+#include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear_2dthreadtile.h"
+
+#include "cutlass/gemm/warp/mma_simt_policy.h"
+#include "cutlass/gemm/warp/mma_simt.h"
+#include "cutlass/gemm/threadblock/default_mma_core.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+namespace detail {
+
+// Picks the number of threads along M within a warp for a given warp tile:
+// 8 when WarpShape::kM > WarpShape::kN, otherwise 4. The goal is for each thread's
+// tile of elements to be as square as possible (4x4 will be faster than 2x8).
+template<typename WarpShape>
+constexpr int simt_get_warp_threads_m() {
+    return (WarpShape::kM > WarpShape::kN) ? 8 : 4;
+}
+
+/// Computes padding in shared memory to perform efficient transpose without bank conflicts.
+constexpr int simt_transpose_padding(int threads, int crosswise, int size_in_bits) {
+  return (size_in_bits >= 32 ?
+      threads / crosswise / (size_in_bits / 32) :  // >= 32-bit elements: divide by 32-bit units per element
+      threads / crosswise * (32 / size_in_bits)    // narrower elements: multiply by elements per 32 bits
+  );
+}
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization:
+///
+///   A: column-major
+///   B: row-major
+///   Operator: simt class
+///   Shared memory: A column-major, B row-major; no transposed thread map, zero skew
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Operation performed by GEMM
+    typename Operator_>
+struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<1, 1, 1>, ElementA_,
+                      layout::ColumnMajor, ElementB_, layout::RowMajor,
+                      ElementC_, LayoutC_, arch::OpClassSimt, 2, Operator_
+                     > {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = GemmShape<1, 1, 1>;
+  using ElementA = ElementA_;
+  using LayoutA = layout::ColumnMajor;
+  using ElementB = ElementB_;
+  using LayoutB = layout::RowMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using OperatorClass = arch::OpClassSimt;
+  static int const PartitionsK = Shape::kK / WarpShape::kK;
+
+  /// Default Operator
+  using Operator = Operator_;
+
+  /// Number of warps present
+  using WarpCount = GemmShape<
+    Shape::kM / WarpShape::kM,
+    Shape::kN / WarpShape::kN,
+    PartitionsK
+  >;
+
+  // Divisibility requirements
+  static_assert(
+    !(Shape::kM % WarpShape::kM) &&
+    !(Shape::kN % WarpShape::kN),
+    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
+  );
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassSimt>::value;
+
+  /// Number of threads total
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  static int const kElementsPerAccess = 1;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::ColumnMajor;
+  using SmemLayoutB = layout::RowMajor;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kM, Shape::kK>,
+    kThreads,
+    kElementsPerAccess
+  >;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileIterator<
+    MatrixShape<Shape::kM, Shape::kK>, 
+    ElementA, 
+    SmemLayoutA,
+    1,
+    IteratorThreadMapA
+  >;
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kN, Shape::kK>,
+    kThreads,
+    kElementsPerAccess
+  >;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileIterator<
+    MatrixShape<Shape::kK, Shape::kN>, 
+    ElementB, 
+    SmemLayoutB,
+    0,
+    IteratorThreadMapB
+  >;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level op
+  static const int WarpNumThreadsM = detail::simt_get_warp_threads_m<WarpShape>();
+  static const int WarpNumThreadsN = kWarpSize / WarpNumThreadsM;
+  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
+  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
+  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
+      "WarpShape must be divisible by ThreadTile shape.");
+  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
+  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;  // elements of A per 128 bits
+  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;  // elements of B per 128 bits
+  static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM);
+  static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN);
+  // LaneM and LaneN are already clamped to the thread tile extents above
+  using LaneMmaShape = cutlass::gemm::GemmShape<
+      LaneM,
+      LaneN,
+      1>;
+  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
+      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
+      cutlass::layout::RowMajorInterleaved<LaneLayout>,         // LaneLayout
+      LaneMmaShape
+  >;
+
+  using MmaWarpSimt = cutlass::gemm::warp::MmaSimt<
+    WarpShape,    /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8
+    ElementA,     /// Data type of A elements
+    SmemLayoutA,  /// Layout of A matrix (concept: MatrixLayout)
+    ElementB,     /// Data type of B elements
+    SmemLayoutB,  /// Layout of B matrix (concept: MatrixLayout)
+    ElementC,     /// Element type of C matrix
+    LayoutC,      /// Layout of C matrix (concept: MatrixLayout)
+    Policy        /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy)
+    >;            /// Used for partial specialization
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy = MmaPolicy<
+    MmaWarpSimt,
+    MatrixShape<0, 0>,
+    MatrixShape<0, 0>,
+    WarpCount::kK
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization:
+///
+///   A: row-major
+///   B: column-major
+///   Operator: simt class
+///   Shared memory: A column-major, B row-major; both written via transposed thread maps, both skewed
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Operation performed by GEMM
+    typename Operator_>
+struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<1, 1, 1>, ElementA_,
+                      layout::RowMajor, ElementB_, layout::ColumnMajor,
+                      ElementC_, LayoutC_, arch::OpClassSimt, 2, Operator_
+                     > {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = GemmShape<1, 1, 1>;
+  using ElementA = ElementA_;
+  using LayoutA = layout::RowMajor;
+  using ElementB = ElementB_;
+  using LayoutB = layout::ColumnMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using OperatorClass = arch::OpClassSimt;
+  static int const PartitionsK = Shape::kK / WarpShape::kK;
+
+  /// Default Operator
+  using Operator = Operator_;
+
+  /// Number of warps present
+  using WarpCount = GemmShape<
+    Shape::kM / WarpShape::kM,
+    Shape::kN / WarpShape::kN,
+    PartitionsK
+  >;
+
+  // Divisibility requirements
+  static_assert(
+    !(Shape::kM % WarpShape::kM) &&
+    !(Shape::kN % WarpShape::kN),
+    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
+  );
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassSimt>::value;
+
+  /// Number of threads total
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+  
+  static int const kElementsPerAccess = 1;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::ColumnMajor;
+  using SmemLayoutB = layout::RowMajor;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kK, Shape::kM>,
+    kThreads,
+    kElementsPerAccess
+  >;
+
+  /// Transpose the ThreadMap of iterator A
+  using SmemThreadMapA = transform::TransposePitchLinearThreadMapSimt<IteratorThreadMapA>;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileIterator<
+    MatrixShape<Shape::kM, Shape::kK>, 
+    ElementA, 
+    SmemLayoutA,
+    1,
+    SmemThreadMapA // was IteratorThreadMapA
+  >;
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kK, Shape::kN>,
+    kThreads,
+    kElementsPerAccess
+  >;
+
+  /// Transpose the ThreadMap of iterator B
+  using SmemThreadMapB = transform::TransposePitchLinearThreadMapSimt<IteratorThreadMapB>;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileIterator<
+    MatrixShape<Shape::kK, Shape::kN>, 
+    ElementB, 
+    SmemLayoutB,
+    0,
+    SmemThreadMapB // was IteratorThreadMapB
+  >;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level op
+  static const int WarpNumThreadsM = detail::simt_get_warp_threads_m<WarpShape>();
+  static const int WarpNumThreadsN = kWarpSize / WarpNumThreadsM;
+  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
+  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
+  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
+      "WarpShape must be divisible by ThreadTile shape.");
+  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
+  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;  // elements of A per 128 bits
+  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;  // elements of B per 128 bits
+  static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM);
+  static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN);
+
+  static int const kPaddingM = detail::simt_transpose_padding(kWarpSize, Shape::kK, sizeof_bits<ElementA>::value);
+  static int const kPaddingN = detail::simt_transpose_padding(kWarpSize, Shape::kK, sizeof_bits<ElementB>::value);
+
+  static_assert(!(kPaddingM % LaneM) && !(kPaddingN % LaneN),
+                "Padding must be divisible by Lane");
+
+  // LaneM and LaneN are already clamped to the thread tile extents above
+  using LaneMmaShape = cutlass::gemm::GemmShape<
+      LaneM,
+      LaneN,
+      1>;
+  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
+      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
+      cutlass::layout::RowMajorInterleaved<LaneLayout>,         // LaneLayout
+      LaneMmaShape
+  >;
+
+  using MmaWarpSimt = cutlass::gemm::warp::MmaSimt<
+      WarpShape,      /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8
+      ElementA,       /// Data type of A elements
+      SmemLayoutA,    /// Layout of A matrix (concept: MatrixLayout)
+      ElementB,       /// Data type of B elements
+      SmemLayoutB,    /// Layout of B matrix (concept: MatrixLayout)
+      ElementC,       /// Element type of C matrix
+      LayoutC,        /// Layout of C matrix (concept: MatrixLayout)
+      Policy          /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy)
+  >;
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy = MmaPolicy<
+    MmaWarpSimt,
+    MatrixShape<kPaddingM, 0>,    // skew for A matrix to avoid SMEM bank conflicts
+    MatrixShape<0, kPaddingN>,    // skew for B matrix to avoid SMEM bank conflicts
+    WarpCount::kK
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization:
+///
+///   A: row-major
+///   B: row-major
+///   Operator: simt class
+///   Shared memory: A column-major (written via a transposed thread map, skewed), B row-major
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Operation performed by GEMM
+    typename Operator_>
+struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<1, 1, 1>, ElementA_,
+                      layout::RowMajor, ElementB_, layout::RowMajor, ElementC_,
+                      LayoutC_, arch::OpClassSimt, 2, Operator_
+                     > {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = GemmShape<1, 1, 1>;
+  using ElementA = ElementA_;
+  using LayoutA = layout::RowMajor;
+  using ElementB = ElementB_;
+  using LayoutB = layout::RowMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using OperatorClass = arch::OpClassSimt;
+  static int const PartitionsK = Shape::kK / WarpShape::kK;
+
+  /// Default Operator
+  using Operator = Operator_;
+
+  /// Number of warps present
+  using WarpCount = GemmShape<
+    Shape::kM / WarpShape::kM,
+    Shape::kN / WarpShape::kN,
+    PartitionsK
+  >;
+
+  // Divisibility requirements
+  static_assert(
+    !(Shape::kM % WarpShape::kM) &&
+    !(Shape::kN % WarpShape::kN),
+    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
+  );
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassSimt>::value;
+
+  /// Number of threads total
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  static int const kElementsPerAccess = 1;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::ColumnMajor;
+  using SmemLayoutB = layout::RowMajor;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kK, Shape::kM>,
+    kThreads,
+    kElementsPerAccess
+  >;
+
+  /// Transpose the ThreadMap of iterator A
+  using SmemThreadMapA = transform::TransposePitchLinearThreadMapSimt<IteratorThreadMapA>;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileIterator<
+    MatrixShape<Shape::kM, Shape::kK>, 
+    ElementA, 
+    SmemLayoutA,
+    1,
+    SmemThreadMapA
+  >;
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kN, Shape::kK>,
+    kThreads,
+    kElementsPerAccess
+  >;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileIterator<
+    MatrixShape<Shape::kK, Shape::kN>, 
+    ElementB, 
+    SmemLayoutB,
+    0,
+    IteratorThreadMapB
+  >;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level op
+  static const int WarpNumThreadsM = detail::simt_get_warp_threads_m<WarpShape>();
+  static const int WarpNumThreadsN = kWarpSize / WarpNumThreadsM;
+  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
+  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
+  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
+      "WarpShape must be divisible by ThreadTile shape.");
+  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
+  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;  // elements of A per 128 bits
+  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;  // elements of B per 128 bits
+  static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM);
+  static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN);
+
+  static int const kPaddingM = detail::simt_transpose_padding(kWarpSize, Shape::kK, sizeof_bits<ElementA>::value);
+
+  static_assert(!(kPaddingM % LaneM),
+                "Padding must be divisible by Lane");
+
+  // LaneM and LaneN are already clamped to the thread tile extents above
+  using LaneMmaShape = cutlass::gemm::GemmShape<
+      LaneM,
+      LaneN,
+      1>;
+  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
+      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
+      cutlass::layout::RowMajorInterleaved<LaneLayout>,         // LaneLayout
+      LaneMmaShape
+  >;
+
+  using MmaWarpSimt = cutlass::gemm::warp::MmaSimt<
+      WarpShape,    /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8
+      ElementA,     /// Data type of A elements
+      SmemLayoutA,  /// Layout of A matrix (concept: MatrixLayout)
+      ElementB,     /// Data type of B elements
+      SmemLayoutB,  /// Layout of B matrix (concept: MatrixLayout)
+      ElementC,     /// Element type of C matrix
+      LayoutC,      /// Layout of C matrix (concept: MatrixLayout)
+      Policy        /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy)
+  >;
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy = MmaPolicy<
+    MmaWarpSimt,
+    MatrixShape<kPaddingM, 0>,    // skew for A matrix to avoid SMEM bank conflicts
+    MatrixShape<0, 0>,
+    WarpCount::kK
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization:
+///
+///   A: column-major
+///   B: column-major
+///   Operator: simt class
+///   Shared memory: A column-major, B row-major (written via a transposed thread map, skewed)
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Operation performed by GEMM
+    typename Operator_>
+struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<1, 1, 1>, ElementA_,
+                      layout::ColumnMajor, ElementB_, layout::ColumnMajor,
+                      ElementC_, LayoutC_, arch::OpClassSimt, 2, Operator_
+                     > {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = GemmShape<1, 1, 1>;
+  using ElementA = ElementA_;
+  using LayoutA = layout::ColumnMajor;
+  using ElementB = ElementB_;
+  using LayoutB = layout::ColumnMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using OperatorClass = arch::OpClassSimt;
+  static int const PartitionsK = Shape::kK / WarpShape::kK;
+
+  /// Default Operator
+  using Operator = Operator_;
+
+  /// Number of warps present
+  using WarpCount = GemmShape<
+    Shape::kM / WarpShape::kM,
+    Shape::kN / WarpShape::kN,
+    PartitionsK
+  >;
+
+  // Divisibility requirements
+  static_assert(
+    !(Shape::kM % WarpShape::kM) &&
+    !(Shape::kN % WarpShape::kN),
+    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
+  );
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassSimt>::value;
+
+  /// Number of threads total
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  static int const kElementsPerAccess = 1;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::ColumnMajor;
+  using SmemLayoutB = layout::RowMajor;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kM, Shape::kK>,
+    kThreads,
+    kElementsPerAccess
+  >;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileIterator<
+    MatrixShape<Shape::kM, Shape::kK>, 
+    ElementA,
+    SmemLayoutA,
+    1,
+    IteratorThreadMapA
+  >;
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB =  transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kK, Shape::kN>,
+    kThreads,
+    kElementsPerAccess
+  >;
+
+  /// Transpose the ThreadMap of iterator B
+  using SmemThreadMapB = transform::TransposePitchLinearThreadMapSimt<IteratorThreadMapB>;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileIterator<
+    MatrixShape<Shape::kK, Shape::kN>, 
+    ElementB,
+    SmemLayoutB,
+    0,
+    SmemThreadMapB
+  >;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level op
+  static const int WarpNumThreadsM = detail::simt_get_warp_threads_m<WarpShape>();
+  static const int WarpNumThreadsN = kWarpSize / WarpNumThreadsM;
+  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
+  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
+  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
+      "WarpShape must be divisible by ThreadTile shape.");
+  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
+  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;  // elements of A per 128 bits
+  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;  // elements of B per 128 bits
+  static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM);
+  static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN);
+
+  static int const kPaddingN = detail::simt_transpose_padding(kWarpSize, Shape::kK, sizeof_bits<ElementB>::value);
+
+  static_assert(!(kPaddingN % LaneN),
+                "Padding must be divisible by Lane");
+
+  // LaneM and LaneN are already clamped to the thread tile extents above
+  using LaneMmaShape = cutlass::gemm::GemmShape<
+      LaneM,
+      LaneN,
+      1>;
+  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
+      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
+      cutlass::layout::RowMajorInterleaved<LaneLayout>,         // LaneLayout
+      LaneMmaShape
+  >;
+
+  using MmaWarpSimt = cutlass::gemm::warp::MmaSimt<
+      WarpShape,    /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8
+      ElementA,     /// Data type of A elements
+      SmemLayoutA,  /// Layout of A matrix (concept: MatrixLayout)
+      ElementB,     /// Data type of B elements
+      SmemLayoutB,  /// Layout of B matrix (concept: MatrixLayout)
+      ElementC,     /// Element type of C matrix
+      LayoutC,      /// Layout of C matrix (concept: MatrixLayout)
+      Policy        /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy)
+  >;
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy = MmaPolicy<
+    MmaWarpSimt,
+    MatrixShape<0, 0>,
+    MatrixShape<0, kPaddingN>, // skew for B matrix to avoid SMEM bank conflicts
+    WarpCount::kK
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization:
+///
+///   A: affine rank-2 column-major
+///   B: affine rank-2 row-major
+///   Operator: simt class
+///
+/// Takes its shared memory layouts, thread maps, iterators and MmaPolicy from the ColumnMajor/RowMajor simt specialization (Base)
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Operation performed by GEMM
+    typename Operator_>
+struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<1, 1, 1>, ElementA_,
+                      layout::AffineRank2ColumnMajor, ElementB_, layout::AffineRank2RowMajor,
+                      ElementC_, LayoutC_, arch::OpClassSimt, 2, Operator_
+                     > {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = GemmShape<1, 1, 1>;
+  using ElementA = ElementA_;
+  using LayoutA = layout::AffineRank2ColumnMajor;
+  using ElementB = ElementB_;
+  using LayoutB = layout::AffineRank2RowMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using OperatorClass = arch::OpClassSimt;
+
+  /// Default Operator
+  using Operator = Operator_;
+
+  using Base = DefaultMmaCore<Shape,
+                              WarpShape,
+                              InstructionShape,
+                              ElementA,
+                              layout::ColumnMajor,
+                              ElementB,
+                              layout::RowMajor,
+                              ElementC,
+                              LayoutC,
+                              OperatorClass,
+                              2,
+                              Operator>;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = typename Base::SmemLayoutA;
+  using SmemLayoutB = typename Base::SmemLayoutB;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = typename Base::IteratorThreadMapA;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = typename Base::SmemIteratorA;
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = typename Base::IteratorThreadMapB;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = typename Base::SmemIteratorB;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy = typename Base::MmaPolicy;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization:
+///
+///   A: affine rank-2 row-major
+///   B: affine rank-2 column-major
+///   Operator: simt class
+///
+/// Takes its shared memory layouts, thread maps, iterators and MmaPolicy from the RowMajor/ColumnMajor simt specialization (Base)
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Operation performed by GEMM
+    typename Operator_>
+struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<1, 1, 1>, ElementA_,
+                      layout::AffineRank2RowMajor, ElementB_, layout::AffineRank2ColumnMajor,
+                      ElementC_, LayoutC_, arch::OpClassSimt, 2, Operator_
+                     > {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = GemmShape<1, 1, 1>;
+  using ElementA = ElementA_;
+  using LayoutA = layout::AffineRank2RowMajor;
+  using ElementB = ElementB_;
+  using LayoutB = layout::AffineRank2ColumnMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using OperatorClass = arch::OpClassSimt;
+
+  /// Default Operator
+  using Operator = Operator_;
+
+  using Base = DefaultMmaCore<Shape,
+                              WarpShape,
+                              InstructionShape,
+                              ElementA,
+                              layout::RowMajor,
+                              ElementB,
+                              layout::ColumnMajor,
+                              ElementC,
+                              LayoutC,
+                              OperatorClass,
+                              2,
+                              Operator>;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = typename Base::SmemLayoutA;
+  using SmemLayoutB = typename Base::SmemLayoutB;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = typename Base::IteratorThreadMapA;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = typename Base::SmemIteratorA;
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = typename Base::IteratorThreadMapB;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = typename Base::SmemIteratorB;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy = typename Base::MmaPolicy;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization:
+///
+///   A: affine rank-2 row-major
+///   B: affine rank-2 row-major
+///   Operator: simt class
+///
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Operation performed by GEMM
+    typename Operator_>
+struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<1, 1, 1>, ElementA_,
+                      layout::AffineRank2RowMajor, ElementB_, layout::AffineRank2RowMajor, ElementC_,
+                      LayoutC_, arch::OpClassSimt, 2, Operator_
+                     > {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = GemmShape<1, 1, 1>;
+  using ElementA = ElementA_;
+  using LayoutA = layout::AffineRank2RowMajor;
+  using ElementB = ElementB_;
+  using LayoutB = layout::AffineRank2RowMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using OperatorClass = arch::OpClassSimt;
+
+  /// Default Operator
+  using Operator = Operator_;
+
+  using Base = DefaultMmaCore<Shape,
+                              WarpShape,
+                              InstructionShape,
+                              ElementA,
+                              layout::RowMajor,
+                              ElementB,
+                              layout::RowMajor,
+                              ElementC,
+                              LayoutC,
+                              OperatorClass,
+                              2,
+                              Operator>;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = typename Base::SmemLayoutA;
+  using SmemLayoutB = typename Base::SmemLayoutB;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = typename Base::IteratorThreadMapA;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = typename Base::SmemIteratorA;
+
+  /// Policy of iterator B
+  using IteratorThreadMapB = typename Base::IteratorThreadMapB;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = typename Base::SmemIteratorB;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy = typename Base::MmaPolicy;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization:
+///
+///   A: affine rank-2 column-major (layout::AffineRank2ColumnMajor)
+///   B: affine rank-2 column-major (layout::AffineRank2ColumnMajor)
+///   Operator: simt class
+///
+/// Reuses the ColumnMajor x ColumnMajor SIMT core (see Base) for all layouts, iterators and policy
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Operation performed by GEMM
+    typename Operator_>
+struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<1, 1, 1>, ElementA_,
+                      layout::AffineRank2ColumnMajor, ElementB_, layout::AffineRank2ColumnMajor,
+                      ElementC_, LayoutC_, arch::OpClassSimt, 2, Operator_
+                     > {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = GemmShape<1, 1, 1>;
+  using ElementA = ElementA_;
+  using LayoutA = layout::AffineRank2ColumnMajor;
+  using ElementB = ElementB_;
+  using LayoutB = layout::AffineRank2ColumnMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using OperatorClass = arch::OpClassSimt;
+
+  /// Operation performed by GEMM (forwarded unchanged from Operator_)
+  using Operator = Operator_;
+
+  using Base = DefaultMmaCore<Shape,
+                              WarpShape,
+                              InstructionShape,
+                              ElementA,
+                              layout::ColumnMajor,  // stands in for LayoutA (AffineRank2ColumnMajor)
+                              ElementB,
+                              layout::ColumnMajor,  // stands in for LayoutB (AffineRank2ColumnMajor)
+                              ElementC,
+                              LayoutC,
+                              OperatorClass,
+                              2,
+                              Operator>;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = typename Base::SmemLayoutA;
+  using SmemLayoutB = typename Base::SmemLayoutB;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = typename Base::IteratorThreadMapA;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = typename Base::SmemIteratorA;
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = typename Base::IteratorThreadMapB;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = typename Base::SmemIteratorB;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy = typename Base::MmaPolicy;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization:
+///
+///   A: column-major (int8_t)
+///   B: row-major (int8_t)
+///   Operator: simt class, for dp4a
+///   Shared-memory store: neither operand's thread map is transposed
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Operation performed by GEMM
+    typename Operator_>
+struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<1, 1, 4>, int8_t,
+                      layout::ColumnMajor, int8_t, layout::RowMajor, ElementC_,
+                      LayoutC_, arch::OpClassSimt, 2, Operator_
+                    > {
+
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = GemmShape<1, 1, 4>;
+  using ElementA = int8_t;
+  using LayoutA = layout::ColumnMajor;
+  using ElementB = int8_t;
+  using LayoutB = layout::RowMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using OperatorClass = arch::OpClassSimt;
+  static int const PartitionsK = Shape::kK / WarpShape::kK;  // warp-level partitions of the threadblock K extent
+
+  /// Operation performed by GEMM (forwarded unchanged from Operator_)
+  using Operator = Operator_;
+
+  /// Number of warps present
+  using WarpCount = GemmShape<
+    Shape::kM / WarpShape::kM,
+    Shape::kN / WarpShape::kN,
+    PartitionsK
+  >;
+
+  // Divisibility requirements (only the M and N extents are checked here)
+  static_assert(
+    !(Shape::kM % WarpShape::kM) &&
+    !(Shape::kN % WarpShape::kN),
+    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
+  );
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassSimt>::value;
+
+  /// Number of threads total
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::ColumnMajorInterleaved<4>;
+  using SmemLayoutB = layout::RowMajorInterleaved<4>;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = transform::PitchLinear2DThreadTileStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kM, Shape::kK>,
+    kThreads,
+    layout::PitchLinearShape<4, 4>
+  >;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileIterator2dThreadTile<
+    MatrixShape<Shape::kM, Shape::kK>, 
+    ElementA, 
+    SmemLayoutA,
+    1,
+    IteratorThreadMapA
+  >;
+  
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = transform::PitchLinear2DThreadTileStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kN, Shape::kK>,
+    kThreads,
+    layout::PitchLinearShape<4, 4>
+  >;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileIterator2dThreadTile<
+    MatrixShape<Shape::kK, Shape::kN>, 
+    ElementB, 
+    SmemLayoutB,
+    0,
+    IteratorThreadMapB
+  >;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level op
+  static const int WarpNumThreadsM = detail::simt_get_warp_threads_m<WarpShape>();
+  static const int WarpNumThreadsN = kWarpSize / WarpNumThreadsM;
+  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
+  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
+  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
+      "WarpShape must be divisible by ThreadTile shape.");
+  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;  // 2 only if both thread-tile dims exceed 4
+  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;  // A elements per 128 bits (not referenced below)
+  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;  // B elements per 128 bits (not referenced below)
+  static const int LaneM = cutlass::const_min(4, ThreadTileM);
+  static const int LaneN = cutlass::const_min(4, ThreadTileN);
+  // Lane MMA shape: M and N are min(4, thread tile); K is fixed at 4
+  using LaneMmaShape = cutlass::gemm::GemmShape<
+      LaneM,
+      LaneN,
+      4>;
+
+  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
+      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
+      cutlass::layout::ColumnMajorInterleaved<LaneLayout>,         // LaneLayout
+      LaneMmaShape
+  >;
+
+  using MmaWarpSimt = cutlass::gemm::warp::MmaSimt<
+    WarpShape,    /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8
+    ElementA,     /// Data type of A elements
+    SmemLayoutA,  /// Layout of A matrix (concept: MatrixLayout)
+    ElementB,     /// Data type of B elements
+    SmemLayoutB,  /// Layout of B matrix (concept: MatrixLayout)
+    ElementC,     /// Element type of C matrix
+    LayoutC,      /// Layout of C matrix (concept: MatrixLayout)
+    Policy,       /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy)
+    PartitionsK   /// Number of partitions along K dimension
+    >;
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy = MmaPolicy<
+    MmaWarpSimt,
+    MatrixShape<0, 0>,
+    MatrixShape<0, 0>,
+    WarpCount::kK
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Partial specialization:
+//
+///
+///   A: Row-major (int8_t)
+///   B: Column-major (int8_t)
+///   Operator: simt class, for dp4a
+///   Shared-memory store: the thread maps of both A and B are transposed
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Operation performed by GEMM
+    typename Operator_>
+struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<1, 1, 4>, int8_t,
+                      layout::RowMajor, int8_t, layout::ColumnMajor, ElementC_,
+                      LayoutC_, arch::OpClassSimt, 2, Operator_
+                      > {
+
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = GemmShape<1, 1, 4>;
+  using ElementA = int8_t;
+  using LayoutA = layout::RowMajor;
+  using ElementB = int8_t;
+  using LayoutB = layout::ColumnMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using OperatorClass = arch::OpClassSimt;
+  static int const PartitionsK = Shape::kK / WarpShape::kK;  // warp-level partitions of the threadblock K extent
+
+  /// Operation performed by GEMM (forwarded unchanged from Operator_)
+  using Operator = Operator_;
+
+  /// Number of warps present
+  using WarpCount = GemmShape<
+    Shape::kM / WarpShape::kM,
+    Shape::kN / WarpShape::kN,
+    PartitionsK
+  >;
+
+  // Divisibility requirements (only the M and N extents are checked here)
+  static_assert(
+    !(Shape::kM % WarpShape::kM) &&
+    !(Shape::kN % WarpShape::kN),
+    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
+  );
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassSimt>::value;
+
+  /// Number of threads total
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::ColumnMajorInterleaved<4>;
+  using SmemLayoutB = layout::RowMajorInterleaved<4>;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = transform::PitchLinear2DThreadTileStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kK, Shape::kM>,
+    kThreads,
+    layout::PitchLinearShape<4, 4>
+  >;
+
+  /// Transpose the ThreadMap of iterator A
+  using SmemThreadMapA = transform::TransposePitchLinearThreadMap2DThreadTile<IteratorThreadMapA>;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileIterator2dThreadTile<
+    MatrixShape<Shape::kM, Shape::kK>, 
+    ElementA, 
+    SmemLayoutA,
+    1,
+    SmemThreadMapA
+  >;
+  
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = transform::PitchLinear2DThreadTileStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kK, Shape::kN>,
+    kThreads,
+    layout::PitchLinearShape<4, 4>
+  >;
+
+  /// Transpose the ThreadMap of iterator B
+  using SmemThreadMapB = transform::TransposePitchLinearThreadMap2DThreadTile<IteratorThreadMapB>;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileIterator2dThreadTile<
+    MatrixShape<Shape::kK, Shape::kN>, 
+    ElementB, 
+    SmemLayoutB,
+    0,
+    SmemThreadMapB
+  >;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level op
+  static const int WarpNumThreadsM = detail::simt_get_warp_threads_m<WarpShape>();
+  static const int WarpNumThreadsN = kWarpSize / WarpNumThreadsM;
+  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
+  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
+  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
+      "WarpShape must be divisible by ThreadTile shape.");
+  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;  // 2 only if both thread-tile dims exceed 4
+  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;  // A elements per 128 bits (not referenced below)
+  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;  // B elements per 128 bits (not referenced below)
+  static const int LaneM = cutlass::const_min(4, ThreadTileM);
+  static const int LaneN = cutlass::const_min(4, ThreadTileN);
+  // Lane MMA shape: M and N are min(4, thread tile); K is fixed at 4
+  using LaneMmaShape = cutlass::gemm::GemmShape<
+      LaneM,
+      LaneN,
+      4>;
+
+  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
+      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
+      cutlass::layout::ColumnMajorInterleaved<LaneLayout>,         // LaneLayout
+      LaneMmaShape
+  >;
+
+  using MmaWarpSimt = cutlass::gemm::warp::MmaSimt<
+    WarpShape,    /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8
+    ElementA,     /// Data type of A elements
+    SmemLayoutA,  /// Layout of A matrix (concept: MatrixLayout)
+    ElementB,     /// Data type of B elements
+    SmemLayoutB,  /// Layout of B matrix (concept: MatrixLayout)
+    ElementC,     /// Element type of C matrix
+    LayoutC,      /// Layout of C matrix (concept: MatrixLayout)
+    Policy,       /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy)
+    PartitionsK   /// Number of partitions along K dimension
+    >;
+
+  static int const kPaddingM = detail::simt_transpose_padding(kWarpSize, Shape::kK, sizeof_bits<ElementA>::value);
+  static int const kPaddingN = detail::simt_transpose_padding(kWarpSize, Shape::kK, sizeof_bits<ElementB>::value);
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy = MmaPolicy<
+    MmaWarpSimt,
+    MatrixShape<kPaddingM, 0>,
+    MatrixShape<0, kPaddingN>,
+    WarpCount::kK
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Partial specialization:
+//
+///
+///   A: Row-major (int8_t)
+///   B: Row-major (int8_t)
+///   Operator: simt class, for dp4a
+///   Shared-memory store: only A's thread map is transposed
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Operation performed by GEMM
+    typename Operator_>
+struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<1, 1, 4>, int8_t,
+                      layout::RowMajor, int8_t, layout::RowMajor, ElementC_,
+                      LayoutC_, arch::OpClassSimt, 2, Operator_
+                      > {
+
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = GemmShape<1, 1, 4>;
+  using ElementA = int8_t;
+  using LayoutA = layout::RowMajor;
+  using ElementB = int8_t;
+  using LayoutB = layout::RowMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using OperatorClass = arch::OpClassSimt;
+  static int const PartitionsK = Shape::kK / WarpShape::kK;  // warp-level partitions of the threadblock K extent
+
+  /// Operation performed by GEMM (forwarded unchanged from Operator_)
+  using Operator = Operator_;
+
+  /// Number of warps present
+  using WarpCount = GemmShape<
+    Shape::kM / WarpShape::kM,
+    Shape::kN / WarpShape::kN,
+    PartitionsK
+  >;
+
+  // Divisibility requirements (only the M and N extents are checked here)
+  static_assert(
+    !(Shape::kM % WarpShape::kM) &&
+    !(Shape::kN % WarpShape::kN),
+    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
+  );
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassSimt>::value;
+
+  /// Number of threads total
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::ColumnMajorInterleaved<4>;
+  using SmemLayoutB = layout::RowMajorInterleaved<4>;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = transform::PitchLinear2DThreadTileStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kK, Shape::kM>,
+    kThreads,
+    layout::PitchLinearShape<4, 4>
+  >;
+
+  /// Transpose the ThreadMap of iterator A
+  using SmemThreadMapA = transform::TransposePitchLinearThreadMap2DThreadTile<IteratorThreadMapA>;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileIterator2dThreadTile<
+    MatrixShape<Shape::kM, Shape::kK>, 
+    ElementA, 
+    SmemLayoutA,
+    1,
+    SmemThreadMapA
+  >;
+  
+  /// ThreadMap of iterator B (used as-is by SmemIteratorB, no transpose)
+  using IteratorThreadMapB = transform::PitchLinear2DThreadTileStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kN, Shape::kK>,
+    kThreads,
+    layout::PitchLinearShape<4, 4>
+  >;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileIterator2dThreadTile<
+    MatrixShape<Shape::kK, Shape::kN>, 
+    ElementB, 
+    SmemLayoutB,
+    0,
+    IteratorThreadMapB
+  >;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level op
+  static const int WarpNumThreadsM = detail::simt_get_warp_threads_m<WarpShape>();
+  static const int WarpNumThreadsN = kWarpSize / WarpNumThreadsM;
+  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
+  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
+  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
+      "WarpShape must be divisible by ThreadTile shape.");
+  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;  // 2 only if both thread-tile dims exceed 4
+  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;  // A elements per 128 bits (not referenced below)
+  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;  // B elements per 128 bits (not referenced below)
+  static const int LaneM = cutlass::const_min(4, ThreadTileM);
+  static const int LaneN = cutlass::const_min(4, ThreadTileN);
+  // Lane MMA shape: M and N are min(4, thread tile); K is fixed at 4
+  using LaneMmaShape = cutlass::gemm::GemmShape<
+      LaneM,
+      LaneN,
+      4>;
+
+  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
+      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
+      cutlass::layout::ColumnMajorInterleaved<LaneLayout>,         // LaneLayout
+      LaneMmaShape
+  >;
+
+  using MmaWarpSimt = cutlass::gemm::warp::MmaSimt<
+    WarpShape,    /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8
+    ElementA,     /// Data type of A elements
+    SmemLayoutA,  /// Layout of A matrix (concept: MatrixLayout)
+    ElementB,     /// Data type of B elements
+    SmemLayoutB,  /// Layout of B matrix (concept: MatrixLayout)
+    ElementC,     /// Element type of C matrix
+    LayoutC,      /// Layout of C matrix (concept: MatrixLayout)
+    Policy,       /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy)
+    PartitionsK   /// Number of partitions along K dimension
+    >;
+
+  static int const kPaddingM = detail::simt_transpose_padding(kWarpSize, Shape::kK, sizeof_bits<ElementA>::value);
+  static int const kPaddingN = detail::simt_transpose_padding(kWarpSize, Shape::kK, sizeof_bits<ElementB>::value);  // not passed to MmaPolicy below
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy = MmaPolicy<
+    MmaWarpSimt,
+    MatrixShape<kPaddingM, 0>,
+    MatrixShape<0, 0>,
+    WarpCount::kK
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Partial specialization:
+//
+///
+///   A: Column-major (int8_t)
+///   B: Column-major (int8_t)
+///   Operator: simt class, for dp4a
+///   Shared-memory store: only B's thread map is transposed
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Operation performed by GEMM
+    typename Operator_>
+struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<1, 1, 4>, int8_t,
+                      layout::ColumnMajor, int8_t, layout::ColumnMajor, ElementC_,
+                      LayoutC_, arch::OpClassSimt, 2, Operator_
+                      > {
+
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = GemmShape<1, 1, 4>;
+  using ElementA = int8_t;
+  using LayoutA = layout::ColumnMajor;
+  using ElementB = int8_t;
+  using LayoutB = layout::ColumnMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using OperatorClass = arch::OpClassSimt;
+  static int const PartitionsK = Shape::kK / WarpShape::kK;  // warp-level partitions of the threadblock K extent
+
+  /// Operation performed by GEMM (forwarded unchanged from Operator_)
+  using Operator = Operator_;
+
+  /// Number of warps present
+  using WarpCount = GemmShape<
+    Shape::kM / WarpShape::kM,
+    Shape::kN / WarpShape::kN,
+    PartitionsK
+  >;
+
+  // Divisibility requirements (only the M and N extents are checked here)
+  static_assert(
+    !(Shape::kM % WarpShape::kM) &&
+    !(Shape::kN % WarpShape::kN),
+    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
+  );
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassSimt>::value;
+
+  /// Number of threads total
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::ColumnMajorInterleaved<4>;
+  using SmemLayoutB = layout::RowMajorInterleaved<4>;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A (used as-is by SmemIteratorA, no transpose)
+  using IteratorThreadMapA = transform::PitchLinear2DThreadTileStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kM, Shape::kK>,
+    kThreads,
+    layout::PitchLinearShape<4, 4>
+  >;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileIterator2dThreadTile<
+    MatrixShape<Shape::kM, Shape::kK>, 
+    ElementA, 
+    SmemLayoutA,
+    1,
+    IteratorThreadMapA
+  >;
+  
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = transform::PitchLinear2DThreadTileStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kK, Shape::kN>,
+    kThreads,
+    layout::PitchLinearShape<4, 4>
+  >;
+
+  /// Transpose the ThreadMap of iterator B
+  using SmemThreadMapB = transform::TransposePitchLinearThreadMap2DThreadTile<IteratorThreadMapB>;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileIterator2dThreadTile<
+    MatrixShape<Shape::kK, Shape::kN>, 
+    ElementB, 
+    SmemLayoutB,
+    0,
+    SmemThreadMapB
+  >;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level op
+  static const int WarpNumThreadsM = detail::simt_get_warp_threads_m<WarpShape>();
+  static const int WarpNumThreadsN = kWarpSize / WarpNumThreadsM;
+  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
+  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
+  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
+      "WarpShape must be divisible by ThreadTile shape.");
+  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;  // 2 only if both thread-tile dims exceed 4
+  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;  // A elements per 128 bits (not referenced below)
+  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;  // B elements per 128 bits (not referenced below)
+  static const int LaneM = cutlass::const_min(4, ThreadTileM);
+  static const int LaneN = cutlass::const_min(4, ThreadTileN);
+  // Lane MMA shape: M and N are min(4, thread tile); K is fixed at 4
+  using LaneMmaShape = cutlass::gemm::GemmShape<
+      LaneM,
+      LaneN,
+      4>;
+
+  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
+      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
+      cutlass::layout::ColumnMajorInterleaved<LaneLayout>,         // LaneLayout
+      LaneMmaShape
+  >;
+
+  using MmaWarpSimt = cutlass::gemm::warp::MmaSimt<
+    WarpShape,    /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8
+    ElementA,     /// Data type of A elements
+    SmemLayoutA,  /// Layout of A matrix (concept: MatrixLayout)
+    ElementB,     /// Data type of B elements
+    SmemLayoutB,  /// Layout of B matrix (concept: MatrixLayout)
+    ElementC,     /// Element type of C matrix
+    LayoutC,      /// Layout of C matrix (concept: MatrixLayout)
+    Policy,       /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy)
+    PartitionsK   /// Number of partitions along K dimension
+    >;
+
+  static int const kPaddingM = detail::simt_transpose_padding(kWarpSize, Shape::kK, sizeof_bits<ElementA>::value);  // not passed to MmaPolicy below
+  static int const kPaddingN = detail::simt_transpose_padding(kWarpSize, Shape::kK, sizeof_bits<ElementB>::value);
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy = MmaPolicy<
+    MmaWarpSimt,
+    MatrixShape<0, 0>,
+    MatrixShape<0, kPaddingN>,
+    WarpCount::kK
+  >;
+};
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_sm70.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_sm70.h
new file mode 100755
index 000000000..41000dc18
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_sm70.h
@@ -0,0 +1,682 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines basic properties needed by CTA-level GEMMs assuming expectations about data
+      layout of the global memory fragments, data types, and internal tile sizes.
+
+      Partial specializations for threadblock::Mma operations targeting TensorOp instructions.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+
+
+#include "cutlass/layout/tensor_op_multiplicand_sm70.h"
+#include "cutlass/transform/pitch_linear_thread_map.h"
+#include "cutlass/transform/threadblock/regular_tile_iterator_tensor_op_sm70.h"
+
+#include "cutlass/gemm/warp/mma_tensor_op_sm70.h"
+#include "cutlass/gemm/threadblock/default_mma_core.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization of DefaultMmaCore for InstructionShape GemmShape<8, 8, 4>:
+///
+///   A: column-major
+///   B: row-major
+///   Operator: tensor op class
+///
+/// The warp-level operator is warp::MmaVoltaTensorOp, instantiated for the given warp tile
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Operation performed by GEMM
+    typename Operator_>
+struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<8, 8, 4>, ElementA_,
+                      layout::ColumnMajor, ElementB_, layout::RowMajor,
+                      ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_
+                      > {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = GemmShape<8, 8, 4>;
+  using ElementA = ElementA_;
+  using LayoutA = layout::ColumnMajor;
+  using ElementB = ElementB_;
+  using LayoutB = layout::RowMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using OperatorClass = arch::OpClassTensorOp;
+
+  /// Operator_ template argument, re-exported unchanged
+  using Operator = Operator_;
+
+  /// Number of warps along M, N and K (threadblock tile extent / warp tile extent)
+  using WarpCount = GemmShape<
+    Shape::kM / WarpShape::kM,
+    Shape::kN / WarpShape::kN,
+    Shape::kK / WarpShape::kK
+  >;
+
+  // Divisibility requirements: threadblock tile must be a whole multiple of the warp tile in M and N (K is not checked here)
+  static_assert(
+    !(Shape::kM % WarpShape::kM) &&
+    !(Shape::kN % WarpShape::kN),
+    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
+  );
+
+  /// Number of threads per warp, taken from warp::WarpSize for the tensor op class
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Total number of threads (warp count * threads per warp)
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size of a threadblock-scoped access, in bits; divided by the element width in the thread maps below
+  static int const kAccessSizeInBits = 128;
+
+  //
+  // Shared memory layouts (A first, then B)
+  //
+
+  using SmemLayoutA = 
+    layout::ColumnMajorVoltaTensorOpMultiplicandCongruous<
+      sizeof_bits<ElementA>::value>;
+
+  // Shared memory layout of the B operand
+  using SmemLayoutB = 
+    layout::RowMajorVoltaTensorOpMultiplicandBCongruous<
+      sizeof_bits<ElementB>::value>;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A (pitch-linear tile is Shape::kM x Shape::kK)
+  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
+    layout::PitchLinearShape<Shape::kM, Shape::kK>,
+    kThreads,
+    layout::PitchLinearShape<8, 4>,
+    kAccessSizeInBits / sizeof_bits<ElementA>::value
+  >;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileIterator<
+    MatrixShape<Shape::kM, Shape::kK>, 
+    ElementA, 
+    SmemLayoutA,
+    1,
+    IteratorThreadMapA
+  >;
+
+  /// ThreadMap of iterator B (pitch-linear tile is Shape::kN x Shape::kK)
+  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
+    layout::PitchLinearShape<Shape::kN, Shape::kK>,
+    kThreads,
+    layout::PitchLinearShape<8, 4>,
+    kAccessSizeInBits / sizeof_bits<ElementB>::value
+  >;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileIterator<
+    MatrixShape<Shape::kK, Shape::kN>, 
+    ElementB, 
+    SmemLayoutB,
+    0,
+    IteratorThreadMapB
+  >;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Policy for the warp-level tensor op: a 16x16x4 arch::Mma (OpMultiplyAdd, row-major C) with MatrixShape<1, 1>
+  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
+    cutlass::arch::Mma<
+      cutlass::gemm::GemmShape<16, 16, 4>,
+      32,
+      ElementA,
+      LayoutA,
+      ElementB,
+      LayoutB,
+      ElementC,
+      cutlass::layout::RowMajor,
+      cutlass::arch::OpMultiplyAdd
+    >,
+    cutlass::MatrixShape<1, 1>
+  >;
+
+  using MmaTensorOp = cutlass::gemm::warp::MmaVoltaTensorOp<
+    WarpShape,
+    ElementA,
+    SmemLayoutA,
+    ElementB,
+    SmemLayoutB,
+    ElementC,
+    LayoutC,
+    Policy
+  >;
+
+  /// Policy used to define MmaPipelined; both MatrixShape arguments are <0, 0> (presumably no smem padding - confirm against MmaPolicy)
+  using MmaPolicy = MmaPolicy<
+    MmaTensorOp,
+    MatrixShape<0, 0>,
+    MatrixShape<0, 0>,
+    WarpCount::kK
+  >;
+};
+
+/// Partial specialization of DefaultMmaCore for InstructionShape GemmShape<8, 8, 4>:
+///
+///   A: row-major
+///   B: column-major
+///   Operator: tensor op class
+///
+/// The warp-level operator is warp::MmaVoltaTensorOp, instantiated for the given warp tile
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Operation performed by GEMM
+    typename Operator_>
+struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<8, 8, 4>, ElementA_,
+                      layout::RowMajor, ElementB_, layout::ColumnMajor,
+                      ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_
+                      > {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = GemmShape<8, 8, 4>;
+  using ElementA = ElementA_;
+  using LayoutA = layout::RowMajor;
+  using ElementB = ElementB_;
+  using LayoutB = layout::ColumnMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using OperatorClass = arch::OpClassTensorOp;
+
+  /// Operator_ template argument, re-exported unchanged
+  using Operator = Operator_;
+
+  /// Number of warps along M, N and K (threadblock tile extent / warp tile extent)
+  using WarpCount = GemmShape<
+    Shape::kM / WarpShape::kM,
+    Shape::kN / WarpShape::kN,
+    Shape::kK / WarpShape::kK
+  >;
+
+  // Divisibility requirements: threadblock tile must be a whole multiple of the warp tile in M and N (K is not checked here)
+  static_assert(
+    !(Shape::kM % WarpShape::kM) &&
+    !(Shape::kN % WarpShape::kN),
+    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
+  );
+
+  /// Number of threads per warp, taken from warp::WarpSize for the tensor op class
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Total number of threads (warp count * threads per warp)
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size of a threadblock-scoped access, in bits; divided by the element width in the thread maps below
+  static int const kAccessSizeInBits = 128;
+
+  //
+  // Shared memory layouts (A first, then B); both crosswise layouts are parameterized on Shape::kK
+  //
+
+  using SmemLayoutA = layout::RowMajorVoltaTensorOpMultiplicandCrosswise<
+      sizeof_bits<ElementA>::value, Shape::kK>;
+
+  // Shared memory layout of the B operand
+  using SmemLayoutB = layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise<
+      sizeof_bits<ElementB>::value, Shape::kK>;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A (pitch-linear tile is Shape::kK x Shape::kM)
+  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
+    layout::PitchLinearShape<Shape::kK, Shape::kM>,
+    kThreads,
+    layout::PitchLinearShape<4, 8>,
+    kAccessSizeInBits / sizeof_bits<ElementA>::value
+  >;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileIterator<
+    MatrixShape<Shape::kM, Shape::kK>, 
+    ElementA, 
+    SmemLayoutA,
+    0,
+    IteratorThreadMapA
+  >;
+
+  /// ThreadMap of iterator B (pitch-linear tile is Shape::kK x Shape::kN)
+  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
+    layout::PitchLinearShape<Shape::kK, Shape::kN>,
+    kThreads,
+    layout::PitchLinearShape<4, 8>,
+    kAccessSizeInBits / sizeof_bits<ElementB>::value
+  >;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileIterator<
+    MatrixShape<Shape::kK, Shape::kN>, 
+    ElementB, 
+    SmemLayoutB,
+    1,
+    IteratorThreadMapB
+  >;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Policy for the warp-level tensor op: a 16x16x4 arch::Mma (OpMultiplyAdd, row-major C) with MatrixShape<1, 1>
+  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
+    cutlass::arch::Mma<
+      cutlass::gemm::GemmShape<16, 16, 4>,
+      32,
+      ElementA,
+      LayoutA,
+      ElementB,
+      LayoutB,
+      ElementC,
+      cutlass::layout::RowMajor,
+      cutlass::arch::OpMultiplyAdd
+    >,
+    cutlass::MatrixShape<1, 1>
+  >;
+
+  using MmaTensorOp = cutlass::gemm::warp::MmaVoltaTensorOp<
+    WarpShape,
+    ElementA,
+    SmemLayoutA,
+    ElementB,
+    SmemLayoutB,
+    ElementC,
+    LayoutC,
+    Policy
+  >;
+
+  /// Policy used to define MmaPipelined; both MatrixShape arguments are <0, 0> (presumably no smem padding - confirm against MmaPolicy)
+  using MmaPolicy = MmaPolicy<
+    MmaTensorOp,
+    MatrixShape<0, 0>,
+    MatrixShape<0, 0>,
+    WarpCount::kK
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization of DefaultMmaCore for InstructionShape GemmShape<8, 8, 4>:
+///
+///   A: row-major
+///   B: row-major
+///   Operator: tensor op class
+///
+/// The warp-level operator is warp::MmaVoltaTensorOp, instantiated for the given warp tile
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Operation performed by GEMM
+    typename Operator_>
+struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<8, 8, 4>, ElementA_,
+                      layout::RowMajor, ElementB_, layout::RowMajor, ElementC_,
+                      LayoutC_, arch::OpClassTensorOp, 2, Operator_
+                      > {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = GemmShape<8, 8, 4>;
+  using ElementA = ElementA_;
+  using LayoutA = layout::RowMajor;
+  using ElementB = ElementB_;
+  using LayoutB = layout::RowMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using OperatorClass = arch::OpClassTensorOp;
+
+  /// Operator_ template argument, re-exported unchanged
+  using Operator = Operator_;
+
+  /// Number of warps along M, N and K (threadblock tile extent / warp tile extent)
+  using WarpCount = GemmShape<
+    Shape::kM / WarpShape::kM,
+    Shape::kN / WarpShape::kN,
+    Shape::kK / WarpShape::kK
+  >;
+
+  // Divisibility requirements: threadblock tile must be a whole multiple of the warp tile in M and N (K is not checked here)
+  static_assert(
+    !(Shape::kM % WarpShape::kM) &&
+    !(Shape::kN % WarpShape::kN),
+    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
+  );
+
+  /// Number of threads per warp, taken from warp::WarpSize for the tensor op class
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Total number of threads (warp count * threads per warp)
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size of a threadblock-scoped access, in bits; divided by the element width in the thread maps below
+  static int const kAccessSizeInBits = 128;
+
+  //
+  // Shared memory layouts (A is crosswise and parameterized on Shape::kK; B is congruous)
+  //
+
+  using SmemLayoutA = layout::RowMajorVoltaTensorOpMultiplicandCrosswise<
+      sizeof_bits<ElementA>::value, Shape::kK>;
+
+  // Shared memory layout of the B operand
+  using SmemLayoutB = layout::RowMajorVoltaTensorOpMultiplicandBCongruous<
+      sizeof_bits<ElementB>::value>;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A (pitch-linear tile is Shape::kK x Shape::kM)
+  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
+    layout::PitchLinearShape<Shape::kK, Shape::kM>,
+    kThreads,
+    layout::PitchLinearShape<4, 8>,
+    kAccessSizeInBits / sizeof_bits<ElementA>::value
+  >;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileIterator<
+    MatrixShape<Shape::kM, Shape::kK>, 
+    ElementA, 
+    SmemLayoutA,
+    0,
+    IteratorThreadMapA
+  >;
+
+  /// ThreadMap of iterator B (pitch-linear tile is Shape::kN x Shape::kK)
+  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
+    layout::PitchLinearShape<Shape::kN, Shape::kK>,
+    kThreads,
+    layout::PitchLinearShape<8, 4>,
+    kAccessSizeInBits / sizeof_bits<ElementB>::value
+  >;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileIterator<
+    MatrixShape<Shape::kK, Shape::kN>, 
+    ElementB, 
+    SmemLayoutB,
+    0,
+    IteratorThreadMapB
+  >;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Policy for the warp-level tensor op: a 16x16x4 arch::Mma (OpMultiplyAdd, row-major C) with MatrixShape<1, 1>
+  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
+    cutlass::arch::Mma<
+      cutlass::gemm::GemmShape<16, 16, 4>,
+      32,
+      ElementA,
+      LayoutA,
+      ElementB,
+      LayoutB,
+      ElementC,
+      cutlass::layout::RowMajor,
+      cutlass::arch::OpMultiplyAdd
+    >,
+    cutlass::MatrixShape<1, 1>
+  >;
+
+  using MmaTensorOp = cutlass::gemm::warp::MmaVoltaTensorOp<
+    WarpShape,
+    ElementA,
+    SmemLayoutA,
+    ElementB,
+    SmemLayoutB,
+    ElementC,
+    LayoutC,
+    Policy
+  >;
+
+  /// Policy used to define MmaPipelined; both MatrixShape arguments are <0, 0> (presumably no smem padding - confirm against MmaPolicy)
+  using MmaPolicy = MmaPolicy<
+    MmaTensorOp,
+    MatrixShape<0, 0>,
+    MatrixShape<0, 0>,
+    WarpCount::kK
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization of DefaultMmaCore for InstructionShape GemmShape<8, 8, 4>:
+///
+///   A: column-major
+///   B: column-major
+///   Operator: tensor op class
+///
+/// The warp-level operator is warp::MmaVoltaTensorOp, instantiated for the given warp tile
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Operation performed by GEMM
+    typename Operator_>
+struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<8, 8, 4>, ElementA_,
+                      layout::ColumnMajor, ElementB_, layout::ColumnMajor,
+                      ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_
+                      > {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = GemmShape<8, 8, 4>;
+  using ElementA = ElementA_;
+  using LayoutA = layout::ColumnMajor;
+  using ElementB = ElementB_;
+  using LayoutB = layout::ColumnMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using OperatorClass = arch::OpClassTensorOp;
+
+  /// Operator_ template argument, re-exported unchanged
+  using Operator = Operator_;
+
+  /// Number of warps along M, N and K (threadblock tile extent / warp tile extent)
+  using WarpCount = GemmShape<
+    Shape::kM / WarpShape::kM,
+    Shape::kN / WarpShape::kN,
+    Shape::kK / WarpShape::kK
+  >;
+
+  // Divisibility requirements: threadblock tile must be a whole multiple of the warp tile in M and N (K is not checked here)
+  static_assert(
+    !(Shape::kM % WarpShape::kM) &&
+    !(Shape::kN % WarpShape::kN),
+    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
+  );
+
+  /// Number of threads per warp, taken from warp::WarpSize for the tensor op class
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Total number of threads (warp count * threads per warp)
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size of a threadblock-scoped access, in bits; divided by the element width in the thread maps below
+  static int const kAccessSizeInBits = 128;
+
+  //
+  // Shared memory layouts (A is congruous; B is crosswise and parameterized on Shape::kK)
+  //
+
+  using SmemLayoutA = layout::ColumnMajorVoltaTensorOpMultiplicandCongruous<
+      sizeof_bits<ElementA>::value>;
+
+  // Shared memory layout of the B operand
+  using SmemLayoutB = layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise<
+      sizeof_bits<ElementB>::value, Shape::kK>;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A (pitch-linear tile is Shape::kM x Shape::kK)
+  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
+    layout::PitchLinearShape<Shape::kM, Shape::kK>,
+    kThreads,
+    layout::PitchLinearShape<8, 4>,
+    kAccessSizeInBits / sizeof_bits<ElementA>::value
+  >;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileIterator<
+    MatrixShape<Shape::kM, Shape::kK>, 
+    ElementA, 
+    SmemLayoutA,
+    1,
+    IteratorThreadMapA
+  >;
+
+  /// ThreadMap of iterator B (pitch-linear tile is Shape::kK x Shape::kN)
+  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
+    layout::PitchLinearShape<Shape::kK, Shape::kN>,
+    kThreads,
+    layout::PitchLinearShape<4, 8>,
+    kAccessSizeInBits / sizeof_bits<ElementB>::value
+  >;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileIterator<
+    MatrixShape<Shape::kK, Shape::kN>, 
+    ElementB, 
+    SmemLayoutB,
+    1,
+    IteratorThreadMapB
+  >;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Policy for the warp-level tensor op: a 16x16x4 arch::Mma (OpMultiplyAdd, row-major C) with MatrixShape<1, 1>
+  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
+    cutlass::arch::Mma<
+      cutlass::gemm::GemmShape<16, 16, 4>,
+      32,
+      ElementA,
+      LayoutA,
+      ElementB,
+      LayoutB,
+      ElementC,
+      cutlass::layout::RowMajor,
+      cutlass::arch::OpMultiplyAdd
+    >,
+    cutlass::MatrixShape<1, 1>
+  >;
+
+  using MmaTensorOp = cutlass::gemm::warp::MmaVoltaTensorOp<
+    WarpShape,
+    ElementA,
+    SmemLayoutA,
+    ElementB,
+    SmemLayoutB,
+    ElementC,
+    LayoutC,
+    Policy
+  >;
+
+  /// Policy used to define MmaPipelined; both MatrixShape arguments are <0, 0> (presumably no smem padding - confirm against MmaPolicy)
+  using MmaPolicy = MmaPolicy<
+    MmaTensorOp,
+    MatrixShape<0, 0>,
+    MatrixShape<0, 0>,
+    WarpCount::kK
+  >;
+};
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_sm75.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_sm75.h
new file mode 100755
index 000000000..0162ef0df
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_sm75.h
@@ -0,0 +1,1315 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines basic properties needed by CTA-level GEMMs assuming expectations about data
+      layout of the global memory fragments, data types, and internal tile sizes.
+
+      Partial specializations for threadblock::Mma operations targeting TensorOp instructions.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/platform/platform.h"
+
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/layout/tensor_op_multiplicand_sm75.h"
+#include "cutlass/transform/pitch_linear_thread_map.h"
+#include "cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h"
+
+#include "cutlass/gemm/warp/default_mma_tensor_op.h"
+#include "cutlass/gemm/threadblock/default_mma_core.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization of DefaultMmaCore for any InstructionShape_:
+///
+///   A: column-major
+///   B: row-major
+///   Operator: tensor op class
+///
+/// The warp-level operator comes from warp::DefaultMmaTensorOp for the given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix multiply instruction (concept: GemmShape)
+    typename InstructionShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Operation performed by GEMM
+    typename Operator_>
+struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
+                      layout::ColumnMajor, ElementB_, layout::RowMajor,
+                      ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_
+                      > {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = ElementA_;
+  using LayoutA = layout::ColumnMajor;
+  using ElementB = ElementB_;
+  using LayoutB = layout::RowMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using OperatorClass = arch::OpClassTensorOp;
+
+  /// Number of warps along M, N and K (threadblock tile extent / warp tile extent)
+  using WarpCount = GemmShape<
+    Shape::kM / WarpShape::kM,
+    Shape::kN / WarpShape::kN,
+    Shape::kK / WarpShape::kK
+  >;
+
+  // Divisibility requirements: threadblock tile must be a whole multiple of the warp tile in M and N (K is not checked here)
+  static_assert(
+    !(Shape::kM % WarpShape::kM) &&
+    !(Shape::kN % WarpShape::kN),
+    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
+  );
+
+  /// Number of threads per warp, taken from warp::WarpSize for the tensor op class
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Total number of threads (warp count * threads per warp)
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size of a threadblock-scoped access, in bits; divided by the element width in the thread maps below
+  static int const kAccessSizeInBits = 128;
+
+  /// Operator_ template argument, re-exported unchanged
+  using Operator = Operator_;
+
+  // Warp thread arrangement: contiguous extent is accesses along M (A) or N (B), capped at 8; strided extent is kWarpSize / contiguous
+  static int const kWarpThreadArrangementContiguousA =
+      platform::min(Shape::kM / (kAccessSizeInBits / sizeof_bits<ElementA>::value), 8);
+
+  static int const kWarpThreadArrangementStridedA =
+      kWarpSize / kWarpThreadArrangementContiguousA;
+
+  static int const kWarpThreadArrangementContiguousB =
+      platform::min(Shape::kN / (kAccessSizeInBits / sizeof_bits<ElementB>::value), 8);
+
+  static int const kWarpThreadArrangementStridedB =
+      kWarpSize / kWarpThreadArrangementContiguousB;
+
+  //
+  // Shared memory layouts: Crosswise_A / Crosswise_B are min(128 / sizeof(Element), kM or kN)
+  //
+  static int const Crosswise_A = platform::min(int(128 / sizeof(ElementA)),
+                                               Shape::kM);
+  using SmemLayoutA = 
+    layout::ColumnMajorTensorOpMultiplicandCongruous<
+      sizeof_bits<ElementA>::value, Crosswise_A>;
+
+  // Shared memory layout of the B operand
+  static int const Crosswise_B = platform::min(int(128 / sizeof(ElementB)),
+                                               Shape::kN);
+  using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous<
+    sizeof_bits<ElementB>::value, Crosswise_B>;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A (pitch-linear tile is Shape::kM x Shape::kK)
+  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
+    layout::PitchLinearShape<Shape::kM, Shape::kK>,
+    kThreads,
+    layout::PitchLinearShape<kWarpThreadArrangementContiguousA,
+                             kWarpThreadArrangementStridedA>,
+    kAccessSizeInBits / sizeof_bits<ElementA>::value
+  >;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileIterator<
+    MatrixShape<Shape::kM, Shape::kK>, 
+    ElementA, 
+    SmemLayoutA,
+    1,
+    IteratorThreadMapA
+  >;
+
+  /// ThreadMap of iterator B (pitch-linear tile is Shape::kN x Shape::kK)
+  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
+    layout::PitchLinearShape<Shape::kN, Shape::kK>,
+    kThreads,
+    layout::PitchLinearShape<kWarpThreadArrangementContiguousB,
+                             kWarpThreadArrangementStridedB>,
+    kAccessSizeInBits / sizeof_bits<ElementB>::value
+  >;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileIterator<
+    MatrixShape<Shape::kK, Shape::kN>, 
+    ElementB, 
+    SmemLayoutB,
+    0,
+    IteratorThreadMapB
+  >;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level tensor op via warp::DefaultMmaTensorOp, passing WarpCount::kK as its last argument
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
+      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
+      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
+
+  /// Policy used to define MmaPipelined; both MatrixShape arguments are <0, 0> (presumably no smem padding - confirm against MmaPolicy)
+  using MmaPolicy = MmaPolicy<
+    MmaTensorOp,
+    MatrixShape<0, 0>,
+    MatrixShape<0, 0>,
+    WarpCount::kK
+  >;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization of DefaultMmaCore for any InstructionShape_:
+///
+///   A: row-major
+///   B: column-major
+///   Operator: tensor op class
+///
+/// The warp-level operator comes from warp::DefaultMmaTensorOp for the given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix multiply instruction (concept: GemmShape)
+    typename InstructionShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Operation performed by MMA
+    typename Operator_>
+struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
+                      layout::RowMajor, ElementB_, layout::ColumnMajor,
+                      ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_
+                      > {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = ElementA_;
+  using LayoutA = layout::RowMajor;
+  using ElementB = ElementB_;
+  using LayoutB = layout::ColumnMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using OperatorClass = arch::OpClassTensorOp;
+
+  /// Number of warps along M, N and K (threadblock tile extent / warp tile extent)
+  using WarpCount = GemmShape<
+    Shape::kM / WarpShape::kM,
+    Shape::kN / WarpShape::kN,
+    Shape::kK / WarpShape::kK
+  >;
+
+  // Divisibility requirements: threadblock tile must be a whole multiple of the warp tile in M and N (K is not checked here)
+  static_assert(
+    !(Shape::kM % WarpShape::kM) &&
+    !(Shape::kN % WarpShape::kN),
+    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
+  );
+
+  /// Number of threads per warp, taken from warp::WarpSize for the tensor op class
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Total number of threads (warp count * threads per warp)
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size of a threadblock-scoped access, in bits; divided by the element width in the thread maps below
+  static int const kAccessSizeInBits = 128;
+
+  /// Operator_ template argument, re-exported unchanged
+  using Operator = Operator_;
+
+  // Warp thread arrangement: contiguous extent is the number of accesses spanning Shape::kK; strided extent is kWarpSize / contiguous
+  static int const kWarpThreadArrangementContiguousA =
+      Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementA>::value);
+
+  static int const kWarpThreadArrangementStridedA =
+      kWarpSize / kWarpThreadArrangementContiguousA;
+
+  static int const kWarpThreadArrangementContiguousB =
+      Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementB>::value);
+
+  static int const kWarpThreadArrangementStridedB =
+      kWarpSize / kWarpThreadArrangementContiguousB;
+
+  //
+  // Shared memory layouts (A first, then B); both crosswise layouts are parameterized on Shape::kK
+  //
+
+  using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise<
+      sizeof_bits<ElementA>::value, Shape::kK>;
+
+  // Shared memory layout of the B operand
+  using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise<
+      sizeof_bits<ElementB>::value, Shape::kK>;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A (pitch-linear tile is Shape::kK x Shape::kM)
+  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kK, Shape::kM>, kThreads,
+      layout::PitchLinearShape<kWarpThreadArrangementContiguousA,
+                               kWarpThreadArrangementStridedA>,
+      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileIterator<
+    MatrixShape<Shape::kM, Shape::kK>, 
+    ElementA, 
+    SmemLayoutA,
+    0,
+    IteratorThreadMapA
+  >;
+
+  /// ThreadMap of iterator B (pitch-linear tile is Shape::kK x Shape::kN)
+  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kK, Shape::kN>, kThreads,
+      layout::PitchLinearShape<kWarpThreadArrangementContiguousB,
+                               kWarpThreadArrangementStridedB>,
+      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileIterator<
+    MatrixShape<Shape::kK, Shape::kN>, 
+    ElementB, 
+    SmemLayoutB,
+    1,
+    IteratorThreadMapB
+  >;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level tensor op via warp::DefaultMmaTensorOp, passing WarpCount::kK as its last argument
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
+      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
+      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
+
+  /// Policy used to define MmaPipelined; both MatrixShape arguments are <0, 0> (presumably no smem padding - confirm against MmaPolicy)
+  using MmaPolicy = MmaPolicy<
+    MmaTensorOp,
+    MatrixShape<0, 0>,
+    MatrixShape<0, 0>,
+    WarpCount::kK
+  >;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization:
+///
+///   A: row-major
+///   B: row-major
+///   Operator: tensor op class
+///
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Operation performed by MMA
+    typename Operator_>
+struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
+                      layout::RowMajor, ElementB_, layout::RowMajor, ElementC_,
+                      LayoutC_, arch::OpClassTensorOp, 2, Operator_
+                      > {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = ElementA_;
+  using LayoutA = layout::RowMajor;
+  using ElementB = ElementB_;
+  using LayoutB = layout::RowMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using OperatorClass = arch::OpClassTensorOp;
+
+  /// Number of warps along the M, N, and K dimensions of the threadblock tile
+  using WarpCount = GemmShape<
+    Shape::kM / WarpShape::kM,
+    Shape::kN / WarpShape::kN,
+    Shape::kK / WarpShape::kK
+  >;
+
+  // Divisibility requirements: the threadblock tile must be a multiple of the warp tile in M and N
+  static_assert(
+    !(Shape::kM % WarpShape::kM) &&
+    !(Shape::kN % WarpShape::kN),
+    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
+  );
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Total number of threads: warp count times threads per warp
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size of a threadblock-scoped access, in bits
+  static int const kAccessSizeInBits = 128;
+
+  /// Operator forwarded unchanged to the warp-level MMA
+  using Operator = Operator_;
+
+  // Warp thread arrangement: the strided count is kWarpSize divided by the contiguous count
+  static int const kWarpThreadArrangementContiguousA =
+      Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementA>::value);
+
+  static int const kWarpThreadArrangementStridedA =
+      kWarpSize / kWarpThreadArrangementContiguousA;
+
+  static int const kWarpThreadArrangementContiguousB =
+      platform::min(Shape::kN / (kAccessSizeInBits / sizeof_bits<ElementB>::value), 8);
+
+  static int const kWarpThreadArrangementStridedB =
+      kWarpSize / kWarpThreadArrangementContiguousB;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise<
+      sizeof_bits<ElementA>::value, Shape::kK>;
+
+  // Shared memory layout of the B operand; Crosswise_B is capped at Shape::kN
+  static int const Crosswise_B = platform::min(int(128 / sizeof(ElementB)),
+                                               Shape::kN);
+
+  using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous<
+      sizeof_bits<ElementB>::value, Crosswise_B>;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kK, Shape::kM>, kThreads,
+      layout::PitchLinearShape<kWarpThreadArrangementContiguousA,
+                               kWarpThreadArrangementStridedA>,
+      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileIterator<
+    MatrixShape<Shape::kM, Shape::kK>, 
+    ElementA, 
+    SmemLayoutA,
+    0,
+    IteratorThreadMapA
+  >;
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
+    layout::PitchLinearShape<Shape::kN, Shape::kK>,
+    kThreads,
+    layout::PitchLinearShape<kWarpThreadArrangementContiguousB,
+                             kWarpThreadArrangementStridedB>,
+    kAccessSizeInBits / sizeof_bits<ElementB>::value
+  >;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileIterator<
+    MatrixShape<Shape::kK, Shape::kN>, 
+    ElementB, 
+    SmemLayoutB,
+    0,
+    IteratorThreadMapB
+  >;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level tensor op
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
+      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
+      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy = MmaPolicy<
+    MmaTensorOp,
+    MatrixShape<0, 0>,
+    MatrixShape<0, 0>,
+    WarpCount::kK
+  >;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization:
+///
+///   A: column-major
+///   B: column-major
+///   Operator: tensor op class
+///
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Operation performed by MMA
+    typename Operator_>
+struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
+                      layout::ColumnMajor, ElementB_, layout::ColumnMajor,
+                      ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_
+                      > {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = ElementA_;
+  using LayoutA = layout::ColumnMajor;
+  using ElementB = ElementB_;
+  using LayoutB = layout::ColumnMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using OperatorClass = arch::OpClassTensorOp;
+
+  /// Number of warps along the M, N, and K dimensions of the threadblock tile
+  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
+                              Shape::kN / WarpShape::kN, 
+                              Shape::kK / WarpShape::kK>;
+
+  // Divisibility requirements: the threadblock tile must be a multiple of the warp tile in M and N
+  static_assert(
+      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
+      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Total number of threads: warp count times threads per warp
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size of a threadblock-scoped access, in bits
+  static int const kAccessSizeInBits = 128;
+
+  /// Operator forwarded unchanged to the warp-level MMA
+  using Operator = Operator_; 
+
+  // Warp thread arrangement: the strided count is kWarpSize divided by the contiguous count
+  static int const kWarpThreadArrangementContiguousA =
+      platform::min(Shape::kM / (kAccessSizeInBits / sizeof_bits<ElementA>::value), 8);
+
+  static int const kWarpThreadArrangementStridedA =
+      kWarpSize / kWarpThreadArrangementContiguousA;
+  // B's arrangement uses ElementB so it agrees with IteratorThreadMapB's elements-per-access
+  static int const kWarpThreadArrangementContiguousB =
+      Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementB>::value);
+
+  static int const kWarpThreadArrangementStridedB =
+      kWarpSize / kWarpThreadArrangementContiguousB;
+
+  //
+  // Shared memory layouts (Crosswise_A is capped at Shape::kM)
+  //
+  static int const Crosswise_A = platform::min(int(128 / sizeof(ElementA)),
+                                               Shape::kM);
+  using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous<
+      sizeof_bits<ElementA>::value, Crosswise_A>;
+
+  // Shared memory layout of the B operand
+  using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise<
+      sizeof_bits<ElementB>::value, Shape::kK>;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kM, Shape::kK>, kThreads,
+      layout::PitchLinearShape<kWarpThreadArrangementContiguousA,
+                               kWarpThreadArrangementStridedA>,
+      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileIterator<
+      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 1,
+      IteratorThreadMapA>;
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kK, Shape::kN>, kThreads,
+      layout::PitchLinearShape<kWarpThreadArrangementContiguousB,
+                               kWarpThreadArrangementStridedB>,
+      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileIterator<
+      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 1,
+      IteratorThreadMapB>;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level tensor op
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
+      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
+      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
+                                       MatrixShape<0, 0>, WarpCount::kK>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+/// The following float specializations are for arch::OpMultiplyAddFastF16
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization:
+///
+///   A: column-major
+///   B: row-major
+///   Operator: tensor op class
+///
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Layout of accumulator
+    typename LayoutC_>
+struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, float,
+                      layout::ColumnMajor, float, layout::RowMajor, float,
+                      LayoutC_, arch::OpClassTensorOp, 2,
+                      arch::OpMultiplyAddFastF16> {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = float;
+  using LayoutA = layout::ColumnMajor;
+  using ElementB = float;
+  using LayoutB = layout::RowMajor;
+  using ElementC = float;
+  using LayoutC = LayoutC_;
+  using OperatorClass = arch::OpClassTensorOp;
+
+  /// Number of warps along the M, N, and K dimensions of the threadblock tile
+  using WarpCount = GemmShape<
+    Shape::kM / WarpShape::kM,
+    Shape::kN / WarpShape::kN,
+    Shape::kK / WarpShape::kK
+  >;
+
+  // Divisibility requirements: the threadblock tile must be a multiple of the warp tile in M and N
+  static_assert(
+    !(Shape::kM % WarpShape::kM) &&
+    !(Shape::kN % WarpShape::kN),
+    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
+  );
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Total number of threads: warp count times threads per warp
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size of a threadblock-scoped access, in bits
+  static int const kAccessSizeInBits = 256;
+
+  /// Operator forwarded to the warp-level MMA, which is instantiated on half_t operands below
+  using Operator = arch::OpMultiplyAdd;
+
+  //
+  // Shared memory layouts (parameterized on half_t rather than the float ElementA/ElementB)
+  //
+
+  using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous<
+      sizeof_bits<half_t>::value, int(128 / sizeof(half_t))>;
+
+  // Shared memory layout of the B operand
+  using SmemLayoutB =
+      layout::RowMajorTensorOpMultiplicandCongruous<sizeof_bits<half_t>::value,
+                                                    int(128 / sizeof(half_t))>;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
+    layout::PitchLinearShape<Shape::kM, Shape::kK>,
+    kThreads,
+    layout::PitchLinearShape<8, 4>,
+    kAccessSizeInBits / sizeof_bits<ElementA>::value
+  >;
+
+  /// Shared memory iterator to A operand (element type half_t)
+  using SmemIteratorA = transform::threadblock::RegularTileIterator<
+    MatrixShape<Shape::kM, Shape::kK>, 
+    half_t, 
+    SmemLayoutA,
+    1,
+    IteratorThreadMapA
+  >;
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
+    layout::PitchLinearShape<Shape::kN, Shape::kK>,
+    kThreads,
+    layout::PitchLinearShape<8, 4>,
+    kAccessSizeInBits / sizeof_bits<ElementB>::value
+  >;
+
+  /// Shared memory iterator to B operand (element type half_t)
+  using SmemIteratorB = transform::threadblock::RegularTileIterator<
+    MatrixShape<Shape::kK, Shape::kN>, 
+    half_t, 
+    SmemLayoutB,
+    0,
+    IteratorThreadMapB
+  >;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level tensor op (half_t A/B operands, ElementC accumulator)
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
+      WarpShape, InstructionShape, half_t, SmemLayoutA, half_t, SmemLayoutB,
+      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy = MmaPolicy<
+    MmaTensorOp,
+    MatrixShape<0, 0>,
+    MatrixShape<0, 0>,
+    WarpCount::kK
+  >;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization:
+///
+///   A: row-major
+///   B: column-major
+///   Operator: tensor op class
+///
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Layout of accumulator
+    typename LayoutC_>
+struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, float,
+                      layout::RowMajor, float, layout::ColumnMajor, float,
+                      LayoutC_, arch::OpClassTensorOp, 2,
+                      arch::OpMultiplyAddFastF16> {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = float;
+  using LayoutA = layout::RowMajor;
+  using ElementB = float;
+  using LayoutB = layout::ColumnMajor;
+  using ElementC = float;
+  using LayoutC = LayoutC_;
+  using OperatorClass = arch::OpClassTensorOp;
+
+  /// Number of warps along the M, N, and K dimensions of the threadblock tile
+  using WarpCount = GemmShape<
+    Shape::kM / WarpShape::kM,
+    Shape::kN / WarpShape::kN,
+    Shape::kK / WarpShape::kK
+  >;
+
+  // Divisibility requirements: the threadblock tile must be a multiple of the warp tile in M and N
+  static_assert(
+    !(Shape::kM % WarpShape::kM) &&
+    !(Shape::kN % WarpShape::kN),
+    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
+  );
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Total number of threads: warp count times threads per warp
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size of a threadblock-scoped access, in bits
+  static int const kAccessSizeInBits = 256;
+
+  /// Operator forwarded to the warp-level MMA, which is instantiated on half_t operands below
+  using Operator = arch::OpMultiplyAdd;
+
+  // Warp thread arrangement (B reuses sizeof_bits<ElementA>; both operands are float here)
+  static int const kWarpThreadArrangementContiguousA =
+      Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementA>::value);
+
+  static int const kWarpThreadArrangementStridedA =
+      kWarpSize / kWarpThreadArrangementContiguousA;
+
+  static int const kWarpThreadArrangementContiguousB =
+      Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementA>::value);
+
+  static int const kWarpThreadArrangementStridedB =
+      kWarpSize / kWarpThreadArrangementContiguousB;
+
+  //
+  // Shared memory layouts (parameterized on half_t rather than the float ElementA/ElementB)
+  //
+
+  using SmemLayoutA =
+      layout::RowMajorTensorOpMultiplicandCrosswise<sizeof_bits<half_t>::value,
+                                                    Shape::kK>;
+
+  // Shared memory layout of the B operand
+  using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise<
+      sizeof_bits<half_t>::value, Shape::kK>;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kK, Shape::kM>, kThreads,
+      layout::PitchLinearShape<kWarpThreadArrangementContiguousA,
+                               kWarpThreadArrangementStridedA>,
+      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
+
+  /// Shared memory iterator to A operand (element type half_t)
+  using SmemIteratorA = transform::threadblock::RegularTileIterator<
+    MatrixShape<Shape::kM, Shape::kK>, 
+    half_t, 
+    SmemLayoutA,
+    0,
+    IteratorThreadMapA
+  >;
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kK, Shape::kN>, kThreads,
+      layout::PitchLinearShape<kWarpThreadArrangementContiguousB,
+                               kWarpThreadArrangementStridedB>,
+      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
+
+  /// Shared memory iterator to B operand (element type half_t)
+  using SmemIteratorB = transform::threadblock::RegularTileIterator<
+    MatrixShape<Shape::kK, Shape::kN>, 
+    half_t, 
+    SmemLayoutB,
+    1,
+    IteratorThreadMapB
+  >;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level tensor op (half_t A/B operands, ElementC accumulator)
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
+      WarpShape, InstructionShape, half_t, SmemLayoutA, half_t, SmemLayoutB,
+      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy = MmaPolicy<
+    MmaTensorOp,
+    MatrixShape<0, 0>,
+    MatrixShape<0, 0>,
+    WarpCount::kK
+  >;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization:
+///
+///   A: row-major
+///   B: row-major
+///   Operator: tensor op class
+///
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Layout of accumulator
+    typename LayoutC_>
+struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, float,
+                      layout::RowMajor, float, layout::RowMajor, float,
+                      LayoutC_, arch::OpClassTensorOp, 2,
+                      arch::OpMultiplyAddFastF16> {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = float;
+  using LayoutA = layout::RowMajor;
+  using ElementB = float;
+  using LayoutB = layout::RowMajor;
+  using ElementC = float;
+  using LayoutC = LayoutC_;
+  using OperatorClass = arch::OpClassTensorOp;
+
+  /// Number of warps along the M, N, and K dimensions of the threadblock tile
+  using WarpCount = GemmShape<
+    Shape::kM / WarpShape::kM,
+    Shape::kN / WarpShape::kN,
+    Shape::kK / WarpShape::kK
+  >;
+
+  // Divisibility requirements: the threadblock tile must be a multiple of the warp tile in M and N
+  static_assert(
+    !(Shape::kM % WarpShape::kM) &&
+    !(Shape::kN % WarpShape::kN),
+    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
+  );
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Total number of threads: warp count times threads per warp
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size of a threadblock-scoped access, in bits
+  static int const kAccessSizeInBits = 256;
+
+  /// Operator forwarded to the warp-level MMA, which is instantiated on half_t operands below
+  using Operator = arch::OpMultiplyAdd;
+
+  // Warp thread arrangement for A only; iterator B uses a fixed PitchLinearShape<8, 4>
+  static int const kWarpThreadArrangementContiguousA =
+      Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementA>::value);
+
+  static int const kWarpThreadArrangementStridedA =
+      kWarpSize / kWarpThreadArrangementContiguousA;
+
+  //
+  // Shared memory layouts (parameterized on half_t rather than the float ElementA/ElementB)
+  //
+
+  using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise<
+      sizeof_bits<half_t>::value, Shape::kK>;
+
+  // Shared memory layout of the B operand
+  using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous<
+      sizeof_bits<half_t>::value, int(128 / sizeof(half_t))>;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kK, Shape::kM>, kThreads,
+      layout::PitchLinearShape<kWarpThreadArrangementContiguousA,
+                               kWarpThreadArrangementStridedA>,
+      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
+
+  /// Shared memory iterator to A operand (element type half_t)
+  using SmemIteratorA = transform::threadblock::RegularTileIterator<
+    MatrixShape<Shape::kM, Shape::kK>, 
+    half_t,
+    SmemLayoutA,
+    0,
+    IteratorThreadMapA
+  >;
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
+    layout::PitchLinearShape<Shape::kN, Shape::kK>,
+    kThreads,
+    layout::PitchLinearShape<8, 4>,
+    kAccessSizeInBits / sizeof_bits<ElementB>::value
+  >;
+
+  /// Shared memory iterator to B operand (element type half_t)
+  using SmemIteratorB = transform::threadblock::RegularTileIterator<
+    MatrixShape<Shape::kK, Shape::kN>, 
+    half_t, 
+    SmemLayoutB,
+    0,
+    IteratorThreadMapB
+  >;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level tensor op (half_t A/B operands, ElementC accumulator)
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
+      WarpShape, InstructionShape, half_t, SmemLayoutA, half_t, SmemLayoutB,
+      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy = MmaPolicy<
+    MmaTensorOp,
+    MatrixShape<0, 0>,
+    MatrixShape<0, 0>,
+    WarpCount::kK
+  >;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization:
+///
+///   A: column-major
+///   B: column-major
+///   Operator: tensor op class
+///
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Layout of accumulator
+    typename LayoutC_>
+struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, float,
+                      layout::ColumnMajor, float, layout::ColumnMajor, float,
+                      LayoutC_, arch::OpClassTensorOp, 2,
+                      arch::OpMultiplyAddFastF16> {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = float;
+  using LayoutA = layout::ColumnMajor;
+  using ElementB = float;
+  using LayoutB = layout::ColumnMajor;
+  using ElementC = float;
+  using LayoutC = LayoutC_;
+  using OperatorClass = arch::OpClassTensorOp;
+
+  /// Number of warps along the M, N, and K dimensions of the threadblock tile
+  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
+                              Shape::kN / WarpShape::kN, 
+                              Shape::kK / WarpShape::kK>;
+
+  // Divisibility requirements: the threadblock tile must be a multiple of the warp tile in M and N
+  static_assert(
+      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
+      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Total number of threads: warp count times threads per warp
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size of a threadblock-scoped access, in bits
+  static int const kAccessSizeInBits = 256;
+
+  /// Operator forwarded to the warp-level MMA, which is instantiated on half_t operands below
+  using Operator = arch::OpMultiplyAdd; 
+
+  // Warp thread arrangement for B only (uses sizeof_bits<ElementA>; both operands are float here)
+  static int const kWarpThreadArrangementContiguousB =
+      Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementA>::value);
+
+  static int const kWarpThreadArrangementStridedB =
+      kWarpSize / kWarpThreadArrangementContiguousB;
+
+  //
+  // Shared memory layouts (parameterized on half_t rather than the float ElementA/ElementB)
+  //
+
+  using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous<
+      sizeof_bits<half_t>::value, int(128 / sizeof(half_t))>;
+
+  // Shared memory layout of the B operand
+  using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise<
+      sizeof_bits<half_t>::value, Shape::kK>;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A (fixed 8 x 4 warp thread arrangement)
+  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kM, Shape::kK>, kThreads,
+      layout::PitchLinearShape<8, 4>,
+      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
+
+  /// Shared memory iterator to A operand (element type half_t)
+  using SmemIteratorA = transform::threadblock::RegularTileIterator<
+      MatrixShape<Shape::kM, Shape::kK>, half_t, SmemLayoutA, 1,
+      IteratorThreadMapA>;
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kK, Shape::kN>, kThreads,
+      layout::PitchLinearShape<kWarpThreadArrangementContiguousB,
+                               kWarpThreadArrangementStridedB>,
+      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
+
+  /// Shared memory iterator to B operand (element type half_t)
+  using SmemIteratorB = transform::threadblock::RegularTileIterator<
+      MatrixShape<Shape::kK, Shape::kN>, half_t, SmemLayoutB, 1,
+      IteratorThreadMapB>;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level tensor op (half_t A/B operands, ElementC accumulator)
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
+      WarpShape, InstructionShape, half_t, SmemLayoutA, half_t, SmemLayoutB,
+      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>, MatrixShape<0, 0>,
+                              WarpCount::kK>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization:
+///
+///   A: column-major-interleave
+///   B: row-major-interleave
+///   Operator: tensor op class
+///
+/// This uses the default warp-level operator given tile sizes
+///
+/// Column/RowMajorInterleaved<InterleavedK>(m, n) is mapped to Column/RowMajor(m
+/// x InterleavedK, n / InterleavedK) so that Column/RowMajor global iterators
+/// can be reused. The shared store iterator is the same as the crosswise shared
+/// store iterator. So, the only thing we need to do is to swap the coordinates
+/// (contiguous <=> strided) used by the global iterator and the shared store
+/// iterator.
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Operation performed by MMA
+    typename Operator_,
+    /// Store the accumulators in row major or column major.  Row major is used
+    /// when output layout is interleaved.
+    bool AccumulatorsInRowMajor,
+    /// Number of interleaved k
+    int InterleavedK>
+struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
+                      layout::ColumnMajorInterleaved<InterleavedK>, ElementB_,
+                      layout::RowMajorInterleaved<InterleavedK>, ElementC_,
+                      LayoutC_, arch::OpClassTensorOp, 2, Operator_,
+                      AccumulatorsInRowMajor> {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = ElementA_;
+  using LayoutA = layout::ColumnMajorInterleaved<InterleavedK>;
+  using ElementB = ElementB_;
+  using LayoutB = layout::RowMajorInterleaved<InterleavedK>;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using OperatorClass = arch::OpClassTensorOp;
+  static int const kInterleavedK = InterleavedK;
+
+  /// Number of warps along the M, N, and K dimensions of the threadblock tile
+  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
+                              Shape::kN / WarpShape::kN, 
+                              Shape::kK / WarpShape::kK>;
+
+  // Divisibility requirements: the threadblock tile must be a multiple of the warp tile in M and N
+  static_assert(
+      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
+      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Total number of threads: warp count times threads per warp
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size of a threadblock-scoped access, in bits
+  static int const kAccessSizeInBits = 128;
+
+  /// Operator forwarded unchanged to the warp-level MMA
+  using Operator = Operator_;
+
+  // Warp thread arrangement; kElementsPerAccess is derived from ElementA and shared by both thread maps
+  static int const kElementsPerAccess =
+      kAccessSizeInBits / sizeof_bits<ElementA>::value;
+
+  static int const kWarpThreadArrangementContiguous =
+      kInterleavedK / kElementsPerAccess;
+
+  static int const kWarpThreadArrangementStrided =
+      kWarpSize / kWarpThreadArrangementContiguous;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise<
+      sizeof_bits<ElementA>::value, kInterleavedK>;
+
+  // Shared memory layout of the B operand
+  using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise<
+      sizeof_bits<ElementB>::value, kInterleavedK>;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kM * kInterleavedK,
+                               Shape::kK / kInterleavedK>,
+      kThreads, layout::PitchLinearShape<32, 1>, kElementsPerAccess>;
+
+  /// Transpose the ThreadMap of iterator A
+  using SmemThreadMapA = transform::TransposePitchLinearThreadMap<
+      IteratorThreadMapA,
+      layout::PitchLinearShape<kWarpThreadArrangementContiguous,
+                               kWarpThreadArrangementStrided>>;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileIterator<
+      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 0,
+      SmemThreadMapA>;
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kN * kInterleavedK,
+                               Shape::kK / kInterleavedK>,
+      kThreads, layout::PitchLinearShape<32, 1>, kElementsPerAccess>;
+
+  /// Transpose the ThreadMap of iterator B
+  using SmemThreadMapB = transform::TransposePitchLinearThreadMap<
+      IteratorThreadMapB,
+      layout::PitchLinearShape<kWarpThreadArrangementContiguous,
+                               kWarpThreadArrangementStrided>>;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileIterator<
+      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 1,
+      SmemThreadMapB>;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level tensor op
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
+      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
+      ElementC, LayoutC, Operator, WarpCount::kK, AccumulatorsInRowMajor>::Type;
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
+                                       MatrixShape<0, 0>, WarpCount::kK>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_sm80.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_sm80.h
new file mode 100755
index 000000000..ae21ee8bc
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_sm80.h
@@ -0,0 +1,2951 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Defines basic properties needed by CTA-level GEMMs assuming
+   expectations about data layout of the global memory fragments, data types,
+   and internal tile sizes.
+
+      Partial specializations for threadblock::Mma operations targeting TensorOp
+   instructions.
+
+      SM80 multistage kernel expects stage number to be greater than or equal to 3
+   to use asynchronous copy.
+*/
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+
+#include "cutlass/layout/tensor_op_multiplicand_sm75.h"
+#include "cutlass/layout/tensor_op_multiplicand_sm80.h"
+
+#include "cutlass/gemm/warp/mma_simt_policy.h"
+#include "cutlass/gemm/warp/mma_simt.h"
+#include "cutlass/gemm/warp/default_mma_tensor_op.h"
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h"
+
+#include "cutlass/gemm/threadblock/default_mma_core.h"
+#include "cutlass/gemm/threadblock/default_multistage_mma_complex_core.h"
+#include "cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h"
+
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/transform/pitch_linear_thread_map.h"
+#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h"
+#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h"
+#include "cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h"
+#include "cutlass/gemm/threadblock/mma_multistage.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization of DefaultMmaCore for double-precision operands
+///
+///   A: column-major
+///   B: column-major
+///   Operator class: tensor op (arch::OpClassTensorOp)
+///
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Number of stages
+    int Stages,
+    /// Operation performed by MMA
+    typename Operator_,
+    /// Cache operation of operand A (not consulted: kCacheOpA is fixed to Always)
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation of operand B (not consulted: kCacheOpB is fixed to Always)
+    cutlass::arch::CacheOperation::Kind CacheOpB>
+struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, double,
+                      layout::ColumnMajor, double, layout::ColumnMajor, double,
+                      LayoutC_, arch::OpClassTensorOp, Stages, Operator_,
+                      false, CacheOpA, CacheOpB> {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = double;
+  using LayoutA = layout::ColumnMajor;
+  using ElementB = double;
+  using LayoutB = layout::ColumnMajor;
+  using ElementC = double;
+  using LayoutC = LayoutC_;
+  static int const kStages = Stages;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
+
+  /// Number of warps along M, N and K of the threadblock tile
+  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
+                              Shape::kN / WarpShape::kN, 
+                              Shape::kK / WarpShape::kK>; 
+
+  // Divisibility requirements (only the M and N extents are checked here)
+  static_assert(
+      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
+      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
+
+  static_assert(WarpCount::kCount > 1,
+    "This specialization requires at least two warps.");
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Total number of threads (warp count x threads per warp)
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size in bits of a threadblock-scoped access (one double per access)
+  static int const kAccessSizeInBits = 64;
+
+  /// Math operator tag forwarded to the warp-level MMA
+  using Operator = Operator_;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous64b;
+
+  using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicand64bCrosswise;
+
+  //
+  // Iterators to write to shared memory (operand A)
+  //
+
+  /// ThreadMap of iterator A (warp-striped over an M x K pitch-linear tile)
+  using IteratorThreadMapA = transform::PitchLinearWarpStripedThreadMap<
+      layout::PitchLinearShape<Shape::kM, Shape::kK>, kThreads,
+      layout::PitchLinearShape<16, 2>,
+      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 1,
+      IteratorThreadMapA>;
+
+  //
+  // Iterators to write to shared memory (operand B)
+  //
+
+  /// ThreadMap of iterator B (warp-raked over a K x N pitch-linear tile)
+  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kK, Shape::kN>, kThreads,
+      layout::PitchLinearShape<16, 2>,
+      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 0,
+      IteratorThreadMapB>;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level tensor op
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
+      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
+      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
+
+  /// Policy describing the warp-level operator for the threadblock-scoped MMA
+  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
+                                        MatrixShape<0, 0>, WarpCount::kK>;
+};
+
+/// Partial specialization of DefaultMmaCore for double-precision operands
+///
+///   A: column-major
+///   B: row-major
+///   Operator class: tensor op (arch::OpClassTensorOp)
+///
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Number of stages
+    int Stages,
+    /// Operation performed by MMA
+    typename Operator_,
+    /// Cache operation of operand A (not consulted: kCacheOpA is fixed to Always)
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation of operand B (not consulted: kCacheOpB is fixed to Always)
+    cutlass::arch::CacheOperation::Kind CacheOpB>
+struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, double,
+                      layout::ColumnMajor, double, layout::RowMajor, double,
+                      LayoutC_, arch::OpClassTensorOp, Stages, Operator_,
+                      false, CacheOpA, CacheOpB> {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = double;
+  using LayoutA = layout::ColumnMajor;
+  using ElementB = double;
+  using LayoutB = layout::RowMajor;
+  using ElementC = double;
+  using LayoutC = LayoutC_;
+  static int const kStages = Stages;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
+
+  /// Number of warps along M, N and K of the threadblock tile
+  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
+                              Shape::kN / WarpShape::kN, 
+                              Shape::kK / WarpShape::kK>; 
+
+  // Divisibility requirements (only the M and N extents are checked here)
+  static_assert(
+      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
+      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
+
+  static_assert(WarpCount::kCount > 1,
+    "This specialization requires at least two warps.");
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Total number of threads (warp count x threads per warp)
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size in bits of a threadblock-scoped access (one double per access)
+  static int const kAccessSizeInBits = 64;
+
+  /// Math operator tag forwarded to the warp-level MMA
+  using Operator = Operator_;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous64b;
+
+  // Shared memory layout of operand B
+  using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous64b;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A (warp-striped over an M x K pitch-linear tile)
+  using IteratorThreadMapA = transform::PitchLinearWarpStripedThreadMap<
+      layout::PitchLinearShape<Shape::kM, Shape::kK>, kThreads,
+      layout::PitchLinearShape<16, 2>,
+      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 1,
+      IteratorThreadMapA>;
+
+  /// ThreadMap of iterator B (warp-striped over an N x K pitch-linear tile)
+  using IteratorThreadMapB = transform::PitchLinearWarpStripedThreadMap<
+      layout::PitchLinearShape<Shape::kN, Shape::kK>, kThreads,
+      layout::PitchLinearShape<16, 2>,
+      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 0,
+      IteratorThreadMapB>;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level tensor op
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
+      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
+      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
+
+  /// Policy describing the warp-level operator for the threadblock-scoped MMA
+  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
+                                        MatrixShape<0, 0>, WarpCount::kK>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization of DefaultMmaCore for double-precision operands
+///
+///   A: row-major
+///   B: column-major
+///   Operator class: tensor op (arch::OpClassTensorOp)
+///
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Number of stages
+    int Stages,
+    /// Operation performed by MMA
+    typename Operator_,
+    /// Cache operation of operand A (not consulted: kCacheOpA is fixed to Always)
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation of operand B (not consulted: kCacheOpB is fixed to Always)
+    cutlass::arch::CacheOperation::Kind CacheOpB>
+struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, double,
+                      layout::RowMajor, double, layout::ColumnMajor, double,
+                      LayoutC_, arch::OpClassTensorOp, Stages, Operator_,
+                      false, CacheOpA, CacheOpB> {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = double;
+  using LayoutA = layout::RowMajor;
+  using ElementB = double;
+  using LayoutB = layout::ColumnMajor;
+  using ElementC = double;
+  using LayoutC = LayoutC_;
+  static int const kStages = Stages;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
+
+  /// Number of warps along M, N and K of the threadblock tile
+  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
+                              Shape::kN / WarpShape::kN, 
+                              Shape::kK / WarpShape::kK>;
+
+  // Divisibility requirements (only the M and N extents are checked here)
+  static_assert(
+      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
+      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Total number of threads (warp count x threads per warp)
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size in bits of a threadblock-scoped access (one double per access)
+  static int const kAccessSizeInBits = 64;
+
+  /// Math operator tag forwarded to the warp-level MMA
+  using Operator = Operator_;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::RowMajorTensorOpMultiplicand64bCrosswise;
+
+  using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicand64bCrosswise;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A (warp-raked over a K x M pitch-linear tile)
+  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kK, Shape::kM>, kThreads,
+      layout::PitchLinearShape<16, 2>,
+      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 1,
+      IteratorThreadMapA>;
+
+  /// ThreadMap of iterator B (warp-raked over a K x N pitch-linear tile)
+  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kK, Shape::kN>, kThreads,
+      layout::PitchLinearShape<16, 2>,
+      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 0,
+      IteratorThreadMapB>;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level tensor op
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
+      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
+      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
+
+  /// Policy describing the warp-level operator for the threadblock-scoped MMA
+  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
+                                        MatrixShape<0, 0>, WarpCount::kK>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+///
+/// Partial specialization of DefaultMmaCore for double-precision operands
+///
+///   A: row-major
+///   B: row-major
+///   Operator class: tensor op (arch::OpClassTensorOp)
+///
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Number of stages
+    int Stages,
+    /// Operation performed by MMA
+    typename Operator_,
+    /// Cache operation of operand A (not consulted: kCacheOpA is fixed to Always)
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation of operand B (not consulted: kCacheOpB is fixed to Always)
+    cutlass::arch::CacheOperation::Kind CacheOpB>
+struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, double,
+                      layout::RowMajor, double, layout::RowMajor, double,
+                      LayoutC_, arch::OpClassTensorOp, Stages, Operator_,
+                      false, CacheOpA, CacheOpB> {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = double;
+  using LayoutA = layout::RowMajor;
+  using ElementB = double;
+  using LayoutB = layout::RowMajor;
+  using ElementC = double;
+  using LayoutC = LayoutC_;
+  static int const kStages = Stages;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
+
+  /// Number of warps along M, N and K of the threadblock tile
+  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
+                              Shape::kN / WarpShape::kN, 
+                              Shape::kK / WarpShape::kK>;
+
+  // Divisibility requirements (only the M and N extents are checked here)
+  static_assert(
+      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
+      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
+
+  static_assert(WarpCount::kCount > 1,
+    "This specialization requires at least two warps.");
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Total number of threads (warp count x threads per warp)
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size in bits of a threadblock-scoped access (one double per access)
+  static int const kAccessSizeInBits = 64;
+
+  /// Math operator tag forwarded to the warp-level MMA
+  using Operator = Operator_;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::RowMajorTensorOpMultiplicand64bCrosswise;
+
+  using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous64b;
+
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A (warp-raked over a K x M pitch-linear tile)
+  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kK, Shape::kM>, kThreads,
+      layout::PitchLinearShape<16, 2>,
+      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 1,
+      IteratorThreadMapA>;
+
+  /// ThreadMap of iterator B (warp-striped over an N x K pitch-linear tile)
+  using IteratorThreadMapB = transform::PitchLinearWarpStripedThreadMap<
+      layout::PitchLinearShape<Shape::kN, Shape::kK>, kThreads,
+      layout::PitchLinearShape<16, 2>,
+      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 0,
+      IteratorThreadMapB>;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level tensor op
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
+      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
+      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
+
+  /// Policy describing the warp-level operator for the threadblock-scoped MMA
+  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
+                                        MatrixShape<0, 0>, WarpCount::kK>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization of DefaultMmaCore for double-precision operands
+///
+///   A: affine rank-2 column-major (layout::AffineRank2ColumnMajor)
+///   B: affine rank-2 column-major (layout::AffineRank2ColumnMajor)
+///   Operator class: tensor op (arch::OpClassTensorOp)
+///
+/// Layouts, thread maps, iterators and policy are taken from the ColumnMajor/ColumnMajor core (Base)
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Number of stages
+    int Stages,
+    /// Operation performed by MMA
+    typename Operator_,
+    /// Cache operation of operand A (not consulted: kCacheOpA is fixed to Always)
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation of operand B (not consulted: kCacheOpB is fixed to Always)
+    cutlass::arch::CacheOperation::Kind CacheOpB>
+struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, double,
+                      layout::AffineRank2ColumnMajor, double, layout::AffineRank2ColumnMajor, double,
+                      LayoutC_, arch::OpClassTensorOp, Stages, Operator_,
+                      false, CacheOpA, CacheOpB> {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = double;
+  using LayoutA = layout::AffineRank2ColumnMajor;
+  using ElementB = double;
+  using LayoutB = layout::AffineRank2ColumnMajor;
+  using ElementC = double;
+  using LayoutC = LayoutC_;
+  static int const kStages = Stages;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
+
+  /// Math operator tag forwarded to Base
+  using Operator = Operator_;
+
+  using Base = DefaultMmaCore<Shape,
+                              WarpShape,
+                              InstructionShape,
+                              ElementA,
+                              layout::ColumnMajor,
+                              ElementB,
+                              layout::ColumnMajor,
+                              ElementC,
+                              LayoutC,
+                              arch::OpClassTensorOp,
+                              kStages,
+                              Operator,
+                              false,
+                              kCacheOpA,
+                              kCacheOpB>;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = typename Base::SmemLayoutA;
+  using SmemLayoutB = typename Base::SmemLayoutB;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = typename Base::IteratorThreadMapA;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = typename Base::SmemIteratorA;
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = typename Base::IteratorThreadMapB;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = typename Base::SmemIteratorB;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  /// MMA policy, taken unchanged from Base
+  using MmaPolicy = typename Base::MmaPolicy;
+};
+
+/// Partial specialization of DefaultMmaCore for double-precision operands
+///
+///   A: affine rank-2 column-major (layout::AffineRank2ColumnMajor)
+///   B: affine rank-2 row-major (layout::AffineRank2RowMajor)
+///   Operator class: tensor op (arch::OpClassTensorOp)
+///
+/// Layouts, thread maps, iterators and policy are taken from the ColumnMajor/RowMajor core (Base)
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Number of stages
+    int Stages,
+    /// Operation performed by MMA
+    typename Operator_,
+    /// Cache operation of operand A (not consulted: kCacheOpA is fixed to Always)
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation of operand B (not consulted: kCacheOpB is fixed to Always)
+    cutlass::arch::CacheOperation::Kind CacheOpB>
+struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, double,
+                      layout::AffineRank2ColumnMajor, double, layout::AffineRank2RowMajor, double,
+                      LayoutC_, arch::OpClassTensorOp, Stages, Operator_,
+                      false, CacheOpA, CacheOpB> {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = double;
+  using LayoutA = layout::AffineRank2ColumnMajor;
+  using ElementB = double;
+  using LayoutB = layout::AffineRank2RowMajor;
+  using ElementC = double;
+  using LayoutC = LayoutC_;
+  static int const kStages = Stages;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
+
+  /// Math operator tag forwarded to Base
+  using Operator = Operator_;
+
+  using Base = DefaultMmaCore<Shape,
+                              WarpShape,
+                              InstructionShape,
+                              ElementA,
+                              layout::ColumnMajor,
+                              ElementB,
+                              layout::RowMajor,
+                              ElementC,
+                              LayoutC,
+                              arch::OpClassTensorOp,
+                              kStages,
+                              Operator,
+                              false,
+                              kCacheOpA,
+                              kCacheOpB>;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = typename Base::SmemLayoutA;
+  using SmemLayoutB = typename Base::SmemLayoutB;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = typename Base::IteratorThreadMapA;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = typename Base::SmemIteratorA;
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = typename Base::IteratorThreadMapB;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = typename Base::SmemIteratorB;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  /// MMA policy, taken unchanged from Base
+  using MmaPolicy = typename Base::MmaPolicy;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization of DefaultMmaCore for double-precision operands
+///
+///   A: affine rank-2 row-major (layout::AffineRank2RowMajor)
+///   B: affine rank-2 column-major (layout::AffineRank2ColumnMajor)
+///   Operator class: tensor op (arch::OpClassTensorOp)
+///
+/// Layouts, thread maps, iterators and policy are taken from the RowMajor/ColumnMajor core (Base)
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Number of stages
+    int Stages,
+    /// Operation performed by MMA
+    typename Operator_,
+    /// Cache operation of operand A (not consulted: kCacheOpA is fixed to Always)
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation of operand B (not consulted: kCacheOpB is fixed to Always)
+    cutlass::arch::CacheOperation::Kind CacheOpB>
+struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, double,
+                      layout::AffineRank2RowMajor, double, layout::AffineRank2ColumnMajor, double,
+                      LayoutC_, arch::OpClassTensorOp, Stages, Operator_,
+                      false, CacheOpA, CacheOpB> {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = double;
+  using LayoutA = layout::AffineRank2RowMajor;
+  using ElementB = double;
+  using LayoutB = layout::AffineRank2ColumnMajor;
+  using ElementC = double;
+  using LayoutC = LayoutC_;
+  static int const kStages = Stages;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
+
+  /// Math operator tag forwarded to Base
+  using Operator = Operator_;
+
+  using Base = DefaultMmaCore<Shape,
+                              WarpShape,
+                              InstructionShape,
+                              ElementA,
+                              layout::RowMajor,
+                              ElementB,
+                              layout::ColumnMajor,
+                              ElementC,
+                              LayoutC,
+                              arch::OpClassTensorOp,
+                              kStages,
+                              Operator,
+                              false,
+                              kCacheOpA,
+                              kCacheOpB>;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = typename Base::SmemLayoutA;
+  using SmemLayoutB = typename Base::SmemLayoutB;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = typename Base::IteratorThreadMapA;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = typename Base::SmemIteratorA;
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = typename Base::IteratorThreadMapB;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = typename Base::SmemIteratorB;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  /// MMA policy, taken unchanged from Base
+  using MmaPolicy = typename Base::MmaPolicy;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+///
+/// Partial specialization of DefaultMmaCore for double-precision operands
+///
+///   A: affine rank-2 row-major (layout::AffineRank2RowMajor)
+///   B: affine rank-2 row-major (layout::AffineRank2RowMajor)
+///   Operator class: tensor op (arch::OpClassTensorOp)
+///
+/// Layouts, thread maps, iterators and policy are taken from the RowMajor/RowMajor core (Base)
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Number of stages
+    int Stages,
+    /// Operation performed by MMA
+    typename Operator_,
+    /// Cache operation of operand A (not consulted: kCacheOpA is fixed to Always)
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation of operand B (not consulted: kCacheOpB is fixed to Always)
+    cutlass::arch::CacheOperation::Kind CacheOpB>
+struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, double,
+                      layout::AffineRank2RowMajor, double, layout::AffineRank2RowMajor, double,
+                      LayoutC_, arch::OpClassTensorOp, Stages, Operator_,
+                      false, CacheOpA, CacheOpB> {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = double;
+  using LayoutA = layout::AffineRank2RowMajor;
+  using ElementB = double;
+  using LayoutB = layout::AffineRank2RowMajor;
+  using ElementC = double;
+  using LayoutC = LayoutC_;
+  static int const kStages = Stages;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
+
+  /// Math operator tag forwarded to Base
+  using Operator = Operator_;
+
+  using Base = DefaultMmaCore<Shape,
+                              WarpShape,
+                              InstructionShape,
+                              ElementA,
+                              layout::RowMajor,
+                              ElementB,
+                              layout::RowMajor,
+                              ElementC,
+                              LayoutC,
+                              arch::OpClassTensorOp,
+                              kStages,
+                              Operator,
+                              false,
+                              kCacheOpA,
+                              kCacheOpB>;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = typename Base::SmemLayoutA;
+  using SmemLayoutB = typename Base::SmemLayoutB;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = typename Base::IteratorThreadMapA;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = typename Base::SmemIteratorA;
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = typename Base::IteratorThreadMapB;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = typename Base::SmemIteratorB;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  /// MMA policy, taken unchanged from Base
+  using MmaPolicy = typename Base::MmaPolicy;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization: single-precision complex operands
+///
+///   ElementA: complex<float>
+///   ElementB: complex<float>
+///   ElementC: complex<float>
+///   Operator: tensor op class, instruction shape fixed to 16x8x8
+///
+/// All derived types come from DefaultMultistageMmaComplexCore.
+template <
+    /// Threadblock-scoped matrix multiply tile shape (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Warp-level matrix multiply tile shape (concept: GemmShape)
+    typename WarpShape_,
+    /// A operand layout
+    typename LayoutA_,
+    /// B operand layout
+    typename LayoutB_,
+    /// Accumulator layout
+    typename LayoutC_,
+    /// Pipeline stage count
+    int Stages,
+    /// Operation tag for the MMA
+    typename Operator_,
+    /// Cache operation requested for operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation requested for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB,
+    /// Transformation applied to each element of A
+    ComplexTransform TransformA_,
+    /// Transformation applied to each element of B
+    ComplexTransform TransformB_>
+struct DefaultMmaCore<
+    Shape_, WarpShape_, GemmShape<16, 8, 8>,
+    complex<float>, LayoutA_,
+    complex<float>, LayoutB_,
+    complex<float>, LayoutC_,
+    arch::OpClassTensorOp, Stages, Operator_, false,
+    CacheOpA, CacheOpB,
+    TransformA_, TransformB_, true> {
+
+  // Tile shapes; the instruction shape is fixed by this specialization.
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = GemmShape<16, 8, 8>;
+
+  // Operand element types and layouts.
+  using ElementA = complex<float>;
+  using ElementB = complex<float>;
+  using ElementC = complex<float>;
+  using LayoutA = LayoutA_;
+  using LayoutB = LayoutB_;
+  using LayoutC = LayoutC_;
+
+  /// Default Operator
+  using Operator = Operator_;
+  static int const kStages = Stages;
+
+  // Both cache operations are pinned to Always; CacheOpA and CacheOpB are
+  // not consulted inside this specialization.
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA =
+      cutlass::arch::CacheOperation::Always;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB =
+      cutlass::arch::CacheOperation::Always;
+
+  // Per-element transformations of the two operands.
+  static const ComplexTransform TransformA = TransformA_;
+  static const ComplexTransform TransformB = TransformB_;
+
+  /// Number of warps along each of M, N and K
+  using WarpCount = GemmShape<
+      Shape::kM / WarpShape::kM, Shape::kN / WarpShape::kN,
+      Shape::kK / WarpShape::kK>;
+
+  // Divisibility requirements (M and N extents)
+  static_assert(
+      (Shape::kM % WarpShape::kM) == 0 && (Shape::kN % WarpShape::kN) == 0,
+      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
+
+  static_assert(
+      WarpCount::kCount > 1,
+      "This specialization requires at least two warps.");
+
+  /// Threads in one warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Total number of threads
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size of a threadblock-scoped access
+  static int const kAccessSizeInBits = 128;
+
+  // Only complex multiply-add operator tags are accepted.
+  static_assert(
+      platform::is_same<Operator, arch::OpMultiplyAddComplex>::value ||
+      platform::is_same<Operator, arch::OpMultiplyAddGaussianComplex>::value ||
+      platform::is_same<Operator, arch::OpMultiplyAddComplexFastF32>::value,
+      "The operator tag must indicate complex multiplication.");
+
+  //
+  // Underlying template
+  //
+
+  using MmaComplexCore = DefaultMultistageMmaComplexCore<
+      Shape, WarpShape, InstructionShape,
+      ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
+      arch::OpClassTensorOp, kStages, TransformA, TransformB,
+      Operator, kCacheOpA, kCacheOpB>;
+
+  //
+  // Shared memory layouts
+  //
+
+  /// Shared memory layout for operand A
+  using SmemLayoutA = typename MmaComplexCore::SmemLayoutA;
+
+  /// Shared memory layout for operand B
+  using SmemLayoutB = typename MmaComplexCore::SmemLayoutB;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// Thread map of iterator A
+  using IteratorThreadMapA = typename MmaComplexCore::IteratorThreadMapA;
+
+  /// Shared memory iterator for the A operand
+  using SmemIteratorA = typename MmaComplexCore::SmemIteratorA;
+
+  /// Thread map of iterator B
+  using IteratorThreadMapB = typename MmaComplexCore::IteratorThreadMapB;
+
+  /// Shared memory iterator for the B operand
+  using SmemIteratorB = typename MmaComplexCore::SmemIteratorB;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  /// Warp-level tensor op
+  using MmaTensorOp = typename MmaComplexCore::MmaTensorOp;
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy = typename MmaComplexCore::MmaPolicy;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization: double-precision complex operands
+///
+///   ElementA: complex<double>
+///   ElementB: complex<double>
+///   ElementC: complex<double>
+///   Operator: tensor op class
+///
+/// All derived types come from DefaultMultistageMmaComplexCore.
+template <
+    /// Threadblock-scoped matrix multiply tile shape (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Warp-level matrix multiply tile shape (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of a single matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// A operand layout
+    typename LayoutA_,
+    /// B operand layout
+    typename LayoutB_,
+    /// Accumulator layout
+    typename LayoutC_,
+    /// Pipeline stage count
+    int Stages,
+    /// Operation tag for the MMA
+    typename Operator_,
+    /// Cache operation requested for operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation requested for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB,
+    /// Transformation applied to each element of A
+    ComplexTransform TransformA_,
+    /// Transformation applied to each element of B
+    ComplexTransform TransformB_>
+struct DefaultMmaCore<
+    Shape_, WarpShape_, InstructionShape_,
+    complex<double>, LayoutA_,
+    complex<double>, LayoutB_,
+    complex<double>, LayoutC_,
+    arch::OpClassTensorOp, Stages, Operator_, false,
+    CacheOpA, CacheOpB,
+    TransformA_, TransformB_, true> {
+
+  // Tile shapes, taken straight from the template parameters.
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+
+  // Operand element types and layouts.
+  using ElementA = complex<double>;
+  using ElementB = complex<double>;
+  using ElementC = complex<double>;
+  using LayoutA = LayoutA_;
+  using LayoutB = LayoutB_;
+  using LayoutC = LayoutC_;
+
+  /// Default Operator
+  using Operator = Operator_;
+  static int const kStages = Stages;
+
+  // Both cache operations are pinned to Always; CacheOpA and CacheOpB are
+  // not consulted inside this specialization.
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA =
+      cutlass::arch::CacheOperation::Always;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB =
+      cutlass::arch::CacheOperation::Always;
+
+  // Per-element transformations of the two operands.
+  static const ComplexTransform TransformA = TransformA_;
+  static const ComplexTransform TransformB = TransformB_;
+
+  /// Number of warps along each of M, N and K
+  using WarpCount = GemmShape<
+      Shape::kM / WarpShape::kM, Shape::kN / WarpShape::kN,
+      Shape::kK / WarpShape::kK>;
+
+  // Divisibility requirements (M and N extents)
+  static_assert(
+      (Shape::kM % WarpShape::kM) == 0 && (Shape::kN % WarpShape::kN) == 0,
+      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
+
+  static_assert(
+      WarpCount::kCount > 1,
+      "This specialization requires at least two warps.");
+
+  /// Threads in one warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Total number of threads
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size of a threadblock-scoped access
+  static int const kAccessSizeInBits = 64;
+
+  // Only complex multiply-add operator tags are accepted.
+  static_assert(
+      platform::is_same<Operator, arch::OpMultiplyAddComplex>::value ||
+      platform::is_same<Operator, arch::OpMultiplyAddGaussianComplex>::value,
+      "The operator tag must indicate complex multiplication.");
+
+  //
+  // Underlying template
+  //
+
+  using MmaComplexCore = DefaultMultistageMmaComplexCore<
+      Shape, WarpShape, InstructionShape,
+      ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
+      arch::OpClassTensorOp, kStages, TransformA, TransformB,
+      Operator, kCacheOpA, kCacheOpB>;
+
+  //
+  // Shared memory layouts
+  //
+
+  /// Shared memory layout for operand A
+  using SmemLayoutA = typename MmaComplexCore::SmemLayoutA;
+
+  /// Shared memory layout for operand B
+  using SmemLayoutB = typename MmaComplexCore::SmemLayoutB;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// Thread map of iterator A
+  using IteratorThreadMapA = typename MmaComplexCore::IteratorThreadMapA;
+
+  /// Shared memory iterator for the A operand
+  using SmemIteratorA = typename MmaComplexCore::SmemIteratorA;
+
+  /// Thread map of iterator B
+  using IteratorThreadMapB = typename MmaComplexCore::IteratorThreadMapB;
+
+  /// Shared memory iterator for the B operand
+  using SmemIteratorB = typename MmaComplexCore::SmemIteratorB;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  /// Warp-level tensor op
+  using MmaTensorOp = typename MmaComplexCore::MmaTensorOp;
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy = typename MmaComplexCore::MmaPolicy;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization:
+///
+///   A: column-major
+///   B: row-major
+///   Operator: tensor op class
+///
+/// The warp-level operator is the default one for the given tile sizes.
+template <
+    /// Threadblock-scoped matrix multiply tile shape (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Warp-level matrix multiply tile shape (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of a single matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Element type of the A operand
+    typename ElementA_,
+    /// Element type of the B operand
+    typename ElementB_,
+    /// Element type of the accumulator
+    typename ElementC_,
+    /// Accumulator layout
+    typename LayoutC_,
+    /// Pipeline stage count
+    int Stages,
+    /// Operation tag for the MMA
+    typename Operator_,
+    /// Cache operation for operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB>
+struct DefaultMmaCore<
+    Shape_, WarpShape_, InstructionShape_, ElementA_, layout::ColumnMajor,
+    ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp,
+    Stages, Operator_, false, CacheOpA, CacheOpB> {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = ElementA_;
+  using ElementB = ElementB_;
+  using ElementC = ElementC_;
+  using LayoutA = layout::ColumnMajor;
+  using LayoutB = layout::RowMajor;
+  using LayoutC = LayoutC_;
+  static int const kStages = Stages;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
+
+  /// Number of warps along each of M, N and K
+  using WarpCount = GemmShape<
+      Shape::kM / WarpShape::kM, Shape::kN / WarpShape::kN,
+      Shape::kK / WarpShape::kK>;
+
+  // Divisibility requirements (M and N extents)
+  static_assert(
+      (Shape::kM % WarpShape::kM) == 0 && (Shape::kN % WarpShape::kN) == 0,
+      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
+
+  /// Threads in one warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Total number of threads
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size of a threadblock-scoped access
+  static int const kAccessSizeInBits = 128;
+
+  /// Default Operator
+  using Operator = Operator_;
+
+  // Warp thread arrangement: contiguous extents derive from kM (A) and kN (B),
+  static int const kWarpThreadArrangementContiguousA = platform::min(
+      Shape::kM / (kAccessSizeInBits / sizeof_bits<ElementA>::value), 8);
+  static int const kWarpThreadArrangementContiguousB = platform::min(
+      Shape::kN / (kAccessSizeInBits / sizeof_bits<ElementB>::value), 8);
+
+  // each capped at 8. Strided extents: warp size over the contiguous extent.
+  static int const kWarpThreadArrangementStridedA =
+      kWarpSize / kWarpThreadArrangementContiguousA;
+
+  static int const kWarpThreadArrangementStridedB =
+      kWarpSize / kWarpThreadArrangementContiguousB;
+
+  //
+  // Shared memory layouts
+  //
+  static int const Crosswise_A =
+      platform::min(static_cast<int>(128 / sizeof(ElementA)), Shape::kM);
+  using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous<
+      sizeof_bits<ElementA>::value, Crosswise_A>;
+
+  // Shared memory layout for operand B
+  static int const Crosswise_B =
+      platform::min(static_cast<int>(128 / sizeof(ElementB)), Shape::kN);
+  using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous<
+      sizeof_bits<ElementB>::value, Crosswise_B>;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// Thread map of iterator A (pitch-linear shape kM x kK)
+  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kM, Shape::kK>, kThreads,
+      layout::PitchLinearShape<kWarpThreadArrangementContiguousA,
+                               kWarpThreadArrangementStridedA>,
+      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
+
+  /// Shared memory iterator for the A operand
+  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 1,
+      IteratorThreadMapA>;
+
+  /// Thread map of iterator B (pitch-linear shape kN x kK)
+  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kN, Shape::kK>, kThreads,
+      layout::PitchLinearShape<kWarpThreadArrangementContiguousB,
+                               kWarpThreadArrangementStridedB>,
+      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
+
+  /// Shared memory iterator for the B operand
+  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 0,
+      IteratorThreadMapB>;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  /// Warp-level tensor op
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
+      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
+      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy = MmaPolicy<
+      MmaTensorOp, MatrixShape<0, 0>, MatrixShape<0, 0>, WarpCount::kK>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization:
+///
+///   A: row-major
+///   B: column-major
+///   Operator: tensor op class
+///
+/// The warp-level operator is the default one for the given tile sizes.
+template <
+    /// Threadblock-scoped matrix multiply tile shape (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Warp-level matrix multiply tile shape (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of a single matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Element type of the A operand
+    typename ElementA_,
+    /// Element type of the B operand
+    typename ElementB_,
+    /// Element type of the accumulator
+    typename ElementC_,
+    /// Accumulator layout
+    typename LayoutC_,
+    /// Pipeline stage count
+    int Stages,
+    /// Operation tag for the MMA
+    typename Operator_,
+    /// Cache operation for operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB>
+struct DefaultMmaCore<
+    Shape_, WarpShape_, InstructionShape_, ElementA_, layout::RowMajor,
+    ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp,
+    Stages, Operator_, false, CacheOpA, CacheOpB> {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = ElementA_;
+  using ElementB = ElementB_;
+  using ElementC = ElementC_;
+  using LayoutA = layout::RowMajor;
+  using LayoutB = layout::ColumnMajor;
+  using LayoutC = LayoutC_;
+  static int const kStages = Stages;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
+
+  /// Number of warps along each of M, N and K
+  using WarpCount = GemmShape<
+      Shape::kM / WarpShape::kM, Shape::kN / WarpShape::kN,
+      Shape::kK / WarpShape::kK>;
+
+  // Divisibility requirements (M and N extents)
+  static_assert(
+      (Shape::kM % WarpShape::kM) == 0 && (Shape::kN % WarpShape::kN) == 0,
+      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
+
+  /// Threads in one warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Total number of threads
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size of a threadblock-scoped access
+  static int const kAccessSizeInBits = 128;
+
+  /// Default Operator
+  using Operator = Operator_;
+
+  // Warp thread arrangement: both contiguous extents derive from Shape::kK
+  static int const kWarpThreadArrangementContiguousA =
+      Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementA>::value);
+  static int const kWarpThreadArrangementContiguousB =
+      Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementB>::value);
+
+  // Strided extents: warp size divided by the contiguous extent
+  static int const kWarpThreadArrangementStridedA =
+      kWarpSize / kWarpThreadArrangementContiguousA;
+
+  static int const kWarpThreadArrangementStridedB =
+      kWarpSize / kWarpThreadArrangementContiguousB;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise<
+      sizeof_bits<ElementA>::value, Shape::kK>;
+
+  // Shared memory layout for operand B
+  using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise<
+      sizeof_bits<ElementB>::value, Shape::kK>;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// Thread map of iterator A (pitch-linear shape kK x kM)
+  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kK, Shape::kM>, kThreads,
+      layout::PitchLinearShape<kWarpThreadArrangementContiguousA,
+                               kWarpThreadArrangementStridedA>,
+      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
+
+  /// Shared memory iterator for the A operand
+  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 0,
+      IteratorThreadMapA>;
+
+  /// Thread map of iterator B (pitch-linear shape kK x kN)
+  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kK, Shape::kN>, kThreads,
+      layout::PitchLinearShape<kWarpThreadArrangementContiguousB,
+                               kWarpThreadArrangementStridedB>,
+      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
+
+  /// Shared memory iterator for the B operand
+  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 1,
+      IteratorThreadMapB>;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  /// Warp-level tensor op
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
+      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
+      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy = MmaPolicy<
+      MmaTensorOp, MatrixShape<0, 0>, MatrixShape<0, 0>, WarpCount::kK>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization:
+///
+///   A: column-major
+///   B: column-major
+///   Operator: tensor op class
+///
+/// The warp-level operator is the default one for the given tile sizes.
+template <
+    /// Threadblock-scoped matrix multiply tile shape (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Warp-level matrix multiply tile shape (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of a single matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Element type of the A operand
+    typename ElementA_,
+    /// Element type of the B operand
+    typename ElementB_,
+    /// Element type of the accumulator
+    typename ElementC_,
+    /// Accumulator layout
+    typename LayoutC_,
+    /// Pipeline stage count
+    int Stages,
+    /// Operation tag for the MMA
+    typename Operator_,
+    /// Cache operation for operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB>
+struct DefaultMmaCore<
+    Shape_, WarpShape_, InstructionShape_, ElementA_, layout::ColumnMajor,
+    ElementB_, layout::ColumnMajor, ElementC_, LayoutC_, arch::OpClassTensorOp,
+    Stages, Operator_, false, CacheOpA, CacheOpB> {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+
+  // Operand element types and layouts
+  using ElementA = ElementA_;
+  using ElementB = ElementB_;
+  using ElementC = ElementC_;
+  using LayoutA = layout::ColumnMajor;
+  using LayoutB = layout::ColumnMajor;
+  using LayoutC = LayoutC_;
+  static int const kStages = Stages;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
+
+  /// Number of warps along each of M, N and K
+  using WarpCount = GemmShape<
+      Shape::kM / WarpShape::kM, Shape::kN / WarpShape::kN,
+      Shape::kK / WarpShape::kK>;
+
+  // Divisibility requirements (M and N extents)
+  static_assert(
+      (Shape::kM % WarpShape::kM) == 0 && (Shape::kN % WarpShape::kN) == 0,
+      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
+
+  /// Threads in one warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Total number of threads
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size of a threadblock-scoped access
+  static int const kAccessSizeInBits = 128;
+
+  /// Default Operator
+  using Operator = Operator_;
+
+  // Warp thread arrangement. A derives from kM (capped at 8). B derives from
+  // kK and uses ElementB, matching IteratorThreadMapB's elements per access.
+  static int const kWarpThreadArrangementContiguousA = platform::min(
+      Shape::kM / (kAccessSizeInBits / sizeof_bits<ElementA>::value), 8);
+  static int const kWarpThreadArrangementContiguousB =
+      Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementB>::value);
+
+  // Strided extents: warp size divided by the contiguous extent
+  static int const kWarpThreadArrangementStridedA =
+      kWarpSize / kWarpThreadArrangementContiguousA;
+  static int const kWarpThreadArrangementStridedB =
+      kWarpSize / kWarpThreadArrangementContiguousB;
+
+  //
+  // Shared memory layouts
+  //
+  static int const Crosswise_A =
+      platform::min(static_cast<int>(128 / sizeof(ElementA)), Shape::kM);
+  using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous<
+      sizeof_bits<ElementA>::value, Crosswise_A>;
+
+  // Shared memory layout for operand B
+  using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise<
+      sizeof_bits<ElementB>::value, Shape::kK>;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// Thread map of iterator A (pitch-linear shape kM x kK)
+  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kM, Shape::kK>, kThreads,
+      layout::PitchLinearShape<kWarpThreadArrangementContiguousA,
+                               kWarpThreadArrangementStridedA>,
+      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
+
+  /// Shared memory iterator for the A operand
+  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 1,
+      IteratorThreadMapA>;
+
+  /// Thread map of iterator B (pitch-linear shape kK x kN)
+  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kK, Shape::kN>, kThreads,
+      layout::PitchLinearShape<kWarpThreadArrangementContiguousB,
+                               kWarpThreadArrangementStridedB>,
+      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
+
+  /// Shared memory iterator for the B operand
+  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 1,
+      IteratorThreadMapB>;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  /// Warp-level tensor op
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
+      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
+      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy = MmaPolicy<
+      MmaTensorOp, MatrixShape<0, 0>, MatrixShape<0, 0>, WarpCount::kK>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization:
+///
+///   A: row-major
+///   B: row-major
+///   Operator: tensor op class
+///
+/// The warp-level operator is the default one for the given tile sizes.
+template <
+    /// Threadblock-scoped matrix multiply tile shape (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Warp-level matrix multiply tile shape (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of a single matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Element type of the A operand
+    typename ElementA_,
+    /// Element type of the B operand
+    typename ElementB_,
+    /// Element type of the accumulator
+    typename ElementC_,
+    /// Accumulator layout
+    typename LayoutC_,
+    /// Pipeline stage count
+    int Stages,
+    /// Operation tag for the MMA
+    typename Operator_,
+    /// Cache operation for operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB>
+struct DefaultMmaCore<
+    Shape_, WarpShape_, InstructionShape_, ElementA_, layout::RowMajor,
+    ElementB_, layout::RowMajor, ElementC_, LayoutC_, arch::OpClassTensorOp,
+    Stages, Operator_, false, CacheOpA, CacheOpB> {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = ElementA_;
+  using ElementB = ElementB_;
+  using ElementC = ElementC_;
+  using LayoutA = layout::RowMajor;
+  using LayoutB = layout::RowMajor;
+  using LayoutC = LayoutC_;
+  static int const kStages = Stages;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
+
+  /// Number of warps along each of M, N and K
+  using WarpCount = GemmShape<
+      Shape::kM / WarpShape::kM, Shape::kN / WarpShape::kN,
+      Shape::kK / WarpShape::kK>;
+
+  // Divisibility requirements (M and N extents)
+  static_assert(
+      (Shape::kM % WarpShape::kM) == 0 && (Shape::kN % WarpShape::kN) == 0,
+      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
+
+  /// Threads in one warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Total number of threads
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size of a threadblock-scoped access
+  static int const kAccessSizeInBits = 128;
+
+  /// Default Operator
+  using Operator = Operator_;
+
+  // Warp thread arrangement: A derives from kK, B from kN (capped at 8)
+  static int const kWarpThreadArrangementContiguousA =
+      Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementA>::value);
+  static int const kWarpThreadArrangementContiguousB = platform::min(
+      Shape::kN / (kAccessSizeInBits / sizeof_bits<ElementB>::value), 8);
+
+  // Strided extents: warp size divided by the contiguous extent
+  static int const kWarpThreadArrangementStridedA =
+      kWarpSize / kWarpThreadArrangementContiguousA;
+
+  static int const kWarpThreadArrangementStridedB =
+      kWarpSize / kWarpThreadArrangementContiguousB;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise<
+      sizeof_bits<ElementA>::value, Shape::kK>;
+
+  // Shared memory layout for operand B
+  static int const Crosswise_B =
+      platform::min(static_cast<int>(128 / sizeof(ElementB)), Shape::kN);
+  using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous<
+      sizeof_bits<ElementB>::value, Crosswise_B>;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// Thread map of iterator A (pitch-linear shape kK x kM)
+  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kK, Shape::kM>, kThreads,
+      layout::PitchLinearShape<kWarpThreadArrangementContiguousA,
+                               kWarpThreadArrangementStridedA>,
+      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
+
+  /// Shared memory iterator for the A operand
+  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 0,
+      IteratorThreadMapA>;
+
+  /// Thread map of iterator B (pitch-linear shape kN x kK)
+  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kN, Shape::kK>, kThreads,
+      layout::PitchLinearShape<kWarpThreadArrangementContiguousB,
+                               kWarpThreadArrangementStridedB>,
+      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
+
+  /// Shared memory iterator for the B operand
+  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 0,
+      IteratorThreadMapB>;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  /// Warp-level tensor op
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
+      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
+      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy = MmaPolicy<
+      MmaTensorOp, MatrixShape<0, 0>, MatrixShape<0, 0>, WarpCount::kK>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization:
+///
+///   A: column-major-interleaved
+///   B: row-major-interleaved
+///   Operator: tensor op class
+///
+/// This uses the default warp-level operator given tile sizes
+///
+/// Column/RowMajorInterleaved<InterleavedK>(m, n) is mapped to Column/RowMajor(m
+/// x InterleavedK, n / InterleavedK) so that Column/RowMajor global iterators
+/// can be reused. The shared store iterator is the same as the crosswise shared
+/// store iterator. So, the only thing we need to do is to swap the coordinates
+/// (contiguous <=> strided) used by the global iterator and the shared store
+/// iterator.
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Number of stages
+    int Stages,
+    /// Operation performed by MMA
+    typename Operator_,
+    /// Store the accumulators in row major or column major.  Row major is used
+    /// when output layout is interleaved.
+    bool AccumulatorsInRowMajor,
+    /// Cache operation of operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation of operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB,
+    /// Number of interleaved K
+    int InterleavedK>
+struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
+                      layout::ColumnMajorInterleaved<InterleavedK>, ElementB_,
+                      layout::RowMajorInterleaved<InterleavedK>, ElementC_,
+                      LayoutC_, arch::OpClassTensorOp, Stages, Operator_,
+                      AccumulatorsInRowMajor, CacheOpA, CacheOpB> {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = ElementA_;
+  using LayoutA = layout::ColumnMajorInterleaved<InterleavedK>;
+  using ElementB = ElementB_;
+  using LayoutB = layout::RowMajorInterleaved<InterleavedK>;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  static int const kStages = Stages;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
+  static int const kInterleavedK = InterleavedK;
+
+  /// Number of warps present (threadblock tile divided by warp tile, per dimension)
+  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
+                              Shape::kN / WarpShape::kN, 
+                              Shape::kK / WarpShape::kK>; 
+
+  // Divisibility requirements (only the M and N extents are checked here)
+  static_assert(
+      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
+      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Number of threads total
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size of a threadblock-scoped access, in bits
+  static int const kAccessSizeInBits = 128;
+
+  /// Default Operator
+  using Operator = Operator_;
+
+  // Warp thread arrangement (kElementsPerAccess is derived from ElementA only)
+  static int const kElementsPerAccess =
+      kAccessSizeInBits / sizeof_bits<ElementA>::value;
+
+  static int const kWarpThreadArrangementContiguous =
+      kInterleavedK / kElementsPerAccess;
+
+  static int const kWarpThreadArrangementStrided =
+      kWarpSize / kWarpThreadArrangementContiguous;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise<
+      sizeof_bits<ElementA>::value, kInterleavedK>;
+
+  // Shared memory layout of B
+  using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise<
+      sizeof_bits<ElementB>::value, kInterleavedK>;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A (tile is kM * kInterleavedK by kK / kInterleavedK)
+  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kM * kInterleavedK,
+                               Shape::kK / kInterleavedK>,
+      kThreads, layout::PitchLinearShape<32, 1>, kElementsPerAccess>;
+
+  /// Transpose the ThreadMap of iterator A
+  using SmemThreadMapA = transform::TransposePitchLinearThreadMap<
+      IteratorThreadMapA,
+      layout::PitchLinearShape<kWarpThreadArrangementContiguous,
+                               kWarpThreadArrangementStrided>>;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 0,
+      SmemThreadMapA>;
+
+  /// ThreadMap of iterator B (tile is kN * kInterleavedK by kK / kInterleavedK)
+  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kN * kInterleavedK,
+                               Shape::kK / kInterleavedK>,
+      kThreads, layout::PitchLinearShape<32, 1>, kElementsPerAccess>;
+
+  /// Transpose the ThreadMap of iterator B
+  using SmemThreadMapB = transform::TransposePitchLinearThreadMap<
+      IteratorThreadMapB,
+      layout::PitchLinearShape<kWarpThreadArrangementContiguous,
+                               kWarpThreadArrangementStrided>>;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 1,
+      SmemThreadMapB>;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level tensor op (AccumulatorsInRowMajor is forwarded to it)
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
+      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
+      ElementC, LayoutC, Operator, WarpCount::kK, AccumulatorsInRowMajor>::Type;
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
+                                        MatrixShape<0, 0>, WarpCount::kK>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for SIMT GEMMs using multistage pipeline.
+///
+///   A: column-major, B: column-major, Operator: simt class
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Number of stages
+    int Stages,
+    /// Operation performed by Simt
+    typename Operator_,
+    /// Cache operation of operand A (ignored: kCacheOpA below is fixed to Always)
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation of operand B (ignored: kCacheOpB below is fixed to Always)
+    cutlass::arch::CacheOperation::Kind CacheOpB>
+struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
+                      layout::ColumnMajor, ElementB_, layout::ColumnMajor,
+                      ElementC_, LayoutC_, arch::OpClassSimt, Stages, Operator_,
+                      false, CacheOpA, CacheOpB> {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = ElementA_;
+  using LayoutA = layout::ColumnMajor;
+  using ElementB = ElementB_;
+  using LayoutB = layout::ColumnMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  static int const kStages = Stages;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
+
+  /// Number of warps present (threadblock tile divided by warp tile, per dimension)
+  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
+                              Shape::kN / WarpShape::kN, 
+                              Shape::kK / WarpShape::kK>;
+
+  // Divisibility requirements (only the M and N extents are checked here)
+  static_assert(
+      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
+      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassSimt>::value;
+
+  /// Number of threads total
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Default Operator
+  using Operator = Operator_;
+
+  // Elements per access handed to both thread maps below
+  static int const kElementsPerAccess = 1;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::ColumnMajor;
+
+  // Shared memory layout of B (row-major, while LayoutB above is column-major)
+  using SmemLayoutB = layout::RowMajor;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kM, Shape::kK>,
+    kThreads,
+    kElementsPerAccess
+  >;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 0,
+      IteratorThreadMapA>;
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kK, Shape::kN>,
+    kThreads,
+    kElementsPerAccess
+  >;
+
+  /// Transpose the ThreadMap of iterator B
+  using SmemThreadMapB = transform::TransposePitchLinearThreadMapSimt<IteratorThreadMapB>;
+
+  /// Shared memory iterator to B operand (uses the transposed thread map)
+  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 1,
+      SmemThreadMapB>;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level op; the warp's threads are arranged 4 (M) x 8 (N)
+  static const int WarpNumThreadsM = 4;
+  static const int WarpNumThreadsN = 8;
+  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
+      "WarpShape must be divisible by ThreadTile shape.");
+  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
+  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
+  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
+  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;  // elements of A per 128 bits
+  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;  // elements of B per 128 bits
+  static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM);
+  static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN);
+
+  static_assert(!((Shape::kK / 32) % LaneN),
+                "Padding must be divisible by Lane");  // padding is Shape::kK / 32, see MmaPolicy
+
+  // these should have max of thread tile also
+  using LaneMmaShape = cutlass::gemm::GemmShape<
+      LaneM,
+      LaneN,
+      1>;
+  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
+      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
+      cutlass::layout::RowMajorInterleaved<LaneLayout>,         // LaneLayout
+      LaneMmaShape
+  >;
+
+  using MmaWarpSimt = cutlass::gemm::warp::MmaSimt<
+    WarpShape, /// Size of the warp-level Gemm problem (concept: gemm::GemmShape)
+    ElementA,  /// Data type of A elements
+    SmemLayoutA,   /// Layout of A matrix (concept: MatrixLayout)
+    ElementB,  /// Data type of B elements
+    SmemLayoutB,   /// Layout of B matrix (concept: MatrixLayout)
+    ElementC,  /// Element type of C matrix
+    LayoutC,   /// Layout of C matrix (concept: MatrixLayout)
+    Policy     /// SIMT warp-level policy defined above (MmaSimtPolicy)
+    >;         /// Used for partial specialization
+
+  /// Policy used to define MmaPipelined (second shape carries the Shape::kK / 32 padding)
+  using MmaPolicy = MmaPolicy<
+    MmaWarpSimt,
+    MatrixShape<0, 0>,
+    MatrixShape<0, Shape::kK / 32>,
+    WarpCount::kK>;
+};
+
+/// Partial specialization for SIMT GEMMs using multistage pipeline.
+///
+///   A: column-major, B: row-major, Operator: simt class
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Number of stages
+    int Stages,
+    /// Operation performed by Simt
+    typename Operator_,
+    /// Cache operation of operand A (ignored: kCacheOpA below is fixed to Always)
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation of operand B (ignored: kCacheOpB below is fixed to Always)
+    cutlass::arch::CacheOperation::Kind CacheOpB>
+struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
+                      layout::ColumnMajor, ElementB_, layout::RowMajor,
+                      ElementC_, LayoutC_, arch::OpClassSimt, Stages, Operator_,
+                      false, CacheOpA, CacheOpB> {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = ElementA_;
+  using LayoutA = layout::ColumnMajor;
+  using ElementB = ElementB_;
+  using LayoutB = layout::RowMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  static int const kStages = Stages;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
+
+  /// Number of warps present (threadblock tile divided by warp tile, per dimension)
+  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
+                              Shape::kN / WarpShape::kN, 
+                              Shape::kK / WarpShape::kK>;
+
+  // Divisibility requirements (only the M and N extents are checked here)
+  static_assert(
+      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
+      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassSimt>::value;
+
+  /// Number of threads total
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Default Operator
+  using Operator = Operator_;
+
+  // Elements per access handed to both thread maps below
+  static int const kElementsPerAccess = 1;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::ColumnMajor;
+
+  // Shared memory layout of B
+  using SmemLayoutB = layout::RowMajor;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kM, Shape::kK>,
+    kThreads,
+    kElementsPerAccess
+  >;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 0,
+      IteratorThreadMapA>;
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kN, Shape::kK>,
+    kThreads,
+    kElementsPerAccess
+  >;
+
+  /// Shared memory iterator to B operand (thread map is used as-is, not transposed)
+  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 1,
+      IteratorThreadMapB>;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level op; the warp's threads are arranged 4 (M) x 8 (N)
+  static const int WarpNumThreadsM = 4;
+  static const int WarpNumThreadsN = 8;
+  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
+      "WarpShape must be divisible by ThreadTile shape.");
+  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
+  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
+  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
+  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;  // elements of A per 128 bits
+  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;  // elements of B per 128 bits
+  static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM);
+  static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN);
+  // these should have max of thread tile also
+  using LaneMmaShape = cutlass::gemm::GemmShape<
+      LaneM,
+      LaneN,
+      1>;
+  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
+      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
+      cutlass::layout::RowMajorInterleaved<LaneLayout>,         // LaneLayout
+      LaneMmaShape
+  >;
+
+  using MmaWarpSimt = cutlass::gemm::warp::MmaSimt<
+    WarpShape, /// Size of the warp-level Gemm problem (concept: gemm::GemmShape)
+    ElementA,  /// Data type of A elements
+    SmemLayoutA,   /// Layout of A matrix (concept: MatrixLayout)
+    ElementB,  /// Data type of B elements
+    SmemLayoutB,   /// Layout of B matrix (concept: MatrixLayout)
+    ElementC,  /// Element type of C matrix
+    LayoutC,   /// Layout of C matrix (concept: MatrixLayout)
+    Policy     /// SIMT warp-level policy defined above (MmaSimtPolicy)
+    >;         /// Used for partial specialization
+
+  /// Policy used to define MmaPipelined (both MatrixShape arguments are 0 x 0 here)
+  using MmaPolicy = MmaPolicy<
+    MmaWarpSimt,
+    MatrixShape<0, 0>,
+    MatrixShape<0, 0>,
+    WarpCount::kK>;
+};
+
+/// Partial specialization for SIMT GEMMs using multistage pipeline.
+///
+///   A: row-major, B: column-major, Operator: simt class
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Number of stages
+    int Stages,
+    /// Operation performed by Simt
+    typename Operator_,
+    /// Cache operation of operand A (ignored: kCacheOpA below is fixed to Always)
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation of operand B (ignored: kCacheOpB below is fixed to Always)
+    cutlass::arch::CacheOperation::Kind CacheOpB>
+struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
+                      layout::RowMajor, ElementB_, layout::ColumnMajor,
+                      ElementC_, LayoutC_, arch::OpClassSimt, Stages, Operator_,
+                      false, CacheOpA, CacheOpB> {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = ElementA_;
+  using LayoutA = layout::RowMajor;
+  using ElementB = ElementB_;
+  using LayoutB = layout::ColumnMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  static int const kStages = Stages;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
+
+  /// Number of warps present (threadblock tile divided by warp tile, per dimension)
+  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
+                              Shape::kN / WarpShape::kN, 
+                              Shape::kK / WarpShape::kK>;
+
+  // Divisibility requirements (only the M and N extents are checked here)
+  static_assert(
+      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
+      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassSimt>::value;
+
+  /// Number of threads total
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Default Operator
+  using Operator = Operator_;
+
+  // Elements per access handed to both thread maps below
+  static int const kElementsPerAccess = 1;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::ColumnMajor;
+
+  // Shared memory layout of B (row-major, while LayoutB above is column-major)
+  using SmemLayoutB = layout::RowMajor;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kK, Shape::kM>,
+    kThreads,
+    kElementsPerAccess
+  >;
+
+  /// Transpose the ThreadMap of iterator A
+  using SmemThreadMapA = transform::TransposePitchLinearThreadMapSimt<IteratorThreadMapA>;
+
+  /// Shared memory iterator to A operand (uses the transposed thread map)
+  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 0,
+      SmemThreadMapA>;
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kK, Shape::kN>,
+    kThreads,
+    kElementsPerAccess
+  >;
+
+  /// Transpose the ThreadMap of iterator B
+  using SmemThreadMapB = transform::TransposePitchLinearThreadMapSimt<IteratorThreadMapB>;
+
+  /// Shared memory iterator to B operand (uses the transposed thread map)
+  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 1,
+      SmemThreadMapB>;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level op; the warp's threads are arranged 4 (M) x 8 (N)
+  static const int WarpNumThreadsM = 4;
+  static const int WarpNumThreadsN = 8;
+  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
+      "WarpShape must be divisible by ThreadTile shape.");
+  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
+  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
+  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
+  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;  // elements of A per 128 bits
+  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;  // elements of B per 128 bits
+  static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM);
+  static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN);
+
+  static_assert(!((Shape::kK / 32) % LaneM) && !((Shape::kK / 32) % LaneN),
+                "Padding must be divisible by Lane");  // padding is Shape::kK / 32, see MmaPolicy
+
+  // these should have max of thread tile also
+  using LaneMmaShape = cutlass::gemm::GemmShape<
+      LaneM,
+      LaneN,
+      1>;
+  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
+      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
+      cutlass::layout::RowMajorInterleaved<LaneLayout>,         // LaneLayout
+      LaneMmaShape
+  >;
+
+  using MmaWarpSimt = cutlass::gemm::warp::MmaSimt<
+    WarpShape, /// Size of the warp-level Gemm problem (concept: gemm::GemmShape)
+    ElementA,  /// Data type of A elements
+    SmemLayoutA,   /// Layout of A matrix (concept: MatrixLayout)
+    ElementB,  /// Data type of B elements
+    SmemLayoutB,   /// Layout of B matrix (concept: MatrixLayout)
+    ElementC,  /// Element type of C matrix
+    LayoutC,   /// Layout of C matrix (concept: MatrixLayout)
+    Policy     /// SIMT warp-level policy defined above (MmaSimtPolicy)
+    >;         /// Used for partial specialization
+
+  /// Policy used to define MmaPipelined (both shapes carry the Shape::kK / 32 padding)
+  using MmaPolicy = MmaPolicy<
+    MmaWarpSimt,
+    MatrixShape<Shape::kK / 32, 0>,
+    MatrixShape<0, Shape::kK / 32>,
+    WarpCount::kK>;
+};
+
+/// Partial specialization for SIMT GEMMs using multistage pipeline.
+///
+///   A: row-major, B: row-major, Operator: simt class
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Number of stages
+    int Stages,
+    /// Operation performed by Simt
+    typename Operator_,
+    /// Cache operation of operand A (ignored: kCacheOpA below is fixed to Always)
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation of operand B (ignored: kCacheOpB below is fixed to Always)
+    cutlass::arch::CacheOperation::Kind CacheOpB>
+struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
+                      layout::RowMajor, ElementB_, layout::RowMajor, ElementC_,
+                      LayoutC_, arch::OpClassSimt, Stages, Operator_,
+                      false, CacheOpA, CacheOpB> {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = ElementA_;
+  using LayoutA = layout::RowMajor;
+  using ElementB = ElementB_;
+  using LayoutB = layout::RowMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  static int const kStages = Stages;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
+
+  /// Number of warps present (threadblock tile divided by warp tile, per dimension)
+  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
+                              Shape::kN / WarpShape::kN, 
+                              Shape::kK / WarpShape::kK>;
+
+  // Divisibility requirements (only the M and N extents are checked here)
+  static_assert(
+      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
+      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassSimt>::value;
+
+  /// Number of threads total
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Default Operator
+  using Operator = Operator_;
+
+  // Elements per access handed to both thread maps below
+  static int const kElementsPerAccess = 1;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::ColumnMajor;
+
+  // Shared memory layout of B
+  using SmemLayoutB = layout::RowMajor;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kK, Shape::kM>,
+    kThreads,
+    kElementsPerAccess
+  >;
+
+  /// Transpose the ThreadMap of iterator A
+  using SmemThreadMapA = transform::TransposePitchLinearThreadMapSimt<IteratorThreadMapA>;
+
+  /// Shared memory iterator to A operand (uses the transposed thread map)
+  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 0,
+      SmemThreadMapA>;
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kN, Shape::kK>,
+    kThreads,
+    kElementsPerAccess
+  >;
+
+  /// Shared memory iterator to B operand (thread map is used as-is, not transposed)
+  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 1,
+      IteratorThreadMapB>;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level op; the warp's threads are arranged 4 (M) x 8 (N)
+  static const int WarpNumThreadsM = 4;
+  static const int WarpNumThreadsN = 8;
+  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
+      "WarpShape must be divisible by ThreadTile shape.");
+  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
+  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
+  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
+  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;  // elements of A per 128 bits
+  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;  // elements of B per 128 bits
+  static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM);
+  static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN);
+
+  static_assert(!((Shape::kK / 32) % LaneM),
+                "Padding must be divisible by Lane");  // padding is Shape::kK / 32, see MmaPolicy
+
+  // these should have max of thread tile also
+  using LaneMmaShape = cutlass::gemm::GemmShape<
+      LaneM,
+      LaneN,
+      1>;
+  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
+      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
+      cutlass::layout::RowMajorInterleaved<LaneLayout>,         // LaneLayout
+      LaneMmaShape
+  >;
+
+  using MmaWarpSimt = cutlass::gemm::warp::MmaSimt<
+    WarpShape, /// Size of the warp-level Gemm problem (concept: gemm::GemmShape)
+    ElementA,  /// Data type of A elements
+    SmemLayoutA,   /// Layout of A matrix (concept: MatrixLayout)
+    ElementB,  /// Data type of B elements
+    SmemLayoutB,   /// Layout of B matrix (concept: MatrixLayout)
+    ElementC,  /// Element type of C matrix
+    LayoutC,   /// Layout of C matrix (concept: MatrixLayout)
+    Policy     /// SIMT warp-level policy defined above (MmaSimtPolicy)
+    >;         /// Used for partial specialization
+
+  /// Policy used to define MmaPipelined (first shape carries the Shape::kK / 32 padding)
+  using MmaPolicy = MmaPolicy<
+    MmaWarpSimt,
+    MatrixShape<Shape::kK / 32, 0>,
+    MatrixShape<0, 0>,
+    WarpCount::kK>;
+};
+
+/// Partial specialization for SIMT GEMMs using multistage pipeline.
+///
+///   A: affine rank-2 column-major, B: affine rank-2 row-major, Operator: simt class
+/// Forwards every exported type to the ColumnMajor / RowMajor SIMT specialization (Base)
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Number of stages
+    int Stages,
+    /// Operation performed by Simt
+    typename Operator_,
+    /// Cache operation of operand A (ignored: kCacheOpA below is fixed to Always)
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation of operand B (ignored: kCacheOpB below is fixed to Always)
+    cutlass::arch::CacheOperation::Kind CacheOpB>
+struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
+                      layout::AffineRank2ColumnMajor, ElementB_, layout::AffineRank2RowMajor,
+                      ElementC_, LayoutC_, arch::OpClassSimt, Stages, Operator_,
+                      false, CacheOpA, CacheOpB> {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = ElementA_;
+  using LayoutA = layout::AffineRank2ColumnMajor;
+  using ElementB = ElementB_;
+  using LayoutB = layout::AffineRank2RowMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  static int const kStages = Stages;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
+
+  /// Default Operator
+  using Operator = Operator_;
+
+  /// Same configuration with plain ColumnMajor (A) and RowMajor (B) layouts
+  using Base = DefaultMmaCore<Shape,
+                              WarpShape,
+                              InstructionShape,
+                              ElementA,
+                              layout::ColumnMajor,
+                              ElementB,
+                              layout::RowMajor,
+                              ElementC,
+                              LayoutC,
+                              arch::OpClassSimt,
+                              kStages,
+                              Operator,
+                              false,
+                              kCacheOpA,
+                              kCacheOpB>;
+
+  //
+  // Shared memory layouts (taken from Base)
+  //
+
+  using SmemLayoutA = typename Base::SmemLayoutA;
+  using SmemLayoutB = typename Base::SmemLayoutB;
+
+  //
+  // Iterators to write to shared memory (taken from Base)
+  //
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = typename Base::IteratorThreadMapA;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = typename Base::SmemIteratorA;
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = typename Base::IteratorThreadMapB;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = typename Base::SmemIteratorB;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  /// Policy used to define MmaPipelined (taken from Base)
+  using MmaPolicy = typename Base::MmaPolicy;
+};
+
+/// Partial specialization for SIMT GEMMs using multistage pipeline with
+/// A: layout::AffineRank2RowMajor and B: layout::AffineRank2ColumnMajor.
+/// Every member type below is forwarded from `Base`, the RowMajor/ColumnMajor core.
+/// kCacheOpA/kCacheOpB are fixed to Always; CacheOpA/CacheOpB are not consulted here.
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Number of stages
+    int Stages,
+    /// Operation performed by Simt
+    typename Operator_,
+    /// Cache operation of operand A (matched by the specialization, unused in the body)
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation of operand B (matched by the specialization, unused in the body)
+    cutlass::arch::CacheOperation::Kind CacheOpB>
+struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
+                      layout::AffineRank2RowMajor, ElementB_, layout::AffineRank2ColumnMajor,
+                      ElementC_, LayoutC_, arch::OpClassSimt, Stages, Operator_,
+                      false, CacheOpA, CacheOpB> {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = ElementA_;
+  using LayoutA = layout::AffineRank2RowMajor;
+  using ElementB = ElementB_;
+  using LayoutB = layout::AffineRank2ColumnMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  static int const kStages = Stages;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
+
+  /// Math operator tag, passed through to Base unchanged
+  using Operator = Operator_;
+
+  using Base = DefaultMmaCore<Shape,
+                              WarpShape,
+                              InstructionShape,
+                              ElementA,
+                              layout::RowMajor,
+                              ElementB,
+                              layout::ColumnMajor,
+                              ElementC,
+                              LayoutC,
+                              arch::OpClassSimt,
+                              kStages,
+                              Operator,
+                              false,
+                              kCacheOpA,
+                              kCacheOpB>;
+
+  //
+  // Shared memory layouts (forwarded from Base)
+  //
+
+  using SmemLayoutA = typename Base::SmemLayoutA;
+  using SmemLayoutB = typename Base::SmemLayoutB;
+
+  //
+  // Iterators to write to shared memory (forwarded from Base)
+  //
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = typename Base::IteratorThreadMapA;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = typename Base::SmemIteratorA;
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = typename Base::IteratorThreadMapB;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = typename Base::SmemIteratorB;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  /// MMA policy, taken from Base
+  using MmaPolicy = typename Base::MmaPolicy;
+};
+
+/// Partial specialization for SIMT GEMMs using multistage pipeline with
+/// A: layout::AffineRank2ColumnMajor and B: layout::AffineRank2ColumnMajor.
+/// Every member type below is forwarded from `Base`, the ColumnMajor/ColumnMajor core.
+/// kCacheOpA/kCacheOpB are fixed to Always; CacheOpA/CacheOpB are not consulted here.
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Number of stages
+    int Stages,
+    /// Operation performed by Simt
+    typename Operator_,
+    /// Cache operation of operand A (matched by the specialization, unused in the body)
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation of operand B (matched by the specialization, unused in the body)
+    cutlass::arch::CacheOperation::Kind CacheOpB>
+struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
+                      layout::AffineRank2ColumnMajor, ElementB_, layout::AffineRank2ColumnMajor,
+                      ElementC_, LayoutC_, arch::OpClassSimt, Stages, Operator_,
+                      false, CacheOpA, CacheOpB> {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = ElementA_;
+  using LayoutA = layout::AffineRank2ColumnMajor;
+  using ElementB = ElementB_;
+  using LayoutB = layout::AffineRank2ColumnMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  static int const kStages = Stages;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
+
+  /// Math operator tag, passed through to Base unchanged
+  using Operator = Operator_;
+
+  using Base = DefaultMmaCore<Shape,
+                              WarpShape,
+                              InstructionShape,
+                              ElementA,
+                              layout::ColumnMajor,
+                              ElementB,
+                              layout::ColumnMajor,
+                              ElementC,
+                              LayoutC,
+                              arch::OpClassSimt,
+                              kStages,
+                              Operator,
+                              false,
+                              kCacheOpA,
+                              kCacheOpB>;
+
+  //
+  // Shared memory layouts (forwarded from Base)
+  //
+
+  using SmemLayoutA = typename Base::SmemLayoutA;
+  using SmemLayoutB = typename Base::SmemLayoutB;
+
+  //
+  // Iterators to write to shared memory (forwarded from Base)
+  //
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = typename Base::IteratorThreadMapA;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = typename Base::SmemIteratorA;
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = typename Base::IteratorThreadMapB;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = typename Base::SmemIteratorB;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  /// MMA policy, taken from Base
+  using MmaPolicy = typename Base::MmaPolicy;
+
+};
+
+/// Partial specialization for SIMT GEMMs using multistage pipeline with
+/// A: layout::AffineRank2RowMajor and B: layout::AffineRank2RowMajor.
+/// Every member type below is forwarded from `Base`, the RowMajor/RowMajor core.
+/// kCacheOpA/kCacheOpB are fixed to Always; CacheOpA/CacheOpB are not consulted here.
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Number of stages
+    int Stages,
+    /// Operation performed by Simt
+    typename Operator_,
+    /// Cache operation of operand A (matched by the specialization, unused in the body)
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation of operand B (matched by the specialization, unused in the body)
+    cutlass::arch::CacheOperation::Kind CacheOpB>
+struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
+                      layout::AffineRank2RowMajor, ElementB_, layout::AffineRank2RowMajor, ElementC_,
+                      LayoutC_, arch::OpClassSimt, Stages, Operator_,
+                      false, CacheOpA, CacheOpB> {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = ElementA_;
+  using LayoutA = layout::AffineRank2RowMajor;
+  using ElementB = ElementB_;
+  using LayoutB = layout::AffineRank2RowMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  static int const kStages = Stages;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
+
+  /// Math operator tag, passed through to Base unchanged
+  using Operator = Operator_;
+
+  using Base = DefaultMmaCore<Shape,
+                              WarpShape,
+                              InstructionShape,
+                              ElementA,
+                              layout::RowMajor,
+                              ElementB,
+                              layout::RowMajor,
+                              ElementC,
+                              LayoutC,
+                              arch::OpClassSimt,
+                              kStages,
+                              Operator,
+                              false,
+                              kCacheOpA,
+                              kCacheOpB>;
+
+  //
+  // Shared memory layouts (forwarded from Base)
+  //
+
+  using SmemLayoutA = typename Base::SmemLayoutA;
+  using SmemLayoutB = typename Base::SmemLayoutB;
+
+  //
+  // Iterators to write to shared memory (forwarded from Base)
+  //
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = typename Base::IteratorThreadMapA;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = typename Base::SmemIteratorA;
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = typename Base::IteratorThreadMapB;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = typename Base::SmemIteratorB;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  /// MMA policy, taken from Base
+  using MmaPolicy = typename Base::MmaPolicy;
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace gemm
+}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_sparse_sm80.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_sparse_sm80.h
new file mode 100755
index 000000000..985693ce6
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_sparse_sm80.h
@@ -0,0 +1,876 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Defines basic properties needed by CTA-level GEMMs assuming
+   expectations about data layout of the global memory fragments, data types,
+   and internal tile sizes.
+
+      Partial specializations for threadblock::Mma operations targeting sparse
+   TensorOp instructions.
+*/
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+
+#include "cutlass/layout/tensor_op_multiplicand_sm75.h"
+#include "cutlass/layout/tensor_op_multiplicand_sm80.h"
+
+#include "cutlass/gemm/warp/mma_simt_policy.h"
+#include "cutlass/gemm/warp/mma_simt.h"
+#include "cutlass/gemm/warp/default_mma_sparse_tensor_op.h"
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h"
+
+#include "cutlass/gemm/threadblock/default_mma_core.h"
+
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/transform/pitch_linear_thread_map.h"
+#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h"
+#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h"
+#include "cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h"
+#include "cutlass/gemm/threadblock/mma_sparse_multistage.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Primary template (declaration only) for the default sparse matrix multiply core, inferred from
+/// threadblock tile size, global memory data layout, and target math instruction.
+template <
+    /// Shape of threadblock-scoped matrix multiply operator
+    typename Shape,
+    /// Shape of warp-level matrix multiply operator
+    typename WarpShape,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape,
+    /// Element data type of A operand
+    typename ElementA,
+    /// Layout of operand A
+    typename LayoutA,
+    /// Element data type of B operand
+    typename ElementB,
+    /// Layout of operand B
+    typename LayoutB,
+    /// Data type of accumulator
+    typename ElementC,
+    /// Layout of accumulator
+    typename LayoutC,
+    /// Indicates type of math operator (arch::OpClassSimt or arch::OpClassTensorOp)
+    typename OperatorClass,
+    /// Number of stages
+    int Stages,
+    /// Operation performed by MMA (default: saturating for TensorOp with 8-/4-bit integer A, else plain)
+    typename Operator = typename platform::conditional<
+        (platform::is_same<OperatorClass,
+                           cutlass::arch::OpClassTensorOp>::value) &&
+            (platform::is_same<ElementA, int8_t>::value ||
+             platform::is_same<ElementA, int4b_t>::value ||
+             platform::is_same<ElementA, uint8_t>::value ||
+             platform::is_same<ElementA, uint4b_t>::value),
+        cutlass::arch::OpMultiplyAddSaturate,
+        cutlass::arch::OpMultiplyAdd>::type,
+    /// Store the accumulators in row major or column major.  Row major is used
+    /// when output layout is interleaved.
+    bool AccumulatorsInRowMajor = false
+    /// Cache operation of operand A (defaults to CacheOperation::Global)
+    , cutlass::arch::CacheOperation::Kind CacheOpA =
+        cutlass::arch::CacheOperation::Global,
+    /// Cache operation of operand B (defaults to CacheOperation::Global)
+    cutlass::arch::CacheOperation::Kind CacheOpB =
+        cutlass::arch::CacheOperation::Global
+>
+struct DefaultSparseMmaCore;
+
+////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization:
+///
+///   A: column-major
+///   B: row-major
+///   Operator: tensor op class
+///
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Number of stages
+    int Stages,
+    /// Operation performed by MMA
+    typename Operator_,
+    /// Cache operation of operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation of operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB>
+struct DefaultSparseMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
+                      layout::ColumnMajor, ElementB_, layout::RowMajor,
+                      ElementC_, LayoutC_, arch::OpClassTensorOp, Stages,
+                      Operator_, false, CacheOpA, CacheOpB> {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = ElementA_;
+  using LayoutA = layout::ColumnMajor;
+  using ElementB = ElementB_;
+  using LayoutB = layout::RowMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  static int const kStages = Stages;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
+
+  static int const kSparse = 2;  // A and E tiles below use Shape::kK / kSparse as their K extent
+
+  /// Number of warps present
+  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
+                              Shape::kN / WarpShape::kN, 
+                              Shape::kK / WarpShape::kK>;
+
+  // Divisibility requirements (only the M and N extents are checked here)
+  static_assert(
+      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
+      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+ 
+  /// Number of threads total
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size of a threadblock-scoped access
+  static int const kAccessSizeInBits = 128;
+
+  /// Default Operator
+  using Operator = Operator_;
+
+  // Warp thread arrangement: contiguous count is capped at 8, strided count fills the rest of the warp
+  static int const kWarpThreadArrangementContiguousA =
+      platform::min(Shape::kM / (kAccessSizeInBits / sizeof_bits<ElementA>::value), 8);
+
+  static int const kWarpThreadArrangementStridedA =
+      kWarpSize / kWarpThreadArrangementContiguousA;
+
+  static int const kWarpThreadArrangementContiguousB =
+      platform::min(Shape::kN / (kAccessSizeInBits / sizeof_bits<ElementB>::value), 8);
+
+  static int const kWarpThreadArrangementStridedB =
+      kWarpSize / kWarpThreadArrangementContiguousB;
+
+  //
+  // Shared memory layouts (crosswise extent is capped by the tile's M / N extent)
+  //
+  static int const Crosswise_A = platform::min(int(128 / sizeof(ElementA)),
+                                               Shape::kM);
+
+  using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous<
+      sizeof_bits<ElementA>::value, Crosswise_A>;
+
+  // Shared memory layout
+  static int const Crosswise_B = platform::min(int(128 / sizeof(ElementB)),
+                                               Shape::kN);
+
+  using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous<
+      sizeof_bits<ElementB>::value, Crosswise_B>;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A (tile is Shape::kM x Shape::kK / kSparse)
+  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kM, Shape::kK / kSparse>, kThreads,
+      layout::PitchLinearShape<kWarpThreadArrangementContiguousA,
+                               kWarpThreadArrangementStridedA>,
+      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kM, Shape::kK / kSparse>, ElementA, SmemLayoutA, 1,
+      IteratorThreadMapA>;
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kN, Shape::kK>, kThreads,
+      layout::PitchLinearShape<kWarpThreadArrangementContiguousB,
+                               kWarpThreadArrangementStridedB>,
+      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 0,
+      IteratorThreadMapB>;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level tensor op
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultSparseMmaTensorOp<
+      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
+      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
+
+  /// Cache operation of operand E
+  static cutlass::arch::CacheOperation::Kind const kCacheOpE =
+      cutlass::arch::CacheOperation::Global;
+
+  static int const kInterleavedE = MmaTensorOp::kInterleaved;
+  static int const kMetaSizeInBits = MmaTensorOp::kMetaSizeInBits;
+  static int const kMaxID2 = MmaTensorOp::kMaxID2;
+  static int const kElementsPerElementE = MmaTensorOp::kElementsPerElementE;
+
+  using ElementE = typename MmaTensorOp::ElementE;
+  using GmemLayoutE = cutlass::layout::ColumnMajorInterleaved<kInterleavedE>;
+
+  // Shared memory layout.  Interleaved layout is mapped to PitchLinear layout.
+  using SmemLayoutE = typename MmaTensorOp::LayoutE;
+
+  /// Number of ElementE values per access (used by the E thread map below)
+  static int const kElementsPerAccessE =
+      kAccessSizeInBits / sizeof_bits<ElementE>::value;
+
+  /// E is tiny.  Not all warps are needed: this is min(kThreads, accesses needed for the E tile).
+  static int const kThreadsE =
+      (Shape::kM * Shape::kK / kSparse / kElementsPerElementE /
+           (kAccessSizeInBits / sizeof_bits<ElementE>::value) >
+       kThreads)
+          ? kThreads
+          : (Shape::kM * Shape::kK / kSparse / kElementsPerElementE /
+             (kAccessSizeInBits / sizeof_bits<ElementE>::value));
+
+  using IteratorThreadMapE = transform::PitchLinearStripminedThreadMap<
+      layout::PitchLinearShape<Shape::kM * kInterleavedE,
+                               Shape::kK / kSparse / kElementsPerElementE /
+                                   kInterleavedE>,
+      kThreadsE, kElementsPerAccessE>;
+
+  /// Shared memory iterator to E operand
+  using SmemIteratorE = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kM * kInterleavedE,
+                  Shape::kK / kSparse / kElementsPerElementE / kInterleavedE>,
+      ElementE, SmemLayoutE, 0, IteratorThreadMapE>;
+
+  /// Policy bundling MmaTensorOp with WarpCount::kK (all three MatrixShape arguments are <0, 0>)
+  using MmaPolicy =
+      SparseMmaPolicy<MmaTensorOp, MatrixShape<0, 0>, MatrixShape<0, 0>,
+                      MatrixShape<0, 0>, WarpCount::kK>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization:
+///
+///   A: row-major
+///   B: column-major
+///   Operator: tensor op class
+///
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Number of stages
+    int Stages,
+    /// Operation performed by MMA
+    typename Operator_,
+    /// Cache operation of operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation of operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB>
+struct DefaultSparseMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
+                      layout::RowMajor, ElementB_, layout::ColumnMajor,
+                      ElementC_, LayoutC_, arch::OpClassTensorOp, Stages,
+                      Operator_, false, CacheOpA, CacheOpB> {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = ElementA_;
+  using LayoutA = layout::RowMajor;
+  using ElementB = ElementB_;
+  using LayoutB = layout::ColumnMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  static int const kStages = Stages;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
+
+  static int const kSparse = 2;  // A and E tiles below use Shape::kK / kSparse as their K extent
+
+  /// Number of warps present
+  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
+                              Shape::kN / WarpShape::kN, 
+                              Shape::kK / WarpShape::kK>;
+
+  // Divisibility requirements (only the M and N extents are checked here)
+  static_assert(
+      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
+      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Number of threads total
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size of a threadblock-scoped access
+  static int const kAccessSizeInBits = 128;
+
+  /// Default Operator
+  using Operator = Operator_;
+
+  // Warp thread arrangement: A's contiguous count is the number of accesses across Shape::kK / kSparse
+  static int const kWarpThreadArrangementContiguousA =
+      Shape::kK / kSparse / (kAccessSizeInBits / sizeof_bits<ElementA>::value);
+
+  static int const kWarpThreadArrangementStridedA =
+      kWarpSize / kWarpThreadArrangementContiguousA;
+
+  // crosswise cannot be larger than 1024 bit, so kCrosswiseB is Shape::kK clamped to that many elements.
+  static int const kCrosswiseB =
+      (Shape::kK > (1024 / sizeof_bits<ElementB>::value))
+          ? (1024 / sizeof_bits<ElementB>::value)
+          : Shape::kK;
+
+  static int const kWarpThreadArrangementContiguousB =
+      kCrosswiseB / (kAccessSizeInBits / sizeof_bits<ElementB>::value);
+
+  static int const kWarpThreadArrangementStridedB =
+      kWarpSize / kWarpThreadArrangementContiguousB;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise<
+      sizeof_bits<ElementA>::value, Shape::kK / kSparse>;
+
+  // Shared memory layout
+  using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise<
+      sizeof_bits<ElementB>::value, kCrosswiseB>;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A (tile is Shape::kK / kSparse x Shape::kM in pitch-linear terms)
+  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kK / kSparse, Shape::kM>, kThreads,
+      layout::PitchLinearShape<kWarpThreadArrangementContiguousA,
+                               kWarpThreadArrangementStridedA>,
+      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kM, Shape::kK / kSparse>, ElementA, SmemLayoutA, 0,
+      IteratorThreadMapA>;
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kK, Shape::kN>, kThreads,
+      layout::PitchLinearShape<kWarpThreadArrangementContiguousB,
+                               kWarpThreadArrangementStridedB>,
+      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 1,
+      IteratorThreadMapB>;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level tensor op
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultSparseMmaTensorOp<
+      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
+      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
+
+  /// Cache operation of operand E
+  static cutlass::arch::CacheOperation::Kind const kCacheOpE =
+      cutlass::arch::CacheOperation::Global;
+
+  static int const kInterleavedE = MmaTensorOp::kInterleaved;
+  static int const kMetaSizeInBits = MmaTensorOp::kMetaSizeInBits;
+  static int const kMaxID2 = MmaTensorOp::kMaxID2;
+  static int const kElementsPerElementE = MmaTensorOp::kElementsPerElementE;
+
+  using ElementE = typename MmaTensorOp::ElementE;
+  using GmemLayoutE = cutlass::layout::ColumnMajorInterleaved<kInterleavedE>;
+
+  // Shared memory layout.  Interleaved layout is mapped to PitchLinear layout.
+  using SmemLayoutE = typename MmaTensorOp::LayoutE;
+
+  /// Number of ElementE values per access (used by the E thread map below)
+  static int const kElementsPerAccessE =
+      kAccessSizeInBits / sizeof_bits<ElementE>::value;
+
+  /// E is tiny.  Not all warps are needed: this is min(kThreads, accesses needed for the E tile).
+  static int const kThreadsE =
+      (Shape::kM * Shape::kK / kSparse / kElementsPerElementE /
+           (kAccessSizeInBits / sizeof_bits<ElementE>::value) >
+       kThreads)
+          ? kThreads
+          : (Shape::kM * Shape::kK / kSparse / kElementsPerElementE /
+             (kAccessSizeInBits / sizeof_bits<ElementE>::value));
+
+  using IteratorThreadMapE = transform::PitchLinearStripminedThreadMap<
+      layout::PitchLinearShape<Shape::kM * kInterleavedE,
+                               Shape::kK / kSparse / kElementsPerElementE /
+                                   kInterleavedE>,
+      kThreadsE, kElementsPerAccessE>;
+
+
+  /// Shared memory iterator to E operand
+  using SmemIteratorE = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kM * kInterleavedE,
+                  Shape::kK / kSparse / kElementsPerElementE / kInterleavedE>,
+      ElementE, SmemLayoutE, 0, IteratorThreadMapE>;
+
+  /// Policy bundling MmaTensorOp with WarpCount::kK (all three MatrixShape arguments are <0, 0>)
+  using MmaPolicy =
+      SparseMmaPolicy<MmaTensorOp, MatrixShape<0, 0>, MatrixShape<0, 0>,
+                      MatrixShape<0, 0>, WarpCount::kK>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization of DefaultSparseMmaCore:
+///
+///   A: column-major
+///   B: column-major
+///   Operator: tensor op class
+///
+/// Uses the warp-level operator selected by DefaultSparseMmaTensorOp for the given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Number of stages
+    int Stages,
+    /// Operation performed by MMA
+    typename Operator_,
+    /// Cache operation of operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation of operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB>
+struct DefaultSparseMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
+                      layout::ColumnMajor, ElementB_, layout::ColumnMajor,
+                      ElementC_, LayoutC_, arch::OpClassTensorOp, Stages,
+                      Operator_, false, CacheOpA, CacheOpB> {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = ElementA_;
+
+  using LayoutA = layout::ColumnMajor;
+  using ElementB = ElementB_;
+  using LayoutB = layout::ColumnMajor;
+
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  static int const kStages = Stages;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
+
+  static int const kSparse = 2;  // divisor applied to Shape::kK for operand A's tiles (and, with kElementsPerElementE, for E's)
+
+  /// Number of warps along M, N, and K
+  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
+                              Shape::kN / WarpShape::kN, 
+                              Shape::kK / WarpShape::kK>;
+
+  // Divisibility requirements: the threadblock tile must be a whole multiple of the warp tile in M and N
+  static_assert(
+      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
+      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Number of threads total
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size of a threadblock-scoped access
+  static int const kAccessSizeInBits = 128;
+
+  /// Default Operator
+  using Operator = Operator_;
+
+  // Warp thread arrangement for operand A (congruous shared-memory layout)
+  static int const Crosswise_A = platform::min(int(128 / sizeof(ElementA)),
+                                               Shape::kM);
+
+  static int const kWarpThreadArrangementContiguousA =
+      platform::min(Shape::kM / (kAccessSizeInBits / sizeof_bits<ElementA>::value), 8);
+
+  static int const kWarpThreadArrangementStridedA =
+      kWarpSize / kWarpThreadArrangementContiguousA;
+
+  // Warp thread arrangement for operand B (crosswise shared-memory layout)
+  // The crosswise extent is capped at 1024 bits.
+  static int const kCrosswiseB =
+      (Shape::kK > (1024 / sizeof_bits<ElementB>::value))
+          ? (1024 / sizeof_bits<ElementB>::value)
+          : Shape::kK;
+
+  static int const kWarpThreadArrangementContiguousB =
+      kCrosswiseB / (kAccessSizeInBits / sizeof_bits<ElementB>::value);
+
+  static int const kWarpThreadArrangementStridedB =
+      kWarpSize / kWarpThreadArrangementContiguousB;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous<
+      sizeof_bits<ElementA>::value, Crosswise_A>;
+
+  // Shared memory layout of operand B
+  using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise<
+      sizeof_bits<ElementB>::value, kCrosswiseB>;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kM, Shape::kK / kSparse>, kThreads,
+      layout::PitchLinearShape<kWarpThreadArrangementContiguousA,
+                               kWarpThreadArrangementStridedA>,
+      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kM, Shape::kK / kSparse>, ElementA, SmemLayoutA, 1,
+      IteratorThreadMapA>;
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kK, Shape::kN>, kThreads,
+      layout::PitchLinearShape<kWarpThreadArrangementContiguousB,
+                               kWarpThreadArrangementStridedB>,
+      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 1,
+      IteratorThreadMapB>;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level sparse tensor op
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultSparseMmaTensorOp<
+      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
+      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
+
+  /// Cache operation of operand E
+  static cutlass::arch::CacheOperation::Kind const kCacheOpE =
+      cutlass::arch::CacheOperation::Global;
+
+  static int const kInterleavedE = MmaTensorOp::kInterleaved;
+  static int const kMetaSizeInBits = MmaTensorOp::kMetaSizeInBits;
+  static int const kMaxID2 = MmaTensorOp::kMaxID2;
+  static int const kElementsPerElementE = MmaTensorOp::kElementsPerElementE;
+
+  using ElementE = typename MmaTensorOp::ElementE;
+  using GmemLayoutE = cutlass::layout::ColumnMajorInterleaved<kInterleavedE>;
+
+  // Shared memory layout.  Interleaved layout is mapped to PitchLinear layout.
+  using SmemLayoutE = typename MmaTensorOp::LayoutE;
+
+  /// Elements of E per threadblock-scoped access (used by the E thread map below)
+  static int const kElementsPerAccessE =
+      kAccessSizeInBits / sizeof_bits<ElementE>::value;
+
+  /// E is tiny, so not all threads are needed: one thread per access over the E tile, capped at kThreads.
+  static int const kThreadsE =
+      (Shape::kM * Shape::kK / kSparse / kElementsPerElementE /
+           (kAccessSizeInBits / sizeof_bits<ElementE>::value) >
+       kThreads)
+          ? kThreads
+          : (Shape::kM * Shape::kK / kSparse / kElementsPerElementE /
+             (kAccessSizeInBits / sizeof_bits<ElementE>::value));
+
+  using IteratorThreadMapE = transform::PitchLinearStripminedThreadMap<
+      layout::PitchLinearShape<Shape::kM * kInterleavedE,
+                               Shape::kK / kSparse / kElementsPerElementE /
+                                   kInterleavedE>,
+      kThreadsE, kElementsPerAccessE>;
+
+  /// Shared memory iterator to E operand
+  using SmemIteratorE = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kM * kInterleavedE,
+                  Shape::kK / kSparse / kElementsPerElementE / kInterleavedE>,
+      ElementE, SmemLayoutE, 0, IteratorThreadMapE>;
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy =
+      SparseMmaPolicy<MmaTensorOp, MatrixShape<0, 0>, MatrixShape<0, 0>,
+                      MatrixShape<0, 0>, WarpCount::kK>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization of DefaultSparseMmaCore:
+///
+///   A: row-major
+///   B: row-major
+///   Operator: tensor op class
+///
+/// Uses the warp-level operator selected by DefaultSparseMmaTensorOp for the given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Number of stages
+    int Stages,
+    /// Operation performed by MMA
+    typename Operator_,
+    /// Cache operation of operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation of operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB>
+struct DefaultSparseMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
+                      layout::RowMajor, ElementB_, layout::RowMajor, ElementC_,
+                      LayoutC_, arch::OpClassTensorOp, Stages, Operator_,
+                      false, CacheOpA, CacheOpB> {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = ElementA_;
+  using LayoutA = layout::RowMajor;
+  using ElementB = ElementB_;
+  using LayoutB = layout::RowMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  static int const kStages = Stages;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
+
+  static int const kSparse = 2;  // divisor applied to Shape::kK for operand A's tiles (and, with kElementsPerElementE, for E's)
+
+  /// Number of warps along M, N, and K
+  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
+                              Shape::kN / WarpShape::kN, 
+                              Shape::kK / WarpShape::kK>;
+
+  // Divisibility requirements: the threadblock tile must be a whole multiple of the warp tile in M and N
+  static_assert(
+      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
+      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Number of threads total
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size of a threadblock-scoped access
+  static int const kAccessSizeInBits = 128;
+
+  /// Default Operator
+  using Operator = Operator_;
+
+  // Warp thread arrangement (A is crosswise over Shape::kK / kSparse; B is congruous)
+  static int const kWarpThreadArrangementContiguousA =
+      Shape::kK / kSparse / (kAccessSizeInBits / sizeof_bits<ElementA>::value);
+
+  static int const kWarpThreadArrangementStridedA =
+      kWarpSize / kWarpThreadArrangementContiguousA;
+
+  static int const kWarpThreadArrangementContiguousB =
+      platform::min(Shape::kN / (kAccessSizeInBits / sizeof_bits<ElementB>::value), 8);
+
+  static int const kWarpThreadArrangementStridedB =
+      kWarpSize / kWarpThreadArrangementContiguousB;
+
+  static int const Crosswise_B = platform::min(int(128 / sizeof(ElementB)),
+                                               Shape::kN);
+
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise<
+      sizeof_bits<ElementA>::value, Shape::kK / kSparse>;
+
+  // Shared memory layout of operand B
+  using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous<
+      sizeof_bits<ElementB>::value, Crosswise_B>;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kK / kSparse, Shape::kM>, kThreads,
+      layout::PitchLinearShape<kWarpThreadArrangementContiguousA,
+                               kWarpThreadArrangementStridedA>,
+      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kM, Shape::kK / kSparse>, ElementA, SmemLayoutA, 0,
+      IteratorThreadMapA>;
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kN, Shape::kK>, kThreads,
+      layout::PitchLinearShape<kWarpThreadArrangementContiguousB,
+                               kWarpThreadArrangementStridedB>,
+      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 0,
+      IteratorThreadMapB>;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level sparse tensor op
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultSparseMmaTensorOp<
+      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
+      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
+
+  /// Cache operation of operand E
+  static cutlass::arch::CacheOperation::Kind const kCacheOpE =
+      cutlass::arch::CacheOperation::Global;
+
+  static int const kInterleavedE = MmaTensorOp::kInterleaved;
+  static int const kMetaSizeInBits = MmaTensorOp::kMetaSizeInBits;
+  static int const kMaxID2 = MmaTensorOp::kMaxID2;
+  static int const kElementsPerElementE = MmaTensorOp::kElementsPerElementE;
+
+  using ElementE = typename MmaTensorOp::ElementE;
+  using GmemLayoutE = cutlass::layout::ColumnMajorInterleaved<kInterleavedE>;
+
+  // Shared memory layout.  Interleaved layout is mapped to PitchLinear layout.
+  using SmemLayoutE = typename MmaTensorOp::LayoutE;
+
+  /// Elements of E per threadblock-scoped access (used by the E thread map below)
+  static int const kElementsPerAccessE =
+      kAccessSizeInBits / sizeof_bits<ElementE>::value;
+
+  /// E is tiny, so not all threads are needed: one thread per access over the E tile, capped at kThreads.
+  static int const kThreadsE =
+      (Shape::kM * Shape::kK / kSparse / kElementsPerElementE /
+           (kAccessSizeInBits / sizeof_bits<ElementE>::value) >
+       kThreads)
+          ? kThreads
+          : (Shape::kM * Shape::kK / kSparse / kElementsPerElementE /
+             (kAccessSizeInBits / sizeof_bits<ElementE>::value));
+
+  using IteratorThreadMapE = transform::PitchLinearStripminedThreadMap<
+      layout::PitchLinearShape<Shape::kM * kInterleavedE,
+                               Shape::kK / kSparse / kElementsPerElementE /
+                                   kInterleavedE>,
+      kThreadsE, kElementsPerAccessE>;
+
+  /// Shared memory iterator to E operand
+  using SmemIteratorE = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kM * kInterleavedE,
+                  Shape::kK / kSparse / kElementsPerElementE / kInterleavedE>,
+      ElementE, SmemLayoutE, 0, IteratorThreadMapE>;
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy =
+      SparseMmaPolicy<MmaTensorOp, MatrixShape<0, 0>, MatrixShape<0, 0>,
+                      MatrixShape<0, 0>, WarpCount::kK>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace gemm
+}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_with_access_size.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_with_access_size.h
new file mode 100755
index 000000000..665010741
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_with_access_size.h
@@ -0,0 +1,328 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines basic properties needed by CTA-level GEMMs assuming expectations about data
+      layout of the global memory fragments, data types, and internal tile sizes.
+
+      Partial specializations for threadblock::Mma operations targeting simt instructions.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/gemm/warp/mma.h"
+#include "cutlass/gemm/threadblock/mma_pipelined.h"
+#include "cutlass/gemm/threadblock/mma_singlestage.h"
+#include "cutlass/arch/cache_operation.h" 
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+template <
+    /// Shape of threadblock-scoped matrix multiply operator
+    typename Shape,
+    /// Shape of warp-level matrix multiply operator
+    typename WarpShape,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape,
+    /// Element data type of A operand
+    typename ElementA,
+    /// Layout of operand A
+    typename LayoutA,
+    /// Element data type of B operand
+    typename ElementB,
+    /// Layout of operand B
+    typename LayoutB,
+    /// Data type of accumulator
+    typename ElementC,
+    /// Layout of accumulator
+    typename LayoutC,
+    /// Indicates type of math operator (arch::OpClassSimt or arch::OpClassTensorOp)
+    typename OperatorClass,
+    /// Size of a threadblock-scoped access in bits; -1 selects the specialization that forwards to DefaultMmaCore
+    int kAccessSizeInBits = -1, // -1 denoting the default
+    /// Number of stages
+    int Stages = 2,
+    /// Operation performed by MMA: saturating multiply-add for 8-/4-bit integer A with tensor ops, plain multiply-add otherwise
+    typename Operator = typename platform::conditional<
+        (platform::is_same<OperatorClass,
+                           cutlass::arch::OpClassTensorOp>::value) &&
+            (platform::is_same<ElementA, int8_t>::value ||
+             platform::is_same<ElementA, int4b_t>::value ||
+             platform::is_same<ElementA, uint8_t>::value ||
+             platform::is_same<ElementA, uint4b_t>::value),
+        cutlass::arch::OpMultiplyAddSaturate,
+        cutlass::arch::OpMultiplyAdd>::type,
+    /// Store the accumulators in row major or column major.  Row major is used
+    /// when output layout is interleaved.
+    bool AccumulatorsInRowMajor = false,
+    /// Cache operation of operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA =
+        cutlass::arch::CacheOperation::Global,
+    /// Cache operation of operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB =
+        cutlass::arch::CacheOperation::Global,
+    /// per-element transformation for elements of A
+    ComplexTransform TransformA = ComplexTransform::kNone,
+    /// per-element transformation for elements of B
+    ComplexTransform TransformB = ComplexTransform::kNone,
+    bool IsComplex = false // (is_complex<ElementA>::value || is_complex<ElementB>::value)
+>
+struct DefaultMmaCoreWithAccessSize;  // primary template is declared only; the specializations in this file define it
+
+template <  // Partial specialization for kAccessSizeInBits == -1 (the default access size)
+    /// Shape of threadblock-scoped matrix multiply operator
+    typename Shape,
+    /// Shape of warp-level matrix multiply operator
+    typename WarpShape,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape,
+    /// Element data type of A operand
+    typename ElementA,
+    /// Layout of operand A
+    typename LayoutA,
+    /// Element data type of B operand
+    typename ElementB,
+    /// Layout of operand B
+    typename LayoutB,
+    /// Data type of accumulator
+    typename ElementC,
+    /// Layout of accumulator
+    typename LayoutC,
+    /// Indicates type of math operator (arch::OpClassSimt or arch::OpClassTensorOp)
+    typename OperatorClass,
+    /// Number of stages
+    int Stages,
+    /// Operation performed by MMA
+    typename Operator,
+    /// Store the accumulators in row major or column major.  Row major is used
+    /// when output layout is interleaved.
+    bool AccumulatorsInRowMajor,
+    /// Cache operation of operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation of operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB,
+    /// per-element transformation for elements of A
+    ComplexTransform TransformA,
+    /// per-element transformation for elements of B
+    ComplexTransform TransformB,
+    bool IsComplex
+>
+struct DefaultMmaCoreWithAccessSize<
+    Shape, WarpShape, InstructionShape,
+    ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
+    OperatorClass, -1, Stages, Operator, AccumulatorsInRowMajor,
+    CacheOpA, CacheOpB, TransformA, TransformB, IsComplex
+> : DefaultMmaCore<
+    Shape, WarpShape, InstructionShape,
+    ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
+    OperatorClass, Stages, Operator, AccumulatorsInRowMajor,
+    CacheOpA, CacheOpB, TransformA, TransformB, IsComplex
+> {};  // adds no members: every definition is inherited from DefaultMmaCore with the same remaining arguments
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization of DefaultMmaCoreWithAccessSize (two stages, explicit access size):
+///
+///   A: column-major
+///   B: row-major
+///   Operator: simt class
+///
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Size of a threadblock-scoped access in bits (the enable_if below rejects -1, the default)
+    int kAccessSizeInBits_,
+    /// Operation performed by GEMM
+    typename Operator_>
+struct DefaultMmaCoreWithAccessSize<Shape_, WarpShape_, typename platform::enable_if<kAccessSizeInBits_ != -1, GemmShape<1, 1, 1>>::type, ElementA_,
+                      layout::ColumnMajor, ElementB_, layout::RowMajor,
+                      ElementC_, LayoutC_, arch::OpClassSimt, kAccessSizeInBits_, 2, Operator_
+                     > {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = GemmShape<1, 1, 1>;
+  using ElementA = ElementA_;
+  using LayoutA = layout::ColumnMajor;
+  using ElementB = ElementB_;
+  using LayoutB = layout::RowMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using OperatorClass = arch::OpClassSimt;
+  static int const PartitionsK = Shape::kK / WarpShape::kK;
+
+  /// Default Operator
+  using Operator = Operator_;
+
+  /// Number of warps along M, N, and K
+  using WarpCount = GemmShape<
+    Shape::kM / WarpShape::kM,
+    Shape::kN / WarpShape::kN,
+    PartitionsK
+  >;
+
+  // Divisibility requirements: the threadblock tile must be a whole multiple of the warp tile in M and N
+  static_assert(
+    !(Shape::kM % WarpShape::kM) &&
+    !(Shape::kN % WarpShape::kN),
+    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
+  );
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassSimt>::value;
+
+  /// Number of threads total
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  static int const kElementsPerAccessDefault = 1;
+  static_assert(kAccessSizeInBits_ == -1 ||  // this disjunct cannot hold here: the enable_if in the specialization arguments requires != -1
+          sizeof_bits<ElementA>::value == sizeof_bits<ElementB>::value ||
+          kAccessSizeInBits_ / sizeof_bits<ElementA>::value == kElementsPerAccessDefault,
+          "Non-default value for kAccessSizeInBits_ is only allowed if size(elementA) == sizeof(elementB)");
+  static int const kElementsPerAccess = (kAccessSizeInBits_ != -1) ? kAccessSizeInBits_ / sizeof_bits<ElementA>::value : kElementsPerAccessDefault;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::ColumnMajor;
+  using SmemLayoutB = layout::RowMajor;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kM, Shape::kK>,
+    kThreads,
+    kElementsPerAccess
+  >;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileIterator<
+    MatrixShape<Shape::kM, Shape::kK>, 
+    ElementA, 
+    SmemLayoutA,
+    1,
+    IteratorThreadMapA
+  >;
+
+  /// ThreadMap of iterator B (uses the same kElementsPerAccess as A, which is derived from ElementA's width)
+  using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kN, Shape::kK>,
+    kThreads,
+    kElementsPerAccess
+  >;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileIterator<
+    MatrixShape<Shape::kK, Shape::kN>, 
+    ElementB, 
+    SmemLayoutB,
+    0,
+    IteratorThreadMapB
+  >;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level op
+  static const int WarpNumThreadsM = detail::simt_get_warp_threads_m<WarpShape>();
+  static const int WarpNumThreadsN = kWarpSize / WarpNumThreadsM;
+  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
+  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
+  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
+      "WarpShape must be divisible by ThreadTile shape.");
+  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
+  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;
+  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;
+  static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM);
+  static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN);
+  // LaneM and LaneN are clamped to the thread tile by the const_min calls above
+  using LaneMmaShape = cutlass::gemm::GemmShape<
+      LaneM,
+      LaneN,
+      1>;
+  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
+      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
+      cutlass::layout::RowMajorInterleaved<LaneLayout>,         // LaneLayout
+      LaneMmaShape
+  >;
+
+  using MmaWarpSimt = cutlass::gemm::warp::MmaSimt<
+    WarpShape,    /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8
+    ElementA,     /// Data type of A elements
+    SmemLayoutA,  /// Layout of A matrix (concept: MatrixLayout)
+    ElementB,     /// Data type of B elements
+    SmemLayoutB,  /// Layout of B matrix (concept: MatrixLayout)
+    ElementC,     /// Element type of C matrix
+    LayoutC,      /// Layout of C matrix (concept: MatrixLayout)
+    Policy        /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy)
+    >;            /// Used for partial specialization
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy = MmaPolicy<
+    MmaWarpSimt,
+    MatrixShape<0, 0>,
+    MatrixShape<0, 0>,
+    WarpCount::kK
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_with_reduction.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_with_reduction.h
new file mode 100755
index 000000000..9f45601a8
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_with_reduction.h
@@ -0,0 +1,167 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Defines basic properties needed by CTA-level GEMMs assuming
+   expectations about data layout of the global memory fragments, data types,
+   and internal tile sizes.
+
+      Defines DefaultMmaWithReductionCore, which builds on DefaultMmaCore for
+   threadblock::Mma operations with reduction targeting TensorOp instructions.
+*/
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+
+#include "cutlass/layout/tensor_op_multiplicand_sm75.h"
+#include "cutlass/layout/tensor_op_multiplicand_sm80.h"
+
+#include "cutlass/gemm/warp/default_mma_with_reduction_tensor_op.h"
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h"
+
+#include "cutlass/gemm/threadblock/default_mma_core.h"
+
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/transform/pitch_linear_thread_map.h"
+#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h"
+#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h"
+#include "cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h"
+#include "cutlass/gemm/threadblock/mma_with_reduction_multistage.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Template defining default matrix multiply operators inferred from threadblock tile size,
+/// global memory data layout, and target math instruction.
+///
+/// Thread maps, shared-memory iterators/layouts and the warp count are re-exported from
+/// DefaultMmaCore; this struct only substitutes the warp-level operator with the one chosen
+/// by warp::DefaultMmaWithReductionTensorOp.
+template <
+    /// Shape of threadblock-scoped matrix multiply operator
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator
+    typename WarpShape,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape,
+    /// Element data type of A operand
+    typename ElementA,
+    /// Layout of operand A
+    typename LayoutA,
+    /// Element data type of B operand
+    typename ElementB,
+    /// Layout of operand B
+    typename LayoutB,
+    /// Data type of accumulator
+    typename ElementC,
+    /// Layout of accumulator
+    typename LayoutC,
+    /// Indicates type of math operator (arch::OpClassSimt or arch::OpClassTensorOp)
+    typename OperatorClass,
+    /// Reduce operand A or B along K dimension
+    bool ReduceKForA_,
+    /// Number of stages
+    int Stages = 2,
+    /// Operation performed by MMA
+    typename Operator = typename platform::conditional<
+        (platform::is_same<OperatorClass,
+                           cutlass::arch::OpClassTensorOp>::value) &&
+            (platform::is_same<ElementA, int8_t>::value ||
+             platform::is_same<ElementA, int4b_t>::value ||
+             platform::is_same<ElementA, uint8_t>::value ||
+             platform::is_same<ElementA, uint4b_t>::value),
+        cutlass::arch::OpMultiplyAddSaturate,
+        cutlass::arch::OpMultiplyAdd>::type,
+    /// Store the accumulators in row major or column major.  Row major is used
+    /// when output layout is interleaved.
+    bool AccumulatorsInRowMajor = false,
+    /// Cache operation of operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA =
+        cutlass::arch::CacheOperation::Global,
+    /// Cache operation of operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB =
+        cutlass::arch::CacheOperation::Global,
+    /// per-element transformation for elements of A
+    ComplexTransform TransformA = ComplexTransform::kNone,
+    /// per-element transformation for elements of B
+    ComplexTransform TransformB = ComplexTransform::kNone,
+    bool IsComplex = false// (is_complex<ElementA>::value || is_complex<ElementB>::value)
+>
+struct DefaultMmaWithReductionCore {
+  /// Threadblock tile shape
+  using Shape = Shape_;
+
+  /// DefaultMmaCore instantiated with the same configuration; ReduceKForA_ is the only
+  /// template argument of this struct that is not forwarded to it.
+  using Base = DefaultMmaCore<
+      Shape_, WarpShape, InstructionShape,
+      ElementA, LayoutA,
+      ElementB, LayoutB,
+      ElementC, LayoutC,
+      OperatorClass, Stages, Operator, AccumulatorsInRowMajor,
+      CacheOpA, CacheOpB,
+      TransformA, TransformB, IsComplex>;
+
+  // Members taken over unchanged from Base.
+  using WarpCount = typename Base::WarpCount;
+  using SmemLayoutA = typename Base::SmemLayoutA;
+  using SmemLayoutB = typename Base::SmemLayoutB;
+  using IteratorThreadMapA = typename Base::IteratorThreadMapA;
+  using IteratorThreadMapB = typename Base::IteratorThreadMapB;
+  using SmemIteratorA = typename Base::SmemIteratorA;
+  using SmemIteratorB = typename Base::SmemIteratorB;
+
+  /// Cache operations, exposed as constants mirroring the template arguments
+  static const cutlass::arch::CacheOperation::Kind kCacheOpA = CacheOpA;
+  static const cutlass::arch::CacheOperation::Kind kCacheOpB = CacheOpB;
+
+  /// Warp-level tensor op with reduction; ReduceKForA_ is passed through to the selector
+  using MmaTensorOp =
+      typename cutlass::gemm::warp::DefaultMmaWithReductionTensorOp<
+          WarpShape, InstructionShape,
+          ElementA, SmemLayoutA,
+          ElementB, SmemLayoutB,
+          ElementC, LayoutC,
+          Operator, ReduceKForA_, WarpCount::kK>::Type;
+
+  /// Policy used to define MmaPipelined (both padding shapes are 0x0)
+  using MmaPolicy = MmaPolicy<
+      MmaTensorOp,
+      MatrixShape<0, 0>,
+      MatrixShape<0, 0>,
+      WarpCount::kK>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace gemm
+}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_wmma.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_wmma.h
new file mode 100755
index 000000000..5f8e3e339
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_wmma.h
@@ -0,0 +1,712 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines basic properties needed by CTA-level GEMMs assuming expectations about data
+      layout of the global memory fragments, data types, and internal tile sizes.
+
+      Partial specializations for threadblock::Mma operations targeting WMMA TensorOp
+      instructions (arch::OpClassWmmaTensorOp).
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/arch/wmma.h"
+
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h"
+
+#include "cutlass/gemm/warp/mma_tensor_op_wmma.h"
+
+#include "cutlass/gemm/warp/mma_tensor_op_policy.h"
+#include "cutlass/gemm/threadblock/default_mma_core.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization:
+///
+///   A: column-major
+///   B: row-major
+///   Operator: wmma tensor op class
+///
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Operation performed by GEMM
+    typename Operator_,
+    /// Number of stages
+    int Stages>
+struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
+                      layout::ColumnMajor, ElementB_, layout::RowMajor,
+                      ElementC_, LayoutC_, arch::OpClassWmmaTensorOp, Stages,
+                      Operator_> {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = ElementA_;
+  using LayoutA = layout::ColumnMajor;
+  using ElementB = ElementB_;
+  using LayoutB = layout::RowMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using OperatorClass = arch::OpClassWmmaTensorOp;
+
+  /// Number of warps present (threadblock tile divided by warp tile, per dimension)
+  using WarpCount = GemmShape<
+    Shape::kM / WarpShape::kM,
+    Shape::kN / WarpShape::kN,
+    Shape::kK / WarpShape::kK
+  >;
+
+  // Divisibility requirements (only the M and N extents are checked here)
+  static_assert(
+    !(Shape::kM % WarpShape::kM) &&
+    !(Shape::kN % WarpShape::kN),
+    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
+  );
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassWmmaTensorOp>::value;
+
+  /// Number of threads total
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size of a threadblock-scoped access
+  static int const kAccessSizeInBits = 128;
+
+  /// Default Operator
+  using Operator = Operator_;
+
+  //
+  // Shared memory layouts
+  //
+  // NOTE: shared memory layout for wmma is same as the operands' layout in the global memory
+  using SmemLayoutA = LayoutA;
+  using SmemLayoutB = LayoutB;
+
+  // Pad shared memory to avoid bank conflicts (padding is 128 bits' worth of elements)
+  static int const kPaddingA = 128 / sizeof_bits<ElementA>::value;
+  static int const kPaddingB = 128 / sizeof_bits<ElementB>::value;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A.
+  /// The elements-per-access count is derived from ElementA (not ElementB) so that one
+  /// access covers kAccessSizeInBits of the A operand even when A and B differ in width.
+  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kM, Shape::kK>,
+    kThreads,
+    kAccessSizeInBits / sizeof_bits<ElementA>::value
+  >;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileIterator<
+    MatrixShape<Shape::kM, Shape::kK>,
+    ElementA,
+    SmemLayoutA,
+    1,
+    IteratorThreadMapA
+  >;
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kN, Shape::kK>,
+    kThreads,
+    kAccessSizeInBits / sizeof_bits<ElementB>::value
+  >;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileIterator<
+    MatrixShape<Shape::kK, Shape::kN>,
+    ElementB,
+    SmemLayoutB,
+    0,
+    IteratorThreadMapB
+  >;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level tensor op: an arch::Wmma instruction wrapped in a policy
+  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
+    cutlass::arch::Wmma<
+      InstructionShape,
+      ElementA,
+      LayoutA,
+      ElementB,
+      LayoutB,
+      ElementC,
+      LayoutC,
+      Operator
+    >,
+    cutlass::MatrixShape<1, 1>
+  >;
+
+  using MmaTensorOp = cutlass::gemm::warp::MmaTensorOpWmma<
+    WarpShape,
+    ElementA,
+    SmemLayoutA,
+    ElementB,
+    SmemLayoutB,
+    ElementC,
+    LayoutC,
+    Policy
+  >;
+
+  /// Policy used to define MmaPipelined; carries the shared-memory padding for A and B
+  using MmaPolicy = MmaPolicy<
+    MmaTensorOp,
+    MatrixShape<kPaddingA, 0>,
+    MatrixShape<0, kPaddingB>,
+    WarpCount::kK
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization:
+///
+///   A: row-major
+///   B: column-major
+///   Operator: wmma tensorop class
+///
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator
+    /// (concept:GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape) [allowed
+    /// wmma instruction shapes, e.g., 16x16x16, 32x8x16, 8x32x16,...]
+    typename InstructionShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Operation performed by GEMM
+    typename Operator_,
+    /// Number of stages
+    int Stages>
+struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
+                      layout::RowMajor, ElementB_, layout::ColumnMajor,
+                      ElementC_, LayoutC_, arch::OpClassWmmaTensorOp, Stages,
+                      Operator_> {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = ElementA_;
+  using LayoutA = layout::RowMajor;
+  using ElementB = ElementB_;
+  using LayoutB = layout::ColumnMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using OperatorClass = arch::OpClassWmmaTensorOp;
+
+  /// Number of warps present (threadblock tile divided by warp tile, per dimension)
+  using WarpCount = GemmShape<
+    Shape::kM / WarpShape::kM,
+    Shape::kN / WarpShape::kN,
+    Shape::kK / WarpShape::kK
+  >;
+
+  // Divisibility requirements (only the M and N extents are checked here)
+  static_assert(
+    !(Shape::kM % WarpShape::kM) &&
+    !(Shape::kN % WarpShape::kN),
+    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
+  );
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassWmmaTensorOp>::value;
+
+  /// Number of threads per threadblock
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size of a threadblock-scoped access
+  static int const kAccessSizeInBits = 128;
+
+  /// Default Operator
+  using Operator = Operator_;
+
+  // Warp thread arrangement: number of kAccessSizeInBits-wide accesses along K for each
+  // operand, and the warp threads left over for the other dimension.
+  static int const kWarpThreadArrangementContiguousA =
+      Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementA>::value);
+
+  static int const kWarpThreadArrangementStridedA =
+      kWarpSize / kWarpThreadArrangementContiguousA;
+
+  // The B arrangement is derived from ElementB; it previously reused ElementA's width,
+  // which only matched when both operands had the same element size.
+  static int const kWarpThreadArrangementContiguousB =
+      Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementB>::value);
+
+  static int const kWarpThreadArrangementStridedB =
+      kWarpSize / kWarpThreadArrangementContiguousB;
+
+  //
+  // Shared memory layouts
+  //
+
+  // shared memory layout for wmma is same as the operands' layout in global memory
+  using SmemLayoutA = LayoutA;
+  using SmemLayoutB = LayoutB;
+
+  // Pad shared memory to avoid bank conflicts (padding is 128 bits' worth of elements)
+  static int const kPaddingA = 128 / sizeof_bits<ElementA>::value;
+  static int const kPaddingB = 128 / sizeof_bits<ElementB>::value;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kK, Shape::kM>,
+    kThreads,
+    kAccessSizeInBits / sizeof_bits<ElementA>::value
+  >;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileIterator<
+    MatrixShape<Shape::kM, Shape::kK>,
+    ElementA,
+    SmemLayoutA,
+    1,
+    IteratorThreadMapA
+  >;
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kK, Shape::kN>,
+    kThreads,
+    kAccessSizeInBits / sizeof_bits<ElementB>::value
+  >;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileIterator<
+    MatrixShape<Shape::kK, Shape::kN>,
+    ElementB,
+    SmemLayoutB,
+    0,
+    IteratorThreadMapB
+  >;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level tensor op: an arch::Wmma instruction wrapped in a policy
+  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
+    cutlass::arch::Wmma<
+      InstructionShape,
+      ElementA,
+      LayoutA,
+      ElementB,
+      LayoutB,
+      ElementC,
+      LayoutC,
+      Operator
+    >,
+    cutlass::MatrixShape<1, 1>
+  >;
+
+  using MmaTensorOp = cutlass::gemm::warp::MmaTensorOpWmma<
+    WarpShape,
+    ElementA,
+    SmemLayoutA,
+    ElementB,
+    SmemLayoutB,
+    ElementC,
+    LayoutC,
+    Policy
+  >;
+
+  /// Policy used to define MmaPipelined; carries the shared-memory padding for A and B
+  using MmaPolicy = MmaPolicy<
+    MmaTensorOp,
+    MatrixShape<0, kPaddingA>,
+    MatrixShape<kPaddingB, 0>,
+    WarpCount::kK
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization:
+///
+///   A: row-major
+///   B: row-major
+///   Operator: wmma tensor op class
+///
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Operation performed by MMA
+    typename Operator_,
+    /// Number of stages
+    int Stages>
+struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
+                      layout::RowMajor, ElementB_, layout::RowMajor, ElementC_,
+                      LayoutC_, arch::OpClassWmmaTensorOp, Stages, Operator_> {
+  // Template arguments re-exported under their canonical names.
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = ElementA_;
+  using ElementB = ElementB_;
+  using ElementC = ElementC_;
+  using LayoutA = layout::RowMajor;
+  using LayoutB = layout::RowMajor;
+  using LayoutC = LayoutC_;
+  using OperatorClass = arch::OpClassWmmaTensorOp;
+
+  /// Default Operator
+  using Operator = Operator_;
+
+  /// Number of warps present (threadblock tile divided by warp tile, per dimension)
+  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
+                              Shape::kN / WarpShape::kN,
+                              Shape::kK / WarpShape::kK>;
+
+  // Divisibility requirements (only the M and N extents are checked here)
+  static_assert(
+      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
+      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
+
+  /// Size of a threadblock-scoped access
+  static int const kAccessSizeInBits = 128;
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassWmmaTensorOp>::value;
+
+  /// Number of threads total
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  // Warp thread arrangement for A: accesses along K, and the warp threads that remain
+  static int const kWarpThreadArrangementContiguousA =
+      Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementA>::value);
+  static int const kWarpThreadArrangementStridedA =
+      kWarpSize / kWarpThreadArrangementContiguousA;
+
+  //
+  // Shared memory layouts: for wmma these equal the operands' global memory layouts
+  //
+  using SmemLayoutA = LayoutA;
+  using SmemLayoutB = LayoutB;
+
+  // Shared memory padding to avoid bank conflicts (128 bits' worth of elements)
+  static int const kPaddingA = 128 / sizeof_bits<ElementA>::value;
+  static int const kPaddingB = 128 / sizeof_bits<ElementB>::value;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
+      layout::PitchLinearShape<Shape::kK, Shape::kM>, kThreads,
+      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileIterator<
+      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 1,
+      IteratorThreadMapA>;
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
+      layout::PitchLinearShape<Shape::kN, Shape::kK>, kThreads,
+      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileIterator<
+      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 0,
+      IteratorThreadMapB>;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  /// Warp-level policy: an arch::Wmma instruction paired with MatrixShape<1, 1>
+  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
+      cutlass::arch::Wmma<InstructionShape, ElementA, LayoutA, ElementB,
+                          LayoutB, ElementC, LayoutC, Operator>,
+      cutlass::MatrixShape<1, 1>>;
+
+  /// Warp-level tensor op built on that policy
+  using MmaTensorOp = cutlass::gemm::warp::MmaTensorOpWmma<
+      WarpShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB, ElementC,
+      LayoutC, Policy>;
+
+  /// Policy used to define MmaPipelined; carries the shared-memory padding for A and B
+  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, kPaddingA>,
+                              MatrixShape<0, kPaddingB>, WarpCount::kK>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization:
+///
+///   A: column-major
+///   B: column-major
+///   Operator: wmma tensor op class
+///
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Operation performed by MMA
+    typename Operator_,
+    /// Number of stages
+    int Stages>
+struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
+                      layout::ColumnMajor, ElementB_, layout::ColumnMajor,
+                      ElementC_, LayoutC_, arch::OpClassWmmaTensorOp, Stages,
+                      Operator_> {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = ElementA_;
+  using LayoutA = layout::ColumnMajor;
+  using ElementB = ElementB_;
+  using LayoutB = layout::ColumnMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using OperatorClass = arch::OpClassWmmaTensorOp;
+
+  /// Number of warps present (threadblock tile divided by warp tile, per dimension)
+  using WarpCount =
+      GemmShape<Shape::kM / WarpShape::kM, Shape::kN / WarpShape::kN,
+                Shape::kK / WarpShape::kK>;
+
+  // Divisibility requirements (only the M and N extents are checked here)
+  static_assert(
+      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
+      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassWmmaTensorOp>::value;
+
+  /// Number of threads total
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size of a threadblock-scoped access
+  static int const kAccessSizeInBits = 128;
+
+  /// Default Operator
+  using Operator = Operator_;
+
+  // Warp thread arrangement for B: accesses along K, and the warp threads that remain.
+  // Derived from ElementB; it previously used ElementA's width, which only matched
+  // when both operands had the same element size.
+  static int const kWarpThreadArrangementContiguousB =
+      Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementB>::value);
+
+  static int const kWarpThreadArrangementStridedB =
+      kWarpSize / kWarpThreadArrangementContiguousB;
+
+  //
+  // Shared memory layouts
+  //
+
+  // shared memory layout for wmma is same as the operands' layout in global memory
+  using SmemLayoutA = LayoutA;
+  using SmemLayoutB = LayoutB;
+
+  // Pad shared memory to avoid bank conflicts (padding is 128 bits' worth of elements)
+  static int const kPaddingA = 128 / sizeof_bits<ElementA>::value;
+  static int const kPaddingB = 128 / sizeof_bits<ElementB>::value;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kM, Shape::kK>,
+    kThreads,
+    kAccessSizeInBits / sizeof_bits<ElementA>::value
+  >;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileIterator<
+      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 1,
+      IteratorThreadMapA>;
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kK, Shape::kN>,
+    kThreads,
+    kAccessSizeInBits / sizeof_bits<ElementB>::value
+  >;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileIterator<
+      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 0,
+      IteratorThreadMapB>;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level tensor op: an arch::Wmma instruction wrapped in a policy
+  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
+    cutlass::arch::Wmma<
+      InstructionShape,
+      ElementA,
+      LayoutA,
+      ElementB,
+      LayoutB,
+      ElementC,
+      LayoutC,
+      Operator
+    >,
+    cutlass::MatrixShape<1, 1>
+  >;
+
+  using MmaTensorOp = cutlass::gemm::warp::MmaTensorOpWmma<
+    WarpShape,
+    ElementA,
+    SmemLayoutA,
+    ElementB,
+    SmemLayoutB,
+    ElementC,
+    LayoutC,
+    Policy
+  >;
+
+  /// Policy used to define MmaPipelined; carries the shared-memory padding for A and B
+  using MmaPolicy = MmaPolicy<
+    MmaTensorOp,
+    MatrixShape<kPaddingA, 0>,
+    MatrixShape<kPaddingB, 0>,
+    WarpCount::kK
+  >;
+};
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
+
+#endif // defined(CUTLASS_ARCH_WMMA_ENABLED)
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_layernorm_mainloop_fusion.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_layernorm_mainloop_fusion.h
new file mode 100755
index 000000000..5dd3dbc3a
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_layernorm_mainloop_fusion.h
@@ -0,0 +1,178 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a multistage GEMM kernel with layernorm fused into the mainloop. Does not compute batching or support split-K.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/arch.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/gemm/threadblock/default_mma_core.h"
+#include "cutlass/gemm/threadblock/mma_layernorm_mainloop_fusion_multistage.h"
+#include "cutlass/transform/threadblock/predicated_scale_bias_vector_iterator.h"
+#include "cutlass/transform/threadblock/predicated_scale_bias_vector_access_iterator.h"
+#include "cutlass/transform/threadblock/regular_scale_bias_vector_access_iterator.h"
+#include "cutlass/gemm/warp/scale_bias_tile_iterator.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for Scale/Bias vectors
+    typename ElementScaleBias,
+    /// Layout type for Scale/Bias vectors
+    typename LayoutScaleBias,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Layout type for C and D matrix operands (not referenced in this struct; layout::RowMajor is used below)
+    typename LayoutC,
+    /// Operator class tag (not referenced in this struct; arch::OpClassTensorOp is used below)
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for (not referenced in this struct)
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Store the accumulators in row major or column major.  Row major is used
+    /// when output layout is interleaved. (Not referenced in this struct.)
+    bool AccumulatorsInRowMajor = false,
+    /// Use zfill or predicate for SM80 out-of-bound cp.async
+    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone
+    >
+struct DefaultMmaLayernormMainloopFusion {
+  // Bundles the cache operations, tile iterators and ThreadblockMma type for the layernorm-fused multistage mainloop.
+  static cutlass::arch::CacheOperation::Kind const CacheOpA =
+      ((sizeof_bits<ElementA>::value * kAlignmentA) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+  // Same rule as CacheOpA, applied to B: Global when one access (element bits * alignment) is 128 bits, else Always.
+  static cutlass::arch::CacheOperation::Kind const CacheOpB =
+      ((sizeof_bits<ElementB>::value * kAlignmentB) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+  // The gamma/beta vectors reuse the cache operation selected for operand A.
+  static cutlass::arch::CacheOperation::Kind const CacheOpGammaBeta = CacheOpA;
+
+  // Define the MmaCore components (accumulator layout fixed to RowMajor, operator class fixed to OpClassTensorOp)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, Operator, false, CacheOpA, CacheOpB>;
+
+  // Define iterators over tiles from the A operand (kM x kK threadblock tile, kAlignmentA elements per access)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::Array<ElementA, kAlignmentA>;
+  using IteratorA =
+      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+          ElementA, LayoutA, 1, ThreadMapA, AccessTypeA>;
+
+  // Define iterators over tiles from the B operand (kK x kN threadblock tile, kAlignmentB elements per access)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::Array<ElementB, kAlignmentB>;
+  using IteratorB =
+      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+          ElementB, LayoutB, 0, ThreadMapB, AccessTypeB>;
+
+  /// Define the iterator for the vectors this struct calls var/mean, using a 1 x WarpShape::kN tile shape
+  using IteratorVarMean =
+      cutlass::transform::threadblock::PredicatedScaleBiasVectorIterator<
+          cutlass::MatrixShape<1, WarpShape::kN>,
+          ElementScaleBias,
+          LayoutScaleBias>;
+
+  /// Define the predicated access iterator for the gamma/beta vectors, using a 1 x ThreadblockShape::kK tile shape
+  using IteratorGammaBeta =
+      cutlass::transform::threadblock::PredicatedScaleBiasVectorAccessIterator<
+          cutlass::MatrixShape<1, ThreadblockShape::kK>, ElementScaleBias,
+          LayoutScaleBias>;
+  /// Counterpart of IteratorGammaBeta with the same tile shape, built on RegularScaleBiasVectorAccessIterator
+  using SmemIteratorGammaBeta =
+      cutlass::transform::threadblock::RegularScaleBiasVectorAccessIterator<
+          cutlass::MatrixShape<1, ThreadblockShape::kK>, ElementScaleBias,
+          LayoutScaleBias>;
+  // NOTE(review): presumably the number of threads in a warp - confirm; passed to WarpIteratorGammaBeta below.
+  static int const kThreadCount = 32;
+
+  // Warp-level iterators to load scale and bias vectors
+  using WarpIteratorGammaBeta = cutlass::gemm::warp::ScaleBiasTileIterator<
+      MatrixShape<WarpShape::kM, WarpShape::kK>, ElementScaleBias,
+      LayoutScaleBias, MatrixShape<InstructionShape::kM, InstructionShape::kK>,
+      typename MmaCore::MmaTensorOp::IteratorA::Base::Policy, kThreadCount,
+      MmaCore::WarpCount::kK>;
+
+  // Define the threadblock-scoped multistage matrix multiply (A/B cache ops come from MmaCore::kCacheOpA/kCacheOpB)
+  using ThreadblockMma = cutlass::gemm::threadblock::MmaLayernormMainloopFusionMultistage<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
+      MmaCore::kCacheOpB, IteratorVarMean, IteratorGammaBeta, SmemIteratorGammaBeta,
+      CacheOpGammaBeta,
+      ElementAccumulator, layout::RowMajor,
+      typename MmaCore::MmaPolicy, WarpIteratorGammaBeta, Stages, SharedMemoryClear>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass 
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_planar_complex_multistage.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_planar_complex_multistage.h
new file mode 100755
index 000000000..1895962a7
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_planar_complex_multistage.h
@@ -0,0 +1,136 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Template for a multistage GEMM kernel. Does not compute batching or support split-K.
+*/
+
+#pragma once
+
+#include "cutlass/arch/arch.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+#include "cutlass/gemm/threadblock/default_mma.h"
+#include "cutlass/gemm/threadblock/mma_planar_complex_multistage.h"
+
+#include "cutlass/numeric_types.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Operator class tag
+    typename OperatorClass_,
+    /// Tag indicating architecture to tune for
+    typename ArchTag_,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Complex transformation on operand A
+    ComplexTransform TransformA = ComplexTransform::kNone,
+    /// Complex transformation on operand B
+    ComplexTransform TransformB = ComplexTransform::kNone,
+    /// Math operator tag (e.g. arch::OpMultiplyAdd)
+    typename Operator = arch::OpMultiplyAdd
+>
+struct DefaultMmaPlanarComplexMultistage {
+    // Derives the planar complex multistage ThreadblockMma type from the DefaultMma selection for the same arguments.
+    // Real-valued threadblock MMA picked by DefaultMma; its iterator and policy types are reused below
+    using RealMmaMultistage = typename DefaultMma<
+        ElementA_,
+        LayoutA_,
+        kAlignmentA,
+        ElementB_,
+        LayoutB_,
+        kAlignmentB,
+        ElementAccumulator_,
+        LayoutC_,
+        OperatorClass_,
+        ArchTag_,
+        ThreadblockShape_,
+        WarpShape_,
+        InstructionShape_,
+        Stages,
+        Operator
+    >::ThreadblockMma;
+    // Planar complex mainloop assembled from the real-valued pieces; both cache operations are fixed to Global here
+    using ThreadblockMma = MmaPlanarComplexMultistage<
+      ThreadblockShape_,
+      typename RealMmaMultistage::IteratorA,
+      typename RealMmaMultistage::SmemIteratorA,
+      cutlass::arch::CacheOperation::Global,
+      typename RealMmaMultistage::IteratorB,
+      typename RealMmaMultistage::SmemIteratorB,
+      cutlass::arch::CacheOperation::Global,
+      ElementAccumulator_,
+      LayoutC_,
+      typename RealMmaMultistage::Policy,
+      Stages,
+      TransformA,
+      TransformB
+    >;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+}   // namespace threadblock
+}   // namespace gemm
+}   // namespace cutlass
+
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_planar_complex_pipelined.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_planar_complex_pipelined.h
new file mode 100755
index 000000000..e800ba44d
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_planar_complex_pipelined.h
@@ -0,0 +1,130 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
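+  \brief Default threadblock-scoped planar complex MMA built on the pipelined mainloop (MmaPlanarComplexPipelined).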
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+
+#include "cutlass/gemm/warp/mma_planar_complex.h"
+#include "cutlass/gemm/threadblock/default_mma.h"
+#include "cutlass/gemm/threadblock/mma_planar_complex_pipelined.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+  /// Element type for A matrix operand
+  typename ElementA_,
+  /// Layout type for A matrix operand
+  typename LayoutA_,
+  /// Access granularity of A matrix in units of elements
+  int kAlignmentA,
+  /// Element type for B matrix operand
+  typename ElementB_,
+  /// Layout type for B matrix operand
+  typename LayoutB_,
+  /// Access granularity of B matrix in units of elements
+  int kAlignmentB,
+  /// Element type for internal accumulation
+  typename ElementAccumulator_,
+  /// Layout type for C and D matrix operands
+  typename LayoutC_,
+  /// Operator class tag
+  typename OperatorClass_,
+  /// Tag indicating architecture to tune for
+  typename ArchTag_,
+  /// Threadblock-level tile size (concept: GemmShape)
+  typename ThreadblockShape_,
+  /// Warp-level tile size (concept: GemmShape)
+  typename WarpShape_,
+  /// Instruction-level tile size (concept: GemmShape)
+  typename InstructionShape_,
+  /// Number of stages used in the pipelined mainloop
+  int Stages,
+  /// Complex transformation on operand A
+  ComplexTransform TransformA = ComplexTransform::kNone,
+  /// Complex transformation on operand B
+  ComplexTransform TransformB = ComplexTransform::kNone,
+  /// Math operator tag (e.g. arch::OpMultiplyAdd)
+  typename Operator = arch::OpMultiplyAdd
+>
+struct DefaultMmaPlanarComplexPipelined {
+  // Derives the planar complex pipelined ThreadblockMma type from the DefaultMma selection for the same arguments.
+  // Real-valued threadblock MMA picked by DefaultMma; its iterator and policy types are reused below
+  using RealMma = typename DefaultMma<
+    ElementA_,
+    LayoutA_,
+    kAlignmentA,
+    ElementB_,
+    LayoutB_,
+    kAlignmentB,
+    ElementAccumulator_,
+    LayoutC_,
+    OperatorClass_,
+    ArchTag_,
+    ThreadblockShape_,
+    WarpShape_,
+    InstructionShape_,
+    Stages,
+    Operator
+  >::ThreadblockMma;
+  // Planar complex mainloop assembled from RealMma's iterators and policy plus the two complex transforms
+  using ThreadblockMma = MmaPlanarComplexPipelined<
+    ThreadblockShape_,
+    typename RealMma::IteratorA,
+    typename RealMma::SmemIteratorA,
+    typename RealMma::IteratorB,
+    typename RealMma::SmemIteratorB,
+    ElementAccumulator_,
+    LayoutC_,
+    typename RealMma::Policy,
+    Stages,
+    TransformA,
+    TransformB
+  >;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_softmax_mainloop_fusion.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_softmax_mainloop_fusion.h
new file mode 100755
index 000000000..f50d36a4b
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_softmax_mainloop_fusion.h
@@ -0,0 +1,160 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a pipelined softmax-GEMM kernel.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/arch.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/gemm/threadblock/default_mma_core.h"
+#include "cutlass/gemm/threadblock/mma_softmax_mainloop_fusion_multistage.h"
+#include "cutlass/transform/threadblock/predicated_scale_bias_vector_iterator.h"
+#include "cutlass/transform/threadblock/predicated_scale_bias_vector_access_iterator.h"
+#include "cutlass/transform/threadblock/regular_scale_bias_vector_access_iterator.h"
+#include "cutlass/gemm/warp/scale_bias_tile_iterator.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for Scale/Bias vectors
+    typename ElementScaleBias,
+    /// Layout type for Scale/Bias vectors
+    typename LayoutScaleBias,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Layout type for C and D matrix operands (not referenced in this struct; layout::RowMajor is used below)
+    typename LayoutC,
+    /// Operator class tag (not referenced in this struct; arch::OpClassTensorOp is used below)
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for (not referenced in this struct)
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Whether problem has been transformed. This determines to which operand
+    /// the softmax is applied.
+    bool InternalTranspose,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Store the accumulators in row major or column major.  Row major is used
+    /// when output layout is interleaved. (Not referenced in this struct.)
+    bool AccumulatorsInRowMajor = false,
+    /// Use zfill or predicate for SM80 out-of-bound cp.async
+    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone
+    >
+struct DefaultMmaSoftmaxMainloopFusion {
+  // Bundles the cache operations, tile iterators and ThreadblockMma type for the softmax-fused multistage mainloop.
+  static cutlass::arch::CacheOperation::Kind const CacheOpA =
+      ((sizeof_bits<ElementA>::value * kAlignmentA) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+  // Same rule as CacheOpA, applied to B: Global when one access (element bits * alignment) is 128 bits, else Always.
+  static cutlass::arch::CacheOperation::Kind const CacheOpB =
+      ((sizeof_bits<ElementB>::value * kAlignmentB) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+  // Mirrors CacheOpA; nothing else in this struct references it.
+  static cutlass::arch::CacheOperation::Kind const CacheOpGammaBeta = CacheOpA;
+
+  // Define the MmaCore components (accumulator layout fixed to RowMajor, operator class fixed to OpClassTensorOp)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, Operator, false, CacheOpA, CacheOpB>;
+
+  // Define iterators over tiles from the A operand (kM x kK threadblock tile, kAlignmentA elements per access)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::Array<ElementA, kAlignmentA>;
+  using IteratorA =
+      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+          ElementA, LayoutA, 1, ThreadMapA, AccessTypeA>;
+
+  // Define iterators over tiles from the B operand (kK x kN threadblock tile, kAlignmentB elements per access)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::Array<ElementB, kAlignmentB>;
+  using IteratorB =
+      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+          ElementB, LayoutB, 0, ThreadMapB, AccessTypeB>;
+
+  /// Define the iterator for the vectors this struct calls norm/sum, using a 1 x WarpShape::kN tile shape
+  using IteratorNormSum =
+      cutlass::transform::threadblock::PredicatedScaleBiasVectorIterator<
+          cutlass::MatrixShape<1, WarpShape::kN>,
+          ElementScaleBias,
+          LayoutScaleBias>;
+
+  // Define the threadblock-scoped multistage matrix multiply (A/B cache ops come from MmaCore::kCacheOpA/kCacheOpB)
+  using ThreadblockMma = cutlass::gemm::threadblock::MmaSoftmaxMainloopFusionMultistage<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
+      MmaCore::kCacheOpB, IteratorNormSum,
+      ElementAccumulator, layout::RowMajor,
+      typename MmaCore::MmaPolicy, Stages, InternalTranspose, SharedMemoryClear>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass 
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_with_reduction.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_with_reduction.h
new file mode 100755
index 000000000..677c11443
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_with_reduction.h
@@ -0,0 +1,141 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a multistage GEMM kernel whose mainloop also performs a reduction. Does not compute batching or support split-K.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/arch.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h"
+#include "cutlass/gemm/threadblock/default_mma_core_with_reduction.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Layout type for C and D matrix operands (not referenced in this struct; layout::RowMajor is used below)
+    typename LayoutC,
+    /// Operator class tag (not referenced in this struct; arch::OpClassTensorOp is used below)
+    typename OperatorClass,
+    /// Forwarded to DefaultMmaWithReductionCore. NOTE(review): presumably reduces A (rather than B) along K - confirm
+    bool ReduceKForA_,
+    /// Tag indicating architecture to tune for (not referenced in this struct)
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Store the accumulators in row major or column major.  Row major is used
+    /// when output layout is interleaved. (Not referenced in this struct.)
+    bool AccumulatorsInRowMajor = false,
+    /// Use zfill or predicate for SM80 out-of-bound cp.async
+    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone
+    >
+struct DefaultMmaWithReduction {
+  // Bundles the cache operations, tile iterators and ThreadblockMma type for the multistage mainloop with reduction.
+  static cutlass::arch::CacheOperation::Kind const CacheOpA =
+      ((sizeof_bits<ElementA>::value * kAlignmentA) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+  // Same rule as CacheOpA, applied to B: Global when one access (element bits * alignment) is 128 bits, else Always.
+  static cutlass::arch::CacheOperation::Kind const CacheOpB =
+      ((sizeof_bits<ElementB>::value * kAlignmentB) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  // Define the MmaCore components (accumulator layout fixed to RowMajor, operator class fixed to OpClassTensorOp)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaWithReductionCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      ReduceKForA_,  Stages, Operator, false, CacheOpA, CacheOpB>;
+
+  // Define iterators over tiles from the A operand (kM x kK threadblock tile, kAlignmentA elements per access)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::Array<ElementA, kAlignmentA>;
+  using IteratorA =
+      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+          ElementA, LayoutA, 1, ThreadMapA, AccessTypeA>;
+
+  // Define iterators over tiles from the B operand (kK x kN threadblock tile, kAlignmentB elements per access)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::Array<ElementB, kAlignmentB>;
+  using IteratorB =
+      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+          ElementB, LayoutB, 0, ThreadMapB, AccessTypeB>;
+
+  // Define the threadblock-scoped multistage matrix multiply (A/B cache ops come from MmaCore::kCacheOpA/kCacheOpB)
+  using ThreadblockMma = cutlass::gemm::threadblock::MmaWithReductionMultistage<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
+      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
+      typename MmaCore::MmaPolicy, Stages, SharedMemoryClear>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass 
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h
new file mode 100755
index 000000000..7f249780d
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h
@@ -0,0 +1,159 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Template for a multistage GEMM kernel. Does not compute batching or support split-K.
+*/
+
+#pragma once
+
+#include "cutlass/arch/arch.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+#include "cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Operator class tag
+    typename OperatorClass_,
+    /// Tag indicating architecture to tune for
+    typename ArchTag_,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_,
+    /// Number of stages used in the multistage mainloop
+    int Stages,
+    /// Complex transformation on operand A
+    ComplexTransform TransformA = ComplexTransform::kNone,
+    /// Complex transformation on operand B
+    ComplexTransform TransformB = ComplexTransform::kNone,
+    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
+    typename Operator = arch::OpMultiplyAddComplex,
+    /// Store the accumulators in row major or column major (row major is used when the
+    /// output layout is interleaved). The row-major-output specialization keeps the default.
+    bool AccumulatorsInRowMajor = false>
+struct DefaultMultistageMmaComplex;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for row-major output (LayoutC fixed to layout::RowMajor)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the multistage mainloop
+    int Stages,
+    /// Complex transformation on operand A
+    ComplexTransform TransformA,
+    /// Complex transformation on operand B
+    ComplexTransform TransformB,
+    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
+    typename Operator>
+struct DefaultMultistageMmaComplex<ElementA, LayoutA, ElementB, LayoutB,
+                            ElementAccumulator, layout::RowMajor, OperatorClass,
+                            ArchTag, ThreadblockShape, WarpShape,
+                            InstructionShape, Stages, TransformA, TransformB, Operator> {
+  // Define the MmaCore components (thread maps, shared-memory iterators, cache ops, policy)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplexCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, 
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, OperatorClass,
+      Stages, TransformA, TransformB, Operator>;
+
+  // Define iterators over tiles from the A operand; one access spans ThreadMapA::kElementsPerAccess elements
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::Array<ElementA, ThreadMapA::kElementsPerAccess>;
+  using IteratorA =
+      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+          ElementA, LayoutA, 1, ThreadMapA, AccessTypeA>;
+
+  // Define iterators over tiles from the B operand; one access spans ThreadMapB::kElementsPerAccess elements
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::Array<ElementB, ThreadMapB::kElementsPerAccess>;
+  using IteratorB =
+      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+          ElementB, LayoutB, 0, ThreadMapB, AccessTypeB>;
+
+  // Define the threadblock-scoped multistage matrix multiply from the MmaCore pieces
+  using ThreadblockMma = cutlass::gemm::threadblock::MmaMultistage<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
+      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
+      typename MmaCore::MmaPolicy, Stages>;
+};
+
+}  // namespace threadblock
+}  // namespace gemm
+}  // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core.h
new file mode 100755
index 000000000..cab2a96ae
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core.h
@@ -0,0 +1,119 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines basic properties needed by CTA-level GEMMs assuming
+   expectations about data layout of the global memory fragments, data types,
+   and internal tile sizes.
+
+      Partial specializations for threadblock::Mma operations targeting TensorOp
+   instructions.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/complex.h"
+
+#include "cutlass/layout/tensor_op_multiplicand_sm75.h"
+#include "cutlass/layout/tensor_op_multiplicand_sm80.h"
+
+#include "cutlass/gemm/warp/mma_simt_policy.h"
+#include "cutlass/gemm/warp/mma_simt.h"
+#include "cutlass/gemm/warp/default_mma_tensor_op.h"
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h"
+
+#include "cutlass/gemm/threadblock/default_mma_core.h"
+
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/transform/pitch_linear_thread_map.h"
+
+#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h"
+#include "cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h"
+#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Template defining default matrix multiply operators inferred from
+/// threadblock tile size, global memory data layout, and target math
+/// instruction. Declared here only; partial specializations supply the body.
+template <
+    /// Shape of threadblock-scoped matrix multiply operator
+    typename Shape,
+    /// Shape of warp-level matrix multiply operator
+    typename WarpShape,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape,
+    /// Element data type of A operand
+    typename ElementA,
+    /// Layout of operand A
+    typename LayoutA,
+    /// Element data type of B operand
+    typename ElementB,
+    /// Layout of operand B
+    typename LayoutB,
+    /// Data type of accumulator
+    typename ElementC,
+    /// Layout of accumulator
+    typename LayoutC,
+    /// Indicates type of math operator (arch::OpClassSimt or arch::OpClassTensorOp)
+    typename OperatorClass,
+    /// Number of stages
+    int Stages,
+    /// Complex transformation on operand A
+    ComplexTransform TransformA,
+    /// Complex transformation on operand B
+    ComplexTransform TransformB,
+    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
+    typename Operator = arch::OpMultiplyAddComplex,
+    /// Cache operation of operand A (defaults to CacheOperation::Global)
+    cutlass::arch::CacheOperation::Kind CacheOpA =
+        cutlass::arch::CacheOperation::Global,
+    /// Cache operation of operand B (defaults to CacheOperation::Global)
+    cutlass::arch::CacheOperation::Kind CacheOpB =
+        cutlass::arch::CacheOperation::Global>
+struct DefaultMultistageMmaComplexCore;
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace gemm
+}  // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h
new file mode 100755
index 000000000..33150314a
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h
@@ -0,0 +1,1808 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines basic properties needed by CTA-level GEMMs assuming
+   expectations about data layout of the global memory fragments, data types,
+   and internal tile sizes.
+
+      Partial specializations for threadblock::Mma operations targeting TensorOp
+   instructions.
+*/
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+
+#include "cutlass/layout/tensor_op_multiplicand_sm75.h"
+#include "cutlass/layout/tensor_op_multiplicand_sm80.h"
+
+#include "cutlass/gemm/warp/mma_simt_policy.h"
+#include "cutlass/gemm/warp/mma_simt.h"
+#include "cutlass/gemm/warp/default_mma_complex_tensor_op.h"
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h"
+
+#include "cutlass/gemm/threadblock/default_multistage_mma_complex_core.h"
+
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/transform/pitch_linear_thread_map.h"
+#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h"
+#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h"
+#include "cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h"
+#include "cutlass/gemm/threadblock/mma_multistage.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for complex double-precision
+///
+///   A: column-major
+///   B: row-major
+///   Operator: arch::OpMultiplyAddComplex or arch::OpMultiplyGaussianComplex
+///
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Number of stages
+    int Stages,
+    /// Complex transformation on operand A
+    ComplexTransform TransformA,
+    /// Complex transformation on operand B
+    ComplexTransform TransformB,
+    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
+    typename Operator_,
+    /// Cache operation of operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation of operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB>
+struct DefaultMultistageMmaComplexCore<
+    Shape_, WarpShape_, InstructionShape_, 
+    complex<double>, layout::ColumnMajor,
+    complex<double>, layout::RowMajor,
+    complex<double>, LayoutC_, 
+    arch::OpClassTensorOp,
+    Stages,
+    TransformA, TransformB,
+    Operator_,
+    CacheOpA, CacheOpB> {
+
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = complex<double>;
+  using LayoutA = layout::ColumnMajor;
+  using ElementB = complex<double>;
+  using LayoutB = layout::RowMajor;
+  using ElementC = complex<double>;
+  using LayoutC = LayoutC_;
+  static int const kStages = Stages;
+  static ComplexTransform const kTransformA = TransformA;
+  static ComplexTransform const kTransformB = TransformB;
+  using Operator = Operator_;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
+  // NOTE(review): CacheOpA/CacheOpB are not consulted; both constants are pinned to Always - confirm intended.
+  /// Number of warps present
+  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
+                              Shape::kN / WarpShape::kN, 
+                              Shape::kK / WarpShape::kK>;
+
+  // Divisibility requirements (only the M and N extents are checked here)
+  static_assert(
+      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
+      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
+
+  static_assert(WarpCount::kCount > 1,
+    "This specialization requires at least two warps.");
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Number of threads total
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size in bits of one threadblock-scoped access (divided by the element width below)
+  static int const kAccessSizeInBits = 128;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous128b;
+
+  using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous128b;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A (pitch-linear extent kM x kK)
+  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kM, Shape::kK>, kThreads,
+      layout::PitchLinearShape<8, 4>,
+      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 1,
+      IteratorThreadMapA>;
+
+  /// ThreadMap of iterator B (pitch-linear extent kN x kK)
+  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kN, Shape::kK>, kThreads,
+      layout::PitchLinearShape<8, 4>,
+      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 0,
+      IteratorThreadMapB>;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level tensor op
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp<
+      WarpShape, InstructionShape, 
+      ElementA, SmemLayoutA, 
+      ElementB, SmemLayoutB,
+      ElementC, LayoutC, 
+      kTransformA, kTransformB,
+      Operator>::Type;
+
+  /// Policy passed to the threadblock-scoped MMA (MmaMultistage in DefaultMultistageMmaComplex)
+  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
+                                        MatrixShape<0, 0>, WarpCount::kK>;
+};
+
+
+/// Partial specialization for complex double-precision
+///
+///   A: column-major
+///   B: column-major
+///   Operator: arch::OpMultiplyAddComplex or arch::OpMultiplyGaussianComplex
+///
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Number of stages
+    int Stages,
+    /// Complex transformation on operand A
+    ComplexTransform TransformA,
+    /// Complex transformation on operand B
+    ComplexTransform TransformB,
+    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
+    typename Operator_,
+    /// Cache operation of operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation of operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB>
+struct DefaultMultistageMmaComplexCore<
+    Shape_, WarpShape_, InstructionShape_, 
+    complex<double>, layout::ColumnMajor,
+    complex<double>, layout::ColumnMajor,
+    complex<double>, LayoutC_, 
+    arch::OpClassTensorOp,
+    Stages, 
+    TransformA, TransformB,
+    Operator_, 
+    CacheOpA, CacheOpB> {
+
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = complex<double>;
+  using LayoutA = layout::ColumnMajor;
+  using ElementB = complex<double>;
+  using LayoutB = layout::ColumnMajor;
+  using ElementC = complex<double>;
+  using LayoutC = LayoutC_;
+  static int const kStages = Stages;
+  using Operator = Operator_;
+  static ComplexTransform const kTransformA = TransformA;
+  static ComplexTransform const kTransformB = TransformB;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
+  // NOTE(review): CacheOpA/CacheOpB are not consulted; both constants are pinned to Always - confirm intended.
+  /// Number of warps present
+  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
+                              Shape::kN / WarpShape::kN, 
+                              Shape::kK / WarpShape::kK>;
+
+  // Divisibility requirements (only the M and N extents are checked here)
+  static_assert(
+      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
+      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
+
+  static_assert(WarpCount::kCount > 1,
+    "This specialization requires at least two warps.");
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Number of threads total
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size in bits of one threadblock-scoped access (divided by the element width below)
+  static int const kAccessSizeInBits = 128;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous128b;
+  using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise128x4;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A (pitch-linear extent kM x kK)
+  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kM, Shape::kK>, kThreads,
+      layout::PitchLinearShape<8, 4>,
+      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 1,
+      IteratorThreadMapA>;
+
+  /// ThreadMap of iterator B (pitch-linear extent kK x kN)
+  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kK, Shape::kN>, kThreads,
+      layout::PitchLinearShape<8, 4>,
+      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 0,
+      IteratorThreadMapB>;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level tensor op
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp<
+      WarpShape, InstructionShape, 
+      ElementA, SmemLayoutA, 
+      ElementB, SmemLayoutB,
+      ElementC, LayoutC, 
+      kTransformA, kTransformB,
+      Operator>::Type;
+
+  /// Policy passed to the threadblock-scoped MMA (MmaMultistage in DefaultMultistageMmaComplex)
+  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
+                                        MatrixShape<0, 0>, WarpCount::kK>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for complex double-precision
+///
+///   A: row-major
+///   B: column-major
+///   Operator: arch::OpMultiplyAddComplex or arch::OpMultiplyGaussianComplex
+///
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Number of stages
+    int Stages,
+    /// Complex transformation on operand A
+    ComplexTransform TransformA,
+    /// Complex transformation on operand B
+    ComplexTransform TransformB,
+    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
+    typename Operator_,
+    /// Cache operation of operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation of operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB>
+struct DefaultMultistageMmaComplexCore<
+    Shape_, WarpShape_, InstructionShape_, 
+    complex<double>, layout::RowMajor,
+    complex<double>, layout::ColumnMajor,
+    complex<double>, LayoutC_, 
+    arch::OpClassTensorOp,
+    Stages,
+    TransformA, TransformB,
+    Operator_, 
+    CacheOpA, CacheOpB> {
+
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = complex<double>;
+  using LayoutA = layout::RowMajor;
+  using ElementB = complex<double>;
+  using LayoutB = layout::ColumnMajor;
+  using ElementC = complex<double>;
+  using LayoutC = LayoutC_;
+  static int const kStages = Stages;
+  static ComplexTransform const kTransformA = TransformA;
+  static ComplexTransform const kTransformB = TransformB;
+  using Operator = Operator_;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
+  // NOTE(review): CacheOpA/CacheOpB are not consulted; both constants are pinned to Always - confirm intended.
+  /// Number of warps present
+  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
+                              Shape::kN / WarpShape::kN, 
+                              Shape::kK / WarpShape::kK>;
+
+  // Divisibility requirements (only the M and N extents are checked here)
+  static_assert(
+      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
+      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
+  
+  static_assert(WarpCount::kCount > 1,
+    "This specialization requires at least two warps.");
+
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Number of threads total
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size in bits of one threadblock-scoped access (divided by the element width below)
+  static int const kAccessSizeInBits = 128;
+
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise128x4;
+  using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise128x4;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A (pitch-linear extent kK x kM)
+  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kK, Shape::kM>, kThreads,
+      layout::PitchLinearShape<8, 4>,
+      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 1,
+      IteratorThreadMapA>;
+
+  /// ThreadMap of iterator B (pitch-linear extent kK x kN)
+  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kK, Shape::kN>, kThreads,
+      layout::PitchLinearShape<8, 4>,
+      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 0,
+      IteratorThreadMapB>;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level tensor op
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp<
+      WarpShape, InstructionShape, 
+      ElementA, SmemLayoutA, 
+      ElementB, SmemLayoutB,
+      ElementC, LayoutC, 
+      kTransformA, kTransformB,
+      Operator>::Type;
+
+  /// Policy passed to the threadblock-scoped MMA (MmaMultistage in DefaultMultistageMmaComplex)
+  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
+                                        MatrixShape<0, 0>, WarpCount::kK>;
+};
+
+
+/// Partial specialization for complex double-precision (complex<double>) tensor-op GEMM
+///
+///   A: row-major
+///   B: row-major
+///   Operator: arch::OpMultiplyAddComplex or arch::OpMultiplyGaussianComplex
+///
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Number of stages
+    int Stages,
+    /// Complex transformation on operand A
+    ComplexTransform TransformA,
+    /// Complex transformation on operand B
+    ComplexTransform TransformB,
+    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
+    typename Operator_,    
+    /// Cache operation of operand A (not consulted: kCacheOpA below is fixed to Always)
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation of operand B (not consulted: kCacheOpB below is fixed to Always)
+    cutlass::arch::CacheOperation::Kind CacheOpB>
+struct DefaultMultistageMmaComplexCore<
+    Shape_, WarpShape_, InstructionShape_, 
+    complex<double>, layout::RowMajor,
+    complex<double>, layout::RowMajor,
+    complex<double>, LayoutC_, 
+    arch::OpClassTensorOp,
+    Stages, 
+    TransformA, TransformB, 
+    Operator_,
+    CacheOpA, CacheOpB> {
+
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = complex<double>;
+  using LayoutA = layout::RowMajor;
+  using ElementB = complex<double>;
+  using LayoutB = layout::RowMajor;
+  using ElementC = complex<double>;
+  using LayoutC = LayoutC_;
+  static int const kStages = Stages;
+  static ComplexTransform const kTransformA = TransformA;
+  static ComplexTransform const kTransformB = TransformB;
+  using Operator = Operator_;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
+
+  /// Number of warps present along M, N and K (threadblock tile / warp tile)
+  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
+                              Shape::kN / WarpShape::kN, 
+                              Shape::kK / WarpShape::kK>;
+
+  // Divisibility requirements (only the M and N extents are checked here)
+  static_assert(
+      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
+      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
+  
+  static_assert(WarpCount::kCount > 1,
+    "This specialization requires at least two warps.");
+
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Number of threads total
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size of a single access, in bits
+  static int const kAccessSizeInBits = 128;
+
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise128x4;
+  using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous128b;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A (last argument is the number of elements per access)
+  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kK, Shape::kM>, kThreads,
+      layout::PitchLinearShape<8, 4>,
+      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 1,
+      IteratorThreadMapA>;
+
+  /// ThreadMap of iterator B (last argument is the number of elements per access)
+  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kN, Shape::kK>, kThreads,
+      layout::PitchLinearShape<8, 4>,
+      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 0,
+      IteratorThreadMapB>;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level complex tensor op from the shared-memory layouts chosen above
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp<
+      WarpShape, InstructionShape, 
+      ElementA, SmemLayoutA, 
+      ElementB, SmemLayoutB,
+      ElementC, LayoutC, 
+      kTransformA, kTransformB,
+      Operator>::Type;
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
+                                        MatrixShape<0, 0>, WarpCount::kK>;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for complex single-precision (complex<float>) tensor-op GEMM
+///
+///   A: column-major
+///   B: column-major
+///   Operator: arch::OpMultiplyAddComplex
+///   Math Instruction: mma.sync.aligned.m16n8k8.f32.tf32.tf32.f32
+///
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Number of stages
+    int Stages,
+    /// Complex transformation on operand A
+    ComplexTransform TransformA,
+    /// Complex transformation on operand B
+    ComplexTransform TransformB,
+    /// Multiply-add operator (arch::OpMultiplyAddComplex)
+    typename Operator_,
+    /// Cache operation of operand A (not consulted: kCacheOpA below is fixed to Always)
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation of operand B (not consulted: kCacheOpB below is fixed to Always)
+    cutlass::arch::CacheOperation::Kind CacheOpB>
+struct DefaultMultistageMmaComplexCore<
+    Shape_, WarpShape_, GemmShape<16, 8, 8>, 
+    complex<float>, layout::ColumnMajor,
+    complex<float>, layout::ColumnMajor,
+    complex<float>, LayoutC_, 
+    arch::OpClassTensorOp,
+    Stages,
+    TransformA, TransformB,
+    Operator_,
+    CacheOpA, CacheOpB> {
+
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = GemmShape<16, 8, 8>;
+  using ElementA = complex<float>;
+  using LayoutA = layout::ColumnMajor;
+  using ElementB = complex<float>;
+  using LayoutB = layout::ColumnMajor;
+  using ElementC = complex<float>;
+  using LayoutC = LayoutC_;
+  static int const kStages = Stages;
+  static ComplexTransform const kTransformA = TransformA;
+  static ComplexTransform const kTransformB = TransformB;
+  using Operator = Operator_;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
+
+  /// Number of warps present along M, N and K (threadblock tile / warp tile)
+  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
+                              Shape::kN / WarpShape::kN, 
+                              Shape::kK / WarpShape::kK>;
+
+  // Divisibility requirements (only the M and N extents are checked here)
+  static_assert(
+      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
+      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
+
+  static_assert(WarpCount::kCount > 1,
+    "This specialization requires at least two warps.");
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Number of threads total
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size of a single access, in bits
+  static int const kAccessSizeInBits = 64;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous64b;
+
+  using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicand64bCrosswise;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A (last argument is the number of elements per access)
+  using IteratorThreadMapA = transform::PitchLinearWarpStripedThreadMap<
+      layout::PitchLinearShape<Shape::kM, Shape::kK>, kThreads,
+      layout::PitchLinearShape<16, 2>,
+      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 1,
+      IteratorThreadMapA>;
+
+  /// ThreadMap of iterator B (last argument is the number of elements per access)
+  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kK, Shape::kN>, kThreads,
+      layout::PitchLinearShape<16, 2>,
+      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 0,
+      IteratorThreadMapB>;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level complex tensor op from the shared-memory layouts chosen above
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp<
+      WarpShape, InstructionShape, 
+      ElementA, SmemLayoutA, 
+      ElementB, SmemLayoutB,
+      ElementC, LayoutC, 
+      kTransformA, kTransformB,
+      Operator>::Type;
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
+                                        MatrixShape<0, 0>, WarpCount::kK>;
+};
+
+
+/// Partial specialization for complex single-precision (complex<float>) tensor-op GEMM
+///
+///   A: column-major
+///   B: row-major
+///   Operator: arch::OpMultiplyAddComplex
+///   Math Instruction: mma.sync.aligned.m16n8k8.f32.tf32.tf32.f32
+///
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Number of stages
+    int Stages,
+    /// Complex transformation on operand A
+    ComplexTransform TransformA,
+    /// Complex transformation on operand B
+    ComplexTransform TransformB,
+    /// Multiply-add operator (arch::OpMultiplyAddComplex)
+    typename Operator_,
+    /// Cache operation of operand A (not consulted: kCacheOpA below is fixed to Always)
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation of operand B (not consulted: kCacheOpB below is fixed to Always)
+    cutlass::arch::CacheOperation::Kind CacheOpB>
+struct DefaultMultistageMmaComplexCore<
+    Shape_, WarpShape_, GemmShape<16, 8, 8>, 
+    complex<float>, layout::ColumnMajor,
+    complex<float>, layout::RowMajor,
+    complex<float>, LayoutC_, 
+    arch::OpClassTensorOp,
+    Stages,
+    TransformA, TransformB,
+    Operator_,
+    CacheOpA, CacheOpB> {
+
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = GemmShape<16, 8, 8>;
+  using ElementA = complex<float>;
+  using LayoutA = layout::ColumnMajor;
+  using ElementB = complex<float>;
+  using LayoutB = layout::RowMajor;
+  using ElementC = complex<float>;
+  using LayoutC = LayoutC_;
+  static int const kStages = Stages;
+  static ComplexTransform const kTransformA = TransformA;
+  static ComplexTransform const kTransformB = TransformB;
+  using Operator = Operator_;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
+
+  /// Number of warps present along M, N and K (threadblock tile / warp tile)
+  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
+                              Shape::kN / WarpShape::kN, 
+                              Shape::kK / WarpShape::kK>;
+
+  // Divisibility requirements (only the M and N extents are checked here)
+  static_assert(
+      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
+      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
+
+  static_assert(WarpCount::kCount > 1,
+    "This specialization requires at least two warps.");
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Number of threads total
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size of a single access, in bits
+  static int const kAccessSizeInBits = 64;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous64b;
+
+  using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous64b;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A (last argument is the number of elements per access)
+  using IteratorThreadMapA = transform::PitchLinearWarpStripedThreadMap<
+      layout::PitchLinearShape<Shape::kM, Shape::kK>, kThreads,
+      layout::PitchLinearShape<16, 2>,
+      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 1,
+      IteratorThreadMapA>;
+
+  /// ThreadMap of iterator B (last argument is the number of elements per access)
+  using IteratorThreadMapB = transform::PitchLinearWarpStripedThreadMap<
+      layout::PitchLinearShape<Shape::kN, Shape::kK>, kThreads,
+      layout::PitchLinearShape<16, 2>,
+      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 0,
+      IteratorThreadMapB>;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level complex tensor op from the shared-memory layouts chosen above
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp<
+      WarpShape, InstructionShape, 
+      ElementA, SmemLayoutA, 
+      ElementB, SmemLayoutB,
+      ElementC, LayoutC, 
+      kTransformA, kTransformB,
+      Operator>::Type;
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
+                                        MatrixShape<0, 0>, WarpCount::kK>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for complex single-precision (complex<float>) tensor-op GEMM
+///
+///   A: row-major
+///   B: column-major
+///   Operator: arch::OpMultiplyAddComplex
+///   Math Instruction: mma.sync.aligned.m16n8k8.f32.tf32.tf32.f32
+///
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Number of stages
+    int Stages,
+    /// Complex transformation on operand A
+    ComplexTransform TransformA,
+    /// Complex transformation on operand B
+    ComplexTransform TransformB,
+    /// Multiply-add operator (arch::OpMultiplyAddComplex)
+    typename Operator_,
+    /// Cache operation of operand A (not consulted: kCacheOpA below is fixed to Always)
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation of operand B (not consulted: kCacheOpB below is fixed to Always)
+    cutlass::arch::CacheOperation::Kind CacheOpB>
+struct DefaultMultistageMmaComplexCore<
+    Shape_, WarpShape_, GemmShape<16, 8, 8>, 
+    complex<float>, layout::RowMajor,
+    complex<float>, layout::ColumnMajor,
+    complex<float>, LayoutC_, 
+    arch::OpClassTensorOp,
+    Stages,
+    TransformA, TransformB,
+    Operator_,
+    CacheOpA, CacheOpB> {
+
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = GemmShape<16, 8, 8>;
+  using ElementA = complex<float>;
+  using LayoutA = layout::RowMajor;
+  using ElementB = complex<float>;
+  using LayoutB = layout::ColumnMajor;
+  using ElementC = complex<float>;
+  using LayoutC = LayoutC_;
+  static int const kStages = Stages;
+  static ComplexTransform const kTransformA = TransformA;
+  static ComplexTransform const kTransformB = TransformB;
+  using Operator = Operator_;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
+
+  /// Number of warps present along M, N and K (threadblock tile / warp tile)
+  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
+                              Shape::kN / WarpShape::kN, 
+                              Shape::kK / WarpShape::kK>;
+
+  // Divisibility requirements (only the M and N extents are checked here)
+  static_assert(
+      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
+      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
+
+  static_assert(WarpCount::kCount > 1,
+    "This specialization requires at least two warps.");
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Number of threads total
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size of a single access, in bits
+  static int const kAccessSizeInBits = 64;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::RowMajorTensorOpMultiplicand64bCrosswise;
+
+  using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicand64bCrosswise;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A (last argument is the number of elements per access)
+  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kK, Shape::kM>, kThreads,
+      layout::PitchLinearShape<16, 2>,
+      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 1,
+      IteratorThreadMapA>;
+
+  /// ThreadMap of iterator B (last argument is the number of elements per access)
+  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kK, Shape::kN>, kThreads,
+      layout::PitchLinearShape<16, 2>,
+      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 0,
+      IteratorThreadMapB>;
+      
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level complex tensor op from the shared-memory layouts chosen above
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp<
+      WarpShape, InstructionShape, 
+      ElementA, SmemLayoutA, 
+      ElementB, SmemLayoutB,
+      ElementC, LayoutC, 
+      kTransformA, kTransformB,
+      Operator>::Type;
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
+                                        MatrixShape<0, 0>, WarpCount::kK>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for complex single-precision (complex<float>) tensor-op GEMM
+///
+///   A: row-major
+///   B: row-major
+///   Operator: arch::OpMultiplyAddComplex
+///   Math Instruction: mma.sync.aligned.m16n8k8.f32.tf32.tf32.f32
+///
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Number of stages
+    int Stages,
+    /// Complex transformation on operand A
+    ComplexTransform TransformA,
+    /// Complex transformation on operand B
+    ComplexTransform TransformB,
+    /// Multiply-add operator (arch::OpMultiplyAddComplex)
+    typename Operator_,
+    /// Cache operation of operand A (not consulted: kCacheOpA below is fixed to Always)
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation of operand B (not consulted: kCacheOpB below is fixed to Always)
+    cutlass::arch::CacheOperation::Kind CacheOpB>
+struct DefaultMultistageMmaComplexCore<
+    Shape_, WarpShape_, GemmShape<16, 8, 8>, 
+    complex<float>, layout::RowMajor,
+    complex<float>, layout::RowMajor,
+    complex<float>, LayoutC_, 
+    arch::OpClassTensorOp,
+    Stages,
+    TransformA, TransformB,
+    Operator_,
+    CacheOpA, CacheOpB> {
+
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = GemmShape<16, 8, 8>;
+  using ElementA = complex<float>;
+  using LayoutA = layout::RowMajor;
+  using ElementB = complex<float>;
+  using LayoutB = layout::RowMajor;
+  using ElementC = complex<float>;
+  using LayoutC = LayoutC_;
+  static int const kStages = Stages;
+  static ComplexTransform const kTransformA = TransformA;
+  static ComplexTransform const kTransformB = TransformB;
+  using Operator = Operator_;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
+
+  /// Number of warps present along M, N and K (threadblock tile / warp tile)
+  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
+                              Shape::kN / WarpShape::kN, 
+                              Shape::kK / WarpShape::kK>;
+
+  // Divisibility requirements (only the M and N extents are checked here)
+  static_assert(
+      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
+      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
+
+  static_assert(WarpCount::kCount > 1,
+    "This specialization requires at least two warps.");
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Number of threads total
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size of a single access, in bits
+  static int const kAccessSizeInBits = 64;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::RowMajorTensorOpMultiplicand64bCrosswise;
+
+  using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous64b;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A (last argument is the number of elements per access)
+  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kK, Shape::kM>, kThreads,
+      layout::PitchLinearShape<16, 2>,
+      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 1,
+      IteratorThreadMapA>;
+
+  /// ThreadMap of iterator B (last argument is the number of elements per access)
+  using IteratorThreadMapB = transform::PitchLinearWarpStripedThreadMap<
+      layout::PitchLinearShape<Shape::kN, Shape::kK>, kThreads,
+      layout::PitchLinearShape<16, 2>,
+      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 0,
+      IteratorThreadMapB>;
+      
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level complex tensor op from the shared-memory layouts chosen above
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp<
+      WarpShape, InstructionShape, 
+      ElementA, SmemLayoutA, 
+      ElementB, SmemLayoutB,
+      ElementC, LayoutC, 
+      kTransformA, kTransformB,
+      Operator>::Type;
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
+                                        MatrixShape<0, 0>, WarpCount::kK>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for complex SIMT operation (arch::OpClassSimt, 1x1x1 instruction shape)
+///
+///   A: column-major
+///   B: column-major
+///   Operator: arch::OpMultiplyAddComplex or arch::OpMultiplyGaussianComplex
+///
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    typename RealA,  ///< Operand A element type is complex<RealA>
+    typename RealB,  ///< Operand B element type is complex<RealB>
+    typename RealC,  ///< Accumulator element type is complex<RealC>
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Number of stages
+    int Stages,
+    /// Complex transformation on operand A
+    ComplexTransform TransformA,
+    /// Complex transformation on operand B
+    ComplexTransform TransformB,
+    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
+    typename Operator_,
+    /// Cache operation of operand A (not consulted: kCacheOpA below is fixed to Always)
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation of operand B (not consulted: kCacheOpB below is fixed to Always)
+    cutlass::arch::CacheOperation::Kind CacheOpB>
+struct DefaultMultistageMmaComplexCore<
+    Shape_, WarpShape_, GemmShape<1, 1, 1>, 
+    complex<RealA>, layout::ColumnMajor,
+    complex<RealB>, layout::ColumnMajor,
+    complex<RealC>, LayoutC_, 
+    arch::OpClassSimt,
+    Stages,
+    TransformA, TransformB,
+    Operator_,
+    CacheOpA, CacheOpB> {
+
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = GemmShape<1, 1, 1>;
+  using ElementA = complex<RealA>;
+  using LayoutA = layout::ColumnMajor;
+  using ElementB = complex<RealB>;
+  using LayoutB = layout::ColumnMajor;
+  using ElementC = complex<RealC>;
+  using LayoutC = LayoutC_;
+  static int const kStages = Stages;
+  static ComplexTransform const kTransformA = TransformA;
+  static ComplexTransform const kTransformB = TransformB;
+  using Operator = Operator_;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
+
+  /// Number of warps present along M, N and K (threadblock tile / warp tile)
+  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
+                              Shape::kN / WarpShape::kN, 
+                              Shape::kK / WarpShape::kK>;
+
+  // Divisibility requirements (only the M and N extents are checked here)
+  static_assert(
+      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
+      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
+
+  static_assert(WarpCount::kCount > 1,
+    "This specialization requires at least two warps.");
+
+  /// Number of threads per warp (NOTE(review): read from the OpClassTensorOp trait in this SIMT specialization)
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Number of threads total
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size of access in bits (the width of one ElementA)
+  static int const kAccessSizeInBits = sizeof_bits<ElementA>::value;
+
+  /// No vectorized accesses
+  static int const kElementsPerAccess = 1;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::ColumnMajor;
+
+  using SmemLayoutB = layout::RowMajor;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kM, Shape::kK>,
+    kThreads,
+    kElementsPerAccess
+  >;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 0,
+      IteratorThreadMapA>;
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kK, Shape::kN>,
+    kThreads,
+    kElementsPerAccess
+  >;
+
+  /// Transposed ThreadMap of iterator B, used by the shared memory iterator below
+  using SmemThreadMapB = transform::TransposePitchLinearThreadMapSimt<IteratorThreadMapB>;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 1,
+      SmemThreadMapB>;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level op: a 4x8 arrangement of threads, each owning a ThreadTileM x ThreadTileN tile
+  static const int WarpNumThreadsM = 4;
+  static const int WarpNumThreadsN = 8;
+  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
+      "WarpShape must be divisible by ThreadTile shape.");
+  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
+  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
+  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
+  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;
+  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;
+  static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM);
+  static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN);
+  // Lane shape: as many elements as fit in 128 bits, capped by the thread tile extent
+  using LaneMmaShape = cutlass::gemm::GemmShape<
+      LaneM,
+      LaneN,
+      1>;
+  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
+      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
+      cutlass::layout::RowMajorInterleaved<LaneLayout>,         // LaneLayout
+      LaneMmaShape
+  >;
+
+  using MmaWarpSimt = cutlass::gemm::warp::MmaSimt<
+    WarpShape,    /// Warp-level GEMM tile shape (concept: gemm::GemmShape<>)
+    ElementA,     /// Data type of A elements
+    SmemLayoutA,  /// Layout of A matrix (concept: MatrixLayout)
+    ElementB,     /// Data type of B elements
+    SmemLayoutB,  /// Layout of B matrix (concept: MatrixLayout)
+    ElementC,     /// Element type of C matrix
+    LayoutC,      /// Layout of C matrix (concept: MatrixLayout)
+    Policy,       /// MmaSimtPolicy defined above (thread arrangement, lane layout, lane MMA shape)
+    1,            /// 1 partition along K dimension
+    kTransformA,  /// Transform for A
+    kTransformB   /// Transform for B
+    >;            /// Used for partial specialization
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy = MmaPolicy<
+    MmaWarpSimt,
+    MatrixShape<0, 0>,
+    MatrixShape<0, Shape::kK / 32>,
+    WarpCount::kK>;
+};
+
+/// Partial specialization for complex SIMT operation
+///
+///   A: column-major
+///   B: row-major
+///   Operator: arch::OpMultiplyAddComplex or arch::OpMultiplyGaussianComplex
+///
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    typename RealA,
+    typename RealB,
+    typename RealC,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Number of stages
+    int Stages,
+    /// Complex transformation on operand A
+    ComplexTransform TransformA,
+    /// Complex transformation on operand B
+    ComplexTransform TransformB,
+    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
+    typename Operator_,
+    /// Cache operation of operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation of operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB>
+struct DefaultMultistageMmaComplexCore<
+    Shape_, WarpShape_, GemmShape<1, 1, 1>, 
+    complex<RealA>, layout::ColumnMajor,
+    complex<RealB>, layout::RowMajor,
+    complex<RealC>, LayoutC_, 
+    arch::OpClassSimt,
+    Stages,
+    TransformA, TransformB,
+    Operator_,
+    CacheOpA, CacheOpB> {
+
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = GemmShape<1, 1, 1>;
+  using ElementA = complex<RealA>;
+  using LayoutA = layout::ColumnMajor;
+  using ElementB = complex<RealB>;
+  using LayoutB = layout::RowMajor;
+  using ElementC = complex<RealC>;
+  using LayoutC = LayoutC_;
+  static int const kStages = Stages;
+  static ComplexTransform const kTransformA = TransformA;
+  static ComplexTransform const kTransformB = TransformB;
+  using Operator = Operator_;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
+
+  /// Number of warps present
+  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
+                              Shape::kN / WarpShape::kN, 
+                              Shape::kK / WarpShape::kK>;
+
+  // Divisility requirements
+  static_assert(
+      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
+      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
+
+  static_assert(WarpCount::kCount > 1,
+    "This specialization requires at least two warps.");
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Number of threads total
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size of access
+  static int const kAccessSizeInBits = sizeof_bits<ElementA>::value;
+
+  /// No vectorized accesses
+  static int const kElementsPerAccess = 1;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::ColumnMajor;
+
+  using SmemLayoutB = layout::RowMajor;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kM, Shape::kK>,
+    kThreads,
+    kElementsPerAccess
+  >;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 0,
+      IteratorThreadMapA>;
+
+  /// Policy of iterator B
+  using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kN, Shape::kK>,
+    kThreads,
+    kElementsPerAccess
+  >;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 1,
+      IteratorThreadMapB>;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level op
+  static const int WarpNumThreadsM = 4;
+  static const int WarpNumThreadsN = 8;
+  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
+      "WarpShape must be divisible by ThreadTile shape.");
+  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
+  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
+  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
+  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;
+  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;
+  static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM);
+  static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN);
+  // these should have max of thread tile also
+  using LaneMmaShape = cutlass::gemm::GemmShape<
+      LaneM,
+      LaneN,
+      1>;
+  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
+      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
+      cutlass::layout::RowMajorInterleaved<LaneLayout>,         // LaneLayout
+      LaneMmaShape
+  >;
+
+  using MmaWarpSimt = cutlass::gemm::warp::MmaSimt<
+    WarpShape,    /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8
+    ElementA,     /// Data type of A elements
+    SmemLayoutA,  /// Layout of A matrix (concept: MatrixLayout)
+    ElementB,     /// Data type of B elements
+    SmemLayoutB,  /// Layout of B matrix (concept: MatrixLayout)
+    ElementC,     /// Element type of C matrix
+    LayoutC,      /// Layout of C matrix (concept: MatrixLayout)
+    Policy,       /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
+    1,            /// 1 partition along K dimension
+    kTransformA,  /// Transform for A
+    kTransformB   /// Transform for B
+    >;            /// Used for partial specialization
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy = MmaPolicy<
+    MmaWarpSimt,
+    MatrixShape<0, 0>,
+    MatrixShape<0, 0>,    // or Shape::kK / 32
+    WarpCount::kK>;
+};
+
+/// Partial specialization for complex SIMT operation
+/// (selected for complex<RealA>/complex<RealB>/complex<RealC> elements, GemmShape<1, 1, 1> instructions, arch::OpClassSimt)
+///   A: row-major
+///   B: column-major
+///   Operator: arch::OpMultiplyAddComplex or arch::OpMultiplyGaussianComplex
+///
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    typename RealA,
+    typename RealB,
+    typename RealC,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Number of stages
+    int Stages,
+    /// Complex transformation on operand A
+    ComplexTransform TransformA,
+    /// Complex transformation on operand B
+    ComplexTransform TransformB,
+    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
+    typename Operator_,
+    /// Cache operation of operand A (not consulted below: kCacheOpA is fixed to CacheOperation::Always)
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation of operand B (not consulted below: kCacheOpB is fixed to CacheOperation::Always)
+    cutlass::arch::CacheOperation::Kind CacheOpB>
+struct DefaultMultistageMmaComplexCore<
+    Shape_, WarpShape_, GemmShape<1, 1, 1>, 
+    complex<RealA>, layout::RowMajor,
+    complex<RealB>, layout::ColumnMajor,
+    complex<RealC>, LayoutC_, 
+    arch::OpClassSimt,
+    Stages,
+    TransformA, TransformB,
+    Operator_,
+    CacheOpA, CacheOpB> {
+
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = GemmShape<1, 1, 1>;
+  using ElementA = complex<RealA>;
+  using LayoutA = layout::RowMajor;
+  using ElementB = complex<RealB>;
+  using LayoutB = layout::ColumnMajor;
+  using ElementC = complex<RealC>;
+  using LayoutC = LayoutC_;
+  static int const kStages = Stages;
+  static ComplexTransform const kTransformA = TransformA;
+  static ComplexTransform const kTransformB = TransformB;
+  using Operator = Operator_;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
+
+  /// Number of warps along M, N and K (threadblock tile divided by warp tile)
+  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
+                              Shape::kN / WarpShape::kN, 
+                              Shape::kK / WarpShape::kK>;
+
+  // Divisibility requirements (only the M and N extents are checked here)
+  static_assert(
+      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
+      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
+
+  static_assert(WarpCount::kCount > 1,
+    "This specialization requires at least two warps.");
+
+  /// Number of threads per warp (queried with the OpClassTensorOp tag even though this is the SIMT specialization)
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Number of threads total
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size of access in bits: one ElementA per access
+  static int const kAccessSizeInBits = sizeof_bits<ElementA>::value;
+
+  /// No vectorized accesses
+  static int const kElementsPerAccess = 1;
+
+  //
+  // Shared memory layouts (fixed by this specialization; they differ from LayoutA/LayoutB)
+  //
+
+  using SmemLayoutA = layout::ColumnMajor;
+
+  using SmemLayoutB = layout::RowMajor;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A over PitchLinearShape<kK, kM>, one element per access
+  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kK, Shape::kM>,
+    kThreads,
+    kElementsPerAccess
+  >;
+
+  /// Transposed ThreadMap of iterator A: LayoutA is row-major while SmemLayoutA is column-major
+  using SmemThreadMapA = transform::TransposePitchLinearThreadMapSimt<IteratorThreadMapA>;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 0,
+      SmemThreadMapA>;
+
+  /// ThreadMap of iterator B over PitchLinearShape<kK, kN>, one element per access
+  using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kK, Shape::kN>,
+    kThreads,
+    kElementsPerAccess
+  >;
+
+  /// Transposed ThreadMap of iterator B: LayoutB is column-major while SmemLayoutB is row-major
+  using SmemThreadMapB = transform::TransposePitchLinearThreadMapSimt<IteratorThreadMapB>;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 1,
+      SmemThreadMapB>;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level op: threads of a warp are arranged 4 (M) x 8 (N)
+  static const int WarpNumThreadsM = 4;
+  static const int WarpNumThreadsN = 8;
+  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
+      "WarpShape must be divisible by ThreadTile shape.");
+  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
+  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
+  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
+  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;
+  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;
+  static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM);
+  static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN);
+  // Lane tile: elements per 128 bits, capped at the thread tile by const_min above
+  using LaneMmaShape = cutlass::gemm::GemmShape<
+      LaneM,
+      LaneN,
+      1>;
+  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
+      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // Threads per warp along M and N
+      cutlass::layout::RowMajorInterleaved<LaneLayout>,         // LaneLayout
+      LaneMmaShape
+  >;
+
+  using MmaWarpSimt = cutlass::gemm::warp::MmaSimt<
+    WarpShape,    /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8
+    ElementA,     /// Data type of A elements
+    SmemLayoutA,  /// Layout of A matrix (concept: MatrixLayout)
+    ElementB,     /// Data type of B elements
+    SmemLayoutB,  /// Layout of B matrix (concept: MatrixLayout)
+    ElementC,     /// Element type of C matrix
+    LayoutC,      /// Layout of C matrix (concept: MatrixLayout)
+    Policy,       /// Policy describing the warp-level MmaSimt (the MmaSimtPolicy defined above)
+    1,            /// 1 partition along K dimension
+    kTransformA,  /// Transform for A
+    kTransformB   /// Transform for B
+    >;            /// Used for partial specialization
+
+  /// Policy bundling the warp-level operator, two MatrixShape arguments derived from Shape::kK, and the warp count along K
+  using MmaPolicy = MmaPolicy<
+    MmaWarpSimt,
+    MatrixShape<Shape::kK / 32, 0>,
+    MatrixShape<0, Shape::kK / 32>,
+    WarpCount::kK>;
+};
+
+/// Partial specialization for complex SIMT operation
+/// (selected for complex<RealA>/complex<RealB>/complex<RealC> elements, GemmShape<1, 1, 1> instructions, arch::OpClassSimt)
+///   A: row-major
+///   B: row-major
+///   Operator: arch::OpMultiplyAddComplex or arch::OpMultiplyGaussianComplex
+///
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    typename RealA,
+    typename RealB,
+    typename RealC,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Number of stages
+    int Stages,
+    /// Complex transformation on operand A
+    ComplexTransform TransformA,
+    /// Complex transformation on operand B
+    ComplexTransform TransformB,
+    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
+    typename Operator_,
+    /// Cache operation of operand A (not consulted below: kCacheOpA is fixed to CacheOperation::Always)
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation of operand B (not consulted below: kCacheOpB is fixed to CacheOperation::Always)
+    cutlass::arch::CacheOperation::Kind CacheOpB>
+struct DefaultMultistageMmaComplexCore<
+    Shape_, WarpShape_, GemmShape<1, 1, 1>, 
+    complex<RealA>, layout::RowMajor,
+    complex<RealB>, layout::RowMajor,
+    complex<RealC>, LayoutC_, 
+    arch::OpClassSimt,
+    Stages,
+    TransformA, TransformB,
+    Operator_,
+    CacheOpA, CacheOpB> {
+
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = GemmShape<1, 1, 1>;
+  using ElementA = complex<RealA>;
+  using LayoutA = layout::RowMajor;
+  using ElementB = complex<RealB>;
+  using LayoutB = layout::RowMajor;
+  using ElementC = complex<RealC>;
+  using LayoutC = LayoutC_;
+  static int const kStages = Stages;
+  static ComplexTransform const kTransformA = TransformA;
+  static ComplexTransform const kTransformB = TransformB;
+  using Operator = Operator_;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
+
+  /// Number of warps along M, N and K (threadblock tile divided by warp tile)
+  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
+                              Shape::kN / WarpShape::kN, 
+                              Shape::kK / WarpShape::kK>;
+
+  // Divisibility requirements (only the M and N extents are checked here)
+  static_assert(
+      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
+      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
+
+  static_assert(WarpCount::kCount > 1,
+    "This specialization requires at least two warps.");
+
+  /// Number of threads per warp (queried with the OpClassTensorOp tag even though this is the SIMT specialization)
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Number of threads total
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size of access in bits: one ElementA per access
+  static int const kAccessSizeInBits = sizeof_bits<ElementA>::value;
+
+  /// No vectorized accesses
+  static int const kElementsPerAccess = 1;
+
+  //
+  // Shared memory layouts (fixed by this specialization; A differs from LayoutA, B matches LayoutB)
+  //
+
+  using SmemLayoutA = layout::ColumnMajor;
+
+  using SmemLayoutB = layout::RowMajor;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A over PitchLinearShape<kK, kM>, one element per access
+  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kK, Shape::kM>,
+    kThreads,
+    kElementsPerAccess
+  >;
+
+  /// Transposed ThreadMap of iterator A: LayoutA is row-major while SmemLayoutA is column-major
+  using SmemThreadMapA = transform::TransposePitchLinearThreadMapSimt<IteratorThreadMapA>;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 0,
+      SmemThreadMapA>;
+
+  /// ThreadMap of iterator B over PitchLinearShape<kN, kK>; reused untransposed since LayoutB and SmemLayoutB are both row-major
+  using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kN, Shape::kK>,
+    kThreads,
+    kElementsPerAccess
+  >;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 1,
+      IteratorThreadMapB>;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level op: threads of a warp are arranged 4 (M) x 8 (N)
+  static const int WarpNumThreadsM = 4;
+  static const int WarpNumThreadsN = 8;
+  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
+      "WarpShape must be divisible by ThreadTile shape.");
+  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
+  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
+  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
+  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;
+  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;
+  static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM);
+  static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN);
+  // Lane tile: elements per 128 bits, capped at the thread tile by const_min above
+  using LaneMmaShape = cutlass::gemm::GemmShape<
+      LaneM,
+      LaneN,
+      1>;
+  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
+      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // Threads per warp along M and N
+      cutlass::layout::RowMajorInterleaved<LaneLayout>,         // LaneLayout
+      LaneMmaShape
+  >;
+
+  using MmaWarpSimt = cutlass::gemm::warp::MmaSimt<
+    WarpShape,    /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8
+    ElementA,     /// Data type of A elements
+    SmemLayoutA,  /// Layout of A matrix (concept: MatrixLayout)
+    ElementB,     /// Data type of B elements
+    SmemLayoutB,  /// Layout of B matrix (concept: MatrixLayout)
+    ElementC,     /// Element type of C matrix
+    LayoutC,      /// Layout of C matrix (concept: MatrixLayout)
+    Policy,       /// Policy describing the warp-level MmaSimt (the MmaSimtPolicy defined above)
+    1,            /// 1 partition along K dimension
+    kTransformA,  /// Transform for A
+    kTransformB   /// Transform for B
+    >;            /// Used for partial specialization
+
+  /// Policy bundling the warp-level operator, two MatrixShape arguments (only the A-side one is non-zero), and the warp count along K
+  using MmaPolicy = MmaPolicy<
+    MmaWarpSimt,
+    MatrixShape<Shape::kK / 32, 0>,
+    MatrixShape<0, 0>,    // or Shape::kK / 32
+    WarpCount::kK>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+
+}  // namespace threadblock
+}  // namespace gemm
+}  // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_multistage_trmm_complex.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_multistage_trmm_complex.h
new file mode 100755
index 000000000..abcb063e3
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_multistage_trmm_complex.h
@@ -0,0 +1,556 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Default threadblock-scoped multistage MMA definitions for complex-valued TRMM. Does not compute batching or support split-K.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator_triangular_matrix.h"
+#include "cutlass/gemm/threadblock/mma_blas3_multistage.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Side Mode for the kernel
+    SideMode kSideMode,
+    /// Fill Mode for the triangular matrix
+    FillMode kFillMode,
+    /// Diag Type for the triangular matrix
+    DiagType kDiagType,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Operator class tag
+    typename OperatorClass_,
+    /// Tag indicating architecture to tune for
+    typename ArchTag_,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_,
+    /// Number of stages used in the multistage mainloop
+    int Stages,
+    /// Complex transformation on operand A (defaults to ComplexTransform::kNone)
+    ComplexTransform TransformA = ComplexTransform::kNone,
+    /// Complex transformation on operand B (defaults to ComplexTransform::kNone)
+    ComplexTransform TransformB = ComplexTransform::kNone,
+    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
+    typename Operator = arch::OpMultiplyAddComplex,
+    /// Blas3 computation mode (defaults to BlasMode::kTriangular)
+    BlasMode BlasMode_ = BlasMode::kTriangular,
+    /// Store the accumulators in row major or column major.  Row major is used
+    /// when output layout is interleaved. Defaults to false.
+    bool AccumulatorsInRowMajor = false>
+struct DefaultMultistageTrmmComplex;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for row-major output; side mode, fill mode and diag type are left generic
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Side Mode for the kernel
+    SideMode kSideMode,
+    /// Fill Mode for the triangular matrix
+    FillMode kFillMode,
+    /// Diag Type for the triangular matrix
+    DiagType kDiagType,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the multistage mainloop
+    int Stages,
+    /// Complex transformation on operand A
+    ComplexTransform TransformA,
+    /// Complex transformation on operand B
+    ComplexTransform TransformB,
+    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
+    typename Operator>
+struct DefaultMultistageTrmmComplex<ElementA, LayoutA, ElementB, LayoutB,
+                            kSideMode, kFillMode, kDiagType,
+                            ElementAccumulator, layout::RowMajor, OperatorClass, ArchTag, ThreadblockShape, WarpShape,
+                            InstructionShape, Stages, TransformA, TransformB, Operator> {
+  // Define the MmaCore components (thread maps, shared-memory iterators, cache ops and MMA policy used below)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplexCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, 
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, OperatorClass,
+      Stages, TransformA, TransformB, Operator>;
+
+  // Define iterators over tiles from the A operand (receives the triangular matrix's fill mode and diag type)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::Array<ElementA, ThreadMapA::kElementsPerAccess>;
+  using IteratorA =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+          ElementA, LayoutA, 1, ThreadMapA, 
+          kSideMode, kFillMode, kDiagType, 
+          AccessTypeA>;
+
+  // Define iterators over tiles from the B operand (iterated with FillMode::kFull and DiagType::kInvalid)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::Array<ElementB, ThreadMapB::kElementsPerAccess>;
+  using IteratorB =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+          ElementB, LayoutB, 0, ThreadMapB, 
+          kSideMode, FillMode::kFull, DiagType::kInvalid,
+          AccessTypeB>;
+
+  // Define the threadblock-scoped multistage matrix multiply (MmaMultistage with SharedMemoryClearOption::kZfill)
+  using ThreadblockMma = cutlass::gemm::threadblock::MmaMultistage<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
+      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
+      typename MmaCore::MmaPolicy, Stages, SharedMemoryClearOption::kZfill>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for row-major output and right-side mode (SideMode::kRight); fill mode and diag type are left generic
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Fill Mode for the triangular matrix
+    FillMode kFillMode,
+    /// Diag Type for the triangular matrix
+    DiagType kDiagType,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the multistage mainloop
+    int Stages,
+    /// Complex transformation on operand A
+    ComplexTransform TransformA,
+    /// Complex transformation on operand B
+    ComplexTransform TransformB,
+    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
+    typename Operator>
+struct DefaultMultistageTrmmComplex<ElementA, LayoutA, ElementB, LayoutB,
+                            SideMode::kRight, kFillMode, kDiagType,
+                            ElementAccumulator, layout::RowMajor, OperatorClass, ArchTag, ThreadblockShape, WarpShape,
+                            InstructionShape, Stages, TransformA, TransformB, Operator> {
+  // Define the MmaCore components (thread maps, shared-memory iterators, cache ops and MMA policy used below)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplexCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, 
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, OperatorClass,
+      Stages, TransformA, TransformB, Operator>;
+
+  // Define iterators over tiles from the A operand (in right-side mode A is iterated with FillMode::kFull and DiagType::kInvalid)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::Array<ElementA, ThreadMapA::kElementsPerAccess>;
+  using IteratorA =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+          ElementA, LayoutA, 1, ThreadMapA, 
+          SideMode::kRight, FillMode::kFull, DiagType::kInvalid, 
+          AccessTypeA>;
+
+  // Define iterators over tiles from the B operand (in right-side mode B receives the triangular matrix's fill mode and diag type)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::Array<ElementB, ThreadMapB::kElementsPerAccess>;
+  using IteratorB =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+          ElementB, LayoutB, 0, ThreadMapB, 
+          SideMode::kRight, kFillMode, kDiagType,
+          AccessTypeB>;
+
+  // Define the threadblock-scoped multistage matrix multiply (MmaMultistage with SharedMemoryClearOption::kZfill)
+  using ThreadblockMma = cutlass::gemm::threadblock::MmaMultistage<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
+      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
+      typename MmaCore::MmaPolicy, Stages, SharedMemoryClearOption::kZfill>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for row-major output with unit diagonal (DiagType::kUnit); side mode and fill mode are left generic
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Side Mode for the kernel
+    SideMode kSideMode,
+    /// Fill Mode for the triangular matrix
+    FillMode kFillMode,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the multistage mainloop
+    int Stages,
+    /// Complex transformation on operand A
+    ComplexTransform TransformA,
+    /// Complex transformation on operand B
+    ComplexTransform TransformB,
+    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
+    typename Operator>
+struct DefaultMultistageTrmmComplex<ElementA, LayoutA, ElementB, LayoutB,
+                            kSideMode, kFillMode, DiagType::kUnit,
+                            ElementAccumulator, layout::RowMajor, OperatorClass, ArchTag, ThreadblockShape, WarpShape,
+                            InstructionShape, Stages, TransformA, TransformB, Operator> {
+  // Define the MmaCore components (thread maps, shared-memory iterators, cache ops and MMA policy used below)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplexCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, 
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, OperatorClass,
+      Stages, TransformA, TransformB, Operator>;
+
+  // Define iterators over tiles from the A operand (receives the triangular matrix's fill mode with DiagType::kUnit)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::Array<ElementA, ThreadMapA::kElementsPerAccess>;
+  using IteratorA =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+          ElementA, LayoutA, 1, ThreadMapA, 
+          kSideMode, kFillMode, DiagType::kUnit, 
+          AccessTypeA>;
+
+  // Define iterators over tiles from the B operand (iterated with FillMode::kFull and DiagType::kInvalid)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::Array<ElementB, ThreadMapB::kElementsPerAccess>;
+  using IteratorB =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+          ElementB, LayoutB, 0, ThreadMapB, 
+          kSideMode, FillMode::kFull, DiagType::kInvalid,
+          AccessTypeB>;
+
+  // Define the threadblock-scoped multistage matrix multiply (the unit-diagonal case uses MmaBlas3Multistage rather than MmaMultistage)
+  using ThreadblockMma = cutlass::gemm::threadblock::MmaBlas3Multistage<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
+      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
+      typename MmaCore::MmaPolicy, Stages, SharedMemoryClearOption::kZfill>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for row-major output, right-side mode (SideMode::kRight) and unit diagonal (DiagType::kUnit)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Fill Mode for the triangular matrix
+    FillMode kFillMode,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the multistage mainloop
+    int Stages,
+    /// Complex transformation on operand A
+    ComplexTransform TransformA,
+    /// Complex transformation on operand B
+    ComplexTransform TransformB,
+    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
+    typename Operator>
+struct DefaultMultistageTrmmComplex<ElementA, LayoutA, ElementB, LayoutB,
+                            SideMode::kRight, kFillMode, DiagType::kUnit,
+                            ElementAccumulator, layout::RowMajor, OperatorClass, ArchTag, ThreadblockShape, WarpShape,
+                            InstructionShape, Stages, TransformA, TransformB, Operator> {
+  // Define the MmaCore components (thread maps, shared-memory iterators, cache ops and MMA policy used below)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplexCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, 
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, OperatorClass,
+      Stages, TransformA, TransformB, Operator>;
+
+  // Define iterators over tiles from the A operand (instantiated with FillMode::kFull and DiagType::kInvalid)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::Array<ElementA, ThreadMapA::kElementsPerAccess>;
+  using IteratorA =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+          ElementA, LayoutA, 1, ThreadMapA, 
+          SideMode::kRight, FillMode::kFull, DiagType::kInvalid, 
+          AccessTypeA>;
+
+  // Define iterators over tiles from the B operand (instantiated with kFillMode and DiagType::kUnit)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::Array<ElementB, ThreadMapB::kElementsPerAccess>;
+  using IteratorB =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+          ElementB, LayoutB, 0, ThreadMapB, 
+          SideMode::kRight, kFillMode, DiagType::kUnit,
+          AccessTypeB>;
+
+  // Define the threadblock-scoped multistage matrix multiply (shared-memory clear option: kZfill)
+  using ThreadblockMma = cutlass::gemm::threadblock::MmaBlas3Multistage<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
+      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
+      typename MmaCore::MmaPolicy, Stages, SharedMemoryClearOption::kZfill>;
+};
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for row-major output, DiagType::kNonUnit and BlasMode::kHermitian (TRMM where diagonal imag part is ignored - used by HEMM)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Side Mode for the kernel
+    SideMode kSideMode,
+    /// Fill Mode for the triangular matrix
+    FillMode kFillMode,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the multistage mainloop
+    int Stages,
+    /// Complex transformation on operand A
+    ComplexTransform TransformA,
+    /// Complex transformation on operand B
+    ComplexTransform TransformB,
+    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
+    typename Operator>
+struct DefaultMultistageTrmmComplex<ElementA, LayoutA, ElementB, LayoutB,
+                            kSideMode, kFillMode, DiagType::kNonUnit,
+                            ElementAccumulator, layout::RowMajor, OperatorClass, ArchTag, ThreadblockShape, WarpShape,
+                            InstructionShape, Stages, TransformA, TransformB, Operator, BlasMode::kHermitian> {
+
+  // Define the MmaCore components (thread maps, shared-memory iterators, cache ops and MMA policy used below)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplexCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, 
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, OperatorClass,
+      Stages, TransformA, TransformB, Operator>;
+
+  // Define iterators over tiles from the A operand
+  // PredicatedTileAccessIteratorTriangularMatrix only tracks diagonal elements when DiagType
+  // is kUnit, so kUnit is passed below even though this specialization matches DiagType::kNonUnit
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::Array<ElementA, ThreadMapA::kElementsPerAccess>;
+  using IteratorA =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+          ElementA, LayoutA, 1, ThreadMapA, 
+          kSideMode, kFillMode, DiagType::kUnit, 
+          AccessTypeA>;
+
+  // Define iterators over tiles from the B operand (instantiated with FillMode::kFull and DiagType::kInvalid)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::Array<ElementB, ThreadMapB::kElementsPerAccess>;
+  using IteratorB =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+          ElementB, LayoutB, 0, ThreadMapB, 
+          kSideMode, FillMode::kFull, DiagType::kInvalid,
+          AccessTypeB>;
+
+  // Define the threadblock-scoped multistage matrix multiply (kZfill clear option, BlasMode::kHermitian forwarded)
+  using ThreadblockMma = cutlass::gemm::threadblock::MmaBlas3Multistage<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
+      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
+      typename MmaCore::MmaPolicy, Stages, SharedMemoryClearOption::kZfill,
+      BlasMode::kHermitian>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for row-major output, SideMode::kRight, DiagType::kNonUnit and BlasMode::kHermitian (TRMM where diagonal imag part is ignored - used by HEMM)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Fill Mode for the triangular matrix
+    FillMode kFillMode,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the multistage mainloop
+    int Stages,
+    /// Complex transformation on operand A
+    ComplexTransform TransformA,
+    /// Complex transformation on operand B
+    ComplexTransform TransformB,
+    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
+    typename Operator>
+struct DefaultMultistageTrmmComplex<ElementA, LayoutA, ElementB, LayoutB,
+                            SideMode::kRight, kFillMode, DiagType::kNonUnit,
+                            ElementAccumulator, layout::RowMajor, OperatorClass, ArchTag, ThreadblockShape, WarpShape,
+                            InstructionShape, Stages, TransformA, TransformB, Operator, BlasMode::kHermitian> {
+
+  // Define the MmaCore components (thread maps, shared-memory iterators, cache ops and MMA policy used below)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplexCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, 
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, OperatorClass,
+      Stages, TransformA, TransformB, Operator>;
+
+  // Define iterators over tiles from the A operand (instantiated with FillMode::kFull and DiagType::kInvalid)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::Array<ElementA, ThreadMapA::kElementsPerAccess>;
+  using IteratorA =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+          ElementA, LayoutA, 1, ThreadMapA, 
+          SideMode::kRight, FillMode::kFull, DiagType::kInvalid, 
+          AccessTypeA>;
+
+  // Define iterators over tiles from the B operand
+  // PredicatedTileAccessIteratorTriangularMatrix only tracks diagonal elements when DiagType
+  // is kUnit, so kUnit is passed below even though this specialization matches DiagType::kNonUnit
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::Array<ElementB, ThreadMapB::kElementsPerAccess>;
+  using IteratorB =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+          ElementB, LayoutB, 0, ThreadMapB, 
+          SideMode::kRight, kFillMode, DiagType::kUnit,
+          AccessTypeB>;
+
+  // Define the threadblock-scoped multistage matrix multiply (kZfill clear option, BlasMode::kHermitian forwarded)
+  using ThreadblockMma = cutlass::gemm::threadblock::MmaBlas3Multistage<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
+      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
+      typename MmaCore::MmaPolicy, Stages, SharedMemoryClearOption::kZfill,
+      BlasMode::kHermitian>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace gemm
+}  // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_sparse_mma.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_sparse_mma.h
new file mode 100755
index 000000000..388b9c476
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_sparse_mma.h
@@ -0,0 +1,196 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/arch/wmma.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sparse_sm80.h"
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+#include "cutlass/gemm/threadblock/default_mma_core_wmma.h"
+#endif //CUTLASS_ARCH_WMMA_ENABLED
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Operator class tag
+    typename OperatorClass_,
+    /// Tag indicating architecture to tune for
+    typename ArchTag_,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Store the accumulators in row major or column major.  Row major is used
+    /// when output layout is interleaved.
+    bool AccumulatorsInRowMajor = false
+    >
+struct DefaultSparseMma;  // primary template is declared only; partial specializations supply the definition
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for row-major output (OperatorClass TensorOp, AccumulatorsInRowMajor = false)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the multistage mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator
+    >
+struct DefaultSparseMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
+                  kAlignmentB, ElementAccumulator, layout::RowMajor,
+                  arch::OpClassTensorOp, ArchTag, ThreadblockShape, WarpShape,
+                  InstructionShape, Stages, Operator, false> {
+  static cutlass::arch::CacheOperation::Kind const CacheOpA =
+      ((sizeof_bits<ElementA>::value * kAlignmentA) == 128)  // Global only when one A access is exactly 128 bits
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  static cutlass::arch::CacheOperation::Kind const CacheOpB =
+      ((sizeof_bits<ElementB>::value * kAlignmentB) == 128)  // Global only when one B access is exactly 128 bits
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+  
+
+  // Define the MmaCore components (the cache operations selected above are forwarded to it)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultSparseMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, Operator, false, CacheOpA, CacheOpB>;
+
+  static int const kSparse = MmaCore::kSparse;  // divisor applied to the K extent of the A and E tiles below
+
+  // Define iterators over tiles from the A operand (tile K extent is ThreadblockShape::kK / kSparse)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::Array<ElementA, kAlignmentA>;
+  using IteratorA =
+      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK / kSparse>,
+          ElementA, LayoutA, 1, ThreadMapA, AccessTypeA>;
+
+  // Define iterators over tiles from the B operand (full ThreadblockShape::kK x kN tile)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::Array<ElementB, kAlignmentB>;
+  using IteratorB =
+      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+          ElementB, LayoutB, 0, ThreadMapB, AccessTypeB>;
+
+  // Define iterators over tiles from the E operand (128-bit accesses; K extent also divided by kElementsPerElementE)
+  using ElementE = typename MmaCore::ElementE;
+  using LayoutE = typename MmaCore::GmemLayoutE;
+  using ThreadMapE = typename MmaCore::IteratorThreadMapE;
+  using AccessTypeE =
+      cutlass::Array<ElementE, 128 / sizeof_bits<ElementE>::value>;
+  using IteratorE =
+      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape::kM,
+                               ThreadblockShape::kK / kSparse /
+                                   MmaCore::kElementsPerElementE>,
+          ElementE, LayoutE, 1, ThreadMapE, AccessTypeE>;
+
+  // Define the threadblock-scoped multistage matrix multiply (takes the A, B and E iterators)
+  using ThreadblockMma = cutlass::gemm::threadblock::SparseMmaMultistage<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
+      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
+      IteratorE, typename MmaCore::SmemIteratorE, MmaCore::kCacheOpE,
+      typename MmaCore::MmaPolicy, Stages>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass 
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_trmm.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_trmm.h
new file mode 100755
index 000000000..5e90f25c2
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_trmm.h
@@ -0,0 +1,445 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+// 
+/*! \file
+    \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K.
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/arch/wmma.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator_triangular_matrix.h"
+#include "cutlass/gemm/threadblock/mma_blas3_multistage.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h"
+#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+#include "cutlass/gemm/threadblock/default_mma_core_wmma.h"
+#endif //CUTLASS_ARCH_WMMA_ENABLED
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Side Mode for the kernel
+    SideMode kSideMode,
+    /// Fill Mode for the triangular matrix
+    FillMode kFillMode,
+    /// Diag Type for the triangular matrix
+    DiagType kDiagType,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Operator class tag
+    typename OperatorClass_,
+    /// Tag indicating architecture to tune for
+    typename ArchTag_,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Store the accumulators in row major or column major.  Row major is used
+    /// when output layout is interleaved.
+    bool AccumulatorsInRowMajor = false
+    >
+struct DefaultTrmm;  // primary template is declared only; partial specializations supply the definition
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for row-major output (OperatorClass TensorOp, AccumulatorsInRowMajor = false)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Side Mode for the kernel
+    SideMode kSideMode,
+    /// Fill Mode for the triangular matrix
+    FillMode kFillMode,
+    /// Diag Type for the triangular matrix
+    DiagType kDiagType,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the multistage mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator
+    >
+struct DefaultTrmm<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, 
+                  kSideMode, kFillMode, kDiagType, 
+                  ElementAccumulator, layout::RowMajor,
+                  arch::OpClassTensorOp, ArchTag, ThreadblockShape, WarpShape,
+                  InstructionShape, Stages, Operator, false> {
+
+  static cutlass::arch::CacheOperation::Kind const CacheOpA =
+      ((sizeof_bits<ElementA>::value * kAlignmentA) == 128)  // Global only when one A access is exactly 128 bits
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  static cutlass::arch::CacheOperation::Kind const CacheOpB =
+      ((sizeof_bits<ElementB>::value * kAlignmentB) == 128)  // Global only when one B access is exactly 128 bits
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  // Define the MmaCore components (the cache operations selected above are forwarded to it)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, Operator, false, CacheOpA, CacheOpB>;
+
+  // Define iterators over tiles from the A operand (instantiated with kFillMode and kDiagType)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::Array<ElementA, kAlignmentA>;
+
+  using IteratorA =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+          ElementA, LayoutA, 1, ThreadMapA, kSideMode, kFillMode, kDiagType, AccessTypeA>;
+
+  // Define iterators over tiles from the B operand (instantiated with FillMode::kFull and DiagType::kInvalid)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::Array<ElementB, kAlignmentB>;
+
+  using IteratorB =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+          ElementB, LayoutB, 0, ThreadMapB, kSideMode, FillMode::kFull, DiagType::kInvalid, AccessTypeB>;
+  
+  // Define the threadblock-scoped multistage matrix multiply (shared-memory clear option: kZfill)
+  using ThreadblockMma = cutlass::gemm::threadblock::MmaMultistage<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
+      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
+      typename MmaCore::MmaPolicy, Stages, SharedMemoryClearOption::kZfill>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for row-major output, right side mode (OperatorClass TensorOp, AccumulatorsInRowMajor = false)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Fill Mode for the triangular matrix
+    FillMode kFillMode,
+    /// Diag Type for the triangular matrix
+    DiagType kDiagType,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the multistage mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator
+    >
+struct DefaultTrmm<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, 
+                  SideMode::kRight, kFillMode, kDiagType, 
+                  ElementAccumulator, layout::RowMajor,
+                  arch::OpClassTensorOp, ArchTag, ThreadblockShape, WarpShape,
+                  InstructionShape, Stages, Operator, false> {
+
+  static cutlass::arch::CacheOperation::Kind const CacheOpA =
+      ((sizeof_bits<ElementA>::value * kAlignmentA) == 128)  // Global only when one A access is exactly 128 bits
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  static cutlass::arch::CacheOperation::Kind const CacheOpB =
+      ((sizeof_bits<ElementB>::value * kAlignmentB) == 128)  // Global only when one B access is exactly 128 bits
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  // Define the MmaCore components (the cache operations selected above are forwarded to it)
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, Operator, false, CacheOpA, CacheOpB>;
+
+  // Define iterators over tiles from the A operand (instantiated with FillMode::kFull and DiagType::kInvalid)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::Array<ElementA, kAlignmentA>;
+
+  using IteratorA =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+          ElementA, LayoutA, 1, ThreadMapA, SideMode::kRight, FillMode::kFull, DiagType::kInvalid, AccessTypeA>;
+
+  // Define iterators over tiles from the B operand (instantiated with kFillMode and kDiagType)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::Array<ElementB, kAlignmentB>;
+
+  using IteratorB =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+          ElementB, LayoutB, 0, ThreadMapB, SideMode::kRight, kFillMode, kDiagType, AccessTypeB>;
+
+  // Define the threadblock-scoped multistage matrix multiply (shared-memory clear option: kZfill)
+  using ThreadblockMma = cutlass::gemm::threadblock::MmaMultistage<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
+      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
+      typename MmaCore::MmaPolicy, Stages, SharedMemoryClearOption::kZfill>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for row-major output with unit diagonal and a generic side mode (OperatorClass TensorOp)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Side Mode for the kernel
+    SideMode kSideMode,
+    /// Fill Mode for the triangular matrix
+    FillMode kFillMode,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the multistage mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator
+    >
+struct DefaultTrmm<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, 
+                  kSideMode, kFillMode, DiagType::kUnit, 
+                  ElementAccumulator, layout::RowMajor,
+                  arch::OpClassTensorOp, ArchTag, ThreadblockShape, WarpShape,
+                  InstructionShape, Stages, Operator, false> {
+
+  static cutlass::arch::CacheOperation::Kind const CacheOpA =  // Global when one A access is 128 bits wide, else Always
+      ((sizeof_bits<ElementA>::value * kAlignmentA) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  static cutlass::arch::CacheOperation::Kind const CacheOpB =  // Global when one B access is 128 bits wide, else Always
+      ((sizeof_bits<ElementB>::value * kAlignmentB) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  // Define the MmaCore components
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, Operator, false, CacheOpA, CacheOpB>;
+
+  // Define iterators over tiles from the A operand (A carries kFillMode and the unit diagonal here)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::Array<ElementA, kAlignmentA>;
+
+  using IteratorA =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+          ElementA, LayoutA, 1, ThreadMapA, kSideMode, kFillMode, DiagType::kUnit, AccessTypeA>;
+
+  // Define iterators over tiles from the B operand (B is iterated with FillMode::kFull / DiagType::kInvalid)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::Array<ElementB, kAlignmentB>;
+
+  using IteratorB =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+          ElementB, LayoutB, 0, ThreadMapB, kSideMode, FillMode::kFull, DiagType::kInvalid, AccessTypeB>;
+  
+  // Define the threadblock-scoped multistage matrix multiply (MmaBlas3Multistage with kZfill shared-memory clearing)
+  using ThreadblockMma = cutlass::gemm::threadblock::MmaBlas3Multistage<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
+      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
+      typename MmaCore::MmaPolicy, Stages, SharedMemoryClearOption::kZfill>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for row-major output, right side mode, unit diagonal (OperatorClass TensorOp)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Fill Mode for the triangular matrix
+    FillMode kFillMode,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the multistage mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator
+    >
+struct DefaultTrmm<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, 
+                  SideMode::kRight, kFillMode, DiagType::kUnit, 
+                  ElementAccumulator, layout::RowMajor,
+                  arch::OpClassTensorOp, ArchTag, ThreadblockShape, WarpShape,
+                  InstructionShape, Stages, Operator, false> {
+
+  static cutlass::arch::CacheOperation::Kind const CacheOpA =  // Global when one A access is 128 bits wide, else Always
+      ((sizeof_bits<ElementA>::value * kAlignmentA) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  static cutlass::arch::CacheOperation::Kind const CacheOpB =  // Global when one B access is 128 bits wide, else Always
+      ((sizeof_bits<ElementB>::value * kAlignmentB) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  // Define the MmaCore components
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, Operator, false, CacheOpA, CacheOpB>;
+
+  // Define iterators over tiles from the A operand (A is iterated with FillMode::kFull / DiagType::kInvalid)
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::Array<ElementA, kAlignmentA>;
+
+  using IteratorA =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+          ElementA, LayoutA, 1, ThreadMapA, SideMode::kRight, FillMode::kFull, DiagType::kInvalid, AccessTypeA>;
+
+  // Define iterators over tiles from the B operand (B carries kFillMode and the unit diagonal here)
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::Array<ElementB, kAlignmentB>;
+
+  using IteratorB =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+          ElementB, LayoutB, 0, ThreadMapB, SideMode::kRight, kFillMode, DiagType::kUnit, AccessTypeB>;
+
+  // Define the threadblock-scoped multistage matrix multiply (MmaBlas3Multistage with kZfill shared-memory clearing)
+  using ThreadblockMma = cutlass::gemm::threadblock::MmaBlas3Multistage<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
+      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
+      typename MmaCore::MmaPolicy, Stages, SharedMemoryClearOption::kZfill>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass 
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/ell_mma_multistage.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/ell_mma_multistage.h
new file mode 100755
index 000000000..27f410ccd
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/ell_mma_multistage.h
@@ -0,0 +1,648 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a multistage threadblock-scoped Blocked-Ell MMA.
+*/
+
+#pragma once
+
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+
+#include "cutlass/gemm/threadblock/mma_base.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute a threadblock-scoped Blocked-Ell matrix product using a
+/// multistage pipeline of cp.async global-to-shared copies.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    /// Iterates over tiles of A operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorA_,
+    /// Iterates over tiles of A operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorA_,
+    /// Cache operation for operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB_,
+    /// Cache operation for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB,
+    /// Data type of accumulator matrix
+    typename ElementC_,
+    /// Layout of accumulator matrix
+    typename LayoutC_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// Number of stages,
+    int Stages,
+    /// Used for partial specialization
+    typename Enable = bool>
+class EllMmaMultistage : 
+  public MmaBase<Shape_, Policy_, Stages> {
+public:
+  ///< Base class
+  using Base = MmaBase<Shape_, Policy_, Stages>;
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+  ///< Iterates over tiles of A operand in global memory
+  using IteratorA = IteratorA_;
+  ///< Iterates over tiles of B operand in global memory
+  using IteratorB = IteratorB_;
+  ///< Data type of accumulator matrix
+  using ElementC = ElementC_;
+  ///< Layout of accumulator matrix
+  using LayoutC = LayoutC_;
+  ///< Policy describing tuning details
+  using Policy = Policy_;
+
+  using SmemIteratorA = SmemIteratorA_;
+  using SmemIteratorB = SmemIteratorB_;
+
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
+
+  using EllIterator = typename cutlass::transform::threadblock::ell::Iterator;
+
+  //
+  // Dependent types
+  //
+
+  /// Fragment of accumulator tile
+  using FragmentC = typename Policy::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+
+  /// Minimum architecture is Sm80 to support cp.async
+  using ArchTag = arch::Sm80;
+  
+  /// Complex transform on A operand
+  static ComplexTransform const kTransformA = Operator::kTransformA;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB = Operator::kTransformB;
+
+  /// Internal structure exposed for introspection.
+  struct Detail {
+
+    static_assert(Base::kWarpGemmIterations > 1,
+                  "The pipelined structure requires at least two warp-level "
+                  "GEMM operations.");
+
+    /// Number of cp.async instructions to load one stage of operand A
+    static int const AsyncCopyIterationsPerStageA =
+        IteratorA::ThreadMap::Iterations::kCount;
+
+    /// Number of cp.async instructions to load one stage of operand B
+    static int const AsyncCopyIterationsPerStageB =
+        IteratorB::ThreadMap::Iterations::kCount;
+
+    /// Number of stages
+    static int const kStages = Stages;
+
+    /// Number of cp.async instructions to load one group of operand A (per-stage count divided by kWarpGemmIterations, rounded up)
+    static int const kAccessesPerGroupA =
+        (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
+
+    /// Number of cp.async instructions to load one group of operand B (per-stage count divided by kWarpGemmIterations, rounded up)
+    static int const kAccessesPerGroupB =
+        (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
+  };
+
+ private:
+
+  using WarpLoadedFragmentA = typename Operator::FragmentA;
+  using WarpLoadedFragmentB = typename Operator::FragmentB;
+  using WarpTransformedFragmentA = typename Operator::TransformedFragmentA;
+  using WarpTransformedFragmentB = typename Operator::TransformedFragmentB;
+
+ private:
+
+  //
+  // Data members
+  //
+
+  /// Iterator to write threadblock-scoped tile of A operand to shared memory
+  SmemIteratorA smem_iterator_A_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB smem_iterator_B_;
+
+public:
+
+  /// Construct from the threadblock's shared storage and the thread, warp and lane indices
+  CUTLASS_DEVICE
+  EllMmaMultistage(
+      ///< Shared storage needed for internal use by threadblock-scoped GEMM
+      typename Base::SharedStorage &shared_storage,
+      ///< ID within the threadblock
+      int thread_idx,
+      ///< ID of warp
+      int warp_idx,
+      ///< ID of each thread within a warp
+      int lane_idx
+    ):
+      Base(shared_storage, thread_idx, warp_idx, lane_idx),
+      smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
+      smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx)
+  {
+    // Compute warp location within threadblock tile by mapping the warp_id to
+    // three coordinates:
+    //   _m: the warp's position within the threadblock along the M dimension
+    //   _n: the warp's position within the threadblock along the N dimension
+    //   _k: the warp's position within the threadblock along the K dimension
+
+    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
+    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
+
+    int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
+    int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
+
+    // Add per-warp offsets in units of warp-level tiles (the K offset is scaled by kWarpGemmIterations)
+    this->warp_tile_iterator_A_.add_tile_offset(
+        {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
+    this->warp_tile_iterator_B_.add_tile_offset(
+        {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
+  }
+
+  template<bool is_A_sparse, bool is_offset_constant>  // is_A_sparse: B (else A) loads take the Ell offset; is_offset_constant: use get_offset_fast()
+  CUTLASS_DEVICE
+  void copy_tiles_and_advance(IteratorA &iterator_A, IteratorB &iterator_B, EllIterator &ell_iter,
+                              int group_start_A = 0, int group_start_B = 0) {
+    iterator_A.set_iteration_index(group_start_A *
+                                   IteratorA::kAccessesPerVector);
+    this->smem_iterator_A_.set_iteration_index(group_start_A);
+
+    // Async Copy for operand A: issue up to kAccessesPerGroupA accesses starting at group_start_A
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) {
+      if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) {
+        typename IteratorA::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorA::AccessType *>(
+                this->smem_iterator_A_.get());
+
+        int const kSrcBytes = sizeof_bits<typename IteratorA::Element>::value *
+                              IteratorA::ThreadMap::kElementsPerAccess /
+                              IteratorA::kAccessesPerVector / 8;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
+          auto gmem_ptr = iterator_A.get();
+          bool is_valid = iterator_A.valid();
+
+          if (!is_A_sparse){  // A is not the sparse operand: shift its global pointer by the Ell offset
+            if (is_offset_constant){
+              auto ell_offset = ell_iter.get_offset_fast();
+              is_valid = is_valid && (ell_offset >= 0);  // a negative offset marks the access invalid
+              gmem_ptr +=  ell_offset * sizeof(typename IteratorA::Element) / kSrcBytes;
+            } else {
+              int k_offset = iterator_A.get_k();
+              auto ell_offset = ell_iter.get_offset(k_offset);
+              is_valid = is_valid && (ell_offset >= 0);  // a negative offset marks the access invalid
+              gmem_ptr += (ell_offset * sizeof(typename IteratorA::Element)) / kSrcBytes;
+            }
+          }
+
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
+              dst_ptr + v, gmem_ptr, is_valid);
+
+          ++iterator_A;
+        }
+
+        ++this->smem_iterator_A_;
+      }
+    }
+
+    iterator_B.set_iteration_index(group_start_B *
+                                   IteratorB::kAccessesPerVector);
+    this->smem_iterator_B_.set_iteration_index(group_start_B);
+
+    // Async Copy for operand B: issue up to kAccessesPerGroupB accesses starting at group_start_B
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) {
+      if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) {
+        typename IteratorB::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorB::AccessType *>(
+                this->smem_iterator_B_.get());
+
+        int const kSrcBytes = sizeof_bits<typename IteratorB::Element>::value *
+                              IteratorB::ThreadMap::kElementsPerAccess /
+                              IteratorB::kAccessesPerVector / 8;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
+          auto gmem_ptr = iterator_B.get();
+          bool is_valid = iterator_B.valid();
+
+          if (is_A_sparse){  // A is the sparse operand: shift B's global pointer by the Ell offset
+            if (is_offset_constant){
+              auto ell_offset = ell_iter.get_offset_fast();
+              is_valid = is_valid && (ell_offset >= 0);  // a negative offset marks the access invalid
+              gmem_ptr += ell_offset * sizeof(typename IteratorB::Element) / kSrcBytes;
+            } else {
+              int k_offset = iterator_B.get_k();
+              auto ell_offset = ell_iter.get_offset(k_offset);
+              is_valid = is_valid && (ell_offset >= 0);  // a negative offset marks the access invalid
+              gmem_ptr += ( ell_offset * sizeof(typename IteratorB::Element)) / kSrcBytes;
+            }
+          }
+
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
+              dst_ptr + v, gmem_ptr, is_valid);
+
+          ++iterator_B;
+        }
+        ++this->smem_iterator_B_;
+      }
+    }
+  }
+
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate (is_A_sparse: B, else A, loads take the Ell offset; is_offset_constant: use get_offset_fast())
+  template<bool is_A_sparse, bool is_offset_constant>
+  CUTLASS_DEVICE
+  void operator()(
+      ///< number of mainloop iterations along the GEMM K dimension
+      int gemm_k_iterations,
+      ///< destination accumulator tile
+      FragmentC &accum,
+      ///< iterator over A operand in global memory
+      IteratorA iterator_A,
+      ///< iterator over B operand in global memory
+      IteratorB iterator_B,
+      ///< initial value of accumulator
+      FragmentC const &src_accum,
+      EllIterator &ell_iterator  // Blocked-Ell offset iterator; advanced once per stage
+      ) {
+    //
+    // Prologue
+    //
+
+    // Issue several complete stages
+    CUTLASS_PRAGMA_UNROLL
+    for (int stage = 0; stage < Base::kStages - 1;
+         ++stage, --gemm_k_iterations) {
+
+      iterator_A.clear_mask(gemm_k_iterations == 0);
+      iterator_B.clear_mask(gemm_k_iterations == 0);
+
+      iterator_A.set_iteration_index(0);
+      this->smem_iterator_A_.set_iteration_index(0);
+
+      // Async Copy for operand A
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) {
+        typename IteratorA::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorA::AccessType *>(
+                this->smem_iterator_A_.get());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
+          int const kSrcBytes =
+              sizeof_bits<typename IteratorA::Element>::value *
+              IteratorA::ThreadMap::kElementsPerAccess /
+              IteratorA::kAccessesPerVector / 8;
+
+          auto gmem_ptr = iterator_A.get();
+          bool is_valid = iterator_A.valid();
+
+          if (!is_A_sparse){  // A is not the sparse operand: shift its global pointer by the Ell offset
+            if (is_offset_constant){
+              auto ell_offset = ell_iterator.get_offset_fast();
+              is_valid = is_valid && (ell_offset >= 0);  // a negative offset marks the access invalid
+              gmem_ptr +=  ell_offset * sizeof(typename IteratorA::Element) / kSrcBytes;
+            } else {
+              int k_offset = iterator_A.get_k();
+              auto ell_offset = ell_iterator.get_offset(k_offset);
+              is_valid = is_valid && (ell_offset >= 0);  // a negative offset marks the access invalid
+              gmem_ptr += (ell_offset * sizeof(typename IteratorA::Element)) / kSrcBytes;
+            }
+          }
+
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
+              dst_ptr + v, gmem_ptr, is_valid);
+
+          ++iterator_A;
+        }
+
+        ++this->smem_iterator_A_;
+      }
+
+      iterator_B.set_iteration_index(0);
+      this->smem_iterator_B_.set_iteration_index(0);
+
+      // Async Copy for operand B
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) {
+        typename IteratorB::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorB::AccessType *>(
+                this->smem_iterator_B_.get());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
+          int const kSrcBytes =
+              sizeof_bits<typename IteratorB::Element>::value *
+              IteratorB::ThreadMap::kElementsPerAccess /
+              IteratorB::kAccessesPerVector / 8;
+          
+          auto gmem_ptr = iterator_B.get();
+          bool is_valid = iterator_B.valid();
+          
+          if (is_A_sparse){  // A is the sparse operand: shift B's global pointer by the Ell offset
+            if (is_offset_constant){
+              auto ell_offset = ell_iterator.get_offset_fast();
+              is_valid = is_valid && (ell_offset >= 0);  // a negative offset marks the access invalid
+              gmem_ptr += ell_offset * sizeof(typename IteratorB::Element) / kSrcBytes;
+            } else {
+              int k_offset = iterator_B.get_k();
+              auto ell_offset = ell_iterator.get_offset(k_offset);
+              is_valid = is_valid && (ell_offset >= 0);  // a negative offset marks the access invalid
+              gmem_ptr += ( ell_offset * sizeof(typename IteratorB::Element)) / kSrcBytes;
+            }
+          }
+
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
+              dst_ptr + v, gmem_ptr, is_valid);
+
+          ++iterator_B;
+        }
+
+        ++this->smem_iterator_B_;
+      }
+
+      // Move to the next stage
+      iterator_A.add_tile_offset({0, 1});
+      iterator_B.add_tile_offset({1, 0});
+      ++ell_iterator;
+      
+      this->smem_iterator_A_.add_tile_offset({0, 1});
+      this->smem_iterator_B_.add_tile_offset({1, 0});
+
+      // Defines the boundary of a stage of cp.async.
+      cutlass::arch::cp_async_fence();
+    }
+
+    // Perform accumulation in the 'd' output operand
+    accum = src_accum;
+
+    // Waits until kStages-2 stages have committed.
+    cutlass::arch::cp_async_wait<Base::kStages - 2>();
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math
+    // instructions
+    WarpLoadedFragmentA warp_loaded_frag_A[2];
+    WarpLoadedFragmentB warp_loaded_frag_B[2];
+    WarpTransformedFragmentA warp_transformed_frag_A[2];
+    WarpTransformedFragmentB warp_transformed_frag_B[2];
+
+    Operator warp_mma;
+
+    this->warp_tile_iterator_A_.set_kgroup_index(0);
+    this->warp_tile_iterator_B_.set_kgroup_index(0);
+
+    this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]);
+    this->warp_tile_iterator_B_.load(warp_loaded_frag_B[0]);
+
+    ++this->warp_tile_iterator_A_;
+    ++this->warp_tile_iterator_B_;
+
+    iterator_A.clear_mask(gemm_k_iterations == 0);
+    iterator_B.clear_mask(gemm_k_iterations == 0);
+
+    if (is_A_sparse){
+      iterator_A.ell_add_mask(ell_iterator.get_blocksize());
+    }
+    else {
+      iterator_B.ell_add_mask(ell_iterator.get_blocksize());
+    }
+
+    int smem_write_stage_idx = Base::kStages - 1;
+    int smem_read_stage_idx = 0;
+
+    warp_mma.transform(warp_transformed_frag_A[0], warp_transformed_frag_B[0],
+                       warp_loaded_frag_A[0], warp_loaded_frag_B[0]);
+
+    // tf32x3 kernels use staging accumulation. warp_mma uses a temporary
+    // accumulator and this temporary accumulator is added to the final
+    // accumulator once in every mainloop iteration.
+    plus<FragmentC> plus_accum;
+
+    FragmentC tmp_accum;
+
+    if (platform::is_same<typename Operator::MathOperator,
+                          arch::OpMultiplyAddFastF32>::value
+      || platform::is_same<typename Operator::MathOperator,
+                           arch::OpMultiplyAddComplexFastF32>::value) {
+
+      tmp_accum.clear();
+    }
+
+    //
+    // Mainloop
+    //
+
+    CUTLASS_GEMM_LOOP
+    for (; gemm_k_iterations > (-Base::kStages + 1);) {
+      //
+      // Loop over GEMM K dimension
+      //
+
+      // Computes a warp-level GEMM on data held in shared memory
+      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations;
+           ++warp_mma_k) {
+
+        // Load warp-level tiles from shared memory, wrapping to k offset if
+        // this is the last group as the case may be.
+
+        this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
+        this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
+        
+        this->warp_tile_iterator_A_.load(warp_loaded_frag_A[(warp_mma_k + 1) % 2]);
+        this->warp_tile_iterator_B_.load(warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
+
+        ++this->warp_tile_iterator_A_;
+        ++this->warp_tile_iterator_B_;
+
+        if (warp_mma_k > 0)
+          warp_mma.transform(warp_transformed_frag_A[warp_mma_k % 2],
+                             warp_transformed_frag_B[warp_mma_k % 2],
+                             warp_loaded_frag_A[warp_mma_k % 2],
+                             warp_loaded_frag_B[warp_mma_k % 2]);
+
+        if (platform::is_same<typename Operator::MathOperator,
+                              arch::OpMultiplyAddFastF32>::value
+          || platform::is_same<typename Operator::MathOperator,
+                               arch::OpMultiplyAddComplexFastF32>::value) {
+
+          warp_mma(
+            tmp_accum, 
+            warp_transformed_frag_A[warp_mma_k % 2],
+            warp_transformed_frag_B[warp_mma_k % 2], 
+            tmp_accum
+          );
+
+          if (warp_mma_k == 0) {
+            accum = plus_accum(accum, tmp_accum);
+            tmp_accum.clear();
+          }
+        } else {
+          warp_mma(
+            accum, 
+            warp_transformed_frag_A[warp_mma_k % 2],
+            warp_transformed_frag_B[warp_mma_k % 2], 
+            accum
+          );
+        }
+
+        // Issue global->shared copies for this stage
+        if (warp_mma_k < Base::kWarpGemmIterations - 1) {
+          int group_start_iteration_A, group_start_iteration_B;
+
+          group_start_iteration_A = warp_mma_k * Detail::kAccessesPerGroupA;
+          group_start_iteration_B = warp_mma_k * Detail::kAccessesPerGroupB;
+
+          copy_tiles_and_advance<is_A_sparse, is_offset_constant>(
+              iterator_A, iterator_B, ell_iterator, group_start_iteration_A, 
+                               group_start_iteration_B);
+        }
+
+        if (warp_mma_k + 2 == Base::kWarpGemmIterations) {
+          int group_start_iteration_A, group_start_iteration_B;
+          group_start_iteration_A =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupA;
+          group_start_iteration_B =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupB;
+
+          copy_tiles_and_advance<is_A_sparse, is_offset_constant>(
+              iterator_A, iterator_B, ell_iterator, group_start_iteration_A, 
+                               group_start_iteration_B);
+
+          // Inserts a memory fence between stages of cp.async instructions.
+          cutlass::arch::cp_async_fence();
+
+          // Waits until kStages-2 stages have committed.
+          arch::cp_async_wait<Base::kStages - 2>();
+          __syncthreads();
+
+          // Move to the next stage
+          iterator_A.add_tile_offset({0, 1});
+          iterator_B.add_tile_offset({1, 0});
+          ++ell_iterator;
+
+          this->smem_iterator_A_.add_tile_offset({0, 1});
+          this->smem_iterator_B_.add_tile_offset({1, 0});
+
+          // Add negative offsets to return iterators to the 'start' of the
+          // circular buffer in shared memory
+          if (smem_write_stage_idx == (Base::kStages - 1)) {
+            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
+            this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
+            smem_write_stage_idx = 0;
+          } else {
+            ++smem_write_stage_idx;
+          }
+
+          if (smem_read_stage_idx == (Base::kStages - 1)) {
+            this->warp_tile_iterator_A_.add_tile_offset(
+                {0, -Base::kStages * Policy::kPartitionsK *
+                        Base::kWarpGemmIterations});
+            this->warp_tile_iterator_B_.add_tile_offset(
+                {-Base::kStages * Policy::kPartitionsK *
+                     Base::kWarpGemmIterations,
+                 0});
+            smem_read_stage_idx = 0;
+          } else {
+            ++smem_read_stage_idx;
+          }
+
+          --gemm_k_iterations;
+          iterator_A.clear_mask(gemm_k_iterations == 0);
+          iterator_B.clear_mask(gemm_k_iterations == 0);
+        }
+
+        // Do any conversions feeding the first stage at the end of the loop so
+        // we can start right away on mma instructions
+        if (warp_mma_k + 1 == Base::kWarpGemmIterations)
+          warp_mma.transform(warp_transformed_frag_A[(warp_mma_k + 1) % 2],
+                             warp_transformed_frag_B[(warp_mma_k + 1) % 2],
+                             warp_loaded_frag_A[(warp_mma_k + 1) % 2],
+                             warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
+      }
+
+    }
+
+    if (platform::is_same<typename Operator::MathOperator,
+                          arch::OpMultiplyAddFastF32>::value
+      || platform::is_same<typename Operator::MathOperator,
+                           arch::OpMultiplyAddComplexFastF32>::value) {
+      accum = plus_accum(accum, tmp_accum); 
+    }
+
+
+    // Commit and drain all pending and predicated cp.async pnz from the GEMM mainloop
+    cutlass::arch::cp_async_fence();
+    cutlass::arch::cp_async_wait<0>();
+    __syncthreads();
+
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/ell_mma_pipelined.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/ell_mma_pipelined.h
new file mode 100755
index 000000000..55a951e1d
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/ell_mma_pipelined.h
@@ -0,0 +1,376 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a double-buffered threadblock-scoped Blocked-Ell MMA.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/numeric_conversion.h"
+
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/mma_base.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Threadblock-scoped Blocked-Ell matrix multiply-accumulate using a two-stage (double-buffered) pipeline.
+template <
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape_,
+  /// Iterates over tiles of A operand in global memory
+  /// (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
+  typename IteratorA_,
+  /// Iterates over tiles of A operand in shared memory
+  /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+  typename SmemIteratorA_,
+  /// Iterates over tiles of B operand in global memory
+  /// (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
+  typename IteratorB_,
+  /// Iterates over tiles of B operand in shared memory
+  /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+  typename SmemIteratorB_,
+  /// Data type of accumulator matrix
+  typename ElementC_,
+  /// Layout of accumulator matrix
+  typename LayoutC_,
+  /// Policy describing tuning details (concept: MmaPolicy)
+  typename Policy_,
+  /// Transformation applied to A operand
+  typename TransformA_ = NumericArrayConverter<
+    typename SmemIteratorA_::Element, 
+    typename IteratorA_::Element, 
+    IteratorA_::Fragment::kElements>,
+  ///
+  /// Transformation applied to B operand
+  typename TransformB_ = NumericArrayConverter<
+    typename SmemIteratorB_::Element, 
+    typename IteratorB_::Element, 
+    IteratorB_::Fragment::kElements>,
+  /// Used for partial specialization
+  typename Enable = bool
+>
+class EllMmaPipelined : public MmaBase<Shape_, Policy_, 2> {
+public:
+
+  ///< Base class (instantiated with exactly two stages)
+  using Base = MmaBase<Shape_, Policy_, 2>;
+
+  using Shape = Shape_;             ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using IteratorA = IteratorA_;     ///< Iterates over tiles of A operand in global memory
+  using IteratorB = IteratorB_;     ///< Iterates over tiles of B operand in global memory
+  using ElementC = ElementC_;       ///< Data type of accumulator matrix
+  using LayoutC = LayoutC_;         ///< Layout of accumulator matrix
+  using Policy = Policy_;           ///< Policy describing tuning details
+
+  using SmemIteratorA = SmemIteratorA_;
+  using SmemIteratorB = SmemIteratorB_;
+
+  using TransformA = TransformA_;
+  using TransformB = TransformB_;
+
+  //
+  // Dependent types
+  //
+
+  /// Fragment of operand A loaded from global memory
+  using FragmentA = typename IteratorA::Fragment;
+
+  /// Fragment of operand B loaded from global memory
+  using FragmentB = typename IteratorB::Fragment;
+
+  /// Fragment of accumulator tile
+  using FragmentC = typename Policy::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+
+  /// Obtain the arch tag from the warp-level operator
+  using ArchTag = typename Policy::Operator::ArchTag;
+
+  /// Complex transform on A operand
+  static ComplexTransform const kTransformA = Operator::kTransformA;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB = Operator::kTransformB;
+
+  // Statically assert that kStages for EllMmaPipelined is two (double-buffered pipeline)
+  static_assert((Base::kStages==2), "EllMmaPipelined requires kStages set to value 2");
+
+private:
+
+  using WarpFragmentA = typename Operator::FragmentA;
+  using WarpFragmentB = typename Operator::FragmentB;
+
+protected:
+
+  /// Iterator to write threadblock-scoped tile of A operand to shared memory
+  SmemIteratorA smem_iterator_A_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB smem_iterator_B_;
+  /// ELL iterator type. NOTE(review): none of this header's visible includes names an ELL header - confirm the includer provides it.
+  using EllIterator = typename cutlass::transform::threadblock::ell::Iterator;
+
+public:
+  /// Construct from shared storage and the calling thread's position within the threadblock
+  CUTLASS_DEVICE
+  EllMmaPipelined(
+    typename Base::SharedStorage &shared_storage,       ///< Shared storage needed for internal use by threadblock-scoped GEMM
+    int thread_idx,                                     ///< ID within the threadblock
+    int warp_idx,                                       ///< ID of warp
+    int lane_idx                                        ///< ID of each thread within a warp
+  ):
+    Base(shared_storage, thread_idx, warp_idx, lane_idx),
+    smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
+    smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) {
+
+    // Compute warp location within threadblock tile by mapping the warp_id to
+    // three coordinates:
+    //   _m: the warp's position within the threadblock along the M dimension
+    //   _n: the warp's position within the threadblock along the N dimension
+    //   _k: the warp's position within the threadblock along the K dimension
+
+    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
+    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
+
+    int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
+    int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
+
+    // Add per-warp offsets in units of warp-level tiles
+    this->warp_tile_iterator_A_.add_tile_offset({warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
+    this->warp_tile_iterator_B_.add_tile_offset({Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
+    
+  }
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate. is_A_sparse picks which operand uses plain load(); is_offset_constant picks the *_fast ELL loads.
+  template<bool is_A_sparse, bool is_offset_constant>
+  CUTLASS_DEVICE
+  void operator()(
+    int gemm_k_iterations,                            ///< number of iterations of the mainloop
+    FragmentC &accum,                                 ///< destination accumulator tile
+    IteratorA iterator_A,                             ///< iterator over A operand in global memory
+    IteratorB iterator_B,                             ///< iterator over B operand in global memory
+    FragmentC const &src_accum,                       ///< source accumulator tile
+    EllIterator &ell_iterator,                        ///< ELL iterator used by the load_with_ell_index* calls; advanced once per global tile load
+    TransformA transform_A = TransformA(),            ///< transformation applied to A fragment
+    TransformB transform_B = TransformB()) {          ///< transformation applied to B fragment
+
+    //
+    // Prologue
+    //
+
+    // Perform accumulation in the 'd' output operand
+    accum = src_accum;
+
+    FragmentA tb_frag_A;
+    FragmentB tb_frag_B;
+
+    tb_frag_A.clear();
+    tb_frag_B.clear();
+
+    // Load the sparse operand (A when is_A_sparse, otherwise B) with a plain load()
+    if (is_A_sparse){
+      iterator_A.load(tb_frag_A);
+    } else {
+      iterator_B.load(tb_frag_B);
+    }
+    
+    // Load the other (dense) operand through the ELL iterator
+    if (is_offset_constant){
+      if (is_A_sparse){
+        iterator_B.load_with_ell_index_fast(tb_frag_B, ell_iterator);
+      } else {
+        iterator_A.load_with_ell_index_fast(tb_frag_A, ell_iterator);
+      }
+    } else {
+      if (is_A_sparse){
+        iterator_B.load_with_ell_index(tb_frag_B, ell_iterator);
+      } else {
+        iterator_A.load_with_ell_index(tb_frag_A, ell_iterator);
+      }
+    }
+
+    ++iterator_A;
+    ++iterator_B;
+    ++ell_iterator;
+
+    this->smem_iterator_A_.store(transform_A(tb_frag_A));
+    this->smem_iterator_B_.store(transform_B(tb_frag_B));
+
+    ++this->smem_iterator_A_;
+    ++this->smem_iterator_B_;
+
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math instructions
+    WarpFragmentA warp_frag_A[2];
+    WarpFragmentB warp_frag_B[2];
+
+    this->warp_tile_iterator_A_.set_kgroup_index(0);
+    this->warp_tile_iterator_B_.set_kgroup_index(0);
+
+    this->warp_tile_iterator_A_.load(warp_frag_A[0]);
+    this->warp_tile_iterator_B_.load(warp_frag_B[0]);
+
+    ++this->warp_tile_iterator_A_;
+    ++this->warp_tile_iterator_B_;
+
+    Operator warp_mma;
+
+    // Toggles between 0 and 1 each time a tile is written to shared memory in the mainloop
+    int smem_write_stage_idx = 1;
+
+    // Avoid reading out of bounds
+    iterator_A.clear_mask(gemm_k_iterations <= 1);
+    iterator_B.clear_mask(gemm_k_iterations <= 1);
+
+    // Apply the ELL block-size mask to the sparse operand's iterator only
+    if (is_A_sparse){
+      iterator_A.ell_add_mask(ell_iterator.get_blocksize());
+    }
+    else {
+      iterator_B.ell_add_mask(ell_iterator.get_blocksize());
+    }
+
+    // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing 
+    // shared memory loads (which have the tightest latency requirement).
+
+    //
+    // Mainloop
+    //
+
+    // Note: The main loop does not support Base::kWarpGemmIterations == 2.
+    CUTLASS_GEMM_LOOP
+    for (; gemm_k_iterations > 0; --gemm_k_iterations) {
+      //
+      // Loop over GEMM K dimension
+      //
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) {
+
+        // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group
+        // as the case may be.
+
+        if (warp_mma_k == Base::kWarpGemmIterations - 1) {
+
+          // Write the fragments fetched at warp_mma_k == 0 to shared memory
+          this->smem_iterator_A_.store(transform_A(tb_frag_A));
+
+          this->smem_iterator_B_.store(transform_B(tb_frag_B));
+
+          __syncthreads();
+          
+          ++this->smem_iterator_A_;
+          ++this->smem_iterator_B_;
+
+          // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
+          if (smem_write_stage_idx == 1) {
+            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
+            this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
+          }
+          else {
+            this->warp_tile_iterator_A_.add_tile_offset(
+                {0, -Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations});
+            this->warp_tile_iterator_B_.add_tile_offset(
+                {-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations,
+                 0});
+          }
+
+          smem_write_stage_idx ^= 1;
+        }
+
+        this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
+        this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
+        
+        this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k + 1) % 2]);
+        this->warp_tile_iterator_B_.load(warp_frag_B[(warp_mma_k + 1) % 2]);
+
+        ++this->warp_tile_iterator_A_;
+        ++this->warp_tile_iterator_B_;
+
+        if (warp_mma_k == 0) {
+          // Load the sparse operand (A when is_A_sparse, otherwise B) with a plain load()
+          if (is_A_sparse){
+            iterator_A.load(tb_frag_A);
+          } else {
+            iterator_B.load(tb_frag_B);
+          }
+
+          // Load the other (dense) operand through the ELL iterator
+          if (is_offset_constant){
+            if (is_A_sparse){
+              iterator_B.load_with_ell_index_fast(tb_frag_B, ell_iterator);
+            } else {
+              iterator_A.load_with_ell_index_fast(tb_frag_A, ell_iterator);
+            }
+          } else {
+            if (is_A_sparse){
+              iterator_B.load_with_ell_index(tb_frag_B, ell_iterator);
+            } else {
+              iterator_A.load_with_ell_index(tb_frag_A, ell_iterator);
+            }
+          }
+
+          ++iterator_A;
+          ++iterator_B;
+          ++ell_iterator;
+
+          // Avoid reading out of bounds if this was the last loop iteration
+          iterator_A.clear_mask(gemm_k_iterations <= 2);
+          iterator_B.clear_mask(gemm_k_iterations <= 2);
+        }
+
+        // Multiply-accumulate the warp fragments loaded for this k-group into accum
+        warp_mma(accum, warp_frag_A[warp_mma_k % 2],
+                 warp_frag_B[warp_mma_k % 2], accum);
+      }
+    }
+
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/gemv.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/gemv.h
new file mode 100755
index 000000000..e246ddce6
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/gemv.h
@@ -0,0 +1,147 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Template for a threadblock-scoped GEMV kernel.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/gemm/gemm.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Threadblock-scoped matrix-vector product computed with SIMT math instructions.
+template <
+  class Core_ //< GemvCore: supplies Shape, Operator, IteratorA/B/C and ThreadShape
+>
+class Gemv {
+public:
+  using Shape = typename Core_::Shape;  ///< Tile shape; Shape::kK is the K extent consumed per mainloop step
+
+  /// The MMA operator that computes GEMV 
+  using Operator = typename Core_::Operator;
+
+  /// Iterates over A in global memory
+  using IteratorA = typename Core_::IteratorA;
+
+  /// Iterates over B in global memory
+  using IteratorB = typename Core_::IteratorB;
+
+  /// Iterator type for operand C, taken from the core
+  using IteratorC = typename Core_::IteratorC;
+
+  /// Fragment of operand A loaded from global memory
+  using FragmentA = typename IteratorA::Fragment;
+
+  /// Fragment of operand B loaded from global memory
+  using FragmentB = typename IteratorB::Fragment;
+
+  /// Fragment of operand accumulator loaded/stored to global memory
+  using FragmentC = typename Operator::FragmentC;
+
+  /// Shape of the per-thread GEMV operation
+  using ThreadShape = typename Core_::ThreadShape;
+
+public:
+  CUTLASS_DEVICE
+  Gemv() { }
+  /// Accumulates the product over K into accum, starting from src_accum (the two may be the same object).
+  CUTLASS_DEVICE
+  void operator()(
+    GemmCoord const &problem_size,    ///< problem size of batched GEMV
+    FragmentC &accum,                 ///< destination accumulator tile
+    IteratorA iterator_A,             ///< iterator over A operand in global memory
+    IteratorB iterator_B,             ///< iterator over B operand in global memory
+    FragmentC const &src_accum) {     ///< source accumulator tile
+
+    //
+    // Prologue
+    //
+    accum = src_accum;  // seed from the source tile; previously src_accum was never read
+    FragmentA frag_A;
+    FragmentB frag_B;
+    frag_A.clear();
+    frag_B.clear();
+
+    // Fetch the fragments consumed by the first mainloop step
+    iterator_A.load(frag_A);
+    iterator_B.load(frag_B);
+    ++iterator_A;
+    ++iterator_B;
+
+    //
+    // Mainloop
+    //
+    Operator thread_mma;
+    int gemm_k = problem_size.k();
+
+    // Fewer than one full K tile in total: mask off the iterators before the look-ahead load
+    if (gemm_k < Shape::kK)
+    {
+      iterator_A.clear_mask();
+      iterator_B.clear_mask();
+    }
+
+    // iterate over K to accumulate result
+    CUTLASS_GEMM_LOOP
+    for (; gemm_k > 0; gemm_k -= Shape::kK) {
+      thread_mma(accum, frag_A, frag_B, accum);
+
+      // Look-ahead load of the fragments for the next step
+      iterator_A.load(frag_A);
+      iterator_B.load(frag_B);
+      ++iterator_A;
+      ++iterator_B;
+
+      if (gemm_k < Shape::kK)
+      {
+        iterator_A.clear_mask();
+        iterator_B.clear_mask();
+      }
+    }
+
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/index_remat.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/index_remat.h
new file mode 100755
index 000000000..8370f6145
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/index_remat.h
@@ -0,0 +1,107 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Helpers for rematerializing indices/dimensions in the thread hierarchy from special registers
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Re-reads threadIdx.x on every call instead of keeping a copy live. Reduces register liveness.
+CUTLASS_DEVICE
+int RematerializeThreadIdxX() {
+  return static_cast<int>(threadIdx.x);
+}
+
+/// Re-reads threadIdx.y on every call instead of keeping a copy live. Reduces register liveness.
+CUTLASS_DEVICE
+int RematerializeThreadIdxY() {
+  return static_cast<int>(threadIdx.y);
+}
+
+/// Re-reads threadIdx.z on every call instead of keeping a copy live. Reduces register liveness.
+CUTLASS_DEVICE
+int RematerializeThreadIdxZ() {
+  return static_cast<int>(threadIdx.z);
+}
+
+/// Re-reads blockIdx.x on every call instead of keeping a copy live. Reduces register liveness.
+CUTLASS_DEVICE
+int RematerializeBlockIdxX() {
+  return static_cast<int>(blockIdx.x);
+}
+
+/// Re-reads blockIdx.y on every call instead of keeping a copy live. Reduces register liveness.
+CUTLASS_DEVICE
+int RematerializeBlockIdxY() {
+  return static_cast<int>(blockIdx.y);
+}
+
+/// Re-reads blockIdx.z on every call instead of keeping a copy live. Reduces register liveness.
+CUTLASS_DEVICE
+int RematerializeBlockIdxZ() {
+  return static_cast<int>(blockIdx.z);
+}
+
+/// Re-reads blockDim.x on every call instead of keeping a copy live. Reduces register liveness.
+CUTLASS_DEVICE
+int RematerializeBlockDimX() {
+  return static_cast<int>(blockDim.x);
+}
+
+/// Re-reads blockDim.y on every call instead of keeping a copy live. Reduces register liveness.
+CUTLASS_DEVICE
+int RematerializeBlockDimY() {
+  return static_cast<int>(blockDim.y);
+}
+
+/// Re-reads blockDim.z on every call instead of keeping a copy live. Reduces register liveness.
+CUTLASS_DEVICE
+int RematerializeBlockDimZ() {
+  return static_cast<int>(blockDim.z);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
+
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_base.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_base.h
new file mode 100755
index 000000000..16ec65688
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_base.h
@@ -0,0 +1,236 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
+*/
+
+#pragma once
+
+#include "cutlass/tensor_ref.h"
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Compile-time policy bundle: warp-level operator, shared-memory padding and K partition count.
+template <
+    /// Warp-level GEMM operator type (concept: gemm::warp::Mma)
+    typename Operator_,
+    /// Shared-memory padding for the A operand (concept: MatrixShape)
+    typename SmemPaddingA_,
+    /// Shared-memory padding for the B operand (concept: MatrixShape)
+    typename SmemPaddingB_,
+    /// How many partitions the GEMM K dimension is split into
+    int PartitionsK = 1>
+struct MmaPolicy {
+  /// K-dimension partition count, re-exported from the template argument
+  static const int kPartitionsK = PartitionsK;
+
+  /// Shared-memory padding for the B operand
+  using SmemPaddingB = SmemPaddingB_;
+
+  /// Shared-memory padding for the A operand
+  using SmemPaddingA = SmemPaddingA_;
+
+  /// Warp-level GEMM operator (concept: gemm::warp::MmaTensorOp or gemm::warp::MmaSimt)
+  using Operator = Operator_;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Common base for threadblock-scoped MMAs: derives the warp tiling constants, declares the
+/// shared-memory operand storage, and owns the warp-level shared-memory tile iterators.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// Number of stages,
+    int Stages,
+    /// Used for partial specialization
+    typename Enable = bool>
+class MmaBase {
+ public:
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+
+  ///< Policy describing tuning details
+  using Policy = Policy_;
+
+  //
+  // Dependent types
+  //
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+
+  /// Shape describing the overall GEMM computed from shared memory
+  /// by each warp.
+  using WarpGemm = typename Policy::Operator::Shape;
+
+  /// Shape describing the number of warps filling the CTA
+  using WarpCount = GemmShape<Shape::kM / WarpGemm::kM,
+                              Shape::kN / WarpGemm::kN,
+                              Shape::kK / WarpGemm::kK>;
+
+  /// Number of warp-level GEMM operations
+  static int const kWarpGemmIterations =
+      (WarpGemm::kK / Operator::Policy::MmaShape::kK);
+
+  /// Number of stages
+  static int const kStages = Stages;
+
+  /// Tensor reference to the A operand
+  using TensorRefA = TensorRef<typename Operator::ElementA, typename Operator::LayoutA>;
+
+  /// Tensor reference to the B operand
+  using TensorRefB = TensorRef<typename Operator::ElementB, typename Operator::LayoutB>;
+
+  static_assert(kWarpGemmIterations > 1,
+                "The pipelined structure requires at least two warp-level "
+                "GEMM operations.");
+
+  static_assert((kWarpGemmIterations % 2) == 0,
+                "Inner loop iteration must be an even number.");
+
+  //
+  // Nested structs
+  //
+
+  /// Shared storage object needed by threadblock-scoped GEMM
+  class SharedStorage {
+   public:
+    //
+    // Type definitions
+    //
+
+    /// Shape of the A matrix operand in shared memory (kStages tiles along K, plus padding)
+    using ShapeA = MatrixShape<Shape::kM + Policy::SmemPaddingA::kRow,
+                               Shape::kK * kStages +
+                                   Policy::SmemPaddingA::kColumn>;
+
+    /// Shape of the B matrix operand in shared memory (kStages tiles along K, plus padding)
+    using ShapeB =
+        MatrixShape<Shape::kK * kStages + Policy::SmemPaddingB::kRow,
+                    Shape::kN + Policy::SmemPaddingB::kColumn>;
+
+   public:
+    //
+    // Data members
+    //
+
+    /// Buffer for A operand
+    AlignedBuffer<typename Operator::ElementA, ShapeA::kCount> operand_A;
+
+    /// Buffer for B operand
+    AlignedBuffer<typename Operator::ElementB, ShapeB::kCount> operand_B;
+
+   public:
+
+    //
+    // Methods
+    //
+
+    /// Returns a layout object for the A matrix (host/device, matching LayoutB() and operand_A_ref())
+    CUTLASS_HOST_DEVICE
+    static typename Operator::LayoutA LayoutA() {
+      return Operator::LayoutA::packed({ShapeA::kRow, ShapeA::kColumn});
+    }
+
+    /// Returns a layout object for the B matrix
+    CUTLASS_HOST_DEVICE
+    static typename Operator::LayoutB LayoutB() {
+      return Operator::LayoutB::packed({ShapeB::kRow, ShapeB::kColumn});
+    }
+
+    /// Returns a TensorRef to the A operand
+    CUTLASS_HOST_DEVICE
+    TensorRefA operand_A_ref() {
+      return TensorRefA{operand_A.data(), LayoutA()};
+    }
+
+    /// Returns a TensorRef to the B operand
+    CUTLASS_HOST_DEVICE
+    TensorRefB operand_B_ref() {
+      return TensorRefB{operand_B.data(), LayoutB()};
+    }
+  };
+
+ protected:
+
+  //
+  // Data members
+  //
+
+  /// Iterator to load a warp-scoped tile of A operand from shared memory
+  typename Operator::IteratorA warp_tile_iterator_A_;
+
+  /// Iterator to load a warp-scoped tile of B operand from shared memory
+  typename Operator::IteratorB warp_tile_iterator_B_;
+
+public:
+
+  /// Construct the warp tile iterators over shared storage; thread_idx and warp_idx are not read here
+  CUTLASS_DEVICE
+  MmaBase(
+      ///< Shared storage needed for internal use by threadblock-scoped GEMM
+      SharedStorage &shared_storage,
+      ///< ID within the threadblock
+      int thread_idx,
+      ///< ID of warp
+      int warp_idx,
+      ///< ID of each thread within a warp
+      int lane_idx
+    ):
+      warp_tile_iterator_A_(shared_storage.operand_A_ref(), lane_idx),
+      warp_tile_iterator_B_(shared_storage.operand_B_ref(), lane_idx) {
+
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_blas3_multistage.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_blas3_multistage.h
new file mode 100755
index 000000000..11eb20adb
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_blas3_multistage.h
@@ -0,0 +1,707 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a multistage threadblock-scoped GEMM kernel.
+    Used by BLAS3 kernels that need to treat diagonal elements of an input iterator as a special case.
+  
+*/
+
+#pragma once
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+
+#include "cutlass/gemm/threadblock/mma_base.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Threadblock-scoped multistage matrix multiply-accumulate for BLAS3 kernels;
+/// on-diagonal elements of A and B are special-cased while staging tiles via cp.async.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    /// Iterates over tiles of A operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorA_,
+    /// Iterates over tiles of A operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorA_,
+    /// Cache operation for operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB_,
+    /// Cache operation for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB,
+    /// Data type of accumulator matrix
+    typename ElementC_,
+    /// Layout of accumulator matrix
+    typename LayoutC_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// Number of stages,
+    int Stages,
+    /// Use zfill or predicate for out-of-bound cp.async
+    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kZfill,
+    /// Blas3 computation mode
+    BlasMode BlasMode_ = BlasMode::kTriangular,
+    /// Used for partial specialization
+    typename Enable = bool>
+class MmaBlas3Multistage : 
+  public MmaBase<Shape_, Policy_, Stages> {
+public:
+  ///< Base class
+  using Base = MmaBase<Shape_, Policy_, Stages>;
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+  ///< Iterates over tiles of A operand in global memory
+  using IteratorA = IteratorA_;
+  ///< Iterates over tiles of B operand in global memory
+  using IteratorB = IteratorB_;
+  ///< Data type of accumulator matrix
+  using ElementC = ElementC_;
+  ///< Layout of accumulator matrix
+  using LayoutC = LayoutC_;
+  ///< Policy describing tuning details
+  using Policy = Policy_;
+  ///< Blas Mode
+  static BlasMode const kBlasMode = BlasMode_;
+
+  using SmemIteratorA = SmemIteratorA_;
+  using SmemIteratorB = SmemIteratorB_;
+
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
+
+  //
+  // Dependent types
+  //
+
+  /// Fragment of accumulator tile
+  using FragmentC = typename Policy::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+
+  /// Minimum architecture is Sm80 to support cp.async
+  using ArchTag = arch::Sm80;
+  
+  /// Complex transform on A operand
+  static ComplexTransform const kTransformA = Operator::kTransformA;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB = Operator::kTransformB;
+
+  /// Compile-time pipeline constants, exposed for introspection.
+  struct Detail {
+
+    /// Number of stages (the Stages template argument)
+    static int const kStages = Stages;
+
+    /// Number of cp.async instructions to load one stage of operand A
+    static int const AsyncCopyIterationsPerStageA =
+        IteratorA::ThreadMap::Iterations::kCount;
+
+    /// Number of cp.async instructions to load one stage of operand B
+    static int const AsyncCopyIterationsPerStageB =
+        IteratorB::ThreadMap::Iterations::kCount;
+
+    /// Number of cp.async instructions to load one group of operand A (per-stage count / kWarpGemmIterations, rounded up)
+    static int const kAccessesPerGroupA = AsyncCopyIterationsPerStageA / Base::kWarpGemmIterations +
+        (AsyncCopyIterationsPerStageA % Base::kWarpGemmIterations != 0 ? 1 : 0);
+
+    /// Number of cp.async instructions to load one group of operand B (per-stage count / kWarpGemmIterations, rounded up)
+    static int const kAccessesPerGroupB = AsyncCopyIterationsPerStageB / Base::kWarpGemmIterations +
+        (AsyncCopyIterationsPerStageB % Base::kWarpGemmIterations != 0 ? 1 : 0);
+  };
+
+ private:
+
+  using WarpLoadedFragmentA = typename Operator::FragmentA;
+  using WarpLoadedFragmentB = typename Operator::FragmentB;
+  using WarpTransformedFragmentA = typename Operator::TransformedFragmentA;
+  using WarpTransformedFragmentB = typename Operator::TransformedFragmentB;
+
+ private:
+
+  //
+  // Data members
+  //
+
+  /// Iterator to write threadblock-scoped tile of A operand to shared memory
+  SmemIteratorA smem_iterator_A_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB smem_iterator_B_;
+
+public:
+
+  /// Set up the shared-memory write iterators and position the warp-level
+  /// read iterators on the tile owned by this warp
+  CUTLASS_DEVICE
+  MmaBlas3Multistage(
+      ///< Shared storage needed for internal use by threadblock-scoped GEMM
+      typename Base::SharedStorage &shared_storage,
+      ///< ID within the threadblock
+      int thread_idx,
+      ///< ID of warp
+      int warp_idx,
+      ///< ID of each thread within a warp
+      int lane_idx
+    ):
+      Base(shared_storage, thread_idx, warp_idx, lane_idx),
+      smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
+      smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx)
+  {
+    // Decompose the linear warp index into (m, n, k) coordinates within the
+    // threadblock tile: m varies fastest, then n, and k selects the slice of
+    // WarpCount::kM * WarpCount::kN warps.
+    int const k_slice = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
+    int const mn_rank = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
+
+    int const n_pos = mn_rank / Base::WarpCount::kM;
+    int const m_pos = mn_rank % Base::WarpCount::kM;
+
+    // Every k slice starts kWarpGemmIterations warp-level tiles further along K
+    auto const k_tile_offset = Base::kWarpGemmIterations * k_slice;
+
+    // Apply the per-warp offsets (in units of warp-level tiles): A is indexed
+    // as {m, k} and B as {k, n}
+    this->warp_tile_iterator_A_.add_tile_offset({m_pos, k_tile_offset});
+    this->warp_tile_iterator_B_.add_tile_offset({k_tile_offset, n_pos});
+  }
+  /// Issues one group of cp.async copies for operand A, then operand B, into shared memory; getOnDiag() elements use cp_async_diag.
+  CUTLASS_DEVICE
+  void copy_tiles_and_advance(IteratorA &iterator_A, IteratorB &iterator_B,
+                              int group_start_A = 0, int group_start_B = 0) {
+    iterator_A.set_iteration_index(group_start_A *
+                                   IteratorA::kAccessesPerVector);
+    this->smem_iterator_A_.set_iteration_index(group_start_A);
+
+    // Async Copy for operand A: at most kAccessesPerGroupA accesses, never past the end of the stage
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) {
+      if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) {
+        typename IteratorA::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorA::AccessType *>(
+                this->smem_iterator_A_.get());
+        // Bytes moved per access: element bits * elements per access / accesses per vector / 8
+        int const kSrcBytes = sizeof_bits<typename IteratorA::Element>::value *
+                              IteratorA::ThreadMap::kElementsPerAccess /
+                              IteratorA::kAccessesPerVector / 8;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
+          auto gmem_ptr = iterator_A.get();
+          bool isvalid = iterator_A.valid();
+
+          if (isvalid && iterator_A.getOnDiag()) {
+            // Elements that are on the diagonal
+            if (kBlasMode == BlasMode::kHermitian && cutlass::is_complex<typename IteratorA::Element>::value) {
+              /* Copy real part from gmem, write zero for imag part in smem */
+              /* The following logic to determine kSizeRealBytes is so that compiler doesn't complain when
+               * compiling for not complex datatype and using half the size for cp_async_zfill */
+              int const kSizeRealBytes = (platform::is_same<typename IteratorA::Element,
+                                          complex<double>>::value) ? 8 : 4;
+              cutlass::arch::cp_async_zfill<kSizeRealBytes, cutlass::arch::CacheOperation::Always>(
+                dst_ptr + v, gmem_ptr, true);
+              cutlass::arch::cp_async_diag<typename IteratorA::Element, true>(
+                reinterpret_cast<char *> (dst_ptr + v) + kSizeRealBytes);
+            } else {
+              /* Write one (1) directly to smem*/
+              cutlass::arch::cp_async_diag<typename IteratorA::Element>(dst_ptr + v);
+            }
+          } else {
+            // Elements that are not on the diagonal; isvalid is passed as the copy predicate
+            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
+                dst_ptr + v, gmem_ptr, isvalid);
+          }
+
+          ++iterator_A;
+        }
+
+        ++this->smem_iterator_A_;
+      }
+    }
+
+    iterator_B.set_iteration_index(group_start_B *
+                                   IteratorB::kAccessesPerVector);
+    this->smem_iterator_B_.set_iteration_index(group_start_B);
+
+    // Async Copy for operand B: at most kAccessesPerGroupB accesses, never past the end of the stage
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) {
+      if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) {
+        typename IteratorB::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorB::AccessType *>(
+                this->smem_iterator_B_.get());
+        // Bytes moved per access: element bits * elements per access / accesses per vector / 8
+        int const kSrcBytes = sizeof_bits<typename IteratorB::Element>::value *
+                              IteratorB::ThreadMap::kElementsPerAccess /
+                              IteratorB::kAccessesPerVector / 8;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
+          auto gmem_ptr = iterator_B.get();
+          bool isvalid = iterator_B.valid();
+
+          if (isvalid && iterator_B.getOnDiag()) {
+            // Elements that are on the diagonal
+            if (kBlasMode == BlasMode::kHermitian && cutlass::is_complex<typename IteratorB::Element>::value) {
+              /* Copy real part from gmem, write zero for imag part in smem */
+              int const kSizeRealBytes = (platform::is_same<typename IteratorB::Element,
+                                          complex<double>>::value) ? 8 : 4;
+              cutlass::arch::cp_async_zfill<kSizeRealBytes, cutlass::arch::CacheOperation::Always>(
+                dst_ptr + v, gmem_ptr, true);
+              cutlass::arch::cp_async_diag<typename IteratorB::Element, true>(
+                reinterpret_cast<char *> (dst_ptr + v) + kSizeRealBytes);
+            } else {
+              /* Write one (1) directly to smem*/
+              cutlass::arch::cp_async_diag<typename IteratorB::Element>(dst_ptr + v);
+            }
+          } else {
+            // Elements that are not on the diagonal; isvalid is passed as the copy predicate
+            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
+                dst_ptr + v, gmem_ptr, isvalid);
+          }
+
+          ++iterator_B;
+        }
+        ++this->smem_iterator_B_;
+      }
+    }
+  }
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate (accum starts from src_accum)
+  CUTLASS_DEVICE
+  void operator()(
+      ///< number of threadblock-tile iterations along the GEMM K dimension
+      int gemm_k_iterations,
+      ///< destination accumulator tile
+      FragmentC &accum,
+      ///< iterator over A operand in global memory
+      IteratorA iterator_A,
+      ///< iterator over B operand in global memory
+      IteratorB iterator_B,
+      ///< initial value of accumulator
+      FragmentC const &src_accum) {
+
+    //
+    // Prologue
+    //
+
+    // Issue kStages - 1 complete stages
+    CUTLASS_PRAGMA_UNROLL
+    for (int stage = 0; stage < Base::kStages - 1;
+         ++stage, --gemm_k_iterations) {
+
+      iterator_A.clear_mask(gemm_k_iterations == 0);
+      iterator_B.clear_mask(gemm_k_iterations == 0);
+
+      iterator_A.set_iteration_index(0);
+      this->smem_iterator_A_.set_iteration_index(0);
+
+      // Async Copy for operand A
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) {
+        typename IteratorA::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorA::AccessType *>(
+                this->smem_iterator_A_.get());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
+          int const kSrcBytes =
+              sizeof_bits<typename IteratorA::Element>::value *
+              IteratorA::ThreadMap::kElementsPerAccess /
+              IteratorA::kAccessesPerVector / 8;
+
+          auto gmem_ptr = iterator_A.get();
+          bool isvalid = iterator_A.valid();
+
+          if (isvalid && iterator_A.getOnDiag()) {
+            // Elements that are on the diagonal
+            if (kBlasMode == BlasMode::kHermitian && cutlass::is_complex<typename IteratorA::Element>::value) {
+              /* Copy real part from gmem, write zero for imag part in smem */
+              int const kSizeRealBytes = (platform::is_same<typename IteratorA::Element,
+                                          complex<double>>::value) ? 8 : 4;
+              cutlass::arch::cp_async_zfill<kSizeRealBytes, cutlass::arch::CacheOperation::Always>(
+                dst_ptr + v, gmem_ptr, true);
+              cutlass::arch::cp_async_diag<typename IteratorA::Element, true>(
+                reinterpret_cast<char *> (dst_ptr + v) + kSizeRealBytes);
+            } else {
+              /* Write one (1) directly to smem*/
+              cutlass::arch::cp_async_diag<typename IteratorA::Element>(dst_ptr + v);
+            }
+          } else {
+            // Elements that are not on the diagonal; isvalid is passed as the copy predicate
+            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
+                dst_ptr + v, gmem_ptr, isvalid);
+          }
+
+          ++iterator_A;
+        }
+
+        ++this->smem_iterator_A_;
+      }
+
+      iterator_B.set_iteration_index(0);
+      this->smem_iterator_B_.set_iteration_index(0);
+
+      // Async Copy for operand B
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) {
+        typename IteratorB::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorB::AccessType *>(
+                this->smem_iterator_B_.get());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
+          int const kSrcBytes =
+              sizeof_bits<typename IteratorB::Element>::value *
+              IteratorB::ThreadMap::kElementsPerAccess /
+              IteratorB::kAccessesPerVector / 8;
+
+          auto gmem_ptr = iterator_B.get();
+          bool isvalid = iterator_B.valid();
+
+          if (isvalid && iterator_B.getOnDiag()) {
+            // Elements that are on the diagonal
+            if (kBlasMode == BlasMode::kHermitian && cutlass::is_complex<typename IteratorB::Element>::value) {
+              /* Copy real part from gmem, write zero for imag part in smem */
+              int const kSizeRealBytes = (platform::is_same<typename IteratorB::Element,
+                                          complex<double>>::value) ? 8 : 4;
+              cutlass::arch::cp_async_zfill<kSizeRealBytes, cutlass::arch::CacheOperation::Always>(
+                dst_ptr + v, gmem_ptr, true);
+              cutlass::arch::cp_async_diag<typename IteratorB::Element, true>(
+                reinterpret_cast<char *> (dst_ptr + v) + kSizeRealBytes);
+            } else {
+              /* Write one (1) directly to smem*/
+              cutlass::arch::cp_async_diag<typename IteratorB::Element>(dst_ptr + v);
+            }
+          } else {
+            // Elements that are not on the diagonal; isvalid is passed as the copy predicate
+            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
+                dst_ptr + v, gmem_ptr, isvalid);
+          }
+
+          ++iterator_B;
+        }
+
+        ++this->smem_iterator_B_;
+      }
+
+      // Move to the next stage
+      iterator_A.add_tile_offset({0, 1});
+      iterator_B.add_tile_offset({1, 0});
+
+      this->smem_iterator_A_.add_tile_offset({0, 1});
+      this->smem_iterator_B_.add_tile_offset({1, 0});
+
+      // Defines the boundary of a stage of cp.async.
+      cutlass::arch::cp_async_fence();
+    }
+
+    // Perform accumulation in the 'd' output operand
+    accum = src_accum;
+
+    //
+    // Clear the remaining tiles of SMEM. This is a functional requirement for some kernels
+    // so that all accumulator elements outside the GEMM footprint are zero.
+    //
+
+    if (SharedMemoryClear == SharedMemoryClearOption::kClearLastStage) {
+
+      /// Copy of the A write iterator, so the member iterator itself is not advanced
+      SmemIteratorA last_smem_iterator_A(this->smem_iterator_A_);
+
+      typename IteratorA::AccessType zero_A;
+      zero_A.clear();
+
+      last_smem_iterator_A.set_iteration_index(0);
+
+      // Store zeros over one stage of operand A (plain stores, not cp.async)
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) {
+
+        typename IteratorA::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorA::AccessType *>(
+                last_smem_iterator_A.get());
+
+        *dst_ptr = zero_A;
+
+        ++last_smem_iterator_A;
+      }
+
+      /// Copy of the B write iterator, so the member iterator itself is not advanced
+      SmemIteratorB last_smem_iterator_B(this->smem_iterator_B_);
+      typename IteratorB::AccessType zero_B;
+
+      zero_B.clear();
+      last_smem_iterator_B.set_iteration_index(0);
+
+      // Store zeros over one stage of operand B (plain stores, not cp.async)
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) {
+
+        typename IteratorB::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorB::AccessType *>(
+                last_smem_iterator_B.get());
+
+        *dst_ptr = zero_B;
+
+        ++last_smem_iterator_B;
+      }
+    }
+
+    // Waits until kStages-2 stages have committed.
+    cutlass::arch::cp_async_wait<Base::kStages - 2>();
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math
+    // instructions
+    WarpLoadedFragmentA warp_loaded_frag_A[2];
+    WarpLoadedFragmentB warp_loaded_frag_B[2];
+    WarpTransformedFragmentA warp_transformed_frag_A[2];
+    WarpTransformedFragmentB warp_transformed_frag_B[2];
+
+    Operator warp_mma;
+
+    this->warp_tile_iterator_A_.set_kgroup_index(0);
+    this->warp_tile_iterator_B_.set_kgroup_index(0);
+
+    this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]);
+    this->warp_tile_iterator_B_.load(warp_loaded_frag_B[0]);
+
+    ++this->warp_tile_iterator_A_;
+    ++this->warp_tile_iterator_B_;
+
+    iterator_A.clear_mask(gemm_k_iterations == 0);
+    iterator_B.clear_mask(gemm_k_iterations == 0);
+
+    int smem_write_stage_idx = Base::kStages - 1;
+    int smem_read_stage_idx = 0;
+
+    warp_mma.transform(warp_transformed_frag_A[0], warp_transformed_frag_B[0],
+                       warp_loaded_frag_A[0], warp_loaded_frag_B[0]);
+
+    // tf32x3 kernels use staging accumulation. warp_mma uses a temporary
+    // accumulator and this temporary accumulator is added to the final
+    // accumulator once in every mainloop iteration.
+    plus<FragmentC> plus_accum;
+
+    FragmentC tmp_accum;
+
+    if (platform::is_same<typename Operator::MathOperator,
+                          arch::OpMultiplyAddFastF32>::value
+      || platform::is_same<typename Operator::MathOperator,
+                           arch::OpMultiplyAddComplexFastF32>::value) {
+
+      tmp_accum.clear();
+    }
+
+    //
+    // Mainloop
+    //
+
+    CUTLASS_GEMM_LOOP
+    for (; gemm_k_iterations > (-Base::kStages + 1);) {
+      //
+      // Loop over GEMM K dimension
+      //
+
+      // Computes a warp-level GEMM on data held in shared memory
+      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations;
+           ++warp_mma_k) {
+
+        // Load warp-level tiles from shared memory, wrapping to k offset if
+        // this is the last group as the case may be.
+
+        this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
+        this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
+
+        this->warp_tile_iterator_A_.load(warp_loaded_frag_A[(warp_mma_k + 1) % 2]);
+        this->warp_tile_iterator_B_.load(warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
+
+        ++this->warp_tile_iterator_A_;
+        ++this->warp_tile_iterator_B_;
+
+        if (warp_mma_k > 0)
+          warp_mma.transform(warp_transformed_frag_A[warp_mma_k % 2],
+                             warp_transformed_frag_B[warp_mma_k % 2],
+                             warp_loaded_frag_A[warp_mma_k % 2],
+                             warp_loaded_frag_B[warp_mma_k % 2]);
+
+        if (platform::is_same<typename Operator::MathOperator,
+                              arch::OpMultiplyAddFastF32>::value
+          || platform::is_same<typename Operator::MathOperator,
+                               arch::OpMultiplyAddComplexFastF32>::value) {
+
+          warp_mma(
+            tmp_accum,
+            warp_transformed_frag_A[warp_mma_k % 2],
+            warp_transformed_frag_B[warp_mma_k % 2],
+            tmp_accum
+          );
+
+          if (warp_mma_k == 0) {
+            accum = plus_accum(accum, tmp_accum);
+            tmp_accum.clear();
+          }
+        } else {
+          warp_mma(
+            accum,
+            warp_transformed_frag_A[warp_mma_k % 2],
+            warp_transformed_frag_B[warp_mma_k % 2],
+            accum
+          );
+        }
+
+        // Issue global->shared copies for this stage
+        if (warp_mma_k < Base::kWarpGemmIterations - 1) {
+          int group_start_iteration_A, group_start_iteration_B;
+
+          group_start_iteration_A = warp_mma_k * Detail::kAccessesPerGroupA;
+          group_start_iteration_B = warp_mma_k * Detail::kAccessesPerGroupB;
+
+          copy_tiles_and_advance(iterator_A, iterator_B, group_start_iteration_A,
+                               group_start_iteration_B);
+        }
+
+        if (warp_mma_k + 2 == Base::kWarpGemmIterations) {
+          int group_start_iteration_A, group_start_iteration_B;
+          group_start_iteration_A =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupA;
+          group_start_iteration_B =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupB;
+
+          copy_tiles_and_advance(iterator_A, iterator_B, group_start_iteration_A,
+                               group_start_iteration_B);
+
+          // Inserts a memory fence between stages of cp.async instructions.
+          cutlass::arch::cp_async_fence();
+
+          // Waits until kStages-2 stages have committed.
+          arch::cp_async_wait<Base::kStages - 2>();
+          __syncthreads();
+
+          // Move to the next stage
+          iterator_A.add_tile_offset({0, 1});
+          iterator_B.add_tile_offset({1, 0});
+
+          this->smem_iterator_A_.add_tile_offset({0, 1});
+          this->smem_iterator_B_.add_tile_offset({1, 0});
+
+          // Add negative offsets to return iterators to the 'start' of the
+          // circular buffer in shared memory
+          if (smem_write_stage_idx == (Base::kStages - 1)) {
+            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
+            this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
+            smem_write_stage_idx = 0;
+          } else {
+            ++smem_write_stage_idx;
+          }
+
+          if (smem_read_stage_idx == (Base::kStages - 1)) {
+            this->warp_tile_iterator_A_.add_tile_offset(
+                {0, -Base::kStages * Policy::kPartitionsK *
+                        Base::kWarpGemmIterations});
+            this->warp_tile_iterator_B_.add_tile_offset(
+                {-Base::kStages * Policy::kPartitionsK *
+                     Base::kWarpGemmIterations,
+                 0});
+            smem_read_stage_idx = 0;
+          } else {
+            ++smem_read_stage_idx;
+          }
+
+          --gemm_k_iterations;
+          iterator_A.clear_mask(gemm_k_iterations == 0);
+          iterator_B.clear_mask(gemm_k_iterations == 0);
+        }
+
+        // Do any conversions feeding the first stage at the end of the loop so
+        // we can start right away on mma instructions
+        if (warp_mma_k + 1 == Base::kWarpGemmIterations)
+          warp_mma.transform(warp_transformed_frag_A[(warp_mma_k + 1) % 2],
+                             warp_transformed_frag_B[(warp_mma_k + 1) % 2],
+                             warp_loaded_frag_A[(warp_mma_k + 1) % 2],
+                             warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
+      }
+
+    }
+
+    if (platform::is_same<typename Operator::MathOperator,
+                          arch::OpMultiplyAddFastF32>::value
+      || platform::is_same<typename Operator::MathOperator,
+                           arch::OpMultiplyAddComplexFastF32>::value) {
+      accum = plus_accum(accum, tmp_accum);
+    }
+
+    // Commit and drain all pending and predicated cp.async pnz from the GEMM
+    // mainloop.
+    //
+    // This drain is unconditional, so it already covers
+    // SharedMemoryClearOption::kZfill. A kZfill-only copy of the same
+    // fence / wait<0> / barrier sequence used to sit right in front of it and
+    // made those kernels drain and hit __syncthreads() twice back to back,
+    // with no shared-memory access in between; it was redundant and is gone.
+    cutlass::arch::cp_async_fence();
+    cutlass::arch::cp_async_wait<0>();
+    __syncthreads();
+
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_layernorm_mainloop_fusion_multistage.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_layernorm_mainloop_fusion_multistage.h
new file mode 100755
index 000000000..11ad54446
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_layernorm_mainloop_fusion_multistage.h
@@ -0,0 +1,863 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a multistage threadblock-scoped GEMM kernel.
+
+    It loads two loop-invariant vectors, mean and var, in the prologue and
+    stores them in the register file.  In the mainloop, it loads two
+    loop-variant vectors, gamma and beta, by using cp.async.  An elementwise
+    operation applies var, mean, gamma, and beta between ldmatrix and the
+    warp mma.
+*/
+
+#pragma once
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/transform/threadblock/predicated_scale_bias_vector_iterator.h"
+#include "cutlass/gemm/threadblock/mma_base.h"
+#include "cutlass/gemm/warp/layernorm_scale_bias_transform.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Base class for threadblock-scoped mainloop-fusion GEMMs: declares the shared
+/// memory storage and warp-level iterators for A, B and the gamma/beta vectors.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    /// Element type of scale and bias vectors 
+    typename ElementScaleBias_,
+    /// Layout of scale and bias vectors
+    typename LayoutScaleBias_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// WarpIterator to load Scale or Bias vector from the shared memory
+    typename WarpIteratorGammaBeta_,
+    /// Number of stages,
+    int Stages,
+    /// Used for partial specialization
+    typename Enable = bool>
+class MmaMainloopFusionBase {
+ public:
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+
+  ///< Element type of scale and bias vectors 
+  using ElementScaleBias = ElementScaleBias_;
+
+  /// Layout of scale and bias vectors
+  using LayoutScaleBias = LayoutScaleBias_;
+
+  ///< Policy describing tuning details
+  using Policy = Policy_;
+
+  ///< WarpIterator to load Scale or Bias vector from the shared memory
+  using WarpIteratorGammaBeta = WarpIteratorGammaBeta_;
+
+  //
+  // Dependent types
+  //
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+
+  /// Shape describing the overall GEMM computed from shared memory
+  /// by each warp.
+  using WarpGemm = typename Policy::Operator::Shape;
+
+  /// Shape describing the number of warps filling the CTA
+  using WarpCount = cutlass::gemm::GemmShape<Shape::kM / WarpGemm::kM,
+                                             Shape::kN / WarpGemm::kN,
+                                             Shape::kK / WarpGemm::kK>;
+
+  /// Number of warp-level GEMM operations (WarpGemm::kK / MmaShape::kK)
+  static int const kWarpGemmIterations =
+      (WarpGemm::kK / Operator::Policy::MmaShape::kK);
+
+  /// Number of stages
+  static int const kStages = Stages;
+
+  /// Tensor reference to the A operand
+  using TensorRefA = TensorRef<typename Operator::ElementA, typename Operator::LayoutA>;
+
+  /// Tensor reference to the scale and bias vectors
+  using TensorRefGammaBeta = TensorRef<ElementScaleBias, LayoutScaleBias>;
+
+  /// Tensor reference to the B operand
+  using TensorRefB = TensorRef<typename Operator::ElementB, typename Operator::LayoutB>;
+
+  //
+  // Nested structs
+  //
+
+  /// Shared storage object needed by threadblock-scoped GEMM
+  class SharedStorage {
+   public:
+    //
+    // Type definitions
+    //
+
+    /// Shape of the A matrix operand in shared memory (kStages tiles along K)
+    using ShapeA = MatrixShape<Shape::kM + Policy::SmemPaddingA::kRow,
+                               Shape::kK * kStages +
+                                   Policy::SmemPaddingA::kColumn>;
+
+    /// Shape of the A scale and bias vectors in shared memory: 2 * kK * kStages columns plus padding
+    using ShapeGammaBeta =
+        MatrixShape<1 + Policy::SmemPaddingA::kRow,
+                    2 * Shape::kK * kStages + Policy::SmemPaddingA::kColumn>;
+
+    /// Shape of the B matrix operand in shared memory (kStages tiles along K)
+    using ShapeB =
+        MatrixShape<Shape::kK * kStages + Policy::SmemPaddingB::kRow,
+                    Shape::kN + Policy::SmemPaddingB::kColumn>;
+
+   public:
+    //
+    // Data members
+    //
+
+    /// Buffer for A operand
+    AlignedBuffer<typename Operator::ElementA, ShapeA::kCount> operand_A;
+
+    /// Buffer for B operand
+    AlignedBuffer<typename Operator::ElementB, ShapeB::kCount> operand_B;
+
+    /// Buffer for A operand scale and bias (gamma and beta) vectors
+    AlignedBuffer<ElementScaleBias, ShapeGammaBeta::kCount> operand_A_gamma_beta;
+
+   public:
+
+    //
+    // Methods
+    //
+
+    /// Returns a packed layout object sized to ShapeA for the A matrix
+    CUTLASS_DEVICE
+    static typename Operator::LayoutA LayoutA() {
+      return Operator::LayoutA::packed({ShapeA::kRow, ShapeA::kColumn});
+    }
+
+    /// Returns a packed layout object sized to ShapeB for the B matrix
+    CUTLASS_HOST_DEVICE
+    static typename Operator::LayoutB LayoutB() {
+      return Operator::LayoutB::packed({ShapeB::kRow, ShapeB::kColumn});
+    }
+
+    /// Returns a packed layout object sized to ShapeGammaBeta for the A scale and bias vectors
+    CUTLASS_DEVICE
+    static LayoutScaleBias LayoutScaleBias() {
+      return LayoutScaleBias::packed(
+          {ShapeGammaBeta::kRow, ShapeGammaBeta::kColumn});
+    }
+
+    /// Returns a TensorRef to the A operand
+    CUTLASS_HOST_DEVICE
+    TensorRefA operand_A_ref() {
+      return TensorRefA{operand_A.data(), LayoutA()};
+    }
+
+    /// Returns a TensorRef to the B operand
+    CUTLASS_HOST_DEVICE
+    TensorRefB operand_B_ref() {
+      return TensorRefB{operand_B.data(), LayoutB()};
+    }
+
+    /// Returns a TensorRef to the A operand scale and bias (gamma/beta) buffer
+    CUTLASS_HOST_DEVICE
+    TensorRefGammaBeta operand_A_gamma_beta_ref() {
+      return TensorRefGammaBeta{operand_A_gamma_beta.data(), LayoutScaleBias()};
+    }
+  };
+
+ protected:
+
+  //
+  // Data members
+  //
+
+  /// Iterator to load a warp-scoped tile of A operand from shared memory
+  typename Operator::IteratorA warp_tile_iterator_A_;
+
+  /// Iterator to load a warp-scoped tile of A operand scale and bias vector
+  /// from shared memory
+  WarpIteratorGammaBeta warp_tile_iterator_A_gamma_beta_;
+
+  /// Iterator to load a warp-scoped tile of B operand from shared memory
+  typename Operator::IteratorB warp_tile_iterator_B_;
+
+public:
+
+  /// Construct the warp-level iterators over the shared-memory operand buffers
+  CUTLASS_DEVICE
+  MmaMainloopFusionBase(
+      ///< Shared storage needed for internal use by threadblock-scoped GEMM
+      SharedStorage &shared_storage,
+      ///< ID within the threadblock (not used by this base class)
+      int thread_idx,
+      ///< ID of warp (not used by this base class)
+      int warp_idx,
+      ///< ID of each thread within a warp
+      int lane_idx)
+      : warp_tile_iterator_A_(shared_storage.operand_A_ref(), lane_idx),
+        warp_tile_iterator_A_gamma_beta_(
+            shared_storage.operand_A_gamma_beta_ref(), lane_idx),
+        warp_tile_iterator_B_(shared_storage.operand_B_ref(), lane_idx) {}
+};
+
+
+/// Threadblock-scoped multistage GEMM mainloop that applies layernorm (var/mean,
+/// gamma/beta) to the A operand fragments before each warp-level MMA.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    /// Iterates over tiles of A operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorA_,
+    /// Iterates over tiles of A operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorA_,
+    /// Cache operation for operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB_,
+    /// Cache operation for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB,
+    /// Iterates over vectors of var and mean vector in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorVarMean_,
+    /// Iterates over vectors of scale and bias vector in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorGammaBeta_,
+    /// Iterates over vectors of scale and bias vector in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorGammaBeta_,
+    /// Cache operation for scale/bias operand 
+    cutlass::arch::CacheOperation::Kind CacheOpGammaBeta,
+    /// Data type of accumulator matrix
+    typename ElementC_,
+    /// Layout of accumulator matrix
+    typename LayoutC_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// WarpIterator to load Scale or Bias vector from the shared memory
+    typename WarpIteratorGammaBeta_,
+    /// Number of stages,
+    int Stages,
+    /// Use zfill or predicate for out-of-bound cp.async
+    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone,
+    /// Used for partial specialization
+    typename Enable = bool>
+class MmaLayernormMainloopFusionMultistage : 
+  public MmaMainloopFusionBase<Shape_, typename IteratorGammaBeta_::Element,
+                       typename IteratorGammaBeta_::Layout, Policy_, WarpIteratorGammaBeta_, Stages> {
+public:
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+  ///< Iterates over tiles of A operand in global memory
+  using IteratorA = IteratorA_;
+  ///< Iterates over tiles of B operand in global memory
+  using IteratorB = IteratorB_;
+  ///< Iterates over tiles of the var and mean vectors in global memory
+  using IteratorVarMean = IteratorVarMean_;
+  ///< Iterates over tiles of the scale and bias vectors in global memory
+  using IteratorGammaBeta = IteratorGammaBeta_;
+  ///< WarpIterator to load Scale or Bias vector from the shared memory
+  using WarpIteratorGammaBeta = WarpIteratorGammaBeta_;
+  ///< Policy describing tuning details
+  using Policy = Policy_;
+
+  ///< Base class
+  using Base = MmaMainloopFusionBase<Shape_, typename IteratorGammaBeta::Element, 
+                                     typename IteratorGammaBeta::Layout, Policy,
+                                     WarpIteratorGammaBeta, Stages>;
+
+  ///< Data type of accumulator matrix
+  using ElementC = ElementC_;
+  ///< Layout of accumulator matrix
+  using LayoutC = LayoutC_;
+
+  using SmemIteratorA = SmemIteratorA_;
+  using SmemIteratorB = SmemIteratorB_;
+  using SmemIteratorGammaBeta = SmemIteratorGammaBeta_;
+
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpGammaBeta =
+      CacheOpGammaBeta;
+
+  //
+  // Dependent types
+  //
+
+  /// Fragment of accumulator tile
+  using FragmentC = typename Policy::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+
+  /// Minimum architecture is Sm80 to support cp.async
+  using ArchTag = arch::Sm80;
+  
+  /// Complex transform on A operand
+  static ComplexTransform const kTransformA = Operator::kTransformA;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB = Operator::kTransformB;
+
+  /// Internal structure exposed for introspection.
+  struct Detail {
+
+    static_assert(Base::kWarpGemmIterations > 1,
+                  "The pipelined structure requires at least two warp-level "
+                  "GEMM operations.");
+
+    /// Number of cp.async iterations to load one stage of operand A
+    static int const AsyncCopyIterationsPerStageA =
+        IteratorA::ThreadMap::Iterations::kCount;
+
+    /// Number of cp.async iterations to load one stage of operand B
+    static int const AsyncCopyIterationsPerStageB =
+        IteratorB::ThreadMap::Iterations::kCount;
+
+    /// Number of stages
+    static int const kStages = Stages;
+
+    /// Accesses per group of operand A: one stage split (rounding up) over kWarpGemmIterations
+    static int const kAccessesPerGroupA =
+        (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
+
+    /// Accesses per group of operand B: one stage split (rounding up) over kWarpGemmIterations
+    static int const kAccessesPerGroupB =
+        (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
+  };
+
+ private:
+
+  using WarpLoadedFragmentA = typename Operator::FragmentA;
+  using WarpLoadedFragmentB = typename Operator::FragmentB;
+  using WarpTransformedFragmentA = typename Operator::TransformedFragmentA;
+  using WarpTransformedFragmentB = typename Operator::TransformedFragmentB;
+
+  using WarpLoadedFragmentVarMean = typename IteratorVarMean::Fragment;
+  using WarpLoadedFragmentGammaBeta =
+      typename WarpIteratorGammaBeta::Fragment;
+
+
+ private:
+
+  //
+  // Data members
+  //
+
+  /// Iterator to write threadblock-scoped tile of A operand to shared memory
+  SmemIteratorA smem_iterator_A_;
+
+  /// Iterator to write threadblock-scoped tile of A operand scale vector to shared memory
+  SmemIteratorGammaBeta smem_iterator_A_gamma_beta_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB smem_iterator_B_;
+
+  int warp_idx_m_;
+
+  int warp_idx_n_;
+
+public:
+
+  /// Construct from shared storage and the thread, warp and lane indices
+  CUTLASS_DEVICE
+  MmaLayernormMainloopFusionMultistage(
+      ///< Shared storage needed for internal use by threadblock-scoped GEMM
+      typename Base::SharedStorage &shared_storage,
+      ///< ID within the threadblock (selects this thread's slice for the smem write iterators)
+      int thread_idx,
+      ///< ID of warp (decomposed below into m/n/k warp coordinates)
+      int warp_idx,
+      ///< ID of each thread within a warp
+      int lane_idx
+    ):
+      Base(shared_storage, thread_idx, warp_idx, lane_idx),
+      smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
+      smem_iterator_A_gamma_beta_(shared_storage.operand_A_gamma_beta_ref(),
+                                  thread_idx),
+      smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx)
+  {
+    // Compute warp location within threadblock tile by mapping the warp_id to
+    // three coordinates:
+    //   _m: the warp's position within the threadblock along the M dimension
+    //   _n: the warp's position within the threadblock along the N dimension
+    //   _k: the warp's position within the threadblock along the K dimension
+
+    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
+    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
+
+    warp_idx_m_ = warp_idx_mn % Base::WarpCount::kM;
+    warp_idx_n_ = warp_idx_mn / Base::WarpCount::kM;
+
+    // Add per-warp offsets in units of warp-level tiles (A and gamma/beta share the same offset)
+    this->warp_tile_iterator_A_.add_tile_offset(
+        {warp_idx_m_, Base::kWarpGemmIterations * warp_idx_k});
+    this->warp_tile_iterator_A_gamma_beta_.add_tile_offset(
+        {warp_idx_m_, Base::kWarpGemmIterations * warp_idx_k});
+    this->warp_tile_iterator_B_.add_tile_offset(
+        {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n_});
+  }
+
+  CUTLASS_DEVICE  // Issues one group of cp.async copies for A and B (plus gamma/beta with group 0)
+  void copy_tiles_and_advance(IteratorA &iterator_A,
+                              IteratorGammaBeta &iterator_A_gamma_beta,
+                              IteratorB &iterator_B,
+                              int group_start_A = 0, int group_start_B = 0) {
+    iterator_A.set_iteration_index(group_start_A *
+                                   IteratorA::kAccessesPerVector);
+    this->smem_iterator_A_.set_iteration_index(group_start_A);
+
+    // Async copy for operand A: up to kAccessesPerGroupA accesses starting at group_start_A
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) {
+      if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) {
+        typename IteratorA::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorA::AccessType *>(
+                this->smem_iterator_A_.get());
+
+        int const kSrcBytes = sizeof_bits<typename IteratorA::Element>::value *
+                              IteratorA::ThreadMap::kElementsPerAccess /
+                              IteratorA::kAccessesPerVector / 8;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
+          auto gmem_ptr = iterator_A.get();
+
+          if (SharedMemoryClear == SharedMemoryClearOption::kZfill) {
+            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
+                dst_ptr + v, gmem_ptr, iterator_A.valid());
+          } else {
+            cutlass::arch::cp_async<kSrcBytes, kCacheOpA>(
+                dst_ptr + v, gmem_ptr, iterator_A.valid());
+          }
+
+          ++iterator_A;
+        }
+
+        ++this->smem_iterator_A_;
+      }
+    }
+
+    // Async copy for operand A scale and bias vectors.  They are small, so a
+    // single cp.async, issued only when group_start_A == 0, is enough.
+    if (group_start_A == 0) {
+      typename IteratorGammaBeta::AccessType *dst_ptr =
+          reinterpret_cast<typename IteratorGammaBeta::AccessType *>(
+              this->smem_iterator_A_gamma_beta_.get());
+
+      int const kSrcBytes =
+          sizeof_bits<typename IteratorGammaBeta::Element>::value *
+          IteratorGammaBeta::kElementsPerAccess / 8;
+
+      cutlass::arch::cp_async<kSrcBytes, kCacheOpGammaBeta>(
+          dst_ptr, iterator_A_gamma_beta.get(), iterator_A_gamma_beta.valid());
+    }
+
+    iterator_B.set_iteration_index(group_start_B *
+                                   IteratorB::kAccessesPerVector);
+    this->smem_iterator_B_.set_iteration_index(group_start_B);
+
+    // Async copy for operand B: up to kAccessesPerGroupB accesses starting at group_start_B
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) {
+      if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) {
+        typename IteratorB::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorB::AccessType *>(
+                this->smem_iterator_B_.get());
+
+        int const kSrcBytes = sizeof_bits<typename IteratorB::Element>::value *
+                              IteratorB::ThreadMap::kElementsPerAccess /
+                              IteratorB::kAccessesPerVector / 8;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
+          auto gmem_ptr = iterator_B.get();
+
+          if (SharedMemoryClear == SharedMemoryClearOption::kZfill) {
+            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
+                dst_ptr + v, gmem_ptr, iterator_B.valid());
+          } else {
+            cutlass::arch::cp_async<kSrcBytes, kCacheOpB>(
+                dst_ptr + v, gmem_ptr, iterator_B.valid());
+          }
+
+          ++iterator_B;
+        }
+        ++this->smem_iterator_B_;
+      }
+    }
+  }
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate
+  CUTLASS_DEVICE
+  void operator()(
+      ///< number of mainloop iterations along the GEMM K dimension
+      int gemm_k_iterations,
+      ///< destination accumulator tile
+      FragmentC &accum,
+      ///< iterator over A operand in global memory
+      IteratorA iterator_A,
+      ///< iterator over B operand in global memory
+      IteratorB iterator_B,
+      ///< iterator over var and mean vectors in global memory
+      IteratorVarMean iterator_var_mean,
+      ///< iterator over scale and bias vectors in global memory
+      IteratorGammaBeta iterator_A_gamma_beta,
+      ///< initial value of accumulator
+      FragmentC const &src_accum) {
+
+    //
+    // Prologue
+    //
+    // Load var/mean once, then issue kStages - 1 complete stages of cp.async
+
+    WarpLoadedFragmentVarMean warp_loaded_frag_var_mean;
+    iterator_var_mean.add_tile_offset({0, warp_idx_m_});
+    iterator_var_mean.load(warp_loaded_frag_var_mean);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int stage = 0; stage < Base::kStages - 1;
+         ++stage, --gemm_k_iterations) {
+
+      iterator_A.clear_mask(gemm_k_iterations == 0);
+      iterator_A_gamma_beta.clear_mask(gemm_k_iterations == 0);
+      iterator_B.clear_mask(gemm_k_iterations == 0);
+
+      iterator_A.set_iteration_index(0);
+      this->smem_iterator_A_.set_iteration_index(0);
+
+      // Async Copy for operand A
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) {
+        typename IteratorA::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorA::AccessType *>(
+                this->smem_iterator_A_.get());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
+          int const kSrcBytes =
+              sizeof_bits<typename IteratorA::Element>::value *
+              IteratorA::ThreadMap::kElementsPerAccess /
+              IteratorA::kAccessesPerVector / 8;
+
+          // iterator_A.valid() is passed to cp_async_zfill as the guard below.
+
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
+              dst_ptr + v, iterator_A.get(), iterator_A.valid());
+
+          ++iterator_A;
+        }
+
+        ++this->smem_iterator_A_;
+      }
+
+      // Async Copy for operand A scale and bias vectors.  Scale and bias
+      // vectors are small.  One iteration is enough.
+      {
+        typename IteratorGammaBeta::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorGammaBeta::AccessType *>(
+                this->smem_iterator_A_gamma_beta_.get());
+
+        int const kSrcBytes =
+            sizeof_bits<typename IteratorGammaBeta::Element>::value *
+            IteratorGammaBeta::kElementsPerAccess / 8;
+
+        cutlass::arch::cp_async<kSrcBytes, kCacheOpGammaBeta>(
+            dst_ptr, iterator_A_gamma_beta.get(), iterator_A_gamma_beta.valid());
+      }
+
+      iterator_B.set_iteration_index(0);
+      this->smem_iterator_B_.set_iteration_index(0);
+
+      // Async Copy for operand B
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) {
+        typename IteratorB::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorB::AccessType *>(
+                this->smem_iterator_B_.get());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
+          int const kSrcBytes =
+              sizeof_bits<typename IteratorB::Element>::value *
+              IteratorB::ThreadMap::kElementsPerAccess /
+              IteratorB::kAccessesPerVector / 8;
+
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
+              dst_ptr + v, iterator_B.get(), iterator_B.valid());
+
+          ++iterator_B;
+        }
+
+        ++this->smem_iterator_B_;
+      }
+
+      // Move to the next stage
+      iterator_A.add_tile_offset({0, 1});
+      iterator_A_gamma_beta.add_tile_offset({0, 1});
+      iterator_B.add_tile_offset({1, 0});
+
+      this->smem_iterator_A_.add_tile_offset({0, 1});
+      this->smem_iterator_A_gamma_beta_.add_tile_offset({0, 1});
+      this->smem_iterator_B_.add_tile_offset({1, 0});
+
+      // Defines the boundary of a stage of cp.async.
+      cutlass::arch::cp_async_fence();
+    }
+
+    // Perform accumulation in the 'd' output operand
+    accum = src_accum;
+
+    // Waits until kStages-2 stages have committed.
+    cutlass::arch::cp_async_wait<Base::kStages - 2>();
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math
+    // instructions
+    WarpLoadedFragmentA warp_loaded_frag_A[2];
+    WarpLoadedFragmentB warp_loaded_frag_B[2];
+    WarpLoadedFragmentGammaBeta warp_loaded_frag_A_gamma_beta[2];
+    WarpTransformedFragmentA warp_transformed_frag_A[2];
+    WarpTransformedFragmentB warp_transformed_frag_B[2];
+
+    Operator warp_mma;
+    cutlass::gemm::warp::LayernormScaleBiasTransform<WarpTransformedFragmentA,
+                                            WarpLoadedFragmentVarMean,
+                                            WarpLoadedFragmentGammaBeta>
+                         elementwise_transform;
+ 
+    this->warp_tile_iterator_A_.set_kgroup_index(0);
+    this->warp_tile_iterator_A_gamma_beta_.set_kgroup_index(0);
+    this->warp_tile_iterator_B_.set_kgroup_index(0);
+
+    this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]);
+    this->warp_tile_iterator_A_gamma_beta_.load(
+        warp_loaded_frag_A_gamma_beta[0]);
+    this->warp_tile_iterator_B_.load(warp_loaded_frag_B[0]);
+
+    ++this->warp_tile_iterator_A_;
+    ++this->warp_tile_iterator_A_gamma_beta_;
+    ++this->warp_tile_iterator_B_;
+
+    iterator_A.clear_mask(gemm_k_iterations == 0);
+    iterator_A_gamma_beta.clear_mask(gemm_k_iterations == 0);
+    iterator_B.clear_mask(gemm_k_iterations == 0);
+
+    int smem_write_stage_idx = Base::kStages - 1;
+    int smem_read_stage_idx = 0;
+
+    warp_mma.transform(warp_transformed_frag_A[0], warp_transformed_frag_B[0],
+                       warp_loaded_frag_A[0], warp_loaded_frag_B[0]);
+
+    elementwise_transform(warp_transformed_frag_A[0],
+                         warp_loaded_frag_var_mean,
+                         warp_loaded_frag_A_gamma_beta[0]);
+
+    //
+    // Mainloop
+    //
+
+    CUTLASS_GEMM_LOOP
+    for (; gemm_k_iterations > (-Base::kStages + 1);) {
+      //
+      // Loop over GEMM K dimension
+      //
+
+      // Computes a warp-level GEMM on data held in shared memory
+      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations;
+           ++warp_mma_k) {
+
+        // Load warp-level tiles from shared memory, wrapping to k offset if
+        // this is the last group as the case may be.
+
+        this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
+        this->warp_tile_iterator_A_gamma_beta_.set_kgroup_index(
+            (warp_mma_k + 1) % Base::kWarpGemmIterations);
+        this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
+        
+        this->warp_tile_iterator_A_.load(warp_loaded_frag_A[(warp_mma_k + 1) % 2]);
+        this->warp_tile_iterator_A_gamma_beta_.load(
+            warp_loaded_frag_A_gamma_beta[(warp_mma_k + 1) % 2]);
+        this->warp_tile_iterator_B_.load(warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
+
+        ++this->warp_tile_iterator_A_;
+        ++this->warp_tile_iterator_A_gamma_beta_;
+        ++this->warp_tile_iterator_B_;
+
+        if (warp_mma_k > 0) {
+          warp_mma.transform(warp_transformed_frag_A[warp_mma_k % 2],
+                             warp_transformed_frag_B[warp_mma_k % 2],
+                             warp_loaded_frag_A[warp_mma_k % 2],
+                             warp_loaded_frag_B[warp_mma_k % 2]);
+
+          elementwise_transform(warp_transformed_frag_A[warp_mma_k % 2],
+                               warp_loaded_frag_var_mean,
+                               warp_loaded_frag_A_gamma_beta[warp_mma_k % 2]);
+        }
+
+        warp_mma(
+          accum, 
+          warp_transformed_frag_A[warp_mma_k % 2],
+          warp_transformed_frag_B[warp_mma_k % 2], 
+          accum
+        );
+
+        // Issue global->shared copies for this stage
+        if (warp_mma_k < Base::kWarpGemmIterations - 1) {
+          int group_start_iteration_A, group_start_iteration_B;
+
+          group_start_iteration_A = warp_mma_k * Detail::kAccessesPerGroupA;
+          group_start_iteration_B = warp_mma_k * Detail::kAccessesPerGroupB;
+
+          copy_tiles_and_advance(iterator_A, iterator_A_gamma_beta, iterator_B,
+	  		       group_start_iteration_A, 
+                               group_start_iteration_B);
+        }
+
+        if (warp_mma_k + 2 == Base::kWarpGemmIterations) {
+          int group_start_iteration_A, group_start_iteration_B;
+          group_start_iteration_A =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupA;
+          group_start_iteration_B =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupB;
+
+          copy_tiles_and_advance(iterator_A, iterator_A_gamma_beta, iterator_B,
+	                               group_start_iteration_A, 
+                                 group_start_iteration_B);
+
+          // Inserts a memory fence between stages of cp.async instructions.
+          cutlass::arch::cp_async_fence();
+
+          // Waits until kStages-2 stages have committed.
+          arch::cp_async_wait<Base::kStages - 2>();
+          __syncthreads();
+
+          // Move to the next stage
+          iterator_A.add_tile_offset({0, 1});
+          iterator_A_gamma_beta.add_tile_offset({0, 1});
+          iterator_B.add_tile_offset({1, 0});
+
+          this->smem_iterator_A_.add_tile_offset({0, 1});
+          this->smem_iterator_A_gamma_beta_.add_tile_offset({0, 1});
+          this->smem_iterator_B_.add_tile_offset({1, 0});
+
+          // Add negative offsets to return iterators to the 'start' of the
+          // circular buffer in shared memory
+          if (smem_write_stage_idx == (Base::kStages - 1)) {
+            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
+            this->smem_iterator_A_gamma_beta_.add_tile_offset({0, -Base::kStages});
+            this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
+            smem_write_stage_idx = 0;
+          } else {
+            ++smem_write_stage_idx;
+          }
+
+          if (smem_read_stage_idx == (Base::kStages - 1)) {
+            this->warp_tile_iterator_A_.add_tile_offset(
+                {0, -Base::kStages * Policy::kPartitionsK *
+                        Base::kWarpGemmIterations});
+            this->warp_tile_iterator_A_gamma_beta_.add_tile_offset(
+                {0, -Base::kStages * Policy::kPartitionsK *
+                        Base::kWarpGemmIterations});
+            this->warp_tile_iterator_B_.add_tile_offset(
+                {-Base::kStages * Policy::kPartitionsK *
+                     Base::kWarpGemmIterations,
+                 0});
+            smem_read_stage_idx = 0;
+          } else {
+            ++smem_read_stage_idx;
+          }
+
+          --gemm_k_iterations;
+          iterator_A.clear_mask(gemm_k_iterations == 0);
+          iterator_A_gamma_beta.clear_mask(gemm_k_iterations == 0);
+          iterator_B.clear_mask(gemm_k_iterations == 0);
+        }
+
+        // Do any conversions feeding the first stage at the end of the loop so
+        // we can start right away on mma instructions
+        if (warp_mma_k + 1 == Base::kWarpGemmIterations) {
+          warp_mma.transform(warp_transformed_frag_A[(warp_mma_k + 1) % 2],
+                             warp_transformed_frag_B[(warp_mma_k + 1) % 2],
+                             warp_loaded_frag_A[(warp_mma_k + 1) % 2],
+                             warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
+
+          elementwise_transform(
+              warp_transformed_frag_A[(warp_mma_k + 1) % 2],
+              warp_loaded_frag_var_mean,
+              warp_loaded_frag_A_gamma_beta[(warp_mma_k + 1) % 2]);
+        }
+      }
+
+    }
+    
+    // commit and drain all pending and predicated cp.async pnz from the GEMM mainloop
+    cutlass::arch::cp_async_fence();
+    cutlass::arch::cp_async_wait<0>();
+    __syncthreads();
+
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_multistage.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_multistage.h
new file mode 100755
index 000000000..ef5513170
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_multistage.h
@@ -0,0 +1,741 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
+*/
+
+#pragma once
+
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+
+#include "cutlass/gemm/threadblock/mma_base.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the matrix product targeting CUDA cores and SIMT math
+/// instructions.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    /// Iterates over tiles of A operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorA_,
+    /// Iterates over tiles of A operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorA_,
+    /// Cache operation for operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB_,
+    /// Cache operation for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB,
+    /// Data type of accumulator matrix
+    typename ElementC_,
+    /// Layout of accumulator matrix
+    typename LayoutC_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// Number of stages,
+    int Stages,
+    /// Use zfill or predicate for out-of-bound cp.async
+    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone,
+    /// Used for partial specialization
+    typename Enable = bool>
+class MmaMultistage : 
+  public MmaBase<Shape_, Policy_, Stages> {
+public:
+  ///< Base class
+  using Base = MmaBase<Shape_, Policy_, Stages>;
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+  ///< Iterates over tiles of A operand in global memory
+  using IteratorA = IteratorA_;
+  ///< Iterates over tiles of B operand in global memory
+  using IteratorB = IteratorB_;
+  ///< Data type of accumulator matrix
+  using ElementC = ElementC_;
+  ///< Layout of accumulator matrix
+  using LayoutC = LayoutC_;
+  ///< Policy describing tuning details
+  using Policy = Policy_;
+
+  using SmemIteratorA = SmemIteratorA_;
+  using SmemIteratorB = SmemIteratorB_;
+
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
+
+  //
+  // Dependent types
+  //
+
+  /// Fragment of accumulator tile
+  using FragmentC = typename Policy::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+
+  /// Minimum architecture is Sm80 to support cp.async
+  using ArchTag = arch::Sm80;
+
+  /// Complex transform on A operand
+  static ComplexTransform const kTransformA = Operator::kTransformA;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB = Operator::kTransformB;
+
+  /// Compile-time constants derived from the template arguments, exposed for introspection.
+  struct Detail {
+
+    /// Number of cp.async instructions to load one stage of operand A
+    static int const AsyncCopyIterationsPerStageA =
+        IteratorA::ThreadMap::Iterations::kCount;
+
+    /// Number of cp.async instructions to load one stage of operand B
+    static int const AsyncCopyIterationsPerStageB =
+        IteratorB::ThreadMap::Iterations::kCount;
+
+    /// Number of pipeline stages (the `Stages` template argument)
+    static int const kStages = Stages;
+
+    /// cp.async instructions per group of operand A: ceil(AsyncCopyIterationsPerStageA / Base::kWarpGemmIterations)
+    static int const kAccessesPerGroupA =
+        (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
+
+    /// cp.async instructions per group of operand B: ceil(AsyncCopyIterationsPerStageB / Base::kWarpGemmIterations)
+    static int const kAccessesPerGroupB =
+        (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
+
+    // Optional staged-accumulation (e.g., tf32x3 kernels) for improved numerical
+    // accuracy, where each mainloop iteration first accumulates into a temporary
+    // set of freshly-cleared accumulators, which are subsequently added to the
+    // final accumulator set. Enabled when UseStagedAccumulation<Operator>::value is true.
+    static bool const kStagedAccumulation = arch::detail::UseStagedAccumulation<Operator>::value;
+  };
+
+ private:
+
+
+  // Structure encapsulating pipeline state live from one iteration to the next
+  struct PipeState {
+    // Fragment types as loaded from shared memory, and as produced by warp_mma_.transform()
+    using WarpLoadedFragmentA = typename Operator::FragmentA;
+    using WarpLoadedFragmentB = typename Operator::FragmentB;
+    using WarpTransformedFragmentA = typename Operator::TransformedFragmentA;
+    using WarpTransformedFragmentB = typename Operator::TransformedFragmentB;
+
+    /// Temporary accumulator to facilitate staged-accumulation (only touched when Detail::kStagedAccumulation is set)
+    FragmentC tmp_accum_;
+
+    /// Pair of A fragments used to overlap shared memory loads and math instructions (slot = warp_mma_k % 2)
+    WarpLoadedFragmentA warp_loaded_frag_A_[2];
+    WarpTransformedFragmentA warp_transformed_frag_A_[2];
+
+    /// Pair of B fragments used to overlap shared memory loads and math instructions (slot = warp_mma_k % 2)
+    WarpLoadedFragmentB warp_loaded_frag_B_[2];
+    WarpTransformedFragmentB warp_transformed_frag_B_[2];
+  };
+
+
+ private:
+
+  //
+  // Data members
+  //
+
+  /// Warp-level MMA operator
+  Operator warp_mma_;
+
+  /// Iterator to write threadblock-scoped tile of A operand to shared memory
+  SmemIteratorA smem_iterator_A_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB smem_iterator_B_;
+
+  /// Shared memory write stage index
+  int smem_write_stage_idx_;
+
+  /// Shared memory read stage index
+  int smem_read_stage_idx_;
+
+
+public:
+
+  /// Construct from shared storage and position this warp's shared-memory tile iterators
+  CUTLASS_DEVICE
+  MmaMultistage(
+      ///< Shared storage used internally by the threadblock-scoped GEMM
+      typename Base::SharedStorage &shared_storage,
+      ///< Thread index within the threadblock
+      int thread_idx,
+      ///< Warp index
+      int warp_idx,
+      ///< Index of the thread within its warp
+      int lane_idx
+    ):
+      Base(shared_storage, thread_idx, warp_idx, lane_idx),
+      smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
+      smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx),
+      smem_write_stage_idx_(0),
+      smem_read_stage_idx_(0)
+  {
+    // Split the linear warp index into three coordinates inside the threadblock tile.
+    // Warps are numbered along M first, then N, then K:
+    //   m_pos:  the warp's position along the M dimension
+    //   n_pos:  the warp's position along the N dimension
+    //   k_part: the warp's position along the K dimension
+    int const kWarpsPerKPartition = Base::WarpCount::kM * Base::WarpCount::kN;
+
+    int const k_part = warp_idx / kWarpsPerKPartition;
+    int const mn_rank = warp_idx % kWarpsPerKPartition;
+    int const m_pos = mn_rank % Base::WarpCount::kM;
+    int const n_pos = mn_rank / Base::WarpCount::kM;
+
+    // Offsets are in units of warp-level tiles; each K position spans Base::kWarpGemmIterations tiles
+    int const k_tile_offset = Base::kWarpGemmIterations * k_part;
+
+    this->warp_tile_iterator_A_.add_tile_offset({m_pos, k_tile_offset});
+    this->warp_tile_iterator_B_.add_tile_offset({k_tile_offset, n_pos});
+  }
+
+  /// Step the shared-memory read stage forward by one, wrapping at Base::kStages
+  CUTLASS_DEVICE
+  void advance_smem_read_stage()
+  {
+    if (++smem_read_stage_idx_ != Base::kStages) {
+      return;  // still inside the circular buffer; the warp-tile iterators need no correction
+    }
+    // Past the last stage: rewind both warp-tile iterators to the 'start' of the circular buffer
+    int const kRewindTiles = Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations;
+    this->warp_tile_iterator_A_.add_tile_offset({0, -kRewindTiles});
+    this->warp_tile_iterator_B_.add_tile_offset({-kRewindTiles, 0});
+    smem_read_stage_idx_ = 0;
+  }
+
+  /// Advance the global-memory read iterators and shared-memory write iterators to the next stage
+  CUTLASS_DEVICE
+  void advance_smem_write_stage(
+    IteratorA &iterator_A,
+    IteratorB &iterator_B)
+  {
+    // Operand A: step one tile by {0, 1} in both global and shared memory
+    iterator_A.add_tile_offset({0, 1});
+    this->smem_iterator_A_.add_tile_offset({0, 1});
+
+    // Operand B: step one tile by {1, 0} in both global and shared memory
+    iterator_B.add_tile_offset({1, 0});
+    this->smem_iterator_B_.add_tile_offset({1, 0});
+
+    // Count the stage just written; done unless that was the last slot of the circular buffer
+    if (++smem_write_stage_idx_ != Base::kStages) {
+      return;
+    }
+
+    // Wrap the shared-memory write iterators back to the 'start' of the circular buffer
+    this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
+    this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
+    smem_write_stage_idx_ = 0;
+  }
+
+  /// Issue one group of global->shared cp.async copies for operands A and B.
+  /// group_start_A / group_start_B select the first per-stage access of the group;
+  /// up to Detail::kAccessesPerGroupA / kAccessesPerGroupB accesses are issued from there,
+  /// and indices at or beyond Detail::AsyncCopyIterationsPerStageA / B are skipped.
+  CUTLASS_DEVICE
+  void copy_tiles_and_advance(IteratorA &iterator_A, IteratorB &iterator_B,
+                              int group_start_A = 0, int group_start_B = 0) {
+    iterator_A.set_iteration_index(group_start_A *
+                                   IteratorA::kAccessesPerVector);
+    this->smem_iterator_A_.set_iteration_index(group_start_A);
+
+    // Async copy for operand A; iterator_A.valid() guards each copy (zfill variant when SharedMemoryClear is kZfill)
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) {
+      if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) {
+        typename IteratorA::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorA::AccessType *>(
+                this->smem_iterator_A_.get());
+        // Bytes moved by each cp.async: one access split into kAccessesPerVector copies
+        int const kSrcBytes = sizeof_bits<typename IteratorA::Element>::value *
+                              IteratorA::ThreadMap::kElementsPerAccess /
+                              IteratorA::kAccessesPerVector / 8;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
+          auto gmem_ptr = iterator_A.get();
+          if (SharedMemoryClear == SharedMemoryClearOption::kZfill) {
+            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
+                dst_ptr + v, gmem_ptr, iterator_A.valid());
+          } else {
+            cutlass::arch::cp_async<kSrcBytes, kCacheOpA>(
+                dst_ptr + v, gmem_ptr, iterator_A.valid());
+          }
+          ++iterator_A;
+        }
+
+        ++this->smem_iterator_A_;
+      }
+    }
+
+    iterator_B.set_iteration_index(group_start_B *
+                                   IteratorB::kAccessesPerVector);
+    this->smem_iterator_B_.set_iteration_index(group_start_B);
+
+    // Async copy for operand B; same scheme as operand A above
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) {
+      if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) {
+        typename IteratorB::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorB::AccessType *>(
+                this->smem_iterator_B_.get());
+        // Bytes moved by each cp.async: one access split into kAccessesPerVector copies
+        int const kSrcBytes = sizeof_bits<typename IteratorB::Element>::value *
+                              IteratorB::ThreadMap::kElementsPerAccess /
+                              IteratorB::kAccessesPerVector / 8;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
+          auto gmem_ptr = iterator_B.get();
+          if (SharedMemoryClear == SharedMemoryClearOption::kZfill) {
+            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
+                dst_ptr + v, gmem_ptr, iterator_B.valid());
+          } else {
+            cutlass::arch::cp_async<kSrcBytes, kCacheOpB>(
+                dst_ptr + v, gmem_ptr, iterator_B.valid());
+          }
+          ++iterator_B;
+        }
+        ++this->smem_iterator_B_;
+      }
+    }
+  }
+
+  /// GEMM prologue.  Bootstrap the global->shared memory pipeline by fetching
+  /// the global fragments needed by the first kStages-1 threadblock mainloop iterations
+  CUTLASS_DEVICE
+  void prologue(
+    IteratorA &iterator_A,      ///< [in|out] iterator over A operand in global memory
+    IteratorB &iterator_B,      ///< [in|out] iterator over B operand in global memory
+    int &gemm_k_iterations)     ///< [in|out] number of threadblock mainloop iterations remaining
+  {
+    // Issue several complete stages; gemm_k_iterations is decremented once per stage issued
+    CUTLASS_PRAGMA_UNROLL
+    for (int stage = 0; stage < Base::kStages - 1; ++stage, --gemm_k_iterations) {
+
+      // Disable global fetching if done with global fetch iterations
+      iterator_A.clear_mask(gemm_k_iterations == 0);
+      iterator_B.clear_mask(gemm_k_iterations == 0);
+
+      iterator_A.set_iteration_index(0);
+      this->smem_iterator_A_.set_iteration_index(0);
+
+      // Async Copy for operand A
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) {
+        typename IteratorA::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorA::AccessType *>(
+                this->smem_iterator_A_.get());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
+          int const kSrcBytes =
+              sizeof_bits<typename IteratorA::Element>::value *
+              IteratorA::ThreadMap::kElementsPerAccess /
+              IteratorA::kAccessesPerVector / 8;
+
+          // The prologue always issues the zfill variant, whatever SharedMemoryClear is;
+          // iterator_A.valid() is passed as the guard of the copy.
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
+              dst_ptr + v, iterator_A.get(), iterator_A.valid());
+
+          ++iterator_A;
+        }
+
+        ++this->smem_iterator_A_;
+      }
+
+      iterator_B.set_iteration_index(0);
+      this->smem_iterator_B_.set_iteration_index(0);
+
+      // Async Copy for operand B
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) {
+        typename IteratorB::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorB::AccessType *>(
+                this->smem_iterator_B_.get());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
+          int const kSrcBytes =
+              sizeof_bits<typename IteratorB::Element>::value *
+              IteratorB::ThreadMap::kElementsPerAccess /
+              IteratorB::kAccessesPerVector / 8;
+
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
+              dst_ptr + v, iterator_B.get(), iterator_B.valid());
+
+          ++iterator_B;
+        }
+
+        ++this->smem_iterator_B_;
+      }
+
+      // Move to the next write stage
+      advance_smem_write_stage(iterator_A, iterator_B);
+
+      // Defines the boundary of a stage of cp.async.
+      cutlass::arch::cp_async_fence();
+    }
+
+    // Optionally clear the remaining stages of SMEM. This is a functional requirement for
+    // some kernels so that all accumulator elements outside the GEMM footprint are zero.
+    if (SharedMemoryClear == SharedMemoryClearOption::kClearLastStage) {
+
+      /// Copy of the A write iterator, so that smem_iterator_A_ itself is not advanced
+      SmemIteratorA last_smem_iterator_A(this->smem_iterator_A_);
+      typename IteratorA::AccessType zero_A;
+
+      zero_A.clear();
+      last_smem_iterator_A.set_iteration_index(0);
+
+      // Zero one stage of operand A with ordinary stores (not cp.async)
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) {
+
+        typename IteratorA::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorA::AccessType *>(
+                last_smem_iterator_A.get());
+
+        *dst_ptr = zero_A;
+
+        ++last_smem_iterator_A;
+      }
+
+      /// Copy of the B write iterator, so that smem_iterator_B_ itself is not advanced
+      SmemIteratorB last_smem_iterator_B(this->smem_iterator_B_);
+      typename IteratorB::AccessType zero_B;
+
+      zero_B.clear();
+      last_smem_iterator_B.set_iteration_index(0);
+
+      // Zero one stage of operand B with ordinary stores (not cp.async)
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) {
+
+        typename IteratorB::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorB::AccessType *>(
+                last_smem_iterator_B.get());
+
+        *dst_ptr = zero_B;
+
+        ++last_smem_iterator_B;
+      }
+    }
+  }
+
+
+  /// Wait until we have at least one completed global fetch stage, then synchronize the threadblock
+  CUTLASS_DEVICE
+  void gmem_wait()
+  {
+    // Wait until we have at least one committed global fetch stage. (#uncommitted = Base::kStages - 1 - #committed)
+    cutlass::arch::cp_async_wait<Base::kStages - 2>();  // NOTE(review): presumably leaves at most kStages - 2 groups pending -- confirm
+    __syncthreads();  // threadblock-wide barrier before any warp reads the stage from shared memory
+  }
+
+
+  /// Perform one threadblock mainloop iteration: Base::kWarpGemmIterations warp-level MMAs, interleaved with the cp.async copies of a later stage
+  CUTLASS_DEVICE
+  void mac_loop_iter(
+    PipeState &pipe_state,          ///< [in|out] loop-carried pipeline state
+    FragmentC &accum,               ///< [in|out] destination accumulator tile
+    IteratorA &iterator_A,          ///< [in|out] iterator over A operand in global memory
+    IteratorB &iterator_B,          ///< [in|out] iterator over B operand in global memory
+    int &gemm_k_iterations)         ///< [in|out] number of threadblock mainloop iterations remaining
+  {
+    // Unroll the warp-level MMA tiles of a threadblock's mainloop iteration
+    CUTLASS_PRAGMA_UNROLL
+    for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) {
+
+      // Load the next warp-tile's A fragment from shared memory (into the other slot of the fragment pair)
+      this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
+      this->warp_tile_iterator_A_.load(pipe_state.warp_loaded_frag_A_[(warp_mma_k + 1) % 2]);
+      ++this->warp_tile_iterator_A_;
+
+      // Load the next warp-tile's B fragment from shared memory (into the other slot of the fragment pair)
+      this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
+      this->warp_tile_iterator_B_.load(pipe_state.warp_loaded_frag_B_[(warp_mma_k + 1) % 2]);
+      ++this->warp_tile_iterator_B_;
+
+      // Except for the first warp-tile, all warp-tiles convert their incoming shared memory fragments as necessary
+      if (warp_mma_k > 0) {
+        warp_mma_.transform(
+          pipe_state.warp_transformed_frag_A_[warp_mma_k % 2],
+          pipe_state.warp_transformed_frag_B_[warp_mma_k % 2],
+          pipe_state.warp_loaded_frag_A_[warp_mma_k % 2],
+          pipe_state.warp_loaded_frag_B_[warp_mma_k % 2]);
+      }
+
+      // Execute the current warp-tile of MMA operations
+      if (Detail::kStagedAccumulation) {
+        warp_mma_(
+          pipe_state.tmp_accum_,
+          pipe_state.warp_transformed_frag_A_[warp_mma_k % 2],
+          pipe_state.warp_transformed_frag_B_[warp_mma_k % 2],
+          pipe_state.tmp_accum_
+        );
+        // On the first warp-tile, fold the temporary accumulator into the final one and start a fresh temporary
+        if (warp_mma_k == 0) {
+          plus<FragmentC> plus_accum;
+          accum = plus_accum(accum, pipe_state.tmp_accum_);
+          pipe_state.tmp_accum_.clear();
+        }
+      } else {
+        warp_mma_(
+          accum,
+          pipe_state.warp_transformed_frag_A_[warp_mma_k % 2],
+          pipe_state.warp_transformed_frag_B_[warp_mma_k % 2],
+          accum
+        );
+      }
+
+      // Except for the last warp-tile, all warp-tiles issue their share of
+      // global->shared fragment copies
+      if (warp_mma_k < Base::kWarpGemmIterations - 1) {
+
+        int group_start_iteration_A, group_start_iteration_B;
+        group_start_iteration_A = warp_mma_k * Detail::kAccessesPerGroupA;
+        group_start_iteration_B = warp_mma_k * Detail::kAccessesPerGroupB;
+
+        copy_tiles_and_advance(
+            iterator_A,
+            iterator_B,
+            group_start_iteration_A,
+            group_start_iteration_B);
+      }
+
+      // The second-to-last warp-tile also:
+      //   - performs the last warp-tile's share of global->shared fragment copies
+      //   - moves to the next global fetch stage
+      if (warp_mma_k + 2 == Base::kWarpGemmIterations) {
+
+        // Performs the last warp-tile's share of global->shared fragment copies
+        int group_start_iteration_A = (warp_mma_k + 1) * Detail::kAccessesPerGroupA;
+        int group_start_iteration_B = (warp_mma_k + 1) * Detail::kAccessesPerGroupB;
+
+        copy_tiles_and_advance(
+          iterator_A,
+          iterator_B,
+          group_start_iteration_A,
+          group_start_iteration_B);
+
+        // Inserts a memory fence between stages of cp.async instructions.
+        cutlass::arch::cp_async_fence();
+
+        // Wait until we have at least one completed global fetch stage
+        gmem_wait();
+
+        // Move to the next global fetch stage (both the write side and the read side)
+        advance_smem_write_stage(iterator_A, iterator_B);
+        advance_smem_read_stage();
+
+        // Disable global fetching when done with global fetch iterations
+        --gemm_k_iterations;
+        iterator_A.clear_mask(gemm_k_iterations == 0);
+        iterator_B.clear_mask(gemm_k_iterations == 0);
+      }
+
+      // The last warp-tile also converts the shared memory fragments used by
+      // the first warp-tile of the next iteration, if necessary (so we can
+      // immediately start issuing MMA instructions at the top of the loop )
+      if (warp_mma_k + 1 == Base::kWarpGemmIterations) {
+
+        warp_mma_.transform(
+          pipe_state.warp_transformed_frag_A_[(warp_mma_k + 1) % 2],
+          pipe_state.warp_transformed_frag_B_[(warp_mma_k + 1) % 2],
+          pipe_state.warp_loaded_frag_A_[(warp_mma_k + 1) % 2],
+          pipe_state.warp_loaded_frag_B_[(warp_mma_k + 1) % 2]);
+      }
+
+    }
+  }
+
+
+  /// Perform the specified number of threadblock mainloop iterations of matrix
+  /// multiply-accumulate.  Assumes prologue has been initiated.
+  CUTLASS_DEVICE
+  void gemm_iters(
+      int gemm_k_iterations,        ///< number of threadblock mainloop iterations
+      FragmentC &accum,             ///< [in|out] accumulator tile
+      IteratorA &iterator_A,        ///< [in|out] iterator over A operand in global memory
+      IteratorB &iterator_B)        ///< [in|out] iterator over B operand in global memory
+  {
+    PipeState pipe_state;
+
+    // Disable global fetching if done with global fetch iterations
+    iterator_A.clear_mask(gemm_k_iterations == 0);
+    iterator_B.clear_mask(gemm_k_iterations == 0);
+
+    // Load first warp-tile's A fragment from shared memory
+    this->warp_tile_iterator_A_.set_kgroup_index(0);
+    this->warp_tile_iterator_A_.load(pipe_state.warp_loaded_frag_A_[0]);
+    ++this->warp_tile_iterator_A_;
+
+    // Load first warp-tile's B fragment from shared memory
+    this->warp_tile_iterator_B_.set_kgroup_index(0);
+    this->warp_tile_iterator_B_.load(pipe_state.warp_loaded_frag_B_[0]);
+    ++this->warp_tile_iterator_B_;
+
+    // Transform, if necessary, the first warp-tile's shared memory fragments
+    warp_mma_.transform(
+      pipe_state.warp_transformed_frag_A_[0],
+      pipe_state.warp_transformed_frag_B_[0],
+      pipe_state.warp_loaded_frag_A_[0],
+      pipe_state.warp_loaded_frag_B_[0]);
+
+    if (Detail::kStagedAccumulation) {
+      pipe_state.tmp_accum_.clear();
+    }
+    // Mainloop. When called from operator(), prologue() has already decremented the count
+    // Base::kStages - 1 times, so the loop runs down to a negative bound rather than to zero.
+    CUTLASS_GEMM_LOOP
+    for (; gemm_k_iterations > (-Base::kStages + 1);) {
+      mac_loop_iter(
+        pipe_state,
+        accum,
+        iterator_A,
+        iterator_B,
+        gemm_k_iterations);
+    }
+
+    if (Detail::kStagedAccumulation) {
+      plus<FragmentC> plus_accum;
+      accum = plus_accum(accum, pipe_state.tmp_accum_);
+    }
+
+    // Commit and drain all pending and predicated cp.async operations issued by the GEMM mainloop
+    cutlass::arch::cp_async_fence();
+    cutlass::arch::cp_async_wait<0>();
+    __syncthreads();
+
+  }
+
+
+  /// Prepares the class for another prologue.
+  CUTLASS_DEVICE
+  void wind_down()
+  {
+    // Catch-up the smem-read iterator to the smem-write iterator (so this class can be reused for another tile's prologue)
+
+    // First, increment remaining warp tiles to get to the next full stage.  (Ideally we would
+    // just decrement one tile, but not all iterators implement --() decrement.)
+    CUTLASS_PRAGMA_UNROLL
+    for (int warp_mma_k = 1; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k)
+    {
+      this->warp_tile_iterator_A_.set_kgroup_index(warp_mma_k);
+      this->warp_tile_iterator_B_.set_kgroup_index(warp_mma_k);
+
+      ++this->warp_tile_iterator_A_;
+      ++this->warp_tile_iterator_B_;
+    }
+    smem_read_stage_idx_++;
+
+    // Then wrap back two full stages (one for the tile advancing we just did, and one to catch the write iterators)
+    static const int kStageIters = Policy::kPartitionsK * Base::kWarpGemmIterations;
+    if (smem_read_stage_idx_ > 1)
+    {
+      this->warp_tile_iterator_A_.add_tile_offset({0, (-2 * kStageIters)});
+      this->warp_tile_iterator_B_.add_tile_offset({(-2 * kStageIters), 0});
+    }
+    else
+    {
+      this->warp_tile_iterator_A_.add_tile_offset({0, ((Base::kStages - 2) * kStageIters)});
+      this->warp_tile_iterator_B_.add_tile_offset({((Base::kStages - 2) * kStageIters), 0});
+    }
+    smem_read_stage_idx_ = smem_write_stage_idx_;
+  }
+
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate (prologue, first-stage wait, then the mainloop)
+  CUTLASS_DEVICE
+  void operator()(
+      ///< number of threadblock mainloop iterations to perform
+      int gemm_k_iterations,
+      ///< destination accumulator tile
+      FragmentC &accum,
+      ///< iterator over A operand in global memory (taken by value; the caller's iterator is not advanced)
+      IteratorA iterator_A,
+      ///< iterator over B operand in global memory (taken by value; the caller's iterator is not advanced)
+      IteratorB iterator_B,
+      ///< initial value of accumulator
+      FragmentC const &src_accum) {
+
+    // Prologue (start fetching iterations of global fragments into shared memory); decrements gemm_k_iterations
+    prologue(iterator_A, iterator_B, gemm_k_iterations);
+
+    // Wait until we have at least one completed global fetch stage
+    gmem_wait();
+
+    // Initialize destination accumulators with source accumulators
+    accum = src_accum;
+
+    // Perform the MAC-iterations
+    gemm_iters(gemm_k_iterations, accum, iterator_A, iterator_B);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_pipelined.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_pipelined.h
new file mode 100755
index 000000000..89681ebce
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_pipelined.h
@@ -0,0 +1,439 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/numeric_conversion.h"
+
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/mma_base.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Threadblock-scoped matrix multiply-accumulate mainloop using a two-stage (double-buffered) shared memory pipeline.
+template <
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape_,
+  /// Iterates over tiles of A operand in global memory
+  /// (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
+  typename IteratorA_,
+  /// Iterates over tiles of A operand in shared memory
+  /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+  typename SmemIteratorA_,
+  /// Iterates over tiles of B operand in global memory
+  /// (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
+  typename IteratorB_,
+  /// Iterates over tiles of B operand in shared memory
+  /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+  typename SmemIteratorB_,
+  /// Data type of accumulator matrix
+  typename ElementC_,
+  /// Layout of accumulator matrix
+  typename LayoutC_,
+  /// Policy describing tuning details (concept: MmaPolicy)
+  typename Policy_,
+  /// Transformation applied to A operand
+  typename TransformA_ = NumericArrayConverter<
+    typename SmemIteratorA_::Element, 
+    typename IteratorA_::Element, 
+    IteratorA_::Fragment::kElements>,
+  ///
+  /// Transformation applied to B operand
+  typename TransformB_ = NumericArrayConverter<
+    typename SmemIteratorB_::Element, 
+    typename IteratorB_::Element, 
+    IteratorB_::Fragment::kElements>,
+  /// Used for partial specialization
+  typename Enable = bool
+>
+class MmaPipelined : public MmaBase<Shape_, Policy_, 2> {
+public:
+
+  /// Base class: MmaBase instantiated with two stages
+  using Base = MmaBase<Shape_, Policy_, 2>;
+
+  using Shape = Shape_;             ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using IteratorA = IteratorA_;     ///< Iterates over tiles of A operand in global memory
+  using IteratorB = IteratorB_;     ///< Iterates over tiles of B operand in global memory
+  using ElementC = ElementC_;       ///< Data type of accumulator matrix
+  using LayoutC = LayoutC_;         ///< Layout of accumulator matrix
+  using Policy = Policy_;           ///< Policy describing tuning details
+
+  using SmemIteratorA = SmemIteratorA_;
+  using SmemIteratorB = SmemIteratorB_;
+
+  using TransformA = TransformA_;
+  using TransformB = TransformB_;
+
+  //
+  // Dependent types
+  //
+
+  /// Fragment of operand A loaded from global memory
+  using FragmentA = typename IteratorA::Fragment;
+
+  /// Fragment of operand B loaded from global memory
+  using FragmentB = typename IteratorB::Fragment;
+
+  /// Fragment of accumulator tile
+  using FragmentC = typename Policy::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+
+  /// Obtain the arch tag from the warp-level operator
+  using ArchTag = typename Policy::Operator::ArchTag;
+
+  /// Complex transform on A operand
+  static ComplexTransform const kTransformA = Operator::kTransformA;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB = Operator::kTransformB;
+
+  // Statically assert that kStages for MmaPipelined is two (double-buffered pipeline)
+  static_assert((Base::kStages==2), "MmaPipelined requires kStages set to value 2");
+
+protected:
+
+  //
+  // Data members
+  //
+
+  /// Warp-level MMA operator
+  Operator warp_mma;
+
+  /// Iterator to write threadblock-scoped tile of A operand to shared memory
+  SmemIteratorA smem_iterator_A_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB smem_iterator_B_;
+
+  ///< transformation applied to A fragment before it is stored to shared memory
+  TransformA transform_A_;
+
+  ///< transformation applied to B fragment before it is stored to shared memory
+  TransformB transform_B_;
+
+  /// Shared memory write stage index (starts at 0 and toggles between 0 and 1)
+  int smem_write_stage_idx;
+
+public:
+
+  /// Construct from tensor references and position this warp's shared memory read iterators
+  CUTLASS_DEVICE
+  MmaPipelined(
+    typename Base::SharedStorage &shared_storage,       ///< Shared storage needed for internal use by threadblock-scoped GEMM
+    int thread_idx,                                     ///< ID within the threadblock
+    int warp_idx,                                       ///< ID of warp
+    int lane_idx,                                       ///< ID of each thread within a warp
+    TransformA transform_A = TransformA(),              ///< transformation applied to A fragment
+    TransformB transform_B = TransformB()               ///< transformation applied to B fragment
+  ):
+    Base(shared_storage, thread_idx, warp_idx, lane_idx),
+    smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
+    smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx),
+    transform_A_(transform_A),
+    transform_B_(transform_B),
+    smem_write_stage_idx(0)
+  {
+
+    // Compute warp location within threadblock tile by mapping the warp_id to
+    // three coordinates:
+    //   _m: the warp's position within the threadblock along the M dimension
+    //   _n: the warp's position within the threadblock along the N dimension
+    //   _k: the warp's position within the threadblock along the K dimension
+
+    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
+    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
+
+    int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
+    int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
+
+    // Add per-warp offsets in units of warp-level tiles
+    this->warp_tile_iterator_A_.add_tile_offset({warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
+    this->warp_tile_iterator_B_.add_tile_offset({Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
+  }
+
+
+  /// Advance shared memory write-iterators to the next stage (read iterators are untouched)
+  CUTLASS_DEVICE
+  void advance_smem_write_stage()
+  {
+    ++this->smem_iterator_A_;
+    ++this->smem_iterator_B_;
+
+    // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
+    if (smem_write_stage_idx == 1) {
+      this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
+      this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
+    }
+
+    smem_write_stage_idx ^= 1;
+  }
+
+  /// Advance shared memory read- and write-iterators to the next stage
+  CUTLASS_DEVICE
+  void advance_smem_stages()
+  {
+    ++this->smem_iterator_A_;
+    ++this->smem_iterator_B_;
+
+    // Exactly one side wraps per call, selected by the current write stage index
+    if (smem_write_stage_idx == 1) {
+      // wrap write stage: move the write iterators back by kStages tiles
+      this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
+      this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
+    }
+    else
+    {
+      // wrap read stage: move the warp tile iterators back by kStages stages worth of K tiles
+      this->warp_tile_iterator_A_.add_tile_offset(
+        {0, -Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations});
+      this->warp_tile_iterator_B_.add_tile_offset(
+        {-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations, 0});
+    }
+
+    smem_write_stage_idx ^= 1;
+  }
+
+
+  /// GEMM prologue.  Bootstrap the global->shared memory pipeline by loading one A and one B
+  /// fragment from global memory and storing them (transformed) into the first shared stage
+  CUTLASS_DEVICE
+  void prologue(
+    IteratorA &iterator_A,      ///< [in|out] iterator over A operand in global memory
+    IteratorB &iterator_B,      ///< [in|out] iterator over B operand in global memory
+    int &gemm_k_iterations)     ///< number of mainloop iterations remaining (not modified here)
+  {
+    // The last kblock is loaded in the prolog
+
+    // Load A fragment from global A
+    FragmentA tb_frag_A;
+    tb_frag_A.clear();
+    iterator_A.load(tb_frag_A);
+    ++iterator_A;
+
+    // Load B fragment from global B
+    FragmentB tb_frag_B;
+    tb_frag_B.clear();
+    iterator_B.load(tb_frag_B);
+    ++iterator_B;
+
+    // Store A and B fragments to shared
+    this->smem_iterator_A_.store(transform_A_(tb_frag_A));
+    this->smem_iterator_B_.store(transform_B_(tb_frag_B));
+
+    // Advance write stage
+    advance_smem_write_stage();
+  }
+
+  /// Wait until we have at least one completed global fetch stage (threadblock-wide barrier)
+  CUTLASS_DEVICE
+  void gmem_wait()
+  {
+    __syncthreads();
+  }
+
+
+  /// Perform the specified number of threadblock mainloop iterations of matrix
+  /// multiply-accumulate.  Assumes prologue has been initiated.
+  CUTLASS_DEVICE
+  void gemm_iters(
+    int gemm_k_iterations,        ///< number of threadblock mainloop iterations
+    FragmentC &accum,             ///< [in|out] accumulator tile
+    IteratorA &iterator_A,        ///< [in|out] iterator over A operand in global memory
+    IteratorB &iterator_B)        ///< [in|out] iterator over B operand in global memory
+  {
+    using WarpFragmentA = typename Operator::FragmentA;
+    using WarpFragmentB = typename Operator::FragmentB;
+
+    // Pair of fragments used to overlap shared memory loads and math instructions
+    WarpFragmentA warp_frag_A[2];
+    WarpFragmentB warp_frag_B[2];
+
+    // Load A fragment from shared A
+    this->warp_tile_iterator_A_.set_kgroup_index(0);
+    this->warp_tile_iterator_A_.load(warp_frag_A[0]);
+    ++this->warp_tile_iterator_A_;
+
+    // Load B fragment from shared B
+    this->warp_tile_iterator_B_.set_kgroup_index(0);
+    this->warp_tile_iterator_B_.load(warp_frag_B[0]);
+    ++this->warp_tile_iterator_B_;
+
+    // Threadblock-scoped fragments that carry the next k-block from the global load to the shared store
+    FragmentA tb_frag_A;
+    FragmentB tb_frag_B;
+
+    // Avoid reading out of bounds
+    iterator_A.clear_mask(gemm_k_iterations <= 1);
+    iterator_B.clear_mask(gemm_k_iterations <= 1);
+
+    // Mainloop. Per iteration, the next global fragments are loaded at warp_mma_k == 0 and stored
+    // to shared memory at the last warp_mma_k. NOTE(review): with kWarpGemmIterations == 1 the
+    // store would run before any load into tb_frag_A/B in this function - confirm that is excluded.
+
+    // Note: The main loop does not support Base::kWarpGemmIterations == 2.
+    CUTLASS_GEMM_LOOP
+    for (; gemm_k_iterations > 0; --gemm_k_iterations) {
+      //
+      // Loop over GEMM K dimension
+      //
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) {
+
+        // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group
+        // as the case may be.
+
+        if (warp_mma_k == Base::kWarpGemmIterations - 1) {
+
+          // Write fragments to shared memory
+          this->smem_iterator_A_.store(transform_A_(tb_frag_A));
+
+          this->smem_iterator_B_.store(transform_B_(tb_frag_B));
+
+          // Wait until we have at least one completed global fetch stage
+          gmem_wait();
+
+          // Advance smem read and write stages
+          advance_smem_stages();
+        }
+
+        this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
+        this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
+
+        this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k + 1) % 2]);
+        this->warp_tile_iterator_B_.load(warp_frag_B[(warp_mma_k + 1) % 2]);
+
+        ++this->warp_tile_iterator_A_;
+        ++this->warp_tile_iterator_B_;
+
+        if (warp_mma_k == 0) {
+
+          // Load fragment from global A
+          tb_frag_A.clear();
+          iterator_A.load(tb_frag_A);
+          ++iterator_A;
+
+          // Load fragment from global B
+          tb_frag_B.clear();
+          iterator_B.load(tb_frag_B);
+          ++iterator_B;
+
+          // Avoid reading out of bounds if this was the last loop iteration
+          iterator_A.clear_mask(gemm_k_iterations <= 2);
+          iterator_B.clear_mask(gemm_k_iterations <= 2);
+        }
+
+        warp_mma(
+          accum,
+          warp_frag_A[warp_mma_k % 2],
+          warp_frag_B[warp_mma_k % 2],
+          accum);
+      }
+    }
+
+  }
+
+
+  /// Prepares the class for another prologue.
+  CUTLASS_DEVICE
+  void wind_down()
+  {
+    // First, increment remaining warp tiles to catch it up with the write stage.
+    CUTLASS_PRAGMA_UNROLL
+    for (int warp_mma_k = 1; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k)
+    {
+      this->warp_tile_iterator_A_.set_kgroup_index(warp_mma_k);
+      this->warp_tile_iterator_B_.set_kgroup_index(warp_mma_k);
+
+      ++this->warp_tile_iterator_A_;
+      ++this->warp_tile_iterator_B_;
+    }
+
+    // If we bumped the read iterators to the end of the circular buffer, wrap them around to
+    // align them with the write iterators
+    if (smem_write_stage_idx == 0)
+    {
+      this->warp_tile_iterator_A_.add_tile_offset(
+        {0, -Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations});
+      this->warp_tile_iterator_B_.add_tile_offset(
+        {-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations, 0});
+    }
+  }
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate
+  CUTLASS_DEVICE
+  void operator()(
+    int gemm_k_iterations,                            ///< number of iterations of the mainloop
+    FragmentC &accum,                                 ///< destination accumulator tile
+    IteratorA iterator_A,                             ///< iterator over A operand in global memory
+    IteratorB iterator_B,                             ///< iterator over B operand in global memory
+    FragmentC const &src_accum)                       ///< source accumulator tile
+  {
+    // Prologue
+    prologue(iterator_A, iterator_B, gemm_k_iterations);
+
+    // Wait until we have at least one completed global fetch stage
+    gmem_wait();
+
+    // Perform accumulation in the 'd' output operand
+    accum = src_accum;
+
+    // Perform the MAC-iterations
+    gemm_iters(gemm_k_iterations, accum, iterator_A, iterator_B);
+  }
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_planar_complex_base.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_planar_complex_base.h
new file mode 100755
index 000000000..e8616cc90
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_planar_complex_base.h
@@ -0,0 +1,208 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Base class for threadblock-scoped planar complex GEMM: shared storage and warp tile iterators.
+*/
+
+#pragma once
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Base class for planar complex threadblock-scoped GEMMs: defines the shared memory storage and
+/// holds the warp-level tile iterators that read operands A and B from it.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// Number of stages,
+    int Stages,
+    /// Used for partial specialization
+    typename Enable = bool>
+class MmaPlanarComplexBase {
+ public:
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+
+  ///< Policy describing tuning details
+  using Policy = Policy_;
+
+  //
+  // Dependent types
+  //
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+
+  /// Shape describing the overall GEMM computed from shared memory
+  /// by each warp.
+  using WarpGemm = typename Policy::Operator::Shape;
+
+  /// Shape describing the number of warps filling the CTA
+  using WarpCount = GemmShape<Shape::kM / WarpGemm::kM,
+                              Shape::kN / WarpGemm::kN,
+                              Shape::kK / WarpGemm::kK>;
+
+  /// Number of warp-level GEMM operations
+  static int const kWarpGemmIterations =
+      (WarpGemm::kK / Operator::Policy::MmaShape::kK);
+
+  /// Number of stages
+  static int const kStages = Stages;
+
+  /// Tensor reference to the A operand
+  using TensorRefA = TensorRef<typename Operator::ElementA, typename Operator::LayoutA>;
+
+  /// Tensor reference to the B operand
+  using TensorRefB = TensorRef<typename Operator::ElementB, typename Operator::LayoutB>;
+
+  //
+  // Nested structs
+  //
+
+  /// Shared storage object needed by threadblock-scoped GEMM
+  class SharedStorage {
+   public:
+    //
+    // Type definitions
+    //
+
+    /// Shape of the A matrix operand in shared memory (kStages tiles along K, plus padding)
+    using ShapeA = MatrixShape<Shape::kM + Policy::SmemPaddingA::kRow,
+                               Shape::kK * kStages +
+                                   Policy::SmemPaddingA::kColumn>;
+
+    /// Stride to the imaginary part of the A operand
+    static int const kImaginaryStrideA = ShapeA::kCount;
+
+    /// Shape of the B matrix operand in shared memory (kStages tiles along K, plus padding)
+    using ShapeB =
+        MatrixShape<Shape::kK * kStages + Policy::SmemPaddingB::kRow,
+                    Shape::kN + Policy::SmemPaddingB::kColumn>;
+
+    /// Stride to the imaginary part of the B operand
+    static int const kImaginaryStrideB = ShapeB::kCount;
+
+   public:
+    //
+    // Data members
+    //
+
+    /// Buffer for A operand: ShapeA::kCount elements plus kImaginaryStrideA more (presumably real, then imaginary)
+    AlignedBuffer<typename Operator::ElementA, ShapeA::kCount + kImaginaryStrideA> operand_A;
+
+    /// Buffer for B operand: ShapeB::kCount elements plus kImaginaryStrideB more (presumably real, then imaginary)
+    AlignedBuffer<typename Operator::ElementB, ShapeB::kCount + kImaginaryStrideB> operand_B;
+
+   public:
+
+    //
+    // Methods
+    //
+
+    /// Returns a layout object for the A matrix (host-device like LayoutB, since operand_A_ref() calls it)
+    CUTLASS_HOST_DEVICE
+    static typename Operator::LayoutA LayoutA() {
+      return Operator::LayoutA::packed({ShapeA::kRow, ShapeA::kColumn});
+    }
+
+    /// Returns a layout object for the B matrix
+    CUTLASS_HOST_DEVICE
+    static typename Operator::LayoutB LayoutB() {
+      return Operator::LayoutB::packed({ShapeB::kRow, ShapeB::kColumn});
+    }
+
+    /// Returns a TensorRef to the start of the A operand buffer
+    CUTLASS_HOST_DEVICE
+    TensorRefA operand_A_ref() {
+      return TensorRefA{operand_A.data(), LayoutA()};
+    }
+
+    /// Returns a TensorRef to the start of the B operand buffer
+    CUTLASS_HOST_DEVICE
+    TensorRefB operand_B_ref() {
+      return TensorRefB{operand_B.data(), LayoutB()};
+    }
+  };
+
+ protected:
+
+  //
+  // Data members
+  //
+
+  /// Iterator to load a warp-scoped tile of A operand from shared memory
+  typename Operator::IteratorA warp_tile_iterator_A_;
+
+  /// Iterator to load a warp-scoped tile of B operand from shared memory
+  typename Operator::IteratorB warp_tile_iterator_B_;
+
+public:
+
+  /// Construct from tensor references; only shared_storage and lane_idx are used here
+  CUTLASS_DEVICE
+  MmaPlanarComplexBase(
+      ///< Shared storage needed for internal use by threadblock-scoped GEMM
+      SharedStorage &shared_storage,
+      ///< ID within the threadblock (unused by this base class)
+      int thread_idx,
+      ///< ID of warp (unused by this base class)
+      int warp_idx,
+      ///< ID of each thread within a warp
+      int lane_idx
+    ):
+      warp_tile_iterator_A_(shared_storage.operand_A_ref(), lane_idx),
+      warp_tile_iterator_B_(shared_storage.operand_B_ref(), lane_idx) {
+
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_planar_complex_multistage.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_planar_complex_multistage.h
new file mode 100755
index 000000000..b9deb6320
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_planar_complex_multistage.h
@@ -0,0 +1,646 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a multistage threadblock-scoped planar complex GEMM kernel.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/array.h"
+#include "cutlass/array_planar_complex.h"
+#include "cutlass/functional.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/mma_planar_complex_base.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the matrix product targeting CUDA cores and SIMT math
+/// instructions.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    /// Iterates over tiles of A operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorA_,
+    /// Iterates over tiles of A operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorA_,
+    /// Cache operation for operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB_,
+    /// Cache operation for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB,
+    /// Data type of accumulator matrix
+    typename ElementC_,
+    /// Layout of accumulator matrix
+    typename LayoutC_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// Number of stages,
+    int Stages,
+    /// Transformation applied to A
+    ComplexTransform TransformA = ComplexTransform::kNone,
+    /// Transformation applied to B
+    ComplexTransform TransformB = ComplexTransform::kNone
+>
+class MmaPlanarComplexMultistage : 
+  public MmaPlanarComplexBase<Shape_, Policy_, Stages> {
+public:
+  ///< Base class
+  using Base = MmaPlanarComplexBase<Shape_, Policy_, Stages>;
+
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+
+  ///< Iterates over tiles of A operand in global memory
+  using IteratorA = IteratorA_;
+
+  ///< Iterates over tiles of B operand in global memory
+  using IteratorB = IteratorB_;
+
+  ///< Data type of accumulator matrix
+  using ElementC = ElementC_;
+
+  ///< Layout of accumulator matrix
+  using LayoutC = LayoutC_;
+
+  ///< Policy describing tuning details
+  using Policy = Policy_;
+
+  ///< Architecture tag
+  using ArchTag = arch::Sm80;
+
+  using SmemIteratorA = SmemIteratorA_;
+  using SmemIteratorB = SmemIteratorB_;
+
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
+
+  /// Transformation applied to A
+  static ComplexTransform const kTransformA = TransformA;
+
+  /// Transformation applied to B
+  static ComplexTransform const kTransformB = TransformB;
+
+  //
+  // Dependent types
+  //
+
+  /// Fragment of accumulator tile
+  using FragmentC = ArrayPlanarComplex<
+    typename Policy::Operator::FragmentC::Element,
+    Policy::Operator::FragmentC::kElements
+  >;
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+
+  /// Internal structure exposed for introspection: compile-time load counts for the pipeline.
+  struct Detail {
+
+    static_assert(Base::kWarpGemmIterations > 1,
+                  "The pipelined structure requires at least two warp-level "
+                  "GEMM operations.");
+
+    /// Per-thread access iterations (IteratorA::ThreadMap::Iterations::kCount) for one stage of operand A
+    static int const TBLoadIterationsA =
+        IteratorA::ThreadMap::Iterations::kCount;
+
+    /// Per-thread access iterations (IteratorB::ThreadMap::Iterations::kCount) for one stage of operand B
+    static int const TBLoadIterationsB =
+        IteratorB::ThreadMap::Iterations::kCount;
+
+    /// Number of stages
+    static int const kStages = Stages;
+    /// Loop bound for one group of A accesses: ceil(TBLoadIterationsA / kWarpGemmIterations)
+    static int const kAccessesPerGroupA =
+        (TBLoadIterationsA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
+    /// B-operand counterpart of kAccessesPerGroupA: ceil(TBLoadIterationsB / kWarpGemmIterations)
+    static int const kAccessesPerGroupB =
+        (TBLoadIterationsB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
+  };
+
+ private:
+
+  using WarpFragmentA = typename Operator::FragmentA;
+  using WarpFragmentB = typename Operator::FragmentB;
+
+ private:
+
+  //
+  // Data members
+  //
+
+  /// Iterator to write threadblock-scoped tile of A operand to shared memory
+  SmemIteratorA smem_iterator_A_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB smem_iterator_B_;
+
+public:
+
+  /// Build the mainloop object and move this warp's shared memory read iterators to its tile
+  CUTLASS_DEVICE
+  MmaPlanarComplexMultistage(
+      ///< Shared storage needed for internal use by threadblock-scoped GEMM
+      typename Base::SharedStorage &shared_storage,
+      ///< Thread index within the threadblock
+      int thread_idx,
+      ///< Warp index within the threadblock
+      int warp_idx,
+      ///< Lane index of the thread within its warp
+      int lane_idx
+    ):
+      Base(shared_storage, thread_idx, warp_idx, lane_idx),
+      smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
+      smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx)
+  {
+    // Decompose the linear warp index: M varies fastest, then N, and each run of
+    // (WarpCount::kM * WarpCount::kN) consecutive warps shares one position along K.
+    int const warps_per_k_partition = Base::WarpCount::kM * Base::WarpCount::kN;
+
+    int const k_partition = warp_idx / warps_per_k_partition;
+    int const idx_in_partition = warp_idx % warps_per_k_partition;
+
+    int const tile_m = idx_in_partition % Base::WarpCount::kM;
+    int const tile_n = idx_in_partition / Base::WarpCount::kM;
+
+    // K offset of this warp, in units of warp-level tiles
+    int const tile_k = Base::kWarpGemmIterations * k_partition;
+
+    this->warp_tile_iterator_A_.add_tile_offset({tile_m, tile_k});
+    this->warp_tile_iterator_B_.add_tile_offset({tile_k, tile_n});
+  }
+
+private:
+
+  CUTLASS_DEVICE
+  void copy_tiles_and_advance(
+    IteratorA &iterator_A_real,
+    IteratorA &iterator_A_imag,
+    // ^ global-memory iterators for the real / imaginary parts of A (advanced in place)
+    IteratorB &iterator_B_real, 
+    IteratorB &iterator_B_imag, 
+    // ^ global-memory iterators for the real / imaginary parts of B (advanced in place)
+    int group_start_A = 0, 
+    int group_start_B = 0) {
+    // Issues one group of cp.async copies (real + imaginary) for A, then for B, from the given access indices.
+    iterator_A_real.set_iteration_index(group_start_A * IteratorA::kAccessesPerVector);
+    iterator_A_imag.set_iteration_index(group_start_A * IteratorA::kAccessesPerVector);
+    this->smem_iterator_A_.set_iteration_index(group_start_A);
+
+    // Load for operand A
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) {
+      // NOTE(review): group_start_A + j is not bounded by TBLoadIterationsA here - confirm intended.
+      typename IteratorA::AccessType *dst_ptr = 
+        reinterpret_cast<typename IteratorA::AccessType *>(this->smem_iterator_A_.get());
+      // Bytes moved by each cp.async: one access split into kAccessesPerVector pieces
+      int const kSrcBytes = 
+        sizeof_bits<typename IteratorA::Element>::value * 
+        IteratorA::ThreadMap::kElementsPerAccess / IteratorA::kAccessesPerVector / 8;
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
+
+        auto gmem_ptr_real = iterator_A_real.get();
+        auto gmem_ptr_imag = iterator_A_imag.get();
+        // The real iterator's predicate guards both the real and the imaginary copy
+        bool pred_guard = iterator_A_real.valid();
+        cutlass::arch::cp_async<kSrcBytes, kCacheOpA>(
+            dst_ptr + v,
+            gmem_ptr_real,
+            pred_guard);
+        cutlass::arch::cp_async<kSrcBytes, kCacheOpA>(
+            dst_ptr + v + (Base::SharedStorage::kImaginaryStrideA / IteratorA::ThreadMap::kElementsPerAccess),
+            reinterpret_cast<char const *>(gmem_ptr_imag),
+            pred_guard);
+        // Advance both global iterators together so they stay on the same access
+        ++iterator_A_real;
+        ++iterator_A_imag;
+      }
+
+      ++this->smem_iterator_A_;
+    }
+
+    iterator_B_real.set_iteration_index(group_start_B * IteratorB::kAccessesPerVector);
+    iterator_B_imag.set_iteration_index(group_start_B * IteratorB::kAccessesPerVector);
+    this->smem_iterator_B_.set_iteration_index(group_start_B);
+
+    // Load for operand B
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) {
+      typename IteratorB::AccessType *dst_ptr = 
+        reinterpret_cast<typename IteratorB::AccessType *>(this->smem_iterator_B_.get());
+      // Bytes moved by each cp.async: one access split into kAccessesPerVector pieces
+      int const kSrcBytes = 
+        sizeof_bits<typename IteratorB::Element>::value * 
+        IteratorB::ThreadMap::kElementsPerAccess / IteratorB::kAccessesPerVector / 8;
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
+        auto gmem_ptr_real = iterator_B_real.get();
+        auto gmem_ptr_imag = iterator_B_imag.get();
+        // The real iterator's predicate guards both the real and the imaginary copy
+        bool pred_guard = iterator_B_real.valid();
+        cutlass::arch::cp_async<kSrcBytes, kCacheOpB>(
+            dst_ptr + v,
+            gmem_ptr_real,
+            pred_guard);
+        cutlass::arch::cp_async<kSrcBytes, kCacheOpB>(
+            dst_ptr + v + (Base::SharedStorage::kImaginaryStrideB / IteratorB::ThreadMap::kElementsPerAccess),
+            reinterpret_cast<char const *>(gmem_ptr_imag),
+            pred_guard);
+        // Advance both global iterators together so they stay on the same access
+        ++iterator_B_real;
+        ++iterator_B_imag;
+      }
+      ++this->smem_iterator_B_;
+    }
+  }
+
+  /// Accumulates one planar-complex warp-level product into accum using four
+  /// real-valued warp MMAs. When kTransformA / kTransformB is not kNone, the
+  /// sign change on the corresponding imaginary part is applied by passing a
+  /// negated B fragment to the affected MMAs.
+  CUTLASS_DEVICE
+  void warp_mma_planar_complex(
+    Operator & warp_mma,
+    FragmentC &accum,
+    WarpFragmentA const & real_A,
+    WarpFragmentA const & imag_A,
+    WarpFragmentB const & real_B,
+    WarpFragmentB const & imag_B) {
+
+    using NegateB = cutlass::negate<Array<typename WarpFragmentB::Element, WarpFragmentB::kElements>>;
+    NegateB negate_B;
+
+    // Negated copies of both B fragments; every sign flip is carried by B
+    WarpFragmentB const minus_real_B = negate_B(real_B);
+    WarpFragmentB const minus_imag_B = negate_B(imag_B);
+
+    // Compile-time selectors: true when the operand is used untransformed
+    bool const plain_A = (kTransformA == ComplexTransform::kNone);
+    bool const plain_B = (kTransformB == ComplexTransform::kNone);
+
+    // real += real_A * real_B
+    warp_mma(accum.real, real_A, real_B, accum.real);
+
+    // imag += real_A * imag_B, with imag_B negated when B is transformed
+    warp_mma(accum.imag, real_A, plain_B ? imag_B : minus_imag_B, accum.imag);
+
+    // imag += imag_A * real_B, with real_B negated when A is transformed
+    warp_mma(accum.imag, imag_A, plain_A ? real_B : minus_real_B, accum.imag);
+
+    // real += imag_A * imag_B only when exactly one operand is transformed;
+    // otherwise the product is subtracted (negated imag_B)
+    warp_mma(accum.real, imag_A, (plain_A != plain_B) ? imag_B : minus_imag_B, accum.real);
+  }
+
+public:
+  
+  /// Perform a threadblock-scoped matrix multiply-accumulate
+  CUTLASS_DEVICE
+  void operator()(
+      ///< number of iterations along the GEMM K dimension (decremented once per stage)
+      int gemm_k_iterations,
+      ///< destination accumulator tile
+      FragmentC &accum,
+      ///< iterator over the real part of the A operand in global memory
+      IteratorA iterator_A_real,
+      ///< iterator over the imaginary part of the A operand in global memory
+      IteratorA iterator_A_imag,
+      ///< iterator over the real part of the B operand in global memory
+      IteratorB iterator_B_real,
+      ///< iterator over the imaginary part of the B operand in global memory
+      IteratorB iterator_B_imag,
+      ///< initial value of accumulator
+      FragmentC const &src_accum) {
+
+    //
+    // Prologue
+    //
+
+    // Issue Base::kStages - 1 complete stages, each copied with cp_async_zfill
+    CUTLASS_PRAGMA_UNROLL
+    for (int stage = 0; stage < Base::kStages - 1;
+         ++stage, --gemm_k_iterations) {
+
+      iterator_A_real.clear_mask(gemm_k_iterations == 0);
+      iterator_A_imag.clear_mask(gemm_k_iterations == 0);
+      iterator_B_real.clear_mask(gemm_k_iterations == 0);
+      iterator_B_imag.clear_mask(gemm_k_iterations == 0);
+
+      iterator_A_real.set_iteration_index(0);
+      iterator_A_imag.set_iteration_index(0);
+
+      this->smem_iterator_A_.set_iteration_index(0);
+
+      // Load for operand A
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::TBLoadIterationsA; ++j) {
+
+        typename IteratorA::AccessType *dst_ptr = 
+          reinterpret_cast<typename IteratorA::AccessType *>(this->smem_iterator_A_.get());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
+
+          int const kSrcBytes = 
+            sizeof_bits<typename IteratorA::Element>::value * 
+            IteratorA::ThreadMap::kElementsPerAccess / IteratorA::kAccessesPerVector / 8;
+
+          bool pred_guard = iterator_A_real.valid();
+
+          auto src_ptr_real = iterator_A_real.get();
+          auto src_ptr_imag = iterator_A_imag.get();
+          // Real copy, then imaginary copy at an offset derived from kImaginaryStrideA; both share pred_guard
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
+              dst_ptr + v, src_ptr_real, pred_guard);
+
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
+              dst_ptr + v +
+                  Base::SharedStorage::kImaginaryStrideA /
+                      IteratorA::ThreadMap::kElementsPerAccess,
+              reinterpret_cast<char const *>(src_ptr_imag),
+              pred_guard);
+
+          ++iterator_A_real;
+          ++iterator_A_imag;
+        }
+
+        ++this->smem_iterator_A_;
+      }
+
+      iterator_B_real.set_iteration_index(0);
+      iterator_B_imag.set_iteration_index(0);
+
+      this->smem_iterator_B_.set_iteration_index(0);
+
+      // Load for operand B
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::TBLoadIterationsB; ++j) {
+
+        typename IteratorB::AccessType *dst_ptr = 
+          reinterpret_cast<typename IteratorB::AccessType *>(this->smem_iterator_B_.get());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
+
+          int const kSrcBytes = 
+            sizeof_bits<typename IteratorB::Element>::value * 
+            IteratorB::ThreadMap::kElementsPerAccess / IteratorB::kAccessesPerVector / 8;
+
+          bool pred_guard = iterator_B_real.valid();
+
+          auto src_ptr_real = iterator_B_real.get();
+          auto src_ptr_imag = iterator_B_imag.get();
+          // Real copy, then imaginary copy at an offset derived from kImaginaryStrideB; both share pred_guard
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
+            dst_ptr + v, src_ptr_real, pred_guard);
+
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
+              dst_ptr + v +
+                  Base::SharedStorage::kImaginaryStrideB /
+                      IteratorB::ThreadMap::kElementsPerAccess,
+              reinterpret_cast<char const *>(src_ptr_imag),
+              pred_guard);
+
+          ++iterator_B_real;
+          ++iterator_B_imag;
+        }
+
+        ++this->smem_iterator_B_;
+      }
+
+      // Move to the next stage
+      iterator_A_real.add_tile_offset({0, 1});
+      iterator_A_imag.add_tile_offset({0, 1});
+
+      iterator_B_real.add_tile_offset({1, 0});
+      iterator_B_imag.add_tile_offset({1, 0});
+
+      this->smem_iterator_A_.add_tile_offset({0, 1});
+      this->smem_iterator_B_.add_tile_offset({1, 0});
+
+      // Inserts a memory fence between stages of cp.async instructions
+      cutlass::arch::cp_async_fence();
+    }
+
+    // Perform accumulation in the 'd' output operand
+    accum = src_accum;
+
+    // Blocks until all but kStages-2 cp.async stages have committed.
+    cutlass::arch::cp_async_wait<Base::kStages - 2>();
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math
+    // instructions
+
+    WarpFragmentA warp_frag_real_A[2];
+    WarpFragmentA warp_frag_imag_A[2];
+
+    WarpFragmentB warp_frag_real_B[2];
+    WarpFragmentB warp_frag_imag_B[2];
+    // Prime slot 0 of each fragment pair from shared memory before entering the mainloop
+    this->warp_tile_iterator_A_.set_kgroup_index(0);
+    this->warp_tile_iterator_B_.set_kgroup_index(0);
+
+    this->warp_tile_iterator_A_.load(warp_frag_real_A[0]);
+    this->warp_tile_iterator_A_.load_with_pointer_offset(warp_frag_imag_A[0], Base::SharedStorage::kImaginaryStrideA);
+
+    this->warp_tile_iterator_B_.load(warp_frag_real_B[0]);
+    this->warp_tile_iterator_B_.load_with_pointer_offset(warp_frag_imag_B[0], Base::SharedStorage::kImaginaryStrideB);
+
+    ++this->warp_tile_iterator_A_;
+    ++this->warp_tile_iterator_B_;
+
+    iterator_A_real.clear_mask(gemm_k_iterations == 0);
+    iterator_A_imag.clear_mask(gemm_k_iterations == 0);
+    iterator_B_real.clear_mask(gemm_k_iterations == 0);
+    iterator_B_imag.clear_mask(gemm_k_iterations == 0);
+
+    // Start issuing the first group of the next stage outside of the mainloop
+    copy_tiles_and_advance(iterator_A_real, iterator_A_imag, iterator_B_real, iterator_B_imag);
+
+    Operator warp_mma;
+    // Stage indices used to detect wrap-around of the circular shared-memory buffer
+    int smem_write_stage_idx = Base::kStages - 1;
+    int smem_read_stage_idx = 0;
+
+    //
+    // Mainloop
+    //
+
+    CUTLASS_GEMM_LOOP
+    for (; gemm_k_iterations > (-Base::kStages + 1);) {
+      //
+      // Loop over GEMM K dimension; continues kStages - 1 iterations past zero (presumably to drain stages in flight)
+      //
+
+      // Computes a warp-level GEMM on data held in shared memory
+      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations;
+           ++warp_mma_k) {
+
+        // Load warp-level tiles from shared memory, wrapping to k offset if
+        // this is the last group as the case may be.
+
+        this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
+        this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
+        
+        this->warp_tile_iterator_A_.load(warp_frag_real_A[(warp_mma_k + 1) % 2]);
+        this->warp_tile_iterator_A_.load_with_pointer_offset(warp_frag_imag_A[(warp_mma_k + 1) % 2], Base::SharedStorage::kImaginaryStrideA);
+        
+        this->warp_tile_iterator_B_.load(warp_frag_real_B[(warp_mma_k + 1) % 2]);
+        this->warp_tile_iterator_B_.load_with_pointer_offset(warp_frag_imag_B[(warp_mma_k + 1) % 2], Base::SharedStorage::kImaginaryStrideB);
+
+        ++this->warp_tile_iterator_A_;
+        ++this->warp_tile_iterator_B_;
+
+        // Issue global->shared copies for the next stage
+        int group_start_iteration_A, group_start_iteration_B;
+
+        if (warp_mma_k + 1 == Base::kWarpGemmIterations) {
+          group_start_iteration_A = 0;
+          group_start_iteration_B = 0;
+        }
+        else {
+          group_start_iteration_A = (warp_mma_k + 1) * Detail::kAccessesPerGroupA;
+          group_start_iteration_B = (warp_mma_k + 1) * Detail::kAccessesPerGroupB;
+        }
+    
+        copy_tiles_and_advance(
+          iterator_A_real, 
+          iterator_A_imag,
+          iterator_B_real, 
+          iterator_B_imag,
+          group_start_iteration_A, 
+          group_start_iteration_B);
+        // On the second-to-last warp-level iteration, commit this stage and rotate the buffers
+        if (warp_mma_k + 2 == Base::kWarpGemmIterations) {
+          // Inserts a memory fence between stages of cp.async instructions
+          cutlass::arch::cp_async_fence();
+
+          // Blocks until all but kStages-2 cp.async stages have committed.
+          arch::cp_async_wait<Base::kStages - 2>();
+          __syncthreads();
+
+          // Move to the next stage
+          iterator_A_real.add_tile_offset({0, 1});
+          iterator_A_imag.add_tile_offset({0, 1});
+          
+          iterator_B_real.add_tile_offset({1, 0});
+          iterator_B_imag.add_tile_offset({1, 0});
+
+          this->smem_iterator_A_.add_tile_offset({0, 1});
+          this->smem_iterator_B_.add_tile_offset({1, 0});
+
+          // Add negative offsets to return iterators to the 'start' of the
+          // circular buffer in shared memory
+          if (smem_write_stage_idx == (Base::kStages - 1)) {
+            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
+            this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
+            smem_write_stage_idx = 0;
+          } else {
+            ++smem_write_stage_idx;
+          }
+          // Same wrap-around for the warp-level iterators that read shared memory
+          if (smem_read_stage_idx == (Base::kStages - 1)) {
+
+            this->warp_tile_iterator_A_.add_tile_offset(
+                {0, -Base::kStages * Policy::kPartitionsK *
+                        Base::kWarpGemmIterations});
+
+            this->warp_tile_iterator_B_.add_tile_offset(
+                {-Base::kStages * Policy::kPartitionsK *
+                     Base::kWarpGemmIterations,
+                 0});
+            smem_read_stage_idx = 0;
+          } else {
+            ++smem_read_stage_idx;
+          }
+
+          --gemm_k_iterations;
+          iterator_A_real.clear_mask(gemm_k_iterations == 0);
+          iterator_A_imag.clear_mask(gemm_k_iterations == 0);
+          iterator_B_real.clear_mask(gemm_k_iterations == 0);
+          iterator_B_imag.clear_mask(gemm_k_iterations == 0);
+        }
+        // Math on the fragments loaded during the previous warp-level iteration
+        warp_mma_planar_complex(
+          warp_mma, 
+          accum, 
+          warp_frag_real_A[warp_mma_k % 2], 
+          warp_frag_imag_A[warp_mma_k % 2],
+          warp_frag_real_B[warp_mma_k % 2], 
+          warp_frag_imag_B[warp_mma_k % 2]);
+      }
+
+    }
+
+
+    // Commit and drain all pending and predicated cp.async pnz from the GEMM mainloop
+    cutlass::arch::cp_async_fence();
+    cutlass::arch::cp_async_wait<0>();
+    __syncthreads();
+
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_planar_complex_pipelined.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_planar_complex_pipelined.h
new file mode 100755
index 000000000..0e36a6dc6
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_planar_complex_pipelined.h
@@ -0,0 +1,424 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/aligned_buffer.h"
+
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/mma_planar_complex_base.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the matrix product targeting CUDA cores and SIMT math
+/// instructions.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    /// Iterates over tiles of A operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorA_,
+    /// Iterates over tiles of A operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorA_,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB_,
+    /// Data type of accumulator matrix
+    typename ElementC_,
+    /// Layout of accumulator matrix
+    typename LayoutC_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// Number of stages,
+    int Stages,
+    /// Transformation applied to A
+    ComplexTransform TransformA = ComplexTransform::kNone,
+    /// Transformation applied to B
+    ComplexTransform TransformB = ComplexTransform::kNone
+>
+class MmaPlanarComplexPipelined : 
+  public MmaPlanarComplexBase<Shape_, Policy_, Stages> {
+public:
+  ///< Base class
+  using Base = MmaPlanarComplexBase<Shape_, Policy_, Stages>;
+
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+
+  ///< Iterates over tiles of A operand in global memory
+  using IteratorA = IteratorA_;
+
+  ///< Iterates over tiles of B operand in global memory
+  using IteratorB = IteratorB_;
+
+  ///< Data type of accumulator matrix
+  using ElementC = ElementC_;
+
+  ///< Layout of accumulator matrix
+  using LayoutC = LayoutC_;
+
+  ///< Policy describing tuning details
+  using Policy = Policy_;
+
+  using ArchTag = typename Policy::Operator::ArchTag;
+
+  using SmemIteratorA = SmemIteratorA_;
+  using SmemIteratorB = SmemIteratorB_;
+
+  /// Transformation applied to A
+  static ComplexTransform const kTransformA = TransformA;
+
+  /// Transformation applied to B
+  static ComplexTransform const kTransformB = TransformB;
+
+  //
+  // Dependent types
+  //
+
+  /// Fragment of accumulator tile
+  using FragmentC = ArrayPlanarComplex<
+    typename Policy::Operator::FragmentC::Element,
+    Policy::Operator::FragmentC::kElements
+  >;
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+
+ private:
+  // Threadblock-level fragments (global loads) and warp-level fragments (MMA operands)
+  using FragmentA = typename IteratorA::Fragment;
+  using FragmentB = typename IteratorB::Fragment;
+  using WarpFragmentA = typename Operator::FragmentA;
+  using WarpFragmentB = typename Operator::FragmentB;
+
+ private:
+
+  //
+  // Data members
+  //
+
+  /// Iterator to write threadblock-scoped tile of A operand to shared memory
+  SmemIteratorA smem_iterator_A_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB smem_iterator_B_;
+
+public:
+
+  /// Construct from tensor references
+  CUTLASS_DEVICE
+  MmaPlanarComplexPipelined(
+      ///< Shared storage needed for internal use by threadblock-scoped GEMM
+      typename Base::SharedStorage &shared_storage,
+      ///< ID within the threadblock
+      int thread_idx,
+      ///< ID of warp
+      int warp_idx,
+      ///< ID of each thread within a warp
+      int lane_idx
+    ):
+      Base(shared_storage, thread_idx, warp_idx, lane_idx),
+      smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
+      smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx)
+  {
+    // Compute warp location within threadblock tile by mapping the warp_id to
+    // three coordinates:
+    //   _m: the warp's position within the threadblock along the M dimension
+    //   _n: the warp's position within the threadblock along the N dimension
+    //   _k: the warp's position within the threadblock along the K dimension
+
+    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
+    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
+
+    int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
+    int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
+
+    // Add per-warp offsets in units of warp-level tiles
+    this->warp_tile_iterator_A_.add_tile_offset({warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
+    this->warp_tile_iterator_B_.add_tile_offset({Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
+  }
+
+private:
+  // Accumulates one planar-complex warp-level product into accum with four real-valued warp MMAs.
+  CUTLASS_DEVICE
+  void warp_mma_planar_complex(
+    Operator & warp_mma, 
+    FragmentC &accum,
+    WarpFragmentA const & real_A, 
+    WarpFragmentA const & imag_A, 
+    WarpFragmentB const & real_B, 
+    WarpFragmentB const & imag_B) {
+    // Sign changes are applied by handing a negated B fragment to the affected MMAs
+    cutlass::negate<Array<typename WarpFragmentB::Element, WarpFragmentB::kElements>> neg_op_B;
+
+    WarpFragmentB neg_real_B = neg_op_B(real_B);
+    WarpFragmentB neg_imag_B = neg_op_B(imag_B);
+    // real += real_A * real_B
+    warp_mma(accum.real, real_A, real_B, accum.real);  
+    // imag += real_A * imag_B, negated when B is transformed
+    if (kTransformB == ComplexTransform::kNone) {
+      warp_mma(accum.imag, real_A, imag_B, accum.imag);
+    }
+    else {
+      warp_mma(accum.imag, real_A, neg_imag_B, accum.imag);
+    }
+    // imag += imag_A * real_B, negated when A is transformed
+    if (kTransformA == ComplexTransform::kNone) {
+      warp_mma(accum.imag, imag_A, real_B, accum.imag);
+    }
+    else {
+      warp_mma(accum.imag, imag_A, neg_real_B, accum.imag);
+    }
+    // real += imag_A * imag_B when exactly one operand is transformed, otherwise -= (== binds tighter than ^)
+    if (kTransformA == ComplexTransform::kNone ^ kTransformB == ComplexTransform::kNone) {
+      warp_mma(accum.real, imag_A, imag_B, accum.real);
+    }
+    else {
+      warp_mma(accum.real, imag_A, neg_imag_B, accum.real);
+    }
+  }
+
+public:
+  
+  /// Perform a threadblock-scoped matrix multiply-accumulate
+  CUTLASS_DEVICE
+  void operator()(
+      ///< number of mainloop iterations along the GEMM K dimension
+      int gemm_k_iterations,
+      ///< destination accumulator tile
+      FragmentC &accum,
+      ///< iterator over the real part of the A operand in global memory
+      IteratorA iterator_A_real,
+      ///< iterator over the imaginary part of the A operand in global memory
+      IteratorA iterator_A_imag,
+      ///< iterator over the real part of the B operand in global memory
+      IteratorB iterator_B_real,
+      ///< iterator over the imaginary part of the B operand in global memory
+      IteratorB iterator_B_imag,
+      ///< initial value of accumulator
+      FragmentC const &src_accum) {
+
+    //
+    // Prologue
+    //
+
+    // Perform accumulation in the 'd' output operand
+    accum = src_accum;
+    // Register fragments that stage one threadblock tile between global and shared memory
+    FragmentA tb_frag_A_real;
+    FragmentA tb_frag_A_imag;
+
+    FragmentB tb_frag_B_real;
+    FragmentB tb_frag_B_imag;
+
+    tb_frag_A_real.clear();
+    tb_frag_A_imag.clear();
+
+    tb_frag_B_real.clear();
+    tb_frag_B_imag.clear();
+
+    // The last kblock is loaded in the prolog
+    iterator_A_real.load(tb_frag_A_real);
+    iterator_A_imag.load(tb_frag_A_imag);
+
+    iterator_B_real.load(tb_frag_B_real);
+    iterator_B_imag.load(tb_frag_B_imag);
+
+    ++iterator_A_real;
+    ++iterator_A_imag;
+
+    ++iterator_B_real;
+    ++iterator_B_imag;
+    // Real part is stored at the iterator position, imaginary part at a kImaginaryStride pointer offset
+    this->smem_iterator_A_.store(tb_frag_A_real);
+    this->smem_iterator_A_.store_with_pointer_offset(tb_frag_A_imag, Base::SharedStorage::kImaginaryStrideA);
+
+    this->smem_iterator_B_.store(tb_frag_B_real);
+    this->smem_iterator_B_.store_with_pointer_offset(tb_frag_B_imag, Base::SharedStorage::kImaginaryStrideB);
+
+    ++this->smem_iterator_A_;
+    ++this->smem_iterator_B_;
+
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math instructions
+    WarpFragmentA warp_frag_real_A[2];
+    WarpFragmentA warp_frag_imag_A[2];
+
+    WarpFragmentB warp_frag_real_B[2];
+    WarpFragmentB warp_frag_imag_B[2];
+    // Prime slot 0 of each fragment pair from shared memory before entering the mainloop
+    this->warp_tile_iterator_A_.set_kgroup_index(0);
+    this->warp_tile_iterator_B_.set_kgroup_index(0);
+
+    this->warp_tile_iterator_A_.load(warp_frag_real_A[0]);
+    this->warp_tile_iterator_A_.load_with_pointer_offset(warp_frag_imag_A[0], Base::SharedStorage::kImaginaryStrideA);
+
+    this->warp_tile_iterator_B_.load(warp_frag_real_B[0]);
+    this->warp_tile_iterator_B_.load_with_pointer_offset(warp_frag_imag_B[0], Base::SharedStorage::kImaginaryStrideB);
+
+
+    ++this->warp_tile_iterator_A_;
+    ++this->warp_tile_iterator_B_;
+
+    Operator warp_mma;
+    // Toggles between 1 and 0 each time a tile is written to shared memory
+    int smem_write_stage_idx = 1;
+
+    // Avoid reading out of bounds
+    iterator_A_real.clear_mask(gemm_k_iterations <= 1);
+    iterator_A_imag.clear_mask(gemm_k_iterations <= 1);
+    
+    iterator_B_real.clear_mask(gemm_k_iterations <= 1);
+    iterator_B_imag.clear_mask(gemm_k_iterations <= 1);
+
+    // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing 
+    // shared memory loads (which have the tightest latency requirement).
+
+    //
+    // Mainloop
+    //
+
+    // Note: The main loop does not support Base::kWarpGemmIterations == 2.
+    CUTLASS_GEMM_LOOP
+    for (; gemm_k_iterations > 0; --gemm_k_iterations) {
+      //
+      // Loop over GEMM K dimension
+      //
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) {
+
+        // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group
+        // as the case may be.
+
+        if (warp_mma_k == Base::kWarpGemmIterations - 1) {
+          // Last warp-level iteration: publish the tile fetched at warp_mma_k == 0 to shared memory
+          // Write fragments to shared memory
+          this->smem_iterator_A_.store(tb_frag_A_real);
+          this->smem_iterator_A_.store_with_pointer_offset(tb_frag_A_imag, Base::SharedStorage::kImaginaryStrideA);
+
+          this->smem_iterator_B_.store(tb_frag_B_real);
+          this->smem_iterator_B_.store_with_pointer_offset(tb_frag_B_imag, Base::SharedStorage::kImaginaryStrideB);
+
+          __syncthreads();
+          
+          ++this->smem_iterator_B_;
+          ++this->smem_iterator_A_;
+
+          // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
+          if (smem_write_stage_idx == 1) {
+            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
+            this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
+          }
+          else {
+            this->warp_tile_iterator_A_.add_tile_offset(
+                {0, -Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations});
+            this->warp_tile_iterator_B_.add_tile_offset(
+                {-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations,
+                 0});
+          }
+
+          smem_write_stage_idx ^= 1;
+        }
+        // Fetch the fragments for the next warp-level iteration into the other slot of each pair
+        this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
+        this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
+        
+        this->warp_tile_iterator_A_.load(warp_frag_real_A[(warp_mma_k + 1) % 2]);
+        this->warp_tile_iterator_A_.load_with_pointer_offset(warp_frag_imag_A[(warp_mma_k + 1) % 2], Base::SharedStorage::kImaginaryStrideA);
+        
+        this->warp_tile_iterator_B_.load(warp_frag_real_B[(warp_mma_k + 1) % 2]);
+        this->warp_tile_iterator_B_.load_with_pointer_offset(warp_frag_imag_B[(warp_mma_k + 1) % 2], Base::SharedStorage::kImaginaryStrideB);
+
+        ++this->warp_tile_iterator_A_;
+        ++this->warp_tile_iterator_B_;
+
+        if (warp_mma_k == 0) {
+          // Fetch the next global tile into registers; it is stored on the last warp-level iteration
+          iterator_A_real.load(tb_frag_A_real);
+          iterator_A_imag.load(tb_frag_A_imag);
+
+          iterator_B_real.load(tb_frag_B_real);
+          iterator_B_imag.load(tb_frag_B_imag);
+
+          ++iterator_A_real;
+          ++iterator_A_imag;
+          ++iterator_B_real;
+          ++iterator_B_imag;
+
+          // Avoid reading out of bounds if this was the last loop iteration
+          iterator_A_real.clear_mask(gemm_k_iterations <= 2);
+          iterator_A_imag.clear_mask(gemm_k_iterations <= 2);
+          iterator_B_real.clear_mask(gemm_k_iterations <= 2);
+          iterator_B_imag.clear_mask(gemm_k_iterations <= 2);
+        }
+        // Math on the fragments loaded during the previous warp-level iteration
+        warp_mma_planar_complex(
+          warp_mma, 
+          accum, 
+          warp_frag_real_A[warp_mma_k % 2], 
+          warp_frag_imag_A[warp_mma_k % 2],
+          warp_frag_real_B[warp_mma_k % 2], 
+          warp_frag_imag_B[warp_mma_k % 2]);
+      }
+    }
+
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_singlestage.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_singlestage.h
new file mode 100755
index 000000000..311562865
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_singlestage.h
@@ -0,0 +1,265 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a single-stage threadblock-scoped GEMM kernel.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/aligned_buffer.h"
+
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/mma_base.h"
+
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Threadblock-scoped GEMM mainloop that uses a single shared-memory stage (MmaBase<..., 1>).
+template <
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape_,
+  /// Iterates over tiles of A operand in global memory
+  /// (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
+  typename IteratorA_,
+  /// Iterates over tiles of A operand in shared memory
+  /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+  typename SmemIteratorA_,
+  /// Iterates over tiles of B operand in global memory
+  /// (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
+  typename IteratorB_,
+  /// Iterates over tiles of B operand in shared memory
+  /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+  typename SmemIteratorB_,
+  /// Data type of accumulator matrix
+  typename ElementC_,
+  /// Layout of accumulator matrix
+  typename LayoutC_,
+  /// Policy describing tuning details (concept: MmaPolicy)
+  typename Policy_,
+  /// Used for partial specialization
+  typename Enable = bool
+>
+class MmaSingleStage : public MmaBase<Shape_, Policy_, 1> {
+public:
+
+  /// Base class (MmaBase instantiated with exactly one stage)
+  using Base = MmaBase<Shape_, Policy_, 1>;
+
+  using Shape = Shape_;             ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using IteratorA = IteratorA_;     ///< Iterates over tiles of A operand in global memory
+  using IteratorB = IteratorB_;     ///< Iterates over tiles of B operand in global memory
+  using ElementC = ElementC_;       ///< Data type of accumulator matrix
+  using LayoutC = LayoutC_;         ///< Layout of accumulator matrix
+  using Policy = Policy_;           ///< Policy describing tuning details
+
+  using SmemIteratorA = SmemIteratorA_;
+  using SmemIteratorB = SmemIteratorB_;
+
+  //
+  // Dependent types
+  //
+
+  /// Fragment of operand A loaded from global memory
+  using FragmentA = typename IteratorA::Fragment;
+
+  /// Fragment of operand B loaded from global memory
+  using FragmentB = typename IteratorB::Fragment;
+
+  /// Fragment of accumulator tile
+  using FragmentC = typename Policy::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+
+  using ArchTag = arch::Sm70;  // NOTE(review): presumably the minimum targeted architecture - confirm
+
+  /// Complex transform on A operand
+  static ComplexTransform const kTransformA = Operator::kTransformA;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB = Operator::kTransformB;
+
+  // Statically assert that kStages for MmaSingleStage is 1 (single-stage mma pipeline)
+  static_assert((Base::kStages==1), "MmaSingleStage requires kStages set to value 1");
+private:
+
+  using WarpFragmentA = typename Operator::FragmentA;
+  using WarpFragmentB = typename Operator::FragmentB;
+
+protected:
+
+  /// Iterator to write threadblock-scoped tile of A operand to shared memory
+  SmemIteratorA smem_iterator_A_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB smem_iterator_B_;
+
+public:
+
+  /// Construct from shared storage and thread/warp/lane indices
+  CUTLASS_DEVICE
+  MmaSingleStage(
+    typename Base::SharedStorage &shared_storage,       ///< Shared storage needed for internal use by threadblock-scoped GEMM
+    int thread_idx,                                     ///< ID within the threadblock
+    int warp_idx,                                       ///< ID of warp
+    int lane_idx                                        ///< ID of each thread within a warp
+  ):
+    Base(shared_storage, thread_idx, warp_idx, lane_idx),
+    smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
+    smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) {
+
+    // Compute warp location within threadblock tile by mapping the warp_id to
+    // three coordinates:
+    //   _m: the warp's position within the threadblock along the M dimension
+    //   _n: the warp's position within the threadblock along the N dimension
+    //   _k: the warp's position within the threadblock along the K dimension
+
+    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
+    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
+
+    int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
+    int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
+
+    // Add per-warp offsets in units of warp-level tiles
+    this->warp_tile_iterator_A_.add_tile_offset({warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
+    this->warp_tile_iterator_B_.add_tile_offset({Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
+
+  }
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate
+  CUTLASS_DEVICE
+  void operator()(
+    int gemm_k_iterations,            ///< number of iterations of the mainloop
+    FragmentC &accum,                 ///< destination accumulator tile
+    IteratorA iterator_A,             ///< iterator over A operand in global memory
+    IteratorB iterator_B,             ///< iterator over B operand in global memory
+    FragmentC const &src_accum) {     ///< source accumulator tile
+
+    //
+    // Prologue
+    //
+
+    // Seed the destination accumulators with the source accumulator tile
+    accum = src_accum;
+
+    FragmentA tb_frag_A;
+    FragmentB tb_frag_B;
+
+    tb_frag_A.clear();
+    tb_frag_B.clear();
+
+    // Prologue load of the first tile the iterators yield (upstream note: this is the last kblock)
+    iterator_A.load(tb_frag_A);
+    iterator_B.load(tb_frag_B);
+
+    ++iterator_A;
+    ++iterator_B;
+
+    // Warp-level operand fragments; a single pair that is reloaded before every warp-level MMA
+    WarpFragmentA warp_frag_A;
+    WarpFragmentB warp_frag_B;
+
+    Operator warp_mma;
+
+    // Avoid reading out of bounds: clear the masks if at most one k-iteration was requested
+    iterator_A.clear_mask(gemm_k_iterations <= 1);
+    iterator_B.clear_mask(gemm_k_iterations <= 1);
+
+    //
+    // Mainloop
+    //
+
+    CUTLASS_GEMM_LOOP
+    for (; gemm_k_iterations > 0; --gemm_k_iterations) {
+      this->smem_iterator_A_.store(tb_frag_A);
+      this->smem_iterator_B_.store(tb_frag_B);
+
+      __syncthreads();  // barrier between the shared-memory stores above and the warp-level loads below
+
+      //
+      // Loop over GEMM K dimension
+      //
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) {
+
+        // Select the k-group and load this iteration's warp-level tiles from shared memory.
+        // The modulo is a no-op here because warp_mma_k < Base::kWarpGemmIterations.
+        
+        this->warp_tile_iterator_A_.set_kgroup_index(warp_mma_k % Base::kWarpGemmIterations);
+        this->warp_tile_iterator_B_.set_kgroup_index(warp_mma_k % Base::kWarpGemmIterations);
+
+        this->warp_tile_iterator_A_.load(warp_frag_A);
+        this->warp_tile_iterator_B_.load(warp_frag_B);
+
+        ++this->warp_tile_iterator_A_;
+        ++this->warp_tile_iterator_B_;
+
+        warp_mma(accum, warp_frag_A, warp_frag_B, accum);
+      }
+
+      // Add negative offsets to return smem load iterators to the 'start' of the shared memory
+      this->warp_tile_iterator_A_.add_tile_offset({0, -Policy::kPartitionsK * Base::kWarpGemmIterations});
+      this->warp_tile_iterator_B_.add_tile_offset({-Policy::kPartitionsK * Base::kWarpGemmIterations, 0});
+
+      __syncthreads();  // barrier between the warp-level loads above and the next iteration's stores
+
+      iterator_A.load(tb_frag_A);
+      iterator_B.load(tb_frag_B);
+
+      ++iterator_A;
+      ++iterator_B;
+
+      // Avoid reading out of bounds: with <= 2 iterations left, the tile just loaded is the last one needed
+      iterator_A.clear_mask(gemm_k_iterations <= 2);
+      iterator_B.clear_mask(gemm_k_iterations <= 2);
+    }
+
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_softmax_mainloop_fusion_multistage.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_softmax_mainloop_fusion_multistage.h
new file mode 100755
index 000000000..bd793fc84
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_softmax_mainloop_fusion_multistage.h
@@ -0,0 +1,756 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
+
+    It loads two loop invariant vectors, norm and sum, in the prologue and
+    stores them in the register file.  We will call elementwise operation to
+    apply norm and sum between ldmatrix and warp mma.
+*/
+
+#pragma once
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/transform/threadblock/predicated_scale_bias_vector_iterator.h"
+#include "cutlass/gemm/threadblock/mma_base.h"
+#include "cutlass/gemm/warp/softmax_scale_bias_transform.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Base class for mainloop-fusion threadblock MMAs: derives warp-level types from the policy,
+/// declares the shared-memory operand storage, and owns the warp-level tile iterators.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// Number of stages,
+    int Stages,
+    /// Used for partial specialization
+    typename Enable = bool>
+class MmaMainloopFusionBase {
+ public:
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+
+  /// Policy describing tuning details
+  using Policy = Policy_;
+
+  //
+  // Dependent types
+  //
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+
+  /// Shape describing the overall GEMM computed from shared memory
+  /// by each warp.
+  using WarpGemm = typename Policy::Operator::Shape;
+
+  /// Shape describing the number of warps filling the CTA
+  using WarpCount = cutlass::gemm::GemmShape<Shape::kM / WarpGemm::kM,
+                                             Shape::kN / WarpGemm::kN,
+                                             Shape::kK / WarpGemm::kK>;
+
+  /// Number of warp-level GEMM operations
+  static int const kWarpGemmIterations =
+      (WarpGemm::kK / Operator::Policy::MmaShape::kK);
+
+  /// Number of stages
+  static int const kStages = Stages;
+
+  /// Tensor reference to the A operand
+  using TensorRefA = TensorRef<typename Operator::ElementA, typename Operator::LayoutA>;
+
+  /// Tensor reference to the B operand
+  using TensorRefB = TensorRef<typename Operator::ElementB, typename Operator::LayoutB>;
+
+  //
+  // Nested structs
+  //
+
+  /// Shared storage object needed by threadblock-scoped GEMM
+  class SharedStorage {
+   public:
+    //
+    // Type definitions
+    //
+
+    /// Shape of the A matrix operand in shared memory (kStages tiles along K, plus padding)
+    using ShapeA = MatrixShape<Shape::kM + Policy::SmemPaddingA::kRow,
+                               Shape::kK * kStages +
+                                   Policy::SmemPaddingA::kColumn>;
+
+    /// Shape of the B matrix operand in shared memory (kStages tiles along K, plus padding)
+    using ShapeB =
+        MatrixShape<Shape::kK * kStages + Policy::SmemPaddingB::kRow,
+                    Shape::kN + Policy::SmemPaddingB::kColumn>;
+
+   public:
+    //
+    // Data members
+    //
+
+    /// Buffer for A operand
+    AlignedBuffer<typename Operator::ElementA, ShapeA::kCount> operand_A;
+
+    /// Buffer for B operand
+    AlignedBuffer<typename Operator::ElementB, ShapeB::kCount> operand_B;
+
+   public:
+
+    //
+    // Methods
+    //
+
+    /// Returns a layout object for the A matrix (host/device: operand_A_ref() calls it, as LayoutB())
+    CUTLASS_HOST_DEVICE
+    static typename Operator::LayoutA LayoutA() {
+      return Operator::LayoutA::packed({ShapeA::kRow, ShapeA::kColumn});
+    }
+
+    /// Returns a layout object for the B matrix
+    CUTLASS_HOST_DEVICE
+    static typename Operator::LayoutB LayoutB() {
+      return Operator::LayoutB::packed({ShapeB::kRow, ShapeB::kColumn});
+    }
+
+    /// Returns a TensorRef to the A operand
+    CUTLASS_HOST_DEVICE
+    TensorRefA operand_A_ref() {
+      return TensorRefA{operand_A.data(), LayoutA()};
+    }
+
+    /// Returns a TensorRef to the B operand
+    CUTLASS_HOST_DEVICE
+    TensorRefB operand_B_ref() {
+      return TensorRefB{operand_B.data(), LayoutB()};
+    }
+  };
+
+ protected:
+
+  //
+  // Data members
+  //
+
+  /// Iterator to load a warp-scoped tile of A operand from shared memory
+  typename Operator::IteratorA warp_tile_iterator_A_;
+
+  /// Iterator to load a warp-scoped tile of B operand from shared memory
+  typename Operator::IteratorB warp_tile_iterator_B_;
+
+public:
+
+  /// Construct the warp-level tile iterators over the shared-memory operands
+  CUTLASS_DEVICE
+  MmaMainloopFusionBase(
+      /// Shared storage needed for internal use by threadblock-scoped GEMM
+      SharedStorage &shared_storage,
+      /// ID within the threadblock (not used by this base class)
+      int thread_idx,
+      /// ID of warp (not used by this base class)
+      int warp_idx,
+      /// ID of each thread within a warp
+      int lane_idx)
+      : warp_tile_iterator_A_(shared_storage.operand_A_ref(), lane_idx),
+        warp_tile_iterator_B_(shared_storage.operand_B_ref(), lane_idx) {}
+};
+
+
+/// Structure to compute the matrix product targeting CUDA cores and SIMT math
+/// instructions.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    /// Iterates over tiles of A operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorA_,
+    /// Iterates over tiles of A operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorA_,
+    /// Cache operation for operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB_,
+    /// Cache operation for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB,
+    /// Iterates over vectors of var and mean vector in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorNormSum_,
+    /// Data type of accumulator matrix
+    typename ElementC_,
+    /// Data type of accumulator matrix
+    typename LayoutC_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// Number of stages,
+    int Stages,
+    /// Whether problem has been transformed. This determines to which operand
+    /// the softmax is applied.
+    bool InternalTranspose,
+    /// Use zfill or predicate for out-of-bound cp.async
+    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone,
+    /// Used for partial specialization
+    typename Enable = bool>
+class MmaSoftmaxMainloopFusionMultistage : 
+  public MmaMainloopFusionBase<Shape_, Policy_, Stages> {
+public:
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+  ///< Iterates over tiles of A operand in global memory
+  using IteratorA = IteratorA_;
+  ///< Iterates over tiles of B operand in global memory
+  using IteratorB = IteratorB_;
+  ///< Iterates over tiles of the var and mean vectors in global memory
+  using IteratorNormSum = IteratorNormSum_;
+  ///< Policy describing tuning details
+  using Policy = Policy_;
+
+  ///< Base class
+  using Base = MmaMainloopFusionBase<Shape_, Policy, Stages>;
+
+  ///< Data type of accumulator matrix
+  using ElementC = ElementC_;
+  ///< Layout of accumulator matrix
+  using LayoutC = LayoutC_;
+
+  using SmemIteratorA = SmemIteratorA_;
+  using SmemIteratorB = SmemIteratorB_;
+
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
+
+  //
+  // Dependent types
+  //
+
+  /// Fragment of accumulator tile
+  using FragmentC = typename Policy::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+
+  /// Minimum architecture is Sm80 to support cp.async
+  using ArchTag = arch::Sm80;
+  
+  /// Complex transform on A operand
+  static ComplexTransform const kTransformA = Operator::kTransformA;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB = Operator::kTransformB;
+
+  /// Internal structure exposed for introspection.
+  struct Detail {
+
+    static_assert(Base::kWarpGemmIterations > 1,
+                  "The pipelined structure requires at least two warp-level "
+                  "GEMM operations.");
+
+    /// Number of cp.async instructions to load one stage of operand A
+    static int const AsyncCopyIterationsPerStageA =
+        IteratorA::ThreadMap::Iterations::kCount;
+
+    /// Number of cp.async instructions to load one stage of operand B
+    static int const AsyncCopyIterationsPerStageB =
+        IteratorB::ThreadMap::Iterations::kCount;
+
+    /// Number of stages
+    static int const kStages = Stages;
+
+    /// Number of cp.async instructions to load on group of operand A
+    static int const kAccessesPerGroupA =
+        (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
+
+    /// Number of cp.async instructions to load on group of operand B
+    static int const kAccessesPerGroupB =
+        (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
+  };
+
+ private:
+
+  using WarpLoadedFragmentA = typename Operator::FragmentA;
+  using WarpLoadedFragmentB = typename Operator::FragmentB;
+  using WarpTransformedFragmentA = typename Operator::TransformedFragmentA;
+  using WarpTransformedFragmentB = typename Operator::TransformedFragmentB;
+
+  using WarpLoadedFragmentNormSum = typename IteratorNormSum::Fragment;
+
+  static bool const kInternalTranspose = InternalTranspose;
+
+  using SoftmaxFragment = typename platform::conditional<kInternalTranspose,
+                                                         WarpTransformedFragmentB,
+                                                         WarpTransformedFragmentA>::type;
+
+
+ private:
+
+  //
+  // Data members
+  //
+
+  /// Iterator to write threadblock-scoped tile of A operand to shared memory
+  SmemIteratorA smem_iterator_A_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB smem_iterator_B_;
+
+  int warp_idx_m_;
+
+  int warp_idx_n_;
+
+public:
+
+  /// Construct from tensor references
+  CUTLASS_DEVICE
+  MmaSoftmaxMainloopFusionMultistage(
+      ///< Shared storage needed for internal use by threadblock-scoped GEMM
+      typename Base::SharedStorage &shared_storage,
+      ///< ID within the threadblock
+      int thread_idx,
+      ///< ID of warp
+      int warp_idx,
+      ///< ID of each thread within a warp
+      int lane_idx
+    ):
+      Base(shared_storage, thread_idx, warp_idx, lane_idx),
+      smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
+      smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx)
+  {
+    // Compute warp location within threadblock tile by mapping the warp_id to
+    // three coordinates:
+    //   _m: the warp's position within the threadblock along the M dimension
+    //   _n: the warp's position within the threadblock along the N dimension
+    //   _k: the warp's position within the threadblock along the K dimension
+
+    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
+    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
+
+    warp_idx_m_ = warp_idx_mn % Base::WarpCount::kM;
+    warp_idx_n_ = warp_idx_mn / Base::WarpCount::kM;
+
+    // Add per-warp offsets in units of warp-level tiles
+    this->warp_tile_iterator_A_.add_tile_offset(
+        {warp_idx_m_, Base::kWarpGemmIterations * warp_idx_k});
+    this->warp_tile_iterator_B_.add_tile_offset(
+        {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n_});
+  }
+
+  CUTLASS_DEVICE
+  void copy_tiles_and_advance(IteratorA &iterator_A,
+                              IteratorB &iterator_B,
+                              int group_start_A = 0, int group_start_B = 0) {
+    iterator_A.set_iteration_index(group_start_A *
+                                   IteratorA::kAccessesPerVector);
+    this->smem_iterator_A_.set_iteration_index(group_start_A);
+
+    // Async Copy for operand A
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) {
+      if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) {
+        typename IteratorA::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorA::AccessType *>(
+                this->smem_iterator_A_.get());
+
+        int const kSrcBytes = sizeof_bits<typename IteratorA::Element>::value *
+                              IteratorA::ThreadMap::kElementsPerAccess /
+                              IteratorA::kAccessesPerVector / 8;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
+          auto gmem_ptr = iterator_A.get();
+
+          if (SharedMemoryClear == SharedMemoryClearOption::kZfill) {
+            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
+                dst_ptr + v, gmem_ptr, iterator_A.valid());
+          } else {
+            cutlass::arch::cp_async<kSrcBytes, kCacheOpA>(
+                dst_ptr + v, gmem_ptr, iterator_A.valid());
+          }
+
+          ++iterator_A;
+        }
+
+        ++this->smem_iterator_A_;
+      }
+    }
+
+    iterator_B.set_iteration_index(group_start_B *
+                                   IteratorB::kAccessesPerVector);
+    this->smem_iterator_B_.set_iteration_index(group_start_B);
+
+    // Async Copy for operand B
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) {
+      if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) {
+        typename IteratorB::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorB::AccessType *>(
+                this->smem_iterator_B_.get());
+
+        int const kSrcBytes = sizeof_bits<typename IteratorB::Element>::value *
+                              IteratorB::ThreadMap::kElementsPerAccess /
+                              IteratorB::kAccessesPerVector / 8;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
+          auto gmem_ptr = iterator_B.get();
+
+          if (SharedMemoryClear == SharedMemoryClearOption::kZfill) {
+            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
+                dst_ptr + v, gmem_ptr, iterator_B.valid());
+          } else {
+            cutlass::arch::cp_async<kSrcBytes, kCacheOpB>(
+                dst_ptr + v, gmem_ptr, iterator_B.valid());
+          }
+
+          ++iterator_B;
+        }
+        ++this->smem_iterator_B_;
+      }
+    }
+  }
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate
+  CUTLASS_DEVICE
+  void operator()(
+      ///< problem size of GEMM
+      int gemm_k_iterations,
+      ///< destination accumulator tile
+      FragmentC &accum,
+      ///< iterator over A operand in global memory
+      IteratorA iterator_A,
+      ///< iterator over B operand in global memory
+      IteratorB iterator_B,
+      ///< iterator over B operand in global memory
+      IteratorNormSum iterator_norm_sum,
+      ///< initial value of accumulator
+      FragmentC const &src_accum) {
+
+    //
+    // Prologue
+    //
+    // Issue several complete stages
+
+    WarpLoadedFragmentNormSum warp_loaded_frag_norm_sum;
+    iterator_norm_sum.add_tile_offset({0, warp_idx_m_});
+    iterator_norm_sum.load(warp_loaded_frag_norm_sum);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int stage = 0; stage < Base::kStages - 1;
+         ++stage, --gemm_k_iterations) {
+
+      iterator_A.clear_mask(gemm_k_iterations == 0);
+      iterator_B.clear_mask(gemm_k_iterations == 0);
+
+      iterator_A.set_iteration_index(0);
+      this->smem_iterator_A_.set_iteration_index(0);
+
+      // Async Copy for operand A
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) {
+        typename IteratorA::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorA::AccessType *>(
+                this->smem_iterator_A_.get());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
+          int const kSrcBytes =
+              sizeof_bits<typename IteratorA::Element>::value *
+              IteratorA::ThreadMap::kElementsPerAccess /
+              IteratorA::kAccessesPerVector / 8;
+
+          int src_bytes = (iterator_A.valid() ? kSrcBytes : 0);
+
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
+              dst_ptr + v, iterator_A.get(), iterator_A.valid());
+
+          ++iterator_A;
+        }
+
+        ++this->smem_iterator_A_;
+      }
+
+      iterator_B.set_iteration_index(0);
+      this->smem_iterator_B_.set_iteration_index(0);
+
+      // Async Copy for operand B
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) {
+        typename IteratorB::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorB::AccessType *>(
+                this->smem_iterator_B_.get());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
+          int const kSrcBytes =
+              sizeof_bits<typename IteratorB::Element>::value *
+              IteratorB::ThreadMap::kElementsPerAccess /
+              IteratorB::kAccessesPerVector / 8;
+
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
+              dst_ptr + v, iterator_B.get(), iterator_B.valid());
+
+          ++iterator_B;
+        }
+
+        ++this->smem_iterator_B_;
+      }
+
+      // Move to the next stage
+      iterator_A.add_tile_offset({0, 1});
+      iterator_B.add_tile_offset({1, 0});
+
+      this->smem_iterator_A_.add_tile_offset({0, 1});
+      this->smem_iterator_B_.add_tile_offset({1, 0});
+
+      // Defines the boundary of a stage of cp.async.
+      cutlass::arch::cp_async_fence();
+    }
+
+    // Perform accumulation in the 'd' output operand
+    accum = src_accum;
+
+    // Waits until kStages-2 stages have committed.
+    cutlass::arch::cp_async_wait<Base::kStages - 2>();
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math
+    // instructions
+    WarpLoadedFragmentA warp_loaded_frag_A[2];
+    WarpLoadedFragmentB warp_loaded_frag_B[2];
+    WarpTransformedFragmentA warp_transformed_frag_A[2];
+    WarpTransformedFragmentB warp_transformed_frag_B[2];
+
+    Operator warp_mma;
+    cutlass::gemm::warp::SoftmaxScaleBiasTransform<
+        SoftmaxFragment, WarpLoadedFragmentNormSum> elementwise_transform;
+
+    this->warp_tile_iterator_A_.set_kgroup_index(0);
+    this->warp_tile_iterator_B_.set_kgroup_index(0);
+
+    this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]);
+    this->warp_tile_iterator_B_.load(warp_loaded_frag_B[0]);
+
+    ++this->warp_tile_iterator_A_;
+    ++this->warp_tile_iterator_B_;
+
+    iterator_A.clear_mask(gemm_k_iterations == 0);
+    iterator_B.clear_mask(gemm_k_iterations == 0);
+
+    // Start issuing the first group of the next stage outside of the mainloop
+    copy_tiles_and_advance(iterator_A, iterator_B);
+
+    int smem_write_stage_idx = Base::kStages - 1;
+    int smem_read_stage_idx = 0;
+
+    warp_mma.transform(warp_transformed_frag_A[0], warp_transformed_frag_B[0],
+                       warp_loaded_frag_A[0], warp_loaded_frag_B[0]);
+
+    if (kInternalTranspose) {
+      elementwise_transform(warp_transformed_frag_B[0],
+                         warp_loaded_frag_norm_sum);
+    } else {
+      elementwise_transform(warp_transformed_frag_A[0],
+                         warp_loaded_frag_norm_sum);
+    }
+
+    //
+    // Mainloop
+    //
+
+    CUTLASS_GEMM_LOOP
+    for (; gemm_k_iterations > (-Base::kStages + 1);) {
+      //
+      // Loop over GEMM K dimension
+      //
+
+      // Computes a warp-level GEMM on data held in shared memory
+      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations;
+           ++warp_mma_k) {
+
+        // Load warp-level tiles from shared memory, wrapping to k offset if
+        // this is the last group as the case may be.
+
+        this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
+        this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
+        
+        this->warp_tile_iterator_A_.load(warp_loaded_frag_A[(warp_mma_k + 1) % 2]);
+        this->warp_tile_iterator_B_.load(warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
+
+        ++this->warp_tile_iterator_A_;
+        ++this->warp_tile_iterator_B_;
+
+        if (warp_mma_k > 0) {
+          warp_mma.transform(warp_transformed_frag_A[warp_mma_k % 2],
+                             warp_transformed_frag_B[warp_mma_k % 2],
+                             warp_loaded_frag_A[warp_mma_k % 2],
+                             warp_loaded_frag_B[warp_mma_k % 2]);
+
+              if (kInternalTranspose) {
+                elementwise_transform(warp_transformed_frag_B[warp_mma_k % 2],
+                                  warp_loaded_frag_norm_sum);
+              } else {
+                elementwise_transform(warp_transformed_frag_A[warp_mma_k % 2],
+                                  warp_loaded_frag_norm_sum);
+              }
+        }
+
+        // Issue global->shared copies for the next stage
+        int group_start_iteration_A, group_start_iteration_B;
+
+        if (warp_mma_k + 1 == Base::kWarpGemmIterations) {
+          group_start_iteration_A = 0;
+          group_start_iteration_B = 0;
+        } else {
+          group_start_iteration_A =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupA;
+          group_start_iteration_B =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupB;
+        }
+
+        copy_tiles_and_advance(iterator_A, iterator_B,
+                               group_start_iteration_A,
+                               group_start_iteration_B);
+
+        warp_mma(
+          accum, 
+          warp_transformed_frag_A[warp_mma_k % 2],
+          warp_transformed_frag_B[warp_mma_k % 2], 
+          accum
+        );
+
+        if (warp_mma_k + 2 == Base::kWarpGemmIterations) {
+
+          // Inserts a memory fence between stages of cp.async instructions.
+          cutlass::arch::cp_async_fence();
+
+          // Waits until kStages-2 stages have committed.
+          arch::cp_async_wait<Base::kStages - 2>();
+          __syncthreads();
+
+          // Move to the next stage
+          iterator_A.add_tile_offset({0, 1});
+          iterator_B.add_tile_offset({1, 0});
+
+          this->smem_iterator_A_.add_tile_offset({0, 1});
+          this->smem_iterator_B_.add_tile_offset({1, 0});
+
+          // Add negative offsets to return iterators to the 'start' of the
+          // circular buffer in shared memory
+          if (smem_write_stage_idx == (Base::kStages - 1)) {
+            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
+            this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
+            smem_write_stage_idx = 0;
+          } else {
+            ++smem_write_stage_idx;
+          }
+
+          if (smem_read_stage_idx == (Base::kStages - 1)) {
+            this->warp_tile_iterator_A_.add_tile_offset(
+                {0, -Base::kStages * Policy::kPartitionsK *
+                        Base::kWarpGemmIterations});
+            this->warp_tile_iterator_B_.add_tile_offset(
+                {-Base::kStages * Policy::kPartitionsK *
+                     Base::kWarpGemmIterations,
+                 0});
+            smem_read_stage_idx = 0;
+          } else {
+            ++smem_read_stage_idx;
+          }
+
+          --gemm_k_iterations;
+          iterator_A.clear_mask(gemm_k_iterations == 0);
+          iterator_B.clear_mask(gemm_k_iterations == 0);
+        }
+
+        // Do any conversions feeding the first stage at the end of the loop so
+        // we can start right away on mma instructions
+        if (warp_mma_k + 1 == Base::kWarpGemmIterations) {
+          warp_mma.transform(warp_transformed_frag_A[(warp_mma_k + 1) % 2],
+                             warp_transformed_frag_B[(warp_mma_k + 1) % 2],
+                             warp_loaded_frag_A[(warp_mma_k + 1) % 2],
+                             warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
+
+              if (kInternalTranspose) {
+                elementwise_transform(warp_transformed_frag_B[(warp_mma_k + 1) % 2],
+                                  warp_loaded_frag_norm_sum);
+              } else {
+                elementwise_transform(warp_transformed_frag_A[(warp_mma_k + 1) % 2],
+                                  warp_loaded_frag_norm_sum);
+              }
+        }
+      }
+
+    }
+    
+    if (SharedMemoryClear == SharedMemoryClearOption::kZfill) {
+      // commit and drain all pending and predicated cp.async pnz from the GEMM mainloop
+      cutlass::arch::cp_async_fence();
+      cutlass::arch::cp_async_wait<0>();
+      __syncthreads();
+    }
+
+    // Commit and drain all pending and predicated cp.async pnz from the GEMM mainloop
+    cutlass::arch::cp_async_fence();
+    cutlass::arch::cp_async_wait<0>();
+    __syncthreads();
+
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_sparse_base.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_sparse_base.h
new file mode 100755
index 000000000..bb10c0a8f
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_sparse_base.h
@@ -0,0 +1,273 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Policy and base class (shared storage, warp-level iterators) for threadblock-scoped sparse GEMM kernels.
+*/
+
+#pragma once
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Policy for sparse threadblock-scoped MMAs: warp-level operator, A/B/E shared-memory padding, K partition count
+template <
+    /// Warp-level GEMM operator (concept: gemm::warp::Mma)
+    typename Operator_,
+    /// Padding used for A operand in shared memory (concept: MatrixShape)
+    typename SmemPaddingA_,
+    /// Padding used for B operand in shared memory (concept: MatrixShape)
+    typename SmemPaddingB_,
+    /// Padding used for E operand in shared memory (concept: MatrixShape)
+    typename SmemPaddingE_,
+    /// Number of partitions of K dimension of GEMM
+    int PartitionsK = 1>
+struct SparseMmaPolicy {
+  /// Warp-level GEMM operator (concept: gemm::warp::MmaTensorOp or gemm::warp::MmaSimt)
+  using Operator = Operator_;
+
+  /// Padding used for A operand in shared memory
+  using SmemPaddingA = SmemPaddingA_;
+
+  /// Padding used for B operand in shared memory
+  using SmemPaddingB = SmemPaddingB_;
+
+  /// Padding used for E operand in shared memory
+  using SmemPaddingE = SmemPaddingE_;
+
+  /// Number of partitions of K dimension
+  static int const kPartitionsK = PartitionsK;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Base class for threadblock-scoped sparse GEMMs: defines the shared-memory storage for the
+/// A, B and E operands and owns the warp-level iterators that read them.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    /// Policy describing tuning details (concept: SparseMmaPolicy)
+    typename Policy_,
+    /// Number of stages,
+    int Stages,
+    /// Used for partial specialization
+    typename Enable = bool>
+class SparseMmaBase {
+ public:
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+
+  ///< Policy describing tuning details
+  using Policy = Policy_;
+
+  //
+  // Dependent types
+  //
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+
+  /// Shape describing the overall GEMM computed from shared memory
+  /// by each warp.
+  using WarpGemm = typename Policy::Operator::Shape;
+
+  /// Shape describing the number of warps filling the CTA
+  using WarpCount = GemmShape<Shape::kM / WarpGemm::kM,
+                              Shape::kN / WarpGemm::kN,
+                              Shape::kK / WarpGemm::kK>;
+
+  /// Number of warp-level GEMM operations
+  static int const kWarpGemmIterations =
+      (WarpGemm::kK / Operator::Policy::MmaShape::kK);
+
+  static_assert(kWarpGemmIterations > 1,
+                "The pipelined structure requires at least two warp-level "
+                "GEMM operations.");
+
+  static_assert((kWarpGemmIterations % 2) == 0,
+                "Inner loop iteration must be an even number.");
+
+  /// Number of stages
+  static int const kStages = Stages;
+
+  static int const kSparse = Operator::kSparse;
+
+  static int const kElementsPerElementE = Operator::kElementsPerElementE;
+
+  /// Tensor reference to the A operand
+  using TensorRefA = TensorRef<typename Operator::ElementA, typename Operator::LayoutA>;
+
+  /// Tensor reference to the B operand
+  using TensorRefB = TensorRef<typename Operator::ElementB, typename Operator::LayoutB>;
+
+  /// Tensor reference to the E operand
+  using TensorRefE = TensorRef<typename Operator::ElementE, typename Operator::LayoutE>;
+
+  //
+  // Nested structs
+  //
+
+  /// Shared storage object needed by threadblock-scoped GEMM
+  class SharedStorage {
+   public:
+    //
+    // Type definitions
+    //
+
+    /// Shape of the A operand in shared memory (each stage holds Shape::kK / kSparse columns)
+    using ShapeA = MatrixShape<Shape::kM + Policy::SmemPaddingA::kRow,
+                               Shape::kK / kSparse * kStages +
+                                   Policy::SmemPaddingA::kColumn>;
+
+    /// Shape of the B operand in shared memory (each stage holds Shape::kK rows)
+    using ShapeB =
+        MatrixShape<Shape::kK * kStages + Policy::SmemPaddingB::kRow,
+                    Shape::kN + Policy::SmemPaddingB::kColumn>;
+
+    /// Shape of the E operand in shared memory (2 * Shape::kM rows; columns scaled by kSparse and kElementsPerElementE)
+    using ShapeE =
+        MatrixShape<Shape::kM * 2 + Policy::SmemPaddingE::kRow,
+                    Shape::kK / kSparse / kElementsPerElementE / 2 * kStages +
+                        Policy::SmemPaddingE::kColumn>;
+
+   public:
+    //
+    // Data members
+    //
+
+    /// Buffer for A operand
+    AlignedBuffer<typename Operator::ElementA, ShapeA::kCount> operand_A;
+
+    /// Buffer for B operand
+    AlignedBuffer<typename Operator::ElementB, ShapeB::kCount> operand_B;
+
+    /// Buffer for E operand
+    AlignedBuffer<typename Operator::ElementE, ShapeE::kCount> operand_E;
+
+   public:
+
+    //
+    // Methods
+    //
+
+    /// Returns a layout object for the A matrix (host/device like LayoutB/LayoutE; called by operand_A_ref)
+    CUTLASS_HOST_DEVICE
+    static typename Operator::LayoutA LayoutA() {
+      return Operator::LayoutA::packed({ShapeA::kRow, ShapeA::kColumn});
+    }
+
+    /// Returns a layout object for the B matrix
+    CUTLASS_HOST_DEVICE
+    static typename Operator::LayoutB LayoutB() {
+      return Operator::LayoutB::packed({ShapeB::kRow, ShapeB::kColumn});
+    }
+
+    /// Returns a layout object for the E matrix
+    CUTLASS_HOST_DEVICE
+    static typename Operator::LayoutE LayoutE() {
+      return Operator::LayoutE::packed({ShapeE::kRow, ShapeE::kColumn});
+    }
+
+    /// Returns a TensorRef to the A operand
+    CUTLASS_HOST_DEVICE
+    TensorRefA operand_A_ref() {
+      return TensorRefA{operand_A.data(), LayoutA()};
+    }
+
+    /// Returns a TensorRef to the B operand
+    CUTLASS_HOST_DEVICE
+    TensorRefB operand_B_ref() {
+      return TensorRefB{operand_B.data(), LayoutB()};
+    }
+
+    /// Returns a TensorRef to the E operand
+    CUTLASS_HOST_DEVICE
+    TensorRefE operand_E_ref() {
+      return TensorRefE{operand_E.data(), LayoutE()};
+    }
+  };
+
+ protected:
+
+  //
+  // Data members
+  //
+
+  /// Iterator to load a warp-scoped tile of A operand from shared memory
+  typename Operator::IteratorA warp_tile_iterator_A_;
+
+  /// Iterator to load a warp-scoped tile of B operand from shared memory
+  typename Operator::IteratorB warp_tile_iterator_B_;
+
+  /// Iterator to load a warp-scoped tile of E operand from shared memory
+  typename Operator::IteratorE warp_tile_iterator_E_;
+
+
+public:
+
+  /// Construct the warp-level A, B and E iterators over the buffers in shared storage
+  CUTLASS_DEVICE
+  SparseMmaBase(
+      ///< Shared storage needed for internal use by threadblock-scoped GEMM
+      SharedStorage &shared_storage,
+      ///< ID within the threadblock (not used by this base constructor)
+      int thread_idx,
+      ///< ID of warp (not used by this base constructor)
+      int warp_idx,
+      ///< ID of each thread within a warp
+      int lane_idx
+    ):
+      warp_tile_iterator_A_(shared_storage.operand_A_ref(), lane_idx),
+      warp_tile_iterator_B_(shared_storage.operand_B_ref(), lane_idx),
+      warp_tile_iterator_E_(shared_storage.operand_E_ref(), lane_idx) {
+
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_sparse_multistage.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_sparse_multistage.h
new file mode 100755
index 000000000..8113583d6
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_sparse_multistage.h
@@ -0,0 +1,668 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a multistage threadblock-scoped sparse GEMM kernel built on cp.async.
+*/
+
+#pragma once
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+
+#include "cutlass/gemm/threadblock/mma_sparse_base.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Threadblock-scoped sparse matrix multiply-accumulate that stages the A, B and E
+/// operands from global memory into shared memory with cp.async (minimum arch Sm80).
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    /// Iterates over tiles of A operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorA_,
+    /// Iterates over tiles of A operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorA_,
+    /// Cache operation for operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB_,
+    /// Cache operation for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB,
+    /// Data type of accumulator matrix
+    typename ElementC_,
+    /// Layout of accumulator matrix
+    typename LayoutC_,
+    /// Iterates over tiles of E operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorE_,
+    /// Iterates over tiles of E operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorE_,
+    /// Cache operation for operand E
+    cutlass::arch::CacheOperation::Kind CacheOpE,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// Number of stages,
+    int Stages,
+    /// Used for partial specialization
+    typename Enable = bool>
+class SparseMmaMultistage : 
+  public SparseMmaBase<Shape_, Policy_, Stages> {
+public:
+  ///< Base class
+  using Base = SparseMmaBase<Shape_, Policy_, Stages>;
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+  ///< Iterates over tiles of A operand in global memory
+  using IteratorA = IteratorA_;
+  ///< Iterates over tiles of B operand in global memory
+  using IteratorB = IteratorB_;
+  ///< Iterates over tiles of E operand in global memory
+  using IteratorE = IteratorE_;
+  ///< Data type of accumulator matrix
+  using ElementC = ElementC_;
+  ///< Layout of accumulator matrix
+  using LayoutC = LayoutC_;
+  ///< Policy describing tuning details
+  using Policy = Policy_;
+
+  using SmemIteratorA = SmemIteratorA_;
+  using SmemIteratorB = SmemIteratorB_;
+  using SmemIteratorE = SmemIteratorE_;
+
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpE = CacheOpE;
+
+  static int const kSparse = Policy::Operator::kSparse;
+  static int const kMetaSizeInBits = Policy::Operator::kMetaSizeInBits;
+  static int const kMaxID2 = Policy::Operator::kMaxID2;
+  static int const kElementsPerElementE =
+      Policy::Operator::kElementsPerElementE;
+
+  //
+  // Dependent types
+  //
+
+  /// Fragment of accumulator tile
+  using FragmentC = typename Policy::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+
+  /// ElementE
+  using ElementE = typename IteratorE::Element;
+
+  /// LayoutE
+  using LayoutE = typename IteratorE::Layout; 
+
+  /// Minimum architecture is Sm80 to support cp.async
+  using ArchTag = arch::Sm80;
+  
+  /// Complex transform on A operand
+  static ComplexTransform const kTransformA = Operator::kTransformA;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB = Operator::kTransformB;
+
+  /// Internal structure exposed for introspection (per-stage and per-group copy counts, buffer sizing).
+  struct Detail {
+
+    /// Thread-map iterations to load one stage of operand A (each issues IteratorA::kAccessesPerVector async copies)
+    static int const TBLoadIterationsA =
+        IteratorA::ThreadMap::Iterations::kCount;
+
+    /// Thread-map iterations to load one stage of operand B (each issues IteratorB::kAccessesPerVector async copies)
+    static int const TBLoadIterationsB =
+        IteratorB::ThreadMap::Iterations::kCount;
+
+    /// Thread-map iterations to load one stage of operand E (one async copy per iteration)
+    static int const TBLoadIterationsE =
+        IteratorE::ThreadMap::Iterations::kCount;
+
+    /// Number of stages
+    static int const kStages = Stages;
+
+    /// Accesses per group of operand A: ceil(TBLoadIterationsA / Base::kWarpGemmIterations)
+    static int const kAccessesPerGroupA =
+        (TBLoadIterationsA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
+
+    /// Accesses per group of operand B: ceil(TBLoadIterationsB / Base::kWarpGemmIterations)
+    static int const kAccessesPerGroupB =
+        (TBLoadIterationsB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
+
+    /// Accesses per group of operand E: ceil(TBLoadIterationsE / Base::kWarpGemmIterations)
+    static int const kAccessesPerGroupE =
+        (TBLoadIterationsE + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
+
+    /// E operand is tiny. Most of the time not all warps are needed to load it from global memory:
+    /// only IteratorE::ThreadMap::kThreads / 32 warps (32 threads per warp) take part.
+    static int const kValidWarps = IteratorE::ThreadMap::kThreads / 32;
+
+    /// B operand is twice as big as A, which brings very high register pressure. The value is 1 (no double buffer) when
+    /// ElementC is 4 bytes, Operator::Policy::Operator's ElementA/B match Operator's, and the warp tile is >= 64x64; else 2.
+    static int const kBBufferSize =
+        ((sizeof(typename Operator::ElementC) == 4) &&
+         ((platform::is_same<typename Operator::Policy::Operator::ElementA,
+                             typename Operator::ElementA>::value &&
+           platform::is_same<typename Operator::Policy::Operator::ElementB,
+                             typename Operator::ElementB>::value)) &&
+         (Operator::Shape::kM >= 64 && Operator::Shape::kN >= 64))
+            ? 1
+            : 2;
+  };
+
+ private:
+
+  using WarpLoadedFragmentA = typename Operator::FragmentA;
+  using WarpLoadedFragmentB = typename Operator::FragmentB;
+  using WarpTransformedFragmentA = typename Operator::TransformedFragmentA;
+  using WarpTransformedFragmentB = typename Operator::TransformedFragmentB;
+  using WarpFragmentE = typename Operator::FragmentE;
+
+ private:
+
+  //
+  // Data members
+  //
+
+  /// Iterator to write threadblock-scoped tile of A operand to shared memory
+  SmemIteratorA smem_iterator_A_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB smem_iterator_B_;
+
+  /// Iterator to write threadblock-scoped tile of E operand to shared memory
+  SmemIteratorE smem_iterator_E_;
+
+  /// True when this warp takes part in loading operand E (warp_idx < Detail::kValidWarps)
+  bool is_warp_valid_;
+
+public:
+
+  /// Construct the shared-memory write iterators for A, B and E and position the warp-level read iterators
+  CUTLASS_DEVICE
+  SparseMmaMultistage(
+      ///< Shared storage needed for internal use by threadblock-scoped GEMM
+      typename Base::SharedStorage &shared_storage,
+      ///< ID within the threadblock (used by the shared-memory write iterators)
+      int thread_idx,
+      ///< ID of warp (selects the warp's tile offsets and whether it loads operand E)
+      int warp_idx,
+      ///< ID of each thread within a warp
+      int lane_idx
+    ):
+      Base(shared_storage, thread_idx, warp_idx, lane_idx),
+      smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
+      smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx),
+      smem_iterator_E_(shared_storage.operand_E_ref(), thread_idx)
+  {
+    is_warp_valid_ = warp_idx < Detail::kValidWarps;  // later ANDed into the cp.async predicate for operand E
+
+    // Compute warp location within threadblock tile by mapping the warp_id to
+    // three coordinates:
+    //   _m: the warp's position within the threadblock along the M dimension
+    //   _n: the warp's position within the threadblock along the N dimension
+    //   _k: the warp's position within the threadblock along the K dimension
+
+    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
+    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
+
+    int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
+    int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
+
+    // Add per-warp offsets in units of warp-level tiles (E receives the same {m, k} offset as A)
+    this->warp_tile_iterator_A_.add_tile_offset(
+        {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
+    this->warp_tile_iterator_B_.add_tile_offset(
+        {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
+    this->warp_tile_iterator_E_.add_tile_offset(
+        {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
+  }
+  /// Issue one group of cp.async copies (global -> shared) for operands A, B and E, advancing all iterators
+  CUTLASS_DEVICE
+  void copy_tiles_and_advance(IteratorA &iterator_A, IteratorB &iterator_B,
+                              IteratorE &iterator_E, int group_start_A = 0,
+                              int group_start_B = 0, int group_start_E = 0) {
+    iterator_A.set_iteration_index(group_start_A *
+                                   IteratorA::kAccessesPerVector);
+    this->smem_iterator_A_.set_iteration_index(group_start_A);
+
+    // async copy for operand A: up to kAccessesPerGroupA accesses, skipping indices >= TBLoadIterationsA
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) {
+      if (group_start_A + j < Detail::TBLoadIterationsA) {
+        typename IteratorA::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorA::AccessType *>(
+                this->smem_iterator_A_.get());
+
+        int const kSrcBytes = sizeof_bits<typename IteratorA::Element>::value *
+                              IteratorA::ThreadMap::kElementsPerAccess /
+                              IteratorA::kAccessesPerVector / 8;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
+          auto gmem_ptr = iterator_A.get();
+
+          cutlass::arch::cp_async<kSrcBytes, kCacheOpA>(
+              dst_ptr + v, gmem_ptr, iterator_A.valid());
+
+          ++iterator_A;
+        }
+
+        ++this->smem_iterator_A_;
+      }
+    }
+
+    iterator_B.set_iteration_index(group_start_B *
+                                   IteratorB::kAccessesPerVector);
+    this->smem_iterator_B_.set_iteration_index(group_start_B);
+
+    // async copy for operand B: up to kAccessesPerGroupB accesses, skipping indices >= TBLoadIterationsB
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) {
+      if (group_start_B + j < Detail::TBLoadIterationsB) {
+        typename IteratorB::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorB::AccessType *>(
+                this->smem_iterator_B_.get());
+
+        int const kSrcBytes = sizeof_bits<typename IteratorB::Element>::value *
+                              IteratorB::ThreadMap::kElementsPerAccess /
+                              IteratorB::kAccessesPerVector / 8;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
+          auto gmem_ptr = iterator_B.get();
+
+          cutlass::arch::cp_async<kSrcBytes, kCacheOpB>(
+              dst_ptr + v, gmem_ptr, iterator_B.valid());
+
+          ++iterator_B;
+        }
+        ++this->smem_iterator_B_;
+      }
+    }
+
+    iterator_E.set_iteration_index(group_start_E);
+    this->smem_iterator_E_.set_iteration_index(group_start_E);
+
+    // async copy for operand E: one copy per access; the predicate also requires is_warp_valid_
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupE; ++j) {
+      if (group_start_E + j < Detail::TBLoadIterationsE) {
+        typename IteratorE::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorE::AccessType *>(
+                this->smem_iterator_E_.get());
+
+        int const kSrcBytes = sizeof_bits<typename IteratorE::Element>::value *
+                              IteratorE::ThreadMap::kElementsPerAccess / 8;
+
+        auto gmem_ptr = iterator_E.get();
+
+        cutlass::arch::cp_async<kSrcBytes, kCacheOpE>(
+            dst_ptr, gmem_ptr, iterator_E.valid() && is_warp_valid_);
+
+        ++iterator_E;
+        ++this->smem_iterator_E_;
+      }
+    }
+  }
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate (prologue, pipelined mainloop, drain)
+  CUTLASS_DEVICE
+  void operator()(
+      ///< number of threadblock-level iterations along the GEMM K dimension
+      int gemm_k_iterations,
+      ///< destination accumulator tile
+      FragmentC &accum,
+      ///< iterator over A operand in global memory
+      IteratorA iterator_A,
+      ///< iterator over B operand in global memory
+      IteratorB iterator_B,
+      ///< iterator over E operand in global memory
+      IteratorE iterator_E,
+      ///< initial value of accumulator
+      FragmentC const &src_accum) {
+
+    //
+    // Prologue
+    //
+
+    // Issue kStages - 1 complete stages of cp.async copies for A, B and E
+    CUTLASS_PRAGMA_UNROLL
+    for (int stage = 0; stage < Base::kStages - 1;
+         ++stage, --gemm_k_iterations) {
+
+      iterator_A.clear_mask(gemm_k_iterations == 0);
+      iterator_B.clear_mask(gemm_k_iterations == 0);
+      iterator_E.clear_mask(gemm_k_iterations == 0);
+
+      iterator_A.set_iteration_index(0);
+      this->smem_iterator_A_.set_iteration_index(0);
+
+      // async copy for operand A
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::TBLoadIterationsA; ++j) {
+        typename IteratorA::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorA::AccessType *>(
+                this->smem_iterator_A_.get());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
+          int const kSrcBytes =
+              sizeof_bits<typename IteratorA::Element>::value *
+              IteratorA::ThreadMap::kElementsPerAccess /
+              IteratorA::kAccessesPerVector / 8;
+
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
+              dst_ptr + v, iterator_A.get(), iterator_A.valid());
+
+          ++iterator_A;
+        }
+
+        ++this->smem_iterator_A_;
+      }
+
+      iterator_B.set_iteration_index(0);
+      this->smem_iterator_B_.set_iteration_index(0);
+
+      // async copy for operand B
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::TBLoadIterationsB; ++j) {
+        typename IteratorB::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorB::AccessType *>(
+                this->smem_iterator_B_.get());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
+          int const kSrcBytes =
+              sizeof_bits<typename IteratorB::Element>::value *
+              IteratorB::ThreadMap::kElementsPerAccess /
+              IteratorB::kAccessesPerVector / 8;
+
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
+              dst_ptr + v, iterator_B.get(), iterator_B.valid());
+
+          ++iterator_B;
+        }
+
+        ++this->smem_iterator_B_;
+      }
+
+      iterator_E.set_iteration_index(0);
+      this->smem_iterator_E_.set_iteration_index(0);
+
+      // async copy for operand E
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::TBLoadIterationsE; ++j) {
+        typename IteratorE::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorE::AccessType *>(
+                this->smem_iterator_E_.get());
+
+        int const kSrcBytes = sizeof_bits<typename IteratorE::Element>::value *
+                              IteratorE::ThreadMap::kElementsPerAccess / 8;
+        if (is_warp_valid_)  // the E copy is only issued when is_warp_valid_ is set
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpE>(
+              dst_ptr, iterator_E.get(), iterator_E.valid());
+
+        ++iterator_E;
+
+        ++this->smem_iterator_E_;
+      }
+
+      // Move to the next stage
+      iterator_A.add_tile_offset({0, 1});
+      iterator_B.add_tile_offset({1, 0});
+      iterator_E.add_tile_offset({0, 1});
+
+      this->smem_iterator_A_.add_tile_offset({0, 1});
+      this->smem_iterator_B_.add_tile_offset({1, 0});
+      this->smem_iterator_E_.add_tile_offset({0, 1});
+
+      // cp.async.commit_group - completes a stage
+      cutlass::arch::cp_async_fence();
+    }
+
+    // Seed the destination accumulator with the caller-supplied initial value
+    accum = src_accum;
+
+    cutlass::arch::cp_async_wait<Base::kStages - 2>();
+    __syncthreads();
+
+    // Fragments used to overlap shared memory loads and math instructions
+    // (A and E use two buffers each; B uses Detail::kBBufferSize buffers)
+    WarpLoadedFragmentA warp_loaded_frag_A[2];
+    WarpLoadedFragmentB warp_loaded_frag_B[Detail::kBBufferSize];
+    WarpTransformedFragmentA warp_transformed_frag_A[2];
+    WarpTransformedFragmentB warp_transformed_frag_B[Detail::kBBufferSize];
+    WarpFragmentE warp_frag_E[2];
+
+    Operator warp_mma;
+
+    this->warp_tile_iterator_A_.set_kgroup_index(0);
+    this->warp_tile_iterator_B_.set_kgroup_index(0);
+    this->warp_tile_iterator_E_.set_kgroup_index(0);
+
+    this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]);
+    this->warp_tile_iterator_B_.load(warp_loaded_frag_B[0]);
+    this->warp_tile_iterator_E_.load(warp_frag_E[0]);
+
+    ++this->warp_tile_iterator_A_;
+    ++this->warp_tile_iterator_B_;
+    ++this->warp_tile_iterator_E_;
+
+    iterator_A.clear_mask(gemm_k_iterations == 0);
+    iterator_B.clear_mask(gemm_k_iterations == 0);
+    iterator_E.clear_mask(gemm_k_iterations == 0);
+
+    int smem_write_stage_idx = Base::kStages - 1;
+    int smem_read_stage_idx = 0;
+
+    warp_mma.transform(warp_transformed_frag_A[0], warp_transformed_frag_B[0],
+                       warp_loaded_frag_A[0], warp_loaded_frag_B[0]);
+
+    //
+    // Mainloop
+    //
+
+    CUTLASS_GEMM_LOOP
+    for (; gemm_k_iterations > (-Base::kStages + 1);) {
+      //
+      // Loop over GEMM K dimension; the counter was already decremented kStages - 1 times in the prologue
+      //
+
+      // Computes a warp-level GEMM on data held in shared memory
+      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations;
+           ++warp_mma_k) {
+
+        // Load warp-level tiles from shared memory, wrapping to k offset if
+        // this is the last group as the case may be.
+
+        this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
+        this->warp_tile_iterator_E_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
+        
+        this->warp_tile_iterator_A_.load(warp_loaded_frag_A[(warp_mma_k + 1) % 2]);
+        this->warp_tile_iterator_E_.load(warp_frag_E[(warp_mma_k + 1) % 2]);
+
+        ++this->warp_tile_iterator_A_;
+        ++this->warp_tile_iterator_E_;
+
+       if (Detail::kBBufferSize == 2) {
+          this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
+          this->warp_tile_iterator_B_.load(
+              warp_loaded_frag_B[(warp_mma_k + 1) % Detail::kBBufferSize]);
+          ++this->warp_tile_iterator_B_;
+        }
+
+        if (warp_mma_k > 0)
+          warp_mma.transform(warp_transformed_frag_A[warp_mma_k % 2],
+                             warp_transformed_frag_B[warp_mma_k % Detail::kBBufferSize],
+                             warp_loaded_frag_A[warp_mma_k % 2],
+                             warp_loaded_frag_B[warp_mma_k % Detail::kBBufferSize]);
+
+        warp_mma(
+          accum,
+          warp_transformed_frag_A[warp_mma_k % 2],
+          warp_transformed_frag_B[warp_mma_k % Detail::kBBufferSize], accum,
+          warp_frag_E[warp_mma_k % 2]
+        );
+
+        if (Detail::kBBufferSize == 1) {
+          this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
+          this->warp_tile_iterator_B_.load(warp_loaded_frag_B[0]);
+          ++this->warp_tile_iterator_B_;
+  
+        }
+
+        // Issue global->shared copies for this stage
+        if (warp_mma_k < Base::kWarpGemmIterations - 1) {
+          int group_start_iteration_A, group_start_iteration_B, group_start_iteration_E;
+
+          group_start_iteration_A = warp_mma_k * Detail::kAccessesPerGroupA;
+          group_start_iteration_B = warp_mma_k * Detail::kAccessesPerGroupB;
+          group_start_iteration_E = warp_mma_k * Detail::kAccessesPerGroupE;
+
+          copy_tiles_and_advance(
+              iterator_A, iterator_B, iterator_E, group_start_iteration_A,
+              group_start_iteration_B, group_start_iteration_E);
+        }
+
+        if (warp_mma_k + 2 == Base::kWarpGemmIterations) {
+          int group_start_iteration_A, group_start_iteration_B, group_start_iteration_E;
+          group_start_iteration_A =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupA;
+          group_start_iteration_B =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupB;
+          group_start_iteration_E =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupE;
+
+          copy_tiles_and_advance(
+              iterator_A, iterator_B, iterator_E, group_start_iteration_A,
+              group_start_iteration_B, group_start_iteration_E);
+
+          // Inserts a memory fence between stages of cp.async instructions.
+          cutlass::arch::cp_async_fence();
+
+          // Waits until at most kStages-2 committed cp.async groups are still pending.
+          arch::cp_async_wait<Base::kStages - 2>();
+          __syncthreads();
+
+          // Move to the next stage
+          iterator_A.add_tile_offset({0, 1});
+          iterator_B.add_tile_offset({1, 0});
+          iterator_E.add_tile_offset({0, 1});
+
+          this->smem_iterator_A_.add_tile_offset({0, 1});
+          this->smem_iterator_B_.add_tile_offset({1, 0});
+          this->smem_iterator_E_.add_tile_offset({0, 1});
+
+          // Add negative offsets to return iterators to the 'start' of the
+          // circular buffer in shared memory
+          if (smem_write_stage_idx == (Base::kStages - 1)) {
+            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
+            this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
+            this->smem_iterator_E_.add_tile_offset({0, -Base::kStages});
+            smem_write_stage_idx = 0;
+          } else {
+            ++smem_write_stage_idx;
+          }
+
+          if (smem_read_stage_idx == (Base::kStages - 1)) {
+            this->warp_tile_iterator_A_.add_tile_offset(
+                {0, -Base::kStages * Policy::kPartitionsK *
+                        Base::kWarpGemmIterations});
+            this->warp_tile_iterator_B_.add_tile_offset(
+                {-Base::kStages * Policy::kPartitionsK *
+                     Base::kWarpGemmIterations,
+                 0});
+            this->warp_tile_iterator_E_.add_tile_offset(
+                {0, -Base::kStages * Policy::kPartitionsK *
+                        Base::kWarpGemmIterations});
+            smem_read_stage_idx = 0;
+          } else {
+            ++smem_read_stage_idx;
+          }
+
+          --gemm_k_iterations;
+          iterator_A.clear_mask(gemm_k_iterations == 0);
+          iterator_B.clear_mask(gemm_k_iterations == 0);
+          iterator_E.clear_mask(gemm_k_iterations == 0);
+        }
+
+        // Do any conversions feeding the first stage at the end of the loop so
+        // we can start right away on mma instructions
+        if (warp_mma_k + 1 == Base::kWarpGemmIterations)
+          warp_mma.transform(warp_transformed_frag_A[(warp_mma_k + 1) % 2],
+                             warp_transformed_frag_B[(warp_mma_k + 1) % Detail::kBBufferSize],
+                             warp_loaded_frag_A[(warp_mma_k + 1) % 2],
+                             warp_loaded_frag_B[(warp_mma_k + 1) % Detail::kBBufferSize]);
+      }
+
+    }
+
+
+    // Commit and drain all pending and predicated cp.async pnz from the GEMM mainloop
+    cutlass::arch::cp_async_fence();
+    cutlass::arch::cp_async_wait<0>();
+    __syncthreads();
+
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_with_reduction_multistage.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_with_reduction_multistage.h
new file mode 100755
index 000000000..fa95dd7d2
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_with_reduction_multistage.h
@@ -0,0 +1,545 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a multistage threadblock-scoped GEMM kernel that also carries a reduction fragment.
+*/
+
+#pragma once
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+
+#include "cutlass/gemm/threadblock/mma_base.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Threadblock-scoped multistage (cp.async) matrix multiply-accumulate that also
+/// passes a reduction fragment to the warp-level operator on every MMA.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    /// Iterates over tiles of A operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorA_,
+    /// Iterates over tiles of A operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorA_,
+    /// Cache operation for operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB_,
+    /// Cache operation for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB,
+    /// Data type of accumulator matrix
+    typename ElementC_,
+    /// Layout of accumulator matrix
+    typename LayoutC_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// Number of stages,
+    int Stages,
+    /// Use zfill or predicate for out-of-bound cp.async
+    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone,
+    /// Used for partial specialization
+    typename Enable = bool>
+class MmaWithReductionMultistage : 
+  public MmaBase<Shape_, Policy_, Stages> {
+public:
+  ///< Base class
+  using Base = MmaBase<Shape_, Policy_, Stages>;
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+  ///< Iterates over tiles of A operand in global memory
+  using IteratorA = IteratorA_;
+  ///< Iterates over tiles of B operand in global memory
+  using IteratorB = IteratorB_;
+  ///< Data type of accumulator matrix
+  using ElementC = ElementC_;
+  ///< Layout of accumulator matrix
+  using LayoutC = LayoutC_;
+  ///< Policy describing tuning details
+  using Policy = Policy_;
+
+  using SmemIteratorA = SmemIteratorA_;
+  using SmemIteratorB = SmemIteratorB_;
+
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
+
+  //
+  // Dependent types
+  //
+
+  /// Fragment of accumulator tile
+  using FragmentC = typename Policy::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+  /// Fragment type of the reduction accumulator taken by the warp-level Mma
+  using FragmentReduction = typename Operator::FragmentReduction;
+
+  /// Minimum architecture is Sm80 to support cp.async
+  using ArchTag = arch::Sm80;
+  
+  /// Complex transform on A operand
+  static ComplexTransform const kTransformA = Operator::kTransformA;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB = Operator::kTransformB;
+  /// Re-exported from the warp-level Mma; presumably selects A (vs. B) as the reduced operand - TODO confirm
+  static int const kReduceKForA = Operator::kReduceKForA;
+
+  /// Internal structure exposed for introspection.
+  struct Detail {
+
+    /// Number of cp.async instructions to load one stage of operand A
+    static int const AsyncCopyIterationsPerStageA =
+        IteratorA::ThreadMap::Iterations::kCount;
+
+    /// Number of cp.async instructions to load one stage of operand B
+    static int const AsyncCopyIterationsPerStageB =
+        IteratorB::ThreadMap::Iterations::kCount;
+
+    /// Number of stages
+    static int const kStages = Stages;
+
+    /// Number of cp.async instructions to load one group of operand A
+    static int const kAccessesPerGroupA =
+        (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
+
+    /// Number of cp.async instructions to load one group of operand B
+    static int const kAccessesPerGroupB =
+        (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
+  };
+
+ private:
+
+  using WarpLoadedFragmentA = typename Operator::FragmentA;
+  using WarpLoadedFragmentB = typename Operator::FragmentB;
+  using WarpTransformedFragmentA = typename Operator::TransformedFragmentA;
+  using WarpTransformedFragmentB = typename Operator::TransformedFragmentB;
+
+ private:
+
+  //
+  // Data members
+  //
+
+  /// Iterator to write threadblock-scoped tile of A operand to shared memory
+  SmemIteratorA smem_iterator_A_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB smem_iterator_B_;
+
+public:
+
+  /// Construct from tensor references
+  CUTLASS_DEVICE
+  MmaWithReductionMultistage(
+      ///< Shared storage needed for internal use by threadblock-scoped GEMM
+      typename Base::SharedStorage &shared_storage,
+      ///< ID within the threadblock
+      int thread_idx,
+      ///< ID of warp
+      int warp_idx,
+      ///< ID of each thread within a warp
+      int lane_idx
+    ):
+      Base(shared_storage, thread_idx, warp_idx, lane_idx),
+      smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
+      smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx)
+  {
+    // Compute warp location within threadblock tile by mapping the warp_id to
+    // three coordinates:
+    //   _m: the warp's position within the threadblock along the M dimension
+    //   _n: the warp's position within the threadblock along the N dimension
+    //   _k: the warp's position within the threadblock along the K dimension
+
+    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
+    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
+
+    int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
+    int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
+
+    // Add per-warp offsets in units of warp-level tiles
+    this->warp_tile_iterator_A_.add_tile_offset(
+        {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
+    this->warp_tile_iterator_B_.add_tile_offset(
+        {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
+  }
+
+  /// Issue one group of cp.async copies of A and B from global to shared memory,
+  /// starting at access indices group_start_A / group_start_B of the current stage.
+  CUTLASS_DEVICE
+  void copy_tiles_and_advance(IteratorA &iterator_A, IteratorB &iterator_B,
+                              int group_start_A = 0, int group_start_B = 0) {
+    iterator_A.set_iteration_index(group_start_A *
+                                   IteratorA::kAccessesPerVector);
+    this->smem_iterator_A_.set_iteration_index(group_start_A);
+
+    // Async Copy for operand A
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) {
+      if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) {
+        typename IteratorA::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorA::AccessType *>(
+                this->smem_iterator_A_.get());
+
+        int const kSrcBytes = sizeof_bits<typename IteratorA::Element>::value *
+                              IteratorA::ThreadMap::kElementsPerAccess /
+                              IteratorA::kAccessesPerVector / 8;
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
+          auto gmem_ptr = iterator_A.get();
+
+          if (SharedMemoryClear == SharedMemoryClearOption::kZfill) {
+            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
+                dst_ptr + v, gmem_ptr, iterator_A.valid());
+          } else {
+            cutlass::arch::cp_async<kSrcBytes, kCacheOpA>(
+                dst_ptr + v, gmem_ptr, iterator_A.valid());
+          }
+
+          ++iterator_A;
+        }
+
+        ++this->smem_iterator_A_;
+      }
+    }
+
+    iterator_B.set_iteration_index(group_start_B *
+                                   IteratorB::kAccessesPerVector);
+    this->smem_iterator_B_.set_iteration_index(group_start_B);
+
+    // Async Copy for operand B
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) {
+      if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) {
+        typename IteratorB::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorB::AccessType *>(
+                this->smem_iterator_B_.get());
+
+        int const kSrcBytes = sizeof_bits<typename IteratorB::Element>::value *
+                              IteratorB::ThreadMap::kElementsPerAccess /
+                              IteratorB::kAccessesPerVector / 8;
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
+          auto gmem_ptr = iterator_B.get();
+
+          if (SharedMemoryClear == SharedMemoryClearOption::kZfill) {
+            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
+                dst_ptr + v, gmem_ptr, iterator_B.valid());
+          } else {
+            cutlass::arch::cp_async<kSrcBytes, kCacheOpB>(
+                dst_ptr + v, gmem_ptr, iterator_B.valid());
+          }
+
+          ++iterator_B;
+        }
+        ++this->smem_iterator_B_;
+      }
+    }
+  }
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate
+  CUTLASS_DEVICE
+  void operator()(
+      ///< number of threadblock-level iterations along the GEMM K dimension
+      int gemm_k_iterations,
+      ///< destination accumulator tile
+      FragmentC &accum,
+      ///< iterator over A operand in global memory
+      IteratorA iterator_A,
+      ///< iterator over B operand in global memory
+      IteratorB iterator_B,
+      ///< initial value of accumulator
+      FragmentC const &src_accum,
+      ///< reduction fragment handed to the warp-level Mma on every call
+      FragmentReduction &gemm_k_reduction_accum) {
+    //
+    // Prologue
+    //
+    // Issue kStages - 1 complete stages
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int stage = 0; stage < Base::kStages - 1;
+         ++stage, --gemm_k_iterations) {
+
+      iterator_A.clear_mask(gemm_k_iterations == 0);
+      iterator_B.clear_mask(gemm_k_iterations == 0);
+
+      iterator_A.set_iteration_index(0);
+      this->smem_iterator_A_.set_iteration_index(0);
+
+      // Async Copy for operand A
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) {
+        typename IteratorA::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorA::AccessType *>(
+                this->smem_iterator_A_.get());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
+          int const kSrcBytes =
+              sizeof_bits<typename IteratorA::Element>::value *
+              IteratorA::ThreadMap::kElementsPerAccess /
+              IteratorA::kAccessesPerVector / 8;
+
+          // cp_async_zfill writes zeros to shared memory when the guard is
+          // false, so no separate source-size computation is needed here.
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
+              dst_ptr + v, iterator_A.get(), iterator_A.valid());
+
+          ++iterator_A;
+        }
+
+        ++this->smem_iterator_A_;
+      }
+
+      iterator_B.set_iteration_index(0);
+      this->smem_iterator_B_.set_iteration_index(0);
+
+      // Async Copy for operand B
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) {
+        typename IteratorB::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorB::AccessType *>(
+                this->smem_iterator_B_.get());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
+          int const kSrcBytes =
+              sizeof_bits<typename IteratorB::Element>::value *
+              IteratorB::ThreadMap::kElementsPerAccess /
+              IteratorB::kAccessesPerVector / 8;
+
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
+              dst_ptr + v, iterator_B.get(), iterator_B.valid());
+
+          ++iterator_B;
+        }
+
+        ++this->smem_iterator_B_;
+      }
+
+      // Move to the next stage
+      iterator_A.add_tile_offset({0, 1});
+      iterator_B.add_tile_offset({1, 0});
+
+      this->smem_iterator_A_.add_tile_offset({0, 1});
+      this->smem_iterator_B_.add_tile_offset({1, 0});
+
+      // Defines the boundary of a stage of cp.async.
+      cutlass::arch::cp_async_fence();
+    }
+
+    // Seed the destination accumulator with the caller-supplied initial value
+    accum = src_accum;
+
+    // Waits until at most kStages-2 committed cp.async groups are still pending.
+    cutlass::arch::cp_async_wait<Base::kStages - 2>();
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math
+    // instructions
+    WarpLoadedFragmentA warp_loaded_frag_A[2];
+    WarpLoadedFragmentB warp_loaded_frag_B[2];
+    WarpTransformedFragmentA warp_transformed_frag_A[2];
+    WarpTransformedFragmentB warp_transformed_frag_B[2];
+
+    Operator warp_mma;
+
+    this->warp_tile_iterator_A_.set_kgroup_index(0);
+    this->warp_tile_iterator_B_.set_kgroup_index(0);
+
+    this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]);
+    this->warp_tile_iterator_B_.load(warp_loaded_frag_B[0]);
+
+    ++this->warp_tile_iterator_A_;
+    ++this->warp_tile_iterator_B_;
+
+    iterator_A.clear_mask(gemm_k_iterations == 0);
+    iterator_B.clear_mask(gemm_k_iterations == 0);
+
+    int smem_write_stage_idx = Base::kStages - 1;
+    int smem_read_stage_idx = 0;
+
+    warp_mma.transform(warp_transformed_frag_A[0], warp_transformed_frag_B[0],
+                       warp_loaded_frag_A[0], warp_loaded_frag_B[0]);
+
+    //
+    // Mainloop
+    //
+
+    CUTLASS_GEMM_LOOP
+    for (; gemm_k_iterations > (-Base::kStages + 1);) {
+      //
+      // Loop over GEMM K dimension; the counter was already decremented kStages - 1 times in the prologue
+      //
+
+      // Computes a warp-level GEMM on data held in shared memory
+      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations;
+           ++warp_mma_k) {
+
+        // Load warp-level tiles from shared memory, wrapping to k offset if
+        // this is the last group as the case may be.
+
+        this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
+        this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
+
+        this->warp_tile_iterator_A_.load(warp_loaded_frag_A[(warp_mma_k + 1) % 2]);
+        this->warp_tile_iterator_B_.load(warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
+
+        ++this->warp_tile_iterator_A_;
+        ++this->warp_tile_iterator_B_;
+
+        if (warp_mma_k > 0)
+          warp_mma.transform(warp_transformed_frag_A[warp_mma_k % 2],
+                             warp_transformed_frag_B[warp_mma_k % 2],
+                             warp_loaded_frag_A[warp_mma_k % 2],
+                             warp_loaded_frag_B[warp_mma_k % 2]);
+
+        warp_mma(
+          accum,
+          warp_transformed_frag_A[warp_mma_k % 2],
+          warp_transformed_frag_B[warp_mma_k % 2],
+          accum,
+          gemm_k_reduction_accum
+        );
+
+        // Issue global->shared copies for this stage
+        if (warp_mma_k < Base::kWarpGemmIterations - 1) {
+          int group_start_iteration_A, group_start_iteration_B;
+
+          group_start_iteration_A = warp_mma_k * Detail::kAccessesPerGroupA;
+          group_start_iteration_B = warp_mma_k * Detail::kAccessesPerGroupB;
+
+          copy_tiles_and_advance(iterator_A, iterator_B, group_start_iteration_A, 
+                               group_start_iteration_B);
+        }
+
+        if (warp_mma_k + 2 == Base::kWarpGemmIterations) {
+          int group_start_iteration_A, group_start_iteration_B;
+          group_start_iteration_A =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupA;
+          group_start_iteration_B =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupB;
+
+          copy_tiles_and_advance(iterator_A, iterator_B, group_start_iteration_A, 
+                               group_start_iteration_B);
+
+          // Inserts a memory fence between stages of cp.async instructions.
+          cutlass::arch::cp_async_fence();
+
+          // Waits until at most kStages-2 committed cp.async groups are still pending.
+          cutlass::arch::cp_async_wait<Base::kStages - 2>();
+          __syncthreads();
+
+          // Move to the next stage
+          iterator_A.add_tile_offset({0, 1});
+          iterator_B.add_tile_offset({1, 0});
+
+          this->smem_iterator_A_.add_tile_offset({0, 1});
+          this->smem_iterator_B_.add_tile_offset({1, 0});
+
+          // Add negative offsets to return iterators to the 'start' of the
+          // circular buffer in shared memory
+          if (smem_write_stage_idx == (Base::kStages - 1)) {
+            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
+            this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
+            smem_write_stage_idx = 0;
+          } else {
+            ++smem_write_stage_idx;
+          }
+
+          if (smem_read_stage_idx == (Base::kStages - 1)) {
+            this->warp_tile_iterator_A_.add_tile_offset(
+                {0, -Base::kStages * Policy::kPartitionsK *
+                        Base::kWarpGemmIterations});
+            this->warp_tile_iterator_B_.add_tile_offset(
+                {-Base::kStages * Policy::kPartitionsK *
+                     Base::kWarpGemmIterations,
+                 0});
+            smem_read_stage_idx = 0;
+          } else {
+            ++smem_read_stage_idx;
+          }
+
+          --gemm_k_iterations;
+          iterator_A.clear_mask(gemm_k_iterations == 0);
+          iterator_B.clear_mask(gemm_k_iterations == 0);
+        }
+
+        // Do any conversions feeding the first stage at the end of the loop so
+        // we can start right away on mma instructions
+        if (warp_mma_k + 1 == Base::kWarpGemmIterations)
+          warp_mma.transform(warp_transformed_frag_A[(warp_mma_k + 1) % 2],
+                             warp_transformed_frag_B[(warp_mma_k + 1) % 2],
+                             warp_loaded_frag_A[(warp_mma_k + 1) % 2],
+                             warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
+      }
+
+    }
+
+    // commit and drain all pending and predicated cp.async pnz from the GEMM mainloop
+    cutlass::arch::cp_async_fence();
+    cutlass::arch::cp_async_wait<0>();
+    __syncthreads();
+
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/threadblock_swizzle.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/threadblock_swizzle.h
new file mode 100755
index 000000000..1a4948d07
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/threadblock_swizzle.h
@@ -0,0 +1,459 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Implements several possible threadblock-swizzling functions mapping blockIdx to 
+      GEMM problems.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/platform/platform.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+#include "cutlass/conv/conv3d_problem_size.h"
+#include "cutlass/gemm/threadblock/index_remat.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle_streamk.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// GEMM threadblock swizzle; N caps the swizzle width picked by get_log_tile()
+template <int N = 1>
+struct GemmIdentityThreadblockSwizzle {
+
+  CUTLASS_HOST_DEVICE
+  GemmIdentityThreadblockSwizzle() { }
+
+  /// Tile counts for a *Gemm* problem size gemm(M, N, K): M and N are
+  /// ceil-divided by the tile size; the K slot carries split_k_slices as given
+  CUTLASS_HOST_DEVICE
+  static GemmCoord get_tiled_shape(
+    GemmCoord problem_size,
+    GemmCoord tile_size,
+    int split_k_slices) {
+
+    int const tiles_m = (problem_size.m() + tile_size.m() - 1) / tile_size.m();
+    int const tiles_n = (problem_size.n() + tile_size.n() - 1) / tile_size.n();
+
+    return GemmCoord(tiles_m, tiles_n, split_k_slices);
+  }
+
+  /// Tile counts for an *ImplicitGemm* Conv2d problem:
+  /// conv_operator(NPQK, NHWC, KRSC)
+  CUTLASS_HOST_DEVICE
+  static GemmCoord get_tiled_shape(
+    cutlass::conv::Operator conv_operator,
+    cutlass::conv::Conv2dProblemSize const &problem_size,
+    GemmCoord tile_size,
+    int split_k_slices) {
+
+    // Map the convolution onto GEMM extents, then reuse the GEMM overload.
+    gemm::GemmCoord const gemm_extent =
+      cutlass::conv::implicit_gemm_problem_size(conv_operator, problem_size);
+
+    return get_tiled_shape(gemm_extent, tile_size, split_k_slices);
+  }
+
+  /// Tile counts for an *ImplicitGemm* Conv3d problem:
+  /// conv_operator(NZPQK, NDHWC, KTRSC)
+  CUTLASS_HOST_DEVICE
+  static GemmCoord get_tiled_shape(
+    cutlass::conv::Operator conv_operator,
+    cutlass::conv::Conv3dProblemSize const &problem_size,
+    GemmCoord tile_size,
+    int split_k_slices) {
+
+    // Map the convolution onto GEMM extents, then reuse the GEMM overload.
+    gemm::GemmCoord const gemm_extent =
+      cutlass::conv::implicit_gemm_problem_size(conv_operator, problem_size);
+
+    return get_tiled_shape(gemm_extent, tile_size, split_k_slices);
+  }
+
+  /// CUDA grid for a tiled shape: x = m * width, y = ceil(n / width), z = k
+  CUTLASS_HOST_DEVICE
+  static dim3 get_grid_shape(GemmCoord tiled_shape) {
+    int const width = 1 << get_log_tile(tiled_shape);
+    return dim3(tiled_shape.m() * width, (tiled_shape.n() + width - 1) / width, tiled_shape.k());
+  }
+
+  /// Picks log2 of the swizzle width from the template bound N and the
+  /// number of tile columns in tiled_shape
+  CUTLASS_HOST_DEVICE
+  static int get_log_tile(GemmCoord tiled_shape) {
+    auto const tile_cols = tiled_shape.n();
+
+    // Thresholds picked so that it doesn't cause too many no-op CTAs
+    if (N >= 8 && tile_cols >= 6) { return 3; }
+    if (N >= 4 && tile_cols >= 3) { return 2; }
+    if (N >= 2 && tile_cols >= 2) { return 1; }
+
+    // None of the thresholds met: no swizzling.
+    return 0;
+  }
+
+  /// Tile coordinate of the calling threadblock for a given log2 swizzle width
+  CUTLASS_DEVICE
+  static GemmCoord get_tile_offset(int log_tile) {
+    int const bx = RematerializeBlockIdxX();
+    int const by = RematerializeBlockIdxY();
+    int const bz = RematerializeBlockIdxZ();
+    int const low_mask = (1 << log_tile) - 1;  // keeps the low log_tile bits of bx
+    int const tile_m = bx >> log_tile;
+    int const tile_n = (by << log_tile) + (bx & low_mask);
+    return GemmCoord{tile_m, tile_n, bz};
+  }
+
+  /// Tile coordinate of the calling threadblock using a fixed swizzle width of N
+  CUTLASS_DEVICE
+  static GemmCoord get_tile_offset(GemmCoord tiled_shape) {
+
+    int const width = N;
+    int const bx = RematerializeBlockIdxX();
+    int const by = RematerializeBlockIdxY();
+
+    // Shapes with fewer than N tiles in either dimension use the block indices as-is.
+    bool const too_small = (tiled_shape.m() < width) || (tiled_shape.n() < width);
+    if (too_small) {
+      return GemmCoord{bx, by, RematerializeBlockIdxZ()};
+    }
+
+    // Otherwise `width` consecutive x-blocks share one M tile and cover adjacent N tiles.
+    return GemmCoord{bx / width, (by * width) + (bx % width), RematerializeBlockIdxZ()};
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// GEMM threadblock swizzle with a transposed grid (x spans n, y spans m)
+struct GemmHorizontalThreadblockSwizzle {
+
+  CUTLASS_HOST_DEVICE
+  GemmHorizontalThreadblockSwizzle() { }
+
+  /// Tile counts along M and N (ceil-divided); the K slot carries split_k_slices
+  CUTLASS_HOST_DEVICE
+  static GemmCoord get_tiled_shape(
+    GemmCoord problem_size,
+    GemmCoord tile_size,
+    int split_k_slices) {
+
+    int const tiles_m = (problem_size.m() + tile_size.m() - 1) / tile_size.m();
+    int const tiles_n = (problem_size.n() + tile_size.n() - 1) / tile_size.n();
+
+    return GemmCoord(tiles_m, tiles_n, split_k_slices);
+  }
+
+  /// CUDA grid for a tiled shape: x = n, y = m, z = k
+  CUTLASS_HOST_DEVICE
+  static dim3 get_grid_shape(GemmCoord tiled_shape) {
+    return dim3(tiled_shape.n(), tiled_shape.m(), tiled_shape.k());
+  }
+
+  /// Log2 swizzle width; always 0 for this swizzle
+  CUTLASS_HOST_DEVICE
+  static int get_log_tile(GemmCoord tiled_shape) {
+    return 0;
+  }
+
+  /// Tile coordinate built from the rematerialized block indices in (y, x, z) order
+  CUTLASS_DEVICE
+  static GemmCoord get_tile_offset(GemmCoord tiled_shape) {
+    int const tile_m = RematerializeBlockIdxY();
+    int const tile_n = RematerializeBlockIdxX();
+    int const tile_k = RematerializeBlockIdxZ();
+
+    return GemmCoord{tile_m, tile_n, tile_k};
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Threadblock swizzle for batched GEMMs; the batch travels in the k / z slot
+struct GemmBatchedIdentityThreadblockSwizzle {
+
+  /// Tile counts along M and N (ceil-divided); the K slot carries batch_count modulo 65536
+  CUTLASS_HOST_DEVICE
+  static GemmCoord get_tiled_shape(
+    GemmCoord problem_size,
+    GemmCoord tile_size,
+    int batch_count) {
+
+    int const tiles_m = (problem_size.m() + tile_size.m() - 1) / tile_size.m();
+    int const tiles_n = (problem_size.n() + tile_size.n() - 1) / tile_size.n();
+    // NOTE(review): an exact multiple of 65536 wraps to 0 here; confirm callers handle it.
+    return GemmCoord(tiles_m, tiles_n, batch_count % (1 << 16));
+  }
+
+  /// CUDA grid for a tiled shape: x = m, y = n, z = k
+  CUTLASS_HOST_DEVICE
+  static dim3 get_grid_shape(GemmCoord tiled_shape) {
+    return dim3(tiled_shape.m(), tiled_shape.n(), tiled_shape.k());
+  }
+
+  /// Log2 swizzle width; always 0 for this swizzle
+  CUTLASS_HOST_DEVICE
+  static int get_log_tile(GemmCoord tiled_shape) {
+    return 0;
+  }
+
+  /// Tile coordinate built from the rematerialized block indices in (x, y, z) order
+  CUTLASS_DEVICE
+  static GemmCoord get_tile_offset(GemmCoord tiled_shape) {
+    int const tile_m = RematerializeBlockIdxX();
+    int const tile_n = RematerializeBlockIdxY();
+    int const tile_k = RematerializeBlockIdxZ();
+
+    return GemmCoord{tile_m, tile_n, tile_k};
+  }
+
+  /// Tile coordinate of the calling threadblock for a given log2 swizzle width
+  CUTLASS_DEVICE
+  static GemmCoord get_tile_offset(int log_tile) {
+    int const bx = RematerializeBlockIdxX();
+    int const by = RematerializeBlockIdxY();
+    int const bz = RematerializeBlockIdxZ();
+    int const low_mask = (1 << log_tile) - 1;  // keeps the low log_tile bits of bx
+    int const tile_m = bx >> log_tile;
+    int const tile_n = (by << log_tile) + (bx & low_mask);
+    return GemmCoord{tile_m, tile_n, bz};
+  }
+
+  /// Batch index of the calling threadblock (the rematerialized z block index)
+  CUTLASS_DEVICE
+  static int get_batch_idx() {
+    return RematerializeBlockIdxZ();
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Split-K GEMM threadblock swizzle; N caps the swizzle width picked by get_log_tile()
+template <int N = 1>
+struct GemmSplitKIdentityThreadblockSwizzle {
+  // Per-object copy of N; the static member functions below do not read it.
+  int const kTile = N;
+
+  /// Tile counts along M and N (ceil-divided); the K slot carries partitions
+  CUTLASS_HOST_DEVICE
+  static GemmCoord get_tiled_shape(
+    GemmCoord problem_size,
+    GemmCoord tile_size,
+    int partitions) {
+
+    int const tiles_m = (problem_size.m() + tile_size.m() - 1) / tile_size.m();
+    int const tiles_n = (problem_size.n() + tile_size.n() - 1) / tile_size.n();
+
+    return GemmCoord(tiles_m, tiles_n, partitions);
+  }
+
+  /// Picks log2 of the swizzle width from the template bound N and the
+  /// number of tile columns in tiled_shape
+  CUTLASS_HOST_DEVICE
+  static int get_log_tile(GemmCoord tiled_shape) {
+    auto const tile_cols = tiled_shape.n();
+
+    // Thresholds picked so that it doesn't cause too many no-op CTAs
+    if (N >= 8 && tile_cols >= 6) { return 3; }
+    if (N >= 4 && tile_cols >= 3) { return 2; }
+    if (N >= 2 && tile_cols >= 2) { return 1; }
+
+    // None of the thresholds met: no swizzling.
+    return 0;
+  }
+
+  /// CUDA grid for a tiled shape: x = m * width, y = ceil(n / width), z = k
+  CUTLASS_HOST_DEVICE
+  static dim3 get_grid_shape(GemmCoord tiled_shape) {
+    int const width = 1 << get_log_tile(tiled_shape);
+    return dim3(tiled_shape.m() * width, (tiled_shape.n() + width - 1) / width, tiled_shape.k());
+  }
+
+  /// Tile coordinate of the calling threadblock for a given log2 swizzle width
+  CUTLASS_DEVICE
+  static GemmCoord get_tile_offset(int log_tile) {
+    int const bx = RematerializeBlockIdxX();
+    int const by = RematerializeBlockIdxY();
+    int const bz = RematerializeBlockIdxZ();
+    int const low_mask = (1 << log_tile) - 1;  // keeps the low log_tile bits of bx
+    int const tile_m = bx >> log_tile;
+    int const tile_n = (by << log_tile) + (bx & low_mask);
+    return GemmCoord{tile_m, tile_n, bz};
+  }
+
+  /// Tile coordinate of the calling threadblock using a fixed swizzle width of N
+  CUTLASS_DEVICE
+  static GemmCoord get_tile_offset(GemmCoord tiled_shape) {
+
+    int const width = N;
+    int const bx = RematerializeBlockIdxX();
+    int const by = RematerializeBlockIdxY();
+
+    // Shapes with fewer than N tiles in either dimension use the block indices as-is.
+    bool const too_small = (tiled_shape.m() < width) || (tiled_shape.n() < width);
+    if (too_small) {
+      return GemmCoord{bx, by, RematerializeBlockIdxZ()};
+    }
+
+    // Otherwise `width` consecutive x-blocks share one M tile and cover adjacent N tiles.
+    return GemmCoord{bx / width, (by * width) + (bx % width), RematerializeBlockIdxZ()};
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Split-K GEMM threadblock swizzle with a transposed grid (x spans n, y spans m)
+struct GemmSplitKHorizontalThreadblockSwizzle {
+
+  /// Tile counts along M and N (ceil-divided); the K slot carries partitions
+  CUTLASS_HOST_DEVICE
+  static GemmCoord get_tiled_shape(
+    GemmCoord problem_size,
+    GemmCoord tile_size,
+    int partitions) {
+
+    int const tiles_m = (problem_size.m() + tile_size.m() - 1) / tile_size.m();
+    int const tiles_n = (problem_size.n() + tile_size.n() - 1) / tile_size.n();
+
+    return GemmCoord(tiles_m, tiles_n, partitions);
+  }
+
+  /// CUDA grid for a tiled shape: x = n, y = m, z = k
+  CUTLASS_HOST_DEVICE
+  static dim3 get_grid_shape(GemmCoord tiled_shape) {
+    return dim3(tiled_shape.n(), tiled_shape.m(), tiled_shape.k());
+  }
+
+  /// Log2 swizzle width; always 0 for this swizzle
+  CUTLASS_HOST_DEVICE
+  static int get_log_tile(GemmCoord tiled_shape) {
+    return 0;
+  }
+
+  /// Tile coordinate from the rematerialized block indices in (y, x, z) order; log_tile is unused
+  CUTLASS_DEVICE
+  static GemmCoord get_tile_offset(int log_tile) {
+    int const tile_m = RematerializeBlockIdxY();
+    int const tile_n = RematerializeBlockIdxX();
+    int const tile_k = RematerializeBlockIdxZ();
+
+    return GemmCoord{tile_m, tile_n, tile_k};
+  }
+
+  /// Tile coordinate from the rematerialized block indices in (y, x, z) order; tiled_shape is unused
+  CUTLASS_DEVICE
+  static GemmCoord get_tile_offset(GemmCoord tiled_shape) {
+    int const tile_m = RematerializeBlockIdxY();
+    int const tile_n = RematerializeBlockIdxX();
+    int const tile_k = RematerializeBlockIdxZ();
+
+    return GemmCoord{tile_m, tile_n, tile_k};
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Threadblock swizzle for batched GEMVs (a single tile along M)
+struct GemvBatchedStridedThreadblockDefaultSwizzle {
+
+  /// Tile counts along N, K and batch (each ceil-divided); M is fixed at 1
+  CUTLASS_HOST_DEVICE
+  static BatchedGemmCoord get_tiled_shape(
+    BatchedGemmCoord problem_size,
+    BatchedGemmCoord tile_size) {
+
+    int const tiles_n = (problem_size.n() + tile_size.n() - 1) / tile_size.n();
+    int const tiles_k = (problem_size.k() + tile_size.k() - 1) / tile_size.k();
+    int const tiles_batch = (problem_size.batch() + tile_size.batch() - 1) / tile_size.batch();
+
+    return BatchedGemmCoord(1, tiles_n, tiles_k, tiles_batch);
+  }
+
+  /// CUDA grid for a tiled shape: x = n, y = batch, z = k
+  CUTLASS_HOST_DEVICE
+  static dim3 get_grid_shape(BatchedGemmCoord tiled_shape) {
+    return dim3(tiled_shape.n(), tiled_shape.batch(), tiled_shape.k());
+  }
+
+  /// Log2 swizzle width; always 0 for this swizzle
+  CUTLASS_HOST_DEVICE
+  static int get_log_tile(GemmCoord tiled_shape) {
+    return 0;
+  }
+
+  /// Tile coordinate of the calling threadblock; log_tile is unused
+  CUTLASS_DEVICE
+  static BatchedGemmCoord get_tile_offset(int log_tile) {
+    int const tile_n = RematerializeBlockIdxX();
+    int const tile_k = RematerializeBlockIdxZ();
+    int const tile_batch = RematerializeBlockIdxY();
+
+    // The M coordinate is always 0 because there is a single tile along M.
+    return BatchedGemmCoord{0, tile_n, tile_k, tile_batch};
+  }
+
+  /// Tile coordinate of the calling threadblock (argument-free overload)
+  CUTLASS_DEVICE
+  static BatchedGemmCoord get_tile_offset() {
+    int const tile_n = RematerializeBlockIdxX();
+    int const tile_k = RematerializeBlockIdxZ();
+    int const tile_batch = RematerializeBlockIdxY();
+
+    // The M coordinate is always 0 because there is a single tile along M.
+    return BatchedGemmCoord{0, tile_n, tile_k, tile_batch};
+  }
+
+  /// Batch tile index of the calling threadblock (the rematerialized y block index)
+  CUTLASS_DEVICE
+  static int get_batch_tile_idx() {
+    return RematerializeBlockIdxY();
+  }
+
+  /// Absolute batch index: block-dim-y * block-idx-y + thread-idx-y, each read via its Rematerialize helper
+  CUTLASS_DEVICE
+  static int get_batch_idx() {
+    return (RematerializeBlockDimY() * RematerializeBlockIdxY()) + RematerializeThreadIdxY();
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/threadblock_swizzle_streamk.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/threadblock_swizzle_streamk.h
new file mode 100755
index 000000000..b79e587d7
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/threadblock_swizzle_streamk.h
@@ -0,0 +1,801 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Implements streamk threadblock mapping blockIdx to GEMM problems.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/platform/platform.h"
+#include "cutlass/gemm/gemm_enumerated_types.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+#include "cutlass/conv/conv3d_problem_size.h"
+#include "cutlass/gemm/threadblock/index_remat.h"
+
+#if !defined(__CUDACC_RTC__)
+#include <iostream>
+#include "cutlass/core_io.h"
+#include "cutlass/trace.h"
+#endif
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Threadblock mapping control for GEMMs
+struct ThreadblockSwizzleStreamK {
+
+  /// Advertise StreamkFeature
+  using StreamkFeature = void;
+
+
+  /// Kernel traits
+  template <typename GemmKernel>
+  struct KernelTraits {};
+
+
+  /// Reduction strategy: how the partial results of SK-blocks are combined
+  enum ReductionStrategy
+  {
+    kNone,      // Data-parallel strategy (no seams, fixup, etc.)
+
+    kAtomic,    // Non-deterministic reduction of SK-block partials using atomic aggregation in L2
+
+    kMixed,     // Deterministic reduction of SK-block partials employing either:
+                //   (a) A separate wave of reduction thread blocks (for scenarios with lots of
+                //       SK-blocks per SK-tile)
+                //   (b) Turnstile-ordered atomic aggregation in L2 (for scenarios with few
+                //       SK-blocks per SK-tile)
+  };
+
+  static ReductionStrategy const kReductionStrategy = kMixed;
+
+
+  //
+  // Heuristics
+  //
+
+  /// Data-parallel wave-quantization efficiency threshold (above which we go data-parallel)
+  static float constexpr kDpEfficiencyThreshold = 0.92f;
+
+  /// Minimum number of MAC-iterations per streamk block
+  static int const kMinItersPerSkBlock = 2;
+
+  /// Height in CTAs of a grid rasterization cohort
+  static int const kCohortCtasM = 8;
+
+  /// Width in CTAs of a grid rasterization cohort
+  static int const kCohortCtasN = 4;
+
+  /// Number of CTAs per cohort
+  static int const kCtasPerCohort = kCohortCtasN * kCohortCtasM;
+
+  /// Cost-equivalent number of SM-iterations for fixup I/O
+  static int const kFixupStartupIterEquiv = 10;
+  static int const kFixupPeerIterEquiv = 3;
+
+
+  //
+  // Member state
+  //
+
+
+  /// The 3D value-extents of the GEMM computation volume (m,n,k)
+  GemmCoord problem_size;
+
+  /// Div/mod accelerators
+  FastDivmod div_mod_tiled_shape_m;
+  FastDivmod div_mod_tiled_shape_n;
+  FastDivmod div_mod_tiled_cohort_shape_n;
+  FastDivmod div_mod_iters_per_tile;
+
+  /// Whether to perform cohort CTA rasterization
+  bool cohort_raster;
+
+  // Whether to pad and remap block indices
+  bool remap_block_indices;
+
+  /// CTA occupancy per SM
+  int sm_occupancy;
+
+  /// Number of SMs for dispatch heuristics to load-balance using Stream-K CTAs (wave size)
+  int avail_sms;
+
+  int dp_blocks;                            /// Number of data-parallel thread blocks in the grid
+  int dp_first_wave_tiles;                  /// Number of output tiles each CTA in the first DP wave will produce
+
+  /// Number of reduction blocks in the grid
+  int reduction_blocks;
+
+  int sk_waves;
+  int sk_tiles;
+  int sk_big_blocks_per_region;
+  int sk_iters_per_region;
+
+  /// Div/mod accelerators
+  FastDivmod div_mod_sk_iters_per_normal_block;
+  FastDivmod div_mod_sk_iters_per_big_block;
+  FastDivmod div_mod_sk_iters_per_region;
+  FastDivmod div_mod_sk_regions;                      //!! used in block map
+  FastDivmod div_mod_sk_blocks_per_region;            //!! used in block map
+
+  /// The batch count
+  int batch_count;
+
+
+  //
+  // Host+device interface
+  //
+
+  /// Constructor
+  ThreadblockSwizzleStreamK() = default;
+
+  /// GEMM volume in thread block tiles: the integer values of the M/N div-mod members, plus batch_count
+  CUTLASS_HOST_DEVICE
+  GemmCoord tiled_shape() const
+  {
+    int const tiles_m = static_cast<int>(div_mod_tiled_shape_m);
+    int const tiles_n = static_cast<int>(div_mod_tiled_shape_n);
+
+    return GemmCoord(tiles_m, tiles_n, batch_count);
+  }
+
+  /// Iterations per output tile, read back from div_mod_iters_per_tile
+  CUTLASS_HOST_DEVICE
+  int iters_per_tile() const {
+    int const iters = static_cast<int>(div_mod_iters_per_tile);
+    return iters;
+  }
+
+  /// Iterations for a normal SK-block, read back from div_mod_sk_iters_per_normal_block
+  CUTLASS_HOST_DEVICE
+  int sk_iters_per_normal_block() const {
+    int const iters = static_cast<int>(div_mod_sk_iters_per_normal_block);
+    return iters;
+  }
+
+  /// SK region count, read back from div_mod_sk_regions
+  CUTLASS_HOST_DEVICE
+  int sk_regions() const {
+    int const regions = static_cast<int>(div_mod_sk_regions);
+    return regions;
+  }
+
+  /// SK blocks per region (splitting factor), read back from div_mod_sk_blocks_per_region
+  CUTLASS_HOST_DEVICE
+  int sk_blocks_per_region() const {
+    int const blocks = static_cast<int>(div_mod_sk_blocks_per_region);
+    return blocks;
+  }
+
+
+  //
+  // Host-side interface
+  //
+
+  /// Debug print: streams the mapping's fields to std::cout as one line; compiled out when __CUDA_ARCH__ is defined
+  void Print()
+  {
+#ifndef __CUDA_ARCH__
+    auto tiles = tiled_shape().mn().product();
+    std::cout <<
+        "problem_size: (" << problem_size.m() << "," << problem_size.n() << ")" <<
+        ", tiled_shape: (" << tiled_shape().m() << "," << tiled_shape().n() << ")" <<
+        ", tiles: " << tiles <<
+        ", dp_tiles: " << tiles - sk_tiles <<
+        ", sk_tiles: " << sk_tiles <<
+        ", iters_per_tile: " << iters_per_tile() <<
+        ", reduction_blocks: " << reduction_blocks <<
+        ", dp_blocks: " << dp_blocks <<
+        ", dp_waves: " << dp_blocks / avail_sms <<
+        ", dp_first_wave_tiles: " << dp_first_wave_tiles <<
+        ", sk_blocks_per_region: " << sk_blocks_per_region() <<
+        ", sk_regions: " << sk_regions() <<
+        ", sk_waves: " << sk_waves <<
+        ", sk_iters_per_normal_block: " << sk_iters_per_normal_block() <<
+        ", sk_big_blocks_per_region: " << sk_big_blocks_per_region <<
+        ", remap_block_indices: " << remap_block_indices <<
+        ", cohort_raster: " << cohort_raster <<
+        ", sm_occupancy: " << sm_occupancy <<
+        ", avail_sms: " << avail_sms <<
+        ", num_blocks: " << get_num_blocks() <<
+        "\n\n";
+#endif
+  }
+
+
+  // Compute sk_blocks to dispatch for a given number of sk_tiles: tries each candidate count and keeps the best estimated savings
+  static void get_sk_blocks(
+    int &sk_blocks,     /// [out] chosen SK-block count (stays 0 if no candidate was tried)
+    int &savings_iters, /// [out] best estimated savings (stays INT_MIN if no candidate was tried)
+    int sk_tiles,       /// number of output tiles to be covered by SK-blocks
+    int iters_per_tile, /// iterations needed per output tile
+    int avail_sms,      /// SM count used as the wave size
+    int max_sk_occupancy, /// caps the candidate count at avail_sms * max_sk_occupancy
+    bool allow_partial_wave) /// if false, the smallest candidate is a full wave (avail_sms blocks)
+  {
+    savings_iters = INT_MIN;
+    sk_blocks = 0;
+    // Nothing to schedule: leave the defaults above in place.
+    if (sk_tiles == 0) {
+      return;
+    }
+
+    int sk_iters = sk_tiles * iters_per_tile;
+    // Data-parallel equivalent: whole waves of sk_tiles over avail_sms, iters_per_tile each.
+    int dp_equiv_waves = (sk_tiles + avail_sms - 1) / avail_sms;
+    int dp_equiv_iters = iters_per_tile * dp_equiv_waves;
+    // Inclusive range of candidate SK-block counts.
+    int min_sk_blocks = (allow_partial_wave) ? fast_min(avail_sms, sk_tiles + 1) : avail_sms;
+    int max_sk_blocks = fast_min(avail_sms * max_sk_occupancy, sk_iters / kMinItersPerSkBlock);
+
+    for (int trial_sk_blocks = min_sk_blocks; trial_sk_blocks <= max_sk_blocks; ++trial_sk_blocks)
+    {
+      int sk_waves = (trial_sk_blocks + avail_sms - 1) / avail_sms;
+      int max_sk_iters_per_block = (sk_iters + trial_sk_blocks - 1) / trial_sk_blocks;
+      int sk_iter_equiv = max_sk_iters_per_block * sk_waves;
+
+      int num_peers = ((trial_sk_blocks + sk_tiles - 1) / sk_tiles) + 1;        // add one for alignment skew
+
+      float iter_cost = 0.02f * float(num_peers) * float(sk_iter_equiv);
+
+      if (trial_sk_blocks % sk_tiles == 0)
+      {
+        // aligned: block count is a whole multiple of the tile count, so no skew peer and no iter_cost
+        num_peers = (trial_sk_blocks / sk_tiles);
+
+        iter_cost = 0.0f;
+      }
+
+      float peer_cost = 2.0f * float(num_peers);
+
+      float base_cost = 2.0f * float(sk_waves);
+      // Total fixup overhead, truncated to whole iterations.
+      int fixup_iter_equiv = int(base_cost + iter_cost + peer_cost);
+
+      int trial_savings_iters = dp_equiv_iters - sk_iter_equiv - fixup_iter_equiv;
+      // `>=` lets a later (larger) candidate win ties.
+      if (trial_savings_iters >= savings_iters) {
+          savings_iters = trial_savings_iters;
+          sk_blocks = trial_sk_blocks;
+      }
+    }
+  }
+
+
+  /// Determine the populations of DP and SK blocks to invoke for the given number of output tiles; ends all-DP (sk_blocks == 0) when the last SK attempt scores below zero
+  static void get_blocks(
+    int &dp_tiles,      /// [out] number of tiles left to data-parallel blocks
+    int &sk_blocks,     /// [out] number of SK blocks (0 means data-parallel only)
+    int output_tiles,   /// total number of output tiles
+    int iters_per_tile, /// iterations needed per output tile
+    int avail_sms,      /// SM count used as the wave size
+    int sm_occupancy)   /// CTA occupancy per SM
+  {
+    int full_waves = output_tiles / avail_sms;
+    int full_wave_tiles = full_waves * avail_sms;
+    int partial_wave_tiles = output_tiles - full_wave_tiles;
+    // score receives the savings estimate from get_sk_blocks; the defaults below describe an all-DP launch.
+    int score = -1;
+    dp_tiles = output_tiles;
+    sk_blocks = 0;
+
+    if (partial_wave_tiles == 0)
+    {
+      // Perfect quantization
+      return;
+    }
+
+    if (full_waves < sm_occupancy)
+    {
+        // We're less than full GPU occupancy
+
+        // Form the SK wave from the partial wave to get us up to full GPU occupancy
+        int max_sk_occupancy = sm_occupancy - full_waves;
+
+        dp_tiles = full_wave_tiles;
+
+        get_sk_blocks(
+          sk_blocks,
+          score,
+          partial_wave_tiles,
+          iters_per_tile,
+          avail_sms,
+          max_sk_occupancy,
+          true);                 // we can run with less than a full wave of SK-blocks
+
+        if (score < 0) {
+          // not profitable
+          sk_blocks = 0;
+          dp_tiles = output_tiles;
+        }
+
+        return;
+    }
+
+    // We're at (or greater) than GPU occupancy
+
+    if ((sm_occupancy > 1 ) && (full_waves % sm_occupancy == sm_occupancy - 1))
+    {
+        // If occupancy is more than one CTA per SM, form the SK wave from the partial
+        // wave to get us to full GPU occupancy
+        int max_sk_occupancy = 1;
+
+        dp_tiles = full_wave_tiles;
+
+        get_sk_blocks(
+          sk_blocks,
+          score,
+          partial_wave_tiles,
+          iters_per_tile,
+          avail_sms,
+          max_sk_occupancy,
+          true);                 // we can run with less than a full wave of SK-blocks
+        // A negative score falls through to the combined-wave attempt below.
+        if (score >= 0) {
+            return;
+        }
+    }
+
+    // Form the SK wave by combining the last full wave and the partial wave
+    // We're less than full GPU occupancy
+    dp_tiles = full_wave_tiles - avail_sms;
+    // The wave removed from dp_tiles above joins the partial wave as SK tiles below.
+    int max_sk_occupancy = sm_occupancy - ((full_waves - 1) % sm_occupancy);
+
+    get_sk_blocks(
+      sk_blocks,
+      score,
+      partial_wave_tiles + avail_sms,
+      iters_per_tile,
+      avail_sms,
+      max_sk_occupancy,
+      false);                 // we cannot run with less than a full wave of SK-blocks
+
+    if (score < 0) {
+      // not profitable
+      sk_blocks = 0;
+      dp_tiles = output_tiles;
+    }
+
+  }
+
+  /// Constructor: *Gemm* problem size (m, n, k)
+  ThreadblockSwizzleStreamK(
+    GemmUniversalMode const mode_,
+    GemmCoord const problem_size_,
+    GemmCoord const tile_size_,
+    int const batch_split_,                        /// Either (mode == GemmUniversalMode::kBatched) the batch count, or (mode == GemmUniversalMode::kGemm) the tile-splitting factor (1 defaults to StreamK, >1 emulates Split-K)
+    int const sm_occupancy_,
+    int const device_sms_,
+    int const avail_sms_,                          /// The number of SMs that StreamK dispatch heuristics will attempt to load-balance across (-1 defaults to device width, 1 implies classic data-parallel scheduling)
+    size_t const element_A_bytes_,
+    size_t const element_B_bytes_,
+    size_t const element_C_bytes_,
+    int const epilogue_acc_fragments_)
+  :
+    problem_size(problem_size_),
+    batch_count((mode_ == GemmUniversalMode::kBatched || mode_ == GemmUniversalMode::kArray) ? batch_split_ : 1),
+    reduction_blocks(0),
+    dp_blocks(0),
+    dp_first_wave_tiles(1),     // Default: one tile per DP-block in the first wave of DP blocks
+    sk_tiles(0),
+    sk_big_blocks_per_region(0),
+    sk_iters_per_region(0),
+    sk_waves(0),
+    sm_occupancy(sm_occupancy_),
+    remap_block_indices(false),
+    avail_sms(fast_max(1, avail_sms_)),
+    cohort_raster(false)
+  {
+    int gpu_occupancy = device_sms_ * sm_occupancy;
+    int iters_per_tile = (problem_size.k() + tile_size_.k() - 1) / tile_size_.k();
+    int sk_iters_per_normal_block = 0;
+
+    int sk_regions = 1;              // Default: a single region of iteration space (across all SK tiles)
+    int sk_blocks_per_region = 0;
+
+    GemmCoord tiled_shape(
+      (problem_size.m() + tile_size_.m() - 1) / tile_size_.m(),
+      (problem_size.n() + tile_size_.n() - 1) / tile_size_.n(),
+      batch_count);
+
+    size_t problem_bytes =
+              (element_C_bytes_ * problem_size.m() * problem_size.n()) +
+              (element_A_bytes_ * problem_size.m() * problem_size.k()) +
+              (element_B_bytes_ * problem_size.k() * problem_size.n());
+
+    size_t problem_flops = size_t(problem_size.m()) * size_t(problem_size.n()) * size_t(problem_size.k()) * 2;
+
+    [[maybe_unused]] float flops_per_byte = float(problem_flops) / float(problem_bytes);
+
+    int output_tiles = tiled_shape.m() * tiled_shape.n();
+    int waves = (output_tiles + avail_sms - 1) / avail_sms;
+    [[maybe_unused]] float dp_efficiency = float(output_tiles) / float(waves * avail_sms);
+
+    //
+    // Determine dispatch composition of DP-tiles and SK-blocks
+    //
+
+    // Start with a DP-only configuration
+    int dp_tiles = output_tiles;    // Number of data-parallel tiles
+    int sk_blocks = 0;              // Number of thread blocks to produce the remaining SK tiles
+
+    // Only kGemm mode allows for SK load balancing
+    if (mode_ == GemmUniversalMode::kGemm)
+    {
+      int split_factor = batch_split_;
+      if (split_factor > 1)
+      {
+        // Split-K override
+        dp_tiles = 0;
+        sk_blocks = output_tiles * split_factor;
+      }
+      else if ((kReductionStrategy != kNone) &&   // Load-balancing strategy statically enabled
+        (avail_sms > 1))                         // Plurality of SMs to load balance across
+      {
+        // Use heuristics
+        get_blocks(
+          dp_tiles,      /// [out]
+          sk_blocks,     /// [out]
+          output_tiles,
+          iters_per_tile,
+          avail_sms,
+          sm_occupancy);
+      }
+    }
+
+    sk_tiles = output_tiles - dp_tiles;
+
+
+    // Compute SK block iteration details
+    if (sk_blocks > 0)
+    {
+      sk_waves = (sk_blocks + avail_sms - 1) / avail_sms;
+
+      int sk_iters = sk_tiles * iters_per_tile;
+      sk_blocks = fast_min(sk_blocks, sk_iters);
+
+      sk_iters_per_normal_block = sk_iters / sk_blocks;
+      int extra_sk_iters = sk_iters - (sk_iters_per_normal_block * sk_blocks);
+      int sk_big_blocks = extra_sk_iters;
+
+      if ((sk_blocks > sk_tiles) && (sk_blocks % sk_tiles == 0))
+      {
+        // Split-K decomposition
+        sk_regions = sk_tiles;
+      }
+
+      sk_blocks_per_region = sk_blocks / sk_regions;
+      sk_big_blocks_per_region = sk_big_blocks / sk_regions;
+      sk_iters_per_region = sk_iters / sk_regions;
+
+      // Use a separate reduction wave when all of:
+      // - Non-atomic reduction strategy
+      // - The number of SK waves won't fully occupy the GPU (Otherwise we don't have
+      //   a strong-scaling case for more parallel reduction)
+      // - More than three peers working on an SK tile.  (This occurs when the ratio of
+      //   SK-blocks to SK-tiles > 2, as a single tile may be covered by four SK-blocks,
+      //   e.g.:[partial-block | block | block | partial-block] ).  With three or
+      //   fewer peers, the two non-finishing SK-blocks are not expected to contend.
+      if ((kReductionStrategy == kMixed) &&
+          (sk_waves < sm_occupancy) &&
+          (sk_blocks > 2 * sk_tiles))
+      {
+        // Launch a reduction block for every accumulator fragment in each SK-tile
+        reduction_blocks = sk_tiles * epilogue_acc_fragments_;
+
+      }
+
+      // When we have a multi-occupancy kernel and at least two waves of active blocks (where
+      // at least one wave is SK blocks), we need to (1) dispatch at least four waves, and (2)
+      // remap the block indices so that we can reliably spread the SK blocks evenly across the
+      // device's first SM occupancy valence. Also see get_num_blocks() and get_block_idx().
+      remap_block_indices = (
+          (sm_occupancy > 1) &&
+          (device_sms_ == avail_sms) &&
+          (get_num_active_blocks() > avail_sms * 2));
+
+      // Initialize fast div/mod members related to SK
+      div_mod_sk_iters_per_normal_block = FastDivmod(sk_iters_per_normal_block);
+      div_mod_sk_iters_per_big_block = FastDivmod(sk_iters_per_normal_block + 1);
+      div_mod_sk_iters_per_region = FastDivmod(sk_iters_per_region);
+      div_mod_sk_regions = FastDivmod(sk_regions);
+      div_mod_sk_blocks_per_region = FastDivmod(sk_blocks_per_region);
+    }
+
+    //
+    // Compute DP blocks
+    //
+
+    dp_blocks = dp_tiles;
+
+    cutlass::gemm::GemmCoord tiled_cohort_shape(
+        (tiled_shape.m() + kCohortCtasM - 1) / kCohortCtasM,
+        (tiled_shape.n() + kCohortCtasN - 1) / kCohortCtasN,
+        tiled_shape.k());
+    int cohort_blocks = (tiled_cohort_shape.m() * tiled_cohort_shape.n()) * kCtasPerCohort;
+    float cohort_efficiency = float(dp_blocks) / float(cohort_blocks);
+
+    // Check if the SK tiles would be in cohorts that are in-bounds
+    bool sk_in_range = true;
+    if (sk_tiles > 0)
+    {
+      int last_sk_tile = sk_tiles - 1;
+      int cohort_tile_idx = last_sk_tile / kCtasPerCohort;
+      int cohort_grid_m = cohort_tile_idx / tiled_cohort_shape.n();
+      int cohort_grid_n = (cohort_grid_m > 0) ?
+        tiled_cohort_shape.n() - 1 :
+        cohort_tile_idx % tiled_cohort_shape.n();
+
+      if ((((cohort_grid_m + 1) * kCohortCtasM) >= tiled_shape.m()) ||
+          (((cohort_grid_n + 1) * kCohortCtasN) >= tiled_shape.n()))
+      {
+        sk_in_range = false;
+      }
+
+    }
+
+    // Decide if we're going to be doing cohort raster
+    if (sk_in_range &&
+        (dp_blocks >= gpu_occupancy * 2) &&
+        (cohort_efficiency > 0.85f))
+    {
+      cohort_raster = true;
+      dp_blocks = cohort_blocks;
+    }
+    else if (sk_waves > 0)
+    {
+      // Update semi-persistence of first DP wave to ensure full grid wavesets
+      // (Only applies when there's an SK component and we're not doing blocked cohort rasterization)
+      int dp_tile_waves = (dp_tiles + avail_sms - 1) / avail_sms;
+      int full_dp_tile_waves = dp_tiles / avail_sms;
+      int waveset_excess = (sk_waves + dp_tile_waves) % sm_occupancy;
+
+      if (dp_first_wave_tiles + waveset_excess <= full_dp_tile_waves)
+      {
+        dp_first_wave_tiles += waveset_excess;
+        dp_blocks -= (waveset_excess * avail_sms);
+      }
+    }
+
+    // Setup fast-div/mod for device-side usage
+    div_mod_tiled_shape_m = FastDivmod(tiled_shape.m());
+    div_mod_tiled_shape_n = FastDivmod(tiled_shape.n());
+    div_mod_tiled_cohort_shape_n = FastDivmod(tiled_cohort_shape.n());
+    div_mod_iters_per_tile = FastDivmod(iters_per_tile);
+
+  }
+
+  /// Number of blocks performing useful work: avail_sms blocks per SK wave, plus the DP blocks, plus the reduction blocks
+  int get_num_active_blocks() const
+  {
+    return reduction_blocks + dp_blocks + (avail_sms * sk_waves);
+  }
+
+  /// Obtains number of threadblocks per GEMM: the active blocks, padded when block-index remapping is enabled
+  int get_num_blocks() const
+  {
+    int num_active = get_num_active_blocks();
+    if (!remap_block_indices)
+    {
+      // No remapping: the grid is exactly the set of active blocks
+      return num_active;
+    }
+    // Remapping: add padding blocks if needed so that the dispatched grid spans at least four waves
+    return fast_max(num_active, avail_sms * 4);
+  }
+
+
+  /// Obtains grid extents in CTAs (x: blocks per GEMM, y: always 1, z: batch_count)
+  dim3 get_grid_dims() const {
+    dim3 grid_extents(get_num_blocks(), 1, batch_count);
+    return grid_extents;
+  }
+
+
+  //
+  // Device-side interface
+  //
+
+  /// Device-side number of threadblocks per GEMM, read from the x-extent of the launched grid
+  CUTLASS_DEVICE int device_num_blocks() const
+  {
+    int const grid_width = gridDim.x;
+    return grid_width;
+  }
+
+  /// Maps an SK iteration index to the index of the tile it belongs to
+  CUTLASS_DEVICE
+  int get_sk_tile_idx(int iter) const
+  {
+    // Quotient of the iteration index by the per-tile iteration count held in div_mod_iters_per_tile
+    return div_mod_iters_per_tile.div(iter);
+  }
+
+  /// Obtains the batch index, as reported by RematerializeBlockIdxZ()
+  CUTLASS_DEVICE int get_batch_idx() const
+  {
+    int const batch_idx = RematerializeBlockIdxZ();
+    return batch_idx;
+  }
+
+  /// Obtains the calling threadblock's tiled coordinates (m, n, batch index) for the given linear tile index
+  CUTLASS_DEVICE
+  GemmCoord get_tile_offset(int tile_idx) const
+  {
+    int m, n;
+
+    // row-major raster (the default): split tile_idx using the divisor built from the tiled shape's n-extent
+    div_mod_tiled_shape_n(m, n, tile_idx);
+
+    if (tiled_shape().m() < tiled_shape().n())
+    {
+      // column-major raster: fewer tile rows than tile columns, so split by the m-extent instead (outputs swapped)
+      div_mod_tiled_shape_m(n, m, tile_idx);
+    }
+
+    if (cohort_raster)
+    {
+      // tiled cohort raster: overrides both results above.  First locate the tile's cohort within the cohort grid ...
+      int cohort_tile_idx = tile_idx / kCtasPerCohort;
+      int cohort_grid_m, cohort_grid_n;
+      div_mod_tiled_cohort_shape_n(cohort_grid_m, cohort_grid_n, cohort_tile_idx);
+
+      int block_idx_cohort = tile_idx % kCtasPerCohort;   // ... then the tile's position inside that cohort
+      int block_cohort_m = block_idx_cohort / kCohortCtasN;
+      int block_cohort_n = block_idx_cohort % kCohortCtasN;
+
+      m = (cohort_grid_m * kCohortCtasM) + block_cohort_m;
+      n = (cohort_grid_n * kCohortCtasN) + block_cohort_n;
+    }
+
+    return GemmCoord(m, n, get_batch_idx());   // the third coordinate carries the batch index
+  }
+
+  /// Row-major-only variant: converts a linear tile index into tiled (m, n) coordinates plus the batch index
+  CUTLASS_DEVICE
+  GemmCoord get_tile_offset_row_major(int tile_idx) const
+  {
+    int tile_m;
+    int tile_n;
+    div_mod_tiled_shape_n(tile_m, tile_n, tile_idx);   // row-major raster
+    return GemmCoord(tile_m, tile_n, get_batch_idx());
+  }
+
+  /// Obtains calling threadblock's linear threadblock index, after any index remapping has been applied
+  CUTLASS_DEVICE
+  int get_block_idx() const
+  {
+    int linear_idx = RematerializeBlockIdxX();
+
+    // With multi-occupancy and a grid of four or more waves, the indices of the
+    // first two waves' worth of thread blocks are remapped
+    if (remap_block_indices && (linear_idx < avail_sms * 2))
+    {
+      int target_sm = linear_idx / 2;
+      int target_wave = linear_idx % 2;
+      linear_idx = target_sm + (target_wave * avail_sms);
+    }
+
+    // Interleave the SK regions (limits intra-region waiting) for indices that fall inside the SK block range
+    int sk_block_count = sk_regions() * sk_blocks_per_region();
+    if (linear_idx < sk_block_count)
+    {
+      int slot_in_region;
+      int region_id;
+      div_mod_sk_regions(slot_in_region, region_id, linear_idx);
+      linear_idx = (region_id * sk_blocks_per_region()) + slot_in_region;
+    }
+
+    return linear_idx;
+  }
+
+
+  /// Obtains the linear index of the SK block that owns the given SK iteration
+  CUTLASS_DEVICE
+  int get_sk_block_idx(int iter) const
+  {
+    int region_idx;
+    int iter_in_region;
+    div_mod_sk_iters_per_region(region_idx, iter_in_region, iter);   // split iter into its region and its offset within that region
+
+    int big_block_iters = (sk_big_blocks_per_region * sk_iters_per_normal_block()) + sk_big_blocks_per_region;   // iterations covered by the region's big blocks (each runs one more iteration than a normal block)
+    int normal_block_iters = iter_in_region - big_block_iters;                                                 // offset of this iteration past the big-block iterations (negative when it lies inside a big block)
+
+    int big_block_idx_in_region = div_mod_sk_iters_per_big_block.div(iter_in_region);   // candidate owner if the iteration lies in a big block
+    int normal_block_idx_in_region = sk_big_blocks_per_region + div_mod_sk_iters_per_normal_block.div(normal_block_iters);   // candidate owner otherwise
+
+    int block_idx_in_region = (big_block_idx_in_region < sk_big_blocks_per_region) ?   // the big-block candidate is valid only while it indexes a big block
+        big_block_idx_in_region :
+        normal_block_idx_in_region;
+
+    int owning_block_idx = (sk_blocks_per_region() * region_idx) + block_idx_in_region;   // convert the region-relative index into a linear SK block index
+
+    return owning_block_idx;
+  }
+
+  /// Obtains iteration extents for the given SK block index (begin index, and begin plus the block's iteration count)
+  CUTLASS_DEVICE
+  void get_iter_extents(
+      int sk_block_idx,
+      int &block_iter_begin,
+      int &block_iter_end) const
+  {
+    // Locate the block: the region it belongs to and its position within that region
+    int region;
+    int idx_in_region;
+    div_mod_sk_blocks_per_region(region, idx_in_region, sk_block_idx);
+
+    // The first sk_big_blocks_per_region blocks of a region each get one extra iteration
+    bool const is_big_block = (idx_in_region < sk_big_blocks_per_region);
+
+    // Every big block ahead of this one has already consumed one extra iteration
+    int const extra_iters_before = is_big_block ? idx_in_region : sk_big_blocks_per_region;
+    int const num_iters = sk_iters_per_normal_block() + (is_big_block ? 1 : 0);
+
+    int const first_iter =
+        (region * sk_iters_per_region) + (idx_in_region * sk_iters_per_normal_block()) + extra_iters_before;
+
+    block_iter_begin = first_iter;
+    block_iter_end = first_iter + num_iters;
+  }
+
+
+  /// Obtains the linear threadblock index of the first block to work on the given tile
+  CUTLASS_DEVICE
+  int get_first_block_idx(int tile_idx, int block_idx) const
+  {
+    bool const is_sk_tile = (tile_idx < sk_tiles);
+    if (is_sk_tile) {
+      // SK tile: look up the block that owns the tile's first iteration
+      return get_sk_block_idx(tile_idx * iters_per_tile());
+    }
+    // DP tile: the supplied block index is returned unchanged
+    return block_idx;
+  }
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_complex_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_complex_tensor_op.h
new file mode 100755
index 000000000..92e698f8a
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_complex_tensor_op.h
@@ -0,0 +1,612 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Default warp-level GEMM operators selected by data type, size, and layouts of operands.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/warp/mma_complex_tensor_op.h"
+#include "cutlass/gemm/warp/mma_complex_tensor_op_fast_f32.h"
+#include "cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h"
+#include "cutlass/layout/tensor_op_multiplicand_sm80.h"
+
+namespace cutlass {
+namespace gemm {
+namespace warp {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Shape of the warp-level GEMM problem - concept: gemm::GemmShape<>
+    typename WarpShape_,
+    /// Shape of one matrix-multiply instruction (concept: GemmShape)
+    typename InstructionShape_,
+    /// Data type of A elements
+    typename ElementA_,
+    /// Layout of A matrix (concept: MatrixLayout)
+    typename LayoutA_,
+    /// Data type of B elements
+    typename ElementB_,
+    /// Layout of B matrix (concept: MatrixLayout)
+    typename LayoutB_,
+    /// Element type of C matrix
+    typename ElementC_,
+    /// Layout of C matrix (concept: MatrixLayout)
+    typename LayoutC_,
+    /// Complex transform on A operand
+    ComplexTransform TransformA = ComplexTransform::kNone,
+    /// Complex transform on B operand
+    ComplexTransform TransformB = ComplexTransform::kNone,
+    /// Multiply-add operator tag; this file specializes on arch::OpMultiplyAddComplex, arch::OpMultiplyAddGaussianComplex, arch::OpMultiplyAddFastBF16, arch::OpMultiplyAddFastF16 and arch::OpMultiplyAddComplexFastF32
+    typename Operator_ = arch::OpMultiplyAddComplex>
+struct DefaultMmaComplexTensorOp;   // primary template: declared only; the partial specializations in this file supply the definitions
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for the generic complex<T>*complex<T> case (Operator = arch::OpMultiplyAddComplex)
+//  4 real-valued mma operations
+//  A = (ar + j ai), B = (br + j bi), D = AB
+//  D = dr + j di = (ar*br - ai*bi) + j (ar*bi + ai*br)
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <
+    /// Shape of the warp-level GEMM problem - concept: gemm::GemmShape<>
+    typename WarpShape_,
+    /// Shape of one matrix-multiply instruction (concept: GemmShape)
+    typename InstructionShape_,
+    /// Real-valued underlying type of complex-valued A operand
+    typename RealElementA,
+    /// Layout of A matrix (concept: MatrixLayout)
+    typename LayoutA,
+    /// Real-valued underlying type of complex-valued B operand
+    typename RealElementB,
+    /// Layout of B matrix (concept: MatrixLayout)
+    typename LayoutB,
+    /// Real-valued underlying type of complex-valued C operand
+    typename RealElementC,
+    /// Layout of C matrix (concept: MatrixLayout)
+    typename LayoutC,
+    /// Complex transform on A operand
+    ComplexTransform TransformA,
+    /// Complex transform on B operand
+    ComplexTransform TransformB>
+struct DefaultMmaComplexTensorOp<
+    WarpShape_,
+    InstructionShape_,
+    complex<RealElementA>,
+    LayoutA,
+    complex<RealElementB>,
+    LayoutB,
+    complex<RealElementC>,
+    LayoutC,
+    TransformA,
+    TransformB,
+    arch::OpMultiplyAddComplex> {
+  // Policy: names the underlying real-valued instruction, arch::Mma over RealElementA/B/C with arch::OpMultiplyAdd
+  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
+      cutlass::arch::Mma<
+        InstructionShape_, 
+        32, 
+        RealElementA,
+        cutlass::layout::RowMajor,
+        RealElementB,
+        cutlass::layout::ColumnMajor,
+        RealElementC,
+        cutlass::layout::RowMajor, 
+        arch::OpMultiplyAdd>,
+      cutlass::MatrixShape<1, 1>
+    >;
+
+  // Define the warp-level tensor op: complex-valued operands with the caller's layouts, Policy and transforms
+  using Type = cutlass::gemm::warp::MmaComplexTensorOp<
+    WarpShape_,
+    complex<RealElementA>,
+    LayoutA,
+    complex<RealElementB>,
+    LayoutB,
+    complex<RealElementC>,
+    LayoutC, 
+    Policy,
+    TransformA,
+    TransformB>;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for complex<T>*complex<T> case using GaussianComplex operation
+//  (Operator = arch::OpMultiplyAddGaussianComplex): 3 real-valued mma operations
+//  A  = (ar + j ai), B = (br + j bi), D = AB
+//  P1 = (ar + ai) * br, P2 = - ar * (br - bi), P3 = ai * (br + bi)
+//  D  = dr + j di = (P1 - P3) + j (P1 + P2)
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <
+    /// Shape of the warp-level GEMM problem - concept: gemm::GemmShape<>
+    typename WarpShape_,
+    /// Shape of one matrix-multiply instruction (concept: GemmShape)
+    typename InstructionShape_,
+    /// Real-valued underlying type of complex-valued A operand
+    typename RealElementA,
+    /// Layout of A matrix (concept: MatrixLayout)
+    typename LayoutA,
+    /// Real-valued underlying type of complex-valued B operand
+    typename RealElementB,
+    /// Layout of B matrix (concept: MatrixLayout)
+    typename LayoutB,
+    /// Real-valued underlying type of complex-valued C operand
+    typename RealElementC,
+    /// Layout of C matrix (concept: MatrixLayout)
+    typename LayoutC,
+    /// Complex transform on A operand
+    ComplexTransform TransformA,
+    /// Complex transform on B operand
+    ComplexTransform TransformB>
+struct DefaultMmaComplexTensorOp<
+    WarpShape_,
+    InstructionShape_,
+    complex<RealElementA>,
+    LayoutA,
+    complex<RealElementB>,
+    LayoutB,
+    complex<RealElementC>,
+    LayoutC,
+    TransformA,
+    TransformB,
+    arch::OpMultiplyAddGaussianComplex> {
+  // Policy: names the underlying real-valued instruction, arch::Mma over RealElementA/B/C with arch::OpMultiplyAdd
+  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
+      cutlass::arch::Mma<
+        InstructionShape_, 
+        32, 
+        RealElementA,
+        cutlass::layout::RowMajor,
+        RealElementB,
+        cutlass::layout::ColumnMajor,
+        RealElementC,
+        cutlass::layout::RowMajor, 
+        arch::OpMultiplyAdd>,
+      cutlass::MatrixShape<1, 1>
+    >;
+
+  // Define the warp-level tensor op: the Gaussian-complex operator type, with the caller's layouts, Policy and transforms
+  using Type = cutlass::gemm::warp::MmaGaussianComplexTensorOp<
+    WarpShape_,
+    complex<RealElementA>,
+    LayoutA,
+    complex<RealElementB>,
+    LayoutB,
+    complex<RealElementC>,
+    LayoutC, 
+    Policy,
+    TransformA,
+    TransformB>;
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Partial specialization - input and output types are complex<float>*complex<float> (Operator = arch::OpMultiplyAddComplex)
+//  Use TF32 tensor operation internally
+//  4 real-valued mma.sync.aligned.m16n8k8.f32.tf32.tf32.f32 operations on TF32
+//  A = (ar + j ai), B = (br + j bi), D = AB
+//  D = dr + j di = (ar*br - ai*bi) + j (ar*bi + ai*br)
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <
+    /// Shape of the warp-level GEMM problem - concept: gemm::GemmShape<>
+    typename WarpShape_,
+    /// Shape of one matrix-multiply instruction (concept: GemmShape)
+    typename InstructionShape_,
+    /// Layout of A matrix (concept: MatrixLayout)
+    typename LayoutA,
+    /// Layout of B matrix (concept: MatrixLayout)
+    typename LayoutB,
+    /// Layout of C matrix (concept: MatrixLayout)
+    typename LayoutC,
+    /// Complex transform on A operand
+    ComplexTransform TransformA,
+    /// Complex transform on B operand
+    ComplexTransform TransformB>
+struct DefaultMmaComplexTensorOp<
+    WarpShape_,
+    InstructionShape_,
+    complex<float>,
+    LayoutA,
+    complex<float>,
+    LayoutB,
+    complex<float>,
+    LayoutC,
+    TransformA,
+    TransformB,
+    arch::OpMultiplyAddComplex> {
+
+  // Policy: the underlying instruction takes tfloat32_t A/B inputs and accumulates in float (mma.sync.aligned.m16n8k8.f32.tf32.tf32.f32)
+  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
+      cutlass::arch::Mma<
+        InstructionShape_, 
+        32, 
+        tfloat32_t,
+        cutlass::layout::RowMajor,
+        tfloat32_t,
+        cutlass::layout::ColumnMajor,
+        float,
+        cutlass::layout::RowMajor, 
+        arch::OpMultiplyAdd>,
+      cutlass::MatrixShape<1, 1>
+    >;
+
+  // Define the warp-level tensor op: operands stay complex<float>; only Policy refers to the TF32 instruction
+  using Type = cutlass::gemm::warp::MmaComplexTensorOp<
+    WarpShape_,
+    complex<float>,
+    LayoutA,
+    complex<float>,
+    LayoutB,
+    complex<float>,
+    LayoutC, 
+    Policy,
+    TransformA,
+    TransformB>;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Partial specialization - input and output types are complex<float>*complex<float> (Operator = arch::OpMultiplyAddFastBF16)
+//  Use BF16 tensor operation internally
+//  4 real-valued mma.sync.aligned.m16n8k8.f32.bf16.bf16.f32 operations on BF16
+//  A = (ar + j ai), B = (br + j bi), D = AB
+//  D = dr + j di = (ar*br - ai*bi) + j (ar*bi + ai*br)
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <
+    /// Shape of the warp-level GEMM problem - concept: gemm::GemmShape<>
+    typename WarpShape_,
+    /// Shape of one matrix-multiply instruction (concept: GemmShape)
+    typename InstructionShape_,
+    /// Layout of A matrix (concept: MatrixLayout)
+    typename LayoutA,
+    /// Layout of B matrix (concept: MatrixLayout)
+    typename LayoutB,
+    /// Layout of C matrix (concept: MatrixLayout)
+    typename LayoutC,
+    /// Complex transform on A operand
+    ComplexTransform TransformA,
+    /// Complex transform on B operand
+    ComplexTransform TransformB>
+struct DefaultMmaComplexTensorOp<
+    WarpShape_,
+    InstructionShape_,
+    complex<float>,
+    LayoutA,
+    complex<float>,
+    LayoutB,
+    complex<float>,
+    LayoutC,
+    TransformA,
+    TransformB,
+    arch::OpMultiplyAddFastBF16> {
+
+  // Policy: the underlying instruction takes bfloat16_t A/B inputs and accumulates in float (mma.sync.aligned.m16n8k8.f32.bf16.bf16.f32)
+  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
+      cutlass::arch::Mma<
+        InstructionShape_, 
+        32, 
+        bfloat16_t,
+        cutlass::layout::RowMajor,
+        bfloat16_t,
+        cutlass::layout::ColumnMajor,
+        float,
+        cutlass::layout::RowMajor, 
+        arch::OpMultiplyAdd>,
+      cutlass::MatrixShape<1, 1>
+    >;
+
+  // Define the warp-level tensor op: operands stay complex<float>; only Policy refers to the BF16 instruction
+  using Type = cutlass::gemm::warp::MmaComplexTensorOp<
+    WarpShape_,
+    complex<float>,
+    LayoutA,
+    complex<float>,
+    LayoutB,
+    complex<float>,
+    LayoutC, 
+    Policy,
+    TransformA,
+    TransformB>;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Partial specialization - input and output types are complex<float>*complex<float> (Operator = arch::OpMultiplyAddFastF16)
+//  Use F16 tensor operation internally
+//  4 real-valued mma.sync.aligned.m16n8k8.f32.f16.f16.f32 operations on F16
+//  A = (ar + j ai), B = (br + j bi), D = AB
+//  D = dr + j di = (ar*br - ai*bi) + j (ar*bi + ai*br)
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <
+    /// Shape of the warp-level GEMM problem - concept: gemm::GemmShape<>
+    typename WarpShape_,
+    /// Shape of one matrix-multiply instruction (concept: GemmShape)
+    typename InstructionShape_,
+    /// Layout of A matrix (concept: MatrixLayout)
+    typename LayoutA,
+    /// Layout of B matrix (concept: MatrixLayout)
+    typename LayoutB,
+    /// Layout of C matrix (concept: MatrixLayout)
+    typename LayoutC,
+    /// Complex transform on A operand
+    ComplexTransform TransformA,
+    /// Complex transform on B operand
+    ComplexTransform TransformB>
+struct DefaultMmaComplexTensorOp<
+    WarpShape_,
+    InstructionShape_,
+    complex<float>,
+    LayoutA,
+    complex<float>,
+    LayoutB,
+    complex<float>,
+    LayoutC,
+    TransformA,
+    TransformB,
+    arch::OpMultiplyAddFastF16> {
+
+  // Policy: the underlying instruction takes half_t A/B inputs and accumulates in float (mma.sync.aligned.m16n8k8.f32.f16.f16.f32)
+  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
+      cutlass::arch::Mma<
+        InstructionShape_, 
+        32, 
+        half_t,
+        cutlass::layout::RowMajor,
+        half_t,
+        cutlass::layout::ColumnMajor,
+        float,
+        cutlass::layout::RowMajor, 
+        arch::OpMultiplyAdd>,
+      cutlass::MatrixShape<1, 1>
+    >;
+
+  // Define the warp-level tensor op: operands stay complex<float>; only Policy refers to the F16 instruction
+  using Type = cutlass::gemm::warp::MmaComplexTensorOp<
+    WarpShape_,
+    complex<float>,
+    LayoutA,
+    complex<float>,
+    LayoutB,
+    complex<float>,
+    LayoutC, 
+    Policy,
+    TransformA,
+    TransformB>;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// 3xTF32 or 4xTF32 (fast and accurate complex<float> operation), Operator = arch::OpMultiplyAddComplexFastF32
+/// Partial specialization - input and output types are complex<float> * complex<float>
+//  Use 3xTF32 or 4xTF32 tensor operation internally
+//  4 real-valued mma.sync.aligned.m16n8k8.f32.tf32.tf32.f32 operations on TF32
+//  A = (ar + j ai), B = (br + j bi), D = AB
+//  D = dr + j di = 3x[(ar*br - ai*bi) + j (ar*bi + ai*br)]
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Shape of the warp-level GEMM problem - concept: gemm::GemmShape<>
+    typename WarpShape_,
+    /// Shape of one matrix-multiply instruction (concept: GemmShape)
+    typename InstructionShape_,
+    /// Layout of A matrix (concept: MatrixLayout)
+    typename LayoutA,
+    /// Layout of B matrix (concept: MatrixLayout)
+    typename LayoutB,
+    /// Layout of C matrix (concept: MatrixLayout)
+    typename LayoutC,
+    /// Complex transform on A operand
+    ComplexTransform TransformA,
+    /// Complex transform on B operand
+    ComplexTransform TransformB>
+struct DefaultMmaComplexTensorOp<
+    WarpShape_,
+    InstructionShape_,
+    complex<float>,
+    LayoutA,
+    complex<float>,
+    LayoutB,
+    complex<float>,
+    LayoutC,
+    TransformA,
+    TransformB,
+    arch::OpMultiplyAddComplexFastF32> {
+
+  // Policy: the underlying instruction takes tfloat32_t A/B inputs and accumulates in float (mma.sync.aligned.m16n8k8.f32.tf32.tf32.f32)
+  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
+      cutlass::arch::Mma<
+        InstructionShape_, 
+        32, 
+        tfloat32_t,
+        cutlass::layout::RowMajor,
+        tfloat32_t,
+        cutlass::layout::ColumnMajor,
+        float,
+        cutlass::layout::RowMajor, 
+        arch::OpMultiplyAdd>,
+      cutlass::MatrixShape<1, 1>
+    >;
+
+  // Define the warp-level tensor op: same Policy as the plain TF32 case, but the FastF32 operator type is selected
+  using Type = cutlass::gemm::warp::MmaComplexTensorOpFastF32<
+    WarpShape_,
+    complex<float>,
+    LayoutA,
+    complex<float>,
+    LayoutB,
+    complex<float>,
+    LayoutC, 
+    Policy,
+    TransformA,
+    TransformB>;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization selected by InstructionShape = GemmShape<16, 8, 4> (intended for complex<double>*complex<double>; RealElementA/B/C stay generic)
+//  4 real-valued mma.sync.aligned.m16n8k4.f64.f64.f64.f64 operations
+//  A = (ar + j ai), B = (br + j bi), D = AB
+//  D = dr + j di = (ar*br - ai*bi) + j (ar*bi + ai*br)
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <
+    /// Shape of the warp-level GEMM problem - concept: gemm::GemmShape<>
+    typename WarpShape_,
+    /// Real-valued underlying type of complex-valued A operand
+    typename RealElementA,
+    /// Layout of A matrix (concept: MatrixLayout)
+    typename LayoutA,
+    /// Real-valued underlying type of complex-valued B operand
+    typename RealElementB,
+    /// Layout of B matrix (concept: MatrixLayout)
+    typename LayoutB,
+    /// Real-valued underlying type of complex-valued C operand
+    typename RealElementC,
+    /// Layout of C matrix (concept: MatrixLayout)
+    typename LayoutC,
+    /// Complex transform on A operand
+    ComplexTransform TransformA,
+    /// Complex transform on B operand
+    ComplexTransform TransformB>
+struct DefaultMmaComplexTensorOp<
+    WarpShape_,
+    GemmShape<16, 8, 4>,
+    complex<RealElementA>,
+    LayoutA,
+    complex<RealElementB>,
+    LayoutB,
+    complex<RealElementC>,
+    LayoutC,
+    TransformA,
+    TransformB,
+    arch::OpMultiplyAddComplex> {
+  // Policy: names the underlying real-valued instruction, arch::Mma of shape GemmShape<16, 8, 4> over RealElementA/B/C
+  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
+      cutlass::arch::Mma<
+        GemmShape<16, 8, 4>,
+        32, 
+        RealElementA,
+        cutlass::layout::RowMajor,
+        RealElementB,
+        cutlass::layout::ColumnMajor,
+        RealElementC,
+        cutlass::layout::RowMajor, 
+        arch::OpMultiplyAdd>,
+      cutlass::MatrixShape<1, 1>
+    >;
+
+  // Define the warp-level tensor op: complex-valued operands with the caller's layouts, Policy and transforms
+  using Type = cutlass::gemm::warp::MmaComplexTensorOp<
+    WarpShape_,
+    complex<RealElementA>,
+    LayoutA,
+    complex<RealElementB>,
+    LayoutB,
+    complex<RealElementC>,
+    LayoutC, 
+    Policy,
+    TransformA,
+    TransformB,
+    true>;   // NOTE(review): extra boolean argument that the non-16x8x4 specializations do not pass; its meaning is defined by MmaComplexTensorOp - confirm there
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Partial specialization for complex<T>*complex<T> case using GaussianComplex operation, selected by InstructionShape = GemmShape<16, 8, 4>
+//  3 real-valued mma.sync.aligned.m16n8k4.f64.f64.f64.f64 operations
+//  A  = (ar + j ai), B = (br + j bi), D = AB
+//  P1 = (ar + ai) * br, P2 = - ar * (br - bi), P3 = ai * (br + bi)
+//  D  = dr + j di = (P1 - P3) + j (P1 + P2)
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <
+    /// Shape of the warp-level GEMM problem - concept: gemm::GemmShape<>
+    typename WarpShape_,
+    /// Real-valued underlying type of complex-valued A operand
+    typename RealElementA,
+    /// Layout of A matrix (concept: MatrixLayout)
+    typename LayoutA,
+    /// Real-valued underlying type of complex-valued B operand
+    typename RealElementB,
+    /// Layout of B matrix (concept: MatrixLayout)
+    typename LayoutB,
+    /// Real-valued underlying type of complex-valued C operand
+    typename RealElementC,
+    /// Layout of C matrix (concept: MatrixLayout)
+    typename LayoutC,
+    /// Complex transform on A operand
+    ComplexTransform TransformA,
+    /// Complex transform on B operand
+    ComplexTransform TransformB>
+struct DefaultMmaComplexTensorOp<
+    WarpShape_,
+    GemmShape<16, 8, 4>,
+    complex<RealElementA>,
+    LayoutA,
+    complex<RealElementB>,
+    LayoutB,
+    complex<RealElementC>,
+    LayoutC,
+    TransformA,
+    TransformB,
+    arch::OpMultiplyAddGaussianComplex> {
+  // Policy: names the underlying real-valued instruction, arch::Mma of shape GemmShape<16, 8, 4> over RealElementA/B/C
+  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
+      cutlass::arch::Mma<
+        GemmShape<16, 8, 4>,
+        32, 
+        RealElementA,
+        cutlass::layout::RowMajor,
+        RealElementB,
+        cutlass::layout::ColumnMajor,
+        RealElementC,
+        cutlass::layout::RowMajor, 
+        arch::OpMultiplyAdd>,
+      cutlass::MatrixShape<1, 1>
+    >;
+
+  // Define the warp-level tensor op: the Gaussian-complex operator type, with the caller's layouts, Policy and transforms
+  using Type = cutlass::gemm::warp::MmaGaussianComplexTensorOp<
+    WarpShape_,
+    complex<RealElementA>,
+    LayoutA,
+    complex<RealElementB>,
+    LayoutB,
+    complex<RealElementC>,
+    LayoutC, 
+    Policy,
+    TransformA,
+    TransformB,
+    true>;   // NOTE(review): extra boolean argument that the non-16x8x4 specializations do not pass; its meaning is defined by MmaGaussianComplexTensorOp - confirm there
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace gemm
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_sparse_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_sparse_tensor_op.h
new file mode 100755
index 000000000..223426544
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_sparse_tensor_op.h
@@ -0,0 +1,165 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Default warp-level GEMM operators selected by data type, size, and layouts of operands.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/warp/mma_sparse_tensor_op.h"
+
+namespace cutlass {
+namespace gemm {
+namespace warp {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Primary template (declaration with default arguments): selects the default warp-level sparse tensor-op MMA.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename WarpShape_,
+    /// Shape of the target matrix multiply instruction (concept: GemmShape)
+    typename InstructionShape_,
+    /// Data type of A elements
+    typename ElementA_,
+    /// Layout of A matrix (concept: MatrixLayout)
+    typename LayoutA_,
+    /// Data type of B elements
+    typename ElementB_,
+    /// Layout of B matrix (concept: MatrixLayout)
+    typename LayoutB_,
+    /// Element type of C matrix
+    typename ElementC_,
+    /// Layout of C matrix (concept: MatrixLayout)
+    typename LayoutC_,
+    /// Operator describing the tensor operation (defaults to arch::OpMultiplyAdd)
+    typename Operator_ = arch::OpMultiplyAdd,
+    /// Number of partitions along K dimension (defaults to 1)
+    int PartitionsK = 1,
+    /// Store the accumulators in row major or column major.  Row major is used
+    /// when output layout is interleaved (defaults to false).
+    bool AccumulatorsInRowMajor = false
+>
+struct DefaultSparseMmaTensorOp;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for float A, B and C with arch::OpMultiplyAdd - the arch::SparseMma policy uses tfloat32_t operands
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename WarpShape_,
+    /// Shape of target matrix multiply instruction (concept: GemmShape)
+    typename InstructionShape_,
+    /// Layout of A matrix (concept: MatrixLayout)
+    typename LayoutA,
+    /// Layout of B matrix (concept: MatrixLayout)
+    typename LayoutB,
+    /// Layout of C matrix (concept: MatrixLayout)
+    typename LayoutC,
+    /// Number of partitions along K dimension
+    int PartitionsK,
+    /// Store the accumulators in row major or column major.  Row major is used
+    /// when output layout is interleaved.
+    bool AccumulatorsInRowMajor>
+struct DefaultSparseMmaTensorOp<
+  WarpShape_, 
+  InstructionShape_, 
+  float, LayoutA, 
+  float, LayoutB, 
+  float, LayoutC, 
+  arch::OpMultiplyAdd, PartitionsK, AccumulatorsInRowMajor> {
+
+  // Instruction-level policy: tfloat32_t A (row-major) and B (column-major) operands, float accumulator (row-major)
+  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
+      cutlass::arch::SparseMma<
+        InstructionShape_, 
+        32, 
+        tfloat32_t, cutlass::layout::RowMajor, 
+        tfloat32_t, cutlass::layout::ColumnMajor,
+        float, cutlass::layout::RowMajor, 
+        arch::OpMultiplyAdd
+      >,
+      cutlass::MatrixShape<1, 1> >;
+
+  // Define the warp-level tensor op; it keeps the float element types and the caller's layouts
+  using Type = cutlass::gemm::warp::SparseMmaTensorOp<
+      WarpShape_, float, LayoutA, float, LayoutB, float, LayoutC,
+      Policy, PartitionsK, AccumulatorsInRowMajor>;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Primary template definition (generic case): used whenever no partial specialization of DefaultSparseMmaTensorOp matches
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Data type of A elements
+    typename ElementA,
+    /// Layout of A matrix (concept: MatrixLayout)
+    typename LayoutA,
+    /// Data type of B elements
+    typename ElementB,
+    /// Layout of B matrix (concept: MatrixLayout)
+    typename LayoutB,
+    /// Element type of C matrix
+    typename ElementC,
+    /// Layout of C matrix (concept: MatrixLayout)
+    typename LayoutC,
+    /// Operator describing the tensor operation
+    typename Operator_,
+    /// Number of partitions along K dimension
+    int PartitionsK,
+    /// Store the accumulators in row major or column major.  Row major is used
+    /// when output layout is interleaved.
+    bool AccumulatorsInRowMajor>
+struct DefaultSparseMmaTensorOp {
+  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
+      cutlass::arch::SparseMma<InstructionShape_, 32, ElementA,
+                               cutlass::layout::RowMajor, ElementB,
+                               cutlass::layout::ColumnMajor, ElementC,
+                               cutlass::layout::RowMajor, Operator_>,
+      cutlass::MatrixShape<1, 1> >;
+
+  // Define the warp-level tensor op; element types and layouts are forwarded unchanged
+  using Type = cutlass::gemm::warp::SparseMmaTensorOp<
+      WarpShape_, ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
+      Policy, PartitionsK, AccumulatorsInRowMajor>;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_tensor_op.h
new file mode 100755
index 000000000..3a8cacd3d
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_tensor_op.h
@@ -0,0 +1,123 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Default warp-level GEMM operators selected by data type, size, and layouts of operands.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/warp/mma_tensor_op.h"
+
+namespace cutlass {
+namespace gemm {
+namespace warp {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Primary template (declaration with default arguments): selects the default warp-level tensor-op MMA.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename WarpShape_,
+    /// Shape of the target matrix multiply instruction (concept: GemmShape)
+    typename InstructionShape_,
+    /// Data type of A elements
+    typename ElementA_,
+    /// Layout of A matrix (concept: MatrixLayout)
+    typename LayoutA_,
+    /// Data type of B elements
+    typename ElementB_,
+    /// Layout of B matrix (concept: MatrixLayout)
+    typename LayoutB_,
+    /// Element type of C matrix
+    typename ElementC_,
+    /// Layout of C matrix (concept: MatrixLayout)
+    typename LayoutC_,
+    /// Operator describing the tensor operation (defaults to arch::OpMultiplyAdd)
+    typename Operator_ = arch::OpMultiplyAdd,
+    /// Number of partitions along K dimension (defaults to 1)
+    int PartitionsK = 1,
+    /// Store the accumulators in row major or column major.  Row major is used
+    /// when output layout is interleaved (defaults to false).
+    bool AccumulatorsInRowMajor = false>
+struct DefaultMmaTensorOp;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Primary template definition (generic case): used whenever no partial specialization of DefaultMmaTensorOp matches
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Data type of A elements
+    typename ElementA,
+    /// Layout of A matrix (concept: MatrixLayout)
+    typename LayoutA,
+    /// Data type of B elements
+    typename ElementB,
+    /// Layout of B matrix (concept: MatrixLayout)
+    typename LayoutB,
+    /// Element type of C matrix
+    typename ElementC,
+    /// Layout of C matrix (concept: MatrixLayout)
+    typename LayoutC,
+    /// Operator describing the tensor operation
+    typename Operator_,
+    /// Number of partitions along K dimension
+    int PartitionsK,
+    /// Store the accumulators in row major or column major.  Row major is used
+    /// when output layout is interleaved.
+    bool AccumulatorsInRowMajor>
+struct DefaultMmaTensorOp {
+  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
+      cutlass::arch::Mma<InstructionShape_, 32, ElementA,
+                         cutlass::layout::RowMajor, ElementB,
+                         cutlass::layout::ColumnMajor, ElementC,
+                         cutlass::layout::RowMajor, Operator_>,
+      cutlass::MatrixShape<1, 1> >;
+
+  // Define the warp-level tensor op; element types and layouts are forwarded unchanged
+  using Type = cutlass::gemm::warp::MmaTensorOp<
+      WarpShape_, ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
+      Policy, PartitionsK, AccumulatorsInRowMajor>;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "cutlass/gemm/warp/default_mma_tensor_op_sm80.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_tensor_op_sm80.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_tensor_op_sm80.h
new file mode 100755
index 000000000..67fcde77e
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_tensor_op_sm80.h
@@ -0,0 +1,375 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Default warp-level GEMM operators selected by data type, size, and layouts of operands.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/mma.h"
+#include "cutlass/gemm/warp/mma_tensor_op.h"
+#include "cutlass/gemm/warp/mma_mixed_input_tensor_op.h"
+#include "cutlass/gemm/warp/mma_tensor_op_fast_f32.h"
+#include "cutlass/gemm/warp/default_mma_tensor_op.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace warp {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for float A, B and C with arch::OpMultiplyAddFastBF16 and a 16x8x8 instruction - the arch::Mma policy uses bfloat16_t operands
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename WarpShape_,
+    /// Layout of A matrix (concept: MatrixLayout)
+    typename LayoutA,
+    /// Layout of B matrix (concept: MatrixLayout)
+    typename LayoutB,
+    /// Layout of C matrix (concept: MatrixLayout)
+    typename LayoutC,
+    /// Number of partitions along K dimension
+    int PartitionsK,
+    /// Store the accumulators in row major or column major.  Row major is used
+    /// when output layout is interleaved.
+    bool AccumulatorsInRowMajor>
+struct DefaultMmaTensorOp<
+  WarpShape_, 
+  GemmShape<16, 8, 8>, 
+  float, LayoutA, 
+  float, LayoutB, 
+  float, LayoutC, 
+  arch::OpMultiplyAddFastBF16, 
+  PartitionsK, AccumulatorsInRowMajor> {
+
+  // Instruction-level policy: bfloat16_t A (row-major) and B (column-major) operands, float accumulator (row-major)
+  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
+      cutlass::arch::Mma<
+        GemmShape<16, 8, 8>, 
+        32, 
+        bfloat16_t, cutlass::layout::RowMajor, 
+        bfloat16_t, cutlass::layout::ColumnMajor,
+        float, cutlass::layout::RowMajor, 
+        arch::OpMultiplyAdd
+      >,
+      cutlass::MatrixShape<1, 1> >;
+
+  // Define the warp-level tensor op; it keeps the float element types and the caller's layouts
+  using Type = cutlass::gemm::warp::MmaTensorOp<
+      WarpShape_, float, LayoutA, float, LayoutB, float, LayoutC,
+      Policy, PartitionsK, AccumulatorsInRowMajor>;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for float A, B and C with arch::OpMultiplyAddFastF16 and a 16x8x8 instruction - the arch::Mma policy uses half_t operands
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename WarpShape_,
+    /// Layout of A matrix (concept: MatrixLayout)
+    typename LayoutA,
+    /// Layout of B matrix (concept: MatrixLayout)
+    typename LayoutB,
+    /// Layout of C matrix (concept: MatrixLayout)
+    typename LayoutC,
+    /// Number of partitions along K dimension
+    int PartitionsK,
+    /// Store the accumulators in row major or column major.  Row major is used
+    /// when output layout is interleaved.
+    bool AccumulatorsInRowMajor>
+struct DefaultMmaTensorOp<
+  WarpShape_, 
+  GemmShape<16, 8, 8>, 
+  float, LayoutA, 
+  float, LayoutB, 
+  float, LayoutC, 
+  arch::OpMultiplyAddFastF16, 
+  PartitionsK, AccumulatorsInRowMajor> {
+
+  // Instruction-level policy: half_t A (row-major) and B (column-major) operands, float accumulator (row-major)
+  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
+      cutlass::arch::Mma<
+        GemmShape<16, 8, 8>, 
+        32, 
+        half_t, cutlass::layout::RowMajor, 
+        half_t, cutlass::layout::ColumnMajor,
+        float, cutlass::layout::RowMajor, 
+        arch::OpMultiplyAdd
+      >,
+      cutlass::MatrixShape<1, 1> >;
+
+  // Define the warp-level tensor op; it keeps the float element types and the caller's layouts
+  using Type = cutlass::gemm::warp::MmaTensorOp<
+      WarpShape_, float, LayoutA, float, LayoutB, float, LayoutC,
+      Policy, PartitionsK, AccumulatorsInRowMajor>;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for float A, B and C with arch::OpMultiplyAdd (any instruction shape) - the arch::Mma policy uses tfloat32_t operands
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename WarpShape_,
+    /// Shape of target matrix multiply instruction (concept: GemmShape)
+    typename InstructionShape_,
+    /// Layout of A matrix (concept: MatrixLayout)
+    typename LayoutA,
+    /// Layout of B matrix (concept: MatrixLayout)
+    typename LayoutB,
+    /// Layout of C matrix (concept: MatrixLayout)
+    typename LayoutC,
+    /// Number of partitions along K dimension
+    int PartitionsK,
+    /// Store the accumulators in row major or column major.  Row major is used
+    /// when output layout is interleaved.
+    bool AccumulatorsInRowMajor>
+struct DefaultMmaTensorOp<
+  WarpShape_, 
+  InstructionShape_, 
+  float, LayoutA, 
+  float, LayoutB, 
+  float, LayoutC, 
+  arch::OpMultiplyAdd, PartitionsK, AccumulatorsInRowMajor> {
+
+  // Instruction-level policy: tfloat32_t A (row-major) and B (column-major) operands, float accumulator (row-major)
+  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
+      cutlass::arch::Mma<
+        InstructionShape_, 
+        32, 
+        tfloat32_t, cutlass::layout::RowMajor, 
+        tfloat32_t, cutlass::layout::ColumnMajor,
+        float, cutlass::layout::RowMajor, 
+        arch::OpMultiplyAdd
+      >,
+      cutlass::MatrixShape<1, 1> >;
+
+  // Define the warp-level tensor op; it keeps the float element types and the caller's layouts
+  using Type = cutlass::gemm::warp::MmaTensorOp<
+      WarpShape_, float, LayoutA, float, LayoutB, float, LayoutC,
+      Policy, PartitionsK, AccumulatorsInRowMajor>;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for float A, B and C with arch::OpMultiplyAddFastF32 - tfloat32_t arch::Mma policy wrapped by MmaTensorOpFastF32
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename WarpShape_,
+    /// Shape of target matrix multiply instruction (concept: GemmShape)
+    typename InstructionShape_,
+    /// Layout of A matrix (concept: MatrixLayout)
+    typename LayoutA,
+    /// Layout of B matrix (concept: MatrixLayout)
+    typename LayoutB,
+    /// Layout of C matrix (concept: MatrixLayout)
+    typename LayoutC,
+    /// Number of partitions along K dimension
+    int PartitionsK,
+    /// Store the accumulators in row major or column major.  Row major is used
+    /// when output layout is interleaved.
+    bool AccumulatorsInRowMajor>
+struct DefaultMmaTensorOp<
+  WarpShape_, 
+  InstructionShape_, 
+  float, LayoutA, 
+  float, LayoutB, 
+  float, LayoutC, 
+  arch::OpMultiplyAddFastF32, PartitionsK, AccumulatorsInRowMajor> {
+
+  // Instruction-level policy: tfloat32_t A (row-major) and B (column-major) operands, float accumulator (row-major)
+  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
+      cutlass::arch::Mma<
+        InstructionShape_, 
+        32, 
+        cutlass::tfloat32_t, cutlass::layout::RowMajor, 
+        cutlass::tfloat32_t, cutlass::layout::ColumnMajor,
+        float, cutlass::layout::RowMajor, 
+        arch::OpMultiplyAdd
+      >,
+      cutlass::MatrixShape<1, 1> >;
+
+  // Define the warp-level tensor op; unlike the other float specializations this selects MmaTensorOpFastF32
+  using Type = cutlass::gemm::warp::MmaTensorOpFastF32<
+      WarpShape_, float, LayoutA, float, LayoutB, float, LayoutC,
+      Policy, PartitionsK, AccumulatorsInRowMajor>;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for arch::OpMultiplyAddMixedInputUpcast with a 16x8x16 instruction - A and B differ in type; the wider one is used internally.
+/// (e.g. F16 <= F16 x S8 + F16, F16 <= BF16 x S8 + F32)
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename WarpShape_,
+    /// Element type of A matrix
+    typename ElementA,
+    /// Layout of A matrix (concept: MatrixLayout)
+    typename LayoutA,
+    /// Element type of B matrix
+    typename ElementB,
+    /// Layout of B matrix (concept: MatrixLayout)
+    typename LayoutB,
+    /// Element type of C matrix
+    typename ElementC,
+    /// Layout of C matrix (concept: MatrixLayout)
+    typename LayoutC,
+    /// Number of partitions along K dimension
+    int PartitionsK,
+    /// Store the accumulators in row major or column major.  Row major is used
+    /// when output layout is interleaved.
+    bool AccumulatorsInRowMajor>
+struct DefaultMmaTensorOp<
+  WarpShape_,
+  GemmShape<16, 8, 16>,                 // InstructionShape
+  ElementA,                             // Element type of A matrix in Global Memory
+  LayoutA,                              // Layout of A matrix in Global Memory
+  ElementB,                             // Element type of B matrix in Global Memory
+  LayoutB,                              // Layout of B matrix in Global Memory
+  ElementC,                             // Element type of C matrix in Global Memory
+  LayoutC,                              // Layout of C matrix in Global Memory
+  arch::OpMultiplyAddMixedInputUpcast,  // Tag to indicate mixed-input datatype, where narrower datatype is upcasted to wider datatype
+  PartitionsK, AccumulatorsInRowMajor> {
+
+
+  // Reject instantiations where ElementA and ElementB are the same type (this specialization is for mixed inputs only)
+  static_assert(!platform::is_same<ElementA, ElementB>::value,
+    "DefaultMmaTensorOp with arch::OpMultiplyAddMixedInputUpcast ElementA and ElementB cannot be of the same data type");
+
+  // Data type used for internal computation - the wider of the two data types (ElementB when both have equal bit width)
+  using ElementOperand = typename platform::conditional<(sizeof_bits<ElementA>::value > sizeof_bits<ElementB>::value),
+                                                    ElementA, ElementB>::type;
+
+  // Operand datatypes in the internal MMA instruction - use the wider of the two data types
+  using ElementAMma = ElementOperand;
+  using ElementBMma = ElementOperand;
+  using MmaElementC = ElementC;
+
+  // Instruction-level policy: both operands use ElementOperand, accumulator is ElementC, operator is arch::OpMultiplyAdd
+  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
+      cutlass::arch::Mma<
+        GemmShape<16, 8, 16>,
+        32,
+        ElementAMma, cutlass::layout::RowMajor,
+        ElementBMma, cutlass::layout::ColumnMajor,
+        MmaElementC, cutlass::layout::RowMajor,
+        arch::OpMultiplyAdd
+      >,
+      cutlass::MatrixShape<1, 1> >;
+
+  // Define the warp-level tensor op; it is given the original (un-widened) ElementA / ElementB types
+  using Type = cutlass::gemm::warp::MmaMixedInputTensorOp<
+      WarpShape_, ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
+      Policy, PartitionsK, AccumulatorsInRowMajor>;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for arch::OpMultiplyAddMixedInputUpcast with a 16x8x32 instruction - A and B differ in type; the wider one is used internally.
+/// (e.g. S32 <= S4 x S8 + S32, S32 <= S8 x S4 + S32)
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename WarpShape_,
+    /// Element type of A matrix
+    typename ElementA,
+    /// Layout of A matrix (concept: MatrixLayout)
+    typename LayoutA,
+    /// Element type of B matrix
+    typename ElementB,
+    /// Layout of B matrix (concept: MatrixLayout)
+    typename LayoutB,
+    /// Element type of C matrix
+    typename ElementC,
+    /// Layout of C matrix (concept: MatrixLayout)
+    typename LayoutC,
+    /// Number of partitions along K dimension
+    int PartitionsK,
+    /// Store the accumulators in row major or column major.  Row major is used
+    /// when output layout is interleaved.
+    bool AccumulatorsInRowMajor>
+struct DefaultMmaTensorOp<
+  WarpShape_,
+  GemmShape<16, 8, 32>,                 // InstructionShape
+  ElementA,                             // Element type of A matrix in Global Memory
+  LayoutA,                              // Layout of A matrix in Global Memory
+  ElementB,                             // Element type of B matrix in Global Memory
+  LayoutB,                              // Layout of B matrix in Global Memory
+  ElementC,                             // Element type of C matrix in Global Memory
+  LayoutC,                              // Layout of C matrix in Global Memory
+  arch::OpMultiplyAddMixedInputUpcast,  // Tag to indicate mixed-input datatype, where narrower datatype is upcasted to wider datatype
+  PartitionsK, AccumulatorsInRowMajor> {
+
+
+  // Reject instantiations where ElementA and ElementB are the same type (this specialization is for mixed inputs only)
+  static_assert(!platform::is_same<ElementA, ElementB>::value,
+    "DefaultMmaTensorOp with arch::OpMultiplyAddMixedInputUpcast ElementA and ElementB cannot be of the same data type");
+
+  // Data type used for internal computation - the wider of the two data types (ElementB when both have equal bit width)
+  using ElementOperand = typename platform::conditional<(sizeof_bits<ElementA>::value > sizeof_bits<ElementB>::value),
+                                                    ElementA, ElementB>::type;
+
+  // Operand datatypes in the internal MMA instruction - use the wider of the two data types
+  using MmaElementA = ElementOperand;
+  using MmaElementB = ElementOperand;
+  using MmaElementC = ElementC;
+
+  // Instruction-level policy: both operands use ElementOperand, accumulator is ElementC, operator is arch::OpMultiplyAddSaturate
+  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
+      cutlass::arch::Mma<
+        GemmShape<16, 8, 32>,
+        32,
+        MmaElementA, cutlass::layout::RowMajor,
+        MmaElementB, cutlass::layout::ColumnMajor,
+        MmaElementC, cutlass::layout::RowMajor,
+        arch::OpMultiplyAddSaturate
+      >,
+      cutlass::MatrixShape<1, 1> >;
+
+  // Define the warp-level tensor op; it is given the original (un-widened) ElementA / ElementB types
+  using Type = cutlass::gemm::warp::MmaMixedInputTensorOp<
+      WarpShape_, ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
+      Policy, PartitionsK, AccumulatorsInRowMajor>;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_with_reduction_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_with_reduction_tensor_op.h
new file mode 100755
index 000000000..db6713cb4
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_with_reduction_tensor_op.h
@@ -0,0 +1,92 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Default warp-level GEMM operators selected by data type, size, and layouts of operands.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/warp/mma_with_reduction_tensor_op.h"
+
+namespace cutlass {
+namespace gemm {
+namespace warp {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Selects the default warp-level tensor-op MMA with reduction: builds an arch::Mma policy and wraps it in MmaWithReductionTensorOp.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Data type of A elements
+    typename ElementA,
+    /// Layout of A matrix (concept: MatrixLayout)
+    typename LayoutA,
+    /// Data type of B elements
+    typename ElementB,
+    /// Layout of B matrix (concept: MatrixLayout)
+    typename LayoutB,
+    /// Element type of C matrix
+    typename ElementC,
+    /// Layout of C matrix (concept: MatrixLayout)
+    typename LayoutC,
+    /// Operator describing the tensor operation (no default here, must be supplied)
+    typename Operator_,
+    /// Reduce operand A or B along K dimension (presumably true selects A - confirm in MmaWithReductionTensorOp)
+    bool ReduceKForA_,
+    /// Number of partitions along K dimension (defaults to 1)
+    int PartitionsK = 1,
+    /// Store the accumulators in row major or column major.  Row major is used
+    /// when output layout is interleaved (defaults to false).
+    bool AccumulatorsInRowMajor = false>
+struct DefaultMmaWithReductionTensorOp {
+  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
+      cutlass::arch::Mma<InstructionShape_, 32, ElementA,
+                         cutlass::layout::RowMajor, ElementB,
+                         cutlass::layout::ColumnMajor, ElementC,
+                         cutlass::layout::RowMajor, Operator_>,
+      cutlass::MatrixShape<1, 1> >;
+
+  // Define the warp-level tensor op; ReduceKForA_ is forwarded right after the policy
+  using Type = cutlass::gemm::warp::MmaWithReductionTensorOp<
+      WarpShape_, ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
+      Policy, ReduceKForA_, PartitionsK, AccumulatorsInRowMajor>;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_wmma_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_wmma_tensor_op.h
new file mode 100755
index 000000000..145e4be7c
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_wmma_tensor_op.h
@@ -0,0 +1,130 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Default warp-level GEMM operators selected by data type, size, and layouts of operands.
+*/
+
+#pragma once
+
+#include "cutlass/arch/wmma.h"
+
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/warp/mma_tensor_op_wmma.h"
+
+namespace cutlass {
+namespace gemm {
+namespace warp {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Size of the Gemm problem (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Data type of A elements
+    typename ElementA_,
+    /// Layout of A matrix (concept: MatrixLayout)
+    typename LayoutA_,
+    /// Data type of B elements
+    typename ElementB_,
+    /// Layout of B matrix (concept: MatrixLayout)
+    typename LayoutB_,
+    /// Element type of C matrix
+    typename ElementC_,
+    /// Layout of C matrix (concept: MatrixLayout)
+    typename LayoutC_,
+    /// Operator describing the tensor operation (defaults to arch::OpMultiplyAdd)
+    typename Operator_ = arch::OpMultiplyAdd,
+    /// Number of partitions along K dimension (defaults to 1)
+    int PartitionsK = 1
+>
+struct DefaultMmaTensorOpWmma;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Primary template definition: builds the arch::Wmma policy and exposes the warp-level operator as ::Type
+template <
+    /// Size of the Gemm problem (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Data type of A elements
+    typename ElementA,
+    /// Layout of A matrix (concept: MatrixLayout)
+    typename LayoutA,
+    /// Data type of B elements
+    typename ElementB,
+    /// Layout of B matrix (concept: MatrixLayout)
+    typename LayoutB,
+    /// Element type of C matrix
+    typename ElementC,
+    /// Layout of C matrix (concept: MatrixLayout)
+    typename LayoutC,
+    /// Operator describing the tensor operation
+    typename Operator_,
+    /// Number of partitions along K dimension
+    int PartitionsK>
+struct DefaultMmaTensorOpWmma {
+  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
+      cutlass::arch::Wmma<
+          InstructionShape_, 
+          ElementA,
+          LayoutA, 
+          ElementB,
+          LayoutB, 
+          ElementC,
+          LayoutC, 
+          Operator_>,
+      cutlass::MatrixShape<1, 1> >;
+
+  // Define the warp-level tensor op from the operand types/layouts and the Policy above
+  using Type = cutlass::gemm::warp::MmaTensorOpWmma<
+        WarpShape_,
+        ElementA, 
+        LayoutA, 
+        ElementB, 
+        LayoutB,
+        ElementC, 
+        LayoutC, 
+        Policy, 
+        PartitionsK>;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace gemm
+} // namespace cutlass
+
+#endif
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/layernorm_scale_bias_transform.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/layernorm_scale_bias_transform.h
new file mode 100755
index 000000000..bbf0090b0
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/layernorm_scale_bias_transform.h
@@ -0,0 +1,139 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing a warp-level layernorm transform (var/mean fma followed by
+   gamma/beta fma) applied to activations before Tensor Core matrix multiply-accumulate.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/platform/platform.h"
+
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/arch/memory_sm75.h"
+#include "cutlass/arch/mma_sm75.h" 
+#include "cutlass/arch/mma_sm80.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/warp/mma.h"
+
+#include "cutlass/gemm/warp/mma_tensor_op_policy.h"
+
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h"
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace warp {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename FragmentActivations, typename FragmentVarMean, typename FragmentGammaBeta>
+struct LayernormScaleBiasTransform {
+  // Warp-level functor: applies a var/mean fma and then a gamma/beta fma to activation fragments.
+  using T = typename FragmentActivations::Element;
+
+  static int const NumActivations = FragmentActivations::kElements;
+  static int const NumVarMean = FragmentVarMean::kElements;
+  static int const NumGammaBeta = FragmentGammaBeta::kElements;
+  static int const MmaElements = 2;
+  // One element has one scale and one bias
+  static int const MmaScaleBiasPair = 2;
+  // 16816 (presumably the 16x8x16 mma instruction shape - TODO confirm) has 2 columns and 2 rows
+  static int const MmaCols = 2;
+  static int const MmaRows = 2;
+
+  using MmaOperand = Array<T, MmaElements>;
+  using VarMeanOperand = Array<__half2, MmaScaleBiasPair>;
+  using GammaBetaOperand = Array<T, MmaElements * MmaScaleBiasPair>;
+  /// Transforms one MmaOperand in place; only implemented for __CUDA_ARCH__ >= 800 (otherwise assert(0)).
+  CUTLASS_DEVICE
+  void transform(MmaOperand &activations,
+                 VarMeanOperand const &var_mean,
+                 GammaBetaOperand const &gamma_beta) {
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
+    uint32_t *ptr_activations = reinterpret_cast<uint32_t *>(&activations);
+    uint32_t const *ptr_var_mean = reinterpret_cast<uint32_t const *>(&var_mean);
+    uint32_t const *ptr_gamma_beta = reinterpret_cast<uint32_t const *>(&gamma_beta);
+
+    // First fma (packed f16x2):  act = var_mean[0] * act + var_mean[1]
+    // Second fma (packed f16x2): act = gamma_beta[0] * act + gamma_beta[1]
+    // NOTE(review): %0 is written by the first fma before %4/%5 are read, with no early-clobber - confirm.
+    // We assume the pair of FP16 values is either both in-bound or both out-of-bound.
+    // This requires C to be an even number.
+    asm volatile(
+        "{\n\t"
+        " fma.rn.f16x2 %0, %1, %2, %3;\n"
+        " fma.rn.f16x2 %0, %4, %0, %5;\n"
+        "}\n"
+        : "=r"(ptr_activations[0])
+        : "r"(ptr_var_mean[0]), "r"(ptr_activations[0]),
+          "r"(ptr_var_mean[1]),
+          "r"(ptr_gamma_beta[0]), "r"(ptr_gamma_beta[1]));
+#else
+    assert(0);
+#endif
+  }
+  /// Applies transform() to each MmaElements-wide slice of the activations fragment.
+  CUTLASS_DEVICE
+  void operator()(FragmentActivations &activations,
+                  FragmentVarMean const &var_mean,
+                  FragmentGammaBeta const &gamma_beta) {
+    MmaOperand *ptr_activations = reinterpret_cast<MmaOperand *>(&activations);
+    VarMeanOperand const *ptr_var_mean =
+        reinterpret_cast<VarMeanOperand const *>(&var_mean);
+    GammaBetaOperand const *ptr_gamma_beta =
+        reinterpret_cast<GammaBetaOperand const *>(&gamma_beta);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < (NumActivations / MmaElements); ++i) {
+      transform(ptr_activations[i],
+                ptr_var_mean[i / (MmaCols * MmaRows) * MmaRows + i % MmaRows],
+                ptr_gamma_beta[(i / MmaScaleBiasPair) % MmaCols]);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace gemm 
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma.h
new file mode 100755
index 000000000..dc210b025
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma.h
@@ -0,0 +1,60 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates exposing architecture support for warp-level multiply-add operations
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace warp {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Query the number of threads per warp; this primary template reports 32 for every OperatorClass
+template <typename OperatorClass>
+struct WarpSize {
+  static int const value = 32;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_complex_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_complex_tensor_op.h
new file mode 100755
index 000000000..2ef8bb42d
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_complex_tensor_op.h
@@ -0,0 +1,1168 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing warp-level matrix multiply-accumulate operations targeting
+      Tensor Cores.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/array.h"
+#include "cutlass/complex.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/functional.h"
+
+#include "cutlass/arch/memory_sm75.h"
+#include "cutlass/arch/mma_sm75.h"
+#include "cutlass/arch/mma_sm80.h"
+#include "cutlass/arch/mma_sm90.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/warp/mma.h"
+
+#include "cutlass/gemm/warp/mma_tensor_op_policy.h"
+#include "cutlass/gemm/warp/mma_tensor_op.h"
+
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h"
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h"
+#include "cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace warp {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+template <
+  /// Data type of real & imag members of complex numbers in the SourceFragment
+  typename RealElement,
+  /// Destination fragment required by the mma operation
+  typename DestinationFragment,
+  /// Source fragment holding complex<RealElement> elements
+  typename SourceFragment,
+  /// Number of mma operations performed (a shape type; read through ::kRow / ::kColumn)
+  typename MmaIterations,
+  /// Shape of operand elements
+  typename MmaOperandShape,
+  /// Complex transform applied to the operand selected by Operand_
+  ComplexTransform Transform_,
+  /// Operand A or Operand B
+  Operand Operand_,
+  /// Floating-point rounding style
+  FloatRoundStyle Round_>
+struct UnpackComplexConvertAndPackForMma;
+
+// Partial specialization for OperandA and Congruous smem layout (source fragment indexed as layout::ColumnMajor)
+template <
+  typename RealElement,
+  typename DestinationFragment, 
+  typename SourceFragment,
+  typename MmaIterations,
+  typename MmaOperandShape,
+  ComplexTransform Transform_,
+  FloatRoundStyle Round_>
+struct UnpackComplexConvertAndPackForMma <
+  RealElement,
+  DestinationFragment,
+  SourceFragment,
+  MmaIterations,
+  MmaOperandShape,
+  Transform_,
+  Operand::kA,
+  Round_> {
+  
+  //
+  // Type definitions and compile-time constants
+  //
+  static Operand const kOperand = Operand::kA;
+  static ComplexTransform const kTransform = Transform_;
+  static FloatRoundStyle const kRound = Round_;
+
+  // Data type of elements in the destination fragment
+  using MmaElement = typename DestinationFragment::Element;
+
+  // Numeric converter: MmaElement <= RealElement, using rounding style kRound
+  using Converter = NumericConverter<MmaElement, RealElement, kRound>;
+
+  // Operand layout parameters: kLdm is the leading dimension passed to the source layout
+  using SourceFragmentLayout = layout::ColumnMajor;
+  static int const kLdm = MmaIterations::kRow * MmaOperandShape::kRow;
+
+  /// Ctor
+  CUTLASS_DEVICE
+  UnpackComplexConvertAndPackForMma() {}
+  /// Splits each complex source element into dest[i] (real) and dest[i + MmaIterations::kRow] (imag).
+  CUTLASS_DEVICE
+  void operator()(DestinationFragment *dest, SourceFragment const &source) {
+    
+    Converter convert_op;
+    SourceFragmentLayout layout(kLdm);
+
+    CUTLASS_PRAGMA_UNROLL
+    for(int i=0; i<MmaIterations::kRow; i++) {
+      int pos = 0;
+      CUTLASS_PRAGMA_UNROLL
+      for(int c=0; c<MmaOperandShape::kColumn; c++) {
+        CUTLASS_PRAGMA_UNROLL
+        for(int r=0; r<MmaOperandShape::kRow; r++) {
+          // Logical position of element in source fragment
+          int row = r + i * MmaOperandShape::kRow;
+          int col = c;
+
+          // Access complex<RealElement> and apply rounding on real and imag parts
+          MmaElement a = convert_op(source[layout(MatrixCoord{row,col})].real());
+          MmaElement b = convert_op(source[layout(MatrixCoord{row,col})].imag());
+
+          // Pack the real part into dest[i] and the imag part (negated under kConjugate) into dest[i + kRow]
+          dest[i][pos] = a;
+          dest[i+MmaIterations::kRow][pos++] = (kTransform == ComplexTransform::kConjugate ? -b : b);
+
+        }
+      }
+    }
+  }
+};
+
+// Partial specialization for OperandB and Congruous smem layout (source fragment indexed as layout::RowMajor)
+template <
+  typename RealElement,
+  typename DestinationFragment, 
+  typename SourceFragment,
+  typename MmaIterations,
+  typename MmaOperandShape,
+  ComplexTransform Transform_,
+  FloatRoundStyle Round_>
+struct UnpackComplexConvertAndPackForMma <
+  RealElement,
+  DestinationFragment,
+  SourceFragment,
+  MmaIterations,
+  MmaOperandShape,
+  Transform_,
+  Operand::kB,
+  Round_> {
+  
+  //
+  // Type definitions and compile-time constants
+  //
+  static Operand const kOperand = Operand::kB;
+  static ComplexTransform const kTransform = Transform_;
+  static FloatRoundStyle const kRound = Round_;
+
+  // Data type of elements in the destination fragment
+  using MmaElement = typename DestinationFragment::Element;
+
+  // Numeric converter: MmaElement <= RealElement, using rounding style kRound
+  using Converter = NumericConverter<MmaElement, RealElement, kRound>;
+
+  // Operand layout parameters: kLdm is the leading dimension passed to the source layout
+  using SourceFragmentLayout = layout::RowMajor;
+  static int const kLdm = MmaIterations::kColumn * MmaOperandShape::kColumn;
+
+  /// Ctor (CUTLASS_DEVICE, while operator() below is CUTLASS_HOST_DEVICE - NOTE(review): confirm this is intended)
+  CUTLASS_DEVICE
+  UnpackComplexConvertAndPackForMma() {}
+  /// Splits each complex source element into dest[i] (real) and dest[i + MmaIterations::kColumn] (imag).
+  CUTLASS_HOST_DEVICE
+  void operator()(DestinationFragment *dest, SourceFragment const &source) {
+    
+    Converter convert_op;
+    SourceFragmentLayout layout(kLdm);
+
+    CUTLASS_PRAGMA_UNROLL
+    for(int i=0; i<MmaIterations::kColumn; i++) {
+      int pos = 0;
+      CUTLASS_PRAGMA_UNROLL
+      for(int c=0; c<MmaOperandShape::kColumn; c++) {
+        CUTLASS_PRAGMA_UNROLL
+        for(int r=0; r<MmaOperandShape::kRow; r++) {
+          // Logical position of element in source fragment
+          int row = r;
+          int col = c + i * MmaOperandShape::kColumn;
+
+          // Access complex<RealElement> and apply rounding on real and imag parts
+          MmaElement a = convert_op(source[layout(MatrixCoord{row,col})].real());
+          MmaElement b = convert_op(source[layout(MatrixCoord{row,col})].imag());
+
+          // Pack the real part into dest[i] and the imag part (negated under kConjugate) into dest[i + kColumn]
+          dest[i][pos] = a;
+          dest[i+MmaIterations::kColumn][pos++] = (kTransform == ComplexTransform::kConjugate ? -b : b);
+        }
+      }
+    }
+  }
+};
+} // namespace detail 
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape_,
+  /// Data type of A elements
+  typename RealElementA,
+  /// Layout of A matrix (concept: MatrixLayout)
+  typename LayoutA_,
+  /// Data type of B elements
+  typename RealElementB,
+  /// Layout of B matrix (concept: MatrixLayout)
+  typename LayoutB_,
+  /// Element type of C matrix
+  typename RealElementC,
+  /// Layout of C matrix (concept: MatrixLayout)
+  typename LayoutC_,
+  /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
+  typename Policy_,
+  /// Complex transform on A operand (defaults to ComplexTransform::kNone)
+  ComplexTransform TransformA = ComplexTransform::kNone,
+  /// Complex transform on B operand (defaults to ComplexTransform::kNone)
+  ComplexTransform TransformB = ComplexTransform::kNone,
+  /// Whether source operands need more than one element (defaults to false)
+  bool GeneralizedOperatorElements = false,
+  /// Used for partial specialization
+  typename Enable = bool
+>
+class MmaComplexTensorOp;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for complex*complex+complex => complex using real-valued TensorOps
+template <
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape_,
+  /// Data type of A elements
+  typename RealElementA,
+  /// Layout of A matrix (concept: MatrixLayout)
+  typename LayoutA_,
+  /// Data type of B elements
+  typename RealElementB,
+  /// Layout of B matrix (concept: MatrixLayout)
+  typename LayoutB_,
+  /// Element type of C matrix
+  typename RealElementC,
+  /// Layout of C matrix (concept: MatrixLayout)
+  typename LayoutC_,
+  /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
+  typename Policy_,
+  /// Complex transform on A operand
+  ComplexTransform TransformA,
+  /// Complex transform on B operand
+  ComplexTransform TransformB
+>
+class MmaComplexTensorOp<
+  Shape_, 
+  complex<RealElementA>, 
+  LayoutA_, 
+  complex<RealElementB>,
+  LayoutB_,
+  complex<RealElementC>,
+  LayoutC_,
+  Policy_,
+  TransformA,
+  TransformB>  {
+public:
+  /// Shape of warp-level matrix operation (concept: GemmShape)
+  using Shape = Shape_;
+
+  /// Data type of multiplicand A
+  using ElementA = complex<RealElementA>;
+
+  /// Layout of multiplicand A
+  using LayoutA = LayoutA_;
+
+  /// Data type of multiplicand B
+  using ElementB = complex<RealElementB>;
+
+  /// Layout of multiplicand B
+  using LayoutB = LayoutB_;
+
+  /// Data type of accumulator matrix C
+  using ElementC = complex<RealElementC>;
+
+  /// Layout of accumulator matrix C
+  using LayoutC = LayoutC_;
+
+  /// Shape of the warp in units of thread (concept: MmaLanePolicyTensorOp)
+  using Policy = Policy_;
+
+  /// Underlying matrix multiply operator (concept: arch::Mma)
+  using ArchMmaOperator = typename Policy::Operator;
+
+  /// Architecture tag from underlying instruction
+  using ArchTag = typename ArchMmaOperator::ArchTag;
+
+  /// Indicates class of matrix operator
+  using OperatorClass = arch::OpClassTensorOp;
+
+  /// Shape of underlying instruction
+  using InstructionShape = typename ArchMmaOperator::Shape;
+
+  /// Indicates math operator
+  using MathOperator = arch::OpMultiplyAddComplex;
+
+  /// Complex transform on A operand
+  static ComplexTransform const kTransformA = TransformA;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB = TransformB;
+
+  /// Number of threads participating in warp-level matrix product
+  static int const kThreadCount = 32;
+
+public:
+
+  /// Iterates over the A operand in memory
+  using IteratorA = MmaTensorOpMultiplicandTileIterator<
+    MatrixShape<Shape::kM, Shape::kK>,
+    Operand::kA,
+    ElementA,
+    LayoutA,
+    MatrixShape<ArchMmaOperator::Shape::kM, ArchMmaOperator::Shape::kK>,
+    Policy::OpDelta::kRow,
+    32,
+    1
+  >;
+
+  /// Storage for A tile
+  using FragmentA = typename IteratorA::Fragment;
+
+  /// Storage for transformed A tile (same type as FragmentA in this specialization)
+  using TransformedFragmentA = FragmentA;
+
+  /// Iterates over the B operand in memory
+  using IteratorB = MmaTensorOpMultiplicandTileIterator<
+    MatrixShape<Shape::kK, Shape::kN>,
+    Operand::kB,
+    ElementB,
+    LayoutB,
+    MatrixShape<ArchMmaOperator::Shape::kK, ArchMmaOperator::Shape::kN>,
+    Policy::OpDelta::kColumn,
+    32,
+    1
+  >;
+
+  /// Storage for B tile
+  using FragmentB = typename IteratorB::Fragment;
+
+  /// Storage for transformed B tile (same type as FragmentB in this specialization)
+  using TransformedFragmentB = FragmentB;
+
+  static_assert(
+    !(Shape::kM % ArchMmaOperator::Shape::kM) && 
+    !(Shape::kN % ArchMmaOperator::Shape::kN),
+    "Shape of warp-level Mma must be divisible by operator shape.");
+
+  /// Number of mma operations performed
+  using MmaIterations = MatrixShape<
+    Shape::kM / ArchMmaOperator::Shape::kM,
+    Shape::kN / ArchMmaOperator::Shape::kN
+  >;
+
+  /// Iterates over the C operand in memory
+  using IteratorC = MmaTensorOpAccumulatorTileIterator<
+     MatrixShape<Shape::kM, Shape::kN>, 
+     ElementC, 
+     LayoutC,
+     typename ArchMmaOperator::Shape, 
+     typename Policy::OpDelta>;
+
+  /// Storage for C tile, the accumulator. Note, regardless of multiplicand type, this
+  /// storage arrangement is to be considered 'planar complex' in the sense that all real-valued
+  /// parts are stored consecutively followed by all imaginary parts. This matches the structure
+  /// of Tensor Cores which are always real-valued matrix multiplies.
+  using FragmentC = typename IteratorC::Fragment;
+
+  static_assert(
+    FragmentC::kElements == 2 * MmaIterations::kCount * ArchMmaOperator::FragmentC::kElements,
+    "Unexpected planar complex fragment length.");
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Underlying real-valued matrix multiply operator (concept: arch::Mma)
+  ArchMmaOperator mma;
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Ctor
+  CUTLASS_DEVICE
+  MmaComplexTensorOp() {}
+
+  /// Performs a warp-level matrix multiply-accumulate operation: D = A * B + C, as four real-valued mma passes
+  CUTLASS_DEVICE
+  void operator()(
+    FragmentC &D, 
+    FragmentA const &A, 
+    FragmentB const &B, 
+    FragmentC const &C
+  ) const {
+
+    // Alias types for underlying real-valued matrix multiply operator
+    using MmaOperandA = typename ArchMmaOperator::FragmentA;
+    using MmaOperandB = typename ArchMmaOperator::FragmentB;
+    using MmaOperandC = typename ArchMmaOperator::FragmentC;
+
+    static_assert(MmaOperandA::kElements == 1, 
+      "This implementation only supports math instructions in which exactly one element is needed for the A operand."
+      "We can geneneralize later.");
+
+    static_assert(MmaOperandB::kElements == 1, 
+      "This implementation only supports math instructions in which exactly one element is needed for the B operand."
+      "We can geneneralize later.");
+
+    D = C;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int m = 0; m < MmaIterations::kRow; ++m) {
+
+      // mma(accum.real(), a.real(), b.real(), accum.real());
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = 0; n < MmaIterations::kColumn; ++n) {
+
+        // Pack operands together. This may result in actual MOVs
+        MmaOperandA operand_A;
+        MmaOperandB operand_B;
+
+        operand_A[0] = A[m].real();
+        operand_B[0] = B[n].real();
+
+        // Real-valued accumulator part
+        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
+          (m + n * MmaIterations::kRow);
+
+          mma(*accum, operand_A, operand_B, *accum);
+      }
+
+      // mma(accum.imag(), a.real(), b.imag(), accum.imag());
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = MmaIterations::kColumn - 1; n >= 0; --n) {
+
+        // Pack operands together. This may result in actual MOVs
+        MmaOperandA operand_A;
+        MmaOperandB operand_B;
+
+        operand_A[0] = A[m].real();
+        operand_B[0] = (kTransformB == ComplexTransform::kConjugate ? -B[n].imag() : B[n].imag());
+
+        // Imaginary accumulator part (offset by MmaIterations::kCount in the planar-complex fragment)
+        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
+          (m + n * MmaIterations::kRow) + MmaIterations::kCount;
+
+        mma(*accum, operand_A, operand_B, *accum);
+      }
+
+      // mma(accum.real(), -a.imag(), b.imag(), accum.real())
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = 0; n < MmaIterations::kColumn; ++n) {
+
+        // Pack operands together. This may result in actual MOVs
+        MmaOperandA operand_A;
+        MmaOperandB operand_B;
+
+        // A imaginary part is intentionally negated
+        operand_A[0] = (kTransformA == ComplexTransform::kConjugate ? A[m].imag() : -A[m].imag());
+        operand_B[0] = (kTransformB == ComplexTransform::kConjugate ? -B[n].imag() : B[n].imag());
+
+        // Real-valued accumulator part
+        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
+          (m + n * MmaIterations::kRow);
+
+        mma(*accum, operand_A, operand_B, *accum);
+      }
+      // mma(accum.imag(), a.imag(), b.real(), accum.imag())
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = MmaIterations::kColumn - 1; n >= 0; --n) {
+
+        // Pack operands together. This may result in actual MOVs
+        MmaOperandA operand_A;
+        MmaOperandB operand_B;
+
+        operand_A[0] = (kTransformA == ComplexTransform::kConjugate ? -A[m].imag() : A[m].imag());
+        operand_B[0] = B[n].real();
+
+        // Imaginary accumulator part (offset by MmaIterations::kCount in the planar-complex fragment)
+        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
+          (m + n * MmaIterations::kRow) + MmaIterations::kCount;
+
+        mma(*accum, operand_A, operand_B, *accum);
+      }
+    }
+  }
+
+  /// Transform the mma operands to the required types (a plain copy here: dst_A = A, dst_B = B)
+  CUTLASS_DEVICE
+  void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B,
+                 FragmentA const &A, FragmentB const &B) const {
+    dst_A = A;
+    dst_B = B;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for complex*complex+complex => complex:
+//  Operands data type: complex<float>
+//  Rounding: float -> tfloat32_t (round_half_ulp_trunc_dntz; round_to_nearest when __CUDA_ARCH__ >= 900)
+//  Math instruction: mma.sync.aligned.m16n8k8.f32.tf32.tf32.f32
+//  Output data type: complex<float>
+// 
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape_,
+  /// Layout of A matrix (concept: MatrixLayout)
+  typename LayoutA_,
+  /// Layout of B matrix (concept: MatrixLayout)
+  typename LayoutB_,
+  /// Layout of C matrix (concept: MatrixLayout)
+  typename LayoutC_,
+  /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
+  typename Policy_,
+  /// Complex transform on A operand
+  ComplexTransform TransformA,
+  /// Complex transform on B operand
+  ComplexTransform TransformB
+>
+class MmaComplexTensorOp<
+  Shape_, 
+  complex<float>, 
+  LayoutA_, 
+  complex<float>,
+  LayoutB_,
+  complex<float>,
+  LayoutC_,
+  Policy_,
+  TransformA,
+  TransformB>  {
+public:
+  /// Shape of warp-level matrix operation (concept: GemmShape)
+  using Shape = Shape_;
+
+  /// Data type of members of complex multiplicand A
+  using RealElementA = float;
+
+  /// Data type of multiplicand A
+  using ElementA = complex<RealElementA>;
+
+  /// Layout of multiplicand A
+  using LayoutA = LayoutA_;
+
+  /// Data type of members of complex multiplicand B
+  using RealElementB = float;
+
+  /// Data type of multiplicand B
+  using ElementB = complex<RealElementB>;
+
+  /// Layout of multiplicand B
+  using LayoutB = LayoutB_;
+
+  /// Data type of members of complex accumulator matrix C
+  using RealElementC = float;
+
+  /// Data type of accumulator matrix C
+  using ElementC = complex<RealElementC>;
+
+  /// Layout of accumulator matrix C
+  using LayoutC = LayoutC_;
+
+  /// Warp-level tensor-op policy; supplies Operator and OpDelta (concept: MmaTensorOp policy)
+  using Policy = Policy_;
+
+  /// Underlying matrix multiply operator (concept: arch::Mma)
+  using ArchMmaOperator = typename Policy::Operator;
+
+  /// Shape of underlying instruction
+  using InstructionShape = typename ArchMmaOperator::Shape;
+
+  /// Underlying arch tag
+  using ArchTag = typename ArchMmaOperator::ArchTag;
+
+  /// Indicates class of matrix operator
+  using OperatorClass = arch::OpClassTensorOp;
+
+  /// Indicates math operator 
+  using MathOperator = typename arch::OpMultiplyAddComplex;
+  
+  /// Complex transform on A operand
+  static ComplexTransform const kTransformA = TransformA;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB = TransformB;
+
+  /// Number of threads participating in warp-level matrix product
+  static int const kThreadCount = 32;
+
+public:
+
+  /// Iterates over the A operand in memory
+  using IteratorA = MmaTensorOpMultiplicandTileIterator<
+    MatrixShape<Shape::kM, Shape::kK>,
+    Operand::kA,
+    ElementA,
+    LayoutA,
+    MatrixShape<ArchMmaOperator::Shape::kM, ArchMmaOperator::Shape::kK>,
+    Policy::OpDelta::kRow,
+    32,
+    1
+  >;
+
+  /// Storage for A tile
+  using FragmentA = typename IteratorA::Fragment;
+
+  /// Storage for transformed A tile (two ArchMmaOperator::ElementA values per FragmentA element)
+  using TransformedFragmentA =
+      Array<typename ArchMmaOperator::ElementA, FragmentA::kElements * 2>;
+
+  /// Iterates over the B operand in memory
+  using IteratorB = MmaTensorOpMultiplicandTileIterator<
+    MatrixShape<Shape::kK, Shape::kN>,
+    Operand::kB,
+    ElementB,
+    LayoutB,
+    MatrixShape<ArchMmaOperator::Shape::kK, ArchMmaOperator::Shape::kN>,
+    Policy::OpDelta::kColumn,
+    32,
+    1
+  >;
+
+  /// Storage for B tile
+  using FragmentB = typename IteratorB::Fragment;
+
+  /// Storage for transformed B tile (two ArchMmaOperator::ElementB values per FragmentB element)
+  using TransformedFragmentB =
+      Array<typename ArchMmaOperator::ElementB, FragmentB::kElements * 2>;
+
+  static_assert(
+    !(Shape::kM % ArchMmaOperator::Shape::kM) && 
+    !(Shape::kN % ArchMmaOperator::Shape::kN),
+    "Shape of warp-level Mma must be divisible by operator shape.");
+
+  /// Number of complex products operations performed (one complex product needs four mma instructions)
+  using MmaIterations = MatrixShape<
+    Shape::kM / ArchMmaOperator::Shape::kM,
+    Shape::kN / ArchMmaOperator::Shape::kN
+  >;
+
+  /// Iterates over the C operand in memory
+  using IteratorC = MmaTensorOpAccumulatorTileIterator<
+     MatrixShape<Shape::kM, Shape::kN>, 
+     ElementC, 
+     LayoutC,
+     typename ArchMmaOperator::Shape, 
+     typename Policy::OpDelta>;
+
+  /// Storage for C tile, the accumulator. Note, regardless of multiplicand type, this
+  /// storage arrangement is to be considered 'planar complex' in the sense that all real-valued
+  /// parts are stored consecutively followed by all imaginary parts. This matches the structure
+  /// of Tensor Cores which are always real-valued matrix multiplies.
+  using FragmentC = typename IteratorC::Fragment;
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Underlying real-valued matrix multiply operator (concept: arch::Mma)
+  ArchMmaOperator mma;
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Ctor
+  CUTLASS_DEVICE
+  MmaComplexTensorOp() {}
+
+  /// Performs a warp-level complex multiply-accumulate: D = C, then D += A * B via four real mma passes per m
+  CUTLASS_DEVICE
+  void operator()(
+    FragmentC &D, 
+    TransformedFragmentA const &A, 
+    TransformedFragmentB const &B, 
+    FragmentC const &C
+  ) const {
+
+    // Alias types for underlying real-valued matrix multiply operator
+    using InstMmaOperandA = typename ArchMmaOperator::FragmentA;
+    using InstMmaOperandB = typename ArchMmaOperator::FragmentB;
+    using MmaOperandC = typename ArchMmaOperator::FragmentC;
+
+    static_assert(platform::is_same<cutlass::gemm::GemmShape<16, 8, 8>, typename ArchMmaOperator::Shape>::value, 
+      "This implementation only supports mma.m16n8k8 math instructions.");
+
+    static_assert(InstMmaOperandA::kElements == 4, 
+      "This implementation only supports math instructions in which exactly four element is needed for the A operand."
+      "We can geneneralize later.");
+
+    static_assert(InstMmaOperandB::kElements == 2, 
+      "This implementation only supports math instructions in which exactly two element is needed for the B operand."
+      "We can geneneralize later.");
+
+    // Instruction operands A & B: real fragments at [m] / [n], imaginary fragments at [m + kRow] / [n + kColumn]
+    InstMmaOperandA const *operand_A = reinterpret_cast<InstMmaOperandA const *>(&A);
+    InstMmaOperandB const *operand_B = reinterpret_cast<InstMmaOperandB const *>(&B);
+
+    //
+    // Accumulate in place
+    //
+    D = C;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int m = 0; m < MmaIterations::kRow; ++m) {
+
+      // mma(accum.real(), a.real(), b.real(), accum.real());
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = 0; n < MmaIterations::kColumn; ++n) {
+
+        // Real-valued accumulator part
+        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
+          (m + n * MmaIterations::kRow);
+
+          mma(*accum, operand_A[m], operand_B[n], *accum);
+      }
+
+      // mma(accum.imag(), a.real(), b.imag(), accum.imag()); 
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = MmaIterations::kColumn - 1; n >= 0; --n) {
+
+        // Imaginary-valued accumulator part (second plane, offset by MmaIterations::kCount)
+        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
+          (m + n * MmaIterations::kRow) + MmaIterations::kCount;
+
+        mma(*accum, operand_A[m], operand_B[n+MmaIterations::kColumn], *accum);
+      }
+
+      // mma(accum.real(), a.imag(), -b.imag(), accum.real())
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = 0; n < MmaIterations::kColumn; ++n) {
+
+        // negate OperandB to accumulate  -(a.imag()*b.imag())
+        // negating OperandB emits fewer instructions than negating OperandA as OperandB has fewer elements
+        negate<InstMmaOperandB> negate_op;
+
+        // Real-valued accumulator part
+        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
+          (m + n * MmaIterations::kRow);
+
+        mma(*accum, operand_A[m+MmaIterations::kRow], negate_op(operand_B[n+MmaIterations::kColumn]), *accum);
+      }
+
+      // mma(accum.imag(), a.imag(), b.real(), accum.imag())
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = MmaIterations::kColumn - 1; n >= 0; --n) {
+
+        // Imaginary-valued accumulator part (second plane, offset by MmaIterations::kCount)
+        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
+          (m + n * MmaIterations::kRow) + MmaIterations::kCount;
+
+        mma(*accum, operand_A[m+MmaIterations::kRow], operand_B[n], *accum);
+      }
+    }
+  }
+
+  /// Transform the mma operands to the required types
+  CUTLASS_DEVICE
+  void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B,
+                 FragmentA const &A, FragmentB const &B) const {
+    // Alias types for underlying real-valued matrix multiply operator
+    using InstMmaOperandA = typename ArchMmaOperator::FragmentA;
+    using InstMmaOperandB = typename ArchMmaOperator::FragmentB;
+
+    //
+    // Define conversions from source type to instruction operands' type; rounding style depends on __CUDA_ARCH__
+    //
+
+    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
+    FloatRoundStyle const kRoundA = FloatRoundStyle::round_to_nearest;
+    FloatRoundStyle const kRoundB = FloatRoundStyle::round_to_nearest;
+    #else
+    FloatRoundStyle const kRoundA = FloatRoundStyle::round_half_ulp_trunc_dntz; 
+    FloatRoundStyle const kRoundB = FloatRoundStyle::round_half_ulp_trunc_dntz;
+    #endif
+
+    detail::UnpackComplexConvertAndPackForMma <
+      RealElementA,
+      InstMmaOperandA,
+      FragmentA,
+      MmaIterations,
+      MatrixShape<2, 2>,
+      kTransformA,
+      Operand::kA,
+      kRoundA> convert_A;
+
+    detail::UnpackComplexConvertAndPackForMma <
+      RealElementB,
+      InstMmaOperandB,
+      FragmentB,
+      MmaIterations,
+      MatrixShape<2, 1>,
+      kTransformB,
+      Operand::kB,
+      kRoundB> convert_B;
+
+    // Convert Fragment[A|B] holding complex<RealElement[A|B]> to InstMmaOperand[A|B] holding InstMmaOperand[A|B]::Element
+    convert_A(reinterpret_cast<InstMmaOperandA *>(&dst_A), A); 
+    convert_B(reinterpret_cast<InstMmaOperandB *>(&dst_B), B); 
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Partial specialization for complex*complex+complex => complex:
+//  Operands data type: complex<double>
+//  Math instruction: mma.sync.aligned.m16n8k4.f64.f64.f64.f64
+//  Output data type: complex<double>
+// 
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape_,
+  /// Layout of A matrix (concept: MatrixLayout)
+  typename LayoutA_,
+  /// Layout of B matrix (concept: MatrixLayout)
+  typename LayoutB_,
+  /// Layout of C matrix (concept: MatrixLayout)
+  typename LayoutC_,
+  /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
+  typename Policy_,
+  /// Complex transform on A operand
+  ComplexTransform TransformA,
+  /// Complex transform on B operand
+  ComplexTransform TransformB
+>
+class MmaComplexTensorOp<
+  Shape_, 
+  complex<double>, 
+  LayoutA_, 
+  complex<double>,
+  LayoutB_,
+  complex<double>,
+  LayoutC_,
+  Policy_,
+  TransformA,
+  TransformB,
+  true>  {
+public:
+  /// Shape of warp-level matrix operation (concept: GemmShape)
+  using Shape = Shape_;
+
+  /// Data type of members of complex multiplicand A
+  using RealElementA = double;
+
+  /// Data type of multiplicand A
+  using ElementA = complex<RealElementA>;
+
+  /// Layout of multiplicand A
+  using LayoutA = LayoutA_;
+
+  /// Data type of members of complex multiplicand B
+  using RealElementB = double;
+
+  /// Data type of multiplicand B
+  using ElementB = complex<RealElementB>;
+
+  /// Layout of multiplicand B
+  using LayoutB = LayoutB_;
+
+  /// Data type of members of complex accumulator matrix C
+  using RealElementC = double;
+
+  /// Data type of accumulator matrix C
+  using ElementC = complex<RealElementC>;
+
+  /// Layout of accumulator matrix C
+  using LayoutC = LayoutC_;
+
+  /// Warp-level tensor-op policy; supplies Operator and OpDelta (concept: MmaLanePolicyTensorOp)
+  using Policy = Policy_;
+
+  /// Underlying matrix multiply operator (concept: arch::Mma)
+  using ArchMmaOperator = typename Policy::Operator;
+
+  /// Shape of underlying instruction
+  using InstructionShape = typename ArchMmaOperator::Shape;
+
+  /// Underlying arch tag
+  using ArchTag = typename ArchMmaOperator::ArchTag;
+
+  /// Indicates class of matrix operator
+  using OperatorClass = arch::OpClassTensorOp;
+
+  /// Indicates math operator 
+  using MathOperator = typename arch::OpMultiplyAddComplex;
+
+  /// Complex transform on A operand
+  static ComplexTransform const kTransformA = TransformA;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB = TransformB;
+
+  /// Number of threads participating in warp-level matrix product
+  static int const kThreadCount = 32;
+
+public:
+
+  /// Iterates over the A operand in memory
+  using IteratorA = MmaTensorOpMultiplicandTileIterator<
+    MatrixShape<Shape::kM, Shape::kK>,
+    Operand::kA,
+    ElementA,
+    LayoutA,
+    MatrixShape<ArchMmaOperator::Shape::kM, ArchMmaOperator::Shape::kK>,
+    Policy::OpDelta::kRow,
+    32,
+    1
+  >;
+
+  /// Storage for A tile
+  using FragmentA = typename IteratorA::Fragment;
+
+  /// Storage for transformed A tile (same type as FragmentA; transform() is a plain copy)
+  using TransformedFragmentA = FragmentA;
+
+  /// Iterates over the B operand in memory
+  using IteratorB = MmaTensorOpMultiplicandTileIterator<
+    MatrixShape<Shape::kK, Shape::kN>,
+    Operand::kB,
+    ElementB,
+    LayoutB,
+    MatrixShape<ArchMmaOperator::Shape::kK, ArchMmaOperator::Shape::kN>,
+    Policy::OpDelta::kColumn,
+    32,
+    1
+  >;
+
+  /// Storage for B tile
+  using FragmentB = typename IteratorB::Fragment;
+
+  /// Storage for transformed B tile (same type as FragmentB; transform() is a plain copy)
+  using TransformedFragmentB = FragmentB;
+
+  static_assert(
+    !(Shape::kM % ArchMmaOperator::Shape::kM) && 
+    !(Shape::kN % ArchMmaOperator::Shape::kN),
+    "Shape of warp-level Mma must be divisible by operator shape.");
+
+  /// Number of complex products operations performed (one complex product needs four mma instructions)
+  using MmaIterations = MatrixShape<
+    Shape::kM / ArchMmaOperator::Shape::kM,
+    Shape::kN / ArchMmaOperator::Shape::kN
+  >;
+
+  /// Iterates over the C operand in memory
+  using IteratorC = MmaTensorOpAccumulatorTileIterator<
+     MatrixShape<Shape::kM, Shape::kN>, 
+     ElementC, 
+     LayoutC,
+     typename ArchMmaOperator::Shape, 
+     typename Policy::OpDelta>;
+
+  /// Storage for C tile, the accumulator. Note, regardless of multiplicand type, this
+  /// storage arrangement is to be considered 'planar complex' in the sense that all real-valued
+  /// parts are stored consecutively followed by all imaginary parts. This matches the structure
+  /// of Tensor Cores which are always real-valued matrix multiplies.
+  using FragmentC = typename IteratorC::Fragment;
+
+  static_assert(
+    FragmentC::kElements == 2 * MmaIterations::kCount * ArchMmaOperator::FragmentC::kElements,
+    "Unexpected planar complex fragment length.");
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Underlying real-valued matrix multiply operator (concept: arch::Mma)
+  ArchMmaOperator mma;
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Ctor
+  CUTLASS_DEVICE
+  MmaComplexTensorOp() {}
+
+  /// Performs a warp-level complex multiply-accumulate: D = C, then four real mma passes per m accumulate into D
+  CUTLASS_DEVICE
+  void operator()(
+    FragmentC &D, 
+    FragmentA const &A, 
+    FragmentB const &B, 
+    FragmentC const &C
+  ) const {
+
+    // Alias types for underlying real-valued matrix multiply operator
+    using MmaOperandA = typename ArchMmaOperator::FragmentA;
+    using MmaOperandB = typename ArchMmaOperator::FragmentB;
+    using MmaOperandC = typename ArchMmaOperator::FragmentC;
+
+    D = C;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int m = 0; m < MmaIterations::kRow; ++m) {
+
+      // mma(accum.real(), a.real(), b.real(), accum.real());
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = 0; n < MmaIterations::kColumn; ++n) {
+
+        // Pack operands together. This may result in actual MOVs 
+        MmaOperandA operand_A;
+        MmaOperandB operand_B;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int mk = 0; mk < MmaOperandA::kElements; ++mk)
+          operand_A[mk] = A[m*MmaOperandA::kElements + mk].real();
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int nk = 0; nk < MmaOperandB::kElements; ++nk)
+          operand_B[nk] = B[n*MmaOperandB::kElements + nk].real();
+
+        // Real-valued accumulator part
+        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
+          (m + n * MmaIterations::kRow);
+
+          mma(*accum, operand_A, operand_B, *accum);
+      }
+
+      // mma(accum.imag(), a.real(), b.imag(), accum.imag()); 
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = MmaIterations::kColumn - 1; n >= 0; --n) {
+
+        // Pack operands together. This may result in actual MOVs 
+        MmaOperandA operand_A;
+        MmaOperandB operand_B;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int mk = 0; mk < MmaOperandA::kElements; ++mk)
+          operand_A[mk] = A[m*MmaOperandA::kElements + mk].real();
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int nk = 0; nk < MmaOperandB::kElements; ++nk)
+          operand_B[nk] = (kTransformB == ComplexTransform::kConjugate ? 
+                          -B[n*MmaOperandB::kElements + nk].imag() : B[n*MmaOperandB::kElements + nk].imag());
+
+        // Imaginary-valued accumulator part (second plane, offset by MmaIterations::kCount)
+        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
+          (m + n * MmaIterations::kRow) + MmaIterations::kCount;
+
+        mma(*accum, operand_A, operand_B, *accum);
+      }
+
+      // mma(accum.real(), -a.imag(), b.imag(), accum.real())
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = 0; n < MmaIterations::kColumn; ++n) {
+
+        // Pack operands together. This may result in actual MOVs 
+        MmaOperandA operand_A;
+        MmaOperandB operand_B;
+
+        // A imaginary part is intentionally negated (left un-negated when A is conjugated, as the two sign flips cancel)
+        CUTLASS_PRAGMA_UNROLL
+        for (int mk = 0; mk < MmaOperandA::kElements; ++mk)
+          operand_A[mk] = (kTransformA == ComplexTransform::kConjugate ?
+                          A[m*MmaOperandA::kElements + mk].imag() : -A[m*MmaOperandA::kElements + mk].imag());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int nk = 0; nk < MmaOperandB::kElements; ++nk)
+            operand_B[nk] = (kTransformB == ComplexTransform::kConjugate ?
+                            -B[n*MmaOperandB::kElements + nk].imag() : B[n*MmaOperandB::kElements + nk].imag());
+
+        // Real-valued accumulator part
+        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
+          (m + n * MmaIterations::kRow);
+
+        mma(*accum, operand_A, operand_B, *accum);
+      }
+
+      // mma(accum.imag(), a.imag(), b.real(), accum.imag())
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = MmaIterations::kColumn - 1; n >= 0; --n) {
+
+        // Pack operands together. This may result in actual MOVs 
+        MmaOperandA operand_A;
+        MmaOperandB operand_B;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int mk = 0; mk < MmaOperandA::kElements; ++mk)
+          operand_A[mk] = (kTransformA == ComplexTransform::kConjugate ?
+                          -A[m*MmaOperandA::kElements + mk].imag() : A[m*MmaOperandA::kElements + mk].imag());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int nk = 0; nk < MmaOperandB::kElements; ++nk)
+          operand_B[nk] = B[n*MmaOperandB::kElements + nk].real();
+
+        // Imaginary-valued accumulator part (second plane, offset by MmaIterations::kCount)
+        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
+          (m + n * MmaIterations::kRow) + MmaIterations::kCount;
+
+        mma(*accum, operand_A, operand_B, *accum);
+      }
+    }
+  }
+
+  /// Transform the mma operands to the required types (a plain copy here: no conversion is performed)
+  CUTLASS_DEVICE
+  void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B,
+                 FragmentA const &A, FragmentB const &B) const {
+    dst_A = A;
+    dst_B = B;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_complex_tensor_op_fast_f32.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_complex_tensor_op_fast_f32.h
new file mode 100755
index 000000000..d52c5e24b
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_complex_tensor_op_fast_f32.h
@@ -0,0 +1,663 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Templates implementing warp-level matrix multiply-accumulate operations targeting
+      Tensor Cores.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/array.h"
+#include "cutlass/complex.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/functional.h"
+
+#include "cutlass/arch/memory_sm75.h"
+#include "cutlass/arch/mma_sm75.h"
+#include "cutlass/arch/mma_sm80.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/warp/mma.h"
+
+#include "cutlass/gemm/warp/mma_tensor_op_policy.h"
+#include "cutlass/gemm/warp/mma_tensor_op.h"
+
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h"
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h"
+#include "cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace warp {
+
+namespace detail {
+
+template <
+  /// Data type of real & imag members of complex numbers in the SourceFragment
+  typename RealElement,
+  /// Destination fragment required by the mma operation 
+  typename DestinationFragment,
+  /// Source fragment holding complex<RealElement> elements
+  typename SourceFragment,
+  /// Number of mma operations performed
+  typename MmaIterations,
+  /// Shape of operand elements
+  typename MmaOperandShape,
+  /// Complex transform applied to the operand being converted
+  ComplexTransform Transform_,
+  /// Operand A or Operand B
+  Operand Operand_,
+  /// Floating-point rounding style for big part
+  FloatRoundStyle RoundBig_,
+  /// Floating-point rounding style for small part
+  FloatRoundStyle RoundSmall_>
+struct UnpackComplexConvertAndPackForMmaFastF32;
+
+// Partial specialization for OperandA and Congruous smem layout
+template <
+  typename RealElement,
+  typename DestinationFragment, 
+  typename SourceFragment,
+  typename MmaIterations,
+  typename MmaOperandShape,
+  ComplexTransform Transform_,
+  FloatRoundStyle RoundBig_,
+  FloatRoundStyle RoundSmall_>
+struct UnpackComplexConvertAndPackForMmaFastF32 <
+  RealElement,
+  DestinationFragment,
+  SourceFragment,
+  MmaIterations,
+  MmaOperandShape,
+  Transform_,
+  Operand::kA,
+  RoundBig_,
+  RoundSmall_> {
+  // Converts every complex source element into (big, small) parts and packs them into mma operand fragments.
+  //
+  // Type definitions
+  //
+  static Operand const kOperand = Operand::kA;
+  static ComplexTransform const kTransform = Transform_;
+  static FloatRoundStyle const kRoundBig = RoundBig_;
+  static FloatRoundStyle const kRoundSmall = RoundSmall_;
+
+  // Data type of elements in the destination fragment
+  using MmaElement = typename DestinationFragment::Element;
+
+  // Numeric converter MmaElementBig, MmaElementSmall <= RealElement
+  using Converter = NumericConverterFastF32<kRoundBig, kRoundSmall>;
+
+  // Operand layout parameters
+  using SourceFragmentLayout = layout::ColumnMajor;
+  static int const kLdm = MmaIterations::kRow * MmaOperandShape::kRow;
+
+  // BigSmall Fragment holding two TF32 elements (big, small) for every float
+  using BigSmallFragment = Array<MmaElement, 2>;
+
+  /// Index in fragments for the big and small part
+  static int const kBigIndex = 0;
+  static int const kSmallIndex = 1;
+
+  /// Ctor
+  CUTLASS_DEVICE
+  UnpackComplexConvertAndPackForMmaFastF32() {}
+
+  CUTLASS_DEVICE
+  void operator()(DestinationFragment *dest, SourceFragment const &source) {
+    // dest layout (in fragments): [big real | big imag | small real | small imag], each MmaIterations::kRow long
+    Converter convert_op;
+    SourceFragmentLayout layout(kLdm);
+
+    DestinationFragment *dest_big_ = reinterpret_cast<DestinationFragment*>(dest);
+    DestinationFragment *dest_small_ = reinterpret_cast<DestinationFragment*>(&dest[MmaIterations::kRow * 2]);
+
+    CUTLASS_PRAGMA_UNROLL
+    for(int i=0; i<MmaIterations::kRow; i++) {
+      int pos = 0;  // element index within the destination fragment for iteration i
+      CUTLASS_PRAGMA_UNROLL
+      for(int c=0; c<MmaOperandShape::kColumn; c++) {
+        CUTLASS_PRAGMA_UNROLL
+        for(int r=0; r<MmaOperandShape::kRow; r++) {
+          // Logical position of element in source fragment
+          int row = r + i * MmaOperandShape::kRow;
+          int col = c;
+
+          // Access complex<RealElement> and apply rounding on real and imag parts (each yields a big/small pair)
+          BigSmallFragment a = convert_op(source[layout(MatrixCoord{row,col})].real());
+          BigSmallFragment b = convert_op(source[layout(MatrixCoord{row,col})].imag());
+
+          // Big parts: real goes to fragment i, imag (negated when conjugating) to fragment i + kRow
+          dest_big_[i][pos] = a[kBigIndex];
+          dest_big_[i+MmaIterations::kRow][pos] = (kTransform == ComplexTransform::kConjugate ? -b[kBigIndex] : b[kBigIndex]);
+
+          // Small parts: same arrangement, in the fragments that follow the big ones
+          dest_small_[i][pos] = a[kSmallIndex];
+          dest_small_[i+MmaIterations::kRow][pos] = (kTransform == ComplexTransform::kConjugate ? -b[kSmallIndex] : b[kSmallIndex]);
+
+          // Next position
+          pos++;
+        }
+      }
+    }
+  }
+};
+
+// Partial specialization for OperandB and Congruous smem layout
+template <
+  typename RealElement,
+  typename DestinationFragment, 
+  typename SourceFragment,
+  typename MmaIterations,
+  typename MmaOperandShape,
+  ComplexTransform Transform_,
+  FloatRoundStyle RoundBig_,
+  FloatRoundStyle RoundSmall_>
+struct UnpackComplexConvertAndPackForMmaFastF32 <
+  RealElement,
+  DestinationFragment,
+  SourceFragment,
+  MmaIterations,
+  MmaOperandShape,
+  Transform_,
+  Operand::kB,
+  RoundBig_,
+  RoundSmall_> {
+  // Converts every complex source element into (big, small) parts and packs them into mma operand fragments.
+  //
+  // Type definitions
+  //
+  static Operand const kOperand = Operand::kB;
+  static ComplexTransform const kTransform = Transform_;
+  static FloatRoundStyle const kRoundBig = RoundBig_;
+  static FloatRoundStyle const kRoundSmall = RoundSmall_;
+
+  // Data type of elements in the destination fragment
+  using MmaElement = typename DestinationFragment::Element;
+
+  // Numeric converter MmaElementBig, MmaElementSmall <= RealElement
+  using Converter = NumericConverterFastF32<kRoundBig, kRoundSmall>;
+
+  // Operand layout parameters
+  using SourceFragmentLayout = layout::RowMajor;
+  static int const kLdm = MmaIterations::kColumn * MmaOperandShape::kColumn;
+
+  // BigSmall Fragment holding two TF32 elements (big, small) for every float
+  using BigSmallFragment = Array<MmaElement, 2>;
+
+  /// Index in fragments for the big and small part
+  static int const kBigIndex = 0;
+  static int const kSmallIndex = 1;
+
+  /// Ctor
+  CUTLASS_DEVICE
+  UnpackComplexConvertAndPackForMmaFastF32() {}
+
+  CUTLASS_HOST_DEVICE  // NOTE(review): ctor and the Operand::kA operator() are CUTLASS_DEVICE; confirm HOST_DEVICE is intended
+  void operator()(DestinationFragment *dest, SourceFragment const &source) {
+    // dest layout (in fragments): [big real | big imag | small real | small imag], each MmaIterations::kColumn long
+    Converter convert_op;
+    SourceFragmentLayout layout(kLdm);
+
+    DestinationFragment *dest_big_ = reinterpret_cast<DestinationFragment*>(dest);
+    DestinationFragment *dest_small_ = reinterpret_cast<DestinationFragment*>(&dest[MmaIterations::kColumn * 2]);
+
+    CUTLASS_PRAGMA_UNROLL
+    for(int i=0; i<MmaIterations::kColumn; i++) {
+      int pos = 0;  // element index within the destination fragment for iteration i
+      CUTLASS_PRAGMA_UNROLL
+      for(int c=0; c<MmaOperandShape::kColumn; c++) {
+        CUTLASS_PRAGMA_UNROLL
+        for(int r=0; r<MmaOperandShape::kRow; r++) {
+          // Logical position of element in source fragment
+          int row = r;
+          int col = c + i * MmaOperandShape::kColumn;
+
+          // Access complex<RealElement> and apply rounding on real and imag parts (each yields a big/small pair)
+          BigSmallFragment a = convert_op(source[layout(MatrixCoord{row,col})].real());
+          BigSmallFragment b = convert_op(source[layout(MatrixCoord{row,col})].imag());
+
+          // Big parts: real goes to fragment i, imag (negated when conjugating) to fragment i + kColumn
+          dest_big_[i][pos] = a[kBigIndex];
+          dest_big_[i+MmaIterations::kColumn][pos] = (kTransform == ComplexTransform::kConjugate ? -b[kBigIndex] : b[kBigIndex]);
+
+          // Small parts: same arrangement, in the fragments that follow the big ones
+          dest_small_[i][pos] = a[kSmallIndex];
+          dest_small_[i+MmaIterations::kColumn][pos] = (kTransform == ComplexTransform::kConjugate ? -b[kSmallIndex] : b[kSmallIndex]);
+
+          // next position
+          pos++;       
+        }
+      }
+    }
+  }
+};
+} // namespace detail 
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape_,
+  /// Data type of A elements (matched as complex<float> by the partial specialization in this file)
+  typename RealElementA,
+  /// Layout of A matrix (concept: MatrixLayout)
+  typename LayoutA_,
+  /// Data type of B elements (matched as complex<float> by the partial specialization in this file)
+  typename RealElementB,
+  /// Layout of B matrix (concept: MatrixLayout)
+  typename LayoutB_,
+  /// Element type of C matrix (matched as complex<float> by the partial specialization in this file)
+  typename RealElementC,
+  /// Layout of C matrix (concept: MatrixLayout)
+  typename LayoutC_,
+  /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
+  typename Policy_,
+  /// Complex transform on A operand
+  ComplexTransform TransformA = ComplexTransform::kNone,
+  /// Complex transform on B operand
+  ComplexTransform TransformB = ComplexTransform::kNone,
+  /// Used for partial specialization
+  typename Enable = bool
+>
+class MmaComplexTensorOpFastF32;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for complex*complex+complex => complex:
+//  Operands data type: complex<float>
+//  Rounding: float -> tfloat32_t (round half_ulp_truncate nearest)
+//  Math instruction: mma.sync.aligned.m16n8k8.f32.tf32.tf32.f32
+//  Output data type: complex<float>
+// 
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape_,
+  /// Layout of A matrix (concept: MatrixLayout)
+  typename LayoutA_,
+  /// Layout of B matrix (concept: MatrixLayout)
+  typename LayoutB_,
+  /// Layout of C matrix (concept: MatrixLayout)
+  typename LayoutC_,
+  /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
+  typename Policy_,
+  /// Complex transform on A operand
+  ComplexTransform TransformA,
+  /// Complex transform on B operand
+  ComplexTransform TransformB,
+  /// Used for partial specialization
+  typename Enable
+>
+class MmaComplexTensorOpFastF32<
+  Shape_, 
+  complex<float>, 
+  LayoutA_, 
+  complex<float>,
+  LayoutB_,
+  complex<float>,
+  LayoutC_,
+  Policy_,
+  TransformA,
+  TransformB,
+  Enable>  {
+public:
+  /// Shape of warp-level matrix operation (concept: GemmShape)
+  using Shape = Shape_;
+
+  /// Data type of members of complex multiplicand A
+  using RealElementA = float;
+
+  /// Data type of multiplicand A
+  using ElementA = complex<RealElementA>;
+
+  /// Layout of multiplicand A
+  using LayoutA = LayoutA_;
+
+  /// Data type of members of complex multiplicand B
+  using RealElementB = float;
+
+  /// Data type of multiplicand B
+  using ElementB = complex<RealElementB>;
+
+  /// Layout of multiplicand B
+  using LayoutB = LayoutB_;
+
+  /// Data type of members of complex accumulator matrix C
+  using RealElementC = float;
+
+  /// Data type of accumulator matrix C
+  using ElementC = complex<RealElementC>;
+
+  /// Layout of accumulator matrix C
+  using LayoutC = LayoutC_;
+
+  /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
+  using Policy = Policy_;
+
+  /// Underlying matrix multiply operator (concept: arch::Mma)
+  using ArchMmaOperator = typename Policy::Operator;
+
+  /// Shape of underlying instruction
+  using InstructionShape = typename ArchMmaOperator::Shape;
+
+  /// Underlying arch tag
+  using ArchTag = typename ArchMmaOperator::ArchTag;
+
+  /// Indicates class of matrix operator
+  using OperatorClass = arch::OpClassTensorOp;
+
+  /// Indicates math operator 
+  using MathOperator = arch::OpMultiplyAddComplexFastF32;
+  
+  /// Complex transform on A operand
+  static ComplexTransform const kTransformA = TransformA;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB = TransformB;
+
+  /// Number of threads participating in warp-level matrix product
+  static int const kThreadCount = 32;
+
+
+  /// Tune F32 to TF32 big small conversion for complex<float> operation
+  /// Different combinations of big small conversion can cause different tradeoffs
+  /// between speed and accuracy.  Generally, using round_half_ulp_truncate can
+  /// improve the performance but hurt the accuracy.
+  using ComplexFastF32 = FastF32 <
+    FloatRoundStyle::round_toward_zero,        // kRoundBigA
+    FloatRoundStyle::round_half_ulp_truncate,  // kRoundSmallA
+    FloatRoundStyle::round_toward_zero,        // kRoundBigB
+    FloatRoundStyle::round_half_ulp_truncate,  // kRoundSmallB
+    TensorFloat32Op::k3xTF32                   // Number of TF32 operations 
+  >;
+
+  /// Index in fragments for the big and small part
+  static int const kBigIndex = 0;
+  static int const kSmallIndex = 1;
+
+public:
+
+  /// Iterates over the A operand in memory
+  using IteratorA = MmaTensorOpMultiplicandTileIterator<
+    MatrixShape<Shape::kM, Shape::kK>,
+    Operand::kA,
+    ElementA,
+    LayoutA,
+    MatrixShape<ArchMmaOperator::Shape::kM, ArchMmaOperator::Shape::kK>,
+    Policy::OpDelta::kRow,
+    32,
+    1
+  >;
+
+  /// Storage for A tile
+  using FragmentA = typename IteratorA::Fragment;
+
+  /// Storage for transformed A tile
+  // (4 times the original FragmentA::kElements)
+  // (real_big), (imag_big), (real_small), (imag_small)
+  using TransformedFragmentA = Array<typename ArchMmaOperator::ElementA, 
+                                              FragmentA::kElements * 2 * 2>;
+
+  // Fragment bisecting big and small sections
+  // (real_big, imag_big), (real_small, imag_small)
+  using AccessTypeFragmentA = Array<typename ArchMmaOperator::ElementA, 
+                                                    FragmentA::kElements * 2>;
+
+  /// Iterates over the B operand in memory
+  using IteratorB = MmaTensorOpMultiplicandTileIterator<
+    MatrixShape<Shape::kK, Shape::kN>,
+    Operand::kB,
+    ElementB,
+    LayoutB,
+    MatrixShape<ArchMmaOperator::Shape::kK, ArchMmaOperator::Shape::kN>,
+    Policy::OpDelta::kColumn,
+    32,
+    1
+  >;
+
+  /// Storage for B tile
+  using FragmentB = typename IteratorB::Fragment;
+
+  /// Storage for transformed B tile 
+  // (4 times the original FragmentB::kElements)
+  // (real_big), (imag_big), (real_small), (imag_small)
+  using TransformedFragmentB = Array<typename ArchMmaOperator::ElementB, 
+                                              FragmentB::kElements * 2 * 2>;
+
+  // Fragment bisecting big and small sections
+  // (real_big, imag_big), (real_small, imag_small)
+  using AccessTypeFragmentB = Array<typename ArchMmaOperator::ElementB, 
+                                                    FragmentB::kElements * 2>;
+
+  static_assert(
+    !(Shape::kM % ArchMmaOperator::Shape::kM) && 
+    !(Shape::kN % ArchMmaOperator::Shape::kN),
+    "Shape of warp-level Mma must be divisible by operator shape.");
+
+  /// Number of complex products operations performed (one complex product needs four mma instructions)
+  using MmaIterations = MatrixShape<
+    Shape::kM / ArchMmaOperator::Shape::kM,
+    Shape::kN / ArchMmaOperator::Shape::kN
+  >;
+
+  /// Iterates over the C operand in memory
+  using IteratorC = MmaTensorOpAccumulatorTileIterator<
+     MatrixShape<Shape::kM, Shape::kN>, 
+     ElementC, 
+     LayoutC,
+     typename ArchMmaOperator::Shape, 
+     typename Policy::OpDelta>;
+
+  /// Storage for C tile, the accumulator. Note, regardless of multiplicand type, this
+  /// storage arrangement is to be considered 'planar complex' in the sense that all real-valued
+  /// parts are stored consecutively followed by all imaginary parts. This matches the structure
+  /// of Tensor Cores which are always real-valued matrix multiplies.
+  using FragmentC = typename IteratorC::Fragment;
+
+  //
+  // Alias types for underlying real-valued matrix multiply operator
+  //
+  using InstMmaOperandA = typename ArchMmaOperator::FragmentA;
+  using InstMmaOperandB = typename ArchMmaOperator::FragmentB;
+  using MmaOperandC = typename ArchMmaOperator::FragmentC;
+
+  static_assert(platform::is_same<cutlass::gemm::GemmShape<16, 8, 8>, typename ArchMmaOperator::Shape>::value, 
+    "This implementation only supports mma.m16n8k8 math instructions.");
+  // transform() packs A as MatrixShape<2, 2> and B as MatrixShape<2, 1>, so the operand sizes are fixed.
+  static_assert(InstMmaOperandA::kElements == 4, 
+    "This implementation only supports math instructions in which exactly four elements are needed for the A operand. "
+    "We can generalize later.");
+
+  static_assert(InstMmaOperandB::kElements == 2, 
+    "This implementation only supports math instructions in which exactly two elements are needed for the B operand. "
+    "We can generalize later.");
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Underlying real-valued matrix multiply operator (concept: arch::Mma)
+  ArchMmaOperator mma;
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Ctor
+  CUTLASS_DEVICE
+  MmaComplexTensorOpFastF32() {}
+
+  /// Performs a warp-level matrix multiply-accumulate operation on big/small-split operands
+  CUTLASS_DEVICE
+  void operator()(
+    FragmentC &D, 
+    TransformedFragmentA const &A, 
+    TransformedFragmentB const &B, 
+    FragmentC const &C
+  ) const {
+    // View each transformed operand as two halves, indexed by kBigIndex and kSmallIndex
+    AccessTypeFragmentA const *complex_A = reinterpret_cast<AccessTypeFragmentA const*>(&A);
+    AccessTypeFragmentB const *complex_B = reinterpret_cast<AccessTypeFragmentB const*>(&B);
+
+    //
+    // Accumulate in place
+    //
+    D = C;
+
+    // Cross term: A_small x B_big
+    complex_mma_operator(D, complex_A[kSmallIndex], complex_B[kBigIndex], D);
+    // Cross term: A_big x B_small
+    complex_mma_operator(D, complex_A[kBigIndex], complex_B[kSmallIndex], D);
+    // Leading term: A_big x B_big
+    complex_mma_operator(D, complex_A[kBigIndex], complex_B[kBigIndex], D);
+    // A_small x B_small is only issued when the 4xTF32 precision mode is selected
+    if (ComplexFastF32::kPrecision == TensorFloat32Op::k4xTF32)
+      complex_mma_operator(D, complex_A[kSmallIndex], complex_B[kSmallIndex], D);
+  }
+
+  /// Accumulates one complex product of a single (A, B) operand pair into D using four real-valued mma passes
+  CUTLASS_DEVICE
+  void complex_mma_operator(
+    FragmentC &D, 
+    AccessTypeFragmentA const &complex_A, 
+    AccessTypeFragmentB const &complex_B, 
+    FragmentC const &C
+  ) const {
+
+    // Instruction Operands A & B holding real part followed by imaginary part for mma operations
+    InstMmaOperandA const *operand_A = reinterpret_cast<InstMmaOperandA const *>(&complex_A);
+    InstMmaOperandB const *operand_B = reinterpret_cast<InstMmaOperandB const *>(&complex_B);
+
+    // Note: C is never read in this body; every pass below accumulates in place into D.
+    CUTLASS_PRAGMA_UNROLL
+    for (int m = 0; m < MmaIterations::kRow; ++m) {
+
+      // mma(accum.real(), a.real(), b.real(), accum.real());
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = 0; n < MmaIterations::kColumn; ++n) {
+
+        // Real-valued accumulator part
+        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
+          (m + n * MmaIterations::kRow);
+
+          mma(*accum, operand_A[m], operand_B[n], *accum);
+      }
+
+      // mma(accum.imag(), a.real(), b.imag(), accum.imag()); 
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = MmaIterations::kColumn - 1; n >= 0; --n) {
+
+        // Imaginary accumulator part, stored MmaIterations::kCount fragments after the real part
+        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
+          (m + n * MmaIterations::kRow) + MmaIterations::kCount;
+
+        mma(*accum, operand_A[m], operand_B[n+MmaIterations::kColumn], *accum);
+      }
+
+      // mma(accum.real(), a.imag(), -b.imag(), accum.real())
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = 0; n < MmaIterations::kColumn; ++n) {
+
+        // negate OperandB to accumulate  -(a.imag()*b.imag())
+        // negating OperandB emits fewer instructions than negating OperandA as OperandB has fewer elements
+        negate<InstMmaOperandB> negate_op;
+
+        // Real-valued accumulator part
+        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
+          (m + n * MmaIterations::kRow);
+
+         mma(*accum, operand_A[m+MmaIterations::kRow], negate_op(operand_B[n+MmaIterations::kColumn]), *accum);
+      }
+
+      // mma(accum.imag(), a.imag(), b.real(), accum.imag())
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = MmaIterations::kColumn - 1; n >= 0; --n) {
+
+        // Imaginary accumulator part, stored MmaIterations::kCount fragments after the real part
+        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
+          (m + n * MmaIterations::kRow) + MmaIterations::kCount;
+
+        mma(*accum, operand_A[m+MmaIterations::kRow], operand_B[n], *accum);
+      }
+    }
+  }
+
+  /// Transform the mma operands to the required types
+  CUTLASS_DEVICE
+  void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B,
+                 FragmentA const &A, FragmentB const &B) const {
+    // A-side converter: MatrixShape<2, 2> operand shape, rounding styles kRoundBigA / kRoundSmallA
+    detail::UnpackComplexConvertAndPackForMmaFastF32 <
+      RealElementA,
+      InstMmaOperandA,
+      FragmentA,
+      MmaIterations,
+      MatrixShape<2, 2>,
+      kTransformA,
+      Operand::kA,
+      ComplexFastF32::kRoundBigA,
+      ComplexFastF32::kRoundSmallA> convert_A;
+    // B-side converter: MatrixShape<2, 1> operand shape, rounding styles kRoundBigB / kRoundSmallB
+    detail::UnpackComplexConvertAndPackForMmaFastF32 <
+      RealElementB,
+      InstMmaOperandB,
+      FragmentB,
+      MmaIterations,
+      MatrixShape<2, 1>,
+      kTransformB,
+      Operand::kB,
+      ComplexFastF32::kRoundBigB,
+      ComplexFastF32::kRoundSmallB> convert_B;
+
+    // Convert Fragment[A|B] holding complex<RealElement[A|B]> to InstMmaOperand[A|B] holding InstMmaOperand[A|B]::Element
+    convert_A(reinterpret_cast<InstMmaOperandA *>(&dst_A), A); 
+    convert_B(reinterpret_cast<InstMmaOperandB *>(&dst_B), B); 
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h
new file mode 100755
index 000000000..bc51bca09
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h
@@ -0,0 +1,2485 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines iterators used by warp-level matrix multiply operations targeting Tensor Cores.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/array.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/arch/memory_sm75.h"
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor_op_multiplicand_sm80.h"
+
+#include "cutlass/platform/platform.h"
+#include "cutlass/fast_math.h"
+
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace warp {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// This tile iterator is specialized for loading 128b vectors of 128b elements.
+///
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: PitchLinearShape)
+    typename Shape_,
+    /// Identifies A or B multiplicand
+    Operand Operand_,
+    /// Data type of elements
+    typename Element_,
+    /// Shape of one matrix product operation (concept: PitchLinearShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions)
+    int OpDelta_,
+    /// Number of partitions along K dimension
+    int PartitionsK_>
+class MmaTensorOpMultiplicandTileIterator<
+    Shape_, Operand_, Element_,
+    cutlass::layout::TensorOpMultiplicandCongruous128b,
+    InstructionShape_, OpDelta_, 32, PartitionsK_> {
+ public:
+
+  /// Shape of tile to load (concept: PitchLinearShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand_;
+
+  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
+    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
+
+  static_assert(!(Shape::kContiguous % 8) && !(Shape::kStrided % 4), "Divisibility.");
+
+  static_assert(sizeof_bits<Element_>::value == 128, "This is specialized for 128b accesses.");
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::TensorOpMultiplicandCongruous128b;
+
+  /// Shape of one matrix product operation (concept: PitchLinearShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+  static int const kOpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// Number of partitions along K dimension
+  static int const kPartitionsK = PartitionsK_;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Stride index type
+  using StrideIndex = typename TensorRef::Layout::Stride::Index;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Load one element per access
+  static int const kElementsPerAccess = 1;
+
+  /// Policy defining internal details of tile iterator
+  struct Policy {
+
+    /// Shape of one access
+    using Delta = layout::PitchLinearShape<8, 4>;
+
+    /// Number of iterations to load
+    using Iterations = layout::PitchLinearShape<
+      Shape::kContiguous / Delta::kContiguous,
+      InstructionShape::kStrided / Delta::kStrided
+    >;
+  };
+
+private:
+
+  /// Not working on this feature at the moment.
+  static_assert(kOpDelta == 1,
+    "Alternative arrangements not supported at present.");
+
+  /// Pointer type used for accesses
+  using AccessType = AlignedArray<Element, kElementsPerAccess, 16>;
+
+public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+ using Fragment =
+     Array<Element, Shape::kContiguous * InstructionShape::kStrided / kThreads>;
+
+private:
+
+  /// Layout object storing stride values
+  StrideIndex stride_;
+
+  /// Shared memory base pointers - not advanced
+  AccessType const *pointer_;
+
+  /// Byte offset incremented as iterator advances
+  Index byte_offset_;
+
+public:
+  
+  /// Default ctor constructs a null iterator: zero stride, zero byte offset and a null base pointer
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator(): stride_(0), pointer_(nullptr), byte_offset_(0) { }
+
+  /// Constructor from TensorRef; derives this lane's fixed byte offset from lane_id
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator(
+    TensorRef const &ref, 
+    int lane_id
+  ):
+    stride_(ref.stride(0) / kElementsPerAccess), byte_offset_(0) {
+    // Decompose the lane index: group of 8 lanes, group of 4 lanes, position within a group of 4
+    int quad_pair = lane_id / 8;
+    int quad = lane_id / 4;
+    int lane = lane_id % 4;
+    // Odd quads start at row 4; the position within the quad is XORed with quad_pair
+    int row = (quad & 1) * 4 + (lane ^ quad_pair);
+    // Offset in bytes: row along the contiguous dimension, quad_pair along the strided dimension
+    byte_offset_ = (row + quad_pair * stride_) * sizeof(AccessType);
+
+    pointer_= reinterpret_cast<AccessType const *>(ref.data());
+  }
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
+
+    pointer_ += offset;
+
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
+
+    int offset =
+      (tile_offset.contiguous() * Shape::kContiguous) +
+      (tile_offset.strided() * InstructionShape::kStrided * stride_);
+
+    add_pointer_offset(offset);
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator++() {
+
+    pointer_ += stride_ * InstructionShape::kStrided;
+
+    return *this;
+  }
+
+  ///< advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+
+    load_with_byte_offset(frag, 0);
+  }
+
+  /// Loads a fragment from memory with an additional linear offset given in bytes
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset in units of bytes
+      Index byte_offset) const {
+    // The fragment is filled one AccessType at a time
+    AccessType *fetch_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < Policy::Iterations::kStrided; ++s) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < Policy::Iterations::kContiguous; ++c) {
+        // Accesses are stored contiguous-major within the fragment
+        int access_idx = c + s * Policy::Iterations::kContiguous;
+        // Start of access (c, s), in AccessType units relative to the base pointer
+        AccessType const *source_ptr = pointer_ +
+            Policy::Delta::kContiguous * c +
+            Policy::Delta::kStrided * s * stride_;
+        // Apply the caller's byte offset plus this lane's byte offset computed in the constructor
+        char const *source_byte_ptr = reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;
+
+        AccessType const *source = reinterpret_cast<AccessType const *>(source_byte_ptr);
+
+        fetch_ptr[access_idx] = *source;
+      }
+    }
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset
+      Index pointer_offset) const {
+
+    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset) const {
+
+    load_with_byte_offset(frag, tile_offset, 0);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// loads a tile with a logical offset AND a pointer offset
+      Index pointer_offset) const {
+
+    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// loads a tile with a logical offset AND a pointer offset
+      Index byte_offset) const {
+    Index pointer_offset =
+        tile_offset.contiguous() * Shape::kContiguous +
+        tile_offset.strided() * InstructionShape::kStrided * stride_;
+
+    byte_offset += sizeof(AccessType) * pointer_offset;
+
+    load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+///
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Identifies A or B multiplicand
+    Operand Operand_,
+    /// Data type of elements
+    typename Element_,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions)
+    int OpDelta_,
+    /// Number of partitions along K dimension
+    int PartitionsK_>
+class MmaTensorOpMultiplicandTileIterator<
+    Shape_, Operand_, Element_,
+    cutlass::layout::RowMajorTensorOpMultiplicandCongruous128b,
+    InstructionShape_, OpDelta_, 32, PartitionsK_> {
+ public:
+
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand_;
+
+  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
+    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::RowMajorTensorOpMultiplicandCongruous128b;
+
+  /// Shape of one matrix product operation (concept: MatrixShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+  static int const kOpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Stride index type
+  using StrideIndex = typename TensorRef::Layout::Stride::Index;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Underlying tile iterator implementation
+  using Base = MmaTensorOpMultiplicandTileIterator<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, kOperand, Element,
+      layout::TensorOpMultiplicandCongruous128b,
+      layout::PitchLinearShape<InstructionShape::kColumn,
+                               InstructionShape::kRow>,
+      kOpDelta, kThreads, PartitionsK_>;
+
+ public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = typename Base::Fragment;
+
+private:
+
+  /// Underlying tile iterator
+  Base iterator_;
+
+public:
+  
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator() { }
+
+  /// Constructor from TensorRef
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator(
+    TensorRef const &ref, 
+    int lane_id
+  ): iterator_({ref.data(), ref.stride()}, lane_id) {
+  }
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
+
+    iterator_.add_pointer_offset(offset);
+
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
+
+    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator++() {
+
+    ++iterator_;
+
+    return *this;
+  }
+
+  /// Moves the iterator one tile backwards along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator--() {
+    // The underlying pitch-linear iterator defines no operator--; step back one tile
+    // along its strided dimension, which is the exact inverse of its operator++.
+    iterator_.add_tile_offset({0, -1});
+    return *this;
+  }
+
+  /// Advances in units of whole tiles along the logical (row, column) coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);  // add_tile_offset() already maps (row, column) to pitch-linear order
+    return *this;
+  }
+
+  /// Moves backwards in units of whole tiles along the logical (row, column) coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
+    add_tile_offset(TensorCoord(-tile_offset.row(), -tile_offset.column()));  // negate only; no pre-swap
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+
+    iterator_.load(frag);
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset
+      Index pointer_offset) const {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset
+      Index byte_offset) const {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  ///   frag        - fragment to load from the tensor
+  ///   tile_offset - logical (row, column) offset in units of whole tiles
+  CUTLASS_DEVICE
+  void load(Fragment &frag, TensorCoord const &tile_offset) const {
+    // Forward in the pitch-linear (contiguous, strided) order used by add_tile_offset().
+    iterator_.load(frag, {tile_offset.column(), tile_offset.row()});
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles
+  /// plus an additional pointer offset.
+  ///   frag           - fragment to load from the tensor
+  ///   tile_offset    - logical (row, column) offset in units of whole tiles
+  ///   pointer_offset - extra linear offset, in units of Element
+  CUTLASS_DEVICE
+  void load(Fragment &frag, TensorCoord const &tile_offset,
+            Index pointer_offset) const {
+    iterator_.load(frag, {tile_offset.column(), tile_offset.row()}, pointer_offset);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// logical (row, column) offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// additional linear offset in units of bytes
+      Index byte_offset) const {
+    iterator_.load_with_byte_offset(
+      frag,
+      {tile_offset.column(), tile_offset.row()},
+      byte_offset);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// Used by some nontrivial permuted layouts; here `k_group` is just forwarded to `iterator_`.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    iterator_.set_kgroup_index(k_group);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+/// Column-major adapter that forwards to the pitch-linear Congruous128b iterator.
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Identifies A or B multiplicand
+    Operand Operand_,
+    /// Data type of elements
+    typename Element_,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions)
+    int OpDelta_,
+    /// Number of partitions along K dimension
+    int PartitionsK_>
+class MmaTensorOpMultiplicandTileIterator<
+    Shape_, Operand_, Element_,
+    cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous128b,
+    InstructionShape_, OpDelta_, 32, PartitionsK_> {
+ public:
+
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand_;
+
+  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
+    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous128b;
+
+  /// Shape of one matrix product operation (concept: MatrixShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations)
+  static int const kOpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Stride index type
+  using StrideIndex = typename TensorRef::Layout::Stride::Index;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Underlying tile iterator implementation (row -> contiguous, column -> strided)
+  using Base = MmaTensorOpMultiplicandTileIterator<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, kOperand, Element,
+      layout::TensorOpMultiplicandCongruous128b,
+      layout::PitchLinearShape<InstructionShape::kRow,
+                               InstructionShape::kColumn>,
+      kOpDelta, kThreads, PartitionsK_>;
+
+ public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = typename Base::Fragment;
+
+private:
+
+  /// Underlying tile iterator
+  Base iterator_;
+
+public:
+
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator() { }
+
+  /// Constructor from TensorRef; the data pointer and stride are handed to the base iterator
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator(
+    TensorRef const &ref,
+    int lane_id
+  ): iterator_({ref.data(), ref.stride()}, lane_id) {
+  }
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
+
+    iterator_.add_pointer_offset(offset);
+
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
+
+    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator++() {
+
+    ++iterator_;
+
+    return *this;
+  }
+
+  /// Moves the iterator backwards along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator--() {
+
+    --iterator_;
+
+    return *this;
+  }
+
+  /// Advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
+    add_tile_offset(layout::PitchLinearCoord(tile_offset.row(), tile_offset.column()));
+    return *this;
+  }
+
+  /// Retreats in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
+    add_tile_offset(layout::PitchLinearCoord(-tile_offset.row(), -tile_offset.column()));
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+
+    iterator_.load(frag);
+  }
+
+  /// Loads a fragment from memory with an additional linear pointer offset
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// linear pointer offset, forwarded unchanged to the base iterator
+      Index pointer_offset) const {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory with an additional linear byte offset
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// linear offset in bytes, forwarded unchanged to the base iterator
+      Index byte_offset) const {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      Fragment &frag,                   ///< fragment to load from the tensor
+      TensorCoord const &tile_offset    ///< logical offset in units of whole tiles
+  ) const {
+    load_with_byte_offset(frag, tile_offset, 0);
+  }
+
+  /// Loads a fragment from memory with a whole-tile offset plus a pointer offset.
+  CUTLASS_DEVICE
+  void load(
+      Fragment &frag,                   ///< fragment to load from the tensor
+      TensorCoord const &tile_offset,   ///< logical offset in units of whole tiles
+      Index pointer_offset              ///< extra linear offset, in elements
+  ) const {
+    // Scale the element offset to bytes and reuse the byte-offset overload.
+    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
+  }
+
+  /// Loads a fragment from memory with a whole-tile offset plus a byte offset.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// logical offset in units of whole tiles; mapped {row, column} like add_tile_offset()
+      TensorCoord const &tile_offset,
+      /// additional offset in bytes, forwarded unchanged
+      Index byte_offset) const {
+    iterator_.load_with_byte_offset(
+      frag,
+      {tile_offset.row(), tile_offset.column()},
+      byte_offset);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    iterator_.set_kgroup_index(k_group);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+///
+/// Partial specialization for complex<T> accumulators held in a row-major tensor
+///
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Data type of underlying field of reals.
+    typename RealElement,
+    /// Shape of one matrix product operation (concept: GemmShape; kM and kN are used)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions, concept: MatrixShape)
+    typename OpDelta_>
+class MmaTensorOpAccumulatorTileIterator<
+    Shape_, complex<RealElement>, cutlass::layout::RowMajor, InstructionShape_, OpDelta_> {
+ public:
+
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand::kC;
+
+  /// Element type
+  using Element = complex<RealElement>;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::RowMajor;
+
+  /// Shape of one matrix product operation (concept: GemmShape; kM and kN are used)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+  using OpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Stride index type
+  using StrideIndex = typename TensorRef::Layout::Stride::Index;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Internal structure of iterator - made public to enable introspection
+  struct Policy {
+    static_assert(
+        !(Shape::kRow % InstructionShape::kM) &&
+            !(Shape::kColumn % InstructionShape::kN),
+        "Shape of warp-level Mma must be divisible by operator shape.");
+
+    static_assert(platform::is_same<TensorCoord, MatrixCoord>::value,
+      "Layouts must be defined for logical MatrixCoord coordinate space.");
+
+    /// Number of mma operations performed
+    using MmaIterations = MatrixShape<Shape::kRow / InstructionShape::kM,
+                                      Shape::kColumn / InstructionShape::kN>;
+  };
+
+private:
+
+  // Assume accumulator tile is an arrangement of 8-by-8 tiles replicated over the entire
+  // shape, with each quad mapped to one row and each thread mapped to 1/4 of the elements
+  // of that row. The accumulators within one row are assumed to be consecutive.
+  static int const kElementsPerAccess = InstructionShape::kN / 4;
+  static int const kRowsPerTile = 8;
+  static int const kAccumulatorRows = InstructionShape::kM / kRowsPerTile;
+
+public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile. It is assumed that the accumulators
+  /// are stored in a planar complex arrangement with the real parts as entirely contiguous
+  /// followed by the imaginary parts.
+  using Fragment = Array<RealElement, Shape::kCount / kThreads * 2>;
+
+  static int const kRealIndex = 0;
+  static int const kImaginaryIndex = Shape::kCount / kThreads;
+
+private:
+
+  /// Reference to output tensor
+  TensorRef ref_;
+
+public:
+
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator() { }
+
+  /// Constructor from TensorRef; offsets the reference to this lane's first element
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator(
+    TensorRef const &ref,
+    int lane_id
+  ):
+    ref_(ref) {
+
+    int quad = (lane_id >> 2);
+    int lane_in_quad = (lane_id & 3);
+
+    MatrixCoord lane_offset(quad, lane_in_quad * kElementsPerAccess);
+
+    ref_.add_coord_offset(lane_offset);
+  }
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator &add_pointer_offset(LongIndex offset) {
+    ref_.add_pointer_offset(offset);
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
+
+    ref_.add_coord_offset(tile_offset * make_Coord(Shape::kRow, Shape::kColumn));
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator & operator++() {
+    // deliberate no-op
+    return *this;
+  }
+
+  /// Moves the iterator backwards along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator & operator--() {
+    // deliberate no-op
+    return *this;
+  }
+
+  /// Advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpAccumulatorTileIterator & operator+=(TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  /// Retreats in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpAccumulatorTileIterator & operator-=(TensorCoord const &tile_offset) {
+    add_tile_offset(-tile_offset);
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Loads a fragment from memory with an additional linear pointer offset
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+    Fragment &frag,                             ///< fragment to load from the tensor
+    Index pointer_offset) const {               ///< loads a tile with a linear offset
+    // Apply the offset to a copy so the iterator's own reference is not modified.
+    TensorRef offset_ref(ref_);
+    offset_ref.add_pointer_offset(pointer_offset);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
+        // First fragment slot belonging to this (mma_m, mma_n) instruction tile.
+        int mma_accum_start = kAccumulatorRows * kElementsPerAccess *
+          (mma_n * Policy::MmaIterations::kRow + mma_m);
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < kAccumulatorRows; ++row) {
+          CUTLASS_PRAGMA_UNROLL
+          for (int col = 0; col < kElementsPerAccess; ++col) {
+            int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
+                          row * kRowsPerTile;
+            int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col;
+            int idx = mma_accum_start + row * kElementsPerAccess + col;
+            Element z = offset_ref.at({accum_m, accum_n});
+            // Split the complex value into the planar real / imaginary halves of the fragment.
+            frag[idx + kRealIndex] = z.real();
+            frag[idx + kImaginaryIndex] = z.imag();
+          }
+        }
+      }
+    }
+  }
+
+  /// Loads a fragment from memory with an additional linear byte offset
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+    Fragment &frag,                             ///< fragment to load from the tensor
+    Index byte_offset) const {                  ///< loads a tile with a linear offset
+    // Convert bytes to whole elements and reuse the pointer-offset path.
+    load_with_pointer_offset(frag, byte_offset / sizeof(Element));
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+    Fragment &frag,                             ///< fragment to load from the tensor
+    TensorCoord const &tile_offset) const {     ///< loads a tile with a logical offset in units of whole tiles
+
+    load(frag, tile_offset, 0);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+    Fragment &frag,                             ///< fragment to load from the tensor
+    TensorCoord const &tile_offset,             ///< loads a tile with a logical offset in units of whole tiles
+    Index pointer_offset) const {               ///< loads a tile with a logical offset AND a pointer offset
+
+    load_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
+  }
+
+  /// Stores a fragment to memory
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag) const {
+    store_with_pointer_offset(frag, 0);
+  }
+
+  /// Stores a fragment to memory with additional pointer offset
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(
+    Fragment const &frag,                       ///< fragment to store from the tensor
+    Index pointer_offset) const {               ///< store a tile with a linear offset
+    // Apply the offset to a copy so the iterator's own reference is not modified.
+    TensorRef offset_ref(ref_);
+    offset_ref.add_pointer_offset(pointer_offset);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
+        // First fragment slot belonging to this (mma_m, mma_n) instruction tile.
+        int mma_accum_start = kAccumulatorRows * kElementsPerAccess *
+          (mma_n * Policy::MmaIterations::kRow + mma_m);
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < kAccumulatorRows; ++row) {
+          CUTLASS_PRAGMA_UNROLL
+          for (int col = 0; col < kElementsPerAccess; ++col) {
+            int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
+                          row * kRowsPerTile;
+            int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col;
+            int idx = mma_accum_start + row * kElementsPerAccess + col;
+
+            Element z(frag[kRealIndex + idx], frag[kImaginaryIndex + idx]);
+
+            offset_ref.at({accum_m, accum_n}) = z;
+          }
+        }
+      }
+    }
+  }
+
+  /// Stores a fragment to memory with an additional linear byte offset
+  CUTLASS_DEVICE
+  void store_with_byte_offset(
+    Fragment const &frag,                       ///< fragment to store from the tensor
+    Index byte_offset) const {                  ///< store a tile with a linear offset
+    // Convert bytes to whole elements and reuse the pointer-offset path.
+    store_with_pointer_offset(frag, byte_offset / sizeof(Element));
+  }
+
+  /// Stores a fragment to memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void store(
+    Fragment const &frag,                       ///< fragment to store to the tensor
+    TensorCoord const &tile_offset) const {     ///< stores a tile with a logical offset in units of whole tiles
+
+    store(frag, tile_offset, 0);
+  }
+
+  /// Stores a fragment to memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void store(
+      /// fragment to store to the tensor
+      Fragment const &frag,
+      /// stores a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// stores a tile with a logical offset AND a pointer offset
+      Index pointer_offset) const {
+    store_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// This tile iterator is specialized for loading 128b vectors of 128b elements.
+///
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: PitchLinearShape)
+    typename Shape_,
+    /// Identifies A or B multiplicand
+    Operand Operand_,
+    /// Data type of elements
+    typename Element_,
+    /// Shape of one matrix product operation (concept: PitchLinearShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions)
+    int OpDelta_,
+    /// Number of partitions along K dimension
+    int PartitionsK_>
+class MmaTensorOpMultiplicandTileIterator<
+    Shape_, Operand_, Element_,
+    cutlass::layout::TensorOpMultiplicandCrosswise128x4,
+    InstructionShape_, OpDelta_, 32, PartitionsK_> {
+ public:
+
+  /// Shape of tile to load (concept: PitchLinearShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand_;
+
+  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
+    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
+
+  static_assert(!(Shape::kContiguous % 4) && !(Shape::kStrided % 8), "Divisibility.");
+
+  static_assert(sizeof_bits<Element_>::value == 128, "This is specialized for 128b accesses.");
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::TensorOpMultiplicandCrosswise128x4;
+
+  /// Shape of one matrix product operation (concept: PitchLinearShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations)
+  static int const kOpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// Number of partitions along K dimension
+  static int const kPartitionsK = PartitionsK_;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Stride index type
+  using StrideIndex = typename TensorRef::Layout::Stride::Index;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Load one element per access
+  static int const kElementsPerAccess = 1;
+
+  /// Policy defining internal details of tile iterator
+  struct Policy {
+
+    /// Shape of one access
+    using Delta = layout::PitchLinearShape<4, 8>;
+
+    /// Number of iterations to load
+    using Iterations = layout::PitchLinearShape<
+      InstructionShape::kContiguous / Delta::kContiguous,
+      Shape::kStrided / Delta::kStrided
+    >;
+  };
+
+private:
+
+  /// Not working on this feature at the moment.
+  static_assert(kOpDelta == 1,
+    "Alternative arrangements not supported at present.");
+
+  /// Pointer type used for accesses
+  using AccessType = AlignedArray<Element, kElementsPerAccess, 16>;
+
+public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment =
+      Array<Element, Shape::kStrided * InstructionShape::kContiguous / kThreads>;
+
+private:
+
+  /// Stride taken from the TensorRef, in units of AccessType
+  StrideIndex stride_;
+
+  /// Base pointer; moved by add_pointer_offset(), add_tile_offset() and operator++/--
+  AccessType const *pointer_;
+
+  /// Per-lane byte offset; XOR-toggled as the iterator advances
+  Index byte_offset_;
+
+public:
+
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator(): stride_(0), pointer_(nullptr), byte_offset_(0) { }
+
+  /// Constructor from TensorRef
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator(
+    TensorRef const &ref,
+    int lane_id
+  ):
+    stride_(ref.stride(0) / kElementsPerAccess), byte_offset_(0) {
+    // Split the lane id into a group of four lanes (quad) and the lane's index within it.
+    int quad = lane_id / 4;
+    int liq = lane_id % 4;
+    // c ranges over 0..7 and s over 0..3 for the 32 lanes.
+    int c = liq + (quad & 1) * 4;
+    int s = (quad / 2);
+
+    byte_offset_ = (c + s * stride_) * sizeof(AccessType);
+
+    pointer_= reinterpret_cast<AccessType const *>(ref.data());
+  }
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
+    pointer_ += offset;
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
+    // Compute the offset in units of elements. Note, the external coordinate system is
+    // approximately transposed with respect to the tiled internal structure
+    int offset =
+      (tile_offset.contiguous() * InstructionShape::kContiguous) * stride_ +
+      (tile_offset.strided() * Shape::kStrided);
+
+    add_pointer_offset(offset);
+    // An odd step along the contiguous dimension flips the same bit operator++ toggles.
+    byte_offset_ ^= (tile_offset.contiguous() & 1) * 4 * sizeof(AccessType);
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator++() {
+    pointer_ += stride_ * InstructionShape::kContiguous;
+    byte_offset_ ^= 4 * sizeof(AccessType);
+    return *this;
+  }
+
+  /// Steps back along the advance dimension; exact inverse of operator++ (XOR undoes itself)
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator--() {
+    pointer_ -= stride_ * InstructionShape::kContiguous;
+    byte_offset_ ^= 4 * sizeof(AccessType);
+    return *this;
+  }
+
+  /// Advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+
+    load_with_byte_offset(frag, 0);
+  }
+
+  /// Loads a fragment from memory with an additional linear byte offset
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset in units of bytes
+      Index byte_offset) const {
+    // The fragment is filled one AccessType at a time.
+    AccessType *fetch_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int c = 0; c < Policy::Iterations::kContiguous; ++c) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int s = 0; s < Policy::Iterations::kStrided; ++s) {
+
+        int access_idx = s + c * Policy::Iterations::kStrided;
+
+        AccessType const *source_ptr = pointer_ +
+            Policy::Delta::kContiguous * c * stride_ +
+            Policy::Delta::kStrided * s;
+        // Add the caller's byte offset and this lane's own byte offset.
+        char const *source_byte_ptr = reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;
+
+        AccessType const *source = reinterpret_cast<AccessType const *>(source_byte_ptr);
+
+        fetch_ptr[access_idx] = *source;
+      }
+    }
+  }
+
+  /// Loads a fragment from memory with an additional linear pointer offset
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// linear offset in units of elements (scaled to bytes below)
+      Index pointer_offset) const {
+
+    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset) const {
+
+    load_with_byte_offset(frag, tile_offset, 0);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// additional linear offset in units of elements (scaled to bytes below)
+      Index pointer_offset) const {
+
+    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
+  }
+
+  /// Loads a fragment from memory with a whole-tile offset plus a byte offset.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// additional offset in units of bytes
+      Index byte_offset) const {
+    Index pointer_offset =
+        tile_offset.contiguous() * InstructionShape::kContiguous * stride_ +
+        tile_offset.strided() * Shape::kStrided;
+    // Fold the tile offset (counted in AccessType units) into the byte offset.
+    byte_offset += sizeof(AccessType) * pointer_offset;
+
+    load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    // no-op: this specialization keeps no k-group state
+  }
+};
+
+
+////////////////////////////////////////////////////////////////////////////////
+///
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Identifies A or B multiplicand
+    Operand Operand_,
+    /// Data type of elements
+    typename Element_,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions)
+    int OpDelta_,
+    /// Number of partitions along K dimension
+    int PartitionsK_>
+class MmaTensorOpMultiplicandTileIterator<
+    Shape_, Operand_, Element_,
+    cutlass::layout::RowMajorTensorOpMultiplicandCrosswise128x4,
+    InstructionShape_, OpDelta_, 32, PartitionsK_> {
+ public:
+
+  /// Shape of tile to load (concept: PitchLinearShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand_;
+
+  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
+    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise128x4;
+
+  /// Shape of one matrix product operation (concept: MatrixShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+  static int const kOpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Long Index type
+  using StrideIndex = typename TensorRef::Layout::Stride::Index;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Underlying tile iterator implementation
+  using Base = MmaTensorOpMultiplicandTileIterator<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, kOperand, Element,
+      layout::TensorOpMultiplicandCrosswise128x4,
+      layout::PitchLinearShape<InstructionShape::kColumn,
+                               InstructionShape::kRow>,
+      kOpDelta, kThreads, PartitionsK_>;
+
+ public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = typename Base::Fragment;
+
+private:
+
+  /// Underlying tile iterator
+  Base iterator_;
+
+public:
+  
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator() { }
+
+  /// Constructor from TensorRef
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator(
+    TensorRef const &ref, 
+    int lane_id
+  ): iterator_({ref.data(), ref.stride()}, lane_id) {
+  }
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
+
+    iterator_.add_pointer_offset(offset);
+
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
+
+    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator++() {
+
+    ++iterator_;
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator--() {
+
+    --iterator_;
+
+    return *this;
+  }
+
+  ///< advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);  // add_tile_offset() already does the single (row, column) -> (column, row) swap
+    return *this;
+  }
+
+  ///< advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
+    add_tile_offset(TensorCoord(-tile_offset.row(), -tile_offset.column()));  // negate only; add_tile_offset() does the swap
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    // Delegates to the wrapped iterator; no extra offset is applied here.
+    iterator_.load(frag);
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset
+      Index pointer_offset) const {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);  // forwarded unchanged to Base
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset
+      Index byte_offset) const {
+    iterator_.load_with_byte_offset(frag, byte_offset);  // forwarded unchanged to Base
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      Fragment &frag,                          // fragment to load from the tensor
+      TensorCoord const &tile_offset) const {  // logical (row, column) offset in units of whole tiles
+    // Was an empty stub that never wrote frag; forward to Base as (column, row), like add_tile_offset().
+    iterator_.load_with_byte_offset(frag, {tile_offset.column(), tile_offset.row()}, 0);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      Fragment &frag,                  // fragment to load from the tensor
+      TensorCoord const &tile_offset,  // logical (row, column) offset in units of whole tiles
+      Index pointer_offset) const {    // additional linear offset in units of elements
+    // Was an empty stub that never wrote frag; forward to Base as (column, row), like add_tile_offset().
+    iterator_.load_with_byte_offset(
+        frag, {tile_offset.column(), tile_offset.row()}, pointer_offset * sizeof(Element));
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// loads a tile with a logical offset AND a pointer offset
+      Index byte_offset) const {
+    iterator_.load_with_byte_offset(
+      frag,
+      {tile_offset.column(), tile_offset.row()},  // TensorCoord is accessed via row()/column(); same order as add_tile_offset()
+      byte_offset);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    iterator_.set_kgroup_index(k_group);  // the k-group value is handed to the wrapped iterator
+  }
+};
+
+
+////////////////////////////////////////////////////////////////////////////////
+///
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Identifies A or B multiplicand
+    Operand Operand_,
+    /// Data type of elements
+    typename Element_,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions)
+    int OpDelta_,
+    /// Number of partitions along K dimension
+    int PartitionsK_>
+class MmaTensorOpMultiplicandTileIterator<
+    Shape_, Operand_, Element_,
+    cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise128x4,
+    InstructionShape_, OpDelta_, 32, PartitionsK_> {
+ public:
+  // Column-major adapter: every operation is forwarded to a pitch-linear Base iterator (row index first, column second).
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand_;
+
+  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
+    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise128x4;
+
+  /// Shape of one matrix product operation (concept: MatrixShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+  static int const kOpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Stride index type
+  using StrideIndex = typename TensorRef::Layout::Stride::Index;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Underlying tile iterator implementation
+  using Base = MmaTensorOpMultiplicandTileIterator<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, kOperand, Element,
+      layout::TensorOpMultiplicandCrosswise128x4,
+      layout::PitchLinearShape<InstructionShape::kRow,
+                               InstructionShape::kColumn>,
+      kOpDelta, kThreads, PartitionsK_>;
+
+ public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = typename Base::Fragment;
+
+private:
+
+  /// Underlying tile iterator
+  Base iterator_;
+
+public:
+  
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator() { }
+
+  /// Constructor from TensorRef
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator(
+    TensorRef const &ref, 
+    int lane_id
+  ): iterator_({ref.data(), ref.stride()}, lane_id) {
+  }
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
+    iterator_.add_pointer_offset(offset);
+
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
+    // Base is handed the row index first and the column index second.
+    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator++() {
+
+    ++iterator_;
+
+    return *this;
+  }
+
+  /// Advances the iterator along the opposite of the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator--() {
+
+    --iterator_;
+
+    return *this;
+  }
+
+  ///< advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
+    add_tile_offset(layout::PitchLinearCoord(tile_offset.row(), tile_offset.column()));
+    return *this;
+  }
+
+  ///< advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
+    add_tile_offset(layout::PitchLinearCoord(-tile_offset.row(), -tile_offset.column()));
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    iterator_.load(frag);
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset
+      Index pointer_offset) const {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset
+      Index byte_offset) const {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset) const {
+    load_with_byte_offset(frag, tile_offset, 0);  // was an empty stub that never wrote frag
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// loads a tile with a logical offset AND a pointer offset (in units of elements)
+      Index pointer_offset) const {
+    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));  // was an empty stub
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// loads a tile with a logical offset AND a pointer offset
+      Index byte_offset) const {
+    iterator_.load_with_byte_offset(
+      frag,
+      {tile_offset.row(), tile_offset.column()},  // TensorCoord is accessed via row()/column(); same order as add_tile_offset()
+      byte_offset);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    iterator_.set_kgroup_index(k_group);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Congruous shared memory layout
+// Warp-level iterators for complex<float>*complex<float> + complex<float> => complex<float>
+// The underlying iterators are similar to that for MMA f64*f64 + f64 = f64 
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// This tile iterator is specialized for loading 128b vectors of 64b elements.
+///
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: PitchLinearShape)
+    typename Shape_,
+    /// Identifies A or B multiplicand
+    Operand Operand_,
+    /// Shape of one matrix product operation (concept: PitchLinearShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions)
+    int OpDelta_,
+    /// Number of partitions along K dimension
+    int PartitionsK_>
+class MmaTensorOpMultiplicandTileIterator<
+    Shape_, Operand_, cutlass::complex<float>,
+    cutlass::layout::TensorOpMultiplicandCongruous64b,
+    InstructionShape_, OpDelta_, 32, PartitionsK_> {
+ public:
+
+  /// Shape of tile to load (concept: PitchLinearShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand_;
+
+  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
+    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
+
+  static_assert(!(Shape::kContiguous % 16) && !(Shape::kStrided % 8), "Divisibility.");
+
+  /// Element type
+  using Element = cutlass::complex<float>;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::TensorOpMultiplicandCongruous64b;
+
+  /// Shape of one matrix product operation (concept: PitchLinearShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+  static int const kOpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// Number of partitions along K dimension
+  static int const kPartitionsK = PartitionsK_;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Stride index type
+  using StrideIndex = typename TensorRef::Layout::Stride::Index;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Load two elements per access
+  static int const kElementsPerAccess = 2;
+
+  /// Policy defining internal details of tile iterator
+  struct Policy {
+
+    /// Shape of one access
+    using Delta = layout::PitchLinearShape<8, 4>;
+
+    /// Number of iterations to load
+    using Iterations = layout::PitchLinearShape<
+      Shape::kContiguous / kElementsPerAccess / Delta::kContiguous,
+      InstructionShape::kStrided / Delta::kStrided
+    >;
+
+  };
+
+private:
+
+  /// Not working on this feature at the moment.
+  static_assert(kOpDelta == 1,
+    "Alternative arrangements not supported at present.");
+
+  /// Pointer type used for accesses
+  using AccessType = AlignedArray<Element, kElementsPerAccess, 16>;
+
+  /// Internal counter used to jump to next K partition
+  int k_group_idx_;
+
+public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+ using Fragment =
+     Array<Element, Shape::kContiguous * InstructionShape::kStrided / kThreads>;
+
+private:
+
+  /// Layout object storing stride values
+  StrideIndex stride_;
+
+  /// Shared memory base pointers - not advanced
+  AccessType const *pointer_;
+
+  /// Byte offset incremented as iterator advances
+  Index byte_offset_;
+
+public:
+  
+  /// Default ctor constructs null iterator (every member is given a defined value)
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator(): k_group_idx_(0), stride_(0), pointer_(nullptr), byte_offset_(0) { }
+
+  /// Constructor from TensorRef
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator(
+    TensorRef const &ref, 
+    int lane_id
+  ):
+    k_group_idx_(0), stride_(ref.stride(0) / kElementsPerAccess),  // listed in declaration order
+    byte_offset_(0) {
+    // Delta::kContiguous lanes share one strided index; the contiguous index is XOR-ed with it.
+    int access_strided = lane_id / Policy::Delta::kContiguous;
+    int access_contiguous = (lane_id  % Policy::Delta::kContiguous) ^ access_strided;
+
+    pointer_= reinterpret_cast<AccessType const *>(ref.data()) +
+      access_contiguous + access_strided * stride_;
+
+  }
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
+    // Host-callable because the HOST_DEVICE add_tile_offset() calls it; offset is in elements.
+    byte_offset_ += offset * sizeof(Element);
+
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
+    // Offset in elements: stride_ counts AccessType units, hence the kElementsPerAccess factor.
+    int offset = 
+      (tile_offset.strided() * InstructionShape::kStrided) * stride_ * kElementsPerAccess + 
+      tile_offset.contiguous() * Shape::kContiguous;
+
+    add_pointer_offset(offset);
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator++() {
+
+    add_tile_offset({0, 1});
+
+    return *this;
+  }
+
+  /// Advances the iterator along the opposite of the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator--() {
+    
+    add_tile_offset({0, -1});
+
+    return *this;
+  }
+
+  ///< advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  ///< advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
+    add_tile_offset(-tile_offset);
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+
+    load_with_byte_offset(frag, 0);
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_HOST_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset in units of bytes
+      Index byte_offset) const {
+    // Host-callable because the HOST_DEVICE load(frag) calls it.
+    AccessType *fetch_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < Policy::Iterations::kStrided; ++s) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < Policy::Iterations::kContiguous; ++c) {
+
+        int access_idx = c + s * Policy::Iterations::kContiguous;
+
+        AccessType const *source_ptr = pointer_ +
+            Policy::Delta::kContiguous * c +
+            Policy::Delta::kStrided * s * stride_;
+
+        char const *source_byte_ptr = reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;
+
+        AccessType const *source = reinterpret_cast<AccessType const *>(source_byte_ptr);
+
+        fetch_ptr[access_idx] = *source;
+      }
+    }
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset
+      Index pointer_offset) const {
+
+    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset) const {
+
+    load_with_byte_offset(frag, tile_offset, 0);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// loads a tile with a logical offset AND a pointer offset
+      Index pointer_offset) const {
+
+    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// loads a tile with a logical offset AND a pointer offset
+      Index byte_offset) const {
+    // AccessType units: divide by this class's kElementsPerAccess so the result matches add_tile_offset().
+    Index pointer_offset = 
+      tile_offset.contiguous() * Shape::kContiguous / kElementsPerAccess + 
+      tile_offset.strided() * InstructionShape::kStrided * stride_;
+
+    byte_offset += sizeof(AccessType) * pointer_offset;
+
+    load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    // No-op: k_group is ignored by this iterator.
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Crosswise shared memory layout
+// Warp-level iterators for complex<float>*complex<float> + complex<float> => complex<float>
+// The underlying iterators are similar to that for f64*f64 + f64 = f64 
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// This tile iterator is specialized for loading 128b vectors of 64b elements.
+///
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: PitchLinearShape)
+    typename Shape_,
+    /// Identifies A or B multiplicand
+    Operand Operand_,
+    /// Shape of one matrix product operation (concept: PitchLinearShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions)
+    int OpDelta_,
+    /// Number of partitions along K dimension
+    int PartitionsK_>
+class MmaTensorOpMultiplicandTileIterator<
+    Shape_, Operand_, complex<float>,
+    cutlass::layout::TensorOpMultiplicand64bCrosswise,
+    InstructionShape_, OpDelta_, 32, PartitionsK_> {
+ public:
+
+  /// Shape of tile to load (concept: PitchLinearShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand_;
+
+  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
+    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
+
+  static_assert(!(Shape::kContiguous % 4) && !(Shape::kStrided % 16), "Divisibility.");
+
+  static_assert(sizeof_bits<complex<float>>::value == 64, "This is specialized for 64b accesses.");
+
+  /// Element type
+  using Element = complex<float>;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::TensorOpMultiplicand64bCrosswise;
+
+  /// Shape of one matrix product operation (concept: PitchLinearShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+  static int const kOpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// Number of partitions along K dimension
+  static int const kPartitionsK = PartitionsK_;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Stride index type
+  using StrideIndex = typename TensorRef::Layout::Stride::Index;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Load two elements per access
+  static int const kElementsPerAccess = 2;
+
+  /// Policy defining internal details of tile iterator
+  struct Policy {
+
+    /// Shape of one access
+    using Delta = layout::PitchLinearShape<4, 16>;
+
+    /// Number of iterations to load
+    using Iterations = layout::PitchLinearShape<
+      InstructionShape::kContiguous / Delta::kContiguous,
+      Shape::kStrided / Delta::kStrided
+    >;
+
+  };
+
+private:
+
+  /// Not working on this feature at the moment.
+  static_assert(kOpDelta == 1,
+    "Alternative arrangements not supported at present.");
+
+  /// Pointer type used for accesses
+  using AccessType = AlignedArray<Element, kElementsPerAccess, 16>;
+
+public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+ using Fragment =
+     Array<Element, Shape::kStrided * InstructionShape::kContiguous / kThreads>;
+
+private:
+
+  /// Layout object storing stride values
+  StrideIndex stride_;
+
+  /// Shared memory base pointers - not advanced
+  AccessType const *pointer_;
+
+  /// Byte offset incremented as iterator advances
+  Index byte_offset_;
+
+  /// Internal counter for tracking K-group
+  Index k_group_idx_;
+
+public:
+  
+  /// Default ctor constructs null iterator (every member is given a defined value)
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator(): stride_(0), pointer_(nullptr), byte_offset_(0), k_group_idx_(0) { }
+
+  /// Constructor from TensorRef
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator(
+    TensorRef const &ref, 
+    int lane_id
+  ):
+    stride_(ref.stride(0) / kElementsPerAccess), byte_offset_(0),
+    k_group_idx_(0) {
+    // Lanes are split 8 per strided index; the per-lane displacement is kept in byte_offset_, not pointer_.
+    int access_strided = lane_id / 8;
+    int access_contiguous = (lane_id  % 8);
+
+    byte_offset_ = (access_contiguous + access_strided * stride_) * sizeof(AccessType);
+
+    pointer_= reinterpret_cast<AccessType const *>(ref.data());
+  }
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
+    // offset is in elements; pointer_ steps in AccessType units of kElementsPerAccess elements.
+    pointer_ += offset / kElementsPerAccess;
+
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
+    int offset = (tile_offset.contiguous() * InstructionShape::kContiguous) *
+                     stride_ * kElementsPerAccess +
+                 tile_offset.strided() * Shape::kStrided;
+    // Element-granular offset; add_pointer_offset() divides it by kElementsPerAccess.
+    add_pointer_offset(offset);
+    // Unlike operator++(), the 0x40 bit of byte_offset_ is left untouched here.
+
+    return *this;
+  }
+
+  /// Same as add_tile_offset(), then toggles bit 0x40 of the byte offset when the tracked k-group index is odd
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_tile_offset_negative(TensorCoord const &tile_offset) {
+
+    add_tile_offset(tile_offset);
+
+    if (k_group_idx_ & 1)
+      byte_offset_ ^= 0x40;
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator++() {
+
+    pointer_ += stride_ * InstructionShape::kContiguous;
+    
+    // toggle bit 0x40 of the per-lane byte offset on every step
+    byte_offset_ ^= 0x40;
+
+    ++k_group_idx_;
+
+    return *this;
+  }
+
+  ///< advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+
+    load_with_byte_offset(frag, 0);
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_HOST_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset in units of bytes
+      Index byte_offset) const {
+    // Host-callable because the HOST_DEVICE load(frag) calls it.
+    AccessType *fetch_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int c = 0; c < Policy::Iterations::kContiguous; ++c) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int s = 0; s < Policy::Iterations::kStrided; ++s) {
+
+        int access_idx = c * Policy::Iterations::kStrided + s;
+
+        AccessType const *source_ptr = pointer_ +
+            Policy::Delta::kContiguous * c * stride_ +
+            Policy::Delta::kStrided * s / kElementsPerAccess;
+
+        char const *source_byte_ptr = reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;
+
+        AccessType const *source = reinterpret_cast<AccessType const *>(source_byte_ptr);
+
+        fetch_ptr[access_idx] = *source;
+      }
+    }
+
+    Element *exchange_ptr = reinterpret_cast<Element *>(&frag);
+
+    // exchange on 64b granularity only for fragments held in k=8/2 to k=8 
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = Fragment::kElements/2; i < Fragment::kElements; i += 2) {
+      Element tmp = exchange_ptr[i];
+      exchange_ptr[i] = exchange_ptr[i + 1];
+      exchange_ptr[i + 1] = tmp;
+    }
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset
+      Index pointer_offset) const {
+
+    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset) const {
+
+    load_with_byte_offset(frag, tile_offset, 0);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// loads a tile with a logical offset AND a pointer offset
+      Index pointer_offset) const {
+
+    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// loads a tile with a logical offset AND a pointer offset
+      Index byte_offset) const {
+    Index pointer_offset = tile_offset.contiguous() *
+                               InstructionShape::kContiguous * stride_ +
+                           tile_offset.strided() * Shape::kStrided /
+                               kElementsPerAccess;  // AccessType units, same split as add_tile_offset()
+
+    byte_offset += sizeof(AccessType) * pointer_offset;
+
+    load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    k_group_idx_ = k_group;
+  }
+};
+
+} // namespace warp
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h
new file mode 100755
index 000000000..5a02417aa
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h
@@ -0,0 +1,642 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing warp-level matrix multiply-accumulate operations targeting
+      Tensor Cores.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/array.h"
+#include "cutlass/complex.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/arch/memory_sm75.h"
+#include "cutlass/arch/mma_sm75.h"
+#include "cutlass/arch/mma_sm80.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/warp/mma.h"
+
+#include "cutlass/gemm/warp/mma_tensor_op_policy.h"
+#include "cutlass/gemm/warp/mma_tensor_op.h"
+
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h"
+#include "cutlass/gemm/warp/mma_gaussian_complex_tensor_op_tile_iterator_sm80.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace warp {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape_,
+  /// Data type of A elements (the specializations in this file match complex<T> here)
+  typename RealElementA,
+  /// Layout of A matrix (concept: MatrixLayout)
+  typename LayoutA_,
+  /// Data type of B elements (the specializations in this file match complex<T> here)
+  typename RealElementB,
+  /// Layout of B matrix (concept: MatrixLayout)
+  typename LayoutB_,
+  /// Element type of C matrix (the specializations in this file match complex<T> here)
+  typename RealElementC,
+  /// Layout of C matrix (concept: MatrixLayout)
+  typename LayoutC_,
+  /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
+  typename Policy_,
+  /// Complex transform on A operand
+  ComplexTransform TransformA = ComplexTransform::kNone,
+  /// Complex transform on B operand
+  ComplexTransform TransformB = ComplexTransform::kNone,
+  /// Whether source operands need more than one element per arch-level mma (true selects the generalized specialization)
+  bool GeneralizedOperatorElements = false,
+  /// Used for partial specialization
+  typename Enable = bool
+>
+class MmaGaussianComplexTensorOp;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for complex*complex+complex => complex using real-valued TensorOps whose A and B operand fragments hold exactly one element (see the static_asserts in operator())
+template <
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape_,
+  /// Real-valued type underlying the complex A elements
+  typename RealElementA,
+  /// Layout of A matrix (concept: MatrixLayout)
+  typename LayoutA_,
+  /// Real-valued type underlying the complex B elements
+  typename RealElementB,
+  /// Layout of B matrix (concept: MatrixLayout)
+  typename LayoutB_,
+  /// Real-valued type underlying the complex C elements
+  typename RealElementC,
+  /// Layout of C matrix (concept: MatrixLayout)
+  typename LayoutC_,
+  /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
+  typename Policy_,
+  /// Complex transform on A operand
+  ComplexTransform TransformA,
+  /// Complex transform on B operand
+  ComplexTransform TransformB
+>
+class MmaGaussianComplexTensorOp<
+  Shape_, 
+  complex<RealElementA>, 
+  LayoutA_, 
+  complex<RealElementB>,
+  LayoutB_,
+  complex<RealElementC>,
+  LayoutC_,
+  Policy_,
+  TransformA,
+  TransformB>  {
+public:
+  /// Shape of warp-level matrix operation (concept: GemmShape)
+  using Shape = Shape_;
+
+  /// Data type of multiplicand A
+  using ElementA = complex<RealElementA>;
+
+  /// Layout of multiplicand A
+  using LayoutA = LayoutA_;
+
+  /// Data type of multiplicand B
+  using ElementB = complex<RealElementB>;
+
+  /// Layout of multiplicand B
+  using LayoutB = LayoutB_;
+
+  /// Data type of accumulator matrix C
+  using ElementC = complex<RealElementC>;
+
+  /// Layout of accumulator matrix C
+  using LayoutC = LayoutC_;
+
+  /// Policy describing the warp-level MmaTensorOp (concept: MmaTensorOp policy)
+  using Policy = Policy_;
+
+  /// Underlying matrix multiply operator (concept: arch::Mma)
+  using ArchMmaOperator = typename Policy::Operator;
+
+  /// Shape of underlying instruction
+  using InstructionShape = typename ArchMmaOperator::Shape;
+
+  /// Underlying arch tag
+  using ArchTag = typename ArchMmaOperator::ArchTag;
+
+  /// Indicates class of matrix operator
+  using OperatorClass = arch::OpClassTensorOp;
+
+  /// Indicates math operator (Gaussian complex multiply-add)
+  using MathOperator = arch::OpMultiplyAddGaussianComplex;
+  
+  /// Complex transform on A operand
+  static ComplexTransform const kTransformA = TransformA;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB = TransformB;
+
+
+  /// Number of threads participating in warp-level matrix product
+  static int const kThreadCount = 32;
+
+public:
+
+  /// Iterates over the A operand in memory
+  using IteratorA = MmaTensorOpMultiplicandTileIterator<
+    MatrixShape<Shape::kM, Shape::kK>,
+    Operand::kA,
+    ElementA,
+    LayoutA,
+    MatrixShape<ArchMmaOperator::Shape::kM, ArchMmaOperator::Shape::kK>,
+    Policy::OpDelta::kRow,
+    32,
+    1
+  >;
+
+  /// Storage for A tile
+  using FragmentA = typename IteratorA::Fragment;
+
+  /// Storage for transformed A tile (same type as FragmentA)
+  using TransformedFragmentA = FragmentA;
+
+  /// Iterates over the B operand in memory
+  using IteratorB = MmaTensorOpMultiplicandTileIterator<
+    MatrixShape<Shape::kK, Shape::kN>,
+    Operand::kB,
+    ElementB,
+    LayoutB,
+    MatrixShape<ArchMmaOperator::Shape::kK, ArchMmaOperator::Shape::kN>,
+    Policy::OpDelta::kColumn,
+    32,
+    1
+  >;
+
+  /// Storage for B tile
+  using FragmentB = typename IteratorB::Fragment;
+
+  /// Storage for transformed B tile (same type as FragmentB)
+  using TransformedFragmentB = FragmentB;
+
+  static_assert(
+    !(Shape::kM % ArchMmaOperator::Shape::kM) && 
+    !(Shape::kN % ArchMmaOperator::Shape::kN),
+    "Shape of warp-level Mma must be divisible by operator shape.");
+
+  /// Number of arch-level mma tiles per warp tile along M (kRow) and N (kColumn)
+  using MmaIterations = MatrixShape<
+    Shape::kM / ArchMmaOperator::Shape::kM,
+    Shape::kN / ArchMmaOperator::Shape::kN
+  >;
+
+  /// Iterates over the C operand in memory
+  using IteratorC = MmaTensorOpGaussianComplexAccumulatorTileIterator<
+     MatrixShape<Shape::kM, Shape::kN>, 
+     ElementC, 
+     LayoutC,
+     typename ArchMmaOperator::Shape, 
+     typename Policy::OpDelta>;
+
+  /// Storage for C tile, the accumulator. Note, regardless of multiplicand type, this
+  /// storage arrangement is to be considered 'gaussian complex' in the sense that the accumulation is
+  /// done in three parts, namely part1, part2, and part3. Parts 1, 2, and 3 are stored consecutively
+  /// in IteratorC::Fragment. This matches the structure of Tensor Cores, which are always real-valued matrix multiplies.
+  using FragmentC = typename IteratorC::Fragment;
+
+  static_assert(
+    FragmentC::kElements == 3 * MmaIterations::kCount * ArchMmaOperator::FragmentC::kElements,
+    "Unexpected gaussian complex fragment length.");
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Underlying real-valued matrix multiply operator (concept: arch::Mma)
+  ArchMmaOperator mma;
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Ctor
+  CUTLASS_DEVICE
+  MmaGaussianComplexTensorOp() {}
+
+  /// Performs a warp-level matrix multiply-accumulate operation: D starts as a copy of C and the three real-valued partial products are accumulated into it
+  CUTLASS_DEVICE
+  void operator()(
+    FragmentC &D, 
+    FragmentA const &A, 
+    FragmentB const &B, 
+    FragmentC const &C
+  ) const {
+
+    // Alias types for underlying real-valued matrix multiply operator
+    using MmaOperandA = typename ArchMmaOperator::FragmentA;
+    using MmaOperandB = typename ArchMmaOperator::FragmentB;
+    using MmaOperandC = typename ArchMmaOperator::FragmentC;
+
+    static_assert(MmaOperandA::kElements == 1, 
+      "This implementation only supports math instructions in which exactly one element is needed for the A operand."
+      "We can geneneralize later.");
+
+    static_assert(MmaOperandB::kElements == 1, 
+      "This implementation only supports math instructions in which exactly one element is needed for the B operand."
+      "We can geneneralize later.");
+
+    D = C;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int m = 0; m < MmaIterations::kRow; ++m) {
+
+      // part1 += (a.real() + a.imag()) * b.real(), with a.imag() negated when A is conjugated
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = 0; n < MmaIterations::kColumn; ++n) {
+
+        // Pack operands together. This may result in actual MOVs
+        MmaOperandA operand_Asum;
+        MmaOperandB operand_Br;
+
+        operand_Asum[0] = A[m].real() + ((kTransformA == ComplexTransform::kConjugate) ? -A[m].imag() : +A[m].imag());
+        operand_Br[0] = B[n].real();
+
+        // accumulator part1: mma tile (m, n), tiles indexed with m varying fastest
+        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
+          (m + n * MmaIterations::kRow);
+
+        mma(*accum, operand_Asum, operand_Br, *accum);
+      }
+
+      // part2 += -a.real() * (b.real() - b.imag()), with b.imag() negated when B is conjugated
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = MmaIterations::kColumn - 1; n >= 0; --n) {
+        // NOTE(review): unlike the part1/part3 loops, n is walked from last to first here; the reason is not visible in this file - confirm before changing the order
+        // Pack operands together. This may result in actual MOVs
+        MmaOperandA operand_Ar;
+        MmaOperandB operand_Bdiff;
+
+        operand_Ar[0] = -A[m].real();
+        operand_Bdiff[0] = B[n].real() - ((kTransformB == ComplexTransform::kConjugate) ? -B[n].imag() : +B[n].imag());
+
+        // accumulator part2: same tile index, MmaIterations::kCount tiles past part1
+        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
+          (m + n * MmaIterations::kRow) + MmaIterations::kCount;
+
+        mma(*accum, operand_Ar, operand_Bdiff, *accum);
+      }
+
+      // part3 += a.imag() * (b.real() + b.imag()), with the imaginary parts negated for conjugated operands
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = 0; n < MmaIterations::kColumn; ++n) {
+
+        // Pack operands together. This may result in actual MOVs
+        MmaOperandA operand_Ai;
+        MmaOperandB operand_Bsum;
+
+        operand_Ai[0] = (kTransformA == ComplexTransform::kConjugate) ? -A[m].imag() : +A[m].imag();
+        operand_Bsum[0] = B[n].real() + ((kTransformB == ComplexTransform::kConjugate) ? -B[n].imag() : +B[n].imag());
+
+        // accumulator part3: same tile index, 2 * MmaIterations::kCount tiles past part1
+        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
+          (m + n * MmaIterations::kRow) + 2 * MmaIterations::kCount;
+
+        mma(*accum, operand_Ai, operand_Bsum, *accum);
+      }
+    }
+  }
+
+  /// Transform the mma operands to the required types (a plain copy here: the transformed fragment types alias FragmentA/FragmentB)
+  CUTLASS_DEVICE
+  void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B,
+                 FragmentA const &A, FragmentB const &B) const {
+    dst_A = A;
+    dst_B = B;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for complex*complex+complex => complex using real-valued TensorOps, selected by GeneralizedOperatorElements == true: operand fragments of the arch-level mma may hold any number of elements
+template <
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape_,
+  /// Real-valued type underlying the complex A elements
+  typename RealElementA,
+  /// Layout of A matrix (concept: MatrixLayout)
+  typename LayoutA_,
+  /// Real-valued type underlying the complex B elements
+  typename RealElementB,
+  /// Layout of B matrix (concept: MatrixLayout)
+  typename LayoutB_,
+  /// Real-valued type underlying the complex C elements
+  typename RealElementC,
+  /// Layout of C matrix (concept: MatrixLayout)
+  typename LayoutC_,
+  /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
+  typename Policy_,
+  /// Complex transform on A operand
+  ComplexTransform TransformA,
+  /// Complex transform on B operand
+  ComplexTransform TransformB
+>
+class MmaGaussianComplexTensorOp<
+  Shape_, 
+  complex<RealElementA>, 
+  LayoutA_, 
+  complex<RealElementB>,
+  LayoutB_,
+  complex<RealElementC>,
+  LayoutC_,
+  Policy_,
+  TransformA,
+  TransformB,
+  true>  {
+public:
+  /// Shape of warp-level matrix operation (concept: GemmShape)
+  using Shape = Shape_;
+
+  /// Data type of multiplicand A
+  using ElementA = complex<RealElementA>;
+
+  /// Layout of multiplicand A
+  using LayoutA = LayoutA_;
+
+  /// Data type of multiplicand B
+  using ElementB = complex<RealElementB>;
+
+  /// Layout of multiplicand B
+  using LayoutB = LayoutB_;
+
+  /// Data type of accumulator matrix C
+  using ElementC = complex<RealElementC>;
+
+  /// Layout of accumulator matrix C
+  using LayoutC = LayoutC_;
+
+  /// Policy describing the warp-level MmaTensorOp (concept: MmaTensorOp policy)
+  using Policy = Policy_;
+
+  /// Underlying matrix multiply operator (concept: arch::Mma)
+  using ArchMmaOperator = typename Policy::Operator;
+
+  /// Shape of underlying instruction
+  using InstructionShape = typename ArchMmaOperator::Shape;
+
+  /// Underlying arch tag
+  using ArchTag = typename ArchMmaOperator::ArchTag;
+
+  /// Indicates class of matrix operator
+  using OperatorClass = arch::OpClassTensorOp;
+
+  /// Indicates math operator (Gaussian complex multiply-add)
+  using MathOperator = arch::OpMultiplyAddGaussianComplex;
+  
+  /// Complex transform on A operand
+  static ComplexTransform const kTransformA = TransformA;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB = TransformB;
+
+
+  /// Number of threads participating in warp-level matrix product
+  static int const kThreadCount = 32;
+
+public:
+
+  /// Iterates over the A operand in memory
+  using IteratorA = MmaTensorOpMultiplicandTileIterator<
+    MatrixShape<Shape::kM, Shape::kK>,
+    Operand::kA,
+    ElementA,
+    LayoutA,
+    MatrixShape<ArchMmaOperator::Shape::kM, ArchMmaOperator::Shape::kK>,
+    Policy::OpDelta::kRow,
+    32,
+    1
+  >;
+
+  /// Storage for A tile
+  using FragmentA = typename IteratorA::Fragment;
+
+  /// Storage for transformed A tile (same type as FragmentA)
+  using TransformedFragmentA = FragmentA;
+
+  /// Iterates over the B operand in memory
+  using IteratorB = MmaTensorOpMultiplicandTileIterator<
+    MatrixShape<Shape::kK, Shape::kN>,
+    Operand::kB,
+    ElementB,
+    LayoutB,
+    MatrixShape<ArchMmaOperator::Shape::kK, ArchMmaOperator::Shape::kN>,
+    Policy::OpDelta::kColumn,
+    32,
+    1
+  >;
+
+  /// Storage for B tile
+  using FragmentB = typename IteratorB::Fragment;
+
+  /// Storage for transformed B tile (same type as FragmentB)
+  using TransformedFragmentB = FragmentB;
+
+  static_assert(
+    !(Shape::kM % ArchMmaOperator::Shape::kM) && 
+    !(Shape::kN % ArchMmaOperator::Shape::kN),
+    "Shape of warp-level Mma must be divisible by operator shape.");
+
+  /// Number of arch-level mma tiles per warp tile along M (kRow) and N (kColumn)
+  using MmaIterations = MatrixShape<
+    Shape::kM / ArchMmaOperator::Shape::kM,
+    Shape::kN / ArchMmaOperator::Shape::kN
+  >;
+
+  /// Iterates over the C operand in memory
+  using IteratorC = MmaTensorOpGaussianComplexAccumulatorTileIterator<
+     MatrixShape<Shape::kM, Shape::kN>, 
+     ElementC, 
+     LayoutC,
+     typename ArchMmaOperator::Shape, 
+     typename Policy::OpDelta>;
+
+  /// Storage for C tile, the accumulator. Note, regardless of multiplicand type, this
+  /// storage arrangement is to be considered 'gaussian complex' in the sense that the accumulation is
+  /// done in three parts, namely part1, part2, and part3. Parts 1, 2, and 3 are stored consecutively
+  /// in IteratorC::Fragment. This matches the structure of Tensor Cores, which are always real-valued matrix multiplies.
+  using FragmentC = typename IteratorC::Fragment;
+
+  static_assert(
+    FragmentC::kElements == 3 * MmaIterations::kCount * ArchMmaOperator::FragmentC::kElements,
+    "Unexpected gaussian complex fragment length.");
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Underlying real-valued matrix multiply operator (concept: arch::Mma)
+  ArchMmaOperator mma;
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Ctor
+  CUTLASS_DEVICE
+  MmaGaussianComplexTensorOp() {}
+
+  /// Performs a warp-level matrix multiply-accumulate operation: D starts as a copy of C and the three real-valued partial products are accumulated into it
+  CUTLASS_DEVICE
+  void operator()(
+    FragmentC &D, 
+    FragmentA const &A, 
+    FragmentB const &B, 
+    FragmentC const &C
+  ) const {
+
+    // Alias types for underlying real-valued matrix multiply operator
+    using MmaOperandA = typename ArchMmaOperator::FragmentA;
+    using MmaOperandB = typename ArchMmaOperator::FragmentB;
+    using MmaOperandC = typename ArchMmaOperator::FragmentC;
+
+    D = C;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int m = 0; m < MmaIterations::kRow; ++m) {
+
+      // part1 += (a.real() + a.imag()) * b.real(), with a.imag() negated when A is conjugated
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = 0; n < MmaIterations::kColumn; ++n) {
+
+        // Pack operands together. This may result in actual MOVs
+        MmaOperandA operand_Asum;
+        MmaOperandB operand_Br;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int mk = 0; mk < MmaOperandA::kElements; ++mk)
+          operand_Asum[mk] = A[m*MmaOperandA::kElements + mk].real() + ((kTransformA == ComplexTransform::kConjugate) ?
+                            -A[m*MmaOperandA::kElements + mk].imag() : +A[m*MmaOperandA::kElements + mk].imag());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int nk = 0; nk < MmaOperandB::kElements; ++nk)
+          operand_Br[nk] = B[n*MmaOperandB::kElements + nk].real();
+
+        // accumulator part1: mma tile (m, n), tiles indexed with m varying fastest
+        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
+          (m + n * MmaIterations::kRow);
+
+        mma(*accum, operand_Asum, operand_Br, *accum);
+      }
+
+      // part2 += -a.real() * (b.real() - b.imag()), with b.imag() negated when B is conjugated
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = MmaIterations::kColumn - 1; n >= 0; --n) {
+        // NOTE(review): unlike the part1/part3 loops, n is walked from last to first here; the reason is not visible in this file - confirm before changing the order
+        // Pack operands together. This may result in actual MOVs
+        MmaOperandA operand_Ar;
+        MmaOperandB operand_Bdiff;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int mk = 0; mk < MmaOperandA::kElements; ++mk)
+          operand_Ar[mk] = -A[m*MmaOperandA::kElements + mk].real();
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int nk = 0; nk < MmaOperandB::kElements; ++nk)
+          operand_Bdiff[nk] = B[n*MmaOperandB::kElements + nk].real() - ((kTransformB == ComplexTransform::kConjugate) ?
+                              -B[n*MmaOperandB::kElements + nk].imag() : +B[n*MmaOperandB::kElements + nk].imag());
+
+        // accumulator part2: same tile index, MmaIterations::kCount tiles past part1
+        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
+          (m + n * MmaIterations::kRow) + MmaIterations::kCount;
+
+        mma(*accum, operand_Ar, operand_Bdiff, *accum);
+      }
+
+      // part3 += a.imag() * (b.real() + b.imag()), with the imaginary parts negated for conjugated operands
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = 0; n < MmaIterations::kColumn; ++n) {
+
+        // Pack operands together. This may result in actual MOVs
+        MmaOperandA operand_Ai;
+        MmaOperandB operand_Bsum;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int mk = 0; mk < MmaOperandA::kElements; ++mk)
+          operand_Ai[mk] = (kTransformA == ComplexTransform::kConjugate) ?
+                           -A[m*MmaOperandA::kElements + mk].imag() : +A[m*MmaOperandA::kElements + mk].imag();
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int nk = 0; nk < MmaOperandB::kElements; ++nk)
+          operand_Bsum[nk] = B[n*MmaOperandB::kElements + nk].real() + ((kTransformB == ComplexTransform::kConjugate) ?
+                             -B[n*MmaOperandB::kElements + nk].imag() : +B[n*MmaOperandB::kElements + nk].imag());
+
+        // accumulator part3: same tile index, 2 * MmaIterations::kCount tiles past part1
+        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
+          (m + n * MmaIterations::kRow) + 2 * MmaIterations::kCount;
+
+        mma(*accum, operand_Ai, operand_Bsum, *accum);
+      }
+    }
+  }
+
+  /// Transform the mma operands to the required types (a plain copy here: the transformed fragment types alias FragmentA/FragmentB)
+  CUTLASS_DEVICE
+  void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B,
+                 FragmentA const &A, FragmentB const &B) const {
+    dst_A = A;
+    dst_B = B;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op_tile_iterator_sm80.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op_tile_iterator_sm80.h
new file mode 100755
index 000000000..fe785f8d3
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op_tile_iterator_sm80.h
@@ -0,0 +1,390 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines iterators used by warp-level matrix multiply operations targeting Tensor Cores.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/array.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/arch/memory_sm75.h"
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor_op_multiplicand_sm80.h"
+#include "cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h"
+
+#include "cutlass/platform/platform.h"
+#include "cutlass/fast_math.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace warp {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Element type (the specialization that follows in this file matches complex<T>)
+    typename Element_,
+    /// Layout of operand in memory (the specialization that follows in this file matches RowMajor)
+    typename Layout_,
+    /// Shape of one matrix product operation (the specialization reads its kM and kN members)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions, concept: MatrixShape)
+    typename OpDelta_>
+class MmaTensorOpGaussianComplexAccumulatorTileIterator;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// 
+/// Partial specialization for complex<T>
+///
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Data type of underlying field of reals.
+    typename RealElement,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions, concept: MatrixShape)
+    typename OpDelta_>
+class MmaTensorOpGaussianComplexAccumulatorTileIterator<
+    Shape_, complex<RealElement>, cutlass::layout::RowMajor, InstructionShape_, OpDelta_> {
+ public:
+
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand::kC;
+
+  /// Element type
+  using Element = complex<RealElement>;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::RowMajor;
+
+  /// Shape of one matrix product operation (concept: MatrixShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+  using OpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Compile-time policy of the iterator (shape checks and mma tile counts) - made public to enable introspection
+  struct Policy {
+    static_assert(
+        !(Shape::kRow % InstructionShape::kM) &&
+            !(Shape::kColumn % InstructionShape::kN),
+        "Shape of warp-level Mma must be divisible by operator shape.");
+    // The layout's coordinate type must be MatrixCoord for this iterator.
+    static_assert(platform::is_same<TensorCoord, MatrixCoord>::value,
+      "Layouts must be defined for logical MatrixCoord coordinate space.");
+
+    /// Number of mma operations per tile along rows (kRow) and columns (kColumn)
+    using MmaIterations = MatrixShape<Shape::kRow / InstructionShape::kM,
+                                      Shape::kColumn / InstructionShape::kN>;
+  };
+
+private:
+
+  // Assume accumulator tile is an arrangement of 8-by-8 tiles replicated over the entire
+  // shape, with each quad mapped to one row and each thread mapped to 1/4 of the elements
+  // of that row. The accumulators within one row are assumed to be consecutive.
+ static int const kElementsPerAccess = InstructionShape::kN / 4;
+ static int const kRowsPerTile = 8;
+ static int const kAccumulatorRows = InstructionShape::kM / kRowsPerTile;
+
+public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile. It is assumed that the accumulators
+  /// are stored in a gaussian complex arrangement with parts 1, 2, and 3 as entirely contiguous
+  /// arranged as [part1, part2, part3]
+  using Fragment = Array<RealElement, (Shape::kCount / kThreads) * 3>;
+
+  static int const kPart1Index = (Shape::kCount / kThreads) * 0;
+  static int const kPart2Index = (Shape::kCount / kThreads) * 1;
+  static int const kPart3Index = (Shape::kCount / kThreads) * 2;
+
+private:
+
+  /// Reference to output tensor
+  TensorRef ref_;
+
+public:
+  
+  /// Default ctor constructs a null iterator: ref_ is default-constructed and no lane offset is applied
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpGaussianComplexAccumulatorTileIterator() { }
+
+  /// Builds the iterator from a TensorRef and moves it to this lane's first element:
+  /// lane_id >> 2 selects the row, lane_id & 3 selects the column group within that row.
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpGaussianComplexAccumulatorTileIterator(
+    TensorRef const &ref,
+    int lane_id
+  ): ref_(ref) {
+
+    // Four lanes form a quad: the upper bits pick the quad, the low two bits the lane inside it.
+    int const row = lane_id >> 2;
+    int const column_group = lane_id & 3;
+
+    // Each lane starts kElementsPerAccess columns after the previous lane of its quad.
+    ref_.add_coord_offset(MatrixCoord(row, column_group * kElementsPerAccess));
+  }
+
+  /// Forwards `offset` to ref_.add_pointer_offset() to advance through memory; returns *this so calls can be chained
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpGaussianComplexAccumulatorTileIterator &add_pointer_offset(LongIndex offset) {
+    ref_.add_pointer_offset(offset);
+    return *this;
+  }
+
+  /// Moves the iterator by a whole number of Shape::kRow x Shape::kColumn tiles.
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpGaussianComplexAccumulatorTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
+    // Scale the tile coordinate by the tile extent to obtain an element coordinate.
+    auto const element_offset = tile_offset * make_Coord(Shape::kRow, Shape::kColumn);
+    ref_.add_coord_offset(element_offset);
+    return *this;
+  }
+
+  /// Pre-increment: a deliberate no-op for this iterator; it only returns *this
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpGaussianComplexAccumulatorTileIterator & operator++() {
+    // deliberate no-op
+    return *this;
+  }
+
+  /// Pre-decrement: a deliberate no-op for this iterator; it only returns *this
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpGaussianComplexAccumulatorTileIterator & operator--() {
+    // deliberate no-op
+    return *this;
+  }
+
+  /// Advances by whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpGaussianComplexAccumulatorTileIterator & operator+=(TensorCoord const &tile_offset) {
+    // add_tile_offset() returns *this, so its result is forwarded directly.
+    return add_tile_offset(tile_offset);
+  }
+
+  /// Moves back by whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpGaussianComplexAccumulatorTileIterator & operator-=(TensorCoord const &tile_offset) {
+    // Same as advancing by the negated offset; add_tile_offset() returns *this.
+    return add_tile_offset(-tile_offset);
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator (pointer offset of zero).
+  CUTLASS_HOST_DEVICE  // NOTE(review): forwards to load_with_pointer_offset(), which is declared CUTLASS_DEVICE - confirm host use is not intended
+  void load(Fragment &frag) const {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Loads a fragment from memory with an additional pointer offset, storing each complex element as its three gaussian parts
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+    Fragment &frag,                             ///< fragment to load from the tensor
+    Index pointer_offset) const {               ///< linear offset passed to TensorRef::add_pointer_offset
+    // Work on a copy of ref_ so the iterator itself is left unchanged (this member is const).
+    TensorRef offset_ref(ref_);
+    offset_ref.add_pointer_offset(pointer_offset);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
+        // First fragment slot of mma tile (mma_m, mma_n); tiles are ordered with mma_m varying fastest.
+        int mma_accum_start = kAccumulatorRows * kElementsPerAccess * 
+          (mma_n * Policy::MmaIterations::kRow + mma_m);
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < kAccumulatorRows; ++row) {
+          CUTLASS_PRAGMA_UNROLL
+          for (int col = 0; col < kElementsPerAccess; ++col) {
+            int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
+                          row * kRowsPerTile;
+            int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col;
+            // Coordinates are relative to offset_ref.
+            Element z = offset_ref.at({accum_m, accum_n});
+            // Gaussian decomposition: part1 = re + im, part2 = -re, part3 = im.
+            frag[mma_accum_start + row * kElementsPerAccess + col + kPart1Index] = z.real() + z.imag();
+            frag[mma_accum_start + row * kElementsPerAccess + col + kPart2Index] = -z.real();
+            frag[mma_accum_start + row * kElementsPerAccess + col + kPart3Index] = z.imag();
+          }
+        }
+      }
+    }
+  }
+
+  /// Loads a fragment from memory with an additional offset expressed in bytes
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+    Fragment &frag,                             ///< fragment to load from the tensor
+    Index byte_offset) const {                  ///< linear offset in bytes; divided by sizeof(Element) before use
+    // The destination fragment must be forwarded too: without it no overload matches.
+    load_with_pointer_offset(frag, byte_offset / sizeof(Element));
+  }
+
+  /// Loads a fragment at a logical offset; forwards to the three-argument load() with a pointer offset of 0.
+  CUTLASS_DEVICE
+  void load(
+    Fragment &frag,                             ///< fragment to load from the tensor
+    TensorCoord const &tile_offset) const {     ///< loads a tile with a logical offset in units of whole tiles
+
+    load(frag, tile_offset, 0);
+  }
+
+  /// Loads a fragment from memory at a logical offset combined with a linear pointer offset.
+  CUTLASS_DEVICE
+  void load(
+    Fragment &frag,                             ///< fragment to load from the tensor
+    TensorCoord const &tile_offset,             ///< loads a tile with a logical offset in units of whole tiles
+    Index pointer_offset) const {               ///< loads a tile with a logical offset AND a pointer offset
+    // NOTE(review): tile_offset goes to ref_.offset() unscaled by the tile shape, unlike add_tile_offset() - confirm units.
+    load_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
+  }
+
+  /// Stores a fragment to memory at the iterator's current location (pointer offset of 0).
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag) const {
+    store_with_pointer_offset(frag, 0);
+  }
+
+  /// Stores a fragment to memory, displaced by an additional linear pointer offset
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(
+    Fragment const &frag,                       ///< fragment to store from the tensor
+    Index pointer_offset) const {               ///< store a tile with a linear offset
+
+    TensorRef shifted_ref(ref_);
+    shifted_ref.add_pointer_offset(pointer_offset);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int tile_n = 0; tile_n < Policy::MmaIterations::kColumn; ++tile_n) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int tile_m = 0; tile_m < Policy::MmaIterations::kRow; ++tile_m) {
+        // First fragment slot belonging to the (tile_m, tile_n) instruction tile.
+        int tile_base = kAccumulatorRows * kElementsPerAccess *
+          (tile_n * Policy::MmaIterations::kRow + tile_m);
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int r = 0; r < kAccumulatorRows; ++r) {
+          CUTLASS_PRAGMA_UNROLL
+          for (int c = 0; c < kElementsPerAccess; ++c) {
+            int coord_m = tile_m * InstructionShape::kM * OpDelta::kRow + r * kRowsPerTile;
+            int coord_n = tile_n * InstructionShape::kN * OpDelta::kColumn + c;
+            int slot = tile_base + r * kElementsPerAccess + c;
+
+            // Recombine the three parts: real = part1 - part3, imag = part1 + part2.
+            Element value(frag[kPart1Index + slot] - frag[kPart3Index + slot],
+                          frag[kPart1Index + slot] + frag[kPart2Index + slot]);
+
+            shifted_ref.at({coord_m, coord_n}) = value;
+          }
+        }
+      }
+    }
+  }
+
+  /// Stores a fragment to memory with an additional offset expressed in bytes
+  CUTLASS_DEVICE
+  void store_with_byte_offset(
+    Fragment const &frag,                       ///< fragment to store from the tensor
+    Index byte_offset) const {                  ///< linear offset in bytes; divided by sizeof(Element) before use
+    // The source fragment must be forwarded too: without it no overload matches.
+    store_with_pointer_offset(frag, byte_offset / sizeof(Element));
+  }
+
+  /// Stores a fragment at a logical offset; forwards to the three-argument store() with a pointer offset of 0.
+  CUTLASS_DEVICE
+  void store(
+    Fragment const &frag,                       ///< fragment to store to the tensor (const, so read-only fragments are accepted)
+    TensorCoord const &tile_offset) const {     ///< stores a tile with a logical offset in units of whole tiles
+
+    store(frag, tile_offset, 0);
+  }
+
+  /// Stores a fragment to memory at a logical offset combined with a linear pointer offset.
+  CUTLASS_DEVICE
+  void store(
+      /// fragment to store to the tensor
+      Fragment const &frag,
+      /// logical offset; NOTE(review): passed to ref_.offset() unscaled by the tile shape - confirm units
+      TensorCoord const &tile_offset,
+      /// additional linear pointer offset added to the logical offset
+      Index pointer_offset) const {
+    store_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_mixed_input_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_mixed_input_tensor_op.h
new file mode 100755
index 000000000..f553fbde9
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_mixed_input_tensor_op.h
@@ -0,0 +1,566 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing warp-level matrix multiply-accumulate operations targeting
+      Tensor Cores.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/platform/platform.h"
+
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/arch/memory_sm75.h"
+#include "cutlass/arch/mma_sm75.h" 
+#include "cutlass/arch/mma_sm80.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/warp/mma.h"
+
+#include "cutlass/gemm/warp/mma_tensor_op_policy.h"
+
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h"
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace warp {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+////////////////////////////////////////////////////////////////////////////////
+// Shuffle registers for layout conversion
+////////////////////////////////////////////////////////////////////////////////
+template <
+  /// Element type for the operand in registers for the mma.sync
+  typename ElementMma_, 
+  /// Element type for the operand in shared memory for ldmatrix
+  typename ElementLoad_,
+  /// Number of mma.sync operations performed along rows or columns
+  int NumMmaInstructions,
+  /// Number of elements in warp fragment
+  int NumElementsInWarpFragment,
+  /// Number of elements in mma fragment
+  int NumElementsInMmaFragment,
+  /// Identifies A or B multiplicand
+  Operand Operand_,
+  /// SFINAE slot; the partial specializations fill it with an enable_if on the element widths
+  typename Enable = void >
+struct FragmentShuffler {
+  public:
+  using ElementMma = ElementMma_;
+  using ElementLoad = ElementLoad_;
+
+  static int const kNumMmaInstructions = NumMmaInstructions;
+  static int const kNumElementsInWarpFragment = NumElementsInWarpFragment;
+  static int const kNumElementsInMmaFragment = NumElementsInMmaFragment;
+  static Operand const kOperand = Operand_;
+
+  using WarpFragment = Array<ElementLoad, kNumElementsInWarpFragment>;
+  using MmaFragment = Array<ElementLoad, kNumElementsInMmaFragment>;
+  /// Primary template: no shuffle is performed, the warp fragment is returned unchanged
+  CUTLASS_DEVICE
+  WarpFragment operator()(WarpFragment const &src) {
+    return src;
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for `mma.sync` on 16b (F16/BF16) and `ldmatrix` on 8b (S8/U8)
+/// or for `mma.sync` on 8b (S8/U8) and `ldmatrix` on 4b (S4/U4)
+/// for operand A multiplicand going through upcasting (selected when the width ratio is 2).
+template <
+  /// Element type for the operand in registers for the mma.sync
+  typename ElementMma_, 
+  /// Element type for the operand in shared memory for ldmatrix
+  typename ElementLoad_,
+  /// Number of mma.sync operations performed along rows or columns
+  int NumMmaInstructions,
+  /// Number of elements in warp fragment
+  int NumElementsInWarpFragment,
+  /// Number of elements in mma fragment
+  int NumElementsInMmaFragment
+> 
+struct FragmentShuffler <ElementMma_, ElementLoad_,
+                         NumMmaInstructions, 
+                         NumElementsInWarpFragment, 
+                         NumElementsInMmaFragment,
+                         Operand::kA,
+                         typename platform::enable_if<(sizeof_bits<ElementMma_>::value /
+                                                 sizeof_bits<ElementLoad_>::value == 2)>::type> {
+public:
+  using ElementMma = ElementMma_;
+  using ElementLoad = ElementLoad_;
+
+  static int const kNumMmaInstructions = NumMmaInstructions;
+  static int const kNumElementsInWarpFragment = NumElementsInWarpFragment;
+  static int const kNumElementsInMmaFragment = NumElementsInMmaFragment;
+  static Operand const kOperand = Operand::kA;
+
+  using WarpFragment = Array<ElementLoad, kNumElementsInWarpFragment>;
+  using MmaFragment = Array<ElementLoad, kNumElementsInMmaFragment>;
+  // __byte_perm selectors: 0x5410 keeps the low 16 bits of both inputs, 0x7632 the high 16 bits
+  static uint32_t const kSelectBytesEvenThread = 0x5410;
+  static uint32_t const kSelectBytesOddThread = 0x7632;
+
+private:
+  int delta_up_;
+  int delta_down_;
+  int odd_even_lane_id_;
+  uint32_t byte_selector_;
+
+public:
+  CUTLASS_DEVICE
+  FragmentShuffler() {
+    int lane_id = cutlass::arch::LaneId();
+    delta_up_ = (lane_id & 1) + ((lane_id & 2) >> 1);
+    delta_down_ = 2 - delta_up_;
+    odd_even_lane_id_ = static_cast<int>(lane_id & 1);
+    byte_selector_ = odd_even_lane_id_ * kSelectBytesOddThread +
+                    (1 - odd_even_lane_id_) * kSelectBytesEvenThread;
+  }
+
+  CUTLASS_DEVICE
+  WarpFragment operator()(WarpFragment const &src) {
+    // Source and result are both viewed as arrays of per-instruction MmaFragment chunks.
+    WarpFragment result;
+    MmaFragment const* mma_frag_src_ptr = reinterpret_cast<MmaFragment const*>(&src);
+    MmaFragment* mma_frag_dst_ptr = reinterpret_cast<MmaFragment*>(&result);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int n = 0; n < kNumMmaInstructions; n++) {
+        // Two 32-bit words are read and two are written per mma fragment.
+        uint32_t const* src_ptr = reinterpret_cast<uint32_t const *>(&mma_frag_src_ptr[n]);
+        uint32_t *dst_ptr = reinterpret_cast<uint32_t *>(&mma_frag_dst_ptr[n]);
+
+        // Pull each word from the lanes delta_up_ below and delta_down_ above (full-warp mask)
+        uint32_t tmp0 = __shfl_up_sync(0xFFFFFFFF, src_ptr[0], delta_up_);
+        uint32_t tmp1 = __shfl_down_sync(0xFFFFFFFF, src_ptr[0], delta_down_);
+        uint32_t tmp2 = __shfl_up_sync(0xFFFFFFFF, src_ptr[1], delta_up_);
+        uint32_t tmp3 = __shfl_down_sync(0xFFFFFFFF, src_ptr[1], delta_down_);
+
+        // Reorder the data within the 32-bit word (4x8b) required for mma.sync
+        dst_ptr[0] = __byte_perm(tmp0, tmp2, byte_selector_);
+        dst_ptr[1] = __byte_perm(tmp1, tmp3, byte_selector_);
+    }
+
+    return result;
+  }
+
+};
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for `mma.sync` on 16b (F16/BF16) and `ldmatrix` on 8b (S8/U8)
+/// or for `mma.sync` on 8b (S8/U8) and `ldmatrix` on 4b (S4/U4)
+/// for operand B multiplicand going through upcasting (selected when the width ratio is 2).
+template <
+  /// Element type for the operand in registers for the mma.sync
+  typename ElementMma_, 
+  /// Element type for the operand in shared memory for ldmatrix
+  typename ElementLoad_,
+  /// Number of mma.sync operations performed along rows or columns
+  int NumMmaInstructions,
+  /// Number of elements in warp fragment
+  int NumElementsInWarpFragment,
+  /// Number of elements in mma fragment
+  int NumElementsInMmaFragment
+> 
+struct FragmentShuffler <ElementMma_, ElementLoad_,
+                         NumMmaInstructions, 
+                         NumElementsInWarpFragment, 
+                         NumElementsInMmaFragment,
+                         Operand::kB,
+                         typename platform::enable_if<(sizeof_bits<ElementMma_>::value /
+                                                 sizeof_bits<ElementLoad_>::value == 2)>::type> {
+public:
+  using ElementMma = ElementMma_;
+  using ElementLoad = ElementLoad_;
+
+  static int const kNumMmaInstructions = NumMmaInstructions;
+  static int const kNumElementsInWarpFragment = NumElementsInWarpFragment;
+  static int const kNumElementsInMmaFragment = NumElementsInMmaFragment;
+  static Operand const kOperand = Operand::kB;
+
+  using WarpFragment = Array<ElementLoad, kNumElementsInWarpFragment>;
+  using MmaFragment = Array<ElementLoad, kNumElementsInMmaFragment>;
+  // __byte_perm selectors: 0x5410 keeps the low 16 bits of both inputs, 0x7632 the high 16 bits
+  static uint32_t const kSelectBytesEvenThread = 0x5410;
+  static uint32_t const kSelectBytesOddThread = 0x7632;
+
+private:
+  int delta_up_;
+  int delta_down_;
+  int odd_even_lane_id_;
+  uint32_t byte_selector_;
+
+public:
+  CUTLASS_DEVICE
+  FragmentShuffler() {
+    int lane_id = cutlass::arch::LaneId();
+    delta_up_ = (lane_id & 1) + ((lane_id & 2) >> 1);
+    delta_down_ = 2 - delta_up_;
+    odd_even_lane_id_ = static_cast<int>(lane_id & 1);
+    byte_selector_ = odd_even_lane_id_ * kSelectBytesOddThread +
+                    (1 - odd_even_lane_id_) * kSelectBytesEvenThread;
+  }
+
+  CUTLASS_DEVICE
+  WarpFragment operator()(WarpFragment const &src) {
+    // Source and result are both viewed as arrays of per-instruction MmaFragment chunks.
+    WarpFragment result;
+
+    MmaFragment const* mma_frag_src_ptr = reinterpret_cast<MmaFragment const *>(&src);
+    MmaFragment* mma_frag_dst_ptr = reinterpret_cast<MmaFragment *>(&result);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int n = 0; n < kNumMmaInstructions; n++) {
+        // One 32-bit word is read and one is written per mma fragment.
+        uint32_t const* src_ptr = reinterpret_cast<uint32_t const*>(&mma_frag_src_ptr[n]);
+        uint32_t* dst_ptr = reinterpret_cast<uint32_t*>(&mma_frag_dst_ptr[n]);
+
+        // Pull the word from the lanes delta_up_ below and delta_down_ above (full-warp mask)
+        uint32_t tmp0 = __shfl_up_sync(0xFFFFFFFF, src_ptr[0], delta_up_);
+        uint32_t tmp1 = __shfl_down_sync(0xFFFFFFFF, src_ptr[0], delta_down_);
+
+        // Reorder the data within the 32-bit word (4x8b) required for mma.sync
+        dst_ptr[0] = __byte_perm(tmp0, tmp1, byte_selector_);
+    }
+
+    return result;
+  }
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// Data type conversion
+////////////////////////////////////////////////////////////////////////////////
+template <
+  /// Destination type
+  typename ElementDst_, 
+  /// Source type
+  typename ElementSrc_,
+  /// Number of elements
+  int N,
+  /// Reserved for partial specialization; not used by this primary template
+  typename Enable = void> 
+struct FragmentConverter {
+
+  using ElementDst = ElementDst_;
+  using ElementSrc = ElementSrc_;
+
+  // Operand fragment registers in destination and source types
+  using DestinationFragment = Array<ElementDst, N>;
+  using SourceFragment = Array<ElementSrc, N>;
+  // Array converter that performs the element type conversion
+  FastNumericArrayConverter<ElementDst, ElementSrc, N> convert;
+  /// Converts all N source elements to the destination element type
+  CUTLASS_DEVICE
+  DestinationFragment operator()(SourceFragment const &src) const {
+    return convert(src);
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+
+// Partial specialization for when Destination type is the *same* as
+// Source type: no conversion is needed, the fragment is passed through
+template <
+  /// Data type
+  typename Element,
+  /// Number of elements
+  int N,
+  /// Forwarded unchanged from the primary template
+  typename Enable>
+struct FragmentConverter<Element, Element, N, Enable> {
+
+  using DestinationFragment = Array<Element, N>;
+  using SourceFragment = Array<Element, N>;
+  /// Returns a copy of the source fragment
+  CUTLASS_DEVICE
+  DestinationFragment operator()(SourceFragment const &src) const {
+    return src;
+  }
+};
+
+} // namespace detail
+
+/// Warp-level matrix multiply-accumulate targeting Tensor Cores; A/B fragments are shuffled and converted to the arch::Mma operand types by transform().
+template <
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape_,
+  /// Data type of A elements
+  typename ElementA_,
+  /// Layout of A matrix (concept: MatrixLayout)
+  typename LayoutA_,
+  /// Data type of B elements
+  typename ElementB_,
+  /// Layout of B matrix (concept: MatrixLayout)
+  typename LayoutB_,
+  /// Element type of C matrix
+  typename ElementC_,
+  /// Layout of C matrix (concept: MatrixLayout)
+  typename LayoutC_,
+  /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
+  typename Policy_,
+  /// Number of partitions along K dimension
+  int PartitionsK_ = 1,
+  /// Store the accumulators in row major or column major.  Row major is used
+  /// when output layout is interleaved.
+  bool AccumulatorsInRowMajor = false,
+  /// Used for partial specialization
+  typename Enable = bool
+>
+class MmaMixedInputTensorOp {
+public:
+  /// Shape of warp-level matrix operation (concept: GemmShape)
+  using Shape = Shape_;
+
+  /// Data type of multiplicand A
+  using ElementA = ElementA_;
+
+  /// Layout of multiplicand A
+  using LayoutA = LayoutA_;
+
+  /// Data type of multiplicand B
+  using ElementB = ElementB_;
+
+  /// Layout of multiplicand B
+  using LayoutB = LayoutB_;
+
+  /// Data type of accumulator matrix C
+  using ElementC = ElementC_;
+
+  /// Layout of accumulator matrix C
+  using LayoutC = LayoutC_;
+
+  /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
+  using Policy = Policy_;
+
+  /// Underlying matrix multiply operator (concept: arch::Mma)
+  using ArchMmaOperator = typename Policy::Operator;
+
+  /// Underlying arch::Mma instruction datatype for A operand
+  using ElementAMma = typename ArchMmaOperator::ElementA;
+
+  /// Underlying arch::Mma instruction datatype for B operand
+  using ElementBMma = typename ArchMmaOperator::ElementB;
+
+  /// Underlying arch::Mma instruction datatype for C operand
+  using MmaElementC = typename ArchMmaOperator::ElementC;
+
+  /// Indicates math operator
+  using MathOperator = typename ArchMmaOperator::Operator;
+
+  /// Architecture tag from underlying instruction
+  using ArchTag = typename ArchMmaOperator::ArchTag;
+
+  /// Indicates class of matrix operator
+  using OperatorClass = arch::OpClassTensorOp;
+
+  /// Shape of underlying instruction
+  using InstructionShape = typename ArchMmaOperator::Shape;
+
+  /// Complex transform on A operand
+  static ComplexTransform const kTransformA = ComplexTransform::kNone;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB = ComplexTransform::kNone;
+
+  /// Number of threads participating in warp-level matrix product
+  static int const kThreadCount = 32;
+
+  /// Number of partitions along K dimension
+  static int const kPartitionsK = PartitionsK_;
+
+  /// (disabled) load shape along K scaled by the A-mma / B element width ratio
+  // static int const kLoadShapeK = InstructionShape::kK * 
+  //  (sizeof_bits<ElementAMma>::value / sizeof_bits<ElementB>::value);
+
+public:
+
+  /// Iterates over the A operand in Shared Memory
+  using IteratorA = MmaTensorOpMultiplicandTileIterator<
+     MatrixShape<Shape::kM, Shape::kK>, Operand::kA, ElementA, LayoutA,
+     MatrixShape<ArchMmaOperator::Shape::kM, ArchMmaOperator::Shape::kK>,
+     Policy::OpDelta::kRow, kThreadCount, kPartitionsK>;
+
+  /// Storage for A tile in registers (loaded from Shared Memory)
+  using FragmentA = typename IteratorA::Fragment;
+
+  /// Storage for transformed A tile in registers (for use in Mma instruction)
+  using TransformedFragmentA =
+      Array<ElementAMma, FragmentA::kElements>;
+
+  /// Underlying arch::Mma instruction operand fragment for matrix A
+  using MmaOperandA = typename ArchMmaOperator::FragmentA;
+
+  /// Iterates over the B operand in Shared Memory
+  using IteratorB = MmaTensorOpMultiplicandTileIterator<
+      MatrixShape<Shape::kK, Shape::kN>, Operand::kB, ElementB, LayoutB,
+      MatrixShape<ArchMmaOperator::Shape::kK, ArchMmaOperator::Shape::kN>,
+      Policy::OpDelta::kRow, kThreadCount, kPartitionsK>;
+
+  /// Storage for B tile in registers (loaded from Shared Memory)
+  using FragmentB = typename IteratorB::Fragment;
+
+  /// Storage for transformed B tile in registers (for use in Mma instruction)
+  using TransformedFragmentB =
+      Array<ElementBMma, FragmentB::kElements>;
+
+  /// Underlying arch::Mma instruction operand fragment for matrix B
+  using MmaOperandB = typename ArchMmaOperator::FragmentB;
+
+  /// Iterates over the C operand in memory
+  using IteratorC = MmaTensorOpAccumulatorTileIterator<
+     MatrixShape<Shape::kM, Shape::kN>, ElementC, LayoutC,
+     typename ArchMmaOperator::Shape, typename Policy::OpDelta>;
+
+  /// Storage for C tile
+  using FragmentC = typename IteratorC::Fragment;
+
+  /// Underlying arch::Mma instruction operand fragment for matrix C
+  using MmaOperandC = typename ArchMmaOperator::FragmentC;
+
+  /// Number of mma operations performed (warp shape over instruction shape, rounded up)
+  using MmaIterations = MatrixShape<
+    (Shape::kM + ArchMmaOperator::Shape::kM - 1) / ArchMmaOperator::Shape::kM,
+    (Shape::kN + ArchMmaOperator::Shape::kN - 1) / ArchMmaOperator::Shape::kN
+  >;
+
+
+public:
+
+  /// Underlying matrix multiply operator (concept: arch::Mma)
+  ArchMmaOperator mma;
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Ctor
+  CUTLASS_DEVICE
+  MmaMixedInputTensorOp() {}
+
+  /// Performs a warp-level matrix multiply-accumulate operation: D = A * B + C
+  CUTLASS_DEVICE
+  void operator()(
+    FragmentC &D, 
+    TransformedFragmentA const &A, 
+    TransformedFragmentB const &B, 
+    FragmentC const &C
+  ) const {
+    // Seed D with C; every mma below accumulates in place into D.
+    D = C;
+
+    MmaOperandA const *ptr_A = reinterpret_cast<MmaOperandA const *>(&A);
+    MmaOperandB const *ptr_B = reinterpret_cast<MmaOperandB const *>(&B);
+    MmaOperandC *ptr_D = reinterpret_cast<MmaOperandC *>(&D);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int m = 0; m < MmaIterations::kRow; ++m) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = 0; n < MmaIterations::kColumn; ++n) {
+        // Serpentine order: odd rows traverse the columns in reverse.
+        int n_serpentine = ((m % 2) ? (MmaIterations::kColumn - 1 - n) : n);
+
+        if (AccumulatorsInRowMajor) {  // matrix B is reordered
+          mma(
+            ptr_D[n_serpentine + m * MmaIterations::kColumn],
+            ptr_A[m],
+            ptr_B[n_serpentine],
+            ptr_D[n_serpentine + m * MmaIterations::kColumn]);
+        } else {
+          mma(ptr_D[m + n_serpentine * MmaIterations::kRow],
+              ptr_A[m],
+              ptr_B[n_serpentine],
+              ptr_D[m + n_serpentine * MmaIterations::kRow]);
+        }
+      }
+    }
+  }
+
+  /// Transform the operand warp fragment register to the required data types and layout
+  /// for the `cutlass::arch::Mma`
+  CUTLASS_DEVICE
+  void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B,
+                 FragmentA const &A, FragmentB const &B) const {
+
+    // Shuffle data within warp to obtain the mma.sync operand layout
+    detail::FragmentShuffler<ElementBMma, ElementB, MmaIterations::kColumn, 
+             FragmentB::kElements, MmaOperandB::kElements, Operand::kB> shuffler_B;
+    FragmentB tmp_B; 
+    tmp_B = shuffler_B(B);
+
+    // Convert the B operand to the Mma Instruction operand type
+    detail::FragmentConverter<ElementBMma, ElementB, FragmentB::kElements> convert_B;
+    dst_B = convert_B(tmp_B);
+
+    FragmentA tmp_A;
+    // The A operand is converted in two halves, viewed through these half-sized array pointers.
+    Array<ElementA, FragmentA::kElements / 2> *
+        ptr_tmp_A = reinterpret_cast<Array<ElementA,
+                                             FragmentA::kElements / 2> *>(&tmp_A);
+    Array<ElementAMma, FragmentA::kElements / 2> *
+        ptr_dst_A = reinterpret_cast<Array<ElementAMma,
+                                             FragmentA::kElements / 2> *>(&dst_A);
+
+    // Shuffle data within warp to obtain the mma.sync operand layout
+    detail::FragmentShuffler<ElementAMma, ElementA, MmaIterations::kRow,
+             FragmentA::kElements, MmaOperandA::kElements, Operand::kA> shuffler_A;
+
+    // Convert the A operand to the Mma Instruction operand type
+    detail::FragmentConverter<ElementAMma, ElementA, FragmentA::kElements / 2> convert_A;
+
+    tmp_A = shuffler_A(A);
+    ptr_dst_A[0] = convert_A(ptr_tmp_A[0]);
+
+    ptr_dst_A[1] = convert_A(ptr_tmp_A[1]);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_planar_complex.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_planar_complex.h
new file mode 100755
index 000000000..c5dcfb7c0
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_planar_complex.h
@@ -0,0 +1,182 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing warp-level matrix multiply-accumulate operations.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/complex.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/array_planar_complex.h"
+#include "cutlass/gemm/warp/tile_iterator_planar_complex.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace warp {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  /// Underlying real-valued warp-level matrix multiply
+  typename Operator_,
+  /// Transformation applied to A operand (typically folded into math instruction)
+  ComplexTransform TransformA = ComplexTransform::kNone,
+  /// Transformation applied to B operand (typically folded into math instruction)
+  ComplexTransform TransformB = ComplexTransform::kNone
+>
+class MmaPlanarComplex {
+public:
+
+  /// Underlying real-valued warp-level matrix multiply
+  using Operator = Operator_;
+
+  /// Shape of warp-level matrix multiply
+  using Shape = typename Operator::Shape;
+
+  /// Transformation applied to A operand (typically folded into math instruction)
+  static ComplexTransform const kTransformA = TransformA;
+
+  /// Transformation applied to B operand (typically folded into math instruction)
+  static ComplexTransform const kTransformB = TransformB;
+
+  /// Planar-complex fragment of A elements (separate real and imaginary arrays)
+  using FragmentA = ArrayPlanarComplex<typename Operator::ElementA, Operator::FragmentA::kElements>;
+
+  /// Iterator into planar complex
+  using IteratorA = TileIteratorPlanarComplex<typename Operator::IteratorA>;
+
+  /// Layout in memory of the A operand
+  using LayoutA = typename Operator::LayoutA;
+
+  using FragmentB = ArrayPlanarComplex<typename Operator::ElementB, Operator::FragmentB::kElements>;
+
+  /// Iterator into planar complex
+  using IteratorB = TileIteratorPlanarComplex<typename Operator::IteratorB>;
+
+  /// Layout in memory of the B operand
+  using LayoutB = typename Operator::LayoutB;
+
+  /// Tile iterator for accumulator
+  using IteratorC = TileIteratorPlanarComplex<typename Operator::IteratorC>;
+
+  /// Accumulator fragment
+  using FragmentC = ArrayPlanarComplex<typename Operator::ElementC, Operator::FragmentC::kElements>;
+
+  /// Layout of accumulator fragment in memory
+  using LayoutC = typename Operator::LayoutC;
+
+private:
+
+  /// Number of mma operations performed
+  using MmaIterations = MatrixShape<
+    Operator::Shape::kM / Operator::Policy::Operator::Shape::kM,
+    Operator::Shape::kN / Operator::Policy::Operator::Shape::kN
+  >;
+
+public:
+  /// Ctor
+  CUTLASS_DEVICE
+  MmaPlanarComplex() {}
+
+  /// Performs a warp-level complex multiply-accumulate, D = op(A) * op(B) + C, as four real-valued mma calls
+  CUTLASS_DEVICE
+  void operator()(
+    FragmentC &D, 
+    FragmentA const &A_in, 
+    FragmentB const &B_in, 
+    FragmentC const &C) const {
+
+    D.real = C.real;
+    D.imag = C.imag;
+
+    // Build local operand copies, negating the imaginary plane when a conjugate
+    // transform is requested. The imaginary planes must be read from the inputs
+    // (A_in / B_in): the local fragments are not initialized before this point.
+
+    negate<typename FragmentA::ArrayReal> neg_A;
+
+    FragmentA frag_A;
+    frag_A.real = A_in.real;
+
+    if (kTransformA == ComplexTransform::kConjugate) {
+      frag_A.imag = neg_A(A_in.imag);
+    }
+    else {
+      frag_A.imag = A_in.imag;
+    }
+
+    FragmentB frag_B;
+    frag_B.real = B_in.real;
+
+    if (kTransformB == ComplexTransform::kConjugate) {
+      negate<typename FragmentB::ArrayReal> neg;
+      frag_B.imag = neg(B_in.imag);
+    }
+    else {
+      frag_B.imag = B_in.imag;
+    }
+
+    //
+    // Accumulated real-valued matrix multiplies
+    //
+
+    Operator real_mma;
+
+    // D.i += A.i * B.r
+    real_mma(D.imag, frag_A.imag, frag_B.real, D.imag);
+
+    // D.r += A.r * B.r
+    real_mma(D.real, frag_A.real, frag_B.real, D.real);
+
+    // D.i += A.r * B.i
+    real_mma(D.imag, frag_A.real, frag_B.imag, D.imag);
+
+    // D.r += -A.i * B.i (A.i is negated last, after every other use of it)
+    frag_A.imag = neg_A(frag_A.imag);
+    real_mma(D.real, frag_A.imag, frag_B.imag, D.real);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_simt.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_simt.h
new file mode 100755
index 000000000..f5f2f063f
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_simt.h
@@ -0,0 +1,263 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing warp-level matrix multiply-accumulate operations.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/warp/mma.h"
+
+#include "cutlass/gemm/thread/mma.h"
+
+#include "cutlass/gemm/warp/mma_simt_tile_iterator.h"
+#include "cutlass/gemm/warp/mma_simt_policy.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace warp {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions.
+template <
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape_,
+  /// Data type of A elements
+  typename ElementA_,
+  /// Layout of A matrix (concept: MatrixLayout)
+  typename LayoutA_,
+  /// Data type of B elements
+  typename ElementB_,
+  /// Layout of B matrix (concept: MatrixLayout)
+  typename LayoutB_,
+  /// Element type of C matrix
+  typename ElementC_,
+  /// Layout of C matrix (concept: MatrixLayout)
+  typename LayoutC_,
+  /// Shape of the warp in units of thread (concept: MmaSimtPolicy)
+  typename Policy_,
+  /// Number of partitions along K dimension
+  int PartitionsK = 1,
+  /// Complex transformation on operand A
+  ComplexTransform TransformA = ComplexTransform::kNone,
+  /// Complex transformation on operand B
+  ComplexTransform TransformB = ComplexTransform::kNone,
+  /// Used for partial specialization
+  typename Enable = bool
+>
+class MmaSimt {
+public:
+  /// Shape of warp-level matrix operation (concept: GemmShape)
+  using Shape = Shape_;
+
+  /// Data type of multiplicand A
+  using ElementA = ElementA_;
+
+  /// Layout of multiplicand A
+  using LayoutA = LayoutA_;
+
+  /// Data type of multiplicand B
+  using ElementB = ElementB_;
+
+  /// Layout of multiplicand B
+  using LayoutB = LayoutB_;
+
+  /// Data type of accumulator matrix C
+  using ElementC = ElementC_;
+
+  /// Layout of accumulator matrix C
+  using LayoutC = LayoutC_;
+
+  /// Shape of the warp in units of thread (concept: MmaLanePolicySimt)
+  using Policy = Policy_;
+
+  /// Indicates class of matrix operator
+  using OperatorClass = arch::OpClassSimt;
+
+  /// Hard-coded for now
+  using ArchTag = arch::Sm50;
+
+  /// Complex transform on A operand
+  static ComplexTransform const kTransformA = TransformA;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB = TransformB;
+
+  /// Layout of threads
+  using ThreadLayoutA = typename platform::conditional< platform::is_same< layout::ColumnMajorInterleaved<4>, LayoutA >::value,
+                  layout::ColumnMajor,
+                  typename platform::conditional < platform::is_same< layout::RowMajorInterleaved<4>, LayoutA >::value,
+                      layout::RowMajor,
+                      LayoutA>::type
+                 >::type;
+  
+  using ThreadLayoutB = typename platform::conditional< platform::is_same< layout::ColumnMajorInterleaved<4>, LayoutB >::value,
+                  layout::ColumnMajor,
+                  typename platform::conditional < platform::is_same< layout::RowMajorInterleaved<4>, LayoutB >::value,
+                      layout::RowMajor,
+                      LayoutB>::type
+                 >::type;
+
+  static constexpr bool use_dp4a = (platform::is_same< layout::ColumnMajorInterleaved<4>, LayoutA>::value || 
+                                    platform::is_same< layout::RowMajorInterleaved<4>, LayoutA >::value) && 
+                                    platform::is_same< ElementA, int8_t >::value && 
+                                    platform::is_same< ElementB, int8_t >::value;
+
+  using dp4a_type = typename platform::conditional< use_dp4a , int8_t, bool >::type;
+
+  /// Thread-level matrix multiply accumulate operator
+  using ThreadMma = thread::Mma<
+    GemmShape<
+      Shape::kM / Policy::WarpShape::kRow,
+      Shape::kN / Policy::WarpShape::kColumn,
+      Policy::LaneMmaShape::kK>,
+    ElementA,
+    ThreadLayoutA,
+    ElementB,
+    ThreadLayoutB,
+    ElementC,
+    LayoutC,
+    arch::OpMultiplyAdd,
+    dp4a_type
+  >;
+
+  /// Underlying matrix multiply operator (concept: arch::Mma)
+  using ArchMmaOperator = typename ThreadMma::ArchMmaOperator;
+
+  /// Indicates math operator 
+  using MathOperator = typename ArchMmaOperator::Operator;
+  
+  /// Shape of the underlying instruction
+  using InstructionShape = GemmShape<1,1,use_dp4a ? 4 : 1>;
+
+public:
+
+  /// Iterates over the A operand in memory
+  using IteratorA = MmaSimtTileIterator<
+    MatrixShape<Shape::kM, Policy::LaneMmaShape::kK>,
+    Operand::kA,
+    ElementA,
+    LayoutA,
+    Policy,
+    PartitionsK,
+    Shape::kK
+  >;
+
+  /// Storage for A tile
+  using FragmentA = typename IteratorA::Fragment;
+
+  /// Storage for transformed A tile
+  using TransformedFragmentA = FragmentA;
+
+  /// Iterates over the B operand in memory
+  using IteratorB = MmaSimtTileIterator<
+    MatrixShape<Policy::LaneMmaShape::kK, Shape::kN>,
+    Operand::kB,
+    ElementB,
+    LayoutB,
+    Policy,
+    PartitionsK,
+    Shape::kK
+  >;
+
+  /// Storage for B tile
+  using FragmentB = typename IteratorB::Fragment;
+
+  /// Storage for transformed B tile
+  using TransformedFragmentB = FragmentB;
+
+  /// Iterates over the C operand in memory
+  using IteratorC = MmaSimtTileIterator<
+    MatrixShape<Shape::kM, Shape::kN>,
+    Operand::kC,
+    ElementC,
+    LayoutC,
+    Policy
+  >;
+
+  /// Storage for C tile
+  using FragmentC = typename ThreadMma::FragmentC;
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Ctor
+  CUTLASS_DEVICE
+  MmaSimt() {}
+
+  /// Warp-level multiply-accumulate: conjugates a and/or b when requested, then applies ThreadMma.
+  CUTLASS_DEVICE
+  void operator()(
+    FragmentC &d,
+    FragmentA a,
+    FragmentB b,
+    FragmentC const &c,
+    int group_idx = 0) const {   // group_idx is accepted but not referenced in this body
+
+    // The operands are taken by value, so they can be overwritten with their conjugates.
+    if (kTransformA == ComplexTransform::kConjugate) {
+      a = conjugate<FragmentA>()(a);
+    }
+
+    if (kTransformB == ComplexTransform::kConjugate) {
+      b = conjugate<FragmentB>()(b);
+    }
+
+    // Hand the (possibly conjugated) fragments to the thread-level operator.
+    ThreadMma thread_mma;
+    thread_mma(d, a, b, c);
+  }
+
+  /// Transform the mma operands to the required types; TransformedFragmentA/B alias FragmentA/B here, so this is a plain copy
+  CUTLASS_DEVICE
+  void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B,
+                 FragmentA const &A, FragmentB const &B) const {
+    dst_A = A;
+    dst_B = B;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace gemm
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_simt_policy.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_simt_policy.h
new file mode 100755
index 000000000..8da3b9f86
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_simt_policy.h
@@ -0,0 +1,69 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Describes the lane policy used by warp-level matrix multiply operators targeting SIMT
+      instructions
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+namespace cutlass {
+namespace gemm {
+namespace warp {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Describes the arrangement and configuration of per-lane operations in warp-level matrix multiply.
+template <
+  typename WarpShape_,              ///< shape of the warp in lanes (concept: MatrixShape)
+  typename LaneLayout_,             ///< layout function of lanes
+  typename LaneMmaShape_            ///< size of each lane's thread-level matrix product (concept: GemmShape)
+>
+struct MmaSimtPolicy {
+  using WarpShape = WarpShape_;        ///< re-exported template argument
+  using LaneLayout = LaneLayout_;      ///< re-exported template argument
+  using LaneMmaShape = LaneMmaShape_;  ///< re-exported template argument
+  using MmaShape = LaneMmaShape;       ///< second name for LaneMmaShape
+
+  /// Returns a layout functor mapping lane position in the warp to thread ID (packed over kRow x kColumn)
+  CUTLASS_HOST_DEVICE
+  static LaneLayout get_lane_layout() {
+    return LaneLayout::packed({WarpShape::kRow, WarpShape::kColumn});
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace gemm
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_simt_tile_iterator.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_simt_tile_iterator.h
new file mode 100755
index 000000000..6b0647ffd
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_simt_tile_iterator.h
@@ -0,0 +1,1890 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines tile iterators over the operands of warp-level matrix multiply operators
+      targeting SIMT instructions
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/arch/memory_sm75.h"
+
+#include "cutlass/layout/matrix.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/warp/mma_simt_policy.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace warp {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Iterates over operands to warp-level matrix multiply operations targeting SIMT instructions
+/// (primary template: declaration only; the specializations below select on operand and layout)
+///
+/// concept: MutableRandomAccessContiguousTileIteratorConcept
+template <
+  /// Size of the matrix to load (concept: MatrixShape)
+  typename Shape_,
+  /// Operand identity
+  Operand Operand,
+  /// Data type of the operand's elements
+  typename Element_,
+  /// Layout of operand
+  typename Layout_,
+  /// Shape of the warp in units of thread (concept: MmaSimtPolicy)
+  typename Policy_,
+  /// Number of partitions along K dimension - used in sliced-K
+  int PartitionsK = 1,
+  /// Group Size along kPartition - used in sliced-K
+  int PartitionGroupSize = 1
+>
+class MmaSimtTileIterator;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for A operands of column-major layouts
+///
+/// Concept: MutableRandomAccessContiguousTileIteratorConcept
+///
+template <
+  /// Size of the matrix to load (concept: MatrixShape)
+  typename Shape_,
+  /// Data type of A elements
+  typename Element_,
+  /// Shape of the warp in units of thread (concept: MmaSimtPolicy)
+  typename Policy_,
+  /// Number of partitions along K dimension - used in sliced-K
+  int PartitionsK,
+  /// Group Size along kPartition - used in sliced-K
+  int PartitionGroupSize
+>
+class MmaSimtTileIterator<Shape_, Operand::kA, Element_, layout::ColumnMajor, Policy_, PartitionsK, PartitionGroupSize> {
+public:
+
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand::kA;
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of policy
+  using Layout = layout::ColumnMajor;
+
+  /// Decomposition of elements among threads
+  using Policy = Policy_;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  //
+  // Derived quantities
+  //
+
+  static_assert(!(Shape::kRow % Policy::WarpShape::kRow), 
+    "The warp-level GEMM M size must be divisible by the number of threads arranged along the M dimension.");
+
+  static_assert(Shape::kRow > 0, "Shape::kRow must be greater than zero.");
+  static_assert(Shape::kColumn > 0, "Shape::kColumn must be greater than zero.");
+  static_assert(Policy::WarpShape::kRow > 0, "Policy::WarpShape::kRow must be greater than zero.");
+  static_assert(Shape::kRow / Policy::WarpShape::kRow > 0, "Shape::kRow / Policy::WarpShape::kRow must be greater than zero.");
+
+  /// Thread-level shape of a fragment
+  using ThreadShape = MatrixShape<
+    Shape::kRow / Policy::WarpShape::kRow,
+    Shape::kColumn
+  >;
+
+  static_assert(!(ThreadShape::kRow % Policy::LaneMmaShape::kM), 
+    "Thread-level GEMM must be divisible by Policy::LaneMmaShape.");
+
+  /// Number of individual loads
+  using Iterations = MatrixShape<
+    ThreadShape::kRow / Policy::LaneMmaShape::kM,
+    ThreadShape::kColumn
+  >;
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = Array<Element, ThreadShape::kCount>;
+
+private:
+
+  /// Internal reference
+  cutlass::TensorRef<Array<Element, Policy::LaneMmaShape::kM>, layout::ColumnMajor> ref_;
+
+public:
+  
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator() { }
+
+  /// Constructor from TensorRef; positions the iterator at the calling lane's first row
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator(
+    TensorRef ref, 
+    int lane_id
+  ) {
+
+    // compute offset based on thread ID and lane layout
+    typename Policy::LaneLayout lane_layout = Policy::get_lane_layout();
+    // the lane's row coordinate is scaled by kM; the column factor of the product is 0
+    MatrixCoord lane_offset = lane_layout.inverse(lane_id) * 
+      MatrixCoord(Policy::LaneMmaShape::kM, 0);
+
+    ref.add_coord_offset(lane_offset);
+    // ref_ views memory as kM-element vectors: recast the pointer and rescale the stride (assumes stride % kM == 0 - TODO confirm)
+    ref_.reset(
+      reinterpret_cast<Array<Element, Policy::LaneMmaShape::kM> *>(ref.data()),
+      ref.stride(0) / Policy::LaneMmaShape::kM);
+  }
+  
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator &add_pointer_offset(LongIndex offset) {
+    ref_.add_pointer_offset(offset);
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator &add_tile_offset(TensorCoord const &coord) {
+    // the row offset is divided by kM because ref_ is addressed in kM-element vectors
+    ref_.add_coord_offset({
+      coord.row() * Shape::kRow / Policy::LaneMmaShape::kM, 
+      coord.column() * Shape::kColumn});
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator & operator++() {
+
+    ref_.add_coord_offset({0, Shape::kColumn});
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator & operator--() {
+
+    ref_.add_coord_offset({0, -Shape::kColumn});
+
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator. (vector loads)
+  CUTLASS_HOST_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
+    Array<Element, Policy::LaneMmaShape::kM> *dst_ptr = 
+      reinterpret_cast<Array<Element, Policy::LaneMmaShape::kM> *>(&frag);
+    // frag is filled one kM-element vector at a time; vector (m, k) lands at index m + k * Iterations::kRow
+    CUTLASS_PRAGMA_UNROLL
+    for (int k = 0; k < Iterations::kColumn; ++k) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int m = 0; m < Iterations::kRow; ++m) {
+
+        // This logic has been replaced with calls to inline PTX to guarantee vectorization.
+        #if 0
+        dst_ptr[m + k * Iterations::kRow] = 
+          *(ref_.data() + ref_.offset({m * Policy::WarpShape::kRow, k}) + pointer_offset / Policy::LaneMmaShape::kM);
+        #endif
+        // pointer_offset is divided by kM to convert it to the vector units used by ref_
+        auto ptr = ref_.data() + ref_.offset({m * Policy::WarpShape::kRow, k}) + pointer_offset / Policy::LaneMmaShape::kM;
+        arch::shared_load(dst_ptr[m + k * Iterations::kRow], ptr);
+      }
+    }
+  }
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    load_with_pointer_offset(frag, 0);
+  }
+    
+  /// Stores a fragment to memory at the location pointed to by the iterator (mirror of the load path)
+  CUTLASS_HOST_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const {
+    // View the fragment as kM-element vectors; the cast keeps const because frag is read-only.
+    Array<Element, Policy::LaneMmaShape::kM> const *src_ptr =
+      reinterpret_cast<Array<Element, Policy::LaneMmaShape::kM> const *>(&frag);
+    // Iterations and WarpShape expose kRow / kColumn; traversal and indexing match load_with_pointer_offset.
+    CUTLASS_PRAGMA_UNROLL
+    for (int k = 0; k < Iterations::kColumn; ++k) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int m = 0; m < Iterations::kRow; ++m) {
+        *(ref_.data() + ref_.offset({m * Policy::WarpShape::kRow, k}) + pointer_offset / Policy::LaneMmaShape::kM) =
+          src_ptr[m + k * Iterations::kRow];
+      }
+    }
+  }
+
+  /// Stores a fragment to memory at the location pointed to by the iterator
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag) const {
+    store_with_pointer_offset(frag, 0);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    // no operation here
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for A operands of row-major layouts
+///
+/// Concept: MutableRandomAccessContiguousTileIteratorConcept
+///
+template <
+  /// Size of the matrix to load (concept: MatrixShape)
+  typename Shape_,
+  /// Data type of A elements
+  typename Element_,
+  /// Shape of the warp in units of thread (concept: MmaSimtPolicy)
+  typename Policy_,
+  /// Number of partitions along K dimension - used in sliced-K
+  int PartitionsK,
+  /// Group Size along kPartition - used in sliced-K
+  int PartitionGroupSize
+>
+class MmaSimtTileIterator<Shape_, Operand::kA, Element_, layout::RowMajor, Policy_, PartitionsK, PartitionGroupSize> {
+public:
+
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand::kA;
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of policy
+  using Layout = layout::RowMajor;
+
+  /// Decomposition of elements among threads
+  using Policy = Policy_;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  //
+  // Derived quantities
+  //
+
+  static_assert(!(Shape::kRow % Policy::WarpShape::kRow), 
+    "The warp-level GEMM M size must be divisible by the number of threads arranged along the M dimension.");
+
+  static_assert(Shape::kRow > 0, "Shape::kRow must be greater than zero.");
+  static_assert(Shape::kColumn > 0, "Shape::kColumn must be greater than zero.");
+  static_assert(Policy::WarpShape::kRow > 0, "Policy::WarpShape::kRow must be greater than zero.");
+  static_assert(Shape::kRow / Policy::WarpShape::kRow > 0, "Shape::kRow / Policy::WarpShape::kRow must be greater than zero.");
+
+  /// Thread-level shape of a fragment
+  using ThreadShape = MatrixShape<
+    Shape::kRow / Policy::WarpShape::kRow,
+    Shape::kColumn
+  >;
+
+  static_assert(!(ThreadShape::kRow % Policy::LaneMmaShape::kM), 
+    "Thread-level GEMM must be divisible by Policy::LaneMmaShape.");
+
+  /// Number of individual loads (scalar loads)
+  using Iterations = MatrixShape<
+    ThreadShape::kRow / Policy::LaneMmaShape::kM,
+    ThreadShape::kColumn
+  >;
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = Array<Element, ThreadShape::kCount>;
+
+private:
+
+  /// Internal reference
+  cutlass::TensorRef<Element, layout::RowMajor> ref_;
+
+  /// Extent of tensor
+  MatrixCoord extent_;
+
+  /// Origin
+  MatrixCoord origin_;
+
+  /// Used to conditionally enable extents checking
+  bool divisible_;
+
+public:
+  
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator() : divisible_(true) { }
+
+  /// Constructor from TensorRef; extent_ is the full tile and divisible_ is true, so loads skip the bounds check
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator(
+    TensorRef ref, 
+    int lane_id
+  ) : extent_(Shape::kRow, Shape::kColumn), divisible_ (true) {
+
+    // compute offset based on thread ID and lane layout
+    typename Policy::LaneLayout lane_layout = Policy::get_lane_layout();
+    // the lane's row coordinate is scaled by kM; the column factor of the product is 0
+    MatrixCoord lane_offset = lane_layout.inverse(lane_id) * 
+      MatrixCoord(Policy::LaneMmaShape::kM, 0);
+    // origin_ records the logical coordinate that load_with_pointer_offset compares against extent_
+    origin_ = lane_offset;
+
+    ref.add_coord_offset(lane_offset);
+
+    ref_.reset(ref.data(), ref.stride(0));
+
+  }
+  
+  /// Constructor from TensorRef with an explicit extent; divisible_ is false, so loads are bounds-checked
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator(
+    TensorRef ref,
+    TensorCoord extent, 
+    int lane_id
+  ) : extent_(extent), divisible_ (false) {
+
+    // compute offset based on thread ID and lane layout
+    typename Policy::LaneLayout lane_layout = Policy::get_lane_layout();
+    // the lane's row coordinate is scaled by kM; the column factor of the product is 0
+    MatrixCoord lane_offset = lane_layout.inverse(lane_id) * 
+      MatrixCoord(Policy::LaneMmaShape::kM, 0);
+    // origin_ records the logical coordinate that load_with_pointer_offset compares against extent_
+    origin_ = lane_offset;
+    
+    ref.add_coord_offset(lane_offset);
+
+    ref_.reset(ref.data(), ref.stride(0));
+
+  }
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator &add_pointer_offset(LongIndex offset) {
+    ref_.add_pointer_offset(offset);
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator &add_tile_offset(TensorCoord const &coord) {
+
+    TensorCoord coord_offset(
+      coord.row() * Shape::kRow, 
+      coord.column() * Shape::kColumn);
+    
+    origin_ += coord_offset;
+
+    ref_.add_coord_offset(coord_offset);
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension (by Shape::kColumn columns)
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator & operator++() {
+    // NOTE(review): origin_ is not advanced here, although add_tile_offset does update it - confirm this is intended
+    ref_.add_coord_offset({0, Shape::kColumn});
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator & operator--() {
+
+    ref_.add_coord_offset({0, -Shape::kColumn});
+
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator. (scalar loads)
+  CUTLASS_HOST_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
+    // elements outside extent_ are filled with Element() unless divisible_ disables the check
+    CUTLASS_PRAGMA_UNROLL
+    for (int k = 0; k < Iterations::kColumn; ++k) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int m = 0; m < Iterations::kRow; ++m) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < Policy::LaneMmaShape::kM; i++) {
+          // position of element i of row group m, column k, relative to this lane's start
+          MatrixCoord offset(m * Policy::WarpShape::kRow * Policy::LaneMmaShape::kM + i, k);
+          // logical coordinate, used only for the bounds check below
+          MatrixCoord access_coord = origin_ + offset;
+          // NOTE(review): the k term strides by Iterations::kRow; indices would overlap if kM > 1 and kColumn > 1 - confirm
+          int frag_idx = m * Policy::LaneMmaShape::kM + i + k * Iterations::kRow;
+
+          if (divisible_ || 
+              (access_coord.row() < extent_.row() && access_coord.column() < extent_.column())) {
+            // in bounds, or checking disabled: plain scalar read
+            frag[frag_idx] = *(ref_.data() + ref_.offset(offset) + pointer_offset);
+          }
+          else {
+            frag[frag_idx] = Element();
+          }
+        }
+      }
+    }
+  }
+  /// Loads a fragment from memory at the location pointed to by the iterator. 
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    load_with_pointer_offset(frag, 0);
+  }
+    
+  /// Stores a fragment to memory at the location pointed to by the iterator (scalar stores)
+  CUTLASS_HOST_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const {
+    // Same traversal and fragment indexing as load_with_pointer_offset; no bounds check is applied.
+    CUTLASS_PRAGMA_UNROLL
+    for (int k = 0; k < Iterations::kColumn; ++k) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int m = 0; m < Iterations::kRow; ++m) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < Policy::LaneMmaShape::kM; i++) {
+          // WarpShape and Iterations expose kRow / kColumn; offset() takes a single coordinate.
+          *(ref_.data() + ref_.offset(MatrixCoord(m * Policy::WarpShape::kRow * Policy::LaneMmaShape::kM + i, k)) + pointer_offset) =
+            frag[m * Policy::LaneMmaShape::kM + i + k * Iterations::kRow];
+        }
+      }
+    }
+  }
+
+  /// Stores a fragment to memory at the location pointed to by the iterator
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag) const {
+    store_with_pointer_offset(frag, 0);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    // no operation here
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for B operands of row-major layouts
+///
+/// Concept: MutableRandomAccessContiguousTileIteratorConcept
+///
+template <
+  /// Size of the matrix to load (concept: MatrixShape)
+  typename Shape_,
+  /// Data type of B elements
+  typename Element_,
+  /// Shape of the warp in units of thread (concept: MmaSimtPolicy)
+  typename Policy_,
+  /// Number of partitions along K dimension
+  int PartitionsK,
+  /// Group Size along kPartition - used in sliced-K
+  int PartitionGroupSize
+>
+class MmaSimtTileIterator<Shape_, Operand::kB, Element_, layout::RowMajor, Policy_, PartitionsK, PartitionGroupSize> {
+public:
+
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand::kB;
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of policy
+  using Layout = layout::RowMajor;
+
+  /// Decomposition of elements among threads
+  using Policy = Policy_;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  //
+  // Derived quantities
+  //
+
+  static_assert(!(Shape::kColumn % Policy::WarpShape::kColumn), 
+    "The warp-level GEMM N size must be divisible by the number of threads arranged along the N dimension.");
+  
+  static_assert(Shape::kRow > 0, "Shape::kRow must be greater than zero.");
+  static_assert(Shape::kColumn > 0, "Shape::kColumn must be greater than zero.");
+  static_assert(Policy::WarpShape::kColumn > 0, "Policy::WarpShape::kColumn must be greater than zero.");
+  static_assert(Shape::kColumn / Policy::WarpShape::kColumn > 0, "Shape::kColumn / Policy::WarpShape::kColumn must be greater than zero.");
+
+  /// Thread-level shape of a fragment
+  using ThreadShape = MatrixShape<
+    Shape::kRow,
+    Shape::kColumn / Policy::WarpShape::kColumn
+  >;
+
+  static_assert(!(ThreadShape::kColumn % Policy::LaneMmaShape::kN), 
+    "Thread-level GEMM must be divisible by Policy::LaneMmaShape.");
+
+  /// Number of individual loads
+  using Iterations = MatrixShape<
+    ThreadShape::kRow,
+    ThreadShape::kColumn / Policy::LaneMmaShape::kN
+  >;
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = Array<Element, ThreadShape::kCount>;
+
+protected:
+
+  /// Internal reference
+  cutlass::TensorRef<Array<Element, Policy::LaneMmaShape::kN>, layout::RowMajor> ref_;
+
+public:
+  
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator() { }
+
+  /// Constructor from TensorRef
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator(
+    TensorRef ref, 
+    int lane_id
+  ) {
+
+    // compute offset based on thread ID and lane layout
+    typename Policy::LaneLayout lane_layout = Policy::get_lane_layout();
+
+    MatrixCoord lane_offset = lane_layout.inverse(lane_id) * 
+      MatrixCoord(0, Policy::LaneMmaShape::kN);
+
+    ref.add_coord_offset(lane_offset);
+
+    ref_.reset(
+      reinterpret_cast<Array<Element, Policy::LaneMmaShape::kN> *>(ref.data()),
+      ref.stride(0) / Policy::LaneMmaShape::kN);
+  }
+  
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator &add_pointer_offset(LongIndex offset) {
+    ref_.add_pointer_offset(offset);
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator &add_tile_offset(TensorCoord const &coord) {
+
+    ref_.add_coord_offset({
+      coord.row() * Shape::kRow, 
+      coord.column() * Shape::kColumn / Policy::LaneMmaShape::kN});
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator & operator++() {
+
+    ref_.add_coord_offset({Shape::kRow, 0});
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator & operator--() {
+
+    ref_.add_coord_offset({-Shape::kRow, 0});
+
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator. (vector loads)
+  CUTLASS_HOST_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
+
+    Array<Element, Policy::LaneMmaShape::kN> *dst_ptr = 
+      reinterpret_cast<Array<Element, Policy::LaneMmaShape::kN> *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int k = 0; k < Iterations::kRow; ++k) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = 0; n < Iterations::kColumn; ++n) {
+
+        #if 0
+        dst_ptr[n + k * Iterations::kColumn] = 
+          *(ref_.data() + ref_.offset({k, n * Policy::WarpShape::kColumn}) + pointer_offset / Policy::LaneMmaShape::kN);
+        #endif
+
+        void const *ptr = ref_.data() + ref_.offset({k, n * Policy::WarpShape::kColumn}) + pointer_offset / Policy::LaneMmaShape::kN;
+        arch::shared_load(dst_ptr[n + k * Iterations::kColumn], ptr);
+      }
+    }
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    load_with_pointer_offset(frag, 0);
+  }
+  
+  /// Stores a fragment to memory at the location pointed to by the iterator
+  CUTLASS_HOST_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const {
+
+    Array<Element, Policy::LaneMmaShape::kN> const *src_ptr = 
+      reinterpret_cast<Array<Element, Policy::LaneMmaShape::kN> *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int k = 0; k < Iterations::kM; ++k) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = 0; n < Iterations::kN; ++n) {
+        *(ref_.data() + ref_.offset({k, n * Policy::WarpShape::kN}) + pointer_offset / Policy::LaneMmaShape::kN) = 
+          src_ptr[n + k * Iterations::kN];
+      }
+    }
+  }
+
+  /// Stores a fragment to memory at the location pointed to by the iterator
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag, Index pointer_offset) const {
+    store_with_pointer_offset(frag, 0);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    // no operation here
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for B operands of column-major layouts
+///
+/// Accesses are performed one element at a time. When constructed with an
+/// explicit extent, each access is predicated against that extent.
+/// PartitionsK and PartitionGroupSize are accepted for interface parity but are
+/// not used by this specialization.
+///
+/// Concept: MutableRandomAccessContiguousTileIteratorConcept
+///
+template <
+  /// Size of the matrix to load (concept: MatrixShape)
+  typename Shape_,
+  /// Data type of B elements
+  typename Element_,
+  /// Shape of the warp in units of thread (concept: MmaSimtPolicy)
+  typename Policy_,
+  /// Number of partitions along K dimension
+  int PartitionsK,
+  /// Group Size along kPartition - used in sliced-K
+  int PartitionGroupSize
+>
+class MmaSimtTileIterator<Shape_, Operand::kB, Element_, layout::ColumnMajor, Policy_, PartitionsK, PartitionGroupSize> {
+public:
+
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand::kB;
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of policy
+  using Layout = layout::ColumnMajor;
+
+  /// Decomposition of elements among threads
+  using Policy = Policy_;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  //
+  // Derived quantities
+  //
+
+  static_assert(!(Shape::kColumn % Policy::WarpShape::kColumn), 
+    "The warp-level GEMM N size must be divisible by the number of threads arranged along the N dimension.");
+  
+  static_assert(Shape::kRow > 0, "Shape::kRow must be greater than zero.");
+  static_assert(Shape::kColumn > 0, "Shape::kColumn must be greater than zero.");
+  static_assert(Policy::WarpShape::kColumn > 0, "Policy::WarpShape::kColumn must be greater than zero.");
+  static_assert(Shape::kColumn / Policy::WarpShape::kColumn > 0, "Shape::kColumn / Policy::WarpShape::kColumn must be greater than zero.");
+
+  /// Thread-level shape of a fragment: all rows of the tile, and the columns
+  /// divided among the threads arranged along the column dimension.
+  using ThreadShape = MatrixShape<
+    Shape::kRow,
+    Shape::kColumn / Policy::WarpShape::kColumn
+  >;
+
+  static_assert(!(ThreadShape::kColumn % Policy::LaneMmaShape::kN), 
+    "Thread-level GEMM must be divisible by Policy::LaneMmaShape.");
+
+  /// Number of individual access groups per thread; each group covers
+  /// LaneMmaShape::kN elements that are accessed one by one.
+  using Iterations = MatrixShape<
+    ThreadShape::kRow,
+    ThreadShape::kColumn / Policy::LaneMmaShape::kN
+  >;
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = Array<Element, ThreadShape::kCount>;
+
+private:
+
+  /// Internal reference (element granularity)
+  cutlass::TensorRef<Element, layout::ColumnMajor> ref_;
+
+  /// Extent of tensor
+  MatrixCoord extent_;
+
+  /// Origin: logical coordinate of this lane's first element, used for bounds checks
+  MatrixCoord origin_;
+
+  /// Used to conditionally enable extents checking (true disables the checks)
+  bool divisible_;
+
+public:
+  
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator(): divisible_(true) { }
+
+  /// Constructor from TensorRef
+  ///
+  /// The extent is taken to be the tile shape and bounds checking is disabled.
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator(
+    TensorRef ref, 
+    int lane_id
+  ): extent_(Shape::kRow, Shape::kColumn), divisible_(true) {
+
+    // compute offset based on thread ID and lane layout
+    typename Policy::LaneLayout lane_layout = Policy::get_lane_layout();
+
+    // Only the column component contributes: the row multiplier is zero.
+    MatrixCoord lane_offset = lane_layout.inverse(lane_id) * 
+      MatrixCoord(0, Policy::LaneMmaShape::kN);
+
+    origin_ = lane_offset;
+
+    ref.add_coord_offset(lane_offset);
+
+    ref_.reset(ref.data(), ref.stride(0));
+  }
+
+  /// Constructor from TensorRef and an explicit extent
+  ///
+  /// Bounds checking against `extent` is enabled for every access.
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator(
+    TensorRef ref,
+    TensorCoord extent, 
+    int lane_id
+  ): extent_(extent), divisible_(false) {
+
+    // compute offset based on thread ID and lane layout
+    typename Policy::LaneLayout lane_layout = Policy::get_lane_layout();
+
+    // Only the column component contributes: the row multiplier is zero.
+    MatrixCoord lane_offset = lane_layout.inverse(lane_id) * 
+      MatrixCoord(0, Policy::LaneMmaShape::kN);
+
+    origin_ = lane_offset;
+
+    ref.add_coord_offset(lane_offset);
+
+    ref_.reset(ref.data(), ref.stride(0));
+  }
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator &add_pointer_offset(LongIndex offset) {
+    ref_.add_pointer_offset(offset);
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  ///
+  /// Both the reference and the tracked origin are moved.
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator &add_tile_offset(TensorCoord const &coord) {
+
+    TensorCoord coord_offset(
+      coord.row() * Shape::kRow, 
+      coord.column() * Shape::kColumn);
+
+    origin_ += coord_offset;
+
+    ref_.add_coord_offset(coord_offset);
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension (forward by Shape::kRow rows)
+  ///
+  /// NOTE(review): unlike add_tile_offset(), this moves ref_ without updating
+  /// origin_, so extent checks keep using the previous origin - confirm intended.
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator & operator++() {
+
+    ref_.add_coord_offset({Shape::kRow, 0});
+
+    return *this;
+  }
+
+  /// Moves the iterator backwards along the advance dimension (by Shape::kRow rows)
+  ///
+  /// NOTE(review): origin_ is not updated here either - see operator++().
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator & operator--() {
+
+    ref_.add_coord_offset({-Shape::kRow, 0});
+
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator. (scalar loads)
+  ///
+  /// pointer_offset is applied directly to the element pointer. When bounds
+  /// checking is enabled, elements outside the extent are set to Element().
+  CUTLASS_HOST_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int k = 0; k < Iterations::kRow; ++k) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = 0; n < Iterations::kColumn; ++n) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < Policy::LaneMmaShape::kN; ++i) {
+
+          MatrixCoord offset(k, n * Policy::WarpShape::kColumn * Policy::LaneMmaShape::kN + i);
+            
+          MatrixCoord access_coord = origin_ + offset;
+
+          // NOTE(review): the k stride is Iterations::kColumn, so indices are only
+          // distinct when LaneMmaShape::kN == 1 or Iterations::kRow == 1 - confirm.
+          int frag_idx = n * Policy::LaneMmaShape::kN + i + k * Iterations::kColumn;
+
+          if (divisible_ || 
+              (access_coord.row() < extent_.row() && access_coord.column() < extent_.column())) {
+
+            frag[frag_idx] = *(ref_.data() + ref_.offset(offset) + pointer_offset);
+          }
+          else {
+            frag[frag_idx] = Element();
+          }
+        }
+      }
+    }
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    load_with_pointer_offset(frag, 0);
+  }
+  
+  /// Stores a fragment to memory at the location pointed to by the iterator (scalar stores)
+  ///
+  /// Exact inverse of load_with_pointer_offset(): the same coordinates and
+  /// fragment indices are used, and pointer_offset is applied directly to the
+  /// element pointer. When bounds checking is enabled, elements outside the
+  /// extent are not written.
+  CUTLASS_HOST_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const {
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int k = 0; k < Iterations::kRow; ++k) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = 0; n < Iterations::kColumn; ++n) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < Policy::LaneMmaShape::kN; ++i) {
+
+          MatrixCoord offset(k, n * Policy::WarpShape::kColumn * Policy::LaneMmaShape::kN + i);
+
+          MatrixCoord access_coord = origin_ + offset;
+
+          // Must match the index computed by load_with_pointer_offset().
+          int frag_idx = n * Policy::LaneMmaShape::kN + i + k * Iterations::kColumn;
+
+          if (divisible_ || 
+              (access_coord.row() < extent_.row() && access_coord.column() < extent_.column())) {
+
+            // ref_ is element-typed, so the store is performed per element.
+            *(ref_.data() + ref_.offset(offset) + pointer_offset) = frag[frag_idx];
+          }
+        }
+      }
+    }
+  }
+
+  /// Stores a fragment to memory at the location pointed to by the iterator
+  ///
+  /// pointer_offset defaults to zero and is forwarded to store_with_pointer_offset().
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag, Index pointer_offset = 0) const {
+    store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    // no operation here
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for C operands of column-major layouts
+///
+/// Each lane accesses the accumulator tile as vectors of Policy::LaneMmaShape::kM
+/// consecutive elements taken along a column.
+///
+/// Concept: MutableRandomAccessContiguousTileIteratorConcept
+///
+template <
+  /// Size of the matrix to load (concept: MatrixShape)
+  typename Shape_,
+  /// Data type of C elements
+  typename Element_,
+  /// Shape of the warp in units of thread (concept: MmaSimtPolicy)
+  typename Policy_
+>
+class MmaSimtTileIterator<Shape_, Operand::kC, Element_, layout::ColumnMajor, Policy_> {
+public:
+
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand::kC;
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of accumulators in memory
+  using Layout = layout::ColumnMajor;
+
+  /// Decomposition of elements among threads
+  using Policy = Policy_;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  //
+  // Derived quantities
+  //
+
+  static_assert(
+    (!(Shape::kRow % Policy::WarpShape::kRow)) && (!(Shape::kColumn % Policy::WarpShape::kColumn)),
+    "Warp-level GEMM shape must be divisible by the arrangement of threads in the warp.");
+
+  static_assert(Shape::kRow > 0, "Shape::kRow must be greater than zero.");
+  static_assert(Shape::kColumn > 0, "Shape::kColumn must be greater than zero.");
+  static_assert(Policy::WarpShape::kRow > 0, "Policy::WarpShape::kRow must be greater than zero.");
+  static_assert(Policy::WarpShape::kColumn > 0, "Policy::WarpShape::kColumn must be greater than zero.");
+  static_assert(Shape::kRow / Policy::WarpShape::kRow > 0, "Shape::kRow / Policy::WarpShape::kRow must be greater than zero.");
+  static_assert(Shape::kColumn / Policy::WarpShape::kColumn > 0, "Shape::kColumn / Policy::WarpShape::kColumn must be greater than zero.");
+
+  /// Thread-level shape of a fragment
+  using ThreadShape = MatrixShape<
+    Shape::kRow / Policy::WarpShape::kRow,
+    Shape::kColumn / Policy::WarpShape::kColumn
+  >;
+
+  static_assert(
+    (!(ThreadShape::kRow % Policy::LaneMmaShape::kM)) && (!(ThreadShape::kColumn % Policy::LaneMmaShape::kN)),
+    "Warp-level GEMM shape must be divisible by the arrangement of threads in the warp.");
+  
+  /// Number of individual LaneMmaShape-sized sub-tiles per thread
+  using Iterations = MatrixShape<
+    ThreadShape::kRow / Policy::LaneMmaShape::kM,
+    ThreadShape::kColumn / Policy::LaneMmaShape::kN
+  >;
+
+  /// Distance, in elements, between consecutive sub-tiles of the same thread
+  using Delta = MatrixShape<
+    Policy::WarpShape::kRow * Policy::LaneMmaShape::kM,
+    Policy::WarpShape::kColumn * Policy::LaneMmaShape::kN
+  >;
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = Array<Element, ThreadShape::kCount>;
+
+private:
+
+  /// Internal reference, already offset to this lane's first element
+  TensorRef ref_;
+
+public:
+  
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator() { }
+
+  /// Constructor from TensorRef
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator(
+    TensorRef const &ref, 
+    int lane_id
+  ):
+    ref_(ref) {
+
+    // compute offset based on thread ID and lane layout
+    typename Policy::LaneLayout lane_layout = Policy::get_lane_layout();
+
+    MatrixCoord lane_offset = lane_layout.inverse(lane_id) * 
+      MatrixCoord(Policy::LaneMmaShape::kM, Policy::LaneMmaShape::kN);
+
+    ref_.add_coord_offset(lane_offset);
+  }
+  
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator &add_pointer_offset(LongIndex offset) {
+    ref_.add_pointer_offset(offset);
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator &add_tile_offset(TensorCoord const &coord) {
+
+    ref_.add_coord_offset({
+      coord.row() * Shape::kRow, 
+      coord.column() * Shape::kColumn});
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension (forward by Shape::kRow rows)
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator & operator++() {
+
+    ref_.add_coord_offset({Shape::kRow, 0});
+
+    return *this;
+  }
+
+  /// Moves the iterator backwards along the advance dimension (by Shape::kRow rows)
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator & operator--() {
+
+    ref_.add_coord_offset({-Shape::kRow, 0});
+
+    return *this;
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  ///
+  /// Mirrors store_with_pointer_offset(): the same addresses and fragment
+  /// indices are used, with source and destination swapped.
+  CUTLASS_HOST_DEVICE
+  void load_with_pointer_offset(
+    Fragment &frag,                             ///< fragment to be loaded from memory
+    Index pointer_offset) const {               ///< linear offset (in units of Element) when loading
+
+    // Iterations, Delta and Policy::WarpShape are indexed with kRow / kColumn,
+    // exactly as in store_with_pointer_offset().
+    CUTLASS_PRAGMA_UNROLL
+    for (int mma_n = 0; mma_n < Iterations::kColumn; ++mma_n) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = 0; n < Policy::LaneMmaShape::kN; ++n) {
+
+        // Start of the column this thread reads, viewed as kM-wide vectors.
+        Array<Element, Policy::LaneMmaShape::kM> const *src_ptr = 
+          reinterpret_cast<Array<Element, Policy::LaneMmaShape::kM> const *>(
+            ref_.data() + pointer_offset + ref_.offset({0, mma_n * Delta::kColumn + n}));
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int mma_m = 0; mma_m < Iterations::kRow; ++mma_m) {
+
+          Array<Element, Policy::LaneMmaShape::kM> *dst_ptr = 
+            reinterpret_cast<Array<Element, Policy::LaneMmaShape::kM> *>(&frag) + 
+            mma_m + Iterations::kRow * (n + mma_n * Policy::LaneMmaShape::kN);
+
+          // Successive vectors of one thread are WarpShape::kRow vectors apart.
+          *dst_ptr = src_ptr[mma_m * Policy::WarpShape::kRow];
+        }
+      }
+    }
+  }
+    
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Stores a fragment to memory at the location pointed to by the iterator
+  ///
+  /// pointer_offset is added directly to the element pointer.
+  CUTLASS_HOST_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const {
+    
+    CUTLASS_PRAGMA_UNROLL
+    for (int mma_n = 0; mma_n < Iterations::kColumn; ++mma_n) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = 0; n < Policy::LaneMmaShape::kN; ++n) {
+
+        // Start of the column this thread writes, viewed as kM-wide vectors.
+        Array<Element, Policy::LaneMmaShape::kM> *dst_ptr= 
+          reinterpret_cast<Array<Element, Policy::LaneMmaShape::kM> *>(
+            ref_.data() + pointer_offset + ref_.offset({0, mma_n * Delta::kColumn + n}));
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int mma_m = 0; mma_m < Iterations::kRow; ++mma_m) {
+
+          Array<Element, Policy::LaneMmaShape::kM> const *src_ptr = 
+            reinterpret_cast<Array<Element, Policy::LaneMmaShape::kM> const *>(&frag) + 
+            mma_m + Iterations::kRow * (n + mma_n * Policy::LaneMmaShape::kN);
+
+          dst_ptr[mma_m * Policy::WarpShape::kRow] = *src_ptr;
+        }
+      }
+    }
+  }
+
+  /// Stores a fragment to memory at the location pointed to by the iterator
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag) const {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for C operands of row-major layouts
+///
+/// Each lane accesses the accumulator tile as vectors of Policy::LaneMmaShape::kN
+/// consecutive elements taken along a row.
+///
+/// Concept: MutableRandomAccessContiguousTileIteratorConcept
+///
+template <
+  /// Size of the matrix to load (concept: MatrixShape)
+  typename Shape_,
+  /// Data type of C elements
+  typename Element_,
+  /// Shape of the warp in units of thread (concept: MmaSimtPolicy)
+  typename Policy_
+>
+class MmaSimtTileIterator<Shape_, Operand::kC, Element_, layout::RowMajor, Policy_> {
+public:
+
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand::kC;
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of accumulators in memory
+  using Layout = layout::RowMajor;
+
+  /// Decomposition of elements among threads
+  using Policy = Policy_;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  //
+  // Derived quantities
+  //
+
+  static_assert(
+    (!(Shape::kRow % Policy::WarpShape::kRow)) && (!(Shape::kColumn % Policy::WarpShape::kColumn)),
+    "Warp-level GEMM shape must be divisible by the arrangement of threads in the warp.");
+
+  static_assert(Shape::kRow > 0, "Shape::kRow must be greater than zero.");
+  static_assert(Shape::kColumn > 0, "Shape::kColumn must be greater than zero.");
+  static_assert(Policy::WarpShape::kRow > 0, "Policy::WarpShape::kRow must be greater than zero.");
+  static_assert(Policy::WarpShape::kColumn > 0, "Policy::WarpShape::kColumn must be greater than zero.");
+  static_assert(Shape::kRow / Policy::WarpShape::kRow > 0, "Shape::kRow / Policy::WarpShape::kRow must be greater than zero.");
+  static_assert(Shape::kColumn / Policy::WarpShape::kColumn > 0, "Shape::kColumn / Policy::WarpShape::kColumn must be greater than zero.");
+
+  /// Thread-level shape of a fragment
+  using ThreadShape = MatrixShape<
+    Shape::kRow / Policy::WarpShape::kRow,
+    Shape::kColumn / Policy::WarpShape::kColumn
+  >;
+
+  static_assert(
+    (!(ThreadShape::kRow % Policy::LaneMmaShape::kM)) && (!(ThreadShape::kColumn % Policy::LaneMmaShape::kN)),
+    "Warp-level GEMM shape must be divisible by the arrangement of threads in the warp.");
+  
+  /// Number of individual LaneMmaShape-sized sub-tiles per thread
+  using Iterations = MatrixShape<
+    ThreadShape::kRow / Policy::LaneMmaShape::kM,
+    ThreadShape::kColumn / Policy::LaneMmaShape::kN
+  >;
+
+  /// Distance, in elements, between consecutive sub-tiles of the same thread
+  using Delta = MatrixShape<
+    Policy::WarpShape::kRow * Policy::LaneMmaShape::kM,
+    Policy::WarpShape::kColumn * Policy::LaneMmaShape::kN
+  >;
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = Array<Element, ThreadShape::kCount>;
+
+private:
+
+  /// Vector moved by a single access: LaneMmaShape::kN elements of one row
+  using AccessType = Array<Element, Policy::LaneMmaShape::kN>;
+
+  /// Internal reference, already offset to this lane's first element
+  TensorRef ref_;
+
+public:
+  
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator() { }
+
+  /// Constructor from TensorRef
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator(
+    TensorRef const &ref, 
+    int lane_id
+  ):
+    ref_(ref) {
+
+    // Position of this lane inside the warp, scaled to element coordinates.
+    typename Policy::LaneLayout lane_layout = Policy::get_lane_layout();
+    MatrixCoord const lane_mma_extent(Policy::LaneMmaShape::kM, Policy::LaneMmaShape::kN);
+
+    ref_.add_coord_offset(lane_layout.inverse(lane_id) * lane_mma_extent);
+  }
+  
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator &add_pointer_offset(LongIndex offset) {
+    ref_.add_pointer_offset(offset);
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator &add_tile_offset(TensorCoord const &coord) {
+
+    TensorCoord const element_offset(
+      coord.row() * Shape::kRow,
+      coord.column() * Shape::kColumn);
+
+    ref_.add_coord_offset(element_offset);
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension (forward by Shape::kRow rows)
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator & operator++() {
+    ref_.add_coord_offset({Shape::kRow, 0});
+    return *this;
+  }
+
+  /// Moves the iterator backwards along the advance dimension (by Shape::kRow rows)
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator & operator--() {
+    ref_.add_coord_offset({-Shape::kRow, 0});
+    return *this;
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_HOST_DEVICE
+  void load_with_pointer_offset(
+    Fragment &frag,                             ///< fragment to be loaded from memory
+    Index pointer_offset) const {               ///< linear offset (in units of Element) when loading
+
+    // View the fragment as a sequence of kN-wide vectors.
+    AccessType *frag_vectors = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int tile_m = 0; tile_m < Iterations::kRow; ++tile_m) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int lane_m = 0; lane_m < Policy::LaneMmaShape::kM; ++lane_m) {
+
+        // Row inside the tile, and the matching row inside the fragment.
+        int const tile_row = tile_m * Delta::kRow + lane_m;
+        int const frag_row = lane_m + tile_m * Policy::LaneMmaShape::kM;
+
+        AccessType const *row_vectors = reinterpret_cast<AccessType const *>(
+          ref_.data() + pointer_offset + ref_.offset({tile_row, 0}));
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int tile_n = 0; tile_n < Iterations::kColumn; ++tile_n) {
+
+          // Successive vectors of one thread are WarpShape::kColumn vectors apart.
+          frag_vectors[tile_n + Iterations::kColumn * frag_row] =
+            row_vectors[tile_n * Policy::WarpShape::kColumn];
+        }
+      }
+    }
+  }
+    
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Stores a fragment to memory at the location pointed to by the iterator
+  ///
+  /// pointer_offset is added directly to the element pointer.
+  CUTLASS_HOST_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const {
+
+    // View the fragment as a sequence of kN-wide vectors.
+    AccessType const *frag_vectors = reinterpret_cast<AccessType const *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int tile_m = 0; tile_m < Iterations::kRow; ++tile_m) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int lane_m = 0; lane_m < Policy::LaneMmaShape::kM; ++lane_m) {
+
+        // Row inside the tile, and the matching row inside the fragment.
+        int const tile_row = tile_m * Delta::kRow + lane_m;
+        int const frag_row = lane_m + tile_m * Policy::LaneMmaShape::kM;
+
+        AccessType *row_vectors = reinterpret_cast<AccessType *>(
+          ref_.data() + pointer_offset + ref_.offset({tile_row, 0}));
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int tile_n = 0; tile_n < Iterations::kColumn; ++tile_n) {
+
+          row_vectors[tile_n * Policy::WarpShape::kColumn] =
+            frag_vectors[tile_n + Iterations::kColumn * frag_row];
+        }
+      }
+    }
+  }
+  
+  /// Stores a fragment to memory at the location pointed to by the iterator
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag) const {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for A operands of column-major-K interleaved layouts
+///
+/// Concept: MutableRandomAccessContiguousTileIteratorConcept
+///
+template <
+  /// Size of the matrix to load (concept: MatrixShape)
+  typename Shape_,
+  /// Data type of A elements
+  typename Element_,
+  /// Shape of the warp in units of thread (concept: MmaSimtPolicy)
+  typename Policy_,
+  /// Number of partitions along K dimension
+  int PartitionsK,
+  /// Number of KGroups per kPartition
+  int PartitionGroupSize
+>
+class MmaSimtTileIterator<Shape_, Operand::kA, Element_, layout::ColumnMajorInterleaved<4>, Policy_, PartitionsK, PartitionGroupSize> {
+public:
+
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand::kA;
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of policy
+  using Layout = layout::ColumnMajorInterleaved<4> ;
+
+  /// Decomposition of elements among threads
+  using Policy = Policy_;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Interleave factor
+  static const int kInterleave = 4;
+  
+  /// Number of partitions along K dimension
+  static const int kPartitionsK = PartitionsK;
+
+  /// Number of KGroups per kPartition
+  static const int kGroupPerTile = PartitionGroupSize / Shape::kColumn;
+
+  //
+  // Derived quantities
+  //
+
+  static_assert(!(Shape::kRow % Policy::WarpShape::kRow), 
+    "The warp-level GEMM M size must be divisible by the number of threads arranged along the M dimension.");
+
+  static_assert(Shape::kRow > 0, "Shape::kRow must be greater than zero.");
+  static_assert(Shape::kColumn > 0, "Shape::kColumn must be greater than zero.");
+  static_assert(Policy::WarpShape::kRow > 0, "Policy::WarpShape::kRow must be greater than zero.");
+  static_assert(Shape::kRow / Policy::WarpShape::kRow > 0, "Shape::kRow / Policy::WarpShape::kRow must be greater than zero.");
+
+  /// Thread-level shape of a fragment
+  using ThreadShape = MatrixShape<
+    Shape::kRow / Policy::WarpShape::kRow,
+    Shape::kColumn
+  >;
+
+  static_assert(!(ThreadShape::kRow % Policy::LaneMmaShape::kM) && !(ThreadShape::kColumn % Policy::LaneMmaShape::kK), 
+    "Thread-level GEMM must be divisible by Policy::LaneMmaShape.");
+
+  /// Number of individual loads
+  using Iterations = MatrixShape<
+    ThreadShape::kRow / Policy::LaneMmaShape::kM,
+    ThreadShape::kColumn / Policy::LaneMmaShape::kK
+  >;
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = Array<Element, ThreadShape::kCount>;
+
+private:
+
+  /// Internal reference
+  cutlass::TensorRef<Array<Element, Policy::LaneMmaShape::kMK>, layout::ColumnMajorInterleaved<4>> ref_;
+
+  /// group index within tile
+  int k_group_idx_;
+
+public:
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator() { }
+
+  /// Constructor from TensorRef
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator(
+    TensorRef ref, 
+    int lane_id
+  ) {
+
+    // compute offset based on thread ID and lane layout
+    typename Policy::LaneLayout lane_layout = Policy::get_lane_layout();
+
+    MatrixCoord lane_offset = lane_layout.inverse(lane_id) * 
+      MatrixCoord(Policy::LaneMmaShape::kM, 0);
+
+    ref.add_coord_offset(lane_offset);
+
+    k_group_idx_ = 0;
+    ref_.reset(reinterpret_cast<Array<Element, Policy::LaneMmaShape::kMK> *>(ref.data()), ref.stride(0)/Policy::LaneMmaShape::kMK);
+  }
+  
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator &add_pointer_offset(LongIndex offset) {
+    ref_.add_pointer_offset(offset);
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator &add_tile_offset(TensorCoord const &coord) {
+
+    ref_.add_coord_offset({
+      coord.row() * Shape::kRow / Policy::LaneMmaShape::kMK, 
+      coord.column() * Shape::kColumn});
+
+    return *this;
+  }
+
+  /// Advances the iterator by one tile along the advance dimension (tile offset {0, 1});
+  /// with more than one K partition it also maintains the k-group counter.
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator & operator++() {
+    add_tile_offset({0, 1});
+
+    // A single K partition needs no k-group bookkeeping.
+    if (kPartitionsK <= 1) { return *this; }
+
+    // Every kGroupPerTile steps: reset the counter and jump to the next stage.
+    if (++k_group_idx_ == kGroupPerTile) {
+      k_group_idx_ = 0;
+      add_tile_offset({0, kGroupPerTile * (kPartitionsK-1)});
+    }
+
+    return *this;
+  }
+
+  /// Moves the iterator backwards along the advance dimension by Shape::kColumn columns
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator & operator--() {
+    // NOTE(review): unlike operator++, k_group_idx_ is not touched here.
+    ref_.add_coord_offset({0, -Shape::kColumn});
+
+    return *this;
+  }
+
+  /// Loads a fragment from memory, one Array<Element, LaneMmaShape::kMK> chunk per
+  /// (m, k) iteration; `pointer_offset / LaneMmaShape::kM` is added to every access.
+  CUTLASS_HOST_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
+
+    using AccessType = Array<Element, Policy::LaneMmaShape::kMK>;
+    AccessType *fragment_chunks = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int k = 0; k < Iterations::kColumn; ++k) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int m = 0; m < Iterations::kRow; ++m) {
+        // Chunks land in the fragment with m varying fastest.
+        AccessType const *source = ref_.data() + ref_.offset(
+          {m * Policy::WarpShape::kRow / kInterleave, k * Policy::LaneMmaShape::kK});
+        fragment_chunks[m + k * Iterations::kRow] = *(source + pointer_offset / Policy::LaneMmaShape::kM);
+      }
+    }
+  }
+
+  /// Loads a fragment from the iterator's current location (load_with_pointer_offset with offset 0).
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    load_with_pointer_offset(frag, 0);
+  }
+    
+  /// Stores a fragment to memory, writing the same chunks in the same order as load_with_pointer_offset reads them
+  CUTLASS_HOST_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const {
+    // The cast keeps `const`: reinterpret_cast is not allowed to remove qualifiers.
+    Array<Element, Policy::LaneMmaShape::kMK> const *src_ptr = 
+      reinterpret_cast<Array<Element, Policy::LaneMmaShape::kMK> const *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int k = 0; k < Iterations::kColumn; ++k) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int m = 0; m < Iterations::kRow; ++m) {
+        *(ref_.data() + ref_.offset({m * Policy::WarpShape::kRow / kInterleave, k * Policy::LaneMmaShape::kK})
+            + pointer_offset / Policy::LaneMmaShape::kM) = src_ptr[m + k * Iterations::kRow];
+      }
+    }
+  }
+
+  /// Stores a fragment at the iterator's current location (store_with_pointer_offset with offset 0)
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag) const {
+    store_with_pointer_offset(frag, 0);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  /// This is used by some nontrivial permuted layouts; this specialization
+  /// ignores `k_group` and leaves k_group_idx_ unchanged.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    // no operation here
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for B operands of row-major k-interleaved layouts
+///
+/// Concept: MutableRandomAccessContiguousTileIteratorConcept
+///
+template <
+  /// Size of the matrix to load (concept: MatrixShape)
+  typename Shape_,
+  /// Data type of B elements
+  typename Element_,
+  /// Shape of the warp in units of thread (concept: MmaSimtPolicy)
+  typename Policy_,
+  /// Number of partitions along K dimension
+  int PartitionsK,
+  /// Number of KGroups per kPartition
+  int PartitionGroupSize
+>
+class MmaSimtTileIterator<Shape_, Operand::kB, Element_, layout::RowMajorInterleaved<4>, Policy_, PartitionsK, PartitionGroupSize> {
+public:
+
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand::kB;
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of policy
+  using Layout = layout::RowMajorInterleaved<4>;
+
+  /// Decomposition of elements among threads
+  using Policy = Policy_;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Interleave factor
+  static const int kInterleave = 4;
+
+  /// Number of partitions along K dimension
+  static const int kPartitionsK = PartitionsK;
+
+  /// Number of KGroups per kPartition
+  static const int kGroupPerTile = PartitionGroupSize / Shape::kRow;
+
+  //
+  // Derived quantities
+  //
+
+  static_assert(!(Shape::kColumn % Policy::WarpShape::kColumn), 
+    "The warp-level GEMM N size must be divisible by the number of threads arranged along the N dimension.");
+
+  static_assert(Shape::kRow > 0, "Shape::kRow must be greater than zero.");
+  static_assert(Shape::kColumn > 0, "Shape::kColumn must be greater than zero.");
+  static_assert(Policy::WarpShape::kColumn > 0, "Policy::WarpShape::kColumn must be greater than zero.");
+  static_assert(Shape::kColumn / Policy::WarpShape::kColumn > 0, "Shape::kColumn / Policy::WarpShape::kColumn must be greater than zero.");
+
+  /// Thread-level shape of a fragment
+  using ThreadShape = MatrixShape<
+    Shape::kRow,
+    Shape::kColumn / Policy::WarpShape::kColumn
+  >;
+
+  static_assert(!(ThreadShape::kColumn % Policy::LaneMmaShape::kN) && !(ThreadShape::kRow % Policy::LaneMmaShape::kK), 
+    "Thread-level GEMM must be divisible by Policy::LaneMmaShape.");
+
+  /// Number of individual loads per thread: (chunks along K, chunks along N)
+  using Iterations = MatrixShape<
+    ThreadShape::kRow / Policy::LaneMmaShape::kK,
+    ThreadShape::kColumn / Policy::LaneMmaShape::kN
+  >;
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = Array<Element, ThreadShape::kCount>;
+
+
+private:
+
+  /// Internal reference; it addresses memory in units of Array<Element, LaneMmaShape::kKN>
+  cutlass::TensorRef<Array<Element, Policy::LaneMmaShape::kKN>, layout::RowMajorInterleaved<4>> ref_;
+
+  /// group index within tile
+  int k_group_idx_;
+
+public:
+  
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator() { }
+
+  /// Constructor from TensorRef
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator(
+    TensorRef ref, 
+    int lane_id
+  ) {
+
+    // compute offset based on thread ID and lane layout
+    typename Policy::LaneLayout lane_layout = Policy::get_lane_layout();
+
+    MatrixCoord lane_offset = lane_layout.inverse(lane_id) * 
+      MatrixCoord(0, Policy::LaneMmaShape::kN);
+
+    ref.add_coord_offset(lane_offset);
+
+    k_group_idx_ = 0;
+
+    ref_.reset(
+      reinterpret_cast<Array<Element, Policy::LaneMmaShape::kKN> *>(ref.data()),
+      ref.stride(0) / Policy::LaneMmaShape::kKN);
+  }
+  
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator &add_pointer_offset(LongIndex offset) {
+    ref_.add_pointer_offset(offset);
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator &add_tile_offset(TensorCoord const &coord) {
+
+    ref_.add_coord_offset({
+      coord.row() * Shape::kRow, 
+      coord.column() * Shape::kColumn / Policy::LaneMmaShape::kKN});
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator & operator++() {
+
+    add_tile_offset({1, 0});
+
+    if (kPartitionsK > 1) {
+      ++k_group_idx_;
+      // Jump to next stage
+      if (k_group_idx_ == kGroupPerTile) {
+        k_group_idx_ = 0;
+        add_tile_offset({kGroupPerTile * (kPartitionsK-1), 0});
+      }
+    }
+
+    return *this;
+  }
+
+  /// Moves the iterator backwards along the advance dimension by Shape::kRow rows
+  CUTLASS_HOST_DEVICE
+  MmaSimtTileIterator & operator--() {
+
+    ref_.add_coord_offset({-Shape::kRow, 0});
+
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
+
+    Array<Element, Policy::LaneMmaShape::kKN> *dst_ptr = 
+      reinterpret_cast<Array<Element, Policy::LaneMmaShape::kKN> *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int k = 0; k < Iterations::kRow; ++k) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = 0; n < Iterations::kColumn; ++n) {
+        dst_ptr[n + k * Iterations::kColumn] = 
+          *(ref_.data() + ref_.offset({k * Policy::LaneMmaShape::kK, 
+                n * Policy::WarpShape::kColumn / kInterleave}) + pointer_offset / Policy::LaneMmaShape::kN);
+      }
+    }
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    load_with_pointer_offset(frag, 0);
+  }
+  
+  /// Stores a fragment to memory at the location pointed to by the iterator,
+  /// visiting the same chunks, in the same order, as load_with_pointer_offset.
+  CUTLASS_HOST_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const {
+    Array<Element, Policy::LaneMmaShape::kKN> const *src_ptr = 
+      reinterpret_cast<Array<Element, Policy::LaneMmaShape::kKN> const *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int k = 0; k < Iterations::kRow; ++k) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = 0; n < Iterations::kColumn; ++n) {
+        *(ref_.data() + ref_.offset({k * Policy::LaneMmaShape::kK, n * Policy::WarpShape::kColumn / kInterleave})
+            + pointer_offset / Policy::LaneMmaShape::kN) = src_ptr[n + k * Iterations::kColumn];
+      }
+    }
+  }
+
+  /// Stores a fragment at the iterator's location; `pointer_offset` is optional (default 0) and is forwarded
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag, Index pointer_offset = 0) const {
+    store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    // no operation here
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace gemm
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_sparse_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_sparse_tensor_op.h
new file mode 100755
index 000000000..1ce1051c4
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_sparse_tensor_op.h
@@ -0,0 +1,382 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing warp-level matrix multiply-accumulate
+   operations targeting sparse Tensor Cores.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/platform/platform.h"
+
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/arch/memory_sm75.h"
+#include "cutlass/arch/mma_sm75.h" 
+#include "cutlass/arch/mma_sm80.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/warp/mma.h"
+
+#include "cutlass/gemm/warp/mma_tensor_op_policy.h"
+#include "cutlass/gemm/warp/mma_tensor_op.h"
+
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h"
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h"
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sparse.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace warp {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the warp-level matrix product targeting sparse Tensor Cores.
+template <
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape_,
+  /// Data type of A elements
+  typename ElementA_,
+  /// Layout of A matrix (concept: MatrixLayout)
+  typename LayoutA_,
+  /// Data type of B elements
+  typename ElementB_,
+  /// Layout of B matrix (concept: MatrixLayout)
+  typename LayoutB_,
+  /// Element type of C matrix
+  typename ElementC_,
+  /// Layout of C matrix (concept: MatrixLayout)
+  typename LayoutC_,
+  /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
+  typename Policy_,
+  /// Number of partitions along K dimension
+  int PartitionsK_ = 1,
+  /// Store the accumulators in row major or column major.  Row major is used
+  /// when output layout is interleaved.
+  bool AccumulatorsInRowMajor = false,
+  /// Used for partial specialization
+  typename Enable = bool
+>
+class SparseMmaTensorOp {
+public:
+  /// Shape of warp-level matrix operation (concept: GemmShape)
+  using Shape = Shape_;
+
+  /// Data type of multiplicand A
+  using ElementA = ElementA_;
+
+  /// Layout of multiplicand A
+  using LayoutA = LayoutA_;
+
+  /// Data type of multiplicand B
+  using ElementB = ElementB_;
+
+  /// Layout of multiplicand B
+  using LayoutB = LayoutB_;
+
+  /// Data type of accumulator matrix C
+  using ElementC = ElementC_;
+
+  /// Layout of accumulator matrix C
+  using LayoutC = LayoutC_;
+
+  /// Policy describing the warp-level MmaTensorOp (concept: MmaTensorOp policy)
+  using Policy = Policy_;
+
+  /// Equivalent base dense mma
+  using Base = MmaTensorOp<Shape, ElementA, LayoutA, ElementB, LayoutB,
+                           ElementC, LayoutC, Policy, PartitionsK_,
+                           AccumulatorsInRowMajor, Enable>;
+
+  /// Underlying matrix multiply operator (concept: arch::Mma)
+  using ArchMmaOperator = typename Base::ArchMmaOperator;
+
+  /// Indicates math operator 
+  using MathOperator = typename ArchMmaOperator::Operator;
+  
+  /// Architecture tag from underlying instruction
+  using ArchTag = typename Base::ArchTag;
+
+  /// Indicates class of matrix operator
+  using OperatorClass = typename Base::OperatorClass;
+
+  /// Shape of underlying instruction
+  using InstructionShape = typename Base::InstructionShape;
+
+  /// Complex transform on A operand
+  static ComplexTransform const kTransformA = Base::kTransformA;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB = Base::kTransformB;
+
+  /// Number of threads participating in warp-level matrix product
+  static int const kThreadCount = 32;
+
+  /// Number of partitions along K dimension
+  static int const kPartitionsK = PartitionsK_;
+
+  /// Sparsity in Operand A
+  static int const kSparse = Policy::Operator::kSparse;
+
+  /// Meta data size in bits 
+  static int const kMetaSizeInBits = Policy::Operator::kMetaSizeInBits;
+
+  /// Max ID2
+  static int const kMaxID2 = Policy::Operator::kMaxID2;
+  /// Loop-order selector read by operator() and transform(); fixed to false in this class
+    static int const kVerticalVisit = false;
+  /// Data type of meta E that is moved at the same time
+  using ElementE =
+      typename cutlass::platform::conditional<kMaxID2 == 1, uint32_t,
+                                              uint16_t>::type;
+
+  /// Number of ElementA that is associated with one ElementE
+  static int const kElementsPerElementE =
+      128 / cutlass::sizeof_bits<ElementA>::value;
+
+  /// Meta data is essentially interleaved but mapped to ColumnMajor internally
+  static int const kInterleaved = 2;
+
+  /// Layout of meta E 
+  using LayoutE = cutlass::layout::ColumnMajor;
+
+ public:
+
+  /// Iterates over the A operand in memory (K extent divided by kSparse)
+ using IteratorA = MmaTensorOpMultiplicandTileIterator<
+     MatrixShape<Shape::kM, Shape::kK / kSparse>, Operand::kA, ElementA,
+     LayoutA,
+     MatrixShape<Policy::Operator::Shape::kM,
+                 Policy::Operator::Shape::kK / kSparse>,
+     Policy::OpDelta::kRow, kThreadCount, kPartitionsK>;
+
+ /// Storage for A tile
+ using FragmentA = typename IteratorA::Fragment;
+
+ /// Storage for transformed A tile
+ using TransformedFragmentA =
+     Array<typename Policy::Operator::ElementA, FragmentA::kElements>;
+
+ /// Iterates over the B operand in memory
+ using IteratorB = typename Base::IteratorB;
+
+ /// Storage for B tile
+ using FragmentB = typename Base::FragmentB;
+
+ /// Storage for transformed B tile
+ using TransformedFragmentB = typename Base::TransformedFragmentB;
+
+ /// Iterates over the C operand in memory
+ using IteratorC = typename Base::IteratorC;
+
+ /// Storage for C tile
+ using FragmentC = typename Base::FragmentC;
+
+ /// Iterates over the E operand in memory
+ using IteratorE = SparseMmaTensorOpMetaTileIterator<
+     MatrixShape<Shape::kM * kInterleaved,
+                 Shape::kK / kSparse / kElementsPerElementE / kInterleaved>,
+     ElementE, LayoutE,
+     MatrixShape<Policy::Operator::Shape::kM,
+                 Policy::Operator::Shape::kK / kSparse / kElementsPerElementE /
+                     kInterleaved>,
+     Policy::OpDelta::kRow, kThreadCount, kPartitionsK>;
+
+ /// Storage for E tile
+ using FragmentE = typename IteratorE::Fragment;
+
+ /// Number of mma operations performed
+ using MmaIterations = typename Base::MmaIterations;
+
+public:
+
+  /// Underlying matrix multiply operator (concept: arch::Mma)
+  ArchMmaOperator mma;
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Ctor
+  CUTLASS_DEVICE
+  SparseMmaTensorOp() {}
+
+  /// Performs a warp-level matrix multiply-accumulate operation: copies C into D, then calls `mma` once per (m, n) tile
+  CUTLASS_DEVICE
+  void operator()(
+    FragmentC &D, 
+    TransformedFragmentA const &A, 
+    TransformedFragmentB const &B, 
+    FragmentC const &C,
+    FragmentE const &E
+  ) const {
+
+    using MmaOperandA = typename Policy::Operator::FragmentA;
+    using MmaOperandB = typename Policy::Operator::FragmentB;
+    using MmaOperandC = typename Policy::Operator::FragmentC;
+    using MmaOperandE = typename Policy::Operator::FragmentE;
+    // D starts as a copy of C and is then passed as both output and accumulator input of every mma call.
+    D = C;
+
+    MmaOperandA const *ptr_A = reinterpret_cast<MmaOperandA const *>(&A);
+    MmaOperandB const *ptr_B = reinterpret_cast<MmaOperandB const *>(&B);
+    MmaOperandC *ptr_D = reinterpret_cast<MmaOperandC *>(&D);
+    MmaOperandE const *ptr_E = reinterpret_cast<MmaOperandE const *>(&E);
+    // kVerticalVisit is a constant false in this class, so the else-branch below is the one that executes.
+    if (kVerticalVisit) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = 0; n < MmaIterations::kColumn; ++n) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int m = 0; m < MmaIterations::kRow; ++m) {
+
+          int m_serpentine = ((n % 2) ? (MmaIterations::kRow - 1 - m) : m);
+          int id2 = m_serpentine % kMaxID2;
+
+          if (AccumulatorsInRowMajor) {  // matrix B is reordered
+            mma(
+              ptr_D[n + m_serpentine * MmaIterations::kColumn],
+              ptr_A[m_serpentine],
+              ptr_B[n],
+              ptr_D[n + m_serpentine * MmaIterations::kColumn],
+              ptr_E[(m_serpentine / kMaxID2)],
+              id2);
+          } else {
+            mma(
+              ptr_D[m_serpentine + n * MmaIterations::kRow],
+              ptr_A[m_serpentine],
+              ptr_B[n],
+              ptr_D[m_serpentine + n * MmaIterations::kRow],
+              ptr_E[(m_serpentine / kMaxID2)],
+              id2);
+          }
+        }
+      }
+    } else {
+      CUTLASS_PRAGMA_UNROLL
+      for (int m = 0; m < MmaIterations::kRow; ++m) {
+        // One E fragment serves kMaxID2 consecutive m iterations; id2 is the position within that group.
+        int id2 = m % kMaxID2;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int n = 0; n < MmaIterations::kColumn; ++n) {
+          // Serpentine order: odd m walks n in reverse.
+          int n_serpentine = ((m % 2) ? (MmaIterations::kColumn - 1 - n) : n);
+
+          if (AccumulatorsInRowMajor) {  // matrix B is reordered
+            mma(
+              ptr_D[n_serpentine + m * MmaIterations::kColumn],
+              ptr_A[m],
+              ptr_B[n_serpentine],
+              ptr_D[n_serpentine + m * MmaIterations::kColumn],
+              ptr_E[(m / kMaxID2)],
+              id2);
+          } else {
+            mma(ptr_D[m + n_serpentine * MmaIterations::kRow],
+                ptr_A[m],
+                ptr_B[n_serpentine],
+                ptr_D[m + n_serpentine * MmaIterations::kRow],
+                ptr_E[(m / kMaxID2)],
+                id2);
+          }
+        }
+      }
+    }
+  }
+
+  /// Transform the mma operands to the element types of the underlying instruction
+  CUTLASS_DEVICE
+  void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B,
+                 FragmentA const &A, FragmentB const &B) const {
+
+    //
+    // Define conversions from source type to instruction type
+    //
+    FloatRoundStyle const kRoundA =
+        PreferredRoundingMode<typename ArchMmaOperator::ElementA,
+                              ElementA>::kRound;
+    FloatRoundStyle const kRoundB =
+        PreferredRoundingMode<typename ArchMmaOperator::ElementB,
+                              ElementB>::kRound;
+    // With kVerticalVisit == false only the else-branch runs: B in one pass, A as two halves.
+    if (kVerticalVisit) {
+      detail::ConvertAndPack<typename ArchMmaOperator::ElementA, ElementA,
+                            FragmentA::kElements, kRoundA>
+          convert_A;
+      NumericArrayConverter<typename ArchMmaOperator::ElementB, ElementB,
+                            FragmentB::kElements / 2, kRoundB>
+          convert_B;
+      Array<ElementB, FragmentB::kElements / 2> const *ptr_B =
+          reinterpret_cast<Array<ElementB, FragmentB::kElements / 2> const *>(&B);
+      Array<typename ArchMmaOperator::ElementB, FragmentB::kElements / 2> *
+          ptr_dst_B = reinterpret_cast<Array<typename ArchMmaOperator::ElementB,
+                                             FragmentB::kElements / 2> *>(&dst_B);
+  
+      dst_A = convert_A(A);
+  
+      ptr_dst_B[0] = convert_B(ptr_B[0]);
+      ptr_dst_B[1] = convert_B(ptr_B[1]);
+    } else {
+      detail::ConvertAndPack<typename ArchMmaOperator::ElementA, ElementA,
+                             FragmentA::kElements / 2, kRoundA>
+          convert_A;
+      NumericArrayConverter<typename ArchMmaOperator::ElementB, ElementB,
+                            FragmentB::kElements, kRoundB>
+          convert_B;
+      Array<ElementA, FragmentA::kElements / 2> const *ptr_A =
+          reinterpret_cast<Array<ElementA, FragmentA::kElements / 2> const *>(&A);
+      Array<typename ArchMmaOperator::ElementA, FragmentA::kElements / 2> *
+          ptr_dst_A = reinterpret_cast<Array<typename ArchMmaOperator::ElementA,
+                                             FragmentA::kElements / 2> *>(&dst_A);
+  
+      dst_B = convert_B(B);
+  
+      ptr_dst_A[0] = convert_A(ptr_A[0]);
+      ptr_dst_A[1] = convert_A(ptr_A[1]);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op.h
new file mode 100755
index 000000000..d4aaf5be1
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op.h
@@ -0,0 +1,415 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing warp-level matrix multiply-accumulate operations targeting
+      Tensor Cores.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/platform/platform.h"
+
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/arch/memory_sm75.h"
+#include "cutlass/arch/mma_sm75.h" 
+#include "cutlass/arch/mma_sm80.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/warp/mma.h"
+
+#include "cutlass/gemm/warp/mma_tensor_op_policy.h"
+
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h"
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace warp {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+/// Generic case: delegates the Array<S, N> -> Array<T, N> conversion to NumericArrayConverter.
+template <typename T, typename S, int N, FloatRoundStyle Round>
+struct ConvertAndPack {
+  /// Converter used for the S -> T conversion
+  using Converter = NumericArrayConverter<T, S, N, Round>;
+
+  /// Returns `source` converted by Converter, element order untouched by this wrapper
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<S, N> const &source) {
+    return Converter()(source);
+  }
+};
+
+template <typename T, int N, FloatRoundStyle Round>
+struct ConvertAndPack<T, T, N, Round> {
+  /// Source and destination element types are the same: returns a copy of `source` unchanged
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &source) {
+		return source;
+  }
+};
+
+/// float -> bfloat16_t: reorders the inputs within each group of four, then converts.
+template <int N, FloatRoundStyle Round>
+struct ConvertAndPack<bfloat16_t, float, N, Round> {
+  using Converter = NumericArrayConverter<bfloat16_t, float, N, Round>;
+
+  /// Output element i is converted from the source element whose index has the two
+  /// lowest bits of i exchanged (0,1,2,3 reads 0,2,1,3); higher index bits are kept.
+  CUTLASS_HOST_DEVICE
+  Array<bfloat16_t, N> operator()(Array<float, N> const &source) {
+    Array<float, N> permuted;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int dst = 0; dst < N; ++dst) {
+      int const group_base = dst & ~3;                               // index bits 2 and above
+      int const swapped_low = ((dst & 1) << 1) | ((dst >> 1) & 1);   // bit 0 <-> bit 1
+      permuted[dst] = source[group_base | swapped_low];
+    }
+    return Converter()(permuted);
+  }
+};
+
+/// float -> half_t: reorders the inputs within each group of four, then converts.
+template <int N, FloatRoundStyle Round>
+struct ConvertAndPack<half_t, float, N, Round> {
+  using Converter = NumericArrayConverter<half_t, float, N, Round>;
+
+  /// Output element i is converted from the source element whose index has the two
+  /// lowest bits of i exchanged (0,1,2,3 reads 0,2,1,3); higher index bits are kept.
+  CUTLASS_HOST_DEVICE
+  Array<half_t, N> operator()(Array<float, N> const &source) {
+    Array<float, N> permuted;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int dst = 0; dst < N; ++dst) {
+      int const group_base = dst & ~3;                               // index bits 2 and above
+      int const swapped_low = ((dst & 1) << 1) | ((dst >> 1) & 1);   // bit 0 <-> bit 1
+      permuted[dst] = source[group_base | swapped_low];
+    }
+    return Converter()(permuted);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace detail
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the matrix product targeting Tensor Cores.
+template <
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape_,
+  /// Data type of A elements
+  typename ElementA_,
+  /// Layout of A matrix (concept: MatrixLayout)
+  typename LayoutA_,
+  /// Data type of B elements
+  typename ElementB_,
+  /// Layout of B matrix (concept: MatrixLayout)
+  typename LayoutB_,
+  /// Element type of C matrix
+  typename ElementC_,
+  /// Layout of C matrix (concept: MatrixLayout)
+  typename LayoutC_,
+  /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
+  typename Policy_,
+  /// Number of partitions along K dimension
+  int PartitionsK_ = 1,
+  /// Store the accumulators in row major or column major.  Row major is used
+  /// when output layout is interleaved.
+  bool AccumulatorsInRowMajor = false,
+  /// Used for partial specialization
+  typename Enable = bool
+>
+class MmaTensorOp {
+public:
+  /// Shape of warp-level matrix operation (concept: GemmShape)
+  using Shape = Shape_;
+
+  /// Data type of multiplicand A
+  using ElementA = ElementA_;
+
+  /// Layout of multiplicand A
+  using LayoutA = LayoutA_;
+
+  /// Data type of multiplicand B
+  using ElementB = ElementB_;
+
+  /// Layout of multiplicand B
+  using LayoutB = LayoutB_;
+
+  /// Data type of accumulator matrix C
+  using ElementC = ElementC_;
+
+  /// Layout of accumulator matrix C
+  using LayoutC = LayoutC_;
+
+  /// Shape of the warp in units of thread (concept: MmaLanePolicySimt)
+  using Policy = Policy_;
+
+  /// Underlying matrix multiply operator (concept: arch::Mma)
+  using ArchMmaOperator = typename Policy::Operator;
+
+  /// Indicates math operator 
+  using MathOperator = typename ArchMmaOperator::Operator;
+
+  /// Architecture tag from underlying instruction
+  using ArchTag = typename ArchMmaOperator::ArchTag;
+
+  /// Indicates class of matrix operator
+  using OperatorClass = arch::OpClassTensorOp;
+
+  /// Shape of underlying instruction
+  using InstructionShape = typename ArchMmaOperator::Shape;
+
+  /// Complex transform on A operand
+  static ComplexTransform const kTransformA = ComplexTransform::kNone;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB = ComplexTransform::kNone;
+
+  /// Number of threads participating in warp-level matrix product
+  static int const kThreadCount = 32;
+
+  /// Number of partitions along K dimension
+  static int const kPartitionsK = PartitionsK_;
+
+  #if defined(__CUDA_ARCH__) && ((__CUDA_ARCH__ < 800) || (__CUDA_ARCH__ == 890)) 
+    static int const kVerticalVisit = true;
+  #else
+    static int const kVerticalVisit = false;
+  #endif
+
+public:
+
+  /// Iterates over the A operand in memory
+  using IteratorA = MmaTensorOpMultiplicandTileIterator<
+     MatrixShape<Shape::kM, Shape::kK>, Operand::kA, ElementA, LayoutA,
+     MatrixShape<ArchMmaOperator::Shape::kM, ArchMmaOperator::Shape::kK>,
+     Policy::OpDelta::kRow, kThreadCount, kPartitionsK>;
+
+  /// Storage for A tile
+  using FragmentA = typename IteratorA::Fragment;
+
+  /// Storage for transformed A tile
+  using TransformedFragmentA =
+      Array<typename ArchMmaOperator::ElementA, FragmentA::kElements>;
+
+  /// Iterates over the B operand in memory
+  using IteratorB = MmaTensorOpMultiplicandTileIterator<
+      MatrixShape<Shape::kK, Shape::kN>, Operand::kB, ElementB, LayoutB,
+      MatrixShape<ArchMmaOperator::Shape::kK, ArchMmaOperator::Shape::kN>,
+      Policy::OpDelta::kRow, kThreadCount, kPartitionsK>;
+
+  /// Storage for B tile
+  using FragmentB = typename IteratorB::Fragment;
+
+  /// Storage for transformed B tile
+  using TransformedFragmentB =
+      Array<typename ArchMmaOperator::ElementB, FragmentB::kElements>;
+
+  /// Iterates over the C operand in memory
+  using IteratorC = MmaTensorOpAccumulatorTileIterator<
+     MatrixShape<Shape::kM, Shape::kN>, ElementC, LayoutC,
+     typename ArchMmaOperator::Shape, typename Policy::OpDelta>;
+
+  /// Storage for C tile
+  using FragmentC = typename IteratorC::Fragment;
+
+  /// Number of mma operations performed
+  using MmaIterations = MatrixShape<
+    (Shape::kM + ArchMmaOperator::Shape::kM - 1) / ArchMmaOperator::Shape::kM,
+    (Shape::kN + ArchMmaOperator::Shape::kN - 1) / ArchMmaOperator::Shape::kN
+  >;
+
+public:
+
+  /// Underlying matrix multiply operator (concept: arch::Mma)
+  ArchMmaOperator mma;
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Default constructor; the body is empty and performs no explicit initialization.
+  CUTLASS_DEVICE
+  MmaTensorOp() {}
+
+  /// Performs a warp-level matrix multiply-accumulate operation.
+  /// D is first set to C; the fragments are then viewed as arrays of per-instruction
+  /// operands and `mma` is issued once per (m, n) pair of MmaIterations, updating D in place.
+  CUTLASS_DEVICE
+  void operator()(
+    FragmentC &D, 
+    TransformedFragmentA const &A, 
+    TransformedFragmentB const &B, 
+    FragmentC const &C
+  ) const {
+    using MmaOperandA = typename ArchMmaOperator::FragmentA;
+    using MmaOperandB = typename ArchMmaOperator::FragmentB;
+    using MmaOperandC = typename ArchMmaOperator::FragmentC;
+
+    D = C;
+
+    MmaOperandA const *ptr_A = reinterpret_cast<MmaOperandA const *>(&A);
+    MmaOperandB const *ptr_B = reinterpret_cast<MmaOperandB const *>(&B);
+    MmaOperandC *ptr_D = reinterpret_cast<MmaOperandC *>(&D);
+
+    // kVerticalVisit selects the traversal: n outermost (m serpentine) or m outermost (n serpentine).
+    if (kVerticalVisit) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = 0; n < MmaIterations::kColumn; ++n) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int m = 0; m < MmaIterations::kRow; ++m) {
+          // Odd n walks m in reverse, so the A index is unchanged across the turn to the next n.
+          int m_serpentine = ((n % 2) ? (MmaIterations::kRow - 1 - m) : m);
+
+          if (AccumulatorsInRowMajor) {  // matrix B is reordered
+            mma(
+              ptr_D[n + m_serpentine * MmaIterations::kColumn],
+              ptr_A[m_serpentine],
+              ptr_B[n],
+              ptr_D[n + m_serpentine * MmaIterations::kColumn]);
+          } else {
+            mma(
+              ptr_D[m_serpentine + n * MmaIterations::kRow],
+              ptr_A[m_serpentine],
+              ptr_B[n],
+              ptr_D[m_serpentine + n * MmaIterations::kRow]);
+          }
+        }
+      }
+    } else {
+      CUTLASS_PRAGMA_UNROLL
+      for (int m = 0; m < MmaIterations::kRow; ++m) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int n = 0; n < MmaIterations::kColumn; ++n) {
+          // Odd m walks n in reverse, so the B index is unchanged across the turn to the next m.
+          int n_serpentine = ((m % 2) ? (MmaIterations::kColumn - 1 - n) : n);
+
+          if (AccumulatorsInRowMajor) {  // matrix B is reordered
+            mma(
+              ptr_D[n_serpentine + m * MmaIterations::kColumn],
+              ptr_A[m],
+              ptr_B[n_serpentine],
+              ptr_D[n_serpentine + m * MmaIterations::kColumn]);
+          } else {
+            mma(ptr_D[m + n_serpentine * MmaIterations::kRow],
+                ptr_A[m],
+                ptr_B[n_serpentine],
+                ptr_D[m + n_serpentine * MmaIterations::kRow]);
+          }
+        }
+      }
+    }
+  }
+
+  /// Converts the loaded fragments A and B (ElementA / ElementB) to the element types
+  /// of the underlying instruction, writing the results to dst_A and dst_B.
+  CUTLASS_DEVICE
+  void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B,
+                 FragmentA const &A, FragmentB const &B) const {
+    //
+    // Rounding styles for the source type -> instruction type conversions
+    //
+    FloatRoundStyle const kRoundA =
+        PreferredRoundingMode<typename ArchMmaOperator::ElementA,
+                              ElementA>::kRound;
+    FloatRoundStyle const kRoundB =
+        PreferredRoundingMode<typename ArchMmaOperator::ElementB,
+                              ElementB>::kRound;
+    if (kVerticalVisit) {    
+      // A is converted as one whole fragment; B is converted as two half-fragments.
+      detail::ConvertAndPack<typename ArchMmaOperator::ElementA, ElementA,
+                            FragmentA::kElements, kRoundA>
+          convert_A;
+      NumericArrayConverter<typename ArchMmaOperator::ElementB, ElementB,
+                            FragmentB::kElements / 2, kRoundB>
+          convert_B;
+      Array<ElementB, FragmentB::kElements / 2> const *ptr_B =
+          reinterpret_cast<Array<ElementB, FragmentB::kElements / 2> const *>(&B);
+      Array<typename ArchMmaOperator::ElementB, FragmentB::kElements / 2> *
+          ptr_dst_B = reinterpret_cast<Array<typename ArchMmaOperator::ElementB,
+                                             FragmentB::kElements / 2> *>(&dst_B);
+  
+      dst_A = convert_A(A);
+      ptr_dst_B[0] = convert_B(ptr_B[0]);
+      ptr_dst_B[1] = convert_B(ptr_B[1]);
+    } else {
+      // B is converted as one whole fragment; A is converted as two half-fragments.
+      detail::ConvertAndPack<typename ArchMmaOperator::ElementA, ElementA,
+                            FragmentA::kElements / 2, kRoundA>
+          convert_A;
+      NumericArrayConverter<typename ArchMmaOperator::ElementB, ElementB,
+                            FragmentB::kElements, kRoundB>
+          convert_B;
+      Array<ElementA, FragmentA::kElements / 2> const *ptr_A =
+          reinterpret_cast<Array<ElementA, FragmentA::kElements / 2> const *>(&A);
+      Array<typename ArchMmaOperator::ElementA, FragmentA::kElements / 2> *
+          ptr_dst_A = reinterpret_cast<Array<typename ArchMmaOperator::ElementA,
+                                             FragmentA::kElements / 2> *>(&dst_A);
+  
+      dst_B = convert_B(B);
+      ptr_dst_A[0] = convert_A(ptr_A[0]);
+      ptr_dst_A[1] = convert_A(ptr_A[1]);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "cutlass/gemm/warp/mma_tensor_op_fast_f32.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_fast_f32.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_fast_f32.h
new file mode 100755
index 000000000..148e71226
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_fast_f32.h
@@ -0,0 +1,471 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Templates implementing warp-level matrix multiply-accumulate operations targeting
+      Tensor Cores.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/platform/platform.h"
+
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/arch/mma_sm80.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/warp/mma.h"
+
+#include "cutlass/gemm/warp/mma_tensor_op_policy.h"
+#include "cutlass/gemm/warp/mma_tensor_op.h"
+
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h"
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace warp {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+enum class TensorFloat32Op {  // selects which big/small TF32 partial products are accumulated
+  k3xTF32,  // Big x Big, Big x Small, Small x Big
+  k4xTF32   // the three products above plus Small x Small
+}; 
+
+template <
+  /// Rounding style used for the big part of operand A
+  FloatRoundStyle RoundBigA_,
+  /// Rounding style used for the small part of operand A
+  FloatRoundStyle RoundSmallA_,
+  /// Rounding style used for the big part of operand B (defaults to RoundBigA_)
+  FloatRoundStyle RoundBigB_ = RoundBigA_,
+  /// Rounding style used for the small part of operand B (defaults to RoundSmallA_)
+  FloatRoundStyle RoundSmallB_ = RoundSmallA_,
+  /// Which partial products are accumulated
+  // (k3xTF32: BigxBig, BigxSmall, SmallxBig)
+  // (k4xTF32: BigxBig, BigxSmall, SmallxBig, SmallxSmall)
+  TensorFloat32Op Precision_ = TensorFloat32Op::k3xTF32
+  >
+struct FastF32 {
+  // Compile-time policy: re-exports the template arguments as static constants.
+  static FloatRoundStyle const kRoundBigA = RoundBigA_;
+  static FloatRoundStyle const kRoundSmallA = RoundSmallA_;
+  static FloatRoundStyle const kRoundBigB = RoundBigB_;
+  static FloatRoundStyle const kRoundSmallB = RoundSmallB_;
+  static TensorFloat32Op const kPrecision = Precision_;
+};
+
+
+namespace detail {
+
+  template<
+    int N,
+    FloatRoundStyle RoundBig = FloatRoundStyle::round_toward_zero,
+    FloatRoundStyle RoundSmall = FloatRoundStyle::round_half_ulp_truncate
+  >
+  struct ConvertAndPackAccurateF32 {
+  
+    /// Rounding styles applied to the big and the small part
+    static FloatRoundStyle const kRoundBig = RoundBig;
+    static FloatRoundStyle const kRoundSmall = RoundSmall;
+
+    /// Per-element converter
+    using Converter = NumericConverterFastF32<kRoundBig, kRoundSmall>;
+
+    /// Source fragment
+    using SourceFragment = Array<float, N>;
+
+    /// Destination fragment
+    using DestinationFragment = Array<tfloat32_t, N>;
+
+    /// Converter result: two tfloat32_t elements for every float
+    using ConverterFragment = Array<tfloat32_t, 2>;
+
+    /// Positions of the big and the small part inside a ConverterFragment
+    static int const kBigIndex = 0;
+    static int const kSmallIndex = 1;
+
+    /// Converts every element of `source`, writing big parts to `dst_big` and small parts to `dst_small`.
+    CUTLASS_HOST_DEVICE
+    void operator()(SourceFragment const &source,
+                    DestinationFragment &dst_big,
+                    DestinationFragment &dst_small) {
+      // One converter instance is reused for every element.
+      Converter split;
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int k = 0; k < N; ++k) {
+        // Each float yields a two-element fragment: {big, small}.
+        ConverterFragment parts = split(source[k]);
+
+        // Scatter the two parts into their respective destination fragments.
+        dst_big[k] = parts[kBigIndex];
+        dst_small[k] = parts[kSmallIndex];
+      }
+    }
+  };
+} // namespace detail
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Primary template (declaration only) of the warp-level FastF32 matrix multiply-accumulate targeting Tensor Cores.
+template <
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape_,
+  /// Data type of A elements
+  typename ElementA_,
+  /// Layout of A matrix (concept: MatrixLayout)
+  typename LayoutA_,
+  /// Data type of B elements
+  typename ElementB_,
+  /// Layout of B matrix (concept: MatrixLayout)
+  typename LayoutB_,
+  /// Element type of C matrix
+  typename ElementC_,
+  /// Layout of C matrix (concept: MatrixLayout)
+  typename LayoutC_,
+  /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
+  typename Policy_,
+  /// Number of partitions along K dimension
+  int PartitionsK_ = 1,
+  /// Store the accumulators in row major or column major.  Row major is used
+  /// when output layout is interleaved.
+  bool AccumulatorsInRowMajor = false,
+  /// Used for partial specialization
+  typename Enable = bool
+>
+class MmaTensorOpFastF32;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for float*float+float => float using TF32 TensorOps (operands are split into big/small tfloat32_t parts)
+template <
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape_,
+  /// Layout of A matrix (concept: MatrixLayout)
+  typename LayoutA_,
+  /// Layout of B matrix (concept: MatrixLayout)
+  typename LayoutB_,
+  /// Layout of C matrix (concept: MatrixLayout)
+  typename LayoutC_,
+  /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
+  typename Policy_,
+  /// Number of partitions along K dimension
+  int PartitionsK_,
+  /// Store the accumulators in row major or column major.  Row major is used
+  /// when output layout is interleaved.
+  bool AccumulatorsInRowMajor,
+  /// Used for partial specialization
+  typename Enable
+>
+class MmaTensorOpFastF32<
+  Shape_,
+  float, LayoutA_,
+  float, LayoutB_,
+  float, LayoutC_,
+  Policy_, PartitionsK_,
+  AccumulatorsInRowMajor, Enable> {
+public:
+  /// Shape of warp-level matrix operation (concept: GemmShape)
+  using Shape = Shape_;
+
+  /// Data type of multiplicand A
+  using ElementA = float;
+
+  /// Layout of multiplicand A
+  using LayoutA = LayoutA_;
+
+  /// Data type of multiplicand B
+  using ElementB = float;
+
+  /// Layout of multiplicand B
+  using LayoutB = LayoutB_;
+
+  /// Data type of accumulator matrix C
+  using ElementC = float;
+
+  /// Layout of accumulator matrix C
+  using LayoutC = LayoutC_;
+
+  /// Policy describing the warp-level MmaTensorOp (supplies Operator and OpDelta)
+  using Policy = Policy_;
+
+  /// Underlying matrix multiply operator (concept: arch::Mma)
+  using ArchMmaOperator = typename Policy::Operator;
+
+  /// Indicates math operator
+  using MathOperator = arch::OpMultiplyAddFastF32;
+
+  /// Architecture tag from underlying instruction
+  using ArchTag = typename ArchMmaOperator::ArchTag;
+
+  /// Indicates class of matrix operator
+  using OperatorClass = arch::OpClassTensorOp;
+
+  /// Shape of underlying instruction
+  using InstructionShape = typename ArchMmaOperator::Shape;
+
+  /// Complex transform on A operand
+  static ComplexTransform const kTransformA = ComplexTransform::kNone;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB = ComplexTransform::kNone;
+
+  /// Number of threads participating in warp-level matrix product
+  static int const kThreadCount = 32;
+
+  /// Number of partitions along K dimension
+  static int const kPartitionsK = PartitionsK_;
+
+  /// Tunes the F32 to TF32 big/small conversion for float operation.
+  /// Different combinations of big/small conversion give different tradeoffs
+  /// between speed and accuracy.  Generally, using round_half_ulp_truncate can
+  /// improve the performance but hurt the accuracy.
+  using MmaFastF32 = FastF32 <
+    FloatRoundStyle::round_toward_zero,        // kRoundBigA
+    FloatRoundStyle::round_half_ulp_truncate,  // kRoundSmallA
+    FloatRoundStyle::round_toward_zero,        // kRoundBigB
+    FloatRoundStyle::round_half_ulp_truncate,  // kRoundSmallB
+    TensorFloat32Op::k3xTF32                   // Number of TF32 operations 
+  >;
+
+public:
+
+  /// Iterates over the A operand in memory
+  using IteratorA = MmaTensorOpMultiplicandTileIterator<
+      MatrixShape<Shape::kM, Shape::kK>, 
+      Operand::kA, 
+      ElementA, 
+      LayoutA,
+      MatrixShape<ArchMmaOperator::Shape::kM, ArchMmaOperator::Shape::kK>,
+      Policy::OpDelta::kRow, 
+      kThreadCount, 
+      kPartitionsK
+  >;
+
+  /// Storage for A tile
+  using FragmentA = typename IteratorA::Fragment;
+
+  /// Storage for transformed A tile (twice FragmentA: big and small parts)
+  using TransformedFragmentA =
+      Array<typename ArchMmaOperator::ElementA, FragmentA::kElements * 2>;
+
+  /// Fragment bisecting big and small sections
+  using AccessTypeFragmentA = 
+      Array<typename ArchMmaOperator::ElementA, FragmentA::kElements>;
+
+  /// Iterates over the B operand in memory
+  using IteratorB = MmaTensorOpMultiplicandTileIterator<
+      MatrixShape<Shape::kK, Shape::kN>, 
+      Operand::kB, 
+      ElementB, 
+      LayoutB,
+      MatrixShape<ArchMmaOperator::Shape::kK, ArchMmaOperator::Shape::kN>,
+      Policy::OpDelta::kRow, 
+      kThreadCount, 
+      kPartitionsK
+  >;
+
+  /// Storage for B tile
+  using FragmentB = typename IteratorB::Fragment;
+
+  /// Storage for transformed B tile (twice FragmentB: big and small parts)
+  using TransformedFragmentB =
+      Array<typename ArchMmaOperator::ElementB, FragmentB::kElements * 2>;
+
+  /// Fragment bisecting big and small sections
+  using AccessTypeFragmentB = 
+      Array<typename ArchMmaOperator::ElementB, FragmentB::kElements>;
+
+  /// Index in fragments for the big and small part
+  static int const kBigIndex = 0;
+  static int const kSmallIndex = 1;
+
+  /// Iterates over the C operand in memory
+  using IteratorC = MmaTensorOpAccumulatorTileIterator<
+     MatrixShape<Shape::kM, Shape::kN>, ElementC, LayoutC,
+     typename ArchMmaOperator::Shape, typename Policy::OpDelta>;
+
+  /// Storage for C tile
+  using FragmentC = typename IteratorC::Fragment;
+
+  /// Number of mma operations performed
+  using MmaIterations = MatrixShape<
+    (Shape::kM + ArchMmaOperator::Shape::kM - 1) / ArchMmaOperator::Shape::kM,
+    (Shape::kN + ArchMmaOperator::Shape::kN - 1) / ArchMmaOperator::Shape::kN
+  >;
+
+public:
+
+  /// Underlying matrix multiply operator (concept: arch::Mma)
+  ArchMmaOperator mma;
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Default constructor; the body is empty
+  CUTLASS_DEVICE
+  MmaTensorOpFastF32() {}
+
+  /// Performs a warp-level matrix multiply-accumulate operation on transformed (big/small) operands
+  CUTLASS_DEVICE
+  void operator()(
+    FragmentC &D, 
+    TransformedFragmentA const &A, 
+    TransformedFragmentB const &B, 
+    FragmentC const &C
+  ) const {
+
+    AccessTypeFragmentA const *ptr_A = reinterpret_cast<AccessTypeFragmentA const*>(&A);
+    AccessTypeFragmentB const *ptr_B = reinterpret_cast<AccessTypeFragmentB const*>(&B);
+
+    // Accumulate in place: D starts as C, then the partial products are added in the
+    // order small(A)*big(B), big(A)*small(B), big(A)*big(B) and, only when kPrecision
+    // is k4xTF32, small(A)*small(B).
+    D = C;
+    
+    mma_operator(D, ptr_A[kSmallIndex], ptr_B[kBigIndex], D);
+
+    mma_operator(D, ptr_A[kBigIndex], ptr_B[kSmallIndex], D);
+
+    mma_operator(D, ptr_A[kBigIndex], ptr_B[kBigIndex], D);
+
+    if (MmaFastF32::kPrecision == TensorFloat32Op::k4xTF32)
+      mma_operator(D, ptr_A[kSmallIndex], ptr_B[kSmallIndex], D);
+  }
+
+  /// Issues `mma` for every (m, n) pair of MmaIterations using one big/small section of A and B.
+  /// The loop is compiled only when __CUDA_ARCH__ >= 800; otherwise the body is assert(0).
+  CUTLASS_DEVICE
+  void mma_operator(
+    FragmentC &D, 
+    AccessTypeFragmentA const &A, 
+    AccessTypeFragmentB const &B, 
+    FragmentC const &C
+  ) const {
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+
+      using MmaOperandA = typename ArchMmaOperator::FragmentA;
+      using MmaOperandB = typename ArchMmaOperator::FragmentB;
+      using MmaOperandC = typename ArchMmaOperator::FragmentC;
+
+      MmaOperandA const *ptr_A = reinterpret_cast<MmaOperandA const *>(&A);
+      MmaOperandB const *ptr_B = reinterpret_cast<MmaOperandB const *>(&B);
+      MmaOperandC *ptr_D = reinterpret_cast<MmaOperandC *>(&D);
+
+      // Serpentine visitation order maximizing reuse of Ra
+      CUTLASS_PRAGMA_UNROLL
+      for (int m = 0; m < MmaIterations::kRow; ++m) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int n = 0; n < MmaIterations::kColumn; ++n) {
+
+          // This allows reuse of Rb at serpentine turns
+          int n_serpentine = ((m % 2) ? (MmaIterations::kColumn - 1 - n) : n);
+
+          if (AccumulatorsInRowMajor) {  // matrix B is reordered
+            mma(
+              ptr_D[n_serpentine + m * MmaIterations::kColumn],
+              ptr_A[m],
+              ptr_B[n_serpentine],
+              ptr_D[n_serpentine + m * MmaIterations::kColumn]);
+          } else {
+            mma(
+              ptr_D[m + n_serpentine * MmaIterations::kRow],
+              ptr_A[m],
+              ptr_B[n_serpentine],
+              ptr_D[m + n_serpentine * MmaIterations::kRow]);
+          }
+        } // end n loop
+      } // end m loop
+    #else
+      assert(0);
+    #endif
+  }
+
+  /// Transform the mma operands to the required types (float -> big/small tfloat32_t parts)
+  CUTLASS_DEVICE
+  void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B,
+                 FragmentA const &A, FragmentB const &B) const {
+
+    // Split each float operand into a big and a small part of the instruction type.
+    // dst_B is written as [big(B) | small(B)]; dst_A as [big(A0) | big(A1) | small(A0) | small(A1)],
+    // where A0 and A1 are the two halves of A.
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+      
+      detail::ConvertAndPackAccurateF32<
+        FragmentA::kElements / 2,
+        MmaFastF32::kRoundBigA,
+        MmaFastF32::kRoundSmallA> convert_A;
+      
+      detail::ConvertAndPackAccurateF32<
+        FragmentB::kElements,
+        MmaFastF32::kRoundBigB,
+        MmaFastF32::kRoundSmallB> convert_B;
+      
+      Array<typename ArchMmaOperator::ElementB, FragmentB::kElements> *ptr_dst_B = 
+        reinterpret_cast<Array<typename ArchMmaOperator::ElementB, FragmentB::kElements> *>(&dst_B);
+      
+      convert_B(B, ptr_dst_B[0], ptr_dst_B[1]);
+
+      Array<typename ArchMmaOperator::ElementA, FragmentA::kElements / 2> *ptr_dst_A =
+        reinterpret_cast<Array<typename ArchMmaOperator::ElementA, FragmentA::kElements / 2> *>(&dst_A);
+      
+      Array<ElementA, FragmentA::kElements / 2> const *ptr_A = 
+        reinterpret_cast<Array<ElementA, FragmentA::kElements / 2> const *>(&A);
+      
+      convert_A(ptr_A[0], ptr_dst_A[0], ptr_dst_A[2]);
+      
+      convert_A(ptr_A[1], ptr_dst_A[1], ptr_dst_A[3]);
+    #else
+      assert(0);
+    #endif
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h
new file mode 100755
index 000000000..32460b629
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h
@@ -0,0 +1,559 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief This defines a "fragment" iterator for visiting the fragments of a warp tile
+      that participate in one warp-level mma operation.
+
+      Typically, this is used to access the accumulator tile/fragment of a warp-level mma operation.
+      The accumulator tile is then partitioned into smaller tiles/fragments that can be fed into 
+      next warp-level mma operation. 
+
+      This iterator is necessary to accomplish warp-level mma fusion where the accumulator tile is 
+      reused as multiplicand tile for the next mma.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/array.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/numeric_conversion.h"
+
+namespace cutlass {
+namespace gemm {
+namespace warp {
+
+
+////////////////////////////////////////////////////////////////////////////////
+/// Primary template (declaration only) of the fragment iterator over a warp-level accumulator tile.
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Size of the accumulation tile shape (concept: MatrixShape)
+    typename AccumulatorShape_,
+    /// KBlocks columns to compute residual
+    int KBlocksColumn_,
+    /// Accumulator Element type
+    typename ElementAccumulator_,    
+    /// Element type
+    typename Element_,
+    /// Layout of operand in memory
+    typename Layout_,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Output operation on the fragment
+    typename OutputOp_>
+class MmaTensorOpFragmentIterator;
+
+
+// Partial specialization for col-major accumulator tile
+
+template <
+    /// Shape of warp tile to load (concept: MatrixShape)
+    typename Shape_,
+    /// Shape of the warp accumulation tile (concept: MatrixShape)
+    typename AccumulatorShape_,
+    /// KBlocks columns to compute residual
+    int KBlocksColumn_,    
+    /// Accumulator Element type
+    typename ElementAccumulator_,
+    /// Element type
+    typename Element_,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Output operation on fragment
+    typename OutputOp_>
+class MmaTensorOpFragmentIterator<Shape_, AccumulatorShape_, KBlocksColumn_, ElementAccumulator_, Element_,
+                                         cutlass::layout::ColumnMajor,
+                                         InstructionShape_, OutputOp_> {
+ public:
+
+  /// Shape of warp tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+    
+  /// Shape of the warp accumulation tile (concept: MatrixShape)
+  using AccumulatorShape = AccumulatorShape_;
+
+  /// KBlocks columns to compute residual
+  static int const kKBlockColumn = KBlocksColumn_;
+
+  /// Accumulator Element type
+  using ElementAccumulator = ElementAccumulator_;
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::ColumnMajor;
+
+  /// Shape of one matrix product operation (concept: MatrixShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Output operation on fragment
+  using OutputOp = OutputOp_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// Internal structure of iterator - made public to enable introspection
+  struct Policy {
+    static_assert(
+        !(Shape::kRow % InstructionShape::kM) &&
+            !(Shape::kColumn % InstructionShape::kN),
+        "Shape of warp-level Mma must be divisible by operator shape.");
+    static_assert(
+        AccumulatorShape::kRow == Shape::kRow, 
+        "Rows of Warp Accumulator must be the same as rows of warp");
+    static_assert(
+        !(AccumulatorShape::kColumn % Shape::kColumn),
+        "Shape of Warp Accumulator must be divisible by warp shape.");
+    static_assert(
+        !(kKBlockColumn % Shape::kColumn),
+        "KBlock size must be divisible by warp shape.");
+
+    /// Number of times this iterator can be incremented
+    static int const kIterations = AccumulatorShape::kCount / Shape::kCount;
+  };
+
+private:
+
+  static int const kElementsPerAccess = InstructionShape::kM * InstructionShape::kN / kThreads;
+
+  /// Number of mma operations performed by a warp
+  using MmaIterations = MatrixShape<Shape::kRow / InstructionShape::kM,
+                                    Shape::kColumn / InstructionShape::kN>;
+  /// Number of mma operations performed by the entire accumulator
+  using AccumulatorIterations = MatrixShape<AccumulatorShape::kRow / InstructionShape::kM,
+                                              AccumulatorShape::kColumn / InstructionShape::kN>;
+
+  /// Number of K iterations    
+  static int const kKBlockIterations = (AccumulatorShape::kColumn + kKBlockColumn - 1) / kKBlockColumn;
+  static int const kResidualColumn = AccumulatorShape::kColumn - (kKBlockIterations - 1) * kKBlockColumn;
+  static int const kKBlockColumnIterations = kKBlockColumn / Shape::kColumn 
+                                     * (AccumulatorShape::kRow / Shape::kRow);
+  static int const kResidualIndex = kResidualColumn / Shape::kColumn
+                                     * (AccumulatorShape::kRow / Shape::kRow);
+
+public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  /// This is the fragment size produced by one access of the iterator.
+  using Fragment = Array<Element, Shape::kCount / kThreads>;
+
+  /// Accumulator Fragment object
+  using AccumulatorFragment = Array<ElementAccumulator, AccumulatorShape::kCount / kThreads>;
+
+  /// Scale Bias Element Type
+  using ElementScaleBias = typename OutputOp::ElementCompute;
+
+  /// Scale Bias Fragment object
+  using ScaleBiasFragment = Array<ElementScaleBias, InstructionShape::kM * InstructionShape::kK / kThreads>;
+
+
+private:
+
+  /// Internal access type
+  using AccessType = Array<ElementAccumulator, kElementsPerAccess>;
+  using FragmentAccessType = Array<Element, kElementsPerAccess>;
+
+  using ScaleBiasAccessType = Array<ElementScaleBias, kElementsPerAccess>;
+
+private:
+  //
+  // Data members
+  //
+
+  /// Accumulator tile
+  AccessType const *accumulators_;
+
+  /// Internal index
+  int index_;
+
+  /// Used to access residual tile first
+  bool is_residual_tile_;
+
+public:
+  /// Constructs an iterator over `accum`, starting at index 0 with the residual-tile flag set
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpFragmentIterator(AccumulatorFragment const &accum)
+      : accumulators_(reinterpret_cast<AccessType const *>(&accum)),
+        index_(0), is_residual_tile_(true) {}
+
+  /// Moves the index by index_offset; on reaching kKBlockColumnIterations in residual mode, rebases it and leaves residual mode
+  CUTLASS_HOST_DEVICE
+  void add_offset(int index_offset) {
+    index_ = index_ + index_offset;
+    if (is_residual_tile_ && !(index_ < kKBlockColumnIterations)) {
+      is_residual_tile_ = false;
+      index_ = index_ - kKBlockColumnIterations + kResidualIndex;
+    }
+  }
+
+  /// Pre-increment: advances one position via add_offset(1) and returns *this
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpFragmentIterator &operator++() {
+    add_offset(1);
+    return *this;
+  }
+
+  /// Pre-decrement: steps back one position via add_offset(-1); add_offset never restores residual mode once cleared
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpFragmentIterator &operator--() {
+    add_offset(-1);
+    return *this;
+  }
+
+  /// Loads a fragment from the referenced part of the accumulator tile, passing each access through output_op
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag, OutputOp output_op) const {
+    // This overload passes no source operand to output_op, hence the assert below.
+    if (output_op.is_source_needed()) //beta must be zero
+      assert(0);
+
+    FragmentAccessType *frag_ptr = reinterpret_cast<FragmentAccessType *>(&frag);
+    // Base offset, in AccessType units, of the tile selected by index_
+    int index = index_ * MmaIterations::kCount;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int n = 0; n < MmaIterations::kColumn; n++) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int m = 0; m < MmaIterations::kRow; m++) {  // trip count is a compile-time constant
+        int accumulator_access_offset =
+            n * AccumulatorIterations::kRow + m + index;
+        frag_ptr[m * MmaIterations::kColumn + n].clear();  // stays zero when the store below is skipped
+        if(!(is_residual_tile_ && index_ >= kResidualIndex))
+            frag_ptr[m * MmaIterations::kColumn + n] = output_op(accumulators_[accumulator_access_offset]);
+      }
+    }
+  }
+
+  /// Loads a fragment from the referenced part of the accumulator tile
+  /// Then apply per-channel scale and bias (scale and bias are indexed by the column iteration n)
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag, ScaleBiasFragment &scale,
+        ScaleBiasFragment &bias, OutputOp output_op) const {
+    // This overload passes no source operand to output_op, hence the assert below.
+    if (output_op.is_source_needed()) //beta must be zero
+      assert(0);
+
+    FragmentAccessType *frag_ptr = reinterpret_cast<FragmentAccessType *>(&frag);
+    ScaleBiasAccessType * scale_ptr = reinterpret_cast<ScaleBiasAccessType *>(&scale);
+    ScaleBiasAccessType * bias_ptr = reinterpret_cast<ScaleBiasAccessType *>(&bias);
+    // Base offset, in AccessType units, of the tile selected by index_
+    int index = index_ * MmaIterations::kCount;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int n = 0; n < MmaIterations::kColumn; n++) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int m = 0; m < MmaIterations::kRow; m++) {  // trip count is a compile-time constant
+        int accumulator_access_offset =
+            n * AccumulatorIterations::kRow + m + index;
+        frag_ptr[m * MmaIterations::kColumn + n].clear();  // stays zero when the store below is skipped
+        if(!(is_residual_tile_ && index_ >= kResidualIndex))
+            frag_ptr[m * MmaIterations::kColumn + n] = 
+                output_op(accumulators_[accumulator_access_offset], 
+                    scale_ptr[n] /*scale*/, bias_ptr[n] /*bias*/);
+      }
+    }
+  }
+
+
+
+};
+
+// Partial specialization of MmaTensorOpFragmentIterator for a row-major accumulator tile
+
+template <
+    /// Shape of warp tile to load (concept: MatrixShape)
+    typename Shape_,
+    /// Shape of the warp accumulation tile (concept: MatrixShape)
+    typename AccumulatorShape_,
+    /// Columns per K block; the residual tile size is derived from it
+    int KBlocksColumn_,    
+    /// Accumulator Element type
+    typename ElementAccumulator_,    
+    /// Element type of the loaded fragment
+    typename Element_,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Output operation applied to each accumulator access during load()
+    typename OutputOp_>
+class MmaTensorOpFragmentIterator<Shape_, AccumulatorShape_, KBlocksColumn_, ElementAccumulator_, Element_,
+                                         cutlass::layout::RowMajor,
+                                         InstructionShape_, OutputOp_> {
+ public:
+
+  /// Shape of warp tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+    
+  /// Shape of the warp accumulation tile (concept: MatrixShape)
+  using AccumulatorShape = AccumulatorShape_;
+
+  /// Columns per K block; used below to derive the residual tile size
+  static int const kKBlockColumn = KBlocksColumn_;
+
+  /// Accumulator Element type
+  using ElementAccumulator = ElementAccumulator_;
+
+  /// Element type of the loaded fragment
+  using Element = Element_;
+  
+  /// Layout of source tile (fixed to row-major by this specialization)
+  using Layout = cutlass::layout::RowMajor;
+
+  /// Shape of one matrix product operation (concept: MatrixShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Output operation on fragment
+  using OutputOp = OutputOp_;
+
+  /// Number of participating threads (hard-coded to 32)
+  static int const kThreads = 32;
+
+  /// Internal structure of iterator - made public to enable introspection
+  struct Policy {
+    static_assert(
+        !(Shape::kRow % InstructionShape::kM) &&
+            !(Shape::kColumn % InstructionShape::kN),
+        "Shape of warp-level Mma must be divisible by operator shape.");
+    static_assert(
+        AccumulatorShape::kRow == Shape::kRow, 
+        "Rows of Warp Accumulator must be the same as rows of warp");
+    static_assert(
+        !(AccumulatorShape::kColumn % Shape::kColumn),
+        "Shape of Warp Accumulator must be divisible by warp shape.");
+    static_assert(
+        !(kKBlockColumn % Shape::kColumn),
+        "KBlock size must be divisible by warp shape.");
+
+    /// Number of times this iterator can be incremented
+    static int const kIterations = AccumulatorShape::kCount / Shape::kCount;
+  };
+
+private:
+
+  static int const kRowsPerIteration = 8;
+  static int const kColumnsPerIteration = 16;
+  static int const kElementsPerIteration = kRowsPerIteration * InstructionShape::kN / kThreads;  // 8 x kN elements split across kThreads
+  static int const kElementsPerAccess = kRowsPerIteration * kColumnsPerIteration / kThreads;  // 8 * 16 / 32 = 4
+  static int const kIterationsPerAccess = kElementsPerAccess / kElementsPerIteration;
+  
+  // Number of iterations per actual instruction
+  static int const kIterationsPerInstruction = InstructionShape::kM / kRowsPerIteration;
+
+  static int const kAccessStride = kIterationsPerInstruction;  // step, in AccessType units, between the j-th accesses in load()
+
+  /// Number of mma operations performed by a warp
+  using MmaIterations = MatrixShape<Shape::kRow / InstructionShape::kM,
+                                    Shape::kColumn / InstructionShape::kN>;
+  /// Number of mma operations performed by the entire accumulator
+  using AccumulatorIterations = MatrixShape<AccumulatorShape::kRow / InstructionShape::kM,
+                                              AccumulatorShape::kColumn / InstructionShape::kN>;
+
+  /// Number of Accesses in a warp
+  using AccessIterations = MatrixShape<MmaIterations::kRow * kIterationsPerInstruction, 
+                                        MmaIterations::kColumn / kIterationsPerAccess>;
+
+  /// Number of K blocks spanned by the accumulator columns (ceiling division)
+  static int const kKBlockIterations = (AccumulatorShape::kColumn + kKBlockColumn - 1) / kKBlockColumn;
+  static int const kResidualColumn = AccumulatorShape::kColumn - (kKBlockIterations - 1) * kKBlockColumn;  // columns left after all but one K block
+  static int const kKBlockColumnIterations = kKBlockColumn / Shape::kColumn;  // warp tiles per K block
+  static int const kResidualIndex = kResidualColumn / Shape::kColumn;  // warp tiles in the residual columns
+
+public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  /// This is the fragment size produced by one access of the iterator.
+  using Fragment = Array<Element, Shape::kCount / kThreads>;
+
+  /// Accumulator Fragment object
+  using AccumulatorFragment = Array<ElementAccumulator, AccumulatorShape::kCount / kThreads>;
+
+  /// Scale Bias Element Type
+  using ElementScaleBias = typename OutputOp::ElementCompute;
+
+  /// Scale Bias Fragment object
+  using ScaleBiasFragment = Array<ElementScaleBias, InstructionShape::kM * InstructionShape::kK / kThreads>;
+
+
+private:
+
+  /// Internal access types (each holds kElementsPerIteration elements)
+  using AccessType = Array<ElementAccumulator, kElementsPerIteration>;
+  using FragmentAccessType = Array<Element, kElementsPerIteration>;
+  using ScaleBiasAccessType = Array<ElementScaleBias, kElementsPerIteration>;
+
+private:
+  //
+  // Data members
+  //
+
+  /// Accumulator tile, viewed as an array of AccessType (points at the fragment passed to the constructor)
+  AccessType const *accumulators_;
+
+  /// Internal index
+  int index_;
+
+  /// Used to access residual tile first
+  bool is_residual_tile_;
+
+public:
+  /// Constructs an iterator that views accum as AccessType chunks, starting at index 0 with the residual tile pending
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpFragmentIterator(AccumulatorFragment const &accum)
+      : accumulators_(reinterpret_cast<AccessType const *>(&accum)),
+        index_(0), is_residual_tile_(true) {}
+
+  /// Moves the index by index_offset; on reaching kKBlockColumnIterations in residual mode, rebases it and leaves residual mode
+  CUTLASS_HOST_DEVICE
+  void add_offset(int index_offset) {
+    index_ += index_offset; 
+    if(is_residual_tile_ && index_ >= kKBlockColumnIterations) {
+      index_ = index_ - kKBlockColumnIterations + kResidualIndex;
+      is_residual_tile_ = false;
+    }
+  }
+
+  /// Pre-increment: advances one position via add_offset(1)
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpFragmentIterator &operator++() {
+    add_offset(1);
+    return *this;
+  }
+
+  /// Pre-decrement: steps back one position via add_offset(-1); residual mode is not restored once cleared
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpFragmentIterator &operator--() {
+    add_offset(-1);
+    return *this;
+  }
+  /// Overwrites the internal index directly; is_residual_tile_ is left untouched
+  CUTLASS_HOST_DEVICE
+  void set_index(int idx) {
+    index_ = idx;
+  }
+
+  /// Loads a fragment from the referenced part of the accumulator tile, passing each access through output_op
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag, OutputOp output_op) const {
+
+    if (output_op.is_source_needed()) //beta must be zero
+      assert(0);
+
+    FragmentAccessType *frag_ptr = reinterpret_cast<FragmentAccessType *>(&frag);
+
+    int index = index_ * AccessIterations::kCount;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < AccessIterations::kCount; i++) {
+      // Map the running access index to an offset into accumulators_ (NOTE(review): layout inferred from the divisors - confirm)
+      int accumulator_access_offset = index / AccessIterations::kCount * (MmaIterations::kColumn * kIterationsPerInstruction) +
+                                    (index % AccessIterations::kCount) / (AccessIterations::kColumn * kIterationsPerInstruction) *
+                                    AccumulatorIterations::kColumn * kIterationsPerInstruction +
+                                    (index % (AccessIterations::kColumn * kIterationsPerInstruction)) / kIterationsPerInstruction *
+                                    (kIterationsPerInstruction * kIterationsPerAccess) +
+                                    (index % kIterationsPerInstruction);
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < kIterationsPerAccess; j++) {
+        // Zero-fill first; the entry stays zero when in residual mode with index_ >= kResidualIndex
+        frag_ptr[i*kIterationsPerAccess + j].clear();
+        if(!(is_residual_tile_ && index_ >= kResidualIndex))
+              frag_ptr[i*kIterationsPerAccess + j] = output_op(accumulators_[accumulator_access_offset + j * kAccessStride]);
+      }
+      index++;
+    }
+  }
+
+  /// Loads a fragment from the referenced part of the accumulator tile
+  /// Then apply per-channel scale and bias
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag, ScaleBiasFragment &scale, 
+        ScaleBiasFragment & bias, OutputOp output_op) const {
+
+    if (output_op.is_source_needed()) //beta must be zero
+      assert(0);
+
+    FragmentAccessType *frag_ptr = reinterpret_cast<FragmentAccessType *>(&frag);
+    ScaleBiasAccessType * scale_ptr = reinterpret_cast<ScaleBiasAccessType *>(&scale);
+    ScaleBiasAccessType * bias_ptr = reinterpret_cast<ScaleBiasAccessType *>(&bias);
+
+    int index = index_ * AccessIterations::kCount;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < AccessIterations::kCount; i++) {
+      // Same offset computation as the overload without scale and bias
+      int accumulator_access_offset = index / AccessIterations::kCount * (MmaIterations::kColumn * kIterationsPerInstruction) +
+                                    (index % AccessIterations::kCount) / (AccessIterations::kColumn * kIterationsPerInstruction) *
+                                    AccumulatorIterations::kColumn * kIterationsPerInstruction +
+                                    (index % (AccessIterations::kColumn * kIterationsPerInstruction)) / kIterationsPerInstruction *
+                                    (kIterationsPerInstruction * kIterationsPerAccess) +
+                                    (index % kIterationsPerInstruction);
+      // First scale/bias access used by this iteration; the j loop below reads kIterationsPerAccess consecutive entries
+      int scale_bias_offset = (index 
+                    % (kIterationsPerInstruction * AccessIterations::kColumn))
+                    * kIterationsPerAccess;
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < kIterationsPerAccess; j++) {
+
+        // Zero-fill first; the entry stays zero when in residual mode with index_ >= kResidualIndex
+        frag_ptr[i*kIterationsPerAccess + j].clear();
+        if(!(is_residual_tile_ && index_ >= kResidualIndex))
+              frag_ptr[i*kIterationsPerAccess + j] = output_op(
+                    accumulators_[accumulator_access_offset + j * kAccessStride], 
+                    scale_ptr[scale_bias_offset + j], bias_ptr[scale_bias_offset + j]);
+      }
+      index++;
+    }
+  }
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_policy.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_policy.h
new file mode 100755
index 000000000..0a768caef
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_policy.h
@@ -0,0 +1,65 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Policy describing implementation details of warp-level GEMM targeting Tensor Cores.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/gemm/gemm.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace warp {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Policy bundling the TensorOp instruction type (Operator) with the distance between operations (OpDelta)
+template <
+  typename Operator_,        ///< hardware instruction(s) performing TensorOp (concept: arch::Mma)
+  typename OpDelta_          ///< distance between operations (concept: MatrixShape)
+>
+struct MmaTensorOpPolicy {
+
+  using Operator = Operator_;    ///< hardware instruction(s) performing TensorOp (concept: arch::Mma)
+  using OpDelta = OpDelta_;      ///< distance between operations (concept: MatrixShape)
+  using MmaShape = typename Operator::Shape;  ///< shape reported by the instruction (Operator::Shape)
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace gemm
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_sm70.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_sm70.h
new file mode 100755
index 000000000..c40790fa8
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_sm70.h
@@ -0,0 +1,280 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing warp-level matrix multiply-accumulate operations targeting
+      Tensor Cores.
+
+    This is a work in progress.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/arch/mma.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/warp/mma.h"
+
+#include "cutlass/gemm/warp/mma_tensor_op_policy.h"
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace warp {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the warp-level matrix product targeting Tensor Cores on SM70 (see OperatorClass and ArchTag below).
+template <
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape_,
+  /// Data type of A elements
+  typename ElementA_,
+  /// Layout of A matrix (concept: MatrixLayout)
+  typename LayoutA_,
+  /// Data type of B elements
+  typename ElementB_,
+  /// Layout of B matrix (concept: MatrixLayout)
+  typename LayoutB_,
+  /// Element type of C matrix
+  typename ElementC_,
+  /// Layout of C matrix (concept: MatrixLayout)
+  typename LayoutC_,
+  /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
+  typename Policy_,
+  /// Used for partial specialization
+  typename Enable = bool
+>
+class MmaVoltaTensorOp {
+public:
+  /// Shape of warp-level matrix operation (concept: GemmShape)
+  using Shape = Shape_;
+
+  /// Data type of multiplicand A
+  using ElementA = ElementA_;
+
+  /// Layout of multiplicand A
+  using LayoutA = LayoutA_;
+
+  /// Data type of multiplicand B
+  using ElementB = ElementB_;
+
+  /// Layout of multiplicand B
+  using LayoutB = LayoutB_;
+
+  /// Data type of accumulator matrix C
+  using ElementC = ElementC_;
+
+  /// Layout of accumulator matrix C
+  using LayoutC = LayoutC_;
+
+  /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
+  using Policy = Policy_;
+
+  /// Indicates class of matrix operator
+  using OperatorClass = arch::OpClassTensorOp;
+
+  /// Architecture tag
+  using ArchTag = arch::Sm70;
+
+  /// Underlying matrix multiply operator (concept: arch::Mma)
+  using ArchMmaOperator = typename Policy::Operator;
+
+  /// Indicates math operator 
+  using MathOperator = typename ArchMmaOperator::Operator;
+  
+  /// Underlying instruction shape
+  using InstructionShape = typename ArchMmaOperator::Shape;
+
+  /// Complex transform on A operand
+  static ComplexTransform const kTransformA = ComplexTransform::kNone;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB = ComplexTransform::kNone;
+
+  /// Number of threads participating in warp-level matrix product
+  static int const kThreadCount = 32;
+
+  /// interleaved 32x32 tiles
+  using InterleavedTileShape = GemmShape<32, 32, 4>;
+
+  static_assert(!(Shape::kM % InterleavedTileShape::kM) &&
+                !(Shape::kN % InterleavedTileShape::kN),
+                "Shape must be a multiple of InterleavedTileShape.");
+public:
+
+  /// Iterates over the A operand in memory
+  using IteratorA = MmaVoltaTensorOpMultiplicandTileIterator<
+    MatrixShape<Shape::kM, Shape::kK>,
+    Operand::kA,
+    ElementA,
+    LayoutA,
+    MatrixShape<
+      ArchMmaOperator::Shape::kM,
+      ArchMmaOperator::Shape::kK
+    >,
+    Policy::OpDelta::kRow,
+    kThreadCount
+  >;
+
+  /// Storage for A tile
+  using FragmentA = typename IteratorA::Fragment;
+
+  /// Iterates over the B operand in memory
+  using IteratorB = MmaVoltaTensorOpMultiplicandTileIterator<
+    MatrixShape<Shape::kK, Shape::kN>,
+    Operand::kB,
+    ElementB,
+    LayoutB,
+    MatrixShape<
+      ArchMmaOperator::Shape::kK,
+      ArchMmaOperator::Shape::kN
+    >,
+    Policy::OpDelta::kRow,
+    kThreadCount
+  >;
+
+  /// Storage for B tile
+  using FragmentB = typename IteratorB::Fragment;
+
+  /// Iterates over the C operand in memory
+  using IteratorC = MmaVoltaTensorOpAccumulatorTileIterator<
+    MatrixShape<Shape::kM, Shape::kN>,
+    ElementC,
+    LayoutC,
+    typename ArchMmaOperator::Shape,
+    typename Policy::OpDelta
+  >;
+
+  /// Storage for C tile
+  using FragmentC = typename IteratorC::Fragment;
+
+private:
+
+  static_assert(
+    !(Shape::kM % ArchMmaOperator::Shape::kM) && 
+    !(Shape::kN % ArchMmaOperator::Shape::kN),
+    "Shape of warp-level Mma must be divisible by operator shape.");
+
+  /// Number of mma operations performed per interleaved tile
+  using MmaIterations = MatrixShape<
+    InterleavedTileShape::kM / ArchMmaOperator::Shape::kM,
+    InterleavedTileShape::kN / ArchMmaOperator::Shape::kN
+  >;
+  using TileIterations = MatrixShape<  // interleaved tiles covered by the warp-level Shape
+    Shape::kM / InterleavedTileShape::kM,
+    Shape::kN / InterleavedTileShape::kN
+  >;
+
+  // Whether matrix B is reordered. NOTE(review): never initialized or read anywhere in this class - confirm it is still needed
+  bool reorder_B_;
+
+public:
+
+  /// Underlying matrix multiply operator (concept: arch::Mma)
+  ArchMmaOperator mma;
+
+public:
+
+  //
+  // Methods
+  //
+  
+  /// Default constructor; performs no initialization
+  CUTLASS_DEVICE
+  MmaVoltaTensorOp() {}
+
+  /// Performs a warp-level matrix multiply-accumulate operation: D starts as a copy of C and every mma call accumulates into D
+  CUTLASS_DEVICE
+  void operator()(
+    FragmentC &D, 
+    FragmentA const &A, 
+    FragmentB const &B, 
+    FragmentC const &C)  {
+
+    using MmaOperandA = typename ArchMmaOperator::FragmentA;
+    using MmaOperandB = typename ArchMmaOperator::FragmentB;
+    using MmaOperandC = typename ArchMmaOperator::FragmentC;
+
+    D = C;
+    // View the warp-level fragments as arrays of per-instruction operands
+    MmaOperandA const *ptr_A = reinterpret_cast<MmaOperandA const *>(&A);
+    MmaOperandB const *ptr_B = reinterpret_cast<MmaOperandB const *>(&B);
+    MmaOperandC *ptr_D = reinterpret_cast<MmaOperandC *>(&D);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int outer_col = 0; outer_col < TileIterations::kColumn; ++outer_col) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int inner_col = 0; inner_col < MmaIterations::kColumn; ++inner_col) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int outer_row = 0; outer_row < TileIterations::kRow; ++outer_row) {
+          CUTLASS_PRAGMA_UNROLL
+
+          for (int inner_row = 0; inner_row < MmaIterations::kRow; ++inner_row) {
+            // Column of this mma counted across all interleaved tiles; also selects the B operand
+            int op_col = inner_col + MmaIterations::kColumn * outer_col;
+
+            // Column-major serpentine sequence to maximize reuse of A operand.
+            int inner_row_serp = inner_row;
+            int outer_row_serp = outer_row;
+            if (op_col & 1) {
+              inner_row_serp = MmaIterations::kRow - inner_row - 1;
+              outer_row_serp = TileIterations::kRow - outer_row - 1;
+            }
+            int op_row = inner_row_serp + MmaIterations::kRow * outer_row_serp;
+            int op_idx = inner_row_serp + MmaIterations::kRow * 
+                         (inner_col + MmaIterations::kColumn * 
+                          (outer_row_serp + TileIterations::kRow * outer_col));
+            mma(
+              ptr_D[op_idx],
+              ptr_A[op_row],
+              ptr_B[op_col],
+              ptr_D[op_idx]);
+
+          }
+        }
+      }
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace gemm
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_access_iterator.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_access_iterator.h
new file mode 100755
index 000000000..4588efb98
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_access_iterator.h
@@ -0,0 +1,362 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines iterators used by warp-level matrix multiply operations targeting Tensor Cores.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/array.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/arch/memory_sm75.h"
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor_op_multiplicand_sm80.h"
+
+#include "cutlass/platform/platform.h"
+#include "cutlass/fast_math.h"
+
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace warp {
+
+
+/// Tile access iterator
+/// Each iteration access in the tile is
+/// used as multiplicand for one
+/// warp-level matrix multiplication
+template <
+    /// Size of the tile (concept: MatrixShape)
+    typename Shape_,
+    /// Operand identity
+    Operand Operand_,
+    /// Data type of A elements
+    typename Element_,
+    /// Layout of operand
+    typename Layout_,
+    /// Shape of one matrix production operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Delta between *MMA operations (in units of *MMA operations, concept:
+    /// MatrixShape)
+    int OpDelta_,
+    /// Number of threads participating in one matrix operation
+    int Threads = 32,
+    /// Enable Residual Support
+    bool EnableResidual = false,
+    /// Number of partitions along K dimension
+    int PartitionsK_ = 1
+>
+class MmaTensorOpMultiplicandTileAccessIterator {
+ public:
+
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand_;
+
+  /// Basic check
+  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
+    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of source tile
+  using Layout = Layout_;
+
+  /// Shape of one matrix product operation (concept: MatrixShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+  static int const kOpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Number of elements accessed per Shared Memory load
+  static int const kElementsPerAccess = 
+    (sizeof_bits<Element>::value >= 32 ? 1 : 32 / sizeof_bits<Element>::value);
+
+  using InstructionCount = MatrixShape<
+    Shape::kRow / InstructionShape::kRow,
+    Shape::kColumn / InstructionShape::kColumn
+  >;
+
+  static int const kIterations = (kOperand == Operand::kA) ? 
+    InstructionCount::kColumn : InstructionCount::kRow;
+
+
+public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = Array<
+    Element, 
+    (kOperand == Operand::kA) ? 
+      (Shape::kRow * InstructionShape::kColumn / kThreads) : 
+      (Shape::kColumn * InstructionShape::kRow / kThreads)
+  >;
+
+  /// Memory access type
+  using AccessType = AlignedArray<Element, kElementsPerAccess>;
+
+private:
+
+  /// Underlying tensor reference
+  TensorRef ref_;
+
+  /// Extent of tensor
+  MatrixCoord extent_;
+
+  /// Origin
+  MatrixCoord origin_;
+
+  /// Used to load residual tile
+  bool is_residual_;
+  
+  /// residual offset of each thread
+  TensorCoord residual_offset_;
+
+  /// Iterations in a tile
+  int iterations_;
+
+public:
+  
+  /// Constructs the iterator from a TensorRef, the extent of the tensor, and the calling thread's lane_id
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileAccessIterator(
+    TensorRef const &ref, 
+    TensorCoord extent,
+    int lane_id
+  ): ref_(ref), extent_(extent), is_residual_(false), iterations_(0) {
+    // Per-thread origin: lane_id / 4 indexes one dimension, (lane_id % 4) * kElementsPerAccess the other
+    if (kOperand == Operand::kA) {
+      origin_ = MatrixCoord(lane_id / 4, (lane_id % 4) * kElementsPerAccess);
+    }
+    else {
+      origin_ = MatrixCoord((lane_id % 4) * kElementsPerAccess, lane_id / 4);  // same mapping, row/column swapped
+    }
+
+    ref_.add_coord_offset(origin_);  // fold the per-thread origin into the reference
+
+    if(EnableResidual) {
+      // residual = extent modulo tile size along the dimension advance() walks (columns for A, rows for B); if nonzero, the next advance() steps by it
+      if (kOperand == Operand::kA) {
+        typename TensorCoord::Index residual_size = 
+          extent_.column() % Shape::kColumn;
+        if(residual_size) {
+          is_residual_ = true;
+          residual_offset_ = make_Coord(0, residual_size);
+        }
+      }
+      else {
+        typename TensorCoord::Index residual_size = 
+          extent_.row() % Shape::kRow;
+        if(residual_size) {
+          is_residual_ = true;
+          residual_offset_ = make_Coord(residual_size, 0);
+        }
+      }
+    }
+  }
+
+  /// Constructor from TensorRef; delegates to the extent-taking constructor with an extent of exactly one tile (Shape::kRow x Shape::kColumn)
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileAccessIterator(
+    TensorRef const &ref, 
+    int lane_id
+  ): MmaTensorOpMultiplicandTileAccessIterator(ref,
+    {Shape::kRow, Shape::kColumn}, lane_id) {
+  }
+ 
+  /// Advances the iterator by tile_offset whole tiles (each Shape::kRow x Shape::kColumn) and returns *this; iterations_ is left unchanged
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileAccessIterator &add_tile_offset(TensorCoord const &tile_offset) {
+    // Convert the tile count to an element offset, then move both the tracked origin and the reference by it
+    TensorCoord coord_offset(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn);
+    origin_ += coord_offset;
+
+    ref_.add_coord_offset(coord_offset);
+
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension (columns for operand A, rows for operand B) and resets iterations_
+  CUTLASS_DEVICE
+  void advance() {
+    // While a residual is pending, step by the residual offset instead of a whole tile and clear the flag
+    if(EnableResidual && is_residual_) {
+      is_residual_ = false;
+
+      origin_ += residual_offset_;
+      ref_.add_coord_offset(residual_offset_);
+
+    }
+    // Otherwise step by exactly one tile
+    else {
+      if (kOperand == Operand::kA) {
+        add_tile_offset({0, 1});
+      }
+      else {
+        add_tile_offset({1, 0});
+      }
+    }
+
+    iterations_ = 0;  // restart the in-tile step counter
+  }
+
+  /// Moves to the next step within the current tile; once kIterations steps have been taken, calls advance() to move to the next tile
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileAccessIterator & operator++() {
+
+    iterations_++;
+
+    if(iterations_ >= kIterations)
+      advance();  // advance() also resets iterations_ to 0
+    
+    return *this;
+  }
+
+  /// Loads this thread's part of the current step of the tile (selected by iterations_) into frag, one AccessType at a time.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    // NOTE: no bounds checking is performed; the extent_ checks further down are commented out.
+    int const kWarpShapeDivisibleInner =
+      (kOperand == Operand::kA ? InstructionShape::kColumn : InstructionShape::kRow);
+
+    // Take advantage of Tensor Op's 8 x 4T access pattern
+    int const kAccessesInner = (kWarpShapeDivisibleInner / kElementsPerAccess) / 4;
+
+    AccessType *access_ptr = reinterpret_cast<AccessType *>(&frag);  // view the fragment as a sequence of vector accesses
+
+    if (kOperand == Operand::kA) {
+      int const kTilesPerInstruction = InstructionShape::kRow / 8;
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int inst_m_idx = 0; inst_m_idx < InstructionCount::kRow; ++inst_m_idx) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int inner_idx = 0; inner_idx < kAccessesInner; ++inner_idx) {
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int access_m_idx = 0; access_m_idx < kTilesPerInstruction; ++access_m_idx) {
+            int access_idx = 
+              access_m_idx + kTilesPerInstruction * (inner_idx + kAccessesInner * inst_m_idx);
+            // Offset relative to this thread's origin (the origin is already folded into ref_)
+            MatrixCoord offset(
+              access_m_idx * 8 + inst_m_idx * InstructionShape::kRow, 
+              inner_idx * 4 * kElementsPerAccess + iterations_ * InstructionShape::kColumn);
+
+            MatrixCoord access_coord = origin_ + offset;  // only consumed by the disabled bounds check below
+
+//            if(access_coord.row() < extent_.row() && access_coord.column() < extent_.column()) {
+
+              access_ptr[access_idx] = *reinterpret_cast<AccessType const *>(
+                ref_.data() + ref_.offset(offset));
+//            }
+//            else {
+//              AccessType zero;
+//              zero.clear();
+//              access_ptr[access_idx] = zero;
+//            }
+          }
+        }
+      }
+    }
+    else {
+      CUTLASS_PRAGMA_UNROLL
+      for (int inst_n_idx = 0; inst_n_idx < InstructionCount::kColumn; ++inst_n_idx) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int inner_idx = 0; inner_idx < kAccessesInner; ++inner_idx) {
+          int access_idx = inner_idx + kAccessesInner * inst_n_idx;
+          // Offset relative to this thread's origin (the origin is already folded into ref_)
+          MatrixCoord offset(
+            inner_idx * 4 * kElementsPerAccess + iterations_ * InstructionShape::kRow,
+            inst_n_idx * 8);
+
+          MatrixCoord access_coord = origin_ + offset;  // only consumed by the disabled bounds check below
+
+//          if(access_coord.row() < extent_.row() && access_coord.column() < extent_.column()) {
+              
+            access_ptr[access_idx] = *reinterpret_cast<AccessType const *>(
+              ref_.data() + ref_.offset(offset));
+//          }
+//          else {
+//              AccessType zero;
+//              zero.clear();
+//              access_ptr[access_idx] = zero;
+//          }
+        }
+      } 
+    }
+  }
+
+};
+
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h
new file mode 100755
index 000000000..e6e6d70f3
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h
@@ -0,0 +1,4803 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines iterators used by warp-level matrix multiply operations targeting Tensor Cores.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/array.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/arch/memory_sm75.h"
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor_op_multiplicand_sm75.h"
+
+#include "cutlass/platform/platform.h"
+#include "cutlass/fast_math.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace warp {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Operand identity
+    Operand Operand,
+    /// Data type of elements
+    typename Element_,
+    /// Layout of operand
+    typename Layout_,
+    /// Shape of one matrix product operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Delta between *MMA operations (in units of *MMA operations, concept:
+    /// MatrixShape)
+    int OpDelta_,
+    /// Number of threads participating in one matrix operation
+    int Threads,
+    /// Number of partitions along K dimension
+    int PartitionsK_ = 1>
+class MmaTensorOpMultiplicandTileIterator;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// This tile iterator is specialized for 32-thread TensorOps. It uses LDSM to load from shared
+/// memory and therefore must be initialized with a TensorRef to shared memory. 
+///
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: PitchLinearShape)
+    typename Shape_,
+    /// Identifies A or B multiplicand
+    Operand Operand_,
+    /// Data type of elements
+    typename Element_,
+    /// Shape of one matrix product operation (concept: PitchLinearShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions)
+    int OpDelta_,
+    /// Number of partitions along K dimension
+    int PartitionsK_>
+class MmaTensorOpMultiplicandTileIterator<
+    Shape_, Operand_, Element_,
+    cutlass::layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
+                                                   64>,
+    InstructionShape_, OpDelta_, 32, PartitionsK_> {
+ public:
+
+  /// Shape of tile to load (concept: PitchLinearShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand_;
+
+  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
+    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::TensorOpMultiplicandCongruous<
+      sizeof_bits<Element_>::value, 64>;
+
+  /// Shape of one matrix product operation (concept: GemmShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+  static int const kOpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// Number of partitions along K dimension
+  static int const kPartitionsK = PartitionsK_;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Stride index type
+  using StrideIndex = typename TensorRef::Layout::Stride::Index;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Internal structure of iterator - made public to enable introspection
+  struct Policy {
+    static_assert(
+        !(Shape::kContiguous % InstructionShape::kContiguous),
+        "Shape of warp-level Mma must be divisible by operator shape.");
+
+    // Determine number of elements along outer dimension per individual LDSM op
+    static int const kLdsmOpOuter = Layout::kElementsPerAccess;
+    static int const kLdsmOpInner = 8;
+
+    static_assert(!(Shape::kContiguous % kLdsmOpOuter),
+      "Shape of warp-level mma must be divisible by LDSM's fundamental tile size.");
+
+    static_assert(!(Shape::kStrided % kLdsmOpInner), 
+      "Shape of warp-level mma must be divisible by LDSM's fundamental tile size.");
+
+    /// Shape of one individual LDSM instruction
+    static int const LdsmShapeStrided =
+        InstructionShape::kStrided / kLdsmOpInner;
+    static int const LdsmShapeContiguous = 4 / LdsmShapeStrided;
+    using LdsmShape =
+        layout::PitchLinearShape<LdsmShapeContiguous, LdsmShapeStrided>;
+
+    /// Number and arrangement of LDSM instructions
+    using LdsmIterations = layout::PitchLinearShape<
+        Shape::kContiguous / Layout::kElementsPerAccess / LdsmShapeContiguous,
+        1>;
+
+    /// Number of groups for each tile
+    static int const kGroupsPerTile =
+        Shape::kStrided / InstructionShape::kStrided;
+  };
+
+private:
+
+  /// Not working on this feature at the moment.
+  static_assert(kOpDelta == 1,
+    "Alternative arrangements not supported at present.");
+
+  /// Number of internal pointers needed to reference shared memory
+  static int const kPointerCount =
+      Layout::TileShape::kContiguous / Policy::LdsmShape::kContiguous;
+
+  /// Pointer type used for accesses
+  using AccessType = Array<Element, Layout::kElementsPerAccess>;
+
+  /// Internal counter used to jump to next K partition
+  int k_group_idx_;
+
+public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+ using Fragment =
+     Array<Element, Shape::kContiguous * InstructionShape::kStrided / kThreads>;
+
+private:
+
+  /// Stride, in units of AccessType (the constructor divides ref.stride(0) by Layout::kElementsPerAccess)
+  StrideIndex stride_;
+
+  /// Shared memory base pointers - not advanced
+  AccessType const *pointer_[kPointerCount];
+
+  /// Byte offset incremented as iterator advances
+  Index byte_offset_;
+
+public:
+  
+  /// Default ctor: zeroes the k-group counter, stride and byte offset; pointer_ is left unset, so assign from a constructed iterator before use
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator(): k_group_idx_(0), stride_(0), byte_offset_(0) { }
+
+  /// Constructor from TensorRef and the calling thread's lane_id (initializers listed in member declaration order)
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator(
+    TensorRef const &ref, 
+    int lane_id
+  ):
+    k_group_idx_(0),
+    stride_(ref.stride(0) / Layout::kElementsPerAccess),
+    byte_offset_(0) {
+      
+    int quad_pair = (lane_id >> 3);
+    int quad_quad = (lane_id >> 4);
+    int lane_in_quad = (lane_id & 3);
+    int lane_in_quad_pair = (lane_id & 7);
+    int lane_in_quad_quad = (lane_id & 15);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPointerCount; ++i) {
+      int partition_contiguous_idx = -1;
+      int access_contiguous_idx = -1;
+      int access_strided_idx = -1;
+
+      if (Policy::LdsmShape::kContiguous == 4) {
+        // Matrix multiply 1688 A/B
+        // Q0 Q1 Q2 Q3 (Q stands for 1 8x128bit block).
+        // Four blocks are next to each other in the contiguous dimension.
+        partition_contiguous_idx = ((lane_in_quad_pair >> 2) ^ i);
+        access_contiguous_idx = (quad_pair ^ lane_in_quad);
+        access_strided_idx = lane_in_quad_pair;
+      } else if (Policy::LdsmShape::kContiguous == 2 &&
+                 kOperand == Operand::kA) {
+        // Matrix multiply 16816 A
+        // Q0 Q1
+        // Q2 Q3
+        partition_contiguous_idx = ((lane_in_quad_pair >> 2) ^ (i >> 1));
+        access_contiguous_idx =
+            (((quad_pair & 1) + ((i & 1) << 1)) ^ lane_in_quad);
+        access_strided_idx = lane_in_quad_pair + (lane_id >> 4 << 3);
+      } else if (Policy::LdsmShape::kContiguous == 2 &&
+                 kOperand == Operand::kB) {
+        // Matrix multiply 16816 B
+        // Q0 Q2
+        // Q1 Q3
+        partition_contiguous_idx = ((lane_in_quad_pair >> 2) ^ (i >> 1));
+        access_contiguous_idx = ((quad_quad + ((i & 1) << 1)) ^ lane_in_quad);
+        access_strided_idx = lane_in_quad_quad;
+      } else if (Policy::LdsmShape::kContiguous == 1) {
+        // Matrix multiply 16832.SP B
+        // Q0
+        // Q1
+        // Q2
+        // Q3
+        partition_contiguous_idx = ((lane_in_quad_pair >> 2) ^ (i >> 2));
+        access_contiguous_idx = ((i & 3) ^ lane_in_quad);
+        access_strided_idx = lane_id;
+      }
+
+      int access_contiguous =
+          partition_contiguous_idx * Layout::PartitionShape::kContiguous +
+          access_contiguous_idx;
+
+      int access_strided = access_strided_idx;
+
+      pointer_[i] = reinterpret_cast<AccessType const *>(ref.data()) +
+                    access_contiguous + access_strided * stride_;
+    }
+  }
+
+  /// Adds a pointer offset (in units of Element) to the accumulated byte offset; the base pointers are not modified
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
+
+    byte_offset_ += offset * sizeof(Element);
+
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
+    // When the tile spans exactly one partition, an odd contiguous step swaps the two halves of pointer_ and only the even part becomes an offset
+    int contiguous_offset = tile_offset.contiguous();
+    if (Shape::kContiguous ==
+        Layout::PartitionShape::kContiguous * Layout::kElementsPerAccess) {
+      if (tile_offset.contiguous() % 2) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < kPointerCount / 2; ++i) {
+          AccessType const *tmp_pointer = pointer_[i];
+          pointer_[i] = pointer_[i + kPointerCount / 2];
+          pointer_[i + kPointerCount / 2] = tmp_pointer;
+        }
+      }
+      contiguous_offset = (tile_offset.contiguous() >> 1) << 1;
+    }
+    // Offset in units of Element: stride_ counts AccessType vectors, hence the kElementsPerAccess factor
+    int offset = (tile_offset.strided() * InstructionShape::kStrided) *
+                     stride_ * Layout::kElementsPerAccess +
+                 contiguous_offset * Shape::kContiguous;
+
+    add_pointer_offset(offset);
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator++() {
+
+    add_tile_offset({0, 1});
+
+    if (kPartitionsK > 1) {
+      ++k_group_idx_;
+      // Jump to next stage
+      if (k_group_idx_ == Policy::kGroupsPerTile) {
+        k_group_idx_ = 0;
+        add_tile_offset(
+            {0, ((kPartitionsK - 1) * Policy::kGroupsPerTile)});
+      }
+    }
+
+    return *this;
+  }
+
+  /// Moves back by one step along the strided dimension (the inverse of add_tile_offset({0, 1})); k_group_idx_ is not updated
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator--() {
+    byte_offset_ -= stride_ * InstructionShape::kStrided * sizeof(Element) *
+                    Layout::kElementsPerAccess;
+
+    return *this;
+  }
+
+  /// advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  /// moves backwards in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
+    add_tile_offset(-tile_offset);
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+
+    load_with_byte_offset(frag, 0);
+  }
+
+  /// Loads a fragment from memory with an additional offset in units of bytes
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset in units of bytes
+      Index byte_offset) const {
+    // View the fragment as one Array<unsigned, LdsmShape::kCount> per LDSM instruction
+    Array<unsigned, Policy::LdsmShape::kCount> *fetch_ptr = 
+      reinterpret_cast<Array<unsigned, Policy::LdsmShape::kCount> *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < Policy::LdsmIterations::kStrided; ++s) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < Policy::LdsmIterations::kContiguous; ++c) {
+
+        int access_idx = c + s * Policy::LdsmIterations::kContiguous;
+        // Pick the base pointer round-robin; every kPointerCount accesses step one layout tile along the contiguous dimension
+        AccessType const *source_ptr =
+            pointer_[c % kPointerCount] +
+            Layout::TileShape::kContiguous * (c / kPointerCount) +
+            Policy::kLdsmOpInner * Policy::LdsmShape::kStrided * s * stride_;
+        // Apply the caller's byte offset plus the offset accumulated by the iterator
+        char const *source_byte_ptr = reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;
+
+        cutlass::arch::ldsm<layout::ColumnMajor, Policy::LdsmShape::kCount>(
+          fetch_ptr[access_idx],
+          source_byte_ptr
+        );
+      }
+    }
+  }
+
+  /// Loads a fragment from memory with an additional offset in units of Element
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset in units of Element
+      Index pointer_offset) const {
+    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset) const {
+    load_with_byte_offset(frag, tile_offset, 0);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// additional linear offset in units of Element
+      Index pointer_offset) const {
+    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// additional linear offset in units of bytes
+      Index byte_offset) const {
+    Index pointer_offset = 
+      tile_offset.contiguous() * Shape::kContiguous / Layout::kElementsPerAccess + 
+      tile_offset.strided() * InstructionShape::kStrided * stride_;
+    // pointer_offset counts AccessType vectors; convert it to bytes before adding
+    byte_offset += sizeof(AccessType) * pointer_offset;
+
+    load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    // no op
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// This tile iterator is specialized for 32-thread MMA.TF32 NT TensorOps. It
+/// uses LDS.32 to load from shared memory and therefore must be initialized
+/// with a TensorRef to shared memory.
+///
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: PitchLinearShape)
+    typename Shape_,
+    /// Identifies A or B multiplicand
+    Operand Operand_,
+    /// Data type of elements
+    typename Element_,
+    /// Shape of one matrix product operation (concept: PitchLinearShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions)
+    int OpDelta_,
+    /// Number of partitions along K dimension
+    int PartitionsK_>
+class MmaTensorOpMultiplicandTileIterator<
+    Shape_, Operand_, Element_,
+    cutlass::layout::TensorOpMultiplicandCongruous<32, 32>, InstructionShape_,
+    OpDelta_, 32, PartitionsK_> {
+ public:
+  /// Shape of tile to load (concept: PitchLinearShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand_;
+
+  static_assert(kOperand == Operand::kA || kOperand == Operand::kB,
+                "MmaTensorOpMultiplicandIterator may only be instantiated for "
+                "A or B operands to warp-level Mma.");
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::TensorOpMultiplicandCongruous<32, 32>;
+
+  /// Shape of one matrix product operation (concept: GemmShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept:
+  /// MatrixShape)
+  static int const kOpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// Number of partitions along K dimension
+  static int const kPartitionsK = PartitionsK_;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Stride index type
+  using StrideIndex = typename TensorRef::Layout::Stride::Index;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Internal structure of iterator - made public to enable introspection
+  struct Policy {
+    static_assert(
+        !(Shape::kContiguous % InstructionShape::kContiguous),
+        "Shape of warp-level Mma must be divisible by operator shape.");
+
+    // Footprint of one warp-wide 32-bit shared memory load: kLdsOpInner
+    // elements along the strided dimension by kLdsOpOuter (= kThreads /
+    // kLdsOpInner) along the contiguous dimension, i.e. 8x4 elements per warp
+    static int const kLdsOpInner = Layout::TileShape::kStrided;
+    static int const kLdsOpOuter = kThreads / kLdsOpInner;
+
+    static_assert(!(Shape::kContiguous % kLdsOpOuter),
+                  "Shape of warp-level mma must be divisible by 32bit "
+                  "fundamental tile size.");
+
+    static_assert(!(Shape::kStrided % kLdsOpInner),
+                  "Shape of warp-level mma must be divisible by 32bit "
+                  "fundamental tile size.");
+
+    /// Number of 32 bit shared memory load instructions needed by one MMA instruction
+    /// 1688  A 2x2
+    /// 1688  B 1x2
+    /// 16816 B 1x4
+    static int const LdsShapeContiguous =
+        InstructionShape::kContiguous / kLdsOpOuter;
+    static int const LdsShapeStrided = InstructionShape::kStrided / kLdsOpInner;
+    using LdsShape =
+        layout::PitchLinearShape<LdsShapeContiguous, LdsShapeStrided>;
+
+    /// Number and arrangement of LDS instructions
+    using LdsIterations = layout::PitchLinearShape<
+        Shape::kContiguous / LdsShapeContiguous / kLdsOpOuter, 1>;
+
+    /// Number of groups for each tile
+    static int const kGroupsPerTile =
+        Shape::kStrided / InstructionShape::kStrided;
+  };
+
+ private:
+  /// Not working on this feature at the moment.
+  static_assert(kOpDelta == 1,
+                "Alternative arrangements not supported at present.");
+
+  /// Number of internal pointers needed to reference shared memory
+  static int const kPointerCount = Layout::TileShape::kContiguous *
+                                   Layout::kElementsPerAccess /
+                                   Policy::kLdsOpOuter;
+
+  /// Vectorized access is not used
+  static int const kElementsPerAccess = 1;
+
+  /// Pointer type used for accesses
+  using AccessType = Element;
+
+  /// Internal counter used to jump to next K partition
+  int k_group_idx_;
+
+ public:
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment =
+     Array<Element, Shape::kContiguous * InstructionShape::kStrided / kThreads>;
+
+ private:
+  /// Layout object storing stride values
+  StrideIndex stride_;
+
+  /// Shared memory base pointers - not advanced
+  AccessType const *pointer_[kPointerCount];
+
+  /// Byte offset incremented as iterator advances
+  Index byte_offset_;
+
+ public:
+  /// Default ctor: zeroes the k-group counter, stride and byte offset; pointer_ is left unset, so assign from a constructed iterator before use
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator() : k_group_idx_(0), stride_(0), byte_offset_(0) {}
+
+  /// Constructor from TensorRef and the calling thread's lane_id; precomputes kPointerCount per-thread base pointers
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
+      : stride_(ref.stride(0)), byte_offset_(0), k_group_idx_(0) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPointerCount; ++i) {
+      int access_strided = lane_id % Policy::kLdsOpInner;
+      int access_contiguous = (lane_id / Policy::kLdsOpInner) +
+                              (access_strided ^ i) * Policy::kLdsOpOuter;  // XOR with the pointer index permutes the contiguous position
+
+      pointer_[i] = reinterpret_cast<AccessType const *>(ref.data()) +
+                    access_contiguous + access_strided * stride_;
+    }
+  }
+
+  /// Adds a pointer offset (in units of Element) to the accumulated byte offset; the base pointers are not modified
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
+    byte_offset_ += offset * sizeof(Element);
+
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_tile_offset(
+      TensorCoord const &tile_offset) {
+    int contiguous_offset = tile_offset.contiguous();
+    if (Shape::kContiguous ==
+        Layout::TileShape::kContiguous * Layout::kElementsPerAccess / 2) {
+      if (tile_offset.contiguous() % 2) {
+        // Matrix multiply 1688 pointer_[0] <=> pointer_[4] pointer_[1] <=> pointer_[5]
+        //           pointer_[2] <=> pointer_[6] pointer_[3] <=> pointer_[7]
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < kPointerCount / 2; ++i) {
+          AccessType const *tmp_pointer = pointer_[i];
+          pointer_[i] = pointer_[i + kPointerCount / 2];
+          pointer_[i + kPointerCount / 2] = tmp_pointer;
+        }
+      }
+      contiguous_offset = (tile_offset.contiguous() >> 1) << 1;  // odd step handled by the swap; keep only the even part
+    }
+    // Offset in units of Element (AccessType is Element for this specialization)
+    int offset = (tile_offset.strided() * InstructionShape::kStrided) * stride_ +
+                 contiguous_offset * Shape::kContiguous;
+
+    add_pointer_offset(offset);
+
+    return *this;
+  }
+
+  /// Advances the iterator by one step along the advance (strided) dimension
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator &operator++() {
+    add_tile_offset({0, 1});
+
+    if (kPartitionsK > 1) {
+      ++k_group_idx_;
+      // Jump to next stage
+      if (k_group_idx_ == Policy::kGroupsPerTile) {
+        k_group_idx_ = 0;
+        add_tile_offset(
+            {0, ((kPartitionsK - 1) * Policy::kGroupsPerTile)});  // skip a further (kPartitionsK - 1) * kGroupsPerTile steps
+      }
+    }
+
+    return *this;
+  }
+
+  /// Moves back by one step along the strided dimension (the inverse of add_tile_offset({0, 1})); k_group_idx_ is not updated
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &operator--() {
+    byte_offset_ -= stride_ * InstructionShape::kStrided * sizeof(Element) *
+                    kElementsPerAccess;
+
+    return *this;
+  }
+
+  /// advances in units of whole tiles along the logical coordinate space of
+  /// the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator &operator+=(
+      TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  /// moves backwards in units of whole tiles along the logical coordinate
+  /// space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator &operator-=(
+      TensorCoord const &tile_offset) {
+    add_tile_offset(-tile_offset);
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator (forwards to load_with_byte_offset with a zero offset).
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const { load_with_byte_offset(frag, 0); }
+
+  /// Loads a fragment from memory with an additional offset in units of bytes, one Element per access
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset in units of bytes
+      Index byte_offset) const {
+    Element *fetch_ptr = reinterpret_cast<Element *>(&frag);  // view the fragment as a flat array of elements
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < Policy::LdsIterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < Policy::LdsIterations::kContiguous; ++c) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int ss = 0; ss < Policy::LdsShape::kStrided; ++ss) {
+          CUTLASS_PRAGMA_UNROLL
+          for (int cc = 0; cc < Policy::LdsShape::kContiguous; ++cc) {
+            int access_idx =
+                cc + (ss + (c + s * Policy::LdsIterations::kContiguous) *
+                               Policy::LdsShape::kStrided) *
+                         Policy::LdsShape::kContiguous;  // destination slot in the fragment
+            int access_idx_contiguous = cc + c * Policy::LdsShape::kContiguous;
+            int access_idx_strided =
+                (ss + s * Policy::LdsShape::kStrided) * Policy::kLdsOpInner;
+            // Pick the base pointer round-robin; every kPointerCount accesses step one layout tile along the contiguous dimension
+            AccessType const *source_ptr =
+                pointer_[access_idx_contiguous % kPointerCount] +
+                Layout::TileShape::kContiguous * Layout::kElementsPerAccess *
+                    (access_idx_contiguous / kPointerCount) +
+                access_idx_strided * stride_;
+            // Apply the caller's byte offset plus the offset accumulated by the iterator
+            char const *source_byte_ptr =
+                reinterpret_cast<char const *>(source_ptr) + byte_offset +
+                byte_offset_;
+
+            fetch_ptr[access_idx] =
+                *reinterpret_cast<Element const *>(source_byte_ptr);
+          }
+        }
+      }
+    }
+  }
+
+  /// Loads a fragment with an extra offset given in elements (scaled by sizeof(Element) to bytes)
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// destination fragment
+      Fragment &frag,
+      /// linear offset in units of Element
+      Index pointer_offset) const {
+    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
+  }
+
+  /// Loads a fragment at a whole-tile offset from the current position (no extra byte offset).
+  CUTLASS_DEVICE
+  void load(
+      /// destination fragment
+      Fragment &frag,
+      /// logical offset in units of whole tiles
+      TensorCoord const &tile_offset) const {
+    load_with_byte_offset(frag, tile_offset, 0);
+  }
+
+  /// Loads a fragment at a whole-tile offset plus an element offset from the current position.
+  CUTLASS_DEVICE
+  void load(
+      /// destination fragment
+      Fragment &frag,
+      /// logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// additional linear offset in units of Element (scaled to bytes below)
+      Index pointer_offset) const {
+    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
+  }
+
+  /// Loads a fragment at a whole-tile offset plus a byte offset; the iterator itself is not moved.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// destination fragment
+      Fragment &frag,
+      /// logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// additional linear offset in units of bytes
+      Index byte_offset) const {
+    Index pointer_offset =
+        tile_offset.contiguous() * Shape::kContiguous /
+            Layout::kElementsPerAccess +
+        tile_offset.strided() * InstructionShape::kStrided * stride_;
+    // pointer_offset counts AccessType-sized units; convert it to bytes.
+    byte_offset += sizeof(AccessType) * pointer_offset;
+
+    load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This hook does not advance the iterator. It exists so an iterator can
+  /// override its internal tracking with a constant-valued k-group index,
+  /// letting the compiler fold constants. This specialization ignores the value.
+  ///
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    // no op: k_group is ignored
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// This tile iterator is specialized for 32-thread TensorOps with a 64B warp
+/// tile in the contiguous dimension. It assumes the threadblock contiguous
+/// dimension has the same size as the warp tile. It uses LDSM to load from
+/// shared memory and therefore must be initialized with a TensorRef to shared memory.
+///
+/// This specialization can be merged into the general one.  Most code is the same.
+///
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: PitchLinearShape)
+    typename Shape_,
+    /// Identifies A or B multiplicand
+    Operand Operand_,
+    /// Data type of elements
+    typename Element_,
+    /// Shape of one matrix product operation (concept: PitchLinearShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions)
+    int OpDelta_,
+    /// Number of partitions along K dimension
+    int PartitionsK_>
+class MmaTensorOpMultiplicandTileIterator<
+    Shape_, Operand_, Element_,
+    cutlass::layout::TensorOpMultiplicandCongruous<16, 32>,
+    InstructionShape_, OpDelta_, 32, PartitionsK_> {
+ public:
+
+  /// Shape of tile to load (concept: PitchLinearShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand_;
+
+  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
+    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
+
+  /// Element type
+  using Element = Element_;
+
+  /// Element number when the layout crosses
+  static int const kCrosswise = 32;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::TensorOpMultiplicandCongruous<
+      sizeof_bits<Element_>::value, kCrosswise>;
+
+  /// Shape of one matrix product operation (concept: PitchLinearShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+  static int const kOpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// Number of partitions along K dimension
+  static int const kPartitionsK = PartitionsK_;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Stride index type
+  using StrideIndex = typename TensorRef::Layout::Stride::Index;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Compile-time LDSM tiling parameters of the iterator - made public to enable introspection
+  struct Policy {
+    static_assert(
+        !(Shape::kContiguous % InstructionShape::kContiguous),
+        "Shape of warp-level Mma must be divisible by operator shape.");
+
+    // Elements per individual LDSM op: one vector access along the outer dimension, 8 along the inner
+    static int const kLdsmOpOuter = Layout::kElementsPerAccess;
+    static int const kLdsmOpInner = 8;
+
+    static_assert(!(Shape::kContiguous % kLdsmOpOuter),
+      "Shape of warp-level mma must be divisible by LDSM's fundamental tile size.");
+
+    static_assert(!(Shape::kStrided % kLdsmOpInner),
+      "Shape of warp-level mma must be divisible by LDSM's fundamental tile size.");
+
+    /// Shape of one individual LDSM instruction, in LDSM ops (contiguous extent = 4 / strided extent)
+    static int const LdsmShapeStrided =
+        InstructionShape::kStrided / kLdsmOpInner;
+    static int const LdsmShapeContiguous = 4 / LdsmShapeStrided;
+    using LdsmShape =
+        layout::PitchLinearShape<LdsmShapeContiguous, LdsmShapeStrided>;
+
+    /// Number and arrangement of LDSM instructions (always a single iteration along strided)
+    using LdsmIterations = layout::PitchLinearShape<
+        Shape::kContiguous / Layout::kElementsPerAccess / LdsmShapeContiguous,
+        1>;
+
+    /// Number of groups for each tile: InstructionShape::kStrided-sized steps per Shape::kStrided
+    static int const kGroupsPerTile =
+        Shape::kStrided / InstructionShape::kStrided;
+  };
+
+private:
+
+  /// Not working on this feature at the moment.
+  static_assert(kOpDelta == 1,
+    "Alternative arrangements not supported at present.");
+
+  /// Number of internal pointers needed to reference shared memory
+  static int const kPointerCount =
+      Layout::TileShape::kContiguous / Policy::LdsmShape::kContiguous / Layout::kFactor;
+
+  /// Pointer type used for accesses
+  using AccessType = Array<Element, Layout::kElementsPerAccess>;
+
+  /// Internal counter used to jump to next K partition
+  int k_group_idx_;
+
+public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+ using Fragment =
+     Array<Element, Shape::kContiguous * InstructionShape::kStrided / kThreads>;
+
+private:
+
+  /// Layout object storing stride values
+  StrideIndex stride_;
+
+  /// Shared memory base pointers - not advanced
+  AccessType const *pointer_[kPointerCount];
+
+  /// Byte offset incremented as iterator advances
+  Index byte_offset_;
+
+public:
+  
+  /// Default ctor: zeroes the k-group counter, stride and byte offset (pointer_[] entries stay unset)
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator(): k_group_idx_(0), stride_(0), byte_offset_(0) { }
+
+  /// Constructor from TensorRef: derives this lane's kPointerCount base pointers from lane_id
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator(
+    TensorRef const &ref, 
+    int lane_id
+  ):
+    k_group_idx_(0),
+    stride_(ref.stride(0) * Layout::kFactor / Layout::kElementsPerAccess),
+    byte_offset_(0) {
+    // Mem-initializers are listed in member declaration order (k_group_idx_ is declared first).
+    int quad_pair = (lane_id >> 3);
+    int quad_quad = (lane_id >> 4);
+    // Position of the lane inside its group of 8 lanes and inside its group of 16 lanes.
+    int lane_in_quad_pair = (lane_id & 7);
+    int lane_in_quad_quad = (lane_id & 15);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPointerCount; ++i) {
+      int partition_contiguous_idx = -1;
+      int access_contiguous_idx = -1;
+      int access_strided_idx = -1;
+
+      if (Policy::LdsmShape::kContiguous == 4) {
+        // Matrix multiply 1688 A/B
+        // Q0 Q1 Q2 Q3 (Q stands for 1 8x128bit block).
+        // Four blocks are next to each other in the contiguous dimension.
+        partition_contiguous_idx = (lane_id % Layout::kFactor);
+        access_contiguous_idx = quad_pair ^ (lane_in_quad_pair / Layout::kFactor);
+        access_strided_idx = lane_in_quad_pair / Layout::kFactor;
+      } else if (Policy::LdsmShape::kContiguous == 2 &&
+          kOperand == Operand::kA) {
+        // Matrix multiply 16816 A
+        // Q0 Q1
+        // Q2 Q3
+        partition_contiguous_idx = (lane_id % Layout::kFactor);
+        access_contiguous_idx =
+            (((quad_pair & 1) + i * 2) ^ (lane_in_quad_pair / Layout::kFactor));
+        access_strided_idx = (lane_in_quad_pair + (lane_id >> 4 << 3)) / 2;
+      } else if (Policy::LdsmShape::kContiguous == 2 &&
+                 kOperand == Operand::kB) {
+        // Matrix multiply 16816 B
+        // Q0 Q2
+        // Q1 Q3
+        partition_contiguous_idx = (lane_id % Layout::kFactor);
+        access_contiguous_idx = (quad_quad + i * 2) ^ (lane_in_quad_pair / Layout::kFactor);
+        access_strided_idx = (lane_in_quad_quad / Layout::kFactor);
+      } else if (Policy::LdsmShape::kContiguous == 1) {
+        // Matrix multiply 16832.SP B
+        // Q0
+        // Q1
+        // Q2
+        // Q3
+        partition_contiguous_idx = (lane_id % Layout::kFactor);
+        access_contiguous_idx = (lane_in_quad_pair / Layout::kFactor) ^ i;
+        access_strided_idx = lane_id / Layout::kFactor;
+      }
+
+      int access_contiguous =
+          partition_contiguous_idx * Layout::PartitionShape::kContiguous +
+          access_contiguous_idx;
+
+      int access_strided = access_strided_idx;
+
+      pointer_[i] = reinterpret_cast<AccessType const *>(ref.data()) +
+                    access_contiguous + access_strided * stride_;
+    }
+  }
+
+  /// Advances by `offset` elements: adds offset * sizeof(Element) to byte_offset_ (pointer_[] unchanged)
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
+
+    byte_offset_ += offset * sizeof(Element);
+
+    return *this;
+  }
+
+  /// Advances the iterator along the logical dimensions of the matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
+    // Special case below: Shape::kContiguous equals one layout partition's contiguous extent.
+    int contiguous_offset = tile_offset.contiguous();
+    if (Shape::kContiguous ==
+        Layout::PartitionShape::kContiguous * Layout::kElementsPerAccess) {
+      if (tile_offset.contiguous() % 2) {  // odd step: swap the two halves of pointer_[]
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < kPointerCount / 2; ++i) {
+          AccessType const *tmp_pointer = pointer_[i];
+          pointer_[i] = pointer_[i + kPointerCount / 2];
+          pointer_[i + kPointerCount / 2] = tmp_pointer;
+        }
+      }
+      contiguous_offset = (tile_offset.contiguous() >> 1) << 1;  // clear the low bit
+    }
+    // Offset in elements; add_pointer_offset scales it by sizeof(Element).
+    int offset = (tile_offset.strided() * InstructionShape::kStrided) *
+                     stride_ * Layout::kElementsPerAccess / Layout::kFactor +
+                 contiguous_offset * Shape::kContiguous;
+
+    add_pointer_offset(offset);
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension by one tile step
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator++() {
+
+    add_tile_offset({0, 1});
+
+    if (kPartitionsK > 1) {
+      // Count the step just taken; once Policy::kGroupsPerTile steps have
+      // accumulated, wrap the counter and jump to the next stage.
+      if (++k_group_idx_ == Policy::kGroupsPerTile) {
+        k_group_idx_ = 0;
+        add_tile_offset(
+            {0, ((kPartitionsK - 1) * Policy::kGroupsPerTile)});
+      }
+    }
+
+    return *this;
+  }
+
+  /// Steps back by exactly the byte amount that add_tile_offset({0, 1}) adds to byte_offset_
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator--() {
+    byte_offset_ -= (stride_ * InstructionShape::kStrided *
+                     Layout::kElementsPerAccess / Layout::kFactor) * sizeof(Element);
+    // stride_ carries a Layout::kFactor multiplier (set in the ctor), hence the divide.
+    return *this;
+  }
+
+  /// Advances by `tile_offset` whole tiles in the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
+    // add_tile_offset() returns *this, so its result is forwarded directly.
+    return add_tile_offset(tile_offset);
+  }
+
+  /// Moves by the negated `tile_offset`, in whole tiles of the tensor's logical coordinate space
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
+    // add_tile_offset() returns *this, so its result is forwarded directly.
+    return add_tile_offset(-tile_offset);
+  }
+
+  /// Loads a fragment from the iterator's current location.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    // Zero additional byte offset: only the accumulated byte_offset_ applies.
+    load_with_byte_offset(frag, 0);
+  }
+
+  /// Loads a fragment via LDSM, adding `byte_offset` bytes to every source address
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// destination fragment
+      Fragment &frag,
+      /// extra linear offset in units of bytes, applied on top of byte_offset_
+      Index byte_offset) const {
+    // View the fragment as one Array<unsigned, LdsmShape::kCount> per LDSM instruction.
+    Array<unsigned, Policy::LdsmShape::kCount> *fetch_ptr = 
+      reinterpret_cast<Array<unsigned, Policy::LdsmShape::kCount> *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < Policy::LdsmIterations::kStrided; ++s) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < Policy::LdsmIterations::kContiguous; ++c) {
+
+        int access_idx = c + s * Policy::LdsmIterations::kContiguous;
+        // Base pointer cycles modulo kPointerCount; the quotient steps by whole layout tiles.
+        AccessType const *source_ptr =
+            pointer_[c % kPointerCount] +
+            Layout::TileShape::kContiguous * (c / kPointerCount) +
+            Policy::kLdsmOpInner * Policy::LdsmShape::kStrided * s * stride_ / Layout::kFactor;
+        // Add the caller's byte offset and the iterator's accumulated byte_offset_.
+        char const *source_byte_ptr = reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;
+
+        cutlass::arch::ldsm<layout::ColumnMajor, Policy::LdsmShape::kCount>(
+          fetch_ptr[access_idx],
+          source_byte_ptr
+        );
+      }
+    }
+  }
+
+  /// Loads a fragment with an extra offset given in elements (scaled by sizeof(Element) to bytes)
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// destination fragment
+      Fragment &frag,
+      /// linear offset in units of Element
+      Index pointer_offset) const {
+    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
+  }
+
+  /// Loads a fragment at a whole-tile offset from the current position (no extra byte offset).
+  CUTLASS_DEVICE
+  void load(
+      /// destination fragment
+      Fragment &frag,
+      /// logical offset in units of whole tiles
+      TensorCoord const &tile_offset) const {
+    load_with_byte_offset(frag, tile_offset, 0);
+  }
+
+  /// Loads a fragment at a whole-tile offset plus an element offset from the current position.
+  CUTLASS_DEVICE
+  void load(
+      /// destination fragment
+      Fragment &frag,
+      /// logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// additional linear offset in units of Element (scaled to bytes below)
+      Index pointer_offset) const {
+    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
+  }
+
+  /// Loads a fragment at a whole-tile offset plus a byte offset; the iterator itself is not moved.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// destination fragment
+      Fragment &frag,
+      /// logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// additional linear offset in units of bytes
+      Index byte_offset) const {
+    Index pointer_offset = 
+      tile_offset.contiguous() * Shape::kContiguous / Layout::kElementsPerAccess + 
+      tile_offset.strided() * InstructionShape::kStrided * stride_ / Layout::kFactor;
+    // pointer_offset counts AccessType-sized units; convert it to bytes.
+    byte_offset += sizeof(AccessType) * pointer_offset;
+
+    load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This hook does not advance the iterator. It exists so an iterator can
+  /// override its internal tracking with a constant-valued k-group index,
+  /// letting the compiler fold constants. This specialization ignores the value.
+  ///
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    // no op: k_group is ignored
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// This tile iterator is specialized for 32-thread TensorOps with a 32B warp
+/// tile in the contiguous dimension. It assumes the threadblock contiguous
+/// dimension has the same size as the warp tile. It uses LDSM to load from
+/// shared memory and therefore must be initialized with a TensorRef to shared memory.
+///
+/// This specialization can be merged into the general one.  Most code is the same.
+///
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: PitchLinearShape)
+    typename Shape_,
+    /// Identifies A or B multiplicand
+    Operand Operand_,
+    /// Data type of elements
+    typename Element_,
+    /// Shape of one matrix product operation (concept: PitchLinearShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions)
+    int OpDelta_,
+    /// Number of partitions along K dimension
+    int PartitionsK_>
+class MmaTensorOpMultiplicandTileIterator<
+    Shape_, Operand_, Element_,
+    cutlass::layout::TensorOpMultiplicandCongruous<16, 16>,
+    InstructionShape_, OpDelta_, 32, PartitionsK_> {
+ public:
+
+  /// Shape of tile to load (concept: PitchLinearShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand_;
+
+  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
+    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
+
+  /// Element type
+  using Element = Element_;
+
+  /// Element number when the layout crosses
+  static int const kCrosswise = 16;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::TensorOpMultiplicandCongruous<
+      sizeof_bits<Element_>::value, kCrosswise>;
+
+  /// Shape of one matrix product operation (concept: PitchLinearShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+  static int const kOpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// Number of partitions along K dimension
+  static int const kPartitionsK = PartitionsK_;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Stride index type
+  using StrideIndex = typename TensorRef::Layout::Stride::Index;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Compile-time LDSM tiling parameters of the iterator - made public to enable introspection
+  struct Policy {
+    static_assert(
+        !(Shape::kContiguous % InstructionShape::kContiguous),
+        "Shape of warp-level Mma must be divisible by operator shape.");
+
+    // Elements per individual LDSM op: one vector access along the outer dimension, 8 along the inner
+    static int const kLdsmOpOuter = Layout::kElementsPerAccess;
+    static int const kLdsmOpInner = 8;
+
+    static_assert(!(Shape::kContiguous % kLdsmOpOuter),
+      "Shape of warp-level mma must be divisible by LDSM's fundamental tile size.");
+
+    static_assert(!(Shape::kStrided % kLdsmOpInner),
+      "Shape of warp-level mma must be divisible by LDSM's fundamental tile size.");
+
+    /// Shape of one individual LDSM instruction, in LDSM ops (contiguous extent = 4 / strided extent)
+    static int const LdsmShapeStrided =
+        InstructionShape::kStrided / kLdsmOpInner;
+    static int const LdsmShapeContiguous = 4 / LdsmShapeStrided;
+    using LdsmShape =
+        layout::PitchLinearShape<LdsmShapeContiguous, LdsmShapeStrided>;
+
+    /// Number and arrangement of LDSM instructions (always a single iteration along strided)
+    using LdsmIterations = layout::PitchLinearShape<
+        Shape::kContiguous / Layout::kElementsPerAccess / LdsmShapeContiguous,
+        1>;
+
+    /// Number of groups for each tile: InstructionShape::kStrided-sized steps per Shape::kStrided
+    static int const kGroupsPerTile =
+        Shape::kStrided / InstructionShape::kStrided;
+  };
+
+private:
+
+  /// Not working on this feature at the moment.
+  static_assert(kOpDelta == 1,
+    "Alternative arrangements not supported at present.");
+
+  /// Number of internal pointers needed to reference shared memory
+  static int const kPointerCount =
+      Layout::TileShape::kContiguous / Policy::LdsmShape::kContiguous / Layout::kFactor;
+
+  /// Pointer type used for accesses
+  using AccessType = Array<Element, Layout::kElementsPerAccess>;
+
+  /// Internal counter used to jump to next K partition
+  int k_group_idx_;
+
+public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+ using Fragment =
+     Array<Element, Shape::kContiguous * InstructionShape::kStrided / kThreads>;
+
+private:
+
+  /// Layout object storing stride values
+  StrideIndex stride_;
+
+  /// Shared memory base pointers - not advanced
+  AccessType const *pointer_[kPointerCount];
+
+  /// Byte offset incremented as iterator advances
+  Index byte_offset_;
+
+public:
+
+  /// Default ctor: zeroes the k-group counter, stride and byte offset (pointer_[] entries stay unset)
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator(): k_group_idx_(0), stride_(0), byte_offset_(0) { }
+
+  /// Constructor from TensorRef: derives this lane's kPointerCount base pointers from lane_id
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator(
+    TensorRef const &ref,
+    int lane_id
+  ):
+    k_group_idx_(0),
+    stride_(ref.stride(0) * Layout::kFactor / Layout::kElementsPerAccess),
+    byte_offset_(0) {
+
+    // Mem-initializers above follow member declaration order (k_group_idx_ is declared first).
+    int quad_quad = (lane_id >> 4);
+    int lane_in_pair = (lane_id & 1);
+    int lane_in_quad = (lane_id & 3);
+    int lane_in_quad_pair = (lane_id & 7);
+    int lane_in_quad_quad = (lane_id & 15);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPointerCount; ++i) {
+      int partition_contiguous_idx = -1;
+      int access_contiguous_idx = -1;
+      int access_strided_idx = -1;
+
+      if (Policy::LdsmShape::kContiguous == 2 &&
+          kOperand == Operand::kA) {
+        // Matrix multiply 16816 A
+        // Q0 Q1
+        // Q2 Q3
+        partition_contiguous_idx = lane_in_quad / 2;
+        access_strided_idx = lane_in_quad_pair / Layout::kFactor + quad_quad * 2;
+        access_contiguous_idx =
+            ((lane_in_pair * 2 + ((lane_id & 8) >> 3)) ^
+             access_strided_idx);
+      } else if (Policy::LdsmShape::kContiguous == 2 &&
+                 kOperand == Operand::kB) {
+        // Matrix multiply 16816 B
+        // Q0 Q2
+        // Q1 Q3
+        partition_contiguous_idx = lane_in_quad / 2;
+        access_strided_idx = lane_in_quad_quad / Layout::kFactor;
+        access_contiguous_idx =
+            ((lane_in_pair * 2 + quad_quad) ^
+             access_strided_idx);
+      } else if (Policy::LdsmShape::kContiguous == 1) {
+        // Matrix multiply 16832.SP B
+        // Q0
+        // Q1
+        // Q2
+        // Q3
+        int factor_in_partition =
+            (Layout::PartitionShape::kContiguous * Layout::kFactor /
+             Layout::TileShape::kContiguous);
+
+        partition_contiguous_idx = lane_in_quad / factor_in_partition;
+        access_contiguous_idx = ((lane_in_pair * factor_in_partition) ^
+                                 (lane_in_quad_quad / Layout::kFactor) ^ i);
+        access_strided_idx = lane_id / Layout::kFactor;
+      } 
+
+      int access_contiguous =
+          partition_contiguous_idx * Layout::PartitionShape::kContiguous +
+          access_contiguous_idx;
+
+      int access_strided = access_strided_idx;
+
+      pointer_[i] = reinterpret_cast<AccessType const *>(ref.data()) +
+                    access_contiguous + access_strided * stride_;
+    }
+  }
+
+  /// Advances by `offset` elements: adds offset * sizeof(Element) to byte_offset_ (pointer_[] unchanged)
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
+
+    byte_offset_ += offset * sizeof(Element);
+
+    return *this;
+  }
+
+  /// Advances the iterator along the logical dimensions of the matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
+    // Special case below: Shape::kContiguous equals one layout partition's contiguous extent.
+    int contiguous_offset = tile_offset.contiguous();
+    if (Shape::kContiguous ==
+        Layout::PartitionShape::kContiguous * Layout::kElementsPerAccess) {
+      if (tile_offset.contiguous() % 2) {  // odd step: swap the two halves of pointer_[]
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < kPointerCount / 2; ++i) {
+          AccessType const *tmp_pointer = pointer_[i];
+          pointer_[i] = pointer_[i + kPointerCount / 2];
+          pointer_[i + kPointerCount / 2] = tmp_pointer;
+        }
+      }
+      contiguous_offset = (tile_offset.contiguous() >> 1) << 1;  // clear the low bit
+    }
+    // Offset in elements; add_pointer_offset scales it by sizeof(Element).
+    int offset = (tile_offset.strided() * InstructionShape::kStrided) *
+                     stride_ * Layout::kElementsPerAccess / Layout::kFactor +
+                 contiguous_offset * Shape::kContiguous;
+
+    add_pointer_offset(offset);
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension by one tile step
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator++() {
+
+    add_tile_offset({0, 1});
+
+    if (kPartitionsK > 1) {
+      // Count the step just taken; once Policy::kGroupsPerTile steps have
+      // accumulated, wrap the counter and jump to the next stage.
+      if (++k_group_idx_ == Policy::kGroupsPerTile) {
+        k_group_idx_ = 0;
+        add_tile_offset(
+            {0, ((kPartitionsK - 1) * Policy::kGroupsPerTile)});
+      }
+    }
+
+    return *this;
+  }
+
+  /// Steps back by exactly the byte amount that add_tile_offset({0, 1}) adds to byte_offset_
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator--() {
+    byte_offset_ -= (stride_ * InstructionShape::kStrided *
+                     Layout::kElementsPerAccess / Layout::kFactor) * sizeof(Element);
+    // stride_ carries a Layout::kFactor multiplier (set in the ctor), hence the divide.
+    return *this;
+  }
+
+  /// Advances by `tile_offset` whole tiles in the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
+    // add_tile_offset() returns *this, so its result is forwarded directly.
+    return add_tile_offset(tile_offset);
+  }
+
+  /// Moves by the negated `tile_offset`, in whole tiles of the tensor's logical coordinate space
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
+    // add_tile_offset() returns *this, so its result is forwarded directly.
+    return add_tile_offset(-tile_offset);
+  }
+
+  /// Loads a fragment from the iterator's current location.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    // Zero additional byte offset: only the accumulated byte_offset_ applies.
+    load_with_byte_offset(frag, 0);
+  }
+
+  /// Loads a fragment via LDSM, adding `byte_offset` bytes to every source address
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// destination fragment
+      Fragment &frag,
+      /// extra linear offset in units of bytes, applied on top of byte_offset_
+      Index byte_offset) const {
+    // View the fragment as one Array<unsigned, LdsmShape::kCount> per LDSM instruction.
+    Array<unsigned, Policy::LdsmShape::kCount> *fetch_ptr =
+      reinterpret_cast<Array<unsigned, Policy::LdsmShape::kCount> *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < Policy::LdsmIterations::kStrided; ++s) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < Policy::LdsmIterations::kContiguous; ++c) {
+
+        int access_idx = c + s * Policy::LdsmIterations::kContiguous;
+        // Base pointer cycles modulo kPointerCount; the quotient steps by whole layout tiles.
+        AccessType const *source_ptr =
+            pointer_[c % kPointerCount] +
+            Layout::TileShape::kContiguous * (c / kPointerCount) +
+            Policy::kLdsmOpInner * Policy::LdsmShape::kStrided * s * stride_ / Layout::kFactor;
+        // Add the caller's byte offset and the iterator's accumulated byte_offset_.
+        char const *source_byte_ptr = reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;
+
+        cutlass::arch::ldsm<layout::ColumnMajor, Policy::LdsmShape::kCount>(
+          fetch_ptr[access_idx],
+          source_byte_ptr
+        );
+      }
+    }
+  }
+
+  /// Loads a fragment with an extra offset given in elements (scaled by sizeof(Element) to bytes)
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// destination fragment
+      Fragment &frag,
+      /// linear offset in units of Element
+      Index pointer_offset) const {
+    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
+  }
+
+  /// Loads a fragment at a whole-tile offset from the current position (no extra byte offset).
+  CUTLASS_DEVICE
+  void load(
+      /// destination fragment
+      Fragment &frag,
+      /// logical offset in units of whole tiles
+      TensorCoord const &tile_offset) const {
+    load_with_byte_offset(frag, tile_offset, 0);
+  }
+
+  /// Loads a fragment at a whole-tile offset plus an element offset from the current position.
+  CUTLASS_DEVICE
+  void load(
+      /// destination fragment
+      Fragment &frag,
+      /// logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// additional linear offset in units of Element (scaled to bytes below)
+      Index pointer_offset) const {
+    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
+  }
+
+  /// Loads a fragment at a whole-tile offset plus a byte offset; the iterator itself is not moved.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// destination fragment
+      Fragment &frag,
+      /// logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// additional linear offset in units of bytes
+      Index byte_offset) const {
+    Index pointer_offset =
+      tile_offset.contiguous() * Shape::kContiguous / Layout::kElementsPerAccess +
+      tile_offset.strided() * InstructionShape::kStrided * stride_ / Layout::kFactor;
+    // pointer_offset counts AccessType-sized units; convert it to bytes.
+    byte_offset += sizeof(AccessType) * pointer_offset;
+
+    load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This hook does not advance the iterator. It exists so an iterator can
+  /// override its internal tracking with a constant-valued k-group index,
+  /// letting the compiler fold constants. This specialization ignores the value.
+  ///
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    // no op: k_group is ignored
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// This tile iterator is specialized for 32-thread TensorOps. It uses LDSM to load from shared
+/// memory and therefore must be initialized with a TensorRef to shared memory. 
+///
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Identifies A or B multiplicand
+    Operand Operand_,
+    /// Data type of elements
+    typename Element_,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions)
+    int OpDelta_,
+    /// Element number when the layout crosses (in units of elements)
+    int Crosswise,
+    /// Number of partitions along K dimension
+    int PartitionsK_>
+class MmaTensorOpMultiplicandTileIterator<
+    Shape_, Operand_, Element_,
+    cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous<
+        sizeof_bits<Element_>::value, Crosswise>,
+    InstructionShape_, OpDelta_, 32, PartitionsK_> {
+ public:
+
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand_;
+
+  static_assert(kOperand == Operand::kA,
+                "MmaTensorOpMultiplicandIterator for ColumnMajor Congruous may "
+                "only be instantiated for A operand to warp-level Mma.");
+
+  /// Element type
+  using Element = Element_;
+
+  /// Element number when the layout crosses (in units of elements)
+  static int const kCrosswise = Crosswise;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous<
+      sizeof_bits<Element_>::value, kCrosswise>;
+
+  /// Shape of one matrix product operation (concept: MatrixShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+  static int const kOpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Stride index type
+  using StrideIndex = typename TensorRef::Layout::Stride::Index;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Underlying pitch-linear tile iterator (row extent first, column extent second)
+  using Base = MmaTensorOpMultiplicandTileIterator<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, kOperand, Element,
+      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
+                                            kCrosswise>,
+      layout::PitchLinearShape<InstructionShape::kRow,
+                               InstructionShape::kColumn>,
+      kOpDelta, kThreads, PartitionsK_>;
+
+ public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = typename Base::Fragment;
+
+private:
+
+  /// Underlying tile iterator; every operation below is forwarded to it
+  Base iterator_;
+
+public:
+
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator() { }
+
+  /// Constructor from TensorRef
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator(
+    TensorRef const &ref,
+    int lane_id
+  ): iterator_({ref.data(), ref.stride()}, lane_id) {
+  }
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
+
+    iterator_.add_pointer_offset(offset);
+
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
+
+    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator++() {
+
+    ++iterator_;
+
+    return *this;
+  }
+
+  /// Moves the iterator backwards along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator--() {
+
+    --iterator_;
+
+    return *this;
+  }
+
+  /// Advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);  // add_tile_offset() performs the (row, column) mapping
+    return *this;
+  }
+
+  /// Steps back in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({-tile_offset.row(), -tile_offset.column()});
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+
+    iterator_.load(frag);
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset
+      Index pointer_offset) const {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset
+      Index byte_offset) const {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset) const {
+    load_with_byte_offset(frag, tile_offset, 0);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// additional linear offset, multiplied by sizeof(Element) to get bytes
+      Index pointer_offset) const {
+    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// additional offset in bytes
+      Index byte_offset) const {
+    iterator_.load_with_byte_offset(
+      frag, {tile_offset.row(), tile_offset.column()}, byte_offset);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    iterator_.set_kgroup_index(k_group);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// This tile iterator is specialized for 32-thread TensorOps. It uses LDSM to load from shared
+/// memory and therefore must be initialized with a TensorRef to shared memory. 
+///
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Identifies A or B multiplicand
+    Operand Operand_,
+    /// Data type of elements
+    typename Element_,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions)
+    int OpDelta_,
+    /// Element number when the layout crosses (in units of elements)
+    int Crosswise,
+    /// Number of partitions along K dimension
+    int PartitionsK_>
+class MmaTensorOpMultiplicandTileIterator<
+    Shape_, Operand_, Element_,
+    cutlass::layout::RowMajorTensorOpMultiplicandCongruous<
+        sizeof_bits<Element_>::value, Crosswise>,
+    InstructionShape_, OpDelta_, 32, PartitionsK_> {
+ public:
+
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand_;
+
+  static_assert(kOperand == Operand::kB,
+                "MmaTensorOpMultiplicandIterator for RowMajor Congruous may "
+                "only be instantiated for B operand to warp-level Mma.");
+
+  /// Element type
+  using Element = Element_;
+
+  /// Element number when the layout crosses
+  static int const kCrosswise = Crosswise;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::RowMajorTensorOpMultiplicandCongruous<
+      sizeof_bits<Element_>::value, kCrosswise>;
+
+  /// Shape of one matrix product operation (concept: MatrixShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+  static int const kOpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Underlying pitch-linear tile iterator (column extent first, row extent second)
+  using Base = MmaTensorOpMultiplicandTileIterator<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, kOperand, Element,
+      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
+                                            kCrosswise>,
+      layout::PitchLinearShape<InstructionShape::kColumn,
+                               InstructionShape::kRow>,
+      kOpDelta, kThreads, PartitionsK_>;
+
+ public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = typename Base::Fragment;
+
+private:
+
+  /// Underlying tile iterator; every operation below is forwarded to it
+  Base iterator_;
+
+public:
+
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator() { }
+
+  /// Constructor from TensorRef
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator(
+    TensorRef const &ref,
+    int lane_id
+  ): iterator_({ref.data(), ref.stride()}, lane_id) {
+  }
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
+
+    iterator_.add_pointer_offset(offset);
+
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
+
+    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator++() {
+
+    ++iterator_;
+
+    return *this;
+  }
+
+  /// Moves the iterator backwards along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator--() {
+
+    --iterator_;
+
+    return *this;
+  }
+
+  /// Advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);  // add_tile_offset() performs the (column, row) mapping once
+    return *this;
+  }
+
+  /// Steps back in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({-tile_offset.column(), -tile_offset.row()});
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+
+    iterator_.load(frag);
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset
+      Index pointer_offset) const {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset
+      Index byte_offset) const {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset) const {
+    load_with_byte_offset(frag, tile_offset, 0);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// additional linear offset, multiplied by sizeof(Element) to get bytes
+      Index pointer_offset) const {
+    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// additional offset in bytes
+      Index byte_offset) const {
+    iterator_.load_with_byte_offset(
+      frag, {tile_offset.column(), tile_offset.row()}, byte_offset);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    iterator_.set_kgroup_index(k_group);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// This tile iterator is specialized for 32-thread TensorOps. It uses LDSM to
+/// load from shared memory and therefore must be initialized with a TensorRef
+/// to shared memory.
+///
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: PitchLinearShape)
+    typename Shape_,
+    /// Identifies A or B multiplicand
+    Operand Operand_,
+    /// Data type of elements
+    typename Element_,
+    /// Shape of one matrix product operation (concept: PitchLinearShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions)
+    int OpDelta_,
+    /// Element number when the layout crosses (in units of elements)
+    int Crosswise,
+    /// Number of partitions along K dimension
+    int PartitionsK_>
+class MmaTensorOpMultiplicandTileIterator<
+    Shape_, Operand_, Element_,
+    cutlass::layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
+                                                   Crosswise>,
+    InstructionShape_, OpDelta_, 32, PartitionsK_> {
+ public:
+  /// Shape of tile to load (concept: PitchLinearShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand_;
+
+  static_assert(kOperand == Operand::kA || kOperand == Operand::kB,
+                "MmaTensorOpMultiplicandIterator may only be instantiated for "
+                "A or B operands to warp-level Mma.");
+
+  /// Element type
+  using Element = Element_;
+
+  /// Element number when the layout crosses
+  static int const kCrosswise = Crosswise;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::TensorOpMultiplicandCrosswise<
+      sizeof_bits<Element_>::value, kCrosswise>;
+
+  /// Shape of one matrix product operation (concept: PitchLinearShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept:
+  /// MatrixShape)
+  static int const kOpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// Number of partitions along K dimension
+  static int const kPartitionsK = PartitionsK_;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Stride index type
+  using StrideIndex = typename TensorRef::Layout::Stride::Index;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Compile-time description of how a tile is covered by LDSM operations - public to enable introspection
+  struct Policy {
+    static_assert(
+        !(Shape::kContiguous % InstructionShape::kContiguous),
+        "Shape of warp-level Mma must be divisible by operator shape.");
+
+    // Extent covered by one individual LDSM op: outer = elements per access, inner = 8
+    static int const kLdsmOpOuter = Layout::kElementsPerAccess;
+    static int const kLdsmOpInner = 8;
+
+    static_assert(!(Shape::kContiguous % kLdsmOpOuter),
+                  "Shape of warp-level mma must be divisible by LDSM's "
+                  "fundamental tile size.");
+
+    static_assert(!(Shape::kStrided % kLdsmOpInner),
+                  "Shape of warp-level mma must be divisible by LDSM's "
+                  "fundamental tile size.");
+
+    /// Shape of one individual LDSM instruction, counted in LDSM ops per dimension
+    static int const LdsmShapeContiguous =
+        InstructionShape::kContiguous / kLdsmOpOuter;
+    static int const LdsmShapeStrided =  // 4 / LdsmShapeContiguous unless the tile is too short
+        ((4 / LdsmShapeContiguous * kLdsmOpInner) > Shape::kStrided)
+            ? (Shape::kStrided / kLdsmOpInner)
+            : (4 / LdsmShapeContiguous);
+    using LdsmShape =
+        layout::PitchLinearShape<LdsmShapeContiguous, LdsmShapeStrided>;
+
+    /// Number and arrangement of LDSM instructions (a single one along the contiguous dimension)
+    using LdsmIterations =
+        layout::PitchLinearShape<1, Shape::kStrided / kLdsmOpInner /
+                                        LdsmShape::kStrided>;
+
+    /// k-groups per tile: TileShape::kContiguous / kFactor / LdsmShape::kContiguous
+    static int const kGroupsPerTile = Layout::TileShape::kContiguous /
+                                      Layout::kFactor / LdsmShape::kContiguous;
+  };
+
+ private:
+  /// Not working on this feature at the moment.
+  static_assert(kOpDelta == 1,
+                "Alternative arrangements not supported at present.");
+
+  /// Pointer type used for accesses
+  using AccessType = Array<Element, Layout::kElementsPerAccess>;
+
+ public:
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = Array<Element, Shape::kStrided *
+                                      InstructionShape::kContiguous / kThreads>;
+
+ private:
+
+  /// Total number of sections.  The memory is divided into stages.  One stage
+  /// can store one tile.  Stage is divided into sections.  Interleaved layout
+  /// can have multiple sections in a stage.  The rest layout only has one section
+  /// in a stage.
+  int sections_;
+
+  /// Layout object storing stride values
+  StrideIndex stride_;
+
+  /// Shared memory base pointers - not advanced
+  AccessType const *pointer_;
+
+  /// Byte offset incremented as iterator advances
+  Index byte_offset_;
+
+  /// Internal counter used to determine when to increment byte offset and when
+  /// to XOR it
+  int k_group_idx_;
+
+ public:
+  /// Default ctor constructs null iterator (initializers listed in member declaration order)
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator()
+      : sections_(0),
+        stride_(0),
+        pointer_(nullptr),
+        byte_offset_(0),
+        k_group_idx_(0) {}
+
+  /// Constructor from TensorRef: derives this lane's starting byte offset from lane_id
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
+      : sections_(ref.stride(0) / kCrosswise),
+        // stride_ = kCrosswise x sections_ x kFactor, in units of AccessType
+        stride_(ref.stride(0) * Layout::kFactor / Layout::kElementsPerAccess),
+        pointer_(reinterpret_cast<AccessType const *>(ref.data())),
+        byte_offset_(0),
+        k_group_idx_(0) {
+    // Warp level iterator at most use double buffer to hide latency.  If there
+    // are more than 2 sections, every stage should have more than 1 section.
+
+    // Turing silicon requires all 32 threads in a warp provide valid addresses
+    // even for LDSM.1 and LDSM.2
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 750))
+    lane_id = lane_id % (Policy::LdsmShape::kCount * Policy::kLdsmOpInner);
+#endif
+
+    int quad_quad = (lane_id >> 4);          // which group of 16 lanes
+    int quad_pair = (lane_id >> 3);          // which group of 8 lanes
+    int lane_in_pair = (lane_id & 1);        // position within a group of 2 lanes
+    int lane_in_quad = (lane_id & 3);        // position within a group of 4 lanes
+    int lane_in_quad_pair = (lane_id & 7);   // position within a group of 8 lanes
+    int lane_in_quad_quad = (lane_id & 15);  // position within a group of 16 lanes
+
+    int partition_contiguous_idx = -1;  // the -1 values survive if no case below matches
+    int access_contiguous_idx = -1;
+    int access_strided_idx = -1;
+
+    if (Layout::kFactor == 8) {
+      int factor_in_partition =
+          (Layout::PartitionShape::kContiguous * Layout::kFactor /
+           Layout::TileShape::kContiguous);
+
+      if (Policy::LdsmShape::kStrided == Policy::LdsmShape::kCount) {
+        partition_contiguous_idx = lane_in_quad_pair / factor_in_partition;
+        access_contiguous_idx = ((lane_in_quad) ^ (lane_id / Layout::kFactor));
+        access_strided_idx = lane_id / Layout::kFactor;
+      }
+    } else if (Layout::kFactor == 4) {
+      // Super Integer matrix multiply Interleaved-32
+
+      int factor_in_partition =
+          (Layout::PartitionShape::kContiguous * Layout::kFactor /
+           Layout::TileShape::kContiguous);
+
+      if (Policy::LdsmShape::kStrided == Policy::LdsmShape::kCount) {
+        // Integer matrix multiply 8816  A/B
+        partition_contiguous_idx = lane_in_quad / factor_in_partition;
+        access_contiguous_idx = ((lane_in_pair * factor_in_partition) ^
+                                 (lane_in_quad_quad / Layout::kFactor));
+        access_strided_idx = lane_id / Layout::kFactor;
+      }
+      else if (Policy::LdsmShape::kStrided ==
+                     (Policy::LdsmShape::kCount / 2) &&
+                 kOperand == Operand::kA) {
+        // Integer matrix multiply 16832 A
+        partition_contiguous_idx = lane_in_quad / factor_in_partition;
+        access_strided_idx = lane_in_quad_quad / Layout::kFactor;
+        access_contiguous_idx =
+            ((lane_in_pair * factor_in_partition + quad_quad) ^
+             access_strided_idx);
+      }
+      else if (Policy::LdsmShape::kStrided ==
+                     (Policy::LdsmShape::kCount / 2) &&
+                 kOperand == Operand::kB) {
+        // Integer matrix multiply 16832 B
+        partition_contiguous_idx = lane_in_quad / factor_in_partition;
+        access_strided_idx = lane_in_quad_pair / Layout::kFactor + quad_quad * 2;
+        access_contiguous_idx =
+            ((lane_in_pair * factor_in_partition + ((lane_id & 8) >> 3)) ^
+             access_strided_idx);
+      }
+    } else if (Layout::kFactor == 2) {
+      // Super Matrix multiply kBlock = 32
+      if (Policy::LdsmShape::kStrided == Policy::LdsmShape::kCount) {
+        // Matrix multiply 1688 A/B
+        // (Q stands for 1 8x128bit block).
+        // Q0
+        // Q1
+        // Q2
+        // Q3
+        // Four blocks are next to each other in the strided dimension.
+        partition_contiguous_idx = (lane_id % Layout::kFactor);
+        access_contiguous_idx = (lane_in_quad_pair / Layout::kFactor);
+        access_strided_idx = lane_id / Layout::kFactor;
+      } else if (Policy::LdsmShape::kStrided ==
+                     (Policy::LdsmShape::kCount / 2) &&
+                 kOperand == Operand::kA) {
+        // Matrix multiply 16816|1688.TF32 A
+        // Q0 Q2
+        // Q1 Q3
+        partition_contiguous_idx = (lane_id % Layout::kFactor);
+        access_contiguous_idx =
+            (quad_quad ^ (lane_in_quad_pair / Layout::kFactor));
+        access_strided_idx = (lane_in_quad_quad / Layout::kFactor);
+      } else if (Policy::LdsmShape::kStrided ==
+                     (Policy::LdsmShape::kCount / 2) &&
+                 kOperand == Operand::kB) {
+        // Matrix multiply 16816|1688.TF32 B
+        // Q0 Q1
+        // Q2 Q3
+        partition_contiguous_idx = (lane_id % Layout::kFactor);
+        access_contiguous_idx =
+            ((quad_pair & 1) ^ (lane_in_quad_pair / Layout::kFactor));
+        access_strided_idx =
+            (lane_in_quad_pair + (lane_id >> 4 << 3)) / Layout::kFactor;
+      }
+      else if (Policy::LdsmShape::kContiguous == Policy::LdsmShape::kCount) {
+        // Matrix multiply 16832.SP B
+        // Q0 Q1 Q2 Q3
+        partition_contiguous_idx = (lane_id % Layout::kFactor);
+        access_contiguous_idx =
+            (quad_pair ^ (lane_in_quad_pair / Layout::kFactor));
+        access_strided_idx = lane_in_quad_pair / Layout::kFactor;
+      }
+    } else if (Layout::kFactor == 1) {
+      // Super Matrix multiply kBlock = 64
+      if (Policy::LdsmShape::kStrided == Policy::LdsmShape::kCount) {
+        // Q0
+        // Q1
+        // Q2
+        // Q3
+        partition_contiguous_idx = (lane_in_quad_pair >> 2);
+        access_contiguous_idx = lane_in_quad;
+        access_strided_idx = lane_id;
+      }
+      else if (Policy::LdsmShape::kStrided ==
+                     (Policy::LdsmShape::kCount / 2) &&
+                 kOperand == Operand::kA) {
+        // Matrix multiply 16816|1688.TF32 A
+        // Q0 Q2
+        // Q1 Q3
+        partition_contiguous_idx = (lane_in_quad_pair >> 2);
+        access_contiguous_idx = (quad_quad ^ lane_in_quad);
+        access_strided_idx = lane_in_quad_quad;
+      } else if (Policy::LdsmShape::kStrided ==
+                     (Policy::LdsmShape::kCount / 2) &&
+                 kOperand == Operand::kB) {
+        // Matrix multiply 16816|1688.TF32 B
+        // Q0 Q1
+        // Q2 Q3
+        partition_contiguous_idx = (lane_in_quad_pair >> 2);
+        access_contiguous_idx = ((quad_pair & 1) ^ lane_in_quad);
+        access_strided_idx = lane_in_quad_pair + (lane_id >> 4 << 3);
+      }
+      else if (Policy::LdsmShape::kContiguous == Policy::LdsmShape::kCount) {
+        // Matrix multiply 16832.SP B
+        // Q0 Q1 Q2 Q3
+        partition_contiguous_idx = (lane_in_quad_pair >> 2);
+        access_contiguous_idx = (quad_pair ^ lane_in_quad);
+        access_strided_idx = lane_in_quad_pair;
+      }
+    }
+
+    int access_contiguous =
+        partition_contiguous_idx * Layout::PartitionShape::kContiguous +
+        access_contiguous_idx;
+
+    int access_strided = access_strided_idx;
+    // Scale the AccessType-granular offset to bytes (element bits x elements per access / 8).
+    byte_offset_ = (access_contiguous + access_strided * stride_) *
+                   sizeof_bits<Element>::value * Layout::kElementsPerAccess / 8;
+  }
+
+  /// Displaces the iterator by `offset` elements, accumulated into its running byte offset
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
+    auto const delta_bytes = offset * sizeof_bits<Element>::value / 8;
+    byte_offset_ += delta_bytes;
+    return *this;
+  }
+
+  /// Advances the iterator by a whole-tile offset along the logical dimensions of the matrix
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_tile_offset(
+      TensorCoord const &tile_offset) {
+    auto const contiguous = tile_offset.contiguous();
+    int full_tiles = contiguous / Policy::kGroupsPerTile;
+    int group_remainder = contiguous % Policy::kGroupsPerTile;
+    auto const xor_bits = group_remainder * sizeof_bits<Element>::value *
+                          Layout::kElementsPerAccess * Policy::LdsmShape::kContiguous / 8;
+    byte_offset_ ^= xor_bits;
+    // The base pointer absorbs the strided component and the whole contiguous tiles.
+    pointer_ +=
+        tile_offset.strided() * stride_ * Shape::kStrided / Layout::kFactor +
+        full_tiles * stride_ / sections_;
+    return *this;
+  }
+
+  /// Advances an iterator in units of whole tiles; a negative contiguous
+  /// remainder is folded back into a non-negative k-group delta first
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_tile_offset_negative(
+      TensorCoord const &tile_offset) {
+    // Split the contiguous offset into whole tiles and a k-group remainder.
+    int whole_tiles = tile_offset.contiguous() / Policy::kGroupsPerTile;
+    int k_groups_delta = tile_offset.contiguous() % Policy::kGroupsPerTile;
+    if (k_groups_delta < 0) {
+        whole_tiles -= 1;
+        k_groups_delta += Policy::kGroupsPerTile;
+    }
+
+    if ((Policy::kGroupsPerTile / kPartitionsK) >= 2) {
+      byte_offset_ ^= (k_groups_delta & 1) * Policy::LdsmShape::kContiguous *
+                        sizeof_bits<Element>::value *
+                        Layout::kElementsPerAccess / 8;
+    }
+    if ((Policy::kGroupsPerTile / kPartitionsK) >= 4) {
+      byte_offset_ ^= ((k_groups_delta + (k_group_idx_ & 1)) & 2) * 
+                        Policy::LdsmShape::kContiguous *
+                        sizeof_bits<Element>::value *
+                        Layout::kElementsPerAccess / 8;
+    }
+    if ((Policy::kGroupsPerTile / kPartitionsK) == 8) {
+      byte_offset_ ^= ((k_groups_delta + (k_group_idx_ & 3)) & 4) * 
+                        Policy::LdsmShape::kContiguous *
+                        sizeof_bits<Element>::value *
+                        Layout::kElementsPerAccess / 8;
+    }
+
+    k_group_idx_ += k_groups_delta;  // carry any overflow of the k-group index into whole tiles
+    whole_tiles += k_group_idx_ / (Policy::kGroupsPerTile / kPartitionsK);
+    k_group_idx_ = k_group_idx_ % (Policy::kGroupsPerTile / kPartitionsK);
+
+    pointer_ +=
+        tile_offset.strided() * stride_ * Shape::kStrided / Layout::kFactor +
+        whole_tiles * stride_ / sections_;
+    return *this;
+  }
+
+  /// Advances the iterator by one k-group along the advance dimension
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator &operator++() {
+    // The authors' table below gives, per MMA configuration, the XOR steps of successive increments.
+    // Integer matrix multiply 16832 Interleaved-32
+    //   NONE
+    // Integer matrix multiply 16816 Interleaved-32 || Integer matrix multiply 16816 kblock=32
+
+    // Integer matrix multiply 8816  Interleaved-32
+    //   ^1 ^1
+    // Matrix multiply 1684.TF32 kblock=16 || Integer matrix multiply 16816 kblock=64
+    // Matrix multiply 1688 kblock=32 || Integer matrix multiply 8816 kblock=64
+    //   ^1 ^3 ^1 ^3
+    // Matrix multiply 1688 kblock=64
+    //   ^1 ^3 ^1 ^7 ^1 ^3 ^1 ^7
+
+    // Matrix multiply 16816 kblock=32 | 1688.TF32 kblock=16 || Integer matrix multiply 16832 kblock=64
+    //   ^2 ^2
+    // Matrix multiply 16816 kblock=64 | 1688.TF32 kblock=32 || Integer matrix multiply 16832 kblock=128
+    //   ^2 ^6 ^2 ^6
+
+    if ((Policy::kGroupsPerTile / kPartitionsK) > 1) {
+      int mask = ((Policy::kGroupsPerTile / kPartitionsK) == 8)
+                     ? 3
+                     : (((Policy::kGroupsPerTile / kPartitionsK) == 4) ? 1 : 0);
+
+      if (((k_group_idx_ & mask) % 2) == 0)
+        byte_offset_ ^= 1 * Policy::LdsmShape::kContiguous *
+                        sizeof_bits<Element>::value *
+                        Layout::kElementsPerAccess / 8;
+      else if ((k_group_idx_ & mask) == 1)
+        byte_offset_ ^= 3 * Policy::LdsmShape::kContiguous *
+                        sizeof_bits<Element>::value *
+                        Layout::kElementsPerAccess / 8;
+      else if ((k_group_idx_ & mask) == 3)
+        byte_offset_ ^= 7 * Policy::LdsmShape::kContiguous *
+                        sizeof_bits<Element>::value *
+                        Layout::kElementsPerAccess / 8;
+    }
+
+    k_group_idx_++;
+
+    if (k_group_idx_ == (Policy::kGroupsPerTile / kPartitionsK)) {
+      k_group_idx_ = 0;  // wrapped: restart the k-group count
+      add_tile_offset({Policy::kGroupsPerTile, 0});  // one whole tile, zero k-group remainder
+    }
+
+    return *this;
+  }
+
+  /// Reverse advance is unsupported: asserts, then returns the iterator unchanged
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &operator--() { assert(0); return *this; }
+
+  /// Advances in units of whole tiles along the logical coordinate space of
+  /// the tensor by forwarding tile_offset to add_tile_offset()
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator &operator+=(
+      TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  /// Moves backwards in units of whole tiles along the logical coordinate
+  /// space of the tensor by forwarding the negated offset to add_tile_offset()
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator &operator-=(
+      TensorCoord const &tile_offset) {
+    add_tile_offset(-tile_offset);
+    return *this;
+  }
+
+  /// Loads a fragment from the iterator's current location (extra byte offset of 0).
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const { load_with_byte_offset(frag, 0); }
+
+  /// Loads a fragment with LDSM, adding byte_offset to every access address
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// destination fragment, filled one LDSM access at a time
+      Fragment &frag,
+      /// extra linear offset in units of bytes
+      Index byte_offset) const {
+    using LdsmAccess = Array<unsigned, Policy::LdsmShape::kCount>;
+    LdsmAccess *dst = reinterpret_cast<LdsmAccess *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int strided = 0; strided < Policy::LdsmIterations::kStrided; ++strided) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int contig = 0; contig < Policy::LdsmIterations::kContiguous; ++contig) {
+        // Accesses land in the fragment contiguous-first within each strided step
+        AccessType const *access_ptr =
+            pointer_ + Policy::LdsmShape::kContiguous * contig +
+            Policy::kLdsmOpInner / Layout::kFactor *
+                Policy::LdsmShape::kStrided * strided * stride_;
+
+        // Apply the caller's byte offset, then the iterator's own byte_offset_
+        char const *src_bytes =
+            reinterpret_cast<char const *>(access_ptr) + byte_offset +
+            byte_offset_;
+
+        cutlass::arch::ldsm<layout::RowMajor, Policy::LdsmShape::kCount>(
+            dst[contig + strided * Policy::LdsmIterations::kContiguous], src_bytes);
+      }
+    }
+  }
+
+  /// Loads a fragment from memory with an additional offset given in elements
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// linear offset in elements; sizeof_bits keeps sub-byte element types exact
+      Index pointer_offset) const {
+    load_with_byte_offset(frag, sizeof_bits<Element>::value * pointer_offset / 8);
+  }
+
+  /// Loads a fragment at a logical offset in units of whole tiles, with no extra byte offset.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// logical offset in units of whole tiles, forwarded to load_with_byte_offset
+      TensorCoord const &tile_offset) const {
+    load_with_byte_offset(frag, tile_offset, 0);
+  }
+
+  /// Loads a fragment at a whole-tile logical offset plus an offset given in elements.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// extra linear offset in elements; sizeof_bits keeps sub-byte types exact
+      Index pointer_offset) const {
+    load_with_byte_offset(frag, tile_offset, sizeof_bits<Element>::value * pointer_offset / 8);
+  }
+
+  /// Loads a fragment at a whole-tile logical offset plus an extra byte offset.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// additional linear offset in units of bytes
+      Index byte_offset) const {
+    // Convert the tile offset into a count of AccessType-sized steps
+    Index tile_accesses = tile_offset.contiguous() *
+                              InstructionShape::kContiguous /
+                              Layout::kElementsPerAccess +
+                          tile_offset.strided() * Shape::kStrided * stride_;
+    // Scale that count to bytes and add it to the caller-provided byte offset
+    load_with_byte_offset(
+        frag, byte_offset + sizeof_bits<AccessType>::value * tile_accesses / 8);
+  }
+
+  /// Overrides the internally tracked k-group index without moving the iterator.
+  ///
+  /// Passing a constant-valued k_group lets the compiler fold constants. The
+  /// stored index is k_group modulo (Policy::kGroupsPerTile / kPartitionsK).
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    int const groups_per_partition = Policy::kGroupsPerTile / kPartitionsK;
+
+    k_group_idx_ = k_group % groups_per_partition;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// This tile iterator is specialized for 32-thread TensorOps. It uses LDSM to
+/// load from shared memory and therefore must be initialized with a TensorRef
+/// to shared memory. It wraps the pitch-linear crosswise iterator (Base),
+/// passing matrix (row, column) coordinates through as (contiguous, strided).
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Identifies A or B multiplicand
+    Operand Operand_,
+    /// Data type of elements
+    typename Element_,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions)
+    int OpDelta_,
+    /// Element number when the layout crosses (in units of elements)
+    int Crosswise,
+    /// Number of partitions along K dimension
+    int PartitionsK_>
+class MmaTensorOpMultiplicandTileIterator<
+    Shape_, Operand_, Element_,
+    cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise<
+        sizeof_bits<Element_>::value, Crosswise>,
+    InstructionShape_, OpDelta_, 32, PartitionsK_> {
+ public:
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand_;
+
+  static_assert(kOperand == Operand::kB,
+                "MmaTensorOpMultiplicandIterator for ColumnMajor Crosswise may "
+                "only be instantiated for B operand to warp-level Mma.");
+
+  /// Element type
+  using Element = Element_;
+
+  /// KBlock size
+  static int const kCrosswise = Crosswise;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise<
+      sizeof_bits<Element_>::value, kCrosswise>;
+
+  /// Shape of one matrix product operation (concept: MatrixShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept:
+  /// MatrixShape)
+  static int const kOpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Underlying tile iterator implementation
+  using Base = MmaTensorOpMultiplicandTileIterator<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, kOperand, Element,
+      layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
+                                            kCrosswise>,
+      layout::PitchLinearShape<InstructionShape::kRow,
+                               InstructionShape::kColumn>,
+      kOpDelta, kThreads, PartitionsK_>;
+
+ public:
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = typename Base::Fragment;
+
+ private:
+  /// Underlying tile iterator
+  Base iterator_;
+
+ public:
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator() {}
+
+  /// Constructor from TensorRef
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
+      : iterator_({ref.data(), ref.stride()}, lane_id) {}
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
+    iterator_.add_pointer_offset(offset);
+
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_tile_offset(
+      TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
+
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_tile_offset_negative(
+      TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset_negative({tile_offset.row(), tile_offset.column()});
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &operator++() {
+    ++iterator_;
+
+    return *this;
+  }
+
+  /// Applies operator-- to the underlying iterator
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &operator--() {
+    --iterator_;
+
+    return *this;
+  }
+
+  /// Advances in units of whole tiles along the logical coordinate space of
+  /// the tensor; forwards the matrix coordinate unchanged to add_tile_offset()
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator &operator+=(
+      TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  /// Moves backwards in units of whole tiles along the logical coordinate
+  /// space of the tensor; forwards the negated coordinate to add_tile_offset()
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator &operator-=(
+      TensorCoord const &tile_offset) {
+    add_tile_offset(-tile_offset);
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const { iterator_.load(frag); }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset
+      Index pointer_offset) const {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory with an additional byte offset
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset in units of bytes
+      Index byte_offset) const {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Not implemented for this specialization: asserts unconditionally.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset) const {
+    assert(0);
+  }
+
+  /// Not implemented for this specialization: asserts unconditionally.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// loads a tile with a logical offset AND a pointer offset
+      Index pointer_offset) const {
+    assert(0);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// tile offset; mapped to Base as {row, column}, matching add_tile_offset()
+      TensorCoord const &tile_offset,
+      /// loads a tile with a logical offset AND a byte offset
+      Index byte_offset) const {
+    iterator_.load_with_byte_offset(
+        frag, {tile_offset.row(), tile_offset.column()}, byte_offset);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    iterator_.set_kgroup_index(k_group);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// This tile iterator is specialized for 32-thread TensorOps. It uses LDSM to
+/// load from shared memory and therefore must be initialized with a TensorRef
+/// to shared memory. It wraps the pitch-linear crosswise iterator (Base),
+/// swapping matrix (row, column) coordinates into (contiguous, strided) order.
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Identifies A or B multiplicand
+    Operand Operand_,
+    /// Data type of elements
+    typename Element_,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions)
+    int OpDelta_,
+    /// Element number when the layout crosses (in units of elements)
+    int Crosswise,
+    /// Number of partitions along K dimension
+    int PartitionsK_>
+class MmaTensorOpMultiplicandTileIterator<
+    Shape_, Operand_, Element_,
+    cutlass::layout::RowMajorTensorOpMultiplicandCrosswise<
+        sizeof_bits<Element_>::value, Crosswise>,
+    InstructionShape_, OpDelta_, 32, PartitionsK_> {
+ public:
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand_;
+
+  static_assert(kOperand == Operand::kA,
+                "MmaTensorOpMultiplicandIterator for RowMajor Crosswise may "
+                "only be instantiated for A operand to warp-level Mma.");
+
+  /// Element type
+  using Element = Element_;
+
+  /// Element number when the layout crosses
+  static int const kCrosswise = Crosswise;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise<
+      sizeof_bits<Element_>::value, kCrosswise>;
+
+  /// Shape of one matrix product operation (concept: MatrixShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept:
+  /// MatrixShape)
+  static int const kOpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Underlying tile iterator implementation
+  using Base = MmaTensorOpMultiplicandTileIterator<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, kOperand, Element,
+      layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
+                                            kCrosswise>,
+      layout::PitchLinearShape<InstructionShape::kColumn,
+                               InstructionShape::kRow>,
+      kOpDelta, kThreads, PartitionsK_>;
+
+ public:
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = typename Base::Fragment;
+
+ private:
+  /// Underlying tile iterator
+  Base iterator_;
+
+ public:
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator() {}
+
+  /// Constructor from TensorRef
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
+      : iterator_({ref.data(), ref.stride()}, lane_id) {}
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
+    iterator_.add_pointer_offset(offset);
+
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_tile_offset(
+      TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
+
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_tile_offset_negative(
+      TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset_negative({tile_offset.column(), tile_offset.row()});
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &operator++() {
+    ++iterator_;
+
+    return *this;
+  }
+
+  /// Applies operator-- to the underlying iterator
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &operator--() {
+    --iterator_;
+
+    return *this;
+  }
+
+  /// Advances in units of whole tiles along the logical coordinate space of
+  /// the tensor; add_tile_offset() performs the single row/column swap
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator &operator+=(
+      TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  /// Moves backwards in units of whole tiles along the logical coordinate
+  /// space of the tensor; add_tile_offset() performs the single row/column swap
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator &operator-=(
+      TensorCoord const &tile_offset) {
+    add_tile_offset(-tile_offset);
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const { iterator_.load(frag); }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset
+      Index pointer_offset) const {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory with an additional byte offset
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset in units of bytes
+      Index byte_offset) const {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Not implemented for this specialization: asserts unconditionally.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset) const {
+    assert(0);
+  }
+
+  /// Not implemented for this specialization: asserts unconditionally.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// loads a tile with a logical offset AND a pointer offset
+      Index pointer_offset) const {
+    assert(0);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// tile offset; mapped to Base as {column, row}, matching add_tile_offset()
+      TensorCoord const &tile_offset,
+      /// loads a tile with a logical offset AND a byte offset
+      Index byte_offset) const {
+    iterator_.load_with_byte_offset(
+        frag, {tile_offset.column(), tile_offset.row()}, byte_offset);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    iterator_.set_kgroup_index(k_group);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Primary template of the accumulator tile iterator: declared only, specialized per layout.
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Element type
+    typename Element_,
+    /// Layout of operand in memory
+    typename Layout_,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA instructions, concept: MatrixShape)
+    typename OpDelta_>
+class MmaTensorOpAccumulatorTileIterator;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// This tile iterator is specialized for 32-thread TensorOps. It is used to load or store
+/// accumulators from memory and is agnostic to layout. It could be faster if it assumed row-major
+/// accumulator layout.
+///
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept |
+///   WriteableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Element type
+    typename Element_,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions, concept: MatrixShape)
+    typename OpDelta_>
+class MmaTensorOpAccumulatorTileIterator<
+    Shape_, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_> {
+ public:
+
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand::kC;
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::RowMajor;
+
+  /// Shape of one matrix product operation (concept: MatrixShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+  using OpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Internal structure of iterator - made public to enable introspection
+  struct Policy {
+    static bool const kDivisible =
+        !(Shape::kRow % InstructionShape::kM) &&
+            !(Shape::kColumn % InstructionShape::kN);
+
+    static_assert(platform::is_same<TensorCoord, MatrixCoord>::value,
+      "Layouts must be defined for logical MatrixCoord coordinate space.");
+
+    /// Number of mma operations performed
+    using MmaIterations = MatrixShape<
+      (Shape::kRow + InstructionShape::kM - 1) / InstructionShape::kM,
+      (Shape::kColumn + InstructionShape::kN - 1) / InstructionShape::kN
+    >;
+  };
+
+private:
+
+  // Assume accumulator tile is an arrangement of 8-by-8 tiles replicated over the entire
+  // shape, with each quad mapped to one row and each thread mapped to 1/4 of the elements
+  // of that row. The accumulators within one row are assumed to be consecutive.
+ static int const kElementsPerAccess = InstructionShape::kN / 4;
+ static int const kRowsPerTile = 8;
+ static int const kAccumulatorRows = InstructionShape::kM / kRowsPerTile;
+
+public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = Array<
+    Element, 
+    Policy::MmaIterations::kCount * InstructionShape::kMN / kThreads>;
+
+private:
+
+  /// Reference to output tensor
+  TensorRef ref_;
+
+public:
+  
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator() { }
+
+  /// Constructor from TensorRef
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator(
+    TensorRef const &ref, 
+    int lane_id
+  ):
+    ref_(ref) {
+
+    int quad = (lane_id >> 2);
+    int lane_in_quad = (lane_id & 3);
+
+    MatrixCoord lane_offset(quad, lane_in_quad * kElementsPerAccess);
+
+    ref_.add_coord_offset(lane_offset);
+  }
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator &add_pointer_offset(LongIndex offset) {
+    ref_.add_pointer_offset(offset);
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
+
+    ref_.add_coord_offset(tile_offset * make_Coord(Shape::kRow, Shape::kColumn));
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator & operator++() {
+    // deliberate no-op
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator & operator--() {
+    // deliberate no-op
+    return *this;
+  }
+
+  ///< advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpAccumulatorTileIterator & operator+=(TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  ///< advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpAccumulatorTileIterator & operator-=(TensorCoord const &tile_offset) {
+    add_tile_offset(-tile_offset);
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+    Fragment &frag,                             ///< fragment to load from the tensor
+    Index pointer_offset) const {               ///< loads a tile with a linear offset
+  
+    TensorRef offset_ref(ref_);
+    offset_ref.add_pointer_offset(pointer_offset);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
+        
+        int mma_accum_start = kAccumulatorRows * kElementsPerAccess * 
+          (mma_n * Policy::MmaIterations::kRow + mma_m);
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < kAccumulatorRows; ++row) {
+          CUTLASS_PRAGMA_UNROLL
+          for (int col = 0; col < kElementsPerAccess; ++col) {
+            int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
+                          row * kRowsPerTile;
+            int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col;
+
+            frag[mma_accum_start + row * kElementsPerAccess + col] = offset_ref.at({accum_m, accum_n});
+          }
+        }
+      }
+    }
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+    Fragment &frag,                             ///< fragment to load from the tensor
+    Index byte_offset) const {                  ///< loads a tile with a linear offset
+
+    load_with_pointer_offset(byte_offset / sizeof(Element));
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+    Fragment &frag,                             ///< fragment to load from the tensor
+    TensorCoord const &tile_offset) const {     ///< loads a tile with a logical offset in units of whole tiles
+
+    load(frag, tile_offset, 0);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+    Fragment &frag,                             ///< fragment to load from the tensor
+    TensorCoord const &tile_offset,             ///< loads a tile with a logical offset in units of whole tiles
+    Index pointer_offset) const {               ///< loads a tile with a logical offset AND a pointer offset
+
+    load_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
+  }
+
+  /// Stores a fragment to memory
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag) const {
+    store_with_pointer_offset(frag, 0);
+  }
+
+  /// Stores a fragment to memory with additional pointer offset
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(
+    Fragment const &frag,                       ///< fragment to store from the tensor
+    Index pointer_offset) const {               ///< store a tile with a linear offset
+  
+    TensorRef offset_ref(ref_);
+    offset_ref.add_pointer_offset(pointer_offset);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
+        
+        int mma_accum_start = kAccumulatorRows * kElementsPerAccess * 
+          (mma_n * Policy::MmaIterations::kRow + mma_m);
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < kAccumulatorRows; ++row) {
+          CUTLASS_PRAGMA_UNROLL
+          for (int col = 0; col < kElementsPerAccess; ++col) {
+            int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
+                          row * kRowsPerTile;
+            int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col;
+            int idx = mma_accum_start + row * kElementsPerAccess + col;
+
+            offset_ref.at({accum_m, accum_n}) = frag[idx];
+          }
+        }
+      }
+    }
+  }
+
+  /// Stores a fragment to memory with additional pointer offset
+  CUTLASS_DEVICE
+  void store_with_byte_offset(
+    Fragment const &frag,                       ///< fragment to store from the tensor
+    Index byte_offset) const {                  ///< store a tile with a linear offset
+
+    store_with_pointer_offset(byte_offset / sizeof(Element));
+  }
+
+  /// Stores a fragment to memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void store(
+    Fragment const &frag,                       ///< fragment to store to the tensor
+    TensorCoord const &tile_offset) const {     ///< stores a tile with a logical offset in units of whole tiles
+    // The fragment is only read, so it is taken by const reference like the other store overloads.
+    store(frag, tile_offset, 0);
+  }
+
+  /// Stores a fragment to memory with a logical offset in units of whole tiles plus a pointer offset.
+  CUTLASS_DEVICE
+  void store(
+      /// fragment to store to the tensor
+      Fragment const &frag,
+      /// stores a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// stores a tile with a logical offset AND a pointer offset
+      Index pointer_offset) const {
+    store_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// This tile iterator is specialized for 32-thread TensorOps. It is used to load or store
+/// accumulators from memory and is agnostic to layout.
+///
+/// This iterator is not tested.
+///
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept |
+///   WriteableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Element type
+    typename Element_,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions, concept: MatrixShape)
+    typename OpDelta_>
+class MmaTensorOpAccumulatorTileIterator<
+    Shape_, Element_, cutlass::layout::AffineRankN<2>, InstructionShape_, OpDelta_> {
+ public:
+
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand::kC;
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::RowMajor;
+
+  /// Shape of one matrix product operation (concept: MatrixShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+  using OpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Internal structure of iterator - made public to enable introspection
+  struct Policy {
+    static bool const kDivisible =
+        !(Shape::kRow % InstructionShape::kM) &&
+            !(Shape::kColumn % InstructionShape::kN);
+
+    static_assert(platform::is_same<TensorCoord, MatrixCoord>::value,
+      "Layouts must be defined for logical MatrixCoord coordinate space.");
+
+    /// Number of mma operations performed
+    using MmaIterations = MatrixShape<
+      (Shape::kRow + InstructionShape::kM - 1) / InstructionShape::kM,
+      (Shape::kColumn + InstructionShape::kN - 1) / InstructionShape::kN
+    >;
+  };
+
+private:
+
+  // Assume accumulator tile is an arrangement of 8-by-8 tiles replicated over the entire
+  // shape, with each quad mapped to one row and each thread mapped to 1/4 of the elements
+  // of that row. The accumulators within one row are assumed to be consecutive.
+ static int const kElementsPerAccess = InstructionShape::kN / 4;
+ static int const kRowsPerTile = 8;
+ static int const kAccumulatorRows = InstructionShape::kM / kRowsPerTile;
+
+public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = Array<
+    Element, 
+    Policy::MmaIterations::kCount * InstructionShape::kMN / kThreads>;
+
+private:
+
+  /// Reference to output tensor
+  TensorRef ref_;
+
+public:
+  
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator() { }
+
+  /// Builds an iterator over `ref`, pre-shifted to the first element owned by lane `lane_id`
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator(
+    TensorRef const &ref,
+    int lane_id
+  ):
+    ref_(ref) {
+
+    // Lanes are grouped four at a time: the group index selects the row, and the
+    // lane's position within its group selects a run of kElementsPerAccess columns.
+    int const row_in_tile = lane_id >> 2;
+    int const first_column = (lane_id & 3) * kElementsPerAccess;
+
+    ref_.add_coord_offset(MatrixCoord(row_in_tile, first_column));
+  }
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator &add_pointer_offset(LongIndex offset) {
+    ref_.add_pointer_offset(offset);
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
+
+    ref_.add_coord_offset(tile_offset * make_Coord(Shape::kRow, Shape::kColumn));
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator & operator++() {
+    // deliberate no-op
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator & operator--() {
+    // deliberate no-op
+    return *this;
+  }
+
+  ///< advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpAccumulatorTileIterator & operator+=(TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  ///< advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpAccumulatorTileIterator & operator-=(TensorCoord const &tile_offset) {
+    add_tile_offset(-tile_offset);
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Loads a fragment from memory at the iterator's location plus a linear pointer offset
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+    Fragment &frag,                             ///< fragment that receives the loaded elements
+    Index pointer_offset) const {               ///< linear offset added to the reference before loading
+    // Work on a copy so that the iterator's own reference (ref_) is left unchanged.
+    TensorRef offset_ref(ref_);
+    offset_ref.add_pointer_offset(pointer_offset);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
+        // First fragment index of this MMA tile; tiles are numbered with mma_m varying fastest.
+        int mma_accum_start = kAccumulatorRows * kElementsPerAccess * 
+          (mma_n * Policy::MmaIterations::kRow + mma_m);
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < kAccumulatorRows; ++row) {
+          CUTLASS_PRAGMA_UNROLL
+          for (int col = 0; col < kElementsPerAccess; ++col) {
+            int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
+                          row * kRowsPerTile;
+            int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col;
+            // Coordinates are relative to the per-lane offset applied to ref_ in the constructor.
+            frag[mma_accum_start + row * kElementsPerAccess + col] = offset_ref.at({accum_m, accum_n});
+          }
+        }
+      }
+    }
+  }
+
+  /// Loads a fragment from memory with an additional offset given in bytes
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+    Fragment &frag,                             ///< fragment that receives the loaded elements
+    Index byte_offset) const {                  ///< linear offset in bytes, converted to elements below
+    // Forward the fragment too: load_with_pointer_offset takes (frag, pointer_offset).
+    load_with_pointer_offset(frag, byte_offset / sizeof(Element));
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+    Fragment &frag,                             ///< fragment to load from the tensor
+    TensorCoord const &tile_offset) const {     ///< loads a tile with a logical offset in units of whole tiles
+
+    load(frag, tile_offset, 0);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+    Fragment &frag,                             ///< fragment to load from the tensor
+    TensorCoord const &tile_offset,             ///< loads a tile with a logical offset in units of whole tiles
+    Index pointer_offset) const {               ///< loads a tile with a logical offset AND a pointer offset
+
+    load_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
+  }
+
+  /// Stores a fragment to memory
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag) const {
+    store_with_pointer_offset(frag, 0);
+  }
+
+  /// Stores a fragment to memory with additional pointer offset
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(
+    Fragment const &frag,                       ///< fragment to store from the tensor
+    Index pointer_offset) const {               ///< store a tile with a linear offset
+  
+    TensorRef offset_ref(ref_);
+    offset_ref.add_pointer_offset(pointer_offset);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
+        
+        int mma_accum_start = kAccumulatorRows * kElementsPerAccess * 
+          (mma_n * Policy::MmaIterations::kRow + mma_m);
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < kAccumulatorRows; ++row) {
+          CUTLASS_PRAGMA_UNROLL
+          for (int col = 0; col < kElementsPerAccess; ++col) {
+            int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
+                          row * kRowsPerTile;
+            int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col;
+            int idx = mma_accum_start + row * kElementsPerAccess + col;
+
+            offset_ref.at({accum_m, accum_n}) = frag[idx];
+          }
+        }
+      }
+    }
+  }
+
+  /// Stores a fragment to memory with an additional offset given in bytes
+  CUTLASS_DEVICE
+  void store_with_byte_offset(
+    Fragment const &frag,                       ///< fragment whose elements are written to the tensor
+    Index byte_offset) const {                  ///< linear offset in bytes, converted to elements below
+    // Forward the fragment too: store_with_pointer_offset takes (frag, pointer_offset).
+    store_with_pointer_offset(frag, byte_offset / sizeof(Element));
+  }
+
+  /// Stores a fragment to memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void store(
+    Fragment const &frag,                       ///< fragment to store to the tensor
+    TensorCoord const &tile_offset) const {     ///< stores a tile with a logical offset in units of whole tiles
+    // The fragment is only read, so it is taken by const reference like the other store overloads.
+    store(frag, tile_offset, 0);
+  }
+
+  /// Stores a fragment to memory with a logical offset in units of whole tiles plus a pointer offset.
+  CUTLASS_DEVICE
+  void store(
+      /// fragment to store to the tensor
+      Fragment const &frag,
+      /// stores a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// stores a tile with a logical offset AND a pointer offset
+      Index pointer_offset) const {
+    store_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// This tile iterator is specialized for 32-thread TensorOps. It is used to load or store
+/// accumulators from memory and is agnostic to layout. It could be faster if it assumed row-major
+/// accumulator layout.
+///
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept |
+///   WriteableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Element type
+    typename Element_,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions, concept: MatrixShape)
+    typename OpDelta_>
+class MmaTensorOpAccumulatorTileIterator<Shape_, Element_,
+                                         cutlass::layout::ColumnMajor,
+                                         InstructionShape_, OpDelta_> {
+ public:
+
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand::kC;
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::ColumnMajor;
+
+  /// Shape of one matrix product operation (concept: MatrixShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+  using OpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Internal structure of iterator - made public to enable introspection
+  struct Policy {
+    static bool const kDivisible = 
+        !(Shape::kRow % InstructionShape::kM) &&
+            !(Shape::kColumn % InstructionShape::kN);
+
+    static_assert(platform::is_same<TensorCoord, MatrixCoord>::value,
+      "Layouts must be defined for logical MatrixCoord coordinate space.");
+
+    /// Number of mma operations performed
+    using MmaIterations = MatrixShape<
+      (Shape::kRow + InstructionShape::kM - 1) / InstructionShape::kM,
+      (Shape::kColumn + InstructionShape::kN - 1) / InstructionShape::kN
+    >;
+  };
+
+private:
+
+  // Assume accumulator tile is an arrangement of 8-by-8 tiles replicated over the entire
+  // shape, with each quad mapped to one row and each thread mapped to 1/4 of the elements
+  // of that row. The accumulators within one row are assumed to be consecutive.
+ static int const kElementsPerAccess = InstructionShape::kN / 4;
+ static int const kRowsPerTile = 8;
+ static int const kAccumulatorRows = InstructionShape::kM / kRowsPerTile;
+
+public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = Array<Element, 
+    Policy::MmaIterations::kCount * InstructionShape::kMN / kThreads>;
+
+private:
+
+  /// Reference to output tensor
+  TensorRef ref_;
+
+public:
+  
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator() { }
+
+  /// Constructs the iterator from a TensorRef and applies this lane's starting coordinate
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator(
+    TensorRef const &ref,
+    int lane_id
+  ):
+    ref_(ref) {
+
+    // lane_id splits into a quad index (upper bits) and a position inside the quad (low two bits).
+    int const quad_index = lane_id >> 2;
+    int const position_in_quad = lane_id & 3;
+
+    // Row comes from the quad; the column advances kElementsPerAccess per lane within the quad.
+    ref_.add_coord_offset(MatrixCoord(quad_index, position_in_quad * kElementsPerAccess));
+  }
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator &add_pointer_offset(LongIndex offset) {
+    ref_.add_pointer_offset(offset);
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
+
+    ref_.add_coord_offset(tile_offset * make_Coord(Shape::kRow, Shape::kColumn));
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator & operator++() {
+    // deliberate no-op
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator & operator--() {
+    // deliberate no-op
+    return *this;
+  }
+
+  ///< advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpAccumulatorTileIterator & operator+=(TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  ///< advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpAccumulatorTileIterator & operator-=(TensorCoord const &tile_offset) {
+    add_tile_offset(-tile_offset);
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Loads a fragment from memory with an additional linear pointer offset
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+    Fragment &frag,                             ///< fragment to load from the tensor
+    Index pointer_offset) const {               ///< loads a tile with a linear offset
+  
+    TensorRef offset_ref(ref_);
+    offset_ref.add_pointer_offset(pointer_offset);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
+        
+        int mma_accum_start = kAccumulatorRows * kElementsPerAccess * 
+          (mma_n * Policy::MmaIterations::kRow + mma_m);
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < kAccumulatorRows; ++row) {
+          CUTLASS_PRAGMA_UNROLL
+          for (int col = 0; col < kElementsPerAccess; ++col) {
+            int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
+                          row * kRowsPerTile;
+            int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col;
+            int idx = mma_accum_start + row * kElementsPerAccess + col;
+
+            frag[idx] = offset_ref.at({accum_m, accum_n});
+          }
+        }
+      }
+    }
+  }
+
+  /// Loads a fragment from memory with an additional offset given in bytes
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+    Fragment &frag,                             ///< fragment that receives the loaded elements
+    Index byte_offset) const {                  ///< linear offset in bytes, converted to elements below
+    // Forward the fragment too: load_with_pointer_offset takes (frag, pointer_offset).
+    load_with_pointer_offset(frag, byte_offset / sizeof(Element));
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+    Fragment &frag,                             ///< fragment to load from the tensor
+    TensorCoord const &tile_offset) const {     ///< loads a tile with a logical offset in units of whole tiles
+
+    load(frag, tile_offset, 0);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+    Fragment &frag,                             ///< fragment to load from the tensor
+    TensorCoord const &tile_offset,             ///< loads a tile with a logical offset in units of whole tiles
+    Index pointer_offset) const {               ///< loads a tile with a logical offset AND a pointer offset
+
+    load_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
+  }
+
+  /// Stores a fragment to memory
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag) const {
+    store_with_pointer_offset(frag, 0);
+  }
+
+  /// Stores a fragment to memory with additional pointer offset
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(
+    Fragment const &frag,                       ///< fragment to store from the tensor
+    Index pointer_offset) const {               ///< store a tile with a linear offset
+  
+    TensorRef offset_ref(ref_);
+    offset_ref.add_pointer_offset(pointer_offset);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
+        
+        int mma_accum_start = kAccumulatorRows * kElementsPerAccess * 
+          (mma_n * Policy::MmaIterations::kRow + mma_m);
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < kAccumulatorRows; ++row) {
+          CUTLASS_PRAGMA_UNROLL
+          for (int col = 0; col < kElementsPerAccess; ++col) {
+            int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
+                          row * kRowsPerTile;
+            int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col;
+            int idx = mma_accum_start + row * kElementsPerAccess + col;
+            
+            offset_ref.at({accum_m, accum_n}) = frag[idx];
+          }
+        }
+      }
+    }
+  }
+
+  /// Stores a fragment to memory with an additional offset given in bytes
+  CUTLASS_DEVICE
+  void store_with_byte_offset(
+    Fragment const &frag,                       ///< fragment whose elements are written to the tensor
+    Index byte_offset) const {                  ///< linear offset in bytes, converted to elements below
+    // Forward the fragment too: store_with_pointer_offset takes (frag, pointer_offset).
+    store_with_pointer_offset(frag, byte_offset / sizeof(Element));
+  }
+
+  /// Stores a fragment to memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void store(
+    Fragment const &frag,                       ///< fragment to store to the tensor
+    TensorCoord const &tile_offset) const {     ///< stores a tile with a logical offset in units of whole tiles
+    // The fragment is only read, so it is taken by const reference like the other store overloads.
+    store(frag, tile_offset, 0);
+  }
+
+  /// Stores a fragment to memory with a logical offset in units of whole tiles plus a pointer offset.
+  CUTLASS_DEVICE
+  void store(
+      /// fragment to store to the tensor
+      Fragment const &frag,
+      /// stores a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// stores a tile with a logical offset AND a pointer offset
+      Index pointer_offset) const {
+    store_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// This tile iterator is specialized for 32-thread TensorOps. It is used to load or store
+/// accumulators from memory and is agnostic to layout. It could be faster if it assumed row-major
+/// accumulator layout.
+///
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept |
+///   WriteableRandomAccessContiguousTileIteratorConcept
+///
+
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Element type
+    typename Element_,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions, concept: MatrixShape)
+    typename OpDelta_,
+    /// Interleaved N
+    int InterleavedN>
+class MmaTensorOpAccumulatorTileIterator<
+    Shape_, Element_, cutlass::layout::ColumnMajorInterleaved<InterleavedN>,
+    InstructionShape_, OpDelta_> {
+ public:
+
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand::kC;
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::ColumnMajorInterleaved<InterleavedN>;
+
+  /// Shape of one matrix product operation (concept: MatrixShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+  using OpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Internal structure of iterator - made public to enable introspection
+  struct Policy {
+    static_assert(
+        !(Shape::kRow % InstructionShape::kM) &&
+            !(Shape::kColumn % InstructionShape::kN),
+        "Shape of warp-level Mma must be divisible by operator shape.");
+
+    static_assert(platform::is_same<TensorCoord, MatrixCoord>::value,
+      "Layouts must be defined for logical MatrixCoord coordinate space.");
+
+    /// Number of mma operations performed
+    using MmaIterations = MatrixShape<Shape::kRow / InstructionShape::kM,
+                                      Shape::kColumn / InstructionShape::kN>;
+  };
+
+private:
+
+  static int const kElementsPerAccess = 2;
+
+public:
+
+  //
+  // Derived quantities
+  //
+
+  using AccessType = Array<Element, kElementsPerAccess>;
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = Array<Element, Shape::kCount / kThreads>;
+
+private:
+
+  /// Reference to output tensor
+  TensorRef ref_;
+
+public:
+  
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator() { }
+
+  /// Constructor from TensorRef
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator(
+    TensorRef const &ref, 
+    int lane_id
+  ):
+    ref_(ref) {
+
+    int quad = (lane_id >> 2);
+    int lane_in_quad = (lane_id & 3);
+
+    MatrixCoord lane_offset(quad, lane_in_quad * kElementsPerAccess);
+
+    ref_.add_coord_offset(lane_offset);
+  }
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator &add_pointer_offset(LongIndex offset) {
+    ref_.add_pointer_offset(offset);
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
+
+    ref_.add_coord_offset(tile_offset * make_Coord(Shape::kRow, Shape::kColumn));
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator & operator++() {
+    // deliberate no-op
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator & operator--() {
+    // deliberate no-op
+    return *this;
+  }
+
+  ///< advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpAccumulatorTileIterator & operator+=(TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  ///< advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpAccumulatorTileIterator & operator-=(TensorCoord const &tile_offset) {
+    add_tile_offset(-tile_offset);
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Loads a fragment from memory at the iterator's location plus a linear pointer offset
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+    Fragment &frag,                             ///< fragment that receives the loaded elements
+    Index pointer_offset) const {               ///< linear offset added to the reference before loading
+    // Work on a copy so that the iterator's own reference (ref_) is left unchanged.
+    TensorRef offset_ref(ref_);
+    offset_ref.add_pointer_offset(pointer_offset);
+    // View the fragment as an array of AccessType (kElementsPerAccess elements each).
+    AccessType* frag_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
+        int accum_m = mma_m * InstructionShape::kM;
+        int accum_n = mma_n * InstructionShape::kN;
+        // One AccessType slot per MMA tile, numbered with mma_m varying fastest.
+        int idx = mma_m + mma_n * Policy::MmaIterations::kRow;
+
+        AccessType* access_ptr = reinterpret_cast<AccessType *>(offset_ref.data() +
+          offset_ref.offset(TensorCoord(accum_m, accum_n)));
+        // Copy one whole AccessType from memory into the fragment.
+        frag_ptr[idx] = access_ptr[0];
+      }
+    }
+  }
+
+  /// Loads a fragment from memory with an additional offset given in bytes
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+    Fragment &frag,                             ///< fragment that receives the loaded elements
+    Index byte_offset) const {                  ///< linear offset in bytes, converted to elements below
+    // Forward the fragment too: load_with_pointer_offset takes (frag, pointer_offset).
+    load_with_pointer_offset(frag, byte_offset / sizeof(Element));
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+    Fragment &frag,                             ///< fragment to load from the tensor
+    TensorCoord const &tile_offset) const {     ///< loads a tile with a logical offset in units of whole tiles
+
+    load(frag, tile_offset, 0);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+    Fragment &frag,                             ///< fragment to load from the tensor
+    TensorCoord const &tile_offset,             ///< loads a tile with a logical offset in units of whole tiles
+    Index pointer_offset) const {               ///< loads a tile with a logical offset AND a pointer offset
+
+    load_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
+  }
+
+  /// Stores a fragment to memory
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag) const {
+    store_with_pointer_offset(frag, 0);
+  }
+
+  /// Stores a fragment to memory with additional pointer offset
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(
+    Fragment const &frag,                       ///< fragment to store from the tensor
+    Index pointer_offset) const {               ///< store a tile with a linear offset
+  
+    TensorRef offset_ref(ref_);
+    offset_ref.add_pointer_offset(pointer_offset);
+
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const*>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
+        int accum_m = mma_m * InstructionShape::kM;
+        int accum_n = mma_n * InstructionShape::kN;
+
+        int idx = mma_m + mma_n * Policy::MmaIterations::kRow;
+
+        AccessType* access_ptr = reinterpret_cast<AccessType *>(offset_ref.data() +
+                                 offset_ref.offset(TensorCoord(accum_m, accum_n)));
+
+        access_ptr[0] = frag_ptr[idx];               
+      }
+    }
+  }
+
+  /// Stores a fragment to memory with an additional offset given in bytes
+  CUTLASS_DEVICE
+  void store_with_byte_offset(
+    Fragment const &frag,                       ///< fragment whose elements are written to the tensor
+    Index byte_offset) const {                  ///< linear offset in bytes, converted to elements below
+    // Forward the fragment too: store_with_pointer_offset takes (frag, pointer_offset).
+    store_with_pointer_offset(frag, byte_offset / sizeof(Element));
+  }
+
+  /// Stores a fragment to memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void store(
+    Fragment const &frag,                       ///< fragment to store to the tensor
+    TensorCoord const &tile_offset) const {     ///< stores a tile with a logical offset in units of whole tiles
+    // The fragment is only read, so it is taken by const reference like the other store overloads.
+    store(frag, tile_offset, 0);
+  }
+
+  /// Stores a fragment to memory with a logical offset in units of whole tiles plus a pointer offset.
+  CUTLASS_DEVICE
+  void store(
+      /// fragment to store to the tensor
+      Fragment const &frag,
+      /// stores a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// stores a tile with a logical offset AND a pointer offset
+      Index pointer_offset) const {
+    store_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// This tile iterator is specialized for 32-thread TensorOps. It is used to load or store
+/// accumulators from memory and is agnostic to layout. It could be faster if it assumed row-major
+/// accumulator layout.
+///
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept |
+///   WriteableRandomAccessContiguousTileIteratorConcept
+///
+
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Element type (unused here: this specialization fixes Element to int8_t)
+    typename Element_,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions, concept: MatrixShape)
+    typename OpDelta_,
+    /// Interleaved N
+    int InterleavedN>
+class MmaTensorOpAccumulatorTileIterator<
+    Shape_, Element_, cutlass::layout::TensorNCxHWx<InterleavedN>,
+    InstructionShape_, OpDelta_> {
+ public:
+
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand::kC;
+
+  /// Element type
+  using Element = int8_t;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::TensorNCxHWx<InterleavedN>;
+
+  /// Shape of one matrix product operation (concept: MatrixShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+  using OpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Stride index type
+  using StrideIndex = typename TensorRef::Layout::Stride::Index;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Internal structure of iterator - made public to enable introspection
+  struct Policy {
+    static_assert(
+        !(Shape::kRow % InstructionShape::kM) &&
+            !(Shape::kColumn % InstructionShape::kN),
+        "Shape of warp-level Mma must be divisible by operator shape.");
+
+    /// Number of elements in strided dimension that each STG writes; used as the row step between store iterations
+    static int const kStridedPerSTG = 8;
+
+    /// Factor used to calculate the reorder index that packs accumulators (Shape::kColumn / 32)
+    static int const kPackedFactor = Shape::kColumn / 32;
+
+    /// Store iteration counts: rows in steps of kStridedPerSTG, columns in steps of InterleavedN
+    using MmaIterations = MatrixShape<Shape::kRow / kStridedPerSTG,
+                                      Shape::kColumn / InterleavedN>;
+  };
+
+private:
+
+  static int const kElementsPerAccess = InterleavedN / 4;
+
+public:
+
+  //
+  // Derived quantities
+  //
+
+  struct alignas((kElementsPerAccess * sizeof_bits<Element>::value / 8)) AccessType {
+      Array<Element, kElementsPerAccess> storage;
+  };
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = Array<int32_t, Shape::kCount / kThreads>;
+
+private:
+
+  /// Reference to output tensor
+  TensorRef ref_;
+
+  /// Row offset index globally
+  LongIndex global_offset_row_;
+
+  /// Column offset index globally
+  LongIndex global_offset_col_;
+
+  /// Output tensor size
+  TensorCoord extent_;
+
+  /// Alpha 
+  float alpha_;
+
+  /// Beta
+  float beta_;
+
+public:
+  
+  /// Default ctor constructs null iterator (no member is explicitly initialized here)
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator() { }
+
+  /// Builds the iterator for one lane of the warp.
+  ///
+  /// The lane's quad index (lane_id >> 2) becomes its starting row and its
+  /// position within the quad (lane_id & 3) selects a column group that is
+  /// kElementsPerAccess elements wide.
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator(
+    TensorRef const &ref,
+    int const lane_id,
+    TensorCoord extent,
+    float alpha = 1.0f,
+    float beta = 0.0f
+  ):
+    ref_(ref),
+    global_offset_row_(lane_id >> 2),
+    global_offset_col_((lane_id & 3) * kElementsPerAccess),
+    extent_(extent),
+    alpha_(alpha),
+    beta_(beta) {
+    // all state is set up by the member initializers above
+  }
+
+  /// Adds a pointer offset to the underlying TensorRef to advance through memory; returns *this
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator &add_pointer_offset(LongIndex offset) {
+    ref_.add_pointer_offset(offset);
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator &add_tile_offset(MatrixCoord const &tile_offset) {
+    // NOTE(review): operator+= / operator-= pass a TensorCoord here — confirm it converts to MatrixCoord
+    global_offset_row_ += tile_offset.row() * Shape::kRow;
+    // only the logical row/column offsets move; ref_ itself is not modified
+    global_offset_col_ += tile_offset.column() * Shape::kColumn;
+
+    return *this;
+  }
+
+  /// Pre-increment is intentionally a no-op for this iterator; returns *this unchanged
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator & operator++() {
+    // deliberate no-op
+    return *this;
+  }
+
+  /// Pre-decrement is intentionally a no-op for this iterator; returns *this unchanged
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator & operator--() {
+    // deliberate no-op
+    return *this;
+  }
+
+  /// Advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpAccumulatorTileIterator & operator+=(TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  /// Moves back in units of whole tiles along the logical coordinate space (adds the negated offset)
+  CUTLASS_DEVICE
+  MmaTensorOpAccumulatorTileIterator & operator-=(TensorCoord const &tile_offset) {
+    add_tile_offset(-tile_offset);
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator (no extra offset).
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    load_with_pointer_offset(frag, 0);  // explicit zero: the callee declares no default for pointer_offset
+  }
+
+  /// Loads a fragment from memory with an additional linear pointer offset
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+    Fragment &frag,                             ///< fragment to load from the tensor
+    Index pointer_offset) const {               ///< loads a tile with a linear offset
+    // Work on a copy of the reference so the iterator itself stays unchanged.
+    TensorRef offset_ref(ref_);
+    offset_ref.add_pointer_offset(pointer_offset);
+    // NOTE(review): frag holds int32_t but is written through AccessType (int8_t elements) — confirm intended
+    AccessType* frag_ptr = reinterpret_cast<AccessType *>(&frag);
+    // NOTE(review): store_with_pointer_offset indexes MmaIterations with kRow/kColumn; kN/kM here differ — confirm
+    CUTLASS_PRAGMA_UNROLL
+    for (int mma_n = 0; mma_n < Policy::MmaIterations::kN; ++mma_n) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int mma_m = 0; mma_m < Policy::MmaIterations::kM; ++mma_m) {
+        int accum_m = mma_m * InstructionShape::kM;
+        int accum_n = mma_n * InstructionShape::kN;
+
+        int idx = mma_m + mma_n * Policy::MmaIterations::kM;
+
+        AccessType* access_ptr = reinterpret_cast<AccessType *>(offset_ref.data() +
+                                 accum_m * offset_ref.stride(0) + accum_n);
+
+        frag_ptr[idx] = access_ptr[0];
+      }
+    }
+  }
+
+  /// Loads a fragment from memory with an additional offset given in bytes
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+    Fragment &frag,                             ///< fragment to load from the tensor
+    Index byte_offset) const {                  ///< linear offset in bytes, converted to units of Element
+    // Forward the fragment as well: load_with_pointer_offset takes (frag, pointer_offset).
+    load_with_pointer_offset(frag, byte_offset / sizeof(Element));
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+    Fragment &frag,                             ///< fragment to load from the tensor
+    TensorCoord const &tile_offset) const {     ///< loads a tile with a logical offset in units of whole tiles
+    // Delegates to the three-argument overload with no extra pointer offset.
+    load(frag, tile_offset, 0);
+  }
+
+  /// Loads a fragment from memory at a logical tile offset plus an additional pointer offset.
+  CUTLASS_DEVICE
+  void load(
+    Fragment &frag,                             ///< fragment to load from the tensor
+    TensorCoord const &tile_offset,             ///< loads a tile with a logical offset in units of whole tiles
+    Index pointer_offset) const {               ///< additional linear offset added to the offset of tile_offset
+    // ref_.offset() turns the coordinate into a linear offset before the extra offset is added
+    load_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
+  }
+
+  /// Stores a fragment to memory at the iterator's current location (pointer offset of zero)
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag) const {
+    store_with_pointer_offset(frag, 0);
+  }
+
+  /// Scales the accumulators by alpha (adding beta * destination when beta != 0), narrows them to Element and stores them with an additional pointer offset
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(
+    Fragment const &frag,                       ///< fragment to store from the tensor
+    Index pointer_offset) const {               ///< store a tile with a linear offset
+    // Work on a copy of the reference so the iterator itself stays unchanged.
+    TensorRef offset_ref(ref_);
+    offset_ref.add_pointer_offset(pointer_offset);
+    // float staging copy of the accumulators, and the narrowed Element values that are actually written
+    Array<float, Shape::kCount / kThreads> output_frag_f;
+    Array<Element, Shape::kCount / kThreads> output_frag;
+    // extent_ is read through n()/h()/w()/c(): rows are flattened as n * (h * w), columns are c
+    LongIndex pq = extent_.h() * extent_.w();
+
+    LongIndex extent_row = extent_.n() * pq;
+    LongIndex extent_col = extent_.c();
+    // split this lane's column into an interleave group (k_major) and a position inside the group (k_minor)
+    LongIndex k_major = (global_offset_col_ / InterleavedN) * pq;
+    Index k_minor = global_offset_col_ % InterleavedN;
+    LongIndex k_offset = k_major * InterleavedN + k_minor;
+    LongIndex k_offset_delta = pq * InterleavedN;
+
+    LongIndex stride_n = pq * extent_.c();
+
+    Index n;
+    LongIndex pq_rem;
+    // pq_mul / pq_shr are produced by find_divisor and consumed by fast_divmod below
+    unsigned int pq_mul, pq_shr;
+    find_divisor(pq_mul, pq_shr, pq);
+    // beta == 0: the destination is never read; values are only scaled by alpha
+    if(beta_ == 0.0f) {
+      CUTLASS_PRAGMA_UNROLL
+      for(int i = 0; i < int(frag.size()); ++i) {
+        output_frag_f[i] = frag[i];
+      }
+      // accumulators are used in order when the instruction's M equals kStridedPerSTG, otherwise permuted via map_i
+      if(InstructionShape::kM == Policy::kStridedPerSTG) {
+        CUTLASS_PRAGMA_UNROLL
+        for(int i = 0; i < int(frag.size()); ++i) {
+          output_frag[i] = (Element)(output_frag_f[i] * alpha_);
+        }
+      } else {
+        CUTLASS_PRAGMA_UNROLL
+        for(int i = 0; i < int(frag.size()); ++i) {
+          int map_i = (i / (16 * Policy::kPackedFactor)) * (16 * Policy::kPackedFactor)
+                    + (i % (8 * Policy::kPackedFactor)) / 2 * 4
+                    + (i % (8 * Policy::kPackedFactor)) % 2
+                    + (i / (8 * Policy::kPackedFactor)) % 2 * 2;
+          output_frag[i] = (Element)(output_frag_f[map_i] * alpha_);
+        }
+      }
+
+      AccessType const *frag_ptr = reinterpret_cast<AccessType const*>(&output_frag);
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
+        int accum_m = mma_m * Policy::kStridedPerSTG;
+        // presumably n / pq_rem receive quotient and remainder of the global row divided by pq — confirm against fast_divmod
+        fast_divmod(n, pq_rem, global_offset_row_ + accum_m, pq, pq_mul, pq_shr);
+        LongIndex offset_m = n * stride_n + k_offset + pq_rem * InterleavedN;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
+          // column of this access relative to the lane's starting column
+          int accum_n = mma_n * InterleavedN;
+
+          int idx = mma_n + mma_m * Policy::MmaIterations::kColumn;
+          // skip accesses that fall outside the flattened row extent or the channel extent
+          if((global_offset_row_ + accum_m < extent_row) && (global_offset_col_ + accum_n < extent_col)) {
+            AccessType* access_ptr = reinterpret_cast<AccessType *>(offset_ref.data() +
+                                                                    offset_m + mma_n * k_offset_delta);
+
+            access_ptr[0] = frag_ptr[idx];
+          }
+        }
+      }
+    } else {
+      if(InstructionShape::kM == Policy::kStridedPerSTG) {
+        CUTLASS_PRAGMA_UNROLL
+        for(int i = 0; i < int(frag.size()); ++i) {
+          output_frag_f[i] = frag[i];
+        }
+      } else {
+        CUTLASS_PRAGMA_UNROLL
+        for(int i = 0; i < int(frag.size()); ++i) {
+          int map_i = (i / (16 * Policy::kPackedFactor)) * (16 * Policy::kPackedFactor)
+                    + (i % (8 * Policy::kPackedFactor)) / 2 * 4
+                    + (i % (8 * Policy::kPackedFactor)) % 2
+                    + (i / (8 * Policy::kPackedFactor)) % 2 * 2;
+          output_frag_f[i] = frag[map_i];
+        }
+      }
+      // frag_ptr aliases output_frag, which is only filled inside the loop below, right before each store
+      AccessType const *frag_ptr = reinterpret_cast<AccessType const*>(&output_frag);
+
+      Array<Element, kElementsPerAccess> ref_frag;
+      AccessType *ref_frag_ptr = reinterpret_cast<AccessType *>(&ref_frag);
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
+        int accum_m = mma_m * Policy::kStridedPerSTG;
+
+        fast_divmod(n, pq_rem, global_offset_row_ + accum_m, pq, pq_mul, pq_shr);
+        LongIndex offset_m = n * stride_n + k_offset + pq_rem * InterleavedN;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
+
+          int accum_n = mma_n * InterleavedN;
+
+          int idx = mma_n + mma_m * Policy::MmaIterations::kColumn;
+
+          if((global_offset_row_ + accum_m < extent_row) && (global_offset_col_ + accum_n < extent_col)) {
+            AccessType* access_ptr = reinterpret_cast<AccessType *>(offset_ref.data() +
+                                                                    offset_m + mma_n * k_offset_delta);
+            // read the current destination values so they can be blended in with beta
+            ref_frag_ptr[0] = access_ptr[0];
+
+            CUTLASS_PRAGMA_UNROLL
+            for(int i = 0; i < kElementsPerAccess; ++i) {
+              output_frag[idx * kElementsPerAccess + i] = Element(alpha_ * output_frag_f[idx * kElementsPerAccess + i]
+                                                                + beta_ * ref_frag[i]);
+            }
+
+            access_ptr[0] = frag_ptr[idx];
+          }
+        }
+      }
+    }
+  }
+
+  /// Stores a fragment to memory with an additional offset given in bytes
+  CUTLASS_DEVICE
+  void store_with_byte_offset(
+    Fragment const &frag,                       ///< fragment to store to the tensor
+    Index byte_offset) const {                  ///< linear offset in bytes, converted to units of Element
+    // Forward the fragment as well: store_with_pointer_offset takes (frag, pointer_offset).
+    store_with_pointer_offset(frag, byte_offset / sizeof(Element));
+  }
+
+  /// Stores a fragment to memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void store(
+    Fragment const &frag,                       ///< fragment to store to the tensor (read only)
+    TensorCoord const &tile_offset) const {     ///< stores a tile with a logical offset in units of whole tiles
+    // Delegates to the three-argument overload with no extra pointer offset.
+    store(frag, tile_offset, 0);
+  }
+
+  /// Stores a fragment to memory at a logical tile offset plus an additional pointer offset.
+  CUTLASS_DEVICE
+  void store(
+      /// fragment to store to the tensor
+      Fragment const &frag,
+      /// stores a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// additional linear offset added to the offset of tile_offset
+      Index pointer_offset) const {
+    store_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h
new file mode 100755
index 000000000..bcac335f2
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h
@@ -0,0 +1,3098 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines iterators used by warp-level matrix multiply operations targeting Tensor Cores.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/array.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor_op_multiplicand_sm70.h"
+
+#include "cutlass/platform/platform.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace warp {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Operand identity
+    Operand Operand,
+    /// Data type of A elements
+    typename Element_,
+    /// Layout of operand
+    typename Layout_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Delta between *MMA operations (in units of *MMA operations, concept:
+    /// MatrixShape)
+    int OpDelta_,
+    /// Number of threads participating in one matrix operation
+    int Threads>
+class MmaVoltaTensorOpMultiplicandTileIterator;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// This tile iterator is specialized for 32-thread TensorOps.
+///
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: PitchLinearShape)
+    typename Shape_,
+    /// Data type of elements
+    typename Element_,
+    /// Shape of one matrix product operation (concept: PitchLinearShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions)
+    int OpDelta_>
+class MmaVoltaTensorOpMultiplicandTileIterator<
+    Shape_, Operand::kA, Element_,
+    cutlass::layout::VoltaTensorOpMultiplicandCongruous<
+        sizeof_bits<Element_>::value>,
+    InstructionShape_, OpDelta_, 32> {
+ public:
+
+  /// Shape of tile to load (concept: PitchLinearShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand::kA;
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::VoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>;
+
+  /// Shape of one matrix product operation (concept: GemmShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+  static int const kOpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Stride index type
+  using StrideIndex = typename TensorRef::Layout::Stride::Index;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Internal structure of iterator - made public to enable introspection
+  struct Policy {
+    static_assert(
+        !(Shape::kContiguous % InstructionShape::kContiguous),
+        "Shape of warp-level Mma must be divisible by operator shape.");
+
+    // Logical shape covered by one individual LDS.128
+    // TODO: 32 and 4 are hardcoded, 32-by-4 is logical shape
+    using LdsShape = layout::PitchLinearShape<
+      32,
+      4
+    >;
+
+    // LdsShapes are arranged in the strided direction in SMEM, so the first argument is derived from the strided extents
+    using LdsIterations = layout::PitchLinearShape<
+      InstructionShape::kStrided / LdsShape::kStrided,
+      Shape::kContiguous / LdsShape::kContiguous
+    >;
+  };
+
+private:
+
+  /// Not working on this feature at the moment.
+  static_assert(kOpDelta == 1,
+    "Alternative arrangements not supported at present.");
+
+  /// Number of internal pointers needed to reference shared memory
+  static int const kPointerCount = 2;
+
+  /// Pointer type used for accesses
+  using AccessType = AlignedArray<Element, Layout::kElementsPerAccess>;
+
+public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+ using Fragment = Array<Element, Shape::kContiguous *
+                                     InstructionShape::kStrided / kThreads * 2>;
+
+private:
+
+  /// Layout object storing stride values
+  StrideIndex stride_;
+
+  /// Shared memory base pointers - not advanced
+  AccessType const *pointer_[kPointerCount];
+
+  /// Byte offset incremented as iterator advances
+  Index byte_offset_;
+
+public:
+
+  /// Default ctor constructs null iterator (zero stride and byte offset; pointer_ entries are not initialized)
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator(): stride_(0), byte_offset_(0) { }
+
+  /// Builds the iterator for one lane: derives the lane's swizzled base pointers from lane_id
+  CUTLASS_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator(
+    TensorRef const &ref,
+    int lane_id
+  ):
+    stride_(ref.stride(0) / Layout::kElementsPerAccess), byte_offset_(0) {
+    // swizzle patterns for operandA LDS are
+    // 1. (tid[4] << 3) | (tid[2:0] ^ tid[4])
+    // 2. (tid[4] << 3) | (tid[2:0] ^ tid[4] ^ 0b10010)
+
+    int const lane_bit4 = (lane_id >> 4);        // tid[4]
+    int const lane_bit2 = ((lane_id & 4) >> 2);  // tid[2]
+    int const lane_low2 = (lane_id & 3);         // tid[1:0]
+
+    // all per-lane pointers are expressed relative to the start of the tile
+    AccessType const *tile_base = reinterpret_cast<AccessType const *>(ref.data());
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int p = 0; p < kPointerCount; ++p) {
+
+      // every pointer after the first one additionally has bit 1 of the row set
+      int const row = (p >= 1) ? (lane_bit4 | 2) : lane_bit4;
+      int const column = (lane_bit2 << 2) | (lane_low2 ^ row);
+
+      pointer_[p] = tile_base + column + row * stride_;
+    }
+
+  }
+
+  /// Adds a pointer offset, given in units of Element, to the iterator's running byte offset
+  CUTLASS_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
+    // the offset is accumulated in bytes; the base pointers themselves are not advanced
+    byte_offset_ += offset * sizeof(Element);
+
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
+    // NOTE(review): declared HOST_DEVICE but calls add_pointer_offset, which is CUTLASS_DEVICE — confirm
+    int contiguous_offset = tile_offset.contiguous();
+    int strided_offset = tile_offset.strided();
+
+    // To support 32x32 tile size: an odd contiguous offset swaps the two base pointers and only the even part is applied as an offset
+    if (Shape::kContiguous == Policy::LdsShape::kContiguous) {
+      if (contiguous_offset % 2) {
+        AccessType const *tmp_pointer = pointer_[0];
+        pointer_[0] = pointer_[1];
+        pointer_[1] = tmp_pointer;
+      }
+      contiguous_offset = contiguous_offset / 2 * 2;
+    }
+    // element offset: stride_ is in AccessType units, hence the kElementsPerAccess factor on the strided term
+    int offset = (strided_offset * InstructionShape::kStrided) * stride_ *
+                     Layout::kElementsPerAccess +
+                 contiguous_offset * Shape::kContiguous;
+
+    add_pointer_offset(offset);
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension by InstructionShape::kStrided strides
+  CUTLASS_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator & operator++() {
+    byte_offset_ += stride_ * InstructionShape::kStrided * sizeof(Element) *
+                    Layout::kElementsPerAccess;
+    // stride_ is stored in AccessType units, so kElementsPerAccess * sizeof(Element) converts it to bytes
+    return *this;
+  }
+
+  /// Moves the iterator back along the advance dimension by InstructionShape::kStrided strides
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator & operator--() {
+    byte_offset_ -= stride_ * InstructionShape::kStrided * sizeof(Element) *
+                    Layout::kElementsPerAccess;
+    // stride_ is stored in AccessType units, so kElementsPerAccess * sizeof(Element) converts it to bytes
+    return *this;
+  }
+
+  /// Advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  /// Moves back in units of whole tiles along the logical coordinate space (adds the negated offset)
+  CUTLASS_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
+    add_tile_offset(-tile_offset);
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    // no extra offset beyond the iterator's own accumulated byte offset
+    load_with_byte_offset(frag, 0);
+  }
+
+  /// Loads a fragment from memory with an additional offset in bytes
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset in units of bytes
+      Index byte_offset) const {
+    // the fragment is filled one AccessType-sized vector at a time
+    AccessType * fetch_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < Policy::LdsIterations::kStrided; ++s) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < Policy::LdsIterations::kContiguous; ++c) {
+        // destination slot inside the fragment for this (s, c) access
+        int access_idx = c + s * Policy::LdsIterations::kContiguous;
+        // even/odd s alternate between the two base pointers; every second s moves on by LdsShape::kStrided strides
+        AccessType const *source_ptr = pointer_[s & 1] +
+          Policy::LdsShape::kContiguous * c +
+          Policy::LdsShape::kStrided * (s / 2) * stride_;
+        // apply the call's byte offset plus the iterator's accumulated byte offset
+        char const *source_byte_ptr = reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;
+        fetch_ptr[access_idx] = *(reinterpret_cast<AccessType const*> (source_byte_ptr));
+      }
+    }
+  }
+
+  /// Loads a fragment from memory with an additional linear offset in units of Element
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset (converted to bytes via sizeof(Element))
+      Index pointer_offset) const {
+    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles (no extra byte offset).
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset) const {
+    load_with_byte_offset(frag, tile_offset, 0);
+  }
+
+  /// Loads a fragment from memory at a logical tile offset plus an additional pointer offset.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// additional linear offset in units of Element (converted to bytes via sizeof(Element))
+      Index pointer_offset) const {
+    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
+  }
+
+  /// Loads a fragment from memory at a logical tile offset plus an additional offset in bytes.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// additional linear offset in units of bytes
+      Index byte_offset) const {
+    Index pointer_offset =
+        tile_offset.contiguous() * Shape::kContiguous /
+            Layout::kElementsPerAccess +
+        tile_offset.strided() * InstructionShape::kStrided * stride_;
+    // pointer_offset is counted in AccessType vectors, so scale by sizeof(AccessType) to get bytes
+    byte_offset += sizeof(AccessType) * pointer_offset;
+
+    load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// This is used by some nontrivial permuted layouts; this specialization ignores k_group.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    // no operation here
+  }
+};
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// This tile iterator is specialized for 32-thread TensorOps.
+///
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: PitchLinearShape)
+    typename Shape_,
+    /// Data type of elements
+    typename Element_,
+    /// Shape of one matrix product operation (concept: PitchLinearShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions)
+    int OpDelta_>
+
+class MmaVoltaTensorOpMultiplicandTileIterator<
+    Shape_, Operand::kB, Element_,
+    cutlass::layout::VoltaTensorOpMultiplicandBCongruous<
+        sizeof_bits<Element_>::value>,
+    InstructionShape_, OpDelta_, 32> {
+ public:
+
+  /// Shape of tile to load (concept: PitchLinearShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand::kB;
+
+    /// Element type
+  using Element = Element_;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::VoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>;
+
+  /// Shape of one matrix product operation (concept: GemmShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+  static int const kOpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Stride index type
+  using StrideIndex = typename TensorRef::Layout::Stride::Index;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Internal structure of iterator - made public to enable introspection
+  struct Policy {
+    static_assert(
+        !(Shape::kContiguous % InstructionShape::kContiguous),
+        "Shape of warp-level Mma must be divisible by operator shape.");
+
+    // Logical shape covered by one individual LDS
+    // TODO: remove hardcoded 32 and 4
+    using LdsShape = layout::PitchLinearShape<
+      32,
+      4
+    >;
+    // Number of LdsShape accesses along each dimension of the tile
+    using LdsIterations = layout::PitchLinearShape<
+      Shape::kContiguous / LdsShape::kContiguous,
+      InstructionShape::kStrided / LdsShape::kStrided
+    >;
+  };
+
+private:
+
+  /// Not working on this feature at the moment.
+  static_assert(kOpDelta == 1,
+    "Alternative arrangements not supported at present.");
+
+  /// Pointer type used for accesses
+  using AccessType = AlignedArray<Element, Layout::kElementsPerAccess>;
+
+public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile; sized with an extra factor of two (the trailing "* 2")
+ using Fragment = Array<Element, Shape::kContiguous *
+                                     InstructionShape::kStrided / kThreads * 2>;
+
+private:
+
+  /// Layout object storing stride values
+  StrideIndex stride_;
+
+  /// Shared memory base pointers - not advanced
+  AccessType const *pointer_;
+
+  /// Byte offset incremented as iterator advances
+  Index byte_offset_;
+
+public:
+
+  /// Default ctor constructs null iterator (zero stride and byte offset; pointer_ is not initialized)
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator(): stride_(0), byte_offset_(0) { }
+
+  /// Builds the iterator for one lane of the warp from a tensor reference
+  CUTLASS_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator(
+    TensorRef const &ref,
+    int lane_id
+  ):
+    stride_(ref.stride(0) / Layout::kElementsPerAccess), byte_offset_(0) {
+
+    // swizzle pattern is (tid & (3 << 3) | (tid[1:0] ^ tid[4:3]))
+    int const lane_row = (lane_id >> 3) & 0x3;        // tid[4:3]
+    int const lane_col = (lane_id & 0x3) ^ lane_row;  // tid[1:0] ^ tid[4:3]
+
+    // Per-lane start, measured in AccessType units from the base of `ref`.
+    AccessType const *base = reinterpret_cast<AccessType const *>(ref.data());
+    pointer_ = base + lane_col + lane_row * stride_;
+  }
+
+  /// Shifts the iterator by `offset` elements (tracked internally as a byte offset)
+  CUTLASS_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
+
+    // Element count -> byte count, then fold into the running byte offset.
+    auto const delta_bytes = offset * sizeof(Element);
+    byte_offset_ += delta_bytes;
+    return *this;
+  }
+
+  /// Moves the iterator by a whole number of tiles in each logical dimension
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
+
+    int const tiles_c = tile_offset.contiguous();
+    int const tiles_s = tile_offset.strided();
+
+    // Element offset: each strided tile spans InstructionShape::kStrided rows of
+    // stride_ accesses; each contiguous tile spans Shape::kContiguous elements.
+    int const element_offset =
+        (tiles_s * InstructionShape::kStrided) * stride_ * Layout::kElementsPerAccess +
+        tiles_c * Shape::kContiguous;
+
+    return add_pointer_offset(element_offset);
+  }
+
+  /// Steps forward by InstructionShape::kStrided rows (stride_ accesses per row)
+  CUTLASS_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator & operator++() {
+    auto const step_bytes = stride_ * InstructionShape::kStrided *
+                            sizeof(Element) * Layout::kElementsPerAccess;
+    byte_offset_ += step_bytes;
+    return *this;
+  }
+
+  /// Moves the iterator backward along the advance dimension (inverse of operator++)
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator & operator--() {
+    // Subtract the same byte step that operator++ adds, so ++ followed by --
+    // returns the iterator to where it started.
+    byte_offset_ -= stride_ * InstructionShape::kStrided * sizeof(Element) *
+                    Layout::kElementsPerAccess;
+    return *this;
+  }
+
+  /// Advances by whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
+    // add_tile_offset() already returns *this, so forward its result.
+    return add_tile_offset(tile_offset);
+  }
+
+  /// Retreats by whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
+    // Same as += with the tile offset negated.
+    return add_tile_offset(-tile_offset);
+  }
+
+  /// Loads the fragment at the iterator's current position (no extra offset).
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    Index const no_extra_bytes = 0;
+    load_with_byte_offset(frag, no_extra_bytes);
+  }
+
+  /// Loads a fragment, applying an extra offset given in bytes
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset in units of bytes
+      Index byte_offset) const {
+
+    // View the fragment as an array of vector accesses so each step is one copy.
+    AccessType *dst = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int step_s = 0; step_s < Policy::LdsIterations::kStrided; ++step_s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int step_c = 0; step_c < Policy::LdsIterations::kContiguous; ++step_c) {
+        // Position of this access within the tile, in AccessType units.
+        AccessType const *src = pointer_ +
+          Policy::LdsShape::kContiguous / Layout::kElementsPerAccess * step_c +
+          Policy::LdsShape::kStrided * step_s * stride_;
+
+        // Byte offsets (argument + running iterator offset) are applied last.
+        char const *src_bytes = reinterpret_cast<char const *>(src) + byte_offset + byte_offset_;
+        dst[step_c + step_s * Policy::LdsIterations::kContiguous] =
+            *(reinterpret_cast<AccessType const *>(src_bytes));
+      }
+    }
+  }
+
+  /// Loads a fragment with an extra offset expressed in elements
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      Fragment &frag,           ///< fragment to load from the tensor
+      Index pointer_offset      ///< linear offset, in units of Element
+  ) const {
+    // Convert the element offset to bytes for the byte-offset overload.
+    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
+  }
+
+  /// Loads the fragment found `tile_offset` whole tiles away from the iterator.
+  CUTLASS_DEVICE
+  void load(
+      Fragment &frag,                  ///< fragment to load from the tensor
+      TensorCoord const &tile_offset   ///< logical offset in units of whole tiles
+  ) const {
+    Index const no_extra_bytes = 0;
+    load_with_byte_offset(frag, tile_offset, no_extra_bytes);
+  }
+
+  /// Loads a fragment using both a whole-tile offset and an element offset.
+  CUTLASS_DEVICE
+  void load(
+      Fragment &frag,                  ///< fragment to load from the tensor
+      TensorCoord const &tile_offset,  ///< logical offset in units of whole tiles
+      Index pointer_offset             ///< additional offset in units of Element
+  ) const {
+    // The byte-offset overload expects bytes, so scale by the element size.
+    auto const extra_bytes = pointer_offset * sizeof(Element);
+    load_with_byte_offset(frag, tile_offset, extra_bytes);
+  }
+
+  /// Loads a fragment using a whole-tile offset plus an extra offset in bytes.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// additional linear offset in units of bytes
+      Index byte_offset) const {
+    // Tile offset expressed as a count of AccessType-sized accesses.
+    Index const access_offset =
+        tile_offset.contiguous() * Shape::kContiguous /
+            Layout::kElementsPerAccess +
+        tile_offset.strided() * InstructionShape::kStrided * stride_;
+
+    Index const total_bytes = byte_offset + sizeof(AccessType) * access_offset;
+    load_with_byte_offset(frag, total_bytes);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with a constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// This is used by some nontrivial permuted layouts; this iterator ignores it.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    // intentionally empty: k_group is accepted but not used by this iterator
+  }
+};
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// This tile iterator is specialized for 32-thread TensorOps. It must be initialized with a
+/// TensorRef to shared memory; (row, column) maps to (contiguous, strided) of the `Base` iterator.
+///
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Data type of elements
+    typename Element_,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions)
+    int OpDelta_>
+class MmaVoltaTensorOpMultiplicandTileIterator<
+    Shape_, Operand::kA, Element_,
+    cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCongruous<
+        sizeof_bits<Element_>::value>,
+    InstructionShape_, OpDelta_, 32> {
+ public:
+
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand::kA;
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>;
+
+  /// Shape of one matrix product operation (concept: MatrixShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+  static int const kOpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Underlying tile iterator implementation (rows are contiguous, columns are strided)
+  using Base = MmaVoltaTensorOpMultiplicandTileIterator<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, kOperand, Element,
+      layout::VoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>,
+      layout::PitchLinearShape<InstructionShape::kRow,
+                               InstructionShape::kColumn>,
+      kOpDelta, kThreads>;
+
+ public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = typename Base::Fragment;
+
+private:
+
+  /// Underlying tile iterator
+  Base iterator_;
+
+public:
+
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator() { }
+
+  /// Constructor from TensorRef
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator(
+    TensorRef const &ref,
+    int lane_id
+  ): iterator_({ref.data(), ref.stride()}, lane_id) {
+  }
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
+
+    iterator_.add_pointer_offset(offset);
+
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
+
+    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator & operator++() {
+
+    ++iterator_;
+
+    return *this;
+  }
+
+  /// Moves the iterator backward along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator & operator--() {
+
+    --iterator_;
+
+    return *this;
+  }
+
+  /// Advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  /// Retreats in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({-tile_offset.row(), -tile_offset.column()});
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+
+    iterator_.load(frag);
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset
+      Index pointer_offset) const {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset
+      Index byte_offset) const {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      Fragment &frag,                  ///< fragment to load from the tensor
+      TensorCoord const &tile_offset   ///< logical offset in units of whole tiles
+  ) const {
+    load_with_byte_offset(frag, tile_offset, 0);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      Fragment &frag,                  ///< fragment to load from the tensor
+      TensorCoord const &tile_offset,  ///< logical offset in units of whole tiles
+      Index pointer_offset             ///< additional offset in units of Element
+  ) const {
+    // Scale the element offset to bytes before delegating.
+    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// additional linear offset in units of bytes
+      Index byte_offset) const {
+    iterator_.load_with_byte_offset(
+      frag,
+      {tile_offset.row(), tile_offset.column()},
+      byte_offset);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    iterator_.set_kgroup_index(k_group);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// This tile iterator is specialized for 32-thread TensorOps. It must be initialized with a
+/// TensorRef to shared memory; (column, row) maps to (contiguous, strided) of the `Base` iterator.
+///
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Data type of elements
+    typename Element_,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions)
+    int OpDelta_>
+class MmaVoltaTensorOpMultiplicandTileIterator<
+    Shape_, Operand::kB, Element_,
+    cutlass::layout::RowMajorVoltaTensorOpMultiplicandBCongruous<
+        sizeof_bits<Element_>::value>,
+    InstructionShape_, OpDelta_, 32> {
+ public:
+
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand::kB;
+
+  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
+    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::RowMajorVoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>;
+
+  /// Shape of one matrix product operation (concept: MatrixShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+  static int const kOpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Underlying tile iterator implementation (columns are contiguous, rows are strided)
+  using Base = MmaVoltaTensorOpMultiplicandTileIterator<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, kOperand, Element,
+      layout::VoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>,
+      layout::PitchLinearShape<InstructionShape::kColumn,
+                               InstructionShape::kRow>,
+      kOpDelta, kThreads>;
+
+ public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = typename Base::Fragment;
+
+private:
+
+  /// Underlying tile iterator
+  Base iterator_;
+
+public:
+
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator() { }
+
+  /// Constructor from TensorRef
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator(
+    TensorRef const &ref,
+    int lane_id
+  ): iterator_({ref.data(), ref.stride()}, lane_id) {
+  }
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
+
+    iterator_.add_pointer_offset(offset);
+
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
+
+    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator & operator++() {
+
+    ++iterator_;
+
+    return *this;
+  }
+
+  /// Moves the iterator backward along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator & operator--() {
+
+    --iterator_;
+
+    return *this;
+  }
+
+  /// Advances in units of whole tiles; add_tile_offset() performs the (column, row) swap
+  CUTLASS_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  /// Retreats in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({-tile_offset.column(), -tile_offset.row()});
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+
+    iterator_.load(frag);
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset
+      Index pointer_offset) const {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset
+      Index byte_offset) const {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      Fragment &frag,                  ///< fragment to load from the tensor
+      TensorCoord const &tile_offset   ///< logical offset in units of whole tiles
+  ) const {
+    load_with_byte_offset(frag, tile_offset, 0);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      Fragment &frag,                  ///< fragment to load from the tensor
+      TensorCoord const &tile_offset,  ///< logical offset in units of whole tiles
+      Index pointer_offset             ///< additional offset in units of Element
+  ) const {
+    // Scale the element offset to bytes before delegating.
+    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// additional linear offset in units of bytes
+      Index byte_offset) const {
+    iterator_.load_with_byte_offset(
+      frag,
+      {tile_offset.column(), tile_offset.row()},
+      byte_offset);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    iterator_.set_kgroup_index(k_group);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////
+
+/// This tile iterator is specialized for 32-thread TensorOps. It is used to load or store
+/// accumulators from memory and is agnostic to layout. It could be faster if it assumed row-major
+/// accumulator layout.
+///
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept |
+///   WriteableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Data type of elements
+    typename Element_,
+    /// Layout of operand in memory
+    typename Layout_,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions, concept: MatrixShape)
+    typename OpDelta_>
+class MmaVoltaTensorOpAccumulatorTileIterator {
+ public:
+
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand::kC;
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of source tile
+  using Layout = Layout_;
+
+  /// Shape of one matrix product operation (concept: MatrixShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+  using OpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Internal structure of iterator - made public to enable introspection
+  struct Policy {
+
+    /// Volta Tensor Op uses 32x32 interleaved tile
+    using InterleavedTile = MatrixShape<32, 32>;
+
+    static_assert(!(Shape::kRow % InterleavedTile::kRow) && !(Shape::kColumn % InterleavedTile::kColumn),
+      "Shape of warp-level Mma must be divisible by operator shape.");
+
+    static_assert(platform::is_same<TensorCoord, MatrixCoord>::value,
+      "Layouts must be defined for logical MatrixCoord coordinate space.");
+
+    /// Number of mma operations performed
+    using TileIterations = MatrixShape<
+      Shape::kRow / InterleavedTile::kRow,
+      Shape::kColumn / InterleavedTile::kColumn
+    >;
+
+    using MmaIterations =
+        MatrixShape<InterleavedTile::kRow / InstructionShape::kM,
+                    InterleavedTile::kColumn / InstructionShape::kN>;
+  };
+
+private:
+
+  // Assume accumulator tile is multipile interleaved 32x32 tile.
+  static int const kElementsPerPartial = 4;
+  using EleShapePerPatial = typename platform::conditional<
+                              platform::is_same<Element, float>::value,
+                              MatrixShape<2, 2>,
+                              MatrixShape<1, 4> >::type;
+  static int const kElementsPerMma = 8;
+  static int const kAccumulatorPatials = 2;
+  using QuadShapePerPatialMma = MatrixShape<4, 4>;
+
+public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = Array<Element, Shape::kCount / kThreads>;
+
+private:
+
+  /// Reference to output tensor
+  TensorRef ref_;
+
+public:
+
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpAccumulatorTileIterator() { }
+
+  /// Constructor from TensorRef
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpAccumulatorTileIterator(
+    TensorRef const &ref,
+    int lane_id
+  ):
+    ref_(ref) {
+
+    int quad = (lane_id >> 2);
+    int lane_in_quad = (lane_id & 3);
+    int accum_m, accum_n;
+
+    if (platform::is_same<Element, float>::value) {
+      // (quad[2],quad[0])+lane_in_quad[0]
+      accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + (lane_in_quad & 1);
+      // (quad[1])+lane_in_quad[1]
+      accum_n = ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials +
+                  (lane_in_quad & 2);
+    } else {
+      accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + lane_in_quad; // (quad[2],quad[0])
+      accum_n = ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials;
+    }
+    MatrixCoord lane_offset(accum_m, accum_n);
+
+    ref_.add_coord_offset(lane_offset);
+  }
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpAccumulatorTileIterator &add_pointer_offset(LongIndex offset) {
+    ref_.add_pointer_offset(offset);
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpAccumulatorTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
+
+    ref_.add_coord_offset(tile_offset * make_Coord(Shape::kRow, Shape::kColumn));
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpAccumulatorTileIterator & operator++() {
+    // deliberate no-op
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpAccumulatorTileIterator & operator--() {
+    // deliberate no-op
+    return *this;
+  }
+
+  ///< advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaVoltaTensorOpAccumulatorTileIterator & operator+=(TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  ///< advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaVoltaTensorOpAccumulatorTileIterator & operator-=(TensorCoord const &tile_offset) {
+    add_tile_offset(-tile_offset);
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_HOST_DEVICE
+  void load_with_pointer_offset(
+    Fragment &frag,                             ///< fragment to load from the tensor
+    Index pointer_offset) const {               ///< loads a tile with a linear offset
+
+    TensorRef offset_ref(ref_);
+    offset_ref.add_pointer_offset(pointer_offset);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int tile_n = 0; tile_n < Policy::TileIterations::kColumn; ++tile_n) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int tile_m = 0; tile_m < Policy::TileIterations::kRow; ++tile_m) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
+          CUTLASS_PRAGMA_UNROLL
+          for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
+
+            int mma_accum_start =
+                (((tile_n * Policy::TileIterations::kRow + tile_m) *
+                    Policy::MmaIterations::kColumn + mma_n) *
+                     Policy::MmaIterations::kRow + mma_m) * 
+                    kElementsPerMma;
+
+           CUTLASS_PRAGMA_UNROLL
+            for (int p = 0; p < kAccumulatorPatials; ++p) {
+              CUTLASS_PRAGMA_UNROLL
+              for (int m = 0; m < EleShapePerPatial::kRow; ++m) {
+                CUTLASS_PRAGMA_UNROLL
+                for (int n = 0; n < EleShapePerPatial::kColumn; ++n) {
+                  int accum_m = tile_m * Policy::InterleavedTile::kRow +
+                                mma_m * QuadShapePerPatialMma::kRow + m * 2;
+                  int accum_n = tile_n * Policy::InterleavedTile::kColumn + 
+                                mma_n * QuadShapePerPatialMma::kColumn +
+                                p * Policy::InterleavedTile::kColumn/2 + n;
+                  int idx = mma_accum_start + p * kElementsPerPartial + 
+                            m * EleShapePerPatial::kColumn + n;
+                frag[idx] = offset_ref.at({accum_m, accum_n});
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+    Fragment &frag,                             ///< fragment to load from the tensor
+    Index byte_offset) const {                  ///< loads a tile with a linear offset
+
+    load_with_pointer_offset(byte_offset / sizeof(Element));
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_HOST_DEVICE
+  void load(
+    Fragment &frag,                             ///< fragment to load from the tensor
+    TensorCoord const &tile_offset) const {     ///< loads a tile with a logical offset in units of whole tiles
+
+    load(frag, tile_offset, 0);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_HOST_DEVICE
+  void load(
+    Fragment &frag,                             ///< fragment to load from the tensor
+    TensorCoord const &tile_offset,             ///< loads a tile with a logical offset in units of whole tiles
+    Index pointer_offset) const {               ///< loads a tile with a logical offset AND a pointer offset
+
+    load_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
+  }
+
+  /// Stores a fragment to memory
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag) const {
+    store_with_pointer_offset(frag, 0);
+  }
+
+  /// Stores a fragment to memory with an additional linear pointer offset. The offset
+  /// is applied to a local copy of the tensor reference, so ref_ itself is unchanged.
+  CUTLASS_HOST_DEVICE
+  void store_with_pointer_offset(
+    Fragment const &frag,                       ///< fragment whose elements are written out
+    Index pointer_offset) const {               ///< linear offset added to a copy of ref_
+
+    TensorRef dst(ref_);
+    dst.add_pointer_offset(pointer_offset);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int tile_n = 0; tile_n < Policy::TileIterations::kColumn; ++tile_n) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int tile_m = 0; tile_m < Policy::TileIterations::kRow; ++tile_m) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
+          CUTLASS_PRAGMA_UNROLL
+          for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
+            // Position of this MMA in the (tile_n, tile_m, mma_n, mma_m) iteration order.
+            int mma_linear = ((tile_n * Policy::TileIterations::kRow + tile_m) *
+                                  Policy::MmaIterations::kColumn + mma_n) *
+                                 Policy::MmaIterations::kRow + mma_m;
+            // Row/column contributions that do not depend on p, m or n.
+            int row_base = tile_m * Policy::InterleavedTile::kRow + mma_m * QuadShapePerPatialMma::kRow;
+            int col_base = tile_n * Policy::InterleavedTile::kColumn + mma_n * QuadShapePerPatialMma::kColumn;
+
+            CUTLASS_PRAGMA_UNROLL
+            for (int p = 0; p < kAccumulatorPatials; ++p) {
+              // First fragment element and first column belonging to partial p.
+              int frag_base = mma_linear * kElementsPerMma + p * kElementsPerPartial;
+              int col_p = col_base + p * Policy::InterleavedTile::kColumn / 2;
+              CUTLASS_PRAGMA_UNROLL
+              for (int m = 0; m < EleShapePerPatial::kRow; ++m) {
+                CUTLASS_PRAGMA_UNROLL
+                for (int n = 0; n < EleShapePerPatial::kColumn; ++n) {
+                  // Rows advance by 2 per m; columns are consecutive in n.
+                  int frag_idx = frag_base + m * EleShapePerPatial::kColumn + n;
+                  dst.at({row_base + m * 2, col_p + n}) = frag[frag_idx];
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  /// Stores a fragment to memory with an additional offset given in bytes
+  CUTLASS_HOST_DEVICE
+  void store_with_byte_offset(
+    Fragment const &frag,                       ///< fragment to store to the tensor
+    Index byte_offset) const {                  ///< linear offset in bytes; divided by sizeof(Element)
+    // The pointer-offset overload takes (frag, offset); the fragment must be forwarded too.
+    store_with_pointer_offset(frag, byte_offset / sizeof(Element));
+  }
+
+  /// Stores a fragment to memory with logical offset in units of whole tiles.
+  CUTLASS_HOST_DEVICE
+  void store(
+    Fragment const &frag,                       ///< fragment to store to the tensor (only read)
+    TensorCoord const &tile_offset) const {     ///< logical offset, forwarded to the three-argument store()
+    // No extra pointer offset on top of the tile offset.
+    store(frag, tile_offset, 0);
+  }
+
+  /// Stores a fragment to memory with a tile offset plus an additional pointer offset.
+  CUTLASS_HOST_DEVICE
+  void store(
+      /// fragment to store to the tensor
+      Fragment const &frag,
+      /// tile offset, converted to a linear offset through ref_.offset()
+      TensorCoord const &tile_offset,
+      /// pointer offset added to the linear offset of tile_offset
+      Index pointer_offset) const {
+    this->store_with_pointer_offset(frag, pointer_offset + ref_.offset(tile_offset));
+  }
+};
+
+/// This tile iterator is specialized for 32-thread TensorOps. It uses LDS to
+/// load from shared memory and therefore must be initialized with a TensorRef
+/// to shared memory.
+///
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: PitchLinearShape)
+    typename Shape_,
+    /// Identifies A or B multiplicand
+    Operand Operand_,
+    /// Data type of elements
+    typename Element_,
+    /// Shape of one matrix product operation (concept: PitchLinearShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions)
+    int OpDelta_,
+    /// KBlock size (in units of elements)
+    int KBlock>
+class MmaVoltaTensorOpMultiplicandTileIterator<
+    Shape_, Operand_, Element_,
+    cutlass::layout::VoltaTensorOpMultiplicandCrosswise<
+        sizeof_bits<Element_>::value, KBlock>,
+    InstructionShape_, OpDelta_, 32> {
+ public:
+  /// Shape of tile to load (concept: PitchLinearShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand_;
+
+  static_assert(kOperand == Operand::kA || kOperand == Operand::kB,
+                "MmaVoltaTensorOpMultiplicandIterator may only be instantiated for "
+                "A or B operands to warp-level Mma.");
+
+  /// Element type
+  using Element = Element_;
+
+  /// KBlock size
+  static int const kKBlock = KBlock;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::VoltaTensorOpMultiplicandCrosswise<
+      sizeof_bits<Element_>::value, kKBlock>;
+
+  /// Shape of one matrix product operation (concept: GemmShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept:
+  /// MatrixShape)
+  static int const kOpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Index type of the layout's stride
+  using StrideIndex = typename TensorRef::Layout::Stride::Index;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Internal structure of iterator - made public to enable introspection
+  struct Policy {
+
+    /// Shape of one individual LDS instruction
+    using LdsShape = layout::PitchLinearShape<1, 32>;
+
+    /// Number and arrangement of LDS instructions
+    using LdsIterations = layout::PitchLinearShape<1, Shape::kStrided / 32>;
+
+    /// Using LDS.128
+    static int const kElementsPerAccess = 8;
+
+    /// Contiguous elements per line
+    static int const kContiguousElementsPerLine = 4;
+  };
+
+ private:
+  /// Not working on this feature at the moment.
+  static_assert(kOpDelta == 1,
+                "Alternative arrangements not supported at present.");
+
+  /// Pointer type used for accesses
+  using AccessType = AlignedArray<Element, Policy::kElementsPerAccess>;
+
+ public:
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment =
+      Array<Element,
+            Shape::kStrided * InstructionShape::kContiguous / kThreads * 2>;
+
+ private:
+
+  /// Layout object storing stride values
+  StrideIndex stride_;
+
+  /// Shared memory base pointers - not advanced
+  AccessType const *pointer_;
+
+  /// Byte offset incremented as iterator advances
+  Index byte_offset_;
+
+  /// Crosswised elements are arranged in a SMEM line
+  /// in units of AccessType
+  Index line_size;
+
+  /// Internal counter used to determine load addr offset 
+  /// and when to swap higher 64bit with lower 64bit
+  int k_group_idx_;
+
+ public:
+  /// Default ctor constructs null iterator (initializers listed in member declaration order)
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator()
+      : stride_(0),
+        pointer_(nullptr),
+        byte_offset_(0),
+        line_size(0),
+        k_group_idx_(0) {}
+
+  /// Constructor from TensorRef; lane_id selects this thread's swizzled starting byte offset
+  CUTLASS_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
+      : stride_(ref.stride(0) * Policy::kElementsPerAccess),
+        pointer_(reinterpret_cast<AccessType const *>(ref.data())),
+        byte_offset_(0),
+        line_size((ref.stride(0) * Policy::kContiguousElementsPerLine) /
+                  Policy::kElementsPerAccess),
+        k_group_idx_(0) {
+    // quad holds lane_id bits [4:2]; lane_in_quad holds lane_id bits [1:0]
+    int quad = (lane_id / 4);
+    int lane_in_quad = (lane_id % 4);
+    int access_contiguous;
+
+    if(kOperand == Operand::kA) {
+
+      // swizzle id: tid[4]|tid[1:0]|(tid[2]^tid[4])
+      access_contiguous = ((quad & 0x4) << 1) + ((lane_in_quad) << 1) +
+                            ((quad & 0x1) ^ ((quad & 0x4) >> 2));
+    } else {
+
+      // swizzle id: tid[4]|tid[1:0]|(tid[3]^tid[4])
+      access_contiguous = ((quad & 0x4) << 1) + (lane_in_quad << 1) +
+                            (((quad & 0x2) >> 1) ^ ((quad & 0x4) >> 2));
+    }
+
+    byte_offset_ = access_contiguous *
+                   sizeof(Element) * Policy::kElementsPerAccess;
+  }
+
+  /// Adds a pointer offset (in elements, scaled by sizeof(Element)) to the byte offset
+  CUTLASS_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
+    byte_offset_ += offset * sizeof(Element);
+
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles; also resets the k-group counter to zero
+  CUTLASS_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator &add_tile_offset(
+      TensorCoord const &tile_offset) {
+
+    int contiguous_offset = tile_offset.contiguous();
+    int strided_offset = tile_offset.strided();
+    k_group_idx_ = 0;
+
+    pointer_ += contiguous_offset *
+                    (InstructionShape::kContiguous /
+                     Policy::kContiguousElementsPerLine) *
+                    line_size +
+                strided_offset * Shape::kStrided / 2;
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator &operator++() {
+    k_group_idx_ = (k_group_idx_ + 1) % 8;
+
+    if (k_group_idx_ == 4 || k_group_idx_ == 0) {
+      byte_offset_ ^= 1 * sizeof(Element) * Policy::kElementsPerAccess;
+    }
+
+    pointer_ += line_size;
+    return *this;
+  }
+
+  /// Not supported: asserts. Returns *this so the function never flows off its end.
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator &operator--() { assert(0); return *this; }
+
+  ///< advances in units of whole tiles along the logical coordinate space of
+  ///< the tensor
+  CUTLASS_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator &operator+=(
+      TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  ///< advances in units of whole tiles along the logical coordinate space of
+  ///< the tensor
+  CUTLASS_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator &operator-=(
+      TensorCoord const &tile_offset) {
+    add_tile_offset(-tile_offset);
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const { load_with_byte_offset(frag, 0); }
+
+  /// Loads a fragment from memory with an additional byte offset
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset in units of bytes
+      Index byte_offset) const {
+
+    AccessType * fetch_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < Policy::LdsIterations::kStrided; ++s) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < Policy::LdsIterations::kContiguous; ++c) {
+
+        int access_idx = c + s * Policy::LdsIterations::kContiguous;
+
+        AccessType const *source_ptr = pointer_ +
+          Policy::LdsShape::kContiguous * c * line_size +
+          Policy::LdsShape::kStrided * s / 2;
+
+        char const *source_byte_ptr = reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;
+        fetch_ptr[access_idx] = *(reinterpret_cast<AccessType const*> (source_byte_ptr));
+
+        // swap higher 64bit and lower 64bit
+        if (k_group_idx_ &  0x2) {
+            uint64_t *low = reinterpret_cast<uint64_t *>(&frag) + access_idx * 2;
+            uint64_t *high = reinterpret_cast<uint64_t *>(&frag) + access_idx * 2 + 1;
+            uint64_t tmp = *low;
+            *low = *high;
+            *high = tmp;
+        }
+      }
+    }
+  }
+
+  /// Loads a fragment from memory with an additional pointer offset (scaled by sizeof(Element))
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset
+      Index pointer_offset) const {
+    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset) const {
+    load_with_byte_offset(frag, tile_offset, 0);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// loads a tile with a logical offset AND a pointer offset
+      Index pointer_offset) const {
+    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// loads a tile with a logical offset AND a byte offset
+      Index byte_offset) const {
+    Index pointer_offset = tile_offset.contiguous() *
+                               InstructionShape::kContiguous /
+                               Policy::kElementsPerAccess +
+                           tile_offset.strided() * Shape::kStrided * stride_;
+
+    byte_offset += sizeof(AccessType) * pointer_offset;
+
+    load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    k_group_idx_ = k_group;
+  }
+};
+
+/// This tile iterator is specialized for 32-thread TensorOps. It uses LDS to
+/// load from shared memory and therefore must be initialized with a TensorRef
+/// to shared memory.
+///
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Identifies A or B multiplicand
+    Operand Operand_,
+    /// Data type of elements
+    typename Element_,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions)
+    int OpDelta_,
+    /// KBlock size (in units of elements)
+    int KBlock>
+class MmaVoltaTensorOpMultiplicandTileIterator<
+    Shape_, Operand_, Element_,
+    cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise<
+        sizeof_bits<Element_>::value, KBlock>,
+    InstructionShape_, OpDelta_, 32> {
+ public:
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand_;
+
+  static_assert(kOperand == Operand::kA || kOperand == Operand::kB,
+                "MmaTensorOpMultiplicandIterator may only be instantiated for "
+                "A or B operands to warp-level Mma.");
+
+  /// Element type
+  using Element = Element_;
+
+  /// KBlock size
+  static int const kKBlock = KBlock;
+
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise<
+      sizeof_bits<Element_>::value, kKBlock>;
+
+  /// Shape of one matrix product operation (concept: MatrixShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept:
+  /// MatrixShape)
+  static int const kOpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Underlying tile iterator implementation (rows map to contiguous, columns to strided)
+  using Base = MmaVoltaTensorOpMultiplicandTileIterator<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, kOperand, Element,
+      layout::VoltaTensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
+                                                 kKBlock>,
+      layout::PitchLinearShape<InstructionShape::kRow,
+                               InstructionShape::kColumn>,
+      kOpDelta, kThreads>;
+
+ public:
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = typename Base::Fragment;
+
+ private:
+  /// Underlying tile iterator
+  Base iterator_;
+
+ public:
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator() {}
+
+  /// Constructor from TensorRef
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
+      : iterator_({ref.data(), ref.stride()}, lane_id) {}
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
+    iterator_.add_pointer_offset(offset);
+
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator &add_tile_offset(
+      TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator &operator++() {
+    ++iterator_;
+
+    return *this;
+  }
+
+  /// Forwards to the underlying iterator's operator--
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator &operator--() {
+    --iterator_;
+
+    return *this;
+  }
+
+  ///< advances in units of whole tiles along the logical coordinate space of
+  ///< the tensor (add_tile_offset already maps row/column for the base iterator)
+  CUTLASS_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator &operator+=(
+      TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  ///< moves backwards in units of whole tiles along the logical coordinate
+  ///< space of the tensor, using the same mapping as add_tile_offset
+  CUTLASS_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator &operator-=(
+      TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({-tile_offset.row(), -tile_offset.column()});
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const { iterator_.load(frag); }
+
+  /// Loads a fragment from memory with additional pointer offset
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset
+      Index pointer_offset) const {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory with additional byte offset
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset
+      Index byte_offset) const {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Not implemented for this layout: asserts and loads nothing.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset) const {
+    assert(0);
+  }
+
+  /// Not implemented for this layout: asserts and loads nothing.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// loads a tile with a logical offset AND a pointer offset
+      Index pointer_offset) const {
+    assert(0);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// tile offset; mapped to the base iterator exactly as in add_tile_offset
+      TensorCoord const &tile_offset,
+      /// loads a tile with a logical offset AND a byte offset
+      Index byte_offset) const {
+    iterator_.load_with_byte_offset(
+        frag, {tile_offset.row(), tile_offset.column()}, byte_offset);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    iterator_.set_kgroup_index(k_group);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// This tile iterator is specialized for 32-thread TensorOps. It uses LDS to
+/// load from shared memory and therefore must be initialized with a TensorRef
+/// to shared memory.
+///
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Identifies A or B multiplicand
+    Operand Operand_,
+    /// Data type of elements
+    typename Element_,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions)
+    int OpDelta_,
+    /// KBlock size (in units of elements)
+    int KBlock>
+class MmaVoltaTensorOpMultiplicandTileIterator<
+    Shape_, Operand_, Element_,
+    cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise<
+        sizeof_bits<Element_>::value, KBlock>,
+    InstructionShape_, OpDelta_, 32> {
+ public:
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand_;
+
+  static_assert(kOperand == Operand::kA || kOperand == Operand::kB,
+                "MmaTensorOpMultiplicandIterator may only be instantiated for "
+                "A or B operands to warp-level Mma.");
+
+  /// Element type
+  using Element = Element_;
+
+  /// KBlock size
+  static int const kKBlock = KBlock;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise<
+      sizeof_bits<Element_>::value, kKBlock>;
+
+  /// Shape of one matrix product operation (concept: MatrixShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept:
+  /// MatrixShape)
+  static int const kOpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Underlying tile iterator implementation (columns map to contiguous, rows to strided)
+  using Base = MmaVoltaTensorOpMultiplicandTileIterator<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, kOperand, Element,
+      layout::VoltaTensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
+                                                 kKBlock>,
+      layout::PitchLinearShape<InstructionShape::kColumn,
+                               InstructionShape::kRow>,
+      kOpDelta, kThreads>;
+
+ public:
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = typename Base::Fragment;
+
+ private:
+  /// Underlying tile iterator
+  Base iterator_;
+
+ public:
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator() {}
+
+  /// Constructor from TensorRef
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
+      : iterator_({ref.data(), ref.stride()}, lane_id) {}
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
+    iterator_.add_pointer_offset(offset);
+
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator &add_tile_offset(
+      TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator &operator++() {
+    ++iterator_;
+
+    return *this;
+  }
+
+  /// Forwards to the underlying iterator's operator--
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator &operator--() {
+    --iterator_;
+
+    return *this;
+  }
+
+  ///< advances in units of whole tiles along the logical coordinate space of
+  ///< the tensor (add_tile_offset already swaps column/row for the base iterator)
+  CUTLASS_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator &operator+=(
+      TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  ///< moves backwards in units of whole tiles along the logical coordinate
+  ///< space of the tensor, using the same mapping as add_tile_offset
+  CUTLASS_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator &operator-=(
+      TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({-tile_offset.column(), -tile_offset.row()});
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const { iterator_.load(frag); }
+
+  /// Loads a fragment from memory with additional pointer offset
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset
+      Index pointer_offset) const {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory with additional byte offset
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset
+      Index byte_offset) const {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Not implemented for this layout: asserts and loads nothing.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset) const {
+    assert(0);
+  }
+
+  /// Not implemented for this layout: asserts and loads nothing.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// loads a tile with a logical offset AND a pointer offset
+      Index pointer_offset) const {
+    assert(0);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// tile offset; mapped to the base iterator exactly as in add_tile_offset
+      TensorCoord const &tile_offset,
+      /// loads a tile with a logical offset AND a byte offset
+      Index byte_offset) const {
+    iterator_.load_with_byte_offset(
+        frag, {tile_offset.column(), tile_offset.row()}, byte_offset);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    iterator_.set_kgroup_index(k_group);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Tile iterator specialized for 'TN' arrangement
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Operand identity
+    Operand Operand_,
+    /// Data type of A elements
+    typename Element_,
+    /// Layout of matrix operand
+    typename Layout_,
+    /// Shape of one matrix production operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Delta between *MMA operations (in units of *MMA operations, concept:
+    /// MatrixShape)
+    int OpDelta_,
+    /// Number of threads participating in one matrix operation
+    int Threads = 32,
+    /// Number of partitions along K dimension
+    int PartitionsK_ = 1>
+class MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner {
+ public:
+
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand_;
+
+  /// Basic check
+  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
+    "MmaVoltaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of source tile
+  using Layout = Layout_;
+
+  /// Shape of one matrix product operation (concept: MatrixShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+  static int const kOpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Number of elements accessed per Shared Memory load
+  static int const kElementsPerAccess = 4;
+
+private:
+
+  static int const kInterleavedTileRows = 32;
+  static int const kInterleavedTileColumns = 32;
+  static int const kInstructionsPerTile = 2;
+  
+  /// Rounded up instruction counts
+  using TileCount = MatrixShape<
+    Shape::kRow / kInterleavedTileRows,
+    Shape::kColumn / kInterleavedTileColumns
+  >;
+
+  using FragmentCount = MatrixShape<
+    TileCount::kRow * kInstructionsPerTile,
+    TileCount::kColumn * kInstructionsPerTile
+  >;
+
+public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = Array<
+    Element, 
+    (kOperand == Operand::kA ? FragmentCount::kRow : FragmentCount::kColumn) * kElementsPerAccess
+  >;
+
+  /// Memory access type
+  using AccessType = AlignedArray<Element, kElementsPerAccess>;
+
+private:
+
+  /// Underlying tensor reference
+  TensorRef ref_;
+
+  /// Extent of tensor
+  MatrixCoord extent_;
+
+  /// Origin
+  MatrixCoord origin_;
+
+  /// Used to conditionally enable extents checking
+  bool divisible_;
+
+public:
+  
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner(): divisible_(true) { }
+
+  /// Constructor from TensorRef
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner(
+    TensorRef const &ref, 
+    int lane_id
+  ): 
+    ref_(ref), extent_(Shape::kRow, Shape::kColumn), divisible_(true) {
+
+    int quad_id = lane_id / 4;
+    int lane_in_quad = (lane_id % 4);
+  
+    if (kOperand == Operand::kA) {
+      
+      int row_idx = ((quad_id & 1) + ((quad_id & 4) / 2)) * 4 * kInstructionsPerTile + lane_in_quad;
+      int col_idx = 0;
+
+      origin_ = MatrixCoord(row_idx, col_idx);
+    }
+    else {
+
+      int row_idx = 0;
+      int col_idx = (quad_id / 2) * 4 * kInstructionsPerTile  + lane_in_quad;
+
+      origin_ = MatrixCoord(row_idx, col_idx); 
+    }
+
+    ref_.add_coord_offset(origin_);
+  }
+  
+  /// Constructs an iterator over a tile with a caller-supplied extent;
+  /// divisible_ is set to false. The per-lane origin uses the same formula
+  /// as the (ref, lane_id) constructor.
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner(
+    TensorRef const &ref,
+    TensorCoord extent,
+    int lane_id
+  ): ref_(ref), extent_(extent), divisible_(false) {
+
+    // Split the lane index into a group of four lanes and a position within it.
+    int const quad = lane_id / 4;
+    int const lane = lane_id % 4;
+
+    if (kOperand == Operand::kA) {
+      // A operand: bits 0 and 2 of the quad index pick the row group; column is 0.
+      int const row_group = (quad & 1) + ((quad & 4) / 2);
+      origin_ = MatrixCoord(row_group * 4 * kInstructionsPerTile + lane, 0);
+    }
+    else {
+      // B operand: the quad index halved picks the column group; row is 0.
+      int const column_group = quad / 2;
+      origin_ = MatrixCoord(0, column_group * 4 * kInstructionsPerTile + lane);
+    }
+
+    // Barrier call that is compiled only when building device code.
+    #if defined(__CUDA_ARCH__)
+    __syncthreads();
+    #endif
+
+    ref_.add_coord_offset(origin_);
+  }
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner &add_pointer_offset(LongIndex offset) {
+    // Only the tensor reference moves; origin_ and extent_ are left untouched.
+    ref_.add_pointer_offset(offset);
+
+    return *this;
+  }
+
+  /// Advances the iterator by a whole number of tiles: each row step spans
+  /// Shape::kRow rows and each column step spans Shape::kColumn columns.
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner &add_tile_offset(TensorCoord const &tile_offset) {
+    // Scale the tile counts into an element-coordinate delta.
+    TensorCoord const delta(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn);
+
+    // Keep the underlying reference and the tracked origin in step.
+    ref_.add_coord_offset(delta);
+    origin_ += delta;
+    return *this;
+  }
+
+  /// Advances the iterator by one tile along the advance dimension:
+  /// one tile column for the A operand, one tile row for the B operand.
+  CUTLASS_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner & operator++() {
+
+    // Choose the unit step from the operand identity, then delegate.
+    TensorCoord const step =
+      (kOperand == Operand::kA) ? TensorCoord(0, 1) : TensorCoord(1, 0);
+
+    add_tile_offset(step);
+
+    return *this;
+  }
+
+  /// Moves the iterator back by one tile, opposite to the advance dimension:
+  /// one tile column for the A operand, one tile row for the B operand.
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner & operator--() {
+
+    // Choose the negative unit step from the operand identity, then delegate.
+    TensorCoord const step =
+      (kOperand == Operand::kA) ? TensorCoord(0, -1) : TensorCoord(-1, 0);
+
+    add_tile_offset(step);
+
+    return *this;
+  }
+
+  /// Advances by tile_offset, given in units of whole tiles along the logical
+  /// coordinate space of the tensor.
+  CUTLASS_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner & operator+=(TensorCoord const &tile_offset) {
+    return add_tile_offset(tile_offset);
+  }
+
+  /// Moves back by tile_offset, given in units of whole tiles along the logical
+  /// coordinate space of the tensor (the offset is negated before use).
+  CUTLASS_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner & operator-=(TensorCoord const &tile_offset) {
+    return add_tile_offset(-tile_offset);
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    // Same as load_with_pointer_offset with a zero offset.
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Loads a fragment from memory, displaced by pointer_offset elements from
+  /// the iterator's current position.
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// linear offset, in units of Element, applied to the base pointer
+      Index pointer_offset) const {
+
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+    // Honour pointer_offset: the tile-offset and byte-offset overloads rely on it.
+    AccessType const *access_ptr =
+      reinterpret_cast<AccessType const *>(ref_.data() + pointer_offset);
+    int ldm = ref_.stride()[0];
+
+    if (kOperand == Operand::kA) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int idx = 0; idx < FragmentCount::kRow; ++idx) {
+        int tile_idx = idx / 2;
+        int quad_idx = idx % 2;
+
+        int row_offset = tile_idx * kInterleavedTileRows + quad_idx * 4;
+        frag_ptr[idx] = access_ptr[row_offset * ldm / kElementsPerAccess];
+      }
+    }
+    else {
+      CUTLASS_PRAGMA_UNROLL
+      for (int idx = 0; idx < FragmentCount::kColumn; ++idx) {
+        int tile_idx = idx / 2;
+        int quad_idx = idx % 2;
+
+        int col_offset = tile_idx * kInterleavedTileColumns + quad_idx * 4;
+        frag_ptr[idx] = access_ptr[col_offset * ldm / kElementsPerAccess];
+      }
+    }
+  }
+
+  /// Loads a fragment from memory with an additional offset given in bytes
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// linear offset in bytes; converted to whole elements before loading
+      Index byte_offset) const {
+    Index const element_offset = byte_offset * 8 / sizeof_bits<Element>::value;
+    load_with_pointer_offset(frag, element_offset);
+  }
+
+  /// Loads a fragment displaced from the iterator by a whole number of tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// displacement in units of whole tiles
+      TensorCoord const &tile_offset) const {
+    // Scale tile counts to element coordinates; ref_.offset() makes them linear.
+    TensorCoord const element_delta(
+      tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn);
+    load_with_pointer_offset(frag, ref_.offset(element_delta));
+  }
+
+  /// Loads a fragment displaced by whole tiles plus a linear element offset.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// displacement in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// additional linear offset added to the tile displacement
+      Index pointer_offset) const {
+    // Scale tile counts to element coordinates; ref_.offset() makes them linear.
+    TensorCoord const element_delta(
+      tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn);
+    load_with_pointer_offset(frag, ref_.offset(element_delta) + pointer_offset);
+  }
+
+  /// Loads a fragment displaced by whole tiles plus an offset given in bytes.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// displacement in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// additional offset in bytes; converted to elements before it is added
+      Index byte_offset) const {
+    // Scale tile counts to element coordinates; ref_.offset() makes them linear.
+    TensorCoord const element_delta(
+      tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn);
+    load_with_pointer_offset(frag, ref_.offset(element_delta) + byte_offset * 8 / sizeof_bits<Element>::value);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. The hook exists so an iterator can
+  /// override its internal tracking with a constant-valued k-group index,
+  /// enabling the compiler to fold constants and achieve more efficient code.
+  /// In this iterator the body is empty: k_group is ignored.
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    // no operation
+  }
+};
+
+
+/// Tile iterator specialized for 'NT' arrangement
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Operand identity
+    Operand Operand_,
+    /// Data type of elements
+    typename Element_,
+    /// Layout of matrix operand
+    typename Layout_,
+    /// Shape of one matrix production operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Delta between *MMA operations (in units of *MMA operations, concept:
+    /// MatrixShape)
+    int OpDelta_,
+    /// Number of threads participating in one matrix operation
+    int Threads = 32,
+    /// Number of partitions along K dimension
+    int PartitionsK_ = 1>
+class MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter {
+ public:
+
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand_;
+
+  /// Basic check
+  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
+    "MmaVoltaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of source tile
+  using Layout = Layout_;
+
+  /// Shape of one matrix product operation (concept: MatrixShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+  static int const kOpDelta = OpDelta_;
+
+  /// Number of participating threads (fixed at 32; the Threads parameter is not consulted)
+  static int const kThreads = 32;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Number of elements accessed per Shared Memory load
+  static int const kElementsPerAccess = 4;
+
+private:
+
+  static int const kInterleavedTileRows = 32;
+  static int const kInterleavedTileColumns = 32;
+  static int const kInstructionsPerTile = 2;
+
+  /// Whole interleaved tiles that fit in Shape (integer division, remainder dropped)
+  using TileCount = MatrixShape<
+    Shape::kRow / kInterleavedTileRows,
+    Shape::kColumn / kInterleavedTileColumns
+  >;
+
+  using FragmentCount = MatrixShape<
+    TileCount::kRow * kInstructionsPerTile,
+    TileCount::kColumn * kInstructionsPerTile
+  >;
+
+public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = Array<
+    Element,
+    (kOperand == Operand::kA ? FragmentCount::kRow : FragmentCount::kColumn) * kElementsPerAccess
+  >;
+
+  /// Memory access type
+  using AccessType = AlignedArray<Element, kElementsPerAccess>;
+
+private:
+
+  /// Underlying tensor reference
+  TensorRef ref_;
+
+  /// Extent of tensor
+  MatrixCoord extent_;
+
+  /// Origin
+  MatrixCoord origin_;
+
+  /// Used to conditionally enable extents checking
+  bool divisible_;
+
+public:
+
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter(): divisible_(true) { }
+
+  /// Constructor from TensorRef; the extent is taken from Shape and divisible_ is true
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter(
+    TensorRef const &ref,
+    int lane_id
+  ):
+    ref_(ref), extent_(Shape::kRow, Shape::kColumn), divisible_(true) {
+
+    int quad_id = lane_id / 4;
+    int lane_in_quad = (lane_id % 4);
+
+    if (kOperand == Operand::kA) {
+      // A operand: bits 0 and 2 of quad_id pick the row; lane_in_quad picks the column.
+      int row_idx = ((quad_id & 1) + ((quad_id & 4) / 2)) * 4 * kInstructionsPerTile;
+      int col_idx = lane_in_quad;
+
+      origin_ = MatrixCoord(row_idx, col_idx);
+    }
+    else {
+      // B operand: lane_in_quad picks the row; quad_id / 2 picks the column.
+      int row_idx = lane_in_quad;
+      int col_idx = (quad_id / 2) * 4 * kInstructionsPerTile;
+
+      origin_ = MatrixCoord(row_idx, col_idx);
+    }
+
+    ref_.add_coord_offset(origin_);
+  }
+
+  /// Constructor from TensorRef and an explicit extent; divisible_ is false
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter(
+    TensorRef const &ref,
+    TensorCoord extent,
+    int lane_id
+  ): ref_(ref), extent_(extent), divisible_(false) {
+
+    int quad_id = lane_id / 4;
+    int lane_in_quad = (lane_id % 4);
+
+    if (kOperand == Operand::kA) {
+      // A operand: bits 0 and 2 of quad_id pick the row; lane_in_quad picks the column.
+      int row_idx = ((quad_id & 1) + ((quad_id & 4) / 2)) * 4 * kInstructionsPerTile;
+      int col_idx = lane_in_quad;
+
+      origin_ = MatrixCoord(row_idx, col_idx);
+    }
+    else {
+      // B operand: lane_in_quad picks the row; quad_id / 2 picks the column.
+      int row_idx = lane_in_quad;
+      int col_idx = (quad_id / 2) * 4 * kInstructionsPerTile;
+
+      origin_ = MatrixCoord(row_idx, col_idx);
+    }
+
+    #if defined(__CUDA_ARCH__)
+    __syncthreads();
+    #endif
+
+    ref_.add_coord_offset(origin_);
+  }
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter &add_pointer_offset(LongIndex offset) {
+    // Only the tensor reference moves; origin_ and extent_ are left untouched.
+    ref_.add_pointer_offset(offset);
+
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter &add_tile_offset(TensorCoord const &tile_offset) {
+    // One tile spans Shape::kRow rows and Shape::kColumn columns.
+    TensorCoord coord_offset(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn);
+    origin_ += coord_offset;
+
+    ref_.add_coord_offset(coord_offset);
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter & operator++() {
+    // A advances by one tile column, B by one tile row.
+    if (kOperand == Operand::kA) {
+      add_tile_offset({0, 1});
+    }
+    else {
+      add_tile_offset({1, 0});
+    }
+
+    return *this;
+  }
+
+  /// Moves the iterator one tile back, opposite to the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter & operator--() {
+    // A steps back by one tile column, B by one tile row.
+    if (kOperand == Operand::kA) {
+      add_tile_offset({0, -1});
+    }
+    else {
+      add_tile_offset({-1, 0});
+    }
+
+    return *this;
+  }
+
+  /// Advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter & operator+=(TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  /// Moves back in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter & operator-=(TensorCoord const &tile_offset) {
+    add_tile_offset(-tile_offset);
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    // Same as load_with_pointer_offset with a zero offset.
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Loads a fragment from memory, displaced by pointer_offset elements
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// linear offset, in units of Element, applied to the base pointer
+      Index pointer_offset) const {
+
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+    // Honour pointer_offset: the tile-offset and byte-offset overloads rely on it.
+    AccessType const *access_ptr = reinterpret_cast<AccessType const *>(ref_.data() + pointer_offset);
+
+    if (kOperand == Operand::kA) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int idx = 0; idx < FragmentCount::kRow; ++idx) {
+
+        int tile_idx = idx / 2;
+        int quad_idx = idx % 2;
+
+        int row_offset = tile_idx * kInterleavedTileRows;
+        frag_ptr[idx] = access_ptr[row_offset / kElementsPerAccess + quad_idx];
+      }
+    }
+    else {
+      CUTLASS_PRAGMA_UNROLL
+      for (int idx = 0; idx < FragmentCount::kColumn; ++idx) {
+
+        int tile_idx = idx / 2;
+        int quad_idx = idx % 2;
+
+        int col_offset = tile_idx * kInterleavedTileColumns;
+        frag_ptr[idx] = access_ptr[col_offset / kElementsPerAccess + quad_idx];
+      }
+    }
+  }
+
+  /// Loads a fragment from memory with an additional offset given in bytes
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// linear offset in bytes; converted to whole elements before loading
+      Index byte_offset) const {
+
+    load_with_pointer_offset(frag, byte_offset * 8 / sizeof_bits<Element>::value);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset) const {
+
+    TensorCoord coord_offset(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn);
+
+    load_with_pointer_offset(frag, ref_.offset(coord_offset));
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// loads a tile with a logical offset AND a pointer offset
+      Index pointer_offset) const {
+
+    TensorCoord coord_offset(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn);
+
+    load_with_pointer_offset(frag, ref_.offset(coord_offset) + pointer_offset);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// loads a tile with a logical offset AND a byte offset
+      Index byte_offset) const {
+
+    TensorCoord coord_offset(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn);
+
+    load_with_pointer_offset(frag, ref_.offset(coord_offset) + byte_offset * 8 / sizeof_bits<Element>::value);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. The hook exists so an iterator can
+  /// override its internal tracking with a constant-valued k-group index,
+  /// enabling the compiler to fold constants and achieve more efficient code.
+  /// In this iterator the body is empty: k_group is ignored.
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    // no operation
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for an A operand in row-major layout; all behaviour
+/// is inherited from MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner.
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Data type of elements
+    typename Element_,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA instructions)
+    int OpDelta_>
+class MmaVoltaTensorOpMultiplicandTileIterator<
+    Shape_, Operand::kA, Element_,
+    cutlass::layout::RowMajor,
+    InstructionShape_, OpDelta_, 32
+> : public MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner<
+    Shape_, Operand::kA, Element_,
+    cutlass::layout::RowMajor,
+    InstructionShape_, OpDelta_> {
+
+public:
+  /// Canonical iterator that provides the whole implementation
+  using Base = MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner<
+    Shape_, Operand::kA, Element_,
+    cutlass::layout::RowMajor,
+    InstructionShape_, OpDelta_>;
+
+  /// TensorRef type re-exported from the base iterator
+  using TensorRef = typename Base::TensorRef;
+
+  /// Constructs the base iterator from a tensor reference and a lane index
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
+    : Base(ref, lane_id) { }
+};
+
+/// Partial specialization for an A operand in column-major layout; all behaviour
+/// is inherited from MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter.
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Data type of elements
+    typename Element_,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA instructions)
+    int OpDelta_>
+class MmaVoltaTensorOpMultiplicandTileIterator<
+    Shape_, Operand::kA, Element_,
+    cutlass::layout::ColumnMajor,
+    InstructionShape_, OpDelta_, 32
+> : public MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter<
+    Shape_, Operand::kA, Element_,
+    cutlass::layout::ColumnMajor,
+    InstructionShape_, OpDelta_> {
+
+public:
+  /// Canonical iterator that provides the whole implementation
+  using Base = MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter<
+    Shape_, Operand::kA, Element_,
+    cutlass::layout::ColumnMajor,
+    InstructionShape_, OpDelta_>;
+
+  /// TensorRef type re-exported from the base iterator
+  using TensorRef = typename Base::TensorRef;
+
+  /// Constructs the base iterator from a tensor reference and a lane index
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
+    : Base(ref, lane_id) { }
+};
+
+/// Partial specialization for a B operand in column-major layout; all behaviour
+/// is inherited from MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner.
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Data type of elements
+    typename Element_,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA instructions)
+    int OpDelta_>
+class MmaVoltaTensorOpMultiplicandTileIterator<
+    Shape_, Operand::kB, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_, 32>
+  : public MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner<
+    Shape_, Operand::kB, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_> {
+
+public:
+
+  /// Canonical iterator that provides the whole implementation
+  using Base = MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner<
+    Shape_, Operand::kB, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_>;
+
+  /// TensorRef type re-exported from the base iterator
+  using TensorRef = typename Base::TensorRef;
+
+  /// Constructs the base iterator from a tensor reference and a lane index
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
+    : Base(ref, lane_id) { }
+};
+
+/// Partial specialization for a B operand in row-major layout; all behaviour
+/// is inherited from MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter.
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Data type of elements
+    typename Element_,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA instructions)
+    int OpDelta_>
+class MmaVoltaTensorOpMultiplicandTileIterator<
+    Shape_, Operand::kB, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_, 32>
+  : public MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter<
+    Shape_, Operand::kB, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_> {
+
+public:
+
+  /// Canonical iterator that provides the whole implementation
+  using Base = MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter<
+    Shape_, Operand::kB, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_>;
+
+  /// TensorRef type re-exported from the base iterator
+  using TensorRef = typename Base::TensorRef;
+
+  /// Constructs the base iterator from a tensor reference and a lane index
+  CUTLASS_HOST_DEVICE
+  MmaVoltaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
+    : Base(ref, lane_id) { }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h
new file mode 100755
index 000000000..4ccf0b580
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h
@@ -0,0 +1,2441 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines iterators used by warp-level matrix multiply operations targeting Tensor Cores.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/array.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/arch/memory_sm75.h"
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor_op_multiplicand_sm80.h"
+
+#include "cutlass/platform/platform.h"
+#include "cutlass/fast_math.h"
+
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace warp {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// This tile iterator is specialized for loading 128b vectors of 64b elements.
+///
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: PitchLinearShape)
+    typename Shape_,
+    /// Identifies A or B multiplicand
+    Operand Operand_,
+    /// Data type of elements
+    typename Element_,
+    /// Shape of one matrix product operation (concept: PitchLinearShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions)
+    int OpDelta_,
+    /// Number of partitions along K dimension
+    int PartitionsK_>
+class MmaTensorOpMultiplicandTileIterator<
+    Shape_, Operand_, Element_,
+    cutlass::layout::TensorOpMultiplicandCongruous64b,
+    InstructionShape_, OpDelta_, 32, PartitionsK_> {
+ public:
+
+  /// Shape of tile to load (concept: PitchLinearShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand_;
+
+  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
+    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
+
+  static_assert(!(Shape::kContiguous % 16) && !(Shape::kStrided % 4), "Divisibility.");
+
+  static_assert(sizeof_bits<Element_>::value == 64, "This is specialized for 64b accesses.");
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::TensorOpMultiplicandCongruous64b;
+
+  /// Shape of one matrix product operation (concept: GemmShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+  static int const kOpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// Number of partitions along K dimension
+  static int const kPartitionsK = PartitionsK_;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Stride index type
+  using StrideIndex = typename TensorRef::Layout::Stride::Index;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Load two elements per access
+  static int const kElementsPerAccess = 2;
+
+  /// Policy defining internal details of tile iterator
+  struct Policy {
+
+    /// Shape of one access
+    using Delta = layout::PitchLinearShape<8, 4>;
+
+    /// Number of iterations to load
+    using Iterations = layout::PitchLinearShape<
+      Shape::kContiguous / kElementsPerAccess / Delta::kContiguous,
+      InstructionShape::kStrided / Delta::kStrided
+    >;
+
+  };
+
+private:
+
+  /// Not working on this feature at the moment.
+  static_assert(kOpDelta == 1,
+    "Alternative arrangements not supported at present.");
+
+  /// Pointer type used for accesses
+  using AccessType = AlignedArray<Element, kElementsPerAccess, 16>;
+
+  /// Internal counter used to jump to next K partition
+  int k_group_idx_;
+
+public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+ using Fragment =
+     Array<Element, Shape::kContiguous * InstructionShape::kStrided / kThreads>;
+
+private:
+
+  /// Layout object storing stride values
+  StrideIndex stride_;
+
+  /// Shared memory base pointers - not advanced
+  AccessType const *pointer_;
+
+  /// Byte offset incremented as iterator advances
+  Index byte_offset_;
+
+public:
+  
+  /// Default ctor constructs a null iterator: null pointer, zero stride, offset and k-group
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator(): k_group_idx_(0), stride_(0), pointer_(nullptr), byte_offset_(0) { }
+
+  /// Constructor from TensorRef: positions this lane's access pointer within the tile
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator(
+    TensorRef const &ref,
+    int lane_id
+  ):
+    // Listed in declaration order, which is the order members are initialized.
+    k_group_idx_(0), stride_(ref.stride(0) / kElementsPerAccess), byte_offset_(0) {
+
+    // Lanes are grouped Delta::kContiguous at a time; the group index is XOR-ed in.
+    int access_strided = lane_id / Policy::Delta::kContiguous;
+    int access_contiguous = (lane_id  % Policy::Delta::kContiguous) ^ access_strided;
+    pointer_= reinterpret_cast<AccessType const *>(ref.data()) +
+      access_contiguous + access_strided * stride_;
+  }
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
+    // offset counts Elements; it is accumulated in bytes and pointer_ stays fixed.
+    byte_offset_ += offset * sizeof(Element);
+
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles.
+  /// A strided step covers InstructionShape::kStrided strides and a contiguous
+  /// step covers Shape::kContiguous elements.
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
+    // stride_ is kept in units of accesses, so scale it back to elements.
+    int const element_offset =
+      tile_offset.contiguous() * Shape::kContiguous +
+      (tile_offset.strided() * InstructionShape::kStrided) * stride_ * kElementsPerAccess;
+
+    return add_pointer_offset(element_offset);
+  }
+
+  /// Advances the iterator by one tile along the advance dimension
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator++() {
+    // One whole tile in the second tile coordinate, none in the first.
+    TensorCoord const step(0, 1);
+    add_tile_offset(step);
+    return *this;
+  }
+
+  /// Moves the iterator back by one tile, opposite to the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator--() {
+    // Minus one whole tile in the second tile coordinate, none in the first.
+    TensorCoord const step(0, -1);
+    add_tile_offset(step);
+    return *this;
+  }
+
+  /// Advances by tile_offset, given in units of whole tiles along the logical
+  /// coordinate space of the tensor.
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
+    return add_tile_offset(tile_offset);
+  }
+
+  /// Moves back by tile_offset, given in units of whole tiles along the logical
+  /// coordinate space of the tensor (the offset is negated before use).
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
+    return add_tile_offset(-tile_offset);
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    // Same as load_with_byte_offset with a zero extra offset.
+    load_with_byte_offset(frag, 0);
+  }
+
+  /// Loads a fragment from memory with an additional offset in bytes
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// extra linear offset in bytes, added to the iterator's own byte offset
+      Index byte_offset) const {
+    // The fragment is filled one AccessType at a time.
+    AccessType *dst = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < Policy::Iterations::kStrided; ++s) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < Policy::Iterations::kContiguous; ++c) {
+
+        // Position of access (c, s) relative to this lane's base pointer,
+        // before any byte offsets are applied.
+        AccessType const *base = pointer_ +
+            Policy::Delta::kContiguous * c +
+            Policy::Delta::kStrided * s * stride_;
+
+        // Byte offsets are applied through a char pointer.
+        char const *bytes = reinterpret_cast<char const *>(base) + byte_offset + byte_offset_;
+
+        dst[c + s * Policy::Iterations::kContiguous] =
+            *reinterpret_cast<AccessType const *>(bytes);
+      }
+    }
+  }
+
+  /// Loads a fragment from memory with an additional offset given in elements
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// linear offset in units of Element; converted to bytes before loading
+      Index pointer_offset) const {
+    Index const byte_offset = pointer_offset * sizeof(Element);
+    load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Loads a fragment displaced by `tile_offset` whole tiles, with no extra byte offset.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment that receives the loaded data
+      Fragment &frag,
+      /// logical displacement, counted in whole tiles
+      TensorCoord const &tile_offset) const {
+    // Delegate to the tile-offset overload with a zero byte offset.
+    this->load_with_byte_offset(frag, tile_offset, 0);
+  }
+
+  /// Loads a fragment displaced by `tile_offset` whole tiles plus a pointer offset.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment that receives the loaded data
+      Fragment &frag,
+      /// logical displacement, counted in whole tiles
+      TensorCoord const &tile_offset,
+      /// additional linear displacement, multiplied by sizeof(Element) before use
+      Index pointer_offset) const {
+    Index const byte_offset = pointer_offset * sizeof(Element);
+    load_with_byte_offset(frag, tile_offset, byte_offset);
+  }
+
+  /// Loads a fragment displaced by a whole-tile logical offset plus a byte offset.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment that receives the loaded data
+      Fragment &frag,
+      /// logical displacement, counted in whole tiles
+      TensorCoord const &tile_offset,
+      /// additional displacement, in units of bytes
+      Index byte_offset) const {
+
+    // Tile displacement expressed as a count of AccessType-sized steps.
+    Index const access_offset =
+      tile_offset.contiguous() * Shape::kContiguous / Layout::kElementsPerAccess +
+      tile_offset.strided() * InstructionShape::kStrided * stride_;
+
+    // Scale the steps to bytes, add the caller's byte offset, and delegate.
+    load_with_byte_offset(frag, byte_offset + sizeof(AccessType) * access_offset);
+  }
+
+  /// Tells the iterator which k-group it currently points to.
+  ///
+  /// The iterator is not advanced by this call. The hook exists so that an
+  /// iterator can replace its internal tracking with a constant-valued k-group
+  /// index, letting the compiler fold constants into more efficient code.
+  ///
+  /// Some nontrivial permuted layouts rely on it; this iterator ignores `k_group`.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    // Intentionally empty: no member is read or written here.
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Row-major adaptor for 64b congruous multiplicand tiles.
+///
+/// Wraps the pitch-linear TensorOpMultiplicandCongruous64b iterator and forwards every
+/// call to it, mapping matrix (row, column) to pitch-linear (contiguous = column, strided = row).
+///
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Identifies A or B multiplicand
+    Operand Operand_,
+    /// Data type of elements
+    typename Element_,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions)
+    int OpDelta_,
+    /// Number of partitions along K dimension
+    int PartitionsK_>
+class MmaTensorOpMultiplicandTileIterator<
+    Shape_, Operand_, Element_,
+    cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b,
+    InstructionShape_, OpDelta_, 32, PartitionsK_> {
+ public:
+
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand_;
+
+  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
+    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b;
+
+  /// Shape of one matrix product operation (concept: MatrixShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+  static int const kOpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Underlying tile iterator implementation
+  using Base = MmaTensorOpMultiplicandTileIterator<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, kOperand, Element,
+      layout::TensorOpMultiplicandCongruous64b,
+      layout::PitchLinearShape<InstructionShape::kColumn,
+                               InstructionShape::kRow>,
+      kOpDelta, kThreads, PartitionsK_>;
+
+ public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = typename Base::Fragment;
+
+private:
+
+  /// Underlying tile iterator
+  Base iterator_;
+
+public:
+
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator() { }
+
+  /// Constructor from TensorRef
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator(
+    TensorRef const &ref,
+    int lane_id
+  ): iterator_({ref.data(), ref.stride()}, lane_id) {
+  }
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
+    iterator_.add_pointer_offset(offset);
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
+    // Matrix (row, column) -> pitch-linear (contiguous = column, strided = row).
+    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Steps the iterator backwards along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator--() {
+    --iterator_;
+    return *this;
+  }
+
+  ///< advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
+    // add_tile_offset() already performs the (row, column) -> pitch-linear mapping;
+    // converting here as well would transpose the offset twice.
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  ///< moves back in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
+    // Negate each component and hand the pitch-linear offset straight to the base iterator.
+    iterator_.add_tile_offset({-tile_offset.column(), -tile_offset.row()});
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    iterator_.load(frag);
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset
+      Index pointer_offset) const {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset in units of bytes
+      Index byte_offset) const {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset) const {
+    load_with_byte_offset(frag, tile_offset, 0);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// loads a tile with a logical offset AND a pointer offset
+      Index pointer_offset) const {
+    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles (row, column)
+      TensorCoord const &tile_offset,
+      /// additional offset in units of bytes
+      Index byte_offset) const {
+    iterator_.load_with_byte_offset(
+      frag,
+      {tile_offset.column(), tile_offset.row()},
+      byte_offset);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    iterator_.set_kgroup_index(k_group);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Column-major adaptor for 64b congruous multiplicand tiles. Wraps the pitch-linear
+/// TensorOpMultiplicandCongruous64b iterator and forwards every call to it, mapping matrix
+/// (row, column) to pitch-linear (contiguous = row, strided = column).
+///
+/// This tile iterator is specialized for 32-thread TensorOps. It uses LDSM to load from shared
+/// memory and therefore must be initialized with a TensorRef to shared memory.
+///
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Identifies A or B multiplicand
+    Operand Operand_,
+    /// Data type of elements
+    typename Element_,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions)
+    int OpDelta_,
+    /// Number of partitions along K dimension
+    int PartitionsK_>
+class MmaTensorOpMultiplicandTileIterator<
+    Shape_, Operand_, Element_,
+    cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b,
+    InstructionShape_, OpDelta_, 32, PartitionsK_> {
+ public:
+
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand_;
+
+  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
+    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b;
+
+  /// Shape of one matrix product operation (concept: MatrixShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+  static int const kOpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Underlying tile iterator implementation
+  using Base = MmaTensorOpMultiplicandTileIterator<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, kOperand, Element,
+      layout::TensorOpMultiplicandCongruous64b,
+      layout::PitchLinearShape<InstructionShape::kRow,
+                               InstructionShape::kColumn>,
+      kOpDelta, kThreads, PartitionsK_>;
+
+ public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = typename Base::Fragment;
+
+private:
+
+  /// Underlying tile iterator
+  Base iterator_;
+
+public:
+
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator() { }
+
+  /// Constructor from TensorRef
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator(
+    TensorRef const &ref,
+    int lane_id
+  ): iterator_({ref.data(), ref.stride()}, lane_id) {
+  }
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
+    iterator_.add_pointer_offset(offset);
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
+    // Matrix (row, column) -> pitch-linear (contiguous = row, strided = column).
+    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Steps the iterator backwards along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator--() {
+    --iterator_;
+    return *this;
+  }
+
+  ///< advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
+    // add_tile_offset() already performs the (row, column) -> pitch-linear mapping,
+    // so no intermediate pitch-linear coordinate is needed here.
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  ///< moves back in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
+    // Negate each component and hand the pitch-linear offset straight to the base iterator.
+    iterator_.add_tile_offset({-tile_offset.row(), -tile_offset.column()});
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    iterator_.load(frag);
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset
+      Index pointer_offset) const {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset in units of bytes
+      Index byte_offset) const {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset) const {
+    load_with_byte_offset(frag, tile_offset, 0);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// loads a tile with a logical offset AND a pointer offset
+      Index pointer_offset) const {
+    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles (row, column)
+      TensorCoord const &tile_offset,
+      /// additional offset in units of bytes
+      Index byte_offset) const {
+    iterator_.load_with_byte_offset(
+      frag,
+      {tile_offset.row(), tile_offset.column()},
+      byte_offset);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    iterator_.set_kgroup_index(k_group);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+
+/// This tile iterator is specialized for loading 128b vectors of 64b elements.
+///
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: PitchLinearShape)
+    typename Shape_,
+    /// Identifies A or B multiplicand
+    Operand Operand_,
+    /// Data type of elements
+    typename Element_,
+    /// Shape of one matrix product operation (concept: PitchLinearShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions)
+    int OpDelta_,
+    /// Number of partitions along K dimension
+    int PartitionsK_>
+class MmaTensorOpMultiplicandTileIterator<
+    Shape_, Operand_, Element_,
+    cutlass::layout::TensorOpMultiplicand64bCrosswise,
+    InstructionShape_, OpDelta_, 32, PartitionsK_> {
+ public:
+
+  /// Shape of tile to load (concept: PitchLinearShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand_;
+
+  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
+    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
+
+  static_assert(!(Shape::kContiguous % 4) && !(Shape::kStrided % 16), "Divisibility.");
+
+  static_assert(sizeof_bits<Element_>::value == 64, "This is specialized for 64b accesses.");
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::TensorOpMultiplicand64bCrosswise;
+
+  /// Shape of one matrix product operation (concept: PitchLinearShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+  static int const kOpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// Number of partitions along K dimension
+  static int const kPartitionsK = PartitionsK_;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Index type of the layout's stride
+  using StrideIndex = typename TensorRef::Layout::Stride::Index;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Load two elements per access
+  static int const kElementsPerAccess = 2;
+
+  /// Policy defining internal details of tile iterator
+  struct Policy {
+
+    /// Step between successive accesses issued by load_with_byte_offset()
+    using Delta = layout::PitchLinearShape<4, 16>;
+
+    /// Number of iterations to load
+    using Iterations = layout::PitchLinearShape<
+      InstructionShape::kContiguous / Delta::kContiguous,
+      Shape::kStrided / Delta::kStrided
+    >;
+
+  };
+
+private:
+
+  /// Not working on this feature at the moment.
+  static_assert(kOpDelta == 1,
+    "Alternative arrangements not supported at present.");
+
+  /// Pointer type used for accesses
+  using AccessType = AlignedArray<Element, kElementsPerAccess, 16>;
+
+public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment =
+      Array<Element, Shape::kStrided * InstructionShape::kContiguous / kThreads>;
+
+private:
+
+  /// Stride taken from the TensorRef, divided by kElementsPerAccess
+  StrideIndex stride_;
+
+  /// Base pointer for accesses; moved by add_pointer_offset() and operator++()
+  AccessType const *pointer_;
+
+  /// Per-lane byte offset; bit 0x40 is toggled as the k-group index advances
+  Index byte_offset_;
+
+  /// Internal counter for tracking K-group
+  Index k_group_idx_;
+
+public:
+
+  /// Default ctor constructs null iterator (every member is given a defined value)
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator(): stride_(0), pointer_(nullptr), byte_offset_(0), k_group_idx_(0) { }
+
+  /// Constructor from TensorRef
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator(
+    TensorRef const &ref,
+    int lane_id
+  ):
+    stride_(ref.stride(0) / kElementsPerAccess), byte_offset_(0),
+    k_group_idx_(0) {
+    // Split the lane id into a position within a group of 8 lanes and the group number.
+    int access_strided = lane_id / 8;
+    int access_contiguous = (lane_id  % 8);
+
+    byte_offset_ = (access_contiguous + access_strided * stride_) * sizeof(AccessType);
+
+    pointer_= reinterpret_cast<AccessType const *>(ref.data());
+  }
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
+    // pointer_ counts AccessType objects, so the offset is divided by kElementsPerAccess.
+    pointer_ += offset / kElementsPerAccess;
+
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
+    int offset = (tile_offset.contiguous() * InstructionShape::kContiguous) *
+                     stride_ * kElementsPerAccess +
+                 tile_offset.strided() * Shape::kStrided;
+
+    add_pointer_offset(offset);
+
+    int old_k_group_idx = k_group_idx_;
+
+    k_group_idx_ += tile_offset.contiguous();
+    // Toggle the 0x40 byte-offset bit whenever bit 1 of the k-group index changed.
+    if ((k_group_idx_ & 2) ^ (old_k_group_idx & 2)) {
+      byte_offset_ ^= 0x40;
+    }
+
+    return *this;
+  }
+
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_tile_offset_negative(TensorCoord const &tile_offset) {
+    // Currently identical to add_tile_offset().
+    // TODO: fix this if it becomes an issue during warp it reset
+    add_tile_offset(tile_offset);
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator++() {
+
+    pointer_ += stride_ * InstructionShape::kContiguous;
+    // Leaving an odd k-group flips the 0x40 bit of the byte offset.
+    if (k_group_idx_ & 0x1) {
+      // xor ptr
+      byte_offset_ ^= 0x40;
+    }
+
+    ++k_group_idx_;
+
+    return *this;
+  }
+
+  ///< advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+
+    load_with_byte_offset(frag, 0);
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset in units of bytes
+      Index byte_offset) const {
+
+    AccessType *fetch_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int c = 0; c < Policy::Iterations::kContiguous; ++c) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int s = 0; s < Policy::Iterations::kStrided; ++s) {
+
+        int access_idx = c + s * Policy::Iterations::kContiguous;
+
+        AccessType const *source_ptr = pointer_ +
+            Policy::Delta::kContiguous * c * stride_ +
+            Policy::Delta::kStrided * s / kElementsPerAccess;
+
+        char const *source_byte_ptr = reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;
+
+        AccessType const *source = reinterpret_cast<AccessType const *>(source_byte_ptr);
+
+        fetch_ptr[access_idx] = *source;
+      }
+    }
+
+    Element *exchange_ptr = reinterpret_cast<Element *>(&frag);
+
+    if (k_group_idx_ & 1) {
+      // exchange on 64b granularity: swap the two elements of every loaded pair
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < Fragment::kElements; i += 2) {
+        Element tmp = exchange_ptr[i];
+        exchange_ptr[i] = exchange_ptr[i + 1];
+        exchange_ptr[i + 1] = tmp;
+      }
+    }
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset
+      Index pointer_offset) const {
+
+    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset) const {
+
+    load_with_byte_offset(frag, tile_offset, 0);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// loads a tile with a logical offset AND a pointer offset
+      Index pointer_offset) const {
+
+    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// additional offset in units of bytes
+      Index byte_offset) const {
+    Index pointer_offset = tile_offset.contiguous() *
+                               InstructionShape::kContiguous /
+                               Layout::kElementsPerAccess +
+                           tile_offset.strided() * Shape::kStrided * stride_;
+    // NOTE(review): add_tile_offset() scales the contiguous term by stride_, this scales the strided one - confirm.
+    byte_offset += sizeof(AccessType) * pointer_offset;
+
+    load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    k_group_idx_ = k_group;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+///
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Identifies A or B multiplicand
+    Operand Operand_,
+    /// Data type of elements
+    typename Element_,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions)
+    int OpDelta_,
+    /// Number of partitions along K dimension
+    int PartitionsK_>
+class MmaTensorOpMultiplicandTileIterator<
+    Shape_, Operand_, Element_,
+    cutlass::layout::RowMajorTensorOpMultiplicand64bCrosswise,
+    InstructionShape_, OpDelta_, 32, PartitionsK_> {
+ public:
+
+  /// Shape of tile to load (concept: PitchLinearShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand_;
+
+  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
+    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::RowMajorTensorOpMultiplicand64bCrosswise;
+
+  /// Shape of one matrix product operation (concept: MatrixShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+  static int const kOpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Underlying tile iterator implementation
+  using Base = MmaTensorOpMultiplicandTileIterator<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, kOperand, Element,
+      layout::TensorOpMultiplicand64bCrosswise,
+      layout::PitchLinearShape<InstructionShape::kColumn,
+                               InstructionShape::kRow>,
+      kOpDelta, kThreads, PartitionsK_>;
+
+ public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = typename Base::Fragment;
+
+private:
+
+  /// Underlying tile iterator
+  Base iterator_;
+
+public:
+  
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator() { }
+
+  /// Constructor from TensorRef
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator(
+    TensorRef const &ref, 
+    int lane_id
+  ): iterator_({ref.data(), ref.stride()}, lane_id) {
+  }
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
+
+    iterator_.add_pointer_offset(offset);
+
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
+
+    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
+
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_tile_offset_negative(TensorCoord const &tile_offset) {
+
+    iterator_.add_tile_offset_negative({tile_offset.column(), tile_offset.row()});
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator++() {
+
+    ++iterator_;
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator--() {
+
+    --iterator_;
+
+    return *this;
+  }
+
+  ///< advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
+    add_tile_offset(PitchLinearCoord(tile_offset.column(), tile_offset.row()));
+    return *this;
+  }
+
+  ///< advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
+    add_tile_offset(-PitchLinearCoord(tile_offset.column(), tile_offset.row()));
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+
+    iterator_.load(frag);
+  }
+
+  /// Loads a fragment with an additional linear pointer offset; delegates to the wrapped iterator
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// fragment that receives the loaded data
+      Fragment &frag,
+      /// linear offset passed through unchanged to the wrapped iterator
+      Index pointer_offset) const {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment with an additional byte offset; delegates to the wrapped iterator
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment that receives the loaded data
+      Fragment &frag,
+      /// byte offset passed through unchanged to the wrapped iterator
+      Index byte_offset) const {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// NOTE(review): no-op stub -- the body is empty, so `frag` is left untouched by this overload.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment intended to receive the tile (currently never written)
+      Fragment &frag,
+      /// logical offset in units of whole tiles (currently ignored)
+      TensorCoord const &tile_offset) const {
+  }
+
+  /// NOTE(review): no-op stub -- the body is empty, so `frag` is left untouched by this overload.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment intended to receive the tile (currently never written)
+      Fragment &frag,
+      /// logical offset in units of whole tiles (currently ignored)
+      TensorCoord const &tile_offset,
+      /// additional pointer offset (currently ignored)
+      Index pointer_offset) const {
+  }
+
+  /// Loads a fragment at a whole-tile logical offset plus a byte offset; delegates to the wrapped iterator.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment that receives the loaded data
+      Fragment &frag,
+      /// logical offset in units of whole tiles, in (row, column) order
+      TensorCoord const &tile_offset,
+      /// additional byte offset passed through to the wrapped iterator
+      Index byte_offset) const {
+    iterator_.load_with_byte_offset(
+      frag,
+      {tile_offset.column(), tile_offset.row()},  // same (column, row) swap as add_tile_offset()
+      byte_offset);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// Used by some nontrivial permuted layouts; here the call is forwarded to the wrapped iterator.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    iterator_.set_kgroup_index(k_group);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+/// Warp-level tile iterator for the ColumnMajorTensorOpMultiplicand64bCrosswise layout.
+/// Thin wrapper: every operation is forwarded to the pitch-linear `Base` specialization below.
+/// Satisfies: ReadableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Identifies A or B multiplicand
+    Operand Operand_,
+    /// Data type of elements
+    typename Element_,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions)
+    int OpDelta_,
+    /// Number of partitions along K dimension
+    int PartitionsK_>
+class MmaTensorOpMultiplicandTileIterator<
+    Shape_, Operand_, Element_,
+    cutlass::layout::ColumnMajorTensorOpMultiplicand64bCrosswise,
+    InstructionShape_, OpDelta_, 32, PartitionsK_> {
+ public:
+
+  /// Shape of tile to load (concept: MatrixShape; kRow/kColumn are read below)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand_;
+
+  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
+    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::ColumnMajorTensorOpMultiplicand64bCrosswise;
+
+  /// Shape of one matrix product operation (concept: MatrixShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+  static int const kOpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Underlying tile iterator implementation: rows map to the contiguous pitch-linear dimension
+  using Base = MmaTensorOpMultiplicandTileIterator<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, kOperand, Element,
+      layout::TensorOpMultiplicand64bCrosswise,
+      layout::PitchLinearShape<InstructionShape::kRow,
+                               InstructionShape::kColumn>,
+      kOpDelta, kThreads, PartitionsK_>;
+
+ public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = typename Base::Fragment;
+
+private:
+
+  /// Underlying tile iterator
+  Base iterator_;
+
+public:
+
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator() { }
+
+  /// Constructor from TensorRef; the pointer and stride are re-wrapped for the pitch-linear Base
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator(
+    TensorRef const &ref,
+    int lane_id
+  ): iterator_({ref.data(), ref.stride()}, lane_id) {
+  }
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
+
+    iterator_.add_pointer_offset(offset);
+
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
+    // (row, column) is forwarded in the same order to the wrapped iterator.
+    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
+
+    return *this;
+  }
+
+  /// Forwards a whole-tile offset to the wrapped iterator's add_tile_offset_negative()
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_tile_offset_negative(TensorCoord const &tile_offset) {
+
+    iterator_.add_tile_offset_negative({tile_offset.row(), tile_offset.column()});
+
+    return *this;
+  }
+
+  /// Pre-increment: advances the wrapped iterator by one step
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator++() {
+
+    ++iterator_;
+
+    return *this;
+  }
+
+  /// Pre-decrement: steps the wrapped iterator back by one step
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator--() {
+
+    --iterator_;
+
+    return *this;
+  }
+
+  /// Advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  /// Steps back in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
+    add_tile_offset(TensorCoord(-tile_offset.row(), -tile_offset.column()));
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+
+    iterator_.load(frag);
+  }
+
+  /// Loads a fragment from memory with an additional linear pointer offset
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset
+      Index pointer_offset) const {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory with an additional byte offset
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a byte offset
+      Index byte_offset) const {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// NOTE(review): no-op stub -- the body is empty, so `frag` is left untouched by this overload.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment intended to receive the tile (currently never written)
+      Fragment &frag,
+      /// logical offset in units of whole tiles (currently ignored)
+      TensorCoord const &tile_offset) const {
+  }
+
+  /// NOTE(review): no-op stub -- the body is empty, so `frag` is left untouched by this overload.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment intended to receive the tile (currently never written)
+      Fragment &frag,
+      /// logical offset in units of whole tiles (currently ignored)
+      TensorCoord const &tile_offset,
+      /// additional pointer offset (currently ignored)
+      Index pointer_offset) const {
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles plus a byte offset.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// additional byte offset applied on top of the tile offset
+      Index byte_offset) const {
+    iterator_.load_with_byte_offset(
+      frag,
+      {tile_offset.row(), tile_offset.column()},  // same component order as add_tile_offset()
+      byte_offset);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    iterator_.set_kgroup_index(k_group);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+
+/// Tile iterator for canonical matrix layouts (wrapped by the ColumnMajor / RowMajor specializations)
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Operand identity
+    Operand Operand_,
+    /// Data type of elements
+    typename Element_,
+    /// Layout of operand
+    typename Layout_,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Delta between *MMA operations (in units of *MMA operations, concept:
+    /// MatrixShape)
+    int OpDelta_,
+    /// Number of threads participating in one matrix operation
+    int Threads = 32,
+    /// Number of partitions along K dimension
+    int PartitionsK_ = 1>
+class MmaTensorOpMultiplicandTileIteratorCanonical {
+ public:
+
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand_;
+
+  /// Basic check
+  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
+    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of source tile
+  using Layout = Layout_;
+
+  /// Shape of one matrix product operation (concept: MatrixShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+  static int const kOpDelta = OpDelta_;
+
+  /// Number of participating threads (fixed at 32; the `Threads` parameter is not consulted)
+  static int const kThreads = 32;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Elements per memory access: one 32-bit word's worth, or 1 for elements of 32 bits or more
+  static int const kElementsPerAccess =
+    (sizeof_bits<Element>::value >= 32 ? 1 : 32 / sizeof_bits<Element>::value);
+
+private:
+
+  static int const kWarpShapeOuter =
+    (kOperand == Operand::kA ? Shape::kRow : Shape::kColumn);
+
+  static int const kWarpShapeInner =
+    (kOperand == Operand::kA ? Shape::kColumn : Shape::kRow);
+
+
+  /// Whole MMA instructions per tile dimension (integer division, so any remainder is dropped)
+  using InstructionCount = MatrixShape<
+    Shape::kRow / InstructionShape::kRow,
+    Shape::kColumn / InstructionShape::kColumn
+  >;
+
+  /// Tile dimensions covered by whole instructions (InstructionCount * InstructionShape)
+  using WarpShapeDivisible = MatrixShape<
+    InstructionCount::kRow * InstructionShape::kRow,
+    InstructionCount::kColumn * InstructionShape::kColumn
+  >;
+
+public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = Array<
+    Element,
+    WarpShapeDivisible::kRow * WarpShapeDivisible::kColumn / kThreads
+  >;
+
+  /// Memory access type
+  using AccessType = AlignedArray<Element, kElementsPerAccess>;
+
+private:
+
+  /// Underlying tensor reference (already advanced to this lane's origin by the constructors)
+  TensorRef ref_;
+
+  /// Extent of tensor
+  MatrixCoord extent_;
+
+  /// Origin: this lane's logical coordinate, used only for the extent check in loads
+  MatrixCoord origin_;
+
+  /// When true, extent checking is skipped and every access is performed
+  bool divisible_;
+
+public:
+
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIteratorCanonical(): divisible_(true) { }
+
+  /// Constructor from TensorRef: extent is the full Shape tile and extent checks are disabled
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIteratorCanonical(
+    TensorRef const &ref,
+    int lane_id
+  ): ref_(ref), extent_(Shape::kRow, Shape::kColumn), divisible_(true) {
+    // Per-lane origin: lane_id / 4 indexes the outer dimension, lane_id % 4 the inner accesses.
+    if (kOperand == Operand::kA) {
+      origin_ = MatrixCoord(lane_id / 4, (lane_id % 4) * kElementsPerAccess);
+    }
+    else {
+      origin_ = MatrixCoord((lane_id % 4) * kElementsPerAccess, lane_id / 4);
+    }
+
+    ref_.add_coord_offset(origin_);
+  }
+
+  /// Constructor from TensorRef and explicit extent: loads outside `extent` yield zeros
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIteratorCanonical(
+    TensorRef const &ref,
+    TensorCoord extent,
+    int lane_id
+  ): ref_(ref), extent_(extent), divisible_(false) {
+    // Per-lane origin: lane_id / 4 indexes the outer dimension, lane_id % 4 the inner accesses.
+    if (kOperand == Operand::kA) {
+      origin_ = MatrixCoord(lane_id / 4, (lane_id % 4) * kElementsPerAccess);
+    }
+    else {
+      origin_ = MatrixCoord((lane_id % 4) * kElementsPerAccess, lane_id / 4);
+    }
+
+    ref_.add_coord_offset(origin_);
+  }
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIteratorCanonical &add_pointer_offset(LongIndex offset) {
+    // Only ref_ moves; origin_ (used for extent checks) is left unchanged.
+    ref_.add_pointer_offset(offset);
+
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIteratorCanonical &add_tile_offset(TensorCoord const &tile_offset) {
+    // Convert tile units to element units, then move both the extent-check origin and the reference.
+    TensorCoord coord_offset(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn);
+    origin_ += coord_offset;
+
+    ref_.add_coord_offset(coord_offset);
+
+    return *this;
+  }
+
+  /// Advances one tile along the advance dimension: columns for operand A, rows for operand B
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIteratorCanonical & operator++() {
+    // Host-device, like operator--() and add_tile_offset(), so host-device wrappers may call it.
+    if (kOperand == Operand::kA) {
+      add_tile_offset({0, 1});
+    }
+    else {
+      add_tile_offset({1, 0});
+    }
+
+    return *this;
+  }
+
+  /// Steps back one tile along the advance dimension: columns for operand A, rows for operand B
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIteratorCanonical & operator--() {
+
+    if (kOperand == Operand::kA) {
+      add_tile_offset({0, -1});
+    }
+    else {
+      add_tile_offset({-1, 0});
+    }
+
+    return *this;
+  }
+
+  /// Advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIteratorCanonical & operator+=(TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  /// Steps back in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIteratorCanonical & operator-=(TensorCoord const &tile_offset) {
+    add_tile_offset(TensorCoord(-tile_offset.row(), -tile_offset.column()));
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Loads a fragment with an additional linear offset (in elements) added to every access address.
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// linear element offset; NOTE: the extent check below uses the un-offset coordinate
+      Index pointer_offset) const {
+
+    int const kWarpShapeDivisibleInner =
+      (kOperand == Operand::kA ? WarpShapeDivisible::kColumn : WarpShapeDivisible::kRow);
+
+    // Take advantage of Tensor Op's 8 x 4T access pattern
+    int const kAccessesInner = (kWarpShapeDivisibleInner / kElementsPerAccess) / 4;
+
+    AccessType *access_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    if (kOperand == Operand::kA) {
+      int const kTilesPerInstruction = InstructionShape::kRow / 8;
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int inst_m_idx = 0; inst_m_idx < InstructionCount::kRow; ++inst_m_idx) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int inner_idx = 0; inner_idx < kAccessesInner; ++inner_idx) {
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int access_m_idx = 0; access_m_idx < kTilesPerInstruction; ++access_m_idx) {
+            int access_idx =
+              access_m_idx + kTilesPerInstruction * (inner_idx + kAccessesInner * inst_m_idx);
+
+            MatrixCoord offset(
+              access_m_idx * 8 + inst_m_idx * InstructionShape::kRow,
+              inner_idx * 4 * kElementsPerAccess);
+
+            MatrixCoord access_coord = origin_ + offset;
+
+            if (divisible_ ||
+              (access_coord.row() < extent_.row() && access_coord.column() < extent_.column())) {
+              // In range (or checks disabled): load, honoring the caller's linear offset.
+              access_ptr[access_idx] = *reinterpret_cast<AccessType const *>(
+                ref_.data() + ref_.offset(offset) + pointer_offset);
+            }
+            else {
+              AccessType zero;
+              zero.clear();
+              access_ptr[access_idx] = zero;
+            }
+          }
+        }
+      }
+    }
+    else {
+      CUTLASS_PRAGMA_UNROLL
+      for (int inst_n_idx = 0; inst_n_idx < InstructionCount::kColumn; ++inst_n_idx) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int inner_idx = 0; inner_idx < kAccessesInner; ++inner_idx) {
+          int access_idx = inner_idx + kAccessesInner * inst_n_idx;
+
+          MatrixCoord offset(
+            inner_idx * 4 * kElementsPerAccess,
+            inst_n_idx * 8);
+
+          MatrixCoord access_coord = origin_ + offset;
+
+          if (divisible_ ||
+            (access_coord.row() < extent_.row() && access_coord.column() < extent_.column())) {
+            // In range (or checks disabled): load, honoring the caller's linear offset.
+            access_ptr[access_idx] = *reinterpret_cast<AccessType const *>(
+              ref_.data() + ref_.offset(offset) + pointer_offset);
+          }
+          else {
+              AccessType zero;
+              zero.clear();
+              access_ptr[access_idx] = zero;
+          }
+        }
+      }
+    }
+  }
+
+  /// Loads a fragment from memory with an additional byte offset
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// byte offset, converted to an element offset below
+      Index byte_offset) const {
+
+    load_with_pointer_offset(frag, byte_offset * 8 / sizeof_bits<Element>::value);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset) const {
+    // Tile units -> element coordinate -> linear element offset via the layout.
+    TensorCoord coord_offset(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn);
+
+    load_with_pointer_offset(frag, ref_.offset(coord_offset));
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// loads a tile with a logical offset AND a pointer offset
+      Index pointer_offset) const {
+
+    TensorCoord coord_offset(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn);
+
+    load_with_pointer_offset(frag, ref_.offset(coord_offset) + pointer_offset);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles plus a byte offset.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// additional byte offset, converted to an element offset below
+      Index byte_offset) const {
+
+    TensorCoord coord_offset(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn);
+
+    load_with_pointer_offset(frag, ref_.offset(coord_offset) + byte_offset * 8 / sizeof_bits<Element>::value);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// This is used by some nontrivial permuted layouts; this iterator ignores it.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    // no operation
+  }
+};
+
+/// Wrapper for ColumnMajor: forwards every operation to MmaTensorOpMultiplicandTileIteratorCanonical
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Identifies A or B multiplicand
+    Operand Operand_,
+    /// Data type of elements
+    typename Element_,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions)
+    int OpDelta_,
+    /// Number of partitions along K dimension
+    int PartitionsK_>
+class MmaTensorOpMultiplicandTileIterator<
+    Shape_, Operand_, Element_,
+    cutlass::layout::ColumnMajor,
+    InstructionShape_, OpDelta_, 32, PartitionsK_> {
+ public:
+
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand_;
+
+  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
+    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::ColumnMajor;
+
+  /// Shape of one matrix product operation (concept: MatrixShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+  static int const kOpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Underlying tile iterator implementation
+  using Base = MmaTensorOpMultiplicandTileIteratorCanonical<
+      Shape, kOperand, Element,
+      layout::ColumnMajor,
+      InstructionShape,
+      kOpDelta, kThreads, PartitionsK_>;
+
+ public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = typename Base::Fragment;
+
+private:
+
+  /// Underlying tile iterator
+  Base iterator_;
+
+public:
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator() { }
+
+  /// Constructor from TensorRef (extent checks disabled in the wrapped iterator)
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator(
+    TensorRef const &ref,
+    int lane_id
+  ): iterator_({ref.data(), ref.stride()}, lane_id) {
+  }
+
+  /// Constructor from TensorRef and extent (the wrapped iterator zero-fills out-of-extent accesses)
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator(
+    TensorRef const &ref,
+    TensorCoord const & extent,
+    int lane_id
+  ): iterator_({ref.data(), ref.stride()}, extent, lane_id) {
+  }
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
+
+    iterator_.add_pointer_offset(offset);
+
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
+
+    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
+
+    return *this;
+  }
+
+  /// Pre-increment: advances the wrapped iterator by one tile along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator++() {
+
+    ++iterator_;
+
+    return *this;
+  }
+
+  /// Pre-decrement: steps the wrapped iterator back by one tile along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator--() {
+
+    --iterator_;
+
+    return *this;
+  }
+
+  /// Advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  /// Steps back in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
+    add_tile_offset(TensorCoord(-tile_offset.row(), -tile_offset.column()));
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    iterator_.load(frag);
+  }
+
+  /// Loads a fragment from memory with an additional linear pointer offset
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset
+      Index pointer_offset) const {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory with an additional byte offset
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a byte offset
+      Index byte_offset) const {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset) const {
+    iterator_.load(frag, tile_offset);  // previously an empty body that left frag untouched
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// loads a tile with a logical offset AND a pointer offset
+      Index pointer_offset) const {
+    iterator_.load(frag, tile_offset, pointer_offset);  // previously an empty body
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles plus a byte offset.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// additional byte offset applied on top of the tile offset
+      Index byte_offset) const {
+    iterator_.load_with_byte_offset(
+      frag,
+      tile_offset,  // the wrapped iterator takes the same matrix coordinate type
+      byte_offset);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    iterator_.set_kgroup_index(k_group);
+  }
+};
+
+
+/// Wrapper for RowMajor: forwards every operation to MmaTensorOpMultiplicandTileIteratorCanonical
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Identifies A or B multiplicand
+    Operand Operand_,
+    /// Data type of elements
+    typename Element_,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions)
+    int OpDelta_,
+    /// Number of partitions along K dimension
+    int PartitionsK_>
+class MmaTensorOpMultiplicandTileIterator<
+    Shape_, Operand_, Element_,
+    cutlass::layout::RowMajor,
+    InstructionShape_, OpDelta_, 32, PartitionsK_> {
+ public:
+
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand_;
+
+  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
+    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::RowMajor;
+
+  /// Shape of one matrix product operation (concept: MatrixShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+  static int const kOpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Underlying tile iterator implementation
+  using Base = MmaTensorOpMultiplicandTileIteratorCanonical<
+      Shape, kOperand, Element,
+      layout::RowMajor,
+      InstructionShape,
+      kOpDelta, kThreads, PartitionsK_>;
+
+ public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = typename Base::Fragment;
+
+private:
+
+  /// Underlying tile iterator
+  Base iterator_;
+
+public:
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator() { }
+
+  /// Constructor from TensorRef (extent checks disabled in the wrapped iterator)
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator(
+    TensorRef const &ref,
+    int lane_id
+  ): iterator_({ref.data(), ref.stride()}, lane_id) {
+  }
+
+  /// Constructor from TensorRef and extent (the wrapped iterator zero-fills out-of-extent accesses)
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator(
+    TensorRef const &ref,
+    TensorCoord const &extent,
+    int lane_id
+  ): iterator_({ref.data(), ref.stride()}, extent, lane_id) {
+  }
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
+
+    iterator_.add_pointer_offset(offset);
+
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
+
+    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
+
+    return *this;
+  }
+
+  /// Pre-increment: advances the wrapped iterator by one tile along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator++() {
+
+    ++iterator_;
+
+    return *this;
+  }
+
+  /// Pre-decrement: steps the wrapped iterator back by one tile along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator--() {
+
+    --iterator_;
+
+    return *this;
+  }
+
+  /// Advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  /// Steps back in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
+    add_tile_offset(TensorCoord(-tile_offset.row(), -tile_offset.column()));
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    iterator_.load(frag);
+  }
+
+  /// Loads a fragment from memory with an additional linear pointer offset
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset
+      Index pointer_offset) const {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory with an additional byte offset
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a byte offset
+      Index byte_offset) const {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset) const {
+    iterator_.load(frag, tile_offset);  // previously an empty body that left frag untouched
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// loads a tile with a logical offset AND a pointer offset
+      Index pointer_offset) const {
+    iterator_.load(frag, tile_offset, pointer_offset);  // previously an empty body
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles plus a byte offset.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// additional byte offset applied on top of the tile offset
+      Index byte_offset) const {
+    iterator_.load_with_byte_offset(
+      frag,
+      tile_offset,  // the wrapped iterator takes the same matrix coordinate type
+      byte_offset);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    iterator_.set_kgroup_index(k_group);
+  }
+};
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sparse.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sparse.h
new file mode 100755
index 000000000..c4ed8bc98
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sparse.h
@@ -0,0 +1,380 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines iterators to load sparse meta data used by warp-level matrix multiply operations
+   targeting Sparse Tensor Cores.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/array.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/arch/memory_sm75.h"
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor_op_multiplicand_sm75.h"
+
+#include "cutlass/platform/platform.h"
+#include "cutlass/fast_math.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace warp {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Warp-level iterator that loads sparse metadata tiles via ldsm for Sparse Tensor Core MMAs.
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Data type of A elements
+    typename Element_,
+    /// Layout of operand
+    typename Layout_,
+    /// Shape of one matrix product operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+    int OpDelta_,
+    /// Number of threads participating in one matrix operation
+    int Threads,
+    /// Number of partitions along K dimension
+    int PartitionsK_ = 1>
+class SparseMmaTensorOpMetaTileIterator {
+ public:
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of source tile
+  using Layout = Layout_;
+
+  /// Shape of one matrix product operation (concept: GemmShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept:
+  /// MatrixShape)
+  static int const kOpDelta = OpDelta_;
+
+  /// Number of participating threads (fixed at 32; the Threads template parameter is not read)
+  static int const kThreads = 32;
+
+  /// Number of partitions along K dimension
+  static int const kPartitionsK = PartitionsK_;
+
+  static int const kSparse = 2;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Internal structure of iterator - made public to enable introspection
+  struct Policy {
+    static_assert(
+        !(Shape::kColumn % InstructionShape::kColumn),
+        "Shape of warp-level Mma must be divisible by operator shape.");
+    // Number of elements covered by one 128-bit access
+    static int const kElementsPerAccess = 128 / sizeof_bits<Element>::value;
+
+    // Determine number of elements along outer dimension per individual LDSM op
+    static int const kLdsmOpOuter = InstructionShape::kColumn;
+    static int const kLdsmOpInner = 8 * kElementsPerAccess / kLdsmOpOuter;
+
+    static_assert(!(Shape::kColumn % kLdsmOpOuter),
+                  "Shape of warp-level mma must be divisible by LDSM's "
+                  "fundamental tile size.");
+
+    static_assert(!(Shape::kRow % kLdsmOpInner),
+                  "Shape of warp-level mma must be divisible by LDSM's "
+                  "fundamental tile size.");
+
+    /// Shape of one individual LDSM instruction
+    static int const LdsmShapeColumn =
+        InstructionShape::kColumn / kLdsmOpOuter;
+    static int const LdsmShapeRow =
+        ((4 / LdsmShapeColumn * kLdsmOpInner) > Shape::kRow)
+            ? (Shape::kRow / kLdsmOpInner)
+            : (4 / LdsmShapeColumn);
+    using LdsmShape =
+        layout::PitchLinearShape<LdsmShapeRow, LdsmShapeColumn>;
+
+    /// Number and arrangement of LDSM instructions
+    using LdsmIterations = layout::PitchLinearShape<
+        Shape::kRow / kLdsmOpInner / LdsmShapeRow,
+        1>;
+
+    /// Number of groups for each tile
+    static int const kGroupsPerTile =
+        Shape::kColumn / InstructionShape::kColumn;
+  };
+
+ private:
+  /// Not working on this feature at the moment.
+  static_assert(kOpDelta == 1,
+                "Alternative arrangements not supported at present.");
+
+  /// Pointer type used for accesses
+  using AccessType = Array<Element, Policy::kElementsPerAccess>;
+
+ public:
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment =
+      Array<Element, Shape::kRow * InstructionShape::kColumn / kThreads>;
+
+ private:
+
+  /// Stride between strided accesses, in units of AccessType (see the TensorRef constructor)
+  Index stride_;
+
+  /// Shared memory base pointers - not advanced
+  AccessType const *pointer_;
+
+  /// Byte offset incremented as iterator advances
+  Index byte_offset_;
+
+  /// Internal counter of k-groups visited within the current tile; only
+  /// maintained by operator++ when kPartitionsK > 1
+  int k_group_idx_;
+
+ public:
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  SparseMmaTensorOpMetaTileIterator()
+      : stride_(0),
+        pointer_(nullptr),
+        byte_offset_(0),
+        k_group_idx_(0) {}
+
+  /// Constructor from TensorRef
+  CUTLASS_DEVICE
+  SparseMmaTensorOpMetaTileIterator(TensorRef const &ref, int lane_id)
+      : stride_(ref.stride(0) / Policy::kElementsPerAccess),
+        pointer_(reinterpret_cast<AccessType const *>(ref.data())),
+        byte_offset_(0),
+        k_group_idx_(0) {
+    // Split the lane id into (contiguous, strided) coordinates in access units.
+    int access_contiguous = (lane_id % (Shape::kRow / Policy::kElementsPerAccess));
+    int access_strided = (lane_id / (Shape::kRow / Policy::kElementsPerAccess));
+    // Convert this lane's access-unit offset into a byte offset.
+    byte_offset_ = (access_contiguous + access_strided * stride_) *
+                   sizeof_bits<Element>::value * Policy::kElementsPerAccess / 8;
+  }
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_DEVICE
+  SparseMmaTensorOpMetaTileIterator &add_pointer_offset(LongIndex offset) {
+    byte_offset_ += offset * sizeof_bits<Element>::value / 8;
+
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_DEVICE
+  SparseMmaTensorOpMetaTileIterator &add_tile_offset(
+      TensorCoord const &tile_offset) {
+    int offset = tile_offset.row() * Shape::kRow +
+                 tile_offset.column() * InstructionShape::kColumn * stride_ *
+                     Policy::kElementsPerAccess;
+
+    add_pointer_offset(offset);
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_DEVICE
+  SparseMmaTensorOpMetaTileIterator &operator++() {
+    add_tile_offset({0, 1});
+
+    if (kPartitionsK > 1) {
+      ++k_group_idx_;
+      // Jump to next stage
+      if (k_group_idx_ == Policy::kGroupsPerTile) {
+        k_group_idx_ = 0;
+        add_tile_offset(
+            {0, ((kPartitionsK - 1) * Policy::kGroupsPerTile)});
+      }
+    }
+
+    return *this;
+  }
+
+  /// Steps back by the byte distance of add_tile_offset({0, 1}); k_group_idx_ is not adjusted
+  CUTLASS_HOST_DEVICE
+  SparseMmaTensorOpMetaTileIterator &operator--(){
+    byte_offset_ -= stride_ * InstructionShape::kColumn *
+                    sizeof_bits<Element>::value * Policy::kElementsPerAccess / 8;
+    return *this;
+  }
+
+  /// advances in units of whole tiles along the logical coordinate space of
+  /// the tensor
+  CUTLASS_DEVICE SparseMmaTensorOpMetaTileIterator &
+  operator+=(TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  /// moves back in units of whole tiles along the logical coordinate space of
+  /// the tensor
+  CUTLASS_DEVICE
+  SparseMmaTensorOpMetaTileIterator &operator-=(
+      TensorCoord const &tile_offset) {
+    add_tile_offset(-tile_offset);
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const { load_with_byte_offset(frag, 0); }
+
+  /// Loads a fragment from memory with an additional byte offset
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset in units of bytes
+      Index byte_offset) const {
+    Array<unsigned, Policy::LdsmShape::kCount> *fetch_ptr =
+        reinterpret_cast<Array<unsigned, Policy::LdsmShape::kCount> *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < Policy::LdsmIterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < Policy::LdsmIterations::kContiguous; ++c) {
+
+        int access_idx = c + s * Policy::LdsmIterations::kContiguous;
+
+        AccessType const *source_ptr =
+            pointer_ +
+            Policy::LdsmShape::kContiguous * Policy::kLdsmOpInner * c +
+            Policy::LdsmShape::kStrided * s * stride_;
+
+        char const *source_byte_ptr = reinterpret_cast<char const *>(source_ptr) +
+                                      byte_offset + byte_offset_;
+
+        cutlass::arch::ldsm<layout::RowMajor, Policy::LdsmShape::kCount>(
+            fetch_ptr[access_idx], source_byte_ptr);
+      }
+    }
+  }
+
+  /// Loads a fragment from memory with an additional offset in units of elements
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset
+      Index pointer_offset) const {
+    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset) const {
+    load_with_byte_offset(frag, tile_offset, 0);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// loads a tile with a logical offset AND a pointer offset
+      Index pointer_offset) const {
+    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// additional offset in bytes, added on top of the tile offset
+      Index byte_offset) const {
+    Index pointer_offset = 
+      tile_offset.contiguous() * Shape::kRow / Layout::kElementsPerAccess + 
+      tile_offset.strided() * InstructionShape::kColumn * stride_;
+    // NOTE(review): reads Layout::kElementsPerAccess and contiguous()/strided(); confirm Layout provides them.
+    byte_offset += sizeof(AccessType) * pointer_offset;
+
+    load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    // no op
+  }
+};
+
+} // namespace warp
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_wmma.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_wmma.h
new file mode 100755
index 000000000..0da043e67
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_wmma.h
@@ -0,0 +1,805 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines iterators used by warp-level matrix multiply operations targeting Tensor Cores.
+*/
+
+#pragma once
+
+
+#include "cutlass/cutlass.h"
+#include "cutlass/arch/wmma.h"
+
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+
+#include "cutlass/wmma_array.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/arch/memory_sm75.h"
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor_op_multiplicand_sm75.h"
+
+#include "cutlass/platform/platform.h"
+#include "cutlass/fast_math.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace warp {
+
+////////////////////////////////////////////////////////////////////////////////
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Operand identity (A or B)
+    Operand Operand,
+    /// Data type of operand
+    typename Element_,
+    /// Layout of operand
+    typename Layout_,
+    /// Delta between *WMMA operations (in units of *WMMA operations, concept: MatrixShape)
+    int OpDelta_,
+    /// Number of threads participating in one matrix operation
+    int Threads,
+    /// Policy type (concept: MmaTensorOpPolicy); the specializations below read Policy::Operator
+    typename Policy_>
+class MmaTensorOpWmmaMultiplicandTileIterator;
+
+
+////////////////////////////////////////////////////////////////////////////////
+/// This tile iterator is specialized for 32-thread WMMA operation. 
+/// It uses nvcuda::wmma::load_matrix_sync to load from shared
+/// memory and therefore must be initialized with a TensorRef to shared memory. 
+///
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept
+////////////////////////////////////////////////////////////////////////////////
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Data type of elements
+    typename Element_,
+    /// Layout of operand
+    typename Layout_,
+    /// Interval between adjacent *WMMA instructions (in units of WMMA instructions)
+    int OpDelta_,    
+    /// Shape of the warp in units of thread (concept: MmaTensorOpPolicy)
+    typename Policy_>
+class MmaTensorOpWmmaMultiplicandTileIterator<
+    Shape_, Operand::kA, Element_, Layout_,
+    OpDelta_, 32, Policy_> {
+ public:
+
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand::kA;
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of source tile
+  using Layout = Layout_;
+
+  /// Delta between *WMMA operations
+  static int const kOpDelta = OpDelta_;
+
+  /// Wmma Operator information and operation delta
+  using Policy = Policy_;
+
+
+  //
+  // Derived quantities
+  //
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Stride Index type
+  using StrideIndex = typename TensorRef::Layout::Stride::Index;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Native Wmma shape for operand A (concept MatrixShape)
+  using WmmaShape = MatrixShape<
+    Policy::Operator::Shape::kM, 
+    Policy::Operator::Shape::kK
+  >;
+
+  /// Map cutlass data type to nvcuda::wmma data type
+  using WmmaDataType = typename cutlass::arch::CutlassToWmmaDataType<Element>::Type;
+
+  /// Shape of individual WMMA load / stores for operand A
+  using Iterations = MatrixShape<
+    Shape::kRow / WmmaShape::kRow,
+    1 
+  >;
+
+  /// Fragment object holding a warp's part
+  using Fragment = WmmaFragmentArray<typename Policy::Operator::FragmentA, Iterations::kCount>;
+
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  /// Static assertions guarding this specialization
+  /////////////////////////////////////////////////////////////////////////////////////////////////////
+  /// This iterator is specialized for Operand A
+  static_assert(kOperand == Operand::kA,
+    "MmaTensorOpWmmaMultiplicandTileIterator may only be instantiated for A operands to warp-level Mma.");
+
+  /// Supported memory layouts
+  static_assert(
+    platform::is_same<cutlass::layout::RowMajor, Layout>::value ||
+    platform::is_same<cutlass::layout::ColumnMajor, Layout>::value,
+    "Supported list of memory layouts for WMMA are: RowMajor, ColumnMajor");
+
+  /// Not working on this feature at the moment.
+  static_assert(kOpDelta == 1,
+    "Alternative arrangements not supported at present.");
+
+  /////////////////////////////////////////////////////////////////////////////////////////////////////
+
+private:
+
+  /// Shared memory base pointers - not advanced
+  char const *pointer_;
+  
+  /// Byte offset into shared memory - advanced
+  Index byte_offset_;
+  
+  /// Stride in units of number of elements
+  StrideIndex stride_;
+
+  /// Layout of shared memory
+  Layout layout_;
+
+public:
+  
+  /// Default ctor constructs null iterator (null pointer, zero offset, zero stride)
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpWmmaMultiplicandTileIterator(): pointer_(nullptr), byte_offset_(0), stride_(0) { }
+
+  /// Constructor from TensorRef (lane_id is not used by this specialization)
+  CUTLASS_DEVICE
+  MmaTensorOpWmmaMultiplicandTileIterator(
+    TensorRef const &ref, 
+    int lane_id
+  ): pointer_(reinterpret_cast<char const*>(ref.data())), byte_offset_(0), stride_(ref.stride(0)), layout_(ref.stride(0)) { 
+  
+  }
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_DEVICE
+  MmaTensorOpWmmaMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
+    byte_offset_ += (offset * sizeof_bits<Element>::value) / 8;
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpWmmaMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
+
+    Index elements_offset = layout_({tile_offset.row() * Shape::kRow, tile_offset.column() * WmmaShape::kColumn});
+    
+    byte_offset_ += (elements_offset * sizeof_bits<Element>::value) / 8;
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_DEVICE
+  MmaTensorOpWmmaMultiplicandTileIterator & operator++() {
+    
+    Index elements_offset = layout_({0, WmmaShape::kColumn});
+
+    byte_offset_ += (elements_offset * sizeof_bits<Element>::value) / 8;
+
+    return *this;
+  }
+
+  /// Advances the iterator along the opposite of the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpWmmaMultiplicandTileIterator & operator--() {
+    
+    Index elements_offset = layout_({0, WmmaShape::kColumn});
+
+    byte_offset_ -= (elements_offset * sizeof_bits<Element>::value) / 8;
+
+    return *this;
+  }
+
+  /// advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpWmmaMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  /// moves back in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpWmmaMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
+    add_tile_offset(-tile_offset);
+    return *this;
+  }
+
+  /// Loads a fragment from the iterator's location plus an additional byte offset.
+  CUTLASS_HOST_DEVICE
+  void load_with_byte_offset(Fragment &frag, Index byte_offset) const {
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int k = 0; k < Iterations::kColumn; ++k) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int m = 0; m < Iterations::kRow; ++m) {
+
+        Index load_byte_offset = layout_({m * WmmaShape::kRow, k * WmmaShape::kColumn}) * sizeof_bits<Element>::value / 8;
+
+        const WmmaDataType *ptr = reinterpret_cast<const WmmaDataType *>(pointer_ + byte_offset_ + load_byte_offset + byte_offset); 
+
+        nvcuda::wmma::load_matrix_sync(frag[m], ptr, stride_); 
+      
+      }
+    }
+  }
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    load_with_byte_offset(frag, 0);
+  }
+    
+  /// Stores a fragment to the iterator's location plus an additional byte offset.
+  CUTLASS_HOST_DEVICE
+  void store_with_byte_offset(Fragment const &frag, Index byte_offset) const {
+    
+    CUTLASS_PRAGMA_UNROLL
+    for (int k = 0; k < Iterations::kColumn; ++k) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int m = 0; m < Iterations::kRow; ++m) {
+
+        Index store_byte_offset = layout_({m * WmmaShape::kRow, k * WmmaShape::kColumn}) * sizeof_bits<Element>::value / 8;
+        // NOTE(review): pointer_ is `char const *`; this cast to a non-const WmmaDataType* needs confirming.
+        WmmaDataType *ptr = reinterpret_cast<WmmaDataType *>(pointer_ + byte_offset_ + store_byte_offset + byte_offset);
+
+        nvcuda::wmma::store_matrix_sync(ptr, frag[m], stride_); 
+      
+      }
+    }
+  }
+
+  /// Stores a fragment to memory at the location pointed to by the iterator
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag) const {
+    store_with_byte_offset(frag, 0);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    // no operation here
+  }
+};
+
+
+////////////////////////////////////////////////////////////////////////////////
+/// This tile iterator is specialized for 32-thread WMMA operation. 
+/// It uses nvcuda::wmma::load_matrix_sync to load from shared
+/// memory and therefore must be initialized with a TensorRef to shared memory. 
+///
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept
+///
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    ///< Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Data type of elements
+    typename Element_,
+    /// Layout of operand
+    typename Layout_,
+    /// Interval between adjacent *WMMA instructions (in units of WMMA instructions)
+    int OpDelta_,    
+    /// Shape of the warp in units of thread (concept: MmaTensorOpPolicy)
+    typename Policy_>
+class MmaTensorOpWmmaMultiplicandTileIterator<
+    Shape_, Operand::kB, Element_, Layout_,
+    OpDelta_, 32, Policy_> {
+ public:
+
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand::kB;
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of source tile
+  using Layout = Layout_;
+
+  /// Delta between *WMMA operations
+  static int const kOpDelta = OpDelta_;
+
+  /// Wmma Operator information and operation delta
+  using Policy = Policy_;
+
+
+  //
+  // Derived quantities
+  //
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Stride Index type
+  using StrideIndex = typename TensorRef::Layout::Stride::Index;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Native Wmma shape (concept MatrixShape)
+  using WmmaShape = MatrixShape<
+    Policy::Operator::Shape::kK, 
+    Policy::Operator::Shape::kN
+  >;
+
+  /// Map cutlass dataype to nvcuda::wmma datatype
+  using WmmaDataType = typename cutlass::arch::CutlassToWmmaDataType<Element>::Type;
+
+  /// Shape of individual WMMA load / stores for operand B
+  using Iterations = MatrixShape<
+    1,
+    Shape::kColumn / WmmaShape::kColumn
+  >;
+
+  /// Fragment object holding a warps part
+  using Fragment = WmmaFragmentArray<typename Policy::Operator::FragmentB, Iterations::kCount>;
+
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  /// statically asserts this specialization
+  /////////////////////////////////////////////////////////////////////////////////////////////////////
+  /// This iterator is specalized for Operand B
+  static_assert(kOperand == Operand::kB,
+    "MmaTensorOpWmmaMultiplicandTileIterator may only be instantiated for B operands to warp-level Mma.");
+
+  /// Supported memory layouts
+  static_assert(
+    platform::is_same<cutlass::layout::RowMajor, Layout>::value ||
+    platform::is_same<cutlass::layout::ColumnMajor, Layout>::value,
+    "Supported list of memory layouts for WMMA are: RowMajor, ColumnMajor");
+
+  /// Not working on this feature at the moment.
+  static_assert(kOpDelta == 1,
+    "Alternative arrangements not supported at present.");
+
+  /////////////////////////////////////////////////////////////////////////////////////////////////////
+
+private:
+
+  /// Shared memory base pointers - not advanced
+  char const *pointer_;
+  
+  /// Byte offset into shared memory - advanced
+  Index byte_offset_;
+  
+  /// Stride in units of number of elements
+  StrideIndex stride_;
+
+  /// Layout of shared memory
+  Layout layout_;
+
+public:
+  
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpWmmaMultiplicandTileIterator() { }
+
+  /// Constructor from TensorRef
+  CUTLASS_DEVICE
+  MmaTensorOpWmmaMultiplicandTileIterator(
+    TensorRef const &ref, 
+    int lane_id
+  ): pointer_(reinterpret_cast<char const*>(ref.data())), byte_offset_(0), stride_(ref.stride(0)), layout_(ref.stride(0)) {
+  }
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_DEVICE
+  MmaTensorOpWmmaMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
+    
+    byte_offset_ += (offset * sizeof_bits<Element>::value) / 8;
+
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpWmmaMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
+    
+    Index elements_offset = layout_({tile_offset.row() * WmmaShape::kRow, tile_offset.column() * Shape::kColumn});
+    
+    byte_offset_ += (elements_offset * sizeof_bits<Element>::value) / 8;
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_DEVICE
+  MmaTensorOpWmmaMultiplicandTileIterator & operator++() {
+    
+    Index elements_offset = layout_({WmmaShape::kRow, 0});
+
+    byte_offset_ += (elements_offset * sizeof_bits<Element>::value) / 8;
+    
+    return *this;
+  }
+
+  /// Advances the iterator along the opposite of the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpWmmaMultiplicandTileIterator & operator--() {
+
+    Index elements_offset = layout_({WmmaShape::kRow, 0});
+
+    byte_offset_ -= (elements_offset * sizeof_bits<Element>::value) / 8;
+    return *this;
+  }
+
+  ///< advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpWmmaMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  ///< advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpWmmaMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
+    add_tile_offset(-tile_offset);
+    return *this;
+  }
+
+  /// Loads all WMMA fragments of the tile from the current position displaced by byte_offset
+  CUTLASS_HOST_DEVICE
+  void load_with_byte_offset(Fragment &frag, Index byte_offset) const {
+    // Visit the tile one WMMA-shaped sub-tile at a time.
+    CUTLASS_PRAGMA_UNROLL
+    for (int tile_k = 0; tile_k < Iterations::kRow; ++tile_k) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int tile_n = 0; tile_n < Iterations::kColumn; ++tile_n) {
+        // Byte distance of this sub-tile from the iterator's current position.
+        Index const tile_bytes = layout_({tile_k * WmmaShape::kRow, tile_n * WmmaShape::kColumn}) * sizeof_bits<Element>::value / 8;
+        // NOTE(review): all offsets here are byte counts, so pointer_ is presumably a byte pointer - confirm.
+        WmmaDataType const *src = reinterpret_cast<WmmaDataType const *>(pointer_ + byte_offset_ + tile_bytes + byte_offset);
+        // The fragment slot is selected by the column sub-tile index only.
+        nvcuda::wmma::load_matrix_sync(frag[tile_n], src, stride_);
+      }
+    }
+  }
+  /// Loads the tile at the iterator's current position (no additional byte offset)
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    load_with_byte_offset(frag, Index(0));
+  }
+    
+  /// Stores all WMMA fragments of the tile to the current position displaced by byte_offset
+  CUTLASS_HOST_DEVICE
+  void store_with_byte_offset(Fragment const &frag, Index byte_offset) const {
+    // NOTE(review): CUDA documents store_matrix_sync for accumulator fragments with a layout argument - confirm this 3-argument call is ever instantiated.
+    CUTLASS_PRAGMA_UNROLL
+    for (int tile_k = 0; tile_k < Iterations::kRow; ++tile_k) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int tile_n = 0; tile_n < Iterations::kColumn; ++tile_n) {
+        // Byte distance of this sub-tile from the iterator's current position.
+        Index const tile_bytes = layout_({tile_k * WmmaShape::kRow, tile_n * WmmaShape::kColumn}) * sizeof_bits<Element>::value / 8;
+        // NOTE(review): all offsets here are byte counts, so pointer_ is presumably a byte pointer - confirm.
+        WmmaDataType *dst = reinterpret_cast<WmmaDataType *>(pointer_ + byte_offset_ + tile_bytes + byte_offset);
+        // The fragment slot is selected by the column sub-tile index only.
+        nvcuda::wmma::store_matrix_sync(dst, frag[tile_n], stride_);
+      }
+    }
+  }
+
+  /// Stores the tile at the iterator's current position (no additional byte offset)
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag) const {
+    store_with_byte_offset(frag, Index(0));
+  }
+
+  /// Tells the iterator which k-group it currently points to.
+  ///
+  /// The hook exists so an iterator can replace its internal tracking with a
+  /// constant-valued k-group index, letting the compiler fold constants; it
+  /// never advances the iterator.
+  ///
+  /// Some nontrivial permuted layouts rely on it; here the call does nothing.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    static_cast<void>(k_group);  // intentionally unused: no operation here
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+template <
+    /// Size of the tile the iterator accesses (concept: MatrixShape)
+    typename Shape_,
+    /// Element type
+    typename Element_,
+    /// Layout of operand in memory
+    typename Layout_,
+    /// Interval between adjacent *WMMA instructions (in units of WMMA instructions, concept: MatrixShape)
+    typename OpDelta_,
+    /// Policy supplying the WMMA operator as Policy_::Operator (concept: MmaTensorOpPolicy)
+    typename Policy_>
+class MmaTensorOpWmmaAccumulatorTileIterator;  // declaration only; the definition follows
+
+////////////////////////////////////////////////////////////////////////////////
+/// This tile iterator is specialized for 32-thread WMMA operation.
+/// It uses nvcuda::wmma::load_matrix_sync / store_matrix_sync to load from and store to
+/// shared memory and therefore must be initialized with a TensorRef to shared memory.
+///
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept |
+///   WriteableRandomAccessContiguousTileIteratorConcept
+///
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Size of the accumulator tile to access (concept: MatrixShape)
+    typename Shape_,
+    /// Data type of elements
+    typename Element_,
+    /// Layout of the tile in memory (RowMajor or ColumnMajor only)
+    typename Layout_,
+    /// Interval between adjacent *WMMA instructions (in units of WMMA instructions)
+    typename OpDelta_,
+    /// Policy supplying the WMMA operator as Policy_::Operator (concept: MmaTensorOpPolicy)
+    typename Policy_>
+class MmaTensorOpWmmaAccumulatorTileIterator
+{
+ public:
+
+  /// Shape of the tile accessed by one load/store (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Element type
+  using Element = Element_;
+
+  /// Memory layout of the tile
+  using Layout = Layout_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+  using OpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// Wmma operator information and operation delta
+  using Policy = Policy_;
+
+
+  //
+  // Derived quantities
+  //
+  /// TensorRef type used to address elements of the tile
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate of an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Native WMMA shape (concept: MatrixShape)
+  using WmmaShape = MatrixShape<
+    Policy::Operator::Shape::kM,
+    Policy::Operator::Shape::kN
+  >;
+
+  /// Maps the cutlass data type to the nvcuda::wmma data type
+  using WmmaDataType = typename cutlass::arch::CutlassToWmmaDataType<Element>::Type;
+
+  /// Maps cutlass::layout to the nvcuda::wmma::layout_t enum
+  static nvcuda::wmma::layout_t const WmmaLayout = cutlass::arch::CutlassToWmmaLayout<Layout>::value;
+
+  /// Number of WMMA-sized loads / stores needed to cover the tile
+  using Iterations = MatrixShape<
+    Shape::kRow / WmmaShape::kRow,
+    Shape::kColumn / WmmaShape::kColumn
+  >;
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = WmmaFragmentArray<typename Policy::Operator::FragmentC, Iterations::kCount>;
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  /// Static checks for this iterator
+  /////////////////////////////////////////////////////////////////////////////////////////////////////
+  /// Only the two plain matrix layouts are accepted
+  static_assert(
+    platform::is_same<cutlass::layout::RowMajor, Layout>::value ||
+    platform::is_same<cutlass::layout::ColumnMajor, Layout>::value,
+    "Supported list of memory layouts for WMMA are: RowMajor, ColumnMajor");
+
+private:
+
+  /// Reference to the tile's current position in memory
+  cutlass::TensorRef<Element, Layout> ref_;
+
+public:
+
+  /// Default ctor leaves the internal reference default-constructed
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpWmmaAccumulatorTileIterator() {}
+
+  /// Constructs from a TensorRef; lane_id is accepted but unused
+  CUTLASS_DEVICE
+  MmaTensorOpWmmaAccumulatorTileIterator(
+    TensorRef const &ref,
+    int lane_id
+  ): ref_(ref) {}
+
+  /// Adds a pointer offset by forwarding it to the internal reference
+  CUTLASS_DEVICE
+  MmaTensorOpWmmaAccumulatorTileIterator &add_pointer_offset(LongIndex offset) {
+    ref_.add_pointer_offset(offset);
+    return *this;
+  }
+
+  /// Moves the iterator by whole tiles: rows scale by Shape::kRow, columns by Shape::kColumn
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpWmmaAccumulatorTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
+    ref_.add_coord_offset({tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn});
+    return *this;
+  }
+
+  /// Pre-increment: moves forward by Shape::kRow rows
+  CUTLASS_DEVICE
+  MmaTensorOpWmmaAccumulatorTileIterator & operator++() {
+    ref_.add_coord_offset({Shape::kRow, 0});
+    return *this;
+  }
+
+  /// Pre-decrement: moves backward by Shape::kRow rows
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpWmmaAccumulatorTileIterator & operator--() {
+    ref_.add_coord_offset({-Shape::kRow, 0});
+    return *this;
+  }
+
+  /// Advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpWmmaAccumulatorTileIterator & operator+=(TensorCoord const &tile_offset) {
+    // add_tile_offset() hands back *this, so its result is returned directly.
+    return add_tile_offset(tile_offset);
+  }
+
+  /// Retreats in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpWmmaAccumulatorTileIterator & operator-=(TensorCoord const &tile_offset) {
+    // Negate the tile coordinate and reuse add_tile_offset().
+    return add_tile_offset(-tile_offset);
+  }
+
+  /// Loads every WMMA fragment of the tile from the current position plus pointer_offset elements
+  CUTLASS_HOST_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
+    // Visit the tile one WMMA-shaped sub-tile at a time.
+    CUTLASS_PRAGMA_UNROLL
+    for (int tile_m = 0; tile_m < Iterations::kRow; ++tile_m) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int tile_n = 0; tile_n < Iterations::kColumn; ++tile_n) {
+        // Fragments are indexed row-major over the (tile_m, tile_n) sub-tile grid.
+        int const frag_idx = tile_m * Iterations::kColumn + tile_n;
+        auto const tile_offset = ref_.offset({tile_m * WmmaShape::kRow, tile_n * WmmaShape::kColumn});
+        WmmaDataType const *src = reinterpret_cast<WmmaDataType const *>(ref_.data() + tile_offset + pointer_offset);
+        nvcuda::wmma::load_matrix_sync(frag[frag_idx], src, ref_.stride()[0], WmmaLayout);
+      }
+    }
+  }
+  /// Loads the tile at the iterator's current position (no additional pointer offset)
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    load_with_pointer_offset(frag, Index(0));
+  }
+
+  /// Stores every WMMA fragment of the tile to the current position plus pointer_offset elements
+  CUTLASS_HOST_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const {
+    // Visit the tile one WMMA-shaped sub-tile at a time.
+    CUTLASS_PRAGMA_UNROLL
+    for (int tile_m = 0; tile_m < Iterations::kRow; ++tile_m) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int tile_n = 0; tile_n < Iterations::kColumn; ++tile_n) {
+        int const frag_idx = tile_m * Iterations::kColumn + tile_n;
+        auto const tile_offset = ref_.offset({tile_m * WmmaShape::kRow, tile_n * WmmaShape::kColumn});
+        WmmaDataType *dst = reinterpret_cast<WmmaDataType *>(ref_.data() + tile_offset + pointer_offset);
+        nvcuda::wmma::store_matrix_sync(dst, frag[frag_idx], ref_.stride()[0], WmmaLayout);
+      }
+    }
+  }
+
+  /// Stores the tile at the iterator's current position (no additional pointer offset)
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag) const {
+    store_with_pointer_offset(frag, Index(0));
+  }
+
+  /// Tells the iterator which k-group it currently points to.
+  ///
+  /// The hook exists so an iterator can replace its internal tracking with a
+  /// constant-valued k-group index, letting the compiler fold constants; it
+  /// never advances the iterator.
+  ///
+  /// Some nontrivial permuted layouts rely on it; here the call does nothing.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    static_cast<void>(k_group);  // intentionally unused: no operation here
+  }
+};
+
+
+
+} // namespace warp
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
+
+#endif // if defined(CUTLASS_ARCH_WMMA_ENABLED)
+
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_wmma.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_wmma.h
new file mode 100755
index 000000000..971ad3b81
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_wmma.h
@@ -0,0 +1,223 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing warp-level matrix multiply-accumulate operations targeting
+      Tensor Cores.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/arch/wmma.h"
+
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+
+#include "cutlass/wmma_array.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/arch/memory_sm75.h"
+#include "cutlass/arch/mma_sm75.h"
+#include "cutlass/arch/mma_sm80.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/warp/mma.h"
+
+#include "cutlass/gemm/warp/mma_tensor_op_policy.h"
+
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_wmma.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace warp {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Warp-level matrix multiply-accumulate targeting Tensor Cores through the WMMA API.
+template <
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape_,
+  /// Data type of A elements
+  typename ElementA_,
+  /// Layout of A matrix (concept: MatrixLayout)
+  typename LayoutA_,
+  /// Data type of B elements
+  typename ElementB_,
+  /// Layout of B matrix (concept: MatrixLayout)
+  typename LayoutB_,
+  /// Element type of C matrix
+  typename ElementC_,
+  /// Layout of C matrix (concept: MatrixLayout)
+  typename LayoutC_,
+  /// Policy describing warp-level Wmma operation (concept: MmaTensorOpPolicy)
+  typename Policy_,
+  /// Number of partitions along K dimension
+  int PartitionsK_ = 1,
+  /// Used for partial specialization
+  typename Enable = bool
+>
+class MmaTensorOpWmma {
+public:
+  /// Shape of warp-level matrix operation (concept: GemmShape)
+  using Shape = Shape_;
+
+  /// Data type of multiplicand A
+  using ElementA = ElementA_;
+
+  /// Layout of multiplicand A
+  using LayoutA = LayoutA_;
+
+  /// Data type of multiplicand B
+  using ElementB = ElementB_;
+
+  /// Layout of multiplicand B
+  using LayoutB = LayoutB_;
+
+  /// Data type of accumulator matrix C
+  using ElementC = ElementC_;
+
+  /// Layout of accumulator matrix C
+  using LayoutC = LayoutC_;
+
+  /// Policy describing the warp-level Wmma operation (concept: MmaTensorOpPolicy)
+  using Policy = Policy_;
+
+  /// Underlying matrix multiply operator (concept: arch::Mma)
+  using ArchMmaOperator = typename Policy::Operator;
+
+  /// Underlying instruction shape
+  using InstructionShape = typename ArchMmaOperator::Shape;
+
+  /// Indicates math operator
+  using MathOperator = typename ArchMmaOperator::Operator;
+
+  /// Underlying architecture tag
+  using ArchTag = typename ArchMmaOperator::ArchTag;
+
+  /// Complex transform on A operand
+  static ComplexTransform const kTransformA = ComplexTransform::kNone;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB = ComplexTransform::kNone;
+
+  /// Indicates class of matrix operator
+  using OperatorClass = arch::OpClassWmmaTensorOp;
+
+  /// Number of threads participating in warp-level matrix product
+  static int const kThreadCount = 32;
+
+  /// Number of partitions along K dimension
+  static int const kPartitionsK = PartitionsK_;
+
+public:
+
+  /// Iterates over the A operand in memory
+  using IteratorA = MmaTensorOpWmmaMultiplicandTileIterator<
+     MatrixShape<Shape::kM, Shape::kK>, Operand::kA, ElementA, LayoutA,
+     Policy::OpDelta::kRow, kThreadCount, Policy>;
+
+  /// Storage for A tile
+  using FragmentA = typename IteratorA::Fragment;
+
+  /// Iterates over the B operand in memory
+  using IteratorB = MmaTensorOpWmmaMultiplicandTileIterator<
+     MatrixShape<Shape::kK, Shape::kN>, Operand::kB, ElementB, LayoutB,
+     Policy::OpDelta::kRow, kThreadCount, Policy>;
+
+  /// Storage for B tile
+  using FragmentB = typename IteratorB::Fragment;
+
+  /// Iterates over the C operand in memory
+  using IteratorC = MmaTensorOpWmmaAccumulatorTileIterator<
+     MatrixShape<Shape::kM, Shape::kN>, ElementC, LayoutC,
+    typename Policy::OpDelta, Policy>;
+
+  /// Storage for C tile
+  using FragmentC = typename IteratorC::Fragment;
+
+private:
+
+  static_assert(
+    (Shape::kM % InstructionShape::kM == 0) &&
+    (Shape::kN % InstructionShape::kN == 0),
+    "Shape of warp-level Wmma must be divisible by operator shape (wmma native size)");
+
+  /// Number of wmma operations performed along M (rows) and N (columns)
+  using WmmaIterations = MatrixShape<
+    Shape::kM / InstructionShape::kM,
+    Shape::kN / InstructionShape::kN
+  >;
+
+public:
+
+  /// Underlying matrix multiply operator (concept: cutlass::arch::Wmma)
+  ArchMmaOperator wmma;
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Ctor
+  CUTLASS_DEVICE
+  MmaTensorOpWmma() {}
+
+  /// Issues the WMMA operator (D, A, B, C) once per WMMA-sized sub-tile of the warp tile
+  CUTLASS_DEVICE
+  void operator()(
+    FragmentC &D,
+    FragmentA const &A,
+    FragmentB const &B,
+    FragmentC const &C) const {
+    // Columns form the outer loop; accumulator fragments are indexed row-major over (m, n).
+    CUTLASS_PRAGMA_UNROLL
+    for (int tile_n = 0; tile_n < WmmaIterations::kColumn; ++tile_n) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int tile_m = 0; tile_m < WmmaIterations::kRow; ++tile_m) {
+        int const acc_idx = tile_m * WmmaIterations::kColumn + tile_n;
+        // One WMMA multiply-accumulate on the (tile_m, tile_n) sub-tile.
+        wmma(D[acc_idx], A[tile_m], B[tile_n], C[acc_idx]);
+      }
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace gemm
+} // namespace cutlass
+
+#endif // if defined(CUTLASS_ARCH_WMMA_ENABLED)
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_with_reduction_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_with_reduction_tensor_op.h
new file mode 100755
index 000000000..67231d35a
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_with_reduction_tensor_op.h
@@ -0,0 +1,449 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing warp-level matrix multiply-accumulate operations targeting
+      Tensor Cores.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/platform/platform.h"
+
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/arch/memory_sm75.h"
+#include "cutlass/arch/mma_sm75.h"
+#include "cutlass/arch/mma_sm80.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/warp/mma.h"
+
+#include "cutlass/gemm/warp/mma_tensor_op_policy.h"
+#include "cutlass/gemm/warp/mma_tensor_op.h"
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h"
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace warp {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Warp-level Tensor Core matrix multiply-accumulate that also reduces operand A or B along K.
+template <
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape_,
+  /// Data type of A elements
+  typename ElementA_,
+  /// Layout of A matrix (concept: MatrixLayout)
+  typename LayoutA_,
+  /// Data type of B elements
+  typename ElementB_,
+  /// Layout of B matrix (concept: MatrixLayout)
+  typename LayoutB_,
+  /// Element type of C matrix
+  typename ElementC_,
+  /// Layout of C matrix (concept: MatrixLayout)
+  typename LayoutC_,
+  /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
+  typename Policy_,
+  /// True to reduce operand A along the K dimension, false to reduce operand B
+  bool ReduceKForA_,
+  /// Number of partitions along K dimension
+  int PartitionsK_ = 1,
+  /// Store the accumulators in row major or column major.  Row major is used
+  /// when output layout is interleaved.
+  bool AccumulatorsInRowMajor = false,
+  /// Used for partial specialization
+  typename Enable = bool
+>
+class MmaWithReductionTensorOp {
+public:
+  /// Shape of warp-level matrix operation (concept: GemmShape)
+  using Shape = Shape_;
+
+  /// Data type of multiplicand A
+  using ElementA = ElementA_;
+
+  /// Layout of multiplicand A
+  using LayoutA = LayoutA_;
+
+  /// Data type of multiplicand B
+  using ElementB = ElementB_;
+
+  /// Layout of multiplicand B
+  using LayoutB = LayoutB_;
+
+  /// Data type of accumulator matrix C
+  using ElementC = ElementC_;
+
+  /// Layout of accumulator matrix C
+  using LayoutC = LayoutC_;
+
+  /// Policy describing the warp-level MmaTensorOp (concept: MmaTensorOp policy)
+  using Policy = Policy_;
+
+  /// Underlying matrix multiply operator (concept: arch::Mma)
+  using ArchMmaOperator = typename Policy::Operator;
+
+  /// Indicates math operator
+  using MathOperator = typename ArchMmaOperator::Operator;
+
+  /// Architecture tag from underlying instruction
+  using ArchTag = typename ArchMmaOperator::ArchTag;
+
+  /// Indicates class of matrix operator
+  using OperatorClass = arch::OpClassTensorOp;
+
+  /// Shape of underlying instruction
+  using InstructionShape = typename ArchMmaOperator::Shape;
+
+  /// Complex transform on A operand
+  static ComplexTransform const kTransformA = ComplexTransform::kNone;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB = ComplexTransform::kNone;
+
+  /// Number of threads participating in warp-level matrix product
+  static int const kThreadCount = 32;
+
+  /// Number of partitions along K dimension
+  static int const kPartitionsK = PartitionsK_;
+  /// True when operand A is the one reduced along K; otherwise operand B is reduced
+  static bool const kReduceKForA = ReduceKForA_;
+
+  static_assert(platform::is_same<ElementA, cutlass::half_t>::value ||
+                platform::is_same<ElementA, cutlass::bfloat16_t>::value,
+                "ElementA needs to be fp16 or bf16.");
+
+  static_assert(platform::is_same<ElementB, cutlass::half_t>::value ||
+                platform::is_same<ElementB, cutlass::bfloat16_t>::value,
+                "ElementB needs to be fp16 or bf16.");
+
+  static_assert(platform::is_same<InstructionShape,
+                                  cutlass::gemm::GemmShape<16, 8, 16>>::value,
+                "Only supports 16x8x16 tensor core instruction.");
+
+  static_assert(!AccumulatorsInRowMajor,
+                "Only calls tensor core instructions in column major.");
+
+public:
+
+  /// Iterates over the A operand in memory
+  using IteratorA = MmaTensorOpMultiplicandTileIterator<
+     MatrixShape<Shape::kM, Shape::kK>, Operand::kA, ElementA, LayoutA,
+     MatrixShape<ArchMmaOperator::Shape::kM, ArchMmaOperator::Shape::kK>,
+     Policy::OpDelta::kRow, kThreadCount, kPartitionsK>;
+
+  /// Storage for A tile
+  using FragmentA = typename IteratorA::Fragment;
+
+  /// Storage for transformed A tile
+  using TransformedFragmentA =
+      Array<typename ArchMmaOperator::ElementA, FragmentA::kElements>;
+
+  /// Iterates over the B operand in memory
+  using IteratorB = MmaTensorOpMultiplicandTileIterator<
+      MatrixShape<Shape::kK, Shape::kN>, Operand::kB, ElementB, LayoutB,
+      MatrixShape<ArchMmaOperator::Shape::kK, ArchMmaOperator::Shape::kN>,
+      Policy::OpDelta::kRow, kThreadCount, kPartitionsK>;
+
+  /// Storage for B tile
+  using FragmentB = typename IteratorB::Fragment;
+
+  /// Storage for transformed B tile
+  using TransformedFragmentB =
+      Array<typename ArchMmaOperator::ElementB, FragmentB::kElements>;
+
+  /// Iterates over the C operand in memory
+  using IteratorC = MmaTensorOpAccumulatorTileIterator<
+     MatrixShape<Shape::kM, Shape::kN>, ElementC, LayoutC,
+     typename ArchMmaOperator::Shape, typename Policy::OpDelta>;
+
+  /// Storage for C tile
+  using FragmentC = typename IteratorC::Fragment;
+
+  /// Number of mma operations performed (rounded up along M and N)
+  using MmaIterations = MatrixShape<
+    (Shape::kM + ArchMmaOperator::Shape::kM - 1) / ArchMmaOperator::Shape::kM,
+    (Shape::kN + ArchMmaOperator::Shape::kN - 1) / ArchMmaOperator::Shape::kN
+  >;
+  /// Storage for the K-reduction sums: Shape::kM / 8 entries when reducing A, Shape::kN / 8 when reducing B
+  using FragmentReduction = Array<ElementC, kReduceKForA ? (Shape::kM / 8) : (Shape::kN / 8)>;
+
+public:
+
+  /// Underlying matrix multiply operator (concept: arch::Mma)
+  ArchMmaOperator mma;
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Ctor
+  CUTLASS_DEVICE
+  MmaWithReductionTensorOp() {}
+
+  /// Performs a warp-level multiply-accumulate and adds operand A or B values into gemm_k_reduction
+  CUTLASS_DEVICE
+  void operator()(
+    FragmentC &D,
+    TransformedFragmentA const &A,
+    TransformedFragmentB const &B,
+    FragmentC const &C,
+    FragmentReduction &gemm_k_reduction
+  ) const {
+    // gemm_k_reduction is updated in place (added to), never reset here.
+    using MmaOperandA = typename ArchMmaOperator::FragmentA;
+    using MmaOperandB = typename ArchMmaOperator::FragmentB;
+    using MmaOperandC = typename ArchMmaOperator::FragmentC;
+    // Accumulate in place: D starts as C and every mma below both reads and writes D.
+    D = C;
+
+    [[maybe_unused]] MmaOperandA const *ptr_A = reinterpret_cast<MmaOperandA const *>(&A);
+    [[maybe_unused]] MmaOperandB const *ptr_B = reinterpret_cast<MmaOperandB const *>(&B);
+    [[maybe_unused]] MmaOperandC *ptr_D = reinterpret_cast<MmaOperandC *>(&D);
+    // Only the __CUDA_ARCH__ >= 800 device path is implemented; every other path asserts.
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)
+      assert(0);
+    #elif defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+      // Serpentine visitation order maximizing reuse of Ra
+      CUTLASS_PRAGMA_UNROLL
+      for (int m = 0; m < MmaIterations::kRow; ++m) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int n = 0; n < MmaIterations::kColumn; ++n) {
+          // Odd rows walk the columns in reverse order.
+          int n_serpentine = ((m % 2) ? (MmaIterations::kColumn - 1 - n) : n);
+          // Accumulator fragments are indexed column-major: m + n * MmaIterations::kRow.
+          mma(ptr_D[m + n_serpentine * MmaIterations::kRow],
+              ptr_A[m],
+              ptr_B[n_serpentine],
+              ptr_D[m + n_serpentine * MmaIterations::kRow]);
+          // Reducing B: each column's B fragment is summed once, on the m == 0 pass.
+          if (!kReduceKForA && m == 0) {
+            #if 0
+            gemm_k_reduction[n_serpentine] += float(B[n_serpentine * 4]);
+            gemm_k_reduction[n_serpentine] += float(B[n_serpentine * 4 + 1]);
+            gemm_k_reduction[n_serpentine] += float(B[n_serpentine * 4 + 2]);
+            gemm_k_reduction[n_serpentine] += float(B[n_serpentine * 4 + 3]);
+            #else
+            uint32_t const *tmp = reinterpret_cast<uint32_t const *>(&B);
+            // Two 32-bit words are read; the PTX splits each into two 16-bit values and adds them as f32.
+            if (platform::is_same<ElementB, cutlass::half_t>::value) {
+              asm volatile(
+                "{\n\t"
+                " .reg .f16 low, high;\n\t"
+                " .reg .f32 tmp;\n\t"
+                " mov.b32 {low, high}, %1;\n\t"
+                " cvt.f32.f16 tmp, low;\n\t"
+                " add.f32 %0, tmp, %0;\n\t"
+                " cvt.f32.f16 tmp, high;\n\t"
+                " add.f32 %0, tmp, %0;\n\t"
+                " mov.b32 {low, high}, %2;\n\t"
+                " cvt.f32.f16 tmp, low;\n\t"
+                " add.f32 %0, tmp, %0;\n\t"
+                " cvt.f32.f16 tmp, high;\n\t"
+                " add.f32 %0, tmp, %0;\n\t"
+                "}\n\t"
+                : "+f"(gemm_k_reduction[n_serpentine])
+                : "r"(tmp[n_serpentine * 2]), "r"(tmp[n_serpentine * 2 + 1]));
+            } else if (platform::is_same<ElementB, cutlass::bfloat16_t>::value) {
+              asm volatile(
+                "{\n\t"
+                " .reg .f32 tmp;\n\t"
+                " shl.b32 tmp, %1, 16;\n\t"
+                " add.f32 %0, tmp, %0;\n\t"
+                " and.b32 tmp, %1, 0xffff0000;\n\t"
+                " add.f32 %0, tmp, %0;\n\t"
+                " shl.b32 tmp, %2, 16;\n\t"
+                " add.f32 %0, tmp, %0;\n\t"
+                " and.b32 tmp, %2, 0xffff0000;\n\t"
+                " add.f32 %0, tmp, %0;\n\t"
+                "}\n\t"
+                : "+f"(gemm_k_reduction[n_serpentine])
+              : "r"(tmp[n_serpentine * 2]), "r"(tmp[n_serpentine * 2 + 1]));
+            } else {
+                assert(0);
+            }
+            #endif
+          }
+          // Reducing A: each row's A fragment is summed once, on the n == 0 pass.
+          if (kReduceKForA && (n == 0)) {
+            #if 0
+            gemm_k_reduction[m * 2] += float(A[m * 8]);
+            gemm_k_reduction[m * 2] += float(A[m * 8 + 1]);
+            gemm_k_reduction[m * 2] += float(A[m * 8 + 4]);
+            gemm_k_reduction[m * 2] += float(A[m * 8 + 5]);
+
+            gemm_k_reduction[m * 2 + 1] += float(A[m * 8 + 2]);
+            gemm_k_reduction[m * 2 + 1] += float(A[m * 8 + 3]);
+            gemm_k_reduction[m * 2 + 1] += float(A[m * 8 + 6]);
+            gemm_k_reduction[m * 2 + 1] += float(A[m * 8 + 7]);
+            #else
+            uint32_t const *tmp = reinterpret_cast<uint32_t const *>(&A);
+            // Words 0 and 2 feed gemm_k_reduction[m * 2]; words 1 and 3 feed gemm_k_reduction[m * 2 + 1].
+            if (platform::is_same<ElementA, cutlass::half_t>::value) {
+              asm volatile(
+                "{\n\t"
+                " .reg .f16 low, high;\n\t"
+                " .reg .f32 tmp;\n\t"
+                " mov.b32 {low, high}, %2;\n\t"
+                " cvt.f32.f16 tmp, low;\n\t"
+                " add.f32 %0, tmp, %0;\n\t"
+                " cvt.f32.f16 tmp, high;\n\t"
+                " add.f32 %0, tmp, %0;\n\t"
+                " mov.b32 {low, high}, %3;\n\t"
+                " cvt.f32.f16 tmp, low;\n\t"
+                " add.f32 %1, tmp, %1;\n\t"
+                " cvt.f32.f16 tmp, high;\n\t"
+                " add.f32 %1, tmp, %1;\n\t"
+                " mov.b32 {low, high}, %4;\n\t"
+                " cvt.f32.f16 tmp, low;\n\t"
+                " add.f32 %0, tmp, %0;\n\t"
+                " cvt.f32.f16 tmp, high;\n\t"
+                " add.f32 %0, tmp, %0;\n\t"
+                " mov.b32 {low, high}, %5;\n\t"
+                " cvt.f32.f16 tmp, low;\n\t"
+                " add.f32 %1, tmp, %1;\n\t"
+                " cvt.f32.f16 tmp, high;\n\t"
+                " add.f32 %1, tmp, %1;\n\t"
+                "}\n\t"
+                : "+f"(gemm_k_reduction[m * 2]), "+f"(gemm_k_reduction[m * 2 + 1])
+                : "r"(tmp[m * 4]), "r"(tmp[m * 4 + 1]),"r"(tmp[m * 4 + 2]), "r"(tmp[m * 4 + 3]));
+
+            } else if (platform::is_same<ElementA, cutlass::bfloat16_t>::value) {
+              // bf16 to f32: shift the low half left by 16 bits, or mask off the low 16 bits for the high half.
+              asm volatile(
+                "{\n\t"
+                " .reg .f32 tmp;\n\t"
+                " shl.b32 tmp, %2, 16;\n\t"
+                " add.f32 %0, tmp, %0;\n\t"
+                " and.b32 tmp, %2, 0xffff0000;\n\t"
+                " add.f32 %0, tmp, %0;\n\t"
+                " shl.b32 tmp, %3, 16;\n\t"
+                " add.f32 %1, tmp, %1;\n\t"
+                " and.b32 tmp, %3, 0xffff0000;\n\t"
+                " add.f32 %1, tmp, %1;\n\t"
+                " shl.b32 tmp, %4, 16;\n\t"
+                " add.f32 %0, tmp, %0;\n\t"
+                " and.b32 tmp, %4, 0xffff0000;\n\t"
+                " add.f32 %0, tmp, %0;\n\t"
+                " shl.b32 tmp, %5, 16;\n\t"
+                " add.f32 %1, tmp, %1;\n\t"
+                " and.b32 tmp, %5, 0xffff0000;\n\t"
+                " add.f32 %1, tmp, %1;\n\t"
+                "}\n\t"
+                : "+f"(gemm_k_reduction[m * 2]), "+f"(gemm_k_reduction[m * 2 + 1])
+                : "r"(tmp[m * 4]), "r"(tmp[m * 4 + 1]),"r"(tmp[m * 4 + 2]), "r"(tmp[m * 4 + 3]));
+
+            } else {
+              assert(0);
+            }
+            #endif
+          }
+        }
+      }
+    #else
+      assert(0);
+    #endif
+  }
+
+  /// Converts the loaded A/B fragments to the element types used by the underlying mma operator
+  CUTLASS_DEVICE
+  void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B,
+                 FragmentA const &A, FragmentB const &B) const {
+
+    //
+    // Define conversions from source type to instruction type
+    //
+    FloatRoundStyle const kRoundA =
+        PreferredRoundingMode<typename ArchMmaOperator::ElementA,
+                              ElementA>::kRound;
+    FloatRoundStyle const kRoundB =
+        PreferredRoundingMode<typename ArchMmaOperator::ElementB,
+                              ElementB>::kRound;
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)
+      detail::ConvertAndPack<typename ArchMmaOperator::ElementA, ElementA,
+                            FragmentA::kElements, kRoundA>
+          convert_A;
+      NumericArrayConverter<typename ArchMmaOperator::ElementB, ElementB,
+                            FragmentB::kElements / 2, kRoundB>
+          convert_B;
+      Array<ElementB, FragmentB::kElements / 2> const *ptr_B =
+          reinterpret_cast<Array<ElementB, FragmentB::kElements / 2> const *>(&B);
+      Array<typename ArchMmaOperator::ElementB, FragmentB::kElements / 2> *
+          ptr_dst_B = reinterpret_cast<Array<typename ArchMmaOperator::ElementB,
+                                             FragmentB::kElements / 2> *>(&dst_B);
+      // Below SM80: A is converted in one call, B as two half-sized arrays.
+      dst_A = convert_A(A);
+
+      ptr_dst_B[0] = convert_B(ptr_B[0]);
+      ptr_dst_B[1] = convert_B(ptr_B[1]);
+
+    #elif defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+      detail::ConvertAndPack<typename ArchMmaOperator::ElementA, ElementA,
+                            FragmentA::kElements / 2, kRoundA>
+          convert_A;
+      NumericArrayConverter<typename ArchMmaOperator::ElementB, ElementB,
+                            FragmentB::kElements, kRoundB>
+          convert_B;
+      Array<ElementA, FragmentA::kElements / 2> const *ptr_A =
+          reinterpret_cast<Array<ElementA, FragmentA::kElements / 2> const *>(&A);
+      Array<typename ArchMmaOperator::ElementA, FragmentA::kElements / 2> *
+          ptr_dst_A = reinterpret_cast<Array<typename ArchMmaOperator::ElementA,
+                                             FragmentA::kElements / 2> *>(&dst_A);
+      // SM80 and newer: B is converted in one call, A as two half-sized arrays.
+      dst_B = convert_B(B);
+
+      ptr_dst_A[0] = convert_A(ptr_A[0]);
+      ptr_dst_A[1] = convert_A(ptr_A[1]);
+    #else
+      assert(0);
+    #endif
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/scale_bias_tile_iterator.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/scale_bias_tile_iterator.h
new file mode 100755
index 000000000..7d74ac8cf
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/scale_bias_tile_iterator.h
@@ -0,0 +1,572 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Defines iterators used by warp-level loading scale and bias vectors.
+   Every scale/bias data only needs to be loaded once for every channel.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/array.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/arch/memory_sm75.h"
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor_op_multiplicand_sm75.h"
+
+#include "cutlass/platform/platform.h"
+#include "cutlass/fast_math.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace warp {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Data type of elements
+    typename Element_,
+    /// Layout of operand
+    typename Layout_,
+    /// Shape of one matrix product operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Policy of the details of LDSM shape and iterations
+    typename Policy_,
+    /// Number of threads participating in one matrix operation
+    int Threads,
+    /// Number of partitions along K dimension
+    int PartitionsK_ = 1>
+class ScaleBiasTileIterator;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// This tile iterator is specialized for 32-thread TensorOps. It uses LDSM to
+/// load from shared memory and therefore must be initialized with a TensorRef
+/// to shared memory.
+///
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: PitchLinearShape)
+    typename Shape_,
+    /// Data type of elements
+    typename Element_,
+    /// Shape of one matrix product operation (concept: PitchLinearShape)
+    typename InstructionShape_,
+    /// Policy of the details of LDSM shape and iterations
+    typename Policy_,
+    /// Number of partitions along K dimension
+    int PartitionsK_>
+class ScaleBiasTileIterator<Shape_, Element_, cutlass::layout::PitchLinear,
+                             InstructionShape_, Policy_, 32, PartitionsK_> {
+ public:
+  /// Shape of tile to load (concept: PitchLinearShape)
+  using Shape = Shape_;
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::PitchLinear;
+
+  /// Shape of one matrix product operation (concept: PitchLinearShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// Number of partitions along K dimension
+  static int const kPartitionsK = PartitionsK_;
+
+  /// Number of elements held by one 128-bit access
+  static int const kElementsPerAccess = 128 / sizeof_bits<Element>::value;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Internal structure of iterator - made public to enable introspection
+  using Policy = Policy_;
+
+ private:
+
+  /// Pointer type used for accesses
+  using AccessType = Array<Element, kElementsPerAccess>;
+
+ public:
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = Array<Element, 2 * Policy::kLdsmOpInner *
+                                      InstructionShape::kContiguous / kThreads>;
+
+ private:
+
+  /// Base pointer; moved only by whole-tile steps in add_tile_offset()
+  AccessType const *pointer_;
+
+  /// Byte offset incremented as iterator advances
+  Index byte_offset_;
+
+  /// Counts k-groups consumed so far; operator++ resets it to zero and steps
+  /// to the next tile once kGroupsPerTile / kPartitionsK groups were visited
+  int k_group_idx_;
+
+ public:
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  ScaleBiasTileIterator()
+      : pointer_(nullptr),
+        byte_offset_(0),
+        k_group_idx_(0) {}
+
+  /// Constructor from TensorRef
+  CUTLASS_DEVICE
+  ScaleBiasTileIterator(TensorRef const &ref_scale_bias,
+                         int lane_id)
+      : byte_offset_(0), k_group_idx_(0) {
+    /// 16816 only
+    pointer_ = reinterpret_cast<AccessType const *>(ref_scale_bias.data()) +
+               ((lane_id >> 3) & 1) * Shape::kContiguous / kElementsPerAccess +
+               (lane_id >> 4);
+  }
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_DEVICE
+  ScaleBiasTileIterator &add_pointer_offset(LongIndex offset) {
+    byte_offset_ += offset * sizeof_bits<Element>::value / 8;
+
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_DEVICE
+  ScaleBiasTileIterator &add_tile_offset(
+      TensorCoord const &tile_offset) {
+    int whole_tiles = tile_offset.contiguous() / Policy::kGroupsPerTile;
+    int k_groups_delta = tile_offset.contiguous() % Policy::kGroupsPerTile;
+
+    byte_offset_ += k_groups_delta * sizeof_bits<Element>::value *
+                    kElementsPerAccess * Policy::LdsmShape::kContiguous / 8;
+
+    // Multiply by 2 because scale and bias belonging to the same stage are next
+    // to each other in the shared memory.
+    pointer_ += (2 * whole_tiles * Shape::kContiguous / kElementsPerAccess);
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_DEVICE
+  ScaleBiasTileIterator &operator++() {
+    byte_offset_ += Policy::LdsmShape::kContiguous *
+                    sizeof_bits<Element>::value * kElementsPerAccess / 8;
+
+    k_group_idx_++;
+
+    if (k_group_idx_ == (Policy::kGroupsPerTile / kPartitionsK)) {
+      k_group_idx_ = 0;
+      byte_offset_ -= (Policy::kGroupsPerTile / kPartitionsK) *
+                      Policy::LdsmShape::kContiguous *
+                      sizeof_bits<Element>::value * kElementsPerAccess / 8;
+      add_tile_offset({Policy::kGroupsPerTile, 0});
+    }
+
+    return *this;
+  }
+
+  /// Reverse advance is not implemented: asserts, then returns *this unchanged
+  CUTLASS_HOST_DEVICE
+  ScaleBiasTileIterator &operator--() { assert(0); return *this; }
+
+  ///< advances in units of whole tiles along the logical coordinate space of
+  ///< the tensor
+  CUTLASS_DEVICE
+  ScaleBiasTileIterator &operator+=(
+      TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  ///< advances in units of whole tiles along the logical coordinate space of
+  ///< the tensor
+  CUTLASS_DEVICE
+  ScaleBiasTileIterator &operator-=(
+      TensorCoord const &tile_offset) {
+    add_tile_offset(-tile_offset);
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const { load_with_byte_offset(frag, 0); }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset in units of bytes
+      Index byte_offset) const {
+    Array<unsigned, 4> *fetch_ptr =
+        reinterpret_cast<Array<unsigned, 4> *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < 1; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < Policy::LdsmIterations::kContiguous; ++c) {
+        int access_idx = c + s * Policy::LdsmIterations::kContiguous;
+
+        AccessType const *source_ptr =
+            pointer_ + Policy::LdsmShape::kContiguous * c;
+
+        char const *source_byte_ptr =
+            reinterpret_cast<char const *>(source_ptr) + byte_offset +
+            byte_offset_;
+
+        cutlass::arch::ldsm<layout::RowMajor, 4>(
+            fetch_ptr[access_idx], source_byte_ptr);
+      }
+    }
+  }
+
+  /// Loads a fragment with an extra offset given in units of elements
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset
+      Index pointer_offset) const {
+    load_with_byte_offset(frag, pointer_offset * sizeof_bits<Element>::value / 8);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset) const {
+    load_with_byte_offset(frag, tile_offset, 0);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// loads a tile with a logical offset AND a pointer offset (in elements)
+      Index pointer_offset) const {
+    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof_bits<Element>::value / 8);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// loads a tile with a logical offset AND a byte offset
+      Index byte_offset) const {
+    Index pointer_offset = tile_offset.contiguous() *
+                               InstructionShape::kContiguous /
+                               kElementsPerAccess;
+
+    byte_offset += sizeof_bits<AccessType>::value * pointer_offset / 8;
+
+    load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    k_group_idx_ = k_group % (Policy::kGroupsPerTile / kPartitionsK);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// This tile iterator is specialized for 32-thread TensorOps. It uses LDSM to
+/// load from shared memory and therefore must be initialized with a TensorRef
+/// to shared memory.
+///
+/// Satisfies:
+///   ReadableRandomAccessContiguousTileIteratorConcept
+///
+template <
+    /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Data type of elements
+    typename Element_,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Policy of the details of LDSM shape and iterations
+    typename Policy_,
+    /// Number of partitions along K dimension
+    int PartitionsK_>
+class ScaleBiasTileIterator<Shape_, Element_, cutlass::layout::RowMajor,
+                             InstructionShape_, Policy_, 32, PartitionsK_> {
+ public:
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::RowMajor;
+
+  /// Shape of one matrix product operation (concept: MatrixShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Internal structure of iterator - made public to enable introspection
+  using Policy = Policy_;
+
+  /// Underlying tile iterator implementation (column -> contiguous, row -> strided)
+  using Base = ScaleBiasTileIterator<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
+      layout::PitchLinear,
+      layout::PitchLinearShape<InstructionShape::kColumn,
+                               InstructionShape::kRow>,
+      Policy, kThreads, PartitionsK_>;
+
+ public:
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = typename Base::Fragment;
+
+ private:
+  /// Underlying tile iterator
+  Base iterator_;
+
+ public:
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  ScaleBiasTileIterator() {}
+
+  /// Constructor from TensorRef
+  CUTLASS_HOST_DEVICE
+  ScaleBiasTileIterator(TensorRef const &ref_scale_bias, int lane_id)
+      : iterator_({ref_scale_bias.data(), ref_scale_bias.stride()}, lane_id) {}
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  ScaleBiasTileIterator &add_pointer_offset(LongIndex offset) {
+    iterator_.add_pointer_offset(offset);
+
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  ScaleBiasTileIterator &add_tile_offset(
+      TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
+
+    return *this;
+  }
+
+  /// Same as add_tile_offset(). Base declares no add_tile_offset_negative, and
+  /// its add_tile_offset is plain signed arithmetic, so forward to that instead
+  CUTLASS_DEVICE
+  ScaleBiasTileIterator &add_tile_offset_negative(
+      TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  ScaleBiasTileIterator &operator++() {
+    ++iterator_;
+
+    return *this;
+  }
+
+  /// Forwards to Base::operator--, which is unimplemented and asserts
+  CUTLASS_HOST_DEVICE
+  ScaleBiasTileIterator &operator--() {
+    --iterator_;
+
+    return *this;
+  }
+
+  ///< advances in units of whole tiles along the logical coordinate space of
+  ///< the tensor
+  CUTLASS_DEVICE
+  ScaleBiasTileIterator &operator+=(
+      TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  ///< moves backwards in units of whole tiles along the logical coordinate
+  ///< space of the tensor
+  CUTLASS_DEVICE
+  ScaleBiasTileIterator &operator-=(
+      TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({-tile_offset.column(), -tile_offset.row()});
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const { iterator_.load(frag); }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset
+      Index pointer_offset) const {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset
+      Index byte_offset) const {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Not implemented for this layout: asserts and leaves frag untouched.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset) const {
+    assert(0);
+  }
+
+  /// Not implemented for this layout: asserts and leaves frag untouched.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// loads a tile with a logical offset AND a pointer offset
+      Index pointer_offset) const {
+    assert(0);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// loads a tile with a logical offset AND a byte offset
+      Index byte_offset) const {
+    iterator_.load_with_byte_offset(
+        frag, {tile_offset.column(), tile_offset.row()}, byte_offset);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    iterator_.set_kgroup_index(k_group);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace gemm 
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/softmax_scale_bias_transform.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/softmax_scale_bias_transform.h
new file mode 100755
index 000000000..d8d99d675
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/softmax_scale_bias_transform.h
@@ -0,0 +1,117 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing warp-level per-channel softmax before
+   matrix multiply-accumulate operations targeting Tensor Cores.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/platform/platform.h"
+
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/arch/memory_sm75.h"
+#include "cutlass/arch/mma_sm75.h"
+#include "cutlass/arch/mma_sm80.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/warp/mma.h"
+
+#include "cutlass/gemm/warp/mma_tensor_op_policy.h"
+
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h"
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace warp {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename FragmentActivations, typename FragmentNormSum>
+struct SoftmaxScaleBiasTransform {
+  /// Element type stored in the activation fragment
+  using T = typename FragmentActivations::Element;
+
+  static int const NumActivations = FragmentActivations::kElements;
+  static int const NumNormSum = FragmentNormSum::kElements;
+  static int const MmaElements = 2;
+  // One element has one scale and one bias
+  static int const MmaScaleBiasPair = 2;
+  // 16816 has 2 columns and 2 rows
+  static int const MmaCols = 2;
+  static int const MmaRows = 2;
+
+  using MmaOperand = Array<T, MmaElements>;
+  using NormSumOperand = Array<__half2, MmaScaleBiasPair>;
+
+  /// In place, per packed pair p: a = h2exp(a - norm_sum[2p]) * norm_sum[2p + 1]
+  CUTLASS_DEVICE
+  void transform(MmaOperand &activations, NormSumOperand const &norm_sum) {
+    // Reinterpret the operand storage as packed __half2 values
+    __half2 *halves = reinterpret_cast<__half2 *>(&activations);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int p = 0; p < MmaElements / 2; ++p) {
+      __half2 const shifted = __hsub2(halves[p], norm_sum[2 * p]);
+      halves[p] = __hmul2(::h2exp(shifted), norm_sum[2 * p + 1]);
+    }
+  }
+
+  /// Runs transform() on each MmaElements-wide slice of the activation fragment
+  CUTLASS_DEVICE
+  void operator()(FragmentActivations &activations,
+                  FragmentNormSum const &norm_sum) {
+    MmaOperand *operands = reinterpret_cast<MmaOperand *>(&activations);
+    NormSumOperand const *pairs =
+        reinterpret_cast<NormSumOperand const *>(&norm_sum);
+    CUTLASS_PRAGMA_UNROLL
+    for (int op = 0; op < (NumActivations / MmaElements); ++op) {
+      int const pair = op / (MmaCols * MmaRows) * MmaRows + op % MmaRows;
+      transform(operands[op], pairs[pair]);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/tile_iterator_planar_complex.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/tile_iterator_planar_complex.h
new file mode 100755
index 000000000..42c6728bc
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/tile_iterator_planar_complex.h
@@ -0,0 +1,250 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing warp-level matrix multiply-accumulate operations.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/array_planar_complex.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace warp {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename TileIterator_>
+class TileIteratorPlanarComplex {
+public:
+
+  /// Underlying iterator over real-valued tiles
+  using TileIterator = TileIterator_;
+
+  /// Underlying element type
+  using Element = typename TileIterator::Element;
+
+  /// Underlying layout type
+  using Layout = typename TileIterator::Layout;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = typename TileIterator::TensorRef;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Planar complex fragment
+  using Fragment = ArrayPlanarComplex<Element, TileIterator::Fragment::kElements>;
+
+public:
+
+  /// Underlying tile iterator
+  TileIterator tile_iterator_;
+
+  /// Offset (in units of bytes) to the imaginary part of the planar complex matrix
+  LongIndex imaginary_offset_;
+
+public:
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  TileIteratorPlanarComplex(): imaginary_offset_(0) { }
+
+  /// Constructor from TensorRef; imaginary_offset is given in elements and stored in bytes
+  CUTLASS_DEVICE
+  TileIteratorPlanarComplex(
+    TensorRef const &ref, 
+    int lane_id,
+    LongIndex imaginary_offset
+  ):
+    tile_iterator_(ref, lane_id),
+    imaginary_offset_((imaginary_offset * sizeof_bits<Element>::value) / 8) { }
+
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_DEVICE
+  TileIteratorPlanarComplex &add_pointer_offset(LongIndex offset) {
+
+    tile_iterator_.add_pointer_offset(offset);
+
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  TileIteratorPlanarComplex &add_tile_offset(TensorCoord const &tile_offset) {
+
+    tile_iterator_.add_tile_offset(tile_offset);
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_DEVICE
+  TileIteratorPlanarComplex & operator++() {
+    ++tile_iterator_;
+    return *this;
+  }
+
+  //
+  // WIP
+  //
+
+  /// Advances the iterator along the opposite of the advance dimension
+  CUTLASS_HOST_DEVICE
+  TileIteratorPlanarComplex & operator--() {
+    --tile_iterator_;
+    return *this;
+  }
+
+  ///< advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  TileIteratorPlanarComplex & operator+=(TensorCoord const &tile_offset) {
+    tile_iterator_.add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  ///< advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  TileIteratorPlanarComplex & operator-=(TensorCoord const &tile_offset) {
+    tile_iterator_.add_tile_offset(-tile_offset);
+    return *this;
+  }
+
+  /// Loads real and imaginary planes at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+
+    tile_iterator_.load_with_byte_offset(frag.real, 0);
+    tile_iterator_.load_with_byte_offset(frag.imag, imaginary_offset_);
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset in units of bytes
+      Index byte_offset) const {
+
+    tile_iterator_.load_with_byte_offset(frag.real, byte_offset);
+    tile_iterator_.load_with_byte_offset(frag.imag, byte_offset + imaginary_offset_);
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset
+      Index pointer_offset) const {
+
+    Index byte_offset = (pointer_offset * sizeof_bits<Element>::value)/8;
+
+    tile_iterator_.load_with_byte_offset(frag.real, byte_offset);
+    tile_iterator_.load_with_byte_offset(frag.imag, byte_offset + imaginary_offset_);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset) const {
+
+    tile_iterator_.load_with_byte_offset(frag.real, tile_offset, 0);
+    tile_iterator_.load_with_byte_offset(frag.imag, tile_offset, imaginary_offset_);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// loads a tile with a logical offset AND a pointer offset
+      Index pointer_offset) const {
+
+    Index byte_offset = (pointer_offset * sizeof_bits<Element>::value)/8;
+    // The imaginary plane goes to frag.imag, matching every other load overload.
+    tile_iterator_.load_with_byte_offset(frag.real, tile_offset, byte_offset);
+    tile_iterator_.load_with_byte_offset(frag.imag, tile_offset, byte_offset + imaginary_offset_);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// loads a tile with a logical offset AND a byte offset
+      Index byte_offset) const {
+
+    tile_iterator_.load_with_byte_offset(frag.real, tile_offset, byte_offset);
+    tile_iterator_.load_with_byte_offset(frag.imag, tile_offset, byte_offset + imaginary_offset_);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// This is used by some nontrivial permuted layouts.
+  CUTLASS_DEVICE
+  void set_kgroup_index(int k_group) {
+    tile_iterator_.set_kgroup_index(k_group);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm_coord.h b/lightllm-kernel/cutlass/include/cutlass/gemm_coord.h
new file mode 100755
index 000000000..61b97a1e1
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm_coord.h
@@ -0,0 +1,394 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include "cutlass/coord.h"
+
+namespace cutlass {
+namespace gemm {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Compile-time extents (M, N, K) of a matrix multiply-add operation.
+template <
+  /// Number of rows of the matrix product
+  int M = 1,
+  /// Number of columns of the matrix product
+  int N = 1,
+  /// Extent of the inner dimension of the matrix product
+  int K = 1
+>
+struct GemmShape {
+  // Individual extents.
+  static int const kM = M;
+  static int const kN = N;
+  static int const kK = K;
+  // Pairwise and full products of the extents.
+  static int const kMN = kM * kN;
+  static int const kMK = kM * kK;
+  static int const kKN = kK * kN;
+  static int const kMNK = kMN * kK;
+  // Total element count; same value as kMNK.
+  static int const kCount = kM * kN * kK;
+
+  //
+  // Static member functions
+  //
+
+  /// Packs (kM, kN, kK) into a rank-3 Coord.
+  CUTLASS_HOST_DEVICE
+  static Coord<3> toCoord() {
+    return make_Coord(M, N, K);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Type alias of the transpose of a GemmShape: M and N are swapped, K is unchanged
+template <
+  /// concept: GemmShape (must expose kM, kN and kK)
+  typename Shape
+>
+using GemmShapeTranspose = GemmShape<Shape::kN, Shape::kM, Shape::kK>;
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// GemmCoord is a structure derived from Coord<3> that specifies a location within the
+/// coordinate space of a GEMM problem. Elements are stored in (M, N, K) order.
+struct GemmCoord : public Coord<3, int> {
+
+  /// Integer-valued index
+  typedef int Index;
+
+  /// Base type is a Coord of rank=3
+  typedef Coord<3, Index> Base;
+
+  /// Position of the GEMM M dimension (rows of the output C matrix) within the coordinate
+  static int const kM = 0;
+
+  /// Position of the GEMM N dimension (columns of the output C matrix) within the coordinate
+  static int const kN = 1;
+
+  /// Position of the GEMM K dimension (inner dimension of the GEMM problem) within the coordinate
+  static int const kK = 2;
+
+  //
+  // Methods
+  //
+
+  /// Default ctor (the Base sub-object is default-constructed)
+  CUTLASS_HOST_DEVICE
+  GemmCoord() { }
+
+  /// Constructs from a Coord<3> whose elements are taken as (m, n, k)
+  CUTLASS_HOST_DEVICE
+  GemmCoord(Coord<3, Index> const& coord): Base(make_Coord(coord[0], coord[1], coord[2])) { }
+
+  /// Helper to construct from separate M, N, and K values
+  CUTLASS_HOST_DEVICE
+  GemmCoord(Index m, Index n, Index k): Base(make_Coord(m, n, k)) { }
+
+  /// Returns the GEMM M coordinate (read-only)
+  CUTLASS_HOST_DEVICE
+  Index const&  m() const { return this->at(kM); }
+
+  /// Returns a mutable reference to the GEMM M coordinate
+  CUTLASS_HOST_DEVICE
+  Index & m() { return this->at(kM); }
+
+  /// Returns the GEMM N coordinate (read-only)
+  CUTLASS_HOST_DEVICE
+  Index const&  n() const { return this->at(kN); }
+
+  /// Returns a mutable reference to the GEMM N coordinate
+  CUTLASS_HOST_DEVICE
+  Index & n() { return this->at(kN); }
+
+  /// Returns the GEMM K coordinate (read-only)
+  CUTLASS_HOST_DEVICE
+  Index const&  k() const { return this->at(kK); }
+
+  /// Returns a mutable reference to the GEMM K coordinate
+  CUTLASS_HOST_DEVICE
+  Index & k() { return this->at(kK); }
+
+  /// Obtains a Coord<3> in (m, n, k) order
+  CUTLASS_HOST_DEVICE
+  Coord<3> mnk() const {
+    return make_Coord(m(), n(), k());
+  }
+
+  /// Obtains a Coord<3> in (k, n, m) order
+  CUTLASS_HOST_DEVICE
+  Coord<3> knm() const {
+    return make_Coord(k(), n(), m());
+  }
+
+  /// Obtains a Coord<2> in (n, m) order
+  CUTLASS_HOST_DEVICE
+  Coord<2> nm() const {
+    return make_Coord(n(), m());
+  }
+
+  /// Obtains a Coord<2> in (m, n) order
+  CUTLASS_HOST_DEVICE
+  Coord<2> mn() const {
+    return make_Coord(m(), n());
+  }
+
+  /// Obtains a Coord<2> in (m, k) order
+  CUTLASS_HOST_DEVICE
+  Coord<2> mk() const {
+    return make_Coord(m(), k());
+  }
+
+  /// Obtains a Coord<2> in (k, m) order
+  CUTLASS_HOST_DEVICE
+  Coord<2> km() const {
+    return make_Coord(k(), m());
+  }
+
+  /// Obtains a Coord<2> in (n, k) order
+  CUTLASS_HOST_DEVICE
+  Coord<2> nk() const {
+    return make_Coord(n(), k());
+  }
+
+  /// Obtains a Coord<2> in (k, n) order
+  CUTLASS_HOST_DEVICE
+  Coord<2> kn() const {
+    return make_Coord(k(), n());
+  }
+
+  //
+  // Coord operators (each forwards to the Base operator and re-wraps the result as GemmCoord)
+  //
+
+  /// Element-wise addition
+  CUTLASS_HOST_DEVICE
+  GemmCoord operator+(Base const& b) const {
+    return GemmCoord(Base::operator+(b));
+  }
+
+  /// Element-wise subtraction
+  CUTLASS_HOST_DEVICE
+  GemmCoord operator-(Base const& b) const {
+    return GemmCoord(Base::operator-(b));
+  }
+
+  /// Element-wise multiplication
+  CUTLASS_HOST_DEVICE
+  GemmCoord operator*(Base const& b) const {
+    return GemmCoord(Base::operator*(b));
+  }
+
+  /// Element-wise division
+  CUTLASS_HOST_DEVICE
+  GemmCoord operator/(Base const& b) const {
+    return GemmCoord(Base::operator/(b));
+  }
+
+  /// In-place addition; returns *this
+  CUTLASS_HOST_DEVICE
+  GemmCoord& operator+=(Base const& b) {
+    Base::operator+=(b);
+    return *this;
+  }
+
+  /// In-place subtraction; returns *this
+  CUTLASS_HOST_DEVICE
+  GemmCoord& operator-=(Base const& b) {
+    Base::operator-=(b);
+    return *this;
+  }
+
+  /// In-place multiplication; returns *this
+  CUTLASS_HOST_DEVICE
+  GemmCoord& operator*=(Base const& b) {
+    Base::operator*=(b);
+    return *this;
+  }
+
+  /// In-place division; returns *this
+  CUTLASS_HOST_DEVICE
+  GemmCoord& operator/=(Base const& b) {
+    Base::operator/=(b);
+    return *this;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// BatchedGemmCoord is a structure derived from Coord<4> that specifies a location within the
+/// coordinate space of a batched GEMM problem. Elements are stored in (M, N, K, batch) order.
+struct BatchedGemmCoord : public Coord<4, int> {
+
+  /// Integer-valued index
+  typedef int Index;
+
+  /// Base type is a Coord of rank=4
+  typedef Coord<4, Index> Base;
+
+  /// Position of the GEMM M dimension (rows of the output C matrix) within the coordinate
+  static int const kM = 0;
+
+  /// Position of the GEMM N dimension (columns of the output C matrix) within the coordinate
+  static int const kN = 1;
+
+  /// Position of the GEMM K dimension (inner dimension of the GEMM problem) within the coordinate
+  static int const kK = 2;
+
+  /// Position of the batch dimension within the coordinate
+  static int const kBatch = 3;
+
+  //
+  // Methods
+  //
+
+  /// Default ctor (the Base sub-object is default-constructed)
+  CUTLASS_HOST_DEVICE
+  BatchedGemmCoord() { }
+
+  /// Constructs from a Coord<4> whose elements are taken as (m, n, k, batch)
+  CUTLASS_HOST_DEVICE
+  BatchedGemmCoord(Base const& coord): Base(coord) { }
+
+  /// Helper to construct from separate M, N, K, and batch values
+  CUTLASS_HOST_DEVICE
+  BatchedGemmCoord(Index m, Index n, Index k, Index b): Base(make_Coord(m, n, k, b)) { }
+
+  /// Returns the GEMM M coordinate (read-only)
+  CUTLASS_HOST_DEVICE
+  Index const&  m() const { return this->at(kM); }
+
+  /// Returns a mutable reference to the GEMM M coordinate
+  CUTLASS_HOST_DEVICE
+  Index & m() { return this->at(kM); }
+
+  /// Returns the GEMM N coordinate (read-only)
+  CUTLASS_HOST_DEVICE
+  Index const&  n() const { return this->at(kN); }
+
+  /// Returns a mutable reference to the GEMM N coordinate
+  CUTLASS_HOST_DEVICE
+  Index & n() { return this->at(kN); }
+
+  /// Returns the GEMM K coordinate (read-only)
+  CUTLASS_HOST_DEVICE
+  Index const&  k() const { return this->at(kK); }
+
+  /// Returns a mutable reference to the GEMM K coordinate
+  CUTLASS_HOST_DEVICE
+  Index & k() { return this->at(kK); }
+
+  /// Returns the GEMM batch coordinate (read-only)
+  CUTLASS_HOST_DEVICE
+  Index const&  batch() const { return this->at(kBatch); }
+
+  /// Returns a mutable reference to the GEMM batch coordinate
+  CUTLASS_HOST_DEVICE
+  Index & batch() { return this->at(kBatch); }
+
+  /// Obtains a GemmCoord (m, n, k) from BatchedGemmCoord, dropping the batch index
+  CUTLASS_HOST_DEVICE
+  GemmCoord mnk() const {
+    return GemmCoord(m(), n(), k());
+  }
+
+  /// Obtains a Coord<4> in (m, n, k, batch) order
+  CUTLASS_HOST_DEVICE
+  Coord<4> mnkb() const {
+    return make_Coord(m(), n(), k(), batch());
+  }
+
+  //
+  // Coord operators (each forwards to the Base operator and re-wraps the result)
+  //
+
+  /// Element-wise addition
+  CUTLASS_HOST_DEVICE
+  BatchedGemmCoord operator+(Base const& b) const {
+    return BatchedGemmCoord(Base::operator+(b));
+  }
+
+  /// Element-wise subtraction
+  CUTLASS_HOST_DEVICE
+  BatchedGemmCoord operator-(Base const& b) const {
+    return BatchedGemmCoord(Base::operator-(b));
+  }
+
+  /// Element-wise multiplication
+  CUTLASS_HOST_DEVICE
+  BatchedGemmCoord operator*(Base const& b) const {
+    return BatchedGemmCoord(Base::operator*(b));
+  }
+
+  /// Element-wise division
+  CUTLASS_HOST_DEVICE
+  BatchedGemmCoord operator/(Base const& b) const {
+    return BatchedGemmCoord(Base::operator/(b));
+  }
+
+  /// In-place addition; returns *this
+  CUTLASS_HOST_DEVICE
+  BatchedGemmCoord& operator+=(Base const& b) {
+    Base::operator+=(b);
+    return *this;
+  }
+
+  /// In-place subtraction; returns *this
+  CUTLASS_HOST_DEVICE
+  BatchedGemmCoord& operator-=(Base const& b) {
+    Base::operator-=(b);
+    return *this;
+  }
+
+  /// In-place multiplication; returns *this
+  CUTLASS_HOST_DEVICE
+  BatchedGemmCoord& operator*=(Base const& b) {
+    Base::operator*=(b);
+    return *this;
+  }
+
+  /// In-place division; returns *this
+  CUTLASS_HOST_DEVICE
+  BatchedGemmCoord& operator/=(Base const& b) {
+    Base::operator/=(b);
+    return *this;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm_coord.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm_coord.hpp
new file mode 100755
index 000000000..a979241ef
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/gemm_coord.hpp
@@ -0,0 +1,66 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Utilities to convert a CuTe tuple to a GemmCoord or BatchedGemmCoord
+*/
+
+#pragma once
+
+#include "cute/layout.hpp"
+#include "cutlass/gemm_coord.h"
+
+namespace cutlass {
+namespace gemm {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <class Tuple>
+CUTLASS_HOST_DEVICE
+auto
+to_gemm_coord(Tuple tuple) {
+  static_assert(cute::rank(tuple) <= 4, "Can only convert tuples of rank <= 4.");
+  // Rank-4 tuples yield a BatchedGemmCoord; lower ranks are extended to rank 3 with Int<0>.
+  if constexpr (cute::rank(tuple) > 3) {
+    return BatchedGemmCoord(cute::size<0>(tuple), cute::size<1>(tuple), cute::size<2>(tuple), cute::size<3>(tuple));
+  }
+  else {
+    auto padded = cute::append<3>(tuple, cute::Int<0>{});
+    return GemmCoord(cute::size<0>(padded), cute::size<1>(padded), cute::size<2>(padded));
+  }
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/half.h b/lightllm-kernel/cutlass/include/cutlass/half.h
new file mode 100755
index 000000000..a0f398284
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/half.h
@@ -0,0 +1,930 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*!
+    \file
+    \brief Defines a class for using IEEE half-precision floating-point types in host or
+      device code.
+*/
+
+#pragma once
+
+#ifndef CUTLASS_ENABLE_F16C
+#define CUTLASS_ENABLE_F16C 0
+#endif
+
+#if defined(__CUDACC_RTC__)
+
+#include "cutlass/floating_point_nvrtc.h"
+
+// F16C extensions are not meaningful when compiling for NVRTC which only accommodates device code.
+#undef CUTLASS_ENABLE_F16C
+#define CUTLASS_ENABLE_F16C 0
+
+#else
+//
+// Standard Library headers belong here to avoid conflicts with NVRTC.
+//
+#include <cmath>
+#include <limits>
+#include <cstdint>
+#include <cstring>
+#endif
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <cuda_fp16.h>
+
+#include "cutlass/cutlass.h"
+#include "cutlass/float8.h"
+#include "cutlass/platform/platform.h"
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Optionally target F16C extensions to accelerate half-precision conversion.
+#if !defined(__CUDA_ARCH__) && (CUTLASS_ENABLE_F16C)
+#if defined(_MSC_VER)
+
+#include <immintrin.h>
+
+#if defined(__i386__) || defined(__x86_64__)
+#include <intrin.h>
+#endif
+
+#define F16C_ROUND_NEAREST 0
+
+#if !defined(__CUDA_ARCH__)
+extern __inline float _cvtsh_ss (unsigned short __S) {  // half bit pattern -> float via F16C
+  __m128i packed = _mm_setzero_si128();  // zero every lane so no indeterminate data is converted
+  std::memcpy(&packed, &__S, sizeof(__S));
+  // Only the lowest 16 bits hold the input; the remaining lanes are zero.
+  __m128 result = _mm_cvtph_ps(packed);
+  // Copy the lowest float lane out of the vector result.
+  float flt;
+  std::memcpy(&flt, &result, sizeof(flt));
+
+  return flt;
+}
+
+__inline unsigned short _cvtss_sh (float __F, const int) {  // float -> half bit pattern via F16C
+  __m128 packed = _mm_setzero_ps();  // zero every lane so no indeterminate data is converted
+  std::memcpy(&packed, &__F, sizeof(__F));
+  // The unnamed rounding argument is ignored; F16C_ROUND_NEAREST is always used.
+  __m128i result = _mm_cvtps_ph(packed, F16C_ROUND_NEAREST);
+  // Copy the lowest 16-bit lane out of the vector result.
+  unsigned short u;
+  std::memcpy(&u, &result, sizeof(u));
+
+  return u;
+}
+#endif
+
+#else
+
+// Linux
+#include <x86intrin.h>
+
+#if defined(__i386__) || defined(__x86_64__)
+#include <cpuid.h>
+#endif
+
+#define F16C_ROUND_NEAREST (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)
+
+#endif // _MSC_VER
+
+class CpuId {
+  // Queries CPUID once in the private constructor and caches whether the F16C flag is set.
+  bool f16c_enabled;
+  // Private: instances are only created through instance().
+  CpuId() {
+  #if defined(__i386__) || defined(__x86_64__)
+    #if defined(_MSC_VER)
+      int exx[4];
+      // Query CPUID leaf 1; exx[2] is then tested against bit 29 (mask 0x20000000).
+      __cpuid (exx, 1); 
+      f16c_enabled = exx[2] & 0x20000000;
+      // NOTE(review): MSVC normally defines _M_IX86/_M_X64 rather than __i386__/__x86_64__, so this branch may be unreachable - confirm.
+    #else 
+    // GCC / Clang
+       int eax, ebx, ecx, edx;
+      // Query CPUID leaf 1 and test bit 29 (mask 0x20000000) of ECX.
+      __cpuid (1 , eax, ebx, ecx, edx); 
+      f16c_enabled = ecx & 0x20000000;
+    #endif
+  #else 
+  // Arm / PowerPC etc.
+    f16c_enabled = false;
+  #endif
+  }
+
+public:
+  // Returns the flag cached by the constructor.
+  bool is_f16c_supported() const {
+    return f16c_enabled;
+  } 
+  // Returns the shared instance, a function-local static constructed on first call.
+  static const CpuId& instance() {
+      static CpuId cpu;
+      return cpu;
+  }
+};
+#endif // !defined(__CUDA_ARCH__) && CUTLASS_ENABLE_F16C
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// IEEE half-precision floating-point type, stored as a raw 16-bit pattern
+struct alignas(2) half_t {
+
+  //
+  // Data members
+  //
+
+  /// Raw 16-bit encoding (1 sign bit, 5 exponent bits, 10 mantissa bits)
+  uint16_t storage;
+
+  //
+  // Static conversion operators
+  //
+
+  /// Constructs from a raw 16-bit pattern without any numeric conversion
+  CUTLASS_HOST_DEVICE
+  static half_t bitcast(uint16_t x) {
+    half_t h;
+    h.storage = x;
+    return h;
+  }
+
+  /// FP32 -> FP16 conversion - rounds to nearest even
+  #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 530)
+    // Avoid inlining in device code if no hardware support
+    __device__ __noinline__
+  #else
+    CUTLASS_HOST_DEVICE
+  #endif  
+  static half_t convert(float const& flt) {
+  #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
+    return half_t(__float2half_rn(flt));
+  #else
+
+    #if !defined(__CUDA_ARCH__) && CUTLASS_ENABLE_F16C
+      if( CpuId::instance().is_f16c_supported() ) {
+        unsigned short u = _cvtss_sh(flt, F16C_ROUND_NEAREST);
+        return bitcast(u);
+      }
+    #endif
+
+    // software implementation rounds toward nearest even
+    unsigned s;
+
+    #if defined(__CUDA_ARCH__)
+    s = reinterpret_cast<unsigned const &>(flt);
+    #else
+    std::memcpy(&s, &flt, sizeof(s));
+    #endif
+
+    uint16_t sign = uint16_t((s >> 16) & 0x8000);
+    int16_t exp = uint16_t(((s >> 23) & 0xff) - 127);
+    int mantissa = s & 0x7fffff;
+    uint16_t u = 0;
+
+    if ((s & 0x7fffffff) == 0) {
+      // sign-preserving zero
+      return bitcast(sign);
+    }
+
+    if (exp > 15) {
+      if (exp == 128 && mantissa) {
+        // not a number
+        u = 0x7fff;
+      } else {
+        // overflow to infinity
+        u = sign | 0x7c00;
+      }
+      return bitcast(u);
+    }
+
+    int sticky_bit = 0;
+
+    if (exp >= -14) {
+      // normal fp32 to normal fp16
+      exp = uint16_t(exp + uint16_t(15));
+      u = uint16_t(((exp & 0x1f) << 10));
+      u = uint16_t(u | (mantissa >> 13));
+    } else {
+      // normal single-precision to subnormal half_t-precision representation
+      int rshift = (-14 - exp);
+      if (rshift < 32) {
+        mantissa |= (1 << 23);
+        // Unsigned mask: with rshift == 31 the signed form (1 << 31) - 1 overflows int.
+        sticky_bit = ((mantissa & ((1u << rshift) - 1u)) != 0);
+
+        mantissa = (mantissa >> rshift);
+        u = (uint16_t(mantissa >> 13) & 0x3ff);
+      } else {
+        mantissa = 0;
+        u = 0;
+      }
+    }
+
+    // round to nearest even
+    int round_bit = ((mantissa >> 12) & 1);
+    sticky_bit |= ((mantissa & ((1 << 12) - 1)) != 0);
+
+    if ((round_bit && sticky_bit) || (round_bit && (u & 1))) {
+      u = uint16_t(u + 1);
+    }
+
+    u |= sign;
+
+    return bitcast(u);
+  #endif
+  }
+
+  /// int -> FP16 conversion - rounds to nearest even
+  CUTLASS_HOST_DEVICE
+  static half_t convert(int const& n) {
+  #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
+    return half_t(__int2half_rn(n));
+  #else
+    return convert(float(n));
+  #endif
+  }
+
+  /// unsigned -> FP16 conversion - rounds to nearest even
+  CUTLASS_HOST_DEVICE
+  static half_t convert(unsigned const& n) {
+  #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
+    return half_t(__uint2half_rn(n));
+  #else
+    return convert(float(n));
+  #endif
+  }
+
+  /// FP16 -> FP32 conversion of the value held in x.storage
+  #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 530)
+    // Avoid inlining in device code if no hardware support
+    __device__ __noinline__
+  #else
+    CUTLASS_HOST_DEVICE
+  #endif
+  static float convert(half_t const& x) {
+  #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
+    return __half2float(x.to_half());
+  #else
+
+    #if !defined(__CUDA_ARCH__) && CUTLASS_ENABLE_F16C
+      if( CpuId::instance().is_f16c_supported() ) {
+        unsigned short u = x.storage;
+        return _cvtsh_ss(u);
+      }
+    #endif
+
+    uint16_t const &h = x.storage;
+    uint32_t sign = ((h >> 15) & 1);
+    uint32_t exp = ((h >> 10) & 0x1f);
+    uint32_t mantissa = (h & 0x3ff);
+    unsigned f = 0;
+
+    if (exp > 0 && exp < 31) {
+      // normal
+      exp += 112;
+      f = (sign << 31) | (exp << 23) | (mantissa << 13);
+    } else if (exp == 0) {
+      if (mantissa) {
+        // subnormal: shift the mantissa left until bit 10 is set, lowering the exponent each step
+        exp += 113;
+        while ((mantissa & (1 << 10)) == 0) {
+          mantissa <<= 1;
+          exp--;
+        }
+        mantissa &= 0x3ff;
+        f = (sign << 31) | (exp << 23) | (mantissa << 13);
+      } else {
+        // sign-preserving zero
+        f = (sign << 31);
+      }
+    } else if (exp == 31) {
+      if (mantissa) {
+        f = 0x7fffffff;  // not a number
+      } else {
+        f = (0xff << 23) | (sign << 31);  //  inf
+      }
+    }
+    #if defined(__CUDA_ARCH__)
+    return reinterpret_cast<float const&>(f);
+    #else
+    float flt;
+    std::memcpy(&flt, &f, sizeof(flt));
+    return flt;
+    #endif
+  #endif
+  }
+
+  //
+  // Methods
+  //
+
+  /// Default constructor
+  half_t() = default;
+
+  /// Reinterpret cast from CUDA's half type
+  CUTLASS_HOST_DEVICE
+  explicit half_t(half const & x) {
+    #if defined(__CUDA_ARCH__)
+    storage = reinterpret_cast<uint16_t const &>(x);
+    #else
+    __half_raw raw(x);
+    std::memcpy(&storage, &raw.x, sizeof(storage));
+    #endif
+  }
+
+  /// Floating point conversion from float
+  CUTLASS_HOST_DEVICE
+  explicit half_t(float x) {
+    storage = convert(x).storage;
+  }
+
+  /// Floating point conversion from double (narrowed to float first)
+  CUTLASS_HOST_DEVICE
+  explicit half_t(double x): half_t(float(x)) {
+
+  }
+
+  /// float_e4m3_t conversion (goes through float)
+  CUTLASS_HOST_DEVICE
+  explicit half_t(float_e4m3_t x): half_t(float(x)) {
+
+  }
+
+  /// float_e5m2_t conversion (goes through float)
+  CUTLASS_HOST_DEVICE
+  explicit half_t(float_e5m2_t x): half_t(float(x)) {
+
+  }
+
+  /// Integer conversion - round to nearest even
+  CUTLASS_HOST_DEVICE
+  explicit half_t(int x) {
+    storage = convert(x).storage;
+  }
+
+  /// Unsigned integer conversion - round to nearest even (same path as convert(unsigned))
+  CUTLASS_HOST_DEVICE
+  explicit half_t(unsigned x) {
+    storage = convert(x).storage;
+  }
+
+  /// Assignment from CUDA's half type (bit-for-bit copy)
+  CUTLASS_HOST_DEVICE
+  half_t & operator=(half const &x) {
+    #if defined(__CUDA_ARCH__)
+    storage = reinterpret_cast<uint16_t const &>(x);
+    #else
+    __half_raw raw(x);
+    std::memcpy(&storage, &raw.x, sizeof(storage));
+    #endif
+    return *this;
+  }
+
+  /// Converts to float
+  CUTLASS_HOST_DEVICE
+  operator float() const {
+    return convert(*this);
+  }
+
+  /// Converts to double (via float)
+  CUTLASS_HOST_DEVICE
+  explicit operator double() const {
+    return double(convert(*this));
+  }
+
+  /// Converts to int (via float, then a float -> int cast)
+  CUTLASS_HOST_DEVICE
+  explicit operator int() const {
+    return int(convert(*this));
+  }
+
+  /// Casts to bool: true when the float value compares unequal to zero
+  CUTLASS_HOST_DEVICE
+  explicit operator bool() const {
+    return (convert(*this) != 0.0f);
+  }
+
+  /// Bitcasts to CUDA's half type
+  CUTLASS_HOST_DEVICE
+  half to_half() const {
+    #if defined(__CUDA_ARCH__)
+    return reinterpret_cast<half const &>(storage);
+    #else
+    __half_raw raw;
+    std::memcpy(&raw.x, &storage, sizeof(raw.x));
+    return half(raw);
+    #endif
+  }
+
+  /// Accesses raw internal state (mutable reference)
+  CUTLASS_HOST_DEVICE
+  uint16_t& raw() {
+    return storage;
+  }
+
+  /// Accesses raw internal state (by value)
+  CUTLASS_HOST_DEVICE
+  uint16_t raw() const {
+    return storage;
+  }
+
+  /// Returns the sign bit
+  CUTLASS_HOST_DEVICE
+  bool signbit() const {
+    return ((storage & 0x8000) != 0);
+  }
+
+  /// Returns the biased exponent
+  CUTLASS_HOST_DEVICE
+  int exponent_biased() const {
+    return int((storage >> 10) & 0x1f);
+  }
+
+  /// Returns the unbiased exponent (biased exponent minus 15)
+  CUTLASS_HOST_DEVICE
+  int exponent() const {
+    return exponent_biased() - 15;
+  }
+
+  /// Returns the mantissa (low 10 bits of the encoding)
+  CUTLASS_HOST_DEVICE
+  int mantissa() const {
+    return int(storage & 0x3ff);
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+CUTLASS_HOST_DEVICE
+bool signbit(cutlass::half_t const& h) {
+  return h.signbit();  // true when bit 0x8000 of the raw encoding is set
+}
+
+CUTLASS_HOST_DEVICE
+cutlass::half_t abs(cutlass::half_t const& h) {
+  return cutlass::half_t::bitcast(uint16_t(h.raw() & 0x7fff));  // clear the sign bit
+}
+
+CUTLASS_HOST_DEVICE
+bool isnan(cutlass::half_t const& h) {
+  return h.mantissa() && (h.exponent_biased() == 0x1f);  // all-ones exponent, non-zero mantissa
+}
+
+CUTLASS_HOST_DEVICE
+bool isfinite(cutlass::half_t const& h) {
+  return !(h.exponent_biased() == 0x1f);  // any exponent other than all-ones
+}
+
+CUTLASS_HOST_DEVICE
+cutlass::half_t nanh(const char*) {
+  // The string argument is ignored; the NVIDIA canonical NaN pattern is always returned.
+  return cutlass::half_t::bitcast(uint16_t(0x7fff));
+}
+
+CUTLASS_HOST_DEVICE
+bool isinf(cutlass::half_t const& h) {
+  return !h.mantissa() && (h.exponent_biased() == 0x1f);  // all-ones exponent, zero mantissa
+}
+
+CUTLASS_HOST_DEVICE
+bool isnormal(cutlass::half_t const& h) {
+  return (h.exponent_biased() != 0) && (h.exponent_biased() != 0x1f);  // neither zero nor all-ones
+}
+
+CUTLASS_HOST_DEVICE
+int fpclassify(cutlass::half_t const& h) {
+  // Classifies h from the biased exponent and mantissa fields of its encoding
+  // and returns one of the FP_* category macros.
+
+  int const biased_exp = h.exponent_biased();
+  bool const has_fraction = (h.mantissa() != 0);
+
+  // All-ones exponent field: NaN when the mantissa is non-zero, infinity otherwise.
+  if (biased_exp == 0x1f) {
+    return has_fraction ? FP_NAN : FP_INFINITE;
+  }
+
+  // Zero exponent field: subnormal when the mantissa is non-zero, zero otherwise.
+  if (biased_exp == 0) {
+    return has_fraction ? FP_SUBNORMAL : FP_ZERO;
+  }
+
+  // Every remaining exponent value is classified as normal.
+
+  return FP_NORMAL;
+}
+
+CUTLASS_HOST_DEVICE
+cutlass::half_t sqrt(cutlass::half_t const& h) {
+#if !defined(__CUDACC_RTC__)
+  return cutlass::half_t(std::sqrt(float(h)));  // computed in float, then converted back to half
+#else
+  return cutlass::half_t(sqrtf(float(h)));  // NVRTC build: use sqrtf instead of std::sqrt
+#endif
+}
+
+CUTLASS_HOST_DEVICE
+half_t copysign(half_t const& a, half_t const& b) {
+  // Builds a value from the magnitude bits of `a` and the sign bit of `b`,
+  // working purely on the raw 16-bit encodings.
+  uint16_t const sign_of_b = (b.raw() & 0x8000);
+  uint16_t const magnitude_of_a = (a.raw() & 0x7fff);
+
+  return half_t::bitcast(uint16_t(magnitude_of_a | sign_of_b));
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Standard Library operations and definitions
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if !defined(__CUDACC_RTC__)
+namespace std {
+
+/// std::numeric_limits specialization for cutlass::half_t; values are given as raw bit patterns
+template <>
+struct numeric_limits<cutlass::half_t> {
+  static bool const is_specialized = true;
+  static bool const is_signed = true;
+  static bool const is_integer = false;
+  static bool const is_exact = false;
+  static bool const has_infinity = true;
+  static bool const has_quiet_NaN = true;
+  static bool const has_signaling_NaN = false;
+  static std::float_denorm_style const has_denorm = std::denorm_present;
+  static bool const has_denorm_loss = true;
+  static std::float_round_style const round_style = std::round_to_nearest;
+  static bool const is_iec559 = true;
+  static bool const is_bounded = true;
+  static bool const is_modulo = false;
+  static int const digits = 10;  // NOTE(review): counts the 10 stored mantissa bits only - confirm intended
+
+  /// Least positive value; NOTE(review): same pattern as denorm_min(), not the smallest normal - confirm
+  CUTLASS_HOST_DEVICE
+  static cutlass::half_t min() { return cutlass::half_t::bitcast(0x0001); }
+
+  /// Most negative finite value
+  CUTLASS_HOST_DEVICE
+  static cutlass::half_t lowest() { return cutlass::half_t::bitcast(0xfbff); }
+
+  /// Maximum finite value
+  CUTLASS_HOST_DEVICE
+  static cutlass::half_t max() { return cutlass::half_t::bitcast(0x7bff); }
+
+  /// Returns the epsilon value, bit pattern 0x1800 (exponent field 6, zero mantissa)
+  CUTLASS_HOST_DEVICE
+  static cutlass::half_t epsilon() { return cutlass::half_t::bitcast(0x1800); }
+
+  /// Returns maximum rounding error (0.5 converted to half)
+  CUTLASS_HOST_DEVICE
+  static cutlass::half_t round_error() { return cutlass::half_t(0.5f); }
+
+  /// Returns positive infinity value
+  CUTLASS_HOST_DEVICE
+  static cutlass::half_t infinity() { return cutlass::half_t::bitcast(0x7c00); }
+
+  /// Returns quiet NaN value
+  CUTLASS_HOST_DEVICE
+  static cutlass::half_t quiet_NaN() { return cutlass::half_t::bitcast(0x7fff); }
+
+  /// Returns the same bit pattern as quiet_NaN() (has_signaling_NaN is false)
+  CUTLASS_HOST_DEVICE
+  static cutlass::half_t signaling_NaN() { return cutlass::half_t::bitcast(0x7fff); }
+
+  /// Returns smallest positive subnormal value
+  CUTLASS_HOST_DEVICE
+  static cutlass::half_t denorm_min() { return cutlass::half_t::bitcast(0x0001); }
+};
+}  // namespace std
+#endif
+
+namespace cutlass {
+namespace platform {
+
+/// Forward Declaration
+template <class T>
+struct numeric_limits;
+
+/// cutlass::platform::numeric_limits specialization for cutlass::half_t; values are raw bit patterns
+template <>
+struct numeric_limits<cutlass::half_t> {
+  static bool const is_specialized = true;
+  static bool const is_signed = true;
+  static bool const is_integer = false;
+  static bool const is_exact = false;
+  static bool const has_infinity = true;
+  static bool const has_quiet_NaN = true;
+  static bool const has_signaling_NaN = false;
+#if !defined(__CUDACC_RTC__)
+  static std::float_denorm_style const has_denorm = std::denorm_present;
+#endif
+  static bool const has_denorm_loss = true;
+#if !defined(__CUDACC_RTC__)
+  static std::float_round_style const round_style = std::round_to_nearest;
+#endif
+  static bool const is_iec559 = true;
+  static bool const is_bounded = true;
+  static bool const is_modulo = false;
+  static int const digits = 10;  // NOTE(review): counts the 10 stored mantissa bits only - confirm intended
+
+  /// Least positive value; NOTE(review): same pattern as denorm_min(), not the smallest normal - confirm
+  CUTLASS_HOST_DEVICE
+  static cutlass::half_t min() { return cutlass::half_t::bitcast(0x0001); }
+
+  /// Most negative finite value
+  CUTLASS_HOST_DEVICE
+  static cutlass::half_t lowest() { return cutlass::half_t::bitcast(0xfbff); }
+
+  /// Maximum finite value
+  CUTLASS_HOST_DEVICE
+  static cutlass::half_t max() { return cutlass::half_t::bitcast(0x7bff); }
+
+  /// Returns the epsilon value, bit pattern 0x1800 (exponent field 6, zero mantissa)
+  CUTLASS_HOST_DEVICE
+  static cutlass::half_t epsilon() { return cutlass::half_t::bitcast(0x1800); }
+
+  /// Returns maximum rounding error (0.5 converted to half)
+  CUTLASS_HOST_DEVICE
+  static cutlass::half_t round_error() { return cutlass::half_t(0.5f); }
+
+  /// Returns positive infinity value
+  CUTLASS_HOST_DEVICE
+  static cutlass::half_t infinity() { return cutlass::half_t::bitcast(0x7c00); }
+
+  /// Returns quiet NaN value
+  CUTLASS_HOST_DEVICE
+  static cutlass::half_t quiet_NaN() { return cutlass::half_t::bitcast(0x7fff); }
+
+  /// Returns the same bit pattern as quiet_NaN() (has_signaling_NaN is false)
+  CUTLASS_HOST_DEVICE
+  static cutlass::half_t signaling_NaN() { return cutlass::half_t::bitcast(0x7fff); }
+
+  /// Returns smallest positive subnormal value
+  CUTLASS_HOST_DEVICE
+  static cutlass::half_t denorm_min() { return cutlass::half_t::bitcast(0x0001); }
+};
+}  // namespace platform 
+}  // namespace cutlass
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Arithmetic operators
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Equality of two half_t values (__heq when __CUDA_ARCH__ >= 530, float comparison otherwise).
+CUTLASS_HOST_DEVICE bool operator==(half_t const& lhs, half_t const& rhs) {
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 530)
+  return static_cast<float>(lhs) == static_cast<float>(rhs);
+#else
+  return __heq(lhs.to_half(), rhs.to_half());
+#endif
+}
+
+/// Inequality of two half_t values (__hne when __CUDA_ARCH__ >= 530, float comparison otherwise).
+CUTLASS_HOST_DEVICE bool operator!=(half_t const& lhs, half_t const& rhs) {
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 530)
+  return static_cast<float>(lhs) != static_cast<float>(rhs);
+#else
+  return __hne(lhs.to_half(), rhs.to_half());
+#endif
+}
+
+/// Less-than of two half_t values (__hlt when __CUDA_ARCH__ >= 530, float comparison otherwise).
+CUTLASS_HOST_DEVICE bool operator<(half_t const& lhs, half_t const& rhs) {
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 530)
+  return static_cast<float>(lhs) < static_cast<float>(rhs);
+#else
+  return __hlt(lhs.to_half(), rhs.to_half());
+#endif
+}
+
+/// Less-or-equal of two half_t values (__hle when __CUDA_ARCH__ >= 530, float comparison otherwise).
+CUTLASS_HOST_DEVICE bool operator<=(half_t const& lhs, half_t const& rhs) {
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 530)
+  return static_cast<float>(lhs) <= static_cast<float>(rhs);
+#else
+  return __hle(lhs.to_half(), rhs.to_half());
+#endif
+}
+
+/// Greater-than of two half_t values (__hgt when __CUDA_ARCH__ >= 530, float comparison otherwise).
+CUTLASS_HOST_DEVICE bool operator>(half_t const& lhs, half_t const& rhs) {
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 530)
+  return static_cast<float>(lhs) > static_cast<float>(rhs);
+#else
+  return __hgt(lhs.to_half(), rhs.to_half());
+#endif
+}
+
+/// Greater-or-equal of two half_t values (__hge when __CUDA_ARCH__ >= 530, float comparison otherwise).
+CUTLASS_HOST_DEVICE bool operator>=(half_t const& lhs, half_t const& rhs) {
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 530)
+  return static_cast<float>(lhs) >= static_cast<float>(rhs);
+#else
+  return __hge(lhs.to_half(), rhs.to_half());
+#endif
+}
+
+/// Sum of two half_t values (__hadd when __CUDA_ARCH__ >= 530, float arithmetic otherwise).
+CUTLASS_HOST_DEVICE half_t operator+(half_t const& lhs, half_t const& rhs) {
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 530)
+  return half_t(static_cast<float>(lhs) + static_cast<float>(rhs));
+#else
+  return half_t(__hadd(lhs.to_half(), rhs.to_half()));
+#endif
+}
+
+/// Negation of a half_t value (__hneg when __CUDA_ARCH__ >= 530, float negation otherwise).
+CUTLASS_HOST_DEVICE half_t operator-(half_t const& lhs) {
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 530)
+  return half_t(-static_cast<float>(lhs));
+#else
+  return half_t(__hneg(lhs.to_half()));
+#endif
+}
+
+/// Difference of two half_t values (__hsub when __CUDA_ARCH__ >= 530, float arithmetic otherwise).
+CUTLASS_HOST_DEVICE half_t operator-(half_t const& lhs, half_t const& rhs) {
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 530)
+  return half_t(static_cast<float>(lhs) - static_cast<float>(rhs));
+#else
+  return half_t(__hsub(lhs.to_half(), rhs.to_half()));
+#endif
+}
+
+/// Product of two half_t values (__hmul when __CUDA_ARCH__ >= 530, float arithmetic otherwise).
+CUTLASS_HOST_DEVICE half_t operator*(half_t const& lhs, half_t const& rhs) {
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 530)
+  return half_t(static_cast<float>(lhs) * static_cast<float>(rhs));
+#else
+  return half_t(__hmul(lhs.to_half(), rhs.to_half()));
+#endif
+}
+
+/// Quotient of two half_t values (__hdiv when __CUDA_ARCH__ >= 530, float arithmetic otherwise).
+CUTLASS_HOST_DEVICE half_t operator/(half_t const& lhs, half_t const& rhs) {
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 530)
+  return half_t(static_cast<float>(lhs) / static_cast<float>(rhs));
+#else
+  return half_t(__hdiv(lhs.to_half(), rhs.to_half()));
+#endif
+}
+
+/// Adds rhs into lhs in place and returns lhs (__hadd when __CUDA_ARCH__ >= 530, float otherwise).
+CUTLASS_HOST_DEVICE half_t& operator+=(half_t & lhs, half_t const& rhs) {
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 530)
+  lhs = half_t(static_cast<float>(lhs) + static_cast<float>(rhs));
+#else
+  lhs = half_t(__hadd(lhs.to_half(), rhs.to_half()));
+#endif
+  return lhs;
+}
+
+/// Subtracts rhs from lhs in place and returns lhs (__hsub when __CUDA_ARCH__ >= 530, float otherwise).
+CUTLASS_HOST_DEVICE half_t& operator-=(half_t & lhs, half_t const& rhs) {
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 530)
+  lhs = half_t(static_cast<float>(lhs) - static_cast<float>(rhs));
+#else
+  lhs = half_t(__hsub(lhs.to_half(), rhs.to_half()));
+#endif
+  return lhs;
+}
+
+/// Multiplies lhs by rhs in place and returns lhs (__hmul when __CUDA_ARCH__ >= 530, float otherwise).
+CUTLASS_HOST_DEVICE half_t& operator*=(half_t & lhs, half_t const& rhs) {
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 530)
+  lhs = half_t(static_cast<float>(lhs) * static_cast<float>(rhs));
+#else
+  lhs = half_t(__hmul(lhs.to_half(), rhs.to_half()));
+#endif
+  return lhs;
+}
+
+/// Divides lhs by rhs in place and returns lhs (__hdiv when __CUDA_ARCH__ >= 530, float otherwise).
+CUTLASS_HOST_DEVICE half_t& operator/=(half_t & lhs, half_t const& rhs) {
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 530)
+  lhs = half_t(static_cast<float>(lhs) / static_cast<float>(rhs));
+#else
+  lhs = half_t(__hdiv(lhs.to_half(), rhs.to_half()));
+#endif
+  return lhs;
+}
+
+/// Pre-increment: adds one to lhs and returns a reference to the updated lhs.
+CUTLASS_HOST_DEVICE half_t& operator++(half_t & lhs) {
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 530)
+  float widened = static_cast<float>(lhs);
+  widened += 1.0f;
+  lhs = half_t(widened);
+#else
+  lhs = half_t(__hadd(lhs.to_half(), half_t(1.0f).to_half()));
+#endif
+  return lhs;
+}
+
+/// Pre-decrement: subtracts one from lhs and returns a reference to the updated lhs.
+CUTLASS_HOST_DEVICE half_t& operator--(half_t & lhs) {
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 530)
+  float widened = static_cast<float>(lhs);
+  widened -= 1.0f;
+  lhs = half_t(widened);
+#else
+  lhs = half_t(__hsub(lhs.to_half(), half_t(1.0f).to_half()));
+#endif
+  return lhs;
+}
+
+/// Post-increment: adds one to lhs and returns the value lhs held beforehand.
+CUTLASS_HOST_DEVICE half_t operator++(half_t & lhs, int) {
+  half_t previous(lhs);
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 530)
+  float widened = static_cast<float>(lhs);
+  widened += 1.0f;
+  lhs = half_t(widened);
+#else
+  lhs = half_t(__hadd(lhs.to_half(), half_t(1.0f).to_half()));
+#endif
+  return previous;
+}
+
+/// Post-decrement: subtracts one from lhs and returns the value lhs held beforehand.
+CUTLASS_HOST_DEVICE half_t operator--(half_t & lhs, int) {
+  half_t previous(lhs);
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 530)
+  float widened = static_cast<float>(lhs);
+  widened -= 1.0f;
+  lhs = half_t(widened);
+#else
+  lhs = half_t(__hsub(lhs.to_half(), half_t(1.0f).to_half()));
+#endif
+  return previous;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+//
+// User-defined literals
+//
+
+/// Literal suffix `_hf` for floating-point literals: converts via float to cutlass::half_t.
+CUTLASS_HOST_DEVICE cutlass::half_t operator""_hf(long double x) {
+  return cutlass::half_t(float(x));
+}
+
+/// Literal suffix `_hf` for integer literals: converts via int to cutlass::half_t.
+CUTLASS_HOST_DEVICE cutlass::half_t operator""_hf(unsigned long long int x) {
+  return cutlass::half_t(int(x));
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/integer_subbyte.h b/lightllm-kernel/cutlass/include/cutlass/integer_subbyte.h
new file mode 100755
index 000000000..b84d322db
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/integer_subbyte.h
@@ -0,0 +1,280 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*!
+    \file
+    \brief Defines a class for using integer types smaller than one byte in host or
+      device code.
+*/
+
+#pragma once
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/cstdint>
+#else
+#include <cstdint>
+#endif
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_size.h"
+#include "cutlass/platform/platform.h"
+
+namespace cutlass {
+
+/// Integer of Bits bits (Bits <= 8) kept in one uint8_t; sign-extended on read when Signed.
+template <int Bits, bool Signed = true> struct integer_subbyte {
+  using Storage = uint8_t;
+
+  static_assert(Bits <= 8*sizeof(Storage), "Require a subbyte of bits in integer_subbyte");
+
+  // "External type"; the integer type for which
+  // integer_subbyte has a conversion-to operator
+  using xint_t = typename cutlass::platform::conditional<Signed, int, unsigned>::type;
+
+  // Bitmask for truncation from larger integers
+  static constexpr Storage bits_mask_ = Storage(Storage(-1) >> (8 - Bits));
+  // Bitmask for the sign bit
+  static constexpr Storage sign_mask_ = Storage((Signed ? 1 : 0) << (Bits - 1));
+
+  // Where the bits are stored
+  Storage storage;
+
+  // Default construction does NOT zero-initialize
+  integer_subbyte() = default;
+
+  // Implicit conversion is DEPRECATED.
+  // Please use one of the two explicit constructors below.
+  template<class T,
+    class Enable = cutlass::platform::enable_if_t<cutlass::platform::is_convertible_v<T, int>>
+  >
+  [[deprecated("Implicit conversion is deprecated; please use explicit construction instead")]]
+  CUTLASS_HOST_DEVICE
+  integer_subbyte(T value)
+      : integer_subbyte(static_cast<xint_t>(value)) {}
+
+  // CUTLASS code commonly converts both signed and unsigned integers
+  // into integer_subbyte, so the class provides both explicit
+  // conversions.
+
+  // Precondition (checked by assert): value lies in [-2^(Bits-1), 2^(Bits-1) - 1]
+  // when Signed, or in [0, 2^Bits - 1] otherwise. Only the low Bits bits are stored.
+  CUTLASS_HOST_DEVICE explicit
+  integer_subbyte(int value)
+      : storage(reinterpret_cast<Storage const&>(value) & bits_mask_)
+  {
+    if constexpr (Signed) {
+      [[maybe_unused]] constexpr int lower_bound = -(1 << (Bits - 1));
+      [[maybe_unused]] constexpr int upper_bound = (1 << (Bits - 1)) - 1;
+      assert(value >= lower_bound);
+      assert(value <= upper_bound);  // upper_bound is inclusive: the maximum is representable
+    }
+    else {
+      [[maybe_unused]] constexpr unsigned upper_bound = 1u << Bits;
+      assert(value >= 0);
+      assert(value < static_cast<int>(upper_bound));
+    }
+  }
+
+  // Precondition (checked by assert): value lies in [0, 2^(Bits-1) - 1] when Signed,
+  // or in [0, 2^Bits - 1] otherwise. Only the low Bits bits are stored.
+  CUTLASS_HOST_DEVICE explicit
+  integer_subbyte(unsigned value)
+      : storage(reinterpret_cast<Storage const&>(value) & bits_mask_)
+  {
+    if constexpr (Signed) {
+      // No lower-bound check: value is unsigned, and comparing it with a negative int
+      // would convert that int to a huge unsigned number and reject every input.
+      [[maybe_unused]] constexpr unsigned upper_bound = (1u << (Bits - 1)) - 1u;
+      assert(value <= upper_bound);
+    }
+    else {
+      [[maybe_unused]] constexpr unsigned upper_bound = 1u << Bits;
+      assert(value < upper_bound);
+    }
+  }
+
+  // Convert to the "external" integer type (int or unsigned)
+  CUTLASS_HOST_DEVICE
+  operator xint_t() const {
+    if (sign_mask_ & storage) {  // Sign extend
+      return xint_t(storage) | ~xint_t(bits_mask_);
+    } else {
+      return xint_t(storage);
+    }
+  }
+
+  /// Equality: compares the stored bits.
+  CUTLASS_HOST_DEVICE bool operator==(integer_subbyte const& rhs) const {
+    return storage == rhs.storage;
+  }
+
+  /// Inequality: compares the stored bits.
+  CUTLASS_HOST_DEVICE bool operator!=(integer_subbyte const& rhs) const {
+    return storage != rhs.storage;
+  }
+
+  /// Less-than, taking the sign bit into account.
+  CUTLASS_HOST_DEVICE bool operator<(integer_subbyte const& rhs) const {
+    if ((sign_mask_ & storage) == (sign_mask_ & rhs.storage)) {
+      // If both *this and rhs have the same sign, compare storage directly.
+      return storage < rhs.storage;
+    }
+    else {
+      // If *this and rhs don't have the same sign,
+      // then return whether *this is negative.
+      return sign_mask_ & storage;
+    }
+  }
+
+  /// Less-or-equal, taking the sign bit into account.
+  CUTLASS_HOST_DEVICE bool operator<=(integer_subbyte const& rhs) const {
+    if ((sign_mask_ & storage) == (sign_mask_ & rhs.storage)) {
+      // If both *this and rhs have the same sign, compare storage directly.
+      return storage <= rhs.storage;
+    }
+    else {
+      // If *this and rhs don't have the same sign,
+      // then return whether *this is negative.
+      return sign_mask_ & storage;
+    }
+  }
+
+  /// Greater-or-equal, defined as the negation of operator<.
+  CUTLASS_HOST_DEVICE bool operator>=(integer_subbyte const& rhs) const {
+    return !(*this < rhs);
+  }
+
+  /// Greater-than, defined as the negation of operator<=.
+  CUTLASS_HOST_DEVICE bool operator>(integer_subbyte const& rhs) const {
+    return !(*this <= rhs);
+  }
+
+  /// conj() overload for this type: returns x unchanged.
+  CUTLASS_HOST_DEVICE friend integer_subbyte conj(integer_subbyte const& x) {
+    return x;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// 1-bit unsigned integer type
+using uint1b_t = integer_subbyte<1, false>;
+
+/// 2-bit signed integer type
+using int2b_t = integer_subbyte<2, true>;
+
+/// 2-bit unsigned integer type
+using uint2b_t = integer_subbyte<2, false>;
+
+/// 4-bit signed integer type
+using int4b_t = integer_subbyte<4, true>;
+
+/// 4-bit unsigned integer type
+using uint4b_t = integer_subbyte<4, false>;
+
+/// 1-bit binary type (an alias of bool, not an integer_subbyte)
+using bin1_t = bool;
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines the size of an element in bits - specialized for integer_subbyte,
+/// whose size is its bit-width template argument.
+template <int NumBits, bool IsSigned>
+struct sizeof_bits<integer_subbyte<NumBits, IsSigned>> { static constexpr int value = NumBits; };
+
+/// Defines the size of an element in bits - specialized for bin1_t, which is reported as 1 bit
+template <>
+struct sizeof_bits<bin1_t> {
+  static constexpr int value = 1;
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace platform {
+
+/// Forward Declaration
+template <class T>
+struct numeric_limits;
+
+// Limits of the signed integer_subbyte<NumBits, true>
+template<int NumBits>
+struct numeric_limits<cutlass::integer_subbyte<NumBits, true>> {
+private:
+  using value_type = cutlass::integer_subbyte<NumBits, true>;
+
+public:
+  /// Most negative value, -(2^(NumBits-1)).
+  CUTLASS_HOST_DEVICE static value_type lowest() noexcept {
+    int const most_negative = -(1 << (NumBits - 1));
+    return value_type{most_negative};
+  }
+
+  /// Largest value, 2^(NumBits-1) - 1.
+  CUTLASS_HOST_DEVICE static value_type max() noexcept {
+    int const most_positive = (1 << (NumBits - 1)) - 1;
+    return value_type{most_positive};
+  }
+
+  /// Smallest value; for this integer type it is
+  /// the same as lowest().
+  CUTLASS_HOST_DEVICE static value_type const min() noexcept { return lowest(); }
+
+  static constexpr bool is_integer = true;
+  static constexpr bool is_signed = true;
+  static constexpr bool has_infinity = false;
+};
+
+// Limits of the unsigned integer_subbyte<NumBits, false>
+template<int NumBits>
+struct numeric_limits<cutlass::integer_subbyte<NumBits, false>> {
+private:
+  using value_type = cutlass::integer_subbyte<NumBits, false>;
+
+public:
+  /// Smallest value,
+  /// which is zero.
+  CUTLASS_HOST_DEVICE static value_type lowest() noexcept { return value_type{0u}; }
+
+  /// Largest value, 2^NumBits - 1 (all NumBits bits set).
+  CUTLASS_HOST_DEVICE static value_type max() noexcept {
+    unsigned const all_ones = (1u << NumBits) - 1u;
+    return value_type{all_ones};
+  }
+
+  /// Smallest value; for this integer type it is
+  /// the same as lowest().
+  CUTLASS_HOST_DEVICE static value_type const min() noexcept { return lowest(); }
+
+  static constexpr bool is_integer = true;
+  static constexpr bool is_signed = false;
+};
+
+} // namespace platform
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/kernel_hardware_info.h b/lightllm-kernel/cutlass/include/cutlass/kernel_hardware_info.h
new file mode 100755
index 000000000..62dcb8b45
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/kernel_hardware_info.h
@@ -0,0 +1,76 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#if !defined(__CUDACC_RTC__)
+#include "cuda_runtime.h"
+
+#include "cutlass/trace.h"
+#endif
+
+namespace cutlass {
+
+struct KernelHardwareInfo {
+  //
+  // Data members (both default to 0; no method of this struct assigns them)
+  //
+  int device_id = 0;
+  int sm_count  = 0;
+
+  //
+  // Methods: query_device_multiprocessor_count() returns the multiprocessor count, or 0 on error.
+  // NOTE(review): cudaGetDevice() overwrites the device_id argument, so the caller's value is unused.
+
+#if !defined(__CUDACC_RTC__)
+  static inline int
+  query_device_multiprocessor_count(int device_id = 0) {
+    cudaError_t result = cudaGetDevice(&device_id);
+    if (result != cudaSuccess) {
+      CUTLASS_TRACE_HOST(
+        "  cudaGetDevice() returned error "
+        << cudaGetErrorString(result));
+      return 0;
+    }
+    int multiprocessor_count;
+    result = cudaDeviceGetAttribute(&multiprocessor_count,
+      cudaDevAttrMultiProcessorCount, device_id);
+    if (result != cudaSuccess) {
+      CUTLASS_TRACE_HOST(
+        "  cudaDeviceGetAttribute() returned error "
+        << cudaGetErrorString(result));
+      return 0;
+    }
+    return multiprocessor_count;
+  }
+#endif
+};
+
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/kernel_hardware_info.hpp b/lightllm-kernel/cutlass/include/cutlass/kernel_hardware_info.hpp
new file mode 100755
index 000000000..876aacc6b
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/kernel_hardware_info.hpp
@@ -0,0 +1,35 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+// Simply import .h version of header so as to avoid breaking any existing CUTLASS builds
+// after .hpp was changed to .h
+#include "cutlass/kernel_hardware_info.h"
diff --git a/lightllm-kernel/cutlass/include/cutlass/kernel_launch.h b/lightllm-kernel/cutlass/include/cutlass/kernel_launch.h
new file mode 100755
index 000000000..ca3380a2a
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/kernel_launch.h
@@ -0,0 +1,141 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Defines structures and helpers to launch CUDA kernels within CUTLASS.
+*/
+
+#pragma once
+
+#include <cuda_runtime_api.h>
+#include "cutlass/cutlass.h"
+#include "cutlass/trace.h"
+
+namespace cutlass {
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Basic launch configuration of a CUDA kernel: grid, threadblock and dynamic SMEM size.
+struct KernelLaunchConfiguration {
+
+  /// Grid dimensions of the launch
+  dim3 grid;
+
+  /// Threadblock dimensions of the launch
+  dim3 block;
+
+  /// Dynamically allocated SMEM in bytes, on top of any static SMEM
+  size_t dynamic_smem;
+
+  //
+  // Methods
+  //
+
+  /// Constructs a KernelLaunchConfiguration; defaults are 1x1x1 grid, 1x1x1 block, 0 bytes
+  CUTLASS_HOST_DEVICE
+  KernelLaunchConfiguration(
+    dim3 grid_dims = dim3(1,1,1),
+    dim3 block_dims = dim3(1,1,1),
+    size_t dynamic_smem_bytes = 0
+  )
+    : grid(grid_dims),
+      block(block_dims),
+      dynamic_smem(dynamic_smem_bytes) { }
+};
+
+
+/// Launches device_kernel<GemmKernel> with the given grid, block, dynamic SMEM size and stream.
+/// With launch_with_pdl set, cudaLaunchKernelEx is used with programmatic stream serialization;
+/// that path returns Status::kInvalid for kMinComputeCapability < 90 or CUDA older than 11.8.
+/// Returns Status::kSuccess, or Status::kErrorInternal when the launch reports a CUDA error.
+template <typename GemmKernel, typename Params>
+Status kernel_launch(
+    dim3 const grid_dims, dim3 const block_dims, size_t const smem_size,
+    cudaStream_t cuda_stream, const Params &kernel_params, bool launch_with_pdl) {
+#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
+  CUTLASS_TRACE_HOST("cutlass::kernel_launch");
+#endif
+
+  if (!launch_with_pdl) {
+#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
+    CUTLASS_TRACE_HOST("cutlass::kernel_launch: No PDL");
+#endif
+    device_kernel<GemmKernel><<<grid_dims, block_dims, smem_size, cuda_stream>>>(kernel_params);
+  }
+  else {
+#if ((__CUDACC_VER_MAJOR__ >= 12) || ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 8)))
+    if constexpr (GemmKernel::ArchTag::kMinComputeCapability < 90) {
+      CUTLASS_TRACE_HOST("  Programmatic dependent launch (PDL) is only supported for SM90.");
+      return Status::kInvalid;
+    }
+
+    cudaLaunchConfig_t config;
+    cudaLaunchAttribute attrs[1];
+
+    config.gridDim = grid_dims;
+    config.blockDim = block_dims;
+    config.dynamicSmemBytes = smem_size;
+    config.stream = cuda_stream;
+    // Single launch attribute: allow programmatic stream serialization.
+    config.attrs = attrs;
+    attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
+    attrs[0].val.programmaticStreamSerializationAllowed = 1;
+    config.numAttrs = 1;
+
+#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
+    CUTLASS_TRACE_HOST("cutlass::kernel_launch: Calling cudaLaunchKernelEx");
+#endif
+    cudaError_t launch_result = cudaLaunchKernelEx(&config, &device_kernel<GemmKernel>, kernel_params);
+    if (cudaSuccess != launch_result) {
+      CUTLASS_TRACE_HOST("cutlass::kernel_launch: cudaLaunchKernelEx failed with error: " << cudaGetErrorString(launch_result));
+      return Status::kErrorInternal;
+    }
+#else
+    CUTLASS_TRACE_HOST("  Programmatic dependent launch (PDL) is only supported starting CUDA 11.8.");
+    return Status::kInvalid;
+#endif
+  }
+
+  cudaError_t result = cudaGetLastError();
+  if (cudaSuccess == result) {
+#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
+    CUTLASS_TRACE_HOST("cutlass::kernel_launch: cudaGetLastError reports success");
+#endif
+    return Status::kSuccess;
+  }
+  else {
+    CUTLASS_TRACE_HOST("  Kernel launch failed. Reason: " << cudaGetErrorString(result));
+    return Status::kErrorInternal;
+  }
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/layout/layout.h b/lightllm-kernel/cutlass/include/cutlass/layout/layout.h
new file mode 100755
index 000000000..1089add39
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/layout/layout.h
@@ -0,0 +1,64 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines layout functions used by TensorRef and derived classes. 
+
+    Layout functions map logical coordinates to linear memory. They often require additional
+    data to describe strides between elements.
+
+    Layout functions must implement all members in the public interface of IdentityTensorLayout<>
+    defined in cutlass/tensor_ref.h.
+*/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/vector.h"
+
+#include "cutlass/layout/tensor_op_multiplicand_sm70.h"
+#include "cutlass/layout/tensor_op_multiplicand_sm75.h"
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace layout {
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace layout
+} // namespace cutlass
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/layout/matrix.h b/lightllm-kernel/cutlass/include/cutlass/layout/matrix.h
new file mode 100755
index 000000000..32aa17a5d
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/layout/matrix.h
@@ -0,0 +1,1349 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines layout functions used by TensorRef and derived classes. 
+
+    Layout functions map logical coordinates to linear memory. They often require additional
+    data to describe strides between elements.
+
+    Layout functions must implement all members in the public interface of IdentityTensorLayout<>
+    defined in cutlass/tensor_ref.h.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/pitch_linear_coord.h"
+
+namespace cutlass {
+namespace layout {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Defines data layouts of various matrix formats usable by TensorRef and other classes.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Row-major matrix layout: offset(row, column) = row * stride + column.
+class RowMajor {
+public:
+  /// Logical rank of the tensor (a matrix has two dimensions)
+  static int const kRank = 2;
+
+  /// Rank of the stride vector (only the leading dimension is stored)
+  static int const kStrideRank = 1;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for linear offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate type, accessed as (row, column)
+  using TensorCoord = MatrixCoord;
+
+  /// Stride vector type
+  using Stride = Coord<kStrideRank, LongIndex>;
+
+private:
+  //
+  // Data members
+  //
+
+  /// Leading dimension: the factor applied to the row index in operator()
+  Stride stride_;
+
+public:
+  //
+  // Methods
+  //
+
+  /// Constructs the layout from a leading dimension (defaults to 0)
+  CUTLASS_HOST_DEVICE
+  RowMajor(LongIndex ldm = 0): stride_(ldm) { }
+
+  /// Constructs the layout from a stride vector
+  CUTLASS_HOST_DEVICE
+  RowMajor(Stride stride): stride_(stride) { }
+
+  /// Returns a tightly packed layout: the stride equals the column count of `extent`
+  CUTLASS_HOST_DEVICE
+  static RowMajor packed(MatrixCoord const &extent) {
+    return RowMajor(extent.column());
+  }
+
+  /// Returns the linear offset of `coord`: row * stride + column, evaluated in 64-bit arithmetic.
+  /// Assumes coordinate has convention (row, column)
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(MatrixCoord const &coord) const {
+    return LongIndex(coord.row()) * LongIndex(stride_[0]) + coord.column();
+  }
+
+  /// Maps a linear offset back to (offset / stride, offset % stride); the stride must be non-zero
+  CUTLASS_HOST_DEVICE
+  MatrixCoord inverse(LongIndex offset) const {
+    return MatrixCoord(Index(offset / stride_[0]), Index(offset % stride_[0]));
+  }
+
+  /// Returns a copy of the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return stride_;
+  }
+
+  /// Returns a mutable reference to the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return stride_;
+  }
+
+  /// Returns the stride component at index `idx`
+  CUTLASS_HOST_DEVICE
+  typename Stride::Index stride(int idx) const {
+    return stride_[idx];
+  }
+
+  /// Returns a mutable reference to the stride component at index `idx`
+  CUTLASS_HOST_DEVICE
+  typename Stride::Index & stride(int idx) {
+    return stride_[idx];
+  }
+
+  /// Returns the number of contiguous elements needed to store `extent`: rows * stride
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(MatrixCoord const &extent) const {
+    return LongIndex(extent.row()) * LongIndex(stride_[0]);
+  }
+};
+
+/// Column-major matrix layout: offset(row, column) = column * stride + row.
+class ColumnMajor {
+public:
+  /// Logical rank of the tensor (a matrix has two dimensions)
+  static int const kRank = 2;
+
+  /// Rank of the stride vector (only the leading dimension is stored)
+  static int const kStrideRank = 1;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for linear offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate type, accessed as (row, column)
+  using TensorCoord = MatrixCoord;
+
+  /// Stride vector type
+  using Stride = Coord<kStrideRank, LongIndex>;
+
+private:
+  //
+  // Data members
+  //
+
+  /// Leading dimension: the factor applied to the column index in operator()
+  Stride stride_;
+
+public:
+  //
+  // Methods
+  //
+
+  /// Constructs the layout from a leading dimension (defaults to 0)
+  CUTLASS_HOST_DEVICE
+  ColumnMajor(LongIndex ldm = 0): stride_(ldm) { }
+  
+  /// Constructs the layout from a stride vector
+  CUTLASS_HOST_DEVICE
+  ColumnMajor(Stride stride): stride_(stride) { }
+
+
+  /// Returns a tightly packed layout: the stride equals the row count of `extent`
+  CUTLASS_HOST_DEVICE
+  static ColumnMajor packed(MatrixCoord const &extent) {
+    return ColumnMajor(extent.row());
+  }
+
+  /// Returns the linear offset of `coord`: column * stride + row, evaluated in 64-bit arithmetic.
+  /// Assumes coordinate has convention (row, column)
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(MatrixCoord const &coord) const {
+    return LongIndex(coord.column()) * LongIndex(stride_[0]) + coord.row();
+  }
+
+  /// Maps a linear offset back to (offset % stride, offset / stride); the stride must be non-zero
+  CUTLASS_HOST_DEVICE
+  MatrixCoord inverse(LongIndex offset) const {
+    return MatrixCoord(Index(offset % stride_[0]), Index(offset / stride_[0]));
+  }
+
+  /// Returns a copy of the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return stride_;
+  }
+
+  /// Returns a mutable reference to the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return stride_;
+  }
+
+  /// Returns the stride component at index `idx`
+  CUTLASS_HOST_DEVICE
+  typename Stride::Index stride(int idx) const {
+    return stride_[idx];
+  }
+
+  /// Returns a mutable reference to the stride component at index `idx`
+  CUTLASS_HOST_DEVICE
+  typename Stride::Index & stride(int idx) {
+    return stride_[idx];
+  }
+
+  /// Returns the number of contiguous elements needed to store `extent`: columns * stride
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(MatrixCoord const &extent) const {
+    return LongIndex(extent.column()) * LongIndex(stride_[0]);
+  }
+};
+
+/// Mapping function for interleaved matrices. Rows are grouped into bands of kInterleave rows;
+/// bands are laid out row-major and, inside a band, each column stores its kInterleave rows contiguously.
+template <int Interleave>
+struct RowMajorInterleaved {
+  
+  /// Logical rank of the tensor (a matrix has two dimensions)
+  static int const kRank = 2;
+
+  /// Rank of the stride vector (only the band-to-band stride is stored)
+  static int const kStrideRank = 1;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for linear offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate type, accessed as (row, column)
+  using TensorCoord = MatrixCoord;
+
+  /// Stride vector type
+  using Stride = Coord<kStrideRank, LongIndex>;
+
+  /// Number of rows interleaved within one band
+  static int const kInterleave = Interleave;
+
+private:
+  //
+  // Data members
+  //
+
+  /// Factor applied to the band index (row / kInterleave) in operator()
+  Stride stride_;
+
+public:
+  //
+  // Methods
+  //
+
+  /// Constructs the layout from a leading dimension (defaults to 0)
+  CUTLASS_HOST_DEVICE
+  RowMajorInterleaved(LongIndex ldm = 0): stride_(ldm) { }
+  
+  /// Constructs the layout from a stride vector
+  CUTLASS_HOST_DEVICE
+  RowMajorInterleaved(Stride stride): stride_(stride) { }
+
+  /// Returns a tightly packed layout: stride = columns * kInterleave (widened to 64 bits before multiplying)
+  CUTLASS_HOST_DEVICE
+  static RowMajorInterleaved packed(MatrixCoord const &extent) {
+    return RowMajorInterleaved(LongIndex(extent.column()) * kInterleave);
+  }
+
+  /// Returns the linear offset of `coord`: band * stride + column * kInterleave + row-within-band.
+  /// Assumes coordinate has convention (row, column)
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(MatrixCoord const &coord) const {
+    Index row_major = coord.row() / kInterleave;
+    Index row_minor = coord.row() % kInterleave;
+    return LongIndex(row_major) * LongIndex(stride_[0]) + LongIndex(coord.column()) * kInterleave + row_minor;
+  }
+
+  /// Maps a linear offset back to a (row, column) coordinate; the stride must be non-zero
+  CUTLASS_HOST_DEVICE
+  MatrixCoord inverse(LongIndex offset) const {
+
+    Index row_major = Index(offset / stride_[0]);
+    Index residual = Index(offset % stride_[0]);
+
+    Index column = residual / kInterleave;
+    Index row_minor =  residual % kInterleave;
+
+    return MatrixCoord(row_major * kInterleave + row_minor, column);
+  }
+
+  /// Returns a copy of the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return stride_;
+  }
+
+  /// Returns a mutable reference to the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return stride_;
+  }
+
+  /// Returns the stride component at index `idx`
+  CUTLASS_HOST_DEVICE
+  typename Stride::Index stride(int idx) const {
+    return stride_[idx];
+  }
+
+  /// Returns a mutable reference to the stride component at index `idx`
+  CUTLASS_HOST_DEVICE
+  typename Stride::Index & stride(int idx) {
+    return stride_[idx];
+  }
+
+  /// Returns the number of contiguous elements needed to store `extent`: ceil(rows / kInterleave) * stride
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(MatrixCoord const &extent) const {
+    return (extent.row() + kInterleave - 1) / kInterleave * stride_[0];
+  }
+};
+
+/// Mapping function for interleaved matrices. Columns are grouped into bands of kInterleave columns;
+/// bands are laid out column-major and, inside a band, each row stores its kInterleave columns contiguously.
+template <int Interleave>
+struct ColumnMajorInterleaved {
+  
+  /// Logical rank of the tensor (a matrix has two dimensions)
+  static int const kRank = 2;
+
+  /// Rank of the stride vector (only the band-to-band stride is stored)
+  static int const kStrideRank = 1;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for linear offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate type, accessed as (row, column)
+  using TensorCoord = MatrixCoord;
+
+  /// Stride vector type
+  using Stride = Coord<kStrideRank, LongIndex>;
+
+  /// Number of columns interleaved within one band
+  static int const kInterleave = Interleave;
+
+private:
+  //
+  // Data members
+  //
+
+  /// Factor applied to the band index (column / kInterleave) in operator()
+  Stride stride_;
+
+public:
+  //
+  // Methods
+  //
+
+  /// Constructs the layout from a leading dimension (defaults to 0)
+  CUTLASS_HOST_DEVICE
+  ColumnMajorInterleaved(LongIndex ldm = 0): stride_(ldm) { }
+  
+  /// Constructs the layout from a stride vector
+  CUTLASS_HOST_DEVICE
+  ColumnMajorInterleaved(Stride stride): stride_(stride) { }
+
+
+  /// Returns a tightly packed layout: stride = rows * kInterleave (widened to 64 bits before multiplying)
+  CUTLASS_HOST_DEVICE
+  static ColumnMajorInterleaved packed(MatrixCoord const &extent) {
+    return ColumnMajorInterleaved(LongIndex(extent.row()) * kInterleave);
+  }
+
+  /// Returns the linear offset of `coord`: band * stride + row * kInterleave + column-within-band.
+  /// Assumes coordinate has convention (row, column)
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(MatrixCoord const &coord) const {
+    Index column_major = coord.column() / kInterleave;
+    Index column_minor = coord.column() % kInterleave;
+    return LongIndex(column_major) * LongIndex(stride_[0]) + LongIndex(coord.row()) * kInterleave + column_minor;
+  }
+
+  /// Maps a linear offset back to a (row, column) coordinate; the stride must be non-zero
+  CUTLASS_HOST_DEVICE
+  MatrixCoord inverse(LongIndex offset) const {
+
+    Index column_major = Index(offset / stride_[0]);
+    Index residual = Index(offset % stride_[0]);
+
+    Index row = residual / kInterleave;
+    Index column_minor =  residual % kInterleave;
+
+    return MatrixCoord(row, column_major * kInterleave + column_minor);
+  }
+
+  /// Returns a copy of the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return stride_;
+  }
+
+  /// Returns a mutable reference to the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return stride_;
+  }
+
+  /// Returns the stride component at index `idx`
+  CUTLASS_HOST_DEVICE
+  typename Stride::Index stride(int idx) const {
+    return stride_[idx];
+  }
+
+  /// Returns a mutable reference to the stride component at index `idx`
+  CUTLASS_HOST_DEVICE
+  typename Stride::Index & stride(int idx) {
+    return stride_[idx];
+  }
+
+  /// Returns the number of contiguous elements needed to store `extent`: ceil(columns / kInterleave) * stride
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(MatrixCoord const &extent) const {
+    return (extent.column() + kInterleave - 1) / kInterleave * stride_[0];
+  }
+};
+
+/// Tag naming the two canonical pitch-linear matrix layouts; stored by layouts that choose at runtime
+enum class Matrix {
+  kColumnMajor,       ///< leading dimension is the stride between columns; consecutive rows are adjacent
+  kRowMajor           ///< leading dimension is the stride between rows; consecutive columns are adjacent
+};
+
+/// Mapping function for matrices that are row-major or column-major, where the choice between
+/// the two is stored in a `Matrix` tag and is therefore only known at runtime.
+struct ContiguousMatrix {
+
+  /// Logical rank of the tensor (a matrix has two dimensions)
+  static int const kRank = 2;
+
+  /// Rank of the stride vector (only the leading dimension is stored)
+  static int const kStrideRank = 1;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for linear offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate type, accessed as (row, column)
+  using TensorCoord = MatrixCoord;
+
+  /// Stride vector type
+  using Stride = Coord<kStrideRank, LongIndex>;
+
+private:
+  //
+  // Data members
+  //
+
+  /// Leading dimension; multiplies the column or the row index depending on layout_
+  Stride stride_;
+
+  /// Runtime tag selecting column-major or row-major addressing
+  Matrix layout_;
+
+public:
+  //
+  // Methods
+  //
+
+  /// Constructs from a leading dimension and a layout tag (defaults: 0, column-major)
+  CUTLASS_HOST_DEVICE
+  ContiguousMatrix(
+    Index ldm = 0,   // note: 32-bit Index here, although the stored stride is a 64-bit LongIndex
+    Matrix layout = Matrix::kColumnMajor
+  ):
+    stride_(ldm), layout_(layout) { }
+
+  /// Returns a tightly packed layout: ldm = rows for column-major, columns for row-major
+  CUTLASS_HOST_DEVICE
+  static ContiguousMatrix packed(
+    MatrixCoord const &extent,
+    Matrix layout = Matrix::kColumnMajor) {
+
+    Index ldm = 0;  // stays 0 if `layout` matches neither enumerator
+    if (layout == Matrix::kColumnMajor) {
+      ldm = extent.row();
+    }
+    else if (layout == Matrix::kRowMajor) {
+      ldm = extent.column();
+    }
+    return ContiguousMatrix(ldm, layout);
+  }
+
+  /// Returns the linear offset of `coord` using the addressing selected by layout_.
+  /// Assumes coordinate has convention (row, column)
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(MatrixCoord const &coord) const {
+    if (layout_ == Matrix::kColumnMajor) {
+      return coord.row() + coord.column() * stride_[0];
+    }
+    else if (layout_ == Matrix::kRowMajor) {
+      return coord.row() * stride_[0] + coord.column();
+    }
+    else {
+      // degenerate case: layout_ holds neither enumerator
+      return 0;
+    }
+  }
+
+  /// Not implemented: ignores `offset` and always returns the coordinate (0, 0)
+  CUTLASS_HOST_DEVICE
+  MatrixCoord inverse(LongIndex offset) const {
+    CUTLASS_UNUSED(offset);
+    return MatrixCoord(0, 0);
+  }
+
+  /// Returns a copy of the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return stride_;
+  }
+
+  /// Returns a mutable reference to the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return stride_;
+  }
+
+  /// Returns the stride component at index `idx`
+  CUTLASS_HOST_DEVICE
+  typename Stride::Index stride(int idx) const {
+    return stride_[idx];
+  }
+
+  /// Returns a mutable reference to the stride component at index `idx`
+  CUTLASS_HOST_DEVICE
+  typename Stride::Index & stride(int idx) {
+    return stride_[idx];
+  }
+
+  /// Returns the number of contiguous elements needed to store `extent`: stride * columns or stride * rows
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(MatrixCoord const &extent) const {
+    if (layout_ == Matrix::kColumnMajor) {
+      return stride_[0] * extent.column();
+    }
+    else if (layout_ == Matrix::kRowMajor) {
+      return stride_[0] * extent.row();
+    }
+    else {
+      // degenerate case: layout_ holds neither enumerator
+      return 0;
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Affine mapping function of rank `Rank`: each dimension has its own stride and offset = dot(coord, stride).
+template <int Rank>
+struct AffineRankN {
+
+  /// Logical rank of the tensor
+  static int const kRank = Rank;
+
+  /// Rank of the stride vector (one stride per dimension)
+  static int const kStrideRank = kRank;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for linear offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate type
+  using TensorCoord = Coord<kRank, Index>;
+
+  /// Stride vector type
+  using Stride = Coord<kStrideRank, LongIndex>;
+
+private:
+  //
+  // Data members
+  //
+
+  /// Per-dimension strides
+  Stride stride_;
+
+public:
+  //
+  // Methods
+  //
+
+  /// Constructs the layout from a full stride vector (defaults to a default-constructed Stride)
+  CUTLASS_HOST_DEVICE
+  AffineRankN(
+    Stride const &stride = Stride()
+  ):
+    stride_(stride) { }
+
+  /// Constructs the layout from two half-rank stride vectors, stored as [stride_m..., stride_n...]
+  CUTLASS_HOST_DEVICE
+  AffineRankN(
+    Coord<kRank/2, LongIndex> const &stride_m,
+    Coord<kRank/2, LongIndex> const &stride_n
+  ) { 
+
+    // Concatenate the strides
+    CUTLASS_PRAGMA_UNROLL
+    for (int m = 0; m < kRank/2; ++m) {
+      stride_[m] = stride_m[m];
+    }
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int n = 0; n < kRank/2; ++n) {
+      stride_[n + kRank/2] = stride_n[n];
+    }
+  }
+
+  /// Rank-2 convenience constructor: stride_[0] = stride_m, stride_[1] = stride_n
+  CUTLASS_HOST_DEVICE
+  AffineRankN(
+    LongIndex const &stride_m,
+    LongIndex const &stride_n
+  ) { 
+      stride_[0] = stride_m;
+      stride_[1] = stride_n;
+  }
+
+  /// Rank-2 convenience constructor: stride_[0] = stride, stride_[1] = 1
+  CUTLASS_HOST_DEVICE
+  AffineRankN(
+    LongIndex const &stride
+  ) { 
+      stride_[0] = stride;
+      stride_[1] = 1;
+  }
+
+  /// Returns a tightly packed layout: the last dimension has stride 1 and each earlier
+  /// stride is the next dimension's stride multiplied by the next dimension's extent
+  CUTLASS_HOST_DEVICE
+  static AffineRankN packed(TensorCoord const &extent) {
+    AffineRankN layout;
+    layout.stride_[kRank - 1] = 1;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = kRank - 1; i > 0; --i) {
+      layout.stride_[i - 1] = layout.stride_[i] * extent[i];
+    }
+
+    return layout;
+  }
+
+  /// Returns the offset of a coordinate in linear memory,
+  /// computed as the dot product of the coordinate with the stride vector
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+    return dot(coord, stride_);
+  }
+
+  /// Not implemented: ignores `offset` and always returns a default-constructed coordinate
+  CUTLASS_HOST_DEVICE
+  TensorCoord inverse(LongIndex offset) const {
+    CUTLASS_UNUSED(offset);  // silences unused-parameter warnings, as the sibling layouts do
+    return TensorCoord();
+  }
+
+  /// Returns a copy of the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return stride_;
+  }
+
+  /// Returns a mutable reference to the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return stride_;
+  }
+
+  /// Returns the stride component at index `idx`
+  CUTLASS_HOST_DEVICE
+  typename Stride::Index stride(int idx) const {
+    return stride_[idx];
+  }
+
+  /// Returns a mutable reference to the stride component at index `idx`
+  CUTLASS_HOST_DEVICE
+  typename Stride::Index & stride(int idx) {
+    return stride_[idx];
+  }
+
+  /// Returns extent * stride for the dimension chosen by stride_.max_dim_index() (presumably the largest stride)
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &extent) const {
+    int idx = stride_.max_dim_index();
+    return extent[idx] * stride_[idx];
+  }
+};
+
+/// Mapping function for matrices whose rows and columns each have an explicit stride.
+/// Row stride is smaller than column stride in AffineRank2ColumnMajor.
+struct AffineRank2ColumnMajor {
+
+  /// Logical rank of the tensor (a matrix has two dimensions)
+  static int const kRank = 2;
+
+  /// Rank of the stride vector (row stride and column stride)
+  static int const kStrideRank = 2;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for linear offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate type, accessed as (row, column)
+  using TensorCoord = MatrixCoord;
+
+  /// Stride vector type
+  using Stride = Coord<kStrideRank, LongIndex>;
+
+private:
+  //
+  // Data members
+  //
+
+  /// Strides: stride_[0] applies to the row index, stride_[1] to the column index
+  Stride stride_;
+
+public:
+  //
+  // Methods
+  //
+
+  /// Constructs the layout from a full stride vector (defaults to a default-constructed Stride)
+  CUTLASS_HOST_DEVICE
+  AffineRank2ColumnMajor(
+    Stride const &stride = Stride()
+  ):
+    stride_(stride) { }
+
+  /// Constructs the layout from explicit row and column strides
+  CUTLASS_HOST_DEVICE
+  AffineRank2ColumnMajor(
+    LongIndex row_stride,           ///< stride between elements in consecutive rows
+    LongIndex column_stride         ///< stride between elements in consecutive columns
+  )
+    { stride_[0] = row_stride; stride_[1] = column_stride;}
+
+  /// Constructs the layout from a column stride only; the row stride is set to 1
+  CUTLASS_HOST_DEVICE
+  AffineRank2ColumnMajor(
+    LongIndex stride
+  )
+    { stride_[0] = 1; stride_[1] = stride;}
+
+  /// Returns a tightly packed layout: row stride 1, column stride = rows of `extent`
+  CUTLASS_HOST_DEVICE
+  static AffineRank2ColumnMajor packed(MatrixCoord const &extent) {
+    return AffineRank2ColumnMajor(1, extent.row());
+  }
+
+  /// Returns the linear offset of `coord` as the dot product of the coordinate and the strides.
+  /// Assumes coordinate has convention (row, column)
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(MatrixCoord const &coord) const {
+    return dot(coord, stride_);
+  }
+
+  /// Not implemented: ignores `offset` and always returns the coordinate (0, 0)
+  CUTLASS_HOST_DEVICE
+  MatrixCoord inverse(LongIndex offset) const {
+    CUTLASS_UNUSED(offset);
+    return MatrixCoord(0, 0);
+  }
+
+  /// Returns a copy of the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return stride_;
+  }
+
+  /// Returns a mutable reference to the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return stride_;
+  }
+
+  /// Returns the stride component at index `idx`
+  CUTLASS_HOST_DEVICE
+  typename Stride::Index stride(int idx) const {
+    return stride_[idx];
+  }
+
+  /// Returns a mutable reference to the stride component at index `idx`
+  CUTLASS_HOST_DEVICE
+  typename Stride::Index & stride(int idx) {
+    return stride_[idx];
+  }
+
+  /// Returns the number of contiguous elements needed to store `extent`: columns * column stride
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(MatrixCoord const &extent) const {
+    return extent.column() * stride_[1];
+  }
+};
+
+/// Mapping function for matrices whose rows and columns each have an explicit stride.
+/// Column stride is smaller than row stride in AffineRank2RowMajor.
+struct AffineRank2RowMajor {
+
+  /// Logical rank of the tensor (a matrix has two dimensions)
+  static int const kRank = 2;
+
+  /// Rank of the stride vector (row stride and column stride)
+  static int const kStrideRank = 2;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for linear offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate type, accessed as (row, column)
+  using TensorCoord = MatrixCoord;
+
+  /// Stride vector type
+  using Stride = Coord<kStrideRank, LongIndex>;
+
+private:
+  //
+  // Data members
+  //
+
+  /// Strides: stride_[0] applies to the row index, stride_[1] to the column index
+  Stride stride_;
+
+public:
+  //
+  // Methods
+  //
+
+  /// Constructs the layout from a full stride vector (defaults to a default-constructed Stride)
+  CUTLASS_HOST_DEVICE
+  AffineRank2RowMajor(
+    Stride const &stride = Stride()
+  ):
+    stride_(stride) { }
+
+  /// Constructs the layout from explicit row and column strides
+  CUTLASS_HOST_DEVICE
+  AffineRank2RowMajor(
+    LongIndex row_stride,           ///< stride between elements in consecutive rows
+    LongIndex column_stride         ///< stride between elements in consecutive columns
+  ) { stride_[0] = row_stride; stride_[1] = column_stride;}
+
+  /// Constructs the layout from a row stride only; the column stride is set to 1
+  CUTLASS_HOST_DEVICE
+  AffineRank2RowMajor(
+    LongIndex stride
+  ) { stride_[0] = stride; stride_[1] = 1;}
+
+  /// Returns a tightly packed row-major layout: row stride = columns of `extent`, column stride 1
+  CUTLASS_HOST_DEVICE
+  static AffineRank2RowMajor packed(MatrixCoord const &extent) {
+    return AffineRank2RowMajor(extent.column(), 1);
+  }
+
+  /// Returns the linear offset of `coord` as the dot product of the coordinate and the strides.
+  /// Assumes coordinate has convention (row, column)
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(MatrixCoord const &coord) const {
+    return dot(coord, stride_);
+  }
+
+  /// Not implemented: ignores `offset` and always returns the coordinate (0, 0)
+  CUTLASS_HOST_DEVICE
+  MatrixCoord inverse(LongIndex offset) const {
+    CUTLASS_UNUSED(offset);
+    return MatrixCoord(0, 0);
+  }
+
+  /// Returns a copy of the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return stride_;
+  }
+
+  /// Returns a mutable reference to the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return stride_;
+  }
+
+  /// Returns the stride component at index `idx`
+  CUTLASS_HOST_DEVICE
+  typename Stride::Index stride(int idx) const {
+    return stride_[idx];
+  }
+
+  /// Returns a mutable reference to the stride component at index `idx`
+  CUTLASS_HOST_DEVICE
+  typename Stride::Index & stride(int idx) {
+    return stride_[idx];
+  }
+
+  /// Returns the number of contiguous elements needed to store `extent`: rows * row stride
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(MatrixCoord const &extent) const {
+    return extent.row() * stride_[0];
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Utility functions to convert stride_factor to the strides used by the Affine2 layout.
+//
+// stride_factor is the logical distance between two coordinates.
+//
+// All coordinates used here are matrix coordinates.  stride[0] and extent[0] are for the
+// rows.  stride[1] and extent[1] are for the columns.
+template <typename Affine2Layout>  // primary template: used when no specialization matches the layout
+  struct Affine2Layout_Factory {
+  CUTLASS_HOST_DEVICE
+  static Affine2Layout layout_factory(cutlass::Coord<2> const &extent, typename Affine2Layout::Stride stride_factor) {
+    return Affine2Layout::packed(extent);  // stride_factor is not consulted; the packed layout is returned
+  }
+};
+
+template <>  // specialization for AffineRank2ColumnMajor
+struct Affine2Layout_Factory<cutlass::layout::AffineRank2ColumnMajor> {
+CUTLASS_HOST_DEVICE
+static cutlass::layout::AffineRank2ColumnMajor layout_factory(
+  cutlass::Coord<2> const &extent,  // matrix extent; extent[0] is the row count
+  typename cutlass::layout::AffineRank2ColumnMajor::Stride stride_factor) {
+    // row stride = stride_factor[0]; column stride = stride_factor[0] * stride_factor[1] * extent[0]
+  }
+};
+
+template <>  // specialization for AffineRank2RowMajor
+struct Affine2Layout_Factory<cutlass::layout::AffineRank2RowMajor> {
+CUTLASS_HOST_DEVICE
+static cutlass::layout::AffineRank2RowMajor layout_factory(
+  cutlass::Coord<2> const &extent,  // matrix extent; extent[1] is the column count
+  typename cutlass::layout::AffineRank2RowMajor::Stride stride_factor) {  // row stride = sf[0] * sf[1] * extent[1]; column stride = sf[1]
+    return cutlass::layout::AffineRank2RowMajor({ stride_factor[0] * stride_factor[1] * extent[1], stride_factor[1] });
+  }
+};
+
+// AffineRankN<2> is given the same strides as AffineRank2ColumnMajor: (sf[0], sf[0] * sf[1] * extent[0]).
+template <>
+struct Affine2Layout_Factory<cutlass::layout::AffineRankN<2>> {
+CUTLASS_HOST_DEVICE
+static cutlass::layout::AffineRankN<2> layout_factory(
+  cutlass::Coord<2> const &extent,  // matrix extent; extent[0] is the row count
+  typename cutlass::layout::AffineRankN<2>::Stride stride_factor) {
+    return cutlass::layout::AffineRankN<2>({ stride_factor[0], stride_factor[0] * stride_factor[1] * extent[0] });
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Mapping function for block-linear matrices. Matrix is structured
+/// as column-major arrangement of 2D tiles (that are column-major).
+template <int BlockRows, int BlockColumns>
+struct ColumnMajorBlockLinear {
+  /// Logical rank of the tensor (a matrix has two dimensions)
+  static int const kRank = 2;
+
+  /// Rank of the stride vector (only the stride between tile columns is stored)
+  static int const kStrideRank = 1;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for linear offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate type, accessed as (row, column)
+  using TensorCoord = MatrixCoord;
+
+  /// Stride vector type
+  using Stride = Coord<kStrideRank, LongIndex>;
+
+  /// Size of a block in rows
+  static int const kBlockRows = BlockRows;
+
+  /// Size of a block in columns
+  static int const kBlockColumns = BlockColumns;
+
+private:
+  //
+  // Data members
+  //
+
+  /// Factor applied to the tile-column index (column / kBlockColumns) in operator()
+  Stride stride_;
+
+public:
+  //
+  // Methods
+  //
+
+  /// Constructs the layout from a leading dimension (defaults to 0)
+  CUTLASS_HOST_DEVICE
+  ColumnMajorBlockLinear(Index ldm = 0): stride_(ldm) { }
+
+  /// Returns the packed layout helper: stride = rows * kBlockRows * kBlockColumns
+  CUTLASS_HOST_DEVICE
+  static ColumnMajorBlockLinear packed(MatrixCoord const &extent) {
+    return ColumnMajorBlockLinear(extent.row() * kBlockRows * kBlockColumns);
+  }
+
+  /// Returns the offset of a coordinate in linear memory.
+  /// Assumes coordinate has convention (row, column)
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(MatrixCoord const &coord) const {
+    return 
+      (coord.row() % kBlockRows) +                                // row within the tile
+      (coord.column() % kBlockColumns) * kBlockRows +             // column within the tile
+      (coord.row() / kBlockRows) * kBlockRows * kBlockColumns +   // tile index along rows
+      (coord.column() / kBlockColumns) * stride_[0];              // tile index along columns
+  }
+
+  /// Not implemented: ignores `offset` and always returns the coordinate (0, 0)
+  CUTLASS_HOST_DEVICE
+  MatrixCoord inverse(LongIndex offset) const {
+    CUTLASS_UNUSED(offset);  // silences unused-parameter warnings, as the sibling layouts do
+    return MatrixCoord(0, 0);
+  }
+
+  /// Returns a copy of the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return stride_;
+  }
+
+  /// Returns a mutable reference to the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return stride_;
+  }
+
+  /// Returns the stride component at index `idx`
+  CUTLASS_HOST_DEVICE
+  typename Stride::Index stride(int idx) const {
+    return stride_[idx];
+  }
+
+  /// Returns a mutable reference to the stride component at index `idx`
+  CUTLASS_HOST_DEVICE
+  typename Stride::Index & stride(int idx) {
+    return stride_[idx];
+  }
+
+  /// Returns the number of contiguous elements needed to store `extent`: ceil(columns / kBlockColumns) * stride
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(MatrixCoord const &extent) const {
+    return (extent.column() + kBlockColumns - 1) / kBlockColumns * stride_[0];
+  }
+};
+
+/// Mapping function for block-linear matrices. Matrix is structured
+/// as row-major arrangement of 2D tiles (that are row-major)
+template <int BlockRows, int BlockColumns>
+struct RowMajorBlockLinear {
+  /// Logical rank of the tensor (a matrix has two dimensions)
+  static int const kRank = 2;
+
+  /// Rank of the stride vector (only the stride between tile rows is stored)
+  static int const kStrideRank = 1;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for linear offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate type, accessed as (row, column)
+  using TensorCoord = MatrixCoord;
+
+  /// Stride vector type
+  using Stride = Coord<kStrideRank, LongIndex>;
+
+  /// Size of a block in rows
+  static int const kBlockRows = BlockRows;
+
+  /// Size of a block in columns
+  static int const kBlockColumns = BlockColumns;
+
+private:
+  //
+  // Data members
+  //
+
+  /// Factor applied to the tile-row index (row / kBlockRows) in operator()
+  Stride stride_;
+
+public:
+  //
+  // Methods
+  //
+
+  /// Constructs the layout from a leading dimension (defaults to 0)
+  CUTLASS_HOST_DEVICE
+  RowMajorBlockLinear(Index ldm = 0): stride_(ldm) { }
+
+  /// Returns the packed layout helper: stride = columns * kBlockRows * kBlockColumns
+  CUTLASS_HOST_DEVICE
+  static RowMajorBlockLinear packed(MatrixCoord const &extent) {
+    return RowMajorBlockLinear(extent.column() * kBlockRows * kBlockColumns);
+  }
+
+  /// Returns the offset in linear memory of `coord`, which has convention (row, column)
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(MatrixCoord const &coord) const {
+    return 
+      (coord.column() % kBlockColumns) +                              // column within the tile
+      (coord.row() % kBlockRows) * kBlockColumns +                    // row within the tile
+      (coord.column() / kBlockColumns) * kBlockRows * kBlockColumns + // tile index along columns
+      (coord.row() / kBlockRows) * stride_[0];                        // tile index along rows
+  }
+
+  /// Not implemented: ignores `offset` and always returns the coordinate (0, 0)
+  CUTLASS_HOST_DEVICE
+  MatrixCoord inverse(LongIndex offset) const {
+    CUTLASS_UNUSED(offset);  // silences unused-parameter warnings, as the sibling layouts do
+    return MatrixCoord(0, 0);
+  }
+
+  /// Returns a copy of the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return stride_;
+  }
+
+  /// Returns a mutable reference to the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return stride_;
+  }
+
+  /// Returns the stride component at index `idx`
+  CUTLASS_HOST_DEVICE
+  typename Stride::Index stride(int idx) const {
+    return stride_[idx];
+  }
+
+  /// Returns a mutable reference to the stride component at index `idx`
+  CUTLASS_HOST_DEVICE
+  typename Stride::Index & stride(int idx) {
+    return stride_[idx];
+  }
+  
+  /// Returns the number of contiguous elements needed to store `extent`: ceil(rows / kBlockRows) * stride
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(MatrixCoord const &extent) const {
+    return (extent.row() + kBlockRows - 1) / kBlockRows * stride_[0];
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct GeneralMatrix {
+  // Matrix layout whose orientation is chosen at run time through the stored layout id.
+  /// Logical rank of tensor
+  static int const kRank = 2;
+
+  /// Rank of stride vector
+  static int const kStrideRank = 2;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate
+  using TensorCoord = MatrixCoord;
+
+  /// Stride vector
+  using Stride = Coord<kStrideRank, Index>;
+
+private:
+  //
+  // Data members
+  //
+  /// Layout selector: Matrix::kRowMajor takes one branch in the methods below, any other value the other
+  Matrix layout_id_;
+
+  /// Stride data member: stride_[0] holds ldm, stride_[1] holds the interleave factor (see the constructors)
+  Stride stride_;
+
+public:
+  //
+  // Methods
+  //
+
+  /// Default ctor: column-major layout id with strides (ldm, interleave) = (0, 1)
+  CUTLASS_HOST_DEVICE
+  GeneralMatrix(): layout_id_(Matrix::kColumnMajor), stride_(make_Coord(0, 1)) { }
+
+  /// Ctor from an explicit layout id, leading dimension `ldm` and interleave factor
+  CUTLASS_HOST_DEVICE
+  GeneralMatrix(
+    Matrix layout_id, 
+    Index ldm, 
+    Index interleave): layout_id_(layout_id), stride_(make_Coord(ldm, interleave)) { }
+
+  /// Helper returns a layout to a tightly packed tensor (ldm = c * interleave, with c chosen below)
+  CUTLASS_HOST_DEVICE
+  static GeneralMatrix packed(
+    MatrixCoord const &extent, 
+    Matrix layout_id = Matrix::kColumnMajor, 
+    Index interleave = 1) {
+    // c is the column count for row-major and the row count for every other layout id
+    Index c;
+    if (layout_id == Matrix::kRowMajor) {
+      c = extent.column();
+    }
+    else {
+      c = extent.row();
+    }
+
+    Index ldm = c * interleave;
+
+    return GeneralMatrix(layout_id, ldm, interleave);
+  }
+
+  /// Returns the offset of a coordinate in linear memory.
+  /// Assumes coordinate has convention (row, column); c scales by the interleave, s by ldm per group
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(MatrixCoord const &coord) const {
+    Index c, s;
+    if (layout_id_ == Matrix::kRowMajor) {
+      c = coord.column();
+      s = coord.row();
+    }
+    else {
+      s = coord.column();
+      c = coord.row();
+    }
+    // Split s into an interleave-group index (v) and the position inside that group (residual)
+    Index v = s / stride_[1];
+    Index residual = (s % stride_[1]);
+
+    return LongIndex(c) * LongIndex(stride_[1]) + LongIndex(v) * LongIndex(stride_[0]) + residual;
+  }
+
+  /// Returns a copy of the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return stride_;
+  }
+  /// Returns the layout id
+  CUTLASS_HOST_DEVICE
+  Matrix layout_id() const {
+    return layout_id_;
+  }
+
+  /// Returns a mutable reference to the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return stride_;
+  }
+  /// Returns a mutable reference to the layout id
+  CUTLASS_HOST_DEVICE
+  Matrix & layout_id() {
+    return layout_id_;
+  }
+
+  /// Returns the stride component at position idx
+  CUTLASS_HOST_DEVICE
+  typename Stride::Index stride(int idx) const {
+    return stride_[idx];
+  }
+
+  /// Returns a mutable reference to the stride component at position idx
+  CUTLASS_HOST_DEVICE
+  typename Stride::Index & stride(int idx) {
+    return stride_[idx];
+  }
+  
+  /// Compute the number of contiguous elements needed to store a tensor with the given size
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(MatrixCoord const &extent) const {
+    Index s;
+    if (layout_id_ == Matrix::kRowMajor) {
+      s = extent.row();
+    }
+    else {
+      s = extent.column();
+    }
+    // v = s / interleave rounded up: number of interleave groups, each spanning stride_[0] elements
+    Index v = Index((s + stride_[1] - 1) / stride_[1]);
+    return LongIndex(v) * LongIndex(stride_[0]);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Trait mapping a matrix layout to the layout of its transpose through the nested `type` alias.
+template <typename Layout>
+struct LayoutTranspose;
+
+/// Transpose of row-major is column-major: LayoutTranspose<RowMajor>::type is ColumnMajor
+template <>
+struct LayoutTranspose<layout::RowMajor> {
+  using type = layout::ColumnMajor;
+};
+
+/// Transpose of column-major is row-major: LayoutTranspose<ColumnMajor>::type is RowMajor
+template <>
+struct LayoutTranspose<layout::ColumnMajor> {
+  using type = layout::RowMajor;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace layout
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/layout/permute.h b/lightllm-kernel/cutlass/include/cutlass/layout/permute.h
new file mode 100755
index 000000000..912eb2c8c
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/layout/permute.h
@@ -0,0 +1,828 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines layout functions used by GEMM+permute path for common tensor or matrix formats.
+
+    Like Layout functions, permute layout functions map logical coordinates to linear memory. They often require additional
+    data to describe strides between elements.
+
+    Permute layout functions must implement all members in the interface of NoPermute<> defined in this file. Address offset
+    computation lies in operator() with private member variables  {col_permute_, row_permute_ and stride_} as new addresses after permute op.
+*/
+#pragma once
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/cassert>
+#else
+#include "assert.h"
+#endif
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/coord.h"
+#include "cutlass/tensor_coord.h"
+
+namespace cutlass {
+namespace layout {
+
+// template<PermuteTag, typename Layout, bool Inverse>
+// struct PermuteSelect {
+//   // Try to give a reasonable error message to the user
+//   static_assert(!platform::is_same<Permute, Permute>::value, // aka always_false<T>
+//                 "You've tried to use a layout permutation for which the implementation is not available. "
+//                 "In order to provide an implementation for a particular combination of matrix layout "
+//                 "and direction (direct/inverse), please specialize PermuteSelect trait.");
+// };
+
+// Primary template of the InversePermute trait; usable inverses are supplied by specializations that define `type`
+template<typename Permute>
+struct InversePermute
+{
+  // The condition is always false but depends on Permute, so it only fires when this primary template is instantiated
+  static_assert(!platform::is_same<Permute, Permute>::value, // aka always_false<T>
+                "To apply permutation to a GEMM input operand (A or B), an inverse permutation for the desired "
+                "permute class must be defined and enabled by specializing cutlass::layout::InversePermute trait.");
+};
+
+class PermuteBase {  // Shared index typedefs; the permute classes in this file derive from it
+public:
+  /// Index type used for coordinates (32-bit signed)
+  using Index = int32_t;
+
+  /// Long index type used for offsets (64-bit signed)
+  using LongIndex = int64_t;
+};
+
+class NoPermute : public PermuteBase {
+public:
+  //
+  // Placeholder permute: both constructors ignore their arguments and both operator() overloads return 0.
+  //
+
+  /// Constructor from matrix extent (arguments are accepted for interface parity and ignored)
+  CUTLASS_HOST_DEVICE
+  NoPermute(MatrixCoord /*extent*/, Index /*stride*/) { }
+
+  /// Constructor from pitch-linear extent (arguments are accepted for interface parity and ignored)
+  CUTLASS_HOST_DEVICE
+  NoPermute(PitchLinearCoord /*extent*/, Index /*stride*/) { }
+
+  /// Computes the offset after Permute Op in logical elements
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(MatrixCoord /*coord*/) const { return 0; } // not correct but should never be called
+
+  /// Computes the offset after Permute Op in logical elements
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(PitchLinearCoord /*coord*/) const { return 0; } // not correct but should never be called
+};
+
+template<>
+struct InversePermute<NoPermute> {
+  using type = NoPermute;  // the inverse of the no-op permute is the no-op permute itself
+};
+
+/// Compile-time flag that is true exactly when Permute is cutlass::layout::NoPermute, i.e. the permute is a noop
+template<typename Permute>
+inline bool constexpr is_trivial_permute = platform::is_same<Permute, cutlass::layout::NoPermute>::value;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Defines permute layouts of various tensor formats.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//  Tensor4DPermute0213
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Permute layout function for 4-D permuted tensors with matrix (dimensions [M, N]) reshaped
+/// as [M/D1, D1, D2, N/D2]. Then perform permute([0, 2, 1, 3]) on the corresponding tensor.
+template <int D1, int D2>
+class Tensor4DPermute0213RowMajor : public PermuteBase {
+private:
+  //
+  // Data members
+  //
+  /// Innermost extent of the 4-D view: extent.column() / D2
+  Index D3_;
+  /// Stride applied to the permuted row index: stride * D1 / D2
+  Index stride_;
+  
+public:
+  //
+  // Methods
+  //
+
+  /// Constructor from the logical matrix extent and `stride` (presumably the row stride of the unpermuted matrix)
+  CUTLASS_HOST_DEVICE
+  Tensor4DPermute0213RowMajor(MatrixCoord extent, Index stride) {
+
+    assert(extent.row() % D1 == 0);
+    assert(extent.column() % D2 == 0);
+
+    D3_ = extent.column() / D2;
+
+    stride_ = stride * D1 / D2;
+  }
+
+  /// Constructor from a pitch-linear extent: strided becomes the row count, contiguous the column count
+  CUTLASS_HOST_DEVICE
+  Tensor4DPermute0213RowMajor(PitchLinearCoord extent, Index stride)
+  : Tensor4DPermute0213RowMajor(MatrixCoord(extent.strided(), extent.contiguous()), stride) {}
+  
+  /// Computes the offset after Permute Op in logical elements
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(MatrixCoord coord) const {
+
+    // [i,j,k,l] -> [i,k,j,l]
+    Index l = coord.column() % D3_;
+    Index k = coord.column() / D3_;
+    Index j = coord.row() % D1;
+    Index i = coord.row() / D1;
+    // Permuted matrix: rows are indexed by (i, k), columns by (j, l)
+    MatrixCoord permuted{k + i * D2, l + j * D3_};
+
+    return LongIndex(permuted.row()) * LongIndex(stride_) + LongIndex(permuted.column());
+  }
+
+  /// Pitch-linear overload: forwards to the MatrixCoord overload with (row, column) = (strided, contiguous)
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(PitchLinearCoord coord) const { 
+    return operator()(MatrixCoord(coord.strided(), coord.contiguous()));
+  }
+};
+
+// Inverse of Tensor4DPermute0213RowMajor<D1, D2>: the same layout function instantiated with D1 and D2 swapped
+template <int D1, int D2>
+class Tensor4DPermute0213RowMajorInverse : public Tensor4DPermute0213RowMajor<D2, D1> {
+public:
+  using Base = Tensor4DPermute0213RowMajor<D2, D1>;
+  using Base::Base;  // inherit the base class constructors
+};
+
+template<int D1, int D2>
+struct InversePermute<Tensor4DPermute0213RowMajor<D1, D2>> {
+  using type = Tensor4DPermute0213RowMajorInverse<D1, D2>;  // forward permute -> its inverse
+};
+
+template<int D1, int D2>
+struct InversePermute<Tensor4DPermute0213RowMajorInverse<D1, D2>> {
+  using type = Tensor4DPermute0213RowMajor<D1, D2>;  // inverse permute -> the forward permute
+};
+
+/// Permute layout function for 4-D permuted tensors with matrix (dimensions [M, N]) reshaped
+/// as [M/D1, D1, D2, N/D2]. Then perform permute([0, 2, 1, 3]) on the corresponding tensor.
+template <int D1, int D2>
+class Tensor4DPermute0213ColumnMajor : public PermuteBase {
+private:
+  //
+  // Data members
+  //
+  /// Extent derived from the rows: extent.row() / D1
+  Index D0_;
+  /// Stride applied to the permuted column index: stride * D2 / D1
+  Index stride_;
+  
+public:
+  //
+  // Methods
+  //
+
+  /// Constructor from the logical matrix extent and `stride` (presumably the column stride of the unpermuted matrix)
+  CUTLASS_HOST_DEVICE
+  Tensor4DPermute0213ColumnMajor(MatrixCoord extent, Index stride) {
+
+    assert(extent.row() % D1 == 0);
+    assert(extent.column() % D2 == 0);
+
+    D0_ = extent.row() / D1;
+
+    stride_ = stride * D2 / D1;
+  }
+
+  /// Constructor from a pitch-linear extent: contiguous becomes the row count, strided the column count
+  CUTLASS_HOST_DEVICE
+  Tensor4DPermute0213ColumnMajor(PitchLinearCoord extent, Index stride)
+  : Tensor4DPermute0213ColumnMajor(MatrixCoord(extent.contiguous(), extent.strided()), stride) {}
+  
+  /// Computes the offset after Permute Op in logical elements
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(MatrixCoord coord) const {
+
+    // [i,j,k,l] -> [i,k,j,l]
+    Index l = coord.column() / D2;
+    Index k = coord.column() % D2;
+    Index j = coord.row() / D0_;
+    Index i = coord.row() % D0_;
+    // Permuted matrix: rows are indexed by (k, i) with i fastest, columns by (l, j) with j fastest
+    MatrixCoord permuted{i + k * D0_, j + l * D1};
+
+    return LongIndex(permuted.row()) + LongIndex(permuted.column()) * LongIndex(stride_);
+  }
+
+  /// Pitch-linear overload: forwards to the MatrixCoord overload with (row, column) = (contiguous, strided)
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(PitchLinearCoord coord) const { 
+    return operator()(MatrixCoord(coord.contiguous(), coord.strided()));
+  }
+};
+
+// Inverse of Tensor4DPermute0213ColumnMajor<D1, D2>: the same layout function instantiated with D1 and D2 swapped
+template <int D1, int D2>
+class Tensor4DPermute0213ColumnMajorInverse : public Tensor4DPermute0213ColumnMajor<D2, D1> {
+public:
+  using Base = Tensor4DPermute0213ColumnMajor<D2, D1>;
+  using Base::Base;  // inherit the base class constructors
+};
+
+template<int D1, int D2>
+struct InversePermute<Tensor4DPermute0213ColumnMajor<D1, D2>> {
+  using type = Tensor4DPermute0213ColumnMajorInverse<D1, D2>;  // forward permute -> its inverse
+};
+
+template<int D1, int D2>
+struct InversePermute<Tensor4DPermute0213ColumnMajorInverse<D1, D2>> {
+  using type = Tensor4DPermute0213ColumnMajor<D1, D2>;  // inverse permute -> the forward permute
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//  Tensor4DPermuteBMM0213
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Permute layout function for 4-D permuted tensors for BMM with BMM tensor (dimensions [B, M, N]) reshaped
+/// as [B/D1, D1, M, N]. Then perform permute([0, 2, 1, 3]) on the corresponding whole BMM tensor.
+template <int D1>
+class Tensor4DPermuteBMM0213RowMajor : public PermuteBase {
+private:
+  //
+  // Data members
+  //
+  /// Column extent of one batch matrix (extent.column())
+  Index D3_;
+  /// Stride applied to the permuted row index: stride * D1
+  Index stride_;
+  /// Offset between permuted batches; kept in 64 bits because rows * stride_ can exceed the int32 range
+  LongIndex batch_stride_;
+  
+public:
+  //
+  // Methods
+  //
+
+  /// Constructor from the extent of one batch matrix and `stride` (presumably its row stride before the permute)
+  CUTLASS_HOST_DEVICE
+  Tensor4DPermuteBMM0213RowMajor(MatrixCoord extent, Index stride) {
+
+    Index D2 = extent.row();
+    D3_ = extent.column();
+    // The batch stride is formed in 64 bits so that D2 * stride_ cannot overflow a 32-bit Index
+    stride_ = stride * D1;
+    batch_stride_ = LongIndex(D2) * LongIndex(stride_);
+  }
+
+  /// Constructor from a pitch-linear extent: strided becomes the row count, contiguous the column count
+  CUTLASS_HOST_DEVICE
+  Tensor4DPermuteBMM0213RowMajor(PitchLinearCoord extent, Index stride)
+  : Tensor4DPermuteBMM0213RowMajor(MatrixCoord(extent.strided(), extent.contiguous()), stride) {}
+  
+  /// Computes the offset after Permute Op in logical elements; the batch index is read from blockIdx.z
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(MatrixCoord coord) const {
+
+    // The batch index for BMM
+    Index BMM_batch_idx = blockIdx.z;
+    
+    // [i,j,k,l] -> [i,k,j,l]
+    Index l = coord.column();
+    Index k = coord.row();
+    Index j = BMM_batch_idx % D1;
+    Index i = BMM_batch_idx / D1;
+    // j selects a block of D3_ columns inside the permuted matrix of batch i
+    Index pbatch = i;
+    MatrixCoord pcoord{k, l + j * D3_};
+
+    return pbatch * batch_stride_ + pcoord.row() * LongIndex(stride_) + pcoord.column();
+  }
+
+  /// Pitch-linear overload: forwards to the MatrixCoord overload with (row, column) = (strided, contiguous)
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(PitchLinearCoord coord) const { 
+    return operator()(MatrixCoord(coord.strided(), coord.contiguous()));
+  }
+};
+
+template <int D1>
+class Tensor4DPermuteBMM0213RowMajorInverse : public PermuteBase {
+private:
+  // Inverse of Tensor4DPermuteBMM0213RowMajor<D1> (paired through the InversePermute trait).
+  // Data members
+  //
+  /// Column extent of one original batch matrix: extent.column() / D1
+  Index D3_;
+  /// Stride applied to the original row index: stride / D1
+  Index stride_;
+  /// Offset between original batches; kept in 64 bits because rows * stride_ can exceed the int32 range
+  LongIndex batch_stride_;
+  
+public:
+  //
+  // Methods
+  //
+
+  /// Constructor from the extent of one permuted batch matrix and its `stride`; columns must be a multiple of D1
+  CUTLASS_HOST_DEVICE
+  Tensor4DPermuteBMM0213RowMajorInverse(MatrixCoord extent, Index stride) {
+
+    assert(extent.column() % D1 == 0);
+
+    Index D2 = extent.row();
+    D3_ = extent.column() / D1;
+
+    stride_ = stride / D1;
+    // Formed in 64 bits so that D2 * stride_ cannot overflow a 32-bit Index
+    batch_stride_ = LongIndex(D2) * LongIndex(stride_);
+  }
+
+  /// Constructor from a pitch-linear extent: strided becomes the row count, contiguous the column count
+  CUTLASS_HOST_DEVICE
+  Tensor4DPermuteBMM0213RowMajorInverse(PitchLinearCoord extent, Index stride)
+  : Tensor4DPermuteBMM0213RowMajorInverse(MatrixCoord(extent.strided(), extent.contiguous()), stride) {}
+  
+  /// Computes the offset after Permute Op in logical elements; the batch index is read from blockIdx.z
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(MatrixCoord coord) const {
+
+    // The batch index for BMM
+    Index BMM_batch_idx = blockIdx.z;
+    
+    // The following assumes grouping [(D0)->batch, (D2)->row, (D1,D3)->col]
+    Index l = coord.column() % D3_;
+    Index j = coord.column() / D3_;
+    Index k = coord.row();
+    Index i = BMM_batch_idx;
+
+    // compute original [batch, row, col] index
+    Index pbatch = j + i * D1;
+    MatrixCoord pcoord{k, l};
+
+    return pbatch * batch_stride_ + pcoord.row() * LongIndex(stride_) + pcoord.column();
+  }
+
+  /// Pitch-linear overload: forwards to the MatrixCoord overload with (row, column) = (strided, contiguous)
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(PitchLinearCoord coord) const { 
+    return operator()(MatrixCoord(coord.strided(), coord.contiguous()));
+  }
+};
+
+template<int D1>
+struct InversePermute<Tensor4DPermuteBMM0213RowMajor<D1>> {
+  using type = Tensor4DPermuteBMM0213RowMajorInverse<D1>;  // forward permute -> its inverse
+};
+
+template<int D1>
+struct InversePermute<Tensor4DPermuteBMM0213RowMajorInverse<D1>> {
+  using type = Tensor4DPermuteBMM0213RowMajor<D1>;  // inverse permute -> the forward permute
+};
+
+/// Permute layout function for 4-D permuted tensors for BMM with BMM tensor (dimensions [B, M, N]) reshaped
+/// as [B/D1, D1, M, N]. Then perform permute([0, 3, 2, 1]) on the corresponding whole BMM tensor.
+template <int D1>
+class Tensor4DPermuteBMM0321ColumnMajor : public PermuteBase {
+private:
+  //
+  // Data members
+  //
+  /// Row extent of one batch matrix (extent.row())
+  Index D2_;
+  /// Stride applied to the permuted column index: stride * D1
+  Index stride_;
+  /// Offset between permuted batches; kept in 64 bits because stride_ * columns can exceed the int32 range
+  LongIndex batch_stride_;
+  
+public:
+  //
+  // Methods
+  //
+
+  /// Constructor from the extent of one batch matrix and `stride` (presumably its column stride before the permute)
+  CUTLASS_HOST_DEVICE
+  Tensor4DPermuteBMM0321ColumnMajor(MatrixCoord extent, Index stride) {
+
+    D2_ = extent.row();
+    Index D3 = extent.column();
+    // The batch stride is formed in 64 bits so that stride_ * D3 cannot overflow a 32-bit Index
+    stride_ = stride * D1;
+    batch_stride_ = LongIndex(stride_) * LongIndex(D3);
+  }
+
+  /// Constructor from a pitch-linear extent: contiguous becomes the row count, strided the column count
+  CUTLASS_HOST_DEVICE
+  Tensor4DPermuteBMM0321ColumnMajor(PitchLinearCoord extent, Index stride)
+  : Tensor4DPermuteBMM0321ColumnMajor(MatrixCoord(extent.contiguous(), extent.strided()), stride) {}
+  
+  /// Computes the offset after Permute Op in logical elements; the batch index is read from blockIdx.z
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(MatrixCoord coord) const {
+
+    Index BMM_batch_idx = blockIdx.z;
+    
+    // Batch index splits into (i, j); j selects a block of D2_ rows inside the permuted matrix of batch i
+    Index l = coord.column();
+    Index k = coord.row();
+    Index j = BMM_batch_idx % D1;
+    Index i = BMM_batch_idx / D1;
+
+    Index pbatch = i;
+    MatrixCoord pcoord{k + j * D2_, l};
+
+    return pbatch * batch_stride_ + pcoord.row() + pcoord.column() * LongIndex(stride_);
+  }
+
+  /// Pitch-linear overload: forwards to the MatrixCoord overload with (row, column) = (contiguous, strided)
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(PitchLinearCoord coord) const { 
+    return operator()(MatrixCoord(coord.contiguous(), coord.strided()));
+  }
+};
+
+template <int D1>
+class Tensor4DPermuteBMM0321ColumnMajorInverse : public PermuteBase {
+private:
+  // Inverse of Tensor4DPermuteBMM0321ColumnMajor<D1> (paired through the InversePermute trait).
+  // Data members
+  //
+  /// Row extent of one original batch matrix: extent.row() / D1
+  Index D2_;
+  /// Stride applied to the original column index: stride / D1
+  Index stride_;
+  /// Offset between original batches; kept in 64 bits because stride_ * columns can exceed the int32 range
+  LongIndex batch_stride_;
+  
+public:
+  //
+  // Methods
+  //
+
+  /// Constructor from the extent of one permuted batch matrix and its `stride`; rows must be a multiple of D1
+  CUTLASS_HOST_DEVICE
+  Tensor4DPermuteBMM0321ColumnMajorInverse(MatrixCoord extent, Index stride) {
+
+    assert(extent.row() % D1 == 0);
+
+    D2_ = extent.row() / D1;
+    Index D3 = extent.column();
+    // The batch stride is formed in 64 bits so that stride_ * D3 cannot overflow a 32-bit Index
+    stride_ = stride / D1;
+    batch_stride_ = LongIndex(stride_) * LongIndex(D3);
+  }
+
+  /// Constructor from a pitch-linear extent: contiguous becomes the row count, strided the column count
+  CUTLASS_HOST_DEVICE
+  Tensor4DPermuteBMM0321ColumnMajorInverse(PitchLinearCoord extent, Index stride)
+  : Tensor4DPermuteBMM0321ColumnMajorInverse(MatrixCoord(extent.contiguous(), extent.strided()), stride) {}
+  
+  /// Computes the offset after Permute Op in logical elements; the batch index is read from blockIdx.z
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(MatrixCoord coord) const {
+
+    Index BMM_batch_idx = blockIdx.z;
+    
+    // The following assumes grouping [(D0)->batch, (D1,D2)->row, (D3)->col]
+    Index l = coord.column();
+    Index k = coord.row() % D2_;
+    Index j = coord.row() / D2_;
+    Index i = BMM_batch_idx;
+    // Recover the original batch index from (i, j); (k, l) is the coordinate inside that batch
+    Index pbatch = i * D1 + j;
+    MatrixCoord pcoord{k, l};
+
+    return pbatch * batch_stride_ + pcoord.row() + pcoord.column() * LongIndex(stride_);
+  }
+
+  /// Pitch-linear overload: forwards to the MatrixCoord overload with (row, column) = (contiguous, strided)
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(PitchLinearCoord coord) const { 
+    return operator()(MatrixCoord(coord.contiguous(), coord.strided()));
+  }
+};
+
+template<int D1>
+struct InversePermute<Tensor4DPermuteBMM0321ColumnMajor<D1>> {
+  using type = Tensor4DPermuteBMM0321ColumnMajorInverse<D1>;  // forward permute -> its inverse
+};
+
+template<int D1>
+struct InversePermute<Tensor4DPermuteBMM0321ColumnMajorInverse<D1>> {
+  using type = Tensor4DPermuteBMM0321ColumnMajor<D1>;  // inverse permute -> the forward permute
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//  Tensor5DPermute20314
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Permute layout function for 5-D permuted tensors with output matrix (dimension as [M, N]) reshaped
+/// as [M/T1, T1, T2, T3, N/T2/T3]. Then perform permute([2, 0, 3, 1, 4]) on the corresponding output tensor.
+template <int T1, int T2, int T3>
+class Tensor5DPermute20314RowMajor : public PermuteBase {
+private:
+  //
+  // Data members
+  //
+  /// Outermost extent of the 5-D view: extent.row() / T1
+  Index T0_;
+  /// Innermost extent of the 5-D view: extent.column() / (T2 * T3)
+  Index T4_;
+  /// Stride applied to the permuted row index: stride / T2 * T1
+  Index stride_;
+  
+public:
+  //
+  // Methods
+  //
+
+  /// Constructor from the logical matrix extent and `stride` (presumably the row stride of the unpermuted matrix)
+  CUTLASS_HOST_DEVICE
+  Tensor5DPermute20314RowMajor(MatrixCoord extent, Index stride) {
+
+    assert(extent.row() % T1 == 0);
+    assert(extent.column() % (T2 * T3) == 0);
+
+    T0_ = extent.row() / T1;
+    T4_ = extent.column() / (T2 * T3);
+
+    /// Derive the permuted stride from the incoming stride (integer division by T2 happens first)
+    stride_ = stride / T2 * T1; // stride in Elements
+  }
+
+  /// Constructor from a pitch-linear extent: strided becomes the row count, contiguous the column count
+  CUTLASS_HOST_DEVICE
+  Tensor5DPermute20314RowMajor(PitchLinearCoord extent, Index stride)
+  : Tensor5DPermute20314RowMajor(MatrixCoord(extent.strided(), extent.contiguous()), stride) {}
+  
+  
+  /// Computes the offset after Permute Op in logical elements
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(MatrixCoord coord) const {
+
+    // Permute as torch.permute(X1, [2, 0, 3, 1, 4]) -> 5D Tensor indices as [i,j,k,l,m], the dimension of X
+    // is [T0, T1, T2, T3, T4], after permutation the dim of X1 is [T2, T0, T3, T1, T4].
+
+    Index m = coord.column() % T4_;
+    Index l = (coord.column() / T4_) % T3;
+    Index k = (coord.column() / T4_) / T3;
+    Index j = coord.row() % T1;
+    Index i = coord.row() / T1;
+    // Permuted matrix: rows are indexed by (k, i), columns by (l, j, m)
+    MatrixCoord permuted{i + k * T0_, m + j * T4_ + l * T1 * T4_};
+
+    return LongIndex(permuted.row()) * LongIndex(stride_) + LongIndex(permuted.column());
+  }
+
+  /// Pitch-linear overload: forwards to the MatrixCoord overload with (row, column) = (strided, contiguous)
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(PitchLinearCoord coord) const { 
+    return operator()(MatrixCoord(coord.strided(), coord.contiguous()));
+  }
+};
+
+/// Inverse for Tensor5DPermute20314 (could also be given a proper name, e.g. Tensor5DPermute13024).
+template <int T1, int T2, int T3>
+class Tensor5DPermute20314RowMajorInverse : public PermuteBase {
+private:
+  //
+  // Data members
+  //
+  /// extent.row() / T2, where extent is the extent of the permuted matrix
+  Index T0_;
+  /// extent.column() / (T1 * T3), where extent is the extent of the permuted matrix
+  Index T4_;
+
+  // Stride applied to the un-permuted row index, in units of elements: stride / T1 * T2
+  Index stride_;
+  
+public:
+  //
+  // Methods
+  //
+
+  /// Constructor from the extent of the permuted matrix and its `stride`
+  CUTLASS_HOST_DEVICE
+  Tensor5DPermute20314RowMajorInverse(MatrixCoord extent, Index stride) {
+
+    assert(extent.row() % T2 == 0);
+    assert(extent.column() % (T1 * T3) == 0);
+
+    T0_ = extent.row() / T2;
+    T4_ = extent.column() / (T1 * T3);
+
+    stride_ = stride / T1 * T2;
+  }
+
+  /// Constructor from a pitch-linear extent: strided becomes the row count, contiguous the column count
+  CUTLASS_HOST_DEVICE
+  Tensor5DPermute20314RowMajorInverse(PitchLinearCoord extent, Index stride)
+  : Tensor5DPermute20314RowMajorInverse(MatrixCoord(extent.strided(), extent.contiguous()), stride) {}
+
+  /// Computes the offset after the inverse of permute operation in logical elements
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(MatrixCoord coord) const {
+    // Decompose the permuted coordinate: columns carry (l, j, m), rows carry (k, i)
+    Index m = coord.column() % T4_;
+    Index j = (coord.column() / T4_) % T1;
+    Index l = (coord.column() / T4_) / T1;
+    Index i = coord.row() % T0_;
+    Index k = coord.row() / T0_;
+    // Re-assemble the original coordinate: rows indexed by (i, j), columns by (k, l, m)
+    MatrixCoord permuted{j + i * T1, m + l * T4_ + k * T3 * T4_};
+
+    return LongIndex(permuted.row()) * LongIndex(stride_) + LongIndex(permuted.column());
+  }
+
+  /// Pitch-linear overload: forwards to the MatrixCoord overload with (row, column) = (strided, contiguous)
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(PitchLinearCoord coord) const { 
+    return operator()(MatrixCoord(coord.strided(), coord.contiguous()));
+  }
+};
+
+template<int T1, int T2, int T3>
+struct InversePermute<Tensor5DPermute20314RowMajor<T1, T2, T3>> {
+  using type = Tensor5DPermute20314RowMajorInverse<T1, T2, T3>;  // forward permute -> its inverse
+};
+
+template<int T1, int T2, int T3>
+struct InversePermute<Tensor5DPermute20314RowMajorInverse<T1, T2, T3>> {
+  using type = Tensor5DPermute20314RowMajor<T1, T2, T3>;  // inverse permute -> the forward permute
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Tensor5DPermute02413
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Permute layout function for 5-D permuted tensors with matrix (dimensions [M, N]) reshaped
+/// as [M/T1, T1, T2, T3, N/T2/T3]. Then perform permute([0, 2, 4, 1, 3]) on the corresponding tensor.
+template <int T1, int T2, int T3>
+class Tensor5DPermute02413ColumnMajor : public PermuteBase {
+private:
+  //
+  // Data members
+  //
+  /// Extent derived from the rows: extent.row() / T1
+  Index T0_;
+  /// Extent derived from the columns: extent.column() / (T2 * T3)
+  Index T4_;
+  /// Stride applied to the permuted column index: stride / T1 * T2
+  Index stride_;
+  
+public:
+  //
+  // Methods
+  //
+
+  /// Constructor from the logical matrix extent and `stride` (presumably the column stride of the unpermuted matrix)
+  CUTLASS_HOST_DEVICE
+  Tensor5DPermute02413ColumnMajor(MatrixCoord extent, Index stride) {
+
+    assert(extent.row() % T1 == 0);
+    assert(extent.column() % (T2 * T3) == 0);
+
+    T0_ = extent.row() / T1;
+    T4_ = extent.column() / (T2 * T3);
+
+    /// Derive the permuted stride from the incoming stride (integer division by T1 happens first)
+    stride_ = stride / T1 * T2; // stride in Elements
+  }
+
+  /// Constructor from a pitch-linear extent: contiguous becomes the row count, strided the column count
+  CUTLASS_HOST_DEVICE
+  Tensor5DPermute02413ColumnMajor(PitchLinearCoord extent, Index stride)
+  : Tensor5DPermute02413ColumnMajor(MatrixCoord(extent.contiguous(), extent.strided()), stride) {}
+  
+  /// Computes the offset after Permute Op in logical elements
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(MatrixCoord coord) const {
+
+    // Permute as torch.permute(X1, [0, 2, 4, 1, 3]) -> 5D Tensor indices as [i,j,k,l,m], the dimension of X
+    // is [T0, T1, T2, T3, T4], after permutation the dim of X1 is [T0, T2, T4, T1, T3].
+
+    Index m = (coord.column() / T2) / T3;
+    Index l = (coord.column() / T2) % T3;
+    Index k = coord.column() % T2;
+    Index j = coord.row() / T0_;
+    Index i = coord.row() % T0_;
+    // Permuted matrix: rows are indexed by (k, i) with i fastest, columns by (l, j, m) with m fastest
+    MatrixCoord permuted{i + k * T0_, m + j * T4_ + l * T4_ * T1};
+
+    return LongIndex(permuted.row()) + LongIndex(permuted.column()) * LongIndex(stride_);
+  }
+
+  /// Pitch-linear overload: forwards to the MatrixCoord overload with (row, column) = (contiguous, strided)
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(PitchLinearCoord coord) const { 
+    return operator()(MatrixCoord(coord.contiguous(), coord.strided()));
+  }
+};
+
+/// Inverse for Tensor5DPermute02413ColumnMajor
+template <int T1, int T2, int T3>
+class Tensor5DPermute02413ColumnMajorInverse : public PermuteBase {
+private:
+  //
+  // Data members
+  //
+  /// extent.row() / T2, where extent is the extent of the permuted matrix
+  Index T0_;
+  /// extent.column() / (T1 * T3), where extent is the extent of the permuted matrix
+  Index T4_;
+
+  // Stride applied to the un-permuted column index, in units of elements: stride / T2 * T1
+  Index stride_;
+  
+public:
+  //
+  // Methods
+  //
+
+  /// Constructor from the extent of the permuted matrix and its `stride`
+  CUTLASS_HOST_DEVICE
+  Tensor5DPermute02413ColumnMajorInverse(MatrixCoord extent, Index stride) {
+
+    assert(extent.row() % T2 == 0);
+    assert(extent.column() % (T1 * T3) == 0);
+
+    T0_ = extent.row() / T2;
+    T4_ = extent.column() / (T1 * T3);
+
+    stride_ = stride / T2 * T1;
+  }
+
+  /// Constructor from a pitch-linear extent: contiguous becomes the row count, strided the column count
+  CUTLASS_HOST_DEVICE
+  Tensor5DPermute02413ColumnMajorInverse(PitchLinearCoord extent, Index stride)
+  : Tensor5DPermute02413ColumnMajorInverse(MatrixCoord(extent.contiguous(), extent.strided()), stride) {}
+
+  /// Computes the offset after the inverse of permute operation in logical elements
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(MatrixCoord coord) const {
+    // Decompose the permuted coordinate: columns carry (l, j, m), rows carry (k, i)
+    Index m = coord.column() % T4_;
+    Index j = (coord.column() / T4_) % T1;
+    Index l = (coord.column() / T4_) / T1;
+    Index i = coord.row() % T0_;
+    Index k = coord.row() / T0_;
+    // Re-assemble the original coordinate: rows indexed by (j, i) with i fastest, columns by (m, l, k) with k fastest
+    MatrixCoord permuted{i + j * T0_, k + l * T2 + m * T2 * T3};
+
+    return LongIndex(permuted.row()) + LongIndex(permuted.column()) * LongIndex(stride_);
+  }
+
+  /// Pitch-linear overload: forwards to the MatrixCoord overload with (row, column) = (contiguous, strided)
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(PitchLinearCoord coord) const { 
+    return operator()(MatrixCoord(coord.contiguous(), coord.strided()));
+  }
+};
+
+template<int T1, int T2, int T3>
+struct InversePermute<Tensor5DPermute02413ColumnMajor<T1, T2, T3>> {
+  using type = Tensor5DPermute02413ColumnMajorInverse<T1, T2, T3>;  // forward permute -> its inverse
+};
+
+template<int T1, int T2, int T3>
+struct InversePermute<Tensor5DPermute02413ColumnMajorInverse<T1, T2, T3>> {
+  using type = Tensor5DPermute02413ColumnMajor<T1, T2, T3>;  // inverse permute -> the forward permute
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace layout
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/layout/pitch_linear.h b/lightllm-kernel/cutlass/include/cutlass/layout/pitch_linear.h
new file mode 100755
index 000000000..8c9540f40
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/layout/pitch_linear.h
@@ -0,0 +1,149 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines layout functions used by TensorRef and derived classes for pitch-linear memory.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/coord.h"
+#include "cutlass/pitch_linear_coord.h"
+
+namespace cutlass {
+namespace layout {
+/// Aliases that make the pitch-linear shape and coordinate types nameable inside cutlass::layout
+template <int Contiguous, int Strided>
+  using PitchLinearShape = cutlass::PitchLinearShape < Contiguous, Strided >;
+  using PitchLinearCoord = PitchLinearCoord;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Mapping function for pitch-linear memory: offset = contiguous + strided * stride
+class PitchLinear {
+public:
+  /// Logical rank of tensor
+  static int const kRank = 2;
+
+  /// Rank of stride vector
+  static int const kStrideRank = 1;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate, with convention (contiguous, strided)
+  using TensorCoord = PitchLinearCoord;
+
+  /// Stride vector (a single LongIndex element)
+  using Stride = Coord<kStrideRank, LongIndex>;
+
+private:
+  //
+  // Data members
+  //
+
+  /// Stride data member
+  Stride stride_;
+
+public:
+  //
+  // Methods
+  //
+  
+  /// Constructor from a leading dimension, stored as the single stride element (defaults to 0)
+  CUTLASS_HOST_DEVICE
+  PitchLinear(LongIndex ldm = 0): stride_(ldm) { }
+
+  /// Constructor from a stride vector
+  CUTLASS_HOST_DEVICE
+  PitchLinear(Stride _stride): stride_(_stride) { }
+
+  /// Helper returns a layout to a tightly packed tensor (stride equals the contiguous extent)
+  CUTLASS_HOST_DEVICE
+  static PitchLinear packed(TensorCoord const &extent) {
+    return PitchLinear(extent.contiguous());
+  }
+
+  /// Returns the offset of a coordinate in linear memory; all terms are widened to LongIndex first.
+  /// Assumes coordinate has convention (contiguous, strided)
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+    return LongIndex(coord.contiguous()) + LongIndex(coord.strided()) * LongIndex(stride_[0]);
+  }
+
+  /// Returns the logical coordinate given an offset; divides by stride_[0], so the stride must be non-zero.
+  CUTLASS_HOST_DEVICE
+  TensorCoord inverse(LongIndex index) const {
+    return make_Coord(
+      TensorCoord::Index(index % stride_[0]),
+      TensorCoord::Index(index / stride_[0])
+    );
+  }
+
+  /// Returns a copy of the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return stride_;
+  }
+
+  /// Returns a mutable reference to the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return stride_;
+  }
+
+  /// Returns the stride element at index `rank` of the stride vector
+  CUTLASS_HOST_DEVICE
+  LongIndex stride(int rank) const {
+    return stride_[rank];
+  }
+
+  /// Returns a mutable reference to the stride element at index `rank`
+  CUTLASS_HOST_DEVICE
+  LongIndex & stride(int rank) {
+    return stride_[rank];
+  }
+
+  /// Compute the number of contiguous elements needed to store a tensor with the given size
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &extent) const {
+    return extent.strided() * stride_[0];
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace layout
+} // namespace cutlass
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/layout/tensor.h b/lightllm-kernel/cutlass/include/cutlass/layout/tensor.h
new file mode 100755
index 000000000..8374fe31d
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/layout/tensor.h
@@ -0,0 +1,648 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines layout functions used by TensorRef and derived classes for common 4-D and 5-D
+      tensor formats.
+
+    Layout functions map logical coordinates to linear memory. They often require additional
+    data to describe strides between elements.
+
+    Layout functions must implement all members in the public interface of IdentityTensorLayout<>
+    defined in cutlass/tensor_ref.h.
+*/
+#pragma once
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/cassert>
+#else
+#include "assert.h"
+#endif
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/coord.h"
+#include "cutlass/tensor_coord.h"
+
+namespace cutlass {
+namespace layout {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Defines data layouts of various tensor formats usable by TensorRef and other classes.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Tag (empty type, no mapping function) used for 3-D NWC tensors for 1-D convolutions; only used in 3.x API
+class TensorNWC {};
+
+/// Tags (empty types) used for n-D KCSRT tensors for n-D convolutions; only used in 3.x API for wgrad output layouts
+class TensorKCS {};
+class TensorKCSR {};
+class TensorKCSRT {};
+
+/// Tags (empty types) used for n-D CSRTK tensors for n-D convolutions; only used in 3.x API for wgrad output layouts
+class TensorCSK {};
+class TensorCSRK {};
+class TensorCSRTK {};
+
+/// Mapping function for 4-D NHWC tensors. Strides are 32-bit; offsets are computed in 64-bit LongIndex.
+class TensorNHWC {
+public:
+  /// Logical rank of tensor
+  static int const kRank = 4;
+
+  /// Rank of stride vector
+  static int const kStrideRank = 3;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate (n, h, w, c)
+  using TensorCoord = Tensor4DCoord;
+
+  /// Stride vector
+  using Stride = Coord<kStrideRank>;
+
+private:
+  //
+  // Data members
+  //
+
+  /// Stride data member - [stride_w, stride_h, stride_n]
+  Stride stride_;
+
+public:
+  //
+  // Methods
+  //
+
+  /// Constructor from a stride vector (defaults to all-zero strides)
+  CUTLASS_HOST_DEVICE
+  TensorNHWC(Stride const &stride = Stride(0)): stride_(stride) { }
+
+  /// Constructor from the three individual strides
+  CUTLASS_HOST_DEVICE
+  TensorNHWC(
+    typename Stride::Index stride_w,    ///< number of elements between adjacent W coordinates
+    typename Stride::Index stride_h,    ///< number of elements between adjacent H coordinates
+    typename Stride::Index stride_n     ///< number of elements between adjacent N coordinates
+  ):
+    stride_(make_Coord(stride_w, stride_h, stride_n)) { }
+
+  /// Constructor from 64-bit strides; each element is narrowed to Stride::Index
+  // Once convolutions implement 64b stride this ctor can be deleted
+  CUTLASS_HOST_DEVICE
+  TensorNHWC(Coord<kStrideRank, LongIndex> const &stride):
+    stride_(make_Coord(
+      static_cast<typename Stride::Index>(stride[0]),
+      static_cast<typename Stride::Index>(stride[1]),
+      static_cast<typename Stride::Index>(stride[2]))
+    ) { }
+
+  /// Helper returns a layout to a tightly packed NHWC tensor.
+  CUTLASS_HOST_DEVICE
+  static TensorNHWC packed(TensorCoord const &extent) {
+    return TensorNHWC(
+      make_Coord(
+        extent.c(),
+        extent.w() * extent.c(),
+        extent.h() * extent.w() * extent.c()
+      )
+    );
+  }
+
+  /// Returns the offset of (n, h, w, c) in linear memory; strides are widened to LongIndex before multiplying.
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+    return coord.c() +
+      LongIndex(stride_[0]) * coord.w() +
+      LongIndex(stride_[1]) * coord.h() +
+      LongIndex(stride_[2]) * coord.n();
+  }
+
+  /// Returns the offset of a pitchlinear coordinate in linear memory (strided index scaled by stride_[2]).
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(PitchLinearCoord coord) const {
+    return coord.contiguous() + LongIndex(coord.strided()) * stride_[2];
+  }
+
+  /// Returns the logical coordinate (n, h, w, c) from a given offset in linear memory.
+  CUTLASS_HOST_DEVICE
+  TensorCoord inverse(LongIndex index) const {
+
+    int n = 0, h = 0, w = 0, c = 0;
+
+    #if defined(__CUDA_ARCH__)
+    int tmp = 0;
+    c = int(index % static_cast<int>(stride_[0]));
+
+    unsigned int hw_mul, hw_shr, w_mul, w_shr, c_mul, c_shr;
+
+    find_divisor(hw_mul, hw_shr, stride_[2]);
+    find_divisor(w_mul, w_shr, stride_[1]);
+    find_divisor(c_mul, c_shr, stride_[0]);
+
+    fast_divmod(n, tmp, index, int(stride_[2]), hw_mul, hw_shr);
+    fast_divmod(h, w, tmp, int(stride_[1]), w_mul, w_shr);
+    fast_divmod(w, tmp, w, int(stride_[0]), c_mul, c_shr);
+    #else
+
+    n = int(index / stride_[2]);
+    LongIndex residual = index % stride_[2];
+
+    h = int(residual / stride_[1]);
+    residual = (residual % stride_[1]);
+
+    w = int(residual / stride_[0]);
+    c = int(residual % stride_[0]);
+
+    #endif
+    return TensorCoord(n, h, w, c);
+  }
+
+  /// Returns a copy of the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return stride_;
+  }
+
+  /// Returns a mutable reference to the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return stride_;
+  }
+
+  /// Compute the number of contiguous elements needed to store a tensor with the given size
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &extent) const {
+    // The extents must fit inside the strides, otherwise the capacity computed below is meaningless.
+    // Products are formed in LongIndex so neither the checks nor the result can overflow 32 bits.
+    // (These checks could be moved to debug-only code.)
+    if ((extent.c() > stride_[0])
+        || (LongIndex(extent.w()) * stride_[0] > stride_[1])
+        || (LongIndex(extent.h()) * stride_[1] > stride_[2])) {
+      assert(0);
+    }
+    return LongIndex(extent.n()) * stride_[2];
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Mapping function for 4-D NCHW tensors. Strides are 32-bit; offsets are computed in 64-bit LongIndex.
+class TensorNCHW {
+public:
+  /// Logical rank of tensor
+  static int const kRank = 4;
+
+  /// Rank of stride vector
+  static int const kStrideRank = 3;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate
+  using TensorCoord = Tensor4DCoord;
+
+  /// Stride vector
+  using Stride = Coord<kStrideRank>;
+
+private:
+  //
+  // Data members
+  //
+
+  /// Stride data member - [w, hw, chw]
+  Stride stride_;
+
+public:
+  //
+  // Methods
+  //
+
+  /// Constructor from a stride vector (defaults to all-zero strides)
+  CUTLASS_HOST_DEVICE
+  TensorNCHW(Stride const &stride = Stride(0)): stride_(stride) { }
+
+  /// Helper returns a layout to a tightly packed tensor
+  CUTLASS_HOST_DEVICE
+  static TensorNCHW packed(TensorCoord const &extent) {
+    return TensorNCHW(
+      make_Coord(
+        extent.w(),
+        extent.w() * extent.h(),
+        extent.h() * extent.w() * extent.c()
+      )
+    );
+  }
+
+  /// Returns the offset of a coordinate in linear memory; strides are widened to LongIndex before multiplying.
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+    return coord.w() +
+      LongIndex(stride_[0]) * coord.h() +
+      LongIndex(stride_[1]) * coord.c() +
+      LongIndex(stride_[2]) * coord.n();
+  }
+
+  /// Returns a copy of the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return stride_;
+  }
+
+  /// Returns a mutable reference to the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return stride_;
+  }
+
+  /// Compute the number of contiguous elements needed to store a tensor with the given size (64-bit product)
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &extent) const {
+    return LongIndex(extent.n()) * stride_[2];
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Mapping function for 4-D NC/xHWx tensors. Strides are 32-bit; offsets are computed in 64-bit LongIndex.
+template <int Interleave>
+class TensorNCxHWx {
+public:
+
+  /// Interleaving quantity
+  static int const kInterleave = Interleave;
+
+  /// Logical rank of tensor
+  static int const kRank = 4;
+
+  /// Rank of stride vector
+  static int const kStrideRank = 3;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate
+  using TensorCoord = Tensor4DCoord;
+
+  /// Stride vector
+  using Stride = Coord<kStrideRank>;
+
+private:
+  //
+  // Data members
+  //
+
+  /// Stride data member - [Interleave x w, Interleave x wh, hwc]
+  Stride stride_;
+
+public:
+  //
+  // Methods
+  //
+
+  /// Constructor from a stride vector (defaults to all-zero strides)
+  CUTLASS_HOST_DEVICE
+  TensorNCxHWx(Stride const &stride = Stride(0)): stride_(stride) { }
+
+  /// Constructor from the three individual strides (stored in this order as stride_[0..2])
+  CUTLASS_HOST_DEVICE
+  TensorNCxHWx(
+    typename Stride::Index stride_w,    ///< stored as stride_[0]; multiplies the H coordinate in operator()
+    typename Stride::Index stride_h,    ///< stored as stride_[1]; multiplies c / Interleave in operator()
+    typename Stride::Index stride_n     ///< stored as stride_[2]; multiplies the N coordinate in operator()
+  ):
+    stride_(make_Coord(stride_w, stride_h, stride_n)) { }
+
+  /// Constructor from 64-bit strides; each element is narrowed to Stride::Index
+  // Once convolutions implement 64b stride this ctor can be deleted
+  CUTLASS_HOST_DEVICE
+  TensorNCxHWx(Coord<kStrideRank, LongIndex> const &stride):
+    stride_(make_Coord(
+      static_cast<typename Stride::Index>(stride[0]),
+      static_cast<typename Stride::Index>(stride[1]),
+      static_cast<typename Stride::Index>(stride[2]))
+    ) { }
+
+  /// Helper returns a layout to a tightly packed tensor
+  CUTLASS_HOST_DEVICE
+  static TensorNCxHWx packed(TensorCoord const &extent) {
+    return TensorNCxHWx(
+      make_Coord(
+        kInterleave * extent.w(),
+        kInterleave * extent.w() * extent.h(),
+        extent.h() * extent.w() * extent.c()
+      )
+    );
+  }
+
+  /// Returns the offset of a coordinate in linear memory; factors are widened to LongIndex before multiplying.
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+    // Split the channel into its position inside an interleaved group and the group index.
+    Index c_minor = (coord.c() % kInterleave);
+    Index c_major = (coord.c() / kInterleave);
+
+    return c_minor +
+      LongIndex(kInterleave) * coord.w() +
+      LongIndex(stride_[0]) * coord.h() +
+      LongIndex(stride_[1]) * c_major +
+      LongIndex(stride_[2]) * coord.n();
+  }
+
+  /// Returns a copy of the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return stride_;
+  }
+
+  /// Returns a mutable reference to the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return stride_;
+  }
+
+  /// Compute the number of contiguous elements needed to store a tensor with the given size (64-bit product)
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &extent) const {
+    return LongIndex(extent.n()) * stride_[2];
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Mapping function for 4-D CxRSKx tensors. Strides are 32-bit; offsets are computed in 64-bit LongIndex.
+template <int Interleave>
+class TensorCxRSKx {
+public:
+
+  /// Interleaving quantity
+  static int const kInterleave = Interleave;
+
+  /// Logical rank of tensor
+  static int const kRank = 4;
+
+  /// Rank of stride vector
+  static int const kStrideRank = 3;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate
+  using TensorCoord = Tensor4DCoord;
+
+  /// Stride vector
+  using Stride = Coord<kStrideRank>;
+
+private:
+  //
+  // Data members
+  //
+
+  /// Stride data member - [Interleave x n, Interleave x nw, Interleave x nwh]
+  Stride stride_;
+
+public:
+  //
+  // Methods
+  //
+
+  /// Constructor from a stride vector (defaults to all-zero strides)
+  CUTLASS_HOST_DEVICE
+  TensorCxRSKx(Stride const &stride = Stride(0)): stride_(stride) { }
+
+  /// Constructor from the three individual strides (stored in this order as stride_[0..2])
+  CUTLASS_HOST_DEVICE
+  TensorCxRSKx(
+    typename Stride::Index stride_w,    ///< number of elements between adjacent W coordinates
+    typename Stride::Index stride_h,    ///< number of elements between adjacent H coordinates
+    typename Stride::Index stride_n     ///< stored as stride_[2]; multiplies c / Interleave in operator()
+  ):
+    stride_(make_Coord(stride_w, stride_h, stride_n)) { }
+
+  /// Constructor from 64-bit strides; each element is narrowed to Stride::Index
+  // Once convolutions implement 64b stride this ctor can be deleted
+  CUTLASS_HOST_DEVICE
+  TensorCxRSKx(Coord<kStrideRank, LongIndex> const &stride):
+    stride_(make_Coord(
+      static_cast<typename Stride::Index>(stride[0]),
+      static_cast<typename Stride::Index>(stride[1]),
+      static_cast<typename Stride::Index>(stride[2]))
+    ) { }
+
+
+  /// Helper returns a layout to a tightly packed tensor
+  CUTLASS_HOST_DEVICE
+  static TensorCxRSKx packed(TensorCoord const &extent) {
+    return TensorCxRSKx(
+      make_Coord(
+        kInterleave * extent.n(),
+        kInterleave * extent.n() * extent.w(),
+        kInterleave * extent.n() * extent.w() * extent.h()
+      )
+    );
+  }
+
+  /// Returns the offset of a coordinate in linear memory; factors are widened to LongIndex before multiplying.
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+    // Split the channel into its position inside an interleaved group and the group index.
+    Index c_minor = (coord.c() % kInterleave);
+    Index c_major = (coord.c() / kInterleave);
+
+    return c_minor +
+      LongIndex(kInterleave) * coord.n() +
+      LongIndex(stride_[0]) * coord.w() +
+      LongIndex(stride_[1]) * coord.h() +
+      LongIndex(stride_[2]) * c_major;
+  }
+
+  /// Returns the offset of a pitchlinear coordinate in linear memory (64-bit products).
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(PitchLinearCoord const &coord) const {
+    return (coord.contiguous() % kInterleave) +
+      LongIndex(coord.contiguous() / kInterleave) * stride_[2] +
+      LongIndex(coord.strided()) * kInterleave;
+  }
+
+  /// Returns a copy of the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return stride_;
+  }
+
+  /// Returns a mutable reference to the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return stride_;
+  }
+
+  /// Compute the number of contiguous elements needed to store a tensor with the given size (64-bit product)
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &extent) const {
+    return LongIndex(extent.c() / kInterleave) * stride_[2];
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Mapping function for 5-D NDHWC tensors. Strides are 32-bit; offsets are computed in 64-bit LongIndex.
+class TensorNDHWC {
+public:
+  /// Logical rank of tensor
+  static int const kRank = 5;
+
+  /// Rank of stride vector
+  static int const kStrideRank = 4;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate (n, d, h, w, c)
+  using TensorCoord = Tensor5DCoord;
+
+  /// Stride vector
+  using Stride = Coord<kStrideRank>;
+
+private:
+  //
+  // Data members
+  //
+
+  /// Stride data member - [c, wc, hwc, dhwc]
+  Stride stride_;
+
+public:
+  //
+  // Methods
+  //
+
+  /// Constructor from a stride vector (defaults to all-zero strides)
+  CUTLASS_HOST_DEVICE
+  TensorNDHWC(Stride const &stride = Stride(0)): stride_(stride) { }
+
+  /// Constructor from the four individual strides (stored in this order as stride_[0..3])
+  CUTLASS_HOST_DEVICE
+  TensorNDHWC(
+    typename Stride::Index c,
+    typename Stride::Index wc,
+    typename Stride::Index hwc,
+    typename Stride::Index dhwc):
+  stride_(make_Coord(c, wc, hwc, dhwc)) { }
+
+  /// Constructor from 64-bit strides; each element is narrowed to Stride::Index
+  // Once convolutions implement 64b stride this ctor can be deleted
+  CUTLASS_HOST_DEVICE
+  TensorNDHWC(Coord<kStrideRank, LongIndex> const &stride):
+    stride_(make_Coord(
+      static_cast<typename Stride::Index>(stride[0]),
+      static_cast<typename Stride::Index>(stride[1]),
+      static_cast<typename Stride::Index>(stride[2]),
+      static_cast<typename Stride::Index>(stride[3]))
+    ) { }
+
+  /// Helper returns a layout to a tightly packed NDHWC tensor.
+  CUTLASS_HOST_DEVICE
+  static TensorNDHWC packed(TensorCoord const &extent) {
+    return TensorNDHWC(
+      make_Coord(
+        extent.c(),
+        extent.w() * extent.c(),
+        extent.h() * extent.w() * extent.c(),
+        extent.d() * extent.h() * extent.w() * extent.c()
+      )
+    );
+  }
+
+  /// Returns the offset of (n, d, h, w, c) in linear memory; strides are widened to LongIndex before multiplying.
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+    return coord.c() +
+      LongIndex(stride_[0]) * coord.w() +
+      LongIndex(stride_[1]) * coord.h() +
+      LongIndex(stride_[2]) * coord.d() +
+      LongIndex(stride_[3]) * coord.n();
+  }
+
+  /// Returns the offset of a pitchlinear coordinate in linear memory (strided index scaled by stride_[3]).
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(PitchLinearCoord coord) const {
+    return coord.contiguous() + LongIndex(coord.strided()) * stride_[3];
+  }
+
+  /// Returns a copy of the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return stride_;
+  }
+
+  /// Returns a mutable reference to the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return stride_;
+  }
+
+  /// Compute the number of contiguous elements needed to store a tensor with the given size
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &extent) const {
+    // The extents must fit inside the strides, otherwise the capacity computed below is meaningless.
+    // Products are formed in LongIndex so neither the checks nor the result can overflow 32 bits.
+    // (These checks could be moved to debug-only code.)
+    if ((extent.c() > stride_[0])
+        || (LongIndex(extent.w()) * stride_[0] > stride_[1])
+        || (LongIndex(extent.h()) * stride_[1] > stride_[2])
+        || (LongIndex(extent.d()) * stride_[2] > stride_[3])) {
+      assert(0);
+    }
+    return LongIndex(extent.n()) * stride_[3];
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace layout
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/layout/tensor_op_multiplicand_sm70.h b/lightllm-kernel/cutlass/include/cutlass/layout/tensor_op_multiplicand_sm70.h
new file mode 100755
index 000000000..4691b9829
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/layout/tensor_op_multiplicand_sm70.h
@@ -0,0 +1,1044 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
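+    \brief Layout functions for Volta tensor-op multiplicand operands, defined in terms of pitch-linear memory.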
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/coord.h"
+#include "cutlass/layout/pitch_linear.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace layout {
+
+// template <
+//   int ElementSize,
+//   gemm::Operand Operand
+// >
+// struct VoltaTensorOpMultiplicandCongruous;
+
+// template <
+//   int ElementSize,
+//   gemm::Operand Operand
+// >
+// struct ColumnMajorVoltaTensorOpMultiplicandCongruous;
+// template <
+//   int ElementSize,
+//   gemm::Operand Operand
+// >
+// struct RowMajorVoltaTensorOpMultiplicandCongruous;
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Template based on element size (in bits) - defined in terms of pitch-linear memory.
+template <int ElementSize>
+struct VoltaTensorOpMultiplicandCongruous {
+
+  /// Logical rank of tensor
+  static int const kRank = 2;
+
+  /// Rank of stride vector
+  static int const kStrideRank = 1;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate, with convention (contiguous, strided)
+  using TensorCoord = PitchLinearCoord;
+
+  /// Stride vector
+  using Stride = Coord<kStrideRank, Index, LongIndex>;
+
+  //
+  // Invariants
+  //
+
+  /// This layout is optimized for 128b accesses
+  static int const kAccessSize = 128;
+
+  /// Fundamental tile shape in units of vectors
+  using TileShape = PitchLinearShape<8, 4>;
+
+  /// Fundamental partition shape in units of vectors
+  using PartitionShape = PitchLinearShape<8, 2>;
+
+  //
+  // Static constants
+  //
+  /// Element size in bits, and the number of elements covered by one kAccessSize-bit access
+  static int const kElementSize = ElementSize;
+  static int const kElementsPerAccess = kAccessSize / kElementSize;
+  /// Number of partitions per tile along each dimension (TileShape / PartitionShape)
+  using PartitionCount = PitchLinearShape<
+    TileShape::kContiguous / PartitionShape::kContiguous,
+    TileShape::kStrided / PartitionShape::kStrided
+  >;
+  /// Shape with the same extents as PartitionShape
+  using AccessCount = PitchLinearShape<
+    PartitionShape::kContiguous,
+    PartitionShape::kStrided
+  >;
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Stride data member
+  Stride stride_;
+
+public:
+  //
+  // Methods
+  //
+
+  /// Ctor from a leading dimension, stored as the single stride element (defaults to 0)
+  CUTLASS_HOST_DEVICE
+  VoltaTensorOpMultiplicandCongruous(Index ldm = 0): stride_(ldm) { }
+
+  /// Ctor from a stride vector
+  CUTLASS_HOST_DEVICE
+  VoltaTensorOpMultiplicandCongruous(Stride stride): stride_(stride) { }
+
+  /// Helper returns a layout to a tightly packed tensor (stride equals extent[0])
+  CUTLASS_HOST_DEVICE
+  static VoltaTensorOpMultiplicandCongruous packed(TensorCoord const &extent) {
+    return VoltaTensorOpMultiplicandCongruous(extent[0]);
+  }
+
+  /// Returns the offset of a coordinate in linear memory. 
+  /// Assumes coordinate has convention (contiguous, strided)
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+    
+    // First, compute c and s of vector within source (in units of vector accesses)
+    int vec_contiguous_idx = coord.contiguous() / kElementsPerAccess;
+    int vec_strided_idx = coord.strided();
+
+    // Compute the fundamental tile being accessed
+    int tile_contiguous_idx = vec_contiguous_idx / TileShape::kContiguous;
+    int tile_strided_idx = vec_strided_idx / TileShape::kStrided;
+    // Position of the vector inside that tile
+    int tile_contiguous_residual = vec_contiguous_idx % TileShape::kContiguous;
+    int tile_strided_residual = vec_strided_idx % TileShape::kStrided;
+
+    // Then swizzle in a tile
+    // Swizzle pattern is (tid[2:0] << 2)|(tid[4:3] ^ tid[2:1])
+    int permuted_strided_within_tile = (tile_contiguous_residual >> 1);
+    int permuted_contiguous_within_tile = (tile_strided_residual ^ permuted_strided_within_tile) |
+                                       ((tile_contiguous_residual & 1) << 2);
+    // Compute final element location
+    int element_contiguous = (tile_contiguous_idx * TileShape::kContiguous +
+        permuted_contiguous_within_tile) * kElementsPerAccess + (coord.contiguous() % kElementsPerAccess);
+
+    int element_strided = tile_strided_idx * TileShape::kStrided + permuted_strided_within_tile;
+    // NOTE(review): the product below is presumably formed in 32-bit arithmetic before widening - confirm range
+    return element_contiguous + element_strided * stride_[0];
+  }
+
+  /// Returns a copy of the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return stride_;
+  }
+
+  /// Returns a mutable reference to the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return stride_;
+  }
+
+  /// Compute the number of contiguous elements needed to store a tensor with the given size
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &extent) const {
+    return extent[1] * stride_[0];
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Column-major MatrixCoord adapter over VoltaTensorOpMultiplicandCongruous: (row, column) is forwarded as PitchLinearCoord(row, column)
+template <int ElementSize>
+struct ColumnMajorVoltaTensorOpMultiplicandCongruous {
+
+  /// Logical rank of tensor
+  static int const kRank = 2;
+
+  /// Rank of stride vector
+  static int const kStrideRank = 1;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate (matrix row/column rather than pitch-linear)
+  using TensorCoord = MatrixCoord;
+
+  /// Stride vector
+  using Stride = Coord<kStrideRank, Index, LongIndex>;
+
+  //
+  // Invariants
+  //
+
+  using Base = VoltaTensorOpMultiplicandCongruous<ElementSize>;
+
+  /// Access size, tile shape and partition shape are re-exported unchanged from the base layout
+  static int const kAccessSize = Base::kAccessSize;
+  using TileShape = typename Base::TileShape;
+  using PartitionShape = typename Base::PartitionShape;
+
+  //
+  // Static constants
+  //
+
+  static int const kElementSize = Base::kElementSize;
+  static int const kElementsPerAccess = Base::kElementsPerAccess;
+  using PartitionCount =  typename Base::PartitionCount;
+  using AccessCount = typename Base::AccessCount;
+
+private:
+
+  //
+  // Data members
+  //
+
+  Base layout_;
+
+public:
+  //
+  // Methods
+  //
+
+  /// Constructs the wrapped pitch-linear layout from a leading dimension
+  CUTLASS_HOST_DEVICE
+  ColumnMajorVoltaTensorOpMultiplicandCongruous(Index ldm = 0): layout_(ldm) { }
+
+  /// Constructs the wrapped pitch-linear layout from a stride vector
+  CUTLASS_HOST_DEVICE
+  ColumnMajorVoltaTensorOpMultiplicandCongruous(Stride stride): layout_(stride) { }
+
+  /// Helper returns a layout to a tightly packed tensor (leading dimension = extent.row())
+  CUTLASS_HOST_DEVICE
+  static ColumnMajorVoltaTensorOpMultiplicandCongruous packed(TensorCoord const &extent) {
+    return ColumnMajorVoltaTensorOpMultiplicandCongruous(extent.row());
+  }
+
+  /// Returns the offset of a (row, column) coordinate in linear memory.
+  /// The coordinate is handed to the base layout as PitchLinearCoord(row, column)
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+    return layout_(PitchLinearCoord(coord.row(), coord.column()));
+  }
+
+  /// Inverse of layout function, mapping linear offset to logical coordinate. NOTE(review): relies on Base::inverse() - confirm the base layout provides it
+  CUTLASS_HOST_DEVICE
+  TensorCoord inverse(LongIndex offset) const {
+    PitchLinearCoord coord = layout_.inverse(offset);
+    return MatrixCoord(coord.contiguous(), coord.strided());
+  }
+
+  /// Returns a copy of the stride of the wrapped layout
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return layout_.stride();
+  }
+
+  /// Returns a mutable reference to the stride of the wrapped layout
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return layout_.stride();
+  }
+
+  /// Compute the number of contiguous elements needed to store a tensor with the given size (delegates to the base layout)
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &extent) const {
+    return layout_.capacity(PitchLinearCoord(extent.row(), extent.column()));
+  }
+};
+
+/// Row-major MatrixCoord adapter over VoltaTensorOpMultiplicandCongruous: (row, column) is forwarded as PitchLinearCoord(column, row)
+template <int ElementSize>
+struct RowMajorVoltaTensorOpMultiplicandCongruous {
+
+  /// Logical rank of tensor
+  static int const kRank = 2;
+
+  /// Rank of stride vector
+  static int const kStrideRank = 1;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate (matrix row/column rather than pitch-linear)
+  using TensorCoord = MatrixCoord;
+
+  /// Stride vector
+  using Stride = Coord<kStrideRank, Index, LongIndex>;
+
+  //
+  // Invariants
+  //
+
+  using Base = VoltaTensorOpMultiplicandCongruous<ElementSize>;
+
+  /// Access size, tile shape and partition shape are re-exported unchanged from the base layout
+  static int const kAccessSize = Base::kAccessSize;
+  using TileShape = typename Base::TileShape;
+  using PartitionShape = typename Base::PartitionShape;
+
+  //
+  // Static constants
+  //
+
+  static int const kElementSize = Base::kElementSize;
+  static int const kElementsPerAccess = Base::kElementsPerAccess;
+  using PartitionCount =  typename Base::PartitionCount;
+  using AccessCount = typename Base::AccessCount;
+
+private:
+
+  //
+  // Data members
+  //
+
+  Base layout_;
+
+public:
+  //
+  // Methods
+  //
+
+  /// Constructs the wrapped pitch-linear layout from a leading dimension
+  CUTLASS_HOST_DEVICE
+  RowMajorVoltaTensorOpMultiplicandCongruous(Index ldm = 0): layout_(ldm) { }
+
+  /// Constructs the wrapped pitch-linear layout from a stride vector
+  CUTLASS_HOST_DEVICE
+  RowMajorVoltaTensorOpMultiplicandCongruous(Stride stride): layout_(stride) { }
+
+  /// Helper returns a layout to a tightly packed tensor (leading dimension = extent.column())
+  CUTLASS_HOST_DEVICE
+  static RowMajorVoltaTensorOpMultiplicandCongruous packed(TensorCoord const &extent) {
+    return RowMajorVoltaTensorOpMultiplicandCongruous(extent.column());
+  }
+
+  /// Returns the offset of a (row, column) coordinate in linear memory.
+  /// The coordinate is handed to the base layout as PitchLinearCoord(column, row)
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+    return layout_(PitchLinearCoord(coord.column(), coord.row()));
+  }
+
+  /// Inverse of layout function, mapping linear offset to logical coordinate. NOTE(review): relies on Base::inverse() - confirm the base layout provides it
+  CUTLASS_HOST_DEVICE
+  TensorCoord inverse(LongIndex offset) const {
+    PitchLinearCoord coord = layout_.inverse(offset);
+    return MatrixCoord(coord.strided(), coord.contiguous());
+  }
+
+  /// Returns a copy of the stride of the wrapped layout
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return layout_.stride();
+  }
+
+  /// Returns a mutable reference to the stride of the wrapped layout
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return layout_.stride();
+  }
+
+  /// Compute the number of contiguous elements needed to store a tensor with the given size (delegates to the base layout)
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &extent) const {
+    return layout_.capacity(PitchLinearCoord(extent.column(), extent.row()));
+  }
+};
+
+
+/// Swizzled layout templated on element size (in bits) - defined in terms of pitch-linear memory.
+// template <int ElementSize, Operand Operand>
+template <int ElementSize>
+struct VoltaTensorOpMultiplicandBCongruous {
+  /// Logical rank of tensor
+  static int const kRank = 2;
+
+  /// Rank of stride vector
+  static int const kStrideRank = 1;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate
+  using TensorCoord = PitchLinearCoord;
+
+  /// Stride vector
+  using Stride = Coord<kStrideRank, Index, LongIndex>;
+
+  //
+  // Invariants
+  //
+
+  /// This layout is optimized for 128b accesses
+  static int const kAccessSize = 128;
+
+  /// Fundamental tile shape in units of vectors
+  using TileShape = PitchLinearShape<8, 4>;
+
+  /// Fundamental partition shape in units of vectors
+  using PartitionShape = PitchLinearShape<4, 4>;
+
+  //
+  // Static constants
+  //
+
+  static int const kElementSize = ElementSize;
+  static int const kElementsPerAccess = kAccessSize / kElementSize;
+  
+  using PartitionCount = PitchLinearShape<
+    TileShape::kContiguous / PartitionShape::kContiguous,
+    TileShape::kStrided / PartitionShape::kStrided
+  >;
+
+  using AccessCount = PitchLinearShape<
+    PartitionShape::kContiguous,
+    PartitionShape::kStrided
+  >;
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Stride data member (single entry, multiplied by the strided element index in operator())
+  Stride stride_;
+
+public:
+  //
+  // Methods
+  //
+
+  /// Constructs the layout from a leading dimension, stored as the stride
+  CUTLASS_HOST_DEVICE
+  VoltaTensorOpMultiplicandBCongruous(Index ldm = 0): stride_(ldm) { }
+
+  /// Constructs the layout from a stride vector
+  CUTLASS_HOST_DEVICE
+  VoltaTensorOpMultiplicandBCongruous(Stride stride): stride_(stride) { }
+
+  /// Helper returns a layout to a tightly packed tensor (stride = extent[0])
+  CUTLASS_HOST_DEVICE
+  static VoltaTensorOpMultiplicandBCongruous packed(TensorCoord const &extent) {
+    return VoltaTensorOpMultiplicandBCongruous(extent[0]);
+  }
+
+  /// Returns the offset of a coordinate in linear memory after swizzling within an 8x4 tile of vectors.
+  /// Assumes coordinate has convention (contiguous, strided). NOTE(review): offset arithmetic is done in int before widening - confirm range
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+    
+    // First, compute c and s of vector within source (in units of vector accesses)
+    int vec_contiguous_idx = coord.contiguous() / kElementsPerAccess;
+    int vec_strided_idx = coord.strided();
+
+    // Compute the fundamental tile being accessed
+    int tile_contiguous_idx = vec_contiguous_idx / TileShape::kContiguous;
+    int tile_strided_idx = vec_strided_idx / TileShape::kStrided;
+
+    int tile_contiguous_residual = vec_contiguous_idx % TileShape::kContiguous;
+    int tile_strided_residual = vec_strided_idx % TileShape::kStrided;
+
+    // Then swizzle in a tile
+    // Swizzle pattern is (tid[1:0] << 3)|(tid & 0x4)|(tid[1:0])
+    int permuted_strided_within_tile = (tile_contiguous_residual & 0x3);
+    int permuted_contiguous_within_tile = (tile_strided_residual ^ permuted_strided_within_tile) |
+                                       (tile_contiguous_residual & 0x4);
+  
+    // Compute final element location (vector index scaled back to elements, plus the position inside the vector)
+    int element_contiguous = (tile_contiguous_idx * TileShape::kContiguous +
+        permuted_contiguous_within_tile) * kElementsPerAccess + (coord.contiguous() % kElementsPerAccess);
+
+    int element_strided = tile_strided_idx * TileShape::kStrided + permuted_strided_within_tile;
+
+    return element_contiguous + element_strided * stride_[0];
+  }
+
+  /// Returns a copy of the stride of the layout
+  CUTLASS_HOST_DEVICE 
+  Stride stride() const {
+    return stride_;
+  }
+
+  /// Returns a mutable reference to the stride of the layout
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return stride_;
+  }
+
+  /// Compute the number of contiguous elements needed to store a tensor with the given size (extent[1] * stride_[0]). NOTE(review): product looks 32-bit before widening - confirm
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &extent) const {
+    return extent[1] * stride_[0];
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Template mapping a column-major view of pitch-linear memory to VoltaTensorOpMultiplicandBCongruous
+template <int ElementSize>
+struct ColumnMajorVoltaTensorOpMultiplicandBCongruous {
+
+  /// Logical rank of tensor
+  static int const kRank = 2;
+
+  /// Rank of stride vector
+  static int const kStrideRank = 1;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate (matrix row/column rather than pitch-linear)
+  using TensorCoord = MatrixCoord;
+
+  /// Stride vector
+  using Stride = Coord<kStrideRank, Index, LongIndex>;
+
+  //
+  // Invariants
+  //
+
+  using Base = VoltaTensorOpMultiplicandBCongruous<ElementSize>;
+
+  /// Access size, tile shape and partition shape are re-exported unchanged from the base layout
+  static int const kAccessSize = Base::kAccessSize;
+  using TileShape = typename Base::TileShape;
+  using PartitionShape = typename Base::PartitionShape;
+
+  //
+  // Static constants
+  //
+
+  static int const kElementSize = Base::kElementSize;
+  static int const kElementsPerAccess = Base::kElementsPerAccess;
+  using PartitionCount =  typename Base::PartitionCount;
+  using AccessCount = typename Base::AccessCount;
+
+private:
+
+  //
+  // Data members
+  //
+
+  Base layout_;
+
+public:
+  //
+  // Methods
+  //
+
+  /// Constructs the wrapped pitch-linear layout from a leading dimension
+  CUTLASS_HOST_DEVICE
+  ColumnMajorVoltaTensorOpMultiplicandBCongruous(Index ldm = 0): layout_(ldm) { }
+
+  /// Constructs the wrapped pitch-linear layout from a stride vector
+  CUTLASS_HOST_DEVICE
+  ColumnMajorVoltaTensorOpMultiplicandBCongruous(Stride stride): layout_(stride) { }
+
+  /// Helper returns a layout to a tightly packed tensor (leading dimension = extent.row())
+  CUTLASS_HOST_DEVICE
+  static ColumnMajorVoltaTensorOpMultiplicandBCongruous packed(TensorCoord const &extent) {
+    return ColumnMajorVoltaTensorOpMultiplicandBCongruous(extent.row());
+  }
+
+  /// Returns the offset of a (row, column) coordinate in linear memory.
+  /// The coordinate is handed to the base layout as PitchLinearCoord(row, column)
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+    return layout_(PitchLinearCoord(coord.row(), coord.column()));
+  }
+
+  /// Inverse of layout function. NOTE(review): VoltaTensorOpMultiplicandBCongruous declares no inverse(), so this member cannot compile if it is ever instantiated
+  CUTLASS_HOST_DEVICE
+  TensorCoord inverse(LongIndex offset) const {
+    PitchLinearCoord coord = layout_.inverse(offset);
+    return MatrixCoord(coord.contiguous(), coord.strided());
+  }
+
+  /// Returns a copy of the stride of the wrapped layout
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return layout_.stride();
+  }
+
+  /// Returns a mutable reference to the stride of the wrapped layout
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return layout_.stride();
+  }
+
+  /// Compute the number of contiguous elements needed to store a tensor with the given size (delegates to the base layout)
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &extent) const {
+    return layout_.capacity(PitchLinearCoord(extent.row(), extent.column()));
+  }
+};
+
+/// Template mapping a row-major view of pitch-linear memory to VoltaTensorOpMultiplicandBCongruous
+template <int ElementSize>
+struct RowMajorVoltaTensorOpMultiplicandBCongruous {
+
+  /// Logical rank of tensor
+  static int const kRank = 2;
+
+  /// Rank of stride vector
+  static int const kStrideRank = 1;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate (matrix row/column rather than pitch-linear)
+  using TensorCoord = MatrixCoord;
+
+  /// Stride vector
+  using Stride = Coord<kStrideRank, Index, LongIndex>;
+
+  //
+  // Invariants
+  //
+
+  using Base = VoltaTensorOpMultiplicandBCongruous<ElementSize>;
+
+  /// Access size, tile shape and partition shape are re-exported unchanged from the base layout
+  static int const kAccessSize = Base::kAccessSize;
+  using TileShape = typename Base::TileShape;
+  using PartitionShape = typename Base::PartitionShape;
+
+  //
+  // Static constants
+  //
+
+  static int const kElementSize = Base::kElementSize;
+  static int const kElementsPerAccess = Base::kElementsPerAccess;
+  using PartitionCount =  typename Base::PartitionCount;
+  using AccessCount = typename Base::AccessCount;
+
+private:
+
+  //
+  // Data members
+  //
+
+  Base layout_;
+
+public:
+  //
+  // Methods
+  //
+
+  /// Constructs the wrapped pitch-linear layout from a leading dimension
+  CUTLASS_HOST_DEVICE
+  RowMajorVoltaTensorOpMultiplicandBCongruous(Index ldm = 0): layout_(ldm) { }
+
+  /// Constructs the wrapped pitch-linear layout from a stride vector
+  CUTLASS_HOST_DEVICE
+  RowMajorVoltaTensorOpMultiplicandBCongruous(Stride stride): layout_(stride) { }
+
+  /// Helper returns a layout to a tightly packed tensor (leading dimension = extent.column())
+  CUTLASS_HOST_DEVICE
+  static RowMajorVoltaTensorOpMultiplicandBCongruous packed(TensorCoord const &extent) {
+    return RowMajorVoltaTensorOpMultiplicandBCongruous(extent.column());
+  }
+
+  /// Returns the offset of a (row, column) coordinate in linear memory.
+  /// The coordinate is handed to the base layout as PitchLinearCoord(column, row)
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+    return layout_(PitchLinearCoord(coord.column(), coord.row()));
+  }
+
+  /// Inverse of layout function. NOTE(review): VoltaTensorOpMultiplicandBCongruous declares no inverse(), so this member cannot compile if it is ever instantiated
+  CUTLASS_HOST_DEVICE
+  TensorCoord inverse(LongIndex offset) const {
+    PitchLinearCoord coord = layout_.inverse(offset);
+    return MatrixCoord(coord.strided(), coord.contiguous());
+  }
+
+  /// Returns a copy of the stride of the wrapped layout
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return layout_.stride();
+  }
+
+  /// Returns a mutable reference to the stride of the wrapped layout
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return layout_.stride();
+  }
+
+  /// Compute the number of contiguous elements needed to store a tensor with the given size (delegates to the base layout)
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &extent) const {
+    return layout_.capacity(PitchLinearCoord(extent.column(), extent.row()));
+  }
+};
+
+/// Swizzled layout templated on element size (in bits) - defined in terms of
+/// pitch-linear memory and KBlock size (in elements).
+template <int ElementSize, int KBlock>
+struct VoltaTensorOpMultiplicandCrosswise {
+  /// Logical rank of tensor
+  static int const kRank = 2;
+
+  /// Rank of stride vector
+  static int const kStrideRank = 1;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate
+  using TensorCoord = PitchLinearCoord;
+
+  /// Stride vector
+  using Stride = Coord<kStrideRank, Index, LongIndex>;
+
+  //
+  // Invariants
+  //
+
+  /// This layout is optimized for 64b accesses
+  static int const kAccessSize = 64;
+
+  //
+  // Static constants
+  //
+
+  static int const kElementSize = ElementSize;
+  static int const kElementsPerAccess = kAccessSize / kElementSize;
+  static int const kKBlock = KBlock;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Stride data member. For GEMM, it equals KBlock x stage.
+  Stride stride_;
+ public:
+  //
+  // Methods
+  //
+
+  /// Constructs the layout from a leading dimension, stored as the stride
+  CUTLASS_HOST_DEVICE
+  VoltaTensorOpMultiplicandCrosswise(Index ldm = 0) : stride_(ldm) {}
+
+  /// Constructs the layout from a stride vector
+  CUTLASS_HOST_DEVICE
+  VoltaTensorOpMultiplicandCrosswise(Stride stride) : stride_(stride) {}
+
+  /// Helper returns a layout to a tightly packed tensor (stride = extent[1])
+  CUTLASS_HOST_DEVICE
+  static VoltaTensorOpMultiplicandCrosswise packed(TensorCoord const &extent) {
+    return VoltaTensorOpMultiplicandCrosswise(extent[1]);
+  }
+
+  /// Returns the offset of a coordinate in linear memory; the vector indices swap roles and the strided index is swizzled.
+  /// Assumes coordinate has convention (contiguous, strided). NOTE(review): offset arithmetic is done in int before widening - confirm range
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+
+    //
+    // First, compute c and s of vector within source (in units of vector
+    // accesses)
+    //
+    int vec_contiguous_idx = coord.contiguous() / kElementsPerAccess;
+    int vec_strided_idx = coord.strided();
+
+    //
+    // Then swizzle
+    // The mapping is like this:
+    // id[1:0]|(id[3]^id[4])|id[2]
+
+    int vec_strided_within_tile = vec_contiguous_idx & 0x7;
+    int permuted_vec_contiguous =
+        (vec_strided_idx & (~0xF)) + (vec_strided_idx & 0x3) * 4 +
+        (((vec_strided_idx >> 2) ^ ((vec_strided_idx & 0x10) >> 3)) & 0x3);
+
+    permuted_vec_contiguous ^= ((vec_strided_within_tile >> 1) & 0x3);
+
+    int permuted_vec_strided = vec_contiguous_idx;
+
+    //
+    // Compute final element location (the stride is scaled by kElementsPerAccess)
+    //
+
+    int element_contiguous = permuted_vec_contiguous *  kElementsPerAccess + 
+                             (coord.contiguous() % kElementsPerAccess);
+    
+    return element_contiguous + permuted_vec_strided * (stride_[0] * kElementsPerAccess);
+  }
+
+  /// Returns a copy of the stride of the layout
+  CUTLASS_HOST_DEVICE
+  Stride stride() const { return stride_; }
+
+  /// Returns a mutable reference to the stride of the layout
+  CUTLASS_HOST_DEVICE
+  Stride &stride() { return stride_; }
+
+  /// Compute the number of contiguous elements needed to store a tensor with
+  /// the given size (extent[0] * stride_[0]). NOTE(review): product looks 32-bit before widening - confirm
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &extent) const {
+    return extent[0] * stride_[0];
+  }
+};
+
+/// Template mapping a column-major view of pitch-linear memory to
+/// VoltaTensorOpMultiplicandCrosswise; (row, column) is forwarded as PitchLinearCoord(row, column)
+template <int ElementSize, int KBlock>
+struct ColumnMajorVoltaTensorOpMultiplicandCrosswise {
+  /// Logical rank of tensor
+  static int const kRank = 2;
+
+  /// Rank of stride vector
+  static int const kStrideRank = 1;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate (matrix row/column rather than pitch-linear)
+  using TensorCoord = MatrixCoord;
+
+  /// Stride vector
+  using Stride = Coord<kStrideRank, Index, LongIndex>;
+
+  //
+  // Invariants
+  //
+
+  using Base = VoltaTensorOpMultiplicandCrosswise<ElementSize, KBlock>;
+
+  /// Access size re-exported from the base layout (64b there)
+  static int const kAccessSize = Base::kAccessSize;
+
+  //
+  // Static constants
+  //
+
+  static int const kElementSize = Base::kElementSize;
+  static int const kElementsPerAccess = Base::kElementsPerAccess;
+
+ private:
+  //
+  // Data members
+  //
+
+  Base layout_;
+
+ public:
+  //
+  // Methods
+  //
+
+  /// Constructs the wrapped pitch-linear layout from a leading dimension
+  CUTLASS_HOST_DEVICE
+  ColumnMajorVoltaTensorOpMultiplicandCrosswise(Index ldm = 0) : layout_(ldm) {}
+
+  /// Constructs the wrapped pitch-linear layout from a stride vector
+  CUTLASS_HOST_DEVICE
+  ColumnMajorVoltaTensorOpMultiplicandCrosswise(Stride stride) : layout_(stride) {}
+
+  /// Helper returns a layout to a tightly packed tensor (leading dimension = extent.column())
+  CUTLASS_HOST_DEVICE
+  static ColumnMajorVoltaTensorOpMultiplicandCrosswise packed(
+      TensorCoord const &extent) {
+    return ColumnMajorVoltaTensorOpMultiplicandCrosswise(extent.column());
+  }
+
+  /// Returns the offset of a (row, column) coordinate in linear memory.
+  /// The coordinate is handed to the base layout as PitchLinearCoord(row, column)
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+    return layout_(PitchLinearCoord(coord.row(), coord.column()));
+  }
+
+  /// Inverse of layout function. NOTE(review): VoltaTensorOpMultiplicandCrosswise declares no inverse(), so this member cannot compile if it is ever instantiated
+  CUTLASS_HOST_DEVICE
+  TensorCoord inverse(LongIndex offset) const {
+    PitchLinearCoord coord = layout_.inverse(offset);
+    return MatrixCoord(coord.contiguous(), coord.strided());
+  }
+
+  /// Returns a copy of the stride of the wrapped layout
+  CUTLASS_HOST_DEVICE
+  Stride stride() const { return layout_.stride(); }
+
+  /// Returns a mutable reference to the stride of the wrapped layout
+  CUTLASS_HOST_DEVICE
+  Stride &stride() { return layout_.stride(); }
+
+  /// Compute the number of contiguous elements needed to store a tensor with
+  /// the given size (delegates to the base layout)
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &extent) const {
+    return layout_.capacity(PitchLinearCoord(extent.row(), extent.column()));
+  }
+};
+
+/// Template mapping a row-major view of pitch-linear memory to
+/// VoltaTensorOpMultiplicandCrosswise; (row, column) is forwarded as PitchLinearCoord(column, row)
+template <int ElementSize, int KBlock>
+struct RowMajorVoltaTensorOpMultiplicandCrosswise {
+  /// Logical rank of tensor
+  static int const kRank = 2;
+
+  /// Rank of stride vector
+  static int const kStrideRank = 1;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate (matrix row/column rather than pitch-linear)
+  using TensorCoord = MatrixCoord;
+
+  /// Stride vector
+  using Stride = Coord<kStrideRank, Index, LongIndex>;
+
+  //
+  // Invariants
+  //
+
+  using Base = VoltaTensorOpMultiplicandCrosswise<ElementSize, KBlock>;
+
+  /// Access size re-exported from the base layout (64b there)
+  static int const kAccessSize = Base::kAccessSize;
+
+  //
+  // Static constants
+  //
+
+  static int const kElementSize = Base::kElementSize;
+  static int const kElementsPerAccess = Base::kElementsPerAccess;
+
+ private:
+  //
+  // Data members
+  //
+
+  Base layout_;
+
+ public:
+  //
+  // Methods
+  //
+
+  /// Constructs the wrapped pitch-linear layout from a leading dimension
+  CUTLASS_HOST_DEVICE
+  RowMajorVoltaTensorOpMultiplicandCrosswise(Index ldm = 0) : layout_(ldm) {}
+
+  /// Constructs the wrapped pitch-linear layout from a stride vector
+  CUTLASS_HOST_DEVICE
+  RowMajorVoltaTensorOpMultiplicandCrosswise(Stride stride) : layout_(stride) {}
+
+  /// Helper returns a layout to a tightly packed tensor (leading dimension = extent.row())
+  CUTLASS_HOST_DEVICE
+  static RowMajorVoltaTensorOpMultiplicandCrosswise packed(
+      TensorCoord const &extent) {
+    return RowMajorVoltaTensorOpMultiplicandCrosswise(extent.row());
+  }
+
+  /// Returns the offset of a (row, column) coordinate in linear memory.
+  /// The coordinate is handed to the base layout as PitchLinearCoord(column, row)
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+    return layout_(PitchLinearCoord(coord.column(), coord.row()));
+  }
+
+  /// Inverse of layout function. NOTE(review): VoltaTensorOpMultiplicandCrosswise declares no inverse(), so this member cannot compile if it is ever instantiated
+  CUTLASS_HOST_DEVICE
+  TensorCoord inverse(LongIndex offset) const {
+    PitchLinearCoord coord = layout_.inverse(offset);
+    return MatrixCoord(coord.strided(), coord.contiguous());
+  }
+
+  /// Returns a copy of the stride of the wrapped layout
+  CUTLASS_HOST_DEVICE
+  Stride stride() const { return layout_.stride(); }
+
+  /// Returns a mutable reference to the stride of the wrapped layout
+  CUTLASS_HOST_DEVICE
+  Stride &stride() { return layout_.stride(); }
+
+  /// Compute the number of contiguous elements needed to store a tensor with
+  /// the given size (delegates to the base layout)
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &extent) const {
+    return layout_.capacity(PitchLinearCoord(extent.column(), extent.row()));
+  }
+};
+
+} // namespace layout
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/layout/tensor_op_multiplicand_sm75.h b/lightllm-kernel/cutlass/include/cutlass/layout/tensor_op_multiplicand_sm75.h
new file mode 100755
index 000000000..1cda44286
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/layout/tensor_op_multiplicand_sm75.h
@@ -0,0 +1,1169 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Swizzled layout functions for tensor-op multiplicands, defined over pitch-linear coordinates.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/coord.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/layout/pitch_linear.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace layout {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Template based on element size (in bits) - defined in terms of pitch-linear
+/// memory and Crosswise size (in elements).
+/// This one is the base class of all Ampere/Turing fp16/bf16/int8/int4/int1
+/// tensor core kernels.  tf32 TN uses this too.
+template <int ElementSize, int Crosswise>
+struct TensorOpMultiplicand {
+  /// Logical rank of tensor
+  static int const kRank = 2;
+
+  /// Rank of stride vector
+  static int const kStrideRank = 1;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate
+  using TensorCoord = PitchLinearCoord;
+
+  /// Stride vector
+  using Stride = Coord<kStrideRank, Index, LongIndex>;
+
+  //
+  // Static constants
+  //
+
+  /// This layout is optimized for 128b accesses
+  static int const kAccessSize = 128;
+
+  static int const kElementSize = ElementSize;
+  static int const kElementsPerAccess = kAccessSize / kElementSize;
+  static int const kCrosswise = Crosswise;
+
+  /// Contiguous dimension of the tile shape matches one shared memory cache
+  /// line - 128B.  For 128-bit access size, it equals 8 accesses.
+  static int const kTileShapeContiguous = 128 / (kAccessSize / 8);
+
+  /// Number of kblocks to store PartitionShape::kContiguous Elements
+  static int const kFactor =
+      kTileShapeContiguous * kElementsPerAccess / kCrosswise;
+
+  static_assert(
+      (kFactor > 0),
+      "kCrosswise should be no large than one shared memory cache line.");
+
+  /// The strided dimension needs to be at least (WarpSize(32) /
+  /// kTileShapeContiguous) for a warp to access.  To ensure conflict-free
+  /// access, it also needs to be at least (kTileShapeContiguous / kFactor).
+  /// The value below is the larger of those two quantities.
+  static int const kTileShapeStride =
+      ((kTileShapeContiguous / kFactor) > (32 / kTileShapeContiguous))
+          ? (kTileShapeContiguous / kFactor)
+          : (32 / kTileShapeContiguous);
+
+  /// Fundamental tile shape in units of vectors to guarantee bank conflict free
+  /// shared memory load/store.
+  /// For kFactor = 1, TileShape = <8, 8>
+  /// For kFactor > 1, TileShape = <8, 4>
+  using TileShape = PitchLinearShape<kTileShapeContiguous, kTileShapeStride>;
+
+  /// Fundamental partition shape in units of vectors
+  using PartitionShape = PitchLinearShape<4, 4>;
+
+  using PartitionCount =
+      PitchLinearShape<TileShape::kContiguous / PartitionShape::kContiguous,
+                       TileShape::kStrided / PartitionShape::kStrided>;
+
+  using AccessCount =
+      PitchLinearShape<PartitionShape::kContiguous, PartitionShape::kStrided>;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Stride data member. For GEMM, it equals kCrosswise x stage.
+  Stride stride_;
+
+ public:
+  //
+  // Methods
+  //
+
+  /// Constructs the layout from a leading dimension, stored as the stride
+  CUTLASS_HOST_DEVICE
+  TensorOpMultiplicand(Index ldm = 0) : stride_(ldm) {}
+
+  /// Constructs the layout from a stride vector
+  CUTLASS_HOST_DEVICE
+  TensorOpMultiplicand(Stride stride) : stride_(stride) {}
+
+  /// Helper returns a layout to a tightly packed tensor (stride = extent[0])
+  CUTLASS_HOST_DEVICE
+  static TensorOpMultiplicand packed(TensorCoord const &extent) {
+    return TensorOpMultiplicand(extent[0]);
+  }
+
+  /// Returns the offset of a coordinate in linear memory after the XOR swizzle below.
+  /// Assumes coordinate has convention (contiguous, strided). NOTE(review): offset arithmetic is done in int before widening - confirm range
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+    //
+    // First, compute c and s of vector within source (in units of vector
+    // accesses); kFactor consecutive strided rows share one vec_strided_idx
+    //
+
+    int vec_contiguous_idx = coord.contiguous() / kElementsPerAccess;
+    int vec_strided_idx = coord.strided() / kFactor;
+
+    // Compute the fundamental tile being accessed
+    int tile_contiguous_idx =
+        vec_contiguous_idx / (TileShape::kContiguous / kFactor);
+
+    int tile_contiguous_residual =
+        vec_contiguous_idx % (TileShape::kContiguous / kFactor) +
+        ((coord.strided() % kFactor) * (TileShape::kContiguous / kFactor));
+    int tile_strided_residual = vec_strided_idx % TileShape::kStrided;
+
+    // Compute the 'partition' within the fundamental tile
+    int partition_contiguous_idx =
+        tile_contiguous_residual / PartitionShape::kContiguous;
+    int partition_strided_idx =
+        tile_strided_residual / PartitionShape::kStrided;
+
+    int partition_contiguous_residual =
+        tile_contiguous_residual % PartitionShape::kContiguous;
+    int partition_strided_residual =
+        tile_strided_residual % PartitionShape::kStrided;
+
+    //
+    // Then swizzle: XOR contiguous indices with strided indices at vector and partition level
+    //
+
+    int permuted_vec_contiguous_within_partition =
+        partition_contiguous_residual ^ (partition_strided_residual % 4);
+
+    int permuted_partition_contiguous_within_tile =
+        partition_contiguous_idx ^ (partition_strided_idx % 2);
+
+    //
+    // Compute final element location
+    //
+
+    int element_contiguous = (tile_contiguous_idx * TileShape::kContiguous +
+                              permuted_partition_contiguous_within_tile *
+                                  PartitionShape::kContiguous +
+                              permuted_vec_contiguous_within_partition) *
+                                 kElementsPerAccess +
+                             (coord.contiguous() % kElementsPerAccess);
+
+    int element_strided = vec_strided_idx;
+
+    return element_contiguous + element_strided * stride_[0] * kFactor;
+  }
+
+  /// Returns a copy of the stride of the layout
+  CUTLASS_HOST_DEVICE
+  Stride stride() const { return stride_; }
+
+  /// Returns a mutable reference to the stride of the layout
+  CUTLASS_HOST_DEVICE
+  Stride &stride() { return stride_; }
+
+  /// Compute the number of contiguous elements needed to store a tensor with
+  /// the given size (extent[1] * stride_[0]). NOTE(review): product looks 32-bit before widening - confirm
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &extent) const {
+    return extent[1] * stride_[0];
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Congruous tensor-op multiplicand layout over pitch-linear coordinates. Thin wrapper
+/// that forwards every operation to TensorOpMultiplicand<ElementSize, Crosswise>.
+template <int ElementSize, int Crosswise>
+struct TensorOpMultiplicandCongruous {
+  /// Logical rank of tensor
+  static int const kRank = 2;
+
+  /// Rank of stride vector
+  static int const kStrideRank = 1;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate
+  using TensorCoord = PitchLinearCoord;
+
+  /// Stride vector
+  using Stride = Coord<kStrideRank, Index, LongIndex>;
+
+  //
+  // Invariants
+  //
+
+  using Base = TensorOpMultiplicand<ElementSize, Crosswise>;
+
+  /// Access size, tile shape and partition shape are re-exported from the base layout
+  static int const kAccessSize = Base::kAccessSize;
+  using TileShape = typename Base::TileShape;
+  using PartitionShape = typename Base::PartitionShape;
+
+  //
+  // Static constants (all re-exported from the base layout)
+  //
+
+  static int const kElementSize = Base::kElementSize;
+  static int const kElementsPerAccess = Base::kElementsPerAccess;
+  static int const kCrosswise = Base::kCrosswise;
+  static int const kFactor = Base::kFactor;
+  using PartitionCount =  typename Base::PartitionCount;
+  using AccessCount = typename Base::AccessCount;
+
+ private:
+  //
+  // Data members
+  //
+
+  Base layout_;  // wrapped base layout; every method below delegates to it
+
+ public:
+  //
+  // Methods
+  //
+
+  /// Constructs the wrapped base layout from a leading dimension (defaults to 0)
+  CUTLASS_HOST_DEVICE
+  TensorOpMultiplicandCongruous(Index ldm = 0) : layout_(ldm) {}
+
+  /// Constructs the wrapped base layout from a stride vector
+  CUTLASS_HOST_DEVICE
+  TensorOpMultiplicandCongruous(Stride stride) : layout_(stride) {}
+
+  /// Returns a layout for a tightly packed tensor: the leading dimension is extent[0]
+  CUTLASS_HOST_DEVICE
+  static TensorOpMultiplicandCongruous packed(TensorCoord const &extent) {
+    return TensorOpMultiplicandCongruous(extent[0]);
+  }
+
+  /// Returns the offset of a coordinate in linear memory, as computed by the base layout.
+  /// Assumes coordinate has convention (contiguous, strided)
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+    return layout_(coord);
+  }
+
+  /// Inverse of layout function, mapping linear offset to logical coordinate (delegates to the base layout)
+  CUTLASS_HOST_DEVICE
+  TensorCoord inverse(LongIndex offset) const {
+    PitchLinearCoord coord = layout_.inverse(offset);
+    return coord;
+  }
+
+  /// Returns a copy of the base layout's stride vector
+  CUTLASS_HOST_DEVICE
+  Stride stride() const { return layout_.stride(); }
+
+  /// Returns a mutable reference to the base layout's stride vector
+  CUTLASS_HOST_DEVICE
+  Stride &stride() { return layout_.stride(); }
+
+  /// Returns the number of contiguous elements needed to store a tensor with
+  /// the given extent, as reported by the base layout
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &extent) const {
+    return layout_.capacity(extent);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization of TensorOpMultiplicandCongruous for 32-bit elements. It computes
+/// offsets itself instead of wrapping TensorOpMultiplicand, and it does not define inverse().
+/// This one is just for TF32 NT kernel.
+template <int Crosswise>
+struct TensorOpMultiplicandCongruous<32, Crosswise> {
+  /// Logical rank of tensor
+  static int const kRank = 2;
+
+  /// Rank of stride vector
+  static int const kStrideRank = 1;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate
+  using TensorCoord = PitchLinearCoord;
+
+  /// Stride vector
+  using Stride = Coord<kStrideRank, Index, LongIndex>;
+
+  //
+  // Invariants
+  //
+
+  /// This layout is optimized for 128b accesses
+  static int const kAccessSize = 128;
+
+  /// Fundamental tile shape in units of vectors
+  using TileShape = PitchLinearShape<8, 4>;
+
+  /// PartitionShape is the same as TileShape for this layout
+  using PartitionShape = PitchLinearShape<8, 4>;
+
+  using PartitionCount =
+      PitchLinearShape<TileShape::kContiguous / PartitionShape::kContiguous,
+                       TileShape::kStrided / PartitionShape::kStrided>;
+
+  using AccessCount =
+      PitchLinearShape<PartitionShape::kContiguous, PartitionShape::kStrided>;
+
+  //
+  // Static constants (kElementsPerAccess evaluates to 128 / 32 = 4)
+  //
+  static int const kElementSize = 32;
+  static int const kElementsPerAccess = kAccessSize / kElementSize;
+  static int const kCrosswise = Crosswise;
+  static int const kFactor = 1;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Stride data member.
+  Stride stride_;
+
+ public:
+  //
+  // Methods
+  //
+
+  /// Constructs the layout from a leading dimension (defaults to 0)
+  CUTLASS_HOST_DEVICE
+  TensorOpMultiplicandCongruous(Index ldm = 0) : stride_(ldm) {}
+
+  /// Constructs the layout from a stride vector
+  CUTLASS_HOST_DEVICE
+  TensorOpMultiplicandCongruous(Stride stride) : stride_(stride) {}
+
+  /// Returns a layout for a tightly packed tensor: the leading dimension is extent[0]
+  CUTLASS_HOST_DEVICE
+  static TensorOpMultiplicandCongruous packed(TensorCoord const &extent) {
+    return TensorOpMultiplicandCongruous(extent[0]);
+  }
+
+  /// Returns the offset of a coordinate in linear memory. Within each 32 x 4 element tile the vector
+  /// index c is XORed with 2 * (strided % 4). Assumes coordinate has convention (contiguous, strided)
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+    int tc = coord.contiguous() / 32;  // tile index along the contiguous dimension
+    int ts = coord.strided() / 4;  // tile index along the strided dimension
+
+    int c = (coord.contiguous() % 32) / kElementsPerAccess;  // vector index within the tile
+    int s = coord.strided() % 4;  // strided residual within the tile
+
+    LongIndex offset = (c ^ (2 * s)) * kElementsPerAccess + s * stride_[0] +
+                       tc * 32 + ts * stride_[0] * 4 + coord.contiguous() % 4;
+
+    return offset;
+  }
+
+  /// Returns a copy of the layout's stride vector
+  CUTLASS_HOST_DEVICE
+  Stride stride() const { return stride_; }
+
+  /// Returns a mutable reference to the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride &stride() { return stride_; }
+
+  /// Returns the number of contiguous elements needed to store a tensor with
+  /// the given extent, computed as extent[1] * stride_[0]
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &extent) const {
+    return extent[1] * stride_[0];
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Template mapping a column-major matrix view (row -> contiguous, column -> strided)
+/// onto the pitch-linear TensorOpMultiplicandCongruous layout
+template <int ElementSize, int Crosswise>
+struct ColumnMajorTensorOpMultiplicandCongruous {
+
+  /// Logical rank of tensor
+  static int const kRank = 2;
+
+  /// Rank of stride vector
+  static int const kStrideRank = 1;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate
+  using TensorCoord = MatrixCoord;
+
+  /// Stride vector
+  using Stride = Coord<kStrideRank, Index, LongIndex>;
+
+  //
+  // Invariants
+  //
+
+  using Base = TensorOpMultiplicandCongruous<ElementSize, Crosswise>;
+
+  /// Access size, tile shape and partition shape are re-exported from the base layout
+  static int const kAccessSize = Base::kAccessSize;
+  using TileShape = typename Base::TileShape;
+  using PartitionShape = typename Base::PartitionShape;
+
+  //
+  // Static constants (all re-exported from the base layout)
+  //
+
+  static int const kElementSize = Base::kElementSize;
+  static int const kElementsPerAccess = Base::kElementsPerAccess;
+  static int const kCrosswise = Base::kCrosswise;
+  static int const kFactor = Base::kFactor;
+  using PartitionCount =  typename Base::PartitionCount;
+  using AccessCount = typename Base::AccessCount;
+
+private:
+
+  //
+  // Data members
+  //
+
+  Base layout_;  // wrapped pitch-linear layout; every method below delegates to it
+
+public:
+  //
+  // Methods
+  //
+
+  /// Constructs the wrapped pitch-linear layout from a leading dimension (defaults to 0)
+  CUTLASS_HOST_DEVICE
+  ColumnMajorTensorOpMultiplicandCongruous(Index ldm = 0): layout_(ldm) { }
+
+  /// Constructs the wrapped pitch-linear layout from a stride vector
+  CUTLASS_HOST_DEVICE
+  ColumnMajorTensorOpMultiplicandCongruous(Stride stride): layout_(stride) { }
+
+  /// Returns a layout for a tightly packed tensor: the leading dimension is extent.row()
+  CUTLASS_HOST_DEVICE
+  static ColumnMajorTensorOpMultiplicandCongruous packed(TensorCoord const &extent) {
+    return ColumnMajorTensorOpMultiplicandCongruous(extent.row());
+  }
+
+  /// Returns the offset of a matrix coordinate in linear memory.
+  /// The row is passed as the contiguous index and the column as the strided index
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+    return layout_(PitchLinearCoord(coord.row(), coord.column()));
+  }
+
+  /// Inverse of layout function: maps a linear offset to (row = contiguous, column = strided)
+  CUTLASS_HOST_DEVICE
+  TensorCoord inverse(LongIndex offset) const {
+    PitchLinearCoord coord = layout_.inverse(offset);
+    return MatrixCoord(coord.contiguous(), coord.strided());    
+  }
+
+  /// Returns a copy of the wrapped layout's stride vector
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return layout_.stride();
+  }
+
+  /// Returns a mutable reference to the wrapped layout's stride vector
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return layout_.stride();
+  }
+
+  /// Returns the wrapped layout's capacity for the extent (rows as contiguous, columns as strided)
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &extent) const {
+    return layout_.capacity(PitchLinearCoord(extent.row(), extent.column()));
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Template mapping a row-major matrix view (column -> contiguous, row -> strided)
+/// onto the pitch-linear TensorOpMultiplicandCongruous layout
+template <int ElementSize, int Crosswise>
+struct RowMajorTensorOpMultiplicandCongruous {
+
+  /// Logical rank of tensor
+  static int const kRank = 2;
+
+  /// Rank of stride vector
+  static int const kStrideRank = 1;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate
+  using TensorCoord = MatrixCoord;
+
+  /// Stride vector
+  using Stride = Coord<kStrideRank, Index, LongIndex>;
+
+  //
+  // Invariants
+  //
+
+  using Base = TensorOpMultiplicandCongruous<ElementSize, Crosswise>;
+
+  /// Access size, tile shape and partition shape are re-exported from the base layout
+  static int const kAccessSize = Base::kAccessSize;
+  using TileShape = typename Base::TileShape;
+  using PartitionShape = typename Base::PartitionShape;
+
+  //
+  // Static constants (all re-exported from the base layout)
+  //
+
+  static int const kElementSize = Base::kElementSize;
+  static int const kElementsPerAccess = Base::kElementsPerAccess;
+  static int const kCrosswise = Base::kCrosswise;
+  static int const kFactor = Base::kFactor;
+  using PartitionCount =  typename Base::PartitionCount;
+  using AccessCount = typename Base::AccessCount;
+
+private:
+
+  //
+  // Data members
+  //
+
+  Base layout_;  // wrapped pitch-linear layout; every method below delegates to it
+
+public:
+  //
+  // Methods
+  //
+
+  /// Constructs the wrapped pitch-linear layout from a leading dimension (defaults to 0)
+  CUTLASS_HOST_DEVICE
+  RowMajorTensorOpMultiplicandCongruous(Index ldm = 0): layout_(ldm) { }
+
+  /// Constructs the wrapped pitch-linear layout from a stride vector
+  CUTLASS_HOST_DEVICE
+  RowMajorTensorOpMultiplicandCongruous(Stride stride): layout_(stride) { }
+
+  /// Returns a layout for a tightly packed tensor: the leading dimension is extent.column()
+  CUTLASS_HOST_DEVICE
+  static RowMajorTensorOpMultiplicandCongruous packed(TensorCoord const &extent) {
+    return RowMajorTensorOpMultiplicandCongruous(extent.column());
+  }
+
+  /// Returns the offset of a matrix coordinate in linear memory.
+  /// The column is passed as the contiguous index and the row as the strided index
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+    return layout_(PitchLinearCoord(coord.column(), coord.row()));
+  }
+
+  /// Inverse of layout function: maps a linear offset to (row = strided, column = contiguous)
+  CUTLASS_HOST_DEVICE
+  TensorCoord inverse(LongIndex offset) const {
+    PitchLinearCoord coord = layout_.inverse(offset);
+    return MatrixCoord(coord.strided(), coord.contiguous());
+  }
+
+  /// Returns a copy of the wrapped layout's stride vector
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return layout_.stride();
+  }
+
+  /// Returns a mutable reference to the wrapped layout's stride vector
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return layout_.stride();
+  }
+
+  /// Returns the wrapped layout's capacity for the extent (columns as contiguous, rows as strided)
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &extent) const {
+    return layout_.capacity(PitchLinearCoord(extent.column(), extent.row()));
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Crosswise tensor-op multiplicand layout over pitch-linear coordinates. Thin wrapper
+/// that forwards every operation to TensorOpMultiplicand<ElementSize, Crosswise>.
+template <int ElementSize, int Crosswise>
+struct TensorOpMultiplicandCrosswise {
+  /// Logical rank of tensor
+  static int const kRank = 2;
+
+  /// Rank of stride vector
+  static int const kStrideRank = 1;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate
+  using TensorCoord = PitchLinearCoord;
+
+  /// Stride vector
+  using Stride = Coord<kStrideRank, Index, LongIndex>;
+
+  //
+  // Invariants
+  //
+
+  using Base = TensorOpMultiplicand<ElementSize, Crosswise>;
+
+  /// Access size, tile shape and partition shape are re-exported from the base layout
+  static int const kAccessSize = Base::kAccessSize;
+  using TileShape = typename Base::TileShape;
+  using PartitionShape = typename Base::PartitionShape;
+
+  //
+  // Static constants (all re-exported from the base layout)
+  //
+
+  static int const kElementSize = Base::kElementSize;
+  static int const kElementsPerAccess = Base::kElementsPerAccess;
+  static int const kCrosswise = Base::kCrosswise;
+  static int const kFactor = Base::kFactor;
+  using PartitionCount =  typename Base::PartitionCount;
+  using AccessCount = typename Base::AccessCount;
+
+ private:
+  //
+  // Data members
+  //
+
+  Base layout_;  // wrapped base layout; every method below delegates to it
+
+ public:
+  //
+  // Methods
+  //
+
+  /// Constructs the wrapped base layout from a leading dimension (defaults to 0)
+  CUTLASS_HOST_DEVICE
+  TensorOpMultiplicandCrosswise(Index ldm = 0) : layout_(ldm) {}
+
+  /// Constructs the wrapped base layout from a stride vector
+  CUTLASS_HOST_DEVICE
+  TensorOpMultiplicandCrosswise(Stride stride) : layout_(stride) {}
+
+  /// Returns a layout for a tightly packed tensor: the leading dimension is extent[0]
+  CUTLASS_HOST_DEVICE
+  static TensorOpMultiplicandCrosswise packed(TensorCoord const &extent) {
+    return TensorOpMultiplicandCrosswise(extent[0]);
+  }
+
+  /// Returns the offset of a coordinate in linear memory, as computed by the base layout.
+  /// Assumes coordinate has convention (contiguous, strided)
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+    return layout_(coord);
+  }
+
+  /// Inverse of layout function, mapping linear offset to logical coordinate (delegates to the base layout)
+  CUTLASS_HOST_DEVICE
+  TensorCoord inverse(LongIndex offset) const {
+    PitchLinearCoord coord = layout_.inverse(offset);
+    return coord;
+  }
+
+  /// Returns a copy of the base layout's stride vector
+  CUTLASS_HOST_DEVICE
+  Stride stride() const { return layout_.stride(); }
+
+  /// Returns a mutable reference to the base layout's stride vector
+  CUTLASS_HOST_DEVICE
+  Stride &stride() { return layout_.stride(); }
+
+  /// Returns the number of contiguous elements needed to store a tensor with
+  /// the given extent, as reported by the base layout
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &extent) const {
+    return layout_.capacity(extent);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Template mapping a column-major matrix view (row -> contiguous, column -> strided)
+/// onto the pitch-linear TensorOpMultiplicandCrosswise layout
+template <int ElementSize, int Crosswise>
+struct ColumnMajorTensorOpMultiplicandCrosswise {
+  /// Logical rank of tensor
+  static int const kRank = 2;
+
+  /// Rank of stride vector
+  static int const kStrideRank = 1;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate
+  using TensorCoord = MatrixCoord;
+
+  /// Stride vector
+  using Stride = Coord<kStrideRank, Index, LongIndex>;
+
+  //
+  // Invariants
+  //
+
+  using Base = TensorOpMultiplicandCrosswise<ElementSize, Crosswise>;
+
+  /// Access size, tile shape and partition shape are re-exported from the base layout
+  static int const kAccessSize = Base::kAccessSize;
+  using TileShape = typename Base::TileShape;
+  using PartitionShape = typename Base::PartitionShape;
+
+  //
+  // Static constants (re-exported from the base layout; kCrosswise and kFactor are not re-exported here)
+  //
+
+  static int const kElementSize = Base::kElementSize;
+  static int const kElementsPerAccess = Base::kElementsPerAccess;
+  using PartitionCount = typename Base::PartitionCount;
+  using AccessCount = typename Base::AccessCount;
+
+ private:
+  //
+  // Data members
+  //
+
+  Base layout_;  // wrapped pitch-linear layout; every method below delegates to it
+
+ public:
+  //
+  // Methods
+  //
+
+  /// Constructs the wrapped pitch-linear layout from a leading dimension (defaults to 0)
+  CUTLASS_HOST_DEVICE
+  ColumnMajorTensorOpMultiplicandCrosswise(Index ldm = 0) : layout_(ldm) {}
+
+  /// Constructs the wrapped pitch-linear layout from a stride vector
+  CUTLASS_HOST_DEVICE
+  ColumnMajorTensorOpMultiplicandCrosswise(Stride stride) : layout_(stride) {}
+
+  /// Returns a layout for a tightly packed tensor: the leading dimension is extent.row()
+  CUTLASS_HOST_DEVICE
+  static ColumnMajorTensorOpMultiplicandCrosswise packed(
+      TensorCoord const &extent) {
+    return ColumnMajorTensorOpMultiplicandCrosswise(extent.row());
+  }
+
+  /// Returns the offset of a matrix coordinate in linear memory.
+  /// The row is passed as the contiguous index and the column as the strided index
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+    return layout_(PitchLinearCoord(coord.row(), coord.column()));
+  }
+
+  /// Inverse of layout function: maps a linear offset to (row = contiguous, column = strided)
+  CUTLASS_HOST_DEVICE
+  TensorCoord inverse(LongIndex offset) const {
+    PitchLinearCoord coord = layout_.inverse(offset);
+    return MatrixCoord(coord.contiguous(), coord.strided());
+  }
+
+  /// Returns a copy of the wrapped layout's stride vector
+  CUTLASS_HOST_DEVICE
+  Stride stride() const { return layout_.stride(); }
+
+  /// Returns a mutable reference to the wrapped layout's stride vector
+  CUTLASS_HOST_DEVICE
+  Stride &stride() { return layout_.stride(); }
+
+  /// Returns the wrapped layout's capacity for the given extent, passing
+  /// rows as the contiguous extent and columns as the strided extent
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &extent) const {
+    return layout_.capacity(PitchLinearCoord(extent.row(), extent.column()));
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Template mapping a row-major matrix view (column -> contiguous, row -> strided)
+/// onto the pitch-linear TensorOpMultiplicandCrosswise layout
+template <int ElementSize, int Crosswise>
+struct RowMajorTensorOpMultiplicandCrosswise {
+  /// Logical rank of tensor
+  static int const kRank = 2;
+
+  /// Rank of stride vector
+  static int const kStrideRank = 1;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate
+  using TensorCoord = MatrixCoord;
+
+  /// Stride vector
+  using Stride = Coord<kStrideRank, Index, LongIndex>;
+
+  //
+  // Invariants
+  //
+
+  using Base = TensorOpMultiplicandCrosswise<ElementSize, Crosswise>;
+
+  /// Access size, tile shape and partition shape are re-exported from the base layout
+  static int const kAccessSize = Base::kAccessSize;
+  using TileShape = typename Base::TileShape;
+  using PartitionShape = typename Base::PartitionShape;
+
+  //
+  // Static constants (re-exported from the base layout; kCrosswise and kFactor are not re-exported here)
+  //
+
+  static int const kElementSize = Base::kElementSize;
+  static int const kElementsPerAccess = Base::kElementsPerAccess;
+  using PartitionCount = typename Base::PartitionCount;
+  using AccessCount = typename Base::AccessCount;
+
+ private:
+  //
+  // Data members
+  //
+
+  Base layout_;  // wrapped pitch-linear layout; every method below delegates to it
+
+ public:
+  //
+  // Methods
+  //
+
+  /// Constructs the wrapped pitch-linear layout from a leading dimension (defaults to 0)
+  CUTLASS_HOST_DEVICE
+  RowMajorTensorOpMultiplicandCrosswise(Index ldm = 0) : layout_(ldm) {}
+
+  /// Constructs the wrapped pitch-linear layout from a stride vector
+  CUTLASS_HOST_DEVICE
+  RowMajorTensorOpMultiplicandCrosswise(Stride stride) : layout_(stride) {}
+
+  /// Returns a layout for a tightly packed tensor: the leading dimension is extent.column()
+  CUTLASS_HOST_DEVICE
+  static RowMajorTensorOpMultiplicandCrosswise packed(
+      TensorCoord const &extent) {
+    return RowMajorTensorOpMultiplicandCrosswise(extent.column());
+  }
+
+  /// Returns the offset of a matrix coordinate in linear memory.
+  /// The column is passed as the contiguous index and the row as the strided index
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+    return layout_(PitchLinearCoord(coord.column(), coord.row()));
+  }
+
+  /// Inverse of layout function: maps a linear offset to (row = strided, column = contiguous)
+  CUTLASS_HOST_DEVICE
+  TensorCoord inverse(LongIndex offset) const {
+    PitchLinearCoord coord = layout_.inverse(offset);
+    return MatrixCoord(coord.strided(), coord.contiguous());
+  }
+
+  /// Returns a copy of the wrapped layout's stride vector
+  CUTLASS_HOST_DEVICE
+  Stride stride() const { return layout_.stride(); }
+
+  /// Returns a mutable reference to the wrapped layout's stride vector
+  CUTLASS_HOST_DEVICE
+  Stride &stride() { return layout_.stride(); }
+
+  /// Returns the wrapped layout's capacity for the given extent, passing
+  /// columns as the contiguous extent and rows as the strided extent
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &extent) const {
+    return layout_.capacity(PitchLinearCoord(extent.column(), extent.row()));
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Interleaved column-major multiplicand layout, parameterized on element size (in bits) and interleave factor InterleavedK; defined in terms of pitch-linear memory.
+template <int ElementSize, int InterleavedK>
+struct TensorOpMultiplicandColumnMajorInterleaved {
+
+  /// Logical rank of tensor
+  static int const kRank = 2;
+
+  /// Rank of stride vector
+  static int const kStrideRank = 1;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate
+  using TensorCoord = PitchLinearCoord;
+
+  /// Stride vector
+  using Stride = Coord<kStrideRank, Index, LongIndex>;
+
+  //
+  // Invariants
+  //
+
+  /// This layout is optimized for 128b accesses
+  static int const kAccessSize = 128;
+
+  //
+  // Static constants
+  //
+
+  static int const kElementSize = ElementSize;
+  static int const kElementsPerAccess = kAccessSize / kElementSize;
+
+  //static int const kThreadBlockStrided = ThreadBlockStrided;
+  static int const kInterleavedK = InterleavedK;
+  
+private:
+
+  //
+  // Data members
+  //
+
+  /// Stride data member
+  Stride stride_;
+
+public:
+  //
+  // Methods
+  //
+
+  /// Constructs the layout from a leading dimension (defaults to 0)
+  CUTLASS_HOST_DEVICE
+  TensorOpMultiplicandColumnMajorInterleaved(Index ldm = 0): stride_(ldm) { }
+
+  /// Constructs the layout from a stride vector
+  CUTLASS_HOST_DEVICE
+  TensorOpMultiplicandColumnMajorInterleaved(Stride stride): stride_(stride) { }
+
+  /// Returns a layout for a tightly packed tensor: the stride is extent[0] * kInterleavedK
+  CUTLASS_HOST_DEVICE
+  static TensorOpMultiplicandColumnMajorInterleaved packed(TensorCoord const &extent) {
+    return TensorOpMultiplicandColumnMajorInterleaved(extent[0] * kInterleavedK);
+  }
+
+  /// Returns the swizzled offset of a (contiguous, strided) coordinate: the 16-element block index is XORed with (row_id & 1).
+  /// NOTE(review): the low 4 bits of col_id are dropped and stride_ is not used here - confirm callers only pass 16-aligned columns
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+    int const rows_per_smem_cache_line = 128 / kInterleavedK;
+
+    int row_id = coord.strided() / rows_per_smem_cache_line;
+    int col_id = (coord.strided() % rows_per_smem_cache_line) * kInterleavedK + coord.contiguous();
+
+    int access_block_id = col_id >> 4;
+    int swizzle_access_block_id = access_block_id ^ (row_id & 1);
+
+    int swizzle_col_id = swizzle_access_block_id << 4;
+
+    return row_id * 128 + swizzle_col_id;
+  }
+
+  /// Returns a copy of the layout's stride vector
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return stride_;
+  }
+
+  /// Returns a mutable reference to the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return stride_;
+  }
+
+  /// Returns the number of contiguous elements needed for the given extent: (extent[1] / kInterleavedK) * stride_[0]
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &extent) const {
+    return (extent[1] / kInterleavedK) * stride_[0];
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Interleaved row-major multiplicand layout, parameterized on element size (in bits) and interleave factor InterleavedK; defined in terms of pitch-linear memory.
+template <int ElementSize, int InterleavedK>
+struct TensorOpMultiplicandRowMajorInterleaved {
+
+  /// Logical rank of tensor
+  static int const kRank = 2;
+
+  /// Rank of stride vector
+  static int const kStrideRank = 1;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate
+  using TensorCoord = PitchLinearCoord;
+
+  /// Stride vector
+  using Stride = Coord<kStrideRank, Index, LongIndex>;
+
+  //
+  // Invariants
+  //
+
+  /// This layout is optimized for 128b accesses
+  static int const kAccessSize = 128;
+
+  //
+  // Static constants
+  //
+
+  static int const kElementSize = ElementSize;
+  static int const kElementsPerAccess = kAccessSize / kElementSize;
+
+  //static int const kThreadBlockStrided = ThreadBlockStrided;
+  static int const kInterleavedK = InterleavedK;
+  
+private:
+
+  //
+  // Data members
+  //
+
+  /// Stride data member
+  Stride stride_;
+
+public:
+  //
+  // Methods
+  //
+
+  /// Constructs the layout from a leading dimension (defaults to 0)
+  CUTLASS_HOST_DEVICE
+  TensorOpMultiplicandRowMajorInterleaved(Index ldm = 0): stride_(ldm) { }
+
+  /// Constructs the layout from a stride vector
+  CUTLASS_HOST_DEVICE
+  TensorOpMultiplicandRowMajorInterleaved(Stride stride): stride_(stride) { }
+
+  /// Returns a layout for a tightly packed tensor: the stride is extent[1] * kInterleavedK
+  CUTLASS_HOST_DEVICE
+  static TensorOpMultiplicandRowMajorInterleaved packed(TensorCoord const &extent) {
+    return TensorOpMultiplicandRowMajorInterleaved(extent[1] * kInterleavedK);
+  }
+
+  /// Returns the swizzled offset of a (contiguous, strided) coordinate: the 16-element block index is XORed with (row_id & 1).
+  /// NOTE(review): the low 4 bits of col_id are dropped and stride_ is not used here - confirm callers only pass 16-aligned columns
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+    int const rows_per_smem_cache_line = 128 / kInterleavedK;
+
+    int row_id = coord.strided() / rows_per_smem_cache_line;
+    int col_id = (coord.strided() % rows_per_smem_cache_line) * kInterleavedK + coord.contiguous();
+
+    int access_block_id = col_id >> 4;
+    int swizzle_access_block_id = access_block_id ^ (row_id & 1);
+
+    int swizzle_col_id = swizzle_access_block_id << 4;
+
+    return row_id * 128 + swizzle_col_id;
+  }
+
+  /// Returns a copy of the layout's stride vector
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return stride_;
+  }
+
+  /// Returns a mutable reference to the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return stride_;
+  }
+
+  /// Returns the number of contiguous elements needed for the given extent: (extent[0] / kInterleavedK) * stride_[0]
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &extent) const {
+    return (extent[0] / kInterleavedK) * stride_[0];
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace layout
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/layout/tensor_op_multiplicand_sm80.h b/lightllm-kernel/cutlass/include/cutlass/layout/tensor_op_multiplicand_sm80.h
new file mode 100755
index 000000000..15d528399
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/layout/tensor_op_multiplicand_sm80.h
@@ -0,0 +1,1139 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief layouts needed by Ampere fp64 tensor core kernels.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor_op_multiplicand_sm75.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace layout {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Congruous multiplicand layout for 64-bit elements (one element per access),
+/// defined in terms of pitch-linear memory. This is a plain struct, not a template.
+struct TensorOpMultiplicandCongruous64b {
+  /// Logical rank of tensor
+  static int const kRank = 2;
+
+  /// Rank of stride vector
+  static int const kStrideRank = 1;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate
+  using TensorCoord = PitchLinearCoord;
+
+  /// Stride vector
+  using Stride = Coord<kStrideRank, Index, LongIndex>;
+
+  //
+  // Static constants
+  //
+
+  static int const kElementSize = 64;
+  static int const kElementsPerAccess = 1;
+
+ private:
+
+  //
+  // Data members
+  //
+
+  /// Stride data member.
+  Stride stride_;
+
+ public:
+  //
+  // Methods
+  //
+
+  /// Constructs the layout from a leading dimension (defaults to 0)
+  CUTLASS_HOST_DEVICE
+  TensorOpMultiplicandCongruous64b(Index ldm = 0) : stride_(ldm) {}
+
+  /// Constructs the layout from a stride vector
+  CUTLASS_HOST_DEVICE
+  TensorOpMultiplicandCongruous64b(Stride stride) : stride_(stride) {}
+
+  /// Returns a layout for a tightly packed tensor: the leading dimension is extent[0]
+  CUTLASS_HOST_DEVICE
+  static TensorOpMultiplicandCongruous64b packed(TensorCoord const &extent) {
+    return TensorOpMultiplicandCongruous64b(extent[0]);
+  }
+
+  /// Returns the offset of a coordinate in linear memory, working on 16 (contiguous) x 4 (strided) tiles.
+  /// Assumes coordinate has convention (contiguous, strided)
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+
+    int tc = coord.contiguous() / 16;  // tile index along the contiguous dimension
+    int ts = coord.strided() / 4;  // tile index along the strided dimension
+
+    int c = coord.contiguous() % 16;  // contiguous residual within the tile
+    int s = coord.strided() % 4;  // strided residual within the tile
+
+
+    int bank = ((((c & 1) * 4 + (c & 6) / 2)) ^ (s & 1)) * 2 + (c / 8);  // permuted column, XORed with bit 0 of s
+    int row = (c & 6) / 2;  // bits 1-2 of c select the row within the tile
+
+    bank ^= ((s & 2) * 2);  // bit 1 of s flips bit 2 of the column
+
+    LongIndex offset = tc * 16 + bank + (ts * 4 + row) * stride_[0];
+
+    return offset;
+  }
+
+  /// Returns a copy of the layout's stride vector
+  CUTLASS_HOST_DEVICE
+  Stride stride() const { return stride_; }
+
+  /// Returns a mutable reference to the stride vector
+  CUTLASS_HOST_DEVICE
+  Stride &stride() { return stride_; }
+
+  /// Returns the number of contiguous elements needed to store a tensor with
+  /// the given extent, computed as extent[1] * stride_[0]
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &extent) const {
+    return extent[1] * stride_[0];
+  }
+
+  CUTLASS_HOST_DEVICE
+  TensorCoord inverse(LongIndex offset) const {
+    return TensorCoord();  // NOTE: placeholder - ignores offset and returns a default-constructed coordinate
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Maps a column-major matrix view (row -> contiguous, column -> strided)
+/// onto the pitch-linear TensorOpMultiplicandCongruous64b layout
+struct ColumnMajorTensorOpMultiplicandCongruous64b {
+
+  /// Logical rank of tensor
+  static int const kRank = 2;
+
+  /// Rank of stride vector
+  static int const kStrideRank = 1;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate
+  using TensorCoord = MatrixCoord;
+
+  /// Stride vector
+  using Stride = Coord<kStrideRank, Index, LongIndex>;
+
+  //
+  // Invariants
+  //
+
+  using Base = TensorOpMultiplicandCongruous64b;
+
+private:
+
+  //
+  // Data members
+  //
+
+  Base layout_;  // wrapped pitch-linear layout; every method below delegates to it
+
+public:
+  //
+  // Methods
+  //
+
+  /// Constructs the wrapped pitch-linear layout from a leading dimension (defaults to 0)
+  CUTLASS_HOST_DEVICE
+  ColumnMajorTensorOpMultiplicandCongruous64b(Index ldm = 0): layout_(ldm) { }
+
+  /// Constructs the wrapped pitch-linear layout from a stride vector
+  CUTLASS_HOST_DEVICE
+  ColumnMajorTensorOpMultiplicandCongruous64b(Stride stride): layout_(stride) { }
+
+  /// Returns a layout for a tightly packed tensor: the leading dimension is extent.row()
+  CUTLASS_HOST_DEVICE
+  static ColumnMajorTensorOpMultiplicandCongruous64b packed(TensorCoord const &extent) {
+    return ColumnMajorTensorOpMultiplicandCongruous64b(extent.row());
+  }
+
+  /// Returns the offset of a matrix coordinate in linear memory.
+  /// The row is passed as the contiguous index and the column as the strided index
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+    return layout_(PitchLinearCoord(coord.row(), coord.column()));
+  }
+
+  /// Inverse of layout function. NOTE: Base::inverse ignores offset and returns a default-constructed coordinate
+  CUTLASS_HOST_DEVICE
+  TensorCoord inverse(LongIndex offset) const {
+    PitchLinearCoord coord = layout_.inverse(offset);
+    return MatrixCoord(coord.contiguous(), coord.strided());    
+  }
+
+  /// Returns a copy of the wrapped layout's stride vector
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return layout_.stride();
+  }
+
+  /// Returns a mutable reference to the wrapped layout's stride vector
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return layout_.stride();
+  }
+
+  /// Returns the wrapped layout's capacity for the extent (rows as contiguous, columns as strided)
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &extent) const {
+    return layout_.capacity(PitchLinearCoord(extent.row(), extent.column()));
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Row-major adapter over TensorOpMultiplicandCongruous64b: matrix columns map
+/// to the contiguous dimension and matrix rows to the strided dimension.
+struct RowMajorTensorOpMultiplicandCongruous64b {
+
+  /// Number of logical dimensions
+  static int const kRank = 2;
+
+  /// Number of stride components
+  static int const kStrideRank = 1;
+
+  /// Coordinate index type
+  using Index = int32_t;
+
+  /// Wide index type for linear offsets
+  using LongIndex = int64_t;
+
+  /// Logical (row, column) coordinate
+  using TensorCoord = MatrixCoord;
+
+  /// Stride vector type
+  using Stride = Coord<kStrideRank, Index, LongIndex>;
+
+  //
+  // Underlying pitch-linear layout
+  //
+
+  using Base = TensorOpMultiplicandCongruous64b;
+
+private:
+
+  //
+  // State
+  //
+
+  Base base_;
+
+public:
+  //
+  // Interface
+  //
+
+  /// Constructs from a leading dimension
+  CUTLASS_HOST_DEVICE
+  RowMajorTensorOpMultiplicandCongruous64b(Index ldm = 0) : base_(ldm) {}
+
+  /// Constructs from a stride vector
+  CUTLASS_HOST_DEVICE
+  RowMajorTensorOpMultiplicandCongruous64b(Stride stride) : base_(stride) {}
+
+  /// Layout for a tightly packed tensor; the column count becomes the leading dimension
+  CUTLASS_HOST_DEVICE
+  static RowMajorTensorOpMultiplicandCongruous64b packed(TensorCoord const &extent) {
+    return RowMajorTensorOpMultiplicandCongruous64b(extent.column());
+  }
+
+  /// Linear-memory offset of a (row, column) coordinate, forwarded to the
+  /// base layout as (contiguous = column, strided = row)
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+    return base_(PitchLinearCoord(coord.column(), coord.row()));
+  }
+
+  /// Maps a linear offset back to a logical coordinate via the base layout
+  CUTLASS_HOST_DEVICE
+  TensorCoord inverse(LongIndex offset) const {
+    PitchLinearCoord pl = base_.inverse(offset);
+    return TensorCoord(pl.strided(), pl.contiguous());
+  }
+
+  /// Stride of the layout (by value)
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return base_.stride();
+  }
+
+  /// Stride of the layout (mutable reference)
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return base_.stride();
+  }
+
+  /// Number of contiguous elements needed to store a tensor of the given extent
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &extent) const {
+    return base_.capacity(PitchLinearCoord(extent.column(), extent.row()));
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Pitch-linear layout for 64-bit elements (kElementSize = 64) that permutes
+/// coordinates inside 16x16 tiles before computing the linear offset.
+struct TensorOpMultiplicand64bCrosswise {
+  /// Number of logical dimensions
+  static int const kRank = 2;
+
+  /// Number of stride components
+  static int const kStrideRank = 1;
+
+  /// Coordinate index type
+  using Index = int32_t;
+
+  /// Wide index type for linear offsets
+  using LongIndex = int64_t;
+
+  /// Logical (contiguous, strided) coordinate
+  using TensorCoord = PitchLinearCoord;
+
+  /// Stride vector type
+  using Stride = Coord<kStrideRank, Index, LongIndex>;
+
+  //
+  // Compile-time properties
+  //
+
+  static int const kElementSize = 64;
+  static int const kElementsPerAccess = 1;
+
+ private:
+
+  //
+  // State
+  //
+
+  /// Stride used to scale the permuted row index.
+  Stride stride_;
+
+ public:
+  //
+  // Interface
+  //
+
+  /// Constructs from a leading dimension
+  CUTLASS_HOST_DEVICE
+  TensorOpMultiplicand64bCrosswise(Index ldm = 0) : stride_(ldm) {}
+
+  /// Constructs from a stride vector
+  CUTLASS_HOST_DEVICE
+  TensorOpMultiplicand64bCrosswise(Stride stride) : stride_(stride) {}
+
+  /// Layout for a tightly packed tensor; extent[0] becomes the leading dimension
+  CUTLASS_HOST_DEVICE
+  static TensorOpMultiplicand64bCrosswise packed(TensorCoord const &extent) {
+    return TensorOpMultiplicand64bCrosswise(extent[0]);
+  }
+
+  /// Maps a coordinate to its permuted offset in linear memory.
+  /// The coordinate is interpreted as (contiguous, strided).
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+
+    // Index of the 16x16 tile that holds the coordinate
+    int tile_c = coord.contiguous() / 16;
+    int tile_s = coord.strided() / 16;
+
+    // Position of the coordinate inside that tile
+    int in_c = coord.contiguous() % 16;
+    int in_s = coord.strided() % 16;
+    int k_group = in_c / 4;
+    int access_s = in_s / 2;
+
+    // The three middle terms are summed first, then XOR-ed with the outer terms
+    int bank = ((k_group & 2) << 2) ^ (((in_s % 2) << 3) + (in_c % 4) * 2 + (access_s / 4)) ^ (k_group & 1);
+
+    int smem_row = (k_group * 4 + (access_s % 4)) + tile_c * 16;
+    int smem_col = tile_s * 16 + bank;
+
+    LongIndex offset = smem_row * stride_[0] + smem_col;
+    return offset;
+  }
+
+  /// Stride of the layout (by value)
+  CUTLASS_HOST_DEVICE
+  Stride stride() const { return stride_; }
+
+  /// Stride of the layout (mutable reference)
+  CUTLASS_HOST_DEVICE
+  Stride &stride() { return stride_; }
+
+  /// Number of contiguous elements needed to store a tensor of the given
+  /// extent: extent[1] multiplied by the stored stride
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &extent) const {
+    return extent[1] * stride_[0];
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Column-major adapter over TensorOpMultiplicand64bCrosswise: matrix rows map
+/// to the contiguous dimension and matrix columns to the strided dimension.
+struct ColumnMajorTensorOpMultiplicand64bCrosswise {
+  /// Number of logical dimensions
+  static int const kRank = 2;
+
+  /// Number of stride components
+  static int const kStrideRank = 1;
+
+  /// Coordinate index type
+  using Index = int32_t;
+
+  /// Wide index type for linear offsets
+  using LongIndex = int64_t;
+
+  /// Logical (row, column) coordinate
+  using TensorCoord = MatrixCoord;
+
+  /// Stride vector type
+  using Stride = Coord<kStrideRank, Index, LongIndex>;
+
+  //
+  // Underlying pitch-linear layout
+  //
+
+  using Base = TensorOpMultiplicand64bCrosswise;
+
+private:
+
+  //
+  // State
+  //
+
+  Base base_;
+
+public:
+  //
+  // Interface
+  //
+
+  /// Constructs from a leading dimension
+  CUTLASS_HOST_DEVICE
+  ColumnMajorTensorOpMultiplicand64bCrosswise(Index ldm = 0) : base_(ldm) {}
+
+  /// Constructs from a stride vector
+  CUTLASS_HOST_DEVICE
+  ColumnMajorTensorOpMultiplicand64bCrosswise(Stride stride) : base_(stride) {}
+
+  /// Layout for a tightly packed tensor; the column count becomes the leading dimension
+  CUTLASS_HOST_DEVICE
+  static ColumnMajorTensorOpMultiplicand64bCrosswise packed(TensorCoord const &extent) {
+    return ColumnMajorTensorOpMultiplicand64bCrosswise(extent.column());
+  }
+
+  /// Linear-memory offset of a (row, column) coordinate, forwarded to the
+  /// base layout as (contiguous = row, strided = column)
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+    return base_(PitchLinearCoord(coord.row(), coord.column()));
+  }
+
+  /// Stride of the layout (by value)
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return base_.stride();
+  }
+
+  /// Stride of the layout (mutable reference)
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return base_.stride();
+  }
+
+  /// Number of contiguous elements needed to store a tensor of the given extent
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &extent) const {
+    return base_.capacity(PitchLinearCoord(extent.row(), extent.column()));
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Row-major adapter over TensorOpMultiplicand64bCrosswise: matrix columns map
+/// to the contiguous dimension and matrix rows to the strided dimension.
+struct RowMajorTensorOpMultiplicand64bCrosswise {
+
+  /// Number of logical dimensions
+  static int const kRank = 2;
+
+  /// Number of stride components
+  static int const kStrideRank = 1;
+
+  /// Coordinate index type
+  using Index = int32_t;
+
+  /// Wide index type for linear offsets
+  using LongIndex = int64_t;
+
+  /// Logical (row, column) coordinate
+  using TensorCoord = MatrixCoord;
+
+  /// Stride vector type
+  using Stride = Coord<kStrideRank, Index, LongIndex>;
+
+  //
+  // Underlying pitch-linear layout
+  //
+
+  using Base = TensorOpMultiplicand64bCrosswise;
+
+private:
+
+  //
+  // State
+  //
+
+  Base base_;
+
+public:
+  //
+  // Interface
+  //
+
+  /// Constructs from a leading dimension
+  CUTLASS_HOST_DEVICE
+  RowMajorTensorOpMultiplicand64bCrosswise(Index ldm = 0) : base_(ldm) {}
+
+  /// Constructs from a stride vector
+  CUTLASS_HOST_DEVICE
+  RowMajorTensorOpMultiplicand64bCrosswise(Stride stride) : base_(stride) {}
+
+  /// Layout for a tightly packed tensor; the row count becomes the leading dimension
+  CUTLASS_HOST_DEVICE
+  static RowMajorTensorOpMultiplicand64bCrosswise packed(TensorCoord const &extent) {
+    return RowMajorTensorOpMultiplicand64bCrosswise(extent.row());
+  }
+
+  /// Linear-memory offset of a (row, column) coordinate, forwarded to the
+  /// base layout as (contiguous = column, strided = row)
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+    return base_(PitchLinearCoord(coord.column(), coord.row()));
+  }
+
+  /// Stride of the layout (by value)
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return base_.stride();
+  }
+
+  /// Stride of the layout (mutable reference)
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return base_.stride();
+  }
+
+  /// Number of contiguous elements needed to store a tensor of the given extent
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &extent) const {
+    return base_.capacity(PitchLinearCoord(extent.column(), extent.row()));
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Pitch-linear layout for 128-bit elements (kElementSize = 128) that permutes
+/// coordinates inside 8x4 tiles before computing the linear offset.
+struct TensorOpMultiplicandCongruous128b {
+  /// Number of logical dimensions
+  static int const kRank = 2;
+
+  /// Number of stride components
+  static int const kStrideRank = 1;
+
+  /// Coordinate index type
+  using Index = int32_t;
+
+  /// Wide index type for linear offsets
+  using LongIndex = int64_t;
+
+  /// Logical (contiguous, strided) coordinate
+  using TensorCoord = PitchLinearCoord;
+
+  /// Stride vector type
+  using Stride = Coord<kStrideRank, Index, LongIndex>;
+
+  //
+  // Compile-time properties
+  //
+
+  static int const kElementSize = 128;
+  static int const kElementsPerAccess = 1;
+
+ private:
+
+  //
+  // State
+  //
+
+  /// Stride used to scale the permuted row index.
+  Stride stride_;
+
+ public:
+  //
+  // Interface
+  //
+
+  /// Constructs from a leading dimension
+  CUTLASS_HOST_DEVICE
+  TensorOpMultiplicandCongruous128b(Index ldm = 0) : stride_(ldm) {}
+
+  /// Constructs from a stride vector
+  CUTLASS_HOST_DEVICE
+  TensorOpMultiplicandCongruous128b(Stride stride) : stride_(stride) {}
+
+  /// Layout for a tightly packed tensor; extent[0] becomes the leading dimension
+  CUTLASS_HOST_DEVICE
+  static TensorOpMultiplicandCongruous128b packed(TensorCoord const &extent) {
+    return TensorOpMultiplicandCongruous128b(extent[0]);
+  }
+
+  /// Maps a coordinate to its permuted offset in linear memory.
+  /// The coordinate is interpreted as (contiguous, strided).
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+
+    // Locate the 8x4 tile and the position inside it
+    Index tile_c = coord.contiguous() / 8;
+    Index tile_s = coord.strided() / 4;
+    Index in_c = coord.contiguous() % 8;
+    Index in_s = coord.strided() % 4;
+
+    // Each pair of contiguous positions shares one row index within the tile
+    Index k_index = in_c / 2;
+    Index bank = ((in_c & 1) * 4) | (in_s ^ k_index);
+
+    LongIndex offset = tile_c * 8 + bank + (tile_s * 4 + k_index) * stride_[0];
+
+    return offset;
+  }
+
+  /// Stride of the layout (by value)
+  CUTLASS_HOST_DEVICE
+  Stride stride() const { return stride_; }
+
+  /// Stride of the layout (mutable reference)
+  CUTLASS_HOST_DEVICE
+  Stride &stride() { return stride_; }
+
+  /// Number of contiguous elements needed to store a tensor of the given
+  /// extent: extent[1] multiplied by the stored stride
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &extent) const {
+    return extent[1] * stride_[0];
+  }
+
+  /// Placeholder inverse: ignores `offset` and returns a default coordinate
+  CUTLASS_HOST_DEVICE
+  TensorCoord inverse(LongIndex offset) const {
+    return TensorCoord();
+  }
+};
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Column-major adapter over TensorOpMultiplicandCongruous128b: matrix rows map
+/// to the contiguous dimension and matrix columns to the strided dimension.
+struct ColumnMajorTensorOpMultiplicandCongruous128b {
+
+  /// Number of logical dimensions
+  static int const kRank = 2;
+
+  /// Number of stride components
+  static int const kStrideRank = 1;
+
+  /// Coordinate index type
+  using Index = int32_t;
+
+  /// Wide index type for linear offsets
+  using LongIndex = int64_t;
+
+  /// Logical (row, column) coordinate
+  using TensorCoord = MatrixCoord;
+
+  /// Stride vector type
+  using Stride = Coord<kStrideRank, Index, LongIndex>;
+
+  //
+  // Underlying pitch-linear layout
+  //
+
+  using Base = TensorOpMultiplicandCongruous128b;
+
+private:
+
+  //
+  // State
+  //
+
+  Base base_;
+
+public:
+  //
+  // Interface
+  //
+
+  /// Constructs from a leading dimension
+  CUTLASS_HOST_DEVICE
+  ColumnMajorTensorOpMultiplicandCongruous128b(Index ldm = 0) : base_(ldm) {}
+
+  /// Constructs from a stride vector
+  CUTLASS_HOST_DEVICE
+  ColumnMajorTensorOpMultiplicandCongruous128b(Stride stride) : base_(stride) {}
+
+  /// Layout for a tightly packed tensor; the row count becomes the leading dimension
+  CUTLASS_HOST_DEVICE
+  static ColumnMajorTensorOpMultiplicandCongruous128b packed(TensorCoord const &extent) {
+    return ColumnMajorTensorOpMultiplicandCongruous128b(extent.row());
+  }
+
+  /// Linear-memory offset of a (row, column) coordinate, forwarded to the
+  /// base layout as (contiguous = row, strided = column)
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+    return base_(PitchLinearCoord(coord.row(), coord.column()));
+  }
+
+  /// Maps a linear offset back to a logical coordinate via the base layout
+  CUTLASS_HOST_DEVICE
+  TensorCoord inverse(LongIndex offset) const {
+    PitchLinearCoord pl = base_.inverse(offset);
+    return TensorCoord(pl.contiguous(), pl.strided());
+  }
+
+  /// Stride of the layout (by value)
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return base_.stride();
+  }
+
+  /// Stride of the layout (mutable reference)
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return base_.stride();
+  }
+
+  /// Number of contiguous elements needed to store a tensor of the given extent
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &extent) const {
+    return base_.capacity(PitchLinearCoord(extent.row(), extent.column()));
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Row-major adapter over TensorOpMultiplicandCongruous128b: matrix columns map
+/// to the contiguous dimension and matrix rows to the strided dimension.
+struct RowMajorTensorOpMultiplicandCongruous128b {
+
+  /// Number of logical dimensions
+  static int const kRank = 2;
+
+  /// Number of stride components
+  static int const kStrideRank = 1;
+
+  /// Coordinate index type
+  using Index = int32_t;
+
+  /// Wide index type for linear offsets
+  using LongIndex = int64_t;
+
+  /// Logical (row, column) coordinate
+  using TensorCoord = MatrixCoord;
+
+  /// Stride vector type
+  using Stride = Coord<kStrideRank, Index, LongIndex>;
+
+  //
+  // Underlying pitch-linear layout
+  //
+
+  using Base = TensorOpMultiplicandCongruous128b;
+
+private:
+
+  //
+  // State
+  //
+
+  Base base_;
+
+public:
+  //
+  // Interface
+  //
+
+  /// Constructs from a leading dimension
+  CUTLASS_HOST_DEVICE
+  RowMajorTensorOpMultiplicandCongruous128b(Index ldm = 0) : base_(ldm) {}
+
+  /// Constructs from a stride vector
+  CUTLASS_HOST_DEVICE
+  RowMajorTensorOpMultiplicandCongruous128b(Stride stride) : base_(stride) {}
+
+  /// Layout for a tightly packed tensor; the column count becomes the leading dimension
+  CUTLASS_HOST_DEVICE
+  static RowMajorTensorOpMultiplicandCongruous128b packed(TensorCoord const &extent) {
+    return RowMajorTensorOpMultiplicandCongruous128b(extent.column());
+  }
+
+  /// Linear-memory offset of a (row, column) coordinate, forwarded to the
+  /// base layout as (contiguous = column, strided = row)
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+    return base_(PitchLinearCoord(coord.column(), coord.row()));
+  }
+
+  /// Maps a linear offset back to a logical coordinate via the base layout
+  CUTLASS_HOST_DEVICE
+  TensorCoord inverse(LongIndex offset) const {
+    PitchLinearCoord pl = base_.inverse(offset);
+    return TensorCoord(pl.strided(), pl.contiguous());
+  }
+
+  /// Stride of the layout (by value)
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return base_.stride();
+  }
+
+  /// Stride of the layout (mutable reference)
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return base_.stride();
+  }
+
+  /// Number of contiguous elements needed to store a tensor of the given extent
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &extent) const {
+    return base_.capacity(PitchLinearCoord(extent.column(), extent.row()));
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Pitch-linear layout for 128-bit elements (kElementSize = 128) that permutes
+/// coordinates inside 8x8 tiles before computing the linear offset.
+struct TensorOpMultiplicandCrosswise128x4 {
+  /// Number of logical dimensions
+  static int const kRank = 2;
+
+  /// Number of stride components
+  static int const kStrideRank = 1;
+
+  /// Coordinate index type
+  using Index = int32_t;
+
+  /// Wide index type for linear offsets
+  using LongIndex = int64_t;
+
+  /// Logical (contiguous, strided) coordinate
+  using TensorCoord = PitchLinearCoord;
+
+  /// Stride vector type
+  using Stride = Coord<kStrideRank, Index, LongIndex>;
+
+  //
+  // Compile-time properties
+  //
+
+  static int const kElementSize = 128;
+  static int const kElementsPerAccess = 1;
+
+ private:
+
+  //
+  // State
+  //
+
+  /// Stride used to scale the permuted row index.
+  Stride stride_;
+
+ public:
+  //
+  // Interface
+  //
+
+  /// Constructs from a leading dimension
+  CUTLASS_HOST_DEVICE
+  TensorOpMultiplicandCrosswise128x4(Index ldm = 0) : stride_(ldm) {}
+
+  /// Constructs from a stride vector
+  CUTLASS_HOST_DEVICE
+  TensorOpMultiplicandCrosswise128x4(Stride stride) : stride_(stride) {}
+
+  /// Layout for a tightly packed tensor; extent[0] becomes the leading dimension
+  CUTLASS_HOST_DEVICE
+  static TensorOpMultiplicandCrosswise128x4 packed(TensorCoord const &extent) {
+    return TensorOpMultiplicandCrosswise128x4(extent[0]);
+  }
+
+  /// Maps a coordinate to its permuted offset in linear memory.
+  /// The coordinate is interpreted as (contiguous, strided).
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+
+    // Locate the 8x8 tile and the position inside it
+    Index tile_c = coord.contiguous() / 8;
+    Index tile_s = coord.strided() / 8;
+    Index in_c = coord.contiguous() % 8;
+    Index in_s = coord.strided() % 8;
+
+    // The sum is formed first and then XOR-ed with (in_c & 4)
+    Index bank = ((in_c % 4) + ((in_s & 1) * 4)) ^ (in_c & 4);
+
+    // Row index inside the tile, scaled by the stride below
+    Index k_index = (in_c & 4) + (in_s / 4) * 2 + ((in_s & 2) / 2);
+
+    LongIndex offset = (tile_c * 8 + k_index) * stride_[0] + tile_s * 8 + bank;
+
+    return offset;
+  }
+
+  /// Stride of the layout (by value)
+  CUTLASS_HOST_DEVICE
+  Stride stride() const { return stride_; }
+
+  /// Stride of the layout (mutable reference)
+  CUTLASS_HOST_DEVICE
+  Stride &stride() { return stride_; }
+
+  /// Number of contiguous elements needed to store a tensor of the given
+  /// extent: extent[1] multiplied by the stored stride
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &extent) const {
+    return extent[1] * stride_[0];
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Column-major adapter over TensorOpMultiplicandCrosswise128x4: matrix rows map
+/// to the contiguous dimension and matrix columns to the strided dimension.
+struct ColumnMajorTensorOpMultiplicandCrosswise128x4 {
+
+  /// Number of logical dimensions
+  static int const kRank = 2;
+
+  /// Number of stride components
+  static int const kStrideRank = 1;
+
+  /// Coordinate index type
+  using Index = int32_t;
+
+  /// Wide index type for linear offsets
+  using LongIndex = int64_t;
+
+  /// Logical (row, column) coordinate
+  using TensorCoord = MatrixCoord;
+
+  /// Stride vector type
+  using Stride = Coord<kStrideRank, Index, LongIndex>;
+
+  //
+  // Underlying pitch-linear layout
+  //
+
+  using Base = TensorOpMultiplicandCrosswise128x4;
+
+private:
+
+  //
+  // State
+  //
+
+  Base base_;
+
+public:
+  //
+  // Interface
+  //
+
+  /// Constructs from a leading dimension
+  CUTLASS_HOST_DEVICE
+  ColumnMajorTensorOpMultiplicandCrosswise128x4(Index ldm = 0) : base_(ldm) {}
+
+  /// Constructs from a stride vector
+  CUTLASS_HOST_DEVICE
+  ColumnMajorTensorOpMultiplicandCrosswise128x4(Stride stride) : base_(stride) {}
+
+  /// Layout for a tightly packed tensor; the column count becomes the leading dimension
+  CUTLASS_HOST_DEVICE
+  static ColumnMajorTensorOpMultiplicandCrosswise128x4 packed(TensorCoord const &extent) {
+    return ColumnMajorTensorOpMultiplicandCrosswise128x4(extent.column());
+  }
+
+  /// Linear-memory offset of a (row, column) coordinate, forwarded to the
+  /// base layout as (contiguous = row, strided = column)
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+    return base_(PitchLinearCoord(coord.row(), coord.column()));
+  }
+
+  /// Stride of the layout (by value)
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return base_.stride();
+  }
+
+  /// Stride of the layout (mutable reference)
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return base_.stride();
+  }
+
+  /// Number of contiguous elements needed to store a tensor of the given extent
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &extent) const {
+    return base_.capacity(PitchLinearCoord(extent.row(), extent.column()));
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Row-major adapter over TensorOpMultiplicandCrosswise128x4: matrix columns map
+/// to the contiguous dimension and matrix rows to the strided dimension.
+struct RowMajorTensorOpMultiplicandCrosswise128x4 {
+
+  /// Number of logical dimensions
+  static int const kRank = 2;
+
+  /// Number of stride components
+  static int const kStrideRank = 1;
+
+  /// Coordinate index type
+  using Index = int32_t;
+
+  /// Wide index type for linear offsets
+  using LongIndex = int64_t;
+
+  /// Logical (row, column) coordinate
+  using TensorCoord = MatrixCoord;
+
+  /// Stride vector type
+  using Stride = Coord<kStrideRank, Index, LongIndex>;
+
+  //
+  // Underlying pitch-linear layout
+  //
+
+  using Base = TensorOpMultiplicandCrosswise128x4;
+
+private:
+
+  //
+  // State
+  //
+
+  Base base_;
+
+public:
+  //
+  // Interface
+  //
+
+  /// Constructs from a leading dimension
+  CUTLASS_HOST_DEVICE
+  RowMajorTensorOpMultiplicandCrosswise128x4(Index ldm = 0) : base_(ldm) {}
+
+  /// Constructs from a stride vector
+  CUTLASS_HOST_DEVICE
+  RowMajorTensorOpMultiplicandCrosswise128x4(Stride stride) : base_(stride) {}
+
+  /// Layout for a tightly packed tensor; the row count becomes the leading dimension
+  CUTLASS_HOST_DEVICE
+  static RowMajorTensorOpMultiplicandCrosswise128x4 packed(TensorCoord const &extent) {
+    return RowMajorTensorOpMultiplicandCrosswise128x4(extent.row());
+  }
+
+  /// Linear-memory offset of a (row, column) coordinate, forwarded to the
+  /// base layout as (contiguous = column, strided = row)
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+    return base_(PitchLinearCoord(coord.column(), coord.row()));
+  }
+
+  /// Stride of the layout (by value)
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return base_.stride();
+  }
+
+  /// Stride of the layout (mutable reference)
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return base_.stride();
+  }
+
+  /// Number of contiguous elements needed to store a tensor of the given extent
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &extent) const {
+    return base_.capacity(PitchLinearCoord(extent.column(), extent.row()));
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace layout
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/layout/vector.h b/lightllm-kernel/cutlass/include/cutlass/layout/vector.h
new file mode 100755
index 000000000..56506feab
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/layout/vector.h
@@ -0,0 +1,105 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines layout functions used for rank=1 vectors.
+*/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/coord.h"
+
+namespace cutlass {
+namespace layout {
+
+/// Layout for rank-1 vectors stored densely: element i lives at offset i.
+class PackedVectorLayout {
+public:
+  /// Number of logical dimensions
+  static int const kRank = 1;
+
+  /// Number of stride components
+  static int const kStrideRank = 1;
+
+  /// Coordinate index type
+  using Index = int32_t;
+
+  /// Wide index type for linear offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate (a single index)
+  using TensorCoord = Coord<kRank, Index>;
+
+  /// Stride vector type
+  using Stride = Coord<kStrideRank, Index>;
+
+private:
+
+  //
+  // Stateless: no stride vector is stored
+  //
+
+public:
+  //
+  // Interface
+  //
+
+  /// Default ctor; there is no state to initialize
+  CUTLASS_HOST_DEVICE
+  PackedVectorLayout() {}
+
+  /// Layout for a tightly packed vector; `size` does not affect the result
+  CUTLASS_HOST_DEVICE
+  static PackedVectorLayout packed(TensorCoord const &size) {
+    CUTLASS_UNUSED(size);
+    return PackedVectorLayout();
+  }
+
+  /// Offset of a coordinate in linear memory: the coordinate's only index
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(TensorCoord const &coord) const {
+    return LongIndex(coord[0]);
+  }
+
+  /// Stride of the layout, always make_Coord(1)
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return make_Coord(1);
+  }
+
+  /// Number of contiguous elements needed to store a vector of the given size
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &size) const {
+    return LongIndex(size[0]);
+  }
+};
+
+} // namespace layout
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/matrix.h b/lightllm-kernel/cutlass/include/cutlass/matrix.h
new file mode 100755
index 000000000..5d8ccb3c1
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/matrix.h
@@ -0,0 +1,14129 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*!
+  \file
+  \brief Matrix classes with value semantics.
+*/
+
+#pragma once
+
+#if !defined(__CUDACC_RTC__)
+#include <iosfwd>
+#include <cmath>
+#endif
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/layout/matrix.h"
+
+namespace cutlass {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Primary template with partial specializations to follow
+template <typename Element, int Rows, int Columns> struct Matrix;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// 1-by-2 matrix (row vector) with value semantics; elements are stored row-major in `data`.
+template <typename Element_>
+struct Matrix<Element_, 1, 2> {
+
+  //
+  // Type definitions
+  //
+
+  /// Element data type
+  using Element = Element_;
+
+  /// Number of rows in matrix
+  static int const kRows = 1;
+
+  /// Number of columns in matrix
+  static int const kColumns = 2;
+
+  /// Layout of matrix in underlying array
+  using Layout = layout::RowMajor;
+
+  /// Number of elements in matrix
+  static int const kCount = 2;
+
+  //
+  // Data members
+  //
+
+  /// Elements of the matrix in row-major layout
+  Array<Element, kCount> data;
+
+  //
+  // Methods
+  //
+
+  /// Constructs a zero matrix
+  CUTLASS_HOST_DEVICE
+  Matrix() {
+    data.clear();
+  }
+
+  /// Copy constructor for a 1-by-2 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix(Matrix const &rhs) {
+    data = rhs.data;
+  }
+
+  /// Constructs a 1-by-2 matrix from scalar elements
+  CUTLASS_HOST_DEVICE
+  Matrix(
+    Element _0_0, Element _0_1
+  ) {
+    data[0] = _0_0;  data[1] = _0_1;
+  }
+
+  /// Constructs a matrix from a uniform element
+  CUTLASS_HOST_DEVICE
+  static Matrix uniform(Element s) {
+    Matrix m;
+
+    m.data[0] = s;
+    m.data[1] = s;
+
+    return m;
+  }
+
+  /// Constructs a matrix from a uniform element 1
+  CUTLASS_HOST_DEVICE
+  static Matrix ones() {
+    return uniform(Element(1));
+  }
+
+  /// Constructs a matrix from a uniform element 0
+  CUTLASS_HOST_DEVICE
+  static Matrix zero() {
+    return Matrix();
+  }
+
+  /// Returns a transposed matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 1> transpose() const {
+    Matrix<Element, 2, 1> mt;
+
+    mt.data[0] = data[0];
+    mt.data[1] = data[1];
+
+    return mt;
+  }
+
+  /// Accesses an element by coordinate (i must be 0: the matrix has a single row)
+  CUTLASS_HOST_DEVICE
+  Element at(int i, int j) const {
+    return data[i * 1 + j];
+  }
+
+  /// Accesses an element by coordinate (i must be 0: the matrix has a single row)
+  CUTLASS_HOST_DEVICE
+  Element & at(int i, int j) {
+    return data[i * 1 + j];
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element at(Coord<2> const &coord) const {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element & at(Coord<2> const &coord) {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element &at(int offset) {
+    return data[offset];
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element at(int offset) const {
+    return data[offset];
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element operator[](Coord<2> const &coord) const {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element & operator[](Coord<2> const &coord) {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element & operator[](int offset) {
+    return data[offset];
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element operator[](int offset) const {
+    return data[offset];
+  }
+
+  /// Gets a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 2> slice_1x2(int i = 0, int j = 0) const {
+    Matrix<Element, 1, 2> m;
+
+    m.data[0] = data[i * 2 + j + 0];
+    m.data[1] = data[i * 2 + j + 1];
+
+    return m;
+  }
+
+  /// Overwrites a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_1x2(Matrix<Element, 1, 2> const &m, int i = 0, int j = 0) {
+
+    data[i * 2 + j + 0] = m.data[0];
+    data[i * 2 + j + 1] = m.data[1];
+
+    return *this;
+  }
+
+  /// Returns row i as a 1-by-2 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 2> row(int i) const {
+    return slice_1x2(i, 0);
+  }
+
+  /// Overwrites row i with the 1-by-2 matrix v and returns *this
+  CUTLASS_HOST_DEVICE
+  Matrix &set_row(Matrix<Element, 1, 2> const &v, int i = 0) {
+    return set_slice_1x2(v, i, 0);
+  }
+
+  /// Forms a 1-by-2 matrix by horizontally concatenating an Element with an Element
+  CUTLASS_HOST_DEVICE
+  static Matrix hcat(Element lhs, Element rhs) {
+    return Matrix(
+      lhs, rhs);
+  }
+
+  /// Concatenates this matrix with an Element to form a 1-by-3 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 3> hcat(Element rhs) const {
+    return Matrix<Element, 1, 3>::hcat(*this, rhs);
+  }
+
+  /// Concatenates this matrix with a 1-by-2 matrix to form a 1-by-4 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 4> hcat(Matrix<Element, 1, 2> const & rhs) const {
+    return Matrix<Element, 1, 4>::hcat(*this, rhs);
+  }
+
+  /// Concatenates this matrix with a 1-by-2 matrix to form a 2-by-2 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 2> vcat(Matrix<Element, 1, 2> const & rhs) const {
+    return Matrix<Element, 2, 2>::vcat(*this, rhs);
+  }
+
+  /// Concatenates this matrix with a 2-by-2 matrix to form a 3-by-2 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 2> vcat(Matrix<Element, 2, 2> const & rhs) const {
+    return Matrix<Element, 3, 2>::vcat(*this, rhs);
+  }
+
+  /// Concatenates this matrix with a 3-by-2 matrix to form a 4-by-2 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 2> vcat(Matrix<Element, 3, 2> const & rhs) const {
+    return Matrix<Element, 4, 2>::vcat(*this, rhs);
+  }
+
+  /// Elementwise add operator (1-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix add(Matrix const &rhs) const {
+
+    Matrix result;
+
+    result.data[0] = data[0] + rhs.data[0];
+    result.data[1] = data[1] + rhs.data[1];
+
+    return result;
+  }
+
+  /// Elementwise add operator (1-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix operator +(Matrix const &rhs) const {
+    return add(rhs);
+  }
+
+  /// Elementwise add operator (1-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator +=(Matrix const &rhs) {
+
+    data[0] += rhs.data[0];
+    data[1] += rhs.data[1];
+
+    return *this;
+  }
+
+  /// Elementwise subtract operator (1-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix subtract(Matrix const &rhs) const {
+
+    Matrix result;
+
+    result.data[0] = data[0] - rhs.data[0];
+    result.data[1] = data[1] - rhs.data[1];
+
+    return result;
+  }
+
+  /// Elementwise subtract operator (1-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix operator -(Matrix const &rhs) const {
+    return subtract(rhs);
+  }
+
+  /// Elementwise subtract operator (1-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator -=(Matrix const &rhs) {
+
+    data[0] -= rhs.data[0];
+    data[1] -= rhs.data[1];
+
+    return *this;
+  }
+
+  /// Elementwise multiply operator (1-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix multiply(Matrix const &rhs) const {
+
+    Matrix result;
+
+    result.data[0] = data[0] * rhs.data[0];
+    result.data[1] = data[1] * rhs.data[1];
+
+    return result;
+  }
+
+  /// Scalar multiply operator (1-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix multiply(Element const &s) const {
+
+    Matrix result;
+
+    result.data[0] = data[0] * s;
+    result.data[1] = data[1] * s;
+
+    return result;
+  }
+
+  /// Scalar multiply operator (1-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix operator *(Element const &s) const {
+    return multiply(s);
+  }
+
+  /// Scalar multiply operator (1-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator *=(Element const &s) {
+
+    data[0] *= s;
+    data[1] *= s;
+
+    return *this;
+  }
+
+  /// Elementwise divide operator (1-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix divide(Matrix const &rhs) const {
+
+    Matrix result;
+
+    result.data[0] = data[0] / rhs.data[0];
+    result.data[1] = data[1] / rhs.data[1];
+
+    return result;
+  }
+
+  /// Scalar divide operator (1-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix divide(Element const &s) const {
+
+    Matrix result;
+
+    result.data[0] = data[0] / s;
+    result.data[1] = data[1] / s;
+
+    return result;
+  }
+
+  /// Scalar divide operator (1-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix operator /(Element const &s) const {
+    return divide(s);
+  }
+
+  /// Scalar divide operator (1-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator /=(Element const &s) {
+
+    data[0] /= s;
+    data[1] /= s;
+
+    return *this;
+  }
+
+  /// Elementwise divide operator (1-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix operator /(Matrix const &rhs) const {
+    return divide(rhs);
+  }
+
+  /// Elementwise divide operator (1-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator /=(Matrix const &rhs) {
+
+    data[0] /= rhs.data[0];
+    data[1] /= rhs.data[1];
+
+    return *this;
+  }
+
+  /// Returns the elementwise negation of this matrix (reads *this, not the zeroed temporary)
+  CUTLASS_HOST_DEVICE
+  Matrix operator-() const {
+    Matrix m;
+
+    m.data[0] = -data[0];
+    m.data[1] = -data[1];
+
+    return m;
+  }
+
+  /// Matrix product of size 1-by-1-by-2
+  CUTLASS_HOST_DEVICE
+  Element product(Matrix<Element, 2, 1> const &rhs, Element accum = Element()) const {
+
+    // k=0
+    accum += data[0] * rhs.data[0];
+
+    // k=1
+    accum += data[1] * rhs.data[1];
+
+    return accum;
+  }
+
+  /// Matrix product of size 1-by-1-by-2
+  CUTLASS_HOST_DEVICE
+  Element operator*(Matrix<Element, 2, 1> const &rhs) const {
+    return product(rhs);
+  }
+
+  /// Matrix product of size 1-by-2-by-2
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 2> product(
+    Matrix<Element, 2, 2> const &rhs,
+    Matrix<Element, 1, 2> accum = Matrix<Element, 1, 2>()
+  ) const {
+
+    // k=0
+    accum.data[0] += data[0] * rhs.data[0];
+    accum.data[1] += data[0] * rhs.data[1];
+
+    // k=1
+    accum.data[0] += data[1] * rhs.data[2];
+    accum.data[1] += data[1] * rhs.data[3];
+
+    return accum;
+  }
+
+  /// Matrix product of size 1-by-2-by-2
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 2> operator*(Matrix<Element, 2, 2> const &rhs) const {
+    return product(rhs);
+  }
+
+  /// Matrix product of size 1-by-2-by-2
+  CUTLASS_HOST_DEVICE
+  Matrix & operator*=(Matrix<Element, 2, 2> const &rhs) {
+    *this = product(rhs);
+    return *this;
+  }
+
+  /// Matrix product of size 1-by-3-by-2
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 3> product(
+    Matrix<Element, 2, 3> const &rhs,
+    Matrix<Element, 1, 3> accum = Matrix<Element, 1, 3>()
+  ) const {
+
+    // k=0
+    accum.data[0] += data[0] * rhs.data[0];
+    accum.data[1] += data[0] * rhs.data[1];
+    accum.data[2] += data[0] * rhs.data[2];
+
+    // k=1
+    accum.data[0] += data[1] * rhs.data[3];
+    accum.data[1] += data[1] * rhs.data[4];
+    accum.data[2] += data[1] * rhs.data[5];
+
+    return accum;
+  }
+
+  /// Matrix product of size 1-by-3-by-2
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 3> operator*(Matrix<Element, 2, 3> const &rhs) const {
+    return product(rhs);
+  }
+
+  /// Matrix product of size 1-by-4-by-2
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 4> product(
+    Matrix<Element, 2, 4> const &rhs,
+    Matrix<Element, 1, 4> accum = Matrix<Element, 1, 4>()
+  ) const {
+
+    // k=0
+    accum.data[0] += data[0] * rhs.data[0];
+    accum.data[1] += data[0] * rhs.data[1];
+    accum.data[2] += data[0] * rhs.data[2];
+    accum.data[3] += data[0] * rhs.data[3];
+
+    // k=1
+    accum.data[0] += data[1] * rhs.data[4];
+    accum.data[1] += data[1] * rhs.data[5];
+    accum.data[2] += data[1] * rhs.data[6];
+    accum.data[3] += data[1] * rhs.data[7];
+
+    return accum;
+  }
+
+  /// Matrix product of size 1-by-4-by-2
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 4> operator*(Matrix<Element, 2, 4> const &rhs) const {
+    return product(rhs);
+  }
+
+  /// Dot product of vectors with extent 2
+  CUTLASS_HOST_DEVICE
+  Element dot(Matrix<Element, 2, 1> const &rhs, Element accum = Element()) const {
+
+    accum += data[0] * rhs.data[0];
+    accum += data[1] * rhs.data[1];
+    return accum;
+  }
+
+  /// Dot product of vectors with extent 2
+  CUTLASS_HOST_DEVICE
+  Element dot(Matrix<Element, 1, 2> const &rhs, Element accum = Element()) const {
+
+    accum += data[0] * rhs.data[0];
+    accum += data[1] * rhs.data[1];
+    return accum;
+  }
+
+  /// Returns the sum of elements, added to the initial value `accum`
+  CUTLASS_HOST_DEVICE
+  Element sum(Element accum = Element()) const {
+
+    accum += data[0];
+    accum += data[1];
+
+    return accum;
+  }
+
+  /// Returns the sum of squared elements, added to the initial value `accum`
+  CUTLASS_HOST_DEVICE
+  Element norm(Element accum = Element()) const {
+
+    accum += data[0] * data[0];
+    accum += data[1] * data[1];
+
+    return accum;
+  }
+
+  /// Returns square root of the norm
+  CUTLASS_HOST_DEVICE
+  Element magnitude() const {
+    return fast_sqrt(norm());
+  }
+
+  /// Returns the sum of diagonal elements (only element (0, 0) for a single-row matrix)
+  CUTLASS_HOST_DEVICE
+  Element trace(Element accum = Element()) const {
+
+    accum += data[0];
+
+    return accum;
+  }
+};
+
+/// Shorthand for the 1-by-2 specialization of Matrix
+template <typename Element>
+using Matrix1x2 = Matrix<Element, 1, 2>;
+
+
+/// Builds a Matrix1x2 from two scalars, deducing the element type from the
+/// arguments so callers do not have to spell out the template parameter.
+template <typename Element>
+CUTLASS_HOST_DEVICE
+Matrix1x2<Element> make_Matrix1x2(Element _0_0, Element _0_1) {
+  // Hand the scalars to the elementwise constructor in row-major order.
+  Matrix1x2<Element> result(_0_0, _0_1);
+  return result;
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// 1-by-3 matrix (row vector) with value semantics; elements are stored row-major in `data`.
+template <typename Element_>
+struct Matrix<Element_, 1, 3> {
+
+  //
+  // Type definitions
+  //
+
+  /// Element data type
+  using Element = Element_;
+
+  /// Number of rows in matrix
+  static int const kRows = 1;
+
+  /// Number of columns in matrix
+  static int const kColumns = 3;
+
+  /// Layout of matrix in underlying array
+  using Layout = layout::RowMajor;
+
+  /// Number of elements in matrix
+  static int const kCount = 3;
+
+  //
+  // Data members
+  //
+
+  /// Elements of the matrix in row-major layout
+  Array<Element, kCount> data;
+
+  //
+  // Methods
+  //
+
+  /// Constructs a zero matrix
+  CUTLASS_HOST_DEVICE
+  Matrix() {
+    data.clear();
+  }
+
+  /// Copy constructor for a 1-by-3 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix(Matrix const &rhs) {
+    data = rhs.data;
+  }
+
+  /// Constructs a 1-by-3 matrix from scalar elements
+  CUTLASS_HOST_DEVICE
+  Matrix(
+    Element _0_0, Element _0_1, Element _0_2
+  ) {
+    data[0] = _0_0;  data[1] = _0_1;  data[2] = _0_2;
+  }
+
+  /// Constructs a matrix from a uniform element
+  CUTLASS_HOST_DEVICE
+  static Matrix uniform(Element s) {
+    Matrix m;
+
+    m.data[0] = s;
+    m.data[1] = s;
+    m.data[2] = s;
+
+    return m;
+  }
+
+  /// Constructs a matrix from a uniform element 1
+  CUTLASS_HOST_DEVICE
+  static Matrix ones() {
+    return uniform(Element(1));
+  }
+
+  /// Constructs a matrix from a uniform element 0
+  CUTLASS_HOST_DEVICE
+  static Matrix zero() {
+    return Matrix();
+  }
+
+  /// Returns a transposed matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 1> transpose() const {
+    Matrix<Element, 3, 1> mt;
+
+    mt.data[0] = data[0];
+    mt.data[1] = data[1];
+    mt.data[2] = data[2];
+
+    return mt;
+  }
+
+  /// Accesses an element by coordinate (i must be 0: the matrix has a single row)
+  CUTLASS_HOST_DEVICE
+  Element at(int i, int j) const {
+    return data[i * 1 + j];
+  }
+
+  /// Accesses an element by coordinate (i must be 0: the matrix has a single row)
+  CUTLASS_HOST_DEVICE
+  Element & at(int i, int j) {
+    return data[i * 1 + j];
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element at(Coord<2> const &coord) const {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element & at(Coord<2> const &coord) {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element &at(int offset) {
+    return data[offset];
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element at(int offset) const {
+    return data[offset];
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element operator[](Coord<2> const &coord) const {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element & operator[](Coord<2> const &coord) {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element & operator[](int offset) {
+    return data[offset];
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element operator[](int offset) const {
+    return data[offset];
+  }
+
+  /// Gets a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 2> slice_1x2(int i = 0, int j = 0) const {
+    Matrix<Element, 1, 2> m;
+
+    m.data[0] = data[i * 3 + j + 0];
+    m.data[1] = data[i * 3 + j + 1];
+
+    return m;
+  }
+
+  /// Overwrites a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_1x2(Matrix<Element, 1, 2> const &m, int i = 0, int j = 0) {
+
+    data[i * 3 + j + 0] = m.data[0];
+    data[i * 3 + j + 1] = m.data[1];
+
+    return *this;
+  }
+
+  /// Gets a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 3> slice_1x3(int i = 0, int j = 0) const {
+    Matrix<Element, 1, 3> m;
+
+    m.data[0] = data[i * 3 + j + 0];
+    m.data[1] = data[i * 3 + j + 1];
+    m.data[2] = data[i * 3 + j + 2];
+
+    return m;
+  }
+
+  /// Overwrites a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_1x3(Matrix<Element, 1, 3> const &m, int i = 0, int j = 0) {
+
+    data[i * 3 + j + 0] = m.data[0];
+    data[i * 3 + j + 1] = m.data[1];
+    data[i * 3 + j + 2] = m.data[2];
+
+    return *this;
+  }
+
+  /// Returns row i as a 1-by-3 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 3> row(int i) const {
+    return slice_1x3(i, 0);
+  }
+
+  /// Overwrites row i with the 1-by-3 matrix v and returns *this
+  CUTLASS_HOST_DEVICE
+  Matrix &set_row(Matrix<Element, 1, 3> const &v, int i = 0) {
+    return set_slice_1x3(v, i, 0);
+  }
+
+  /// Forms a 1-by-3 matrix by horizontally concatenating an Element with a 1-by-2 matrix
+  CUTLASS_HOST_DEVICE
+  static Matrix hcat(Element lhs, Matrix<Element, 1, 2> const & rhs) {
+    return Matrix(
+      lhs, rhs.at(0, 0), rhs.at(0, 1));
+  }
+
+  /// Forms a 1-by-3 matrix by horizontally concatenating a 1-by-2 matrix with an Element
+  CUTLASS_HOST_DEVICE
+  static Matrix hcat(Matrix<Element, 1, 2> const & lhs, Element rhs) {
+    return Matrix(
+      lhs.at(0, 0), lhs.at(0, 1), rhs);
+  }
+
+  /// Concatenates this matrix with an Element to form a 1-by-4 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 4> hcat(Element rhs) const {
+    return Matrix<Element, 1, 4>::hcat(*this, rhs);
+  }
+
+  /// Concatenates this matrix with a 1-by-3 matrix to form a 2-by-3 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 3> vcat(Matrix<Element, 1, 3> const & rhs) const {
+    return Matrix<Element, 2, 3>::vcat(*this, rhs);
+  }
+
+  /// Concatenates this matrix with a 2-by-3 matrix to form a 3-by-3 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 3> vcat(Matrix<Element, 2, 3> const & rhs) const {
+    return Matrix<Element, 3, 3>::vcat(*this, rhs);
+  }
+
+  /// Concatenates this matrix with a 3-by-3 matrix to form a 4-by-3 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 3> vcat(Matrix<Element, 3, 3> const & rhs) const {
+    return Matrix<Element, 4, 3>::vcat(*this, rhs);
+  }
+
+  /// Elementwise add operator (1-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix add(Matrix const &rhs) const {
+
+    Matrix result;
+
+    result.data[0] = data[0] + rhs.data[0];
+    result.data[1] = data[1] + rhs.data[1];
+    result.data[2] = data[2] + rhs.data[2];
+
+    return result;
+  }
+
+  /// Elementwise add operator (1-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix operator +(Matrix const &rhs) const {
+    return add(rhs);
+  }
+
+  /// Elementwise add operator (1-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator +=(Matrix const &rhs) {
+
+    data[0] += rhs.data[0];
+    data[1] += rhs.data[1];
+    data[2] += rhs.data[2];
+
+    return *this;
+  }
+
+  /// Elementwise subtract operator (1-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix subtract(Matrix const &rhs) const {
+
+    Matrix result;
+
+    result.data[0] = data[0] - rhs.data[0];
+    result.data[1] = data[1] - rhs.data[1];
+    result.data[2] = data[2] - rhs.data[2];
+
+    return result;
+  }
+
+  /// Elementwise subtract operator (1-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix operator -(Matrix const &rhs) const {
+    return subtract(rhs);
+  }
+
+  /// Elementwise subtract operator (1-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator -=(Matrix const &rhs) {
+
+    data[0] -= rhs.data[0];
+    data[1] -= rhs.data[1];
+    data[2] -= rhs.data[2];
+
+    return *this;
+  }
+
+  /// Elementwise multiply operator (1-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix multiply(Matrix const &rhs) const {
+
+    Matrix result;
+
+    result.data[0] = data[0] * rhs.data[0];
+    result.data[1] = data[1] * rhs.data[1];
+    result.data[2] = data[2] * rhs.data[2];
+
+    return result;
+  }
+
+  /// Scalar multiply operator (1-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix multiply(Element const &s) const {
+
+    Matrix result;
+
+    result.data[0] = data[0] * s;
+    result.data[1] = data[1] * s;
+    result.data[2] = data[2] * s;
+
+    return result;
+  }
+
+  /// Scalar multiply operator (1-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix operator *(Element const &s) const {
+    return multiply(s);
+  }
+
+  /// Scalar multiply operator (1-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator *=(Element const &s) {
+
+    data[0] *= s;
+    data[1] *= s;
+    data[2] *= s;
+
+    return *this;
+  }
+
+  /// Elementwise divide operator (1-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix divide(Matrix const &rhs) const {
+
+    Matrix result;
+
+    result.data[0] = data[0] / rhs.data[0];
+    result.data[1] = data[1] / rhs.data[1];
+    result.data[2] = data[2] / rhs.data[2];
+
+    return result;
+  }
+
+  /// Scalar divide operator (1-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix divide(Element const &s) const {
+
+    Matrix result;
+
+    result.data[0] = data[0] / s;
+    result.data[1] = data[1] / s;
+    result.data[2] = data[2] / s;
+
+    return result;
+  }
+
+  /// Scalar divide operator (1-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix operator /(Element const &s) const {
+    return divide(s);
+  }
+
+  /// Scalar divide operator (1-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator /=(Element const &s) {
+
+    data[0] /= s;
+    data[1] /= s;
+    data[2] /= s;
+
+    return *this;
+  }
+
+  /// Elementwise divide operator (1-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix operator /(Matrix const &rhs) const {
+    return divide(rhs);
+  }
+
+  /// Elementwise divide operator (1-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator /=(Matrix const &rhs) {
+
+    data[0] /= rhs.data[0];
+    data[1] /= rhs.data[1];
+    data[2] /= rhs.data[2];
+
+    return *this;
+  }
+
+  /// Returns the elementwise negation of this matrix (reads *this, not the zeroed temporary)
+  CUTLASS_HOST_DEVICE
+  Matrix operator-() const {
+    Matrix m;
+
+    m.data[0] = -data[0];
+    m.data[1] = -data[1];
+    m.data[2] = -data[2];
+
+    return m;
+  }
+
+  /// Matrix product of size 1-by-1-by-3
+  CUTLASS_HOST_DEVICE
+  Element product(Matrix<Element, 3, 1> const &rhs, Element accum = Element()) const {
+
+    // k=0
+    accum += data[0] * rhs.data[0];
+
+    // k=1
+    accum += data[1] * rhs.data[1];
+
+    // k=2
+    accum += data[2] * rhs.data[2];
+
+    return accum;
+  }
+
+  /// Matrix product of size 1-by-1-by-3
+  CUTLASS_HOST_DEVICE
+  Element operator*(Matrix<Element, 3, 1> const &rhs) const {
+    return product(rhs);
+  }
+
+  /// Matrix product of size 1-by-2-by-3
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 2> product(
+    Matrix<Element, 3, 2> const &rhs,
+    Matrix<Element, 1, 2> accum = Matrix<Element, 1, 2>()
+  ) const {
+
+    // k=0
+    accum.data[0] += data[0] * rhs.data[0];
+    accum.data[1] += data[0] * rhs.data[1];
+
+    // k=1
+    accum.data[0] += data[1] * rhs.data[2];
+    accum.data[1] += data[1] * rhs.data[3];
+
+    // k=2
+    accum.data[0] += data[2] * rhs.data[4];
+    accum.data[1] += data[2] * rhs.data[5];
+
+    return accum;
+  }
+
+  /// Matrix product of size 1-by-2-by-3
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 2> operator*(Matrix<Element, 3, 2> const &rhs) const {
+    return product(rhs);
+  }
+
+  /// Matrix product of size 1-by-3-by-3
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 3> product(
+    Matrix<Element, 3, 3> const &rhs,
+    Matrix<Element, 1, 3> accum = Matrix<Element, 1, 3>()
+  ) const {
+
+    // k=0
+    accum.data[0] += data[0] * rhs.data[0];
+    accum.data[1] += data[0] * rhs.data[1];
+    accum.data[2] += data[0] * rhs.data[2];
+
+    // k=1
+    accum.data[0] += data[1] * rhs.data[3];
+    accum.data[1] += data[1] * rhs.data[4];
+    accum.data[2] += data[1] * rhs.data[5];
+
+    // k=2
+    accum.data[0] += data[2] * rhs.data[6];
+    accum.data[1] += data[2] * rhs.data[7];
+    accum.data[2] += data[2] * rhs.data[8];
+
+    return accum;
+  }
+
+  /// Matrix product of size 1-by-3-by-3
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 3> operator*(Matrix<Element, 3, 3> const &rhs) const {
+    return product(rhs);
+  }
+
+  /// Matrix product of size 1-by-3-by-3
+  CUTLASS_HOST_DEVICE
+  Matrix & operator*=(Matrix<Element, 3, 3> const &rhs) {
+    *this = product(rhs);
+    return *this;
+  }
+
+  /// Matrix product of size 1-by-4-by-3
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 4> product(
+    Matrix<Element, 3, 4> const &rhs,
+    Matrix<Element, 1, 4> accum = Matrix<Element, 1, 4>()
+  ) const {
+
+    // k=0
+    accum.data[0] += data[0] * rhs.data[0];
+    accum.data[1] += data[0] * rhs.data[1];
+    accum.data[2] += data[0] * rhs.data[2];
+    accum.data[3] += data[0] * rhs.data[3];
+
+    // k=1
+    accum.data[0] += data[1] * rhs.data[4];
+    accum.data[1] += data[1] * rhs.data[5];
+    accum.data[2] += data[1] * rhs.data[6];
+    accum.data[3] += data[1] * rhs.data[7];
+
+    // k=2
+    accum.data[0] += data[2] * rhs.data[8];
+    accum.data[1] += data[2] * rhs.data[9];
+    accum.data[2] += data[2] * rhs.data[10];
+    accum.data[3] += data[2] * rhs.data[11];
+
+    return accum;
+  }
+
+  /// Matrix product of size 1-by-4-by-3
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 4> operator*(Matrix<Element, 3, 4> const &rhs) const {
+    return product(rhs);
+  }
+
+  /// Dot product of vectors with extent 3
+  CUTLASS_HOST_DEVICE
+  Element dot(Matrix<Element, 3, 1> const &rhs, Element accum = Element()) const {
+
+    accum += data[0] * rhs.data[0];
+    accum += data[1] * rhs.data[1];
+    accum += data[2] * rhs.data[2];
+    return accum;
+  }
+
+  /// Dot product of vectors with extent 3
+  CUTLASS_HOST_DEVICE
+  Element dot(Matrix<Element, 1, 3> const &rhs, Element accum = Element()) const {
+
+    accum += data[0] * rhs.data[0];
+    accum += data[1] * rhs.data[1];
+    accum += data[2] * rhs.data[2];
+    return accum;
+  }
+
+  /// Returns the sum of elements, added to the initial value `accum`
+  CUTLASS_HOST_DEVICE
+  Element sum(Element accum = Element()) const {
+
+    accum += data[0];
+    accum += data[1];
+    accum += data[2];
+
+    return accum;
+  }
+
+  /// Returns the sum of squared elements, added to the initial value `accum`
+  CUTLASS_HOST_DEVICE
+  Element norm(Element accum = Element()) const {
+
+    accum += data[0] * data[0];
+    accum += data[1] * data[1];
+    accum += data[2] * data[2];
+
+    return accum;
+  }
+
+  /// Returns square root of the norm
+  CUTLASS_HOST_DEVICE
+  Element magnitude() const {
+    return fast_sqrt(norm());
+  }
+
+  /// Returns the sum of diagonal elements (only element (0, 0) for a single-row matrix)
+  CUTLASS_HOST_DEVICE
+  Element trace(Element accum = Element()) const {
+
+    accum += data[0];
+
+    return accum;
+  }
+
+  /// Cross product: (a1*b2 - a2*b1, a2*b0 - a0*b2, a0*b1 - a1*b0)
+  CUTLASS_HOST_DEVICE
+  Matrix cross(Matrix const &rhs) const {
+    return Matrix(
+      data[1] * rhs.data[2] - data[2] * rhs.data[1],
+      data[2] * rhs.data[0] - data[0] * rhs.data[2],
+      data[0] * rhs.data[1] - data[1] * rhs.data[0]
+    );
+  }
+};
+
+/// Shorthand for the 1-by-3 specialization of Matrix
+template <typename Element>
+using Matrix1x3 = Matrix<Element, 1, 3>;
+
+
+/// Builds a Matrix1x3 from three scalars, deducing the element type from the
+/// arguments so callers do not have to spell out the template parameter.
+template <typename Element>
+CUTLASS_HOST_DEVICE
+Matrix1x3<Element> make_Matrix1x3(Element _0_0, Element _0_1, Element _0_2) {
+  // Hand the scalars to the elementwise constructor in row-major order.
+  Matrix1x3<Element> result(_0_0, _0_1, _0_2);
+  return result;
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// 1-by-4 matrix template class definition
+template <typename Element_>
+struct Matrix<Element_, 1, 4> {
+
+  //
+  // Type definitions
+  //
+
+  /// Element data type
+  using Element = Element_;
+
+  /// Number of rows in matrix
+  static int const kRows = 1;
+
+  /// Number of columns in matrix
+  static int const kColumns = 4;
+
+  /// Layout of matrix in underlying array
+  using Layout = layout::RowMajor;
+
+  /// Number of elements in matrix
+  static int const kCount = 4;
+
+  //
+  // Data members
+  //
+
+  /// Elements of the matrix in row-major layout
+  Array<Element, kCount> data;
+
+  //
+  // Methods
+  //
+
+  /// Constructs a zero matrix
+  CUTLASS_HOST_DEVICE
+  Matrix() {
+    data.clear();
+  }
+  
+  /// Copy constructor for a 1-by-4 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix(Matrix const &rhs) {
+    data = rhs.data;
+  }
+    
+  /// Constucts a 1-by-4 matrix from scalar elements
+  CUTLASS_HOST_DEVICE
+  Matrix(
+    Element _0_0, Element _0_1, Element _0_2, Element _0_3
+  ) {
+
+    data[0] = _0_0;  data[1] = _0_1;  data[2] = _0_2;  data[3] = _0_3;
+  }
+    
+  /// Constructs a matrix from a uniform element
+  CUTLASS_HOST_DEVICE
+  static Matrix uniform(Element s) {
+    Matrix m;
+    
+    m.data[0] = s;
+    m.data[1] = s;
+    m.data[2] = s;
+    m.data[3] = s;
+
+    return m;
+  }
+
+  /// Constructs a matrix from a uniform element 1
+  CUTLASS_HOST_DEVICE
+  static Matrix ones() {
+    return uniform(Element(1));
+  }
+
+  /// Constructs a matrix from a uniform element 0
+  CUTLASS_HOST_DEVICE
+  static Matrix zero() {
+    return Matrix();
+  }
+  
+  /// Returns a transposed matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 1> transpose() const {
+    Matrix<Element, 4, 1> mt;
+    
+    mt.data[0] = data[0];
+    mt.data[1] = data[1];
+    mt.data[2] = data[2];
+    mt.data[3] = data[3];
+
+    return mt;
+  }
+    
+  /// Accesses an element by coordinate (offset i * 1 + j; NOTE(review): only row i == 0 exists in a 1-by-4)
+  CUTLASS_HOST_DEVICE
+  Element at(int i, int j) const {
+    return data[i * 1 + j];
+  }
+
+  /// Accesses a mutable element by coordinate (offset i * 1 + j; NOTE(review): only row i == 0 exists in a 1-by-4)
+  CUTLASS_HOST_DEVICE
+  Element & at(int i, int j) {
+    return data[i * 1 + j];
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element at(Coord<2> const &coord) const {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element & at(Coord<2> const &coord) {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element &at(int offset) {
+    return data[offset];
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element at(int offset) const {
+    return data[offset];
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element operator[](Coord<2> const &coord) const {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element & operator[](Coord<2> const &coord) {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element & operator[](int offset) {
+    return data[offset];
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element operator[](int offset) const {
+    return data[offset];
+  }
+  
+  /// Gets a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 2> slice_1x2(int i = 0, int j = 0) const {
+    Matrix<Element, 1, 2> m;
+    
+    m.data[0] = data[i * 4 + j + 0];
+    m.data[1] = data[i * 4 + j + 1];
+
+    return m;
+  }
+
+  /// Overwrites a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_1x2(Matrix<Element, 1, 2> const &m, int i = 0, int j = 0) {
+    
+    data[i * 4 + j + 0] = m.data[0];
+    data[i * 4 + j + 1] = m.data[1];
+
+    return *this;
+  }
+    
+  /// Gets a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 3> slice_1x3(int i = 0, int j = 0) const {
+    Matrix<Element, 1, 3> m;
+    
+    m.data[0] = data[i * 4 + j + 0];
+    m.data[1] = data[i * 4 + j + 1];
+    m.data[2] = data[i * 4 + j + 2];
+
+    return m;
+  }
+
+  /// Overwrites a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_1x3(Matrix<Element, 1, 3> const &m, int i = 0, int j = 0) {
+    
+    data[i * 4 + j + 0] = m.data[0];
+    data[i * 4 + j + 1] = m.data[1];
+    data[i * 4 + j + 2] = m.data[2];
+
+    return *this;
+  }
+    
+  /// Gets a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 4> slice_1x4(int i = 0, int j = 0) const {
+    Matrix<Element, 1, 4> m;
+    
+    m.data[0] = data[i * 4 + j + 0];
+    m.data[1] = data[i * 4 + j + 1];
+    m.data[2] = data[i * 4 + j + 2];
+    m.data[3] = data[i * 4 + j + 3];
+
+    return m;
+  }
+
+  /// Overwrites a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_1x4(Matrix<Element, 1, 4> const &m, int i = 0, int j = 0) {
+    
+    data[i * 4 + j + 0] = m.data[0];
+    data[i * 4 + j + 1] = m.data[1];
+    data[i * 4 + j + 2] = m.data[2];
+    data[i * 4 + j + 3] = m.data[3];
+
+    return *this;
+  }
+    
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 4> row(int i) const {
+    return slice_1x4(i, 0);
+  }
+
+  CUTLASS_HOST_DEVICE
+  Matrix &set_row(Matrix<Element, 1, 4> const &v, int i = 0) {
+    return set_slice_1x4(v, i, 0);
+  }
+    
+  /// Forms a 1-by-4 matrix by horizontally concatenating an Element with a 1-by-3 matrix
+  CUTLASS_HOST_DEVICE
+  static Matrix hcat(Element lhs, Matrix<Element, 1, 3> const & rhs) {
+    return Matrix(
+      lhs, rhs.at(0, 0), rhs.at(0, 1), rhs.at(0, 2));
+  }
+  
+  /// Forms a 1-by-4 matrix by horizontally concatenating a 1-by-2 matrix with a 1-by-2 matrix
+  CUTLASS_HOST_DEVICE
+  static Matrix hcat(Matrix<Element, 1, 2> const & lhs, Matrix<Element, 1, 2> const & rhs) {
+    return Matrix(
+      lhs.at(0, 0), lhs.at(0, 1), rhs.at(0, 0), rhs.at(0, 1));
+  }
+  
+  /// Forms a 1-by-4 matrix by horizontally concatenating a 1-by-3 matrix with an Element
+  CUTLASS_HOST_DEVICE
+  static Matrix hcat(Matrix<Element, 1, 3> const & lhs, Element rhs) {
+    return Matrix(
+      lhs.at(0, 0), lhs.at(0, 1), lhs.at(0, 2), rhs);
+  }
+  
+  /// Concatenates this matrix with a 1-by-4 matrix to form a 2-by-4 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 4> vcat(Matrix<Element, 1, 4> const & rhs) const {
+    return Matrix<Element, 2, 4>::vcat(*this, rhs);
+  }
+    
+  /// Concatenates this matrix with a 2-by-4 matrix to form a 3-by-4 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 4> vcat(Matrix<Element, 2, 4> const & rhs) const {
+    return Matrix<Element, 3, 4>::vcat(*this, rhs);
+  }
+    
+  /// Concatenates this matrix with a 3-by-4 matrix to form a 4-by-4 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 4> vcat(Matrix<Element, 3, 4> const & rhs) const {
+    return Matrix<Element, 4, 4>::vcat(*this, rhs);
+  }
+    
+  /// Elementwise add operator (1-by-4)
+  CUTLASS_HOST_DEVICE
+  Matrix add(Matrix const &rhs) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] + rhs.data[0];
+    result.data[1] = data[1] + rhs.data[1];
+    result.data[2] = data[2] + rhs.data[2];
+    result.data[3] = data[3] + rhs.data[3];
+
+    return result;
+  }
+      
+  /// Elementwise add operator (1-by-4)
+  CUTLASS_HOST_DEVICE
+  Matrix operator +(Matrix const &rhs) const {
+    return add(rhs);
+  }
+
+  /// Elementwise add operator (1-by-4)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator +=(Matrix const &rhs) {
+    
+    data[0] += rhs.data[0];
+    data[1] += rhs.data[1];
+    data[2] += rhs.data[2];
+    data[3] += rhs.data[3];
+
+    return *this;
+  }
+        
+  /// Elementwise subtract operator (1-by-4)
+  CUTLASS_HOST_DEVICE
+  Matrix subtract(Matrix const &rhs) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] - rhs.data[0];
+    result.data[1] = data[1] - rhs.data[1];
+    result.data[2] = data[2] - rhs.data[2];
+    result.data[3] = data[3] - rhs.data[3];
+
+    return result;
+  }
+      
+  /// Elementwise subtract operator (1-by-4)
+  CUTLASS_HOST_DEVICE
+  Matrix operator -(Matrix const &rhs) const {
+    return subtract(rhs);
+  }
+
+  /// Elementwise subtract operator (1-by-4)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator -=(Matrix const &rhs) {
+    
+    data[0] -= rhs.data[0];
+    data[1] -= rhs.data[1];
+    data[2] -= rhs.data[2];
+    data[3] -= rhs.data[3];
+
+    return *this;
+  }
+        
+  /// Elementwise multiply operator (1-by-4)
+  CUTLASS_HOST_DEVICE
+  Matrix multiply(Matrix const &rhs) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] * rhs.data[0];
+    result.data[1] = data[1] * rhs.data[1];
+    result.data[2] = data[2] * rhs.data[2];
+    result.data[3] = data[3] * rhs.data[3];
+
+    return result;
+  }
+      
+  /// Scalar multiply operator (1-by-4)
+  CUTLASS_HOST_DEVICE
+  Matrix multiply(Element const &s) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] * s;
+    result.data[1] = data[1] * s;
+    result.data[2] = data[2] * s;
+    result.data[3] = data[3] * s;
+
+    return result;
+  }
+
+  /// Scalar multiply operator (1-by-4)
+  CUTLASS_HOST_DEVICE
+  Matrix operator *(Element const &s) const {
+    return multiply(s);
+  }
+
+  /// Scalar multiply operator (1-by-4)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator *=(Element const &s) {
+    
+    data[0] *= s;
+    data[1] *= s;
+    data[2] *= s;
+    data[3] *= s;
+
+    return *this;
+  }
+        
+  /// Elementwise divide operator (1-by-4)
+  CUTLASS_HOST_DEVICE
+  Matrix divide(Matrix const &rhs) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] / rhs.data[0];
+    result.data[1] = data[1] / rhs.data[1];
+    result.data[2] = data[2] / rhs.data[2];
+    result.data[3] = data[3] / rhs.data[3];
+
+    return result;
+  }
+      
+  /// Scalar divide operator (1-by-4)
+  CUTLASS_HOST_DEVICE
+  Matrix divide(Element const &s) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] / s;
+    result.data[1] = data[1] / s;
+    result.data[2] = data[2] / s;
+    result.data[3] = data[3] / s;
+
+    return result;
+  }
+
+  /// Scalar divide operator (1-by-4)
+  CUTLASS_HOST_DEVICE
+  Matrix operator /(Element const &s) const {
+    return divide(s);
+  }
+
+  /// Scalar divide operator (1-by-4)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator /=(Element const &s) {
+    
+    data[0] /= s;
+    data[1] /= s;
+    data[2] /= s;
+    data[3] /= s;
+
+    return *this;
+  }
+        
+  /// Elementwise divide operator (1-by-4)
+  CUTLASS_HOST_DEVICE
+  Matrix operator /(Matrix const &rhs) const {
+    return divide(rhs);
+  }
+
+  /// Elementwise divide operator (1-by-4)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator /=(Matrix const &rhs) {
+    
+    data[0] /= rhs.data[0];
+    data[1] /= rhs.data[1];
+    data[2] /= rhs.data[2];
+    data[3] /= rhs.data[3];
+
+    return *this;
+  }
+        
+  /// Negates each element of the matrix and returns the result as a new matrix
+  CUTLASS_HOST_DEVICE
+  Matrix operator-() const {
+    Matrix m;
+    // Negate this matrix's elements; m itself starts out zero-initialized.
+    m.data[0] = -data[0];
+    m.data[1] = -data[1];
+    m.data[2] = -data[2];
+    m.data[3] = -data[3];
+
+    return m;
+  }
+  
+  /// Matrix product of size 1-by-1-by-4
+  CUTLASS_HOST_DEVICE
+  Element product(Matrix<Element, 4, 1> const &rhs, Element accum = Element()) const {
+    
+    // k=0
+    accum += data[0] * rhs.data[0];
+
+    // k=1
+    accum += data[1] * rhs.data[1];
+
+    // k=2
+    accum += data[2] * rhs.data[2];
+
+    // k=3
+    accum += data[3] * rhs.data[3];
+
+    return accum;
+  }
+
+  /// Matrix product of size 1-by-1-by-4
+  CUTLASS_HOST_DEVICE
+  Element operator*(Matrix<Element, 4, 1> const &rhs) const {
+    return product(rhs);
+  }
+  
+  /// Matrix product of size 1-by-2-by-4
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 2> product(
+    Matrix<Element, 4, 2> const &rhs,
+    Matrix<Element, 1, 2> accum = Matrix<Element, 1, 2>()
+  ) const {
+    
+    // k=0
+    accum.data[0] += data[0] * rhs.data[0];
+    accum.data[1] += data[0] * rhs.data[1];
+
+    // k=1
+    accum.data[0] += data[1] * rhs.data[2];
+    accum.data[1] += data[1] * rhs.data[3];
+
+    // k=2
+    accum.data[0] += data[2] * rhs.data[4];
+    accum.data[1] += data[2] * rhs.data[5];
+
+    // k=3
+    accum.data[0] += data[3] * rhs.data[6];
+    accum.data[1] += data[3] * rhs.data[7];
+
+    return accum;
+  }
+
+  /// Matrix product of size 1-by-2-by-4
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 2> operator*(Matrix<Element, 4, 2> const &rhs) const {
+    return product(rhs);
+  }
+  
+  /// Matrix product of size 1-by-3-by-4
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 3> product(
+    Matrix<Element, 4, 3> const &rhs,
+    Matrix<Element, 1, 3> accum = Matrix<Element, 1, 3>()
+  ) const {
+    
+    // k=0
+    accum.data[0] += data[0] * rhs.data[0];
+    accum.data[1] += data[0] * rhs.data[1];
+    accum.data[2] += data[0] * rhs.data[2];
+
+    // k=1
+    accum.data[0] += data[1] * rhs.data[3];
+    accum.data[1] += data[1] * rhs.data[4];
+    accum.data[2] += data[1] * rhs.data[5];
+
+    // k=2
+    accum.data[0] += data[2] * rhs.data[6];
+    accum.data[1] += data[2] * rhs.data[7];
+    accum.data[2] += data[2] * rhs.data[8];
+
+    // k=3
+    accum.data[0] += data[3] * rhs.data[9];
+    accum.data[1] += data[3] * rhs.data[10];
+    accum.data[2] += data[3] * rhs.data[11];
+
+    return accum;
+  }
+
+  /// Matrix product of size 1-by-3-by-4
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 3> operator*(Matrix<Element, 4, 3> const &rhs) const {
+    return product(rhs);
+  }
+  
+  /// Matrix product of size 1-by-4-by-4
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 4> product(
+    Matrix<Element, 4, 4> const &rhs,
+    Matrix<Element, 1, 4> accum = Matrix<Element, 1, 4>()
+  ) const {
+    
+    // k=0
+    accum.data[0] += data[0] * rhs.data[0];
+    accum.data[1] += data[0] * rhs.data[1];
+    accum.data[2] += data[0] * rhs.data[2];
+    accum.data[3] += data[0] * rhs.data[3];
+
+    // k=1
+    accum.data[0] += data[1] * rhs.data[4];
+    accum.data[1] += data[1] * rhs.data[5];
+    accum.data[2] += data[1] * rhs.data[6];
+    accum.data[3] += data[1] * rhs.data[7];
+
+    // k=2
+    accum.data[0] += data[2] * rhs.data[8];
+    accum.data[1] += data[2] * rhs.data[9];
+    accum.data[2] += data[2] * rhs.data[10];
+    accum.data[3] += data[2] * rhs.data[11];
+
+    // k=3
+    accum.data[0] += data[3] * rhs.data[12];
+    accum.data[1] += data[3] * rhs.data[13];
+    accum.data[2] += data[3] * rhs.data[14];
+    accum.data[3] += data[3] * rhs.data[15];
+
+    return accum;
+  }
+
+  /// Matrix product of size 1-by-4-by-4
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 4> operator*(Matrix<Element, 4, 4> const &rhs) const {
+    return product(rhs);
+  }
+  
+  /// Matrix product of size 1-by-4-by-4
+  CUTLASS_HOST_DEVICE
+  Matrix & operator*=(Matrix<Element, 4, 4> const &rhs) {
+    *this = product(rhs);
+    return *this;
+  }
+    
+  /// Dot product of vectors with extent 4
+  CUTLASS_HOST_DEVICE
+  Element dot(Matrix<Element, 4, 1> const &rhs, Element accum = Element()) const {
+    
+    accum += data[0] * rhs.data[0];
+    accum += data[1] * rhs.data[1];
+    accum += data[2] * rhs.data[2];
+    accum += data[3] * rhs.data[3];
+    return accum;
+  }
+
+  /// Dot product of vectors with extent 4
+  CUTLASS_HOST_DEVICE
+  Element dot(Matrix<Element, 1, 4> const &rhs, Element accum = Element()) const {
+    
+    accum += data[0] * rhs.data[0];
+    accum += data[1] * rhs.data[1];
+    accum += data[2] * rhs.data[2];
+    accum += data[3] * rhs.data[3];
+    return accum;
+  }
+  
+  /// Returns the sum of elements
+  CUTLASS_HOST_DEVICE
+  Element sum(Element accum = Element()) const {
+    
+    accum += data[0];
+    accum += data[1];
+    accum += data[2];
+    accum += data[3];
+
+    return accum;
+  }  
+
+  /// Returns the sum of squared elements
+  CUTLASS_HOST_DEVICE
+  Element norm(Element accum = Element()) const {
+    
+    accum += data[0] * data[0];
+    accum += data[1] * data[1];
+    accum += data[2] * data[2];
+    accum += data[3] * data[3];
+
+    return accum;
+  }
+
+  /// Returns square root of the norm
+  CUTLASS_HOST_DEVICE
+  Element magnitude() const {
+    return fast_sqrt(norm());
+  }
+
+  /// Returns the sum of diagonal elements
+  CUTLASS_HOST_DEVICE
+  Element trace(Element accum = Element()) const {
+    
+    accum += data[0];
+
+    return accum;
+  }
+    
+};
+
+/// Template alias for 1-by-4 matrix
+template <typename Element>
+using Matrix1x4 = Matrix<Element, 1, 4>;
+
+
+/// Free function that builds a 1-by-4 matrix, inferring Element from the arguments
+template <typename Element>
+CUTLASS_HOST_DEVICE Matrix1x4<Element> make_Matrix1x4(
+    Element _0_0, Element _0_1, Element _0_2, Element _0_3
+) {
+  return Matrix1x4<Element>{
+    _0_0, _0_1, _0_2, _0_3
+  };
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// 2-by-1 matrix template class definition
+template <typename Element_>
+struct Matrix<Element_, 2, 1> {
+
+  //
+  // Type definitions
+  //
+
+  /// Element data type
+  using Element = Element_;
+
+  /// Number of rows in matrix
+  static int const kRows = 2;
+
+  /// Number of columns in matrix
+  static int const kColumns = 1;
+
+  /// Layout of matrix in underlying array
+  using Layout = layout::RowMajor;
+
+  /// Number of elements in matrix
+  static int const kCount = 2;
+
+  //
+  // Data members
+  //
+
+  /// Elements of the matrix in row-major layout
+  Array<Element, kCount> data;
+
+  //
+  // Methods
+  //
+
+  /// Constructs a zero matrix
+  CUTLASS_HOST_DEVICE
+  Matrix() {
+    data.clear();
+  }
+  
+  /// Copy constructor for a 2-by-1 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix(Matrix const &rhs) {
+    data = rhs.data;
+  }
+    
+  /// Constructs a 2-by-1 matrix from scalar elements
+  CUTLASS_HOST_DEVICE
+  Matrix(
+    Element _0_0, 
+    Element _1_0
+  ) {
+
+    data[0] = _0_0;
+    data[1] = _1_0;
+  }
+    
+  /// Constructs a matrix from a uniform element
+  CUTLASS_HOST_DEVICE
+  static Matrix uniform(Element s) {
+    Matrix m;
+    
+    m.data[0] = s;
+    m.data[1] = s;
+
+    return m;
+  }
+
+  /// Constructs a matrix from a uniform element 1
+  CUTLASS_HOST_DEVICE
+  static Matrix ones() {
+    return uniform(Element(1));
+  }
+
+  /// Constructs a matrix from a uniform element 0
+  CUTLASS_HOST_DEVICE
+  static Matrix zero() {
+    return Matrix();
+  }
+  
+  /// Returns a transposed matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 2> transpose() const {
+    Matrix<Element, 1, 2> mt;
+    
+    mt.data[0] = data[0];
+    mt.data[1] = data[1];
+
+    return mt;
+  }
+    
+  /// Accesses an element by coordinate (row-major offset i * kColumns + j, with kColumns = 1)
+  CUTLASS_HOST_DEVICE
+  Element at(int i, int j) const {
+    return data[i * 1 + j];
+  }
+
+  /// Accesses a mutable element by coordinate (row-major offset i * kColumns + j, with kColumns = 1)
+  CUTLASS_HOST_DEVICE
+  Element & at(int i, int j) {
+    return data[i * 1 + j];
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element at(Coord<2> const &coord) const {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element & at(Coord<2> const &coord) {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element &at(int offset) {
+    return data[offset];
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element at(int offset) const {
+    return data[offset];
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element operator[](Coord<2> const &coord) const {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element & operator[](Coord<2> const &coord) {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element & operator[](int offset) {
+    return data[offset];
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element operator[](int offset) const {
+    return data[offset];
+  }
+  
+  /// Gets a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 1> slice_2x1(int i = 0, int j = 0) const {
+    Matrix<Element, 2, 1> m;
+    
+    m.data[0] = data[i * 1 + j + 0];
+    m.data[1] = data[i * 1 + j + 1];
+
+    return m;
+  }
+
+  /// Overwrites a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_2x1(Matrix<Element, 2, 1> const &m, int i = 0, int j = 0) {
+    
+    data[i * 1 + j + 0] = m.data[0];
+    data[i * 1 + j + 1] = m.data[1];
+
+    return *this;
+  }
+    
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 1> column(int j) const {
+    return slice_2x1(0, j);
+  }
+
+  CUTLASS_HOST_DEVICE
+  Matrix &set_column(Matrix<Element, 2, 1> const &v, int j =0) {
+    return set_slice_2x1(v, 0, j);
+  }
+    
+  /// Concatenates this matrix with a 2-by-1 matrix to form a 2-by-2 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 2> hcat(Matrix<Element, 2, 1> const & rhs) const {
+    return Matrix<Element, 2, 2>::hcat(*this, rhs);
+  }
+    
+  /// Concatenates this matrix with a 2-by-2 matrix to form a 2-by-3 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 3> hcat(Matrix<Element, 2, 2> const & rhs) const {
+    return Matrix<Element, 2, 3>::hcat(*this, rhs);
+  }
+    
+  /// Concatenates this matrix with a 2-by-3 matrix to form a 2-by-4 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 4> hcat(Matrix<Element, 2, 3> const & rhs) const {
+    return Matrix<Element, 2, 4>::hcat(*this, rhs);
+  }
+    
+  /// Forms a 2-by-1 matrix by vertically concatenating an Element with an Element
+  CUTLASS_HOST_DEVICE
+  static Matrix vcat(Element upper, Element lower) {
+    return Matrix(
+      upper
+      , lower);
+  }
+  
+  /// Concatenates this matrix with an Element to form a 3-by-1 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 1> vcat(Element rhs) const {
+    return Matrix<Element, 3, 1>::vcat(*this, rhs);
+  }
+    
+  /// Concatenates this matrix with a 2-by-1 matrix to form a 4-by-1 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 1> vcat(Matrix<Element, 2, 1> const & rhs) const {
+    return Matrix<Element, 4, 1>::vcat(*this, rhs);
+  }
+    
+  /// Elementwise add operator (2-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix add(Matrix const &rhs) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] + rhs.data[0];
+
+    result.data[1] = data[1] + rhs.data[1];
+
+    return result;
+  }
+      
+  /// Elementwise add operator (2-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix operator +(Matrix const &rhs) const {
+    return add(rhs);
+  }
+
+  /// Elementwise add operator (2-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator +=(Matrix const &rhs) {
+    
+    data[0] += rhs.data[0];
+
+    data[1] += rhs.data[1];
+
+    return *this;
+  }
+        
+  /// Elementwise subtract operator (2-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix subtract(Matrix const &rhs) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] - rhs.data[0];
+
+    result.data[1] = data[1] - rhs.data[1];
+
+    return result;
+  }
+      
+  /// Elementwise subtract operator (2-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix operator -(Matrix const &rhs) const {
+    return subtract(rhs);
+  }
+
+  /// Elementwise subtract operator (2-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator -=(Matrix const &rhs) {
+    
+    data[0] -= rhs.data[0];
+
+    data[1] -= rhs.data[1];
+
+    return *this;
+  }
+        
+  /// Elementwise multiply operator (2-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix multiply(Matrix const &rhs) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] * rhs.data[0];
+
+    result.data[1] = data[1] * rhs.data[1];
+
+    return result;
+  }
+      
+  /// Scalar multiply operator (2-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix multiply(Element const &s) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] * s;
+
+    result.data[1] = data[1] * s;
+
+    return result;
+  }
+
+  /// Scalar multiply operator (2-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix operator *(Element const &s) const {
+    return multiply(s);
+  }
+
+  /// Scalar multiply operator (2-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator *=(Element const &s) {
+    
+    data[0] *= s;
+
+    data[1] *= s;
+
+    return *this;
+  }
+        
+  /// Elementwise divide operator (2-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix divide(Matrix const &rhs) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] / rhs.data[0];
+
+    result.data[1] = data[1] / rhs.data[1];
+
+    return result;
+  }
+      
+  /// Scalar divide operator (2-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix divide(Element const &s) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] / s;
+
+    result.data[1] = data[1] / s;
+
+    return result;
+  }
+
+  /// Scalar divide operator (2-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix operator /(Element const &s) const {
+    return divide(s);
+  }
+
+  /// Scalar divide operator (2-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator /=(Element const &s) {
+    
+    data[0] /= s;
+
+    data[1] /= s;
+
+    return *this;
+  }
+        
+  /// Elementwise divide operator (2-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix operator /(Matrix const &rhs) const {
+    return divide(rhs);
+  }
+
+  /// Elementwise divide operator (2-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator /=(Matrix const &rhs) {
+    
+    data[0] /= rhs.data[0];
+
+    data[1] /= rhs.data[1];
+
+    return *this;
+  }
+        
+  /// Negates each element of the matrix and returns the result as a new matrix
+  CUTLASS_HOST_DEVICE
+  Matrix operator-() const {
+    Matrix m;
+    // Negate this matrix's elements; m itself starts out zero-initialized.
+    m.data[0] = -data[0];
+    m.data[1] = -data[1];
+
+    return m;
+  }
+  
+  /// Matrix product of size 2-by-1-by-1
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 1> product(
+    Matrix<Element, 1, 1> const &rhs,
+    Matrix<Element, 2, 1> accum = Matrix<Element, 2, 1>()
+  ) const {
+    
+    // k=0
+    accum.data[0] += data[0] * rhs.data[0];
+    accum.data[1] += data[1] * rhs.data[0];
+
+    return accum;
+  }
+
+  /// Matrix product of size 2-by-1-by-1
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 1> operator*(Matrix<Element, 1, 1> const &rhs) const {
+    return product(rhs);
+  }
+  
+  /// Matrix product of size 2-by-1-by-1
+  CUTLASS_HOST_DEVICE
+  Matrix & operator*=(Matrix<Element, 1, 1> const &rhs) {
+    *this = product(rhs);
+    return *this;
+  }
+    
+  /// Matrix product of size 2-by-2-by-1
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 2> product(
+    Matrix<Element, 1, 2> const &rhs,
+    Matrix<Element, 2, 2> accum = Matrix<Element, 2, 2>()
+  ) const {
+    
+    // k=0
+    accum.data[0] += data[0] * rhs.data[0];
+    accum.data[1] += data[0] * rhs.data[1];
+    accum.data[2] += data[1] * rhs.data[0];
+    accum.data[3] += data[1] * rhs.data[1];
+
+    return accum;
+  }
+
+  /// Matrix product of size 2-by-2-by-1
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 2> operator*(Matrix<Element, 1, 2> const &rhs) const {
+    return product(rhs);
+  }
+  
+  /// Matrix product of size 2-by-3-by-1
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 3> product(
+    Matrix<Element, 1, 3> const &rhs,
+    Matrix<Element, 2, 3> accum = Matrix<Element, 2, 3>()
+  ) const {
+    
+    // k=0
+    accum.data[0] += data[0] * rhs.data[0];
+    accum.data[1] += data[0] * rhs.data[1];
+    accum.data[2] += data[0] * rhs.data[2];
+    accum.data[3] += data[1] * rhs.data[0];
+    accum.data[4] += data[1] * rhs.data[1];
+    accum.data[5] += data[1] * rhs.data[2];
+
+    return accum;
+  }
+
+  /// Matrix product of size 2-by-3-by-1
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 3> operator*(Matrix<Element, 1, 3> const &rhs) const {
+    return product(rhs);
+  }
+  
+  /// Matrix product of size 2-by-4-by-1
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 4> product(
+    Matrix<Element, 1, 4> const &rhs,
+    Matrix<Element, 2, 4> accum = Matrix<Element, 2, 4>()
+  ) const {
+    
+    // k=0
+    accum.data[0] += data[0] * rhs.data[0];
+    accum.data[1] += data[0] * rhs.data[1];
+    accum.data[2] += data[0] * rhs.data[2];
+    accum.data[3] += data[0] * rhs.data[3];
+    accum.data[4] += data[1] * rhs.data[0];
+    accum.data[5] += data[1] * rhs.data[1];
+    accum.data[6] += data[1] * rhs.data[2];
+    accum.data[7] += data[1] * rhs.data[3];
+
+    return accum;
+  }
+
+  /// Matrix product of size 2-by-4-by-1
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 4> operator*(Matrix<Element, 1, 4> const &rhs) const {
+    return product(rhs);
+  }
+  
+  /// Dot product of vectors with extent 2
+  CUTLASS_HOST_DEVICE
+  Element dot(Matrix<Element, 2, 1> const &rhs, Element accum = Element()) const {
+    
+    accum += data[0] * rhs.data[0];
+    accum += data[1] * rhs.data[1];
+    return accum;
+  }
+
+  /// Dot product of vectors with extent 2
+  CUTLASS_HOST_DEVICE
+  Element dot(Matrix<Element, 1, 2> const &rhs, Element accum = Element()) const {
+    
+    accum += data[0] * rhs.data[0];
+    accum += data[1] * rhs.data[1];
+    return accum;
+  }
+  
+  /// Returns the sum of elements
+  CUTLASS_HOST_DEVICE
+  Element sum(Element accum = Element()) const {
+    
+    accum += data[0];
+    accum += data[1];
+
+    return accum;
+  }  
+
+  /// Returns the sum of squared elements
+  CUTLASS_HOST_DEVICE
+  Element norm(Element accum = Element()) const {
+    
+    accum += data[0] * data[0];
+    accum += data[1] * data[1];
+
+    return accum;
+  }
+
+  /// Returns square root of the norm
+  CUTLASS_HOST_DEVICE
+  Element magnitude() const {
+    return fast_sqrt(norm());
+  }
+
+  /// Returns the sum of diagonal elements
+  CUTLASS_HOST_DEVICE
+  Element trace(Element accum = Element()) const {
+    
+    accum += data[0];
+
+    return accum;
+  }
+    
+};
+
+/// Template alias for 2-by-1 matrix
+template <typename Element>
+using Matrix2x1 = Matrix<Element, 2, 1>;
+
+
+/// Free function that builds a 2-by-1 matrix, inferring Element from the arguments
+template <typename Element>
+CUTLASS_HOST_DEVICE Matrix2x1<Element> make_Matrix2x1(
+    Element _0_0,
+    Element _1_0
+) {
+  // _0_0 is the row-0 element and _1_0 the row-1 element.
+  return Matrix2x1<Element>{
+    _0_0, _1_0
+  };
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// 2-by-2 matrix template class definition
+template <typename Element_>
+struct Matrix<Element_, 2, 2> {
+
+  //
+  // Type definitions
+  //
+
+  /// Element data type
+  using Element = Element_;
+
+  /// Number of rows in matrix
+  static int const kRows = 2;
+
+  /// Number of columns in matrix
+  static int const kColumns = 2;
+
+  /// Layout of matrix in underlying array
+  using Layout = layout::RowMajor;
+
+  /// Number of elements in matrix
+  static int const kCount = 4;
+
+  //
+  // Data members
+  //
+
+  /// Elements of the matrix in row-major layout
+  Array<Element, kCount> data;
+
+  //
+  // Methods
+  //
+
+  /// Constructs a zero matrix
+  CUTLASS_HOST_DEVICE
+  Matrix() {
+    data.clear();
+  }
+  
+  /// Copy constructor for a 2-by-2 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix(Matrix const &rhs) {
+    data = rhs.data;
+  }
+    
+  /// Constructs a 2-by-2 matrix from scalar elements
+  CUTLASS_HOST_DEVICE
+  Matrix(
+    Element _0_0, Element _0_1, 
+    Element _1_0, Element _1_1
+  ) {
+
+    data[0] = _0_0;  data[1] = _0_1;
+    data[2] = _1_0;  data[3] = _1_1;
+  }
+    
+  /// Constructs a 2-by-2 matrix from row vectors
+  CUTLASS_HOST_DEVICE
+  Matrix(
+    Matrix<Element, 1, 2> const &row_0,
+    Matrix<Element, 1, 2> const &row_1
+  ) { 
+    data[0] = row_0.data[0];
+    data[1] = row_0.data[1];
+    data[2] = row_1.data[0];
+    data[3] = row_1.data[1];
+  }
+    
+  /// Static method to construct a 2-by-2 matrix from column vectors
+  CUTLASS_HOST_DEVICE
+  static Matrix from_columns(
+    Matrix<Element, 2, 1> const &column_0,
+    Matrix<Element, 2, 1> const &column_1
+  ) { 
+    Matrix result;
+    
+    result.data[0] = column_0.data[0];
+    result.data[1] = column_1.data[0];
+    result.data[2] = column_0.data[1];
+    result.data[3] = column_1.data[1];
+    return result;
+  }
+    
+  /// Constructs an identity matrix
+  CUTLASS_HOST_DEVICE
+  static Matrix identity() {
+    Matrix m;
+    
+    m.data[0] = Element(1);
+    m.data[3] = Element(1);
+
+    return m;
+  }
+    
+  /// Constructs a matrix from a uniform element
+  CUTLASS_HOST_DEVICE
+  static Matrix uniform(Element s) {
+    Matrix m;
+    
+    m.data[0] = s;
+    m.data[1] = s;
+    m.data[2] = s;
+    m.data[3] = s;
+
+    return m;
+  }
+
+  /// Constructs a matrix from a uniform element 1
+  CUTLASS_HOST_DEVICE
+  static Matrix ones() {
+    return uniform(Element(1));
+  }
+
+  /// Constructs a matrix from a uniform element 0
+  CUTLASS_HOST_DEVICE
+  static Matrix zero() {
+    return Matrix();
+  }
+  
+  /// Returns a matrix with the entries of a 2-by-1 vector on its main diagonal
+  CUTLASS_HOST_DEVICE
+  static Matrix from_diagonal(Matrix<Element, 2, 1> const &diag) {
+    Matrix result;  // off-diagonal entries keep their default-constructed value
+
+    result.data[3] = diag.data[1];  // entry (1, 1)
+    result.data[0] = diag.data[0];  // entry (0, 0)
+
+    return result;
+  }
+
+  /// Returns a matrix with the entries of a 1-by-2 vector on its main diagonal
+  CUTLASS_HOST_DEVICE
+  static Matrix from_diagonal(Matrix<Element, 1, 2> const &diag) {
+    Matrix result;  // off-diagonal entries keep their default-constructed value
+
+    result.data[3] = diag.data[1];  // entry (1, 1)
+    result.data[0] = diag.data[0];  // entry (0, 0)
+
+    return result;
+  }
+
+  /// Extracts the main diagonal as a 2-by-1 column vector
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 1> diagonal() const {
+    Matrix<Element, 2, 1> picked;
+
+    picked.data[1] = data[3];  // entry (1, 1)
+    picked.data[0] = data[0];  // entry (0, 0)
+
+    return picked;
+  }
+    
+  /// Returns the transpose: diagonal entries stay put, off-diagonal entries swap
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 2> transpose() const {
+    Matrix<Element, 2, 2> flipped;
+
+    flipped.data[0] = data[0];
+    flipped.data[3] = data[3];
+    flipped.data[1] = data[2];
+    flipped.data[2] = data[1];
+
+    return flipped;
+  }
+    
+  /// Reads the element at row i, column j (row-major offset 2 * i + j)
+  CUTLASS_HOST_DEVICE
+  Element at(int i, int j) const {
+    return data[2 * i + j];
+  }
+
+  /// Returns a mutable reference to the element at row i, column j (offset 2 * i + j)
+  CUTLASS_HOST_DEVICE
+  Element & at(int i, int j) {
+    return data[2 * i + j];
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element at(Coord<2> const &coord) const {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element & at(Coord<2> const &coord) {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element &at(int offset) {
+    return data[offset];
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element at(int offset) const {
+    return data[offset];
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element operator[](Coord<2> const &coord) const {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element & operator[](Coord<2> const &coord) {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element & operator[](int offset) {
+    return data[offset];
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element operator[](int offset) const {
+    return data[offset];
+  }
+  
+  /// Gets a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 2> slice_1x2(int i = 0, int j = 0) const {
+    Matrix<Element, 1, 2> m;
+    
+    m.data[0] = data[i * 2 + j + 0];
+    m.data[1] = data[i * 2 + j + 1];
+
+    return m;
+  }
+
+  /// Overwrites a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_1x2(Matrix<Element, 1, 2> const &m, int i = 0, int j = 0) {
+    
+    data[i * 2 + j + 0] = m.data[0];
+    data[i * 2 + j + 1] = m.data[1];
+
+    return *this;
+  }
+    
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 2> row(int i) const {
+    return slice_1x2(i, 0);
+  }
+
+  CUTLASS_HOST_DEVICE
+  Matrix &set_row(Matrix<Element, 1, 2> const &v, int i = 0) {
+    return set_slice_1x2(v, i, 0);
+  }
+    
+  /// Gets a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 1> slice_2x1(int i = 0, int j = 0) const {
+    Matrix<Element, 2, 1> m;
+    
+    m.data[0] = data[i * 2 + j + 0];
+    m.data[1] = data[i * 2 + j + 2];
+
+    return m;
+  }
+
+  /// Overwrites a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_2x1(Matrix<Element, 2, 1> const &m, int i = 0, int j = 0) {
+    
+    data[i * 2 + j + 0] = m.data[0];
+    data[i * 2 + j + 2] = m.data[1];
+
+    return *this;
+  }
+    
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 1> column(int j) const {
+    return slice_2x1(0, j);
+  }
+
+  CUTLASS_HOST_DEVICE
+  Matrix &set_column(Matrix<Element, 2, 1> const &v, int j =0) {
+    return set_slice_2x1(v, 0, j);
+  }
+    
+  /// Gets a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 2> slice_2x2(int i = 0, int j = 0) const {
+    Matrix<Element, 2, 2> m;
+    
+    m.data[0] = data[i * 2 + j + 0];
+    m.data[1] = data[i * 2 + j + 1];
+    m.data[2] = data[i * 2 + j + 2];
+    m.data[3] = data[i * 2 + j + 3];
+
+    return m;
+  }
+
+  /// Overwrites a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_2x2(Matrix<Element, 2, 2> const &m, int i = 0, int j = 0) {
+    
+    data[i * 2 + j + 0] = m.data[0];
+    data[i * 2 + j + 1] = m.data[1];
+    data[i * 2 + j + 2] = m.data[2];
+    data[i * 2 + j + 3] = m.data[3];
+
+    return *this;
+  }
+    
+  /// Forms a 2-by-2 matrix by horizontally concatenating a 2-by-1 matrix with a 2-by-1 matrix
+  CUTLASS_HOST_DEVICE
+  static Matrix hcat(Matrix<Element, 2, 1> const & lhs, Matrix<Element, 2, 1> const & rhs) {
+    return Matrix(
+      lhs.at(0, 0), rhs.at(0, 0)
+      , lhs.at(1, 0), rhs.at(1, 0));
+  }
+  
+  /// Horizontally concatenates this matrix (left) with a 2-by-1 matrix (right) to form a 2-by-3 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 3> hcat(Matrix<Element, 2, 1> const & rhs) const {
+    return Matrix<Element, 2, 3>::hcat(*this, rhs);
+  }
+    
+  /// Horizontally concatenates this matrix (first operand) with a 2-by-2 matrix to form a 2-by-4 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 4> hcat(Matrix<Element, 2, 2> const & rhs) const {
+    return Matrix<Element, 2, 4>::hcat(*this, rhs);
+  }
+    
+  /// Forms a 2-by-2 matrix by vertically concatenating a 1-by-2 matrix with a 1-by-2 matrix
+  CUTLASS_HOST_DEVICE
+  static Matrix vcat(Matrix<Element, 1, 2> const & upper, Matrix<Element, 1, 2> const & lower) {
+    return Matrix(
+      upper.at(0, 0), upper.at(0, 1)
+      , lower.at(0, 0), lower.at(0, 1));
+  }
+  
+  /// Vertically concatenates this matrix (first operand) with a 1-by-2 matrix to form a 3-by-2 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 2> vcat(Matrix<Element, 1, 2> const & rhs) const {
+    return Matrix<Element, 3, 2>::vcat(*this, rhs);
+  }
+    
+  /// Vertically concatenates this matrix (first operand) with a 2-by-2 matrix to form a 4-by-2 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 2> vcat(Matrix<Element, 2, 2> const & rhs) const {
+    return Matrix<Element, 4, 2>::vcat(*this, rhs);
+  }
+    
+  /// Forms a 2-by-2 matrix by concatenating four components
+  CUTLASS_HOST_DEVICE
+  static Matrix block(
+    Element                         A, Element                         B,
+    Element                         C, Element                         D) {
+    return Matrix(
+      A, B
+      , C, D
+    );
+  }
+  
+  /// Elementwise add operator (2-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix add(Matrix const &rhs) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] + rhs.data[0];
+    result.data[1] = data[1] + rhs.data[1];
+
+    result.data[2] = data[2] + rhs.data[2];
+    result.data[3] = data[3] + rhs.data[3];
+
+    return result;
+  }
+      
+  /// Elementwise add operator (2-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix operator +(Matrix const &rhs) const {
+    return add(rhs);
+  }
+
+  /// Elementwise add operator (2-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator +=(Matrix const &rhs) {
+    
+    data[0] += rhs.data[0];
+    data[1] += rhs.data[1];
+
+    data[2] += rhs.data[2];
+    data[3] += rhs.data[3];
+
+    return *this;
+  }
+        
+  /// Elementwise subtract operator (2-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix subtract(Matrix const &rhs) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] - rhs.data[0];
+    result.data[1] = data[1] - rhs.data[1];
+
+    result.data[2] = data[2] - rhs.data[2];
+    result.data[3] = data[3] - rhs.data[3];
+
+    return result;
+  }
+      
+  /// Elementwise subtract operator (2-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix operator -(Matrix const &rhs) const {
+    return subtract(rhs);
+  }
+
+  /// Elementwise subtract operator (2-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator -=(Matrix const &rhs) {
+    
+    data[0] -= rhs.data[0];
+    data[1] -= rhs.data[1];
+
+    data[2] -= rhs.data[2];
+    data[3] -= rhs.data[3];
+
+    return *this;
+  }
+        
+  /// Elementwise multiply operator (2-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix multiply(Matrix const &rhs) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] * rhs.data[0];
+    result.data[1] = data[1] * rhs.data[1];
+
+    result.data[2] = data[2] * rhs.data[2];
+    result.data[3] = data[3] * rhs.data[3];
+
+    return result;
+  }
+      
+  /// Scalar multiply operator (2-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix multiply(Element const &s) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] * s;
+    result.data[1] = data[1] * s;
+
+    result.data[2] = data[2] * s;
+    result.data[3] = data[3] * s;
+
+    return result;
+  }
+
+  /// Scalar multiply operator (2-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix operator *(Element const &s) const {
+    return multiply(s);
+  }
+
+  /// Scalar multiply operator (2-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator *=(Element const &s) {
+    
+    data[0] *= s;
+    data[1] *= s;
+
+    data[2] *= s;
+    data[3] *= s;
+
+    return *this;
+  }
+        
+  /// Elementwise divide operator (2-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix divide(Matrix const &rhs) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] / rhs.data[0];
+    result.data[1] = data[1] / rhs.data[1];
+
+    result.data[2] = data[2] / rhs.data[2];
+    result.data[3] = data[3] / rhs.data[3];
+
+    return result;
+  }
+      
+  /// Scalar divide operator (2-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix divide(Element const &s) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] / s;
+    result.data[1] = data[1] / s;
+
+    result.data[2] = data[2] / s;
+    result.data[3] = data[3] / s;
+
+    return result;
+  }
+
+  /// Scalar divide operator (2-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix operator /(Element const &s) const {
+    return divide(s);
+  }
+
+  /// Scalar divide operator (2-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator /=(Element const &s) {
+    
+    data[0] /= s;
+    data[1] /= s;
+
+    data[2] /= s;
+    data[3] /= s;
+
+    return *this;
+  }
+        
+  /// Elementwise divide operator (2-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix operator /(Matrix const &rhs) const {
+    return divide(rhs);
+  }
+
+  /// Elementwise divide operator (2-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator /=(Matrix const &rhs) {
+    
+    data[0] /= rhs.data[0];
+    data[1] /= rhs.data[1];
+
+    data[2] /= rhs.data[2];
+    data[3] /= rhs.data[3];
+
+    return *this;
+  }
+        
+  /// Returns a new matrix whose every element is the negation of the matching element of *this
+  CUTLASS_HOST_DEVICE
+  Matrix operator-() const {
+    Matrix m;
+    // Read from this->data: m is freshly default-constructed, so negating m itself would ignore *this.
+    m.data[0] = -data[0];
+    m.data[1] = -data[1];
+    m.data[2] = -data[2];
+    m.data[3] = -data[3];
+
+    return m;
+  }
+  
+  /// Matrix product of size 2-by-1-by-2
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 1> product(
+    Matrix<Element, 2, 1> const &rhs,
+    Matrix<Element, 2, 1> accum = Matrix<Element, 2, 1>()
+  ) const {
+    
+    // k=0
+    accum.data[0] += data[0] * rhs.data[0];
+    accum.data[1] += data[2] * rhs.data[0];
+
+    // k=1
+    accum.data[0] += data[1] * rhs.data[1];
+    accum.data[1] += data[3] * rhs.data[1];
+
+    return accum;
+  }
+
+  /// Matrix product of size 2-by-1-by-2
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 1> operator*(Matrix<Element, 2, 1> const &rhs) const {
+    return product(rhs);
+  }
+  
+  /// Matrix product of size 2-by-2-by-2
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 2> product(
+    Matrix<Element, 2, 2> const &rhs,
+    Matrix<Element, 2, 2> accum = Matrix<Element, 2, 2>()
+  ) const {
+    
+    // k=0
+    accum.data[0] += data[0] * rhs.data[0];
+    accum.data[1] += data[0] * rhs.data[1];
+    accum.data[2] += data[2] * rhs.data[0];
+    accum.data[3] += data[2] * rhs.data[1];
+
+    // k=1
+    accum.data[0] += data[1] * rhs.data[2];
+    accum.data[1] += data[1] * rhs.data[3];
+    accum.data[2] += data[3] * rhs.data[2];
+    accum.data[3] += data[3] * rhs.data[3];
+
+    return accum;
+  }
+
+  /// Matrix product of size 2-by-2-by-2
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 2> operator*(Matrix<Element, 2, 2> const &rhs) const {
+    return product(rhs);
+  }
+  
+  /// Matrix product of size 2-by-2-by-2
+  CUTLASS_HOST_DEVICE
+  Matrix & operator*=(Matrix<Element, 2, 2> const &rhs) {
+    *this = product(rhs);
+    return *this;
+  }
+    
+  /// Matrix product of size 2-by-3-by-2
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 3> product(
+    Matrix<Element, 2, 3> const &rhs,
+    Matrix<Element, 2, 3> accum = Matrix<Element, 2, 3>()
+  ) const {
+    
+    // k=0
+    accum.data[0] += data[0] * rhs.data[0];
+    accum.data[1] += data[0] * rhs.data[1];
+    accum.data[2] += data[0] * rhs.data[2];
+    accum.data[3] += data[2] * rhs.data[0];
+    accum.data[4] += data[2] * rhs.data[1];
+    accum.data[5] += data[2] * rhs.data[2];
+
+    // k=1
+    accum.data[0] += data[1] * rhs.data[3];
+    accum.data[1] += data[1] * rhs.data[4];
+    accum.data[2] += data[1] * rhs.data[5];
+    accum.data[3] += data[3] * rhs.data[3];
+    accum.data[4] += data[3] * rhs.data[4];
+    accum.data[5] += data[3] * rhs.data[5];
+
+    return accum;
+  }
+
+  /// Matrix product of size 2-by-3-by-2
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 3> operator*(Matrix<Element, 2, 3> const &rhs) const {
+    return product(rhs);
+  }
+  
+  /// Matrix product of size 2-by-4-by-2
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 4> product(
+    Matrix<Element, 2, 4> const &rhs,
+    Matrix<Element, 2, 4> accum = Matrix<Element, 2, 4>()
+  ) const {
+    
+    // k=0
+    accum.data[0] += data[0] * rhs.data[0];
+    accum.data[1] += data[0] * rhs.data[1];
+    accum.data[2] += data[0] * rhs.data[2];
+    accum.data[3] += data[0] * rhs.data[3];
+    accum.data[4] += data[2] * rhs.data[0];
+    accum.data[5] += data[2] * rhs.data[1];
+    accum.data[6] += data[2] * rhs.data[2];
+    accum.data[7] += data[2] * rhs.data[3];
+
+    // k=1
+    accum.data[0] += data[1] * rhs.data[4];
+    accum.data[1] += data[1] * rhs.data[5];
+    accum.data[2] += data[1] * rhs.data[6];
+    accum.data[3] += data[1] * rhs.data[7];
+    accum.data[4] += data[3] * rhs.data[4];
+    accum.data[5] += data[3] * rhs.data[5];
+    accum.data[6] += data[3] * rhs.data[6];
+    accum.data[7] += data[3] * rhs.data[7];
+
+    return accum;
+  }
+
+  /// Matrix product of size 2-by-4-by-2
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 4> operator*(Matrix<Element, 2, 4> const &rhs) const {
+    return product(rhs);
+  }
+  
+  /// Returns the sum of elements
+  CUTLASS_HOST_DEVICE
+  Element sum(Element accum = Element()) const {
+    
+    accum += data[0];
+    accum += data[1];
+    accum += data[2];
+    accum += data[3];
+
+    return accum;
+  }  
+
+  /// Returns the sum of squared elements
+  CUTLASS_HOST_DEVICE
+  Element norm(Element accum = Element()) const {
+    
+    accum += data[0] * data[0];
+    accum += data[1] * data[1];
+    accum += data[2] * data[2];
+    accum += data[3] * data[3];
+
+    return accum;
+  }
+
+  /// Returns fast_sqrt of norm(), where norm() is the sum of squared elements
+  CUTLASS_HOST_DEVICE
+  Element magnitude() const {
+    return fast_sqrt(this->norm());
+  }
+
+  /// Returns the sum of diagonal elements
+  CUTLASS_HOST_DEVICE
+  Element trace(Element accum = Element()) const {
+    
+    accum += data[0];
+    accum += data[3];
+
+    return accum;
+  }
+    
+  /// Returns the 2-by-2 rotation matrix [[cos, -sin], [sin, cos]] for angle theta
+  CUTLASS_HOST_DEVICE
+  static Matrix rotation(Element theta) {
+    Element const cos_theta = fast_cos(theta);
+    Element const sin_theta = fast_sin(theta);
+
+    // First row is (cos, -sin); second row is (sin, cos).
+    Matrix rot(cos_theta, -sin_theta, sin_theta, cos_theta);
+
+    return rot;
+  }
+    
+  /// Adds the determinant (data[0] * data[3] - data[1] * data[2]) to `accum` and returns the total
+  CUTLASS_HOST_DEVICE
+  Element determinant(Element accum = Element()) const {
+    // ad - bc for the matrix [[a, b], [c, d]], accumulated onto the caller's seed value.
+    Element const det = data[0] * data[3] - data[1] * data[2];
+    accum += det;
+    return accum;
+  }
+  
+  /// Returns the inverse for a caller-supplied determinant `det`:
+  /// the adjugate [[d, -b], [-c, a]] scaled by Element(1) / det
+  CUTLASS_HOST_DEVICE
+  Matrix inverse(Element det) const {
+    Element const inv_det = Element(1) / det;
+    Matrix adjugate(data[3], -data[1], -data[2], data[0]);
+
+    return adjugate * inv_det;
+  }
+
+  /// Computes the inverse, deriving the determinant from this matrix's own entries
+  CUTLASS_HOST_DEVICE
+  Matrix inverse() const {
+    return this->inverse(this->determinant());
+  }
+    
+};
+
+/// Template alias for 2-by-2 matrix
+template <typename Element>
+using Matrix2x2 = Matrix<Element, 2, 2>;
+
+
+/// Free function that builds a Matrix2x2, deducing Element from the argument types
+template <typename Element>
+CUTLASS_HOST_DEVICE Matrix2x2<Element> make_Matrix2x2(
+    Element _0_0, Element _0_1, 
+    Element _1_0, Element _1_1
+) {
+  // Arguments are forwarded in row-major order: (0,0), (0,1), (1,0), (1,1).
+  Matrix2x2<Element> built(_0_0, _0_1, _1_0, _1_1);
+
+  return built;
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// 2-by-3 matrix template class definition
+template <typename Element_>
+struct Matrix<Element_, 2, 3> {
+
+  //
+  // Type definitions
+  //
+
+  /// Element data type
+  using Element = Element_;
+
+  /// Number of rows in matrix
+  static int const kRows = 2;
+
+  /// Number of columns in matrix
+  static int const kColumns = 3;
+
+  /// Layout of matrix in underlying array
+  using Layout = layout::RowMajor;
+
+  /// Number of elements in matrix
+  static int const kCount = 6;
+
+  //
+  // Data members
+  //
+
+  /// Elements of the matrix in row-major layout
+  Array<Element, kCount> data;
+
+  //
+  // Methods
+  //
+
+  /// Constructs a zero matrix
+  CUTLASS_HOST_DEVICE
+  Matrix() {
+    data.clear();
+  }
+  
+  /// Copy constructor for a 2-by-3 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix(Matrix const &rhs) {
+    data = rhs.data;
+  }
+    
+  /// Constructs a 2-by-3 matrix from six scalars; argument _i_j becomes entry (i, j)
+  CUTLASS_HOST_DEVICE
+  Matrix(
+    Element _0_0, Element _0_1, Element _0_2, 
+    Element _1_0, Element _1_1, Element _1_2
+  ) {
+    // Second row goes to data[3..5], first row to data[0..2].
+    data[3] = _1_0;  data[4] = _1_1;  data[5] = _1_2;
+    data[0] = _0_0;  data[1] = _0_1;  data[2] = _0_2;
+  }
+    
+  /// Constructs a 2-by-3 matrix by stacking two 1-by-3 row vectors
+  CUTLASS_HOST_DEVICE
+  Matrix(
+    Matrix<Element, 1, 3> const &row_0,
+    Matrix<Element, 1, 3> const &row_1
+  ) {
+    data[3] = row_1.data[0];  // second row -> data[3..5]
+    data[4] = row_1.data[1];
+    data[5] = row_1.data[2];
+    data[0] = row_0.data[0];  // first row -> data[0..2]
+    data[1] = row_0.data[1];
+    data[2] = row_0.data[2];
+  }
+    
+  /// Static method to construct a 2-by-3 matrix from three column vectors (only entries 0 and 1 of each are read)
+  CUTLASS_HOST_DEVICE
+  static Matrix from_columns(
+    Matrix<Element, 3, 1> const &column_0,
+    Matrix<Element, 3, 1> const &column_1,
+    Matrix<Element, 3, 1> const &column_2
+  ) { 
+    Matrix result;
+    // NOTE(review): columns are typed 3-by-1 although a 2-by-3 matrix has 2-element columns - confirm intent.
+    result.data[0] = column_0.data[0];
+    result.data[1] = column_1.data[0];
+    result.data[2] = column_2.data[0];
+    result.data[3] = column_0.data[1];
+    result.data[4] = column_1.data[1];
+    result.data[5] = column_2.data[1];
+    return result;
+  }
+    
+  /// Constructs a matrix from a uniform element
+  CUTLASS_HOST_DEVICE
+  static Matrix uniform(Element s) {
+    Matrix m;
+    
+    m.data[0] = s;
+    m.data[1] = s;
+    m.data[2] = s;
+    m.data[3] = s;
+    m.data[4] = s;
+    m.data[5] = s;
+
+    return m;
+  }
+
+  /// Constructs a matrix from a uniform element 1
+  CUTLASS_HOST_DEVICE
+  static Matrix ones() {
+    return uniform(Element(1));
+  }
+
+  /// Constructs a matrix from a uniform element 0
+  CUTLASS_HOST_DEVICE
+  static Matrix zero() {
+    return Matrix();
+  }
+  
+  /// Returns a matrix whose main-diagonal entries (0,0) and (1,1) are taken from a 2-by-1 vector
+  CUTLASS_HOST_DEVICE
+  static Matrix from_diagonal(Matrix<Element, 2, 1> const &diag) {
+    Matrix m;
+    // Row-major with 3 columns: entry (i, i) lives at offset i * 3 + i, i.e. offsets 0 and 4.
+    m.data[0] = diag.data[0];
+    m.data[4] = diag.data[1];
+
+    return m;
+  }
+
+  /// Returns a matrix whose main-diagonal entries (0,0) and (1,1) are taken from a 1-by-2 vector
+  CUTLASS_HOST_DEVICE
+  static Matrix from_diagonal(Matrix<Element, 1, 2> const &diag) {
+    Matrix m;
+    // Row-major with 3 columns: entry (i, i) lives at offset i * 3 + i, i.e. offsets 0 and 4.
+    m.data[0] = diag.data[0];
+    m.data[4] = diag.data[1];
+
+    return m;
+  }
+
+  /// Returns the main-diagonal entries (0,0) and (1,1) as a 2-by-1 column vector
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 1> diagonal() const {
+    Matrix<Element, 2, 1> diag;
+    // Row-major with 3 columns: entry (1,1) is at offset 1 * 3 + 1 = 4 (offset 3 is entry (1,0)).
+    diag.data[0] = data[0];
+    diag.data[1] = data[4];
+
+    return diag;
+  }
+    
+  /// Returns the 3-by-2 transpose of this matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 2> transpose() const {
+    Matrix<Element, 3, 2> flipped;
+    // The entry at source offset i * 3 + j is written to result offset j * 2 + i.
+    flipped.data[0] = data[0];
+    flipped.data[1] = data[3];
+    flipped.data[2] = data[1];
+    flipped.data[3] = data[4];
+    flipped.data[4] = data[2];
+    flipped.data[5] = data[5];
+
+    return flipped;
+  }
+    
+  /// Returns the element at row i, column j; rows are 3 elements apart in the row-major storage
+  CUTLASS_HOST_DEVICE
+  Element at(int i, int j) const {
+    return data[i * 3 + j];
+  }
+
+  /// Returns a mutable reference to the element at row i, column j; rows are 3 elements apart
+  CUTLASS_HOST_DEVICE
+  Element & at(int i, int j) {
+    return data[i * 3 + j];
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element at(Coord<2> const &coord) const {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element & at(Coord<2> const &coord) {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element &at(int offset) {
+    return data[offset];
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element at(int offset) const {
+    return data[offset];
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element operator[](Coord<2> const &coord) const {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element & operator[](Coord<2> const &coord) {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element & operator[](int offset) {
+    return data[offset];
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element operator[](int offset) const {
+    return data[offset];
+  }
+  
+  /// Gets a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 2> slice_1x2(int i = 0, int j = 0) const {
+    Matrix<Element, 1, 2> m;
+    
+    m.data[0] = data[i * 3 + j + 0];
+    m.data[1] = data[i * 3 + j + 1];
+
+    return m;
+  }
+
+  /// Overwrites a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_1x2(Matrix<Element, 1, 2> const &m, int i = 0, int j = 0) {
+    
+    data[i * 3 + j + 0] = m.data[0];
+    data[i * 3 + j + 1] = m.data[1];
+
+    return *this;
+  }
+    
+  /// Gets a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 3> slice_1x3(int i = 0, int j = 0) const {
+    Matrix<Element, 1, 3> m;
+    
+    m.data[0] = data[i * 3 + j + 0];
+    m.data[1] = data[i * 3 + j + 1];
+    m.data[2] = data[i * 3 + j + 2];
+
+    return m;
+  }
+
+  /// Overwrites a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_1x3(Matrix<Element, 1, 3> const &m, int i = 0, int j = 0) {
+    
+    data[i * 3 + j + 0] = m.data[0];
+    data[i * 3 + j + 1] = m.data[1];
+    data[i * 3 + j + 2] = m.data[2];
+
+    return *this;
+  }
+    
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 3> row(int i) const {
+    return slice_1x3(i, 0);
+  }
+
+  CUTLASS_HOST_DEVICE
+  Matrix &set_row(Matrix<Element, 1, 3> const &v, int i = 0) {
+    return set_slice_1x3(v, i, 0);
+  }
+    
+  /// Gets a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 1> slice_2x1(int i = 0, int j = 0) const {
+    Matrix<Element, 2, 1> m;
+    
+    m.data[0] = data[i * 3 + j + 0];
+    m.data[1] = data[i * 3 + j + 3];
+
+    return m;
+  }
+
+  /// Overwrites a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_2x1(Matrix<Element, 2, 1> const &m, int i = 0, int j = 0) {
+    
+    data[i * 3 + j + 0] = m.data[0];
+    data[i * 3 + j + 3] = m.data[1];
+
+    return *this;
+  }
+    
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 1> column(int j) const {
+    return slice_2x1(0, j);
+  }
+
+  CUTLASS_HOST_DEVICE
+  Matrix &set_column(Matrix<Element, 2, 1> const &v, int j =0) {
+    return set_slice_2x1(v, 0, j);
+  }
+    
+  /// Gets a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 2> slice_2x2(int i = 0, int j = 0) const {
+    Matrix<Element, 2, 2> m;
+    
+    m.data[0] = data[i * 3 + j + 0];
+    m.data[1] = data[i * 3 + j + 1];
+    m.data[2] = data[i * 3 + j + 3];
+    m.data[3] = data[i * 3 + j + 4];
+
+    return m;
+  }
+
+  /// Overwrites a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_2x2(Matrix<Element, 2, 2> const &m, int i = 0, int j = 0) {
+    
+    data[i * 3 + j + 0] = m.data[0];
+    data[i * 3 + j + 1] = m.data[1];
+    data[i * 3 + j + 3] = m.data[2];
+    data[i * 3 + j + 4] = m.data[3];
+
+    return *this;
+  }
+    
+  /// Gets a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 3> slice_2x3(int i = 0, int j = 0) const {
+    Matrix<Element, 2, 3> m;
+    
+    m.data[0] = data[i * 3 + j + 0];
+    m.data[1] = data[i * 3 + j + 1];
+    m.data[2] = data[i * 3 + j + 2];
+    m.data[3] = data[i * 3 + j + 3];
+    m.data[4] = data[i * 3 + j + 4];
+    m.data[5] = data[i * 3 + j + 5];
+
+    return m;
+  }
+
+  /// Overwrites a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_2x3(Matrix<Element, 2, 3> const &m, int i = 0, int j = 0) {
+    
+    data[i * 3 + j + 0] = m.data[0];
+    data[i * 3 + j + 1] = m.data[1];
+    data[i * 3 + j + 2] = m.data[2];
+    data[i * 3 + j + 3] = m.data[3];
+    data[i * 3 + j + 4] = m.data[4];
+    data[i * 3 + j + 5] = m.data[5];
+
+    return *this;
+  }
+    
+  /// Forms a 2-by-3 matrix by horizontally concatenating a 2-by-1 matrix with a 2-by-2 matrix
+  CUTLASS_HOST_DEVICE
+  static Matrix hcat(Matrix<Element, 2, 1> const & lhs, Matrix<Element, 2, 2> const & rhs) {
+    return Matrix(
+      lhs.at(0, 0), rhs.at(0, 0), rhs.at(0, 1)
+      , lhs.at(1, 0), rhs.at(1, 0), rhs.at(1, 1));
+  }
+  
+  /// Forms a 2-by-3 matrix by horizontally concatenating a 2-by-2 matrix with a 2-by-1 matrix
+  CUTLASS_HOST_DEVICE
+  static Matrix hcat(Matrix<Element, 2, 2> const & lhs, Matrix<Element, 2, 1> const & rhs) {
+    return Matrix(
+      lhs.at(0, 0), lhs.at(0, 1), rhs.at(0, 0)
+      , lhs.at(1, 0), lhs.at(1, 1), rhs.at(1, 0));
+  }
+  
+  /// Horizontally concatenates this matrix (first operand) with a 2-by-1 matrix to form a 2-by-4 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 4> hcat(Matrix<Element, 2, 1> const & rhs) const {
+    return Matrix<Element, 2, 4>::hcat(*this, rhs);
+  }
+    
+  /// Forms a 2-by-3 matrix by vertically concatenating a 1-by-3 matrix with a 1-by-3 matrix
+  CUTLASS_HOST_DEVICE
+  static Matrix vcat(Matrix<Element, 1, 3> const & upper, Matrix<Element, 1, 3> const & lower) {
+    return Matrix(
+      upper.at(0, 0), upper.at(0, 1), upper.at(0, 2)
+      , lower.at(0, 0), lower.at(0, 1), lower.at(0, 2));
+  }
+  
+  /// Vertically concatenates this matrix (first operand) with a 1-by-3 matrix to form a 3-by-3 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 3> vcat(Matrix<Element, 1, 3> const & rhs) const {
+    return Matrix<Element, 3, 3>::vcat(*this, rhs);
+  }
+    
+  /// Vertically concatenates this matrix (first operand) with a 2-by-3 matrix to form a 4-by-3 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 3> vcat(Matrix<Element, 2, 3> const & rhs) const {
+    return Matrix<Element, 4, 3>::vcat(*this, rhs);
+  }
+    
+  /// Forms a 2-by-3 matrix by concatenating four components
+  CUTLASS_HOST_DEVICE
+  static Matrix block(
+    Element                         A, Matrix<Element, 1, 2> const & B,
+    Element                         C, Matrix<Element, 1, 2> const & D) {
+    return Matrix(
+      A, B.at(0, 0), B.at(0, 1)
+      , C, D.at(0, 0), D.at(0, 1)
+    );
+  }
+  
+  /// Forms a 2-by-3 matrix by concatenating four components
+  CUTLASS_HOST_DEVICE
+  static Matrix block(
+    Matrix<Element, 1, 2> const & A, Element                         B,
+    Matrix<Element, 1, 2> const & C, Element                         D) {
+    return Matrix(
+      A.at(0, 0), A.at(0, 1), B
+      , C.at(0, 0), C.at(0, 1), D
+    );
+  }
+  
+  /// Elementwise add operator (2-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix add(Matrix const &rhs) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] + rhs.data[0];
+    result.data[1] = data[1] + rhs.data[1];
+    result.data[2] = data[2] + rhs.data[2];
+
+    result.data[3] = data[3] + rhs.data[3];
+    result.data[4] = data[4] + rhs.data[4];
+    result.data[5] = data[5] + rhs.data[5];
+
+    return result;
+  }
+      
+  /// Elementwise add operator (2-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix operator +(Matrix const &rhs) const {
+    return add(rhs);
+  }
+
+  /// Elementwise add operator (2-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator +=(Matrix const &rhs) {
+    
+    data[0] += rhs.data[0];
+    data[1] += rhs.data[1];
+    data[2] += rhs.data[2];
+
+    data[3] += rhs.data[3];
+    data[4] += rhs.data[4];
+    data[5] += rhs.data[5];
+
+    return *this;
+  }
+        
+  /// Elementwise subtract operator (2-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix subtract(Matrix const &rhs) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] - rhs.data[0];
+    result.data[1] = data[1] - rhs.data[1];
+    result.data[2] = data[2] - rhs.data[2];
+
+    result.data[3] = data[3] - rhs.data[3];
+    result.data[4] = data[4] - rhs.data[4];
+    result.data[5] = data[5] - rhs.data[5];
+
+    return result;
+  }
+      
+  /// Elementwise subtract operator (2-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix operator -(Matrix const &rhs) const {
+    return subtract(rhs);
+  }
+
+  /// Elementwise subtract operator (2-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator -=(Matrix const &rhs) {
+    
+    data[0] -= rhs.data[0];
+    data[1] -= rhs.data[1];
+    data[2] -= rhs.data[2];
+
+    data[3] -= rhs.data[3];
+    data[4] -= rhs.data[4];
+    data[5] -= rhs.data[5];
+
+    return *this;
+  }
+        
+  /// Elementwise multiply operator (2-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix multiply(Matrix const &rhs) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] * rhs.data[0];
+    result.data[1] = data[1] * rhs.data[1];
+    result.data[2] = data[2] * rhs.data[2];
+
+    result.data[3] = data[3] * rhs.data[3];
+    result.data[4] = data[4] * rhs.data[4];
+    result.data[5] = data[5] * rhs.data[5];
+
+    return result;
+  }
+      
+  /// Scalar multiply operator (2-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix multiply(Element const &s) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] * s;
+    result.data[1] = data[1] * s;
+    result.data[2] = data[2] * s;
+
+    result.data[3] = data[3] * s;
+    result.data[4] = data[4] * s;
+    result.data[5] = data[5] * s;
+
+    return result;
+  }
+
+  /// Scalar multiply operator (2-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix operator *(Element const &s) const {
+    return multiply(s);
+  }
+
+  /// Scalar multiply operator (2-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator *=(Element const &s) {
+    
+    data[0] *= s;
+    data[1] *= s;
+    data[2] *= s;
+
+    data[3] *= s;
+    data[4] *= s;
+    data[5] *= s;
+
+    return *this;
+  }
+        
+  /// Elementwise divide operator (2-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix divide(Matrix const &rhs) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] / rhs.data[0];
+    result.data[1] = data[1] / rhs.data[1];
+    result.data[2] = data[2] / rhs.data[2];
+
+    result.data[3] = data[3] / rhs.data[3];
+    result.data[4] = data[4] / rhs.data[4];
+    result.data[5] = data[5] / rhs.data[5];
+
+    return result;
+  }
+      
+  /// Returns a copy of this 2-by-3 matrix with every element divided by s
+  CUTLASS_HOST_DEVICE
+  Matrix divide(Element const &s) const {
+
+    Matrix out;
+
+    for (int idx = 0; idx < 6; ++idx) {
+      out.data[idx] = data[idx] / s;
+    }
+
+    return out;
+  }
+
+  /// Operator form of divide(Element): matrix-over-scalar (2-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix operator /(Element const &s) const {
+    return this->divide(s);
+  }
+
+  /// Divides every element of this 2-by-3 matrix by s in place
+  CUTLASS_HOST_DEVICE
+  Matrix & operator /=(Element const &s) {
+
+    for (int idx = 0; idx < 6; ++idx) {
+      data[idx] /= s;
+    }
+
+    return *this;
+  }
+        
+  /// Operator form of divide(Matrix): element-by-element quotient (2-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix operator /(Matrix const &rhs) const {
+    return this->divide(rhs);
+  }
+
+  /// Divides this 2-by-3 matrix by rhs element-by-element, in place
+  CUTLASS_HOST_DEVICE
+  Matrix & operator /=(Matrix const &rhs) {
+
+    for (int idx = 0; idx < 6; ++idx) {
+      data[idx] /= rhs.data[idx];
+    }
+
+    return *this;
+  }
+        
+  /// Negates each element of the matrix
+  ///
+  /// Returns a new 2-by-3 matrix whose elements are the negation of the
+  /// corresponding elements of this matrix; *this is not modified.
+  CUTLASS_HOST_DEVICE
+  Matrix operator-() const {
+    Matrix m;
+    
+    // Read from this->data, not m.data: m is a freshly default-constructed
+    // temporary, so negating its own elements never reflects this matrix.
+    m.data[0] = -data[0];
+    m.data[1] = -data[1];
+    m.data[2] = -data[2];
+    m.data[3] = -data[3];
+    m.data[4] = -data[4];
+    m.data[5] = -data[5];
+
+    return m;
+  }
+  
+  /// Matrix-vector product: (2-by-3) * (3-by-1), added onto accum
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 1> product(
+    Matrix<Element, 3, 1> const &rhs,
+    Matrix<Element, 2, 1> accum = Matrix<Element, 2, 1>()
+  ) const {
+
+    // k walks the shared dimension; i walks the rows of this matrix.
+    for (int k = 0; k < 3; ++k) {
+      for (int i = 0; i < 2; ++i) {
+        accum.data[i] += data[i * 3 + k] * rhs.data[k];
+      }
+    }
+
+    return accum;
+  }
+
+  /// Operator form of product() for a 3-by-1 right-hand side
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 1> operator*(Matrix<Element, 3, 1> const &rhs) const {
+    return this->product(rhs);
+  }
+  
+  /// Matrix product: (2-by-3) * (3-by-2), added onto accum
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 2> product(
+    Matrix<Element, 3, 2> const &rhs,
+    Matrix<Element, 2, 2> accum = Matrix<Element, 2, 2>()
+  ) const {
+
+    // k walks the shared dimension; (i, j) addresses the output element.
+    for (int k = 0; k < 3; ++k) {
+      for (int i = 0; i < 2; ++i) {
+        for (int j = 0; j < 2; ++j) {
+          accum.data[i * 2 + j] += data[i * 3 + k] * rhs.data[k * 2 + j];
+        }
+      }
+    }
+
+    return accum;
+  }
+
+  /// Operator form of product() for a 3-by-2 right-hand side
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 2> operator*(Matrix<Element, 3, 2> const &rhs) const {
+    return this->product(rhs);
+  }
+  
+  /// Matrix product: (2-by-3) * (3-by-3), added onto accum
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 3> product(
+    Matrix<Element, 3, 3> const &rhs,
+    Matrix<Element, 2, 3> accum = Matrix<Element, 2, 3>()
+  ) const {
+
+    // k walks the shared dimension; (i, j) addresses the output element.
+    for (int k = 0; k < 3; ++k) {
+      for (int i = 0; i < 2; ++i) {
+        for (int j = 0; j < 3; ++j) {
+          accum.data[i * 3 + j] += data[i * 3 + k] * rhs.data[k * 3 + j];
+        }
+      }
+    }
+
+    return accum;
+  }
+
+  /// Operator form of product() for a 3-by-3 right-hand side
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 3> operator*(Matrix<Element, 3, 3> const &rhs) const {
+    return this->product(rhs);
+  }
+  
+  /// Replaces this matrix with the product (this * rhs), rhs being 3-by-3
+  CUTLASS_HOST_DEVICE
+  Matrix & operator*=(Matrix<Element, 3, 3> const &rhs) {
+    // product() works on a by-value accumulator, so assigning its result back is safe.
+    return (*this = product(rhs));
+  }
+    
+  /// Matrix product: (2-by-3) * (3-by-4), added onto accum
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 4> product(
+    Matrix<Element, 3, 4> const &rhs,
+    Matrix<Element, 2, 4> accum = Matrix<Element, 2, 4>()
+  ) const {
+
+    // k walks the shared dimension; (i, j) addresses the output element.
+    for (int k = 0; k < 3; ++k) {
+      for (int i = 0; i < 2; ++i) {
+        for (int j = 0; j < 4; ++j) {
+          accum.data[i * 4 + j] += data[i * 3 + k] * rhs.data[k * 4 + j];
+        }
+      }
+    }
+
+    return accum;
+  }
+
+  /// Operator form of product() for a 3-by-4 right-hand side
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 4> operator*(Matrix<Element, 3, 4> const &rhs) const {
+    return this->product(rhs);
+  }
+  
+  /// Adds all six elements, in storage order, onto accum and returns the total
+  CUTLASS_HOST_DEVICE
+  Element sum(Element accum = Element()) const {
+
+    for (int idx = 0; idx < 6; ++idx) {
+      accum += data[idx];
+    }
+
+    return accum;
+  }  
+
+  /// Adds the square of each of the six elements onto accum and returns the total
+  CUTLASS_HOST_DEVICE
+  Element norm(Element accum = Element()) const {
+
+    for (int idx = 0; idx < 6; ++idx) {
+      accum += data[idx] * data[idx];
+    }
+
+    return accum;
+  }
+
+  /// Returns fast_sqrt() of the sum of squared elements computed by norm()
+  CUTLASS_HOST_DEVICE
+  Element magnitude() const {
+    Element const squared = norm();
+    return fast_sqrt(squared);
+  }
+
+  /// Adds the two diagonal elements, (0,0) and (1,1), onto accum
+  CUTLASS_HOST_DEVICE
+  Element trace(Element accum = Element()) const {
+
+    // With three elements per row, (d, d) sits at offset d * 3 + d.
+    for (int d = 0; d < 2; ++d) {
+      accum += data[d * 3 + d];
+    }
+
+    return accum;
+  }
+    
+};
+
+/// Template alias for 2-by-3 matrix: Matrix2x3<T> names Matrix<T, 2, 3>
+template <typename Element>
+using Matrix2x3 = Matrix<Element, 2, 3>;
+
+
+/// Free function that deduces the element type from its arguments and
+/// forwards the six scalars (named _<row>_<column>) to the Matrix2x3 constructor
+template <typename Element>
+CUTLASS_HOST_DEVICE Matrix2x3<Element> make_Matrix2x3(
+    Element _0_0, Element _0_1, Element _0_2, 
+    Element _1_0, Element _1_1, Element _1_2
+) {
+  Matrix2x3<Element> result(
+    _0_0, _0_1, _0_2,
+    _1_0, _1_1, _1_2);
+
+  return result;
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// 2-by-4 matrix template class definition
+template <typename Element_>
+struct Matrix<Element_, 2, 4> {
+
+  //
+  // Type definitions
+  //
+
+  /// Element data type
+  using Element = Element_;
+
+  /// Number of rows in matrix
+  static int const kRows = 2;
+
+  /// Number of columns in matrix
+  static int const kColumns = 4;
+
+  /// Layout of matrix in underlying array
+  using Layout = layout::RowMajor;
+
+  /// Number of elements in matrix
+  static int const kCount = 8;
+
+  //
+  // Data members
+  //
+
+  /// Elements of the matrix in row-major layout
+  Array<Element, kCount> data;
+
+  //
+  // Methods
+  //
+
+  /// Default constructor: clears the element array, yielding a zero matrix
+  CUTLASS_HOST_DEVICE
+  Matrix() {
+    this->data.clear();
+  }
+  
+  /// Copy constructor: duplicates the element array of another 2-by-4 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix(Matrix const &rhs) : data(rhs.data) {
+  }
+    
+  /// Constructs a 2-by-4 matrix from eight scalars named _<row>_<column>
+  CUTLASS_HOST_DEVICE
+  Matrix(
+    Element _0_0, Element _0_1, Element _0_2, Element _0_3, 
+    Element _1_0, Element _1_1, Element _1_2, Element _1_3
+  ) {
+
+    // Gather the arguments in storage (row-major) order, then copy them over.
+    Element const packed[kCount] = {
+      _0_0, _0_1, _0_2, _0_3,
+      _1_0, _1_1, _1_2, _1_3
+    };
+
+    for (int idx = 0; idx < kCount; ++idx) {
+      data[idx] = packed[idx];
+    }
+  }
+    
+  /// Constructs a 2-by-4 matrix by stacking two 1-by-4 row vectors
+  CUTLASS_HOST_DEVICE
+  Matrix(
+    Matrix<Element, 1, 4> const &row_0,
+    Matrix<Element, 1, 4> const &row_1
+  ) { 
+    // First row occupies offsets [0, 4).
+    for (int c = 0; c < kColumns; ++c) {
+      data[c] = row_0.data[c];
+    }
+
+    // Second row occupies offsets [4, 8).
+    for (int c = 0; c < kColumns; ++c) {
+      data[kColumns + c] = row_1.data[c];
+    }
+  }
+    
+  /// Static method to construct a 2-by-4 matrix from four column vectors
+  ///
+  /// NOTE(review): the parameters are typed as 4-by-1 vectors, but only their
+  /// first two elements (data[0], data[1]) are read - confirm the intended type.
+  CUTLASS_HOST_DEVICE
+  static Matrix from_columns(
+    Matrix<Element, 4, 1> const &column_0,
+    Matrix<Element, 4, 1> const &column_1,
+    Matrix<Element, 4, 1> const &column_2,
+    Matrix<Element, 4, 1> const &column_3
+  ) { 
+    Matrix result;
+
+    // Element r of each column lands in row r of the result.
+    for (int r = 0; r < kRows; ++r) {
+      int const base = r * kColumns;
+
+      result.data[base + 0] = column_0.data[r];
+      result.data[base + 1] = column_1.data[r];
+      result.data[base + 2] = column_2.data[r];
+      result.data[base + 3] = column_3.data[r];
+    }
+
+    return result;
+  }
+    
+  /// Builds a 2-by-4 matrix in which every element equals s
+  CUTLASS_HOST_DEVICE
+  static Matrix uniform(Element s) {
+    Matrix filled;
+
+    for (int idx = 0; idx < kCount; ++idx) {
+      filled.data[idx] = s;
+    }
+
+    return filled;
+  }
+
+  /// Builds a 2-by-4 matrix in which every element equals Element(1)
+  CUTLASS_HOST_DEVICE
+  static Matrix ones() {
+    Element const unit = Element(1);
+    return uniform(unit);
+  }
+
+  /// Builds a zero matrix (the default constructor clears all elements)
+  CUTLASS_HOST_DEVICE
+  static Matrix zero() {
+    Matrix cleared;
+    return cleared;
+  }
+  
+  /// Constructs a matrix from elements along its diagonal
+  ///
+  /// The result is zero everywhere except (0,0) = diag[0] and (1,1) = diag[1].
+  CUTLASS_HOST_DEVICE
+  static Matrix from_diagonal(Matrix<Element, 2, 1> const &diag) {
+    Matrix m;
+    
+    // Row-major with four elements per row: (d, d) lives at offset d * 4 + d,
+    // i.e. offsets 0 and 5 (offset 3 would be element (0,3)).
+    m.data[0] = diag.data[0];
+    m.data[5] = diag.data[1];
+
+    return m;
+  }
+
+  /// Constructs a matrix from elements along its diagonal
+  ///
+  /// The result is zero everywhere except (0,0) = diag[0] and (1,1) = diag[1].
+  CUTLASS_HOST_DEVICE
+  static Matrix from_diagonal(Matrix<Element, 1, 2> const &diag) {
+    Matrix m;
+    
+    // Row-major with four elements per row: (d, d) lives at offset d * 4 + d,
+    // i.e. offsets 0 and 5 (offset 3 would be element (0,3)).
+    m.data[0] = diag.data[0];
+    m.data[5] = diag.data[1];
+
+    return m;
+  }
+
+  /// Gets an array of diagonal elements
+  ///
+  /// Returns a 2-by-1 vector holding elements (0,0) and (1,1) of this matrix.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 1> diagonal() const {
+    Matrix<Element, 2, 1> diag;
+    
+    // Row-major with four elements per row: (1,1) is at offset 1 * 4 + 1 = 5
+    // (offset 3 would be element (0,3)).
+    diag.data[0] = data[0];
+    diag.data[1] = data[5];
+
+    return diag;
+  }
+    
+  /// Returns the 4-by-2 transpose of this matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 2> transpose() const {
+    Matrix<Element, 4, 2> mt;
+
+    // Element (r, c) here becomes element (c, r) of the 4-by-2 result.
+    for (int r = 0; r < kRows; ++r) {
+      for (int c = 0; c < kColumns; ++c) {
+        mt.data[c * kRows + r] = data[r * kColumns + c];
+      }
+    }
+
+    return mt;
+  }
+    
+  /// Accesses an element by coordinate
+  ///
+  /// Returns a copy of the element in row i, column j.
+  CUTLASS_HOST_DEVICE
+  Element at(int i, int j) const {
+    // Row-major storage: the row stride is the column count (4), not the row count.
+    return data[i * kColumns + j];
+  }
+
+  /// Accesses an element by coordinate
+  ///
+  /// Returns a mutable reference to the element in row i, column j.
+  CUTLASS_HOST_DEVICE
+  Element & at(int i, int j) {
+    // Row-major storage: the row stride is the column count (4), not the row count.
+    return data[i * kColumns + j];
+  }
+
+  /// Reads the element addressed by coord, taken as (row, column)
+  CUTLASS_HOST_DEVICE
+  Element at(Coord<2> const &coord) const {
+    int const r = coord[0];
+    int const c = coord[1];
+    return at(r, c);
+  }
+
+  /// Returns a mutable reference to the element addressed by coord, taken as (row, column)
+  CUTLASS_HOST_DEVICE
+  Element & at(Coord<2> const &coord) {
+    int const r = coord[0];
+    int const c = coord[1];
+    return at(r, c);
+  }
+
+  /// Returns a mutable reference to the element at a linear offset into the storage array
+  CUTLASS_HOST_DEVICE
+  Element &at(int offset) {
+    return this->data[offset];
+  }
+
+  /// Reads the element at a linear offset into the storage array
+  CUTLASS_HOST_DEVICE
+  Element at(int offset) const {
+    return this->data[offset];
+  }
+
+  /// Subscript by coordinate: reads the element at (coord[0], coord[1])
+  CUTLASS_HOST_DEVICE
+  Element operator[](Coord<2> const &coord) const {
+    int const r = coord[0];
+    int const c = coord[1];
+    return at(r, c);
+  }
+
+  /// Subscript by coordinate: mutable reference to the element at (coord[0], coord[1])
+  CUTLASS_HOST_DEVICE
+  Element & operator[](Coord<2> const &coord) {
+    int const r = coord[0];
+    int const c = coord[1];
+    return at(r, c);
+  }
+
+  /// Subscript by linear offset: mutable reference into the storage array
+  CUTLASS_HOST_DEVICE
+  Element & operator[](int offset) {
+    return this->data[offset];
+  }
+
+  /// Subscript by linear offset: reads from the storage array
+  CUTLASS_HOST_DEVICE
+  Element operator[](int offset) const {
+    return this->data[offset];
+  }
+  
+  /// Copies out the 1-by-2 submatrix whose first element is (i, j)
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 2> slice_1x2(int i = 0, int j = 0) const {
+    Matrix<Element, 1, 2> m;
+
+    int const origin = i * kColumns + j;
+
+    for (int c = 0; c < 2; ++c) {
+      m.data[c] = data[origin + c];
+    }
+
+    return m;
+  }
+
+  /// Writes m over the 1-by-2 submatrix whose first element is (i, j)
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_1x2(Matrix<Element, 1, 2> const &m, int i = 0, int j = 0) {
+
+    int const origin = i * kColumns + j;
+
+    for (int c = 0; c < 2; ++c) {
+      data[origin + c] = m.data[c];
+    }
+
+    return *this;
+  }
+    
+  /// Copies out the 1-by-3 submatrix whose first element is (i, j)
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 3> slice_1x3(int i = 0, int j = 0) const {
+    Matrix<Element, 1, 3> m;
+
+    int const origin = i * kColumns + j;
+
+    for (int c = 0; c < 3; ++c) {
+      m.data[c] = data[origin + c];
+    }
+
+    return m;
+  }
+
+  /// Writes m over the 1-by-3 submatrix whose first element is (i, j)
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_1x3(Matrix<Element, 1, 3> const &m, int i = 0, int j = 0) {
+
+    int const origin = i * kColumns + j;
+
+    for (int c = 0; c < 3; ++c) {
+      data[origin + c] = m.data[c];
+    }
+
+    return *this;
+  }
+    
+  /// Copies out the 1-by-4 submatrix whose first element is (i, j)
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 4> slice_1x4(int i = 0, int j = 0) const {
+    Matrix<Element, 1, 4> m;
+
+    int const origin = i * kColumns + j;
+
+    for (int c = 0; c < 4; ++c) {
+      m.data[c] = data[origin + c];
+    }
+
+    return m;
+  }
+
+  /// Writes m over the 1-by-4 submatrix whose first element is (i, j)
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_1x4(Matrix<Element, 1, 4> const &m, int i = 0, int j = 0) {
+
+    int const origin = i * kColumns + j;
+
+    for (int c = 0; c < 4; ++c) {
+      data[origin + c] = m.data[c];
+    }
+
+    return *this;
+  }
+    
+  /// Returns row i as a 1-by-4 matrix (a full-width slice starting at column 0)
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 4> row(int i) const {
+    int const first_column = 0;
+    return slice_1x4(i, first_column);
+  }
+
+  /// Overwrites row i with v and returns *this
+  CUTLASS_HOST_DEVICE
+  Matrix &set_row(Matrix<Element, 1, 4> const &v, int i = 0) {
+    int const first_column = 0;
+    return set_slice_1x4(v, i, first_column);
+  }
+    
+  /// Copies out the 2-by-1 submatrix whose first element is (i, j)
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 1> slice_2x1(int i = 0, int j = 0) const {
+    Matrix<Element, 2, 1> m;
+
+    int const origin = i * kColumns + j;
+
+    // Consecutive rows are kColumns elements apart in storage.
+    for (int r = 0; r < 2; ++r) {
+      m.data[r] = data[origin + r * kColumns];
+    }
+
+    return m;
+  }
+
+  /// Writes m over the 2-by-1 submatrix whose first element is (i, j)
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_2x1(Matrix<Element, 2, 1> const &m, int i = 0, int j = 0) {
+
+    int const origin = i * kColumns + j;
+
+    // Consecutive rows are kColumns elements apart in storage.
+    for (int r = 0; r < 2; ++r) {
+      data[origin + r * kColumns] = m.data[r];
+    }
+
+    return *this;
+  }
+    
+  /// Returns column j as a 2-by-1 matrix (a full-height slice starting at row 0)
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 1> column(int j) const {
+    int const first_row = 0;
+    return slice_2x1(first_row, j);
+  }
+
+  /// Overwrites column j with v and returns *this
+  CUTLASS_HOST_DEVICE
+  Matrix &set_column(Matrix<Element, 2, 1> const &v, int j =0) {
+    int const first_row = 0;
+    return set_slice_2x1(v, first_row, j);
+  }
+    
+  /// Copies out the 2-by-2 submatrix whose top-left element is (i, j)
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 2> slice_2x2(int i = 0, int j = 0) const {
+    Matrix<Element, 2, 2> m;
+
+    int const origin = i * kColumns + j;
+
+    for (int r = 0; r < 2; ++r) {
+      for (int c = 0; c < 2; ++c) {
+        m.data[r * 2 + c] = data[origin + r * kColumns + c];
+      }
+    }
+
+    return m;
+  }
+
+  /// Writes m over the 2-by-2 submatrix whose top-left element is (i, j)
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_2x2(Matrix<Element, 2, 2> const &m, int i = 0, int j = 0) {
+
+    int const origin = i * kColumns + j;
+
+    for (int r = 0; r < 2; ++r) {
+      for (int c = 0; c < 2; ++c) {
+        data[origin + r * kColumns + c] = m.data[r * 2 + c];
+      }
+    }
+
+    return *this;
+  }
+    
+  /// Copies out the 2-by-3 submatrix whose top-left element is (i, j)
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 3> slice_2x3(int i = 0, int j = 0) const {
+    Matrix<Element, 2, 3> m;
+
+    int const origin = i * kColumns + j;
+
+    for (int r = 0; r < 2; ++r) {
+      for (int c = 0; c < 3; ++c) {
+        m.data[r * 3 + c] = data[origin + r * kColumns + c];
+      }
+    }
+
+    return m;
+  }
+
+  /// Writes m over the 2-by-3 submatrix whose top-left element is (i, j)
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_2x3(Matrix<Element, 2, 3> const &m, int i = 0, int j = 0) {
+
+    int const origin = i * kColumns + j;
+
+    for (int r = 0; r < 2; ++r) {
+      for (int c = 0; c < 3; ++c) {
+        data[origin + r * kColumns + c] = m.data[r * 3 + c];
+      }
+    }
+
+    return *this;
+  }
+    
+  /// Copies out eight consecutive stored elements starting at (i, j) as a 2-by-4 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 4> slice_2x4(int i = 0, int j = 0) const {
+    Matrix<Element, 2, 4> m;
+
+    // The slice is as wide as this matrix, so its elements are contiguous in storage.
+    int const origin = i * kColumns + j;
+
+    for (int idx = 0; idx < kCount; ++idx) {
+      m.data[idx] = data[origin + idx];
+    }
+
+    return m;
+  }
+
+  /// Writes the eight elements of m over consecutive stored elements starting at (i, j)
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_2x4(Matrix<Element, 2, 4> const &m, int i = 0, int j = 0) {
+
+    // The slice is as wide as this matrix, so its elements are contiguous in storage.
+    int const origin = i * kColumns + j;
+
+    for (int idx = 0; idx < kCount; ++idx) {
+      data[origin + idx] = m.data[idx];
+    }
+
+    return *this;
+  }
+    
+  /// Builds a 2-by-4 matrix as [lhs | rhs] from a 2-by-1 and a 2-by-3 matrix
+  CUTLASS_HOST_DEVICE
+  static Matrix hcat(Matrix<Element, 2, 1> const & lhs, Matrix<Element, 2, 3> const & rhs) {
+    Matrix joined(
+      lhs.at(0, 0), rhs.at(0, 0), rhs.at(0, 1), rhs.at(0, 2),
+      lhs.at(1, 0), rhs.at(1, 0), rhs.at(1, 1), rhs.at(1, 2));
+
+    return joined;
+  }
+  
+  /// Builds a 2-by-4 matrix as [lhs | rhs] from two 2-by-2 matrices
+  CUTLASS_HOST_DEVICE
+  static Matrix hcat(Matrix<Element, 2, 2> const & lhs, Matrix<Element, 2, 2> const & rhs) {
+    Matrix joined(
+      lhs.at(0, 0), lhs.at(0, 1), rhs.at(0, 0), rhs.at(0, 1),
+      lhs.at(1, 0), lhs.at(1, 1), rhs.at(1, 0), rhs.at(1, 1));
+
+    return joined;
+  }
+  
+  /// Builds a 2-by-4 matrix as [lhs | rhs] from a 2-by-3 and a 2-by-1 matrix
+  CUTLASS_HOST_DEVICE
+  static Matrix hcat(Matrix<Element, 2, 3> const & lhs, Matrix<Element, 2, 1> const & rhs) {
+    Matrix joined(
+      lhs.at(0, 0), lhs.at(0, 1), lhs.at(0, 2), rhs.at(0, 0),
+      lhs.at(1, 0), lhs.at(1, 1), lhs.at(1, 2), rhs.at(1, 0));
+
+    return joined;
+  }
+  
+  /// Builds a 2-by-4 matrix by placing the 1-by-4 row `upper` above `lower`
+  CUTLASS_HOST_DEVICE
+  static Matrix vcat(Matrix<Element, 1, 4> const & upper, Matrix<Element, 1, 4> const & lower) {
+    Matrix stacked(
+      upper.at(0, 0), upper.at(0, 1), upper.at(0, 2), upper.at(0, 3),
+      lower.at(0, 0), lower.at(0, 1), lower.at(0, 2), lower.at(0, 3));
+
+    return stacked;
+  }
+  
+  /// Stacks a 1-by-4 matrix below this one, delegating to the 3-by-4 static vcat
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 4> vcat(Matrix<Element, 1, 4> const & rhs) const {
+    using Stacked = Matrix<Element, 3, 4>;
+    return Stacked::vcat(*this, rhs);
+  }
+    
+  /// Stacks a 2-by-4 matrix below this one, delegating to the 4-by-4 static vcat
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 4> vcat(Matrix<Element, 2, 4> const & rhs) const {
+    using Stacked = Matrix<Element, 4, 4>;
+    return Stacked::vcat(*this, rhs);
+  }
+    
+  /// Builds a 2-by-4 matrix from the blocks [A B; C D], with scalar A, C and 1-by-3 B, D
+  CUTLASS_HOST_DEVICE
+  static Matrix block(
+    Element                         A, Matrix<Element, 1, 3> const & B,
+    Element                         C, Matrix<Element, 1, 3> const & D) {
+    Matrix assembled(
+      A, B.at(0, 0), B.at(0, 1), B.at(0, 2),
+      C, D.at(0, 0), D.at(0, 1), D.at(0, 2));
+
+    return assembled;
+  }
+  
+  /// Builds a 2-by-4 matrix from the blocks [A B; C D], each a 1-by-2 matrix
+  CUTLASS_HOST_DEVICE
+  static Matrix block(
+    Matrix<Element, 1, 2> const & A, Matrix<Element, 1, 2> const & B,
+    Matrix<Element, 1, 2> const & C, Matrix<Element, 1, 2> const & D) {
+    Matrix assembled(
+      A.at(0, 0), A.at(0, 1), B.at(0, 0), B.at(0, 1),
+      C.at(0, 0), C.at(0, 1), D.at(0, 0), D.at(0, 1));
+
+    return assembled;
+  }
+  
+  /// Builds a 2-by-4 matrix from the blocks [A B; C D], with 1-by-3 A, C and scalar B, D
+  CUTLASS_HOST_DEVICE
+  static Matrix block(
+    Matrix<Element, 1, 3> const & A, Element                         B,
+    Matrix<Element, 1, 3> const & C, Element                         D) {
+    Matrix assembled(
+      A.at(0, 0), A.at(0, 1), A.at(0, 2), B,
+      C.at(0, 0), C.at(0, 1), C.at(0, 2), D);
+
+    return assembled;
+  }
+  
+  /// Element-by-element sum of two 2-by-4 matrices
+  CUTLASS_HOST_DEVICE
+  Matrix add(Matrix const &rhs) const {
+
+    Matrix out;
+
+    for (int idx = 0; idx < kCount; ++idx) {
+      out.data[idx] = data[idx] + rhs.data[idx];
+    }
+
+    return out;
+  }
+      
+  /// Operator form of add(): element-by-element sum (2-by-4)
+  CUTLASS_HOST_DEVICE
+  Matrix operator +(Matrix const &rhs) const {
+    return this->add(rhs);
+  }
+
+  /// Adds rhs to this 2-by-4 matrix element-by-element, in place
+  CUTLASS_HOST_DEVICE
+  Matrix & operator +=(Matrix const &rhs) {
+
+    for (int idx = 0; idx < kCount; ++idx) {
+      data[idx] += rhs.data[idx];
+    }
+
+    return *this;
+  }
+        
+  /// Element-by-element difference of two 2-by-4 matrices (this - rhs)
+  CUTLASS_HOST_DEVICE
+  Matrix subtract(Matrix const &rhs) const {
+
+    Matrix out;
+
+    for (int idx = 0; idx < kCount; ++idx) {
+      out.data[idx] = data[idx] - rhs.data[idx];
+    }
+
+    return out;
+  }
+      
+  /// Operator form of subtract(): element-by-element difference (2-by-4)
+  CUTLASS_HOST_DEVICE
+  Matrix operator -(Matrix const &rhs) const {
+    return this->subtract(rhs);
+  }
+
+  /// Subtracts rhs from this 2-by-4 matrix element-by-element, in place
+  CUTLASS_HOST_DEVICE
+  Matrix & operator -=(Matrix const &rhs) {
+
+    for (int idx = 0; idx < kCount; ++idx) {
+      data[idx] -= rhs.data[idx];
+    }
+
+    return *this;
+  }
+        
+  /// Hadamard (element-by-element) product of two 2-by-4 matrices
+  CUTLASS_HOST_DEVICE
+  Matrix multiply(Matrix const &rhs) const {
+
+    Matrix out;
+
+    for (int idx = 0; idx < kCount; ++idx) {
+      out.data[idx] = data[idx] * rhs.data[idx];
+    }
+
+    return out;
+  }
+      
+  /// Returns a copy of this 2-by-4 matrix with every element scaled by s
+  CUTLASS_HOST_DEVICE
+  Matrix multiply(Element const &s) const {
+
+    Matrix out;
+
+    for (int idx = 0; idx < kCount; ++idx) {
+      out.data[idx] = data[idx] * s;
+    }
+
+    return out;
+  }
+
+  /// Operator form of multiply(Element): matrix-times-scalar (2-by-4)
+  CUTLASS_HOST_DEVICE
+  Matrix operator *(Element const &s) const {
+    return this->multiply(s);
+  }
+
+  /// Scales every element of this 2-by-4 matrix by s in place
+  CUTLASS_HOST_DEVICE
+  Matrix & operator *=(Element const &s) {
+
+    for (int idx = 0; idx < kCount; ++idx) {
+      data[idx] *= s;
+    }
+
+    return *this;
+  }
+        
+  /// Element-by-element quotient of two 2-by-4 matrices (this / rhs)
+  CUTLASS_HOST_DEVICE
+  Matrix divide(Matrix const &rhs) const {
+
+    Matrix out;
+
+    for (int idx = 0; idx < kCount; ++idx) {
+      out.data[idx] = data[idx] / rhs.data[idx];
+    }
+
+    return out;
+  }
+      
+  /// Returns a copy of this 2-by-4 matrix with every element divided by s
+  CUTLASS_HOST_DEVICE
+  Matrix divide(Element const &s) const {
+
+    Matrix out;
+
+    for (int idx = 0; idx < kCount; ++idx) {
+      out.data[idx] = data[idx] / s;
+    }
+
+    return out;
+  }
+
+  /// Operator form of divide(Element): matrix-over-scalar (2-by-4)
+  CUTLASS_HOST_DEVICE
+  Matrix operator /(Element const &s) const {
+    return this->divide(s);
+  }
+
+  /// Divides every element of this 2-by-4 matrix by s in place
+  CUTLASS_HOST_DEVICE
+  Matrix & operator /=(Element const &s) {
+
+    for (int idx = 0; idx < kCount; ++idx) {
+      data[idx] /= s;
+    }
+
+    return *this;
+  }
+        
+  /// Operator form of divide(Matrix): element-by-element quotient (2-by-4)
+  CUTLASS_HOST_DEVICE
+  Matrix operator /(Matrix const &rhs) const {
+    return this->divide(rhs);
+  }
+
+  /// Divides this 2-by-4 matrix by rhs element-by-element, in place
+  CUTLASS_HOST_DEVICE
+  Matrix & operator /=(Matrix const &rhs) {
+
+    for (int idx = 0; idx < kCount; ++idx) {
+      data[idx] /= rhs.data[idx];
+    }
+
+    return *this;
+  }
+        
+  /// Negates each element of the matrix
+  ///
+  /// Returns a new 2-by-4 matrix whose elements are the negation of the
+  /// corresponding elements of this matrix; *this is not modified.
+  CUTLASS_HOST_DEVICE
+  Matrix operator-() const {
+    Matrix m;
+    
+    // Read from this->data, not m.data: the default constructor clears m, so
+    // negating m's own elements would always produce a zero matrix.
+    m.data[0] = -data[0];
+    m.data[1] = -data[1];
+    m.data[2] = -data[2];
+    m.data[3] = -data[3];
+    m.data[4] = -data[4];
+    m.data[5] = -data[5];
+    m.data[6] = -data[6];
+    m.data[7] = -data[7];
+
+    return m;
+  }
+  
+  /// Matrix-vector product: (2-by-4) * (4-by-1), added onto accum
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 1> product(
+    Matrix<Element, 4, 1> const &rhs,
+    Matrix<Element, 2, 1> accum = Matrix<Element, 2, 1>()
+  ) const {
+
+    // k walks the shared dimension; i walks the rows of this matrix.
+    for (int k = 0; k < kColumns; ++k) {
+      for (int i = 0; i < kRows; ++i) {
+        accum.data[i] += data[i * kColumns + k] * rhs.data[k];
+      }
+    }
+
+    return accum;
+  }
+
+  /// Operator form of product() for a 4-by-1 right-hand side
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 1> operator*(Matrix<Element, 4, 1> const &rhs) const {
+    return this->product(rhs);
+  }
+  
+  /// Matrix product: (2-by-4) * (4-by-2), added onto accum
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 2> product(
+    Matrix<Element, 4, 2> const &rhs,
+    Matrix<Element, 2, 2> accum = Matrix<Element, 2, 2>()
+  ) const {
+
+    // k walks the shared dimension; (i, j) addresses the output element.
+    for (int k = 0; k < kColumns; ++k) {
+      for (int i = 0; i < kRows; ++i) {
+        for (int j = 0; j < 2; ++j) {
+          accum.data[i * 2 + j] += data[i * kColumns + k] * rhs.data[k * 2 + j];
+        }
+      }
+    }
+
+    return accum;
+  }
+
+  /// Operator form of product() for a 4-by-2 right-hand side
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 2> operator*(Matrix<Element, 4, 2> const &rhs) const {
+    return this->product(rhs);
+  }
+  
+  /// Matrix product: (2-by-4) * (4-by-3), added onto accum
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 3> product(
+    Matrix<Element, 4, 3> const &rhs,
+    Matrix<Element, 2, 3> accum = Matrix<Element, 2, 3>()
+  ) const {
+
+    // k walks the shared dimension; (i, j) addresses the output element.
+    for (int k = 0; k < kColumns; ++k) {
+      for (int i = 0; i < kRows; ++i) {
+        for (int j = 0; j < 3; ++j) {
+          accum.data[i * 3 + j] += data[i * kColumns + k] * rhs.data[k * 3 + j];
+        }
+      }
+    }
+
+    return accum;
+  }
+
+  /// Operator form of product() for a 4-by-3 right-hand side
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 3> operator*(Matrix<Element, 4, 3> const &rhs) const {
+    return this->product(rhs);
+  }
+  
+  /// Matrix product: (2-by-4) * (4-by-4), added onto accum
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 4> product(
+    Matrix<Element, 4, 4> const &rhs,
+    Matrix<Element, 2, 4> accum = Matrix<Element, 2, 4>()
+  ) const {
+
+    // k walks the shared dimension; (i, j) addresses the output element.
+    for (int k = 0; k < kColumns; ++k) {
+      for (int i = 0; i < kRows; ++i) {
+        for (int j = 0; j < 4; ++j) {
+          accum.data[i * 4 + j] += data[i * kColumns + k] * rhs.data[k * 4 + j];
+        }
+      }
+    }
+
+    return accum;
+  }
+
+  /// Matrix product of size 2-by-4-by-4
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 4> operator*(Matrix<Element, 4, 4> const &rhs) const {
+    return product(rhs);
+  }
+  
+  /// Matrix product of size 2-by-4-by-4
+  CUTLASS_HOST_DEVICE
+  Matrix & operator*=(Matrix<Element, 4, 4> const &rhs) {
+    *this = product(rhs);
+    return *this;
+  }
+    
+  /// Returns the sum of elements
+  CUTLASS_HOST_DEVICE
+  Element sum(Element accum = Element()) const {
+    
+    accum += data[0];
+    accum += data[1];
+    accum += data[2];
+    accum += data[3];
+    accum += data[4];
+    accum += data[5];
+    accum += data[6];
+    accum += data[7];
+
+    return accum;
+  }  
+
+  /// Returns the sum of squared elements
+  CUTLASS_HOST_DEVICE
+  Element norm(Element accum = Element()) const {
+    
+    accum += data[0] * data[0];
+    accum += data[1] * data[1];
+    accum += data[2] * data[2];
+    accum += data[3] * data[3];
+    accum += data[4] * data[4];
+    accum += data[5] * data[5];
+    accum += data[6] * data[6];
+    accum += data[7] * data[7];
+
+    return accum;
+  }
+
+  /// Returns square root of the norm
+  CUTLASS_HOST_DEVICE
+  Element magnitude() const {
+    return fast_sqrt(norm());
+  }
+
+  /// Returns the sum of diagonal elements
+  CUTLASS_HOST_DEVICE
+  Element trace(Element accum = Element()) const {
+    
+    accum += data[0];
+    accum += data[5];
+
+    return accum;
+  }
+    
+};
+
+/// Template alias for 2-by-4 matrix
+template <typename Element>
+using Matrix2x4 = Matrix<Element, 2, 4>;
+
+
+/// Free function to infer element type from template arguments
+template <typename Element>
+CUTLASS_HOST_DEVICE Matrix2x4<Element> make_Matrix2x4(
+    Element _0_0, Element _0_1, Element _0_2, Element _0_3, 
+    Element _1_0, Element _1_1, Element _1_2, Element _1_3
+) {
+  return Matrix2x4<Element>(
+  _0_0, _0_1, _0_2, _0_3, 
+  _1_0, _1_1, _1_2, _1_3 
+  );
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// 3-by-1 matrix template class definition
+template <typename Element_>
+struct Matrix<Element_, 3, 1> {
+
+  //
+  // Type definitions
+  //
+
+  /// Element data type
+  using Element = Element_;
+
+  /// Number of rows in matrix
+  static int const kRows = 3;
+
+  /// Number of columns in matrix
+  static int const kColumns = 1;
+
+  /// Layout of matrix in underlying array
+  using Layout = layout::RowMajor;
+
+  /// Number of elements in matrix
+  static int const kCount = 3;
+
+  //
+  // Data members
+  //
+
+  /// Elements of the matrix in row-major layout
+  Array<Element, kCount> data;
+
+  //
+  // Methods
+  //
+
+  /// Constructs a zero matrix
+  CUTLASS_HOST_DEVICE
+  Matrix() {
+    data.clear();
+  }
+  
+  /// Copy constructor for a 3-by-1 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix(Matrix const &rhs) {
+    data = rhs.data;
+  }
+    
+  /// Constructs a 3-by-1 matrix from scalar elements
+  CUTLASS_HOST_DEVICE
+  Matrix(
+    Element _0_0, 
+    Element _1_0, 
+    Element _2_0
+  ) {
+
+    data[0] = _0_0;
+    data[1] = _1_0;
+    data[2] = _2_0;
+  }
+    
+  /// Builds a 3-by-1 matrix in which every element is a copy of s.
+  /// Equivalent to constructing the matrix from the scalars (s, s, s).
+  CUTLASS_HOST_DEVICE
+  static Matrix uniform(Element s) {
+
+    // The scalar constructor stores its three arguments in data[0], data[1]
+    // and data[2] respectively, so repeating s fills every element.
+    Matrix filled(s, s, s);
+
+    return filled;
+  }
+
+  /// Constructs a matrix from a uniform element 1
+  CUTLASS_HOST_DEVICE
+  static Matrix ones() {
+    return uniform(Element(1));
+  }
+
+  /// Constructs a matrix from a uniform element 0
+  CUTLASS_HOST_DEVICE
+  static Matrix zero() {
+    return Matrix();
+  }
+  
+  /// Returns a transposed matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 3> transpose() const {
+    Matrix<Element, 1, 3> mt;
+    
+    mt.data[0] = data[0];
+    mt.data[1] = data[1];
+    mt.data[2] = data[2];
+
+    return mt;
+  }
+    
+  /// Accesses an element by coordinate (row-major: offset = i * 1 column + j)
+  CUTLASS_HOST_DEVICE
+  Element at(int i, int j) const {
+    return data[i * 1 + j];
+  }
+
+  /// Accesses an element by coordinate (row-major: offset = i * 1 column + j)
+  CUTLASS_HOST_DEVICE
+  Element & at(int i, int j) {
+    return data[i * 1 + j];
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element at(Coord<2> const &coord) const {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element & at(Coord<2> const &coord) {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element &at(int offset) {
+    return data[offset];
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element at(int offset) const {
+    return data[offset];
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element operator[](Coord<2> const &coord) const {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element & operator[](Coord<2> const &coord) {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element & operator[](int offset) {
+    return data[offset];
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element operator[](int offset) const {
+    return data[offset];
+  }
+  
+  /// Gets a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 1> slice_2x1(int i = 0, int j = 0) const {
+    Matrix<Element, 2, 1> m;
+    
+    m.data[0] = data[i * 1 + j + 0];
+    m.data[1] = data[i * 1 + j + 1];
+
+    return m;
+  }
+
+  /// Overwrites a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_2x1(Matrix<Element, 2, 1> const &m, int i = 0, int j = 0) {
+    
+    data[i * 1 + j + 0] = m.data[0];
+    data[i * 1 + j + 1] = m.data[1];
+
+    return *this;
+  }
+    
+  /// Gets a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 1> slice_3x1(int i = 0, int j = 0) const {
+    Matrix<Element, 3, 1> m;
+    
+    m.data[0] = data[i * 1 + j + 0];
+    m.data[1] = data[i * 1 + j + 1];
+    m.data[2] = data[i * 1 + j + 2];
+
+    return m;
+  }
+
+  /// Overwrites a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_3x1(Matrix<Element, 3, 1> const &m, int i = 0, int j = 0) {
+    
+    data[i * 1 + j + 0] = m.data[0];
+    data[i * 1 + j + 1] = m.data[1];
+    data[i * 1 + j + 2] = m.data[2];
+
+    return *this;
+  }
+    
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 1> column(int j) const {
+    return slice_3x1(0, j);
+  }
+
+  CUTLASS_HOST_DEVICE
+  Matrix &set_column(Matrix<Element, 3, 1> const &v, int j =0) {
+    return set_slice_3x1(v, 0, j);
+  }
+    
+  /// Concatenates this matrix with a 3-by-1 matrix to form a 3-by-2 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 2> hcat(Matrix<Element, 3, 1> const & rhs) const {
+    return Matrix<Element, 3, 2>::hcat(*this, rhs);
+  }
+    
+  /// Concatenates this matrix with a 3-by-2 matrix to form a 3-by-3 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 3> hcat(Matrix<Element, 3, 2> const & rhs) const {
+    return Matrix<Element, 3, 3>::hcat(*this, rhs);
+  }
+    
+  /// Concatenates this matrix with a 3-by-3 matrix to form a 3-by-4 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 4> hcat(Matrix<Element, 3, 3> const & rhs) const {
+    return Matrix<Element, 3, 4>::hcat(*this, rhs);
+  }
+    
+  /// Forms a 3-by-1 matrix by stacking an Element on top of a 2-by-1 matrix: (upper, lower[0], lower[1])
+  CUTLASS_HOST_DEVICE
+  static Matrix vcat(Element upper, Matrix<Element, 2, 1> const & lower) {
+    return Matrix(
+      upper
+      , lower.data[0]    // row 0 of the 2-by-1 column
+      , lower.data[1]);  // row 1; read from storage so the result does not depend on at(i, j) offset arithmetic
+  }
+  
+  /// Forms a 3-by-1 matrix by stacking a 2-by-1 matrix on top of an Element: (upper[0], upper[1], lower)
+  CUTLASS_HOST_DEVICE
+  static Matrix vcat(Matrix<Element, 2, 1> const & upper, Element lower) {
+    return Matrix(
+      upper.data[0]      // row 0 of the 2-by-1 column
+      , upper.data[1]    // row 1; read from storage so the result does not depend on at(i, j) offset arithmetic
+      , lower);
+  }
+  
+  /// Concatenates this matrix with an Element to form a 4-by-1 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 1> vcat(Element rhs) const {
+    return Matrix<Element, 4, 1>::vcat(*this, rhs);
+  }
+    
+  /// Elementwise sum (3-by-1): element k of the result is data[k] + rhs.data[k]
+  CUTLASS_HOST_DEVICE
+  Matrix add(Matrix const &rhs) const {
+
+    // Form the three sums first, then hand them to the scalar constructor,
+    // which stores its arguments in data[0], data[1], data[2] in order.
+    Element const top = data[0] + rhs.data[0];
+
+    Element const mid = data[1] + rhs.data[1];
+
+    Element const bottom = data[2] + rhs.data[2];
+
+    return Matrix(top, mid, bottom);
+  }
+      
+  /// Elementwise add operator (3-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix operator +(Matrix const &rhs) const {
+    return add(rhs);
+  }
+
+  /// Elementwise add operator (3-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator +=(Matrix const &rhs) {
+    
+    data[0] += rhs.data[0];
+
+    data[1] += rhs.data[1];
+
+    data[2] += rhs.data[2];
+
+    return *this;
+  }
+        
+  /// Elementwise difference (3-by-1): element k of the result is data[k] - rhs.data[k]
+  CUTLASS_HOST_DEVICE
+  Matrix subtract(Matrix const &rhs) const {
+
+    // Form the three differences first, then hand them to the scalar
+    // constructor, which stores them in data[0], data[1], data[2] in order.
+    Element const top = data[0] - rhs.data[0];
+
+    Element const mid = data[1] - rhs.data[1];
+
+    Element const bottom = data[2] - rhs.data[2];
+
+    return Matrix(top, mid, bottom);
+  }
+      
+  /// Elementwise subtract operator (3-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix operator -(Matrix const &rhs) const {
+    return subtract(rhs);
+  }
+
+  /// Elementwise subtract operator (3-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator -=(Matrix const &rhs) {
+    
+    data[0] -= rhs.data[0];
+
+    data[1] -= rhs.data[1];
+
+    data[2] -= rhs.data[2];
+
+    return *this;
+  }
+        
+  /// Elementwise (Hadamard) product (3-by-1): element k of the result is data[k] * rhs.data[k]
+  CUTLASS_HOST_DEVICE
+  Matrix multiply(Matrix const &rhs) const {
+
+    // Form the three products first, then hand them to the scalar
+    // constructor, which stores them in data[0], data[1], data[2] in order.
+    Element const top = data[0] * rhs.data[0];
+
+    Element const mid = data[1] * rhs.data[1];
+
+    Element const bottom = data[2] * rhs.data[2];
+
+    return Matrix(top, mid, bottom);
+  }
+      
+  /// Scalar product (3-by-1): element k of the result is data[k] * s
+  CUTLASS_HOST_DEVICE
+  Matrix multiply(Element const &s) const {
+
+    // Scale each element (scalar on the right-hand side of the product),
+    // then pass the results to the scalar constructor in storage order.
+    Element const top = data[0] * s;
+
+    Element const mid = data[1] * s;
+
+    Element const bottom = data[2] * s;
+
+    return Matrix(top, mid, bottom);
+  }
+
+  /// Scalar multiply operator (3-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix operator *(Element const &s) const {
+    return multiply(s);
+  }
+
+  /// Scalar multiply operator (3-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator *=(Element const &s) {
+    
+    data[0] *= s;
+
+    data[1] *= s;
+
+    data[2] *= s;
+
+    return *this;
+  }
+        
+  /// Elementwise quotient (3-by-1): element k of the result is data[k] / rhs.data[k]
+  CUTLASS_HOST_DEVICE
+  Matrix divide(Matrix const &rhs) const {
+
+    // Form the three quotients first, then hand them to the scalar
+    // constructor, which stores them in data[0], data[1], data[2] in order.
+    Element const top = data[0] / rhs.data[0];
+
+    Element const mid = data[1] / rhs.data[1];
+
+    Element const bottom = data[2] / rhs.data[2];
+
+    return Matrix(top, mid, bottom);
+  }
+      
+  /// Scalar quotient (3-by-1): element k of the result is data[k] / s
+  CUTLASS_HOST_DEVICE
+  Matrix divide(Element const &s) const {
+
+    // Divide each element by s, then pass the results to the scalar
+    // constructor, which stores them in data[0], data[1], data[2] in order.
+    Element const top = data[0] / s;
+
+    Element const mid = data[1] / s;
+
+    Element const bottom = data[2] / s;
+
+    return Matrix(top, mid, bottom);
+  }
+
+  /// Scalar divide operator (3-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix operator /(Element const &s) const {
+    return divide(s);
+  }
+
+  /// Scalar divide operator (3-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator /=(Element const &s) {
+    
+    data[0] /= s;
+
+    data[1] /= s;
+
+    data[2] /= s;
+
+    return *this;
+  }
+        
+  /// Elementwise divide operator (3-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix operator /(Matrix const &rhs) const {
+    return divide(rhs);
+  }
+
+  /// Elementwise divide operator (3-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator /=(Matrix const &rhs) {
+    
+    data[0] /= rhs.data[0];
+
+    data[1] /= rhs.data[1];
+
+    data[2] /= rhs.data[2];
+
+    return *this;
+  }
+        
+  /// Returns a new matrix whose elements are the negated elements of this matrix
+  CUTLASS_HOST_DEVICE
+  Matrix operator-() const {
+    Matrix m;
+
+    m.data[0] = -data[0];  // negate this matrix's element, not the zero-initialized temporary
+    m.data[1] = -data[1];
+    m.data[2] = -data[2];
+
+    return m;
+  }
+  
+  /// Matrix product of size 3-by-1-by-1
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 1> product(
+    Matrix<Element, 1, 1> const &rhs,
+    Matrix<Element, 3, 1> accum = Matrix<Element, 3, 1>()
+  ) const {
+    
+    // k=0
+    accum.data[0] += data[0] * rhs.data[0];
+    accum.data[1] += data[1] * rhs.data[0];
+    accum.data[2] += data[2] * rhs.data[0];
+
+    return accum;
+  }
+
+  /// Matrix product of size 3-by-1-by-1
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 1> operator*(Matrix<Element, 1, 1> const &rhs) const {
+    return product(rhs);
+  }
+  
+  /// Matrix product of size 3-by-1-by-1
+  CUTLASS_HOST_DEVICE
+  Matrix & operator*=(Matrix<Element, 1, 1> const &rhs) {
+    *this = product(rhs);
+    return *this;
+  }
+    
+  /// Matrix product of size 3-by-2-by-1
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 2> product(
+    Matrix<Element, 1, 2> const &rhs,
+    Matrix<Element, 3, 2> accum = Matrix<Element, 3, 2>()
+  ) const {
+    
+    // k=0
+    accum.data[0] += data[0] * rhs.data[0];
+    accum.data[1] += data[0] * rhs.data[1];
+    accum.data[2] += data[1] * rhs.data[0];
+    accum.data[3] += data[1] * rhs.data[1];
+    accum.data[4] += data[2] * rhs.data[0];
+    accum.data[5] += data[2] * rhs.data[1];
+
+    return accum;
+  }
+
+  /// Matrix product of size 3-by-2-by-1
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 2> operator*(Matrix<Element, 1, 2> const &rhs) const {
+    return product(rhs);
+  }
+  
+  /// Matrix product of size 3-by-3-by-1
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 3> product(
+    Matrix<Element, 1, 3> const &rhs,
+    Matrix<Element, 3, 3> accum = Matrix<Element, 3, 3>()
+  ) const {
+    
+    // k=0
+    accum.data[0] += data[0] * rhs.data[0];
+    accum.data[1] += data[0] * rhs.data[1];
+    accum.data[2] += data[0] * rhs.data[2];
+    accum.data[3] += data[1] * rhs.data[0];
+    accum.data[4] += data[1] * rhs.data[1];
+    accum.data[5] += data[1] * rhs.data[2];
+    accum.data[6] += data[2] * rhs.data[0];
+    accum.data[7] += data[2] * rhs.data[1];
+    accum.data[8] += data[2] * rhs.data[2];
+
+    return accum;
+  }
+
+  /// Matrix product of size 3-by-3-by-1
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 3> operator*(Matrix<Element, 1, 3> const &rhs) const {
+    return product(rhs);
+  }
+  
+  /// Matrix product of size 3-by-4-by-1
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 4> product(
+    Matrix<Element, 1, 4> const &rhs,
+    Matrix<Element, 3, 4> accum = Matrix<Element, 3, 4>()
+  ) const {
+    
+    // k=0
+    accum.data[0] += data[0] * rhs.data[0];
+    accum.data[1] += data[0] * rhs.data[1];
+    accum.data[2] += data[0] * rhs.data[2];
+    accum.data[3] += data[0] * rhs.data[3];
+    accum.data[4] += data[1] * rhs.data[0];
+    accum.data[5] += data[1] * rhs.data[1];
+    accum.data[6] += data[1] * rhs.data[2];
+    accum.data[7] += data[1] * rhs.data[3];
+    accum.data[8] += data[2] * rhs.data[0];
+    accum.data[9] += data[2] * rhs.data[1];
+    accum.data[10] += data[2] * rhs.data[2];
+    accum.data[11] += data[2] * rhs.data[3];
+
+    return accum;
+  }
+
+  /// Matrix product of size 3-by-4-by-1
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 4> operator*(Matrix<Element, 1, 4> const &rhs) const {
+    return product(rhs);
+  }
+  
+  /// Dot product of vectors with extent 3
+  CUTLASS_HOST_DEVICE
+  Element dot(Matrix<Element, 3, 1> const &rhs, Element accum = Element()) const {
+    
+    accum += data[0] * rhs.data[0];
+    accum += data[1] * rhs.data[1];
+    accum += data[2] * rhs.data[2];
+    return accum;
+  }
+
+  /// Dot product of vectors with extent 3
+  CUTLASS_HOST_DEVICE
+  Element dot(Matrix<Element, 1, 3> const &rhs, Element accum = Element()) const {
+    
+    accum += data[0] * rhs.data[0];
+    accum += data[1] * rhs.data[1];
+    accum += data[2] * rhs.data[2];
+    return accum;
+  }
+  
+  /// Returns the sum of elements
+  CUTLASS_HOST_DEVICE
+  Element sum(Element accum = Element()) const {
+    
+    accum += data[0];
+    accum += data[1];
+    accum += data[2];
+
+    return accum;
+  }  
+
+  /// Returns the sum of squared elements
+  CUTLASS_HOST_DEVICE
+  Element norm(Element accum = Element()) const {
+    
+    accum += data[0] * data[0];
+    accum += data[1] * data[1];
+    accum += data[2] * data[2];
+
+    return accum;
+  }
+
+  /// Returns square root of the norm
+  CUTLASS_HOST_DEVICE
+  Element magnitude() const {
+    return fast_sqrt(norm());
+  }
+
+  /// Returns the sum of diagonal elements
+  CUTLASS_HOST_DEVICE
+  Element trace(Element accum = Element()) const {
+    
+    accum += data[0];
+
+    return accum;
+  }
+    
+  /// Cross product of two 3-vectors (this x rhs): (a1*b2 - a2*b1, a2*b0 - a0*b2, a0*b1 - a1*b0)
+  CUTLASS_HOST_DEVICE
+  Matrix cross(Matrix const &rhs) const {
+    return Matrix(
+      data[1] * rhs.data[2] - data[2] * rhs.data[1],
+      data[2] * rhs.data[0] - data[0] * rhs.data[2],
+      data[0] * rhs.data[1] - data[1] * rhs.data[0]
+    );
+  }
+  
+};
+
+/// Template alias for 3-by-1 matrix
+template <typename Element>
+using Matrix3x1 = Matrix<Element, 3, 1>;
+
+
+/// Free function to infer element type from template arguments
+template <typename Element>
+CUTLASS_HOST_DEVICE Matrix3x1<Element> make_Matrix3x1(
+    Element _0_0, 
+    Element _1_0, 
+    Element _2_0
+) {
+  return Matrix3x1<Element>(
+  _0_0, 
+  _1_0, 
+  _2_0 
+  );
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// 3-by-2 matrix template class definition
+template <typename Element_>
+struct Matrix<Element_, 3, 2> {
+
+  //
+  // Type definitions
+  //
+
+  /// Element data type
+  using Element = Element_;
+
+  /// Number of rows in matrix
+  static int const kRows = 3;
+
+  /// Number of columns in matrix
+  static int const kColumns = 2;
+
+  /// Layout of matrix in underlying array
+  using Layout = layout::RowMajor;
+
+  /// Number of elements in matrix
+  static int const kCount = 6;
+
+  //
+  // Data members
+  //
+
+  /// Elements of the matrix in row-major layout
+  Array<Element, kCount> data;
+
+  //
+  // Methods
+  //
+
+  /// Constructs a zero matrix
+  CUTLASS_HOST_DEVICE
+  Matrix() {
+    data.clear();
+  }
+  
+  /// Copy constructor for a 3-by-2 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix(Matrix const &rhs) {
+    data = rhs.data;
+  }
+    
+  /// Constructs a 3-by-2 matrix from scalar elements
+  CUTLASS_HOST_DEVICE
+  Matrix(
+    Element _0_0, Element _0_1, 
+    Element _1_0, Element _1_1, 
+    Element _2_0, Element _2_1
+  ) {
+
+    data[0] = _0_0;  data[1] = _0_1;
+    data[2] = _1_0;  data[3] = _1_1;
+    data[4] = _2_0;  data[5] = _2_1;
+  }
+    
+  /// Constructs a 3-by-2 matrix from row vectors
+  CUTLASS_HOST_DEVICE
+  Matrix(
+    Matrix<Element, 1, 2> const &row_0,
+    Matrix<Element, 1, 2> const &row_1,
+    Matrix<Element, 1, 2> const &row_2
+  ) { 
+    data[0] = row_0.data[0];
+    data[1] = row_0.data[1];
+    data[2] = row_1.data[0];
+    data[3] = row_1.data[1];
+    data[4] = row_2.data[0];
+    data[5] = row_2.data[1];
+  }
+    
+  /// Static method to construct a 3-by-2 matrix from two 3-by-1 column vectors (each column supplies three rows)
+  CUTLASS_HOST_DEVICE
+  static Matrix from_columns(
+    Matrix<Element, 3, 1> const &column_0,
+    Matrix<Element, 3, 1> const &column_1
+  ) { 
+    Matrix result;
+    
+    result.data[0] = column_0.data[0];  // row 0
+    result.data[1] = column_1.data[0];
+    result.data[2] = column_0.data[1];  // row 1
+    result.data[3] = column_1.data[1];
+    result.data[4] = column_0.data[2];  // row 2
+    result.data[5] = column_1.data[2];
+    return result;
+  }
+    
+  /// Builds a 3-by-2 matrix in which every element is a copy of s.
+  /// Equivalent to constructing the matrix from six scalars that all equal s.
+  CUTLASS_HOST_DEVICE
+  static Matrix uniform(Element s) {
+
+    // The scalar constructor stores its arguments row by row in data[0..5],
+    // so repeating s six times fills every slot of the matrix.
+    Matrix filled(
+      s, s,
+      s, s,
+      s, s);
+
+    return filled;
+  }
+
+  /// Constructs a matrix from a uniform element 1
+  CUTLASS_HOST_DEVICE
+  static Matrix ones() {
+    return uniform(Element(1));
+  }
+
+  /// Constructs a matrix from a uniform element 0
+  CUTLASS_HOST_DEVICE
+  static Matrix zero() {
+    return Matrix();
+  }
+  
+  /// Constructs a matrix whose main diagonal, (0,0) and (1,1), is taken from a 2-by-1 vector; other elements are zero
+  CUTLASS_HOST_DEVICE
+  static Matrix from_diagonal(Matrix<Element, 2, 1> const &diag) {
+    Matrix m;
+
+    // Row-major with 2 columns: (0,0) is data[0] and (1,1) is data[1 * 2 + 1] = data[3]
+    m.data[0] = diag.data[0];
+    m.data[3] = diag.data[1];
+
+    return m;
+  }
+
+  /// Constructs a matrix whose main diagonal, (0,0) and (1,1), is taken from a 1-by-2 vector; other elements are zero
+  CUTLASS_HOST_DEVICE
+  static Matrix from_diagonal(Matrix<Element, 1, 2> const &diag) {
+    Matrix m;
+
+    // Row-major with 2 columns: (0,0) is data[0] and (1,1) is data[1 * 2 + 1] = data[3]
+    m.data[0] = diag.data[0];
+    m.data[3] = diag.data[1];
+
+    return m;
+  }
+
+  /// Gets the main-diagonal elements (0,0) and (1,1) as a 2-by-1 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 1> diagonal() const {
+    Matrix<Element, 2, 1> diag;
+
+    // Row-major with 2 columns: (0,0) is data[0] and (1,1) is data[1 * 2 + 1] = data[3]
+    diag.data[0] = data[0];
+    diag.data[1] = data[3];
+
+    return diag;
+  }
+    
+  /// Returns a transposed matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 3> transpose() const {
+    Matrix<Element, 2, 3> mt;
+    
+    mt.data[0] = data[0];
+    mt.data[3] = data[1];
+    mt.data[1] = data[2];
+    mt.data[4] = data[3];
+    mt.data[2] = data[4];
+    mt.data[5] = data[5];
+
+    return mt;
+  }
+    
+  /// Accesses an element by coordinate (row-major: offset = i * 2 columns + j)
+  CUTLASS_HOST_DEVICE
+  Element at(int i, int j) const {
+    return data[i * 2 + j];
+  }
+
+  /// Accesses an element by coordinate (row-major: offset = i * 2 columns + j)
+  CUTLASS_HOST_DEVICE
+  Element & at(int i, int j) {
+    return data[i * 2 + j];
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element at(Coord<2> const &coord) const {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element & at(Coord<2> const &coord) {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element &at(int offset) {
+    return data[offset];
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element at(int offset) const {
+    return data[offset];
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element operator[](Coord<2> const &coord) const {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element & operator[](Coord<2> const &coord) {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element & operator[](int offset) {
+    return data[offset];
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element operator[](int offset) const {
+    return data[offset];
+  }
+  
+  /// Gets a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 2> slice_1x2(int i = 0, int j = 0) const {
+    Matrix<Element, 1, 2> m;
+    
+    m.data[0] = data[i * 2 + j + 0];
+    m.data[1] = data[i * 2 + j + 1];
+
+    return m;
+  }
+
+  /// Overwrites a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_1x2(Matrix<Element, 1, 2> const &m, int i = 0, int j = 0) {
+    
+    data[i * 2 + j + 0] = m.data[0];
+    data[i * 2 + j + 1] = m.data[1];
+
+    return *this;
+  }
+    
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 2> row(int i) const {
+    return slice_1x2(i, 0);
+  }
+
+  CUTLASS_HOST_DEVICE
+  Matrix &set_row(Matrix<Element, 1, 2> const &v, int i = 0) {
+    return set_slice_1x2(v, i, 0);
+  }
+    
+  /// Gets a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 1> slice_2x1(int i = 0, int j = 0) const {
+    Matrix<Element, 2, 1> m;
+    
+    m.data[0] = data[i * 2 + j + 0];
+    m.data[1] = data[i * 2 + j + 2];
+
+    return m;
+  }
+
+  /// Overwrites a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_2x1(Matrix<Element, 2, 1> const &m, int i = 0, int j = 0) {
+    
+    data[i * 2 + j + 0] = m.data[0];
+    data[i * 2 + j + 2] = m.data[1];
+
+    return *this;
+  }
+    
+  /// Gets a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 2> slice_2x2(int i = 0, int j = 0) const {
+    Matrix<Element, 2, 2> m;
+    
+    m.data[0] = data[i * 2 + j + 0];
+    m.data[1] = data[i * 2 + j + 1];
+    m.data[2] = data[i * 2 + j + 2];
+    m.data[3] = data[i * 2 + j + 3];
+
+    return m;
+  }
+
+  /// Overwrites a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_2x2(Matrix<Element, 2, 2> const &m, int i = 0, int j = 0) {
+    
+    data[i * 2 + j + 0] = m.data[0];
+    data[i * 2 + j + 1] = m.data[1];
+    data[i * 2 + j + 2] = m.data[2];
+    data[i * 2 + j + 3] = m.data[3];
+
+    return *this;
+  }
+    
+  /// Gets a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 1> slice_3x1(int i = 0, int j = 0) const {
+    Matrix<Element, 3, 1> m;
+    
+    m.data[0] = data[i * 2 + j + 0];
+    m.data[1] = data[i * 2 + j + 2];
+    m.data[2] = data[i * 2 + j + 4];
+
+    return m;
+  }
+
+  /// Overwrites a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_3x1(Matrix<Element, 3, 1> const &m, int i = 0, int j = 0) {
+    
+    data[i * 2 + j + 0] = m.data[0];
+    data[i * 2 + j + 2] = m.data[1];
+    data[i * 2 + j + 4] = m.data[2];
+
+    return *this;
+  }
+    
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 1> column(int j) const {
+    return slice_3x1(0, j);
+  }
+
+  CUTLASS_HOST_DEVICE
+  Matrix &set_column(Matrix<Element, 3, 1> const &v, int j =0) {
+    return set_slice_3x1(v, 0, j);
+  }
+    
+  /// Gets a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 2> slice_3x2(int i = 0, int j = 0) const {
+    Matrix<Element, 3, 2> m;
+    
+    m.data[0] = data[i * 2 + j + 0];
+    m.data[1] = data[i * 2 + j + 1];
+    m.data[2] = data[i * 2 + j + 2];
+    m.data[3] = data[i * 2 + j + 3];
+    m.data[4] = data[i * 2 + j + 4];
+    m.data[5] = data[i * 2 + j + 5];
+
+    return m;
+  }
+
+  /// Overwrites a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_3x2(Matrix<Element, 3, 2> const &m, int i = 0, int j = 0) {
+    
+    data[i * 2 + j + 0] = m.data[0];
+    data[i * 2 + j + 1] = m.data[1];
+    data[i * 2 + j + 2] = m.data[2];
+    data[i * 2 + j + 3] = m.data[3];
+    data[i * 2 + j + 4] = m.data[4];
+    data[i * 2 + j + 5] = m.data[5];
+
+    return *this;
+  }
+    
+  /// Forms a 3-by-2 matrix whose column 0 is lhs and column 1 is rhs (row r of a 3-by-1 column is its data[r])
+  CUTLASS_HOST_DEVICE
+  static Matrix hcat(Matrix<Element, 3, 1> const & lhs, Matrix<Element, 3, 1> const & rhs) {
+    return Matrix(
+      lhs.data[0], rhs.data[0]
+      , lhs.data[1], rhs.data[1]
+      , lhs.data[2], rhs.data[2]);
+  }
+  
+  /// Concatenates this matrix with a 3-by-1 matrix to form a 3-by-3 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 3> hcat(Matrix<Element, 3, 1> const & rhs) const {
+    return Matrix<Element, 3, 3>::hcat(*this, rhs);
+  }
+    
+  /// Concatenates this matrix with a 3-by-2 matrix to form a 3-by-4 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 4> hcat(Matrix<Element, 3, 2> const & rhs) const {
+    return Matrix<Element, 3, 4>::hcat(*this, rhs);
+  }
+    
+  /// Forms a 3-by-2 matrix by vertically concatenating a 1-by-2 matrix with a 2-by-2 matrix
+  CUTLASS_HOST_DEVICE
+  static Matrix vcat(Matrix<Element, 1, 2> const & upper, Matrix<Element, 2, 2> const & lower) {
+    return Matrix(
+      upper.at(0, 0), upper.at(0, 1)
+      , lower.at(0, 0), lower.at(0, 1)
+      , lower.at(1, 0), lower.at(1, 1));
+  }
+  
+  /// Forms a 3-by-2 matrix by vertically concatenating a 2-by-2 matrix with a 1-by-2 matrix
+  CUTLASS_HOST_DEVICE
+  static Matrix vcat(Matrix<Element, 2, 2> const & upper, Matrix<Element, 1, 2> const & lower) {
+    return Matrix(
+      upper.at(0, 0), upper.at(0, 1)
+      , upper.at(1, 0), upper.at(1, 1)
+      , lower.at(0, 0), lower.at(0, 1));
+  }
+  
+  /// Concatenates this matrix with a 1-by-2 matrix to form a 4-by-2 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 2> vcat(Matrix<Element, 1, 2> const & rhs) const {
+    return Matrix<Element, 4, 2>::vcat(*this, rhs);
+  }
+    
+  /// Forms a 3-by-2 matrix from blocks [A B; C D]: scalars A, B on row 0 and 2-by-1 columns C, D on rows 1-2
+  CUTLASS_HOST_DEVICE
+  static Matrix block(
+    Element                         A, Element                         B,
+    Matrix<Element, 2, 1> const & C, Matrix<Element, 2, 1> const & D) {
+    return Matrix(
+      A, B
+      , C.data[0], D.data[0]    // row r of a 2-by-1 column is its data[r]
+      , C.data[1], D.data[1]
+    );
+  }
+  
+  /// Forms a 3-by-2 matrix from blocks [A B; C D]: 2-by-1 columns A, B on rows 0-1 and scalars C, D on row 2
+  CUTLASS_HOST_DEVICE
+  static Matrix block(
+    Matrix<Element, 2, 1> const & A, Matrix<Element, 2, 1> const & B,
+    Element                         C, Element                         D) {
+    return Matrix(
+      A.data[0], B.data[0]      // row r of a 2-by-1 column is its data[r]
+      , A.data[1], B.data[1]
+      , C, D
+    );
+  }
+  
+  /// Elementwise add operator (3-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix add(Matrix const &rhs) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] + rhs.data[0];
+    result.data[1] = data[1] + rhs.data[1];
+
+    result.data[2] = data[2] + rhs.data[2];
+    result.data[3] = data[3] + rhs.data[3];
+
+    result.data[4] = data[4] + rhs.data[4];
+    result.data[5] = data[5] + rhs.data[5];
+
+    return result;
+  }
+      
+  /// Elementwise add operator (3-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix operator +(Matrix const &rhs) const {
+    return add(rhs);
+  }
+
+  /// Elementwise add operator (3-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator +=(Matrix const &rhs) {
+    
+    data[0] += rhs.data[0];
+    data[1] += rhs.data[1];
+
+    data[2] += rhs.data[2];
+    data[3] += rhs.data[3];
+
+    data[4] += rhs.data[4];
+    data[5] += rhs.data[5];
+
+    return *this;
+  }
+        
+  /// Elementwise subtract operator (3-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix subtract(Matrix const &rhs) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] - rhs.data[0];
+    result.data[1] = data[1] - rhs.data[1];
+
+    result.data[2] = data[2] - rhs.data[2];
+    result.data[3] = data[3] - rhs.data[3];
+
+    result.data[4] = data[4] - rhs.data[4];
+    result.data[5] = data[5] - rhs.data[5];
+
+    return result;
+  }
+      
+  /// Elementwise subtract operator (3-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix operator -(Matrix const &rhs) const {
+    return subtract(rhs);
+  }
+
+  /// Elementwise subtract operator (3-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator -=(Matrix const &rhs) {
+    
+    data[0] -= rhs.data[0];
+    data[1] -= rhs.data[1];
+
+    data[2] -= rhs.data[2];
+    data[3] -= rhs.data[3];
+
+    data[4] -= rhs.data[4];
+    data[5] -= rhs.data[5];
+
+    return *this;
+  }
+        
+  /// Elementwise multiply operator (3-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix multiply(Matrix const &rhs) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] * rhs.data[0];
+    result.data[1] = data[1] * rhs.data[1];
+
+    result.data[2] = data[2] * rhs.data[2];
+    result.data[3] = data[3] * rhs.data[3];
+
+    result.data[4] = data[4] * rhs.data[4];
+    result.data[5] = data[5] * rhs.data[5];
+
+    return result;
+  }
+      
+  /// Scalar multiply operator (3-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix multiply(Element const &s) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] * s;
+    result.data[1] = data[1] * s;
+
+    result.data[2] = data[2] * s;
+    result.data[3] = data[3] * s;
+
+    result.data[4] = data[4] * s;
+    result.data[5] = data[5] * s;
+
+    return result;
+  }
+
+  /// Scalar multiply operator (3-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix operator *(Element const &s) const {
+    return multiply(s);
+  }
+
+  /// Scalar multiply operator (3-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator *=(Element const &s) {
+    
+    data[0] *= s;
+    data[1] *= s;
+
+    data[2] *= s;
+    data[3] *= s;
+
+    data[4] *= s;
+    data[5] *= s;
+
+    return *this;
+  }
+        
+  /// Elementwise divide operator (3-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix divide(Matrix const &rhs) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] / rhs.data[0];
+    result.data[1] = data[1] / rhs.data[1];
+
+    result.data[2] = data[2] / rhs.data[2];
+    result.data[3] = data[3] / rhs.data[3];
+
+    result.data[4] = data[4] / rhs.data[4];
+    result.data[5] = data[5] / rhs.data[5];
+
+    return result;
+  }
+      
+  /// Scalar divide operator (3-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix divide(Element const &s) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] / s;
+    result.data[1] = data[1] / s;
+
+    result.data[2] = data[2] / s;
+    result.data[3] = data[3] / s;
+
+    result.data[4] = data[4] / s;
+    result.data[5] = data[5] / s;
+
+    return result;
+  }
+
+  /// Scalar divide operator (3-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix operator /(Element const &s) const {
+    return divide(s);
+  }
+
+  /// Scalar divide operator (3-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator /=(Element const &s) {
+    
+    data[0] /= s;
+    data[1] /= s;
+
+    data[2] /= s;
+    data[3] /= s;
+
+    data[4] /= s;
+    data[5] /= s;
+
+    return *this;
+  }
+        
+  /// Elementwise divide operator (3-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix operator /(Matrix const &rhs) const {
+    return divide(rhs);
+  }
+
+  /// Elementwise divide operator (3-by-2)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator /=(Matrix const &rhs) {
+    
+    data[0] /= rhs.data[0];
+    data[1] /= rhs.data[1];
+
+    data[2] /= rhs.data[2];
+    data[3] /= rhs.data[3];
+
+    data[4] /= rhs.data[4];
+    data[5] /= rhs.data[5];
+
+    return *this;
+  }
+        
+  /// Returns a new matrix holding the negation of each element of this matrix
+  CUTLASS_HOST_DEVICE
+  Matrix operator-() const {
+    Matrix m;
+    // Negate this matrix's elements; m is freshly constructed and holds none of them.
+    m.data[0] = -data[0];
+    m.data[1] = -data[1];
+    m.data[2] = -data[2];
+    m.data[3] = -data[3];
+    m.data[4] = -data[4];
+    m.data[5] = -data[5];
+
+    return m;
+  }
+  
+  /// Matrix product of size 3-by-1-by-2
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 1> product(
+    Matrix<Element, 2, 1> const &rhs,
+    Matrix<Element, 3, 1> accum = Matrix<Element, 3, 1>()
+  ) const {
+    
+    // k=0
+    accum.data[0] += data[0] * rhs.data[0];
+    accum.data[1] += data[2] * rhs.data[0];
+    accum.data[2] += data[4] * rhs.data[0];
+
+    // k=1
+    accum.data[0] += data[1] * rhs.data[1];
+    accum.data[1] += data[3] * rhs.data[1];
+    accum.data[2] += data[5] * rhs.data[1];
+
+    return accum;
+  }
+
+  /// Matrix product of size 3-by-1-by-2
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 1> operator*(Matrix<Element, 2, 1> const &rhs) const {
+    return product(rhs);
+  }
+  
+  /// Matrix product of size 3-by-2-by-2
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 2> product(
+    Matrix<Element, 2, 2> const &rhs,
+    Matrix<Element, 3, 2> accum = Matrix<Element, 3, 2>()
+  ) const {
+    
+    // k=0
+    accum.data[0] += data[0] * rhs.data[0];
+    accum.data[1] += data[0] * rhs.data[1];
+    accum.data[2] += data[2] * rhs.data[0];
+    accum.data[3] += data[2] * rhs.data[1];
+    accum.data[4] += data[4] * rhs.data[0];
+    accum.data[5] += data[4] * rhs.data[1];
+
+    // k=1
+    accum.data[0] += data[1] * rhs.data[2];
+    accum.data[1] += data[1] * rhs.data[3];
+    accum.data[2] += data[3] * rhs.data[2];
+    accum.data[3] += data[3] * rhs.data[3];
+    accum.data[4] += data[5] * rhs.data[2];
+    accum.data[5] += data[5] * rhs.data[3];
+
+    return accum;
+  }
+
+  /// Matrix product of size 3-by-2-by-2
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 2> operator*(Matrix<Element, 2, 2> const &rhs) const {
+    return product(rhs);
+  }
+  
+  /// Matrix product of size 3-by-2-by-2
+  CUTLASS_HOST_DEVICE
+  Matrix & operator*=(Matrix<Element, 2, 2> const &rhs) {
+    *this = product(rhs);
+    return *this;
+  }
+    
+  /// Matrix product of size 3-by-3-by-2
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 3> product(
+    Matrix<Element, 2, 3> const &rhs,
+    Matrix<Element, 3, 3> accum = Matrix<Element, 3, 3>()
+  ) const {
+    
+    // k=0
+    accum.data[0] += data[0] * rhs.data[0];
+    accum.data[1] += data[0] * rhs.data[1];
+    accum.data[2] += data[0] * rhs.data[2];
+    accum.data[3] += data[2] * rhs.data[0];
+    accum.data[4] += data[2] * rhs.data[1];
+    accum.data[5] += data[2] * rhs.data[2];
+    accum.data[6] += data[4] * rhs.data[0];
+    accum.data[7] += data[4] * rhs.data[1];
+    accum.data[8] += data[4] * rhs.data[2];
+
+    // k=1
+    accum.data[0] += data[1] * rhs.data[3];
+    accum.data[1] += data[1] * rhs.data[4];
+    accum.data[2] += data[1] * rhs.data[5];
+    accum.data[3] += data[3] * rhs.data[3];
+    accum.data[4] += data[3] * rhs.data[4];
+    accum.data[5] += data[3] * rhs.data[5];
+    accum.data[6] += data[5] * rhs.data[3];
+    accum.data[7] += data[5] * rhs.data[4];
+    accum.data[8] += data[5] * rhs.data[5];
+
+    return accum;
+  }
+
+  /// Matrix product of size 3-by-3-by-2
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 3> operator*(Matrix<Element, 2, 3> const &rhs) const {
+    return product(rhs);
+  }
+  
+  /// Matrix product of size 3-by-4-by-2
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 4> product(
+    Matrix<Element, 2, 4> const &rhs,
+    Matrix<Element, 3, 4> accum = Matrix<Element, 3, 4>()
+  ) const {
+    
+    // k=0
+    accum.data[0] += data[0] * rhs.data[0];
+    accum.data[1] += data[0] * rhs.data[1];
+    accum.data[2] += data[0] * rhs.data[2];
+    accum.data[3] += data[0] * rhs.data[3];
+    accum.data[4] += data[2] * rhs.data[0];
+    accum.data[5] += data[2] * rhs.data[1];
+    accum.data[6] += data[2] * rhs.data[2];
+    accum.data[7] += data[2] * rhs.data[3];
+    accum.data[8] += data[4] * rhs.data[0];
+    accum.data[9] += data[4] * rhs.data[1];
+    accum.data[10] += data[4] * rhs.data[2];
+    accum.data[11] += data[4] * rhs.data[3];
+
+    // k=1
+    accum.data[0] += data[1] * rhs.data[4];
+    accum.data[1] += data[1] * rhs.data[5];
+    accum.data[2] += data[1] * rhs.data[6];
+    accum.data[3] += data[1] * rhs.data[7];
+    accum.data[4] += data[3] * rhs.data[4];
+    accum.data[5] += data[3] * rhs.data[5];
+    accum.data[6] += data[3] * rhs.data[6];
+    accum.data[7] += data[3] * rhs.data[7];
+    accum.data[8] += data[5] * rhs.data[4];
+    accum.data[9] += data[5] * rhs.data[5];
+    accum.data[10] += data[5] * rhs.data[6];
+    accum.data[11] += data[5] * rhs.data[7];
+
+    return accum;
+  }
+
+  /// Matrix product of size 3-by-4-by-2
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 4> operator*(Matrix<Element, 2, 4> const &rhs) const {
+    return product(rhs);
+  }
+  
+  /// Returns the sum of all six elements, added to the starting value `accum`
+  CUTLASS_HOST_DEVICE
+  Element sum(Element accum = Element()) const {
+    
+    accum += data[0];
+    accum += data[1];
+    accum += data[2];
+    accum += data[3];
+    accum += data[4];
+    accum += data[5];
+
+    return accum;
+  }  
+
+  /// Returns `accum` plus the sum of squared elements (no square root; magnitude() applies it)
+  CUTLASS_HOST_DEVICE
+  Element norm(Element accum = Element()) const {
+    
+    accum += data[0] * data[0];
+    accum += data[1] * data[1];
+    accum += data[2] * data[2];
+    accum += data[3] * data[3];
+    accum += data[4] * data[4];
+    accum += data[5] * data[5];
+
+    return accum;
+  }
+
+  /// Returns square root of the norm
+  CUTLASS_HOST_DEVICE
+  Element magnitude() const {
+    return fast_sqrt(norm());
+  }
+
+  /// Returns `accum` plus the sum of the diagonal elements (0,0) and (1,1)
+  CUTLASS_HOST_DEVICE
+  Element trace(Element accum = Element()) const {
+    // With two elements per row, (0,0) is offset 0 and (1,1) is offset 3.
+    accum += data[0];
+    accum += data[3];
+
+    return accum;
+  }
+    
+};
+
+/// Template alias for 3-by-2 matrix
+template <typename Element>
+using Matrix3x2 = Matrix<Element, 3, 2>;
+
+
+/// Free function to infer element type from template arguments
+template <typename Element>
+CUTLASS_HOST_DEVICE Matrix3x2<Element> make_Matrix3x2(
+    Element _0_0, Element _0_1, 
+    Element _1_0, Element _1_1, 
+    Element _2_0, Element _2_1
+) {
+  return Matrix3x2<Element>(
+  _0_0, _0_1, 
+  _1_0, _1_1, 
+  _2_0, _2_1 
+  );
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// 3-by-3 matrix template class definition
+template <typename Element_>
+struct Matrix<Element_, 3, 3> {
+
+  //
+  // Type definitions
+  //
+
+  /// Element data type
+  using Element = Element_;
+
+  /// Number of rows in matrix
+  static int const kRows = 3;
+
+  /// Number of columns in matrix
+  static int const kColumns = 3;
+
+  /// Layout of matrix in underlying array
+  using Layout = layout::RowMajor;
+
+  /// Number of elements in matrix
+  static int const kCount = 9;
+
+  //
+  // Data members
+  //
+
+  /// Elements of the matrix in row-major layout
+  Array<Element, kCount> data;
+
+  //
+  // Methods
+  //
+
+  /// Constructs a zero matrix
+  CUTLASS_HOST_DEVICE
+  Matrix() {
+    data.clear();
+  }
+  
+  /// Copy constructor for a 3-by-3 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix(Matrix const &rhs) {
+    data = rhs.data;
+  }
+    
+  /// Constructs a 3-by-3 matrix from scalar elements
+  CUTLASS_HOST_DEVICE
+  Matrix(
+    Element _0_0, Element _0_1, Element _0_2, 
+    Element _1_0, Element _1_1, Element _1_2, 
+    Element _2_0, Element _2_1, Element _2_2
+  ) {
+
+    data[0] = _0_0;  data[1] = _0_1;  data[2] = _0_2;
+    data[3] = _1_0;  data[4] = _1_1;  data[5] = _1_2;
+    data[6] = _2_0;  data[7] = _2_1;  data[8] = _2_2;
+  }
+    
+  /// Constructs a 3-by-3 matrix from row vectors
+  CUTLASS_HOST_DEVICE
+  Matrix(
+    Matrix<Element, 1, 3> const &row_0,
+    Matrix<Element, 1, 3> const &row_1,
+    Matrix<Element, 1, 3> const &row_2
+  ) { 
+    data[0] = row_0.data[0];
+    data[1] = row_0.data[1];
+    data[2] = row_0.data[2];
+    data[3] = row_1.data[0];
+    data[4] = row_1.data[1];
+    data[5] = row_1.data[2];
+    data[6] = row_2.data[0];
+    data[7] = row_2.data[1];
+    data[8] = row_2.data[2];
+  }
+    
+  /// Static method to construct a 3-by-3 matrix from column vectors
+  CUTLASS_HOST_DEVICE
+  static Matrix from_columns(
+    Matrix<Element, 3, 1> const &column_0,
+    Matrix<Element, 3, 1> const &column_1,
+    Matrix<Element, 3, 1> const &column_2
+  ) { 
+    Matrix result;
+    
+    result.data[0] = column_0.data[0];
+    result.data[1] = column_1.data[0];
+    result.data[2] = column_2.data[0];
+    result.data[3] = column_0.data[1];
+    result.data[4] = column_1.data[1];
+    result.data[5] = column_2.data[1];
+    result.data[6] = column_0.data[2];
+    result.data[7] = column_1.data[2];
+    result.data[8] = column_2.data[2];
+    return result;
+  }
+    
+  /// Constructs an identity matrix
+  CUTLASS_HOST_DEVICE
+  static Matrix identity() {
+    Matrix m;
+    
+    m.data[0] = Element(1);
+    m.data[4] = Element(1);
+    m.data[8] = Element(1);
+
+    return m;
+  }
+    
+  /// Constructs a matrix from a uniform element
+  CUTLASS_HOST_DEVICE
+  static Matrix uniform(Element s) {
+    Matrix m;
+    
+    m.data[0] = s;
+    m.data[1] = s;
+    m.data[2] = s;
+    m.data[3] = s;
+    m.data[4] = s;
+    m.data[5] = s;
+    m.data[6] = s;
+    m.data[7] = s;
+    m.data[8] = s;
+
+    return m;
+  }
+
+  /// Constructs a matrix from a uniform element 1
+  CUTLASS_HOST_DEVICE
+  static Matrix ones() {
+    return uniform(Element(1));
+  }
+
+  /// Constructs a matrix from a uniform element 0
+  CUTLASS_HOST_DEVICE
+  static Matrix zero() {
+    return Matrix();
+  }
+  
+  /// Constructs a matrix from elements along its diagonal
+  CUTLASS_HOST_DEVICE
+  static Matrix from_diagonal(Matrix<Element, 3, 1> const &diag) {
+    Matrix m;
+    
+    m.data[0] = diag.data[0];
+    m.data[4] = diag.data[1];
+    m.data[8] = diag.data[2];
+
+    return m;
+  }
+
+  /// Constructs a matrix from elements along its diagonal
+  CUTLASS_HOST_DEVICE
+  static Matrix from_diagonal(Matrix<Element, 1, 3> const &diag) {
+    Matrix m;
+    
+    m.data[0] = diag.data[0];
+    m.data[4] = diag.data[1];
+    m.data[8] = diag.data[2];
+
+    return m;
+  }
+
+  /// Gets an array of diagonal elements
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 1> diagonal() const {
+    Matrix<Element, 3, 1> diag;
+    
+    diag.data[0] = data[0];
+    diag.data[1] = data[4];
+    diag.data[2] = data[8];
+
+    return diag;
+  }
+    
+  /// Returns a transposed matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 3> transpose() const {
+    Matrix<Element, 3, 3> mt;
+    
+    mt.data[0] = data[0];
+    mt.data[3] = data[1];
+    mt.data[6] = data[2];
+    mt.data[1] = data[3];
+    mt.data[4] = data[4];
+    mt.data[7] = data[5];
+    mt.data[2] = data[6];
+    mt.data[5] = data[7];
+    mt.data[8] = data[8];
+
+    return mt;
+  }
+    
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element at(int i, int j) const {
+    return data[i * 3 + j];
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element & at(int i, int j) {
+    return data[i * 3 + j];
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element at(Coord<2> const &coord) const {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element & at(Coord<2> const &coord) {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element &at(int offset) {
+    return data[offset];
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element at(int offset) const {
+    return data[offset];
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element operator[](Coord<2> const &coord) const {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element & operator[](Coord<2> const &coord) {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element & operator[](int offset) {
+    return data[offset];
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element operator[](int offset) const {
+    return data[offset];
+  }
+  
+  /// Gets a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 2> slice_1x2(int i = 0, int j = 0) const {
+    Matrix<Element, 1, 2> m;
+    
+    m.data[0] = data[i * 3 + j + 0];
+    m.data[1] = data[i * 3 + j + 1];
+
+    return m;
+  }
+
+  /// Overwrites a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_1x2(Matrix<Element, 1, 2> const &m, int i = 0, int j = 0) {
+    
+    data[i * 3 + j + 0] = m.data[0];
+    data[i * 3 + j + 1] = m.data[1];
+
+    return *this;
+  }
+    
+  /// Gets a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 3> slice_1x3(int i = 0, int j = 0) const {
+    Matrix<Element, 1, 3> m;
+    
+    m.data[0] = data[i * 3 + j + 0];
+    m.data[1] = data[i * 3 + j + 1];
+    m.data[2] = data[i * 3 + j + 2];
+
+    return m;
+  }
+
+  /// Overwrites a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_1x3(Matrix<Element, 1, 3> const &m, int i = 0, int j = 0) {
+    
+    data[i * 3 + j + 0] = m.data[0];
+    data[i * 3 + j + 1] = m.data[1];
+    data[i * 3 + j + 2] = m.data[2];
+
+    return *this;
+  }
+    
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 3> row(int i) const {
+    return slice_1x3(i, 0);
+  }
+
+  CUTLASS_HOST_DEVICE
+  Matrix &set_row(Matrix<Element, 1, 3> const &v, int i = 0) {
+    return set_slice_1x3(v, i, 0);
+  }
+    
+  /// Gets a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 1> slice_2x1(int i = 0, int j = 0) const {
+    Matrix<Element, 2, 1> m;
+    
+    m.data[0] = data[i * 3 + j + 0];
+    m.data[1] = data[i * 3 + j + 3];
+
+    return m;
+  }
+
+  /// Overwrites a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_2x1(Matrix<Element, 2, 1> const &m, int i = 0, int j = 0) {
+    
+    data[i * 3 + j + 0] = m.data[0];
+    data[i * 3 + j + 3] = m.data[1];
+
+    return *this;
+  }
+    
+  /// Gets a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 2> slice_2x2(int i = 0, int j = 0) const {
+    Matrix<Element, 2, 2> m;
+    
+    m.data[0] = data[i * 3 + j + 0];
+    m.data[1] = data[i * 3 + j + 1];
+    m.data[2] = data[i * 3 + j + 3];
+    m.data[3] = data[i * 3 + j + 4];
+
+    return m;
+  }
+
+  /// Overwrites a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_2x2(Matrix<Element, 2, 2> const &m, int i = 0, int j = 0) {
+    
+    data[i * 3 + j + 0] = m.data[0];
+    data[i * 3 + j + 1] = m.data[1];
+    data[i * 3 + j + 3] = m.data[2];
+    data[i * 3 + j + 4] = m.data[3];
+
+    return *this;
+  }
+    
+  /// Gets a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 3> slice_2x3(int i = 0, int j = 0) const {
+    Matrix<Element, 2, 3> m;
+    
+    m.data[0] = data[i * 3 + j + 0];
+    m.data[1] = data[i * 3 + j + 1];
+    m.data[2] = data[i * 3 + j + 2];
+    m.data[3] = data[i * 3 + j + 3];
+    m.data[4] = data[i * 3 + j + 4];
+    m.data[5] = data[i * 3 + j + 5];
+
+    return m;
+  }
+
+  /// Overwrites a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_2x3(Matrix<Element, 2, 3> const &m, int i = 0, int j = 0) {
+    
+    data[i * 3 + j + 0] = m.data[0];
+    data[i * 3 + j + 1] = m.data[1];
+    data[i * 3 + j + 2] = m.data[2];
+    data[i * 3 + j + 3] = m.data[3];
+    data[i * 3 + j + 4] = m.data[4];
+    data[i * 3 + j + 5] = m.data[5];
+
+    return *this;
+  }
+    
+  /// Gets a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 1> slice_3x1(int i = 0, int j = 0) const {
+    Matrix<Element, 3, 1> m;
+    
+    m.data[0] = data[i * 3 + j + 0];
+    m.data[1] = data[i * 3 + j + 3];
+    m.data[2] = data[i * 3 + j + 6];
+
+    return m;
+  }
+
+  /// Overwrites a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_3x1(Matrix<Element, 3, 1> const &m, int i = 0, int j = 0) {
+    
+    data[i * 3 + j + 0] = m.data[0];
+    data[i * 3 + j + 3] = m.data[1];
+    data[i * 3 + j + 6] = m.data[2];
+
+    return *this;
+  }
+    
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 1> column(int j) const {
+    return slice_3x1(0, j);
+  }
+
+  CUTLASS_HOST_DEVICE
+  Matrix &set_column(Matrix<Element, 3, 1> const &v, int j =0) {
+    return set_slice_3x1(v, 0, j);
+  }
+    
+  /// Gets a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 2> slice_3x2(int i = 0, int j = 0) const {
+    Matrix<Element, 3, 2> m;
+    
+    m.data[0] = data[i * 3 + j + 0];
+    m.data[1] = data[i * 3 + j + 1];
+    m.data[2] = data[i * 3 + j + 3];
+    m.data[3] = data[i * 3 + j + 4];
+    m.data[4] = data[i * 3 + j + 6];
+    m.data[5] = data[i * 3 + j + 7];
+
+    return m;
+  }
+
+  /// Overwrites a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_3x2(Matrix<Element, 3, 2> const &m, int i = 0, int j = 0) {
+    
+    data[i * 3 + j + 0] = m.data[0];
+    data[i * 3 + j + 1] = m.data[1];
+    data[i * 3 + j + 3] = m.data[2];
+    data[i * 3 + j + 4] = m.data[3];
+    data[i * 3 + j + 6] = m.data[4];
+    data[i * 3 + j + 7] = m.data[5];
+
+    return *this;
+  }
+    
+  /// Gets a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 3> slice_3x3(int i = 0, int j = 0) const {
+    Matrix<Element, 3, 3> m;
+    
+    m.data[0] = data[i * 3 + j + 0];
+    m.data[1] = data[i * 3 + j + 1];
+    m.data[2] = data[i * 3 + j + 2];
+    m.data[3] = data[i * 3 + j + 3];
+    m.data[4] = data[i * 3 + j + 4];
+    m.data[5] = data[i * 3 + j + 5];
+    m.data[6] = data[i * 3 + j + 6];
+    m.data[7] = data[i * 3 + j + 7];
+    m.data[8] = data[i * 3 + j + 8];
+
+    return m;
+  }
+
+  /// Overwrites a submatrix with optional offset
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_3x3(Matrix<Element, 3, 3> const &m, int i = 0, int j = 0) {
+    
+    data[i * 3 + j + 0] = m.data[0];
+    data[i * 3 + j + 1] = m.data[1];
+    data[i * 3 + j + 2] = m.data[2];
+    data[i * 3 + j + 3] = m.data[3];
+    data[i * 3 + j + 4] = m.data[4];
+    data[i * 3 + j + 5] = m.data[5];
+    data[i * 3 + j + 6] = m.data[6];
+    data[i * 3 + j + 7] = m.data[7];
+    data[i * 3 + j + 8] = m.data[8];
+
+    return *this;
+  }
+    
+  /// Forms a 3-by-3 matrix by horizontally concatenating a 3-by-1 matrix with a 3-by-2 matrix
+  CUTLASS_HOST_DEVICE
+  static Matrix hcat(Matrix<Element, 3, 1> const & lhs, Matrix<Element, 3, 2> const & rhs) {
+    return Matrix(
+      lhs.at(0, 0), rhs.at(0, 0), rhs.at(0, 1)
+      , lhs.at(1, 0), rhs.at(1, 0), rhs.at(1, 1)
+      , lhs.at(2, 0), rhs.at(2, 0), rhs.at(2, 1));
+  }
+  
+  /// Forms a 3-by-3 matrix by horizontally concatenating a 3-by-2 matrix with a 3-by-1 matrix
+  CUTLASS_HOST_DEVICE
+  static Matrix hcat(Matrix<Element, 3, 2> const & lhs, Matrix<Element, 3, 1> const & rhs) {
+    return Matrix(
+      lhs.at(0, 0), lhs.at(0, 1), rhs.at(0, 0)
+      , lhs.at(1, 0), lhs.at(1, 1), rhs.at(1, 0)
+      , lhs.at(2, 0), lhs.at(2, 1), rhs.at(2, 0));
+  }
+  
+  /// Concatenates this matrix with a 3-by-1 matrix to form a 3-by-4 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 4> hcat(Matrix<Element, 3, 1> const & rhs) const {
+    return Matrix<Element, 3, 4>::hcat(*this, rhs);
+  }
+    
+  /// Forms a 3-by-3 matrix by vertically concatenating a 1-by-3 matrix with a 2-by-3 matrix
+  CUTLASS_HOST_DEVICE
+  static Matrix vcat(Matrix<Element, 1, 3> const & upper, Matrix<Element, 2, 3> const & lower) {
+    return Matrix(
+      upper.at(0, 0), upper.at(0, 1), upper.at(0, 2)
+      , lower.at(0, 0), lower.at(0, 1), lower.at(0, 2)
+      , lower.at(1, 0), lower.at(1, 1), lower.at(1, 2));
+  }
+  
+  /// Forms a 3-by-3 matrix by vertically concatenating a 2-by-3 matrix with a 1-by-3 matrix
+  CUTLASS_HOST_DEVICE
+  static Matrix vcat(Matrix<Element, 2, 3> const & upper, Matrix<Element, 1, 3> const & lower) {
+    return Matrix(
+      upper.at(0, 0), upper.at(0, 1), upper.at(0, 2)
+      , upper.at(1, 0), upper.at(1, 1), upper.at(1, 2)
+      , lower.at(0, 0), lower.at(0, 1), lower.at(0, 2));
+  }
+  
+  /// Concatenates this matrix with a 1-by-3 matrix to form a 4-by-3 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 3> vcat(Matrix<Element, 1, 3> const & rhs) const {
+    return Matrix<Element, 4, 3>::vcat(*this, rhs);
+  }
+    
+  /// Forms a 3-by-3 matrix by concatenating four components
+  CUTLASS_HOST_DEVICE
+  static Matrix block(
+    Element                         A, Matrix<Element, 1, 2> const & B,
+    Matrix<Element, 2, 1> const & C, Matrix<Element, 2, 2> const & D) {
+    return Matrix(
+      A, B.at(0, 0), B.at(0, 1)
+      , C.at(0, 0), D.at(0, 0), D.at(0, 1)
+      , C.at(1, 0), D.at(1, 0), D.at(1, 1)
+    );
+  }
+  
+  /// Forms a 3-by-3 matrix by concatenating four components
+  CUTLASS_HOST_DEVICE
+  static Matrix block(
+    Matrix<Element, 1, 2> const & A, Element                         B,
+    Matrix<Element, 2, 2> const & C, Matrix<Element, 2, 1> const & D) {
+    return Matrix(
+      A.at(0, 0), A.at(0, 1), B
+      , C.at(0, 0), C.at(0, 1), D.at(0, 0)
+      , C.at(1, 0), C.at(1, 1), D.at(1, 0)
+    );
+  }
+  
+  /// Forms a 3-by-3 matrix by concatenating four components
+  CUTLASS_HOST_DEVICE
+  static Matrix block(
+    Matrix<Element, 2, 1> const & A, Matrix<Element, 2, 2> const & B,
+    Element                         C, Matrix<Element, 1, 2> const & D) {
+    return Matrix(
+      A.at(0, 0), B.at(0, 0), B.at(0, 1)
+      , A.at(1, 0), B.at(1, 0), B.at(1, 1)
+      , C, D.at(0, 0), D.at(0, 1)
+    );
+  }
+  
+  /// Forms a 3-by-3 matrix by concatenating four components
+  CUTLASS_HOST_DEVICE
+  static Matrix block(
+    Matrix<Element, 2, 2> const & A, Matrix<Element, 2, 1> const & B,
+    Matrix<Element, 1, 2> const & C, Element                         D) {
+    return Matrix(
+      A.at(0, 0), A.at(0, 1), B.at(0, 0)
+      , A.at(1, 0), A.at(1, 1), B.at(1, 0)
+      , C.at(0, 0), C.at(0, 1), D
+    );
+  }
+  
+  /// Elementwise add operator (3-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix add(Matrix const &rhs) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] + rhs.data[0];
+    result.data[1] = data[1] + rhs.data[1];
+    result.data[2] = data[2] + rhs.data[2];
+
+    result.data[3] = data[3] + rhs.data[3];
+    result.data[4] = data[4] + rhs.data[4];
+    result.data[5] = data[5] + rhs.data[5];
+
+    result.data[6] = data[6] + rhs.data[6];
+    result.data[7] = data[7] + rhs.data[7];
+    result.data[8] = data[8] + rhs.data[8];
+
+    return result;
+  }
+      
+  /// Elementwise add operator (3-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix operator +(Matrix const &rhs) const {
+    return add(rhs);
+  }
+
+  /// Elementwise add operator (3-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator +=(Matrix const &rhs) {
+    
+    data[0] += rhs.data[0];
+    data[1] += rhs.data[1];
+    data[2] += rhs.data[2];
+
+    data[3] += rhs.data[3];
+    data[4] += rhs.data[4];
+    data[5] += rhs.data[5];
+
+    data[6] += rhs.data[6];
+    data[7] += rhs.data[7];
+    data[8] += rhs.data[8];
+
+    return *this;
+  }
+        
+  /// Elementwise subtract operator (3-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix subtract(Matrix const &rhs) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] - rhs.data[0];
+    result.data[1] = data[1] - rhs.data[1];
+    result.data[2] = data[2] - rhs.data[2];
+
+    result.data[3] = data[3] - rhs.data[3];
+    result.data[4] = data[4] - rhs.data[4];
+    result.data[5] = data[5] - rhs.data[5];
+
+    result.data[6] = data[6] - rhs.data[6];
+    result.data[7] = data[7] - rhs.data[7];
+    result.data[8] = data[8] - rhs.data[8];
+
+    return result;
+  }
+      
+  /// Elementwise subtract operator (3-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix operator -(Matrix const &rhs) const {
+    return subtract(rhs);
+  }
+
+  /// Elementwise subtract operator (3-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator -=(Matrix const &rhs) {
+    
+    data[0] -= rhs.data[0];
+    data[1] -= rhs.data[1];
+    data[2] -= rhs.data[2];
+
+    data[3] -= rhs.data[3];
+    data[4] -= rhs.data[4];
+    data[5] -= rhs.data[5];
+
+    data[6] -= rhs.data[6];
+    data[7] -= rhs.data[7];
+    data[8] -= rhs.data[8];
+
+    return *this;
+  }
+        
+  /// Elementwise multiply operator (3-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix multiply(Matrix const &rhs) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] * rhs.data[0];
+    result.data[1] = data[1] * rhs.data[1];
+    result.data[2] = data[2] * rhs.data[2];
+
+    result.data[3] = data[3] * rhs.data[3];
+    result.data[4] = data[4] * rhs.data[4];
+    result.data[5] = data[5] * rhs.data[5];
+
+    result.data[6] = data[6] * rhs.data[6];
+    result.data[7] = data[7] * rhs.data[7];
+    result.data[8] = data[8] * rhs.data[8];
+
+    return result;
+  }
+      
+  /// Scalar multiply operator (3-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix multiply(Element const &s) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] * s;
+    result.data[1] = data[1] * s;
+    result.data[2] = data[2] * s;
+
+    result.data[3] = data[3] * s;
+    result.data[4] = data[4] * s;
+    result.data[5] = data[5] * s;
+
+    result.data[6] = data[6] * s;
+    result.data[7] = data[7] * s;
+    result.data[8] = data[8] * s;
+
+    return result;
+  }
+
+  /// Scalar multiply operator (3-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix operator *(Element const &s) const {
+    return multiply(s);
+  }
+
+  /// Scalar multiply operator (3-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator *=(Element const &s) {
+    
+    data[0] *= s;
+    data[1] *= s;
+    data[2] *= s;
+
+    data[3] *= s;
+    data[4] *= s;
+    data[5] *= s;
+
+    data[6] *= s;
+    data[7] *= s;
+    data[8] *= s;
+
+    return *this;
+  }
+        
+  /// Elementwise divide operator (3-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix divide(Matrix const &rhs) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] / rhs.data[0];
+    result.data[1] = data[1] / rhs.data[1];
+    result.data[2] = data[2] / rhs.data[2];
+
+    result.data[3] = data[3] / rhs.data[3];
+    result.data[4] = data[4] / rhs.data[4];
+    result.data[5] = data[5] / rhs.data[5];
+
+    result.data[6] = data[6] / rhs.data[6];
+    result.data[7] = data[7] / rhs.data[7];
+    result.data[8] = data[8] / rhs.data[8];
+
+    return result;
+  }
+      
+  /// Scalar divide operator (3-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix divide(Element const &s) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] / s;
+    result.data[1] = data[1] / s;
+    result.data[2] = data[2] / s;
+
+    result.data[3] = data[3] / s;
+    result.data[4] = data[4] / s;
+    result.data[5] = data[5] / s;
+
+    result.data[6] = data[6] / s;
+    result.data[7] = data[7] / s;
+    result.data[8] = data[8] / s;
+
+    return result;
+  }
+
+  /// Scalar divide operator (3-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix operator /(Element const &s) const {
+    return divide(s);
+  }
+
+  /// Scalar divide operator (3-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator /=(Element const &s) {
+    
+    data[0] /= s;
+    data[1] /= s;
+    data[2] /= s;
+
+    data[3] /= s;
+    data[4] /= s;
+    data[5] /= s;
+
+    data[6] /= s;
+    data[7] /= s;
+    data[8] /= s;
+
+    return *this;
+  }
+        
+  /// Elementwise divide operator (3-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix operator /(Matrix const &rhs) const {
+    return divide(rhs);
+  }
+
+  /// Elementwise divide operator (3-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator /=(Matrix const &rhs) {
+    
+    data[0] /= rhs.data[0];
+    data[1] /= rhs.data[1];
+    data[2] /= rhs.data[2];
+
+    data[3] /= rhs.data[3];
+    data[4] /= rhs.data[4];
+    data[5] /= rhs.data[5];
+
+    data[6] /= rhs.data[6];
+    data[7] /= rhs.data[7];
+    data[8] /= rhs.data[8];
+
+    return *this;
+  }
+        
+  /// Returns a new matrix holding the negation of each element of this matrix
+  CUTLASS_HOST_DEVICE
+  Matrix operator-() const {
+    Matrix m;
+    // Negate this matrix's elements; the default constructor leaves m cleared.
+    m.data[0] = -data[0];
+    m.data[1] = -data[1];
+    m.data[2] = -data[2];
+    m.data[3] = -data[3];
+    m.data[4] = -data[4];
+    m.data[5] = -data[5];
+    m.data[6] = -data[6];
+    m.data[7] = -data[7];
+    m.data[8] = -data[8];
+
+    return m;
+  }
+  
+  /// Matrix product of size 3-by-1-by-3
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 1> product(
+    Matrix<Element, 3, 1> const &rhs,
+    Matrix<Element, 3, 1> accum = Matrix<Element, 3, 1>()
+  ) const {
+    
+    // k=0
+    accum.data[0] += data[0] * rhs.data[0];
+    accum.data[1] += data[3] * rhs.data[0];
+    accum.data[2] += data[6] * rhs.data[0];
+
+    // k=1
+    accum.data[0] += data[1] * rhs.data[1];
+    accum.data[1] += data[4] * rhs.data[1];
+    accum.data[2] += data[7] * rhs.data[1];
+
+    // k=2
+    accum.data[0] += data[2] * rhs.data[2];
+    accum.data[1] += data[5] * rhs.data[2];
+    accum.data[2] += data[8] * rhs.data[2];
+
+    return accum;
+  }
+
+  /// (3-by-3) * (3-by-1) product; forwards to product() with a zero accumulator.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 1> operator*(Matrix<Element, 3, 1> const &rhs) const {
+    return this->product(rhs);
+  }
+  
+  /// Matrix product: (3-by-3) * (3-by-2), added onto `accum`.
+  /// Contributions are accumulated in order of the inner index k = 0, 1, 2.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 2> product(
+    Matrix<Element, 3, 2> const &rhs,
+    Matrix<Element, 3, 2> accum = Matrix<Element, 3, 2>()
+  ) const {
+    for (int k = 0; k < 3; ++k) {
+      for (int row_idx = 0; row_idx < 3; ++row_idx) {
+        for (int col_idx = 0; col_idx < 2; ++col_idx) {
+          accum.data[row_idx * 2 + col_idx] += data[row_idx * 3 + k] * rhs.data[k * 2 + col_idx];
+        }
+      }
+    }
+    return accum;
+  }
+
+  /// (3-by-3) * (3-by-2) product; forwards to product() with a zero accumulator.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 2> operator*(Matrix<Element, 3, 2> const &rhs) const {
+    return this->product(rhs);
+  }
+  
+  /// Matrix product: (3-by-3) * (3-by-3), added onto `accum`.
+  /// Contributions are accumulated in order of the inner index k = 0, 1, 2.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 3> product(
+    Matrix<Element, 3, 3> const &rhs,
+    Matrix<Element, 3, 3> accum = Matrix<Element, 3, 3>()
+  ) const {
+    for (int k = 0; k < 3; ++k) {
+      for (int row_idx = 0; row_idx < 3; ++row_idx) {
+        for (int col_idx = 0; col_idx < 3; ++col_idx) {
+          accum.data[row_idx * 3 + col_idx] += data[row_idx * 3 + k] * rhs.data[k * 3 + col_idx];
+        }
+      }
+    }
+    return accum;
+  }
+
+  /// (3-by-3) * (3-by-3) product; forwards to product() with a zero accumulator.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 3> operator*(Matrix<Element, 3, 3> const &rhs) const {
+    return this->product(rhs);
+  }
+  
+  /// Replaces this matrix with the product (*this) * rhs and returns *this.
+  CUTLASS_HOST_DEVICE
+  Matrix & operator*=(Matrix<Element, 3, 3> const &rhs) {
+    return (*this = product(rhs));
+  }
+    
+  /// Matrix product: (3-by-3) * (3-by-4), added onto `accum`.
+  /// Contributions are accumulated in order of the inner index k = 0, 1, 2.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 4> product(
+    Matrix<Element, 3, 4> const &rhs,
+    Matrix<Element, 3, 4> accum = Matrix<Element, 3, 4>()
+  ) const {
+    for (int k = 0; k < 3; ++k) {
+      for (int row_idx = 0; row_idx < 3; ++row_idx) {
+        for (int col_idx = 0; col_idx < 4; ++col_idx) {
+          accum.data[row_idx * 4 + col_idx] += data[row_idx * 3 + k] * rhs.data[k * 4 + col_idx];
+        }
+      }
+    }
+    return accum;
+  }
+
+  /// (3-by-3) * (3-by-4) product; forwards to product() with a zero accumulator.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 4> operator*(Matrix<Element, 3, 4> const &rhs) const {
+    return this->product(rhs);
+  }
+  
+  /// Adds all nine elements, in storage order, onto `accum` and returns it.
+  CUTLASS_HOST_DEVICE
+  Element sum(Element accum = Element()) const {
+    for (int idx = 0; idx < 9; ++idx) {
+      accum += data[idx];
+    }
+    return accum;
+  }
+
+  /// Adds the square of each of the nine elements, in storage order, onto
+  /// `accum` and returns it (i.e. the sum of squared elements).
+  CUTLASS_HOST_DEVICE
+  Element norm(Element accum = Element()) const {
+    for (int idx = 0; idx < 9; ++idx) {
+      accum += data[idx] * data[idx];
+    }
+    return accum;
+  }
+
+  /// Returns fast_sqrt() of norm(), i.e. the square root of the sum of squares.
+  CUTLASS_HOST_DEVICE
+  Element magnitude() const {
+    Element const sum_of_squares = norm();
+    return fast_sqrt(sum_of_squares);
+  }
+
+  /// Adds the three diagonal elements (offsets 0, 4, 8) onto `accum`.
+  CUTLASS_HOST_DEVICE
+  Element trace(Element accum = Element()) const {
+    for (int d = 0; d < 3; ++d) {
+      accum += data[d * 4];
+    }
+    return accum;
+  }
+    
+  /// Builds the 3-by-3 rotation about the X axis by angle theta:
+  /// identity with the lower-right 2x2 block set to [c, -s; s, c].
+  CUTLASS_HOST_DEVICE
+  static Matrix rotation_X(Element theta) {
+    Element const cos_theta = fast_cos(theta);
+    Element const sin_theta = fast_sin(theta);
+
+    Matrix rot = Matrix::identity();
+    rot.at(1, 1) = cos_theta;
+    rot.at(1, 2) = -sin_theta;
+    rot.at(2, 1) = sin_theta;
+    rot.at(2, 2) = cos_theta;
+    return rot;
+  }
+
+  /// Builds the 3-by-3 rotation about the Y axis by angle theta:
+  /// identity with (0,0) = c, (0,2) = s, (2,0) = -s, (2,2) = c.
+  CUTLASS_HOST_DEVICE
+  static Matrix rotation_Y(Element theta) {
+    Element const cos_theta = fast_cos(theta);
+    Element const sin_theta = fast_sin(theta);
+
+    Matrix rot = Matrix::identity();
+    rot.at(0, 0) = cos_theta;
+    rot.at(2, 0) = -sin_theta;
+    rot.at(0, 2) = sin_theta;
+    rot.at(2, 2) = cos_theta;
+    return rot;
+  }
+
+  /// Builds the 3-by-3 rotation about the Z axis by angle theta:
+  /// identity with the upper-left 2x2 block set to [c, -s; s, c].
+  CUTLASS_HOST_DEVICE
+  static Matrix rotation_Z(Element theta) {
+    Element const cos_theta = fast_cos(theta);
+    Element const sin_theta = fast_sin(theta);
+
+    Matrix rot = Matrix::identity();
+    rot.at(0, 0) = cos_theta;
+    rot.at(0, 1) = -sin_theta;
+    rot.at(1, 0) = sin_theta;
+    rot.at(1, 1) = cos_theta;
+    return rot;
+  }
+
+  /// Returns a 3-by-3 rotation matrix by angle `theta` around the axis `u`.
+  ///
+  /// Uses the axis-angle (Rodrigues) form
+  ///   R = c * I + s * [u]_x + (1 - c) * u * u^T
+  /// with c = cos(theta), s = sin(theta). `u` is expected to be unit length;
+  /// it is not normalized here.
+  CUTLASS_HOST_DEVICE
+  static Matrix rotation(Element theta, Matrix<Element, 3, 1> const &u) {
+    Element x = u.data[0];
+    Element y = u.data[1];
+    Element z = u.data[2];
+
+    Element c = fast_cos(theta);
+    Element s = fast_sin(theta);
+
+    Element one_minus_cos = Element(1) - c;
+
+    Matrix m;
+
+    // Element (1, 0) is y*x*(1-c) + z*s: the symmetric u*u^T term plus the
+    // skew-symmetric [u]_x term (mirror of element (0, 1) with the sign flipped).
+    m.set_slice_3x3({
+      c + x * x * one_minus_cos, x * y * one_minus_cos - z * s, x * z * one_minus_cos + y * s,
+      y * x * one_minus_cos + z * s, c + y * y * one_minus_cos, y * z * one_minus_cos - x * s,
+      z * x * one_minus_cos - y * s, z * y * one_minus_cos + x * s, c + z * z * one_minus_cos
+    });
+
+    return m;
+  }
+
+  /// Builds the 3-by-3 reflection I - 2 * n * n^T about the plane whose
+  /// normal is n_unit (expected to be unit length; not normalized here).
+  CUTLASS_HOST_DEVICE
+  static Matrix reflection(Matrix<Element, 3, 1> const &n_unit) {
+
+    Element const nx = n_unit.data[0];
+    Element const ny = n_unit.data[1];
+    Element const nz = n_unit.data[2];
+
+    // Off-diagonal entries; the matrix is symmetric so each is used twice.
+    Element const xy = Element(-2) * nx * ny;
+    Element const xz = Element(-2) * nx * nz;
+    Element const yz = Element(-2) * ny * nz;
+
+    Matrix refl = Matrix::identity();
+
+    refl.set_slice_3x3({
+      Element(1) - Element(2) * nx * nx, xy, xz,
+      xy, Element(1) - Element(2) * ny * ny, yz,
+      xz, yz, Element(1) - Element(2) * nz * nz
+    });
+
+    return refl;
+  }
+
+  /// Determinant of the 3-by-3 matrix by cofactor expansion along row 0;
+  /// the result is added onto `accum`.
+  CUTLASS_HOST_DEVICE
+  Element determinant(Element accum = Element()) const {
+
+    // 2-by-2 minors obtained by deleting row 0 and column 0, 1, 2 respectively.
+    Element const minor_0 = Matrix<Element, 2, 2>({ at(1, 1), at(1, 2), at(2, 1), at(2, 2) }).determinant();
+    Element const minor_1 = Matrix<Element, 2, 2>({ at(1, 0), at(1, 2), at(2, 0), at(2, 2) }).determinant();
+    Element const minor_2 = Matrix<Element, 2, 2>({ at(1, 0), at(1, 1), at(2, 0), at(2, 1) }).determinant();
+
+    accum += at(0, 0) * minor_0;
+    accum -= at(0, 1) * minor_1;
+    accum += at(0, 2) * minor_2;
+
+    return accum;
+  }
+  
+  /// Inverse of the 3-by-3 matrix given its precomputed determinant `det`:
+  /// the adjugate (transposed cofactor matrix) scaled by 1 / det.
+  /// No check is made that `det` is non-zero.
+  CUTLASS_HOST_DEVICE
+  Matrix inverse(Element det) const {
+    Matrix const adjugate(
+      at(1, 1) * at(2, 2) - at(1, 2) * at(2, 1),
+      at(0, 2) * at(2, 1) - at(0, 1) * at(2, 2),
+      at(0, 1) * at(1, 2) - at(0, 2) * at(1, 1),
+
+      at(1, 2) * at(2, 0) - at(1, 0) * at(2, 2),
+      at(0, 0) * at(2, 2) - at(0, 2) * at(2, 0),
+      at(0, 2) * at(1, 0) - at(0, 0) * at(1, 2),
+
+      at(1, 0) * at(2, 1) - at(1, 1) * at(2, 0),
+      at(0, 1) * at(2, 0) - at(0, 0) * at(2, 1),
+      at(0, 0) * at(1, 1) - at(0, 1) * at(1, 0)
+    );
+
+    return adjugate * (Element(1) / det);
+  }
+  /// Inverse of the 3-by-3 matrix; computes the determinant itself and
+  /// forwards to inverse(det).
+  CUTLASS_HOST_DEVICE
+  Matrix inverse() const {
+    Element const det = determinant();
+    return inverse(det);
+  }
+    
+};
+
+/// Template alias for a 3-by-3 matrix: Matrix3x3<T> is shorthand for
+/// Matrix<T, 3, 3>.
+template <typename Element>
+using Matrix3x3 = Matrix<Element, 3, 3>;
+
+
+/// Free function that builds a Matrix3x3 from nine scalars given in row-major
+/// order, letting the element type be deduced from the arguments.
+template <typename Element>
+CUTLASS_HOST_DEVICE Matrix3x3<Element> make_Matrix3x3(
+    Element _0_0, Element _0_1, Element _0_2, 
+    Element _1_0, Element _1_1, Element _1_2, 
+    Element _2_0, Element _2_1, Element _2_2
+) {
+  Matrix3x3<Element> result(
+    _0_0, _0_1, _0_2,
+    _1_0, _1_1, _1_2,
+    _2_0, _2_1, _2_2);
+  return result;
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// 3-by-4 matrix template class definition
+template <typename Element_>
+struct Matrix<Element_, 3, 4> {
+
+  //
+  // Type definitions
+  //
+
+  /// Element data type (alias of the template parameter Element_)
+  using Element = Element_;
+
+  /// Number of rows in matrix
+  static int const kRows = 3;
+
+  /// Number of columns in matrix; also the row stride of the row-major storage
+  static int const kColumns = 4;
+
+  /// Layout of matrix in underlying array
+  using Layout = layout::RowMajor;
+
+  /// Number of elements in matrix (kRows * kColumns = 3 * 4)
+  static int const kCount = 12;
+
+  //
+  // Data members
+  //
+
+  /// Elements of the matrix in row-major layout: element (i, j) lives at
+  /// offset i * 4 + j
+  Array<Element, kCount> data;
+
+  //
+  // Methods
+  //
+
+  /// Default constructor: constructs a zero matrix by clearing the underlying
+  /// element array.
+  CUTLASS_HOST_DEVICE
+  Matrix() {
+    data.clear();
+  }
+  
+  /// Copy constructor for a 3-by-4 matrix: copies the whole element array
+  /// of rhs.
+  CUTLASS_HOST_DEVICE
+  Matrix(Matrix const &rhs) {
+    data = rhs.data;
+  }
+    
+  /// Constructs a 3-by-4 matrix from twelve scalars given in row-major order
+  /// (parameter _r_c is the element at row r, column c).
+  CUTLASS_HOST_DEVICE
+  Matrix(
+    Element _0_0, Element _0_1, Element _0_2, Element _0_3, 
+    Element _1_0, Element _1_1, Element _1_2, Element _1_3, 
+    Element _2_0, Element _2_1, Element _2_2, Element _2_3
+  ) {
+    // Row 0
+    data[0] = _0_0;
+    data[1] = _0_1;
+    data[2] = _0_2;
+    data[3] = _0_3;
+    // Row 1
+    data[4] = _1_0;
+    data[5] = _1_1;
+    data[6] = _1_2;
+    data[7] = _1_3;
+    // Row 2
+    data[8] = _2_0;
+    data[9] = _2_1;
+    data[10] = _2_2;
+    data[11] = _2_3;
+  }
+    
+  /// Constructs a 3-by-4 matrix from three 1-by-4 row vectors; row_k is
+  /// copied into storage offsets [4k, 4k + 3].
+  CUTLASS_HOST_DEVICE
+  Matrix(
+    Matrix<Element, 1, 4> const &row_0,
+    Matrix<Element, 1, 4> const &row_1,
+    Matrix<Element, 1, 4> const &row_2
+  ) { 
+    for (int col_idx = 0; col_idx < 4; ++col_idx) {
+      data[col_idx] = row_0.data[col_idx];
+    }
+    for (int col_idx = 0; col_idx < 4; ++col_idx) {
+      data[4 + col_idx] = row_1.data[col_idx];
+    }
+    for (int col_idx = 0; col_idx < 4; ++col_idx) {
+      data[8 + col_idx] = row_2.data[col_idx];
+    }
+  }
+    
+  /// Static method to construct a 3-by-4 matrix from column vectors.
+  ///
+  /// Column k supplies elements (0, k), (1, k), (2, k) of the result.
+  ///
+  /// NOTE(review): the parameters are declared as 4-by-1 vectors, yet only
+  /// their first three elements (data[0..2]) are read; a column of a 3-by-4
+  /// matrix has three elements (compare column()/set_column(), which use
+  /// Matrix<Element, 3, 1>). Confirm whether 3-by-1 was intended.
+  CUTLASS_HOST_DEVICE
+  static Matrix from_columns(
+    Matrix<Element, 4, 1> const &column_0,
+    Matrix<Element, 4, 1> const &column_1,
+    Matrix<Element, 4, 1> const &column_2,
+    Matrix<Element, 4, 1> const &column_3
+  ) { 
+    Matrix result;
+    
+    // Row 0
+    result.data[0] = column_0.data[0];
+    result.data[1] = column_1.data[0];
+    result.data[2] = column_2.data[0];
+    result.data[3] = column_3.data[0];
+    // Row 1
+    result.data[4] = column_0.data[1];
+    result.data[5] = column_1.data[1];
+    result.data[6] = column_2.data[1];
+    result.data[7] = column_3.data[1];
+    // Row 2
+    result.data[8] = column_0.data[2];
+    result.data[9] = column_1.data[2];
+    result.data[10] = column_2.data[2];
+    result.data[11] = column_3.data[2];
+    return result;
+  }
+    
+  /// Returns a 3-by-4 matrix with all twelve elements set to `s`.
+  CUTLASS_HOST_DEVICE
+  static Matrix uniform(Element s) {
+    Matrix filled;
+    for (int idx = 0; idx < 12; ++idx) {
+      filled.data[idx] = s;
+    }
+    return filled;
+  }
+
+  /// Returns a 3-by-4 matrix with every element equal to Element(1).
+  CUTLASS_HOST_DEVICE
+  static Matrix ones() {
+    Element const one = Element(1);
+    return uniform(one);
+  }
+
+  /// Returns a default-constructed (zero) 3-by-4 matrix.
+  CUTLASS_HOST_DEVICE
+  static Matrix zero() {
+    Matrix zeros;
+    return zeros;
+  }
+  
+  /// Constructs a 3-by-4 matrix that is zero everywhere except on its main
+  /// diagonal, whose elements (0,0), (1,1), (2,2) are taken from the 3-by-1
+  /// vector `diag`.
+  CUTLASS_HOST_DEVICE
+  static Matrix from_diagonal(Matrix<Element, 3, 1> const &diag) {
+    Matrix m;
+
+    // Row-major storage with a row stride of 4: element (i, i) is at
+    // offset i * 4 + i = 0, 5, 10 (not 0, 4, 8, which is the 3-by-3 stride).
+    m.data[0] = diag.data[0];
+    m.data[5] = diag.data[1];
+    m.data[10] = diag.data[2];
+
+    return m;
+  }
+
+  /// Constructs a 3-by-4 matrix that is zero everywhere except on its main
+  /// diagonal, whose elements (0,0), (1,1), (2,2) are taken from the 1-by-3
+  /// vector `diag`.
+  CUTLASS_HOST_DEVICE
+  static Matrix from_diagonal(Matrix<Element, 1, 3> const &diag) {
+    Matrix m;
+
+    // Row-major storage with a row stride of 4: element (i, i) is at
+    // offset i * 4 + i = 0, 5, 10 (not 0, 4, 8, which is the 3-by-3 stride).
+    m.data[0] = diag.data[0];
+    m.data[5] = diag.data[1];
+    m.data[10] = diag.data[2];
+
+    return m;
+  }
+
+  /// Returns the main-diagonal elements (0,0), (1,1), (2,2) as a 3-by-1
+  /// vector.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 1> diagonal() const {
+    Matrix<Element, 3, 1> diag;
+
+    // Row-major storage with a row stride of 4: element (i, i) is at
+    // offset i * 4 + i = 0, 5, 10 (not 0, 4, 8, which is the 3-by-3 stride).
+    diag.data[0] = data[0];
+    diag.data[1] = data[5];
+    diag.data[2] = data[10];
+
+    return diag;
+  }
+    
+  /// Returns the 4-by-3 transpose: element (i, j) of this matrix (offset
+  /// i * 4 + j) becomes element (j, i) of the result (offset j * 3 + i).
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 3> transpose() const {
+    Matrix<Element, 4, 3> transposed;
+    for (int row_idx = 0; row_idx < 3; ++row_idx) {
+      for (int col_idx = 0; col_idx < 4; ++col_idx) {
+        transposed.data[col_idx * 3 + row_idx] = data[row_idx * 4 + col_idx];
+      }
+    }
+    return transposed;
+  }
+    
+  /// Returns (by value) the element at row `i`, column `j`.
+  /// The storage is row-major with four columns per row, so the linear
+  /// offset is i * 4 + j (the same indexing the slice methods use).
+  CUTLASS_HOST_DEVICE
+  Element at(int i, int j) const {
+    return data[i * 4 + j];
+  }
+
+  /// Returns a mutable reference to the element at row `i`, column `j`.
+  /// The storage is row-major with four columns per row, so the linear
+  /// offset is i * 4 + j (the same indexing the slice methods use).
+  CUTLASS_HOST_DEVICE
+  Element & at(int i, int j) {
+    return data[i * 4 + j];
+  }
+
+  /// Returns (by value) the element at coord = (row, column); forwards to
+  /// at(int, int).
+  CUTLASS_HOST_DEVICE
+  Element at(Coord<2> const &coord) const {
+    int const row_idx = coord[0];
+    int const col_idx = coord[1];
+    return at(row_idx, col_idx);
+  }
+
+  /// Returns a mutable reference to the element at coord = (row, column);
+  /// forwards to at(int, int).
+  CUTLASS_HOST_DEVICE
+  Element & at(Coord<2> const &coord) {
+    int const row_idx = coord[0];
+    int const col_idx = coord[1];
+    return at(row_idx, col_idx);
+  }
+
+  /// Returns a mutable reference to the element at linear storage offset
+  /// `offset` (row-major order).
+  CUTLASS_HOST_DEVICE
+  Element &at(int offset) {
+    Element &element_ref = data[offset];
+    return element_ref;
+  }
+
+  /// Returns (by value) the element at linear storage offset `offset`
+  /// (row-major order).
+  CUTLASS_HOST_DEVICE
+  Element at(int offset) const {
+    Element const element_value = data[offset];
+    return element_value;
+  }
+
+  /// Subscript by coord = (row, column), returning the element by value;
+  /// forwards to at(int, int).
+  CUTLASS_HOST_DEVICE
+  Element operator[](Coord<2> const &coord) const {
+    int const row_idx = coord[0];
+    int const col_idx = coord[1];
+    return at(row_idx, col_idx);
+  }
+
+  /// Subscript by coord = (row, column), returning a mutable reference;
+  /// forwards to at(int, int).
+  CUTLASS_HOST_DEVICE
+  Element & operator[](Coord<2> const &coord) {
+    int const row_idx = coord[0];
+    int const col_idx = coord[1];
+    return at(row_idx, col_idx);
+  }
+
+  /// Subscript by linear storage offset, returning a mutable reference.
+  CUTLASS_HOST_DEVICE
+  Element & operator[](int offset) {
+    Element &element_ref = data[offset];
+    return element_ref;
+  }
+
+  /// Subscript by linear storage offset, returning the element by value.
+  CUTLASS_HOST_DEVICE
+  Element operator[](int offset) const {
+    Element const element_value = data[offset];
+    return element_value;
+  }
+  
+  /// Copies out the 1-by-2 submatrix whose top-left element is (i, j).
+  /// No bounds checking is performed.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 2> slice_1x2(int i = 0, int j = 0) const {
+    int const base = i * 4 + j;  // offset of the slice's top-left element
+
+    Matrix<Element, 1, 2> sub;
+    sub.data[0] = data[base];
+    sub.data[1] = data[base + 1];
+    return sub;
+  }
+
+  /// Writes the 1-by-2 matrix `m` into this matrix with its top-left element
+  /// at (i, j); returns *this. No bounds checking is performed.
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_1x2(Matrix<Element, 1, 2> const &m, int i = 0, int j = 0) {
+    int const base = i * 4 + j;  // offset of the slice's top-left element
+
+    data[base] = m.data[0];
+    data[base + 1] = m.data[1];
+    return *this;
+  }
+    
+  /// Copies out the 1-by-3 submatrix whose top-left element is (i, j).
+  /// No bounds checking is performed.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 3> slice_1x3(int i = 0, int j = 0) const {
+    int const base = i * 4 + j;  // offset of the slice's top-left element
+
+    Matrix<Element, 1, 3> sub;
+    for (int c = 0; c < 3; ++c) {
+      sub.data[c] = data[base + c];
+    }
+    return sub;
+  }
+
+  /// Writes the 1-by-3 matrix `m` into this matrix with its top-left element
+  /// at (i, j); returns *this. No bounds checking is performed.
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_1x3(Matrix<Element, 1, 3> const &m, int i = 0, int j = 0) {
+    int const base = i * 4 + j;  // offset of the slice's top-left element
+
+    for (int c = 0; c < 3; ++c) {
+      data[base + c] = m.data[c];
+    }
+    return *this;
+  }
+    
+  /// Copies out the 1-by-4 submatrix whose top-left element is (i, j).
+  /// No bounds checking is performed.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 4> slice_1x4(int i = 0, int j = 0) const {
+    int const base = i * 4 + j;  // offset of the slice's top-left element
+
+    Matrix<Element, 1, 4> sub;
+    for (int c = 0; c < 4; ++c) {
+      sub.data[c] = data[base + c];
+    }
+    return sub;
+  }
+
+  /// Writes the 1-by-4 matrix `m` into this matrix with its top-left element
+  /// at (i, j); returns *this. No bounds checking is performed.
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_1x4(Matrix<Element, 1, 4> const &m, int i = 0, int j = 0) {
+    int const base = i * 4 + j;  // offset of the slice's top-left element
+
+    for (int c = 0; c < 4; ++c) {
+      data[base + c] = m.data[c];
+    }
+    return *this;
+  }
+    
+  /// Returns row `i` as a 1-by-4 matrix (slice_1x4 starting at column 0).
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 4> row(int i) const {
+    return this->slice_1x4(i, 0);
+  }
+
+  /// Overwrites row `i` with the 1-by-4 matrix `v`; returns *this.
+  CUTLASS_HOST_DEVICE
+  Matrix &set_row(Matrix<Element, 1, 4> const &v, int i = 0) {
+    return this->set_slice_1x4(v, i, 0);
+  }
+    
+  /// Copies out the 2-by-1 submatrix whose top-left element is (i, j).
+  /// No bounds checking is performed.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 1> slice_2x1(int i = 0, int j = 0) const {
+    int const base = i * 4 + j;  // offset of the slice's top-left element
+
+    Matrix<Element, 2, 1> sub;
+    sub.data[0] = data[base];
+    sub.data[1] = data[base + 4];  // next row, same column
+    return sub;
+  }
+
+  /// Writes the 2-by-1 matrix `m` into this matrix with its top-left element
+  /// at (i, j); returns *this. No bounds checking is performed.
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_2x1(Matrix<Element, 2, 1> const &m, int i = 0, int j = 0) {
+    int const base = i * 4 + j;  // offset of the slice's top-left element
+
+    data[base] = m.data[0];
+    data[base + 4] = m.data[1];  // next row, same column
+    return *this;
+  }
+    
+  /// Copies out the 2-by-2 submatrix whose top-left element is (i, j).
+  /// No bounds checking is performed.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 2> slice_2x2(int i = 0, int j = 0) const {
+    int const base = i * 4 + j;  // offset of the slice's top-left element
+
+    Matrix<Element, 2, 2> sub;
+    for (int r = 0; r < 2; ++r) {
+      for (int c = 0; c < 2; ++c) {
+        sub.data[r * 2 + c] = data[base + r * 4 + c];
+      }
+    }
+    return sub;
+  }
+
+  /// Writes the 2-by-2 matrix `m` into this matrix with its top-left element
+  /// at (i, j); returns *this. No bounds checking is performed.
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_2x2(Matrix<Element, 2, 2> const &m, int i = 0, int j = 0) {
+    int const base = i * 4 + j;  // offset of the slice's top-left element
+
+    for (int r = 0; r < 2; ++r) {
+      for (int c = 0; c < 2; ++c) {
+        data[base + r * 4 + c] = m.data[r * 2 + c];
+      }
+    }
+    return *this;
+  }
+    
+  /// Copies out the 2-by-3 submatrix whose top-left element is (i, j).
+  /// No bounds checking is performed.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 3> slice_2x3(int i = 0, int j = 0) const {
+    int const base = i * 4 + j;  // offset of the slice's top-left element
+
+    Matrix<Element, 2, 3> sub;
+    for (int r = 0; r < 2; ++r) {
+      for (int c = 0; c < 3; ++c) {
+        sub.data[r * 3 + c] = data[base + r * 4 + c];
+      }
+    }
+    return sub;
+  }
+
+  /// Writes the 2-by-3 matrix `m` into this matrix with its top-left element
+  /// at (i, j); returns *this. No bounds checking is performed.
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_2x3(Matrix<Element, 2, 3> const &m, int i = 0, int j = 0) {
+    int const base = i * 4 + j;  // offset of the slice's top-left element
+
+    for (int r = 0; r < 2; ++r) {
+      for (int c = 0; c < 3; ++c) {
+        data[base + r * 4 + c] = m.data[r * 3 + c];
+      }
+    }
+    return *this;
+  }
+    
+  /// Copies out the 2-by-4 submatrix whose top-left element is (i, j).
+  /// No bounds checking is performed.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 4> slice_2x4(int i = 0, int j = 0) const {
+    int const base = i * 4 + j;  // offset of the slice's top-left element
+
+    Matrix<Element, 2, 4> sub;
+    for (int k = 0; k < 8; ++k) {
+      // Source and destination share a row stride of 4, so offsets line up.
+      sub.data[k] = data[base + k];
+    }
+    return sub;
+  }
+
+  /// Writes the 2-by-4 matrix `m` into this matrix with its top-left element
+  /// at (i, j); returns *this. No bounds checking is performed.
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_2x4(Matrix<Element, 2, 4> const &m, int i = 0, int j = 0) {
+    int const base = i * 4 + j;  // offset of the slice's top-left element
+
+    for (int k = 0; k < 8; ++k) {
+      // Source and destination share a row stride of 4, so offsets line up.
+      data[base + k] = m.data[k];
+    }
+    return *this;
+  }
+    
+  /// Copies out the 3-by-1 submatrix whose top-left element is (i, j).
+  /// No bounds checking is performed.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 1> slice_3x1(int i = 0, int j = 0) const {
+    int const base = i * 4 + j;  // offset of the slice's top-left element
+
+    Matrix<Element, 3, 1> sub;
+    for (int r = 0; r < 3; ++r) {
+      sub.data[r] = data[base + r * 4];
+    }
+    return sub;
+  }
+
+  /// Writes the 3-by-1 matrix `m` into this matrix with its top-left element
+  /// at (i, j); returns *this. No bounds checking is performed.
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_3x1(Matrix<Element, 3, 1> const &m, int i = 0, int j = 0) {
+    int const base = i * 4 + j;  // offset of the slice's top-left element
+
+    for (int r = 0; r < 3; ++r) {
+      data[base + r * 4] = m.data[r];
+    }
+    return *this;
+  }
+    
+  /// Returns column `j` as a 3-by-1 matrix (slice_3x1 starting at row 0).
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 1> column(int j) const {
+    return this->slice_3x1(0, j);
+  }
+
+  /// Overwrites column `j` with the 3-by-1 matrix `v`; returns *this.
+  CUTLASS_HOST_DEVICE
+  Matrix &set_column(Matrix<Element, 3, 1> const &v, int j =0) {
+    return this->set_slice_3x1(v, 0, j);
+  }
+    
+  /// Copies out the 3-by-2 submatrix whose top-left element is (i, j).
+  /// No bounds checking is performed.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 2> slice_3x2(int i = 0, int j = 0) const {
+    int const base = i * 4 + j;  // offset of the slice's top-left element
+
+    Matrix<Element, 3, 2> sub;
+    for (int r = 0; r < 3; ++r) {
+      for (int c = 0; c < 2; ++c) {
+        sub.data[r * 2 + c] = data[base + r * 4 + c];
+      }
+    }
+    return sub;
+  }
+
+  /// Writes the 3-by-2 matrix `m` into this matrix with its top-left element
+  /// at (i, j); returns *this. No bounds checking is performed.
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_3x2(Matrix<Element, 3, 2> const &m, int i = 0, int j = 0) {
+    int const base = i * 4 + j;  // offset of the slice's top-left element
+
+    for (int r = 0; r < 3; ++r) {
+      for (int c = 0; c < 2; ++c) {
+        data[base + r * 4 + c] = m.data[r * 2 + c];
+      }
+    }
+    return *this;
+  }
+    
+  /// Copies out the 3-by-3 submatrix whose top-left element is (i, j).
+  /// No bounds checking is performed.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 3> slice_3x3(int i = 0, int j = 0) const {
+    int const base = i * 4 + j;  // offset of the slice's top-left element
+
+    Matrix<Element, 3, 3> sub;
+    for (int r = 0; r < 3; ++r) {
+      for (int c = 0; c < 3; ++c) {
+        sub.data[r * 3 + c] = data[base + r * 4 + c];
+      }
+    }
+    return sub;
+  }
+
+  /// Writes the 3-by-3 matrix `m` into this matrix with its top-left element
+  /// at (i, j); returns *this. No bounds checking is performed.
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_3x3(Matrix<Element, 3, 3> const &m, int i = 0, int j = 0) {
+    int const base = i * 4 + j;  // offset of the slice's top-left element
+
+    for (int r = 0; r < 3; ++r) {
+      for (int c = 0; c < 3; ++c) {
+        data[base + r * 4 + c] = m.data[r * 3 + c];
+      }
+    }
+    return *this;
+  }
+    
+  /// Copies out the 3-by-4 submatrix whose top-left element is (i, j).
+  /// The slice spans twelve consecutive offsets starting at i * 4 + j, so
+  /// only i = 0, j = 0 stays within the twelve stored elements. No bounds
+  /// checking is performed.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 4> slice_3x4(int i = 0, int j = 0) const {
+    int const base = i * 4 + j;  // offset of the slice's top-left element
+
+    Matrix<Element, 3, 4> sub;
+    for (int k = 0; k < 12; ++k) {
+      sub.data[k] = data[base + k];
+    }
+    return sub;
+  }
+
+  /// Writes the 3-by-4 matrix `m` into this matrix with its top-left element
+  /// at (i, j); returns *this. The write spans twelve consecutive offsets
+  /// starting at i * 4 + j, so only i = 0, j = 0 stays within the twelve
+  /// stored elements. No bounds checking is performed.
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_3x4(Matrix<Element, 3, 4> const &m, int i = 0, int j = 0) {
+    int const base = i * 4 + j;  // offset of the slice's top-left element
+
+    for (int k = 0; k < 12; ++k) {
+      data[base + k] = m.data[k];
+    }
+    return *this;
+  }
+    
+  /// Horizontal concatenation [lhs | rhs]: column 0 comes from the 3-by-1
+  /// matrix lhs, columns 1..3 from the 3-by-3 matrix rhs.
+  CUTLASS_HOST_DEVICE
+  static Matrix hcat(Matrix<Element, 3, 1> const & lhs, Matrix<Element, 3, 3> const & rhs) {
+    return Matrix(
+      lhs.at(0, 0), rhs.at(0, 0), rhs.at(0, 1), rhs.at(0, 2),   // row 0
+      lhs.at(1, 0), rhs.at(1, 0), rhs.at(1, 1), rhs.at(1, 2),   // row 1
+      lhs.at(2, 0), rhs.at(2, 0), rhs.at(2, 1), rhs.at(2, 2));  // row 2
+  }
+  
+  /// Horizontal concatenation [lhs | rhs]: columns 0..1 come from the 3-by-2
+  /// matrix lhs, columns 2..3 from the 3-by-2 matrix rhs.
+  CUTLASS_HOST_DEVICE
+  static Matrix hcat(Matrix<Element, 3, 2> const & lhs, Matrix<Element, 3, 2> const & rhs) {
+    return Matrix(
+      lhs.at(0, 0), lhs.at(0, 1), rhs.at(0, 0), rhs.at(0, 1),   // row 0
+      lhs.at(1, 0), lhs.at(1, 1), rhs.at(1, 0), rhs.at(1, 1),   // row 1
+      lhs.at(2, 0), lhs.at(2, 1), rhs.at(2, 0), rhs.at(2, 1));  // row 2
+  }
+  
+  /// Horizontal concatenation [lhs | rhs]: columns 0..2 come from the 3-by-3
+  /// matrix lhs, column 3 from the 3-by-1 matrix rhs.
+  CUTLASS_HOST_DEVICE
+  static Matrix hcat(Matrix<Element, 3, 3> const & lhs, Matrix<Element, 3, 1> const & rhs) {
+    return Matrix(
+      lhs.at(0, 0), lhs.at(0, 1), lhs.at(0, 2), rhs.at(0, 0),   // row 0
+      lhs.at(1, 0), lhs.at(1, 1), lhs.at(1, 2), rhs.at(1, 0),   // row 1
+      lhs.at(2, 0), lhs.at(2, 1), lhs.at(2, 2), rhs.at(2, 0));  // row 2
+  }
+  
+  /// Vertical concatenation: row 0 comes from the 1-by-4 matrix `upper`,
+  /// rows 1..2 from the 2-by-4 matrix `lower`.
+  CUTLASS_HOST_DEVICE
+  static Matrix vcat(Matrix<Element, 1, 4> const & upper, Matrix<Element, 2, 4> const & lower) {
+    return Matrix(
+      upper.at(0, 0), upper.at(0, 1), upper.at(0, 2), upper.at(0, 3),   // row 0
+      lower.at(0, 0), lower.at(0, 1), lower.at(0, 2), lower.at(0, 3),   // row 1
+      lower.at(1, 0), lower.at(1, 1), lower.at(1, 2), lower.at(1, 3));  // row 2
+  }
+  
+  /// Vertical concatenation: rows 0..1 come from the 2-by-4 matrix `upper`,
+  /// row 2 from the 1-by-4 matrix `lower`.
+  CUTLASS_HOST_DEVICE
+  static Matrix vcat(Matrix<Element, 2, 4> const & upper, Matrix<Element, 1, 4> const & lower) {
+    return Matrix(
+      upper.at(0, 0), upper.at(0, 1), upper.at(0, 2), upper.at(0, 3),   // row 0
+      upper.at(1, 0), upper.at(1, 1), upper.at(1, 2), upper.at(1, 3),   // row 1
+      lower.at(0, 0), lower.at(0, 1), lower.at(0, 2), lower.at(0, 3));  // row 2
+  }
+  
+  /// Stacks this 3-by-4 matrix on top of the 1-by-4 matrix `rhs`, producing a
+  /// 4-by-4 matrix; forwards to Matrix<Element, 4, 4>::vcat.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 4> vcat(Matrix<Element, 1, 4> const & rhs) const {
+    Matrix const &upper = *this;
+    return Matrix<Element, 4, 4>::vcat(upper, rhs);
+  }
+    
+  /// Assembles a 3-by-4 matrix from the block layout
+  ///   [ A (scalar)  B (1x3) ]
+  ///   [ C (2x1)     D (2x3) ]
+  CUTLASS_HOST_DEVICE
+  static Matrix block(
+    Element                         A, Matrix<Element, 1, 3> const & B,
+    Matrix<Element, 2, 1> const & C, Matrix<Element, 2, 3> const & D) {
+    return Matrix(
+      A,          B.at(0, 0), B.at(0, 1), B.at(0, 2),   // row 0
+      C.at(0, 0), D.at(0, 0), D.at(0, 1), D.at(0, 2),   // row 1
+      C.at(1, 0), D.at(1, 0), D.at(1, 1), D.at(1, 2));  // row 2
+  }
+  
+  /// Assembles a 3-by-4 matrix from the block layout
+  ///   [ A (1x2)  B (1x2) ]
+  ///   [ C (2x2)  D (2x2) ]
+  CUTLASS_HOST_DEVICE
+  static Matrix block(
+    Matrix<Element, 1, 2> const & A, Matrix<Element, 1, 2> const & B,
+    Matrix<Element, 2, 2> const & C, Matrix<Element, 2, 2> const & D) {
+    return Matrix(
+      A.at(0, 0), A.at(0, 1), B.at(0, 0), B.at(0, 1),   // row 0
+      C.at(0, 0), C.at(0, 1), D.at(0, 0), D.at(0, 1),   // row 1
+      C.at(1, 0), C.at(1, 1), D.at(1, 0), D.at(1, 1));  // row 2
+  }
+  
+  /// Assembles a 3-by-4 matrix from the block layout
+  ///   [ A (1x3)  B (scalar) ]
+  ///   [ C (2x3)  D (2x1)    ]
+  CUTLASS_HOST_DEVICE
+  static Matrix block(
+    Matrix<Element, 1, 3> const & A, Element                         B,
+    Matrix<Element, 2, 3> const & C, Matrix<Element, 2, 1> const & D) {
+    return Matrix(
+      A.at(0, 0), A.at(0, 1), A.at(0, 2), B,            // row 0
+      C.at(0, 0), C.at(0, 1), C.at(0, 2), D.at(0, 0),   // row 1
+      C.at(1, 0), C.at(1, 1), C.at(1, 2), D.at(1, 0));  // row 2
+  }
+  
+  /// Assembles a 3-by-4 matrix from the block layout
+  ///   [ A (2x1)     B (2x3) ]
+  ///   [ C (scalar)  D (1x3) ]
+  CUTLASS_HOST_DEVICE
+  static Matrix block(
+    Matrix<Element, 2, 1> const & A, Matrix<Element, 2, 3> const & B,
+    Element                         C, Matrix<Element, 1, 3> const & D) {
+    return Matrix(
+      A.at(0, 0), B.at(0, 0), B.at(0, 1), B.at(0, 2),   // row 0
+      A.at(1, 0), B.at(1, 0), B.at(1, 1), B.at(1, 2),   // row 1
+      C,          D.at(0, 0), D.at(0, 1), D.at(0, 2));  // row 2
+  }
+  
+  /// Assembles a 3-by-4 matrix from the block layout
+  ///   [ A (2x2)  B (2x2) ]
+  ///   [ C (1x2)  D (1x2) ]
+  CUTLASS_HOST_DEVICE
+  static Matrix block(
+    Matrix<Element, 2, 2> const & A, Matrix<Element, 2, 2> const & B,
+    Matrix<Element, 1, 2> const & C, Matrix<Element, 1, 2> const & D) {
+    return Matrix(
+      A.at(0, 0), A.at(0, 1), B.at(0, 0), B.at(0, 1),   // row 0
+      A.at(1, 0), A.at(1, 1), B.at(1, 0), B.at(1, 1),   // row 1
+      C.at(0, 0), C.at(0, 1), D.at(0, 0), D.at(0, 1));  // row 2
+  }
+  
+  /// Assembles a 3-by-4 matrix from the block layout
+  ///   [ A (2x3)  B (2x1)    ]
+  ///   [ C (1x3)  D (scalar) ]
+  CUTLASS_HOST_DEVICE
+  static Matrix block(
+    Matrix<Element, 2, 3> const & A, Matrix<Element, 2, 1> const & B,
+    Matrix<Element, 1, 3> const & C, Element                         D) {
+    return Matrix(
+      A.at(0, 0), A.at(0, 1), A.at(0, 2), B.at(0, 0),   // row 0
+      A.at(1, 0), A.at(1, 1), A.at(1, 2), B.at(1, 0),   // row 1
+      C.at(0, 0), C.at(0, 1), C.at(0, 2), D);           // row 2
+  }
+  
+  /// Returns the elementwise sum of this 3-by-4 matrix and rhs; neither
+  /// operand is modified.
+  CUTLASS_HOST_DEVICE
+  Matrix add(Matrix const &rhs) const {
+    Matrix total;
+    for (int idx = 0; idx < 12; ++idx) {
+      total.data[idx] = data[idx] + rhs.data[idx];
+    }
+    return total;
+  }
+      
+  /// Operator form of add(): elementwise sum of two 3-by-4 matrices
+  CUTLASS_HOST_DEVICE
+  Matrix operator +(Matrix const &rhs) const {
+    return this->add(rhs);
+  }
+
+  /// Elementwise add operator (3-by-4)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator +=(Matrix const &rhs) {
+    
+    data[0] += rhs.data[0];
+    data[1] += rhs.data[1];
+    data[2] += rhs.data[2];
+    data[3] += rhs.data[3];
+
+    data[4] += rhs.data[4];
+    data[5] += rhs.data[5];
+    data[6] += rhs.data[6];
+    data[7] += rhs.data[7];
+
+    data[8] += rhs.data[8];
+    data[9] += rhs.data[9];
+    data[10] += rhs.data[10];
+    data[11] += rhs.data[11];
+
+    return *this;
+  }
+        
+  /// Elementwise subtract operator (3-by-4)
+  CUTLASS_HOST_DEVICE
+  Matrix subtract(Matrix const &rhs) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] - rhs.data[0];
+    result.data[1] = data[1] - rhs.data[1];
+    result.data[2] = data[2] - rhs.data[2];
+    result.data[3] = data[3] - rhs.data[3];
+
+    result.data[4] = data[4] - rhs.data[4];
+    result.data[5] = data[5] - rhs.data[5];
+    result.data[6] = data[6] - rhs.data[6];
+    result.data[7] = data[7] - rhs.data[7];
+
+    result.data[8] = data[8] - rhs.data[8];
+    result.data[9] = data[9] - rhs.data[9];
+    result.data[10] = data[10] - rhs.data[10];
+    result.data[11] = data[11] - rhs.data[11];
+
+    return result;
+  }
+      
+  /// Elementwise subtract operator (3-by-4)
+  CUTLASS_HOST_DEVICE
+  Matrix operator -(Matrix const &rhs) const {
+    return subtract(rhs);
+  }
+
+  /// Elementwise subtract operator (3-by-4)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator -=(Matrix const &rhs) {
+    
+    data[0] -= rhs.data[0];
+    data[1] -= rhs.data[1];
+    data[2] -= rhs.data[2];
+    data[3] -= rhs.data[3];
+
+    data[4] -= rhs.data[4];
+    data[5] -= rhs.data[5];
+    data[6] -= rhs.data[6];
+    data[7] -= rhs.data[7];
+
+    data[8] -= rhs.data[8];
+    data[9] -= rhs.data[9];
+    data[10] -= rhs.data[10];
+    data[11] -= rhs.data[11];
+
+    return *this;
+  }
+        
+  /// Elementwise multiply operator (3-by-4)
+  CUTLASS_HOST_DEVICE
+  Matrix multiply(Matrix const &rhs) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] * rhs.data[0];
+    result.data[1] = data[1] * rhs.data[1];
+    result.data[2] = data[2] * rhs.data[2];
+    result.data[3] = data[3] * rhs.data[3];
+
+    result.data[4] = data[4] * rhs.data[4];
+    result.data[5] = data[5] * rhs.data[5];
+    result.data[6] = data[6] * rhs.data[6];
+    result.data[7] = data[7] * rhs.data[7];
+
+    result.data[8] = data[8] * rhs.data[8];
+    result.data[9] = data[9] * rhs.data[9];
+    result.data[10] = data[10] * rhs.data[10];
+    result.data[11] = data[11] * rhs.data[11];
+
+    return result;
+  }
+      
+  /// Scalar multiply operator (3-by-4)
+  CUTLASS_HOST_DEVICE
+  Matrix multiply(Element const &s) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] * s;
+    result.data[1] = data[1] * s;
+    result.data[2] = data[2] * s;
+    result.data[3] = data[3] * s;
+
+    result.data[4] = data[4] * s;
+    result.data[5] = data[5] * s;
+    result.data[6] = data[6] * s;
+    result.data[7] = data[7] * s;
+
+    result.data[8] = data[8] * s;
+    result.data[9] = data[9] * s;
+    result.data[10] = data[10] * s;
+    result.data[11] = data[11] * s;
+
+    return result;
+  }
+
+  /// Scalar multiply operator (3-by-4)
+  CUTLASS_HOST_DEVICE
+  Matrix operator *(Element const &s) const {
+    return multiply(s);
+  }
+
+  /// Scalar multiply operator (3-by-4)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator *=(Element const &s) {
+    
+    data[0] *= s;
+    data[1] *= s;
+    data[2] *= s;
+    data[3] *= s;
+
+    data[4] *= s;
+    data[5] *= s;
+    data[6] *= s;
+    data[7] *= s;
+
+    data[8] *= s;
+    data[9] *= s;
+    data[10] *= s;
+    data[11] *= s;
+
+    return *this;
+  }
+        
+  /// Elementwise divide operator (3-by-4)
+  CUTLASS_HOST_DEVICE
+  Matrix divide(Matrix const &rhs) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] / rhs.data[0];
+    result.data[1] = data[1] / rhs.data[1];
+    result.data[2] = data[2] / rhs.data[2];
+    result.data[3] = data[3] / rhs.data[3];
+
+    result.data[4] = data[4] / rhs.data[4];
+    result.data[5] = data[5] / rhs.data[5];
+    result.data[6] = data[6] / rhs.data[6];
+    result.data[7] = data[7] / rhs.data[7];
+
+    result.data[8] = data[8] / rhs.data[8];
+    result.data[9] = data[9] / rhs.data[9];
+    result.data[10] = data[10] / rhs.data[10];
+    result.data[11] = data[11] / rhs.data[11];
+
+    return result;
+  }
+      
+  /// Scalar divide operator (3-by-4)
+  CUTLASS_HOST_DEVICE
+  Matrix divide(Element const &s) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] / s;
+    result.data[1] = data[1] / s;
+    result.data[2] = data[2] / s;
+    result.data[3] = data[3] / s;
+
+    result.data[4] = data[4] / s;
+    result.data[5] = data[5] / s;
+    result.data[6] = data[6] / s;
+    result.data[7] = data[7] / s;
+
+    result.data[8] = data[8] / s;
+    result.data[9] = data[9] / s;
+    result.data[10] = data[10] / s;
+    result.data[11] = data[11] / s;
+
+    return result;
+  }
+
+  /// Scalar divide operator (3-by-4)
+  CUTLASS_HOST_DEVICE
+  Matrix operator /(Element const &s) const {
+    return divide(s);
+  }
+
+  /// Scalar divide operator (3-by-4)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator /=(Element const &s) {
+    
+    data[0] /= s;
+    data[1] /= s;
+    data[2] /= s;
+    data[3] /= s;
+
+    data[4] /= s;
+    data[5] /= s;
+    data[6] /= s;
+    data[7] /= s;
+
+    data[8] /= s;
+    data[9] /= s;
+    data[10] /= s;
+    data[11] /= s;
+
+    return *this;
+  }
+        
+  /// Elementwise divide operator (3-by-4)
+  CUTLASS_HOST_DEVICE
+  Matrix operator /(Matrix const &rhs) const {
+    return divide(rhs);
+  }
+
+  /// Elementwise divide operator (3-by-4)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator /=(Matrix const &rhs) {
+    
+    data[0] /= rhs.data[0];
+    data[1] /= rhs.data[1];
+    data[2] /= rhs.data[2];
+    data[3] /= rhs.data[3];
+
+    data[4] /= rhs.data[4];
+    data[5] /= rhs.data[5];
+    data[6] /= rhs.data[6];
+    data[7] /= rhs.data[7];
+
+    data[8] /= rhs.data[8];
+    data[9] /= rhs.data[9];
+    data[10] /= rhs.data[10];
+    data[11] /= rhs.data[11];
+
+    return *this;
+  }
+        
+  /// Returns a matrix whose elements are the negation of this matrix's elements
+  CUTLASS_HOST_DEVICE
+  Matrix operator-() const {
+    Matrix m;
+    // Negate this matrix's elements; reading m's own freshly constructed elements would discard the operand.
+    m.data[0] = -data[0];
+    m.data[1] = -data[1];
+    m.data[2] = -data[2];
+    m.data[3] = -data[3];
+    m.data[4] = -data[4];
+    m.data[5] = -data[5];
+    m.data[6] = -data[6];
+    m.data[7] = -data[7];
+    m.data[8] = -data[8];
+    m.data[9] = -data[9];
+    m.data[10] = -data[10];
+    m.data[11] = -data[11];
+
+    return m;
+  }
+  
+  /// Matrix product of size 3-by-1-by-4
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 1> product(
+    Matrix<Element, 4, 1> const &rhs,
+    Matrix<Element, 3, 1> accum = Matrix<Element, 3, 1>()
+  ) const {
+    
+    // k=0
+    accum.data[0] += data[0] * rhs.data[0];
+    accum.data[1] += data[4] * rhs.data[0];
+    accum.data[2] += data[8] * rhs.data[0];
+
+    // k=1
+    accum.data[0] += data[1] * rhs.data[1];
+    accum.data[1] += data[5] * rhs.data[1];
+    accum.data[2] += data[9] * rhs.data[1];
+
+    // k=2
+    accum.data[0] += data[2] * rhs.data[2];
+    accum.data[1] += data[6] * rhs.data[2];
+    accum.data[2] += data[10] * rhs.data[2];
+
+    // k=3
+    accum.data[0] += data[3] * rhs.data[3];
+    accum.data[1] += data[7] * rhs.data[3];
+    accum.data[2] += data[11] * rhs.data[3];
+
+    return accum;
+  }
+
+  /// Operator form of product(): multiplies this 3-by-4 matrix by a 4-by-1 matrix, yielding a 3-by-1 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 1> operator*(Matrix<Element, 4, 1> const &rhs) const {
+    return this->product(rhs);
+  }
+  
+  /// Matrix product of size 3-by-2-by-4
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 2> product(
+    Matrix<Element, 4, 2> const &rhs,
+    Matrix<Element, 3, 2> accum = Matrix<Element, 3, 2>()
+  ) const {
+    
+    // k=0
+    accum.data[0] += data[0] * rhs.data[0];
+    accum.data[1] += data[0] * rhs.data[1];
+    accum.data[2] += data[4] * rhs.data[0];
+    accum.data[3] += data[4] * rhs.data[1];
+    accum.data[4] += data[8] * rhs.data[0];
+    accum.data[5] += data[8] * rhs.data[1];
+
+    // k=1
+    accum.data[0] += data[1] * rhs.data[2];
+    accum.data[1] += data[1] * rhs.data[3];
+    accum.data[2] += data[5] * rhs.data[2];
+    accum.data[3] += data[5] * rhs.data[3];
+    accum.data[4] += data[9] * rhs.data[2];
+    accum.data[5] += data[9] * rhs.data[3];
+
+    // k=2
+    accum.data[0] += data[2] * rhs.data[4];
+    accum.data[1] += data[2] * rhs.data[5];
+    accum.data[2] += data[6] * rhs.data[4];
+    accum.data[3] += data[6] * rhs.data[5];
+    accum.data[4] += data[10] * rhs.data[4];
+    accum.data[5] += data[10] * rhs.data[5];
+
+    // k=3
+    accum.data[0] += data[3] * rhs.data[6];
+    accum.data[1] += data[3] * rhs.data[7];
+    accum.data[2] += data[7] * rhs.data[6];
+    accum.data[3] += data[7] * rhs.data[7];
+    accum.data[4] += data[11] * rhs.data[6];
+    accum.data[5] += data[11] * rhs.data[7];
+
+    return accum;
+  }
+
+  /// Matrix product of size 3-by-2-by-4
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 2> operator*(Matrix<Element, 4, 2> const &rhs) const {
+    return product(rhs);
+  }
+  
+  /// Matrix product of size 3-by-3-by-4
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 3> product(
+    Matrix<Element, 4, 3> const &rhs,
+    Matrix<Element, 3, 3> accum = Matrix<Element, 3, 3>()
+  ) const {
+    
+    // k=0
+    accum.data[0] += data[0] * rhs.data[0];
+    accum.data[1] += data[0] * rhs.data[1];
+    accum.data[2] += data[0] * rhs.data[2];
+    accum.data[3] += data[4] * rhs.data[0];
+    accum.data[4] += data[4] * rhs.data[1];
+    accum.data[5] += data[4] * rhs.data[2];
+    accum.data[6] += data[8] * rhs.data[0];
+    accum.data[7] += data[8] * rhs.data[1];
+    accum.data[8] += data[8] * rhs.data[2];
+
+    // k=1
+    accum.data[0] += data[1] * rhs.data[3];
+    accum.data[1] += data[1] * rhs.data[4];
+    accum.data[2] += data[1] * rhs.data[5];
+    accum.data[3] += data[5] * rhs.data[3];
+    accum.data[4] += data[5] * rhs.data[4];
+    accum.data[5] += data[5] * rhs.data[5];
+    accum.data[6] += data[9] * rhs.data[3];
+    accum.data[7] += data[9] * rhs.data[4];
+    accum.data[8] += data[9] * rhs.data[5];
+
+    // k=2
+    accum.data[0] += data[2] * rhs.data[6];
+    accum.data[1] += data[2] * rhs.data[7];
+    accum.data[2] += data[2] * rhs.data[8];
+    accum.data[3] += data[6] * rhs.data[6];
+    accum.data[4] += data[6] * rhs.data[7];
+    accum.data[5] += data[6] * rhs.data[8];
+    accum.data[6] += data[10] * rhs.data[6];
+    accum.data[7] += data[10] * rhs.data[7];
+    accum.data[8] += data[10] * rhs.data[8];
+
+    // k=3
+    accum.data[0] += data[3] * rhs.data[9];
+    accum.data[1] += data[3] * rhs.data[10];
+    accum.data[2] += data[3] * rhs.data[11];
+    accum.data[3] += data[7] * rhs.data[9];
+    accum.data[4] += data[7] * rhs.data[10];
+    accum.data[5] += data[7] * rhs.data[11];
+    accum.data[6] += data[11] * rhs.data[9];
+    accum.data[7] += data[11] * rhs.data[10];
+    accum.data[8] += data[11] * rhs.data[11];
+
+    return accum;
+  }
+
+  /// Matrix product of size 3-by-3-by-4
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 3> operator*(Matrix<Element, 4, 3> const &rhs) const {
+    return product(rhs);
+  }
+  
+  /// Matrix product of size 3-by-4-by-4
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 4> product(
+    Matrix<Element, 4, 4> const &rhs,
+    Matrix<Element, 3, 4> accum = Matrix<Element, 3, 4>()
+  ) const {
+    
+    // k=0
+    accum.data[0] += data[0] * rhs.data[0];
+    accum.data[1] += data[0] * rhs.data[1];
+    accum.data[2] += data[0] * rhs.data[2];
+    accum.data[3] += data[0] * rhs.data[3];
+    accum.data[4] += data[4] * rhs.data[0];
+    accum.data[5] += data[4] * rhs.data[1];
+    accum.data[6] += data[4] * rhs.data[2];
+    accum.data[7] += data[4] * rhs.data[3];
+    accum.data[8] += data[8] * rhs.data[0];
+    accum.data[9] += data[8] * rhs.data[1];
+    accum.data[10] += data[8] * rhs.data[2];
+    accum.data[11] += data[8] * rhs.data[3];
+
+    // k=1
+    accum.data[0] += data[1] * rhs.data[4];
+    accum.data[1] += data[1] * rhs.data[5];
+    accum.data[2] += data[1] * rhs.data[6];
+    accum.data[3] += data[1] * rhs.data[7];
+    accum.data[4] += data[5] * rhs.data[4];
+    accum.data[5] += data[5] * rhs.data[5];
+    accum.data[6] += data[5] * rhs.data[6];
+    accum.data[7] += data[5] * rhs.data[7];
+    accum.data[8] += data[9] * rhs.data[4];
+    accum.data[9] += data[9] * rhs.data[5];
+    accum.data[10] += data[9] * rhs.data[6];
+    accum.data[11] += data[9] * rhs.data[7];
+
+    // k=2
+    accum.data[0] += data[2] * rhs.data[8];
+    accum.data[1] += data[2] * rhs.data[9];
+    accum.data[2] += data[2] * rhs.data[10];
+    accum.data[3] += data[2] * rhs.data[11];
+    accum.data[4] += data[6] * rhs.data[8];
+    accum.data[5] += data[6] * rhs.data[9];
+    accum.data[6] += data[6] * rhs.data[10];
+    accum.data[7] += data[6] * rhs.data[11];
+    accum.data[8] += data[10] * rhs.data[8];
+    accum.data[9] += data[10] * rhs.data[9];
+    accum.data[10] += data[10] * rhs.data[10];
+    accum.data[11] += data[10] * rhs.data[11];
+
+    // k=3
+    accum.data[0] += data[3] * rhs.data[12];
+    accum.data[1] += data[3] * rhs.data[13];
+    accum.data[2] += data[3] * rhs.data[14];
+    accum.data[3] += data[3] * rhs.data[15];
+    accum.data[4] += data[7] * rhs.data[12];
+    accum.data[5] += data[7] * rhs.data[13];
+    accum.data[6] += data[7] * rhs.data[14];
+    accum.data[7] += data[7] * rhs.data[15];
+    accum.data[8] += data[11] * rhs.data[12];
+    accum.data[9] += data[11] * rhs.data[13];
+    accum.data[10] += data[11] * rhs.data[14];
+    accum.data[11] += data[11] * rhs.data[15];
+
+    return accum;
+  }
+
+  /// Matrix product of size 3-by-4-by-4
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 4> operator*(Matrix<Element, 4, 4> const &rhs) const {
+    return product(rhs);
+  }
+  
+  /// In-place matrix product: replaces this 3-by-4 matrix with its product by the 4-by-4 matrix rhs
+  CUTLASS_HOST_DEVICE
+  Matrix & operator*=(Matrix<Element, 4, 4> const &rhs) {
+    *this = this->product(rhs);
+    return *this;
+  }
+    
+  /// Adds all twelve elements to accum (default-constructed when omitted) and returns the total
+  CUTLASS_HOST_DEVICE
+  Element sum(Element accum = Element()) const {
+
+    accum += this->data[0];
+    accum += this->data[1];
+    accum += this->data[2];
+    accum += this->data[3];
+    accum += this->data[4];
+    accum += this->data[5];
+    accum += this->data[6];
+    accum += this->data[7];
+    accum += this->data[8];
+    accum += this->data[9];
+    accum += this->data[10];
+    accum += this->data[11];
+
+    return accum;
+  }
+
+  /// Adds the square of every element to accum (default-constructed when omitted) and returns the total
+  CUTLASS_HOST_DEVICE
+  Element norm(Element accum = Element()) const {
+
+    accum += this->data[0] * this->data[0];
+    accum += this->data[1] * this->data[1];
+    accum += this->data[2] * this->data[2];
+    accum += this->data[3] * this->data[3];
+    accum += this->data[4] * this->data[4];
+    accum += this->data[5] * this->data[5];
+    accum += this->data[6] * this->data[6];
+    accum += this->data[7] * this->data[7];
+    accum += this->data[8] * this->data[8];
+    accum += this->data[9] * this->data[9];
+    accum += this->data[10] * this->data[10];
+    accum += this->data[11] * this->data[11];
+
+    return accum;
+  }
+
+  /// Returns fast_sqrt() of norm(), i.e. the square root of the sum of squared elements
+  CUTLASS_HOST_DEVICE
+  Element magnitude() const {
+    return fast_sqrt(this->norm());
+  }
+
+  /// Adds the diagonal elements (0,0), (1,1) and (2,2) to accum and returns the total
+  CUTLASS_HOST_DEVICE
+  Element trace(Element accum = Element()) const {
+
+    accum += this->data[0];
+    accum += this->data[5];
+    accum += this->data[10];
+
+    return accum;
+  }
+    
+};
+
+/// Template alias for 3-by-4 matrix
+template <typename Element>
+using Matrix3x4 = Matrix<Element, 3, 4>;
+
+
+/// Free function that builds a 3-by-4 matrix, inferring the element type from its arguments
+template <typename Element>
+CUTLASS_HOST_DEVICE Matrix3x4<Element> make_Matrix3x4(
+    Element _0_0, Element _0_1, Element _0_2, Element _0_3,
+    Element _1_0, Element _1_1, Element _1_2, Element _1_3,
+    Element _2_0, Element _2_1, Element _2_2, Element _2_3
+) {
+  return Matrix3x4<Element>(
+      _0_0, _0_1, _0_2, _0_3,
+      _1_0, _1_1, _1_2, _1_3,
+      _2_0, _2_1, _2_2, _2_3
+  );
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// 4-by-1 matrix template class definition
+template <typename Element_>
+struct Matrix<Element_, 4, 1> {
+
+  //
+  // Type definitions
+  //
+
+  /// Element data type
+  using Element = Element_;
+
+  /// Number of rows in matrix
+  static int const kRows = 4;
+
+  /// Number of columns in matrix
+  static int const kColumns = 1;
+
+  /// Layout of matrix in underlying array
+  using Layout = layout::RowMajor;
+
+  /// Number of elements in matrix
+  static int const kCount = 4;
+
+  //
+  // Data members
+  //
+
+  /// Elements of the matrix in row-major layout
+  Array<Element, kCount> data;
+
+  //
+  // Methods
+  //
+
+  /// Default constructor: clears the underlying element array
+  CUTLASS_HOST_DEVICE
+  Matrix() {
+    this->data.clear();
+  }
+  
+  /// Copy-constructs a 4-by-1 matrix by assigning the element array of rhs
+  CUTLASS_HOST_DEVICE
+  Matrix(Matrix const &rhs) {
+    this->data = rhs.data;
+  }
+    
+  /// Constructs a 4-by-1 matrix from its four scalar elements, listed top to bottom
+  CUTLASS_HOST_DEVICE
+  Matrix(
+    Element _0_0,
+    Element _1_0,
+    Element _2_0,
+    Element _3_0
+  ) {
+
+    this->data[0] = _0_0;
+    this->data[1] = _1_0;
+    this->data[2] = _2_0;
+    this->data[3] = _3_0;
+  }
+    
+  /// Builds a 4-by-1 matrix in which every element equals s
+  CUTLASS_HOST_DEVICE
+  static Matrix uniform(Element s) {
+    Matrix filled;
+
+    filled.data[0] = s;
+    filled.data[1] = s;
+    filled.data[2] = s;
+    filled.data[3] = s;
+
+    return filled;
+  }
+
+  /// Builds a 4-by-1 matrix in which every element is Element(1)
+  CUTLASS_HOST_DEVICE
+  static Matrix ones() {
+    return Matrix::uniform(Element(1));
+  }
+
+  /// Builds a zero 4-by-1 matrix via the default constructor
+  CUTLASS_HOST_DEVICE
+  static Matrix zero() {
+    return Matrix{};
+  }
+  
+  /// Returns the 1-by-4 transpose; the four elements keep their storage order
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 4> transpose() const {
+    Matrix<Element, 1, 4> row;
+
+    row.data[0] = this->data[0];
+    row.data[1] = this->data[1];
+    row.data[2] = this->data[2];
+    row.data[3] = this->data[3];
+
+    return row;
+  }
+    
+  /// Returns the element at row i, column j (row-major offset i * kColumns + j)
+  CUTLASS_HOST_DEVICE
+  Element at(int i, int j) const {
+    return data[i * kColumns + j];  // stride is the column count (1), not the row count
+  }
+
+  /// Returns a reference to the element at row i, column j (row-major offset i * kColumns + j)
+  CUTLASS_HOST_DEVICE
+  Element & at(int i, int j) {
+    return data[i * kColumns + j];  // stride is the column count (1), not the row count
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element at(Coord<2> const &coord) const {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element & at(Coord<2> const &coord) {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element &at(int offset) {
+    return data[offset];
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element at(int offset) const {
+    return data[offset];
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element operator[](Coord<2> const &coord) const {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by coordinate
+  CUTLASS_HOST_DEVICE
+  Element & operator[](Coord<2> const &coord) {
+    return at(coord[0], coord[1]);
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element & operator[](int offset) {
+    return data[offset];
+  }
+
+  /// Accesses an element by offset
+  CUTLASS_HOST_DEVICE
+  Element operator[](int offset) const {
+    return data[offset];
+  }
+  
+  /// Extracts the 2-by-1 submatrix whose first element is at coordinate (i, j)
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 1> slice_2x1(int i = 0, int j = 0) const {
+    Matrix<Element, 2, 1> sub;
+    int const first = i * kColumns + j;  // offset of the slice's first element
+    sub.data[0] = data[first];
+    sub.data[1] = data[first + 1];
+
+    return sub;
+  }
+
+  /// Copies the 2-by-1 matrix m into this matrix starting at coordinate (i, j); returns *this
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_2x1(Matrix<Element, 2, 1> const &m, int i = 0, int j = 0) {
+    int const first = i * kColumns + j;  // offset of the first overwritten element
+    data[first] = m.data[0];
+    data[first + 1] = m.data[1];
+
+    return *this;
+  }
+    
+  /// Extracts the 3-by-1 submatrix whose first element is at coordinate (i, j)
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 1> slice_3x1(int i = 0, int j = 0) const {
+    Matrix<Element, 3, 1> sub;
+    int const first = i * kColumns + j;  // offset of the slice's first element
+    sub.data[0] = data[first];
+    sub.data[1] = data[first + 1];
+    sub.data[2] = data[first + 2];
+
+    return sub;
+  }
+
+  /// Copies the 3-by-1 matrix m into this matrix starting at coordinate (i, j); returns *this
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_3x1(Matrix<Element, 3, 1> const &m, int i = 0, int j = 0) {
+    int const first = i * kColumns + j;  // offset of the first overwritten element
+    data[first] = m.data[0];
+    data[first + 1] = m.data[1];
+    data[first + 2] = m.data[2];
+
+    return *this;
+  }
+    
+  /// Extracts the 4-by-1 submatrix whose first element is at coordinate (i, j)
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 1> slice_4x1(int i = 0, int j = 0) const {
+    Matrix<Element, 4, 1> sub;
+    int const first = i * kColumns + j;  // offset of the slice's first element
+    sub.data[0] = data[first];
+    sub.data[1] = data[first + 1];
+    sub.data[2] = data[first + 2];
+    sub.data[3] = data[first + 3];
+
+    return sub;
+  }
+
+  /// Copies the 4-by-1 matrix m into this matrix starting at coordinate (i, j); returns *this
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_4x1(Matrix<Element, 4, 1> const &m, int i = 0, int j = 0) {
+    int const first = i * kColumns + j;  // offset of the first overwritten element
+    data[first] = m.data[0];
+    data[first + 1] = m.data[1];
+    data[first + 2] = m.data[2];
+    data[first + 3] = m.data[3];
+
+    return *this;
+  }
+    
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 1> column(int j) const {
+    return slice_4x1(0, j);
+  }
+
+  CUTLASS_HOST_DEVICE
+  Matrix &set_column(Matrix<Element, 4, 1> const &v, int j =0) {
+    return set_slice_4x1(v, 0, j);
+  }
+    
+  /// Concatenates this matrix with a 4-by-1 matrix to form a 4-by-2 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 2> hcat(Matrix<Element, 4, 1> const & rhs) const {
+    return Matrix<Element, 4, 2>::hcat(*this, rhs);
+  }
+    
+  /// Concatenates this matrix with a 4-by-2 matrix to form a 4-by-3 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 3> hcat(Matrix<Element, 4, 2> const & rhs) const {
+    return Matrix<Element, 4, 3>::hcat(*this, rhs);
+  }
+    
+  /// Concatenates this matrix with a 4-by-3 matrix to form a 4-by-4 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 4> hcat(Matrix<Element, 4, 3> const & rhs) const {
+    return Matrix<Element, 4, 4>::hcat(*this, rhs);
+  }
+    
+  /// Forms a 4-by-1 matrix by vertically concatenating an Element with a 3-by-1 matrix
+  CUTLASS_HOST_DEVICE
+  static Matrix vcat(Element upper, Matrix<Element, 3, 1> const & lower) {
+    return Matrix(
+      upper
+      , lower.at(0, 0)
+      , lower.at(1, 0)
+      , lower.at(2, 0));
+  }
+  
+  /// Forms a 4-by-1 matrix by vertically concatenating a 2-by-1 matrix with a 2-by-1 matrix
+  CUTLASS_HOST_DEVICE
+  static Matrix vcat(Matrix<Element, 2, 1> const & upper, Matrix<Element, 2, 1> const & lower) {
+    return Matrix(
+      upper.at(0, 0)
+      , upper.at(1, 0)
+      , lower.at(0, 0)
+      , lower.at(1, 0));
+  }
+  
+  /// Forms a 4-by-1 matrix by vertically concatenating a 3-by-1 matrix with an Element
+  CUTLASS_HOST_DEVICE
+  static Matrix vcat(Matrix<Element, 3, 1> const & upper, Element lower) {
+    return Matrix(
+      upper.at(0, 0)
+      , upper.at(1, 0)
+      , upper.at(2, 0)
+      , lower);
+  }
+  
+  /// Elementwise add operator (4-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix add(Matrix const &rhs) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] + rhs.data[0];
+
+    result.data[1] = data[1] + rhs.data[1];
+
+    result.data[2] = data[2] + rhs.data[2];
+
+    result.data[3] = data[3] + rhs.data[3];
+
+    return result;
+  }
+      
+  /// Elementwise add operator (4-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix operator +(Matrix const &rhs) const {
+    return add(rhs);
+  }
+
+  /// Elementwise add operator (4-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator +=(Matrix const &rhs) {
+    
+    data[0] += rhs.data[0];
+
+    data[1] += rhs.data[1];
+
+    data[2] += rhs.data[2];
+
+    data[3] += rhs.data[3];
+
+    return *this;
+  }
+        
+  /// Elementwise subtract operator (4-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix subtract(Matrix const &rhs) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] - rhs.data[0];
+
+    result.data[1] = data[1] - rhs.data[1];
+
+    result.data[2] = data[2] - rhs.data[2];
+
+    result.data[3] = data[3] - rhs.data[3];
+
+    return result;
+  }
+      
+  /// Elementwise subtract operator (4-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix operator -(Matrix const &rhs) const {
+    return subtract(rhs);
+  }
+
+  /// Elementwise subtract operator (4-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator -=(Matrix const &rhs) {
+    
+    data[0] -= rhs.data[0];
+
+    data[1] -= rhs.data[1];
+
+    data[2] -= rhs.data[2];
+
+    data[3] -= rhs.data[3];
+
+    return *this;
+  }
+        
+  /// Elementwise multiply operator (4-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix multiply(Matrix const &rhs) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] * rhs.data[0];
+
+    result.data[1] = data[1] * rhs.data[1];
+
+    result.data[2] = data[2] * rhs.data[2];
+
+    result.data[3] = data[3] * rhs.data[3];
+
+    return result;
+  }
+      
+  /// Scalar multiply operator (4-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix multiply(Element const &s) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] * s;
+
+    result.data[1] = data[1] * s;
+
+    result.data[2] = data[2] * s;
+
+    result.data[3] = data[3] * s;
+
+    return result;
+  }
+
+  /// Scalar multiply operator (4-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix operator *(Element const &s) const {
+    return multiply(s);
+  }
+
+  /// Scalar multiply operator (4-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator *=(Element const &s) {
+    
+    data[0] *= s;
+
+    data[1] *= s;
+
+    data[2] *= s;
+
+    data[3] *= s;
+
+    return *this;
+  }
+        
+  /// Elementwise divide operator (4-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix divide(Matrix const &rhs) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] / rhs.data[0];
+
+    result.data[1] = data[1] / rhs.data[1];
+
+    result.data[2] = data[2] / rhs.data[2];
+
+    result.data[3] = data[3] / rhs.data[3];
+
+    return result;
+  }
+      
+  /// Scalar divide operator (4-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix divide(Element const &s) const {
+
+    Matrix result;
+    
+    result.data[0] = data[0] / s;
+
+    result.data[1] = data[1] / s;
+
+    result.data[2] = data[2] / s;
+
+    result.data[3] = data[3] / s;
+
+    return result;
+  }
+
+  /// Scalar divide operator (4-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix operator /(Element const &s) const {
+    return divide(s);
+  }
+
+  /// Scalar divide operator (4-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator /=(Element const &s) {
+    
+    data[0] /= s;
+
+    data[1] /= s;
+
+    data[2] /= s;
+
+    data[3] /= s;
+
+    return *this;
+  }
+        
+  /// Elementwise divide operator (4-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix operator /(Matrix const &rhs) const {
+    return divide(rhs);
+  }
+
+  /// Elementwise divide operator (4-by-1)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator /=(Matrix const &rhs) {
+    
+    data[0] /= rhs.data[0];
+
+    data[1] /= rhs.data[1];
+
+    data[2] /= rhs.data[2];
+
+    data[3] /= rhs.data[3];
+
+    return *this;
+  }
+        
+  /// Returns a matrix whose elements are the negation of this matrix's elements
+  CUTLASS_HOST_DEVICE
+  Matrix operator-() const {
+    Matrix m;
+    // Negate this matrix's elements; m is zero-constructed, so negating m's own elements would always yield zero.
+    m.data[0] = -data[0];
+    m.data[1] = -data[1];
+    m.data[2] = -data[2];
+    m.data[3] = -data[3];
+
+    return m;
+  }
+  
+  /// Matrix product of size 4-by-1-by-1
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 1> product(
+    Matrix<Element, 1, 1> const &rhs,
+    Matrix<Element, 4, 1> accum = Matrix<Element, 4, 1>()
+  ) const {
+    
+    // k=0
+    accum.data[0] += data[0] * rhs.data[0];
+    accum.data[1] += data[1] * rhs.data[0];
+    accum.data[2] += data[2] * rhs.data[0];
+    accum.data[3] += data[3] * rhs.data[0];
+
+    return accum;
+  }
+
+  /// Matrix product of size 4-by-1-by-1
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 1> operator*(Matrix<Element, 1, 1> const &rhs) const {
+    return product(rhs);
+  }
+  
+  /// Matrix product of size 4-by-1-by-1
+  CUTLASS_HOST_DEVICE
+  Matrix & operator*=(Matrix<Element, 1, 1> const &rhs) {
+    *this = product(rhs);
+    return *this;
+  }
+    
+  /// Matrix product of size 4-by-2-by-1
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 2> product(
+    Matrix<Element, 1, 2> const &rhs,
+    Matrix<Element, 4, 2> accum = Matrix<Element, 4, 2>()
+  ) const {
+    
+    // k=0
+    accum.data[0] += data[0] * rhs.data[0];
+    accum.data[1] += data[0] * rhs.data[1];
+    accum.data[2] += data[1] * rhs.data[0];
+    accum.data[3] += data[1] * rhs.data[1];
+    accum.data[4] += data[2] * rhs.data[0];
+    accum.data[5] += data[2] * rhs.data[1];
+    accum.data[6] += data[3] * rhs.data[0];
+    accum.data[7] += data[3] * rhs.data[1];
+
+    return accum;
+  }
+
+  /// Matrix product of size 4-by-2-by-1
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 2> operator*(Matrix<Element, 1, 2> const &rhs) const {
+    return product(rhs);
+  }
+  
+  /// Matrix product of size 4-by-3-by-1
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 3> product(
+    Matrix<Element, 1, 3> const &rhs,
+    Matrix<Element, 4, 3> accum = Matrix<Element, 4, 3>()
+  ) const {
+    
+    // k=0
+    accum.data[0] += data[0] * rhs.data[0];
+    accum.data[1] += data[0] * rhs.data[1];
+    accum.data[2] += data[0] * rhs.data[2];
+    accum.data[3] += data[1] * rhs.data[0];
+    accum.data[4] += data[1] * rhs.data[1];
+    accum.data[5] += data[1] * rhs.data[2];
+    accum.data[6] += data[2] * rhs.data[0];
+    accum.data[7] += data[2] * rhs.data[1];
+    accum.data[8] += data[2] * rhs.data[2];
+    accum.data[9] += data[3] * rhs.data[0];
+    accum.data[10] += data[3] * rhs.data[1];
+    accum.data[11] += data[3] * rhs.data[2];
+
+    return accum;
+  }
+
+  /// Matrix product of size 4-by-3-by-1
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 3> operator*(Matrix<Element, 1, 3> const &rhs) const {
+    return product(rhs);
+  }
+  
+  /// Matrix product of size 4-by-4-by-1
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 4> product(
+    Matrix<Element, 1, 4> const &rhs,
+    Matrix<Element, 4, 4> accum = Matrix<Element, 4, 4>()
+  ) const {
+    
+    // k=0
+    accum.data[0] += data[0] * rhs.data[0];
+    accum.data[1] += data[0] * rhs.data[1];
+    accum.data[2] += data[0] * rhs.data[2];
+    accum.data[3] += data[0] * rhs.data[3];
+    accum.data[4] += data[1] * rhs.data[0];
+    accum.data[5] += data[1] * rhs.data[1];
+    accum.data[6] += data[1] * rhs.data[2];
+    accum.data[7] += data[1] * rhs.data[3];
+    accum.data[8] += data[2] * rhs.data[0];
+    accum.data[9] += data[2] * rhs.data[1];
+    accum.data[10] += data[2] * rhs.data[2];
+    accum.data[11] += data[2] * rhs.data[3];
+    accum.data[12] += data[3] * rhs.data[0];
+    accum.data[13] += data[3] * rhs.data[1];
+    accum.data[14] += data[3] * rhs.data[2];
+    accum.data[15] += data[3] * rhs.data[3];
+
+    return accum;
+  }
+
+  /// Matrix product of size 4-by-4-by-1
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 4> operator*(Matrix<Element, 1, 4> const &rhs) const {
+    return product(rhs);
+  }
+  
+  /// Adds the dot product of this 4-by-1 vector and the 4-by-1 vector rhs to accum and returns the total
+  CUTLASS_HOST_DEVICE
+  Element dot(Matrix<Element, 4, 1> const &rhs, Element accum = Element()) const {
+
+    accum += this->data[0] * rhs.data[0];
+    accum += this->data[1] * rhs.data[1];
+    accum += this->data[2] * rhs.data[2];
+    accum += this->data[3] * rhs.data[3];
+    return accum;
+  }
+
+  /// Dot product of vectors with extent 4
+  CUTLASS_HOST_DEVICE
+  Element dot(Matrix<Element, 1, 4> const &rhs, Element accum = Element()) const {
+    
+    accum += data[0] * rhs.data[0];
+    accum += data[1] * rhs.data[1];
+    accum += data[2] * rhs.data[2];
+    accum += data[3] * rhs.data[3];
+    return accum;
+  }
+  
+  /// Adds the four elements, in index order, to accum and returns the total.
+  CUTLASS_HOST_DEVICE
+  Element sum(Element accum = Element()) const {
+
+    for (int k = 0; k < 4; ++k) {
+      accum += data[k];
+    }
+
+    return accum;
+  }
+
+  /// Adds the square of each of the four elements to accum and returns the total.
+  /// Note that no square root is taken here; see magnitude().
+  CUTLASS_HOST_DEVICE
+  Element norm(Element accum = Element()) const {
+
+    for (int k = 0; k < 4; ++k) {
+      accum += data[k] * data[k];
+    }
+
+    return accum;
+  }
+
+  /// Returns fast_sqrt() applied to norm().
+  CUTLASS_HOST_DEVICE
+  Element magnitude() const {
+    Element const squared = norm();
+    return fast_sqrt(squared);
+  }
+
+  /// Adds the diagonal elements to accum and returns the total.
+  CUTLASS_HOST_DEVICE
+  Element trace(Element accum = Element()) const {
+
+    // Only data[0] is accumulated: a single-column matrix has one diagonal entry.
+    accum += data[0];
+
+    return accum;
+  }
+    
+};
+
+/// Shorthand for the 4-by-1 specialization of Matrix.
+template <typename Element>
+using Matrix4x1 = Matrix<Element, 4, 1>;
+
+
+/// Free function that builds a Matrix4x1, deducing Element from the arguments.
+/// The arguments are forwarded, in order, to the four-element Matrix4x1 constructor.
+template <typename Element>
+CUTLASS_HOST_DEVICE Matrix4x1<Element> make_Matrix4x1(
+    Element _0_0,
+    Element _1_0,
+    Element _2_0,
+    Element _3_0
+) {
+  Matrix4x1<Element> column(_0_0, _1_0, _2_0, _3_0);
+  return column;
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// 4-by-2 matrix template class definition
+template <typename Element_>
+struct Matrix<Element_, 4, 2> {
+
+  //
+  // Type definitions
+  //
+
+  /// Element data type
+  using Element = Element_;
+
+  /// Number of rows in matrix
+  static int const kRows = 4;
+
+  /// Number of columns in matrix
+  static int const kColumns = 2;
+
+  /// Layout of matrix in underlying array
+  using Layout = layout::RowMajor;
+
+  /// Number of elements in matrix
+  static int const kCount = 8;
+
+  //
+  // Data members
+  //
+
+  /// Elements of the matrix in row-major layout
+  Array<Element, kCount> data;
+
+  //
+  // Methods
+  //
+
+  /// Default constructor: clears the underlying element array.
+  CUTLASS_HOST_DEVICE
+  Matrix() {
+    data.clear();
+  }
+
+  /// Copy constructor for a 4-by-2 matrix: copies rhs's element array.
+  CUTLASS_HOST_DEVICE
+  Matrix(Matrix const &rhs) : data(rhs.data) {
+  }
+
+  /// Constructs a 4-by-2 matrix from scalar elements.
+  /// Argument _r_c is stored as element (r, c), i.e. at offset r * 2 + c.
+  CUTLASS_HOST_DEVICE
+  Matrix(
+    Element _0_0, Element _0_1,
+    Element _1_0, Element _1_1,
+    Element _2_0, Element _2_1,
+    Element _3_0, Element _3_1
+  ) {
+
+    // row 0
+    data[0] = _0_0;
+    data[1] = _0_1;
+    // row 1
+    data[2] = _1_0;
+    data[3] = _1_1;
+    // row 2
+    data[4] = _2_0;
+    data[5] = _2_1;
+    // row 3
+    data[6] = _3_0;
+    data[7] = _3_1;
+  }
+
+  /// Constructs a 4-by-2 matrix from four 1-by-2 row vectors.
+  /// row_r supplies elements r * 2 and r * 2 + 1 of the underlying array.
+  CUTLASS_HOST_DEVICE
+  Matrix(
+    Matrix<Element, 1, 2> const &row_0,
+    Matrix<Element, 1, 2> const &row_1,
+    Matrix<Element, 1, 2> const &row_2,
+    Matrix<Element, 1, 2> const &row_3
+  ) {
+    for (int c = 0; c < 2; ++c) {
+      data[0 * 2 + c] = row_0.data[c];
+      data[1 * 2 + c] = row_1.data[c];
+      data[2 * 2 + c] = row_2.data[c];
+      data[3 * 2 + c] = row_3.data[c];
+    }
+  }
+    
+  /// Static method to construct a 4-by-2 matrix from column vectors.
+  ///
+  /// Each column of a 4-by-2 matrix holds four entries, so the arguments are
+  /// 4-by-1 vectors. They were previously declared as 2-by-1 vectors, which made
+  /// the reads of data[2] and data[3] run past the end of a two-element array.
+  ///
+  /// \param column_0  supplies elements (r, 0) for r = 0..3
+  /// \param column_1  supplies elements (r, 1) for r = 0..3
+  /// \return the matrix whose row-major offset r * kColumns + c holds column_c.data[r]
+  CUTLASS_HOST_DEVICE
+  static Matrix from_columns(
+    Matrix<Element, 4, 1> const &column_0,
+    Matrix<Element, 4, 1> const &column_1
+  ) {
+    Matrix result;
+
+    for (int r = 0; r < kRows; ++r) {
+      result.data[r * kColumns + 0] = column_0.data[r];
+      result.data[r * kColumns + 1] = column_1.data[r];
+    }
+    return result;
+  }
+    
+  /// Returns a matrix with every one of its kCount elements set to s.
+  CUTLASS_HOST_DEVICE
+  static Matrix uniform(Element s) {
+    Matrix m;
+
+    for (int k = 0; k < kCount; ++k) {
+      m.data[k] = s;
+    }
+
+    return m;
+  }
+
+  /// Returns uniform(Element(1)).
+  CUTLASS_HOST_DEVICE
+  static Matrix ones() {
+    Element const one = Element(1);
+    return uniform(one);
+  }
+
+  /// Returns a default-constructed matrix (the default constructor clears the data).
+  CUTLASS_HOST_DEVICE
+  static Matrix zero() {
+    return Matrix();
+  }
+  
+  /// Constructs a matrix from elements along its diagonal (2-by-1 input).
+  ///
+  /// A 4-by-2 matrix has two diagonal entries, (0, 0) and (1, 1), stored at
+  /// row-major offsets 0 and 3. All other elements keep the value given by the
+  /// default constructor. The previous offsets (5, 10, 15) belonged to a 4-by-4
+  /// layout and indexed past the end of both this matrix and diag.
+  CUTLASS_HOST_DEVICE
+  static Matrix from_diagonal(Matrix<Element, 2, 1> const &diag) {
+    Matrix m;
+
+    // Offset of (k, k) is k * kColumns + k.
+    m.data[0] = diag.data[0];
+    m.data[3] = diag.data[1];
+
+    return m;
+  }
+
+  /// Constructs a matrix from elements along its diagonal (1-by-2 input).
+  ///
+  /// Same as the 2-by-1 overload: diag.data[0] goes to offset 0 and
+  /// diag.data[1] goes to offset 3; other elements keep their default value.
+  CUTLASS_HOST_DEVICE
+  static Matrix from_diagonal(Matrix<Element, 1, 2> const &diag) {
+    Matrix m;
+
+    // Offset of (k, k) is k * kColumns + k.
+    m.data[0] = diag.data[0];
+    m.data[3] = diag.data[1];
+
+    return m;
+  }
+
+  /// Gets the diagonal elements as a 2-by-1 vector.
+  ///
+  /// The diagonal of a 4-by-2 matrix is (0, 0) and (1, 1), i.e. row-major offsets
+  /// 0 and 3. The previous code used 4-by-4 offsets (5, 10, 15) and wrote four
+  /// entries into a two-element result, indexing out of bounds on both sides.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 1> diagonal() const {
+    Matrix<Element, 2, 1> diag;
+
+    diag.data[0] = data[0];
+    diag.data[1] = data[3];
+
+    return diag;
+  }
+    
+  /// Returns the 2-by-4 transpose of this matrix.
+  /// Source offset r * 2 + c is copied to destination offset c * 4 + r.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 4> transpose() const {
+    Matrix<Element, 2, 4> mt;
+
+    for (int r = 0; r < 4; ++r) {
+      for (int c = 0; c < 2; ++c) {
+        mt.data[c * 4 + r] = data[r * 2 + c];
+      }
+    }
+
+    return mt;
+  }
+    
+  /// Accesses an element by coordinate (read-only).
+  ///
+  /// \param i  row index
+  /// \param j  column index
+  /// \return a copy of the element at row-major offset i * kColumns + j
+  ///
+  /// The row stride is the column count (2). The previous stride of 4 returned
+  /// the wrong element for every i > 0 and indexed out of bounds for i >= 2.
+  CUTLASS_HOST_DEVICE
+  Element at(int i, int j) const {
+    return data[i * kColumns + j];
+  }
+
+  /// Accesses an element by coordinate (mutable).
+  ///
+  /// \param i  row index
+  /// \param j  column index
+  /// \return a reference to the element at row-major offset i * kColumns + j
+  CUTLASS_HOST_DEVICE
+  Element & at(int i, int j) {
+    return data[i * kColumns + j];
+  }
+
+  /// Reads an element by coordinate: coord[0] and coord[1] are passed to at(i, j).
+  CUTLASS_HOST_DEVICE
+  Element at(Coord<2> const &coord) const {
+    int const i = coord[0];
+    int const j = coord[1];
+    return at(i, j);
+  }
+
+  /// Mutable access by coordinate: coord[0] and coord[1] are passed to at(i, j).
+  CUTLASS_HOST_DEVICE
+  Element & at(Coord<2> const &coord) {
+    int const i = coord[0];
+    int const j = coord[1];
+    return at(i, j);
+  }
+
+  /// Mutable access by linear offset into the underlying array.
+  CUTLASS_HOST_DEVICE
+  Element &at(int offset) {
+    return data[offset];
+  }
+
+  /// Reads an element by linear offset into the underlying array.
+  CUTLASS_HOST_DEVICE
+  Element at(int offset) const {
+    return data[offset];
+  }
+
+  /// Subscript by coordinate (read-only); same result as at(coord).
+  CUTLASS_HOST_DEVICE
+  Element operator[](Coord<2> const &coord) const {
+    return at(coord);
+  }
+
+  /// Subscript by coordinate (mutable); same result as at(coord).
+  CUTLASS_HOST_DEVICE
+  Element & operator[](Coord<2> const &coord) {
+    return at(coord);
+  }
+
+  /// Subscript by linear offset (mutable); same result as at(offset).
+  CUTLASS_HOST_DEVICE
+  Element & operator[](int offset) {
+    return at(offset);
+  }
+
+  /// Subscript by linear offset (read-only); same result as at(offset).
+  CUTLASS_HOST_DEVICE
+  Element operator[](int offset) const {
+    return at(offset);
+  }
+  
+  /// Copies out the 1-by-2 submatrix that starts at offset i * 2 + j.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 2> slice_1x2(int i = 0, int j = 0) const {
+    int const base = i * 2 + j;
+    Matrix<Element, 1, 2> m;
+
+    for (int c = 0; c < 2; ++c) {
+      m.data[c] = data[base + c];
+    }
+
+    return m;
+  }
+
+  /// Overwrites the 1-by-2 submatrix that starts at offset i * 2 + j with m.
+  /// Returns *this.
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_1x2(Matrix<Element, 1, 2> const &m, int i = 0, int j = 0) {
+    int const base = i * 2 + j;
+
+    for (int c = 0; c < 2; ++c) {
+      data[base + c] = m.data[c];
+    }
+
+    return *this;
+  }
+
+  /// Returns row i as a 1-by-2 matrix, via slice_1x2(i, 0).
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 2> row(int i) const {
+    Matrix<Element, 1, 2> r = slice_1x2(i, 0);
+    return r;
+  }
+
+  /// Overwrites row i with v, via set_slice_1x2(v, i, 0). Returns *this.
+  CUTLASS_HOST_DEVICE
+  Matrix &set_row(Matrix<Element, 1, 2> const &v, int i = 0) {
+    set_slice_1x2(v, i, 0);
+    return *this;
+  }
+    
+  /// Copies out the 2-by-1 submatrix that starts at offset i * 2 + j.
+  /// Consecutive rows of the slice are 2 elements apart in the underlying array.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 1> slice_2x1(int i = 0, int j = 0) const {
+    int const base = i * 2 + j;
+    Matrix<Element, 2, 1> m;
+
+    for (int r = 0; r < 2; ++r) {
+      m.data[r] = data[base + r * 2];
+    }
+
+    return m;
+  }
+
+  /// Overwrites the 2-by-1 submatrix that starts at offset i * 2 + j with m.
+  /// Returns *this.
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_2x1(Matrix<Element, 2, 1> const &m, int i = 0, int j = 0) {
+    int const base = i * 2 + j;
+
+    for (int r = 0; r < 2; ++r) {
+      data[base + r * 2] = m.data[r];
+    }
+
+    return *this;
+  }
+    
+  /// Copies out a 2-by-2 submatrix: the four consecutive elements starting at
+  /// offset i * 2 + j.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 2> slice_2x2(int i = 0, int j = 0) const {
+    int const base = i * 2 + j;
+    Matrix<Element, 2, 2> m;
+
+    for (int k = 0; k < 4; ++k) {
+      m.data[k] = data[base + k];
+    }
+
+    return m;
+  }
+
+  /// Overwrites the four consecutive elements starting at offset i * 2 + j
+  /// with the elements of m. Returns *this.
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_2x2(Matrix<Element, 2, 2> const &m, int i = 0, int j = 0) {
+    int const base = i * 2 + j;
+
+    for (int k = 0; k < 4; ++k) {
+      data[base + k] = m.data[k];
+    }
+
+    return *this;
+  }
+    
+  /// Copies out the 3-by-1 submatrix that starts at offset i * 2 + j.
+  /// Consecutive rows of the slice are 2 elements apart in the underlying array.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 1> slice_3x1(int i = 0, int j = 0) const {
+    int const base = i * 2 + j;
+    Matrix<Element, 3, 1> m;
+
+    for (int r = 0; r < 3; ++r) {
+      m.data[r] = data[base + r * 2];
+    }
+
+    return m;
+  }
+
+  /// Overwrites the 3-by-1 submatrix that starts at offset i * 2 + j with m.
+  /// Returns *this.
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_3x1(Matrix<Element, 3, 1> const &m, int i = 0, int j = 0) {
+    int const base = i * 2 + j;
+
+    for (int r = 0; r < 3; ++r) {
+      data[base + r * 2] = m.data[r];
+    }
+
+    return *this;
+  }
+    
+  /// Copies out a 3-by-2 submatrix: the six consecutive elements starting at
+  /// offset i * 2 + j.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 2> slice_3x2(int i = 0, int j = 0) const {
+    int const base = i * 2 + j;
+    Matrix<Element, 3, 2> m;
+
+    for (int k = 0; k < 6; ++k) {
+      m.data[k] = data[base + k];
+    }
+
+    return m;
+  }
+
+  /// Overwrites the six consecutive elements starting at offset i * 2 + j
+  /// with the elements of m. Returns *this.
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_3x2(Matrix<Element, 3, 2> const &m, int i = 0, int j = 0) {
+    int const base = i * 2 + j;
+
+    for (int k = 0; k < 6; ++k) {
+      data[base + k] = m.data[k];
+    }
+
+    return *this;
+  }
+    
+  /// Copies out the 4-by-1 submatrix that starts at offset i * 2 + j.
+  /// Consecutive rows of the slice are 2 elements apart in the underlying array.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 1> slice_4x1(int i = 0, int j = 0) const {
+    int const base = i * 2 + j;
+    Matrix<Element, 4, 1> m;
+
+    for (int r = 0; r < 4; ++r) {
+      m.data[r] = data[base + r * 2];
+    }
+
+    return m;
+  }
+
+  /// Overwrites the 4-by-1 submatrix that starts at offset i * 2 + j with m.
+  /// Returns *this.
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_4x1(Matrix<Element, 4, 1> const &m, int i = 0, int j = 0) {
+    int const base = i * 2 + j;
+
+    for (int r = 0; r < 4; ++r) {
+      data[base + r * 2] = m.data[r];
+    }
+
+    return *this;
+  }
+
+  /// Returns column j as a 4-by-1 matrix, via slice_4x1(0, j).
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 1> column(int j) const {
+    Matrix<Element, 4, 1> col = slice_4x1(0, j);
+    return col;
+  }
+
+  /// Overwrites column j with v, via set_slice_4x1(v, 0, j). Returns *this.
+  CUTLASS_HOST_DEVICE
+  Matrix &set_column(Matrix<Element, 4, 1> const &v, int j =0) {
+    set_slice_4x1(v, 0, j);
+    return *this;
+  }
+    
+  /// Copies out a 4-by-2 submatrix: the eight consecutive elements starting at
+  /// offset i * 2 + j.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 2> slice_4x2(int i = 0, int j = 0) const {
+    int const base = i * 2 + j;
+    Matrix<Element, 4, 2> m;
+
+    for (int k = 0; k < 8; ++k) {
+      m.data[k] = data[base + k];
+    }
+
+    return m;
+  }
+
+  /// Overwrites the eight consecutive elements starting at offset i * 2 + j
+  /// with the elements of m. Returns *this.
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_4x2(Matrix<Element, 4, 2> const &m, int i = 0, int j = 0) {
+    int const base = i * 2 + j;
+
+    for (int k = 0; k < 8; ++k) {
+      data[base + k] = m.data[k];
+    }
+
+    return *this;
+  }
+    
+  /// Forms a 4-by-2 matrix whose first column is lhs and whose second column is rhs.
+  /// Elements are read through lhs.at(r, 0) and rhs.at(r, 0).
+  CUTLASS_HOST_DEVICE
+  static Matrix hcat(Matrix<Element, 4, 1> const & lhs, Matrix<Element, 4, 1> const & rhs) {
+    Matrix joined;
+
+    for (int r = 0; r < 4; ++r) {
+      joined.data[r * 2 + 0] = lhs.at(r, 0);
+      joined.data[r * 2 + 1] = rhs.at(r, 0);
+    }
+
+    return joined;
+  }
+
+  /// Concatenates this matrix with a 4-by-1 matrix to form a 4-by-3 matrix.
+  /// Delegates to the static Matrix<Element, 4, 3>::hcat.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 3> hcat(Matrix<Element, 4, 1> const & rhs) const {
+    Matrix const &self = *this;
+    return Matrix<Element, 4, 3>::hcat(self, rhs);
+  }
+
+  /// Concatenates this matrix with a 4-by-2 matrix to form a 4-by-4 matrix.
+  /// Delegates to the static Matrix<Element, 4, 4>::hcat.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 4> hcat(Matrix<Element, 4, 2> const & rhs) const {
+    Matrix const &self = *this;
+    return Matrix<Element, 4, 4>::hcat(self, rhs);
+  }
+    
+  /// Stacks a 1-by-2 matrix (row 0) on top of a 3-by-2 matrix (rows 1..3).
+  /// Source elements are read through at(r, c).
+  CUTLASS_HOST_DEVICE
+  static Matrix vcat(Matrix<Element, 1, 2> const & upper, Matrix<Element, 3, 2> const & lower) {
+    Matrix stacked;
+
+    for (int c = 0; c < 2; ++c) {
+      stacked.data[c] = upper.at(0, c);
+    }
+    for (int r = 0; r < 3; ++r) {
+      for (int c = 0; c < 2; ++c) {
+        stacked.data[(r + 1) * 2 + c] = lower.at(r, c);
+      }
+    }
+
+    return stacked;
+  }
+
+  /// Stacks a 2-by-2 matrix (rows 0..1) on top of another 2-by-2 matrix (rows 2..3).
+  /// Source elements are read through at(r, c).
+  CUTLASS_HOST_DEVICE
+  static Matrix vcat(Matrix<Element, 2, 2> const & upper, Matrix<Element, 2, 2> const & lower) {
+    Matrix stacked;
+
+    for (int r = 0; r < 2; ++r) {
+      for (int c = 0; c < 2; ++c) {
+        stacked.data[r * 2 + c] = upper.at(r, c);
+        stacked.data[(r + 2) * 2 + c] = lower.at(r, c);
+      }
+    }
+
+    return stacked;
+  }
+
+  /// Stacks a 3-by-2 matrix (rows 0..2) on top of a 1-by-2 matrix (row 3).
+  /// Source elements are read through at(r, c).
+  CUTLASS_HOST_DEVICE
+  static Matrix vcat(Matrix<Element, 3, 2> const & upper, Matrix<Element, 1, 2> const & lower) {
+    Matrix stacked;
+
+    for (int r = 0; r < 3; ++r) {
+      for (int c = 0; c < 2; ++c) {
+        stacked.data[r * 2 + c] = upper.at(r, c);
+      }
+    }
+    for (int c = 0; c < 2; ++c) {
+      stacked.data[3 * 2 + c] = lower.at(0, c);
+    }
+
+    return stacked;
+  }
+  
+  /// Forms a 4-by-2 matrix from four components laid out as [A B; C D]:
+  /// scalars A, B fill row 0 and the 3-by-1 vectors C, D fill rows 1..3.
+  CUTLASS_HOST_DEVICE
+  static Matrix block(
+    Element                         A, Element                         B,
+    Matrix<Element, 3, 1> const & C, Matrix<Element, 3, 1> const & D) {
+    Matrix assembled;
+
+    assembled.data[0] = A;
+    assembled.data[1] = B;
+    for (int r = 0; r < 3; ++r) {
+      assembled.data[(r + 1) * 2 + 0] = C.at(r, 0);
+      assembled.data[(r + 1) * 2 + 1] = D.at(r, 0);
+    }
+
+    return assembled;
+  }
+
+  /// Forms a 4-by-2 matrix from four components laid out as [A B; C D]:
+  /// the 2-by-1 vectors A, B fill rows 0..1 and C, D fill rows 2..3.
+  CUTLASS_HOST_DEVICE
+  static Matrix block(
+    Matrix<Element, 2, 1> const & A, Matrix<Element, 2, 1> const & B,
+    Matrix<Element, 2, 1> const & C, Matrix<Element, 2, 1> const & D) {
+    Matrix assembled;
+
+    for (int r = 0; r < 2; ++r) {
+      assembled.data[r * 2 + 0] = A.at(r, 0);
+      assembled.data[r * 2 + 1] = B.at(r, 0);
+      assembled.data[(r + 2) * 2 + 0] = C.at(r, 0);
+      assembled.data[(r + 2) * 2 + 1] = D.at(r, 0);
+    }
+
+    return assembled;
+  }
+
+  /// Forms a 4-by-2 matrix from four components laid out as [A B; C D]:
+  /// the 3-by-1 vectors A, B fill rows 0..2 and scalars C, D fill row 3.
+  CUTLASS_HOST_DEVICE
+  static Matrix block(
+    Matrix<Element, 3, 1> const & A, Matrix<Element, 3, 1> const & B,
+    Element                         C, Element                         D) {
+    Matrix assembled;
+
+    for (int r = 0; r < 3; ++r) {
+      assembled.data[r * 2 + 0] = A.at(r, 0);
+      assembled.data[r * 2 + 1] = B.at(r, 0);
+    }
+    assembled.data[6] = C;
+    assembled.data[7] = D;
+
+    return assembled;
+  }
+  
+  /// Returns a new 4-by-2 matrix holding data[k] + rhs.data[k] for every element.
+  CUTLASS_HOST_DEVICE
+  Matrix add(Matrix const &rhs) const {
+
+    Matrix result;
+
+    for (int k = 0; k < kCount; ++k) {
+      result.data[k] = data[k] + rhs.data[k];
+    }
+
+    return result;
+  }
+
+  /// Elementwise addition (4-by-2); forwards to add(rhs).
+  CUTLASS_HOST_DEVICE
+  Matrix operator +(Matrix const &rhs) const {
+    Matrix result = add(rhs);
+    return result;
+  }
+
+  /// In-place elementwise addition (4-by-2). Returns *this.
+  CUTLASS_HOST_DEVICE
+  Matrix & operator +=(Matrix const &rhs) {
+
+    for (int k = 0; k < kCount; ++k) {
+      data[k] += rhs.data[k];
+    }
+
+    return *this;
+  }
+        
+  /// Returns a new 4-by-2 matrix holding data[k] - rhs.data[k] for every element.
+  CUTLASS_HOST_DEVICE
+  Matrix subtract(Matrix const &rhs) const {
+
+    Matrix result;
+
+    for (int k = 0; k < kCount; ++k) {
+      result.data[k] = data[k] - rhs.data[k];
+    }
+
+    return result;
+  }
+
+  /// Elementwise subtraction (4-by-2); forwards to subtract(rhs).
+  CUTLASS_HOST_DEVICE
+  Matrix operator -(Matrix const &rhs) const {
+    Matrix result = subtract(rhs);
+    return result;
+  }
+
+  /// In-place elementwise subtraction (4-by-2). Returns *this.
+  CUTLASS_HOST_DEVICE
+  Matrix & operator -=(Matrix const &rhs) {
+
+    for (int k = 0; k < kCount; ++k) {
+      data[k] -= rhs.data[k];
+    }
+
+    return *this;
+  }
+        
+  /// Elementwise product: returns a new 4-by-2 matrix holding
+  /// data[k] * rhs.data[k] for every element. This is not a matrix product.
+  CUTLASS_HOST_DEVICE
+  Matrix multiply(Matrix const &rhs) const {
+
+    Matrix result;
+
+    for (int k = 0; k < kCount; ++k) {
+      result.data[k] = data[k] * rhs.data[k];
+    }
+
+    return result;
+  }
+
+  /// Scalar product: returns a new 4-by-2 matrix holding data[k] * s for every element.
+  CUTLASS_HOST_DEVICE
+  Matrix multiply(Element const &s) const {
+
+    Matrix result;
+
+    for (int k = 0; k < kCount; ++k) {
+      result.data[k] = data[k] * s;
+    }
+
+    return result;
+  }
+
+  /// Scalar multiplication (4-by-2); forwards to multiply(s).
+  CUTLASS_HOST_DEVICE
+  Matrix operator *(Element const &s) const {
+    Matrix result = multiply(s);
+    return result;
+  }
+
+  /// In-place scalar multiplication (4-by-2). Returns *this.
+  CUTLASS_HOST_DEVICE
+  Matrix & operator *=(Element const &s) {
+
+    for (int k = 0; k < kCount; ++k) {
+      data[k] *= s;
+    }
+
+    return *this;
+  }
+        
+  /// Elementwise quotient: returns a new 4-by-2 matrix holding
+  /// data[k] / rhs.data[k] for every element.
+  CUTLASS_HOST_DEVICE
+  Matrix divide(Matrix const &rhs) const {
+
+    Matrix result;
+
+    for (int k = 0; k < kCount; ++k) {
+      result.data[k] = data[k] / rhs.data[k];
+    }
+
+    return result;
+  }
+
+  /// Scalar quotient: returns a new 4-by-2 matrix holding data[k] / s for every element.
+  CUTLASS_HOST_DEVICE
+  Matrix divide(Element const &s) const {
+
+    Matrix result;
+
+    for (int k = 0; k < kCount; ++k) {
+      result.data[k] = data[k] / s;
+    }
+
+    return result;
+  }
+
+  /// Scalar division (4-by-2); forwards to divide(s).
+  CUTLASS_HOST_DEVICE
+  Matrix operator /(Element const &s) const {
+    Matrix result = divide(s);
+    return result;
+  }
+
+  /// In-place scalar division (4-by-2). Returns *this.
+  CUTLASS_HOST_DEVICE
+  Matrix & operator /=(Element const &s) {
+
+    for (int k = 0; k < kCount; ++k) {
+      data[k] /= s;
+    }
+
+    return *this;
+  }
+
+  /// Elementwise division (4-by-2); forwards to divide(rhs).
+  CUTLASS_HOST_DEVICE
+  Matrix operator /(Matrix const &rhs) const {
+    Matrix result = divide(rhs);
+    return result;
+  }
+
+  /// In-place elementwise division (4-by-2). Returns *this.
+  CUTLASS_HOST_DEVICE
+  Matrix & operator /=(Matrix const &rhs) {
+
+    for (int k = 0; k < kCount; ++k) {
+      data[k] /= rhs.data[k];
+    }
+
+    return *this;
+  }
+        
+  /// Negates each element of the matrix.
+  ///
+  /// \return a new 4-by-2 matrix whose element k is -data[k]; *this is unchanged.
+  CUTLASS_HOST_DEVICE
+  Matrix operator-() const {
+    Matrix m;
+
+    // Read from this matrix's data. The previous code negated m's own freshly
+    // cleared elements, so the operand was ignored entirely.
+    for (int k = 0; k < kCount; ++k) {
+      m.data[k] = -data[k];
+    }
+
+    return m;
+  }
+  
+  /// Matrix product of this 4-by-2 matrix with a 2-by-1 vector (4-by-1 result).
+  ///
+  /// For k = 0 then k = 1, adds data[r * 2 + k] * rhs.data[k] to accum.data[r]
+  /// for every row r, and returns accum. accum defaults to a default-constructed
+  /// 4-by-1 matrix.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 1> product(
+    Matrix<Element, 2, 1> const &rhs,
+    Matrix<Element, 4, 1> accum = Matrix<Element, 4, 1>()
+  ) const {
+
+    // k is the outer loop so each output accumulates its terms in the same
+    // order as a k-major unrolled expansion.
+    for (int k = 0; k < 2; ++k) {
+      for (int r = 0; r < 4; ++r) {
+        accum.data[r] += data[r * 2 + k] * rhs.data[k];
+      }
+    }
+
+    return accum;
+  }
+
+  /// Operator form of the 4-by-2 times 2-by-1 product; forwards to product(rhs).
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 1> operator*(Matrix<Element, 2, 1> const &rhs) const {
+    Matrix<Element, 4, 1> result = product(rhs);
+    return result;
+  }
+  
+  /// Matrix product of this 4-by-2 matrix with a 2-by-2 matrix (4-by-2 result).
+  ///
+  /// For k = 0 then k = 1, adds data[r * 2 + k] * rhs.data[k * 2 + c] to
+  /// accum.data[r * 2 + c], and returns accum. accum defaults to a
+  /// default-constructed 4-by-2 matrix.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 2> product(
+    Matrix<Element, 2, 2> const &rhs,
+    Matrix<Element, 4, 2> accum = Matrix<Element, 4, 2>()
+  ) const {
+
+    // k is the outer loop so each output accumulates its terms in k order.
+    for (int k = 0; k < 2; ++k) {
+      for (int r = 0; r < 4; ++r) {
+        for (int c = 0; c < 2; ++c) {
+          accum.data[r * 2 + c] += data[r * 2 + k] * rhs.data[k * 2 + c];
+        }
+      }
+    }
+
+    return accum;
+  }
+
+  /// Operator form of the 4-by-2 times 2-by-2 product; forwards to product(rhs).
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 2> operator*(Matrix<Element, 2, 2> const &rhs) const {
+    Matrix<Element, 4, 2> result = product(rhs);
+    return result;
+  }
+
+  /// Replaces this matrix with product(rhs). Returns *this.
+  CUTLASS_HOST_DEVICE
+  Matrix & operator*=(Matrix<Element, 2, 2> const &rhs) {
+    *this = product(rhs);
+    return *this;
+  }
+    
+  /// Matrix product of this 4-by-2 matrix with a 2-by-3 matrix (4-by-3 result).
+  ///
+  /// For k = 0 then k = 1, adds data[r * 2 + k] * rhs.data[k * 3 + c] to
+  /// accum.data[r * 3 + c], and returns accum. accum defaults to a
+  /// default-constructed 4-by-3 matrix.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 3> product(
+    Matrix<Element, 2, 3> const &rhs,
+    Matrix<Element, 4, 3> accum = Matrix<Element, 4, 3>()
+  ) const {
+
+    // k is the outer loop so each output accumulates its terms in k order.
+    for (int k = 0; k < 2; ++k) {
+      for (int r = 0; r < 4; ++r) {
+        for (int c = 0; c < 3; ++c) {
+          accum.data[r * 3 + c] += data[r * 2 + k] * rhs.data[k * 3 + c];
+        }
+      }
+    }
+
+    return accum;
+  }
+
+  /// Operator form of the 4-by-2 times 2-by-3 product; forwards to product(rhs).
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 3> operator*(Matrix<Element, 2, 3> const &rhs) const {
+    Matrix<Element, 4, 3> result = product(rhs);
+    return result;
+  }
+  
+  /// Matrix product of this 4-by-2 matrix with a 2-by-4 matrix (4-by-4 result).
+  ///
+  /// For k = 0 then k = 1, adds data[r * 2 + k] * rhs.data[k * 4 + c] to
+  /// accum.data[r * 4 + c], and returns accum. accum defaults to a
+  /// default-constructed 4-by-4 matrix.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 4> product(
+    Matrix<Element, 2, 4> const &rhs,
+    Matrix<Element, 4, 4> accum = Matrix<Element, 4, 4>()
+  ) const {
+
+    // k is the outer loop so each output accumulates its terms in k order.
+    for (int k = 0; k < 2; ++k) {
+      for (int r = 0; r < 4; ++r) {
+        for (int c = 0; c < 4; ++c) {
+          accum.data[r * 4 + c] += data[r * 2 + k] * rhs.data[k * 4 + c];
+        }
+      }
+    }
+
+    return accum;
+  }
+
+  /// Operator form of the 4-by-2 times 2-by-4 product; forwards to product(rhs).
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 4> operator*(Matrix<Element, 2, 4> const &rhs) const {
+    Matrix<Element, 4, 4> result = product(rhs);
+    return result;
+  }
+  
+  /// Adds all eight elements, in index order, to accum and returns the total.
+  CUTLASS_HOST_DEVICE
+  Element sum(Element accum = Element()) const {
+
+    for (int k = 0; k < kCount; ++k) {
+      accum += data[k];
+    }
+
+    return accum;
+  }
+
+  /// Adds the square of each element, in index order, to accum and returns the total.
+  /// Note that no square root is taken here; see magnitude().
+  CUTLASS_HOST_DEVICE
+  Element norm(Element accum = Element()) const {
+
+    for (int k = 0; k < kCount; ++k) {
+      accum += data[k] * data[k];
+    }
+
+    return accum;
+  }
+
+  /// Returns fast_sqrt() applied to norm().
+  CUTLASS_HOST_DEVICE
+  Element magnitude() const {
+    Element const squared = norm();
+    return fast_sqrt(squared);
+  }
+
+  /// Adds the diagonal elements (offsets 0 and 3) to accum and returns the total.
+  CUTLASS_HOST_DEVICE
+  Element trace(Element accum = Element()) const {
+
+    // Element (d, d) sits at offset d * 2 + d.
+    for (int d = 0; d < 2; ++d) {
+      accum += data[d * 2 + d];
+    }
+
+    return accum;
+  }
+    
+};
+
+/// Shorthand for the 4-by-2 specialization of Matrix.
+template <typename Element>
+using Matrix4x2 = Matrix<Element, 4, 2>;
+
+
+/// Free function that builds a Matrix4x2, deducing Element from the arguments.
+/// Argument _r_c becomes element (r, c); the arguments are forwarded, in order,
+/// to the eight-element Matrix4x2 constructor.
+template <typename Element>
+CUTLASS_HOST_DEVICE Matrix4x2<Element> make_Matrix4x2(
+    Element _0_0, Element _0_1,
+    Element _1_0, Element _1_1,
+    Element _2_0, Element _2_1,
+    Element _3_0, Element _3_1
+) {
+  Matrix4x2<Element> built(
+    _0_0, _0_1,
+    _1_0, _1_1,
+    _2_0, _2_1,
+    _3_0, _3_1);
+  return built;
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// 4-by-3 matrix template class definition
+template <typename Element_>
+struct Matrix<Element_, 4, 3> {
+
+  //
+  // Type definitions
+  //
+
+  /// Element data type
+  using Element = Element_;
+
+  /// Number of rows in matrix
+  static int const kRows = 4;
+
+  /// Number of columns in matrix
+  static int const kColumns = 3;
+
+  /// Layout of matrix in underlying array
+  using Layout = layout::RowMajor;
+
+  /// Number of elements in matrix
+  static int const kCount = 12;
+
+  //
+  // Data members
+  //
+
+  /// Elements of the matrix in row-major layout
+  Array<Element, kCount> data;
+
+  //
+  // Methods
+  //
+
+  /// Default constructor: clears the underlying element array.
+  CUTLASS_HOST_DEVICE
+  Matrix() {
+    data.clear();
+  }
+
+  /// Copy constructor for a 4-by-3 matrix: copies rhs's element array.
+  CUTLASS_HOST_DEVICE
+  Matrix(Matrix const &rhs) : data(rhs.data) {
+  }
+
+  /// Constructs a 4-by-3 matrix from scalar elements.
+  /// Argument _r_c is stored as element (r, c), i.e. at offset r * 3 + c.
+  CUTLASS_HOST_DEVICE
+  Matrix(
+    Element _0_0, Element _0_1, Element _0_2,
+    Element _1_0, Element _1_1, Element _1_2,
+    Element _2_0, Element _2_1, Element _2_2,
+    Element _3_0, Element _3_1, Element _3_2
+  ) {
+
+    // row 0
+    data[0] = _0_0;
+    data[1] = _0_1;
+    data[2] = _0_2;
+    // row 1
+    data[3] = _1_0;
+    data[4] = _1_1;
+    data[5] = _1_2;
+    // row 2
+    data[6] = _2_0;
+    data[7] = _2_1;
+    data[8] = _2_2;
+    // row 3
+    data[9] = _3_0;
+    data[10] = _3_1;
+    data[11] = _3_2;
+  }
+
+  /// Constructs a 4-by-3 matrix from four 1-by-3 row vectors.
+  /// row_r supplies elements r * 3 .. r * 3 + 2 of the underlying array.
+  CUTLASS_HOST_DEVICE
+  Matrix(
+    Matrix<Element, 1, 3> const &row_0,
+    Matrix<Element, 1, 3> const &row_1,
+    Matrix<Element, 1, 3> const &row_2,
+    Matrix<Element, 1, 3> const &row_3
+  ) {
+    for (int c = 0; c < 3; ++c) {
+      data[0 * 3 + c] = row_0.data[c];
+      data[1 * 3 + c] = row_1.data[c];
+      data[2 * 3 + c] = row_2.data[c];
+      data[3 * 3 + c] = row_3.data[c];
+    }
+  }
+    
+  /// Static method to construct a 4-by-3 matrix from column vectors.
+  ///
+  /// Each column of a 4-by-3 matrix holds four entries, so the arguments are
+  /// 4-by-1 vectors. They were previously declared as 3-by-1 vectors, which made
+  /// the reads of data[3] run past the end of a three-element array.
+  ///
+  /// \param column_0  supplies elements (r, 0) for r = 0..3
+  /// \param column_1  supplies elements (r, 1) for r = 0..3
+  /// \param column_2  supplies elements (r, 2) for r = 0..3
+  /// \return the matrix whose row-major offset r * kColumns + c holds column_c.data[r]
+  CUTLASS_HOST_DEVICE
+  static Matrix from_columns(
+    Matrix<Element, 4, 1> const &column_0,
+    Matrix<Element, 4, 1> const &column_1,
+    Matrix<Element, 4, 1> const &column_2
+  ) {
+    Matrix result;
+
+    for (int r = 0; r < kRows; ++r) {
+      result.data[r * kColumns + 0] = column_0.data[r];
+      result.data[r * kColumns + 1] = column_1.data[r];
+      result.data[r * kColumns + 2] = column_2.data[r];
+    }
+    return result;
+  }
+    
+  /// Builds a matrix in which all twelve entries equal `s`
+  CUTLASS_HOST_DEVICE
+  static Matrix uniform(Element s) {
+    Matrix filled;
+
+    for (int idx = 0; idx < 12; ++idx) {
+      filled.data[idx] = s;
+    }
+
+    return filled;
+  }
+
+  /// Builds a matrix in which every entry is Element(1)
+  CUTLASS_HOST_DEVICE
+  static Matrix ones() {
+    Element const one = Element(1);
+    return uniform(one);
+  }
+
+  /// Returns a default-constructed matrix, which serves as the zero matrix
+  CUTLASS_HOST_DEVICE
+  static Matrix zero() {
+    Matrix zeros = Matrix();
+    return zeros;
+  }
+  
+  /// Constructs a matrix from elements along its diagonal (column-vector form).
+  ///
+  /// A 4-by-3 matrix has three diagonal entries, (0,0), (1,1) and (2,2), stored at
+  /// row-major offsets 0, 4 and 8. The previous offsets (0, 5, 10, 15) belong to a
+  /// 4-by-4 layout: they hit off-diagonal entries, wrote past the 12-element storage
+  /// and read diag.data[3] from a three-element vector. Entries off the diagonal are
+  /// left as produced by the default constructor.
+  CUTLASS_HOST_DEVICE
+  static Matrix from_diagonal(Matrix<Element, 3, 1> const &diag) {
+    Matrix m;
+
+    for (int k = 0; k < 3; ++k) {
+      m.data[k * 3 + k] = diag.data[k];
+    }
+
+    return m;
+  }
+
+  /// Constructs a matrix from elements along its diagonal (row-vector form).
+  ///
+  /// Same placement as the column-vector overload: diag[k] goes to entry (k, k),
+  /// i.e. row-major offset k * 3 + k, for k = 0, 1, 2.
+  CUTLASS_HOST_DEVICE
+  static Matrix from_diagonal(Matrix<Element, 1, 3> const &diag) {
+    Matrix m;
+
+    for (int k = 0; k < 3; ++k) {
+      m.data[k * 3 + k] = diag.data[k];
+    }
+
+    return m;
+  }
+
+  /// Gets the diagonal elements (0,0), (1,1), (2,2) as a 3-by-1 vector.
+  ///
+  /// With row-major storage and three columns, entry (k, k) lives at offset k * 3 + k.
+  /// The previous offsets (0, 5, 10, 15) assumed four columns, read past the
+  /// 12-element storage and wrote a fourth entry into a three-element result.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 1> diagonal() const {
+    Matrix<Element, 3, 1> diag;
+
+    for (int k = 0; k < 3; ++k) {
+      diag.data[k] = data[k * 3 + k];
+    }
+
+    return diag;
+  }
+    
+  /// Returns the 3-by-4 transpose of this matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 4> transpose() const {
+    Matrix<Element, 3, 4> flipped;
+
+    // Entry (r, c) here, at offset r * 3 + c, becomes entry (c, r) there, at offset c * 4 + r.
+    for (int r = 0; r < 4; ++r) {
+      for (int c = 0; c < 3; ++c) {
+        flipped.data[c * 4 + r] = data[r * 3 + c];
+      }
+    }
+
+    return flipped;
+  }
+    
+  /// Accesses an element by (row, column) coordinate.
+  ///
+  /// Storage is row-major with three columns, so entry (i, j) is at offset i * 3 + j.
+  /// The previous stride of 4 returned the wrong entry for every row but the first
+  /// and indexed past the 12-element storage for (3, 2). No bounds checking is done.
+  CUTLASS_HOST_DEVICE
+  Element at(int i, int j) const {
+    return data[i * 3 + j];
+  }
+
+  /// Accesses an element by (row, column) coordinate, returning a mutable reference.
+  ///
+  /// Uses the same row-major offset i * 3 + j as the const overload.
+  CUTLASS_HOST_DEVICE
+  Element & at(int i, int j) {
+    return data[i * 3 + j];
+  }
+
+  /// Reads the element addressed by coord = (row, column)
+  CUTLASS_HOST_DEVICE
+  Element at(Coord<2> const &coord) const {
+    int const row_idx = coord[0];
+    int const col_idx = coord[1];
+    return at(row_idx, col_idx);
+  }
+
+  /// Returns a mutable reference to the element addressed by coord = (row, column)
+  CUTLASS_HOST_DEVICE
+  Element & at(Coord<2> const &coord) {
+    int const row_idx = coord[0];
+    int const col_idx = coord[1];
+    return at(row_idx, col_idx);
+  }
+
+  /// Returns a mutable reference to the element at a linear offset into the storage
+  CUTLASS_HOST_DEVICE
+  Element &at(int offset) {
+    Element &entry = data[offset];
+    return entry;
+  }
+
+  /// Reads the element at a linear offset into the storage
+  CUTLASS_HOST_DEVICE
+  Element at(int offset) const {
+    Element entry = data[offset];
+    return entry;
+  }
+
+  /// Subscript by coordinate; forwards to the const coordinate accessor
+  CUTLASS_HOST_DEVICE
+  Element operator[](Coord<2> const &coord) const {
+    return at(coord);
+  }
+
+  /// Subscript by coordinate; forwards to the mutable coordinate accessor
+  CUTLASS_HOST_DEVICE
+  Element & operator[](Coord<2> const &coord) {
+    return at(coord);
+  }
+
+  /// Subscript by linear offset; forwards to the mutable offset accessor
+  CUTLASS_HOST_DEVICE
+  Element & operator[](int offset) {
+    return at(offset);
+  }
+
+  /// Subscript by linear offset; forwards to the const offset accessor
+  CUTLASS_HOST_DEVICE
+  Element operator[](int offset) const {
+    return at(offset);
+  }
+  
+  /// Copies out the 1-by-2 submatrix whose first element is at (i, j); no bounds checking
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 2> slice_1x2(int i = 0, int j = 0) const {
+    Matrix<Element, 1, 2> sub;
+    int const origin = i * 3 + j;
+
+    for (int c = 0; c < 2; ++c) {
+      sub.data[c] = data[origin + c];
+    }
+
+    return sub;
+  }
+
+  /// Writes `m` over the 1-by-2 submatrix whose first element is at (i, j); no bounds checking
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_1x2(Matrix<Element, 1, 2> const &m, int i = 0, int j = 0) {
+    int const origin = i * 3 + j;
+
+    for (int c = 0; c < 2; ++c) {
+      data[origin + c] = m.data[c];
+    }
+
+    return *this;
+  }
+    
+  /// Copies out the 1-by-3 submatrix whose first element is at (i, j); no bounds checking
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 3> slice_1x3(int i = 0, int j = 0) const {
+    Matrix<Element, 1, 3> sub;
+    int const origin = i * 3 + j;
+
+    for (int c = 0; c < 3; ++c) {
+      sub.data[c] = data[origin + c];
+    }
+
+    return sub;
+  }
+
+  /// Writes `m` over the 1-by-3 submatrix whose first element is at (i, j); no bounds checking
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_1x3(Matrix<Element, 1, 3> const &m, int i = 0, int j = 0) {
+    int const origin = i * 3 + j;
+
+    for (int c = 0; c < 3; ++c) {
+      data[origin + c] = m.data[c];
+    }
+
+    return *this;
+  }
+    
+  /// Returns row `i` as a 1-by-3 vector (a 1-by-3 slice starting at column 0)
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 3> row(int i) const {
+    return slice_1x3(i);
+  }
+
+  /// Overwrites row `i` with the 1-by-3 vector `v` and returns this matrix
+  CUTLASS_HOST_DEVICE
+  Matrix &set_row(Matrix<Element, 1, 3> const &v, int i = 0) {
+    return set_slice_1x3(v, i);
+  }
+    
+  /// Copies out the 2-by-1 submatrix whose top element is at (i, j); no bounds checking
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 1> slice_2x1(int i = 0, int j = 0) const {
+    Matrix<Element, 2, 1> sub;
+    int const origin = i * 3 + j;
+
+    // Consecutive rows are three elements apart in storage.
+    for (int r = 0; r < 2; ++r) {
+      sub.data[r] = data[origin + r * 3];
+    }
+
+    return sub;
+  }
+
+  /// Writes `m` over the 2-by-1 submatrix whose top element is at (i, j); no bounds checking
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_2x1(Matrix<Element, 2, 1> const &m, int i = 0, int j = 0) {
+    int const origin = i * 3 + j;
+
+    for (int r = 0; r < 2; ++r) {
+      data[origin + r * 3] = m.data[r];
+    }
+
+    return *this;
+  }
+    
+  /// Copies out the 2-by-2 submatrix whose top-left element is at (i, j); no bounds checking
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 2> slice_2x2(int i = 0, int j = 0) const {
+    Matrix<Element, 2, 2> sub;
+    int const origin = i * 3 + j;
+
+    for (int r = 0; r < 2; ++r) {
+      for (int c = 0; c < 2; ++c) {
+        sub.data[r * 2 + c] = data[origin + r * 3 + c];
+      }
+    }
+
+    return sub;
+  }
+
+  /// Writes `m` over the 2-by-2 submatrix whose top-left element is at (i, j); no bounds checking
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_2x2(Matrix<Element, 2, 2> const &m, int i = 0, int j = 0) {
+    int const origin = i * 3 + j;
+
+    for (int r = 0; r < 2; ++r) {
+      for (int c = 0; c < 2; ++c) {
+        data[origin + r * 3 + c] = m.data[r * 2 + c];
+      }
+    }
+
+    return *this;
+  }
+    
+  /// Copies out the 2-by-3 submatrix whose top-left element is at (i, j); no bounds checking
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 3> slice_2x3(int i = 0, int j = 0) const {
+    Matrix<Element, 2, 3> sub;
+    int const origin = i * 3 + j;
+
+    for (int r = 0; r < 2; ++r) {
+      for (int c = 0; c < 3; ++c) {
+        sub.data[r * 3 + c] = data[origin + r * 3 + c];
+      }
+    }
+
+    return sub;
+  }
+
+  /// Writes `m` over the 2-by-3 submatrix whose top-left element is at (i, j); no bounds checking
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_2x3(Matrix<Element, 2, 3> const &m, int i = 0, int j = 0) {
+    int const origin = i * 3 + j;
+
+    for (int r = 0; r < 2; ++r) {
+      for (int c = 0; c < 3; ++c) {
+        data[origin + r * 3 + c] = m.data[r * 3 + c];
+      }
+    }
+
+    return *this;
+  }
+    
+  /// Copies out the 3-by-1 submatrix whose top element is at (i, j); no bounds checking
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 1> slice_3x1(int i = 0, int j = 0) const {
+    Matrix<Element, 3, 1> sub;
+    int const origin = i * 3 + j;
+
+    for (int r = 0; r < 3; ++r) {
+      sub.data[r] = data[origin + r * 3];
+    }
+
+    return sub;
+  }
+
+  /// Writes `m` over the 3-by-1 submatrix whose top element is at (i, j); no bounds checking
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_3x1(Matrix<Element, 3, 1> const &m, int i = 0, int j = 0) {
+    int const origin = i * 3 + j;
+
+    for (int r = 0; r < 3; ++r) {
+      data[origin + r * 3] = m.data[r];
+    }
+
+    return *this;
+  }
+    
+  /// Copies out the 3-by-2 submatrix whose top-left element is at (i, j); no bounds checking
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 2> slice_3x2(int i = 0, int j = 0) const {
+    Matrix<Element, 3, 2> sub;
+    int const origin = i * 3 + j;
+
+    for (int r = 0; r < 3; ++r) {
+      for (int c = 0; c < 2; ++c) {
+        sub.data[r * 2 + c] = data[origin + r * 3 + c];
+      }
+    }
+
+    return sub;
+  }
+
+  /// Writes `m` over the 3-by-2 submatrix whose top-left element is at (i, j); no bounds checking
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_3x2(Matrix<Element, 3, 2> const &m, int i = 0, int j = 0) {
+    int const origin = i * 3 + j;
+
+    for (int r = 0; r < 3; ++r) {
+      for (int c = 0; c < 2; ++c) {
+        data[origin + r * 3 + c] = m.data[r * 2 + c];
+      }
+    }
+
+    return *this;
+  }
+    
+  /// Copies out the 3-by-3 submatrix whose top-left element is at (i, j); no bounds checking
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 3> slice_3x3(int i = 0, int j = 0) const {
+    Matrix<Element, 3, 3> sub;
+    int const origin = i * 3 + j;
+
+    // Source and destination both have a row stride of 3, so offsets line up one-to-one.
+    for (int k = 0; k < 9; ++k) {
+      sub.data[k] = data[origin + k];
+    }
+
+    return sub;
+  }
+
+  /// Writes `m` over the 3-by-3 submatrix whose top-left element is at (i, j); no bounds checking
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_3x3(Matrix<Element, 3, 3> const &m, int i = 0, int j = 0) {
+    int const origin = i * 3 + j;
+
+    // Source and destination both have a row stride of 3, so offsets line up one-to-one.
+    for (int k = 0; k < 9; ++k) {
+      data[origin + k] = m.data[k];
+    }
+
+    return *this;
+  }
+    
+  /// Copies out the 4-by-1 submatrix whose top element is at (i, j); no bounds checking
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 1> slice_4x1(int i = 0, int j = 0) const {
+    Matrix<Element, 4, 1> sub;
+    int const origin = i * 3 + j;
+
+    for (int r = 0; r < 4; ++r) {
+      sub.data[r] = data[origin + r * 3];
+    }
+
+    return sub;
+  }
+
+  /// Writes `m` over the 4-by-1 submatrix whose top element is at (i, j); no bounds checking
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_4x1(Matrix<Element, 4, 1> const &m, int i = 0, int j = 0) {
+    int const origin = i * 3 + j;
+
+    for (int r = 0; r < 4; ++r) {
+      data[origin + r * 3] = m.data[r];
+    }
+
+    return *this;
+  }
+    
+  /// Returns column `j` as a 4-by-1 vector (a 4-by-1 slice starting at row 0)
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 1> column(int j) const {
+    int const first_row = 0;
+    return slice_4x1(first_row, j);
+  }
+
+  /// Overwrites column `j` with the 4-by-1 vector `v` and returns this matrix
+  CUTLASS_HOST_DEVICE
+  Matrix &set_column(Matrix<Element, 4, 1> const &v, int j =0) {
+    int const first_row = 0;
+    return set_slice_4x1(v, first_row, j);
+  }
+    
+  /// Copies out the 4-by-2 submatrix whose top-left element is at (i, j); no bounds checking
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 2> slice_4x2(int i = 0, int j = 0) const {
+    Matrix<Element, 4, 2> sub;
+    int const origin = i * 3 + j;
+
+    for (int r = 0; r < 4; ++r) {
+      for (int c = 0; c < 2; ++c) {
+        sub.data[r * 2 + c] = data[origin + r * 3 + c];
+      }
+    }
+
+    return sub;
+  }
+
+  /// Writes `m` over the 4-by-2 submatrix whose top-left element is at (i, j); no bounds checking
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_4x2(Matrix<Element, 4, 2> const &m, int i = 0, int j = 0) {
+    int const origin = i * 3 + j;
+
+    for (int r = 0; r < 4; ++r) {
+      for (int c = 0; c < 2; ++c) {
+        data[origin + r * 3 + c] = m.data[r * 2 + c];
+      }
+    }
+
+    return *this;
+  }
+    
+  /// Copies out the 4-by-3 submatrix whose top-left element is at (i, j); no bounds checking
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 3> slice_4x3(int i = 0, int j = 0) const {
+    Matrix<Element, 4, 3> sub;
+    int const origin = i * 3 + j;
+
+    // Source and destination share the same row stride, so offsets line up one-to-one.
+    for (int k = 0; k < 12; ++k) {
+      sub.data[k] = data[origin + k];
+    }
+
+    return sub;
+  }
+
+  /// Writes `m` over the 4-by-3 submatrix whose top-left element is at (i, j); no bounds checking
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_4x3(Matrix<Element, 4, 3> const &m, int i = 0, int j = 0) {
+    int const origin = i * 3 + j;
+
+    // Source and destination share the same row stride, so offsets line up one-to-one.
+    for (int k = 0; k < 12; ++k) {
+      data[origin + k] = m.data[k];
+    }
+
+    return *this;
+  }
+    
+  /// Builds a 4-by-3 matrix [lhs | rhs] from a 4-by-1 matrix followed by a 4-by-2 matrix
+  CUTLASS_HOST_DEVICE
+  static Matrix hcat(Matrix<Element, 4, 1> const & lhs, Matrix<Element, 4, 2> const & rhs) {
+    Matrix joined;
+
+    for (int r = 0; r < 4; ++r) {
+      joined.data[r * 3 + 0] = lhs.at(r, 0);
+      joined.data[r * 3 + 1] = rhs.at(r, 0);
+      joined.data[r * 3 + 2] = rhs.at(r, 1);
+    }
+
+    return joined;
+  }
+  
+  /// Builds a 4-by-3 matrix [lhs | rhs] from a 4-by-2 matrix followed by a 4-by-1 matrix
+  CUTLASS_HOST_DEVICE
+  static Matrix hcat(Matrix<Element, 4, 2> const & lhs, Matrix<Element, 4, 1> const & rhs) {
+    Matrix joined;
+
+    for (int r = 0; r < 4; ++r) {
+      joined.data[r * 3 + 0] = lhs.at(r, 0);
+      joined.data[r * 3 + 1] = lhs.at(r, 1);
+      joined.data[r * 3 + 2] = rhs.at(r, 0);
+    }
+
+    return joined;
+  }
+  
+  /// Concatenates this matrix with a 4-by-1 matrix on the right to form a 4-by-4 matrix
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 4> hcat(Matrix<Element, 4, 1> const & rhs) const {
+    Matrix const &self = *this;
+    return Matrix<Element, 4, 4>::hcat(self, rhs);
+  }
+    
+  /// Builds a 4-by-3 matrix by stacking a 1-by-3 matrix on top of a 3-by-3 matrix
+  CUTLASS_HOST_DEVICE
+  static Matrix vcat(Matrix<Element, 1, 3> const & upper, Matrix<Element, 3, 3> const & lower) {
+    Matrix stacked;
+
+    for (int c = 0; c < 3; ++c) {
+      stacked.data[c] = upper.at(0, c);
+    }
+
+    // Rows 1..3 of the result come from rows 0..2 of `lower`.
+    for (int r = 0; r < 3; ++r) {
+      for (int c = 0; c < 3; ++c) {
+        stacked.data[(r + 1) * 3 + c] = lower.at(r, c);
+      }
+    }
+
+    return stacked;
+  }
+  
+  /// Builds a 4-by-3 matrix by stacking one 2-by-3 matrix on top of another
+  CUTLASS_HOST_DEVICE
+  static Matrix vcat(Matrix<Element, 2, 3> const & upper, Matrix<Element, 2, 3> const & lower) {
+    Matrix stacked;
+
+    // Rows 0..1 come from `upper`, rows 2..3 from `lower`.
+    for (int r = 0; r < 2; ++r) {
+      for (int c = 0; c < 3; ++c) {
+        stacked.data[r * 3 + c] = upper.at(r, c);
+        stacked.data[(r + 2) * 3 + c] = lower.at(r, c);
+      }
+    }
+
+    return stacked;
+  }
+  
+  /// Builds a 4-by-3 matrix by stacking a 3-by-3 matrix on top of a 1-by-3 matrix
+  CUTLASS_HOST_DEVICE
+  static Matrix vcat(Matrix<Element, 3, 3> const & upper, Matrix<Element, 1, 3> const & lower) {
+    Matrix stacked;
+
+    for (int r = 0; r < 3; ++r) {
+      for (int c = 0; c < 3; ++c) {
+        stacked.data[r * 3 + c] = upper.at(r, c);
+      }
+    }
+
+    // The last row (offsets 9..11) comes from `lower`.
+    for (int c = 0; c < 3; ++c) {
+      stacked.data[9 + c] = lower.at(0, c);
+    }
+
+    return stacked;
+  }
+  
+  /// Assembles a 4-by-3 matrix from blocks [A B; C D] with A scalar, B 1-by-2, C 3-by-1, D 3-by-2
+  CUTLASS_HOST_DEVICE
+  static Matrix block(
+    Element                         A, Matrix<Element, 1, 2> const & B,
+    Matrix<Element, 3, 1> const & C, Matrix<Element, 3, 2> const & D) {
+    Matrix assembled;
+
+    // Top row: [A | B]
+    assembled.data[0] = A;
+    assembled.data[1] = B.at(0, 0);
+    assembled.data[2] = B.at(0, 1);
+
+    // Rows 1..3: [C | D]
+    for (int r = 0; r < 3; ++r) {
+      int const base = (r + 1) * 3;
+      assembled.data[base + 0] = C.at(r, 0);
+      assembled.data[base + 1] = D.at(r, 0);
+      assembled.data[base + 2] = D.at(r, 1);
+    }
+
+    return assembled;
+  }
+  
+  /// Assembles a 4-by-3 matrix from blocks [A B; C D] with A 1-by-2, B scalar, C 3-by-2, D 3-by-1
+  CUTLASS_HOST_DEVICE
+  static Matrix block(
+    Matrix<Element, 1, 2> const & A, Element                         B,
+    Matrix<Element, 3, 2> const & C, Matrix<Element, 3, 1> const & D) {
+    Matrix assembled;
+
+    // Top row: [A | B]
+    assembled.data[0] = A.at(0, 0);
+    assembled.data[1] = A.at(0, 1);
+    assembled.data[2] = B;
+
+    // Rows 1..3: [C | D]
+    for (int r = 0; r < 3; ++r) {
+      int const base = (r + 1) * 3;
+      assembled.data[base + 0] = C.at(r, 0);
+      assembled.data[base + 1] = C.at(r, 1);
+      assembled.data[base + 2] = D.at(r, 0);
+    }
+
+    return assembled;
+  }
+  
+  /// Assembles a 4-by-3 matrix from blocks [A B; C D] with A, C 2-by-1 and B, D 2-by-2
+  CUTLASS_HOST_DEVICE
+  static Matrix block(
+    Matrix<Element, 2, 1> const & A, Matrix<Element, 2, 2> const & B,
+    Matrix<Element, 2, 1> const & C, Matrix<Element, 2, 2> const & D) {
+    Matrix assembled;
+
+    for (int r = 0; r < 2; ++r) {
+      // Rows 0..1: [A | B]
+      int const top = r * 3;
+      assembled.data[top + 0] = A.at(r, 0);
+      assembled.data[top + 1] = B.at(r, 0);
+      assembled.data[top + 2] = B.at(r, 1);
+
+      // Rows 2..3: [C | D]
+      int const bottom = (r + 2) * 3;
+      assembled.data[bottom + 0] = C.at(r, 0);
+      assembled.data[bottom + 1] = D.at(r, 0);
+      assembled.data[bottom + 2] = D.at(r, 1);
+    }
+
+    return assembled;
+  }
+  
+  /// Assembles a 4-by-3 matrix from blocks [A B; C D] with A, C 2-by-2 and B, D 2-by-1
+  CUTLASS_HOST_DEVICE
+  static Matrix block(
+    Matrix<Element, 2, 2> const & A, Matrix<Element, 2, 1> const & B,
+    Matrix<Element, 2, 2> const & C, Matrix<Element, 2, 1> const & D) {
+    Matrix assembled;
+
+    for (int r = 0; r < 2; ++r) {
+      // Rows 0..1: [A | B]
+      int const top = r * 3;
+      assembled.data[top + 0] = A.at(r, 0);
+      assembled.data[top + 1] = A.at(r, 1);
+      assembled.data[top + 2] = B.at(r, 0);
+
+      // Rows 2..3: [C | D]
+      int const bottom = (r + 2) * 3;
+      assembled.data[bottom + 0] = C.at(r, 0);
+      assembled.data[bottom + 1] = C.at(r, 1);
+      assembled.data[bottom + 2] = D.at(r, 0);
+    }
+
+    return assembled;
+  }
+  
+  /// Assembles a 4-by-3 matrix from blocks [A B; C D] with A 3-by-1, B 3-by-2, C scalar, D 1-by-2
+  CUTLASS_HOST_DEVICE
+  static Matrix block(
+    Matrix<Element, 3, 1> const & A, Matrix<Element, 3, 2> const & B,
+    Element                         C, Matrix<Element, 1, 2> const & D) {
+    Matrix assembled;
+
+    // Rows 0..2: [A | B]
+    for (int r = 0; r < 3; ++r) {
+      int const base = r * 3;
+      assembled.data[base + 0] = A.at(r, 0);
+      assembled.data[base + 1] = B.at(r, 0);
+      assembled.data[base + 2] = B.at(r, 1);
+    }
+
+    // Bottom row: [C | D]
+    assembled.data[9] = C;
+    assembled.data[10] = D.at(0, 0);
+    assembled.data[11] = D.at(0, 1);
+
+    return assembled;
+  }
+  
+  /// Assembles a 4-by-3 matrix from blocks [A B; C D] with A 3-by-2, B 3-by-1, C 1-by-2, D scalar
+  CUTLASS_HOST_DEVICE
+  static Matrix block(
+    Matrix<Element, 3, 2> const & A, Matrix<Element, 3, 1> const & B,
+    Matrix<Element, 1, 2> const & C, Element                         D) {
+    Matrix assembled;
+
+    // Rows 0..2: [A | B]
+    for (int r = 0; r < 3; ++r) {
+      int const base = r * 3;
+      assembled.data[base + 0] = A.at(r, 0);
+      assembled.data[base + 1] = A.at(r, 1);
+      assembled.data[base + 2] = B.at(r, 0);
+    }
+
+    // Bottom row: [C | D]
+    assembled.data[9] = C.at(0, 0);
+    assembled.data[10] = C.at(0, 1);
+    assembled.data[11] = D;
+
+    return assembled;
+  }
+  
+  /// Returns the elementwise sum of this matrix and `rhs` (4-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix add(Matrix const &rhs) const {
+    Matrix total;
+
+    for (int idx = 0; idx < 12; ++idx) {
+      total.data[idx] = data[idx] + rhs.data[idx];
+    }
+
+    return total;
+  }
+      
+  /// Elementwise addition (4-by-3); delegates to add()
+  CUTLASS_HOST_DEVICE
+  Matrix operator +(Matrix const &rhs) const {
+    Matrix total = add(rhs);
+    return total;
+  }
+
+  /// Adds `rhs` to this matrix elementwise, in place (4-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator +=(Matrix const &rhs) {
+    for (int idx = 0; idx < 12; ++idx) {
+      data[idx] += rhs.data[idx];
+    }
+
+    return *this;
+  }
+        
+  /// Returns the elementwise difference of this matrix and `rhs` (4-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix subtract(Matrix const &rhs) const {
+    Matrix difference;
+
+    for (int idx = 0; idx < 12; ++idx) {
+      difference.data[idx] = data[idx] - rhs.data[idx];
+    }
+
+    return difference;
+  }
+      
+  /// Elementwise subtraction (4-by-3); delegates to subtract()
+  CUTLASS_HOST_DEVICE
+  Matrix operator -(Matrix const &rhs) const {
+    Matrix difference = subtract(rhs);
+    return difference;
+  }
+
+  /// Subtracts `rhs` from this matrix elementwise, in place (4-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator -=(Matrix const &rhs) {
+    for (int idx = 0; idx < 12; ++idx) {
+      data[idx] -= rhs.data[idx];
+    }
+
+    return *this;
+  }
+        
+  /// Returns the elementwise product of this matrix and `rhs` (4-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix multiply(Matrix const &rhs) const {
+    Matrix scaled;
+
+    for (int idx = 0; idx < 12; ++idx) {
+      scaled.data[idx] = data[idx] * rhs.data[idx];
+    }
+
+    return scaled;
+  }
+      
+  /// Returns this matrix with every element multiplied by the scalar `s` (4-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix multiply(Element const &s) const {
+    Matrix scaled;
+
+    for (int idx = 0; idx < 12; ++idx) {
+      scaled.data[idx] = data[idx] * s;
+    }
+
+    return scaled;
+  }
+
+  /// Scalar multiplication (4-by-3); delegates to multiply()
+  CUTLASS_HOST_DEVICE
+  Matrix operator *(Element const &s) const {
+    Matrix scaled = multiply(s);
+    return scaled;
+  }
+
+  /// Multiplies every element by the scalar `s`, in place (4-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator *=(Element const &s) {
+    for (int idx = 0; idx < 12; ++idx) {
+      data[idx] *= s;
+    }
+
+    return *this;
+  }
+        
+  /// Returns the elementwise quotient of this matrix and `rhs` (4-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix divide(Matrix const &rhs) const {
+    Matrix quotient;
+
+    for (int idx = 0; idx < 12; ++idx) {
+      quotient.data[idx] = data[idx] / rhs.data[idx];
+    }
+
+    return quotient;
+  }
+      
+  /// Returns this matrix with every element divided by the scalar `s` (4-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix divide(Element const &s) const {
+    Matrix quotient;
+
+    for (int idx = 0; idx < 12; ++idx) {
+      quotient.data[idx] = data[idx] / s;
+    }
+
+    return quotient;
+  }
+
+  /// Scalar division (4-by-3); delegates to divide()
+  CUTLASS_HOST_DEVICE
+  Matrix operator /(Element const &s) const {
+    Matrix quotient = divide(s);
+    return quotient;
+  }
+
+  /// Divides every element by the scalar `s`, in place (4-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator /=(Element const &s) {
+    for (int idx = 0; idx < 12; ++idx) {
+      data[idx] /= s;
+    }
+
+    return *this;
+  }
+        
+  /// Elementwise division (4-by-3); delegates to divide()
+  CUTLASS_HOST_DEVICE
+  Matrix operator /(Matrix const &rhs) const {
+    Matrix quotient = divide(rhs);
+    return quotient;
+  }
+
+  /// Divides this matrix by `rhs` elementwise, in place (4-by-3)
+  CUTLASS_HOST_DEVICE
+  Matrix & operator /=(Matrix const &rhs) {
+    for (int idx = 0; idx < 12; ++idx) {
+      data[idx] /= rhs.data[idx];
+    }
+
+    return *this;
+  }
+        
+  /// Returns a matrix whose elements are the negation of this matrix's elements.
+  ///
+  /// The previous body negated the freshly constructed result (`-m.data[k]`) rather
+  /// than this matrix's own storage, so the contents of `*this` never reached the output.
+  CUTLASS_HOST_DEVICE
+  Matrix operator-() const {
+    Matrix m;
+
+    for (int idx = 0; idx < 12; ++idx) {
+      m.data[idx] = -data[idx];
+    }
+
+    return m;
+  }
+  
+  /// Matrix-vector product (4-by-3 times 3-by-1), added onto `accum` and returned
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 1> product(
+    Matrix<Element, 3, 1> const &rhs,
+    Matrix<Element, 4, 1> accum = Matrix<Element, 4, 1>()
+  ) const {
+
+    // k is the outermost loop so each output accumulates its terms in the order k = 0, 1, 2.
+    for (int k = 0; k < 3; ++k) {
+      for (int r = 0; r < 4; ++r) {
+        accum.data[r] += data[r * 3 + k] * rhs.data[k];
+      }
+    }
+
+    return accum;
+  }
+
+  /// Matrix-vector product (4-by-3 times 3-by-1) starting from a default-constructed accumulator
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 1> operator*(Matrix<Element, 3, 1> const &rhs) const {
+    return product(rhs, Matrix<Element, 4, 1>());
+  }
+  
+  /// Matrix product (4-by-3 times 3-by-2), added onto `accum` and returned
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 2> product(
+    Matrix<Element, 3, 2> const &rhs,
+    Matrix<Element, 4, 2> accum = Matrix<Element, 4, 2>()
+  ) const {
+
+    // k is the outermost loop so each output accumulates its terms in the order k = 0, 1, 2.
+    for (int k = 0; k < 3; ++k) {
+      for (int r = 0; r < 4; ++r) {
+        for (int c = 0; c < 2; ++c) {
+          accum.data[r * 2 + c] += data[r * 3 + k] * rhs.data[k * 2 + c];
+        }
+      }
+    }
+
+    return accum;
+  }
+
+  /// Matrix product (4-by-3 times 3-by-2) starting from a default-constructed accumulator
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 2> operator*(Matrix<Element, 3, 2> const &rhs) const {
+    return product(rhs, Matrix<Element, 4, 2>());
+  }
+  
+  /// Matrix product (4-by-3 times 3-by-3), added onto `accum` and returned
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 3> product(
+    Matrix<Element, 3, 3> const &rhs,
+    Matrix<Element, 4, 3> accum = Matrix<Element, 4, 3>()
+  ) const {
+
+    // k is the outermost loop so each output accumulates its terms in the order k = 0, 1, 2.
+    for (int k = 0; k < 3; ++k) {
+      for (int r = 0; r < 4; ++r) {
+        for (int c = 0; c < 3; ++c) {
+          accum.data[r * 3 + c] += data[r * 3 + k] * rhs.data[k * 3 + c];
+        }
+      }
+    }
+
+    return accum;
+  }
+
+  /// Matrix product (4-by-3 times 3-by-3) starting from a default-constructed accumulator
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 3> operator*(Matrix<Element, 3, 3> const &rhs) const {
+    return product(rhs, Matrix<Element, 4, 3>());
+  }
+  
+  /// Replaces this matrix with its product by a 3-by-3 matrix on the right
+  CUTLASS_HOST_DEVICE
+  Matrix & operator*=(Matrix<Element, 3, 3> const &rhs) {
+    // The product is fully formed in a temporary before this matrix is overwritten.
+    Matrix updated = product(rhs);
+    *this = updated;
+    return *this;
+  }
+    
+  /// Matrix product (4-by-3 times 3-by-4), added onto `accum` and returned
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 4> product(
+    Matrix<Element, 3, 4> const &rhs,
+    Matrix<Element, 4, 4> accum = Matrix<Element, 4, 4>()
+  ) const {
+
+    // k is the outermost loop so each output accumulates its terms in the order k = 0, 1, 2.
+    for (int k = 0; k < 3; ++k) {
+      for (int r = 0; r < 4; ++r) {
+        for (int c = 0; c < 4; ++c) {
+          accum.data[r * 4 + c] += data[r * 3 + k] * rhs.data[k * 4 + c];
+        }
+      }
+    }
+
+    return accum;
+  }
+
+  /// Operator form of product(): multiplies this matrix by a 3-by-4 matrix,
+  /// yielding a 4-by-4 result
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 4> operator*(Matrix<Element, 3, 4> const &rhs) const {
+    Matrix<Element, 4, 4> result = product(rhs);
+    return result;
+  }
+  
+  /// Returns accum plus every one of the 12 stored elements, added in storage order
+  CUTLASS_HOST_DEVICE
+  Element sum(Element accum = Element()) const {
+
+    for (int idx = 0; idx < 12; ++idx) {
+      accum += data[idx];
+    }
+
+    return accum;
+  }
+
+  /// Returns accum plus the square of each of the 12 stored elements, added in
+  /// storage order (no square root is taken here; see magnitude())
+  CUTLASS_HOST_DEVICE
+  Element norm(Element accum = Element()) const {
+
+    for (int idx = 0; idx < 12; ++idx) {
+      accum += data[idx] * data[idx];
+    }
+
+    return accum;
+  }
+
+  /// Returns fast_sqrt() of norm(), i.e. the square root of the sum of squared elements
+  CUTLASS_HOST_DEVICE
+  Element magnitude() const {
+    Element const squared = norm();
+    return fast_sqrt(squared);
+  }
+
+  /// Returns accum plus the elements at offsets 0, 4 and 8, i.e. entries (0,0),
+  /// (1,1) and (2,2) when rows are stored with a stride of 3
+  CUTLASS_HOST_DEVICE
+  Element trace(Element accum = Element()) const {
+
+    for (int d = 0; d < 3; ++d) {
+      accum += data[d * 4];  // d * 3 + d: step one row down and one column right
+    }
+
+    return accum;
+  }
+    
+};
+
+/// Template alias for 4-by-3 matrix: Matrix4x3<Element> names the
+/// Matrix<Element, 4, 3> specialization
+template <typename Element>
+using Matrix4x3 = Matrix<Element, 4, 3>;
+
+
+/// Free function that builds a Matrix4x3 while inferring the element type from its
+/// arguments. Each argument is named _<row>_<column>; all twelve are passed, in
+/// row-by-row order, to the Matrix4x3 scalar constructor.
+template <typename Element>
+CUTLASS_HOST_DEVICE Matrix4x3<Element> make_Matrix4x3(
+    Element _0_0, Element _0_1, Element _0_2, 
+    Element _1_0, Element _1_1, Element _1_2, 
+    Element _2_0, Element _2_1, Element _2_2, 
+    Element _3_0, Element _3_1, Element _3_2
+) {
+  Matrix4x3<Element> result(
+    _0_0, _0_1, _0_2,
+    _1_0, _1_1, _1_2,
+    _2_0, _2_1, _2_2,
+    _3_0, _3_1, _3_2
+  );
+  return result;
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// 4-by-4 matrix template class definition
+template <typename Element_>
+struct Matrix<Element_, 4, 4> {
+
+  //
+  // Type definitions
+  //
+
+  /// Element data type
+  using Element = Element_;
+
+  /// Number of rows in matrix
+  static int const kRows = 4;
+
+  /// Number of columns in matrix
+  static int const kColumns = 4;
+
+  /// Layout of matrix in underlying array
+  using Layout = layout::RowMajor;
+
+  /// Number of elements in matrix
+  static int const kCount = 16;
+
+  //
+  // Data members
+  //
+
+  /// Elements of the matrix in row-major layout
+  Array<Element, kCount> data;
+
+  //
+  // Methods
+  //
+
+  /// Default constructor: clears the element array, giving a zero matrix
+  CUTLASS_HOST_DEVICE
+  Matrix() {
+    this->data.clear();
+  }
+  
+  /// Copy constructor for a 4-by-4 matrix: duplicates all kCount elements of rhs
+  CUTLASS_HOST_DEVICE
+  Matrix(Matrix const &rhs) {
+    for (int idx = 0; idx < kCount; ++idx) {
+      data[idx] = rhs.data[idx];
+    }
+  }
+    
+  /// Constructs a 4-by-4 matrix from sixteen scalars. Arguments are named
+  /// _<row>_<column> and are stored in row-major order.
+  CUTLASS_HOST_DEVICE
+  Matrix(
+    Element _0_0, Element _0_1, Element _0_2, Element _0_3, 
+    Element _1_0, Element _1_1, Element _1_2, Element _1_3, 
+    Element _2_0, Element _2_1, Element _2_2, Element _2_3, 
+    Element _3_0, Element _3_1, Element _3_2, Element _3_3
+  ) {
+
+    // Row 0
+    data[0] = _0_0;
+    data[1] = _0_1;
+    data[2] = _0_2;
+    data[3] = _0_3;
+
+    // Row 1
+    data[4] = _1_0;
+    data[5] = _1_1;
+    data[6] = _1_2;
+    data[7] = _1_3;
+
+    // Row 2
+    data[8] = _2_0;
+    data[9] = _2_1;
+    data[10] = _2_2;
+    data[11] = _2_3;
+
+    // Row 3
+    data[12] = _3_0;
+    data[13] = _3_1;
+    data[14] = _3_2;
+    data[15] = _3_3;
+  }
+    
+  /// Constructs a 4-by-4 matrix from four 1-by-4 row vectors; row_k becomes row k
+  CUTLASS_HOST_DEVICE
+  Matrix(
+    Matrix<Element, 1, 4> const &row_0,
+    Matrix<Element, 1, 4> const &row_1,
+    Matrix<Element, 1, 4> const &row_2,
+    Matrix<Element, 1, 4> const &row_3
+  ) { 
+    for (int col = 0; col < 4; ++col) {
+      data[0 + col] = row_0.data[col];
+      data[4 + col] = row_1.data[col];
+      data[8 + col] = row_2.data[col];
+      data[12 + col] = row_3.data[col];
+    }
+  }
+    
+  /// Static method to construct a 4-by-4 matrix from four 4-by-1 column vectors;
+  /// column_k becomes column k of the result
+  CUTLASS_HOST_DEVICE
+  static Matrix from_columns(
+    Matrix<Element, 4, 1> const &column_0,
+    Matrix<Element, 4, 1> const &column_1,
+    Matrix<Element, 4, 1> const &column_2,
+    Matrix<Element, 4, 1> const &column_3
+  ) { 
+    Matrix assembled;
+
+    for (int row = 0; row < 4; ++row) {
+      int const base = row * 4;  // offset of the first element of this row
+      assembled.data[base + 0] = column_0.data[row];
+      assembled.data[base + 1] = column_1.data[row];
+      assembled.data[base + 2] = column_2.data[row];
+      assembled.data[base + 3] = column_3.data[row];
+    }
+
+    return assembled;
+  }
+    
+  /// Constructs an identity matrix: default-constructed storage with Element(1)
+  /// written to the four diagonal entries
+  CUTLASS_HOST_DEVICE
+  static Matrix identity() {
+    Matrix eye;
+
+    for (int d = 0; d < 4; ++d) {
+      eye.data[d * 5] = Element(1);  // d * 4 + d is the offset of entry (d, d)
+    }
+
+    return eye;
+  }
+    
+  /// Constructs a matrix in which all 16 elements equal s
+  CUTLASS_HOST_DEVICE
+  static Matrix uniform(Element s) {
+    Matrix filled;
+
+    for (int idx = 0; idx < 16; ++idx) {
+      filled.data[idx] = s;
+    }
+
+    return filled;
+  }
+
+  /// Constructs a matrix whose every element is Element(1), via uniform()
+  CUTLASS_HOST_DEVICE
+  static Matrix ones() {
+    Element const one = Element(1);
+    return uniform(one);
+  }
+
+  /// Constructs a zero matrix; the default constructor already clears the storage
+  CUTLASS_HOST_DEVICE
+  static Matrix zero() {
+    Matrix cleared;
+    return cleared;
+  }
+  
+  /// Constructs a matrix whose diagonal is taken from a 4-by-1 column vector;
+  /// off-diagonal entries keep their default-constructed value
+  CUTLASS_HOST_DEVICE
+  static Matrix from_diagonal(Matrix<Element, 4, 1> const &diag) {
+    Matrix result;
+
+    for (int d = 0; d < 4; ++d) {
+      result.data[d * 5] = diag.data[d];  // d * 4 + d is the offset of entry (d, d)
+    }
+
+    return result;
+  }
+
+  /// Constructs a matrix whose diagonal is taken from a 1-by-4 row vector;
+  /// off-diagonal entries keep their default-constructed value
+  CUTLASS_HOST_DEVICE
+  static Matrix from_diagonal(Matrix<Element, 1, 4> const &diag) {
+    Matrix result;
+
+    for (int d = 0; d < 4; ++d) {
+      result.data[d * 5] = diag.data[d];  // d * 4 + d is the offset of entry (d, d)
+    }
+
+    return result;
+  }
+
+  /// Returns the four diagonal elements as a 4-by-1 column vector
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 1> diagonal() const {
+    Matrix<Element, 4, 1> extracted;
+
+    for (int d = 0; d < 4; ++d) {
+      extracted.data[d] = data[d * 5];  // d * 4 + d is the offset of entry (d, d)
+    }
+
+    return extracted;
+  }
+    
+  /// Returns a transposed copy: entry (row, col) of this matrix becomes entry
+  /// (col, row) of the result
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 4> transpose() const {
+    Matrix<Element, 4, 4> flipped;
+
+    for (int row = 0; row < 4; ++row) {
+      for (int col = 0; col < 4; ++col) {
+        flipped.data[col * 4 + row] = data[row * 4 + col];
+      }
+    }
+
+    return flipped;
+  }
+    
+  /// Returns a copy of the element at row i, column j (row-major offset i * 4 + j)
+  CUTLASS_HOST_DEVICE
+  Element at(int i, int j) const {
+    int const offset = i * kColumns + j;
+    return data[offset];
+  }
+
+  /// Returns a mutable reference to the element at row i, column j
+  /// (row-major offset i * 4 + j)
+  CUTLASS_HOST_DEVICE
+  Element & at(int i, int j) {
+    int const offset = i * kColumns + j;
+    return data[offset];
+  }
+
+  /// Returns a copy of the element addressed by coord, where coord[0] is the row
+  /// and coord[1] is the column
+  CUTLASS_HOST_DEVICE
+  Element at(Coord<2> const &coord) const {
+    int const row = coord[0];
+    int const col = coord[1];
+    return at(row, col);
+  }
+
+  /// Returns a mutable reference to the element addressed by coord, where coord[0]
+  /// is the row and coord[1] is the column
+  CUTLASS_HOST_DEVICE
+  Element & at(Coord<2> const &coord) {
+    int const row = coord[0];
+    int const col = coord[1];
+    return at(row, col);
+  }
+
+  /// Returns a mutable reference to the element at the given linear offset into
+  /// the row-major storage
+  CUTLASS_HOST_DEVICE
+  Element &at(int offset) {
+    Element &slot = data[offset];
+    return slot;
+  }
+
+  /// Returns a copy of the element at the given linear offset into the row-major
+  /// storage
+  CUTLASS_HOST_DEVICE
+  Element at(int offset) const {
+    Element const &slot = data[offset];
+    return slot;
+  }
+
+  /// Subscript form of at(coord): returns a copy of the element at row coord[0],
+  /// column coord[1]
+  CUTLASS_HOST_DEVICE
+  Element operator[](Coord<2> const &coord) const {
+    return at(coord);
+  }
+
+  /// Subscript form of at(coord): returns a mutable reference to the element at
+  /// row coord[0], column coord[1]
+  CUTLASS_HOST_DEVICE
+  Element & operator[](Coord<2> const &coord) {
+    return at(coord);
+  }
+
+  /// Subscript form of at(offset): returns a mutable reference to the element at
+  /// the given linear offset
+  CUTLASS_HOST_DEVICE
+  Element & operator[](int offset) {
+    return at(offset);
+  }
+
+  /// Subscript form of at(offset): returns a copy of the element at the given
+  /// linear offset
+  CUTLASS_HOST_DEVICE
+  Element operator[](int offset) const {
+    return at(offset);
+  }
+  
+  /// Copies the 1-by-2 submatrix whose first element is (i, j).
+  /// This method does not range-check i or j.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 2> slice_1x2(int i = 0, int j = 0) const {
+    Matrix<Element, 1, 2> sub;
+    int const origin = i * 4 + j;  // row-major offset of element (i, j)
+
+    for (int c = 0; c < 2; ++c) {
+      sub.data[c] = data[origin + c];
+    }
+
+    return sub;
+  }
+
+  /// Overwrites the 1-by-2 submatrix whose first element is (i, j) with m and
+  /// returns *this. This method does not range-check i or j.
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_1x2(Matrix<Element, 1, 2> const &m, int i = 0, int j = 0) {
+    int const origin = i * 4 + j;  // row-major offset of element (i, j)
+
+    for (int c = 0; c < 2; ++c) {
+      data[origin + c] = m.data[c];
+    }
+
+    return *this;
+  }
+    
+  /// Copies the 1-by-3 submatrix whose first element is (i, j).
+  /// This method does not range-check i or j.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 3> slice_1x3(int i = 0, int j = 0) const {
+    Matrix<Element, 1, 3> sub;
+    int const origin = i * 4 + j;  // row-major offset of element (i, j)
+
+    for (int c = 0; c < 3; ++c) {
+      sub.data[c] = data[origin + c];
+    }
+
+    return sub;
+  }
+
+  /// Overwrites the 1-by-3 submatrix whose first element is (i, j) with m and
+  /// returns *this. This method does not range-check i or j.
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_1x3(Matrix<Element, 1, 3> const &m, int i = 0, int j = 0) {
+    int const origin = i * 4 + j;  // row-major offset of element (i, j)
+
+    for (int c = 0; c < 3; ++c) {
+      data[origin + c] = m.data[c];
+    }
+
+    return *this;
+  }
+    
+  /// Copies four consecutive elements starting at (i, j) into a 1-by-4 matrix.
+  /// This method does not range-check i or j.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 4> slice_1x4(int i = 0, int j = 0) const {
+    Matrix<Element, 1, 4> sub;
+    int const origin = i * 4 + j;  // row-major offset of element (i, j)
+
+    for (int c = 0; c < 4; ++c) {
+      sub.data[c] = data[origin + c];
+    }
+
+    return sub;
+  }
+
+  /// Overwrites four consecutive elements starting at (i, j) with the 1-by-4
+  /// matrix m and returns *this. This method does not range-check i or j.
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_1x4(Matrix<Element, 1, 4> const &m, int i = 0, int j = 0) {
+    int const origin = i * 4 + j;  // row-major offset of element (i, j)
+
+    for (int c = 0; c < 4; ++c) {
+      data[origin + c] = m.data[c];
+    }
+
+    return *this;
+  }
+    
+  /// Returns row i as a 1-by-4 matrix (slice_1x4 starting at column 0)
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 1, 4> row(int i) const {
+    Matrix<Element, 1, 4> extracted = slice_1x4(i, 0);
+    return extracted;
+  }
+
+  /// Overwrites row i (default 0) with the 1-by-4 matrix v and returns *this
+  CUTLASS_HOST_DEVICE
+  Matrix &set_row(Matrix<Element, 1, 4> const &v, int i = 0) {
+    Matrix &self = set_slice_1x4(v, i, 0);
+    return self;
+  }
+    
+  /// Copies the 2-by-1 submatrix whose top element is (i, j).
+  /// This method does not range-check i or j.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 1> slice_2x1(int i = 0, int j = 0) const {
+    Matrix<Element, 2, 1> sub;
+    int const origin = i * 4 + j;  // row-major offset of element (i, j)
+
+    for (int r = 0; r < 2; ++r) {
+      sub.data[r] = data[origin + r * 4];  // stepping by 4 moves down one row
+    }
+
+    return sub;
+  }
+
+  /// Overwrites the 2-by-1 submatrix whose top element is (i, j) with m and
+  /// returns *this. This method does not range-check i or j.
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_2x1(Matrix<Element, 2, 1> const &m, int i = 0, int j = 0) {
+    int const origin = i * 4 + j;  // row-major offset of element (i, j)
+
+    for (int r = 0; r < 2; ++r) {
+      data[origin + r * 4] = m.data[r];  // stepping by 4 moves down one row
+    }
+
+    return *this;
+  }
+    
+  /// Copies the 2-by-2 submatrix whose top-left element is (i, j).
+  /// This method does not range-check i or j.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 2> slice_2x2(int i = 0, int j = 0) const {
+    Matrix<Element, 2, 2> sub;
+    int const origin = i * 4 + j;  // row-major offset of element (i, j)
+
+    for (int r = 0; r < 2; ++r) {
+      for (int c = 0; c < 2; ++c) {
+        sub.data[r * 2 + c] = data[origin + r * 4 + c];
+      }
+    }
+
+    return sub;
+  }
+
+  /// Overwrites the 2-by-2 submatrix whose top-left element is (i, j) with m and
+  /// returns *this. This method does not range-check i or j.
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_2x2(Matrix<Element, 2, 2> const &m, int i = 0, int j = 0) {
+    int const origin = i * 4 + j;  // row-major offset of element (i, j)
+
+    for (int r = 0; r < 2; ++r) {
+      for (int c = 0; c < 2; ++c) {
+        data[origin + r * 4 + c] = m.data[r * 2 + c];
+      }
+    }
+
+    return *this;
+  }
+    
+  /// Copies the 2-by-3 submatrix whose top-left element is (i, j).
+  /// This method does not range-check i or j.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 3> slice_2x3(int i = 0, int j = 0) const {
+    Matrix<Element, 2, 3> sub;
+    int const origin = i * 4 + j;  // row-major offset of element (i, j)
+
+    for (int r = 0; r < 2; ++r) {
+      for (int c = 0; c < 3; ++c) {
+        sub.data[r * 3 + c] = data[origin + r * 4 + c];
+      }
+    }
+
+    return sub;
+  }
+
+  /// Overwrites the 2-by-3 submatrix whose top-left element is (i, j) with m and
+  /// returns *this. This method does not range-check i or j.
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_2x3(Matrix<Element, 2, 3> const &m, int i = 0, int j = 0) {
+    int const origin = i * 4 + j;  // row-major offset of element (i, j)
+
+    for (int r = 0; r < 2; ++r) {
+      for (int c = 0; c < 3; ++c) {
+        data[origin + r * 4 + c] = m.data[r * 3 + c];
+      }
+    }
+
+    return *this;
+  }
+    
+  /// Copies eight consecutive elements starting at (i, j) into a 2-by-4 matrix.
+  /// This method does not range-check i or j.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 2, 4> slice_2x4(int i = 0, int j = 0) const {
+    Matrix<Element, 2, 4> sub;
+    int const origin = i * 4 + j;  // row-major offset of element (i, j)
+
+    // Source and destination both have a row stride of 4, so the copy is linear.
+    for (int k = 0; k < 8; ++k) {
+      sub.data[k] = data[origin + k];
+    }
+
+    return sub;
+  }
+
+  /// Overwrites eight consecutive elements starting at (i, j) with the 2-by-4
+  /// matrix m and returns *this. This method does not range-check i or j.
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_2x4(Matrix<Element, 2, 4> const &m, int i = 0, int j = 0) {
+    int const origin = i * 4 + j;  // row-major offset of element (i, j)
+
+    // Source and destination both have a row stride of 4, so the copy is linear.
+    for (int k = 0; k < 8; ++k) {
+      data[origin + k] = m.data[k];
+    }
+
+    return *this;
+  }
+    
+  /// Copies the 3-by-1 submatrix whose top element is (i, j).
+  /// This method does not range-check i or j.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 1> slice_3x1(int i = 0, int j = 0) const {
+    Matrix<Element, 3, 1> sub;
+    int const origin = i * 4 + j;  // row-major offset of element (i, j)
+
+    for (int r = 0; r < 3; ++r) {
+      sub.data[r] = data[origin + r * 4];  // stepping by 4 moves down one row
+    }
+
+    return sub;
+  }
+
+  /// Overwrites the 3-by-1 submatrix whose top element is (i, j) with m and
+  /// returns *this. This method does not range-check i or j.
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_3x1(Matrix<Element, 3, 1> const &m, int i = 0, int j = 0) {
+    int const origin = i * 4 + j;  // row-major offset of element (i, j)
+
+    for (int r = 0; r < 3; ++r) {
+      data[origin + r * 4] = m.data[r];  // stepping by 4 moves down one row
+    }
+
+    return *this;
+  }
+    
+  /// Copies the 3-by-2 submatrix whose top-left element is (i, j).
+  /// This method does not range-check i or j.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 2> slice_3x2(int i = 0, int j = 0) const {
+    Matrix<Element, 3, 2> sub;
+    int const origin = i * 4 + j;  // row-major offset of element (i, j)
+
+    for (int r = 0; r < 3; ++r) {
+      for (int c = 0; c < 2; ++c) {
+        sub.data[r * 2 + c] = data[origin + r * 4 + c];
+      }
+    }
+
+    return sub;
+  }
+
+  /// Overwrites the 3-by-2 submatrix whose top-left element is (i, j) with m and
+  /// returns *this. This method does not range-check i or j.
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_3x2(Matrix<Element, 3, 2> const &m, int i = 0, int j = 0) {
+    int const origin = i * 4 + j;  // row-major offset of element (i, j)
+
+    for (int r = 0; r < 3; ++r) {
+      for (int c = 0; c < 2; ++c) {
+        data[origin + r * 4 + c] = m.data[r * 2 + c];
+      }
+    }
+
+    return *this;
+  }
+    
+  /// Copies the 3-by-3 submatrix whose top-left element is (i, j).
+  /// This method does not range-check i or j.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 3> slice_3x3(int i = 0, int j = 0) const {
+    Matrix<Element, 3, 3> sub;
+    int const origin = i * 4 + j;  // row-major offset of element (i, j)
+
+    for (int r = 0; r < 3; ++r) {
+      for (int c = 0; c < 3; ++c) {
+        sub.data[r * 3 + c] = data[origin + r * 4 + c];
+      }
+    }
+
+    return sub;
+  }
+
+  /// Overwrites the 3-by-3 submatrix whose top-left element is (i, j) with m and
+  /// returns *this. This method does not range-check i or j.
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_3x3(Matrix<Element, 3, 3> const &m, int i = 0, int j = 0) {
+    int const origin = i * 4 + j;  // row-major offset of element (i, j)
+
+    for (int r = 0; r < 3; ++r) {
+      for (int c = 0; c < 3; ++c) {
+        data[origin + r * 4 + c] = m.data[r * 3 + c];
+      }
+    }
+
+    return *this;
+  }
+    
+  /// Copies twelve consecutive elements starting at (i, j) into a 3-by-4 matrix.
+  /// This method does not range-check i or j.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 3, 4> slice_3x4(int i = 0, int j = 0) const {
+    Matrix<Element, 3, 4> sub;
+    int const origin = i * 4 + j;  // row-major offset of element (i, j)
+
+    // Source and destination both have a row stride of 4, so the copy is linear.
+    for (int k = 0; k < 12; ++k) {
+      sub.data[k] = data[origin + k];
+    }
+
+    return sub;
+  }
+
+  /// Overwrites twelve consecutive elements starting at (i, j) with the 3-by-4
+  /// matrix m and returns *this. This method does not range-check i or j.
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_3x4(Matrix<Element, 3, 4> const &m, int i = 0, int j = 0) {
+    int const origin = i * 4 + j;  // row-major offset of element (i, j)
+
+    // Source and destination both have a row stride of 4, so the copy is linear.
+    for (int k = 0; k < 12; ++k) {
+      data[origin + k] = m.data[k];
+    }
+
+    return *this;
+  }
+    
+  /// Copies the 4-by-1 submatrix whose top element is (i, j).
+  /// This method does not range-check i or j.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 1> slice_4x1(int i = 0, int j = 0) const {
+    Matrix<Element, 4, 1> sub;
+    int const origin = i * 4 + j;  // row-major offset of element (i, j)
+
+    for (int r = 0; r < 4; ++r) {
+      sub.data[r] = data[origin + r * 4];  // stepping by 4 moves down one row
+    }
+
+    return sub;
+  }
+
+  /// Overwrites the 4-by-1 submatrix whose top element is (i, j) with m and
+  /// returns *this. This method does not range-check i or j.
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_4x1(Matrix<Element, 4, 1> const &m, int i = 0, int j = 0) {
+    int const origin = i * 4 + j;  // row-major offset of element (i, j)
+
+    for (int r = 0; r < 4; ++r) {
+      data[origin + r * 4] = m.data[r];  // stepping by 4 moves down one row
+    }
+
+    return *this;
+  }
+    
+  /// Returns column j as a 4-by-1 matrix (slice_4x1 starting at row 0)
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 1> column(int j) const {
+    Matrix<Element, 4, 1> extracted = slice_4x1(0, j);
+    return extracted;
+  }
+
+  /// Overwrites column j (default 0) with the 4-by-1 matrix v and returns *this
+  CUTLASS_HOST_DEVICE
+  Matrix &set_column(Matrix<Element, 4, 1> const &v, int j = 0) {
+    Matrix &self = set_slice_4x1(v, 0, j);
+    return self;
+  }
+    
+  /// Copies the 4-by-2 submatrix whose top-left element is (i, j).
+  /// This method does not range-check i or j.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 2> slice_4x2(int i = 0, int j = 0) const {
+    Matrix<Element, 4, 2> sub;
+    int const origin = i * 4 + j;  // row-major offset of element (i, j)
+
+    for (int r = 0; r < 4; ++r) {
+      for (int c = 0; c < 2; ++c) {
+        sub.data[r * 2 + c] = data[origin + r * 4 + c];
+      }
+    }
+
+    return sub;
+  }
+
+  /// Overwrites the 4-by-2 submatrix whose top-left element is (i, j) with m and
+  /// returns *this. This method does not range-check i or j.
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_4x2(Matrix<Element, 4, 2> const &m, int i = 0, int j = 0) {
+    int const origin = i * 4 + j;  // row-major offset of element (i, j)
+
+    for (int r = 0; r < 4; ++r) {
+      for (int c = 0; c < 2; ++c) {
+        data[origin + r * 4 + c] = m.data[r * 2 + c];
+      }
+    }
+
+    return *this;
+  }
+    
+  /// Copies the 4-by-3 submatrix whose top-left element is (i, j).
+  /// This method does not range-check i or j.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 3> slice_4x3(int i = 0, int j = 0) const {
+    Matrix<Element, 4, 3> sub;
+    int const origin = i * 4 + j;  // row-major offset of element (i, j)
+
+    for (int r = 0; r < 4; ++r) {
+      for (int c = 0; c < 3; ++c) {
+        sub.data[r * 3 + c] = data[origin + r * 4 + c];
+      }
+    }
+
+    return sub;
+  }
+
+  /// Overwrites the 4-by-3 submatrix whose top-left element is (i, j) with m and
+  /// returns *this. This method does not range-check i or j.
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_4x3(Matrix<Element, 4, 3> const &m, int i = 0, int j = 0) {
+    int const origin = i * 4 + j;  // row-major offset of element (i, j)
+
+    for (int r = 0; r < 4; ++r) {
+      for (int c = 0; c < 3; ++c) {
+        data[origin + r * 4 + c] = m.data[r * 3 + c];
+      }
+    }
+
+    return *this;
+  }
+    
+  /// Copies sixteen consecutive elements starting at (i, j) into a 4-by-4 matrix.
+  /// This method does not range-check i or j; the storage holds 16 elements, so
+  /// only the default i == 0, j == 0 keeps every read at offsets 0..15.
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 4> slice_4x4(int i = 0, int j = 0) const {
+    Matrix<Element, 4, 4> sub;
+    int const origin = i * 4 + j;  // row-major offset of element (i, j)
+
+    // Source and destination both have a row stride of 4, so the copy is linear.
+    for (int k = 0; k < 16; ++k) {
+      sub.data[k] = data[origin + k];
+    }
+
+    return sub;
+  }
+
+  /// Overwrites sixteen consecutive elements starting at (i, j) with the 4-by-4
+  /// matrix m and returns *this. This method does not range-check i or j; the
+  /// storage holds 16 elements, so only the default i == 0, j == 0 keeps every
+  /// write at offsets 0..15.
+  CUTLASS_HOST_DEVICE
+  Matrix & set_slice_4x4(Matrix<Element, 4, 4> const &m, int i = 0, int j = 0) {
+    int const origin = i * 4 + j;  // row-major offset of element (i, j)
+
+    // Source and destination both have a row stride of 4, so the copy is linear.
+    for (int k = 0; k < 16; ++k) {
+      data[origin + k] = m.data[k];
+    }
+
+    return *this;
+  }
+    
+  /// Forms a 4-by-4 matrix by horizontally concatenating a 4-by-1 matrix (column 0)
+  /// with a 4-by-3 matrix (columns 1..3)
+  CUTLASS_HOST_DEVICE
+  static Matrix hcat(Matrix<Element, 4, 1> const & lhs, Matrix<Element, 4, 3> const & rhs) {
+    Matrix joined;
+
+    for (int r = 0; r < 4; ++r) {
+      joined.data[r * 4] = lhs.at(r, 0);
+      for (int c = 0; c < 3; ++c) {
+        joined.data[r * 4 + 1 + c] = rhs.at(r, c);
+      }
+    }
+
+    return joined;
+  }
+  
+  /// Forms a 4-by-4 matrix by horizontally concatenating a 4-by-2 matrix
+  /// (columns 0..1) with a 4-by-2 matrix (columns 2..3)
+  CUTLASS_HOST_DEVICE
+  static Matrix hcat(Matrix<Element, 4, 2> const & lhs, Matrix<Element, 4, 2> const & rhs) {
+    Matrix joined;
+
+    for (int r = 0; r < 4; ++r) {
+      for (int c = 0; c < 2; ++c) {
+        joined.data[r * 4 + c] = lhs.at(r, c);
+        joined.data[r * 4 + 2 + c] = rhs.at(r, c);
+      }
+    }
+
+    return joined;
+  }
+  
+  /// Forms a 4-by-4 matrix by horizontally concatenating a 4-by-3 matrix
+  /// (columns 0..2) with a 4-by-1 matrix (column 3)
+  CUTLASS_HOST_DEVICE
+  static Matrix hcat(Matrix<Element, 4, 3> const & lhs, Matrix<Element, 4, 1> const & rhs) {
+    Matrix joined;
+
+    for (int r = 0; r < 4; ++r) {
+      for (int c = 0; c < 3; ++c) {
+        joined.data[r * 4 + c] = lhs.at(r, c);
+      }
+      joined.data[r * 4 + 3] = rhs.at(r, 0);
+    }
+
+    return joined;
+  }
+  
+  /// Forms a 4-by-4 matrix by vertically concatenating a 1-by-4 matrix (row 0)
+  /// with a 3-by-4 matrix (rows 1..3)
+  CUTLASS_HOST_DEVICE
+  static Matrix vcat(Matrix<Element, 1, 4> const & upper, Matrix<Element, 3, 4> const & lower) {
+    Matrix stacked;
+
+    for (int c = 0; c < 4; ++c) {
+      stacked.data[c] = upper.at(0, c);
+    }
+    for (int r = 0; r < 3; ++r) {
+      for (int c = 0; c < 4; ++c) {
+        stacked.data[(r + 1) * 4 + c] = lower.at(r, c);
+      }
+    }
+
+    return stacked;
+  }
+  
+  /// Forms a 4-by-4 matrix by vertically concatenating a 2-by-4 matrix (rows 0..1)
+  /// with a 2-by-4 matrix (rows 2..3)
+  CUTLASS_HOST_DEVICE
+  static Matrix vcat(Matrix<Element, 2, 4> const & upper, Matrix<Element, 2, 4> const & lower) {
+    Matrix stacked;
+
+    for (int r = 0; r < 2; ++r) {
+      for (int c = 0; c < 4; ++c) {
+        stacked.data[r * 4 + c] = upper.at(r, c);
+        stacked.data[(r + 2) * 4 + c] = lower.at(r, c);
+      }
+    }
+
+    return stacked;
+  }
+  
+  /// Forms a 4-by-4 matrix by vertically concatenating a 3-by-4 matrix (rows 0..2)
+  /// with a 1-by-4 matrix (row 3)
+  CUTLASS_HOST_DEVICE
+  static Matrix vcat(Matrix<Element, 3, 4> const & upper, Matrix<Element, 1, 4> const & lower) {
+    Matrix stacked;
+
+    for (int r = 0; r < 3; ++r) {
+      for (int c = 0; c < 4; ++c) {
+        stacked.data[r * 4 + c] = upper.at(r, c);
+      }
+    }
+    for (int c = 0; c < 4; ++c) {
+      stacked.data[12 + c] = lower.at(0, c);
+    }
+
+    return stacked;
+  }
+  
+  /// Forms a 4-by-4 matrix from four components laid out as
+  ///   [ A B ]   A: scalar,  B: 1-by-3
+  ///   [ C D ]   C: 3-by-1,  D: 3-by-3
+  CUTLASS_HOST_DEVICE
+  static Matrix block(
+    Element                         A, Matrix<Element, 1, 3> const & B,
+    Matrix<Element, 3, 1> const & C, Matrix<Element, 3, 3> const & D) {
+    Matrix composed;
+
+    // Top row: A followed by B
+    composed.data[0] = A;
+    for (int c = 0; c < 3; ++c) {
+      composed.data[1 + c] = B.at(0, c);
+    }
+
+    // Rows 1..3: C followed by D
+    for (int r = 0; r < 3; ++r) {
+      int const base = (r + 1) * 4;
+      composed.data[base] = C.at(r, 0);
+      for (int c = 0; c < 3; ++c) {
+        composed.data[base + 1 + c] = D.at(r, c);
+      }
+    }
+
+    return composed;
+  }
+  
+  /// Forms a 4-by-4 matrix from four components laid out as
+  ///   [ A B ]   A: 1-by-2,  B: 1-by-2
+  ///   [ C D ]   C: 3-by-2,  D: 3-by-2
+  CUTLASS_HOST_DEVICE
+  static Matrix block(
+    Matrix<Element, 1, 2> const & A, Matrix<Element, 1, 2> const & B,
+    Matrix<Element, 3, 2> const & C, Matrix<Element, 3, 2> const & D) {
+    Matrix composed;
+
+    // Top row: A followed by B
+    for (int c = 0; c < 2; ++c) {
+      composed.data[c] = A.at(0, c);
+      composed.data[2 + c] = B.at(0, c);
+    }
+
+    // Rows 1..3: C followed by D
+    for (int r = 0; r < 3; ++r) {
+      int const base = (r + 1) * 4;
+      for (int c = 0; c < 2; ++c) {
+        composed.data[base + c] = C.at(r, c);
+        composed.data[base + 2 + c] = D.at(r, c);
+      }
+    }
+
+    return composed;
+  }
+  
+  /// Forms a 4-by-4 matrix from four components laid out as
+  ///   [ A B ]   A: 1-by-3,  B: scalar
+  ///   [ C D ]   C: 3-by-3,  D: 3-by-1
+  CUTLASS_HOST_DEVICE
+  static Matrix block(
+    Matrix<Element, 1, 3> const & A, Element                         B,
+    Matrix<Element, 3, 3> const & C, Matrix<Element, 3, 1> const & D) {
+    Matrix composed;
+
+    // Top row: A followed by B
+    for (int c = 0; c < 3; ++c) {
+      composed.data[c] = A.at(0, c);
+    }
+    composed.data[3] = B;
+
+    // Rows 1..3: C followed by D
+    for (int r = 0; r < 3; ++r) {
+      int const base = (r + 1) * 4;
+      for (int c = 0; c < 3; ++c) {
+        composed.data[base + c] = C.at(r, c);
+      }
+      composed.data[base + 3] = D.at(r, 0);
+    }
+
+    return composed;
+  }
+  
+  /// Forms a 4-by-4 matrix from four components laid out as
+  ///   [ A B ]   A: 2-by-1,  B: 2-by-3
+  ///   [ C D ]   C: 2-by-1,  D: 2-by-3
+  CUTLASS_HOST_DEVICE
+  static Matrix block(
+    Matrix<Element, 2, 1> const & A, Matrix<Element, 2, 3> const & B,
+    Matrix<Element, 2, 1> const & C, Matrix<Element, 2, 3> const & D) {
+    Matrix composed;
+
+    for (int r = 0; r < 2; ++r) {
+      int const top = r * 4;           // rows 0..1 come from A and B
+      int const bottom = (r + 2) * 4;  // rows 2..3 come from C and D
+
+      composed.data[top] = A.at(r, 0);
+      composed.data[bottom] = C.at(r, 0);
+      for (int c = 0; c < 3; ++c) {
+        composed.data[top + 1 + c] = B.at(r, c);
+        composed.data[bottom + 1 + c] = D.at(r, c);
+      }
+    }
+
+    return composed;
+  }
+  
+  /// Forms a 4-by-4 matrix from four 2-by-2 components laid out as
+  ///   [ A B ]
+  ///   [ C D ]
+  CUTLASS_HOST_DEVICE
+  static Matrix block(
+    Matrix<Element, 2, 2> const & A, Matrix<Element, 2, 2> const & B,
+    Matrix<Element, 2, 2> const & C, Matrix<Element, 2, 2> const & D) {
+    Matrix composed;
+
+    for (int r = 0; r < 2; ++r) {
+      int const top = r * 4;           // rows 0..1 come from A and B
+      int const bottom = (r + 2) * 4;  // rows 2..3 come from C and D
+
+      for (int c = 0; c < 2; ++c) {
+        composed.data[top + c] = A.at(r, c);
+        composed.data[top + 2 + c] = B.at(r, c);
+        composed.data[bottom + c] = C.at(r, c);
+        composed.data[bottom + 2 + c] = D.at(r, c);
+      }
+    }
+
+    return composed;
+  }
+  
+  /// Forms a 4-by-4 matrix from four components laid out as
+  ///   [ A B ]   A: 2-by-3,  B: 2-by-1
+  ///   [ C D ]   C: 2-by-3,  D: 2-by-1
+  CUTLASS_HOST_DEVICE
+  static Matrix block(
+    Matrix<Element, 2, 3> const & A, Matrix<Element, 2, 1> const & B,
+    Matrix<Element, 2, 3> const & C, Matrix<Element, 2, 1> const & D) {
+    Matrix composed;
+
+    for (int r = 0; r < 2; ++r) {
+      int const top = r * 4;           // rows 0..1 come from A and B
+      int const bottom = (r + 2) * 4;  // rows 2..3 come from C and D
+
+      for (int c = 0; c < 3; ++c) {
+        composed.data[top + c] = A.at(r, c);
+        composed.data[bottom + c] = C.at(r, c);
+      }
+      composed.data[top + 3] = B.at(r, 0);
+      composed.data[bottom + 3] = D.at(r, 0);
+    }
+
+    return composed;
+  }
+  
+  /// Forms a 4-by-4 matrix from four components laid out as
+  ///   [ A B ]   A: 3-by-1,  B: 3-by-3
+  ///   [ C D ]   C: scalar,  D: 1-by-3
+  CUTLASS_HOST_DEVICE
+  static Matrix block(
+    Matrix<Element, 3, 1> const & A, Matrix<Element, 3, 3> const & B,
+    Element                         C, Matrix<Element, 1, 3> const & D) {
+    Matrix composed;
+
+    // Rows 0..2: A followed by B
+    for (int r = 0; r < 3; ++r) {
+      int const base = r * 4;
+      composed.data[base] = A.at(r, 0);
+      for (int c = 0; c < 3; ++c) {
+        composed.data[base + 1 + c] = B.at(r, c);
+      }
+    }
+
+    // Bottom row: C followed by D
+    composed.data[12] = C;
+    for (int c = 0; c < 3; ++c) {
+      composed.data[13 + c] = D.at(0, c);
+    }
+
+    return composed;
+  }
+  
+  /// Forms a 4-by-4 matrix from four components laid out as
+  ///   [ A B ]   A: 3-by-2,  B: 3-by-2
+  ///   [ C D ]   C: 1-by-2,  D: 1-by-2
+  CUTLASS_HOST_DEVICE
+  static Matrix block(
+    Matrix<Element, 3, 2> const & A, Matrix<Element, 3, 2> const & B,
+    Matrix<Element, 1, 2> const & C, Matrix<Element, 1, 2> const & D) {
+    Matrix composed;
+
+    // Rows 0..2: A followed by B
+    for (int r = 0; r < 3; ++r) {
+      int const base = r * 4;
+      for (int c = 0; c < 2; ++c) {
+        composed.data[base + c] = A.at(r, c);
+        composed.data[base + 2 + c] = B.at(r, c);
+      }
+    }
+
+    // Bottom row: C followed by D
+    for (int c = 0; c < 2; ++c) {
+      composed.data[12 + c] = C.at(0, c);
+      composed.data[14 + c] = D.at(0, c);
+    }
+
+    return composed;
+  }
+  
+  /// Forms a 4-by-4 matrix from four components laid out as
+  ///   [ A B ]   A: 3-by-3,  B: 3-by-1
+  ///   [ C D ]   C: 1-by-3,  D: scalar
+  CUTLASS_HOST_DEVICE
+  static Matrix block(
+    Matrix<Element, 3, 3> const & A, Matrix<Element, 3, 1> const & B,
+    Matrix<Element, 1, 3> const & C, Element                         D) {
+    Matrix composed;
+
+    // Rows 0..2: A followed by B
+    for (int r = 0; r < 3; ++r) {
+      int const base = r * 4;
+      for (int c = 0; c < 3; ++c) {
+        composed.data[base + c] = A.at(r, c);
+      }
+      composed.data[base + 3] = B.at(r, 0);
+    }
+
+    // Bottom row: C followed by D
+    for (int c = 0; c < 3; ++c) {
+      composed.data[12 + c] = C.at(0, c);
+    }
+    composed.data[15] = D;
+
+    return composed;
+  }
+  
+  /// Elementwise add (4-by-4): returns a new matrix whose element k is
+  /// data[k] + rhs.data[k]; *this is left unchanged
+  CUTLASS_HOST_DEVICE
+  Matrix add(Matrix const &rhs) const {
+
+    Matrix total;
+
+    for (int idx = 0; idx < 16; ++idx) {
+      total.data[idx] = data[idx] + rhs.data[idx];
+    }
+
+    return total;
+  }
+      
+  /// Binary `+`: forwards to add() and returns the elementwise sum as a new matrix
+  CUTLASS_HOST_DEVICE
+  Matrix operator+(Matrix const &rhs) const {
+    return this->add(rhs);
+  }
+
+  /// In-place elementwise add (4-by-4): adds rhs.data[i] to data[i] for all 16 elements and returns *this
+  CUTLASS_HOST_DEVICE
+  Matrix & operator +=(Matrix const &rhs) {
+    // elements 0..3
+    data[0] += rhs.data[0];
+    data[1] += rhs.data[1];
+    data[2] += rhs.data[2];
+    data[3] += rhs.data[3];
+    // elements 4..7
+    data[4] += rhs.data[4];
+    data[5] += rhs.data[5];
+    data[6] += rhs.data[6];
+    data[7] += rhs.data[7];
+    // elements 8..11
+    data[8] += rhs.data[8];
+    data[9] += rhs.data[9];
+    data[10] += rhs.data[10];
+    data[11] += rhs.data[11];
+    // elements 12..15
+    data[12] += rhs.data[12];
+    data[13] += rhs.data[13];
+    data[14] += rhs.data[14];
+    data[15] += rhs.data[15];
+
+    return *this;
+  }
+        
+  /// Returns a new 4-by-4 matrix whose elements are data[i] - rhs.data[i]; *this is not modified
+  CUTLASS_HOST_DEVICE
+  Matrix subtract(Matrix const &rhs) const {
+
+    Matrix out;
+    // elements 0..3
+    out.data[0] = data[0] - rhs.data[0];
+    out.data[1] = data[1] - rhs.data[1];
+    out.data[2] = data[2] - rhs.data[2];
+    out.data[3] = data[3] - rhs.data[3];
+    // elements 4..7
+    out.data[4] = data[4] - rhs.data[4];
+    out.data[5] = data[5] - rhs.data[5];
+    out.data[6] = data[6] - rhs.data[6];
+    out.data[7] = data[7] - rhs.data[7];
+    // elements 8..11
+    out.data[8] = data[8] - rhs.data[8];
+    out.data[9] = data[9] - rhs.data[9];
+    out.data[10] = data[10] - rhs.data[10];
+    out.data[11] = data[11] - rhs.data[11];
+    // elements 12..15
+    out.data[12] = data[12] - rhs.data[12];
+    out.data[13] = data[13] - rhs.data[13];
+    out.data[14] = data[14] - rhs.data[14];
+    out.data[15] = data[15] - rhs.data[15];
+
+    return out;
+  }
+      
+  /// Binary `-`: forwards to subtract() and returns the elementwise difference as a new matrix
+  CUTLASS_HOST_DEVICE
+  Matrix operator-(Matrix const &rhs) const {
+    return this->subtract(rhs);
+  }
+
+  /// In-place elementwise subtract (4-by-4): subtracts rhs.data[i] from data[i] for all 16 elements and returns *this
+  CUTLASS_HOST_DEVICE
+  Matrix & operator -=(Matrix const &rhs) {
+    // elements 0..3
+    data[0] -= rhs.data[0];
+    data[1] -= rhs.data[1];
+    data[2] -= rhs.data[2];
+    data[3] -= rhs.data[3];
+    // elements 4..7
+    data[4] -= rhs.data[4];
+    data[5] -= rhs.data[5];
+    data[6] -= rhs.data[6];
+    data[7] -= rhs.data[7];
+    // elements 8..11
+    data[8] -= rhs.data[8];
+    data[9] -= rhs.data[9];
+    data[10] -= rhs.data[10];
+    data[11] -= rhs.data[11];
+    // elements 12..15
+    data[12] -= rhs.data[12];
+    data[13] -= rhs.data[13];
+    data[14] -= rhs.data[14];
+    data[15] -= rhs.data[15];
+
+    return *this;
+  }
+        
+  /// Returns the elementwise (Hadamard) product data[i] * rhs.data[i]; this is not the matrix product, see product()
+  CUTLASS_HOST_DEVICE
+  Matrix multiply(Matrix const &rhs) const {
+
+    Matrix out;
+    // elements 0..3
+    out.data[0] = data[0] * rhs.data[0];
+    out.data[1] = data[1] * rhs.data[1];
+    out.data[2] = data[2] * rhs.data[2];
+    out.data[3] = data[3] * rhs.data[3];
+    // elements 4..7
+    out.data[4] = data[4] * rhs.data[4];
+    out.data[5] = data[5] * rhs.data[5];
+    out.data[6] = data[6] * rhs.data[6];
+    out.data[7] = data[7] * rhs.data[7];
+    // elements 8..11
+    out.data[8] = data[8] * rhs.data[8];
+    out.data[9] = data[9] * rhs.data[9];
+    out.data[10] = data[10] * rhs.data[10];
+    out.data[11] = data[11] * rhs.data[11];
+    // elements 12..15
+    out.data[12] = data[12] * rhs.data[12];
+    out.data[13] = data[13] * rhs.data[13];
+    out.data[14] = data[14] * rhs.data[14];
+    out.data[15] = data[15] * rhs.data[15];
+
+    return out;
+  }
+      
+  /// Returns a new 4-by-4 matrix whose elements are data[i] * s (scalar on the right); *this is not modified
+  CUTLASS_HOST_DEVICE
+  Matrix multiply(Element const &s) const {
+
+    Matrix out;
+    // elements 0..3
+    out.data[0] = data[0] * s;
+    out.data[1] = data[1] * s;
+    out.data[2] = data[2] * s;
+    out.data[3] = data[3] * s;
+    // elements 4..7
+    out.data[4] = data[4] * s;
+    out.data[5] = data[5] * s;
+    out.data[6] = data[6] * s;
+    out.data[7] = data[7] * s;
+    // elements 8..11
+    out.data[8] = data[8] * s;
+    out.data[9] = data[9] * s;
+    out.data[10] = data[10] * s;
+    out.data[11] = data[11] * s;
+    // elements 12..15
+    out.data[12] = data[12] * s;
+    out.data[13] = data[13] * s;
+    out.data[14] = data[14] * s;
+    out.data[15] = data[15] * s;
+
+    return out;
+  }
+
+  /// Binary `*` with a scalar on the right: forwards to multiply(s) and returns the scaled matrix
+  CUTLASS_HOST_DEVICE
+  Matrix operator*(Element const &s) const {
+    return this->multiply(s);
+  }
+
+  /// In-place scalar multiply (4-by-4): multiplies each of the 16 elements by s and returns *this
+  CUTLASS_HOST_DEVICE
+  Matrix & operator *=(Element const &s) {
+    // elements 0..3
+    data[0] *= s;
+    data[1] *= s;
+    data[2] *= s;
+    data[3] *= s;
+    // elements 4..7
+    data[4] *= s;
+    data[5] *= s;
+    data[6] *= s;
+    data[7] *= s;
+    // elements 8..11
+    data[8] *= s;
+    data[9] *= s;
+    data[10] *= s;
+    data[11] *= s;
+    // elements 12..15
+    data[12] *= s;
+    data[13] *= s;
+    data[14] *= s;
+    data[15] *= s;
+
+    return *this;
+  }
+        
+  /// Returns a new 4-by-4 matrix whose elements are data[i] / rhs.data[i]; divisors are not checked for zero
+  CUTLASS_HOST_DEVICE
+  Matrix divide(Matrix const &rhs) const {
+
+    Matrix out;
+    // elements 0..3
+    out.data[0] = data[0] / rhs.data[0];
+    out.data[1] = data[1] / rhs.data[1];
+    out.data[2] = data[2] / rhs.data[2];
+    out.data[3] = data[3] / rhs.data[3];
+    // elements 4..7
+    out.data[4] = data[4] / rhs.data[4];
+    out.data[5] = data[5] / rhs.data[5];
+    out.data[6] = data[6] / rhs.data[6];
+    out.data[7] = data[7] / rhs.data[7];
+    // elements 8..11
+    out.data[8] = data[8] / rhs.data[8];
+    out.data[9] = data[9] / rhs.data[9];
+    out.data[10] = data[10] / rhs.data[10];
+    out.data[11] = data[11] / rhs.data[11];
+    // elements 12..15
+    out.data[12] = data[12] / rhs.data[12];
+    out.data[13] = data[13] / rhs.data[13];
+    out.data[14] = data[14] / rhs.data[14];
+    out.data[15] = data[15] / rhs.data[15];
+
+    return out;
+  }
+      
+  /// Returns a new 4-by-4 matrix whose elements are data[i] / s; s is not checked for zero
+  CUTLASS_HOST_DEVICE
+  Matrix divide(Element const &s) const {
+
+    Matrix out;
+    // elements 0..3
+    out.data[0] = data[0] / s;
+    out.data[1] = data[1] / s;
+    out.data[2] = data[2] / s;
+    out.data[3] = data[3] / s;
+    // elements 4..7
+    out.data[4] = data[4] / s;
+    out.data[5] = data[5] / s;
+    out.data[6] = data[6] / s;
+    out.data[7] = data[7] / s;
+    // elements 8..11
+    out.data[8] = data[8] / s;
+    out.data[9] = data[9] / s;
+    out.data[10] = data[10] / s;
+    out.data[11] = data[11] / s;
+    // elements 12..15
+    out.data[12] = data[12] / s;
+    out.data[13] = data[13] / s;
+    out.data[14] = data[14] / s;
+    out.data[15] = data[15] / s;
+
+    return out;
+  }
+
+  /// Binary `/` with a scalar divisor: forwards to divide(s) and returns the scaled matrix
+  CUTLASS_HOST_DEVICE
+  Matrix operator/(Element const &s) const {
+    return this->divide(s);
+  }
+
+  /// In-place scalar divide (4-by-4): divides each of the 16 elements by s and returns *this; s is not checked for zero
+  CUTLASS_HOST_DEVICE
+  Matrix & operator /=(Element const &s) {
+    // elements 0..3
+    data[0] /= s;
+    data[1] /= s;
+    data[2] /= s;
+    data[3] /= s;
+    // elements 4..7
+    data[4] /= s;
+    data[5] /= s;
+    data[6] /= s;
+    data[7] /= s;
+    // elements 8..11
+    data[8] /= s;
+    data[9] /= s;
+    data[10] /= s;
+    data[11] /= s;
+    // elements 12..15
+    data[12] /= s;
+    data[13] /= s;
+    data[14] /= s;
+    data[15] /= s;
+
+    return *this;
+  }
+        
+  /// Binary `/` with a matrix divisor: forwards to divide(rhs) and returns the elementwise quotient
+  CUTLASS_HOST_DEVICE
+  Matrix operator/(Matrix const &rhs) const {
+    return this->divide(rhs);
+  }
+
+  /// In-place elementwise divide (4-by-4): divides data[i] by rhs.data[i] for all 16 elements and returns *this
+  CUTLASS_HOST_DEVICE
+  Matrix & operator /=(Matrix const &rhs) {
+    // elements 0..3 (divisors are not checked for zero)
+    data[0] /= rhs.data[0];
+    data[1] /= rhs.data[1];
+    data[2] /= rhs.data[2];
+    data[3] /= rhs.data[3];
+    // elements 4..7
+    data[4] /= rhs.data[4];
+    data[5] /= rhs.data[5];
+    data[6] /= rhs.data[6];
+    data[7] /= rhs.data[7];
+    // elements 8..11
+    data[8] /= rhs.data[8];
+    data[9] /= rhs.data[9];
+    data[10] /= rhs.data[10];
+    data[11] /= rhs.data[11];
+    // elements 12..15
+    data[12] /= rhs.data[12];
+    data[13] /= rhs.data[13];
+    data[14] /= rhs.data[14];
+    data[15] /= rhs.data[15];
+
+    return *this;
+  }
+        
+  /// Unary minus: returns a new matrix whose elements are the negation of this matrix's elements
+  CUTLASS_HOST_DEVICE
+  Matrix operator-() const {
+    Matrix m;
+    // Read from this->data; negating m.data would ignore the operand and only negate the fresh local
+    m.data[0] = -data[0];
+    m.data[1] = -data[1];
+    m.data[2] = -data[2];
+    m.data[3] = -data[3];
+    m.data[4] = -data[4];
+    m.data[5] = -data[5];
+    m.data[6] = -data[6];
+    m.data[7] = -data[7];
+    m.data[8] = -data[8];
+    m.data[9] = -data[9];
+    m.data[10] = -data[10];
+    m.data[11] = -data[11];
+    m.data[12] = -data[12];
+    m.data[13] = -data[13];
+    m.data[14] = -data[14];
+    m.data[15] = -data[15];
+
+    return m;
+  }
+  
+  /// Matrix product of size 4-by-1-by-4: returns accum + (*this) * rhs, where rhs and the result are 4-by-1
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 1> product(
+    Matrix<Element, 4, 1> const &rhs,
+    Matrix<Element, 4, 1> accum = Matrix<Element, 4, 1>()
+  ) const {
+    // accum is taken by value and updated as accum[i] += data[4 * i + k] * rhs[k], unrolled over k
+    // k=0
+    accum.data[0] += data[0] * rhs.data[0];
+    accum.data[1] += data[4] * rhs.data[0];
+    accum.data[2] += data[8] * rhs.data[0];
+    accum.data[3] += data[12] * rhs.data[0];
+
+    // k=1
+    accum.data[0] += data[1] * rhs.data[1];
+    accum.data[1] += data[5] * rhs.data[1];
+    accum.data[2] += data[9] * rhs.data[1];
+    accum.data[3] += data[13] * rhs.data[1];
+
+    // k=2
+    accum.data[0] += data[2] * rhs.data[2];
+    accum.data[1] += data[6] * rhs.data[2];
+    accum.data[2] += data[10] * rhs.data[2];
+    accum.data[3] += data[14] * rhs.data[2];
+
+    // k=3
+    accum.data[0] += data[3] * rhs.data[3];
+    accum.data[1] += data[7] * rhs.data[3];
+    accum.data[2] += data[11] * rhs.data[3];
+    accum.data[3] += data[15] * rhs.data[3];
+
+    return accum;
+  }
+
+  /// Binary `*` with a 4-by-1 right operand: forwards to product(rhs) using its default accumulator
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 1> operator*(Matrix<Element, 4, 1> const &rhs) const {
+    return this->product(rhs);
+  }
+  
+  /// Matrix product of size 4-by-2-by-4: returns accum + (*this) * rhs, where rhs and the result are 4-by-2
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 2> product(
+    Matrix<Element, 4, 2> const &rhs,
+    Matrix<Element, 4, 2> accum = Matrix<Element, 4, 2>()
+  ) const {
+    // accum is taken by value and updated as accum[2 * i + j] += data[4 * i + k] * rhs[2 * k + j], unrolled over k
+    // k=0
+    accum.data[0] += data[0] * rhs.data[0];
+    accum.data[1] += data[0] * rhs.data[1];
+    accum.data[2] += data[4] * rhs.data[0];
+    accum.data[3] += data[4] * rhs.data[1];
+    accum.data[4] += data[8] * rhs.data[0];
+    accum.data[5] += data[8] * rhs.data[1];
+    accum.data[6] += data[12] * rhs.data[0];
+    accum.data[7] += data[12] * rhs.data[1];
+
+    // k=1
+    accum.data[0] += data[1] * rhs.data[2];
+    accum.data[1] += data[1] * rhs.data[3];
+    accum.data[2] += data[5] * rhs.data[2];
+    accum.data[3] += data[5] * rhs.data[3];
+    accum.data[4] += data[9] * rhs.data[2];
+    accum.data[5] += data[9] * rhs.data[3];
+    accum.data[6] += data[13] * rhs.data[2];
+    accum.data[7] += data[13] * rhs.data[3];
+
+    // k=2
+    accum.data[0] += data[2] * rhs.data[4];
+    accum.data[1] += data[2] * rhs.data[5];
+    accum.data[2] += data[6] * rhs.data[4];
+    accum.data[3] += data[6] * rhs.data[5];
+    accum.data[4] += data[10] * rhs.data[4];
+    accum.data[5] += data[10] * rhs.data[5];
+    accum.data[6] += data[14] * rhs.data[4];
+    accum.data[7] += data[14] * rhs.data[5];
+
+    // k=3
+    accum.data[0] += data[3] * rhs.data[6];
+    accum.data[1] += data[3] * rhs.data[7];
+    accum.data[2] += data[7] * rhs.data[6];
+    accum.data[3] += data[7] * rhs.data[7];
+    accum.data[4] += data[11] * rhs.data[6];
+    accum.data[5] += data[11] * rhs.data[7];
+    accum.data[6] += data[15] * rhs.data[6];
+    accum.data[7] += data[15] * rhs.data[7];
+
+    return accum;
+  }
+
+  /// Binary `*` with a 4-by-2 right operand: forwards to product(rhs) using its default accumulator
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 2> operator*(Matrix<Element, 4, 2> const &rhs) const {
+    return this->product(rhs);
+  }
+  
+  /// Matrix product of size 4-by-3-by-4: returns accum + (*this) * rhs, where rhs and the result are 4-by-3
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 3> product(
+    Matrix<Element, 4, 3> const &rhs,
+    Matrix<Element, 4, 3> accum = Matrix<Element, 4, 3>()
+  ) const {
+    // accum is taken by value and updated as accum[3 * i + j] += data[4 * i + k] * rhs[3 * k + j], unrolled over k
+    // k=0
+    accum.data[0] += data[0] * rhs.data[0];
+    accum.data[1] += data[0] * rhs.data[1];
+    accum.data[2] += data[0] * rhs.data[2];
+    accum.data[3] += data[4] * rhs.data[0];
+    accum.data[4] += data[4] * rhs.data[1];
+    accum.data[5] += data[4] * rhs.data[2];
+    accum.data[6] += data[8] * rhs.data[0];
+    accum.data[7] += data[8] * rhs.data[1];
+    accum.data[8] += data[8] * rhs.data[2];
+    accum.data[9] += data[12] * rhs.data[0];
+    accum.data[10] += data[12] * rhs.data[1];
+    accum.data[11] += data[12] * rhs.data[2];
+
+    // k=1
+    accum.data[0] += data[1] * rhs.data[3];
+    accum.data[1] += data[1] * rhs.data[4];
+    accum.data[2] += data[1] * rhs.data[5];
+    accum.data[3] += data[5] * rhs.data[3];
+    accum.data[4] += data[5] * rhs.data[4];
+    accum.data[5] += data[5] * rhs.data[5];
+    accum.data[6] += data[9] * rhs.data[3];
+    accum.data[7] += data[9] * rhs.data[4];
+    accum.data[8] += data[9] * rhs.data[5];
+    accum.data[9] += data[13] * rhs.data[3];
+    accum.data[10] += data[13] * rhs.data[4];
+    accum.data[11] += data[13] * rhs.data[5];
+
+    // k=2
+    accum.data[0] += data[2] * rhs.data[6];
+    accum.data[1] += data[2] * rhs.data[7];
+    accum.data[2] += data[2] * rhs.data[8];
+    accum.data[3] += data[6] * rhs.data[6];
+    accum.data[4] += data[6] * rhs.data[7];
+    accum.data[5] += data[6] * rhs.data[8];
+    accum.data[6] += data[10] * rhs.data[6];
+    accum.data[7] += data[10] * rhs.data[7];
+    accum.data[8] += data[10] * rhs.data[8];
+    accum.data[9] += data[14] * rhs.data[6];
+    accum.data[10] += data[14] * rhs.data[7];
+    accum.data[11] += data[14] * rhs.data[8];
+
+    // k=3
+    accum.data[0] += data[3] * rhs.data[9];
+    accum.data[1] += data[3] * rhs.data[10];
+    accum.data[2] += data[3] * rhs.data[11];
+    accum.data[3] += data[7] * rhs.data[9];
+    accum.data[4] += data[7] * rhs.data[10];
+    accum.data[5] += data[7] * rhs.data[11];
+    accum.data[6] += data[11] * rhs.data[9];
+    accum.data[7] += data[11] * rhs.data[10];
+    accum.data[8] += data[11] * rhs.data[11];
+    accum.data[9] += data[15] * rhs.data[9];
+    accum.data[10] += data[15] * rhs.data[10];
+    accum.data[11] += data[15] * rhs.data[11];
+
+    return accum;
+  }
+
+  /// Binary `*` with a 4-by-3 right operand: forwards to product(rhs) using its default accumulator
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 3> operator*(Matrix<Element, 4, 3> const &rhs) const {
+    return this->product(rhs);
+  }
+  
+  /// Matrix product of size 4-by-4-by-4: returns accum + (*this) * rhs, where rhs and the result are 4-by-4
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 4> product(
+    Matrix<Element, 4, 4> const &rhs,
+    Matrix<Element, 4, 4> accum = Matrix<Element, 4, 4>()
+  ) const {
+    // accum is taken by value and updated as accum[4 * i + j] += data[4 * i + k] * rhs[4 * k + j], unrolled over k
+    // k=0
+    accum.data[0] += data[0] * rhs.data[0];
+    accum.data[1] += data[0] * rhs.data[1];
+    accum.data[2] += data[0] * rhs.data[2];
+    accum.data[3] += data[0] * rhs.data[3];
+    accum.data[4] += data[4] * rhs.data[0];
+    accum.data[5] += data[4] * rhs.data[1];
+    accum.data[6] += data[4] * rhs.data[2];
+    accum.data[7] += data[4] * rhs.data[3];
+    accum.data[8] += data[8] * rhs.data[0];
+    accum.data[9] += data[8] * rhs.data[1];
+    accum.data[10] += data[8] * rhs.data[2];
+    accum.data[11] += data[8] * rhs.data[3];
+    accum.data[12] += data[12] * rhs.data[0];
+    accum.data[13] += data[12] * rhs.data[1];
+    accum.data[14] += data[12] * rhs.data[2];
+    accum.data[15] += data[12] * rhs.data[3];
+
+    // k=1
+    accum.data[0] += data[1] * rhs.data[4];
+    accum.data[1] += data[1] * rhs.data[5];
+    accum.data[2] += data[1] * rhs.data[6];
+    accum.data[3] += data[1] * rhs.data[7];
+    accum.data[4] += data[5] * rhs.data[4];
+    accum.data[5] += data[5] * rhs.data[5];
+    accum.data[6] += data[5] * rhs.data[6];
+    accum.data[7] += data[5] * rhs.data[7];
+    accum.data[8] += data[9] * rhs.data[4];
+    accum.data[9] += data[9] * rhs.data[5];
+    accum.data[10] += data[9] * rhs.data[6];
+    accum.data[11] += data[9] * rhs.data[7];
+    accum.data[12] += data[13] * rhs.data[4];
+    accum.data[13] += data[13] * rhs.data[5];
+    accum.data[14] += data[13] * rhs.data[6];
+    accum.data[15] += data[13] * rhs.data[7];
+
+    // k=2
+    accum.data[0] += data[2] * rhs.data[8];
+    accum.data[1] += data[2] * rhs.data[9];
+    accum.data[2] += data[2] * rhs.data[10];
+    accum.data[3] += data[2] * rhs.data[11];
+    accum.data[4] += data[6] * rhs.data[8];
+    accum.data[5] += data[6] * rhs.data[9];
+    accum.data[6] += data[6] * rhs.data[10];
+    accum.data[7] += data[6] * rhs.data[11];
+    accum.data[8] += data[10] * rhs.data[8];
+    accum.data[9] += data[10] * rhs.data[9];
+    accum.data[10] += data[10] * rhs.data[10];
+    accum.data[11] += data[10] * rhs.data[11];
+    accum.data[12] += data[14] * rhs.data[8];
+    accum.data[13] += data[14] * rhs.data[9];
+    accum.data[14] += data[14] * rhs.data[10];
+    accum.data[15] += data[14] * rhs.data[11];
+
+    // k=3
+    accum.data[0] += data[3] * rhs.data[12];
+    accum.data[1] += data[3] * rhs.data[13];
+    accum.data[2] += data[3] * rhs.data[14];
+    accum.data[3] += data[3] * rhs.data[15];
+    accum.data[4] += data[7] * rhs.data[12];
+    accum.data[5] += data[7] * rhs.data[13];
+    accum.data[6] += data[7] * rhs.data[14];
+    accum.data[7] += data[7] * rhs.data[15];
+    accum.data[8] += data[11] * rhs.data[12];
+    accum.data[9] += data[11] * rhs.data[13];
+    accum.data[10] += data[11] * rhs.data[14];
+    accum.data[11] += data[11] * rhs.data[15];
+    accum.data[12] += data[15] * rhs.data[12];
+    accum.data[13] += data[15] * rhs.data[13];
+    accum.data[14] += data[15] * rhs.data[14];
+    accum.data[15] += data[15] * rhs.data[15];
+
+    return accum;
+  }
+
+  /// Binary `*` with a 4-by-4 right operand: forwards to product(rhs) using its default accumulator
+  CUTLASS_HOST_DEVICE
+  Matrix<Element, 4, 4> operator*(Matrix<Element, 4, 4> const &rhs) const {
+    return this->product(rhs);
+  }
+  
+  /// In-place matrix product: replaces *this with (*this) * rhs, computed by product(), and returns *this
+  CUTLASS_HOST_DEVICE
+  Matrix &operator*=(Matrix<Element, 4, 4> const &rhs) {
+    *this = this->product(rhs);
+    return *this;
+  }
+    
+  /// Returns accum plus the sum of all 16 elements; accum defaults to a value-initialized Element
+  CUTLASS_HOST_DEVICE
+  Element sum(Element accum = Element()) const {
+    // Elements are added one at a time in storage order, data[0] through data[15]
+    accum += data[0];
+    accum += data[1];
+    accum += data[2];
+    accum += data[3];
+    accum += data[4];
+    accum += data[5];
+    accum += data[6];
+    accum += data[7];
+    accum += data[8];
+    accum += data[9];
+    accum += data[10];
+    accum += data[11];
+    accum += data[12];
+    accum += data[13];
+    accum += data[14];
+    accum += data[15];
+
+    return accum;
+  }
+
+  /// Returns accum plus the sum of squared elements (data[i] * data[i]); no square root is taken, see magnitude()
+  CUTLASS_HOST_DEVICE
+  Element norm(Element accum = Element()) const {
+    // Squares are accumulated one at a time in storage order, data[0] through data[15]
+    accum += data[0] * data[0];
+    accum += data[1] * data[1];
+    accum += data[2] * data[2];
+    accum += data[3] * data[3];
+    accum += data[4] * data[4];
+    accum += data[5] * data[5];
+    accum += data[6] * data[6];
+    accum += data[7] * data[7];
+    accum += data[8] * data[8];
+    accum += data[9] * data[9];
+    accum += data[10] * data[10];
+    accum += data[11] * data[11];
+    accum += data[12] * data[12];
+    accum += data[13] * data[13];
+    accum += data[14] * data[14];
+    accum += data[15] * data[15];
+
+    return accum;
+  }
+
+  /// Returns fast_sqrt of norm(), i.e. the square root of the sum of squared elements
+  CUTLASS_HOST_DEVICE
+  Element magnitude() const {
+    return fast_sqrt(this->norm());
+  }
+
+  /// Returns accum plus the sum of the four diagonal elements; accum defaults to a value-initialized Element
+  CUTLASS_HOST_DEVICE
+  Element trace(Element accum = Element()) const {
+    // The diagonal lives at linear indices 0, 5, 10 and 15 (stride of 5 in the 16-element storage)
+    accum += data[0];
+    accum += data[5];
+    accum += data[10];
+    accum += data[15];
+
+    return accum;
+  }
+    
+  /// Builds a 4-by-4 rotation by angle theta about the X axis, starting from the identity matrix
+  CUTLASS_HOST_DEVICE
+  static Matrix rotation_X(Element theta) {
+    Matrix rot = identity();
+
+    Element cos_theta = fast_cos(theta);
+    Element sin_theta = fast_sin(theta);
+
+    rot.at(1, 1) = cos_theta;
+    rot.at(1, 2) = -sin_theta;
+    rot.at(2, 1) = sin_theta;
+    rot.at(2, 2) = cos_theta;
+
+    return rot;
+  }
+
+  /// Builds a 4-by-4 rotation by angle theta about the Y axis, starting from the identity matrix
+  CUTLASS_HOST_DEVICE
+  static Matrix rotation_Y(Element theta) {
+    Matrix rot = identity();
+
+    Element cos_theta = fast_cos(theta);
+    Element sin_theta = fast_sin(theta);
+
+    rot.at(0, 0) = cos_theta;
+    rot.at(2, 0) = -sin_theta;
+    rot.at(0, 2) = sin_theta;
+    rot.at(2, 2) = cos_theta;
+
+    return rot;
+  }
+
+  /// Builds a 4-by-4 rotation by angle theta about the Z axis, starting from the identity matrix
+  CUTLASS_HOST_DEVICE
+  static Matrix rotation_Z(Element theta) {
+    Matrix rot = identity();
+
+    Element cos_theta = fast_cos(theta);
+    Element sin_theta = fast_sin(theta);
+
+    rot.at(0, 0) = cos_theta;
+    rot.at(0, 1) = -sin_theta;
+    rot.at(1, 0) = sin_theta;
+    rot.at(1, 1) = cos_theta;
+
+    return rot;
+  }
+
+  /// Returns a 4-by-4 rotation by angle theta about the unit-length axis u (Rodrigues' rotation formula)
+  CUTLASS_HOST_DEVICE
+  static Matrix rotation(Element theta, Matrix<Element, 3, 1> const &u) {
+    Element x = u.data[0];
+    Element y = u.data[1];
+    Element z = u.data[2];
+
+    Element c = fast_cos(theta);
+    Element s = fast_sin(theta);
+    // Reuse the cosine computed above instead of evaluating fast_cos(theta) a second time
+    Element one_minus_cos = Element(1) - c;
+    // Start from identity so the entries outside the 3x3 block match rotation_X/Y/Z and reflection
+    Matrix m = Matrix::identity();
+    // Upper-left 3x3 block: R = c * I + s * [u]_x + (1 - c) * u * u^T
+    m.set_slice_3x3({
+      c + x * x * one_minus_cos, x * y * one_minus_cos - z * s, x * z * one_minus_cos + y * s,
+      y * x * one_minus_cos + z * s, c + y * y * one_minus_cos, y * z * one_minus_cos - x * s,
+      z * x * one_minus_cos - y * s, z * y * one_minus_cos + x * s, c + z * z * one_minus_cos
+    });
+
+    return m;
+  }
+
+  /// Builds a 4-by-4 reflection about the plane whose unit-length normal is n_unit:
+  /// the upper-left 3x3 block is I - 2 * n * n^T, the rest stays identity
+  CUTLASS_HOST_DEVICE
+  static Matrix reflection(Matrix<Element, 3, 1> const &n_unit) {
+
+    Element nx = n_unit.data[0];
+    Element ny = n_unit.data[1];
+    Element nz = n_unit.data[2];
+
+    Matrix refl = identity();
+
+    refl.set_slice_3x3({
+      Element(1) - Element(2) * nx * nx, Element(-2) * nx * ny, Element(-2) * nx * nz,
+      Element(-2) * nx * ny, Element(1) - Element(2) * ny * ny, Element(-2) * ny * nz,
+      Element(-2) * nx * nz, Element(-2) * ny * nz, Element(1) - Element(2) * nz * nz
+    });
+
+    return refl;
+  }
+
+  /// Returns a perspective projection matrix typical of OpenGL applications (bottom row is [0 0 -1 0])
+  CUTLASS_HOST_DEVICE
+  static Matrix perspective(Element near_plane, Element far_plane, Element fovH, Element fovV) {
+    Element aspect = fovH / fovV;
+    Element f = Element(cos(fovV)) / Element(fovH);  // NOTE(review): gluPerspective uses cot(fovy / 2) here; confirm this scale is intended
+    Element Q = near_plane - far_plane;  // used as a divisor below; near_plane == far_plane is not guarded against
+    // Third row carries the depth terms (near + far) / Q and 2 * far * near / Q
+    return Matrix(
+      f / aspect, 0,                0,                           0,
+      0,          f,                0,                           0,
+      0,          0, (near_plane + far_plane) / Q, Element(2) * far_plane * near_plane / Q,
+      0,          0,                -1,                          0
+    );
+  }
+  /// Returns a 4-by-4 translation matrix: identity with v.data[0..2] placed in the last column of rows 0..2
+  CUTLASS_HOST_DEVICE
+  static Matrix translation(Matrix<Element, 3, 1> const &v) {
+    return Matrix(
+      1, 0, 0, v.data[0],
+      0, 1, 0, v.data[1],
+      0, 0, 1, v.data[2],
+      0, 0, 0, 1
+    );
+  }
+  
+  /// Computes accum + det(*this) for a 4-by-4 matrix by cofactor expansion along row 0
+  CUTLASS_HOST_DEVICE
+  Element determinant(Element accum = Element()) const {
+    // Each term is at(0, j) times the determinant of the 3x3 minor omitting row 0 and column j; signs alternate + - + -
+    accum += at(0, 0) * Matrix<Element, 3, 3>({ at(1, 1), at(1, 2), at(1, 3), at(2, 1), at(2, 2), at(2, 3), at(3, 1), at(3, 2), at(3, 3) }).determinant();
+    accum -= at(0, 1) * Matrix<Element, 3, 3>({ at(1, 0), at(1, 2), at(1, 3), at(2, 0), at(2, 2), at(2, 3), at(3, 0), at(3, 2), at(3, 3) }).determinant();
+    accum += at(0, 2) * Matrix<Element, 3, 3>({ at(1, 0), at(1, 1), at(1, 3), at(2, 0), at(2, 1), at(2, 3), at(3, 0), at(3, 1), at(3, 3) }).determinant();
+    accum -= at(0, 3) * Matrix<Element, 3, 3>({ at(1, 0), at(1, 1), at(1, 2), at(2, 0), at(2, 1), at(2, 2), at(3, 0), at(3, 1), at(3, 2) }).determinant();
+
+    return accum;
+  }
+  
+  /// Inverts the 4-by-4 matrix blockwise from its four 2x2 slices [A B; C D]; the `ignore` argument is unused
+  CUTLASS_HOST_DEVICE
+  Matrix inverse(Element ignore = 1) const {
+    Matrix<Element, 2, 2> A = slice_2x2(0, 0);  // 2x2 slice at (0, 0)
+    Matrix<Element, 2, 2> B = slice_2x2(0, 2);  // 2x2 slice at (0, 2)
+    Matrix<Element, 2, 2> C = slice_2x2(2, 0);  // 2x2 slice at (2, 0)
+    Matrix<Element, 2, 2> D = slice_2x2(2, 2);  // 2x2 slice at (2, 2)
+    // D is inverted directly; nothing in this function checks that D is invertible
+    Matrix<Element, 2, 2> D_inv = D.inverse();
+    // S is the inverse of the Schur complement of D: (A - B * D^-1 * C)^-1
+    Matrix<Element, 2, 2> S = (A - B * D_inv * C).inverse();
+
+    return Matrix::block(
+      S,              -S * B * D_inv,
+      -D_inv * C * S, D_inv + D_inv * C * S * B * D_inv
+    );
+  }
+    
+};
+
+/// Template alias: Matrix4x4<Element> names the 4-by-4 specialization Matrix<Element, 4, 4>
+template <typename Element>
+using Matrix4x4 = Matrix<Element, 4, 4>;
+
+
+/// Free function that builds a Matrix4x4, letting Element be deduced from the sixteen arguments (_r_c = row r, column c)
+template <typename Element>
+CUTLASS_HOST_DEVICE Matrix4x4<Element> make_Matrix4x4(
+    Element _0_0, Element _0_1, Element _0_2, Element _0_3,
+    Element _1_0, Element _1_1, Element _1_2, Element _1_3,
+    Element _2_0, Element _2_1, Element _2_2, Element _2_3,
+    Element _3_0, Element _3_1, Element _3_2, Element _3_3
+) {
+  return Matrix<Element, 4, 4>(
+      _0_0, _0_1, _0_2, _0_3,
+      _1_0, _1_1, _1_2, _1_3,
+      _2_0, _2_1, _2_2, _2_3,
+      _3_0, _3_1, _3_2, _3_3
+  );
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Scalar-on-the-left multiplication for any Matrix shape: delegates to rhs.multiply(s) and returns its result
+template <typename Element, int Rows, int Columns>
+CUTLASS_HOST_DEVICE
+Matrix<Element, Rows, Columns> operator*(Element s, Matrix<Element, Rows, Columns> const &rhs) {
+  return rhs.multiply(s);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/matrix_coord.h b/lightllm-kernel/cutlass/include/cutlass/matrix_coord.h
new file mode 100755
index 000000000..719575d59
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/matrix_coord.h
@@ -0,0 +1,164 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines a canonical coordinate for rank=2 matrices offering named indices.
+*/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/coord.h"
+
+namespace cutlass {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// MatrixCoord is a Coord<2, int> with named accessors: index 0 is the row and index 1 is the
+/// column. Code that works in the rank=2 index space of a matrix should take a MatrixCoord.
+struct MatrixCoord : public Coord<2, int> {
+
+public:
+
+  /// Integer type of each coordinate component
+  using Index = int;
+
+  /// The underlying rank=2 coordinate type
+  using Base = Coord<2, Index>;
+
+  /// Wide index type, taken from the base coordinate
+  using LongIndex = Base::LongIndex;
+
+private:
+
+  /// Position of the row component inside the base coordinate
+  static int const kRow = 0;
+
+  /// Position of the column component inside the base coordinate
+  static int const kColumn = 1;
+
+public:
+
+  //
+  // Construction and named accessors
+  //
+
+  /// Default constructor; initialization is whatever the base Coord's default constructor does
+  CUTLASS_HOST_DEVICE
+  MatrixCoord() { }
+
+  /// Converts a plain rank=2 Coord into a MatrixCoord
+  CUTLASS_HOST_DEVICE
+  MatrixCoord(Coord<2, Index> const &coord) : Base(coord) { }
+
+  /// Builds the coordinate (row, column) from two Index values
+  CUTLASS_HOST_DEVICE
+  MatrixCoord(Index row, Index column) : Base(make_Coord(row, column)) { }
+
+  /// Builds the coordinate (row, column) from two LongIndex values, each converted to Index
+  CUTLASS_HOST_DEVICE
+  MatrixCoord(LongIndex row, LongIndex column) : Base(make_Coord(Index(row), Index(column))) { }
+
+  /// Read-only access to the row component
+  CUTLASS_HOST_DEVICE
+  Index const &row() const { return Base::at(kRow); }
+
+  /// Mutable access to the row component
+  CUTLASS_HOST_DEVICE
+  Index &row() { return Base::at(kRow); }
+
+  /// Read-only access to the column component
+  CUTLASS_HOST_DEVICE
+  Index const &column() const { return Base::at(kColumn); }
+
+  /// Mutable access to the column component
+  CUTLASS_HOST_DEVICE
+  Index &column() { return Base::at(kColumn); }
+
+  //
+  // Arithmetic forwarded to the base Coord, re-wrapped as MatrixCoord
+  //
+
+  /// Returns the result of the base Coord's operator+ as a MatrixCoord
+  CUTLASS_HOST_DEVICE
+  MatrixCoord operator+(Base const &b) const {
+    return MatrixCoord(Base::operator+(b));
+  }
+
+  /// Returns the result of the base Coord's operator- as a MatrixCoord
+  CUTLASS_HOST_DEVICE
+  MatrixCoord operator-(Base const &b) const {
+    return MatrixCoord(Base::operator-(b));
+  }
+
+  /// Returns the result of the base Coord's operator* as a MatrixCoord
+  CUTLASS_HOST_DEVICE
+  MatrixCoord operator*(Base const &b) const {
+    return MatrixCoord(Base::operator*(b));
+  }
+
+  /// Returns the result of the base Coord's operator/ as a MatrixCoord
+  CUTLASS_HOST_DEVICE
+  MatrixCoord operator/(Base const &b) const {
+    return MatrixCoord(Base::operator/(b));
+  }
+
+  /// Applies the base Coord's operator+= and returns *this
+  CUTLASS_HOST_DEVICE
+  MatrixCoord &operator+=(Base const &b) {
+    Base::operator+=(b);
+    return *this;
+  }
+
+  /// Applies the base Coord's operator-= and returns *this
+  CUTLASS_HOST_DEVICE
+  MatrixCoord &operator-=(Base const &b) {
+    Base::operator-=(b);
+    return *this;
+  }
+
+  /// Applies the base Coord's operator*= and returns *this
+  CUTLASS_HOST_DEVICE
+  MatrixCoord &operator*=(Base const &b) {
+    Base::operator*=(b);
+    return *this;
+  }
+
+  /// Applies the base Coord's operator/= and returns *this
+  CUTLASS_HOST_DEVICE
+  MatrixCoord &operator/=(Base const &b) {
+    Base::operator/=(b);
+    return *this;
+  }
+
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/matrix_shape.h b/lightllm-kernel/cutlass/include/cutlass/matrix_shape.h
new file mode 100755
index 000000000..66623a431
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/matrix_shape.h
@@ -0,0 +1,65 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines a Shape template for matrix tiles
+*/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/coord.h"
+
+namespace cutlass {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Compile-time description of the extent of a matrix tile
+template <
+  int Row_,        ///< number of rows in the tile
+  int Column_      ///< number of columns in the tile
+>
+struct MatrixShape {
+  static int const kRow = Row_;              ///< number of rows
+  static int const kColumn = Column_;        ///< number of columns
+  static int const kCount = kRow * kColumn;  ///< total element count (rows x columns)
+
+  //
+  // Static member functions
+  //
+  /// Returns the shape as a rank-2 coordinate built from (kRow, kColumn)
+  CUTLASS_HOST_DEVICE
+  static Coord<2> toCoord() {
+    return make_Coord(kRow, kColumn);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/numeric_conversion.h b/lightllm-kernel/cutlass/include/cutlass/numeric_conversion.h
new file mode 100755
index 000000000..17c1ac14d
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/numeric_conversion.h
@@ -0,0 +1,4547 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*!
+    \file
+    \brief Boost-like numeric conversion operator for CUTLASS numeric types
+*/
+
+#pragma once
+
+#if !defined(__CUDACC_RTC__)
+#include <cfenv>
+#endif
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/transform/thread/unary_op.h"
+
+#include "cutlass/array.h"
+#include "cutlass/half.h"
+#include "cutlass/bfloat16.h"
+
+namespace cutlass {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Floating-point rounding style similar to Standard Library's formats but supporting
+/// additional rounding options.
+enum class FloatRoundStyle {
+  round_indeterminate,          ///< rounding mode unknown
+  round_toward_zero,            ///< round toward zero
+  round_to_nearest,             ///< round to nearest even
+  round_to_nearest_satfinite,   ///< round to nearest even, capping value to min and max of destination type
+  round_toward_infinity,        ///< round toward infinity
+  round_toward_neg_infinity,    ///< round toward negative infinity
+  round_half_ulp_truncate,      ///< add 0.5ulp to integer representation then round toward zero
+  round_half_ulp_trunc_dntz     ///< like round_half_ulp_truncate, except denorms are rounded *toward* zero
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename T,                                                  ///< destination type
+  typename S,                                                  ///< source type
+  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest    ///< requested rounding style
+>
+struct NumericConverter {
+  // Generic converter: a plain static_cast from S to T. Round is recorded but not consulted here.
+  using result_type = T;
+  using source_type = S;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts a single value via static_cast
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & value) {
+    return static_cast<T>(value);
+  }
+
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const & value) const {
+    return convert(value);  // functor form forwards to convert()
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Partial specializations for float => int32_t
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__CUDA_ARCH__)
+template <>
+struct NumericConverter<int32_t, float, FloatRoundStyle::round_to_nearest> {
+  // Device-side float -> int32_t conversion that defers to the __float2int_rn intrinsic.
+  using result_type = int32_t;
+  using source_type = float;
+  static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest;
+
+  CUTLASS_DEVICE
+  static result_type convert(source_type const & value) {
+    return __float2int_rn(value);
+  }
+
+  CUTLASS_DEVICE
+  result_type operator()(source_type const & value) const {
+    return convert(value);  // functor form forwards to convert()
+  }
+};
+
+template <>
+struct NumericConverter<int32_t, float, FloatRoundStyle::round_toward_zero> {
+  // Device-side float -> int32_t conversion that defers to the __float2int_rz intrinsic.
+  using result_type = int32_t;
+  using source_type = float;
+  static FloatRoundStyle const round_style = FloatRoundStyle::round_toward_zero;
+
+  CUTLASS_DEVICE
+  static result_type convert(source_type const & value) {
+    // The rounding behavior comes entirely from the intrinsic.
+    return __float2int_rz(value);
+  }
+
+  CUTLASS_DEVICE
+  result_type operator()(source_type const & value) const {
+    return convert(value);  // functor form forwards to convert()
+  }
+};
+
+#elif !defined(__CUDACC_RTC__)
+
+template <>
+struct NumericConverter<int32_t, float, FloatRoundStyle::round_to_nearest> {  // host-side float -> int32_t, round to nearest
+  using result_type = int32_t;
+  using source_type = float;
+  static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest;
+  static result_type convert(source_type const & s) {
+    int const saved_mode = std::fegetround();  // remember the caller's rounding mode
+    std::fesetround(FE_TONEAREST);             // std::nearbyint honors the current mode
+    result_type const rounded = (result_type)std::nearbyint(s);
+    std::fesetround(saved_mode);               // do not leak FE_TONEAREST into the caller's environment
+    return rounded;
+  }
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+template <>
+struct NumericConverter<int32_t, float, FloatRoundStyle::round_toward_zero> {
+  // Host-side float -> int32_t conversion that discards the fractional part.
+  using result_type = int32_t;
+  using source_type = float;
+  static FloatRoundStyle const round_style = FloatRoundStyle::round_toward_zero;
+
+  static result_type convert(source_type const & s) {
+    // std::trunc rounds toward zero and, unlike fesetround(FE_TOWARDZERO), leaves the thread's rounding mode untouched.
+    return (result_type)std::trunc(s);
+  }
+
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+#endif
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Partial specializations for float => int8_t
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__CUDA_ARCH__)
+template <>
+struct NumericConverter<int8_t, float, FloatRoundStyle::round_to_nearest> {  // device-side float -> int8_t via inline PTX
+
+  using result_type = int8_t;
+  using source_type = float;
+  static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest;
+
+  CUTLASS_DEVICE
+  static result_type convert(source_type const & s) {
+
+    int32_t intermediate;  // 32-bit register operand that receives the cvt result
+    asm volatile("cvt.rni.sat.s8.f32 %0, %1;" : "=r"(intermediate) : "f"(s));  // PTX cvt f32 -> s8 with the .rni and .sat modifiers
+
+    return static_cast<result_type>(intermediate);
+  }
+
+  CUTLASS_HOST_DEVICE  // NOTE(review): convert() is CUTLASS_DEVICE while this wrapper is HOST_DEVICE - confirm this is intentional
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+template <>
+struct NumericConverter<int8_t, float, FloatRoundStyle::round_toward_zero> {  // device-side float -> int8_t via inline PTX
+
+  using result_type = int8_t;
+  using source_type = float;
+  static FloatRoundStyle const round_style =  FloatRoundStyle::round_toward_zero;
+
+  CUTLASS_DEVICE
+  static result_type convert(source_type const & s) {
+
+    int32_t intermediate;  // 32-bit register operand that receives the cvt result
+    asm volatile("cvt.rzi.sat.s8.f32 %0, %1;" : "=r"(intermediate) : "f"(s));  // PTX cvt f32 -> s8 with the .rzi and .sat modifiers
+
+    return static_cast<result_type>(intermediate);
+  }
+
+  CUTLASS_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);  // functor form forwards to convert()
+  }
+};
+
+template <>
+struct NumericConverter<uint8_t, float, FloatRoundStyle::round_to_nearest> {  // device-side float -> uint8_t via inline PTX
+
+  using result_type = uint8_t;
+  using source_type = float;
+  static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest;
+
+  CUTLASS_DEVICE
+  static result_type convert(source_type const & s) {
+
+    int32_t intermediate;  // 32-bit register operand that receives the cvt result
+    asm volatile("cvt.rni.sat.u8.f32 %0, %1;" : "=r"(intermediate) : "f"(s));  // PTX cvt f32 -> u8 with the .rni and .sat modifiers
+
+    return static_cast<result_type>(intermediate);
+  }
+
+  CUTLASS_HOST_DEVICE  // NOTE(review): convert() is CUTLASS_DEVICE while this wrapper is HOST_DEVICE - confirm this is intentional
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+template <>
+struct NumericConverter<uint8_t, float, FloatRoundStyle::round_toward_zero> {  // device-side float -> uint8_t via inline PTX
+
+  using result_type = uint8_t;
+  using source_type = float;
+  static FloatRoundStyle const round_style =  FloatRoundStyle::round_toward_zero;
+
+  CUTLASS_DEVICE
+  static result_type convert(source_type const & s) {
+
+    int32_t intermediate;  // 32-bit register operand that receives the cvt result
+    asm volatile("cvt.rzi.sat.u8.f32 %0, %1;" : "=r"(intermediate) : "f"(s));  // PTX cvt f32 -> u8 with the .rzi and .sat modifiers
+
+    return static_cast<result_type>(intermediate);
+  }
+
+  CUTLASS_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);  // functor form forwards to convert()
+  }
+};
+
+#elif !defined(__CUDACC_RTC__)
+
+template <>
+struct NumericConverter<int8_t, float, FloatRoundStyle::round_to_nearest> {
+  // Host-side float -> int8_t: round to nearest, then saturate to the int8_t range.
+  using result_type = int8_t;
+  using source_type = float;
+  static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest;
+
+  static result_type convert(source_type const & s) {
+    int const saved_mode = std::fegetround();  // remember the caller's rounding mode
+    std::fesetround(FE_TONEAREST);             // std::nearbyint honors the current mode
+    float rounded = std::nearbyint(s);
+    std::fesetround(saved_mode);               // do not leak FE_TONEAREST into the caller's environment
+    // Low-end saturation (done in float so huge inputs never reach an out-of-range integer cast)
+    rounded = std::max(rounded, (float)std::numeric_limits<int8_t>::lowest());
+
+    // High-end saturation
+    rounded = std::min(rounded, (float)std::numeric_limits<int8_t>::max());
+    return static_cast<result_type>(rounded);
+  }
+
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+template <>
+struct NumericConverter<int8_t, float, FloatRoundStyle::round_toward_zero> {
+  // Host-side float -> int8_t: drop the fractional part, then saturate to the int8_t range.
+  using result_type = int8_t;
+  using source_type = float;
+  static FloatRoundStyle const round_style =  FloatRoundStyle::round_toward_zero;
+
+  static result_type convert(source_type const & s) {
+    // std::trunc rounds toward zero without modifying the thread's rounding mode.
+    float truncated = std::trunc(s);
+
+    // Low-end saturation (done in float so huge inputs never reach an out-of-range integer cast)
+    truncated = std::max(truncated, (float)std::numeric_limits<int8_t>::lowest());
+
+    // High-end saturation
+    truncated = std::min(truncated, (float)std::numeric_limits<int8_t>::max());
+
+    return static_cast<result_type>(truncated);
+  }
+
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+template <>
+struct NumericConverter<uint8_t, float, FloatRoundStyle::round_to_nearest> {
+  // Host-side float -> uint8_t: round to nearest, then saturate to the uint8_t range.
+  using result_type = uint8_t;
+  using source_type = float;
+  static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest;
+
+  static result_type convert(source_type const & s) {
+    int const saved_mode = std::fegetround();  // remember the caller's rounding mode
+    std::fesetround(FE_TONEAREST);             // std::nearbyint honors the current mode
+    float rounded = std::nearbyint(s);
+    std::fesetround(saved_mode);               // do not leak FE_TONEAREST into the caller's environment
+    // Low-end saturation (done in float so huge inputs never reach an out-of-range integer cast)
+    rounded = std::max(rounded, (float)std::numeric_limits<uint8_t>::lowest());
+
+    // High-end saturation
+    rounded = std::min(rounded, (float)std::numeric_limits<uint8_t>::max());
+    return static_cast<result_type>(rounded);
+  }
+
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+template <>
+struct NumericConverter<uint8_t, float, FloatRoundStyle::round_toward_zero> {
+  // Host-side float -> uint8_t: drop the fractional part, then saturate to the uint8_t range.
+  using result_type = uint8_t;
+  using source_type = float;
+  static FloatRoundStyle const round_style =  FloatRoundStyle::round_toward_zero;
+
+  static result_type convert(source_type const & s) {
+    // std::trunc rounds toward zero without modifying the thread's rounding mode.
+    float truncated = std::trunc(s);
+
+    // Low-end saturation (done in float so huge inputs never reach an out-of-range integer cast)
+    truncated = std::max(truncated, (float)std::numeric_limits<uint8_t>::lowest());
+
+    // High-end saturation
+    truncated = std::min(truncated, (float)std::numeric_limits<uint8_t>::max());
+
+    return static_cast<result_type>(truncated);
+  }
+
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+#endif
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Partial specializations for float => integer_subbyte
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<int Bits, FloatRoundStyle Round>
+struct NumericConverter<integer_subbyte<Bits, /* Signed = */ true>, float, Round> {
+private:
+  static constexpr bool result_is_signed = true;
+
+public:
+  using result_type = integer_subbyte<Bits, result_is_signed>;
+  using source_type = float;
+  static constexpr FloatRoundStyle round_style = Round;
+  /// Two-step conversion: float -> int using Round, then int -> integer_subbyte
+  CUTLASS_HOST_DEVICE static result_type
+  convert(source_type const& src) {
+    using middle_type = int;
+    static_assert(8 * sizeof(middle_type) > Bits, "This conversion "
+      "requires that integer_subbyte have fewer representation bits "
+      "than the number of bits in int.");
+
+    middle_type const as_int = NumericConverter<middle_type, source_type, Round>::convert(src);
+    return NumericConverter<result_type, middle_type, Round>::convert(as_int);
+  }
+
+  CUTLASS_HOST_DEVICE result_type
+  operator()(source_type const& value) const {
+    return convert(value);  // functor form forwards to convert()
+  }
+};
+
+template<int Bits, FloatRoundStyle Round>
+struct NumericConverter<integer_subbyte<Bits, /* Signed = */ false>, float, Round> {
+private:
+  static constexpr bool result_is_signed = false;
+
+public:
+  using result_type = integer_subbyte<Bits, result_is_signed>;
+  using source_type = float;
+  static constexpr FloatRoundStyle round_style = Round;
+  /// Two-step conversion: float -> unsigned using Round, then unsigned -> integer_subbyte
+  CUTLASS_HOST_DEVICE static result_type
+  convert(source_type const& src) {
+    using middle_type = unsigned;
+    static_assert(8 * sizeof(middle_type) > Bits, "This conversion "
+      "requires that integer_subbyte have fewer representation bits "
+      "than the number of bits in unsigned int.");
+
+    middle_type const as_unsigned = NumericConverter<middle_type, source_type, Round>::convert(src);
+    return NumericConverter<result_type, middle_type, Round>::convert(as_unsigned);
+  }
+
+  CUTLASS_HOST_DEVICE result_type
+  operator()(source_type const& value) const {
+    return convert(value);  // functor form forwards to convert()
+  }
+};
+  
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for identical source and destination types (T <= T): returns the input unchanged
+template <typename T, FloatRoundStyle Round>
+struct NumericConverter<T, T, Round> {
+
+  using result_type = T;
+  using source_type = T;
+  static FloatRoundStyle const round_style = Round;
+
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & s) {
+
+    return s;
+  }
+
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Partial specializations for float <=> cutlass::half_t
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for float <= cutlass::half_t
+template <FloatRoundStyle Round>
+struct NumericConverter<float, cutlass::half_t, Round> {
+
+  using result_type = float;
+  using source_type = cutlass::half_t;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts one half_t value to float; Round is not consulted
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & value) {
+    // The cast relies on the conversion to float that cutlass::half_t provides.
+    return static_cast<float>(value);
+  }
+
+  /// Functor form; forwards to convert()
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const & value) const {
+    return convert(value);
+  }
+};
+
+/// Specialization for round-to-nearest
+template <>
+struct NumericConverter<cutlass::half_t, float, FloatRoundStyle::round_to_nearest> {
+
+  using result_type = cutlass::half_t;
+  using source_type = float;
+  static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest;
+
+  /// Converts one float to half_t through cutlass::half_t's own conversion from float
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & value) {
+    // Rounding is whatever the cutlass::half_t conversion implements.
+    return static_cast<cutlass::half_t>(value);
+  }
+
+  /// Functor form; forwards to convert()
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const & value) const {
+    return convert(value);
+  }
+};
+
+/// Specialization for round-toward-zero
+template <>
+struct NumericConverter<cutlass::half_t, float, FloatRoundStyle::round_toward_zero> {
+
+  using result_type = cutlass::half_t;
+  using source_type = float;
+  static FloatRoundStyle const round_style = FloatRoundStyle::round_toward_zero;
+
+  /// Round toward zero: uses __float2half_rz on sm_53+ devices, a bit-level software path otherwise
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & flt) {
+
+  #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
+    return cutlass::half_t(__float2half_rz(flt));
+  #else
+    // software implementation truncates the discarded mantissa bits (rounds toward zero)
+    unsigned const& s = reinterpret_cast<unsigned const &>(flt);
+    uint16_t sign = uint16_t((s >> 16) & 0x8000);
+    int32_t exp = int32_t((s >> 23) & 0xff) - 127;
+    int mantissa = s & 0x7fffff;
+    uint16_t u = 0;
+
+    if ((s & 0x7fffffff) == 0) {
+      // sign-preserving zero
+      return cutlass::half_t::bitcast(sign);
+    }
+
+    if (exp > 15) {
+      if (exp == 128 && mantissa) {
+        // not a number
+        u = 0x7fff;
+      } else {
+        // infinity stays infinite; a finite overflow truncates to the largest finite half (0x7bff)
+        u = uint16_t(sign | (exp == 128 ? 0x7c00 : 0x7bff));
+      }
+      return cutlass::half_t::bitcast(u);
+    }
+
+    if (exp >= -14) {
+      // normal fp32 to normal fp16
+      u = uint16_t((uint32_t(exp + 15) & 0x1f) << 10);
+      u = uint16_t(u | (mantissa >> 13));
+    } else {
+      // normal single-precision to subnormal cutlass::half_t-precision representation
+      int rshift = (-14 - exp);
+      if (rshift < 32) {
+        mantissa |= (1 << 23);
+        mantissa = (mantissa >> rshift);
+        u = (uint16_t(mantissa >> 13) & 0x3ff);
+      } else {
+        mantissa = 0;
+        u = 0;
+      }
+    }
+
+    u |= sign;
+
+    return cutlass::half_t::bitcast(u);
+
+  #endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
+  }
+
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Partial specializations for float <=> cutlass::bfloat16_t
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for float <= cutlass::bfloat16_t
+template <FloatRoundStyle Round>
+struct NumericConverter<float, cutlass::bfloat16_t, Round> {
+
+  using result_type = float;
+  using source_type = cutlass::bfloat16_t;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts one bfloat16_t value to float; Round is not consulted
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & value) {
+    return static_cast<result_type>(value);
+  }
+
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const & value) const {
+    return convert(value);  // functor form forwards to convert()
+  }
+};
+
+template <>
+struct NumericConverter<cutlass::bfloat16_t, float, FloatRoundStyle::round_to_nearest> {
+  using result_type = cutlass::bfloat16_t;
+  using source_type = float;
+  static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest;
+
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & value) {
+    return static_cast<result_type>(value);  // defers to bfloat16_t's own conversion from float
+  }
+
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const & value) const {
+    return convert(value);  // functor form forwards to convert()
+  }
+};
+
+template <>
+struct NumericConverter<cutlass::bfloat16_t, float, FloatRoundStyle::round_half_ulp_truncate> {
+  using result_type = cutlass::bfloat16_t;
+  using source_type = float;
+  static FloatRoundStyle const round_style = FloatRoundStyle::round_half_ulp_truncate;
+
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & s) {
+    uint32_t x32 = reinterpret_cast<uint32_t const &>(s);  // raw bit pattern of the float
+
+    #if defined(__CUDA_ARCH__)
+    if (::isfinite(s)) {
+      x32 += 0x8000;  // bias by half of the lowest retained bit (bit 16) before truncating
+    }
+    #else
+    if (std::isfinite(s)) {
+      x32 += 0x8000;  // same bias on the host path; non-finite bit patterns are left untouched
+    }
+    #endif
+
+    uint16_t x16 = uint16_t((x32 >> 16) & 0xffff);  // keep only the upper 16 bits
+    return cutlass::bfloat16_t::bitcast(x16);
+  }
+
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);  // functor form forwards to convert()
+  }
+};
+
+template <>
+struct NumericConverter<cutlass::bfloat16_t, float, FloatRoundStyle::round_toward_zero> {
+  using result_type = cutlass::bfloat16_t;
+  using source_type = float;
+  static FloatRoundStyle const round_style = FloatRoundStyle::round_toward_zero;
+
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & value) {
+    // Truncation: keep the upper 16 bits of the float's bit pattern, discard the lower 16.
+    uint32_t const bits = reinterpret_cast<uint32_t const &>(value);
+    uint16_t const upper_half = uint16_t(bits >> 16);
+
+    return cutlass::bfloat16_t::bitcast(upper_half);
+  }
+
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const & value) const {
+    return convert(value);  // functor form forwards to convert()
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Partial specializations for float <=> cutlass::tfloat32_t
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for float <= cutlass::tfloat32_t
+template <FloatRoundStyle Round>
+struct NumericConverter<float, cutlass::tfloat32_t, Round> {
+
+  using result_type = float;
+  using source_type = cutlass::tfloat32_t;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts one tfloat32_t value to float; Round is not consulted
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & value) {
+    return static_cast<result_type>(value);
+  }
+
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const & value) const {
+    return convert(value);  // functor form forwards to convert()
+  }
+};
+
+template <>
+struct NumericConverter<cutlass::tfloat32_t, float, FloatRoundStyle::round_to_nearest> {
+  using result_type = cutlass::tfloat32_t;
+  using source_type = float;
+  static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest;
+
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & s) {
+
+    unsigned storage = reinterpret_cast<unsigned const &>(s);  // raw bit pattern of the float
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
+    asm volatile("cvt.rn.tf32.f32 %0, %1;" : "=r"(storage) : "r"(storage));  // PTX cvt f32 -> tf32 with the .rn modifier
+#else
+    if ((storage & 0x7f800000) != 0x7f800000) {  // exponent field is not all ones
+
+      bool mantissa_bit = ((storage & (1 << 13)) != 0);  // lowest retained mantissa bit (bit 13)
+      bool round_bit = ((storage & (1 << 12)) != 0);  // highest discarded bit (bit 12)
+      bool sticky_bit = ((storage & ((1 << 12) - 1)) != 0);  // any of the remaining discarded bits (11..0)
+
+      if ((round_bit && sticky_bit) || (round_bit && mantissa_bit)) {  // above halfway, or round bit set with an odd retained LSB
+        storage += uint32_t(1 << 13);
+      }
+
+      // Note, the following is intentionally commented out. TF32
+      // does not define the low order bits, so they may be left in
+      // an undefined state.
+      //
+      // By not truncating these bit explicitly, we avoid an extra logical
+      // operation.
+      //
+      // TF32 may be implicitly converted to float by performing this
+      // operation as needed.
+      //
+      // storage = (storage & ~0x1fff);
+    }
+    else if (storage & ~0xff800000) {  // all-ones exponent with a non-zero mantissa
+      storage = 0x7fffffff;  // replaced by one fixed bit pattern
+    }
+#endif
+
+    return cutlass::tfloat32_t::bitcast(storage);
+  }
+
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);  // functor form forwards to convert()
+  }
+};
+
+template <>
+struct NumericConverter<cutlass::tfloat32_t, float, FloatRoundStyle::round_half_ulp_truncate> {
+  using result_type = cutlass::tfloat32_t;
+  using source_type = float;
+  static FloatRoundStyle const round_style = FloatRoundStyle::round_half_ulp_truncate;
+
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & value) {
+    return result_type::round_half_ulp_truncate(value);  // the rounding is implemented by tfloat32_t itself
+  }
+
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const & value) const {
+    return convert(value);  // functor form forwards to convert()
+  }
+};
+
+/// This rounding operation is similar to half_ulp_truncate except it rounds denorms toward zero.
+/// It avoids predicated code, though it requires a temporary register.
+template <>
+struct NumericConverter<cutlass::tfloat32_t, float, FloatRoundStyle::round_half_ulp_trunc_dntz> {
+  using result_type = cutlass::tfloat32_t;
+  using source_type = float;
+  static FloatRoundStyle const round_style = FloatRoundStyle::round_half_ulp_trunc_dntz;
+
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & s) {
+
+    unsigned y = reinterpret_cast<unsigned const &>(s);  // raw bit pattern of s
+    y = y & 0xff800000;  // keep the sign and exponent fields, clear the mantissa
+    float d = reinterpret_cast<float const &>(y);  // float with the sign and exponent of s and a zero mantissa field
+    float z = d / float(1 << 11) + s;  // add d / 2^11 to s
+
+    return reinterpret_cast<result_type const &>(z);  // reinterpret the float's storage as tfloat32_t
+  }
+
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);  // functor form forwards to convert()
+  }
+};
+
+template <>
+struct NumericConverter<cutlass::tfloat32_t, float, FloatRoundStyle::round_toward_zero> {
+  using result_type = cutlass::tfloat32_t;
+  using source_type = float;
+  static FloatRoundStyle const round_style = FloatRoundStyle::round_toward_zero;
+
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & value) {
+    uint32_t const bits = reinterpret_cast<uint32_t const &>(value);  // raw bit pattern of the float
+    return cutlass::tfloat32_t::bitcast(bits & 0xffffe000);  // clear the low 13 bits
+  }
+
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const & value) const {
+    return convert(value);  // functor form forwards to convert()
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Conversion operator for float to cutlass::tfloat32_t big and small values
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <
+  FloatRoundStyle RoundBig = FloatRoundStyle::round_toward_zero,
+  FloatRoundStyle RoundSmall = FloatRoundStyle::round_half_ulp_truncate
+>
+struct NumericConverterFastF32 {
+
+  // Pair of tfloat32_t values: element 0 is the "big" part, element 1 the "small" part
+  using result_type = Array<cutlass::tfloat32_t, 2>;
+
+  // Type of the value being split
+  using source_type = float;
+
+  // Rounding style applied to each part
+  static FloatRoundStyle const kRoundBig = RoundBig;
+  static FloatRoundStyle const kRoundSmall = RoundSmall;
+  /// Splits `source` into a big part and the residual left after removing it
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & source) {
+    using BigConverter = NumericConverter<cutlass::tfloat32_t, float, kRoundBig>;
+    using SmallConverter = NumericConverter<cutlass::tfloat32_t, float, kRoundSmall>;
+
+    result_type parts;
+    // idx 0: the source converted with kRoundBig
+    parts[0] = BigConverter::convert(source);
+
+    // idx 1: what remains of the source after subtracting the big part, converted with kRoundSmall
+    float const residual = source - static_cast<float>(parts[0]);
+    parts[1] = SmallConverter::convert(residual);
+
+    return parts;
+  }
+
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const & value) const {
+    return convert(value);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Conversion and Clamp operator for Integers
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename T,
+  typename S
+>
+struct NumericConverterClamp {
+  // Converts S to T, returning T's lowest()/max() for inputs that compare outside that range.
+  using result_type = T;
+  using source_type = S;
+
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & value) {
+    NumericConverter<result_type, source_type> element_converter;
+    result_type const upper_bound = cutlass::platform::numeric_limits<result_type>::max();
+    result_type const lower_bound = cutlass::platform::numeric_limits<result_type>::lowest();
+    if (value < (source_type)lower_bound)
+      return lower_bound;
+    if (value > (source_type)upper_bound)
+      return upper_bound;
+    return element_converter(value);
+  }
+
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const & value) const {
+    return convert(value);  // functor form forwards to convert()
+  }
+};
+
+// Allows cutlass::half_t output types to be used with int32_t accumulators.
+// No clamp is applied for this floating-point destination: the converter only casts
+// the source value to cutlass::half_t.
+template <
+  typename S
+>
+struct NumericConverterClamp<cutlass::half_t, S> {
+
+  using result_type = cutlass::half_t;
+  using source_type = S;
+
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & value) {
+    return static_cast<result_type>(value);
+  }
+
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const & value) const {
+    return convert(value);  // functor form forwards to convert()
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Conversion operator for Array
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Element-by-element conversion of Array<S, N> to Array<T, N>, optionally conjugating each result
+template <
+  typename T,
+  typename S,
+  int N,
+  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest,
+  typename Transform = cutlass::transform::thread::UnaryTransform::Identity
+>
+struct NumericArrayConverter {
+
+  using result_type = Array<T, N>;
+  using source_type = Array<S, N>;
+  static FloatRoundStyle const round_style = Round;
+
+  static_assert(platform::is_same<Transform, cutlass::transform::thread::UnaryTransform::Identity>::value ||
+                platform::is_same<Transform, cutlass::transform::thread::UnaryTransform::Conjugate>::value,
+                  "Unary Operator not supported.");
+
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & s) {
+    bool const is_identity = platform::is_same<Transform, cutlass::transform::thread::UnaryTransform::Identity>::value;
+    result_type converted;
+    NumericConverter<T, S, Round> element_converter;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      if (is_identity) {
+        converted[idx] = element_converter(s[idx]);
+      } else { // conjugate
+        converted[idx] = conj(element_converter(s[idx]));
+      }
+    }
+
+    return converted;
+  }
+
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const & value) const {
+    return convert(value);
+  }
+};
+
+/// Partial specialization for identical source and result element types.
+/// With the identity transform the input array is returned unchanged; with the
+/// conjugate transform conj() is applied to every element.
+template <
+  typename T,
+  int N,
+  FloatRoundStyle Round,
+  typename Transform
+>
+struct NumericArrayConverter<T, T, N, Round, Transform> {
+
+  using result_type = Array<T, N>;
+  using source_type = Array<T, N>;
+  static FloatRoundStyle const round_style = Round;
+
+  static_assert(platform::is_same<Transform, cutlass::transform::thread::UnaryTransform::Identity>::value ||
+                platform::is_same<Transform, cutlass::transform::thread::UnaryTransform::Conjugate>::value,
+                  "Unary Operator not supported.");
+
+  /// Returns `source` as-is (identity) or its element-wise conjugate.
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const &source) {
+    if (platform::is_same<Transform, cutlass::transform::thread::UnaryTransform::Identity>::value) {
+      // Same element type and no transform requested: nothing to convert.
+      return source;
+    } else {
+      result_type result;
+      // N is a compile-time constant; unroll like the other array converters in this file.
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < N; ++i) {
+        result[i] = conj(static_cast<typename source_type::Element>(source[i]));
+      }
+      return result;
+    }
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Full specialization for Array<half, 2> <= Array<float, 2>, round to nearest.
+/// Device code for compute capability >= 5.3 converts both elements with a single
+/// __float22half2_rn call; every other compilation converts the elements one by one.
+template <>
+struct NumericArrayConverter<cutlass::half_t, float, 2, FloatRoundStyle::round_to_nearest> {
+
+  using result_type = Array<cutlass::half_t, 2>;
+  using source_type = Array<float, 2>;
+  static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest;
+
+  /// Converts two floats to two half-precision values.
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & source) {
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
+      Array<cutlass::half_t, 2> result;
+      // The two-element arrays are reinterpreted as the CUDA vector types float2 / __half2.
+      reinterpret_cast<__half2 &>(result) = __float22half2_rn(reinterpret_cast<float2 const &>(source));
+      return result;
+    #else
+      NumericConverter<cutlass::half_t, float, round_style> convert_;
+      // NOTE: cutlass::Array<half, N> is NOT an aggregate type and
+      //  below `{}` does NOT conduct zero initialization. Below `{}` will
+      //  conduct default initialization (calling default ctr). We use this syntax
+      //  to resolve compiler warning on uninitialized member variable.
+      Array<cutlass::half_t, 2> result{};
+      result[0] = convert_(source[0]);
+      result[1] = convert_(source[1]);
+      return result;
+    #endif
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<float, 2> <= Array<cutlass::half_t, 2>, for any
+/// rounding style.
+/// NOTE(review): round_style is fixed to round_to_nearest here and does not reflect the
+/// Round template argument - confirm this is intentional.
+template <FloatRoundStyle Round>
+struct NumericArrayConverter<float, cutlass::half_t, 2, Round> {
+
+  using result_type = Array<float, 2>;
+  using source_type = Array<cutlass::half_t, 2>;
+  static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest;
+
+  /// Converts two half-precision values to two floats.
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & source) {
+
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
+      // The source array is reinterpreted as __half2 and widened with one intrinsic call.
+      float2 result2 = __half22float2(reinterpret_cast<__half2 const &>(source));
+      return {
+        float{result2.x},
+        float{result2.y}
+      };
+    #else
+      // Fallback: convert each element with the scalar converter.
+      NumericConverter<float, cutlass::half_t, round_style> convert_;
+      return {
+        convert_(source[0]),
+        convert_(source[1])
+      };
+    #endif
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Array<half, N> <= Array<float, N>.
+/// Elements are converted two at a time through the 2-element converter; when N is
+/// odd, the last element goes through the scalar converter.
+template <
+  int N,
+  FloatRoundStyle Round
+>
+struct NumericArrayConverter<cutlass::half_t, float, N, Round> {
+
+  using result_type = Array<cutlass::half_t, N>;
+  using source_type = Array<float, N>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts all N floats of `source` to half precision.
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & source) {
+
+    using PairResult = Array<cutlass::half_t, 2>;
+    using PairSource = Array<float, 2>;
+
+    NumericArrayConverter<cutlass::half_t, float, 2, Round> pair_converter;
+    NumericConverter<cutlass::half_t, float, Round> scalar_converter;
+
+    result_type output;
+
+    // View both arrays as consecutive 2-element chunks.
+    PairResult *output_pairs = reinterpret_cast<PairResult *>(&output);
+    PairSource const *source_pairs = reinterpret_cast<PairSource const *>(&source);
+
+    int const kPairCount = N / 2;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int pair = 0; pair < kPairCount; ++pair) {
+      output_pairs[pair] = pair_converter(source_pairs[pair]);
+    }
+
+    // An odd N leaves one trailing element that no pair covers.
+    if (N % 2 != 0) {
+      output[N - 1] = scalar_converter(source[N - 1]);
+    }
+
+    return output;
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+
+/// Partial specialization for Array<float, N> <= Array<half, N>.
+/// Elements are converted two at a time through the 2-element converter; when N is
+/// odd, the last element goes through the scalar converter.
+template <
+  int N,
+  FloatRoundStyle Round
+>
+struct NumericArrayConverter<float, cutlass::half_t, N, Round> {
+
+  using result_type = Array<float, N>;
+  using source_type = Array<cutlass::half_t, N>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts all N half-precision values of `source` to float.
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & source) {
+
+    using PairResult = Array<float, 2>;
+    using PairSource = Array<cutlass::half_t, 2>;
+
+    NumericArrayConverter<float, cutlass::half_t, 2, Round> pair_converter;
+    NumericConverter<float, cutlass::half_t, Round> scalar_converter;
+
+    result_type output;
+
+    // View both arrays as consecutive 2-element chunks.
+    PairResult *output_pairs = reinterpret_cast<PairResult *>(&output);
+    PairSource const *source_pairs = reinterpret_cast<PairSource const *>(&source);
+
+    int const kPairCount = N / 2;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int pair = 0; pair < kPairCount; ++pair) {
+      output_pairs[pair] = pair_converter(source_pairs[pair]);
+    }
+
+    // An odd N leaves one trailing element that no pair covers.
+    if (N % 2 != 0) {
+      output[N - 1] = scalar_converter(source[N - 1]);
+    }
+
+    return output;
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Full specialization for Array<cutlass::bfloat16_t, 2> <= Array<float, 2>, round to nearest.
+/// Only compiled for device code with __CUDA_ARCH__ >= 800 (see the enclosing #if).
+template <>
+struct NumericArrayConverter<cutlass::bfloat16_t, float, 2, FloatRoundStyle::round_to_nearest> {
+
+  using result_type = Array<cutlass::bfloat16_t, 2>;
+  using source_type = Array<float, 2>;
+  static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest;
+
+  /// Converts two floats into two bfloat16 values with one PTX instruction.
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & source) {
+
+    // 32-bit register receiving both packed bfloat16 results.
+    unsigned d;
+
+    // Operands are passed as (source[1], source[0]); the operand order is significant.
+    asm("cvt.rn.bf16x2.f32 %0, %1, %2;\n" : "=r"(d) : "f"(source[1]), "f"(source[0]) );
+
+    // The packed register is reinterpreted as the 2-element result array.
+    return reinterpret_cast<result_type const &>(d);
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<cutlass::bfloat16_t, N> <= Array<float, N>.
+/// Elements are converted two at a time through the 2-element converter; when N is
+/// odd, the last element goes through the scalar converter.
+template <
+  int N,
+  FloatRoundStyle Round
+>
+struct NumericArrayConverter<cutlass::bfloat16_t, float, N, Round> {
+
+  using result_type = Array<cutlass::bfloat16_t, N>;
+  using source_type = Array<float, N>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts all N floats of `source` to bfloat16.
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & source) {
+
+    using PairResult = Array<cutlass::bfloat16_t, 2>;
+    using PairSource = Array<float, 2>;
+
+    NumericArrayConverter<cutlass::bfloat16_t, float, 2, Round> pair_converter;
+    NumericConverter<cutlass::bfloat16_t, float, Round> scalar_converter;
+
+    result_type output;
+
+    // View both arrays as consecutive 2-element chunks.
+    PairResult *output_pairs = reinterpret_cast<PairResult *>(&output);
+    PairSource const *source_pairs = reinterpret_cast<PairSource const *>(&source);
+
+    int const kPairCount = N / 2;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int pair = 0; pair < kPairCount; ++pair) {
+      output_pairs[pair] = pair_converter(source_pairs[pair]);
+    }
+
+    // An odd N leaves one trailing element that no pair covers.
+    if (N % 2 != 0) {
+      output[N - 1] = scalar_converter(source[N - 1]);
+    }
+
+    return output;
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+#endif // if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Conditional guards to enable partial specialization for packed integers
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 720) && \
+    ((__CUDACC_VER_MAJOR__ > 10) ||                     \
+     ((__CUDACC_VER_MAJOR__ >= 10) && (__CUDACC_VER_MINOR__ >= 2)))
+
+/// Partial specialization for Array<int8_t, 1> <= Array<int, 1>.
+/// The single element is delegated to the scalar NumericConverter.
+template <
+  FloatRoundStyle Round
+>
+struct NumericArrayConverter<int8_t, int, 1, Round> {
+
+  using result_type = Array<int8_t, 1>;
+  using source_type = Array<int, 1>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts the one element of `source`.
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & source) {
+    NumericConverter<int8_t, int, Round> scalar_converter;
+
+    result_type converted;
+    converted[0] = scalar_converter(source[0]);
+    return converted;
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<int8_t, 2> <= Array<int, 2>.
+/// Both values are saturated and packed with one cvt.pack.sat.s8.s32.b32 instruction.
+template <
+  FloatRoundStyle Round
+>
+struct NumericArrayConverter<int8_t, int, 2, Round> {
+
+  using result_type = Array<int8_t, 2>;
+  using source_type = Array<int, 2>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts two 32-bit integers into two packed, saturated int8_t values.
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & source) {
+
+    uint32_t tmp;
+
+    // Operands are passed as (source[1], source[0]) and the third source operand is 0.
+    asm volatile(
+      "cvt.pack.sat.s8.s32.b32   %0, %2, %1, 0;\n"
+      : "=r"(tmp) : "r"(source[0]), "r"(source[1]));
+
+    // Only the low 16 bits hold the two packed results.
+    uint16_t out = (tmp & 0xffff);
+    return reinterpret_cast<result_type const &>(out);
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<int8_t, 4> <= Array<int, 4>.
+/// Four values are saturated and packed into one 32-bit register using two chained
+/// cvt.pack.sat.s8.s32.b32 instructions.
+template <
+  FloatRoundStyle Round
+>
+struct NumericArrayConverter<int8_t, int, 4, Round> {
+
+  using result_type = Array<int8_t, 4>;
+  using source_type = Array<int, 4>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts four 32-bit integers into four packed, saturated int8_t values.
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & source) {
+
+    unsigned out;
+
+    // First instruction packs (source[3], source[2]) into the temporary r4; the second
+    // packs (source[1], source[0]) and merges r4 as its third source operand.
+    asm volatile(
+      "{ .reg .u32 r4;"
+      "cvt.pack.sat.s8.s32.b32   r4, %4, %3, 0;"
+      "cvt.pack.sat.s8.s32.b32   %0, %2, %1, r4;"
+      "}"
+      : "=r"(out) : "r"(source[0]), "r"(source[1]), "r"(source[2]), "r"(source[3]));
+
+    // The packed register is reinterpreted as the 4-element result array.
+    return reinterpret_cast<result_type const &>(out);
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<int8_t, N> <= Array<int, N>.
+/// N must be a multiple of 4; the array is processed in 4-element chunks through the
+/// 4-element converter.
+template <
+  int N,
+  FloatRoundStyle Round
+>
+struct NumericArrayConverter<int8_t, int, N, Round> {
+  static_assert(!(N % 4), "N must be multiple of 4.");
+
+  using result_type = Array<int8_t, N>;
+  using source_type = Array<int, N>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts all N integers of `source` to int8_t.
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & source) {
+
+    using QuadResult = Array<int8_t, 4>;
+    using QuadSource = Array<int, 4>;
+
+    NumericArrayConverter<int8_t, int, 4, Round> quad_converter;
+
+    result_type output;
+
+    // View both arrays as consecutive 4-element chunks.
+    QuadResult *output_quads = reinterpret_cast<QuadResult *>(&output);
+    QuadSource const *source_quads = reinterpret_cast<QuadSource const *>(&source);
+
+    int const kQuadCount = N / 4;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int quad = 0; quad < kQuadCount; ++quad) {
+      output_quads[quad] = quad_converter(source_quads[quad]);
+    }
+
+    return output;
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<uint8_t, 1> <= Array<int, 1>.
+/// The single element is delegated to the scalar NumericConverter.
+template <
+  FloatRoundStyle Round
+>
+struct NumericArrayConverter<uint8_t, int, 1, Round> {
+
+  using result_type = Array<uint8_t, 1>;
+  using source_type = Array<int, 1>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts the one element of `source`.
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & source) {
+    NumericConverter<uint8_t, int, Round> scalar_converter;
+
+    result_type converted;
+    converted[0] = scalar_converter(source[0]);
+    return converted;
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<uint8_t, 2> <= Array<int, 2>.
+/// Both values are saturated and packed with one cvt.pack.sat.u8.s32.b32 instruction.
+template <
+  FloatRoundStyle Round
+>
+struct NumericArrayConverter<uint8_t, int, 2, Round> {
+
+  using result_type = Array<uint8_t, 2>;
+  using source_type = Array<int, 2>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts two 32-bit integers into two packed, saturated uint8_t values.
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & source) {
+
+    uint32_t tmp;
+
+    // Operands are passed as (source[1], source[0]) and the third source operand is 0.
+    asm volatile(
+      "cvt.pack.sat.u8.s32.b32   %0, %2, %1, 0;\n"
+      : "=r"(tmp) : "r"(source[0]), "r"(source[1]));
+
+    // Only the low 16 bits hold the two packed results.
+    uint16_t out = (tmp & 0xffff);
+    return reinterpret_cast<result_type const &>(out);
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<uint8_t, 4> <= Array<int, 4>.
+/// Four values are saturated and packed into one 32-bit register using two chained
+/// cvt.pack.sat.u8.s32.b32 instructions.
+template <
+  FloatRoundStyle Round
+>
+struct NumericArrayConverter<uint8_t, int, 4, Round> {
+
+  using result_type = Array<uint8_t, 4>;
+  using source_type = Array<int, 4>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts four 32-bit integers into four packed, saturated uint8_t values.
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & source) {
+
+    unsigned out;
+
+    // First instruction packs (source[3], source[2]) into the temporary r4; the second
+    // packs (source[1], source[0]) and merges r4 as its third source operand.
+    asm volatile(
+      "{ .reg .u32 r4;"
+      "cvt.pack.sat.u8.s32.b32   r4, %4, %3, 0;"
+      "cvt.pack.sat.u8.s32.b32   %0, %2, %1, r4;"
+      "}"
+      : "=r"(out) : "r"(source[0]), "r"(source[1]), "r"(source[2]), "r"(source[3]));
+
+    // The packed register is reinterpreted as the 4-element result array.
+    return reinterpret_cast<result_type const &>(out);
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<uint8_t, N> <= Array<int, N>.
+/// N must be a multiple of 4; the array is processed in 4-element chunks through the
+/// 4-element converter.
+template <
+  int N,
+  FloatRoundStyle Round
+>
+struct NumericArrayConverter<uint8_t, int, N, Round> {
+  static_assert(!(N % 4), "N must be multiple of 4.");
+
+  using result_type = Array<uint8_t, N>;
+  using source_type = Array<int, N>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts all N integers of `source` to uint8_t.
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & source) {
+
+    using QuadResult = Array<uint8_t, 4>;
+    using QuadSource = Array<int, 4>;
+
+    NumericArrayConverter<uint8_t, int, 4, Round> quad_converter;
+
+    result_type output;
+
+    // View both arrays as consecutive 4-element chunks.
+    QuadResult *output_quads = reinterpret_cast<QuadResult *>(&output);
+    QuadSource const *source_quads = reinterpret_cast<QuadSource const *>(&source);
+
+    int const kQuadCount = N / 4;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int quad = 0; quad < kQuadCount; ++quad) {
+      output_quads[quad] = quad_converter(source_quads[quad]);
+    }
+
+    return output;
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+#endif
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Partial specializations for Array<float, 2> <=> Array<float_e4m3_t, 2> and Array<float_e5m2_t, 2>
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Array<float, 2> <= Array<float_e4m3_t, 2>.
+/// With CUDA_PTX_FP8_CVT_ENABLED the pair is first converted to f16x2 by a PTX cvt
+/// instruction and then widened to float; otherwise each element is converted by the
+/// scalar NumericConverter.
+template <
+  FloatRoundStyle Round
+>
+struct NumericArrayConverter<float, cutlass::float_e4m3_t, 2, Round> {
+  using result_element = float;
+  using source_element = cutlass::float_e4m3_t;
+
+  using result_type = Array<result_element, 2>;
+  using source_type = Array<source_element, 2>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts two e4m3 values to two floats (declared device-only).
+  CUTLASS_DEVICE
+  static result_type convert(source_type const & source) {
+
+  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
+    uint32_t out_fp16;
+    // The two 8-bit source elements are read as one 16-bit operand.
+    uint16_t const& src_packed = reinterpret_cast<uint16_t const&>(source);
+
+    asm volatile( \
+        "{\n" \
+        "cvt.rn.f16x2.e4m3x2 %0, %1;\n" \
+        "}\n" : "=r"(out_fp16): "h"(src_packed));
+
+    // Widen the intermediate half pair to float.
+    float2 res0 = __half22float2(reinterpret_cast<__half2 &>(out_fp16));
+
+    result_type out;
+    out[0] = res0.x;
+    out[1] = res0.y;
+    return out;
+  #else
+    result_type result;
+    NumericConverter<result_element, source_element, Round> converter;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < 2; ++i) {
+      result[i] = converter(source[i]);
+    }
+
+    return result;
+  #endif
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<float_e4m3_t, 2> <= Array<float, 2>.
+/// With CUDA_PTX_FP8_CVT_ENABLED both values are converted by one PTX cvt instruction
+/// (the .rn.satfinite form is hard-coded, independent of Round); otherwise each element
+/// is converted by the scalar NumericConverter.
+template <
+  FloatRoundStyle Round
+>
+struct NumericArrayConverter<float_e4m3_t, float, 2, Round> {
+  using result_element = cutlass::float_e4m3_t;
+  using source_element = float;
+
+  using result_type = Array<result_element, 2>;
+  using source_type = Array<source_element, 2>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts two floats to two e4m3 values (declared device-only).
+  CUTLASS_DEVICE
+  static result_type convert(source_type const & source) {
+
+  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
+    uint16_t out;
+
+    // Operands are passed as (source[1], source[0]); the operand order is significant.
+    asm volatile( \
+        "{\n" \
+        "cvt.rn.satfinite.e4m3x2.f32   %0, %2, %1;\n" \
+        "}" \
+        : "=h"(out) : "f"(source[0]), "f"(source[1]));
+
+    // The 16-bit packed result is reinterpreted as the 2-element result array.
+    return reinterpret_cast<result_type const &>(out);
+  #else
+    result_type result;
+    NumericConverter<result_element, source_element, Round> converter;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < 2; ++i) {
+      result[i] = converter(source[i]);
+    }
+
+    return result;
+  #endif
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<float, 2> <= Array<float_e5m2_t, 2>.
+/// With CUDA_PTX_FP8_CVT_ENABLED the pair is first converted to f16x2 by a PTX cvt
+/// instruction and then widened to float; otherwise each element is converted by the
+/// scalar NumericConverter.
+template <
+  FloatRoundStyle Round
+>
+struct NumericArrayConverter<float, cutlass::float_e5m2_t, 2, Round> {
+  using result_element = float;
+  using source_element = cutlass::float_e5m2_t;
+
+  using result_type = Array<result_element, 2>;
+  using source_type = Array<source_element, 2>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts two e5m2 values to two floats (declared device-only).
+  CUTLASS_DEVICE
+  static result_type convert(source_type const & source) {
+
+  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
+    uint32_t out_fp16;
+    // The two 8-bit source elements are read as one 16-bit operand.
+    uint16_t const& src_packed = reinterpret_cast<uint16_t const&>(source);
+
+    asm volatile( \
+        "{\n" \
+        "cvt.rn.f16x2.e5m2x2 %0, %1;\n" \
+        "}\n" : "=r"(out_fp16): "h"(src_packed));
+
+    // Widen the intermediate half pair to float.
+    float2 res0 = __half22float2(reinterpret_cast<__half2 &>(out_fp16));
+
+    result_type out;
+    out[0] = res0.x;
+    out[1] = res0.y;
+    return out;
+  #else
+    result_type result;
+    NumericConverter<result_element, source_element, Round> converter;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < 2; ++i) {
+      result[i] = converter(source[i]);
+    }
+
+    return result;
+  #endif
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<float_e5m2_t, 2> <= Array<float, 2>.
+/// With CUDA_PTX_FP8_CVT_ENABLED both values are converted by one PTX cvt instruction
+/// (the .rn.satfinite form is hard-coded, independent of Round); otherwise each element
+/// is converted by the scalar NumericConverter.
+template <
+  FloatRoundStyle Round
+>
+struct NumericArrayConverter<float_e5m2_t, float, 2, Round> {
+  using result_element = cutlass::float_e5m2_t;
+  using source_element = float;
+
+  using result_type = Array<result_element, 2>;
+  using source_type = Array<source_element, 2>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts two floats to two e5m2 values (declared device-only).
+  CUTLASS_DEVICE
+  static result_type convert(source_type const & source) {
+
+  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
+    uint16_t out;
+
+    // Operands are passed as (source[1], source[0]); the operand order is significant.
+    asm volatile( \
+        "{\n" \
+        "cvt.rn.satfinite.e5m2x2.f32   %0, %2, %1;\n" \
+        "}" \
+        : "=h"(out) : "f"(source[0]), "f"(source[1]));
+
+    // The 16-bit packed result is reinterpreted as the 2-element result array.
+    return reinterpret_cast<result_type const &>(out);
+  #else
+    result_type result;
+    NumericConverter<result_element, source_element, Round> converter;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < 2; ++i) {
+      result[i] = converter(source[i]);
+    }
+
+    return result;
+  #endif
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Partial specializations for Array<half, 2> <=> Array<float_e4m3_t, 2> and Array<float_e5m2_t, 2>
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Array<half, 2> <= Array<float_e4m3_t, 2>.
+/// With CUDA_PTX_FP8_CVT_ENABLED the pair is converted by one PTX cvt instruction that
+/// writes directly into the result storage; otherwise each element is converted by the
+/// scalar NumericConverter.
+template <
+  FloatRoundStyle Round
+>
+struct NumericArrayConverter<cutlass::half_t, cutlass::float_e4m3_t, 2, Round> {
+  using result_element = cutlass::half_t;
+  using source_element = cutlass::float_e4m3_t;
+
+  using result_type = Array<result_element, 2>;
+  using source_type = Array<source_element, 2>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts two e4m3 values to two half-precision values (declared device-only).
+  CUTLASS_DEVICE
+  static result_type convert(source_type const & source) {
+
+  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
+    result_type out;
+    // `reg` aliases the storage of `out`, so the asm output lands in the result array.
+    uint32_t& reg = reinterpret_cast<uint32_t&>(out);
+    // The two 8-bit source elements are read as one 16-bit operand.
+    uint16_t const& src_packed = reinterpret_cast<uint16_t const&>(source);
+
+    asm volatile( \
+        "{\n" \
+        "cvt.rn.f16x2.e4m3x2 %0, %1;\n" \
+        "}\n" : "=r"(reg): "h"(src_packed));
+
+    return out;
+  #else
+    result_type result;
+    NumericConverter<result_element, source_element, Round> converter;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < 2; ++i) {
+      result[i] = converter(source[i]);
+    }
+
+    return result;
+  #endif
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<float_e4m3_t, 2> <= Array<half, 2>.
+/// With CUDA_PTX_FP8_CVT_ENABLED the pair is converted by one PTX cvt instruction
+/// (the .rn.satfinite form is hard-coded, independent of Round); otherwise each element
+/// is converted by the scalar NumericConverter.
+template <
+  FloatRoundStyle Round
+>
+struct NumericArrayConverter<float_e4m3_t, cutlass::half_t, 2, Round> {
+  using result_element = cutlass::float_e4m3_t;
+  using source_element = cutlass::half_t;
+
+  using result_type = Array<result_element, 2>;
+  using source_type = Array<source_element, 2>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts two half-precision values to two e4m3 values (declared device-only).
+  CUTLASS_DEVICE
+  static result_type convert(source_type const & source) {
+
+  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
+    uint16_t out;
+
+    // The two 16-bit source elements are passed as one 32-bit register operand.
+    asm volatile( \
+        "{\n" \
+        "cvt.rn.satfinite.e4m3x2.f16x2   %0, %1;\n" \
+        "}" \
+        : "=h"(out) : "r"(reinterpret_cast<uint32_t const&>(source)));
+
+    // The 16-bit packed result is reinterpreted as the 2-element result array.
+    return reinterpret_cast<result_type const &>(out);
+  #else
+    result_type result;
+    NumericConverter<result_element, source_element, Round> converter;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < 2; ++i) {
+      result[i] = converter(source[i]);
+    }
+
+    return result;
+  #endif
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<half, 2> <= Array<float_e5m2_t, 2>.
+/// With CUDA_PTX_FP8_CVT_ENABLED the pair is converted by one PTX cvt instruction that
+/// writes directly into the result storage; otherwise each element is converted by the
+/// scalar NumericConverter.
+template <
+  FloatRoundStyle Round
+>
+struct NumericArrayConverter<cutlass::half_t, cutlass::float_e5m2_t, 2, Round> {
+  using result_element = cutlass::half_t;
+  using source_element = cutlass::float_e5m2_t;
+
+  using result_type = Array<result_element, 2>;
+  using source_type = Array<source_element, 2>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts two e5m2 values to two half-precision values (declared device-only).
+  CUTLASS_DEVICE
+  static result_type convert(source_type const & source) {
+
+  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
+    result_type out;
+    // `reg` aliases the storage of `out`, so the asm output lands in the result array.
+    uint32_t& reg = reinterpret_cast<uint32_t&>(out);
+    // The two 8-bit source elements are read as one 16-bit operand.
+    uint16_t const& src_packed = reinterpret_cast<uint16_t const&>(source);
+
+    asm volatile( \
+        "{\n" \
+        "cvt.rn.f16x2.e5m2x2 %0, %1;\n" \
+        "}\n" : "=r"(reg): "h"(src_packed));
+
+    return out;
+  #else
+    result_type result;
+    NumericConverter<result_element, source_element, Round> converter;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < 2; ++i) {
+      result[i] = converter(source[i]);
+    }
+
+    return result;
+  #endif
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<float_e5m2_t, 2> <= Array<half, 2>.
+/// With CUDA_PTX_FP8_CVT_ENABLED the pair is converted by one PTX cvt instruction
+/// (the .rn.satfinite form is hard-coded, independent of Round); otherwise each element
+/// is converted by the scalar NumericConverter.
+template <
+  FloatRoundStyle Round
+>
+struct NumericArrayConverter<float_e5m2_t, cutlass::half_t, 2, Round> {
+  using result_element = cutlass::float_e5m2_t;
+  using source_element = cutlass::half_t;
+
+  using result_type = Array<result_element, 2>;
+  using source_type = Array<source_element, 2>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts two half-precision values to two e5m2 values (declared device-only).
+  CUTLASS_DEVICE
+  static result_type convert(source_type const & source) {
+
+  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
+    uint16_t out;
+
+    // The two 16-bit source elements are passed as one 32-bit register operand.
+    asm volatile( \
+        "{\n" \
+        "cvt.rn.satfinite.e5m2x2.f16x2   %0, %1;\n" \
+        "}" \
+        : "=h"(out) : "r"(reinterpret_cast<uint32_t const&>(source)));
+
+    // The 16-bit packed result is reinterpreted as the 2-element result array.
+    return reinterpret_cast<result_type const &>(out);
+  #else
+    result_type result;
+    NumericConverter<result_element, source_element, Round> converter;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < 2; ++i) {
+      result[i] = converter(source[i]);
+    }
+
+    return result;
+  #endif
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Partial specializations for Array<bfloat16_t, 2> <=> Array<float_e4m3_t, 2> and Array<float_e5m2_t, 2>
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Array<bfloat16_t, 2> <= Array<float_e4m3_t, 2>.
+/// With CUDA_PTX_FP8_CVT_ENABLED the conversion goes e4m3x2 -> f16x2 (PTX) -> float2 ->
+/// bfloat16 pair; otherwise each element is converted by the scalar NumericConverter.
+template <
+  FloatRoundStyle Round
+>
+struct NumericArrayConverter<cutlass::bfloat16_t, cutlass::float_e4m3_t, 2, Round> {
+  using result_element = cutlass::bfloat16_t;
+  using source_element = cutlass::float_e4m3_t;
+
+  using result_type = Array<result_element, 2>;
+  using source_type = Array<source_element, 2>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts two e4m3 values to two bfloat16 values (declared device-only).
+  CUTLASS_DEVICE
+  static result_type convert(source_type const & source) {
+
+  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
+    uint32_t res_half;
+    // The two 8-bit source elements are read as one 16-bit operand.
+    uint16_t const& src_packed = reinterpret_cast<uint16_t const&>(source);
+
+    asm volatile( \
+        "{\n" \
+        "cvt.rn.f16x2.e4m3x2 %0, %1;\n" \
+        "}\n" : "=r"(res_half): "h"(src_packed));
+    // Widen the half pair to float, then narrow to bfloat16 with the float converter.
+    float2 res_float = __half22float2(reinterpret_cast<__half2 &>(res_half));
+    NumericArrayConverter<cutlass::bfloat16_t, float, 2, Round> converter;
+    return converter(reinterpret_cast<Array<float, 2> const&>(res_float));
+  #else
+    result_type result;
+    NumericConverter<result_element, source_element, Round> converter;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < 2; ++i) {
+      result[i] = converter(source[i]);
+    }
+
+    return result;
+  #endif
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<float_e4m3_t, 2> <= Array<bfloat16_t, 2>.
+/// With CUDA_PTX_FP8_CVT_ENABLED the pair is first converted to float and then packed to
+/// e4m3x2 by one PTX cvt instruction (.rn.satfinite is hard-coded); otherwise each
+/// element is converted by the scalar NumericConverter.
+template <
+  FloatRoundStyle Round
+>
+struct NumericArrayConverter<float_e4m3_t, cutlass::bfloat16_t, 2, Round> {
+  using result_element = cutlass::float_e4m3_t;
+  using source_element = cutlass::bfloat16_t;
+
+  using result_type = Array<result_element, 2>;
+  using source_type = Array<source_element, 2>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts two bfloat16 values to two e4m3 values (declared device-only).
+  CUTLASS_DEVICE
+  static result_type convert(source_type const & source) {
+
+  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
+    // Step 1: bfloat16 pair -> float pair.
+    NumericArrayConverter<float, cutlass::bfloat16_t, 2, Round> converter;
+    Array<float, 2> res_float = converter(source);
+    uint16_t out;
+
+    // Step 2: float pair -> packed e4m3x2; operands are passed as (res_float[1], res_float[0]).
+    asm volatile( \
+        "{\n" \
+        "cvt.rn.satfinite.e4m3x2.f32   %0, %2, %1;\n" \
+        "}" \
+        : "=h"(out) : "f"(res_float[0]), "f"(res_float[1]));
+
+    return reinterpret_cast<result_type const &>(out);
+  #else
+    result_type result;
+    NumericConverter<result_element, source_element, Round> converter;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < 2; ++i) {
+      result[i] = converter(source[i]);
+    }
+
+    return result;
+  #endif
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<bfloat16_t, 2> <= Array<float_e5m2_t, 2>.
+/// With CUDA_PTX_FP8_CVT_ENABLED the conversion goes e5m2x2 -> f16x2 (PTX) -> float2 ->
+/// bfloat16 pair; otherwise each element is converted by the scalar NumericConverter.
+template <
+  FloatRoundStyle Round
+>
+struct NumericArrayConverter<cutlass::bfloat16_t, cutlass::float_e5m2_t, 2, Round> {
+  using result_element = cutlass::bfloat16_t;
+  using source_element = cutlass::float_e5m2_t;
+
+  using result_type = Array<result_element, 2>;
+  using source_type = Array<source_element, 2>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts two e5m2 values to two bfloat16 values (declared device-only).
+  CUTLASS_DEVICE
+  static result_type convert(source_type const & source) {
+
+  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
+    uint32_t res_half;
+    // The two 8-bit source elements are read as one 16-bit operand.
+    uint16_t const& src_packed = reinterpret_cast<uint16_t const&>(source);
+
+    asm volatile( \
+        "{\n" \
+        "cvt.rn.f16x2.e5m2x2 %0, %1;\n" \
+        "}\n" : "=r"(res_half): "h"(src_packed));
+    // Widen the half pair to float, then narrow to bfloat16 with the float converter.
+    float2 res_float = __half22float2(reinterpret_cast<__half2 &>(res_half));
+    NumericArrayConverter<cutlass::bfloat16_t, float, 2, Round> converter;
+    return converter(reinterpret_cast<Array<float, 2> const&>(res_float));
+  #else
+    result_type result;
+    NumericConverter<result_element, source_element, Round> converter;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < 2; ++i) {
+      result[i] = converter(source[i]);
+    }
+
+    return result;
+  #endif
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<float_e5m2_t, 2> <= Array<bfloat16_t, 2>.
+/// With CUDA_PTX_FP8_CVT_ENABLED the pair is first converted to float and then packed to
+/// e5m2x2 by one PTX cvt instruction (.rn.satfinite is hard-coded); otherwise each
+/// element is converted by the scalar NumericConverter.
+template <
+  FloatRoundStyle Round
+>
+struct NumericArrayConverter<float_e5m2_t, cutlass::bfloat16_t, 2, Round> {
+  using result_element = cutlass::float_e5m2_t;
+  using source_element = cutlass::bfloat16_t;
+
+  using result_type = Array<result_element, 2>;
+  using source_type = Array<source_element, 2>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts two bfloat16 values to two e5m2 values (declared device-only).
+  CUTLASS_DEVICE
+  static result_type convert(source_type const & source) {
+
+  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
+    // Step 1: bfloat16 pair -> float pair.
+    NumericArrayConverter<float, cutlass::bfloat16_t, 2, Round> converter;
+    Array<float, 2> res_float = converter(source);
+    uint16_t out;
+
+    // Step 2: float pair -> packed e5m2x2; operands are passed as (res_float[1], res_float[0]).
+    asm volatile( \
+        "{\n" \
+        "cvt.rn.satfinite.e5m2x2.f32   %0, %2, %1;\n" \
+        "}" \
+        : "=h"(out) : "f"(res_float[0]), "f"(res_float[1]));
+
+    return reinterpret_cast<result_type const &>(out);
+  #else
+    result_type result;
+    NumericConverter<result_element, source_element, Round> converter;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < 2; ++i) {
+      result[i] = converter(source[i]);
+    }
+
+    return result;
+  #endif
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+namespace detail {
+
+/// Special converters that can be used with 4 8-bit elements packed in a register.
+/// Common use is for fast FP8 converters.
+/// This primary template converts the four elements one by one with NumericConverter
+/// and applies the selected unary transform (identity or conjugate) to each result.
+template <
+  typename T,
+  typename S,
+  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest,
+  typename Transform = cutlass::transform::thread::UnaryTransform::Identity
+>
+struct NumericArrayConverterPacked4Element {
+  using result_type = Array<T, 4>;
+  using source_type = Array<S, 4>;
+  static FloatRoundStyle const round_style = Round;
+
+  static_assert(platform::is_same<Transform, cutlass::transform::thread::UnaryTransform::Identity>::value ||
+                platform::is_same<Transform, cutlass::transform::thread::UnaryTransform::Conjugate>::value,
+                  "Unary Operator not supported.");
+
+  /// Converts the four elements of `s`, applying the selected unary transform.
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & s) {
+
+    // Selects between the identity path and the conjugate path.
+    bool const kIsIdentity =
+      platform::is_same<Transform, cutlass::transform::thread::UnaryTransform::Identity>::value;
+
+    result_type converted;
+    NumericConverter<T, S, Round> element_converter;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < 4; ++idx) {
+      if (kIsIdentity) {
+        converted[idx] = element_converter(s[idx]);
+      }
+      else {
+        converted[idx] = conj(element_converter(s[idx]));
+      }
+    }
+
+    return converted;
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<float, 4> <= Array<float_e4m3_t, 4>
+///
+/// Widens four e4m3 values to four floats. With CUDA_PTX_FP8_CVT_ENABLED the
+/// four source bytes are read as one 32-bit word, converted to two f16x2
+/// registers in PTX, and then widened to float with __half22float2. Without
+/// it, each element goes through the scalar NumericConverter.
+template <
+  FloatRoundStyle Round
+>
+struct NumericArrayConverterPacked4Element<float, cutlass::float_e4m3_t, Round> {
+  using result_element = float;
+  using source_element = cutlass::float_e4m3_t;
+
+  using result_type = Array<result_element, 4>;
+  using source_type = Array<source_element, 4>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts all four elements of `source` to float.
+  CUTLASS_DEVICE
+  static result_type convert(source_type const & source) {
+
+  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
+    // Two 32-bit registers, each receiving one f16x2 pair.
+    uint32_t out_fp16[2];
+    // View the source array as a single 32-bit word.
+    // assumes Array<float_e4m3_t, 4> occupies exactly 4 contiguous bytes -- TODO confirm
+    uint32_t const& src_packed = reinterpret_cast<uint32_t const&>(source);
+
+    // Split the word into 16-bit halves (lo, hi) and convert each half, i.e.
+    // each pair of fp8 values, into an f16x2 register.
+    asm volatile( \
+        "{\n" \
+        ".reg .b16 lo, hi;\n" \
+        "mov.b32 {lo, hi}, %2;\n" \
+        "cvt.rn.f16x2.e4m3x2 %0, lo;\n" \
+        "cvt.rn.f16x2.e4m3x2 %1, hi;\n" \
+        "}\n" : "=r"(out_fp16[0]), "=r"(out_fp16[1]) : "r"(src_packed));
+
+    // Widen each half pair to a float pair.
+    float2 res0 = __half22float2(reinterpret_cast<__half2 &>(out_fp16[0]));
+    float2 res1 = __half22float2(reinterpret_cast<__half2 &>(out_fp16[1]));
+
+    // res0 supplies elements 0-1 (from `lo`), res1 elements 2-3 (from `hi`).
+    result_type out;
+    out[0] = res0.x;
+    out[1] = res0.y;
+    out[2] = res1.x;
+    out[3] = res1.y;
+    return out;
+  #else
+    // Fallback: convert one element at a time.
+    result_type result;
+    NumericConverter<result_element, source_element, Round> converter;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < 4; ++i) {
+      result[i] = converter(source[i]);
+    }
+
+    return result;
+  #endif
+  }
+
+  /// Function-call form; forwards to convert().
+  /// NOTE(review): this is CUTLASS_HOST_DEVICE while convert() is CUTLASS_DEVICE;
+  /// presumably only device callers are expected -- confirm.
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<float_e4m3_t, 4> <= Array<float, 4>
+///
+/// Narrows four floats to four e4m3 values. With CUDA_PTX_FP8_CVT_ENABLED two
+/// packed `cvt.rn.satfinite.e4m3x2.f32` instructions produce two 16-bit halves
+/// that are merged into one 32-bit word. The `.rn.satfinite` modifiers are
+/// fixed in the asm string, so `Round` is not consulted on that path. Without
+/// the macro, each element goes through the scalar NumericConverter.
+template <
+  FloatRoundStyle Round
+>
+struct NumericArrayConverterPacked4Element<float_e4m3_t, float, Round> {
+  using result_element = cutlass::float_e4m3_t;
+  using source_element = float;
+
+  using result_type = Array<result_element, 4>;
+  using source_type = Array<source_element, 4>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts all four elements of `source` to float_e4m3_t.
+  CUTLASS_DEVICE
+  static result_type convert(source_type const & source) {
+
+  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
+    uint32_t out;
+
+    // `lo` is built from source[1], source[0] and `hi` from source[3], source[2]
+    // (operands are deliberately passed in swapped order), then {lo, hi} are
+    // merged into `out`.
+    // NOTE(review): per the PTX ISA the first source operand of the packed cvt
+    // lands in the upper byte, which keeps element order ascending -- confirm.
+    asm volatile( \
+        "{\n" \
+        ".reg .b16 lo;\n" \
+        ".reg .b16 hi;\n" \
+        "cvt.rn.satfinite.e4m3x2.f32   lo, %2, %1;\n" \
+        "cvt.rn.satfinite.e4m3x2.f32   hi, %4, %3;\n" \
+        "mov.b32 %0, {lo, hi};\n" \
+        "}" \
+        : "=r"(out) : "f"(source[0]), "f"(source[1]), "f"(source[2]), "f"(source[3]));
+
+    // The 32-bit word is reinterpreted as the four-element result array.
+    return reinterpret_cast<result_type const &>(out);
+  #else
+    // Fallback: convert one element at a time.
+    result_type result;
+    NumericConverter<result_element, source_element, Round> converter;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < 4; ++i) {
+      result[i] = converter(source[i]);
+    }
+
+    return result;
+  #endif
+  }
+
+  /// Function-call form; forwards to convert().
+  /// NOTE(review): this is CUTLASS_HOST_DEVICE while convert() is CUTLASS_DEVICE;
+  /// presumably only device callers are expected -- confirm.
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Partial specializations for Array<float, 4> <=> Array<float_e5m2_t, 4>
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Array<float, 4> <= Array<float_e5m2_t, 4>
+///
+/// Widens four e5m2 values to four floats. With CUDA_PTX_FP8_CVT_ENABLED the
+/// four source bytes are read as one 32-bit word, converted to two f16x2
+/// registers in PTX, and then widened to float with __half22float2. Without
+/// it, each element goes through the scalar NumericConverter.
+template <
+  FloatRoundStyle Round
+>
+struct NumericArrayConverterPacked4Element<float, cutlass::float_e5m2_t, Round> {
+  using result_element = float;
+  using source_element = cutlass::float_e5m2_t;
+
+  using result_type = Array<result_element, 4>;
+  using source_type = Array<source_element, 4>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts all four elements of `source` to float.
+  CUTLASS_DEVICE
+  static result_type convert(source_type const & source) {
+
+  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
+    // Two 32-bit registers, each receiving one f16x2 pair.
+    uint32_t out_fp16[2];
+    // View the source array as a single 32-bit word.
+    // assumes Array<float_e5m2_t, 4> occupies exactly 4 contiguous bytes -- TODO confirm
+    uint32_t const& src_packed = reinterpret_cast<uint32_t const&>(source);
+
+    // Split the word into 16-bit halves (lo, hi) and convert each half, i.e.
+    // each pair of fp8 values, into an f16x2 register.
+    asm volatile( \
+        "{\n" \
+        ".reg .b16 lo, hi;\n" \
+        "mov.b32 {lo, hi}, %2;\n" \
+        "cvt.rn.f16x2.e5m2x2 %0, lo;\n" \
+        "cvt.rn.f16x2.e5m2x2 %1, hi;\n" \
+        "}\n" : "=r"(out_fp16[0]), "=r"(out_fp16[1]) : "r"(src_packed));
+
+    // Widen each half pair to a float pair.
+    float2 res0 = __half22float2(reinterpret_cast<__half2 &>(out_fp16[0]));
+    float2 res1 = __half22float2(reinterpret_cast<__half2 &>(out_fp16[1]));
+
+    // res0 supplies elements 0-1 (from `lo`), res1 elements 2-3 (from `hi`).
+    result_type out;
+    out[0] = res0.x;
+    out[1] = res0.y;
+    out[2] = res1.x;
+    out[3] = res1.y;
+    return out;
+  #else
+    // Fallback: convert one element at a time.
+    result_type result;
+    NumericConverter<result_element, source_element, Round> converter;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < 4; ++i) {
+      result[i] = converter(source[i]);
+    }
+
+    return result;
+  #endif
+  }
+
+  /// Function-call form; forwards to convert().
+  /// NOTE(review): this is CUTLASS_HOST_DEVICE while convert() is CUTLASS_DEVICE;
+  /// presumably only device callers are expected -- confirm.
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<float_e5m2_t, 4> <= Array<float, 4>
+///
+/// Narrows four floats to four e5m2 values. With CUDA_PTX_FP8_CVT_ENABLED two
+/// packed `cvt.rn.satfinite.e5m2x2.f32` instructions produce two 16-bit halves
+/// that are merged into one 32-bit word. The `.rn.satfinite` modifiers are
+/// fixed in the asm string, so `Round` is not consulted on that path. Without
+/// the macro, each element goes through the scalar NumericConverter.
+template <
+  FloatRoundStyle Round
+>
+struct NumericArrayConverterPacked4Element<float_e5m2_t, float, Round> {
+  using result_element = cutlass::float_e5m2_t;
+  using source_element = float;
+
+  using result_type = Array<result_element, 4>;
+  using source_type = Array<source_element, 4>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts all four elements of `source` to float_e5m2_t.
+  CUTLASS_DEVICE
+  static result_type convert(source_type const & source) {
+
+  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
+    uint32_t out;
+
+    // `lo` is built from source[1], source[0] and `hi` from source[3], source[2]
+    // (operands are deliberately passed in swapped order), then {lo, hi} are
+    // merged into `out`.
+    // NOTE(review): per the PTX ISA the first source operand of the packed cvt
+    // lands in the upper byte, which keeps element order ascending -- confirm.
+    asm volatile( \
+        "{\n" \
+        ".reg .b16 lo;\n" \
+        ".reg .b16 hi;\n" \
+        "cvt.rn.satfinite.e5m2x2.f32   lo, %2, %1;\n" \
+        "cvt.rn.satfinite.e5m2x2.f32   hi, %4, %3;\n" \
+        "mov.b32 %0, {lo, hi};\n" \
+        "}" \
+        : "=r"(out) : "f"(source[0]), "f"(source[1]), "f"(source[2]), "f"(source[3]));
+
+    // The 32-bit word is reinterpreted as the four-element result array.
+    return reinterpret_cast<result_type const &>(out);
+  #else
+    // Fallback: convert one element at a time.
+    result_type result;
+    NumericConverter<result_element, source_element, Round> converter;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < 4; ++i) {
+      result[i] = converter(source[i]);
+    }
+
+    return result;
+  #endif
+  }
+
+  /// Function-call form; forwards to convert().
+  /// NOTE(review): this is CUTLASS_HOST_DEVICE while convert() is CUTLASS_DEVICE;
+  /// presumably only device callers are expected -- confirm.
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Partial specializations for Array<cutlass::half_t, 4> <=> Array<float_e4m3_t, 4>
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Array<cutlass::half_t, 4> <= Array<float_e4m3_t, 4>
+///
+/// Widens four e4m3 values to four half_t values. With CUDA_PTX_FP8_CVT_ENABLED
+/// the four source bytes are read as one 32-bit word and each 16-bit half is
+/// converted to an f16x2 register; the two registers are returned directly as
+/// the result array. Without the macro, each element goes through the scalar
+/// NumericConverter.
+template <
+  FloatRoundStyle Round
+>
+struct NumericArrayConverterPacked4Element<cutlass::half_t, cutlass::float_e4m3_t, Round> {
+  using result_element = cutlass::half_t;
+  using source_element = cutlass::float_e4m3_t;
+
+  using result_type = Array<result_element, 4>;
+  using source_type = Array<source_element, 4>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts all four elements of `source` to half_t.
+  CUTLASS_DEVICE
+  static result_type convert(source_type const & source) {
+
+  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
+    // out[0] receives the f16x2 pair from the low half, out[1] from the high half.
+    uint32_t out[2];
+    // View the source array as a single 32-bit word.
+    // assumes Array<float_e4m3_t, 4> occupies exactly 4 contiguous bytes -- TODO confirm
+    uint32_t const& src_packed = reinterpret_cast<uint32_t const&>(source);
+    asm volatile( \
+        "{\n" \
+        ".reg .b16 lo, hi;\n" \
+        "mov.b32 {lo, hi}, %2;\n" \
+        "cvt.rn.f16x2.e4m3x2 %0, lo;\n" \
+        "cvt.rn.f16x2.e4m3x2 %1, hi;\n" \
+        "}\n" : "=r"(out[0]), "=r"(out[1]) : "r"(src_packed));
+    // The two 32-bit registers are reinterpreted as four half_t values.
+    return reinterpret_cast<result_type const &>(out);
+  #else
+    // Fallback: convert one element at a time.
+    result_type result;
+    NumericConverter<result_element, source_element, Round> converter;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < 4; ++i) {
+      result[i] = converter(source[i]);
+    }
+
+    return result;
+  #endif
+  }
+
+  /// Function-call form; forwards to convert().
+  /// NOTE(review): this is CUTLASS_HOST_DEVICE while convert() is CUTLASS_DEVICE;
+  /// presumably only device callers are expected -- confirm.
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<float_e4m3_t, 4> <= Array<cutlass::half_t, 4>
+///
+/// Narrows four half_t values to four e4m3 values. With CUDA_PTX_FP8_CVT_ENABLED
+/// the source is read as two 32-bit words (one f16x2 pair each), each word is
+/// converted with `cvt.rn.satfinite.e4m3x2.f16x2`, and the two 16-bit results
+/// are merged into one 32-bit word. The `.rn.satfinite` modifiers are fixed in
+/// the asm string, so `Round` is not consulted on that path. Without the macro,
+/// each element goes through the scalar NumericConverter.
+template <
+  FloatRoundStyle Round
+>
+struct NumericArrayConverterPacked4Element<float_e4m3_t, cutlass::half_t, Round> {
+  using result_element = cutlass::float_e4m3_t;
+  using source_element = cutlass::half_t;
+
+  using result_type = Array<result_element, 4>;
+  using source_type = Array<source_element, 4>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts all four elements of `source` to float_e4m3_t.
+  CUTLASS_DEVICE
+  static result_type convert(source_type const & source) {
+
+  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
+    uint32_t out;
+    // src_packed[0] holds elements 0-1 and src_packed[1] elements 2-3.
+    // assumes Array<half_t, 4> is 8 contiguous bytes -- TODO confirm
+    uint32_t const* src_packed = reinterpret_cast<uint32_t const*>(&source);
+
+    // Convert each f16x2 word to a packed fp8 pair, then merge {lo, hi}.
+    asm volatile( \
+        "{\n" \
+        ".reg .b16 lo;\n" \
+        ".reg .b16 hi;\n" \
+        "cvt.rn.satfinite.e4m3x2.f16x2   lo, %1;\n" \
+        "cvt.rn.satfinite.e4m3x2.f16x2   hi, %2;\n" \
+        "mov.b32 %0, {lo, hi};\n" \
+        "}" \
+        : "=r"(out) : "r"(src_packed[0]), "r"(src_packed[1]));
+
+    // The 32-bit word is reinterpreted as the four-element result array.
+    return reinterpret_cast<result_type const &>(out);
+  #else
+    // Fallback: convert one element at a time.
+    result_type result;
+    NumericConverter<result_element, source_element, Round> converter;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < 4; ++i) {
+      result[i] = converter(source[i]);
+    }
+
+    return result;
+  #endif
+  }
+
+  /// Function-call form; forwards to convert().
+  /// NOTE(review): this is CUTLASS_HOST_DEVICE while convert() is CUTLASS_DEVICE;
+  /// presumably only device callers are expected -- confirm.
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Partial specializations for Array<cutlass::half_t, 4> <=> Array<float_e5m2_t, 4>
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Array<cutlass::half_t, 4> <= Array<float_e5m2_t, 4>
+///
+/// Widens four e5m2 values to four half_t values. With CUDA_PTX_FP8_CVT_ENABLED
+/// the four source bytes are read as one 32-bit word and each 16-bit half is
+/// converted to an f16x2 register; the two registers are returned directly as
+/// the result array. Without the macro, each element goes through the scalar
+/// NumericConverter.
+template <
+  FloatRoundStyle Round
+>
+struct NumericArrayConverterPacked4Element<cutlass::half_t, cutlass::float_e5m2_t, Round> {
+  using result_element = cutlass::half_t;
+  using source_element = cutlass::float_e5m2_t;
+
+  using result_type = Array<result_element, 4>;
+  using source_type = Array<source_element, 4>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts all four elements of `source` to half_t.
+  CUTLASS_DEVICE
+  static result_type convert(source_type const & source) {
+
+  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
+    // out[0] receives the f16x2 pair from the low half, out[1] from the high half.
+    uint32_t out[2];
+    // View the source array as a single 32-bit word.
+    // assumes Array<float_e5m2_t, 4> occupies exactly 4 contiguous bytes -- TODO confirm
+    uint32_t const& src_packed = reinterpret_cast<uint32_t const&>(source);
+    asm volatile( \
+        "{\n" \
+        ".reg .b16 lo, hi;\n" \
+        "mov.b32 {lo, hi}, %2;\n" \
+        "cvt.rn.f16x2.e5m2x2 %0, lo;\n" \
+        "cvt.rn.f16x2.e5m2x2 %1, hi;\n" \
+        "}\n" : "=r"(out[0]), "=r"(out[1]) : "r"(src_packed));
+    // The two 32-bit registers are reinterpreted as four half_t values.
+    return reinterpret_cast<result_type const &>(out);
+  #else
+    // Fallback: convert one element at a time.
+    result_type result;
+    NumericConverter<result_element, source_element, Round> converter;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < 4; ++i) {
+      result[i] = converter(source[i]);
+    }
+
+    return result;
+  #endif
+  }
+
+  /// Function-call form; forwards to convert().
+  /// NOTE(review): this is CUTLASS_HOST_DEVICE while convert() is CUTLASS_DEVICE;
+  /// presumably only device callers are expected -- confirm.
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<float_e5m2_t, 4> <= Array<cutlass::half_t, 4>
+///
+/// Narrows four half_t values to four e5m2 values. With CUDA_PTX_FP8_CVT_ENABLED
+/// the source is read as two 32-bit words (one f16x2 pair each), each word is
+/// converted with `cvt.rn.satfinite.e5m2x2.f16x2`, and the two 16-bit results
+/// are merged into one 32-bit word. The `.rn.satfinite` modifiers are fixed in
+/// the asm string, so `Round` is not consulted on that path. Without the macro,
+/// each element goes through the scalar NumericConverter.
+template <
+  FloatRoundStyle Round
+>
+struct NumericArrayConverterPacked4Element<float_e5m2_t, cutlass::half_t, Round> {
+  using result_element = cutlass::float_e5m2_t;
+  using source_element = cutlass::half_t;
+
+  using result_type = Array<result_element, 4>;
+  using source_type = Array<source_element, 4>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts all four elements of `source` to float_e5m2_t.
+  CUTLASS_DEVICE
+  static result_type convert(source_type const & source) {
+
+  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
+    uint32_t out;
+    // src_packed[0] holds elements 0-1 and src_packed[1] elements 2-3.
+    // assumes Array<half_t, 4> is 8 contiguous bytes -- TODO confirm
+    uint32_t const* src_packed = reinterpret_cast<uint32_t const*>(&source);
+
+    // Convert each f16x2 word to a packed fp8 pair, then merge {lo, hi}.
+    asm volatile( \
+        "{\n" \
+        ".reg .b16 lo;\n" \
+        ".reg .b16 hi;\n" \
+        "cvt.rn.satfinite.e5m2x2.f16x2   lo, %1;\n" \
+        "cvt.rn.satfinite.e5m2x2.f16x2   hi, %2;\n" \
+        "mov.b32 %0, {lo, hi};\n" \
+        "}" \
+        : "=r"(out) : "r"(src_packed[0]), "r"(src_packed[1]));
+
+    // The 32-bit word is reinterpreted as the four-element result array.
+    return reinterpret_cast<result_type const &>(out);
+  #else
+    // Fallback: convert one element at a time.
+    result_type result;
+    NumericConverter<result_element, source_element, Round> converter;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < 4; ++i) {
+      result[i] = converter(source[i]);
+    }
+
+    return result;
+  #endif
+  }
+
+  /// Function-call form; forwards to convert().
+  /// NOTE(review): this is CUTLASS_HOST_DEVICE while convert() is CUTLASS_DEVICE;
+  /// presumably only device callers are expected -- confirm.
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Partial specializations for Array<cutlass::bfloat16_t, 4> <=> Array<float_e4m3_t, 4>
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Array<cutlass::bfloat16_t, 4> <= Array<float_e4m3_t, 4>
+///
+/// With CUDA_PTX_FP8_CVT_ENABLED the conversion is staged through float:
+/// the packed fp8 -> float converter widens all four values, then the float
+/// quad is narrowed to bfloat16 two elements at a time. Without the macro,
+/// each element goes through the scalar NumericConverter.
+template <
+  FloatRoundStyle Round
+>
+struct NumericArrayConverterPacked4Element<cutlass::bfloat16_t, cutlass::float_e4m3_t, Round> {
+  using result_element = cutlass::bfloat16_t;
+  using source_element = cutlass::float_e4m3_t;
+
+  using result_type = Array<result_element, 4>;
+  using source_type = Array<source_element, 4>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts all four elements of `source` to bfloat16_t.
+  CUTLASS_DEVICE
+  static result_type convert(source_type const & source) {
+
+  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
+    using float_pair = Array<float, 2>;
+    using result_pair = Array<result_element, 2>;
+
+    // Stage 1: fp8 quad -> float quad.
+    NumericArrayConverterPacked4Element<float, source_element, Round> widen;
+    Array<float, 4> widened = widen(source);
+
+    // Stage 2: view both quads as two pairs and narrow pair by pair.
+    result_type narrowed;
+    float_pair* in_pairs = reinterpret_cast<float_pair*>(&widened);
+    result_pair* out_pairs = reinterpret_cast<result_pair*>(&narrowed);
+    NumericArrayConverter<result_element, float, 2, Round> narrow;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int pair = 0; pair < 2; ++pair) {
+      out_pairs[pair] = narrow(in_pairs[pair]);
+    }
+
+    return narrowed;
+  #else
+    // Fallback: convert one element at a time.
+    NumericConverter<result_element, source_element, Round> scalar_converter;
+    result_type converted;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int lane = 0; lane < 4; ++lane) {
+      converted[lane] = scalar_converter(source[lane]);
+    }
+
+    return converted;
+  #endif
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<float_e4m3_t, 4> <= Array<cutlass::bfloat16_t, 4>
+///
+/// With CUDA_PTX_FP8_CVT_ENABLED the conversion is staged through float:
+/// the bfloat16 quad is widened to float two elements at a time, then the
+/// packed float -> fp8 converter narrows all four values. Without the macro,
+/// each element goes through the scalar NumericConverter.
+template <
+  FloatRoundStyle Round
+>
+struct NumericArrayConverterPacked4Element<float_e4m3_t, cutlass::bfloat16_t, Round> {
+  using result_element = cutlass::float_e4m3_t;
+  using source_element = cutlass::bfloat16_t;
+
+  using result_type = Array<result_element, 4>;
+  using source_type = Array<source_element, 4>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts all four elements of `source` to float_e4m3_t.
+  CUTLASS_DEVICE
+  static result_type convert(source_type const & source) {
+
+  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
+    using float_pair = Array<float, 2>;
+    using source_pair = Array<source_element, 2>;
+
+    // Stage 1: view source and scratch as two pairs and widen pair by pair.
+    Array<float, 4> widened;
+    float_pair* out_pairs = reinterpret_cast<float_pair*>(&widened);
+    source_pair const* in_pairs = reinterpret_cast<source_pair const*>(&source);
+    NumericArrayConverter<float, source_element, 2, Round> widen;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int pair = 0; pair < 2; ++pair) {
+      out_pairs[pair] = widen(in_pairs[pair]);
+    }
+
+    // Stage 2: float quad -> fp8 quad.
+    NumericArrayConverterPacked4Element<result_element, float, Round> narrow;
+    return narrow(widened);
+  #else
+    // Fallback: convert one element at a time.
+    NumericConverter<result_element, source_element, Round> scalar_converter;
+    result_type converted;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int lane = 0; lane < 4; ++lane) {
+      converted[lane] = scalar_converter(source[lane]);
+    }
+
+    return converted;
+  #endif
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Partial specializations for Array<cutlass::bfloat16_t, 4> <=> Array<float_e5m2_t, 4>
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Array<cutlass::bfloat16_t, 4> <= Array<float_e5m2_t, 4>
+///
+/// With CUDA_PTX_FP8_CVT_ENABLED the conversion is staged through float:
+/// the packed fp8 -> float converter widens all four values, then the float
+/// quad is narrowed to bfloat16 two elements at a time. Without the macro,
+/// each element goes through the scalar NumericConverter.
+template <
+  FloatRoundStyle Round
+>
+struct NumericArrayConverterPacked4Element<cutlass::bfloat16_t, cutlass::float_e5m2_t, Round> {
+  using result_element = cutlass::bfloat16_t;
+  using source_element = cutlass::float_e5m2_t;
+
+  using result_type = Array<result_element, 4>;
+  using source_type = Array<source_element, 4>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts all four elements of `source` to bfloat16_t.
+  CUTLASS_DEVICE
+  static result_type convert(source_type const & source) {
+
+  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
+    using float_pair = Array<float, 2>;
+    using result_pair = Array<result_element, 2>;
+
+    // Stage 1: fp8 quad -> float quad.
+    NumericArrayConverterPacked4Element<float, source_element, Round> widen;
+    Array<float, 4> widened = widen(source);
+
+    // Stage 2: view both quads as two pairs and narrow pair by pair.
+    result_type narrowed;
+    float_pair* in_pairs = reinterpret_cast<float_pair*>(&widened);
+    result_pair* out_pairs = reinterpret_cast<result_pair*>(&narrowed);
+    NumericArrayConverter<result_element, float, 2, Round> narrow;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int pair = 0; pair < 2; ++pair) {
+      out_pairs[pair] = narrow(in_pairs[pair]);
+    }
+
+    return narrowed;
+  #else
+    // Fallback: convert one element at a time.
+    NumericConverter<result_element, source_element, Round> scalar_converter;
+    result_type converted;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int lane = 0; lane < 4; ++lane) {
+      converted[lane] = scalar_converter(source[lane]);
+    }
+
+    return converted;
+  #endif
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<float_e5m2_t, 4> <= Array<cutlass::bfloat16_t, 4>
+///
+/// With CUDA_PTX_FP8_CVT_ENABLED the conversion is staged through float:
+/// the bfloat16 quad is widened to float two elements at a time, then the
+/// packed float -> fp8 converter narrows all four values. Without the macro,
+/// each element goes through the scalar NumericConverter.
+template <
+  FloatRoundStyle Round
+>
+struct NumericArrayConverterPacked4Element<float_e5m2_t, cutlass::bfloat16_t, Round> {
+  using result_element = cutlass::float_e5m2_t;
+  using source_element = cutlass::bfloat16_t;
+
+  using result_type = Array<result_element, 4>;
+  using source_type = Array<source_element, 4>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts all four elements of `source` to float_e5m2_t.
+  CUTLASS_DEVICE
+  static result_type convert(source_type const & source) {
+
+  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
+    using float_pair = Array<float, 2>;
+    using source_pair = Array<source_element, 2>;
+
+    // Stage 1: view source and scratch as two pairs and widen pair by pair.
+    Array<float, 4> widened;
+    float_pair* out_pairs = reinterpret_cast<float_pair*>(&widened);
+    source_pair const* in_pairs = reinterpret_cast<source_pair const*>(&source);
+    NumericArrayConverter<float, source_element, 2, Round> widen;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int pair = 0; pair < 2; ++pair) {
+      out_pairs[pair] = widen(in_pairs[pair]);
+    }
+
+    // Stage 2: float quad -> fp8 quad.
+    NumericArrayConverterPacked4Element<result_element, float, Round> narrow;
+    return narrow(widened);
+  #else
+    // Fallback: convert one element at a time.
+    NumericConverter<result_element, source_element, Round> scalar_converter;
+    result_type converted;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int lane = 0; lane < 4; ++lane) {
+      converted[lane] = scalar_converter(source[lane]);
+    }
+
+    return converted;
+  #endif
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Partial specializations for Array<float_e4m3_t, 4> <=> Array<float_e5m2_t, 4>
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Array<float_e4m3_t, 4> <= Array<float_e5m2_t, 4>
+///
+/// There is no packed instruction path for this pair: each of the four
+/// elements is converted individually with the scalar NumericConverter.
+template <
+  FloatRoundStyle Round
+>
+struct NumericArrayConverterPacked4Element<float_e4m3_t, cutlass::float_e5m2_t, Round> {
+  using result_element = cutlass::float_e4m3_t;
+  using source_element = cutlass::float_e5m2_t;
+
+  using result_type = Array<result_element, 4>;
+  using source_type = Array<source_element, 4>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Element-wise conversion of the four values in `source`.
+  CUTLASS_DEVICE
+  static result_type convert(source_type const & source) {
+    NumericConverter<result_element, source_element, Round> scalar_converter;
+    result_type converted;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int lane = 0; lane < 4; ++lane) {
+      converted[lane] = scalar_converter(source[lane]);
+    }
+
+    return converted;
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<float_e5m2_t, 4> <= Array<float_e4m3_t, 4>
+///
+/// There is no packed instruction path for this pair: each of the four
+/// elements is converted individually with the scalar NumericConverter.
+template <
+  FloatRoundStyle Round
+>
+struct NumericArrayConverterPacked4Element<float_e5m2_t, cutlass::float_e4m3_t, Round> {
+  using result_element = cutlass::float_e5m2_t;
+  using source_element = cutlass::float_e4m3_t;
+
+  using result_type = Array<result_element, 4>;
+  using source_type = Array<source_element, 4>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Element-wise conversion of the four values in `source`.
+  CUTLASS_DEVICE
+  static result_type convert(source_type const & source) {
+    NumericConverter<result_element, source_element, Round> scalar_converter;
+    result_type converted;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int lane = 0; lane < 4; ++lane) {
+      converted[lane] = scalar_converter(source[lane]);
+    }
+
+    return converted;
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Partial specializations for:
+//       Array<T, N> <=> Array<float_e4m3_t, N>
+//       Array<T, N> <=> Array<float_e5m2_t, N>
+// using packed converter under the hood
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Converts Array<S, N> to Array<T, N> by running the 4-element packed
+/// converter over every complete group of four elements and the scalar
+/// NumericConverter over the remaining N % 4 elements.
+template <
+  typename T,
+  typename S,
+  int N,
+  FloatRoundStyle Round
+>
+struct PackedNumericArrayConverter {
+  using result_element = T;
+  using source_element = S;
+
+  using result_type = Array<result_element, N>;
+  using source_type = Array<source_element, N>;
+
+  static FloatRoundStyle const round_style = Round;
+
+private:
+  using result_quad = Array<result_element, 4>;
+  using source_quad = Array<source_element, 4>;
+
+public:
+  /// Converts every element of `source`.
+  CUTLASS_DEVICE
+  static result_type convert(source_type const & source) {
+    int const quad_count = N / 4;
+
+    result_type converted;
+
+    // View both arrays as sequences of 4-element groups.
+    source_quad const* in_quads = reinterpret_cast<source_quad const*>(&source);
+    result_quad* out_quads = reinterpret_cast<result_quad*>(&converted);
+
+    detail::NumericArrayConverterPacked4Element<result_element, source_element, Round> quad_converter;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int quad = 0; quad < quad_count; ++quad) {
+      out_quads[quad] = quad_converter(in_quads[quad]);
+    }
+
+    // Tail: the last N % 4 elements, which do not fill a whole group.
+    NumericConverter<result_element, source_element, Round> scalar_converter;
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = quad_count * 4; idx < N; ++idx) {
+      converted[idx] = scalar_converter(source[idx]);
+    }
+
+    return converted;
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const{
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<T, N> <= Array<float_e4m3_t, N>
+///
+/// Any conversion whose source element is float_e4m3_t inherits its
+/// implementation from PackedNumericArrayConverter.
+template <typename T, int N, FloatRoundStyle Round>
+struct NumericArrayConverter<T, cutlass::float_e4m3_t, N, Round>
+  : PackedNumericArrayConverter<T, cutlass::float_e4m3_t, N, Round> {
+};
+
+/// Partial specialization for Array<T, N> <= Array<float_e5m2_t, N>
+///
+/// Any conversion whose source element is float_e5m2_t inherits its
+/// implementation from PackedNumericArrayConverter.
+template <typename T, int N, FloatRoundStyle Round>
+struct NumericArrayConverter<T, cutlass::float_e5m2_t, N, Round>
+  : PackedNumericArrayConverter<T, cutlass::float_e5m2_t, N, Round> {
+};
+
+/// Partial specialization for Array<float_e4m3_t, N> <= Array<S, N>
+///
+/// Any conversion whose result element is float_e4m3_t inherits its
+/// implementation from PackedNumericArrayConverter.
+template <typename S, int N, FloatRoundStyle Round>
+struct NumericArrayConverter<float_e4m3_t, S, N, Round>
+  : PackedNumericArrayConverter<float_e4m3_t, S, N, Round> {
+};
+
+/// Partial specialization for Array<float_e5m2_t, N> <= Array<S, N>
+///
+/// Any conversion whose result element is float_e5m2_t inherits its
+/// implementation from PackedNumericArrayConverter.
+template <typename S, int N, FloatRoundStyle Round>
+struct NumericArrayConverter<float_e5m2_t, S, N, Round>
+  : PackedNumericArrayConverter<float_e5m2_t, S, N, Round> {
+};
+
+/// Partial specialization for Array<float_e4m3_t, N> <= Array<float_e5m2_t, N>
+///
+/// Both element types are FP8, so the "FP8 result" and "FP8 source"
+/// specializations would match equally well; this more specialized form
+/// selects PackedNumericArrayConverter unambiguously.
+template <int N, FloatRoundStyle Round>
+struct NumericArrayConverter<float_e4m3_t, cutlass::float_e5m2_t, N, Round>
+  : PackedNumericArrayConverter<float_e4m3_t, cutlass::float_e5m2_t, N, Round> {
+};
+
+/// Partial specialization for Array<float_e5m2_t, N> <= Array<float_e4m3_t, N>
+///
+/// Both element types are FP8, so the "FP8 result" and "FP8 source"
+/// specializations would match equally well; this more specialized form
+/// selects PackedNumericArrayConverter unambiguously.
+template <int N, FloatRoundStyle Round>
+struct NumericArrayConverter<float_e5m2_t, cutlass::float_e4m3_t, N, Round>
+  : PackedNumericArrayConverter<float_e5m2_t, cutlass::float_e4m3_t, N, Round> {
+};
+
+/// Partial specialization for Array<float_e4m3_t, N> <= Array<float_e4m3_t, N>
+///
+/// Same-type case. It is spelled out so that the "FP8 result" and
+/// "FP8 source" specializations do not tie; it inherits from
+/// PackedNumericArrayConverter like the others.
+template <int N, FloatRoundStyle Round>
+struct NumericArrayConverter<float_e4m3_t, cutlass::float_e4m3_t, N, Round>
+  : PackedNumericArrayConverter<float_e4m3_t, cutlass::float_e4m3_t, N, Round> {
+};
+
+/// Partial specialization for Array<float_e5m2_t, N> <= Array<float_e5m2_t, N>
+///
+/// Same-type case. It is spelled out so that the "FP8 result" and
+/// "FP8 source" specializations do not tie; it inherits from
+/// PackedNumericArrayConverter like the others.
+template <int N, FloatRoundStyle Round>
+struct NumericArrayConverter<float_e5m2_t, cutlass::float_e5m2_t, N, Round>
+  : PackedNumericArrayConverter<float_e5m2_t, cutlass::float_e5m2_t, N, Round> {
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Array<int8_t, 1> <= Array<float, 1>
+///
+/// Single-element case: the lone value is handed to the scalar
+/// NumericConverter<int8_t, float, Round>.
+/// Conversion is performed with saturation regardless of setting of
+/// the `Round` template parameter.
+template <
+  FloatRoundStyle Round
+>
+struct NumericArrayConverter<int8_t, float, 1, Round> {
+
+  using result_type = Array<int8_t, 1>;
+  using source_type = Array<float, 1>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts the single float in `source` to int8_t.
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & source) {
+    result_type converted;
+    NumericConverter<int8_t, float, Round> scalar_converter;
+    converted[0] = scalar_converter(source[0]);
+    return converted;
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<uint8_t, 1> <= Array<float, 1>
+///
+/// Single-element case: the lone value is handed to the scalar
+/// NumericConverter<uint8_t, float, Round>.
+template <
+  FloatRoundStyle Round
+>
+struct NumericArrayConverter<uint8_t, float, 1, Round> {
+
+  using result_type = Array<uint8_t, 1>;
+  using source_type = Array<float, 1>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts the single float in `source` to uint8_t.
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & source) {
+    result_type converted;
+    NumericConverter<uint8_t, float, Round> scalar_converter;
+    converted[0] = scalar_converter(source[0]);
+    return converted;
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Two-stage float -> integer array conversion.
+///
+/// To convert FP32 to an integer type narrower than 32 bits, the values are
+/// first converted to int32_t and then narrowed to the destination type `T`.
+/// `T` must be an integer type (enforced at compile time).
+template <
+  typename T,
+  int N,
+  FloatRoundStyle Round
+>
+struct NumericArrayFP32ToIntConverter {
+
+  using result_type = Array<T, N>;
+  using source_type = Array<float, N>;
+  static FloatRoundStyle const round_style = Round;
+
+  static_assert(cutlass::platform::numeric_limits<T>::is_integer, "the dest type has to be int.");
+
+  /// Converts every float in `source` to `T` via an int32_t intermediate.
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & source) {
+    // Stage 1: float -> int32_t
+    NumericArrayConverter<int32_t, float, N, Round> to_int32;
+    Array<int32_t, N> as_int32;
+    as_int32 = to_int32(source);
+
+    // Stage 2: int32_t -> T
+    NumericArrayConverter<T, int32_t, N, Round> to_destination;
+    return to_destination(as_int32);
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+
+/// Partial specialization for Array<int8_t, N> <= Array<float, N>
+///
+/// Delegates to NumericArrayFP32ToIntConverter, which converts through an
+/// int32_t intermediate.
+template <
+  int N,
+  FloatRoundStyle Round
+>
+struct NumericArrayConverter<int8_t, float, N, Round> {
+
+  using result_type = Array<int8_t, N>;
+  using source_type = Array<float, N>;
+  // Exposed for parity with the other NumericArrayConverter specializations,
+  // so generic code can read Converter::round_style uniformly.
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts every float in `source` to int8_t.
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & source) {
+    NumericArrayFP32ToIntConverter<int8_t, N, Round> converter;
+    return converter(source);
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<uint8_t, N> <= Array<float, N>
+///
+/// Delegates to NumericArrayFP32ToIntConverter, which converts through an
+/// int32_t intermediate.
+template <
+  int N,
+  FloatRoundStyle Round
+>
+struct NumericArrayConverter<uint8_t, float, N, Round> {
+
+  using result_type = Array<uint8_t, N>;
+  using source_type = Array<float, N>;
+  // Exposed for parity with the other NumericArrayConverter specializations,
+  // so generic code can read Converter::round_style uniformly.
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts every float in `source` to uint8_t.
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & source) {
+    NumericArrayFP32ToIntConverter<uint8_t, N, Round> converter;
+    return converter(source);
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<int4b_t, N> <= Array<float, N>
+///
+/// Delegates to NumericArrayFP32ToIntConverter, which converts through an
+/// int32_t intermediate.
+template <
+  int N,
+  FloatRoundStyle Round
+>
+struct NumericArrayConverter<int4b_t, float, N, Round> {
+
+  using result_type = Array<int4b_t, N>;
+  using source_type = Array<float, N>;
+  // Exposed for parity with the other NumericArrayConverter specializations,
+  // so generic code can read Converter::round_style uniformly.
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts every float in `source` to int4b_t.
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & source) {
+    NumericArrayFP32ToIntConverter<int4b_t, N, Round> converter;
+    return converter(source);
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<uint4b_t, N> <= Array<float, N>
+///
+/// Delegates to NumericArrayFP32ToIntConverter, which converts through an
+/// int32_t intermediate.
+template <
+  int N,
+  FloatRoundStyle Round
+>
+struct NumericArrayConverter<uint4b_t, float, N, Round> {
+
+  using result_type = Array<uint4b_t, N>;
+  using source_type = Array<float, N>;
+  // Exposed for parity with the other NumericArrayConverter specializations,
+  // so generic code can read Converter::round_style uniformly.
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts every float in `source` to uint4b_t.
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & source) {
+    NumericArrayFP32ToIntConverter<uint4b_t, N, Round> converter;
+    return converter(source);
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750) && \
+    ((__CUDACC_VER_MAJOR__ > 10) ||                     \
+     ((__CUDACC_VER_MAJOR__ >= 10) && (__CUDACC_VER_MINOR__ >= 2)))
+
+/// Partial specialization for Array<int4b_t, 8> <= Array<int, 8>
+///
+/// Packs eight 32-bit signed integers into eight signed 4-bit values held in a
+/// single 32-bit word, using four `cvt.pack.sat.s4.s32.b32` instructions.
+/// This specialization only exists inside the enclosing
+/// `__CUDA_ARCH__ >= 750` / CUDA >= 10.2 preprocessor guard.
+template <
+  FloatRoundStyle Round
+>
+struct NumericArrayConverter<int4b_t, int, 8, Round> {
+
+  using result_type = Array<int4b_t, 8>;
+  using source_type = Array<int, 8>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts all eight integers of `source`; `Round` is not consulted.
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & source) {
+
+    unsigned out;
+
+    // Each instruction converts two inputs and chains the previous partial
+    // result through r4, starting with source[7]/source[6] and finishing with
+    // source[1]/source[0].
+    // NOTE(review): the `.sat` modifier presumably clamps to the 4-bit range,
+    // and this order should leave source[0] in the least-significant nibble --
+    // confirm against the PTX ISA description of cvt.pack.
+    asm volatile(
+        "{ .reg .u32 r4;"
+        "cvt.pack.sat.s4.s32.b32   r4, %8, %7, 0;"
+        "cvt.pack.sat.s4.s32.b32   r4, %6, %5, r4;"
+        "cvt.pack.sat.s4.s32.b32   r4, %4, %3, r4;"
+        "cvt.pack.sat.s4.s32.b32   %0, %2, %1, r4;"
+        "}"
+        : "=r"(out)
+        : "r"(source[0]), "r"(source[1]), "r"(source[2]), "r"(source[3]),
+          "r"(source[4]), "r"(source[5]), "r"(source[6]), "r"(source[7]));
+
+    // The packed word is reinterpreted as the eight-element result array.
+    return reinterpret_cast<result_type const &>(out);
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<int4b_t> <= Array<int>
+///
+/// Handles any N that is a multiple of 8 by viewing source and result as
+/// consecutive 8-element arrays and running the 8-wide specialization on
+/// each of them.
+template <
+  int N,
+  FloatRoundStyle Round
+>
+struct NumericArrayConverter<int4b_t, int, N, Round> {
+  static_assert(!(N % 8), "N must be multiple of 8.");
+
+  using result_type = Array<int4b_t, N>;
+  using source_type = Array<int, N>;
+  static FloatRoundStyle const round_style = Round;
+
+  /// Converts every integer in `source` to int4b_t, eight at a time.
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & source) {
+    using result_octet = Array<int4b_t, 8>;
+    using source_octet = Array<int, 8>;
+
+    result_type packed;
+
+    source_octet const *in_octets = reinterpret_cast<source_octet const *>(&source);
+    result_octet *out_octets = reinterpret_cast<result_octet *>(&packed);
+
+    NumericArrayConverter<int4b_t, int, 8, Round> octet_converter;
+    int const octet_count = N / 8;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int octet = 0; octet < octet_count; ++octet) {
+      out_octets[octet] = octet_converter(in_octets[octet]);
+    }
+
+    return packed;
+  }
+
+  /// Function-call form; forwards to convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<uint4b_t, 8> <= Array<int, 8>
+template <
+  FloatRoundStyle Round
+>
+struct NumericArrayConverter<uint4b_t, int, 8, Round> {
+
+  using result_type = Array<uint4b_t, 8>;
+  using source_type = Array<int, 8>;
+  static FloatRoundStyle const round_style = Round;
+
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & source) {
+    // Narrows eight 32-bit ints to unsigned 4-bit with saturation (.sat.u4.s32) and packs them into `out`.
+    unsigned out;
+    // Four cvt.pack steps, fed from source[7],source[6] down to source[1],source[0], chained through r4.
+    asm volatile(
+        "{ .reg .u32 r4;"
+        "cvt.pack.sat.u4.s32.b32   r4, %8, %7, 0;"
+        "cvt.pack.sat.u4.s32.b32   r4, %6, %5, r4;"
+        "cvt.pack.sat.u4.s32.b32   r4, %4, %3, r4;"
+        "cvt.pack.sat.u4.s32.b32   %0, %2, %1, r4;"
+        "}"
+        : "=r"(out)
+        : "r"(source[0]), "r"(source[1]), "r"(source[2]), "r"(source[3]),
+          "r"(source[4]), "r"(source[5]), "r"(source[6]), "r"(source[7]));
+    // NOTE(review): presumably source[0] ends up in the lowest nibble of `out` - confirm against the PTX ISA.
+    return reinterpret_cast<result_type const &>(out);
+  }
+
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<uint4b_t, N> <= Array<int, N>, N a multiple of 8.
+/// The input is walked in groups of eight and each group is handed to the
+/// 8-wide NumericArrayConverter<uint4b_t, int, 8, Round> specialization.
+template <int N, FloatRoundStyle Round>
+struct NumericArrayConverter<uint4b_t, int, N, Round> {
+  static_assert(!(N % 8), "N must be multiple of 8.");
+
+  using result_type = Array<uint4b_t, N>;
+  using source_type = Array<int, N>;
+  static FloatRoundStyle const round_style = Round;
+
+  CUTLASS_HOST_DEVICE
+  static result_type convert(source_type const & source) {
+    using PackedResult = Array<uint4b_t, 8>;
+    using PackedSource = Array<int, 8>;
+    using PackedConverter = NumericArrayConverter<uint4b_t, int, 8, Round>;
+
+    result_type converted;
+
+    // View both arrays as sequences of 8-element chunks.
+    auto *dst_chunks = reinterpret_cast<PackedResult *>(&converted);
+    auto const *src_chunks = reinterpret_cast<PackedSource const *>(&source);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int chunk = 0; chunk < N / 8; ++chunk) {
+      dst_chunks[chunk] = PackedConverter::convert(src_chunks[chunk]);
+    }
+
+    return converted;
+  }
+
+  /// Functor form; forwards to the static convert().
+  CUTLASS_HOST_DEVICE
+  result_type operator()(source_type const &s) const { return convert(s); }
+};
+
+#endif  // Conditional guards to enable partial specialization for packed integers
+
+namespace detail {
+
+  /*
+      A helper class that can vectorize a numeric converter with implementation for several vector widths.
+
+      The vector widths must be given in decreasing order of width, and must be a power of 2.
+
+      The vector converters must produce identical results to the scalar converters for consistency.
+    */
+  class VectorizedConverter {
+  private:
+    // Base case to handle remainder elements as scalars.
+    template <int Offset, size_t ParentWidth, typename ArrayConverter>
+    CUTLASS_DEVICE
+    static void convert_helper(
+      typename ArrayConverter::result_type& result,
+      typename ArrayConverter::source_type const& source) {
+
+      using ElementSrc = typename ArrayConverter::source_type::Element;
+      // If no more converters, handle the remaining elements as scalars.
+      constexpr int total_elements = ArrayConverter::result_type::kElements;
+      constexpr int remainder = total_elements - Offset;
+      static_assert(remainder == (total_elements % ParentWidth), "Unexpected remainder.");
+
+      typename ArrayConverter::ScalarConverter scalar_converter;
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = Offset; i < ArrayConverter::result_type::kElements; ++i) {
+        result[i] = scalar_converter(ElementSrc(source[i]));
+      }
+    }
+
+    // Recursive case: convert as many ResultVectorArray-wide groups as fit from Offset, then pass the tail to the next pair.
+    template <int Offset, size_t ParentWidth, typename ArrayConverter, typename ResultVectorArray, typename SourceVectorArray, typename... OtherVectorArrays>
+    CUTLASS_DEVICE
+    static void convert_helper(typename ArrayConverter::result_type& result, typename ArrayConverter::source_type const& source) {
+      static_assert(sizeof...(OtherVectorArrays) % 2 == 0, "Vector converters must come in {dst, src} pairs");
+      static_assert(ResultVectorArray::kElements == SourceVectorArray::kElements, "Vector converters must have the same vector width");
+      static_assert(cutlass::platform::is_same<typename ArrayConverter::result_type::Element, typename ResultVectorArray::Element>::value,
+        "ResultVectorArray must have the same type ArrayConverter::result_type");
+      static_assert(cutlass::platform::is_same<typename ArrayConverter::source_type::Element, typename SourceVectorArray::Element>::value,
+        "SourceVectorArray must have the same type ArrayConverter::source_type");
+      static_assert(Offset >= 0 && Offset <= ArrayConverter::result_type::kElements, "Offset must be between 0 and N");
+
+      static_assert(ParentWidth == 0 || ParentWidth > ResultVectorArray::kElements, "Vector arrays must be given in decreasing order of width");
+
+      constexpr int vector_width = ResultVectorArray::kElements;
+      static_assert(ispow2(vector_width), "Vector width must be a power of 2");
+
+      using ElementRes = typename ArrayConverter::result_type::Element;
+      using ElementSrc = typename ArrayConverter::source_type::Element;
+
+      constexpr int vector_bits_res = vector_width * cutlass::sizeof_bits<ElementRes>::value;
+      constexpr int vector_bits_src = vector_width * cutlass::sizeof_bits<ElementSrc>::value;
+
+      static_assert(vector_bits_res % 8 == 0, "Result vector type must be byte addressed.");
+      static_assert(vector_bits_src % 8 == 0, "Source vector type must be byte addressed.");
+      // Widths are decreasing powers of 2, so Offset is a multiple of vector_width and this division is exact.
+      constexpr int vector_offset = Offset / vector_width;
+      ResultVectorArray* packed_result_vec = reinterpret_cast<ResultVectorArray*>(&result) + vector_offset;
+      SourceVectorArray const* packed_source_vec = reinterpret_cast<SourceVectorArray const*>(&source) + vector_offset;
+
+      // Convert the remaining elements as vectors.
+      constexpr int total_elements = ArrayConverter::result_type::kElements;
+      constexpr int groups_of_vec = (total_elements - Offset) / vector_width;
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < groups_of_vec; ++i) {
+        packed_result_vec[i] = ArrayConverter::template packed_convert<ResultVectorArray, SourceVectorArray>(packed_source_vec[i]);
+      }
+
+      constexpr int new_offset = Offset + vector_width * groups_of_vec;
+      // Recurse to handle other vector converters, or the scalar base case.
+      convert_helper<new_offset, ResultVectorArray::kElements, ArrayConverter, OtherVectorArrays...>(result, source);
+    }
+
+  public:
+    /*
+        A method to convert vectors of elements using the packed_convert method of the converter.
+
+        Converters using this class must implement packed convert and support 1 or more vector conversions.
+      */
+    template <typename ArrayConverter, typename ResultVectorArray, typename SourceVectorArray, typename... OtherVectorArrays>
+    CUTLASS_DEVICE
+    static void convert(typename ArrayConverter::result_type& result, typename ArrayConverter::source_type const& source) {
+      convert_helper<0, 0, ArrayConverter, ResultVectorArray, SourceVectorArray, OtherVectorArrays...>(result, source);
+    }
+  };
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__CUDA_ARCH__)
+/// Partial specialization for Array<int8_t, 8> <= Array<int4b_t, 8>
+template <
+  FloatRoundStyle Round
+>
+struct NumericArrayConverter<int8_t, int4b_t, 8, Round> {
+
+  using result_type = Array<int8_t, 8>;
+  using source_type = Array<int4b_t, 8>;
+  static FloatRoundStyle const round_style = Round;
+
+  CUTLASS_DEVICE
+  static result_type convert(source_type const & source) {
+    // The eight packed 4-bit inputs are read as one 32-bit word; the eight int8 results fill two 32-bit words.
+    unsigned const& storage = reinterpret_cast<unsigned const &>(source);
+    unsigned out[2];
+    // In the asm comments xN looks like source nibble N, sN its replicated sign and yN result byte N (inferred from the comments).
+    asm volatile(
+        "{\n"
+        "  .reg .u32 tmp0, tmp1, tmp2;\n"
+        "  shl.b32 tmp0, %2, 4;\n"                // tmp0 = x1x2x3x4x5x6x7__
+        "  and.b32 tmp0, tmp0, 0xf0f0f0f0;\n"     // tmp0 = x1__x3__x5__x7__
+        "  prmt.b32 tmp1, tmp0, tmp0, 0xba98;\n"  // tmp1 = s1s3s5s7
+        "  and.b32 tmp1, tmp1, 0xf0f0f0f0;\n"     // tmp1 = s1__s3__s5__s7__
+        "  shr.u32 tmp0, tmp0, 4;\n"              // tmp0 = __x1__x3__x5__x7
+        "  or.b32 tmp2, tmp0, tmp1;\n"            // tmp2 = y1y3y5y7
+        "  and.b32 tmp0, %2, 0xf0f0f0f0;\n"       // tmp0 = x0__x2__x4__x6__
+        "  prmt.b32 tmp1, tmp0, tmp0, 0xba98;\n"  // tmp1 = s0s2s4s6
+        "  and.b32 tmp1, tmp1, 0xf0f0f0f0;\n"     // tmp1 = s0__s2__s4__s6__
+        "  shr.u32 tmp0, tmp0, 4;\n"              // tmp0 = __x0__x2__x4__x6
+        "  or.b32 tmp0, tmp0, tmp1;\n"            // tmp0 = y0y2y4y6
+        "  prmt.b32 %0, tmp2, tmp0, 0x5140;\n"    // %0 = y0y1y2y3
+        "  prmt.b32 %1, tmp2, tmp0, 0x7362;\n"    // %1 = y4y5y6y7
+        "}\n"
+        : "=r"(out[0]), "=r"(out[1])
+        : "r"(storage));
+    // View the two output words as the Array<int8_t, 8> result.
+    return reinterpret_cast<result_type const &>(out);
+  }
+
+  CUTLASS_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<int8_t, N> <= Array<int4b_t, N>, N a multiple of 8.
+/// The input is walked in groups of eight and each group is handed to the
+/// 8-wide NumericArrayConverter<int8_t, int4b_t, 8, Round> specialization.
+template <int N, FloatRoundStyle Round>
+struct NumericArrayConverter<int8_t, int4b_t, N, Round> {
+  static_assert(!(N % 8), "N must be multiple of 8.");
+
+  using result_type = Array<int8_t, N>;
+  using source_type = Array<int4b_t, N>;
+  static FloatRoundStyle const round_style = Round;
+
+  CUTLASS_DEVICE
+  static result_type convert(source_type const & source) {
+    using PackedResult = Array<int8_t, 8>;
+    using PackedSource = Array<int4b_t, 8>;
+    using PackedConverter = NumericArrayConverter<int8_t, int4b_t, 8, Round>;
+
+    result_type converted;
+
+    // View both arrays as sequences of 8-element chunks.
+    auto *dst_chunks = reinterpret_cast<PackedResult *>(&converted);
+    auto const *src_chunks = reinterpret_cast<PackedSource const *>(&source);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int chunk = 0; chunk < N / 8; ++chunk) {
+      dst_chunks[chunk] = PackedConverter::convert(src_chunks[chunk]);
+    }
+
+    return converted;
+  }
+
+  /// Functor form; forwards to the static convert().
+  CUTLASS_DEVICE
+  result_type operator()(source_type const &s) const { return convert(s); }
+};
+#endif // defined(__CUDA_ARCH__)
+
+/// Partial specialization for Array<cutlass::float_e4m3_t, N> <= Array<cutlass::int4b_t, N>
+template <FloatRoundStyle Round, int N>
+struct NumericArrayConverter<cutlass::float_e4m3_t, cutlass::int4b_t, N, Round> {
+  using result_type = Array<cutlass::float_e4m3_t, N>;
+  using source_type = Array<cutlass::int4b_t, N>;
+
+  static FloatRoundStyle const round_style = Round;
+
+private:
+  using result_type_packed_8 = Array<cutlass::float_e4m3_t, 8>;
+  using result_type_packed_4 = Array<cutlass::float_e4m3_t, 4>;
+  using source_type_packed_8 = Array<cutlass::int4b_t, 8>;
+  using source_type_packed_4 = Array<cutlass::int4b_t, 4>;
+
+  using ScalarConverter = NumericConverter<cutlass::float_e4m3_t, cutlass::int4b_t, Round>;
+
+  CUTLASS_DEVICE
+  static uint32_t to_reg(source_type_packed_4 const& source) {
+    return static_cast<uint32_t>(
+      reinterpret_cast<const uint16_t&>(source));
+  }
+
+  CUTLASS_DEVICE
+  static uint32_t to_reg(source_type_packed_8 const& source) {
+    return reinterpret_cast<const uint32_t&>(source);
+  }
+
+  // The core converter uses a lookup table to convert i4 -> e4m3.
+  template <typename PackedResultType, typename PackedSrcType>
+  CUTLASS_DEVICE
+  static PackedResultType packed_convert(PackedSrcType const &source) {
+    // Each 4-bit input is split into a 3-bit LUT index and a sign bit; both tables are looked up and the sign picks one.
+    static_assert((platform::is_same<PackedSrcType, source_type_packed_4>::value &&
+                   platform::is_same<PackedResultType, result_type_packed_4>::value) ||
+                  (platform::is_same<PackedSrcType, source_type_packed_8>::value &&
+                   platform::is_same<PackedResultType, result_type_packed_8>::value),
+                  "Invalid PackedSrcType/PackedResultType must be 4 or 8 to use private convert dispatch.");
+
+    // Hold FP8 outputs in reg. We need 1 reg for every 4 outputs.
+    cutlass::AlignedArray<uint32_t, PackedResultType::kElements / 4, sizeof(PackedResultType)> r;
+
+    // View the input as reg
+    uint32_t reg = to_reg(source);
+
+    // Moves each nibble's sign bit (bit 3) to bit 2; it later selects the negative or positive candidate
+    uint32_t sign = (reg & 0x88888888) >> 1;
+
+    // Ignore sign bit when indexing into LUT
+    uint32_t lut_idx = (reg & 0x77777777);
+
+    // Signed is OR'd with 0x32103210 to find the correct value in the LUT
+    const uint32_t final_prmt_base = 0x32103210;
+
+    // [0, 1, 2, 3] encoded as FP8
+    static constexpr uint32_t POS_E4M3s_REG1 = 0x44403800;
+    // [4, 5, 6, 7] encoded as FP8
+    static constexpr uint32_t POS_E4M3s_REG2 = 0x4E4C4A48;
+    // [-8, -7, -6, -5] encoded as FP8
+    static constexpr uint32_t NEG_E4M3s_REG1 = 0xCACCCED0;
+    // [-4, -3, -2, -1] encoded as FP8
+    static constexpr uint32_t NEG_E4M3s_REG2 = 0xB8C0C4C8;
+
+    // One pass per group of four outputs; lut_idx and sign advance by 16 bits (four nibbles) per pass.
+    const int iters = PackedSrcType::kElements / 4;
+    CUTLASS_PRAGMA_UNROLL
+    for (int ii = 0; ii < iters; ++ii, lut_idx >>=16, sign >>=16) {
+      uint32_t final_prmt_idx = final_prmt_base | sign;
+
+      // This uses a look up table to convert packed int4s to packed fp8s, using the int4 value
+      // as the index to prmt.
+      // It first selects both the positive and negative candidates, then uses the sign bit to
+      // select the correct candidate.
+      asm volatile(
+          "{\n"
+          "  .reg .b32 pos_f8s, neg_f8s;\n"
+          "  prmt.b32 pos_f8s, %1, %2, %5;\n"
+          "  prmt.b32 neg_f8s, %3, %4, %5;\n"
+          "  prmt.b32 %0, pos_f8s, neg_f8s, %6;\n"
+          "}\n"
+          : "=r"(r[ii])
+          : "n"(POS_E4M3s_REG1), "n"(POS_E4M3s_REG2), "n"(NEG_E4M3s_REG1), "n"(NEG_E4M3s_REG2),
+            "r"(lut_idx), "r"(final_prmt_idx));
+    }
+    return reinterpret_cast<PackedResultType&>(r);
+  }
+
+  friend class detail::VectorizedConverter;
+
+public:
+  CUTLASS_DEVICE
+  static result_type convert(source_type const &source) {
+    result_type result;
+    using ConverterType = NumericArrayConverter<typename result_type::Element, typename source_type::Element, N, Round>;
+    detail::VectorizedConverter::convert<ConverterType,
+                                         result_type_packed_8, source_type_packed_8,
+                                         result_type_packed_4, source_type_packed_4>(result, source);
+
+    return result;
+  }
+
+  /// Functor form; forwards to the static convert().
+  CUTLASS_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<float, N> <= Array<cutlass::int4b_t, N>
+template <FloatRoundStyle Round, int N>
+struct NumericArrayConverter<float, cutlass::int4b_t, N, Round> {
+  using result_type = Array<float, N>;
+  using source_type = Array<cutlass::int4b_t, N>;
+
+  static FloatRoundStyle const round_style = Round;
+
+private:
+  using result_type_packed_8 = Array<float, 8>;
+  using result_type_packed_4 = Array<float, 4>;
+  using result_type_packed_2 = Array<float, 2>;
+  using source_type_packed_8 = Array<cutlass::int4b_t, 8>;
+  using source_type_packed_4 = Array<cutlass::int4b_t, 4>;
+  using source_type_packed_2 = Array<cutlass::int4b_t, 2>;
+
+  using ScalarConverter = NumericConverter<float, cutlass::int4b_t, Round>;
+
+  CUTLASS_DEVICE
+  static uint32_t to_reg(source_type_packed_2 const& source) {
+    return static_cast<uint32_t>(
+      reinterpret_cast<const uint8_t&>(source));
+  }
+
+  CUTLASS_DEVICE
+  static uint32_t to_reg(source_type_packed_4 const& source) {
+    return static_cast<uint32_t>(
+      reinterpret_cast<const uint16_t&>(source));
+  }
+
+  CUTLASS_DEVICE
+  static uint32_t to_reg(source_type_packed_8 const& source) {
+    return reinterpret_cast<const uint32_t&>(source);
+  }
+
+  template <int offset, int elements_to_convert, typename PackedResultType>
+  CUTLASS_DEVICE
+  static void packed_convert_vec(PackedResultType& result, uint32_t src_reg) {
+    static_assert(offset == 0 || offset == 4, "Invalid offset");
+    // Selects one of the bottom int4s and constructs (8388608 = 2^23, bit pattern 0x4B000000):
+    // 8388608 + (x + 8)
+    // 8388608 + 16 * (x + 8)
+    // 8388608 + 256 * (x + 8)
+    // 8388608 + 4096 * (x + 8)
+    uint32_t const and_masks[4] = {0x0000000F, 0x000000F0, 0x00000F00, 0x0000F000};
+    uint32_t const xor_masks[4] = {0x4B000008, 0x4B000080, 0x4B000800, 0x4B008000};
+    // scales[i] undoes the 16^i factor; offsets[i] then removes the remaining bias of 2^23 / 16^i + 8.
+    float const scales[4] = {1.f, 1.f / 16.f, 1.f / 256.f, 1.f / 4096.f};
+    float const offsets[4] = {-8388616.f, -524296.f, -32776.f, -2056.f};
+
+    static constexpr uint32_t immLut = (0xf0 & 0xcc) ^ 0xaa;
+
+    uint32_t* result_as_int = reinterpret_cast<uint32_t*>(&result);
+
+    // For each output element, computes:
+    // result_as_int[offset + i] = (src_reg & and_masks[i]) ^ xor_masks[i], then rescales it with one FMA
+    CUTLASS_PRAGMA_UNROLL
+    for (int ii = 0; ii < elements_to_convert; ++ii) {
+      asm volatile(
+          "{\n"
+          "  lop3.b32 %0, %1, %2, %3, %4;\n"
+          "}\n"
+          : "=r"(result_as_int[offset + ii])
+          : "r"(src_reg), "r"(and_masks[ii]), "r"(xor_masks[ii]), "n"(immLut));
+
+      result[offset + ii] = __fmaf_rn(result[offset + ii], scales[ii], offsets[ii]);
+    }
+  }
+
+  // The core converter uses bit tricks to construct a known FP32 number, then uses an
+  // FP32 fused multiply-add (scale and offset) for the final result.
+  template <typename PackedResultType, typename PackedSrcType>
+  CUTLASS_DEVICE
+  static PackedResultType packed_convert(PackedSrcType const &source) {
+
+    static_assert((platform::is_same<PackedSrcType, source_type_packed_2>::value &&
+                   platform::is_same<PackedResultType, result_type_packed_2>::value) ||
+                  (platform::is_same<PackedSrcType, source_type_packed_4>::value &&
+                   platform::is_same<PackedResultType, result_type_packed_4>::value) ||
+                  (platform::is_same<PackedSrcType, source_type_packed_8>::value &&
+                   platform::is_same<PackedResultType, result_type_packed_8>::value),
+                  "Invalid PackedSrcType/PackedResultType must be 2, 4 or 8 to use private convert dispatch.");
+
+    // Holds the FP32 outputs, one 32-bit value per element
+    PackedResultType r;
+
+    // View the input as reg
+    uint32_t src_reg = to_reg(source);
+    constexpr int total_elements = PackedResultType::kElements == 8 ? 4 : PackedResultType::kElements;
+    packed_convert_vec<0, total_elements>(r, src_reg);
+
+    // An 8-wide pack converts its upper four nibbles (bits 16..31 of src_reg) in a second pass.
+    if (PackedResultType::kElements == 8) {
+      uint32_t src_reg_shifted = src_reg >> 16;
+      packed_convert_vec<4, 4>(r, src_reg_shifted);
+    }
+    return r;
+  }
+
+  friend class detail::VectorizedConverter;
+
+public:
+  CUTLASS_DEVICE
+  static result_type convert(source_type const &source) {
+    result_type result;
+    using ConverterType = NumericArrayConverter<typename result_type::Element, typename source_type::Element, N, Round>;
+    detail::VectorizedConverter::convert<ConverterType,
+                                         result_type_packed_8, source_type_packed_8,
+                                         result_type_packed_4, source_type_packed_4,
+                                         result_type_packed_2, source_type_packed_2>(result, source);
+
+    return result;
+  }
+
+  CUTLASS_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<float, N> <= Array<int8_t, N>
+template <FloatRoundStyle Round, int N>
+struct NumericArrayConverter<float, int8_t, N, Round> {
+  using result_type = Array<float, N>;
+  using source_type = Array<int8_t, N>;
+  static FloatRoundStyle const round_style = Round;
+
+private:
+  using result_type_packed_4 = Array<float, 4>;
+  using result_type_packed_2 = Array<float, 2>;
+  using source_type_packed_4 = Array<int8_t, 4>;
+  using source_type_packed_2 = Array<int8_t, 2>;
+
+  using ScalarConverter = NumericConverter<float, int8_t, Round>;
+
+  CUTLASS_DEVICE
+  static uint32_t to_reg(source_type_packed_2 const& source) {
+    return static_cast<uint32_t>(
+      reinterpret_cast<const uint16_t&>(source));
+  }
+
+  CUTLASS_DEVICE
+  static uint32_t to_reg(source_type_packed_4 const& source) {
+    return reinterpret_cast<const uint32_t&>(source);
+  }
+
+  CUTLASS_DEVICE
+  static int32_t to_int32(source_type_packed_2 const& source) {
+    return static_cast<int32_t>(reinterpret_cast<const int16_t&>(source));
+  }
+
+  CUTLASS_DEVICE
+  static int32_t to_int32(source_type_packed_4 const& source) {
+    return reinterpret_cast<const int32_t&>(source);
+  }
+
+  template <typename PackedResultType, typename PackedSrcType>
+  CUTLASS_DEVICE
+  static PackedResultType packed_convert(PackedSrcType const &source) {
+    // Two code paths: prmt plus a float bias trick when __CUDA_ARCH__ is defined and <= 800, __dp4a otherwise.
+    static_assert((platform::is_same<PackedSrcType, source_type_packed_2>::value &&
+                   platform::is_same<PackedResultType, result_type_packed_2>::value) ||
+                  (platform::is_same<PackedSrcType, source_type_packed_4>::value &&
+                   platform::is_same<PackedResultType, result_type_packed_4>::value),
+                  "Invalid PackedSrcType/PackedResultType must be 2 or 4 to use private convert dispatch.");
+
+    PackedResultType r;
+  #if defined __CUDA_ARCH__ && __CUDA_ARCH__ <= 800
+    // View the input as reg
+    uint32_t src_reg = to_reg(source);
+    static constexpr int fp32_base = 0x4B400000;
+    uint32_t const prmt_indices[4] = {0x8880, 0x9991, 0xAAA2, 0xBBB3};
+    // NOTE(review): selector nibbles 0x8..0xB presumably replicate the sign of byte ii (sign extension) - confirm in PTX ISA.
+    int* result_as_int = reinterpret_cast<int*>(&r);
+    CUTLASS_PRAGMA_UNROLL
+    for (int ii = 0; ii < PackedResultType::kElements; ++ii) {
+      asm volatile("prmt.b32 %0,%1,%1,%2;\n" : "=r"(result_as_int[ii]) : "r"(src_reg), "r"(prmt_indices[ii]));
+    }
+    // Bias trick: add the bits of fp32_base as an integer, then subtract fp32_base reinterpreted as a float.
+    CUTLASS_PRAGMA_UNROLL
+    for (int ii = 0; ii < PackedResultType::kElements; ++ii)
+    {
+      result_as_int[ii] += fp32_base;
+      r[ii] -= reinterpret_cast<const float&>(fp32_base);
+    }
+  #else
+    int32_t x = to_int32(source);
+    int32_t t[4];
+    constexpr int32_t mask[4] = {0x00000001, 0x00000100, 0x00010000, 0x01000000};
+    // mask[ii] has a single byte set to 1, so the dot product presumably yields signed byte ii of x - confirm __dp4a semantics.
+    CUTLASS_PRAGMA_UNROLL
+    for (int ii = 0; ii < PackedResultType::kElements; ++ii) {
+      t[ii] = __dp4a(x, mask[ii], 0);
+      r[ii] = static_cast<float>(t[ii]);
+    }
+  #endif
+
+    return r;
+  }
+
+  friend class detail::VectorizedConverter;
+
+public:
+  CUTLASS_DEVICE
+  static result_type convert(source_type const &source) {
+    result_type result;
+
+    using ConverterType = NumericArrayConverter<typename result_type::Element, typename source_type::Element, N, Round>;
+    detail::VectorizedConverter::convert<ConverterType,
+                                         result_type_packed_4, source_type_packed_4,
+                                         result_type_packed_2, source_type_packed_2>(result, source);
+
+    return result;
+  }
+
+  CUTLASS_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<float, N> <= Array<uint8_t, N>
+template <FloatRoundStyle Round, int N>
+struct NumericArrayConverter<float, uint8_t, N, Round> {
+  using result_type = Array<float, N>;
+  using source_type = Array<uint8_t, N>;
+  static FloatRoundStyle const round_style = Round;
+
+private:
+  using result_type_packed_4 = Array<float, 4>;
+  using result_type_packed_2 = Array<float, 2>;
+  using source_type_packed_4 = Array<uint8_t, 4>;
+  using source_type_packed_2 = Array<uint8_t, 2>;
+
+  using ScalarConverter = NumericConverter<float, uint8_t, Round>;
+
+  CUTLASS_DEVICE
+  static uint32_t to_reg(source_type_packed_2 const& source) {
+    return static_cast<uint32_t>(
+      reinterpret_cast<const uint16_t&>(source));
+  }
+
+  CUTLASS_DEVICE
+  static uint32_t to_reg(source_type_packed_4 const& source) {
+    return reinterpret_cast<const uint32_t&>(source);
+  }
+
+  template <typename PackedResultType, typename PackedSrcType>
+  CUTLASS_DEVICE
+  static PackedResultType packed_convert(PackedSrcType const &source) {
+    // Converts 2 or 4 packed uint8 values to float with one byte permute and one float subtraction each.
+    static_assert((platform::is_same<PackedSrcType, source_type_packed_2>::value &&
+                   platform::is_same<PackedResultType, result_type_packed_2>::value) ||
+                  (platform::is_same<PackedSrcType, source_type_packed_4>::value &&
+                   platform::is_same<PackedResultType, result_type_packed_4>::value),
+                  "Invalid PackedSrcType/PackedResultType must be 2 or 4 to use private convert dispatch.");
+
+    PackedResultType r;
+    // View the input as reg
+    uint32_t src_reg = to_reg(source);
+
+    // __byte_perm simulates the add.u32 0x4B000000 to every u8 element of u8x4 source and stores
+    // the result in r (without introducing extra cvt.u32.u8 instruction)
+    uint32_t const prmt_indices[4] = {0x7650, 0x7651, 0x7652, 0x7653};
+    uint32_t* result_as_int = reinterpret_cast<uint32_t*>(&r);
+    CUTLASS_PRAGMA_UNROLL
+    for (int ii = 0; ii < PackedResultType::kElements; ++ii) {
+      result_as_int[ii] = __byte_perm(src_reg, 0x4B000000, prmt_indices[ii]);
+      // Subtract the magic number 0x4B000000 (8388608.f) in floating-point arithmetic to obtain final result
+      r[ii] -= 8388608.f;
+    }
+    return r;
+  }
+
+  friend class detail::VectorizedConverter;
+
+public:
+  CUTLASS_DEVICE
+  static result_type convert(source_type const &source) {
+    result_type result;
+    using ConverterType = NumericArrayConverter<typename result_type::Element, typename source_type::Element, N, Round>;
+    detail::VectorizedConverter::convert<ConverterType,
+                                         result_type_packed_4, source_type_packed_4,
+                                         result_type_packed_2, source_type_packed_2>(result, source);
+
+    return result;
+  }
+
+  CUTLASS_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Partial specialization for Array<cutlass::half_t, N> <= Array<cutlass::int4b_t, N>
+template <FloatRoundStyle Round, int N>
+struct NumericArrayConverter<cutlass::half_t, cutlass::int4b_t, N, Round> {
+  using result_type = Array<cutlass::half_t, N>;
+  using source_type = Array<cutlass::int4b_t, N>;
+
+  static FloatRoundStyle const round_style = Round;
+
+private:
+  using result_type_packed_8 = Array<cutlass::half_t, 8>;
+  using result_type_packed_4 = Array<cutlass::half_t, 4>;
+  using result_type_packed_2 = Array<cutlass::half_t, 2>;
+  using source_type_packed_8 = Array<cutlass::int4b_t, 8>;
+  using source_type_packed_4 = Array<cutlass::int4b_t, 4>;
+  using source_type_packed_2 = Array<cutlass::int4b_t, 2>;
+
+  using ScalarConverter = NumericConverter<cutlass::half_t, cutlass::int4b_t, Round>;
+
+  CUTLASS_DEVICE
+  static uint32_t to_reg(source_type_packed_2 const& source) {
+    return static_cast<uint32_t>(
+      reinterpret_cast<const uint8_t&>(source));
+  }
+
+  CUTLASS_DEVICE
+  static uint32_t to_reg(source_type_packed_4 const& source) {
+    return static_cast<uint32_t>(
+      reinterpret_cast<const uint16_t&>(source));
+  }
+
+  CUTLASS_DEVICE
+  static uint32_t to_reg(source_type_packed_8 const& source) {
+    return reinterpret_cast<const uint32_t&>(source);
+  }
+
+  // The core converter uses bit tricks to construct a known FP16 number, then does a
+  // subtraction in FP16 for the final result.
+  template <typename PackedResultType, typename PackedSrcType>
+  CUTLASS_DEVICE
+  static PackedResultType packed_convert(PackedSrcType const &source) {
+
+    static_assert((platform::is_same<PackedSrcType, source_type_packed_2>::value &&
+                   platform::is_same<PackedResultType, result_type_packed_2>::value) ||
+                  (platform::is_same<PackedSrcType, source_type_packed_4>::value &&
+                   platform::is_same<PackedResultType, result_type_packed_4>::value) ||
+                  (platform::is_same<PackedSrcType, source_type_packed_8>::value &&
+                   platform::is_same<PackedResultType, result_type_packed_8>::value),
+                  "Invalid PackedSrcType/PackedResultType must be 2, 4 or 8 to use private convert dispatch.");
+
+    // Hold output FP16s in reg. We need 1 reg for every 2 elements
+    using RegArray = cutlass::AlignedArray<uint32_t, PackedResultType::kElements / 2, sizeof(PackedResultType)>;
+    RegArray r;
+
+    // View the input as reg
+    uint32_t src_reg = to_reg(source);
+
+    // Below constructs the following temporary:
+    // fp16s_01 = {0x00, i4_01, 0x00, i4_01}
+    // fp16s_23 = {0x00, i4_23, 0x00, i4_23}
+    // fp16s_45 = {0x00, i4_45, 0x00, i4_45}
+    // fp16s_67 = {0x00, i4_67, 0x00, i4_67}
+    // We use inline asm instead of __byte_perm intrinsic since we don't want the documented (& 0x7) on the index. NVCC
+    // might be able to optimize it out since the index is a constexpr, but we choose to be safe about it here.
+    uint32_t prmt_indices[4] = {0x4040, 0x4141, 0x4242, 0x4343};
+    static_assert(RegArray::kElements <= 4, "Too many inputs for F16 -> I4 vector converter");
+    CUTLASS_PRAGMA_UNROLL
+    for (int ii = 0; ii < RegArray::kElements; ++ii) {
+      asm volatile(
+          "{\n"
+          "  prmt.b32 %0, %1, %2, %3;\n"
+          "}\n"
+          : "=r"(r[ii])
+          : "r"(src_reg), "n"(0), "r"(prmt_indices[ii]));
+    }
+
+    // The below XOR does the following:
+    // 1) Sets the exponent bits of the FP16 to the correct value for the FP16 magic_num. We will be constructing
+    //    1024 + x + 8 OR 1024 + 16 * (x + 8), then using hfma to subtract 1032 from that
+    // 2) Adds 8 to the int4 value that we will process in the FP16 (for uint4, we can simply avoid this step)
+    // The AND does the following:
+    // 1) Clear the set bits for the int4 we will ignore.
+    // We use lop3 so that we can use 1 instruction for AND and XOR.
+    static constexpr uint32_t xor_mask = 0x64806408;
+    static constexpr uint32_t and_mask = 0xFFF0FF0F;
+    static constexpr uint32_t immLut = (0xf0 & 0xcc) ^ 0xaa;
+
+    // For each operand, computes:
+    // r[i] = (r[i] & and_mask) ^ xor_mask
+    CUTLASS_PRAGMA_UNROLL
+    for (int ii = 0; ii < RegArray::kElements; ++ii) {
+      asm volatile(
+          "{\n"
+          "  lop3.b32 %0, %0, %1, %2, %3;\n"
+          "}\n"
+          : "+r"(r[ii])
+          : "n"(and_mask), "n"(xor_mask), "n"(immLut));
+    }
+
+    // We will issue 2 hfmas that do the following:
+    // For the high FP16:
+    //  Divide by 16 {packed as a operand} to get:
+    //    64 + (x + 8)
+    //    x + 72
+    //  Subtract 72 {packed as c operand} to get x
+    // For the low FP16:
+    //    1024 + (x + 8)
+    //    x + 1032
+    // So, we subtract 1032 {packed as c operand} to get x
+
+    // {-72, -1032}
+    static constexpr uint32_t hfma_bias_rep = 0xD480E408;
+    // {1 / 16, 1}
+    static constexpr uint32_t hfma_scale_rep = 0x2C003C00;
+
+    const half2& hfma_bias = reinterpret_cast<const half2&>(hfma_bias_rep);
+    const half2& hfma_scale = reinterpret_cast<const half2&>(hfma_scale_rep);
+    // Scale and subtract the FP16s to get the original int4 number as FP16.
+    CUTLASS_PRAGMA_UNROLL
+    for (int ii = 0; ii < RegArray::kElements; ++ii) {
+      half2& fp16x2_val = reinterpret_cast<__half2&>(r[ii]);
+      fp16x2_val = __hfma2(hfma_scale, fp16x2_val, hfma_bias);
+    }
+    return reinterpret_cast<PackedResultType&>(r);
+  }
+
+  friend class detail::VectorizedConverter;
+
+public:
+  CUTLASS_DEVICE  // Whole-array conversion entry point for this specialization.
+  static result_type convert(source_type const &source) {
+    result_type result;
+    using ConverterType = NumericArrayConverter<typename result_type::Element, typename source_type::Element, N, Round>;
+    detail::VectorizedConverter::convert<ConverterType,  // shared driver; offered the 8-, 4- and 2-element packed types
+                                         result_type_packed_8, source_type_packed_8,
+                                         result_type_packed_4, source_type_packed_4,
+                                         result_type_packed_2, source_type_packed_2>(result, source);
+
+    return result;
+  }
+
+  CUTLASS_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);  // functor form simply forwards to the static convert()
+  }
+};
+
+/// Partial specialization for Array<cutlass::half_t, N> <= Array<int8_t, N>
+template <FloatRoundStyle Round, int N>
+struct NumericArrayConverter<cutlass::half_t, int8_t, N, Round> {
+  using result_type = Array<cutlass::half_t, N>;
+  using source_type = Array<int8_t, N>;
+  static FloatRoundStyle const round_style = Round;
+
+private:
+  using result_type_packed_4 = Array<cutlass::half_t, 4>;
+  using result_type_packed_2 = Array<cutlass::half_t, 2>;
+  using source_type_packed_4 = Array<int8_t, 4>;
+  using source_type_packed_2 = Array<int8_t, 2>;
+
+  using ScalarConverter = NumericConverter<cutlass::half_t, int8_t, Round>;
+  // to_reg: view the packed int8 source as raw bits in one 32-bit value (the 2-element form is widened from 16 bits).
+  CUTLASS_DEVICE
+  static uint32_t to_reg(source_type_packed_2 const& source) {
+    return static_cast<uint32_t>(
+      reinterpret_cast<const uint16_t&>(source));
+  }
+
+  CUTLASS_DEVICE
+  static uint32_t to_reg(source_type_packed_4 const& source) {
+    return reinterpret_cast<const uint32_t&>(source);
+  }
+
+  // The core converter uses bit tricks to construct a known FP16 number, then does a
+  // subtraction in FP16 for the final result.
+  template <typename PackedResultType, typename PackedSrcType>
+  CUTLASS_DEVICE
+  static PackedResultType packed_convert(PackedSrcType const &source) {
+
+    static_assert((platform::is_same<PackedSrcType, source_type_packed_2>::value &&
+                   platform::is_same<PackedResultType, result_type_packed_2>::value) ||
+                  (platform::is_same<PackedSrcType, source_type_packed_4>::value &&
+                   platform::is_same<PackedResultType, result_type_packed_4>::value),
+                  "Invalid PackedSrcType/PackedResultType must be 2 or 4 to use private convert dispatch.");
+
+    // Hold output FP16s in reg. We need 1 reg for every 2 elements
+    using RegArray = cutlass::AlignedArray<uint32_t, PackedResultType::kElements / 2, sizeof(PackedResultType)>;
+    RegArray r;
+
+    #if 0 // Scalar conversion (Please keep this code for reference for vectorized version below)
+    auto result = reinterpret_cast<PackedResultType&>(r);
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < PackedResultType::kElements; ++i) {
+      int16_t tmp = source[i] + 26112 /* 0x6600 */;
+      result[i] = reinterpret_cast<cutlass::half_t const &>(tmp) - 1536.0_hf;
+    }
+    #endif
+
+    // View the input as reg
+    uint32_t src_reg = to_reg(source);
+    uint32_t const prmt_indices[2] = {0x9180, 0xB3A2};  // selectors for r[0] (source bytes 0, 1) and r[1] (source bytes 2, 3)
+
+    // Pack s8x2 (s8[1], s8[0]) -> s16x2 (sext.s8[1], sext.s8[0])
+    // (See https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt)
+    // The inline ptx below uses `msb=0` and `msb=1` from the above link to sign-extend the sign bit in 0, 1, 2, 3 bytes of s8x4
+    // into r[0] and r[1]'s 08-15 and 24-31 bits, respectively.
+    // Note that `__byte_perm(src_reg, src_reg, 0x9180);` won't achieve the same result and doesn't sign-extend the sign bit.
+    // Thus, we use inline ptx `prmt.b32` instruction for the desired sign extend from s8x2 to s16x2.
+    for (int ii = 0; ii < RegArray::kElements; ++ii) {
+      asm volatile("prmt.b32 %0,%1,%1,%2;\n" : "=r"(r[ii]) : "r"(src_reg), "r"(prmt_indices[ii]));
+    }
+
+    // In the absence of add.s16x2 instruction, use bit-wise operation to execute signed addition with magic numbers to achieve
+    // the same result as add.s16x2 instruction.
+    // (See https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-lop3)
+    // For a logical operation F(a, b, c) the value of kImmLut can be computed by applying the same operation to
+    // three predefined constant values as follows:
+    //                                        ta = 0xF0;
+    //                                        tb = 0xCC;
+    //                                        tc = 0xAA;
+    //                                   kImmLut = F(ta, tb, tc);
+    // If we want F = ((a & b) ^ c) then set kImmLut = (0xF0 & 0xCC) ^ 0xAA
+    static constexpr uint32_t kImmLut = (0xF0 & 0xCC) ^ 0xAA;
+
+    for (int ii = 0; ii < RegArray::kElements; ++ii) {
+      // The bit-wise operation executed below is `r[ii] = (r[ii] & 0x03FF03FF) ^ 0x66006600;`
+      asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" :
+                                "=r"(r[ii]) : "r"(r[ii]), "n"(0x03FF03FF), "n"(0x66006600), "n"(kImmLut));
+    }
+
+    static constexpr uint32_t bias_rep = 0x66006600;  // FP16x2 {1536, 1536}: 0x6600 pairs with 1536.0_hf in the scalar reference above
+    const half2& bias = reinterpret_cast<const half2&>(bias_rep);
+    CUTLASS_PRAGMA_UNROLL
+    for (int ii = 0; ii < RegArray::kElements; ++ii) {
+      half2& fp16x2_val = reinterpret_cast<__half2&>(r[ii]);
+      fp16x2_val = __hsub2(fp16x2_val, bias);  // remove the 1536 bias from both halves
+    }
+    return reinterpret_cast<PackedResultType&>(r);
+  }
+
+  friend class detail::VectorizedConverter;
+
+public:
+  CUTLASS_DEVICE
+  static result_type convert(source_type const &source) {
+    result_type result;
+    // Hand the work to the shared vectorized driver, offering the 4- and 2-element packed types declared above.
+    using ConverterType = NumericArrayConverter<typename result_type::Element, typename source_type::Element, N, Round>;
+    detail::VectorizedConverter::convert<ConverterType,
+                                         result_type_packed_4, source_type_packed_4,
+                                         result_type_packed_2, source_type_packed_2>(result, source);
+    return result;
+  }
+
+  CUTLASS_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<cutlass::half_t, N> <= Array<uint8_t, N>
+template <FloatRoundStyle Round, int N>
+struct NumericArrayConverter<cutlass::half_t, uint8_t, N, Round> {
+  using result_type = Array<cutlass::half_t, N>;
+  using source_type = Array<uint8_t, N>;
+  static FloatRoundStyle const round_style = Round;
+
+private:
+  using result_type_packed_4 = Array<cutlass::half_t, 4>;
+  using result_type_packed_2 = Array<cutlass::half_t, 2>;
+  using source_type_packed_4 = Array<uint8_t, 4>;
+  using source_type_packed_2 = Array<uint8_t, 2>;
+
+  using ScalarConverter = NumericConverter<cutlass::half_t, uint8_t, Round>;
+  // to_reg: view the packed uint8 source as raw bits in one 32-bit value (the 2-element form is widened from 16 bits).
+  CUTLASS_DEVICE
+  static uint32_t to_reg(source_type_packed_2 const& source) {
+    return static_cast<uint32_t>(
+      reinterpret_cast<const uint16_t&>(source));
+  }
+
+  CUTLASS_DEVICE
+  static uint32_t to_reg(source_type_packed_4 const& source) {
+    return reinterpret_cast<const uint32_t&>(source);
+  }
+  // Core converter: pairs each source byte with a 0x64 high byte so every FP16 half reads as 1024 + u8, then subtracts 1024.
+  template <typename PackedResultType, typename PackedSrcType>
+  CUTLASS_DEVICE
+  static PackedResultType packed_convert(PackedSrcType const &source) {
+
+    static_assert((platform::is_same<PackedSrcType, source_type_packed_2>::value &&
+                   platform::is_same<PackedResultType, result_type_packed_2>::value) ||
+                  (platform::is_same<PackedSrcType, source_type_packed_4>::value &&
+                   platform::is_same<PackedResultType, result_type_packed_4>::value),
+                  "Invalid PackedSrcType/PackedResultType must be 2 or 4 to use private convert dispatch.");
+
+    // Hold output FP16s in reg. We need 1 reg for every 2 elements
+    using RegArray = cutlass::AlignedArray<uint32_t, PackedResultType::kElements / 2, sizeof(PackedResultType)>;
+    RegArray r;
+
+    // View the input as reg
+    uint32_t src_reg = to_reg(source);
+    uint32_t const prmt_indices[2] = {0x5150, 0x5352};  // selector 5 picks byte 1 of the second prmt operand; 0..3 pick source bytes
+    static constexpr uint32_t start_byte_for_fp16 = 0x64646464;  // every byte is 0x64, the high byte of bias_rep's 0x6400 below
+
+    for (int ii = 0; ii < RegArray::kElements; ++ii) {
+      asm volatile("prmt.b32 %0,%1,%2,%3;\n" : "=r"(r[ii]) : "r"(src_reg), "n"(start_byte_for_fp16), "r"(prmt_indices[ii]));
+    }
+
+    static constexpr uint32_t bias_rep = 0x64006400;  // FP16x2 {1024, 1024}
+    const half2& bias = reinterpret_cast<const half2&>(bias_rep);
+    CUTLASS_PRAGMA_UNROLL
+    for (int ii = 0; ii < RegArray::kElements; ++ii) {
+      half2& fp16x2_val = reinterpret_cast<__half2&>(r[ii]);
+      fp16x2_val = __hsub2(fp16x2_val, bias);  // remove the 1024 bias from both halves
+    }
+
+    return reinterpret_cast<PackedResultType&>(r);
+  }
+
+  friend class detail::VectorizedConverter;
+
+public:
+  CUTLASS_DEVICE
+  static result_type convert(source_type const &source) {
+    result_type result;
+    // Hand the work to the shared vectorized driver, offering the 4- and 2-element packed types declared above.
+    using ConverterType = NumericArrayConverter<typename result_type::Element, typename source_type::Element, N, Round>;
+    detail::VectorizedConverter::convert<ConverterType,
+                                         result_type_packed_4, source_type_packed_4,
+                                         result_type_packed_2, source_type_packed_2>(result, source);
+
+    return result;
+  }
+
+  CUTLASS_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Partial specialization for Array<cutlass::bfloat16_t, N> <= Array<cutlass::int4b_t, N>
+template <FloatRoundStyle Round, int N>
+struct NumericArrayConverter<cutlass::bfloat16_t, cutlass::int4b_t, N, Round> {
+  using result_type = Array<cutlass::bfloat16_t, N>;
+  using source_type = Array<cutlass::int4b_t, N>;
+
+  static FloatRoundStyle const round_style = Round;
+
+private:
+  using result_type_packed_8 = Array<cutlass::bfloat16_t, 8>;
+  using result_type_packed_4 = Array<cutlass::bfloat16_t, 4>;
+  using result_type_packed_2 = Array<cutlass::bfloat16_t, 2>;
+  using source_type_packed_8 = Array<cutlass::int4b_t, 8>;
+  using source_type_packed_4 = Array<cutlass::int4b_t, 4>;
+  using source_type_packed_2 = Array<cutlass::int4b_t, 2>;
+
+  using ScalarConverter = NumericConverter<cutlass::bfloat16_t, cutlass::int4b_t, Round>;
+  // to_reg: view the packed int4 source (8, 16 or 32 bits wide) as raw bits in one 32-bit value.
+  CUTLASS_DEVICE
+  static uint32_t to_reg(source_type_packed_2 const& source) {
+    return static_cast<uint32_t>(
+      reinterpret_cast<const uint8_t&>(source));
+  }
+
+  CUTLASS_DEVICE
+  static uint32_t to_reg(source_type_packed_4 const& source) {
+    return static_cast<uint32_t>(
+      reinterpret_cast<const uint16_t&>(source));
+  }
+
+  CUTLASS_DEVICE
+  static uint32_t to_reg(source_type_packed_8 const& source) {
+    return reinterpret_cast<const uint32_t&>(source);
+  }
+
+  // The core converter uses bit tricks to construct a known BF16 number, then does a
+  // subtraction in BF16 for the final result.
+  template <typename PackedResultType, typename PackedSrcType>
+  CUTLASS_DEVICE
+  static PackedResultType packed_convert(PackedSrcType const &source) {
+
+    static_assert((platform::is_same<PackedSrcType, source_type_packed_2>::value &&
+                   platform::is_same<PackedResultType, result_type_packed_2>::value) ||
+                  (platform::is_same<PackedSrcType, source_type_packed_4>::value &&
+                   platform::is_same<PackedResultType, result_type_packed_4>::value) ||
+                  (platform::is_same<PackedSrcType, source_type_packed_8>::value &&
+                   platform::is_same<PackedResultType, result_type_packed_8>::value),
+                  "Invalid PackedSrcType/PackedResultType must be 2, 4 or 8 to use private convert dispatch.");
+
+    // Hold output BF16s in reg. We need 1 reg for every 2 elements
+    using RegArray = cutlass::AlignedArray<uint32_t, PackedResultType::kElements / 2, sizeof(PackedResultType)>;
+    RegArray r;
+
+    // View the input as reg
+    uint32_t src_reg = to_reg(source);
+    uint32_t src_reg_shifted = src_reg >> 4;
+
+    // Below gathers byte ii of src_reg into the low half of r[ii] and byte ii of src_reg_shifted into its high half:
+    uint32_t const prmt_indices[4] = {0xF4F0, 0xF5F1, 0xF6F2, 0xF7F3};
+    static_assert(RegArray::kElements <= 4, "Too many inputs for BF16 -> I4 vector converter");
+    CUTLASS_PRAGMA_UNROLL
+    for (int ii = 0; ii < RegArray::kElements; ++ii) {
+      asm volatile(
+          "{\n"
+          "  prmt.b32 %0, %1, %2, %3;\n"
+          "}\n"
+          : "=r"(r[ii])
+          : "r"(src_reg), "r"(src_reg_shifted), "r"(prmt_indices[ii]));
+    }
+
+    // The below XOR does the following:
+    // 1) Sets the exponent bits of the BF16 to the correct value for the BF16 magic_num. We will be constructing
+    //    128 + (x + 8) and subtracting 136 to get x (the 0x8 in xor_mask flips bit 3 of the int4, which adds the 8)
+    static constexpr uint32_t xor_mask = 0x43084308;
+    static constexpr uint32_t and_mask = 0x000F000F;  // keep only the low nibble of each 16-bit half
+    static constexpr uint32_t immLut = (0xf0 & 0xcc) ^ 0xaa;
+
+    // For each operand, computes:
+    // r[i] = (r[i] & and_mask) ^ xor_mask
+    CUTLASS_PRAGMA_UNROLL
+    for (int ii = 0; ii < RegArray::kElements; ++ii) {
+      asm volatile(
+          "{\n"
+          "  lop3.b32 %0, %0, %1, %2, %3;\n"
+          "}\n"
+          : "+r"(r[ii])
+          : "n"(and_mask), "n"(xor_mask), "n"(immLut));
+    }
+
+    // We will issue one packed BF16x2 subtraction (__hsub2) per register that does the following:
+    // for both BF16 halves:
+    // hi_bf16 - 136, lo_bf16 - 136
+
+    // This is the BF16 {136, 136} represented as an integer.
+    static constexpr uint32_t bias_rep = 0x43084308;
+    const __nv_bfloat162& bias = reinterpret_cast<const __nv_bfloat162&>(bias_rep);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int ii = 0; ii < RegArray::kElements; ++ii) {
+      __nv_bfloat162& bf16x2_val = reinterpret_cast<__nv_bfloat162&>(r[ii]);
+      bf16x2_val = __hsub2(bf16x2_val, bias);
+    }
+
+    return reinterpret_cast<PackedResultType&>(r);
+  }
+
+  friend class detail::VectorizedConverter;
+
+public:
+  CUTLASS_DEVICE
+  static result_type convert(source_type const &source) {
+    result_type result;
+    using ConverterType = NumericArrayConverter<typename result_type::Element, typename source_type::Element, N, Round>;
+    detail::VectorizedConverter::convert<ConverterType,  // shared driver; offered the 8-, 4- and 2-element packed types
+                                         result_type_packed_8, source_type_packed_8,
+                                         result_type_packed_4, source_type_packed_4,
+                                         result_type_packed_2, source_type_packed_2>(result, source);
+
+    return result;
+  }
+
+  CUTLASS_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<cutlass::bfloat16_t, N> <= Array<int8_t, N>
+template <FloatRoundStyle Round, int N>
+struct NumericArrayConverter<cutlass::bfloat16_t, int8_t, N, Round> {
+  using result_type = Array<cutlass::bfloat16_t, N>;
+  using source_type = Array<int8_t, N>;
+  static FloatRoundStyle const round_style = Round;
+
+private:
+  using result_type_packed_4 = Array<cutlass::bfloat16_t, 4>;
+  using result_type_packed_2 = Array<cutlass::bfloat16_t, 2>;
+  using source_type_packed_4 = Array<int8_t, 4>;
+  using source_type_packed_2 = Array<int8_t, 2>;
+
+  using ScalarConverter = NumericConverter<cutlass::bfloat16_t, int8_t, Round>;
+  // to_reg: raw-bits views of the packed source; packed_convert in this specialization does not call them.
+  CUTLASS_DEVICE
+  static uint32_t to_reg(source_type_packed_2 const& source) {
+    return static_cast<uint32_t>(
+      reinterpret_cast<const uint16_t&>(source));
+  }
+
+  CUTLASS_DEVICE
+  static uint32_t to_reg(source_type_packed_4 const& source) {
+    return reinterpret_cast<const uint32_t&>(source);
+  }
+  // Converts one packed group of 2 or 4 elements.
+  template <typename PackedResultType, typename PackedSrcType>
+  CUTLASS_DEVICE
+  static PackedResultType packed_convert(PackedSrcType const &source) {
+
+    static_assert((platform::is_same<PackedSrcType, source_type_packed_2>::value &&
+                   platform::is_same<PackedResultType, result_type_packed_2>::value) ||
+                  (platform::is_same<PackedSrcType, source_type_packed_4>::value &&
+                   platform::is_same<PackedResultType, result_type_packed_4>::value),
+                  "Invalid PackedSrcType/PackedResultType must be 2 or 4 to use private convert dispatch.");
+    // No bit-trick path here: go int8 -> float -> bfloat16 by chaining two existing array converters.
+    NumericArrayConverter<float, int8_t, PackedResultType::kElements, Round> convert_int8_to_f32;
+    Array<float, PackedResultType::kElements> tmp = convert_int8_to_f32(source);
+    NumericArrayConverter<cutlass::bfloat16_t, float, PackedResultType::kElements, Round> convert_f32_to_bf16;
+    return convert_f32_to_bf16(tmp);
+  }
+
+  friend class detail::VectorizedConverter;
+
+public:
+  CUTLASS_DEVICE
+  static result_type convert(source_type const &source) {
+    result_type result;
+    // Hand the work to the shared vectorized driver, offering the 4- and 2-element packed types declared above.
+    using ConverterType = NumericArrayConverter<typename result_type::Element, typename source_type::Element, N, Round>;
+    detail::VectorizedConverter::convert<ConverterType,
+                                         result_type_packed_4, source_type_packed_4,
+                                         result_type_packed_2, source_type_packed_2>(result, source);
+
+    return result;
+  }
+
+  CUTLASS_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+/// Partial specialization for Array<cutlass::bfloat16_t, N> <= Array<uint8_t, N>
+template <FloatRoundStyle Round, int N>
+struct NumericArrayConverter<cutlass::bfloat16_t, uint8_t, N, Round> {
+  using result_type = Array<cutlass::bfloat16_t, N>;
+  using source_type = Array<uint8_t, N>;
+  static FloatRoundStyle const round_style = Round;
+
+private:
+  using result_type_packed_4 = Array<cutlass::bfloat16_t, 4>;
+  using result_type_packed_2 = Array<cutlass::bfloat16_t, 2>;
+  using source_type_packed_4 = Array<uint8_t, 4>;
+  using source_type_packed_2 = Array<uint8_t, 2>;
+
+  using ScalarConverter = NumericConverter<cutlass::bfloat16_t, uint8_t, Round>;
+  // to_reg: raw-bits views of the packed source; packed_convert in this specialization does not call them.
+  CUTLASS_DEVICE
+  static uint32_t to_reg(source_type_packed_2 const& source) {
+    return static_cast<uint32_t>(
+      reinterpret_cast<const uint16_t&>(source));
+  }
+
+  CUTLASS_DEVICE
+  static uint32_t to_reg(source_type_packed_4 const& source) {
+    return reinterpret_cast<const uint32_t&>(source);
+  }
+  // Converts one packed group of 2 or 4 elements.
+  template <typename PackedResultType, typename PackedSrcType>
+  CUTLASS_DEVICE
+  static PackedResultType packed_convert(PackedSrcType const &source) {
+
+    static_assert((platform::is_same<PackedSrcType, source_type_packed_2>::value &&
+                   platform::is_same<PackedResultType, result_type_packed_2>::value) ||
+                  (platform::is_same<PackedSrcType, source_type_packed_4>::value &&
+                   platform::is_same<PackedResultType, result_type_packed_4>::value),
+                  "Invalid PackedSrcType/PackedResultType must be 2 or 4 to use private convert dispatch.");
+    // No bit-trick path here: go uint8 -> float -> bfloat16 by chaining two existing array converters.
+    NumericArrayConverter<float, uint8_t, PackedResultType::kElements, Round> convert_uint8_to_f32;
+    Array<float, PackedResultType::kElements> tmp = convert_uint8_to_f32(source);
+    NumericArrayConverter<cutlass::bfloat16_t, float, PackedResultType::kElements, Round> convert_f32_to_bf16_;
+    return convert_f32_to_bf16_(tmp);
+  }
+
+  friend class detail::VectorizedConverter;
+
+public:
+  CUTLASS_DEVICE
+  static result_type convert(source_type const &source) {
+    result_type result;
+    using ConverterType = NumericArrayConverter<typename result_type::Element, typename source_type::Element, N, Round>;
+    detail::VectorizedConverter::convert<ConverterType,  // shared driver; offered the 4- and 2-element packed types
+                                         result_type_packed_4, source_type_packed_4,
+                                         result_type_packed_2, source_type_packed_2>(result, source);
+
+    return result;
+  }
+
+  CUTLASS_DEVICE
+  result_type operator()(source_type const &s) const {
+    return convert(s);
+  }
+};
+
+#endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// FastNumericArrayConverter only works when the source is within center range.
+/// Conversion operator for Array.  See the comments before
+/// FastLinearCombinationClamp. This primary template has no fast path of its own.
+template <typename T, typename S, int N,
+          FloatRoundStyle Round = FloatRoundStyle::round_to_nearest,
+          typename Enable = void>
+struct FastNumericArrayConverter {
+  using result_type = Array<T, N>;
+  using source_type = Array<S, N>;
+  static FloatRoundStyle const round_style = Round;
+
+  CUTLASS_DEVICE
+  static result_type convert(source_type const &s) {
+    NumericArrayConverter<T, S, N, Round> convert_;
+    // Generic fallback: defer to the regular NumericArrayConverter for this type pair.
+    return convert_(s);
+  }
+
+  CUTLASS_DEVICE
+  result_type operator()(source_type const &s) const { return convert(s); }
+};
+
+/// Partial specialization for Array<float> <= Array<int> (integer add on a float bit pattern; presumably valid only for |source| < 2^22 -- TODO confirm)
+template <int N, FloatRoundStyle Round>
+struct FastNumericArrayConverter<float, int, N, Round> {
+  using result_type = Array<float, N>;
+  using source_type = Array<int, N>;
+  static FloatRoundStyle const round_style = Round;
+
+  CUTLASS_DEVICE
+  static result_type convert(source_type const &source) {
+    result_type result;
+    // 0x4B400000 is the bit pattern of 12582912.0f (1.5 * 2^23), where one mantissa step equals 1.0.
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      int tmp = source[i] + 1262485504 /*0x4B400000*/;
+      result[i] = reinterpret_cast<float const &>(tmp) - 12582912.0f;  // read the sum back as a float, then remove the bias
+    }
+
+    return result;
+  }
+
+  CUTLASS_DEVICE
+  result_type operator()(source_type const &s) const { return convert(s); }
+};
+
+/// Partial specialization for Array<int8_t, 4> <= Array<float, 4>
+template <FloatRoundStyle Round>
+struct FastNumericArrayConverter<int8_t, float, 4, Round> {
+  using result_type = Array<int8_t, 4>;
+  using source_type = Array<float, 4>;
+  static FloatRoundStyle const round_style = Round;
+
+  CUTLASS_DEVICE
+  static result_type convert(source_type const &source) {
+    Array<int32_t, 4> result;
+    // Adding 12582912.0f (1.5 * 2^23) leaves the integer part of each input in the low bits of the float's bit pattern.
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < 4; ++i) {
+      float tmp = source[i] + 12582912.0f;
+      result[i] = reinterpret_cast<int32_t const &>(tmp);
+    }
+    // Gather the low byte of each of the four words into result[0]; no clamping to the int8 range is performed here.
+    result[0] = __byte_perm(result[0], result[1], 0x40);  // bytes 0,1 <- low bytes of result[0], result[1]
+    result[2] = __byte_perm(result[2], result[3], 0x40);  // bytes 0,1 <- low bytes of result[2], result[3]
+    result[0] = __byte_perm(result[0], result[2], 0x5410);  // concatenate the two byte pairs
+
+    return reinterpret_cast<result_type const &>(result[0]);
+  }
+
+  CUTLASS_DEVICE
+  result_type operator()(source_type const &s) const { return convert(s); }
+};
+
+/// Partial specialization for Array<int8_t> <= Array<float>
+template <int N, FloatRoundStyle Round>
+struct FastNumericArrayConverter<int8_t, float, N, Round> {
+  static_assert(!(N % 4), "N must be multiple of 4.");
+
+  using result_type = Array<int8_t, N>;
+  using source_type = Array<float, N>;
+  static FloatRoundStyle const round_style = Round;
+
+  CUTLASS_DEVICE
+  static result_type convert(source_type const &source) {
+    FastNumericArrayConverter<int8_t, float, 4, Round> convert_vector_;
+
+    result_type result;
+    // View both arrays as sequences of 4-element chunks so the 4-wide converter can be applied chunk by chunk.
+    Array<int8_t, 4> *result_ptr =
+        reinterpret_cast<Array<int8_t, 4> *>(&result);
+    Array<float, 4> const *source_ptr =
+        reinterpret_cast<Array<float, 4> const *>(&source);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 4; ++i) {
+      result_ptr[i] = convert_vector_(source_ptr[i]);
+    }
+
+    return result;
+  }
+
+  CUTLASS_DEVICE
+  result_type operator()(source_type const &s) const { return convert(s); }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines preferred rounding mode for a pair of types (primary template: round_to_nearest for every pair)
+template <typename T, typename S>
+struct PreferredRoundingMode {
+  static FloatRoundStyle const kRound = FloatRoundStyle::round_to_nearest;
+};
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 900
+/// Specialization for <tfloat32_t, float>: prefers round_half_ulp_truncate; only declared when __CUDA_ARCH__ is defined and below 900
+template <>
+struct PreferredRoundingMode<cutlass::tfloat32_t, float> {
+  static FloatRoundStyle const kRound = FloatRoundStyle::round_half_ulp_truncate;
+};
+#endif
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Packs predicates into an array: predicates[i] becomes bit (i % 8) of byte (i / 8) of the result's storage.
+template <int N>
+struct PackPredicates {
+  using result_type = Array<uint1b_t, N>;
+
+  static_assert(!(N % 4), "Must pack predicates in a count that is a multiple of 4");
+
+  CUTLASS_HOST_DEVICE
+  result_type operator()(bool const predicates[]) {
+
+    result_type packed;
+    packed.clear();
+
+    int const kWordSize = 8;  // bits per byte of the storage addressed through `bytes`
+    uint8_t *bytes = reinterpret_cast<uint8_t *>(packed.data());
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      int word_idx = (i / kWordSize);
+      int bit_idx = (i % kWordSize);
+      // OR the predicate into its bit; `packed` was cleared above, so untouched bits stay as clear() left them.
+      uint8_t mask = static_cast<uint8_t>((predicates[i] ? 1u : 0u) << bit_idx);
+      bytes[word_idx] = (bytes[word_idx] | mask);
+    }
+    return packed;
+  }
+};
+
+/// Unpacks predicates from a packed array: predicates[i] is read from bit (i % 8) of byte (i / 8) of `packed`.
+template <int N>
+struct UnpackPredicates {
+  using result_type = Array<uint1b_t, N>;
+
+  static_assert(!(N % 4), "Must unpack predicates in a count that is a multiple of 4");
+
+  CUTLASS_HOST_DEVICE
+  void operator()(bool predicates[], result_type const &packed) {
+
+    int const kWordSize = 8;  // bits per byte of the storage addressed through `bytes`
+    uint8_t const *bytes = reinterpret_cast<uint8_t const *>(packed.data());
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      int word_idx = (i / kWordSize);
+      int bit_idx = (i % kWordSize);
+      // Writes exactly N entries of the caller-provided predicates[] array.
+      predicates[i] = bool((bytes[word_idx] >> bit_idx) & 0x1);
+    }
+
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/numeric_size.h b/lightllm-kernel/cutlass/include/cutlass/numeric_size.h
new file mode 100755
index 000000000..4ff83bab8
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/numeric_size.h
@@ -0,0 +1,83 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*!
+    \file
+    \brief Defines sizeof_bits, bits_to_bytes and is_subbyte: size traits for CUTLASS numeric types.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines the size of an element in bits
+template <typename T>
+struct sizeof_bits {
+  static constexpr int value = int(sizeof(T) * 8);  // primary template: sizeof(T) bytes times 8
+};
+// A const-qualified type reports the same size as its unqualified type.
+template <typename T>
+struct sizeof_bits<T const>: sizeof_bits<T> {};
+// void is given a size of 0 bits (sizeof(void) is not a valid expression).
+template <>
+struct sizeof_bits<void> {
+  static constexpr int value = 0;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Returns the number of bytes required to hold a specified number of bits; R is the result type (defaults to int)
+template <class R = int, class T>
+CUTLASS_HOST_DEVICE
+constexpr
+R
+bits_to_bytes(T bits) {
+  return (R(bits) + R(7)) / R(8);  // adding 7 before dividing by 8 rounds up for non-negative bit counts
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// True when sizeof_bits<T>::value is less than 8, i.e. the element occupies less than one byte.
+template <class T>
+struct is_subbyte {
+  static constexpr bool value = sizeof_bits<T>::value < 8;
+};
+// A const-qualified type gives the same answer as its unqualified type.
+template <class T>
+struct is_subbyte<T const> : is_subbyte<T> {};
+
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/numeric_types.h b/lightllm-kernel/cutlass/include/cutlass/numeric_types.h
new file mode 100755
index 000000000..5519fbe7c
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/numeric_types.h
@@ -0,0 +1,88 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! 
+    \file
+    \brief Top-level include for all CUTLASS numeric types.
+*/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/platform/platform.h"
+#include "cutlass/numeric_size.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Compile-time index pack; make_index_sequence<N> names index_sequence<0, 1, ..., N-1>. Declared only, never defined here.
+template <size_t... Seq>
+struct index_sequence;
+// Recursive step: prepend N - 1 to the accumulated pack and continue with N - 1.
+template <size_t N, size_t... Next>
+struct index_sequence_helper : index_sequence_helper<N - 1, N - 1, Next...> {};
+// Terminator, reached once the counter and the first accumulated index are both 0.
+template <size_t... Next>
+struct index_sequence_helper<0, 0, Next...> {
+  using type = index_sequence<0, Next...>;
+};
+// NOTE(review): make_index_sequence<0> passes a single argument and so cannot match the terminator above -- confirm N >= 1 at all uses.
+template <size_t N>
+using make_index_sequence = typename index_sequence_helper<N>::type;
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Get the register type used in kernel
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+/// Maps a type to its unpacked element type; this primary template is the identity mapping.
+template<typename T>
+struct get_unpacked_element_type {
+  using type = T;
+};
+
+} // namespace detail
+
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "cutlass/integer_subbyte.h"
+#include "cutlass/half.h"
+#include "cutlass/bfloat16.h"
+#include "cutlass/tfloat32.h"
+#include "cutlass/float8.h"
+#include "cutlass/uint128.h"
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/pipeline/pipeline.hpp b/lightllm-kernel/cutlass/include/cutlass/pipeline/pipeline.hpp
new file mode 100755
index 000000000..0b5617976
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/pipeline/pipeline.hpp
@@ -0,0 +1,36 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "cutlass/pipeline/sm90_pipeline.hpp"
+////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/pipeline/sm90_pipeline.hpp b/lightllm-kernel/cutlass/include/cutlass/pipeline/sm90_pipeline.hpp
new file mode 100755
index 000000000..96bb8db74
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/pipeline/sm90_pipeline.hpp
@@ -0,0 +1,1173 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cute/layout.hpp"
+#include "cute/layout_composed.hpp"  // cute::composition
+#include "cute/swizzle.hpp"             // cute::Swizzle
+#include "cute/swizzle_layout.hpp"      // cute::composition
+#include "cute/util/type_traits.hpp"
+#include "cute/arch/cluster_sm90.hpp"
+#include "cute/container/array.hpp"
+#include "cute/numeric/integral_constant.hpp"
+
+#include "cutlass/cutlass.h"
+#include "cutlass/arch/barrier.h"
+#include "cutlass/detail/dependent_false.hpp"
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+using namespace cute;
+
+enum class BarrierStatus : uint32_t {  // Result of a barrier try/test-wait; tokens in this file wrap it.
+  WaitAgain = 0,  // same value as static_cast<BarrierStatus>(false): a blocking wait is still required
+  WaitDone  = 1   // same value as static_cast<BarrierStatus>(true): no further wait is required
+};
+
+class ArrivalToken {  // Value wrapper around the BarrierStatus produced by a barrier try/test-wait.
+public:
+  // A token always carries a status, so there is no default constructor.
+  CUTLASS_HOST_DEVICE
+  ArrivalToken() = delete;
+
+  // Non-explicit: call sites in this file build tokens from a braced BarrierStatus.
+  CUTLASS_HOST_DEVICE
+  ArrivalToken(BarrierStatus barrier_status) : barrier_status_{barrier_status} {}
+
+  // Returns the wrapped status.
+  CUTLASS_HOST_DEVICE
+  BarrierStatus get() const { return barrier_status_; }
+
+  // Two tokens compare equal when they wrap the same status.
+  CUTLASS_HOST_DEVICE
+  bool operator==(ArrivalToken const& other) const {
+    return other.barrier_status_ == barrier_status_;
+  }
+
+  // Mixed comparisons against a raw BarrierStatus, in either operand order.
+  CUTLASS_HOST_DEVICE
+  friend bool operator==(ArrivalToken const& token, BarrierStatus const& status) {
+    return token.barrier_status_ == status;
+  }
+  CUTLASS_HOST_DEVICE
+  friend bool operator==(BarrierStatus const& status, ArrivalToken const& token) {
+    return status == token.barrier_status_;
+  }
+  CUTLASS_HOST_DEVICE
+  friend bool operator!=(ArrivalToken const& token, BarrierStatus const& status) {
+    return !(token.barrier_status_ == status);
+  }
+  CUTLASS_HOST_DEVICE
+  friend bool operator!=(BarrierStatus const& status, ArrivalToken const& token) {
+    return !(status == token.barrier_status_);
+  }
+
+private:
+  BarrierStatus barrier_status_;
+};
+
+class ProducerToken : public ArrivalToken {  // Token type returned by the producer_try_acquire functions in this file.
+  using ArrivalToken::ArrivalToken;  // inherit ArrivalToken's constructors
+};
+
+class ConsumerToken : public ArrivalToken {  // Token type returned by the consumer_try_wait / consumer_test_wait functions.
+  using ArrivalToken::ArrivalToken;  // inherit ArrivalToken's constructors
+};
+
+// Circular Buffer Index + Associated Phase
+// Assumes only one operation possible - i.e., ++
+template<uint32_t Stages_>
+struct PipelineState {
+  // Circular-buffer cursor: stage index, a phase bit toggled on every wrap-around, and a count of advances.
+  static constexpr uint32_t Stages = Stages_;
+
+  int index_ = 0;
+  uint32_t phase_ = 0;
+  uint32_t count_ = 0;
+
+  // Starts at stage 0, phase 0, count 0.
+  CUTLASS_DEVICE
+  PipelineState() : index_(0), phase_(0), count_(0) {}
+
+  // Starts from explicitly supplied values.
+  CUTLASS_DEVICE
+  PipelineState(int index, uint32_t phase, uint32_t count)
+    : index_(index), phase_(phase), count_(count) {}
+
+  // Current stage index.
+  CUTLASS_DEVICE
+  int index() const { return index_; }
+
+  // Current phase bit.
+  CUTLASS_DEVICE
+  uint32_t phase() const { return phase_; }
+
+  // Total number of advances applied so far.
+  CUTLASS_DEVICE
+  uint32_t count() const { return count_; }
+
+  // Steps to the next stage; wrapping back to stage 0 toggles the phase. No-op when Stages == 0.
+  CUTLASS_DEVICE
+  void operator++() {
+    if constexpr (Stages > 0) {
+      ++count_;
+      if (++index_ == Stages) {
+        index_ = 0;
+        phase_ ^= 1;
+      }
+    }
+  }
+
+  // Same as advance(num_iterations).
+  CUTLASS_DEVICE
+  PipelineState& operator+=(uint32_t num_iterations) {
+    return advance(num_iterations);
+  }
+
+  // Copies index, phase and count from other.
+  CUTLASS_DEVICE
+  PipelineState& operator=(PipelineState const& other) {
+    index_ = other.index_;
+    phase_ = other.phase_;
+    count_ = other.count_;
+    return *this;
+  }
+
+  // Moves forward by num_iterations stages in one step and returns *this. No-op when Stages == 0.
+  CUTLASS_DEVICE
+  PipelineState& advance(uint32_t num_iterations) {
+    if constexpr (Stages > 0) {
+      uint32_t const target = index_ + num_iterations;
+      // Fewer than Stages steps: the phase toggles iff the stage boundary is
+      // crossed. Otherwise it toggles iff target / Stages is an odd number.
+      bool const toggles = (num_iterations < Stages) ? (target >= Stages)
+                                                     : (((target / Stages) % 2) == 1);
+      if (toggles) {
+        phase_ ^= 1;
+      }
+      index_ = target % Stages;
+      count_ += num_iterations;
+    }
+    return *this;
+  }
+
+  // Returns start_state advanced by num_iterations (start_state is taken by value).
+  CUTLASS_DEVICE
+  static PipelineState make_pipeline_state(PipelineState start_state, uint32_t num_iterations) {
+    return start_state.advance(num_iterations);
+  }
+};
+
+// Returns the state a producer starts from: stage 0, phase 1, count 0.
+template<class Pipeline>
+CUTLASS_DEVICE
+PipelineState<Pipeline::Stages> make_producer_start_state() {
+  // Phase 1 is the opposite of a default-constructed state's phase 0: the
+  // buffers are initially empty, so the producer starts on the opposite phase.
+  using StartState = PipelineState<Pipeline::Stages>;
+  return StartState(/* index = */ 0, /* phase = */ 1u, /* count = */ 0u);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// TMA load (producer) Async Pipeline class
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Assumptions : Constructor is visible Cluster-wide (as it needs a Cluster-Sync)
+// We have exactly one thread elected in the Producer as the "leader"
+// Currently, it is optional to elect a leader for the Consumers
+template <int Stages_>
+class PipelineTmaAsync {
+public :
+  using FullBarrier = cutlass::arch::ClusterTransactionBarrier;
+  using EmptyBarrier = cutlass::arch::ClusterBarrier;
+  using ProducerBarrierType = FullBarrier::ValueType;
+  using ConsumerBarrierType = EmptyBarrier::ValueType;
+  static constexpr uint32_t Stages = Stages_;
+  using PipelineState = cutlass::PipelineState<Stages>;
+  // Barrier storage: one full barrier and one empty barrier per stage.
+  struct SharedStorage {
+    FullBarrier full_barrier_[Stages];
+    EmptyBarrier empty_barrier_[Stages];
+  };
+  // Role of the calling thread; consulted only by the NDEBUG-guarded checks in this class.
+  enum class ThreadCategory {
+    NonParticipant,
+    Producer,
+    Consumer,
+    ProducerConsumer
+  };
+
+  struct Params {
+    uint32_t transaction_bytes = 0;  // byte count passed to arrive_and_expect_tx in producer_acquire
+    ThreadCategory role = ThreadCategory::NonParticipant;  // read only when NDEBUG is not defined
+    uint32_t is_leader = 0;  // nonzero: this thread issues arrive_and_expect_tx in producer_acquire
+    uint32_t num_consumers = 0;  // number of consumer threads (divided by NumThreadsPerWarpGroup in the constructor)
+  };
+
+  // Constructor: initializes the barriers and precomputes which block, if any, this thread signals on release
+  template<class ClusterShape>
+  CUTLASS_DEVICE
+  PipelineTmaAsync(SharedStorage& storage, Params params, ClusterShape cluster_shape)
+      : params_(params)
+      , full_barrier_ptr_(&storage.full_barrier_[0])
+      , empty_barrier_ptr_(&storage.empty_barrier_[0]) {
+
+    int warp_idx = canonical_warp_idx_sync();
+    int lane_predicate = cute::elect_one_sync();
+    // Only the elected lane of warp 0 initializes the barriers.
+    if (warp_idx == 0 && lane_predicate == 1) {
+      // Barrier FULL init: every stage expects a single arrival
+      for (int i = 0; i < Stages; ++i) {
+        full_barrier_ptr_[i].init(1);
+      }
+      uint32_t const num_consumer_warpgroups_per_cluster = params_.num_consumers / NumThreadsPerWarpGroup;
+      uint32_t const multicast_consumer_arrival_count = (cute::size<0>(cluster_shape) + cute::size<1>(cluster_shape) - 1) *
+          num_consumer_warpgroups_per_cluster;
+      // Barrier EMPTY init
+      for (int i = 0; i < Stages; ++i) {
+        empty_barrier_ptr_[i].init(multicast_consumer_arrival_count);
+      }
+    }
+    cutlass::arch::fence_barrier_init();
+
+    // Logic to optimally schedule Empty Arrives
+    // Goal : To divide SYNCS Empty Arrival duty equally amongst the Warp-Group (128 threads)
+    dim3 block_id = cute::block_id_in_cluster();
+    auto cluster_size = cute::size(cluster_shape);
+    static constexpr int MaxClusterSize = 16;
+
+    // STEP 1 : Use Cute Layout function to generate an optimal dst block-id (0-15)
+    if (params_.num_consumers % NumThreadsPerWarpGroup == 0) {
+      int thread_idx = threadIdx.x % NumThreadsPerWarpGroup;
+      is_signalling_thread_ = (thread_idx % (NumThreadsPerWarpGroup / MaxClusterSize)) == 0;
+      auto layout = cute::composition(Swizzle<2,0,-2>{},
+                                      Layout<Shape<_4,_4>,Stride<_4,_1>>{});
+      uint32_t thread_row = warp_idx % 4;
+      uint32_t thread_col = (thread_idx / 8) % 4;
+      dst_blockid_ = layout(thread_row, thread_col);
+    }
+    else if (params_.num_consumers == 32) {
+      int thread_idx = threadIdx.x % 32;
+      is_signalling_thread_ = (thread_idx % (32 / MaxClusterSize)) == 0;
+      auto layout = Layout<Shape<_4,_4>,Stride<_4, _1>>{};
+      uint32_t thread_row = thread_idx / 8;
+      uint32_t thread_col = (thread_idx % 8) / 2;
+      dst_blockid_ = layout(thread_row, thread_col);
+    }
+    else {  // any other num_consumers: this thread never signals; builds without NDEBUG hit a breakpoint
+      is_signalling_thread_ = 0;
+      #ifndef NDEBUG
+        asm volatile ("brkpt;\n" ::);
+      #endif
+    }
+
+    // STEP 2: Find if this dst block-id needs an arrival for this problem
+    is_signalling_thread_ &= dst_blockid_ < cluster_size;
+    is_signalling_thread_ &= is_same_row_or_col(dst_blockid_, block_id, cluster_shape);
+  }
+  // True when dst_block_id % size<0> equals block_id.x or dst_block_id / size<0> equals block_id.y.
+  template <class ClusterShape>
+  CUTLASS_DEVICE
+  bool is_same_row_or_col(int dst_block_id, dim3 block_id, ClusterShape cluster_shape) {
+    return (((dst_block_id % cute::size<0>(cluster_shape)) == block_id.x) ||
+            (
+              ((dst_block_id / cute::size<0>(cluster_shape)) == block_id.y)
+            ));
+  }
+
+  ////////////////////
+  // Producer APIs
+  ////////////////////
+  // Four member functions are always used in pairs:
+  //
+  // * producer_try_acquire and producer_acquire, and
+  // * consumer_try_wait and consumer_wait.
+  //
+  // The two functions with "try" in their names are called "try" functions,
+  // and the other two are conceptually "finalize" functions.
+  // The "try" function in each pair starts the process of waiting on the barrier to flip.
+  // It opportunistically waits for an implementation-dependent timeout.
+  // Whether or not the barrier has flipped yet, the try function will return a token.
+  // If the token indicates that the barrier has not flipped,
+  // then the token must be passed into the corresponding "finalize" function.
+  // The finalize function will then block until the barrier has flipped.
+  // If the token indicates that the barrier _has_ flipped,
+  // then it is still correct to pass it into the finalize function.
+  // The finalize function will return immediately in that case.
+
+  CUTLASS_DEVICE
+  ProducerToken producer_try_acquire(PipelineState state, uint32_t skip_wait = false) {
+    return producer_try_acquire(state.index(), state.phase(), skip_wait);
+  }
+  // With the default token (WaitAgain) this always blocks on the stage's empty barrier.
+  CUTLASS_DEVICE
+  void producer_acquire(PipelineState state, ProducerToken barrier_token = {BarrierStatus::WaitAgain}) {
+    producer_acquire(state.index(), state.phase(), barrier_token);
+  }
+  // Forwards to the private overload, whose body is empty unless CUTLASS_UNIT_TEST_PIPELINE is nonzero.
+  CUTLASS_DEVICE
+  void producer_commit(PipelineState state, uint32_t bytes) {
+    producer_commit(state.index(), bytes);
+  }
+
+  // Prevents early exit of producer blocks in Cluster.
+  // This should be called once before kernel exits.
+  CUTLASS_DEVICE
+  void producer_tail(PipelineState state) {
+    for (int count = 0; count < Stages; ++count) {  // waits on the empty barrier of Stages consecutive stages
+      empty_barrier_ptr_[state.index()].wait(state.phase());
+      ++state;
+    }
+  }
+
+  CUTLASS_DEVICE
+  ProducerBarrierType* producer_get_barrier(PipelineState state) {
+    return producer_get_barrier(state.index());
+  }
+
+  ////////////////////
+  // Consumer APIs
+  ////////////////////
+  CUTLASS_DEVICE
+  ConsumerToken consumer_try_wait(PipelineState state, uint32_t skip_wait = false) {
+    return consumer_try_wait(state.index(), state.phase(), skip_wait);
+  }
+
+  CUTLASS_DEVICE
+  ConsumerToken consumer_test_wait(PipelineState state, uint32_t skip_wait = false) {
+    return consumer_test_wait(state.index(), state.phase(), skip_wait);
+  }
+  // Blocks unconditionally on the stage's full barrier.
+  CUTLASS_DEVICE
+  void consumer_wait(PipelineState state) {
+    consumer_wait(state.index(), state.phase());
+  }
+  // Blocks on the stage's full barrier only when barrier_token is WaitAgain.
+  CUTLASS_DEVICE
+  void consumer_wait(PipelineState state, ConsumerToken barrier_token) {
+    consumer_wait(state.index(), state.phase(), barrier_token);
+  }
+
+  CUTLASS_DEVICE
+  void consumer_release(PipelineState state) {
+    consumer_release(state.index());
+  }
+
+private :
+  uint32_t dst_blockid_ = 0;  // block id this thread arrives on in consumer_release (set by the constructor)
+  uint32_t is_signalling_thread_ = 0;  // nonzero: this thread's arrive in consumer_release is enabled
+  FullBarrier *full_barrier_ptr_ = nullptr;
+  EmptyBarrier *empty_barrier_ptr_ = nullptr;
+  Params params_;
+  // Returns WaitDone when skip_wait is nonzero; otherwise the result of try_wait on the stage's empty barrier.
+  CUTLASS_DEVICE
+  ProducerToken producer_try_acquire(uint32_t stage, uint32_t phase, uint32_t skip_wait) {
+    if (skip_wait) {
+      return {BarrierStatus::WaitDone};
+    }
+    bool barrier_status = empty_barrier_ptr_[stage].try_wait(phase);
+    return {static_cast<BarrierStatus>(barrier_status)};  // true -> WaitDone, false -> WaitAgain
+  }
+  // Blocks on the empty barrier unless the token is WaitDone; the leader then arms the full barrier.
+  CUTLASS_DEVICE
+  void producer_acquire(uint32_t stage, uint32_t phase, ProducerToken barrier_token) {
+    if (barrier_token != BarrierStatus::WaitDone) {
+      empty_barrier_ptr_[stage].wait(phase);
+    }
+
+    if (params_.is_leader) {
+      full_barrier_ptr_[stage].arrive_and_expect_tx(params_.transaction_bytes);
+    }
+    #ifndef NDEBUG
+    if (params_.role == ThreadCategory::Consumer || params_.role == ThreadCategory::NonParticipant) {
+      asm volatile ("brkpt;\n" ::);
+    }
+
+    // Most likely you have elected more than one leader
+    if (params_.is_leader && (threadIdx.x % 32 != 0)) {
+      asm volatile ("brkpt;\n" ::);
+    }
+    #endif
+  }
+
+  // NOP for TMA based mainloop
+  CUTLASS_DEVICE
+  void producer_commit(uint32_t stage, uint32_t bytes) {
+    // Below code is used only for unit-testing (in the absence of TMA commit)
+    #if CUTLASS_UNIT_TEST_PIPELINE
+      if (params_.is_leader) {
+        // STEP 1 : Commit to self
+        full_barrier_ptr_[stage].complete_transaction(bytes);
+
+        // STEP 2 : Commit to other blocks in our cluster
+        auto cluster_shape = cute::cluster_shape();
+        Layout block_layout_in_cluster = make_layout(cluster_shape);
+        dim3 local_block_id = cute::block_id_in_cluster();
+
+        CUTLASS_PRAGMA_UNROLL
+        for(int n = 0; n < size<1>(block_layout_in_cluster); ++n) {
+          uint32_t dst_block_id = block_layout_in_cluster(local_block_id.x,n,Int<0>{});
+          full_barrier_ptr_[stage].complete_transaction(dst_block_id, bytes, n!=local_block_id.y);
+        }
+
+        CUTLASS_PRAGMA_UNROLL
+        for(int m = 0; m < size<0>(block_layout_in_cluster); ++m) {
+          uint32_t dst_block_id = block_layout_in_cluster(m,local_block_id.y,Int<0>{});
+          full_barrier_ptr_[stage].complete_transaction(dst_block_id, bytes, m!=local_block_id.x);
+        }
+      }
+    #endif
+  }
+  // Returns WaitDone when skip_wait is nonzero; otherwise the result of try_wait on the stage's full barrier.
+  CUTLASS_DEVICE
+  ConsumerToken consumer_try_wait(uint32_t stage, uint32_t phase, uint32_t skip_wait) {
+    if (skip_wait) {
+      return {BarrierStatus::WaitDone};
+    }
+    bool barrier_status = full_barrier_ptr_[stage].try_wait(phase);
+    return {static_cast<BarrierStatus>(barrier_status)};
+  }
+  // Same as consumer_try_wait, but queries the full barrier with test_wait instead of try_wait.
+  CUTLASS_DEVICE
+  ConsumerToken consumer_test_wait(uint32_t stage, uint32_t phase, uint32_t skip_wait) {
+    if (skip_wait) {
+      return {BarrierStatus::WaitDone};
+    }
+    bool barrier_status = full_barrier_ptr_[stage].test_wait(phase);
+    return {static_cast<BarrierStatus>(barrier_status)};
+  }
+
+  // Wait for producer to commit transactions (done by TMA)
+  CUTLASS_DEVICE
+  void consumer_wait(uint32_t stage, uint32_t phase) {
+    full_barrier_ptr_[stage].wait(phase);
+  }
+
+  // Wait for producer to commit transactions (done by TMA)
+  CUTLASS_DEVICE
+  void consumer_wait(uint32_t stage, uint32_t phase, ConsumerToken barrier_token) {
+    if (barrier_token == BarrierStatus::WaitAgain) {
+      full_barrier_ptr_[stage].wait(phase);
+    }
+  }
+
+  // Consumer signalling Producer of completion
+  // Ensures all blocks in the Same Row and Column get notified.
+  CUTLASS_DEVICE
+  void consumer_release(uint32_t stage, uint32_t skip = false) {
+    empty_barrier_ptr_[stage].arrive(dst_blockid_, is_signalling_thread_ & (!skip));  // predicate is 0 when skip is nonzero
+    #ifndef NDEBUG
+    if (params_.role == ThreadCategory::Producer || params_.role == ThreadCategory::NonParticipant) {
+      asm volatile ("brkpt;\n" ::);
+    }
+    #endif
+  }
+  // Returns the stage's full barrier reinterpreted as a ProducerBarrierType pointer.
+  CUTLASS_DEVICE
+  ProducerBarrierType* producer_get_barrier(uint32_t stage) {
+    return reinterpret_cast<ProducerBarrierType*>(&full_barrier_ptr_[stage]);
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// TMA store pipeline class
+// producer-only class, no async barriers between threads because consumer is TMA unit
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////
+template <
+  int Stages_,
+  // The number of committed TMA store batches that can be in flight upon return of producer acquire
+  int UnacquiredStages_ = Stages_-1
+>
+class PipelineTmaStore {
+public:
+  static constexpr uint32_t Stages = Stages_;
+  static_assert(Stages_ > 0);  // Stages_ == 0 is served by the explicit <0, -1> specialization in this file
+  static_assert(UnacquiredStages_ >= 0);
+  static constexpr uint32_t UnacquiredStages = static_cast<uint32_t>(UnacquiredStages_);
+  using PipelineState = cutlass::PipelineState<Stages>;
+
+  struct Params {
+    bool always_wait = false;  // when true, producer_acquire waits on every call regardless of the state's count
+  };
+
+  CUTLASS_DEVICE
+  PipelineTmaStore(Params params = {}) : params_(params) {}
+
+  ////////////////////
+  // Producer APIs
+  ////////////////////
+  // Wait for the least recently committed batch of TMA stores to complete
+  CUTLASS_DEVICE
+  void producer_acquire(PipelineState state) {
+    producer_acquire(state.index(), state.count());
+  }
+
+  // Commit the most recently issued batch of TMA stores
+  CUTLASS_DEVICE
+  void producer_commit(PipelineState state) {
+    producer_commit(state.index(), state.count());
+  }
+
+  // Wait for all TMA stores to complete
+  CUTLASS_DEVICE
+  void producer_tail([[maybe_unused]] PipelineState state) {
+    tma_store_wait<0>();
+  }
+
+private:
+  Params params_;
+
+  // Wait for the least recently committed batch of TMA stores to complete
+  // or until at most UnacquiredStages TMA store batches are in-flight (if specified)
+  CUTLASS_DEVICE
+  void producer_acquire([[maybe_unused]] uint32_t stage, uint32_t count) {
+    if (params_.always_wait || count > UnacquiredStages) {  // skipped while count <= UnacquiredStages unless always_wait
+      tma_store_wait<UnacquiredStages>();
+    }
+  }
+
+  // Commit the most recently issued batch of TMA stores
+  CUTLASS_DEVICE
+  void producer_commit([[maybe_unused]] uint32_t stage, [[maybe_unused]] uint32_t count) {
+    tma_store_arrive();  // stage and count are not used
+  }
+};
+
+template <>
+class PipelineTmaStore< /* Stages_ = */ 0, /* UnacquiredStages = Stages_ - 1 = */ -1 > {
+public:
+  static constexpr uint32_t Stages = 0;
+  static constexpr uint32_t UnacquiredStages = 0;  // exposed as 0 although the template argument is -1
+  using PipelineState = cutlass::PipelineState<Stages>;
+
+  struct Params {
+    bool always_wait = false;  // stored but not read by any member of this specialization
+  };
+
+  PipelineTmaStore() = default;
+  CUTLASS_DEVICE
+    PipelineTmaStore(Params params) : params_(params) {}
+
+  ////////////////////
+  // Producer APIs
+  ////////////////////
+  // Calling producer_acquire on this specialization is a compile-time error (static_assert fires on instantiation).
+  template<class ThisTemplateParameterExistsOnlyForDependentFalse = int>
+  CUTLASS_DEVICE
+    void producer_acquire(PipelineState /* state */,
+      ThisTemplateParameterExistsOnlyForDependentFalse* /* unused */ = nullptr) {
+    static_assert(cutlass::detail::dependent_false<ThisTemplateParameterExistsOnlyForDependentFalse>,
+      "It is never valid to call PipelineTmaStore<0>::producer_acquire");
+  }
+
+  // Commit the most recently issued batch of TMA stores
+  CUTLASS_DEVICE
+    void producer_commit(PipelineState state) {
+    producer_commit(state.index(), state.count());
+  }
+
+  // Wait for all TMA stores to complete
+  CUTLASS_DEVICE
+    void producer_tail([[maybe_unused]] PipelineState state) {
+    tma_store_wait<0>();
+  }
+
+private:
+  Params params_;
+
+  // Commit the most recently issued batch of TMA stores
+  CUTLASS_DEVICE
+    void producer_commit([[maybe_unused]] uint32_t stage, [[maybe_unused]] uint32_t count) {
+    tma_store_arrive();  // stage and count are not used
+  }
+};
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Simple producer-consumer async Pipeline class using producer transaction barriers
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////
+template <int Stages_>
+class PipelineTransactionAsync {
+public :
+  using FullBarrier = cutlass::arch::ClusterTransactionBarrier;
+  using EmptyBarrier = cutlass::arch::ClusterBarrier;
+  using ProducerBarrierType = FullBarrier::ValueType;
+  using ConsumerBarrierType = EmptyBarrier::ValueType;
+  static constexpr uint32_t Stages = Stages_;
+  using PipelineState = cutlass::PipelineState<Stages>;
+  // Barrier storage: one full barrier and one empty barrier per stage.
+  struct SharedStorage {
+    cute::array<FullBarrier, Stages> full_barrier_;
+    cute::array<EmptyBarrier, Stages> empty_barrier_;
+  };
+
+  enum class ThreadCategory {
+    NonParticipant,
+    Producer,
+    Consumer,
+    ProducerConsumer
+  };
+
+  struct Params {
+    ThreadCategory role = ThreadCategory::NonParticipant;  // stored; not read by any member of this class
+    uint32_t transaction_bytes = 0;  // byte count passed to expect_transaction in producer_expect_transaction
+    uint32_t producer_arv_count = 1;  // arrival count every full barrier is initialized with
+    uint32_t consumer_arv_count = 1;  // arrival count every empty barrier is initialized with
+    uint32_t dst_blockid = cute::block_rank_in_cluster();  // block id passed to arrive() in producer_commit / consumer_release
+  };
+
+  // Constructor
+  CUTLASS_DEVICE
+  PipelineTransactionAsync(SharedStorage& storage, Params const& params)
+    : params_(params)
+    , full_barrier_ptr_(storage.full_barrier_.data())
+    , empty_barrier_ptr_(storage.empty_barrier_.data()) {
+    int warp_idx = canonical_warp_idx_sync();
+    int lane_predicate = cute::elect_one_sync();
+
+    // Barrier FULL, EMPTY init
+    // Init is done only by the elected lane of warp 0
+    if (warp_idx == 0 && lane_predicate) {
+      for (int i = 0; i < Stages; ++i) {
+        full_barrier_ptr_[i].init(params.producer_arv_count);
+        empty_barrier_ptr_[i].init(params.consumer_arv_count);
+      }
+    }
+    cutlass::arch::fence_barrier_init();
+  }
+
+  ////////////////////
+  // Producer APIs
+  ////////////////////
+  // Four member functions are always used in pairs:
+  //
+  // * producer_try_acquire and producer_acquire, and
+  // * consumer_try_wait and consumer_wait.
+  //
+  // The two functions with "try" in their names are called "try" functions,
+  // and the other two are conceptually "finalize" functions.
+  // The "try" function in each pair starts the process of waiting on the barrier to flip.
+  // It opportunistically waits for an implementation-dependent timeout.
+  // Whether or not the barrier has flipped yet, the try function will return a token.
+  // If the token indicates that the barrier has not flipped,
+  // then the token must be passed into the corresponding "finalize" function.
+  // The finalize function will then block until the barrier has flipped.
+  // If the token indicates that the barrier _has_ flipped,
+  // then it is still correct to pass it into the finalize function.
+  // The finalize function will return immediately in that case.
+  CUTLASS_DEVICE
+  ProducerToken producer_try_acquire(PipelineState state, uint32_t skip_wait = false) {
+    return producer_try_acquire(state.index(), state.phase(), skip_wait);
+  }
+  // With the default token (WaitAgain) this always blocks on the stage's empty barrier.
+  CUTLASS_DEVICE
+  void producer_acquire(PipelineState state, ProducerToken barrier_token = {BarrierStatus::WaitAgain}) {
+    producer_acquire(state.index(), state.phase(), barrier_token);
+  }
+
+  // Perform an expect-tx operation on the stage's full barrier. Must be called by 1 thread
+  CUTLASS_DEVICE
+  void producer_expect_transaction(PipelineState state) {
+    producer_expect_transaction(state.index());
+  }
+  // Arrives on the stage's full barrier, targeting params_.dst_blockid.
+  CUTLASS_DEVICE
+  void producer_commit(PipelineState state) {
+    producer_commit(state.index());
+  }
+
+  // Prevents early exit of producer blocks in Cluster.
+  // This should be called once before kernel exits.
+  CUTLASS_DEVICE
+  void producer_tail(PipelineState state) {
+    for (int count = 0; count < Stages; ++count) {  // blocking acquire on Stages consecutive stages
+      producer_acquire(state);
+      ++state;
+    }
+  }
+
+  CUTLASS_DEVICE
+  ProducerBarrierType* producer_get_barrier(PipelineState state) {
+    return producer_get_barrier(state.index());
+  }
+
+  ////////////////////
+  // Consumer APIs
+  ////////////////////
+  CUTLASS_DEVICE
+  ConsumerToken consumer_try_wait(PipelineState state, uint32_t skip_wait = false) {
+    return consumer_try_wait(state.index(), state.phase(), skip_wait);
+  }
+
+  CUTLASS_DEVICE
+  ConsumerToken consumer_test_wait(PipelineState state, uint32_t skip_wait = false) {
+    return consumer_test_wait(state.index(), state.phase(), skip_wait);
+  }
+  // With the default token (WaitAgain) this always blocks on the stage's full barrier.
+  CUTLASS_DEVICE
+  void consumer_wait(PipelineState state, ConsumerToken barrier_token = {BarrierStatus::WaitAgain}) {
+    consumer_wait(state.index(), state.phase(), barrier_token);
+  }
+  // Arrives on the stage's empty barrier, targeting params_.dst_blockid.
+  CUTLASS_DEVICE
+  void consumer_release(PipelineState state) {
+    consumer_release(state.index());
+  }
+
+private:
+  FullBarrier *full_barrier_ptr_ = nullptr;
+  EmptyBarrier *empty_barrier_ptr_ = nullptr;
+  Params params_;
+  // Returns WaitDone when skip_wait is nonzero; otherwise the result of try_wait on the stage's empty barrier.
+  CUTLASS_DEVICE
+  ProducerToken producer_try_acquire(uint32_t stage, uint32_t phase, uint32_t skip_wait) {
+    if (skip_wait) {
+      return {BarrierStatus::WaitDone};
+    }
+    bool barrier_status = empty_barrier_ptr_[stage].try_wait(phase);
+    return {static_cast<BarrierStatus>(barrier_status)};  // true -> WaitDone, false -> WaitAgain
+  }
+  // Blocks on the stage's empty barrier only when the token is WaitAgain.
+  CUTLASS_DEVICE
+  void producer_acquire(uint32_t stage, uint32_t phase, ProducerToken barrier_token) {
+    if (barrier_token == BarrierStatus::WaitAgain) {
+      empty_barrier_ptr_[stage].wait(phase);
+    }
+  }
+
+  // Perform an expect-tx operation on the stage's full barrier. Must be called by 1 thread
+  CUTLASS_DEVICE
+  void producer_expect_transaction(uint32_t stage) {
+    full_barrier_ptr_[stage].expect_transaction(params_.transaction_bytes);
+  }
+
+  CUTLASS_DEVICE
+  void producer_commit(uint32_t stage) {
+    full_barrier_ptr_[stage].arrive(params_.dst_blockid);
+  }
+  // Returns the stage's full barrier reinterpreted as a ProducerBarrierType pointer.
+  CUTLASS_DEVICE
+  ProducerBarrierType* producer_get_barrier(uint32_t stage) {
+    return reinterpret_cast<ProducerBarrierType*>(&full_barrier_ptr_[stage]);
+  }
+  // Returns WaitDone when skip_wait is nonzero; otherwise the result of try_wait on the stage's full barrier.
+  CUTLASS_DEVICE
+  ConsumerToken consumer_try_wait(uint32_t stage, uint32_t phase, uint32_t skip_wait) {
+    if (skip_wait) {
+      return {BarrierStatus::WaitDone};
+    }
+    bool barrier_status = full_barrier_ptr_[stage].try_wait(phase);
+    return {static_cast<BarrierStatus>(barrier_status)};
+  }
+  // Same as consumer_try_wait, but queries the full barrier with test_wait instead of try_wait.
+  CUTLASS_DEVICE
+  ConsumerToken consumer_test_wait(uint32_t stage, uint32_t phase, uint32_t skip_wait) {
+    if (skip_wait) {
+      return {BarrierStatus::WaitDone};
+    }
+    bool barrier_status = full_barrier_ptr_[stage].test_wait(phase);
+    return {static_cast<BarrierStatus>(barrier_status)};
+  }
+  // Blocks on the stage's full barrier only when the token is WaitAgain.
+  CUTLASS_DEVICE
+  void consumer_wait(uint32_t stage, uint32_t phase, ConsumerToken barrier_token) {
+    if (barrier_token == BarrierStatus::WaitAgain) {
+      full_barrier_ptr_[stage].wait(phase);
+    }
+  }
+
+  CUTLASS_DEVICE
+  void consumer_release(uint32_t stage, uint32_t skip = false) {
+    empty_barrier_ptr_[stage].arrive(params_.dst_blockid, (not skip));  // predicate is false when skip is nonzero
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Simple producer-consumer async Pipeline class
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace PipelineDetail {
+  // State type used by PipelineAsync: an alias for the generic PipelineState.
+  template<int NumStages>
+  using PipelineAsyncPipelineState = cutlass::PipelineState<NumStages>;
+  // Barrier storage for PipelineAsync: one full and one empty ClusterBarrier per stage.
+  template<int NumStages>
+  struct PipelineAsyncSharedStorage {
+    typedef cutlass::arch::ClusterBarrier FullBarrier;
+    typedef cutlass::arch::ClusterBarrier EmptyBarrier;
+    FullBarrier full_barrier_[NumStages];
+    EmptyBarrier empty_barrier_[NumStages];
+  };
+}  // namespace PipelineDetail
+
+template <int Stages_>  // Stages_: number of pipeline stages (one full + one empty barrier each)
+class PipelineAsync {
+public :
+  static constexpr uint32_t Stages = Stages_;
+  using SharedStorage = PipelineDetail::PipelineAsyncSharedStorage<Stages>;
+  using FullBarrier = typename SharedStorage::FullBarrier;
+  using EmptyBarrier = typename SharedStorage::EmptyBarrier;
+  using ProducerBarrierType = typename FullBarrier::ValueType;
+  using ConsumerBarrierType = typename EmptyBarrier::ValueType;
+  using PipelineState = PipelineDetail::PipelineAsyncPipelineState<Stages>;
+
+  enum class ThreadCategory {
+    NonParticipant,
+    Producer,
+    Consumer,
+    ProducerConsumer
+  };
+
+  struct Params {
+    ThreadCategory role = ThreadCategory::NonParticipant;  // stored, but not read by any member function of this class
+    uint32_t producer_arv_count = 1;  // arrival count each full barrier is initialized with
+    uint32_t consumer_arv_count = 1;  // arrival count each empty barrier is initialized with
+    uint32_t dst_blockid = cute::block_rank_in_cluster();  // passed to arrive() on the empty barrier in consumer_release
+  };
+
+  // When only the storage is passed, default Params are used:
+  // => single producer, single consumer & they are in the same block (within the Cluster)
+  CUTLASS_DEVICE
+  PipelineAsync(SharedStorage& storage)
+    : PipelineAsync(storage, {}) {}
+
+  CUTLASS_DEVICE
+  PipelineAsync(
+    SharedStorage& storage,
+    Params const& params) :
+      params_(params),
+      full_barrier_ptr_(&storage.full_barrier_[0]),
+      empty_barrier_ptr_(&storage.empty_barrier_[0]) {
+
+    int warp_idx = canonical_warp_idx_sync();
+    int lane_predicate = cute::elect_one_sync();
+
+    // Initialize the FULL and EMPTY barrier of every stage.
+    // Only the single elected lane of warp 0 performs the init.
+    if (warp_idx == 0 && lane_predicate == 1) {
+      for (int i = 0; i < Stages; ++i) {
+        full_barrier_ptr_[i].init(params.producer_arv_count);
+        empty_barrier_ptr_[i].init(params.consumer_arv_count);
+      }
+    }
+    cutlass::arch::fence_barrier_init();  // executed by every constructing thread, not only the initializing one
+  }
+
+  ////////////////////
+  // Producer APIs
+  ////////////////////
+  // Four member functions are always used in pairs:
+  //
+  // * producer_try_acquire and producer_acquire, and
+  // * consumer_try_wait and consumer_wait.
+  //
+  // The two functions with "try" in their names are called "try" functions,
+  // and the other two are conceptually "finalize" functions.
+  // The "try" function in each pair starts the process of waiting on the barrier to flip.
+  // It opportunistically waits for an implementation-dependent timeout.
+  // Whether or not the barrier has flipped yet, the try function will return a token.
+  // If the token indicates that the barrier has not flipped,
+  // then the token must be passed into the corresponding "finalize" function.
+  // The finalize function will then block until the barrier has flipped.
+  // If the token indicates that the barrier _has_ flipped,
+  // then it is still correct to pass it into the finalize function.
+  // The finalize function will return immediately in that case.
+  CUTLASS_DEVICE
+  ProducerToken producer_try_acquire(PipelineState state, uint32_t skip_wait = false) {
+    return producer_try_acquire(state.index(), state.phase(), skip_wait);
+  }
+
+  CUTLASS_DEVICE
+  void producer_acquire(PipelineState state, ProducerToken barrier_token = {BarrierStatus::WaitAgain}) {
+    producer_acquire(state.index(), state.phase(), barrier_token);
+  }
+
+  CUTLASS_DEVICE
+  void producer_commit(PipelineState state) {
+    producer_commit(state.index());
+  }
+
+  template<class UserDefinedArriveOp>
+  CUTLASS_DEVICE
+  void producer_commit(PipelineState state, UserDefinedArriveOp&& user_defined_arrive_op) {
+    cute::forward<UserDefinedArriveOp>(user_defined_arrive_op)(producer_get_barrier(state.index()));
+    producer_commit(state);  // the regular arrive on the full barrier follows the user-supplied op
+  }
+
+  // Acquires each of the Stages stages once more, which prevents early exit of
+  // producer blocks in the Cluster. This should be called once before the kernel exits.
+  CUTLASS_DEVICE
+  void producer_tail(PipelineState state) {
+    for (int count = 0; count < Stages; ++count) {
+      producer_acquire(state);  // default token is WaitAgain, so this waits on the empty barrier
+      ++state;
+    }
+  }
+
+  CUTLASS_DEVICE
+  ProducerBarrierType* producer_get_barrier(PipelineState state) {
+    return producer_get_barrier(state.index());
+  }
+
+  ////////////////////
+  // Consumer APIs
+  ////////////////////
+  CUTLASS_DEVICE
+  ConsumerToken consumer_try_wait(PipelineState state, uint32_t skip_wait = false) {
+    return consumer_try_wait(state.index(), state.phase(), skip_wait);
+  }
+
+  CUTLASS_DEVICE
+  ConsumerToken consumer_test_wait(PipelineState state, uint32_t skip_wait = false) {
+    return consumer_test_wait(state.index(), state.phase(), skip_wait);
+  }
+
+  CUTLASS_DEVICE
+  void consumer_wait(PipelineState state, ConsumerToken barrier_token = {BarrierStatus::WaitAgain}) {
+    consumer_wait(state.index(), state.phase(), barrier_token);
+  }
+
+  CUTLASS_DEVICE
+  void consumer_release(PipelineState state) {
+    consumer_release(state.index());
+  }
+
+  CUTLASS_DEVICE
+  ProducerBarrierType* producer_get_barrier(uint32_t stage) {
+    return reinterpret_cast<ProducerBarrierType*>(&full_barrier_ptr_[stage]);
+  }
+
+private:
+  Params params_;
+  FullBarrier *full_barrier_ptr_;
+  EmptyBarrier *empty_barrier_ptr_;
+
+  CUTLASS_DEVICE
+  ProducerToken producer_try_acquire(uint32_t stage, uint32_t phase, uint32_t skip_wait) {
+    if (skip_wait) {
+      return {BarrierStatus::WaitDone};
+    }
+    bool barrier_status = empty_barrier_ptr_[stage].try_wait(phase);
+    return {static_cast<BarrierStatus>(barrier_status)};  // the bool result is cast directly to BarrierStatus
+  }
+
+  CUTLASS_DEVICE
+  void producer_acquire(uint32_t stage, uint32_t phase, ProducerToken barrier_token) {
+    if (barrier_token == BarrierStatus::WaitAgain) {
+      empty_barrier_ptr_[stage].wait(phase);
+    }
+  }
+
+  CUTLASS_DEVICE
+  void producer_commit(uint32_t stage) {
+    full_barrier_ptr_[stage].arrive();
+  }
+
+  CUTLASS_DEVICE
+  ConsumerToken consumer_try_wait(uint32_t stage, uint32_t phase, uint32_t skip_wait) {
+    if (skip_wait) {
+      return {BarrierStatus::WaitDone};
+    }
+    bool barrier_status = full_barrier_ptr_[stage].try_wait(phase);
+    return {static_cast<BarrierStatus>(barrier_status)};
+  }
+
+  CUTLASS_DEVICE
+  ConsumerToken consumer_test_wait(uint32_t stage, uint32_t phase, uint32_t skip_wait) {
+    if (skip_wait) {
+      return {BarrierStatus::WaitDone};
+    }
+    bool barrier_status = full_barrier_ptr_[stage].test_wait(phase);
+    return {static_cast<BarrierStatus>(barrier_status)};
+  }
+
+  CUTLASS_DEVICE
+  void consumer_wait(uint32_t stage, uint32_t phase) {  // token-less variant: test_wait() first, wait() only if it returned false
+    bool done = full_barrier_ptr_[stage].test_wait(phase);
+    if (!done) {
+      full_barrier_ptr_[stage].wait(phase);
+    }
+  }
+
+  CUTLASS_DEVICE
+  void consumer_wait(uint32_t stage, uint32_t phase, ConsumerToken barrier_token) {
+    if (barrier_token == BarrierStatus::WaitAgain) {
+      full_barrier_ptr_[stage].wait(phase);
+    }
+  }
+
+  CUTLASS_DEVICE
+  void consumer_release(uint32_t stage) {
+    empty_barrier_ptr_[stage].arrive(params_.dst_blockid);  // signals the stage's empty barrier, passing params_.dst_blockid
+  }
+};
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Barrier to ensure an Ordered Sequence between
+// SequenceLength number of groups (each with group_size participants) executing SequenceDepth Stages
+// i.e., for all i < j - only after id "i" arrives at a particular stage "m"
+// will the wait() for id "j" succeed for the same stage
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace PipelineDetail {
+// Barrier storage for OrderedSequenceBarrier: a SequenceDepth x SequenceLength grid of barriers.
+template<int SequenceDepth, int SequenceLength>
+struct OrderedSequenceBarrierSharedStorage {
+  using Barrier = cutlass::arch::ClusterBarrier;
+  Barrier barrier_[SequenceDepth][SequenceLength];
+};
+
+} // namespace PipelineDetail
+
+template<int SequenceDepth_, int SequenceLength_>
+class OrderedSequenceBarrier {
+public:
+  static constexpr int SequenceDepth = SequenceDepth_;
+  static constexpr int SequenceLength = SequenceLength_;
+  using SharedStorage =
+    PipelineDetail::OrderedSequenceBarrierSharedStorage<SequenceDepth, SequenceLength>;
+  using Barrier = typename SharedStorage::Barrier;
+
+  struct Params {
+    uint32_t group_id;    // selects the barrier this group waits on; it signals (group_id + 1) % SequenceLength
+    uint32_t group_size;  // arrival count every barrier is initialized with
+  };
+
+private:
+  // The Params object could later be swapped for a CG object without much effort
+  Params params_;
+  Barrier *barrier_ptr_;
+  PipelineState<SequenceDepth> stage_;
+
+  static constexpr int Depth = SequenceDepth;
+  static constexpr int Length = SequenceLength;
+
+public:
+  OrderedSequenceBarrier() = delete;
+  OrderedSequenceBarrier(const OrderedSequenceBarrier&) = delete;
+  OrderedSequenceBarrier(OrderedSequenceBarrier&&) = delete;
+  OrderedSequenceBarrier& operator=(const OrderedSequenceBarrier&) = delete;
+  OrderedSequenceBarrier& operator=(OrderedSequenceBarrier&&) = delete;
+  ~OrderedSequenceBarrier() = default;
+
+  CUTLASS_DEVICE
+  OrderedSequenceBarrier(SharedStorage& storage, Params const& params) :
+      params_(params),
+      barrier_ptr_(&storage.barrier_[0][0]),
+      // Group 0 starts with the opposite phase from every other group
+      stage_({0, params.group_id == 0, 0}) {
+    int const warp_id = canonical_warp_idx_sync();
+    int const is_elected = cute::elect_one_sync();
+    bool const does_init = (warp_id == 0) && is_elected;
+
+    // Every barrier of the Depth x Length grid is initialized with group_size.
+    // Only the one elected thread of warp 0 performs the initialization.
+    if (does_init) {
+      // barrier_[d][l] sits at flat offset d * Length + l, so one pass covers the grid
+      for (int flat = 0; flat < Depth * Length; ++flat) {
+        barrier_ptr_[flat].init(params.group_size);
+      }
+    }
+    cutlass::arch::fence_barrier_init();
+  }
+
+  CUTLASS_DEVICE
+  void wait() {  // waits on this group's own barrier for the current stage
+    Barrier& own_barrier = get_barrier_for_current_stage(params_.group_id);
+    own_barrier.wait(stage_.phase());
+  }
+
+  // Signal the successor group, (group_id + 1) modulo Length, on the current
+  // stage, then move this group on to the next stage.
+  CUTLASS_DEVICE
+  void arrive() {
+    int const successor_id = (params_.group_id + 1) % Length;
+    get_barrier_for_current_stage(successor_id).arrive();
+    advance();
+  }
+
+  CUTLASS_DEVICE
+  void advance() {  // steps to the next stage without signalling any barrier
+    ++stage_;
+  }
+
+private:
+
+  CUTLASS_DEVICE
+  Barrier& get_barrier_for_current_stage(int group_id) {
+    return *(barrier_ptr_ + (stage_.index() * Length + group_id));
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Synchronization call: blocks until the barriers are initialized in shared
+// memory. A cluster of more than one block uses cute::cluster_wait();
+// otherwise a plain __syncthreads() is issued.
+CUTLASS_DEVICE
+void pipeline_init_wait(int cluster_size) {
+  if (cluster_size <= 1) {
+    __syncthreads();
+    return;
+  }
+  cute::cluster_wait();
+}
+
+// Makes the Pipeline init visible to all producer and consumer threadblocks
+// in the cluster. With cluster_size <= 1 no cluster-level arrive is issued;
+// the block synchronizes with __syncthreads() instead.
+CUTLASS_DEVICE
+void pipeline_init_arrive_relaxed(int cluster_size) {
+  bool const multi_block_cluster = cluster_size > 1;
+  if (!multi_block_cluster) {
+    __syncthreads();
+    return;
+  }
+  cute::cluster_arrive_relaxed();
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // end namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/pitch_linear_coord.h b/lightllm-kernel/cutlass/include/cutlass/pitch_linear_coord.h
new file mode 100755
index 000000000..475229a25
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/pitch_linear_coord.h
@@ -0,0 +1,181 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines the compile-time shape (PitchLinearShape) and the coordinate type (PitchLinearCoord) used to address pitch-linear memory.
+*/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/coord.h"
+
+namespace cutlass {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Compile-time shape used by pitch-linear operators.
+///
+/// kContiguous and kStrided mirror the two template arguments;
+/// kCount is their product.
+template <int Contiguous, int Strided>
+struct PitchLinearShape {
+  static const int kContiguous = Contiguous;
+  static const int kStrided = Strided;
+  static const int kCount = kContiguous * kStrided;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Rank-2 coordinate in pitch-linear space, stored as (contiguous, strided)
+struct PitchLinearCoord : public Coord<2, int> {
+public:
+
+  /// Type of each coordinate element
+  using Index = int;
+
+  /// Underlying rank-2 Coord this type derives from
+  using Base = Coord<2, Index>;
+
+  /// Wide index type taken from the base Coord
+  using LongIndex = typename Base::LongIndex;
+
+private:
+
+  /// Position of the contiguous dimension within the underlying Coord
+  static int const kContiguous = 0;
+
+  /// Position of the strided dimension within the underlying Coord
+  static int const kStrided = 1;
+
+public:
+
+  //
+  // Construction and element access
+  //
+
+  /// Default constructor; the base Coord is default-constructed
+  CUTLASS_HOST_DEVICE
+  PitchLinearCoord() { }
+
+  /// Converts from a plain rank-2 Coord
+  CUTLASS_HOST_DEVICE
+  PitchLinearCoord(Base const &coord): Base(coord) { }
+
+  /// Builds the coordinate from its contiguous and strided components
+  CUTLASS_HOST_DEVICE
+  PitchLinearCoord(Index contiguous_, Index strided_): Base(make_Coord(contiguous_, strided_)) { }
+
+  /// Same as above for LongIndex inputs; each component is converted to Index
+  CUTLASS_HOST_DEVICE
+  PitchLinearCoord(LongIndex contiguous_, LongIndex strided_)
+    : Base(make_Coord(static_cast<Index>(contiguous_), static_cast<Index>(strided_))) { }
+
+  /// Read-only access to the contiguous component
+  CUTLASS_HOST_DEVICE
+  Index const & contiguous() const { return at(kContiguous); }
+
+  /// Mutable access to the contiguous component
+  CUTLASS_HOST_DEVICE
+  Index & contiguous() { return at(kContiguous); }
+
+  /// Read-only access to the strided component
+  CUTLASS_HOST_DEVICE
+  Index const & strided() const { return at(kStrided); }
+
+  /// Mutable access to the strided component
+  CUTLASS_HOST_DEVICE
+  Index & strided() { return at(kStrided); }
+
+  //
+  // Arithmetic, forwarded to the base Coord operators
+  //
+
+  /// Sum with b, computed by the base Coord operator+
+  CUTLASS_HOST_DEVICE
+  PitchLinearCoord operator+(Base const& b) const {
+    return PitchLinearCoord(this->Base::operator+(b));
+  }
+
+  /// Difference with b, computed by the base Coord operator-
+  CUTLASS_HOST_DEVICE
+  PitchLinearCoord operator-(Base const& b) const {
+    return PitchLinearCoord(this->Base::operator-(b));
+  }
+
+  CUTLASS_HOST_DEVICE
+  PitchLinearCoord operator-() const {  // unary minus: negates both components
+    return PitchLinearCoord(-contiguous(), -strided());
+  }
+
+  /// Product with b, computed by the base Coord operator*
+  CUTLASS_HOST_DEVICE
+  PitchLinearCoord operator*(Base const& b) const {
+    return PitchLinearCoord(this->Base::operator*(b));
+  }
+
+  /// Quotient with b, computed by the base Coord operator/
+  CUTLASS_HOST_DEVICE
+  PitchLinearCoord operator/(Base const& b) const {
+    return PitchLinearCoord(this->Base::operator/(b));
+  }
+
+  /// Adds b in place via the base Coord operator+= and returns *this
+  CUTLASS_HOST_DEVICE
+  PitchLinearCoord& operator+=(Base const& b) {
+    this->Base::operator+=(b);
+    return *this;
+  }
+
+  /// Subtracts b in place via the base Coord operator-= and returns *this
+  CUTLASS_HOST_DEVICE
+  PitchLinearCoord& operator-=(Base const& b) {
+    this->Base::operator-=(b);
+    return *this;
+  }
+
+  /// Multiplies by b in place via the base Coord operator*= and returns *this
+  CUTLASS_HOST_DEVICE
+  PitchLinearCoord& operator*=(Base const& b) {
+    this->Base::operator*=(b);
+    return *this;
+  }
+
+  /// Divides by b in place via the base Coord operator/= and returns *this
+  CUTLASS_HOST_DEVICE
+  PitchLinearCoord& operator/=(Base const& b) {
+    this->Base::operator/=(b);
+    return *this;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/platform/platform.h b/lightllm-kernel/cutlass/include/cutlass/platform/platform.h
new file mode 100755
index 000000000..ba1f74011
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/platform/platform.h
@@ -0,0 +1,913 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+/**
+ * \file
+ * \brief C++ features that may be otherwise unimplemented for CUDA device functions.
+ *
+ * This file has three components:
+ *
+ *   (1) Macros:
+ *       - Empty macro defines for C++ keywords not supported by the current
+ *         version of C++. These simply allow compilation to proceed (but do
+ *         not provide the added semantics).
+ *           - \p noexcept
+ *           - \p constexpr
+ *           - \p nullptr
+ *           - \p static_assert
+ *
+ *       - Macro functions that we need in constant expressions because the
+ *         C++ equivalents require constexpr compiler support.  These are
+ *         prefixed with \p __NV_STD_*
+ *           - \p __NV_STD_MAX
+ *           - \p __NV_STD_MIN
+ *
+ *   (2) Re-implementations of STL functions and types:
+ *       - C++ features that need the \p __device__ annotation.  These are
+ *         placed into the \p platform namespace.
+ *           - \p abs
+ *           - \p plus
+ *           - \p less
+ *           - \p greater
+ *           - \p min
+ *           - \p max
+ *           - \p methods on std::pair (==, !=, <, <=, >, >=, and make_pair())
+ *
+ *   (3) Stop-gap implementations of unsupported STL functions and types:
+ *       - STL functions and types defined by C++ 11/14/17/etc. that are not
+ *         provided by the current version of C++. These are placed into the
+ *         \p platform namespace
+ *           - \p integral_constant
+ *           - \p nullptr_t
+ *           - \p true_type
+ *           - \p false_type
+ *           - \p bool_constant
+ *           - \p enable_if
+ *           - \p conditional
+ *           - \p is_same
+ *           - \p is_base_of
+ *           - \p remove_const
+ *           - \p remove_volatile
+ *           - \p remove_cv
+ *           - \p is_volatile
+ *           - \p is_pointer
+ *           - \p is_void
+ *           - \p is_integral
+ *           - \p is_floating_point
+ *           - \p is_arithmetic
+ *           - \p is_fundamental
+ *           - \p is_trivially_copyable
+ *           - \p alignment_of
+ *           - \p aligned_storage
+ *
+ * The idea is that, as we drop support for older compilers, we can simply #define
+ * the \p __NV_STD_XYZ macros and \p platform namespace to alias their C++
+ * counterparts (or trivially find-and-replace their occurrences in code text).
+ */
+
+//-----------------------------------------------------------------------------
+// Dependencies
+//-----------------------------------------------------------------------------
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/type_traits>
+#include <cuda/std/utility>
+#include <cuda/std/cstddef>
+#include <cuda/std/cstdint>
+#include <cuda/std/limits>
+#else
+#include <stdint.h>
+#endif
+
+#if !defined(__CUDACC_RTC__)
+//-----------------------------------------------------------------------------
+// Include STL files that platform provides functionality for
+//-----------------------------------------------------------------------------
+
+#include <algorithm>   // Minimum/maximum operations
+#include <cstddef>     // nullptr_t
+#include <functional>  // Arithmetic operations
+#include <utility>     // For methods on std::pair
+#include <limits>      // float_round_style, float_denorm_style
+#if (!defined(_MSC_VER) && (__cplusplus >= 201103L)) || (defined(_MSC_VER) && (_MS_VER >= 1500))
+#include <type_traits>  // For integral constants, conditional metaprogramming, and type traits
+#endif
+
+#include <cutlass/cutlass.h>
+
+#endif
+
+//-----------------------------------------------------------------------------
+// OS
+//-----------------------------------------------------------------------------
+#if defined(WIN32) || defined(_WIN32) || (defined(__WIN32) && !defined(__CYGWIN__))
+#define CUTLASS_OS_WINDOWS
+#endif  // NOTE(review): the __CYGWIN__ exclusion only guards __WIN32 (&& binds tighter than ||) - confirm intent
+
+/******************************************************************************
+ * Macros
+ ******************************************************************************/
+/// Namespace that supplies the standard-library names: cuda::std under __CUDACC_RTC__, std otherwise
+#ifndef CUTLASS_STL_NAMESPACE
+#  ifdef __CUDACC_RTC__
+#    define CUTLASS_STL_NAMESPACE cuda::std
+#  else
+#    define CUTLASS_STL_NAMESPACE std
+#  endif
+#endif
+
+/// Expands to __builtin_unreachable() when __GNUC__ is defined, and to nothing otherwise
+#ifndef CUTLASS_GCC_UNREACHABLE
+#  ifdef __GNUC__
+#    define CUTLASS_GCC_UNREACHABLE __builtin_unreachable()
+#  else
+#    define CUTLASS_GCC_UNREACHABLE
+#  endif
+#endif
+
+//-----------------------------------------------------------------------------
+// Keywords
+//-----------------------------------------------------------------------------
+
+/// noexcept, constexpr: defined to nothing for pre-C++11 (non-MSVC) compilers or _MSC_VER < 1900
+#if (!defined(_MSC_VER) && (__cplusplus < 201103L)) || (defined(_MSC_VER) && (_MSC_VER < 1900))
+#ifndef noexcept
+#define noexcept
+#endif
+#ifndef constexpr
+#define constexpr
+#endif
+#endif
+
+/// nullptr: falls back to the literal 0 for pre-C++11 (non-MSVC) compilers or _MSC_VER < 1310
+#if (!defined(_MSC_VER) && (__cplusplus < 201103L)) || (defined(_MSC_VER) && (_MSC_VER < 1310))
+#ifndef nullptr
+#define nullptr 0
+#endif
+#endif
+
+/// static_assert: emulated with a typedef of a negative-size array; the message argument is ignored
+#if (!defined(_MSC_VER) && (__cplusplus < 201103L)) || (defined(_MSC_VER) && (_MSC_VER < 1600))
+#ifndef static_assert
+#define __platform_cat_(a, b) a##b
+#define __platform_cat(a, b) __platform_cat_(a, b)
+#define static_assert(__e, __m) typedef int __platform_cat(AsSeRt, __LINE__)[(__e) ? 1 : -1]
+#endif
+#endif
+
+//-----------------------------------------------------------------------------
+// Functions
+//-----------------------------------------------------------------------------
+
+/// Larger of the two arguments (the first on ties); each argument is expanded twice
+#if !defined(__NV_STD_MAX)
+#define __NV_STD_MAX(lhs, rhs) (((rhs) > (lhs)) ? (rhs) : (lhs))
+#endif
+
+/// Smaller of the two arguments (the first on ties); each argument is expanded twice
+#if !defined(__NV_STD_MIN)
+#define __NV_STD_MIN(lhs, rhs) (((rhs) < (lhs)) ? (rhs) : (lhs))
+#endif
+
+/******************************************************************************
+ * Re-implementations
+ ******************************************************************************/
+namespace cutlass {
+namespace platform {
+
+//-----------------------------------------------------------------------------
+// Abs operations <cstdlib>
+//-----------------------------------------------------------------------------
+
+#ifdef __CUDACC_RTC__
+/// Stand-ins for std::abs (int and long long overloads) when __CUDACC_RTC__ is defined
+CUTLASS_HOST_DEVICE constexpr int abs(int a) {
+    return (a >= 0) ? a : -a;
+}
+CUTLASS_HOST_DEVICE constexpr long long abs(long long a) {
+    return (a >= 0) ? a : -a;
+}
+#else
+using std::abs;  // all other builds re-export the standard overload set
+#endif
+
+//-----------------------------------------------------------------------------
+// Minimum/maximum operations <algorithm>
+//-----------------------------------------------------------------------------
+
+/// Counterpart of std::min: yields b only when b < a, so a is returned on ties
+template <class Value>
+CUTLASS_HOST_DEVICE constexpr Value const& min(Value const& a, Value const& b) {
+  return (b < a) ? b : a;
+}
+
+/// Counterpart of std::max: yields b only when a < b, so a is returned on ties
+template <class Value>
+CUTLASS_HOST_DEVICE constexpr Value const& max(Value const& a, Value const& b) {
+  return (a < b) ? b : a;
+}
+
+#if !defined(__CUDACC_RTC__)
+//-----------------------------------------------------------------------------
+// Methods on std::pair
+//-----------------------------------------------------------------------------
+
+using std::pair;
+
+template <class First, class Second>  // pair equality: both members must compare equal
+CUTLASS_HOST_DEVICE constexpr bool operator==(pair<First, Second> const& lhs, pair<First, Second> const& rhs) {
+  return (lhs.first == rhs.first) && (lhs.second == rhs.second);
+}
+
+template <class T1, class T2>  // pair inequality: true as soon as either member differs
+CUTLASS_HOST_DEVICE constexpr bool operator!=(const pair<T1, T2>& lhs, const pair<T1, T2>& rhs) {
+  return (lhs.first != rhs.first) || (lhs.second != rhs.second);
+}
+
+template <class First, class Second>  // lexicographic order: first decides, second breaks ties
+CUTLASS_HOST_DEVICE constexpr bool operator<(pair<First, Second> const& lhs, pair<First, Second> const& rhs) {
+  return (lhs.first < rhs.first) ||
+         (!(rhs.first < lhs.first) && (lhs.second < rhs.second));
+}
+
+template <class First, class Second>  // less-or-equal: true unless rhs < lhs
+CUTLASS_HOST_DEVICE constexpr bool operator<=(pair<First, Second> const& lhs, pair<First, Second> const& rhs) {
+  return !(rhs < lhs);
+}
+
+template <class First, class Second>  // greater-than: operator< with the operands swapped
+CUTLASS_HOST_DEVICE constexpr bool operator>(pair<First, Second> const& lhs, pair<First, Second> const& rhs) {
+  return rhs < lhs;
+}
+
+template <class First, class Second>  // greater-or-equal: true unless lhs < rhs
+CUTLASS_HOST_DEVICE constexpr bool operator>=(pair<First, Second> const& lhs, pair<First, Second> const& rhs) {
+  return !(lhs < rhs);
+}
+
+template <class First, class Second>  // builds the pair by default-constructing it, then assigning both members
+CUTLASS_HOST_DEVICE std::pair<First, Second> make_pair(First t, Second u) {
+  std::pair<First, Second> result;
+  result.first = t;
+  result.second = u;
+  return result;
+}
+#endif
+
+}  // namespace platform
+
+/******************************************************************************
+ * Implementations of C++ 11/14/17/... STL features
+ ******************************************************************************/
+
+namespace platform {
+
+//-----------------------------------------------------------------------------
+// Integral constant helper types <type_traits>
+//-----------------------------------------------------------------------------
+
+#if defined(__CUDACC_RTC__) || (!defined(_MSC_VER) && (__cplusplus < 201103L)) || (defined(_MSC_VER) && (_MSC_VER < 1500))
+
+/// Local replacement for std::integral_constant (forward declaration)
+template <typename value_t, value_t V>
+struct integral_constant;
+
+/// Wraps the compile-time constant V of type value_t
+template <typename value_t, value_t V>
+struct integral_constant {
+  static const value_t value = V;
+
+  using value_type = value_t;
+  using type = integral_constant<value_t, V>;
+
+  CUTLASS_HOST_DEVICE operator value_type() const { return value; }
+
+  CUTLASS_HOST_DEVICE const value_type operator()() const { return value; }
+};
+
+#else
+
+using std::integral_constant;
+using std::pair;
+
+#endif
+
+using CUTLASS_STL_NAMESPACE::bool_constant;
+using CUTLASS_STL_NAMESPACE::true_type;
+using CUTLASS_STL_NAMESPACE::false_type;
+
+#if defined(__CUDACC_RTC__) || (!defined(_MSC_VER) && (__cplusplus < 201103L)) || (defined(_MSC_VER) && (_MSC_VER < 1700))
+
+/// Placeholder for std::nullptr_t: an empty struct, not the type of the nullptr literal
+struct nullptr_t {};
+
+#else
+
+using std::nullptr_t;
+
+#endif
+
+//-----------------------------------------------------------------------------
+// Conditional metaprogramming <type_traits>
+//-----------------------------------------------------------------------------
+
+using CUTLASS_STL_NAMESPACE::conditional;
+using CUTLASS_STL_NAMESPACE::conditional_t;
+using CUTLASS_STL_NAMESPACE::enable_if;
+using CUTLASS_STL_NAMESPACE::enable_if_t;
+using CUTLASS_STL_NAMESPACE::void_t;
+
+//-----------------------------------------------------------------------------
+// Const/volatility specifiers <type_traits>
+//-----------------------------------------------------------------------------
+
+using CUTLASS_STL_NAMESPACE::remove_const;
+using CUTLASS_STL_NAMESPACE::remove_const_t;
+using CUTLASS_STL_NAMESPACE::remove_cv;
+using CUTLASS_STL_NAMESPACE::remove_cv_t;
+using CUTLASS_STL_NAMESPACE::remove_reference;
+using CUTLASS_STL_NAMESPACE::remove_reference_t;
+using CUTLASS_STL_NAMESPACE::remove_volatile;
+using CUTLASS_STL_NAMESPACE::remove_volatile_t;
+
+// remove_cvref / remove_cvref_t arrived with C++20; CUTLASS back-ports them
+// whenever the library feature-test macro is not defined.
+#ifdef __cpp_lib_remove_cvref
+
+using CUTLASS_STL_NAMESPACE::remove_cvref;
+using CUTLASS_STL_NAMESPACE::remove_cvref_t;
+
+#else
+
+template <class T>
+struct remove_cvref {
+  using type = typename remove_cv<typename remove_reference<T>::type>::type;
+};
+
+template <class T>
+using remove_cvref_t = typename remove_cvref<T>::type;
+
+#endif
+
+//-----------------------------------------------------------------------------
+// Type relationships <type_traits>
+//-----------------------------------------------------------------------------
+
+using CUTLASS_STL_NAMESPACE::is_same;  
+using CUTLASS_STL_NAMESPACE::is_same_v;
+
+#if defined(__CUDACC_RTC__) || (!defined(_MSC_VER) && (__cplusplus < 201103L)) || (defined(_MSC_VER) && (_MSC_VER < 1500))
+
+/// Helper for std::is_base_of: value is decided by which check() overload is selected for a dummy probe
+template <typename BaseT, typename DerivedT>
+struct is_base_of_helper {
+  typedef char (&yes)[1];  // result type of the DerivedT* overload
+  typedef char (&no)[2];   // result type of the BaseT* overload
+
+  template <typename B, typename D>
+  struct dummy {
+    CUTLASS_HOST_DEVICE operator B*() const;
+    CUTLASS_HOST_DEVICE operator D*();
+  };
+
+  template <typename T>
+  CUTLASS_HOST_DEVICE static yes check(DerivedT*, T);
+
+  CUTLASS_HOST_DEVICE static no check(BaseT*, int);
+
+  static const bool value = sizeof(check(dummy<BaseT, DerivedT>(), int())) == sizeof(yes);
+};
+
+/// std::is_base_of: true when the helper reports a match or both types are the same after removing cv-qualifiers
+template <typename BaseT, typename DerivedT>
+struct is_base_of
+    : integral_constant<bool,
+                        (is_base_of_helper<typename remove_cv<BaseT>::type,
+                                           typename remove_cv<DerivedT>::type>::value) ||
+                            (is_same<typename remove_cv<BaseT>::type,
+                                     typename remove_cv<DerivedT>::type>::value)> {};
+
+#else
+
+using std::is_base_of;
+
+#endif
+
+//-----------------------------------------------------------------------------
+// Type properties <type_traits>
+//-----------------------------------------------------------------------------
+
+using CUTLASS_STL_NAMESPACE::is_arithmetic;
+using CUTLASS_STL_NAMESPACE::is_arithmetic_v;
+using CUTLASS_STL_NAMESPACE::is_void;
+using CUTLASS_STL_NAMESPACE::is_void_v;
+
+#if defined(__CUDACC_RTC__) || (!defined(_MSC_VER) && (__cplusplus < 201103L)) || (defined(_MSC_VER) && (_MSC_VER < 1500))
+
+/// Replacement for std::is_volatile: false unless the partial specialization for volatile types matches
+template <class U>
+struct is_volatile : false_type {};
+template <class U>
+struct is_volatile<volatile U> : true_type {};
+
+/// Helper for std::is_pointer (false specialization)
+template <typename T>
+struct is_pointer_helper : false_type {};
+
+/// Helper for std::is_pointer (true specialization)
+template <typename T>
+struct is_pointer_helper<T*> : true_type {};
+
+/// std::is_pointer
+template <typename T>
+struct is_pointer : is_pointer_helper<typename remove_cv<T>::type> {};
+
+/// std::is_integral
+template <typename T>
+struct is_integral : false_type {};
+template <>
+struct is_integral<char> : true_type {};
+template <>
+struct is_integral<signed char> : true_type {};
+template <>
+struct is_integral<unsigned char> : true_type {};
+template <>
+struct is_integral<short> : true_type {};
+template <>
+struct is_integral<unsigned short> : true_type {};
+template <>
+struct is_integral<int> : true_type {};
+template <>
+struct is_integral<unsigned int> : true_type {};
+template <>
+struct is_integral<long> : true_type {};
+template <>
+struct is_integral<unsigned long> : true_type {};
+template <>
+struct is_integral<long long> : true_type {};
+template <>
+struct is_integral<unsigned long long> : true_type {};
+template <typename T>
+struct is_integral<volatile T> : is_integral<T> {};
+template <typename T>
+struct is_integral<const T> : is_integral<T> {};
+template <typename T>
+struct is_integral<const volatile T> : is_integral<T> {};
+
+/// std::is_floating_point
+template <typename T>
+struct is_floating_point
+    : integral_constant<bool,
+                        (is_same<float, typename remove_cv<T>::type>::value ||
+                         is_same<double, typename remove_cv<T>::type>::value)> {};
+
+/// std::is_fundamental
+template <typename T>
+struct is_fundamental
+    : integral_constant<bool,
+                        (is_arithmetic<T>::value || is_void<T>::value ||
+                         is_same<nullptr_t, typename remove_cv<T>::type>::value)> {};
+
+#else
+
+using std::is_volatile;
+using std::is_pointer;
+using std::is_integral;
+using std::is_floating_point;
+using std::is_fundamental;
+
+#endif
+
+#if defined(__CUDACC_RTC__) || (!defined(_MSC_VER) && (__cplusplus < 201103L)) || (defined(_MSC_VER) && (_MSC_VER < 1800)) || \
+    (defined(__GNUG__) && (__GNUC__ < 5))
+
+/**
+     * std::is_trivially_copyable
+     *
+     * This implementation only evaluates true if T is fundamental or pointer
+     *
+     * Without help from partial template specializations provided by the user for
+     * a specific class or struct, this trait will never report that the specified
+     * class or struct  is trivially-copyable ; this is always safe,
+     * if possibly sub-optimal.
+     */
+template <typename T>
+struct is_trivially_copyable
+    : integral_constant<bool, (is_fundamental<T>::value || is_pointer<T>::value)> {};
+
+#else
+
+using std::is_trivially_copyable;
+
+#endif
+
+#if (201703L <=__cplusplus)
+
+/// std::is_integral_v
+using CUTLASS_STL_NAMESPACE::is_integral_v;
+/// std::is_unsigned_v
+using CUTLASS_STL_NAMESPACE::is_unsigned_v;
+
+#endif
+
+//-----------------------------------------------------------------------------
+// <utility>
+//-----------------------------------------------------------------------------
+
+using CUTLASS_STL_NAMESPACE::declval;
+  
+//-----------------------------------------------------------------------------
+// bit_cast <bit>
+//-----------------------------------------------------------------------------
+
+template< class To, class From >
+constexpr To CUTLASS_HOST_DEVICE bit_cast(const From& from ) noexcept;
+
+template <class To, class From>
+constexpr To CUTLASS_HOST_DEVICE bit_cast(const From& src) noexcept
+{
+  static_assert(sizeof(To) == sizeof(From), "sizes must match");
+  return reinterpret_cast<To const &>(src);
+}
+
+//-----------------------------------------------------------------------------
+// Convertible
+//-----------------------------------------------------------------------------
+using CUTLASS_STL_NAMESPACE::is_convertible;
+using CUTLASS_STL_NAMESPACE::is_convertible_v;
+
+//-----------------------------------------------------------------------------
+// Alignment and layout utilities
+//-----------------------------------------------------------------------------
+
+#if defined(__CUDACC_RTC__) || (!defined(_MSC_VER) && (__cplusplus < 201103L)) || (defined(_MSC_VER) && (_MSC_VER < 1500))
+
+/// std::alignment_of
+template <typename value_t>
+struct alignment_of {
+  struct pad {
+    value_t val;
+    char byte;
+  };
+
+  enum { value = sizeof(pad) - sizeof(value_t) };
+};
+
+#else
+
+template <typename value_t>
+struct alignment_of : std::alignment_of<value_t> {};
+
+#endif
+
+/* 16B specializations where 32-bit Win32 host compiler disagrees with device compiler */
+template <>
+struct alignment_of<int4> {
+  enum { value = 16 };
+};
+template <>
+struct alignment_of<uint4> {
+  enum { value = 16 };
+};
+template <>
+struct alignment_of<float4> {
+  enum { value = 16 };
+};
+template <>
+struct alignment_of<long4> {
+  enum { value = 16 };
+};
+template <>
+struct alignment_of<ulong4> {
+  enum { value = 16 };
+};
+template <>
+struct alignment_of<longlong2> {
+  enum { value = 16 };
+};
+template <>
+struct alignment_of<ulonglong2> {
+  enum { value = 16 };
+};
+template <>
+struct alignment_of<double2> {
+  enum { value = 16 };
+};
+template <>
+struct alignment_of<longlong4> {
+  enum { value = 16 };
+};
+template <>
+struct alignment_of<ulonglong4> {
+  enum { value = 16 };
+};
+template <>
+struct alignment_of<double4> {
+  enum { value = 16 };
+};
+
+// Specializations for volatile/const qualified types
+template <typename value_t>
+struct alignment_of<volatile value_t> : alignment_of<value_t> {};
+template <typename value_t>
+struct alignment_of<const value_t> : alignment_of<value_t> {};
+template <typename value_t>
+struct alignment_of<const volatile value_t> : alignment_of<value_t> {};
+
+#if defined(__CUDACC_RTC__) || (!defined(_MSC_VER) && (__cplusplus < 201103L)) || (defined(_MSC_VER) && (_MSC_VER < 1800))
+
+template <size_t Align>
+struct aligned_chunk;
+template <>
+struct __align__(1) aligned_chunk<1> {
+  uint8_t buff;
+};
+template <>
+struct __align__(2) aligned_chunk<2> {
+  uint16_t buff;
+};
+template <>
+struct __align__(4) aligned_chunk<4> {
+  uint32_t buff;
+};
+template <>
+struct __align__(8) aligned_chunk<8> {
+  uint32_t buff[2];
+};
+template <>
+struct __align__(16) aligned_chunk<16> {
+  uint32_t buff[4];
+};
+template <>
+struct __align__(32) aligned_chunk<32> {
+  uint32_t buff[8];
+};
+template <>
+struct __align__(64) aligned_chunk<64> {
+  uint32_t buff[16];
+};
+template <>
+struct __align__(128) aligned_chunk<128> {
+  uint32_t buff[32];
+};
+template <>
+struct __align__(256) aligned_chunk<256> {
+  uint32_t buff[64];
+};
+template <>
+struct __align__(512) aligned_chunk<512> {
+  uint32_t buff[128];
+};
+template <>
+struct __align__(1024) aligned_chunk<1024> {
+  uint32_t buff[256];
+};
+template <>
+struct __align__(2048) aligned_chunk<2048> {
+  uint32_t buff[512];
+};
+template <>
+struct __align__(4096) aligned_chunk<4096> {
+  uint32_t buff[1024];
+};
+
+/// std::aligned_storage: `type` is an array of Align-aligned chunks, rounded up so it holds at least Len bytes
+template <size_t Len, size_t Align>
+struct aligned_storage {
+  typedef aligned_chunk<Align> type[(Len + sizeof(aligned_chunk<Align>) - 1) / sizeof(aligned_chunk<Align>)];
+};
+
+#else
+
+using std::aligned_storage;
+
+#endif
+
+#if !defined(__CUDACC_RTC__)
+/// Default deleter
+template <typename T>
+struct default_delete {
+  void operator()(T* ptr) const { delete ptr; }
+};
+
+/// Partial specialization for deleting array types
+template <typename T>
+struct default_delete<T[]> {
+  void operator()(T* ptr) const { delete[] ptr; }
+};
+
+/// std::unique_ptr (minimal subset): sole owner of a heap object, which is freed through Deleter.
+template <class T, class Deleter = default_delete<T> >
+class unique_ptr {
+ public:
+  typedef T* pointer;
+  typedef T element_type;
+  typedef Deleter deleter_type;
+
+ private:
+  /// Pointer to memory
+  pointer _ptr;
+  /// Deleter
+  deleter_type _deleter;
+
+ public:
+  unique_ptr() : _ptr(nullptr) {}
+  unique_ptr(pointer p) : _ptr(p) {}
+
+  /// Not copyable: two owners would each run the deleter on the same pointer.
+  unique_ptr(unique_ptr const&) = delete;
+  unique_ptr& operator=(unique_ptr const&) = delete;
+  /// Move construction takes over ownership and leaves `other` empty.
+  unique_ptr(unique_ptr&& other) noexcept : _ptr(other.release()), _deleter(other._deleter) {}
+  /// Move assignment deletes the currently owned object, then takes over ownership from `other`.
+  unique_ptr& operator=(unique_ptr&& other) noexcept {
+    if (this != &other) { reset(other.release()); _deleter = other._deleter; }
+    return *this;
+  }
+
+  /// Deletes the managed object, if any
+  ~unique_ptr() { if (_ptr) { _deleter(_ptr); } }
+
+  /// Returns a pointer to the managed object or nullptr if no object is owned.
+  pointer get() const noexcept { return _ptr; }
+
+  /// Releases ownership of the managed object, if any, without deleting it
+  pointer release() noexcept { pointer p(_ptr); _ptr = nullptr; return p; }
+
+  /// Replaces the managed object, deleting the old object.
+  void reset(pointer p = pointer()) noexcept {
+    pointer old_ptr = _ptr;
+    _ptr = p;  // store the new pointer before deleting, so a re-entrant deleter never sees the stale one
+    if (old_ptr != nullptr) { get_deleter()(old_ptr); }
+  }
+
+  /// Swaps the managed objects (and their deleters) with *this and another unique_ptr
+  void swap(unique_ptr& other) noexcept { std::swap(_ptr, other._ptr); std::swap(_deleter, other._deleter); }
+
+  /// Returns the deleter object
+  Deleter& get_deleter() noexcept { return _deleter; }
+  Deleter const& get_deleter() const noexcept { return _deleter; }
+
+  /// Checks whether an object is owned
+  operator bool() const noexcept { return _ptr != nullptr; }
+
+  /// Dereferences the unique_ptr
+  T& operator*() const { return *_ptr; }
+
+  /// Returns a pointer to the managed object
+  pointer operator->() const noexcept { return _ptr; }
+
+  /// Array access to managed object
+  T& operator[](size_t i) const { return _ptr[i]; }
+};
+
+/// Specializes the swap algorithm
+template <typename T, typename Deleter>
+void swap(unique_ptr<T, Deleter>& lhs, unique_ptr<T, Deleter>& rhs) noexcept {
+  lhs.swap(rhs);
+}
+#endif
+
+/// std::numeric_limits
+template <class T>
+struct numeric_limits;
+
+template <>
+struct numeric_limits<int32_t> {
+  CUTLASS_HOST_DEVICE
+  static constexpr int32_t lowest() noexcept { return -2147483647 - 1;}
+  CUTLASS_HOST_DEVICE
+  static constexpr int32_t max() noexcept { return 2147483647;}
+  static constexpr bool is_integer = true;
+  static constexpr bool has_infinity = false;
+};
+
+template <>
+struct numeric_limits<int16_t> {
+  CUTLASS_HOST_DEVICE
+  static constexpr int16_t lowest() noexcept { return -32768;}
+  CUTLASS_HOST_DEVICE
+  static constexpr int16_t max() noexcept { return 32767;}
+  static constexpr bool is_integer = true;
+  static constexpr bool has_infinity = false;
+};
+
+template <>
+struct numeric_limits<int8_t> {
+  CUTLASS_HOST_DEVICE
+  static constexpr int8_t lowest() noexcept { return -128;}
+  CUTLASS_HOST_DEVICE
+  static constexpr int8_t max() noexcept { return 127;}
+  static constexpr bool is_integer = true;
+  static constexpr bool has_infinity = false;
+};
+
+
+template <>
+struct numeric_limits<uint32_t> {
+  CUTLASS_HOST_DEVICE
+  static constexpr uint32_t lowest() noexcept { return 0;}
+  CUTLASS_HOST_DEVICE
+  static constexpr uint32_t max() noexcept { return 4294967295U;}
+  static constexpr bool is_integer = true;
+  static constexpr bool has_infinity = false;
+};
+
+template <>
+struct numeric_limits<uint16_t> {
+  CUTLASS_HOST_DEVICE
+  static constexpr uint16_t lowest() noexcept { return 0;}
+  CUTLASS_HOST_DEVICE
+  static constexpr uint16_t max() noexcept { return 65535U;}
+  static constexpr bool is_integer = true;
+  static constexpr bool has_infinity = false;
+};
+
+template <>
+struct numeric_limits<uint8_t> {
+  CUTLASS_HOST_DEVICE
+  static constexpr uint8_t lowest() noexcept { return 0;}
+  CUTLASS_HOST_DEVICE
+  static constexpr uint8_t max() noexcept { return 255U;}
+  static constexpr bool is_integer = true;
+  static constexpr bool has_infinity = false;
+};
+
+template <>
+struct numeric_limits<float> {
+  CUTLASS_HOST_DEVICE
+  static constexpr float infinity() noexcept { return bit_cast<float, int32_t>(0x7f800000);}
+  CUTLASS_HOST_DEVICE
+  static constexpr float max() noexcept { return bit_cast<float, int32_t>(0x7f7fffff);}
+  static constexpr bool is_integer = false;
+  static constexpr bool has_infinity = true;
+};
+
+/// Returns the identity element for a maximum reduction over T: -infinity when T has an
+/// infinity, otherwise numeric_limits<T>::lowest(). Callable from host and device code.
+template <typename T>
+CUTLASS_HOST_DEVICE constexpr T identity_for_maximum() {
+  if constexpr (numeric_limits<T>::has_infinity) {
+    return -numeric_limits<T>::infinity();
+  } else {
+    return numeric_limits<T>::lowest();
+  }
+}
+
+/// Returns the identity element for a minimum reduction over T: +infinity when T has an
+/// infinity, otherwise numeric_limits<T>::max(). Callable from host and device code.
+template <typename T>
+CUTLASS_HOST_DEVICE constexpr T identity_for_minimum() {
+  if constexpr (numeric_limits<T>::has_infinity) {
+    return numeric_limits<T>::infinity();
+  } else {
+    return numeric_limits<T>::max();
+  }
+}
+
+/// std::float_round_style
+using CUTLASS_STL_NAMESPACE::float_round_style;
+using CUTLASS_STL_NAMESPACE::round_indeterminate;
+using CUTLASS_STL_NAMESPACE::round_toward_zero;
+using CUTLASS_STL_NAMESPACE::round_to_nearest;
+using CUTLASS_STL_NAMESPACE::round_toward_infinity;
+using CUTLASS_STL_NAMESPACE::round_toward_neg_infinity;
+
+/// std::float_denorm_style
+using CUTLASS_STL_NAMESPACE::float_denorm_style;
+using CUTLASS_STL_NAMESPACE::denorm_indeterminate;
+using CUTLASS_STL_NAMESPACE::denorm_absent;
+using CUTLASS_STL_NAMESPACE::denorm_present;
+
+}  // namespace platform
+}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/predicate_vector.h b/lightllm-kernel/cutlass/include/cutlass/predicate_vector.h
new file mode 100755
index 000000000..aa4e3f1a1
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/predicate_vector.h
@@ -0,0 +1,547 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines container classes and iterators for managing a statically sized vector
+      of boolean predicates.
+*/
+#pragma once
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/cassert>
+#include <cuda/std/cstdint>
+#else
+#include <assert.h>
+#include <stdint.h>
+#endif
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/platform/platform.h"
+
+namespace cutlass {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/*!@defgroup predicate_vector_concept Predicate Vector Concept
+@{
+
+Implementations of \ref predicate_vector_concept contain an ordered set of boolean predicates which
+may be used as conditionals in other device-side operations. Both random access and iterators
+offering sequential access are provided.
+
+@par Predicate Vector
+   A \ref predicate_vector_concept satisfies the following expressions
+  - <b>at(int idx)</b> - returns the value of the indexed predicate
+  - <b>set(int idx, bool value)</b> - sets the value of the indexed predicate
+  - <b>begin()</b> - returns a \ref predicate_iterator_concept pointing to the first predicate
+
+@}
+*/
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/*!@defgroup predicate_iterator_concept Predicate Iterator Concept
+@{
+
+Implementations of \ref predicate_iterator_concept enables accessing and traversing elements of a
+bit vector.
+
+@par Const Predicate Iterator
+  A const \ref predicate_iterator_concept satisfies the following expressions
+ - <b>++it</b> increments the iterator to the next predicate
+ - <b>*it</b> returns the value of the currently pointed-to predicate
+
+@par Mutable Predicate Iterator
+ A \ref predicate_iterator_concept that is non-const <b>also</b> satisfies the following expressions
+ - <b>it.set(bool value)</b> sets the value of the currently pointed-to predicate
+
+@}
+*/
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/*!@defgroup predicate_tile_adapter Predicate Tile Adapter Concept
+@{
+
+Implementations of \ref predicate_tile_adapter provide a mapping between the elements of a \ref
+tile_traits_concept and a \ref predicate_vector_concept.
+
+@par Predicate Tile Adapter
+  A \ref predicate_tile_adapter satisfies the following expressions
+ - <b>at(int d, int h, int w, int c)</b> - returns the value of a predicate corresponding to the
+   access (d, h, w, c) within the tile.
+
+@}
+*/
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Statically sized array of bits implementing @concept{predicate_vector_concept}.
+template <
+    /// Number of predicates contained in predicate vector
+    int kPredicates_,
+    /// Number of predicates contained in each byte of internal storage
+    int kPredicatesPerByte_ = 4,
+    /// Location of first predicate within byte of internal storage
+    int kPredicateStart_ = 0>
+struct PredicateVector {
+  /// Number of bits stored by the PredicateVector
+  static constexpr int kPredicates = kPredicates_;
+
+  /// Number of bits stored within each byte of the predicate bit vector
+  static constexpr int kPredicatesPerByte = kPredicatesPerByte_;
+
+  /// First bit within each byte containing predicates
+  static constexpr int kPredicateStart = kPredicateStart_;
+
+  // Make sure no one tries to put more than 8 bits in a byte :)
+  static_assert(kPredicatesPerByte <= 8, "kPredicatesPerByte must fit within an actual byte");
+  // Make sure the "offsetted" bits fit in one byte.
+  static_assert(kPredicateStart + kPredicatesPerByte <= 8,
+                "The offsetted predicates must fit within an actual byte.");
+
+  /// Storage type of individual elements
+  typedef uint32_t Storage;
+
+  /// Number of bytes needed
+  static constexpr int kBytes = (kPredicates + kPredicatesPerByte - 1) / kPredicatesPerByte;
+
+  /// Number of storage elements needed
+  static constexpr int kWordCount = (kBytes + int(sizeof(Storage)) - 1) / int(sizeof(Storage));
+
+  /// The byte mask corresponding to predicates
+  static constexpr Storage kByteMask = (((1 << kPredicatesPerByte) - 1) << kPredicateStart);
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Words of bit vector
+  Storage storageData[kWordCount];
+
+  //
+  // Methods
+  //
+
+  /// Computes the word and bit corresponding to a logical predicate index
+  CUTLASS_HOST_DEVICE void computeStorageOffset(int &word, int &bit, int idx) const {
+    CUTLASS_ASSERT(idx < kPredicates);
+
+    int byte = (idx / kPredicatesPerByte);
+    int bit_offset = (idx % kPredicatesPerByte);
+
+    word = byte / sizeof(Storage);
+    int byte_offset = (byte % sizeof(Storage));
+
+    bit = byte_offset * 8 + bit_offset + kPredicateStart;
+  }
+
+  /// Returns the mask of predicate bits in a fully used storage word (kByteMask replicated into every byte).
+  CUTLASS_HOST_DEVICE static constexpr Storage computeWordMask() {
+    Storage mask(0);
+    CUTLASS_PRAGMA_UNROLL
+    for (size_t byte = 0; byte < sizeof(Storage); ++byte) {
+      mask |= (kByteMask << (byte * 8));
+    }
+    return mask;  // returned as Storage: a bool return type would collapse the mask to 0 or 1
+  }
+
+  /// Returns the mask of predicate bits in the last storage word, covering only the bytes that word actually uses.
+  CUTLASS_HOST_DEVICE static constexpr Storage computeLastWordMask() {
+    Storage mask(0);
+    CUTLASS_PRAGMA_UNROLL
+    for (int byte = 0; byte < kBytes - (kWordCount - 1) * int(sizeof(Storage)); ++byte) {
+      mask |= (kByteMask << (byte * 8));  // bound is 1..sizeof(Storage); a plain modulo would give 0 for a full word
+    }
+    return mask;
+  }
+
+  /// Accesses a given word with optional assertions
+  CUTLASS_HOST_DEVICE Storage &storage(int word) {
+    CUTLASS_ASSERT(word < kWordCount);
+    return storageData[word];
+  }
+
+  /// Accesses a given word with optional assertions
+  CUTLASS_HOST_DEVICE Storage const &storage(int word) const {
+    CUTLASS_ASSERT(word < kWordCount);
+    return storageData[word];
+  }
+
+ public:
+  //
+  // Iterator
+  //
+
+  /**
+  * @brief An iterator implementing \ref predicate_iterator_concept enabling sequential
+  * read and write access to predicates.
+  * @concept{predicate_iterator_concept}
+  */
+  class Iterator {
+    /// Reference to PredicateVector instance
+    PredicateVector &vec_;
+
+    /// Index into PredicateVector
+    int bit_;
+
+   public:
+    /// Copy constructor
+    CUTLASS_HOST_DEVICE
+    Iterator(Iterator const &it) : vec_(it.vec_), bit_(it.bit_) {}
+
+    /// Constructs an iterator from a PredicateVector
+    CUTLASS_HOST_DEVICE
+    Iterator(PredicateVector &vec, int _start = 0) : vec_(vec), bit_(_start) {}
+
+    /// Pre-increment
+    CUTLASS_HOST_DEVICE
+    Iterator &operator++() {
+      ++bit_;
+      return *this;
+    }
+
+    /// Increment
+    CUTLASS_HOST_DEVICE
+    Iterator &operator+=(int offset) {
+      bit_ += offset;
+      return *this;
+    }
+
+    /// Pre-decrement
+    CUTLASS_HOST_DEVICE
+    Iterator &operator--() {
+      --bit_;
+      return *this;
+    }
+
+    /// Decrement
+    CUTLASS_HOST_DEVICE
+    Iterator &operator-=(int offset) {
+      bit_ -= offset;
+      return *this;
+    }
+
+    /// Post-increment: advances this iterator and returns a copy of its previous position
+    CUTLASS_HOST_DEVICE
+    Iterator operator++(int) {
+      Iterator ret(*this);
+      ++bit_;
+      return ret;
+    }
+
+    /// Post-decrement: moves this iterator back and returns a copy of its previous position
+    CUTLASS_HOST_DEVICE
+    Iterator operator--(int) {
+      Iterator ret(*this);
+      --bit_;
+      return ret;
+    }
+
+    /// Iterator advances by some amount
+    CUTLASS_HOST_DEVICE
+    Iterator operator+(int offset) {
+      Iterator ret(*this);
+      ret.bit_ += offset;
+      return ret;
+    }
+
+    /// Iterator recedes by some amount; returns a new Iterator and leaves this one unchanged
+    CUTLASS_HOST_DEVICE
+    Iterator operator-(int offset) {
+      Iterator ret(*this);
+      ret.bit_ -= offset;
+      return ret;
+    }
+
+    /// Returns true if iterators point to the same bit
+    CUTLASS_HOST_DEVICE
+    bool operator==(Iterator const &it) const { return bit_ == it.bit_; }
+
+    /// Returns false if iterators point to the same bit
+    CUTLASS_HOST_DEVICE
+    bool operator!=(Iterator const &it) const { return bit_ != it.bit_; }
+
+    /// Gets the bit at the pointed to location
+    CUTLASS_HOST_DEVICE
+    bool get() { return vec_.at(bit_); }
+
+    /// Gets the bit at the pointed to location
+    CUTLASS_HOST_DEVICE
+    bool at() const { return vec_.at(bit_); }
+
+    /// Dereferences iterator
+    CUTLASS_HOST_DEVICE
+    bool operator*() const { return at(); }
+
+    /// Sets the bit at the pointed to location
+    CUTLASS_HOST_DEVICE
+    void set(bool value = true) { vec_.set(bit_, value); }
+  };
+
+  /**
+  * @brief An iterator implementing \ref predicate_iterator_concept enabling sequential
+  * read-only access to predicates.
+  * @concept{predicate_iterator_concept}
+  */
+  class ConstIterator {
+    /// Reference to PredicateVector instance
+    PredicateVector const &vec_;
+
+    /// Index into PredicateVector
+    int bit_;
+
+   public:
+    /// Copy constructor
+    CUTLASS_HOST_DEVICE
+    ConstIterator(ConstIterator const &it) : vec_(it.vec_), bit_(it.bit_) {}
+
+    /// Constructs an iterator from a PredicateVector
+    CUTLASS_HOST_DEVICE
+    ConstIterator(PredicateVector const &vec, int _start = 0) : vec_(vec), bit_(_start) {}
+
+    /// Pre-increment
+    CUTLASS_HOST_DEVICE
+    ConstIterator &operator++() {
+      ++bit_;
+      return *this;
+    }
+
+    /// Increment
+    CUTLASS_HOST_DEVICE
+    ConstIterator &operator+=(int offset) {
+      bit_ += offset;
+      return *this;
+    }
+
+    /// Pre-decrement
+    CUTLASS_HOST_DEVICE
+    ConstIterator &operator--() {
+      --bit_;
+      return *this;
+    }
+
+    /// Decrement
+    CUTLASS_HOST_DEVICE
+    ConstIterator &operator-=(int offset) {
+      bit_ -= offset;
+      return *this;
+    }
+
+    /// Post-increment: advances this iterator and returns a copy of its previous position
+    CUTLASS_HOST_DEVICE
+    ConstIterator operator++(int) {
+      ConstIterator ret(*this);
+      ++bit_;
+      return ret;
+    }
+
+    /// Post-decrement: moves this iterator back and returns a copy of its previous position
+    CUTLASS_HOST_DEVICE
+    ConstIterator operator--(int) {
+      ConstIterator ret(*this);
+      --bit_;
+      return ret;
+    }
+
+    /// Iterator advances by some amount
+    CUTLASS_HOST_DEVICE
+    ConstIterator operator+(int offset) {
+      ConstIterator ret(*this);
+      ret.bit_ += offset;
+      return ret;
+    }
+
+    /// Iterator recedes by some amount
+    CUTLASS_HOST_DEVICE
+    ConstIterator operator-(int offset) {
+      ConstIterator ret(*this);
+      ret.bit_ -= offset;
+      return ret;
+    }
+
+    /// Returns true if iterators point to the same bit
+    CUTLASS_HOST_DEVICE
+    bool operator==(ConstIterator const &it) const { return bit_ == it.bit_; }
+
+    /// Returns false if iterators point to the same bit
+    CUTLASS_HOST_DEVICE
+    bool operator!=(ConstIterator const &it) const { return bit_ != it.bit_; }
+
+    /// Gets the bit at the pointed to location
+    CUTLASS_HOST_DEVICE
+    bool get() { return vec_.at(bit_); }
+
+    /// Gets the bit at the pointed to location
+    CUTLASS_HOST_DEVICE
+    bool at() const { return vec_.at(bit_); }
+
+    /// Dereferences iterator
+    CUTLASS_HOST_DEVICE
+    bool operator*() const { return at(); }
+  };
+
+  /// Iterator that always returns true
+  struct TrivialIterator {
+    /// Constructor
+    CUTLASS_HOST_DEVICE
+    TrivialIterator() {}
+
+    /// Copy constructor
+    CUTLASS_HOST_DEVICE
+    TrivialIterator(Iterator const &it) {}
+
+    /// Constructs an iterator from a PredicateVector
+    CUTLASS_HOST_DEVICE
+    TrivialIterator(PredicateVector const &_vec) {}
+
+    /// Pre-increment
+    CUTLASS_HOST_DEVICE
+    TrivialIterator &operator++() { return *this; }
+
+    /// Post-increment
+    CUTLASS_HOST_DEVICE
+    TrivialIterator operator++(int) { return *this; }
+
+    /// Dereferences iterator
+    CUTLASS_HOST_DEVICE
+    bool operator*() const { return true; }
+  };
+
+ public:
+  //
+  // Methods
+  //
+
+  /// Initialize the predicate vector
+  CUTLASS_HOST_DEVICE PredicateVector(bool value = true) { fill(value); }
+
+  /// Fills all predicates with a given value
+  CUTLASS_HOST_DEVICE void fill(bool value = true) {
+    Storage item = (value ? ~Storage(0) : Storage(0));
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kWordCount; ++i) {
+      storage(i) = item;
+    }
+  }
+
+  /// Clears all predicates
+  CUTLASS_HOST_DEVICE void clear() {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kWordCount; ++i) {
+      storage(i) = 0;
+    }
+  }
+
+  /// Sets all predicates to true
+  CUTLASS_HOST_DEVICE void enable() {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kWordCount; ++i) {
+      storage(i) = ~Storage(0);
+    }
+  }
+
+  /// Accesses a bit within the predicate vector.
+  CUTLASS_HOST_DEVICE bool operator[](int idx) const { return at(idx); }
+
+  /// Accesses a bit within the predicate vector.
+  CUTLASS_HOST_DEVICE bool at(int idx) const {
+    int bit, word;
+    computeStorageOffset(word, bit, idx);
+
+    return ((storage(word) >> bit) & 1);
+  }
+
+  /// Set a bit within the predicate vector.
+  CUTLASS_HOST_DEVICE void set(int idx, bool value = true) {
+    int bit, word;
+    computeStorageOffset(word, bit, idx);
+
+    Storage disable_mask = (~(Storage(1) << bit));
+    Storage enable_mask = (Storage(value) << bit);
+
+    storage(word) = ((storage(word) & disable_mask) | enable_mask);
+  }
+
+  /// Computes the intersection of two identical predicate vectors.
+  CUTLASS_HOST_DEVICE PredicateVector &operator&=(PredicateVector const &predicates) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kWordCount; ++i) {
+      storage(i) = (storage(i) & predicates.storage(i));
+    }
+    return *this;
+  }
+
+  /// Computes the union of two identical predicate vectors.
+  CUTLASS_HOST_DEVICE PredicateVector &operator|=(PredicateVector const &predicates) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kWordCount; ++i) {
+      storage(i) = (storage(i) | predicates.storage(i));
+    }
+    return *this;
+  }
+
+  /// Returns true if entire predicate array is zero.
+  CUTLASS_HOST_DEVICE bool is_zero() const {
+   constexpr Storage mask = computeWordMask();
+    Storage result = 0;
+    CUTLASS_PRAGMA_UNROLL
+    for (int word = 0; word < kWordCount - 1; ++word) {
+      result |= (storage(word) & mask);
+    }
+    constexpr Storage last_word_mask = computeLastWordMask();
+    result |= (storage(kWordCount - 1) & last_word_mask);
+    
+    return result == 0;
+  }
+
+  /// Returns an iterator to the start of the bit vector
+  CUTLASS_DEVICE
+  Iterator begin() { return Iterator(*this); }
+
+  /// Returns an iterator
+  CUTLASS_DEVICE
+  Iterator end() { return Iterator(*this, kPredicates); }
+
+  /// Returns a ConstIterator
+  CUTLASS_DEVICE
+  ConstIterator const_begin() const { return ConstIterator(*this); }
+
+  /// Returns a ConstIterator
+  CUTLASS_DEVICE
+  ConstIterator const_end() const { return ConstIterator(*this, kPredicates); }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/quaternion.h b/lightllm-kernel/cutlass/include/cutlass/quaternion.h
new file mode 100755
index 000000000..b31df4557
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/quaternion.h
@@ -0,0 +1,752 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines a densely packed quaternion object intended for storing data in registers and
+    executing quaternion operations within a CUDA or host thread.
+*/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/functional.h"
+#include "cutlass/array.h"
+#include "cutlass/real.h"
+#include "cutlass/coord.h"
+#include "cutlass/matrix.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/layout/vector.h"
+
+namespace cutlass {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Quaternion q = x*i + y*j + z*k + w, stored as a four-element Array in (x, y, z, w) order
+template <
+  typename Element_ = float      ///< element type
+>
+class Quaternion : public Array<Element_, 4> {
+public:
+
+  /// Logical rank of tensor index space
+  static int const kRank = 1;
+
+  /// Number of elements
+  static int const kExtent = 4;
+
+  /// Base class is a four-element array
+  using Base = Array<Element_, kExtent>;
+
+  /// Element type
+  using Element = typename Base::Element;
+
+  /// Reference type to an element
+  using Reference = typename Base::reference;
+
+  /// Index type
+  using Index = int;
+
+  /// Storage index of the x (i) imaginary component
+  static int const kX = 0;
+
+  /// Storage index of the y (j) imaginary component
+  static int const kY = 1;
+
+  /// Storage index of the z (k) imaginary component
+  static int const kZ = 2;
+
+  /// Storage index of the real component w
+  static int const kW = 3;
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Constructs a quaternion q = 0
+  CUTLASS_HOST_DEVICE
+  Quaternion() {
+    Base::at(kX) = Element();
+    Base::at(kY) = Element();
+    Base::at(kZ) = Element();
+    Base::at(kW) = Element();
+  }
+
+  /// Constructs a quaternion q = w + 0*i + 0*j + 0*k
+  CUTLASS_HOST_DEVICE
+  Quaternion(
+    Element w_
+  ) {
+    Base::at(kX) = Element();
+    Base::at(kY) = Element();
+    Base::at(kZ) = Element();
+    Base::at(kW) = w_;
+  }
+
+  /// Constructs a quaternion q = w + x*i + y*j + z*k
+  CUTLASS_HOST_DEVICE
+  Quaternion(
+    Element x_,
+    Element y_,
+    Element z_,
+    Element w_
+  ) {
+    Base::at(kX) = x_;
+    Base::at(kY) = y_;
+    Base::at(kZ) = z_;
+    Base::at(kW) = w_;
+  }
+
+  /// Constructs a quaternion from a vector representing the imaginary part and a real number
+  CUTLASS_HOST_DEVICE
+  Quaternion(
+    Matrix3x1<Element> const &imag_,
+    Element w_ = Element()
+  ) {
+    Base::at(kX) = imag_[0];
+    Base::at(kY) = imag_[1];
+    Base::at(kZ) = imag_[2];
+    Base::at(kW) = w_;
+  }
+
+  /// Returns a read-only reference to the element at index idx (const objects cannot yield Reference)
+  CUTLASS_HOST_DEVICE
+  typename Base::const_reference at(Index idx) const {
+    return Base::at(idx);
+  }
+
+  /// Returns a mutable reference to the element at index idx
+  CUTLASS_HOST_DEVICE
+  Reference at(Index idx) {
+    return Base::at(idx);
+  }
+
+  /// Returns a copy of the x element of the imaginary part of the quaternion
+  CUTLASS_HOST_DEVICE
+  Element x() const {
+    return Base::at(kX);
+  }
+
+  /// Returns a mutable reference to the x element of the imaginary part
+  CUTLASS_HOST_DEVICE
+  Reference x() {
+    return Base::at(kX);
+  }
+
+  /// Returns a copy of the y element of the imaginary part of the quaternion
+  CUTLASS_HOST_DEVICE
+  Element y() const {
+    return Base::at(kY);
+  }
+
+  /// Returns a mutable reference to the y element of the imaginary part
+  CUTLASS_HOST_DEVICE
+  Reference y() {
+    return Base::at(kY);
+  }
+
+  /// Returns a copy of the z element of the imaginary part of the quaternion
+  CUTLASS_HOST_DEVICE
+  Element z() const {
+    return Base::at(kZ);
+  }
+
+  /// Returns a mutable reference to the z element of the imaginary part
+  CUTLASS_HOST_DEVICE
+  Reference z() {
+    return Base::at(kZ);
+  }
+
+  /// Returns a copy of the real part of the quaternion
+  CUTLASS_HOST_DEVICE
+  Element w() const {
+    return Base::at(kW);
+  }
+
+  /// Returns a mutable reference to the real part of the quaternion
+  CUTLASS_HOST_DEVICE
+  Reference w() {
+    return Base::at(kW);
+  }
+
+  /// Returns the pure imaginary part of the quaternion as a 3-vector
+  CUTLASS_HOST_DEVICE
+  Matrix3x1<Element> pure() const {
+    return Matrix3x1<Element>(x(), y(), z());
+  }
+
+  /// Returns a quaternion representation of a spatial rotation given a unit-length axis and
+  /// a rotation in radians: (sin(theta/2) * axis, cos(theta/2)).
+  CUTLASS_HOST_DEVICE
+  static Quaternion<Element> rotation(
+    Matrix3x1<Element> const &axis_unit,    ///< axis of rotation (assumed to be unit length)
+    Element theta) {                        ///< angular rotation in radians
+
+    Element s = fast_sin(theta / Element(2));
+
+    return Quaternion(
+      s * axis_unit[0],
+      s * axis_unit[1],
+      s * axis_unit[2],
+      fast_cos(theta / Element(2))
+    );
+  }
+  
+  /// Returns a quaternion representation of a spatial rotation represented as a
+  /// unit-length rotation axis (r_x, r_y, r_z) and an angular rotation in radians
+  CUTLASS_HOST_DEVICE
+  static Quaternion<Element> rotation(
+    Element r_x,
+    Element r_y,
+    Element r_z,
+    Element theta) {                      ///< angular rotation in radians
+
+    return rotation({r_x, r_y, r_z}, theta);
+  }
+
+  /// Geometric rotation of a 3-element vector: pure part of q * (rhs, 0) * reciprocal(q)
+  CUTLASS_HOST_DEVICE
+  Matrix3x1<Element> rotate(Matrix3x1<Element> const &rhs) const {
+    return (*this * Quaternion<Element>(rhs, 0) * reciprocal(*this)).pure();
+  }
+
+  /// Inverse rotation operation: pure part of reciprocal(q) * (rhs, 0) * q
+  CUTLASS_HOST_DEVICE
+  Matrix3x1<Element> rotate_inv(Matrix3x1<Element> const &rhs) const {
+    return (reciprocal(*this) * Quaternion<Element>(rhs, 0) * *this).pure();
+  }
+
+  /// Rotates a 3-vector assuming this is a unit quaternion (a spinor); uses conj, not reciprocal
+  CUTLASS_HOST_DEVICE
+  Matrix3x1<Element> spinor(Matrix3x1<Element> const &rhs) const {
+    return (*this * Quaternion<Element>(rhs, 0) * conj(*this)).pure();
+  }
+
+  /// Inverse rotation of 3-vector assuming this is a unit quaternion (a spinor)
+  CUTLASS_HOST_DEVICE
+  Matrix3x1<Element> spinor_inv(Matrix3x1<Element> const &rhs) const {
+    return (conj(*this) * Quaternion<Element>(rhs, 0) * *this).pure();
+  }
+
+  /// In-place addition (the template parameter is deduced from rhs and shadows the member alias)
+  template <typename Element>
+  CUTLASS_HOST_DEVICE 
+  Quaternion<Element> &operator+=(Quaternion<Element> const &rhs) {
+    *this = (*this + rhs);
+    return *this;
+  }
+
+  /// In-place subtraction (the template parameter is deduced from rhs and shadows the member alias)
+  template <typename Element>
+  CUTLASS_HOST_DEVICE
+  Quaternion<Element> &operator-=(Quaternion<Element> const &rhs) {
+    *this = (*this - rhs);
+    return *this;
+  }
+
+  /// In-place quaternion product. T is unused and non-deducible, so it is defaulted to let `q *= r` compile.
+  template <typename T = void>
+  CUTLASS_HOST_DEVICE
+  Quaternion<Element> &operator*=(Quaternion<Element> const &rhs) {
+    *this = (*this * rhs);
+    return *this;
+  }
+
+  /// In-place scalar multiplication. T is unused and non-deducible, so it is defaulted to let `q *= s` compile.
+  template <typename T = void>
+  CUTLASS_HOST_DEVICE
+  Quaternion<Element> &operator*=(Element s) {
+    *this = (*this * s);
+    return *this;
+  }
+
+  /// In-place quaternion division. T is unused and non-deducible, so it is defaulted to let `q /= r` compile.
+  template <typename T = void>
+  CUTLASS_HOST_DEVICE
+  Quaternion<Element> &operator/=(Quaternion<Element> const &rhs) {
+    *this = (*this / rhs);
+    return *this;
+  }
+
+  /// In-place scalar division. T is unused and non-deducible, so it is defaulted to let `q /= s` compile.
+  template <typename T = void>
+  CUTLASS_HOST_DEVICE
+  Quaternion<Element> &operator/=(Element s) {
+    *this = (*this / s);
+    return *this;
+  }
+
+  /// Computes a 3x3 rotation matrix (row-major representation)
+  CUTLASS_HOST_DEVICE
+  Matrix3x3<Element> as_rotation_matrix_3x3() const {
+    Matrix3x3<Element> m(
+      w() * w() + x() * x() - y() * y() - z() * z(),
+      2 * x() * y() - 2 * w() * z(),
+      2 * x() * z() + 2 * w() * y(),
+
+      2 * x() * y() + 2 * w() * z(),
+      w() * w() - x() * x() + y() * y() - z() * z(),
+      2 * y() * z() - 2 * w() * x(),
+
+      2 * x() * z() - 2 * w() * y(),
+      2 * y() * z() + 2 * w() * x(),
+      w() * w() - x() * x() - y() * y() + z() * z()
+    );
+    return m;
+  }
+
+  /// Computes a 4x4 rotation matrix (row-major): identity with its 3x3 slice set to the rotation
+  CUTLASS_HOST_DEVICE
+  Matrix4x4<Element> as_rotation_matrix_4x4() const {
+    Matrix4x4<Element> m = Matrix4x4<Element>::identity();
+    m.set_slice_3x3(as_rotation_matrix_3x3());
+    return m;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Constructs the quaternion q = w + 0*i + 0*j + 0*k; Element is deduced from the argument.
+template <typename Element>
+CUTLASS_HOST_DEVICE
+Quaternion<Element> make_Quaternion(
+  Element w) {                                ///< real part
+
+  return Quaternion<Element>(w);
+}
+
+/// Constructs the quaternion imag[0]*i + imag[1]*j + imag[2]*k + w from a 3-vector and a real part
+template <typename Element>
+CUTLASS_HOST_DEVICE
+Quaternion<Element> make_Quaternion(
+  Matrix3x1<Element> const &imag,             ///< imaginary part as a vector
+  Element w) {                                ///< real part
+
+  return Quaternion<Element>(imag, w);
+}
+
+/// Constructs a rotation quaternion from a unit-length rotation axis and a rotation
+/// angle in radians by forwarding to Quaternion<Element>::rotation (note: `w` is the angle here)
+template <typename Element>
+CUTLASS_HOST_DEVICE
+Quaternion<Element> make_QuaternionRotation(
+  Matrix3x1<Element> const &axis_unit,        ///< rotation axis (unit-length)
+  Element w) {                                ///< rotation angle in radians
+
+  return Quaternion<Element>::rotation(axis_unit, w);
+}
+
+/// Constructs a quaternion q = xi + yj + zk + w; all four arguments must deduce the same Element
+template <typename Element>
+CUTLASS_HOST_DEVICE
+Quaternion<Element> make_Quaternion(Element x, Element y, Element z, Element w) {
+  return Quaternion<Element>(x, y, z, w);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Returns a const reference to the stored real part (q.w() const returns a copy and would dangle)
+template <typename Element>
+CUTLASS_HOST_DEVICE 
+Element const &real(Quaternion<Element> const &q) {
+  return q[Quaternion<Element>::kW];  // Array's const operator[] refers to q's own storage
+}
+
+/// Returns a mutable reference to the real part of the quaternion (via the non-const w() accessor)
+template <typename Element>
+CUTLASS_HOST_DEVICE
+Element &real(Quaternion<Element> &q) {
+  return q.w();
+}
+
+/// Returns the magnitude of the quaternion: fast_sqrt of norm(q), the sum of squared components
+template <typename Element>
+CUTLASS_HOST_DEVICE
+Element abs(Quaternion<Element> const &q) {
+  return fast_sqrt(norm(q));
+}
+
+/// Quaternion conjugate: negates the three imaginary components and keeps the real part
+template <typename Element>
+CUTLASS_HOST_DEVICE
+Quaternion<Element> conj(Quaternion<Element> const &q) {
+  return make_Quaternion(
+    -q.x(),
+    -q.y(),
+    -q.z(),
+    q.w()
+  );
+}
+
+/// Computes the squared magnitude x^2 + y^2 + z^2 + w^2 of the quaternion (no square root is taken)
+template <typename Element>
+CUTLASS_HOST_DEVICE
+Element norm(Quaternion<Element> const &q) {
+  return q.x() * q.x() + q.y() * q.y() + q.z() * q.z() + q.w() * q.w();
+}
+
+/// Quaternion reciprocal: the conjugate with every component divided by the squared magnitude
+template <typename Element>
+CUTLASS_HOST_DEVICE
+Quaternion<Element> reciprocal(Quaternion<Element> const &q) {
+  
+  Element norm_sq = norm(q);  // squared magnitude; no guard against a zero quaternion
+  
+  return make_Quaternion(
+    -q.x() / norm_sq,
+    -q.y() / norm_sq,
+    -q.z() / norm_sq,
+    q.w() / norm_sq
+  );
+}
+
+/// Returns q scaled by 1 / abs(q), i.e. a unit-length quaternion
+template <typename Element>
+CUTLASS_HOST_DEVICE
+Quaternion<Element> unit(Quaternion<Element> const &q) {
+  
+  Element inv_len = Element(1) / abs(q);  // no guard against a zero-magnitude quaternion
+  
+  return make_Quaternion(
+    q.x() * inv_len,
+    q.y() * inv_len,
+    q.z() * inv_len,
+    q.w() * inv_len
+  );
+}
+
+/// Quaternion exponential: e^w * (cos|v| + (v / |v|) * sin|v|), where v is the imaginary part
+template <typename Element>
+CUTLASS_HOST_DEVICE
+Quaternion<Element> exp(Quaternion<Element> const &q) {
+  // NOTE(review): vec_len is zero for a purely real q, so the divisions below evaluate 0/0.
+  Element real_exp = fast_exp(q.w());
+  Element vec_len = fast_sqrt(q.x() * q.x() + q.y() * q.y() + q.z() * q.z());
+  Element vec_sin = fast_sin(vec_len);
+
+  return make_Quaternion(
+    real_exp * q.x() * vec_sin / vec_len,
+    real_exp * q.y() * vec_sin / vec_len,
+    real_exp * q.z() * vec_sin / vec_len,
+    real_exp * fast_cos(vec_len)
+  );
+}
+
+/// Quaternion natural logarithm: ln|q| + (v / |v|) * acos(w / |q|), where v is the imaginary part
+template <typename Element>
+CUTLASS_HOST_DEVICE
+Quaternion<Element> log(Quaternion<Element> const &q) {
+  Element mag = abs(q);  // |q|; the real part of the result is ln|q|, not ln(w)
+  Element v = fast_sqrt(q.x() * q.x() + q.y() * q.y() + q.z() * q.z());
+  Element s = fast_acos(q.w() / mag) / v;  // NOTE(review): v is zero for a purely real q
+  
+  return make_Quaternion(
+    q.x() * s,
+    q.y() * s,
+    q.z() * s,
+    fast_log(mag)
+  );
+}
+
+/// Gets the rotation angle 2 * acos(w) from a quaternion the caller asserts is unit length (not checked)
+template <typename Element>
+CUTLASS_HOST_DEVICE
+Element get_rotation_angle(Quaternion<Element> const &q_unit) {
+  return fast_acos(q_unit.w()) * Element(2);
+}
+
+/// Gets the rotation axis from a unit-length quaternion: its imaginary part passed through Matrix3x1::unit()
+template <typename Element>
+CUTLASS_HOST_DEVICE
+Matrix3x1<Element> get_rotation_axis(Quaternion<Element> const &q_unit) {
+  return q_unit.pure().unit();
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Equality operator: exact component-wise comparison of x, y, z and w (no tolerance)
+template <typename Element>
+CUTLASS_HOST_DEVICE 
+bool operator==(Quaternion<Element> const &lhs, Quaternion<Element> const &rhs) {
+  return lhs.x() == rhs.x() &&
+    lhs.y() == rhs.y() &&
+    lhs.z() == rhs.z() &&
+    lhs.w() == rhs.w();
+}
+
+/// Inequality operator: the logical negation of operator==
+template <typename Element>
+CUTLASS_HOST_DEVICE 
+bool operator!=(Quaternion<Element> const &lhs, Quaternion<Element> const &rhs) {
+  return !(lhs == rhs);
+}
+
+/// Quaternion scalar multiplication (scalar on the right); note q is taken by value
+template <typename Element>
+CUTLASS_HOST_DEVICE
+Quaternion<Element> operator*(Quaternion<Element> q, Element s) {
+  return make_Quaternion(
+    q.x() * s,
+    q.y() * s,
+    q.z() * s,
+    q.w() * s
+  );
+}
+
+/// Quaternion scalar multiplication (scalar on the left): multiplies each component by s
+template <typename Element>
+CUTLASS_HOST_DEVICE
+Quaternion<Element> operator*(Element s, Quaternion<Element> const &q) {
+  return make_Quaternion(
+    s * q.x(),
+    s * q.y(),
+    s * q.z(),
+    s * q.w()
+  );
+}
+
+/// Quaternion scalar division: divides each component by s (no check for s == 0)
+template <typename Element>
+CUTLASS_HOST_DEVICE
+Quaternion<Element> operator/(Quaternion<Element> const &q, Element s) {
+  return make_Quaternion(
+    q.x() / s,
+    q.y() / s,
+    q.z() / s,
+    q.w() / s
+  );
+}
+
+/// Quaternion unary negation: negates all four components, including the real part
+template <typename Element>
+CUTLASS_HOST_DEVICE
+Quaternion<Element> operator-(Quaternion<Element> const &q) {
+  return make_Quaternion(
+    -q.x(),
+    -q.y(),
+    -q.z(),
+    -q.w()
+  );
+}
+
+/// Quaternion addition: component-wise sum of lhs and rhs
+template <typename Element>
+CUTLASS_HOST_DEVICE
+Quaternion<Element> operator+(Quaternion<Element> const &lhs, Quaternion<Element> const &rhs) {
+  return make_Quaternion(
+    lhs.x() + rhs.x(), 
+    lhs.y() + rhs.y(), 
+    lhs.z() + rhs.z(), 
+    lhs.w() + rhs.w()
+  );
+}
+
+/// Quaternion subtraction: component-wise difference lhs - rhs
+template <typename Element>
+CUTLASS_HOST_DEVICE
+Quaternion<Element> operator-(Quaternion<Element> const &lhs, Quaternion<Element> const &rhs) {
+  return make_Quaternion(
+    lhs.x() - rhs.x(), 
+    lhs.y() - rhs.y(), 
+    lhs.z() - rhs.z(), 
+    lhs.w() - rhs.w()
+  );
+}
+
+/// Quaternion product lhs * rhs (non-commutative); arguments below are the x, y, z, w results in order
+template <typename Element>
+CUTLASS_HOST_DEVICE
+Quaternion<Element> operator*(Quaternion<Element> const &lhs, Quaternion<Element> const &rhs) {
+  return make_Quaternion(
+    lhs.w() * rhs.x() + rhs.w() * lhs.x() + lhs.y() * rhs.z() - lhs.z() * rhs.y(),
+    lhs.w() * rhs.y() + rhs.w() * lhs.y() + lhs.z() * rhs.x() - lhs.x() * rhs.z(),
+    lhs.w() * rhs.z() + rhs.w() * lhs.z() + lhs.x() * rhs.y() - lhs.y() * rhs.x(),
+    lhs.w() * rhs.w() - lhs.x() * rhs.x() - lhs.y() * rhs.y() - lhs.z() * rhs.z()
+  );
+}
+
+/// Quaternion division, defined as right-multiplication by the reciprocal: lhs * reciprocal(rhs)
+template <typename Element>
+CUTLASS_HOST_DEVICE
+Quaternion<Element> operator/(Quaternion<Element> const &lhs, Quaternion<Element> const &rhs) {
+  return lhs * reciprocal(rhs);
+}
+
+/// Scalar divided by quaternion: scales reciprocal(q) by s
+template <typename Element>
+CUTLASS_HOST_DEVICE
+Quaternion<Element> operator/(Element s, Quaternion<Element> const &q) {
+  return s * reciprocal(q);
+}
+
+/// Comparison placeholder: always returns true without inspecting lhs or rhs
+template <typename Element>
+CUTLASS_HOST_DEVICE
+bool operator<(Quaternion<Element> const &lhs, Quaternion<Element> const &rhs) {
+  return true;  // NOTE(review): not a strict weak ordering; unsuitable for sorting or ordered containers
+}
+
+/// Rotates a 3-vector assuming this is a unit quaternion (a spinor). This avoids computing
+/// a reciprocal by using conj(spinor) instead: returns the pure part of spinor * (rhs, 0) * conj(spinor).
+template <typename Element>
+CUTLASS_HOST_DEVICE
+Matrix3x1<Element> spinor_rotation(
+  Quaternion<Element> const &spinor,        /// unit-length quaternion
+  Matrix3x1<Element> const &rhs) {          /// arbitrary 3-vector
+
+  return (spinor * Quaternion<Element>(rhs, 0) * conj(spinor)).pure();
+}
+
+/// Inverse rotation of 3-vector assuming this is a unit quaternion (a spinor). This avoids computing
+/// a reciprocal by using conj(spinor) instead: returns the pure part of conj(spinor) * (rhs, 0) * spinor.
+template <typename  Element>
+CUTLASS_HOST_DEVICE
+Matrix3x1<Element> spinor_rotation_inv(
+  Quaternion<Element> const &spinor,        /// unit-length quaternion
+  Matrix3x1<Element> const &rhs) {          /// arbitrary 3-vector
+
+  return (conj(spinor) * Quaternion<Element>(rhs, 0) * spinor).pure();
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization of RealType for Quaternion<T>: the underlying real type is T.
+template <typename T>
+struct RealType< Quaternion<T> > {
+  using Type = T;
+
+  /// Number of elements (taken from Quaternion<T>::kExtent)
+  static int const kExtent = Quaternion<T>::kExtent;
+
+CUTLASS_HOST_DEVICE
+  static Quaternion<T> from_real(double x) {
+    return Quaternion<T>(static_cast<T>(x));  // real part x converted to T, zero imaginary part
+  }
+};
+
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Factories
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <>  // from_real for Quaternion<half_t>: real part half_t(r), zero imaginary part
+CUTLASS_HOST_DEVICE
+cutlass::Quaternion<half_t> from_real<cutlass::Quaternion<half_t> >(double r) {
+  return cutlass::Quaternion<half_t>(half_t(r));
+}
+
+template <>  // from_real for Quaternion<float>: real part float(r), zero imaginary part
+CUTLASS_HOST_DEVICE
+cutlass::Quaternion<float> from_real<cutlass::Quaternion<float> >(double r) {
+  return cutlass::Quaternion<float>(float(r));
+}
+
+template <>  // from_real for Quaternion<double>: real part r, zero imaginary part
+CUTLASS_HOST_DEVICE
+cutlass::Quaternion<double> from_real<cutlass::Quaternion<double> >(double r) {
+  return cutlass::Quaternion<double>(r);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// functional.h numeric specializations
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct multiplies<Quaternion<T>> {
+  CUTLASS_HOST_DEVICE
+  Quaternion<T> operator()(Quaternion<T> lhs, Quaternion<T> const &rhs) const {
+    // Functor form of the quaternion product; operand order matters (lhs * rhs).
+    return lhs * rhs;
+  }
+};
+
+/// Squared magnitude of a quaternion, with each component converted to Output before squaring
+template <typename T, typename Output>
+struct magnitude_squared<Quaternion<T>, Output> {
+  CUTLASS_HOST_DEVICE
+  Output operator()(Quaternion<T> lhs) const {
+    multiplies<Output> square;
+
+    Output const real_part = Output(lhs.w());
+    Output const imag_x = Output(lhs.x());
+    Output const imag_y = Output(lhs.y());
+    Output const imag_z = Output(lhs.z());
+    // Summation order (w, x, y, z) is kept so rounding matches the original expression.
+    return square(real_part, real_part) + square(imag_x, imag_x) +
+           square(imag_y, imag_y) + square(imag_z, imag_z);
+  }
+};
+
+template <typename T>
+struct multiply_add<Quaternion<T>, Quaternion<T>, Quaternion<T>> {
+  CUTLASS_HOST_DEVICE
+  Quaternion<T> operator()(
+    Quaternion<T> const &a,
+    Quaternion<T> const &b,
+    Quaternion<T> const &c) const {
+    // Computes a * b + c: each accumulator starts at c and gathers that component's product terms.
+    T x = c.x();
+    T y = c.y();
+    T z = c.z();
+    T w = c.w();
+
+    x += a.w() * b.x();
+    x += b.w() * a.x();
+    x += a.y() * b.z();
+    x += -a.z() * b.y();  // was terminated by ',' which chained into the next statement
+
+    y += a.w() * b.y();
+    y += b.w() * a.y();
+    y += a.z() * b.x();
+    y += -a.x() * b.z();
+
+    z += a.w() * b.z();
+    z += b.w() * a.z();
+    z += a.x() * b.y();
+    z += -a.y() * b.x();
+
+    w += a.w() * b.w();
+    w += -a.x() * b.x();
+    w += -a.y() * b.y();
+    w += -a.z() * b.z();
+
+    return cutlass::make_Quaternion(x, y, z, w);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/real.h b/lightllm-kernel/cutlass/include/cutlass/real.h
new file mode 100755
index 000000000..e53301b3f
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/real.h
@@ -0,0 +1,61 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/**
+  \file
+  \brief This class provides helpers to support real<> and complex<> types in generic code.
+*/
+
+#pragma once
+
+namespace cutlass {
+
+/// Used to determine the real-valued underlying type of a numeric type T (primary template: T itself).
+template <typename T>
+struct RealType {
+  using Type = T;
+
+  /// Number of elements (one for the primary template)
+  static int const kExtent = 1;
+
+CUTLASS_HOST_DEVICE
+  static T from_real(double x) {
+    return static_cast<T>(x);  // plain conversion from double
+  }
+};
+
+template <typename T>  // generic factory: builds a T from a real value via T(r)
+CUTLASS_HOST_DEVICE
+static T from_real(double r) {
+  return T(r);
+}
+
+
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/reduction/device/reduce_split_k.h b/lightllm-kernel/cutlass/include/cutlass/reduction/device/reduce_split_k.h
new file mode 100755
index 000000000..0b8ac7a56
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/reduction/device/reduce_split_k.h
@@ -0,0 +1,232 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Kernel performing a reduction over densely packed tensors in global memory
+*/
+
+#pragma once
+
+#include "cutlass/device_kernel.h"
+#include "cutlass/reduction/kernel/reduce_split_k.h"
+#include "cutlass/cuda_host_adapter.hpp"
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace reduction {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename ReductionKernel_
+>
+class ReduceSplitK {
+public:
+  using ReductionKernel = ReductionKernel_;
+
+  using Shape = typename ReductionKernel::Shape;
+  using ReductionOp = typename ReductionKernel::ReductionOp;
+  using OutputOp = typename ReductionKernel::OutputOp;
+
+  using ElementWorkspace = typename ReductionKernel::ElementWorkspace;
+  using ElementAccumulator = typename ReductionKernel::ElementAccumulator;
+  using ElementOutput = typename ReductionKernel::ElementOutput;
+
+  using WorkspaceTensorRef = typename ReductionKernel::WorkspaceTensorRef;
+  using OutputTensorRef = typename ReductionKernel::OutputTensorRef;
+
+  using StrideIndex = typename ReductionKernel::StrideIndex;
+
+  static bool const kEnableCudaHostAdapter = CUTLASS_ENABLE_CUDA_HOST_ADAPTER;
+
+  /// Argument structure forwarded to ReductionKernel::Params by initialize()
+  struct Arguments {
+
+    //
+    // Data members
+    //
+
+    MatrixCoord problem_size{0,0};
+    int partitions{1};
+    size_t partition_stride{0};
+    WorkspaceTensorRef workspace{};
+    OutputTensorRef destination{};
+    OutputTensorRef source{};
+    typename OutputOp::Params output{};
+    typename ReductionOp::Params reduction{};
+
+    //
+    // Methods
+    //
+
+    /// Default ctor
+    Arguments() = default;
+   
+    CUTLASS_HOST_DEVICE 
+    Arguments(
+      MatrixCoord const & problem_size
+    ):
+      problem_size(problem_size) { }
+
+    CUTLASS_HOST_DEVICE
+    Arguments(
+      MatrixCoord problem_size_,
+      int partitions_,
+      size_t partition_stride_,
+      WorkspaceTensorRef workspace_,
+      OutputTensorRef destination_,
+      OutputTensorRef source_,
+      typename OutputOp::Params output_ = typename OutputOp::Params(),
+      typename ReductionOp::Params reduction_ = typename ReductionOp::Params()
+    ):
+      problem_size(problem_size_),
+      partitions(partitions_),
+      partition_stride(partition_stride_),
+      workspace(workspace_),
+      destination(destination_),
+      source(source_),
+      output(output_),
+      reduction(reduction_)
+    {
+
+    }
+
+  };
+
+private:
+  /// Kernel parameters object
+  typename ReductionKernel::Params params_;
+
+public:
+  /// Constructs Reduction SplitK
+  ReduceSplitK() { }
+
+  /// Determines whether the ReduceSplitK can execute the given problem (always reports success).
+  static Status can_implement(Arguments const &args) {
+
+    return Status::kSuccess;
+  }
+
+  /// Gets the workspace size
+  static size_t get_workspace_size(Arguments const &args) {
+    // needs no additional workspace
+    return 0;
+  }
+
+  /// Initializes Reduction state from arguments (the workspace and stream parameters are not used).
+  Status initialize(
+    Arguments const &args, 
+    void *workspace = nullptr, 
+    cudaStream_t stream = nullptr) {
+    
+    // initialize the params structure from the arguments
+    params_ = typename ReductionKernel::Params(
+      args.problem_size,
+      args.partitions,
+      args.partition_stride,
+      args.workspace,
+      args.destination,
+      args.source,
+      args.output,
+      args.reduction
+    );
+
+    return Status::kSuccess;
+
+  }
+
+  /// Updates pointers and op params only; problem size and partitioning keep their initialized values.
+  Status update(Arguments const &args, void *workspace = nullptr) {
+
+    // update the params structure from the arguments
+    params_.workspace.reset(args.workspace.non_const_ref().data());
+    params_.destination.reset(args.destination.non_const_ref().data());
+    params_.source.reset(args.source.non_const_ref().data());
+    params_.output = args.output;
+    params_.reduction = args.reduction;
+
+    return Status::kSuccess;
+  }
+
+  /// Runs the kernel using initialized state; fails if the host adapter is enabled but not supplied.
+  Status run(cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr, int32_t kernel_index = 0) {
+    // Launch reduction kernel with the block and grid shapes prescribed by the kernel type
+    dim3 block = ReductionKernel::block_shape();
+    dim3 grid = ReductionKernel::grid_shape(params_.problem_size);
+
+    if constexpr (kEnableCudaHostAdapter) {
+        CUTLASS_ASSERT(cuda_adapter);
+        if (cuda_adapter) {
+          void* kernel_params[] = {&params_};
+          cuda_adapter->launch(
+              grid, dim3(1,1,1), block, 0, stream, kernel_params, kernel_index);
+        }
+        else {
+          return Status::kErrorInternal;  // nothing was launched, so do not report success
+        }
+    }
+    else {
+      cutlass::arch::synclog_setup();
+      Kernel<ReductionKernel><<< grid, block, 0, stream >>>(params_);
+    }
+
+    cudaError_t result = cudaGetLastError();
+    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
+  }
+
+
+  /// Runs the kernel using initialized state.
+  Status operator()(cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr, int32_t kernel_index = 0) {
+    return run(stream, cuda_adapter, kernel_index);
+  }
+
+  /// Initializes from args and, if that succeeds, runs the kernel.
+  Status operator()(
+    Arguments const &args, 
+    void *workspace = nullptr, 
+    cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr, int32_t kernel_index = 0) {
+    
+    Status status = initialize(args, workspace, stream);
+    
+    if (status == Status::kSuccess) {
+      status = run(stream,cuda_adapter, kernel_index);
+    }
+
+    return status;
+  }
+  
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace reduction
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/reduction/device/tensor_reduce.h b/lightllm-kernel/cutlass/include/cutlass/reduction/device/tensor_reduce.h
new file mode 100755
index 000000000..f36c72c92
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/reduction/device/tensor_reduce.h
@@ -0,0 +1,264 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Kernel performing a reduction over one or more ranks of an affine tensor
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/device_kernel.h"
+
+#include "cutlass/reduction/device/tensor_reduce_affine_strided.h"
+#include "cutlass/reduction/device/tensor_reduce_affine_contiguous.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace reduction {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Tensor reduction operator on specific CUTLASS layouts over exactly one index
+template <
+  typename ElementOutput_,
+  typename ElementSource_,
+  typename Layout_,
+  typename ReductionOp_,
+  int VectorLength_  = 1,
+  typename ElementCompute_ = ElementOutput_
+>
+struct TensorReduction {
+
+  using ElementOutput = ElementOutput_;
+  using ElementSource = ElementSource_;
+  using Layout = Layout_;
+  using ReductionOp = ReductionOp_;
+  static int const kVectorLength = VectorLength_;
+  using ElementCompute = ElementCompute_;
+
+  using TensorCoord = typename Layout::TensorCoord;
+
+  /// Reduction operator
+  using ReductionDeviceStridedOperator = TensorReductionAffineStrided<
+    4, 3, ElementOutput, ElementSource, ReductionOp, kVectorLength, ElementCompute
+  >;
+
+  using ReductionDeviceContiguousOperator = TensorReductionAffineContiguous<
+    4, 3, ElementOutput, ElementSource, ReductionOp, kVectorLength, ElementCompute
+  >;
+
+  //
+  // Data members
+  //
+
+  ReductionDeviceStridedOperator reduction_strided;
+  ReductionDeviceContiguousOperator reduction_contiguous;
+  int reduction_index;
+
+  //
+  // Methods
+  //
+
+  /// Plans a reduction of 'extent' over rank 'reduction_index_' (0-3); any other index leaves good() false
+  TensorReduction(
+    TensorCoord extent,
+    int reduction_index_
+  ):
+    reduction_index(reduction_index_) {
+
+    Coord<4> extent_affine;
+    // Indices 0-2 place the reduced rank in slot 2; index 3 keeps the original order.
+    switch (reduction_index) {
+    case 0:
+      extent_affine[0] = extent[1];
+      extent_affine[1] = extent[2];
+      extent_affine[2] = extent[0];
+      extent_affine[3] = extent[3];
+      break;
+    case 1:
+      extent_affine[0] = extent[0];
+      extent_affine[1] = extent[2];
+      extent_affine[2] = extent[1];
+      extent_affine[3] = extent[3];
+      break;
+    case 2:
+      extent_affine[0] = extent[0];
+      extent_affine[1] = extent[1];
+      extent_affine[2] = extent[2];
+      extent_affine[3] = extent[3];
+      break;
+    case 3:
+      extent_affine[0] = extent[0];
+      extent_affine[1] = extent[1];
+      extent_affine[2] = extent[2];
+      extent_affine[3] = extent[3];
+      break;
+    default: break;
+    }
+    // Build an operator only for a valid index; otherwise both stay default-constructed.
+    if (reduction_index == 3) {
+      reduction_contiguous = ReductionDeviceContiguousOperator(extent_affine);
+    }
+    else if (reduction_index >= 0 && reduction_index < 3) {
+      reduction_strided = ReductionDeviceStridedOperator(extent_affine);
+    }
+  }
+
+  /// Reports whether the operator selected by reduction_index was planned successfully
+  bool good() const {
+    // Index 3 is handled by the contiguous operator, everything else by the strided one.
+    return reduction_index == 3
+      ? reduction_contiguous.good()
+      : reduction_strided.good();
+  }
+
+  /// Workspace stride reported by whichever operator serves reduction_index
+  int64_t workspace_stride() const {
+    // Guard-clause form: the strided operator covers every index except 3.
+    if (reduction_index != 3) {
+      return reduction_strided.workspace_stride();
+    }
+
+    return reduction_contiguous.workspace_stride();
+  }
+
+  /// Bytes of temporary workspace required by the operator serving reduction_index
+  int64_t workspace_size() const {
+    bool const use_contiguous = (reduction_index == 3);
+
+    // Delegate to the operator that will actually run the reduction.
+    return use_contiguous
+      ? reduction_contiguous.workspace_size()
+      : reduction_strided.workspace_size();
+  }
+
+  /// Reduces src_ref into dst_ref along reduction_index using the operator planned at construction
+  Status reduce(
+    TensorRef<ElementOutput, Layout> dst_ref,
+    TensorRef<ElementSource, Layout> src_ref,
+    void *device_workspace_ptr = nullptr,
+    ElementCompute reduction_identity = ElementCompute(),
+    ReductionOp reduction_op = ReductionOp(),
+    cudaStream_t stream = nullptr) {
+    // Zero-filled so that entries a case does not assign are never left indeterminate.
+    int64_t src_stride[3] = {0, 0, 0};
+    int64_t dst_stride[3] = {0, 0, 0};
+
+    switch (reduction_index) {
+    case 0:
+      src_stride[0] = src_ref.stride()[1];
+      src_stride[1] = src_ref.stride()[0];
+      src_stride[2] = src_ref.stride()[2];
+      dst_stride[0] = dst_ref.stride()[1];
+      dst_stride[1] = dst_ref.stride()[0];
+      break;
+    case 1:
+      src_stride[0] = src_ref.stride()[2];
+      src_stride[1] = src_ref.stride()[0];
+      src_stride[2] = src_ref.stride()[1];
+      dst_stride[0] = dst_ref.stride()[2];
+      dst_stride[1] = dst_ref.stride()[0];
+      break;
+    case 2:
+      src_stride[0] = src_ref.stride()[2];
+      src_stride[1] = src_ref.stride()[1];
+      src_stride[2] = src_ref.stride()[0];
+      dst_stride[0] = dst_ref.stride()[2];
+      dst_stride[1] = dst_ref.stride()[1];
+      break;
+    case 3:
+      src_stride[0] = src_ref.stride()[2];
+      src_stride[1] = src_ref.stride()[1];
+      src_stride[2] = src_ref.stride()[0];
+
+      dst_stride[0] = dst_ref.stride()[2];
+      dst_stride[1] = dst_ref.stride()[1];
+      dst_stride[2] = dst_ref.stride()[0];
+      break;
+    default: break;
+    }
+
+    if (reduction_index == 3) {
+      return reduction_contiguous(
+        dst_ref.data(),
+        dst_stride,
+        src_ref.data(),
+        src_stride,
+        device_workspace_ptr,
+        reduction_identity,
+        reduction_op,
+        stream);
+    }
+    else {
+      return reduction_strided(
+        dst_ref.data(),
+        dst_stride,
+        src_ref.data(),
+        src_stride,
+        device_workspace_ptr,
+        reduction_identity,
+        reduction_op,
+        stream);
+    }
+  }
+
+  Status operator()(
+    TensorRef<ElementOutput, Layout> dst_ref,
+    TensorRef<ElementSource, Layout> src_ref,
+    void *device_workspace_ptr = nullptr,
+    ElementCompute reduction_identity = ElementCompute(),
+    ReductionOp reduction_op = ReductionOp(),
+    cudaStream_t stream = nullptr) {
+    // Function-call shorthand: forwards every argument to reduce() unchanged.
+    return reduce(
+      dst_ref, 
+      src_ref, 
+      device_workspace_ptr, 
+      reduction_identity,
+      reduction_op, 
+      stream);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace reduction
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/reduction/device/tensor_reduce_affine_contiguous.h b/lightllm-kernel/cutlass/include/cutlass/reduction/device/tensor_reduce_affine_contiguous.h
new file mode 100755
index 000000000..8d71aa9dd
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/reduction/device/tensor_reduce_affine_contiguous.h
@@ -0,0 +1,374 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Kernel performing a reduction over one or more ranks of an affine tensor
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/device_kernel.h"
+
+#include "cutlass/reduction/kernel/tensor_reduce_affine_contiguous.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace reduction {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Tensor reduction operator on layouts which are affine
+template <
+  int Rank,                                   ///< Rank of source tensor (e.g. NDHWC => 5)
+  int ReducedRank,                            ///< Rank of reduced tensor (e.g. ND => 2)
+  typename ElementOutput_,
+  typename ElementSource_,
+  typename ReductionOp_,
+  int VectorLength  = 1,
+  typename ElementCompute_ = ElementOutput_,
+  int Threads = 256,                          ///< Number of participating threads
+  int BatchSize = 4                           ///< Number of elements to load per batch
+>
+struct TensorReductionAffineContiguous {
+
+  static int const kRank = Rank;
+  static int const kReducedRank = ReducedRank;
+  static int const kVectorLength = VectorLength;
+  static int const kInnerRank = kRank - kReducedRank;
+  static int const kThreads = Threads;
+  static int const kBatchSize = BatchSize;
+
+  using ElementOutput = ElementOutput_;
+  using ElementSource = ElementSource_;
+  using ReductionOp = ReductionOp_;
+  using ElementCompute = ElementCompute_;
+
+  //
+  // Data members
+  //
+
+  /// Internal status field
+  Status status;
+
+  /// Extent of tensor in source layout
+  Coord<kRank> extent;
+
+  /// Number of points in the outer index space
+  int64_t outer_count;
+
+  /// Number of elements in the inner index space
+  int64_t inner_count;
+
+  /// Number of workspaces needed
+  int workspace_count;
+
+  /// CUDA Grid shape (.x => contiguous, .y => outer, .z => inner)
+  dim3 grid_shape;
+
+  /// CUDA Threadblock shape (.x => contiguous, .y => outer, .z => inner)
+  dim3 threadblock_shape;
+
+  /// CUDA grid shape for the final reduction step if needed
+  dim3 grid_final;
+
+  /// CUDA threadblock shape for the final reduction step if needed
+  dim3 threadblock_final;
+
+private:
+  //
+  // Methods
+  //
+
+  /// Returns the power of two by which 'count' must be divided to fall below 2 x 'ext' (1 if none)
+  static int reshape_pow2(int ext, int count) {
+    if (ext <= 0 || ext > count) {  // ext <= 0 would never leave the halving loop below
+      return 1;
+    }
+    int x = 1;
+    for (; count >= ext * 2; ) {
+      count >>= 1;
+      x <<= 1;
+    }
+    return x;
+  }
+
+public:
+
+  /// Default ctor: yields an unplanned object whose status is kErrorInvalidProblem, so good() is false
+  TensorReductionAffineContiguous():
+    status(Status::kErrorInvalidProblem),
+    extent(),
+    outer_count(0),
+    inner_count(0),
+    workspace_count(0),
+    grid_shape(0, 0, 0),
+    threadblock_shape(0, 0, 0) { }
+
+  /// Plans grid/threadblock shapes for 'extent_'; status becomes kErrorInvalidProblem if the outer space is empty
+  TensorReductionAffineContiguous(
+    Coord<kRank> extent_,
+    int target_threadblock_count = 128
+  ):
+    status(Status::kSuccess),
+    extent(extent_),
+    outer_count(0),
+    inner_count(0),
+    workspace_count(0) {
+    // Plan the parallel mapping strategy.
+    outer_count = 1;
+    inner_count = 1;
+
+    // Compute number of elements in strided ranks
+    for (int p = 0; p < kReducedRank; ++p) {
+      outer_count *= extent[p];
+    }
+
+    for (int p = 0; p < kInnerRank; ++p) {
+      inner_count *= extent[kReducedRank + p];
+    }
+
+    // An empty outer space cannot be planned: apportioning CTAs below would divide by zero.
+    if (outer_count <= 0) {
+      status = Status::kErrorInvalidProblem;
+      return;
+    }
+
+    int cta_count_x = 1;
+    int cta_count_y = 1;
+    int cta_count_z = 1;
+
+    int cta_threads_x = kThreads;
+    int cta_threads_y = 1;
+    int cta_threads_z = 1;
+
+    // Determine CTA shape
+    int64_t inner_vector_count = inner_count / kVectorLength;
+
+    // Priority 1. Assign threadblocks to outer indices if possible
+    if (outer_count > target_threadblock_count) {
+      cta_count_x = 1;
+      cta_count_y = target_threadblock_count;
+      cta_count_z = 1;
+    }
+    else {
+      cta_count_y = int(outer_count);
+      int remaining_ctas = target_threadblock_count / cta_count_y;
+
+      // Priority 2. Assign inner dimensions to one CTA
+      if (inner_vector_count > cta_threads_x) {
+        int64_t cta_z_bound = inner_vector_count / cta_threads_x;
+        if (cta_z_bound > remaining_ctas) {
+          cta_count_z = remaining_ctas;
+        }
+        else {
+          cta_count_z = int(cta_z_bound);
+        }
+      }
+      else {
+        cta_threads_x = reshape_pow2(int(inner_vector_count), cta_threads_x);
+        cta_count_z = 1;
+      }
+    }
+
+    grid_shape = dim3(cta_count_x, cta_count_y, cta_count_z);
+    threadblock_shape = dim3(cta_threads_x, cta_threads_y, cta_threads_z);
+
+    workspace_count = (cta_count_z > 1 ? cta_count_z : 0);
+
+    // Determine shape of final reduction kernel if needed
+    if (workspace_count) {
+      int final_threads = kThreads;
+      int final_ctas = 1;
+
+      if (outer_count > kThreads) {
+        final_ctas = int(outer_count + kThreads - 1) / kThreads;
+      }
+      else {
+        final_threads = int(outer_count);
+      }
+
+      grid_final = dim3(final_ctas, 1, 1);
+      threadblock_final = dim3(final_threads, 1, 1);
+    }
+    else {
+      grid_final = dim3(0, 0, 0);
+      threadblock_final = dim3(0, 0, 0);
+    }
+  }
+
+  /// True while status is kSuccess (set by the planning constructor, updated by reduce())
+  bool good() const {
+    return status == Status::kSuccess;
+  }
+
+  /// Bytes occupied by one densely packed workspace of <outer_count> ElementCompute values
+  int64_t workspace_stride() const {
+
+    // A failed plan needs no workspace; otherwise convert the element width from bits to bytes.
+    if (good()) {
+      return outer_count * sizeof_bits<ElementCompute>::value / 8;
+    }
+
+    return 0;
+  }
+
+  /// Total bytes of temporary workspace needed when the reduction is split across CTAs in grid z
+  int64_t workspace_size() const {
+
+    // Nothing to allocate for a failed plan or when grid z holds a single CTA.
+    bool const needs_workspace = good() && grid_shape.z != 1;
+    if (!needs_workspace) {
+      return 0;
+    }
+
+    // One stride-sized slab per CTA along grid z.
+    int64_t const slab_bytes = workspace_stride();
+    int64_t const slab_count = grid_shape.z;
+
+    return slab_bytes * slab_count;
+  }
+
+  /// Launches the reduction kernel and, if workspace_count is non-zero, the final reduction kernel
+  Status reduce(
+    ElementOutput *dst_ptr,                       ///< Pointer to destination tensor
+    int64_t dst_stride[],                         ///< Stride vector (of length kReducedRank - 1)
+    ElementSource const *src_ptr,                 ///< Pointer to source tensor
+    int64_t src_stride[],                         ///< Stride vector (of length kRank - 1)
+    void *device_workspace_ptr = nullptr,         ///< Device workspace
+    ElementCompute reduction_identity = ElementCompute(), ///< Reduction identity element
+    ReductionOp reduction_op = ReductionOp(),     ///< Reduction operator
+    cudaStream_t stream = nullptr) {              ///< CUDA Stream into which all kernels are launched
+
+    // Initial status check: an unplanned or previously failed object returns its stored status
+    if (!good()) {
+      return status;
+    }
+
+    // Guard against null workspace (the constructor sets workspace_count to 0 or to a value > 1)
+    if (workspace_count > 1 && device_workspace_ptr == nullptr) {
+      return Status::kErrorWorkspaceNull;
+    }
+
+    // Define reduction kernel
+    using ReductionKernel = kernel::TensorReductionAffineContiguous<
+      kRank,
+      kReducedRank,
+      ElementOutput, 
+      ElementSource, 
+      ReductionOp, 
+      kVectorLength,
+      ElementCompute,
+      kThreads>;
+
+    using FinalReductionKernel = kernel::TensorReductionAffineContiguousFinal<
+      kRank,
+      kReducedRank,
+      ElementOutput, 
+      ElementSource, 
+      ReductionOp, 
+      kVectorLength,
+      ElementCompute,
+      kThreads>;
+
+    using Params = typename ReductionKernel::Params;
+
+    // Construct the parameters (the same object is passed to both kernels below)
+    Params params(
+      extent, 
+      dst_ptr,
+      dst_stride, 
+      src_ptr,
+      src_stride,
+      static_cast<ElementCompute *>(device_workspace_ptr),
+      workspace_stride(),
+      workspace_count,
+      reduction_op,
+      reduction_identity);
+
+    // Shared memory size
+    int shared_mem_bytes = sizeof(typename ReductionKernel::SharedStorage);
+
+    // Launch the kernel
+    cutlass::arch::synclog_setup();
+    Kernel<ReductionKernel><<< grid_shape, threadblock_shape, shared_mem_bytes, stream >>>(params);
+
+    // Check error condition; the result is stored in the member 'status', which good() reads
+    if (cudaPeekAtLastError() == cudaSuccess) {
+      status = Status::kSuccess;
+    }
+    else {
+      status = Status::kErrorInternal;
+    }
+
+    // Final reduction kernel (launched even if the check above recorded a failure)
+    if (workspace_count) {
+      Kernel<FinalReductionKernel><<< grid_final, threadblock_final, 0, stream >>>(params);
+    }
+
+    // Check error condition again; this runs whether or not the final kernel was launched
+    if (cudaPeekAtLastError() == cudaSuccess) {
+      status = Status::kSuccess;
+    }
+    else {
+      status = Status::kErrorInternal;
+    }
+
+    return status;
+  }
+
+  /// Function-call shorthand: forwards every argument to reduce() unchanged
+  Status operator()(
+    ElementOutput *dst_ptr,                       ///< Pointer to destination tensor
+    int64_t dst_stride[],                         ///< Stride vector (of length kReducedRank - 1)
+    ElementSource const *src_ptr,                 ///< Pointer to source tensor
+    int64_t src_stride[],                         ///< Stride vector (of length kRank - 1)
+    void *device_workspace_ptr = nullptr,         ///< Pointer to device workspace
+    ElementCompute reduction_identity = ElementCompute(), ///< Reduction identity element
+    ReductionOp reduction_op = ReductionOp(),     ///< Reduction operator
+    cudaStream_t stream = nullptr) {              ///< CUDA Stream into which all kernels are launched
+
+    return reduce(dst_ptr, dst_stride, src_ptr, src_stride, device_workspace_ptr, reduction_identity, reduction_op, stream);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace reduction
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/reduction/device/tensor_reduce_affine_strided.h b/lightllm-kernel/cutlass/include/cutlass/reduction/device/tensor_reduce_affine_strided.h
new file mode 100755
index 000000000..5ec7e6549
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/reduction/device/tensor_reduce_affine_strided.h
@@ -0,0 +1,362 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Kernel performing a reduction over one or more ranks of an affine tensor
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/device_kernel.h"
+
+#include "cutlass/reduction/kernel/tensor_reduce_affine_strided.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace reduction {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Tensor reduction operator on layouts which are affine
+template <
+  int Rank,                                   ///< Rank of source tensor (e.g. NDHWC => 5)
+  int ReducedRank,                            ///< Rank of reduced tensor (includes contiguous, e.g. NC => 2)
+  typename ElementOutput_,
+  typename ElementSource_,
+  typename ReductionOp_,
+  int VectorLength  = 1,
+  typename ElementCompute_ = ElementOutput_,
+  int Threads = 256,                          ///< Number of participating threads
+  int BatchSize = 4                           ///< Number of elements to load per batch
+>
+struct TensorReductionAffineStrided {
+
+  static int const kRank = Rank;
+  static int const kReducedRank = ReducedRank;
+  static int const kVectorLength = VectorLength;
+  static int const kInnerRank = kRank - kReducedRank;
+  static int const kThreads = Threads;
+  static int const kBatchSize = BatchSize;
+
+  using ElementOutput = ElementOutput_;
+  using ElementSource = ElementSource_;
+  using ReductionOp = ReductionOp_;
+  using ElementCompute = ElementCompute_;
+
+  //
+  // Data members
+  //
+
+  /// Internal status field
+  Status status;
+
+  /// Extent of tensor in source layout
+  Coord<kRank> extent;
+
+  /// Number of points in the outer index space
+  int64_t outer_count;
+
+  /// Number of elements in the inner index space
+  int64_t inner_count;
+
+  /// Number of workspaces needed
+  int workspace_count;
+
+  /// CUDA Grid shape (.x => contiguous, .y => outer, .z => inner)
+  dim3 grid_shape;
+
+  /// CUDA Threadblock shape (.x => contiguous, .y => outer, .z => inner)
+  dim3 threadblock_shape;
+
+  /// CUDA grid shape for the final reduction step if needed
+  dim3 grid_final;
+
+  /// CUDA threadblock shape for the final reduction step if needed
+  dim3 threadblock_final;
+
+private:
+  //
+  // Methods
+  //
+
+  /// Returns the power of two by which 'count' must be divided to fall below 2 x 'ext' (1 if none)
+  static int reshape_pow2(int ext, int count) {
+    if (ext <= 0 || ext > count) {  // ext <= 0 would never leave the halving loop below
+      return 1;
+    }
+    int x = 1;
+    for (; count >= ext * 2; ) {
+      count >>= 1;
+      x <<= 1;
+    }
+    return x;
+  }
+
+public:
+
+  /// Default ctor: yields an unplanned object whose status is kErrorInvalidProblem, so good() is false
+  TensorReductionAffineStrided():
+    status(Status::kErrorInvalidProblem),
+    extent(),
+    outer_count(0),
+    inner_count(0),
+    workspace_count(0),
+    grid_shape(0, 0, 0),
+    threadblock_shape(0, 0, 0) { }
+
+  /// Plans grid/threadblock shapes for 'extent_'; status becomes kErrorInvalidProblem if it cannot be planned
+  TensorReductionAffineStrided(
+    Coord<kRank> extent_,
+    int target_threadblock_count = 128
+  ):
+    status(Status::kSuccess),
+    extent(extent_),
+    outer_count(0),
+    inner_count(0),
+    workspace_count(0) {
+    // Plan the parallel mapping strategy.
+    outer_count = 1;
+    inner_count = 1;
+
+    // Compute number of elements in strided ranks
+    for (int p = 0; p < kReducedRank - 1; ++p) {
+      outer_count *= extent[p];
+    }
+
+    for (int p = 0; p < kInnerRank; ++p) {
+      inner_count *= extent[kReducedRank + p - 1];
+    }
+
+    // Compute plan for the reduction
+    int extent_c = extent[kRank - 1];
+    int vectors_c = (extent_c -1 + kVectorLength) / kVectorLength;
+
+    // Empty extents cannot be planned; the arithmetic below assumes at least one element.
+    if (extent_c <= 0 || outer_count <= 0) {
+      status = Status::kErrorInvalidProblem;
+      return;
+    }
+
+    // Determine CTA shape
+    int cta_width = kThreads * kVectorLength;
+    int cta_ways = reshape_pow2(extent_c, cta_width);
+    int cta_threads_x = kThreads / cta_ways;
+
+    // cta_ways can exceed kThreads when extent_c < kVectorLength; zero threads would divide by zero below.
+    if (cta_threads_x < 1) {
+      status = Status::kErrorInvalidProblem;
+      return;
+    }
+
+    threadblock_shape = dim3(cta_threads_x, 1, std::min(cta_ways, 64));
+
+    // Determine grid shape
+    int cta_count_x = (vectors_c + cta_threads_x - 1) / cta_threads_x;
+    int cta_count_y = std::max(1, target_threadblock_count / cta_count_x);
+
+    // Limit the number of CTAs assigned to outer dimension
+    if (int64_t(cta_count_y * threadblock_shape.y) > outer_count) {
+      cta_count_y = int(outer_count + threadblock_shape.y - 1) / threadblock_shape.y;
+    }
+
+    // Limit the number of CTAs assigned to inner dimension
+    int cta_count_z = std::max(1, target_threadblock_count / cta_count_y);
+    if (int64_t(cta_count_z * threadblock_shape.z) > inner_count) {
+      cta_count_z = int(inner_count + threadblock_shape.z - 1) / threadblock_shape.z;
+    }
+
+    grid_shape = dim3(cta_count_x, cta_count_y, cta_count_z);
+    workspace_count = (cta_count_z > 1 ? cta_count_z : 0);
+
+    // Determine shape of final reduction kernel if needed
+    grid_final = dim3(cta_count_x, int(outer_count));
+    threadblock_final = dim3(cta_threads_x, 1, 1);
+  }
+
+  /// True while status is kSuccess (set by the planning constructor, updated by reduce())
+  bool good() const {
+    return status == Status::kSuccess;
+  }
+
+  /// Size (in bytes) of one CTA's workspace: the last extent times the byte size of one vector
+  int64_t workspace_stride() const {
+    // Error condition
+    if (!good()) {
+      return 0;
+    }
+
+    int vector_size_bytes = kVectorLength * sizeof_bits<ElementCompute>::value / 8;
+
+    // Widen before multiplying so a large extent cannot overflow 32-bit arithmetic.
+    return int64_t(extent[kRank - 1]) * vector_size_bytes;
+  }
+
+  /// Total bytes of temporary workspace needed when the reduction is split across CTAs in grid z
+  int64_t workspace_size() const {
+
+    // Nothing to allocate for a failed plan or when grid z holds a single CTA.
+    bool const needs_workspace = good() && grid_shape.z != 1;
+    if (!needs_workspace) {
+      return 0;
+    }
+
+    // One stride per outer index, repeated for every CTA along grid z.
+    int64_t const per_slice_bytes = workspace_stride() * outer_count;
+    int64_t const slice_count = grid_shape.z;
+
+    return per_slice_bytes * slice_count;
+  }
+
+  /// Launches the reduction kernel and, if workspace_count is non-zero, the final reduction kernel
+  Status reduce(
+    ElementOutput *dst_ptr,                       ///< Pointer to destination tensor
+    int64_t dst_stride[],                         ///< Stride vector (of length kReducedRank - 1)
+    ElementSource const *src_ptr,                 ///< Pointer to source tensor
+    int64_t src_stride[],                         ///< Stride vector (of length kRank - 1)
+    void *device_workspace_ptr = nullptr,             ///< Device workspace
+    ElementCompute reduction_identity = ElementCompute(), ///< Reduction identity
+    ReductionOp reduction_op = ReductionOp(),     ///< Reduction operator
+    cudaStream_t stream = nullptr) {              ///< CUDA Stream into which all kernels are launched
+
+    // Initial status check: an unplanned or previously failed object returns its stored status
+    if (!good()) {
+      return status;
+    }
+
+    // Guard against null workspace (the constructor sets workspace_count to 0 or to a value > 1)
+    if (workspace_count > 1 && device_workspace_ptr == nullptr) {
+      return Status::kErrorWorkspaceNull;
+    }
+
+    // Define reduction kernel
+    using ReductionKernel = kernel::TensorReductionAffineStrided<
+      kRank,
+      kReducedRank,
+      ElementOutput, 
+      ElementSource, 
+      ReductionOp, 
+      kVectorLength,
+      ElementCompute,
+      kThreads>;
+
+    using FinalReductionKernel = kernel::TensorReductionAffineStridedFinal<
+      kRank,
+      kReducedRank,
+      ElementOutput, 
+      ElementSource, 
+      ReductionOp, 
+      kVectorLength,
+      ElementCompute,
+      kThreads>;
+
+    using Params = typename ReductionKernel::Params;
+
+    // Construct the parameters (the same object is passed to both kernels below)
+    Params params(
+      extent, 
+      dst_ptr,
+      dst_stride, 
+      src_ptr,
+      src_stride,
+      static_cast<ElementCompute *>(device_workspace_ptr),
+      workspace_stride(),
+      workspace_count,
+      reduction_op,
+      reduction_identity);
+
+    // Shared memory size
+    int shared_mem_bytes = sizeof(typename ReductionKernel::SharedStorage);
+
+    // Launch the kernel
+    cutlass::arch::synclog_setup();
+    Kernel<ReductionKernel><<< grid_shape, threadblock_shape, shared_mem_bytes, stream >>>(params);
+
+    // Check error condition; the result is stored in the member 'status', which good() reads
+    if (cudaPeekAtLastError() == cudaSuccess) {
+      status = Status::kSuccess;
+    }
+    else {
+      status = Status::kErrorInternal;
+    }
+
+    // Final reduction kernel (launched even if the check above recorded a failure)
+    if (workspace_count) {
+
+      Kernel<FinalReductionKernel><<< grid_final, threadblock_final, 0, stream >>>(params);
+
+      // Check error condition again, this time only because a second kernel was launched
+      if (cudaPeekAtLastError() == cudaSuccess) {
+        status = Status::kSuccess;
+      }
+      else {
+        status = Status::kErrorInternal;
+      }
+    }
+
+    return status;
+  }
+
+  /// Function-call shorthand: forwards every argument to reduce() unchanged
+  Status operator()(
+    ElementOutput *dst_ptr,                       ///< Pointer to destination tensor
+    int64_t dst_stride[],                         ///< Stride vector (of length kReducedRank - 1)
+    ElementSource const *src_ptr,                 ///< Pointer to source tensor
+    int64_t src_stride[],                         ///< Stride vector (of length kRank - 1)
+    void *device_workspace_ptr = nullptr,         ///< Pointer to device workspace
+    ElementCompute reduction_identity = ElementCompute(), ///< Reduction identity
+    ReductionOp reduction_op = ReductionOp(),     ///< Reduction operator
+    cudaStream_t stream = nullptr) {              ///< CUDA Stream into which all kernels are launched
+
+    return reduce(
+      dst_ptr, 
+      dst_stride, 
+      src_ptr, 
+      src_stride, 
+      device_workspace_ptr, 
+      reduction_identity, 
+      reduction_op, 
+      stream);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace reduction
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/reduction/kernel/reduce_softmax_final.h b/lightllm-kernel/cutlass/include/cutlass/reduction/kernel/reduce_softmax_final.h
new file mode 100755
index 000000000..9752b9b76
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/reduction/kernel/reduce_softmax_final.h
@@ -0,0 +1,267 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Kernel performing a final reduction for softmax
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/functional.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/arch/memory_sm75.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace reduction {
+namespace kernel {
+
+template <
+  typename ElementNorm_,            ///< Element type of the partial-maximum buffer (block_Norm)
+  typename ElementSum_,             ///< Element type of the partial-sum buffer (block_Sum)
+  typename ElementSoftmaxCompute_,  ///< Type in which the maximum and the sum are accumulated
+  typename ThreadblockShape_,       ///< Only kN is read: column tile width used to count partials per row
+  bool GroupedProblem = false       ///< true: per-block problem sizes and offsets are read from arrays
+>
+class ApplySoftmaxFinalReduction {
+public:
+  // Template arguments re-exported under member names.
+  using ElementNorm = ElementNorm_;
+  using ElementSum = ElementSum_;
+  using ElementSoftmaxCompute = ElementSoftmaxCompute_;
+  using ThreadblockShape = ThreadblockShape_;
+  static const bool isGroupedProblem = GroupedProblem;
+
+  //
+  // Arguments: describes the buffers that apply() reduces in place
+  //
+
+  struct Arguments {
+    // The two pointer arrays below and the offset arrays are dereferenced inside device code.
+    cutlass::gemm::GemmCoord*  problem_sizes{nullptr};       ///< Grouped mode: problem size per block, indexed by blockIdx.x
+    cutlass::gemm::GemmCoord   problem_size{};               ///< Non-grouped mode: the single problem size
+    ElementNorm*               block_Norm{nullptr};          ///< In: partial maxima. Out: row maximum (first partial's slot)
+    ElementSum*                block_Sum{nullptr};           ///< In: partial sums. Out: reciprocal of the combined row sum
+    int64_t*                   offset_Norm_Device{nullptr};  ///< Grouped mode: element offset into block_Norm per block
+    int64_t*                   offset_Sum_Device{nullptr};   ///< Grouped mode: element offset into block_Sum per block
+    int64_t                    batch_stride_Max{0};          ///< Non-grouped mode: element stride per blockIdx.z in block_Norm
+    int64_t                    batch_stride_Sum{0};          ///< Non-grouped mode: element stride per blockIdx.z in block_Sum
+
+    //
+    // Methods
+    //
+    Arguments() { }  // Every member keeps its default member initializer
+
+    // Non-grouped constructor without batching
+    Arguments(
+      cutlass::gemm::GemmCoord  problem_size,
+      ElementNorm*              block_Norm,
+      ElementSum*               block_Sum
+    ):
+      problem_size(problem_size),
+      block_Norm(block_Norm),
+      block_Sum(block_Sum),
+      problem_sizes(nullptr),
+      offset_Norm_Device(nullptr),
+      offset_Sum_Device(nullptr),
+      batch_stride_Max(0),
+      batch_stride_Sum(0)
+    {
+      // NOTE(review): initializers are listed out of declaration order; compilers may warn (-Wreorder).
+    }
+
+    // Non-grouped constructor with batching
+    Arguments(
+      cutlass::gemm::GemmCoord  problem_size,
+      ElementNorm*              block_Norm,
+      ElementSum*               block_Sum,
+      int64_t                   batch_stride_Max,
+      int64_t                   batch_stride_Sum
+    ):
+      problem_size(problem_size),
+      block_Norm(block_Norm),
+      block_Sum(block_Sum),
+      batch_stride_Max(batch_stride_Max),
+      batch_stride_Sum(batch_stride_Sum),
+      problem_sizes(nullptr),
+      offset_Norm_Device(nullptr),
+      offset_Sum_Device(nullptr)
+    {
+      // NOTE(review): initializers are listed out of declaration order; compilers may warn (-Wreorder).
+    }
+
+
+    // Grouped constructor (batch strides keep their default member initializers of 0)
+    Arguments(
+      cutlass::gemm::GemmCoord  *problem_sizes,
+      ElementNorm*              block_Norm,
+      ElementSum*               block_Sum,
+      int64_t*                  offset_Norm_Device,
+      int64_t*                  offset_Sum_Device
+    ):
+      problem_sizes(problem_sizes),
+      problem_size(cutlass::gemm::GemmCoord(0, 0, 0)),
+      block_Norm(block_Norm),
+      block_Sum(block_Sum),
+      offset_Norm_Device(offset_Norm_Device),
+      offset_Sum_Device(offset_Sum_Device)
+    {
+
+    }
+  };
+
+  struct SharedStorage {
+    // Intentionally empty: apply() receives this object but never reads or writes it.
+
+  };
+
+  //
+  // Params struct: a plain wrapper around a copy of Arguments
+  //
+
+  struct Params {
+    Arguments args;
+
+    //
+    // Methods
+    //
+    Params() { }
+
+    Params(Arguments const &args_): args(args_) { }
+  };
+
+private:
+
+public:
+
+  CUTLASS_DEVICE
+  ApplySoftmaxFinalReduction() { }
+
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+    // Kernel entry point: all work is delegated to apply().
+    apply(params, shared_storage);
+  }
+
+private:
+
+  /// Full reduction: merges the per-tile (max, sum) partials of each row into one max and one 1/sum
+  CUTLASS_DEVICE
+  void apply(Params const &params, SharedStorage &shared_storage) {
+
+    int tid = threadIdx.x;
+    int bid = blockIdx.x;
+    int bdim = blockDim.x;
+    
+    int block_batch = blockIdx.z;
+
+    // defining three vars for a general reduction module
+    cutlass::gemm::GemmCoord problem_size = isGroupedProblem ? params.args.problem_sizes[bid] : params.args.problem_size;
+    int m_dim_in_loop = isGroupedProblem ? problem_size.m() : tid + bdim;  // non-grouped: row loop runs once per thread
+    int access_offset = isGroupedProblem ? 0 : bid * bdim;                 // non-grouped: first row of this block
+    // Non-grouped: threads mapped past the last row have nothing to do.
+    if (!isGroupedProblem && access_offset + tid >= problem_size.m()) return;
+    // Base pointers: grouped mode adds a per-block offset, otherwise a per-batch stride.
+    ElementNorm *curr_ptr_Max = isGroupedProblem ? \
+              params.args.block_Norm + params.args.offset_Norm_Device[bid] : \
+              params.args.block_Norm + block_batch * params.args.batch_stride_Max;
+    ElementSum *curr_ptr_Sum = isGroupedProblem ? \
+              params.args.block_Sum + params.args.offset_Sum_Device[bid] : \
+              params.args.block_Sum + block_batch * params.args.batch_stride_Sum;
+    // Number of partials per row: ceil(N / ThreadblockShape::kN).
+    int threadblock_num = (problem_size.n() + ThreadblockShape::kN - 1) / ThreadblockShape::kN;
+    // Converters from the compute type back to the storage types.
+    using ConvertSumOutput = cutlass::NumericConverter<ElementSum, ElementSoftmaxCompute>;
+    using ConvertNormOutput = cutlass::NumericConverter<ElementNorm, ElementSoftmaxCompute>;
+    // Converters from the storage types to the compute type.
+    using ConvertSum = cutlass::NumericConverter<ElementSoftmaxCompute, ElementSum>;
+    using ConvertNorm = cutlass::NumericConverter<ElementSoftmaxCompute, ElementNorm>;
+
+    ConvertSum   convert_sum;
+    ConvertNorm  convert_norm;
+
+    ConvertSumOutput   convert_sum_output;
+    ConvertNormOutput  convert_norm_output;
+    // 0xff7fffff is the IEEE-754 binary32 bit pattern of -FLT_MAX; it seeds the running maximum.
+    uint32_t float_max_bits = 0xff7fffff;
+    float min_float = reinterpret_cast<float const &>(float_max_bits);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx_m = tid; idx_m < m_dim_in_loop; idx_m += bdim) {
+      ElementNorm *access_n = curr_ptr_Max + idx_m + access_offset;
+      ElementSum *access_s = curr_ptr_Sum + idx_m + access_offset;
+      ElementNorm *access_n_bak = access_n;  // remembered so the pointers can be rewound
+      ElementSum *access_s_bak = access_s;
+      ElementSoftmaxCompute max_val = ElementSoftmaxCompute(min_float);
+      ElementSoftmaxCompute sum_val = ElementSoftmaxCompute(0);
+      ElementNorm fetch_n;
+      ElementSum fetch_s;
+      // Pass 1: maximum over this row's partial maxima (consecutive partials are problem_size.m() apart).
+      CUTLASS_PRAGMA_UNROLL
+      for (int idx_n = 0; idx_n < threadblock_num; idx_n++) {
+        cutlass::arch::global_load<ElementNorm, sizeof(ElementNorm)>(fetch_n, access_n, true);
+        max_val = cutlass::fast_max(max_val, convert_norm(fetch_n));
+        access_n += problem_size.m();
+      }
+
+      access_n = access_n_bak;
+      // Pass 2: scale each partial sum by exp(partial_max - max_val) and accumulate.
+      CUTLASS_PRAGMA_UNROLL
+      for (int idx_n = 0; idx_n < threadblock_num; idx_n++) {
+        cutlass::arch::global_load<ElementNorm, sizeof(ElementNorm)>(fetch_n, access_n, true);
+        cutlass::arch::global_load<ElementSum, sizeof(ElementSum)>(fetch_s, access_s, true);
+        sum_val += convert_sum(fetch_s) * cutlass::fast_exp(convert_norm(fetch_n) - max_val);
+        access_n += problem_size.m();
+        access_s += problem_size.m();
+      }
+      // The value stored in the sum buffer is the reciprocal of the accumulated sum.
+      ElementSoftmaxCompute inv_sum = cutlass::constants::one<ElementSoftmaxCompute>() / sum_val;
+
+      access_n = access_n_bak;
+      access_s = access_s_bak;
+      // Overwrite the first partial's slots with the final results.
+      access_n[0] = convert_norm_output(max_val);
+      access_s[0] = convert_sum_output(inv_sum);
+    }
+
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace reduction
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/reduction/kernel/reduce_split_k.h b/lightllm-kernel/cutlass/include/cutlass/reduction/kernel/reduce_split_k.h
new file mode 100755
index 000000000..d9c701396
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/reduction/kernel/reduce_split_k.h
@@ -0,0 +1,248 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Kernel performing a reduction over densely packed tensors in global memory
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/functional.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_conversion.h"
+
+#include "cutlass/layout/matrix.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace reduction {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Shape_,              ///< shape of CTA        (concept: MatrixShape)
+  typename OutputOp_ ,          ///< output operator     (concept: epilogue::thread operator)
+  typename ReductionOp_,        ///< reduction operator  (concept: ReductionOperator)
+  int PartitionsPerStage = 4    ///< number of partitions to issue 
+>
+class ReduceSplitK {
+public:
+  // Each thread combines `partitions` workspace fragments with ReductionOp, then applies OutputOp and stores.
+  using Shape = Shape_;
+  using ReductionOp = ReductionOp_;
+  using OutputOp = OutputOp_;
+  static int const kElementsPerAccess = OutputOp::kCount;     // elements handled per thread
+  static int const kPartitionsPerStage = PartitionsPerStage;  // loads issued before reducing
+
+  using ElementWorkspace = typename ReductionOp::Element;
+  using ElementAccumulator = typename ReductionOp::ElementAccumulator;
+  using ElementOutput = typename OutputOp::ElementOutput;
+
+  using WorkspaceTensorRef = TensorRef<ElementWorkspace, layout::RowMajor>;
+  using OutputTensorRef = TensorRef<ElementOutput, layout::RowMajor>;
+  using StrideIndex = typename WorkspaceTensorRef::Layout::Stride::Index;
+
+  using FragmentWorkspace = AlignedArray<ElementWorkspace, kElementsPerAccess>;
+  using FragmentAccumulator = Array<ElementAccumulator, kElementsPerAccess>;
+  using FragmentOutput = AlignedArray<ElementOutput, kElementsPerAccess>;
+
+  //
+  // Types
+  //
+
+  /// Params structure
+  struct Params {
+
+    MatrixCoord problem_size;                 ///< extent (rows, columns) checked by the bounds guard
+    int partitions;                           ///< number of workspace slices to combine
+    size_t partition_stride;                  ///< distance between slices in BYTES (advances a char pointer)
+    WorkspaceTensorRef workspace;             ///< first slice of the partial results
+    OutputTensorRef destination;              ///< where the final fragment is stored
+    OutputTensorRef source;                   ///< read only when output_op.is_source_needed()
+    typename OutputOp::Params output;
+    typename ReductionOp::Params reduction;
+
+    //
+    // Methods
+    //
+
+    CUTLASS_HOST_DEVICE
+    Params() { }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      MatrixCoord problem_size_,
+      int partitions_,
+      size_t partition_stride_,
+      WorkspaceTensorRef workspace_,
+      OutputTensorRef destination_,
+      OutputTensorRef source_,
+      typename OutputOp::Params output_ = typename OutputOp::Params(),
+      typename ReductionOp::Params reduction_ = typename ReductionOp::Params()
+    ):
+      problem_size(problem_size_),
+      partitions(partitions_),
+      partition_stride(sizeof(FragmentWorkspace) * partition_stride_ / kElementsPerAccess),
+      workspace(workspace_),
+      destination(destination_),
+      source(source_),
+      output(output_),
+      reduction(reduction_) {
+      // partition_stride_ is presumably given in elements (TODO confirm); it is scaled to bytes above.
+    }
+  };
+
+  struct SharedStorage { };
+
+
+public:
+
+  /// Computes the grid size given a chosen threadblock shape
+  CUTLASS_HOST_DEVICE
+  static dim3 grid_shape(
+    cutlass::MatrixCoord problem_size) {
+    // grid.x tiles the rows and grid.y tiles the columns, both rounded up.
+    return dim3(
+      (problem_size.row() + Shape::kRow - 1) / Shape::kRow,
+      (problem_size.column() + Shape::kColumn - 1) / Shape::kColumn);
+  }
+
+  /// Determines the threadblock shape
+  CUTLASS_HOST_DEVICE
+  static dim3 block_shape() {
+    return dim3(Shape::kColumn / kElementsPerAccess, Shape::kRow);  // x: column vectors, y: rows
+  }
+
+  /// Perform a reduction
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &storage) {
+
+    // Determine this thread's (row, column) position in the problem
+    MatrixCoord thread_offset(
+      MatrixCoord::Index(int(blockIdx.x) * Shape::kRow + threadIdx.y),
+      MatrixCoord::Index(int(blockIdx.y) * Shape::kColumn + threadIdx.x * kElementsPerAccess)
+    );
+
+    // One guard conditional
+    if (!(thread_offset.row() < params.problem_size.row() && 
+          thread_offset.column() < params.problem_size.column())) {
+
+      return;
+    }
+
+
+    ReductionOp reduction_op(params.reduction);
+
+    FragmentAccumulator accumulator;
+
+    accumulator.clear();  
+    
+    //
+    // Locate this thread's fragment in the first slice (no load is issued yet)
+    //
+
+    char const *workspace_ptr = 
+      reinterpret_cast<char const *>(
+        params.workspace.data() + params.workspace.offset(thread_offset));
+
+    FragmentWorkspace workspace_frag[kPartitionsPerStage];
+    
+    //
+    // Construct the output operator
+    //
+    
+    OutputOp output_op(params.output);
+
+    //
+    // Load and accumulate with a simple batched loading sequence.
+    //
+
+    CUTLASS_PRAGMA_NO_UNROLL
+    for (int k = 0; k < params.partitions; k += kPartitionsPerStage) {
+      // Stage 1: issue up to kPartitionsPerStage loads, stepping one slice (in bytes) per load.
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kPartitionsPerStage; ++i) {
+        if (k + i < params.partitions) {
+          workspace_frag[i] = *reinterpret_cast<FragmentWorkspace const *>(workspace_ptr);
+          workspace_ptr += params.partition_stride;
+        }
+      }   
+      // Stage 2: fold the fragments that were actually loaded into the accumulator.
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kPartitionsPerStage; ++i) {
+        if (k + i < params.partitions) {
+          accumulator = reduction_op(accumulator, workspace_frag[i]);
+        }
+      }
+    }
+
+    //
+    // Conditionally load the source
+    //
+
+    FragmentOutput source_frag;
+
+    source_frag.clear();
+    // The address is always computed, but it is dereferenced only when the output operator needs it.
+    FragmentOutput const *source_ptr = reinterpret_cast<FragmentOutput const *>(
+      params.source.data() + params.source.offset(thread_offset));
+
+    if (output_op.is_source_needed()) {
+      reinterpret_cast<FragmentOutput &>(source_frag) = *source_ptr;
+    }
+    
+    //
+    // Compute the output
+    //
+
+    typename OutputOp::FragmentOutput output_frag = output_op(accumulator, source_frag);
+
+    //
+    // Store
+    //
+
+    FragmentOutput *dest_ptr = reinterpret_cast<FragmentOutput *>(
+      params.destination.data() + params.destination.offset(thread_offset));
+
+    *dest_ptr = reinterpret_cast<FragmentOutput const &>(output_frag);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace reduction
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/reduction/kernel/tensor_reduce_affine_contiguous.h b/lightllm-kernel/cutlass/include/cutlass/reduction/kernel/tensor_reduce_affine_contiguous.h
new file mode 100755
index 000000000..bffc956f2
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/reduction/kernel/tensor_reduce_affine_contiguous.h
@@ -0,0 +1,606 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Kernel performing a reduction over one or more ranks of an affine tensor
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/device_kernel.h"
+
+#include "cutlass/reduction/thread/reduction_operators.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace reduction {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Parameters structure: extents, byte strides, index-space sizes and pointers used by the reduction kernel
+template <
+  int Rank,                                   ///< Rank of source tensor (e.g. NDHWC => 5)
+  int ReducedRank,                            ///< Rank of reduced tensor (i.e. number of outer ranks)
+  typename ElementOutput,                     ///< Data type of output tensor
+  typename ElementSource,                     ///< Data type of source tensor
+  typename ReductionOp,                       ///< Reduction operator
+  int VectorLength  = 1,                      ///< Vector length for memory
+  typename ElementCompute = ElementOutput,    ///< Internal compute type - input type of reduction operation
+  int Threads = 256,                          ///< Number of participating threads
+  int BatchSize = 4                           ///< Number of elements to load per batch
+>
+struct TensorReductionAffineContiguousParams {
+
+  static int const kRank = Rank;
+  static int const kReducedRank = ReducedRank;
+  static int const kVectorLength = VectorLength;
+  static int const kInnerRank = kRank - kReducedRank;  /// Number of trailing ranks that are reduced away
+  static int const kThreads = Threads;
+  static int const kBatchSize = BatchSize;
+
+  Coord<kRank> extent;                          /// Extent of source tensor
+  FastDivmodU64 divmod[kRank - 1];              /// FastDivmod by each strided rank
+  int64_t dst_stride[kReducedRank];             /// stride (units of bytes) - I, J
+  int64_t src_stride[kRank - 1];                /// stride (units of bytes) - I, J, K
+  int64_t workspace_stride;                     /// stride (units of bytes) between workspace
+  int workspace_count;                          /// number of workspaces
+  
+  uint64_t inner_count;                          /// Number of elements in reduced index space
+  uint64_t outer_count;                          /// Number of elements in outer index space
+
+  ElementOutput * destination;                  /// Pointer to output tensor of rank kReducedRank
+  ElementSource const * source;                 /// Pointer to source tensor of rank kRank
+  ReductionOp reduction_op;                     /// Reduction operator
+  ElementCompute reduction_identity;            /// Identity element used by reduction operator
+  ElementCompute *device_workspace;             /// Pointer to device workspace for inter-CTA reductions
+
+  //
+  // Methods
+  //
+
+  /// Default ctor: performs no explicit member initialization
+  CUTLASS_HOST_DEVICE
+  TensorReductionAffineContiguousParams() {
+
+  }
+
+  /// Ctor (not marked CUTLASS_HOST_DEVICE): converts element strides to bytes and precomputes divisors and counts
+  TensorReductionAffineContiguousParams(
+    Coord<kRank> extent_,                       ///< Extent of source tensor
+    ElementOutput * dst_ptr_,                   ///< Output tensor data
+    int64_t dst_stride_[],                      ///< Stride (units of elements); kReducedRank entries are read
+    ElementSource const * src_ptr_,             ///< Source tensor data
+    int64_t src_stride_[],                      ///< Stride (units of elements); kRank - 1 entries are read
+    ElementCompute *device_workspace_,          ///< Pointer to device workspace for inter-CTA reductions
+    int64_t workspace_stride_,                  ///< Stride between workspaces
+    int workspace_count_,                       ///< Number of workspaces
+    ReductionOp reduction_op_,                  ///< Reduction operator
+    ElementCompute reduction_identity_ = ElementCompute() ///< Identity element used by reduction operator
+  ):
+    extent(extent_),
+    inner_count(1),
+    outer_count(1),
+    destination(dst_ptr_),
+    source(src_ptr_),
+    device_workspace(device_workspace_),
+    workspace_stride(workspace_stride_),
+    workspace_count(workspace_count_),
+    reduction_op(reduction_op_),
+    reduction_identity(reduction_identity_) {
+    // NOTE(review): initializers are listed out of declaration order; compilers may warn (-Wreorder).
+    // Initialize divisors for fast div-mod
+    for (int p = 1; p < kRank; ++p) {
+      divmod[p - 1] = FastDivmodU64(uint64_t(extent[p]));
+    }
+
+    int input_size_bits = sizeof_bits<ElementSource>::value;
+    int output_size_bits = sizeof_bits<ElementOutput>::value;
+
+    // Compute strides in units of bytes
+    for (int p = 0; p < kReducedRank; ++p) {
+      dst_stride[p] = dst_stride_[p] * output_size_bits / 8;
+    }  
+
+    for (int p = 0; p < kRank - 1; ++p) {
+      src_stride[p] = src_stride_[p] * input_size_bits / 8;
+    }
+
+    // outer_count is the product of the first kReducedRank extents
+    for (int p = 0; p < kReducedRank; ++p) {
+      outer_count *= uint64_t(extent[p]);
+    }
+    // inner_count is the product of the last kInnerRank extents
+    for (int p = 0; p < kInnerRank; ++p) {
+      inner_count *= uint64_t(extent[kRank - 1 - p]);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Kernel to reduce a tensor with affine layout over a set of ranks *INCLUDING* the contiguous
+/// rank. This leads to favorable vectorized memory accesses over the contiguous rank.
+template <
+  int Rank,                                   ///< Rank of source tensor (e.g. NDHWC => 5)
+  int ReducedRank,                            ///< Rank of reduced tensor (includes contiguous, e.g. NC => 2)
+  typename ElementOutput,                     ///< Data type of output tensor
+  typename ElementSource,                     ///< Data type of source tensor
+  typename ReductionOp,                       ///< Reduction operator
+  int VectorLength  = 1,                      ///< Vector length for memory
+  typename ElementCompute = ElementOutput,    ///< Internal compute type - input type of reduction operation
+  int Threads = 256,                          ///< Number of participating threads
+  int BatchSize = 4                           ///< Number of elements to load per batch
+>
+class TensorReductionAffineContiguous {
+public:
+
+  static int const kRank = Rank;
+  static int const kReducedRank = ReducedRank;
+  static int const kVectorLength = VectorLength;
+  static int const kInnerRank = kRank - kReducedRank;
+  static int const kThreads = Threads;
+  static int const kBatchSize = BatchSize;
+  using ComputeFragment = Array<ElementCompute, VectorLength>;
+  using SourceFragment = AlignedArray<ElementSource, VectorLength>;
+  using OutputFragment = AlignedArray<ElementOutput, VectorLength>;
+
+  /// Shared memory allocation used for reduction within the CTA
+  struct SharedStorage {
+    Array<ElementCompute, kThreads * kVectorLength> workspace;
+  };
+
+  /// Parameters structure
+  using Params = TensorReductionAffineContiguousParams<
+    Rank,
+    ReducedRank,
+    ElementOutput,
+    ElementSource,
+    ReductionOp,
+    VectorLength,
+    ElementCompute,
+    Threads,
+    BatchSize
+  >;
+
+private:
+
+  /// Computes the coordinate and offset of a given linear index
+  CUTLASS_DEVICE
+  void compute_inner_coord_and_offset_(
+    Params const &params, 
+    Coord<kInnerRank> & coord, 
+    int64_t &src_offset,
+    uint64_t linear_idx) const {
+
+    // Decompose into a coordinate of rank <kInnerRank>
+    coord = CoordinateDecomposition<kInnerRank>(linear_idx, &params.divmod[kRank - kInnerRank]);
+
+    // Compute an offset using the source stride
+    src_offset = 0;
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kInnerRank - 1; ++i) {
+      src_offset += coord[i] * params.src_stride[kReducedRank + i];
+    }
+    src_offset += coord[kInnerRank - 1] * sizeof_bits<ElementSource>::value / 8;
+  }
+
+  /// Computes the coordinate and offset of a given linear index
+  CUTLASS_DEVICE
+  void compute_outer_coord_and_offset_(
+    Params const &params, 
+    Coord<kReducedRank> & coord, 
+    int64_t &dst_offset,
+    int64_t &src_offset,
+    uint64_t linear_idx) const {
+
+    // Decompose into coordinate of rank <kReducedRank>
+    coord = CoordinateDecomposition<kReducedRank>(linear_idx, params.divmod);
+
+    // Compute offsets using destination and source strides
+    dst_offset = 0;
+    src_offset = 0;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kReducedRank; ++i) {
+      dst_offset += params.dst_stride[i] * coord[i];
+      src_offset += params.src_stride[i] * coord[i];
+    }
+  }
+
+  /// Reduces over the reduction indices yielding a single element
+  CUTLASS_DEVICE
+  ElementCompute reduce_indices_(
+    Params const &params,
+    ElementCompute *threadblock_workspace,
+    char const *src_byte_ptr,
+    int coord_c) {
+
+    NumericArrayConverter<ElementCompute, ElementSource, VectorLength> convert_source;
+    ReductionOp reduction_op(params.reduction_op);
+
+    //
+    // Early exit or initialize to identity element
+    //
+    if (!params.inner_count) {
+      return params.reduction_identity;
+    }
+
+    ComputeFragment accumulator;
+    
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < int(accumulator.size()); ++i) {
+      accumulator[i] = params.reduction_identity;
+    }
+    
+    // Compute the coordinate of the first access    
+    int64_t src_byte_offset = 0;
+    Coord<kInnerRank> coord; 
+
+    uint64_t linear_idx = (threadIdx.x + blockDim.x * threadIdx.z + blockDim.x * blockIdx.z * blockDim.z) * kVectorLength;
+    compute_inner_coord_and_offset_(params, coord, src_byte_offset, linear_idx);
+
+    // Load the first vector
+    SourceFragment source_fragment[kBatchSize];
+    
+    bool not_done = true;
+
+    // Iterate over vectors in a linearized reduction index space
+    while (not_done) {
+
+      bool guards[kBatchSize];
+
+      // Issue a batch of loads
+      CUTLASS_PRAGMA_UNROLL
+      for (int b = 0; b < kBatchSize; ++b) {
+
+        if (linear_idx < params.inner_count) {
+          source_fragment[b] = *reinterpret_cast<SourceFragment const *>(src_byte_ptr + src_byte_offset);
+          guards[b] = true;
+        }
+        else {
+          guards[b] = false;
+          not_done = false;
+        }
+
+        linear_idx += (blockDim.z * gridDim.z * blockDim.x) * kVectorLength;
+        compute_inner_coord_and_offset_(params, coord, src_byte_offset, linear_idx);
+      }
+
+      // Perform a batch of reduction operations
+      CUTLASS_PRAGMA_UNROLL
+      for (int b = 0; b < kBatchSize; ++b) {
+        if (guards[b]) {
+          auto cvt = convert_source(source_fragment[b]);
+
+          accumulator = cutlass::reduction::thread::detail::ApplyArrayOperator(
+            reduction_op, 
+            accumulator, 
+            cvt);
+        }
+      }
+    };
+
+    //
+    // Reduction of vectors to scalar
+    //
+
+    ElementCompute reduced_accumulator = accumulator[0];
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 1; i < kVectorLength; ++i) {
+      reduced_accumulator = reduction_op(reduced_accumulator, accumulator[i]);
+    }
+
+    //
+    // Reduction within CTA across threadIdx.xz => threadIdx{.x = 0, .z = 0}
+    //
+    // This re-arranges data so threadIdx.y is effectively a row index and threadIdx.xz is a column
+    //
+
+    int thread_count = blockDim.x * blockDim.z;
+    int thread_j = threadIdx.x + blockDim.x * threadIdx.z;
+    int thread_i = threadIdx.y;
+
+    ElementCompute *frag_ptr = reinterpret_cast<ElementCompute *>(threadblock_workspace) + thread_i * thread_count;
+
+    frag_ptr[thread_j] = reduced_accumulator;
+
+    //
+    // Reduce
+    //
+    CUTLASS_PRAGMA_NO_UNROLL
+    while (thread_count > 1) {
+      thread_count /= 2;
+
+      __syncthreads();
+
+      if (thread_j < thread_count) {
+        ElementCompute other = frag_ptr[thread_j + thread_count];
+
+        reduced_accumulator = reduction_op(reduced_accumulator, other);
+
+        frag_ptr[thread_j] = reduced_accumulator;
+      }
+
+      __syncthreads();
+    }
+
+
+    return reduced_accumulator;
+  }
+
+public:
+
+  /// Perform a reduction: for every outer index assigned to this thread, reduce the inner ranks with reduce_indices_() and store the result
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+    // params: pointers, strides and counts read below; shared_storage.workspace: scratch buffer handed to reduce_indices_()
+    int coord_c = (blockIdx.x * blockDim.x + threadIdx.x) * kVectorLength;  // forwarded as-is to reduce_indices_()
+
+    char const * src_byte_ptr = reinterpret_cast<char const *>(params.source);
+    char * dst_byte_ptr = nullptr;
+
+    // With a single CTA along z (gridDim.z == 1) write straight to the destination; otherwise write partial results to the device workspace
+    if (gridDim.z == 1) {
+      dst_byte_ptr = reinterpret_cast<char *>(params.destination);
+    }
+    else {
+      dst_byte_ptr = reinterpret_cast<char *>(params.device_workspace);
+    }
+
+    uint64_t idx_linear = blockIdx.y * blockDim.y + threadIdx.y;  // first outer index handled by this thread
+
+    // Resolve idx_linear into outer_coord plus destination/source byte offsets (helper is defined earlier in this class)
+    Coord<kReducedRank> outer_coord;
+    int64_t dst_byte_offset;
+    int64_t src_byte_offset;
+
+    compute_outer_coord_and_offset_(
+      params, 
+      outer_coord, 
+      dst_byte_offset, 
+      src_byte_offset, 
+      idx_linear);
+
+    if (gridDim.z == 1) {
+
+      /// Complete the reduction with no workspace: results are converted to ElementOutput and written to the destination
+      while (idx_linear < params.outer_count) {
+        // NOTE(review): the trip count depends on threadIdx.y, so threads of one CTA may leave this loop at different times while others still reach __syncthreads() - confirm this is intended
+        ElementCompute result = reduce_indices_(
+          params, 
+          shared_storage.workspace.data(),
+          src_byte_ptr + src_byte_offset,
+          coord_c);
+
+        // Only the thread with threadIdx.x == 0 and threadIdx.z == 0 stores the value returned by reduce_indices_()
+        if (threadIdx.z == 0 && threadIdx.x == 0) {
+
+          // Convert to output type and store
+          NumericConverter<ElementOutput, ElementCompute> convert_output;
+          ElementOutput cvt = convert_output(result);
+
+          *reinterpret_cast<ElementOutput *>(dst_byte_ptr + dst_byte_offset) = cvt;
+        }
+
+        __syncthreads();  // NOTE(review): presumably separates this iteration's use of the shared workspace from the next one - confirm
+
+        // Advance to the next outer index for this thread (step = gridDim.y * blockDim.y) and recompute the offsets
+        idx_linear += gridDim.y * blockDim.y;
+
+        compute_outer_coord_and_offset_(
+          params, 
+          outer_coord, 
+          dst_byte_offset, 
+          src_byte_offset, 
+          idx_linear);
+
+      } // while (idx_linear < params.outer_count), no-workspace path
+    }
+    else {
+
+      /// Complete the reduction with workspace: results stay in ElementCompute and are written to the device workspace
+      while (idx_linear < params.outer_count) {
+
+        ElementCompute result = reduce_indices_(
+          params, 
+          shared_storage.workspace.data(),
+          src_byte_ptr + src_byte_offset,
+          coord_c);
+        // Workspace slot: blockIdx.z selects a region params.workspace_stride bytes apart; idx_linear selects one ElementCompute inside it
+        int64_t byte_offset = 
+          blockIdx.z * params.workspace_stride + idx_linear * sizeof_bits<ElementCompute>::value / 8;
+
+        // Store the unconverted partial result for the final reduction (again only threadIdx.x == 0 && threadIdx.z == 0 writes)
+        if (threadIdx.z == 0 && threadIdx.x == 0) {
+          *reinterpret_cast<ElementCompute *>(dst_byte_ptr + byte_offset) = result;
+        }
+
+        __syncthreads();  // NOTE(review): presumably separates this iteration's use of the shared workspace from the next one - confirm
+
+        // Advance to the next outer index for this thread (step = gridDim.y * blockDim.y) and recompute the offsets
+        idx_linear += gridDim.y * blockDim.y;
+
+        compute_outer_coord_and_offset_(
+          params, 
+          outer_coord, 
+          dst_byte_offset, 
+          src_byte_offset, 
+          idx_linear);
+      } // while (idx_linear < params.outer_count), workspace path
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Kernel to perform final reduction: folds the partial results held in the device workspace and writes one converted value per outer index
+template <
+  int Rank,                                   ///< Rank of source tensor (e.g. NDHWC => 5)
+  int ReducedRank,                            ///< Rank of reduced tensor (includes contiguous, e.g. NC => 2)
+  typename ElementOutput,                     ///< Data type of output tensor
+  typename ElementSource,                     ///< Data type of source tensor
+  typename ReductionOp,                       ///< Reduction operator
+  int VectorLength  = 1,                      ///< Vector length for memory
+  typename ElementCompute = ElementOutput,    ///< Internal compute type - input type of reduction operation
+  int Threads = 256,                          ///< Number of participating threads
+  int BatchSize = 4                           ///< Number of elements to load per batch
+>
+class TensorReductionAffineContiguousFinal {
+public:
+
+  static int const kRank = Rank;
+  static int const kReducedRank = ReducedRank;
+  static int const kVectorLength = VectorLength;
+  static int const kInnerRank = kRank - kReducedRank;
+  static int const kThreads = Threads;
+  static int const kBatchSize = BatchSize;
+
+  /// Shared memory (empty: this kernel declares no shared-memory members)
+  struct SharedStorage { };
+
+  /// Parameters structure (same template arguments as this class, forwarded unchanged)
+  using Params = TensorReductionAffineContiguousParams<
+    Rank,
+    ReducedRank,
+    ElementOutput,
+    ElementSource,
+    ReductionOp,
+    VectorLength,
+    ElementCompute,
+    Threads,
+    BatchSize
+  >;
+
+private:
+
+  /// Decomposes linear_idx into a rank-kReducedRank coordinate (written to coord) and computes the matching destination offset (written to dst_offset)
+  CUTLASS_DEVICE
+  void compute_outer_coord_and_offset_(
+    Params const &params, 
+    Coord<kReducedRank> & coord, 
+    int64_t &dst_offset,
+    uint64_t linear_idx) const {
+
+    // Decompose into coordinate of rank <kReducedRank>
+    coord = CoordinateDecomposition<kReducedRank>(linear_idx, params.divmod);
+
+    // dst_offset = sum over i of params.dst_stride[i] * coord[i]; only destination strides are used here
+    dst_offset = 0;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kReducedRank; ++i) {
+      dst_offset += params.dst_stride[i] * coord[i];
+    }
+  }
+
+  /// Folds params.workspace_count items, read params.workspace_stride bytes apart starting at device_workspace, into one value and returns it
+  CUTLASS_DEVICE
+  ElementCompute reduce_indices_(
+    Params const &params,
+    ElementCompute const *device_workspace) {
+
+    ReductionOp reduction_op(params.reduction_op);
+    char const *src_byte_ptr = reinterpret_cast<char const *>(device_workspace);
+
+    // Accumulated output, seeded with the reduction identity
+    ElementCompute accumulator = params.reduction_identity;
+
+    for (int iter = 0; iter < params.workspace_count; ++iter) {
+      ElementCompute workspace_item = *reinterpret_cast<ElementCompute const *>(src_byte_ptr);
+      
+      accumulator = reduction_op(accumulator, workspace_item);
+
+      src_byte_ptr += params.workspace_stride;  // byte step to the same slot in the next workspace
+    }
+
+    return accumulator;
+  }
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Perform a reduction: each thread finalizes outer indices idx_linear, idx_linear + gridDim.x * blockDim.x, ... while below params.outer_count
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+    // shared_storage is not referenced in this body
+    uint64_t idx_linear = blockIdx.x * blockDim.x + threadIdx.x;
+
+    char * dst_byte_ptr = reinterpret_cast<char *>(params.destination);
+
+    // Resolve the first outer index into a coordinate and a destination byte offset
+    Coord<kReducedRank> outer_coord;
+    int64_t dst_byte_offset;
+
+    compute_outer_coord_and_offset_(
+      params, 
+      outer_coord, 
+      dst_byte_offset, 
+      idx_linear);
+
+    /// Complete the reduction
+    while (idx_linear < params.outer_count) {
+      // The partial results for this outer index start at element idx_linear of the device workspace
+      ElementCompute result = reduce_indices_(params, params.device_workspace + idx_linear);
+
+      // Convert to output type and store
+      NumericConverter<ElementOutput, ElementCompute> convert_output;
+
+      *reinterpret_cast<ElementOutput *>(dst_byte_ptr + dst_byte_offset) = convert_output(result);
+
+      // Advance to this thread's next outer index (step = gridDim.x * blockDim.x) and recompute the destination offset
+      idx_linear += gridDim.x * blockDim.x;
+
+      compute_outer_coord_and_offset_(
+        params, 
+        outer_coord, 
+        dst_byte_offset, 
+        idx_linear);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace reduction
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/reduction/kernel/tensor_reduce_affine_strided.h b/lightllm-kernel/cutlass/include/cutlass/reduction/kernel/tensor_reduce_affine_strided.h
new file mode 100755
index 000000000..0d449e687
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/reduction/kernel/tensor_reduce_affine_strided.h
@@ -0,0 +1,641 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Kernel performing a reduction over one or more ranks of an affine tensor
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/device_kernel.h"
+
+#include "cutlass/reduction/thread/reduction_operators.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace reduction {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace kernel {
+
+/// Parameters structure used by TensorReductionAffineStrided and TensorReductionAffineStridedFinal; the constructor precomputes divisors, byte strides and index-space sizes
+template <
+  int Rank,                                   ///< Rank of source tensor (e.g. NDHWC => 5)
+  int ReducedRank,                            ///< Rank of reduced tensor (includes contiguous, e.g. NC => 2)
+  typename ElementOutput,                     ///< Data type of output tensor
+  typename ElementSource,                     ///< Data type of source tensor
+  typename ReductionOp,                       ///< Reduction operator
+  int VectorLength  = 1,                      ///< Vector length for memory
+  typename ElementCompute = ElementOutput,    ///< Internal compute type - input type of reduction operation
+  int Threads = 256,                          ///< Number of participating threads
+  int BatchSize = 4                           ///< Number of elements to load per batch
+>
+struct TensorReductionAffineStridedParams {
+
+  static int const kRank = Rank;
+  static int const kReducedRank = ReducedRank;
+  static int const kVectorLength = VectorLength;
+  static int const kInnerRank = kRank - kReducedRank;
+  static int const kThreads = Threads;
+  static int const kBatchSize = BatchSize;
+
+  Coord<kRank> extent;                          /// Extent of source tensor
+  FastDivmodU64 divmod[kRank - 1];              /// FastDivmod by each strided rank (divmod[p - 1] divides by extent[p])
+  int64_t dst_stride[kReducedRank - 1];         /// stride (units of bytes) - I, J
+  int64_t src_stride[kRank - 1];                /// stride (units of bytes) - I, J, K
+  int64_t workspace_stride;                     /// stride (units of bytes) between workspace
+  int64_t workspace_outer_stride;               /// stride (units of bytes) between 'rows' of the workspace
+  int workspace_count;                          /// number of workspaces
+  
+  uint64_t inner_count;                          /// Number of elements in reduced index space
+  uint64_t outer_count;                          /// Number of elements in outer index space
+
+  ElementOutput * destination;                  /// Pointer to output tensor of rank kReducedRank
+  ElementSource const * source;                 /// Pointer to source tensor of rank kRank
+  ReductionOp reduction_op;                     /// Reduction operator
+  ElementCompute reduction_identity;            /// Identity element for reduction operator
+  ElementCompute *device_workspace;             /// Pointer to device workspace for inter-CTA reductions
+
+  //
+  // Methods
+  //
+
+  /// Default ctor: assigns no member explicitly
+  CUTLASS_HOST_DEVICE
+  TensorReductionAffineStridedParams() {
+
+  }
+
+  /// Ctor: stores the arguments, builds the divisors, converts element strides to byte strides and computes outer_count / inner_count
+  TensorReductionAffineStridedParams(
+    Coord<kRank> extent_,                       ///< Extent of source tensor
+    ElementOutput * dst_ptr_,                   ///< Output tensor data
+    int64_t dst_stride_[],                      ///< Stride (units of elements); the first kReducedRank - 1 entries are read
+    ElementSource const * src_ptr_,             ///< Source tensor data
+    int64_t src_stride_[],                      ///< Stride (units of elements); the first kRank - 1 entries are read
+    ElementCompute *device_workspace_,          ///< Pointer to device workspace for inter-CTA reductions
+    int64_t workspace_stride_,                  ///< Stride between workspaces
+    int workspace_count_,                       ///< Number of workspaces
+    ReductionOp reduction_op_,                  ///< Reduction operator
+    ElementCompute reduction_identity_  = ElementCompute() ///< Identity element for reduction operator
+  ):  // initializers are listed in member declaration order, which is the order C++ initializes them in
+    extent(extent_),
+    workspace_stride(workspace_stride_),
+    workspace_outer_stride(0),
+    workspace_count(workspace_count_),
+    inner_count(1),
+    outer_count(1),
+    destination(dst_ptr_),
+    source(src_ptr_),
+    reduction_op(reduction_op_),
+    reduction_identity(reduction_identity_),
+    device_workspace(device_workspace_) {
+
+    // Initialize divisors for fast div-mod: one per rank except rank 0
+    for (int p = 1; p < kRank; ++p) {
+      divmod[p - 1] = FastDivmodU64(uint64_t(extent[p]));
+    }
+
+    int input_size_bits = sizeof_bits<ElementSource>::value;
+    int output_size_bits = sizeof_bits<ElementOutput>::value;
+    // One workspace 'row' holds workspace_count workspaces, each workspace_stride bytes apart
+    workspace_outer_stride = workspace_stride * workspace_count;
+
+    // Compute strides in units of bytes (element stride * element size in bits / 8)
+    for (int p = 0; p < kReducedRank - 1; ++p) {
+      dst_stride[p] = dst_stride_[p] * output_size_bits / 8;
+    }  
+
+    for (int p = 0; p < kRank - 1; ++p) {
+      src_stride[p] = src_stride_[p] * input_size_bits / 8;
+    }
+
+    // outer_count = product of extent[0 .. kReducedRank - 2]
+    for (int p = 0; p < kReducedRank - 1; ++p) {
+      outer_count *= uint64_t(extent[p]);
+    }
+    // inner_count = product of extent[kReducedRank - 1 .. kRank - 2]
+    for (int p = 0; p < kInnerRank; ++p) {
+      inner_count *= uint64_t(extent[kReducedRank + p - 1]);
+    }
+  }
+};
+
+/// Kernel to reduce a tensor with affine layout over a set of ranks *EXCLUDING* the contiguous
+/// rank. This leads to favorable vectorized memory accesses over the contiguous rank.
+template <
+  int Rank,                                   ///< Rank of source tensor (e.g. NDHWC => 5)
+  int ReducedRank,                            ///< Rank of reduced tensor (includes contiguous, e.g. NC => 2)
+  typename ElementOutput,                     ///< Data type of output tensor
+  typename ElementSource,                     ///< Data type of source tensor
+  typename ReductionOp,                       ///< Reduction operator
+  int VectorLength  = 1,                      ///< Vector length for memory
+  typename ElementCompute = ElementOutput,    ///< Internal compute type - input type of reduction operation
+  int Threads = 256,                          ///< Number of participating threads
+  int BatchSize = 4                           ///< Number of elements to load per batch
+>
+class TensorReductionAffineStrided {
+public:
+
+  static int const kRank = Rank;
+  static int const kReducedRank = ReducedRank;
+  static int const kVectorLength = VectorLength;
+  static int const kInnerRank = kRank - kReducedRank;
+  static int const kThreads = Threads;
+  static int const kBatchSize = BatchSize;
+  using ComputeFragment = Array<ElementCompute, VectorLength>;        // accumulator vector in the compute type
+  using SourceFragment = AlignedArray<ElementSource, VectorLength>;   // vector loaded from the source tensor
+  using OutputFragment = AlignedArray<ElementOutput, VectorLength>;   // vector stored to the destination tensor
+
+  /// Shared memory allocation used for reduction within the CTA
+  struct SharedStorage {
+    Array<ElementCompute, kThreads * kVectorLength> workspace;  // sized for kThreads fragments of kVectorLength elements
+  };
+
+  /// Parameters structure
+  using Params = TensorReductionAffineStridedParams<
+    Rank,
+    ReducedRank,
+    ElementOutput,
+    ElementSource,
+    ReductionOp,
+    VectorLength,
+    ElementCompute,
+    Threads,
+    BatchSize
+  >;
+
+private:
+
+  /// Decomposes linear_idx over the kInnerRank inner ranks (written to coord) and computes the matching source byte offset (written to src_offset)
+  CUTLASS_DEVICE
+  void compute_inner_coord_and_offset_(
+    Params const &params, 
+    Coord<kInnerRank> & coord, 
+    int64_t &src_offset,
+    uint64_t linear_idx) const {
+
+    // Decompose into coordinate, using the divisors that start at index kReducedRank - 1
+    coord = CoordinateDecomposition<kInnerRank>(linear_idx, &params.divmod[kReducedRank - 1]);
+
+    // Compute linear offset: sum over i of params.src_stride[kReducedRank + i - 1] * coord[i]
+    src_offset = 0;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kInnerRank; ++i) {
+      src_offset += params.src_stride[kReducedRank + i - 1] * coord[i];
+    }
+  }
+
+  /// Decomposes linear_idx over the kReducedRank - 1 outer ranks (written to coord) and computes both destination and source byte offsets
+  CUTLASS_DEVICE
+  void compute_outer_coord_and_offset_(
+    Params const &params, 
+    Coord<kReducedRank - 1> & coord, 
+    int64_t &dst_offset,
+    int64_t &src_offset,
+    uint64_t linear_idx) const {
+
+    // Decompose linear coordinate
+    coord = CoordinateDecomposition<kReducedRank - 1>(linear_idx, params.divmod);
+
+    // Compute offset into tensors: each is the sum of stride[i] * coord[i] over the outer ranks
+    dst_offset = 0;
+    src_offset = 0;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kReducedRank - 1; ++i) {
+      dst_offset += params.dst_stride[i] * coord[i];
+      src_offset += params.src_stride[i] * coord[i];
+    }
+  }
+
+  /// Reduces over the reduction indices starting at src_byte_ptr and returns a fragment; when blockDim.z > 1 only threads with threadIdx.z == 0 fold in the other z-slices
+  CUTLASS_DEVICE
+  ComputeFragment reduce_indices_(
+    Params const &params,
+    ElementCompute *threadblock_workspace,
+    char const *src_byte_ptr) {
+
+    NumericArrayConverter<ElementCompute, ElementSource, VectorLength> convert_source;
+    ReductionOp reduction_op(params.reduction_op);
+
+    // Fragment with every element set to the reduction identity
+    ComputeFragment identity_frag;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < int(identity_frag.size()); ++i) {
+      identity_frag[i] = params.reduction_identity;
+    }
+    // Empty reduction space: return the identity fragment
+    if (!params.inner_count) {
+      return identity_frag;
+    }
+    
+    ComputeFragment accumulator = identity_frag;
+
+    // Compute the coordinate of the first access    
+    int64_t src_byte_offset = 0;
+    Coord<kInnerRank> coord; 
+
+    uint64_t linear_idx = threadIdx.z + blockIdx.z * blockDim.z;  // first inner index for this thread; advanced by blockDim.z * gridDim.z per load slot
+    compute_inner_coord_and_offset_(params, coord, src_byte_offset, linear_idx);
+
+    // Local staging array holding one batch of kBatchSize loaded vectors
+    SourceFragment source_fragment[kBatchSize];
+    
+    bool not_done = true;
+
+    // Iterate over vectors in a linearized reduction index space
+    while (not_done) {
+
+      bool guards[kBatchSize];  // guards[b] records whether slot b was actually loaded in this batch
+
+      // Issue a batch of loads
+      CUTLASS_PRAGMA_UNROLL
+      for (int b = 0; b < kBatchSize; ++b) {
+
+        if (linear_idx < params.inner_count) {
+          source_fragment[b] = *reinterpret_cast<SourceFragment const *>(src_byte_ptr + src_byte_offset);
+          guards[b] = true;
+        }
+        else {
+          guards[b] = false;
+          not_done = false;  // index ran past inner_count: this is the last batch
+        }
+        // The index and offset are advanced for every slot, whether or not it was loaded
+        linear_idx += blockDim.z * gridDim.z;
+        compute_inner_coord_and_offset_(params, coord, src_byte_offset, linear_idx);
+      }
+
+      // Perform a batch of reduction operations on the slots that were loaded
+      CUTLASS_PRAGMA_UNROLL
+      for (int b = 0; b < kBatchSize; ++b) {
+        if (guards[b]) {
+
+          auto cvt = convert_source(source_fragment[b]);
+
+          accumulator = cutlass::reduction::thread::detail::ApplyArrayOperator(
+            reduction_op,
+             accumulator, 
+             cvt);
+        }
+      }
+    };
+
+    // Optional reduction within a CTA
+    if (blockDim.z > 1) {
+
+      // Linearized thread ID
+      int thread_idx = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
+
+      // all threads store to workspace
+      ComputeFragment *frag_ptr = reinterpret_cast<ComputeFragment *>(threadblock_workspace);
+
+      frag_ptr[thread_idx] = accumulator;
+
+      __syncthreads();
+
+      if (threadIdx.z == 0) {
+        // Fold in the fragments stored by the threads with the same (x, y) and z = 1 .. blockDim.z - 1
+        for (int z = 1; z < blockDim.z; ++z) {
+          ComputeFragment frag = frag_ptr[thread_idx + z * blockDim.x * blockDim.y];
+
+          accumulator = cutlass::reduction::thread::detail::ApplyArrayOperator(
+            reduction_op, 
+            accumulator, 
+            frag);
+        } 
+      }
+
+      __syncthreads();
+    }
+
+    return accumulator;
+  }
+
+public:
+
+  /// Perform a reduction: for every outer index assigned to this thread, reduce the inner ranks with reduce_indices_() and store one fragment
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+    // First contiguous-rank element handled by this thread; all three base pointers below are offset by it
+    int coord_c = (blockIdx.x * blockDim.x + threadIdx.x) * kVectorLength;
+
+    char const * src_byte_ptr = reinterpret_cast<char const *>(params.source + coord_c);
+    char * dst_byte_ptr = nullptr;
+
+    // With a single CTA along z (gridDim.z == 1) write straight to the destination; otherwise write partial results to the device workspace
+    if (gridDim.z == 1) {
+      dst_byte_ptr = reinterpret_cast<char *>(params.destination + coord_c);
+    }
+    else {
+      dst_byte_ptr = reinterpret_cast<char *>(params.device_workspace + coord_c);
+    }
+
+    // If the C index is out of bounds, exit
+    if (coord_c >= params.extent[kRank - 1]) {
+      return;
+    }
+
+    int64_t idx_linear = blockIdx.y * blockDim.y + threadIdx.y;  // first outer index handled by this thread
+
+    // Resolve the outer index into a coordinate plus destination/source byte offsets
+    Coord<kReducedRank - 1> outer_coord;
+    int64_t dst_byte_offset;
+    int64_t src_byte_offset;
+
+    compute_outer_coord_and_offset_(
+      params, 
+      outer_coord, 
+      dst_byte_offset, 
+      src_byte_offset, 
+      idx_linear);
+
+    if (gridDim.z == 1) {
+
+      /// Complete the reduction with no workspace
+      while (idx_linear < params.outer_count) {
+
+        ComputeFragment result;
+
+        result = reduce_indices_(
+          params, 
+          shared_storage.workspace.data(),
+          src_byte_ptr + src_byte_offset);
+
+        // Store the result after possible final reduction within the CTA (only threadIdx.z == 0 holds the combined value)
+        if (threadIdx.z == 0) {
+
+          // Convert to output type and store
+          NumericArrayConverter<ElementOutput, ElementCompute, VectorLength> convert_output;
+          auto cvt = convert_output(result);
+          // The converted fragment is reinterpreted as OutputFragment and written with a single assignment
+          *reinterpret_cast<OutputFragment *>(dst_byte_ptr + dst_byte_offset) = 
+            reinterpret_cast<OutputFragment const &>(cvt);
+        }
+
+        // Advance to this thread's next outer index (step = gridDim.y * blockDim.y) and recompute the offsets
+        idx_linear += gridDim.y * blockDim.y;
+
+        compute_outer_coord_and_offset_(
+          params, 
+          outer_coord, 
+          dst_byte_offset, 
+          src_byte_offset, 
+          idx_linear);
+
+      } // while (outer index), no-workspace path
+    }
+    else {
+
+      /// Complete the reduction with a device workspace
+      while (idx_linear < params.outer_count) {
+
+        ComputeFragment result;
+
+        result = reduce_indices_(
+          params, 
+          shared_storage.workspace.data(),
+          src_byte_ptr + src_byte_offset);
+
+        // Store the result after possible final reduction within the CTA
+        if (threadIdx.z == 0) {
+          // Workspace slot: blockIdx.z steps by workspace_stride bytes, the outer index steps by workspace_outer_stride bytes
+          int64_t byte_offset = 
+            blockIdx.z * params.workspace_stride + idx_linear * params.workspace_outer_stride;
+
+          // No conversion - store in compute type
+          *reinterpret_cast<ComputeFragment *>(dst_byte_ptr + byte_offset) = 
+            reinterpret_cast<ComputeFragment const &>(result);
+        }
+
+        // Advance to this thread's next outer index (step = gridDim.y * blockDim.y) and recompute the offsets
+        idx_linear += gridDim.y * blockDim.y;
+
+        compute_outer_coord_and_offset_(
+          params, 
+          outer_coord, 
+          dst_byte_offset, 
+          src_byte_offset, 
+          idx_linear);
+        
+      } // while (outer index), workspace path
+    } // if (gridDim.z == 1) / else
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Kernel to perform final reduction: folds the partial fragments held in the device workspace and writes one converted fragment per outer index
+template <
+  int Rank,                                   ///< Rank of source tensor (e.g. NDHWC => 5)
+  int ReducedRank,                            ///< Rank of reduced tensor (includes contiguous, e.g. NC => 2)
+  typename ElementOutput,                     ///< Data type of output tensor
+  typename ElementSource,                     ///< Data type of source tensor
+  typename ReductionOp,                       ///< Reduction operator
+  int VectorLength  = 1,                      ///< Vector length for memory
+  typename ElementCompute = ElementOutput,    ///< Internal compute type - input type of reduction operation
+  int Threads = 256,                          ///< Number of participating threads
+  int BatchSize = 4                           ///< Number of elements to load per batch
+>
+class TensorReductionAffineStridedFinal {
+public:
+
+  static int const kRank = Rank;
+  static int const kReducedRank = ReducedRank;
+  static int const kVectorLength = VectorLength;
+  static int const kInnerRank = kRank - kReducedRank;
+  static int const kThreads = Threads;
+  static int const kBatchSize = BatchSize;
+  using ComputeFragment = Array<ElementCompute, VectorLength>;        // accumulator / workspace vector in the compute type
+  using SourceFragment = AlignedArray<ElementSource, VectorLength>;   // declared for parity with the main kernel; not used in this class
+  using OutputFragment = AlignedArray<ElementOutput, VectorLength>;   // vector stored to the destination tensor
+
+  /// Shared memory (empty: this kernel declares no shared-memory members)
+  struct SharedStorage { };
+
+  /// Parameters structure
+  using Params = TensorReductionAffineStridedParams<
+    Rank,
+    ReducedRank,
+    ElementOutput,
+    ElementSource,
+    ReductionOp,
+    VectorLength,
+    ElementCompute,
+    Threads,
+    BatchSize
+  >;
+
+private:
+
+  /// Decomposes linear_idx over the kReducedRank - 1 outer ranks (written to coord) and computes the matching destination byte offset (written to dst_offset)
+  CUTLASS_DEVICE
+  void compute_outer_coord_and_offset_(
+    Params const &params, 
+    Coord<kReducedRank - 1> & coord, 
+    int64_t &dst_offset,
+    uint64_t linear_idx) const {
+
+    // Decompose linear index
+    coord = CoordinateDecomposition<kReducedRank - 1>(linear_idx, params.divmod);
+
+    // Compute tensor offset: sum over i of params.dst_stride[i] * coord[i]
+    dst_offset = 0;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kReducedRank - 1; ++i) {
+      dst_offset += params.dst_stride[i] * coord[i];
+    }
+  }
+
+  /// Folds params.workspace_count fragments, read params.workspace_stride bytes apart starting at src_byte_ptr, element-wise into one fragment and returns it
+  CUTLASS_DEVICE
+  ComputeFragment reduce_indices_(
+    Params const &params,
+    char *src_byte_ptr) {
+
+    ReductionOp reduction_op(params.reduction_op);
+
+    // Fragment with every element set to the reduction identity
+    ComputeFragment identity_frag;
+    
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < int(identity_frag.size()); ++i) {
+      identity_frag[i] = params.reduction_identity;
+    }
+
+    ComputeFragment accumulator = identity_frag;
+    ComputeFragment workspace_fragments[kBatchSize];
+
+    // Partially unrolled loop: kBatchSize workspace fragments are handled per iteration
+    for (int idx = 0; idx < params.workspace_count; idx += kBatchSize) {
+
+      // Issue a batch of loads; slots past workspace_count are filled with the identity instead of being read
+      CUTLASS_PRAGMA_UNROLL
+      for (int b = 0; b < kBatchSize; ++b) {
+        if (idx + b < params.workspace_count) {
+          workspace_fragments[b] = 
+            *reinterpret_cast<ComputeFragment *>(src_byte_ptr);  
+        }
+        else {
+          workspace_fragments[b] = identity_frag;
+        }
+        src_byte_ptr += params.workspace_stride;
+      }
+
+      // Perform a reduction, element by element, over every slot of the batch
+      CUTLASS_PRAGMA_UNROLL
+      for (int b = 0; b < kBatchSize; ++b) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < kVectorLength; ++i) {
+          accumulator[i] = reduction_op(accumulator[i], workspace_fragments[b][i]);
+        }
+      }
+    }
+
+    return accumulator;
+  }
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Perform a reduction: each thread finalizes outer indices idx_linear, idx_linear + gridDim.y * blockDim.y, ... while below params.outer_count
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+    // shared_storage is not referenced in this body; coord_c is the first contiguous-rank element handled by this thread
+    int coord_c = (blockIdx.x * blockDim.x + threadIdx.x) * kVectorLength;
+
+    char * src_byte_ptr = reinterpret_cast<char *>(params.device_workspace + coord_c);
+    char * dst_byte_ptr = reinterpret_cast<char *>(params.destination + coord_c);
+
+    // If the C index is out of bounds, exit
+    if (coord_c >= params.extent[kRank - 1]) {
+      return;
+    }
+
+    int64_t idx_linear = blockIdx.y * blockDim.y + threadIdx.y;  // first outer index handled by this thread
+
+    // Resolve the outer index into a coordinate and a destination byte offset
+    Coord<kReducedRank - 1> outer_coord;
+    int64_t dst_byte_offset;
+
+    compute_outer_coord_and_offset_(
+      params, 
+      outer_coord, 
+      dst_byte_offset, 
+      idx_linear);
+
+    /// Complete the reduction
+    while (idx_linear < params.outer_count) {
+      // The partial fragments for this outer index start workspace_outer_stride bytes per index into the workspace
+      int64_t src_byte_offset = idx_linear * params.workspace_outer_stride;
+
+      ComputeFragment result = reduce_indices_(
+        params, 
+        src_byte_ptr + src_byte_offset);
+
+      // Convert to output type and store
+      NumericArrayConverter<ElementOutput, ElementCompute, VectorLength> convert_output;
+      auto cvt = convert_output(result);
+      // The converted fragment is reinterpreted as OutputFragment and written with a single assignment
+      *reinterpret_cast<OutputFragment *>(dst_byte_ptr + dst_byte_offset) = 
+        reinterpret_cast<OutputFragment const &>(cvt);
+
+      // Advance to this thread's next outer index (step = gridDim.y * blockDim.y) and recompute the destination offset
+      idx_linear += gridDim.y * blockDim.y;
+
+      compute_outer_coord_and_offset_(
+        params, 
+        outer_coord, 
+        dst_byte_offset, 
+        idx_linear);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace reduction
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/reduction/thread/reduce.h b/lightllm-kernel/cutlass/include/cutlass/reduction/thread/reduce.h
new file mode 100755
index 000000000..d2551f977
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/reduction/thread/reduce.h
@@ -0,0 +1,234 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines basic thread level reduction with specializations for Array<T, N>.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/half.h"
+#include "cutlass/functional.h"
+
+namespace cutlass {
+namespace reduction {
+namespace thread {
+
+/// Structure to compute the thread level reduction
+template <typename Op, typename T>
+struct Reduce;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Scalar case of Reduce: combines two values of type T with the plus<T> functor.
+template <typename T>
+struct Reduce< plus<T>, T > {
+  /// Returns plus<T>()(lhs, rhs); lhs is taken by value, rhs by const reference.
+  CUTLASS_HOST_DEVICE
+  T operator()(T lhs, T const &rhs) const {
+    // Delegate the addition to a temporary plus<T> functor.
+    return plus<T>()(lhs, rhs);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Reduce specialization that sums all N elements of an Array<T, N>.
+template <typename T, int N>
+struct Reduce < plus<T>, Array<T, N>> {
+  /// Returns a one-element array holding in[0] + ... + in[N-1], accumulated
+  /// left to right starting from an accumulator reset with Array::clear().
+  CUTLASS_HOST_DEVICE
+  Array<T, 1> operator()(Array<T, N> const &in) const {
+    Reduce< plus<T>, T > add_scalar;
+    Array<T, 1> total;
+    total.clear();
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      total[0] = add_scalar(total[0], in[idx]);
+    }
+
+    return total;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization of Reduce that sums an Array<half_t, N> into a single half_t.
+template <int N>
+struct Reduce < plus<half_t>, Array<half_t, N> > {
+  /// Returns a one-element array holding the sum of the N elements of `input`.
+  CUTLASS_HOST_DEVICE
+  Array<half_t, 1> operator()(Array<half_t, N> const &input) const {
+
+    Array<half_t, 1> result;
+
+    // If there is only 1 element - there is nothing to reduce
+    if( N ==1 ){
+
+      result[0] = input.front();
+
+    } else {
+
+      #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)
+        // Device path for sm_60 and newer: add pairs with __hadd2, then fold both lanes.
+        __half result_d;
+        Array<half_t, 1> const *in_ptr_half = reinterpret_cast<Array<half_t, 1> const *>(&input);
+        Array<half_t, 2> const *in_ptr_half2 = reinterpret_cast<Array<half_t, 2> const *>(&input);
+        __half2 const *x_in_half2 = reinterpret_cast<__half2 const *>(in_ptr_half2);
+        // NOTE(review): reading through __half2 presumes `input` meets __half2 alignment - confirm.
+        // Set initial result = first half2, in case N==2
+        __half2 tmp_result = x_in_half2[0];
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 1; i < N/2; ++i) {
+
+          tmp_result = __hadd2(x_in_half2[i], tmp_result);
+
+        }
+        // Horizontal add of the two lanes of the running pair sum.
+        result_d = __hadd(__low2half(tmp_result), __high2half(tmp_result));
+
+        // One final step is needed for odd "N" (to add the (N-1)th element)
+        if( N%2 ){
+
+          __half last_element;
+          Array<half_t, 1> tmp_last;
+          Array<half_t, 1> *tmp_last_ptr = &tmp_last;
+          tmp_last_ptr[0] = in_ptr_half[N-1];
+          last_element = reinterpret_cast<__half  const &>(tmp_last);
+
+          result_d = __hadd(result_d, last_element);
+
+        }
+        // Copy the __half bit pattern back into the half_t result.
+        Array<half_t, 1> *result_ptr = &result;
+        *result_ptr = reinterpret_cast<Array<half_t, 1> &>(result_d);
+
+      #else
+        // Host (or pre-sm_60) path: accumulate one element at a time from a cleared result.
+        Reduce< plus<half_t>, half_t > scalar_reduce;
+        result.clear();
+
+        CUTLASS_PRAGMA_UNROLL
+        for (auto i = 0; i < N; ++i) {
+
+          result[0] = scalar_reduce(result[0], input[i]);
+
+        }
+
+      #endif
+    }
+
+    return result;
+
+  }
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization of Reduce that sums an AlignedArray<half_t, N> into a single half_t.
+template <int N>
+struct Reduce < plus<half_t>, AlignedArray<half_t, N> > {
+  /// Returns a one-element array holding the sum of the N elements of `input`.
+  CUTLASS_HOST_DEVICE
+  Array<half_t, 1> operator()(AlignedArray<half_t, N> const &input) const {
+
+    Array<half_t, 1> result;
+
+    // If there is only 1 element - there is nothing to reduce
+    if( N ==1 ){
+
+      result[0] = input.front();
+
+    } else {
+
+      #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)
+        // Device path for sm_60 and newer: add pairs with __hadd2, then fold both lanes.
+        __half result_d;
+        AlignedArray<half_t, 1> const *in_ptr_half = reinterpret_cast<AlignedArray<half_t, 1> const *>(&input);
+        AlignedArray<half_t, 2> const *in_ptr_half2 = reinterpret_cast<AlignedArray<half_t, 2> const *>(&input);
+        __half2 const *x_in_half2 = reinterpret_cast<__half2 const *>(in_ptr_half2);
+        // x_in_half2 views the same storage as consecutive __half2 pairs.
+        // Set initial result = first half2, in case N==2
+        __half2 tmp_result = x_in_half2[0];
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 1; i < N/2; ++i) {
+
+          tmp_result = __hadd2(x_in_half2[i], tmp_result);
+
+        }
+        // Horizontal add of the two lanes of the running pair sum.
+        result_d = __hadd(__low2half(tmp_result), __high2half(tmp_result));
+
+        // One final step is needed for odd "N" (to add the (N-1)th element)
+        if( N%2 ){
+
+          __half last_element;
+          AlignedArray<half_t, 1> tmp_last;
+          AlignedArray<half_t, 1> *tmp_last_ptr = &tmp_last;
+          tmp_last_ptr[0] = in_ptr_half[N-1];
+          last_element = reinterpret_cast<__half  const &>(tmp_last);
+
+          result_d = __hadd(result_d, last_element);
+
+        }
+        // Copy the __half bit pattern back into the half_t result.
+        Array<half_t, 1> *result_ptr = &result;
+        *result_ptr = reinterpret_cast<Array<half_t, 1> &>(result_d);
+
+      #else
+        // Host (or pre-sm_60) path: accumulate one element at a time from a cleared result.
+        Reduce< plus<half_t>, half_t > scalar_reduce;
+        result.clear();
+
+        CUTLASS_PRAGMA_UNROLL
+        for (auto i = 0; i < N; ++i) {
+
+          result[0] = scalar_reduce(result[0], input[i]);
+
+        }
+
+      #endif
+    }
+
+    return result;
+
+  }
+};
+}
+}
+}
diff --git a/lightllm-kernel/cutlass/include/cutlass/reduction/thread/reduction_operators.h b/lightllm-kernel/cutlass/include/cutlass/reduction/thread/reduction_operators.h
new file mode 100755
index 000000000..ba62c1b50
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/reduction/thread/reduction_operators.h
@@ -0,0 +1,235 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Thread-level reduction operators and helpers that apply them to Array fragments
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/functional.h"
+#include "cutlass/numeric_conversion.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace reduction {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Mixed-precision addition functor: converts a fragment of Element values to
+/// ElementAccumulator and adds it to an accumulator fragment of the same length.
+template <
+  typename ElementAccumulator_,
+  typename Element_,
+  int Count = 1
+>
+struct ReduceAdd {
+
+  //
+  // Type definitions
+  //
+
+  using ElementAccumulator = ElementAccumulator_;
+  using Element = Element_;
+  static int const kCount = Count;
+
+  using FragmentAccumulator = cutlass::Array<ElementAccumulator, kCount>;
+  using FragmentElement = cutlass::Array<Element, kCount>;
+
+  struct Params { };
+
+  //
+  // Data members
+  //
+
+  /// Parameters object (Params is an empty struct for this operator)
+  Params params;
+
+  //
+  // Methods
+  //
+
+  /// Constructs the functor from an optional Params value
+  CUTLASS_HOST_DEVICE
+  ReduceAdd(Params params_ = Params()): params(params_) { }
+
+  /// Returns plus<FragmentAccumulator> applied to accumulator and the converted element fragment
+  CUTLASS_HOST_DEVICE
+  FragmentAccumulator operator()(
+    FragmentAccumulator accumulator, 
+    FragmentElement element) const {
+
+    using Converter = NumericArrayConverter<
+      ElementAccumulator, 
+      Element, 
+      kCount, 
+      PreferredRoundingMode<ElementAccumulator, Element>::kRound>;
+
+    plus<FragmentAccumulator> add_fragments;
+    return add_fragments(accumulator, Converter()(element));
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+/// Applies a binary reduction operator element-wise to two Array<Element, N> operands.
+template <typename ReductionOp, typename Element, int N>
+struct VectorizeArrayOperation {
+
+  using ValueType = Array<Element, N>;
+
+  /// Returns an array whose i-th entry is reduction_op(lhs[i], rhs[i]).
+  CUTLASS_HOST_DEVICE
+  ValueType operator()(
+    ReductionOp const &reduction_op, 
+    ValueType const &lhs, 
+    ValueType const &rhs) const {
+    ValueType combined;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int lane = 0; lane < N; ++lane) {
+      combined[lane] = reduction_op(lhs[lane], rhs[lane]);
+    }
+
+    return combined;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Folds all N elements of an Array<Element, N> into one Element, left to right.
+template <typename ReductionOp, typename Element, int N>
+struct ReduceArrayOperation {
+  using ArrayType = Array<Element, N>;
+
+  /// Returns op(...op(op(array[0], array[1]), array[2])..., array[N-1]), or array[0] when N == 1.
+  CUTLASS_HOST_DEVICE
+  Element operator()(
+    ReductionOp const &reduction_op, 
+    ArrayType const &array) const {
+    // Seed with the first element so that N == 1 never reads array[1] out of bounds.
+    Element item = array[0];
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 1; i < N; ++i) {
+      item = reduction_op(item, array[i]);
+    }
+
+    return item;
+  }
+};
+
+/// Bit-packed AND reduction: yields 1 only when every one of the N bits is set.
+template <int N>
+struct ReduceArrayOperation<logical_and<uint1b_t>, uint1b_t, N> {
+  using ArrayType = Array<uint1b_t, N>;
+
+  CUTLASS_HOST_DEVICE
+  uint1b_t operator()(
+    logical_and<uint1b_t> const &reduction_op, 
+    ArrayType const &array) const {
+    // View the packed bits as bytes; presumes element i is bit (i % 8) of byte (i / 8) - confirm.
+    uint8_t const *ptr = reinterpret_cast<uint8_t const *>(&array);
+    bool all_set = true;
+    CUTLASS_PRAGMA_UNROLL
+    for (int byte = 0; byte < (N + 7) / 8; ++byte) {
+      // A trailing partial byte is compared only on the bits that belong to the array.
+      uint8_t mask = uint8_t(((byte + 1) * 8 <= N) ? 0xFF : ((1 << (N % 8)) - 1));
+      all_set = (all_set && ((ptr[byte] & mask) == mask));
+    }
+
+    return uint1b_t(all_set);
+  }
+};
+
+/// Bit-packed OR reduction: yields 1 when at least one of the N bits is set.
+template <int N>
+struct ReduceArrayOperation<logical_or<uint1b_t>, uint1b_t, N> {
+  using ArrayType = Array<uint1b_t, N>;
+
+  CUTLASS_HOST_DEVICE
+  uint1b_t operator()(
+    logical_or<uint1b_t> const &reduction_op, 
+    ArrayType const &array) const {
+    // View the packed bits as bytes; presumes element i is bit (i % 8) of byte (i / 8) - confirm.
+    uint8_t const *ptr = reinterpret_cast<uint8_t const *>(&array);
+    bool any_set = false;  // start from false so an all-zero array reduces to 0
+    CUTLASS_PRAGMA_UNROLL
+    for (int byte = 0; byte < (N + 7) / 8; ++byte) {
+      // A trailing partial byte is tested only on the bits that belong to the array.
+      uint8_t mask = uint8_t(((byte + 1) * 8 <= N) ? 0xFF : ((1 << (N % 8)) - 1));
+      any_set = (any_set || ((ptr[byte] & mask) != 0));
+    }
+
+    return uint1b_t(any_set);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Convenience wrapper that deduces ReductionOp, Element, and N from its arguments
+/// and applies reduction_op element-wise through VectorizeArrayOperation.
+template <typename ReductionOp, typename Element, int N>
+CUTLASS_HOST_DEVICE
+Array<Element, N> ApplyArrayOperator(
+  ReductionOp const &reduction_op,
+  Array<Element, N> const &lhs, 
+  Array<Element, N> const &rhs) {
+
+  using Vectorized = VectorizeArrayOperation<ReductionOp, Element, N>;
+  return Vectorized()(reduction_op, lhs, rhs);
+}
+
+/// Reduces an array to one element via ReduceArrayOperation; usable from host and device code.
+template <typename ReductionOp, typename Element, int N>
+CUTLASS_HOST_DEVICE
+Element ReduceArray(ReductionOp const &reduction_op, Array<Element, N> const &array) {
+  ReduceArrayOperation<ReductionOp, Element, N> reduce_array_op;
+  return reduce_array_op(reduction_op, array);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace detail
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace reduction
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/reduction/threadblock_swizzle.h b/lightllm-kernel/cutlass/include/cutlass/reduction/threadblock_swizzle.h
new file mode 100755
index 000000000..ffb35dada
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/reduction/threadblock_swizzle.h
@@ -0,0 +1,67 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+**************************************************************************************************/
+/*! \file
+\brief Defines functors for mapping blockIdx to partitions of the batched reduction computation.
+*/
+#pragma once
+#include "cutlass/coord.h"
+
+namespace cutlass {
+namespace reduction {
+struct DefaultBlockSwizzle {
+  /// Default constructor; this struct declares no data members.
+  CUTLASS_HOST_DEVICE DefaultBlockSwizzle() {}
+
+  /// Identity mapping: returns blockIdx unchanged.
+  CUTLASS_DEVICE dim3 swizzle() { return blockIdx; }
+
+  /// Returns a grid whose x extent is the product of problem_size divided by OutputTile[2].
+  CUTLASS_HOST_DEVICE dim3 get_grid_layout(Coord<3> const &problem_size,
+                                           Coord<3> const &OutputTile) {
+    assert(OutputTile[0] == 1 && OutputTile[1] == 1);
+    assert((problem_size[0] * problem_size[1] * problem_size[2]) % OutputTile[2] == 0);
+    int total = problem_size[0] * problem_size[1] * problem_size[2];
+    // Only x is assigned; y and z keep whatever dim3's default constructor sets.
+    dim3 layout;
+    layout.x = total / OutputTile[2];
+    return layout;
+  }
+
+  /// Returns (0, 0, swizzle().x * SubTile[2]) as the offset of the calling threadblock.
+  CUTLASS_DEVICE Coord<3> get_threadblock_offset(Coord<3> const &SubTile) {
+    assert(SubTile[0] == 1 && SubTile[1] == 1);
+    dim3 block_id = swizzle();
+    // Only the last coordinate varies with the block index.
+    return make_Coord(0, 0, block_id.x * SubTile[2]);
+  }
+};
+} // namespace reduction
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/relatively_equal.h b/lightllm-kernel/cutlass/include/cutlass/relatively_equal.h
new file mode 100755
index 000000000..26b7c66b1
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/relatively_equal.h
@@ -0,0 +1,275 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Performs comparison between two elements with support for floating-point comparisons.
+*/
+
+#pragma once
+
+#include "numeric_types.h"
+#include "complex.h"
+
+namespace cutlass {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Tolerance-based equality of a and b. Declaration only: this file defines it via explicit specializations.
+template <typename T, typename U = T>
+CUTLASS_HOST_DEVICE
+bool relatively_equal(T a, T b, U epsilon, U nonzero_floor);
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+// Floating-point comparison following the method described at
+//
+//   https://floating-point-gui.de/errors/comparison/
+//
+// Equal inputs compare true. If either input is zero or |a - b| < nonzero_floor, the bound
+// is epsilon * nonzero_floor; otherwise the bound is epsilon * (|a| + |b|).
+template <typename T>
+CUTLASS_HOST_DEVICE
+bool relatively_equal_float(T a, T b, T epsilon, T nonzero_floor) {
+#if defined(__CUDACC_RTC__)
+  using cuda::std::abs;
+#else
+  using std::abs;
+#endif
+
+  T magnitude_a = abs(a);
+  T magnitude_b = abs(b);
+  T delta = abs(a - b);
+  T zero = T(0);
+
+  if (a == b) {
+    return true;
+  }
+  if (a == zero || b == zero || delta < nonzero_floor) {
+    return delta < epsilon * nonzero_floor;
+  }
+  return delta < epsilon * (magnitude_a + magnitude_b);
+}
+
+} // namespace detail
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Specializations for bool and integer types: exact comparison, tolerance arguments unused.
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<bool>(bool a, bool b, bool /*epsilon*/, bool /*nonzero_floor*/) {
+  return a == b;
+}
+
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<uint1b_t>(uint1b_t a, uint1b_t b, uint1b_t /*epsilon*/, uint1b_t /*nonzero_floor*/) {
+  return a == b;
+}
+
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<int2b_t>(int2b_t a, int2b_t b, int2b_t /*epsilon*/, int2b_t /*nonzero_floor*/) {
+  return a == b;
+}
+
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<uint2b_t>(uint2b_t a, uint2b_t b, uint2b_t /*epsilon*/, uint2b_t /*nonzero_floor*/) {
+  return a == b;
+}
+
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<int4b_t>(int4b_t a, int4b_t b, int4b_t /*epsilon*/, int4b_t /*nonzero_floor*/) {
+  return a == b;
+}
+
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<uint4b_t>(uint4b_t a, uint4b_t b, uint4b_t /*epsilon*/, uint4b_t /*nonzero_floor*/) {
+  return a == b;
+}
+
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<int8_t>(int8_t a, int8_t b, int8_t /*epsilon*/, int8_t /*nonzero_floor*/) {
+  return a == b;
+}
+
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<uint8_t>(uint8_t a, uint8_t b, uint8_t /*epsilon*/, uint8_t /*nonzero_floor*/) {
+  return a == b;
+}
+
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<int16_t>(int16_t a, int16_t b, int16_t /*epsilon*/, int16_t /*nonzero_floor*/) {
+  return a == b;
+}
+
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<uint16_t>(uint16_t a, uint16_t b, uint16_t /*epsilon*/, uint16_t /*nonzero_floor*/) {
+  return a == b;
+}
+
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<int32_t>(int32_t a, int32_t b, int32_t /*epsilon*/, int32_t /*nonzero_floor*/) {
+  return a == b;
+}
+
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<uint32_t>(uint32_t a, uint32_t b, uint32_t /*epsilon*/, uint32_t /*nonzero_floor*/) {
+  return a == b;
+}
+
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<int64_t>(int64_t a, int64_t b, int64_t /*epsilon*/, int64_t /*nonzero_floor*/) {
+  return a == b;
+}
+
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<uint64_t>(uint64_t a, uint64_t b, uint64_t /*epsilon*/, uint64_t /*nonzero_floor*/) {
+  return a == b;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Floating-point specializations: each one forwards to detail::relatively_equal_float.
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<float_e4m3_t>(float_e4m3_t a, float_e4m3_t b, float_e4m3_t epsilon, float_e4m3_t nonzero_floor) {
+  return detail::relatively_equal_float<float>(a, b, epsilon, nonzero_floor);  // compared as float
+}
+
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<float_e5m2_t>(float_e5m2_t a, float_e5m2_t b, float_e5m2_t epsilon, float_e5m2_t nonzero_floor) {
+  return detail::relatively_equal_float<float>(a, b, epsilon, nonzero_floor);  // compared as float
+}
+
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<half_t>(half_t a, half_t b, half_t epsilon, half_t nonzero_floor) {
+  return detail::relatively_equal_float<half_t>(a, b, epsilon, nonzero_floor);
+}
+
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<bfloat16_t>(
+  bfloat16_t a, 
+  bfloat16_t b, 
+  bfloat16_t epsilon, 
+  bfloat16_t nonzero_floor) {
+  // The helper is instantiated with T = bfloat16_t.
+  return detail::relatively_equal_float<bfloat16_t>(a, b, epsilon, nonzero_floor);
+}
+
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<tfloat32_t>(
+  tfloat32_t a, 
+  tfloat32_t b, 
+  tfloat32_t epsilon, 
+  tfloat32_t nonzero_floor) {
+  // The helper is instantiated with T = tfloat32_t.
+  return detail::relatively_equal_float<tfloat32_t>(a, b, epsilon, nonzero_floor);
+}
+
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<float>(float a, float b, float epsilon, float nonzero_floor) {
+  return detail::relatively_equal_float<float>(a, b, epsilon, nonzero_floor);
+}
+
+
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<double>(double a, double b, double epsilon, double nonzero_floor) {
+  return detail::relatively_equal_float<double>(a, b, epsilon, nonzero_floor);
+}
+
+/// Overload for complex operands with real tolerances; bounds are applied to abs() magnitudes.
+template<typename T>
+CUTLASS_HOST_DEVICE
+bool relatively_equal(complex<T> a, complex<T> b, T epsilon, T nonzero_floor) {
+#if defined(__CUDACC_RTC__)
+  using cuda::std::abs;
+#else
+  using std::abs;
+#endif
+
+  T magnitude_a = abs(a);
+  T magnitude_b = abs(b);
+  T distance = abs(a - b);
+  complex<T> origin = complex<T>{T{}, T{}};
+
+  if (a == b) {
+    return true;
+  }
+  if (a == origin || b == origin || distance < nonzero_floor) {
+    return distance < epsilon * nonzero_floor;
+  }
+  return distance < epsilon * (magnitude_a + magnitude_b);
+}
+
+/// Overload for complex operands with complex tolerances; the tolerances enter the
+/// comparison only through their abs() magnitudes.
+template <typename T>
+CUTLASS_HOST_DEVICE 
+bool relatively_equal(complex<T> a,  complex<T> b, complex<T> epsilon, complex<T> nonzero_floor) {
+#if defined(__CUDACC_RTC__)
+  using cuda::std::abs;
+#else
+  using std::abs;
+#endif
+
+  T magnitude_a = abs(a);
+  T magnitude_b = abs(b);
+  T distance = abs(a - b);
+  complex<T> origin = complex<T>{T{}, T{}};
+
+  if (a == b) {
+    return true;
+  }
+  if (a == origin || b == origin || distance < abs(nonzero_floor)) {
+    return distance < abs(epsilon * nonzero_floor);
+  }
+  return distance < abs(epsilon) * (magnitude_a + magnitude_b);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/semaphore.h b/lightllm-kernel/cutlass/include/cutlass/semaphore.h
new file mode 100755
index 000000000..efcd9211c
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/semaphore.h
@@ -0,0 +1,118 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Implementation of a CTA-wide semaphore for inter-CTA synchronization.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/array.h"
+
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/gemm/gemm.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// CTA-wide semaphore for inter-CTA synchronization.
+class Semaphore { 
+public:
+  // Data members are public; `lock` is accessed with ld.global / st.global in fetch() / release().
+  int *lock;          // address of the flag word
+  bool wait_thread;   // true when thread_id <= 0; only this thread loads or stores the flag
+  int state;          // last value loaded by fetch(); -1 until the first load
+
+public:
+
+  /// Implements a semaphore to wait for a flag to reach a given value
+  CUTLASS_HOST_DEVICE
+  Semaphore(int *lock_, int thread_id): 
+    lock(lock_), 
+    wait_thread(thread_id < 0 || thread_id == 0),
+    state(-1) {
+    // Nothing is read from `lock` here; the first load happens in fetch().
+  }
+
+  /// Permit fetching the synchronization mechanism early
+  CUTLASS_DEVICE
+  void fetch() {
+    if (wait_thread) {  // other threads keep their current `state`
+      #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+      asm volatile ("ld.global.acquire.gpu.b32 %0, [%1];\n" : "=r"(state) : "l"(lock));  
+      #else
+      asm volatile ("ld.global.cg.b32 %0, [%1];\n" : "=r"(state) : "l"(lock));  
+      #endif
+    }
+  }
+
+  /// Gets the internal state
+  CUTLASS_DEVICE
+  int get_state() const {
+    return state;
+  }
+
+  /// Waits until the semaphore is equal to the given value
+  CUTLASS_DEVICE
+  void wait(int status = 0) {
+    while( __syncthreads_and(state != status) ) {  // repeat while every thread still sees a mismatch
+      fetch();
+    }
+    // Additional block-wide barrier after the loop exits.
+    __syncthreads();
+  }
+
+  /// Updates the lock with the given result
+  CUTLASS_DEVICE
+  void release(int status = 0) {
+    __syncthreads();
+    // After the barrier above, only the wait thread stores `status` to the flag.
+    if (wait_thread) {
+      #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+      asm volatile ("st.global.release.gpu.b32 [%0], %1;\n" : : "l"(lock), "r"(status));
+      #else
+      asm volatile ("st.global.cg.b32 [%0], %1;\n" : : "l"(lock), "r"(status));
+      #endif
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/subbyte_reference.h b/lightllm-kernel/cutlass/include/cutlass/subbyte_reference.h
new file mode 100755
index 000000000..8d43f503e
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/subbyte_reference.h
@@ -0,0 +1,1388 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Provides a mechanism for packing and unpacking elements smaller than one byte
+*/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/integer_subbyte.h"
+#include "cutlass/fast_math.h"
+
+namespace cutlass {
+
+namespace detail {
+// This is an implementation detail of cutlass::SubbyteReference and
+// cutlass::HostTensor.  For a given logical element type Element,
+// and its corresponding storage (physical) element type StorageUnit,
+// it computes quantities that help with managing allocations.
+//
+// CUTLASS uses a hidden "ContainerUnitType" or StorageUnit type to support
+// packed arrays of subbyte types such as int4.  Element is the "logical" type
+// for computations, while CUTLASS uses StorageUnit as the element type
+// of a packed array of Element.  If Element is not a subbyte type,
+// then the container is sized to hold exactly one Element.
+//
+// The ContainerType is always calculated as an array of StorageUnit (the StorageUnit
+// is always a byte for subbyte types),
+// and its number of bits is the lcm of the subbyte type's number of bits and the StorageUnit's number of bits.
+// Below are some examples for different subbyte types.
+//
+// * Subbyte Type=int2, ContainerType=StorageUnit[1] (StorageUnit=uint8_t)
+// * Subbyte Type=int4, ContainerType=StorageUnit[1] (StorageUnit=uint8_t)
+template<class Element, class StorageUnit>
+struct StorageContainerCalculator {
+  // 1. kContainerTypeNumBits: The number of bits needed for ContainerType (lcm for subbyte Element, else Element's own width)
+  static constexpr int kContainerTypeNumBits   = (sizeof_bits<Element>::value < 8) ? cutlass::lcm_cxx11(sizeof_bits<Element>::value, sizeof_bits<StorageUnit>::value) : sizeof_bits<Element>::value;
+  static_assert(kContainerTypeNumBits % sizeof_bits<Element>::value == 0, "The bits of ContainerType should be divisible by the element's number of bits");
+  // 2. kContainerTypeNumLogicalElements: The number of logical Element instance(s) that can be stored per ContainerType instance
+  static constexpr int kContainerTypeNumLogicalElements = kContainerTypeNumBits / sizeof_bits<Element>::value;
+  /// 3. kContainerTypeNumBytes: The number of bytes per ContainerType instance
+  static constexpr int kContainerTypeNumBytes = kContainerTypeNumBits / 8;
+  /// 4. kContainerTypeNumStorageUnit: The number of base StorageUnit in the ContainerType
+  static constexpr int kContainerTypeNumStorageUnit = kContainerTypeNumBits / sizeof_bits<StorageUnit>::value;
+
+  static_assert(kContainerTypeNumBits != 0, "kContainerTypeNumBits can not be zero");
+  static_assert(kContainerTypeNumLogicalElements != 0, "kContainerTypeNumLogicalElements can not be zero");
+  static_assert(kContainerTypeNumBytes != 0, "kContainerTypeNumBytes can not be zero");
+};
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// This class provides a mechanism for packing and unpacking elements smaller than one byte. It
+/// assumes these sub-byte elements are packed in a traditional C++ numeric type.
+///
+/// The intended application is to provide a mechanism to indirectly reference elements in
+/// memory or Array<> objects whose addresses cannot otherwise be taken since they are smaller
+/// than one byte.
+/// 
+/// Supports basic pointer arithmetic:
+///
+/// Example:
+///
+///   int4b_t *ptr = ...;
+///
+///   SubbyteReference<int4b_t> ref = ptr;
+///   ref += 15;
+///
+///   int4b_t x = ref;      // load an int4b_t
+///   ref = x + 2_s4;      // perform arithmetic on int4b_t and then store
+///
+template <
+  typename Element_,              /// CUTLASS numeric element type.
+  typename Storage_ = uint8_t,    /// Underlying storage type. Must be able to hold an integer 
+                                  ///   number of objects of type Element.
+  class = void                    /// Hook for the enable_if-based partial specialization.
+>
+class ConstSubbyteReference {
+public:
+
+  using Element = Element_;
+  using Storage = Storage_;
+  using StoragePointer = Storage const *;
+
+  static_assert(sizeof_bits<Element>::value <= sizeof_bits<Storage>::value,
+    "Size of Element must not be greater than Storage.");
+
+  static_assert(!(sizeof_bits<Storage>::value % sizeof_bits<Element>::value),
+    "Storage must be divisible by Element");
+
+private:
+
+  ///! Number of elements per storage vector
+  int const kElementsPerVector = sizeof_bits<Storage>::value / sizeof_bits<Element>::value;
+
+  ///! Bit mask selecting one element's bits at position 0 of a storage vector
+  Storage const kMask = 
+    ((sizeof_bits<Element>::value < sizeof_bits<Storage>::value) ? 
+      (Storage(1) << sizeof_bits<Element>::value) - Storage(1) :
+      ~Storage(0));
+
+private:
+
+  /// Pointer to the storage vector containing the element
+  StoragePointer ptr_;
+
+  /// Offset (in units of elements) from pointer.
+  ///
+  /// Invariant: must always be in range [0, kElementsPerVector)
+  int offset_;
+
+public:
+
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference(): ptr_(nullptr), offset_(0) { }
+
+  /// Constructor: splits a logical element offset into whole storage vectors plus a remainder
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference(
+    Element const *ptr,           /// pointer to memory
+    int64_t offset          /// logical offset in units of Element
+  ): 
+    ptr_(reinterpret_cast<StoragePointer>(ptr)),
+    offset_(0) {
+
+    int64_t offset_in_vectors = offset / kElementsPerVector;
+    int64_t offset_in_elements = offset % kElementsPerVector;
+
+    ptr_ += offset_in_vectors;
+    offset_ = int(offset_in_elements);
+  }
+
+  /// Constructor: references the first element at ptr
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference(
+    Element *ptr = nullptr
+  ): ConstSubbyteReference(ptr, 0) { }
+
+  /// Gets storage pointer
+  CUTLASS_HOST_DEVICE
+  StoragePointer storage_pointer() const {
+    return ptr_;
+  }
+
+  /// Gets element offset within storage vector
+  CUTLASS_HOST_DEVICE
+  int element_offset() const {
+    return offset_;
+  }
+
+  /// Unpacks an element from memory: shift it down to bit 0, then mask off its neighbours
+  CUTLASS_HOST_DEVICE
+  Element get() const {
+    Storage item = Storage((*ptr_ >> (offset_ * sizeof_bits<Element>::value)) & kMask);
+    return reinterpret_cast<Element const &>(item);
+  }
+
+  /// Unpacks an element from memory
+  CUTLASS_HOST_DEVICE
+  operator Element() const {
+    return get();
+  }
+
+  /// Adds an offset in units of elements to the reference
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference &operator+=(int offset) {
+
+    offset += offset_;
+    // NOTE(review): a negative total gives a negative remainder below; callers presumably pass offset >= 0
+    int offset_in_vectors = offset / kElementsPerVector;
+    int offset_in_elements = offset % kElementsPerVector;
+
+    ptr_ += offset_in_vectors;
+    offset_ = offset_in_elements;
+
+    return *this;
+  }
+
+  /// Adds an offset in units of elements to the reference
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference &operator+=(long long offset) {
+
+    offset += offset_;
+    
+    long long offset_in_vectors = offset / kElementsPerVector;
+    int offset_in_elements = int(offset % kElementsPerVector);
+
+    ptr_ += offset_in_vectors;
+    offset_ = offset_in_elements;
+
+    return *this;
+  }
+
+  /// Subtracts an offset in units of elements from the reference
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference &operator-=(int offset) {
+    
+    int offset_in_vectors = offset / kElementsPerVector;
+    int offset_in_elements = offset % kElementsPerVector;
+
+    ptr_ -= offset_in_vectors;
+    offset_ -= offset_in_elements;
+
+    if (offset_ < 0) {  // borrow one storage vector to restore the offset_ invariant
+      offset_ += kElementsPerVector;
+      --ptr_;
+    }
+
+    return *this;
+  }
+
+  /// Subtracts an offset in units of elements from the reference
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference &operator-=(long long offset) {
+    
+    long long offset_in_vectors = offset / kElementsPerVector;
+    int offset_in_elements = int(offset % kElementsPerVector);
+
+    ptr_ -= offset_in_vectors;
+    offset_ -= offset_in_elements;
+
+    if (offset_ < 0) {  // borrow one storage vector to restore the offset_ invariant
+      offset_ += kElementsPerVector;
+      --ptr_;
+    }
+
+    return *this;
+  }
+
+  /// Returns a reference to the element `offset` elements after the current one
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference operator+(int offset) const {
+
+    ConstSubbyteReference ref(*this);  // copy: ptr_ is a Storage pointer, not an Element pointer
+    ref += offset;
+
+    return ref;
+  }
+
+  /// Returns a reference to the element `offset` elements after the current one
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference operator+(long long offset) const {
+    
+    ConstSubbyteReference ref(*this);
+    ref += offset;
+
+    return ref;
+  }
+
+  /// Returns a reference to the element `offset` elements before the current one
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference operator-(int offset) const {
+
+    ConstSubbyteReference ref(*this);
+    ref -= offset;
+
+    return ref;
+  }
+
+  /// Returns a reference to the element `offset` elements before the current one
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference operator-(long long offset) const {
+
+    ConstSubbyteReference ref(*this);
+    ref -= offset;
+
+    return ref;
+  }
+
+  /// Computes the difference in elements between references
+  CUTLASS_HOST_DEVICE
+  ptrdiff_t operator-(ConstSubbyteReference ref) const {
+    return (ptr_ - ref.ptr_) * kElementsPerVector + (offset_ - ref.offset_);
+  }
+
+  /// Explicit cast to int
+  CUTLASS_HOST_DEVICE
+  explicit operator int() const {
+    return int(get());
+  }
+
+  /// Explicit cast to signed 64-bit integer
+  CUTLASS_HOST_DEVICE
+  explicit operator int64_t() const {
+    return int64_t(get());
+  }
+
+  /// Explicit cast to unsigned 64-bit integer
+  CUTLASS_HOST_DEVICE
+  explicit operator uint64_t() const {
+    return uint64_t(get());
+  }
+
+  /// Explicit cast to float
+  CUTLASS_HOST_DEVICE
+  explicit operator float() const {
+    return float(get());
+  }
+
+  /// Explicit cast to double
+  CUTLASS_HOST_DEVICE
+  explicit operator double() const {
+    return double(get());
+  }
+};
+
+template <
+  typename Element_,              /// CUTLASS numeric element type.
+  typename Storage_ =             /// Underlying storage type. Must be able to hold an integer
+                                  ///   number of objects of type Element.
+
+#if defined(__CUDA_ARCH__)        /// Default size depends on width of atomicCAS() overloads.
+  #if (__CUDA_ARCH__ >= 700)      ///
+  uint16_t
+  #else
+  uint32_t
+  #endif
+#else
+  uint8_t
+#endif
+  ,
+  class = void                    /// Hook for the enable_if-based partial specialization.
+>
+class SubbyteReference {
+public:
+
+  using Element = Element_;
+  using Storage = Storage_;
+  using StoragePointer = Storage *;
+
+  static_assert(sizeof_bits<Element>::value <= sizeof_bits<Storage>::value,
+    "Size of Element must not be greater than Storage.");
+
+  static_assert(!(sizeof_bits<Storage>::value % sizeof_bits<Element>::value),
+    "Storage must be divisible by Element");
+
+private:
+
+  ///! Number of elements per storage vector
+  int const kElementsPerVector = sizeof_bits<Storage>::value / sizeof_bits<Element>::value;
+
+  ///! Bit mask selecting one element's bits at position 0 of a storage vector
+  Storage const kMask = 
+    ((sizeof_bits<Element>::value < sizeof_bits<Storage>::value) ? 
+      (Storage(1) << sizeof_bits<Element>::value) - Storage(1) :
+      ~Storage(0));
+
+private:
+
+  /// Pointer to the storage vector containing the element
+  StoragePointer ptr_;
+
+  /// Offset (in units of elements) from pointer.
+  ///
+  /// Invariant: must always be in range [0, kElementsPerVector)
+  int offset_;
+
+public:
+
+  CUTLASS_HOST_DEVICE
+  SubbyteReference(): ptr_(nullptr), offset_(0) { }
+
+  /// Constructor: splits a logical element offset into whole storage vectors plus a remainder
+  CUTLASS_HOST_DEVICE
+  SubbyteReference(
+    Element *ptr,           /// pointer to memory
+    int64_t offset          /// logical offset in units of Element
+  ): 
+    ptr_(reinterpret_cast<StoragePointer>(ptr)),
+    offset_(0) {
+
+    int64_t offset_in_vectors = offset / kElementsPerVector;
+    int64_t offset_in_elements = offset % kElementsPerVector;
+
+    ptr_ += offset_in_vectors;
+    offset_ = int(offset_in_elements);
+  }
+
+  /// Constructor: references the first element at ptr
+  CUTLASS_HOST_DEVICE
+  SubbyteReference(
+    Element *ptr = nullptr
+  ): SubbyteReference(ptr, 0) { }
+
+  /// Gets storage pointer
+  CUTLASS_HOST_DEVICE
+  StoragePointer storage_pointer() const {
+    return ptr_;
+  }
+
+  /// Gets the storage pointer reinterpreted as an Element pointer (the element offset is not encoded)
+  CUTLASS_HOST_DEVICE
+  Element * operator&() const {
+    return reinterpret_cast<Element *>(ptr_);
+  }
+
+  /// Gets element offset within storage vector
+  CUTLASS_HOST_DEVICE
+  int element_offset() const {
+    return offset_;
+  }
+
+  /// Unpacks an element from memory, reading only the single byte that holds it
+  CUTLASS_HOST_DEVICE
+  Element get() const {
+    uint8_t const* byte_ptr = reinterpret_cast<uint8_t const*>(ptr_);
+    // Convert offset in elements to offset in bytes
+    constexpr int elements_per_byte = cutlass::sizeof_bits<uint8_t>::value / cutlass::sizeof_bits<Element>::value;
+    byte_ptr += offset_ / elements_per_byte;
+    // Offset of element within a byte
+    int byte_offset = offset_ % elements_per_byte;
+    uint8_t item = uint8_t((*byte_ptr >> (byte_offset * cutlass::sizeof_bits<Element>::value)) & kMask);
+    return reinterpret_cast<Element const &>(item);
+  }
+
+  /// Stores an element to memory, leaving the other elements of the storage vector unchanged
+  CUTLASS_HOST_DEVICE
+  SubbyteReference & set(Element const &x) {
+
+    Storage item        = (reinterpret_cast<Storage const &>(x) & kMask);
+    Storage kUpdateMask = Storage(~(kMask << (offset_ * cutlass::sizeof_bits<Element>::value)));
+    Storage new_bits    = Storage(item << (offset_ * cutlass::sizeof_bits<Element>::value));
+
+#if defined(__CUDA_ARCH__)
+
+    //
+    // Homebrew read-modify-write: retry the compare-and-swap until no other writer interfered
+    //
+    Storage original;
+    Storage updated;
+
+    do {
+
+      original = (*ptr_);
+
+      updated  = Storage((original & kUpdateMask) | new_bits);
+
+      original = atomicCAS(ptr_, original, updated);
+
+    } while (updated != original);
+
+#else
+
+    Storage original = (*ptr_);
+    Storage updated  = Storage((original & kUpdateMask) | new_bits);
+    *ptr_ = updated;
+
+#endif
+
+    return *this;
+  }
+
+  ////
+
+  /// Unpacks an element from memory
+  CUTLASS_HOST_DEVICE
+  operator Element() const {
+    return get();
+  }
+
+  /// Stores an element to memory
+  CUTLASS_HOST_DEVICE
+  SubbyteReference &operator=(Element const & x) {
+    return set(x);
+  }
+
+  /// Stores the element referenced by x to memory (copies the value, does not rebind this reference)
+  CUTLASS_HOST_DEVICE
+  SubbyteReference &operator=(SubbyteReference const & x) {
+    return set(x.get());
+  }
+
+  /// Stores the element referenced by x to memory
+  CUTLASS_HOST_DEVICE
+  SubbyteReference &operator=(
+      ConstSubbyteReference<Element, Storage> const &x) {
+    return set(x.get());
+  }
+
+  /// Adds an offset in units of elements to the reference
+  CUTLASS_HOST_DEVICE
+  SubbyteReference &operator+=(int offset) {
+
+    offset += offset_;
+    // NOTE(review): a negative total gives a negative remainder below; callers presumably pass offset >= 0
+    int offset_in_vectors = offset / kElementsPerVector;
+    int offset_in_elements = offset % kElementsPerVector;
+
+    ptr_ += offset_in_vectors;
+    offset_ = offset_in_elements;
+
+    return *this;
+  }
+
+  /// Adds an offset in units of elements to the reference
+  CUTLASS_HOST_DEVICE
+  SubbyteReference &operator+=(long long offset) {
+
+    offset += offset_;
+    
+    long long offset_in_vectors = offset / kElementsPerVector;
+    int offset_in_elements = int(offset % kElementsPerVector);
+
+    ptr_ += offset_in_vectors;
+    offset_ = offset_in_elements;
+
+    return *this;
+  }
+
+  /// Subtracts an offset in units of elements from the reference
+  CUTLASS_HOST_DEVICE
+  SubbyteReference &operator-=(int offset) {
+    
+    int offset_in_vectors = offset / kElementsPerVector;
+    int offset_in_elements = offset % kElementsPerVector;
+
+    ptr_ -= offset_in_vectors;
+    offset_ -= offset_in_elements;
+
+    if (offset_ < 0) {  // borrow one storage vector to restore the offset_ invariant
+      offset_ += kElementsPerVector;
+      --ptr_;
+    }
+
+    return *this;
+  }
+
+  /// Subtracts an offset in units of elements from the reference
+  CUTLASS_HOST_DEVICE
+  SubbyteReference &operator-=(long long offset) {
+    
+    long long offset_in_vectors = offset / kElementsPerVector;
+    int offset_in_elements = int(offset % kElementsPerVector);
+
+    ptr_ -= offset_in_vectors;
+    offset_ -= offset_in_elements;
+
+    if (offset_ < 0) {  // borrow one storage vector to restore the offset_ invariant
+      offset_ += kElementsPerVector;
+      --ptr_;
+    }
+
+    return *this;
+  }
+
+  /// Returns a reference to the element `offset` elements after the current one
+  CUTLASS_HOST_DEVICE
+  SubbyteReference operator+(int offset) const {
+
+    SubbyteReference ref(*this);  // copy: ptr_ is a Storage pointer, not an Element pointer
+    ref += offset;
+
+    return ref;
+  }
+
+  /// Returns a reference to the element `offset` elements after the current one
+  CUTLASS_HOST_DEVICE
+  SubbyteReference operator+(long long offset) const {
+    
+    SubbyteReference ref(*this);
+    ref += offset;
+
+    return ref;
+  }
+
+  /// Returns a reference to the element `offset` elements before the current one
+  CUTLASS_HOST_DEVICE
+  SubbyteReference operator-(int offset) const {
+
+    SubbyteReference ref(*this);
+    ref -= offset;
+
+    return ref;
+  }
+
+  /// Returns a reference to the element `offset` elements before the current one
+  CUTLASS_HOST_DEVICE
+  SubbyteReference operator-(long long offset) const {
+
+    SubbyteReference ref(*this);
+    ref -= offset;
+
+    return ref;
+  }
+
+  /// Computes the difference in elements between references
+  CUTLASS_HOST_DEVICE
+  ptrdiff_t operator-(SubbyteReference ref) const {
+    return (ptr_ - ref.ptr_) * kElementsPerVector + (offset_ - ref.offset_);
+  }
+
+  /// Explicit cast to int
+  CUTLASS_HOST_DEVICE
+  explicit operator int() const {
+    return int(get());
+  }
+
+  /// Explicit cast to signed 64-bit integer
+  CUTLASS_HOST_DEVICE
+  explicit operator int64_t() const {
+    return int64_t(get());
+  }
+
+  /// Explicit cast to unsigned 64-bit integer
+  CUTLASS_HOST_DEVICE
+  explicit operator uint64_t() const {
+    return uint64_t(get());
+  }
+
+  /// Explicit cast to float
+  CUTLASS_HOST_DEVICE
+  explicit operator float() const {
+    return float(get());
+  }
+
+  /// Explicit cast to double
+  CUTLASS_HOST_DEVICE
+  explicit operator double() const {
+    return double(get());
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<typename T> using _war = T;  // identity alias (presumably a compiler workaround - TODO confirm)
+template <
+  typename Element_,              /// CUTLASS numeric element type.
+  typename Storage_               /// Underlying basic storage type.
+>
+class SubbyteReference<Element_, Storage_, 
+    typename platform::enable_if<sizeof_bits<Storage_>::value % sizeof_bits<Element_>::value != 0>::type> {
+public:
+
+  using Element = Element_;
+  /// Note: It's possible that StorageUnit is not divisible by Element.
+  /// For example, an Element instance might be stored across 2 StorageUnit instances.
+  /// Thus, CUTLASS needs a storage vector to hold an integer number of Element instances.
+
+  using StorageUnit = Storage_;
+private:
+  using StorageContainerCalculator = cutlass::detail::StorageContainerCalculator<Element, StorageUnit>;
+public:
+  static int const kBitsStoredVec = StorageContainerCalculator::kContainerTypeNumBits; 
+  static int const kNumStorageUnitPerStoredVec = StorageContainerCalculator::kContainerTypeNumStorageUnit;
+
+  using StorageVec = StorageUnit[kNumStorageUnitPerStoredVec];
+  using StorageVecPointer = StorageVec *;
+  
+  using CudaAtomicType = typename platform::conditional<
+      sizeof_bits<StorageUnit>::value == 16,
+      uint32_t,
+      uint64_t
+    >::type;
+
+  static_assert(sizeof_bits<Element>::value <= sizeof_bits<StorageVec>::value,
+    "Size of Element must not be greater than StorageVec.");
+
+  static_assert(!(sizeof_bits<StorageVec>::value % sizeof_bits<Element>::value),
+    "StorageVec must be divisible by Element");
+
+private:
+
+  ///! Number of elements per storage vector
+  int const kElementsPerVector = sizeof_bits<StorageVec>::value / sizeof_bits<Element>::value;
+
+  ///! Bit mask for storage unit.
+  StorageUnit const kMask = (StorageUnit(1) << sizeof_bits<Element>::value) - StorageUnit(1);
+
+  /// Pointer to the storage vector (array of StorageUnit) containing the element
+  _war<StorageVecPointer> ptr_;
+
+  /// Offset (in units of elements) from pointer.
+  ///
+  /// Invariant: must always be in range [0, kElementsPerVector)
+  int offset_;
+
+  /// Element may be stored across 2 storage unit.
+  ///   Low storage unit index in StorageVec
+  ///   High storage unit index in StorageVec
+  int low_storage_unit_idx_;
+  int high_storage_unit_idx_;
+
+  /// Full Mask to extract the entire element
+  uint64_t full_element_mask_;
+
+  /// Mask to extract the Element from Low storage unit and High storage unit.
+  StorageUnit low_storage_mask_;
+  StorageUnit high_storage_mask_;
+
+  /// Start bit index inside the storage unit.
+  int start_bit_idx_;
+
+private:
+
+  /// Recomputes the unit indices, start bit and masks from offset_; call after every change to offset_
+  CUTLASS_HOST_DEVICE
+  void update_element_status() {
+    int num_bits = offset_ * sizeof_bits<Element>::value;
+
+    start_bit_idx_ = num_bits % sizeof_bits<StorageUnit>::value;
+    // The element spills into the next unit when fewer bits remain in this unit than it needs
+    low_storage_unit_idx_ = num_bits / sizeof_bits<StorageUnit>::value;
+    high_storage_unit_idx_ = sizeof_bits<StorageUnit>::value - (start_bit_idx_) < sizeof_bits<Element>::value 
+                              ? low_storage_unit_idx_ + 1 : low_storage_unit_idx_;
+    
+    full_element_mask_ = uint64_t(kMask) << start_bit_idx_;
+    low_storage_mask_ = StorageUnit(full_element_mask_ & ~StorageUnit(0));
+    high_storage_mask_ = StorageUnit((full_element_mask_ >> sizeof_bits<StorageUnit>::value) & ~StorageUnit(0));
+  }
+
+public:
+
+  CUTLASS_HOST_DEVICE
+  SubbyteReference(): ptr_(nullptr), offset_(0) { }
+
+  /// Constructor: splits a logical element offset into whole storage vectors plus a remainder
+  CUTLASS_HOST_DEVICE
+  SubbyteReference(
+    Element *ptr,           /// pointer to memory
+    int64_t offset          /// logical offset in units of Element
+  ): 
+    ptr_(reinterpret_cast<StorageVecPointer>(ptr)),
+    offset_(0) {
+    int64_t offset_in_vectors = offset / kElementsPerVector;
+    int64_t offset_in_elements = offset % kElementsPerVector;
+
+    ptr_ += offset_in_vectors;
+    offset_ = int(offset_in_elements);
+
+    update_element_status();
+  }
+
+  /// Constructor: references the first element at ptr
+  CUTLASS_HOST_DEVICE
+  SubbyteReference(
+    Element *ptr = nullptr
+  ): SubbyteReference(ptr, 0) { }
+
+  /// Gets StorageVec pointer
+  CUTLASS_HOST_DEVICE
+  StorageVecPointer storage_pointer() const {
+    return ptr_;
+  }
+
+  /// Gets the StorageVec pointer reinterpreted as an Element pointer (the element offset is not encoded)
+  CUTLASS_HOST_DEVICE
+  Element * operator&() const {
+    return reinterpret_cast<Element *>(ptr_);
+  }
+
+  /// Gets element offset within StorageVec vector
+  CUTLASS_HOST_DEVICE
+  int element_offset() const {
+    return offset_;
+  }
+
+  /// Unpacks an element from memory, stitching it together from up to two storage units
+  CUTLASS_HOST_DEVICE
+  Element get() const {
+    StorageUnit low_bits = (*ptr_)[low_storage_unit_idx_] & low_storage_mask_;
+    StorageUnit high_bits = low_storage_unit_idx_ != high_storage_unit_idx_ ? (*ptr_)[high_storage_unit_idx_] & high_storage_mask_ : 0;
+
+    uint64_t full_item = ((uint64_t)high_bits << sizeof_bits<StorageUnit>::value) | low_bits;
+    uint8_t result = uint8_t(full_item >> start_bit_idx_);
+
+    return reinterpret_cast<Element const &>(result);
+  }
+
+  /// Stores an element to memory, leaving neighbouring elements in the touched units unchanged
+  CUTLASS_HOST_DEVICE
+  SubbyteReference & set(Element const &x) {
+
+    uint64_t item = static_cast<uint64_t>((reinterpret_cast<uint8_t const &>(x) & kMask)) << start_bit_idx_;
+    
+    StorageUnit low_new_bits  = StorageUnit(item & ~StorageUnit(0));
+    StorageUnit high_new_bits = StorageUnit(item >> sizeof_bits<StorageUnit>::value);
+
+    StorageUnit const kLowUpdateMask  = StorageUnit((~full_element_mask_) & (~StorageUnit(0)));
+    StorageUnit const kHighUpdateMask = StorageUnit(((~full_element_mask_) >> sizeof_bits<StorageUnit>::value) & (~StorageUnit(0)));
+
+#if defined(__CUDA_ARCH__)
+    //
+    // Homebrew read-modify-write: retry each compare-and-swap until no other writer interfered
+    //
+    if(high_storage_unit_idx_ != low_storage_unit_idx_){
+      /// The element straddles 2 storage units, so both must be updated.
+      /// To avoid misaligned wide accesses, each unit gets its own atomicCAS loop
+      StorageUnit original_low_bits, original_high_bits, update_low_bits, update_high_bits;
+      do {
+        original_low_bits  = ((*ptr_)[low_storage_unit_idx_]);
+        update_low_bits  = (original_low_bits & kLowUpdateMask) | low_new_bits;
+        original_low_bits = atomicCAS(&((*ptr_)[low_storage_unit_idx_]), original_low_bits, update_low_bits);
+      } while (update_low_bits != original_low_bits);
+      do {
+        original_high_bits = ((*ptr_)[high_storage_unit_idx_]);
+        update_high_bits  = (original_high_bits & kHighUpdateMask) | high_new_bits;
+        original_high_bits = atomicCAS(&((*ptr_)[high_storage_unit_idx_]), original_high_bits, update_high_bits);
+      } while (update_high_bits != original_high_bits);
+    }
+    else {
+      /// Only need update 1 storage unit.
+      StorageUnit original, updated;
+      do {
+        original = ((*ptr_)[low_storage_unit_idx_]);
+
+        updated = (original & kLowUpdateMask) | low_new_bits;
+
+        original = atomicCAS(&((*ptr_)[low_storage_unit_idx_]), original, updated);
+
+      } while (updated != original);
+    }
+#else
+
+    // Host path: plain read-modify-write; the high unit is written only when the element straddles
+    StorageUnit update_low_bits  = ((*ptr_)[low_storage_unit_idx_] & kLowUpdateMask) | low_new_bits;
+    StorageUnit update_high_bits = ((*ptr_)[high_storage_unit_idx_] & kHighUpdateMask) | high_new_bits;
+
+    (*ptr_)[low_storage_unit_idx_] = update_low_bits;
+
+    if(low_storage_unit_idx_ != high_storage_unit_idx_)
+      (*ptr_)[high_storage_unit_idx_] = update_high_bits;
+#endif
+
+    return *this;
+  }
+
+  ////
+
+  /// Unpacks an element from memory
+  CUTLASS_HOST_DEVICE
+  operator Element() const {
+    return get();
+  }
+
+  /// Stores an element to memory
+  CUTLASS_HOST_DEVICE
+  SubbyteReference &operator=(Element const & x) {
+    return set(x);
+  }
+
+  /// Stores the element referenced by x to memory (copies the value, does not rebind this reference)
+  CUTLASS_HOST_DEVICE
+  SubbyteReference &operator=(SubbyteReference const & x) {
+    return set(x.get());
+  }
+
+  /// Stores the element referenced by x to memory
+  CUTLASS_HOST_DEVICE
+  SubbyteReference &operator=(
+      ConstSubbyteReference<Element, StorageVec> const &x) {
+    return set(x.get());
+  }
+
+  /// Adds an offset in units of elements to the reference
+  CUTLASS_HOST_DEVICE
+  SubbyteReference &operator+=(int offset) {
+
+    offset += offset_;
+    // NOTE(review): a negative total gives a negative remainder below; callers presumably pass offset >= 0
+    int offset_in_vectors = offset / kElementsPerVector;
+    int offset_in_elements = offset % kElementsPerVector;
+
+    ptr_ += offset_in_vectors;
+    offset_ = offset_in_elements;
+
+    update_element_status();
+
+    return *this;
+  }
+
+  /// Adds an offset in units of elements to the reference
+  CUTLASS_HOST_DEVICE
+  SubbyteReference &operator+=(long long offset) {
+
+    offset += offset_;
+    
+    long long offset_in_vectors = offset / kElementsPerVector;
+    int offset_in_elements = int(offset % kElementsPerVector);
+
+    ptr_ += offset_in_vectors;
+    offset_ = offset_in_elements;
+
+    update_element_status();
+
+    return *this;
+  }
+
+  /// Subtracts an offset in units of elements from the reference
+  CUTLASS_HOST_DEVICE
+  SubbyteReference &operator-=(int offset) {
+    
+    int offset_in_vectors = offset / kElementsPerVector;
+    int offset_in_elements = offset % kElementsPerVector;
+
+    ptr_ -= offset_in_vectors;
+    offset_ -= offset_in_elements;
+
+    if (offset_ < 0) {  // borrow one storage vector to restore the offset_ invariant
+      offset_ += kElementsPerVector;
+      --ptr_;
+    }
+
+    update_element_status();
+    return *this;
+  }
+
+  /// Subtracts an offset in units of elements from the reference
+  CUTLASS_HOST_DEVICE
+  SubbyteReference &operator-=(long long offset) {
+    
+    long long offset_in_vectors = offset / kElementsPerVector;
+    int offset_in_elements = int(offset % kElementsPerVector);
+
+    ptr_ -= offset_in_vectors;
+    offset_ -= offset_in_elements;
+
+    if (offset_ < 0) {  // borrow one storage vector to restore the offset_ invariant
+      offset_ += kElementsPerVector;
+      --ptr_;
+    }
+
+    update_element_status();
+    return *this;
+  }
+
+  /// Returns a reference to the element `offset` elements after the current one
+  CUTLASS_HOST_DEVICE
+  SubbyteReference operator+(int offset) const {
+
+    SubbyteReference ref(*this);  // copy: ptr_ is a StorageVec pointer, not an Element pointer
+    ref += offset;
+
+    return ref;
+  }
+
+  /// Returns a reference to the element `offset` elements after the current one
+  CUTLASS_HOST_DEVICE
+  SubbyteReference operator+(long long offset) const {
+    
+    SubbyteReference ref(*this);
+    ref += offset;
+
+    return ref;
+  }
+
+  /// Returns a reference to the element `offset` elements before the current one
+  CUTLASS_HOST_DEVICE
+  SubbyteReference operator-(int offset) const {
+
+    SubbyteReference ref(*this);
+    ref -= offset;
+
+    return ref;
+  }
+
+  /// Returns a reference to the element `offset` elements before the current one
+  CUTLASS_HOST_DEVICE
+  SubbyteReference operator-(long long offset) const {
+
+    SubbyteReference ref(*this);
+    ref -= offset;
+
+    return ref;
+  }
+
+  /// Computes the difference in elements between references
+  CUTLASS_HOST_DEVICE
+  ptrdiff_t operator-(SubbyteReference ref) const {
+    return (ptr_ - ref.ptr_) * kElementsPerVector + (offset_ - ref.offset_);
+  }
+
+  /// Explicit cast to int
+  CUTLASS_HOST_DEVICE
+  explicit operator int() const {
+    return int(get());
+  }
+
+  /// Explicit cast to signed 64-bit integer
+  CUTLASS_HOST_DEVICE
+  explicit operator int64_t() const {
+    return int64_t(get());
+  }
+
+  /// Explicit cast to unsigned 64-bit integer
+  CUTLASS_HOST_DEVICE
+  explicit operator uint64_t() const {
+    return uint64_t(get());
+  }
+
+  /// Explicit cast to float
+  CUTLASS_HOST_DEVICE
+  explicit operator float() const {
+    return float(get());
+  }
+
+  /// Explicit cast to double
+  CUTLASS_HOST_DEVICE
+  explicit operator double() const {
+    return double(get());
+  }
+};
+
+template<typename T> using _war = T;
+/// Read-only reference to one sub-byte Element packed into an array of StorageUnit words.
+/// Chosen when sizeof_bits<Storage_> is not a multiple of sizeof_bits<Element_>, so an element may straddle two units.
+template <typename Element_,   /// CUTLASS numeric element type.
+          typename Storage_>   /// Underlying storage unit type.
+class ConstSubbyteReference<Element_, Storage_,
+    typename platform::enable_if<sizeof_bits<Storage_>::value % sizeof_bits<Element_>::value != 0>::type> {
+public:
+
+  using Element = Element_;
+  ///! Note: the storage unit size is not divisible by the Element size, so an element
+  ///   may be stored across 2 storage units; a storage vector is needed to hold an integer
+  ///   number of objects of type Element.
+  using StorageUnit = Storage_;
+  static int const kBitsStoredVec = cutlass::lcm_cxx11(sizeof_bits<Element>::value, sizeof_bits<StorageUnit>::value);
+  static int const kNumStorageUnitPerStoredVec = kBitsStoredVec / sizeof_bits<StorageUnit>::value;
+
+  using StorageVec = StorageUnit[kNumStorageUnitPerStoredVec];
+  using StorageVecPointer = StorageVec const *;
+
+  using CudaAtomicType = typename platform::conditional<
+      sizeof_bits<StorageUnit>::value == 16,
+      uint32_t,
+      uint64_t
+    >::type;
+
+  static_assert(sizeof_bits<Element>::value <= sizeof_bits<StorageVec>::value,
+    "Size of Element must not be greater than StorageVec.");
+
+  static_assert(!(sizeof_bits<StorageVec>::value % sizeof_bits<Element>::value),
+    "StorageVec must be divisible by Element");
+
+private:
+
+  ///! Number of elements per storage vector
+  int const kElementsPerVector = sizeof_bits<StorageVec>::value / sizeof_bits<Element>::value;
+
+  ///! Bit mask for storage unit.
+  StorageUnit const kMask = (StorageUnit(1) << sizeof_bits<Element>::value) - StorageUnit(1);
+
+  /// Pointer to array containing element
+  _war<StorageVecPointer> ptr_;
+
+  /// Offset (in units of elements) from pointer.
+  ///
+  /// Invariant: must always be in range [0, kElementsPerVector)
+  int offset_;
+
+  /// Element may be stored across 2 storage unit.
+  ///   Low storage unit index in StorageVec
+  ///   High storage unit index in StorageVec
+  int low_storage_unit_idx_;
+  int high_storage_unit_idx_;
+
+  /// Full Mask to extract the entire element
+  uint64_t full_element_mask_;
+
+  /// Mask to extract the Element from Low storage unit and High storage unit.
+  StorageUnit low_storage_mask_;
+  StorageUnit high_storage_mask_;
+
+  /// Start bit index inside the storage unit.
+  int start_bit_idx_;
+
+private:
+  /// Recomputes the cached unit indices, start bit and masks from the current offset_.
+  CUTLASS_HOST_DEVICE
+  void update_element_status() {
+    int num_bits = offset_ * sizeof_bits<Element>::value;
+
+    start_bit_idx_ = num_bits % sizeof_bits<StorageUnit>::value;
+    // The element spills into the next unit when fewer bits remain in this unit than it needs.
+    low_storage_unit_idx_ = num_bits / sizeof_bits<StorageUnit>::value;
+    high_storage_unit_idx_ = sizeof_bits<StorageUnit>::value - (start_bit_idx_) < sizeof_bits<Element>::value
+                              ? low_storage_unit_idx_ + 1 : low_storage_unit_idx_;
+    // The mask is built 64 bits wide, then split into its low-unit and high-unit parts.
+    full_element_mask_ = uint64_t(kMask) << start_bit_idx_;
+    low_storage_mask_ = StorageUnit(full_element_mask_ & ~StorageUnit(0));
+    high_storage_mask_ = StorageUnit((full_element_mask_ >> sizeof_bits<StorageUnit>::value) & ~StorageUnit(0));
+  }
+
+public:
+  /// Default constructor: null pointer at offset 0, with the cached indices and masks initialized to match.
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference(): ptr_(nullptr), offset_(0) { update_element_status(); }
+
+  /// Constructs a reference to the element `offset` elements away from `ptr` (offset may be negative)
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference(
+    Element const *ptr,           /// pointer to memory
+    int64_t offset          /// logical offset in units of Element
+  ):
+    ptr_(reinterpret_cast<StorageVecPointer>(ptr)),
+    offset_(0) {
+    // C++ division truncates toward zero; borrow one vector so the remainder is never negative.
+    int64_t offset_in_vectors = offset / kElementsPerVector;
+    int64_t offset_in_elements = offset % kElementsPerVector;
+    if (offset_in_elements < 0) { offset_in_elements += kElementsPerVector; --offset_in_vectors; }
+    ptr_ += offset_in_vectors;
+    offset_ = int(offset_in_elements);
+
+    update_element_status();
+  }
+
+  /// Constructs a reference to the first element at `ptr` (offset 0)
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference(
+    Element const *ptr    /// pointer to memory (no default: it would be ambiguous with the default constructor)
+  ): ConstSubbyteReference(ptr, 0) { }
+
+  /// Gets storage pointer
+  CUTLASS_HOST_DEVICE
+  StorageVecPointer storage_pointer() const {
+    return ptr_;
+  }
+
+  /// Gets element offset within storage vector
+  CUTLASS_HOST_DEVICE
+  int element_offset() const {
+    return offset_;
+  }
+
+  /// Unpacks an element from memory
+  CUTLASS_HOST_DEVICE
+  Element get() const {
+    StorageUnit low_bits = (*ptr_)[low_storage_unit_idx_] & low_storage_mask_;
+    StorageUnit high_bits = low_storage_unit_idx_ != high_storage_unit_idx_ ? (*ptr_)[high_storage_unit_idx_] & high_storage_mask_ : 0;
+
+    uint64_t full_item = ((uint64_t)high_bits << sizeof_bits<StorageUnit>::value) | low_bits;
+    uint8_t result = uint8_t(full_item >> start_bit_idx_);
+
+    return reinterpret_cast<Element const &>(result);
+  }
+
+  /// Unpacks an element from memory
+  CUTLASS_HOST_DEVICE
+  operator Element() const {
+    return get();
+  }
+
+  /// Adds an offset in units of elements to the reference
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference &operator+=(int offset) {
+
+    offset += offset_;
+
+    int offset_in_vectors = offset / kElementsPerVector;
+    int offset_in_elements = offset % kElementsPerVector;
+    // C++ division truncates toward zero; borrow one vector so offset_ stays in [0, kElementsPerVector).
+    if (offset_in_elements < 0) { offset_in_elements += kElementsPerVector; --offset_in_vectors; }
+    ptr_ += offset_in_vectors;
+    offset_ = offset_in_elements;
+
+    update_element_status();
+    return *this;
+  }
+
+  /// Adds an offset in units of elements to the reference
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference &operator+=(long long offset) {
+
+    offset += offset_;
+
+    long long offset_in_vectors = offset / kElementsPerVector;
+    int offset_in_elements = int(offset % kElementsPerVector);
+    // C++ division truncates toward zero; borrow one vector so offset_ stays in [0, kElementsPerVector).
+    if (offset_in_elements < 0) { offset_in_elements += kElementsPerVector; --offset_in_vectors; }
+    ptr_ += offset_in_vectors;
+    offset_ = offset_in_elements;
+
+    update_element_status();
+    return *this;
+  }
+
+  /// Subtracts an offset in units of elements from the reference
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference &operator-=(int offset) {
+
+    int offset_in_vectors = offset / kElementsPerVector;
+    int offset_in_elements = offset % kElementsPerVector;
+    ptr_ -= offset_in_vectors;
+    offset_ -= offset_in_elements;
+    // Restore the [0, kElementsPerVector) invariant: borrow for positive offsets, carry for negative ones.
+    if (offset_ < 0) {
+      offset_ += kElementsPerVector;
+      --ptr_;
+    } else if (offset_ >= kElementsPerVector) {
+      offset_ -= kElementsPerVector;
+      ++ptr_;
+    }
+    update_element_status();
+    return *this;
+  }
+
+  /// Subtracts an offset in units of elements from the reference
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference &operator-=(long long offset) {
+
+    long long offset_in_vectors = offset / kElementsPerVector;
+    int offset_in_elements = int(offset % kElementsPerVector);
+    ptr_ -= offset_in_vectors;
+    offset_ -= offset_in_elements;
+    // Restore the [0, kElementsPerVector) invariant: borrow for positive offsets, carry for negative ones.
+    if (offset_ < 0) {
+      offset_ += kElementsPerVector;
+      --ptr_;
+    } else if (offset_ >= kElementsPerVector) {
+      offset_ -= kElementsPerVector;
+      ++ptr_;
+    }
+    update_element_status();
+    return *this;
+  }
+
+  /// Returns a reference to an element with a given offset from the current reference
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference operator+(int offset) const {
+    // Copy the current position: ptr_ is a StorageVecPointer, which the (Element const *, offset) constructor cannot accept.
+    ConstSubbyteReference ref(*this);
+    ref += offset;
+
+    return ref;
+  }
+
+  /// Returns a reference to an element with a given offset from the current reference
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference operator+(long long offset) const {
+    // Copy the current position: ptr_ is a StorageVecPointer, which the (Element const *, offset) constructor cannot accept.
+    ConstSubbyteReference ref(*this);
+    ref += offset;
+
+    return ref;
+  }
+
+  /// Returns a reference to an element with a given offset before the current reference
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference operator-(int offset) const {
+    // Copy the current position: ptr_ is a StorageVecPointer, which the (Element const *, offset) constructor cannot accept.
+    ConstSubbyteReference ref(*this);
+    ref -= offset;
+
+    return ref;
+  }
+
+  /// Returns a copy moved back by `offset` elements. NOTE(review): named operator-= yet const and by-value like operator-; name kept for compatibility.
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference operator-=(long long offset) const {
+    // Copy the current position: ptr_ is a StorageVecPointer, which the (Element const *, offset) constructor cannot accept.
+    ConstSubbyteReference ref(*this);
+    ref -= offset;
+
+    return ref;
+  }
+
+  /// Computes the difference in elements between references
+  CUTLASS_HOST_DEVICE
+  ptrdiff_t operator-(ConstSubbyteReference ref) const {
+    return (ptr_ - ref.ptr_) * kElementsPerVector + (offset_ - ref.offset_);
+  }
+
+  /// Explicit cast to int
+  CUTLASS_HOST_DEVICE
+  explicit operator int() const {
+    return int(get());
+  }
+
+  /// Explicit cast to signed 64-bit integer
+  CUTLASS_HOST_DEVICE
+  explicit operator int64_t() const {
+    return int64_t(get());
+  }
+
+  /// Explicit cast to unsigned 64-bit integer
+  CUTLASS_HOST_DEVICE
+  explicit operator uint64_t() const {
+    return uint64_t(get());
+  }
+
+  /// Explicit cast to float
+  CUTLASS_HOST_DEVICE
+  explicit operator float() const {
+    return float(get());
+  }
+
+  /// Explicit cast to double
+  CUTLASS_HOST_DEVICE
+  explicit operator double() const {
+    return double(get());
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Element, bool subbyte = (sizeof_bits<Element>::value < 8)>
+struct ReferenceFactory;  // Primary template is only declared; `subbyte` defaults to true for elements narrower than 8 bits.
+
+template <typename Element>
+struct ReferenceFactory<Element, false> {
+  /// Specialization selected when `subbyte` is false: ordinary references and pointer arithmetic are used.
+  ///! Each storage vector holds exactly one element
+  static int const kElementsPerVector = 1;
+  /// Yields a mutable reference to the element `offset` positions past `ptr`
+  CUTLASS_HOST_DEVICE
+  static Element &get(Element *ptr, int64_t offset) {
+    return *(ptr + offset);
+  }
+  /// Yields a read-only reference to the element `offset` positions past `ptr`
+  CUTLASS_HOST_DEVICE
+  static Element const &get(Element const *ptr, int64_t offset) {
+    return *(ptr + offset);
+  }
+  /// Yields `ptr` advanced by `offset` elements
+  CUTLASS_HOST_DEVICE
+  static Element *add_pointer_offset(Element *ptr, int64_t offset) {
+    return ptr += offset;
+  }
+  /// Yields `ptr` (pointer to const) advanced by `offset` elements
+  CUTLASS_HOST_DEVICE
+  static Element const *add_pointer_offset(Element const *ptr, int64_t offset) {
+    return ptr += offset;
+  }
+};
+
+template <typename Element>
+struct ReferenceFactory<Element, true> {
+  /// Specialization selected when `subbyte` is true: element access goes through proxy reference objects.
+  //
+  // Static methods
+  //
+  /// Returns a SubbyteReference proxy constructed from `ptr` and `offset`
+  CUTLASS_HOST_DEVICE
+  static SubbyteReference<Element> get(Element *ptr, int64_t offset) {
+    return SubbyteReference<Element>(ptr, offset);
+  }
+  /// Returns a ConstSubbyteReference proxy constructed from `ptr` and `offset`
+  CUTLASS_HOST_DEVICE
+  static ConstSubbyteReference<Element> get(Element const *ptr,
+                                             int64_t offset) {
+    return ConstSubbyteReference<Element>(ptr, offset);
+  }
+
+  /// Helper to add an offset in number of elements, assuming this offset is divisible by the vector size.
+  /// NOTE(review): applies `&` to a temporary SubbyteReference; relies on an operator& overload not visible here -- confirm.
+  CUTLASS_HOST_DEVICE
+  static Element *add_pointer_offset(Element *ptr, int64_t offset_in_elements) {
+    return &SubbyteReference<Element>(ptr, offset_in_elements);
+  }
+
+  /// Helper to add an offset in number of elements, assuming this offset is divisible by the vector size.
+  /// NOTE(review): applies `&` to a temporary ConstSubbyteReference; relies on an operator& overload not visible here -- confirm.
+  CUTLASS_HOST_DEVICE
+  static Element const *add_pointer_offset(Element const *ptr, int64_t offset_in_elements) {
+    return &ConstSubbyteReference<Element>(ptr, offset_in_elements);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/tensor_coord.h b/lightllm-kernel/cutlass/include/cutlass/tensor_coord.h
new file mode 100755
index 000000000..982ec4e03
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/tensor_coord.h
@@ -0,0 +1,326 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines canonical coordinates for rank=4 and rank=5 tensors offering named indices.
+*/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/coord.h"
+
+namespace cutlass {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Rank-4 coordinate with named accessors for its four components (n, h, w, c).
+struct Tensor4DCoord : public Coord<4> {
+
+  /// Underlying generic coordinate type
+  using Base = Coord<4>;
+
+  /// Type of a single coordinate component
+  using Index = Base::Index;
+
+  /// Wider integer type taken from the base coordinate
+  using LongIndex = Base::LongIndex;
+
+  /// Position of the batch component
+  static int const kN = 0;
+
+  /// Position of the height component
+  static int const kH = 1;
+
+  /// Position of the width component
+  static int const kW = 2;
+
+  /// Position of the channel component
+  static int const kC = 3;
+
+  //
+  // Construction and component access
+  //
+
+  /// Default construction delegates to the base coordinate's default constructor
+  CUTLASS_HOST_DEVICE
+  Tensor4DCoord(): Base() { }
+
+  /// Wraps an existing Coord<4>
+  CUTLASS_HOST_DEVICE
+  Tensor4DCoord(Base const &coord): Base(coord) { }
+
+  /// Builds the coordinate from its N, H, W and C components
+  CUTLASS_HOST_DEVICE
+  Tensor4DCoord(Index n, Index h, Index w, Index c): Base(make_Coord(n, h, w, c)) { }
+
+  /// Builds the coordinate from LongIndex components, each converted to Index
+  CUTLASS_HOST_DEVICE
+  Tensor4DCoord(LongIndex n, LongIndex h, LongIndex w, LongIndex c)
+    : Base(make_Coord(static_cast<Index>(n), static_cast<Index>(h), static_cast<Index>(w), static_cast<Index>(c))) { }
+
+  /// Batch component (read-only)
+  CUTLASS_HOST_DEVICE
+  Index const & n() const { return Base::at(kN); }
+
+  /// Batch component (mutable)
+  CUTLASS_HOST_DEVICE
+  Index & n() { return Base::at(kN); }
+
+  /// Height (row) component (read-only)
+  CUTLASS_HOST_DEVICE
+  Index const & h() const { return Base::at(kH); }
+
+  /// Height (row) component (mutable)
+  CUTLASS_HOST_DEVICE
+  Index & h() { return Base::at(kH); }
+
+  /// Width (column) component (read-only)
+  CUTLASS_HOST_DEVICE
+  Index const & w() const { return Base::at(kW); }
+
+  /// Width (column) component (mutable)
+  CUTLASS_HOST_DEVICE
+  Index & w() { return Base::at(kW); }
+
+  /// Channel component (read-only)
+  CUTLASS_HOST_DEVICE
+  Index const & c() const { return Base::at(kC); }
+
+  /// Channel component (mutable)
+  CUTLASS_HOST_DEVICE
+  Index & c() { return Base::at(kC); }
+
+  //
+  // Arithmetic forwarded to the base coordinate
+  //
+
+  /// Sum computed by the base operator+, returned as a Tensor4DCoord
+  CUTLASS_HOST_DEVICE
+  Tensor4DCoord operator+(Base const& b) const {
+    return Base::operator+(b);
+  }
+
+  /// Difference computed by the base operator-, returned as a Tensor4DCoord
+  CUTLASS_HOST_DEVICE
+  Tensor4DCoord operator-(Base const& b) const {
+    return Base::operator-(b);
+  }
+
+  /// Product computed by the base operator*, returned as a Tensor4DCoord
+  CUTLASS_HOST_DEVICE
+  Tensor4DCoord operator*(Base const& b) const {
+    return Base::operator*(b);
+  }
+
+  /// Quotient computed by the base operator/, returned as a Tensor4DCoord
+  CUTLASS_HOST_DEVICE
+  Tensor4DCoord operator/(Base const& b) const {
+    return Base::operator/(b);
+  }
+
+  /// Applies the base operator+= and returns this coordinate
+  CUTLASS_HOST_DEVICE
+  Tensor4DCoord& operator+=(Base const& b) {
+    this->Base::operator+=(b);
+    return *this;
+  }
+
+  /// Applies the base operator-= and returns this coordinate
+  CUTLASS_HOST_DEVICE
+  Tensor4DCoord& operator-=(Base const& b) {
+    this->Base::operator-=(b);
+    return *this;
+  }
+
+  /// Applies the base operator*= and returns this coordinate
+  CUTLASS_HOST_DEVICE
+  Tensor4DCoord& operator*=(Base const& b) {
+    this->Base::operator*=(b);
+    return *this;
+  }
+
+  /// Applies the base operator/= and returns this coordinate
+  CUTLASS_HOST_DEVICE
+  Tensor4DCoord& operator/=(Base const& b) {
+    this->Base::operator/=(b);
+    return *this;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Rank-5 coordinate with named accessors for its five components (n, d, h, w, c).
+struct Tensor5DCoord : public Coord<5> {
+
+  /// Underlying generic coordinate type
+  using Base = Coord<5>;
+
+  /// Type of a single coordinate component
+  using Index = Base::Index;
+
+  /// Wider integer type taken from the base coordinate
+  using LongIndex = Base::LongIndex;
+
+  /// Position of the batch component
+  static int const kN = 0;
+
+  /// Position of the depth component
+  static int const kD = 1;
+
+  /// Position of the height component
+  static int const kH = 2;
+
+  /// Position of the width component
+  static int const kW = 3;
+
+  /// Position of the channel component
+  static int const kC = 4;
+
+  //
+  // Construction and component access
+  //
+
+  /// Default construction delegates to the base coordinate's default constructor
+  CUTLASS_HOST_DEVICE
+  Tensor5DCoord(): Base() { }
+
+  /// Wraps an existing Coord<5>
+  CUTLASS_HOST_DEVICE
+  Tensor5DCoord(Base const &coord): Base(coord) { }
+
+  /// Builds the coordinate from its N, D, H, W and C components
+  CUTLASS_HOST_DEVICE
+  Tensor5DCoord(Index n, Index d, Index h, Index w, Index c): Base(make_Coord(n, d, h, w, c)) { }
+
+  /// Builds the coordinate from LongIndex components, each converted to Index
+  CUTLASS_HOST_DEVICE
+  Tensor5DCoord(LongIndex n, LongIndex d, LongIndex h, LongIndex w, LongIndex c)
+    : Base(make_Coord(static_cast<Index>(n), static_cast<Index>(d), static_cast<Index>(h), static_cast<Index>(w), static_cast<Index>(c))) { }
+
+  /// Batch component (read-only)
+  CUTLASS_HOST_DEVICE
+  Index const & n() const { return Base::at(kN); }
+
+  /// Batch component (mutable)
+  CUTLASS_HOST_DEVICE
+  Index & n() { return Base::at(kN); }
+
+  /// Depth component (read-only)
+  CUTLASS_HOST_DEVICE
+  Index const & d() const { return Base::at(kD); }
+
+  /// Depth component (mutable)
+  CUTLASS_HOST_DEVICE
+  Index & d() { return Base::at(kD); }
+
+  /// Height (row) component (read-only)
+  CUTLASS_HOST_DEVICE
+  Index const & h() const { return Base::at(kH); }
+
+  /// Height (row) component (mutable)
+  CUTLASS_HOST_DEVICE
+  Index & h() { return Base::at(kH); }
+
+  /// Width (column) component (read-only)
+  CUTLASS_HOST_DEVICE
+  Index const & w() const { return Base::at(kW); }
+
+  /// Width (column) component (mutable)
+  CUTLASS_HOST_DEVICE
+  Index & w() { return Base::at(kW); }
+
+  /// Channel component (read-only)
+  CUTLASS_HOST_DEVICE
+  Index const & c() const { return Base::at(kC); }
+
+  /// Channel component (mutable)
+  CUTLASS_HOST_DEVICE
+  Index & c() { return Base::at(kC); }
+
+  //
+  // Arithmetic forwarded to the base coordinate
+  //
+
+  /// Sum computed by the base operator+, returned as a Tensor5DCoord
+  CUTLASS_HOST_DEVICE
+  Tensor5DCoord operator+(Base const& b) const {
+    return Base::operator+(b);
+  }
+
+  /// Difference computed by the base operator-, returned as a Tensor5DCoord
+  CUTLASS_HOST_DEVICE
+  Tensor5DCoord operator-(Base const& b) const {
+    return Base::operator-(b);
+  }
+
+  /// Product computed by the base operator*, returned as a Tensor5DCoord
+  CUTLASS_HOST_DEVICE
+  Tensor5DCoord operator*(Base const& b) const {
+    return Base::operator*(b);
+  }
+
+  /// Quotient computed by the base operator/, returned as a Tensor5DCoord
+  CUTLASS_HOST_DEVICE
+  Tensor5DCoord operator/(Base const& b) const {
+    return Base::operator/(b);
+  }
+
+  /// Applies the base operator+= and returns this coordinate
+  CUTLASS_HOST_DEVICE
+  Tensor5DCoord& operator+=(Base const& b) {
+    this->Base::operator+=(b);
+    return *this;
+  }
+
+  /// Applies the base operator-= and returns this coordinate
+  CUTLASS_HOST_DEVICE
+  Tensor5DCoord& operator-=(Base const& b) {
+    this->Base::operator-=(b);
+    return *this;
+  }
+
+  /// Applies the base operator*= and returns this coordinate
+  CUTLASS_HOST_DEVICE
+  Tensor5DCoord& operator*=(Base const& b) {
+    this->Base::operator*=(b);
+    return *this;
+  }
+
+  /// Applies the base operator/= and returns this coordinate
+  CUTLASS_HOST_DEVICE
+  Tensor5DCoord& operator/=(Base const& b) {
+    this->Base::operator/=(b);
+    return *this;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/tensor_ref.h b/lightllm-kernel/cutlass/include/cutlass/tensor_ref.h
new file mode 100755
index 000000000..1191f651e
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/tensor_ref.h
@@ -0,0 +1,419 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines a structure containing strides, bounds, and a pointer to tensor data.
+*/
+#pragma once
+
+
+#include "cutlass/cutlass.h"
+#include "cutlass/coord.h"
+#include "cutlass/platform/platform.h"
+#include "cutlass/subbyte_reference.h"
+
+namespace cutlass {
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Default layout function from coordinates in a tensor's index space into the n-D array held
+/// in memory. The linear offset of a coordinate is its dot product with the stride vector.
+///
+/// All layout functions must define at least the members shown in IdentityTensorLayout<>.
+template <int Rank>
+class IdentityTensorLayout {
+public:
+  /// Logical rank of tensor
+  static int const kRank = Rank;
+
+  /// Rank of stride vector
+  static int const kStrideRank = Rank;
+
+  /// Index type used for coordinates
+  using Index = int32_t;
+
+  /// Long index type used for offsets
+  using LongIndex = int64_t;
+
+  /// Logical coordinate
+  using TensorCoord = Coord<kRank, Index>;
+
+  /// Stride vector
+  using Stride = Coord<kStrideRank, Index>;
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Stride data member
+  Stride stride_;
+
+public:
+
+  //
+  // Methods
+  //
+  /// Constructs the layout from a stride vector (a default-constructed Stride when omitted)
+  CUTLASS_HOST_DEVICE
+  IdentityTensorLayout(Stride const &stride = Stride()): stride_(stride) { }
+
+  /// Returns the offset of a coordinate in linear memory
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(Coord<Rank> const &coord) const {
+    return coord.dot(stride_);
+  }
+
+  /// Returns a copy of the layout's stride vector
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return stride_;
+  }
+
+  /// Returns a mutable reference to the layout's stride vector
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return stride_;
+  }
+
+  /// Compute the number of contiguous elements needed to store a tensor with the given size
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &size) const {
+    int idx = stride_.max_dim_index();
+    return LongIndex(stride_[idx]) * LongIndex(size[idx]);  // widen first: an Index * Index product can overflow 32 bits
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/* \brief TensorRef is a template for objects pointing to the start of tensors of arbitrary rank
+          and layout within memory. A TensorRef combines a pointer and a Layout concept
+
+  Examples:
+
+  (These examples use helpers for matrix layouts defined in cutlass/layout/matrix.h)
+
+  1. Column-major matrix may be represented as a rank=2 tensor:
+
+    TensorRef<float, layout::ColumnMajor> A(ptr_A, ldm);
+
+  2. Row-major matrix may be represented as a rank=2 tensor:
+
+    TensorRef<float, layout::RowMajor> B(ptr_B, ldm);
+
+  3. An interleaved matrix may be represented as a rank=2 tensor:
+
+    TensorRef<int8_t, layout::ColumnMajorInterleaved<32> > C;
+
+  4. A helper exists to define a TensorRef for a contiguous matrix whose layout
+     is not known at compile time.
+
+    int ldm;                     // leading dimension
+    layout::Matrix kind;         // Could be layout::Matrix::kRowMajor or layout::Matrix::kColumnMajor
+    
+
+    TensorRef<int, layout::ContiguousMatrix> E(ptr_E, {ldm, kind});
+
+*/
+template <
+  /// Data type of element stored within tensor (concept: NumericType)
+  typename Element_,
+  /// Defines a mapping from logical coordinate to linear memory (concept: Layout)
+  typename Layout_
+>
+class TensorRef {
+ public:
+  /// Data type of individual access
+  using Element = Element_;
+
+  /// Mapping function from logical coordinate to linear memory
+  using Layout = Layout_;
+
+  /// Reference type to an element: a plain `Element &` for elements of 8 bits or more, a SubbyteReference proxy otherwise
+  using Reference = typename platform::conditional<
+    sizeof_bits<Element>::value >= 8,
+    Element &,
+    SubbyteReference<Element>
+    >::type;
+
+  /// Logical rank of tensor index space
+  static int const kRank = Layout::kRank;
+
+  /// Index type
+  using Index = typename Layout::Index;
+
+  /// Long index used for pointer offsets
+  using LongIndex = typename Layout::LongIndex;
+
+  /// Coordinate in logical tensor space
+  using TensorCoord = typename Layout::TensorCoord;
+
+  /// Layout's stride vector
+  using Stride = typename Layout::Stride;
+
+  /// TensorRef to constant data
+  using ConstTensorRef = TensorRef<
+    typename platform::remove_const<Element>::type const,
+    Layout>;
+
+  /// TensorRef to non-constant data
+  using NonConstTensorRef = TensorRef<
+    typename platform::remove_const<Element>::type,
+    Layout>;
+
+  /// Require at least rank=1. Mathematically, a rank=0 tensor would be considered to be a
+  /// scalar, but degenerate cases such as these are difficult to accommodate without
+  /// extensive C++ metaprogramming or support for zero-length arrays.
+  static_assert(kRank > 0, "Cannot define a zero-rank TensorRef");
+
+ private:
+
+  /// Pointer
+  Element* ptr_;
+
+  /// Layout object maps logical coordinates to linear offsets
+  Layout layout_;
+
+ public:
+
+  //
+  // Methods
+  //
+
+  /// Constructs a null TensorRef: the pointer is nullptr and the layout is default-initialized.
+  CUTLASS_HOST_DEVICE
+  TensorRef(): ptr_(nullptr) {
+  
+  }
+
+  /// Constructs a TensorRef with a pointer and layout object.
+  CUTLASS_HOST_DEVICE
+  TensorRef(
+    Element *ptr,                   ///< pointer to start of tensor
+    Layout const &layout            ///< layout object containing stride and mapping function
+  ):
+    ptr_(ptr), layout_(layout) {
+  
+  }
+
+  /// Converting constructor from TensorRef to non-constant data.
+  template<typename _Magic = int>
+  CUTLASS_HOST_DEVICE
+  TensorRef(
+    NonConstTensorRef const &ref,              ///< TensorRef to non-const data
+    ///SFINAE trick to avoid creating a copy-constructor when Element_ is already non-const
+    _Magic magic = (typename platform::enable_if< ! platform::is_same<NonConstTensorRef, TensorRef<Element_, Layout_> >::value, _Magic>::type)0
+  ):
+    ptr_(ref.data()), layout_(ref.layout()) { }
+
+  /// Returns a TensorRef to constant data sharing this object's pointer and layout.
+  CUTLASS_HOST_DEVICE
+  ConstTensorRef const_ref() const {
+    return ConstTensorRef(ptr_, layout_);
+  }
+  /// Returns a TensorRef to non-constant data; the pointer's constness is removed with const_cast.
+  CUTLASS_HOST_DEVICE
+  NonConstTensorRef non_const_ref() const {
+    return NonConstTensorRef(const_cast<typename platform::remove_const<Element>::type *>(ptr_), layout_);
+  }
+
+  /// Updates only the pointer (to nullptr when no argument is given); the layout is left unchanged
+  CUTLASS_HOST_DEVICE
+  void reset(Element* ptr = nullptr) {
+    ptr_ = ptr;
+  }
+
+  /// Updates the pointer and layout object
+  CUTLASS_HOST_DEVICE
+  void reset(Element* ptr, Layout const &layout) {
+    ptr_ = ptr;
+    layout_ = layout;
+  }
+
+  /// Returns true if the TensorRef is non-null
+  CUTLASS_HOST_DEVICE
+  bool good() const {
+    return ptr_ != nullptr;
+  }
+
+  /// Returns the pointer to referenced data
+  CUTLASS_HOST_DEVICE
+  Element * data() const { return ptr_; }
+
+  /// Returns a reference to the element at a given linear index, obtained through ReferenceFactory
+  CUTLASS_HOST_DEVICE
+  Reference data(LongIndex idx) const {
+    return ReferenceFactory<typename platform::remove_const<Element>::type,
+                            (sizeof_bits<Element>::value < 8)>::get(ptr_, idx);
+  }
+
+  /// Returns a mutable reference to the layout object
+  CUTLASS_HOST_DEVICE
+  Layout & layout() {
+    return layout_;
+  }
+
+  /// Returns a copy of the layout object
+  CUTLASS_HOST_DEVICE
+  Layout layout() const {
+    return layout_;
+  }
+
+  /// Returns the layout object's stride vector
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return layout_.stride();
+  }
+
+  /// Returns a mutable reference to the layout object's stride vector
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return layout_.stride();
+  }
+
+  /// Returns the layout object's stride in a given physical dimension
+  CUTLASS_HOST_DEVICE
+  typename Layout::Stride::Index stride(int dim) const {
+    return layout_.stride().at(dim);
+  }
+
+  /// Returns a mutable reference to the layout object's stride in a given physical dimension
+  CUTLASS_HOST_DEVICE
+  typename Layout::Stride::Index & stride(int dim) {
+    return layout_.stride().at(dim);
+  }
+
+  /// Computes the offset of an index from the origin of the tensor
+  CUTLASS_HOST_DEVICE
+  LongIndex offset(TensorCoord const& coord) const {
+    return layout_(coord);
+  }
+
+  /// Returns a reference to the element at a given Coord
+  CUTLASS_HOST_DEVICE
+  Reference at(TensorCoord const& coord) const {
+    return data(offset(coord));
+  }
+
+  /// Returns a reference to the element at a given Coord
+  CUTLASS_HOST_DEVICE
+  Reference operator[](TensorCoord const& coord) const {
+    return data(offset(coord));
+  }
+
+  /// Advances the pointer in place by the linear offset `offset_` (via ReferenceFactory) and returns *this
+  CUTLASS_HOST_DEVICE
+  TensorRef & add_pointer_offset(LongIndex offset_) {
+    ptr_ = ReferenceFactory<typename platform::remove_const<Element>::type,
+           (sizeof_bits<Element>::value < 8)>::add_pointer_offset(ptr_, offset_);
+    return *this;
+  }
+
+  /// Advances the pointer in place by the linear offset the layout assigns to `coord` and returns *this
+  CUTLASS_HOST_DEVICE
+  TensorRef & add_coord_offset(TensorCoord const &coord) {
+    add_pointer_offset(offset(coord));
+    return *this;
+  }
+
+  /// Returns a copy of this TensorRef advanced by the offset of coordinate `b`
+  CUTLASS_HOST_DEVICE
+  TensorRef operator+(TensorCoord const& b) const {
+    TensorRef result(*this);
+    result.add_coord_offset(b);
+    return result;
+  }
+
+  /// Advances this TensorRef in place by the offset of coordinate `b` and returns *this
+  CUTLASS_HOST_DEVICE
+  TensorRef & operator+=(TensorCoord const& b) {
+    add_coord_offset(b);
+    return *this;
+  }
+
+  /// Returns a copy of this TensorRef moved back by the offset of coordinate `b`
+  CUTLASS_HOST_DEVICE
+  TensorRef operator-(TensorCoord const& b) const {
+    TensorRef result(*this);
+    result.add_pointer_offset(-offset(b));
+    return result;
+  }
+
+  /// Moves this TensorRef back in place by the offset of coordinate `b` and returns *this
+  CUTLASS_HOST_DEVICE
+  TensorRef & operator-=(TensorCoord const& b) {
+    add_pointer_offset(-offset(b));
+    return *this;
+  }
+};
+
+/// Factory helper: wraps `ptr` and `layout` in a TensorRef.
+/// The Element and Layout template arguments are deduced from the call,
+/// so callers do not have to spell them out.
+template <typename Element, typename Layout>
+CUTLASS_HOST_DEVICE
+TensorRef<Element, Layout> make_TensorRef(Element *ptr, Layout const &layout) {
+  using Ref = TensorRef<Element, Layout>;
+  return Ref(ptr, layout);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Helper to test the pointer and stride alignment of a TensorRef.
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Reports whether a TensorRef satisfies `alignment`: the numeric value of its data pointer and
+/// every stride component must each leave no remainder when divided by `alignment`.
+template <typename Element, typename Layout>
+CUTLASS_HOST_DEVICE
+bool TensorRef_aligned(TensorRef<Element, Layout> const &ref, int alignment) {
+  // Check the base address first; the strides are only inspected if it passes.
+  uintptr_t const address = reinterpret_cast<uintptr_t>(ref.data());
+  if ((address % alignment) != 0) {
+    return false;
+  }
+
+  int const stride_rank = Layout::kStrideRank;
+
+  CUTLASS_PRAGMA_UNROLL
+  for (int dim = 0; dim < stride_rank; ++dim) {
+    bool const stride_ok = (ref.stride(dim) % alignment) == 0;
+    if (!stride_ok) {
+      return false;
+    }
+  }
+  return true;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/tensor_ref_planar_complex.h b/lightllm-kernel/cutlass/include/cutlass/tensor_ref_planar_complex.h
new file mode 100755
index 000000000..ab354bbaf
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/tensor_ref_planar_complex.h
@@ -0,0 +1,374 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines a structure containing strides, bounds, and a pointer to tensor data.
+*/
+#pragma once
+
+#include <cstdint>
+#include "cutlass/cutlass.h"
+#include "cutlass/complex.h"
+#include "cutlass/tensor_ref.h"
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Element_>
+struct PlanarComplexReference {
+
+  //
+  // Type definitions
+  //
+
+  using Element = Element_;                  ///< scalar type of each component
+  using ComplexElement = complex<Element>;   ///< value type produced by a load
+
+  //
+  // Data members
+  //
+
+  Element *real;                             ///< location of the real component
+  Element *imag;                             ///< location of the imaginary component
+
+  //
+  // Methods
+  //
+
+  /// Binds the proxy to the locations of the two components.
+  /// Both pointers default to null.
+  CUTLASS_HOST_DEVICE
+  PlanarComplexReference(Element *real_ = nullptr, Element *imag_ = nullptr)
+    : real(real_),
+      imag(imag_) { }
+
+  /// Reads both components and packs them into a complex value
+  CUTLASS_HOST_DEVICE
+  operator ComplexElement() const {
+    return ComplexElement{*real, *imag};
+  }
+
+  /// Writes the components of rhs through the two pointers and returns this proxy
+  CUTLASS_HOST_DEVICE
+  PlanarComplexReference &operator=(ComplexElement const &rhs) {
+    *real = rhs.real();
+    *imag = rhs.imag();
+    return *this;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/*! \brief TensorRefPlanarComplex points to the start of a planar complex tensor of arbitrary
+          rank and layout. It combines a pointer to the real part, a Layout object, and the
+          offset (in elements) from each real element to its imaginary counterpart.
+*/
+template <
+  /// Data type of element stored within tensor (concept: NumericType)
+  typename Element_,
+  /// Defines a mapping from logical coordinate to linear memory (concept: Layout)
+  typename Layout_
+>
+class TensorRefPlanarComplex {
+ public:
+  /// Data type of individual access
+  using Element = Element_;
+
+  /// Complex element type
+  using ComplexElement = complex<Element>;
+
+  /// Mapping function from logical coordinate to linear memory
+  using Layout = Layout_;
+
+  static_assert(sizeof_bits<Element>::value >= 8,
+    "Planar complex not suitable for subbyte elements at this time");
+
+  /// Reference type to an element (proxy holding pointers to the real and imaginary parts)
+  using Reference = PlanarComplexReference<Element>;
+
+  /// Logical rank of tensor index space
+  static int const kRank = Layout::kRank;
+
+  /// Index type
+  using Index = typename Layout::Index;
+
+  /// Long index used for pointer offsets
+  using LongIndex = typename Layout::LongIndex;
+
+  /// Coordinate in logical tensor space
+  using TensorCoord = typename Layout::TensorCoord;
+
+  /// Layout's stride vector
+  using Stride = typename Layout::Stride;
+
+  /// TensorRef to constant data
+  using ConstTensorRef = TensorRefPlanarComplex<
+    typename platform::remove_const<Element>::type const,
+    Layout>;
+
+  /// TensorRef to non-constant data
+  using NonConstTensorRef = TensorRefPlanarComplex<
+    typename platform::remove_const<Element>::type,
+    Layout>;
+
+  /// Require at least rank=1. Mathematically, a rank=0 tensor would be considered to be a
+  /// scalar, but degenerate cases such as these are difficult to accommodate without
+  /// extensive C++ metaprogramming or support for zero-length arrays.
+  static_assert(kRank > 0, "Cannot define a zero-rank TensorRef");
+
+ private:
+
+  /// Pointer to the real part
+  Element* ptr_;
+
+  /// Layout object maps logical coordinates to linear offsets
+  Layout layout_;
+
+  /// Offset, in elements, from a real element to its imaginary counterpart
+  LongIndex imaginary_stride_;
+
+ public:
+
+  //
+  // Methods
+  //
+
+  /// Constructs a TensorRefPlanarComplex with a pointer, layout object, and imaginary stride.
+  CUTLASS_HOST_DEVICE
+  TensorRefPlanarComplex(
+    Element *ptr = nullptr,                   ///< pointer to start of tensor
+    Layout const &layout = Layout(),          ///< layout object containing stride and mapping function
+    LongIndex imaginary_stride = 0            ///< offset from the real to the imaginary part
+  ):
+    ptr_(ptr), layout_(layout), imaginary_stride_(imaginary_stride) {
+
+  }
+
+  /// Converting constructor from a TensorRefPlanarComplex to non-constant data. Only public
+  /// accessors are used: when Element is const, NonConstTensorRef is a different class template
+  /// specialization whose private members are not accessible from here.
+  CUTLASS_HOST_DEVICE
+  TensorRefPlanarComplex(NonConstTensorRef const &ref):
+    ptr_(ref.data()), layout_(ref.layout()), imaginary_stride_(ref.imaginary_stride()) { }
+
+  /// Returns a reference to constant-valued tensor.
+  CUTLASS_HOST_DEVICE
+  ConstTensorRef const_ref() const {
+    return ConstTensorRef(ptr_, layout_, imaginary_stride_);
+  }
+
+  /// Returns a reference to the same tensor with const removed from the element type.
+  CUTLASS_HOST_DEVICE
+  NonConstTensorRef non_const_ref() const {
+    using NonConstElement = typename platform::remove_const<Element>::type;
+    return NonConstTensorRef(
+      const_cast<NonConstElement *>(ptr_), layout_, imaginary_stride_);
+  }
+
+  /// Updates the pointer and the imaginary stride; the layout object is left unchanged
+  CUTLASS_HOST_DEVICE
+  void reset(Element* ptr = nullptr, LongIndex imaginary_stride = 0) {
+    ptr_ = ptr;
+    imaginary_stride_ = imaginary_stride;
+  }
+
+  /// Updates the pointer, layout object, and imaginary stride
+  CUTLASS_HOST_DEVICE
+  void reset(Element* ptr, Layout const &layout, LongIndex imaginary_stride) {
+    ptr_ = ptr;
+    layout_ = layout;
+    imaginary_stride_ = imaginary_stride;
+  }
+
+  /// Returns true if the TensorRef is non-null
+  CUTLASS_HOST_DEVICE
+  bool good() const {
+    return ptr_ != nullptr;
+  }
+
+  /// Returns the pointer to the real part of the referenced data
+  CUTLASS_HOST_DEVICE
+  Element * data() const { return ptr_; }
+
+  /// Returns the pointer to the imaginary part of the referenced data
+  CUTLASS_HOST_DEVICE
+  Element * imaginary_data() const { return ptr_ + imaginary_stride_; }
+
+  /// Returns a planar complex reference proxy to the element at a given linear index
+  CUTLASS_HOST_DEVICE
+  Reference data(LongIndex idx) const {
+    return Reference(ptr_ + idx, ptr_ + idx + imaginary_stride_);
+  }
+
+  /// Returns the layout object
+  CUTLASS_HOST_DEVICE
+  Layout & layout() {
+    return layout_;
+  }
+
+  /// Returns a copy of the layout object
+  CUTLASS_HOST_DEVICE
+  Layout layout() const {
+    return layout_;
+  }
+
+  /// Gets the stride from a real element to its imaginary counterpart. Marked host-device so
+  /// that it may be called from other CUTLASS_HOST_DEVICE functions.
+  CUTLASS_HOST_DEVICE
+  LongIndex imaginary_stride() const { return imaginary_stride_; }
+
+  /// Gets a mutable reference to the stride from a real element to its imaginary counterpart.
+  /// Marked host-device for the same reason as the const overload.
+  CUTLASS_HOST_DEVICE
+  LongIndex &imaginary_stride() { return imaginary_stride_; }
+
+  /// Returns the layout object's stride vector
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return layout_.stride();
+  }
+
+  /// Returns the layout object's stride vector
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return layout_.stride();
+  }
+
+  /// Returns the layout object's stride in a given physical dimension
+  CUTLASS_HOST_DEVICE
+  Index stride(int dim) const {
+    return layout_.stride().at(dim);
+  }
+
+  /// Returns the layout object's stride in a given physical dimension
+  CUTLASS_HOST_DEVICE
+  Index & stride(int dim) {
+    return layout_.stride().at(dim);
+  }
+
+  /// Computes the offset of an index from the origin of the tensor
+  CUTLASS_HOST_DEVICE
+  LongIndex offset(TensorCoord const& coord) const {
+    return layout_(coord);
+  }
+
+  /// Returns a reference to the element at a given Coord
+  CUTLASS_HOST_DEVICE
+  Reference at(TensorCoord const& coord) const {
+    return data(offset(coord));
+  }
+
+  /// Returns a reference to the element at a given Coord
+  CUTLASS_HOST_DEVICE
+  Reference operator[](TensorCoord const& coord) const {
+    return data(offset(coord));
+  }
+
+  /// Adds a linear offset to the pointer; the imaginary stride is unchanged
+  CUTLASS_HOST_DEVICE
+  TensorRefPlanarComplex & add_pointer_offset(LongIndex offset_) {
+    ptr_ += offset_;
+    return *this;
+  }
+
+  /// Adds the linear offset of a logical coordinate to the pointer
+  CUTLASS_HOST_DEVICE
+  TensorRefPlanarComplex & add_coord_offset(TensorCoord const &coord) {
+    add_pointer_offset(offset(coord));
+    return *this;
+  }
+
+  /// Returns a copy of this reference moved forward by the linear offset of coordinate b
+  CUTLASS_HOST_DEVICE
+  TensorRefPlanarComplex operator+(TensorCoord const& b) const {
+    TensorRefPlanarComplex result(*this);
+    result.add_coord_offset(b);
+    return result;
+  }
+
+  /// Moves this reference forward in place by the linear offset of coordinate b
+  CUTLASS_HOST_DEVICE
+  TensorRefPlanarComplex & operator+=(TensorCoord const& b) {
+    add_coord_offset(b);
+    return *this;
+  }
+
+  /// Returns a copy of this reference moved backward by the linear offset of coordinate b
+  CUTLASS_HOST_DEVICE
+  TensorRefPlanarComplex operator-(TensorCoord const& b) const {
+    TensorRefPlanarComplex result(*this);
+    result.add_pointer_offset(-offset(b));
+    return result;
+  }
+
+  /// Moves this reference backward in place by the linear offset of coordinate b
+  CUTLASS_HOST_DEVICE
+  TensorRefPlanarComplex & operator-=(TensorCoord const& b) {
+    add_pointer_offset(-offset(b));
+    return *this;
+  }
+
+  /// TensorRef to the real part of the tensor
+  CUTLASS_HOST_DEVICE
+  cutlass::TensorRef<Element, Layout> ref_real() const {
+    return cutlass::TensorRef<Element, Layout>(data(), layout());
+  }
+
+  /// TensorRef to the imaginary part of the tensor
+  CUTLASS_HOST_DEVICE
+  cutlass::TensorRef<Element, Layout> ref_imag() const {
+    return cutlass::TensorRef<Element, Layout>(imaginary_data(), layout());
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Builds a TensorRefPlanarComplex from a pointer, a layout object, and the stride that
+/// separates the real part from the imaginary part. The Element and Layout template
+/// arguments are deduced from the call.
+template <typename Element, typename Layout>
+CUTLASS_HOST_DEVICE
+TensorRefPlanarComplex<Element, Layout> make_TensorRefPlanarComplex(
+  Element *ptr,
+  Layout const &layout,
+  int64_t imaginary_stride) {
+
+  using PlanarRef = TensorRefPlanarComplex<Element, Layout>;
+  return PlanarRef(ptr, layout, imaginary_stride);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/tensor_view.h b/lightllm-kernel/cutlass/include/cutlass/tensor_view.h
new file mode 100755
index 000000000..7defcc24f
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/tensor_view.h
@@ -0,0 +1,297 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines a structure containing strides and a pointer to tensor data.
+
+    TensorView is derived from TensorRef and contributes bounds to the tensor's index space. Thus,
+    it is a complete mathematical object and may be used in tensor algorithms. It is decoupled from
+    data storage and is therefore lightweight and may be embedded in larger tensor objects or
+    memory structures.
+
+    See cutlass/tensor_ref.h for more details about the mapping of the logical tensor index space to
+    linear memory.
+*/
+
+#pragma once
+
+#if !defined(__CUDACC_RTC__)
+#include <cmath>
+#endif
+
+#include "cutlass/cutlass.h"
+#include "cutlass/tensor_ref.h"
+
+namespace cutlass {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  /// Data type of element stored within tensor
+  typename Element_,
+  /// Maps a Coord<Rank_> in the logical tensor index space to the internal n-D array
+  typename Layout_
+>
+class TensorView : public TensorRef<Element_, Layout_> {
+ public:
+
+  /// Base tensor reference
+  using Base = cutlass::TensorRef<Element_, Layout_>;
+
+  /// Mapping function from logical coordinate to internal n-D array
+  using Layout = Layout_;
+
+  /// TensorRef pointing to constant memory
+  using ConstTensorRef = typename Base::ConstTensorRef;
+
+  /// Underlying TensorRef type
+  using TensorRef = Base;
+
+  /// Data type of individual access
+  using Element = Element_;
+
+  /// Reference type to an element
+  using Reference = Element &;
+
+  /// Logical rank of tensor index space
+  static int const kRank = Layout::kRank;
+
+  /// Index type
+  using Index = typename Layout::Index;
+
+  /// Long index used for pointer offsets
+  using LongIndex = typename Layout::LongIndex;
+
+  /// Coordinate in logical tensor space
+  using TensorCoord = typename Layout::TensorCoord;
+
+  /// Layout's stride vector
+  using Stride = typename Layout::Stride;
+
+  /// TensorView pointing to constant memory
+  using ConstTensorView = TensorView<
+    typename platform::remove_const<Element>::type const,
+    Layout>;
+
+  /// TensorView pointing to non-constant memory
+  using NonConstTensorView = TensorView<
+    typename platform::remove_const<Element>::type,
+    Layout>;
+
+  /// Require at least rank=1. Mathematically, a rank=0 tensor would be considered to be a
+  /// scalar, but degenerate cases such as these are difficult to accommodate without
+  /// extensive C++ metaprogramming or support for zero-length arrays.
+  static_assert(kRank > 0, "Cannot define a zero-rank TensorRef");
+
+ private:
+
+  /// View extent
+  TensorCoord extent_;
+
+ public:
+
+  //
+  // Methods
+  //
+
+  /// Constructs a TensorView object with default-constructed base and extent
+  CUTLASS_HOST_DEVICE
+  TensorView() { }
+
+  /// Constructs a TensorView object from a pointer, layout, and extent
+  CUTLASS_HOST_DEVICE
+  TensorView(
+    Element *ptr,                         ///< pointer to start of tensor
+    Layout const &layout,                 ///< layout object containing stride and mapping function
+    TensorCoord const &extent             ///< size of the view in logical coordinates
+  ):
+    Base(ptr, layout), extent_(extent) {
+
+  }
+
+  /// Constructs a TensorView object from an existing TensorRef and an extent
+  CUTLASS_HOST_DEVICE
+  TensorView(
+    TensorRef const &ref,                 ///< pointer and layout object referencing a tensor
+    TensorCoord const &extent             ///< logical size of tensor
+  ):
+    Base(ref), extent_(extent) {
+
+  }
+
+  /// Converting constructor from a TensorView to non-constant data. The extent is read through
+  /// the public accessor: when Element is const, NonConstTensorView is a different class
+  /// template specialization whose private extent_ is not accessible from here.
+  CUTLASS_HOST_DEVICE
+  TensorView(NonConstTensorView const &view):
+    Base(view), extent_(view.extent()) { }
+
+  /// Updates the pointer, layout object, and extent
+  CUTLASS_HOST_DEVICE
+  void reset(Element* ptr, Layout const &layout, TensorCoord const &extent) {
+    Base::reset(ptr, layout);
+    this->resize(extent);
+  }
+
+  /// Updates the pointer
+  CUTLASS_HOST_DEVICE
+  void reset(Element* ptr) {
+    Base::reset(ptr);
+  }
+
+  /// Changes the size of the view without affecting pointer or layout
+  CUTLASS_HOST_DEVICE
+  void resize(TensorCoord const &extent) {
+    this->extent_ = extent;
+  }
+
+  /// Returns the extent of the view (the size along each logical dimension).
+  CUTLASS_HOST_DEVICE
+  TensorCoord const& extent() const { return extent_; }
+
+  /// Returns the extent along a particular logical dimension.
+  CUTLASS_HOST_DEVICE
+  Index extent(int dim) const { return extent_.at(dim); }
+
+  /// Returns the number of logical elements
+  CUTLASS_HOST_DEVICE
+  LongIndex size() const {
+    return extent_.product();
+  }
+
+  /// Determines whether a location is within a tensor
+  CUTLASS_HOST_DEVICE
+  bool contains(TensorCoord const& coord) const {
+    CUTLASS_PRAGMA_UNROLL
+    for (int dim = 0; dim < kRank; ++dim) {
+      if (!(coord[dim] >= 0 && coord[dim] < extent(dim))) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /// Returns a TensorRef pointing to the first element of the tensor.
+  CUTLASS_HOST_DEVICE
+  TensorRef ref() const {
+    return TensorRef(this->data(), this->layout());
+  }
+
+  /// Returns a TensorRef to const data pointing to the first element of the tensor.
+  CUTLASS_HOST_DEVICE
+  ConstTensorRef const_ref() const {
+    return ConstTensorRef(this->data(), this->layout());
+  }
+
+  /// Returns a TensorView to const data
+  CUTLASS_HOST_DEVICE
+  ConstTensorView const_view() const {
+    return ConstTensorView(const_ref(), extent_);
+  }
+
+  /// Returns a TensorView given location and size quantities
+  CUTLASS_HOST_DEVICE
+  TensorView subview(
+    TensorCoord extent,                               ///< extent of the resulting view
+    TensorCoord const& location = TensorCoord()       ///< resulting view's origin within the old view
+  ) const {
+
+    TensorView result(this->ref(), extent.clamp(extent_ - location));
+    result.add_coord_offset(location);
+    return result;
+  }
+
+  /// Returns the number of scalar elements needed to store tensor.
+  CUTLASS_HOST_DEVICE
+  size_t capacity() const {
+    return Base::layout().capacity(extent_);
+  }
+
+  /// Returns a TensorView moved forward by the linear offset of coordinate b; the extent is kept
+  CUTLASS_HOST_DEVICE
+  TensorView operator+(
+    TensorCoord const& b            ///< offset in the logical coordinate space of the tensor
+  ) const {
+
+    TensorView result(*this);
+    result.add_pointer_offset(this->offset(b));
+    return result;
+  }
+
+  /// Moves this TensorView forward in place by the linear offset of coordinate b
+  CUTLASS_HOST_DEVICE
+  TensorView& operator+=(
+    TensorCoord const& b            ///< offset in the logical coordinate space of the tensor
+  ) {
+
+    this->add_pointer_offset(this->offset(b));
+    return *this;
+  }
+
+  /// Returns a TensorView moved backward by the linear offset of coordinate b; the extent is kept
+  CUTLASS_HOST_DEVICE
+  TensorView operator-(
+    TensorCoord const& b            ///< offset in the logical coordinate space of the tensor
+  ) const {
+    // Copy as TensorView (not the base TensorRef): the base is not convertible to the return type.
+    TensorView result(*this);
+    result.add_pointer_offset(-this->offset(b));
+    return result;
+  }
+
+  /// Moves this TensorView backward in place by the linear offset of coordinate b
+  CUTLASS_HOST_DEVICE
+  TensorView& operator-=(
+    TensorCoord const& b            ///< offset in the logical coordinate space of the tensor
+  ) {
+
+    this->add_pointer_offset(-this->offset(b));
+    return *this;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Builds a TensorView from a pointer, a layout object, and a logical extent, deducing
+/// the Element and Layout template arguments from the call.
+template <typename Element, typename Layout>
+CUTLASS_HOST_DEVICE
+TensorView<Element, Layout> make_TensorView(
+  Element *ptr,
+  Layout const &layout,
+  typename Layout::TensorCoord const &extent) {
+
+  using View = TensorView<Element, Layout>;
+  return View(ptr, layout, extent);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/tensor_view_planar_complex.h b/lightllm-kernel/cutlass/include/cutlass/tensor_view_planar_complex.h
new file mode 100755
index 000000000..c98de563f
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/tensor_view_planar_complex.h
@@ -0,0 +1,301 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines a structure containing strides and a pointer to tensor data.
+
+    TensorView is derived from TensorRef and contributes bounds to the tensor's index space. Thus,
+    it is a complete mathematical object and may be used in tensor algorithms. It is decoupled from
+    data storage and is therefore lightweight and may be embedded in larger tensor objects or
+    memory structures.
+
+    See cutlass/tensor_ref.h for more details about the mapping of the logical tensor index space to
+    linear memory.
+*/
+
+#pragma once
+
+#if !defined(__CUDACC_RTC__)
+#include <cmath>
+#endif
+
+#include "cutlass/cutlass.h"
+#include "cutlass/tensor_ref_planar_complex.h"
+
+namespace cutlass {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  /// Data type of element stored within tensor
+  typename Element_,
+  /// Maps a Coord<Rank_> in the logical tensor index space to the internal n-D array
+  typename Layout_
+>
+class TensorViewPlanarComplex : public TensorRefPlanarComplex<Element_, Layout_> {
+ public:
+
+  /// Base tensor reference
+  using Base = cutlass::TensorRefPlanarComplex<Element_, Layout_>;
+
+  /// Mapping function from logical coordinate to internal n-D array
+  using Layout = Layout_;
+
+  /// TensorRef pointing to constant memory
+  using ConstTensorRef = typename Base::ConstTensorRef;
+
+  /// Underlying TensorRef type
+  using TensorRef = Base;
+
+  /// Data type of individual access
+  using Element = Element_;
+
+  /// NOTE(review): accessors inherited from Base return Base::Reference (a proxy), not this alias
+  using Reference = Element &;
+
+  /// Logical rank of tensor index space
+  static int const kRank = Layout::kRank;
+
+  /// Index type
+  using Index = typename Layout::Index;
+
+  /// Long index used for pointer offsets
+  using LongIndex = typename Layout::LongIndex;
+
+  /// Coordinate in logical tensor space
+  using TensorCoord = typename Layout::TensorCoord;
+
+  /// Layout's stride vector
+  using Stride = typename Layout::Stride;
+
+  /// TensorView pointing to constant memory
+  using ConstTensorView = TensorViewPlanarComplex<
+    typename platform::remove_const<Element>::type const,
+    Layout>;
+
+  /// TensorView pointing to non-constant memory
+  using NonConstTensorView = TensorViewPlanarComplex<
+    typename platform::remove_const<Element>::type,
+    Layout>;
+
+  /// Require at least rank=1. Mathematically, a rank=0 tensor would be considered to be a
+  /// scalar, but degenerate cases such as these are difficult to accommodate without
+  /// extensive C++ metaprogramming or support for zero-length arrays.
+  static_assert(kRank > 0, "Cannot define a zero-rank TensorRef");
+
+ private:
+
+  /// View extent
+  TensorCoord extent_;
+
+ public:
+
+  //
+  // Methods
+  //
+
+  /// Constructs a view with a default-constructed base reference and the given extent
+  CUTLASS_HOST_DEVICE
+  TensorViewPlanarComplex(TensorCoord const &extent = TensorCoord()): extent_(extent) {
+
+  }
+
+  /// Constructs a view from a pointer, layout, imaginary stride, and extent
+  CUTLASS_HOST_DEVICE
+  TensorViewPlanarComplex(
+    Element *ptr,                         ///< pointer to start of tensor
+    Layout const &layout,                 ///< layout object containing stride and mapping function
+    LongIndex imaginary_stride,           ///< stride between real and imaginary part
+    TensorCoord const &extent             ///< size of the view in logical coordinates
+  ):
+    Base(ptr, layout, imaginary_stride), extent_(extent) {
+
+  }
+
+  /// Constructs a view from an existing planar complex reference and an extent
+  CUTLASS_HOST_DEVICE
+  TensorViewPlanarComplex(
+    TensorRef const &ref,                 ///< pointer and layout object referencing a tensor
+    TensorCoord const &extent             ///< logical size of tensor
+  ):
+    Base(ref), extent_(extent) {
+
+  }
+
+  /// Converting constructor from a view to non-constant data. The extent is read through the
+  /// public accessor: when Element is const, NonConstTensorView is a different class template
+  /// specialization whose private extent_ is not accessible from here.
+  CUTLASS_HOST_DEVICE
+  TensorViewPlanarComplex(NonConstTensorView const &view):
+    Base(view), extent_(view.extent()) { }
+
+  /// Updates the pointer, layout object, imaginary stride, and extent
+  CUTLASS_HOST_DEVICE
+  void reset(Element* ptr, Layout const &layout, LongIndex imaginary_stride, TensorCoord size) {
+    Base::reset(ptr, layout, imaginary_stride);
+    this->resize(size);  // apply the caller's extent (previously re-assigned extent_ to itself)
+  }
+
+  /// Changes the size of the view without affecting pointer or layout
+  CUTLASS_HOST_DEVICE
+  void resize(TensorCoord extent) {
+    this->extent_ = extent;
+  }
+
+  /// Returns the extent of the view (the size along each logical dimension).
+  CUTLASS_HOST_DEVICE
+  TensorCoord const& extent() const { return extent_; }
+
+  /// Returns the extent along a particular logical dimension.
+  CUTLASS_HOST_DEVICE
+  Index extent(int dim) const { return extent_.at(dim); }
+
+  /// Determines whether a location is within a tensor
+  CUTLASS_HOST_DEVICE
+  bool contains(TensorCoord const& coord) const {
+    CUTLASS_PRAGMA_UNROLL
+    for (int dim = 0; dim < kRank; ++dim) {
+      if (!(coord[dim] >= 0 && coord[dim] < extent(dim))) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /// Returns a planar complex reference pointing to the first element of the tensor.
+  CUTLASS_HOST_DEVICE
+  Base ref() const {
+    return Base(this->data(), this->layout(), this->imaginary_stride());
+  }
+
+  /// Returns a reference to const data that keeps this view's imaginary stride.
+  CUTLASS_HOST_DEVICE
+  ConstTensorRef const_ref() const {
+    return ConstTensorRef(this->data(), this->layout(), this->imaginary_stride());
+  }
+
+  /// Returns a TensorView to const data
+  CUTLASS_HOST_DEVICE
+  ConstTensorView const_view() const {
+    return ConstTensorView(const_ref(), extent_);
+  }
+
+  /// Returns a view given location and size quantities
+  CUTLASS_HOST_DEVICE
+  TensorViewPlanarComplex subview(
+    TensorCoord extent,                               ///< extent of the resulting view
+    TensorCoord const& location = TensorCoord()       ///< resulting view's origin within the old view
+  ) const {
+
+    TensorViewPlanarComplex result(this->ref(), extent.clamp(extent_ - location));
+    result.add_coord_offset(location);
+    return result;
+  }
+
+  /// Returns the number of scalar elements needed to store tensor.
+  CUTLASS_HOST_DEVICE
+  size_t capacity() const {
+    return Base::layout().capacity(extent_);
+  }
+
+  /// Returns a view moved forward by the linear offset of coordinate b; the extent is kept
+  CUTLASS_HOST_DEVICE
+  TensorViewPlanarComplex operator+(
+    TensorCoord const& b            ///< offset in the logical coordinate space of the tensor
+  ) const {
+
+    TensorViewPlanarComplex result(*this);
+    result.add_pointer_offset(this->offset(b));
+    return result;
+  }
+
+  /// Moves this view forward in place by the linear offset of coordinate b
+  CUTLASS_HOST_DEVICE
+  TensorViewPlanarComplex& operator+=(
+    TensorCoord const& b            ///< offset in the logical coordinate space of the tensor
+  ) {
+
+    this->add_pointer_offset(this->offset(b));
+    return *this;
+  }
+
+  /// Returns a view moved backward by the linear offset of coordinate b; the extent is kept
+  CUTLASS_HOST_DEVICE
+  TensorViewPlanarComplex operator-(
+    TensorCoord const& b            ///< offset in the logical coordinate space of the tensor
+  ) const {
+    // Copy as the view type (not the base ref): the base is not convertible to the return type.
+    TensorViewPlanarComplex result(*this);
+    result.add_pointer_offset(-this->offset(b));
+    return result;
+  }
+
+  /// Moves this view backward in place by the linear offset of coordinate b
+  CUTLASS_HOST_DEVICE
+  TensorViewPlanarComplex& operator-=(
+    TensorCoord const& b            ///< offset in the logical coordinate space of the tensor
+  ) {
+
+    this->add_pointer_offset(-this->offset(b));
+    return *this;
+  }
+
+  /// TensorView of the real part of the tensor
+  CUTLASS_HOST_DEVICE
+  cutlass::TensorView<Element, Layout> view_real() const {
+    return cutlass::TensorView<Element, Layout>(this->data(), this->layout(), extent_);
+  }
+
+  /// TensorView of the imaginary part of the tensor
+  CUTLASS_HOST_DEVICE
+  cutlass::TensorView<Element, Layout> view_imag() const {
+    return cutlass::TensorView<Element, Layout>(this->imaginary_data(), this->layout(), extent_);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Builds a TensorViewPlanarComplex from a pointer, layout, imaginary stride, and extent,
+/// deducing the Element and Layout template arguments from the call.
+template <typename Element, typename Layout>
+CUTLASS_HOST_DEVICE
+TensorViewPlanarComplex<Element, Layout> make_TensorViewPlanarComplex(
+  Element *ptr,
+  Layout const &layout,
+  typename Layout::LongIndex imaginary_stride,
+  typename Layout::TensorCoord const &extent) {
+
+  using PlanarView = TensorViewPlanarComplex<Element, Layout>;
+  return PlanarView(ptr, layout, imaginary_stride, extent);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/tfloat32.h b/lightllm-kernel/cutlass/include/cutlass/tfloat32.h
new file mode 100755
index 000000000..8e7ab884c
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/tfloat32.h
@@ -0,0 +1,478 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*!
+    \file
+    \brief Defines a proxy class for storing Tensor Float 32 data type.
+*/
+#pragma once
+
+#if defined(__CUDACC_RTC__)
+#include "cutlass/floating_point_nvrtc.h"
+#else
+#include <cmath>
+#include <limits>
+#include <cstdint>
+#endif
+
+#include "cutlass/cutlass.h"
+
+namespace cutlass {
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Tensor Float 32 proxy type: a 32-bit word whose low 13 bits are masked off when read back as float
+struct alignas(4) tfloat32_t {
+
+  //
+  // Data members
+  //
+
+  /// Raw 32-bit storage
+  uint32_t storage;
+
+  //
+  // Methods
+  //
+  private:
+    CUTLASS_HOST_DEVICE
+    static uint32_t float_to_storage(float s) {  // returns the bit pattern of s, unmodified
+  #if defined(__CUDA_ARCH__)
+      uint32_t result = reinterpret_cast<uint32_t const &>(s);
+  #else
+      uint32_t result;
+      std::memcpy(&result, &s, sizeof(float));
+  #endif
+      return result;
+    }
+
+  public:
+  /// Builds a value directly from a raw bit pattern; no rounding is applied
+  CUTLASS_HOST_DEVICE
+  static tfloat32_t bitcast(uint32_t x) {
+    tfloat32_t h;
+    h.storage = x;
+    return h;
+  }
+
+  /// Emulated rounding (fast in device code): adds 0x1000 to the bits of finite inputs; non-finite inputs pass through
+  CUTLASS_HOST_DEVICE
+  static tfloat32_t round_half_ulp_truncate(float const &s) {
+    uint32_t x = float_to_storage(s);
+
+    #if defined(__CUDA_ARCH__)
+    if (::isfinite(s)) {
+      x += 0x1000u;
+    }
+    #else
+    if (std::isfinite(s)) {
+      x += 0x1000u;
+    }
+    #endif
+
+    return tfloat32_t::bitcast(x);
+  }
+
+  tfloat32_t() = default;
+
+  /// Floating-point conversion via round_half_ulp_truncate (NOTE(review): adding half a ULP is not ties-to-even - confirm intent)
+  CUTLASS_HOST_DEVICE
+  explicit tfloat32_t(float x): storage(round_half_ulp_truncate(x).raw()) { }
+
+  // Conversion from double (this rounds twice)
+  CUTLASS_HOST_DEVICE
+  explicit tfloat32_t(double x): tfloat32_t(float(x)) { }
+
+  /// Integer conversion - stores the float bits of x without the half-ULP step, so reads round toward zero
+  CUTLASS_HOST_DEVICE
+  explicit tfloat32_t(int x) {
+    float flt = static_cast<float>(x);
+    #if defined(__CUDA_ARCH__)
+    storage = reinterpret_cast<uint32_t const &>(flt);
+    #else
+    std::memcpy(&storage, &flt, sizeof(storage));
+    #endif
+  }
+
+  // Conversion to float
+  CUTLASS_HOST_DEVICE
+  operator float() const {
+
+    // Conversions to IEEE single-precision requires clearing dont-care bits
+    // of the mantissa.
+    unsigned bits = (storage & ~0x1fffu);
+
+    #if defined(__CUDA_ARCH__)
+    return reinterpret_cast<float const &>(bits);
+    #else
+    float flt;
+    std::memcpy(&flt, &bits, sizeof(flt));
+    return flt;
+    #endif
+  }
+
+  /// Converts to double
+  CUTLASS_HOST_DEVICE
+  explicit operator double() const {
+    return double(float(*this));
+  }
+
+  /// Converts to int
+  CUTLASS_HOST_DEVICE
+  explicit operator int() const {
+    return int(float(*this));
+  }
+
+  /// Casts to bool
+  CUTLASS_HOST_DEVICE
+  explicit operator bool() const {
+    return (float(*this) != 0.0f);
+  }
+
+  /// Obtains raw bits
+  CUTLASS_HOST_DEVICE
+  uint32_t raw() const {
+    return storage;
+  }
+
+  /// Returns the sign bit
+  CUTLASS_HOST_DEVICE
+  bool signbit() const {
+    return ((raw() & 0x80000000) != 0);
+  }
+
+  /// Returns the biased exponent
+  CUTLASS_HOST_DEVICE
+  int exponent_biased() const {
+    return int((raw() >> 23) & 0x0ff);
+  }
+
+  /// Returns the unbiased exponent
+  CUTLASS_HOST_DEVICE
+  int exponent() const {
+    return exponent_biased() - 127;
+  }
+
+  /// Returns the low 23 bits of the raw storage, including the bits that operator float() masks off
+  CUTLASS_HOST_DEVICE
+  int mantissa() const {
+    return int(raw() & 0x7fffff);
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// True when bit 31 (the sign bit) of h is set.
+CUTLASS_HOST_DEVICE bool signbit(cutlass::tfloat32_t const& h) {
+  return (h.raw() & 0x80000000) != 0;
+}
+
+/// Magnitude of h, obtained by clearing the sign bit.
+CUTLASS_HOST_DEVICE cutlass::tfloat32_t abs(cutlass::tfloat32_t const& h) {
+  return cutlass::tfloat32_t::bitcast(h.raw() & ~0x80000000u);
+}
+
+/// True when the exponent field is all ones and the mantissa is non-zero.
+CUTLASS_HOST_DEVICE bool isnan(cutlass::tfloat32_t const& h) {
+  return h.exponent_biased() == 0x0ff && h.mantissa() != 0;
+}
+
+/// True unless the exponent field is all ones.
+CUTLASS_HOST_DEVICE bool isfinite(cutlass::tfloat32_t const& h) {
+  return !(h.exponent_biased() == 0x0ff);
+}
+
+/// Returns the NVIDIA canonical NaN bit pattern; the argument is ignored.
+CUTLASS_HOST_DEVICE cutlass::tfloat32_t nan_tf32(const char*) {
+  uint32_t const canonical_nan_bits = 0x7fffffff;
+  return cutlass::tfloat32_t::bitcast(canonical_nan_bits);
+}
+
+/// True when the exponent field is all ones and the mantissa is zero.
+CUTLASS_HOST_DEVICE bool isinf(cutlass::tfloat32_t const& h) {
+  return h.exponent_biased() == 0x0ff && h.mantissa() == 0;
+}
+
+/// True when the exponent field is neither zero nor all ones.
+CUTLASS_HOST_DEVICE bool isnormal(cutlass::tfloat32_t const& h) {
+  return h.exponent_biased() != 0 && h.exponent_biased() != 0x0ff;
+}
+
+/// Classifies h from its biased exponent and mantissa bits.
+///
+/// Returns one of FP_NAN, FP_INFINITE, FP_SUBNORMAL, FP_ZERO or FP_NORMAL.
+CUTLASS_HOST_DEVICE int fpclassify(cutlass::tfloat32_t const& h) {
+  int const biased_exponent = h.exponent_biased();
+  bool const mantissa_set = (h.mantissa() != 0);
+
+  // Exponent field all ones: NaN when any mantissa bit is set, otherwise
+  // infinity.
+  if (biased_exponent == 0x0ff) {
+    return mantissa_set ? FP_NAN : FP_INFINITE;
+  }
+
+  // Exponent field zero: subnormal when any mantissa bit is set, otherwise
+  // zero.
+  if (biased_exponent == 0) {
+    return mantissa_set ? FP_SUBNORMAL : FP_ZERO;
+  }
+
+  // Every remaining exponent value is classified as normal.
+  return FP_NORMAL;
+}
+
+/// Square root evaluated on the float value of h, converted back to tfloat32_t.
+CUTLASS_HOST_DEVICE cutlass::tfloat32_t sqrt(cutlass::tfloat32_t const& h) {
+#if !defined(__CUDACC_RTC__)
+  return cutlass::tfloat32_t(std::sqrt(static_cast<float>(h)));
+#else
+  return cutlass::tfloat32_t(sqrtf(static_cast<float>(h)));
+#endif
+}
+
+/// Returns a value carrying the magnitude bits of a and the sign bit of b.
+CUTLASS_HOST_DEVICE tfloat32_t copysign(tfloat32_t const& a, tfloat32_t const& b) {
+  uint32_t const sign_mask = 0x80000000;
+  uint32_t const magnitude_bits = a.raw() & ~sign_mask;
+  uint32_t const sign_bit = b.raw() & sign_mask;
+
+  // The two fields are disjoint, so OR-ing them yields the combined pattern.
+  return tfloat32_t::bitcast(magnitude_bits | sign_bit);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Standard Library operations and definitions
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace std {
+
+#if !defined(__CUDACC_RTC__)
+/// Numeric limits for cutlass::tfloat32_t; every accessor returns a fixed bit pattern or constant
+template <>
+struct numeric_limits<cutlass::tfloat32_t> {
+  static bool const is_specialized = true;
+  static bool const is_signed = true;
+  static bool const is_integer = false;
+  static bool const is_exact = false;
+  static bool const has_infinity = true;
+  static bool const has_quiet_NaN = true;
+  static bool const has_signaling_NaN = false;
+  static std::float_denorm_style const has_denorm = std::denorm_present;
+  static bool const has_denorm_loss = true;
+  static std::float_round_style const round_style = std::round_to_nearest;
+  static bool const is_iec559 = false;
+  static bool const is_bounded = true;
+  static bool const is_modulo = false;
+  static int const digits = 19;
+
+  /// Returns bit pattern 0x01, the same pattern as denorm_min() (NOTE(review): confirm this is the intended min())
+  static cutlass::tfloat32_t min() { return cutlass::tfloat32_t::bitcast(0x01); }
+
+  /// Minimum finite value
+  static cutlass::tfloat32_t lowest() { return cutlass::tfloat32_t::bitcast(0xff7fffff); }
+
+  /// Maximum finite value
+  static cutlass::tfloat32_t max() { return cutlass::tfloat32_t::bitcast(0x7f7fffff); }
+
+  /// Returns bit pattern 0x1000 (NOTE(review): confirm this is the intended machine epsilon)
+  static cutlass::tfloat32_t epsilon() { return cutlass::tfloat32_t::bitcast(0x1000); }
+
+  /// Returns 0.5 as the rounding error
+  static cutlass::tfloat32_t round_error() { return cutlass::tfloat32_t(0.5f); }
+
+  /// Returns positive infinity (bit pattern 0x7f800000)
+  static cutlass::tfloat32_t infinity() { return cutlass::tfloat32_t::bitcast(0x7f800000); }
+
+  /// Returns a NaN with bit pattern 0x7fffffff
+  static cutlass::tfloat32_t quiet_NaN() { return cutlass::tfloat32_t::bitcast(0x7fffffff); }
+
+  /// Returns the same bit pattern as quiet_NaN(); has_signaling_NaN is false
+  static cutlass::tfloat32_t signaling_NaN() { return cutlass::tfloat32_t::bitcast(0x7fffffff); }
+
+  /// Returns bit pattern 0x1
+  static cutlass::tfloat32_t denorm_min() { return cutlass::tfloat32_t::bitcast(0x1); }
+};
+#endif
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace std
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Arithmetic operators
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Equality, evaluated on the float values of both operands.
+CUTLASS_HOST_DEVICE bool operator==(tfloat32_t const& lhs, tfloat32_t const& rhs) {
+  return static_cast<float>(lhs) == static_cast<float>(rhs);
+}
+
+/// Inequality, evaluated on the float values of both operands.
+CUTLASS_HOST_DEVICE bool operator!=(tfloat32_t const& lhs, tfloat32_t const& rhs) {
+  return static_cast<float>(lhs) != static_cast<float>(rhs);
+}
+
+/// Less-than, evaluated on the float values of both operands.
+CUTLASS_HOST_DEVICE bool operator<(tfloat32_t const& lhs, tfloat32_t const& rhs) {
+  return static_cast<float>(lhs) < static_cast<float>(rhs);
+}
+
+/// Less-than-or-equal, evaluated on the float values of both operands.
+CUTLASS_HOST_DEVICE bool operator<=(tfloat32_t const& lhs, tfloat32_t const& rhs) {
+  return static_cast<float>(lhs) <= static_cast<float>(rhs);
+}
+
+/// Greater-than, evaluated on the float values of both operands.
+CUTLASS_HOST_DEVICE bool operator>(tfloat32_t const& lhs, tfloat32_t const& rhs) {
+  return static_cast<float>(lhs) > static_cast<float>(rhs);
+}
+
+/// Greater-than-or-equal, evaluated on the float values of both operands.
+CUTLASS_HOST_DEVICE bool operator>=(tfloat32_t const& lhs, tfloat32_t const& rhs) {
+  return static_cast<float>(lhs) >= static_cast<float>(rhs);
+}
+
+/// Sum computed in float, then converted back to tfloat32_t.
+CUTLASS_HOST_DEVICE tfloat32_t operator+(tfloat32_t const& lhs, tfloat32_t const& rhs) {
+  return tfloat32_t(static_cast<float>(lhs) + static_cast<float>(rhs));
+}
+
+
+/// Negation: flips the sign bit of the raw storage.
+CUTLASS_HOST_DEVICE tfloat32_t operator-(tfloat32_t const& lhs) {
+  return tfloat32_t::bitcast(lhs.raw() ^ 0x80000000);
+}
+
+/// Difference computed in float, then converted back to tfloat32_t.
+CUTLASS_HOST_DEVICE tfloat32_t operator-(tfloat32_t const& lhs, tfloat32_t const& rhs) {
+  return tfloat32_t(static_cast<float>(lhs) - static_cast<float>(rhs));
+}
+
+/// Product computed in float, then converted back to tfloat32_t.
+CUTLASS_HOST_DEVICE tfloat32_t operator*(tfloat32_t const& lhs, tfloat32_t const& rhs) {
+  return tfloat32_t(static_cast<float>(lhs) * static_cast<float>(rhs));
+}
+
+/// Quotient computed in float, then converted back to tfloat32_t.
+CUTLASS_HOST_DEVICE tfloat32_t operator/(tfloat32_t const& lhs, tfloat32_t const& rhs) {
+  return tfloat32_t(static_cast<float>(lhs) / static_cast<float>(rhs));
+}
+
+/// In-place sum: computed in float, stored back into lhs, which is returned.
+CUTLASS_HOST_DEVICE tfloat32_t& operator+=(tfloat32_t & lhs, tfloat32_t const& rhs) {
+  lhs = tfloat32_t(static_cast<float>(lhs) + static_cast<float>(rhs));
+  return lhs;
+}
+
+/// In-place difference: computed in float, stored back into lhs, which is returned.
+CUTLASS_HOST_DEVICE tfloat32_t& operator-=(tfloat32_t & lhs, tfloat32_t const& rhs) {
+  lhs = tfloat32_t(static_cast<float>(lhs) - static_cast<float>(rhs));
+  return lhs;
+}
+
+/// In-place product: computed in float, stored back into lhs, which is returned.
+CUTLASS_HOST_DEVICE tfloat32_t& operator*=(tfloat32_t & lhs, tfloat32_t const& rhs) {
+  lhs = tfloat32_t(static_cast<float>(lhs) * static_cast<float>(rhs));
+  return lhs;
+}
+
+/// In-place quotient: computed in float, stored back into lhs, which is returned.
+CUTLASS_HOST_DEVICE tfloat32_t& operator/=(tfloat32_t & lhs, tfloat32_t const& rhs) {
+  lhs = tfloat32_t(static_cast<float>(lhs) / static_cast<float>(rhs));
+  return lhs;
+}
+
+/// Pre-increment: adds one in float precision, stores the result back in
+/// lhs and returns a reference to lhs.
+CUTLASS_HOST_DEVICE tfloat32_t& operator++(tfloat32_t & lhs) {
+  float const bumped = static_cast<float>(lhs) + 1.0f;
+  lhs = tfloat32_t(bumped);
+  return lhs;
+}
+
+/// Pre-decrement: subtracts one in float precision, stores the result back
+/// in lhs and returns a reference to lhs.
+CUTLASS_HOST_DEVICE tfloat32_t& operator--(tfloat32_t & lhs) {
+  float const lowered = static_cast<float>(lhs) - 1.0f;
+  lhs = tfloat32_t(lowered);
+  return lhs;
+}
+
+/// Post-increment: adds one in float precision, stores the result back in
+/// lhs and returns the value lhs held before the update.
+CUTLASS_HOST_DEVICE tfloat32_t operator++(tfloat32_t & lhs, int) {
+  tfloat32_t const previous = lhs;
+  float const bumped = static_cast<float>(lhs) + 1.0f;
+  lhs = tfloat32_t(bumped);
+  return previous;
+}
+
+/// Post-decrement: subtracts one in float precision, stores the result back
+/// in lhs and returns the value lhs held before the update.
+CUTLASS_HOST_DEVICE tfloat32_t operator--(tfloat32_t & lhs, int) {
+  tfloat32_t const previous = lhs;
+  float const lowered = static_cast<float>(lhs) - 1.0f;
+  lhs = tfloat32_t(lowered);
+  return previous;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+//
+// User-defined literals
+//
+
+/// Literal suffix for floating-point literals (e.g. 1.5_tf32); narrows to float, then converts.
+CUTLASS_HOST_DEVICE cutlass::tfloat32_t operator""_tf32(long double x) {
+  return cutlass::tfloat32_t(float(x));  // no space after "": the spaced form is deprecated
+}
+
+/// Literal suffix for integer literals (e.g. 3_tf32); narrows to int, then uses the int constructor.
+CUTLASS_HOST_DEVICE cutlass::tfloat32_t operator""_tf32(unsigned long long int x) {
+  return cutlass::tfloat32_t(int(x));  // no space after "": the spaced form is deprecated
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/thread/matrix.h b/lightllm-kernel/cutlass/include/cutlass/thread/matrix.h
new file mode 100755
index 000000000..f6b4b2b79
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/thread/matrix.h
@@ -0,0 +1,198 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines a matrix object intended for storing data in registers and operations within
+      a CUDA thread.
+*/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/matrix_coord.h"
+
+namespace cutlass {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Per-thread matrix object storing a packed matrix of Rows x Columns elements in an Array
+template <
+  typename Element_,
+  int Rows,
+  int Columns,
+  typename Layout_ = layout::RowMajor
+>
+class Matrix : public Array<Element_, Rows * Columns> {
+public:
+
+  // Verify layout refers to a rank=2 matrix.
+  static_assert(
+    Layout_::kRank == 2,
+    "Layout type must refer to a rank=2 matrix");
+
+  /// Base type
+  using Base = Array<Element_, Rows * Columns>;
+
+  /// Element type (template parameters carry a trailing underscore so these aliases can name them)
+  using Element = Element_;
+
+  /// Number of rows
+  static int const kRows = Rows;
+
+  /// Number of columns
+  static int const kColumns = Columns;
+
+  /// Layout within the array
+  using Layout = Layout_;
+
+  /// Reference type to an element
+  using Reference = Element &;
+
+  /// Logical rank of tensor index space
+  static int const kRank = 2;
+
+  /// Index type
+  using Index = typename Layout::Index;
+
+  /// Long index used for pointer offsets
+  using LongIndex = typename Layout::LongIndex;
+
+  /// Coordinate in logical tensor space
+  using TensorCoord = typename Layout::TensorCoord;
+
+  /// Stride type
+  using Stride = typename Layout::Stride;
+
+  /// TensorRef to matrix object (NOTE(review): three template arguments are passed - confirm against the TensorRef declaration)
+  using TensorRef = TensorRef<Element, kRank, Layout>;
+
+  /// TensorRef to constant matrix object
+  using ConstTensorRef = typename TensorRef::ConstTensorRef;
+
+  /// TensorView to matrix object (NOTE(review): three template arguments are passed - confirm against the TensorView declaration)
+  using TensorView = TensorView<Element, kRank, Layout>;
+
+  /// TensorView to constant matrix object
+  using ConstTensorView = typename TensorView::ConstTensorView;
+
+  /// Diagonal vector (NOTE(review): Vector is not declared by the headers included in this file - confirm where it comes from)
+  using Diagonal = Vector<Element, __NV_STD_MIN(kRows, kColumns)>;
+
+private:
+
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Returns the size of the object
+  CUTLASS_HOST_DEVICE
+  static MatrixCoord extent() {
+    return make_Coord(kRows, kColumns);
+  }
+
+  /// Returns the layout object
+  CUTLASS_HOST_DEVICE
+  static Layout layout() {
+    return Layout::packed(extent());
+  }
+
+  /// Ctor
+  CUTLASS_HOST_DEVICE
+  Matrix() { }
+
+  /// Ctor taking a diagonal; the body is empty, so diag is not used
+  CUTLASS_HOST_DEVICE
+  Matrix(Diagonal const &diag) {
+  }
+
+  /// Returns a TensorRef pointing to the first element of the tensor.
+  CUTLASS_HOST_DEVICE
+  TensorRef ref() {
+    return TensorRef(this->data(), layout());
+  }
+
+  /// Returns a ConstTensorRef pointing to the first element of the tensor.
+  CUTLASS_HOST_DEVICE
+  ConstTensorRef const_ref() const {
+    return ConstTensorRef(this->data(), layout());
+  }
+
+  /// Returns a TensorView over ref() with extent().
+  CUTLASS_HOST_DEVICE
+  TensorView view() {
+    return TensorView(ref(), extent());
+  }
+
+  /// Returns a TensorView to const data
+  CUTLASS_HOST_DEVICE
+  ConstTensorView const_view() const {
+    return ConstTensorView(const_ref(), extent());
+  }
+
+  /// Returns a reference to the element at a given Coord (NOTE(review): const member returning non-const Reference - confirm Base::at allows it)
+  CUTLASS_HOST_DEVICE
+  Reference at(MatrixCoord const& coord) const {
+    typename Base::size_type offset_(layout().offset(coord));
+    return Base::at(offset_);
+  }
+
+  /// Returns the number of scalar elements needed to store tensor.
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity() const {
+    return LongIndex(Base::size());
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Column vector: a Matrix with Rows rows and exactly one column.
+///
+/// Element is the stored scalar type; Layout defaults to
+/// layout::ColumnMajor.
+///
+template <typename Element, int Rows, typename Layout = layout::ColumnMajor>
+using ColumnVector = Matrix<Element, Rows, 1, Layout>;
+
+/// Row vector: a Matrix with exactly one row and Columns columns.
+///
+/// Element is the stored scalar type; Layout defaults to
+/// layout::RowMajor.
+///
+template <typename Element, int Columns, typename Layout = layout::RowMajor>
+using RowVector = Matrix<Element, 1, Columns, Layout>;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/trace.h b/lightllm-kernel/cutlass/include/cutlass/trace.h
new file mode 100755
index 000000000..1b0c51126
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/trace.h
@@ -0,0 +1,59 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Helpers for optionally tracing through code when debugging.
+
+    This file is to be included after all other headers.
+*/
+
+#pragma once
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Tracing options: CUTLASS_DEBUG_TRACE_LEVEL defaults to 0 unless it was defined before this point
+#ifndef CUTLASS_DEBUG_TRACE_LEVEL
+#define CUTLASS_DEBUG_TRACE_LEVEL 0
+#endif
+// CUTLASS_TRACE_HOST(x): with a non-zero level, host code streams "file:line  x" to std::cout; otherwise it expands to nothing
+#if CUTLASS_DEBUG_TRACE_LEVEL
+#include <iostream>
+#include "cutlass/core_io.h"
+#if defined(__CUDA_ARCH__)
+#define CUTLASS_TRACE_HOST(x)
+#else
+#define CUTLASS_TRACE_HOST(x) { std::cout << __FILE__ << ":" << __LINE__ << "  " << x << std::endl; }
+#endif
+#else
+#define CUTLASS_TRACE_HOST(x)
+#endif
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/collective/sm90_wgmma_transpose.hpp b/lightllm-kernel/cutlass/include/cutlass/transform/collective/sm90_wgmma_transpose.hpp
new file mode 100755
index 000000000..430545e6d
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/transform/collective/sm90_wgmma_transpose.hpp
@@ -0,0 +1,754 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Helpers that transpose operand B in shared memory for SM90 GMMA collectives.
+*/
+
+#pragma once
+
+#include "cute/arch/mma_sm90_gmma.hpp"
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace transform {
+namespace collective {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+using namespace cute;
+
+/// Maps an MN-major GMMA shared-memory layout atom to the K-major atom with
+/// the same swizzle when Transpose is set; otherwise returns SmemLayoutAtom.
+template <bool Transpose, class SmemLayoutAtom, class ElementType>
+constexpr auto
+gmma_smem_transpose_or_passthrough() {
+  if constexpr (!Transpose) {
+    return SmemLayoutAtom{};
+  }
+  else if constexpr (cute::is_same_v<SmemLayoutAtom, GMMA::Layout_MN_SW128_Atom<ElementType>>) {
+    return GMMA::Layout_K_SW128_Atom<ElementType>{};
+  }
+  else if constexpr (cute::is_same_v<SmemLayoutAtom, GMMA::Layout_MN_SW64_Atom<ElementType>>) {
+    return GMMA::Layout_K_SW64_Atom<ElementType>{};
+  }
+  else if constexpr (cute::is_same_v<SmemLayoutAtom, GMMA::Layout_MN_SW32_Atom<ElementType>>) {
+    return GMMA::Layout_K_SW32_Atom<ElementType>{};
+  }
+  else if constexpr (cute::is_same_v<SmemLayoutAtom, GMMA::Layout_MN_INTER_Atom<ElementType>>) {
+    return GMMA::Layout_K_INTER_Atom<ElementType>{};
+  }
+  else {
+    static_assert(cutlass::detail::dependent_false<SmemLayoutAtom>, "Unsupported Layout_SW_Atom for B SMEM transposition");
+  }
+}
+
+/// Compile-time predicate on the (atom, element type) pair of operand B.
+///
+/// Only 1-byte and 4-byte element types are accepted; any other size fails
+/// the static_assert below.
+template <class SmemCopyAtom, class ElementType>
+constexpr auto
+use_universal_transposition() {
+  if constexpr (sizeof(ElementType) == 4) {
+    // Only universal transposition can handle SW64 and Non swizzle SMEM layout
+    return cute::is_same_v<GMMA::Layout_MN_SW64_Atom<ElementType>, SmemCopyAtom> ||
+           cute::is_same_v<GMMA::Layout_MN_INTER_Atom<ElementType>, SmemCopyAtom>;
+  }
+  else if constexpr (sizeof(ElementType) == 1) {
+    // One-byte elements: true for every atom except the MN SW128 atom.
+    return !cute::is_same_v<GMMA::Layout_MN_SW128_Atom<ElementType>, SmemCopyAtom>;
+  }
+  else {
+    static_assert(cutlass::detail::dependent_false<ElementType>, "Unsupported ElementType for B SMEM transposition");
+  }
+}
+
+template<
+  class TiledMma_,
+  class SmemLayoutB_,
+  class SmemLayoutAtomB_,
+  class ElementB_>
+class NoTranspositionOperandB {  // every member function below has an empty body
+public:
+  using TiledMma = TiledMma_;
+  using SmemLayoutB = SmemLayoutB_;
+  using SmemLayoutAtomB = SmemLayoutAtomB_;
+  using ElementB = ElementB_;
+  // Constructor: accepts six unnamed arguments and ignores all of them.
+  constexpr CUTLASS_HOST_DEVICE
+  NoTranspositionOperandB(
+      int,
+      int,
+      TiledMma,
+      SmemLayoutB,
+      SmemLayoutAtomB,
+      ElementB) { }
+  // Call operator: takes two tensors and two ints, does nothing.
+  template <
+    class TensorSmemB,
+    class TensorTransposedSmemB>
+  CUTLASS_DEVICE void operator()(
+    TensorSmemB const&,
+    TensorTransposedSmemB const&,
+    int, int) { }
+  // Both synchronize overloads are no-ops.
+  CUTLASS_DEVICE void synchronize(int) { }
+
+  CUTLASS_DEVICE void synchronize() { }
+  // transpose: takes two tensors and one int, does nothing.
+  template <
+    class TensorSmemB,
+    class TensorTransposedSmemB>
+  CUTLASS_DEVICE void transpose(
+    TensorSmemB const&,
+    TensorTransposedSmemB const&,
+    int) { }
+};
+
+template<
+  class TiledMma_,
+  class SmemLayoutB_,
+  class SmemLayoutAtomB_,
+  class ElementB_>
+class UniversalTranspositionOperandB {
+public:
+  using TiledMma = TiledMma_;
+  using SmemLayoutB = SmemLayoutB_;
+  using SmemLayoutAtomB = SmemLayoutAtomB_;
+  using ElementB = ElementB_;
+  
+  constexpr CUTLASS_HOST_DEVICE 
+  UniversalTranspositionOperandB(
+      int warp_idx_,
+      int warp_group_thread_idx_,
+      TiledMma,
+      SmemLayoutB,
+      SmemLayoutAtomB,
+      ElementB)
+      : warp_idx(warp_idx_)
+      , warp_group_thread_idx(warp_group_thread_idx_) { }
+
+  template <
+    class TensorSmemB,
+    class TensorTransposedSmemB>
+  CUTLASS_DEVICE void operator()(
+    TensorSmemB const& sB,
+    TensorTransposedSmemB const& gmma_sB,
+    int read_stage, int current_step) {
+      if (current_step > 0) {
+        return;
+      }
+
+      constexpr int NumMathWarpGroup = CUTE_STATIC_V(size(TiledMma{})) / NumThreadsPerWarpGroup;
+      static_assert(NumMathWarpGroup == 1 ||
+                    (!detail::use_universal_transposition<SmemLayoutAtomB, ElementB>() && NumMathWarpGroup == 2),
+                    "Wrong math warp group number for TransposeB");
+      constexpr int WarpgroupTileSize = size<1>(SmemLayoutB{});  // A warp group tile would process entire Smem K.
+
+      constexpr int BytesPerSmemSwizzleUnit = 16;
+      constexpr int WarpThreadShapeN = BytesPerSmemSwizzleUnit / sizeof(ElementB);
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      /// Universal transposition, need warp_group sync between load and store.
+      /// The number of reg used depends on the input elementB.
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      /*
+          In one copy step, a warp group would load WarpgroupTileSize * WarpgroupTileSize tile then store to transposed location.
+          In warp_group_tile, each warp holds Four WarpTileSize x WarpTileSize elements:
+                    K
+              ------------
+            | W0 W1 W2 W3  ---
+            | W0 W1 W2 W3    |
+            | W0 W1 W2 W3    | --> Copy Step 0
+            | W0 W1 W2 W3  ---
+                  ....
+            | W0 W1 W2 W3  ---
+            | W0 W1 W2 W3    |
+            | W0 W1 W2 W3    | --> Copy Step n
+            | W0 W1 W2 W3  ---
+      */
+      static_assert((NumThreadsPerWarpGroup % WarpThreadShapeN == 0), "Unsupported warp thread layout.");
+      constexpr auto WarpgroupThreadLayout = make_layout(make_shape(Int<WarpThreadShapeN>{}, Int<NumThreadsPerWarpGroup / WarpThreadShapeN>{}));
+
+      // Get copy tile and partition to each thread
+      auto sB_tiled_copy = make_tiled_copy(
+        Copy_Atom<DefaultCopy, ElementB>{},
+        WarpgroupThreadLayout,                           // thr_layout
+        Layout<_1>{}                                     // val_layout
+      );
+      static_assert(size(sB_tiled_copy) == size(TiledMma{}), "Wrong thread number in TiledCopy.");
+
+      auto sB_thr_copy        = sB_tiled_copy.get_thread_slice(warp_group_thread_idx);
+      Tensor tCsB             = sB_thr_copy.partition_S(     sB(_,_,read_stage)); // (CPY, CPY_N, CPY_K)
+      Tensor tCsB_transposed  = sB_thr_copy.partition_D(gmma_sB(_,_,read_stage)); // (CPY, CPY_N, CPY_K)
+
+      // Divide partitioned tile to limit register usage
+      constexpr int  CopySteps      = size<0>(SmemLayoutB{}) / WarpgroupTileSize;
+      constexpr auto CopyTileShape  = make_shape(size<0>(tCsB), Int< size<1>(tCsB) / CopySteps >{}, size<2>(tCsB));
+      static_assert(size<1>(tCsB) % CopySteps == 0, "CopySteps must evenly divide rank 1 size of partitioned SMEM.");
+
+      Tensor tCsB_copy_tile            = zipped_divide(tCsB, CopyTileShape);
+      Tensor tCsB_copy_tile_transposed = zipped_divide(tCsB_transposed, CopyTileShape);
+      auto   transpose_fragment        = make_fragment_like(tCsB_copy_tile(_,_0{}));
+
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (int step = 0; step < CopySteps; ++step) {
+        copy(sB_tiled_copy, tCsB_copy_tile(_,step), transpose_fragment);
+
+        // Make sure all elements are read before being overwritten
+        __syncthreads();
+
+        copy(sB_tiled_copy, transpose_fragment, tCsB_copy_tile_transposed(_,step));
+      }
+  }
+
+  CUTLASS_DEVICE void synchronize(int step) {
+    // Per-step variant: the fence + barrier are issued on step 0 only; any other step returns at once.
+    if (step != 0) { return; }
+    // SMEM fence, then the transpose named barrier, so B is transposed before math.
+    cutlass::arch::fence_view_async_shared();
+    cutlass::arch::NamedBarrier::sync(size(TiledMma{}), cutlass::arch::ReservedNamedBarriers::TransposeBarrier);
+  }
+
+  CUTLASS_DEVICE void synchronize() {
+    // Unconditional variant: SMEM fence, then a named barrier over size(TiledMma{}) threads, so B is transposed before math
+    cutlass::arch::fence_view_async_shared();
+    cutlass::arch::NamedBarrier::sync(size(TiledMma{}), cutlass::arch::ReservedNamedBarriers::TransposeBarrier);
+  }
+
+  // Convenience entry point: runs the step-0 transposition of the B tile selected by
+  // `read_stage`, then fences and synchronizes via synchronize().
+  template <class TensorSmemB, class TensorTransposedSmemB>
+  CUTLASS_DEVICE void transpose(
+    TensorSmemB const& sB,
+    TensorTransposedSmemB const& gmma_sB,
+    int read_stage) {
+
+    // Invoke the call operator directly instead of spelling out operator().
+    (*this)(sB, gmma_sB, read_stage, 0);
+    synchronize();
+  }
+
+private:
+  const int warp_idx;
+  const int warp_group_thread_idx;
+};
+
+template<
+  class TiledMma_,
+  class SmemLayoutB_,
+  class SmemLayoutAtomB_,
+  class ElementB_>
+class AsyncTranspositionOperandB {
+public:
+  // Functor that copies warp tiles of operand B from the `sB` view into the `gmma_sB` view through per-thread fragments.
+  using TiledMma = TiledMma_;
+  using SmemLayoutB = SmemLayoutB_;
+  using SmemLayoutAtomB = SmemLayoutAtomB_;
+  using ElementB = ElementB_;
+  // Compile-time tiling configuration.
+  static constexpr int Steps             = 2;  // Steps needed to cover one warp_group_tile (see the diagram in operator()).
+  static constexpr int NumMathWarpGroup  = CUTE_STATIC_V(size(TiledMma{})) / NumThreadsPerWarpGroup;
+  static constexpr int StepsPerWarpGroup = Steps / NumMathWarpGroup;
+  static_assert(NumMathWarpGroup <= 2,
+                    "Wrong math warp group number for TransposeB");
+  static constexpr int WarpgroupTileSize = size<1>(SmemLayoutB{});  // A warp group tile would process entire Smem K.
+  static constexpr int NumWarpsPerWarpGroup = NumThreadsPerWarpGroup / NumThreadsPerWarp;
+
+  static constexpr int BytesPerSmemSwizzleUnit = 16;
+  static constexpr int WarpThreadShapeN = BytesPerSmemSwizzleUnit / sizeof(ElementB);
+  static constexpr int WarpThreadShapeK = NumThreadsPerWarp / WarpThreadShapeN;
+  static constexpr int NumWarpTilePerWarpgroupTile = NumWarpsPerWarpGroup * (Steps == 8 ? 2 : 1);
+
+  static constexpr int WarpTileSize                = WarpgroupTileSize / NumWarpTilePerWarpgroupTile;
+  static_assert(WarpTileSize >= WarpThreadShapeN && WarpTileSize >= WarpThreadShapeK, "Invaild warp thread shape." );
+  static constexpr int TilesPerWarp                = 2;                     // Each Warp would process 2 warp_tiles in one step.
+  static constexpr int64_t WarpTileNCoordLUT = 06723763275316420;  // Octal: n coord of warp tile 0, 3 bits per step, 12 bits per warp, warp 0 lowest.
+  static constexpr int64_t WarpTileKCoordLUT = 05410541064206420;  // Octal: k coord of warp tile 0, same packing as the n LUT.
+  static constexpr int NumStepsEncoded       = 4;                             // Only encoding first 4 steps into LUT.
+  static constexpr int MaskPerStep           = 07;                            // Each step is encoded into 3bits,
+  static constexpr int NumBitsPerStep        = 3;
+  static constexpr int MaskPerWarp           = 07777;                         // Each warp has 4 steps(12 bits)
+  static constexpr int NumBitsPerWarp        = 12;
+  // Number of warp_group_tiles
+  static_assert(size<0>(SmemLayoutB{}) % WarpgroupTileSize == 0,
+    "Copy size must evenly divide SMEM tile.");
+  static constexpr int WarpgroupTileNum = size<0>(SmemLayoutB{}) / WarpgroupTileSize;
+
+  static_assert(size<2>(typename TiledMma::AtomShape_MNK{}) <= WarpThreadShapeK,
+      "Need to be able to transpose first k-block in the first step");
+  // Caches this warp's 12-bit slices of the coord LUTs. The unnamed tag parameters only carry types.
+  constexpr CUTLASS_HOST_DEVICE
+  AsyncTranspositionOperandB(
+      int warp_idx_,
+      int warp_group_thread_idx_,
+      TiledMma,
+      SmemLayoutB,
+      SmemLayoutAtomB,
+      ElementB)
+      : warp_idx(warp_idx_)
+      , warp_group_thread_idx(warp_group_thread_idx_)
+      , warp_idx_in_warp_group(warp_idx_ % NumWarpsPerWarpGroup)
+      , current_warp_tile_n_coord_LUT((WarpTileNCoordLUT >> ((warp_idx_
+            % NumWarpsPerWarpGroup) * NumBitsPerWarp)) & MaskPerWarp)
+      , current_warp_tile_k_coord_LUT((WarpTileKCoordLUT >> ((warp_idx_
+            % NumWarpsPerWarpGroup) * NumBitsPerWarp)) & MaskPerWarp) { }
+  // Runs transposition step `current_step` on the stage selected by `read_stage`.
+  template <
+    class TensorSmemB,
+    class TensorTransposedSmemB>
+  CUTLASS_DEVICE void operator()(
+      TensorSmemB const& sB,
+      TensorTransposedSmemB const& gmma_sB,
+      int read_stage, int current_step)
+  {
+      if (current_step >= StepsPerWarpGroup) {  // Steps beyond this warp group's share are no-ops.
+        return;
+      }
+
+      static constexpr auto WarpThreadLayout           = make_layout(make_shape(Int<WarpThreadShapeN>{}, Int<WarpThreadShapeK>{}));
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      /// A warp group uses 2 steps to transpose the whole WarpgroupTileSize x WarpgroupTileSize.
+      /// In each step, one warp would hold two warp_tiles.
+      ///  Step 0:                Step 1:
+      ///  W0 W1 W2 W3            -- -- -- --
+      ///  W1 W0 -- --            -- -- W3 W2
+      ///  W2 -- -- --            -- W3 W0 W1
+      ///  W3 -- -- --            -- W2 W1 W0
+      ///
+      /////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      ///
+      /// Fully static coord LUT to avoid extra register use.
+      /// [warp_id][step][warp_tile][n / k]
+      /// Step 0            Step 1         Step 2          Step 3          Step 4          Step 5         Step 6           Step 7
+      /// {{{0,0}, {1,1}}, {{2,2}, {3,3}}, {{4,4}, {5,5}}, {{6,6}, {7,7}}, {{4,0}, {0,4}}, {{4,1}, {1,4}}, {{4,2}, {2,4}}, {{4,3}, {3,4}}}, // W0
+      /// {{{1,0}, {0,1}}, {{3,2}, {2,3}}, {{5,4}, {4,5}}, {{7,6}, {6,7}}, {{5,0}, {0,5}}, {{5,1}, {1,5}}, {{5,2}, {2,5}}, {{5,3}, {3,5}}}, // W1
+      /// {{{2,0}, {0,2}}, {{3,1}, {1,3}}, {{6,4}, {4,6}}, {{7,5}, {5,7}}, {{6,0}, {0,6}}, {{6,1}, {1,6}}, {{6,2}, {2,6}}, {{6,3}, {3,6}}}, // W2
+      /// {{{3,0}, {0,3}}, {{2,1}, {1,2}}, {{7,4}, {4,7}}, {{6,5}, {5,6}}, {{7,0}, {0,7}}, {{7,1}, {1,7}}, {{7,2}, {2,7}}, {{7,3}, {3,7}}}, // W3
+      ///
+      /// Encoding the coord of warp tile0 into two int64_t values.
+      /// Only encoding Step 0 ~ Step 3 (NumStepsEncoded == 4), since Step 4 ~ Step 7 have a straightforward pattern.
+      /// Only encoding warp tile0, since the coords of warp tile1 could be easily deduced from warp tile0.
+      /// The 2-step transposition and the 8-step transposition share the same encoding.
+      ///
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+      // Divide entire SMEM to multiple warp_tiles
+      constexpr auto WarpTileShape = make_shape(Int<WarpTileSize>(), Int<WarpTileSize>());
+      Tensor s_tile                = zipped_divide(     sB(_,_,read_stage), WarpTileShape);
+      Tensor s_tile_transposed     = zipped_divide(gmma_sB(_,_,read_stage), WarpTileShape);
+
+      // Get copy tile
+      auto sB_tiled_copy = make_tiled_copy(
+        Copy_Atom<DefaultCopy, ElementB>{},
+        WarpThreadLayout,     // thr_layout
+        Layout<_1>{}          // val_layout
+      );
+
+      static_assert(size(sB_tiled_copy) * NumWarpsPerWarpGroup == size(TiledMma{}) / NumMathWarpGroup, "Wrong thread number in TiledCopy.");
+      auto sB_thr_copy = sB_tiled_copy.get_thread_slice(warp_group_thread_idx % NumThreadsPerWarp);  // slice based on lane_idx
+
+      // Construct fragments for transposition
+      Tensor tmp_tCsB = sB_thr_copy.partition_S(flatten(s_tile(_, make_coord(_0{}, _0{}))));
+      decltype(make_fragment_like(tmp_tCsB)) transpose_fragments[TilesPerWarp] = {
+        make_fragment_like(tmp_tCsB),
+        make_fragment_like(tmp_tCsB)
+      };
+
+      [[maybe_unused]] int step = current_step * NumMathWarpGroup;  // Step index counted across all math warp groups.
+      if constexpr (NumMathWarpGroup == 2) {
+        // For 2 math warpgroups, warp idx 4~7 is the 1st warp group and 8~11 is the 2nd, so warp_idx / 8 selects the 2nd warpgroup's step offset.
+        step += warp_idx / (NumWarpsPerWarpGroup * 2);
+      }
+
+      int tmp_warp_tile_n_coord_LUT = current_warp_tile_n_coord_LUT >> (NumBitsPerStep * current_step);
+      int tmp_warp_tile_k_coord_LUT = current_warp_tile_k_coord_LUT >> (NumBitsPerStep * current_step);
+
+      if constexpr (NumMathWarpGroup == 2) {
+        tmp_warp_tile_n_coord_LUT >>= NumBitsPerStep * (warp_idx / (NumWarpsPerWarpGroup * 2));
+        tmp_warp_tile_k_coord_LUT >>= NumBitsPerStep * (warp_idx / (NumWarpsPerWarpGroup * 2));
+      }
+
+      // Decode the coords of warp tile 0 for this step.
+      int warp_tile0_n, warp_tile0_k;
+      if constexpr (StepsPerWarpGroup <= NumStepsEncoded) {
+        warp_tile0_n = tmp_warp_tile_n_coord_LUT & MaskPerStep;
+        warp_tile0_k = tmp_warp_tile_k_coord_LUT & MaskPerStep;
+      } else {
+        warp_tile0_n = step < NumStepsEncoded ? (tmp_warp_tile_n_coord_LUT & MaskPerStep) : 4 + warp_idx_in_warp_group;
+        warp_tile0_k = step < NumStepsEncoded ? (tmp_warp_tile_k_coord_LUT & MaskPerStep) : step - 4;
+      }
+      // Warp tile 1: the next diagonal tile if tile 0 is on the diagonal, otherwise tile 0's mirror (n and k swapped).
+      int warp_tile1_n = warp_tile0_n == warp_tile0_k ? warp_tile0_n + 1 : warp_tile0_k;
+      int warp_tile1_k = warp_tile0_n == warp_tile0_k ? warp_tile0_k + 1 : warp_tile0_n;
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_group_tile = 0; warp_group_tile < WarpgroupTileNum; ++warp_group_tile) {
+
+        static_assert(TilesPerWarp == 2);
+
+        // [warp_tile][n/k]
+        const int warp_tile_coord[TilesPerWarp][2] = {
+          // n                                                           k
+          {warp_group_tile * NumWarpTilePerWarpgroupTile + warp_tile0_n, warp_tile0_k}, // warp_tile 0
+          {warp_group_tile * NumWarpTilePerWarpgroupTile + warp_tile1_n, warp_tile1_k}  // warp_tile 1
+        };
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int warp_tile = 0; warp_tile < TilesPerWarp; ++warp_tile) {
+          Tensor tCsB = sB_thr_copy.partition_S(
+            flatten(s_tile(_, make_coord(warp_tile_coord[warp_tile][0], warp_tile_coord[warp_tile][1])))
+          ); // (CPY, CPY_N, CPY_K)
+
+          copy(sB_tiled_copy, tCsB, transpose_fragments[warp_tile]);
+        }
+
+        // Make sure elements in two 8x8 warp tiles are all consumed
+        __syncwarp();
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int warp_tile = 0; warp_tile < TilesPerWarp; ++warp_tile) {
+          Tensor tCsB_transposed = sB_thr_copy.partition_D(
+            flatten(s_tile_transposed(_, make_coord(warp_tile_coord[warp_tile][0], warp_tile_coord[warp_tile][1])))
+          ); // (CPY, CPY_N, CPY_K)
+          copy(sB_tiled_copy, transpose_fragments[warp_tile], tCsB_transposed);
+        }
+
+      } // loop warp_group_tile
+  }
+
+  CUTLASS_DEVICE void synchronize(int step) {
+    if (step < StepsPerWarpGroup) {
+      // SMEM fence to make sure B is transposed before math
+      cutlass::arch::fence_view_async_shared();
+      cutlass::arch::NamedBarrier::sync(size(TiledMma{}), cutlass::arch::ReservedNamedBarriers::TransposeBarrier);
+    }
+  }
+
+  CUTLASS_DEVICE void synchronize() {  // Unconditional fence + named barrier over size(TiledMma{}) threads.
+    cutlass::arch::fence_view_async_shared();
+    cutlass::arch::NamedBarrier::sync(size(TiledMma{}), cutlass::arch::ReservedNamedBarriers::TransposeBarrier);
+  }
+
+  template <
+    class TensorSmemB,
+    class TensorTransposedSmemB>
+  CUTLASS_DEVICE void transpose(
+    TensorSmemB const& sB,
+    TensorTransposedSmemB const& gmma_sB,
+    int read_stage) {
+    // Run every step assigned to this warp group, then fence + barrier once at the end.
+    CUTLASS_PRAGMA_UNROLL
+    for(int i = 0; i < StepsPerWarpGroup; ++i) {
+      this->operator()(sB, gmma_sB, read_stage, i);
+    }
+    synchronize();
+
+  }
+private:
+  const int warp_idx;
+  const int warp_group_thread_idx;
+  const int warp_idx_in_warp_group;         // warp_idx % NumWarpsPerWarpGroup
+  const int current_warp_tile_n_coord_LUT;  // This warp's 12-bit slice of WarpTileNCoordLUT (4 steps x 3 bits).
+  const int current_warp_tile_k_coord_LUT;  // This warp's 12-bit slice of WarpTileKCoordLUT (4 steps x 3 bits).
+};
+
+template<
+  class TiledMma_,
+  class SmemLayoutB_,
+  class SmemLayoutAtomB_,
+  class ElementB_>
+class AsyncTranspositionOperandB_1BElementB {
+public:
+  // 1-byte-element variant: a single operator() call (step 0) runs all of this warp group's steps of the 8-step scheme.
+  static_assert(sizeof(ElementB_) == 1);
+
+  using TiledMma = TiledMma_;
+  using SmemLayoutB = SmemLayoutB_;
+  using SmemLayoutAtomB = SmemLayoutAtomB_;
+  using ElementB = ElementB_;
+  // Compile-time tiling configuration.
+  static constexpr int Steps             = 8;  // Steps needed to cover one warp_group_tile (see the diagram in operator()).
+  static constexpr int NumMathWarpGroup  = CUTE_STATIC_V(size(TiledMma{})) / NumThreadsPerWarpGroup;
+  static constexpr int StepsPerWarpGroup = Steps / NumMathWarpGroup;
+  static_assert(NumMathWarpGroup <= 2,
+                    "Wrong math warp group number for TransposeB");
+  static constexpr int WarpgroupTileSize = size<1>(SmemLayoutB{});  // A warp group tile would process entire Smem K.
+  static constexpr int NumWarpsPerWarpGroup = NumThreadsPerWarpGroup / NumThreadsPerWarp;
+
+  static constexpr int BytesPerSmemSwizzleUnit = 16;
+  static constexpr int WarpThreadShapeN = BytesPerSmemSwizzleUnit / sizeof(ElementB);
+  static constexpr int WarpThreadShapeK = NumThreadsPerWarp / WarpThreadShapeN;
+  static constexpr int NumWarpTilePerWarpgroupTile = NumWarpsPerWarpGroup * (Steps == 8 ? 2 : 1);
+
+  static constexpr int WarpTileSize                = WarpgroupTileSize / NumWarpTilePerWarpgroupTile;
+  static_assert(WarpTileSize >= WarpThreadShapeN && WarpTileSize >= WarpThreadShapeK, "Invaild warp thread shape." );
+  static constexpr int TilesPerWarp                = 2;                     // Each Warp would process 2 warp_tiles in one step.
+  static constexpr int64_t WarpTileNCoordLUT = 06723763275316420;  // Octal: n coord of warp tile 0, 3 bits per step, 12 bits per warp, warp 0 lowest.
+  static constexpr int64_t WarpTileKCoordLUT = 05410541064206420;  // Octal: k coord of warp tile 0, same packing as the n LUT.
+  static constexpr int NumStepsEncoded       = 4;                             // Only encoding first 4 steps into LUT.
+  static constexpr int MaskPerStep           = 07;                            // Each step is encoded into 3bits,
+  static constexpr int NumBitsPerStep        = 3;
+  static constexpr int MaskPerWarp           = 07777;                         // Each warp has 4 steps(12 bits)
+  static constexpr int NumBitsPerWarp        = 12;
+  // Number of warp_group_tiles
+  static_assert(size<0>(SmemLayoutB{}) % WarpgroupTileSize == 0,
+    "Copy size must evenly divide SMEM tile.");
+  static constexpr int WarpgroupTileNum = size<0>(SmemLayoutB{}) / WarpgroupTileSize;
+  // Caches this warp's 12-bit slices of the coord LUTs. The unnamed tag parameters only carry types.
+  constexpr CUTLASS_HOST_DEVICE
+  AsyncTranspositionOperandB_1BElementB(
+      int warp_idx_,
+      int warp_group_thread_idx_,
+      TiledMma,
+      SmemLayoutB,
+      SmemLayoutAtomB,
+      ElementB)
+      : warp_idx(warp_idx_)
+      , warp_group_thread_idx(warp_group_thread_idx_)
+      , warp_idx_in_warp_group(warp_idx_ % NumWarpsPerWarpGroup)
+      , current_warp_tile_n_coord_LUT((WarpTileNCoordLUT >> ((warp_idx_
+            % NumWarpsPerWarpGroup) * NumBitsPerWarp)) & MaskPerWarp)
+      , current_warp_tile_k_coord_LUT((WarpTileKCoordLUT >> ((warp_idx_
+            % NumWarpsPerWarpGroup) * NumBitsPerWarp)) & MaskPerWarp) { }
+  // Transposes the stage selected by `read_stage`; only the call with current_step == 0 does any work.
+  template <
+    class TensorSmemB,
+    class TensorTransposedSmemB>
+  CUTLASS_DEVICE void operator()(
+      TensorSmemB const& sB,
+      TensorTransposedSmemB const& gmma_sB,
+      int read_stage, int current_step)
+  {
+    if (current_step > 0) {  // All of this warp group's steps are executed by the step-0 call below.
+      return;
+    }
+
+    constexpr auto WarpThreadLayout           = make_layout(make_shape(Int<WarpThreadShapeN>{}, Int<WarpThreadShapeK>{}));
+    //////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    /// A warp group uses 8 steps to transpose the whole WarpgroupTileSize x WarpgroupTileSize.
+    ///  Divide a warp_group_tile into 8x8 warp_tiles to further reduce the reg usage.
+    ///  Step 0:                   Step 1:                   Step 2:                   Step 3:
+    ///  W0 W1 W2 W3 -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --
+    ///  W1 W0 -- -- -- -- -- --   -- -- W3 W2 -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --
+    ///  W2 -- -- -- -- -- -- --   -- W3 W0 W1 -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --
+    ///  W3 -- -- -- -- -- -- --   -- W2 W1 W0 -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --
+    ///  -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- W0 W1 W2 W3   -- -- -- -- -- -- -- --
+    ///  -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- W1 W0 -- --   -- -- -- -- -- -- W3 W2
+    ///  -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- W2 -- -- --   -- -- -- -- -- W3 W0 W1
+    ///  -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- W3 -- -- --   -- -- -- -- -- W2 W1 W0
+    ///
+    ///  Step 4:                   Step 5:                   Step 6:                   Step 7:
+    ///  -- -- -- -- W0 W1 W2 W3   -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --
+    ///  -- -- -- -- -- -- -- --   -- -- -- -- W0 W1 W2 W3   -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --
+    ///  -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- W0 W1 W2 W3   -- -- -- -- -- -- -- --
+    ///  -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- W0 W1 W2 W3
+    ///  W0 -- -- -- -- -- -- --   -- W0 -- -- -- -- -- --   -- -- W0 -- -- -- -- --   -- -- -- W0 -- -- -- --
+    ///  W1 -- -- -- -- -- -- --   -- W1 -- -- -- -- -- --   -- -- W1 -- -- -- -- --   -- -- -- W1 -- -- -- --
+    ///  W2 -- -- -- -- -- -- --   -- W2 -- -- -- -- -- --   -- -- W2 -- -- -- -- --   -- -- -- W2 -- -- -- --
+    ///  W3 -- -- -- -- -- -- --   -- W3 -- -- -- -- -- --   -- -- W3 -- -- -- -- --   -- -- -- W3 -- -- -- --
+    ///
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ///
+    /// Fully static coord LUT to avoid extra register use.
+    /// [warp_id][step][warp_tile][n / k]
+    /// Step 0            Step 1         Step 2          Step 3          Step 4          Step 5         Step 6           Step 7
+    /// {{{0,0}, {1,1}}, {{2,2}, {3,3}}, {{4,4}, {5,5}}, {{6,6}, {7,7}}, {{4,0}, {0,4}}, {{4,1}, {1,4}}, {{4,2}, {2,4}}, {{4,3}, {3,4}}}, // W0
+    /// {{{1,0}, {0,1}}, {{3,2}, {2,3}}, {{5,4}, {4,5}}, {{7,6}, {6,7}}, {{5,0}, {0,5}}, {{5,1}, {1,5}}, {{5,2}, {2,5}}, {{5,3}, {3,5}}}, // W1
+    /// {{{2,0}, {0,2}}, {{3,1}, {1,3}}, {{6,4}, {4,6}}, {{7,5}, {5,7}}, {{6,0}, {0,6}}, {{6,1}, {1,6}}, {{6,2}, {2,6}}, {{6,3}, {3,6}}}, // W2
+    /// {{{3,0}, {0,3}}, {{2,1}, {1,2}}, {{7,4}, {4,7}}, {{6,5}, {5,6}}, {{7,0}, {0,7}}, {{7,1}, {1,7}}, {{7,2}, {2,7}}, {{7,3}, {3,7}}}, // W3
+    ///
+    /// Encoding the coord of warp tile0 into two int64_t values.
+    /// Only encoding Step 0 ~ Step 3 (NumStepsEncoded == 4), since Step 4 ~ Step 7 have a straightforward pattern.
+    /// Only encoding warp tile0, since the coords of warp tile1 could be easily deduced from warp tile0.
+    /// The 2-step transposition and the 8-step transposition share the same encoding.
+    ///
+    //////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    // Divide entire SMEM to multiple warp_tiles
+    constexpr auto WarpTileShape = make_shape(Int<WarpTileSize>(), Int<WarpTileSize>());
+    Tensor s_tile                = zipped_divide(     sB(_,_,read_stage), WarpTileShape);
+    Tensor s_tile_transposed     = zipped_divide(gmma_sB(_,_,read_stage), WarpTileShape);
+
+    // Get copy tile
+    auto sB_tiled_copy = make_tiled_copy(
+      Copy_Atom<DefaultCopy, ElementB>{},
+      WarpThreadLayout,     // thr_layout
+      Layout<_1>{}          // val_layout
+    );
+    static_assert(size(sB_tiled_copy) * NumWarpsPerWarpGroup == size(TiledMma{}) / NumMathWarpGroup, "Wrong thread number in TiledCopy.");
+    auto sB_thr_copy = sB_tiled_copy.get_thread_slice(warp_group_thread_idx % NumThreadsPerWarp);  // slice based on lane_idx
+
+    // Construct fragments for transposition
+    Tensor tmp_tCsB = sB_thr_copy.partition_S(flatten(s_tile(_, make_coord(_0{}, _0{}))));
+    decltype(make_fragment_like(tmp_tCsB)) transpose_fragments[TilesPerWarp] = {
+      make_fragment_like(tmp_tCsB),
+      make_fragment_like(tmp_tCsB)
+    };
+
+    CUTLASS_PRAGMA_NO_UNROLL
+    for (int warp_group_tile = 0; warp_group_tile < WarpgroupTileNum; ++warp_group_tile) {
+      int tmp_warp_tile_n_coord_LUT = current_warp_tile_n_coord_LUT;
+      int tmp_warp_tile_k_coord_LUT = current_warp_tile_k_coord_LUT;
+      // StepsPerWarpGroup used below is the class-level constant (Steps / NumMathWarpGroup).
+
+      if constexpr (NumMathWarpGroup == 2) {
+        tmp_warp_tile_n_coord_LUT >>= NumBitsPerStep * (warp_idx / (NumWarpsPerWarpGroup * 2));
+        tmp_warp_tile_k_coord_LUT >>= NumBitsPerStep * (warp_idx / (NumWarpsPerWarpGroup * 2));
+      }
+
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (int step_per_warp_group = 0; step_per_warp_group < StepsPerWarpGroup; ++step_per_warp_group) {
+        // For 2 math warpgroups, warp idx 4~7 is the 1st warp group and 8~11 is the 2nd, so warp_idx / 8 selects the 2nd warpgroup's step offset.
+        int step = step_per_warp_group * NumMathWarpGroup + warp_idx / (NumWarpsPerWarpGroup * 2);
+        // Decode the coords of warp tile 0 (LUT for encoded steps, closed-form pattern afterwards).
+        int warp_tile0_n = step < NumStepsEncoded ? (tmp_warp_tile_n_coord_LUT & MaskPerStep) : 4 + warp_idx_in_warp_group;
+        int warp_tile0_k = step < NumStepsEncoded ? (tmp_warp_tile_k_coord_LUT & MaskPerStep) : step - 4;
+        int warp_tile1_n = warp_tile0_n == warp_tile0_k ? warp_tile0_n + 1 : warp_tile0_k;
+        int warp_tile1_k = warp_tile0_n == warp_tile0_k ? warp_tile0_k + 1 : warp_tile0_n;
+        // `step` advances by NumMathWarpGroup per iteration, so the LUTs must skip that many 3-bit entries to stay aligned.
+        tmp_warp_tile_n_coord_LUT >>= NumBitsPerStep * NumMathWarpGroup;
+        tmp_warp_tile_k_coord_LUT >>= NumBitsPerStep * NumMathWarpGroup;
+
+        static_assert(TilesPerWarp == 2);
+
+        // [warp_tile][n/k]
+        const int warp_tile_coord[TilesPerWarp][2] = {
+          // n                                                           k
+          {warp_group_tile * NumWarpTilePerWarpgroupTile + warp_tile0_n, warp_tile0_k}, // warp_tile 0
+          {warp_group_tile * NumWarpTilePerWarpgroupTile + warp_tile1_n, warp_tile1_k}  // warp_tile 1
+        };
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int warp_tile = 0; warp_tile < TilesPerWarp; ++warp_tile) {
+          Tensor tCsB = sB_thr_copy.partition_S(
+            flatten(s_tile(_, make_coord(warp_tile_coord[warp_tile][0], warp_tile_coord[warp_tile][1])))
+          ); // (CPY, CPY_N, CPY_K)
+
+          copy(sB_tiled_copy, tCsB, transpose_fragments[warp_tile]);
+        }
+
+        // Make sure elements in two 8x8 warp tiles are all consumed
+        __syncwarp();
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int warp_tile = 0; warp_tile < TilesPerWarp; ++warp_tile) {
+          Tensor tCsB_transposed = sB_thr_copy.partition_D(
+            flatten(s_tile_transposed(_, make_coord(warp_tile_coord[warp_tile][0], warp_tile_coord[warp_tile][1])))
+          ); // (CPY, CPY_N, CPY_K)
+          copy(sB_tiled_copy, transpose_fragments[warp_tile], tCsB_transposed);
+        }
+      } // loop step_per_warp_group
+    } // loop warp_group_tile
+  }
+
+  CUTLASS_DEVICE void synchronize(int step) {
+    if (step == 0) {
+      // SMEM fence to make sure B is transposed before math
+      cutlass::arch::fence_view_async_shared();
+      cutlass::arch::NamedBarrier::sync(size(TiledMma{}), cutlass::arch::ReservedNamedBarriers::TransposeBarrier);
+    }
+  }
+
+  CUTLASS_DEVICE void synchronize() {  // Unconditional fence + named barrier over size(TiledMma{}) threads.
+    cutlass::arch::fence_view_async_shared();
+    cutlass::arch::NamedBarrier::sync(size(TiledMma{}), cutlass::arch::ReservedNamedBarriers::TransposeBarrier);
+  }
+
+  template <
+    class TensorSmemB,
+    class TensorTransposedSmemB>
+  CUTLASS_DEVICE void transpose(
+    TensorSmemB const& sB,
+    TensorTransposedSmemB const& gmma_sB,
+    int read_stage) {
+    this->operator()(sB, gmma_sB, read_stage, 0);  // Step 0 performs every step owned by this warp group.
+    synchronize();
+  }
+
+private:
+  const int warp_idx;
+  const int warp_group_thread_idx;
+  const int warp_idx_in_warp_group;         // warp_idx % NumWarpsPerWarpGroup
+  const int current_warp_tile_n_coord_LUT;  // This warp's 12-bit slice of WarpTileNCoordLUT (4 steps x 3 bits).
+  const int current_warp_tile_k_coord_LUT;  // This warp's 12-bit slice of WarpTileKCoordLUT (4 steps x 3 bits).
+};
+
+
+// Builds the operand-B transposition functor, choosing the implementation at compile time:
+//   - TransposeB is false                               -> NoTranspositionOperandB
+//   - use_universal_transposition<...>() is true        -> UniversalTranspositionOperandB
+//   - sizeof(ElementB) == 1                             -> AsyncTranspositionOperandB_1BElementB
+//   - anything else                                     -> AsyncTranspositionOperandB
+// The tag arguments only carry types; default-constructed tags are handed to the constructors.
+template<class TiledMma, class SmemLayoutB, class SmemLayoutAtomB, class ElementB, bool TransposeB>
+constexpr CUTLASS_HOST_DEVICE
+auto
+make_transpose_operand_b(
+    int warp_idx,
+    int warp_group_thread_idx,
+    TiledMma,
+    SmemLayoutB,
+    SmemLayoutAtomB,
+    ElementB,
+    cute::bool_constant<TransposeB>)
+{
+  if constexpr (TransposeB) {
+    // Transposition requested: pick the implementation from the SMEM atom layout and element width.
+    if constexpr (use_universal_transposition<SmemLayoutAtomB, ElementB>()) {
+      return UniversalTranspositionOperandB(
+          warp_idx, warp_group_thread_idx, TiledMma{}, SmemLayoutB{}, SmemLayoutAtomB{}, ElementB{});
+    }
+    else if constexpr (sizeof(ElementB) == 1) {
+      return AsyncTranspositionOperandB_1BElementB(
+          warp_idx, warp_group_thread_idx, TiledMma{}, SmemLayoutB{}, SmemLayoutAtomB{}, ElementB{});
+    }
+    else {
+      return AsyncTranspositionOperandB(
+          warp_idx, warp_group_thread_idx, TiledMma{}, SmemLayoutB{}, SmemLayoutAtomB{}, ElementB{});
+    }
+  }
+  else {
+    // No transposition requested.
+    return NoTranspositionOperandB(
+        warp_idx, warp_group_thread_idx, TiledMma{}, SmemLayoutB{}, SmemLayoutAtomB{}, ElementB{});
+  }
+}
+
+}; // namespace detail
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace collective
+} // namespace transform
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/device/transform_universal_adapter.hpp b/lightllm-kernel/cutlass/include/cutlass/transform/device/transform_universal_adapter.hpp
new file mode 100755
index 000000000..c7ab0ceb0
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/transform/device/transform_universal_adapter.hpp
@@ -0,0 +1,303 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Transform Kernel Universal adapter
+*/
+
+#pragma once
+
+// common
+#include "cutlass/cutlass.h"
+#include "cutlass/device_kernel.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/detail/layout.hpp"
+#include "cutlass/detail/mma.hpp"
+#include "cutlass/cuda_host_adapter.hpp"
+
+#include "cutlass/kernel_launch.h"
+#if !defined(__CUDACC_RTC__)
+#include "cutlass/cluster_launch.hpp"
+#include "cutlass/trace.h"
+#endif // !defined(__CUDACC_RTC__)
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::transform::device {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <class TransformKernel_>
+class TransformUniversalAdapter
+{
+public:
+  using TransformKernel = TransformKernel_;
+  using Arguments = typename TransformKernel::Arguments;
+  using Params = typename TransformKernel::Params;
+  static bool const kEnableCudaHostAdapter = CUTLASS_ENABLE_CUDA_HOST_ADAPTER;
+
+
+private:
+
+  /// Kernel API parameters object
+  Params params_;
+
+public:
+
+  /// Read-only access to the kernel Params structure held by this adapter
+  Params const& params() const {
+    return params_;  // assigned in initialize()
+  }
+
+  /// Determines whether the transform kernel can execute the given problem.
+  static Status
+  can_implement(Arguments const& args) {
+    return TransformKernel::can_implement(args);  // pure forwarder: the kernel type decides
+  }
+
+  /// Returns the workspace size, in bytes, that the transform kernel reports for `args`.
+  static size_t
+  get_workspace_size(Arguments const& args) {
+    // The adapter adds no scratch space of its own: the total is whatever the kernel asks for.
+    size_t const workspace_bytes = TransformKernel::get_workspace_size(args);
+
+    CUTLASS_TRACE_HOST("  workspace_bytes: " << workspace_bytes);
+
+    return workspace_bytes;
+  }
+
+  /// Computes the grid shape from user-facing arguments (builds a temporary Params first)
+  static dim3
+  get_grid_shape(Arguments const& args, void* workspace = nullptr) {
+    auto tmp_params = TransformKernel::to_underlying_arguments(args, workspace);  // not stored in params_
+    return TransformKernel::get_grid_shape(tmp_params);
+  }
+
+  /// Computes the grid shape from already-built kernel Params
+  static dim3
+  get_grid_shape(Params const& params) {
+    return TransformKernel::get_grid_shape(params);  // pure forwarder to the kernel type
+  }
+
+
+  /// Initializes the transform kernel state (workspace, Params and, if needed, the dynamic SMEM limit) from arguments.
+  Status
+  initialize(
+    Arguments const& args,
+    void* workspace = nullptr,
+    cudaStream_t stream = nullptr,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+
+    CUTLASS_TRACE_HOST("TransformUniversalAdapter::initialize() - workspace "
+      << workspace << ", stream: " << (stream ? "non-null" : "null")
+      << ", EnableCudaHostAdapter: " << (kEnableCudaHostAdapter ? "True" : "false"));
+
+    // Let the kernel prepare its workspace first and propagate any failure unchanged.
+    Status const workspace_status = TransformKernel::initialize_workspace(args, workspace, stream, cuda_adapter);
+    if (Status::kSuccess != workspace_status) {
+      return workspace_status;
+    }
+    // Convert the user-facing arguments into the kernel Params and cache them.
+    params_ = TransformKernel::to_underlying_arguments(args, workspace);
+    // With a CudaHostAdapter, setting function attributes is the adapter's job, not ours.
+    if constexpr (kEnableCudaHostAdapter) {
+      CUTLASS_ASSERT(cuda_adapter);
+      return Status::kSuccess;
+    }
+    else {
+      CUTLASS_ASSERT(cuda_adapter == nullptr);
+
+      // Account for dynamic smem capacity if needed: at or above this threshold the
+      // kernel's maximum dynamic shared memory size is raised explicitly.
+      constexpr int kSmemOptInThreshold = 48 << 10;
+      int const smem_bytes = TransformKernel::SharedStorageSize;
+
+      if (smem_bytes >= kSmemOptInThreshold) {
+        CUTLASS_TRACE_HOST("  Setting smem size to " << smem_bytes);
+        cudaError_t attr_result = cudaFuncSetAttribute(
+            device_kernel<TransformKernel>,
+            cudaFuncAttributeMaxDynamicSharedMemorySize,
+            smem_bytes);
+        if (attr_result != cudaSuccess) {
+          attr_result = cudaGetLastError(); // to clear the error bit
+          CUTLASS_TRACE_HOST("  cudaFuncSetAttribute() returned error: " << cudaGetErrorString(attr_result));
+          return Status::kErrorInternal;
+        }
+      }
+    }
+    return Status::kSuccess;
+  }
+
+  /// Launches the transform kernel with an already-constructed Params object.
+  ///
+  /// Kernels whose ArchTag::kMinComputeCapability is at least 90 are launched through the
+  /// extended (cluster) launch API with a fixed 1x1x1 cluster; all other kernels go through
+  /// cutlass::kernel_launch. When kEnableCudaHostAdapter is set, the launch is delegated to
+  /// `cuda_adapter` instead.
+  ///
+  /// \param params           Kernel parameters; passed to the kernel by address.
+  /// \param stream           CUDA stream to launch on.
+  /// \param cuda_adapter     Asserted non-null when kEnableCudaHostAdapter is set, null otherwise.
+  /// \param kernel_index     Forwarded to the host adapter on the cluster-launch path only; the
+  ///                         other host-adapter path always passes 0.
+  /// \param launch_with_pdl  Forwarded to the launcher. Rejected with Status::kErrorInternal on
+  ///                         the cluster-launch path when a host adapter is in use.
+  /// \return Status::kSuccess when both the launcher and cudaGetLastError() report success,
+  ///         Status::kErrorInternal otherwise.
+  static Status
+  run(Params& params,
+      cudaStream_t stream = nullptr,
+      CudaHostAdapter *cuda_adapter = nullptr,
+      int32_t kernel_index = 0,
+      bool launch_with_pdl = false) {
+    CUTLASS_TRACE_HOST("TransformUniversalAdapter::run()");
+    dim3 const block = TransformKernel::get_block_shape();
+    dim3 const grid = get_grid_shape(params);
+
+    // configure smem size and carveout
+    int smem_size = TransformKernel::SharedStorageSize;
+
+    Status launch_result{ Status::kSuccess };
+    // Use extended launch API only for mainloops that use it
+    if constexpr (TransformKernel::ArchTag::kMinComputeCapability >= 90) {
+      // Currently only support 1x1x1 for transform kernel.
+      dim3 const cluster = {1,1,1};
+      void* kernel_params[] = {&params};
+
+      if constexpr (kEnableCudaHostAdapter) {
+        //
+        // Use the cuda host adapter
+        //
+        CUTLASS_ASSERT(cuda_adapter);
+        if (cuda_adapter) {
+
+          if (launch_with_pdl) {
+            CUTLASS_TRACE_HOST(
+              "TransformUniversalAdapter::run() does not support launching with PDL and a custom cuda adapter.");
+            return Status::kErrorInternal;
+          }
+          launch_result = cuda_adapter->launch(grid,
+                                               cluster,
+                                               block,
+                                               smem_size,
+                                               stream,
+                                               kernel_params,
+                                               kernel_index);
+          CUTLASS_TRACE_HOST("Kernel Launch Result" << cutlassGetStatusString(launch_result));
+        }
+        else {
+          return Status::kErrorInternal;
+        }
+      }
+      else {
+        CUTLASS_ASSERT(cuda_adapter == nullptr);
+        void const* kernel = (void const*) device_kernel<TransformKernel>;
+        // The enclosing branch already restricts this path to kMinComputeCapability >= 90.
+        // Launch for every architecture on this path rather than only for an ArchTag of
+        // exactly 90; otherwise a newer ArchTag would skip the launch and still report kSuccess.
+        launch_result = ClusterLauncher::launch(
+          grid, cluster, block, smem_size, stream, kernel, kernel_params, launch_with_pdl);
+      }
+    }
+    else {
+      launch_result = Status::kSuccess;
+      cutlass::arch::synclog_setup();
+
+      if constexpr (kEnableCudaHostAdapter) {
+        CUTLASS_ASSERT(cuda_adapter);
+        if (cuda_adapter) {
+          void* kernel_params[] = {&params};
+
+          launch_result = cuda_adapter->launch(
+            grid, block, smem_size, stream, kernel_params, 0
+          );
+
+        }
+        else {
+          return Status::kErrorInternal;
+        }
+      }
+      else {
+        CUTLASS_ASSERT(cuda_adapter == nullptr);
+        cutlass::kernel_launch<TransformKernel>(grid, block, smem_size, stream, params, launch_with_pdl);
+      }
+    }
+
+    // A launch counts as successful only if neither the CUDA runtime nor the launcher complained.
+    cudaError_t result = cudaGetLastError();
+    if (cudaSuccess == result && Status::kSuccess == launch_result) {
+      return Status::kSuccess;
+    }
+    else if (cudaSuccess != result) {
+      CUTLASS_TRACE_HOST("  Kernel launch failed. Reason: " << cudaGetErrorString(result));
+    }
+    else if (Status::kSuccess != launch_result) {
+      CUTLASS_TRACE_HOST("  Kernel launch failed. Reason: " << cutlassGetStatusString(launch_result));
+    }
+    return Status::kErrorInternal;
+  }
+
+  //
+  // Non-static launch overloads that first create and set the internal params struct of this kernel handle.
+  //
+
+  /// Builds the internal Params from `args` via initialize() and, if that succeeds,
+  /// launches the kernel with them.
+  ///
+  /// \return The status of initialize() when it fails, otherwise the status of the launch.
+  Status
+  run(
+    Arguments const& args,
+    void* workspace = nullptr,
+    cudaStream_t stream = nullptr,
+    CudaHostAdapter *cuda_adapter = nullptr,
+    int32_t kernel_index = 0,
+    bool launch_with_pdl = false
+  ) {
+    Status const init_status = initialize(args, workspace, stream, cuda_adapter);
+    if (Status::kSuccess != init_status) {
+      // Nothing was launched; report why initialization failed.
+      return init_status;
+    }
+
+    return run(params_, stream, cuda_adapter, kernel_index, launch_with_pdl);
+  }
+
+  /// Function-call form of run(args, ...): constructs Params from `args` and launches,
+  /// always using kernel index 0.
+  Status
+  operator()(
+    Arguments const& args,
+    void* workspace = nullptr,
+    cudaStream_t stream = nullptr,
+    CudaHostAdapter *cuda_adapter = nullptr,
+    bool launch_with_pdl = false) {
+    constexpr int32_t kKernelIndex = 0;
+    return run(args, workspace, stream, cuda_adapter, kKernelIndex, launch_with_pdl);
+  }
+
+  /// Re-launches the kernel with the Params cached in `params_` (set by initialize()),
+  /// without rebuilding them. Always uses kernel index 0.
+  Status
+  run(
+    cudaStream_t stream = nullptr,
+    CudaHostAdapter *cuda_adapter = nullptr,
+    bool launch_with_pdl = false) {
+    constexpr int32_t kKernelIndex = 0;
+    return run(params_, stream, cuda_adapter, kKernelIndex, launch_with_pdl);
+  }
+
+  /// Function-call form of the re-launch overload: launches with the cached `params_`
+  /// and kernel index 0.
+  Status
+  operator()(
+    cudaStream_t stream = nullptr,
+    CudaHostAdapter *cuda_adapter = nullptr,
+    bool launch_with_pdl = false) {
+    constexpr int32_t kKernelIndex = 0;
+    return run(params_, stream, cuda_adapter, kKernelIndex, launch_with_pdl);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::transform::device
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/kernel/filter_format_transformer.hpp b/lightllm-kernel/cutlass/include/cutlass/transform/kernel/filter_format_transformer.hpp
new file mode 100755
index 000000000..9f54c93f1
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/transform/kernel/filter_format_transformer.hpp
@@ -0,0 +1,223 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+   \brief Convolution filter format transformation kernel.
+*/
+
+#pragma once
+
+#include <algorithm>
+#include <random>
+
+#include "cutlass/coord.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/cuda_host_adapter.hpp"
+
+#include "cute/int_tuple.hpp"
+#include "cute/tensor.hpp"
+#include "cute/config.hpp"
+
+namespace cutlass::transform::kernel {
+
+using namespace cute;
+
+/// Filter tensor formats understood by ConvFilterFormatTransformer.
+/// The transformer statically requires CKTRS as the source format and CTRSK or KTRSC
+/// as the destination format.
+/// NOTE(review): each name presumably spells the order of the filter modes (K, C and the
+/// spatial modes T, R, S) — confirm against the convolution documentation.
+enum class FilterFormat {
+  CKTRS,
+  CTRSK,
+  KTRSC
+};
+
+/// Device kernel functor that rewrites a convolution filter tensor from one FilterFormat
+/// into another.
+///
+/// \tparam SrcFormat       Source format; must be FilterFormat::CKTRS.
+/// \tparam DstFormat       Destination format; must be FilterFormat::CTRSK or FilterFormat::KTRSC.
+/// \tparam NumDimensions   Number of entries in the filter extent.
+/// \tparam Element_        Element type of both tensors.
+/// \tparam AlignmentBytes  Width in bytes of one destination access; must be a positive
+///                         multiple of sizeof(Element).
+template <
+  FilterFormat SrcFormat,
+  FilterFormat DstFormat,
+  int NumDimensions,
+  class Element_,
+  int AlignmentBytes = 16
+>
+struct ConvFilterFormatTransformer {
+  
+  using Element = Element_;
+  static_assert(SrcFormat == FilterFormat::CKTRS, "Currently only source format of CKTRS is supported");
+  static_assert(DstFormat == FilterFormat::CTRSK || DstFormat == FilterFormat::KTRSC, "Currently only destination format of CTRSK/KTRSC is supported");
+  static_assert(AlignmentBytes > 0 && AlignmentBytes % static_cast<int>(sizeof(Element)) == 0, "Invalid alignment setting");
+
+  // In ktrsc order.
+  using FilterExtent = array<int, NumDimensions>;
+
+  // CTA tile shape: (4 * AlignmentBytes / sizeof(Element)) x 32,
+  // e.g. 32x32 for 2-byte elements with the default 16-byte alignment.
+  static constexpr auto CTATileShape = make_shape(Int<4 * AlignmentBytes / static_cast<int>(sizeof(Element))>{}, Int<32>{});
+  // Default thread layout: (4, 32), i.e. 128 threads per CTA.
+  static constexpr auto ThreadLayout = make_layout(make_shape(Int<4>{}, Int<32>{}));
+
+  // Launch bounds; MaxThreadsPerBlock equals the size of ThreadLayout.
+  static constexpr uint32_t MaxThreadsPerBlock = 128;
+  static constexpr uint32_t MinBlocksPerMultiprocessor = 1;
+
+  using ArchTag = arch::Sm90;
+
+  // Default ctor
+  CUTLASS_HOST_DEVICE
+  ConvFilterFormatTransformer() {}
+
+  // Host-side arguments supplied by the caller.
+  struct Arguments {
+    // Source filter data, interpreted as Element.
+    const void *src_ptr;
+    // Destination filter data, interpreted as Element.
+    void *dst_ptr;
+    // Filter extent (see FilterExtent).
+    FilterExtent filter_extent;
+  };
+
+  // Kernel-side parameters built by to_underlying_arguments().
+  struct Params {
+    // Tensor types are taken via decltype from make_tensor over a NumDimensions-mode source
+    // layout and a 2-mode int32_t destination layout.
+    using TensorSrc = decltype(make_tensor(make_gmem_ptr(recast_ptr<const Element>(nullptr)), make_layout(take<0,NumDimensions>(FilterExtent{}))));
+    using TensorDst = decltype(make_tensor(make_gmem_ptr(recast_ptr<Element>(nullptr)), make_layout(make_shape(int32_t(0), int32_t(0)))));
+
+    TensorSrc src;
+    TensorDst dst; 
+  };
+
+  struct SharedStorage {
+    /* empty, no smem needed */
+  };
+
+  static constexpr int SharedStorageSize = sizeof(SharedStorage);
+
+  /// Checks the alignment rule: the extent chosen as contiguous for DstFormat
+  /// (filter_extent[0] for CTRSK, filter_extent[NumDimensions - 1] for KTRSC) must be a
+  /// multiple of AlignmentBytes / sizeof(Element).
+  /// \return Status::kSuccess if the rule holds, Status::kInvalid otherwise.
+  static Status
+  can_implement(Arguments const& args) {
+    bool implementable = true;
+    // alignment rule
+    {
+      int contiguous_dim = DstFormat == FilterFormat::CTRSK ? args.filter_extent[0] : args.filter_extent[NumDimensions - 1];
+      int align_element = AlignmentBytes / static_cast<int>(sizeof(Element));
+
+      implementable &= (contiguous_dim % align_element == 0);
+
+      if (!implementable) {
+        CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Alignment setting is invalid.\n");
+        return Status::kInvalid;
+      }
+    }
+
+    return Status::kSuccess;
+  }
+
+  /// This kernel needs no workspace; always returns 0.
+  static size_t
+  get_workspace_size(Arguments const& args) {
+    return 0;
+  }
+
+  /// Returns a 1D block with one thread per entry of ThreadLayout.
+  static dim3
+  get_block_shape() {
+    return dim3(size(shape(ThreadLayout)), 1, 1);
+  }
+
+  /// Returns a 2D grid with one CTA per CTATileShape tile of the destination tensor,
+  /// rounding up in both modes.
+  static dim3
+  get_grid_shape(Params const& params) {
+    auto dim_m = ceil_div(size<0>(shape(params.dst)), get<0>(CTATileShape));
+    auto dim_n = ceil_div(size<1>(shape(params.dst)), get<1>(CTATileShape));
+
+    return dim3(dim_m, dim_n, 1);
+  }
+
+  /// No workspace to initialize; all arguments are ignored and Status::kSuccess is returned.
+  static cutlass::Status
+  initialize_workspace(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr,
+    CudaHostAdapter *cuda_adapter = nullptr) {
+    return Status::kSuccess;
+  }
+
+  /// Builds the source and destination tensors from the raw pointers and the filter extent.
+  /// `workspace` is not used.
+  static Params
+  to_underlying_arguments(Arguments const& args, void* workspace) {
+    // The first and last extents are k and c; the extents in between are reversed.
+    auto k = args.filter_extent[0];
+    auto c = args.filter_extent[NumDimensions - 1];
+    auto srt = reverse(take<1,NumDimensions - 1>(args.filter_extent));
+
+    // source shape (s,r,t,k,c)
+    auto shape_src = flatten(make_shape(srt, k, c));
+    // The destination is viewed as 2D: (k, c * s*r*t) for CTRSK and (c, k * s*r*t) for KTRSC.
+    auto shape_dst = DstFormat == FilterFormat::CTRSK ? make_shape(k, c * product(srt)) : make_shape(c, k * product(srt));
+
+    auto src = make_tensor(make_gmem_ptr(recast_ptr<const Element>(args.src_ptr)), make_layout(shape_src));
+    auto dst = make_tensor(make_gmem_ptr(recast_ptr<Element>(args.dst_ptr)), make_layout(shape_dst));
+
+    return Params{src, dst};
+  }
+
+  /// Kernel body: each CTA fills one tile of the destination tensor. `smem_buf` is not
+  /// referenced because SharedStorage is empty.
+  CUTLASS_DEVICE
+  void operator()(Params const& params, char *smem_buf) {
+    // Tile the input tensor into blocks
+    auto block_coord = make_coord(blockIdx.x, blockIdx.y);
+    // Same shape as CTATileShape, rebuilt locally.
+    auto block_shape = make_shape(Int<4 * AlignmentBytes / static_cast<int>(sizeof(Element))>{}, Int<32>{});
+    // Default thread layout: (4, 32)
+    auto thread_layout = make_layout(make_shape(Int<4>{}, Int<32>{}));
+    // Each thread owns AlignmentBytes / sizeof(Element) consecutive elements along mode 0.
+    auto vec_layout = make_layout(make_shape(Int<AlignmentBytes / static_cast<int>(sizeof(Element))>{}, Int<1>{}));
+
+    Tensor tile_D = local_tile(params.dst, block_shape, block_coord);
+
+    // Construct tiled copy
+    using AccessType = cutlass::AlignedArray<Element, size(vec_layout)>;
+    using Atom = Copy_Atom<UniversalCopy<AccessType>, Element>;
+
+    auto tiled_copy = make_tiled_copy(Atom{}, thread_layout, vec_layout);
+    auto thr_copy = tiled_copy.get_thread_slice(threadIdx.x);
+    Tensor thr_tile_D = thr_copy.partition_D(tile_D);
+
+    // shape (s, r, t)
+    auto shape_trs = take<0, NumDimensions - 2>(shape(params.src));
+    // strided_c = c for format CTRSK, strided_c = k for format KTRSC
+    auto strided_c = DstFormat == FilterFormat::CTRSK ? get<NumDimensions - 1>(shape(params.src)) : get<NumDimensions - 2>(shape(params.src));
+    // shape (s, r, t, c) for format CTRSK and shape (s, r, t, k) for format KTRSC 
+    auto shape_ctrs = append<NumDimensions - 1>(shape_trs, strided_c);
+    // Linear index along destination mode 1 handled by this thread (tile offset from blockIdx.y
+    // plus threadIdx.x / 4 within the tile), converted to a coordinate in shape_ctrs.
+    auto srtc_coord = idx2crd(int(blockIdx.y * get<1>(block_shape) + threadIdx.x / size<0>(thread_layout)), shape_ctrs);
+    // index of k for format CTRSK and index of c for format KTRSC
+    // n_layout maps (blockIdx.x, threadIdx.x % 4) to the first such index of this thread:
+    // one tile width per block and one vector width per thread.
+    auto n_layout = make_layout(make_shape(gridDim.x, size<0>(thread_layout)), make_stride(size<0>(block_shape), size<0>(vec_layout)));
+    int n_idx = n_layout(make_coord(blockIdx.x, threadIdx.x % size<0>(thread_layout)));
+
+    // Fragment to load from S and store to D
+    auto frag = make_fragment_like(thr_tile_D);
+    // Predicate tensor.
+    Tensor thr_tile_P = make_tensor<bool>(shape(thr_tile_D));
+
+    // Gather the fragment element by element from the source; out-of-bounds coordinates are
+    // only marked in the predicate and their fragment entries are left unwritten.
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < size(frag); ++i) {
+      auto srt_coord = take<0, NumDimensions - 2>(srtc_coord);
+      // Source coordinate order is (..., k, c); which of the two comes from n_idx depends on DstFormat.
+      auto kc_coord = DstFormat == FilterFormat::CTRSK ?
+          make_coord(n_idx+i, get<NumDimensions - 2>(srtc_coord)) :
+          make_coord(get<NumDimensions - 2>(srtc_coord), n_idx+i);
+      auto coord = flatten(make_coord(srt_coord, kc_coord)); 
+      thr_tile_P(i) = elem_less(coord, shape(params.src));
+      if (thr_tile_P(i)) {
+        frag(i) = params.src(coord);
+      }
+    }
+
+    // Copy from RMEM to GMEM
+    copy_if(tiled_copy, thr_tile_P, frag, thr_tile_D);
+  }
+};
+
+} // namespace cutlass::transform::kernel
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/kernel/sm90_sparse_gemm_compressor.hpp b/lightllm-kernel/cutlass/include/cutlass/transform/kernel/sm90_sparse_gemm_compressor.hpp
new file mode 100755
index 000000000..0ae7bab06
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/transform/kernel/sm90_sparse_gemm_compressor.hpp
@@ -0,0 +1,578 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+  \brief Compression utilities specific to SM90 structured-sparse kernels
+*/
+
+#pragma once
+
+#include "cute/container/bit_field.hpp"    // cute::bit_field
+#include "cute/numeric/numeric_types.hpp"  // cute::sizeof_bits_v, cute::uint_bit_t
+#include "cute/tensor.hpp"                 // cute::Tensor, cute::make_tensor
+#include "cute/algorithm/cooperative_copy.hpp" // cute::cooperative_copy
+#include "cutlass/arch/arch.h"             // cutlass::arch::Sm90
+#include "cutlass/cuda_host_adapter.hpp"   // cutlass::CudaHostAdapter
+#include "cutlass/cutlass.h"               // cutlass::Status
+#include "cutlass/gemm/gemm.h"             // cutlass::TagToStrideA_t
+#include "cutlass/fast_math.h"             // cutlass::ceil_div, cutlass::round_up
+#include "cutlass/kernel_hardware_info.h"  // cutlass::KernelHardwareInfo
+#include "cutlass/numeric_size.h"          // cutlass::bits_to_bytes
+#include "cutlass/cuda_host_adapter.hpp"   // cutlass::CudaHostAdapter
+
+namespace cutlass::transform::kernel {
+
+using namespace cute;
+
+template<
+  class ProblemShape_,
+  class ElementA_,
+  class LayoutATag_,
+  class SparseConfig_
+>
+class SM90StructuredSparseCompressor {
+public:
+  using SparseConfig = SparseConfig_;
+  using ProblemShape = ProblemShape_;
+
+  // * EltA
+  using ElementA = ElementA_;
+  using ElementAUint = cute::uint_bit_t<cute::sizeof_bits_v<ElementA>>;
+  using ElementAMma = typename SparseConfig::ElementAMma;
+  using ElementAMmaRaw = typename SparseConfig::ElementAMmaRaw;
+  using ElementAMmaRawUnit = cute::uint_bit_t<cute::sizeof_bits_v<ElementAMmaRaw>>;
+  using ElementASparsity = typename SparseConfig::ElementASparsity;
+  using ElementAMmaSparsity = typename SparseConfig::ElementAMmaSparsity;
+  using ElementAUintCompressed = cute::sparse_elem<ElementASparsity{}, ElementAUint>;
+  using LayoutATag = LayoutATag_;
+  using LayoutA = LayoutATag;
+  using StrideA = cutlass::gemm::TagToStrideA_t<LayoutATag>;
+
+  // * EltE
+  using ElementEMma = typename SparseConfig::ElementEMma;
+  using ElementEMmaRaw = typename SparseConfig::ElementEMmaRaw;
+  using ElementEMmaSparsity = typename SparseConfig::ElementEMmaSparsity;
+  // Data Type for storing one chunk's metadata
+  static constexpr int ElementEBitsPerChunk = typename SparseConfig::ElementEBitsPerChunk{};
+  CUTE_STATIC_ASSERT(ElementEBitsPerChunk == 4, "ElementEBitsPerChunk is 4 for SM90");
+  using ElementEChunk = cute::uint_bit_t<ElementEBitsPerChunk>;
+  CUTE_STATIC_ASSERT(cute::is_same_v<ElementEChunk, cute::uint4_t>, "ElementEChunk is uint4_t for SM90");
+  using ElementESparsityPerChunk = Int<ElementEMmaSparsity{} / (cute::sizeof_bits_v<ElementEMmaRaw> / ElementEBitsPerChunk)>;
+
+  // AtomE
+  using TensorEAtom = typename SparseConfig::TensorEAtom;
+  using TensorEAtomK = typename SparseConfig::TensorEAtomK;
+  using TensorEAtomM = typename SparseConfig::TensorEAtomM;
+
+  static constexpr int ElemsARawPerElementAMmaRaw = typename SparseConfig::ElemsARawPerElementAMmaRaw{};
+  static constexpr int LogicalElemsAPerChunk = typename SparseConfig::LogicalElemsAPerChunk{};
+  static constexpr int PhysicalElemsAPerChunk = typename SparseConfig::PhysicalElemsAPerChunk{};
+  static constexpr int LogicalElemsAMmaRawPerChunk = cutlass::ceil_div(LogicalElemsAPerChunk, ElemsARawPerElementAMmaRaw);
+  static constexpr int PhysicalElemsAMmaRawPerChunk = cutlass::ceil_div(PhysicalElemsAPerChunk, ElemsARawPerElementAMmaRaw);
+
+  // * Alignment
+  static constexpr int TensorEAlignmentM = typename SparseConfig::TensorEAlignmentM{};
+  static constexpr int TensorEAlignmentK = typename SparseConfig::TensorEAlignmentK{};
+  static constexpr int TensorAAlignmentK = typename SparseConfig::TensorAAlignmentK{};
+  static constexpr int TensorAAlignmentM = typename SparseConfig::TensorAAlignmentM{};
+
+  // Required by `device_kernel`
+  static constexpr int MaxThreadsPerBlock = TensorEAtomM{};
+  static constexpr int MinBlocksPerMultiprocessor = 1;
+  using ArchTag = arch::Sm90;
+
+  // Shared-memory staging buffers for one CTA; each array holds cute::size(TensorEAtom{}) entries.
+  struct SharedStorage {
+    // Metadata (E) tile.
+    ElementEMma cEsE[cute::size(TensorEAtom{})];
+    // Compressed A tile.
+    ElementAUintCompressed cACsAC[cute::size(TensorEAtom{})];
+    // Uncompressed A tile, filled from global memory before compression.
+    ElementAUint cAsA[cute::size(TensorEAtom{})];
+  };
+
+  static constexpr int SharedStorageSize = sizeof(SharedStorage);
+
+  // Pointers and stride describing the tensors the compressor works on.
+  struct TransformArguments {
+    // Source tensor A; the kernel views it with shape (GemmM, GemmK, GemmL) and stride dA.
+    void const* ptr_A{nullptr};
+    // Stride of A.
+    StrideA dA{};
+    // Buffer for the compressed A tensor.
+    void* ptr_ACompress{nullptr};
+    // Buffer for the metadata tensor E.
+    void* ptr_E{nullptr};
+  };
+
+  using TransformParams = TransformArguments;
+
+  // Host-side arguments; converted into Params by to_underlying_arguments().
+  struct Arguments {
+    // Problem shape; the kernel unpacks it as (M, N, K, L).
+    ProblemShape problem_shape{};
+    // Tensor pointers and stride of A.
+    TransformArguments transform{};
+    // Hardware information, copied into Params unchanged.
+    KernelHardwareInfo hw_info{};
+  };
+
+  // Kernel-side parameters: the Arguments fields plus the workspace pointer.
+  struct Params {
+    // Problem shape; the kernel unpacks it as (M, N, K, L).
+    ProblemShape problem_shape{};
+    // Tensor pointers and stride of A (same type as TransformArguments).
+    TransformParams transform{};
+    // Hardware information copied from Arguments.
+    KernelHardwareInfo hw_info{};
+    // Workspace pointer passed to to_underlying_arguments().
+    void* workspace = nullptr;
+  };
+
+public:
+  /// Converts host-side Arguments into kernel Params, copying every field and attaching
+  /// the `workspace` pointer.
+  static Params
+  to_underlying_arguments(Arguments const& args, void* workspace = nullptr) {
+    CUTLASS_TRACE_HOST("SM90StructuredSparseCompressor::to_underlying_arguments()");
+
+    // Field-by-field copy of the transform description.
+    TransformParams const transform_params{
+      args.transform.ptr_A,
+      args.transform.dA,
+      args.transform.ptr_ACompress,
+      args.transform.ptr_E};
+
+    return Params{{args.problem_shape}, transform_params, {args.hw_info}, workspace};
+  }
+
+  /// Reports whether the problem can be compressed: K must be a multiple of
+  /// LogicalElemsAPerChunk.
+  /// \return Status::kSuccess if so, Status::kErrorInvalidProblem otherwise.
+  static Status
+  can_implement(Arguments const& args) {
+    auto [M, N, K, L] = args.problem_shape;
+
+    bool const k_has_partial_chunk = (K % LogicalElemsAPerChunk != 0);
+    if (!k_has_partial_chunk) {
+      CUTLASS_TRACE_HOST("SM90StructuredSparseCompressor::can_implement() (True)");
+      return Status::kSuccess;
+    }
+
+    CUTLASS_TRACE_HOST("SM90 Sparse Compressor CAN NOT IMPLEMENT: GemmK not multiplier of logical chunk size");
+    return Status::kErrorInvalidProblem;
+  }
+
+  /// Returns the workspace size in bytes, which is reported as SharedStorageSize
+  /// regardless of `args` (kept for backward compatibility with the host compressor).
+  static size_t
+  get_workspace_size(Arguments const& args) {
+    CUTLASS_UNUSED(args);
+    CUTLASS_TRACE_HOST("SM90StructuredSparseCompressor::get_workspace_size() (" << SharedStorageSize << ")");
+    return static_cast<size_t>(SharedStorageSize);
+  }
+
+  /// Workspace initialization hook. Nothing needs to be set up, so every argument is
+  /// ignored and Status::kSuccess is returned.
+  static Status
+  initialize_workspace(
+    Arguments const& args,
+    void* workspace = nullptr,
+    cudaStream_t stream = nullptr,
+    CudaHostAdapter *cuda_adapter = nullptr) {
+    CUTLASS_TRACE_HOST("SM90StructuredSparseCompressor::initialize_workspace()");
+
+    // Silence unused-parameter warnings.
+    CUTLASS_UNUSED(args);
+    CUTLASS_UNUSED(workspace);
+    CUTLASS_UNUSED(stream);
+    CUTLASS_UNUSED(cuda_adapter);
+
+    return Status::kSuccess;
+  }
+
+  /// Computes the launch grid: x covers M and y covers K in units of the E atom, after
+  /// rounding both up to the larger of the E and A alignments; z equals L.
+  static dim3
+  get_grid_shape(Params const& params) {
+    const auto [GemmM, GemmN, GemmK, GemmL] = params.problem_shape;
+
+    // Round M and K up to the stricter of the two alignment requirements.
+    constexpr int kAlignM = cutlass::const_max(TensorEAlignmentM, TensorAAlignmentM);
+    constexpr int kAlignK = cutlass::const_max(TensorEAlignmentK, TensorAAlignmentK);
+    const int padded_m = cutlass::round_up(GemmM, kAlignM);
+    const int padded_k = cutlass::round_up(GemmK, kAlignK);
+
+    // One CTA per TensorEAtomM x TensorEAtomK tile, one z-slice per L.
+    const int ctas_m = cutlass::ceil_div(padded_m, TensorEAtomM{});
+    const int ctas_k = cutlass::ceil_div(padded_k, TensorEAtomK{});
+    const int slices_l = GemmL;
+
+    CUTLASS_TRACE_HOST("SM90StructuredSparseCompressor::get_grid_shape() ("
+      << ctas_m << ", "
+      << ctas_k << ", "
+      << slices_l << ")");
+    return dim3(ctas_m, ctas_k, slices_l);
+  }
+
+  /// Returns the thread-block shape: a 1D block of MaxThreadsPerBlock threads.
+  static dim3
+  get_block_shape() {
+    dim3 const block(MaxThreadsPerBlock, 1, 1);
+    CUTLASS_TRACE_HOST("SM90StructuredSparseCompressor::get_block_shape() ("
+      << MaxThreadsPerBlock << ", "
+      << 1 << ", "
+      << 1 << ")");
+    return block;
+  }
+
+  /// Kernel entry point: compresses the tile assigned to the calling CTA.
+  /// `smem_buf` is interpreted as this kernel's SharedStorage.
+  CUTE_DEVICE
+  void
+  operator()(Params params, void* smem_buf = nullptr) {
+    // Same work as the static run(), invoked directly.
+    structure_sparse_compress(params, smem_buf);
+  }
+
+  /// Static form of the kernel entry point; forwards to structure_sparse_compress().
+  CUTE_DEVICE static void
+  run(Params params, void* smem_buf = nullptr)
+  {
+    structure_sparse_compress(params, smem_buf);
+  }
+
+private:
+
+  /// Accumulates the metadata of one chunk; selected as MetadataOneChunk when
+  /// SparseConfig::IsTfmma is true.
+  struct MetadataOneChunk1to2 {
+
+    /// ORs the 4-bit pattern for logical element `elt_log_idx` (0 or 1) into the nibble
+    /// selected by physical slot `elt_phy_idx`.
+    CUTE_DEVICE
+    void set_metadata_bits(int elt_log_idx, int elt_phy_idx) {
+      CUTLASS_ASSERT(elt_log_idx >= 0 && elt_log_idx < 2);
+
+      // Logical element 0 is encoded as 0b0100, logical element 1 as 0b1110.
+      // NOTE(review): presumably each pattern is a pair of 2-bit indices — confirm.
+      uint8_t pattern = 0;
+      switch (elt_log_idx) {
+        case 0:
+          pattern = 0b0100;
+          break;
+        case 1:
+          pattern = 0b1110;
+          break;
+        default:
+          CUTE_GCC_UNREACHABLE;
+      }
+
+      storage_ |= (pattern << (4 * elt_phy_idx));
+    }
+
+    /// Returns the accumulated bits converted to ElementEChunk.
+    CUTE_DEVICE
+    ElementEChunk storage() const {
+      return ElementEChunk{storage_};
+    }
+
+  private:
+    // Accumulated metadata bits; starts out cleared.
+    uint8_t storage_ = 0b0000;
+  };
+
+  /// Accumulates the metadata of one chunk; selected as MetadataOneChunk when
+  /// SparseConfig::IsTfmma is false.
+  struct MetadataOneChunk2to4{
+
+    /// ORs the 2-bit code of logical element `elt_log_idx` (0..3) into the 2-bit field
+    /// selected by physical slot `elt_phy_idx`.
+    CUTE_DEVICE
+    void set_metadata_bits(int elt_log_idx, int elt_phy_idx) {
+      CUTLASS_ASSERT(elt_log_idx >= 0 && elt_log_idx < 4);
+
+      // The code of each logical element is its own index written in two bits.
+      uint8_t code = 0;
+      switch (elt_log_idx) {
+        case 0:
+          code = 0b00;
+          break;
+        case 1:
+          code = 0b01;
+          break;
+        case 2:
+          code = 0b10;
+          break;
+        case 3:
+          code = 0b11;
+          break;
+        default:
+          CUTE_GCC_UNREACHABLE;
+      }
+
+      storage_ |= (code << (2 * elt_phy_idx));
+    }
+
+    /// Returns the accumulated bits converted to ElementEChunk.
+    CUTE_DEVICE
+    ElementEChunk storage() const {
+      return ElementEChunk{storage_};
+    }
+
+  private:
+    // Accumulated metadata bits; starts out cleared.
+    uint8_t storage_ = 0b0000;
+  };
+
+  using MetadataOneChunk = cute::conditional_t<SparseConfig::IsTfmma,
+                                               MetadataOneChunk1to2,
+                                               MetadataOneChunk2to4>;
+
+private:
+
+  CUTE_DEVICE
+  static void
+  structure_sparse_compress(Params params, void* smem_buf) {
+    // * Input Params
+    auto [GemmM, GemmN, GemmK, GemmL] = params.problem_shape;
+    auto [ptr_A, dA, ptr_ACompress, ptr_E] = params.transform;
+    SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(smem_buf);
+
+    [[maybe_unused]] const int gridDim_X = gridDim.x;
+    [[maybe_unused]] const int gridDim_Y = gridDim.y;
+    [[maybe_unused]] const int gridDim_Z = gridDim.z;
+    [[maybe_unused]] const int blockDim_X = blockDim.x;
+
+    // * Global Tensor Layout
+    const cute::Layout layout_gA = make_layout(make_shape(GemmM, GemmK, GemmL), dA);
+    const cute::Layout layout_gAC = SparseConfig::fill_layoutA(params.problem_shape);
+    const cute::Layout layout_gE = SparseConfig::fill_layoutE(params.problem_shape);
+
+    // * Construct Global Tensor
+    const cute::Tensor gA   = make_tensor(make_gmem_ptr(cute::recast_ptr<ElementAUint>(ptr_A)), layout_gA);
+    cute::Tensor gAC_sparse = make_tensor(make_gmem_ptr(cute::recast_ptr<ElementAUintCompressed>(ptr_ACompress)), layout_gAC );
+    cute::Tensor gAC        = cute::recast<ElementAUint>(gAC_sparse);
+    cute::Tensor gE_sparse  = make_tensor(make_gmem_ptr(cute::recast_ptr<ElementEMma>(ptr_E)), layout_gE);
+    cute::Tensor gE         = cute::recast<ElementEMmaRaw>(gE_sparse);
+
+    // * CTA Tensor Layout
+    using cAsA_layout_row = decltype(make_layout(make_shape(TensorEAtomM{}, TensorEAtomK{}), LayoutRight{}));
+    using cAsA_layout_col = decltype(make_layout(make_shape(TensorEAtomM{}, TensorEAtomK{}), LayoutLeft{}));
+    using cAsA_layout     = cute::conditional_t<cute::is_same_v<LayoutATag, layout::RowMajor>, cAsA_layout_row, cAsA_layout_col>;
+    using cACsAC_layout   = decltype(make_layout(make_shape(TensorEAtomM{}, TensorEAtomK{} / ElementASparsity{}), LayoutRight{}));
+    using cEsE_layout     = decltype(make_layout(make_shape(TensorEAtomM{}, TensorEAtomK{} / ElementEMmaSparsity{}), LayoutRight{}));
+
+    CUTE_STATIC_ASSERT(cute::is_static_v<TensorEAtom>, "TensorEAtom needs to be static");
+    CUTE_STATIC_ASSERT(cute::is_static_v<cAsA_layout>, "cAsA_layout needs to be static");
+    CUTE_STATIC_ASSERT(cute::is_static_v<cACsAC_layout>, "cACsAC_layout needs to be static");
+    CUTE_STATIC_ASSERT(cute::is_static_v<cEsE_layout>, "cEsE_layout needs to be static");
+
+    const int blockIdx_X = blockIdx.x;
+    const int blockIdx_Y = blockIdx.y;
+    const int blockIdx_Z = blockIdx.z;
+    const int threadIdx_X = threadIdx.x;
+
+    // * Construct CTA Tensor
+    const auto cta_coord = make_coord(blockIdx_X, blockIdx_Y, blockIdx_Z);
+    cute::Tensor cAgA   = cute::recast<ElementAMmaRawUnit>(local_tile(gA, shape(cAsA_layout{}), cta_coord));
+    cute::Tensor cACgAC = cute::recast<ElementAMmaRawUnit>(local_tile(gAC, shape(cACsAC_layout{}), cta_coord));
+    cute::Tensor cEgE   = local_tile(gE, shape(cEsE_layout{}), cta_coord);
+
+    cute::Tensor cAsA   = cute::recast<ElementAMmaRawUnit>(make_tensor(make_smem_ptr(cute::recast_ptr<ElementAUint>(shared_storage.cAsA)), cAsA_layout{}));
+    cute::Tensor cACsAC = cute::recast<ElementAMmaRawUnit>(make_tensor(make_smem_ptr(cute::recast_ptr<ElementAUint>(shared_storage.cACsAC)), cACsAC_layout{}));
+    cute::Tensor cEsE   = make_tensor(make_smem_ptr(cute::recast_ptr<ElementEMmaRaw>(shared_storage.cEsE)), cEsE_layout{});
+    cute::Tensor cEsE_chunk = cute::recast<ElementEChunk>(cEsE);
+
+    // * Handle in unit of Chunk when compress
+    using OneChunkSizeA  = Int<LogicalElemsAMmaRawPerChunk>;
+    using OneChunkSizeAC = Int<PhysicalElemsAMmaRawPerChunk>;
+    using OneChunkSizeE  = Int<LogicalElemsAPerChunk / ElementESparsityPerChunk{}>;
+    using NumOneChunkK   = Int<cutlass::ceil_div(TensorEAtomK{}, LogicalElemsAPerChunk)>;
+
+    cute::Tensor cAsA_log_chunk   = logical_divide(cAsA, make_shape(_, OneChunkSizeA{}));
+    cute::Tensor cACsAC_log_chunk = logical_divide(cACsAC, make_shape(_, OneChunkSizeAC{}));
+    cute::Tensor cEsE_log_chunk   = logical_divide(cEsE_chunk, make_shape(_, OneChunkSizeE{}));
+
+    // * Corner Case Handle
+    const auto GemmM_within_Cta = (GemmM - blockIdx_X * TensorEAtomM{} > TensorEAtomM{}) ? TensorEAtomM{} : GemmM - blockIdx_X * TensorEAtomM{};
+    const auto GemmK_within_Cta = ( (GemmK - blockIdx_Y * TensorEAtomK{} > TensorEAtomK{}) ? TensorEAtomK{} : GemmK - blockIdx_Y * TensorEAtomK{} ) / ElemsARawPerElementAMmaRaw;
+    const auto GemmK_NumOneChunk_within_Cta = GemmK_within_Cta / LogicalElemsAMmaRawPerChunk;
+
+    const auto GemmMAlignedAC = cutlass::round_up(GemmM, TensorAAlignmentM);
+    const auto GemmKAlignedAC = cutlass::round_up(GemmK, TensorAAlignmentK);
+    const auto GemmMAlignedAC_within_Cta = (GemmMAlignedAC - blockIdx_X * TensorEAtomM{} > TensorEAtomM{}) ? TensorEAtomM{} : GemmMAlignedAC - blockIdx_X * TensorEAtomM{};
+    const auto GemmKAlignedAC_within_Cta = ( (GemmKAlignedAC - blockIdx_Y * TensorEAtomK{} > TensorEAtomK{}) ? TensorEAtomK{} : GemmKAlignedAC - blockIdx_Y * TensorEAtomK{} ) / ElemsARawPerElementAMmaRaw;
+
+    // * Clear CTA Smem Tensor
+    cooperative_clear<MaxThreadsPerBlock>(threadIdx_X, cACsAC);
+    cooperative_clear<MaxThreadsPerBlock>(threadIdx_X, cEsE);
+
+    // * Input CTA Tensor G to S
+    if (GemmM_within_Cta == TensorEAtomM{} && GemmK_within_Cta == TensorEAtomK{}) {
+      copy_vec_pred<false, LayoutATag>(cAgA, cAsA, threadIdx_X, GemmM_within_Cta, GemmK_within_Cta);
+    }
+    else {
+      copy_vec_pred<true, LayoutATag>(cAgA, cAsA, threadIdx_X, GemmM_within_Cta, GemmK_within_Cta);
+    }
+
+    // * Compress
+    // cACsAC is always row major order
+    // TensorEAtomM threads perform the compression, each thread compress one row
+    const int row_i = threadIdx_X;
+    if (row_i < GemmM_within_Cta) {
+
+      CUTE_UNROLL
+      for (int col_chunk_i = 0; col_chunk_i < NumOneChunkK{}; ++col_chunk_i) {
+        if (col_chunk_i < GemmK_NumOneChunk_within_Cta) {
+          // Compress is handled in unit of ElementAMmaRawUnit
+          cute::Tensor tAsA   = cAsA_log_chunk(row_i, make_coord(_, col_chunk_i));
+          cute::Tensor tACsAC = cACsAC_log_chunk(row_i, make_coord(_, col_chunk_i));
+          cute::Tensor tEsE   = cEsE_log_chunk(row_i, make_coord(_, col_chunk_i));
+
+          int non_zero_cnt = 0;
+          // Non-zero element index
+          // e.g.
+          //  2:4 sparsity [x 0 0 x]
+          //  non_zero_elt_log_idx = [0, 3]
+          int non_zero_elt_log_idx[OneChunkSizeAC{}] = { 0 };
+
+          // * Find Non-Zero Element Idx within Chunk
+          CUTE_UNROLL
+          for (int elt_log_idx = 0; elt_log_idx < OneChunkSizeA{}; ++elt_log_idx) {
+            ElementAMmaRawUnit elem_A = tAsA[elt_log_idx];
+            if ( elem_A != ElementAMmaRawUnit{0} ) {
+              non_zero_elt_log_idx[non_zero_cnt] = elt_log_idx;
+              tACsAC[non_zero_cnt] = elem_A;
+              non_zero_cnt++;
+            }
+          }
+
+          // * Corner Case for 2:4 sparsity
+          if constexpr (cute::sizeof_bits_v<ElementAMmaRawUnit> < 32) {
+            // i.e. [0 0 0 x] -> [(0) 0 0 x]
+            if (non_zero_cnt == 1 && non_zero_elt_log_idx[0] == 3) {
+              tACsAC[1] = tACsAC[0];
+              tACsAC[0] = ElementAMmaRawUnit{0};
+              non_zero_elt_log_idx[0] = 0;
+              non_zero_elt_log_idx[1] = 3;
+            }
+            // i.e. [0 0 x 0] -> [0 0 x (0)]
+            // i.e. [0 x 0 0] -> [0 x 0 (0)]
+            // i.e. [x 0 0 0] -> [x 0 0 (0)]
+            else if (non_zero_cnt == 1) {
+              tACsAC[1] = ElementAMmaRawUnit{0};
+              non_zero_elt_log_idx[1] = 3;
+            }
+          }
+
+          // * Set Metadata Bits
+          MetadataOneChunk metadata_one_chunk;
+          CUTE_UNROLL
+          for (int elt_phy_idx = 0; elt_phy_idx < OneChunkSizeAC{}; elt_phy_idx++) {
+            metadata_one_chunk.set_metadata_bits(non_zero_elt_log_idx[elt_phy_idx], elt_phy_idx);
+          }
+          tEsE[0] = metadata_one_chunk.storage();
+
+        }
+        else {
+          break;
+        }
+      }
+    }
+
+    // * Sync after Compress
+    __syncthreads();
+
+    // * Output Cta Tensor S to G
+    if (GemmM_within_Cta > 0 && GemmK_within_Cta > 0) {
+      constexpr int MaxVecBits = 128; // STG.128
+      cute::cooperative_copy<MaxThreadsPerBlock, MaxVecBits>(threadIdx_X, cEsE, cEgE);
+    }
+
+    if (GemmMAlignedAC_within_Cta == TensorEAtomM{} && GemmKAlignedAC_within_Cta == TensorEAtomK{}) {
+      copy_vec_pred<false, LayoutATag>(cACsAC, cACgAC, threadIdx_X, GemmMAlignedAC_within_Cta, (GemmKAlignedAC_within_Cta / ElementASparsity::value));
+    }
+    else {
+      copy_vec_pred<true, LayoutATag>(cACsAC, cACgAC, threadIdx_X, GemmMAlignedAC_within_Cta, (GemmKAlignedAC_within_Cta / ElementASparsity::value));
+    }
+
+  } // end of structure_sparse_compress()
+
+  template<uint32_t NumThreads,   // thread count used to build the (NumThreads, 1) partitioning layout
+           typename TensorSrc>    // tensor type of the buffer being cleared
+  CUTE_DEVICE
+  static void
+  cooperative_clear(              // clears dSrc cooperatively: each thread clears only its own partition, then all threads synchronize
+    uint32_t const& tid,          // index of the calling thread within the partitioning layout
+    TensorSrc dSrc) {             // tensor to clear; passed by value (presumably a non-owning view, so the underlying storage is what gets cleared — confirm)
+    
+    auto dSrctSrc = local_partition(dSrc, make_layout(make_shape(NumThreads, _1{})), tid);  // this thread's slice of dSrc
+    cute::clear(dSrctSrc);
+
+    // Block-wide barrier after the per-thread clears, before any thread continues
+    __syncthreads();
+  }
+
+  template <bool pred,              // true: guard every element copy with valid_rows / valid_cols; false: copy the whole tile unguarded
+            typename LayoutTag,     // cutlass::layout::RowMajor selects the row-major path; any other tag takes the column-major path
+            typename TensorSrc,     // source tensor type (its shape must be static)
+            typename TensorDst>     // destination tensor type (static shape, same size and element bit width as the source)
+  CUTE_DEVICE
+  static void
+  copy_vec_pred(                    // copies the tile dSrc -> dDst, dividing the work over MaxThreadsPerBlock threads, then synchronizes
+      TensorSrc dSrc,
+      TensorDst dDst,
+      int threadIdx_X,              // linear thread index; decomposed below into this thread's (row, col) position
+      int valid_rows,               // only rows [0, valid_rows) are written when pred is true; unused when pred is false
+      int valid_cols) {             // only cols [0, valid_cols) are written when pred is true; unused when pred is false
+
+    constexpr bool IsRowMajor = cute::is_same_v<LayoutTag, cutlass::layout::RowMajor>;
+    using Element = typename TensorSrc::element_type;
+    CUTE_STATIC_ASSERT(cute::is_static_v<decltype(shape(dSrc))>, "shape(dSrc) needs to be static");
+    CUTE_STATIC_ASSERT(cute::is_static_v<decltype(shape(dDst))>, "shape(dDst) needs to be static");
+    CUTE_STATIC_ASSERT(cute::sizeof_bits_v<typename TensorSrc::element_type> == cute::sizeof_bits_v<typename TensorDst::element_type>,
+      "dSrc and dDst need to have same element bit width");
+    CUTE_STATIC_ASSERT(cute::size(dSrc) == cute::size(dDst), "dSrc and dDst need to have same size");
+
+    // ValueShape: the group of elements one thread handles per step — 128 bits' worth of elements,
+    using ValueShape = 
+      cute::conditional_t<IsRowMajor,
+                          Shape<Int<1>, Int<128 / sizeof_bits_v<Element>>>,   // row-major: 1 row x N consecutive columns
+                          Shape<Int<128 / sizeof_bits_v<Element>>, Int<1>>>   // otherwise: N consecutive rows x 1 column
+      ;
+
+    constexpr int ValueShapeRows = shape<0>(ValueShape{});
+    constexpr int ValueShapeCols = shape<1>(ValueShape{});
+
+    // ThreadShape: how MaxThreadsPerBlock threads are arranged over the tile (rows x cols of threads)
+    using ThreadShape = 
+      cute::conditional_t<IsRowMajor,
+                          Shape<Int<MaxThreadsPerBlock / (shape<1>(dSrc) / ValueShapeCols)>, Int<                     (shape<1>(dSrc) / ValueShapeCols)>>,
+                          Shape<Int<                     (shape<0>(dSrc) / ValueShapeRows)>, Int<MaxThreadsPerBlock / (shape<0>(dSrc) / ValueShapeRows)>>>
+      ;
+
+    constexpr int ThreadShapeRows = shape<0>(ThreadShape{});
+    constexpr int ThreadShapeCols = shape<1>(ThreadShape{});
+
+    const int threadIdx_X_row = threadIdx_X / ThreadShapeCols;   // this thread's row within ThreadShape
+    const int threadIdx_X_col = threadIdx_X % ThreadShapeCols;   // this thread's column within ThreadShape
+
+    // Row-major path: outer loops walk row blocks then column chunks; inner loops walk the elements of one ValueShape group
+    if constexpr (IsRowMajor) {
+      CUTE_UNROLL
+      for (int iter_row_blk = 0; iter_row_blk < cutlass::ceil_div(shape<0>(dSrc), ThreadShapeRows * ValueShapeRows); ++iter_row_blk) {
+        CUTE_UNROLL
+        for (int col_chunk_i = 0; col_chunk_i < cutlass::ceil_div(shape<1>(dSrc) , ThreadShapeCols * ValueShapeCols); ++col_chunk_i) {
+          CUTE_UNROLL
+          for (int iter_row_thr = 0; iter_row_thr < ValueShapeRows; ++iter_row_thr) {
+            CUTE_UNROLL
+            for (int iter_col_thr = 0; iter_col_thr < ValueShapeCols; ++iter_col_thr) {
+              const int row_i = (iter_row_blk * ThreadShapeRows + threadIdx_X_row) * ValueShapeRows + iter_row_thr;
+              const int col_i = (col_chunk_i * ThreadShapeCols + threadIdx_X_col) * ValueShapeCols + iter_col_thr;
+              if constexpr ( (not pred)  // unpredicated instantiation: no bounds check is generated
+              ) {
+                dDst(row_i, col_i) = dSrc(row_i, col_i);
+              }
+              else {
+                if (row_i < valid_rows && col_i < valid_cols) {   // skip elements outside the valid region
+                  dDst(row_i, col_i) = dSrc(row_i, col_i);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // Column-major path: same index formulas, with the column loops placed outside the row loops
+    else {
+      CUTE_UNROLL
+      for (int col_chunk_i = 0; col_chunk_i < cutlass::ceil_div(shape<1>(dSrc) , ThreadShapeCols * ValueShapeCols); ++col_chunk_i) {
+        CUTE_UNROLL
+        for (int iter_row_blk = 0; iter_row_blk < cutlass::ceil_div(shape<0>(dSrc), ThreadShapeRows * ValueShapeRows); ++iter_row_blk) {
+          CUTE_UNROLL
+          for (int iter_col_thr = 0; iter_col_thr < ValueShapeCols; ++iter_col_thr) {
+            CUTE_UNROLL
+            for (int iter_row_thr = 0; iter_row_thr < ValueShapeRows; ++iter_row_thr) {
+              const int row_i = (iter_row_blk * ThreadShapeRows + threadIdx_X_row) * ValueShapeRows + iter_row_thr;
+              const int col_i = (col_chunk_i * ThreadShapeCols + threadIdx_X_col) * ValueShapeCols + iter_col_thr;
+              if constexpr ( (not pred)  // unpredicated instantiation: no bounds check is generated
+              ) {
+                dDst(row_i, col_i) = dSrc(row_i, col_i);
+              }
+              else {
+                if (row_i < valid_rows && col_i < valid_cols) {   // skip elements outside the valid region
+                  dDst(row_i, col_i) = dSrc(row_i, col_i);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  
+    // Block-wide barrier after the copy, before any thread continues
+    __syncthreads();
+  } // end of copy_vec_pred()
+  
+};
+
+}  // namespace cutlass::transform::kernel
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/kernel/sparse_gemm_compressor.hpp b/lightllm-kernel/cutlass/include/cutlass/transform/kernel/sparse_gemm_compressor.hpp
new file mode 100755
index 000000000..51f42e9fd
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/transform/kernel/sparse_gemm_compressor.hpp
@@ -0,0 +1,284 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+  \brief Compress utils for structured sparse kernels
+*/
+
+#pragma once
+
+#include <algorithm>                       // std::fill
+#include <array>                           // std::array
+#include <random>                          // std::mt19937
+
+#include "cute/numeric/numeric_types.hpp"  // cute::sizeof_bits_v
+#include "cute/tensor.hpp"                 // cute::Tensor, cute::make_tensor
+#include "cutlass/arch/arch.h"             // cutlass::arch::SmXY
+#include "cutlass/gemm/gemm.h"             // cutlass::TagToStrideA_t
+#include "cutlass/fast_math.h"             // cutlass::ceil_div, cutlass::round_up
+#include "cutlass/numeric_size.h"          // cutlass::bits_to_bytes
+
+#include "cutlass/transform/kernel/sm90_sparse_gemm_compressor.hpp"
+
+namespace cutlass::transform::kernel {
+
+template<
+  class ProblemShape_,   // problem extents; modes 0, 2 and 3 are read as M, K and L
+  class ElementA_,       // element type of tensor A
+  class LayoutATag_,     // layout tag of A; converted to StrideA through TagToStrideA_t
+  class SparseConfig_    // provides the element types, sparsity factors, chunk sizes and alignments used below
+>
+class StructuredSparseCompressorUtility {   // sizing helper for compressed A and metadata E, plus a zero-mask filler for A
+public:
+  using SparseConfig = SparseConfig_;
+  using ProblemShape = ProblemShape_;
+
+  //* EltA
+  using ElementA = ElementA_;
+  using LayoutATag = LayoutATag_;
+  using StrideA = cutlass::gemm::TagToStrideA_t<LayoutATag>;
+  using ElementAMmaRaw = typename SparseConfig::ElementAMmaRaw;
+  using ElementASparsity = typename SparseConfig::ElementASparsity;
+  using ElementAMmaSparsity = typename SparseConfig::ElementAMmaSparsity;
+
+  //* EltE
+  using ElementEMmaRaw = typename SparseConfig::ElementEMmaRaw;
+  using ElementEMmaSparsity = typename SparseConfig::ElementEMmaSparsity;
+
+  //* AtomE
+  using TensorEAtom = typename SparseConfig::TensorEAtom;
+  using TensorEAtomK = typename SparseConfig::TensorEAtomK;
+  using TensorEAtomM = typename SparseConfig::TensorEAtomM;
+
+  static constexpr int ElemsARawPerElementAMmaRaw = typename SparseConfig::ElemsARawPerElementAMmaRaw{};
+  static constexpr int LogicalElemsAPerChunk = typename SparseConfig::LogicalElemsAPerChunk{};
+  static constexpr int PhysicalElemsAPerChunk = typename SparseConfig::PhysicalElemsAPerChunk{};
+  static constexpr int LogicalElemsAMmaRawPerChunk = cutlass::ceil_div(LogicalElemsAPerChunk, ElemsARawPerElementAMmaRaw);
+  static constexpr int PhysicalElemsAMmaRawPerChunk = cutlass::ceil_div(PhysicalElemsAPerChunk, ElemsARawPerElementAMmaRaw);
+
+  //* Alignment
+  static constexpr int TensorEAlignmentM = typename SparseConfig::TensorEAlignmentM{};
+  static constexpr int TensorEAlignmentK = typename SparseConfig::TensorEAlignmentK{};
+  static constexpr int TensorAAlignmentK = typename SparseConfig::TensorAAlignmentK{};
+  static constexpr int TensorAAlignmentM = typename SparseConfig::TensorAAlignmentM{};
+
+  StructuredSparseCompressorUtility() = default;   // leaves every extent at its -1 "unset" initializer
+
+  StructuredSparseCompressorUtility(ProblemShape problem, StrideA dA) {
+    set_problem_size(problem, dA);
+  }
+
+  void set_problem_size(ProblemShape problem, StrideA dA_) {   // records M/K/L and the stride, and recomputes the aligned extents
+    M = cute::size<0>(problem);
+    K = cute::size<2>(problem);
+    L = cute::size<3>(problem);
+
+    // The following four values are logical element counts (the getters divide by the sparsity factors)
+    K_alignedA  = round_up(K, TensorAAlignmentK);
+    M_alignedA  = round_up(M, TensorAAlignmentM);
+    K_alignedE = round_up(K, TensorEAlignmentK);
+    M_alignedE = round_up(M, TensorEAlignmentM);
+
+    dA = dA_;
+  }
+
+  /**
+   * @brief Get the TensorE number of ElementE along M after alignment requirement
+   * 
+   * @return int : number of ElementE (uint8_t) along M-dim
+   */
+  int get_metadata_m_physical() const {
+    return M_alignedE;
+  }
+
+  /**
+   * @brief Get the TensorE number of ElementE along K after alignment requirement
+   * 
+   * @return int : number of ElementE (uint8_t) along K-dim
+   */
+  int get_metadata_k_physical() const {
+    return K_alignedE / ElementEMmaSparsity{};
+  }
+
+  /**
+   * @brief Get the TensorACompressed number of ElementA along K after alignment requirement
+   * 
+   * @return int : number of ElementA along K-dim
+   */
+  int get_tensorA_k_physical() const {
+    return K_alignedA / ElementASparsity{};
+  }
+
+  /**
+   * @brief Get the TensorACompressed number of ElementA along M after alignment requirement
+   * 
+   * @return int : number of ElementA along M-dim
+   */
+  int get_tensorA_m_physical() const {
+    return M_alignedA;
+  }
+
+  /**
+   * @brief Get the TensorACompressed Bytes
+   *        (each factor is widened to uint64_t before multiplying, so the element count cannot overflow int)
+   * @return uint64_t bytes
+   */
+  uint64_t get_compressed_tensor_A_bytes() const {
+    const auto tensor_a_comp_num_elt_a = uint64_t(get_tensorA_m_physical()) * uint64_t(get_tensorA_k_physical()) * uint64_t(L);
+    const auto tensor_a_comp_bytes = cutlass::bits_to_bytes<uint64_t>(tensor_a_comp_num_elt_a * cute::sizeof_bits_v<ElementA>);
+    return tensor_a_comp_bytes;
+  }
+
+  /**
+   * @brief Get the TensorA Bytes
+   *        (computed from the unaligned M, K and L)
+   * @return uint64_t bytes
+   */
+  uint64_t get_raw_tensor_A_bytes() const {
+    const auto tensor_a_num_elt_a = uint64_t(M) * uint64_t(K) * uint64_t(L);
+    const auto tensor_a_bytes = cutlass::bits_to_bytes<uint64_t>(tensor_a_num_elt_a * cute::sizeof_bits_v<ElementA>);
+    return tensor_a_bytes;
+  }
+
+  /**
+   * @brief Get the TensorE Bytes
+   *        (aligned metadata extents times L, sized in ElementEMmaRaw)
+   * @return uint64_t bytes
+   */
+  uint64_t get_tensor_E_bytes() const {
+    const auto tensor_e_num_elt_a = uint64_t(get_metadata_m_physical()) * uint64_t(get_metadata_k_physical()) * uint64_t(L);
+    const auto tensor_e_bytes = cutlass::bits_to_bytes<uint64_t>(tensor_e_num_elt_a * cute::sizeof_bits_v<ElementEMmaRaw>);
+    return tensor_e_bytes;
+  }
+
+  constexpr auto fill_layoutA_from_compressor() const {   // forwards (M, 1, K, L) to SparseConfig::fill_layoutA
+    return SparseConfig::fill_layoutA(cute::make_tuple(M,_1{},K,L));
+  }
+
+  constexpr auto fill_layoutE_from_compressor() const {   // forwards (M, 1, K, L) to SparseConfig::fill_layoutE
+    return SparseConfig::fill_layoutE(cute::make_tuple(M,_1{},K,L));
+  }
+
+  void structure_sparse_zero_mask_fill(void* host_a_ptr, uint64_t seed) {   // zeroes a randomly chosen half of each K-chunk of A in place; seed initializes the RNG
+    
+    constexpr int ChunkSize = LogicalElemsAMmaRawPerChunk;
+    using ChunkElement = cute::uint_bit_t<cute::sizeof_bits_v<ElementAMmaRaw>>;   // unsigned integer with the bit width of ElementAMmaRaw
+
+    cute::Tensor gA_eltA = cute::make_tensor(
+        cute::recast_ptr<ElementA>(host_a_ptr),
+        cute::make_layout(make_shape(M, K, L), dA));
+
+    // Input TensorA is handled in unit of ElementAMmaRaw instead of ElementA
+    cute::Tensor gA = cute::recast<ChunkElement>(gA_eltA);
+
+    // Extract out the Chunk from K-mode
+    Tensor gA_chunk = cute::zipped_divide(gA, cute::Shape<_1,cute::Int<ChunkSize>>{}); // (Chunk, Rest)
+
+    // Half of the data is zero to indicate sparsityA = 2: odd slots are marked 1 (keep), even slots stay 0 (zero out)
+    std::array<int, ChunkSize> nnzb_indicator{};
+    for (size_t i = 1; i < nnzb_indicator.size(); i += 2) {
+      nnzb_indicator.at(i) = 1;
+    }
+
+    std::mt19937 rng(seed);
+    auto rest_shape = cute::shape<1>(gA_chunk);
+    for (auto iter = cute::make_coord_iterator(rest_shape); iter != cute::ForwardCoordIteratorSentinel{}; ++iter) {
+      std::shuffle(nnzb_indicator.begin(), nnzb_indicator.end(), rng);     // new keep/zero pattern for every chunk
+      for (int c = 0; c < size<0>(gA_chunk); ++c) {                        // for each elem within chunk
+        if (nnzb_indicator[c] == 0) {
+          gA_chunk(c, *iter) = ChunkElement{0};
+        }
+      }  // end of within chunk
+    }    // end of chunk_idx
+  }
+
+  int M{-1};      // problem extent read from mode 0; -1 until set_problem_size() runs
+  int K{-1};      // problem extent read from mode 2; -1 until set_problem_size() runs
+  int L{-1};      // problem extent read from mode 3; -1 until set_problem_size() runs
+  StrideA dA{};   // stride of the uncompressed tensor A
+
+private:
+  int K_alignedA{-1};   // K rounded up to TensorAAlignmentK
+  int M_alignedA{-1};   // M rounded up to TensorAAlignmentM
+  int K_alignedE{-1};   // K rounded up to TensorEAlignmentK
+  int M_alignedE{-1};   // M rounded up to TensorEAlignmentM
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+template<
+  class ProblemShape,
+  class ElementA,
+  class LayoutATag,
+  class SparseConfig,
+  class ArchTag        // architecture tag; only the specializations of this template provide a Compressor
+>
+struct StructuredSparseCompressorSelector {   // primary template: fallback used when no specialization matches ArchTag
+  static_assert(cutlass::detail::dependent_false<ArchTag>,   // keyed on ArchTag (presumably false for every type — confirm), so it fires only when this fallback is instantiated
+      "Could not select a structured sparse compressor for given parameters.");
+};
+
+template<
+  class ProblemShape,
+  class ElementA,
+  class LayoutATag,
+  class SparseConfig
+>
+struct StructuredSparseCompressorSelector<   // partial specialization for ArchTag == arch::Sm90
+    ProblemShape,
+    ElementA,
+    LayoutATag,
+    SparseConfig,
+    arch::Sm90> {
+  using Compressor = SM90StructuredSparseCompressor<   // the remaining four parameters are forwarded unchanged
+    ProblemShape,
+    ElementA,
+    LayoutATag,
+    SparseConfig
+  >;
+};
+
+template<
+  class ProblemShape,
+  class ElementA,
+  class LayoutATag,
+  class SparseConfig,
+  class ArchTag        // picks which StructuredSparseCompressorSelector specialization is used
+>
+using StructuredSparseCompressor = typename StructuredSparseCompressorSelector<   // alias for the selector's nested Compressor type
+    ProblemShape,
+    ElementA,
+    LayoutATag,
+    SparseConfig,
+    ArchTag
+>::Compressor;
+
+} // End namespace cutlass::transform::kernel
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/pitch_linear_thread_map.h b/lightllm-kernel/cutlass/include/cutlass/transform/pitch_linear_thread_map.h
new file mode 100755
index 000000000..0fcb48e56
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/transform/pitch_linear_thread_map.h
@@ -0,0 +1,926 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing how threads are mapped to a given tile.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/layout/pitch_linear.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace transform {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Strip-mines a pitch-linear tile among a given number of threads, first along
+/// the contiguous dimension then along the strided dimension.
+///
+/// The tile must be divisible by the thread count such that all threads may
+/// execute the same number of iterations with the same delta to exhaustively
+/// cover the tile.
+///
+/// This class satisfies the "RegularThreadMapping" concept.
+///
+/// This ThreadMap is used by SIMT kernels and operand E of the sparse tensor
+/// kernels.
+template <
+  typename Shape_,            // tile shape; kContiguous and kStrided are read from it
+  int Threads,                // number of participating threads
+  int ElementsPerAccess = 1   // elements moved per access along the contiguous dimension
+>
+struct PitchLinearStripminedThreadMap {
+  
+  /// Tensor coordinate
+  using TensorCoord = layout::PitchLinearCoord;
+
+  /// Tile shape
+  using Shape = Shape_;
+
+  /// Number of threads total
+  static int const kThreads = Threads;
+
+  /// Vector length: elements per access, taken from the ElementsPerAccess template parameter
+  static int const kElementsPerAccess = ElementsPerAccess;
+
+  /// Shape of access by each thread
+  using ThreadAccessShape = layout::PitchLinearShape<kElementsPerAccess, 1>;
+
+  /// Internal implementation details
+  struct Detail {
+
+    static_assert(!(Shape::kContiguous % kElementsPerAccess), "");   // contiguous extent must be a whole number of vectors
+
+    /// Shape of the tile in units of vectors
+    using ShapeVec = layout::PitchLinearShape<
+      Shape::kContiguous / kElementsPerAccess,
+      Shape::kStrided
+    >;
+
+    static_assert((Threads < ShapeVec::kContiguous && !(ShapeVec::kContiguous % kThreads)) ||   // fewer threads than vectors per row: the row must split evenly over the threads
+                      (!(kThreads % ShapeVec::kContiguous)),                                    // otherwise: the threads must form whole rows
+                  "Shape must be divisible by number of iterations of each thread.");
+  };
+
+  /// Number of iterations by each thread (first alternative: threads cover at least one full row of vectors)
+  using Iterations = typename platform::conditional<
+      Threads >= Detail::ShapeVec::kContiguous,
+      layout::PitchLinearShape<
+          1,
+          // Redo the comparison here to work around divide by zero compiler
+          // error.  The compiler evaluates both path of platform::conditional.
+          (Threads >= Detail::ShapeVec::kContiguous
+               ? (Detail::ShapeVec::kStrided + (kThreads / Detail::ShapeVec::kContiguous - 1)) /   // ceil(kStrided / rows covered per pass)
+                     (kThreads / Detail::ShapeVec::kContiguous)
+               : 0)>,
+      layout::PitchLinearShape<Detail::ShapeVec::kContiguous / kThreads,
+                               Detail::ShapeVec::kStrided>>::type;
+  
+
+  /// Interval between accesses along each dimension of the tensor's logical coordinate space
+  /// (in units of Elements)
+  using Delta = typename platform::conditional<
+    Threads >= Detail::ShapeVec::kContiguous,
+    layout::PitchLinearShape<
+      1,
+      kThreads / Detail::ShapeVec::kContiguous
+    >,
+    layout::PitchLinearShape<
+      kThreads * kElementsPerAccess,
+      1
+    >
+  >::type;
+
+  /// Shape of the tile as covered by all iterations, in elements (strided extent is Iterations::kStrided whole passes when threads cover full rows)
+  using StorageShape = typename platform::conditional<
+      Threads >= Detail::ShapeVec::kContiguous,
+      layout::PitchLinearShape<Shape::kContiguous,
+                               Iterations::kStrided*(kThreads / Detail::ShapeVec::kContiguous)>,
+      layout::PitchLinearShape<Shape::kContiguous, Shape::kStrided>>::type;
+
+  /// Maps thread ID to a coordinate offset within the tensor's logical coordinate space
+  /// (in units of Elements)
+  CUTLASS_HOST_DEVICE
+  static TensorCoord initial_offset(int thread_id) {
+    return TensorCoord(
+      (thread_id % Detail::ShapeVec::kContiguous) * kElementsPerAccess, 
+      thread_id / Detail::ShapeVec::kContiguous);
+  }
+};
+
+/// This ThreadMap is used by GEMV. Each thread owns one contiguous run of Iterations::kContiguous vectors.
+template <
+  typename Shape,             // tile shape; kContiguous and kStrided are read from it
+  int Threads,                // number of participating threads
+  int ElementsPerAccess = 1   // elements per access along the contiguous dimension
+>
+struct PitchLinearTilePolicyStripminedThreadContiguous
+{
+  static_assert((Shape::kContiguous % (Threads * ElementsPerAccess)) == 0,
+              "Contiguous shape must be divisible by (number of threads * elements per access)");
+
+  using TensorCoord = layout::PitchLinearCoord;
+
+  static int const kThreads = Threads;
+  static int const kElementsPerAccess = ElementsPerAccess;
+
+  using Iterations = layout::PitchLinearShape<   // per-thread iteration counts (contiguous, strided)
+                      Shape::kContiguous / (kThreads * kElementsPerAccess),
+                      Shape::kStrided>;
+
+  using Delta = layout::PitchLinearShape<1, 1>;
+
+  CUTLASS_HOST_DEVICE
+  static TensorCoord initial_offset(int thread_id)   // starting coordinate of thread_id: strided component is always 0
+  {
+    return TensorCoord(thread_id * Iterations::kContiguous * kElementsPerAccess, 0);
+  }
+};
+
+template <
+  typename Shape,             // tile shape; kContiguous and kStrided are read from it
+  int Threads,                // number of participating threads
+  int ElementsPerAccess = 1   // elements per access along the contiguous dimension
+>
+struct PitchLinearTilePolicyStripminedThreadStrided   // each thread owns a band of Iterations::kStrided consecutive strided positions
+{
+  static_assert((Shape::kStrided % Threads == 0),
+                "Strided shape must be divisible by number of threads");
+
+  using TensorCoord = layout::PitchLinearCoord;
+
+  static int const kThreads = Threads;
+  static int const kElementsPerAccess = ElementsPerAccess;
+
+  using Iterations = layout::PitchLinearShape<   // per-thread iteration counts (contiguous, strided)
+                      Shape::kContiguous / kElementsPerAccess,
+                      Shape::kStrided / kThreads>;
+
+  using Delta = layout::PitchLinearShape<1, 1>;
+
+  using ShapeVec = Shape;
+
+  CUTLASS_HOST_DEVICE
+  static TensorCoord initial_offset(int thread_id)   // starting coordinate of thread_id: contiguous component is always 0
+  {
+
+    return TensorCoord(0, thread_id * Iterations::kStrided);
+  }
+};
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Policy defining a warp-raked arrangement in which a shape is partitioned into contiguous
+/// elements.
+///
+/// This ThreadMap is used by tensor core kernels.
+template <
+  typename Shape_,                   // tile shape; kContiguous and kStrided are read from it
+  int Threads,                       // number of participating threads
+  typename WarpThreadArrangement_,   // arrangement of threads inside one warp (kContiguous, kStrided, kCount are read)
+  int ElementsPerAccess = 1          // elements per access along the contiguous dimension
+>
+struct PitchLinearWarpRakedThreadMap {
+
+  /// Tensor coordinate
+  using TensorCoord = layout::PitchLinearCoord;
+
+  /// Tile shape
+  using Shape = Shape_;
+
+  /// Number of threads total
+  static int const kThreads = Threads;
+
+  /// Vector length: elements per access, taken from the ElementsPerAccess template parameter
+  static int const kElementsPerAccess = ElementsPerAccess;
+
+  /// Shape of access by each thread
+  using ThreadAccessShape = layout::PitchLinearShape<kElementsPerAccess, 1>;
+
+  /// Internal details made public to facilitate introspection
+  struct Detail {
+
+    /// Fixed arrangement of threads within a warp (units of threads).
+    using WarpThreadArrangement = WarpThreadArrangement_;
+
+    /// Number of threads per warp
+    static int const kWarpSize = WarpThreadArrangement::kCount;
+
+    /// Number of participating warps
+    static int const kWarpCount = kThreads / kWarpSize;
+
+    static_assert(
+      !(Shape::kContiguous % kElementsPerAccess),
+      "Shape must be divisible by vector length.");
+
+    /// Compute the 'shape' of the overall tile in units of vectors
+    using ShapeInAccesses = layout::PitchLinearShape<
+      Shape::kContiguous / kElementsPerAccess,
+      Shape::kStrided
+    >;
+
+    static_assert(
+      !(ShapeInAccesses::kContiguous % WarpThreadArrangement::kContiguous),
+      "ShapeInAccesses must be divisible by WarpThreadArrangement.");
+
+    static_assert(
+      !(ShapeInAccesses::kStrided % WarpThreadArrangement::kStrided),
+      "ShapeInAccesses must be divisible by WarpThreadArrangement.");
+
+    // compute number of warp-level accesses total
+    using WarpAccessIterations = layout::PitchLinearShape<
+      ShapeInAccesses::kContiguous / WarpThreadArrangement::kContiguous,
+      ShapeInAccesses::kStrided / WarpThreadArrangement::kStrided
+    >;
+
+    // Divide it into the number of warps, first partitioning the strided dimension then the
+    // contiguous.
+    static int const kWarpsStrided =
+        (WarpAccessIterations::kStrided >= kWarpCount
+             ? kWarpCount                          // enough strided accesses: every warp goes along the strided dimension
+             : WarpAccessIterations::kStrided);    // otherwise: one warp per strided access
+
+    static int const kWarpsContiguous =
+        (kWarpCount > WarpAccessIterations::kStrided
+             ? kWarpCount / kWarpsStrided          // leftover warps are spread along the contiguous dimension
+             : 1);
+
+    /// Arrangement of warps within a threadblock-scoped tile
+    using WarpArrangement = layout::PitchLinearShape<
+      kWarpsContiguous, kWarpsStrided
+    >;
+  };
+
+  ///< Iterations along each dimension (concept: PitchLinearShape)
+  using Iterations = layout::PitchLinearShape<
+    Detail::WarpAccessIterations::kContiguous / Detail::kWarpsContiguous,
+    Detail::WarpAccessIterations::kStrided / Detail::kWarpsStrided
+  >;
+
+  static_assert(Iterations::kCount,
+    "Number of iterations must be non-zero");
+
+  ///< Delta between accesses (units of elements, concept: PitchLinearShape)
+  using Delta = layout::PitchLinearShape<
+    Detail::WarpThreadArrangement::kContiguous * kElementsPerAccess,
+    Detail::WarpThreadArrangement::kStrided
+  >;
+
+  /// Maps thread ID to a coordinate offset within the tensor's logical coordinate space
+  CUTLASS_HOST_DEVICE
+  static TensorCoord initial_offset(int thread_id) {
+
+    int warp_id = (thread_id / Detail::kWarpSize);
+    int lane_id = (thread_id % Detail::kWarpSize);
+
+    //
+    // compute warp-level offset
+    //
+
+    // This is the shape of the entire area covered by a warp's memory access (in units of vectors)
+    layout::PitchLinearCoord warp_footprint{
+      Detail::WarpThreadArrangement::kContiguous * Iterations::kContiguous,
+      Detail::WarpThreadArrangement::kStrided * Iterations::kStrided
+    };
+
+    // This is the position of a specific warp within the warp arrangement (in units of warp footprints)
+    layout::PitchLinearCoord warp_offset{
+      (warp_id % Detail::kWarpsContiguous),
+      (warp_id / Detail::kWarpsContiguous)
+    };
+
+    // This is the offset of a specific thread within a warp (units of vectors)
+    layout::PitchLinearCoord thread_offset_in_warp{
+      lane_id % Detail::WarpThreadArrangement::kContiguous,
+      lane_id / Detail::WarpThreadArrangement::kContiguous
+    };
+
+    // This is the offset of a thread within a threadblock tile (units of vectors)
+    layout::PitchLinearCoord thread_offset_in_threadblock_tile_vec =
+      warp_footprint * warp_offset + thread_offset_in_warp;
+
+    // This is the offset of a thread within a threadblock tile (units of elements): only the contiguous component is scaled
+    layout::PitchLinearCoord thread_offset_in_threadblock_tile_base{
+      thread_offset_in_threadblock_tile_vec.contiguous() * kElementsPerAccess,
+      thread_offset_in_threadblock_tile_vec.strided()
+    };
+
+    return thread_offset_in_threadblock_tile_base;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Policy defining a warp-raked arrangement in which a shape is partitioned into contiguous
+/// elements. Warps are distributed along the contiguous dimension first, then the strided one.
+///
+/// This ThreadMap is used by tensor core kernels for NCxHWx layout.
+template <
+  typename Shape_,                          /// Tile shape to partition
+  int Threads,                              /// Number of threads total
+  typename WarpThreadArrangement_,          /// Arrangement of threads within a warp
+  int ElementsPerAccess = 1                 /// Vector length of each access
+>
+struct PitchLinearStridedWarpRakedThreadMap {
+
+  /// Tensor coordinate
+  using TensorCoord = layout::PitchLinearCoord;
+
+  /// Tile shape
+  using Shape = Shape_;
+
+  /// Number of threads total
+  static int const kThreads = Threads;
+
+  using WarpThreadArrangement = WarpThreadArrangement_;
+
+  /// Number of elements per vectorized access
+  static int const kElementsPerAccess = ElementsPerAccess;
+
+  /// Base ThreadMap built from the same parameters; several members below are re-exported from it
+  using BaseThreadMap = PitchLinearWarpRakedThreadMap<
+    Shape,
+    kThreads,
+    WarpThreadArrangement,
+    kElementsPerAccess
+  >;
+
+  /// Shape of access by each thread
+  using ThreadAccessShape = typename BaseThreadMap::ThreadAccessShape;
+
+  /// Internal details: re-exports BaseThreadMap::Detail and recomputes the warp arrangement
+  struct Detail {
+
+    using WarpThreadArrangement = WarpThreadArrangement_;
+
+    using WarpAccessIterations = typename BaseThreadMap::Detail::WarpAccessIterations;
+
+    static int const kWarpSize = BaseThreadMap::Detail::kWarpSize;
+
+    static int const kWarpCount = BaseThreadMap::Detail::kWarpCount;
+
+    using ShapeInAccesses = typename BaseThreadMap::Detail::ShapeInAccesses;
+
+    // Distribute the warps over the warp-level accesses: fill the contiguous dimension first
+    // (at most WarpAccessIterations::kContiguous warps), then spread the rest along the stride.
+    static int const kWarpsContiguous =
+        (WarpAccessIterations::kContiguous >= kWarpCount
+             ? kWarpCount
+             : WarpAccessIterations::kContiguous);
+
+    static int const kWarpsStrided =
+        (kWarpCount > WarpAccessIterations::kContiguous
+             ? kWarpCount / kWarpsContiguous
+             : 1);
+
+    /// Arrangement of warps within a threadblock-scoped tile
+    using WarpArrangement = layout::PitchLinearShape<
+      kWarpsContiguous, kWarpsStrided
+    >;
+
+  };
+
+  /// Iterations per warp along each dimension (concept: PitchLinearShape)
+  using Iterations = layout::PitchLinearShape<
+    Detail::WarpAccessIterations::kContiguous / Detail::kWarpsContiguous,
+    Detail::WarpAccessIterations::kStrided / Detail::kWarpsStrided
+  >;
+
+  static_assert(Iterations::kCount,
+    "Number of iterations must be non-zero");
+
+  /// Delta between accesses (units of elements, concept: PitchLinearShape); taken from the base map
+  using Delta = typename BaseThreadMap::Delta;
+
+  /// Maps thread ID to a coordinate offset (units of elements) within the tensor's logical coordinate space
+  CUTLASS_HOST_DEVICE
+  static TensorCoord initial_offset(int thread_id) {
+
+    int warp_id = (thread_id / Detail::kWarpSize);
+    int lane_id = (thread_id % Detail::kWarpSize);
+
+    //
+    // compute warp-level offset
+    //
+
+    // This is the shape of the entire area covered by a warp's memory access (in units of vectors)
+    layout::PitchLinearCoord warp_footprint{
+      Detail::WarpThreadArrangement::kContiguous * Iterations::kContiguous,
+      Detail::WarpThreadArrangement::kStrided * Iterations::kStrided
+    };
+
+    // This is the index of a specific warp within the warp arrangement (scaled by warp_footprint below)
+    layout::PitchLinearCoord warp_offset{
+      (warp_id % Detail::kWarpsContiguous),
+      (warp_id / Detail::kWarpsContiguous)
+    };
+
+    // This is the offset of a specific thread within a warp (units of vectors)
+    layout::PitchLinearCoord thread_offset_in_warp{
+      lane_id % Detail::WarpThreadArrangement::kContiguous,
+      lane_id / Detail::WarpThreadArrangement::kContiguous
+    };
+
+    // This is the offset of a thread within a threadblock tile (units of vectors)
+    layout::PitchLinearCoord thread_offset_in_threadblock_tile_vec =
+      warp_footprint * warp_offset + thread_offset_in_warp;
+
+    // This is the offset of a thread within a threadblock tile (units of elements)
+    layout::PitchLinearCoord thread_offset_in_threadblock_tile_base{
+      thread_offset_in_threadblock_tile_vec.contiguous() * kElementsPerAccess,
+      thread_offset_in_threadblock_tile_vec.strided()
+    };
+
+    return thread_offset_in_threadblock_tile_base;
+  }
+
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Transposes an existing ThreadMap.  For example, an interleaved layout behaves
+/// like congruous in global memory and crosswise in shared memory, so the thread
+/// coordinates must be transposed between the two.
+
+template <typename ThreadMap_, typename WarpThreadArrangement_>
+struct TransposePitchLinearThreadMap {
+  /// Underlying ThreadMap
+  using ThreadMap = ThreadMap_;
+
+  /// Tensor coordinate
+  using TensorCoord = typename ThreadMap::TensorCoord;
+
+  /// Tile shape
+  using Shape = typename ThreadMap::Shape;
+
+  /// Number of threads total
+  static int const kThreads = ThreadMap::kThreads;
+
+  /// Number of elements per vectorized access, taken from the underlying map
+  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
+
+  /// Shape of access by each thread
+  using ThreadAccessShape = layout::PitchLinearShape<kElementsPerAccess, 1>;
+
+  /// Internal details made public to facilitate introspection
+  struct Detail {
+    /// Fixed arrangement of threads within a warp (units of threads).
+    using WarpThreadArrangement = WarpThreadArrangement_;
+
+    /// Number of threads per warp
+    static int const kWarpSize = WarpThreadArrangement::kCount;
+
+    /// Number of participating warps
+    static int const kWarpCount = kThreads / kWarpSize;
+
+    static_assert(!(Shape::kContiguous % kElementsPerAccess),
+                  "Shape must be divisible by vector length.");
+
+    /// Arrangement of warps within a threadblock-scoped tile (the underlying map's, transposed)
+    using WarpArrangement =
+        layout::PitchLinearShape<ThreadMap::Detail::kWarpsStrided,
+                                 ThreadMap::Detail::kWarpsContiguous>;
+  };
+
+  /// Iterations along each dimension: the underlying map's, transposed (concept: PitchLinearShape)
+  using Iterations =
+      layout::PitchLinearShape<ThreadMap::Iterations::kStrided,
+                               ThreadMap::Iterations::kContiguous>;
+
+  static_assert(Iterations::kContiguous == 1,
+    "Contiguous iteration has to be one to reuse the same shared store function with those that don't need transpose");
+
+  static_assert(Iterations::kCount, "Number of iterations must be non-zero");
+
+  /// Delta between accesses (units of elements, concept: PitchLinearShape)
+  using Delta =
+      layout::PitchLinearShape<Detail::WarpThreadArrangement::kContiguous *
+                                   kElementsPerAccess,
+                               Detail::WarpThreadArrangement::kStrided>;
+
+  /// Maps thread ID to a coordinate offset within the tensor's logical
+  /// coordinate space (units of elements). Note the warp offset below is computed
+  /// with / and % exchanged relative to PitchLinearWarpRakedThreadMap.
+  CUTLASS_HOST_DEVICE
+  static TensorCoord initial_offset(int thread_id) {
+
+    int warp_id = (thread_id / Detail::kWarpSize);
+    int lane_id = (thread_id % Detail::kWarpSize);
+
+    //
+    // compute warp-level offset
+    //
+
+    // This is the shape of the entire area covered by a warp's memory access
+    // (in units of vectors)
+    layout::PitchLinearCoord warp_footprint{
+        Detail::WarpThreadArrangement::kContiguous * Iterations::kContiguous,
+        Detail::WarpThreadArrangement::kStrided * Iterations::kStrided};
+
+    // This is the index of a specific warp within the transposed arrangement.
+    // Note the order of / and %. Also the 2nd operand is WarpArrangement::kStrided.
+    layout::PitchLinearCoord warp_offset{
+        (warp_id / Detail::WarpArrangement::kStrided),
+        (warp_id % Detail::WarpArrangement::kStrided)};
+
+    // This is the offset of a specific thread within a warp (units of vectors)
+    layout::PitchLinearCoord thread_offset_in_warp{
+        lane_id % Detail::WarpThreadArrangement::kContiguous,
+        lane_id / Detail::WarpThreadArrangement::kContiguous};
+
+    // This is the offset of a thread within a threadblock tile (units of
+    // vectors)
+    layout::PitchLinearCoord thread_offset_in_threadblock_tile_vec =
+        warp_footprint * warp_offset + thread_offset_in_warp;
+
+    // This is the offset of a thread within a threadblock tile (units of
+    // elements)
+    layout::PitchLinearCoord thread_offset_in_threadblock_tile_base{
+        thread_offset_in_threadblock_tile_vec.contiguous() * kElementsPerAccess,
+        thread_offset_in_threadblock_tile_vec.strided()};
+
+    return thread_offset_in_threadblock_tile_base;
+  }
+};
+
+/// Transposed view of a SIMT ThreadMap: iterations, deltas and initial offsets are those of the
+/// wrapped map with the contiguous and strided roles exchanged.
+template <typename ThreadMap_>
+struct TransposePitchLinearThreadMapSimt {
+  /// ThreadMap being transposed
+  using ThreadMap = ThreadMap_;
+
+  /// Coordinate type, inherited from the wrapped map
+  using TensorCoord = typename ThreadMap::TensorCoord;
+
+  /// Tile shape, inherited from the wrapped map
+  using Shape = typename ThreadMap::Shape;
+
+  /// Total thread count, inherited from the wrapped map
+  static int const kThreads = ThreadMap::kThreads;
+
+  /// Elements per access, inherited from the wrapped map (must be 1, see below)
+  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
+
+  static_assert(kElementsPerAccess == 1,
+                "Simt transpose requires elements per access to be 1");
+
+  /// Per-thread iterations: the wrapped map's, with contiguous and strided exchanged
+  using Iterations = layout::PitchLinearShape<ThreadMap::Iterations::kStrided,
+                                              ThreadMap::Iterations::kContiguous>;
+
+  static_assert(Iterations::kCount, "Number of iterations must be non-zero");
+
+  static_assert(Iterations::kStrided == 1,
+    "Strided iteration has to be one to reuse the same shared store function with those that don't need transpose");
+
+  /// Per-thread access shape, inherited from the wrapped map
+  using ThreadAccessShape = typename ThreadMap::ThreadAccessShape;
+
+  /// Distance between accesses: the wrapped map's Delta with its two extents exchanged
+  using Delta = layout::PitchLinearShape<ThreadMap::Delta::kStrided,
+                                         ThreadMap::Delta::kContiguous>;
+
+  /// Computes the wrapped map's initial offset for the given thread and returns it
+  /// with the contiguous and strided components exchanged.
+  CUTLASS_HOST_DEVICE
+  static TensorCoord initial_offset(int thread_id) {
+    // Offset in the coordinate space of the wrapped (non-transposed) map.
+    TensorCoord untransposed = ThreadMap::initial_offset(thread_id);
+
+    // Swap the two components to obtain the transposed offset.
+    TensorCoord transposed(untransposed.strided(), untransposed.contiguous());
+
+    return transposed;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+
+/// Policy defining a warp-striped arrangement.  This partitions a tile into vectorized memory
+/// accesses performed by each warp then distributes warps across them. Warps are striped in the
+/// strided dimension and raked across the contiguous dimension.
+template <
+  typename Shape_,                          /// Overall shape to partition in units of elements
+  int Threads,                              /// Number of participating threads
+  typename WarpThreadArrangement_,          /// Describes the shape of one memory access per warp
+  int ElementsPerAccess = 1                 /// Number of elements accessed by each thread per memory operation (i.e. vector size)
+>
+struct PitchLinearWarpStripedThreadMap {
+
+  /// Tensor coordinate
+  using TensorCoord = layout::PitchLinearCoord;
+
+  /// Tile shape
+  using Shape = Shape_;
+
+  /// Number of threads total
+  static int const kThreads = Threads;
+
+  /// Number of elements per vectorized access
+  static int const kElementsPerAccess = ElementsPerAccess;
+
+  /// Shape of access by each thread
+  using ThreadAccessShape = layout::PitchLinearShape<kElementsPerAccess, 1>;
+
+  /// Internal details made public to facilitate introspection
+  struct Detail {
+
+    /// Fixed arrangement of threads within a warp (units of threads).
+    using WarpThreadArrangement = WarpThreadArrangement_;
+
+    /// Number of threads per warp
+    static int const kWarpSize = WarpThreadArrangement::kCount;
+
+    /// Number of participating warps
+    static int const kWarpCount = kThreads / kWarpSize;
+
+    static_assert(
+      !(Shape::kContiguous % kElementsPerAccess),
+      "Shape must be divisible by vector length.");
+
+    /// Compute the 'shape' of the overall tile in units of vectors
+    using ShapeInAccesses = layout::PitchLinearShape<
+      Shape::kContiguous / kElementsPerAccess,
+      Shape::kStrided
+    >;
+
+    // compute number of warp-level accesses total
+    using WarpAccessIterations = layout::PitchLinearShape<
+      ShapeInAccesses::kContiguous / WarpThreadArrangement::kContiguous,
+      ShapeInAccesses::kStrided / WarpThreadArrangement::kStrided
+    >;
+
+    // Divide it into the number of warps, first partitioning the strided dimension then the
+    // contiguous.
+    static int const kWarpsStrided =
+      (WarpAccessIterations::kStrided >= kWarpCount
+        ? kWarpCount : (kWarpCount / WarpAccessIterations::kStrided));
+
+    static int const kWarpsContiguous =
+      (kWarpCount > WarpAccessIterations::kStrided ?
+        WarpAccessIterations::kContiguous / kWarpsStrided : 1);
+
+    /// Arrangement of warps within a threadblock-scoped tile
+    using WarpArrangement = layout::PitchLinearShape<
+      kWarpsContiguous, kWarpsStrided
+    >;
+  };
+
+  /// Iterations per warp along each dimension (concept: PitchLinearShape)
+  using Iterations = layout::PitchLinearShape<
+    Detail::WarpAccessIterations::kContiguous / Detail::kWarpsContiguous,
+    Detail::WarpAccessIterations::kStrided / Detail::kWarpsStrided
+  >;
+
+  static_assert(Iterations::kCount,
+    "Number of iterations must be non-zero");
+
+  /// Delta between accesses (units of elements, concept: PitchLinearShape); the strided delta spans all kWarpsStrided warps
+  using Delta = layout::PitchLinearShape<
+    Detail::WarpThreadArrangement::kContiguous * kElementsPerAccess,
+    Detail::WarpThreadArrangement::kStrided * Detail::WarpArrangement::kStrided
+  >;
+
+  /// Maps thread ID to a coordinate offset (units of elements) within the tensor's logical coordinate space
+  CUTLASS_HOST_DEVICE
+  static TensorCoord initial_offset(int thread_id) {
+
+    int warp_id = (thread_id / Detail::kWarpSize);
+    int lane_id = (thread_id % Detail::kWarpSize);
+
+    //
+    // compute warp-level offset
+    //
+    // (striped: the strided spacing between warps is one warp access, not a full per-warp footprint)
+    // This is the spacing between neighbouring warps along each dimension (in units of vectors)
+    layout::PitchLinearCoord warp_footprint{
+      Detail::WarpThreadArrangement::kContiguous * Iterations::kContiguous,
+      Detail::WarpThreadArrangement::kStrided
+    };
+
+    // This is the index of a specific warp within the warp arrangement (scaled by warp_footprint below)
+    layout::PitchLinearCoord warp_offset{
+      (warp_id % Detail::kWarpsContiguous),
+      (warp_id / Detail::kWarpsContiguous)
+    };
+
+    // This is the offset of a specific thread within a warp (units of vectors)
+    layout::PitchLinearCoord thread_offset_in_warp{
+      lane_id % Detail::WarpThreadArrangement::kContiguous,
+      lane_id / Detail::WarpThreadArrangement::kContiguous
+    };
+
+    // This is the offset of a thread within a threadblock tile (units of vectors)
+    layout::PitchLinearCoord thread_offset_in_threadblock_tile_vec =
+      warp_footprint * warp_offset + thread_offset_in_warp;
+
+    // This is the offset of a thread within a threadblock tile (units of elements)
+    layout::PitchLinearCoord thread_offset_in_threadblock_tile_base{
+      thread_offset_in_threadblock_tile_vec.contiguous() * kElementsPerAccess,
+      thread_offset_in_threadblock_tile_vec.strided()
+    };
+
+    return thread_offset_in_threadblock_tile_base;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Strip-mines a pitch-linear tile among a given number of threads, first along the contiguous
+/// dimension then along the strided dimension, while each thread accesses a 2D thread-tile.
+///
+/// The tile must be divisible by the thread count such that all threads may execute the same
+/// number of iterations with the same delta to exhaustively cover the tile.
+///
+/// This class satisfies the "RegularThreadMapping" concept.
+template <
+  typename Shape_,                  /// Tile shape to partition
+  int Threads,                      /// Number of threads total
+        typename ThreadTileShape    /// Shape of the 2D tile accessed by each thread
+>
+struct PitchLinear2DThreadTileStripminedThreadMap;
+
+
+template <
+  typename Shape_,                  /// Tile shape to partition
+  int Threads                       /// Number of threads total
+>
+struct PitchLinear2DThreadTileStripminedThreadMap <Shape_, Threads, cutlass::layout::PitchLinearShape<4, 4>>{
+
+  /// Tensor coordinate
+  using TensorCoord = layout::PitchLinearCoord;
+
+  /// Tile shape
+  using Shape = Shape_;
+
+  /// Access Shape of each thread
+  using ThreadAccessShape = cutlass::layout::PitchLinearShape<4, 4>;
+  // (fixed to the 4x4 thread tile that this partial specialization matches)
+
+  /// Number of threads total
+  static int const kThreads = Threads;
+
+  /// Extract length of each access from Layout
+  static int const kElementsPerAccess = ThreadAccessShape::kContiguous;
+
+  static_assert(!(kElementsPerAccess % 4) , "kElementsPerAccess, needs to be multiple of 4 (32bits)");
+
+  /// Internal implementation details
+  struct Detail {
+
+    static_assert(!(ThreadAccessShape::kContiguous % 4), "ThreadAccessShape, needs to be multiple of 4");
+
+    static_assert(!(Shape::kContiguous % ThreadAccessShape::kContiguous), "");
+
+    static_assert(!((Shape::kContiguous * Shape::kStrided) % (kThreads * ThreadAccessShape::kCount)),
+      "Shape must be divisible thread count * accesses per thread.");
+
+    /// Shape of the tile in units of thread tiles (ThreadAccessShape)
+    using ShapeVec = layout::PitchLinearShape<
+      Shape::kContiguous / ThreadAccessShape::kContiguous,
+      Shape::kStrided / ThreadAccessShape::kStrided
+    >;
+
+    static_assert(
+      (Threads < ShapeVec::kContiguous && !(ShapeVec::kContiguous % kThreads)) ||
+      (!(kThreads % ShapeVec::kContiguous) && !(ShapeVec::kStrided % (kThreads / ShapeVec::kContiguous))),
+      "Shape must be divisible by number of iterations of each thread."
+    );
+  };
+
+  /// Number of iterations by each thread; one contiguous iteration when Threads >= ShapeVec::kContiguous
+  using Iterations = typename platform::conditional<
+      Threads >= Detail::ShapeVec::kContiguous,
+      layout::PitchLinearShape<
+          1,
+          // Redo the comparison here to work around divide by zero compiler
+          // error.  The compiler evaluates both path of platform::conditional.
+          (Threads >= Detail::ShapeVec::kContiguous
+               ? Detail::ShapeVec::kStrided /
+                     (kThreads / Detail::ShapeVec::kContiguous)
+               : 0)>,
+      layout::PitchLinearShape<Detail::ShapeVec::kContiguous / kThreads,
+                               Detail::ShapeVec::kStrided>>::type;
+
+  /// Interval between accesses along each dimension of the tensor's logical coordinate space
+  /// (in units of Elements)
+  using Delta = typename platform::conditional<
+    Threads >= Detail::ShapeVec::kContiguous,
+    layout::PitchLinearShape<
+      Shape::kContiguous,
+      kThreads * ThreadAccessShape::kStrided / Detail::ShapeVec::kContiguous
+    >,
+    layout::PitchLinearShape<
+      kThreads * ThreadAccessShape::kContiguous,
+      1
+    >
+  >::type;
+
+  /// Maps thread ID to a coordinate offset within the tensor's logical coordinate space
+  /// (in units of Elements)
+  CUTLASS_HOST_DEVICE
+  static TensorCoord initial_offset(int thread_id) {
+    // Split thread_id into a (contiguous, strided) thread-tile index, then scale by the tile extents.
+    return TensorCoord(
+      (thread_id % Detail::ShapeVec::kContiguous) * ThreadAccessShape::kContiguous,
+      (thread_id / Detail::ShapeVec::kContiguous) * ThreadAccessShape::kStrided);
+  }
+};
+
+/// Thread map that presents a 2D thread-tiled map (a Pitchlinear2DThreadTile mapping) transposed
+template <typename ThreadMap_>
+struct TransposePitchLinearThreadMap2DThreadTile {
+    /// Underlying ThreadMap
+    using ThreadMap = ThreadMap_;
+
+    /// Tensor coordinate
+    using TensorCoord = typename ThreadMap::TensorCoord;
+
+    /// Tile shape
+    using Shape = typename ThreadMap::Shape;
+
+    /// Number of threads total
+    static int const kThreads = ThreadMap::kThreads;
+
+    /// Number of elements per access, taken from the underlying map
+    static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
+
+    // The underlying map must access more than one element at a time (contrast with the SIMT transpose).
+    static_assert(kElementsPerAccess > 1 , "2D thread-tile transpose requires elements per access to be greater than 1");
+    /// Iterations along each dimension: the underlying map's, transposed (concept: PitchLinearShape)
+    using Iterations =
+        layout::PitchLinearShape<ThreadMap::Iterations::kStrided,
+        ThreadMap::Iterations::kContiguous>;
+
+    static_assert(Iterations::kCount, "Number of iterations must be non-zero");
+
+    /// Shape of access by each thread
+    using ThreadAccessShape = typename ThreadMap::ThreadAccessShape;
+
+    /// Delta between accesses: the underlying map's, transposed (units of elements, concept: PitchLinearShape)
+    using Delta =
+        layout::PitchLinearShape<ThreadMap::Delta::kStrided,
+        ThreadMap::Delta::kContiguous>;
+
+
+    /// Maps thread ID to a coordinate offset within the tensor's logical
+    /// coordinate space: the underlying map's initial offset with its
+    /// contiguous and strided components exchanged.
+    CUTLASS_HOST_DEVICE
+        static TensorCoord initial_offset(int thread_id) {
+
+        TensorCoord coord = ThreadMap::initial_offset(thread_id);
+        return TensorCoord(
+            coord.strided(),
+            coord.contiguous()
+        );
+    }
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace transform
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/thread/transpose.h b/lightllm-kernel/cutlass/include/cutlass/transform/thread/transpose.h
new file mode 100755
index 000000000..4d0b39073
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/transform/thread/transpose.h
@@ -0,0 +1,107 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Thread-level transpose of fragments (4x4 int8_t tiles)
+*/
+
+#pragma once
+
+namespace cutlass {
+namespace transform {
+namespace thread {
+
+/// Transforms a fragment by doing a transpose (primary template; only specializations are defined)
+template <
+  int ElementCount,         ///< Number of elements in the fragment
+  typename TransposeShape,  ///< Shape of each tile that gets transposed
+  typename Element          ///< Element type stored in the fragment
+> struct Transpose;
+
+/// Specialization for int8_t 4x4 transpose
+template <int ElementCount_>
+struct Transpose<ElementCount_, layout::PitchLinearShape<4,4> , int8_t> {
+
+    static const int kElementCount = ElementCount_;
+    using TransposeShape = layout::PitchLinearShape<4,4>;
+    using Element = int8_t;
+    using Fragment = cutlass::Array<Element, kElementCount>;
+
+    static_assert(!(kElementCount % TransposeShape::kCount), "Shape needs to be multiple of 16 elements to do a 4x4 transpose");
+
+    /// Writes to dst the contents of src with every group of 16 elements transposed as a 4x4 tile.
+    CUTLASS_DEVICE 
+    void transform(Fragment& dst, Fragment& src) {
+
+    // Expose src/dst as int arrays: each int packs four int8_t elements.
+    int* src_int = reinterpret_cast<int*>(&src);
+    int* dst_int = reinterpret_cast<int*>(&dst);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kElementCount / TransposeShape::kCount; i++){
+  
+      int const i0 = 4 * i + 0;
+      int const i1 = 4 * i + 1;
+      int const i2 = 4 * i + 2;
+      int const i3 = 4 * i + 3;
+
+      int a0 = src_int[i0];
+      int a1 = src_int[i1];
+      int a2 = src_int[i2];
+      int a3 = src_int[i3];
+      // __byte_perm picks bytes out of its two operands; below, bN gathers byte N of a0..a3.
+      int b0, b1, b2, b3, c0;
+      b0 = __byte_perm(a0, a1, 0x0040);  // low two bytes: {a0[0], a1[0]}
+      c0 = __byte_perm(a2, a3, 0x0040);  // low two bytes: {a2[0], a3[0]}
+      b0 = __byte_perm(b0, c0, 0x5410);  // {a0[0], a1[0], a2[0], a3[0]}
+
+      b1 = __byte_perm(a0, a1, 0x0051);  // low two bytes: {a0[1], a1[1]}
+      c0 = __byte_perm(a2, a3, 0x0051);  // low two bytes: {a2[1], a3[1]}
+      b1 = __byte_perm(b1, c0, 0x5410);  // {a0[1], a1[1], a2[1], a3[1]}
+
+      b2 = __byte_perm(a0, a1, 0x0062);  // low two bytes: {a0[2], a1[2]}
+      c0 = __byte_perm(a2, a3, 0x0062);  // low two bytes: {a2[2], a3[2]}
+      b2 = __byte_perm(b2, c0, 0x5410);  // {a0[2], a1[2], a2[2], a3[2]}
+
+      b3 = __byte_perm(a0, a1, 0x0073);  // low two bytes: {a0[3], a1[3]}
+      c0 = __byte_perm(a2, a3, 0x0073);  // low two bytes: {a2[3], a3[3]}
+      b3 = __byte_perm(b3, c0, 0x5410);  // {a0[3], a1[3], a2[3], a3[3]}
+
+      dst_int[i0] = b0;
+      dst_int[i1] = b1;
+      dst_int[i2] = b2;
+      dst_int[i3] = b3;
+    }
+  }
+};
+
+}  // namespace thread
+}  // namespace transform
+}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/thread/unary_op.h b/lightllm-kernel/cutlass/include/cutlass/transform/thread/unary_op.h
new file mode 100755
index 000000000..ce7cbbe8f
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/transform/thread/unary_op.h
@@ -0,0 +1,105 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/complex.h"
+
+namespace cutlass {
+namespace transform {
+namespace thread {
+
+namespace UnaryTransform {  // Tag types selecting the operation applied by UnaryOp
+    struct Identity;    ///< None (i.e., identity)
+    struct Conjugate;   ///< Complex conjugate
+}
+
+/// Element-wise unary operator that transforms one element of a fragment at a time
+template<
+    typename FragmentIn, ///< Input Fragment
+    typename FragmentOut,///< Output Fragment
+    typename Transform>  ///< Unary transform operator (UnaryTransform::Identity or ::Conjugate)
+class UnaryOp {
+    public:
+        /// Converts each element of `in` to FragmentOut::Element and, for
+        /// UnaryTransform::Conjugate, applies conj() to the converted value.
+        /// Returns the resulting fragment; `in` itself is not modified.
+        CUTLASS_DEVICE
+        static FragmentOut execute(FragmentIn &in) {
+            static_assert(FragmentIn::kElements == FragmentOut::kElements, "Number of elements must match.");
+            static_assert(platform::is_same<Transform, UnaryTransform::Identity>::value ||
+                          platform::is_same<Transform, UnaryTransform::Conjugate>::value,
+                          "Unary Operator not supported.");
+
+            FragmentOut out;
+            if (platform::is_same<Transform, UnaryTransform::Identity>::value ) {
+                CUTLASS_PRAGMA_UNROLL
+                for (int i=0; i < FragmentIn::kElements; ++i){
+                   out[i] = static_cast<typename FragmentOut::Element>(in[i]);
+                }
+            } else if (platform::is_same<Transform, UnaryTransform::Conjugate>::value ) {
+                // Unroll as in the identity branch; FragmentIn::kElements is a compile-time constant
+                CUTLASS_PRAGMA_UNROLL
+                for (int i=0; i < FragmentIn::kElements; ++i){
+                   out[i] = conj(static_cast<typename FragmentOut::Element>(in[i]));
+                }
+            }
+            return out;
+        }
+};
+
+/// Specialization for identical input and output fragment types (no element conversion)
+template<typename FragmentIn, typename Transform>
+class UnaryOp<FragmentIn, FragmentIn, Transform> {
+    public:
+        /// Identity returns a copy of `in` unchanged. Conjugate conjugates `in` element by
+        /// element IN PLACE (the caller's fragment is modified) and returns a copy of it.
+        CUTLASS_DEVICE
+        static FragmentIn execute(FragmentIn &in) {
+            static_assert(platform::is_same<Transform, UnaryTransform::Identity>::value ||
+                          platform::is_same<Transform, UnaryTransform::Conjugate>::value,
+                          "Unary Operator not supported.");
+
+            if (platform::is_same<Transform, UnaryTransform::Identity>::value ) {
+                return in;
+            } else if (platform::is_same<Transform, UnaryTransform::Conjugate>::value ) {
+                // FragmentIn::kElements is a compile-time constant, so request full unrolling
+                CUTLASS_PRAGMA_UNROLL
+                for(int i=0; i < FragmentIn::kElements; ++i){
+                   in[i] = conj(in[i]);
+                }
+            }
+            return in;
+        }
+};
+    }  // namespace thread
+  }  // namespace transform
+}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/ell_iterator.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/ell_iterator.h
new file mode 100755
index 000000000..026e4ced4
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/ell_iterator.h
@@ -0,0 +1,199 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Ell iterator for matrix of indices (ellColInd matrix) 
+*/
+
+#pragma once
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+
+namespace ell{
+
+constexpr unsigned int SmemPow = 8;  // log2 of the number of indices held per stage
+constexpr unsigned int SmemStages = 2;  // number of stages in the index buffer
+constexpr unsigned int SmemSize = 1 << SmemPow;  // indices per stage (256)
+constexpr unsigned int SmemMask = (SmemSize*SmemStages-1);  // wraps a running index into the whole buffer
+// Backing array for the staged column indices: SmemStages stages of SmemSize ints each.
+class SharedStorage{
+  public:
+    Array<int, SmemSize*SmemStages> array;
+};
+
+class Iterator{
+  public:
+  using Layout = layout::PitchLinear;
+  using LongIndex = typename Layout::LongIndex;
+
+  private:
+    const int *gmem_col_idx_;
+    int *smem_col_idx_;
+    const int  block_size_;
+    const int  base_idx_;
+    const int  k_shape_;
+    const int  ell_increment_;
+    const int  array_length_;
+    int  col_idx_base_;
+    int  residue_;
+    int  counter_;
+
+    int  pow2_;
+    int  residue_shape_;
+
+    int  smem_offset_;
+    int  smem_stage_;
+    int  gmem_offset_;
+
+    int  lane_;
+
+    bool is_pow2_;
+    bool is_residue_tile_;
+
+  public:
+    CUTLASS_DEVICE
+    void load_ell_indices(){  // copy the next SmemSize rebased column indices into the current stage
+      for(int slot=threadIdx.x; slot<SmemSize; slot+=blockDim.x){
+        int src = gmem_offset_ + slot;
+        if(src >= array_length_) src = array_length_ - 1;  // past the end: re-read the last entry
+        int rebased = gmem_col_idx_[src] - base_idx_;
+        smem_col_idx_[slot + smem_stage_ * SmemSize] = (rebased < 0) ? -1 : rebased;  // negatives become -1
+      }
+      gmem_offset_ += SmemSize;  // the next call continues with the following SmemSize entries
+      smem_stage_ ^= 1;          // ... and writes them into the other stage
+    }
+
+    CUTLASS_DEVICE
+    Iterator(  // stages the first SmemSize indices and computes the initial col_idx_base_
+        SharedStorage& shared_storage_base,
+        const int* col_idx,
+        const int& block_size,
+        const int& base_idx,
+        const int  k_shape,
+        const int& problem_size_k,
+        const int& ell_stride,
+        const int& thread_idx)
+        : gmem_col_idx_(col_idx),
+          smem_col_idx_(reinterpret_cast<int*>(&shared_storage_base.array)),
+          block_size_(block_size),
+          base_idx_(base_idx),
+          k_shape_(k_shape),
+          ell_increment_(ell_stride * block_size),
+          array_length_((problem_size_k + block_size_ - 1) / block_size_),
+          residue_(0),
+          counter_(0),
+          residue_shape_(problem_size_k % k_shape_),
+          smem_offset_(0),
+          smem_stage_(0),
+          gmem_offset_(0),
+          lane_(thread_idx % 32),
+          is_residue_tile_(residue_shape_ != 0) {
+      // The initializers above follow member declaration order, which is the order C++ runs them in.
+      load_ell_indices();
+      __syncthreads();  // barrier: every thread's stores from load_ell_indices() finish before the read below
+      // col_idx_base_, pow2_ and is_pow2_ are not in the initializer list; they are assigned here.
+      is_pow2_ = ((block_size_ & (block_size_ - 1)) == 0);
+      if( is_pow2_ && k_shape <= block_size_ ) lane_ = 0;
+      // Staged index selected by this thread's lane_, scaled by ell_increment_.
+      col_idx_base_ = smem_col_idx_[(smem_offset_ + lane_) & SmemMask] * ell_increment_;
+      // pow2_ = floor(log2(block_size_)); used as a shift in place of a division when is_pow2_ holds.
+      pow2_ = 0;
+      while(block_size_ >> (pow2_ + 1)) ++pow2_;
+    }
+
+    CUTLASS_DEVICE
+    int get_blocksize(){  // returns the block_size the iterator was constructed with
+      return block_size_;
+    }
+
+    CUTLASS_DEVICE
+    Iterator &operator++(){  // advance by one k tile (by residue_shape_ on the first call when it is non-zero)
+      if(is_residue_tile_){
+        residue_ += residue_shape_;
+        is_residue_tile_ = false;
+      } else {
+        residue_ += k_shape_;
+      }
+      // residue_ is the position inside the current block; while it stays below block_size_ nothing else moves.
+      if(residue_ < block_size_){
+        return *this;
+      }
+      // NOTE(review): unlike the constructor, no __syncthreads() follows this refill - confirm callers synchronize.
+      if((array_length_ > SmemSize) && (((smem_offset_ >> SmemPow) & 1) != smem_stage_)) 
+        load_ell_indices();
+      // Landed exactly on the block boundary: step to the next staged index.
+      if(residue_ == block_size_){
+        ++smem_offset_;
+        counter_ += ell_increment_;
+        residue_ = 0;
+        col_idx_base_ = smem_col_idx_[(smem_offset_ + lane_) & SmemMask] * ell_increment_ - counter_;
+        return *this;
+      }
+      // Went past the boundary: skip residue_ / block_size_ staged indices and keep the remainder in residue_.
+      if(is_pow2_){
+        smem_offset_ += residue_ >> pow2_; 
+        counter_ += (residue_ >> pow2_) * ell_increment_;
+        residue_ = residue_ & ((1 << pow2_) - 1);
+      }
+      else {
+        smem_offset_ += residue_ / block_size_; 
+        counter_ += (residue_ / block_size_) * ell_increment_;
+        residue_ %= block_size_;
+      }
+      // counter_ grows by ell_increment_ per index skipped; presumably it cancels the distance already advanced - TODO confirm.
+      col_idx_base_ = smem_col_idx_[(smem_offset_ + lane_) & SmemMask] * ell_increment_ - counter_;
+
+      return *this;
+    }
+    
+    CUTLASS_DEVICE
+    LongIndex get_offset(const int& idx) {
+      // Number of whole blocks between the current position and element `idx`.
+      int const pos = idx + residue_;
+      int const blocks_ahead = is_pow2_ ? (pos >> pow2_) : (pos / block_size_);
+
+      // Read col_idx_base_ from the lane whose id equals blocks_ahead (full-warp shuffle),
+      // then subtract the ell_increment_ steps that cover those blocks.
+      int const lane_base = __shfl_sync(0xffffffff, col_idx_base_, blocks_ahead);
+      return lane_base - blocks_ahead * ell_increment_;
+    }
+    
+    CUTLASS_DEVICE
+    LongIndex get_offset_fast() {  // returns this thread's own col_idx_base_, with no warp shuffle
+      return col_idx_base_;
+    }
+};
+
+}
+}
+}
+}
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/ell_predicated_tile_access_iterator.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/ell_predicated_tile_access_iterator.h
new file mode 100755
index 000000000..2e9e3716a
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/ell_predicated_tile_access_iterator.h
@@ -0,0 +1,1350 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Ell iterator for Blocked-Ell matrix (ellValue matrix) used with EllMmaMultistage
+*/
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// EllPredicatedTileAccessIterator
+///
+template <typename Shape, typename Element, typename Layout, int AdvanceRank,
+          typename ThreadMap, typename AccessType>
+class EllPredicatedTileAccessIterator;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of EllPredicatedTileAccessIterator for pitch-linear data.
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_>
+class EllPredicatedTileAccessIterator<Shape_, Element_, layout::PitchLinear,
+                                   AdvanceRank, ThreadMap_, AccessType_> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
+    "Vectors implied by the thread map must be divisible by the access type.");
+
+  static int const kPredicatesPerByte = 4;
+  static int const kPredicatesPerWord = 4 * kPredicatesPerByte;
+
+  static int const kPredicateCount = ThreadMap::Iterations::kCount * kAccessesPerVector;
+
+  /// Number of bytes, and of 32b words, needed to hold the predicates
+  static int const kPredicateByteCount =
+    (kPredicateCount + kPredicatesPerByte - 1) / kPredicatesPerByte;
+  static int const kPredicateWordCount = (kPredicateByteCount + 3) / 4;
+
+  static unsigned const kPredicateMask = (1u << kPredicatesPerByte) - 1u;
+
+  static_assert(kPredicateWordCount <= 4, "Too many predicates.");
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = Array<uint32_t, kPredicateWordCount>;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   public:
+    friend EllPredicatedTileAccessIterator;
+
+   private:
+    /// stride of pitch-linear layout (units of Element)
+    LongIndex stride_;
+    /// amount (in bytes) to increment pointer to move to next access along
+    /// strided dimension
+    LongIndex inc_strided_;
+    /// amount (in bytes) to increment pointer from last access to first access
+    /// of next tile
+    LongIndex inc_next_;
+    /// amount (in bytes) to increment pointer from first access of current tile
+    /// to first access of next tile
+    LongIndex inc_advance_;
+
+   public:
+
+    /// Default ctor: stride and all increments are zero
+    CUTLASS_HOST_DEVICE
+    Params(): stride_(0), inc_strided_(0), inc_next_(0), inc_advance_(0) { }
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout) : stride_(layout.stride(0)) {
+      inc_strided_ = (LongIndex(stride_) * ThreadMap::Delta::kStrided) *
+                     sizeof_bits<Element>::value / 8;
+      // inc_advance_: bytes spanned by one whole tile along the advance dimension
+      if (kAdvanceRank) {
+        // advance along strided dimension
+        inc_advance_ =
+            Shape::kStrided * LongIndex(stride_) * sizeof_bits<Element>::value / 8;
+      } else {
+        // advance along contiguous dimension
+        inc_advance_ = Shape::kContiguous * sizeof_bits<Element>::value / 8;
+      }
+      // inc_next_: inc_advance_ minus the (Iterations::kStrided - 1) strided steps of Delta::kStrided rows
+      inc_next_ = inc_advance_ - LongIndex(ThreadMap::Iterations::kStrided - 1) *
+                                     ThreadMap::Delta::kStrided * LongIndex(stride_) *
+                                     sizeof_bits<Element>::value / 8;
+    };
+  };
+
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char *;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Parameters object with precomputed internal state
+  Params const &params_;
+
+  /// Internal pointer to first access of tile
+  BytePointer pointer_;
+
+  /// Guard predicates
+  uint32_t predicates_[kPredicateWordCount];
+
+  /// Size of tensor
+  TensorCoord extent_;
+
+  /// Initial offset for each thread
+  TensorCoord thread_offset_;
+
+  /// Offset to the first steady-state tile
+  TensorCoord residue_offset_;
+
+  /// Initial offset to define ELL block
+  TensorCoord ell_offset_;
+
+  /// Used for out-of-order visitation
+  bool is_residue_tile_;
+
+  /// Iteration along vectors implied by the thread map
+  int iteration_vector_;
+
+  /// Iteration in the contiguous dimension
+  int iteration_contiguous_;
+
+  /// Iteration in the strided dimension
+  int iteration_strided_;
+
+ public:
+  /// Computes predicates based on internally tracked per-thread offset.
+  CUTLASS_DEVICE
+  void compute_predicates_(
+      /// Extent of the matrix window
+      TensorCoord extent,
+      /// optionally, simplify predicate calculation during 'steady state' phase
+      bool is_steady_state = false) {
+    // Start from an all-false mask; guards are OR-ed in one access at a time below.
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      predicates_[i] = 0u;
+    }
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int access_idx = 0; access_idx < ThreadMap::Iterations::kCount * kAccessesPerVector; ++access_idx) {
+      // Split the flat access index into strided (s), contiguous (c) and vector (v) components.
+      int s = access_idx / (ThreadMap::Iterations::kContiguous * kAccessesPerVector);
+      
+      int access_residual = access_idx % (ThreadMap::Iterations::kContiguous * kAccessesPerVector);
+
+      int c = access_residual / kAccessesPerVector;
+      int v = access_residual % kAccessesPerVector;
+
+      TensorCoord iteration_coord(c * ThreadMap::Delta::kContiguous + v * AccessType::kElements,
+                                s * ThreadMap::Delta::kStrided);
+      // Logical coordinate touched by this access.
+      TensorCoord coord = thread_offset_ + iteration_coord;
+
+      bool guard;
+      // Steady state bounds-checks only the non-advance dimension; otherwise both dimensions are checked.
+      if (is_steady_state) {
+        if (kAdvanceRank == 0) {
+          guard = (coord.strided() < extent.strided());
+        } else {
+          guard = (coord.contiguous() < extent.contiguous());
+        }
+      } else {
+        guard = (coord.strided() < extent.strided() &&
+                 coord.contiguous() < extent.contiguous());
+      }
+
+      int pred_idx = v + kAccessesPerVector * (c + ThreadMap::Iterations::kContiguous * s);
+      // Each byte of a predicate word holds kPredicatesPerByte predicates in its low bits.
+      int word_idx = pred_idx / kPredicatesPerWord;
+      int residual = pred_idx % kPredicatesPerWord;
+      int byte_idx = residual / kPredicatesPerByte;
+      int bit_idx = residual % kPredicatesPerByte;
+      
+      predicates_[word_idx] |= (unsigned(guard) << (byte_idx * 8 + bit_idx));
+
+    }
+
+  }
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : params_(params),  // stored by reference: `params` must outlive this iterator
+        pointer_(reinterpret_cast<BytePointer>(
+            const_cast<NonConstPointer>(pointer))),
+        extent_(extent),
+        is_residue_tile_(true) {
+    // The first tile is the residue tile: the extent remainder along the advance dimension, or a full tile when that is zero.
+    TensorCoord residue_extent;
+    if (kAdvanceRank) {
+
+      typename TensorCoord::Index residue_size = (extent_[kAdvanceRank] - threadblock_offset.strided()) % Shape::kStrided;
+      if (!residue_size) {
+        residue_size = Shape::kStrided;
+      }
+
+      residue_offset_ = make_Coord(0, residue_size);
+      residue_extent = make_Coord(
+        extent_.contiguous(), 
+        min(threadblock_offset.strided() + residue_size, extent_.strided())
+      );
+    } else {
+
+      typename TensorCoord::Index residue_size = (extent_[kAdvanceRank] - threadblock_offset.contiguous()) % Shape::kContiguous;
+      if (!residue_size) {
+        residue_size = Shape::kContiguous;
+      }
+
+      residue_offset_ = make_Coord(residue_size, 0);
+      
+      residue_extent = make_Coord(
+        min(extent_.contiguous(), threadblock_offset.contiguous() + residue_size),
+        extent_.strided()
+      );
+    }
+
+    // Per-thread offset in logical coordinates of tensor
+    ell_offset_ = ThreadMap::initial_offset(thread_id);  // tile-local only: no threadblock offset added
+    thread_offset_ = threadblock_offset + ThreadMap::initial_offset(thread_id);
+
+    // update internal pointers
+    Layout layout(params_.stride_);
+    add_pointer_offset(layout(thread_offset_));
+    // NOTE(review): compute_predicates_ is CUTLASS_DEVICE while this constructor is CUTLASS_HOST_DEVICE - confirm host use is not expected.
+    compute_predicates_(residue_extent, false);
+
+    set_iteration_index(0);
+  }
+
+  /// Construct a EllPredicatedTileAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id)
+      : EllPredicatedTileAccessIterator(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}  // delegates with a (0, 0) threadblock offset
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    // Split the flat index: vector is the fastest-varying part, then contiguous, then strided.
+    int const access = index / kAccessesPerVector;
+    int const contiguous_count = ThreadMap::Iterations::kContiguous;
+
+    iteration_vector_ = index % kAccessesPerVector;
+    iteration_contiguous_ = access % contiguous_count;
+    iteration_strided_ = access / contiguous_count;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += sizeof_bits<Element>::value * pointer_offset / 8;  // element count -> bytes; pointer_ is a byte pointer
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_DEVICE
+  void add_tile_offset(
+      TensorCoord const &tile_offset) {
+    if (is_residue_tile_) {
+      // First advance: step over the residue tile, then apply (tile_offset - 1) further whole tiles below.
+      thread_offset_ += residue_offset_;
+
+      Layout layout(params_.stride_);
+      add_pointer_offset(layout(residue_offset_));
+      // From here on only the non-advance dimension is bounds-checked (steady state).
+      compute_predicates_(extent_, true);
+      // NOTE(review): the non-advance terms below add an unscaled element count (no sizeof_bits, no stride) to a byte pointer - confirm; no effect while that component is 0.
+      if (kAdvanceRank) {
+        pointer_ += params_.inc_advance_ * LongIndex(tile_offset.strided() - 1);
+        pointer_ += Shape::kContiguous * tile_offset.contiguous();
+      } else {
+        pointer_ += params_.inc_advance_ * LongIndex(tile_offset.contiguous() - 1);
+        pointer_ += Shape::kStrided * tile_offset.strided();
+      }
+    } else {
+      if (kAdvanceRank) {
+        pointer_ += params_.inc_advance_ * LongIndex(tile_offset.strided());
+        pointer_ += Shape::kContiguous * tile_offset.contiguous();
+      } else {
+        pointer_ += params_.inc_advance_ * LongIndex(tile_offset.contiguous());
+        pointer_ += Shape::kStrided * tile_offset.strided();
+      }
+    }
+    is_residue_tile_ = false;
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(
+        pointer_ +  // byte pointer; strided progress is already folded into it by operator++
+        iteration_contiguous_ * (ThreadMap::Delta::kContiguous * sizeof_bits<Element>::value) / 8) + iteration_vector_;  // then iteration_vector_ whole AccessType steps
+  }
+  
+  /// Returns a k_location
+  CUTLASS_HOST_DEVICE
+  int get_k() const {
+    // Tile-local coordinate of the current access along the advance dimension.
+    if (kAdvanceRank == 0) {  // advancing along contiguous
+      return ell_offset_.contiguous() + iteration_contiguous_ * ThreadMap::Delta::kContiguous + iteration_vector_ * AccessType::kElements;
+    }
+    return ell_offset_.strided() + iteration_strided_ * ThreadMap::Delta::kStrided;  // advancing along strided
+  }
+  
+  CUTLASS_HOST_DEVICE
+  int get_stride() const {
+    // Distance, in elements, between neighbouring positions along the advance dimension:
+    // the layout stride when advancing along strided, otherwise 1.
+    int const advance_stride = kAdvanceRank ? int(params_.stride_) : 1;
+    return advance_stride;
+  }
+  
+  /// Increment and return an instance to self.
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator &operator++() {
+    // Iteration order: vector index fastest, then contiguous, then strided.
+    ++iteration_vector_;
+    if (iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+
+    iteration_vector_ = 0;
+    ++iteration_contiguous_;
+
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+
+    // Enter here only if (iteration_contiguous_ ==
+    // ThreadMap::Iterations::kContiguous)
+    iteration_contiguous_ = 0;
+    ++iteration_strided_;
+
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      pointer_ += params_.inc_strided_;
+      return *this;
+    }
+
+    // Enter here only if (iteration_strided_ == ThreadMap::Iterations::kStrided)
+    // which means we enter the next tile.
+    iteration_strided_ = 0;
+
+    // advance to next tile
+    pointer_ += params_.inc_next_;
+
+    // now return to start tile - if the iterator is subsequently advanced, this
+    // subtraction as well as the subsequent integer addition are both elided by
+    // the compiler.
+    pointer_ -= params_.inc_advance_;
+
+    return *this;
+  }
+
+  /// Post-increment: advances the iterator and returns a copy of its previous state.
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator operator++(int) {
+    EllPredicatedTileAccessIterator self(*this);  // snapshot taken before advancing
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      predicates_[i] = enable ? 0u : predicates_[i];
+    }
+    // With enable == false each word is rewritten with its own value, so the mask is unchanged.
+  }
+
+  /// Enables every predicate (sets all predicate words to all ones)
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      predicates_[i] = 0xffffffff;
+    }
+    // Every bit of every word is now set, including bit positions that hold no predicate.
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { 
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      predicates_[i] = mask[i];
+    }
+    // Word-for-word copy: the previous predicates are discarded, not merged.
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) {  // copies the stored predicate words into `mask`
+     CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      mask[i] = predicates_[i];
+    }
+  }
+  
+  /// add mask for small tiles in ELL
+  CUTLASS_DEVICE
+  void ell_add_mask(int blocksize) {
+    // Build a second mask from tile-local coordinates, then AND it into the stored predicates.
+    Mask mask;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      mask[i] = 0u;
+    }
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int access_idx = 0; access_idx < ThreadMap::Iterations::kCount * kAccessesPerVector; ++access_idx) {
+      // Split the flat access index into strided (s), contiguous (c) and vector (v) components.
+      int s = access_idx / (ThreadMap::Iterations::kContiguous * kAccessesPerVector);
+      
+      int access_residual = access_idx % (ThreadMap::Iterations::kContiguous * kAccessesPerVector);
+
+      int c = access_residual / kAccessesPerVector;
+      int v = access_residual % kAccessesPerVector;
+
+      TensorCoord iteration_coord(c * ThreadMap::Delta::kContiguous + v * AccessType::kElements,
+                                s * ThreadMap::Delta::kStrided);
+      // Coordinate inside the tile: ell_offset_ carries no threadblock offset.
+      TensorCoord coord = ell_offset_ + iteration_coord;
+
+      bool guard;
+      // Only the non-advance dimension is compared against blocksize.
+      if (kAdvanceRank == 0) {
+        guard = (coord.strided() < blocksize);
+      } else {
+        guard = (coord.contiguous() < blocksize);
+      }
+
+      int pred_idx = v + kAccessesPerVector * (c + ThreadMap::Iterations::kContiguous * s);
+
+      int word_idx = pred_idx / kPredicatesPerWord;
+      int residual = pred_idx % kPredicatesPerWord;
+      int byte_idx = residual / kPredicatesPerByte;
+      int bit_idx = residual % kPredicatesPerByte;
+      
+      mask[word_idx] |= (unsigned(guard) << (byte_idx * 8 + bit_idx));
+
+    }
+    // An access stays enabled only if it passes both the existing predicates and the blocksize guard.
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      mask[i] &= predicates_[i];
+    }
+    set_mask(mask);
+  }
+
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    // Flat predicate index of the current access (vector fastest, then contiguous, then strided).
+    int const access_in_tile = iteration_contiguous_ + iteration_strided_ * ThreadMap::Iterations::kContiguous;
+    int const pred_idx = iteration_vector_ + kAccessesPerVector * access_in_tile;
+
+    // Locate the predicate: word, byte within the word, and bit within that byte.
+    int const word = pred_idx / kPredicatesPerWord;
+    int const within_word = pred_idx % kPredicatesPerWord;
+    int const shift = (within_word / kPredicatesPerByte) * 8 + (within_word % kPredicatesPerByte);
+
+    // Test the selected bit of the stored predicate word.
+    uint32_t const probe = 1u << shift;
+    return (predicates_[word] & probe) != 0;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of EllPredicatedTileAccessIterator for column-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_>
+class EllPredicatedTileAccessIterator<Shape_, Element_, layout::ColumnMajor,
+                                   AdvanceRank, ThreadMap_, AccessType_> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  using UnderlyingIterator = EllPredicatedTileAccessIterator<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
+      layout::PitchLinear, (kAdvanceRank == 0 ? 0 : 1), ThreadMap, AccessType>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend EllPredicatedTileAccessIterator;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Params() { }
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::PitchLinear(layout.stride(0))){};
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator(
+      ///< Precomputed parameters object
+      Params const &params,
+      ///< Pointer to start of tensor
+      Pointer pointer,
+      ///< Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.row(), extent.column()),
+                  thread_id,
+                  layout::PitchLinearCoord(threadblock_offset.row(),
+                                           threadblock_offset.column())) {}
+
+  /// Construct a EllPredicatedTileAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : EllPredicatedTileAccessIterator(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  CUTLASS_HOST_DEVICE
+  int get_k() const {
+    return iterator_.get_k();
+  }
+  
+  CUTLASS_HOST_DEVICE
+  int get_stride() const {
+    return iterator_.get_stride();
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator operator++(int) {
+    EllPredicatedTileAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+
+  /// Gets the predicate mask through the output parameter `mask` (filled by the wrapped iterator)
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+
+  /// Adds a mask for small tiles in ELL by forwarding `blocksize` (declared CUTLASS_DEVICE, unlike its siblings)
+  CUTLASS_DEVICE
+  void ell_add_mask(int blocksize) {
+    iterator_.ell_add_mask(blocksize);
+  }
+
+  /// Returns whether the current access is valid, as reported by the wrapped iterator
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of EllPredicatedTileAccessIterator for row-major data (wraps the pitch-linear one).
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_>
+class EllPredicatedTileAccessIterator<Shape_, Element_, layout::RowMajor,
+                                   AdvanceRank, ThreadMap_, AccessType_> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  using UnderlyingIterator = EllPredicatedTileAccessIterator<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
+      layout::PitchLinear, (kAdvanceRank == 0 ? 1 : 0), ThreadMap, AccessType>;
+
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+
+  /// Predicate mask type used to guard accesses, taken from the wrapped iterator
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend EllPredicatedTileAccessIterator;
+
+    /// Parameters of the wrapped pitch-linear iterator
+    typename UnderlyingIterator::Params params_;
+
+   public:
+
+    /// Default ctor; `params_` is default-constructed
+    CUTLASS_HOST_DEVICE
+    Params() { }
+
+    /// Constructs Params from a row-major layout, using its stride(0) as the pitch-linear stride
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::PitchLinear(layout.stride(0))){};
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs the iterator from precomputed params, thread ID and threadblock
+  /// offset; extent and offset reach the wrapped iterator as (column, row)
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.column(), extent.row()),
+                  thread_id,
+                  layout::PitchLinearCoord(threadblock_offset.column(),
+                                           threadblock_offset.row())) {}
+
+  /// Constructs an EllPredicatedTileAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : EllPredicatedTileAccessIterator(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances the iterator by whole tiles along the logical matrix dimensions;
+  /// the tile offset is forwarded to `iterator_` in (column, row) order.
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
+  }
+
+  /// Returns the wrapped iterator's current pointer, cast to `AccessType *`
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  CUTLASS_HOST_DEVICE
+  int get_k() const {
+    return iterator_.get_k();  // NOTE(review): "k" is defined by the wrapped iterator - confirm there
+  }
+  
+  CUTLASS_HOST_DEVICE
+  int get_stride() const {
+    return iterator_.get_stride();  // stride exactly as reported by the wrapped iterator
+  }
+
+  /// Advances to the next tile in memory by pre-incrementing the wrapped
+  /// iterator, then returns a reference to this iterator.
+  ///
+  /// NOTE(review): any first-call predicate update / reset to the first
+  /// "steady state" tile happens inside the wrapped iterator's operator++,
+  /// which is not visible here - confirm against that specialization.
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment: advances to the next tile in memory.
+  ///
+  /// Takes a copy of this iterator, advances this iterator through the
+  /// pre-increment operator, and returns the copy, i.e. the state from before
+  /// the advance. Because a full copy is made, `++it` is the cheaper form
+  /// whenever the previous state is not needed.
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator operator++(int) {
+    EllPredicatedTileAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set; `enable` (default true) is forwarded to the wrapped iterator
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+
+  /// Enables the predicate mask by delegating to the wrapped iterator's enable_mask()
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+
+  /// Sets the predicate mask, overriding the value stored in the wrapped iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+
+  /// Gets the predicate mask through the output parameter `mask` (filled by the wrapped iterator)
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+
+  /// Adds a mask for small tiles in ELL by forwarding `blocksize` (declared CUTLASS_DEVICE, unlike its siblings)
+  CUTLASS_DEVICE
+  void ell_add_mask(int blocksize) {
+    iterator_.ell_add_mask(blocksize);
+  }
+  
+  /// Returns whether the current access is valid, as reported by the wrapped iterator
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of EllPredicatedTileAccessIterator for column-major interleaved data.
+/// It is mapped to the congruous layout.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_, int InterleavedK>
+class EllPredicatedTileAccessIterator<Shape_, Element_,
+                                   layout::ColumnMajorInterleaved<InterleavedK>,
+                                   AdvanceRank, ThreadMap_, AccessType_> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  static int const kInterleavedK = InterleavedK;
+  using Layout = layout::ColumnMajorInterleaved<kInterleavedK>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  using UnderlyingIterator = EllPredicatedTileAccessIterator<
+      layout::PitchLinearShape<Shape::kRow * kInterleavedK,
+                               Shape::kColumn / kInterleavedK>,
+      Element, layout::PitchLinear, (kAdvanceRank == 0 ? 0 : 1), ThreadMap,
+      AccessType>;
+
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+
+  /// Predicate mask type used to guard accesses, taken from the wrapped iterator
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend EllPredicatedTileAccessIterator;
+
+    /// Parameters of the wrapped pitch-linear iterator
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Constructs Params from the interleaved layout, using its stride(0) as the pitch-linear stride
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::PitchLinear(layout.stride(0))) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs the iterator from precomputed params, thread ID and threadblock offset;
+  /// extent/offset map to pitch-linear as (row * kInterleavedK, column / kInterleavedK)
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.row() * kInterleavedK,
+                                           extent.column() / kInterleavedK),
+                  thread_id,
+                  layout::PitchLinearCoord(
+                      threadblock_offset.row() * kInterleavedK,
+                      threadblock_offset.column() / kInterleavedK)) {}
+
+  /// Constructs an EllPredicatedTileAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : EllPredicatedTileAccessIterator(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances the iterator by whole tiles along the logical matrix dimensions;
+  /// the tile offset is forwarded to `iterator_` in (row, column) order.
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
+  }
+
+  /// Returns the wrapped iterator's current pointer, cast to `AccessType *`
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  CUTLASS_HOST_DEVICE
+  int get_k() const {
+    return iterator_.get_k();  // NOTE(review): "k" is defined by the wrapped iterator - confirm there
+  }
+  
+  CUTLASS_HOST_DEVICE
+  int get_stride() const {
+    return iterator_.get_stride();  // stride exactly as reported by the wrapped iterator
+  }
+
+  /// Advances to the next tile in memory by pre-incrementing the wrapped
+  /// iterator, then returns a reference to this iterator.
+  ///
+  /// NOTE(review): any first-call predicate update / reset to the first
+  /// "steady state" tile happens inside the wrapped iterator's operator++,
+  /// which is not visible here - confirm against that specialization.
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment: advances to the next tile in memory.
+  ///
+  /// Takes a copy of this iterator, advances this iterator through the
+  /// pre-increment operator, and returns the copy, i.e. the state from before
+  /// the advance. Because a full copy is made, `++it` is the cheaper form
+  /// whenever the previous state is not needed.
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator operator++(int) {
+    EllPredicatedTileAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set; `enable` (default true) is forwarded to the wrapped iterator
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+
+  /// Enables the predicate mask by delegating to the wrapped iterator's enable_mask()
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+
+  /// Sets the predicate mask, overriding the value stored in the wrapped iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+
+  /// Gets the predicate mask through the output parameter `mask` (filled by the wrapped iterator)
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+  
+  /// Adds a mask for small tiles in ELL by forwarding `blocksize` (declared CUTLASS_DEVICE, unlike its siblings)
+  CUTLASS_DEVICE
+  void ell_add_mask(int blocksize) {
+    iterator_.ell_add_mask(blocksize);
+  }
+
+  /// Returns whether the current access is valid, as reported by the wrapped iterator
+  CUTLASS_HOST_DEVICE
+  bool valid() { return iterator_.valid(); }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of EllPredicatedTileAccessIterator for row-major interleaved data.
+/// It is mapped to the congruous layout.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_, int InterleavedK>
+class EllPredicatedTileAccessIterator<Shape_, Element_,
+                                   layout::RowMajorInterleaved<InterleavedK>,
+                                   AdvanceRank, ThreadMap_, AccessType_> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  static int const kInterleavedK = InterleavedK;
+  using Layout = layout::RowMajorInterleaved<kInterleavedK>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  using UnderlyingIterator = EllPredicatedTileAccessIterator<
+      layout::PitchLinearShape<Shape::kColumn * kInterleavedK,
+                               Shape::kRow / kInterleavedK>,
+      Element, layout::PitchLinear, (kAdvanceRank == 0 ? 1 : 0), ThreadMap,
+      AccessType>;
+
+
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+
+  /// Predicate mask type used to guard accesses, taken from the wrapped iterator
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend EllPredicatedTileAccessIterator;
+
+    /// Parameters of the wrapped pitch-linear iterator
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Constructs Params from the interleaved layout, using its stride(0) as the pitch-linear stride
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::PitchLinear(layout.stride(0))) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs the iterator from precomputed params, thread ID and threadblock offset;
+  /// extent/offset map to pitch-linear as (column * kInterleavedK, row / kInterleavedK)
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.column() * kInterleavedK,
+                                           extent.row() / kInterleavedK),
+                  thread_id,
+                  layout::PitchLinearCoord(
+                      threadblock_offset.column() * kInterleavedK,
+                      threadblock_offset.row() / kInterleavedK)) {}
+
+  /// Constructs an EllPredicatedTileAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : EllPredicatedTileAccessIterator(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances the iterator by whole tiles along the logical matrix dimensions;
+  /// the tile offset is forwarded to `iterator_` in (column, row) order.
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
+  }
+
+  /// Returns the wrapped iterator's current pointer, cast to `AccessType *`
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+  
+  CUTLASS_HOST_DEVICE
+  int get_k() const {
+    return iterator_.get_k();  // NOTE(review): "k" is defined by the wrapped iterator - confirm there
+  }
+  
+  CUTLASS_HOST_DEVICE
+  int get_stride() const {
+    return iterator_.get_stride();  // stride exactly as reported by the wrapped iterator
+  }
+
+  /// Advances to the next tile in memory by pre-incrementing the wrapped
+  /// iterator, then returns a reference to this iterator.
+  ///
+  /// NOTE(review): any first-call predicate update / reset to the first
+  /// "steady state" tile happens inside the wrapped iterator's operator++,
+  /// which is not visible here - confirm against that specialization.
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment: advances to the next tile in memory.
+  ///
+  /// Takes a copy of this iterator, advances this iterator through the
+  /// pre-increment operator, and returns the copy, i.e. the state from before
+  /// the advance. Because a full copy is made, `++it` is the cheaper form
+  /// whenever the previous state is not needed.
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator operator++(int) {
+    EllPredicatedTileAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set; `enable` (default true) is forwarded to the wrapped iterator
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+
+  /// Enables the predicate mask by delegating to the wrapped iterator's enable_mask()
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+
+  /// Sets the predicate mask, overriding the value stored in the wrapped iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+
+  /// Gets the predicate mask through the output parameter `mask` (filled by the wrapped iterator)
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+
+  /// Adds a mask for small tiles in ELL by forwarding `blocksize` (declared CUTLASS_DEVICE, unlike its siblings)
+  CUTLASS_DEVICE
+  void ell_add_mask(int blocksize) {
+    iterator_.ell_add_mask(blocksize);
+  }
+
+  /// Returns whether the current access is valid, as reported by the wrapped iterator
+  CUTLASS_HOST_DEVICE
+  bool valid() { return iterator_.valid(); }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace transform
+}  // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/ell_predicated_tile_iterator.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/ell_predicated_tile_iterator.h
new file mode 100755
index 000000000..7c1b27b3d
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/ell_predicated_tile_iterator.h
@@ -0,0 +1,1315 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Ell iterator for Blocked-Ell matrix (ellValue matrix) used with EllMmaPipelined
+*/
+
+#pragma once
+
+#include "cutlass/arch/memory.h"
+#include "cutlass/transform/threadblock/predicated_tile_access_iterator.h"
+
+#include "cutlass/transform/threadblock/ell_predicated_tile_access_iterator.h"
+#include "cutlass/transform/threadblock/ell_iterator.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// EllPredicatedTileIterator
+///
+/// Satisfies: ForwardTileIteratorConcept | 
+///            ReadableContiguousTileIteratorConcept | 
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+/// Regular tile iterator using a precomputed control structure to minimize register liveness
+/// and integer arithmetic.
+///
+/// Layout is assumed to be invariant at the time the precomputed "Params" object is constructed.
+///
+/// Base pointer and tensor extents may be specified at the time the iterator is constructed.
+/// Subsequently, they are assumed to be immutable.
+///
+/// Adding a logical coordinate offset may be performed at the time the iterator is constructed.
+/// Subsequent additions to logical coordinate offset may be performed but are relatively expensive.
+///
+/// Visitation order is intended to first visit a "residual" tile that may be partially full in
+/// both the advance dimension and the steady-state dimension. This is assumed to be the last
+/// tile in the iteration sequence. Advancing an iterator that has just been constructed moves to
+/// the first tile that is full in the advance dimension and recomputes predicates. Subsequent
+/// accesses may be performed without updating internal predicates and are efficient in terms of
+/// live register state and pointer arithmetic instructions.
+///
+/// To be efficient, this assumes the iterator will be dereferenced and advanced at least once
+/// outside any looping structure to minimize integer arithmetic. 
+///
+/// Accesses out of bounds are safe so long as `clear_mask()` is called prior to dereferencing
+/// the iterator.
+///
+///
+/// Example:
+///
+/// An efficient pipeline structure may be constructed as follows:
+///
+// template <typename Iterator>
+// __global__ void kernel(
+//   typename Iterator::Params params, 
+//   typename Iterator::Element *ptr,
+//   TensorCoord extent) {
+//
+//   typename Iterator::Fragment fragment;
+//
+//   TensorCoord threadblock_offset(0, 0);
+//
+//   Iterator iter(params, ptr, extent, threadIdx.x, threadblock_offset);
+//
+//
+//   fragment = *iter;        // load "residue" tile first
+//   ++iter;                  // advance to first "steady state" tile and update internal masks
+//
+//
+//   #pragma unroll
+//   for (int i = Remaining - 1; i >= 0; --i) {
+//
+//     f(fragment);
+//
+//     if (!i) {
+//       iter.clear_mask();   // light-weight operation to clear masks - subsequent loads become NO-OPs.
+//     }
+//  
+//     fragment = *iter;      // load tile during "steady state" phase
+//     ++iter;                // advance to next tile - lightweight due to steady-state masks
+//   }
+// }
+//
+// void host(TensorView<Element, 2, layout::PitchLinear> view) {
+//
+//   using Iterator = transform::threadblock::EllPredicatedTileIterator;
+//
+//   typename Iterator::Params params(view.layout());
+//
+//   kernel<Iterator>(params, view.data());
+// }
+///
+///
+template <
+  typename Shape,      // tile shape; handed on to the tile access iterator by the specializations
+  typename Element,    // element type; `Element *` is the pointer type the iterator is built from
+  typename Layout,     // layout tag that selects the specialization (e.g. layout::PitchLinear)
+  int AdvanceRank,     // 0 = contiguous, 1 = strided (checked by static_assert in the specializations)
+  typename ThreadMap,  // supplies Iterations and kElementsPerAccess
+  int AccessSize = ThreadMap::kElementsPerAccess  // elements per memory access
+>
+class EllPredicatedTileIterator;  // primary template is only declared; layouts are handled by specializations
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of EllPredicatedTileIterator for pitch-linear data.
+///
+/// Satisfies: ForwardTileIteratorConcept | 
+///            ReadableContiguousTileIteratorConcept | 
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int AccessSize>
+class EllPredicatedTileIterator<Shape_, Element_, layout::PitchLinear, AdvanceRank,
+                             ThreadMap_, AccessSize> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  /// Type used for internal memory accesses
+  using AccessType = AlignedArray<Element, AccessSize, (AccessSize * sizeof_bits<Element>::value / 8)>;
+
+  /// Underlying iterator to compute the addresses
+  using TileAccessIterator =
+      EllPredicatedTileAccessIterator<Shape, Element, Layout, kAdvanceRank,
+                                   ThreadMap, AccessType>;
+
+  static int const kAccessesPerVector = TileAccessIterator::kAccessesPerVector;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount *
+                                               ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename TileAccessIterator::Mask;
+
+  /// Iterator for ELL storage
+  using EllIterator = typename cutlass::transform::threadblock::ell::Iterator; 
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   public:
+    friend EllPredicatedTileIterator;
+
+   private:
+    /// Parameters object
+    typename TileAccessIterator::Params params_;
+
+   public:
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout) : params_(layout) { }
+    
+    CUTLASS_HOST_DEVICE
+    Params() { }
+  };
+
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char *;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Data member to the tile access iterator
+  TileAccessIterator address_iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : address_iterator_(params.params_, pointer, extent, thread_id,
+                          threadblock_offset) {}
+
+  /// Construct a EllPredicatedTileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : EllPredicatedTileIterator(params, pointer, extent, thread_id,
+                               make_Coord(0, 0)) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    address_iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator &operator++() {
+    if (kAdvanceRank)
+      address_iterator_.add_tile_offset({0, 1});
+    else
+      address_iterator_.add_tile_offset({1, 0});
+
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator operator++(int) {
+    EllPredicatedTileIterator self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Returns a stride
+  CUTLASS_HOST_DEVICE
+  int get_stride() const { return address_iterator_.get_stride(); }
+
+  /// Clears the predicate set efficiently; `enable` (default true) is forwarded to the access iterator
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { address_iterator_.clear_mask(enable); }
+
+  /// Enables the predicate set by forwarding to the access iterator's enable_mask()
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { address_iterator_.enable_mask(); }
+
+  /// Sets the predicate mask, overriding the value stored in the underlying access iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { address_iterator_.set_mask(mask); }
+
+  /// Gets the mask: `mask` is an output parameter handed to the access iterator's get_mask()
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { address_iterator_.get_mask(mask); }
+
+  /// Adds a mask for small tiles in ELL; `blocksize` is forwarded to the access iterator
+  CUTLASS_HOST_DEVICE
+  void ell_add_mask(int blocksize) { address_iterator_.ell_add_mask(blocksize); }
+
+  CUTLASS_DEVICE  // Loads a fragment from memory; pointer_offset is in units of Element
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    load_with_byte_offset(frag, static_cast<LongIndex>(pointer_offset) * sizeof_bits<Element>::value / 8);  // widen to LongIndex before scaling to bytes so the product cannot overflow Index
+  }
+
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
+    // Loads the fragment, adding byte_offset (in bytes) to every access address.
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kAccessesPerVector; ++v) {
+          // Linear index of this access within the fragment: v fastest, then c, then s.
+          int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
+          
+          address_iterator_.set_iteration_index(idx);
+          char const *byte_ptr = reinterpret_cast<char const *>(address_iterator_.get()) + byte_offset;
+
+          AccessType const *access_ptr = reinterpret_cast<AccessType const *>(byte_ptr);
+          // The iterator's valid() result is passed as global_load's predicate argument.
+          cutlass::arch::global_load<AccessType,
+                                     sizeof(AccessType)
+                                    >(
+              frag_ptr[idx], access_ptr, address_iterator_.valid());
+
+          ++address_iterator_;
+        }
+      }
+    }
+  }
+
+  /// Loads a fragment from memory with a zero byte offset
+  CUTLASS_DEVICE
+  void load(Fragment &frag) { load_with_byte_offset(frag, 0); }
+
+  CUTLASS_DEVICE
+  void load_with_ell_index(Fragment &frag, EllIterator &ell_iter) {
+    // Loads a fragment, shifting each access by the ELL offset that ell_iter returns for that access's k index.
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+    
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kAccessesPerVector; ++v) {
+          // Linear index of this access within the fragment: v fastest, then c, then s.
+          int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
+          address_iterator_.set_iteration_index(idx);
+          LongIndex ell_offset = 0;
+          // Per-access lookup: the offset returned for this k index is scaled by sizeof(Element) to bytes.
+          int k_offset = address_iterator_.get_k();
+          ell_offset = ell_iter.get_offset(k_offset) * sizeof(Element);
+          // NOTE(review): scaled with sizeof(Element), not sizeof_bits<Element>::value / 8 as in load_with_pointer_offset -- confirm for sub-byte Element types.
+          char const *byte_ptr = reinterpret_cast<char const *>(address_iterator_.get()) + ell_offset;
+
+          AccessType const *access_ptr = reinterpret_cast<AccessType const *>(byte_ptr);
+          // A negative ELL offset disables the access, in addition to the iterator's own valid() predicate.
+          bool is_valid = address_iterator_.valid();
+          is_valid = is_valid && (ell_offset >= 0);
+
+          cutlass::arch::global_load<AccessType,
+                                     sizeof(AccessType)
+                                    >(
+              frag_ptr[idx], access_ptr, is_valid);
+
+          ++address_iterator_;
+        }
+      }
+    }
+  }
+  
+  CUTLASS_DEVICE
+  void load_with_ell_index_fast(Fragment &frag, EllIterator &ell_iter) {
+    // Variant of load_with_ell_index: a single ELL offset is fetched up front and reused for every access.
+    LongIndex ell_offset = ell_iter.get_offset_fast() * sizeof(Element);
+
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+    
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kAccessesPerVector; ++v) {
+
+          int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
+
+          address_iterator_.set_iteration_index(idx);
+          char const *byte_ptr = reinterpret_cast<char const *>(address_iterator_.get()) + ell_offset;
+
+          AccessType const *access_ptr = reinterpret_cast<AccessType const *>(byte_ptr);
+          // ell_offset is loop-invariant, so the (ell_offset >= 0) term is identical for all accesses.
+          bool is_valid = address_iterator_.valid();
+          is_valid = is_valid && (ell_offset >= 0);
+
+          cutlass::arch::global_load<AccessType,
+                                     sizeof(AccessType)
+                                    >(
+              frag_ptr[idx], access_ptr, is_valid);
+
+          ++address_iterator_;
+        }
+      }
+    }
+  }
+  /// Store a fragment to memory; pointer_offset is in units of Element and is converted to bytes here
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    store_with_byte_offset(frag, static_cast<LongIndex>(pointer_offset) * sizeof_bits<Element>::value / 8);  // widen to LongIndex before scaling to bytes so the product cannot overflow Index
+  }
+
+  /// Stores a fragment to memory, adding byte_offset (in bytes) to every access address
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
+    address_iterator_.set_iteration_index(0);
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kAccessesPerVector; ++v) {
+          // Unlike the load paths, the iteration index is set once (to 0) before the loops and then only advanced with ++.
+          int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
+
+          char *byte_ptr = reinterpret_cast<char *>(address_iterator_.get()) + byte_offset;
+          AccessType *access_ptr = reinterpret_cast<AccessType *>(byte_ptr);
+          // Plain predicated store: memory is written only when the iterator reports the access as valid.
+          if (address_iterator_.valid()) {
+            *access_ptr = frag_ptr[idx];
+          }
+          ++address_iterator_;
+        }
+      }
+    }
+  }
+
+  /// Stores a fragment to memory with a zero byte offset
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) { store_with_byte_offset(frag, 0); }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of EllPredicatedTileIterator for column-major data: a thin adapter that forwards every operation to the pitch-linear specialization.
+///
+/// Satisfies: ForwardTileIteratorConcept | 
+///            ReadableContiguousTileIteratorConcept | 
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+  typename Shape_,
+  typename Element_,
+  int AdvanceRank,
+  typename ThreadMap_,
+  int AccessSize
+>
+class EllPredicatedTileIterator<Shape_, Element_, layout::ColumnMajor, AdvanceRank, ThreadMap_, AccessSize> {
+public:
+
+  static_assert(AdvanceRank == 0 || AdvanceRank == 1, 
+    "Specialization for pitch-linear iterator may along advance along the "
+    "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  using UnderlyingIterator = EllPredicatedTileIterator<
+    layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
+    Element,
+    layout::PitchLinear,
+    (kAdvanceRank == 0 ? 0 : 1),
+    ThreadMap,
+    AccessSize
+  >;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Iterator for ELL storage
+  using EllIterator = typename cutlass::transform::threadblock::ell::Iterator; 
+  
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+  private:
+
+    friend EllPredicatedTileIterator;
+
+    /// Parameters of the underlying pitch-linear iterator
+    typename UnderlyingIterator::Params params_;
+
+  public:
+    
+    CUTLASS_HOST_DEVICE
+    Params() { }
+
+    /// Construct the Params object given a column-major tensor's layout (its stride(0) becomes the pitch-linear stride)
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout): params_(layout::PitchLinear(layout.stride(0))) {
+
+    }
+  };
+
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+public:
+
+  /// Constructs a TileIterator from its precomputed state, threadblock offset, and thread ID; (row, column) coordinates are passed through in that order
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator(
+    Params const &params,                         ///< Precomputed parameters object 
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id,                                ///< ID of each participating thread
+    TensorCoord const &threadblock_offset         ///< Initial offset of threadblock
+  ):
+    iterator_(
+      params.params_,
+      pointer,
+      layout::PitchLinearCoord(extent.row(), extent.column()),
+      thread_id,
+      layout::PitchLinearCoord(threadblock_offset.row(), threadblock_offset.column())
+    ) { }
+
+  /// Constructs an EllPredicatedTileIterator with a zero threadblock offset by delegating to the five-argument constructor
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator(
+    Params const &params,                         ///< Precomputed parameters object
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id                                 ///< ID of each participating thread
+  ): EllPredicatedTileIterator(params, pointer, extent, thread_id, make_Coord(0, 0)) { }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory (pre-increment).
+  ///
+  /// Forwards to the underlying pitch-linear iterator's pre-increment; the
+  /// dimension advanced was fixed at compile time when kAdvanceRank was mapped
+  /// onto that iterator's advance rank (0 -> 0, otherwise 1).
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory (post-increment).
+  ///
+  /// Copies the iterator, advances *this via the pre-increment operator, and
+  /// returns the copy, i.e. the state from before the advance. Prefer the
+  /// pre-increment form when the old state is not needed (avoids the copy).
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator operator++(int) {
+    EllPredicatedTileIterator self(*this);
+    operator++();
+    return self;
+  }
+  
+  /// Returns the stride reported by the underlying iterator
+  CUTLASS_HOST_DEVICE
+  int get_stride() const { return iterator_.get_stride(); }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  /// Enables the predicate set by forwarding to the underlying iterator's enable_mask()
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask (`mask` is an output parameter)
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Adds a mask for small tiles in ELL
+  CUTLASS_HOST_DEVICE
+  void ell_add_mask(int blocksize) { 
+    iterator_.ell_add_mask(blocksize); 
+  }
+
+  /// Loads a fragment from memory, with an additional pointer offset forwarded to the underlying iterator
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory, with an additional byte offset forwarded to the underlying iterator
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Loads a fragment from memory with a zero pointer offset
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  CUTLASS_DEVICE  // Loads a fragment using offsets supplied by ell_iter (forwarded to the underlying iterator)
+  void load_with_ell_index(Fragment &frag, EllIterator& ell_iter) {
+    iterator_.load_with_ell_index(frag, ell_iter);
+  }
+  
+  CUTLASS_DEVICE  // Forwards to the underlying iterator's load_with_ell_index_fast()
+  void load_with_ell_index_fast(Fragment &frag, EllIterator& ell_iter) {
+    iterator_.load_with_ell_index_fast(frag, ell_iter);
+  }
+
+  /// Stores a fragment to memory, with an additional pointer offset forwarded to the underlying iterator
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Stores a fragment to memory, with an additional byte offset forwarded to the underlying iterator
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
+    iterator_.store_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Stores a fragment to memory with a zero pointer offset
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of EllPredicatedTileIterator for row-major data: a thin adapter that forwards every operation to the pitch-linear specialization with rows and columns swapped.
+///
+/// Satisfies: ForwardTileIteratorConcept | 
+///            ReadableContiguousTileIteratorConcept | 
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+  typename Shape_,
+  typename Element_,
+  int AdvanceRank,
+  typename ThreadMap_,
+  int AccessSize
+>
+class EllPredicatedTileIterator<Shape_, Element_, layout::RowMajor, AdvanceRank, ThreadMap_, AccessSize> {
+public:
+
+  static_assert(AdvanceRank == 0 || AdvanceRank == 1, 
+    "Specialization for pitch-linear iterator may along advance along the "
+    "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  using UnderlyingIterator = EllPredicatedTileIterator<
+    layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
+    Element,
+    layout::PitchLinear,
+    (kAdvanceRank == 0 ? 1 : 0),
+    ThreadMap,
+    AccessSize
+  >;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Iterator for ELL storage
+  using EllIterator = typename cutlass::transform::threadblock::ell::Iterator; 
+  
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+  private:
+
+    friend EllPredicatedTileIterator;
+
+    /// Parameters of the underlying pitch-linear iterator
+    typename UnderlyingIterator::Params params_;
+
+  public:
+    
+    CUTLASS_HOST_DEVICE
+    Params() { } 
+
+    /// Construct the Params object given a row-major tensor's layout (its stride(0) becomes the pitch-linear stride)
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout): params_(layout::PitchLinear(layout.stride(0))) {
+
+    }
+  };
+
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+public:
+
+  /// Constructs a TileIterator from its precomputed state, threadblock offset, and thread ID; coordinates are passed on as (column, row)
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator(
+    Params const &params,                         ///< Precomputed parameters object 
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id,                                ///< ID of each participating thread
+    TensorCoord const &threadblock_offset         ///< Initial offset of threadblock
+  ):
+    iterator_(
+      params.params_,
+      pointer,
+      layout::PitchLinearCoord(extent.column(), extent.row()),
+      thread_id,
+      layout::PitchLinearCoord(threadblock_offset.column(), threadblock_offset.row())
+    ) { }
+
+  /// Constructs an EllPredicatedTileIterator with a zero threadblock offset by delegating to the five-argument constructor
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator(
+    Params const &params,                         ///< Precomputed parameters object
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id                                 ///< ID of each participating thread
+  ): EllPredicatedTileIterator(params, pointer, extent, thread_id, make_Coord(0, 0)) { }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory (pre-increment).
+  ///
+  /// Forwards to the underlying pitch-linear iterator's pre-increment; the
+  /// dimension advanced was fixed at compile time when kAdvanceRank was mapped
+  /// onto that iterator's advance rank (0 -> 1, otherwise 0).
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory (post-increment).
+  ///
+  /// Copies the iterator, advances *this via the pre-increment operator, and
+  /// returns the copy, i.e. the state from before the advance. Prefer the
+  /// pre-increment form when the old state is not needed (avoids the copy).
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator operator++(int) {
+    EllPredicatedTileIterator self(*this);
+    operator++();
+    return self;
+  }
+  
+  /// Returns the stride reported by the underlying iterator
+  CUTLASS_HOST_DEVICE
+  int get_stride() const { return iterator_.get_stride(); }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  /// Enables the predicate set by forwarding to the underlying iterator's enable_mask()
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask (`mask` is an output parameter)
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Adds a mask for small tiles in ELL
+  CUTLASS_HOST_DEVICE
+  void ell_add_mask(int blocksize) { 
+    iterator_.ell_add_mask(blocksize); 
+  }
+
+  /// Loads a fragment from memory, with an additional pointer offset forwarded to the underlying iterator
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory, with an additional byte offset forwarded to the underlying iterator
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Loads a fragment from memory with a zero pointer offset
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  CUTLASS_DEVICE  // Loads a fragment using offsets supplied by ell_iter (forwarded to the underlying iterator)
+  void load_with_ell_index(Fragment &frag, EllIterator& ell_iter) {
+    iterator_.load_with_ell_index(frag, ell_iter);
+  }
+
+  CUTLASS_DEVICE  // Forwards to the underlying iterator's load_with_ell_index_fast()
+  void load_with_ell_index_fast(Fragment &frag, EllIterator& ell_iter) {
+    iterator_.load_with_ell_index_fast(frag, ell_iter);
+  }
+
+  /// Stores a fragment to memory, with an additional pointer offset forwarded to the underlying iterator
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+  
+  /// Stores a fragment to memory, with an additional byte offset forwarded to the underlying iterator
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
+    iterator_.store_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Stores a fragment to memory with a zero pointer offset
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of EllPredicatedTileIterator for column-major interleaved data.  It is
+/// mapped to the congruous layout by forwarding to the pitch-linear specialization.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int AccessSize, int InterleavedK>
+class EllPredicatedTileIterator<Shape_, Element_,
+                             layout::ColumnMajorInterleaved<InterleavedK>,
+                             AdvanceRank, ThreadMap_, AccessSize> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  static int const kInterleavedK = InterleavedK;
+  using Layout = layout::ColumnMajorInterleaved<kInterleavedK>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  using UnderlyingIterator = EllPredicatedTileIterator<
+      layout::PitchLinearShape<Shape::kRow * kInterleavedK,
+                               Shape::kColumn / kInterleavedK>,
+      Element, layout::PitchLinear, (kAdvanceRank == 0 ? 0 : 1), ThreadMap, AccessSize>;
+
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount *
+                                               ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Iterator for ELL storage
+  using EllIterator = typename cutlass::transform::threadblock::ell::Iterator; 
+  
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend EllPredicatedTileIterator;
+
+    /// Parameters of the underlying pitch-linear iterator
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Construct the Params object given a column-major interleaved tensor's layout (its stride(0) becomes the pitch-linear stride)
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::PitchLinear(layout.stride(0))) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID. Rows are multiplied and columns divided by kInterleavedK. NOTE(review): integer division -- presumably columns are multiples of kInterleavedK; confirm.
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.row() * kInterleavedK,
+                                           extent.column() / kInterleavedK),
+                  thread_id,
+                  layout::PitchLinearCoord(
+                      threadblock_offset.row() * kInterleavedK,
+                      threadblock_offset.column() / kInterleavedK)) {}
+
+  /// Constructs an EllPredicatedTileIterator with a zero threadblock offset by delegating to the five-argument constructor
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : EllPredicatedTileIterator(params, pointer, extent, thread_id,
+                               make_Coord(0, 0)) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory (pre-increment).
+  ///
+  /// Forwards to the underlying pitch-linear iterator's pre-increment; the
+  /// dimension advanced was fixed at compile time when kAdvanceRank was
+  /// mapped onto that iterator's advance rank (0 -> 0, otherwise 1). No other
+  /// state of this adapter is touched.
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory (post-increment).
+  ///
+  /// Copies the iterator, advances *this via the pre-increment operator, and
+  /// returns the copy, i.e. the state from before the advance. Prefer the
+  /// pre-increment form when the old state is not needed, since this form
+  /// copies the whole iterator.
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator operator++(int) {
+    EllPredicatedTileIterator self(*this);
+    operator++();
+    return self;
+  }
+  
+  /// Returns the stride reported by the underlying iterator
+  CUTLASS_HOST_DEVICE
+  int get_stride() const { return iterator_.get_stride(); }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+
+  /// Enables the predicate set by forwarding to the underlying iterator's enable_mask()
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+
+  /// Gets the mask (`mask` is an output parameter)
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+
+  /// Adds a mask for small tiles in ELL
+  CUTLASS_HOST_DEVICE
+  void ell_add_mask(int blocksize) { iterator_.ell_add_mask(blocksize); }
+
+  /// Loads a fragment from memory, with an additional pointer offset forwarded to the underlying iterator
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  CUTLASS_DEVICE  // Loads a fragment using offsets supplied by ell_iter (forwarded to the underlying iterator)
+  void load_with_ell_index(Fragment &frag, EllIterator& ell_iter) {
+    iterator_.load_with_ell_index(frag, ell_iter);
+  }
+
+  CUTLASS_DEVICE  // Forwards to the underlying iterator's load_with_ell_index_fast()
+  void load_with_ell_index_fast(Fragment &frag, EllIterator& ell_iter) {
+    iterator_.load_with_ell_index_fast(frag, ell_iter);
+  }
+
+  /// Loads a fragment from memory with a zero pointer offset
+  CUTLASS_DEVICE
+  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }
+
+  /// Stores a fragment to memory, with an additional pointer offset forwarded to the underlying iterator
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Stores a fragment to memory with a zero pointer offset
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of EllPredicatedTileIterator for row-major interleaved data.  It is
+/// mapped to the congruous layout by forwarding to the pitch-linear specialization.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+/// NOTE(review): unlike the ColumnMajorInterleaved specialization, this one declares no EllIterator alias and no load_with_ell_index*() methods -- confirm this is intended.
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int AccessSize, int InterleavedK>
+class EllPredicatedTileIterator<Shape_, Element_,
+                             layout::RowMajorInterleaved<InterleavedK>,
+                             AdvanceRank, ThreadMap_, AccessSize> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  static int const kInterleavedK = InterleavedK;
+  using Layout = layout::RowMajorInterleaved<kInterleavedK>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  using UnderlyingIterator = EllPredicatedTileIterator<
+      layout::PitchLinearShape<Shape::kColumn * kInterleavedK,
+                               Shape::kRow / kInterleavedK>,
+      Element, layout::PitchLinear, (kAdvanceRank == 0 ? 1 : 0), ThreadMap, AccessSize>;
+
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+  
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount *
+                                               ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend EllPredicatedTileIterator;
+
+    /// Parameters of the underlying pitch-linear iterator
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Construct the Params object given a row-major interleaved tensor's layout (its stride(0) becomes the pitch-linear stride)
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::PitchLinear(layout.stride(0))) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID. Columns are multiplied and rows divided by kInterleavedK. NOTE(review): integer division -- presumably rows are multiples of kInterleavedK; confirm.
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.column() * kInterleavedK,
+                                           extent.row() / kInterleavedK),
+                  thread_id,
+                  layout::PitchLinearCoord(
+                      threadblock_offset.column() * kInterleavedK,
+                      threadblock_offset.row() / kInterleavedK)) {}
+
+  /// Constructs an EllPredicatedTileIterator with a zero threadblock offset by delegating to the five-argument constructor
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : EllPredicatedTileIterator(params, pointer, extent, thread_id,
+                               make_Coord(0, 0)) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory (pre-increment).
+  ///
+  /// Forwards to the underlying pitch-linear iterator's pre-increment; the
+  /// dimension advanced was fixed at compile time when kAdvanceRank was
+  /// mapped onto that iterator's advance rank (0 -> 1, otherwise 0). No other
+  /// state of this adapter is touched.
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory (post-increment).
+  ///
+  /// Copies the iterator, advances *this via the pre-increment operator, and
+  /// returns the copy, i.e. the state from before the advance. Prefer the
+  /// pre-increment form when the old state is not needed, since this form
+  /// copies the whole iterator.
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator operator++(int) {
+    EllPredicatedTileIterator self(*this);
+    operator++();
+    return self;
+  }
+  
+  /// Returns the stride reported by the underlying iterator
+  CUTLASS_HOST_DEVICE
+  int get_stride() const { return iterator_.get_stride(); }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+
+  /// Enables the predicate set by forwarding to the underlying iterator's enable_mask()
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+
+  /// Gets the mask (`mask` is an output parameter)
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+
+  /// Adds a mask for small tiles in ELL
+  CUTLASS_HOST_DEVICE
+  void ell_add_mask(int blocksize) { iterator_.ell_add_mask(blocksize); }
+
+  /// Loads a fragment from memory, with an additional pointer offset forwarded to the underlying iterator
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory with a zero pointer offset
+  CUTLASS_DEVICE
+  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }
+
+  /// Stores a fragment to memory, with an additional pointer offset forwarded to the underlying iterator
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Stores a fragment to memory with a zero pointer offset
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace transform
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_scale_bias_vector_access_iterator.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_scale_bias_vector_access_iterator.h
new file mode 100755
index 000000000..366897c65
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_scale_bias_vector_access_iterator.h
@@ -0,0 +1,375 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Templates calculating the address and predicates to the load of scale and bias vectors.
+
+    This iterator uses masks to guard out-of-bounds accesses.
+
+    It can be used to load the gamma and beta vectors of layernorm, which are loop variant.
+
+    A precomputed "Params" object minimizes the amount of state that must be
+   stored in registers, and integer addition is used to advance the pointer
+   through memory.
+*/
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/conv/threadblock/conv2d_params.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Primary template of PredicatedScaleBiasVectorAccessIterator (declaration only).
+/// Specializations for layout::PitchLinear and layout::RowMajor follow below.
+template <typename ThreadblockShape,
+          typename Element,
+          typename Layout>
+class PredicatedScaleBiasVectorAccessIterator;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedScaleBiasVectorAccessIterator for pitch-linear layouts.
+///
+/// Threads with thread_id < kThreads address the scale vector and all remaining
+/// threads address the bias vector; each access is one AccessType-wide chunk.
+/// valid() returns the per-thread guard predicate for that access.
+template <typename ThreadblockShape_, typename Element_>
+class PredicatedScaleBiasVectorAccessIterator<ThreadblockShape_,
+                                              Element_,
+                                              layout::PitchLinear> {
+ public:
+
+  // Template arguments re-exported for users of the iterator.
+  using ThreadblockShape = ThreadblockShape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ConstPointer = const Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  /// Number of elements covered by one 128-bit access
+  static int const kElementsPerAccess = 128 / sizeof_bits<Element>::value;
+  /// Number of accesses (one per thread) spanning ThreadblockShape::kContiguous elements
+  static int const kThreads = ThreadblockShape::kContiguous / kElementsPerAccess;
+
+  /// Vector type returned (by pointer) from get()
+  using AccessType = AlignedArray<Element, kElementsPerAccess>;
+
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char *;
+
+  //
+  // Data members
+  //
+
+  /// Byte pointer to the start of this thread's source vector (scale or bias)
+  BytePointer pointer_;
+
+  /// Logical coordinate of this thread's current access
+  TensorCoord thread_offset_;
+
+  /// Extent passed to the constructor
+  int problem_size_k_;
+
+  /// True until the first call to add_tile_offset()
+  bool is_residue_tile_;
+
+  /// Predicate reported by valid()
+  bool guard_;
+
+  /// (extent - threadblock offset) modulo the tile width, or the full tile width when that remainder is zero
+  TensorCoord::Index residue_size_;
+
+ public:
+  /// Builds the iterator for one thread: picks the scale or bias vector from
+  /// the thread ID, then computes the thread's starting offset and predicate.
+  /// The first tile is treated as the residue tile (see add_tile_offset()).
+  CUTLASS_HOST_DEVICE
+  PredicatedScaleBiasVectorAccessIterator(
+      int problem_size_k,                     ///< Extent of tensor
+      ConstPointer scale_pointer,             ///< Pointer to the start of the scale vector
+      ConstPointer bias_pointer,              ///< Pointer to the start of the bias vector
+      int thread_id,                          ///< ID of each participating thread
+      TensorCoord const &threadblock_offset)  ///< Initial offset of threadblock
+      : problem_size_k_(problem_size_k), is_residue_tile_(true) {
+    // Threads [0, kThreads) read the scale vector; every other thread reads bias.
+    bool const reads_scale = thread_id < kThreads;
+    ConstPointer const source = reads_scale ? scale_pointer : bias_pointer;
+    pointer_ = reinterpret_cast<BytePointer>(const_cast<NonConstPointer>(source));
+
+    // Thread index relative to the first thread of its (scale or bias) group.
+    int const local_thread = reads_scale ? thread_id : thread_id - kThreads;
+
+    // Size of the residue tile; a zero remainder means a full tile.
+    residue_size_ = (problem_size_k_ - threadblock_offset.contiguous()) % ThreadblockShape::kContiguous;
+    if (residue_size_ == 0) {
+      residue_size_ = ThreadblockShape::kContiguous;
+    }
+
+    // Per-thread offset in logical coordinates of tensor
+    int const element_offset = local_thread * kElementsPerAccess;
+    guard_ = element_offset < residue_size_;
+    thread_offset_ = threadblock_offset + TensorCoord(element_offset, 0);
+
+    set_iteration_index(0);
+  }
+
+  /// Convenience overload: same as the constructor above with a (0, 0)
+  /// threadblock offset.
+  CUTLASS_HOST_DEVICE
+  PredicatedScaleBiasVectorAccessIterator(
+      int problem_size_k,          ///< Extent of tensor
+      ConstPointer scale_pointer,  ///< Pointer to start of scale vector
+      ConstPointer bias_pointer,   ///< Pointer to start of bias vector
+      int thread_id)               ///< ID of each participating thread
+      : PredicatedScaleBiasVectorAccessIterator(problem_size_k,
+                                                scale_pointer, bias_pointer,
+                                                thread_id, make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index. This iterator keeps no such
+  /// index, so the call does nothing.
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {}
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole threadblock tiles.
+  /// Only tile_offset.contiguous() is used. The first call advances by the
+  /// residue size plus (contiguous() - 1) full tiles; later calls by full tiles.
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    // The predicate is recomputed from the hardware thread index alone.
+    guard_ = threadIdx.x < kThreads * 2;
+
+    TensorCoord::Index advance;
+    if (is_residue_tile_) {
+      advance = residue_size_ + ThreadblockShape::kContiguous * (tile_offset.contiguous() - 1);
+    } else {
+      advance = ThreadblockShape::kContiguous * tile_offset.contiguous();
+    }
+
+    thread_offset_ = thread_offset_ + TensorCoord(advance, 0);
+    is_residue_tile_ = false;
+  }
+
+  /// Returns a pointer to this thread's current access
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    // Convert the element offset to bytes before applying it to the byte pointer.
+    auto const byte_offset = thread_offset_.contiguous() * sizeof_bits<Element>::value / 8;
+    return reinterpret_cast<AccessType *>(pointer_ + byte_offset);
+  }
+
+  /// Pre-increment; a no-op for this iterator that returns a reference to self.
+  CUTLASS_HOST_DEVICE
+  PredicatedScaleBiasVectorAccessIterator &operator++() {
+    return *this;
+  }
+
+  /// Post-increment: returns a copy taken before the (no-op) increment.
+  CUTLASS_DEVICE
+  PredicatedScaleBiasVectorAccessIterator operator++(int) {
+    PredicatedScaleBiasVectorAccessIterator previous(*this);
+    ++(*this);
+    return previous;
+  }
+
+  /// Clears the predicate when `enable` is true; otherwise the predicate is
+  /// left unchanged.
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    if (enable) { guard_ = false; }
+  }
+
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return guard_;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedScaleBiasVectorAccessIterator for row-major data.
+///
+/// Adapter over the pitch-linear specialization: row-major (row, column)
+/// coordinates are passed on as pitch-linear (column, row) and every call is
+/// forwarded to the wrapped iterator.
+///
+/// NOTE(review): the earlier header listed Writeable/Masked tile-iterator
+/// concepts; this class exposes no store members - confirm which apply.
+template <typename ThreadblockShape_,
+          typename Element_>
+class PredicatedScaleBiasVectorAccessIterator<ThreadblockShape_,
+                                        Element_,
+                                        layout::RowMajor> {
+ public:
+
+  using ThreadblockShape = ThreadblockShape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ConstPointer = const Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  /// Pitch-linear iterator doing the actual work; columns map to the
+  /// contiguous dimension and rows to the strided dimension.
+  using UnderlyingIterator = PredicatedScaleBiasVectorAccessIterator<
+      layout::PitchLinearShape<ThreadblockShape::kColumn, ThreadblockShape::kRow>,
+      Element, layout::PitchLinear>;
+
+  // Access type and width are re-exported from the wrapped iterator.
+  using AccessType = typename UnderlyingIterator::AccessType;
+  static int const kElementsPerAccess = UnderlyingIterator::kElementsPerAccess;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Wrapped pitch-linear iterator that every member forwards to
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Builds the wrapped iterator, handing the threadblock offset over in
+  /// pitch-linear (column, row) order.
+  CUTLASS_HOST_DEVICE
+  PredicatedScaleBiasVectorAccessIterator(
+      int problem_size_k,                     ///< Extent of tensor
+      ConstPointer scale_pointer,             ///< Pointer to the start of the scale vector
+      ConstPointer bias_pointer,              ///< Pointer to the start of the bias vector
+      int thread_id,                          ///< ID of each participating thread
+      TensorCoord const &threadblock_offset)  ///< Initial offset of threadblock
+      : iterator_(problem_size_k,
+                  scale_pointer,
+                  bias_pointer,
+                  thread_id,
+                  layout::PitchLinearCoord(threadblock_offset.column(),
+                                           threadblock_offset.row())) {}
+
+  /// Convenience overload: same as the constructor above with a (0, 0)
+  /// threadblock offset.
+  CUTLASS_HOST_DEVICE
+  PredicatedScaleBiasVectorAccessIterator(
+      /// Extent of tensor
+      int problem_size_k,
+      /// Pointer to the start of the scale vector
+      ConstPointer scale_pointer,
+      /// Pointer to the start of the bias vector
+      ConstPointer bias_pointer,
+      /// ID of each participating thread
+      int thread_id)
+      : PredicatedScaleBiasVectorAccessIterator(
+            problem_size_k, scale_pointer, bias_pointer, thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Advances the iterator by whole threadblock tiles; the (row, column)
+  /// tile offset is forwarded in pitch-linear (column, row) order.
+  /// NOTE(review): the wrapped add_tile_offset() is CUTLASS_DEVICE and reads
+  /// threadIdx.x, so this is usable from device code only.
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset(
+        layout::PitchLinearCoord(tile_offset.column(), tile_offset.row()));
+  }
+
+  /// Returns the wrapped iterator's current access pointer. AccessType is
+  /// the wrapped iterator's own AccessType, so the cast keeps the type.
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  /// Pre-increment: forwards to the wrapped iterator's operator++ and
+  /// returns a reference to this iterator.
+  CUTLASS_HOST_DEVICE
+  PredicatedScaleBiasVectorAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment: advances this iterator and returns a copy of the
+  /// state it had before the call.
+  CUTLASS_HOST_DEVICE
+  PredicatedScaleBiasVectorAccessIterator operator++(int) {
+    PredicatedScaleBiasVectorAccessIterator previous(*this);
+    ++(*this);
+    return previous;
+  }
+
+  /// Forwards to the wrapped iterator's clear_mask(); `enable` defaults to true
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  /// Returns the wrapped iterator's valid() result
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace transform 
+}  // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_scale_bias_vector_iterator.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_scale_bias_vector_iterator.h
new file mode 100755
index 000000000..54b0ecf5e
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_scale_bias_vector_iterator.h
@@ -0,0 +1,328 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Templates calculating the address and predicates to the load of scale and bias vectors.
+
+    This iterator uses masks to guard out-of-bounds accesses.
+
+    This can be used to load the var and mean vectors of layernorm, which are loop invariant.
+
+    A precomputed "Params" object minimizes the amount of state that must be
+   stored in registers, and integer addition is used to advance the pointer
+   through memory.
+*/
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Primary template of PredicatedScaleBiasVectorIterator (declaration only).
+/// Specializations for layout::PitchLinear and layout::RowMajor follow below.
+template <typename WarpShape,
+          typename Element,
+          typename Layout>
+class PredicatedScaleBiasVectorIterator;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedScaleBiasVectorIterator for pitch-linear data.
+///
+/// Each load fetches one scale and one bias value per iteration and duplicates
+/// each value into both halves of a __half2. NOTE(review): loads are issued
+/// as __half regardless of Element, so Element is presumably half - confirm.
+template <typename WarpShape_, typename Element_>
+class PredicatedScaleBiasVectorIterator<WarpShape_,
+                                        Element_,
+                                        layout::PitchLinear> {
+ public:
+
+  using WarpShape = WarpShape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ConstPointer = const Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  static int const kElementsPerAccess = 1;
+
+  using AccessType = AlignedArray<Element, kElementsPerAccess>;
+
+  /// Loads per thread for each vector; consecutive loads are 8 elements apart
+  static int const kIterations = WarpShape::kContiguous / 8;
+
+  /// Fragment object to be loaded: one (scale, bias) pair of __half2 per iteration
+  using Fragment = cutlass::Array<__half2, 2 * kIterations * kElementsPerAccess>;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Pointers to the first element of the scale and bias vectors
+  ConstPointer scale_pointer_;
+  ConstPointer bias_pointer_;
+
+  /// Size of tensor
+  int problem_size_;
+
+  /// Element index of this thread's first access
+  int32_t thread_offset_;
+
+ public:
+  /// Constructs the iterator from the vector extent, the two base pointers,
+  /// the thread ID and the threadblock offset.
+  CUTLASS_HOST_DEVICE
+  PredicatedScaleBiasVectorIterator(
+      int problem_size,                       ///< Extent of tensor
+      ConstPointer scale_pointer,             ///< Pointer to the start of the scale vector
+      ConstPointer bias_pointer,              ///< Pointer to the start of the bias vector
+      int thread_id,                          ///< ID of each participating thread
+      TensorCoord const &threadblock_offset)  ///< Initial offset of threadblock
+      // Initializers are listed in member declaration order (avoids -Wreorder).
+      : scale_pointer_(scale_pointer),
+        bias_pointer_(bias_pointer),
+        problem_size_(problem_size) {
+
+    // (thread_id % 32) / 4 maps every four consecutive threads to one of 8 offsets.
+    thread_offset_ = threadblock_offset.contiguous() + (thread_id % 32) / 4;
+  }
+
+  /// Construct a PredicatedScaleBiasVectorIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedScaleBiasVectorIterator(
+      int problem_size,            ///< Extent of tensor
+      ConstPointer scale_pointer,  ///< Pointer to start of scale vector
+      ConstPointer bias_pointer,   ///< Pointer to start of bias vector
+      int thread_id)               ///< ID of each participating thread
+      : PredicatedScaleBiasVectorIterator(problem_size,
+                                          scale_pointer, bias_pointer,
+                                          thread_id, make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index. This iterator keeps no such
+  /// index, so the call is a no-op; it exists so that the row-major
+  /// specialization's set_iteration_index() has a member to forward to.
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {}
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole warp tiles
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    thread_offset_ += (WarpShape::kContiguous * tile_offset.contiguous());
+  }
+
+  /// Loads a fragment from memory. Each load is predicated on its element
+  /// index being below problem_size_. NOTE(review): `pointer_offset` is unused.
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    frag.fill(__float2half2_rn(0.0f));
+    __half2 *frag_ptr = reinterpret_cast<__half2 *>(&frag);
+
+    // load scale
+    CUTLASS_PRAGMA_UNROLL
+    for (int c = 0; c < kIterations; ++c) {
+      cutlass::arch::global_load<
+        __half,
+        sizeof(AccessType)
+      >(
+        frag_ptr[c * 2].x,
+        scale_pointer_ + thread_offset_ + c * 8,
+        (thread_offset_ + c * 8) < problem_size_
+      );
+    }
+
+    // load bias
+    CUTLASS_PRAGMA_UNROLL
+    for (int c = 0; c < kIterations; ++c) {
+      cutlass::arch::global_load<
+        __half,
+        sizeof(AccessType)
+      >(
+        frag_ptr[c * 2 + 1].x,
+        bias_pointer_ + thread_offset_ + c * 8,
+        (thread_offset_ + c * 8) < problem_size_
+      );
+    }
+
+    // duplicate scale
+    CUTLASS_PRAGMA_UNROLL
+    for (int c = 0; c < kIterations; ++c) {
+      frag_ptr[c * 2].y = frag_ptr[c * 2].x;
+    }
+
+    // duplicate bias
+    CUTLASS_PRAGMA_UNROLL
+    for (int c = 0; c < kIterations; ++c) {
+      frag_ptr[c * 2 + 1].y = frag_ptr[c * 2 + 1].x;
+    }
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedScaleBiasVectorIterator for row-major data.
+///
+/// Adapter over the pitch-linear specialization: row-major (row, column)
+/// coordinates are passed on as pitch-linear (column, row) and every call is
+/// forwarded to the wrapped iterator.
+/// NOTE(review): the earlier header listed Writeable/Masked tile-iterator
+/// concepts; this class exposes no store or mask members - confirm.
+template <typename WarpShape_,
+          typename Element_>
+class PredicatedScaleBiasVectorIterator<WarpShape_,
+                                        Element_,
+                                        layout::RowMajor> {
+ public:
+
+  using WarpShape = WarpShape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ConstPointer = const Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  /// Pitch-linear iterator doing the actual work; columns map to the
+  /// contiguous dimension and rows to the strided dimension.
+  using UnderlyingIterator = PredicatedScaleBiasVectorIterator<
+      layout::PitchLinearShape<WarpShape::kColumn, WarpShape::kRow>,
+      Element, layout::PitchLinear>;
+
+  // Types and constants re-exported from the wrapped iterator.
+  using AccessType = typename UnderlyingIterator::AccessType;
+  static int const kElementsPerAccess = UnderlyingIterator::kElementsPerAccess;
+  using Fragment = typename UnderlyingIterator::Fragment;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Wrapped pitch-linear iterator that every member forwards to
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Builds the wrapped iterator, handing the threadblock offset over in
+  /// pitch-linear (column, row) order.
+  CUTLASS_HOST_DEVICE
+  PredicatedScaleBiasVectorIterator(
+      int problem_size,                       ///< Extent of tensor
+      ConstPointer scale_pointer,             ///< Pointer to the start of the scale vector
+      ConstPointer bias_pointer,              ///< Pointer to the start of the bias vector
+      int thread_id,                          ///< ID of each participating thread
+      TensorCoord const &threadblock_offset)  ///< Initial offset of threadblock
+      : iterator_(problem_size,
+                  scale_pointer,
+                  bias_pointer,
+                  thread_id,
+                  layout::PitchLinearCoord(threadblock_offset.column(),
+                                           threadblock_offset.row())) {}
+
+  /// Convenience overload: same as the constructor above with a (0, 0) threadblock offset.
+  CUTLASS_HOST_DEVICE
+  PredicatedScaleBiasVectorIterator(
+      int problem_size,            ///< Extent of tensor
+      ConstPointer scale_pointer,  ///< Pointer to the start of the scale vector
+      ConstPointer bias_pointer,   ///< Pointer to the start of the bias vector
+      int thread_id)               ///< ID of each participating thread
+      : PredicatedScaleBiasVectorIterator(
+            problem_size, scale_pointer, bias_pointer, thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index by forwarding to the wrapped iterator
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    iterator_.set_iteration_index(index);
+  }
+
+  /// Advances the iterator by whole tiles; the (row, column) tile offset is
+  /// forwarded in pitch-linear (column, row) order.
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset(
+        layout::PitchLinearCoord(tile_offset.column(), tile_offset.row()));
+  }
+
+  /// Loads a fragment by forwarding both arguments to the wrapped iterator
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment via the wrapped iterator's load()
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    iterator_.load(frag);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace transform 
+}  // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h
new file mode 100755
index 000000000..a99dae952
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h
@@ -0,0 +1,2118 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates calculating the address and predicates to the load of tiles
+    from pitch-linear rank=2 tensors.
+
+    This iterator uses masks to guard out-of-bounds accesses. The first tile this
+    iterator visits may be partial; the remaining tiles are complete. So, we
+    only need to compute the predicates twice: once before the first tile and
+    once for the remaining full tiles, which can share the same predicates.
+
+    A precomputed "Params" object minimizes the amount of state that must be
+    stored in registers, and integer addition is used to advance the pointer
+    through memory.
+*/
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/permute.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/transform/threadblock/predicated_tile_access_iterator_params.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// PredicatedTileAccessIteratorPredicates
+///
+template <typename Shape_, typename Element_, typename Layout_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_>
+class PredicatedTileAccessIteratorPredicates {
+ public:
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = Layout_;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorCoord = typename Layout::TensorCoord;
+
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
+    "Vectors implied by the thread map must be divisible by the access type.");
+
+  static int const kPredicatesPerByte = 4;
+  static int const kPredicatesPerWord = 4 * kPredicatesPerByte;
+
+  static int const kPredicateCount = ThreadMap::Iterations::kCount * kAccessesPerVector;
+
+  /// Number of bytes, and from that the number of 32b words, containing predicates
+  static int const kPredicateByteCount =
+    (kPredicateCount + kPredicatesPerByte - 1) / kPredicatesPerByte;
+  static int const kPredicateWordCount = (kPredicateByteCount + 3) / 4;
+
+  static unsigned const kPredicateMask = (1u << kPredicatesPerByte) - 1u;
+
+  static_assert(kPredicateWordCount <= 4, "Too many predicates.");
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = Array<uint32_t, kPredicateWordCount>;
+
+// private:
+  /// Guard predicates
+  uint32_t predicates_[kPredicateWordCount];
+
+  /// Size of tensor
+  TensorCoord extent_;
+
+  /// Initial offset for each thread
+  TensorCoord thread_offset_;
+
+  /// Offset to the first steady-state tile
+  TensorCoord residue_offset_;
+
+  /// Iteration along vectors implied by the thread map
+  int iteration_vector_;
+
+  /// Iteration in the contiguous dimension
+  int iteration_contiguous_;
+
+  /// Iteration in the strided dimension
+  int iteration_strided_;
+
+ public:
+  /// Recomputes every guard predicate from the internally tracked per-thread offset.
+  CUTLASS_DEVICE
+  void compute_predicates_(
+      /// Extent of the matrix window
+      TensorCoord extent,
+      /// optionally, simplify predicate calculation during 'steady state' phase
+      bool is_steady_state = false) {
+    // Start from an all-false mask; guard bits are OR-ed in below.
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      predicates_[i] = 0u;
+    }
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int access_idx = 0; access_idx < ThreadMap::Iterations::kCount * kAccessesPerVector; ++access_idx) {
+      // Decompose the flat access index into strided (s), contiguous (c) and vector (v) iterations.
+      int s = access_idx / (ThreadMap::Iterations::kContiguous * kAccessesPerVector);
+      
+      int access_residual = access_idx % (ThreadMap::Iterations::kContiguous * kAccessesPerVector);
+
+      int c = access_residual / kAccessesPerVector;
+      int v = access_residual % kAccessesPerVector;
+
+      TensorCoord iteration_coord(c * ThreadMap::Delta::kContiguous + v * AccessType::kElements,
+                                s * ThreadMap::Delta::kStrided);
+
+      TensorCoord coord = thread_offset_ + iteration_coord;
+
+      bool guard;
+      // In steady state only the dimension that is not being advanced along is bounds-checked.
+      if (is_steady_state) {
+        if (kAdvanceRank == 0) {
+          guard = (coord.strided() < extent.strided());
+        } else {
+          guard = (coord.contiguous() < extent.contiguous());
+        }
+      } else {
+        guard = (coord.strided() < extent.strided() &&
+                 coord.contiguous() < extent.contiguous());
+      }
+
+      int pred_idx = v + kAccessesPerVector * (c + ThreadMap::Iterations::kContiguous * s);
+
+      int word_idx = pred_idx / kPredicatesPerWord;
+      int residual = pred_idx % kPredicatesPerWord;
+      int byte_idx = residual / kPredicatesPerByte;
+      int bit_idx = residual % kPredicatesPerByte;
+      // Each byte of a predicate word carries kPredicatesPerByte guard bits in its low bits.
+      predicates_[word_idx] |= (unsigned(guard) << (byte_idx * 8 + bit_idx));
+
+    }
+
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_predicates(int thread_id, TensorCoord const &threadblock_offset) {
+    // Derive the extent of the first (residue) tile and the offset to the first steady-state tile.
+    TensorCoord residue_extent;
+    if (kAdvanceRank) {
+      // A remainder of zero means the first tile is a full tile along the strided dimension.
+      typename TensorCoord::Index residue_size = (extent_[kAdvanceRank] - threadblock_offset.strided()) % Shape::kStrided;
+      if (!residue_size) {
+        residue_size = Shape::kStrided;
+      }
+
+      residue_offset_ = make_Coord(0, residue_size);
+      residue_extent = make_Coord(
+        extent_.contiguous(), 
+        min(threadblock_offset.strided() + residue_size, extent_.strided())
+      );
+    } else {
+      // A remainder of zero means the first tile is a full tile along the contiguous dimension.
+      typename TensorCoord::Index residue_size = (extent_[kAdvanceRank] - threadblock_offset.contiguous()) % Shape::kContiguous;
+      if (!residue_size) {
+        residue_size = Shape::kContiguous;
+      }
+
+      residue_offset_ = make_Coord(residue_size, 0);
+      
+      residue_extent = make_Coord(
+        min(extent_.contiguous(), threadblock_offset.contiguous() + residue_size),
+        extent_.strided()
+      );
+    }
+
+    // Per-thread offset in logical coordinates of tensor
+    thread_offset_ = threadblock_offset + ThreadMap::initial_offset(thread_id);
+    // Guard the first tile against the residue extent in both dimensions.
+    compute_predicates_(residue_extent, false);
+
+    set_iteration_index(0);
+  }
+
+  /// Default constructor
+  PredicatedTileAccessIteratorPredicates() = default;
+
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorPredicates(
+      /// Extent of tensor
+      TensorCoord extent)
+      : extent_(extent) {
+	}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+
+    iteration_vector_ = index % kAccessesPerVector;
+    int residual_access = index / kAccessesPerVector;
+
+    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
+
+  }
+
+  /// Pre-increment hook; returns *this without changing any state.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorPredicates &operator++() {
+    // The iteration_* members are advanced by the owning iterator (e.g. PredicatedTileAccessIterator::operator++).
+    return *this;
+  }
+
+  /// Clears every predicate word when `enable` is true; otherwise leaves the mask unchanged
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      predicates_[i] = enable ? 0u : predicates_[i];
+    }
+
+  }
+
+  /// Sets every bit of every predicate word, so valid() reports true for all accesses
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      predicates_[i] = 0xffffffff;
+    }
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { 
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      predicates_[i] = mask[i];
+    }
+
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) {
+     CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      mask[i] = predicates_[i];
+    }
+  }
+
+  /// Returns whether the access selected by the current iteration indices is guarded as valid
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+    // Flatten the (strided, contiguous, vector) iteration indices into a predicate index.
+    
+    int pred_idx = 
+      iteration_vector_ + kAccessesPerVector * (iteration_contiguous_ + iteration_strided_ * ThreadMap::Iterations::kContiguous);
+
+    int word_idx = pred_idx / kPredicatesPerWord;
+    int residual = pred_idx % kPredicatesPerWord;
+    int byte_idx = residual / kPredicatesPerByte;
+    int bit_idx = residual % kPredicatesPerByte;
+    // Same packing as compute_predicates_: kPredicatesPerByte guard bits per byte.
+    bool pred = (predicates_[word_idx] & (1u << (byte_idx * 8 + bit_idx))) != 0;
+    return pred;
+    
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// PredicatedTileAccessIterator
+///
+template <typename Shape, typename Element, typename Layout, int AdvanceRank,
+          typename ThreadMap, typename AccessType, bool Gather = false,
+          typename PermuteLayout = layout::NoPermute>
+class PredicatedTileAccessIterator;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIterator for pitch-linear data.
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_, bool Gather,
+          typename PermuteLayout>
+class PredicatedTileAccessIterator<Shape_, Element_, layout::PitchLinear,
+                                   AdvanceRank, ThreadMap_, AccessType_, Gather,
+                                   PermuteLayout> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  using UnderlyingPredicates = PredicatedTileAccessIteratorPredicates<
+      Shape, Element, Layout, AdvanceRank, ThreadMap, AccessType>;
+
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+  
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), 
+    "Vectors implied by the thread map must be divisible by the access type.");
+
+  static bool constexpr Permute = !platform::is_same<PermuteLayout, layout::NoPermute>::value
+                               && !platform::is_same<PermuteLayout, layout::InversePermute<layout::NoPermute>>::value;
+
+  using Mask = typename UnderlyingPredicates::Mask;
+
+  /// Uses a non-template class
+  struct Params : PredicatedTileAccessIteratorParams {
+    
+    using Base = PredicatedTileAccessIteratorParams;
+
+    /// Default constructor
+    Params() = default;
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout) : 
+      Base(layout.stride(0),
+            MakePredicatedTileAccessIteratorDesc<Shape, Element, Layout, kAdvanceRank, ThreadMap>()()
+        ) { }
+
+    CUTLASS_HOST_DEVICE
+    Params(Base const &base) : 
+      Base(base) { }
+  };
+
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char *;
+
+ private:
+  //
+  // Data members
+  //
+
+  UnderlyingPredicates the_predicates;
+
+  /// Parameters object with precomputed internal state
+  Params params_;
+
+  /// Internal pointer to first access of tile
+  BytePointer pointer_;
+
+  /// Used for out-of-order visitation
+  bool is_residue_tile_;
+
+  /// Below is used when Gather is turned on.  We need to record strided_offset
+  /// and contiguous_offset separated to compute the offset by using
+  ///
+  /// offset = contiguous_offset + indices[strided_offset]
+
+  /// Gather indices
+  int const *indices_;
+
+  /// Function to perform layout permutation and offset computation
+  PermuteLayout permute_layout_;
+
+  /// Tracks the thread's coordinate offset in the matrix for the current tile.
+  /// This is only used in the following cases:
+  /// - when Gather is true, the strided coordinate is needed to access indices (contiguous offset is tracked via pointer_)
+  /// - when Permute is true, both coordinates are needed as input to the permutation function (pointer_ is fixed)
+  TensorCoord coord_offset_;
+
+ private:
+  /// Computes predicates based on internally tracked per-thread offset.
+  CUTLASS_DEVICE
+  void compute_predicates_(
+      /// Extent of the matrix window
+      TensorCoord extent,
+      /// optionally, simplify predicate calculation during 'steady state' phase
+      bool is_steady_state = false) {
+	  the_predicates.compute_predicates_(extent, is_steady_state);
+  }
+
+ public:
+
+  /// Default constructor
+  PredicatedTileAccessIterator() = default;
+
+  /// Constructs the iterator from its precomputed state, threadblock offset,
+  /// and thread ID; computes the first-tile predicates and positions pointer_
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset,
+      /// Gather indices (asserted non-null when Gather is true)
+      int const *indices = nullptr)
+      : the_predicates(extent),
+        params_(params),
+        pointer_(reinterpret_cast<BytePointer>(
+                 const_cast<NonConstPointer>(pointer))),
+        is_residue_tile_(true),
+        indices_(indices),
+        permute_layout_(TensorCoord(extent.contiguous(), extent.strided()), params.stride_) {
+    // Initializers above are listed in member declaration order, which is the order C++ runs them in.
+    the_predicates.set_predicates(thread_id, threadblock_offset);
+    // Gather mode dereferences indices_ in get(), so an index table must be supplied.
+    if (Gather) {
+      assert(indices_);
+    }
+
+    // Move pointer_ to this thread's first access (Permute keeps pointer_ fixed and tracks coordinates instead)
+    Layout layout(params_.stride_);
+
+    if (!Gather && !Permute) {
+      add_pointer_offset(layout(the_predicates.thread_offset_));
+    } else {
+      coord_offset_ = the_predicates.thread_offset_;
+      if (!Permute) {
+        add_pointer_offset(layout(make_Coord(coord_offset_.contiguous(), 0)));
+      }
+    }
+  }
+
+  /// Construct a PredicatedTileAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id)
+      : PredicatedTileAccessIterator(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    the_predicates.set_iteration_index(index);
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += sizeof_bits<Element>::value * pointer_offset / 8;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_DEVICE
+  void add_tile_offset(
+      TensorCoord const &tile_offset) {
+    if (is_residue_tile_) {
+      // First call: step past the residue tile and recompute predicates for steady state.
+      the_predicates.thread_offset_ += the_predicates.residue_offset_;
+
+      the_predicates.compute_predicates_(the_predicates.extent_, true);
+
+      Layout layout(params_.stride_);
+      // pointer_ is a byte pointer: element offsets are scaled by sizeof_bits<Element>::value / 8.
+      if (!Gather && !Permute) {
+        add_pointer_offset(layout(the_predicates.residue_offset_));
+        // The residue step already counts as one tile along the advance rank, hence the "- 1".
+        if (kAdvanceRank) {
+          pointer_ += params_.inc_advance_ * LongIndex(tile_offset.strided() - 1);
+          pointer_ += Shape::kContiguous * tile_offset.contiguous() * sizeof_bits<Element>::value / 8;
+        } else {
+          pointer_ += params_.inc_advance_ * LongIndex(tile_offset.contiguous() - 1);
+          pointer_ += Shape::kStrided * tile_offset.strided() * sizeof_bits<Element>::value / 8;
+        }
+      } else {
+        coord_offset_.strided() = the_predicates.thread_offset_.strided() + Shape::kStrided * (tile_offset.strided() - kAdvanceRank);
+        if (!Permute) {
+          add_pointer_offset(layout(make_Coord(the_predicates.residue_offset_.contiguous(), 0)));
+          add_pointer_offset(Shape::kContiguous * (tile_offset.contiguous() - (1 - kAdvanceRank)));
+        } else {
+          coord_offset_.contiguous() = the_predicates.thread_offset_.contiguous() + Shape::kContiguous * (tile_offset.contiguous() - (1 - kAdvanceRank));
+        }
+      }
+    } else {
+      if (!Gather && !Permute) {
+        if (kAdvanceRank) {
+          pointer_ += params_.inc_advance_ * LongIndex(tile_offset.strided());
+          pointer_ += Shape::kContiguous * tile_offset.contiguous() * sizeof_bits<Element>::value / 8;
+        } else {
+          pointer_ += params_.inc_advance_ * LongIndex(tile_offset.contiguous());
+          pointer_ += Shape::kStrided * tile_offset.strided() * sizeof_bits<Element>::value / 8;
+        }
+      } else {
+        coord_offset_.strided() += Shape::kStrided * tile_offset.strided();
+        if (!Permute) {
+          add_pointer_offset(Shape::kContiguous * tile_offset.contiguous());
+        } else {
+          coord_offset_.contiguous() += Shape::kContiguous * tile_offset.contiguous();
+        }
+      }
+    }
+
+    is_residue_tile_ = false;
+  }
+
+  /// Returns a pointer to the current access; nullptr when Gather/Permute is on and the access is invalid
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    // Gather/Permute path: rebuild the address from coordinates; masked-off accesses yield nullptr.
+    if (Gather || Permute)
+    {
+      if (!valid()) {
+        return nullptr;
+      }
+
+      Index coord_contig  = (Permute ? coord_offset_.contiguous() : 0) + the_predicates.iteration_contiguous_ * ThreadMap::Delta::kContiguous + the_predicates.iteration_vector_ * AccessType::kElements;
+      Index coord_strided = coord_offset_.strided() + the_predicates.iteration_strided_ * ThreadMap::Delta::kStrided;
+      if (Gather) {
+        coord_strided = indices_[coord_strided];
+      }
+
+      LongIndex offset = Permute ? permute_layout_(TensorCoord(coord_contig, coord_strided)) : (coord_strided * LongIndex(params_.stride_) + coord_contig);
+      return reinterpret_cast<AccessType *>(pointer_ + OffsetBytes<Element>(offset));
+    }
+    // Plain path: operator++ moves pointer_ along the strided dimension; add the contiguous and vector steps here.
+    return reinterpret_cast<AccessType *>(
+        pointer_ + 
+        the_predicates.iteration_contiguous_ * (ThreadMap::Delta::kContiguous * sizeof_bits<Element>::value) / 8) + the_predicates.iteration_vector_;
+  }
+
+  /// Pre-increment: advances to the next access (vector, then contiguous, then strided) and returns *this.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator &operator++() {
+
+    the_predicates.operator++();
+
+    ++the_predicates.iteration_vector_;
+    if (the_predicates.iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+
+    the_predicates.iteration_vector_ = 0;
+    ++the_predicates.iteration_contiguous_;
+
+    if (the_predicates.iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+
+    // Enter here only if (iteration_contiguous_ == ThreadMap::Iterations::kContiguous)
+    the_predicates.iteration_contiguous_ = 0;
+    ++the_predicates.iteration_strided_;
+
+    if (the_predicates.iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      if (!Gather && !Permute) {
+        pointer_ += params_.inc_strided_;
+      }
+
+      return *this;
+    }
+
+    // Enter here only if (iteration_strided_ == ThreadMap::Iterations::kStrided)
+    // which means we enter the next tile.
+    the_predicates.iteration_strided_ = 0;
+
+    if (!Gather && !Permute) {
+      // advance to next tile
+      pointer_ += params_.inc_next_;
+  
+      // now return to start tile - if the iterator is subsequently advanced, this
+      // subtraction as well as the subsequent integer addition are both elided by
+      // the compiler.
+      pointer_ -= params_.inc_advance_;
+    }
+
+    return *this;
+  }
+
+  /// Post-increment: advances the iterator and returns a copy of its state before the increment.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator operator++(int) {
+    PredicatedTileAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Forwards to the predicate object's clear_mask(enable)
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    the_predicates.clear_mask(enable);
+  }
+
+  /// Enables every predicate by forwarding to the predicate object's enable_mask()
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    the_predicates.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { 
+    the_predicates.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) {
+    the_predicates.get_mask(mask);
+  }
+
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+    return the_predicates.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIterator for column-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_, bool Gather,
+          typename PermuteLayout>
+class PredicatedTileAccessIterator<Shape_, Element_, layout::ColumnMajor,
+                                   AdvanceRank, ThreadMap_, AccessType_, Gather,
+                                   PermuteLayout> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  using UnderlyingIterator = PredicatedTileAccessIterator<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
+      layout::PitchLinear, (kAdvanceRank == 0 ? 0 : 1), ThreadMap, AccessType,
+      Gather, PermuteLayout>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileAccessIterator;
+
+    /// Parameters of the underlying pitch-linear iterator
+    typename UnderlyingIterator::Params params_;
+
+   public:
+
+    /// Default constructor
+    Params() = default;
+
+    /// Construct the Params object given a column-major tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::PitchLinear(layout.stride(0))){};
+
+    /// Construct the Params object from the underlying iterator's base parameters
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const &base) 
+        : params_(base) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+
+  /// Default constructor
+  PredicatedTileAccessIterator() = default;
+
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset,
+      int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
+      )
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.row(), extent.column()),
+                  thread_id,
+                  layout::PitchLinearCoord(threadblock_offset.row(),
+                                           threadblock_offset.column()),
+                  indices) {}
+
+  /// Construct a PredicatedTileAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIterator(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  /// Pre-increment: advances to the next access and returns *this.
+  ///
+  /// Forwards to the underlying pitch-linear iterator's operator++, which
+  /// steps through the vector, contiguous and strided iteration indices.
+  /// No predicates are recomputed here; the underlying iterator does that
+  /// in add_tile_offset().
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment: advances to the next access and returns a copy of the
+  /// iterator as it was before the increment.
+  ///
+  /// The copy is taken first, then the pre-increment operator is applied to
+  /// *this; all stepping is done by the underlying pitch-linear iterator
+  /// that the pre-increment operator forwards to.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator operator++(int) {
+    PredicatedTileAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+
+  /// Enables every predicate by forwarding to the underlying iterator's enable_mask()
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+
+  /// Returns whether the current access is valid; const so it can be queried on a const iterator
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+    return iterator_.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIterator for row-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+/// Wraps a pitch-linear iterator; (row, column) coordinates reach it as (column, row).
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_, bool Gather,
+          typename PermuteLayout>
+class PredicatedTileAccessIterator<Shape_, Element_, layout::RowMajor,
+                                   AdvanceRank, ThreadMap_, AccessType_, Gather,
+                                   PermuteLayout> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  /// Pitch-linear iterator doing the actual work; tile shape and advance rank are transposed
+  using UnderlyingIterator = PredicatedTileAccessIterator<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
+      layout::PitchLinear, (kAdvanceRank == 0 ? 1 : 0), ThreadMap, AccessType, 
+      Gather, PermuteLayout>;
+
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileAccessIterator;
+
+    /// Parameters object of the wrapped pitch-linear iterator
+    typename UnderlyingIterator::Params params_;
+
+   public:
+
+    /// Default constructor
+    Params() = default;
+
+    /// Construct from a row-major layout; its stride(0) becomes the pitch-linear stride
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::PitchLinear(layout.stride(0))){};
+
+    /// Construct the Params object from the underlying iterator's precomputed Params::Base
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const &base) 
+        : params_(base) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+
+  /// Default constructor
+  PredicatedTileAccessIterator() = default;
+
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset,
+      /// Gather indices, forwarded unchanged to the underlying iterator
+      int const *indices = nullptr)
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.column(), extent.row()),
+                  thread_id,
+                  layout::PitchLinearCoord(threadblock_offset.column(),
+                                           threadblock_offset.row()),
+                  indices) {}
+
+  /// Construct a PredicatedTileAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIterator(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles; the (row, column) tile offset is handed on as (column, row)
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
+  }
+
+  /// Returns the underlying iterator's current pointer, cast to AccessType *
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  /// Pre-increment: advances the wrapped iterator by one step.
+  ///
+  /// This adapter keeps no iteration state of its own; the call is forwarded
+  /// unchanged to the underlying iterator's operator++, which performs the
+  /// actual advance. Returns a reference to *this so that calls can be
+  /// chained.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment counterpart of operator++().
+  ///
+  /// Takes a copy of the iterator, advances *this with the pre-increment
+  /// operator, and returns the copy. The returned iterator therefore still
+  /// holds the state from before the call; prefer the pre-increment form
+  /// when the previous state is not needed.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator operator++(int) {
+    PredicatedTileAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+
+  /// Enables the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIterator for affine rank 2 data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+/// Keeps its own byte pointer and delegates masking to PredicatedTileAccessIteratorPredicates.
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_>
+class PredicatedTileAccessIterator<Shape_, Element_, layout::AffineRankN<2>,
+                                   AdvanceRank, ThreadMap_, AccessType_, false,
+                                   layout::NoPermute> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::AffineRankN<2>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  using UnderlyingPredicates = PredicatedTileAccessIteratorPredicates<
+      Shape, Element, layout::PitchLinear, AdvanceRank, ThreadMap, AccessType>;
+
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
+    "Vectors implied by the thread map must be divisible by the access type.");
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingPredicates::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   public:
+    friend PredicatedTileAccessIterator;
+
+   private:
+    /// strides of the affine rank-2 layout (units of Element)
+    Coord<Layout::kStrideRank, Layout::LongIndex> stride_;
+    /// amount (in bytes) to increment pointer to move to next access along
+    /// contiguous dimension
+    LongIndex inc_contiguous_;
+    /// amount (in bytes) to increment pointer from first access of current
+    /// contiguous dimension to first access of next one.
+    LongIndex inc_strided_;
+    /// amount (in bytes) to increment pointer from last access of current
+    /// contiguous dimension to first access of next one.
+    LongIndex inc_next_strided_;
+    /// amount (in bytes) to increment pointer from last access to first access
+    /// of next tile
+    LongIndex inc_next_;
+    /// amount (in bytes) to increment pointer from first access of current tile
+    /// to first access of next tile
+    LongIndex inc_advance_;
+
+   public:
+
+    /// Default constructor: zero-initializes the strides and every increment (incl. inc_next_strided_)
+    CUTLASS_HOST_DEVICE
+    Params(): stride_(0), inc_contiguous_(0), inc_strided_(0), inc_next_strided_(0), inc_next_(0), inc_advance_(0) { }
+
+    /// Construct the Params object given an AffineRankN<2> tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout) : stride_({layout.stride(0), layout.stride(1)}) {
+      inc_contiguous_ = (LongIndex(stride_[0]) * ThreadMap::Delta::kContiguous) *
+                     sizeof_bits<Element>::value / 8;
+
+      inc_strided_ = (LongIndex(stride_[1]) * ThreadMap::Delta::kStrided) *
+                     sizeof_bits<Element>::value / 8;
+
+      inc_next_strided_ = inc_strided_ - LongIndex(ThreadMap::Iterations::kContiguous - 1) * inc_contiguous_;
+
+      if (kAdvanceRank) {
+        // advance along strided dimension
+        inc_advance_ =
+            Shape::kStrided * LongIndex(stride_[1]) * sizeof_bits<Element>::value / 8;
+      } else {
+        // advance along contiguous dimension
+        inc_advance_ = Shape::kContiguous * stride_[0] * sizeof_bits<Element>::value / 8;
+      }
+
+      inc_next_ = inc_advance_ - LongIndex(ThreadMap::Iterations::kContiguous - 1) * inc_contiguous_ - LongIndex(ThreadMap::Iterations::kStrided - 1) * inc_strided_;
+    }
+  };
+
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char *;
+
+  //
+  // Data members
+  //
+
+  /// Parameters object with precomputed internal state
+  Params params_;
+
+  /// Internal pointer to first access of tile
+  BytePointer pointer_;
+  /// Predicate state: guard mask, per-thread offset and iteration indices
+  UnderlyingPredicates the_predicates;
+
+  /// True until the first add_tile_offset() call, which applies the residue offset
+  bool is_residue_tile_;
+
+ private:
+  /// Computes predicates based on internally tracked per-thread offset.
+  CUTLASS_DEVICE
+  void compute_predicates_(
+      /// Extent of the matrix window
+      TensorCoord extent,
+      /// optionally, simplify predicate calculation during 'steady state' phase
+      bool is_steady_state = false) {
+          the_predicates.compute_predicates_(extent, is_steady_state);
+  }
+
+ public:
+
+  /// Default constructor
+  PredicatedTileAccessIterator() = default;
+
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset,
+      int const *indices = nullptr     ///< gather/scatter indices; accepted but unused (no gather/scatter support here)
+      )
+      : params_(params),
+        pointer_(reinterpret_cast<BytePointer>(
+            const_cast<NonConstPointer>(pointer))),
+        the_predicates(extent),
+        is_residue_tile_(true) {
+
+    the_predicates.set_predicates(thread_id, threadblock_offset);
+
+    // move the pointer to this thread's first access, using the layout's strides
+    Layout layout(params_.stride_);
+    add_pointer_offset(layout(the_predicates.thread_offset_));
+  }
+
+  /// Construct a PredicatedTileAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIterator(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { the_predicates.set_iteration_index(index); }
+
+  /// Adds a pointer offset in units of Element (converted to bytes internally)
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += sizeof_bits<Element>::value * pointer_offset / 8;
+  }
+
+  /// Advances by whole tiles. NOTE(review): the non-advance component is added to the byte
+  /// pointer unscaled by stride or element size - confirm callers only pass 0 there.
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    if (is_residue_tile_) {
+
+      the_predicates.thread_offset_ += the_predicates.residue_offset_;
+
+      Layout layout(params_.stride_);
+      add_pointer_offset(layout(the_predicates.residue_offset_));
+
+      the_predicates.compute_predicates_(the_predicates.extent_, true);
+
+      if (kAdvanceRank) {
+        pointer_ += params_.inc_advance_ * LongIndex(tile_offset[1] - 1);
+        pointer_ += Shape::kContiguous * tile_offset[0];
+      } else {
+        pointer_ += params_.inc_advance_ * LongIndex(tile_offset[0] - 1);
+        pointer_ += Shape::kStrided * tile_offset[1];
+      }
+    } else {
+      if (kAdvanceRank) {
+        pointer_ += params_.inc_advance_ * LongIndex(tile_offset[1]);
+        pointer_ += Shape::kContiguous * tile_offset[0];
+      } else {
+        pointer_ += params_.inc_advance_ * LongIndex(tile_offset[0]);
+        pointer_ += Shape::kStrided * tile_offset[1];
+      }
+    }
+    is_residue_tile_ = false;
+  }
+
+  /// Returns a pointer to the current access (byte pointer plus vector index)
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(pointer_) + the_predicates.iteration_vector_;
+  }
+
+  /// Advances to the next access within the current tile.
+  ///
+  /// Steps the vector index first, then the contiguous index, then the strided
+  /// index, bumping the byte pointer by the matching precomputed increment. Once
+  /// every access has been visited the indices wrap to zero and the pointer is
+  /// returned to the first access of the same tile.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator &operator++() {
+    the_predicates.operator++();
+    ++the_predicates.iteration_vector_;
+    if (the_predicates.iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+
+    the_predicates.iteration_vector_ = 0;
+    ++the_predicates.iteration_contiguous_;
+
+    if (the_predicates.iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      pointer_ += params_.inc_contiguous_;
+      return *this;
+    }
+
+    // Enter here only if (iteration_contiguous_ ==
+    // ThreadMap::Iterations::kContiguous)
+    the_predicates.iteration_contiguous_ = 0;
+    ++the_predicates.iteration_strided_;
+
+    if (the_predicates.iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      pointer_ += params_.inc_next_strided_;
+      return *this;
+    }
+
+    // Enter here only if (iteration_strided_ == ThreadMap::Iterations::kStrided)
+    // which means every access of the current tile has been visited.
+    the_predicates.iteration_strided_ = 0;
+
+    // advance to next tile
+    pointer_ += params_.inc_next_;
+
+    // now return to start tile - if the iterator is subsequently advanced, this
+    // subtraction as well as the subsequent integer addition are both elided by
+    // the compiler.
+    pointer_ -= params_.inc_advance_;
+
+    return *this;
+  }
+
+  /// Post-increment counterpart of operator++().
+  ///
+  /// Takes a copy of the iterator, advances *this with the pre-increment
+  /// operator, and returns the copy. The returned iterator therefore still
+  /// refers to the access that was current before the call; prefer the
+  /// pre-increment form when the previous state is not needed.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator operator++(int) {
+    PredicatedTileAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { the_predicates.clear_mask(enable); }
+
+  /// Enables the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { the_predicates.enable_mask(); }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { the_predicates.set_mask(mask); }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { the_predicates.get_mask(mask); }
+
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return the_predicates.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIterator for affine rank 2 column-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+/// Wraps an AffineRankN<2> iterator; (row, column) coordinates are forwarded in that order.
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_>
+class PredicatedTileAccessIterator<Shape_, Element_, layout::AffineRank2ColumnMajor,
+                                   AdvanceRank, ThreadMap_, AccessType_, false,
+                                   layout::NoPermute> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::AffineRank2ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  // Map to the underlying AffineRankN<2> iterator; shape and advance rank keep their order
+  using UnderlyingIterator = PredicatedTileAccessIterator<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
+      layout::AffineRankN<2>, (kAdvanceRank == 0 ? 0 : 1), ThreadMap, AccessType>;
+
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileAccessIterator;
+
+    /// Parameters object of the wrapped AffineRankN<2> iterator
+    typename UnderlyingIterator::Params params_;
+
+   public:
+
+    /// Default constructor
+    Params() = default;
+
+    /// Builds the AffineRankN<2> params from the layout's strides in (stride(0), stride(1)) order
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::AffineRankN<2>(layout.stride(0), layout.stride(1))){};
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying AffineRankN<2> tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+
+  /// Default constructor
+  PredicatedTileAccessIterator() = default;
+
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset,
+      int const *indices = nullptr     ///< gather/scatter indices; accepted but not forwarded (no gather/scatter support here)
+      )
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.row(), extent.column()),
+                  thread_id,
+                  layout::PitchLinearCoord(threadblock_offset.row(),
+                                           threadblock_offset.column())) {}
+
+  /// Construct a PredicatedTileAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIterator(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles; the (row, column) tile offset is forwarded in the same order
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset(make_Coord(tile_offset.row(), tile_offset.column()));
+  }
+
+  /// Returns the underlying iterator's current pointer, cast to AccessType *
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  /// Pre-increment: advances the wrapped iterator by one step.
+  ///
+  /// This adapter keeps no iteration state of its own; the call is forwarded
+  /// unchanged to the underlying iterator's operator++, which performs the
+  /// actual advance. Returns a reference to *this so that calls can be
+  /// chained.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment counterpart of operator++().
+  ///
+  /// Takes a copy of the iterator, advances *this with the pre-increment
+  /// operator, and returns the copy. The returned iterator therefore still
+  /// holds the state from before the call; prefer the pre-increment form
+  /// when the previous state is not needed.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator operator++(int) {
+    PredicatedTileAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+
+  /// Enables the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIterator for affine rank-2 row-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+/// Wraps an AffineRankN<2> iterator; (row, column) coordinates reach it as (column, row).
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_>
+class PredicatedTileAccessIterator<Shape_, Element_, layout::AffineRank2RowMajor,
+                                   AdvanceRank, ThreadMap_, AccessType_, false,
+                                   layout::NoPermute> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::AffineRank2RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  // Map to the underlying AffineRankN<2> iterator; shape and advance rank are transposed
+  using UnderlyingIterator = PredicatedTileAccessIterator<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
+      layout::AffineRankN<2>, (kAdvanceRank == 0 ? 1 : 0), ThreadMap, AccessType>;
+
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileAccessIterator;
+
+    /// Parameters object of the wrapped AffineRankN<2> iterator
+    typename UnderlyingIterator::Params params_;
+
+   public:
+
+    /// Default constructor
+    Params() = default;
+
+    /// Builds the AffineRankN<2> params from the layout's strides, swapped to (stride(1), stride(0))
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::AffineRankN<2>(layout.stride(1), layout.stride(0))){};
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying AffineRankN<2> tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+
+  /// Default constructor
+  PredicatedTileAccessIterator() = default;
+
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset,
+      int const *indices = nullptr     ///< gather/scatter indices; accepted but not forwarded (no gather/scatter support here)
+      )
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.column(), extent.row()),
+                  thread_id,
+                  layout::PitchLinearCoord(threadblock_offset.column(),
+                                           threadblock_offset.row())) {}
+
+  /// Construct a PredicatedTileAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIterator(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles; the (row, column) tile offset is handed on as (column, row)
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset(make_Coord(tile_offset.column(), tile_offset.row()));
+  }
+
+  /// Returns the underlying iterator's current pointer, cast to AccessType *
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  /// Pre-increment: advances the wrapped iterator by one step.
+  ///
+  /// This adapter keeps no iteration state of its own; the call is forwarded
+  /// unchanged to the underlying iterator's operator++, which performs the
+  /// actual advance. Returns a reference to *this so that calls can be
+  /// chained.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment counterpart of operator++().
+  ///
+  /// Takes a copy of the iterator, advances *this with the pre-increment
+  /// operator, and returns the copy. The returned iterator therefore still
+  /// holds the state from before the call; prefer the pre-increment form
+  /// when the previous state is not needed.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator operator++(int) {
+    PredicatedTileAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+
+  /// Enables the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIterator for column-major interleaved data.  
+/// It is mapped to the congruous layout.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_, int InterleavedK>
+class PredicatedTileAccessIterator<Shape_, Element_,
+                                   layout::ColumnMajorInterleaved<InterleavedK>,
+                                   AdvanceRank, ThreadMap_, AccessType_, false,
+                                   layout::NoPermute> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  static int const kInterleavedK = InterleavedK;
+  using Layout = layout::ColumnMajorInterleaved<kInterleavedK>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  using UnderlyingIterator = PredicatedTileAccessIterator<
+      layout::PitchLinearShape<Shape::kRow * kInterleavedK,
+                               Shape::kColumn / kInterleavedK>,
+      Element, layout::PitchLinear, (kAdvanceRank == 0 ? 0 : 1), ThreadMap,
+      AccessType>;
+
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileAccessIterator;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+
+    /// Default constructor
+    Params() = default;
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::PitchLinear(layout.stride(0))) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const &base) 
+        : params_(base) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+
+  /// Default constructor
+  PredicatedTileAccessIterator() = default;
+
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset,
+      int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
+      )
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.row() * kInterleavedK,
+                                           extent.column() / kInterleavedK),
+                  thread_id,
+                  layout::PitchLinearCoord(
+                      threadblock_offset.row() * kInterleavedK,
+                      threadblock_offset.column() / kInterleavedK)) {}
+
+  /// Construct a PredicatedTileAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIterator(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator operator++(int) {
+    PredicatedTileAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+
+  /// Forwards to the underlying iterator's enable_mask()
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() { return iterator_.valid(); }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIterator for row-major interleaved data.
+/// It is mapped to the congruous layout.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_, int InterleavedK>
+class PredicatedTileAccessIterator<Shape_, Element_,
+                                   layout::RowMajorInterleaved<InterleavedK>,
+                                   AdvanceRank, ThreadMap_, AccessType_, false,
+                                   layout::NoPermute> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may only advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  static int const kInterleavedK = InterleavedK;
+  using Layout = layout::RowMajorInterleaved<kInterleavedK>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  /// Pitch-linear iterator every call forwards to (contiguous = columns * K, strided = rows / K)
+  using UnderlyingIterator = PredicatedTileAccessIterator<
+      layout::PitchLinearShape<Shape::kColumn * kInterleavedK,
+                               Shape::kRow / kInterleavedK>,
+      Element, layout::PitchLinear, (kAdvanceRank == 0 ? 1 : 0), ThreadMap,
+      AccessType>;
+
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileAccessIterator;
+
+    /// Parameters of the underlying pitch-linear iterator
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    /// Default constructor
+    Params() = default;
+
+    /// Construct the Params object from stride(0) of a row-major interleaved layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::PitchLinear(layout.stride(0))) {}
+
+    /// Construct the Params object from the underlying iterator's Params::Base
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const &base)
+        : params_(base) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+
+  /// Default constructor
+  PredicatedTileAccessIterator() = default;
+
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID; extent and offset are remapped to pitch-linear coordinates
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset,
+      int const *indices = nullptr     ///< gather/scatter indices; ignored, this specialization has no gather/scatter support
+      )
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.column() * kInterleavedK,
+                                           extent.row() / kInterleavedK),
+                  thread_id,
+                  layout::PitchLinearCoord(
+                      threadblock_offset.column() * kInterleavedK,
+                      threadblock_offset.row() / kInterleavedK)) {}
+
+  /// Construct a PredicatedTileAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIterator(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles; (row, column) is passed on as pitch-linear (column, row)
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  /// Pre-increment.
+  ///
+  /// Forwards to the underlying pitch-linear iterator's operator++ and
+  /// returns a reference to this iterator. How far one increment moves is
+  /// defined entirely by that underlying specialization; this wrapper adds
+  /// no state of its own.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment.
+  ///
+  /// Copies the current iterator, advances this one through the
+  /// pre-increment operator, and returns the copy holding the previous
+  /// state. Prefer the pre-increment form when the old value is not needed,
+  /// since this form copies the whole iterator.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator operator++(int) {
+    PredicatedTileAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Forwards to the underlying iterator's clear_mask(enable)
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+
+  /// Forwards to the underlying iterator's enable_mask()
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() { return iterator_.valid(); }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace transform
+}  // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator_2dthreadtile.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator_2dthreadtile.h
new file mode 100755
index 000000000..4379bb0a6
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator_2dthreadtile.h
@@ -0,0 +1,834 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates calculating the address and predicates to the load of tiles
+   from pitch-linear rank=2 tensors.
+
+    This iterator uses masks to guard out-of-bounds accesses and visits the last
+   "residue" tile first, with the objective of minimizing predicate mask updates
+   during steady-state operation.
+
+    A precomputed "Params" object minimizes the amount of state that must be
+   stored in registers, and integer addition is used to advance the pointer
+   through memory.
+*/
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/transform/threadblock/predicated_tile_access_iterator_params.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// PredicatedTileAccessIterator2dThreadTile: primary template, declared only.
+/// Layout-specific partial specializations follow in this file.
+template <typename Shape, typename Element, typename Layout, int AdvanceRank,
+          typename ThreadMap, typename AccessType>
+class PredicatedTileAccessIterator2dThreadTile;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIterator2dThreadTile for pitch-linear data.
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_>
+class PredicatedTileAccessIterator2dThreadTile<Shape_, Element_, layout::PitchLinear,
+                                   AdvanceRank, ThreadMap_, AccessType_> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may only advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using StrideIndex = typename Layout::Stride::Index;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  static int const kPredicatesPerByte = 4;
+  static int const kPredicatesPerWord = 4 * kPredicatesPerByte;
+
+  /// Number of bytes, then 32b words, containing predicates (4 predicate bits are used per byte)
+  static int const kPredicateByteCount = (ThreadMap::Iterations::kCount * ThreadMap::ThreadAccessShape::kStrided + kPredicatesPerByte - 1) / kPredicatesPerByte;
+  static int const kPredicateWordCount = (kPredicateByteCount + 3) / 4;
+
+  static unsigned const kPredicateMask = (1u << kPredicatesPerByte) - 1u;
+
+  static_assert(kPredicateWordCount <= 4, "Too many predicates.");
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = Array<uint32_t, kPredicateWordCount>;
+
+  /// Precomputed parameters; derives from the non-template PredicatedTileAccessIteratorParams
+  struct Params : PredicatedTileAccessIteratorParams {
+   public:
+    friend PredicatedTileAccessIterator2dThreadTile;
+
+    using Base = PredicatedTileAccessIteratorParams;
+
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Params() { }
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout) :
+      Base(layout.stride(0),
+            MakePredicatedTileAccessIteratorDesc<Shape, Element, Layout, kAdvanceRank, ThreadMap>()()
+        ) { }
+
+    /// Construct the Params object from an already-computed Base
+    CUTLASS_HOST_DEVICE
+    Params(Base const &base) :
+      Base(base) { }
+  };
+
+
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char *;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Reference to the caller's Params (not copied); it must outlive this iterator
+  Params const &params_;
+
+  /// Internal pointer to first access of tile
+  BytePointer pointer_;
+
+  /// Guard predicates
+  uint32_t predicates_[kPredicateWordCount];
+
+  /// Size of tensor
+  TensorCoord extent_;
+
+  /// Initial offset for each thread
+  TensorCoord thread_offset_;
+
+  /// Index of residue tile
+  int residue_tile_idx_;
+
+  /// True from construction until the first add_tile_offset() call
+  bool is_residue_tile_;
+
+  /// Iteration in the contiguous dimension
+  int iteration_contiguous_;
+
+  /// Iteration in the strided dimension
+  int iteration_strided_;
+
+  /// Iteration within ThreadMap::ThreadAccessShape::kStrided
+  int iteration_thread_;
+
+ private:
+  /// Computes predicates based on internally tracked per-thread offset.
+  CUTLASS_HOST_DEVICE
+  void compute_predicates_(
+      /// optionally, simplify predicate calculation during 'steady state' phase
+      bool is_steady_state = false) {
+    // Clear every word first; guard bits are OR-ed in below.
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      predicates_[i] = 0u;
+    }
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int ts = 0; ts < ThreadMap::ThreadAccessShape::kStrided; ts++) {
+          // Coordinate of this access relative to the thread's own offset
+          TensorCoord iteration_coord(c * ThreadMap::Delta::kContiguous,
+                                      ts + s * ThreadMap::Delta::kStrided);
+
+          TensorCoord coord = thread_offset_ + iteration_coord;
+
+          bool guard;
+          // In steady state only the non-advance dimension is bounds-checked.
+          if (is_steady_state) {
+            if (kAdvanceRank == 0) {
+              guard = (coord.strided() < extent_.strided());
+            } else {
+              guard = (coord.contiguous() < extent_.contiguous());
+            }
+          } else {
+            guard = (coord.strided() < extent_.strided() &&
+                     coord.contiguous() < extent_.contiguous());
+          }
+          // Flatten (s, c, ts), then split into word / byte / bit exactly as valid() does.
+          int pred_idx = ts + c *  ThreadMap::ThreadAccessShape::kStrided + s * ThreadMap::Iterations::kContiguous *  ThreadMap::ThreadAccessShape::kStrided;
+          int word_idx = pred_idx / kPredicatesPerWord;
+          int residual = pred_idx % kPredicatesPerWord;
+          int byte_idx = residual / kPredicatesPerByte;
+          int bit_idx = residual % kPredicatesPerByte;
+          // Only bits [0, kPredicatesPerByte) of each byte carry a predicate.
+          predicates_[word_idx] |= (unsigned(guard) << (byte_idx * 8 + bit_idx));
+
+        }
+      }
+    }
+
+  }
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator2dThreadTile(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : params_(params),
+        pointer_(reinterpret_cast<BytePointer>(
+            const_cast<NonConstPointer>(pointer))),
+        extent_(extent),
+        is_residue_tile_(true) {
+    // Begin on the residue tile: the last tile index along the advance
+    // dimension, computed from the extent and the threadblock offset.
+    TensorCoord residue_offset;
+    if (kAdvanceRank) {
+      residue_tile_idx_ =
+          (extent_[kAdvanceRank] - threadblock_offset[kAdvanceRank] - 1) /
+          Shape::kStrided;
+      residue_offset = make_Coord(0, residue_tile_idx_ * Shape::kStrided);
+    } else {
+      residue_tile_idx_ =
+          (extent_[kAdvanceRank] - threadblock_offset[kAdvanceRank] - 1) /
+          Shape::kContiguous;
+      residue_offset = make_Coord(residue_tile_idx_ * Shape::kContiguous, 0);
+    }
+
+    // Per-thread offset in logical coordinates of tensor
+    thread_offset_ = threadblock_offset + residue_offset +
+                     ThreadMap::initial_offset(thread_id);
+
+    // update internal pointers
+    Layout layout(params_.stride_);
+    add_pointer_offset(layout(thread_offset_));
+
+    compute_predicates_(false);
+
+    set_iteration_index(0);
+  }
+
+  /// Construct a PredicatedTileAccessIterator2dThreadTile with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator2dThreadTile(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id)
+      : PredicatedTileAccessIterator2dThreadTile(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    // index = (strided * Iterations::kContiguous + contiguous) * ThreadAccessShape::kStrided + thread
+    int residual = index % (ThreadMap::Iterations::kContiguous * ThreadMap::ThreadAccessShape::kStrided);
+    iteration_strided_ = index / (ThreadMap::Iterations::kContiguous * ThreadMap::ThreadAccessShape::kStrided);
+
+    iteration_contiguous_ = residual / ThreadMap::ThreadAccessShape::kStrided;
+    iteration_thread_ = residual % ThreadMap::ThreadAccessShape::kStrided;
+
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += int(sizeof(Element)) * pointer_offset;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(
+      TensorCoord const &tile_offset) {
+    if (is_residue_tile_) {
+      TensorCoord residue_offset;
+      if (kAdvanceRank) {
+        residue_offset = TensorCoord(0, residue_tile_idx_ * Shape::kStrided);
+      } else {
+        residue_offset = TensorCoord(residue_tile_idx_ * Shape::kContiguous, 0);
+      }
+      // Undo the residue offset that the constructor applied.
+      thread_offset_ -= residue_offset;
+
+      Layout layout(params_.stride_);
+      add_pointer_offset(-layout(residue_offset));
+
+      compute_predicates_(true);
+      // NOTE(review): the non-advance terms below add a raw tile extent to a byte pointer (no sizeof(Element) or stride scaling) - confirm intended.
+      if (kAdvanceRank) {
+        pointer_ += params_.inc_advance_ * (tile_offset.strided() - 1);
+        pointer_ += Shape::kContiguous * tile_offset.contiguous();
+      } else {
+        pointer_ += params_.inc_advance_ * (tile_offset.contiguous() - 1);
+        pointer_ += Shape::kStrided * tile_offset.strided();
+      }
+    } else {
+      if (kAdvanceRank) {
+        pointer_ += params_.inc_advance_ * tile_offset.strided();
+        pointer_ += Shape::kContiguous * tile_offset.contiguous();
+      } else {
+        pointer_ += params_.inc_advance_ * tile_offset.contiguous();
+        pointer_ += Shape::kStrided * tile_offset.strided();
+      }
+    }
+    is_residue_tile_ = false;
+  }
+
+  /// Returns a pointer to the access selected by the current iteration indices
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    AccessType *ret_val = reinterpret_cast<AccessType *>(
+                pointer_ + (iteration_thread_ * params_.stride_  + iteration_contiguous_ * ThreadMap::Delta::kContiguous) * int(sizeof(Element)));
+
+    return ret_val;
+  }
+
+  /// Advances to the next access (thread, then contiguous, then strided) and returns *this.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator2dThreadTile &operator++() {
+
+    iteration_thread_++;
+
+    if (iteration_thread_ < ThreadMap::ThreadAccessShape::kStrided)
+      return *this;
+
+    iteration_thread_ = 0;
+
+    ++iteration_contiguous_;
+
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
+      return *this;
+
+    // Enter here only if (iteration_contiguous_ ==
+    // ThreadMap::Iterations::kContiguous)
+    iteration_contiguous_ = 0;
+    ++iteration_strided_;
+
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      pointer_ += params_.inc_strided_;
+      return *this;
+    }
+
+    // Enter here only if (iteration_strided_ == ThreadMap::Iterations::kStrided)
+    // which means we enter the next tile.
+    iteration_strided_ = 0;
+
+    // advance to next tile
+    pointer_ += params_.inc_next_;
+
+    // now return to start tile - if the iterator is subsequently advanced, this
+    // subtraction as well as the subsequent integer addition are both elided by
+    // the compiler.
+    pointer_ -= params_.inc_advance_;
+
+    return *this;
+  }
+
+  /// Post-increment: advances this iterator and returns a copy of its previous state.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator2dThreadTile operator++(int) {
+    PredicatedTileAccessIterator2dThreadTile self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Zeroes every predicate word when `enable` is true; otherwise leaves the mask unchanged
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      predicates_[i] = enable ? 0u : predicates_[i];
+    }
+
+  }
+
+  /// Sets every bit of every predicate word, so valid() reports true for all accesses
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      predicates_[i] = 0xffffffff;
+    }
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      predicates_[i] = mask[i];
+    }
+
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) const {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      mask[i] = predicates_[i];
+    }
+  }
+
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+    // Index decomposition must match the bit layout written by compute_predicates_().
+    int pred_idx = 
+      iteration_thread_ + 
+      iteration_contiguous_ * ThreadMap::ThreadAccessShape::kStrided + 
+      iteration_strided_ * ThreadMap::Iterations::kContiguous * ThreadMap::ThreadAccessShape::kStrided;
+
+    int word_idx = pred_idx / kPredicatesPerWord;
+    int residual = pred_idx % kPredicatesPerWord;
+    int byte_idx = residual / kPredicatesPerByte;
+    int bit_idx = residual % kPredicatesPerByte;
+
+    bool pred = (predicates_[word_idx] & (1u << (byte_idx * 8 + bit_idx))) != 0;
+
+    return pred;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIterator2dThreadTile for column-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_>
+class PredicatedTileAccessIterator2dThreadTile<Shape_, Element_, layout::ColumnMajor,
+                                   AdvanceRank, ThreadMap_, AccessType_> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  using UnderlyingIterator = PredicatedTileAccessIterator2dThreadTile<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
+      layout::PitchLinear, (kAdvanceRank == 0 ? 0 : 1), ThreadMap, AccessType>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileAccessIterator2dThreadTile;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Params() { }
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::PitchLinear(layout.stride(0))){}
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const &base) 
+        : params_(base) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator2dThreadTile(
+      ///< Precomputed parameters object
+      Params const &params,
+      ///< Pointer to start of tensor
+      Pointer pointer,
+      ///< Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.row(), extent.column()),
+                  thread_id,
+                  layout::PitchLinearCoord(threadblock_offset.row(),
+                                           threadblock_offset.column())) {}
+
+  /// Construct a PredicatedTileAccessIterator2dThreadTile with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator2dThreadTile(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIterator2dThreadTile(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  /// Pre-increment: advances the underlying iterator and returns *this.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator2dThreadTile &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment: advances the iterator and returns a copy of its prior state.
+  ///
+  /// The copy is taken before the advance; the advance itself is performed by the
+  /// pre-increment operator, so the same predicate/pointer behavior applies.
+  /// Prefer pre-increment where the old value is not needed, since this form
+  /// copies the whole iterator.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator2dThreadTile operator++(int) {
+    PredicatedTileAccessIterator2dThreadTile self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently; `enable` is forwarded to the underlying iterator
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+
+  /// Enables the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+
+  /// Sets the predicate mask, overriding the value stored in the underlying iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+
+  /// Retrieves the underlying iterator's predicate mask into `mask`
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+
+  /// Returns whether access is valid, as reported by the underlying iterator
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIterator2dThreadTile for row-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+/// All operations forward to an underlying pitch-linear iterator with row and column swapped.
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_>
+class PredicatedTileAccessIterator2dThreadTile<Shape_, Element_, layout::RowMajor,
+                                   AdvanceRank, ThreadMap_, AccessType_> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  using UnderlyingIterator = PredicatedTileAccessIterator2dThreadTile<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
+      layout::PitchLinear, (kAdvanceRank == 0 ? 1 : 0), ThreadMap, AccessType>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileAccessIterator2dThreadTile;
+
+    /// Parameters of the underlying pitch-linear iterator
+    typename UnderlyingIterator::Params params_;
+
+   public:
+
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Params() { }
+
+    /// Construct the Params object from a row-major layout, using its stride(0) as the pitch-linear stride
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::PitchLinear(layout.stride(0))){}
+
+    /// Construct the Params object from the underlying iterator's precomputed base parameters
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const &base) 
+        : params_(base) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID; row-major (row, column) coordinates are passed on as (column, row)
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator2dThreadTile(
+      ///< Precomputed parameters object
+      Params const &params,
+      ///< Pointer to start of tensor
+      Pointer pointer,
+      ///< Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.column(), extent.row()),
+                  thread_id,
+                  layout::PitchLinearCoord(threadblock_offset.column(),
+                                           threadblock_offset.row())) {}
+
+  /// Constructs the iterator with a zero threadblock offset by delegating to the full constructor
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator2dThreadTile(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIterator2dThreadTile(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index of the underlying iterator
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Adds a pointer offset, in units of Element, to the underlying iterator
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances the iterator by whole tiles along the logical matrix dimensions;
+  /// the offset is forwarded to the underlying iterator as (column, row)
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
+  }
+
+  /// Returns the underlying iterator's pointer, reinterpreted as AccessType *
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  /// Pre-increment: advances the underlying iterator and returns *this.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator2dThreadTile &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment: advances the iterator and returns a copy of its prior state.
+  ///
+  /// The copy is taken before the advance; the advance itself is performed by the
+  /// pre-increment operator, so the same predicate/pointer behavior applies.
+  /// Prefer pre-increment where the old value is not needed, since this form
+  /// copies the whole iterator.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator2dThreadTile operator++(int) {
+    PredicatedTileAccessIterator2dThreadTile self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently; `enable` is forwarded to the underlying iterator
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+
+  /// Enables the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+
+  /// Sets the predicate mask, overriding the value stored in the underlying iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+
+  /// Retrieves the underlying iterator's predicate mask into `mask`
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+
+  /// Returns whether access is valid, as reported by the underlying iterator
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace transform
+}  // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator_params.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator_params.h
new file mode 100755
index 000000000..c67af387e
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator_params.h
@@ -0,0 +1,290 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Descriptor and precomputed parameter objects for predicated tile access iterators
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/detail/helper_macros.hpp"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/pitch_linear.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Descriptor holding the template-dependent state (element width in bits, advance rank, threadblock shape, thread-map iterations and deltas) of a predicated tile access iterator
+struct PredicatedTileAccessIteratorDesc {
+
+  int element_size_bits = -1;
+  int advance_rank = -1;
+  layout::PitchLinearCoord threadblock_shape;
+  layout::PitchLinearCoord threadmap_iterations;
+  layout::PitchLinearCoord threadmap_delta;
+
+  //
+  // Methods
+  //
+
+  PredicatedTileAccessIteratorDesc() = default;
+
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorDesc(
+    int element_size_bits_,
+    int advance_rank_,
+    layout::PitchLinearCoord threadblock_shape_,
+    layout::PitchLinearCoord threadmap_iterations_,
+    layout::PitchLinearCoord threadmap_delta_
+  ):
+    element_size_bits(element_size_bits_),
+    advance_rank(advance_rank_),
+    threadblock_shape(threadblock_shape_),
+    threadmap_iterations(threadmap_iterations_),
+    threadmap_delta(threadmap_delta_)
+  {
+    #if 0
+    printf("PredicatedTileAccessIteratorDesc(%d, %d, {%d, %d}, {%d, %d}, {%d, %d}})\n",
+      element_size_bits,
+      advance_rank,
+      threadblock_shape.contiguous(), threadblock_shape.strided(),
+      threadmap_iterations.contiguous(), threadmap_iterations.strided(),
+      threadmap_delta.contiguous(), threadmap_delta.strided());
+    #endif
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Helper template to construct a PredicatedTileAccessIteratorDesc from
+/// template-dependent state (declared here; specialized per layout below)
+template <
+  typename Shape, typename Element, typename Layout,
+  int AdvanceRank, typename ThreadMap>
+  struct MakePredicatedTileAccessIteratorDesc;
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of MakePredicatedTileAccessIteratorDesc for pitch-linear data.
+template <
+  typename Shape, typename Element, int AdvanceRank, 
+  typename ThreadMap>
+struct MakePredicatedTileAccessIteratorDesc <
+    Shape, Element, layout::PitchLinear, AdvanceRank, ThreadMap> {
+
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorDesc operator()() {
+
+    return PredicatedTileAccessIteratorDesc(
+      sizeof_bits<Element>::value,
+      AdvanceRank,
+      {Shape::kContiguous, Shape::kStrided},
+      {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided},
+      {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided}
+    );
+}
+
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of MakePredicatedTileAccessIteratorDesc for column-major data (delegates to the pitch-linear specialization).
+template <
+  typename Shape, typename Element, int AdvanceRank, 
+  typename ThreadMap>
+struct MakePredicatedTileAccessIteratorDesc <
+    Shape, Element, layout::ColumnMajor, AdvanceRank, ThreadMap> {
+
+  static int const kAdvanceRank = AdvanceRank;
+
+  using UnderlyingMakeOperator = MakePredicatedTileAccessIteratorDesc<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
+      layout::PitchLinear, (kAdvanceRank == 0 ? 0 : 1), ThreadMap>;
+
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorDesc operator()() {
+
+    return UnderlyingMakeOperator()();
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of MakePredicatedTileAccessIteratorDesc for row-major data (delegates to the pitch-linear specialization with row/column and advance rank swapped).
+template <
+  typename Shape, typename Element, int AdvanceRank, 
+  typename ThreadMap>
+struct MakePredicatedTileAccessIteratorDesc <
+    Shape, Element, layout::RowMajor, AdvanceRank, ThreadMap> {
+
+  static int const kAdvanceRank = AdvanceRank;
+
+  using UnderlyingMakeOperator = MakePredicatedTileAccessIteratorDesc<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
+      layout::PitchLinear, (kAdvanceRank == 0 ? 1 : 0), ThreadMap>;
+
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorDesc operator()() {
+
+    return UnderlyingMakeOperator()();
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of MakePredicatedTileAccessIteratorDesc for column-major interleaved data.
+template <
+  typename Shape, typename Element, int AdvanceRank, 
+  typename ThreadMap, int InterleavedK>
+struct MakePredicatedTileAccessIteratorDesc <
+    Shape, Element, layout::ColumnMajorInterleaved<InterleavedK>, AdvanceRank, ThreadMap> {
+
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kInterleavedK = InterleavedK;
+
+  using UnderlyingMakeOperator = MakePredicatedTileAccessIteratorDesc<
+      layout::PitchLinearShape<Shape::kRow * kInterleavedK, Shape::kColumn / kInterleavedK>, Element,
+      layout::PitchLinear, (kAdvanceRank == 0 ? 0 : 1), ThreadMap>;
+
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorDesc operator()() {
+
+    return UnderlyingMakeOperator()();
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of MakePredicatedTileAccessIteratorDesc for row-major interleaved data.
+template <
+  typename Shape, typename Element, int AdvanceRank, 
+  typename ThreadMap, int InterleavedK>
+struct MakePredicatedTileAccessIteratorDesc <
+    Shape, Element, layout::RowMajorInterleaved<InterleavedK>, AdvanceRank, ThreadMap> {
+
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kInterleavedK = InterleavedK;
+
+  using UnderlyingMakeOperator = MakePredicatedTileAccessIteratorDesc<
+      layout::PitchLinearShape<Shape::kColumn * kInterleavedK, Shape::kRow / kInterleavedK>, Element,
+      layout::PitchLinear, (kAdvanceRank == 0 ? 1 : 0), ThreadMap>;
+
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorDesc operator()() {
+
+    return UnderlyingMakeOperator()();
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Precomputed parameters for predicated tile access iterators: the
+/// pitch-linear stride plus byte increments that initialize() derives from
+/// that stride and a PredicatedTileAccessIteratorDesc.
+
+struct PredicatedTileAccessIteratorParams {
+
+  using Index = int32_t;
+  using LongIndex = int64_t;
+
+  //
+  // Data members
+  //
+  /// stride of pitch-linear layout (units of Element)
+  LongIndex stride_ = 0;
+  /// amount (in bytes) to increment pointer to move to next access along
+  /// strided dimension
+  LongIndex inc_strided_ = 0;
+  /// amount (in bytes) to increment pointer from last access to first access
+  /// of next tile
+  LongIndex inc_next_ = 0;
+  /// amount (in bytes) to increment pointer from first access of current tile
+  /// to first access of next tile
+  LongIndex inc_advance_ = 0;
+
+  //
+  // Methods
+  //
+
+  CUTLASS_HOST_DEVICE
+  Status initialize(LongIndex stride, PredicatedTileAccessIteratorDesc desc) {
+    CUTLASS_ASSERT(desc.element_size_bits > 0);
+    CUTLASS_ASSERT(desc.advance_rank == 0 || desc.advance_rank == 1);
+
+    stride_ = stride;
+
+    inc_strided_ = (LongIndex(stride_) * desc.threadmap_delta.strided()) *
+                     desc.element_size_bits / 8;
+
+    if (desc.advance_rank) {
+      // advance along strided dimension
+      inc_advance_ =
+          desc.threadblock_shape.strided() * LongIndex(stride_) * desc.element_size_bits / 8;
+    } else {
+      // advance along contiguous dimension
+      inc_advance_ = desc.threadblock_shape.contiguous() * desc.element_size_bits / 8;
+    }
+
+    inc_next_ = inc_advance_ - LongIndex(desc.threadmap_iterations.strided() - 1) *
+                                   desc.threadmap_delta.strided() * LongIndex(stride_) *
+                                   desc.element_size_bits / 8;    
+
+    return Status::kSuccess;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Status initialize(Index stride, PredicatedTileAccessIteratorDesc desc) {
+    return initialize(LongIndex(stride), desc);
+  }
+
+  PredicatedTileAccessIteratorParams() = default;
+
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorParams(Index stride, PredicatedTileAccessIteratorDesc desc) {
+    initialize(stride, desc);
+  }
+
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorParams(LongIndex stride, PredicatedTileAccessIteratorDesc desc) {
+    initialize(stride, desc);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace transform
+}  // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator_triangular_matrix.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator_triangular_matrix.h
new file mode 100755
index 000000000..24498843d
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator_triangular_matrix.h
@@ -0,0 +1,892 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates calculating the address and predicates to the load of tiles
+   from pitch-linear rank=2 tensors.
+
+    This iterator uses masks to guard out-of-bounds accesses and visits the last
+   "residue" tile first, with the objective of minimizing predicate mask updates
+   during steady-state operation.
+
+    A precomputed "Params" object minimizes the amount of state that must be
+   stored in registers, and integer addition is used to advance the pointer
+   through memory.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// PredicatedTileAccessIteratorTriangularMatrix: primary template, declared only;
+/// the definitions are provided by layout-specific partial specializations.
+template <typename Shape, typename Element, typename Layout, 
+          int AdvanceRank, typename ThreadMap, 
+          SideMode kSideMode, FillMode kFillMode, DiagType kDiagType, 
+          typename AccessType>
+class PredicatedTileAccessIteratorTriangularMatrix;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIteratorTriangularMatrix for pitch-linear data.
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, SideMode kSideMode, FillMode kFillMode, DiagType kDiagType, typename AccessType_>
+class PredicatedTileAccessIteratorTriangularMatrix<Shape_, Element_, layout::PitchLinear,
+                                   AdvanceRank, ThreadMap_, kSideMode, kFillMode, kDiagType, AccessType_> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using StrideIndex = typename Layout::Stride::Index;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+
+  using CompareOp = typename TrMatrixCompareOp<kFillMode, kDiagType>::Type;
+
+  static_assert( kFillMode == FillMode::kFull || 
+                 ((kFillMode == FillMode::kLower || kFillMode == FillMode::kUpper) && AccessType::kElements == 1), 
+                 "BLAS3 iterator for the triangular/symmetric matrix must use AccessType::kElements as 1");
+
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), 
+    "Vectors implied by the thread map must be divisible by the access type.");
+
+  static int const kPredicatesPerByte = 4;
+  static int const kPredicatesPerWord = 4 * kPredicatesPerByte;
+
+  static int const kPredicateCount = ThreadMap::Iterations::kCount * kAccessesPerVector;
+
+  /// Number of 32b words containing predicates
+  static int const kPredicateByteCount = 
+    (kPredicateCount + kPredicatesPerByte - 1) / kPredicatesPerByte;
+  static int const kPredicateWordCount = (kPredicateByteCount + 3) / 4;
+
+  static unsigned const kPredicateMask = (1u << kPredicatesPerByte) - 1u;
+
+  static_assert(kPredicateWordCount <= 4, "Too many predicates.");
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = Array<uint32_t, kPredicateWordCount>;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   public:
+    friend PredicatedTileAccessIteratorTriangularMatrix;
+
+   private:
+    /// stride of pitch-linear layout (units of Element)
+    StrideIndex stride_;
+    /// (true)  pitch-linear layout is mapped to row-major matrix 
+    /// (false) pitch-linear layout is mapped to column-major matrix
+    bool is_row_major_;
+    /// for vectorized access across the diagonal boundary guard condition is
+    /// checked for the element on the boundary
+    int access_diagonal_boundary_;    
+    /// amount (in byte) to increment pointer to move to next access along
+    /// strided dimension
+    LongIndex inc_strided_;
+    /// amount (in byte) to increment pointer from last access to first access
+    /// of next tile
+    LongIndex inc_next_;
+    /// amount (in byte) to increment pointer from first access of current tile
+    /// to first access of next tile
+    LongIndex inc_advance_;
+
+   public:
+
+    // Default ctor
+    CUTLASS_HOST_DEVICE
+    Params(): stride_(0), inc_strided_(0), inc_next_(0), inc_advance_(0), is_row_major_(false), access_diagonal_boundary_(0) { }
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout, bool is_row_major, int access_diagonal_boundary) : 
+      stride_(layout.stride(0)), is_row_major_(is_row_major), access_diagonal_boundary_(access_diagonal_boundary) {
+
+      inc_strided_ = (LongIndex(stride_) * ThreadMap::Delta::kStrided) *
+                     sizeof_bits<Element>::value / 8;
+
+      if (kAdvanceRank) {
+        // advance along strided dimension
+        inc_advance_ =
+            Shape::kStrided * LongIndex(stride_) * sizeof_bits<Element>::value / 8;
+      } else {
+        // advance along contiguous dimension
+        inc_advance_ = Shape::kContiguous * sizeof_bits<Element>::value / 8;
+      }
+
+      inc_next_ = inc_advance_ - LongIndex(ThreadMap::Iterations::kStrided - 1) *
+                                     ThreadMap::Delta::kStrided * LongIndex(stride_) *
+                                     sizeof_bits<Element>::value / 8;
+
+    };
+
+
+  };
+
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char *;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Parameters object with precomputed internal state
+  Params const &params_;
+
+  /// Internal pointer to first access of tile
+  BytePointer pointer_;
+
+  /// Guard predicates
+  uint32_t predicates_[kPredicateWordCount];
+
+  /// Track global memory addresses on the diagonal 
+  /// To ignore imag part for diagonal elements of hermitian matrices
+  uint32_t predicates_onDiag_[kPredicateWordCount];
+
+  /// Size of tensor
+  TensorCoord extent_;
+
+  /// Initial offset for each thread
+  TensorCoord thread_offset_;
+
+  /// Iteration along vectors implied by the thread map
+  int iteration_vector_;
+
+  /// Iteration in the contiguous dimension
+  int iteration_contiguous_;
+
+  /// Iteration in the strided dimension
+  int iteration_strided_;
+
+ private:
+  /// Computes predicates based on internally tracked per-thread offset.
+  CUTLASS_DEVICE
+  void compute_predicates_(
+      /// Extent of the matrix window
+      TensorCoord extent) {
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      predicates_[i] = 0u;
+      predicates_onDiag_[i] = 0u;
+    }
+
+    CompareOp compare_op;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int access_idx = 0; access_idx < ThreadMap::Iterations::kCount * kAccessesPerVector; ++access_idx) {
+
+      int s = access_idx / (ThreadMap::Iterations::kContiguous * kAccessesPerVector);
+      
+      int access_residual = access_idx % (ThreadMap::Iterations::kContiguous * kAccessesPerVector);
+
+      int c = access_residual / kAccessesPerVector;
+      int v = access_residual % kAccessesPerVector;
+
+      TensorCoord iteration_coord(c * ThreadMap::Delta::kContiguous + v * AccessType::kElements,
+                                s * ThreadMap::Delta::kStrided);
+
+      TensorCoord coord = thread_offset_ + iteration_coord;
+
+      bool guard;
+      bool onDiag = false;
+
+      guard = ((coord.strided() < extent.strided()) && 
+                (coord.contiguous() < extent.contiguous()));
+    
+
+      // guard access on the wrong side of the triagular matrix diagonal
+      if (kFillMode == FillMode::kLower || kFillMode == FillMode::kUpper) {
+        coord += TensorCoord{params_.access_diagonal_boundary_, 0};
+
+        bool triagular_guard_row_major = compare_op(coord.strided(), coord.contiguous()) | !params_.is_row_major_;
+        bool triagular_guard_col_major = compare_op(coord.contiguous(), coord.strided()) | params_.is_row_major_;
+        
+        guard = guard && triagular_guard_row_major && triagular_guard_col_major;
+
+        if (kDiagType == DiagType::kUnit) {
+          onDiag = (guard && coord.strided() == coord.contiguous()) ? true : false;
+        }
+      }
+
+      int pred_idx_onDiag = v + kAccessesPerVector * (c + ThreadMap::Iterations::kContiguous * s);
+      int word_idx_onDiag = pred_idx_onDiag / kPredicatesPerWord;
+      int residual_onDiag = pred_idx_onDiag % kPredicatesPerWord;
+      int byte_idx_onDiag = residual_onDiag / kPredicatesPerByte;
+      int bit_idx_onDiag = residual_onDiag % kPredicatesPerByte;
+      
+      predicates_onDiag_[word_idx_onDiag] |= (unsigned(onDiag) << (byte_idx_onDiag * 8 + bit_idx_onDiag));
+
+      int pred_idx = v + kAccessesPerVector * (c + ThreadMap::Iterations::kContiguous * s);
+
+      int word_idx = pred_idx / kPredicatesPerWord;
+      int residual = pred_idx % kPredicatesPerWord;
+      int byte_idx = residual / kPredicatesPerByte;
+      int bit_idx = residual % kPredicatesPerByte;
+      
+      predicates_[word_idx] |= (unsigned(guard) << (byte_idx * 8 + bit_idx));
+
+    }
+
+  }
+
+ public:
+  /// Constructs the iterator from its precomputed Params, the tensor's base
+  /// pointer and extent, the calling thread's ID, and the threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorTriangularMatrix(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : params_(params),
+        pointer_(reinterpret_cast<BytePointer>(const_cast<NonConstPointer>(pointer))),  // kept as a byte pointer; constness is cast away
+        extent_(extent) {
+
+
+    // Per-thread offset in logical coordinates of tensor
+    thread_offset_ = threadblock_offset + ThreadMap::initial_offset(thread_id);
+
+    // Move the pointer to this thread's first element (layout maps the logical offset to an element offset)
+    Layout layout(params_.stride_);
+    add_pointer_offset(layout(thread_offset_));
+    // Build the guard and on-diagonal predicate bits for this thread's accesses
+    compute_predicates_(extent_);
+
+    set_iteration_index(0);  // begin at the first access
+  }
+
+  /// Constructs the iterator with a zero threadblock offset by delegating to the full constructor
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorTriangularMatrix(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id)
+      : PredicatedTileAccessIteratorTriangularMatrix(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration state from a linear access index (vector fastest, then contiguous, then strided)
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+
+    iteration_vector_ = index % kAccessesPerVector;  // position within one vector
+    int residual_access = index / kAccessesPerVector;  // index with the vector part removed
+
+    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
+
+  }
+
+  /// Adds a pointer offset in units of Element (converted to bytes internally)
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += sizeof_bits<Element>::value * pointer_offset / 8;  // element count -> bytes: bits per element times count, divided by 8
+  }
+
+  /// Advances the iterator by whole tiles along the logical matrix dimensions, then recomputes the predicates
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+
+    if (kAdvanceRank) {  // nonzero rank: the advance dimension is the strided one
+      pointer_ += params_.inc_advance_ * LongIndex(tile_offset.strided());
+      pointer_ += Shape::kContiguous * tile_offset.contiguous();  // NOTE(review): added to the byte pointer without scaling by element size - confirm intended
+      thread_offset_ += TensorCoord{0, Shape::kStrided * tile_offset.strided()};  // only the advance dimension is tracked in thread_offset_
+    } else {  // rank 0: the advance dimension is the contiguous one
+      pointer_ += params_.inc_advance_ * LongIndex(tile_offset.contiguous());
+      pointer_ += Shape::kStrided * tile_offset.strided();  // NOTE(review): added to the byte pointer without scaling by element size or stride - confirm intended
+      thread_offset_ += TensorCoord{Shape::kContiguous * tile_offset.contiguous(), 0};  // only the advance dimension is tracked in thread_offset_
+    }
+
+    compute_predicates_(extent_);  // predicates depend on thread_offset_, so rebuild them after moving
+  }
+
+  /// Returns a pointer to the access selected by the current contiguous and vector iteration indices
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(
+        pointer_ + 
+        iteration_contiguous_ * (ThreadMap::Delta::kContiguous * sizeof_bits<Element>::value) / 8) + iteration_vector_;  // byte offset of the contiguous step, then index by vector in AccessType units
+  }
+
+  /// Advances to the next access (vector first, then contiguous, then strided) and returns a reference to self.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorTriangularMatrix &operator++() {
+
+    ++iteration_vector_;
+    if (iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+
+    iteration_vector_ = 0;
+    ++iteration_contiguous_;
+
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+
+    // Enter here only if (iteration_contiguous_ ==
+    // ThreadMap::Iterations::kContiguous)
+    iteration_contiguous_ = 0;
+    ++iteration_strided_;
+
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      pointer_ += params_.inc_strided_;
+      return *this;
+    }
+
+    // Enter here only if (iteration_strided_ == ThreadMap::Iterations::kStrided)
+    // which means we enter the next tile.
+    iteration_strided_ = 0;
+
+    // advance to next tile
+    pointer_ += params_.inc_next_;
+
+    // now return to start tile - if the iterator is subsequently advanced, this
+    // subtraction as well as the subsequent integer addition are both elided by
+    // the compiler.
+    pointer_ -= params_.inc_advance_;
+
+    return *this;
+  }
+
+  /// Post-increment: advances the iterator and returns a copy of its state from before the advance.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorTriangularMatrix operator++(int) {
+    PredicatedTileAccessIteratorTriangularMatrix self(*this);  // snapshot taken before advancing
+    operator++();
+    return self;
+  }
+
+  /// Clears every guard predicate word when `enable` is true; leaves them unchanged otherwise
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      predicates_[i] = enable ? 0u : predicates_[i];  // only predicates_ is touched; predicates_onDiag_ is not modified here
+    }
+
+  }
+
+  /// Sets every bit of every guard predicate word, so valid() reports true for all accesses
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      predicates_[i] = 0xffffffff;
+    }
+  }
+
+  /// Overwrites the stored guard predicate words with the contents of `mask`
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { 
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      predicates_[i] = mask[i];
+    }
+
+  }
+
+  /// Copies the stored guard predicate words into `mask`
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) {
+     CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      mask[i] = predicates_[i];
+    }
+  }
+
+  /// Returns the on-diagonal flag recorded in predicates_onDiag_ for the current access
+  CUTLASS_HOST_DEVICE
+  bool getOnDiag() {
+    int pred_idx = 
+      iteration_vector_ + kAccessesPerVector * (iteration_contiguous_ + iteration_strided_ * ThreadMap::Iterations::kContiguous);
+    // Locate the bit: word, then byte within the word, then bit within the byte
+    int word_idx = pred_idx / kPredicatesPerWord;
+    int residual = pred_idx % kPredicatesPerWord;
+    int byte_idx = residual / kPredicatesPerByte;
+    int bit_idx = residual % kPredicatesPerByte;
+    
+    bool pred = (predicates_onDiag_[word_idx] & (1u << (byte_idx * 8 + bit_idx))) != 0;
+    return pred;
+  }
+
+  /// Returns whether the current access is enabled by its guard predicate bit
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    // Linear index of the current access: vector fastest, then contiguous, then strided
+    
+    int pred_idx = 
+      iteration_vector_ + kAccessesPerVector * (iteration_contiguous_ + iteration_strided_ * ThreadMap::Iterations::kContiguous);
+    // Locate the bit: word, then byte within the word, then bit within the byte
+    int word_idx = pred_idx / kPredicatesPerWord;
+    int residual = pred_idx % kPredicatesPerWord;
+    int byte_idx = residual / kPredicatesPerByte;
+    int bit_idx = residual % kPredicatesPerByte;
+    
+    bool pred = (predicates_[word_idx] & (1u << (byte_idx * 8 + bit_idx))) != 0;
+    return pred;
+    
+
+
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIteratorTriangularMatrix for column-major data.
+/// Wraps the pitch-linear specialization: rows map to the contiguous dimension, columns to the strided dimension.
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank, typename ThreadMap_, 
+            SideMode kSideMode, FillMode kFillMode, DiagType kDiagType, 
+            typename AccessType_>
+class PredicatedTileAccessIteratorTriangularMatrix<Shape_, Element_, layout::ColumnMajor,
+                                   AdvanceRank, ThreadMap_, kSideMode, kFillMode, kDiagType, 
+                                   AccessType_> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  using UnderlyingIterator = PredicatedTileAccessIteratorTriangularMatrix<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
+      layout::PitchLinear, (kAdvanceRank == 0 ? 0 : 1), ThreadMap, 
+      kSideMode, kFillMode, kDiagType, AccessType>;  // the advance rank is passed through unchanged
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+
+  static int const kAccessDiagonalBoundary = 
+    (kFillMode == FillMode::kLower) ? (AccessType::kElements - 1) : 0;  // nonzero only for lower fill; forwarded to the underlying Params
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileAccessIteratorTriangularMatrix;
+
+    /// Parameters object of the underlying pitch-linear iterator
+    typename UnderlyingIterator::Params params_;
+
+   public:
+
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Params() { }
+
+    /// Construct the Params object given a column-major tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::PitchLinear(layout.stride(0)), false, kAccessDiagonalBoundary){};  // presumably `false` means "not row-major" - confirm against the pitch-linear Params constructor
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs the iterator from its precomputed state, threadblock offset,
+  /// and thread ID, converting (row, column) coordinates to pitch-linear order
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorTriangularMatrix(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.row(), extent.column()),
+                  thread_id,
+                  layout::PitchLinearCoord(threadblock_offset.row(),
+                                           threadblock_offset.column())) {}
+
+  /// Constructs the iterator with a zero threadblock offset by delegating to the full constructor
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorTriangularMatrix(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIteratorTriangularMatrix(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index (forwarded to the underlying iterator)
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles; (row, column) is passed to the underlying iterator in that order
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
+  }
+
+  /// Returns the underlying iterator's pointer for the current access
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  /// Advances the underlying pitch-linear iterator to its next access and
+  /// returns a reference to self.
+  ///
+  /// This only forwards to the underlying iterator's pre-increment; this
+  /// wrapper performs no other work and does not itself touch the predicate
+  /// masks.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorTriangularMatrix &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment: advances the underlying pitch-linear iterator to its next
+  /// access.
+  ///
+  /// Returns a copy of this iterator taken before the advance, so the returned
+  /// object still refers to the access that was current when the call was
+  /// made.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorTriangularMatrix operator++(int) {
+    PredicatedTileAccessIteratorTriangularMatrix self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set when `enable` is true (forwarded to the underlying iterator)
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+
+  /// Enables the predicate set (forwarded to the underlying iterator's enable_mask)
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+
+  /// Returns whether the current access is on the diagonal, as reported by the underlying iterator
+  CUTLASS_HOST_DEVICE
+  bool getOnDiag() {
+    return iterator_.getOnDiag();
+  }
+
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIteratorTriangularMatrix for row-major data.
+/// Wraps the pitch-linear specialization: columns map to the contiguous dimension, rows to the strided dimension.
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank, typename ThreadMap_, 
+          SideMode kSideMode, FillMode kFillMode, DiagType kDiagType, 
+          typename AccessType_>
+class PredicatedTileAccessIteratorTriangularMatrix<Shape_, Element_, layout::RowMajor, AdvanceRank, ThreadMap_, 
+                                                  kSideMode, kFillMode, kDiagType, AccessType_> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  using UnderlyingIterator = PredicatedTileAccessIteratorTriangularMatrix<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
+      layout::PitchLinear, (kAdvanceRank == 0 ? 1 : 0), ThreadMap, 
+      kSideMode, kFillMode, kDiagType, AccessType>;  // the advance rank is flipped because row/column are swapped
+
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+
+  static int const kAccessDiagonalBoundary = 
+    (kFillMode == FillMode::kUpper) ? (AccessType::kElements - 1) : 0;  // nonzero only for upper fill; forwarded to the underlying Params
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileAccessIteratorTriangularMatrix;
+
+    /// Parameters object of the underlying pitch-linear iterator
+    typename UnderlyingIterator::Params params_;
+
+   public:
+
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Params() { }
+
+    /// Construct the Params object given a row-major tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::PitchLinear(layout.stride(0)), true, kAccessDiagonalBoundary){};  // presumably `true` means "row-major" - confirm against the pitch-linear Params constructor
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs the iterator from its precomputed state, threadblock offset,
+  /// and thread ID, converting (row, column) coordinates to (column, row) pitch-linear order
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorTriangularMatrix(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.column(), extent.row()),
+                  thread_id,
+                  layout::PitchLinearCoord(threadblock_offset.column(),
+                                           threadblock_offset.row())) {}
+
+  /// Constructs the iterator with a zero threadblock offset by delegating to the full constructor
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorTriangularMatrix(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIteratorTriangularMatrix(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index (forwarded to the underlying iterator)
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles; the offset is passed to the underlying iterator as (column, row)
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
+  }
+
+  /// Returns the underlying iterator's pointer for the current access
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  /// Advances the underlying pitch-linear iterator to its next access and
+  /// returns a reference to self.
+  ///
+  /// This only forwards to the underlying iterator's pre-increment; this
+  /// wrapper performs no other work and does not itself touch the predicate
+  /// masks.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorTriangularMatrix &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment: advances the underlying pitch-linear iterator to its next
+  /// access.
+  ///
+  /// Returns a copy of this iterator taken before the advance, so the returned
+  /// object still refers to the access that was current when the call was
+  /// made.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorTriangularMatrix operator++(int) {
+    PredicatedTileAccessIteratorTriangularMatrix self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set when `enable` is true (forwarded to the underlying iterator)
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+
+  /// Enables the predicate set (forwarded to the underlying iterator's enable_mask)
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+
+  /// Returns whether the current access is on the diagonal, as reported by the underlying iterator
+  CUTLASS_HOST_DEVICE
+  bool getOnDiag() {
+    return iterator_.getOnDiag();
+  }
+
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace transform
+}  // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_iterator.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_iterator.h
new file mode 100755
index 000000000..bdfb33fe5
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_iterator.h
@@ -0,0 +1,1887 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of tiles from pitch-linear rank=2 tensors. 
+
+    This iterator uses masks to guard out-of-bounds accesses. The first tile this
+    iterator visits may be partial, then the remaining tiles are complete. So, we 
+    only need to compute the predicates twice, once before the first tile and 
+    once for the remaining full tiles which can share the same predicates.
+
+    A precomputed "Params" object minimizes the amount of state that must be stored in registers,
+    and integer addition is used to advance the pointer through memory.
+*/
+
+#pragma once
+
+#include "cutlass/arch/memory.h"
+#include "cutlass/transform/threadblock/predicated_tile_access_iterator.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// PredicatedTileIterator
+///
+/// Satisfies: ForwardTileIteratorConcept | 
+///            ReadableContiguousTileIteratorConcept | 
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+/// Regular tile iterator using a precomputed control structure to minimize register liveness
+/// and integer arithmetic.
+///
+/// Layout is assumed to be invariant at the time the precomputed "Params" object is constructed.
+///
+/// Base pointer and tensor extents may be specified at the time the iterator is constructed.
+/// Subsequently, they are assumed to be immutable.
+///
+/// Adding a logical coordinate offset may be performed at the time the iterator is constructed.
+/// Subsequent additions to logical coordinate offset may be performed but are relatively expensive.
+///
+/// Visitation order is intended to first visit a "residual" tile that may be partially full in
+/// both the advance dimension and the steady-state dimension. This is assumed to be the last
+/// tile in the iteration sequence. Advancing an iterator that has just been constructed moves to
+/// the first tile that is full in the advance dimension and recomputes predicates. Subsequent
+/// accesses may be performed without updating internal predicates and are efficient in terms of
+/// live register state and pointer arithmetic instructions.
+///
+/// To be efficient, this assumes the iterator will be dereferenced and advanced at least once
+/// outside any looping structure to minimize integer arithmetic. 
+///
+/// Accesses out of bounds are safe so long as `clear_mask()` is called prior to dereferencing
+/// the iterator.
+///
+///
+/// Example:
+///
+/// An efficient pipeline structure may be constructed as follows:
+///
+// template <typename Iterator>
+// __global__ void kernel(
+//   typename Iterator::Params params, 
+//   typename Iterator::Element *ptr,
+//   TensorCoord extent) {
+//
+//   typename Iterator::Fragment fragment;
+//
+//   TensorCoord threadblock_offset(0, 0);
+//
+//   Iterator iter(params, ptr, extent, threadIdx.x, threadblock_offset);
+//
+//
+//   fragment = *iter;        // load "residue" tile first
+//   ++iter;                  // advance to first "steady state" tile and update internal masks
+//
+//
+//   #pragma unroll
+//   for (int i = Remaining - 1; i >= 0; --i) {
+//
+//     f(fragment);
+//
+//     if (!i) {
+//       iter.clear_mask();   // light-weight operation to clear masks - subsequent loads become NO-OPs.
+//     }
+//  
+//     fragment = *iter;      // load tile during "steady state" phase
+//     ++iter;                // advance to next tile - lightweight due to steady-state masks
+//   }
+// }
+//
+// void host(TensorView<Element, 2, layout::PitchLinear> view) {
+//
+//   using Iterator = transform::threadblock::PredicatedTileIterator;
+//
+//   typename Iterator::Params params(view.layout());
+//
+//   kernel<Iterator>(params, view.data());
+// }
+///
+///
+template <
+  typename Shape,  // tile shape type
+  typename Element,  // element type of the tensor
+  typename Layout,  // tensor layout; specializations are selected on this parameter
+  int AdvanceRank,  // dimension advanced by operator++ (the pitch-linear specialization accepts 0 or 1)
+  typename ThreadMap,  // thread mapping; supplies Iterations and kElementsPerAccess
+  int AccessSize = ThreadMap::kElementsPerAccess,  // elements per memory access; sizes AccessType in the pitch-linear specialization
+  bool Gather = false,  // forwarded to PredicatedTileAccessIterator; the constructor accepts optional gather indices
+  typename PermuteLayout = layout::NoPermute  // forwarded to PredicatedTileAccessIterator
+>
+class PredicatedTileIterator;  // primary template: declared here, defined only through specializations
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIterator for pitch-linear data.
+///
+/// Satisfies: ForwardTileIteratorConcept | 
+///            ReadableContiguousTileIteratorConcept | 
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int AccessSize, bool Gather, typename PermuteLayout>
+class PredicatedTileIterator<Shape_, Element_, layout::PitchLinear, AdvanceRank,
+                             ThreadMap_, AccessSize, Gather, PermuteLayout> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  /// Type used for internal memory accesses
+  using AccessType = AlignedArray<Element, AccessSize, (AccessSize * sizeof_bits<Element>::value / 8)>;
+
+  /// Underlying iterator to compute the addresses
+  using TileAccessIterator =
+      PredicatedTileAccessIterator<Shape, Element, Layout, kAdvanceRank,
+                                   ThreadMap, AccessType, Gather, PermuteLayout>;
+
+  static int const kAccessesPerVector = TileAccessIterator::kAccessesPerVector;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount *
+                                               ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename TileAccessIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   public:
+    using Base = typename TileAccessIterator::Params::Base;
+
+    friend PredicatedTileIterator;
+
+   private:
+    /// Parameters object
+    typename TileAccessIterator::Params params_;
+
+   public:
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout) : params_(layout) {}
+
+    /// Default constructor
+    Params() = default;
+
+    CUTLASS_HOST_DEVICE
+    Params(Base const &base)
+        : params_(base) {}
+  };
+
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char *;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Data member to the tile access iterator
+  TileAccessIterator address_iterator_;
+
+ public:
+
+  /// Default constructor
+  PredicatedTileIterator() = default;
+
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset,
+      /// Gather indices
+      int const *indices = nullptr)
+      : address_iterator_(params.params_, pointer, extent, thread_id,
+                          threadblock_offset, indices) {}
+
+  /// Construct a PredicatedTileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : PredicatedTileIterator(params, pointer, extent, thread_id,
+                               make_Coord(0, 0)) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    address_iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator &operator++() {
+    if (kAdvanceRank)
+      address_iterator_.add_tile_offset({0, 1});
+    else
+      address_iterator_.add_tile_offset({1, 0});
+
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator operator++(int) {
+    PredicatedTileIterator self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { address_iterator_.clear_mask(enable); }
+
+  /// Enables the predicate set by forwarding to the address iterator's enable_mask()
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { address_iterator_.enable_mask(); }
+
+  /// Sets the predicate mask, overriding the value stored in the address iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { address_iterator_.set_mask(mask); }
+
+  /// Gets the mask: `mask` is an output parameter handed to the address iterator's get_mask()
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { address_iterator_.get_mask(mask); }
+
+  CUTLASS_DEVICE  // Loads a fragment from memory at a pointer offset given in units of Element
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    load_with_byte_offset(frag, static_cast<LongIndex>(pointer_offset) * sizeof_bits<Element>::value / 8);  // widen before scaling so the multiply cannot overflow Index
+  }
+
+  CUTLASS_DEVICE  // Loads a fragment from memory, displacing every access by byte_offset bytes
+  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
+
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);  // view the fragment as an array of AccessType
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kAccessesPerVector; ++v) {
+
+          int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);  // linear access index: v fastest, then c, then s
+          
+          address_iterator_.set_iteration_index(idx);  // position the address iterator on this access
+          char const *byte_ptr = reinterpret_cast<char const *>(address_iterator_.get()) + byte_offset;
+
+          AccessType const *access_ptr = reinterpret_cast<AccessType const *>(byte_ptr);
+
+          cutlass::arch::global_load<AccessType,
+                                     sizeof(AccessType)
+                                    >(
+              frag_ptr[idx], access_ptr, address_iterator_.valid());  // valid() is passed as the guard argument of the load
+
+          ++address_iterator_;  // step to the next access
+        }
+      }
+    }
+  }
+
+  /// Loads a fragment from memory with a zero byte offset
+  CUTLASS_DEVICE
+  void load(Fragment &frag) { load_with_byte_offset(frag, 0); }
+
+  /// Store a fragment to memory at a pointer offset given in units of Element
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    store_with_byte_offset(frag, static_cast<LongIndex>(pointer_offset) * sizeof_bits<Element>::value / 8);  // widen before scaling so the multiply cannot overflow Index
+  }
+
+  /// Store a fragment to memory, displacing every access by byte_offset bytes
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
+    address_iterator_.set_iteration_index(0);  // restart at the first access; the loops below rely on ++ to walk accesses in order
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);  // view the fragment as an array of AccessType
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kAccessesPerVector; ++v) {
+
+          int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);  // linear access index: v fastest, then c, then s
+
+          char *byte_ptr = reinterpret_cast<char *>(address_iterator_.get()) + byte_offset;
+          AccessType *access_ptr = reinterpret_cast<AccessType *>(byte_ptr);
+
+          if (address_iterator_.valid()) {  // skip the write when the iterator reports this access as invalid
+            *access_ptr = frag_ptr[idx];
+          }
+          ++address_iterator_;  // step to the next access
+        }
+      }
+    }
+  }
+
+  /// Store a fragment to memory with a zero byte offset
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) { store_with_byte_offset(frag, 0); }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIterator for column-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+/// Adapter: rows map to the pitch-linear contiguous dimension, columns to the strided one.
+template <
+  typename Shape_,
+  typename Element_,
+  int AdvanceRank,
+  typename ThreadMap_,
+  int AccessSize,
+  bool Gather,
+  typename PermuteLayout
+>
+class PredicatedTileIterator<Shape_, Element_, layout::ColumnMajor, AdvanceRank,
+                             ThreadMap_, AccessSize, Gather, PermuteLayout> {
+public:
+
+  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
+    "Specialization for pitch-linear iterator may advance along the "
+    "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  using UnderlyingIterator = PredicatedTileIterator<
+    layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
+    Element,
+    layout::PitchLinear,
+    (kAdvanceRank == 0 ? 0 : 1),
+    ThreadMap,
+    AccessSize,
+    Gather,
+    PermuteLayout
+  >;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+  private:
+
+    friend PredicatedTileIterator;
+
+    /// Parameters object of the underlying pitch-linear iterator
+    typename UnderlyingIterator::Params params_;
+
+  public:
+
+    /// Default constructor
+    Params() = default;
+
+    /// Construct the Params object given a column-major layout; stride(0) becomes the pitch-linear stride
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout): params_(layout::PitchLinear(layout.stride(0)))
+    {}
+
+    CUTLASS_HOST_DEVICE  // Construct the Params object from the underlying iterator's Params::Base
+    Params(typename UnderlyingIterator::Params::Base const &base)
+        : params_(base) {}
+  };
+
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+public:
+
+  /// Default constructor
+  PredicatedTileIterator() = default;
+
+  /// Constructs a TileIterator from its precomputed state, threadblock offset, and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator(
+    Params const &params,                         ///< Precomputed parameters object
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id,                                ///< ID of each participating thread
+    TensorCoord const &threadblock_offset,         ///< Initial offset of threadblock
+    int const *indices = nullptr     ///< gather/scatter indices, forwarded to the underlying pitch-linear iterator
+  ):
+    iterator_(
+      params.params_,
+      pointer,
+      layout::PitchLinearCoord(extent.row(), extent.column()),
+      thread_id,
+      layout::PitchLinearCoord(threadblock_offset.row(), threadblock_offset.column()),
+      indices)
+    { }
+
+  /// Construct a PredicatedTileIterator with zero threadblock offset by delegating to the full constructor
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator(
+    Params const &params,                         ///< Precomputed parameters object
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id                                 ///< ID of each participating thread
+  ): PredicatedTileIterator(params, pointer, extent, thread_id, make_Coord(0, 0)) { }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory (pre-increment) and returns *this.
+  ///
+  /// Forwards to the underlying iterator's pre-increment. Per the original note,
+  /// the first call updates predicates and moves to the first "steady state" tile;
+  /// that logic is not visible in this adapter - confirm in the underlying iterator.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory (post-increment).
+  ///
+  /// Copies the iterator, advances *this via the pre-increment operator, and returns
+  /// the copy taken before the advance. The by-value return copies the whole iterator,
+  /// so prefer the pre-increment form when the previous state is not needed.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator operator++(int) {
+    PredicatedTileIterator self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently; `enable` is forwarded to the underlying iterator
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  /// Enables the predicate set by forwarding to the underlying iterator's enable_mask()
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask: `mask` is an output parameter handed to the underlying iterator
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Loads a fragment from memory at a pointer offset (forwarded to the underlying iterator)
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory at a byte offset (forwarded to the underlying iterator)
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Loads a fragment from memory with a zero pointer offset
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Store a fragment to memory at a pointer offset (forwarded to the underlying iterator)
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Store a fragment to memory at a byte offset (forwarded to the underlying iterator)
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
+    iterator_.store_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Store a fragment to memory with a zero pointer offset
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIterator for row-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+/// Adapter: columns map to the pitch-linear contiguous dimension, rows to the strided one.
+template <
+  typename Shape_,
+  typename Element_,
+  int AdvanceRank,
+  typename ThreadMap_,
+  int AccessSize,
+  bool Gather,
+  typename PermuteLayout
+>
+class PredicatedTileIterator<Shape_, Element_, layout::RowMajor, AdvanceRank,
+                             ThreadMap_, AccessSize, Gather, PermuteLayout> {
+public:
+
+  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
+    "Specialization for pitch-linear iterator may advance along the "
+    "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  using UnderlyingIterator = PredicatedTileIterator<
+    layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
+    Element,
+    layout::PitchLinear,
+    (kAdvanceRank == 0 ? 1 : 0),
+    ThreadMap,
+    AccessSize,
+    Gather,
+    PermuteLayout
+  >;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+  private:
+
+    friend PredicatedTileIterator;
+
+    /// Parameters object of the underlying pitch-linear iterator
+    typename UnderlyingIterator::Params params_;
+
+  public:
+
+    /// Default constructor
+    Params() = default;
+
+    /// Construct the Params object given a row-major layout; stride(0) becomes the pitch-linear stride
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout): params_(layout::PitchLinear(layout.stride(0))) {}
+
+    CUTLASS_HOST_DEVICE  // Construct the Params object from the underlying iterator's Params::Base
+    Params(typename UnderlyingIterator::Params::Base const &base)
+        : params_(base) {}
+
+  };
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+public:
+
+  /// Default constructor
+  PredicatedTileIterator() = default;
+
+  /// Constructs a TileIterator from its precomputed state, threadblock offset, and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator(
+    Params const &params,                         ///< Precomputed parameters object
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id,                                ///< ID of each participating thread
+    TensorCoord const &threadblock_offset,        ///< Initial offset of threadblock
+    int const *indices = nullptr                        ///< Gather indices, forwarded to the underlying iterator
+  ):
+    iterator_(
+      params.params_,
+      pointer,
+      layout::PitchLinearCoord(extent.column(), extent.row()),
+      thread_id,
+      layout::PitchLinearCoord(threadblock_offset.column(), threadblock_offset.row()),
+      indices
+    ) { }
+
+  /// Construct a PredicatedTileIterator with zero threadblock offset by delegating to the full constructor
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator(
+    Params const &params,                         ///< Precomputed parameters object
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id                                 ///< ID of each participating thread
+  ): PredicatedTileIterator(params, pointer, extent, thread_id, make_Coord(0, 0)) { }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory (pre-increment) and returns *this.
+  ///
+  /// Forwards to the underlying iterator's pre-increment. Per the original note,
+  /// the first call updates predicates and moves to the first "steady state" tile;
+  /// that logic is not visible in this adapter - confirm in the underlying iterator.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory (post-increment).
+  ///
+  /// Copies the iterator, advances *this via the pre-increment operator, and returns
+  /// the copy taken before the advance. The by-value return copies the whole iterator,
+  /// so prefer the pre-increment form when the previous state is not needed.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator operator++(int) {
+    PredicatedTileIterator self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently; `enable` is forwarded to the underlying iterator
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  /// Enables the predicate set by forwarding to the underlying iterator's enable_mask()
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask: `mask` is an output parameter handed to the underlying iterator
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Loads a fragment from memory at a pointer offset (forwarded to the underlying iterator)
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory at a byte offset (forwarded to the underlying iterator)
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Loads a fragment from memory with a zero pointer offset
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Store a fragment to memory at a pointer offset (forwarded to the underlying iterator)
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Store a fragment to memory at a byte offset (forwarded to the underlying iterator)
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
+    iterator_.store_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Store a fragment to memory with a zero pointer offset
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIterator for affine rank-2 data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+/// Gather is fixed to false here; the `indices` constructor argument is accepted but ignored.
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int AccessSize>
+class PredicatedTileIterator<Shape_, Element_, layout::AffineRankN<2>, AdvanceRank,
+                             ThreadMap_, AccessSize, false> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::AffineRankN<2>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  /// Type used for internal memory accesses
+  using AccessType = AlignedArray<Element, AccessSize, (AccessSize * sizeof_bits<Element>::value / 8)>;
+
+  /// Underlying iterator to compute the addresses
+  using TileAccessIterator =
+      PredicatedTileAccessIterator<Shape, Element, Layout, kAdvanceRank,
+                                   ThreadMap, AccessType>;
+
+  static int const kAccessesPerVector = TileAccessIterator::kAccessesPerVector;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount *
+                                               ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename TileAccessIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   public:
+
+    friend PredicatedTileIterator;
+
+   private:
+    /// Parameters object of the tile access iterator
+    typename TileAccessIterator::Params params_;
+
+   public:
+    /// Construct the Params object given an AffineRankN<2> tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout) : params_(layout) {}
+
+    /// Default constructor
+    Params() = default;
+  };
+
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char *;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Data member to the tile access iterator
+  TileAccessIterator address_iterator_;
+
+ public:
+
+  /// Default constructor
+  PredicatedTileIterator() = default;
+
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset,
+      int const *indices = nullptr     ///< accepted for interface parity but ignored: no gather/scatter support in this specialization
+      )
+      : address_iterator_(params.params_, pointer, extent, thread_id,
+                          threadblock_offset) {}
+
+  /// Construct a PredicatedTileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : PredicatedTileIterator(params, pointer, extent, thread_id,
+                               make_Coord(0, 0)) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    address_iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory (pre-increment) and returns *this.
+  ///
+  /// Moves the address iterator by one tile: offset (0, 1) when kAdvanceRank is
+  /// non-zero, otherwise (1, 0). Per the original note, the first call also
+  /// updates predicates and moves to the first "steady state" tile; that logic
+  /// is not visible here and presumably lives in add_tile_offset() - confirm.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator &operator++() {
+    if (kAdvanceRank)
+      address_iterator_.add_tile_offset(make_Coord(0, 1));
+    else
+      address_iterator_.add_tile_offset(make_Coord(1, 0));
+
+    return *this;
+  }
+
+  /// Advances to the next tile in memory (post-increment).
+  ///
+  /// Copies the iterator, advances *this via the pre-increment operator, and
+  /// returns the copy taken before the advance. Because the result is returned
+  /// by value, every call copies the whole iterator; prefer the pre-increment
+  /// form when the previous state is not needed.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator operator++(int) {
+    PredicatedTileIterator self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { address_iterator_.clear_mask(enable); }
+
+  /// Enables the predicate set by forwarding to the address iterator's enable_mask()
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { address_iterator_.enable_mask(); }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { address_iterator_.set_mask(mask); }
+
+  /// Gets the mask: `mask` is an output parameter handed to the address iterator
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { address_iterator_.get_mask(mask); }
+
+  CUTLASS_DEVICE  // Loads a fragment from memory at a pointer offset given in units of Element
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    load_with_byte_offset(frag, static_cast<LongIndex>(pointer_offset) * sizeof_bits<Element>::value / 8);  // widen before scaling so the multiply cannot overflow Index
+  }
+
+  CUTLASS_DEVICE  // Loads a fragment from memory, displacing every access by byte_offset bytes
+  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
+
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);  // view the fragment as an array of AccessType
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kAccessesPerVector; ++v) {
+
+          int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);  // linear access index: v fastest, then c, then s
+
+          address_iterator_.set_iteration_index(idx);  // position the address iterator on this access
+          char const *byte_ptr = reinterpret_cast<char const *>(address_iterator_.get()) + byte_offset;
+
+          AccessType const *access_ptr = reinterpret_cast<AccessType const *>(byte_ptr);
+
+          cutlass::arch::global_load<AccessType,
+                                     sizeof(AccessType)
+                                    >(
+              frag_ptr[idx], access_ptr, address_iterator_.valid());  // valid() is passed as the guard argument of the load
+
+          ++address_iterator_;  // step to the next access
+        }
+      }
+    }
+  }
+
+  /// Loads a fragment from memory with a zero byte offset
+  CUTLASS_DEVICE
+  void load(Fragment &frag) { load_with_byte_offset(frag, 0); }
+
+  /// Store a fragment to memory at a pointer offset given in units of Element
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    store_with_byte_offset(frag, static_cast<LongIndex>(pointer_offset) * sizeof_bits<Element>::value / 8);  // widen before scaling so the multiply cannot overflow Index
+  }
+
+  /// Store a fragment to memory, displacing every access by byte_offset bytes
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
+    address_iterator_.set_iteration_index(0);  // restart at the first access; the loops below rely on ++ to walk accesses in order
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kAccessesPerVector; ++v) {
+
+          int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);  // linear access index: v fastest, then c, then s
+
+          char *byte_ptr = reinterpret_cast<char *>(address_iterator_.get()) + byte_offset;
+          AccessType *access_ptr = reinterpret_cast<AccessType *>(byte_ptr);
+
+          if (address_iterator_.valid()) {  // skip the write when the iterator reports this access as invalid
+            *access_ptr = frag_ptr[idx];
+          }
+          ++address_iterator_;  // step to the next access
+        }
+      }
+    }
+  }
+
+  /// Store a fragment to memory with a zero byte offset
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) { store_with_byte_offset(frag, 0); }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIterator for affine rank 2 column-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept | 
+///            ReadableContiguousTileIteratorConcept | 
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+  typename Shape_,
+  typename Element_,
+  int AdvanceRank,
+  typename ThreadMap_,
+  int AccessSize
+>
+class PredicatedTileIterator<Shape_, Element_, layout::AffineRank2ColumnMajor, AdvanceRank, ThreadMap_, AccessSize, false> {
+public:
+
+  static_assert(AdvanceRank == 0 || AdvanceRank == 1, 
+    "Specialization for pitch-linear iterator may along advance along the "
+    "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::AffineRank2ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  // Map to the underlying AffineRankN<2> layout
+  using UnderlyingIterator = PredicatedTileIterator<
+    layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
+    Element,
+    layout::AffineRankN<2>,
+    (kAdvanceRank == 0 ? 0 : 1),
+    ThreadMap,
+    AccessSize
+  >;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+  private:
+
+    friend PredicatedTileIterator;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+  public:
+
+    /// Default constructor
+    Params() = default;
+
+    /// Construct the Params object given an AffineRankN<2> tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout): params_(layout::AffineRankN<2>(layout.stride(0), layout.stride(1)))
+    {}
+  };
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Underlying AffineRankN<2> tile iterator
+  UnderlyingIterator iterator_;
+
+public:
+
+  /// Default constructor
+  PredicatedTileIterator() = default;
+
+  /// Constructs a TileIterator from its precomputed state, threadblock offset, and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator(
+    Params const &params,                         ///< Precomputed parameters object 
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id,                                ///< ID of each participating thread
+    TensorCoord const &threadblock_offset,         ///< Initial offset of threadblock
+    int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
+  ):
+    iterator_(
+      params.params_,
+      pointer,
+      layout::PitchLinearCoord(extent.row(), extent.column()),
+      thread_id,
+      layout::PitchLinearCoord(threadblock_offset.row(), threadblock_offset.column())
+    ) { }
+
+  /// Construct a PredicatedTileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator(
+    Params const &params,                         ///< Precomputed parameters object
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id                                 ///< ID of each participating thread
+  ): PredicatedTileIterator(params, pointer, extent, thread_id, make_Coord(0, 0)) { }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the iterator's
+  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
+  /// are lightweight and must only update the internal pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the iterator's
+  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
+  /// are lightweight and must only update the internal pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator operator++(int) {
+    PredicatedTileIterator self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Stores a fragment to memory; `pointer_offset` is forwarded unchanged to the underlying iterator
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Stores a fragment to memory; `byte_offset` is forwarded unchanged to the underlying iterator
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
+    iterator_.store_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Stores a fragment to memory at the iterator's current position (pointer offset of zero)
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIterator for affine rank 2 row-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept | 
+///            ReadableContiguousTileIteratorConcept | 
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+  typename Shape_,
+  typename Element_,
+  int AdvanceRank,
+  typename ThreadMap_,
+  int AccessSize
+>
+class PredicatedTileIterator<Shape_, Element_, layout::AffineRank2RowMajor, AdvanceRank, ThreadMap_, AccessSize, false> {
+public:
+
+  static_assert(AdvanceRank == 0 || AdvanceRank == 1, 
+    "Specialization for pitch-linear iterator may along advance along the "
+    "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::AffineRank2RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  // Map to the underlying AffineRankN<2> iterator: the tile shape is given as (kColumn, kRow) and the advance rank is flipped
+  using UnderlyingIterator = PredicatedTileIterator<
+    layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
+    Element,
+    layout::AffineRankN<2>,
+    (kAdvanceRank == 0 ? 1 : 0),
+    ThreadMap,
+    AccessSize
+  >;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+  private:
+
+    friend PredicatedTileIterator;
+
+    /// Parameters of the underlying AffineRankN<2> iterator (read by the enclosing iterator's constructor)
+    typename UnderlyingIterator::Params params_;
+
+  public:
+
+    /// Default constructor
+    Params() = default;
+
+    /// Construct from an AffineRank2RowMajor layout; stride(1) and stride(0) are handed to AffineRankN<2> in that (swapped) order
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout): params_(layout::AffineRankN<2>(layout.stride(1), layout.stride(0))) {}
+  };
+
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Underlying AffineRankN<2> tile iterator; every member function below delegates to it
+  UnderlyingIterator iterator_;
+
+public:
+
+  /// Default constructor
+  PredicatedTileIterator() = default;
+
+  /// Constructs a TileIterator from its precomputed state, threadblock offset, and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator(
+    Params const &params,                         ///< Precomputed parameters object 
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor, re-expressed below as (column, row)
+    int thread_id,                                ///< ID of each participating thread
+    TensorCoord const &threadblock_offset,         ///< Initial offset of threadblock, re-expressed below as (column, row)
+    int const *indices = nullptr     ///< gather/scatter indices; ignored (never forwarded) - no gather/scatter support in this specialization
+  ):
+    iterator_(
+      params.params_,
+      pointer,
+      layout::PitchLinearCoord(extent.column(), extent.row()),
+      thread_id,
+      layout::PitchLinearCoord(threadblock_offset.column(), threadblock_offset.row())
+    ) { }
+
+  /// Construct a PredicatedTileIterator with zero threadblock offset (delegates to the constructor above)
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator(
+    Params const &params,                         ///< Precomputed parameters object
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id                                 ///< ID of each participating thread
+  ): PredicatedTileIterator(params, pointer, extent, thread_id, make_Coord(0, 0)) { }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the iterator's
+  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
+  /// are lightweight and must only update the internal pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment: advances to the next tile in memory.
+  ///
+  /// Takes a copy of the iterator, advances *this through the pre-increment operator++(),
+  /// and returns the copy, i.e. the iterator state from before the advance. Prefer ++iter
+  /// when the previous state is not needed, since it avoids that copy.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator operator++(int) {
+    PredicatedTileIterator self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently; `enable` is forwarded unchanged to the underlying iterator
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  /// Enables the predicate set; the work is delegated to the underlying iterator's enable_mask()
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask: `mask` is an output argument filled in by the underlying iterator's get_mask()
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Loads a fragment from memory; `pointer_offset` is forwarded unchanged to the underlying iterator
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory; `byte_offset` is forwarded unchanged to the underlying iterator
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Loads a fragment from memory at the iterator's current position (pointer offset of zero)
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Stores a fragment to memory; `pointer_offset` is forwarded unchanged to the underlying iterator
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+  
+  /// Stores a fragment to memory; `byte_offset` is forwarded unchanged to the underlying iterator
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
+    iterator_.store_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Stores a fragment to memory at the iterator's current position (pointer offset of zero)
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIterator for column-major interleaved data.  It is
+/// mapped to the congruous (pitch-linear) layout.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int AccessSize, int InterleavedK>
+class PredicatedTileIterator<Shape_, Element_,
+                             layout::ColumnMajorInterleaved<InterleavedK>,
+                             AdvanceRank, ThreadMap_, AccessSize, false> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  static int const kInterleavedK = InterleavedK;
+  using Layout = layout::ColumnMajorInterleaved<kInterleavedK>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  using UnderlyingIterator = PredicatedTileIterator<
+      layout::PitchLinearShape<Shape::kRow * kInterleavedK,
+                               Shape::kColumn / kInterleavedK>,
+      Element, layout::PitchLinear, (kAdvanceRank == 0 ? 0 : 1), ThreadMap, AccessSize>;
+
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount *
+                                               ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileIterator;
+
+    /// Parameters of the underlying pitch-linear iterator (read by the enclosing iterator's constructor)
+    typename UnderlyingIterator::Params params_;
+
+   public:
+
+    /// Default constructor
+    Params() = default;
+
+    /// Construct the Params object from the interleaved layout, re-expressed as PitchLinear(layout.stride(0))
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::PitchLinear(layout.stride(0))) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const &base)
+        : params_(base) {}  // initialize the underlying iterator's Params directly from its Base
+
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator; every member function below delegates to it
+  UnderlyingIterator iterator_;
+
+ public:
+
+  /// Default constructor
+  PredicatedTileIterator() = default;
+
+  /// Constructs a TileIterator from its precomputed state, threadblock offset, and thread ID.
+  /// (row, column) coordinates are re-expressed as pitch-linear (row * K, column / K), K = kInterleavedK.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset,
+      int const *indices = nullptr     ///< gather/scatter indices; ignored (never forwarded) - no gather/scatter support in this specialization
+      )
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.row() * kInterleavedK,
+                                           extent.column() / kInterleavedK),
+                  thread_id,
+                  layout::PitchLinearCoord(
+                      threadblock_offset.row() * kInterleavedK,
+                      threadblock_offset.column() / kInterleavedK)) {}
+
+  /// Construct a PredicatedTileIterator with zero threadblock offset (delegates to the constructor above)
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : PredicatedTileIterator(params, pointer, extent, thread_id,
+                               make_Coord(0, 0)) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment: advances to the next tile in memory.
+  ///
+  /// Takes a copy of the iterator, advances *this through the pre-increment
+  /// operator++(), and returns the copy, i.e. the iterator state from before
+  /// the advance. Prefer ++iter when the previous state is not needed, since
+  /// it avoids that copy.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator operator++(int) {
+    PredicatedTileIterator self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently; `enable` is forwarded unchanged to the underlying iterator
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+
+  /// Enables the predicate set; the work is delegated to the underlying iterator's enable_mask()
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+
+  /// Gets the mask: `mask` is an output argument filled in by the underlying iterator's get_mask()
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+
+  /// Loads a fragment from memory; `pointer_offset` is forwarded unchanged to the underlying iterator
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory at the iterator's current position (pointer offset of zero)
+  CUTLASS_DEVICE
+  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }
+
+  /// Stores a fragment to memory; `pointer_offset` is forwarded unchanged to the underlying iterator
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Stores a fragment to memory at the iterator's current position (pointer offset of zero)
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIterator for row-major interleaved data.  It is
+/// mapped to the congruous (pitch-linear) layout.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int AccessSize, int InterleavedK>
+class PredicatedTileIterator<Shape_, Element_,
+                             layout::RowMajorInterleaved<InterleavedK>,
+                             AdvanceRank, ThreadMap_, AccessSize, false> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  static int const kInterleavedK = InterleavedK;
+  using Layout = layout::RowMajorInterleaved<kInterleavedK>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  using UnderlyingIterator = PredicatedTileIterator<
+      layout::PitchLinearShape<Shape::kColumn * kInterleavedK,
+                               Shape::kRow / kInterleavedK>,
+      Element, layout::PitchLinear, (kAdvanceRank == 0 ? 1 : 0), ThreadMap, AccessSize>;
+
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount *
+                                               ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileIterator;
+
+    /// Parameters of the underlying pitch-linear iterator (read by the enclosing iterator's constructor)
+    typename UnderlyingIterator::Params params_;
+
+   public:
+
+    /// Default constructor
+    Params() = default;
+
+    /// Construct the Params object from the interleaved layout, re-expressed as PitchLinear(layout.stride(0))
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::PitchLinear(layout.stride(0))) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const &base)
+        : params_(base) {}  // initialize the underlying iterator's Params directly from its Base
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator; every member function below delegates to it
+  UnderlyingIterator iterator_;
+
+ public:
+
+  /// Default constructor
+  PredicatedTileIterator() = default;
+
+  /// Constructs a TileIterator from its precomputed state, threadblock offset, and thread ID.
+  /// (row, column) coordinates are re-expressed as pitch-linear (column * K, row / K), K = kInterleavedK.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset,
+      int const *indices = nullptr     ///< gather/scatter indices; ignored (never forwarded) - no gather/scatter support in this specialization
+      )
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.column() * kInterleavedK,
+                                           extent.row() / kInterleavedK),
+                  thread_id,
+                  layout::PitchLinearCoord(
+                      threadblock_offset.column() * kInterleavedK,
+                      threadblock_offset.row() / kInterleavedK)) {}
+
+  /// Construct a PredicatedTileIterator with zero threadblock offset (delegates to the constructor above)
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : PredicatedTileIterator(params, pointer, extent, thread_id,
+                               make_Coord(0, 0)) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment: advances to the next tile in memory.
+  ///
+  /// Takes a copy of the iterator, advances *this through the pre-increment
+  /// operator++(), and returns the copy, i.e. the iterator state from before
+  /// the advance. Prefer ++iter when the previous state is not needed, since
+  /// it avoids that copy.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator operator++(int) {
+    PredicatedTileIterator self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently; `enable` is forwarded unchanged to the underlying iterator
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+
+  /// Enables the predicate set; the work is delegated to the underlying iterator's enable_mask()
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+
+  /// Gets the mask: `mask` is an output argument filled in by the underlying iterator's get_mask()
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+
+  /// Loads a fragment from memory; `pointer_offset` is forwarded unchanged to the underlying iterator
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory at the iterator's current position (pointer offset of zero)
+  CUTLASS_DEVICE
+  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }
+
+  /// Stores a fragment to memory; `pointer_offset` is forwarded unchanged to the underlying iterator
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Stores a fragment to memory at the iterator's current position (pointer offset of zero)
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace transform
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h
new file mode 100755
index 000000000..422ac45c3
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h
@@ -0,0 +1,787 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of tiles from pitch-linear rank=2 tensors. 
+
+    This iterator uses masks to guard out-of-bounds accesses and visits the last "residue" tile
+    first, with the objective of minimizing predicate mask updates during steady-state operation.
+
+    A precomputed "Params" object minimizes the amount of state that must be stored in registers,
+    and integer addition is used to advance the pointer through memory.
+*/
+
+#pragma once
+
+#include "cutlass/transform/threadblock/predicated_tile_access_iterator_2dthreadtile.h"
+#include "cutlass/transform/thread/transpose.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// PredicatedTileIterator2dThreadTile
+///
+/// Satisfies: ForwardTileIteratorConcept | 
+///            ReadableContiguousTileIteratorConcept | 
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+/// Regular tile iterator using a precomputed control structure to minimize register liveness
+/// and integer arithmetic.
+///
+/// Layout is assumed to be invariant at the time the precomputed "Params" object is constructed.
+///
+/// Base pointer and tensor extents may be specified at the time the iterator is constructed.
+/// Subsequently, they are assumed to be immutable.
+///
+/// Adding a logical coordinate offset may be performed at the time the iterator is constructed.
+/// Subsequent additions to logical coordinate offset may be performed but are relatively expensive.
+///
+/// Visitation order is intended to first visit a "residual" tile that may be partially full in
+/// both the advance dimension and the steady-state dimension. This is assumed to be the last
+/// tile in the iteration sequence. Advancing an iterator that has just been constructed moves to
+/// the first tile that is full in the advance dimension and recomputes predicates. Subsequent
+/// accesses may be performed without updating internal predicates and are efficient in terms of
+/// live register state and pointer arithmetic instructions.
+///
+/// To be efficient, this assumes the iterator will be dereferenced and advanced at least once
+/// outside any looping structure to minimize integer arithmetic. 
+///
+/// Accesses out of bounds are safe so long as `clear_mask()` is called prior to dereferencing
+/// the iterator.
+///
+///
+/// Example:
+///
+/// An efficient pipeline structure may be constructed as follows:
+///
+// template <typename Iterator>
+// __global__ void kernel(
+//   typename Iterator::Params params, 
+//   typename Iterator::Element *ptr,
+//   TensorCoord extent) {
+//
+//   typename Iterator::Fragment fragment;
+//
+//   TensorCoord threadblock_offset(0, 0);
+//
+//   Iterator iter(params, ptr, extent, threadIdx.x, threadblock_offset);
+//
+//
+//   fragment = *iter;        // load "residue" tile first
+//   ++iter;                  // advance to first "steady state" tile and update internal masks
+//
+//
+//   #pragma unroll
+//   for (int i = Remaining - 1; i >= 0; --i) {
+//
+//     f(fragment);
+//
+//     if (!i) {
+//       iter.clear_mask();   // light-weight operation to clear masks - subsequent loads become NO-OPs.
+//     }
+//  
+//     fragment = *iter;      // load tile during "steady state" phase
+//     ++iter;                // advance to next tile - lightweight due to steady-state masks
+//   }
+// }
+//
+// void host(TensorView<Element, 2, layout::PitchLinear> view) {
+//
+//   using Iterator = transform::threadblock::PredicatedTileIterator2dThreadTile;
+//
+//   typename Iterator::Params params(view.layout());
+//
+//   kernel<Iterator>(params, view.data());
+// }
+///
+///
+template <
+  typename Shape,           ///< tile shape
+  typename Element,         ///< element type
+  typename Layout,          ///< tensor layout
+  int AdvanceRank,          ///< rank along which operator++ advances
+  typename ThreadMap,       ///< thread map type
+  bool Transpose = false    ///< transpose flag, off by default
+>
+class PredicatedTileIterator2dThreadTile;  // primary template is only declared here, not defined
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIterator2dThreadTile for pitch-linear data.
+///
+/// Satisfies: ForwardTileIteratorConcept | 
+///            ReadableContiguousTileIteratorConcept | 
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank, typename ThreadMap_, bool Transpose_>
+class PredicatedTileIterator2dThreadTile<Shape_, Element_, layout::PitchLinear, AdvanceRank, ThreadMap_, Transpose_> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  /// Type used for internal memory accesses, aligned to the byte size of kElementsPerAccess elements.
+  /// The extra set of parentheses inside alignas() is needed for the VS compiler.
+  struct alignas((ThreadMap::kElementsPerAccess * sizeof_bits<Element>::value /
+                  8)) AccessType {
+
+    Array<Element, ThreadMap::kElementsPerAccess> storage;
+
+    static int const kElements = ThreadMap::kElementsPerAccess;
+  };
+
+  /// Optionally this fragment can be 4x4 transposed
+  using Transform = thread::Transpose< ThreadMap::Iterations::kCount * ThreadMap::ThreadAccessShape::kCount , layout::PitchLinearShape<4,4>, Element>;
+  static bool const transpose = Transpose_;
+
+  /// Underlying iterator to compute the addresses
+  using TileAccessIterator =
+      PredicatedTileAccessIterator2dThreadTile<Shape, Element, Layout, kAdvanceRank,
+                                   ThreadMap, AccessType>;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount *
+                                               ThreadMap::ThreadAccessShape::kCount>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename TileAccessIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   public:
+    using Base = typename TileAccessIterator::Params::Base;
+
+    friend PredicatedTileIterator2dThreadTile;
+
+   private:
+    /// Parameters of the tile access iterator (read by the enclosing iterator's constructor)
+    typename TileAccessIterator::Params params_;
+
+   public:
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout) : params_(layout) { }
+    /// Default constructor (user-provided with an empty body)
+    CUTLASS_HOST_DEVICE
+    Params() { }
+    /// Construct the Params object from the tile access iterator's Params::Base
+    CUTLASS_HOST_DEVICE
+    Params(Base const &base) 
+        : params_(base) {}
+  };
+
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char *;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Data member to the tile access iterator
+  TileAccessIterator address_iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID; all arguments except `indices` go to the tile access iterator
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator2dThreadTile(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset,
+      int const *indices = nullptr     ///< gather/scatter indices; ignored (never forwarded) - no gather/scatter support in this specialization
+      )
+      : address_iterator_(params.params_, pointer, extent, thread_id,
+                          threadblock_offset) {}
+
+  /// Construct a PredicatedTileIterator2dThreadTile with zero threadblock offset (delegates to the five-argument constructor)
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator2dThreadTile(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : PredicatedTileIterator2dThreadTile(params, pointer, extent, thread_id,
+                               make_Coord(0, 0)) {}
+
+  /// Adds a pointer offset in units of Element; the value is forwarded unchanged to the tile access iterator
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    address_iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator2dThreadTile &operator++() {
+    if (kAdvanceRank)  // non-zero advance rank: step one tile in the second tile coordinate
+      address_iterator_.add_tile_offset({0, 1});
+    else               // advance rank 0: step one tile in the first tile coordinate
+      address_iterator_.add_tile_offset({1, 0});
+
+    return *this;
+  }
+
+  /// Post-increment: advances to the next tile in memory.
+  ///
+  /// Takes a copy of the iterator, advances *this through the prefix
+  /// operator++(), and returns the copy, i.e. the state before the advance.
+  /// Prefer the prefix form when the previous state is not needed, since it
+  /// avoids the copy.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator2dThreadTile operator++(int) {
+    PredicatedTileIterator2dThreadTile self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently; `enable` is forwarded as-is to the access iterator's clear_mask()
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { address_iterator_.clear_mask(enable); }
+
+  /// Enables the predicate set efficiently (forwards to the access iterator's enable_mask())
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { address_iterator_.enable_mask(); }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator (forwards `mask` to the access iterator)
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { address_iterator_.set_mask(mask); }
+
+  /// Gets the mask into the caller-provided `mask`, which is handed to the access iterator by non-const reference
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { address_iterator_.get_mask(mask); }
+
+  /// Loads a fragment from memory, adding pointer_offset to every access pointer
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    // Treat the fragment as an array of AccessType so each access moves as one unit.
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int ts = 0; ts < ThreadMap::ThreadAccessShape::kStrided; ts++){
+          // Linear access index: ts varies fastest, then c, then s.
+          int access_idx = ts + c * ThreadMap::ThreadAccessShape::kStrided  + \
+              s * ThreadMap::Iterations::kContiguous * ThreadMap::ThreadAccessShape::kStrided;
+
+          address_iterator_.set_iteration_index(access_idx);
+          if (address_iterator_.valid()) {
+            // NOTE(review): the offset is applied by pointer arithmetic on get()'s result, so its unit is that pointee type - confirm.
+            frag_ptr[access_idx] =
+                *(address_iterator_.get() + pointer_offset);
+          }
+          // Step to the next access whether or not this one was predicated on; skipped entries of frag stay untouched.
+          ++address_iterator_;
+        }
+      }
+    }
+    // Optional post-pass: Transform rewrites frag in place when transpose is set.
+    if (transpose) {
+      Transform t;
+      t.transform(frag, frag);
+    }
+  }
+
+  /// Loads a fragment from memory with a zero pointer offset
+  CUTLASS_DEVICE
+  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }
+
+  /// Store a fragment to memory, adding pointer_offset to every access pointer
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    // Treat the fragment as a read-only array of AccessType, one entry per access.
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int ts = 0; ts < ThreadMap::ThreadAccessShape::kStrided; ts++){
+          // Linear access index: ts varies fastest, then c, then s.
+          int access_idx = ts + c * ThreadMap::ThreadAccessShape::kStrided  + \
+              s * ThreadMap::Iterations::kContiguous * ThreadMap::ThreadAccessShape::kStrided;
+          // Only accesses whose predicate is set are written; the iterator is stepped either way.
+          address_iterator_.set_iteration_index(access_idx);
+          if (address_iterator_.valid()) {
+            *(address_iterator_.get() + pointer_offset) = frag_ptr[access_idx];
+          }
+          ++address_iterator_;
+        }
+      }
+    }
+  }
+
+  /// Store a fragment to memory with a zero pointer offset
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIterator2dThreadTile for column-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept | 
+///            ReadableContiguousTileIteratorConcept | 
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+/// Thin adapter over the pitch-linear specialization: rows are passed first and columns second.
+template <
+  typename Shape_,
+  typename Element_,
+  int AdvanceRank,
+  typename ThreadMap_,
+  bool Transpose_
+>
+class PredicatedTileIterator2dThreadTile<Shape_, Element_, layout::ColumnMajor, AdvanceRank, ThreadMap_, Transpose_> {
+public:
+  // AdvanceRank must be 0 or 1; it is passed through unchanged to the underlying pitch-linear iterator.
+  static_assert(AdvanceRank == 0 || AdvanceRank == 1, 
+    "Specialization for pitch-linear iterator may along advance along the "
+    "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  static bool const Transpose = Transpose_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  /// Pitch-linear iterator that does the actual work: shape is <kRow, kColumn> and the advance rank is kept as given
+  using UnderlyingIterator = PredicatedTileIterator2dThreadTile<
+    layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
+    Element,
+    layout::PitchLinear,
+    (kAdvanceRank == 0 ? 0 : 1),
+    ThreadMap,
+    Transpose
+  >;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount * ThreadMap::ThreadAccessShape::kCount>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+  private:
+
+    friend PredicatedTileIterator2dThreadTile;
+
+    /// Parameters object of the underlying pitch-linear iterator
+    typename UnderlyingIterator::Params params_;
+
+  public:
+    
+    CUTLASS_HOST_DEVICE
+    Params() { }
+
+    /// Construct the Params object given a column-major tensor's layout (its stride(0) becomes the pitch-linear stride)
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout): params_(layout::PitchLinear(layout.stride(0))) {}
+    /// Construct the Params object from the underlying iterator's Params::Base
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const &base) 
+        : params_(base) {}
+  };
+
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+public:
+
+  /// Constructs a TileIterator from its precomputed state, threadblock offset, and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator2dThreadTile(
+    Params const &params,                         ///< Precomputed parameters object 
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id,                                ///< ID of each participating thread
+    TensorCoord const &threadblock_offset,         ///< Initial offset of threadblock
+    int const *indices = nullptr     ///< gather/scatter indices; accepted but ignored (never forwarded) - no gather/scatter support in this specialization
+  ):
+    iterator_(
+      params.params_,
+      pointer,
+      layout::PitchLinearCoord(extent.row(), extent.column()),
+      thread_id,
+      layout::PitchLinearCoord(threadblock_offset.row(), threadblock_offset.column())
+    ) { }
+
+  /// Construct a PredicatedTileIterator2dThreadTile with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator2dThreadTile(
+    Params const &params,                         ///< Precomputed parameters object
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id                                 ///< ID of each participating thread
+  ): PredicatedTileIterator2dThreadTile(params, pointer, extent, thread_id, make_Coord(0, 0)) { }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the iterator's
+  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
+  /// are lightweight and must only update the internal pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator2dThreadTile &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment: advances to the next tile in memory.
+  ///
+  /// Takes a copy of the iterator, advances *this through the prefix operator++(), and
+  /// returns the copy, i.e. the state before the advance. Prefer the prefix form when
+  /// the previous state is not needed, since it avoids the copy.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator2dThreadTile operator++(int) {
+    PredicatedTileIterator2dThreadTile self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  /// Enables the predicate set efficiently (forwards to the underlying iterator's enable_mask())
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Loads a fragment from memory (forwards fragment and offset to the underlying iterator)
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory with a zero pointer offset
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Store a fragment to memory (forwards fragment and offset to the underlying iterator)
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Store a fragment to memory with a zero pointer offset
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIterator2dThreadTile for row-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept | 
+///            ReadableContiguousTileIteratorConcept | 
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+/// Thin adapter over the pitch-linear specialization: columns are passed first and rows second.
+template <
+  typename Shape_,
+  typename Element_,
+  int AdvanceRank,
+  typename ThreadMap_,
+  bool Transpose_
+>
+class PredicatedTileIterator2dThreadTile<Shape_, Element_, layout::RowMajor, AdvanceRank, ThreadMap_, Transpose_> {
+public:
+  // AdvanceRank must be 0 or 1; it is flipped (0 <-> 1) before being given to the underlying pitch-linear iterator.
+  static_assert(AdvanceRank == 0 || AdvanceRank == 1, 
+    "Specialization for pitch-linear iterator may along advance along the "
+    "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  static bool const Transpose = Transpose_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  /// Pitch-linear iterator that does the actual work: shape is <kColumn, kRow> and the advance rank is flipped
+  using UnderlyingIterator = PredicatedTileIterator2dThreadTile<
+    layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
+    Element,
+    layout::PitchLinear,
+    (kAdvanceRank == 0 ? 1 : 0),
+    ThreadMap,
+    Transpose
+  >;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount * ThreadMap::ThreadAccessShape::kCount>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+  private:
+
+    friend PredicatedTileIterator2dThreadTile;
+
+    /// Parameters object of the underlying pitch-linear iterator
+    typename UnderlyingIterator::Params params_;
+
+  public:
+    
+    CUTLASS_HOST_DEVICE
+    Params() { } 
+
+    /// Construct the Params object given a row-major tensor's layout (its stride(0) becomes the pitch-linear stride)
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout): params_(layout::PitchLinear(layout.stride(0))) { }
+    /// Construct the Params object from the underlying iterator's Params::Base
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const &base) 
+        : params_(base) {}
+  };
+
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+public:
+
+  /// Constructs a TileIterator from its precomputed state, threadblock offset, and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator2dThreadTile(
+    Params const &params,                         ///< Precomputed parameters object 
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id,                                ///< ID of each participating thread
+    TensorCoord const &threadblock_offset,         ///< Initial offset of threadblock
+    int const *indices = nullptr     ///< gather/scatter indices; accepted but ignored (never forwarded) - no gather/scatter support in this specialization
+  ):
+    iterator_(
+      params.params_,
+      pointer,
+      layout::PitchLinearCoord(extent.column(), extent.row()),
+      thread_id,
+      layout::PitchLinearCoord(threadblock_offset.column(), threadblock_offset.row())
+    ) { }
+
+  /// Construct a PredicatedTileIterator2dThreadTile with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator2dThreadTile(
+    Params const &params,                         ///< Precomputed parameters object
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id                                 ///< ID of each participating thread
+  ): PredicatedTileIterator2dThreadTile(params, pointer, extent, thread_id, make_Coord(0, 0)) { }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the iterator's
+  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
+  /// are lightweight and must only update the internal pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator2dThreadTile &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment: advances to the next tile in memory.
+  ///
+  /// Takes a copy of the iterator, advances *this through the prefix operator++(), and
+  /// returns the copy, i.e. the state before the advance. Prefer the prefix form when
+  /// the previous state is not needed, since it avoids the copy.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator2dThreadTile operator++(int) {
+    PredicatedTileIterator2dThreadTile self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  /// Enables the predicate set efficiently (forwards to the underlying iterator's enable_mask())
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Loads a fragment from memory (forwards fragment and offset to the underlying iterator)
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory with a zero pointer offset
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Store a fragment to memory (forwards fragment and offset to the underlying iterator)
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Store a fragment to memory with a zero pointer offset
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace transform
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_iterator_triangular_matrix.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_iterator_triangular_matrix.h
new file mode 100755
index 000000000..8fea9ae02
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_iterator_triangular_matrix.h
@@ -0,0 +1,818 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of tiles from pitch-linear rank=2 tensors. 
+
+    This iterator uses masks to guard out-of-bounds accesses and visits the last "residue" tile
+    first, with the objective of minimizing predicate mask updates during steady-state operation.
+
+    A precomputed "Params" object minimizes the amount of state that must be stored in registers,
+    and integer addition is used to advance the pointer through memory.
+*/
+
+#pragma once
+
+#include "cutlass/arch/memory.h"
+#include "cutlass/transform/threadblock/predicated_tile_access_iterator_triangular_matrix.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// PredicatedTileIteratorTriangularMatrix
+///
+/// Satisfies: ForwardTileIteratorConcept | 
+///            ReadableContiguousTileIteratorConcept | 
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+/// Regular tile iterator using a precomputed control structure to minimize register liveness
+/// and integer arithmetic.
+///
+/// Layout is assumed to be invariant at the time the precomputed "Params" object is constructed.
+///
+/// Base pointer and tensor extents may be specified at the time the iterator is constructed.
+/// Subsequently, they are assumed to be immutable.
+///
+/// Adding a logical coordinate offset may be performed at the time the iterator is constructed.
+/// Subsequent additions to logical coordinate offset may be performed but are relatively expensive.
+///
+/// Visitation order is intended to first visit a "residual" tile that may be partially full in
+/// both the advance dimension and the steady-state dimension. This is assumed to be the last
+/// tile in the iteration sequence. Advancing an iterator that has just been constructed moves to
+/// the first tile that is full in the advance dimension and recomputes predicates. Subsequent
+/// accesses may be performed without updating internal predicates and are efficient in terms of
+/// live register state and pointer arithmetic instructions.
+///
+/// To be efficient, this assumes the iterator will be dereferenced and advanced at least once
+/// outside any looping structure to minimize integer arithmetic. 
+///
+/// Accesses out of bounds are safe so long as `clear_mask()` is called prior to dereferencing
+/// the iterator.
+///
+///
+/// Example:
+///
+/// An efficient pipeline structure may be constructed as follows:
+///
+// template <typename Iterator>
+// __global__ void kernel(
+//   typename Iterator::Params params, 
+//   typename Iterator::Element *ptr,
+//   TensorCoord extent) {
+//
+//   typename Iterator::Fragment fragment;
+//
+//   TensorCoord threadblock_offset(0, 0);
+//
+//   Iterator iter(params, ptr, extent, threadIdx.x, threadblock_offset);
+//
+//
+//   fragment = *iter;        // load "residue" tile first
+//   ++iter;                  // advance to first "steady state" tile and update internal masks
+//
+//
+//   #pragma unroll
+//   for (int i = Remaining - 1; i >= 0; --i) {
+//
+//     f(fragment);
+//
+//     if (!i) {
+//       iter.clear_mask();   // light-weight operation to clear masks - subsequent loads become NO-OPs.
+//     }
+//  
+//     fragment = *iter;      // load tile during "steady state" phase
+//     ++iter;                // advance to next tile - lightweight due to steady-state masks
+//   }
+// }
+//
+// void host(TensorView<Element, 2, layout::PitchLinear> view) {
+//
+//   using Iterator = transform::threadblock::PredicatedTileIteratorTriangularMatrix;
+//
+//   typename Iterator::Params params(view.layout());
+//
+//   kernel<Iterator>(params, view.data());
+// }
+///
+///
+template <
+  typename Shape,                ///< Tile shape, forwarded to the tile access iterator
+  typename Element,              ///< Element type held by Fragment and addressed through Pointer
+  typename Layout,               ///< Layout tag that selects the specialization (e.g. layout::PitchLinear, layout::ColumnMajor)
+  int AdvanceRank,               ///< Dimension advanced by operator++; the specializations static_assert it is 0 or 1
+  typename ThreadMap,            ///< Thread map providing Iterations and kElementsPerAccess
+  SideMode kSideMode, 
+  FillMode kFillMode, 
+  DiagType kDiagType,            ///< Side/fill/diag modes are forwarded to the triangular tile access iterator
+  int AccessSize = ThreadMap::kElementsPerAccess  ///< Number of Elements per memory access (sizes AccessType)
+>
+class PredicatedTileIteratorTriangularMatrix;  // primary template is declared only; specializations define it
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIteratorTriangularMatrix for pitch-linear data.
+///
+/// Satisfies: ForwardTileIteratorConcept | 
+///            ReadableContiguousTileIteratorConcept | 
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank, typename ThreadMap_, 
+          SideMode kSideMode, FillMode kFillMode, DiagType kDiagType, 
+          int AccessSize>
+class PredicatedTileIteratorTriangularMatrix<Shape_, Element_, layout::PitchLinear, AdvanceRank, ThreadMap_, 
+                                             kSideMode, kFillMode, kDiagType,
+                                             AccessSize> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  /// Type used for internal memory accesses
+  using AccessType = AlignedArray<Element, AccessSize, (AccessSize * sizeof_bits<Element>::value / 8)>;
+
+  /// Underlying iterator to compute the addresses
+  using TileAccessIterator =
+      PredicatedTileAccessIteratorTriangularMatrix<Shape, Element, Layout, kAdvanceRank,
+                                   ThreadMap, kSideMode, kFillMode, kDiagType, AccessType>;
+
+  static int const kAccessesPerVector = TileAccessIterator::kAccessesPerVector;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount *
+                                               ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename TileAccessIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   public:
+    friend PredicatedTileIteratorTriangularMatrix;
+
+   private:
+    /// Parameters object
+    typename TileAccessIterator::Params params_;
+
+   public:
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout) : params_(layout) { }
+    
+    CUTLASS_HOST_DEVICE
+    Params() { }
+  };
+
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char *;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Data member to the tile access iterator
+  TileAccessIterator address_iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorTriangularMatrix(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : address_iterator_(params.params_, pointer, extent, thread_id,
+                          threadblock_offset) {}
+
+  /// Construct a PredicatedTileIteratorTriangularMatrix with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorTriangularMatrix(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : PredicatedTileIteratorTriangularMatrix(params, pointer, extent, thread_id,
+                               make_Coord(0, 0)) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    address_iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorTriangularMatrix &operator++() {
+    if (kAdvanceRank)
+      address_iterator_.add_tile_offset({0, 1});
+    else
+      address_iterator_.add_tile_offset({1, 0});
+
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorTriangularMatrix operator++(int) {
+    PredicatedTileIteratorTriangularMatrix self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { address_iterator_.clear_mask(enable); }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { address_iterator_.enable_mask(); }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { address_iterator_.set_mask(mask); }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { address_iterator_.get_mask(mask); }
+
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    load_with_byte_offset(frag, pointer_offset * sizeof_bits<Element>::value / 8);
+  }
+
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
+
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kAccessesPerVector; ++v) {
+
+          int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
+          
+          address_iterator_.set_iteration_index(idx);
+          char const *byte_ptr = reinterpret_cast<char const *>(address_iterator_.get()) + byte_offset;
+
+          AccessType const *access_ptr = reinterpret_cast<AccessType const *>(byte_ptr);
+
+          cutlass::arch::global_load<AccessType,
+                                     sizeof(AccessType)
+                                    >(
+              frag_ptr[idx], access_ptr, address_iterator_.valid());
+
+          ++address_iterator_;
+        }
+      }
+    }
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment &frag) { load_with_byte_offset(frag, 0); }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    store_with_byte_offset(frag, pointer_offset * sizeof_bits<Element>::value / 8);
+  }
+
+  /// Stores a fragment to memory; byte_offset is added to the address of every access
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
+    address_iterator_.set_iteration_index(0);
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kAccessesPerVector; ++v) {
+          // Fragment slot for this access: v varies fastest, then c, then s.
+          int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
+          // The address iterator was reset to index 0 above and is stepped once per access below.
+          char *byte_ptr = reinterpret_cast<char *>(address_iterator_.get()) + byte_offset;
+          AccessType *access_ptr = reinterpret_cast<AccessType *>(byte_ptr);
+          // Skip the write when the address iterator reports this access as invalid.
+          if (address_iterator_.valid()) {
+            *access_ptr = frag_ptr[idx];
+          }
+          ++address_iterator_;
+        }
+      }
+    }
+  }
+
+  /// Stores a fragment to memory with a zero byte offset
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) { store_with_byte_offset(frag, 0); }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIteratorTriangularMatrix for column-major data.
+/// Adapter: maps (row, column) onto pitch-linear (contiguous, strided) and forwards every call.
+/// Satisfies: ForwardTileIteratorConcept | 
+///            ReadableContiguousTileIteratorConcept | 
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+  typename Shape_,
+  typename Element_,
+  int AdvanceRank,
+  typename ThreadMap_,
+  SideMode kSideMode, 
+  FillMode kFillMode, 
+  DiagType kDiagType,
+  int AccessSize
+>
+class PredicatedTileIteratorTriangularMatrix<Shape_, Element_, layout::ColumnMajor, AdvanceRank, ThreadMap_, 
+                                              kSideMode, kFillMode, kDiagType,
+                                              AccessSize> {
+public:
+
+  static_assert(AdvanceRank == 0 || AdvanceRank == 1, 
+    "Specialization for pitch-linear iterator may along advance along the "
+    "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  /// Pitch-linear iterator that does the work; rows map to contiguous, columns to strided, advance rank is kept
+  using UnderlyingIterator = PredicatedTileIteratorTriangularMatrix<
+    layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
+    Element,
+    layout::PitchLinear,
+    (kAdvanceRank == 0 ? 0 : 1),
+    ThreadMap,
+    kSideMode, 
+    kFillMode, 
+    kDiagType,
+    AccessSize
+  >;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+  private:
+
+    friend PredicatedTileIteratorTriangularMatrix;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+  public:
+    
+    CUTLASS_HOST_DEVICE
+    Params() { }
+
+    /// Construct the Params object from a column-major layout; layout.stride(0) is used as the pitch-linear stride
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout): params_(layout::PitchLinear(layout.stride(0))) {
+
+    }
+  };
+
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+public:
+
+  /// Constructs a TileIterator from its precomputed state, threadblock offset, and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorTriangularMatrix(
+    Params const &params,                         ///< Precomputed parameters object 
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id,                                ///< ID of each participating thread
+    TensorCoord const &threadblock_offset         ///< Initial offset of threadblock
+  ):
+    iterator_(
+      params.params_,
+      pointer,
+      layout::PitchLinearCoord(extent.row(), extent.column()),
+      thread_id,
+      layout::PitchLinearCoord(threadblock_offset.row(), threadblock_offset.column())
+    ) { }
+
+  /// Construct a PredicatedTileIteratorTriangularMatrix with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorTriangularMatrix(
+    Params const &params,                         ///< Precomputed parameters object
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id                                 ///< ID of each participating thread
+  ): PredicatedTileIteratorTriangularMatrix(params, pointer, extent, thread_id, make_Coord(0, 0)) { }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the iterator's
+  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
+  /// are lightweight and must only update the internal pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorTriangularMatrix &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment: advances to the next tile in memory.
+  ///
+  /// Copies the iterator, applies the pre-increment operator to *this, and returns
+  /// the copy, i.e. the state before the advance. Prefer pre-increment when the
+  /// previous state is not needed, since this form copies the whole iterator.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorTriangularMatrix operator++(int) {
+    PredicatedTileIteratorTriangularMatrix self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently (forwards `enable` to the underlying iterator)
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  /// Forwards to the underlying iterator's enable_mask()
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Loads a fragment from memory; pointer_offset is forwarded unchanged to the underlying iterator
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory; byte_offset is forwarded unchanged to the underlying iterator
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Loads a fragment from memory with a zero pointer offset
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Stores a fragment to memory; pointer_offset is forwarded unchanged to the underlying iterator
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Stores a fragment to memory; byte_offset is forwarded unchanged to the underlying iterator
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
+    iterator_.store_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Stores a fragment to memory with a zero pointer offset
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIteratorTriangularMatrix for row-major data.
+/// Adapter: transposes (row, column) to pitch-linear (contiguous = column, strided = row) and forwards every call.
+/// Satisfies: ForwardTileIteratorConcept | 
+///            ReadableContiguousTileIteratorConcept | 
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+  typename Shape_,
+  typename Element_,
+  int AdvanceRank,
+  typename ThreadMap_,
+  SideMode kSideMode, 
+  FillMode kFillMode, 
+  DiagType kDiagType,
+  int AccessSize
+>
+class PredicatedTileIteratorTriangularMatrix<Shape_, Element_, layout::RowMajor, AdvanceRank, ThreadMap_, 
+                                            kSideMode, kFillMode, kDiagType,
+                                            AccessSize> {
+public:
+
+  static_assert(AdvanceRank == 0 || AdvanceRank == 1, 
+    "Specialization for pitch-linear iterator may along advance along the "
+    "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  /// Pitch-linear iterator that does the work; shape and advance rank are transposed (rows are the strided dimension)
+  using UnderlyingIterator = PredicatedTileIteratorTriangularMatrix<
+    layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
+    Element,
+    layout::PitchLinear,
+    (kAdvanceRank == 0 ? 1 : 0),
+    ThreadMap,
+    kSideMode, 
+    kFillMode, 
+    kDiagType,
+    AccessSize
+  >;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+  private:
+
+    friend PredicatedTileIteratorTriangularMatrix;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+  public:
+    
+    CUTLASS_HOST_DEVICE
+    Params() { } 
+
+    /// Construct the Params object from a row-major layout; layout.stride(0) is used as the pitch-linear stride
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout): params_(layout::PitchLinear(layout.stride(0))) {
+
+    }
+  };
+
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+public:
+
+  /// Constructs a TileIterator from its precomputed state, threadblock offset, and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorTriangularMatrix(
+    Params const &params,                         ///< Precomputed parameters object 
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id,                                ///< ID of each participating thread
+    TensorCoord const &threadblock_offset         ///< Initial offset of threadblock
+  ):
+    iterator_(
+      params.params_,
+      pointer,
+      layout::PitchLinearCoord(extent.column(), extent.row()),
+      thread_id,
+      layout::PitchLinearCoord(threadblock_offset.column(), threadblock_offset.row())
+    ) { }
+
+  /// Construct a PredicatedTileIteratorTriangularMatrix with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorTriangularMatrix(
+    Params const &params,                         ///< Precomputed parameters object
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id                                 ///< ID of each participating thread
+  ): PredicatedTileIteratorTriangularMatrix(params, pointer, extent, thread_id, make_Coord(0, 0)) { }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the iterator's
+  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
+  /// are lightweight and must only update the internal pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorTriangularMatrix &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment: advances to the next tile in memory.
+  ///
+  /// Copies the iterator, applies the pre-increment operator to *this, and returns
+  /// the copy, i.e. the state before the advance. Prefer pre-increment when the
+  /// previous state is not needed, since this form copies the whole iterator.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorTriangularMatrix operator++(int) {
+    PredicatedTileIteratorTriangularMatrix self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently (forwards `enable` to the underlying iterator)
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  /// Forwards to the underlying iterator's enable_mask()
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Loads a fragment from memory; pointer_offset is forwarded unchanged to the underlying iterator
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory; byte_offset is forwarded unchanged to the underlying iterator
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Loads a fragment from memory with a zero pointer offset
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Stores a fragment to memory; pointer_offset is forwarded unchanged to the underlying iterator
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+  
+  /// Stores a fragment to memory; byte_offset is forwarded unchanged to the underlying iterator
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
+    iterator_.store_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Stores a fragment to memory with a zero pointer offset
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace transform
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_vector_access_iterator.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_vector_access_iterator.h
new file mode 100755
index 000000000..391f94b97
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_vector_access_iterator.h
@@ -0,0 +1,417 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Templates that compute the addresses used to load small vectors
+    from global memory.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/tensor_ref.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// PredicatedVectorAccessIterator
+/// Primary template, declared only; behavior is supplied by the layout-specific partial specializations.
+template <
+    /// Shape of the vector accessed by the entire threadblock
+    typename Shape,
+    /// Shape of the vector accessed by the warp
+    typename WarpShape,
+    /// Type of Element
+    typename Element,
+    /// Layout of the vector
+    typename Layout,
+    /// Number of elements for each access
+    int ElementsPerAccess,
+    /// Support residual tile
+    bool EnableResidualAccess = false
+>
+class PredicatedVectorAccessIterator;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Vector access iterator specialized for vectors, e.g. scale and bias
+/// Thread arrangements are for TensorOps
+///
+template <
+  typename Shape_, 
+  typename WarpShape_, 
+  typename Element_, 
+  int ElementsPerAccess, 
+  bool EnableResidualAccess
+>
+class PredicatedVectorAccessIterator <
+  Shape_,
+  WarpShape_,
+  Element_,
+  layout::PitchLinear,
+  ElementsPerAccess,
+  EnableResidualAccess
+> {
+  public:
+
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ConstPointer = const Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  /// Elements per access come from the template parameter (not derived from a fixed 128-bit access width)
+  static int const kElementsPerAccess = ElementsPerAccess;
+  static int const kThreads = 32;
+  static int const kRowsPerIteration = 8;
+  static int const kThreadsPerRow = kThreads / kRowsPerIteration;
+  static int const kThreadsPerRowMask = 0x3;
+  static int const kIterations = WarpShape::kContiguous / (kThreadsPerRow * kElementsPerAccess); 
+  static int const kWarpCountStrided = Shape::kStrided / WarpShape::kStrided;
+
+  using AccessType = AlignedArray<Element, kElementsPerAccess>;
+
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char *;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Internal pointer to first access of tile
+  BytePointer pointer_;
+
+  /// Extent of tensor
+  TensorCoord extent_;
+
+  /// pointer offset of each thread
+  TensorCoord thread_offset_;
+
+  /// iteration index
+  LongIndex iteration_;
+
+  /// residual access
+  bool is_residual_;
+
+  /// residual offset of each thread
+  TensorCoord residual_offset_;
+
+ public:
+  /// Constructs a vector access iterator
+  CUTLASS_HOST_DEVICE
+  PredicatedVectorAccessIterator(
+    /// Pointer to the start of the vector
+    ConstPointer pointer,
+    /// Extent of vector
+    TensorCoord extent,
+    /// ID of each participating thread
+    int thread_id,
+    /// ID of each participating warp
+    int warp_id,
+    /// Initial offset of threadblock
+    TensorCoord const &threadblock_offset)
+    : pointer_(reinterpret_cast<BytePointer>(
+                       const_cast<NonConstPointer>(pointer))),
+      extent_(extent),
+      is_residual_(false) {
+
+    // Contiguous-dimension start of this warp's slice, in elements
+    int warp_offset = (warp_id / kWarpCountStrided) * WarpShape::kContiguous;
+
+    // Per-thread offset in logical coordinates of tensor
+
+    thread_offset_ = threadblock_offset + TensorCoord(warp_offset, 0) +
+        TensorCoord((thread_id & kThreadsPerRowMask) * kElementsPerAccess, 0);
+
+    set_iteration_index(0);
+
+    if(EnableResidualAccess) {
+      // compute residual offset
+      typename TensorCoord::Index residual_size = extent_.contiguous() % WarpShape::kContiguous;
+      if (residual_size) {
+        is_residual_ = true;
+        residual_offset_ = make_Coord(residual_size, 0);
+      }
+    }
+  }
+
+  /// Construct a PredicatedVectorAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedVectorAccessIterator(
+    /// Pointer to start of vector
+    ConstPointer pointer,
+    /// Extent of vector
+    TensorCoord extent,
+    /// ID of each participating thread
+    int thread_id,
+    /// ID of each participating warp
+    int warp_id)
+    : PredicatedVectorAccessIterator(pointer, extent, thread_id, warp_id,
+                                     make_Coord(0, 0)) {}
+
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    iteration_ = index;
+  }
+
+  /// Advances along the contiguous dimension in units of WarpShape::kContiguous (strided component is ignored)
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(
+      TensorCoord const &tile_offset) {
+
+    thread_offset_ =
+        thread_offset_ +
+        TensorCoord(WarpShape::kContiguous * tile_offset.contiguous(), 0);
+  }
+
+  /// Returns a pointer to the current access: pointer_ plus the thread and iteration offsets converted to bytes
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+
+    return reinterpret_cast<AccessType *>(
+        pointer_ +
+        ((thread_offset_.contiguous() + iteration_ * kThreadsPerRow * kElementsPerAccess) 
+        * sizeof_bits<Element>::value / 8));
+  }
+
+  /// Steps the iteration index, wrapping back to 0 once it reaches kIterations; returns *this
+  CUTLASS_HOST_DEVICE
+  PredicatedVectorAccessIterator &operator++() {
+    ++iteration_;
+    if(iteration_ >= kIterations)
+      iteration_ = 0; 
+
+    return *this;
+  }
+
+  /// Moves to the next tile: applies the residual offset once if pending, otherwise one whole tile
+  CUTLASS_HOST_DEVICE
+  void advance() {
+    if(EnableResidualAccess && is_residual_) {
+      is_residual_ = false;
+      thread_offset_ += residual_offset_; 
+    }
+    else
+      add_tile_offset(TensorCoord(1, 0));
+  }
+
+  /// Post-increment: steps the iteration index and returns a copy of the previous state
+  CUTLASS_HOST_DEVICE
+  PredicatedVectorAccessIterator operator++(int) {
+    PredicatedVectorAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Returns true when the current access starts inside the contiguous extent
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return ((thread_offset_.contiguous() + 
+              iteration_ * kThreadsPerRow * kElementsPerAccess) < extent_.contiguous());
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedVectorAccessIterator for row-major data.
+/// Adapter: transposes (row, column) to pitch-linear (contiguous = column, strided = row) and forwards every call.
+template <
+  typename Shape_,
+  typename WarpShape_,
+  typename Element_,
+  int ElementsPerAccess,
+  bool EnableResidualAccess
+>
+class PredicatedVectorAccessIterator<
+  Shape_,
+  WarpShape_,
+  Element_,
+  layout::RowMajor,
+  ElementsPerAccess,
+  EnableResidualAccess
+> {
+ public:
+
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ConstPointer = const Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  using UnderlyingIterator = PredicatedVectorAccessIterator<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, 
+      layout::PitchLinearShape<WarpShape::kColumn, WarpShape::kRow>, 
+      Element,
+      layout::PitchLinear,
+      ElementsPerAccess,
+      EnableResidualAccess>;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+  static int const kElementsPerAccess = UnderlyingIterator::kElementsPerAccess;
+  static int const kRowsPerIteration = UnderlyingIterator::kRowsPerIteration;
+  static int const kThreads = UnderlyingIterator::kThreads;
+  static int const kIterations = UnderlyingIterator::kIterations;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs the iterator from a pointer, extent, thread and warp IDs and a
+  /// threadblock offset; extent and offset are transposed to pitch-linear coordinates
+  CUTLASS_HOST_DEVICE
+  PredicatedVectorAccessIterator(
+      /// Pointer to the start of the vector
+      ConstPointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// ID of each participating warp
+      int warp_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : iterator_(pointer, layout::PitchLinearCoord(extent.column(), extent.row()),
+                  thread_id, warp_id,
+                  layout::PitchLinearCoord(threadblock_offset.column(),
+                                           threadblock_offset.row())) {}
+
+  /// Construct a PredicatedVectorAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedVectorAccessIterator(
+      ConstPointer pointer,   ///< Pointer to the start of the vector
+      TensorCoord extent,     ///< Extent of tensor
+      int thread_id,          ///< ID of each participating thread
+      int warp_id             ///< ID of each participating warp
+      )
+      : PredicatedVectorAccessIterator(pointer, extent, thread_id, warp_id, 
+                                        make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles; the (row, column) tile offset is passed on as (column, row)
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
+  }
+
+  /// Returns a pointer to the current access of the underlying iterator
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  /// Advances to the next access within the current tile.
+  ///
+  /// Forwards to the underlying pitch-linear iterator's pre-increment, which
+  /// steps its iteration index and wraps it back to zero after kIterations
+  /// accesses. It does not move to another tile; use advance() for that.
+  /// Returns a reference to this iterator.
+  CUTLASS_HOST_DEVICE
+  PredicatedVectorAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment form of the access step.
+  ///
+  /// Copies the iterator, applies the pre-increment operator to *this, and
+  /// returns the copy, i.e. the state before the step. Prefer pre-increment
+  /// when the previous state is not needed, since this form copies the
+  /// iterator.
+  CUTLASS_HOST_DEVICE
+  PredicatedVectorAccessIterator operator++(int) {
+    PredicatedVectorAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Moves to the next tile by forwarding to the underlying iterator's advance()
+  CUTLASS_HOST_DEVICE
+  void advance() {
+    iterator_.advance();
+  }
+
+  /// Returns whether the current access is valid, as reported by the underlying iterator
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace transform 
+}  // namespace cutlass
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_scale_bias_vector_access_iterator.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_scale_bias_vector_access_iterator.h
new file mode 100755
index 000000000..f5906d828
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_scale_bias_vector_access_iterator.h
@@ -0,0 +1,253 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Templates that compute the addresses at which small scale and bias
+   vectors are stored in shared memory.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/tensor_ref.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Primary template of RegularScaleBiasVectorAccessIterator, declared only.
+/// It is defined through the layout::PitchLinear and layout::RowMajor specializations.
+template <typename Shape, typename Element, typename Layout>
+class RegularScaleBiasVectorAccessIterator;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of RegularScaleBiasVectorAccessIterator for
+/// layout::PitchLinear. It stores a per-thread pointer and a byte offset; the
+/// iteration index is ignored and prefix operator++ changes nothing.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_>
+class RegularScaleBiasVectorAccessIterator<Shape_, Element_, layout::PitchLinear> {
+ public:
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  /// kElementsPerAccess is 128 divided by the element's size in bits
+  static int const kElementsPerAccess = 128 / sizeof_bits<Element>::value;
+  static int const kThreads = Shape::kContiguous / kElementsPerAccess;
+  using AccessType = Array<Element, kElementsPerAccess>;
+
+ private:
+  //
+  // State
+  //
+
+  /// Pointer to this thread's first access, set once by the constructor
+  AccessType *pointer_;
+
+  /// Offset in bytes that get() adds to pointer_
+  Index byte_offset_;
+
+ public:
+  /// Builds the iterator for one thread; the byte offset starts at zero
+  CUTLASS_HOST_DEVICE
+  RegularScaleBiasVectorAccessIterator(
+      TensorRef scale_bias_ref,  ///< Reference to the start of the scale and
+                                 ///< bias vector
+      int thread_id              ///< ID of each participating thread
+      )
+      // Thread t starts t * kElementsPerAccess elements past the data pointer.
+      : pointer_(reinterpret_cast<AccessType *>(
+            scale_bias_ref.data() + thread_id * kElementsPerAccess)),
+        byte_offset_(0) {
+
+    // set_iteration_index() has an empty body in this specialization, so
+    // this call has no effect.
+    set_iteration_index(0);
+  }
+
+  /// Takes an iteration index and ignores it (the body is empty)
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {}
+
+  /// Adds a pointer offset given in units of Element; it is kept in bytes
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    byte_offset_ += pointer_offset * sizeof(Element);
+  }
+
+  /// Returns pointer_ displaced by byte_offset_ bytes
+  CUTLASS_DEVICE
+  AccessType *get() const {
+    // The displacement is applied to a char pointer because byte_offset_
+    // counts bytes rather than AccessType objects.
+    char *base_bytes = reinterpret_cast<char *>(pointer_);
+
+    return reinterpret_cast<AccessType *>(base_bytes + byte_offset_);
+  }
+
+  /// Prefix increment: changes no state and returns *this
+  CUTLASS_HOST_DEVICE
+  RegularScaleBiasVectorAccessIterator &operator++() { return *this; }
+
+  /// Postfix increment: returns a copy made before the prefix increment runs
+  CUTLASS_HOST_DEVICE
+  RegularScaleBiasVectorAccessIterator operator++(int) {
+    RegularScaleBiasVectorAccessIterator snapshot(*this);
+
+    ++(*this);
+    return snapshot;
+  }
+
+  /// Adds a tile offset in the unit of tile.
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    // The scale and the bias of one stage are stored next to each other, so
+    // one tile step covers twice Shape::kContiguous elements.
+    add_pointer_offset(2 * Shape::kContiguous * coord.contiguous());
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of RegularScaleBiasVectorAccessIterator for layout::RowMajor.
+///
+/// Every operation is forwarded to an iterator of the layout::PitchLinear
+/// specialization; tile offsets are forwarded as (column, row).
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_>
+class RegularScaleBiasVectorAccessIterator<Shape_, Element_, layout::RowMajor> {
+ public:
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  /// Wrapped pitch-linear iterator type, instantiated with the shape
+  /// PitchLinearShape<Shape::kColumn, Shape::kRow>
+  using UnderlyingIterator = RegularScaleBiasVectorAccessIterator<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
+      layout::PitchLinear>;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+ private:
+
+  /// The wrapped pitch-linear iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Builds the wrapped iterator from the reference's data pointer and stride
+  CUTLASS_HOST_DEVICE
+  RegularScaleBiasVectorAccessIterator(
+      TensorRef scale_bias_ref,  ///< Reference to the scale and bias vector
+      int thread_id              ///< ID of each participating thread
+      )
+      : iterator_({scale_bias_ref.data(), scale_bias_ref.stride()}, thread_id) {}
+
+  /// Forwards the iteration index to the wrapped iterator
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Returns the wrapped iterator's pointer.
+  /// NOTE(review): the wrapped get() is CUTLASS_DEVICE while this one is
+  /// CUTLASS_HOST_DEVICE - confirm that host-side calls are not expected.
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  /// Adds a tile offset, handing it on as (column, row)
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.column(), coord.row()});
+  }
+
+  /// Prefix increment: advances the wrapped iterator and returns *this
+  CUTLASS_HOST_DEVICE
+  RegularScaleBiasVectorAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Postfix increment: returns a copy taken before advancing
+  CUTLASS_HOST_DEVICE
+  RegularScaleBiasVectorAccessIterator operator++(int) {
+    RegularScaleBiasVectorAccessIterator snapshot(*this);
+    ++(*this);
+    return snapshot;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace transform 
+}  // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator.h
new file mode 100755
index 000000000..d0992d441
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator.h
@@ -0,0 +1,58 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Declaration of the template that computes the addresses at which
+   tiles from pitch-linear rank=2 tensors are stored.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Primary template (declared only). Default Alignment: bits per access / 8.
+template <typename Shape, typename Element, typename Layout, int AdvanceRank,
+          typename ThreadMap, int Alignment = sizeof_bits<Element>::value *
+                                              ThreadMap::kElementsPerAccess / 8>
+class RegularTileAccessIterator;
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace transform
+}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h
new file mode 100755
index 000000000..fa02b008b
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h
@@ -0,0 +1,408 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates that compute the addresses at which tiles from
+   pitch-linear rank=2 tensors are stored.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/tensor_ref.h"
+
+#include "cutlass/transform/threadblock/regular_tile_access_iterator.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for layout::PitchLinear: a per-thread base pointer, a byte
+/// offset and a (contiguous, strided) iteration index select each access.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment>
+class RegularTileAccessIterator<
+    Shape_, Element_,
+    layout::PitchLinear,
+    AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may only advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using StrideIndex = typename Layout::Stride::Index;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ThreadMap = ThreadMap_;
+
+  /// Type of one access: ThreadMap::kElementsPerAccess elements
+  using AccessType = Array<Element, ThreadMap::kElementsPerAccess>;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Stride in units of AccessType (ref.stride(0) / kElementsPerAccess)
+  StrideIndex stride_;
+
+  /// Pointer to this thread's first access of the tile
+  AccessType *pointer_;
+
+  /// Offset in bytes that get() adds on top of the selected access
+  Index byte_offset_;
+
+  /// Current iteration index along the contiguous dimension
+  int iteration_contiguous_;
+
+  /// Current iteration index along the strided dimension
+  int iteration_strided_;
+
+ public:
+  /// Builds the iterator for one thread; the byte offset starts at zero
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
+                            int thread_id   ///< ID of each participating thread
+                            )
+      : stride_(ref.stride(0) / ThreadMap::kElementsPerAccess),
+        byte_offset_(0) {
+
+    layout::PitchLinearCoord thread_offset_base = ThreadMap::initial_offset(thread_id);
+
+    // Base pointer: start of the tensor plus this thread's initial offset
+    pointer_ = reinterpret_cast<AccessType *>(ref.data() + ref.offset(thread_offset_base));
+
+    set_iteration_index(0);
+  }
+
+  /// Splits a linear access index into contiguous and strided iteration indices
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element (accumulated in bytes)
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    byte_offset_ += pointer_offset * sizeof(Element);
+  }
+
+  /// Returns the address of the access selected by the iteration indices
+  CUTLASS_DEVICE
+  AccessType *get() const {
+    // Typed offset in AccessType units first, then byte_offset_ in bytes.
+    AccessType *access_ptr = pointer_;
+
+    int access_offset = iteration_strided_ * ThreadMap::Delta::kStrided * stride_ +
+                        iteration_contiguous_ * ThreadMap::Delta::kContiguous /
+                            ThreadMap::kElementsPerAccess;
+
+    char *access_byte_ptr =
+        reinterpret_cast<char *>(access_ptr + access_offset);
+
+    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_);
+  }
+
+  /// Steps to the next access of the tile; both indices wrap to 0 at the end
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iteration_contiguous_;
+
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
+      return *this;
+
+    // Enter here only if (iteration_contiguous_ ==
+    // ThreadMap::Iterations::kContiguous)
+    iteration_contiguous_ = 0;
+    ++iteration_strided_;
+
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+
+    // Enter here only if (iteration_strided_ == ThreadMap::Iterations::kStrided):
+    // the strided index has run past its last iteration, so it wraps to 0 too.
+    iteration_strided_ = 0;
+
+    return *this;
+  }
+
+  /// Postfix increment: returns a copy made before stepping to the next access
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+    RegularTileAccessIterator prev(*this);
+    this->operator++();
+
+    return prev;
+  }
+
+  /// Adds a tile offset in the unit of tile.
+  /// In the GEMM/Conv implementations this is used to move along the k dimension in shared memory.
+  /// The layouts below are the shared memory layouts.  Current SM50 SIMT kernels only use col major A and row major B.
+  ///   For row major A operand, k dimension is contiguous dimension;
+  ///   For col major A operand, k dimension is strided dimension;
+  ///   For row major B operand, k dimension is strided dimension;
+  ///   For col major B operand, k dimension is contiguous dimension.
+  /// The two classes below map col/row major to the pitch linear coordinates
+  /// used in this base class.
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    add_pointer_offset(coord.contiguous() * Shape::kContiguous +
+                       coord.strided() * Shape::kStrided * stride_ *
+                           ThreadMap::kElementsPerAccess);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of RegularTileAccessIterator for layout::ColumnMajor.
+/// Every operation is forwarded to a wrapped layout::PitchLinear iterator;
+/// tile offsets are forwarded as (row, column).
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment>
+class RegularTileAccessIterator<
+    Shape_, Element_,
+    layout::ColumnMajor,
+    AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may only advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ThreadMap = ThreadMap_;
+
+  /// Wrapped pitch-linear iterator type (its Alignment argument is defaulted)
+  using UnderlyingIterator = RegularTileAccessIterator<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
+      layout::PitchLinear,
+      (kAdvanceRank == 0 ? 0 : 1),
+      ThreadMap_>;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+ private:
+
+  /// The wrapped pitch-linear iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Builds the wrapped iterator from the reference's data pointer and stride
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
+                            int thread_id   ///< ID of each participating thread
+                            )
+      : iterator_({ref.data(), ref.stride()}, thread_id) {}
+
+  /// Forwards the iteration index to the wrapped iterator
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Returns the wrapped iterator's pointer. NOTE(review): the wrapped get() is
+  /// CUTLASS_DEVICE but this one is CUTLASS_HOST_DEVICE - confirm host use.
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  /// Adds a tile offset, forwarded as (row, column)
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.row(), coord.column()});
+  }
+
+  /// Prefix increment: advances the wrapped iterator and returns *this
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Postfix increment: returns a copy made before advancing
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+    RegularTileAccessIterator prev(*this);
+    ++iterator_;
+    return prev;
+  }
+};
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of RegularTileAccessIterator for layout::RowMajor.
+/// Every operation is forwarded to a wrapped layout::PitchLinear iterator;
+/// tile offsets are forwarded as (column, row).
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment>
+class RegularTileAccessIterator<
+    Shape_, Element_,
+    layout::RowMajor,
+    AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may only advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ThreadMap = ThreadMap_;
+
+  /// Wrapped pitch-linear iterator type: advance rank swapped, Alignment defaulted
+  using UnderlyingIterator = RegularTileAccessIterator<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
+      layout::PitchLinear,
+      (kAdvanceRank == 0 ? 1 : 0),
+      ThreadMap_>;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+ private:
+
+  /// The wrapped pitch-linear iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Builds the wrapped iterator from the reference's data pointer and stride
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
+                            int thread_id   ///< ID of each participating thread
+                            )
+      : iterator_({ref.data(), ref.stride()}, thread_id) {}
+
+  /// Forwards the iteration index to the wrapped iterator
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Returns the wrapped iterator's pointer. NOTE(review): the wrapped get() is
+  /// CUTLASS_DEVICE but this one is CUTLASS_HOST_DEVICE - confirm host use.
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  /// Adds a tile offset, forwarded as (column, row)
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.column(), coord.row()});
+  }
+
+  /// Prefix increment: advances the wrapped iterator and returns *this
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Postfix increment: returns a copy made before advancing
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+    RegularTileAccessIterator prev(*this);
+    ++iterator_;
+    return prev;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace transform
+}  // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear_direct_conv.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear_direct_conv.h
new file mode 100755
index 000000000..a7b57bbe7
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear_direct_conv.h
@@ -0,0 +1,587 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing computing the addresses of storing of tiles
+   from pitch-linear rank=2 tensors.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/tensor_ref.h"
+
+#include "cutlass/transform/threadblock/regular_tile_access_iterator.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+
+
+////////////////////////////////////////////////////////////////////////////////
+/// Primary template (declared only); specialized below by Layout and by Dynamic_iterations.
+template <typename Shape, typename Element, typename Layout, int AdvanceRank,
+          typename ThreadMap,
+           bool Dynamic_iterations = false,
+          int Alignment =
+              sizeof_bits<Element>::value* ThreadMap::kElementsPerAccess / 8
+          >
+class RegularTileAccessIteratorDirectConv;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile iterator specialized for pitch-linear layouts with Dynamic_iterations = false
+///
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment>
+class RegularTileAccessIteratorDirectConv<
+    Shape_, Element_,
+    layout::PitchLinear,
+    AdvanceRank, ThreadMap_, false, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using StrideIndex = typename Layout::Stride::Index;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ThreadMap = ThreadMap_;
+
+  /// Element type per access
+  using AccessType = Array<Element, ThreadMap::kElementsPerAccess>;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Stride value
+  StrideIndex stride_;
+
+  /// Internal pointer to first access of tile
+  AccessType *pointer_;
+
+  /// Internal byte offset
+  Index byte_offset_;
+
+  /// Iteration in the contiguous dimension
+  int iteration_contiguous_;
+
+  /// Iteration in the strided dimension
+  int iteration_strided_;
+
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIteratorDirectConv(TensorRef ref,  ///< Pointer to start of tensor
+                            int thread_id   ///< ID of each participating thread
+                            )
+      : stride_(ref.stride(0) / ThreadMap::kElementsPerAccess),
+        byte_offset_(0) {
+
+    layout::PitchLinearCoord thread_offset_base = ThreadMap::initial_offset(thread_id);
+
+    // initialize pointer
+    pointer_ = reinterpret_cast<AccessType *>(ref.data() + ref.offset(thread_offset_base));
+
+    set_iteration_index(0);
+  }
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// No-op in this specialization: operator++ uses the fixed ThreadMap::Iterations::kStrided
+  CUTLASS_HOST_DEVICE
+  void set_iteration_num(int num) {
+    // Intentionally empty (same signature as the Dynamic_iterations = true specialization)
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    byte_offset_ += pointer_offset * sizeof(Element);
+  }
+
+  /// Returns a pointer
+  CUTLASS_DEVICE
+  AccessType *get() const {
+
+    AccessType *access_ptr = pointer_;
+
+    int access_offset = iteration_strided_ * ThreadMap::Delta::kStrided * stride_ +
+                        iteration_contiguous_ * ThreadMap::Delta::kContiguous /
+                            ThreadMap::kElementsPerAccess;
+
+    char *access_byte_ptr =
+        reinterpret_cast<char *>(access_ptr + access_offset);
+
+    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_);
+  }
+
+  /// Advances to the next access: contiguous index first, then strided, wrapping at the end.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIteratorDirectConv &operator++() {
+    ++iteration_contiguous_;
+
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
+      return *this;
+
+    // Enter here only if (iteration_contiguous_ ==
+    // ThreadMap::Iterations::kContiguous)
+    iteration_contiguous_ = 0;
+    ++iteration_strided_;
+
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+
+    // Enter here only if (iteration_strided_ == ThreadMap::Iterations::kStrided):
+    // the strided index wraps back to zero (pointer_ and byte_offset_ are not modified).
+    iteration_strided_ = 0;
+
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIteratorDirectConv operator++(int) {
+    RegularTileAccessIteratorDirectConv prev(*this);
+    this->operator++();
+
+    return prev;
+  }
+
+  /// Adds a tile offset in the unit of tile.
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    add_pointer_offset(coord.contiguous() * Shape::kContiguous +
+                       coord.strided() * ThreadMap::Iterations::kStrided *
+                           ThreadMap::Delta::kStrided * stride_ * ThreadMap::kElementsPerAccess);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile iterator specialized for pitch-linear layouts with Dynamic_iterations = true
+///
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment>
+class RegularTileAccessIteratorDirectConv<
+    Shape_, Element_,
+    layout::PitchLinear,
+    AdvanceRank, ThreadMap_,true, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using StrideIndex = typename Layout::Stride::Index;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ThreadMap = ThreadMap_;
+
+  /// Element type per access
+  using AccessType = Array<Element, ThreadMap::kElementsPerAccess>;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Stride value
+  StrideIndex stride_;
+
+  /// Internal pointer to first access of tile
+  AccessType *pointer_;
+
+  /// Internal byte offset
+  Index byte_offset_;
+
+  /// Iteration in the contiguous dimension
+  int iteration_contiguous_;
+
+  /// Iteration in the strided dimension
+  int iteration_strided_;
+
+  /// Total iterations in the strided dimension: Dynamic value
+  int total_iteration_strided_;
+
+ public:
+  /// Construct a TileIterator with zero threadblock offset and a default strided iteration count
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIteratorDirectConv(TensorRef ref,  ///< Pointer to start of tensor
+                            int thread_id   ///< ID of each participating thread
+                            )
+      : stride_(ref.stride(0) / ThreadMap::kElementsPerAccess),
+        byte_offset_(0), total_iteration_strided_(ThreadMap::Iterations::kStrided) {
+    // total_iteration_strided_ starts at the static count so it is never read uninitialized
+    layout::PitchLinearCoord thread_offset_base = ThreadMap::initial_offset(thread_id);
+
+    // initialize pointer
+    pointer_ = reinterpret_cast<AccessType *>(ref.data() + ref.offset(thread_offset_base));
+
+    set_iteration_index(0);
+  }
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Sets the strided iteration count read by operator++ and add_tile_offset
+  CUTLASS_HOST_DEVICE
+  void set_iteration_num(int num) {
+    total_iteration_strided_ = num;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    byte_offset_ += pointer_offset * sizeof(Element);
+  }
+
+  /// Returns a pointer
+  CUTLASS_DEVICE
+  AccessType *get() const {
+
+    AccessType *access_ptr = pointer_;
+
+    int access_offset = iteration_strided_ * ThreadMap::Delta::kStrided * stride_ +
+                        iteration_contiguous_ * ThreadMap::Delta::kContiguous /
+                            ThreadMap::kElementsPerAccess;
+
+    char *access_byte_ptr =
+        reinterpret_cast<char *>(access_ptr + access_offset);
+
+    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_);
+  }
+
+  /// Advances to the next access: contiguous index first, then strided, wrapping at the end.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIteratorDirectConv &operator++() {
+    ++iteration_contiguous_;
+
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
+      return *this;
+
+    // Enter here only if (iteration_contiguous_ ==
+    // ThreadMap::Iterations::kContiguous)
+    iteration_contiguous_ = 0;
+    ++iteration_strided_;
+
+    if (iteration_strided_ < total_iteration_strided_) {
+      return *this;
+    }
+
+    // Enter here only once iteration_strided_ has reached total_iteration_strided_:
+    // the strided index wraps back to zero (pointer_ and byte_offset_ are not modified).
+    iteration_strided_ = 0;
+
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIteratorDirectConv operator++(int) {
+    RegularTileAccessIteratorDirectConv prev(*this);
+    this->operator++();
+
+    return prev;
+  }
+
+  /// Adds a tile offset in units of tiles; the strided tile extent uses total_iteration_strided_.
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    add_pointer_offset(coord.contiguous() * Shape::kContiguous +
+                       coord.strided() * total_iteration_strided_ * ThreadMap::Delta::kStrided * stride_ *
+                           ThreadMap::kElementsPerAccess);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile iterator specialized for column major layouts
+///
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_,bool Dynamic_iterations, int Alignment >
+class RegularTileAccessIteratorDirectConv<
+    Shape_, Element_,
+    layout::ColumnMajor,
+    AdvanceRank, ThreadMap_, Dynamic_iterations , Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ThreadMap = ThreadMap_;
+
+  /// Underlying iterator type
+  using UnderlyingIterator = RegularTileAccessIteratorDirectConv<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
+      layout::PitchLinear,
+      (kAdvanceRank == 0 ? 0 : 1), 
+      ThreadMap_,
+      Dynamic_iterations>;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+ private:
+
+  /// Underlying iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIteratorDirectConv(TensorRef ref,  ///< Pointer to start of tensor
+                            int thread_id   ///< ID of each participating thread
+                            )
+      : iterator_({ref.data(), ref.stride()}, thread_id) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+  
+  /// Forwards the iteration count to the underlying pitch-linear iterator
+  CUTLASS_HOST_DEVICE
+  void set_iteration_num(int num) {
+    iterator_.set_iteration_num(num);
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  /// Adds a tile offset
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.row(), coord.column()});
+  }
+
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIteratorDirectConv &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIteratorDirectConv operator++(int) {
+    RegularTileAccessIteratorDirectConv prev(*this);
+    ++iterator_;
+
+    return prev;
+  }
+};
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile iterator specialized for row major layouts
+///
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_,bool Dynamic_iterations, int Alignment>
+class RegularTileAccessIteratorDirectConv<
+    Shape_, Element_,
+    layout::RowMajor,
+    AdvanceRank, ThreadMap_, Dynamic_iterations, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ThreadMap = ThreadMap_;
+
+  /// Underlying iterator type
+  using UnderlyingIterator = RegularTileAccessIteratorDirectConv<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
+      layout::PitchLinear,
+      (kAdvanceRank == 0 ? 1 : 0), 
+      ThreadMap_,
+      Dynamic_iterations>;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+ private:
+
+  /// Underlying iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIteratorDirectConv(TensorRef ref,  ///< Pointer to start of tensor
+                            int thread_id   ///< ID of each participating thread
+                            )
+      : iterator_({ref.data(), ref.stride()}, thread_id) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Forwards the iteration count to the underlying pitch-linear iterator
+  CUTLASS_HOST_DEVICE
+  void set_iteration_num(int num) {
+    iterator_.set_iteration_num(num);
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  /// Adds a tile offset
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.column(), coord.row()});
+  }
+
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIteratorDirectConv &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIteratorDirectConv operator++(int) {
+    RegularTileAccessIteratorDirectConv prev(*this);
+    ++iterator_;
+
+    return prev;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace transform
+}  // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h
new file mode 100755
index 000000000..96e3ee84b
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h
@@ -0,0 +1,821 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing computing the addresses of storing of tiles
+   from pitch-linear rank=2 tensors.
+*/
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor_op_multiplicand_sm75.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/transform/threadblock/regular_tile_access_iterator.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile iterator specialized for congruous arrangements for TensorOps
+///
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment, int Crosswise>
+class RegularTileAccessIterator<
+    Shape_, Element_,
+    layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
+                                          Crosswise>,
+    AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout =
+      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
+                                            Crosswise>;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+  static int const kCrosswise = Crosswise;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using StrideIndex = typename Layout::Stride::Index;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ThreadMap = ThreadMap_;
+
+  /// Internal details made public to facilitate introspection
+  struct Detail {
+    /// This iterator is specialized for an access size that is 128 bits in
+    /// length.
+    static int const kAccessSizeInBits = 128;
+
+    static_assert(sizeof_bits<Element_>::value *
+                          ThreadMap::kElementsPerAccess ==
+                      kAccessSizeInBits,
+                  "This iterator requires a policy whose access size is 128bs");
+
+    /// Number of base pointers kept: two when Iterations::kStrided > 1, otherwise one
+    static int const kPointerCount =
+        (ThreadMap::Iterations::kStrided > 1 ? 2 : 1);
+  };
+
+  /// Element type per access
+  using AccessType = Array<Element, Layout::kElementsPerAccess>;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Stride value
+  StrideIndex stride_;
+
+  /// Internal pointer to first access of tile
+  AccessType *pointer_[Detail::kPointerCount];
+
+  /// Internal byte offset
+  Index byte_offset_;
+
+  /// Iteration in the contiguous dimension
+  int iteration_contiguous_;
+
+  /// Iteration in the strided dimension
+  int iteration_strided_;
+
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
+                            int thread_id   ///< ID of each participating thread
+                            )
+      : stride_(ref.stride(0) * Layout::kFactor / Layout::kElementsPerAccess),
+        byte_offset_(0) {
+    layout::PitchLinearCoord thread_offset_base =
+        ThreadMap::initial_offset(thread_id);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < Detail::kPointerCount; ++i) {
+      // This is the offset of a thread within a threadblock tile for a specific
+      // pointer (units of elements)
+      layout::PitchLinearCoord thread_offset_in_threadblock_tile =
+          thread_offset_base +
+          layout::PitchLinearCoord{
+              0, ThreadMap::Detail::WarpThreadArrangement::kStrided * i};
+
+      // initialize pointer
+      pointer_[i] = reinterpret_cast<AccessType *>(
+          ref.data() + ref.offset(thread_offset_in_threadblock_tile));
+    }
+
+    set_iteration_index(0);
+  }
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    byte_offset_ += pointer_offset * sizeof(Element);
+  }
+
+  /// Returns the pointer for the current access (base pointer picked by parity of iteration_strided_)
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    AccessType *access_ptr = pointer_[iteration_strided_ & 1];
+    int stride_idx = (iteration_strided_ & ~1);
+
+    int access_offset = stride_idx * ThreadMap::Delta::kStrided * stride_ / Layout::kFactor +
+                        iteration_contiguous_ * ThreadMap::Delta::kContiguous /
+                            ThreadMap::kElementsPerAccess;
+
+    char *access_byte_ptr =
+        reinterpret_cast<char *>(access_ptr + access_offset);
+    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_);
+  }
+
+  /// Advances to the next access: contiguous index first, then strided, wrapping at the end.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iteration_contiguous_;
+
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
+      return *this;
+
+    // Enter here only if (iteration_contiguous_ ==
+    // ThreadMap::Iterations::kContiguous)
+    iteration_contiguous_ = 0;
+    ++iteration_strided_;
+
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+
+    // Enter here only if (iteration_strided_ == ThreadMap::Iterations::kStrided):
+    // the strided index wraps back to zero (pointer_ and byte_offset_ are not modified).
+    iteration_strided_ = 0;
+
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+    RegularTileAccessIterator prev(*this);
+    this->operator++();
+
+    return prev;
+  }
+
+  /// Adds a tile offset
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    add_pointer_offset(coord.contiguous() * Shape::kContiguous * Layout::kFactor +
+                       coord.strided() * Shape::kStrided * stride_ *
+                           Layout::kElementsPerAccess / Layout::kFactor);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile Iterator specialized for column-major congruous TensorOp formats.
+///
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment, int Crosswise>
+class RegularTileAccessIterator<
+    Shape_, Element_,
+    layout::ColumnMajorTensorOpMultiplicandCongruous<
+        sizeof_bits<Element_>::value, Crosswise>,
+    AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for column-major iterator may along advance along the "
+      "columns(rank=0) or rows(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajorTensorOpMultiplicandCongruous<
+      sizeof_bits<Element_>::value, Crosswise>;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ThreadMap = ThreadMap_;
+
+  /// Underlying iterator type
+  using UnderlyingIterator = RegularTileAccessIterator<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
+      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
+                                            Crosswise>,
+      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+ private:
+  /// Underlying iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
+                            int thread_id   ///< ID of each participating thread
+                            )
+      : iterator_({ref.data(), ref.stride()}, thread_id) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  /// Adds a tile offset
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.row(), coord.column()});
+  }
+
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+    RegularTileAccessIterator prev(*this);
+    ++iterator_;
+
+    return prev;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile Iterator specialized for row-major congruous TensorOp formats.
+/// Thin adapter: every operation is forwarded to the pitch-linear
+/// TensorOpMultiplicandCongruous iterator declared below as UnderlyingIterator.
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment, int Crosswise>
+class RegularTileAccessIterator<
+    Shape_, Element_,
+    layout::RowMajorTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
+                                                  Crosswise>,
+    AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for row-major iterator may along advance along the "
+      "columns(rank=0) or rows(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajorTensorOpMultiplicandCongruous<
+      sizeof_bits<Element_>::value, Crosswise>;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ThreadMap = ThreadMap_;
+
+  /// Underlying pitch-linear iterator type, shaped <kColumn, kRow>
+  using UnderlyingIterator = RegularTileAccessIterator<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
+      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
+                                            Crosswise>,
+      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;  // advance rank is flipped; Alignment is not forwarded
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+ private:
+  /// Underlying iterator that holds all iteration state
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs the underlying iterator from ref's data pointer and stride
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
+                            int thread_id   ///< ID of each participating thread
+                            )
+      : iterator_({ref.data(), ref.stride()}, thread_id) {}
+
+  /// Overrides the internal iteration index (forwarded to the underlying iterator)
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Adds a pointer offset in units of Element (forwarded to the underlying iterator)
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Returns the underlying iterator's current pointer, cast to AccessType *
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  /// Adds a tile offset, handed to the underlying iterator as {column, row}
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.column(), coord.row()});
+  }
+
+  /// Pre-increment: advances the underlying iterator and returns *this.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment: advances the underlying iterator and returns a copy of the prior state.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+    RegularTileAccessIterator prev(*this);
+    ++iterator_;
+
+    return prev;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile iterator specialized for crosswise arrangements for TensorOps
+/// operating directly on the pitch-linear TensorOpMultiplicandCrosswise layout.
+/// Requires 128-bit accesses and ThreadMap::Delta::kContiguous divisible by Crosswise.
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment, int Crosswise>
+class RegularTileAccessIterator<Shape_, Element_,
+                                layout::TensorOpMultiplicandCrosswise<
+                                    sizeof_bits<Element_>::value, Crosswise>,
+                                AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout =
+      layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
+                                            Crosswise>;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+  static int const kCrosswise = Crosswise;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using StrideIndex = typename Layout::Stride::Index;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ThreadMap = ThreadMap_;
+
+  static_assert(!(ThreadMap::Delta::kContiguous % kCrosswise),
+                "kCrosswise is the smallest unit in the contiguous dimension "
+                "for shared memory swizzling.");
+
+  /// Internal details made public to facilitate introspection
+  struct Detail {
+    /// This iterator is specialized for an access size that is 128 bits in
+    /// length.
+    static int const kAccessSizeInBits = 128;
+
+    static_assert(sizeof_bits<Element_>::value *
+                          ThreadMap::kElementsPerAccess ==
+                      kAccessSizeInBits,
+                  "This iterator requires a policy whose access size is 128bs");
+
+    /// Number of pointers
+    ///
+    /// Note: TN kblock32 layouts only need 1 pointer, but strangely
+    /// reducing the pointer count hurts performance
+    static int const kPointerCount =
+        (ThreadMap::Iterations::kStrided > 1 ? 2 : 1);
+  };
+
+  /// Element type per access
+  using AccessType = Array<Element, Layout::kElementsPerAccess>;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Total number of sections.  The memory is divided into stages.  One stage
+  /// can store one tile.  A stage is divided into sections.  Interleaved layouts
+  /// can have multiple sections in a stage.  The other layouts have only one
+  /// section in a stage.
+  int sections_;
+
+  /// Number of sections in one stage (Shape::kContiguous / kCrosswise)
+  int sections_per_stage_;
+
+  /// Stride value: ref.stride(0) * Layout::kFactor / Layout::kElementsPerAccess
+  StrideIndex stride_;
+
+  /// Internal pointer(s) to the first access of the tile, one per kPointerCount
+  AccessType *pointer_[Detail::kPointerCount];
+
+  /// Internal byte offset, accumulated by add_pointer_offset() and applied in get()
+  Index byte_offset_;
+
+  /// Iteration in the contiguous dimension
+  int iteration_contiguous_;
+
+  /// Iteration in the strided dimension
+  int iteration_strided_;
+
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
+                            int thread_id   ///< ID of each participating thread
+                            )
+      : sections_(ref.stride(0) / kCrosswise),
+        sections_per_stage_(Shape::kContiguous / kCrosswise),
+        // stride_ = kCrosswise x sections_ x kFactor
+        stride_(ref.stride(0) * Layout::kFactor / Layout::kElementsPerAccess),
+        byte_offset_(0) {
+    layout::PitchLinearCoord thread_offset_base =
+        ThreadMap::initial_offset(thread_id);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < Detail::kPointerCount; ++i) {
+      // This is the offset of a thread within a threadblock tile for a specific
+      // pointer (units of elements)
+      layout::PitchLinearCoord thread_offset_in_threadblock_tile =
+          thread_offset_base +
+          layout::PitchLinearCoord{
+              0, ThreadMap::Detail::WarpThreadArrangement::kStrided * i};
+      // initialize pointer: element offset is converted to AccessType units
+      pointer_[i] = reinterpret_cast<AccessType *>(ref.data()) +
+                    ref.offset(thread_offset_in_threadblock_tile) /
+                        Layout::kElementsPerAccess;
+    }
+
+    set_iteration_index(0);
+  }
+
+  /// Overrides the internal iteration index (split into contiguous / strided parts)
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element (stored internally as bytes)
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    byte_offset_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+
+  /// Returns a pointer to the access selected by the current iteration indices
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    AccessType *access_ptr = pointer_[iteration_strided_ & 1];  // low bit picks the base pointer
+    int stride_idx = (iteration_strided_ & ~1);  // remaining bits, rounded down to even
+
+    int access_offset =
+        stride_idx * ThreadMap::Delta::kStrided * stride_ / Layout::kFactor +
+        // kCrosswise elements in the contiguous dimension would span to a
+        // shared memory cache line.
+        iteration_contiguous_ * (ThreadMap::Delta::kContiguous / kCrosswise) *
+            Layout::TileShape::kContiguous;
+    char *access_byte_ptr =
+        reinterpret_cast<char *>(access_ptr + access_offset);
+    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_);
+  }
+
+  /// Pre-increment: steps to the next access; both indices wrap to 0 after the last one.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iteration_contiguous_;
+
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
+      return *this;
+
+    // Enter here only if (iteration_contiguous_ ==
+    // ThreadMap::Iterations::kContiguous)
+    iteration_contiguous_ = 0;
+    ++iteration_strided_;
+
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+
+    // Enter here only if (iteration_strided_ == ThreadMap::Iterations::kStrided)
+    // which means we enter the next section.
+    iteration_strided_ = 0;
+
+    return *this;
+  }
+
+  /// Post-increment: steps to the next access and returns a copy of the prior state.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+    RegularTileAccessIterator prev(*this);
+    this->operator++();
+
+    return prev;
+  }
+
+  /// Adds a tile offset by converting it to an element offset for add_pointer_offset()
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    add_pointer_offset(coord.contiguous() * sections_per_stage_ * stride_ *
+                           ThreadMap::kElementsPerAccess / sections_ +
+                       coord.strided() * Shape::kStrided * stride_ *
+                           Layout::kElementsPerAccess / Layout::kFactor);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile Iterator specialized for column-major crosswise TensorOp formats.
+/// Thin adapter: every operation is forwarded to the pitch-linear
+/// TensorOpMultiplicandCrosswise iterator declared below as UnderlyingIterator.
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment, int Crosswise>
+class RegularTileAccessIterator<
+    Shape_, Element_,
+    layout::ColumnMajorTensorOpMultiplicandCrosswise<
+        sizeof_bits<Element_>::value, Crosswise>,
+    AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for column-major iterator may along advance along the "
+      "columns(rank=0) or rows(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajorTensorOpMultiplicandCrosswise<
+      sizeof_bits<Element_>::value, Crosswise>;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ThreadMap = ThreadMap_;
+
+  /// Underlying pitch-linear iterator type, shaped <kRow, kColumn>
+  using UnderlyingIterator = RegularTileAccessIterator<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
+      layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
+                                            Crosswise>,
+      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;  // advance rank kept as-is; Alignment is not forwarded
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+ private:
+  /// Underlying iterator that holds all iteration state
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs the underlying iterator from ref's data pointer and stride
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
+                            int thread_id   ///< ID of each participating thread
+                            )
+      : iterator_({ref.data(), ref.stride()}, thread_id) {}
+
+  /// Overrides the internal iteration index (forwarded to the underlying iterator)
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Adds a pointer offset in units of Element (forwarded to the underlying iterator)
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Returns the underlying iterator's current pointer, cast to AccessType *
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  /// Adds a tile offset, handed to the underlying iterator as {row, column}
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.row(), coord.column()});
+  }
+
+  /// Pre-increment: advances the underlying iterator and returns *this.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment: advances the underlying iterator and returns a copy of the prior state.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+    RegularTileAccessIterator prev(*this);
+    ++iterator_;
+
+    return prev;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile Iterator specialized for row-major crosswise TensorOp formats.
+/// Thin adapter: every operation is forwarded to the pitch-linear
+/// TensorOpMultiplicandCrosswise iterator declared below as UnderlyingIterator.
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment, int Crosswise>
+class RegularTileAccessIterator<Shape_, Element_,
+                                layout::RowMajorTensorOpMultiplicandCrosswise<
+                                    sizeof_bits<Element_>::value, Crosswise>,
+                                AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for row-major iterator may along advance along the "
+      "columns(rank=0) or rows(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajorTensorOpMultiplicandCrosswise<
+      sizeof_bits<Element_>::value, Crosswise>;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ThreadMap = ThreadMap_;
+
+  /// Underlying pitch-linear iterator type, shaped <kColumn, kRow>
+  using UnderlyingIterator = RegularTileAccessIterator<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
+      layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
+                                            Crosswise>,
+      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;  // advance rank is flipped; Alignment is not forwarded
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+ private:
+  /// Underlying iterator that holds all iteration state
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs the underlying iterator from ref's data pointer and stride
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
+                            int thread_id   ///< ID of each participating thread
+                            )
+      : iterator_({ref.data(), ref.stride()}, thread_id) {}
+
+  /// Overrides the internal iteration index (forwarded to the underlying iterator)
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Adds a pointer offset in units of Element (forwarded to the underlying iterator)
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Returns the underlying iterator's current pointer, cast to AccessType *
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  /// Adds a tile offset, handed to the underlying iterator as {column, row}
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.column(), coord.row()});
+  }
+
+  /// Pre-increment: advances the underlying iterator and returns *this.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment: advances the underlying iterator and returns a copy of the prior state.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+    RegularTileAccessIterator prev(*this);
+    ++iterator_;
+
+    return prev;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace transform
+}  // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h
new file mode 100755
index 000000000..b424af445
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h
@@ -0,0 +1,1532 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates that compute the addresses at which tiles of
+   pitch-linear rank=2 tensors are stored.
+*/
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor_op_multiplicand_sm75.h"
+#include "cutlass/layout/tensor_op_multiplicand_sm80.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/transform/threadblock/regular_tile_access_iterator.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile iterator specialized for congruous arrangements for TensorOps
+/// operating directly on the pitch-linear TensorOpMultiplicandCongruous64b layout.
+/// Requires 64-bit accesses and a thread map of at least two warps (see the static_asserts).
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment>
+class RegularTileAccessIterator<
+    Shape_, Element_,
+    layout::TensorOpMultiplicandCongruous64b,
+    AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::TensorOpMultiplicandCongruous64b;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using StrideIndex = typename Layout::Stride::Index;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ThreadMap = ThreadMap_;
+
+  static_assert(ThreadMap::kThreads / 32 > 1, 
+    "This tile iterator requires at least two warps.");
+
+  /// Internal details made public to facilitate introspection
+  struct Detail {
+    /// This iterator is specialized for an access size that is 64 bits in
+    /// length.
+    static int const kAccessSizeInBits = 64;
+
+    static_assert(sizeof_bits<Element_>::value *
+                          ThreadMap::kElementsPerAccess ==
+                      kAccessSizeInBits,
+                  "This iterator requires a policy whose access size is 64b");
+
+    /// Number of pointers
+    static int const kPointerCount = 1;
+  };
+
+  /// Element type per access
+  using AccessType = Array<Element, Layout::kElementsPerAccess>;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Stride value: ref.stride(0) / Layout::kElementsPerAccess
+  StrideIndex stride_;
+
+  /// Internal pointer to first access of tile
+  AccessType *pointer_;
+
+  /// Internal byte offset, accumulated by add_pointer_offset() and applied in get()
+  Index byte_offset_;
+
+  /// Iteration in the contiguous dimension
+  int iteration_contiguous_;
+
+  /// Iteration in the strided dimension
+  int iteration_strided_;
+
+ public:
+
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator(
+    TensorRef ref,  ///< Pointer to start of tensor
+    int thread_id   ///< ID of each participating thread
+  ): 
+    stride_(ref.stride(0) / Layout::kElementsPerAccess),
+    byte_offset_(0) {
+
+    layout::PitchLinearCoord thread_offset_base = ThreadMap::initial_offset(thread_id);
+
+    // This is the offset of a thread within a threadblock tile for a specific
+    // pointer (units of elements)
+    layout::PitchLinearCoord thread_offset_in_threadblock_tile = thread_offset_base;
+
+    // initialize pointer: the element offset is added before the cast to AccessType *
+    pointer_ = reinterpret_cast<AccessType *>(ref.data() + ref.offset(thread_offset_in_threadblock_tile));
+
+    set_iteration_index(0);
+  }
+
+  /// Overrides the internal iteration index (split into contiguous / strided parts)
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+
+    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element (stored internally as bytes)
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+
+    byte_offset_ += pointer_offset * sizeof(Element);
+  }
+
+  /// Returns a pointer to the access selected by the current iteration indices
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+
+    AccessType *access_ptr = pointer_;
+
+    int access_offset = iteration_strided_ * ThreadMap::Delta::kStrided * stride_ +
+                        iteration_contiguous_ * ThreadMap::Delta::kContiguous /
+                            ThreadMap::kElementsPerAccess;
+
+    char *access_byte_ptr =
+        reinterpret_cast<char *>(access_ptr + access_offset);
+
+    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_);
+  }
+
+  /// Pre-increment: steps to the next access; both indices wrap to 0 after the last one.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iteration_contiguous_;
+
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
+      return *this;
+
+    // Enter here only if (iteration_contiguous_ ==
+    // ThreadMap::Iterations::kContiguous)
+    iteration_contiguous_ = 0;
+    ++iteration_strided_;
+
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+
+    // Enter here only if (iteration_strided_ == ThreadMap::Iterations::kStrided)
+    // which means we enter the next tile.
+    iteration_strided_ = 0;
+
+    return *this;
+  }
+
+  /// Post-increment: steps to the next access and returns a copy of the prior state.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+
+    RegularTileAccessIterator prev(*this);
+
+    this->operator++();
+
+    return prev;
+  }
+
+  /// Adds a tile offset by converting it to an element offset for add_pointer_offset()
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+
+    add_pointer_offset(
+      coord.contiguous() * Shape::kContiguous + 
+      coord.strided() * Shape::kStrided * stride_ * Layout::kElementsPerAccess);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile Iterator specialized for column-major congruous TensorOp formats.
+/// Thin adapter: every operation is forwarded to the pitch-linear
+/// TensorOpMultiplicandCongruous64b iterator declared below as UnderlyingIterator.
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment>
+class RegularTileAccessIterator<
+    Shape_, Element_,
+    layout::ColumnMajorTensorOpMultiplicandCongruous64b,
+    AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for column-major iterator may along advance along the "
+      "columns(rank=0) or rows(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajorTensorOpMultiplicandCongruous64b;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ThreadMap = ThreadMap_;
+
+  /// Underlying pitch-linear iterator type, shaped <kRow, kColumn>
+  using UnderlyingIterator = RegularTileAccessIterator<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
+      layout::TensorOpMultiplicandCongruous64b,
+      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;  // advance rank kept as-is; Alignment is not forwarded
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+ private:
+  /// Underlying iterator that holds all iteration state
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs the underlying iterator from ref's data pointer and stride
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
+                            int thread_id   ///< ID of each participating thread
+                            )
+      : iterator_({ref.data(), ref.stride()}, thread_id) {}
+
+  /// Overrides the internal iteration index (forwarded to the underlying iterator)
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Adds a pointer offset in units of Element (forwarded to the underlying iterator)
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Returns the underlying iterator's current pointer, cast to AccessType *
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  /// Adds a tile offset, handed to the underlying iterator as {row, column}
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.row(), coord.column()});
+  }
+
+  /// Pre-increment: advances the underlying iterator and returns *this.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment: advances the underlying iterator and returns a copy of the prior state.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+    RegularTileAccessIterator prev(*this);
+    ++iterator_;
+
+    return prev;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile Iterator specialized for row-major congruous TensorOp formats.
+/// Thin adapter: every operation is forwarded to the pitch-linear
+/// TensorOpMultiplicandCongruous64b iterator declared below as UnderlyingIterator.
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment>
+class RegularTileAccessIterator<Shape_, Element_,
+                                layout::RowMajorTensorOpMultiplicandCongruous64b,
+                                AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for row-major iterator may along advance along the "
+      "columns(rank=0) or rows(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajorTensorOpMultiplicandCongruous64b;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ThreadMap = ThreadMap_;
+
+  /// Underlying pitch-linear iterator type, shaped <kColumn, kRow>
+  using UnderlyingIterator = RegularTileAccessIterator<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
+      layout::TensorOpMultiplicandCongruous64b,
+      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;  // advance rank is flipped; Alignment is not forwarded
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+ private:
+  /// Underlying iterator that holds all iteration state
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs the underlying iterator from ref's data pointer and stride
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
+                            int thread_id   ///< ID of each participating thread
+                            )
+      : iterator_({ref.data(), ref.stride()}, thread_id) {}
+
+  /// Overrides the internal iteration index (forwarded to the underlying iterator)
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Adds a pointer offset in units of Element (forwarded to the underlying iterator)
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Returns the underlying iterator's current pointer, cast to AccessType *
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  /// Adds a tile offset, handed to the underlying iterator as {column, row}
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.column(), coord.row()});
+  }
+
+  /// Pre-increment: advances the underlying iterator and returns *this.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment: advances the underlying iterator and returns a copy of the prior state.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+    RegularTileAccessIterator prev(*this);
+    ++iterator_;
+
+    return prev;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile access iterator specialized for the 64b crosswise TensorOp arrangement
+/// (layout::TensorOpMultiplicand64bCrosswise). Each call to get() yields a pointer to one
+/// 64-bit access; operator++ steps through the accesses of the tile and wraps at its end.
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment>
+class RegularTileAccessIterator<
+    Shape_, Element_,
+    layout::TensorOpMultiplicand64bCrosswise,
+    AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::TensorOpMultiplicand64bCrosswise;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using StrideIndex = typename Layout::Stride::Index;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ThreadMap = ThreadMap_;
+
+  static_assert(ThreadMap::kThreads / 32 > 1, 
+    "This tile iterator requires at least two warps.");
+
+  /// Internal details made public to facilitate introspection
+  struct Detail {
+    /// This iterator is specialized for an access size that is 64 bits in
+    /// length.
+    static int const kAccessSizeInBits = 64;
+
+    static_assert(sizeof_bits<Element_>::value *
+                          ThreadMap::kElementsPerAccess ==
+                      kAccessSizeInBits,
+                  "This iterator requires a policy whose access size is 64b");
+
+    ///< Number of pointers - two pointers are needed if making more than 4 iterations along
+    ///< strided dimension
+    static int const kPointerCount = (ThreadMap::Iterations::kStrided > 4 ? 2 : 1);
+  };
+
+  /// Element type per access
+  using AccessType = Array<Element, Layout::kElementsPerAccess>;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Stride value, in units of accesses (ref.stride(0) / ThreadMap::kElementsPerAccess)
+  StrideIndex stride_;
+
+  /// Internal pointer to first access of tile
+  AccessType *pointer_;
+
+  /// Internal byte offset(s) of this thread, one entry per Detail::kPointerCount
+  Index byte_offset_[Detail::kPointerCount];
+
+  /// Iteration in the contiguous dimension
+  int iteration_contiguous_;
+
+  /// Iteration in the strided dimension
+  int iteration_strided_;
+
+ public:
+
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_DEVICE
+  RegularTileAccessIterator(
+    TensorRef ref,  ///< Pointer to start of tensor
+    int thread_id   ///< ID of each participating thread
+  ): 
+    stride_(ref.stride(0) / ThreadMap::kElementsPerAccess) {
+
+    layout::PitchLinearCoord thread_offset_base = ThreadMap::initial_offset(thread_id);
+
+    // This is the offset of a thread within a threadblock tile for a specific
+    // pointer (units of elements)
+    layout::PitchLinearCoord thread_offset_in_threadblock_tile = thread_offset_base;
+
+    // initialize pointer to the start of the tensor; the per-thread offset lives in byte_offset_
+    pointer_ = reinterpret_cast<AccessType *>(ref.data());
+
+    byte_offset_[0] = ref.offset(thread_offset_in_threadblock_tile) * sizeof(Element);
+    
+    if (Detail::kPointerCount == 2) {
+      byte_offset_[1] = byte_offset_[0] ^ 8;
+    }
+
+    set_iteration_index(0);
+  }
+
+  /// Overrides the internal iteration index (index = strided * Iterations::kContiguous + contiguous)
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+
+    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element (applied as whole accesses via integer division)
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+
+    pointer_ += pointer_offset / ThreadMap::kElementsPerAccess;
+  }
+
+  /// Returns a pointer to the access selected by the current iteration indices
+  CUTLASS_DEVICE
+  AccessType *get() const {
+
+    // Map the logical contiguous and strided access to the internal swizzled structure.
+    int uniform_offset = (iteration_strided_ & 0x3) * stride_ + (iteration_strided_ >> 3) * 16 + stride_ * ThreadMap::Delta::kContiguous * iteration_contiguous_;
+
+    char *access_byte_ptr = reinterpret_cast<char *>(pointer_ + uniform_offset);
+
+    int byte_offset;
+
+    // The second byte offset exists only when Detail::kPointerCount == 2 (more than 4 iterations
+    // in the strided dimension); it is selected when bit 2 of iteration_strided_ is set.
+    if (Detail::kPointerCount == 2 && (iteration_strided_ & 0x4)) {
+      byte_offset = byte_offset_[1];
+    }
+    else {
+      byte_offset = byte_offset_[0];
+    }
+
+    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset);
+  }
+
+  /// Pre-increment: steps to the next access (contiguous index first, then strided), wrapping to zero.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iteration_contiguous_;
+
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
+      return *this;
+
+    // Enter here only if (iteration_contiguous_ ==
+    // ThreadMap::Iterations::kContiguous)
+    iteration_contiguous_ = 0;
+    ++iteration_strided_;
+
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+
+    // Enter here only if (iteration_strided_ == ThreadMap::Iterations::kStrided):
+    // every access of the tile has been visited, so the strided index wraps back to zero.
+    iteration_strided_ = 0;
+
+    return *this;
+  }
+
+  /// Post-increment: advances like operator++() and returns the state from before the advance.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+
+    RegularTileAccessIterator prev(*this);
+
+    this->operator++();
+
+    return prev;
+  }
+
+  /// Adds a tile offset, converted to an element offset and applied through add_pointer_offset()
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+
+    add_pointer_offset(coord.strided() * Shape::kStrided + coord.contiguous() * Shape::kContiguous * stride_);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile Iterator specialized for column-major 64b crosswise TensorOp formats.
+/// Thin adapter: every operation is forwarded to the pitch-linear
+/// TensorOpMultiplicand64bCrosswise iterator, with coordinates passed as (row, column).
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment>
+class RegularTileAccessIterator<
+    Shape_, Element_,
+    layout::ColumnMajorTensorOpMultiplicand64bCrosswise,
+    AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for column-major iterator may along advance along the "
+      "columns(rank=0) or rows(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajorTensorOpMultiplicand64bCrosswise;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ThreadMap = ThreadMap_;
+
+  /// Underlying iterator type (pitch-linear shape is <kRow, kColumn>; advance rank is kept as-is)
+  using UnderlyingIterator = RegularTileAccessIterator<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
+      layout::TensorOpMultiplicand64bCrosswise,
+      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+ private:
+  /// Underlying iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
+                            int thread_id   ///< ID of each participating thread
+                            )
+      : iterator_({ref.data(), ref.stride()}, thread_id) {}
+
+  /// Overrides the internal iteration index (forwarded to the underlying iterator)
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Returns the underlying iterator's current access pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  /// Adds a tile offset; forwarded to the underlying iterator as {row, column}
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.row(), coord.column()});
+  }
+
+  /// Pre-increment: advances the underlying iterator by one step and returns *this.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment: advances the underlying iterator and returns a copy made before the advance.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+    RegularTileAccessIterator prev(*this);
+    ++iterator_;
+
+    return prev;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile Iterator specialized for row-major 64b crosswise TensorOp formats.
+/// Thin adapter: every operation is forwarded to the pitch-linear
+/// TensorOpMultiplicand64bCrosswise iterator, with coordinates passed as (column, row).
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment>
+class RegularTileAccessIterator<Shape_, Element_,
+                                layout::RowMajorTensorOpMultiplicand64bCrosswise,
+                                AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for row-major iterator may along advance along the "
+      "columns(rank=0) or rows(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajorTensorOpMultiplicand64bCrosswise;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ThreadMap = ThreadMap_;
+
+  /// Underlying iterator type (pitch-linear shape is <kColumn, kRow>; advance rank is swapped)
+  using UnderlyingIterator = RegularTileAccessIterator<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
+      layout::TensorOpMultiplicand64bCrosswise,
+      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+ private:
+  /// Underlying iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
+                            int thread_id   ///< ID of each participating thread
+                            )
+      : iterator_({ref.data(), ref.stride()}, thread_id) {}
+
+  /// Overrides the internal iteration index (forwarded to the underlying iterator)
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Returns the underlying iterator's current access pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  /// Adds a tile offset; forwarded to the underlying iterator as {column, row}
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.column(), coord.row()});
+  }
+
+  /// Pre-increment: advances the underlying iterator by one step and returns *this.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment: advances the underlying iterator and returns a copy made before the advance.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+    RegularTileAccessIterator prev(*this);
+    ++iterator_;
+
+    return prev;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Tile access iterator specialized for the 128b congruous TensorOp arrangement
+/// (layout::TensorOpMultiplicandCongruous128b). Each call to get() yields a pointer to one
+/// 128-bit access; operator++ steps through the accesses of the tile and wraps at its end.
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment>
+class RegularTileAccessIterator<
+    Shape_, Element_,
+    layout::TensorOpMultiplicandCongruous128b,
+    AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::TensorOpMultiplicandCongruous128b;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using StrideIndex = typename Layout::Stride::Index;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ThreadMap = ThreadMap_;
+
+  static_assert(ThreadMap::kThreads / 32 > 1, 
+    "This tile iterator requires at least two warps.");
+
+  /// Internal details made public to facilitate introspection
+  struct Detail {
+    /// This iterator is specialized for an access size that is 128 bits in
+    /// length.
+    static int const kAccessSizeInBits = 128;
+
+    static_assert(sizeof_bits<Element_>::value *
+                          ThreadMap::kElementsPerAccess ==
+                      kAccessSizeInBits,
+                  "This iterator requires a policy whose access size is 128b");
+
+    ///< Number of pointers
+    static int const kPointerCount = 1;
+  };
+
+  /// Element type per access
+  using AccessType = Array<Element, Layout::kElementsPerAccess>;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Stride value, in units of accesses (ref.stride(0) / Layout::kElementsPerAccess)
+  StrideIndex stride_;
+
+  /// Internal pointer to first access of tile
+  AccessType *pointer_;
+
+  /// Internal byte offset, accumulated by add_pointer_offset() and applied in get()
+  Index byte_offset_;
+
+  /// Iteration in the contiguous dimension
+  int iteration_contiguous_;
+
+  /// Iteration in the strided dimension
+  int iteration_strided_;
+
+ public:
+
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator(
+    TensorRef ref,  ///< Pointer to start of tensor
+    int thread_id   ///< ID of each participating thread
+  ): 
+    stride_(ref.stride(0) / Layout::kElementsPerAccess),
+    byte_offset_(0) {
+
+    layout::PitchLinearCoord thread_offset_base = ThreadMap::initial_offset(thread_id);
+
+    // This is the offset of a thread within a threadblock tile for a specific
+    // pointer (units of elements)
+    layout::PitchLinearCoord thread_offset_in_threadblock_tile = thread_offset_base;
+
+    // initialize pointer to this thread's first access (tensor base plus the thread's layout offset)
+    pointer_ = reinterpret_cast<AccessType *>(ref.data() + ref.offset(thread_offset_in_threadblock_tile));
+
+    set_iteration_index(0);
+  }
+
+  /// Overrides the internal iteration index (index = strided * Iterations::kContiguous + contiguous)
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+
+    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element (stored internally as a byte offset)
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+
+    byte_offset_ += pointer_offset * sizeof(Element);
+  }
+
+  /// Returns a pointer to the access selected by the current iteration indices
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+
+    AccessType *access_ptr = pointer_;
+
+    int access_offset = iteration_strided_ * ThreadMap::Delta::kStrided * stride_ +
+                        iteration_contiguous_ * ThreadMap::Delta::kContiguous /
+                            ThreadMap::kElementsPerAccess;
+
+    char *access_byte_ptr =
+        reinterpret_cast<char *>(access_ptr + access_offset);
+
+    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_);
+  }
+
+  /// Pre-increment: steps to the next access (contiguous index first, then strided), wrapping to zero.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iteration_contiguous_;
+
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
+      return *this;
+
+    // Enter here only if (iteration_contiguous_ ==
+    // ThreadMap::Iterations::kContiguous)
+    iteration_contiguous_ = 0;
+    ++iteration_strided_;
+
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+
+    // Enter here only if (iteration_strided_ == ThreadMap::Iterations::kStrided):
+    // every access of the tile has been visited, so the strided index wraps back to zero.
+    iteration_strided_ = 0;
+
+    return *this;
+  }
+
+  /// Post-increment: advances like operator++() and returns the state from before the advance.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+
+    RegularTileAccessIterator prev(*this);
+
+    this->operator++();
+
+    return prev;
+  }
+
+  /// Adds a tile offset, converted to an element offset and applied through add_pointer_offset()
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+
+    add_pointer_offset(
+      coord.contiguous() * Shape::kContiguous + 
+      coord.strided() * Shape::kStrided * stride_ * Layout::kElementsPerAccess);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile Iterator specialized for column-major 128b congruous TensorOp formats.
+/// Thin adapter: every operation is forwarded to the pitch-linear
+/// TensorOpMultiplicandCongruous128b iterator, with coordinates passed as (row, column).
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment>
+class RegularTileAccessIterator<
+    Shape_, Element_,
+    layout::ColumnMajorTensorOpMultiplicandCongruous128b,
+    AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for column-major iterator may along advance along the "
+      "columns(rank=0) or rows(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajorTensorOpMultiplicandCongruous128b;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ThreadMap = ThreadMap_;
+
+  /// Underlying iterator type (pitch-linear shape is <kRow, kColumn>; advance rank is kept as-is)
+  using UnderlyingIterator = RegularTileAccessIterator<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
+      layout::TensorOpMultiplicandCongruous128b,
+      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+ private:
+  /// Underlying iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
+                            int thread_id   ///< ID of each participating thread
+                            )
+      : iterator_({ref.data(), ref.stride()}, thread_id) {}
+
+  /// Overrides the internal iteration index (forwarded to the underlying iterator)
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Returns the underlying iterator's current access pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  /// Adds a tile offset; forwarded to the underlying iterator as {row, column}
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.row(), coord.column()});
+  }
+
+  /// Pre-increment: advances the underlying iterator by one step and returns *this.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment: advances the underlying iterator and returns a copy made before the advance.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+    RegularTileAccessIterator prev(*this);
+    ++iterator_;
+
+    return prev;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile Iterator specialized for row-major 128b congruous TensorOp formats.
+/// Thin adapter: every operation is forwarded to the pitch-linear
+/// TensorOpMultiplicandCongruous128b iterator, with coordinates passed as (column, row).
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment>
+class RegularTileAccessIterator<Shape_, Element_,
+                                layout::RowMajorTensorOpMultiplicandCongruous128b,
+                                AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for row-major iterator may along advance along the "
+      "columns(rank=0) or rows(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajorTensorOpMultiplicandCongruous128b;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ThreadMap = ThreadMap_;
+
+  /// Underlying iterator type (pitch-linear shape is <kColumn, kRow>; advance rank is swapped)
+  using UnderlyingIterator = RegularTileAccessIterator<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
+      layout::TensorOpMultiplicandCongruous128b,
+      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+ private:
+  /// Underlying iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator(
+    TensorRef ref,  ///< Pointer to start of tensor
+    int thread_id   ///< ID of each participating thread
+  ):
+    iterator_({ref.data(), ref.stride()}, thread_id) {}
+
+  /// Overrides the internal iteration index (forwarded to the underlying iterator)
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Returns the underlying iterator's current access pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  /// Adds a tile offset; forwarded to the underlying iterator as {column, row}
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.column(), coord.row()});
+  }
+
+  /// Pre-increment: advances the underlying iterator by one step and returns *this.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment: advances the underlying iterator and returns a copy made before the advance.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+    RegularTileAccessIterator prev(*this);
+    ++iterator_;
+
+    return prev;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Tile access iterator specialized for the 128x4 crosswise TensorOp arrangement
+/// (layout::TensorOpMultiplicandCrosswise128x4). Each call to get() yields a pointer to one
+/// 128-bit access; operator++ steps through the accesses of the tile and wraps at its end.
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment>
+class RegularTileAccessIterator<
+    Shape_, Element_,
+    layout::TensorOpMultiplicandCrosswise128x4,
+    AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::TensorOpMultiplicandCrosswise128x4;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using StrideIndex = typename Layout::Stride::Index;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ThreadMap = ThreadMap_;
+
+  static_assert(ThreadMap::kThreads / 32 > 1, 
+    "This tile iterator requires at least two warps.");
+
+  /// Internal details made public to facilitate introspection
+  struct Detail {
+    /// This iterator is specialized for an access size that is 128 bits in
+    /// length.
+    static int const kAccessSizeInBits = 128;
+
+    static_assert(sizeof_bits<Element_>::value *
+                          ThreadMap::kElementsPerAccess ==
+                      kAccessSizeInBits,
+                  "This iterator requires a policy whose access size is 128b");
+
+    ///< Number of pointers
+    static int const kPointerCount = 1;
+  };
+
+
+  static_assert(!(ThreadMap::Iterations::kStrided % 2), "This iterator requires at least two iterations along the strided dimension");
+
+  /// Element type per access
+  using AccessType = Array<Element, Layout::kElementsPerAccess>;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Stride value, in units of accesses (ref.stride(0) / Layout::kElementsPerAccess)
+  StrideIndex stride_;
+
+  /// Internal pointer to first access of tile
+  AccessType *pointer_;
+
+  /// Internal byte offset, accumulated by add_pointer_offset() and applied in get()
+  Index byte_offset_;
+
+  /// Iteration in the contiguous dimension
+  int iteration_contiguous_;
+
+  /// Iteration in the strided dimension
+  int iteration_strided_;
+
+ public:
+
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_DEVICE
+  RegularTileAccessIterator(
+    TensorRef ref,  ///< Pointer to start of tensor
+    int thread_id   ///< ID of each participating thread
+  ): 
+    stride_(ref.stride(0) / Layout::kElementsPerAccess),
+    byte_offset_(0) {
+
+    layout::PitchLinearCoord thread_offset_base = ThreadMap::initial_offset(thread_id);
+
+    // This is the offset of a thread within a threadblock tile for a specific
+    // pointer (units of elements)
+    layout::PitchLinearCoord thread_offset_in_threadblock_tile = thread_offset_base;
+
+    // initialize pointer to this thread's first access (tensor base plus the thread's layout offset)
+    pointer_ = reinterpret_cast<AccessType *>(ref.data() + ref.offset(thread_offset_in_threadblock_tile));
+
+    set_iteration_index(0);
+  }
+
+  /// Overrides the internal iteration index (index = strided * Iterations::kContiguous + contiguous)
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+
+    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element (stored internally as a byte offset)
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+
+    byte_offset_ += pointer_offset * sizeof(Element);
+  }
+
+  /// Returns a pointer to the access selected by the current iteration indices
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+
+    AccessType *access_ptr = pointer_;
+
+    int offset_c = (iteration_contiguous_ * ThreadMap::Delta::kContiguous + (iteration_strided_ & 1) * 2);
+    int offset_s = (iteration_strided_ / 2) * 8;
+
+    int access_offset = offset_c * stride_ + offset_s;
+
+    char *access_byte_ptr =
+        reinterpret_cast<char *>(access_ptr + access_offset);
+
+    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_);
+  }
+
+  /// Pre-increment: steps to the next access (contiguous index first, then strided), wrapping to zero.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iteration_contiguous_;
+
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
+      return *this;
+
+    // Enter here only if (iteration_contiguous_ ==
+    // ThreadMap::Iterations::kContiguous)
+    iteration_contiguous_ = 0;
+    ++iteration_strided_;
+
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+
+    // Enter here only if (iteration_strided_ == ThreadMap::Iterations::kStrided):
+    // every access of the tile has been visited, so the strided index wraps back to zero.
+    iteration_strided_ = 0;
+
+    return *this;
+  }
+
+  /// Post-increment: advances like operator++() and returns the state from before the advance.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+
+    RegularTileAccessIterator prev(*this);
+
+    this->operator++();
+
+    return prev;
+  }
+
+  /// Adds a tile offset, converted to an element offset and applied through add_pointer_offset()
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+
+    add_pointer_offset(
+      coord.contiguous() * Shape::kContiguous * stride_ + 
+      coord.strided() * Shape::kStrided * Layout::kElementsPerAccess);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile Iterator specialized for column-major 128x4 crosswise TensorOp formats.
+/// Thin adapter: every operation is forwarded to the pitch-linear
+/// TensorOpMultiplicandCrosswise128x4 iterator, with coordinates passed as (row, column).
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment>
+class RegularTileAccessIterator<
+    Shape_, Element_,
+    layout::ColumnMajorTensorOpMultiplicandCrosswise128x4,
+    AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for column-major iterator may along advance along the "
+      "columns(rank=0) or rows(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajorTensorOpMultiplicandCrosswise128x4;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ThreadMap = ThreadMap_;
+
+  /// Underlying iterator type (pitch-linear shape is <kRow, kColumn>; advance rank is kept as-is)
+  using UnderlyingIterator = RegularTileAccessIterator<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
+      layout::TensorOpMultiplicandCrosswise128x4,
+      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+ private:
+  /// Underlying iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
+                            int thread_id   ///< ID of each participating thread
+                            )
+      : iterator_({ref.data(), ref.stride()}, thread_id) {}
+
+  /// Overrides the internal iteration index (forwarded to the underlying iterator)
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Returns the underlying iterator's current access pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  /// Adds a tile offset; forwarded to the underlying iterator as {row, column}
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.row(), coord.column()});
+  }
+
+  /// Pre-increment: advances the underlying iterator by one step and returns *this.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment: advances the underlying iterator and returns a copy made before the advance.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+    RegularTileAccessIterator prev(*this);
+    ++iterator_;
+
+    return prev;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile Iterator specialized for row-major 128x4 crosswise TensorOp formats.
+/// Thin adapter: every operation is forwarded to the pitch-linear
+/// TensorOpMultiplicandCrosswise128x4 iterator, with coordinates passed as (column, row).
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment>
+class RegularTileAccessIterator<Shape_, Element_,
+                                layout::RowMajorTensorOpMultiplicandCrosswise128x4,
+                                AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for row-major iterator may along advance along the "
+      "columns(rank=0) or rows(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajorTensorOpMultiplicandCrosswise128x4;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ThreadMap = ThreadMap_;
+
+  /// Underlying iterator type (pitch-linear shape is <kColumn, kRow>; advance rank is swapped)
+  using UnderlyingIterator = RegularTileAccessIterator<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
+      layout::TensorOpMultiplicandCrosswise128x4,
+      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+ private:
+  /// Underlying iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator(
+    TensorRef ref,  ///< Pointer to start of tensor
+    int thread_id   ///< ID of each participating thread
+  ):
+    iterator_({ref.data(), ref.stride()}, thread_id) {}
+
+  /// Overrides the internal iteration index (forwarded to the underlying iterator)
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Returns the underlying iterator's current access pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  /// Adds a tile offset; forwarded to the underlying iterator as {column, row}
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.column(), coord.row()});
+  }
+
+  /// Pre-increment: advances the underlying iterator by one step and returns *this.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment: advances the underlying iterator and returns a copy made before the advance.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+    RegularTileAccessIterator prev(*this);
+    ++iterator_;
+
+    return prev;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace transform
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator.h
new file mode 100755
index 000000000..d09c23892
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator.h
@@ -0,0 +1,62 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing storing of tiles from pitch-linear rank=2 tensors. 
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Shape,
+  typename Element,
+  typename Layout,
+  int AdvanceRank,
+  typename ThreadMap,
+  int Alignment = sizeof_bits<Element>::value * ThreadMap::kElementsPerAccess / 8  // element bits * elements per access / 8
+>
+class RegularTileIterator;  // declaration only: this header provides no definition of the primary template
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace transform
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h
new file mode 100755
index 000000000..1e04c4262
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h
@@ -0,0 +1,552 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading and storing of tiles from pitch-linear rank=2 tensors.
+
+    The iterators in this file do not guard out-of-bounds accesses: they hold no predicate masks,
+    so every tile that is loaded or stored must lie entirely inside the referenced tensor.
+
+    Byte increments are precomputed at construction to limit the state kept in registers,
+    and integer addition is used to advance the pointer through memory.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/pitch_linear.h"
+
+#include "cutlass/transform/threadblock/regular_tile_iterator.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Regular tile iterator specialized for pitch-linear.  This one is used by 2-stage SIMT kernels
+/// and sparse tensor core meta data.  Accesses are not predicated (no bounds checking is done).
+template <
+  typename Shape_,
+  typename Element_,
+  int AdvanceRank,
+  typename ThreadMap_,
+  int Alignment
+>
+class RegularTileIterator<Shape_, Element_, layout::PitchLinear, AdvanceRank, ThreadMap_, Alignment> {
+public:
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  static int const kAlignment = Alignment;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using StrideIndex = typename Layout::Stride::Index;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  /// Per-thread storage for one tile: one AccessType-sized vector per thread-map iteration
+  using Fragment = Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+  /// Vector of kElementsPerAccess elements moved by a single memory access
+  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess, kAlignment>;
+
+  static_assert(kAdvanceRank == 0 || kAdvanceRank == 1, 
+    "Advance rank may only be along the contiguous or strided dimensions.");
+
+private:
+
+  //
+  // All offsets stored below except stride_ are byte quantities applied to pointer_
+  //
+
+  //
+  // Data members
+  //
+
+  /// Byte pointer to this thread's first access within the current tile
+  uint8_t *pointer_;
+
+  /// Stride of the tensor, in units of Element (copied from ref.stride()[0])
+  StrideIndex stride_;
+
+  /// Amount (in bytes) to increment pointer along strided dimension
+  Index increment_strided_;
+
+  /// Amount (in bytes) to advance pointer between tiles
+  Index increment_advance_;
+
+public:
+
+  /// Default constructor: null pointer and all strides/increments zeroed
+  CUTLASS_DEVICE
+  RegularTileIterator(): pointer_(nullptr), stride_(0), increment_strided_(0), increment_advance_(0) { }
+
+  /// Constructs from a tensor reference and the index of the calling thread
+  CUTLASS_DEVICE
+  RegularTileIterator(
+    TensorRef const &ref, 
+    int thread_idx
+  ): 
+    pointer_(reinterpret_cast<uint8_t *>(ref.data()) + (ref.offset(ThreadMap::initial_offset(thread_idx)) * sizeof_bits<Element>::value / 8)) {
+    // Increments are precomputed in bytes so that advancing is a single pointer addition.
+    stride_ = ref.stride()[0];
+    increment_strided_ = (ref.stride()[0] * sizeof_bits<Element>::value) * ThreadMap::Delta::kStrided / 8;
+    
+    increment_advance_ = 
+      (kAdvanceRank == 0 ? 
+        Shape::kContiguous * sizeof_bits<Element>::value / 8 : 
+        Shape::kStrided * (ref.stride()[0] * sizeof_bits<Element>::value / 8));
+  }
+
+  /// Loads a fragment from the current tile displaced by pointer_offset (in units of Element)
+  CUTLASS_HOST_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+    uint8_t const *byte_pointer = pointer_ + pointer_offset * sizeof_bits<Element>::value / 8;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+
+      AccessType const *access_ptr = reinterpret_cast<AccessType const *>(byte_pointer);
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+
+        int idx = c + s * ThreadMap::Iterations::kContiguous;
+        frag_ptr[idx] = access_ptr[c * ThreadMap::Delta::kContiguous /
+                                   ThreadMap::kElementsPerAccess];
+      }
+
+      if (s + 1 < ThreadMap::Iterations::kStrided) {
+        byte_pointer += increment_strided_;
+      }
+    }
+  }
+
+  /// Loads a fragment from the tile located tile_offset whole tiles away; the offset is computed in Elements, as in store()
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag, TensorCoord const & tile_offset) {
+    load_with_pointer_offset(
+      frag, 
+      tile_offset.contiguous() * Shape::kContiguous + 
+        tile_offset.strided() * Shape::kStrided * stride_
+    );
+  }
+
+  /// Loads a fragment from the current tile
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Stores a fragment to the current tile displaced by pointer_offset (in units of Element)
+  CUTLASS_HOST_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const*>(&frag);
+    uint8_t *byte_pointer = pointer_ + pointer_offset * sizeof_bits<Element>::value / 8;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+
+      AccessType *access_ptr = reinterpret_cast<AccessType *>(byte_pointer);
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+
+        int idx = c + s * ThreadMap::Iterations::kContiguous;
+        access_ptr[c * ThreadMap::Delta::kContiguous /
+                   ThreadMap::kElementsPerAccess] = frag_ptr[idx];
+      }
+
+      if (s + 1 < ThreadMap::Iterations::kStrided) {
+        byte_pointer += increment_strided_;
+      }
+    }
+  }
+
+  /// Stores a fragment to the tile located tile_offset whole tiles away from the current tile
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag, TensorCoord const & tile_offset) {
+    store_with_pointer_offset(
+      frag,
+      tile_offset.contiguous() * Shape::kContiguous + tile_offset.strided() * Shape::kStrided * stride_
+    );
+  }
+
+  /// Stores a fragment to the current tile
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+
+  /// Advances the pointer by one tile along the advance rank
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator &operator++() {
+    pointer_ += increment_advance_;
+    return *this;
+  }
+
+  /// Moves the pointer back by one tile along the advance rank
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator &operator--() {
+    pointer_ -= increment_advance_;
+    return *this;
+  }
+
+  /// Adds a pointer offset in units of Element (converted to bytes before it is applied to pointer_)
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+
+  /// Adds a tile offset in the unit of tile.
+  /// In GEMM/Conv implementation, this is used to move in the k dimension in the shared memory.
+  /// Below layouts are the shared memory layouts.  Current SM50 SIMT kernels only use col major A and row major B.
+  ///   For row major A operand, k dimension is contiguous dimension;
+  ///   For col major A operand, k dimension is strided dimension;
+  ///   For row major B operand, k dimension is strided dimension;
+  ///   For col major B operand, k dimension is contiguous dimension.
+  /// Below two classes map col/row major to the pitch linear coordinates used
+  /// in this base class.
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    LongIndex offset = LongIndex(coord.contiguous()) * Shape::kContiguous +
+        LongIndex(coord.strided()) * Shape::kStrided * stride_;
+    add_pointer_offset(offset);
+  }
+
+  /// No-op: the index argument is ignored by this iterator
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+  }
+
+  /// Returns the current byte pointer reinterpreted as AccessType *
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+#if 0
+    AccessType *access_ptr = pointer_[iteration_strided_ & 1];
+    int stride_idx = (iteration_strided_ & ~1);
+
+    int access_offset = stride_idx * ThreadMap::Delta::kStrided * stride_ +
+                        iteration_contiguous_ * ThreadMap::Delta::kContiguous /
+                            ThreadMap::kElementsPerAccess;
+
+    char *access_byte_ptr =
+        reinterpret_cast<char *>(access_ptr + access_offset);
+    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_);
+#endif
+    return reinterpret_cast<AccessType *>(pointer_);
+  }
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Regular tile iterator specialized for row major; a thin adapter over the pitch-linear iterator
+template <
+  typename Shape_,
+  typename Element_,
+  int AdvanceRank,
+  typename ThreadMap_,
+  int Alignment
+>
+class RegularTileIterator<Shape_, Element_, layout::RowMajor, AdvanceRank, ThreadMap_, Alignment> {
+public:
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  static int const kAlignment = Alignment;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Fragment = Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+  /// Pitch-linear iterator doing the actual work: shape is (kColumn, kRow) and the advance rank is swapped
+  using Underlying = RegularTileIterator<
+    layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
+    Element,
+    layout::PitchLinear,
+    (kAdvanceRank == 0 ? 1 : 0),
+    ThreadMap,
+    kAlignment
+  >;
+
+  using AccessType = typename Underlying::AccessType;
+
+  static_assert(kAdvanceRank == 0 || kAdvanceRank == 1, 
+    "Advance rank may only be along the row or column dimensions.");
+
+private:
+
+  Underlying iterator_;
+
+public:
+
+  /// Default constructor: default-constructs the underlying pitch-linear iterator
+  CUTLASS_DEVICE
+  RegularTileIterator() { }
+
+  /// Constructs from a tensor reference and thread index, forwarding ref's pointer and stride
+  CUTLASS_DEVICE
+  RegularTileIterator(
+    TensorRef const &ref, 
+    int thread_idx
+  ):
+    iterator_({ref.data(), ref.stride()}, thread_idx) {
+
+  }
+
+  /// Loads a fragment, forwarding pointer_offset unchanged to the underlying iterator
+  CUTLASS_HOST_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment at a tile offset; the coordinate is handed on as (column, row), as in add_tile_offset()
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag, TensorCoord const & tile_offset) {
+    iterator_.load(frag, {tile_offset.column(), tile_offset.row()});
+  }
+
+  /// Loads a fragment from the current tile
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) {
+    iterator_.load_with_pointer_offset(frag, 0);
+  }
+
+  /// Stores a fragment, forwarding pointer_offset unchanged to the underlying iterator
+  CUTLASS_HOST_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Stores a fragment at a tile offset; the coordinate is handed on as (column, row), as in add_tile_offset()
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag, TensorCoord const & tile_offset) {
+    iterator_.store(frag, {tile_offset.column(), tile_offset.row()});
+  }
+
+  /// Stores a fragment to the current tile
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag) {
+    iterator_.store_with_pointer_offset(frag, 0);
+  }
+
+  /// Advances the underlying iterator by one tile
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Moves the underlying iterator back by one tile
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator &operator--() {
+    --iterator_;
+    return *this;
+  }
+
+  /// Forwards a pointer offset unchanged to the underlying pitch-linear iterator
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Adds a tile offset; the coordinate is handed to the underlying iterator as (column, row)
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.column(), coord.row()});
+  }
+
+  /// No-op: the index argument is ignored by this iterator
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+  }
+
+  /// Returns the pointer reported by the underlying iterator
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return iterator_.get();
+  }
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Regular tile iterator specialized for column major; a thin adapter over the pitch-linear iterator
+template <
+  typename Shape_,
+  typename Element_,
+  int AdvanceRank,
+  typename ThreadMap_,
+  int Alignment
+>
+class RegularTileIterator<Shape_, Element_, layout::ColumnMajor, AdvanceRank, ThreadMap_, Alignment> {
+public:
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  static int const kAlignment = Alignment;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Fragment = Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+  /// Pitch-linear iterator doing the actual work: shape is (kRow, kColumn); kAlignment is passed through
+  using Underlying = RegularTileIterator<
+    layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
+    Element,
+    layout::PitchLinear,
+    (kAdvanceRank == 0 ? 0 : 1),
+    ThreadMap,
+    kAlignment>;
+
+  using AccessType = typename Underlying::AccessType;
+
+  static_assert(kAdvanceRank == 0 || kAdvanceRank == 1, 
+    "Advance rank may only be along the row or column dimensions.");
+
+private:
+
+  Underlying iterator_;
+
+public:
+
+  /// Default constructor: default-constructs the underlying pitch-linear iterator
+  CUTLASS_DEVICE
+  RegularTileIterator() { }
+
+  /// Constructs from a tensor reference and thread index, forwarding ref's pointer and stride
+  CUTLASS_DEVICE
+  RegularTileIterator(
+    TensorRef const &ref, 
+    int thread_idx
+  ):
+    iterator_({ref.data(), ref.stride()}, thread_idx) {
+
+  }
+
+  /// Loads a fragment, forwarding pointer_offset unchanged to the underlying iterator
+  CUTLASS_HOST_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment at a tile offset; the coordinate is handed on as (row, column), as in add_tile_offset()
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag, TensorCoord const & tile_offset) {
+    iterator_.load(frag, {tile_offset.row(), tile_offset.column()});
+  }
+
+  /// Loads a fragment from the current tile
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) {
+    iterator_.load_with_pointer_offset(frag, 0);
+  }
+
+  /// Stores a fragment, forwarding pointer_offset unchanged to the underlying iterator
+  CUTLASS_HOST_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Stores a fragment at a tile offset; the coordinate is handed on as (row, column), as in add_tile_offset()
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag, TensorCoord const & tile_offset) {
+    iterator_.store(frag, {tile_offset.row(), tile_offset.column()});
+  }
+
+  /// Stores a fragment to the current tile
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag) {
+    iterator_.store_with_pointer_offset(frag, 0);
+  }
+
+  /// Advances the underlying iterator by one tile
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Moves the underlying iterator back by one tile
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator &operator--() {
+    --iterator_;
+    return *this;
+  }
+
+  /// Forwards a pointer offset unchanged to the underlying pitch-linear iterator
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Adds a tile offset; the coordinate is handed to the underlying iterator as (row, column)
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.row(), coord.column()});
+  }
+
+  /// No-op: the index argument is ignored by this iterator
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+  }
+
+  /// Returns the pointer reported by the underlying iterator
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return iterator_.get();
+  }
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace transform
+} // namespace cutlass
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear_2dthreadtile.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear_2dthreadtile.h
new file mode 100755
index 000000000..7fd495984
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear_2dthreadtile.h
@@ -0,0 +1,509 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of tiles from pitch-linear rank=2 tensors. 
+
+    This iterator uses masks to guard out-of-bounds accesses and visits the last "residue" tile
+    first, with the objective of minimizing predicate mask updates during steady-state operation.
+
+    A precomputed "Params" object minimizes the amount of state that must be stored in registers,
+    and integer addition is used to advance the pointer through memory.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/pitch_linear.h"
+
+#include "cutlass/transform/threadblock/regular_tile_iterator.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <
+  typename Shape,
+  typename Element,
+  typename Layout,
+  int AdvanceRank,
+  typename ThreadMap,
+  int Alignment = sizeof_bits<Element>::value * ThreadMap::kElementsPerAccess / 8  // element bits * kElementsPerAccess / 8
+>
+class RegularTileIterator2dThreadTile;  // declaration only; layout-specific partial specializations follow below
+
+
+/// Regular tile iterator specialized for pitch-linear + 2d thread-tiled threadmapping (unpredicated accesses)
+template <
+  typename Shape_,
+  typename Element_,
+  int AdvanceRank,
+  typename ThreadMap_,
+  int Alignment
+>
+class RegularTileIterator2dThreadTile<Shape_, Element_, layout::PitchLinear, AdvanceRank, ThreadMap_, Alignment> {
+public:
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  static int const kAlignment = Alignment;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using StrideIndex = typename Layout::Stride::Index;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Fragment = Array<Element, ThreadMap::Iterations::kCount * ThreadMap::ThreadAccessShape::kCount>;
+
+  static_assert(kAdvanceRank == 0 || kAdvanceRank == 1, 
+    "Advance rank may only be along the contiguous or strided dimensions.");
+
+private:
+
+  //
+  // Types
+  //
+  /// Vector of ThreadAccessShape::kCount elements moved by a single memory access
+  using AccessType = AlignedArray<Element, ThreadMap::ThreadAccessShape::kCount, kAlignment>;
+
+  //
+  // Data members
+  //
+
+  /// Byte pointer to this thread's first access within the current tile
+  uint8_t *pointer_;
+
+  /// Stride in units of Element, already divided by the interleave factor
+  StrideIndex stride_;
+
+  /// Amount (in bytes) to increment pointer along strided dimension
+  LongIndex increment_strided_;
+
+  /// Amount (in bytes) to advance pointer between tiles
+  LongIndex increment_advance_;
+
+public:
+
+  /// Default constructor: null pointer and all strides/increments zeroed
+  CUTLASS_DEVICE
+  RegularTileIterator2dThreadTile(): pointer_(nullptr), stride_(0), increment_strided_(0), increment_advance_(0) { }
+
+  /// Constructs from a tensor reference, the calling thread's index and the interleave factor
+  CUTLASS_DEVICE
+  RegularTileIterator2dThreadTile(
+    TensorRef const &ref, 
+    int thread_idx,
+    int interleave
+  ){ 
+    // Initial element offset: contiguous index scaled up by interleave, stride scaled down by it.
+    TensorCoord t = ThreadMap::initial_offset(thread_idx);
+    long int offset = t[0] * interleave + t[1] * ref.stride()[0]/interleave;
+    pointer_ = reinterpret_cast<uint8_t *>(ref.data() + offset);
+
+    stride_ = ref.stride()[0] / interleave;
+    increment_strided_ = (ref.stride()[0] * sizeof_bits<Element>::value / 8) * ThreadMap::Delta::kStrided / interleave;
+
+    increment_advance_ = 
+      (kAdvanceRank == 0 ? 
+        Shape::kContiguous * sizeof_bits<Element>::value / 8 : 
+        Shape::kStrided * (ref.stride()[0] * sizeof_bits<Element>::value / 8) / interleave);
+  }
+
+  /// Loads a fragment from the current tile displaced by pointer_offset (in units of Element)
+  CUTLASS_HOST_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+    uint8_t const *byte_pointer = pointer_ + pointer_offset * sizeof_bits<Element>::value / 8;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+
+      AccessType const *access_ptr = reinterpret_cast<AccessType const *>(byte_pointer);
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+
+        int idx = c + s * ThreadMap::Iterations::kContiguous;
+        frag_ptr[idx] = access_ptr[c * ThreadMap::Delta::kContiguous / ThreadMap::ThreadAccessShape::kStrided];
+      }
+
+      if (s + 1 < ThreadMap::Iterations::kStrided) {
+        byte_pointer += increment_strided_;
+      }
+    }
+  }
+
+  /// Loads a fragment from the tile located tile_offset whole tiles away; the offset is computed in Elements, as in store()
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag, TensorCoord const & tile_offset) {
+    load_with_pointer_offset(
+      frag, 
+      tile_offset.contiguous() * Shape::kContiguous + 
+        tile_offset.strided() * Shape::kStrided * stride_
+    );
+  }
+
+  /// Loads a fragment from the current tile
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Stores a fragment to the current tile displaced by pointer_offset (in units of Element)
+  CUTLASS_HOST_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const*>(&frag);
+    uint8_t *byte_pointer = pointer_ + pointer_offset * sizeof_bits<Element>::value / 8;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+
+      AccessType *access_ptr = reinterpret_cast<AccessType *>(byte_pointer);
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+
+        int idx = c + s * ThreadMap::Iterations::kContiguous;
+        access_ptr[c * ThreadMap::Delta::kContiguous / ThreadMap::ThreadAccessShape::kStrided] = frag_ptr[idx];
+      }
+
+      if (s + 1 < ThreadMap::Iterations::kStrided) {
+        byte_pointer += increment_strided_;
+      }
+    }
+  }
+
+  /// Stores a fragment to the tile located tile_offset whole tiles away from the current tile
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag, TensorCoord const & tile_offset) {
+    store_with_pointer_offset(
+      frag,
+      tile_offset.contiguous() * Shape::kContiguous + tile_offset.strided() * Shape::kStrided * stride_
+    );
+  }
+
+  /// Stores a fragment to the current tile
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+
+  /// Advances the pointer by one tile along the advance rank
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator2dThreadTile &operator++() {
+    pointer_ += increment_advance_;
+    return *this;
+  }
+
+  /// Moves the pointer back by one tile along the advance rank
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator2dThreadTile &operator--() {
+    pointer_ -= increment_advance_;
+    return *this;
+  }
+
+  /// Adds a pointer offset in units of Element (converted to bytes before it is applied to pointer_)
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+
+  /// Adds a tile offset, in units of whole tiles along each pitch-linear dimension
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    LongIndex offset = LongIndex(coord.contiguous()) * Shape::kContiguous +
+        LongIndex(coord.strided()) * Shape::kStrided * stride_;
+    add_pointer_offset(offset);
+  }
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Regular tile iterator specialized for row-major interleaved (factor 4) layout + 2d thread-tiled threadmapping
+template <
+  typename Shape_,
+  typename Element_,
+  int AdvanceRank,
+  typename ThreadMap_,
+  int Alignment
+>
+class RegularTileIterator2dThreadTile<Shape_, Element_, layout::RowMajorInterleaved<4>, AdvanceRank, ThreadMap_, Alignment> {
+public:
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajorInterleaved<4>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  static int const kAlignment = Alignment;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Fragment = Array<Element, ThreadMap::Iterations::kCount * ThreadMap::ThreadAccessShape::kCount>;
+  /// Pitch-linear iterator doing the actual work: shape is (kColumn, kRow) and the advance rank is swapped
+  using Underlying = RegularTileIterator2dThreadTile<
+    layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
+    Element,
+    layout::PitchLinear,
+    (kAdvanceRank == 0 ? 1 : 0),
+    ThreadMap,
+    kAlignment
+  >;
+
+  static_assert(kAdvanceRank == 0 || kAdvanceRank == 1, 
+    "Advance rank may only be along the row or column dimensions.");
+
+private:
+
+  Underlying iterator_;
+
+public:
+
+  /// Default constructor: default-constructs the underlying pitch-linear iterator
+  CUTLASS_DEVICE
+  RegularTileIterator2dThreadTile() { }
+
+  /// Constructs from a tensor reference and thread index; the interleave factor 4 is passed to the underlying iterator
+  CUTLASS_DEVICE
+  RegularTileIterator2dThreadTile(
+    TensorRef const &ref, 
+    int thread_idx
+  ):
+    iterator_({ref.data(), ref.stride()}, thread_idx, 4) {
+
+  }
+
+  /// Loads a fragment, forwarding pointer_offset unchanged to the underlying iterator
+  CUTLASS_HOST_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment at a tile offset; the coordinate is handed on as (column, row), as in add_tile_offset()
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag, TensorCoord const & tile_offset) {
+    iterator_.load(frag, {tile_offset.column(), tile_offset.row()});
+  }
+
+  /// Loads a fragment from the current tile
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) {
+    iterator_.load_with_pointer_offset(frag, 0);
+  }
+
+  /// Stores a fragment, forwarding pointer_offset unchanged to the underlying iterator
+  CUTLASS_HOST_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Stores a fragment at a tile offset; the coordinate is handed on as (column, row), as in add_tile_offset()
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag, TensorCoord const & tile_offset) {
+    iterator_.store(frag, {tile_offset.column(), tile_offset.row()});
+  }
+
+  /// Stores a fragment to the current tile
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag) {
+    iterator_.store_with_pointer_offset(frag, 0);
+  }
+
+  /// Advances the underlying iterator by one tile
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator2dThreadTile &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Moves the underlying iterator back by one tile
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator2dThreadTile &operator--() {
+    --iterator_;
+    return *this;
+  }
+
+  /// Forwards a pointer offset unchanged to the underlying pitch-linear iterator
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Adds a tile offset; the coordinate is handed to the underlying iterator as (column, row)
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.column(), coord.row()});
+  }
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Regular tile iterator for layout::ColumnMajorInterleaved<4> with a 2d thread-tiled thread map; a thin adapter that forwards every operation to a layout::PitchLinear iterator (Underlying)
+template <
+  typename Shape_,
+  typename Element_,
+  int AdvanceRank,
+  typename ThreadMap_,
+  int Alignment
+>
+class RegularTileIterator2dThreadTile<Shape_, Element_, layout::ColumnMajorInterleaved<4>, AdvanceRank, ThreadMap_, Alignment> {
+public:
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajorInterleaved<4>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  static int const kAlignment = Alignment;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Fragment = Array<Element, ThreadMap::Iterations::kCount * ThreadMap::ThreadAccessShape::kCount>;
+  using PitchLinearThreadMap = PitchLinearStripminedThreadMap< layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, 
+                                  ThreadMap::kThreads, ThreadMap::ThreadAccessShape::kCount >;
+  // NOTE(review): PitchLinearThreadMap is not referenced again inside this class; Underlying below is given ThreadMap itself.
+  /// Iterator that does the work: pitch-linear shape <kRow, kColumn>, kAdvanceRank passed through as 0 or 1. kAlignment is not forwarded, so the sixth template argument is left to its default.
+  using Underlying = RegularTileIterator2dThreadTile<
+    layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
+    Element,
+    layout::PitchLinear,
+    (kAdvanceRank == 0 ? 0 : 1),
+    ThreadMap
+  >;
+
+  static_assert(kAdvanceRank == 0 || kAdvanceRank == 1, 
+    "Advance rank may only be along the row or column dimensions.");
+
+private:
+
+  Underlying iterator_;
+
+public:
+
+  CUTLASS_DEVICE
+  RegularTileIterator2dThreadTile() { }
+
+  CUTLASS_DEVICE
+  RegularTileIterator2dThreadTile(
+    TensorRef const &ref, 
+    int thread_idx
+  ):
+    iterator_({ref.data(), ref.stride()}, thread_idx, 4) {
+    // NOTE(review): the trailing 4 presumably corresponds to the layout's interleave factor - confirm against Underlying's constructor.
+  }
+
+  /// Loads a fragment, forwarding pointer_offset unchanged to the underlying iterator
+  CUTLASS_HOST_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment at a tile offset. NOTE(review): a {row, column} brace list is passed as the second argument of load_with_pointer_offset, which this class declares as a scalar Index - confirm Underlying accepts it.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag, TensorCoord const & tile_offset) {
+    iterator_.load_with_pointer_offset(frag, {tile_offset.row(), tile_offset.column()});
+  }
+
+  /// Loads a fragment with a pointer offset of zero
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) {
+    iterator_.load_with_pointer_offset(frag, 0);
+  }
+
+  /// Stores a fragment, forwarding pointer_offset unchanged to the underlying iterator
+  CUTLASS_HOST_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Stores a fragment at a tile offset. NOTE(review): same brace-list-for-Index question as load(frag, tile_offset) - confirm Underlying accepts it.
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag, TensorCoord const & tile_offset) {
+    iterator_.store_with_pointer_offset(frag, {tile_offset.row(), tile_offset.column()});
+  }
+
+  /// Stores a fragment with a pointer offset of zero
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag) {
+    iterator_.store_with_pointer_offset(frag, 0);
+  }
+
+  /// Pre-increment: forwards to the underlying iterator's operator++ and returns *this
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator2dThreadTile &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Pre-decrement: forwards to the underlying iterator's operator-- and returns *this
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator2dThreadTile &operator--() {
+    --iterator_;
+    return *this;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Adds a tile offset; (row, column) is handed to the underlying iterator in that order
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.row(), coord.column()});
+  }
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace transform
+} // namespace cutlass
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h
new file mode 100755
index 000000000..1308f45eb
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h
@@ -0,0 +1,1107 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing storing of tiles from pitch-linear rank=2 tensors. 
+*/
+
+#pragma once
+
+#include "cutlass/transform/threadblock/regular_tile_iterator.h"
+#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile iterator specialized for congruous arrangements for TensorOps
+/// Addresses are produced by a RegularTileAccessIterator; each access copies one AccessType between memory and the fragment.
+///
+/// Satisfies: ForwardTileIteratorConcept | 
+///            ReadableContiguousTileIteratorConcept | 
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment, int Crosswise>
+class RegularTileIterator<
+    Shape_, Element_,
+    layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
+                                          Crosswise>,
+    AdvanceRank, ThreadMap_, Alignment> {
+ public:
+
+  static_assert(AdvanceRank == 0 || AdvanceRank == 1, 
+    "Specialization for pitch-linear iterator may along advance along the "
+    "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout =
+      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
+                                            Crosswise>;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ThreadMap = ThreadMap_;
+
+  /// Internal details made public to facilitate introspection
+  struct Detail {
+
+    /// This iterator is specialized for an access size that is 128 bits in length.
+    static int const kAccessSizeInBits = 128;
+
+    static_assert(
+      sizeof_bits<Element_>::value * ThreadMap::kElementsPerAccess == kAccessSizeInBits,
+      "This iterator requires a policy whose access size is 128bs");
+  };
+
+private:
+
+  /// Vector of Layout::kElementsPerAccess elements moved by each access
+  using AccessType = Array<Element, Layout::kElementsPerAccess>;
+
+public:
+
+  /// Fragment object to be loaded or stored: Layout::kElementsPerAccess elements for each of ThreadMap::Iterations::kCount accesses
+  using Fragment = Array<Element, ThreadMap::Iterations::kCount * Layout::kElementsPerAccess>;
+
+  /// Underlying iterator to compute the addresses
+  using TileAccessIterator = RegularTileAccessIterator<Shape, Element, Layout,
+                                                       kAdvanceRank, ThreadMap>;
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Data member to the tile access iterator
+  TileAccessIterator address_iterator_;
+
+public:
+
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator(TensorRef ref,  ///< Pointer to start of tensor
+                      int thread_id   ///< ID of each participating thread
+                      )
+      : address_iterator_(ref, thread_id) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    address_iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory by adding a tile offset of {0, 1}, whatever kAdvanceRank is.
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator &operator++() {
+    address_iterator_.add_tile_offset({0, 1});
+    return *this;
+  }
+
+  /// Post-increment: advances to the next tile and returns a copy of the iterator taken beforehand.
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator operator++(int) {
+    RegularTileIterator prev(*this);
+    this->operator++();
+
+    return prev;
+  }
+
+  /// Adds a tile offset, forwarded unchanged to the address iterator
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    address_iterator_.add_tile_offset(coord);
+  }
+
+  /// Loads a fragment from memory; pointer_offset (in units of Element) is converted to a byte offset
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    load_with_byte_offset(frag, pointer_offset * sizeof_bits<Element>::value / 8);
+  }
+
+  /// Loads a fragment from memory, reading each access byte_offset bytes past the address iterator's pointer
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment &frag, Index byte_offset) {
+    address_iterator_.set_iteration_index(0);
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        int access_idx = c + s * ThreadMap::Iterations::kContiguous;
+
+        char const *byte_ptr = reinterpret_cast<char const *>(address_iterator_.get()) + byte_offset;
+        AccessType const *access_ptr = reinterpret_cast<AccessType const *>(byte_ptr);
+
+        frag_ptr[access_idx] = *access_ptr;
+        ++address_iterator_;
+      }
+    }
+  }
+
+  /// Loads a fragment from memory with a pointer offset of zero
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Store a fragment to memory; pointer_offset (in units of Element) is converted to a byte offset
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    store_with_byte_offset(frag, pointer_offset * sizeof_bits<Element>::value / 8);
+  }
+
+  /// Store a fragment to memory, writing each access byte_offset bytes past the address iterator's pointer
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const &frag, Index byte_offset) {  
+    address_iterator_.set_iteration_index(0);
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        int access_idx = c + s * ThreadMap::Iterations::kContiguous;
+
+        char *byte_ptr = reinterpret_cast<char *>(address_iterator_.get()) + byte_offset;
+        AccessType *access_ptr = reinterpret_cast<AccessType *>(byte_ptr);
+
+        *access_ptr = frag_ptr[access_idx];
+        ++address_iterator_;
+      }
+    }
+  }
+
+  /// Store a fragment to memory with a byte offset of zero
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) {
+    store_with_byte_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile Iterator specialized for column-major congruous TensorOp formats.
+/// Thin adapter: every operation is forwarded to a pitch-linear TensorOpMultiplicandCongruous iterator (UnderlyingIterator).
+///
+/// Satisfies: ForwardTileIteratorConcept | 
+///            ReadableContiguousTileIteratorConcept | 
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment, int Crosswise>
+class RegularTileIterator<
+    Shape_, Element_,
+    layout::ColumnMajorTensorOpMultiplicandCongruous<
+        sizeof_bits<Element_>::value, Crosswise>,
+    AdvanceRank, ThreadMap_, Alignment> {
+ public:
+
+  static_assert(AdvanceRank == 0 || AdvanceRank == 1, 
+    "Specialization for column-major iterator may along advance along the "
+    "columns(rank=0) or rows(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajorTensorOpMultiplicandCongruous<
+      sizeof_bits<Element_>::value, Crosswise>;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ThreadMap = ThreadMap_;
+
+  /// Underlying iterator type: pitch-linear shape <kRow, kColumn>, kAdvanceRank passed through as 0 or 1. Alignment is not forwarded, so that template argument is left to its default.
+  using UnderlyingIterator = RegularTileIterator<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
+      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
+                                            Crosswise>,
+      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;
+
+ public:
+
+  /// Fragment object to be loaded or stored; same element count as the underlying iterator's Fragment
+  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
+
+private:
+
+  /// Underlying iterator
+  UnderlyingIterator iterator_;
+
+public:
+
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator(
+    TensorRef ref,                              ///< Pointer to start of tensor
+    int thread_id                               ///< ID of each participating thread
+  ): iterator_({ref.data(), ref.stride()}, thread_id) {
+
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Adds a tile offset; (row, column) is handed to the underlying iterator in that order
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.row(), coord.column()});
+  }
+
+  /// Advances to the next tile in memory by pre-incrementing the underlying iterator.
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment: advances to the next tile and returns a copy of the iterator taken beforehand.
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator operator++(int) {
+    RegularTileIterator prev(*this);
+    ++iterator_;
+
+    return prev;
+  }
+
+  /// Loads a fragment from memory, forwarding pointer_offset unchanged to the underlying iterator
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory with a pointer offset of zero
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Store a fragment to memory, forwarding pointer_offset unchanged to the underlying iterator
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(
+    Fragment const &frag, 
+    Index pointer_offset) {
+    
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Store a fragment to memory with a pointer offset of zero
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile Iterator specialized for row-major congruous TensorOp formats.
+/// Thin adapter: every operation is forwarded to a pitch-linear TensorOpMultiplicandCongruous iterator (UnderlyingIterator).
+///
+/// Satisfies: ForwardTileIteratorConcept | 
+///            ReadableContiguousTileIteratorConcept | 
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment, int Crosswise>
+class RegularTileIterator<
+    Shape_, Element_,
+    layout::RowMajorTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
+                                                  Crosswise>,
+    AdvanceRank, ThreadMap_, Alignment> {
+ public:
+
+  static_assert(AdvanceRank == 0 || AdvanceRank == 1, 
+    "Specialization for row-major iterator may along advance along the "
+    "columns(rank=0) or rows(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajorTensorOpMultiplicandCongruous<
+      sizeof_bits<Element_>::value, Crosswise>;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ThreadMap = ThreadMap_;
+
+  /// Underlying iterator type: pitch-linear shape <kColumn, kRow>, with the advance rank flipped (0 becomes 1, 1 becomes 0). Alignment is not forwarded, so that template argument is left to its default.
+  using UnderlyingIterator = RegularTileIterator<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
+      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
+                                            Crosswise>,
+      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;
+
+ public:
+
+  /// Fragment object to be loaded or stored; same element count as the underlying iterator's Fragment
+  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
+
+private:
+
+  /// Underlying iterator
+  UnderlyingIterator iterator_;
+
+public:
+
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator(
+    TensorRef ref,                              ///< Pointer to start of tensor
+    int thread_id                               ///< ID of each participating thread
+  ): iterator_({ref.data(), ref.stride()}, thread_id) {
+
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  
+  /// Adds a tile offset; the coordinate is swapped to (column, row) before it reaches the underlying iterator
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.column(), coord.row()});
+  }
+
+  /// Advances to the next tile in memory by pre-incrementing the underlying iterator.
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator &operator++() {
+
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment: advances to the next tile and returns a copy of the iterator taken beforehand.
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator operator++(int) {
+
+    RegularTileIterator prev(*this);
+    ++iterator_;
+
+    return prev;
+  }
+
+  /// Loads a fragment from memory, forwarding pointer_offset unchanged to the underlying iterator
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory with a pointer offset of zero
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Store a fragment to memory, forwarding pointer_offset unchanged to the underlying iterator
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(
+    Fragment const &frag, 
+    Index pointer_offset) {
+    
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Store a fragment to memory with a pointer offset of zero
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile iterator specialized for crosswise arrangements for TensorOps
+/// Addresses are produced by a RegularTileAccessIterator; each access copies one AccessType between memory and the fragment.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment, int Crosswise>
+class RegularTileIterator<Shape_, Element_,
+                          layout::TensorOpMultiplicandCrosswise<
+                              sizeof_bits<Element_>::value, Crosswise>,
+                          AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout =
+      layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
+                                            Crosswise>;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ThreadMap = ThreadMap_;
+
+  /// Internal details made public to facilitate introspection
+  struct Detail {
+    /// This iterator is specialized for an access size that is 128 bits in
+    /// length.
+    static int const kAccessSizeInBits = 128;
+
+    static_assert(sizeof_bits<Element_>::value * ThreadMap::kElementsPerAccess ==
+                      kAccessSizeInBits,
+                  "This iterator requires a policy whose access size is 128bs");
+  };
+
+ private:
+  /// Vector of Layout::kElementsPerAccess elements moved by each access
+  using AccessType = Array<Element, Layout::kElementsPerAccess>;
+
+ public:
+  /// Fragment object to be loaded or stored: Layout::kElementsPerAccess elements for each of ThreadMap::Iterations::kCount accesses
+  using Fragment =
+      Array<Element, ThreadMap::Iterations::kCount * Layout::kElementsPerAccess>;
+
+  /// Underlying iterator to compute the addresses
+  using TileAccessIterator = RegularTileAccessIterator<Shape, Element, Layout,
+                                                       kAdvanceRank, ThreadMap>;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Data member to the tile access iterator
+  TileAccessIterator address_iterator_;
+
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator(TensorRef ref,  ///< Pointer to start of tensor
+                      int thread_id   ///< ID of each participating thread
+                      )
+      : address_iterator_(ref, thread_id) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    address_iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory by adding a tile offset of {1, 0}, whatever kAdvanceRank is.
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator &operator++() {
+    address_iterator_.add_tile_offset({1, 0});
+    return *this;
+  }
+
+  /// Post-increment: advances to the next tile and returns a copy of the iterator taken beforehand.
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator operator++(int) {
+    RegularTileIterator prev(*this);
+    this->operator++();
+    return prev;
+  }
+
+  /// Adds a tile offset, forwarded unchanged to the address iterator
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    address_iterator_.add_tile_offset(coord);
+  }
+
+  /// Loads a fragment from memory; pointer_offset is in units of Element, consistent with store_with_pointer_offset()
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    // Convert to bytes first: adding pointer_offset to get() directly would scale it by the size of the pointee, not of Element.
+    Index byte_offset = pointer_offset * sizeof_bits<Element>::value / 8;
+    address_iterator_.set_iteration_index(0);
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        int access_idx = c + s * ThreadMap::Iterations::kContiguous;
+        char const *byte_ptr = reinterpret_cast<char const *>(address_iterator_.get()) + byte_offset;
+        frag_ptr[access_idx] = *reinterpret_cast<AccessType const *>(byte_ptr);
+        ++address_iterator_;
+      }
+    }
+  }
+
+  /// Loads a fragment from memory with a pointer offset of zero
+  CUTLASS_DEVICE
+  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }
+
+  /// Store a fragment to memory; pointer_offset (in units of Element) is converted to a byte offset
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    store_with_byte_offset(frag, pointer_offset * sizeof_bits<Element>::value / 8);
+  }
+
+  /// Store a fragment to memory, writing each access byte_offset bytes past the address iterator's pointer
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const &frag, Index byte_offset) {  
+    address_iterator_.set_iteration_index(0);
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        int access_idx = c + s * ThreadMap::Iterations::kContiguous;
+
+        char *byte_ptr = reinterpret_cast<char *>(address_iterator_.get()) + byte_offset;
+        AccessType *access_ptr = reinterpret_cast<AccessType *>(byte_ptr);
+
+        *access_ptr = frag_ptr[access_idx];
+        ++address_iterator_;
+      }
+    }
+  }
+
+  /// Store a fragment to memory with a pointer offset of zero
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile Iterator specialized for column-major crosswise TensorOp formats.
+/// Thin adapter: every operation is forwarded to a pitch-linear TensorOpMultiplicandCrosswise iterator (UnderlyingIterator).
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment, int Crosswise>
+class RegularTileIterator<Shape_, Element_,
+                          layout::ColumnMajorTensorOpMultiplicandCrosswise<
+                              sizeof_bits<Element_>::value, Crosswise>,
+                          AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for column-major iterator may along advance along the "
+      "columns(rank=0) or rows(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajorTensorOpMultiplicandCrosswise<
+      sizeof_bits<Element_>::value, Crosswise>;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ThreadMap = ThreadMap_;
+
+  /// Underlying iterator type: pitch-linear shape <kRow, kColumn>, kAdvanceRank passed through as 0 or 1. Alignment is not forwarded, so that template argument is left to its default.
+  using UnderlyingIterator = RegularTileIterator<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
+      layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
+                                            Crosswise>,
+      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;
+
+ public:
+  /// Fragment object to be loaded or stored; same element count as the underlying iterator's Fragment
+  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
+
+ private:
+  /// Underlying iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator(TensorRef ref,  ///< Pointer to start of tensor
+                      int thread_id   ///< ID of each participating thread
+                      )
+      : iterator_({ref.data(), ref.stride()}, thread_id) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Adds a tile offset; (row, column) is handed to the underlying iterator in that order
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.row(), coord.column()});
+  }
+
+  /// Advances to the next tile in memory by pre-incrementing the underlying iterator.
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment: advances to the next tile and returns a copy of the iterator taken beforehand.
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator operator++(int) {
+    RegularTileIterator prev(*this);
+    ++iterator_;
+
+    return prev;
+  }
+
+  /// Loads a fragment from memory, forwarding pointer_offset unchanged to the underlying iterator
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory with a pointer offset of zero
+  CUTLASS_DEVICE
+  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }
+
+  /// Store a fragment to memory, forwarding pointer_offset unchanged to the underlying iterator
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Store a fragment to memory with a pointer offset of zero
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile Iterator specialized for row-major crosswise TensorOp formats.
+/// Thin adapter: every operation is forwarded to a pitch-linear TensorOpMultiplicandCrosswise iterator (UnderlyingIterator).
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment, int Crosswise>
+class RegularTileIterator<Shape_, Element_,
+                          layout::RowMajorTensorOpMultiplicandCrosswise<
+                              sizeof_bits<Element_>::value, Crosswise>,
+                          AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for row-major iterator may along advance along the "
+      "columns(rank=0) or rows(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajorTensorOpMultiplicandCrosswise<
+      sizeof_bits<Element_>::value, Crosswise>;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ThreadMap = ThreadMap_;
+
+  /// Underlying iterator type: pitch-linear shape <kColumn, kRow>, with the advance rank flipped (0 becomes 1, 1 becomes 0). Alignment is not forwarded, so that template argument is left to its default.
+  using UnderlyingIterator = RegularTileIterator<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
+      layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
+                                            Crosswise>,
+      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;
+
+ public:
+  /// Fragment object to be loaded or stored; same element count as the underlying iterator's Fragment
+  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
+
+ private:
+  /// Underlying iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator(TensorRef ref,  ///< Pointer to start of tensor
+                      int thread_id   ///< ID of each participating thread
+                      )
+      : iterator_({ref.data(), ref.stride()}, thread_id) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Adds a tile offset; the coordinate is swapped to (column, row) before it reaches the underlying iterator
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.column(), coord.row()});
+  }
+
+  /// Advances to the next tile in memory by pre-incrementing the underlying iterator.
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment: advances to the next tile and returns a copy of the iterator taken beforehand.
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator operator++(int) {
+    RegularTileIterator prev(*this);
+    ++iterator_;
+
+    return prev;
+  }
+
+  /// Loads a fragment from memory, forwarding pointer_offset unchanged to the underlying iterator
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory with a pointer offset of zero
+  CUTLASS_DEVICE
+  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }
+
+  /// Store a fragment to memory, forwarding pointer_offset unchanged to the underlying iterator
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Store a fragment to memory with a pointer offset of zero
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile iterator specialized for the TensorOpMultiplicandRowMajorInterleaved layout.
+/// Addresses come from a RegularTileAccessIterator; each load/store step moves one
+/// AccessType vector (Layout::kElementsPerAccess elements) and then advances it.
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank, typename ThreadMap_, int InterleavedK, int Alignment>
+class RegularTileIterator<
+    Shape_, Element_,
+    layout::TensorOpMultiplicandRowMajorInterleaved<sizeof_bits<Element_>::value,
+                                                    InterleavedK>,
+    AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout =
+      layout::TensorOpMultiplicandRowMajorInterleaved<sizeof_bits<Element_>::value,
+                                                      InterleavedK>;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ThreadMap = ThreadMap_;
+
+  /// Internal details made public to facilitate introspection
+  struct Detail {
+    /// This iterator is specialized for an access size that is 128 bits in
+    /// length; the static_assert below enforces it at compile time.
+    static int const kAccessSizeInBits = 128;
+
+    static_assert(sizeof_bits<Element_>::value * ThreadMap::kElementsPerAccess ==
+                      kAccessSizeInBits,
+                  "This iterator requires a policy whose access size is 128bs");
+  };
+
+ private:
+
+  /// Vector type moved by a single access (Layout::kElementsPerAccess elements)
+  using AccessType = Array<Element, Layout::kElementsPerAccess>;
+
+ public:
+  /// Fragment object to be loaded or stored: one AccessType per thread-map iteration
+  using Fragment =
+      Array<Element, ThreadMap::Iterations::kCount * Layout::kElementsPerAccess>;
+
+  /// Underlying iterator to compute the addresses
+  using TileAccessIterator = RegularTileAccessIterator<Shape, Element, Layout,
+                                                       kAdvanceRank, ThreadMap>;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Tile access iterator that supplies the address of every access
+  TileAccessIterator address_iterator_;
+
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator(TensorRef ref,  ///< Pointer to start of tensor
+                      int thread_id   ///< ID of each participating thread
+                      )
+       : address_iterator_(ref, thread_id) {}
+ 
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    address_iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile by adding Shape::kCount to the pointer offset.
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator &operator++() {
+    address_iterator_.add_pointer_offset(Shape::kCount);
+    return *this;
+  }
+
+  /// Post-increment: advances by one tile and returns a copy taken beforehand.
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator operator++(int) {
+    RegularTileIterator prev(*this);
+    this->operator++();
+
+    return prev;
+  }
+
+  /// Adds a tile offset; only coord.contiguous() is used, scaled by Shape::kCount.
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    address_iterator_.add_pointer_offset(coord.contiguous() * Shape::kCount);
+  }
+
+  /// Loads a fragment; pointer_offset is added to each pointer from address_iterator_.get().
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    address_iterator_.set_iteration_index(0);  // start from the first access
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        int access_idx = c + s * ThreadMap::Iterations::kContiguous;  // contiguous index varies fastest
+        frag_ptr[access_idx] = *(address_iterator_.get() + pointer_offset);
+        ++address_iterator_;
+      }
+    }
+  }
+
+  /// Loads a fragment from memory with a zero pointer offset
+  CUTLASS_DEVICE
+  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }
+
+  /// Stores a fragment; pointer_offset is added to each pointer from address_iterator_.get().
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);  // NOTE(review): unlike load, the iteration index is not reset here - confirm intended
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        int access_idx = c + s * ThreadMap::Iterations::kContiguous;  // contiguous index varies fastest
+        *(address_iterator_.get() + pointer_offset) = frag_ptr[access_idx];
+        ++address_iterator_;
+      }
+    }
+  }
+
+  /// Stores a fragment to memory with a zero pointer offset
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile iterator specialized for the TensorOpMultiplicandColumnMajorInterleaved layout.
+/// Thin adapter: every operation is forwarded to the RowMajorInterleaved iterator,
+/// built on a MatrixShape<Shape::kColumn, Shape::kRow> tile with the advance rank flipped.
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+
+template <typename Shape_, typename Element_, int AdvanceRank, typename ThreadMap_, int InterleavedK, int Alignment>
+class RegularTileIterator<
+    Shape_, Element_,
+    layout::TensorOpMultiplicandColumnMajorInterleaved<sizeof_bits<Element_>::value,
+                                             InterleavedK>,
+    AdvanceRank, ThreadMap_, Alignment> {
+
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout =
+      layout::TensorOpMultiplicandColumnMajorInterleaved<sizeof_bits<Element_>::value,
+                                                         InterleavedK>;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ThreadMap = ThreadMap_;
+
+  /// Underlying iterator type: row-major interleaved iterator over the transposed shape
+  using UnderlyingIterator = RegularTileIterator<
+    cutlass::MatrixShape<Shape::kColumn, Shape::kRow>,
+    Element,
+    layout::TensorOpMultiplicandRowMajorInterleaved<sizeof_bits<Element_>::value, InterleavedK>,
+    (kAdvanceRank == 1 ? 0 : 1),  // advance rank is swapped for the underlying iterator
+    ThreadMap
+  >;
+
+ public:
+  /// Fragment object to be loaded or stored (same element count as the underlying fragment)
+  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
+
+ private:
+
+  /// Underlying iterator that performs all addressing and memory accesses
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator(TensorRef ref,  ///< Pointer to start of tensor
+                      int thread_id   ///< ID of each participating thread
+                      )
+       : iterator_({ref.data(), ref.stride()}, thread_id) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory (delegates to the underlying iterator).
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment: advances by one tile and returns a copy taken beforehand.
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator operator++(int) {
+    RegularTileIterator prev(*this);
+    ++iterator_;
+
+    return prev;
+  }
+
+  /// Adds a tile offset; the two coordinates are swapped for the underlying iterator.
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.strided(), coord.contiguous()});
+  }
+
+  /// Loads a fragment from memory via the underlying iterator
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory with a zero pointer offset
+  CUTLASS_DEVICE
+  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }
+
+  /// Stores a fragment to memory via the underlying iterator
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Stores a fragment to memory with a zero pointer offset
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace transform
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op_sm70.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op_sm70.h
new file mode 100755
index 000000000..81b774cf2
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op_sm70.h
@@ -0,0 +1,1460 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of tiles from pitch-linear rank=2 tensors.
+
+    This iterator uses masks to guard out-of-bounds accesses and visits the last "residue" tile
+    first, with the objective of minimizing predicate mask updates during steady-state operation.
+
+    A precomputed "Params" object minimizes the amount of state that must be stored in registers,
+    and integer addition is used to advance the pointer through memory.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor_op_multiplicand_sm70.h"
+
+#include "cutlass/transform/threadblock/regular_tile_iterator.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Tile iterator specialized for the VoltaTensorOpMultiplicandCongruous layout.
+/// Holds up to two base pointers computed at construction plus a running byte
+/// offset; load/store alternate between the pointers on even/odd strided iterations.
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <
+  typename Shape_,
+  typename Element_,
+  int AdvanceRank,
+  typename ThreadMap_,
+  int Alignment
+>
+class RegularTileIterator<
+  Shape_,
+  Element_,
+  layout::VoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>,
+  AdvanceRank,
+  ThreadMap_,
+  Alignment> {
+public:
+
+  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
+    "Specialization for pitch-linear iterator may along advance along the "
+    "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::VoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>;
+  static int const kAdvanceRank = AdvanceRank;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using StrideIndex = typename Layout::Stride::Index;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ThreadMap = ThreadMap_;
+
+  /// Internal details made public to facilitate introspection
+  struct Detail {
+
+    /// This iterator is specialized for an access size that is 128 bits in length.
+    static int const kAccessSizeInBits = 128;
+
+    static_assert(
+      sizeof_bits<Element_>::value * ThreadMap::kElementsPerAccess == kAccessSizeInBits,
+      "This iterator requires a policy whose access size is 128bs");
+
+    ///< Number of pointers: two when there is more than one strided iteration, else one
+    static int const kPointerCount = (ThreadMap::Iterations::kStrided > 1 ? 2 : 1);
+  };
+
+
+private:
+
+  /// Vector type moved by a single access (Layout::kElementsPerAccess elements)
+  using AccessType = Array<Element, Layout::kElementsPerAccess>;
+
+public:
+
+  /// Fragment object to be loaded or stored: one AccessType per thread-map iteration
+  using Fragment = Array<Element, ThreadMap::Iterations::kCount * Layout::kElementsPerAccess>;
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Stride value, pre-divided by Layout::kElementsPerAccess so it can offset AccessType pointers
+  StrideIndex stride_;
+
+  /// Internal pointer to first access of tile (one entry per Detail::kPointerCount)
+  AccessType * pointer_[Detail::kPointerCount];
+
+  /// Internal byte offset accumulated by add_pointer_offset() and applied on every access
+  Index byte_offset_;
+
+public:
+
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator(
+    TensorRef ref,                              ///< Pointer to start of tensor
+    int thread_id                               ///< ID of each participating thread
+  ): stride_(ref.stride(0) / Layout::kElementsPerAccess), byte_offset_(0) {
+
+    layout::PitchLinearCoord thread_offset_base = ThreadMap::initial_offset(thread_id);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < Detail::kPointerCount; ++i) {
+
+      // This is the offset of a thread within a threadblock tile for a specific pointer
+      // (units of elements); pointer i is shifted by i * WarpThreadArrangement::kStrided rows
+      layout::PitchLinearCoord thread_offset_in_threadblock_tile =
+        thread_offset_base + layout::PitchLinearCoord{0, ThreadMap::Detail::WarpThreadArrangement::kStrided * i};
+
+      // initialize pointer i from the layout offset of that coordinate
+      pointer_[i] = reinterpret_cast<AccessType *>(ref.data() + ref.offset(thread_offset_in_threadblock_tile));
+    }
+  }
+
+  /// Adds a pointer offset in units of Element (stored internally as bytes)
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+
+    byte_offset_ += pointer_offset * sizeof(Element);
+  }
+
+  /// Advances one tile: along strided when kAdvanceRank is nonzero, else along contiguous.
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator &operator++() {
+
+    add_pointer_offset((kAdvanceRank ? Shape::kStrided * stride_ * Layout::kElementsPerAccess : Shape::kContiguous));
+
+    return *this;
+  }
+
+  /// Post-increment: advances by one tile and returns a copy taken beforehand.
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator operator++(int) {
+
+    RegularTileIterator prev(*this);
+    this->operator++();
+
+    return prev;
+  }
+
+  /// Adds a tile offset, converted to an Element offset and passed to add_pointer_offset()
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    add_pointer_offset(
+      coord.contiguous() * Shape::kContiguous / ThreadMap::kElementsPerAccess +  // NOTE(review): divided by kElementsPerAccess here but not in operator++ - confirm intended
+      coord.strided() * Shape::kStrided * stride_ * Layout::kElementsPerAccess
+    );
+  }
+
+  /// Loads a fragment from memory; pointer_offset is given in elements
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    Index vec_pointer_offset = pointer_offset / ThreadMap::kElementsPerAccess;  // elements -> AccessType units
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+
+      AccessType *access_ptr = pointer_[s & 1];  // even/odd iterations use pointer 0/1
+      int stride_idx = (s & ~1);  // s rounded down to an even number
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+
+        int access_offset = stride_idx * ThreadMap::Delta::kStrided * stride_ +
+            c * ThreadMap::Delta::kContiguous / ThreadMap::kElementsPerAccess +
+            vec_pointer_offset;
+
+        int access_idx = c + s * ThreadMap::Iterations::kContiguous;  // contiguous index varies fastest
+
+        char const *access_byte_ptr = reinterpret_cast<char const *>(access_ptr + access_offset);
+
+        frag_ptr[access_idx] = *reinterpret_cast<AccessType const *>(access_byte_ptr + byte_offset_);
+      }
+    }
+  }
+
+  /// Loads a fragment from memory with a zero pointer offset
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Stores a fragment to memory; pointer_offset is given in elements
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(
+    Fragment const &frag,
+    Index pointer_offset) {
+
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
+
+    Index vec_pointer_offset = pointer_offset / ThreadMap::kElementsPerAccess;  // elements -> AccessType units
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+
+      AccessType *access_ptr = pointer_[s & 1];  // even/odd iterations use pointer 0/1
+      int stride_idx = (s & ~1);  // s rounded down to an even number
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+
+        int access_offset = stride_idx * ThreadMap::Delta::kStrided * stride_ +
+          c * ThreadMap::Delta::kContiguous / ThreadMap::kElementsPerAccess +
+          vec_pointer_offset;
+
+        int access_idx = c + s * ThreadMap::Iterations::kContiguous;  // contiguous index varies fastest
+
+        char *access_byte_ptr = reinterpret_cast<char *>(access_ptr + access_offset);
+
+        *reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_) = frag_ptr[access_idx];
+      }
+    }
+  }
+
+  /// Stores a fragment to memory with a zero pointer offset
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Tile Iterator specialized for column-major congruous TensorOp formats.
+/// Thin adapter: forwards every operation to the VoltaTensorOpMultiplicandCongruous
+/// iterator over PitchLinearShape<Shape::kRow, Shape::kColumn>, same advance rank.
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <
+  typename Shape_,
+  typename Element_,
+  int AdvanceRank,
+  typename ThreadMap_,
+  int Alignment
+>
+class RegularTileIterator<
+  Shape_,
+  Element_,
+  layout::ColumnMajorVoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>,
+  AdvanceRank,
+  ThreadMap_,
+  Alignment> {
+public:
+
+  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
+    "Specialization for column-major iterator may along advance along the "
+    "columns(rank=0) or rows(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajorVoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>;
+  static int const kAdvanceRank = AdvanceRank;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ThreadMap = ThreadMap_;
+
+  /// Underlying iterator type: pitch-linear iterator with rows as the contiguous dimension
+  using UnderlyingIterator = RegularTileIterator<
+    layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
+    Element,
+    layout::VoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>,
+    (kAdvanceRank == 0 ? 0 : 1),  // advance rank is passed through unchanged
+    ThreadMap_>;
+
+public:
+
+  /// Fragment object to be loaded or stored (same element count as the underlying fragment)
+  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
+
+private:
+
+  /// Underlying iterator that performs all addressing and memory accesses
+  UnderlyingIterator iterator_;
+
+public:
+
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator(
+    TensorRef ref,                              ///< Pointer to start of tensor
+    int thread_id                               ///< ID of each participating thread
+  ): iterator_({ref.data(), ref.stride()}, thread_id) {
+
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Adds a tile offset, passing (row, column) as the underlying iterator's coordinate
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.row(), coord.column()});
+  }
+
+  /// Advances to the next tile in memory (delegates to the underlying iterator).
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator &operator++() {
+
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment: advances by one tile and returns a copy taken beforehand.
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator operator++(int) {
+
+    RegularTileIterator prev(*this);
+    ++iterator_;
+
+    return prev;
+  }
+
+  /// Loads a fragment from memory via the underlying iterator
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory with a zero pointer offset
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Stores a fragment to memory via the underlying iterator
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(
+    Fragment const &frag,
+    Index pointer_offset) {
+
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Stores a fragment to memory with a zero pointer offset
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Tile Iterator specialized for row-major congruous TensorOp formats.
+/// Thin adapter: forwards every operation to the VoltaTensorOpMultiplicandCongruous
+/// iterator over PitchLinearShape<Shape::kColumn, Shape::kRow>, advance rank flipped.
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <
+  typename Shape_,
+  typename Element_,
+  int AdvanceRank,
+  typename ThreadMap_,
+  int Alignment
+>
+class RegularTileIterator<
+  Shape_,
+  Element_,
+  layout::RowMajorVoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>,
+  AdvanceRank,
+  ThreadMap_,
+  Alignment> {
+public:
+
+  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
+    "Specialization for row-major iterator may along advance along the "
+    "columns(rank=0) or rows(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajorVoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>;
+  static int const kAdvanceRank = AdvanceRank;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ThreadMap = ThreadMap_;
+
+  /// Underlying iterator type: pitch-linear iterator with columns as the contiguous dimension
+  using UnderlyingIterator = RegularTileIterator<
+    layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
+    Element,
+    layout::VoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>,
+    (kAdvanceRank == 0 ? 1 : 0),  // advance rank is swapped for the underlying iterator
+    ThreadMap_>;
+
+public:
+
+  /// Fragment object to be loaded or stored (same element count as the underlying fragment)
+  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
+
+private:
+
+  /// Underlying iterator that performs all addressing and memory accesses
+  UnderlyingIterator iterator_;
+
+public:
+
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator(
+    TensorRef ref,                              ///< Pointer to start of tensor
+    int thread_id                               ///< ID of each participating thread
+  ): iterator_({ref.data(), ref.stride()}, thread_id) {
+
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Adds a tile offset, passing (column, row) as the underlying iterator's coordinate
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.column(), coord.row()});
+  }
+
+  /// Advances to the next tile in memory (delegates to the underlying iterator).
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator &operator++() {
+
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment: advances by one tile and returns a copy taken beforehand.
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator operator++(int) {
+
+    RegularTileIterator prev(*this);
+    ++iterator_;
+
+    return prev;
+  }
+
+  /// Loads a fragment from memory via the underlying iterator
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory with a zero pointer offset
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Stores a fragment to memory via the underlying iterator
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(
+    Fragment const &frag,
+    Index pointer_offset) {
+
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Stores a fragment to memory with a zero pointer offset
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+/// Tile iterator specialized for congruous arrangements for TensorOps
+///
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <
+  typename Shape_,
+  typename Element_,
+  int AdvanceRank,
+  typename ThreadMap_,
+  int Alignment
+>
+class RegularTileIterator<
+  Shape_,
+  Element_,
+  layout::VoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>,
+  AdvanceRank,
+  ThreadMap_,
+  Alignment> {
+public:
+
+  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
+    "Specialization for pitch-linear iterator may along advance along the "
+    "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::VoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>;
+  static int const kAdvanceRank = AdvanceRank;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using StrideIndex = typename Layout::Stride::Index;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ThreadMap = ThreadMap_;
+
+  /// Internal details made public to facilitate introspection
+  struct Detail {
+
+    /// This iterator is specialized for an access size that is 128 bits in length.
+    static int const kAccessSizeInBits = 128;
+
+    static_assert(
+      sizeof_bits<Element_>::value * ThreadMap::kElementsPerAccess == kAccessSizeInBits,
+      "This iterator requires a policy whose access size is 128bs");
+
+    ///< Number of pointers
+    static int const kPointerCount = (ThreadMap::Iterations::kStrided > 1 ? 2 : 1);
+  };
+
+
+private:
+
+  /// Element type per access
+  using AccessType = Array<Element, Layout::kElementsPerAccess>;
+
+public:
+
+  /// Fragment object to be loaded or stored
+  using Fragment = Array<Element, ThreadMap::Iterations::kCount * Layout::kElementsPerAccess>;
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Stride value
+  StrideIndex stride_;
+
+  /// Internal pointer to first access of tile
+  AccessType * pointer_[Detail::kPointerCount];
+
+  /// Internal byte offset
+  Index byte_offset_;
+
+public:
+
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator(
+    TensorRef ref,                              ///< Pointer to start of tensor
+    int thread_id                               ///< ID of each participating thread
+  ): stride_(ref.stride(0) / Layout::kElementsPerAccess), byte_offset_(0) {
+
+    layout::PitchLinearCoord thread_offset_base = ThreadMap::initial_offset(thread_id);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < Detail::kPointerCount; ++i) {
+
+      // This is the offset of a thread within a threadblock tile for a specific pointer
+      // (units of elements)
+      layout::PitchLinearCoord thread_offset_in_threadblock_tile =
+        thread_offset_base + layout::PitchLinearCoord{0, ThreadMap::Detail::WarpThreadArrangement::kStrided * i};
+
+      // initialize pointer
+      pointer_[i] = reinterpret_cast<AccessType *>(ref.data() + ref.offset(thread_offset_in_threadblock_tile));
+    }
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+
+    byte_offset_ += pointer_offset * sizeof(Element);
+  }
+
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator &operator++() {
+
+    add_pointer_offset((kAdvanceRank ? Shape::kStrided * stride_ * Layout::kElementsPerAccess : Shape::kContiguous));
+
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator operator++(int) {
+
+    RegularTileIterator prev(*this);
+    this->operator++();
+
+    return prev;
+  }
+
+  /// Adds a tile offset
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    add_pointer_offset(
+      coord.contiguous() * Shape::kContiguous / ThreadMap::kElementsPerAccess +
+      coord.strided() * Shape::kStrided * stride_ * Layout::kElementsPerAccess
+    );
+  }
+
+  /// Reads one fragment; `pointer_offset` (presumably in Elements) is converted to whole
+  /// vector accesses and applied on top of the iterator's accumulated byte offset.
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+
+    AccessType *dst = reinterpret_cast<AccessType *>(&frag);
+    Index const vec_offset = pointer_offset / ThreadMap::kElementsPerAccess;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+
+      // Parity of s selects pointer_[0] or pointer_[1]; s with bit 0 cleared
+      // scales the strided step.
+      AccessType *base = pointer_[s & 1];
+      int const even_s = (s & ~1);
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+
+        int const offset = even_s * ThreadMap::Delta::kStrided * stride_ +
+            c * ThreadMap::Delta::kContiguous / ThreadMap::kElementsPerAccess +
+            vec_offset;
+
+        // Address in AccessType units first, then add the byte offset.
+        char const *src_bytes = reinterpret_cast<char const *>(base + offset) + byte_offset_;
+        dst[c + s * ThreadMap::Iterations::kContiguous] = *reinterpret_cast<AccessType const *>(src_bytes);
+      }
+    }
+  }
+
+  /// Reads one fragment at the iterator's current position (no extra pointer offset).
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    this->load_with_pointer_offset(frag, Index(0));
+  }
+
+  /// Writes one fragment; `pointer_offset` (presumably in Elements) is converted to whole
+  /// vector accesses and applied on top of the iterator's accumulated byte offset.
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(
+    Fragment const &frag,
+    Index pointer_offset) {
+
+    AccessType const *src = reinterpret_cast<AccessType const *>(&frag);
+    Index const vec_offset = pointer_offset / ThreadMap::kElementsPerAccess;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+
+      // Parity of s selects pointer_[0] or pointer_[1]; s with bit 0 cleared
+      // scales the strided step.
+      AccessType *base = pointer_[s & 1];
+      int const even_s = (s & ~1);
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+
+        int const offset = even_s * ThreadMap::Delta::kStrided * stride_ +
+          c * ThreadMap::Delta::kContiguous / ThreadMap::kElementsPerAccess +
+          vec_offset;
+
+        // Address in AccessType units first, then add the byte offset.
+        char *dst_bytes = reinterpret_cast<char *>(base + offset) + byte_offset_;
+        *reinterpret_cast<AccessType *>(dst_bytes) = src[c + s * ThreadMap::Iterations::kContiguous];
+      }
+    }
+  }
+
+  /// Writes one fragment at the iterator's current position (no extra pointer offset).
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) {
+    this->store_with_pointer_offset(frag, Index(0));
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// RegularTileIterator partial specialization for the column-major congruous
+/// Volta TensorOp multiplicand-B layout.
+///
+/// Every operation is forwarded to the pitch-linear iterator named
+/// UnderlyingIterator below; this wrapper only passes coordinates through.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <
+  typename Shape_,
+  typename Element_,
+  int AdvanceRank,
+  typename ThreadMap_,
+  int Alignment
+>
+class RegularTileIterator<
+  Shape_,
+  Element_,
+  layout::ColumnMajorVoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>,
+  AdvanceRank,
+  ThreadMap_,
+  Alignment> {
+public:
+
+  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
+    "Specialization for column-major iterator may along advance along the "
+    "columns(rank=0) or rows(rank=1) dimension.");
+
+  // Template arguments re-exported under their conventional names
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajorVoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>;
+  using ThreadMap = ThreadMap_;
+  static int const kAdvanceRank = AdvanceRank;
+
+  // Index, reference and coordinate types derived from the layout
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  /// Pitch-linear iterator that does the real work: its shape is
+  /// <Shape::kRow, Shape::kColumn> and the advance rank is passed through as-is
+  using UnderlyingIterator = RegularTileIterator<
+    layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
+    Element,
+    layout::VoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>,
+    (kAdvanceRank == 0 ? 0 : 1),
+    ThreadMap_>;
+
+  /// Per-thread fragment to be loaded or stored; it has as many elements as
+  /// the underlying iterator's fragment
+  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
+
+private:
+
+  /// Pitch-linear iterator that every member function delegates to
+  UnderlyingIterator underlying_;
+
+public:
+
+  /// Builds the iterator from a tensor reference and the calling thread's ID;
+  /// the reference's pointer and stride are handed to the underlying iterator
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator(
+    TensorRef ref,                              ///< Pointer to start of tensor
+    int thread_id                               ///< ID of each participating thread
+  ): underlying_({ref.data(), ref.stride()}, thread_id) { }
+
+  /// Forwards a pointer offset, in units of Element, to the underlying iterator
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    underlying_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Applies a tile offset; the coordinate is handed to the underlying
+  /// iterator as the pair {row, column}
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    underlying_.add_tile_offset({coord.row(), coord.column()});
+  }
+
+  /// Pre-increment: advances the underlying iterator to its next tile
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator &operator++() {
+    ++underlying_;
+    return *this;
+  }
+
+  /// Post-increment: advances to the next tile and returns a copy of the
+  /// iterator taken before the advance
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator operator++(int) {
+    RegularTileIterator snapshot(*this);
+    this->operator++();
+    return snapshot;
+  }
+
+  /// Loads a fragment, forwarding `pointer_offset` to the underlying iterator
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    underlying_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from the current position (pointer offset of zero)
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    underlying_.load_with_pointer_offset(frag, 0);
+  }
+
+  /// Stores a fragment, forwarding `pointer_offset` to the underlying iterator
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(
+    Fragment const &frag,
+    Index pointer_offset) {
+    underlying_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Stores a fragment at the current position (pointer offset of zero)
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) {
+    underlying_.store_with_pointer_offset(frag, 0);
+  }
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// RegularTileIterator partial specialization for the row-major congruous
+/// Volta TensorOp multiplicand-B layout.
+///
+/// Every operation is forwarded to the pitch-linear iterator named
+/// UnderlyingIterator below; this wrapper swaps rows and columns on the way.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <
+  typename Shape_,
+  typename Element_,
+  int AdvanceRank,
+  typename ThreadMap_,
+  int Alignment
+>
+class RegularTileIterator<
+  Shape_,
+  Element_,
+  layout::RowMajorVoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>,
+  AdvanceRank,
+  ThreadMap_,
+  Alignment> {
+public:
+
+  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
+    "Specialization for row-major iterator may along advance along the "
+    "columns(rank=0) or rows(rank=1) dimension.");
+
+  // Template arguments re-exported under their conventional names
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajorVoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>;
+  using ThreadMap = ThreadMap_;
+  static int const kAdvanceRank = AdvanceRank;
+
+  // Index, reference and coordinate types derived from the layout
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  /// Pitch-linear iterator that does the real work: its shape is
+  /// <Shape::kColumn, Shape::kRow> and the advance rank is flipped (0 <-> 1)
+  using UnderlyingIterator = RegularTileIterator<
+    layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
+    Element,
+    layout::VoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>,
+    (kAdvanceRank == 0 ? 1 : 0),
+    ThreadMap_>;
+
+  /// Per-thread fragment to be loaded or stored; it has as many elements as
+  /// the underlying iterator's fragment
+  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
+
+private:
+
+  /// Pitch-linear iterator that every member function delegates to
+  UnderlyingIterator underlying_;
+
+public:
+
+  /// Builds the iterator from a tensor reference and the calling thread's ID;
+  /// the reference's pointer and stride are handed to the underlying iterator
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator(
+    TensorRef ref,                              ///< Pointer to start of tensor
+    int thread_id                               ///< ID of each participating thread
+  ): underlying_({ref.data(), ref.stride()}, thread_id) { }
+
+  /// Forwards a pointer offset, in units of Element, to the underlying iterator
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    underlying_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Applies a tile offset; the coordinate is handed to the underlying
+  /// iterator as the pair {column, row}
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    underlying_.add_tile_offset({coord.column(), coord.row()});
+  }
+
+  /// Pre-increment: advances the underlying iterator to its next tile
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator &operator++() {
+    ++underlying_;
+    return *this;
+  }
+
+  /// Post-increment: advances to the next tile and returns a copy of the
+  /// iterator taken before the advance
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator operator++(int) {
+    RegularTileIterator snapshot(*this);
+    this->operator++();
+    return snapshot;
+  }
+
+  /// Loads a fragment, forwarding `pointer_offset` to the underlying iterator
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    underlying_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from the current position (pointer offset of zero)
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    underlying_.load_with_pointer_offset(frag, 0);
+  }
+
+  /// Stores a fragment, forwarding `pointer_offset` to the underlying iterator
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(
+    Fragment const &frag,
+    Index pointer_offset) {
+    underlying_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Stores a fragment at the current position (pointer offset of zero)
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) {
+    underlying_.store_with_pointer_offset(frag, 0);
+  }
+};
+
+
+/// Tile iterator specialized for crosswise arrangements for TensorOps.
+///
+/// The Volta TN SMEM layout is a little different:
+/// crosswise elements are stored within a line, while contiguous elements
+/// are stored line by line.
+/// Padding is used to reduce SMEM bank conflicts.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <
+  typename Shape_,
+  typename Element_,
+  int AdvanceRank,
+  typename ThreadMap_,
+  int Alignment
+>
+class RegularTileIterator<
+    Shape_, Element_,
+    layout::VoltaTensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
+                                               Shape_::kContiguous>,
+    AdvanceRank, ThreadMap_, Alignment> {
+
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may only advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout =
+      layout::VoltaTensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
+                                                 Shape::kContiguous>;
+  static int const kAdvanceRank = AdvanceRank;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using ThreadMap = ThreadMap_;
+
+  /// Internal details made public to facilitate introspection
+  struct Detail {
+
+    /// Number of base pointers kept per thread (two once there is more than one strided iteration)
+    static int const kPointerCount = (ThreadMap::Iterations::kStrided > 1 ? 2 : 1);
+
+    /// Number of Layout-sized accesses that make up one ThreadMap access
+    static int const kIterarionsPerAccess =
+        ThreadMap::kElementsPerAccess / Layout::kElementsPerAccess;
+
+    /// Contiguous elements per line
+    static int const kContiguousElementsPerLine = 4;
+  };
+
+ private:
+  /// Element type per access
+  using AccessType = Array<Element, Layout::kElementsPerAccess>;
+
+ public:
+  /// Fragment object to be loaded or stored
+  using Fragment =
+      Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// The crosswise elements are stored in a line.
+  /// line_size is the size of the crosswise dimension plus padding,
+  /// in units of AccessType.
+  Index line_size;
+
+  /// Internal pointer to first access of tile
+  AccessType *pointer_[Detail::kPointerCount];
+
+  /// Internal byte offset
+  Index byte_offset_;
+
+
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator(TensorRef ref,  ///< Pointer to start of tensor
+                      int thread_id   ///< ID of each participating thread
+                      )
+      : line_size(ref.stride(0) * Detail::kContiguousElementsPerLine / Layout::kElementsPerAccess),
+        byte_offset_(0) {
+
+    layout::PitchLinearCoord thread_offset_base =
+        ThreadMap::initial_offset(thread_id);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < Detail::kPointerCount; ++i) {
+      // This is the offset of a thread within a threadblock tile for a specific
+      // pointer (units of elements)
+      layout::PitchLinearCoord thread_offset_in_threadblock_tile =
+          thread_offset_base +
+          layout::PitchLinearCoord{
+              0, ThreadMap::Detail::WarpThreadArrangement::kStrided * i};
+
+      // initialize pointer
+      pointer_[i] = reinterpret_cast<AccessType *>(
+          ref.data() + ref.offset(thread_offset_in_threadblock_tile));
+    }
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    byte_offset_ += pointer_offset * sizeof(Element);
+  }
+
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator &operator++() {
+    // (Shape::kContiguous/Layout::kElementsPerAccess)*
+    //   line_size * Layout::kElementsPerAccess
+    add_pointer_offset(Shape::kContiguous * line_size);
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator operator++(int) {
+    RegularTileIterator prev(*this);
+    this->operator++();
+
+    return prev;
+  }
+
+  /// Adds a tile offset
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    add_pointer_offset((coord.contiguous() * (Shape::kContiguous / Layout::kElementsPerAccess) *
+                       line_size + coord.strided() * Shape::kStrided) *
+                       Layout::kElementsPerAccess);
+  }
+
+  /// Loads a fragment; `pointer_offset` is divided by Layout::kElementsPerAccess into AccessType steps
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    Index vec_pointer_offset = pointer_offset / Layout::kElementsPerAccess;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      // Mask (s >> 1) to a single bit, exactly as store_with_pointer_offset does, so the
+      // index stays inside pointer_[kPointerCount] when Iterations::kStrided exceeds 4.
+      AccessType *access_ptr = pointer_[(s & 1) ^ ((s >> 1) & 1)];
+      access_ptr += 16 * (s / 2);
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        CUTLASS_PRAGMA_UNROLL
+        for(int i = 0; i < Detail::kIterarionsPerAccess; ++i) {
+
+          int access_offset =
+            c * ThreadMap::Delta::kContiguous / Detail::kContiguousElementsPerLine * line_size +
+            vec_pointer_offset + i * line_size;
+
+          int access_idx = (c + s * ThreadMap::Iterations::kContiguous) *
+            Detail::kIterarionsPerAccess + i;
+
+          char const *access_byte_ptr = reinterpret_cast<char const*>(access_ptr + access_offset);
+
+          frag_ptr[access_idx] = *reinterpret_cast<AccessType const *>(
+              access_byte_ptr + byte_offset_);
+        }
+      }
+    }
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }
+
+  /// Stores a fragment; `pointer_offset` is divided by Layout::kElementsPerAccess into AccessType steps
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
+
+    Index vec_pointer_offset = pointer_offset / Layout::kElementsPerAccess;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      // Both XOR terms are single bits, so the index stays inside pointer_[kPointerCount]
+      AccessType *access_ptr = pointer_[(s & 1) ^ ((s >> 1) & 1)];
+      // NOTE(review): the step of 16 accesses per pair of strided iterations looks layout-specific - confirm
+      access_ptr += 16 * (s / 2) + vec_pointer_offset;
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        CUTLASS_PRAGMA_UNROLL
+        for(int i = 0; i < Detail::kIterarionsPerAccess; ++i) {
+
+          int access_offset =
+            c * ThreadMap::Delta::kContiguous / Detail::kContiguousElementsPerLine * line_size + i * line_size;
+
+          int access_idx = (c + s * ThreadMap::Iterations::kContiguous) *
+            Detail::kIterarionsPerAccess + i;
+
+          char *access_byte_ptr = reinterpret_cast<char *>(access_ptr + access_offset);
+
+          *reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_) =
+              frag_ptr[access_idx];
+        }
+      }
+    }
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// RegularTileIterator partial specialization for the column-major crosswise
+/// Volta TensorOp multiplicand layout; a thin wrapper that forwards every
+/// operation to the pitch-linear iterator named UnderlyingIterator below.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <
+  typename Shape_,
+  typename Element_,
+  int AdvanceRank,
+  typename ThreadMap_,
+  int Alignment
+>
+class RegularTileIterator<Shape_, Element_,
+                          layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise<
+                              sizeof_bits<Element_>::value, Shape_::kRow>,
+                          AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for column-major iterator may along advance along the "
+      "columns(rank=0) or rows(rank=1) dimension.");
+
+  // Template arguments re-exported under their conventional names
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise<
+      sizeof_bits<Element_>::value, Shape::kRow>;
+  using ThreadMap = ThreadMap_;
+  static int const kAdvanceRank = AdvanceRank;
+
+  // Index, reference and coordinate types derived from the layout
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  /// Pitch-linear iterator that does the real work: its shape is
+  /// <Shape::kRow, Shape::kColumn> and the advance rank is passed through as-is
+  using UnderlyingIterator = RegularTileIterator<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
+      layout::VoltaTensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
+                                            Shape::kRow>,
+      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;
+
+  /// Per-thread fragment; same element count as the underlying iterator's fragment
+  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
+
+ private:
+  /// Pitch-linear iterator that every member function delegates to
+  UnderlyingIterator underlying_;
+
+ public:
+  /// Builds the underlying iterator from the reference's pointer and stride plus the thread ID
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator(TensorRef ref,  ///< Pointer to start of tensor
+                      int thread_id   ///< ID of each participating thread
+                      )
+      : underlying_({ref.data(), ref.stride()}, thread_id) {}
+
+  /// Forwards a pointer offset, in units of Element, to the underlying iterator
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    underlying_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Applies a tile offset, handed to the underlying iterator as the pair {row, column}
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    underlying_.add_tile_offset({coord.row(), coord.column()});
+  }
+
+  /// Pre-increment: advances the underlying iterator to its next tile
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator &operator++() {
+    ++underlying_;
+    return *this;
+  }
+
+  /// Post-increment: advances to the next tile and returns the pre-advance copy
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator operator++(int) {
+    RegularTileIterator snapshot(*this);
+    this->operator++();
+    return snapshot;
+  }
+
+  /// Loads a fragment, forwarding `pointer_offset` to the underlying iterator
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    underlying_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from the current position (pointer offset of zero)
+  CUTLASS_DEVICE
+  void load(Fragment &frag) { underlying_.load_with_pointer_offset(frag, 0); }
+
+  /// Stores a fragment, forwarding `pointer_offset` to the underlying iterator
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    underlying_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Stores a fragment at the current position (pointer offset of zero)
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) { underlying_.store_with_pointer_offset(frag, 0); }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// RegularTileIterator partial specialization for the row-major crosswise
+/// Volta TensorOp multiplicand layout; a thin wrapper that forwards every
+/// operation to the pitch-linear iterator named UnderlyingIterator below.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <
+  typename Shape_,
+  typename Element_,
+  int AdvanceRank,
+  typename ThreadMap_,
+  int Alignment
+>
+class RegularTileIterator<Shape_, Element_,
+                          layout::RowMajorVoltaTensorOpMultiplicandCrosswise<
+                              sizeof_bits<Element_>::value, Shape_::kColumn>,
+                          AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for row-major iterator may along advance along the "
+      "columns(rank=0) or rows(rank=1) dimension.");
+
+  // Template arguments re-exported under their conventional names
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajorVoltaTensorOpMultiplicandCrosswise<
+      sizeof_bits<Element_>::value, Shape::kColumn>;
+  using ThreadMap = ThreadMap_;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+
+  // Index, reference and coordinate types derived from the layout
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  /// Pitch-linear iterator that does the real work: its shape is
+  /// <Shape::kColumn, Shape::kRow> and the advance rank is flipped (0 <-> 1)
+  using UnderlyingIterator = RegularTileIterator<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
+      layout::VoltaTensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
+                                                 Shape::kColumn>,
+      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;
+
+  /// Per-thread fragment; same element count as the underlying iterator's fragment
+  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
+
+ private:
+  /// Pitch-linear iterator that every member function delegates to
+  UnderlyingIterator underlying_;
+
+ public:
+  /// Builds the underlying iterator from the reference's pointer and stride plus the thread ID
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator(TensorRef ref,  ///< Pointer to start of tensor
+                      int thread_id   ///< ID of each participating thread
+                      )
+      : underlying_({ref.data(), ref.stride()}, thread_id) {}
+
+  /// Forwards a pointer offset, in units of Element, to the underlying iterator
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    underlying_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Applies a tile offset, handed to the underlying iterator as the pair {column, row}
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    underlying_.add_tile_offset({coord.column(), coord.row()});
+  }
+
+  /// Pre-increment: advances the underlying iterator to its next tile
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator &operator++() {
+    ++underlying_;
+    return *this;
+  }
+
+  /// Post-increment: advances to the next tile and returns the pre-advance copy
+  CUTLASS_HOST_DEVICE
+  RegularTileIterator operator++(int) {
+    RegularTileIterator snapshot(*this);
+    this->operator++();
+    return snapshot;
+  }
+
+  /// Loads a fragment, forwarding `pointer_offset` to the underlying iterator
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    underlying_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from the current position (pointer offset of zero)
+  CUTLASS_DEVICE
+  void load(Fragment &frag) { underlying_.load_with_pointer_offset(frag, 0); }
+
+  /// Stores a fragment, forwarding `pointer_offset` to the underlying iterator
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    underlying_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Stores a fragment at the current position (pointer offset of zero)
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) { underlying_.store_with_pointer_offset(frag, 0); }
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace transform
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/vector_iterator.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/vector_iterator.h
new file mode 100755
index 000000000..f78e5e862
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/vector_iterator.h
@@ -0,0 +1,149 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template that wraps the vector access iterator concept to load a whole vector from
+      tensors in memory. This is typically used for per-channel scale and bias in convolution kernels.
+*/
+
+#pragma once
+
+#include "cutlass/transform/threadblock/predicated_vector_access_iterator.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename VectorAccessIterator_>
+class VectorIterator {
+public:
+  /// Access iterator that supplies the pointer and validity of each vector access
+  using VectorAccessIterator = VectorAccessIterator_;
+
+  // Types re-exported from the access iterator
+  using Shape = typename VectorAccessIterator::Shape;
+  using Element = typename VectorAccessIterator::Element;
+  using Layout = typename VectorAccessIterator::Layout;
+  using TensorCoord = typename Layout::TensorCoord;
+  using AccessType = typename VectorAccessIterator::AccessType;
+  using TensorRef = typename VectorAccessIterator::TensorRef;
+  using Index = typename VectorAccessIterator::Index;
+  using LongIndex = typename VectorAccessIterator::LongIndex;
+
+  // Compile-time constants re-exported from the access iterator
+  static int const kElementsPerAccess = VectorAccessIterator::kElementsPerAccess;
+  static int const kRowsPerIteration = VectorAccessIterator::kRowsPerIteration;
+  static int const kThreads = VectorAccessIterator::kThreads;
+  static int const kIterations = VectorAccessIterator::kIterations;
+
+  /// Fragment to be loaded; holds kElementsPerAccess * kIterations elements
+  using Fragment = cutlass::Array<
+    Element, kElementsPerAccess * kIterations>;
+
+private:
+  /// Wrapped access iterator; every member function operates through it
+  VectorAccessIterator access_iter_;
+
+public:
+
+  /// Constructs the wrapped access iterator, forwarding every argument unchanged
+  CUTLASS_HOST_DEVICE
+  VectorIterator(
+    Element const *ptr,
+    TensorCoord extent,
+    int thread_idx,
+    int warp_idx,
+    MatrixCoord const &threadblock_offset = MatrixCoord()
+  ):
+    access_iter_(ptr, extent, thread_idx, warp_idx, threadblock_offset) { }
+
+  /// Pre-increment: calls advance() on the wrapped access iterator
+  CUTLASS_HOST_DEVICE
+  VectorIterator &operator++() {
+    access_iter_.advance();
+    return *this;
+  }
+
+  /// Post-increment: advances and returns a copy taken before the advance
+  CUTLASS_HOST_DEVICE
+  VectorIterator operator++(int) {
+    VectorIterator snapshot(*this);
+    ++(*this);
+    return snapshot;
+  }
+
+  /// Clears `frag`, then issues kIterations global_load calls into it, incrementing the access
+  /// iterator after each; valid() is passed as the last argument (presumably the guard predicate).
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+
+    frag.clear();
+    AccessType *dst = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int access = 0; access < kIterations; ++access) {
+      cutlass::arch::global_load<
+        AccessType,
+        sizeof(AccessType)
+      >(
+        dst[access],
+        access_iter_.get() + pointer_offset,
+        access_iter_.valid()
+      );
+      ++access_iter_;
+    }
+  }
+
+  /// Resets the access iterator's iteration index to 0, then loads a fragment
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    access_iter_.set_iteration_index(0);
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Calls advance() on the wrapped access iterator
+  CUTLASS_DEVICE
+  void advance() {
+    access_iter_.advance();
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace transform
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/warp/vector_fragment_iterator.h b/lightllm-kernel/cutlass/include/cutlass/transform/warp/vector_fragment_iterator.h
new file mode 100755
index 000000000..b8bfa57fd
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/transform/warp/vector_fragment_iterator.h
@@ -0,0 +1,283 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+
+/*! \file
+    \brief This defines a "fragment" iterator for visiting the fragments of a warp vector
+      that participate in one warp-level mma operation.
+
+      Typically, this is used to access the scale/bias fragment of a warp-level mma operation.
+      The scale/bias vector is then partitioned into smaller fragments that can be fed into
+      the next warp-level mma operation.
+
+      This iterator is necessary to accomplish warp-level mma fusion where the scale/bias vector is 
+      applied to the multiplicand for the next mma.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/array.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/numeric_conversion.h"
+
+namespace cutlass {
+namespace transform {
+namespace warp {
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Size of the input fragment tile shape (concept: MatrixShape)
+    typename Shape_,
+    /// Element type
+    typename Element_,
+    /// Layout of operand in memory
+    typename Layout_,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    //// Number of elements per access when loading fragment
+    int ElementsPerAccess>
+class VectorFragmentIterator;
+
+
+// Partial specialization for PitchLinear layout tile
+
+template <
+    /// Shape of the input vector fragment (must expose kContiguous)
+    typename Shape_,
+    /// Element type
+    typename Element_,
+    /// Shape of one matrix product operation (must expose kM and kK)
+    typename InstructionShape_,
+    /// Number of elements per access when loading fragment
+    int ElementsPerAccess>
+class VectorFragmentIterator<Shape_, Element_,
+                             cutlass::layout::PitchLinear,
+                             InstructionShape_, ElementsPerAccess> {
+ public:
+
+  /// Shape of the input vector fragment
+  using Shape = Shape_;
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::PitchLinear;
+
+  /// Shape of one matrix product operation
+  using InstructionShape = InstructionShape_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  static int const kElementsPerAccess = ElementsPerAccess;
+  static int const kRowsPerIteration = 8;
+  static int const kColumnsPerAccess = 8;
+  static int const kElementsPerIteration = kRowsPerIteration * InstructionShape::kK / kThreads;
+  static int const kAccessPerIteration = kElementsPerIteration / kElementsPerAccess;
+
+  /// Iteration counts: kRow copies of each access are written by load(); kColumn bounds the index
+  using Iterations = MatrixShape<InstructionShape::kM / kRowsPerIteration, Shape::kContiguous / kElementsPerIteration>;
+
+public:
+
+  //
+  // Derived quantities
+  //
+  // All fragments have kElementsPerAccess scale followed by bias
+
+  /// Fragment object holding a thread's part of a tile
+  /// This is the fragment size produced by one iteration of the iterator.
+  using Fragment = Array<Element, kElementsPerIteration * Iterations::kRow>;
+
+  /// Input threadblock fragment tile
+  using ThreadblockFragment = Array<Element, Shape::kContiguous >;
+
+private:
+
+  /// Internal access type
+  using AccessType = Array<Element, kElementsPerAccess>;
+
+private:
+  //
+  // Data members
+  //
+
+  /// Input threadblock fragment, viewed as an array of AccessType (not owned)
+  AccessType const *iterator_;
+
+  /// Index of the current group of kAccessPerIteration accesses
+  int index_;
+
+public:
+  /// Constructs an iterator; only the address of `threadblock_frag` is kept, so it must outlive the iterator
+  CUTLASS_HOST_DEVICE
+  VectorFragmentIterator(ThreadblockFragment const &threadblock_frag)
+      : iterator_(reinterpret_cast<AccessType const *>(&threadblock_frag)),
+        index_(0) {}
+
+  /// Adds `index_offset` to the index; the index is reset to 0 once it reaches Iterations::kColumn
+  CUTLASS_HOST_DEVICE
+  void add_offset(int index_offset) {
+    index_ += index_offset;
+
+    if (index_ >= Iterations::kColumn)
+      index_ = 0;
+  }
+
+  /// Pre-increment: equivalent to add_offset(1)
+  CUTLASS_HOST_DEVICE
+  VectorFragmentIterator &operator++() {
+    add_offset(1);
+    return *this;
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_index(int idx) {  // sets the index directly, without the wrap check of add_offset()
+    index_ = idx;
+  }
+
+  /// Loads a fragment from the threadblock vector fragment at the current index
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int r = 0; r < Iterations::kRow; r++) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kAccessPerIteration; i++) {
+
+        // The assignment overwrites the whole access, so no clear() is needed beforehand.
+        frag_ptr[i * Iterations::kRow + r] = iterator_[index_ * kAccessPerIteration + i];
+      }
+    }
+  }
+
+};
+
+// Partial specialization for Row-Major layout tile
+
+template <
+    /// Shape of the input fragment tile (must expose kRow and kColumn)
+    typename Shape_,
+    /// Element type
+    typename Element_,
+    /// Shape of one matrix product operation (forwarded unchanged to the PitchLinear specialization)
+    typename InstructionShape_,
+    /// Number of elements per access when loading fragment
+    int ElementsPerAccess>
+class VectorFragmentIterator<Shape_, Element_,
+                                         cutlass::layout::RowMajor,
+                                         InstructionShape_, ElementsPerAccess> {
+ public:
+    
+  /// Shape of the input fragment tile
+  using Shape = Shape_;
+
+  /// Element type
+  using Element = Element_;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::RowMajor;
+
+  /// Shape of one matrix product operation
+  using InstructionShape = InstructionShape_;
+
+  /// Underlying PitchLinear iterator; Shape::kColumn / Shape::kRow become the PitchLinearShape arguments
+  using Base = VectorFragmentIterator<
+    layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
+    layout::PitchLinear, InstructionShape, ElementsPerAccess>;
+
+
+ public:
+
+  //
+  // Derived quantities (taken from the underlying iterator)
+  //
+  /// Fragment object holding a thread's part of a tile
+  /// This is the fragment size produced by one iteration of the iterator.
+  using Fragment = typename Base::Fragment;
+
+  /// Input threadblock fragment tile
+  using ThreadblockFragment = typename Base::ThreadblockFragment;
+
+ private:
+  /// Underlying iterator that every method below delegates to
+  Base iterator_;
+
+public:
+  /// Constructs an iterator over `threadblock_frag` by constructing the underlying iterator from it
+  CUTLASS_HOST_DEVICE
+  VectorFragmentIterator(ThreadblockFragment const &threadblock_frag)
+      : iterator_(threadblock_frag) {}
+
+  /// Adds `index_offset` to the underlying iterator's index
+  CUTLASS_HOST_DEVICE
+  void add_offset(int index_offset) {
+    iterator_.add_offset(index_offset);
+  }
+
+  /// Pre-increment: equivalent to add_offset(1)
+  CUTLASS_HOST_DEVICE
+  VectorFragmentIterator &operator++() {
+    add_offset(1);
+    return *this;
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_index(int idx) {  // forwards to the underlying iterator's set_index()
+    iterator_.set_index(idx);
+  }
+
+  /// Loads a fragment by delegating to the underlying iterator's load()
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    iterator_.load(frag);
+  }
+
+};
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace transform
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/uint128.h b/lightllm-kernel/cutlass/include/cutlass/uint128.h
new file mode 100755
index 000000000..6de3ba141
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/uint128.h
@@ -0,0 +1,270 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! 
+  \file
+  \brief Defines an unsigned 128b integer with several operators to support 64-bit integer division.
+*/
+#pragma once
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/cstdint>
+#else
+#include <cstdint>
+#include <cstdlib>
+#include <cmath>
+#include <type_traits>
+#include <stdexcept>
+#endif
+
+#include "cutlass/cutlass.h"
+
+/// Optionally enable GCC's built-in type
+#if (defined(__x86_64) || defined (__aarch64__)) && !(defined(__CUDA_ARCH__) && ((__CUDACC_VER_MAJOR__ <= 10) || ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ <= 4)))) && defined(__GNUC__)
+#define CUTLASS_UINT128_NATIVE
+#elif !defined(__CUDA_ARCH__)
+// No custom support for 128b arithmetic on device
+#if defined(_MSC_VER) && defined(_M_AMD64)
+#define CUTLASS_INT128_ARITHMETIC
+#include <intrin.h>
+#if _MSC_VER >= 1920 && !defined(__CUDA_ARCH__)
+#define CUTLASS_INT128_ARITHMETIC_DIV
+#include <immintrin.h>
+#endif
+#endif
+#endif
+
+namespace cutlass {
+
+/// Unsigned 128b integer type stored as two 64b halves (aliased with GCC's native type when enabled)
+struct alignas(16) uint128_t
+{
+  /// Size of one part of the uint's storage in bits
+  static constexpr int storage_bits_ = 64;
+
+  struct hilo
+  {
+    uint64_t lo;
+    uint64_t hi;
+  };
+
+  // Use a union to store either low and high parts or, if present, a built-in 128b integer type.
+  union {
+    struct hilo hilo_;
+
+#if defined(CUTLASS_UINT128_NATIVE)
+    unsigned __int128 native;
+#endif // defined(CUTLASS_UINT128_NATIVE)
+  };
+
+  //
+  // Methods
+  //
+
+  /// Default ctor: value zero
+  CUTLASS_HOST_DEVICE
+  uint128_t() : hilo_{0, 0} {}
+
+  /// Constructor from uint64 (high half is zero)
+  CUTLASS_HOST_DEVICE
+  uint128_t(uint64_t lo_) : hilo_{lo_, 0} {}
+
+  /// Constructor from two 64b unsigned integers (low half first)
+  CUTLASS_HOST_DEVICE
+  uint128_t(uint64_t lo_, uint64_t hi_) : hilo_{lo_, hi_} {}
+
+  /// Optional constructor from native value (not marked CUTLASS_HOST_DEVICE)
+#if defined(CUTLASS_UINT128_NATIVE)
+  uint128_t(unsigned __int128 value) : native(value) { }
+#endif
+
+  /// Lossily cast to uint64 (returns the low half)
+  CUTLASS_HOST_DEVICE
+  explicit operator uint64_t() const
+  {
+    return hilo_.lo;
+  }
+
+  CUTLASS_HOST_DEVICE
+  static void exception()
+  {
+#if defined(__CUDA_ARCH__)
+    asm volatile ("  brkpt;\n");
+#else
+    // Host path: abort, because the requested operation has no implementation on this platform.
+    abort();
+#endif
+  }
+
+  /// Add
+  CUTLASS_HOST_DEVICE
+  uint128_t operator+(uint128_t const& rhs) const
+  {
+    uint128_t y{};
+#if defined(CUTLASS_UINT128_NATIVE)
+    y.native = native + rhs.native;
+#else
+    y.hilo_.lo = hilo_.lo + rhs.hilo_.lo;
+    y.hilo_.hi = hilo_.hi + rhs.hilo_.hi + (y.hilo_.lo < hilo_.lo);  // carry out of the low half
+#endif
+    return y;
+  }
+
+  /// Subtract
+  CUTLASS_HOST_DEVICE
+  uint128_t operator-(uint128_t const& rhs) const
+  {
+    uint128_t y{};
+#if defined(CUTLASS_UINT128_NATIVE)
+    y.native = native - rhs.native;
+#else
+    y.hilo_.lo = hilo_.lo - rhs.hilo_.lo;
+    y.hilo_.hi = hilo_.hi - rhs.hilo_.hi - (rhs.hilo_.lo && y.hilo_.lo > hilo_.lo);  // borrow from the high half
+#endif
+    return y;
+  }
+
+  /// Multiply by unsigned 64b integer yielding 128b integer
+  CUTLASS_HOST_DEVICE
+  uint128_t operator*(uint64_t const& rhs) const
+  {
+    uint128_t y{};
+#if defined(CUTLASS_UINT128_NATIVE)
+    y.native = native * rhs;
+#elif defined(CUTLASS_INT128_ARITHMETIC)
+    // Multiply by the low part
+    y.hilo_.lo = _umul128(hilo_.lo, rhs, &y.hilo_.hi);
+
+    // Add the high part and ignore the overflow
+    uint64_t overflow{0};
+    y.hilo_.hi += _umul128(hilo_.hi, rhs, &overflow);
+#else
+    CUTLASS_UNUSED(rhs);
+    exception();
+#endif
+    return y;
+  }
+
+  /// Divide 128b operation by 64b operation yielding a 64b quotient
+  CUTLASS_HOST_DEVICE
+  uint64_t operator/(uint64_t const& divisor) const
+  {
+    uint64_t quotient{0};
+#if defined(CUTLASS_UINT128_NATIVE)
+    quotient = uint64_t(native / divisor);
+#elif defined(CUTLASS_INT128_ARITHMETIC_DIV)
+    // implemented using MSVC's arithmetic intrinsics
+    uint64_t remainder{0};
+    quotient = _udiv128(hilo_.hi, hilo_.lo, divisor, &remainder);
+#else
+    CUTLASS_UNUSED(divisor);
+    exception();
+#endif
+    return quotient;
+  }
+
+  /// Divide 128b operation by 64b operation yielding a 64b remainder
+  CUTLASS_HOST_DEVICE
+  uint64_t operator%(uint64_t const& divisor) const
+  {
+    uint64_t remainder{0};
+#if defined(CUTLASS_UINT128_NATIVE)
+    remainder = uint64_t(native % divisor);
+#elif defined(CUTLASS_INT128_ARITHMETIC_DIV)
+    // implemented using MSVC's arithmetic intrinsics
+    (void)_udiv128(hilo_.hi, hilo_.lo, divisor, &remainder);
+#else
+    CUTLASS_UNUSED(divisor);
+    exception();
+#endif
+    return remainder;
+  }
+
+  /// Computes the quotient and remainder in a single method.
+  CUTLASS_HOST_DEVICE
+  uint64_t divmod(uint64_t &remainder, uint64_t divisor) const
+  {
+    uint64_t quotient{0};
+#if defined(CUTLASS_UINT128_NATIVE)
+    quotient = uint64_t(native / divisor);
+    remainder = uint64_t(native % divisor);
+#elif defined(CUTLASS_INT128_ARITHMETIC_DIV)
+    // implemented using MSVC's arithmetic intrinsics
+    quotient = _udiv128(hilo_.hi, hilo_.lo, divisor, &remainder);
+#else
+    CUTLASS_UNUSED(remainder);
+    CUTLASS_UNUSED(divisor);
+    exception();
+#endif
+    return quotient;
+  }
+
+  /// Left-shifts a 128b unsigned integer; `sh` must be non-negative, and shifts of 128 or more yield zero
+  CUTLASS_HOST_DEVICE
+  uint128_t operator<<(int sh) const
+  {
+    if (sh == 0) {
+      return *this;
+    }
+    else if (sh >= storage_bits_) {
+      return uint128_t(0, (sh < 2 * storage_bits_) ? (hilo_.lo << (sh - storage_bits_)) : uint64_t(0));
+    }
+    else {
+      return uint128_t(
+        (hilo_.lo << sh),
+        (hilo_.hi << sh) | uint64_t(hilo_.lo >> (storage_bits_ - sh))
+      );
+    }
+  }
+
+  /// Right-shifts a 128b unsigned integer; `sh` must be non-negative, and shifts of 128 or more yield zero
+  CUTLASS_HOST_DEVICE
+  uint128_t operator>>(int sh) const
+  {
+    if (sh == 0) {
+      return *this;
+    }
+    else if (sh >= storage_bits_) {
+      return uint128_t((sh < 2 * storage_bits_) ? (hilo_.hi >> (sh - storage_bits_)) : uint64_t(0), 0);
+    }
+    else {
+      return uint128_t(
+        (hilo_.lo >> sh) | (hilo_.hi << (storage_bits_ - sh)),
+        (hilo_.hi >> sh)
+      );
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/version.h b/lightllm-kernel/cutlass/include/cutlass/version.h
new file mode 100755
index 000000000..ff9aa1157
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/version.h
@@ -0,0 +1,80 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include <cstdint>
+#include <string>
+
+#define CUTLASS_MAJOR 3
+#define CUTLASS_MINOR 6
+#define CUTLASS_PATCH 0
+
+#ifdef CUTLASS_VERSIONS_GENERATED
+#include "cutlass/version_extended.h"
+#else
+#define CUTLASS_BUILD 0
+#define CUTLASS_REVISION ""
+#endif
+
+#define CUTLASS_VERSION ((CUTLASS_MAJOR)*100 + (CUTLASS_MINOR)*10 + CUTLASS_PATCH)
+
+namespace cutlass {
+
+  inline constexpr uint32_t getVersion() {  // CUTLASS_MAJOR*100 + CUTLASS_MINOR*10 + CUTLASS_PATCH
+    return CUTLASS_VERSION;
+  }
+  inline constexpr uint32_t getVersionMajor() {
+    return CUTLASS_MAJOR;
+  }
+  inline constexpr uint32_t getVersionMinor() {
+    return CUTLASS_MINOR;
+  }
+  inline constexpr uint32_t getVersionPatch() {
+    return CUTLASS_PATCH;
+  }
+  inline constexpr uint32_t getVersionBuild() {
+    return CUTLASS_BUILD + 0;
+  }
+  /// Returns "MAJOR.MINOR.PATCH" built from the version macros, with ".BUILD" appended when the build number is non-zero.
+  inline std::string getVersionString() {
+    std::string version = std::to_string(CUTLASS_MAJOR) + "." + std::to_string(CUTLASS_MINOR) + "." + std::to_string(CUTLASS_PATCH);
+    if (getVersionBuild()) {
+      version += "." + std::to_string(getVersionBuild());
+    }
+    return version;
+  }
+  /// Returns CUTLASS_REVISION ("" unless CUTLASS_VERSIONS_GENERATED supplies one; assumed there to be a string literal too).
+  inline std::string getGitRevision() {
+    return CUTLASS_REVISION;
+  }
+
+} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/wmma_array.h b/lightllm-kernel/cutlass/include/cutlass/wmma_array.h
new file mode 100755
index 000000000..0f9b2b514
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/wmma_array.h
@@ -0,0 +1,133 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Statically sized array of elements that accommodates all CUTLASS-supported numeric types
+           and is safe to use in a union.
+*/
+
+#pragma once
+
+#include "cutlass/arch/wmma.h"
+
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/functional.h"
+
+namespace cutlass {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Wmma array type (WmmaFragmentArray holds elements of type nvcuda::wmma::fragment)
+template <
+  /// Element type (must expose a nested element_type)
+  typename T,
+  /// Number of elements in the array
+  int N,
+  /// Whether the element type of T is half_t or __half
+  bool IsHalfType = (platform::is_same<typename T::element_type, cutlass::half_t>::value ||
+                     platform::is_same<typename T::element_type, __half>::value)
+>
+class WmmaFragmentArray: public Array<T, N, true> {
+public:
+
+  /// Efficient clear method (used instead of Array::clear()): fills every fragment with zero
+  CUTLASS_HOST_DEVICE
+  void clear()
+  {
+    for(int i = 0; i < Array<T, N, true>::kElements; i++)
+    {
+      nvcuda::wmma::fill_fragment((*this)[i], (typename T::element_type)0);
+    }
+  }
+
+  CUTLASS_HOST_DEVICE
+  WmmaFragmentArray<T, N>& operator+=(const WmmaFragmentArray<T, N>& rhs)
+  {
+    // Adds rhs into this array fragment by fragment using the plus<T> functor.
+    plus<T> add;
+
+    for (int i = 0; i < Array<T, N, true>::kElements; i++)
+    {
+      (*this)[i] = add((*this)[i], rhs[i]);
+    }
+
+    return *this;
+  }
+};
+
+/// Partial specialization for the case in which T::element_type is
+/// half_t or __half. This is needed because the cast (typename T::element_type)0
+/// in the primary template flags as an error when __CUDA_NO_HALF_CONVERSIONS__
+/// is set.
+template <
+  /// Element type (must expose a nested element_type)
+  typename T,
+  /// Number of elements in the array
+  int N
+>
+class WmmaFragmentArray<T, N, true>: public Array<T, N, true> {
+public:
+
+  /// Efficient clear method (used instead of Array::clear()): fills every fragment with a half-precision zero
+  CUTLASS_HOST_DEVICE
+  void clear()
+  {
+    for(int i = 0; i < Array<T, N, true>::kElements; i++)
+    {
+      nvcuda::wmma::fill_fragment((*this)[i], __float2half(0.f));
+    }
+  }
+
+  CUTLASS_HOST_DEVICE
+  WmmaFragmentArray<T, N>& operator+=(const WmmaFragmentArray<T, N>& rhs)
+  {
+    // Adds rhs into this array fragment by fragment using the plus<T> functor.
+    plus<T> add;
+
+    for (int i = 0; i < Array<T, N, true>::kElements; i++)
+    {
+      (*this)[i] = add((*this)[i], rhs[i]);
+    }
+
+    return *this;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // if defined(CUTLASS_ARCH_WMMA_ENABLED)
+
diff --git a/lightllm-kernel/cutlass/include/cutlass/workspace.h b/lightllm-kernel/cutlass/include/cutlass/workspace.h
new file mode 100755
index 000000000..6f1c3254c
--- /dev/null
+++ b/lightllm-kernel/cutlass/include/cutlass/workspace.h
@@ -0,0 +1,150 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Utilities for initializing workspaces
+*/
+
+#pragma once
+
+#if !defined(__CUDACC_RTC__)
+#include "cuda.h"
+#include "cuda_runtime.h"
+
+#include "cutlass/trace.h"
+#endif
+
+#include "cutlass.h"
+#include "cutlass/cuda_host_adapter.hpp"
+
+namespace cutlass {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+static constexpr int MinWorkspaceAlignment = 16;
+
+#if !defined(__CUDACC_RTC__)
+static Status  // kSuccess, kErrorWorkspaceNull (null pointer with non-zero size), or kErrorInternal (memset failed)
+zero_workspace(void* workspace, size_t workspace_size, cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr) {
+  if (workspace_size > 0) {  // an empty workspace needs no work and reports success
+    if (workspace == nullptr) {
+      CUTLASS_TRACE_HOST("  error: device workspace must not be null");
+      return Status::kErrorWorkspaceNull;
+    }
+
+    CUTLASS_TRACE_HOST("  clearing workspace");
+
+#if defined(CUTLASS_ENABLE_CUDA_HOST_ADAPTER) && CUTLASS_ENABLE_CUDA_HOST_ADAPTER
+    //
+    // Use the cuda host adapter; a null adapter is reported as kErrorInternal
+    //
+    CUTLASS_ASSERT(cuda_adapter);
+    if (cuda_adapter) {
+      if (Status::kSuccess != cuda_adapter->memsetDevice(workspace, static_cast<uint8_t>(0), workspace_size, stream)) {
+        return Status::kErrorInternal;
+      }
+    }
+    else {
+      return Status::kErrorInternal;
+    }
+#else
+    cudaError_t result = cudaMemsetAsync(workspace, 0, workspace_size, stream);
+    if (cudaSuccess != result) {
+      result = cudaGetLastError(); // to clear the error bit; the trace below prints this value
+      CUTLASS_TRACE_HOST("  cudaMemsetAsync() returned error " << cudaGetErrorString(result));
+      return Status::kErrorInternal;
+    }
+#endif
+  }
+
+  return Status::kSuccess;
+}
+#endif
+
+#if !defined(__CUDACC_RTC__)
+template <typename T>  // T: a 1-, 2- or 4-byte value whose bit pattern is written fill_count times
+Status  // kSuccess, kErrorWorkspaceNull (null pointer with non-zero count), or kErrorInternal (memset failed)
+fill_workspace(void* workspace, T fill_value, size_t fill_count, cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr) {
+  static_assert(sizeof(T) == 4 || sizeof(T) == 2 || sizeof(T) == 1, "Unsupported fill type");
+  if (fill_count > 0) {  // nothing to do for an empty fill
+    if (workspace == nullptr) {
+      CUTLASS_TRACE_HOST("  error: device workspace must not be null");
+      return Status::kErrorWorkspaceNull;
+    }
+
+    CUTLASS_TRACE_HOST("  filling workspace");
+
+#if defined(CUTLASS_ENABLE_CUDA_HOST_ADAPTER) && CUTLASS_ENABLE_CUDA_HOST_ADAPTER
+    //
+    // Use the cuda host adapter; a null adapter is reported as kErrorInternal
+    //
+    CUTLASS_ASSERT(cuda_adapter);
+    if (cuda_adapter) {
+      if (Status::kSuccess != cuda_adapter->memsetDevice(workspace, fill_value, fill_count, stream)) {
+        return Status::kErrorInternal;
+      }
+    }
+    else {
+      return Status::kErrorInternal;
+    }
+#else
+    CUdeviceptr d_workspace = reinterpret_cast<CUdeviceptr>(workspace);
+    CUresult result = CUDA_SUCCESS;
+    if (sizeof(T) == 4) {
+      result = cuMemsetD32Async(d_workspace, reinterpret_cast<uint32_t&>(fill_value), fill_count, stream);
+    }
+    else if (sizeof(T) == 2) {
+      result = cuMemsetD16Async(d_workspace, reinterpret_cast<uint16_t&>(fill_value), fill_count, stream);
+    }
+    else if (sizeof(T) == 1) {
+      result = cuMemsetD8Async(d_workspace, reinterpret_cast<uint8_t&>(fill_value), fill_count, stream);
+    }
+
+    if (CUDA_SUCCESS != result) {
+      const char* error_string = nullptr;  // cuGetErrorString writes the message pointer through its second argument
+      (void) cuGetErrorString(result, &error_string);
+      if (error_string != nullptr) {
+        CUTLASS_TRACE_HOST("  cuMemsetD" << sizeof(T) * 8 << "Async() returned error " << error_string);
+      }
+      else {
+        CUTLASS_TRACE_HOST("  cuMemsetD" << sizeof(T) * 8 << "Async() returned unrecognized error");
+      }
+      return Status::kErrorInternal;
+    }
+#endif
+  }
+
+  return Status::kSuccess;
+}
+#endif
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass
diff --git a/lightllm-kernel/include/cutlass_extensions/common.hpp b/lightllm-kernel/include/cutlass_extensions/common.hpp
new file mode 100755
index 000000000..f8a19f974
--- /dev/null
+++ b/lightllm-kernel/include/cutlass_extensions/common.hpp
@@ -0,0 +1,48 @@
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include <climits>
+#include "cuda_runtime.h"
+#include <iostream>
+
+/** Statement macro (not a function): evaluates `status` once into a local and
+ * fails TORCH_CHECK with cutlassGetStatusString(error) unless it equals
+ * cutlass::Status::kSuccess. Expands to a bare { } block. */
+#define CUTLASS_CHECK(status)                       \
+  {                                                 \
+    cutlass::Status error = status;                 \
+    TORCH_CHECK(error == cutlass::Status::kSuccess, \
+                cutlassGetStatusString(error));     \
+  }
+
+/** Statement macro: evaluates `status` once into a cudaError_t local and fails
+ * TORCH_CHECK with cudaGetErrorString(error) unless it equals cudaSuccess.
+ * Expands to a bare { } block. */
+#define CUDA_CHECK(status)                                        \
+  {                                                               \
+    cudaError_t error = status;                                   \
+    TORCH_CHECK(error == cudaSuccess, cudaGetErrorString(error)); \
+  }
+
+// Returns the cudaDevAttrMaxSharedMemoryPerBlockOptin attribute of `device`.
+// The query status is not inspected; the result starts at 0 before the query.
+inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {
+  int smem_opt_in = 0;
+  cudaDeviceGetAttribute(&smem_opt_in, cudaDevAttrMaxSharedMemoryPerBlockOptin, device);
+  return smem_opt_in;
+}
+
+// Compute capability of CUDA device 0 encoded as major * 10 + minor (9.0 -> 90).
+// Both fields start at 0 so that a query which fails without writing its output
+// cannot make this function return an indeterminate value.
+inline int32_t get_sm_version_num() {
+  int32_t major_capability = 0;
+  int32_t minor_capability = 0;
+  cudaDeviceGetAttribute(&major_capability, cudaDevAttrComputeCapabilityMajor, 0);
+  cudaDeviceGetAttribute(&minor_capability, cudaDevAttrComputeCapabilityMinor, 0);
+  return major_capability * 10 + minor_capability;
+}
+
+// Smallest power of two >= num (0 and 1 are returned unchanged). The shift is done on
+// uint32_t so a result of 2^31 is well-defined; num above 2^31 has no valid result.
+inline uint32_t next_pow_2(uint32_t const num) {
+  return num <= 1 ? num : uint32_t{1} << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); }
diff --git a/lightllm-kernel/include/cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp b/lightllm-kernel/include/cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp
new file mode 100755
index 000000000..58b1e8ff1
--- /dev/null
+++ b/lightllm-kernel/include/cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp
@@ -0,0 +1,447 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights
+ *reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+//
+// This file is a modified excerpt of
+// include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp
+// from https://github.com/NVIDIA/cutlass v3.5.0
+// It has been modified to support either row/column or scalar broadcasting
+// where the tensor being loaded from is always passed in via a device pointer.
+// This lets one compiled kernel handle all cases of per-tensor or
+// per-channel/per-token quantization.
+//
+// This interface also allows the scales to be passed in as tensors that
+// consistently reside on the device, which avoids an issue with a previous
+// implementation where scalars needed to be on the CPU since they
+// were passed in via float values. This created a potential performance hazard
+// if scales were initially on the device, and caused torch.compile graph
+// breaks when moving scales to the CPU.
+//
+#pragma once
+
+// Turn off clang-format for the entire file to keep it close to upstream
+// clang-format off
+
+#include "cutlass/cutlass.h"
+#include "cutlass/arch/barrier.h"
+
+#include "cute/tensor.hpp"
+#include "cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp"
+
+namespace cutlass::epilogue::fusion {
+
+using namespace cute;
+using namespace detail;
+
+// Row vector broadcast, or scalar broadcast when Arguments::row_broadcast is false
+template<
+  int Stages,
+  class CtaTileShapeMNK,
+  class Element,
+  class StrideMNL = Stride<_0,_1,_0>,
+  int Alignment = 128 / sizeof_bits_v<Element>
+>
+struct Sm90RowOrScalarBroadcast {
+  static_assert(Stages == 0, "Row broadcast doesn't support smem usage");
+  static_assert(is_static_v<decltype(take<0,2>(StrideMNL{}))>); // batch stride can be dynamic or static
+  static_assert(take<0,2>(StrideMNL{}) == Stride<_0,_1>{});
+
+  struct SharedStorage {
+    array_aligned<Element, size<1>(CtaTileShapeMNK{})> smem; // one Element per tile column, written in begin()
+  };
+
+  // This struct has been modified to have a bool telling whether ptr_row is a
+  // full row vector (true) or a single scalar that must be broadcast (false),
+  // instead of containing a scalar that is valid if ptr_row is null.
+  struct Arguments {
+    Element const* ptr_row = nullptr;
+    bool row_broadcast = true; // false: *ptr_row is one value applied to every element
+    StrideMNL dRow = {};
+  };
+
+  using Params = Arguments;
+
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    return args;
+  }
+
+  template <class ProblemShape>
+  static bool
+  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
+    return true;
+  }
+
+  template <class ProblemShape>
+  static size_t
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
+    return 0;
+  }
+
+  template <class ProblemShape>
+  static cutlass::Status
+  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    return cutlass::Status::kSuccess;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Sm90RowOrScalarBroadcast() { }
+
+  CUTLASS_HOST_DEVICE
+  Sm90RowOrScalarBroadcast(Params const& params, SharedStorage const& shared_storage)
+      : params(params)
+      , smem(const_cast<Element*>(shared_storage.smem.data())) { } // constness dropped: begin() writes the row here
+
+  Params params;
+  Element *smem = nullptr;
+
+  CUTLASS_DEVICE bool
+  is_producer_load_needed() const {
+    return false;
+  }
+
+  CUTLASS_DEVICE bool
+  is_C_load_needed() const {
+    return false;
+  }
+
+  CUTLASS_DEVICE bool
+  is_zero() const {
+    return (!params.row_broadcast && *(params.ptr_row) == Element(0)); // only the scalar case is inspected
+  }
+
+  template <class... Args>
+  CUTLASS_DEVICE auto
+  get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
+    return EmptyProducerLoadCallbacks{};
+  }
+
+  template <class GS_GTensor, class GS_STensor, class GS_CTensor, class Tiled_G2S, class SR_STensor, class SR_RTensor, class CTensor, class ThrResidue, class ThrNum>
+  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
+    CUTLASS_DEVICE
+    ConsumerStoreCallbacks(
+        GS_GTensor tGS_gRow_, GS_STensor tGS_sRow_,
+        GS_CTensor tGS_cRow_, Tiled_G2S tiled_g2s_,
+        SR_STensor tSR_sRow_, SR_RTensor tSR_rRow_,
+        CTensor tCcRow_, ThrResidue residue_tCcRow_, ThrNum thr_num_, Params const& params_)
+      : tGS_gRow(tGS_gRow_)
+      , tGS_sRow(tGS_sRow_)
+      , tGS_cRow(tGS_cRow_)
+      , tiled_G2S(tiled_g2s_)
+      , tSR_sRow(tSR_sRow_)
+      , tSR_rRow(tSR_rRow_)
+      , tCcRow(tCcRow_)
+      , residue_tCcRow(residue_tCcRow_)
+      , thr_num(thr_num_), params(params_) {} // thr_num was previously left uninitialized; begin() reads it
+
+    GS_GTensor tGS_gRow;                                                         // (CPY,CPY_M,CPY_N)
+    GS_STensor tGS_sRow;                                                         // (CPY,CPY_M,CPY_N)
+    GS_CTensor tGS_cRow;                                                         // (CPY,CPY_M,CPY_N)
+    Tiled_G2S tiled_G2S;
+
+    SR_STensor tSR_sRow;                                                         // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
+    SR_RTensor tSR_rRow;                                                         // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
+
+    CTensor tCcRow;                                                              // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
+    ThrResidue residue_tCcRow;                                                   // (m, n)
+    ThrNum thr_num;                                                              // thread count handed to the named barrier
+    Params const& params;
+
+    CUTLASS_DEVICE void
+    begin() {
+      if (!params.row_broadcast) { // scalar mode: fill registers directly, no smem staging or barrier
+        fill(tSR_rRow, *(params.ptr_row));
+        return;
+      }
+
+      auto synchronize = [&] () { cutlass::arch::NamedBarrier::sync(thr_num, cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); };
+      Tensor tGS_gRow_flt = filter_zeros(tGS_gRow);
+      Tensor tGS_sRow_flt = filter_zeros(tGS_sRow);
+      Tensor tGS_cRow_flt = make_tensor(tGS_cRow.data(), make_layout(tGS_gRow_flt.shape(), tGS_cRow.stride()));
+
+      for (int i = 0; i < size(tGS_gRow_flt); ++i) {
+        if (get<1>(tGS_cRow_flt(i)) >= size<1>(CtaTileShapeMNK{})) {
+          continue; // OOB of SMEM,
+        }
+        if (elem_less(tGS_cRow_flt(i), make_coord(get<0>(residue_tCcRow), get<1>(residue_tCcRow)))) {
+          tGS_sRow_flt(i) = tGS_gRow_flt(i);
+        }
+        else {
+          tGS_sRow_flt(i) = Element(0); // Set to Zero when OOB so LDS could be issue without any preds.
+        }
+      }
+      synchronize();
+    }
+
+    CUTLASS_DEVICE void
+    begin_loop(int epi_m, int epi_n) {
+      if (epi_m == 0) { // Assumes M-major subtile loop
+        if (!params.row_broadcast) return; // Do not issue LDS when row is scalar
+        Tensor tSR_sRow_flt = filter_zeros(tSR_sRow(_,_,_,epi_m,epi_n));
+        Tensor tSR_rRow_flt = filter_zeros(tSR_rRow);
+        copy(tSR_sRow_flt, tSR_rRow_flt);
+      }
+    }
+
+    template <typename ElementAccumulator, int FragmentSize>
+    CUTLASS_DEVICE Array<Element, FragmentSize>
+    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
+      Array<Element, FragmentSize> frg_row;
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < FragmentSize; ++i) {
+        frg_row[i] = tSR_rRow(epi_v * FragmentSize + i);
+      }
+
+      return frg_row;
+    }
+  };
+
+  template <
+    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
+    class... Args
+  >
+  CUTLASS_DEVICE auto
+  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
+    auto [M, N, K, L] = args.problem_shape_mnkl;
+    auto [m, n, k, l] = args.tile_coord_mnkl;
+    using ThreadCount = decltype(size(args.tiled_copy));
+
+    Tensor mRow = make_tensor(make_gmem_ptr(params.ptr_row), make_shape(M,N,L), params.dRow);
+    Tensor gRow = local_tile(mRow(_,_,l), take<0,2>(args.tile_shape_mnk), make_coord(m, n));          // (CTA_M, CTA_N)
+    Tensor sRow = make_tensor(make_smem_ptr(smem),
+        make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{})), make_shape(_0{}, _1{}));  // (CTA_M, CTA_N)
+    //// G2S: Gmem to Smem
+    auto tiled_g2s = make_tiled_copy(Copy_Atom<DefaultCopy, Element>{},
+                                     Layout< Shape<_1, ThreadCount>,
+                                            Stride<_0,          _1>>{},
+                                     Layout<_1>{});
+    auto thr_g2s = tiled_g2s.get_slice(args.thread_idx);
+    Tensor tGS_gRow = thr_g2s.partition_S(gRow);
+    Tensor tGS_sRow = thr_g2s.partition_D(sRow);
+
+    //// G2S: Coord
+    auto cRow = make_identity_tensor(make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{})));
+    Tensor tGS_cRow = thr_g2s.partition_S(cRow);
+
+    //// S2R: Smem to Reg
+    Tensor tSR_sRow = sm90_partition_for_epilogue<ReferenceSrc>(sRow, args.epi_tile, args.tiled_copy, args.thread_idx);
+    Tensor tSR_rRow = make_tensor_like(take<0,3>(tSR_sRow));                                           // (CPY,CPY_M,CPY_N)
+
+    return ConsumerStoreCallbacks<decltype(tGS_gRow), decltype(tGS_sRow), decltype(tGS_cRow), decltype(tiled_g2s), decltype(tSR_sRow), decltype(tSR_rRow), decltype(args.tCcD), decltype(args.residue_cD), ThreadCount>(
+      tGS_gRow,
+      tGS_sRow,
+      tGS_cRow, tiled_g2s,
+      tSR_sRow,
+      tSR_rRow,
+      args.tCcD,
+      args.residue_cD,
+      ThreadCount{},
+      params);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Column vector broadcast, or scalar broadcast when Arguments::col_broadcast is false
+template<
+  int Stages,
+  class CtaTileShapeMNK,
+  class Element,
+  class StrideMNL = Stride<_1,_0,_0>,
+  int Alignment = 128 / sizeof_bits_v<Element>
+>
+struct Sm90ColOrScalarBroadcast {
+  static_assert(Stages == 0, "Column broadcast doesn't support smem usage yet");
+  static_assert(Alignment * sizeof_bits_v<Element> % 128 == 0, "sub-16B alignment not supported yet");
+  static_assert(
+    (cute::is_same_v<StrideMNL, Stride<_1,_0, _0>>) || // col vector broadcast, e.g. per-row alpha/bias
+    (cute::is_same_v<StrideMNL, Stride<_1,_0,int>>));  // batched col vector broadcast, e.g. batched per-row bias
+
+  // Accumulator distributes col elements evenly amongst threads so we can just directly load from gmem
+  struct SharedStorage { };
+
+  // This struct has been modified to have a bool telling whether ptr_col is a
+  // full column vector (true) or a single scalar that must be broadcast (false),
+  // instead of containing a scalar that is valid if ptr_col is null.
+  struct Arguments {
+    Element const* ptr_col = nullptr;
+    bool col_broadcast = true; // false: *ptr_col is one value applied to every element
+    StrideMNL dCol = {};
+  };
+
+  using Params = Arguments;
+
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    return args;
+  }
+
+  template <class ProblemShape>
+  static bool
+  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
+    return true;
+  }
+
+  template <class ProblemShape>
+  static size_t
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
+    return 0;
+  }
+
+  template <class ProblemShape>
+  static cutlass::Status
+  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    return cutlass::Status::kSuccess;
+  }
+
+  CUTLASS_DEVICE bool
+  is_producer_load_needed() const {
+    return false;
+  }
+
+  CUTLASS_DEVICE bool
+  is_C_load_needed() const {
+    return false;
+  }
+
+  CUTLASS_DEVICE bool
+  is_zero() const {
+    return (!params.col_broadcast && *(params.ptr_col) == Element(0)); // only the scalar case is inspected
+  }
+
+  CUTLASS_HOST_DEVICE
+  Sm90ColOrScalarBroadcast() { }
+
+  CUTLASS_HOST_DEVICE
+  Sm90ColOrScalarBroadcast(Params const& params, SharedStorage const& shared_storage)
+      : params(params) { }
+
+  Params params;
+
+  template <class... Args>
+  CUTLASS_DEVICE auto
+  get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
+    return EmptyProducerLoadCallbacks{};
+  }
+
+  template<class GTensor, class RTensor, class CTensor, class ProblemShape>
+  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
+    CUTLASS_DEVICE
+    ConsumerStoreCallbacks(
+      GTensor&& tCgCol,
+      RTensor&& tCrCol,
+      CTensor&& tCcCol,
+      ProblemShape problem_shape,
+      Params const& params
+    ):
+      tCgCol(cute::forward<GTensor>(tCgCol)),
+      tCrCol(cute::forward<RTensor>(tCrCol)),
+      tCcCol(cute::forward<CTensor>(tCcCol)),
+      params(params),                 // initializers now follow member declaration order
+      m(get<0>(problem_shape)) {}
+
+    GTensor tCgCol;                                                                    // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
+    RTensor tCrCol;
+    CTensor tCcCol;                                                                    // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
+    Params const& params;
+    int m;                                                                             // get<0>(problem_shape); bound for the load predicate
+
+    CUTLASS_DEVICE void
+    begin() {
+      if (!params.col_broadcast) { // scalar mode: no predicate needed, so handle it first
+        fill(tCrCol, *(params.ptr_col));
+        return;
+      }
+
+      Tensor pred = make_tensor<bool>(shape(tCgCol));
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < size(pred); ++i) {
+        pred(i) = get<0>(tCcCol(i)) < m;
+      }
+
+      // Filter so we don't issue redundant copies over stride-0 modes
+      // (only works if 0-strides are in same location, which is by construction)
+      copy_if(pred, filter(tCgCol), filter(tCrCol));
+    }
+
+    template <typename ElementAccumulator, int FragmentSize>
+    CUTLASS_DEVICE Array<Element, FragmentSize>
+    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
+      Array<Element, FragmentSize> frg_col;
+      Tensor tCrCol_mn = tCrCol(_,_,_,epi_m,epi_n);
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < FragmentSize; ++i) {
+        frg_col[i] = tCrCol_mn(epi_v * FragmentSize + i);
+      }
+
+      return frg_col;
+    }
+
+  };
+
+  template <
+    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
+    class... Args
+  >
+  CUTLASS_DEVICE auto
+  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
+
+    auto [M, N, K, L] = args.problem_shape_mnkl;
+    Tensor mCol = make_tensor(make_gmem_ptr(params.ptr_col), make_shape(M,N,L), params.dCol);
+    Tensor tCgCol = sm90_partition_for_epilogue<ReferenceSrc>(                         // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
+      mCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx);
+    Tensor tCrCol = make_tensor_like(tCgCol);                                          // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
+
+    // Generate an identity tensor matching the shape of the global tensor and
+    //  partition the same way, this will be used to generate the predicate
+    //  tensor for loading
+    Tensor cCol = make_identity_tensor(mCol.shape());
+    Tensor tCcCol = sm90_partition_for_epilogue<ReferenceSrc>(                         // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
+      cCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx);
+
+    return ConsumerStoreCallbacks(
+      cute::move(tCgCol),
+      cute::move(tCrCol),
+      cute::move(tCcCol),
+      args.problem_shape_mnkl,
+      params
+    );
+  }
+};
+
+}
diff --git a/lightllm-kernel/include/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp b/lightllm-kernel/include/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp
new file mode 100755
index 000000000..00b9c6f4a
--- /dev/null
+++ b/lightllm-kernel/include/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp
@@ -0,0 +1,286 @@
+#pragma once
+
+#include "cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp"
+
+/*
+   This file defines custom epilogues for fusing channel scales, token scales,
+   bias, and activation zero-points onto a GEMM operation using the
+   CUTLASS 3.x API, for NVIDIA GPUs with sm90a (Hopper) or later.
+
+   Epilogues must contain a public type named EVTCompute of type Sm90EVT,
+   as well as a static prepare_args function that constructs an
+   EVTCompute::Arguments struct.
+*/
+
+namespace lightllm::c3x {
+
+using namespace cute;
+
+/*
+ * Shared load descriptors (accumulator fetch, row/column/scalar broadcasts) and
+ * the args_from_tensor() helpers used by the ScaledEpilogue[...] classes.
+ */
+template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
+struct ScaledEpilogueBase {
+ protected:
+  using Accum = cutlass::epilogue::fusion::Sm90AccFetch;
+
+  template <typename T>
+  using ColOrScalarLoad = cutlass::epilogue::fusion::Sm90ColOrScalarBroadcast<
+      0 /*Stages*/, typename EpilogueDescriptor::TileShape, T,
+      Stride<Int<1>, Int<0>, Int<0>>>;
+
+  template <typename T>
+  using RowOrScalarLoad = cutlass::epilogue::fusion::Sm90RowOrScalarBroadcast<
+      0 /*Stages*/, typename EpilogueDescriptor::TileShape, T,
+      Stride<Int<0>, Int<1>, Int<0>>>;
+
+  // Column broadcast without scalar mode; nullptr support is off by default
+  template <typename T, bool EnableNullPtr = false>
+  using ColLoad = cutlass::epilogue::fusion::Sm90ColBroadcast<
+      0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, T,
+      Stride<Int<1>, Int<0>, Int<0>>, 128 / sizeof_bits_v<T>, EnableNullPtr>;
+
+  // Row broadcast without scalar mode; nullptr support is off by default
+  template <typename T, bool EnableNullPtr = false>
+  using RowLoad = cutlass::epilogue::fusion::Sm90RowBroadcast<
+      0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, T,
+      Stride<Int<0>, Int<1>, Int<0>>, 128 / sizeof_bits_v<T>, EnableNullPtr>;
+
+  // Builds Descriptor::Arguments from a tensor's raw data pointer (cast to T*,
+  // no dtype check here). For the *OrScalarLoad descriptors the broadcast flag
+  // is numel() != 1, so a one-element tensor is treated as a scalar.
+  template <typename Descriptor, typename T>
+  static auto args_from_tensor(torch::Tensor const& tensor) {
+    using Arguments = typename Descriptor::Arguments;
+    auto* data_ptr = static_cast<T*>(tensor.data_ptr());
+    if constexpr (std::is_same_v<Descriptor, ColOrScalarLoad<T>> ||
+                  std::is_same_v<Descriptor, RowOrScalarLoad<T>>) {
+      return Arguments{data_ptr, tensor.numel() != 1};
+    } else {
+      static_assert(!std::is_same_v<Descriptor, ColLoad<T, true>> &&
+                    !std::is_same_v<Descriptor, RowLoad<T, true>>);
+      return Arguments{data_ptr};
+    }
+  }
+
+  // Overload for an optional tensor: absent -> nullptr, in which case a constant
+  // (0) is used. Only ColLoad/RowLoad with EnableNullPtr=true are accepted.
+  template <typename Descriptor, typename T>
+  static auto args_from_tensor(c10::optional<torch::Tensor> const& tensor) {
+    using Arguments = typename Descriptor::Arguments;
+    auto* data_ptr = tensor ? static_cast<T*>(tensor->data_ptr()) : nullptr;
+    static_assert(std::is_same_v<Descriptor, ColLoad<T, true>> ||
+                  std::is_same_v<Descriptor, RowLoad<T, true>>);
+    return Arguments{data_ptr};
+  }
+};
+
+/*
+   Epilogue for a quantized GEMM in the spirit of torch._scaled_mm.
+
+   A and B are both int8 or both fp8_e4m3. A carries either one scale for the
+   whole tensor or one per row; B carries either one scale for the whole tensor
+   or one per column, in any combination. Quantization must be symmetric
+   (zero point == 0).
+
+   The result is D = (a_scales * A) (b_scales * B), with the scales applied
+   elementwise under numpy-style broadcasting. The visitor tree below evaluates
+   it as a_scales * (b_scales * accumulator).
+
+   prepare_args() turns the two scale tensors into the nested argument struct
+   of that tree; a scale tensor with exactly one element is passed as a scalar
+   broadcast, any other size as a column (A) or row (B) vector.
+*/
+template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
+struct ScaledEpilogue
+    : private ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor> {
+ private:
+  using Base = ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor>;
+  using AccLoad = typename Base::Accum;
+  using AScaleLoad = typename Base::template ColOrScalarLoad<float>;
+  using BScaleLoad = typename Base::template RowOrScalarLoad<float>;
+
+  // Inner node: b_scales * accumulator.
+  using MulToFloat = cutlass::epilogue::fusion::Sm90Compute<
+      cutlass::multiplies, float, float,
+      cutlass::FloatRoundStyle::round_to_nearest>;
+  using ScaledByB =
+      cutlass::epilogue::fusion::Sm90EVT<MulToFloat, BScaleLoad, AccLoad>;
+
+  // Outer node: a_scales * (inner node), with ElementD as the first type argument.
+  using MulToD = cutlass::epilogue::fusion::Sm90Compute<
+      cutlass::multiplies, ElementD, float,
+      cutlass::FloatRoundStyle::round_to_nearest>;
+
+ public:
+  using EVTCompute =
+      cutlass::epilogue::fusion::Sm90EVT<MulToD, AScaleLoad, ScaledByB>;
+  using ArgumentType = typename EVTCompute::Arguments;
+
+  static ArgumentType prepare_args(torch::Tensor const& a_scales,
+                                   torch::Tensor const& b_scales) {
+    auto scale_a = Base::template args_from_tensor<AScaleLoad, float>(a_scales);
+    auto scale_b = Base::template args_from_tensor<BScaleLoad, float>(b_scales);
+    typename ScaledByB::Arguments inner{scale_b};
+    return ArgumentType{scale_a, inner};
+  }
+};
+
+/*
+ * Same as ScaledEpilogue with a bias added at the end:
+ *   D = a_scales * (b_scales * accumulator) + bias
+ * The bias can also carry the azp correction term of the per-tensor azp case,
+ * where the activation zero point (azp) is folded into the bias.
+ *
+ * The bias tensor must be per-output channel.
+ * ScaleA and ScaleB can be per-tensor or per-token/per-channel.
+ */
+template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
+struct ScaledEpilogueBias
+    : private ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor> {
+ private:
+  using Base = ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor>;
+  using AccLoad = typename Base::Accum;
+  using AScaleLoad = typename Base::template ColOrScalarLoad<float>;
+  using BScaleLoad = typename Base::template RowOrScalarLoad<float>;
+  using BiasLoad = typename Base::template RowLoad<ElementD>;
+
+  // Inner node: b_scales * accumulator.
+  using MulToFloat = cutlass::epilogue::fusion::Sm90Compute<
+      cutlass::multiplies, float, float,
+      cutlass::FloatRoundStyle::round_to_nearest>;
+  using ScaledByB =
+      cutlass::epilogue::fusion::Sm90EVT<MulToFloat, BScaleLoad, AccLoad>;
+
+  // Outer node: a_scales * (inner node) + bias.
+  using FmaToD = cutlass::epilogue::fusion::Sm90Compute<
+      cutlass::multiply_add, ElementD, float,
+      cutlass::FloatRoundStyle::round_to_nearest>;
+
+ public:
+  using EVTCompute =
+      cutlass::epilogue::fusion::Sm90EVT<FmaToD, AScaleLoad, ScaledByB, BiasLoad>;
+  using ArgumentType = typename EVTCompute::Arguments;
+  static ArgumentType prepare_args(torch::Tensor const& a_scales,
+                                   torch::Tensor const& b_scales,
+                                   torch::Tensor const& bias) {
+    auto scale_a = Base::template args_from_tensor<AScaleLoad, float>(a_scales);
+    auto scale_b = Base::template args_from_tensor<BScaleLoad, float>(b_scales);
+    auto bias_row = Base::template args_from_tensor<BiasLoad, ElementD>(bias);
+
+    typename ScaledByB::Arguments inner{scale_b};
+    return ArgumentType{scale_a, inner, bias_row};
+  }
+};
+
+/*
+ * Same as ScaledEpilogue, then scaled by Ls: D = ls * (a_scales * (b_scales * acc)).
+ * The Ls tensor must be per-output channel.
+ * ScaleA and ScaleB can be per-tensor or per-token/per-channel.
+ */
+template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
+struct ScaledEpilogueLs
+    : private ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor> {
+ private:
+  using Base = ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor>;
+  using AccLoad = typename Base::Accum;
+  using AScaleLoad = typename Base::template ColOrScalarLoad<float>;
+  using BScaleLoad = typename Base::template RowOrScalarLoad<float>;
+  using LsLoad = typename Base::template RowLoad<ElementD>;
+
+  // Multiply node type shared by the two inner nodes (same type arguments).
+  using MulToFloat = cutlass::epilogue::fusion::Sm90Compute<
+      cutlass::multiplies, float, float,
+      cutlass::FloatRoundStyle::round_to_nearest>;
+
+  // Multiply node type of the root, with ElementD as the first type argument.
+  using MulToD = cutlass::epilogue::fusion::Sm90Compute<
+      cutlass::multiplies, ElementD, float,
+      cutlass::FloatRoundStyle::round_to_nearest>;
+
+  // b_scales * accumulator
+  using ScaledByB =
+      cutlass::epilogue::fusion::Sm90EVT<MulToFloat, BScaleLoad, AccLoad>;
+
+  // a_scales * (b_scales * accumulator)
+  using ScaledByAB =
+      cutlass::epilogue::fusion::Sm90EVT<MulToFloat, AScaleLoad, ScaledByB>;
+
+ public:
+  // ls * (a_scales * (b_scales * accumulator))
+  using EVTCompute =
+      cutlass::epilogue::fusion::Sm90EVT<MulToD, LsLoad, ScaledByAB>;
+
+  using ArgumentType = typename EVTCompute::Arguments;
+  static ArgumentType prepare_args(torch::Tensor const& a_scales,
+                                   torch::Tensor const& b_scales,
+                                   torch::Tensor const& ls) {
+    auto scale_a = Base::template args_from_tensor<AScaleLoad, float>(a_scales);
+    auto scale_b = Base::template args_from_tensor<BScaleLoad, float>(b_scales);
+    auto ls_row = Base::template args_from_tensor<LsLoad, ElementD>(ls);
+
+    typename ScaledByB::Arguments inner{scale_b};
+    typename ScaledByAB::Arguments middle{scale_a, inner};
+    return ArgumentType{ls_row, middle};
+  }
+};
+
+
+/*
+ * Same as ScaledEpilogue, with a bias added and the sum then scaled by Ls:
+ *   D = ls * (a_scales * (b_scales * acc) + bias)
+ * The bias and Ls tensors must both be per-output channel.
+ * ScaleA and ScaleB can be per-tensor or per-token/per-channel.
+ */
+template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
+struct ScaledEpilogueBiasLs
+    : private ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor> {
+ private:
+  using Base = ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor>;
+  using AccLoad = typename Base::Accum;
+  using AScaleLoad = typename Base::template ColOrScalarLoad<float>;
+  using BScaleLoad = typename Base::template RowOrScalarLoad<float>;
+  using BiasLoad = typename Base::template RowLoad<ElementD>;
+  using LsLoad = typename Base::template RowLoad<ElementD>;
+
+  // b_scales * accumulator
+  using MulToFloat = cutlass::epilogue::fusion::Sm90Compute<
+      cutlass::multiplies, float, float,
+      cutlass::FloatRoundStyle::round_to_nearest>;
+  using ScaledByB =
+      cutlass::epilogue::fusion::Sm90EVT<MulToFloat, BScaleLoad, AccLoad>;
+
+  // a_scales * (b_scales * accumulator) + bias
+  using FmaToFloat = cutlass::epilogue::fusion::Sm90Compute<
+      cutlass::multiply_add, float, float,
+      cutlass::FloatRoundStyle::round_to_nearest>;
+  using ScaledPlusBias =
+      cutlass::epilogue::fusion::Sm90EVT<FmaToFloat, AScaleLoad, ScaledByB, BiasLoad>;
+
+  // Root multiply by ls, with ElementD as the first type argument.
+  using MulToD = cutlass::epilogue::fusion::Sm90Compute<
+      cutlass::multiplies, ElementD, float,
+      cutlass::FloatRoundStyle::round_to_nearest>;
+
+ public:
+  using EVTCompute =
+      cutlass::epilogue::fusion::Sm90EVT<MulToD, LsLoad, ScaledPlusBias>;
+  using ArgumentType = typename EVTCompute::Arguments;
+  static ArgumentType prepare_args(torch::Tensor const& a_scales,
+                                   torch::Tensor const& b_scales,
+                                   torch::Tensor const& bias,
+                                   torch::Tensor const& ls) {
+    auto scale_a = Base::template args_from_tensor<AScaleLoad, float>(a_scales);
+    auto scale_b = Base::template args_from_tensor<BScaleLoad, float>(b_scales);
+    auto bias_row = Base::template args_from_tensor<BiasLoad, ElementD>(bias);
+    auto ls_row = Base::template args_from_tensor<LsLoad, ElementD>(ls);
+
+    typename ScaledByB::Arguments inner{scale_b};
+    typename ScaledPlusBias::Arguments middle{scale_a, inner, bias_row};
+    return ArgumentType{ls_row, middle};
+  }
+};
+
+
+} // namespace lightllm::c3x
\ No newline at end of file
diff --git a/lightllm-kernel/include/ops_common.h b/lightllm-kernel/include/ops_common.h
index 3c80fef44..7ad632af5 100644
--- a/lightllm-kernel/include/ops_common.h
+++ b/lightllm-kernel/include/ops_common.h
@@ -1,4 +1,68 @@
 #pragma once
 #include <torch/extension.h>
 #include <ATen/cuda/CUDAContext.h>
-#include <vector>
\ No newline at end of file
+#include <vector>
+#include <tuple>
+
+#include "utils.h"
+
+
+namespace lightllm {
+namespace ops {
+
+using namespace lightllm;
+
+Tensor pre_tp_norm_bf16(Tensor &input);
+
+Tensor post_tp_norm_bf16(
+    Tensor &input, const Tensor& weight,
+    const Tensor& tp_variance, const int embed_dim,
+    const fp32_t eps
+);
+
+Tensor rmsnorm_align16_bf16(
+    const Tensor &X, const Tensor &W,
+    const fp32_t eps
+);
+
+void per_token_quant_bf16_fp8(
+    Tensor& output,
+    const Tensor& input,
+    Tensor& scales
+);
+
+std::tuple<Tensor, Tensor> add_norm_quant_bf16_fp8(
+    Tensor& X, const Tensor &R, const Tensor &W,
+    const fp32_t eps
+);
+
+void gelu_per_token_quant_bf16_fp8(
+    Tensor& output,
+    const Tensor& input,
+    Tensor& scales
+);
+
+void cutlass_scaled_mm(
+    Tensor& c, Tensor const& a,
+    Tensor const& b, Tensor const& a_scales,
+    Tensor const& b_scales,
+    c10::optional<Tensor> const& bias,
+    c10::optional<Tensor> const& ls
+);
+
+Tensor grouped_topk(
+        Tensor topk_weights,
+        Tensor correction_bias,
+        Tensor topk_indices,
+        Tensor group_indices,
+        Tensor gating_output,
+        int64_t  num_expert_group,
+        int64_t  topk_group,
+        int64_t  topk,
+        bool     renormalize,
+        std::string scoring_func,
+        Tensor group_scores
+);
+
+} // namespace ops
+} // namespace lightllm
\ No newline at end of file
diff --git a/lightllm-kernel/include/reduce/sm70.cuh b/lightllm-kernel/include/reduce/sm70.cuh
new file mode 100755
index 000000000..b1d78f344
--- /dev/null
+++ b/lightllm-kernel/include/reduce/sm70.cuh
@@ -0,0 +1,191 @@
+#pragma once
+#include "utils.h"
+
+namespace lightllm {
+namespace reduce {
+namespace sm70 {
+/**
+ * @brief Performs a block-wide reduction to sum up floating-point
+ * values across all threads in a block.
+ *
+ * Each warp first reduces its lanes with shuffles; lane 0 of every warp
+ * publishes its partial sum to shared memory, and warp 0 then reduces the
+ * partials and broadcasts the total to the block through shared memory.
+ *
+ * @tparam TPB Threads per block: a positive multiple of 32, at most 1024.
+ * @param input The input value for the calling thread.
+ * @return The block-wide sum of the input values, returned to every thread.
+ *
+ * @note Contains __syncthreads(), so every thread of the block must call it.
+ * NOTE(review): the shared buffer is reused on every call; back-to-back
+ * calls may need a barrier in between - confirm against the call sites.
+ */
+template<int32_t TPB>
+__device__ inline
+fp32_t sync_block_reduce_sum_f32(const fp32_t input) {
+    constexpr int32_t warpSize = 32;
+    static_assert(TPB > 0 && TPB % warpSize == 0 && TPB <= warpSize * warpSize, "TPB must be a multiple of 32 in [32, 1024]");
+
+    // Thread ID within the current block
+    const int32_t tid = threadIdx.x;
+    const int32_t warp_lane = tid % warpSize;
+    const int32_t warp_id   = tid / warpSize;
+
+    fp32_t local_sum = input;
+
+    // Warp-level reduction using shuffle operations
+    for (int32_t stride = warpSize / 2; stride > 0; stride /= 2) {
+        local_sum += __shfl_down_sync(0xFFFFFFFF, local_sum, stride);
+    }
+
+    // Shared memory reduction across warps
+    __shared__ fp32_t shared_sum[TPB / warpSize];
+    if (warp_lane == 0) {
+        shared_sum[warp_id] = local_sum;
+    }
+    __syncthreads();
+
+    // Block-level reduction using the first warp
+    if (warp_id == 0) {
+        // Lanes without a per-warp partial contribute the identity for sum.
+        local_sum = (warp_lane < TPB / warpSize) ? shared_sum[warp_lane] : 0.0f;
+
+        // Use full-warp strides: starting at (TPB / warpSize) / 2 would drop
+        // partials whenever the warp count is not a power of two.
+        for (int32_t stride = warpSize / 2; stride > 0; stride /= 2) {
+            local_sum += __shfl_down_sync(0xFFFFFFFF, local_sum, stride);
+        }
+    }
+
+    if (warp_id == 0 && warp_lane == 0) {
+        shared_sum[0] = local_sum;
+    }
+    __syncthreads();
+
+    return shared_sum[0];
+}
+
+
+
+// Block-wide maximum of `input`: shuffle reduction inside each warp, then a
+// shared-memory pass through warp 0; the result is returned to every thread.
+// TPB must be a positive multiple of 32, at most 1024.
+template<int32_t TPB>
+__device__ inline
+fp32_t sync_block_reduce_max_f32(const fp32_t input) {
+    constexpr int32_t warpSize = 32;
+    static_assert(TPB > 0 && TPB % warpSize == 0 && TPB <= warpSize * warpSize, "TPB must be a multiple of 32 in [32, 1024]");
+
+    const int32_t tid = threadIdx.x;
+    const int32_t warp_lane = tid % warpSize;
+    const int32_t warp_id   = tid / warpSize;
+
+    fp32_t local_max = input;
+
+    // Warp-level reduction using shuffle operations
+    for (int32_t stride = warpSize / 2; stride > 0; stride /= 2) {
+        local_max = fmaxf(__shfl_down_sync(0xFFFFFFFF, local_max, stride), local_max);
+    }
+
+    // Shared memory reduction across warps
+    __shared__ fp32_t shared_max[TPB / warpSize];
+    if (warp_lane == 0) {
+        shared_max[warp_id] = local_max;
+    }
+    __syncthreads();
+
+    // Block-level reduction using the first warp
+    if (warp_id == 0) {
+        // Lanes without a per-warp partial contribute the identity for max.
+        local_max = (warp_lane < TPB / warpSize) ? shared_max[warp_lane] : -INFINITY;
+
+        // Full-warp strides so non-power-of-two warp counts are reduced fully.
+        for (int32_t stride = warpSize / 2; stride > 0; stride /= 2) {
+            local_max = fmaxf(__shfl_down_sync(0xFFFFFFFF, local_max, stride), local_max);
+        }
+    }
+
+    if (warp_id == 0 && warp_lane == 0) {
+        shared_max[0] = local_max;
+    }
+    __syncthreads();
+
+    return shared_max[0];
+}
+
+/**
+ * @brief Performs a block-wide reduction to compute both sum and max
+ * of floating-point values across all threads in a block.
+ *
+ * The .x components are summed and the .y components are maximized, first
+ * inside each warp via shuffles and then across warps via shared memory.
+ * The combined result is broadcast to the block through shared memory.
+ *
+ * @tparam TPB Threads per block: a positive multiple of 32, at most 1024.
+ * @param input The input value for the calling thread (contains .x for sum, .y for max).
+ * @return The block-wide reduction result (sum in .x, max in .y), returned to every thread.
+ *
+ * @note Contains __syncthreads(), so every thread of the block must call it.
+ */
+template<int32_t TPB>
+__device__ inline
+fp32x2_t sync_block_reduce_sum_max_f32(const fp32x2_t input) {
+    constexpr int32_t warpSize = 32;
+    static_assert(TPB > 0 && TPB % warpSize == 0 && TPB <= warpSize * warpSize, "TPB must be a multiple of 32 in [32, 1024]");
+
+    // Thread ID within the current block
+    const int32_t tid = threadIdx.x;
+    const int32_t warp_lane = tid % warpSize;
+    const int32_t warp_id   = tid / warpSize;
+
+    fp32x2_t local_result = input;
+
+    // Warp-level reduction using shuffle operations
+    for (int32_t stride = warpSize / 2; stride > 0; stride /= 2) {
+        // Sum reduction for .x component
+        float sum_val = __shfl_down_sync(0xFFFFFFFF, local_result.x, stride);
+        local_result.x += sum_val;
+
+        // Max reduction for .y component
+        float max_val = __shfl_down_sync(0xFFFFFFFF, local_result.y, stride);
+        local_result.y = max(local_result.y, max_val);
+    }
+
+    // Shared memory reduction across warps
+    __shared__ fp32x2_t shared_result[TPB / warpSize];
+    if (warp_lane == 0) {
+        shared_result[warp_id] = local_result;
+    }
+    __syncthreads();
+
+    // Block-level reduction using the first warp
+    if (warp_id == 0) {
+        if (warp_lane < TPB / warpSize) {
+            local_result = shared_result[warp_lane];
+        } else {
+            local_result.x = 0.0f;  // Identity for sum
+            local_result.y = -INFINITY;  // Identity for max
+        }
+        // Full-warp strides so non-power-of-two warp counts are reduced fully.
+        for (int32_t stride = warpSize / 2; stride > 0; stride /= 2) {
+            // Sum reduction for .x component
+            float sum_val = __shfl_down_sync(0xFFFFFFFF, local_result.x, stride);
+            local_result.x += sum_val;
+
+            // Max reduction for .y component
+            float max_val = __shfl_down_sync(0xFFFFFFFF, local_result.y, stride);
+            local_result.y = max(local_result.y, max_val);
+        }
+    }
+
+    if (warp_id == 0 && warp_lane == 0) {
+        shared_result[0] = local_result;
+    }
+    __syncthreads();
+
+    return shared_result[0];
+}
+
+} // namespace sm70
+} // namespace reduce
+} // namespace lightllm
\ No newline at end of file
diff --git a/lightllm-kernel/include/utils.h b/lightllm-kernel/include/utils.h
new file mode 100644
index 000000000..105dc89fc
--- /dev/null
+++ b/lightllm-kernel/include/utils.h
@@ -0,0 +1,267 @@
+
+#pragma once
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include <cuda_fp8.h>
+#include <device_launch_parameters.h>
+#include <cuda_runtime_api.h>
+
+// mycuda, some wrappers and utils
+namespace lightllm {
+// type definitions
+using fp16_t = __half;
+using fp16x2_t = __half2;
+using bf16_t = __nv_bfloat16;
+using bf16x2_t = __nv_bfloat162;
+
+using fp8_e4m3_t = __nv_fp8_e4m3;
+using fp8x2_e4m3_t = __nv_fp8x2_e4m3;
+using fp8x4_e4m3_t = __nv_fp8x4_e4m3;
+
+using fp32_t = float;
+using fp32x2_t = float2;
+using fp32x4_t = float4;
+
+using int32x4_t = int4;
+using int32x2_t = int2;
+
+using int8x2_t = short;
+using int8x4_t = int32_t;
+using int8x8_t = int64_t;
+
+using vec_type = int4;
+
+// convert fp16_t to fp32_t
+__device__ inline fp32_t cvt_f16_f32(const fp16_t x) { return __half2float(x); }
+
+__device__ inline fp16_t cvt_f32_f16(const fp32_t x) { return __float2half(x); }
+
+// Convert bf16_t to fp32_t
+__device__ inline fp32_t cvt_bf16_f32(const bf16_t x) {
+    return __bfloat162float(x);
+}
+
+// Convert fp32_t to bf16_t
+__device__ inline bf16_t cvt_f32_bf16(const fp32_t x) {
+    return __float2bfloat16(x);
+}
+
+// bf16x2 to fp32x2 conversion
+__device__ inline fp32x2_t bf16x2_to_fp32x2(bf16x2_t bf16x2_val) {
+    // Extract the two bfloat16 values from bf16x2
+    bf16_t low = __low2bfloat16(bf16x2_val);
+    bf16_t high = __high2bfloat16(bf16x2_val);
+
+    // Convert bfloat16 to float
+    float low_f = __bfloat162float(low);
+    float high_f = __bfloat162float(high);
+
+    // Pack the two floats into a float2
+    return make_float2(low_f, high_f);
+}
+
+__device__ inline bf16x2_t _float22bf162_rn(fp32x2_t val) {
+    bf16_t low = __float2bfloat16(val.x);
+    bf16_t high = __float2bfloat16(val.y);
+    return bf16x2_t(low, high);
+}
+
+template <typename T>
+__host__ __device__ T Cdiv(T numerator, T denominator) {
+    return (numerator + denominator - 1) / denominator;  // ceiling division; NOTE(review): assumes positive integral operands - confirm
+}
+
+template <typename T>
+__host__ __device__ T Adiv(T value, T alignment) {
+    return (value + alignment - 1) & ~(alignment - 1);  // rounds value up to a multiple of alignment; the mask trick is only valid for power-of-two alignments
+}
+
+__device__ inline fp32x2_t operator+(const fp32x2_t& a, const fp32x2_t& b) {
+    return {a.x + b.x, a.y + b.y};
+}
+
+__device__ inline fp16_t abs(const fp16_t& x) { return __habs(x); }
+
+__device__ inline bool operator>(const fp16_t& a, const fp16_t& b) {
+    return __hgt(a, b);
+}
+
+__device__ inline fp16_t operator+(const fp16_t& a, const fp16_t& b) {
+    return __hadd(a, b);
+}
+
+__device__ inline fp16_t operator-(const fp16_t& a, const fp16_t& b) {
+    return __hsub(a, b);
+}
+
+__device__ inline fp16_t operator*(const fp16_t& a, const fp16_t& b) {
+    return __hmul(a, b);
+}
+
+__device__ inline fp16_t operator/(const fp16_t& a, const fp16_t& b) {
+    return __hdiv(a, b);
+}
+
+__device__ inline fp16_t& operator+=(fp16_t& a, const fp16_t& b) {
+    a = __hadd(a, b);
+    return a;
+}
+
+__device__ inline fp16_t& operator-=(fp16_t& a, const fp16_t& b) {
+    a = __hsub(a, b);
+    return a;
+}
+
+__device__ inline fp16_t& operator*=(fp16_t& a, const fp16_t& b) {
+    a = __hmul(a, b);
+    return a;
+}
+
+__device__ inline fp16_t& operator/=(fp16_t& a, const fp16_t& b) {
+    a = __hdiv(a, b);
+    return a;
+}
+
+__device__ inline fp16x2_t operator+(const fp16x2_t& a, const fp16x2_t& b) {
+    return __hadd2(a, b);
+}
+
+template <int VPT>
+struct BytesToType;
+
+template <>
+struct BytesToType<2>
+{
+    using type = uint16_t;
+};
+template <>
+struct BytesToType<4>
+{
+    using type = uint32_t;
+};
+template <>
+struct BytesToType<8>
+{
+    using type = uint64_t;
+};
+template <>
+struct BytesToType<16>
+{
+    using type = float4;
+};
+
+template <int Bytes>
+__device__ inline void vec_copy(const void* src, void* dest)
+{
+    using T = typename BytesToType<Bytes>::type;
+
+    const T* in = static_cast<const T*>(src);
+    T* out = static_cast<T*>(dest);
+    *out = *in;
+}
+
+template<int32_t divisor>
+__device__ inline int32x2_t divmod(const int32_t x);
+
+template<>
+__device__ inline int32x2_t divmod<128>(const int32_t x) {
+    return {x >> 7, x & 0x7F};
+}
+
+template<>
+__device__ inline int32x2_t divmod<64>(const int32_t x) {
+    return {x >> 6, x & 0x3F};
+}
+
+template<>
+__device__ inline int32x2_t divmod<32>(const int32_t x) {
+    return {x >> 5, x & 0x1F};
+}
+
+template<>
+__device__ inline int32x2_t divmod<16>(const int32_t x) {
+    return {x >> 4, x & 0x0F};
+}
+
+template<>
+__device__ inline int32x2_t divmod<8>(const int32_t x) {
+    return {x >> 3, x & 0x07};
+}
+
+template<>
+__device__ inline int32x2_t divmod<4>(const int32_t x) {
+    return {x >> 2, x & 0x03};
+}
+
+template<>
+__device__ inline int32x2_t divmod<2>(const int32_t x) {
+    return {x >> 1, x & 0x01};
+}
+
+}  // namespace lightllm
+
+// mytorch, some wrappers and utils
+namespace lightllm {
+using Tensor = torch::Tensor;
+
+template <typename T>
+__host__ inline T *PTR(at::Tensor t) {
+    return reinterpret_cast<T *>(t.data_ptr());
+}
+
+template <>
+__host__ inline fp16_t *PTR(at::Tensor t) {
+    return reinterpret_cast<fp16_t *>(t.data_ptr());
+}
+
+template <>
+__host__ inline fp16x2_t *PTR(at::Tensor t) {
+    return reinterpret_cast<fp16x2_t *>(t.data_ptr());
+}
+
+template <>
+__host__ inline int8x4_t *PTR(at::Tensor t) {
+    return reinterpret_cast<int8x4_t *>(t.data_ptr());
+}
+
+template <>
+__host__ inline int8x2_t *PTR(at::Tensor t) {
+    return reinterpret_cast<int8x2_t *>(t.data_ptr());
+}
+
+template <>
+__host__ inline int8_t *PTR(at::Tensor t) {
+    return reinterpret_cast<int8_t *>(t.data_ptr());
+}
+
+template <>
+__host__ inline uint16_t *PTR(at::Tensor t) {
+    return reinterpret_cast<uint16_t *>(t.data_ptr());
+}
+
+template <>
+__host__ inline uint32_t *PTR(at::Tensor t) {
+    return reinterpret_cast<uint32_t *>(t.data_ptr());
+}
+
+template <>
+__host__ inline void *PTR(at::Tensor t) {
+    return reinterpret_cast<void *>(t.data_ptr());
+}
+
+__device__ inline
+void block_debug_print_matrix(fp16_t *ptr, int32_t M, int32_t N, int32_t stride) {
+    if(threadIdx.x == 0) {
+        printf("Debug Matrix [%d, %d, %d]: \n", blockIdx.x, blockIdx.y, blockIdx.z);
+        for(int32_t i = 0; i < M; i++) {
+            for(int32_t j = 0; j < N; j++) {
+                printf("%.2f ", __half2float(ptr[i * stride + j]));
+            }
+            printf("\n");
+        }
+    }
+}
+
+}  // namespace lightllm
diff --git a/lightllm-kernel/lightllm_kernel/ops/__init__.py b/lightllm-kernel/lightllm_kernel/ops/__init__.py
index c3f54642b..b8930e5dc 100644
--- a/lightllm-kernel/lightllm_kernel/ops/__init__.py
+++ b/lightllm-kernel/lightllm_kernel/ops/__init__.py
@@ -15,18 +15,39 @@
             "directory (csrc/) found; please ensure you have run "
             "'cmake --install' or placed lightllm_kernel.ops.so on PYTHONPATH."
         )
+    
+    PROGRAM_NAME = "lightllm_kernel._C"
+    EXTENSION_BUILD_DIR = "build"
+    INCLUDE_DIR = "include"
+    CUTLASS_DIR = "cutlass/include"
 
-    sources = (
-        [str(p) for p in (csrc_dir / "moe").glob("*.cpp")]
-        + [str(p) for p in (csrc_dir / "moe").glob("*.cu")]
-        + [str(csrc_dir / "ops_bindings.cpp")]
-    )
+    sources = []
+    file_names = []  # Store file names for printing
+    for subdir, _, files in os.walk(csrc_dir):
+        for file in files:
+            if file.endswith((".cpp", ".cu")):
+                sources.append(os.path.join(subdir, file))
+                file_names.append(file)
+
+    # Print all detected source file names
+    print(f"{PROGRAM_NAME}: Detected source files:")
+    for file_name in file_names:
+        print(f"  - {file_name}")
 
     _C = load(
-        name="lightllm_kernel._C",
+        name=PROGRAM_NAME,
         sources=sources,
         verbose=True,
+        extra_include_paths=[
+            os.path.join(repo_root, INCLUDE_DIR),
+            os.path.join(repo_root, CUTLASS_DIR),
+        ],
+        build_directory=os.path.join(repo_root, EXTENSION_BUILD_DIR),
+        with_cuda=True,
         extra_cuda_cflags=[
+            "-DNDEBUG",
+            "-O3",
+            "-use_fast_math",
             # A100
             "-gencode=arch=compute_80,code=sm_80",
             "-gencode=arch=compute_80,code=compute_80",
@@ -36,9 +57,23 @@
             # Hopper / H100 / H200
             "-gencode=arch=compute_90,code=sm_90",
             "-gencode=arch=compute_90,code=compute_90",
+            "-gencode=arch=compute_90a,code=sm_90a",
         ],
+        extra_cflags=["-O3"],
     )
 
 # 向外暴露 Python 端接口
-grouped_topk = _C.grouped_topk
-__all__ = ["grouped_topk"]
+from .fusion import pre_tp_norm_bf16, post_tp_norm_bf16, add_norm_quant_bf16_fp8, gelu_per_token_quant_bf16_fp8
+from .norm import rmsnorm_bf16
+from .quant import per_token_quant_bf16_fp8
+from .gemm import cutlass_scaled_mm_bias_ls
+
+__all__ = [
+    "rmsnorm_bf16",
+    "per_token_quant_bf16_fp8",
+    "pre_tp_norm_bf16",
+    "post_tp_norm_bf16",
+    "add_norm_quant_bf16_fp8",
+    "gelu_per_token_quant_bf16_fp8",
+    "cutlass_scaled_mm_bias_ls",
+]
diff --git a/lightllm-kernel/lightllm_kernel/ops/fusion.py b/lightllm-kernel/lightllm_kernel/ops/fusion.py
new file mode 100644
index 000000000..6f3c8243b
--- /dev/null
+++ b/lightllm-kernel/lightllm_kernel/ops/fusion.py
@@ -0,0 +1,22 @@
+import torch
+from typing import Optional, Tuple
+from . import _C
+
+def pre_tp_norm_bf16(input: torch.Tensor) -> torch.Tensor:
+    """ Calculate powersum along embedding dimension of the input """
+    return _C.pre_tp_norm_bf16(input)
+
+def post_tp_norm_bf16(input: torch.tensor, weight: torch.Tensor, tp_variance: torch.Tensor, embed_dim: int, eps: float) -> torch.Tensor:
+    """ Apply rmsnorm on given input, with weight and pre calculated powersum """
+    return _C.post_tp_norm_bf16(input, weight, tp_variance, embed_dim, eps)
+
+def add_norm_quant_bf16_fp8(input: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, eps: float) -> Tuple[torch.Tensor, torch.Tensor]:
+    """ Apply add_norm_quant on given input, with residual and weight """
+    return _C.add_norm_quant_bf16_fp8(input, residual, weight, eps)
+
+def gelu_per_token_quant_bf16_fp8(input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    """ Apply gelu on given input and quantize it from bf16 to fp8 using per token quant method """
+    output = torch.empty_like(input, dtype=torch.float8_e4m3fn)
+    scales = torch.empty(size=(input.shape[0], 1), device=input.device, dtype=torch.float32)
+    _C.gelu_per_token_quant_bf16_fp8(output, input, scales)
+    return output, scales
diff --git a/lightllm-kernel/lightllm_kernel/ops/gemm.py b/lightllm-kernel/lightllm_kernel/ops/gemm.py
new file mode 100644
index 000000000..0fb569cfc
--- /dev/null
+++ b/lightllm-kernel/lightllm_kernel/ops/gemm.py
@@ -0,0 +1,8 @@
+import torch
+from typing import Optional
+from . import _C
+
+def cutlass_scaled_mm_bias_ls(c: torch.Tensor, a: torch.Tensor, b: torch.Tensor,
+                      a_scales: torch.Tensor, b_scales: torch.Tensor, bias: Optional[torch.Tensor], ls: Optional[torch.Tensor]) -> None :
+    """ Apply scaled mm on the given input, with optional bias and ls weight """
+    return _C.cutlass_scaled_mm(c, a, b, a_scales, b_scales, bias, ls)
diff --git a/lightllm-kernel/lightllm_kernel/ops/norm.py b/lightllm-kernel/lightllm_kernel/ops/norm.py
new file mode 100644
index 000000000..d60013f42
--- /dev/null
+++ b/lightllm-kernel/lightllm_kernel/ops/norm.py
@@ -0,0 +1,7 @@
+import torch
+from typing import Optional
+from . import _C
+
+def rmsnorm_bf16(X: torch.Tensor, W: torch.Tensor, eps: float=1e-12) -> torch.Tensor:
+    """ Apply rmsnorm on given X, with weight W and eps """
+    return _C.rmsnorm_align16_bf16(X, W, eps)
diff --git a/lightllm-kernel/lightllm_kernel/ops/quant.py b/lightllm-kernel/lightllm_kernel/ops/quant.py
index e69de29bb..8889f11b1 100644
--- a/lightllm-kernel/lightllm_kernel/ops/quant.py
+++ b/lightllm-kernel/lightllm_kernel/ops/quant.py
@@ -0,0 +1,10 @@
+import torch
+from typing import Optional, Tuple
+from . import _C
+
+def per_token_quant_bf16_fp8(input: torch.tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    """ Quantize the given input using per token quant method """
+    output = torch.empty_like(input, dtype=torch.float8_e4m3fn)
+    scales = torch.empty(size=(input.shape[0], 1), device=input.device, dtype=torch.float32)
+    _C.per_token_quant_bf16_fp8(output, input, scales)
+    return output, scales
diff --git a/lightllm-kernel/setup.py b/lightllm-kernel/setup.py
index 34f992b73..70ae132dd 100644
--- a/lightllm-kernel/setup.py
+++ b/lightllm-kernel/setup.py
@@ -1,32 +1,53 @@
+import os
 from pathlib import Path
 from setuptools import setup
 from torch.utils.cpp_extension import BuildExtension, CUDAExtension
 
-this_dir = Path(__file__).parent
+repo_root = Path(__file__).resolve().parents[0]
+csrc_dir = repo_root / "csrc"
+if not csrc_dir.exists():
+    raise ImportError(
+        "Cannot import compiled extension 'lightllm_kernel.ops' and no source "
+        "directory (csrc/) found; please ensure you have run "
+        "'cmake --install' or placed lightllm_kernel.ops.so on PYTHONPATH."
+    )
+
+PROGRAM_NAME = "lightllm_kernel._C"
+INCLUDE_DIR = "include"
+CUTLASS_DIR = "cutlass/include"
+
+sources = []
+file_names = []  # Store file names for printing
+for subdir, _, files in os.walk(csrc_dir):
+    for file in files:
+        if file.endswith((".cpp", ".cu")):
+            sources.append(os.path.join(subdir, file))
+            file_names.append(file)
+
+# Print all detected source file names
+print(f"{PROGRAM_NAME}: Detected source files:")
+for file_name in file_names:
+    print(f"  - {file_name}")
 
-sources = [
-    str(this_dir / "csrc" / "moe" / "grouped_topk_interface.cpp"),
-    str(this_dir / "csrc" / "moe" / "grouped_topk.cu"),
-    str(this_dir / "csrc" / "ops_bindings.cpp"),
-]
-print("---- sources for CUDAExtension ----")
-for s in sources:
-    print(s)
-print("-----------------------------------")
 ext_modules = [
     CUDAExtension(
-        name="lightllm_kernel._C",
+        name=PROGRAM_NAME,
         sources=sources,
         extra_compile_args={
             "cxx": ["-O3"],
             "nvcc": [
+                "-DNDEBUG", 
                 "-O3",
                 "--use_fast_math",
                 "-gencode=arch=compute_90,code=sm_90",
                 "-gencode=arch=compute_90,code=compute_90",
+                '-gencode=arch=compute_90a, code=sm_90a',
             ],
         },
-        include_dirs=[str(this_dir / "include")],
+        include_dirs=[
+            os.path.join(repo_root, INCLUDE_DIR),
+            os.path.join(repo_root, CUTLASS_DIR),
+        ],
     )
 ]
 
diff --git a/lightllm-kernel/test/__init__.py b/lightllm-kernel/test/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/lightllm-kernel/test/fusion/add_norm_quant_test.py b/lightllm-kernel/test/fusion/add_norm_quant_test.py
new file mode 100755
index 000000000..a04d2b9f3
--- /dev/null
+++ b/lightllm-kernel/test/fusion/add_norm_quant_test.py
@@ -0,0 +1,70 @@
+import unittest
+import torch
+from lightllm_kernel.ops import add_norm_quant_bf16_fp8
+from lightllm.common.vllm_kernel import _custom_ops as ops
+from test.utils import benchmark, error
+
+
+def torch_add_norm_quant_bf16_fp8(X, R, W, eps=1e-6):
+    N = X.size(1)
+    # 1. Add residual
+    X = X.add_(R)
+    # 2. rmsnorm
+    normalized = torch.nn.functional.rms_norm(X, (N, ), W, eps=eps)
+    # 3. per token quant
+    quantized, scales = ops.scaled_fp8_quant(normalized, scale=None, use_per_token_if_dynamic=True)
+
+    return quantized, scales
+
+class TestFusedAddNormQuantBF16(unittest.TestCase):
+    def setUp(self):
+        """Set up common test parameters."""
+        self.batchs = [13]
+        self.seqLens = [1025]
+        self.embed_dims = [16, 32, 64, 512, 1024, 3200, 4096, 12800, 24, 511, 513, 1023, 1025, 1032, 4097]
+        self.device = "cuda"
+        self.dtype = torch.bfloat16
+        self.eps = 1e-6
+
+    def test_accuracy(self):
+        """Test the accuracy of FusedAddNormQuant against torch."""
+        for batch in self.batchs:
+            for seqLen in self.seqLens:
+                for embed_dim in self.embed_dims:
+                        with self.subTest(shape=[batch, seqLen, embed_dim]):
+                            X1 = torch.rand(size=[batch, seqLen, embed_dim], device=self.device, dtype=self.dtype) - 0.5
+                            X2 = X1.clone()
+                            R1 = torch.rand(size=[batch, seqLen, embed_dim], device=self.device, dtype=self.dtype) - 0.5
+                            R2 = R1.clone()
+                            W = torch.rand(size=[embed_dim], device=self.device, dtype=self.dtype) - 0.5
+                            output_real, scales_real = torch_add_norm_quant_bf16_fp8(X1.reshape(-1, X1.shape[2]), R1.reshape(-1, R1.shape[2]), W, self.eps)
+                            output_pred, scales_pred = add_norm_quant_bf16_fp8(X2.reshape(-1, X1.shape[2]), R2.reshape(-1, R2.shape[2]), W, self.eps)
+
+                            self.assertTrue(
+                                error(output_real, output_pred) < 0.01,
+                                f"Accuracy test failed for size {batch}, {seqLen}, {embed_dim}. output_real={output_real}, output_pred={output_pred}"
+                            )
+                            self.assertTrue(
+                                error(scales_real, scales_pred) < 0.01,
+                                f"Accuracy test failed for size {batch}, {seqLen}, {embed_dim}. scales_real={scales_real}, scales_pred={scales_pred}"
+                            )
+
+    def test_performance(self):
+        """Test the performance of FusedAddNormQuant using benchmark."""
+        for batch in self.batchs:
+            for seqLen in self.seqLens:
+                for embed_dim in self.embed_dims:
+                        with self.subTest(shape=[batch, seqLen, embed_dim]):
+                            X1 = torch.rand(size=[batch, seqLen, embed_dim], device=self.device, dtype=self.dtype) - 0.5
+                            X2 = torch.rand(size=[batch, seqLen, embed_dim], device=self.device, dtype=self.dtype) - 0.5
+                            R1 = torch.rand(size=[batch, seqLen, embed_dim], device=self.device, dtype=self.dtype) - 0.5
+                            R2 = R1.clone()
+                            W = torch.rand(size=[embed_dim], device=self.device, dtype=self.dtype) - 0.5
+
+                            shape = [[batch, seqLen, embed_dim]]
+                            tflops = 0.0
+                            benchmark(torch_add_norm_quant_bf16_fp8, shape, tflops, 100, X1.reshape(-1, X1.shape[2]), R1.reshape(-1, R1.shape[2]), W, self.eps)
+                            benchmark(add_norm_quant_bf16_fp8, shape, tflops, 100, X2.reshape(-1, X1.shape[2]), R2.reshape(-1, R2.shape[2]), W, self.eps)
+
+if __name__ == "__main__":
+    unittest.main()
\ No newline at end of file
diff --git a/lightllm-kernel/test/fusion/gelu_per_token_quant_test.py b/lightllm-kernel/test/fusion/gelu_per_token_quant_test.py
new file mode 100644
index 000000000..90fc00025
--- /dev/null
+++ b/lightllm-kernel/test/fusion/gelu_per_token_quant_test.py
@@ -0,0 +1,50 @@
+import unittest
+import torch
+from lightllm.models.vit.triton_kernel.gelu_vit import gelu_fwd
+from lightllm_kernel.ops import per_token_quant_bf16_fp8, gelu_per_token_quant_bf16_fp8
+from test.utils import benchmark, error
+
+def gelu_quant(x):
+    y = gelu_fwd(x)
+    return per_token_quant_bf16_fp8(y)
+
+class TestGeluQuantBF16(unittest.TestCase):
+    def setUp(self):
+        """Set up common test parameters."""
+        self.tokens = [13325]
+        self.hiddenDims = [3200, 4800, 12800, 511, 1032, 1023, 1025]
+        self.device = "cuda"
+        self.dtype = torch.bfloat16
+
+    def test_accuracy(self):
+        """Test the accuracy of gelu_per_token_quant"""
+        for token in self.tokens:
+            for hiddenDim in self.hiddenDims:
+                with self.subTest(shape=[token, hiddenDim]):
+                    input = torch.normal(mean=0.0, std=10, size=[token, hiddenDim], device=self.device, dtype=self.dtype)
+
+                    y_real, scales_real = gelu_quant(input)
+                    y_pred, scales_pred = gelu_per_token_quant_bf16_fp8(input)
+                    
+                    self.assertTrue(
+                        error(scales_real, scales_pred) < 0.01,
+                        f"Accuracy test failed for size {token}, {hiddenDim}. scales_real={scales_real}, scales_pred={scales_pred}"
+                    )
+                    self.assertTrue(
+                        error(y_real, y_pred) < 0.01,
+                        f"Accuracy test failed for size {token}, {hiddenDim}. y_real={y_real}, y_pred={y_pred}"
+                    )
+                    
+    def test_performance(self):
+        """Test the performance of gelu_per_token_quant using benchmark."""
+        for token in self.tokens:
+            for size in self.hiddenDims:
+                with self.subTest(shape=[token, size]):
+                    input = torch.rand(size=[token, size], device=self.device, dtype=self.dtype) - 0.5
+                    shape = [[token, size]]
+                    tflops = 0.0
+                    benchmark(gelu_per_token_quant_bf16_fp8, shape, tflops, 100, input)
+                    benchmark(gelu_quant, shape, tflops, 100, input)
+
+if __name__ == "__main__":
+    unittest.main()
\ No newline at end of file
diff --git a/lightllm-kernel/test/fusion/post_tp_norm_test.py b/lightllm-kernel/test/fusion/post_tp_norm_test.py
new file mode 100755
index 000000000..4830112c3
--- /dev/null
+++ b/lightllm-kernel/test/fusion/post_tp_norm_test.py
@@ -0,0 +1,54 @@
+import unittest
+import torch
+from lightllm_kernel.ops import post_tp_norm_bf16
+from test.utils import benchmark, error
+
+
+def post_tp_norm(input, weight, tp_variance, embed_dim, eps):
+    input = input.to(torch.float32)
+    variance = tp_variance / embed_dim
+    variance = variance.unsqueeze(-1)
+    input = input * torch.rsqrt(variance + eps)
+    out = weight * input.to(torch.bfloat16)
+    return out
+
+class TestPostTpNormBF16(unittest.TestCase):
+    def setUp(self):
+        """Set up common test parameters."""
+        self.batchs = [1024, 13325]
+        self.sizes = [1024, 1025, 1032, 3200, 3201, 3208, 12800]
+        self.device = "cuda"
+        self.dtype = torch.bfloat16
+        self.embed_dim = 3200
+        self.eps = 1e-6
+
+    def test_accuracy(self):
+        for batch in self.batchs:
+            for size in self.sizes:
+                with self.subTest(shape=[batch, size]):
+                    X = torch.rand(size=[batch, size], device=self.device, dtype=self.dtype) - 0.5
+                    W = torch.rand(size=[size], device=self.device, dtype=self.dtype) - 0.5
+                    V = torch.rand(size=[batch], device=self.device, dtype=torch.float32)
+
+                    y_real = post_tp_norm(X, W, V, self.embed_dim, self.eps)
+                    y_pred = post_tp_norm_bf16(X, W, V, self.embed_dim, self.eps)
+                    self.assertTrue(
+                        error(y_pred, y_real) < 0.01,
+                        f"Accuracy test failed for size {batch}, {size}. y_real={y_real}, y_pred={y_pred}"
+                    )
+
+    def test_performance(self):
+        for batch in self.batchs:
+            for size in self.sizes:
+                with self.subTest(shape=[batch, size]):
+                    X = torch.rand(size=[batch, size], device=self.device, dtype=self.dtype) - 0.5
+                    W = torch.rand(size=[size], device=self.device, dtype=self.dtype) - 0.5
+                    V = torch.rand(size=[batch], device=self.device, dtype=torch.float32)
+
+                    shape = [[batch, size], [size], [batch, size]]
+                    tflops = 0.0
+                    benchmark(post_tp_norm_bf16, shape, tflops, 100, X, W, V, self.embed_dim, self.eps)
+                    benchmark(post_tp_norm, shape, tflops, 100, X, W, V, self.embed_dim, self.eps)
+
+if __name__ == "__main__":
+    unittest.main()
\ No newline at end of file
diff --git a/lightllm-kernel/test/fusion/pre_tp_norm_test.py b/lightllm-kernel/test/fusion/pre_tp_norm_test.py
new file mode 100755
index 000000000..baf0e52ea
--- /dev/null
+++ b/lightllm-kernel/test/fusion/pre_tp_norm_test.py
@@ -0,0 +1,46 @@
+import unittest
+import torch
+from lightllm_kernel.ops import pre_tp_norm_bf16
+from test.utils import benchmark, error
+
+
+def pre_tp_norm(input):
+    """Reference result: sum of squares over the last dim, computed in float32."""
+    squared = input.to(torch.float32).pow(2)
+    return squared.sum(-1, keepdim=False)
+
+class TestPreTpNormBF16(unittest.TestCase):
+    def setUp(self):
+        """Set up common test parameters."""
+        self.batchs = [1024, 13325]
+        self.sizes = [1024, 1025, 1032, 3200, 3201, 3208, 12800]
+        self.device = "cuda"
+        self.dtype = torch.bfloat16
+
+    def test_accuracy(self):
+        for batch in self.batchs:
+            for size in self.sizes:
+                with self.subTest(shape=[batch, size]):
+                    X = torch.rand(size=[batch, size], device=self.device, dtype=self.dtype) - 0.5
+
+                    y_real = pre_tp_norm(X)
+                    y_pred = pre_tp_norm_bf16(X)
+                    self.assertTrue(
+                        error(y_pred, y_real) < 0.01,
+                        f"Accuracy test failed for size {batch}, {size}. y_real={y_real}, y_pred={y_pred}"
+                    )
+
+    def test_performance(self):
+        """Benchmark pre_tp_norm_bf16 against the float32 torch reference on every shape."""
+        for batch in self.batchs:
+            for size in self.sizes:
+                with self.subTest(shape=[batch, size]):
+                    X = torch.rand(size=[batch, size], device=self.device, dtype=self.dtype) - 0.5
+                    # Label only: the op reads X [batch, size] and reduces the last dim to [batch].
+                    shape = [[batch, size], [batch]]
+                    tflops = 0.0  # no FLOP estimate supplied, so the printed TFLOPS figure is 0
+                    benchmark(pre_tp_norm_bf16, shape, tflops, 100, X)
+                    benchmark(pre_tp_norm, shape, tflops, 100, X)
+
+if __name__ == "__main__":
+    unittest.main()
\ No newline at end of file
diff --git a/lightllm-kernel/test/gemm/cutlass_scaled_mm_test.py b/lightllm-kernel/test/gemm/cutlass_scaled_mm_test.py
new file mode 100644
index 000000000..a9d0d014d
--- /dev/null
+++ b/lightllm-kernel/test/gemm/cutlass_scaled_mm_test.py
@@ -0,0 +1,80 @@
+import unittest
+import torch
+from lightllm_kernel.ops import cutlass_scaled_mm_bias_ls
+from lightllm.common.vllm_kernel import _custom_ops as ops
+from test.utils import benchmark, error
+
+
+def torch_cutlass_scale_gemm_with_ls(x_q, w_q_t, x_scale, w_scale, out_dtype=torch.bfloat16, bias=None, ls=None):
+    """Reference path: ops.cutlass_scaled_mm output multiplied by ls (ls=None leaves it unscaled)."""
+    y_pred_tmp = ops.cutlass_scaled_mm(x_q, w_q_t, x_scale, w_scale, out_dtype=out_dtype, bias=bias)
+    return y_pred_tmp if ls is None else y_pred_tmp * ls
+
+class TestQuantBF16(unittest.TestCase):
+    def setUp(self):
+        """Set up common test parameters."""
+        self.tokens = [128, 1024, 13325]
+        self.hiddenDims = [256, 512, 1024, 3200]
+        self.device = "cuda"
+        self.dtype = torch.bfloat16
+
+
+    def test_accuracy(self):
+        """Test the accuracy of cutlass_scaled_mm_bias_ls"""
+        for token in self.tokens:
+            for hiddenDim in self.hiddenDims:
+                with self.subTest(shape=[token, hiddenDim]):
+                    M, N, K = token, 3 * hiddenDim, hiddenDim
+
+                    input = torch.randn(size=[M, K], device=self.device, dtype=self.dtype)
+                    x_q, x_scale = ops.scaled_fp8_quant(input, scale=None, scale_ub=None, use_per_token_if_dynamic=True)
+
+                    
+                    # 生成权重张量w_q（N×K），转置后为K×N（列优先）
+                    weight = torch.randn(size=[N, K], device=self.device, dtype=self.dtype)
+                    w_q, w_scale = ops.scaled_fp8_quant(weight, scale=None, scale_ub=None, use_per_token_if_dynamic=True)
+
+                    # 转置，w_q_t为列优先
+                    w_q_t = w_q.t()
+                    assert w_q_t.stride(0) == 1, "权重转置后步幅需列优先"
+
+                    y_pred = torch.empty((M, N), dtype=input.dtype, device=input.device)
+                    bias = torch.randn(size=[N], device=self.device, dtype=torch.bfloat16)
+                    ls = torch.randn(size=[N], device=self.device, dtype=torch.bfloat16)
+
+                    cutlass_scaled_mm_bias_ls(y_pred, x_q, w_q_t, x_scale, w_scale, bias=bias, ls=ls)
+                    y_real = torch_cutlass_scale_gemm_with_ls(x_q, w_q_t, x_scale, w_scale, out_dtype=torch.bfloat16, bias=bias, ls=ls)
+
+                    self.assertTrue(
+                        error(y_pred, y_real) < 0.01,
+                        f"Accuracy test failed for size {token}, {hiddenDim}. y_pred={y_pred}, y_real={y_real}"
+                    )
+
+    def test_performance(self):
+        """Test the performance of cutlass_scaled_mm_bias_ls"""
+        for token in self.tokens:
+            for hiddenDim in self.hiddenDims:
+                with self.subTest(shape=[token, hiddenDim]):
+                    M, N, K = token, 3 * hiddenDim, hiddenDim
+
+                    input = torch.randn(size=[M, K], device=self.device, dtype=self.dtype) - 0.5
+                    x_q, x_scale = ops.scaled_fp8_quant(input, scale=None, scale_ub=None, use_per_token_if_dynamic=True)
+
+                    # 生成权重张量w_q（N×K），转置后为K×N（列优先）
+                    weight = torch.randn(size=[N, K], device=self.device, dtype=self.dtype) - 0.5
+                    w_q, w_scale = ops.scaled_fp8_quant(weight, scale=None, scale_ub=None, use_per_token_if_dynamic=True)
+
+                    bias = torch.randn(size=[N], device=self.device, dtype=torch.bfloat16)
+                    ls = torch.randn(size=[N], device=self.device, dtype=torch.bfloat16)
+                    # 转置，w_q_t为列优先
+                    w_q_t = w_q.t()
+                    assert w_q_t.stride(0) == 1, "权重转置后步幅需列优先"
+
+                    y_pred = torch.empty((M, N), dtype=input.dtype, device=input.device)
+                    shape = [[token, hiddenDim]]
+                    tflops = 2 * token * (3 * hiddenDim) * hiddenDim / 1024**4
+                    benchmark(cutlass_scaled_mm_bias_ls, shape, tflops, 100, y_pred, x_q, w_q_t, x_scale, w_scale, bias=bias, ls=ls)
+                    benchmark(torch_cutlass_scale_gemm_with_ls, shape, tflops, 100, x_q, w_q_t, x_scale, w_scale, out_dtype=torch.bfloat16, bias=bias, ls=ls) # 无bias 495GB/s, 有bias 482GB/s
+
+if __name__ == "__main__":
+    unittest.main()
\ No newline at end of file
diff --git a/lightllm-kernel/test/norm/rmsnorm_test.py b/lightllm-kernel/test/norm/rmsnorm_test.py
new file mode 100755
index 000000000..aaccc1c92
--- /dev/null
+++ b/lightllm-kernel/test/norm/rmsnorm_test.py
@@ -0,0 +1,45 @@
+import unittest
+import torch
+from lightllm_kernel.ops import rmsnorm_bf16
+from test.utils import benchmark, error
+
+
+class TestRmsNormBF16(unittest.TestCase):
+    def setUp(self):
+        """Set up common test parameters."""
+        self.batchs = [1024, 13325]
+        self.sizes = [1024, 1025, 1032, 3200, 3201, 3208, 12800]
+        self.device = "cuda"
+        self.dtype = torch.bfloat16
+
+    def test_accuracy(self):
+        """Test the accuracy of rmsnorm against torch.rmsnorm."""
+        for batch in self.batchs:
+            for size in self.sizes:
+                with self.subTest(shape=[batch, size]):
+                    X = torch.rand(size=[batch, size], device=self.device, dtype=self.dtype) - 0.5
+                    W = torch.rand(size=[size], device=self.device, dtype=self.dtype) - 0.5
+
+                    y_real = torch.nn.functional.rms_norm(X, (size, ), W)
+                    y_pred = rmsnorm_bf16(X, W)
+                    self.assertTrue(
+                        error(y_pred, y_real) < 0.01,
+                        f"Accuracy test failed for size {batch}, {size}. y_real={y_real}, y_pred={y_pred}"
+                    )
+                    print(f"{error(y_pred, y_real) = }")
+
+    def test_performance(self):
+        """Test the performance of rmsnorm using benchmark."""
+        for batch in self.batchs:
+            for size in self.sizes:
+                with self.subTest(shape=[batch, size]):
+                    X = torch.rand(size=[batch, size], device=self.device, dtype=self.dtype) - 0.5
+                    W = torch.rand(size=[size], device=self.device, dtype=self.dtype) - 0.5
+
+                    shape = [[batch, size], [size], [batch, size]]
+                    tflops = 0.0
+                    benchmark(rmsnorm_bf16, shape, tflops, 100, X, W)
+                    benchmark(torch.nn.functional.rms_norm, shape, tflops, 100, X, (size, ), W)
+
+if __name__ == "__main__":
+    unittest.main()
\ No newline at end of file
diff --git a/lightllm-kernel/test/quant/quant_test.py b/lightllm-kernel/test/quant/quant_test.py
new file mode 100755
index 000000000..a71d2f249
--- /dev/null
+++ b/lightllm-kernel/test/quant/quant_test.py
@@ -0,0 +1,47 @@
+import unittest
+import torch
+from lightllm.common.vllm_kernel import _custom_ops as ops
+from lightllm_kernel.ops import per_token_quant_bf16_fp8
+from test.utils import benchmark, error
+
+
+class TestQuantBF16(unittest.TestCase):
+    def setUp(self):
+        """Set up common test parameters."""
+        self.tokens = [1024, 13325]
+        self.hiddenDims = [256, 511, 1023, 1024, 1025, 1032, 3200, 3201, 3208, 12800]
+        self.device = "cuda"
+        self.dtype = torch.bfloat16
+
+    def test_accuracy(self):
+        """Test the accuracy of per_token_quant"""
+        for token in self.tokens:
+            for hiddenDim in self.hiddenDims:
+                with self.subTest(shape=[token, hiddenDim]):
+                    input = torch.rand(size=[token, hiddenDim], device=self.device, dtype=self.dtype) - 0.5
+                    y_real, scales_real = ops.scaled_fp8_quant(
+                        input.contiguous().cuda(self.device), scale=None, use_per_token_if_dynamic=True
+                    )
+                    y_pred, scales_pred = per_token_quant_bf16_fp8(input)
+                    self.assertTrue(
+                        error(scales_real, scales_pred) < 0.01,
+                        f"Accuracy test failed for size {token}, {hiddenDim}. scales_real={scales_real}, scales_pred={scales_pred}"
+                    )
+                    self.assertTrue(
+                        error(y_real, y_pred) < 0.01,
+                        f"Accuracy test failed for size {token}, {hiddenDim}. y_real={y_real}, y_pred={y_pred}"
+                    )
+
+    def test_performance(self):
+        """Benchmark per_token_quant_bf16_fp8 against ops.scaled_fp8_quant, called with test_accuracy's keywords."""
+        for token in self.tokens:
+            for size in self.hiddenDims:
+                with self.subTest(shape=[token, size]):
+                    input = torch.rand(size=[token, size], device=self.device, dtype=self.dtype) - 0.5
+                    shape = [[token, size]]
+                    tflops = token * size / 1024**4
+                    benchmark(per_token_quant_bf16_fp8, shape, tflops, 100, input)
+                    benchmark(ops.scaled_fp8_quant, shape, tflops, 100, input, scale=None, use_per_token_if_dynamic=True)
+
+if __name__ == "__main__":
+    unittest.main()
\ No newline at end of file
diff --git a/lightllm-kernel/test/utils.py b/lightllm-kernel/test/utils.py
new file mode 100644
index 000000000..c87373178
--- /dev/null
+++ b/lightllm-kernel/test/utils.py
@@ -0,0 +1,125 @@
+import torch
+from typing import Callable
+from typing import List
+
+
+def error(y_pred: torch.Tensor, y_real: torch.Tensor) -> float:
+    """
+    Compute the relative squared error (noise-to-signal power ratio) of
+    y_pred(tensor) with respect to y_real(tensor).
+
+    Both tensors are flattened and converted to float32, then:
+
+        error(pred, real) = sum((pred - real) ^ 2) / (sum(real ^ 2) + 1e-7)
+
+    The ratio is taken between the summed powers of all elements, not as a mean
+    of per-element ratios. It is 0 for an exact match and grows with the mismatch.
+    The 1e-7 term keeps the division defined when y_real is all zeros.
+
+    Args:
+        y_pred (torch.Tensor): Values produced by the implementation under test.
+        y_real (torch.Tensor): Reference values with as many elements as y_pred.
+
+    Raises:
+        ValueError: If the flattened tensors differ in shape.
+
+    Returns:
+        float: The ratio as a Python scalar (taken with `.item()`), so callers
+        can compare it directly against a tolerance.
+    """
+    y_pred = torch.flatten(y_pred).float()
+    y_real = torch.flatten(y_real).float()
+
+    if y_pred.shape != y_real.shape:
+        raise ValueError(f'Can not compute snr loss for tensors with different shape. '
+            f'({y_pred.shape} and {y_real.shape})')
+
+    noise_power  = torch.pow(y_pred - y_real, 2).sum(dim=-1)
+    signal_power = torch.pow(y_real, 2).sum(dim=-1)
+    snr = (noise_power) / (signal_power + 1e-7)
+    return snr.item()
+
+
+def benchmark(func: Callable, shape: List[List[int]], tflops: float, steps: int, *args, **kwargs) -> None:
+    """
+    Time repeated calls of a CUDA operation and print its throughput figures.
+
+    This function will:
+    1. Sum the byte size (`numel() * element_size()`) of every `torch.Tensor`
+       passed directly in `args` or as a value in `kwargs`.
+    2. Call `func` once under `torch.no_grad()` and add the byte size of its output
+       (a tensor, or the tensors directly inside a returned list/tuple).
+    3. Run 10 untimed warm-up calls, then time `steps` calls with CUDA events.
+    4. Print total and per-call time, `tflops` per second and bytes / 1024**3 per second.
+
+    Args:
+        func (Callable): The function to benchmark; its `__name__` is printed.
+        shape (list): Problem shape, used only as a label in the printed header.
+        tflops (float): Work done by one call of `func`; divided by the average call time.
+        steps (int): Number of timed calls; must be positive.
+        *args: Positional arguments to be passed to the `func`.
+        **kwargs: Keyword arguments to be passed to the `func`.
+
+    Raises:
+        ValueError: If `steps` is not positive.
+        RuntimeError: If CUDA is not available.
+    """
+    if steps <= 0:  # the averages below divide by `steps`
+        raise ValueError(f"steps must be positive, got {steps}")
+    # Ensure CUDA is available
+    if not torch.cuda.is_available():
+        raise RuntimeError("CUDA is required for benchmarking.")
+
+    # Collect tensors passed directly as inputs (nested containers are not inspected)
+    input_tensors = [arg for arg in args if isinstance(arg, torch.Tensor)]
+    input_tensors += [value for value in kwargs.values() if isinstance(value, torch.Tensor)]
+
+    def calculate_memory(tensor: torch.Tensor):
+        """Calculate memory usage in bytes for a tensor."""
+        return tensor.numel() * tensor.element_size()
+
+    input_memory = sum(calculate_memory(t) for t in input_tensors)
+
+    # Execute the function to inspect outputs
+    with torch.no_grad():
+        output = func(*args, **kwargs)
+
+    output_memory = 0
+    if isinstance(output, torch.Tensor):
+        output_memory = calculate_memory(output)
+    elif isinstance(output, (list, tuple)):
+        output_memory = sum(calculate_memory(o) for o in output if isinstance(o, torch.Tensor))
+
+    total_memory = input_memory + output_memory
+
+    # Warm-up calls, excluded from the timing
+    for _ in range(10):  # Warm-up
+        func(*args, **kwargs)
+
+    torch.cuda.synchronize()  # Ensure no pending operations
+
+    # Benchmark the function
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+
+    start_event.record()
+    for _ in range(steps):
+        func(*args, **kwargs)
+    end_event.record()
+
+    torch.cuda.synchronize()  # Ensure all operations are finished
+    elapsed_time_ms = start_event.elapsed_time(end_event)  # Time in milliseconds
+
+    # Calculate performance metrics
+    elapsed_time_s = elapsed_time_ms / 1000  # Convert to seconds
+    avg_time_per_step = elapsed_time_s / steps
+    compute_performance = tflops / avg_time_per_step  # TFLOPS
+    memory_throughput = (total_memory * steps / (1024**3)) / elapsed_time_s  # GB/s
+
+    # Print performance metrics
+    print(f"Function: {func.__name__}{shape}")
+    print(f"Elapsed Time (total): {elapsed_time_s:.4f} seconds")
+    print(f"Average Time Per Step: {avg_time_per_step * 1000 :.3f} ms")
+    print(f"Compute Performance: {compute_performance:.2f} TFLOPS")
+    print(f"Memory Throughput: {memory_throughput:.2f} GB/s")
+    print("") # print a blank line.
\ No newline at end of file

From 08851459f74a4da2bfe968351e24d4099ee66010 Mon Sep 17 00:00:00 2001
From: Xtra <571889291@qq.com>
Date: Tue, 13 May 2025 17:02:20 +0800
Subject: [PATCH 03/14] perf(vit): add optimized rmsnorm and quant operator
 support (#888)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

# 添加vit调用优化算子逻辑

1. rmsnorm，添加单卡或者多卡优化算子逻辑
2. per_token_quant，添加激活值量化优化算子逻辑
---
 lightllm/common/quantization/vllm_quant.py    | 11 ++++-
 .../layer_infer/transformer_layer_infer.py    | 42 ++++++++++++++++++-
 .../vit/triton_kernel/flashattention_nopad.py |  2 +
 3 files changed, 52 insertions(+), 3 deletions(-)

diff --git a/lightllm/common/quantization/vllm_quant.py b/lightllm/common/quantization/vllm_quant.py
index 30048b44d..565d86da0 100644
--- a/lightllm/common/quantization/vllm_quant.py
+++ b/lightllm/common/quantization/vllm_quant.py
@@ -74,6 +74,8 @@ def __init__(self):
         self.is_moe = False
         # PINGPONG_FP8_GEMM is per tensor quant way.
         self.use_pingpong_fp8_gemm = os.getenv("ENABLE_PINGPONG_FP8_GEMM", "0").upper() in ["ON", "TRUE", "1"]
+        # per token quant with better performance.
+        self.use_lightllm_kernels = os.getenv("ENABLE_LIGHTLLM_KERNELS", "0").upper() in ["ON", "TRUE", "1"]
 
         if self.use_pingpong_fp8_gemm:
             self.quantize = self.quantize_pingpong_fp8
@@ -121,7 +123,14 @@ def apply(self, input_tensor, weights, bias=None, out=None, workspace=None, use_
     def apply_scaled_mm_fp8(
         self, input_tensor, weights, bias=None, out=None, workspace=None, use_custom_tensor_mananger=True
     ):
-        x_q, x_scale = ops.scaled_fp8_quant(input_tensor, scale=None, scale_ub=None, use_per_token_if_dynamic=True)
+        if self.use_lightllm_kernels:
+
+            from lightllm_kernel.ops import per_token_quant_bf16_fp8
+
+            x_q, x_scale = per_token_quant_bf16_fp8(input_tensor)
+        else:
+            x_q, x_scale = ops.scaled_fp8_quant(input_tensor, scale=None, scale_ub=None, use_per_token_if_dynamic=True)
+        
         m = input_tensor.shape[0]
         n = weights[0].shape[1]
         if out is None:
diff --git a/lightllm/models/vit/layer_infer/transformer_layer_infer.py b/lightllm/models/vit/layer_infer/transformer_layer_infer.py
index 14ba9cfed..5dc5d3d42 100644
--- a/lightllm/models/vit/layer_infer/transformer_layer_infer.py
+++ b/lightllm/models/vit/layer_infer/transformer_layer_infer.py
@@ -1,3 +1,4 @@
+import os
 import torch
 import torch.functional as F
 import torch.distributed as dist
@@ -17,6 +18,25 @@
 
 class ViTTransformerLayerInfer:
     """ """
+    # 类变量缓存导入的算子
+    _lightllm_kernels = None
+    
+    @classmethod
+    def _init_kernels(cls):
+        if cls._lightllm_kernels is None:
+            cls._lightllm_kernels = {}
+            try:
+                from lightllm_kernel.ops import (
+                    rmsnorm_bf16,
+                    pre_tp_norm_bf16, post_tp_norm_bf16
+                )
+                cls._lightllm_kernels.update({
+                    'rmsnorm_bf16': rmsnorm_bf16,
+                    'pre_tp_norm_bf16': pre_tp_norm_bf16,
+                    'post_tp_norm_bf16': post_tp_norm_bf16,
+                })
+            except ImportError as e:
+                print(f"Warning: Failed to load lightllm_kernel.ops: {e}")
 
     def __init__(self, layer_num, network_config, mode=[]):
         self.tp_rank_ = get_current_rank_in_dp()
@@ -32,6 +52,9 @@ def __init__(self, layer_num, network_config, mode=[]):
         self.network_config_ = network_config
         self.mode = mode
         self.layer_num_ = layer_num
+        self.use_lightllm_kernels = os.getenv("ENABLE_LIGHTLLM_KERNELS", "0").upper() in ["ON", "TRUE", "1"]
+        if self.use_lightllm_kernels:
+             self.__class__._init_kernels()  # 确保算子已初始化
         return
 
     def norm(self, input, weight):
@@ -45,6 +68,17 @@ def norm(self, input, weight):
         out = out.reshape(input_shape)
         return out
 
+    def tp_norm_optim(self, input, weight):
+        if self.tp_world_size_ == 1:
+            out =  self._lightllm_kernels['rmsnorm_bf16'](input, weight, self.eps_)
+        else:
+            tp_variance = self._lightllm_kernels['pre_tp_norm_bf16'](input)
+            dist.all_reduce(tp_variance, op=dist.ReduceOp.SUM, async_op=False)
+            out =  self._lightllm_kernels['post_tp_norm_bf16'](
+                input, weight, tp_variance, self.embed_dim_, self.eps_
+            )
+        return out
+
     def tp_norm(self, input, weight):
         input_shape = input.shape
         input = input.view(-1, self.tp_padding_head_num * self.head_dim_)
@@ -89,8 +123,12 @@ def _ffn_norm(self, input, layer_weight: ViTTransformerLayerWeight) -> torch.Ten
             )
 
     def _qk_norm(self, q, k, layer_weight: ViTTransformerLayerWeight) -> torch.Tensor:
-        q_norm = self.tp_norm(q, layer_weight.q_norm_weight_.weight)
-        k_norm = self.tp_norm(k, layer_weight.k_norm_weight_.weight)
+        if self.use_lightllm_kernels and self._lightllm_kernels:  # empty dict: import failed, use torch path
+            q_norm = self.tp_norm_optim(q, layer_weight.q_norm_weight_.weight)
+            k_norm = self.tp_norm_optim(k, layer_weight.k_norm_weight_.weight)
+        else:
+            q_norm = self.tp_norm(q, layer_weight.q_norm_weight_.weight)
+            k_norm = self.tp_norm(k, layer_weight.k_norm_weight_.weight)
         return q_norm, k_norm
 
     def _get_qkv(self, input, layer_weight: ViTTransformerLayerWeight) -> torch.Tensor:
diff --git a/lightllm/models/vit/triton_kernel/flashattention_nopad.py b/lightllm/models/vit/triton_kernel/flashattention_nopad.py
index 705ed98ec..dcaee55f9 100644
--- a/lightllm/models/vit/triton_kernel/flashattention_nopad.py
+++ b/lightllm/models/vit/triton_kernel/flashattention_nopad.py
@@ -175,6 +175,7 @@ def flash_attention_v3_fwd(
             v,
             None,
             None,  # k_new, v_new
+            None,
             o,  # out
             None,
             None,
@@ -191,6 +192,7 @@ def flash_attention_v3_fwd(
             None,
             None,
             None,
+            None,
             softmax_scale,
             False,  # causal
             window_size=(-1, -1),

From d8989d8e7865a0496b463392df4fcf3ec29ec3e3 Mon Sep 17 00:00:00 2001
From: Alice <sangchengmeng@mail.ustc.edu.cn>
Date: Fri, 23 May 2025 13:46:14 +0800
Subject: [PATCH 04/14] add-all_gather

---
 lightllm-kernel/CMakeLists.txt                |   5 +-
 lightllm-kernel/csrc/moe/all_gather.cu        | 161 ++++++
 lightllm-kernel/csrc/moe/all_gather.cuh       | 287 ++++++++++
 lightllm-kernel/csrc/moe/all_reduce.cuh       | 516 ++++++++++++++++++
 lightllm-kernel/csrc/ops_bindings.cpp         |  16 +-
 lightllm-kernel/include/ops_common.h          |   9 +
 lightllm-kernel/lightllm_kernel/__init__.py   |   1 +
 .../lightllm_kernel/ops/__init__.py           |   7 +-
 lightllm-kernel/lightllm_kernel/ops/moe.py    |  39 ++
 lightllm-kernel/setup.py                      |   7 +-
 lightllm/common/quantization/w8a8_quant.py    |  20 +-
 11 files changed, 1033 insertions(+), 35 deletions(-)
 create mode 100644 lightllm-kernel/csrc/moe/all_gather.cu
 create mode 100644 lightllm-kernel/csrc/moe/all_gather.cuh
 create mode 100644 lightllm-kernel/csrc/moe/all_reduce.cuh

diff --git a/lightllm-kernel/CMakeLists.txt b/lightllm-kernel/CMakeLists.txt
index c61ed9dd8..5de3c6420 100644
--- a/lightllm-kernel/CMakeLists.txt
+++ b/lightllm-kernel/CMakeLists.txt
@@ -9,6 +9,7 @@ endif()
 # 找 PyTorch & Python
 find_package(Torch REQUIRED)
 find_package(Python REQUIRED COMPONENTS Development)
+find_package(CUDAToolkit REQUIRED)
 
 # 收集 csrc 下的 .cpp/.cu
 file(GLOB_RECURSE SRC_CPP   CONFIGURE_DEPENDS "${PROJECT_SOURCE_DIR}/csrc/*.cpp")
@@ -23,7 +24,9 @@ target_include_directories(_C PRIVATE ${TORCH_INCLUDE_DIRS})
 target_link_libraries(_C
     PRIVATE
       ${TORCH_LIBRARIES}
-      Python::Python)
+      Python::Python
+      CUDA::cudart
+      CUDA::cuda_driver)
 
       
 # 输出文件名 _C.so，无前缀
diff --git a/lightllm-kernel/csrc/moe/all_gather.cu b/lightllm-kernel/csrc/moe/all_gather.cu
new file mode 100644
index 000000000..fb6fd5ac9
--- /dev/null
+++ b/lightllm-kernel/csrc/moe/all_gather.cu
@@ -0,0 +1,161 @@
+#include <ATen/cuda/Exceptions.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <c10/cuda/CUDAStream.h>
+#include <torch/all.h>
+
+#include "ops_common.h"
+#include "all_gather.cuh"
+
+namespace lightllm {
+namespace ops {
+// Fake pointer type, must match fptr_t type in ops.h.
+// We use this type alias to indicate when pointers are passed in as int64_t.
+using fptr_t = int64_t;
+static_assert(sizeof(void*) == sizeof(fptr_t));
+
+fptr_t init_custom_gather_ar(const std::vector<fptr_t>& fake_ipc_ptrs,
+                      torch::Tensor& rank_data, int64_t rank,
+                      bool full_nvlink) {
+  int world_size = fake_ipc_ptrs.size();
+  if (world_size > 8)
+    throw std::invalid_argument("world size > 8 is not supported");
+  if (world_size % 2 != 0)
+    throw std::invalid_argument("Odd num gpus is not supported for now");
+  if (rank < 0 || rank >= world_size)
+    throw std::invalid_argument("invalid rank passed in");
+
+  vllm::Signal* ipc_ptrs[8];
+  for (int i = 0; i < world_size; i++) {
+    ipc_ptrs[i] = reinterpret_cast<vllm::Signal*>(fake_ipc_ptrs[i]);
+  }
+  return (fptr_t) new vllm::CustomAllgather(ipc_ptrs, rank_data.data_ptr(),
+                                            rank_data.numel(), rank, world_size,
+                                            full_nvlink);
+}
+
+/**
+ * Make sure tensor t's data lies completely within ((char)t.data_ptr()) +
+ * t.numel() * t.element_size(). This is slightly weaker than t.is_contiguous()
+ * because it allows transpose of contiguous slice (i.e. slicing the first
+ * dimension). Currently, we require this because stride information is not
+ * passed into the kernels and we treat input tensors as flat.
+ *
+ * Examples
+ * A = torch.zeros(3, 3, 3)
+ * 1. A: OK
+ * 2. A[1:]: OK
+ * 3. A.permute(2, 0, 1): OK
+ * 4. A[1:].permute(2, 0, 1): OK
+ * 5. A[None].expand(2, -1, -1, -1): Not OK
+ * 6. A[:, 1:, 1:]: Not OK
+ */
+bool _is_weak_contiguous_gather(torch::Tensor& t) {
+  return t.is_contiguous() ||
+         (t.storage().nbytes() - t.storage_offset() * t.element_size() ==
+          t.numel() * t.element_size());
+}
+
+/**
+ * Performs an out-of-place allgather and stores result in out, which must
+ * hold at least world_size * inp.numel() elements of inp's dtype.
+ *
+ * If _reg_buffer is null, inp.data_ptr() is assumed to be IPC-registered already;
+ * otherwise inp is first copied into the IPC-registered _reg_buffer.
+ */
+void all_gather(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out,
+                fptr_t _reg_buffer, int64_t reg_buffer_sz_bytes) {
+  auto fa = reinterpret_cast<vllm::CustomAllgather*>(_fa);
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(inp));
+  auto stream = c10::cuda::getCurrentCUDAStream().stream();
+
+  TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type());
+  TORCH_CHECK(_is_weak_contiguous_gather(out));
+  TORCH_CHECK(_is_weak_contiguous_gather(inp));
+  TORCH_CHECK_GE(out.numel(), inp.numel() * fa->world_size_);
+  auto input_size = inp.numel() * inp.element_size();
+  auto reg_buffer = reinterpret_cast<void*>(_reg_buffer);
+  if (reg_buffer) {
+    TORCH_CHECK_LE(input_size, reg_buffer_sz_bytes);
+    AT_CUDA_CHECK(cudaMemcpyAsync(reg_buffer, inp.data_ptr(), input_size,
+                                  cudaMemcpyDeviceToDevice, stream));
+  } else {
+    reg_buffer = inp.data_ptr();
+  }
+  switch (out.scalar_type()) {
+    case at::ScalarType::Float: {
+      fa->allgather<float>(stream, reinterpret_cast<float*>(reg_buffer),
+                           reinterpret_cast<float*>(out.data_ptr()),
+                           inp.numel());
+      break;
+    }
+    case at::ScalarType::Half: {
+      fa->allgather<half>(stream, reinterpret_cast<half*>(reg_buffer),
+                          reinterpret_cast<half*>(out.data_ptr()), inp.numel());
+      break;
+    }
+#if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__))
+    case at::ScalarType::BFloat16: {
+      fa->allgather<nv_bfloat16>(
+          stream, reinterpret_cast<nv_bfloat16*>(reg_buffer),
+          reinterpret_cast<nv_bfloat16*>(out.data_ptr()), inp.numel());
+      break;
+    }
+#endif
+    default:
+      throw std::runtime_error(
+          "custom allgather only supports float32, float16 and bfloat16");
+  }
+}
+
+void allgather_dispose(fptr_t _fa) {
+  delete reinterpret_cast<vllm::CustomAllgather*>(_fa);
+}
+
+int64_t meta_size() { return sizeof(vllm::Signal); }
+
+void allgather_register_buffer(fptr_t _fa, const std::vector<fptr_t>& fake_ipc_ptrs) {
+  auto fa = reinterpret_cast<vllm::CustomAllgather*>(_fa);
+  TORCH_CHECK(fake_ipc_ptrs.size() == fa->world_size_);
+  void* ipc_ptrs[8];
+  for (int i = 0; i < fake_ipc_ptrs.size(); i++) {
+    ipc_ptrs[i] = reinterpret_cast<void*>(fake_ipc_ptrs[i]);
+  }
+  fa->register_buffer(ipc_ptrs);
+}
+
+// Use vector<int64_t> to represent byte data for python binding compatibility.
+std::tuple<std::vector<int64_t>, std::vector<int64_t>>
+allgather_get_graph_buffer_ipc_meta(fptr_t _fa) {
+  auto fa = reinterpret_cast<vllm::CustomAllgather*>(_fa);
+  auto [handle, offsets] = fa->get_graph_buffer_ipc_meta();
+  std::vector<int64_t> bytes(handle.begin(), handle.end());
+  return std::make_tuple(bytes, offsets);
+}
+
+// Use vector<int64_t> to represent byte data for python binding compatibility.
+void allgather_register_graph_buffers(fptr_t _fa,
+                            const std::vector<std::vector<int64_t>>& handles,
+                            const std::vector<std::vector<int64_t>>& offsets) {
+  auto fa = reinterpret_cast<vllm::CustomAllgather*>(_fa);
+  // Rebuild each raw byte string from its one-int64_t-per-byte encoding.
+  std::vector<std::string> bytes;
+  bytes.reserve(handles.size());
+  for (const auto& handle : handles) {
+    bytes.emplace_back(handle.begin(), handle.end());
+  }
+  fa->register_graph_buffers(bytes, offsets);
+}
+
+
+// torch::Tensor all_gather(
+//     int64_t _fa,
+//     torch::Tensor inp,
+//     torch::Tensor out,
+//     int64_t _reg_buffer,
+//     int64_t reg_buffer_sz_bytes) {
+
+//   all_gather_cuda(_fa, inp, out, _reg_buffer, reg_buffer_sz_bytes);
+//   return out;
+// }
+  } // namespace ops
+} // namespace lightllm
\ No newline at end of file
diff --git a/lightllm-kernel/csrc/moe/all_gather.cuh b/lightllm-kernel/csrc/moe/all_gather.cuh
new file mode 100644
index 000000000..99cb579be
--- /dev/null
+++ b/lightllm-kernel/csrc/moe/all_gather.cuh
@@ -0,0 +1,287 @@
+#pragma once
+
+#include <cuda.h>
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+
+#include <iostream>
+#include <array>
+#include <limits>
+#include <map>
+#include <unordered_map>
+#include <vector>
+#include "all_reduce.cuh"
+
+// #define CUDACHECK(cmd)                                              \
+//   do {                                                              \
+//     cudaError_t e = cmd;                                            \
+//     if (e != cudaSuccess) {                                         \
+//       printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, \
+//              cudaGetErrorString(e));                                \
+//       exit(EXIT_FAILURE);                                           \
+//     }                                                               \
+//   } while (0)
+
+namespace vllm {
+
+// Packed access type for the allgather kernel: 16 bytes' worth of T per element.
+// goal: generate ld.128 and st.128 instructions
+template <typename T>
+struct gather_packed_t {
+  // the (P)acked type for load/store
+  using P = array_t<T, 16 / sizeof(T)>;
+};
+
+template <typename T, int ngpus>
+__global__ void __launch_bounds__(512, 1)
+    custom_all_gather_kernel(RankData* _dp, RankSignals sg, Signal* self_sg,
+                               T* __restrict__ result, int rank, int size) {  // size is in units of P
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = gridDim.x * blockDim.x;
+  using P = typename gather_packed_t<T>::P;
+  multi_gpu_barrier<ngpus, true>(sg, self_sg, rank);  // start barrier (is_start = true)
+  for (int idx = tid; idx < size; idx += stride) {
+    #pragma unroll
+      for (int step = 0; step < ngpus; step ++) {
+          int src_rank = (rank - step + ngpus) % ngpus;  // rank whose buffer is read in this step
+          P* ptr = (P*)_dp->ptrs[src_rank];
+          int dst_offset = src_rank * size;         // start of src_rank's slice in result, in units of P
+          // Copy packed element idx from src_rank's buffer into its slice of result.
+          int dst_idx = dst_offset + idx;
+          ((P*)result)[dst_idx] = ptr[idx];
+      }
+  }
+  multi_gpu_barrier<ngpus, false>(sg, self_sg, rank);  // end barrier (is_start = false)
+
+}
+
+using IPC_KEY = std::array<uint8_t, sizeof(cudaIpcMemHandle_t)>;
+static_assert(sizeof(IPC_KEY) == sizeof(cudaIpcMemHandle_t));
+static_assert(alignof(IPC_KEY) == alignof(cudaIpcMemHandle_t));
+
+class CustomAllgather {
+ public:
+  int rank_;
+  int world_size_;
+  bool full_nvlink_;
+
+  RankSignals sg_;
+  // Stores a map from a pointer to its peer pointers from all ranks.
+  std::unordered_map<void*, RankData*> buffers_;
+  Signal* self_sg_;
+
+  // Stores rank data from all ranks. This is mainly for cuda graph purposes.
+  // For cuda graph to work, all kernel arguments must be fixed during graph
+  // capture time. However, the peer pointers are not known during graph capture
+  // time. Therefore, during capture, we increment the rank data pointer and use
+  // that as the argument to the kernel. The kernel arguments are stored in
+  // graph_unreg_buffers_. The actual peer pointers will be filled in at the
+  // memory pointed to by the pointers in graph_unreg_buffers_ when
+  // the IPC handles are exchanged between ranks.
+  //
+  // The overall process looks like this:
+  // 1. Graph capture.
+  // 2. Each rank obtains the IPC handles for each addresses used during cuda
+  // graph capture using get_graph_buffer_ipc_meta.
+  // 3. (In Python) all gather the IPC handles.
+  // 4. Obtain the peer pointers by opening the IPC handles, and store them in
+  // the rank data array at corresponding positions.
+  RankData *d_rank_data_base_, *d_rank_data_end_;
+  std::vector<void*> graph_unreg_buffers_;
+  // a map from IPC handles to opened IPC pointers
+  std::map<IPC_KEY, char*> ipc_handles_;
+
+  /**
+   * Signals are an array of ipc-enabled buffers from all ranks.
+   * For each of the buffer, the layout is as follows:
+   * | -- sizeof(Signal) -- | ------ a few MB ----- |
+   * The first section is for allgather synchronization, and the second section
+   * is for storing the intermediate results required by some allgather algos.
+   *
+   * Note: this class does not own any device memory. Any required buffers
+   * are passed in from the constructor.
+   */
+  CustomAllgather(Signal** signals, void* rank_data, size_t rank_data_sz,
+                  int rank, int world_size, bool full_nvlink = true)
+      : rank_(rank),
+        world_size_(world_size),
+        full_nvlink_(full_nvlink),
+        self_sg_(signals[rank]),
+        d_rank_data_base_(reinterpret_cast<RankData*>(rank_data)),
+        d_rank_data_end_(d_rank_data_base_ + rank_data_sz / sizeof(RankData)) {
+    for (int i = 0; i < world_size_; i++) {
+      sg_.signals[i] = signals[i];
+    }
+  }
+
+  char* open_ipc_handle(const void* ipc_handle) {
+    auto [it, new_handle] =
+        ipc_handles_.insert({*((IPC_KEY*)ipc_handle), nullptr});
+    if (new_handle) {
+      char* ipc_ptr;
+      CUDACHECK(cudaIpcOpenMemHandle((void**)&ipc_ptr,
+                                     *((const cudaIpcMemHandle_t*)ipc_handle),
+                                     cudaIpcMemLazyEnablePeerAccess));
+      it->second = ipc_ptr;
+    }
+    return it->second;
+  }
+
+  std::pair<std::string, std::vector<int64_t>> get_graph_buffer_ipc_meta() {
+    auto num_buffers = graph_unreg_buffers_.size();
+    auto handle_sz = sizeof(cudaIpcMemHandle_t);
+    std::string handles(handle_sz * num_buffers, static_cast<char>(0));
+    std::vector<int64_t> offsets(num_buffers);
+    for (int i = 0; i < num_buffers; i++) {
+      auto ptr = graph_unreg_buffers_[i];
+      void* base_ptr;
+      // note: must share the base address of each allocation, or we get wrong
+      // address
+      if (cuPointerGetAttribute(&base_ptr,
+                                CU_POINTER_ATTRIBUTE_RANGE_START_ADDR,
+                                (CUdeviceptr)ptr) != CUDA_SUCCESS)
+        throw std::runtime_error("failed to get pointer attr");
+      CUDACHECK(cudaIpcGetMemHandle(
+          (cudaIpcMemHandle_t*)&handles[i * handle_sz], base_ptr));
+      offsets[i] = ((char*)ptr) - ((char*)base_ptr);
+    }
+    return std::make_pair(handles, offsets);
+  }
+
+  void check_rank_data_capacity(size_t num = 1) {
+    if (d_rank_data_base_ + num > d_rank_data_end_)
+      throw std::runtime_error(
+          "Rank data buffer is overflowed by " +
+          std::to_string(d_rank_data_base_ + num - d_rank_data_end_));
+  }
+
+  /**
+   * Register already-shared IPC pointers.
+   */
+  void register_buffer(void** ptrs) {
+    check_rank_data_capacity();
+    RankData data;
+    for (int i = 0; i < world_size_; i++) {
+      data.ptrs[i] = ptrs[i];
+    }
+    auto d_data = d_rank_data_base_++;
+    CUDACHECK(
+        cudaMemcpy(d_data, &data, sizeof(RankData), cudaMemcpyHostToDevice));
+    buffers_[ptrs[rank_]] = d_data;
+  }
+
+  // Note: when registering graph buffers, we intentionally choose to not
+  // deduplicate the addresses. That means if the allocator reuses some
+  // addresses, they will be registered again. This is to account for the remote
+  // possibility of different allocation patterns between ranks. For example,
+  // rank 1 may get the same input address for the second allgather, but rank 2
+  // got a different address. IPC handles have internal reference counting
+  // mechanism so overhead should be small.
+  void register_graph_buffers(
+      const std::vector<std::string>& handles,
+      const std::vector<std::vector<int64_t>>& offsets) {
+    auto num_buffers = graph_unreg_buffers_.size();
+    check_rank_data_capacity(num_buffers);
+    std::vector<RankData> rank_data(num_buffers);
+    for (int i = 0; i < num_buffers; i++) {
+      auto self_ptr = graph_unreg_buffers_[i];
+      auto& rd = rank_data[i];
+      for (int j = 0; j < world_size_; j++) {
+        if (j != rank_) {
+          char* handle =
+              open_ipc_handle(&handles[j][i * sizeof(cudaIpcMemHandle_t)]);
+          handle += offsets[j][i];
+          rd.ptrs[j] = handle;
+        } else {
+          rd.ptrs[j] = self_ptr;
+        }
+      }
+    }
+    CUDACHECK(cudaMemcpy(d_rank_data_base_, rank_data.data(),
+                         sizeof(RankData) * num_buffers,
+                         cudaMemcpyHostToDevice));
+    d_rank_data_base_ += num_buffers;
+    graph_unreg_buffers_.clear();
+  }
+
+  /**
+   * Performs allgather, assuming input has already been registered.
+   *
+   * Block and grid default configs are results after careful grid search. Using
+   * 36 blocks give the best or close to the best runtime on the devices I
+   * tried: A100, A10, A30, T4, V100. You'll notice that NCCL kernels also only
+   * take a small amount of SMs. Not quite sure the underlying reason, but my
+   * guess is that too many SMs will cause contention on NVLink bus.
+   */
+  template <typename T>
+  void allgather(cudaStream_t stream, T* input, T* output, int size,
+                 int threads = 512, int block_limit = 36) {
+    auto d = gather_packed_t<T>::P::size;  // number of T elements per packed P
+    if (size % d != 0)
+      throw std::runtime_error(
+          "custom allgather currently requires input length to be multiple "
+          "of " +
+          std::to_string(d));
+    if (block_limit > kMaxBlocks)
+      throw std::runtime_error("max supported block limit is " +
+                               std::to_string(kMaxBlocks) + ". Got " +
+                               std::to_string(block_limit));
+    // Pick the RankData slot the kernel reads its peer pointers from.
+    RankData* ptrs;
+    cudaStreamCaptureStatus status;
+    CUDACHECK(cudaStreamIsCapturing(stream, &status));
+    if (status == cudaStreamCaptureStatusActive) {
+      ptrs = d_rank_data_base_ + graph_unreg_buffers_.size();  // slot filled later by register_graph_buffers
+      graph_unreg_buffers_.push_back(input);
+    } else {
+      auto it = buffers_.find(input);
+      if (it == buffers_.end())
+        throw std::runtime_error(
+            "buffer address " +
+            std::to_string(reinterpret_cast<uint64_t>(input)) +
+            " is not registered!");
+      ptrs = it->second;
+    }
+    size /= d;
+    // From here on, size counts packed P elements rather than T elements.
+    int blocks = std::min(block_limit, (size + threads - 1) / threads);
+#define KL(ngpus, name)                                                       \
+  name<T, ngpus><<<blocks, threads, 0, stream>>>(ptrs, sg_, self_sg_, output, \
+                                                 rank_, size);
+    // Each supported world size launches custom_all_gather_kernel with ngpus
+    // fixed at compile time (the REDUCE_CASE name matches the allreduce macro).
+#define REDUCE_CASE(ngpus)                            \
+  case ngpus: {                                       \
+    KL(ngpus, custom_all_gather_kernel);        \
+    break;                                            \
+  }
+
+    switch (world_size_) {
+      REDUCE_CASE(2)
+      REDUCE_CASE(4)
+      REDUCE_CASE(6)
+      REDUCE_CASE(8)
+      default:
+        throw std::runtime_error(
+            "custom allgather only supports num gpus in (2,4,6,8). Actual num "
+            "gpus = " +
+            std::to_string(world_size_));
+    }
+#undef REDUCE_CASE
+#undef KL
+  }
+
+  ~CustomAllgather() {
+    for (auto [_, ptr] : ipc_handles_) {
+      CUDACHECK(cudaIpcCloseMemHandle(ptr));
+    }
+  }
+};
+/**
+ * To inspect PTX/SASS, copy paste this header file to compiler explorer and add
+ a template instantiation:
+ * template void vllm::CustomAllgather::allgather<half>(cudaStream_t, half *,
+ half *, int, int, int);
+*/
+}  // namespace vllm
diff --git a/lightllm-kernel/csrc/moe/all_reduce.cuh b/lightllm-kernel/csrc/moe/all_reduce.cuh
new file mode 100644
index 000000000..6be4d4f2b
--- /dev/null
+++ b/lightllm-kernel/csrc/moe/all_reduce.cuh
@@ -0,0 +1,516 @@
+#pragma once
+
+#include <cuda.h>
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+
+#include <iostream>
+#include <array>
+#include <limits>
+#include <map>
+#include <unordered_map>
+#include <vector>
+
+#define CUDACHECK(cmd)                                              \
+  do {                                                              \
+    cudaError_t e = cmd;                                            \
+    if (e != cudaSuccess) {                                         \
+      printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, \
+             cudaGetErrorString(e));                                \
+      exit(EXIT_FAILURE);                                           \
+    }                                                               \
+  } while (0)
+
+namespace vllm {
+
+constexpr int kMaxBlocks = 36;
+// Counter may overflow, but it's fine since unsigned int overflow is
+// well-defined behavior.
+using FlagType = uint32_t;
+struct Signal {
+  alignas(128) FlagType self_counter[kMaxBlocks][8];
+  // Two sets of peer counters are needed for two syncs. The reason is that
+  // it's possible for peer GPU block to arrive at the second sync point while
+  // the current GPU block hasn't passed the first sync point. Thus, peer GPU
+  // may write counter+1 while current GPU is busy waiting for counter. We use
+  // alternating counter array to avoid this possibility.
+  alignas(128) FlagType peer_counter[2][kMaxBlocks][8];
+};
+
+struct __align__(16) RankData { const void* __restrict__ ptrs[8]; };
+
+struct __align__(16) RankSignals { Signal* signals[8]; };
+
+// like std::array, but aligned
+template <typename T, int sz>
+struct __align__(alignof(T) * sz) array_t {
+  T data[sz];
+  using type = T;
+  static constexpr int size = sz;
+};
+
+// use packed type to maximize memory efficiency
+// goal: generate ld.128 and st.128 instructions
+template <typename T>
+struct packed_t {
+  // the (P)acked type for load/store
+  using P = array_t<T, 16 / sizeof(T)>;
+  // the (A)ccumulator type for reduction
+  using A = array_t<float, 16 / sizeof(T)>;
+};
+
+#define DINLINE __device__ __forceinline__
+
+// scalar cast functions
+DINLINE float upcast_s(half val) { return __half2float(val); }
+
+template <typename T>
+DINLINE T downcast_s(float val);
+template <>
+DINLINE half downcast_s(float val) {
+  return __float2half(val);
+}
+
+// scalar add functions
+// for some reason when compiling with Pytorch, the + operator for half and
+// bfloat is disabled so we call the intrinsics directly
+DINLINE half& assign_add(half& a, half b) {
+  a = __hadd(a, b);
+  return a;
+}
+DINLINE float& assign_add(float& a, float b) { return a += b; }
+
+#if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__))
+DINLINE float upcast_s(nv_bfloat16 val) { return __bfloat162float(val); }
+template <>
+DINLINE nv_bfloat16 downcast_s(float val) {
+  return __float2bfloat16(val);
+}
+DINLINE nv_bfloat16& assign_add(nv_bfloat16& a, nv_bfloat16 b) {
+  a = __hadd(a, b);
+  return a;
+}
+#endif
+
+template <typename T, int N>
+DINLINE array_t<T, N>& packed_assign_add(array_t<T, N>& a, array_t<T, N> b) {
+#pragma unroll
+  for (int i = 0; i < N; i++) {
+    assign_add(a.data[i], b.data[i]);
+  }
+  return a;
+}
+
+template <typename T, int N>
+DINLINE array_t<float, N> upcast(array_t<T, N> val) {
+  if constexpr (std::is_same<T, float>::value) {
+    return val;
+  } else {
+    array_t<float, N> out;
+#pragma unroll
+    for (int i = 0; i < N; i++) {
+      out.data[i] = upcast_s(val.data[i]);
+    }
+    return out;
+  }
+}
+
+template <typename O>
+DINLINE O downcast(array_t<float, O::size> val) {
+  if constexpr (std::is_same<typename O::type, float>::value) {
+    return val;
+  } else {
+    O out;
+#pragma unroll
+    for (int i = 0; i < O::size; i++) {
+      out.data[i] = downcast_s<typename O::type>(val.data[i]);
+    }
+    return out;
+  }
+}
+
+static DINLINE void st_flag_release(FlagType* flag_addr, FlagType flag) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+  asm volatile("st.release.sys.global.u32 [%1], %0;" ::"r"(flag),
+               "l"(flag_addr));
+#else
+  asm volatile("membar.sys; st.volatile.global.u32 [%1], %0;" ::"r"(flag),
+               "l"(flag_addr));
+#endif
+}
+
+static DINLINE FlagType ld_flag_acquire(FlagType* flag_addr) {
+  FlagType flag;
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+  asm volatile("ld.acquire.sys.global.u32 %0, [%1];"
+               : "=r"(flag)
+               : "l"(flag_addr));
+#else
+  asm volatile("ld.volatile.global.u32 %0, [%1]; membar.gl;"
+               : "=r"(flag)
+               : "l"(flag_addr));
+#endif
+  return flag;
+}
+
+static DINLINE void st_flag_volatile(FlagType* flag_addr, FlagType flag) {
+  asm volatile("st.volatile.global.u32 [%1], %0;" ::"r"(flag), "l"(flag_addr));
+}
+
+static DINLINE FlagType ld_flag_volatile(FlagType* flag_addr) {
+  FlagType flag;
+  asm volatile("ld.volatile.global.u32 %0, [%1];"
+               : "=r"(flag)
+               : "l"(flag_addr));
+  return flag;
+}
+
+// is_start: whether this is the very first synchronization barrier.
+// need_fence: whether a memory fence is needed. If true, a release-acquire
+// semantic is used to enforce memory access order before and after this
+// barrier.
+template <int ngpus, bool is_start, bool need_fence = false>
+DINLINE void multi_gpu_barrier(const RankSignals& sg, Signal* self_sg,
+                               int rank) {
+  if constexpr (!is_start) __syncthreads();
+  static_assert(
+      !(is_start && need_fence));  // Start barrier shouldn't need fence.
+  if (threadIdx.x < ngpus) {
+    // Increment the counter. Technically we only need one counter, but we use
+    // multiple per block to eliminate the need to share the counter via smem.
+    auto val = self_sg->self_counter[blockIdx.x][threadIdx.x] += 1;
+    // Write the expected counter value to peer and wait for correct value from
+    // peer.
+    auto peer_counter_ptr =
+        &sg.signals[threadIdx.x]->peer_counter[val % 2][blockIdx.x][rank];
+    auto self_counter_ptr =
+        &self_sg->peer_counter[val % 2][blockIdx.x][threadIdx.x];
+    if constexpr (need_fence) {
+      st_flag_release(peer_counter_ptr, val);
+      while (ld_flag_acquire(self_counter_ptr) != val);
+    } else {
+      st_flag_volatile(peer_counter_ptr, val);
+      while (ld_flag_volatile(self_counter_ptr) != val);
+    }
+  }
+  if constexpr (is_start || need_fence) __syncthreads();
+}
+
+template <typename P, int ngpus, typename A>
+DINLINE P packed_reduce(const P* ptrs[], int idx) {
+  A acc = upcast(ptrs[0][idx]);  // seed the accumulator with slot 0
+#pragma unroll
+  for (int peer = 1; peer < ngpus; ++peer) {
+    packed_assign_add(acc, upcast(ptrs[peer][idx]));  // fixed order: 1 .. ngpus-1
+  }
+  return downcast<P>(acc);
+}
+
+template <typename T, int ngpus>
+__global__ void __launch_bounds__(512, 1)
+    cross_device_reduce_1stage(RankData* _dp, RankSignals sg, Signal* self_sg,
+                               T* __restrict__ result, int rank, int size) {
+  using P = typename packed_t<T>::P;
+  using A = typename packed_t<T>::A;
+  // note: we don't reorder the address so the accumulation order is the same
+  // for all ranks, ensuring bitwise identical results
+  auto dp = *_dp;
+  multi_gpu_barrier<ngpus, true>(sg, self_sg, rank);
+  // do the actual reduction
+  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size;
+       idx += gridDim.x * blockDim.x) {
+    ((P*)result)[idx] = packed_reduce<P, ngpus, A>((const P**)&dp.ptrs[0], idx);
+  }
+  multi_gpu_barrier<ngpus, false>(sg, self_sg, rank);
+}
+
+template <typename P>
+DINLINE P* get_tmp_buf(Signal* sg) {
+  return reinterpret_cast<P*>(sg + 1);  // first byte past the Signal header
+}
+
+template <typename T, int ngpus>
+__global__ void __launch_bounds__(512, 1)
+    cross_device_reduce_2stage(RankData* _dp, RankSignals sg, Signal* self_sg,
+                               T* __restrict__ result, int rank, int size) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = gridDim.x * blockDim.x;
+  using P = typename packed_t<T>::P;
+  using A = typename packed_t<T>::A;
+  int part = size / ngpus;
+  int start = rank * part;
+  int end = rank == ngpus - 1 ? size : start + part;
+  int largest_part = part + size % ngpus;
+  const P* ptrs[ngpus];
+  P* tmps[ngpus];
+#pragma unroll
+  for (int i = 0; i < ngpus; i++) {
+    int target = (rank + i) % ngpus;
+    ptrs[i] = (const P*)_dp->ptrs[target];
+    tmps[i] = get_tmp_buf<P>(sg.signals[target]);
+  }
+  auto tmp_out = tmps[0];
+  multi_gpu_barrier<ngpus, true>(sg, self_sg, rank);
+  // stage 1: reduce scatter
+  for (int idx = start + tid; idx < end; idx += stride) {
+    tmp_out[idx - start] = packed_reduce<P, ngpus, A>(ptrs, idx);
+  }
+  multi_gpu_barrier<ngpus, false, true>(sg, self_sg, rank);
+
+  // stage 2: allgather. Note: it's important to match the tid between
+  // the two stages, because visibility across devices is only guaranteed
+  // between threads that have the same tid. If thread i computes the sum of
+  // start + i in the first stage, then thread i also gathers start + i from all
+  // ranks.
+  for (int idx = tid; idx < largest_part; idx += stride) {
+#pragma unroll
+    for (int i = 0; i < ngpus; i++) {
+      int gather_from_rank = ((rank + i) % ngpus);
+      if (gather_from_rank == ngpus - 1 || idx < part) {
+        int dst_idx = gather_from_rank * part + idx;
+        ((P*)result)[dst_idx] = tmps[i][idx];
+      }
+    }
+  }
+}
+
+using IPC_KEY = std::array<uint8_t, sizeof(cudaIpcMemHandle_t)>;
+static_assert(sizeof(IPC_KEY) == sizeof(cudaIpcMemHandle_t));
+static_assert(alignof(IPC_KEY) == alignof(cudaIpcMemHandle_t));
+
+class CustomAllreduce {
+ public:
+  int rank_;
+  int world_size_;
+  bool full_nvlink_;
+
+  RankSignals sg_;
+  // Stores a map from a pointer to its peer pointers from all ranks.
+  std::unordered_map<void*, RankData*> buffers_;
+  Signal* self_sg_;
+
+  // Stores rank data from all ranks. This is mainly for cuda graph purposes.
+  // For cuda graph to work, all kernel arguments must be fixed during graph
+  // capture time. However, the peer pointers are not known during graph capture
+  // time. Therefore, during capture, we increment the rank data pointer and use
+  // that as the argument to the kernel. The kernel arguments are stored in
+  // graph_unreg_buffers_. The actual peer pointers will be filled in at the
+  // memory pointed to by the pointers in graph_unreg_buffers_ when
+  // the IPC handles are exchanged between ranks.
+  //
+  // The overall process looks like this:
+  // 1. Graph capture.
+  // 2. Each rank obtains the IPC handles for each addresses used during cuda
+  // graph capture using get_graph_buffer_ipc_meta.
+  // 3. (In Python) all gather the IPC handles.
+  // 4. Obtain the peer pointers by opening the IPC handles, and store them in
+  // the rank data array at corresponding positions.
+  RankData *d_rank_data_base_, *d_rank_data_end_;
+  std::vector<void*> graph_unreg_buffers_;
+  // a map from IPC handles to opened IPC pointers
+  std::map<IPC_KEY, char*> ipc_handles_;
+
+  /**
+   * Signals are an array of ipc-enabled buffers from all ranks.
+   * For each of the buffer, the layout is as follows:
+   * | -- sizeof(Signal) -- | ------ a few MB ----- |
+   * The first section is for allreduce synchronization, and the second section
+   * is for storing the intermediate results required by some allreduce algos.
+   *
+   * Note: this class does not own any device memory. Any required buffers
+   * are passed in from the constructor.
+   */
+  CustomAllreduce(Signal** signals, void* rank_data, size_t rank_data_sz,
+                  int rank, int world_size, bool full_nvlink = true)
+      : rank_(rank),
+        world_size_(world_size),
+        full_nvlink_(full_nvlink),
+        self_sg_(signals[rank]),
+        d_rank_data_base_(reinterpret_cast<RankData*>(rank_data)),
+        d_rank_data_end_(d_rank_data_base_ + rank_data_sz / sizeof(RankData)) {
+    for (int i = 0; i < world_size_; i++) {
+      sg_.signals[i] = signals[i];
+    }
+  }
+
+  char* open_ipc_handle(const void* ipc_handle) {
+    auto [it, new_handle] =
+        ipc_handles_.insert({*((IPC_KEY*)ipc_handle), nullptr});
+    if (new_handle) {
+      char* ipc_ptr;
+      CUDACHECK(cudaIpcOpenMemHandle((void**)&ipc_ptr,
+                                     *((const cudaIpcMemHandle_t*)ipc_handle),
+                                     cudaIpcMemLazyEnablePeerAccess));
+      it->second = ipc_ptr;
+    }
+    return it->second;
+  }
+
+  std::pair<std::string, std::vector<int64_t>> get_graph_buffer_ipc_meta() {
+    auto num_buffers = graph_unreg_buffers_.size();
+    auto handle_sz = sizeof(cudaIpcMemHandle_t);
+    std::string handles(handle_sz * num_buffers, static_cast<char>(0));
+    std::vector<int64_t> offsets(num_buffers);
+    for (int i = 0; i < num_buffers; i++) {
+      auto ptr = graph_unreg_buffers_[i];
+      void* base_ptr;
+      // note: must share the base address of each allocation, or we get wrong
+      // address
+      if (cuPointerGetAttribute(&base_ptr,
+                                CU_POINTER_ATTRIBUTE_RANGE_START_ADDR,
+                                (CUdeviceptr)ptr) != CUDA_SUCCESS)
+        throw std::runtime_error("failed to get pointer attr");
+      CUDACHECK(cudaIpcGetMemHandle(
+          (cudaIpcMemHandle_t*)&handles[i * handle_sz], base_ptr));
+      offsets[i] = ((char*)ptr) - ((char*)base_ptr);
+    }
+    return std::make_pair(handles, offsets);
+  }
+
+  void check_rank_data_capacity(size_t num = 1) {
+    if (d_rank_data_base_ + num > d_rank_data_end_)
+      throw std::runtime_error(
+          "Rank data buffer is overflowed by " +
+          std::to_string(d_rank_data_base_ + num - d_rank_data_end_));
+  }
+
+  /**
+   * Register already-shared IPC pointers.
+   */
+  void register_buffer(void** ptrs) {
+    check_rank_data_capacity();
+    RankData data;
+    for (int i = 0; i < world_size_; i++) {
+      data.ptrs[i] = ptrs[i];
+    }
+    auto d_data = d_rank_data_base_++;
+    CUDACHECK(
+        cudaMemcpy(d_data, &data, sizeof(RankData), cudaMemcpyHostToDevice));
+    buffers_[ptrs[rank_]] = d_data;
+  }
+
+  // Note: when registering graph buffers, we intentionally choose to not
+  // deduplicate the addresses. That means if the allocator reuses some
+  // addresses, they will be registered again. This is to account for the remote
+  // possibility of different allocation patterns between ranks. For example,
+  // rank 1 may get the same input address for the second allreduce, but rank 2
+  // got a different address. IPC handles have internal reference counting
+  // mechanism so overhead should be small.
+  void register_graph_buffers(
+      const std::vector<std::string>& handles,
+      const std::vector<std::vector<int64_t>>& offsets) {
+    auto num_buffers = graph_unreg_buffers_.size();
+    check_rank_data_capacity(num_buffers);
+    std::vector<RankData> rank_data(num_buffers);
+    for (int i = 0; i < num_buffers; i++) {
+      auto self_ptr = graph_unreg_buffers_[i];
+      auto& rd = rank_data[i];
+      for (int j = 0; j < world_size_; j++) {
+        if (j != rank_) {
+          char* handle =
+              open_ipc_handle(&handles[j][i * sizeof(cudaIpcMemHandle_t)]);
+          handle += offsets[j][i];
+          rd.ptrs[j] = handle;
+        } else {
+          rd.ptrs[j] = self_ptr;
+        }
+      }
+    }
+    CUDACHECK(cudaMemcpy(d_rank_data_base_, rank_data.data(),
+                         sizeof(RankData) * num_buffers,
+                         cudaMemcpyHostToDevice));
+    d_rank_data_base_ += num_buffers;
+    graph_unreg_buffers_.clear();
+  }
+
+  /**
+   * Performs allreduce, assuming input has already been registered.
+   *
+   * Block and grid default configs are results after careful grid search. Using
+   * 36 blocks give the best or close to the best runtime on the devices I
+   * tried: A100, A10, A30, T4, V100. You'll notice that NCCL kernels also only
+   * take a small amount of SMs. Not quite sure the underlying reason, but my
+   * guess is that too many SMs will cause contention on NVLink bus.
+   */
+  template <typename T>
+  void allreduce(cudaStream_t stream, T* input, T* output, int size,
+                 int threads = 512, int block_limit = 36) {
+    auto d = packed_t<T>::P::size;  // number of T elements per packed P
+    if (size % d != 0)
+      throw std::runtime_error(
+          "custom allreduce currently requires input length to be multiple "
+          "of " + std::to_string(d));
+    if (block_limit > kMaxBlocks)
+      throw std::runtime_error("max supported block limit is " +
+                               std::to_string(kMaxBlocks) + ". Got " +
+                               std::to_string(block_limit));
+    // For > 2 gpus without full NVLink, REDUCE_CASE would launch nothing and
+    // leave output unwritten; fail loudly before any capture state is touched.
+    if (world_size_ > 2 && !full_nvlink_)
+      throw std::runtime_error("custom allreduce requires full NVLink for > 2 gpus");
+
+    RankData* ptrs;
+    cudaStreamCaptureStatus status;
+    CUDACHECK(cudaStreamIsCapturing(stream, &status));
+    if (status == cudaStreamCaptureStatusActive) {
+      ptrs = d_rank_data_base_ + graph_unreg_buffers_.size();
+      graph_unreg_buffers_.push_back(input);
+    } else {
+      auto it = buffers_.find(input);
+      if (it == buffers_.end())
+        throw std::runtime_error(
+            "buffer address " + std::to_string(reinterpret_cast<uint64_t>(input)) +
+            " is not registered!");
+      ptrs = it->second;
+    }
+
+    size /= d;
+    auto bytes = size * sizeof(typename packed_t<T>::P);
+    int blocks = std::min(block_limit, (size + threads - 1) / threads);
+#define KL(ngpus, name)                                                       \
+  name<T, ngpus><<<blocks, threads, 0, stream>>>(ptrs, sg_, self_sg_, output, \
+                                                 rank_, size);
+    // TODO(hanzhi713): Threshold is different for A100 and H100. Add per device threshold.
+#define REDUCE_CASE(ngpus)                            \
+  case ngpus: {                                       \
+    if (world_size_ == 2) {                           \
+      KL(ngpus, cross_device_reduce_1stage);          \
+    } else if (full_nvlink_) {                        \
+      if ((world_size_ <= 4 && bytes < 512 * 1024) || \
+          (world_size_ <= 8 && bytes < 256 * 1024)) { \
+        KL(ngpus, cross_device_reduce_1stage);        \
+      } else {                                        \
+        KL(ngpus, cross_device_reduce_2stage);        \
+      }                                               \
+    }                                                 \
+    break;                                            \
+  }
+
+    switch (world_size_) {
+      REDUCE_CASE(2)
+      REDUCE_CASE(4)
+      REDUCE_CASE(6)
+      REDUCE_CASE(8)
+      default:
+        throw std::runtime_error(
+            "custom allreduce only supports num gpus in (2,4,6,8). Actual num "
+            "gpus = " + std::to_string(world_size_));
+    }
+#undef REDUCE_CASE
+#undef KL
+  }
+
+  ~CustomAllreduce() {
+    for (auto [_, ptr] : ipc_handles_) {
+      CUDACHECK(cudaIpcCloseMemHandle(ptr));
+    }
+  }
+};
+/**
+ * To inspect PTX/SASS, copy paste this header file to compiler explorer and add
+ a template instantiation:
+ * template void vllm::CustomAllreduce::allreduce<half>(cudaStream_t, half *,
+ half *, int, int, int);
+*/
+}  // namespace vllm
diff --git a/lightllm-kernel/csrc/ops_bindings.cpp b/lightllm-kernel/csrc/ops_bindings.cpp
index b5ad19e74..96672d2e8 100644
--- a/lightllm-kernel/csrc/ops_bindings.cpp
+++ b/lightllm-kernel/csrc/ops_bindings.cpp
@@ -6,19 +6,7 @@ namespace lightllm {
 namespace ops {
 
 PYBIND11_MODULE(_C, m) {
-    m.def("grouped_topk", &grouped_topk,
-          "Grouped Top-K routing (CUDA)",
-          py::arg("topk_weights"),
-          py::arg("correction_bias"),
-          py::arg("topk_indices"),
-          py::arg("group_indices"),
-          py::arg("gating_output"),
-          py::arg("num_expert_group"),
-          py::arg("topk_group"),
-          py::arg("topk"),
-          py::arg("renormalize"),
-          py::arg("scoring_func"),
-          py::arg("group_scores") = torch::Tensor());
+    m.def("grouped_topk", &grouped_topk,"Grouped Top-K routing (CUDA)");
     m.def("rmsnorm_align16_bf16", &rmsnorm_align16_bf16, "RMSNORM (CUDA)");
     m.def("pre_tp_norm_bf16", &pre_tp_norm_bf16, "PRE TP NORM (CUDA)");
     m.def("post_tp_norm_bf16", &post_tp_norm_bf16, "POST TP NORM (CUDA)");
@@ -26,6 +14,8 @@ PYBIND11_MODULE(_C, m) {
     m.def("add_norm_quant_bf16_fp8", &add_norm_quant_bf16_fp8, "ADD NORM QUANT FUSED (CUDA)");
     m.def("gelu_per_token_quant_bf16_fp8", &gelu_per_token_quant_bf16_fp8, "GELU QUANT FUSED (CUDA)");
     m.def("cutlass_scaled_mm", &cutlass_scaled_mm, "CUTLASS SCALED MM (CUDA)");
+    m.def("all_gather", &all_gather, "ALL GATHER (CUDA)");
+    m.def("meta_size", &lightllm::ops::meta_size, "Size (in bytes) of vllm::Signal metadata");
 }
 
 } // namespace ops
diff --git a/lightllm-kernel/include/ops_common.h b/lightllm-kernel/include/ops_common.h
index 7ad632af5..7087f086c 100644
--- a/lightllm-kernel/include/ops_common.h
+++ b/lightllm-kernel/include/ops_common.h
@@ -12,6 +12,7 @@ namespace ops {
 
 using namespace lightllm;
 
+int64_t meta_size();
 Tensor pre_tp_norm_bf16(Tensor &input);
 
 Tensor post_tp_norm_bf16(
@@ -64,5 +65,13 @@ Tensor grouped_topk(
         Tensor group_scores
 );
 
+void all_gather(
+    int64_t _fa,
+    Tensor& inp,
+    Tensor& out,
+    int64_t _reg_buffer,
+    int64_t reg_buffer_sz_bytes
+);
+
 } // namespace ops
 } // namespace lightllm
\ No newline at end of file
diff --git a/lightllm-kernel/lightllm_kernel/__init__.py b/lightllm-kernel/lightllm_kernel/__init__.py
index 23c3bd2b0..9373b4b13 100644
--- a/lightllm-kernel/lightllm_kernel/__init__.py
+++ b/lightllm-kernel/lightllm_kernel/__init__.py
@@ -1,3 +1,4 @@
 from . import ops  # noqa: F401
 
+meta_size = ops.meta_size
 __all__ = ["ops"]
diff --git a/lightllm-kernel/lightllm_kernel/ops/__init__.py b/lightllm-kernel/lightllm_kernel/ops/__init__.py
index b8930e5dc..68bf434ca 100644
--- a/lightllm-kernel/lightllm_kernel/ops/__init__.py
+++ b/lightllm-kernel/lightllm_kernel/ops/__init__.py
@@ -7,6 +7,7 @@
 try:
     _C = importlib.import_module(f"{PKG}._C")
 except ImportError:
+    raise ImportError("Cannot import compiled extension 'lightllm_kernel.ops'")
     repo_root = Path(__file__).resolve().parents[2]
     csrc_dir = repo_root / "csrc"
     if not csrc_dir.exists():
@@ -15,7 +16,7 @@
             "directory (csrc/) found; please ensure you have run "
             "'cmake --install' or placed lightllm_kernel.ops.so on PYTHONPATH."
         )
-    
+
     PROGRAM_NAME = "lightllm_kernel._C"
     EXTENSION_BUILD_DIR = "build"
     INCLUDE_DIR = "include"
@@ -62,11 +63,13 @@
         extra_cflags=["-O3"],
     )
 
+meta_size = _C.meta_size
 # 向外暴露 Python 端接口
 from .fusion import pre_tp_norm_bf16, post_tp_norm_bf16, add_norm_quant_bf16_fp8, gelu_per_token_quant_bf16_fp8
 from .norm import rmsnorm_bf16
 from .quant import per_token_quant_bf16_fp8
 from .gemm import cutlass_scaled_mm_bias_ls
+from .moe import all_gather, grouped_topk
 
 __all__ = [
     "rmsnorm_bf16",
@@ -76,4 +79,6 @@
     "add_norm_quant_bf16_fp8",
     "gelu_per_token_quant_bf16_fp8",
     "cutlass_scaled_mm_bias_ls",
+    "grouped_topk",
+    "meta_size",
 ]
diff --git a/lightllm-kernel/lightllm_kernel/ops/moe.py b/lightllm-kernel/lightllm_kernel/ops/moe.py
index e69de29bb..1f4eeeb84 100644
--- a/lightllm-kernel/lightllm_kernel/ops/moe.py
+++ b/lightllm-kernel/lightllm_kernel/ops/moe.py
@@ -0,0 +1,39 @@
+import torch
+from typing import Optional
+from . import _C
+
+
+def all_gather(
+    _fa: int, inp: torch.Tensor, out: torch.Tensor, _reg_buffer: int, reg_buffer_sz_bytes: int
+) -> torch.Tensor:
+    """Forward all arguments unchanged to the compiled ``_C.all_gather`` op and return its result."""
+    return _C.all_gather(_fa, inp, out, _reg_buffer, reg_buffer_sz_bytes)
+
+
+def grouped_topk(
+    topk_weights: torch.Tensor,
+    correction_bias: torch.Tensor,
+    topk_indices: torch.Tensor,
+    group_indices: torch.Tensor,
+    gating_output: torch.Tensor,
+    num_expert_group: int,
+    topk_group: int,
+    topk: int,
+    renormalize: bool,
+    scoring_func: str,
+    group_scores: torch.Tensor,
+) -> torch.Tensor:
+    """Forward all arguments positionally to the compiled ``_C.grouped_topk`` op and return its result."""
+    return _C.grouped_topk(
+        topk_weights,
+        correction_bias,
+        topk_indices,
+        group_indices,
+        gating_output,
+        num_expert_group,
+        topk_group,
+        topk,
+        renormalize,
+        scoring_func,
+        group_scores,
+    )
diff --git a/lightllm-kernel/setup.py b/lightllm-kernel/setup.py
index 70ae132dd..06d51d152 100644
--- a/lightllm-kernel/setup.py
+++ b/lightllm-kernel/setup.py
@@ -33,15 +33,18 @@
     CUDAExtension(
         name=PROGRAM_NAME,
         sources=sources,
+        libraries=["cuda"],
+        library_dirs=["/lib/x86_64-linux-gnu"],
+        extra_link_args=["-lcuda"],  # <-- alternative/supplementary method
         extra_compile_args={
             "cxx": ["-O3"],
             "nvcc": [
-                "-DNDEBUG", 
+                "-DNDEBUG",
                 "-O3",
                 "--use_fast_math",
                 "-gencode=arch=compute_90,code=sm_90",
                 "-gencode=arch=compute_90,code=compute_90",
-                '-gencode=arch=compute_90a, code=sm_90a',
+                "-gencode=arch=compute_90a, code=sm_90a",
             ],
         },
         include_dirs=[
diff --git a/lightllm/common/quantization/w8a8_quant.py b/lightllm/common/quantization/w8a8_quant.py
index 68dda0bd7..6f1989325 100644
--- a/lightllm/common/quantization/w8a8_quant.py
+++ b/lightllm/common/quantization/w8a8_quant.py
@@ -6,6 +6,7 @@
 from lightllm.common.quantization.triton_quant.fp8.fp8act_quant_kernel import per_token_group_quant_fp8
 from lightllm.common.quantization.triton_quant.fp8.fp8w8a8_block_gemm_kernel import w8a8_block_fp8_matmul
 from lightllm.utils.vllm_utils import HAS_VLLM, vllm_ops, cutlass_scaled_mm
+from lightllm.utils.light_utils import HAS_LIGHTLLM_KERNEL
 
 
 class BaseQuantizationMethod(QuantizationMethod):
@@ -67,17 +68,6 @@ class FP8w8a8QuantizationMethod(BaseQuantizationMethod):
     def __init__(self):
         super().__init__()
         self.is_moe = False
-        # PINGPONG_FP8_GEMM is per tensor quant way.
-        self.use_pingpong_fp8_gemm = os.getenv("ENABLE_PINGPONG_FP8_GEMM", "0").upper() in ["ON", "TRUE", "1"]
-        # per token quant with better performance.
-        self.use_lightllm_kernels = os.getenv("ENABLE_LIGHTLLM_KERNELS", "0").upper() in ["ON", "TRUE", "1"]
-
-        if self.use_pingpong_fp8_gemm:
-            self.quantize = self.quantize_pingpong_fp8
-            self.apply = self.apply_pingpong_fp8
-        else:
-            self.quantize = self.quantize_scaled_mm_fp8
-            self.apply = self.apply_scaled_mm_fp8
 
     def quantize(self, weight: torch.Tensor):
         if self.is_moe:
@@ -102,13 +92,7 @@ def quantize_moe(self, weight):
         return qweights, weight_scale
 
     def apply(self, input_tensor, weights, bias=None, out=None, workspace=None, use_custom_tensor_mananger=True):
-        raise Exception("This function needs to be bound.")
-
-    def apply_scaled_mm_fp8(
-        self, input_tensor, weights, bias=None, out=None, workspace=None, use_custom_tensor_mananger=True
-    ):
-        if self.use_lightllm_kernels:
-
+        if HAS_LIGHTLLM_KERNEL:
             from lightllm_kernel.ops import per_token_quant_bf16_fp8
 
             x_q, x_scale = per_token_quant_bf16_fp8(input_tensor)

From ce1cd438bf13723decaeb13f17bc4ae87c8032c3 Mon Sep 17 00:00:00 2001
From: Xtra <571889291@qq.com>
Date: Mon, 26 May 2025 10:37:09 +0800
Subject: [PATCH 05/14] feat: remove cutlass/include add cutlass submodule
 (#911)

---
 .gitmodules                                   |     3 +
 .../cutlass/include/cute/algorithm/axpby.hpp  |    95 -
 .../cutlass/include/cute/algorithm/clear.hpp  |    64 -
 .../cute/algorithm/cooperative_copy.hpp       |   333 -
 .../cute/algorithm/cooperative_gemm.hpp       |   512 -
 .../cutlass/include/cute/algorithm/copy.hpp   |   382 -
 .../cutlass/include/cute/algorithm/fill.hpp   |    87 -
 .../include/cute/algorithm/functional.hpp     |   290 -
 .../cutlass/include/cute/algorithm/gemm.hpp   |   500 -
 .../cutlass/include/cute/algorithm/prefer.hpp |    46 -
 .../include/cute/algorithm/prefetch.hpp       |   145 -
 .../cute/algorithm/tensor_algorithms.hpp      |   166 -
 .../cute/algorithm/tuple_algorithms.hpp       |  1073 -
 .../include/cute/arch/cluster_sm90.hpp        |   245 -
 .../cutlass/include/cute/arch/config.hpp      |    50 -
 .../cutlass/include/cute/arch/copy.hpp        |   107 -
 .../cutlass/include/cute/arch/copy_sm50.hpp   |    98 -
 .../cutlass/include/cute/arch/copy_sm75.hpp   |   236 -
 .../cutlass/include/cute/arch/copy_sm80.hpp   |   198 -
 .../cutlass/include/cute/arch/copy_sm90.hpp   |   219 -
 .../include/cute/arch/copy_sm90_desc.hpp      |   440 -
 .../include/cute/arch/copy_sm90_tma.hpp       |  1395 -
 .../cutlass/include/cute/arch/mma.hpp         |    64 -
 .../cutlass/include/cute/arch/mma_sm61.hpp    |    87 -
 .../cutlass/include/cute/arch/mma_sm70.hpp    |   329 -
 .../cutlass/include/cute/arch/mma_sm75.hpp    |   120 -
 .../cutlass/include/cute/arch/mma_sm80.hpp    |  2243 -
 .../cutlass/include/cute/arch/mma_sm90.hpp    |  9331 ---
 .../include/cute/arch/mma_sm90_desc.hpp       |   156 -
 .../include/cute/arch/mma_sm90_gmma.hpp       | 20974 ------
 .../include/cute/arch/mma_sm90_gmma_ext.hpp   | 56445 ---------------
 .../cute/arch/mma_sm90_gmma_sparse.hpp        | 22743 ------
 .../cute/arch/mma_sm90_gmma_sparse_ext.hpp    | 60445 ----------------
 .../cutlass/include/cute/arch/util.hpp        |   320 -
 .../cutlass/include/cute/atom/copy_atom.hpp   |   764 -
 .../cutlass/include/cute/atom/copy_traits.hpp |   159 -
 .../include/cute/atom/copy_traits_sm50.hpp    |    75 -
 .../include/cute/atom/copy_traits_sm75.hpp    |   143 -
 .../include/cute/atom/copy_traits_sm80.hpp    |   194 -
 .../include/cute/atom/copy_traits_sm90.hpp    |   132 -
 .../cute/atom/copy_traits_sm90_im2col.hpp     |   940 -
 .../cute/atom/copy_traits_sm90_tma.hpp        |  1525 -
 .../atom/copy_traits_sm90_tma_swizzle.hpp     |    93 -
 .../cutlass/include/cute/atom/mma_atom.hpp    |  1117 -
 .../cutlass/include/cute/atom/mma_traits.hpp  |   189 -
 .../include/cute/atom/mma_traits_sm61.hpp     |    73 -
 .../include/cute/atom/mma_traits_sm70.hpp     |   198 -
 .../include/cute/atom/mma_traits_sm75.hpp     |    81 -
 .../include/cute/atom/mma_traits_sm80.hpp     |   489 -
 .../include/cute/atom/mma_traits_sm90.hpp     |   144 -
 .../cute/atom/mma_traits_sm90_gmma.hpp        |  8999 ---
 .../cute/atom/mma_traits_sm90_gmma_ext.hpp    | 20116 -----
 .../cute/atom/mma_traits_sm90_gmma_sparse.hpp |  7738 --
 .../atom/mma_traits_sm90_gmma_sparse_ext.hpp  | 17335 -----
 .../cutlass/include/cute/config.hpp           |   149 -
 .../include/cute/container/alignment.hpp      |    70 -
 .../cutlass/include/cute/container/array.hpp  |   492 -
 .../include/cute/container/array_aligned.hpp  |    42 -
 .../include/cute/container/array_subbyte.hpp  |   643 -
 .../include/cute/container/bit_field.hpp      |   133 -
 .../include/cute/container/cuda_types.hpp     |   183 -
 .../include/cute/container/packed_tuple.hpp   |   254 -
 .../cutlass/include/cute/container/tuple.hpp  |   744 -
 .../include/cute/container/type_list.hpp      |   124 -
 .../cutlass/include/cute/int_tuple.hpp        |   864 -
 .../cutlass/include/cute/layout.hpp           |  2058 -
 .../cutlass/include/cute/layout_composed.hpp  |   652 -
 .../include/cute/numeric/arithmetic_tuple.hpp |   556 -
 .../cutlass/include/cute/numeric/complex.hpp  |    76 -
 .../cutlass/include/cute/numeric/int.hpp      |   106 -
 .../include/cute/numeric/integer_sequence.hpp |   151 -
 .../cute/numeric/integral_constant.hpp        |   517 -
 .../include/cute/numeric/integral_ratio.hpp   |   264 -
 .../cutlass/include/cute/numeric/math.hpp     |   356 -
 .../include/cute/numeric/numeric_types.hpp    |   135 -
 .../cutlass/include/cute/numeric/real.hpp     |    74 -
 .../cutlass/include/cute/pointer.hpp          |   322 -
 .../cutlass/include/cute/pointer_base.hpp     |   246 -
 .../cutlass/include/cute/pointer_flagged.hpp  |   199 -
 .../cutlass/include/cute/pointer_sparse.hpp   |   172 -
 .../cutlass/include/cute/pointer_swizzle.hpp  |   168 -
 .../cutlass/include/cute/stride.hpp           |   598 -
 .../cutlass/include/cute/swizzle.hpp          |   498 -
 .../cutlass/include/cute/swizzle_layout.hpp   |   584 -
 .../cutlass/include/cute/tensor.hpp           |    58 -
 .../cutlass/include/cute/tensor_impl.hpp      |  1193 -
 .../cutlass/include/cute/tensor_predicate.hpp |    78 -
 .../cutlass/include/cute/tensor_zip.hpp       |   243 -
 .../cutlass/include/cute/underscore.hpp       |   194 -
 .../cutlass/include/cute/util/debug.hpp       |   164 -
 .../cutlass/include/cute/util/print.hpp       |   261 -
 .../cutlass/include/cute/util/type_traits.hpp |   292 -
 .../cutlass/include/cutlass/aligned_buffer.h  |   129 -
 .../cutlass/include/cutlass/arch/arch.h       |   109 -
 .../cutlass/include/cutlass/arch/barrier.h    |   630 -
 .../include/cutlass/arch/cache_operation.h    |    66 -
 .../cutlass/include/cutlass/arch/config.h     |    81 -
 .../cutlass/arch/grid_dependency_control.h    |    84 -
 .../cutlass/include/cutlass/arch/memory.h     |   602 -
 .../include/cutlass/arch/memory_sm75.h        |   269 -
 .../include/cutlass/arch/memory_sm80.h        |   472 -
 .../cutlass/include/cutlass/arch/mma.h        |   269 -
 .../cutlass/include/cutlass/arch/mma_sm50.h   |   432 -
 .../cutlass/include/cutlass/arch/mma_sm60.h   |   252 -
 .../cutlass/include/cutlass/arch/mma_sm61.h   |   142 -
 .../cutlass/include/cutlass/arch/mma_sm70.h   |   665 -
 .../cutlass/include/cutlass/arch/mma_sm75.h   |   793 -
 .../cutlass/include/cutlass/arch/mma_sm80.h   |  1505 -
 .../cutlass/include/cutlass/arch/mma_sm89.h   |   367 -
 .../cutlass/include/cutlass/arch/mma_sm90.h   |   245 -
 .../include/cutlass/arch/mma_sparse_sm80.h    |  1238 -
 .../include/cutlass/arch/mma_sparse_sm89.h    |   409 -
 .../include/cutlass/arch/reg_reconfig.h       |    67 -
 .../cutlass/include/cutlass/arch/simd.h       |   125 -
 .../cutlass/include/cutlass/arch/simd_sm60.h  |   104 -
 .../cutlass/include/cutlass/arch/simd_sm61.h  |   147 -
 .../cutlass/include/cutlass/arch/synclog.hpp  |  1324 -
 .../cutlass/include/cutlass/arch/wmma.h       |   223 -
 .../cutlass/include/cutlass/arch/wmma_sm70.h  |   136 -
 .../cutlass/include/cutlass/arch/wmma_sm72.h  |   210 -
 .../cutlass/include/cutlass/arch/wmma_sm75.h  |   207 -
 .../cutlass/include/cutlass/array.h           |  2614 -
 .../include/cutlass/array_planar_complex.h    |    89 -
 .../cutlass/include/cutlass/array_subbyte.h   |   559 -
 .../cutlass/include/cutlass/barrier.h         |   377 -
 .../cutlass/include/cutlass/bfloat16.h        |   679 -
 .../cutlass/include/cutlass/blas3.h           |   143 -
 .../cutlass/include/cutlass/blas3_types.h     |    78 -
 .../cutlass/include/cutlass/block_striped.h   |   267 -
 .../include/cutlass/cluster_launch.hpp        |   275 -
 .../cutlass/include/cutlass/complex.h         |   823 -
 .../cutlass/include/cutlass/constants.h       |  1239 -
 .../conv/collective/builders/sm90_common.inl  |    96 -
 .../collective/builders/sm90_gmma_builder.inl |   257 -
 .../conv/collective/collective_builder.hpp    |    93 -
 .../conv/collective/collective_conv.hpp       |    62 -
 .../cutlass/conv/collective/detail.hpp        |   254 -
 ..._implicit_gemm_gmma_ss_warpspecialized.hpp |   663 -
 .../cutlass/conv/conv2d_problem_size.h        |   654 -
 .../cutlass/conv/conv3d_problem_size.h        |   513 -
 .../cutlass/conv/convnd_problem_shape.hpp     |   561 -
 .../include/cutlass/conv/convolution.h        |   194 -
 .../cutlass/include/cutlass/conv/detail.hpp   |   137 -
 .../conv/device/conv_universal_adapter.hpp    |   421 -
 .../cutlass/conv/device/direct_convolution.h  |   270 -
 .../conv/device/implicit_gemm_convolution.h   |   361 -
 .../device/implicit_gemm_convolution_fusion.h |   269 -
 .../include/cutlass/conv/dispatch_policy.hpp  |    90 -
 .../cutlass/conv/kernel/conv_universal.hpp    |    65 -
 .../cutlass/conv/kernel/default_conv2d.h      |   322 -
 .../conv/kernel/default_conv2d_dgrad.h        |  1927 -
 .../conv/kernel/default_conv2d_fprop.h        |  2007 -
 .../conv/kernel/default_conv2d_fprop_fusion.h |   357 -
 .../kernel/default_conv2d_fprop_with_absmax.h |   127 -
 .../default_conv2d_fprop_with_broadcast.h     |   221 -
 .../default_conv2d_fprop_with_reduction.h     |   130 -
 .../conv/kernel/default_conv2d_group_fprop.h  |   622 -
 .../conv/kernel/default_conv2d_wgrad.h        |  1011 -
 .../conv/kernel/default_conv2d_wgrad_fusion.h |   325 -
 .../conv/kernel/default_conv3d_dgrad.h        |   736 -
 .../conv/kernel/default_conv3d_fprop.h        |   981 -
 .../conv/kernel/default_conv3d_fprop_fusion.h |   360 -
 .../default_conv3d_fprop_with_broadcast.h     |   222 -
 .../conv/kernel/default_conv3d_wgrad.h        |   936 -
 .../cutlass/conv/kernel/default_deconv2d.h    |   999 -
 .../kernel/default_deconv2d_with_broadcast.h  |   305 -
 .../cutlass/conv/kernel/default_deconv3d.h    |   541 -
 .../kernel/default_deconv3d_with_broadcast.h  |   309 -
 .../conv/kernel/default_depthwise_fprop.h     |   588 -
 .../cutlass/conv/kernel/direct_convolution.h  |   505 -
 .../conv/kernel/implicit_gemm_convolution.h   |   455 -
 .../kernel/implicit_gemm_convolution_fusion.h |   461 -
 .../implicit_gemm_convolution_strided_dgrad.h |   492 -
 .../implicit_gemm_convolution_with_absmax.h   |   494 -
 ...cit_gemm_convolution_with_fused_epilogue.h |   499 -
 ...sm90_implicit_gemm_tma_warpspecialized.hpp |    76 -
 .../cutlass/conv/thread/depthwise_mma.h       |   325 -
 ...rad_filter_tile_access_iterator_analytic.h |   485 -
 ...ad_filter_tile_access_iterator_optimized.h |   619 -
 ...t_gradient_tile_access_iterator_analytic.h |   606 -
 ..._gradient_tile_access_iterator_optimized.h |   821 -
 ...activation_tile_access_iterator_analytic.h |   332 -
 ...vation_tile_access_iterator_few_channels.h |   360 -
 ...tion_tile_access_iterator_fixed_channels.h |   353 -
 ...ctivation_tile_access_iterator_optimized.h |   422 -
 ...rop_filter_tile_access_iterator_analytic.h |   330 -
 ...filter_tile_access_iterator_few_channels.h |   289 -
 ...lter_tile_access_iterator_fixed_channels.h |   275 -
 ...op_filter_tile_access_iterator_optimized.h |   322 -
 .../cutlass/conv/threadblock/conv2d_params.h  |   893 -
 .../conv/threadblock/conv2d_tile_iterator.h   |   337 -
 ...activation_tile_access_iterator_analytic.h |   285 -
 ...ctivation_tile_access_iterator_optimized.h |   321 -
 ...t_gradient_tile_access_iterator_analytic.h |   260 -
 ..._gradient_tile_access_iterator_optimized.h |   310 -
 ...rad_filter_tile_access_iterator_analytic.h |   268 -
 ...ad_filter_tile_access_iterator_optimized.h |   289 -
 ...t_gradient_tile_access_iterator_analytic.h |   343 -
 ..._gradient_tile_access_iterator_optimized.h |   489 -
 ...activation_tile_access_iterator_analytic.h |   291 -
 ...ctivation_tile_access_iterator_optimized.h |   478 -
 ...rop_filter_tile_access_iterator_analytic.h |   259 -
 ...op_filter_tile_access_iterator_optimized.h |   279 -
 .../cutlass/conv/threadblock/conv3d_params.h  |   508 -
 ...activation_tile_access_iterator_analytic.h |   289 -
 ...ctivation_tile_access_iterator_optimized.h |   319 -
 ...t_gradient_tile_access_iterator_analytic.h |   267 -
 ..._gradient_tile_access_iterator_optimized.h |   310 -
 .../depthwise_direct_conv_params.h            |   230 -
 ...erator_direct_conv_fixed_stride_dilation.h |   314 -
 ...le_access_iterator_direct_conv_optimized.h |   291 -
 .../depthwise_fprop_direct_conv_multistage.h  |   551 -
 ...le_access_iterator_direct_conv_optimized.h |   261 -
 .../threadblock/depthwise_fprop_pipelined.h   |   336 -
 .../conv/threadblock/depthwise_mma_base.h     |   229 -
 ...depthwise_mma_core_with_lane_access_size.h |   952 -
 .../implicit_gemm_fprop_fusion_multistage.h   |   802 -
 .../threadblock/implicit_gemm_multistage.h    |   539 -
 .../threadblock/implicit_gemm_pipelined.h     |   320 -
 .../implicit_gemm_wgrad_fusion_multistage.h   |   729 -
 ...icated_scale_bias_vector_access_iterator.h |   470 -
 .../predicated_scale_bias_vector_iterator.h   |   371 -
 .../conv/threadblock/threadblock_swizzle.h    |   193 -
 .../cutlass/conv/warp/mma_depthwise_simt.h    |   380 -
 .../warp/mma_depthwise_simt_tile_iterator.h   |   862 -
 .../conv/warp/scale_bias_relu_transform.h     |   221 -
 .../cutlass/include/cutlass/coord.h           |   480 -
 .../cutlass/include/cutlass/core_io.h         |   286 -
 .../include/cutlass/cuda_host_adapter.hpp     |   407 -
 .../cutlass/include/cutlass/cutlass.h         |   160 -
 .../include/cutlass/detail/collective.hpp     |    63 -
 .../cutlass/detail/dependent_false.hpp        |    86 -
 .../include/cutlass/detail/helper_macros.hpp  |   205 -
 .../cutlass/include/cutlass/detail/layout.hpp |   406 -
 .../cutlass/include/cutlass/detail/mma.hpp    |    71 -
 .../cutlass/include/cutlass/device_kernel.h   |   125 -
 .../collective/builders/sm90_builder.inl      |   812 -
 .../collective/builders/sm90_common.inl       |    80 -
 .../collective/collective_builder.hpp         |   120 -
 .../collective/collective_epilogue.hpp        |    71 -
 .../epilogue/collective/default_epilogue.hpp  |   242 -
 .../collective/default_epilogue_array.hpp     |   273 -
 .../cutlass/epilogue/collective/detail.hpp    |   491 -
 .../collective/epilogue_tensor_broadcast.hpp  |   271 -
 .../collective/sm70_epilogue_vectorized.hpp   |   549 -
 .../sm70_epilogue_vectorized_array.hpp        |   412 -
 ...m90_epilogue_array_tma_warpspecialized.hpp |  1191 -
 .../sm90_epilogue_tma_warpspecialized.hpp     |   904 -
 ...e_tma_warpspecialized_bias_elementwise.hpp |   164 -
 .../cutlass/epilogue/dispatch_policy.hpp      |   195 -
 .../cutlass/epilogue/fusion/callbacks.hpp     |    89 -
 .../cutlass/epilogue/fusion/operations.hpp    |   351 -
 .../sm90_callbacks_tma_warpspecialized.hpp    |  1787 -
 ...90_visitor_compute_tma_warpspecialized.hpp |   839 -
 .../sm90_visitor_load_tma_warpspecialized.hpp |  1415 -
 ...sm90_visitor_store_tma_warpspecialized.hpp |  1736 -
 .../sm90_visitor_tma_warpspecialized.hpp      |  1139 -
 .../fusion/sm90_visitor_topk_softmax.hpp      |   759 -
 .../cutlass/epilogue/thread/activation.h      |   758 -
 .../cutlass/epilogue/thread/conversion_op.h   |   132 -
 .../cutlass/epilogue/thread/detail.hpp        |    52 -
 .../epilogue/thread/linear_combination.h      |   523 -
 .../linear_combination_bias_elementwise.h     |   524 -
 .../thread/linear_combination_bias_relu.h     |   610 -
 .../thread/linear_combination_clamp.h         |   685 -
 .../thread/linear_combination_dgelu.h         |   250 -
 .../thread/linear_combination_drelu.h         |   452 -
 .../epilogue/thread/linear_combination_gelu.h |    70 -
 .../thread/linear_combination_generic.h       |   265 -
 .../linear_combination_generic_with_scaling.h |   325 -
 .../thread/linear_combination_hardswish.h     |    69 -
 .../thread/linear_combination_leaky_relu.h    |   231 -
 .../thread/linear_combination_params.h        |    75 -
 .../linear_combination_planar_complex.h       |   236 -
 .../epilogue/thread/linear_combination_relu.h |   572 -
 .../thread/linear_combination_relu0.h         |   543 -
 .../linear_combination_residual_block.h       |   301 -
 .../thread/linear_combination_sigmoid.h       |    70 -
 .../epilogue/thread/linear_combination_silu.h |    69 -
 .../linear_combination_tensor_broadcast.hpp   |   253 -
 .../linear_combination_with_elementwise.h     |   234 -
 .../cutlass/epilogue/thread/reduction_op.h    |    97 -
 .../cutlass/epilogue/thread/scale_type.h      |    66 -
 .../default_epilogue_complex_tensor_op.h      |   255 -
 ...default_epilogue_complex_tensor_op_blas3.h |   264 -
 .../default_epilogue_direct_store.h           |    74 -
 .../default_epilogue_planar_complex.h         |   241 -
 .../threadblock/default_epilogue_simt.h       |   443 -
 .../threadblock/default_epilogue_tensor_op.h  |   904 -
 .../default_epilogue_tensor_op_blas3.h        |   175 -
 .../default_epilogue_volta_tensor_op.h        |   337 -
 .../default_epilogue_with_absmax.h            |   126 -
 .../default_epilogue_with_broadcast.h         |   376 -
 .../default_epilogue_with_reduction.h         |   177 -
 .../default_epilogue_wmma_tensor_op.h         |   165 -
 .../threadblock/default_thread_map_simt.h     |   127 -
 .../default_thread_map_tensor_op.h            |   208 -
 .../default_thread_map_volta_tensor_op.h      |   228 -
 .../default_thread_map_wmma_tensor_op.h       |   113 -
 .../direct_store_epilogue_iterator.h          |   142 -
 .../cutlass/epilogue/threadblock/epilogue.h   |   543 -
 .../epilogue/threadblock/epilogue_base.h      |   240 -
 .../threadblock/epilogue_base_streamk.h       |   197 -
 .../epilogue/threadblock/epilogue_depthwise.h |   335 -
 .../threadblock/epilogue_direct_store.h       |   347 -
 .../threadblock/epilogue_gemm_k_reduction.h   |   212 -
 .../threadblock/epilogue_planar_complex.h     |   401 -
 .../threadblock/epilogue_smem_accumulator.h   |   230 -
 .../epilogue_streamk_with_broadcast.h         |   443 -
 .../epilogue_visitor_with_softmax.h           |   513 -
 .../threadblock/epilogue_with_absmax.h        |   923 -
 .../threadblock/epilogue_with_broadcast.h     |  1718 -
 .../threadblock/epilogue_with_reduction.h     |   823 -
 .../threadblock/epilogue_with_visitor.h       |   409 -
 .../epilogue_with_visitor_callbacks.h         |   504 -
 .../epilogue/threadblock/epilogue_workspace.h |   197 -
 .../threadblock/fusion/visitor_2x.hpp         |   433 -
 .../threadblock/fusion/visitor_compute.hpp    |   109 -
 .../threadblock/fusion/visitor_load.hpp       |   583 -
 .../threadblock/fusion/visitor_store.hpp      |   805 -
 .../epilogue/threadblock/fusion/visitors.hpp  |    38 -
 .../threadblock/interleaved_epilogue.h        |   407 -
 .../threadblock/output_iterator_parameter.h   |   223 -
 .../threadblock/output_tile_thread_map.h      |   628 -
 .../threadblock/predicated_tile_iterator.h    |  1387 -
 .../predicated_tile_iterator_affine.h         |   615 -
 ...cated_tile_iterator_affine_layout_params.h |   156 -
 .../predicated_tile_iterator_blas3.h          |   633 -
 .../predicated_tile_iterator_conv.h           |   562 -
 .../predicated_tile_iterator_direct_conv.h    |   445 -
 .../predicated_tile_iterator_params.h         |   483 -
 .../predicated_tile_iterator_predicates.h     |   309 -
 .../predicated_tile_iterator_strided_dgrad.h  |   479 -
 .../threadblock/shared_load_iterator.h        |   223 -
 .../threadblock/shared_load_iterator_mixed.h  |   594 -
 .../shared_load_iterator_pitch_linear.h       |   194 -
 .../fragment_iterator_complex_tensor_op.h     |   187 -
 ...ment_iterator_gaussian_complex_tensor_op.h |   194 -
 .../epilogue/warp/fragment_iterator_simt.h    |   164 -
 .../warp/fragment_iterator_tensor_op.h        |   378 -
 .../warp/fragment_iterator_volta_tensor_op.h  |   269 -
 .../warp/fragment_iterator_wmma_tensor_op.h   |   164 -
 .../cutlass/epilogue/warp/simt_policy.h       |   107 -
 .../cutlass/epilogue/warp/tensor_op_policy.h  |   189 -
 .../epilogue/warp/tile_iterator_simt.h        |   785 -
 .../epilogue/warp/tile_iterator_tensor_op.h   |   671 -
 .../warp/tile_iterator_tensor_op_mixed.h      |  1081 -
 .../warp/tile_iterator_volta_tensor_op.h      |   440 -
 .../warp/tile_iterator_wmma_tensor_op.h       |   227 -
 .../epilogue/warp/volta_tensor_op_policy.h    |   195 -
 .../epilogue/warp/wmma_tensor_op_policy.h     |   101 -
 .../cutlass/include/cutlass/fast_math.h       |  1067 -
 .../cutlass/include/cutlass/float8.h          |  1284 -
 .../include/cutlass/floating_point_nvrtc.h    |    98 -
 .../cutlass/include/cutlass/functional.h      |   930 -
 .../gemm/collective/builders/sm90_common.inl  |   419 -
 .../collective/builders/sm90_gmma_builder.inl |  1048 -
 .../builders/sm90_sparse_config.inl           |   268 -
 .../builders/sm90_sparse_gmma_builder.inl     |   388 -
 .../gemm/collective/collective_builder.hpp    |    42 -
 .../collective/collective_builder_decl.hpp    |    88 -
 .../gemm/collective/collective_mma.hpp        |    49 -
 .../gemm/collective/collective_mma_decl.hpp   |    64 -
 .../gemm/collective/fp8_accumulation.hpp      |   121 -
 .../gemm/collective/sm70_mma_twostage.hpp     |   597 -
 .../gemm/collective/sm80_mma_multistage.hpp   |   707 -
 ..._mma_array_tma_gmma_ss_warpspecialized.hpp |   759 -
 ...mma_multistage_gmma_rs_warpspecialized.hpp |   677 -
 ...mma_multistage_gmma_ss_warpspecialized.hpp |   509 -
 .../sm90_mma_tma_gmma_rs_warpspecialized.hpp  |   752 -
 ...ma_gmma_rs_warpspecialized_mixed_input.hpp |  1560 -
 .../gemm/collective/sm90_mma_tma_gmma_ss.hpp  |   539 -
 .../sm90_mma_tma_gmma_ss_warpspecialized.hpp  |   582 -
 ...90_mma_tma_gmma_ss_warpspecialized_fp8.hpp |   584 -
 ...sparse_mma_tma_gmma_ss_warpspecialized.hpp |   724 -
 .../cutlass/gemm/device/base_grouped.h        |   478 -
 .../gemm/device/default_gemm_configuration.h  |   955 -
 .../include/cutlass/gemm/device/ell_gemm.h    |   849 -
 .../include/cutlass/gemm/device/gemm.h        |   772 -
 .../include/cutlass/gemm/device/gemm_array.h  |   738 -
 .../cutlass/gemm/device/gemm_batched.h        |   704 -
 .../cutlass/gemm/device/gemm_complex.h        |   718 -
 .../cutlass/gemm/device/gemm_grouped.h        |    61 -
 .../device/gemm_layernorm_mainloop_fusion.h   |   385 -
 .../include/cutlass/gemm/device/gemm_sparse.h |   515 -
 .../gemm/device/gemm_sparse_universal.h       |   211 -
 .../gemm_sparse_universal_with_absmax.h       |   202 -
 .../gemm/device/gemm_sparse_with_absmax.h     |   360 -
 .../gemm/device/gemm_sparse_with_visitor.h    |   342 -
 .../gemm/device/gemm_splitk_parallel.h        |   636 -
 .../cutlass/gemm/device/gemm_universal.h      |   442 -
 .../gemm/device/gemm_universal_adapter.h      |   693 -
 .../cutlass/gemm/device/gemm_universal_base.h |   522 -
 .../gemm_universal_streamk_with_broadcast.h   |   386 -
 .../gemm/device/gemm_universal_with_absmax.h  |   404 -
 .../device/gemm_universal_with_broadcast.h    |   386 -
 .../gemm/device/gemm_with_k_reduction.h       |   415 -
 .../include/cutlass/gemm/device/gemv.h        |   182 -
 .../include/cutlass/gemm/device/rank_2k.h     |   548 -
 .../cutlass/gemm/device/rank_2k_grouped.h     |    63 -
 .../include/cutlass/gemm/device/rank_k.h      |   510 -
 .../include/cutlass/gemm/device/symm.h        |   603 -
 .../include/cutlass/gemm/device/trmm.h        |   759 -
 .../include/cutlass/gemm/dispatch_policy.hpp  |   324 -
 .../cutlass/include/cutlass/gemm/gemm.h       |   133 -
 .../cutlass/gemm/gemm_enumerated_types.h      |    80 -
 .../gemm/group_array_problem_shape.hpp        |   123 -
 .../cutlass/gemm/kernel/default_ell_gemm.h    |   837 -
 .../cutlass/gemm/kernel/default_gemm.h        |  1189 -
 .../gemm/kernel/default_gemm_complex.h        |   404 -
 .../gemm/kernel/default_gemm_grouped.h        |   384 -
 ...ult_gemm_grouped_softmax_mainloop_fusion.h |   164 -
 .../default_gemm_layernorm_mainloop_fusion.h  |   137 -
 .../default_gemm_planar_complex_universal.h   |   352 -
 .../cutlass/gemm/kernel/default_gemm_sparse.h |   252 -
 .../kernel/default_gemm_sparse_universal.h    |   141 -
 ...efault_gemm_sparse_universal_with_absmax.h |   144 -
 .../kernel/default_gemm_sparse_with_absmax.h  |   157 -
 .../kernel/default_gemm_sparse_with_visitor.h |   197 -
 .../kernel/default_gemm_splitk_parallel.h     |   136 -
 .../default_gemm_streamk_with_broadcast.h     |   146 -
 .../gemm/kernel/default_gemm_universal.h      |   396 -
 .../default_gemm_universal_with_visitor.h     |   157 -
 .../gemm/kernel/default_gemm_with_absmax.h    |   143 -
 .../gemm/kernel/default_gemm_with_broadcast.h |   243 -
 .../kernel/default_gemm_with_k_reduction.h    |   150 -
 .../gemm/kernel/default_gemm_with_reduction.h |   246 -
 .../cutlass/gemm/kernel/default_gemv.h        |   132 -
 .../cutlass/gemm/kernel/default_rank_2k.h     |   285 -
 .../gemm/kernel/default_rank_2k_complex.h     |   498 -
 .../gemm/kernel/default_rank_2k_grouped.h     |   355 -
 .../gemm/kernel/default_rank_2k_universal.h   |   346 -
 .../cutlass/gemm/kernel/default_rank_k.h      |   247 -
 .../gemm/kernel/default_rank_k_complex.h      |   429 -
 .../gemm/kernel/default_rank_k_universal.h    |   305 -
 .../cutlass/gemm/kernel/default_symm.h        |   321 -
 .../gemm/kernel/default_symm_complex.h        |   508 -
 .../gemm/kernel/default_symm_universal.h      |   342 -
 .../cutlass/gemm/kernel/default_trmm.h        |   269 -
 .../gemm/kernel/default_trmm_complex.h        |   265 -
 .../gemm/kernel/default_trmm_universal.h      |   359 -
 .../include/cutlass/gemm/kernel/ell_gemm.h    |   824 -
 .../include/cutlass/gemm/kernel/gemm.h        |   380 -
 .../include/cutlass/gemm/kernel/gemm_array.h  |   264 -
 .../cutlass/gemm/kernel/gemm_batched.h        |   273 -
 .../cutlass/gemm/kernel/gemm_grouped.h        |   457 -
 .../kernel/gemm_grouped_problem_visitor.h     |   121 -
 .../gemm_grouped_softmax_mainloop_fusion.h    |   481 -
 .../kernel/gemm_layernorm_mainloop_fusion.h   |   782 -
 .../include/cutlass/gemm/kernel/gemm_params.h |   189 -
 .../cutlass/gemm/kernel/gemm_pipelined.h      |   158 -
 .../cutlass/gemm/kernel/gemm_planar_complex.h |   715 -
 .../gemm/kernel/gemm_planar_complex_array.h   |   609 -
 .../gemm/kernel/gemm_sparse_universal.h       |   804 -
 .../gemm_sparse_universal_with_absmax.h       |   609 -
 .../gemm/kernel/gemm_splitk_parallel.h        |   253 -
 .../kernel/gemm_streamk_with_fused_epilogue.h |  2396 -
 .../gemm/kernel/gemm_transpose_operands.h     |   124 -
 .../cutlass/gemm/kernel/gemm_universal.h      |   702 -
 .../cutlass/gemm/kernel/gemm_universal.hpp    |    66 -
 .../cutlass/gemm/kernel/gemm_universal_decl.h |    61 -
 .../gemm/kernel/gemm_universal_streamk.h      |  1168 -
 .../gemm/kernel/gemm_universal_with_visitor.h |   321 -
 .../gemm_universal_with_visitor_streamk.h     |   895 -
 .../cutlass/gemm/kernel/gemm_with_absmax.h    |   759 -
 .../gemm/kernel/gemm_with_fused_epilogue.h    |  1512 -
 .../gemm/kernel/gemm_with_k_reduction.h       |   704 -
 .../include/cutlass/gemm/kernel/gemv.h        |   638 -
 .../gemm/kernel/gemv_batched_strided.h        |   244 -
 .../gemm/kernel/grouped_problem_visitor.h     |   463 -
 .../cutlass/gemm/kernel/params_sparse_base.h  |   115 -
 .../gemm/kernel/params_universal_base.h       |   264 -
 .../cutlass/gemm/kernel/rank_2k_grouped.h     |   688 -
 .../kernel/rank_2k_grouped_problem_visitor.h  |   376 -
 .../gemm/kernel/rank_2k_transpose_operands.h  |   129 -
 .../cutlass/gemm/kernel/rank_2k_universal.h   |   769 -
 .../cutlass/gemm/kernel/rank_k_universal.h    |   556 -
 .../include/cutlass/gemm/kernel/sm70_gemm.hpp |   270 -
 ..._array_tma_warpspecialized_cooperative.hpp |   881 -
 ...emm_array_tma_warpspecialized_pingpong.hpp |   946 -
 .../cutlass/gemm/kernel/sm90_gemm_tma.hpp     |   306 -
 .../kernel/sm90_gemm_tma_warpspecialized.hpp  |   522 -
 ...0_gemm_tma_warpspecialized_cooperative.hpp |   671 -
 ...sm90_gemm_tma_warpspecialized_pingpong.hpp |   664 -
 .../gemm/kernel/sm90_gemm_warpspecialized.hpp |   417 -
 .../sm90_gemm_warpspecialized_cooperative.hpp |   504 -
 .../sm90_gemm_warpspecialized_pingpong.hpp    |   516 -
 .../gemm/kernel/sm90_tile_scheduler.hpp       |   139 -
 .../gemm/kernel/sm90_tile_scheduler_group.hpp |   510 -
 .../kernel/sm90_tile_scheduler_stream_k.hpp   |   960 -
 .../include/cutlass/gemm/kernel/sparse_gemm.h |   394 -
 .../gemm/kernel/sparse_gemm_with_absmax.h     |   509 -
 .../gemm/kernel/sparse_gemm_with_visitor.h    |   238 -
 .../gemm/kernel/static_tile_scheduler.hpp     |   502 -
 .../cutlass/gemm/kernel/symm_universal.h      |   675 -
 .../cutlass/gemm/kernel/tile_scheduler.hpp    |   149 -
 .../gemm/kernel/tile_scheduler_params.h       |  1535 -
 .../cutlass/gemm/kernel/trmm_universal.h      |   580 -
 .../cutlass/include/cutlass/gemm/thread/mma.h |    90 -
 .../include/cutlass/gemm/thread/mma_sm50.h    |   538 -
 .../include/cutlass/gemm/thread/mma_sm60.h    |  1161 -
 .../include/cutlass/gemm/thread/mma_sm61.h    |   284 -
 .../gemm/threadblock/default_ell_mma.h        |   734 -
 .../gemm/threadblock/default_gemv_core.h      |   151 -
 .../cutlass/gemm/threadblock/default_mma.h    |   823 -
 .../gemm/threadblock/default_mma_core.h       |   116 -
 .../gemm/threadblock/default_mma_core_simt.h  |  1723 -
 .../gemm/threadblock/default_mma_core_sm70.h  |   682 -
 .../gemm/threadblock/default_mma_core_sm75.h  |  1315 -
 .../gemm/threadblock/default_mma_core_sm80.h  |  2951 -
 .../default_mma_core_sparse_sm80.h            |   876 -
 .../default_mma_core_with_access_size.h       |   328 -
 .../default_mma_core_with_reduction.h         |   167 -
 .../gemm/threadblock/default_mma_core_wmma.h  |   712 -
 .../default_mma_layernorm_mainloop_fusion.h   |   178 -
 .../default_mma_planar_complex_multistage.h   |   136 -
 .../default_mma_planar_complex_pipelined.h    |   130 -
 .../default_mma_softmax_mainloop_fusion.h     |   160 -
 .../threadblock/default_mma_with_reduction.h  |   141 -
 .../default_multistage_mma_complex.h          |   159 -
 .../default_multistage_mma_complex_core.h     |   119 -
 ...default_multistage_mma_complex_core_sm80.h |  1808 -
 .../default_multistage_trmm_complex.h         |   556 -
 .../gemm/threadblock/default_sparse_mma.h     |   196 -
 .../cutlass/gemm/threadblock/default_trmm.h   |   445 -
 .../gemm/threadblock/ell_mma_multistage.h     |   648 -
 .../gemm/threadblock/ell_mma_pipelined.h      |   376 -
 .../include/cutlass/gemm/threadblock/gemv.h   |   147 -
 .../cutlass/gemm/threadblock/index_remat.h    |   107 -
 .../cutlass/gemm/threadblock/mma_base.h       |   236 -
 .../gemm/threadblock/mma_blas3_multistage.h   |   707 -
 ...mma_layernorm_mainloop_fusion_multistage.h |   863 -
 .../cutlass/gemm/threadblock/mma_multistage.h |   741 -
 .../cutlass/gemm/threadblock/mma_pipelined.h  |   439 -
 .../threadblock/mma_planar_complex_base.h     |   208 -
 .../mma_planar_complex_multistage.h           |   646 -
 .../mma_planar_complex_pipelined.h            |   424 -
 .../gemm/threadblock/mma_singlestage.h        |   265 -
 .../mma_softmax_mainloop_fusion_multistage.h  |   756 -
 .../gemm/threadblock/mma_sparse_base.h        |   273 -
 .../gemm/threadblock/mma_sparse_multistage.h  |   668 -
 .../mma_with_reduction_multistage.h           |   545 -
 .../gemm/threadblock/threadblock_swizzle.h    |   459 -
 .../threadblock/threadblock_swizzle_streamk.h |   801 -
 .../gemm/warp/default_mma_complex_tensor_op.h |   612 -
 .../gemm/warp/default_mma_sparse_tensor_op.h  |   165 -
 .../cutlass/gemm/warp/default_mma_tensor_op.h |   123 -
 .../gemm/warp/default_mma_tensor_op_sm80.h    |   375 -
 .../default_mma_with_reduction_tensor_op.h    |    92 -
 .../gemm/warp/default_mma_wmma_tensor_op.h    |   130 -
 .../warp/layernorm_scale_bias_transform.h     |   139 -
 .../cutlass/include/cutlass/gemm/warp/mma.h   |    60 -
 .../cutlass/gemm/warp/mma_complex_tensor_op.h |  1168 -
 .../warp/mma_complex_tensor_op_fast_f32.h     |   663 -
 ...mma_complex_tensor_op_tile_iterator_sm80.h |  2485 -
 .../warp/mma_gaussian_complex_tensor_op.h     |   642 -
 ...ian_complex_tensor_op_tile_iterator_sm80.h |   390 -
 .../gemm/warp/mma_mixed_input_tensor_op.h     |   566 -
 .../cutlass/gemm/warp/mma_planar_complex.h    |   182 -
 .../include/cutlass/gemm/warp/mma_simt.h      |   263 -
 .../cutlass/gemm/warp/mma_simt_policy.h       |    69 -
 .../gemm/warp/mma_simt_tile_iterator.h        |  1890 -
 .../cutlass/gemm/warp/mma_sparse_tensor_op.h  |   382 -
 .../include/cutlass/gemm/warp/mma_tensor_op.h |   415 -
 .../gemm/warp/mma_tensor_op_fast_f32.h        |   471 -
 .../warp/mma_tensor_op_fragment_iterator.h    |   559 -
 .../cutlass/gemm/warp/mma_tensor_op_policy.h  |    65 -
 .../cutlass/gemm/warp/mma_tensor_op_sm70.h    |   280 -
 .../warp/mma_tensor_op_tile_access_iterator.h |   362 -
 .../gemm/warp/mma_tensor_op_tile_iterator.h   |  4803 --
 .../warp/mma_tensor_op_tile_iterator_sm70.h   |  3098 -
 .../warp/mma_tensor_op_tile_iterator_sm80.h   |  2441 -
 .../warp/mma_tensor_op_tile_iterator_sparse.h |   380 -
 .../warp/mma_tensor_op_tile_iterator_wmma.h   |   805 -
 .../cutlass/gemm/warp/mma_tensor_op_wmma.h    |   223 -
 .../gemm/warp/mma_with_reduction_tensor_op.h  |   449 -
 .../gemm/warp/scale_bias_tile_iterator.h      |   572 -
 .../gemm/warp/softmax_scale_bias_transform.h  |   117 -
 .../gemm/warp/tile_iterator_planar_complex.h  |   250 -
 .../cutlass/include/cutlass/gemm_coord.h      |   394 -
 .../cutlass/include/cutlass/gemm_coord.hpp    |    66 -
 .../cutlass/include/cutlass/half.h            |   930 -
 .../cutlass/include/cutlass/integer_subbyte.h |   280 -
 .../include/cutlass/kernel_hardware_info.h    |    76 -
 .../include/cutlass/kernel_hardware_info.hpp  |    35 -
 .../cutlass/include/cutlass/kernel_launch.h   |   141 -
 .../cutlass/include/cutlass/layout/layout.h   |    64 -
 .../cutlass/include/cutlass/layout/matrix.h   |  1349 -
 .../cutlass/include/cutlass/layout/permute.h  |   828 -
 .../include/cutlass/layout/pitch_linear.h     |   149 -
 .../cutlass/include/cutlass/layout/tensor.h   |   648 -
 .../layout/tensor_op_multiplicand_sm70.h      |  1044 -
 .../layout/tensor_op_multiplicand_sm75.h      |  1169 -
 .../layout/tensor_op_multiplicand_sm80.h      |  1139 -
 .../cutlass/include/cutlass/layout/vector.h   |   105 -
 .../cutlass/include/cutlass/matrix.h          | 14129 ----
 .../cutlass/include/cutlass/matrix_coord.h    |   164 -
 .../cutlass/include/cutlass/matrix_shape.h    |    65 -
 .../include/cutlass/numeric_conversion.h      |  4547 --
 .../cutlass/include/cutlass/numeric_size.h    |    83 -
 .../cutlass/include/cutlass/numeric_types.h   |    88 -
 .../include/cutlass/pipeline/pipeline.hpp     |    36 -
 .../cutlass/pipeline/sm90_pipeline.hpp        |  1173 -
 .../include/cutlass/pitch_linear_coord.h      |   181 -
 .../include/cutlass/platform/platform.h       |   913 -
 .../include/cutlass/predicate_vector.h        |   547 -
 .../cutlass/include/cutlass/quaternion.h      |   752 -
 .../cutlass/include/cutlass/real.h            |    61 -
 .../cutlass/reduction/device/reduce_split_k.h |   232 -
 .../cutlass/reduction/device/tensor_reduce.h  |   264 -
 .../device/tensor_reduce_affine_contiguous.h  |   374 -
 .../device/tensor_reduce_affine_strided.h     |   362 -
 .../reduction/kernel/reduce_softmax_final.h   |   267 -
 .../cutlass/reduction/kernel/reduce_split_k.h |   248 -
 .../kernel/tensor_reduce_affine_contiguous.h  |   606 -
 .../kernel/tensor_reduce_affine_strided.h     |   641 -
 .../include/cutlass/reduction/thread/reduce.h |   234 -
 .../reduction/thread/reduction_operators.h    |   235 -
 .../cutlass/reduction/threadblock_swizzle.h   |    67 -
 .../include/cutlass/relatively_equal.h        |   275 -
 .../cutlass/include/cutlass/semaphore.h       |   118 -
 .../include/cutlass/subbyte_reference.h       |  1388 -
 .../cutlass/include/cutlass/tensor_coord.h    |   326 -
 .../cutlass/include/cutlass/tensor_ref.h      |   419 -
 .../cutlass/tensor_ref_planar_complex.h       |   374 -
 .../cutlass/include/cutlass/tensor_view.h     |   297 -
 .../cutlass/tensor_view_planar_complex.h      |   301 -
 .../cutlass/include/cutlass/tfloat32.h        |   478 -
 .../cutlass/include/cutlass/thread/matrix.h   |   198 -
 .../cutlass/include/cutlass/trace.h           |    59 -
 .../collective/sm90_wgmma_transpose.hpp       |   754 -
 .../device/transform_universal_adapter.hpp    |   303 -
 .../kernel/filter_format_transformer.hpp      |   223 -
 .../kernel/sm90_sparse_gemm_compressor.hpp    |   578 -
 .../kernel/sparse_gemm_compressor.hpp         |   284 -
 .../transform/pitch_linear_thread_map.h       |   926 -
 .../cutlass/transform/thread/transpose.h      |   107 -
 .../cutlass/transform/thread/unary_op.h       |   105 -
 .../transform/threadblock/ell_iterator.h      |   199 -
 .../ell_predicated_tile_access_iterator.h     |  1350 -
 .../ell_predicated_tile_iterator.h            |  1315 -
 ...icated_scale_bias_vector_access_iterator.h |   375 -
 .../predicated_scale_bias_vector_iterator.h   |   328 -
 .../predicated_tile_access_iterator.h         |  2118 -
 ...icated_tile_access_iterator_2dthreadtile.h |   834 -
 .../predicated_tile_access_iterator_params.h  |   290 -
 ...d_tile_access_iterator_triangular_matrix.h |   892 -
 .../threadblock/predicated_tile_iterator.h    |  1887 -
 .../predicated_tile_iterator_2dthreadtile.h   |   787 -
 ...edicated_tile_iterator_triangular_matrix.h |   818 -
 .../predicated_vector_access_iterator.h       |   417 -
 ...egular_scale_bias_vector_access_iterator.h |   253 -
 .../regular_tile_access_iterator.h            |    58 -
 ...egular_tile_access_iterator_pitch_linear.h |   408 -
 ...access_iterator_pitch_linear_direct_conv.h |   587 -
 .../regular_tile_access_iterator_tensor_op.h  |   821 -
 ...ular_tile_access_iterator_tensor_op_sm80.h |  1532 -
 .../threadblock/regular_tile_iterator.h       |    62 -
 .../regular_tile_iterator_pitch_linear.h      |   552 -
 ..._tile_iterator_pitch_linear_2dthreadtile.h |   509 -
 .../regular_tile_iterator_tensor_op.h         |  1107 -
 .../regular_tile_iterator_tensor_op_sm70.h    |  1460 -
 .../transform/threadblock/vector_iterator.h   |   149 -
 .../transform/warp/vector_fragment_iterator.h |   283 -
 .../cutlass/include/cutlass/uint128.h         |   270 -
 .../cutlass/include/cutlass/version.h         |    80 -
 .../cutlass/include/cutlass/wmma_array.h      |   133 -
 .../cutlass/include/cutlass/workspace.h       |   150 -
 .../lightllm_kernel/ops/__init__.py           |    14 +-
 lightllm-kernel/setup.py                      |     9 +-
 third-party/cutlass                           |     1 +
 671 files changed, 17 insertions(+), 549297 deletions(-)
 create mode 100644 .gitmodules
 delete mode 100755 lightllm-kernel/cutlass/include/cute/algorithm/axpby.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/algorithm/clear.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/algorithm/cooperative_copy.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/algorithm/cooperative_gemm.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/algorithm/copy.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/algorithm/fill.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/algorithm/functional.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/algorithm/gemm.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/algorithm/prefer.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/algorithm/prefetch.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/algorithm/tensor_algorithms.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/algorithm/tuple_algorithms.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/arch/cluster_sm90.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/arch/config.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/arch/copy.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/arch/copy_sm50.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/arch/copy_sm75.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/arch/copy_sm80.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/arch/copy_sm90.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/arch/copy_sm90_desc.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/arch/copy_sm90_tma.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/arch/mma.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/arch/mma_sm61.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/arch/mma_sm70.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/arch/mma_sm75.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/arch/mma_sm80.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/arch/mma_sm90.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/arch/mma_sm90_desc.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/arch/mma_sm90_gmma.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/arch/mma_sm90_gmma_ext.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/arch/mma_sm90_gmma_sparse.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/arch/mma_sm90_gmma_sparse_ext.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/arch/util.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/atom/copy_atom.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/atom/copy_traits.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm50.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm75.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm80.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm90.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm90_im2col.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm90_tma.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm90_tma_swizzle.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/atom/mma_atom.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/atom/mma_traits.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm61.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm70.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm75.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm80.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90_gmma.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90_gmma_ext.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90_gmma_sparse.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90_gmma_sparse_ext.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/config.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/container/alignment.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/container/array.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/container/array_aligned.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/container/array_subbyte.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/container/bit_field.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/container/cuda_types.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/container/packed_tuple.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/container/tuple.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/container/type_list.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/int_tuple.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/layout.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/layout_composed.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/numeric/arithmetic_tuple.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/numeric/complex.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/numeric/int.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/numeric/integer_sequence.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/numeric/integral_constant.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/numeric/integral_ratio.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/numeric/math.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/numeric/numeric_types.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/numeric/real.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/pointer.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/pointer_base.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/pointer_flagged.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/pointer_sparse.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/pointer_swizzle.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/stride.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/swizzle.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/swizzle_layout.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/tensor.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/tensor_impl.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/tensor_predicate.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/tensor_zip.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/underscore.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/util/debug.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/util/print.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cute/util/type_traits.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/aligned_buffer.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/arch.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/barrier.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/cache_operation.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/config.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/grid_dependency_control.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/memory.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/memory_sm75.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/memory_sm80.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/mma.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/mma_sm50.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/mma_sm60.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/mma_sm61.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/mma_sm70.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/mma_sm75.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/mma_sm80.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/mma_sm89.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/mma_sm90.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/mma_sparse_sm80.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/mma_sparse_sm89.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/reg_reconfig.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/simd.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/simd_sm60.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/simd_sm61.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/synclog.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/wmma.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/wmma_sm70.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/wmma_sm72.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/wmma_sm75.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/array.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/array_planar_complex.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/array_subbyte.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/barrier.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/bfloat16.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/blas3.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/blas3_types.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/block_striped.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/cluster_launch.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/complex.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/constants.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/collective/builders/sm90_common.inl
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/collective/builders/sm90_gmma_builder.inl
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/collective/collective_builder.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/collective/collective_conv.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/collective/detail.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/collective/sm90_implicit_gemm_gmma_ss_warpspecialized.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/conv2d_problem_size.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/conv3d_problem_size.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/convnd_problem_shape.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/convolution.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/detail.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/device/conv_universal_adapter.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/device/direct_convolution.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/device/implicit_gemm_convolution.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/device/implicit_gemm_convolution_fusion.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/dispatch_policy.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/conv_universal.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_dgrad.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_fusion.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_with_absmax.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_with_broadcast.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_with_reduction.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_group_fprop.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_wgrad.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_wgrad_fusion.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_dgrad.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_fprop.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_fprop_fusion.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_fprop_with_broadcast.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_wgrad.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_deconv2d.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_deconv2d_with_broadcast.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_deconv3d.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_deconv3d_with_broadcast.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_depthwise_fprop.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/direct_convolution.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_fusion.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_strided_dgrad.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_with_absmax.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/sm90_implicit_gemm_tma_warpspecialized.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/thread/depthwise_mma.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_few_channels.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_fixed_channels.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_params.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_tile_iterator.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_optimized.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_params.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_direct_conv_params.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_fixed_stride_dilation.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_optimized.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_direct_conv_multistage.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_filter_tile_access_iterator_direct_conv_optimized.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_pipelined.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_mma_base.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_mma_core_with_lane_access_size.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/implicit_gemm_fprop_fusion_multistage.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/implicit_gemm_multistage.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/implicit_gemm_wgrad_fusion_multistage.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/predicated_scale_bias_vector_access_iterator.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/predicated_scale_bias_vector_iterator.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/threadblock_swizzle.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/warp/mma_depthwise_simt.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/warp/mma_depthwise_simt_tile_iterator.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/warp/scale_bias_relu_transform.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/coord.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/core_io.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/cuda_host_adapter.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/cutlass.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/detail/collective.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/detail/dependent_false.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/detail/helper_macros.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/detail/layout.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/detail/mma.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/device_kernel.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/builders/sm90_builder.inl
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/builders/sm90_common.inl
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/collective_builder.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/collective_epilogue.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/default_epilogue.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/default_epilogue_array.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/detail.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/epilogue_tensor_broadcast.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm70_epilogue_vectorized_array.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized_bias_elementwise.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/dispatch_policy.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/callbacks.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/operations.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_store_tma_warpspecialized.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_topk_softmax.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/activation.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/conversion_op.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/detail.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_bias_elementwise.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_bias_relu.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_clamp.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_dgelu.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_drelu.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_gelu.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_generic.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_generic_with_scaling.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_hardswish.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_leaky_relu.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_params.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_planar_complex.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_relu.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_relu0.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_residual_block.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_sigmoid.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_silu.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_tensor_broadcast.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_with_elementwise.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/reduction_op.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/scale_type.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op_blas3.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_direct_store.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_planar_complex.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_simt.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op_blas3.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_with_absmax.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_with_reduction.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_simt.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_volta_tensor_op.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/direct_store_epilogue_iterator.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_base.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_base_streamk.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_depthwise.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_direct_store.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_gemm_k_reduction.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_planar_complex.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_smem_accumulator.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_streamk_with_broadcast.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_visitor_with_softmax.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_absmax.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_broadcast.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_reduction.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_visitor.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_visitor_callbacks.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_workspace.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitor_2x.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitor_compute.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitor_load.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitor_store.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitors.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/interleaved_epilogue.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/output_iterator_parameter.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/output_tile_thread_map.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_affine.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_affine_layout_params.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_blas3.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_conv.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_direct_conv.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_params.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_predicates.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_strided_dgrad.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/shared_load_iterator.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/shared_load_iterator_mixed.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/shared_load_iterator_pitch_linear.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_simt.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_tensor_op.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_volta_tensor_op.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/simt_policy.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tensor_op_policy.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_simt.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_tensor_op.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_volta_tensor_op.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_wmma_tensor_op.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/volta_tensor_op_policy.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/wmma_tensor_op_policy.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/fast_math.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/float8.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/floating_point_nvrtc.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/functional.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/builders/sm90_common.inl
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/builders/sm90_gmma_builder.inl
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/builders/sm90_sparse_config.inl
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/builders/sm90_sparse_gmma_builder.inl
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/collective_builder.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/collective_builder_decl.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/collective_mma.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/collective_mma_decl.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/fp8_accumulation.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm70_mma_twostage.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm80_mma_multistage.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_rs_warpspecialized.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_ss_warpspecialized.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized_mixed_input.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_sparse_mma_tma_gmma_ss_warpspecialized.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/base_grouped.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/default_gemm_configuration.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/ell_gemm.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_array.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_batched.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_complex.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_grouped.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_layernorm_mainloop_fusion.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse_universal.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse_universal_with_absmax.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse_with_absmax.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse_with_visitor.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_splitk_parallel.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_adapter.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_base.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_streamk_with_broadcast.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_with_absmax.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_with_broadcast.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_with_k_reduction.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemv.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/rank_2k.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/rank_2k_grouped.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/rank_k.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/symm.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/trmm.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/dispatch_policy.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/gemm.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/gemm_enumerated_types.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/group_array_problem_shape.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_ell_gemm.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_complex.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_grouped.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_grouped_softmax_mainloop_fusion.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_layernorm_mainloop_fusion.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_planar_complex_universal.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse_universal.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse_universal_with_absmax.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse_with_absmax.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse_with_visitor.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_splitk_parallel.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_streamk_with_broadcast.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_universal.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_universal_with_visitor.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_with_absmax.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_with_broadcast.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_with_k_reduction.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_with_reduction.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemv.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_2k.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_2k_complex.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_2k_grouped.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_2k_universal.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_k.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_k_complex.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_k_universal.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_symm.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_symm_complex.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_symm_universal.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_trmm.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_trmm_complex.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_trmm_universal.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/ell_gemm.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_array.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_batched.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_grouped.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_grouped_problem_visitor.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_grouped_softmax_mainloop_fusion.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_layernorm_mainloop_fusion.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_params.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_pipelined.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_planar_complex.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_planar_complex_array.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_sparse_universal.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_sparse_universal_with_absmax.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_splitk_parallel.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_streamk_with_fused_epilogue.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_transpose_operands.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal_decl.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal_streamk.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal_with_visitor.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal_with_visitor_streamk.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_with_absmax.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_with_fused_epilogue.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_with_k_reduction.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemv.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemv_batched_strided.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/grouped_problem_visitor.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/params_sparse_base.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/params_universal_base.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_2k_grouped.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_2k_grouped_problem_visitor.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_2k_transpose_operands.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_2k_universal.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_k_universal.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm70_gemm.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_cooperative.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_pingpong.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized_cooperative.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized_pingpong.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_tile_scheduler.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_tile_scheduler_group.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_tile_scheduler_stream_k.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sparse_gemm.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sparse_gemm_with_absmax.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sparse_gemm_with_visitor.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/static_tile_scheduler.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/symm_universal.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/tile_scheduler.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/tile_scheduler_params.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/trmm_universal.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/thread/mma.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/thread/mma_sm50.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/thread/mma_sm60.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/thread/mma_sm61.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_ell_mma.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_gemv_core.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_simt.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_sm70.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_sm75.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_sm80.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_sparse_sm80.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_with_access_size.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_with_reduction.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_wmma.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_layernorm_mainloop_fusion.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_planar_complex_multistage.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_planar_complex_pipelined.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_softmax_mainloop_fusion.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_with_reduction.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_multistage_trmm_complex.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_sparse_mma.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_trmm.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/ell_mma_multistage.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/ell_mma_pipelined.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/gemv.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/index_remat.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_base.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_blas3_multistage.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_layernorm_mainloop_fusion_multistage.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_multistage.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_pipelined.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_planar_complex_base.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_planar_complex_multistage.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_planar_complex_pipelined.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_singlestage.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_softmax_mainloop_fusion_multistage.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_sparse_base.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_sparse_multistage.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_with_reduction_multistage.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/threadblock_swizzle.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/threadblock_swizzle_streamk.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_complex_tensor_op.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_sparse_tensor_op.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_tensor_op.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_tensor_op_sm80.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_with_reduction_tensor_op.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_wmma_tensor_op.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/layernorm_scale_bias_transform.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_complex_tensor_op.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_complex_tensor_op_fast_f32.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op_tile_iterator_sm80.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_mixed_input_tensor_op.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_planar_complex.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_simt.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_simt_policy.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_simt_tile_iterator.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_sparse_tensor_op.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_fast_f32.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_policy.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_sm70.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_access_iterator.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sparse.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_wmma.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_wmma.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_with_reduction_tensor_op.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/scale_bias_tile_iterator.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/softmax_scale_bias_transform.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/tile_iterator_planar_complex.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm_coord.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm_coord.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/half.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/integer_subbyte.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/kernel_hardware_info.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/kernel_hardware_info.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/kernel_launch.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/layout/layout.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/layout/matrix.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/layout/permute.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/layout/pitch_linear.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/layout/tensor.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/layout/tensor_op_multiplicand_sm70.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/layout/tensor_op_multiplicand_sm75.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/layout/tensor_op_multiplicand_sm80.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/layout/vector.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/matrix.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/matrix_coord.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/matrix_shape.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/numeric_conversion.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/numeric_size.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/numeric_types.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/pipeline/pipeline.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/pipeline/sm90_pipeline.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/pitch_linear_coord.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/platform/platform.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/predicate_vector.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/quaternion.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/real.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/reduction/device/reduce_split_k.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/reduction/device/tensor_reduce.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/reduction/device/tensor_reduce_affine_contiguous.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/reduction/device/tensor_reduce_affine_strided.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/reduction/kernel/reduce_softmax_final.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/reduction/kernel/reduce_split_k.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/reduction/kernel/tensor_reduce_affine_contiguous.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/reduction/kernel/tensor_reduce_affine_strided.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/reduction/thread/reduce.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/reduction/thread/reduction_operators.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/reduction/threadblock_swizzle.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/relatively_equal.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/semaphore.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/subbyte_reference.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/tensor_coord.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/tensor_ref.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/tensor_ref_planar_complex.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/tensor_view.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/tensor_view_planar_complex.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/tfloat32.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/thread/matrix.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/trace.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/collective/sm90_wgmma_transpose.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/device/transform_universal_adapter.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/kernel/filter_format_transformer.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/kernel/sm90_sparse_gemm_compressor.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/kernel/sparse_gemm_compressor.hpp
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/pitch_linear_thread_map.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/thread/transpose.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/thread/unary_op.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/ell_iterator.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/ell_predicated_tile_access_iterator.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/ell_predicated_tile_iterator.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_scale_bias_vector_access_iterator.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_scale_bias_vector_iterator.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator_2dthreadtile.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator_params.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator_triangular_matrix.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_iterator.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_iterator_triangular_matrix.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_vector_access_iterator.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_scale_bias_vector_access_iterator.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear_direct_conv.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear_2dthreadtile.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op_sm70.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/vector_iterator.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/warp/vector_fragment_iterator.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/uint128.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/version.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/wmma_array.h
 delete mode 100755 lightllm-kernel/cutlass/include/cutlass/workspace.h
 create mode 160000 third-party/cutlass

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 000000000..d16e9335b
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "third-party/cutlass"]
+	path = third-party/cutlass
+	url = https://github.com/NVIDIA/cutlass.git
diff --git a/lightllm-kernel/cutlass/include/cute/algorithm/axpby.hpp b/lightllm-kernel/cutlass/include/cute/algorithm/axpby.hpp
deleted file mode 100755
index 339743f49..000000000
--- a/lightllm-kernel/cutlass/include/cute/algorithm/axpby.hpp
+++ /dev/null
@@ -1,95 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>
-
-#include <cute/tensor_impl.hpp>
-#include <cute/tensor_predicate.hpp>
-
-namespace cute
-{
-
-//
-// Accept mutable temporaries
-//
-template <class Alpha,
-          class XEngine, class XLayout,
-          class Beta,
-          class YEngine, class YLayout,
-          class PrdTensor = TrivialPredTensor>
-CUTE_HOST_DEVICE
-void
-axpby(Alpha                    const& alpha,
-      Tensor<XEngine, XLayout> const& x,
-      Beta                     const& beta,
-      Tensor<YEngine, YLayout>     && y,
-      PrdTensor                const& p = {})
-{
-  return axpby(alpha, x, beta, y, p);
-}
-
-//
-// AXPBY
-//
-template <class Alpha,
-          class XEngine, class XLayout,
-          class Beta,
-          class YEngine, class YLayout,
-          class PrdTensor = TrivialPredTensor>
-CUTE_HOST_DEVICE
-void
-axpby(Alpha                    const& alpha,
-      Tensor<XEngine, XLayout> const& x,
-      Beta                     const& beta,
-      Tensor<YEngine, YLayout>      & y,
-      PrdTensor                const& p = {})
-{
-  auto isBetaZero = [&] () {
-    if constexpr (is_complex<Beta>::value) {
-      return beta.real() == Int<0>{} && beta.imag() == Int<0>{};
-    }
-    else {
-      return beta == Int<0>{};
-    }
-
-    CUTE_GCC_UNREACHABLE;
-  } ();
-
-  CUTE_UNROLL
-  for (int i = 0; i < size(x); ++i) {
-    if (p(i)) {
-      y(i) = (isBetaZero ? alpha * x(i) : alpha * x(i) + beta * y(i));
-    }
-  }
-}
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/algorithm/clear.hpp b/lightllm-kernel/cutlass/include/cute/algorithm/clear.hpp
deleted file mode 100755
index 0b3a8eaa1..000000000
--- a/lightllm-kernel/cutlass/include/cute/algorithm/clear.hpp
+++ /dev/null
@@ -1,64 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>          // CUTE_HOST_DEVICE
-#include <cute/tensor_impl.hpp>     // cute::Tensor
-#include <cute/algorithm/fill.hpp>  // cute::fill
-
-namespace cute
-{
-
-//
-// Accept mutable temporaries
-//
-template <class Engine, class Layout>
-CUTE_HOST_DEVICE
-void
-clear(Tensor<Engine, Layout>&& tensor)
-{
-  return clear(tensor);
-}
-
-//
-// Set elements to zero
-//
-template <class Engine, class Layout>
-CUTE_HOST_DEVICE
-void
-clear(Tensor<Engine, Layout>& tensor)
-{
-  using T = typename Tensor<Engine,Layout>::value_type;
-
-  fill(tensor, T{});
-}
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/algorithm/cooperative_copy.hpp b/lightllm-kernel/cutlass/include/cute/algorithm/cooperative_copy.hpp
deleted file mode 100755
index 9d080116d..000000000
--- a/lightllm-kernel/cutlass/include/cute/algorithm/cooperative_copy.hpp
+++ /dev/null
@@ -1,333 +0,0 @@
-/***************************************************************************************************
-* Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: BSD-3-Clause
-*
-* Redistribution and use in source and binary forms, with or without
-* modification, are permitted provided that the following conditions are met:
-*
-* 1. Redistributions of source code must retain the above copyright notice, this
-* list of conditions and the following disclaimer.
-*
-* 2. Redistributions in binary form must reproduce the above copyright notice,
-* this list of conditions and the following disclaimer in the documentation
-* and/or other materials provided with the distribution.
-*
-* 3. Neither the name of the copyright holder nor the names of its
-* contributors may be used to endorse or promote products derived from
-* this software without specific prior written permission.
-*
-* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*
-**************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>
-#include <cute/layout.hpp>
-#include <cute/layout_composed.hpp> // cute::logical_divide
-#include <cute/swizzle.hpp>         // cute::Swizzle
-#include <cute/swizzle_layout.hpp>  // cute::get_nonswizzle_portion
-#include <cute/tensor_impl.hpp>     // cute::Tensor
-#include <cute/tensor_predicate.hpp>
-#include <cute/algorithm/copy.hpp>
-#include <cute/atom/copy_atom.hpp>
-
-namespace cute
-{
-
-template <uint32_t NumThreads,
-          class SrcEngine, class SrcLayout,
-          class DstEngine, class DstLayout>
-CUTE_HOST_DEVICE void
-naive_cooperative_copy(uint32_t                     const& tid,
-                       Tensor<SrcEngine, SrcLayout> const& src,
-                       Tensor<DstEngine, DstLayout>      & dst)
-{
-  auto N = size(src);
-  if (tid < N) {
-    uint32_t upper_bound = (N / NumThreads) * NumThreads;
-    CUTE_UNROLL
-    for (uint32_t i = 0; i < upper_bound; i += NumThreads) {   // All in-bounds
-      dst[tid + i] = src[tid + i];
-    }
-    if (N % NumThreads != 0) {                                 // Likely static condition
-      uint32_t final_idx = tid + upper_bound;
-      if (final_idx < N) {                                     // Final in-bounds
-        dst[final_idx] = src[final_idx];
-      }
-    }
-  }
-}
-
-// Accept mutable temporaries
-template <uint32_t NumThreads,
-          class SrcEngine, class SrcLayout,
-          class DstEngine, class DstLayout>
-CUTE_HOST_DEVICE void
-naive_cooperative_copy(uint32_t                     const& tid,
-                       Tensor<SrcEngine, SrcLayout> const& src,
-                       Tensor<DstEngine, DstLayout>     && dst)
-{
-  return naive_cooperative_copy(tid, src, dst);
-}
-
-// A heuristic to determine a "good" permutation of two tensors for later vectorization and thr-assignment
-template <class AEngine, class ALayout,
-          class BEngine, class BLayout>
-CUTE_HOST_DEVICE constexpr
-auto
-heuristic_permutation(Tensor<AEngine, ALayout> const& a,
-                      Tensor<BEngine, BLayout> const& b)
-{
-  constexpr bool swizzleA = get_swizzle_t<AEngine>::num_bits != 0 or
-                            get_swizzle_t<ALayout>::num_bits != 0;
-  constexpr bool swizzleB = get_swizzle_t<BEngine>::num_bits != 0 or
-                            get_swizzle_t<BLayout>::num_bits != 0;
-  auto a_inv = right_inverse(get_nonswizzle_portion(a.layout()));
-  auto b_inv = right_inverse(get_nonswizzle_portion(b.layout()));
-
-  constexpr uint8_t scoreA = (uint8_t(swizzleA)                  << 2) |
-                             (uint8_t(is_smem<AEngine>::value)   << 1) |
-                             (uint8_t(size(a_inv) > size(b_inv)) << 0);
-
-  constexpr uint8_t scoreB = (uint8_t(swizzleB)                  << 2) |
-                             (uint8_t(is_smem<BEngine>::value)   << 1) |
-                             (uint8_t(size(b_inv) > size(a_inv)) << 0);
-
-  if constexpr (scoreA >= scoreB) {
-    return a_inv;
-  } else {
-    return b_inv;
-  }
-}
-
-// cooperative_copy<NumThreads, MaxVecBits>(thr_idx, src, dst)
-// Use NumThreads to copy Tensor src to Tensor dst with element-wise vectorization up to MaxVecBits.
-// @pre 0 <= @a tid < NumThreads
-// @pre Tensors @a src and @a dst are aligned up to MaxVecBits.
-//      That is, pointers and dynamic strides are assumed to be aligned up to MaxVecBits.
-//
-template <uint32_t NumThreads, uint32_t MaxVecBits,
-          class SrcEngine, class SrcLayout,
-          class DstEngine, class DstLayout>
-CUTE_HOST_DEVICE
-void
-cooperative_copy(uint32_t                     const& tid,
-                 Tensor<SrcEngine, SrcLayout> const& src,
-                 Tensor<DstEngine, DstLayout>      & dst)
-{
-  // Assumes the shapes are static, can generalize/fallback
-  CUTE_STATIC_ASSERT_V(is_static<decltype(shape(src))>{} && is_static<decltype(shape(dst))>{});
-  CUTE_STATIC_ASSERT_V(size(src) == size(dst));
-  // Assumes the types are the same, can generalize/fallback
-  static_assert(cute::is_same<typename SrcEngine::value_type, typename DstEngine::value_type>::value);
-  static_assert(MaxVecBits == sizeof_bits_v<typename SrcEngine::value_type> ||
-                MaxVecBits == 8 || MaxVecBits == 16 || MaxVecBits == 32 || MaxVecBits == 64 || MaxVecBits == 128,
-                "Expected MaxVecBits to be value size or 8 or 16 or 32 or 64 or 128 for alignment and performance.");
-  // Check that the tensors are likely shared across threads: either gmem or smem
-  static_assert((is_gmem<SrcEngine>::value || is_smem<SrcEngine>::value),
-                "cooperative_copy expects shared gmem or smem source tensor.");
-  static_assert((is_gmem<DstEngine>::value || is_smem<DstEngine>::value),
-                "cooperative_copy expects shared gmem or smem destination tensor.");
-  // Precondition on tid in DEBUG
-  assert(tid < NumThreads);
-  // Precondition on pointer alignment in DEBUG
-  assert(is_byte_aligned<ceil_div(MaxVecBits,8u)>(raw_pointer_cast(src.data())));
-  assert(is_byte_aligned<ceil_div(MaxVecBits,8u)>(raw_pointer_cast(dst.data())));
-
-#if 0
-      if (thread0()) {
-        print("   "); print("cooperative_copy\n");
-        print("   "); print("NumThreads: "); print(NumThreads); print("\n");
-        print("   "); print("MaxVecBits: "); print(MaxVecBits); print("\n");
-        print("   "); print("src: "); print(src); print("\n");
-        print("   "); print("dst: "); print(dst); print("\n");
-      }
-#ifdef __CUDA_ARCH__
-      __syncthreads();
-#endif
-#endif
-
-  // The common layout of the two tensors that can be vectorized over elements and threads
-  // vidx -> coord
-  auto common_layout = heuristic_permutation(src, dst);
-
-  // Apply
-  // (V, rest)
-  Tensor src_a = coalesce(logical_divide(src, common_layout), Shape<_1,_1>{});
-  Tensor dst_a = coalesce(logical_divide(dst, common_layout), Shape<_1,_1>{});
-
-  //
-  // Determine vectorization of elems and thrs based on src/dst size and number of threads
-  // NOTE: This heuristic promotes parallelization over vectorization
-  //
-
-  // The number of elements and number of bits
-  constexpr int  elem_bits = sizeof_bits_v<typename SrcEngine::value_type>;
-  constexpr int total_elem = size(SrcLayout{});
-
-  // The number of elements that can be vectorized in values
-  constexpr int common_elem = decltype(max_common_vector(src_a, dst_a))::value;
-
-#if 0
-      if (thread0()) {
-        print("   "); print("common_layout: "); print(common_layout); print("\n");
-        print("   "); print("src_a: "); print(src_a); print("\n");
-        print("   "); print("dst_a: "); print(dst_a); print("\n");
-      }
-#ifdef __CUDA_ARCH__
-      __syncthreads();
-#endif
-#endif
-
-  //
-  if constexpr (total_elem % NumThreads != 0) {
-    // Not attempting to find a partitioning pattern, fallback to dynamically indexed slowpath
-
-    if constexpr (common_elem > 1 && MaxVecBits > elem_bits) {
-      // If the vectorization is non-trivial and divides the maximum vectorizations, then vectorize
-      constexpr auto max_align_src = elem_bits * decltype(max_alignment(src_a.layout()))::value;
-      constexpr auto max_align_dst = elem_bits * decltype(max_alignment(dst_a.layout()))::value;
-      constexpr auto vec_bits = gcd(max_align_src, max_align_dst, MaxVecBits);
-      using VecType = uint_bit_t<vec_bits>;
-
-      static_assert(vec_bits % elem_bits == 0, "Expected divisibility");
-      static_assert((vec_bits >= 8), "No support for subbyte copying");
-
-      Tensor src_v = recast<VecType const>(src_a);
-      Tensor dst_v = recast<VecType      >(dst_a);
-
-#if 0
-      if (thread0()) {
-        print("   "); print("cooperative_copy -- naive\n");
-        print("   "); print("src_v: "); print(src_v); print("\n");
-        print("   "); print("dst_v: "); print(dst_v); print("\n");
-      }
-#ifdef __CUDA_ARCH__
-      __syncthreads();
-#endif
-#endif
-
-      naive_cooperative_copy<NumThreads>(tid, src_v, dst_v);
-    } else {
-      naive_cooperative_copy<NumThreads>(tid, src_a, dst_a);
-    }
-  } else {
-    // If the tensors can be equally partitioned by the threads,
-    // compute vectorization widths in elements and threads.
-
-    // If there are too many threads to allow a full vectorized copy, trunc the vectorization
-    constexpr int total_bits = total_elem * elem_bits;
-    constexpr int max_bits_per_thr = total_bits / NumThreads;
-    // At least elem_bits, at most common_bits
-    constexpr int common_bits = common_elem * elem_bits;
-    constexpr int vec_bits = cute::max(elem_bits, cute::gcd(common_bits, int(MaxVecBits), max_bits_per_thr));
-
-    // Should account for vec_bits < 8 and/or vec_elem <= 1
-    // And also account for subbyte types, which could cause race conditions
-    // Want to ENFORCE sufficient vectorization in those cases
-    static_assert(vec_bits % elem_bits == 0, "Expected divisibility");
-    static_assert(vec_bits >= 8, "No support for subbyte copying");
-
-    using VecType = uint_bit_t<vec_bits>;
-    constexpr int vec_elem = vec_bits / elem_bits;
-
-    constexpr int vec_thrs = cute::min(int(NumThreads), total_elem / vec_elem);
-
-    //
-    // Determine the partitioning patterns for the vec_elems and vec_thrs
-    //
-
-    // Distribute the rest of the V*T to some consistent portion outside of the common_layout, if needed
-    auto common_domain_src = domain_distribute(shape(src_a), Int<vec_elem*vec_thrs>{});
-    auto common_domain_dst = domain_distribute(shape(dst_a), Int<vec_elem*vec_thrs>{});
-
-    // Make sure for now, could fall back here instead
-    CUTE_STATIC_ASSERT_V(size(common_domain_src) == Int<vec_elem*vec_thrs>{});
-    CUTE_STATIC_ASSERT_V(compatible(common_domain_src, common_domain_dst) ||
-                         compatible(common_domain_dst, common_domain_src));
-    // Use the "more specific" domain for the extra elements of V*T
-    auto common_domain = conditional_return(compatible(common_domain_src, common_domain_dst),
-                                            common_domain_dst, common_domain_src);
-
-    // Construct the tiler
-    auto tiler_vt = common_domain.with_shape(Int<vec_elem>{}, Int<vec_thrs>{});
-
-    // Apply and slice
-    Tensor src_v = logical_divide(src_a, tiler_vt)(make_coord(_,tid),_);
-    Tensor dst_v = logical_divide(dst_a, tiler_vt)(make_coord(_,tid),_);
-
-#if 0
-      if (thread0()) {
-        print("   "); print("cooperative_copy -- vec\n");
-        print("   "); print("Used vector: ");  print(vec_elem); print("\n");
-        print("   "); print("Used threads: ");  print(vec_thrs); print("\n");
-        print("   "); print("tiler_vt: "); print(tiler_vt); print("\n");
-        print("   "); print("src_v: "); print(src_v); print("\n");
-        print("   "); print("dst_v: "); print(dst_v); print("\n");
-        print("   "); print("recast<VecType const>(src_v): "); print(recast<VecType const>(src_v)); print("\n");
-        print("   "); print("recast<VecType      >(dst_v): "); print(recast<VecType      >(dst_v)); print("\n");
-      }
-#ifdef __CUDA_ARCH__
-      __syncthreads();
-#endif
-#endif
-
-    // If we're using all threads (static) or the tid is in-range (dynamic)
-    if (vec_thrs == NumThreads or tid < vec_thrs) {
-      return copy_if(TrivialPredTensor{}, recast<VecType const>(src_v), recast<VecType>(dst_v));
-    }
-  }
-}
-
-// Default max-vectorization size to value_type size
-template <uint32_t NumThreads,
-          class SrcEngine, class SrcLayout,
-          class DstEngine, class DstLayout>
-CUTE_HOST_DEVICE
-void
-cooperative_copy(uint32_t                     const& tid,
-                 Tensor<SrcEngine, SrcLayout> const& src,
-                 Tensor<DstEngine, DstLayout>      & dst)
-{
-  constexpr uint32_t MaxVecBits = sizeof_bits_v<typename SrcEngine::value_type>;
-  return cooperative_copy<NumThreads, MaxVecBits>(tid, src, dst);
-}
-
-//
-// Accept mutable temporaries
-//
-
-template <uint32_t NumThreads,
-          class SrcEngine, class SrcLayout,
-          class DstEngine, class DstLayout>
-CUTE_HOST_DEVICE
-void
-cooperative_copy(uint32_t                     const& tid,
-                 Tensor<SrcEngine, SrcLayout> const& src,
-                 Tensor<DstEngine, DstLayout>     && dst)
-{
-  return cooperative_copy<NumThreads>(tid, src, dst);
-}
-
-template <uint32_t NumThreads, uint32_t MaxVecBits,
-          class SrcEngine, class SrcLayout,
-          class DstEngine, class DstLayout>
-CUTE_HOST_DEVICE
-void
-cooperative_copy(uint32_t                     const& tid,
-                 Tensor<SrcEngine, SrcLayout> const& src,
-                 Tensor<DstEngine, DstLayout>     && dst)
-{
-  return cooperative_copy<NumThreads, MaxVecBits>(tid, src, dst);
-}
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/algorithm/cooperative_gemm.hpp b/lightllm-kernel/cutlass/include/cute/algorithm/cooperative_gemm.hpp
deleted file mode 100755
index 2c91ce6f4..000000000
--- a/lightllm-kernel/cutlass/include/cute/algorithm/cooperative_gemm.hpp
+++ /dev/null
@@ -1,512 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>
-#include <cute/util/type_traits.hpp>
-
-#include <cute/atom/mma_atom.hpp>
-
-#include <cute/algorithm/axpby.hpp>
-#include <cute/algorithm/functional.hpp>
-#include <cute/algorithm/gemm.hpp>
-
-#include <cute/tensor_impl.hpp>
-
-namespace cute
-{
-
-//
-// Cooperative Shared-Memory GEMMs
-//
-
-namespace detail {
-
-// Predicated Cooperative GEMM
-template <class... Args,
-          class Alpha, class TA, class ALayout, class TB, class BLayout,
-          class Beta,  class TC, class CLayout,
-          class ALoadTransformOp, class BLoadTransformOp,
-          class CLoadTransformOp, class CStoreTransformOp,
-          __CUTE_REQUIRES(ALayout::rank == 2 && is_smem<TA>::value &&
-                          BLayout::rank == 2 && is_smem<TB>::value &&
-                          CLayout::rank == 2 && is_smem<TC>::value)>
-CUTE_HOST_DEVICE
-void
-cooperative_gemm_predication(ThrMMA<Args...> const& thr_mma,
-                             Alpha const& alpha,
-                             Tensor<TA, ALayout> sA,
-                             Tensor<TB, BLayout> sB,
-                             Beta  const& beta,
-                             Tensor<TC, CLayout> sC,
-                             ALoadTransformOp  const& sA_load_op,  // transforms A values before use in GEMM
-                             BLoadTransformOp  const& sB_load_op,  // transforms B values before use in GEMM
-                             CLoadTransformOp  const& sC_load_op,  // transforms C values before use in GEMM
-                             CStoreTransformOp const& sC_store_op) // transforms results before they are stored to C
-{
-  using TypeA = typename TA::value_type;
-  using TypeB = typename TB::value_type;
-  using TypeC = typename TC::value_type;
-
-  //
-  // MMA Partitioning
-  //
-
-  // Partition the sA, sB, and sC tiles across the threads for the MMA
-  Tensor tCsA = thr_mma.partition_A(sA);                            // (MMA,MMA_M,MMA_K)
-  Tensor tCsB = thr_mma.partition_B(sB);                            // (MMA,MMA_N,MMA_K)
-  Tensor tCsC = thr_mma.partition_C(sC);                            // (MMA,MMA_M,MMA_N)
-
-  // Create register tensors for the MMA to operate on
-  Tensor tCrA = thr_mma.make_fragment_A(tCsA);                      // (MMA,MMA_M,MMA_K)
-  Tensor tCrB = thr_mma.make_fragment_B(tCsB);                      // (MMA,MMA_N,MMA_K)
-  Tensor tCrC = thr_mma.make_fragment_C(tCsC);                      // (MMA,MMA_M,MMA_N)
-
-#if 0
-  if (thread0()) {
-    print("  sA: "); print(  sA); print("\n");
-    print("  sB: "); print(  sB); print("\n");
-    print("  sC: "); print(  sC); print("\n");
-    print(thr_mma);
-    print("tCsA: "); print(tCsA); print("\n");
-    print("tCsB: "); print(tCsB); print("\n");
-    print("tCsC: "); print(tCsC); print("\n");
-    print("tCrA: "); print(tCrA); print("\n");
-    print("tCrB: "); print(tCrB); print("\n");
-    print("tCrC: "); print(tCrC); print("\n");
-  }
-#endif
-
-  //
-  // PREDICATION
-  //
-
-  // Create coordinate tensors for the problem
-  Tensor cA = make_identity_tensor(shape(sA));                      // (M,K) -> (m,k)
-  Tensor cB = make_identity_tensor(shape(sB));                      // (N,K) -> (n,k)
-
-  // Repeat partitioning with thr_mma
-  Tensor tCcA = thr_mma.partition_A(cA);                            // (MMA,MMA_M,MMA_K) -> (m,k)
-  Tensor tCcB = thr_mma.partition_B(cB);                            // (MMA,MMA_N,MMA_K) -> (n,k)
-
-  // Allocate the preds for MMA- and MMA_MN-modes
-  Tensor tCpA = make_tensor<bool>(make_shape(size<0>(tCsA), size<1>(tCsA)));
-  Tensor tCpB = make_tensor<bool>(make_shape(size<0>(tCsB), size<1>(tCsB)));
-
-  // Populate the predicates on M and N
-  CUTE_UNROLL
-  for (int i = 0; i < size(tCpA); ++i) {
-    tCpA(i) = elem_less(get<0>(tCcA(_,_,Int<0>{})(i)), shape<0>(sA));
-  }
-  CUTE_UNROLL
-  for (int i = 0; i < size(tCpB); ++i) {
-    tCpB(i) = elem_less(get<0>(tCcB(_,_,Int<0>{})(i)), shape<0>(sB));
-  }
-
-#if 0
-  if (thread0()) {
-    print("  cA: "); print(  cA); print("\n");
-    print("  cB: "); print(  cB); print("\n");
-    print("tCcA: "); print(tCcA); print("\n");
-    print("tCcB: "); print(tCcB); print("\n");
-    print_tensor(tCpA);
-    print_tensor(tCpB);
-  }
-#endif
-
-  //
-  // PREFETCH k_block = 0
-  //   Condition the k-predication on (static) k_block == K_BLOCK_MAX-1, the last k_block
-  //   Assumes the MMA-tiling in K is trivial
-  //
-
-  constexpr int K_BLOCK_MAX = size<2>(tCrA);
-
-  CUTE_UNROLL
-  for (int m = 0; m < size<1>(tCrA); ++m) {     // Copy MMA_M
-    CUTE_UNROLL
-    for (int i = 0; i < size<0>(tCrA); ++i) {   // Copy MMA_I
-      tCrA(i,m,0) = (tCpA(i,m) && (0 < K_BLOCK_MAX-1 || elem_less(get<1>(tCcA(i,m,0)), shape<1>(sA)))) ? sA_load_op(tCsA(i,m,0)) : TypeA{};
-    }
-  }
-  CUTE_UNROLL
-  for (int n = 0; n < size<1>(tCrB); ++n) {     // Copy MMA_N
-    CUTE_UNROLL
-    for (int i = 0; i < size<0>(tCrB); ++i) {   // Copy MMA_I
-      tCrB(i,n,0) = (tCpB(i,n) && (0 < K_BLOCK_MAX-1 || elem_less(get<1>(tCcB(i,n,0)), shape<1>(sB)))) ? sB_load_op(tCsB(i,n,0)) : TypeB{};
-    }
-  }
-  //
-  // MAINLOOP
-  //
-
-  // Clear accumulators
-  clear(tCrC);
-
-  CUTE_UNROLL
-  for (int k_block = 0; k_block < K_BLOCK_MAX; ++k_block)
-  {
-    if (k_block < K_BLOCK_MAX-1)   // static-if not the last k_block
-    {
-      int k_next = k_block + 1;    // Load k_next block
-
-      //   Condition the k-predication on (static) k_block == K_BLOCK_MAX-1, the last k_block
-      //   Assumes the MMA-tiling in K is trivial
-
-      CUTE_UNROLL
-      for (int m = 0; m < size<1>(tCrA); ++m) {       // Copy MMA_M
-        CUTE_UNROLL
-        for (int i = 0; i < size<0>(tCrA); ++i) {     // Copy MMA_I
-          tCrA(i,m,k_next) = (tCpA(i,m) && (k_next < K_BLOCK_MAX-1 || elem_less(get<1>(tCcA(i,m,k_next)), shape<1>(sA)))) ? sA_load_op(tCsA(i,m,k_next)) : TypeA{};
-        }
-      }
-      CUTE_UNROLL
-      for (int n = 0; n < size<1>(tCrB); ++n) {       // Copy MMA_N
-        CUTE_UNROLL
-        for (int i = 0; i < size<0>(tCrB); ++i) {     // Copy MMA_I
-          tCrB(i,n,k_next) = (tCpB(i,n) && (k_next < K_BLOCK_MAX-1 || elem_less(get<1>(tCcB(i,n,k_next)), shape<1>(sB)))) ? sB_load_op(tCsB(i,n,k_next)) : TypeB{};
-        }
-      }
-    }
-    // GEMM on k_block in registers
-    gemm(thr_mma, tCrA(_,_,k_block), tCrB(_,_,k_block), tCrC);
-  }
-
-  //
-  // Epilogue
-  //
-
-  // Create coordinate tensors for the problem
-  Tensor cC   = make_identity_tensor(shape(sC));                     // (M,N) -> (m,n)
-  // Repeat partitioning with thr_mma
-  Tensor tCcC = thr_mma.partition_C(cC);                             // (MMA,MMA_M,MMA_N) -> (m,n)
-
-  const bool isBetaZero = (beta == Beta{});
-
-  // Custom axpby_if for now
-  CUTE_UNROLL
-  for (int i = 0; i < size(tCrC); ++i)
-  {
-    if (elem_less(tCcC(i), shape(sC)))
-    {
-      tCsC(i) = sC_store_op(isBetaZero ? alpha * static_cast<TypeC>(tCrC(i))
-                                       : alpha * static_cast<TypeC>(tCrC(i)) +
-                                          beta * static_cast<TypeC>(sC_load_op(tCsC(i))));
-    }
-  }
-}
-
-// Slow fallback path
-template <class... Args,
-          class Alpha, class TA, class ALayout, class TB, class BLayout,
-          class Beta,  class TC, class CLayout,
-          class ALoadTransformOp, class BLoadTransformOp,
-          class CLoadTransformOp, class CStoreTransformOp,
-          __CUTE_REQUIRES(ALayout::rank == 2 && is_smem<TA>::value &&
-                          BLayout::rank == 2 && is_smem<TB>::value &&
-                          CLayout::rank == 2 && is_smem<TC>::value)>
-CUTE_HOST_DEVICE
-void
-cooperative_gemm_predication(uint32_t thread_idx,
-                             TiledMMA<Args...> const& tiled_mma,
-                             Alpha const& alpha,
-                             Tensor<TA, ALayout> sA,
-                             Tensor<TB, BLayout> sB,
-                             Beta  const& beta,
-                             Tensor<TC, CLayout> sC,
-                             ALoadTransformOp  const& sA_load_op,  // transforms A values before use in GEMM
-                             BLoadTransformOp  const& sB_load_op,  // transforms B values before use in GEMM
-                             CLoadTransformOp  const& sC_load_op,  // transforms C values before use in GEMM
-                             CStoreTransformOp const& sC_store_op) // transforms results before they are stored to C
-{
-  // ThrMMA
-  auto thr_mma = tiled_mma.get_thread_slice(thread_idx);
-  cooperative_gemm_predication(thr_mma, alpha, sA, sB, beta, sC, sA_load_op, sB_load_op, sC_load_op, sC_store_op);
-}
-
-// Unpredicated Cooperative GEMM
-template <class SmemCopyOpA, class SmemCopyOpB, class SmemCopyOpC,
-          class... Args,
-          class Alpha, class TA, class ALayout, class TB, class BLayout,
-          class Beta,  class TC, class CLayout,
-          class ALoadTransformOp, class BLoadTransformOp,
-          class CLoadTransformOp, class CStoreTransformOp,
-          __CUTE_REQUIRES(ALayout::rank == 2 && is_smem<TA>::value &&
-                          BLayout::rank == 2 && is_smem<TB>::value &&
-                          CLayout::rank == 2 && is_smem<TC>::value)>
-CUTE_HOST_DEVICE
-void
-cooperative_gemm_no_predication(uint32_t thread_idx,
-                                TiledMMA<Args...> const& tiled_mma,
-                                Alpha const& alpha,
-                                Tensor<TA, ALayout> sA,
-                                Tensor<TB, BLayout> sB,
-                                Beta  const& beta,
-                                Tensor<TC, CLayout> sC,
-                                ALoadTransformOp  const& sA_load_op,  // transforms A values before use in GEMM
-                                BLoadTransformOp  const& sB_load_op,  // transforms B values before use in GEMM
-                                CLoadTransformOp  const& sC_load_op,  // transforms C values before use in GEMM
-                                CStoreTransformOp const& sC_store_op) // transforms results before they are stored to C
-{
-  using TypeA = typename TA::value_type;
-  using TypeB = typename TB::value_type;
-  using TypeC = typename TC::value_type;
-
-  // ThrMMA
-  auto thr_mma = tiled_mma.get_thread_slice(thread_idx);
-
-  //
-  // MMA Partitioning
-  //
-
-  Tensor tCsC = thr_mma.partition_C(sC);
-  // Create register tensors for the MMA to operate on
-  Tensor tCrA  = thr_mma.partition_fragment_A(sA);                    // (MMA,MMA_M,MMA_K)
-  Tensor tCrB  = thr_mma.partition_fragment_B(sB);                    // (MMA,MMA_N,MMA_K)
-  Tensor tCrC  = thr_mma.make_fragment_C(tCsC);                       // (MMA,MMA_M,MMA_N)
-
-  using CopyOpAType = SmemCopyOpA;
-  using CopyOpBType = SmemCopyOpB;
-
-  auto smem_tiled_copy_A = make_tiled_copy_A(Copy_Atom<CopyOpAType, TypeA>{}, thr_mma);
-  auto smem_thr_copy_A   = smem_tiled_copy_A.get_thread_slice(thread_idx);
-  Tensor tCsA            = smem_thr_copy_A.partition_S(sA);
-  Tensor tCrA_copy_view  = smem_thr_copy_A.retile_D(tCrA);
-  CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(tCrA_copy_view));             // CPY_M
-  CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCrA_copy_view));             // CPY_K
-
-  auto smem_tiled_copy_B = make_tiled_copy_B(Copy_Atom<CopyOpBType, TypeB>{}, thr_mma);
-  auto smem_thr_copy_B   = smem_tiled_copy_B.get_thread_slice(thread_idx);
-  Tensor tCsB            = smem_thr_copy_B.partition_S(sB);
-  Tensor tCrB_copy_view  = smem_thr_copy_B.retile_D(tCrB);
-  CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<1>(tCrB_copy_view));            // CPY_N
-  CUTE_STATIC_ASSERT_V(size<2>(tCsB) == size<2>(tCrB_copy_view));            // CPY_K
-
-#if 0
-  if (thread0()) {
-    print("  sA: "); print(sA); print("\n");
-    print("  sB: "); print(sB); print("\n");
-    print("  sC: "); print(sC); print("\n");
-    print(thr_mma); print("\n");
-    print("tCsC: "); print(tCsC); print("\n");
-    print("tCrA: "); print(tCrA); print("\n");
-    print("tCrB: "); print(tCrB); print("\n");
-    print("tCrC: "); print(tCrC); print("\n");
-    print(smem_thr_copy_A); print("\n");
-    print("tCsA: "); print(tCsA); print("\n");
-    print("tCrA_copy_view: "); print(tCrA_copy_view); print("\n");
-    print(smem_thr_copy_B); print("\n");
-    print("tCsB: "); print(tCsB); print("\n");
-    print("tCrB_copy_view: "); print(tCrB_copy_view); print("\n");
-  }
-#endif
-
-  //
-  // PREFETCH
-  //
-
-  copy(smem_tiled_copy_A, tCsA(_,_,Int<0>{}), tCrA_copy_view(_,_,Int<0>{}));
-  copy(smem_tiled_copy_B, tCsB(_,_,Int<0>{}), tCrB_copy_view(_,_,Int<0>{}));
-  //
-  // MAINLOOP
-  //
-
-  // Clear accumulators
-  clear(tCrC);
-
-  constexpr int K_BLOCK_MAX = size<2>(tCrA);
-
-  CUTE_UNROLL
-  for (int k_block = 0; k_block < K_BLOCK_MAX; ++k_block)
-  {
-    // static-if load the next k_block. No k-predication required on these loads.
-    if (k_block < K_BLOCK_MAX-1)
-    {
-      // Load the next k_block
-      int k_next = k_block + 1;       // statically unrolled
-      copy(smem_tiled_copy_A, tCsA(_,_,k_next), tCrA_copy_view(_,_,k_next));
-      copy(smem_tiled_copy_B, tCsB(_,_,k_next), tCrB_copy_view(_,_,k_next));
-    }
-
-    // Transform A and B, relying on the compiler to remove in case of identity ops
-    cute::transform(tCrA(_,_,k_block), sA_load_op);
-    cute::transform(tCrB(_,_,k_block), sB_load_op);
-
-    // GEMM on k_block in registers
-    gemm(thr_mma, tCrA(_,_,k_block), tCrB(_,_,k_block), tCrC);
-  }
-
-  //
-  // Epilogue
-  //
-
-  auto isBetaZero = [&] () {
-    if constexpr (is_complex<Beta>::value) {
-      return beta.real() == Int<0>{} && beta.imag() == Int<0>{};
-    }
-    else {
-      return beta == Int<0>{};
-    }
-    CUTE_GCC_UNREACHABLE;
-  } ();
-
-  using CopyOpCType = SmemCopyOpC;
-  Tensor tCrD = thr_mma.make_fragment_C(tCsC);
-  if(!isBetaZero) {
-    copy(CopyOpCType{}, tCsC, tCrD);
-    // Transform C on/after load
-    cute::transform(tCrD, sC_load_op);
-  }
-  // C = alpha * (A * B) + beta * C
-  axpby(alpha, tCrC, beta, tCrD);
-  // Transform C before/on store
-  cute::transform(tCrD, sC_store_op);
-  copy(CopyOpCType{}, tCrD, tCsC);
-}
-
-} // end namespace detail
-
-template <class SmemCopyOpA, class SmemCopyOpB, class SmemCopyOpC,
-          class... Args,
-          class Alpha, class TA, class ALayout, class TB, class BLayout,
-          class Beta,  class TC, class CLayout,
-          class ALoadTransformOp = cute::identity, class BLoadTransformOp  = cute::identity,
-          class CLoadTransformOp = cute::identity, class CStoreTransformOp = cute::identity,
-          __CUTE_REQUIRES(ALayout::rank == 2 && is_smem<TA>::value &&
-                          BLayout::rank == 2 && is_smem<TB>::value &&
-                          CLayout::rank == 2 && is_smem<TC>::value)>
-CUTE_HOST_DEVICE
-void
-cooperative_gemm(uint32_t thread_idx,
-                 TiledMMA<Args...> const& tiled_mma,
-                 Alpha const& alpha,
-                 Tensor<TA, ALayout> sA,
-                 Tensor<TB, BLayout> sB,
-                 Beta  const& beta,
-                 Tensor<TC, CLayout> sC,
-                 ALoadTransformOp  const& sA_load_op  = {}, // transforms A values before use in GEMM
-                 BLoadTransformOp  const& sB_load_op  = {}, // transforms B values before use in GEMM
-                 CLoadTransformOp  const& sC_load_op  = {}, // transforms C values before use in GEMM
-                 CStoreTransformOp const& sC_store_op = {}) // transforms results before they are stored to C
-{
-  CUTE_STATIC_ASSERT_V(size<0>(sA) == size<0>(sC));  // AM == CM
-  CUTE_STATIC_ASSERT_V(size<0>(sB) == size<1>(sC));  // BN == CN
-  CUTE_STATIC_ASSERT_V(size<1>(sA) == size<1>(sB));  // AK == BK
-
-  using TypeA = typename TA::value_type;
-  using TypeB = typename TB::value_type;
-  using TypeC = typename TC::value_type;
-
-  static_assert(is_convertible_v<decay_t<invoke_result_t<ALoadTransformOp, TypeA>>, TypeA>,
-    "ALoadTransformOp functor must accept value of type TA::value_type and return value convertible to type TA::value_type");
-  static_assert(is_convertible_v<decay_t<invoke_result_t<BLoadTransformOp, TypeB>>, TypeB>,
-    "BLoadTransformOp functor must accept value of type TB::value_type and return value convertible to type TB::value_type");
-  static_assert(is_convertible_v<decay_t<invoke_result_t<CLoadTransformOp, TypeC>>, TypeC>,
-    "CLoadTransformOp functor must accept value of type TC::value_type and return value convertible to type TC::value_type");
-  static_assert(is_convertible_v<decay_t<invoke_result_t<CStoreTransformOp, TypeC>>, TypeC>,
-    "CStoreTransformOp functor must accept value of type TC::value_type and return value convertible to type TC::value_type");
-
-  static constexpr bool compat = evenly_divides(make_shape(size<0>(sA), size<0>(sB), size<1>(sA)),
-                                                tile_shape(TiledMMA<Args...>{}));
-  if constexpr (compat) {
-    detail::cooperative_gemm_no_predication<SmemCopyOpA, SmemCopyOpB, SmemCopyOpC>(
-        thread_idx, tiled_mma, alpha, sA, sB, beta, sC,
-        sA_load_op, sB_load_op, sC_load_op, sC_store_op
-    );
-  } else {
-    detail::cooperative_gemm_predication(
-      thread_idx, tiled_mma, alpha, sA, sB, beta, sC,
-      sA_load_op, sB_load_op, sC_load_op, sC_store_op
-    );
-  }
-}
-
-template <class... Args,
-          class Alpha, class TA, class ALayout, class TB, class BLayout,
-          class Beta,  class TC, class CLayout,
-          class ALoadTransformOp = cute::identity, class BLoadTransformOp  = cute::identity,
-          class CLoadTransformOp = cute::identity, class CStoreTransformOp = cute::identity,
-          __CUTE_REQUIRES(ALayout::rank == 2 && is_smem<TA>::value &&
-                          BLayout::rank == 2 && is_smem<TB>::value &&
-                          CLayout::rank == 2 && is_smem<TC>::value)>
-CUTE_HOST_DEVICE
-void
-cooperative_gemm(uint32_t thread_idx,
-                 TiledMMA<Args...> const& tiled_mma,
-                 Alpha const& alpha,
-                 Tensor<TA, ALayout> sA,
-                 Tensor<TB, BLayout> sB,
-                 Beta  const& beta,
-                 Tensor<TC, CLayout> sC,
-                 ALoadTransformOp  const& sA_load_op  = {}, // transforms A values before use in GEMM
-                 BLoadTransformOp  const& sB_load_op  = {}, // transforms B values before use in GEMM
-                 CLoadTransformOp  const& sC_load_op  = {}, // transforms C values before use in GEMM
-                 CStoreTransformOp const& sC_store_op = {}) // transforms results before they are stored to C
-{
-  using CopyOpA = AutoVectorizingCopyWithAssumedAlignment<sizeof_bits_v<typename TA::value_type>>;
-  using CopyOpB = AutoVectorizingCopyWithAssumedAlignment<sizeof_bits_v<typename TB::value_type>>;
-  using CopyOpC = AutoVectorizingCopyWithAssumedAlignment<sizeof_bits_v<typename TC::value_type>>;
-  cooperative_gemm<CopyOpA, CopyOpB, CopyOpC>(
-      thread_idx, tiled_mma, alpha, sA, sB, beta, sC,
-      sA_load_op, sB_load_op, sC_load_op, sC_store_op
-  );
-}
-
-// Legacy overload of cute::gemm for backwards-compatibility
-template <class... Args,
-          class Alpha, class TA, class ALayout, class TB, class BLayout,
-          class Beta,  class TC, class CLayout,
-          class ALoadTransformOp = cute::identity, class BLoadTransformOp  = cute::identity,
-          class CLoadTransformOp = cute::identity, class CStoreTransformOp = cute::identity,
-          __CUTE_REQUIRES(ALayout::rank == 2 && is_smem<TA>::value &&
-                          BLayout::rank == 2 && is_smem<TB>::value &&
-                          CLayout::rank == 2 && is_smem<TC>::value)>
-CUTE_HOST_DEVICE
-void
-gemm(ThrMMA<Args...> const& thr_mma,
-     Alpha const& alpha,
-     Tensor<TA, ALayout> sA,
-     Tensor<TB, BLayout> sB,
-     Beta  const& beta,
-     Tensor<TC, CLayout> sC,
-     ALoadTransformOp  const& sA_load_op  = {}, // transforms A values before use in GEMM
-     BLoadTransformOp  const& sB_load_op  = {}, // transforms B values before use in GEMM
-     CLoadTransformOp  const& sC_load_op  = {}, // transforms C values before use in GEMM
-     CStoreTransformOp const& sC_store_op = {}) // transforms results before they are stored to C
-{
-  // Goes directly to the slow path to avoid getting thread_idx from thr_mma
-  detail::cooperative_gemm_predication(
-    thr_mma, alpha, sA, sB, beta, sC,
-    sA_load_op, sB_load_op, sC_load_op, sC_store_op
-  );
-}
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/algorithm/copy.hpp b/lightllm-kernel/cutlass/include/cute/algorithm/copy.hpp
deleted file mode 100755
index c2decd15d..000000000
--- a/lightllm-kernel/cutlass/include/cute/algorithm/copy.hpp
+++ /dev/null
@@ -1,382 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>            // CUTE_HOST_DEVICE
-#include <cute/tensor_impl.hpp>       // cute::Tensor
-#include <cute/tensor_predicate.hpp>  // cute::TrivialPredTensor
-#include <cute/atom/copy_atom.hpp>    // cute::Copy_Atom
-
-namespace cute
-{
-
-//
-// Accept mutable temporaries
-//
-
-template <class SrcEngine, class SrcLayout,
-          class DstEngine, class DstLayout>
-CUTE_HOST_DEVICE
-void
-copy(Tensor<SrcEngine, SrcLayout> const& src,
-     Tensor<DstEngine, DstLayout>     && dst)
-{
-  return copy(src, dst);
-}
-
-template <class VecType,
-          class SrcEngine, class SrcLayout,
-          class DstEngine, class DstLayout>
-CUTE_HOST_DEVICE
-void
-copy_vec(Tensor<SrcEngine, SrcLayout> const& src,
-         Tensor<DstEngine, DstLayout>     && dst)
-{
-  return copy_vec<VecType>(src, dst);
-}
-
-template <class SrcEngine, class SrcLayout,
-          class DstEngine, class DstLayout>
-CUTE_HOST_DEVICE
-void
-copy_aligned(Tensor<SrcEngine, SrcLayout> const& src,
-             Tensor<DstEngine, DstLayout>     && dst)
-{
-  return copy_aligned(src, dst);
-}
-
-template <class PrdTensor,
-          class SrcEngine, class SrcLayout,
-          class DstEngine, class DstLayout>
-CUTE_HOST_DEVICE
-void
-copy_if(PrdTensor                    const& pred,
-        Tensor<SrcEngine, SrcLayout> const& src,
-        Tensor<DstEngine, DstLayout>     && dst)
-{
-  return copy_if(pred, src, dst);
-}
-
-template <class CopyPolicy,
-          class PrdTensor,
-          class SrcEngine, class SrcLayout,
-          class DstEngine, class DstLayout>
-CUTE_HOST_DEVICE
-void
-copy_if(CopyPolicy                   const& copy_policy,
-        PrdTensor                    const& pred,
-        Tensor<SrcEngine, SrcLayout> const& src,
-        Tensor<DstEngine, DstLayout>     && dst)
-{
-  return copy_if(copy_policy, pred, src, dst);
-}
-
-template <class CopyPolicy,
-          class SrcEngine, class SrcLayout,
-          class DstEngine, class DstLayout>
-CUTE_HOST_DEVICE
-void
-copy(CopyPolicy                   const& copy_policy,
-     Tensor<SrcEngine, SrcLayout> const& src,
-     Tensor<DstEngine, DstLayout>     && dst)
-{
-  return copy(copy_policy, src, dst);
-}
-
-//
-// copy_if -- Predicated Copy
-//
-
-template <class PrdTensor,
-          class SrcEngine, class SrcLayout,
-          class DstEngine, class DstLayout>
-CUTE_HOST_DEVICE
-void
-copy_if(PrdTensor                    const& pred,
-        Tensor<SrcEngine, SrcLayout> const& src,
-        Tensor<DstEngine, DstLayout>      & dst)
-{
-  auto copy_op = select_elementwise_copy(src, dst);
-
-  CUTE_UNROLL
-  for (int i = 0; i < size(src); ++i) {
-    if (pred(i)) {
-      copy_op.copy(src(i), dst(i));
-    }
-  }
-}
-
-//
-// copy_if -- Predicated CopyAtom
-//
-
-namespace detail {
-
-// Trait that detects if atom's traits has a member function with(bool)
-template <class, class Enable = void>
-constexpr bool has_with_bool = false;
-
-template <class T>
-constexpr bool has_with_bool<T, cute::void_t<decltype(declval<typename T::Traits>().with(declval<bool>()))>> = true;
-
-} // end namespace detail
-
-template <class... CopyArgs,
-          class PredTensor,
-          class SrcEngine, class SrcLayout,
-          class DstEngine, class DstLayout>
-CUTE_HOST_DEVICE
-void
-copy_if(Copy_Atom<CopyArgs...>       const& copy_atom,
-        PredTensor                   const& pred,      // (Rest...)
-        Tensor<SrcEngine, SrcLayout> const& src,       // (V,Rest...)
-        Tensor<DstEngine, DstLayout>      & dst)       // (V,Rest...)
-{
-  static_assert(SrcLayout::rank == DstLayout::rank, "CopyAtom rank-mismatch.");
-  if constexpr (SrcLayout::rank == 1) {   // Dispatch the copy
-    copy_atom.call(src, dst);
-  } else {                                // Loop over all but the first mode
-    constexpr int R = SrcLayout::rank;
-    Tensor src_v = group_modes<1,R>(src);
-    Tensor dst_v = group_modes<1,R>(dst);
-    CUTE_UNROLL
-    for (int i = 0; i < size<1>(src_v); ++i) {
-      // If copy traits can be transformed with a predicate value, do it, otherwise branch here
-      if constexpr (detail::has_with_bool<Copy_Atom<CopyArgs...>>) {
-        copy_atom.with(pred(i)).call(src_v(_,i), dst_v(_,i));
-      } else {
-        if (pred(i)) {
-          copy_atom.call(src_v(_,i), dst_v(_,i));
-        }
-      }
-    }
-  }
-}
-
-//
-// copy_vec -- attempt vectorized copy with VecType
-//
-
-template <class VecType,
-          class SrcEngine, class SrcLayout,
-          class DstEngine, class DstLayout>
-CUTE_HOST_DEVICE
-void
-copy_vec(Tensor<SrcEngine, SrcLayout> const& src,
-         Tensor<DstEngine, DstLayout>      & dst)
-{
-  static_assert(sizeof_bits_v<VecType> >= 8 && sizeof_bits_v<VecType> % 8 == 0,
-                "Expected a vectorization type of at least a byte.");
-  using SrcType = typename SrcEngine::value_type;
-  using DstType = typename DstEngine::value_type;
-  if constexpr (cute::is_same<SrcType, DstType>::value &&
-                sizeof_bits_v<VecType>  > sizeof_bits_v<DstType>)
-  {
-    // Preserve volatility of Src/Dst types.
-    using SrcVecType = conditional_t<is_volatile_v<typename SrcEngine::element_type>, VecType const volatile, VecType const>;
-    using DstVecType = conditional_t<is_volatile_v<typename DstEngine::element_type>, VecType       volatile, VecType      >;
-    Tensor src_v = recast<SrcVecType>(src);
-    Tensor dst_v = recast<DstVecType>(dst);
-
-#if 0
-    if (thread0()) {
-      print("copy_vec<%db> -- vectorizing copy:\n", int(sizeof_bits_v<VecType>));
-      print("   "); print(src); print(" => "); print(src_v); print("\n");
-      print("   "); print(dst); print(" => "); print(dst_v); print("\n");
-    }
-#endif
-
-    return copy_if(TrivialPredTensor{}, src_v, dst_v);
-  } else {
-#if 0
-  if (thread0()) {
-    print("copy_vec<%db> -- NOT vectorizing copy:\n", int(sizeof_bits_v<VecType>));
-    print("   "); print(src); print("\n");
-    print("   "); print(dst); print("\n");
-  }
-#endif
-
-    return copy_if(TrivialPredTensor{}, src, dst);
-  }
-}
-
-//
-// copy -- CopyAtom
-//
-
-template <class... CopyArgs,
-          class SrcEngine, class SrcLayout,
-          class DstEngine, class DstLayout>
-CUTE_HOST_DEVICE
-void
-copy(Copy_Atom<CopyArgs...>       const& copy_atom,
-     Tensor<SrcEngine, SrcLayout> const& src,
-     Tensor<DstEngine, DstLayout>      & dst)
-{
-  return copy_if(copy_atom, TrivialPredTensor{}, src, dst);
-}
-
-//////////////////////////////////////////
-// Special Auto-Vectorizing Overloads
-//////////////////////////////////////////
-
-// Specialization for AutoVectorizingCopyAssumedAlignment<MaxVecBits>
-template <int MaxVecBits, class... Args,
-          class SrcEngine, class SrcLayout,
-          class DstEngine, class DstLayout>
-CUTE_HOST_DEVICE
-void
-copy(AutoVectorizingCopyWithAssumedAlignment<MaxVecBits> const&,
-     Tensor<SrcEngine, SrcLayout>                        const& src,
-     Tensor<DstEngine, DstLayout>                             & dst)
-{
-  constexpr int vec_elem = decltype(max_common_vector(src, dst))::value;
-
-  constexpr int max_align_src = decltype(max_alignment(src.layout()))::value;
-  constexpr int max_align_dst = decltype(max_alignment(dst.layout()))::value;
-  constexpr int max_align     = gcd(vec_elem, max_align_src, max_align_dst);
-
-  constexpr int src_bits = sizeof_bits<typename SrcEngine::value_type>::value;
-  constexpr int vec_bits = gcd(src_bits * max_align, MaxVecBits);
-
-  if constexpr (vec_elem > 1 && vec_bits >= 8) {
-    // If more than one element vectorizes to 8bits or more, then copy_vec
-#if 0
-    if (thread0()) {
-      print("copy -- found max_common_vector of %d elems and vectorization to %d bits\n", vec_elem, vec_bits);
-      print("   "); print(src); print("\n");
-      print("   "); print(dst); print("\n");
-    }
-#endif
-    return copy_vec<uint_bit_t<vec_bits>>(src, dst);
-  } else {
-    return copy_if(TrivialPredTensor{}, src, dst);
-  }
-}
-
-// Auto-vectorizing copy for static layouts
-template <class SrcEngine, class SrcLayout,
-          class DstEngine, class DstLayout>
-CUTE_HOST_DEVICE
-void
-copy(Tensor<SrcEngine, SrcLayout> const& src,
-     Tensor<DstEngine, DstLayout>      & dst)
-{
-  if constexpr (is_static<SrcLayout>::value && is_static<DstLayout>::value) {
-    // Assume Tensors with static layouts (e.g. registers) have pointers that are 128b aligned
-    return copy(AutoVectorizingCopyWithAssumedAlignment<128>{}, src, dst);
-  } else {
-    // Do not assume that dynamic layouts are aligned.
-    return copy(AutoVectorizingCopyWithAssumedAlignment<8>{}, src, dst);
-  }
-}
-
-// Auto-vectorizing copy with assumed alignment up to 128bit.
-template <class SrcEngine, class SrcLayout,
-          class DstEngine, class DstLayout>
-CUTE_HOST_DEVICE
-void
-copy_aligned(Tensor<SrcEngine, SrcLayout> const& src,
-             Tensor<DstEngine, DstLayout>      & dst)
-{
-  return copy(AutoVectorizingCopyWithAssumedAlignment<128>{}, src, dst);
-}
-
-// Specializaton for Atom AutoVectorizingCopyAssumedAlignment
-template <int MaxVecBits, class... Args,
-          class SrcEngine, class SrcLayout,
-          class DstEngine, class DstLayout>
-CUTE_HOST_DEVICE
-void
-copy(Copy_Atom<AutoVectorizingCopyWithAssumedAlignment<MaxVecBits>, Args...> const&,
-     Tensor<SrcEngine, SrcLayout>                                            const& src,
-     Tensor<DstEngine, DstLayout>                                                 & dst)
-{
-  return copy(AutoVectorizingCopyWithAssumedAlignment<MaxVecBits>{}, src, dst);
-}
-
-#if defined(CUTE_COPY_ATOM_TMA_SM90_ENABLED)
-template <class... CT_Args,
-          class SrcEngine, class SrcLayout,
-          class DstEngine, class DstLayout>
-CUTE_HOST_DEVICE
-void
-copy(Copy_Traits<SM90_BULK_COPY_AUTO, CT_Args...> const& atom,  // Copy_Traits may or may not have the memory barrier in it already
-     Tensor<SrcEngine, SrcLayout>                 const& src,
-     Tensor<DstEngine, DstLayout>                      & dst)
-{
-  using SrcType = typename SrcEngine::value_type;
-  using DstType = typename DstEngine::value_type;
-  static_assert(cute::is_same<SrcType, DstType>::value);
-  static_assert((is_gmem<SrcEngine>::value && is_smem<DstEngine>::value) ||
-                (is_smem<SrcEngine>::value && is_gmem<DstEngine>::value),
-                "Bulk Copy only supports gmem -> smem or smem -> gmem movement.");
-  // G2S or S2G dispatch
-  using BULK_COPY_OP = conditional_t<is_gmem<SrcEngine>::value,
-                                     SM90_BULK_COPY_G2S,
-                                     SM90_BULK_COPY_S2G>;
-
-  // Find the common subtensor of src and dst
-  auto tiler = max_common_layout(src, dst);
-  constexpr int vec_elem = decltype(size(tiler))::value;
-  constexpr int vec_bits = vec_elem * sizeof_bits_v<SrcType>;
-  static_assert(vec_bits >= 128, "Expected at least 128-bits for BLKCP");
-
-  // Construct a new concrete Atom of the vector size
-  using BulkAtom = Copy_Atom<Copy_Traits<BULK_COPY_OP, Int<vec_bits>, CT_Args...>, SrcType>;
-  auto bulk_atom = apply(atom.opargs_, [](auto const&... args) { return BulkAtom{args...}; });
-
-#if 0
-  if (thread0()) {
-    print("copy blkcp -- found a max_common_layout of "); print(tiler); print("\n");
-    print("   "); print(src); print("\n");
-    print("   "); print(dst); print("\n");
-  }
-#endif
-
-  return copy(bulk_atom, logical_divide(src, tiler), logical_divide(dst, tiler));
-}
-
-// Backwards-compat. Throw out any extra Copy_Atom args.
-template <class... CT_Args, class... CA_Args,
-          class SrcEngine, class SrcLayout,
-          class DstEngine, class DstLayout>
-CUTE_HOST_DEVICE
-void
-copy(Copy_Atom<Copy_Traits<SM90_BULK_COPY_AUTO, CT_Args...>, CA_Args...> const& atom,
-     Tensor<SrcEngine, SrcLayout>                const& src,
-     Tensor<DstEngine, DstLayout>                     & dst)
-{
-  return copy(static_cast<Copy_Traits<SM90_BULK_COPY_AUTO, CT_Args...> const&>(atom), src, dst);
-}
-#endif // #if defined(CUTE_COPY_ATOM_TMA_SM90_ENABLED)
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/algorithm/fill.hpp b/lightllm-kernel/cutlass/include/cute/algorithm/fill.hpp
deleted file mode 100755
index 3f33a42ad..000000000
--- a/lightllm-kernel/cutlass/include/cute/algorithm/fill.hpp
+++ /dev/null
@@ -1,87 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>
-
-#include <cute/tensor_impl.hpp>
-#include <cute/algorithm/prefer.hpp>
-
-namespace cute
-{
-
-//
-// Accept mutable temporaries
-//
-template <class Engine, class Layout, class T>
-CUTE_HOST_DEVICE
-void
-fill(Tensor<Engine, Layout>&& tensor, T const& value)
-{
-  return fill(tensor, value);
-}
-
-namespace detail
-{
-
-// Prefer fill(tensor.data(), value), if possible
-template <class Engine, class Layout, class T>
-CUTE_HOST_DEVICE
-auto
-fill(Tensor<Engine, Layout>& tensor, T const& value, prefer<1>)
-    -> decltype(fill(tensor.data(), value))
-{
-  fill(tensor.data(), value);
-}
-
-// Default implementation
-template <class Engine, class Layout, class T>
-CUTE_HOST_DEVICE
-void
-fill(Tensor<Engine, Layout>& tensor, T const& value, prefer<0>)
-{
-  CUTE_UNROLL
-  for (int i = 0; i < size(tensor); ++i) {
-    tensor(i) = value;
-  }
-}
-
-} // end namespace detail
-
-template <class Engine, class Layout, class T>
-CUTE_HOST_DEVICE
-void
-fill(Tensor<Engine, Layout>& tensor, T const& value)
-{
-  return detail::fill(tensor, value, prefer<1>{});
-}
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/algorithm/functional.hpp b/lightllm-kernel/cutlass/include/cute/algorithm/functional.hpp
deleted file mode 100755
index ef80d018d..000000000
--- a/lightllm-kernel/cutlass/include/cute/algorithm/functional.hpp
+++ /dev/null
@@ -1,290 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>          // CUTE_HOST_DEVICE
-#include <cute/numeric/math.hpp>    // cute::max, cute::min
-#include <cute/numeric/complex.hpp> // cute::conj
-
-/** C++14 <functional> extensions */
-
-namespace cute {
-
-/**************/
-/** Identity **/
-/**************/
-
-struct identity {
-  template <class T>
-  CUTE_HOST_DEVICE constexpr
-  decltype(auto) operator()(T&& arg) const {
-    return static_cast<T&&>(arg);
-  }
-};
-
-template <class R>
-struct constant_fn {
-  template <class... T>
-  CUTE_HOST_DEVICE constexpr
-  decltype(auto) operator()(T&&...) const {
-    return r_;
-  }
-  R r_;
-};
-
-/***********/
-/** Unary **/
-/***********/
-
-#define CUTE_LEFT_UNARY_OP(NAME,OP)                                  \
-  struct NAME {                                                      \
-    template <class T>                                               \
-    CUTE_HOST_DEVICE constexpr                                       \
-    decltype(auto) operator()(T&& arg) const {                       \
-      return OP static_cast<T&&>(arg);                                \
-    }                                                                \
-  }
-#define CUTE_RIGHT_UNARY_OP(NAME,OP)                                 \
-  struct NAME {                                                      \
-    template <class T>                                               \
-    CUTE_HOST_DEVICE constexpr                                       \
-    decltype(auto) operator()(T&& arg) const {                       \
-      return static_cast<T&&>(arg) OP ;                               \
-    }                                                                \
-  }
-#define CUTE_NAMED_UNARY_OP(NAME,OP)                                 \
-  struct NAME {                                                      \
-    template <class T>                                               \
-    CUTE_HOST_DEVICE constexpr                                       \
-    decltype(auto) operator()(T&& arg) const {                       \
-      return OP (static_cast<T&&>(arg));                              \
-    }                                                                \
-  }
-
-CUTE_LEFT_UNARY_OP(unary_plus,       +);
-CUTE_LEFT_UNARY_OP(negate,           -);
-CUTE_LEFT_UNARY_OP(bit_not,          ~);
-CUTE_LEFT_UNARY_OP(logical_not,      !);
-CUTE_LEFT_UNARY_OP(dereference,      *);
-CUTE_LEFT_UNARY_OP(address_of,       &);
-CUTE_LEFT_UNARY_OP(pre_increment,   ++);
-CUTE_LEFT_UNARY_OP(pre_decrement,   --);
-
-CUTE_RIGHT_UNARY_OP(post_increment, ++);
-CUTE_RIGHT_UNARY_OP(post_decrement, --);
-
-CUTE_NAMED_UNARY_OP(abs_fn,           abs);
-CUTE_NAMED_UNARY_OP(conjugate, cute::conj);
-
-#undef CUTE_LEFT_UNARY_OP
-#undef CUTE_RIGHT_UNARY_OP
-#undef CUTE_NAMED_UNARY_OP
-
-template <int Shift_>
-struct shift_right_const {
-  static constexpr int Shift = Shift_;
-
-  template <class T>
-  CUTE_HOST_DEVICE constexpr
-  decltype(auto) operator()(T&& arg) const {
-    return static_cast<T&&>(arg) >> Shift;
-  }
-};
-
-template <int Shift_>
-struct shift_left_const {
-  static constexpr int Shift = Shift_;
-
-  template <class T>
-  CUTE_HOST_DEVICE constexpr
-  decltype(auto) operator()(T&& arg) const {
-    return static_cast<T&&>(arg) << Shift;
-  }
-};
-
-/************/
-/** Binary **/
-/************/
-
-#define CUTE_BINARY_OP(NAME,OP)                                      \
-  struct NAME {                                                      \
-    template <class T, class U>                                      \
-    CUTE_HOST_DEVICE constexpr                                       \
-    decltype(auto) operator()(T&& lhs, U&& rhs) const {              \
-      return static_cast<T&&>(lhs) OP static_cast<U&&>(rhs);           \
-    }                                                                \
-  }
-#define CUTE_NAMED_BINARY_OP(NAME,OP)                                \
-  struct NAME {                                                      \
-    template <class T, class U>                                      \
-    CUTE_HOST_DEVICE constexpr                                       \
-    decltype(auto) operator()(T&& lhs, U&& rhs) const {              \
-      return OP (static_cast<T&&>(lhs), static_cast<U&&>(rhs));        \
-    }                                                                \
-  }
-
-
-CUTE_BINARY_OP(plus,                 +);
-CUTE_BINARY_OP(minus,                -);
-CUTE_BINARY_OP(multiplies,           *);
-CUTE_BINARY_OP(divides,              /);
-CUTE_BINARY_OP(modulus,              %);
-
-CUTE_BINARY_OP(plus_assign,         +=);
-CUTE_BINARY_OP(minus_assign,        -=);
-CUTE_BINARY_OP(multiplies_assign,   *=);
-CUTE_BINARY_OP(divides_assign,      /=);
-CUTE_BINARY_OP(modulus_assign,      %=);
-
-CUTE_BINARY_OP(bit_and,              &);
-CUTE_BINARY_OP(bit_or,               |);
-CUTE_BINARY_OP(bit_xor,              ^);
-CUTE_BINARY_OP(left_shift,          <<);
-CUTE_BINARY_OP(right_shift,         >>);
-
-CUTE_BINARY_OP(bit_and_assign,      &=);
-CUTE_BINARY_OP(bit_or_assign,       |=);
-CUTE_BINARY_OP(bit_xor_assign,      ^=);
-CUTE_BINARY_OP(left_shift_assign,  <<=);
-CUTE_BINARY_OP(right_shift_assign, >>=);
-
-CUTE_BINARY_OP(logical_and,         &&);
-CUTE_BINARY_OP(logical_or,          ||);
-
-CUTE_BINARY_OP(equal_to,            ==);
-CUTE_BINARY_OP(not_equal_to,        !=);
-CUTE_BINARY_OP(greater,              >);
-CUTE_BINARY_OP(less,                 <);
-CUTE_BINARY_OP(greater_equal,       >=);
-CUTE_BINARY_OP(less_equal,          <=);
-
-CUTE_NAMED_BINARY_OP(max_fn, cute::max);
-CUTE_NAMED_BINARY_OP(min_fn, cute::min);
-
-#undef CUTE_BINARY_OP
-#undef CUTE_NAMED_BINARY_OP
-
-/**********/
-/** Fold **/
-/**********/
-
-#define CUTE_FOLD_OP(NAME,OP)                                        \
-  struct NAME##_unary_rfold {                                        \
-    template <class... T>                                            \
-    CUTE_HOST_DEVICE constexpr                                       \
-    auto operator()(T&&... t) const {                                \
-      return (t OP ...);                                             \
-    }                                                                \
-  };                                                                 \
-  struct NAME##_unary_lfold {                                        \
-    template <class... T>                                            \
-    CUTE_HOST_DEVICE constexpr                                       \
-    auto operator()(T&&... t) const {                                \
-      return (... OP t);                                             \
-    }                                                                \
-  };                                                                 \
-  struct NAME##_binary_rfold {                                       \
-    template <class U, class... T>                                   \
-    CUTE_HOST_DEVICE constexpr                                       \
-    auto operator()(U&& u, T&&... t) const {                         \
-      return (t OP ... OP u);                                        \
-    }                                                                \
-  };                                                                 \
-  struct NAME##_binary_lfold {                                       \
-    template <class U, class... T>                                   \
-    CUTE_HOST_DEVICE constexpr                                       \
-    auto operator()(U&& u, T&&... t) const {                         \
-      return (u OP ... OP t);                                        \
-    }                                                                \
-  }
-
-CUTE_FOLD_OP(plus,                 +);
-CUTE_FOLD_OP(minus,                -);
-CUTE_FOLD_OP(multiplies,           *);
-CUTE_FOLD_OP(divides,              /);
-CUTE_FOLD_OP(modulus,              %);
-
-CUTE_FOLD_OP(plus_assign,         +=);
-CUTE_FOLD_OP(minus_assign,        -=);
-CUTE_FOLD_OP(multiplies_assign,   *=);
-CUTE_FOLD_OP(divides_assign,      /=);
-CUTE_FOLD_OP(modulus_assign,      %=);
-
-CUTE_FOLD_OP(bit_and,              &);
-CUTE_FOLD_OP(bit_or,               |);
-CUTE_FOLD_OP(bit_xor,              ^);
-CUTE_FOLD_OP(left_shift,          <<);
-CUTE_FOLD_OP(right_shift,         >>);
-
-CUTE_FOLD_OP(bit_and_assign,      &=);
-CUTE_FOLD_OP(bit_or_assign,       |=);
-CUTE_FOLD_OP(bit_xor_assign,      ^=);
-CUTE_FOLD_OP(left_shift_assign,  <<=);
-CUTE_FOLD_OP(right_shift_assign, >>=);
-
-CUTE_FOLD_OP(logical_and,         &&);
-CUTE_FOLD_OP(logical_or,          ||);
-
-CUTE_FOLD_OP(equal_to,            ==);
-CUTE_FOLD_OP(not_equal_to,        !=);
-CUTE_FOLD_OP(greater,              >);
-CUTE_FOLD_OP(less,                 <);
-CUTE_FOLD_OP(greater_equal,       >=);
-CUTE_FOLD_OP(less_equal,          <=);
-
-#undef CUTE_FOLD_OP
-
-/**********/
-/** Meta **/
-/**********/
-
-template <class Fn, class Arg>
-struct bound_fn {
-
-  template <class T>
-  CUTE_HOST_DEVICE constexpr
-  decltype(auto)
-  operator()(T&& arg) {
-    return fn_(arg_, static_cast<T&&>(arg));
-  }
-
-  Fn fn_;
-  Arg arg_;
-};
-
-template <class Fn, class Arg>
-CUTE_HOST_DEVICE constexpr
-auto
-bind(Fn const& fn, Arg const& arg) {
-  return bound_fn<Fn,Arg>{fn, arg};
-}
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/algorithm/gemm.hpp b/lightllm-kernel/cutlass/include/cute/algorithm/gemm.hpp
deleted file mode 100755
index c4713838b..000000000
--- a/lightllm-kernel/cutlass/include/cute/algorithm/gemm.hpp
+++ /dev/null
@@ -1,500 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>
-
-#include <cute/util/type_traits.hpp>
-#include <cute/algorithm/functional.hpp>
-
-#include <cute/tensor_impl.hpp>
-
-#include <cute/atom/mma_atom.hpp>
-
-/** The gemm algorithm takes four (or three) tensors and computes
- *   D = A * B + C
- * It dispatches based on the number of modes each tensor has:
- *
- * 1. `(V) x (V) => (V)`.
- *      The element-wise product of vectors. Dispatches to FMA or MMA.
- * 2. `(M) x (N) => (M,N)`.
- *      The outer product of vectors. Dispatches to [3] with new mode K=(1).
- * 3. `(M,K) x (N,K) => (M,N)`.
- *      The product of matrices. Dispatches to [5] with MMA vector-mode V.
- * 4. `(V,M) x (V,N) => (V,M,N)`.
- *      The batched outer product of vectors. Accounts for register reuse and dispatches to [1] for each (m,n).
- * 5. `(V,M,K) x (V,N,K) => (V,M,N)`.
- *      The batched product of matrices. Dispatches to [4] for each (k).
- */
-
-namespace cute
-{
-
-//
-// Three arguments to four
-//
-
-template <class TA, class ALayout,
-          class TB, class BLayout,
-          class TC, class CLayout>
-CUTE_HOST_DEVICE
-void
-gemm(Tensor<TA, ALayout> const& A,
-     Tensor<TB, BLayout> const& B,
-     Tensor<TC, CLayout>      & C)
-{
-  return gemm(C, A, B, C);
-}
-
-template <class MMA,
-          class TA, class ALayout,
-          class TB, class BLayout,
-          class TC, class CLayout>
-CUTE_HOST_DEVICE
-void
-gemm(MMA_Atom<MMA>       const& mma,
-     Tensor<TA, ALayout> const& A,
-     Tensor<TB, BLayout> const& B,
-     Tensor<TC, CLayout>      & C)
-{
-  return gemm(mma, C, A, B, C);
-}
-
-//
-// Accept mutable temporaries
-//
-
-template <class TA, class ALayout,
-          class TB, class BLayout,
-          class TC, class CLayout>
-CUTE_HOST_DEVICE
-void
-gemm(Tensor<TA, ALayout> const& A,
-     Tensor<TB, BLayout> const& B,
-     Tensor<TC, CLayout>     && C)
-{
-  return gemm(C, A, B, C);
-}
-
-template <class TD, class DLayout,
-          class TA, class ALayout,
-          class TB, class BLayout,
-          class TC, class CLayout>
-CUTE_HOST_DEVICE
-void
-gemm(Tensor<TD, DLayout>     && D,
-     Tensor<TA, ALayout> const& A,
-     Tensor<TB, BLayout> const& B,
-     Tensor<TC, CLayout> const& C)
-{
-  return gemm(D, A, B, C);
-}
-
-template <class MMA,
-          class TA, class ALayout,
-          class TB, class BLayout,
-          class TC, class CLayout>
-CUTE_HOST_DEVICE
-void
-gemm(MMA_Atom<MMA>       const& mma,
-     Tensor<TA, ALayout> const& A,
-     Tensor<TB, BLayout> const& B,
-     Tensor<TC, CLayout>     && C)
-{
-  return gemm(mma, C, A, B, C);
-}
-
-template <class MMA,
-          class TD, class DLayout,
-          class TA, class ALayout,
-          class TB, class BLayout,
-          class TC, class CLayout>
-CUTE_HOST_DEVICE
-void
-gemm(MMA_Atom<MMA>       const& mma,
-     Tensor<TD, DLayout>     && D,
-     Tensor<TA, ALayout> const& A,
-     Tensor<TB, BLayout> const& B,
-     Tensor<TC, CLayout> const& C)
-{
-  return gemm(mma, D, A, B, C);
-}
-
-//
-// Default MMA is UniversalFMA
-//
-
-template <class TD, class DLayout,
-          class TA, class ALayout,
-          class TB, class BLayout,
-          class TC, class CLayout>
-CUTE_HOST_DEVICE
-void
-gemm(Tensor<TD, DLayout>      & D,
-     Tensor<TA, ALayout> const& A,
-     Tensor<TB, BLayout> const& B,
-     Tensor<TC, CLayout> const& C)
-{
-  using MMA = MMA_Atom<UniversalFMA<typename Tensor<TD,DLayout>::value_type,
-                                    typename Tensor<TA,ALayout>::value_type,
-                                    typename Tensor<TB,BLayout>::value_type,
-                                    typename Tensor<TC,CLayout>::value_type>>;
-
-  return gemm(MMA{}, D, A, B, C);
-}
-
-//
-// Thread-Local Register-Memory GEMMs
-//
-
-// Dispatch [1]: (V) x (V) => (V)
-template <class MMA,
-          class TD, class DLayout,
-          class TA, class ALayout,
-          class TB, class BLayout,
-          class TC, class CLayout,
-          __CUTE_REQUIRES(DLayout::rank == 1 && is_rmem<TD>::value &&
-                          ALayout::rank == 1 && is_rmem<TA>::value &&
-                          BLayout::rank == 1 && is_rmem<TB>::value &&
-                          CLayout::rank == 1 && is_rmem<TC>::value)>
-CUTE_HOST_DEVICE
-void
-gemm(MMA_Atom<MMA>       const& mma,
-     Tensor<TD, DLayout>      & D,  // (V) Logical data
-     Tensor<TA, ALayout> const& A,  // (V) Logical data
-     Tensor<TB, BLayout> const& B,  // (V) Logical data
-     Tensor<TC, CLayout> const& C)  // (V) Logical data
-{
-  // No static assertions on (V), MMA checks compatibility
-  mma.call(D, A, B, C);
-}
-
-// Dispatch [2]: (M) x (N) => (M,N)
-template <class MMA,
-          class TD, class DLayout,
-          class TA, class ALayout,
-          class TB, class BLayout,
-          class TC, class CLayout,
-          __CUTE_REQUIRES(DLayout::rank == 2 && is_rmem<TD>::value &&
-                          ALayout::rank == 1 && is_rmem<TA>::value &&
-                          BLayout::rank == 1 && is_rmem<TB>::value &&
-                          CLayout::rank == 2 && is_rmem<TC>::value)>
-CUTE_HOST_DEVICE
-void
-gemm(MMA_Atom<MMA>       const& mma,
-     Tensor<TD, DLayout>      & D,  // (M,N) Logical data
-     Tensor<TA, ALayout> const& A,  // (M)   Logical data
-     Tensor<TB, BLayout> const& B,  // (N)   Logical data
-     Tensor<TC, CLayout> const& C)  // (M,N) Logical data
-{
-  CUTE_STATIC_ASSERT_V(size<0>(A) == size<0>(C));  // AM == CM
-  CUTE_STATIC_ASSERT_V(size<0>(B) == size<1>(C));  // BN == CN
-  CUTE_STATIC_ASSERT_V(size<0>(C) == size<0>(D) && size<1>(C) == size<1>(D));
-  gemm(mma,
-       D,                                                       // (M,N)
-       make_tensor(A.data(), append<2>(A.layout())),            // (M,1)
-       make_tensor(B.data(), append<2>(B.layout())),            // (N,1)
-       C);                                                      // (M,N)
-}
-
-// Dispatch [3]: (M,K) x (N,K) => (M,N)
-template <class MMA,
-          class TD, class DLayout,
-          class TA, class ALayout,
-          class TB, class BLayout,
-          class TC, class CLayout,
-          __CUTE_REQUIRES(DLayout::rank == 2 && is_rmem<TD>::value &&
-                          ALayout::rank == 2 && is_rmem<TA>::value &&
-                          BLayout::rank == 2 && is_rmem<TB>::value &&
-                          CLayout::rank == 2 && is_rmem<TC>::value)>
-CUTE_HOST_DEVICE
-void
-gemm(MMA_Atom<MMA>       const& mma,
-     Tensor<TD, DLayout>      & D,  // (M,N) Logical data
-     Tensor<TA, ALayout> const& A,  // (M,K) Logical data
-     Tensor<TB, BLayout> const& B,  // (N,K) Logical data
-     Tensor<TC, CLayout> const& C)  // (M,N) Logical data
-{
-  CUTE_STATIC_ASSERT_V(size<0>(A) == size<0>(C));  // AM == CM
-  CUTE_STATIC_ASSERT_V(size<0>(B) == size<1>(C));  // BN == CN
-  CUTE_STATIC_ASSERT_V(size<1>(A) == size<1>(B));  // AK == BK
-  CUTE_STATIC_ASSERT_V(size<0>(C) == size<0>(D) && size<1>(C) == size<1>(D));
-
-  // Assert this is a 1-value MMA
-  CUTE_STATIC_ASSERT_V(size<1>(typename MMA_Atom<MMA>::LayoutC_TV{}) == Int<1>{});
-  CUTE_STATIC_ASSERT_V(size<1>(typename MMA_Atom<MMA>::LayoutA_TV{}) == Int<1>{});
-  CUTE_STATIC_ASSERT_V(size<1>(typename MMA_Atom<MMA>::LayoutB_TV{}) == Int<1>{});
-
-  gemm(mma,
-       make_tensor(D.data(), prepend<3>(D.layout())),      // (1,M,N)
-       make_tensor(A.data(), prepend<3>(A.layout())),      // (1,M,K)
-       make_tensor(B.data(), prepend<3>(B.layout())),      // (1,N,K)
-       make_tensor(C.data(), prepend<3>(C.layout())));     // (1,M,N)
-}
-
-// Dispatch [4]: (V,M) x (V,N) => (V,M,N)
-template <class MMA,
-          class TD, class DLayout,
-          class TA, class ALayout,
-          class TB, class BLayout,
-          class TC, class CLayout,
-          __CUTE_REQUIRES(DLayout::rank == 3 && is_rmem<TD>::value &&
-                          ALayout::rank == 2 && is_rmem<TA>::value &&
-                          BLayout::rank == 2 && is_rmem<TB>::value &&
-                          CLayout::rank == 3 && is_rmem<TC>::value)>
-CUTE_HOST_DEVICE
-void
-gemm(MMA_Atom<MMA>       const& mma,
-     Tensor<TD, DLayout>      & D,  // (V,M,N) Logical data
-     Tensor<TA, ALayout> const& A,  // (V,M)   Logical data
-     Tensor<TB, BLayout> const& B,  // (V,N)   Logical data
-     Tensor<TC, CLayout> const& C)  // (V,M,N) Logical data
-{
-  CUTE_STATIC_ASSERT_V(size<1>(A) == size<1>(C));  // AM == CM
-  CUTE_STATIC_ASSERT_V(size<1>(B) == size<2>(C));  // BN == CN
-  CUTE_STATIC_ASSERT_V(size<0>(C) == size<0>(D) && size<1>(C) == size<1>(D) && size<2>(C) == size<2>(D));
-  auto M = size<1>(A);
-  auto N = size<1>(B);
-  // REGISTER .reuse OPTIMIZATIONS
-  // 64-bit traversal specialization -- serpentine path
-  if constexpr (decltype(size<0>(A))::value * sizeof(typename TA::value_type) == 8 &&
-                decltype(size<0>(B))::value * sizeof(typename TB::value_type) == 8)
-  {
-#if 1 // NOTE: Row- vs Col- major could depend on the C-matrix order... (which we can test)
-    // Row-major serpentine iteration
-    CUTE_UNROLL
-    for (int m = 0; m < M; ++m) {
-      CUTE_UNROLL
-      for (int n = 0; n < N; ++n) {
-        int ns = (m & 1) ? N-1-n : n;  // Serpentine coordinate
-        gemm(mma, D(_,m,ns), A(_,m), B(_,ns), C(_,m,ns));
-      }
-    }
-#else
-    // Col-major serpentine iteration
-    CUTE_UNROLL
-    for (int n = 0; n < N; ++n) {
-      CUTE_UNROLL
-      for (int m = 0; m < M; ++m) {
-        int ms = (n & 1) ? M-1-m : m;  // Serpentine coordinate
-        gemm(mma, D(_,ms,n), A(_,ms), B(_,n), C(_,ms,n));
-      }
-    }
-#endif
-  } else
-  // 32-bit traversal specialization -- kinked serpentine path
-  if constexpr (decltype(size<0>(A))::value * sizeof(typename TA::value_type) == 4 &&
-                decltype(size<0>(B))::value * sizeof(typename TB::value_type) == 4)
-  {
-#if 1  // NOTE: Row- vs Col- major could depend on the C-matrix order... (which we can test)
-    // Row-major kinked serpentine iteration
-    CUTE_UNROLL
-    for (int m = 0; m < M; m += 2) {
-      CUTE_UNROLL
-      for (int n = 0; n < N; ++n) {
-        int ns = (m & 2) ? N-1-n : n;
-        gemm(mma, D(_,m+0,ns), A(_,m+0), B(_,ns), C(_,m+0,ns));
-
-        if (m+1 < M) {
-          gemm(mma, D(_,m+1,ns), A(_,m+1), B(_,ns), C(_,m+1,ns));
-        }
-      }
-    }
-#else
-    // Col-major kinked serpentine iteration
-    CUTE_UNROLL
-    for (int n = 0; n < N; n += 2) {
-      CUTE_UNROLL
-      for (int m = 0; m < M; ++m) {
-        // Kinked serpentine traversal for maximum register reuse
-        int ms = (n & 2) ? M-1-m : m;
-        gemm(mma, D(_,ms,n+0), A(_,ms), B(_,n+0), C(_,ms,n+0));
-
-        if (n+1 < N) {
-          gemm(mma, D(_,ms,n+1), A(_,ms), B(_,n+1), C(_,ms,n+1));
-        }
-      }
-    }
-#endif
-  } else
-  // 64-bit + 32-bit traversal order -- keep A (64-bit) in the outer loop and serpentine B
-  if constexpr (decltype(size<0>(A))::value * sizeof(typename TA::value_type) == 8 &&
-                decltype(size<0>(B))::value * sizeof(typename TB::value_type) == 4) {
-    // Row-major serpentine iteration
-    CUTE_UNROLL
-    for (int m = 0; m < M; ++m) {
-      CUTE_UNROLL
-      for (int n = 0; n < N; ++n) {
-        int ns = (m & 1) ? N-1-n : n;  // Serpentine coordinate
-        gemm(mma, D(_,m,ns), A(_,m), B(_,ns), C(_,m,ns));
-      }
-    }
-  } else
-  // 32-bit + 64-bit traversal order -- keep B (64-bit) in the outer loop and serpentine A
-  if constexpr (decltype(size<0>(A))::value * sizeof(typename TA::value_type) == 4 &&
-                decltype(size<0>(B))::value * sizeof(typename TB::value_type) == 8) {
-    // Col-major serpentine iteration
-    CUTE_UNROLL
-    for (int n = 0; n < N; ++n) {
-      CUTE_UNROLL
-      for (int m = 0; m < M; ++m) {
-        int ms = (n & 1) ? M-1-m : m;  // Serpentine coordinate
-        gemm(mma, D(_,ms,n), A(_,ms), B(_,n), C(_,ms,n));
-      }
-    }
-  } else
-  // Fallback to serpentine loop
-  {
-    // Col-major serpentine iteration
-    CUTE_UNROLL
-    for (int n = 0; n < N; ++n) {
-      CUTE_UNROLL
-      for (int m = 0; m < M; ++m) {
-        int ms = (n & 1) ? M-1-m : m;  // Serpentine coordinate
-        gemm(mma, D(_,ms,n), A(_,ms), B(_,n), C(_,ms,n));
-      }
-    }
-  }
-}
-
-// Dispatch [5]: (V,M,K) x (V,N,K) => (V,M,N)
-template <class MMA,
-          class TD, class DLayout,
-          class TA, class ALayout,
-          class TB, class BLayout,
-          class TC, class CLayout,
-          __CUTE_REQUIRES(DLayout::rank == 3 && is_rmem<TD>::value &&
-                          ALayout::rank == 3 && is_rmem<TA>::value &&
-                          BLayout::rank == 3 && is_rmem<TB>::value &&
-                          CLayout::rank == 3 && is_rmem<TC>::value)>
-CUTE_HOST_DEVICE
-void
-gemm(MMA_Atom<MMA>       const& mma,
-     Tensor<TD, DLayout>      & D,  // (V,M,N) Logical data
-     Tensor<TA, ALayout> const& A,  // (V,M,K) Logical data
-     Tensor<TB, BLayout> const& B,  // (V,N,K) Logical data
-     Tensor<TC, CLayout> const& C)  // (V,M,N) Logical data
-{
-  CUTE_STATIC_ASSERT_V(size<1>(A) == size<1>(C));  // AM == CM
-  CUTE_STATIC_ASSERT_V(size<1>(B) == size<2>(C));  // BN == CN
-  CUTE_STATIC_ASSERT_V(size<2>(A) == size<2>(B));  // AK == BK
-  CUTE_STATIC_ASSERT_V(size<0>(C) == size<0>(D) && size<1>(C) == size<1>(D) && size<2>(C) == size<2>(D));
-  auto K = size<2>(A);
-
-  CUTE_UNROLL
-  for (int k = 0; k < K; ++k) {
-    gemm(mma, D, A(_,_,k), B(_,_,k), C);
-  }
-}
-
-//
-// Thread-Local Shared-Memory GEMMs
-//
-
-// Dispatch [1]: (V) x (V) => (V)
-// Dispatch [2]: (M) x (N) => (M,N)
-// Dispatch [3]: (M,K) x (N,K) => (M,N)
-// Dispatch [4]: (V,M) x (V,N) => (V,M,N)
-// Dispatch [5]: (V,M,K) x (V,N,K) => (V,M,N)
-// Dispatch [3]: (M,K) x (N,K) => (M,N)
-template <class MMA,
-          class TD, class DLayout,
-          class TA, class ALayout,
-          class TB, class BLayout,
-          class TC, class CLayout,
-          __CUTE_REQUIRES(DLayout::rank == 2 && is_rmem<TD>::value &&
-                          ALayout::rank == 2 && is_smem<TA>::value &&
-                          BLayout::rank == 2 && is_smem<TB>::value &&
-                          CLayout::rank == 2 && is_rmem<TC>::value)>
-CUTE_HOST_DEVICE
-void
-gemm(MMA_Atom<MMA>       const& mma,
-     Tensor<TD, DLayout>      & D,  // (M,N) Logical data
-     Tensor<TA, ALayout> const& A,  // (M,K) Logical data
-     Tensor<TB, BLayout> const& B,  // (N,K) Logical data
-     Tensor<TC, CLayout> const& C)  // (M,N) Logical data
-{
-  CUTE_STATIC_ASSERT_V(size<0>(A) == size<0>(C));  // AM == CM
-  CUTE_STATIC_ASSERT_V(size<0>(B) == size<1>(C));  // BN == CN
-  CUTE_STATIC_ASSERT_V(size<1>(A) == size<1>(B));  // AK == BK
-  CUTE_STATIC_ASSERT_V(size<0>(C) == size<0>(D) && size<1>(C) == size<1>(D));
-
-  // Assert this is a 1-value MMA
-  CUTE_STATIC_ASSERT_V(size<1>(typename MMA_Atom<MMA>::LayoutC_TV{}) == Int<1>{});
-  CUTE_STATIC_ASSERT_V(size<1>(typename MMA_Atom<MMA>::LayoutA_TV{}) == Int<1>{});
-  CUTE_STATIC_ASSERT_V(size<1>(typename MMA_Atom<MMA>::LayoutB_TV{}) == Int<1>{});
-
-  gemm(mma,
-       make_tensor(D.data(), prepend<3>(D.layout())),      // (1,M,N)
-       make_tensor(A.data(), prepend<3>(A.layout())),      // (1,M,K)
-       make_tensor(B.data(), prepend<3>(B.layout())),      // (1,N,K)
-       make_tensor(C.data(), prepend<3>(C.layout())));     // (1,M,N)
-}
-
-// Dispatch [5]: (V,M,K) x (V,N,K) => (V,M,N)
-template <class MMA,
-          class TD, class DLayout,
-          class TA, class ALayout,
-          class TB, class BLayout,
-          class TC, class CLayout,
-          __CUTE_REQUIRES(DLayout::rank == 3 && is_rmem<TD>::value &&
-                          ALayout::rank == 3 && is_smem<TA>::value &&
-                          BLayout::rank == 3 && is_smem<TB>::value &&
-                          CLayout::rank == 3 && is_rmem<TC>::value)>
-CUTE_HOST_DEVICE
-void
-gemm(MMA_Atom<MMA>       const& mma,
-     Tensor<TD, DLayout>      & D,  // (V,M,N) Logical data
-     Tensor<TA, ALayout> const& A,  // (V,M,K) Logical data
-     Tensor<TB, BLayout> const& B,  // (V,N,K) Logical data
-     Tensor<TC, CLayout> const& C)  // (V,M,N) Logical data
-{
-  CUTE_STATIC_ASSERT_V(size<1>(A) == size<1>(C));  // AM == CM
-  CUTE_STATIC_ASSERT_V(size<1>(B) == size<2>(C));  // BN == CN
-  CUTE_STATIC_ASSERT_V(size<2>(A) == size<2>(B));  // AK == BK
-  CUTE_STATIC_ASSERT_V(size<0>(C) == size<0>(D) && size<1>(C) == size<1>(D) && size<2>(C) == size<2>(D));
-
-  auto rA = MMA_Atom<MMA>::make_fragment_A(A);
-  auto rB = MMA_Atom<MMA>::make_fragment_B(B);
-
-  auto K = size<2>(A);
-
-  CUTE_UNROLL
-  for (int k = 0; k < K; ++k)
-  {
-    copy(A(_,_,k), rA(_,_,k));
-    copy(B(_,_,k), rB(_,_,k));
-    // Thread-level register gemm for k
-    gemm(mma, D, rA(_,_,k), rB(_,_,k), C);
-  }
-}
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/algorithm/prefer.hpp b/lightllm-kernel/cutlass/include/cute/algorithm/prefer.hpp
deleted file mode 100755
index a69e50429..000000000
--- a/lightllm-kernel/cutlass/include/cute/algorithm/prefer.hpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-namespace cute
-{
-
-// Infinite types that inherit from each other
-template <size_t N>
-struct prefer : prefer<N-1> {};
-
-template <>
-struct prefer<0> {};
-
-// Can be used to preferencially overload implementations
-// Higher N in prefer<N> have higher priority.
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/algorithm/prefetch.hpp b/lightllm-kernel/cutlass/include/cute/algorithm/prefetch.hpp
deleted file mode 100755
index c39f63acd..000000000
--- a/lightllm-kernel/cutlass/include/cute/algorithm/prefetch.hpp
+++ /dev/null
@@ -1,145 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>          // CUTE_HOST_DEVICE
-#include <cute/tensor_impl.hpp>     // cute::Tensor
-#include <cute/atom/copy_atom.hpp>  // cute::Copy_Atom
-
-namespace cute
-{
-
-//
-// Prefetch global tensors into L2
-//
-
-template <uint32_t NumThreads, uint32_t FetchBytes = 64,
-          class GEngine, class GLayout>
-CUTE_HOST_DEVICE
-void
-cooperative_prefetch(uint32_t                 const& tid,
-                     Tensor<GEngine, GLayout> const& src)
-{
-  static_assert(is_gmem<GEngine>::value, "Expected global tensor for prefetch");
-
-  constexpr int V = decltype(max_common_vector(src, src))::value;
-
-  if constexpr (V > 1) {
-    // L2 sector is 32B, default fetch granularity is 64B
-    using VecType = conditional_t<(V * sizeof_bits_v<typename GEngine::value_type>) < (FetchBytes * 8),
-                                  ArrayEngine<typename GEngine::value_type, V>,
-                                  uint8_t[FetchBytes]                         >;
-
-    Tensor src_v = recast<VecType const>(src);
-    CUTE_UNROLL
-    for (int i = tid; i < size(src_v); i += NumThreads) {
-      prefetch(raw_pointer_cast(&src_v(i)));
-    }
-  } else {
-    CUTE_UNROLL
-    for (int i = tid; i < size(src); i += NumThreads) {
-      prefetch(raw_pointer_cast(&src(i)));
-    }
-  }
-}
-
-template <class GEngine, class GLayout>
-CUTE_HOST_DEVICE
-void
-prefetch(Tensor<GEngine, GLayout> const& src)
-{
-  return cooperative_prefetch<1>(0, src);
-}
-
-// Prefetch with copy atom
-namespace detail {
-
-template <class CopyOp, class = void>
-constexpr bool has_prefetch = false;
-
-template <class CopyOp>
-constexpr bool has_prefetch<CopyOp, void_t<typename CopyOp::PREFETCH>> = true;
-
-} // end namespace detail
-
-template <class CopyOp, class... CT_Args, class... CA_Args,
-          class GEngine, class GLayout>
-CUTE_HOST_DEVICE
-void
-prefetch(Copy_Atom<Copy_Traits<CopyOp, CT_Args...>, CA_Args...> const& atom,
-         Tensor<GEngine, GLayout>                               const& src)
-{
-  if constexpr (detail::has_prefetch<CopyOp>) {
-    using Prefetch_Traits = Copy_Traits<typename CopyOp::PREFETCH, CT_Args...>;
-    using Prefetch_Atom = Copy_Atom<Prefetch_Traits, CA_Args...>;
-    Prefetch_Atom prefetch_atom{atom};
-    auto& dst = const_cast<Tensor<GEngine, GLayout>&>(src); // dst is ignored for prefetch atoms
-    return copy(prefetch_atom, src, dst);
-  } else {
-    return prefetch(src);
-  }
-}
-
-#if defined(CUTE_COPY_ATOM_TMA_SM90_ENABLED)
-template <class... CT_Args,
-          class SrcEngine, class SrcLayout>
-CUTE_HOST_DEVICE
-void
-prefetch(Copy_Traits<SM90_BULK_COPY_AUTO, CT_Args...> const& atom,
-         Tensor<SrcEngine, SrcLayout>                 const& src)
-{
-  using SrcType = typename SrcEngine::value_type;
-  static_assert(is_gmem<SrcEngine>::value, "Expected global tensor for L2 prefetch");
-
-  auto tiler = max_common_layout(src, src);
-  constexpr int vec_elem = decltype(size(tiler))::value;
-  constexpr int vec_bits = vec_elem * sizeof_bits_v<SrcType>;
-  static_assert(vec_bits >= 128, "Expected at least 128-bits for BLKCP");
-
-  // Construct a new concrete Atom of the vector size
-  auto bulk_atom = Copy_Atom<Copy_Traits<SM90_BULK_COPY_G2S, Int<vec_bits>>, SrcType>{};
-
-  return prefetch(bulk_atom, logical_divide(src, tiler));
-}
-
-// Backwards-compat. Throw out any extra Copy_Atom args.
-template <class... CT_Args, class... CA_Args,
-          class SrcEngine, class SrcLayout>
-CUTE_HOST_DEVICE
-void
-prefetch(Copy_Atom<Copy_Traits<SM90_BULK_COPY_AUTO, CT_Args...>, CA_Args...> const& atom,
-         Tensor<SrcEngine, SrcLayout>                                        const& src)
-{
-  return prefetch(static_cast<Copy_Traits<SM90_BULK_COPY_AUTO, CT_Args...> const&>(atom), src);
-}
-#endif // #if defined(CUTE_COPY_ATOM_TMA_SM90_ENABLED)
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/algorithm/tensor_algorithms.hpp b/lightllm-kernel/cutlass/include/cute/algorithm/tensor_algorithms.hpp
deleted file mode 100755
index dbffc6133..000000000
--- a/lightllm-kernel/cutlass/include/cute/algorithm/tensor_algorithms.hpp
+++ /dev/null
@@ -1,166 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/** Common algorithms on (hierarchical) tensors */
-
-#pragma once
-
-#include <cute/config.hpp>
-#include <cute/tensor_impl.hpp>
-
-namespace cute
-{
-
-//
-// for_each
-//
-
-template <class Engine, class Layout, class UnaryOp>
-CUTE_HOST_DEVICE constexpr
-void
-for_each(Tensor<Engine,Layout> const& tensor, UnaryOp&& op)
-{
-  CUTE_UNROLL
-  for (int i = 0; i < size(tensor); ++i) {
-    op(tensor(i));
-  }
-}
-
-template <class Engine, class Layout, class UnaryOp>
-CUTE_HOST_DEVICE constexpr
-void
-for_each(Tensor<Engine,Layout>& tensor, UnaryOp&& op)
-{
-  CUTE_UNROLL
-  for (int i = 0; i < size(tensor); ++i) {
-    op(tensor(i));
-  }
-}
-
-// Accept mutable temporaries
-template <class Engine, class Layout, class UnaryOp>
-CUTE_HOST_DEVICE constexpr
-void
-for_each(Tensor<Engine,Layout>&& tensor, UnaryOp&& op)
-{
-  return for_each(tensor, op);
-}
-
-//
-// transform
-//
-
-// Similar to std::transform but does not return number of elements affected
-template <class Engine, class Layout, class UnaryOp>
-CUTE_HOST_DEVICE constexpr
-void
-transform(Tensor<Engine,Layout>& tensor, UnaryOp&& op)
-{
-  CUTE_UNROLL
-  for (int i = 0; i < size(tensor); ++i) {
-    tensor(i) = op(tensor(i));
-  }
-}
-
-// Accept mutable temporaries
-template <class Engine, class Layout, class UnaryOp>
-CUTE_HOST_DEVICE constexpr
-void
-transform(Tensor<Engine,Layout>&& tensor, UnaryOp&& op)
-{
-  return transform(tensor, op);
-}
-
-// Similar to std::transform transforms one tensors and assigns it to another
-template <class EngineIn, class LayoutIn,
-          class EngineOut, class LayoutOut,
-          class UnaryOp>
-CUTE_HOST_DEVICE constexpr
-void
-transform(Tensor<EngineIn, LayoutIn > const& tensor_in,
-          Tensor<EngineOut,LayoutOut>      & tensor_out,
-          UnaryOp&& op)
-{
-  CUTE_UNROLL
-  for (int i = 0; i < size(tensor_in); ++i) {
-    tensor_out(i) = op(tensor_in(i));
-  }
-}
-
-// Accept mutable temporaries
-template <class EngineIn, class LayoutIn,
-          class EngineOut, class LayoutOut,
-          class UnaryOp>
-CUTE_HOST_DEVICE constexpr
-void
-transform(Tensor<EngineIn, LayoutIn > const& tensor_in,
-          Tensor<EngineOut,LayoutOut>     && tensor_out,
-          UnaryOp&& op)
-{
-  return transform(tensor_in, tensor_out, op);
-}
-
-// Similar to std::transform with a binary operation
-// Takes two tensors as input and one tensor as output.
-// Applies the binary_op to tensor_in1 and tensor_in2 and
-// assigns it to tensor_out
-template <class EngineIn1, class LayoutIn1,
-          class EngineIn2, class LayoutIn2,
-          class EngineOut, class LayoutOut,
-          class BinaryOp>
-CUTE_HOST_DEVICE constexpr
-void
-transform(Tensor<EngineIn1,LayoutIn1> const& tensor_in1,
-          Tensor<EngineIn2,LayoutIn2> const& tensor_in2,
-          Tensor<EngineOut,LayoutOut>      & tensor_out,
-          BinaryOp&& op)
-{
-  CUTE_UNROLL
-  for (int i = 0; i < size(tensor_in1); ++i) {
-    tensor_out(i) = op(tensor_in1(i), tensor_in2(i));
-  }
-}
-
-// Accept mutable temporaries
-template <class EngineIn1, class LayoutIn1,
-          class EngineIn2, class LayoutIn2,
-          class EngineOut, class LayoutOut,
-          class BinaryOp>
-CUTE_HOST_DEVICE constexpr
-void
-transform(Tensor<EngineIn1,LayoutIn1> const& tensor_in1,
-          Tensor<EngineIn2,LayoutIn2> const& tensor_in2,
-          Tensor<EngineOut,LayoutOut>     && tensor_out,
-          BinaryOp&& op)
-{
-  return transform(tensor_in1, tensor_in2, tensor_out, op);
-}
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/algorithm/tuple_algorithms.hpp b/lightllm-kernel/cutlass/include/cute/algorithm/tuple_algorithms.hpp
deleted file mode 100755
index 5a70f590b..000000000
--- a/lightllm-kernel/cutlass/include/cute/algorithm/tuple_algorithms.hpp
+++ /dev/null
@@ -1,1073 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>
-
-#include <cute/util/type_traits.hpp>
-#include <cute/container/tuple.hpp>
-#include <cute/algorithm/functional.hpp>
-#include <cute/numeric/integer_sequence.hpp>
-#include <cute/numeric/integral_constant.hpp>
-
-/// @file tuple_algorithms.hpp
-/// @brief Common algorithms on (hierarchical) tuples
-///
-/// Code guidelines and style preferences:
-///
-/// For perfect forwarding, don't use std::forward, because it may not
-/// be defined in device code when compiling with NVRTC. Instead, use
-/// `static_cast<ParameterType&&>(parameter_name)`.
-///
-/// CuTe generally does not bother forwarding functions, as
-/// reference-qualified member functions are rare in this code base.
-///
-/// Throughout CUTLASS, cute::make_tuple always needs to be called
-/// namespace-qualified, EVEN If inside the cute namespace and/or in
-/// scope of a "using namespace cute" declaration. Otherwise, the
-/// compiler may select std::make_tuple instead of cute::make_tuple,
-/// due to argument-dependent lookup.
-
-namespace cute
-{
-
-//
-// Apply (Unpack)
-// (t, f) => f(t_0,t_1,...,t_n)
-//
-
-namespace detail {
-
-template <class T, class F, int... I>
-CUTE_HOST_DEVICE constexpr
-auto
-apply(T&& t, F&& f, seq<I...>)
-{
-  return f(get<I>(static_cast<T&&>(t))...);
-}
-
-} // end namespace detail
-
-template <class T, class F>
-CUTE_HOST_DEVICE constexpr
-auto
-apply(T&& t, F&& f)
-{
-  return detail::apply(static_cast<T&&>(t), f, tuple_seq<T>{});
-}
-
-//
-// Transform Apply
-// (t, f, g) => g(f(t_0),f(t_1),...)
-//
-
-namespace detail {
-
-template <class T, class F, class G, int... I>
-CUTE_HOST_DEVICE constexpr
-auto
-tapply(T&& t, F&& f, G&& g, seq<I...>)
-{
-  return g(f(get<I>(static_cast<T&&>(t)))...);
-}
-
-template <class T0, class T1, class F, class G, int... I>
-CUTE_HOST_DEVICE constexpr
-auto
-tapply(T0&& t0, T1&& t1, F&& f, G&& g, seq<I...>)
-{
-  return g(f(get<I>(static_cast<T0&&>(t0)),
-             get<I>(static_cast<T1&&>(t1)))...);
-}
-
-template <class T0, class T1, class T2, class F, class G, int... I>
-CUTE_HOST_DEVICE constexpr
-auto
-tapply(T0&& t0, T1&& t1, T2&& t2, F&& f, G&& g, seq<I...>)
-{
-  return g(f(get<I>(static_cast<T0&&>(t0)),
-             get<I>(static_cast<T1&&>(t1)),
-             get<I>(static_cast<T2&&>(t2)))...);
-}
-
-} // end namespace detail
-
-template <class T, class F, class G>
-CUTE_HOST_DEVICE constexpr
-auto
-transform_apply(T&& t, F&& f, G&& g)
-{
-  if constexpr (is_tuple<remove_cvref_t<T>>::value) {
-    return detail::tapply(static_cast<T&&>(t), f, g, tuple_seq<T>{});
-  } else {
-    return g(f(static_cast<T&&>(t)));
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <class T0, class T1, class F, class G>
-CUTE_HOST_DEVICE constexpr
-auto
-transform_apply(T0&& t0, T1&& t1, F&& f, G&& g)
-{
-  if constexpr (is_tuple<remove_cvref_t<T0>>::value) {
-    return detail::tapply(static_cast<T0&&>(t0), static_cast<T1&&>(t1), f, g, tuple_seq<T0>{});
-  } else {
-    return g(f(static_cast<T0&&>(t0), static_cast<T1&&>(t1)));
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <class T0, class T1, class T2, class F, class G>
-CUTE_HOST_DEVICE constexpr
-auto
-transform_apply(T0&& t0, T1&& t1, T2&& t2, F&& f, G&& g)
-{
-  if constexpr (is_tuple<remove_cvref_t<T0>>::value) {
-    return detail::tapply(static_cast<T0&&>(t0), static_cast<T1&&>(t1), static_cast<T2&&>(t2), f, g, tuple_seq<T0>{});
-  } else {
-    return g(f(static_cast<T0&&>(t0), static_cast<T1&&>(t1), static_cast<T2&&>(t2)));
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-//
-// For Each
-// (t, f) => f(t_0),f(t_1),...,f(t_n)
-//
-
-template <class T, class F>
-CUTE_HOST_DEVICE constexpr
-void
-for_each(T&& t, F&& f)
-{
-  if constexpr (is_tuple<remove_cvref_t<T>>::value) {
-    return detail::apply(t, [&](auto&&... a) { (f(static_cast<decltype(a)&&>(a)), ...); }, tuple_seq<T>{});
-  } else {
-    return f(static_cast<T&&>(t));
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <class T, class F>
-CUTE_HOST_DEVICE constexpr
-auto
-for_each_leaf(T&& t, F&& f)
-{
-  if constexpr (is_tuple<remove_cvref_t<T>>::value) {
-    return detail::apply(static_cast<T&&>(t), [&](auto&&... a){ return (for_each_leaf(static_cast<decltype(a)&&>(a), f), ...); }, tuple_seq<T>{});
-  } else {
-    return f(static_cast<T&&>(t));
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-//
-// Transform
-// (t, f) => (f(t_0),f(t_1),...,f(t_n))
-//
-
-template <class T, class F>
-CUTE_HOST_DEVICE constexpr
-auto
-transform(T const& t, F&& f)
-{
-  if constexpr (is_tuple<T>::value) {
-    return detail::tapply(t, f, [](auto const&... a){ return cute::make_tuple(a...); }, tuple_seq<T>{});
-  } else {
-    return f(t);
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <class T0, class T1, class F>
-CUTE_HOST_DEVICE constexpr
-auto
-transform(T0 const& t0, T1 const& t1, F&& f)
-{
-  if constexpr (is_tuple<T0>::value) {
-    static_assert(tuple_size<T0>::value == tuple_size<T1>::value, "Mismatched tuple_size");
-    return detail::tapply(t0, t1, f, [](auto const&... a){ return cute::make_tuple(a...); }, tuple_seq<T0>{});
-  } else {
-    return f(t0, t1);
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <class T0, class T1, class T2, class F>
-CUTE_HOST_DEVICE constexpr
-auto
-transform(T0 const& t0, T1 const& t1, T2 const& t2, F&& f)
-{
-  if constexpr (is_tuple<T0>::value) {
-    static_assert(tuple_size<T0>::value == tuple_size<T1>::value, "Mismatched tuple_size");
-    static_assert(tuple_size<T0>::value == tuple_size<T2>::value, "Mismatched tuple_size");
-    return detail::tapply(t0, t1, t2, f, [](auto const&... a){ return cute::make_tuple(a...); }, tuple_seq<T0>{});
-  } else {
-    return f(t0, t1, t2);
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <class T, class F>
-CUTE_HOST_DEVICE constexpr
-auto
-transform_leaf(T const& t, F&& f)
-{
-  if constexpr (is_tuple<T>::value) {
-    return transform(t, [&](auto const& a) { return transform_leaf(a, f); });
-  } else {
-    return f(t);
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <class T0, class T1, class F>
-CUTE_HOST_DEVICE constexpr
-auto
-transform_leaf(T0 const& t0, T1 const& t1, F&& f)
-{
-  if constexpr (is_tuple<T0>::value) {
-    return transform(t0, t1, [&](auto const& a, auto const& b) { return transform_leaf(a, b, f); });
-  } else {
-    return f(t0, t1);
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-//
-// find and find_if
-//
-
-namespace detail {
-
-template <class T, class F, int I, int... Is>
-CUTE_HOST_DEVICE constexpr
-auto
-find_if(T const& t, F&& f, seq<I,Is...>)
-{
-  if constexpr (decltype(f(get<I>(t)))::value) {
-    return cute::C<I>{};
-  } else
-  if constexpr (sizeof...(Is) == 0) {
-    return cute::C<I+1>{};
-  } else {
-    return find_if(t, f, seq<Is...>{});
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-} // end namespace detail
-
-template <class T, class F>
-CUTE_HOST_DEVICE constexpr
-auto
-find_if(T const& t, F&& f)
-{
-  if constexpr (is_tuple<T>::value) {
-    return detail::find_if(t, f, tuple_seq<T>{});
-  } else {
-    return cute::C<decltype(f(t))::value ? 0 : 1>{};
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <class T, class X>
-CUTE_HOST_DEVICE constexpr
-auto
-find(T const& t, X const& x)
-{
-  return find_if(t, [&](auto const& v) { return v == x; });  // This should always return a static true/false
-}
-
-template <class T, class F>
-CUTE_HOST_DEVICE constexpr
-auto
-any_of(T const& t, F&& f)
-{
-  if constexpr (is_tuple<T>::value) {
-    return detail::apply(cute::transform(t, f), [&] (auto const&... a) { return (false_type{} || ... || a); }, tuple_seq<T>{});
-  } else {
-    return f(t);
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <class T, class F>
-CUTE_HOST_DEVICE constexpr
-auto
-all_of(T const& t, F&& f)
-{
-  if constexpr (is_tuple<T>::value) {
-    return detail::apply(cute::transform(t, f), [&] (auto const&... a) { return (true_type{} && ... && a); }, tuple_seq<T>{});
-  } else {
-    return f(t);
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <class T, class F>
-CUTE_HOST_DEVICE constexpr
-auto
-none_of(T const& t, F&& f)
-{
-  return not any_of(t, f);
-}
-
-//
-// Filter
-// (t, f) => <f(t_0),f(t_1),...,f(t_n)>
-//
-
-template <class T, class F>
-CUTE_HOST_DEVICE constexpr
-auto
-filter_tuple(T const& t, F&& f)
-{
-  return transform_apply(t, f, [](auto const&... a) { return cute::tuple_cat(a...); });
-}
-
-template <class T0, class T1, class F>
-CUTE_HOST_DEVICE constexpr
-auto
-filter_tuple(T0 const& t0, T1 const& t1, F&& f)
-{
-  return transform_apply(t0, t1, f, [](auto const&... a) { return cute::tuple_cat(a...); });
-}
-
-template <class T0, class T1, class T2, class F>
-CUTE_HOST_DEVICE constexpr
-auto
-filter_tuple(T0 const& t0, T1 const& t1, T2 const& t2, F&& f)
-{
-  return transform_apply(t0, t1, t2, f, [](auto const&... a) { return cute::tuple_cat(a...); });
-}
-
-//
-// Fold (Reduce, Accumulate)
-// (t, v, f) => f(...f(f(v,t_0),t_1),...,t_n)
-//
-
-namespace detail {
-
-template <class Fn, class Val>
-struct FoldAdaptor {
-  template <class X>
-  CUTE_HOST_DEVICE constexpr auto operator|(X&& x) {
-    auto r = fn_(val_, static_cast<X&&>(x));
-    return FoldAdaptor<Fn, decltype(r)>{fn_, r};
-  }
-  Fn fn_;
-  Val val_;
-};
-
-template <class T, class V, class F, int... Is>
-CUTE_HOST_DEVICE constexpr
-auto
-fold(T&& t, V const& v, F&& f, seq<Is...>)
-{
-  return (FoldAdaptor<F,V>{f,v} | ... | get<Is>(static_cast<T&&>(t))).val_;
-}
-
-} // end namespace detail
-
-template <class T, class V, class F>
-CUTE_HOST_DEVICE constexpr
-auto
-fold(T&& t, V const& v, F&& f)
-{
-  if constexpr (is_tuple<remove_cvref_t<T>>::value) {
-    return detail::fold(static_cast<T&&>(t), v, f, tuple_seq<T>{});
-  } else {
-    return f(v, static_cast<T&&>(t));
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <class T, class F>
-CUTE_HOST_DEVICE constexpr
-auto
-fold_first(T&& t, F&& f)
-{
-  if constexpr (is_tuple<remove_cvref_t<T>>::value) {
-    return detail::fold(static_cast<T&&>(t), get<0>(t), f, make_range<1,tuple_size<remove_cvref_t<T>>::value>{});
-  } else {
-    return t;
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-//
-// front, back, take, select, unwrap
-//
-
-// Get the first non-tuple element in a hierarchical tuple
-template <class T>
-CUTE_HOST_DEVICE constexpr
-decltype(auto)
-front(T&& t)
-{
-  if constexpr (is_tuple<remove_cvref_t<T>>::value) {
-    return front(get<0>(static_cast<T&&>(t)));
-  } else {
-    return static_cast<T&&>(t);
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-// Get the last non-tuple element in a hierarchical tuple
-template <class T>
-CUTE_HOST_DEVICE constexpr
-decltype(auto)
-back(T&& t)
-{
-  if constexpr (is_tuple<remove_cvref_t<T>>::value) {
-    constexpr int N = tuple_size<remove_cvref_t<T>>::value;
-
-    // MSVC needs a bit of extra help here deducing return types.
-    // We help it by peeling off the nonrecursive case a level "early."
-    if constexpr (! is_tuple<remove_cvref_t<decltype(get<N - 1>(static_cast<T&&>(t)))>>::value) {
-      return get<N - 1>(static_cast<T&&>(t));
-    } else {
-      return back(get<N - 1>(static_cast<T&&>(t)));
-    }
-  } else {
-    return static_cast<T&&>(t);
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-// Takes the elements in the range [B,E)
-template <int B, int E, class T>
-CUTE_HOST_DEVICE constexpr
-auto
-take(T const& t)
-{
-  if constexpr (E == -1) {
-    if constexpr (is_tuple<T>::value) {
-      return take<B,tuple_size<T>::value>(t);
-    } else {
-      return take<B,1>(t);
-    }
-  } else
-  if constexpr (B <= E) {
-    return detail::apply(t, [](auto const&... a) { return cute::make_tuple(a...); }, make_range<B,E>{});
-  } else {
-    static_assert(B <= E);
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-// Select tuple elements with given indices.
-template <int... I, class T>
-CUTE_HOST_DEVICE constexpr
-auto
-select(T const& t)
-{
-  return cute::make_tuple(get<I>(t)...);
-}
-
-// Wrap non-tuples into rank-1 tuples or forward
-template <class T>
-CUTE_HOST_DEVICE constexpr
-auto
-wrap(T const& t)
-{
-  if constexpr (is_tuple<T>::value) {
-    return t;
-  } else {
-    return cute::make_tuple(t);
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-// Unwrap rank-1 tuples until we're left with a rank>1 tuple or a non-tuple
-template <class T>
-CUTE_HOST_DEVICE constexpr
-auto
-unwrap(T const& t)
-{
-  if constexpr (is_tuple<T>::value) {
-    if constexpr (tuple_size<T>::value == 1) {
-      return unwrap(get<0>(t));
-    } else {
-      return t;
-    }
-  } else {
-    return t;
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-//
-// Flatten and Unflatten
-//
-
-template <class T>
-struct is_flat : true_type {};
-
-template <class... Ts>
-struct is_flat<tuple<Ts...>> : bool_constant<(true && ... && (not is_tuple<Ts>::value))> {};
-
-// Flatten a hierarchical tuple to a tuple of depth one
-//   and wrap non-tuples into a rank-1 tuple.
-template <class T>
-CUTE_HOST_DEVICE constexpr
-auto
-flatten_to_tuple(T const& t)
-{
-  if constexpr (is_tuple<T>::value) {
-    if constexpr (is_flat<T>::value) {      // Shortcut for perf
-      return t;
-    } else {
-      return filter_tuple(t, [](auto const& a) { return flatten_to_tuple(a); });
-    }
-  } else {
-    return cute::make_tuple(t);
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-// Flatten a hierarchical tuple to a tuple of depth one
-//   and leave non-tuple untouched.
-template <class T>
-CUTE_HOST_DEVICE constexpr
-auto
-flatten(T const& t)
-{
-  if constexpr (is_tuple<T>::value) {
-    if constexpr (is_flat<T>::value) {      // Shortcut for perf
-      return t;
-    } else {
-      return filter_tuple(t, [](auto const& a) { return flatten_to_tuple(a); });
-    }
-  } else {
-    return t;
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-namespace detail {
-
-template <class FlatTuple, class TargetProfile>
-CUTE_HOST_DEVICE constexpr
-auto
-unflatten_impl(FlatTuple const& flat_tuple, TargetProfile const& target_profile)
-{
-  if constexpr (is_tuple<TargetProfile>::value) {
-    return fold(target_profile, cute::make_tuple(cute::make_tuple(), flat_tuple), [](auto const& v, auto const& t) {
-      auto [result, remaining_tuple] = v;
-      auto [sub_result, sub_tuple] = unflatten_impl(remaining_tuple, t);
-      return cute::make_tuple(append(result, sub_result), sub_tuple);
-    });
-  } else {
-    return cute::make_tuple(get<0>(flat_tuple), take<1, decltype(rank(flat_tuple))::value>(flat_tuple));
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-}  // end namespace detail
-
-// Unflatten a flat tuple into a hierarchical tuple
-// @pre flatten(@a flat_tuple) == @a flat_tuple
-// @pre rank(flatten(@a target_profile)) == rank(@a flat_tuple)
-// @post congruent(@a result, @a target_profile)
-// @post flatten(@a result) == @a flat_tuple
-template <class FlatTuple, class TargetProfile>
-CUTE_HOST_DEVICE constexpr
-auto
-unflatten(FlatTuple const& flat_tuple, TargetProfile const& target_profile)
-{
-  auto [unflatten_tuple, flat_remainder] = detail::unflatten_impl(flat_tuple, target_profile);
-  CUTE_STATIC_ASSERT_V(rank(flat_remainder) == Int<0>{});
-  return unflatten_tuple;
-}
-
-//
-// insert and remove and replace
-//
-
-namespace detail {
-
-// Shortcut around cute::tuple_cat for common insert/remove/repeat cases
-template <class T, class X, int... I, int... J, int... K>
-CUTE_HOST_DEVICE constexpr
-auto
-construct(T const& t, X const& x, seq<I...>, seq<J...>, seq<K...>)
-{
-  return cute::make_tuple(get<I>(t)..., (void(J),x)..., get<K>(t)...);
-}
-
-} // end namespace detail
-
-// Insert x into the Nth position of the tuple
-template <int N, class T, class X>
-CUTE_HOST_DEVICE constexpr
-auto
-insert(T const& t, X const& x)
-{
-  return detail::construct(t, x, make_seq<N>{}, seq<0>{}, make_range<N,tuple_size<T>::value>{});
-}
-
-// Remove the Nth element of the tuple
-template <int N, class T>
-CUTE_HOST_DEVICE constexpr
-auto
-remove(T const& t)
-{
-  return detail::construct(t, 0, make_seq<N>{}, seq<>{}, make_range<N+1,tuple_size<T>::value>{});
-}
-
-// Replace the Nth element of the tuple with x
-template <int N, class T, class X>
-CUTE_HOST_DEVICE constexpr
-auto
-replace(T const& t, X const& x)
-{
-  if constexpr (is_tuple<T>::value) {
-    return detail::construct(t, x, make_seq<N>{}, seq<0>{}, make_range<N+1,tuple_size<T>::value>{});
-  } else {
-    static_assert(N == 0);
-    return x;
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-// Replace the first element of the tuple with x
-template <class T, class X>
-CUTE_HOST_DEVICE constexpr
-auto
-replace_front(T const& t, X const& x)
-{
-  if constexpr (is_tuple<T>::value) {
-    return detail::construct(t, x, seq<>{}, seq<0>{}, make_range<1,tuple_size<T>::value>{});
-  } else {
-    return x;
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-// Replace the last element of the tuple with x
-template <class T, class X>
-CUTE_HOST_DEVICE constexpr
-auto
-replace_back(T const& t, X const& x)
-{
-  if constexpr (is_tuple<T>::value) {
-    return detail::construct(t, x, make_seq<tuple_size<T>::value-1>{}, seq<0>{}, seq<>{});
-  } else {
-    return x;
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-//
-// Make a tuple of Xs of tuple_size N
-//
-
-template <int N, class X>
-CUTE_HOST_DEVICE constexpr
-auto
-tuple_repeat(X const& x)
-{
-  return detail::construct(0, x, seq<>{}, make_seq<N>{}, seq<>{});
-}
-
-//
-// Make repeated Xs of rank N
-//
-
-template <int N, class X>
-CUTE_HOST_DEVICE constexpr
-auto
-repeat(X const& x)
-{
-  if constexpr (N == 1) {
-    return x;
-  } else {
-    return detail::construct(0, x, seq<>{}, make_seq<N>{}, seq<>{});
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-//
-// Make a tuple of Xs the same profile as tuple T
-//
-
-template <class T, class X>
-CUTE_HOST_DEVICE constexpr
-auto
-repeat_like(T const& t, X const& x)
-{
-  if constexpr (is_tuple<T>::value) {
-    return transform(t, [&](auto const& a) { return repeat_like(a,x); });
-  } else {
-    return x;
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-// Group the elements [B,E) of a T into a single element
-// e.g. group<2,4>(T<_1,_2,_3,_4,_5,_6>{})
-//              => T<_1,_2,T<_3,_4>,_5,_6>{}
-template <int B, int E, class T>
-CUTE_HOST_DEVICE constexpr
-auto
-group(T const& t)
-{
-  if constexpr (not is_tuple<T>::value) {
-    if constexpr (E == -1) {
-      return group<B,1>(t);
-    } else {
-      return detail::construct(t, take<B,E>(t), make_seq<B>{}, make_seq<(B < E)>{}, make_range<E,1>{});
-    }
-  } else
-  if constexpr (E == -1) {
-    return group<B,tuple_size<T>::value>(t);
-  } else
-  if constexpr (B <= E) {
-    return detail::construct(t, take<B,E>(t), make_seq<B>{}, make_seq<(B < E)>{}, make_range<E,tuple_size<T>::value>{});
-  } else {
-    static_assert(B <= E);
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-//
-// Extend a T to rank N by appending/prepending an element
-//
-
-template <int N, class T, class X>
-CUTE_HOST_DEVICE constexpr
-auto
-append(T const& a, X const& x)
-{
-  if constexpr (is_tuple<T>::value) {
-    if constexpr (N == tuple_size<T>::value) {
-      return a;
-    } else {
-      static_assert(N > tuple_size<T>::value);
-      return detail::construct(a, x, make_seq<tuple_size<T>::value>{}, make_seq<N-tuple_size<T>::value>{}, seq<>{});
-    }
-  } else {
-    if constexpr (N == 1) {
-      return a;
-    } else {
-      return detail::construct(cute::make_tuple(a), x, seq<0>{}, make_seq<N-1>{}, seq<>{});
-    }
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <class T, class X>
-CUTE_HOST_DEVICE constexpr
-auto
-append(T const& a, X const& x)
-{
-  if constexpr (is_tuple<T>::value) {
-    return detail::construct(a, x, make_seq<tuple_size<T>::value>{}, seq<0>{}, seq<>{});
-  } else {
-    return cute::make_tuple(a, x);
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <int N, class T, class X>
-CUTE_HOST_DEVICE constexpr
-auto
-prepend(T const& a, X const& x)
-{
-  if constexpr (is_tuple<T>::value) {
-    if constexpr (N == tuple_size<T>::value) {
-      return a;
-    } else {
-      static_assert(N > tuple_size<T>::value);
-      return detail::construct(a, x, seq<>{}, make_seq<N-tuple_size<T>::value>{}, make_seq<tuple_size<T>::value>{});
-    }
-  } else {
-    if constexpr (N == 1) {
-      return a;
-    } else {
-      static_assert(N > 1);
-      return detail::construct(cute::make_tuple(a), x, seq<>{}, make_seq<N-1>{}, seq<0>{});
-    }
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <class T, class X>
-CUTE_HOST_DEVICE constexpr
-auto
-prepend(T const& a, X const& x)
-{
-  if constexpr (is_tuple<T>::value) {
-    return detail::construct(a, x, seq<>{}, seq<0>{}, make_seq<tuple_size<T>::value>{});
-  } else {
-    return cute::make_tuple(x, a);
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-//
-// Inclusive scan (prefix sum)
-//
-
-namespace detail {
-
-template <class T, class V, class F, int I, int... Is>
-CUTE_HOST_DEVICE constexpr
-auto
-iscan(T const& t, V const& v, F&& f, seq<I,Is...>)
-{
-  // Apply the function to v and the element at I
-  auto v_next = f(v, get<I>(t));
-  // Replace I with v_next
-  auto t_next = replace<I>(t, v_next);
-
-#if 0
-  std::cout << "ISCAN i" << I << std::endl;
-  std::cout << "  t      " << t << std::endl;
-  std::cout << "  i      " << v << std::endl;
-  std::cout << "  f(i,t) " << v_next << std::endl;
-  std::cout << "  t_n    " << t_next << std::endl;
-#endif
-
-  if constexpr (sizeof...(Is) == 0) {
-    return t_next;
-  } else {
-    return iscan(t_next, v_next, f, seq<Is...>{});
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-} // end namespace detail
-
-template <class T, class V, class F>
-CUTE_HOST_DEVICE constexpr
-auto
-iscan(T const& t, V const& v, F&& f)
-{
-  return detail::iscan(t, v, f, tuple_seq<T>{});
-}
-
-//
-// Exclusive scan (prefix sum)
-//
-
-namespace detail {
-
-template <class T, class V, class F, int I, int... Is>
-CUTE_HOST_DEVICE constexpr
-auto
-escan(T const& t, V const& v, F&& f, seq<I,Is...>)
-{
-  if constexpr (sizeof...(Is) == 0) {
-    // Replace I with v
-    return replace<I>(t, v);
-  } else {
-    // Apply the function to v and the element at I
-    auto v_next = f(v, get<I>(t));
-    // Replace I with v
-    auto t_next = replace<I>(t, v);
-
-#if 0
-    std::cout << "ESCAN i" << I << std::endl;
-    std::cout << "  t      " << t << std::endl;
-    std::cout << "  i      " << v << std::endl;
-    std::cout << "  f(i,t) " << v_next << std::endl;
-    std::cout << "  t_n    " << t_next << std::endl;
-#endif
-
-    // Recurse
-    return escan(t_next, v_next, f, seq<Is...>{});
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-} // end namespace detail
-
-template <class T, class V, class F>
-CUTE_HOST_DEVICE constexpr
-auto
-escan(T const& t, V const& v, F&& f)
-{
-  return detail::escan(t, v, f, tuple_seq<T>{});
-}
-
-//
-// Zip (Transpose)
-//
-
-// Take       ((a,b,c,...),(x,y,z,...),...)        rank-R0 x rank-R1 input
-// to produce ((a,x,...),(b,y,...),(c,z,...),...)  rank-R1 x rank-R0 output
-
-namespace detail {
-
-template <int J, class... Ts>
-CUTE_HOST_DEVICE constexpr
-auto
-zip_(Ts const&... ts)
-{
-  return cute::make_tuple(get<J>(ts)...);
-}
-
-template <class T, int... Is, int... Js>
-CUTE_HOST_DEVICE constexpr
-auto
-zip(T const& t, seq<Is...>, seq<Js...>)
-{
-  static_assert(conjunction<bool_constant<tuple_size<tuple_element_t<0,T>>::value == tuple_size<tuple_element_t<Is,T>>::value>...>::value, "Mismatched Ranks");
-  return cute::make_tuple(zip_<Js>(get<Is>(t)...)...);
-}
-
-} // end namespace detail
-
-template <class T>
-CUTE_HOST_DEVICE constexpr
-auto
-zip(T const& t)
-{
-  if constexpr (is_tuple<T>::value) {
-    if constexpr (is_tuple<tuple_element_t<0,T>>::value) {
-      return detail::zip(t, tuple_seq<T>{}, tuple_seq<tuple_element_t<0,T>>{});
-    } else {
-      return cute::make_tuple(t);
-    }
-  } else {
-    return t;
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-// Convenient to pass them in separately
-template <class T0, class T1, class... Ts>
-CUTE_HOST_DEVICE constexpr
-auto
-zip(T0 const& t0, T1 const& t1, Ts const&... ts)
-{
-  return zip(cute::make_tuple(t0, t1, ts...));
-}
-
-//
-// zip2_by -- A guided zip for rank-2 tuples
-//   Take a tuple like ((A,a),((B,b),(C,c)),d)
-//   and produce a tuple ((A,(B,C)),(a,(b,c),d))
-//   where the rank-2 modes are selected by the terminals of the guide (X,(X,X))
-//
-
-namespace detail {
-
-template <class T, class TG, int... Is, int... Js>
-CUTE_HOST_DEVICE constexpr
-auto
-zip2_by(T const& t, TG const& guide, seq<Is...>, seq<Js...>)
-{
-  // zip2_by produces the modes like ((A,a),(B,b),...)
-  auto split = cute::make_tuple(zip2_by(get<Is>(t), get<Is>(guide))...);
-
-  // Rearrange and append missing modes from t to make ((A,B,...),(a,b,...,x,y))
-  return cute::make_tuple(cute::make_tuple(get<0>(get<Is>(split))...),
-                          cute::make_tuple(get<1>(get<Is>(split))..., get<Js>(t)...));
-}
-
-} // end namespace detail
-
-template <class T, class TG>
-CUTE_HOST_DEVICE constexpr
-auto
-zip2_by(T const& t, TG const& guide)
-{
-  if constexpr (is_tuple<TG>::value) {
-    constexpr int TR = tuple_size<T>::value;
-    constexpr int GR = tuple_size<TG>::value;
-    static_assert(TR >= GR, "Mismatched ranks");
-    return detail::zip2_by(t, guide,
-                           make_range< 0, GR>{},
-                           make_range<GR, TR>{});
-  } else {
-    static_assert(tuple_size<T>::value == 2, "Mismatched ranks");
-    return t;
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-/// @return A tuple of the elements of @c t in reverse order.
-template <class T>
-CUTE_HOST_DEVICE constexpr
-auto
-reverse(T const& t)
-{
-  if constexpr (is_tuple<T>::value) {
-    return detail::apply(t, [](auto const&... a){ return cute::make_tuple(a...); }, tuple_rseq<T>{});
-  } else {
-    return t;
-  }
-}
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/arch/cluster_sm90.hpp b/lightllm-kernel/cutlass/include/cute/arch/cluster_sm90.hpp
deleted file mode 100755
index 8fff51be8..000000000
--- a/lightllm-kernel/cutlass/include/cute/arch/cluster_sm90.hpp
+++ /dev/null
@@ -1,245 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>
-
-// Config
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && \
-  ((__CUDACC_VER_MAJOR__ >= 12) || ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 8))))
-#  define CUTE_ARCH_CLUSTER_SM90_ENABLED
-#endif
-
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && (__CUDACC_VER_MAJOR__ >= 12))
-#  define CUTE_ARCH_ELECT_ONE_SM90_ENABLED
-#endif
-
-namespace cute {
-
-CUTE_DEVICE void cluster_arrive_relaxed()
-{
-#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED)
-  asm volatile("barrier.cluster.arrive.relaxed.aligned;\n" : : );
-#else
-  CUTE_INVALID_CONTROL_PATH("CUTE_ARCH_CLUSTER_SM90_ENABLED is not defined");
-#endif
-}
-
-CUTE_DEVICE void cluster_arrive()
-{
-#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED)
-  asm volatile("barrier.cluster.arrive.aligned;\n" : : );
-#else
-  CUTE_INVALID_CONTROL_PATH("CUTE_ARCH_CLUSTER_SM90_ENABLED is not defined");
-#endif
-}
-
-CUTE_DEVICE void cluster_wait()
-{
-#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED)
-  asm volatile("barrier.cluster.wait.aligned;\n" : : );
-#else
-  CUTE_INVALID_CONTROL_PATH("CUTE_ARCH_CLUSTER_SM90_ENABLED is not defined");
-#endif
-}
-
-CUTE_DEVICE void cluster_sync()
-{
-#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED)
-  cluster_arrive();
-  cluster_wait();
-#else
-  CUTE_INVALID_CONTROL_PATH("CUTE_ARCH_CLUSTER_SM90_ENABLED is not defined");
-#endif
-}
-
-// Returns the dim3 grid size in terms of number of clusters.
-CUTE_DEVICE dim3 cluster_grid_dims()
-{
-#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED)
-  uint32_t x, y, z;
-  asm volatile("mov.u32 %0, %%nclusterid.x;\n" : "=r"(x) : );
-  asm volatile("mov.u32 %0, %%nclusterid.y;\n" : "=r"(y) : );
-  asm volatile("mov.u32 %0, %%nclusterid.z;\n" : "=r"(z) : );
-  return {x, y, z};
-#elif defined(__CUDA_ARCH__)
-  // MSVC requires protecting use of gridDim with __CUDA_ARCH__.
-  return gridDim;
-#elif defined(_MSC_VER)
-  CUTE_INVALID_CONTROL_PATH("cluster_grid_dims() can only be called on device");
-  return {0, 0, 0};
-#else
-  return {0, 0, 0};
-#endif
-}
-
-// Returns the dim3 cluster rank in the grid.
-CUTE_DEVICE dim3 cluster_id_in_grid()
-{
-#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED)
-  uint32_t x, y, z;
-  asm volatile("mov.u32 %0, %%clusterid.x;\n" : "=r"(x) : );
-  asm volatile("mov.u32 %0, %%clusterid.y;\n" : "=r"(y) : );
-  asm volatile("mov.u32 %0, %%clusterid.z;\n" : "=r"(z) : );
-  return {x, y, z};
-#elif defined(__CUDA_ARCH__)
-  // MSVC requires protecting use of blockIdx with __CUDA_ARCH__.
-  return blockIdx;
-#elif defined(_MSC_VER)
-  CUTE_INVALID_CONTROL_PATH("cluster_id_in_grid() can only be called on device");
-  return {0, 0, 0};
-#else
-  return {0, 0, 0};
-#endif
-}
-
-// Returns the relative dim3 block rank local to the cluster.
-CUTE_DEVICE dim3 block_id_in_cluster()
-{
-#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED)
-  uint32_t x, y, z;
-  asm volatile("mov.u32 %0, %%cluster_ctaid.x;\n" : "=r"(x) : );
-  asm volatile("mov.u32 %0, %%cluster_ctaid.y;\n" : "=r"(y) : );
-  asm volatile("mov.u32 %0, %%cluster_ctaid.z;\n" : "=r"(z) : );
-  return {x, y, z};
-#else
-  return {0,0,0};
-#endif
-}
-
-// Returns the dim3 cluster shape.
-CUTE_DEVICE dim3 cluster_shape()
-{
-#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED)
-  uint32_t x, y, z;
-  asm volatile("mov.u32 %0, %%cluster_nctaid.x;\n" : "=r"(x) : );
-  asm volatile("mov.u32 %0, %%cluster_nctaid.y;\n" : "=r"(y) : );
-  asm volatile("mov.u32 %0, %%cluster_nctaid.z;\n" : "=r"(z) : );
-  return {x, y, z};
-#else
-  return {1,1,1};
-#endif
-}
-
-// Get 1D ctaid in a cluster.
-CUTE_DEVICE uint32_t block_rank_in_cluster()
-{
-#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED)
-  uint32_t rank;
-  asm volatile("mov.u32 %0, %%cluster_ctarank;\n" : "=r"(rank) :);
-  return rank;
-#else
-  return 0;
-#endif
-}
-
-// Set the destination block-ID in cluster for a given SMEM Address
-CUTE_DEVICE uint32_t set_block_rank(uint32_t smemAddr, uint32_t rank)
-{
-#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED)
-  uint32_t result;
-  asm volatile("mapa.shared::cluster.u32  %0, %1, %2;\n"
-              : "=r"(result)
-              : "r"(smemAddr), "r"(rank));
-  return result;
-#else
-  return smemAddr;
-#endif
-}
-
-// Elect one thread in the warp. The elected thread gets its predicate set to true, all others obtain false.
-CUTE_HOST_DEVICE uint32_t elect_one_sync()
-{
-#if defined(CUTE_ARCH_ELECT_ONE_SM90_ENABLED)
-  uint32_t pred = 0;
-  uint32_t laneid = 0;
-  asm volatile(
-    "{\n"
-    ".reg .b32 %%rx;\n"
-    ".reg .pred %%px;\n"
-    "     elect.sync %%rx|%%px, %2;\n"
-    "@%%px mov.s32 %1, 1;\n"
-    "     mov.s32 %0, %%rx;\n"
-    "}\n"
-    : "+r"(laneid), "+r"(pred)
-    : "r"(0xFFFFFFFF));
-  return pred;
-#elif defined(__CUDA_ARCH__)
-  return (threadIdx.x % 32) == 0;
-#else
-  return true;
-#endif
-}
-
-struct ElectOneLaneIdReturnType {
-  uint32_t is_leader;
-  uint32_t leader_lane_id;
-};
-
-CUTE_HOST_DEVICE
-ElectOneLaneIdReturnType
-elect_one_leader_sync()
-{
-#if defined(CUTE_ARCH_ELECT_ONE_SM90_ENABLED)
-  uint32_t pred = 0;
-  uint32_t laneid = 0;
-  asm volatile(
-    "{\n"
-    ".reg .b32 %%rx;\n"
-    ".reg .pred %%px;\n"
-    "     elect.sync %%rx|%%px, %2;\n"
-    "@%%px mov.s32 %1, 1;\n"
-    "     mov.s32 %0, %%rx;\n"
-    "}\n"
-    : "+r"(laneid), "+r"(pred)
-    : "r"(0xFFFFFFFF));
-  return {pred, laneid};
-#elif defined(__CUDA_ARCH__)
-  return {(threadIdx.x % 32) == 0, 0};
-#else
-  return {true, 0};
-#endif
-}
-
-// Store value to remote shared memory in the cluster
-CUTE_DEVICE
-void
-store_shared_remote(uint32_t value, uint32_t smem_addr, uint32_t mbarrier_addr, uint32_t dst_cta_rank)
-{
-#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED)
-  uint32_t dsmem_addr = set_block_rank(smem_addr, dst_cta_rank);
-  uint32_t remote_barrier_addr = set_block_rank(mbarrier_addr, dst_cta_rank);
-  asm volatile("st.async.shared::cluster.mbarrier::complete_tx::bytes.u32 [%0], %1, [%2];"
-               : : "r"(dsmem_addr), "r"(value), "r"(remote_barrier_addr));
-#endif
-}
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/arch/config.hpp b/lightllm-kernel/cutlass/include/cute/arch/config.hpp
deleted file mode 100755
index 84d7779a3..000000000
--- a/lightllm-kernel/cutlass/include/cute/arch/config.hpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cutlass/arch/config.h> // CUTLASS_ARCH_MMA_SMxx_ENABLED
-
-// TMA instructions
-#if defined(CUTLASS_ARCH_MMA_SM90_ENABLED)
-#  define CUTE_ARCH_TMA_SM90_ENABLED
-#endif
-
-#if defined(CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90_ENABLED)
-#  define CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED
-#endif
-
-// STSM
-#if defined(CUTLASS_ARCH_MMA_SM90_ENABLED)
-#  define CUTE_ARCH_STSM_SM90_ENABLED
-#endif
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
diff --git a/lightllm-kernel/cutlass/include/cute/arch/copy.hpp b/lightllm-kernel/cutlass/include/cute/arch/copy.hpp
deleted file mode 100755
index 513928999..000000000
--- a/lightllm-kernel/cutlass/include/cute/arch/copy.hpp
+++ /dev/null
@@ -1,107 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>
-
-#include <cute/arch/util.hpp>
-#include <cute/numeric/numeric_types.hpp>
-
-namespace cute
-{
-
-//
-// Direct Copy for any type
-//
-
-template <class S, class D = S>
-struct UniversalCopy
-{
-  using SRegisters = S[1];
-  using DRegisters = D[1];
-
-  template <class S_, class D_>
-  CUTE_HOST_DEVICE static constexpr void
-  copy(S_ const& src,
-       D_      & dst)
-  {
-    dst = static_cast<D>(static_cast<S>(src));
-  }
-
-  // Accept mutable temporaries
-  template <class S_, class D_>
-  CUTE_HOST_DEVICE static constexpr void
-  copy(S_ const& src,
-       D_     && dst)
-  {
-    UniversalCopy<S,D>::copy(src, dst);
-  }
-};
-
-//
-// Placeholder for the copy algorithm's stronger auto-vectorizing behavior
-//   that assumes alignment of pointers and dynamic layouts up to MaxVecBits
-//
-
-template <int MaxVecBits = 128>
-struct AutoVectorizingCopyWithAssumedAlignment
-     : UniversalCopy<uint_bit_t<MaxVecBits>>
-{
-  static_assert(MaxVecBits == 8 || MaxVecBits == 16 || MaxVecBits == 32 || MaxVecBits == 64 || MaxVecBits == 128,
-                "Expected MaxVecBits to be 8 or 16 or 32 or 64 or 128 for alignment and performance.");
-};
-
-//
-// AutoVectorizingCopy alias assumes maximal alignment of pointers and dynamic strides.
-//   If this is not the case then AutoVectorizingCopyWithAssumedAlignment should be used instead
-//
-
-using AutoVectorizingCopy = AutoVectorizingCopyWithAssumedAlignment<128>;
-
-//
-// DefaultCopy alias does not assume alignment of pointers or dynamic strides.
-//
-
-using DefaultCopy = AutoVectorizingCopyWithAssumedAlignment<8>;
-
-//
-// Global memory prefetch into L2
-//
-
-CUTE_HOST_DEVICE static void
-prefetch(void const* gmem_ptr)
-{
-#if defined(__CUDA_ARCH__)
-  asm volatile("prefetch.global.L2 [%0];\n" : : "l"(gmem_ptr) : "memory");
-#endif
-}
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/arch/copy_sm50.hpp b/lightllm-kernel/cutlass/include/cute/arch/copy_sm50.hpp
deleted file mode 100755
index 925d9ebe3..000000000
--- a/lightllm-kernel/cutlass/include/cute/arch/copy_sm50.hpp
+++ /dev/null
@@ -1,98 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>
-
-#include <cute/arch/copy.hpp>
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 500
-  #define CUTE_ARCH_WARP_SHUFFLE_ENABLED 1
-#endif
-
-namespace cute
-{
-// Shuffle data between thread pair (0, 1), (2, 3), etc.
-struct SM50_Shuffle_U32_2x2Trans_XOR1
-{
-  using SRegisters = uint32_t[2];
-  using DRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  copy(uint32_t const& src0, uint32_t const& src1, uint32_t& dst0, uint32_t& dst1)
-  {
-#if defined(CUTE_ARCH_WARP_SHUFFLE_ENABLED)
-    uint32_t x0 = src0;
-    uint32_t y0 = __shfl_xor_sync(0xffffffff, x0, 1);
-
-    uint32_t x1 = src1;
-    uint32_t y1 = __shfl_xor_sync(0xffffffff, x1, 1);
-
-    if (threadIdx.x % 2 == 0) {
-      dst1 = y0;
-    } 
-    else {
-      dst0 = y1;
-    }
-#else 
-    CUTE_INVALID_CONTROL_PATH("Trying to use __shfl_xor_sync without CUTE_ARCH_WARP_SHUFFLE_ENABLED.");
-#endif
-  }
-};
-
-// Shuffle data between thread pair (0, 4), (1, 5), etc.
-struct SM50_Shuffle_U32_2x2Trans_XOR4
-{
-  using SRegisters = uint32_t[2];
-  using DRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  copy(uint32_t const& src0, uint32_t const& src1, uint32_t& dst0, uint32_t& dst1)
-  {
-#if defined(CUTE_ARCH_WARP_SHUFFLE_ENABLED)
-    uint32_t x0 = threadIdx.x & 4  ? src0 : src1;
-    uint32_t y0 = __shfl_xor_sync(0xffffffff, x0, 4);
-
-    // Replace detination register with shuffle result.
-    if (threadIdx.x & 0x4) {
-      dst0 = y0;
-    } 
-    else {
-      dst1 = y0;
-    }
-#else 
-    CUTE_INVALID_CONTROL_PATH("Trying to use __shfl_xor_sync without CUTE_ARCH_WARP_SHUFFLE_ENABLED.");
-#endif
-  }
-};
-
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/arch/copy_sm75.hpp b/lightllm-kernel/cutlass/include/cute/arch/copy_sm75.hpp
deleted file mode 100755
index 3d3d37acb..000000000
--- a/lightllm-kernel/cutlass/include/cute/arch/copy_sm75.hpp
+++ /dev/null
@@ -1,236 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>
-
-#include <cute/arch/copy.hpp>
-
-// Config
-#if defined(__clang__) && defined(__CUDA__)
-  // ldmatrix PTX instructions added in Clang 14: https://reviews.llvm.org/D107046
-  // ... but will not work until Clang 15:
-  //   * https://reviews.llvm.org/D121666
-  //   * https://reviews.llvm.org/D126846
-  #define CUTE_ARCH_CLANG_SUPPORTS_LDSM_SM75 (__clang_major__ >= 15)
-#endif
-
-#if defined(__NVCC__) || defined(__CUDACC_RTC__)
-  // ldmatrix PTX instruction added in CUDA 10.2+
-  #define CUTE_ARCH_NVCC_SUPPORTS_LDSM_SM75 ((__CUDACC_VER_MAJOR__  == 10 && __CUDACC_VER_MINOR__ >= 2) || __CUDACC_VER_MAJOR__ >= 11)
-#endif
-
-#if ! defined(CUTE_ARCH_LDSM_SM75_SUPPORTED)
-  #define CUTE_ARCH_LDSM_SM75_SUPPORTED (CUTE_ARCH_NVCC_SUPPORTS_LDSM_SM75 || CUTE_ARCH_CLANG_SUPPORTS_LDSM_SM75)
-#endif
-
-#if ! defined(CUTE_ARCH_LDSM_SM75_ENABLED)
-  #define CUTE_ARCH_LDSM_SM75_ENABLED (CUTE_ARCH_LDSM_SM75_SUPPORTED)
-#endif
-
-#if (CUTE_ARCH_LDSM_SM75_ENABLED) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 750
-  #define CUTE_ARCH_LDSM_SM75_ACTIVATED 1
-#endif
-
-namespace cute
-{
-
-struct SM75_U32x1_LDSM_N
-{
-  using SRegisters = uint128_t[1];
-  using DRegisters = uint32_t[1];
-
-  CUTE_HOST_DEVICE static void
-  copy(uint128_t const& smem_src,
-       uint32_t& dst)
-  {
-#if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED)
-    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_src);
-    asm volatile ("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];\n"
-        : "=r"(dst)
-        :  "r"(smem_int_ptr));
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use ldmatrix without CUTE_ARCH_LDSM_SM75_ACTIVATED.");
-#endif
-  }
-};
-
-struct SM75_U32x2_LDSM_N
-{
-  using SRegisters = uint128_t[1];
-  using DRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  copy(uint128_t const& smem_src,
-       uint32_t& dst0, uint32_t& dst1)
-  {
-#if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED)
-    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_src);
-    asm volatile ("ldmatrix.sync.aligned.x2.m8n8.shared.b16 {%0, %1}, [%2];\n"
-        : "=r"(dst0), "=r"(dst1)
-        :  "r"(smem_int_ptr));
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use ldmatrix without CUTE_ARCH_LDSM_SM75_ACTIVATED.");
-#endif
-  }
-};
-
-struct SM75_U32x4_LDSM_N
-{
-  using SRegisters = uint128_t[1];
-  using DRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  copy(uint128_t const& smem_src,
-       uint32_t& dst0, uint32_t& dst1, uint32_t& dst2, uint32_t& dst3)
-  {
-#if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED)
-    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_src);
-    asm volatile ("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];\n"
-        : "=r"(dst0), "=r"(dst1), "=r"(dst2), "=r"(dst3)
-        :  "r"(smem_int_ptr));
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use ldmatrix without CUTE_ARCH_LDSM_SM75_ACTIVATED.");
-#endif
-  }
-};
-
-struct SM75_U16x2_LDSM_T
-{
-  using SRegisters = uint128_t[1];
-  using DRegisters = uint32_t[1];
-
-  CUTE_HOST_DEVICE static void
-  copy(uint128_t const& smem_src,
-       uint32_t& dst)
-  {
-#if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED)
-    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_src);
-    asm volatile ("ldmatrix.sync.aligned.x1.trans.m8n8.shared.b16 {%0}, [%1];\n"
-        : "=r"(dst)
-        :  "r"(smem_int_ptr));
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use ldmatrix without CUTE_ARCH_LDSM_SM75_ACTIVATED.");
-#endif
-  }
-};
-
-struct SM75_U16x4_LDSM_T
-{
-  using SRegisters = uint128_t[1];
-  using DRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  copy(uint128_t const& smem_src,
-       uint32_t& dst0, uint32_t& dst1)
-  {
-#if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED)
-    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_src);
-    asm volatile ("ldmatrix.sync.aligned.x2.trans.m8n8.shared.b16 {%0, %1}, [%2];\n"
-        : "=r"(dst0), "=r"(dst1)
-        :  "r"(smem_int_ptr));
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use ldmatrix without CUTE_ARCH_LDSM_SM75_ACTIVATED.");
-#endif
-  }
-};
-
-struct SM75_U16x8_LDSM_T
-{
-  using SRegisters = uint128_t[1];
-  using DRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  copy(uint128_t const& smem_src,
-       uint32_t& dst0, uint32_t& dst1, uint32_t& dst2, uint32_t& dst3)
-  {
-#if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED)
-    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_src);
-    asm volatile ("ldmatrix.sync.aligned.x4.trans.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];\n"
-        : "=r"(dst0), "=r"(dst1), "=r"(dst2), "=r"(dst3)
-        :  "r"(smem_int_ptr));
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use ldmatrix without CUTE_ARCH_LDSM_SM75_ACTIVATED.");
-#endif
-  }
-};
-
-//
-// Legacy LDSM interfaces that aren't very useful
-//
-
-template <class T>
-CUTE_HOST_DEVICE
-void
-copy_ldsm(uint128_t const* const smem_ptr,
-          T* rmem_ptr)
-{
-  uint32_t* reg_ptr = reinterpret_cast<uint32_t*>(rmem_ptr);
-
-  // if constexpr
-  if (sizeof(T) == 4) {
-    SM75_U32x1_LDSM_N::copy(smem_ptr[0], reg_ptr[0]);
-  }
-  else if (sizeof(T) == 8) {
-    SM75_U32x2_LDSM_N::copy(smem_ptr[0], reg_ptr[0], reg_ptr[1]);
-  }
-  else if (sizeof(T) == 16) {
-    SM75_U32x4_LDSM_N::copy(smem_ptr[0], reg_ptr[0], reg_ptr[1], reg_ptr[2], reg_ptr[3]);
-  }
-  else {
-    static_assert(sizeof(T) == 4 || sizeof(T) == 8 || sizeof(T) == 16, "sizeof(T) is not supported");
-  }
-}
-
-template <class T>
-CUTE_HOST_DEVICE
-void
-copy_ldsm_trans(uint128_t const* const smem_ptr,
-                T* rmem_ptr)
-{
-  uint32_t* reg_ptr = reinterpret_cast<uint32_t*>(rmem_ptr);
-
-  // if constexpr
-  if (sizeof(T) == 4) {
-    SM75_U16x2_LDSM_T::copy(smem_ptr[0], reg_ptr[0]);
-  }
-  else if (sizeof(T) == 8) {
-    SM75_U16x4_LDSM_T::copy(smem_ptr[0], reg_ptr[0], reg_ptr[1]);
-  }
-  else if (sizeof(T) == 16) {
-    SM75_U16x8_LDSM_T::copy(smem_ptr[0], reg_ptr[0], reg_ptr[1], reg_ptr[2], reg_ptr[3]);
-  }
-  else {
-    static_assert(sizeof(T) == 4 || sizeof(T) == 8 || sizeof(T) == 16, "sizeof(T) is not supported");
-  }
-}
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/arch/copy_sm80.hpp b/lightllm-kernel/cutlass/include/cute/arch/copy_sm80.hpp
deleted file mode 100755
index e04181bfe..000000000
--- a/lightllm-kernel/cutlass/include/cute/arch/copy_sm80.hpp
+++ /dev/null
@@ -1,198 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>
-
-#include <cute/arch/copy.hpp>
-
-// Config
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
-#  define CUTE_ARCH_CP_ASYNC_SM80_ENABLED
-#endif
-
-namespace cute
-{
-
-/// Copy via cp.async with caching at all levels
-template <class TS, class TD = TS>
-struct SM80_CP_ASYNC_CACHEALWAYS
-{
-  using SRegisters = TS[1];
-  using DRegisters = TD[1];
-
-  static_assert(sizeof(TS) == sizeof(TD), "cp.async requires sizeof(src_value_type) == sizeof(dst_value_type)");
-  static_assert(sizeof(TS) == 4 || sizeof(TS) == 8 || sizeof(TS) == 16, "cp.async sizeof(TS) is not supported");
-
-  CUTE_HOST_DEVICE static void
-  copy(TS const& gmem_src,
-       TD      & smem_dst)
-  {
-#if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED)
-    TS const* gmem_ptr    = &gmem_src;
-    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst);
-    asm volatile("cp.async.ca.shared.global.L2::128B [%0], [%1], %2;\n"
-        :: "r"(smem_int_ptr),
-           "l"(gmem_ptr),
-           "n"(sizeof(TS)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Support for cp.async instructions has not been enabled");
-#endif
-  }
-};
-
-/// Copy via cp.async with caching at global level
-template <class TS, class TD = TS>
-struct SM80_CP_ASYNC_CACHEGLOBAL
-{
-  using SRegisters = TS[1];
-  using DRegisters = TD[1];
-
-  static_assert(sizeof(TS) == sizeof(TD), "cp.async requires sizeof(src_value_type) == sizeof(dst_value_type)");
-  static_assert(sizeof(TS) == 16, "cp.async sizeof(TS) is not supported");
-
-  CUTE_HOST_DEVICE static void
-  copy(TS const& gmem_src,
-       TD      & smem_dst)
-  {
-#if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED)
-    TS const* gmem_ptr    = &gmem_src;
-    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst);
-    asm volatile("cp.async.cg.shared.global.L2::128B [%0], [%1], %2;\n"
-        :: "r"(smem_int_ptr),
-           "l"(gmem_ptr),
-           "n"(sizeof(TS)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Support for cp.async instructions has not been enabled");
-#endif
-  }
-};
-
-/// Copy via cp.async with caching at all levels
-template <class TS, class TD = TS>
-struct SM80_CP_ASYNC_CACHEALWAYS_ZFILL
-{
-  using SRegisters = TS[1];
-  using DRegisters = TD[1];
-
-  static_assert(sizeof(TS) == sizeof(TD), "cp.async requires sizeof(src_value_type) == sizeof(dst_value_type)");
-  static_assert(sizeof(TS) == 4 || sizeof(TS) == 8 || sizeof(TS) == 16, "cp.async sizeof(TS) is not supported");
-
-  CUTE_HOST_DEVICE static void
-  copy(TS const& gmem_src,
-       TD      & smem_dst,
-       bool      pred)
-  {
-#if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED)
-    TS const* gmem_ptr    = &gmem_src;
-    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst);
-    int src_size = pred ? sizeof(TS) : 0;
-    asm volatile("cp.async.ca.shared.global.L2::128B [%0], [%1], %2, %3;\n"
-        :: "r"(smem_int_ptr),
-           "l"(gmem_ptr),
-           "n"(sizeof(TS)),
-           "r"(src_size));
-#else
-    CUTE_INVALID_CONTROL_PATH("Support for cp.async instructions has not been enabled");
-#endif
-  }
-};
-
-/// Copy via cp.async with caching at global level
-template <class TS, class TD = TS>
-struct SM80_CP_ASYNC_CACHEGLOBAL_ZFILL
-{
-  using SRegisters = TS[1];
-  using DRegisters = TD[1];
-
-  static_assert(sizeof(TS) == sizeof(TD), "cp.async requires sizeof(src_value_type) == sizeof(dst_value_type)");
-  static_assert(sizeof(TS) == 16, "cp.async sizeof(TS) is not supported");
-
-  CUTE_HOST_DEVICE static void
-  copy(TS const& gmem_src,
-       TD      & smem_dst,
-       bool      pred)
-  {
-#if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED)
-    TS const* gmem_ptr    = &gmem_src;
-    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst);
-    int src_size = pred ? sizeof(TS) : 0;
-    asm volatile("cp.async.cg.shared.global.L2::128B [%0], [%1], %2, %3;\n"
-        :: "r"(smem_int_ptr),
-           "l"(gmem_ptr),
-           "n"(sizeof(TS)),
-           "r"(src_size));
-#else
-    CUTE_INVALID_CONTROL_PATH("Support for cp.async instructions has not been enabled");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Establishes an ordering w.r.t previously issued cp.async instructions. Does not block.
-CUTE_HOST_DEVICE
-void
-cp_async_fence()
-{
-#if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED)
-  asm volatile("cp.async.commit_group;\n" ::);
-#endif
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Blocks until all but N previous cp.async.commit_group operations have committed.
-template <int N>
-CUTE_HOST_DEVICE
-void
-cp_async_wait()
-{
-#if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED)
-  if constexpr (N == 0) {
-    asm volatile("cp.async.wait_all;\n" ::);
-  } else {
-    asm volatile("cp.async.wait_group %0;\n" :: "n"(N));
-  }
-#endif
-}
-
-template <int N>
-CUTE_HOST_DEVICE
-void
-cp_async_wait(Int<N>)
-{
-  return cp_async_wait<N>();
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/arch/copy_sm90.hpp b/lightllm-kernel/cutlass/include/cute/arch/copy_sm90.hpp
deleted file mode 100755
index bcb3b7d19..000000000
--- a/lightllm-kernel/cutlass/include/cute/arch/copy_sm90.hpp
+++ /dev/null
@@ -1,219 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>      // CUTE_HOST_DEVICE
-#include <cute/arch/config.hpp> // CUTE_ARCH_TMA_SMxx_ENABLED
-#include <cute/arch/copy.hpp>
-
-namespace cute
-{
-
-struct SM90_U32x1_STSM_N
-{
-  using SRegisters = uint32_t[1];
-  using DRegisters = uint128_t[1];
-
-  CUTE_HOST_DEVICE static void
-  copy(uint32_t const& src,
-       uint128_t     & smem_dst)
-  {
-#if defined(CUTE_ARCH_STSM_SM90_ENABLED)
-    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst);
-    asm volatile ("stmatrix.sync.aligned.x1.m8n8.shared.b16 [%0], {%1};\n"
-        :: "r"(smem_int_ptr),
-           "r"(src));
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use stmatrix without CUTE_ARCH_STSM_SM90_ENABLED.");
-#endif
-  }
-};
-
-struct SM90_U32x2_STSM_N
-{
-  using SRegisters = uint32_t[2];
-  using DRegisters = uint128_t[1];
-
-  CUTE_HOST_DEVICE static void
-  copy(uint32_t const& src0, uint32_t const& src1,
-       uint128_t& smem_dst)
-  {
-#if defined(CUTE_ARCH_STSM_SM90_ENABLED)
-    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst);
-    asm volatile ("stmatrix.sync.aligned.x2.m8n8.shared.b16 [%0], {%1, %2};\n"
-        :: "r"(smem_int_ptr),
-           "r"(src0), "r"(src1));
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use stmatrix without CUTE_ARCH_STSM_SM90_ENABLED.");
-#endif
-  }
-};
-
-struct SM90_U32x4_STSM_N
-{
-  using SRegisters = uint32_t[4];
-  using DRegisters = uint128_t[1];
-
-  CUTE_HOST_DEVICE static void
-  copy(uint32_t const& src0, uint32_t const& src1, uint32_t const& src2, uint32_t const& src3,
-       uint128_t& smem_dst)
-  {
-#if defined(CUTE_ARCH_STSM_SM90_ENABLED)
-    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst);
-    asm volatile ("stmatrix.sync.aligned.x4.m8n8.shared.b16 [%0], {%1, %2, %3, %4};\n"
-        :: "r"(smem_int_ptr),
-          "r"(src0), "r"(src1), "r"(src2), "r"(src3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use stmatrix without CUTE_ARCH_STSM_SM90_ENABLED.");
-#endif
-  }
-};
-
-struct SM90_U16x2_STSM_T
-{
-  using SRegisters = uint32_t[1];
-  using DRegisters = uint128_t[1];
-
-  CUTE_HOST_DEVICE static void
-  copy(uint32_t const& src,
-       uint128_t& smem_dst)
-  {
-#if defined(CUTE_ARCH_STSM_SM90_ENABLED)
-    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst);
-    asm volatile ("stmatrix.sync.aligned.x1.trans.m8n8.shared.b16 [%0], {%1};\n"
-        :: "r"(smem_int_ptr),
-           "r"(src));
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use stmatrix without CUTE_ARCH_STSM_SM90_ENABLED.");
-#endif
-  }
-};
-
-struct SM90_U16x4_STSM_T
-{
-  using SRegisters = uint32_t[2];
-  using DRegisters = uint128_t[1];
-
-  CUTE_HOST_DEVICE static void
-  copy(uint32_t const& src0, uint32_t const& src1,
-       uint128_t& smem_dst)
-  {
-#if defined(CUTE_ARCH_STSM_SM90_ENABLED)
-    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst);
-    asm volatile ("stmatrix.sync.aligned.x2.trans.m8n8.shared.b16 [%0], {%1, %2};\n"
-        :: "r"(smem_int_ptr),
-           "r"(src0), "r"(src1));
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use stmatrix without CUTE_ARCH_STSM_SM90_ENABLED.");
-#endif
-  }
-};
-
-struct SM90_U16x8_STSM_T
-{
-  using SRegisters = uint32_t[4];
-  using DRegisters = uint128_t[1];
-
-  CUTE_HOST_DEVICE static void
-  copy(uint32_t const& src0, uint32_t const& src1, uint32_t const& src2, uint32_t const& src3,
-       uint128_t& smem_dst)
-  {
-#if defined(CUTE_ARCH_STSM_SM90_ENABLED)
-    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst);
-    asm volatile ("stmatrix.sync.aligned.x4.trans.m8n8.shared.b16 [%0], {%1, %2, %3, %4};\n"
-        :: "r"(smem_int_ptr),
-          "r"(src0), "r"(src1), "r"(src2), "r"(src3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use stmatrix without CUTE_ARCH_STSM_SM90_ENABLED.");
-#endif
-  }
-};
-
-//
-// Legacy STSM interfaces that aren't very useful
-//
-
-template <class T>
-CUTE_HOST_DEVICE
-void
-copy_stsm(T const* const rmem_ptr,
-          uint128_t* const smem_ptr)
-{
-  uint32_t const* reg_ptr = reinterpret_cast<uint32_t const*>(rmem_ptr);
-
-  // if constexpr
-  if (sizeof(T) == 4) {
-    SM90_U32x1_STSM_N::copy(reg_ptr[0], smem_ptr[0]);
-  }
-  else if (sizeof(T) == 8) {
-    SM90_U32x2_STSM_N::copy(reg_ptr[0], reg_ptr[1], smem_ptr[0]);
-  }
-  else if (sizeof(T) == 16) {
-    SM90_U32x4_STSM_N::copy(reg_ptr[0], reg_ptr[1], reg_ptr[2], reg_ptr[3], smem_ptr[0]);
-  }
-  else {
-    static_assert(sizeof(T) == 4 || sizeof(T) == 8 || sizeof(T) == 16, "sizeof(T) is not supported");
-  }
-}
-
-template <class T>
-CUTE_HOST_DEVICE
-void
-copy_stsm_trans(T const* const rmem_ptr,
-                uint128_t* const smem_ptr)
-{
-  uint32_t const* reg_ptr = reinterpret_cast<uint32_t const*>(rmem_ptr);
-
-  // if constexpr
-  if (sizeof(T) == 4) {
-    SM90_U16x2_STSM_T::copy(reg_ptr[0], smem_ptr[0]);
-  }
-  else if (sizeof(T) == 8) {
-    SM90_U16x4_STSM_T::copy(reg_ptr[0], reg_ptr[1], smem_ptr[0]);
-  }
-  else if (sizeof(T) == 16) {
-    SM90_U16x8_STSM_T::copy(reg_ptr[0], reg_ptr[1], reg_ptr[2], reg_ptr[3], smem_ptr[0]);
-  }
-  else {
-    static_assert(sizeof(T) == 4 || sizeof(T) == 8 || sizeof(T) == 16, "sizeof(T) is not supported");
-  }
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // end namespace cute
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-#include <cute/arch/copy_sm90_desc.hpp>
-#include <cute/arch/copy_sm90_tma.hpp>
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cute/arch/copy_sm90_desc.hpp b/lightllm-kernel/cutlass/include/cute/arch/copy_sm90_desc.hpp
deleted file mode 100755
index cc0bf4a39..000000000
--- a/lightllm-kernel/cutlass/include/cute/arch/copy_sm90_desc.hpp
+++ /dev/null
@@ -1,440 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/numeric_types.h"
-
-#if !defined(__CUDACC_RTC__)
-#include <cuda.h>
-#include <cinttypes>
-#endif
-
-#include <cute/config.hpp>
-
-#include <cute/arch/util.hpp>   // cute::cast_smem_ptr_to_uint
-#include <cute/arch/config.hpp> // CUTE_ARCH_TMA_SMxx_ENABLED
-#include <cute/arch/copy.hpp>
-#include <cute/arch/copy_sm90.hpp>
-
-#include <cute/container/alignment.hpp>
-#include <cute/container/bit_field.hpp>
-#include <cute/container/array.hpp>
-#include <cute/numeric/numeric_types.hpp>
-
-namespace cute
-{
-
-//////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Barriers are 64-bit of user-managed information used in broadly two types syncronization patterns
-/// 1) arrive/wait on threads (usage: cp.async and warp-specialized kernels)
-/// 2) transaction-based (usage: TMA transaction where a CTA issues one transaction)
-//////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Initialize barrier present in shared memory
-CUTE_HOST_DEVICE
-void
-initialize_barrier(uint64_t& smem_barrier,                 // 64 bits user-manged barrier in smem
-                   int thread_count = 1)                   // Thread count expected to arrive/wait on this barrier
-{
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-  uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_barrier);
-  asm volatile ("mbarrier.init.shared::cta.b64 [%0], %1;\n"
-    :: "r"(smem_int_ptr),
-       "r"(thread_count));
-#endif
-}
-
-// Set the number of bytes transfered per transaction and perform an arrive operation as well
-CUTE_HOST_DEVICE
-void
-set_barrier_transaction_bytes(uint64_t& smem_barrier,      // 64 bits user-manged barrier in smem
-                              uint32_t bytes)              // Number of bytes transfered by per TMA transaction
-{
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-  uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_barrier);
-  asm volatile ("mbarrier.arrive.expect_tx.shared::cta.b64 _, [%0], %1;\n"
-    :: "r"(smem_int_ptr),
-       "r"(bytes));
-#endif
-}
-
-// Barrier wait
-CUTE_HOST_DEVICE
-void
-wait_barrier(uint64_t& smem_barrier,                       // 64 bits user-manged barrier in smem
-             int phase_bit)                                // Current phase bit the barrier waiting to flip
-{
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-  uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_barrier);
-  asm volatile(
-    "{\n"
-    ".reg .pred                P1;\n"
-    "LAB_WAIT:\n"
-    "mbarrier.try_wait.parity.shared::cta.b64 P1, [%0], %1;\n"
-    "@P1                       bra DONE;\n"
-    "bra                   LAB_WAIT;\n"
-    "DONE:\n"
-    "}\n"
-    :: "r"(smem_int_ptr),
-       "r"(phase_bit));
-
-#endif
-}
-
-// Barrier arrive
-CUTE_HOST_DEVICE
-void
-arrive_barrier(uint64_t& smem_barrier)                      // 64 bits user-manged barrier in smem
-{
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-  uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_barrier);
-  asm volatile(
-    "{\n"
-    ".reg .b64 state; \n"
-    "mbarrier.arrive.shared::cta.b64   state, [%0];\n"
-    "}\n"
-    :: "r"(smem_int_ptr));
-#endif
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// TMA Descriptor and utilities
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace TMA {
-
-enum class SmemSwizzleBits : uint8_t {
-  DISABLE = 0,
-  B32 = 1,
-  B64 = 2,
-  B128 = 3,
-};
-
-enum class SmemSwizzleBase : uint8_t {
-  SWIZZLE_BASE_16B         = 0,
-};
-
-enum class OOBFill : uint8_t {
-  ZERO = 0,
-  CONSTANT = 1,
-};
-
-CUTE_HOST_DEVICE char const* to_string(OOBFill const& t) {
-  switch (t) {
-    case OOBFill::ZERO:     return "ZERO";
-    case OOBFill::CONSTANT: return "CONSTANT";
-  }
-  return nullptr;
-}
-
-enum class L2Promotion : uint8_t {
-  DISABLE = 0,
-  B64 = 1,
-  B128 = 2,
-  B256 = 3,
-};
-
-CUTE_HOST_DEVICE char const* to_string(L2Promotion const& t) {
-  switch (t) {
-    case L2Promotion::DISABLE: return "DISABLE";
-    case L2Promotion::B64:     return "B64";
-    case L2Promotion::B128:    return "B128";
-    case L2Promotion::B256:    return "B256";
-  }
-  return nullptr;
-}
-
-// Aux parameters which are independent with the problem size
-struct DescriptorAuxParams {
-  OOBFill     oobfill_     = OOBFill::ZERO;
-  L2Promotion l2promo_     = L2Promotion::DISABLE;
-};
-
-enum class CacheHintSm90 : uint64_t {
-  EVICT_NORMAL = 0x1000000000000000,
-  EVICT_FIRST = 0x12F0000000000000,
-  EVICT_LAST = 0x14F0000000000000,
-};
-
-#if (__CUDACC_VER_MAJOR__ >= 12)
-
-#if !defined(__CUDACC_RTC__)
-/// @return The TMA descriptor datatype enum corresponding to T.
-template <class T>
-inline CUtensorMapDataType
-to_CUtensorMapDataType() {
-  if constexpr (is_same_v<T,       int8_t>) { return CU_TENSOR_MAP_DATA_TYPE_UINT8;    } else
-  if constexpr (is_same_v<T,      uint8_t>) { return CU_TENSOR_MAP_DATA_TYPE_UINT8;    } else
-  if constexpr (is_same_v<T, float_e4m3_t>) { return CU_TENSOR_MAP_DATA_TYPE_UINT8;    } else
-  if constexpr (is_same_v<T, float_e5m2_t>) { return CU_TENSOR_MAP_DATA_TYPE_UINT8;    } else
-  if constexpr (is_same_v<T,     uint16_t>) { return CU_TENSOR_MAP_DATA_TYPE_UINT16;   } else
-  if constexpr (is_same_v<T,     uint32_t>) { return CU_TENSOR_MAP_DATA_TYPE_UINT32;   } else
-  if constexpr (is_same_v<T,     uint64_t>) { return CU_TENSOR_MAP_DATA_TYPE_UINT64;   } else
-  if constexpr (is_same_v<T,      int32_t>) { return CU_TENSOR_MAP_DATA_TYPE_INT32;    } else
-  if constexpr (is_same_v<T,      int64_t>) { return CU_TENSOR_MAP_DATA_TYPE_INT64;    } else
-  if constexpr (is_same_v<T,       half_t>) { return CU_TENSOR_MAP_DATA_TYPE_FLOAT16;  } else
-  if constexpr (is_same_v<T,        float>) { return CU_TENSOR_MAP_DATA_TYPE_FLOAT32;  } else
-  if constexpr (is_same_v<T,       double>) { return CU_TENSOR_MAP_DATA_TYPE_FLOAT64;  } else
-  if constexpr (is_same_v<T,   bfloat16_t>) { return CU_TENSOR_MAP_DATA_TYPE_BFLOAT16; } else
-  if constexpr (is_same_v<T,   tfloat32_t>) { return CU_TENSOR_MAP_DATA_TYPE_TFLOAT32; } else
-  { static_assert(sizeof(T) < 0, "Unknown TMA Format!"); }
-}
-
-inline CUtensorMapSwizzle
-to_CUtensorMapSwizzle(SmemSwizzleBits const& t, SmemSwizzleBase const& b) {
-  switch (t) {
-    default: assert(false && "Unsupported pair of SmemSwizzleBits and SmemSwizzleBase!");
-    case SmemSwizzleBits::DISABLE: 
-      assert((b == SmemSwizzleBase::SWIZZLE_BASE_16B) && "Expected 16B swizzle base for 0B swizzle bits.");
-      return CU_TENSOR_MAP_SWIZZLE_NONE;
-    case SmemSwizzleBits::B32:
-      assert((b == SmemSwizzleBase::SWIZZLE_BASE_16B) && "Expected 16B swizzle base for 32B swizzle bits.");
-      return CU_TENSOR_MAP_SWIZZLE_32B;
-    case SmemSwizzleBits::B64:
-      assert((b == SmemSwizzleBase::SWIZZLE_BASE_16B) && "Expected 16B swizzle base for 64B swizzle bits.");
-      return CU_TENSOR_MAP_SWIZZLE_64B;
-    case SmemSwizzleBits::B128:
-      assert((b == SmemSwizzleBase::SWIZZLE_BASE_16B) && "Expected 16B swizzle base for 128B swizzle bits.");
-      return CU_TENSOR_MAP_SWIZZLE_128B;
-  }
-}
-
-inline CUtensorMapFloatOOBfill
-to_CUtensorMapFloatOOBfill(OOBFill const& t) {
-  switch(t) {
-    default:                assert(false && "Unknown OOBFill!");
-    case OOBFill::ZERO:     return CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE;
-    case OOBFill::CONSTANT: return CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA;
-  }
-}
-
-inline CUtensorMapL2promotion
-to_CUtensorMapL2promotion(L2Promotion const& t) {
-  switch(t) {
-    default: assert(false && "Unknown L2Promotion!");
-    case L2Promotion::DISABLE: return CU_TENSOR_MAP_L2_PROMOTION_NONE;
-    case L2Promotion::B64:     return CU_TENSOR_MAP_L2_PROMOTION_L2_64B;
-    case L2Promotion::B128:    return CU_TENSOR_MAP_L2_PROMOTION_L2_128B;
-    case L2Promotion::B256:    return CU_TENSOR_MAP_L2_PROMOTION_L2_256B;
-  }
-}
-
-#endif // !defined(__CUDACC_RTC__)
-
-#endif // (__CUDACC_VER_MAJOR__ >= 12)
-
-} // end namespace TMA
-
-#if (__CUDACC_VER_MAJOR__ >= 12) && !defined(__CUDACC_RTC__)
-  using TmaDescriptor = CUtensorMap;
-  using Im2ColTmaDescriptor = CUtensorMap;
-#else
-  using TmaDescriptor = struct alignas(64) { char bytes[128]; };
-  using Im2ColTmaDescriptor = struct alignas(64) { char bytes[128]; };
-#endif
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Initiates a TensorMap Prefetch
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-CUTE_HOST_DEVICE
-void
-prefetch_tma_descriptor(TmaDescriptor const* desc_ptr)
-{
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-  uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-  // Prefetch TMA Descriptor using generic addressing (i.e. no specific state space: const or param)
-  asm volatile (
-    "prefetch.tensormap [%0];"
-    :
-    : "l"(gmem_int_desc)
-    : "memory");
-#else
-  CUTE_INVALID_CONTROL_PATH("Trying to use TMA Descriptor Prefetch without CUTE_ARCH_TMA_SM90_ENABLED.");
-#endif
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Perform a TensorMap modification (by each field)
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Replace tensor pointer directly in GMEM
-CUTE_HOST_DEVICE
-void
-tma_descriptor_replace_addr_in_global_mem(TmaDescriptor const* desc_ptr,
-                                          void const* const new_tensor_ptr)
-{
-#if defined(CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED)
-  uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-  uint64_t const new_desc_addr = reinterpret_cast<uint64_t>(new_tensor_ptr);
-  asm volatile (
-    "tensormap.replace.tile.global_address.global.b1024.b64 [%0], %1;"
-    :: "l"(gmem_int_desc), "l"(new_desc_addr));
-#else
-  CUTE_INVALID_CONTROL_PATH("Using TMA Descriptor modification without CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED and CUDA 12.3");
-#endif
-}
-
-// Replace tensor pointer by bringing the tensormap from GMEM into the shared memory
-CUTE_HOST_DEVICE
-void
-tma_descriptor_replace_addr_in_shared_mem(TmaDescriptor& smem_desc,
-                                          void const* const new_tensor_ptr)
-{
-#if defined(CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED)
-  uint32_t smem_int_desc = cast_smem_ptr_to_uint(&smem_desc);
-  uint64_t const new_desc_addr = reinterpret_cast<uint64_t>(new_tensor_ptr);
-  asm volatile (
-    "tensormap.replace.tile.global_address.shared::cta.b1024.b64 [%0], %1;"
-    :: "r"(smem_int_desc), "l"(new_desc_addr));
-#else
-  CUTE_INVALID_CONTROL_PATH("Using TMA Descriptor modification without CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED and CUDA 12.3");
-#endif
-}
-
-// Replace tensor dims and strides for GEMMs by bringing the tensormap from GMEM into the shared memory
-CUTE_HOST_DEVICE
-void
-tma_descriptor_replace_dims_strides_in_shared_mem(TmaDescriptor                 & smem_desc,
-                                                  cute::array<uint32_t, 5> const& prob_shape,
-                                                  cute::array<uint64_t, 5> const& prob_stride)
-{
-#if defined(CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED)
-  uint32_t smem_int_desc = cast_smem_ptr_to_uint(&smem_desc);
-  uint64_t const smem_int64_desc = 0;
-  asm volatile (
-    "cvt.u64.u32 %0, %1;"
-    :: "l"(smem_int64_desc), "r"(smem_int_desc));
-  asm volatile (
-    "tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [%0], 0, %1;"
-    :: "l"(smem_int64_desc), "r"(prob_shape[0]));
-  asm volatile (
-    "tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [%0], 1, %1;"
-    :: "l"(smem_int64_desc), "r"(prob_shape[1]));
-  asm volatile (
-    "tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [%0], 2, %1;"
-    :: "l"(smem_int64_desc), "r"(prob_shape[2]));
-  asm volatile (
-    "tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [%0], 3, %1;"
-    :: "l"(smem_int64_desc), "r"(prob_shape[3]));
-  asm volatile (
-    "tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [%0], 4, %1;"
-    :: "l"(smem_int64_desc), "r"(prob_shape[4]));
-  // Strides must be a multiple of 16. Also, stride for the intermost dimension is implicitly 1
-  #if ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 5)))
-  asm volatile (
-    "tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], 0, %1;"
-    :: "l"(smem_int64_desc), "l"(prob_stride[1]));
-  asm volatile (
-    "tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], 1, %1;"
-    :: "l"(smem_int64_desc), "l"(prob_stride[2]));
-  asm volatile (
-    "tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], 2, %1;"
-    :: "l"(smem_int64_desc), "l"(prob_stride[3]));
-  asm volatile (
-    "tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], 3, %1;"
-    :: "l"(smem_int64_desc), "l"(prob_stride[4]));
-  #else
-  // 4 LSBs are not included
-  asm volatile (
-    "tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], 0, %1;"
-    :: "l"(smem_int64_desc), "l"(prob_stride[1] >> 4));
-  asm volatile (
-    "tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], 1, %1;"
-    :: "l"(smem_int64_desc), "l"(prob_stride[2] >> 4));
-  asm volatile (
-    "tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], 2, %1;"
-    :: "l"(smem_int64_desc), "l"(prob_stride[3] >> 4));
-  asm volatile (
-    "tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], 3, %1;"
-    :: "l"(smem_int64_desc), "l"(prob_stride[4] >> 4));
-  #endif
-#else
-  CUTE_INVALID_CONTROL_PATH("Using TMA Descriptor modification without CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED and CUDA 12.3");
-#endif
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Perform a fused copy and fence operation (needed when modifying tensormap in shared memory)
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-CUTE_HOST_DEVICE
-void
-tma_descriptor_cp_fence_release(TmaDescriptor const* gmem_desc_ptr, TmaDescriptor& smem_desc)
-{
-#if defined(CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED)
-  uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(gmem_desc_ptr);
-  uint32_t smem_int_desc = cast_smem_ptr_to_uint(&smem_desc);
-  asm volatile (
-    "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.gpu.sync.aligned [%0], [%1], 128;"
-    :: "l"(gmem_int_desc), "r"(smem_int_desc));
-#else
-  CUTE_INVALID_CONTROL_PATH("Using TMA Descriptor modification without CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED and CUDA 12.3");
-#endif
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Perform a release fence operation (needed when modifying tensormap directly in GMEM)
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-CUTE_HOST_DEVICE
-void
-tma_descriptor_fence_release()
-{
-#if defined(CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED)
-  asm volatile ("fence.proxy.tensormap::generic.release.gpu;");
-#else
-  CUTE_INVALID_CONTROL_PATH("Using TMA Descriptor modification without CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED and CUDA 12.3");
-#endif
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Perform a acquire fence operation
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-CUTE_HOST_DEVICE
-void
-tma_descriptor_fence_acquire(TmaDescriptor const* desc_ptr)
-{
-#if defined(CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED)
-  uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-  asm volatile (
-    "fence.proxy.tensormap::generic.acquire.gpu [%0], 128;"
-    :
-    : "l"(gmem_int_desc)
-    : "memory");
-#else
-  CUTE_INVALID_CONTROL_PATH("Using TMA Descriptor modification without CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED and CUDA 12.3");
-#endif
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/arch/copy_sm90_tma.hpp b/lightllm-kernel/cutlass/include/cute/arch/copy_sm90_tma.hpp
deleted file mode 100755
index fb33d63ca..000000000
--- a/lightllm-kernel/cutlass/include/cute/arch/copy_sm90_tma.hpp
+++ /dev/null
@@ -1,1395 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>
-
-#include <cute/arch/config.hpp> // CUTE_ARCH_TMA_SMxx_ENABLED
-#include <cute/arch/copy.hpp>
-#include <cute/arch/copy_sm90.hpp>
-#include "cutlass/arch/synclog.hpp"
-
-namespace cute
-{
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// TMA_LOAD : Initiates a TMA copy from global memory to shared memory
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-struct SM90_TMA_LOAD_1D
-{
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint64_t cache_hint,
-       void      * smem_ptr,
-       int32_t const& crd0)
-  {
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr);
-    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
-    cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr);
-    asm volatile (
-      "cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint"
-      " [%0], [%1, {%3}], [%2], %4;"
-      :
-      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
-        "r"(crd0), "l"(cache_hint)
-      : "memory");
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-#endif
-  }
-
-  struct PREFETCH
-  {
-    CUTE_HOST_DEVICE static void
-    copy(void const* desc_ptr,
-         int32_t const& crd0)
-    {
-  #if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-      uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-      asm volatile (
-        "cp.async.bulk.prefetch.tensor.1d.L2.global"
-        " [%0, {%1}];"
-        :
-        : "l"(gmem_int_desc),
-          "r"(crd0)
-        : "memory");
-  #else
-      CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-  #endif
-    }
-  };
-};
-
-struct SM90_TMA_LOAD_2D
-{
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint64_t cache_hint,
-       void      * smem_ptr,
-       int32_t const& crd0, int32_t const& crd1)
-  {
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr);
-    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
-    cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr);
-    asm volatile (
-      "cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint"
-      " [%0], [%1, {%3, %4}], [%2], %5;"
-      :
-      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
-        "r"(crd0), "r"(crd1), "l"(cache_hint)
-      : "memory");
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-#endif
-  }
-
-  struct PREFETCH
-  {
-    CUTE_HOST_DEVICE static void
-    copy(void const* desc_ptr,
-         int32_t const& crd0, int32_t const& crd1)
-    {
-  #if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-      uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-      asm volatile (
-        "cp.async.bulk.prefetch.tensor.2d.L2.global"
-        " [%0, {%1, %2}];"
-        :
-        : "l"(gmem_int_desc),
-          "r"(crd0), "r"(crd1)
-        : "memory");
-  #else
-      CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-  #endif
-    }
-  };
-};
-
-struct SM90_TMA_LOAD_3D
-{
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint64_t cache_hint,
-       void      * smem_ptr,
-       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2)
-  {
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr);
-    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
-    cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr);
-    asm volatile (
-      "cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint"
-      " [%0], [%1, {%3, %4, %5}], [%2], %6;"
-      :
-      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
-        "r"(crd0), "r"(crd1), "r"(crd2), "l"(cache_hint)
-      : "memory");
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-#endif
-  }
-
-  struct PREFETCH
-  {
-    CUTE_HOST_DEVICE static void
-    copy(void const* desc_ptr,
-         int32_t const& crd0, int32_t const& crd1, int32_t const& crd2)
-    {
-  #if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-      uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-      asm volatile (
-        "cp.async.bulk.prefetch.tensor.3d.L2.global"
-        " [%0, {%1, %2, %3}];"
-        :
-        : "l"(gmem_int_desc),
-          "r"(crd0), "r"(crd1), "r"(crd2)
-        : "memory");
-  #else
-      CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-  #endif
-    }
-  };
-};
-
-struct SM90_TMA_LOAD_4D
-{
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint64_t cache_hint,
-       void      * smem_ptr,
-       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3)
-  {
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr);
-    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
-    cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr);
-    asm volatile (
-      "cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint"
-      " [%0], [%1, {%3, %4, %5, %6}], [%2], %7;"
-      :
-      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
-        "r"(crd0), "r"(crd1), "r"(crd2), "r"(crd3), "l"(cache_hint)
-      : "memory");
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-#endif
-  }
-
-  struct PREFETCH
-  {
-    CUTE_HOST_DEVICE static void
-    copy(void const* desc_ptr,
-         int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3)
-    {
-  #if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-      uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-      asm volatile (
-        "cp.async.bulk.prefetch.tensor.4d.L2.global"
-        " [%0, {%1, %2, %3, %4}];"
-        :
-        : "l"(gmem_int_desc),
-          "r"(crd0), "r"(crd1), "r"(crd2), "r"(crd3)
-        : "memory");
-  #else
-      CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-  #endif
-    }
-  };
-};
-
-struct SM90_TMA_LOAD_5D
-{
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint64_t cache_hint,
-       void      * smem_ptr,
-       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3, int32_t const& crd4)
-  {
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr);
-    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
-    cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr);
-    asm volatile (
-      "cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint"
-      " [%0], [%1, {%3, %4, %5, %6, %7}], [%2], %8;"
-      :
-      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
-        "r"(crd0), "r"(crd1), "r"(crd2), "r"(crd3), "r"(crd4), "l"(cache_hint)
-      : "memory");
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-#endif
-  }
-
-  struct PREFETCH
-  {
-    CUTE_HOST_DEVICE static void
-    copy(void const* desc_ptr,
-         int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3, int32_t const& crd4)
-    {
-  #if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-      uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-      asm volatile (
-        "cp.async.bulk.prefetch.tensor.5d.L2.global"
-        " [%0, {%1, %2, %3, %4, %5}];"
-        :
-        : "l"(gmem_int_desc),
-          "r"(crd0), "r"(crd1), "r"(crd2), "r"(crd3), "r"(crd4)
-        : "memory");
-  #else
-      CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-  #endif
-    }
-  };
-};
-
-struct SM90_TMA_LOAD
-{
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint64_t cache_hint,
-       void      * smem_ptr,
-       int32_t const& crd0)
-  {
-    return SM90_TMA_LOAD_1D::copy(desc_ptr, mbar_ptr, cache_hint, smem_ptr, crd0);
-  }
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint64_t cache_hint,
-       void      * smem_ptr,
-       int32_t const& crd0, int32_t const& crd1)
-  {
-    return SM90_TMA_LOAD_2D::copy(desc_ptr, mbar_ptr, cache_hint, smem_ptr, crd0, crd1);
-  }
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint64_t cache_hint,
-       void      * smem_ptr,
-       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2)
-  {
-    return SM90_TMA_LOAD_3D::copy(desc_ptr, mbar_ptr, cache_hint, smem_ptr, crd0, crd1, crd2);
-  }
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint64_t cache_hint,
-       void      * smem_ptr,
-       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3)
-  {
-    return SM90_TMA_LOAD_4D::copy(desc_ptr, mbar_ptr, cache_hint, smem_ptr, crd0, crd1, crd2, crd3);
-  }
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint64_t cache_hint,
-       void      * smem_ptr,
-       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3, int32_t const& crd4)
-  {
-    return SM90_TMA_LOAD_5D::copy(desc_ptr, mbar_ptr, cache_hint, smem_ptr, crd0, crd1, crd2, crd3, crd4);
-  }
-
-  struct PREFETCH
-  {
-    CUTE_HOST_DEVICE static void
-    copy(void const* desc_ptr,
-         int32_t const& crd0)
-    {
-      return SM90_TMA_LOAD_1D::PREFETCH::copy(desc_ptr, crd0);
-    }
-    CUTE_HOST_DEVICE static void
-    copy(void const* desc_ptr,
-         int32_t const& crd0, int32_t const& crd1)
-    {
-      return SM90_TMA_LOAD_2D::PREFETCH::copy(desc_ptr, crd0, crd1);
-    }
-    CUTE_HOST_DEVICE static void
-    copy(void const* desc_ptr,
-         int32_t const& crd0, int32_t const& crd1, int32_t const& crd2)
-    {
-      return SM90_TMA_LOAD_3D::PREFETCH::copy(desc_ptr, crd0, crd1, crd2);
-    }
-    CUTE_HOST_DEVICE static void
-    copy(void const* desc_ptr,
-         int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3)
-    {
-      return SM90_TMA_LOAD_4D::PREFETCH::copy(desc_ptr, crd0, crd1, crd2, crd3);
-    }
-    CUTE_HOST_DEVICE static void
-    copy(void const* desc_ptr,
-         int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3, int32_t const& crd4)
-    {
-      return SM90_TMA_LOAD_5D::PREFETCH::copy(desc_ptr, crd0, crd1, crd2, crd3, crd4);
-    }
-  };
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// TMA_LOAD im2col: Initiates a TMA copy, in im2col mode, from global memory to shared memory
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-struct SM90_TMA_LOAD_IM2COL_3D
-{
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr, uint64_t* mbar_ptr,
-       void      * smem_ptr,
-       int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_n,
-       uint16_t const& offset_w)
-  {
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr);
-    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
-    cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr);
-    // Copy from global to shared::cluster.
-    asm volatile (
-      "cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes"
-      " [%0], [%1, {%3, %4, %5}], [%2], {%6};"
-      :
-      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
-        "r"(coord_c), "r"(coord_w), "r"(coord_n),
-        "h"(offset_w)
-      : "memory");
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-#endif
-  }
-
-  struct PREFETCH
-  {
-    CUTE_HOST_DEVICE static void
-    copy(void const* desc_ptr,
-         int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_n,
-         uint16_t const& offset_w)
-    {
-  #if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-      uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-      asm volatile (
-        "cp.async.bulk.prefetch.tensor.3d.L2.global.im2col"
-        " [%0, {%1, %2, %3}], {%4};"
-        :
-        : "l"(gmem_int_desc),
-          "r"(coord_c), "r"(coord_w), "r"(coord_n),
-          "h"(offset_w)
-        : "memory");
-  #else
-      CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-  #endif
-    }
-  };
-};
-
-struct SM90_TMA_LOAD_IM2COL_4D
-{
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr, uint64_t* mbar_ptr,
-       void      * smem_ptr,
-       int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_n,
-       uint16_t const& offset_w, uint16_t const& offset_h)
-  {
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr);
-    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
-    cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr);
-    // Copy from global to shared::cluster.
-    asm volatile (
-      "cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes"
-      " [%0], [%1, {%3, %4, %5, %6}], [%2], {%7, %8};"
-      :
-      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
-        "r"(coord_c), "r"(coord_w), "r"(coord_h), "r"(coord_n),
-        "h"(offset_w), "h"(offset_h)
-      : "memory");
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-#endif
-  }
-
-  struct PREFETCH
-  {
-    CUTE_HOST_DEVICE static void
-    copy(void const* desc_ptr,
-         int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_n,
-         uint16_t const& offset_w, uint16_t const& offset_h)
-    {
-  #if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-      uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-      asm volatile (
-        "cp.async.bulk.prefetch.tensor.4d.L2.global.im2col"
-        " [%0, {%1, %2, %3, %4}], {%5, %6};"
-        :
-        : "l"(gmem_int_desc),
-          "r"(coord_c), "r"(coord_w), "r"(coord_h), "r"(coord_n),
-          "h"(offset_w), "h"(offset_h)
-        : "memory");
-  #else
-      CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-  #endif
-    }
-  };
-};
-
-struct SM90_TMA_LOAD_IM2COL_5D
-{
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr, uint64_t* mbar_ptr,
-       void      * smem_ptr,
-       int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_d, int32_t const& coord_n,
-       uint16_t const& offset_w, uint16_t const& offset_h, uint16_t const& offset_d)
-  {
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr);
-    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
-    cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr);
-    // Copy from global to shared::cluster.
-    asm volatile (
-      "cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes"
-      " [%0], [%1, {%3, %4, %5, %6, %7}], [%2], {%8, %9, %10};"
-      :
-      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
-        "r"(coord_c), "r"(coord_w), "r"(coord_h), "r"(coord_d), "r"(coord_n),
-        "h"(offset_w), "h"(offset_h), "h"(offset_d)
-      : "memory");
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-#endif
-  }
-
-  struct PREFETCH
-  {
-    CUTE_HOST_DEVICE static void
-    copy(void const* desc_ptr,
-         int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_d, int32_t const& coord_n,
-         uint16_t const& offset_w, uint16_t const& offset_h, uint16_t const& offset_d)
-    {
-  #if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-      uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-      asm volatile (
-        "cp.async.bulk.prefetch.tensor.5d.L2.global.im2col"
-        " [%0, {%1, %2, %3, %4, %5}], {%6, %7, %8};"
-        :
-        : "l"(gmem_int_desc),
-          "r"(coord_c), "r"(coord_w), "r"(coord_h), "r"(coord_d), "r"(coord_n),
-          "h"(offset_w), "h"(offset_h), "h"(offset_d)
-        : "memory");
-  #else
-      CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-  #endif
-    }
-  };
-};
-
-struct SM90_TMA_LOAD_IM2COL
-{
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr, uint64_t* mbar_ptr,
-       void      * smem_ptr,
-       int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_n,
-       uint16_t const& offset_w)
-  {
-    return SM90_TMA_LOAD_IM2COL_3D::copy(desc_ptr, mbar_ptr, smem_ptr,
-                                         coord_c, coord_w, coord_n,
-                                         offset_w);
-  }
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr, uint64_t* mbar_ptr,
-       void      * smem_ptr,
-       int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_n,
-       uint16_t const& offset_w, uint16_t const& offset_h)
-  {
-    return SM90_TMA_LOAD_IM2COL_4D::copy(desc_ptr, mbar_ptr, smem_ptr,
-                                         coord_c, coord_w, coord_h, coord_n,
-                                         offset_w, offset_h);
-  }
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr, uint64_t* mbar_ptr,
-       void      * smem_ptr,
-       int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_d, int32_t const& coord_n,
-       uint16_t const& offset_w, uint16_t const& offset_h, uint16_t const& offset_d)
-  {
-    return SM90_TMA_LOAD_IM2COL_5D::copy(desc_ptr, mbar_ptr, smem_ptr,
-                                         coord_c, coord_w, coord_h, coord_d, coord_n,
-                                         offset_w, offset_h, offset_d);
-  }
-
-  struct PREFETCH
-  {
-    CUTE_HOST_DEVICE static void
-    copy(void const* desc_ptr,
-         int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_n,
-         uint16_t const& offset_w)
-    {
-      return SM90_TMA_LOAD_IM2COL_3D::PREFETCH::copy(desc_ptr,
-                                                     coord_c, coord_w, coord_n,
-                                                     offset_w);
-    }
-    CUTE_HOST_DEVICE static void
-    copy(void const* desc_ptr,
-         int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_n,
-         uint16_t const& offset_w, uint16_t const& offset_h)
-    {
-      return SM90_TMA_LOAD_IM2COL_4D::PREFETCH::copy(desc_ptr,
-                                                     coord_c, coord_w, coord_h, coord_n,
-                                                     offset_w, offset_h);
-    }
-    CUTE_HOST_DEVICE static void
-    copy(void const* desc_ptr,
-         int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_d, int32_t const& coord_n,
-         uint16_t const& offset_w, uint16_t const& offset_h, uint16_t const& offset_d)
-    {
-      return SM90_TMA_LOAD_IM2COL_5D::PREFETCH::copy(desc_ptr,
-                                                     coord_c, coord_w, coord_h, coord_d, coord_n,
-                                                     offset_w, offset_h, offset_d);
-    }
-  };
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// TMA_LOAD_MULTICAST: Initiates a TMA copy from global memory to shared memory
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-struct SM90_TMA_LOAD_MULTICAST_1D
-{
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask, uint64_t cache_hint,
-       void      * smem_ptr,
-       int32_t const& crd0)
-  {
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr);
-    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
-    cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr);
-    asm volatile (
-      "cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint"
-      " [%0], [%1, {%4}], [%2], %3, %5;"
-      :
-      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
-        "h"(multicast_mask),
-        "r"(crd0), "l"(cache_hint)
-      : "memory");
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-#endif
-  }
-};
-
-struct SM90_TMA_LOAD_MULTICAST_2D
-{
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask, uint64_t cache_hint,
-       void      * smem_ptr,
-       int32_t const& crd0, int32_t const& crd1)
-  {
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr);
-    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
-    cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr);
-    asm volatile (
-      "cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint"
-      " [%0], [%1, {%4, %5}], [%2], %3, %6;"
-      :
-      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
-        "h"(multicast_mask),
-        "r"(crd0), "r"(crd1), "l"(cache_hint)
-      : "memory");
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-#endif
-  }
-};
-
-struct SM90_TMA_LOAD_MULTICAST_3D
-{
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask, uint64_t cache_hint,
-       void      * smem_ptr,
-       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2)
-  {
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr);
-    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
-    cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr);
-    asm volatile (
-      "cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint"
-      " [%0], [%1, {%4, %5, %6}], [%2], %3, %7;"
-      :
-      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
-        "h"(multicast_mask),
-        "r"(crd0), "r"(crd1), "r"(crd2), "l"(cache_hint)
-      : "memory");
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-#endif
-  }
-};
-
-struct SM90_TMA_LOAD_MULTICAST_4D
-{
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask, uint64_t cache_hint,
-       void      * smem_ptr,
-       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3)
-  {
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr);
-    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
-    cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr);
-    asm volatile (
-      "cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint"
-      " [%0], [%1, {%4, %5, %6, %7}], [%2], %3, %8;"
-      :
-      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
-        "h"(multicast_mask),
-        "r"(crd0), "r"(crd1), "r"(crd2),  "r"(crd3), "l"(cache_hint)
-      : "memory");
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-#endif
-  }
-};
-
-struct SM90_TMA_LOAD_MULTICAST_5D
-{
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask, uint64_t cache_hint,
-       void      * smem_ptr,
-       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3, int32_t const& crd4)
-  {
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr);
-    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
-    cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr);
-    asm volatile (
-      "cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint"
-      " [%0], [%1, {%4, %5, %6, %7, %8}], [%2], %3, %9;"
-      :
-      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
-        "h"(multicast_mask),
-        "r"(crd0), "r"(crd1), "r"(crd2), "r"(crd3), "r"(crd4), "l"(cache_hint)
-      : "memory");
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-#endif
-  }
-};
-
-struct SM90_TMA_LOAD_MULTICAST
-{
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask, uint64_t cache_hint,
-       void      * smem_ptr,
-       int32_t const& crd0)
-  {
-    return SM90_TMA_LOAD_MULTICAST_1D::copy(desc_ptr, mbar_ptr, multicast_mask, cache_hint, smem_ptr, crd0);
-  }
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask, uint64_t cache_hint,
-       void      * smem_ptr,
-       int32_t const& crd0, int32_t const& crd1)
-  {
-    return SM90_TMA_LOAD_MULTICAST_2D::copy(desc_ptr, mbar_ptr, multicast_mask, cache_hint, smem_ptr, crd0, crd1);
-  }
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask, uint64_t cache_hint,
-       void      * smem_ptr,
-       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2)
-  {
-    return SM90_TMA_LOAD_MULTICAST_3D::copy(desc_ptr, mbar_ptr, multicast_mask, cache_hint, smem_ptr, crd0, crd1, crd2);
-  }
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask, uint64_t cache_hint,
-       void      * smem_ptr,
-       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3)
-  {
-    return SM90_TMA_LOAD_MULTICAST_4D::copy(desc_ptr, mbar_ptr, multicast_mask, cache_hint, smem_ptr, crd0, crd1, crd2, crd3);
-  }
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask, uint64_t cache_hint,
-       void      * smem_ptr,
-       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3, int32_t const& crd4)
-  {
-    return SM90_TMA_LOAD_MULTICAST_5D::copy(desc_ptr, mbar_ptr, multicast_mask, cache_hint, smem_ptr, crd0, crd1, crd2, crd3, crd4);
-  }
-
-  using PREFETCH = typename SM90_TMA_LOAD::PREFETCH;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// TMA_LOAD_MULTICAST im2col: Initiates a TMA copy, in im2col mode, from global memory to shared memory
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-struct SM90_TMA_LOAD_IM2COL_MULTICAST_3D
-{
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask,
-       void      * smem_ptr,
-       int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_n,
-       uint16_t const& offset_w)
-  {
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr);
-    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
-    cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr);
-    // Copy from global to shared::cluster.
-    asm volatile (
-      "cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster"
-      " [%0], [%1, {%3, %4, %5}], [%2], {%6}, %7;"
-      :
-      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
-        "r"(coord_c), "r"(coord_w), "r"(coord_n),
-        "h"(offset_w),
-        "h"(multicast_mask)
-      : "memory");
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-#endif
-  }
-};
-
-struct SM90_TMA_LOAD_IM2COL_MULTICAST_4D
-{
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask,
-       void      * smem_ptr,
-       int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_n,
-       uint16_t const& offset_w, uint16_t const& offset_h)
-  {
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr);
-    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
-    cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr);
-    // Copy from global to shared::cluster.
-    asm volatile (
-      "cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster"
-      " [%0], [%1, {%3, %4, %5, %6}], [%2], {%7, %8}, %9;"
-      :
-      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
-        "r"(coord_c), "r"(coord_w), "r"(coord_h), "r"(coord_n),
-        "h"(offset_w), "h"(offset_h),
-        "h"(multicast_mask)
-      : "memory");
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-#endif
-  }
-};
-
-struct SM90_TMA_LOAD_IM2COL_MULTICAST_5D
-{
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask,
-       void      * smem_ptr,
-       int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_d, int32_t const& coord_n,
-       uint16_t const& offset_w, uint16_t const& offset_h, uint16_t const& offset_d)
-  {
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr);
-    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
-    cutlass::arch::synclog_emit_tma_load(__LINE__, gmem_int_desc, smem_int_mbar, smem_int_ptr);
-    // Copy from global to shared::cluster.
-    asm volatile (
-      "cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster"
-      " [%0], [%1, {%3, %4, %5, %6, %7}], [%2], {%8, %9, %10}, %11;"
-      :
-      : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar),
-        "r"(coord_c), "r"(coord_w), "r"(coord_h), "r"(coord_d), "r"(coord_n),
-        "h"(offset_w), "h"(offset_h), "h"(offset_d),
-        "h"(multicast_mask)
-      : "memory");
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-#endif
-  }
-};
-
-struct SM90_TMA_LOAD_IM2COL_MULTICAST
-{
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask,
-       void      * smem_ptr,
-       int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_n,
-       uint16_t const& offset_w)
-  {
-    return SM90_TMA_LOAD_IM2COL_MULTICAST_3D::copy(desc_ptr, mbar_ptr, multicast_mask,
-                                                   smem_ptr,
-                                                   coord_c, coord_w, coord_n,
-                                                   offset_w);
-  }
-
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask,
-       void      * smem_ptr,
-       int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_n,
-       uint16_t const& offset_w, uint16_t const& offset_h)
-  {
-    return SM90_TMA_LOAD_IM2COL_MULTICAST_4D::copy(desc_ptr, mbar_ptr, multicast_mask,
-                                                   smem_ptr,
-                                                   coord_c, coord_w, coord_h, coord_n,
-                                                   offset_w, offset_h);
-  }
-
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr, uint64_t* mbar_ptr, uint16_t multicast_mask,
-       void      * smem_ptr,
-       int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_d, int32_t const& coord_n,
-       uint16_t const& offset_w, uint16_t const& offset_h, uint16_t const& offset_d)
-  {
-    return SM90_TMA_LOAD_IM2COL_MULTICAST_5D::copy(desc_ptr, mbar_ptr, multicast_mask,
-                                                   smem_ptr,
-                                                   coord_c, coord_w, coord_h, coord_d, coord_n,
-                                                   offset_w, offset_h, offset_d);
-  }
-
-  using PREFETCH = typename SM90_TMA_LOAD_IM2COL::PREFETCH;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// TMA_STORE : Initiates a TMA copy from shared memory to global memory
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-struct SM90_TMA_STORE_1D
-{
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr,
-       void const* smem_ptr,
-       int32_t const& crd0)
-  {
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
-    cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr);
-    asm volatile (
-      "cp.async.bulk.tensor.1d.global.shared::cta.bulk_group [%0, {%2}], [%1];"
-      :
-      : "l"(gmem_int_desc), "r"(smem_int_ptr),
-        "r"(crd0)
-      : "memory");
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-#endif
-  }
-};
-
-struct SM90_TMA_STORE_2D
-{
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr,
-       void const* smem_ptr,
-       int32_t const& crd0, int32_t const& crd1)
-  {
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
-    cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr);
-    asm volatile (
-      "cp.async.bulk.tensor.2d.global.shared::cta.bulk_group [%0, {%2, %3}], [%1];"
-      :
-      : "l"(gmem_int_desc), "r"(smem_int_ptr),
-        "r"(crd0), "r"(crd1)
-      : "memory");
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-#endif
-  }
-};
-
-struct SM90_TMA_STORE_3D
-{
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr,
-       void const* smem_ptr,
-       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2)
-  {
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
-    cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr);
-    asm volatile (
-      "cp.async.bulk.tensor.3d.global.shared::cta.bulk_group [%0, {%2, %3, %4}], [%1];"
-      :
-      : "l"(gmem_int_desc), "r"(smem_int_ptr),
-        "r"(crd0), "r"(crd1), "r"(crd2)
-      : "memory");
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-#endif
-  }
-};
-
-struct SM90_TMA_STORE_4D
-{
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr,
-       void const* smem_ptr,
-       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3)
-  {
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
-    cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr);
-    asm volatile (
-      "cp.async.bulk.tensor.4d.global.shared::cta.bulk_group [%0, {%2, %3, %4, %5}], [%1];"
-      :
-      : "l"(gmem_int_desc), "r"(smem_int_ptr),
-        "r"(crd0), "r"(crd1), "r"(crd2), "r"(crd3)
-      : "memory");
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-#endif
-  }
-};
-
-struct SM90_TMA_STORE_5D
-{
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr,
-       void const* smem_ptr,
-       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3, int32_t const& crd4)
-  {
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
-    cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr);
-    asm volatile (
-      "cp.async.bulk.tensor.5d.global.shared::cta.bulk_group [%0, {%2, %3, %4, %5, %6}], [%1];"
-      :
-      : "l"(gmem_int_desc), "r"(smem_int_ptr),
-        "r"(crd0), "r"(crd1), "r"(crd2), "r"(crd3), "r"(crd4)
-      : "memory");
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-#endif
-  }
-};
-
-struct SM90_TMA_STORE
-{
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr,
-       void const* smem_ptr,
-       int32_t const& crd0)
-  {
-    return SM90_TMA_STORE_1D::copy(desc_ptr, smem_ptr, crd0);
-  }
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr,
-       void const* smem_ptr,
-       int32_t const& crd0, int32_t const& crd1)
-  {
-    return SM90_TMA_STORE_2D::copy(desc_ptr, smem_ptr, crd0, crd1);
-  }
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr,
-       void const* smem_ptr,
-       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2)
-  {
-    return SM90_TMA_STORE_3D::copy(desc_ptr, smem_ptr, crd0, crd1, crd2);
-  }
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr,
-       void const* smem_ptr,
-       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3)
-  {
-    return SM90_TMA_STORE_4D::copy(desc_ptr, smem_ptr, crd0, crd1, crd2, crd3);
-  }
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr,
-       void const* smem_ptr,
-       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3, int32_t const& crd4)
-  {
-    return SM90_TMA_STORE_5D::copy(desc_ptr, smem_ptr, crd0, crd1, crd2, crd3, crd4);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// TMA_STORE im2col: Initiates a TMA copy, in im2col mode, from shared memory to global memory
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-struct SM90_TMA_STORE_IM2COL_3D
-{
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr,
-       void const* smem_ptr,
-       int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_n)
-  {
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
-    cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr);
-    asm volatile (
-      "cp.async.bulk.tensor.3d.global.shared::cta.im2col_no_offs.bulk_group"
-      " [%0, {%2, %3, %4}], [%1];"
-      :
-      : "l"(gmem_int_desc), "r"(smem_int_ptr),
-        "r"(coord_c), "r"(coord_w), "r"(coord_n)
-      : "memory");
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-#endif
-  }
-};
-
-struct SM90_TMA_STORE_IM2COL_4D
-{
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr,
-       void const* smem_ptr,
-       int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_n)
-  {
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
-    cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr);
-    asm volatile (
-      "cp.async.bulk.tensor.4d.global.shared::cta.im2col_no_offs.bulk_group"
-      " [%0, {%2, %3, %4, %5}], [%1];"
-      :
-      : "l"(gmem_int_desc), "r"(smem_int_ptr),
-        "r"(coord_c), "r"(coord_w), "r"(coord_h), "r"(coord_n)
-      : "memory");
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-#endif
-  }
-};
-
-struct SM90_TMA_STORE_IM2COL_5D
-{
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr,
-       void const* smem_ptr,
-       int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_d, int32_t const& coord_n)
-  {
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
-    cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr);
-    asm volatile (
-      "cp.async.bulk.tensor.5d.global.shared::cta.im2col_no_offs.bulk_group"
-      " [%0, {%2, %3, %4, %5, %6}], [%1];"
-      :
-      : "l"(gmem_int_desc), "r"(smem_int_ptr),
-        "r"(coord_c), "r"(coord_w), "r"(coord_h), "r"(coord_d), "r"(coord_n)
-      : "memory");
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-#endif
-  }
-};
-
-struct SM90_TMA_STORE_IM2COL
-{
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr,
-       void const* smem_ptr,
-       int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_n)
-  {
-    return SM90_TMA_STORE_IM2COL_3D::copy(desc_ptr, smem_ptr, coord_c, coord_w, coord_n);
-  }
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr,
-       void const* smem_ptr,
-       int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_n)
-  {
-    return SM90_TMA_STORE_IM2COL_4D::copy(desc_ptr, smem_ptr, coord_c, coord_w, coord_h, coord_n);
-  }
-  CUTE_HOST_DEVICE static void
-  copy(void const* desc_ptr,
-       void const* smem_ptr,
-       int32_t const& coord_c, int32_t const& coord_w, int32_t const& coord_h, int32_t const& coord_d, int32_t const& coord_n)
-  {
-    return SM90_TMA_STORE_IM2COL_5D::copy(desc_ptr, smem_ptr, coord_c, coord_w, coord_h, coord_d, coord_n);
-  }
-};
-
-// Fence for smem stores for subsequent TMA_STORE
-CUTE_HOST_DEVICE static void
-tma_store_fence() {
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-    cutlass::arch::synclog_emit_fence_view_async_shared(__LINE__);
-    asm volatile ("fence.proxy.async.shared::cta;");
-#elif defined(__CUDA_ARCH__)
-    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-#endif
-}
-
-// Indicate arrival of warp issuing TMA_STORE
-CUTE_HOST_DEVICE static void
-tma_store_arrive() {
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-    cutlass::arch::synclog_emit_tma_store_arrive(__LINE__);
-    asm volatile("cp.async.bulk.commit_group;");
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-#endif
-}
-
-// Wait until at most Count committed TMA_STOREs are pending and all prior commits are complete
-template <int Count>
-CUTE_HOST_DEVICE static void
-tma_store_wait() {
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-    asm volatile(
-      "cp.async.bulk.wait_group.read %0;"
-      :
-      : "n"(Count)
-      : "memory");
-    cutlass::arch::synclog_emit_tma_store_wait(__LINE__, Count);
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-#endif
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// TMA_REDUCE_ADD : Initiates a TMA reduce-add from shared memory to global memory
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-struct SM90_TMA_REDUCE_ADD_1D
-{
-  CUTE_HOST_DEVICE static void
-  copy(void const* const desc_ptr,
-       void const* const smem_ptr,
-       int32_t const& crd0)
-  {
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
-    cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr);
-    asm volatile (
-      "cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.bulk_group [%0, {%2}], [%1];"
-      :
-      : "l"(gmem_int_desc), "r"(smem_int_ptr),
-        "r"(crd0)
-      : "memory");
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-#endif
-  }
-};
-
-struct SM90_TMA_REDUCE_ADD_2D
-{
-  CUTE_HOST_DEVICE static void
-  copy(void const* const desc_ptr,
-       void const* const smem_ptr,
-       int32_t const& crd0, int32_t const& crd1)
-  {
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
-    cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr);
-    asm volatile (
-      "cp.reduce.async.bulk.tensor.2d.global.shared::cta.add.bulk_group [%0, {%2, %3}], [%1];"
-      :
-      : "l"(gmem_int_desc), "r"(smem_int_ptr),
-        "r"(crd0), "r"(crd1)
-      : "memory");
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-#endif
-  }
-};
-
-struct SM90_TMA_REDUCE_ADD_3D
-{
-  CUTE_HOST_DEVICE static void
-  copy(void const* const desc_ptr,
-       void const* const smem_ptr,
-       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2)
-  {
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
-    cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr);
-    asm volatile (
-      "cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.bulk_group [%0, {%2, %3, %4}], [%1];"
-      :
-      : "l"(gmem_int_desc), "r"(smem_int_ptr),
-        "r"(crd0), "r"(crd1), "r"(crd2)
-      : "memory");
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-#endif
-  }
-};
-
-struct SM90_TMA_REDUCE_ADD_4D
-{
-  CUTE_HOST_DEVICE static void
-  copy(void const* const desc_ptr,
-       void const* const smem_ptr,
-       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3)
-  {
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
-    cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr);
-    asm volatile (
-      "cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.bulk_group [%0, {%2, %3, %4, %5}], [%1];"
-      :
-      : "l"(gmem_int_desc), "r"(smem_int_ptr),
-        "r"(crd0), "r"(crd1), "r"(crd2), "r"(crd3)
-      : "memory");
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-#endif
-  }
-};
-
-struct SM90_TMA_REDUCE_ADD_5D
-{
-  CUTE_HOST_DEVICE static void
-  copy(void const* const desc_ptr,
-       void const* const smem_ptr,
-       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3, int32_t const& crd4)
-  {
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-    uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
-    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
-    cutlass::arch::synclog_emit_tma_store(__LINE__, gmem_int_desc, smem_int_ptr);
-    asm volatile (
-      "cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.bulk_group [%0, {%2, %3, %4, %5, %6}], [%1];"
-      :
-      : "l"(gmem_int_desc), "r"(smem_int_ptr),
-        "r"(crd0), "r"(crd1), "r"(crd2), "r"(crd3), "r"(crd4)
-      : "memory");
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
-#endif
-  }
-};
-
-struct SM90_TMA_REDUCE_ADD
-{
-  CUTE_HOST_DEVICE static void
-  copy(void const* const desc_ptr,
-       void const* const smem_ptr,
-       int32_t const& crd0)
-  {
-    return SM90_TMA_REDUCE_ADD_1D::copy(desc_ptr, smem_ptr, crd0);
-  }
-  CUTE_HOST_DEVICE static void
-  copy(void const* const desc_ptr,
-       void const* const smem_ptr,
-       int32_t const& crd0, int32_t const& crd1)
-  {
-    return SM90_TMA_REDUCE_ADD_2D::copy(desc_ptr, smem_ptr, crd0, crd1);
-  }
-  CUTE_HOST_DEVICE static void
-  copy(void const* const desc_ptr,
-       void const* const smem_ptr,
-       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2)
-  {
-    return SM90_TMA_REDUCE_ADD_3D::copy(desc_ptr, smem_ptr, crd0, crd1, crd2);
-  }
-  CUTE_HOST_DEVICE static void
-  copy(void const* const desc_ptr,
-       void const* const smem_ptr,
-       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3)
-  {
-    return SM90_TMA_REDUCE_ADD_4D::copy(desc_ptr, smem_ptr, crd0, crd1, crd2, crd3);
-  }
-  CUTE_HOST_DEVICE static void
-  copy(void const* const desc_ptr,
-       void const* const smem_ptr,
-       int32_t const& crd0, int32_t const& crd1, int32_t const& crd2, int32_t const& crd3, int32_t const& crd4)
-  {
-    return SM90_TMA_REDUCE_ADD_5D::copy(desc_ptr, smem_ptr, crd0, crd1, crd2, crd3, crd4);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// BULK_COPY : Copy a bulk of memory between shared memory and global memory
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-struct SM90_BULK_COPY_G2S
-{
-  CUTE_HOST_DEVICE static void
-  copy(void const* gmem_ptr, uint64_t* mbar_ptr,
-       void      * smem_ptr, int32_t load_bytes)
-  {
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-    uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr);
-    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
-    asm volatile("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];\n"
-                     :
-                     : "r"(smem_int_ptr), "l"(gmem_ptr), "r"(load_bytes), "r"(smem_int_mbar)
-                     : "memory");
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use BULK_COPY without CUTE_ARCH_TMA_SM90_ENABLED.");
-#endif
-  }
-
-  struct PREFETCH
-  {
-    CUTE_HOST_DEVICE static void
-    copy(void const* gmem_ptr, int32_t load_bytes)
-    {
-  #if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-      asm volatile("cp.async.bulk.prefetch.L2.global [%0], %1;\n"
-                      :
-                      : "l"(gmem_ptr), "r"(load_bytes)
-                      : "memory");
-  #else
-      CUTE_INVALID_CONTROL_PATH("Trying to use BULK_COPY without CUTE_ARCH_TMA_SM90_ENABLED.");
-  #endif
-    }
-  };
-};
-
-struct SM90_BULK_COPY_S2G
-{
-  CUTE_HOST_DEVICE static void
-  copy(void const* smem_ptr,
-       void      * gmem_ptr, int32_t store_bytes)
-  {
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-    uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
-    asm volatile("cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], %2;\n"
-                     :
-                     : "l"(gmem_ptr), "r"(smem_int_ptr), "r"(store_bytes)
-                     : "memory");
-#else
-    CUTE_INVALID_CONTROL_PATH("Trying to use BULK_COPY without CUTE_ARCH_TMA_SM90_ENABLED.");
-#endif
-  }
-};
-
-struct SM90_BULK_COPY_AUTO {};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/arch/mma.hpp b/lightllm-kernel/cutlass/include/cute/arch/mma.hpp
deleted file mode 100755
index 6e06114a6..000000000
--- a/lightllm-kernel/cutlass/include/cute/arch/mma.hpp
+++ /dev/null
@@ -1,64 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>           // CUTE_HOST_DEVICE
-#include <cute/numeric/complex.hpp>  // cute::fma
-#include <cute/numeric/real.hpp>     // cute::fma
-
-namespace cute
-{
-
-//
-// Direct FMA for any type
-//
-
-template <class D, class A = D, class B = A, class C = D>
-struct UniversalFMA
-{
-  using DRegisters = D[1];
-  using ARegisters = A[1];
-  using BRegisters = B[1];
-  using CRegisters = C[1];
-
-  CUTE_HOST_DEVICE static constexpr void
-  fma(D      & d,
-      A const& a,
-      B const& b,
-      C const& c)
-  {
-    // Forward to an ADL/cute free function for these types
-    using cute::fma;
-    fma(d, a, b, c);
-  }
-};
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/arch/mma_sm61.hpp b/lightllm-kernel/cutlass/include/cute/arch/mma_sm61.hpp
deleted file mode 100755
index f7bcb7d19..000000000
--- a/lightllm-kernel/cutlass/include/cute/arch/mma_sm61.hpp
+++ /dev/null
@@ -1,87 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#pragma once
-
-#include <cute/config.hpp>
-#include <cute/arch/mma.hpp>
-
-// Config
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 610))
-#  define CUTE_ARCH_MMA_SM61_ENABLED
-#endif
-
-namespace cute
-{
-
-struct SM61_DP4A
-{
-  using DRegisters = int32_t[1];
-  using ARegisters = uint32_t[1];
-  using BRegisters = uint32_t[1];
-  using CRegisters = int32_t[1];
-
-  // Register asm fma
-  CUTE_HOST_DEVICE static void
-  fma(int32_t& d, uint32_t const& a, uint32_t const& b, int32_t const& c)
-  {
-#if defined(CUTE_ARCH_MMA_SM61_ENABLED)
-    asm volatile("dp4a.s32.s32 %0, %1, %2, %3;"
-                 : "=r"(d)
-                 : "r"(a), "r"(b), "r"(c));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM61_DP4A without CUTE_ARCH_MMA_SM61_ENABLED");
-#endif
-  }
-};
-
-struct SM61_DP2A
-{
-  using DRegisters = int32_t[1];
-  using ARegisters = uint32_t[1];
-  using BRegisters = uint32_t[1];
-  using CRegisters = int32_t[1];
-
-  // Register asm fma
-  CUTE_HOST_DEVICE static void
-  fma(int32_t& d, uint32_t const& a, uint32_t const& b, int32_t const& c)
-  {
-#if defined(CUTE_ARCH_MMA_SM61_ENABLED)
-    asm volatile("dp2a.s32.s32 %0, %1, %2, %3;"
-                 : "=r"(d)
-                 : "r"(a), "r"(b), "r"(c));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM61_DP2A without CUTE_ARCH_MMA_SM61_ENABLED");
-#endif
-  }
-};
-
-} // namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/arch/mma_sm70.hpp b/lightllm-kernel/cutlass/include/cute/arch/mma_sm70.hpp
deleted file mode 100755
index 63d96cf5d..000000000
--- a/lightllm-kernel/cutlass/include/cute/arch/mma_sm70.hpp
+++ /dev/null
@@ -1,329 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>
-
-#include <cute/arch/mma.hpp>
-
-// Config
-#if ((__CUDACC_VER_MAJOR__ > 10) || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))
-#  define CUTE_ARCH_MMA_SM70_SUPPORTED
-#  if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700))
-#    define CUTE_ARCH_MMA_SM70_ENABLED
-#  endif
-#endif
-
-namespace cute
-{
-
-//
-// SM70 MMA 884 F16F16F16
-//
-
-struct SM70_8x8x4_F16F16F16F16_TN
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[2];
-  using BRegisters = uint32_t[2];
-  using CRegisters = uint32_t[4];
-
-  // Register asm fma
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1,
-      uint32_t const& b0, uint32_t const& b1,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM70_ENABLED)
-    asm volatile("mma.sync.aligned.m8n8k4.row.col.f16.f16.f16.f16"
-                 "{%0, %1,  %2,  %3},"
-                 "{%4, %5},"
-                 "{%6, %7},"
-                 "{%8, %9, %10, %11};\n"
-        : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-        :  "r"(a0),  "r"(a1),
-           "r"(b0),  "r"(b1),
-           "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM70_8x8x4_F16F16F16F16_TN without CUTE_ARCH_MMA_SM70_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-struct SM70_8x8x4_F16F16F16F16_NT
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[2];
-  using BRegisters = uint32_t[2];
-  using CRegisters = uint32_t[4];
-
-  // Register asm fma
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1,
-      uint32_t const& b0, uint32_t const& b1,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM70_ENABLED)
-    asm volatile("mma.sync.aligned.m8n8k4.col.row.f16.f16.f16.f16"
-                 "{%0, %1,  %2,  %3},"
-                 "{%4, %5},"
-                 "{%6, %7},"
-                 "{%8, %9, %10, %11};\n"
-        : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-        :  "r"(a0),  "r"(a1),
-           "r"(b0),  "r"(b1),
-           "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM70_8x8x4_F16F16F16F16_NT without CUTE_ARCH_MMA_SM70_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-struct SM70_8x8x4_F16F16F16F16_NN
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[2];
-  using BRegisters = uint32_t[2];
-  using CRegisters = uint32_t[4];
-
-  // Register asm fma
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1,
-      uint32_t const& b0, uint32_t const& b1,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM70_ENABLED)
-    asm volatile("mma.sync.aligned.m8n8k4.col.col.f16.f16.f16.f16"
-                 "{%0, %1,  %2,  %3},"
-                 "{%4, %5},"
-                 "{%6, %7},"
-                 "{%8, %9, %10, %11};\n"
-        : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-        :  "r"(a0),  "r"(a1),
-           "r"(b0),  "r"(b1),
-           "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM70_8x8x4_F16F16F16F16_NN without CUTE_ARCH_MMA_SM70_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-struct SM70_8x8x4_F16F16F16F16_TT
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[2];
-  using BRegisters = uint32_t[2];
-  using CRegisters = uint32_t[4];
-
-  // Register asm fma
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1,
-      uint32_t const& b0, uint32_t const& b1,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM70_ENABLED)
-    asm volatile("mma.sync.aligned.m8n8k4.row.row.f16.f16.f16.f16"
-                 "{%0, %1,  %2,  %3},"
-                 "{%4, %5},"
-                 "{%6, %7},"
-                 "{%8, %9, %10, %11};\n"
-        : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-        :  "r"(a0),  "r"(a1),
-           "r"(b0),  "r"(b1),
-           "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM70_8x8x4_F16F16F16F16_TT without CUTE_ARCH_MMA_SM70_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-//
-// SM70 MMA 884 F16F16F32
-//
-
-struct SM70_8x8x4_F32F16F16F32_TN
-{
-  using DRegisters = float[8];
-  using ARegisters = uint32_t[2];
-  using BRegisters = uint32_t[2];
-  using CRegisters = float[8];
-
-  // Register asm fma
-  CUTE_HOST_DEVICE static void
-  fma(float         & d0, float         & d1, float      & d2, float      & d3,
-      float         & d4, float         & d5, float      & d6, float      & d7,
-      uint32_t const& a0, uint32_t const& a1,
-      uint32_t const& b0, uint32_t const& b1,
-      float    const& c0, float    const& c1, float const& c2, float const& c3,
-      float    const& c4, float    const& c5, float const& c6, float const& c7)
-  {
-#if defined(CUTE_ARCH_MMA_SM70_ENABLED)
-    asm volatile("mma.sync.aligned.m8n8k4.row.col.f32.f16.f16.f32"
-                 "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-                 "{%8,  %9},"
-                 "{%10, %11},"
-                 "{%12, %13, %14, %15, %16, %17, %18, %19};\n"
-        : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3),
-          "=f"(d4), "=f"(d5), "=f"(d6), "=f"(d7)
-        :  "r"(a0),  "r"(a1),
-           "r"(b0),  "r"(b1),
-           "f"(c0),  "f"(c1),  "f"(c2),  "f"(c3),
-           "f"(c4),  "f"(c5),  "f"(c6),  "f"(c7));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM70_8x8x4_F32F16F16F32_TN without CUTE_ARCH_MMA_SM70_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-struct SM70_8x8x4_F32F16F16F32_NT
-{
-  using DRegisters = float[8];
-  using ARegisters = uint32_t[2];
-  using BRegisters = uint32_t[2];
-  using CRegisters = float[8];
-
-  // Register asm fma
-  CUTE_HOST_DEVICE static void
-  fma(float         & d0, float         & d1, float      & d2, float      & d3,
-      float         & d4, float         & d5, float      & d6, float      & d7,
-      uint32_t const& a0, uint32_t const& a1,
-      uint32_t const& b0, uint32_t const& b1,
-      float    const& c0, float    const& c1, float const& c2, float const& c3,
-      float    const& c4, float    const& c5, float const& c6, float const& c7)
-  {
-#if defined(CUTE_ARCH_MMA_SM70_ENABLED)
-    asm volatile("mma.sync.aligned.m8n8k4.col.row.f32.f16.f16.f32"
-                 "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-                 "{%8,  %9},"
-                 "{%10, %11},"
-                 "{%12, %13, %14, %15, %16, %17, %18, %19};"
-        : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3),
-          "=f"(d4), "=f"(d5), "=f"(d6), "=f"(d7)
-        :  "r"(a0),  "r"(a1),
-           "r"(b0),  "r"(b1),
-           "f"(c0),  "f"(c1),  "f"(c2),  "f"(c3),
-           "f"(c4),  "f"(c5),  "f"(c6),  "f"(c7));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM70_8x8x4_F32F16F16F32_NT without CUTE_ARCH_MMA_SM70_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-struct SM70_8x8x4_F32F16F16F32_NN
-{
-  using DRegisters = float[8];
-  using ARegisters = uint32_t[2];
-  using BRegisters = uint32_t[2];
-  using CRegisters = float[8];
-
-  // Register asm fma
-  CUTE_HOST_DEVICE static void
-  fma(float         & d0, float         & d1, float      & d2, float      & d3,
-      float         & d4, float         & d5, float      & d6, float      & d7,
-      uint32_t const& a0, uint32_t const& a1,
-      uint32_t const& b0, uint32_t const& b1,
-      float    const& c0, float    const& c1, float const& c2, float const& c3,
-      float    const& c4, float    const& c5, float const& c6, float const& c7)
-  {
-#if defined(CUTE_ARCH_MMA_SM70_ENABLED)
-    asm volatile("mma.sync.aligned.m8n8k4.col.col.f32.f16.f16.f32"
-                 "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-                 "{%8,  %9},"
-                 "{%10, %11},"
-                 "{%12, %13, %14, %15, %16, %17, %18, %19};"
-        : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3),
-          "=f"(d4), "=f"(d5), "=f"(d6), "=f"(d7)
-        :  "r"(a0),  "r"(a1),
-           "r"(b0),  "r"(b1),
-           "f"(c0),  "f"(c1),  "f"(c2),  "f"(c3),
-           "f"(c4),  "f"(c5),  "f"(c6),  "f"(c7));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM70_8x8x4_F32F16F16F32_NN without CUTE_ARCH_MMA_SM70_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-struct SM70_8x8x4_F32F16F16F32_TT
-{
-  using DRegisters = float[8];
-  using ARegisters = uint32_t[2];
-  using BRegisters = uint32_t[2];
-  using CRegisters = float[8];
-
-  // Register asm fma
-  CUTE_HOST_DEVICE static void
-  fma(float         & d0, float         & d1, float      & d2, float      & d3,
-      float         & d4, float         & d5, float      & d6, float      & d7,
-      uint32_t const& a0, uint32_t const& a1,
-      uint32_t const& b0, uint32_t const& b1,
-      float    const& c0, float    const& c1, float const& c2, float const& c3,
-      float    const& c4, float    const& c5, float const& c6, float const& c7)
-  {
-#if defined(CUTE_ARCH_MMA_SM70_ENABLED)
-    asm volatile("mma.sync.aligned.m8n8k4.row.row.f32.f16.f16.f32"
-                 "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-                 "{%8,  %9},"
-                 "{%10, %11},"
-                 "{%12, %13, %14, %15, %16, %17, %18, %19};"
-        : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3),
-          "=f"(d4), "=f"(d5), "=f"(d6), "=f"(d7)
-        :  "r"(a0),  "r"(a1),
-           "r"(b0),  "r"(b1),
-           "f"(c0),  "f"(c1),  "f"(c2),  "f"(c3),
-           "f"(c4),  "f"(c5),  "f"(c6),  "f"(c7));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM70_8x8x4_F32F16F16F32_TT without CUTE_ARCH_MMA_SM70_ENABLED");
-#endif
-  }
-
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/arch/mma_sm75.hpp b/lightllm-kernel/cutlass/include/cute/arch/mma_sm75.hpp
deleted file mode 100755
index c33f7b391..000000000
--- a/lightllm-kernel/cutlass/include/cute/arch/mma_sm75.hpp
+++ /dev/null
@@ -1,120 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>
-
-#include <cute/arch/mma.hpp>
-
-// Config
-#if ((__CUDACC_VER_MAJOR__ > 10) || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))
-#  define CUTE_ARCH_MMA_SM75_SUPPORTED
-#  if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750))
-#    define CUTE_ARCH_MMA_SM75_ENABLED
-#  endif
-#endif
-
-namespace cute
-{
-
-//
-// SM75 MMA 1688 F16F16F32
-//
-
-struct SM75_16x8x8_F32F16F16F32_TN
-{
-  using DRegisters = float[4];
-  using ARegisters = uint32_t[2];
-  using BRegisters = uint32_t[1];
-  using CRegisters = float[4];
-
-  // Register asm fma
-  CUTE_HOST_DEVICE static void
-  fma(float         & d0, float         & d1, float      & d2, float      & d3,
-      uint32_t const& a0, uint32_t const& a1,
-      uint32_t const& b0,
-      float    const& c0, float    const& c1, float const& c2, float const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM75_ENABLED)
-    asm volatile("mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32"
-                 "{%0, %1, %2, %3},"
-                 "{%4, %5},"
-                 "{%6},"
-                 "{%7, %8, %9, %10};\n"
-        : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3)
-        :  "r"(a0),  "r"(a1),
-           "r"(b0),
-           "f"(c0),  "f"(c1),  "f"(c2),  "f"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM75_16x8x8_F32F16F16F32_TN without CUTE_ARCH_MMA_SM75_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-//
-// SM75 MMA 8816 S8S8S32
-//
-
-struct SM75_8x8x16_S32S8S8S32_TN
-{
-  using DRegisters = uint32_t[2];
-  using ARegisters = uint32_t[1];
-  using BRegisters = uint32_t[1];
-  using CRegisters = uint32_t[2];
-
-  // Register asm fma
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1,
-      uint32_t const& a0,
-      uint32_t const& b0,
-      uint32_t const& c0, uint32_t const& c1)
-  {
-#if defined(CUTE_ARCH_MMA_SM75_ENABLED)
-    asm volatile("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32"
-                 "{%0, %1},"
-                 "{%2},"
-                 "{%3},"
-                 "{%4, %5};\n"
-        : "=r"(d0), "=r"(d1)
-        :  "r"(a0),
-           "r"(b0),
-           "r"(c0),  "r"(c1));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM75_8x8x16_S32S8S8S32_TN without CUTE_ARCH_MMA_SM75_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/arch/mma_sm80.hpp b/lightllm-kernel/cutlass/include/cute/arch/mma_sm80.hpp
deleted file mode 100755
index 60777f220..000000000
--- a/lightllm-kernel/cutlass/include/cute/arch/mma_sm80.hpp
+++ /dev/null
@@ -1,2243 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#pragma once
-
-#include <cute/config.hpp>
-#include <cute/arch/mma.hpp>
-#include <cute/numeric/complex.hpp>
-
-// Config
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
-#  define CUTE_ARCH_MMA_SM80_ENABLED
-
-#if (__CUDA_ARCH__ <= 900)
-#define CUTE_ARCH_MMA_B1_AND_SM80_ENABLED
-#endif
-
-#if (__CUDA_ARCH__ <= 890)
-#define CUTE_ARCH_MMA_B1_XOR_SM80_ENABLED
-#endif
-
-#endif
-
-
-
-namespace cute {
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x8 TN
-struct SM80_16x8x8_F16F16F16F16_TN
-{
-  using DRegisters = uint32_t[2];
-  using ARegisters = uint32_t[2];
-  using BRegisters = uint32_t[1];
-  using CRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1,
-      uint32_t const& a0, uint32_t const& a1,
-      uint32_t const& b0,
-      uint32_t const& c0, uint32_t const& c1)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 "
-      "{%0, %1},"
-      "{%2, %3},"
-      "{%4},"
-      "{%5, %6};\n"
-      : "=r"(d0), "=r"(d1)
-      :  "r"(a0),  "r"(a1),
-         "r"(b0),
-         "r"(c0),  "r"(c1));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x8_F16F16F16F16_TN without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x16 TN
-struct SM80_16x8x16_F16F16F16F16_TN
-{
-  using DRegisters = uint32_t[2];
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint32_t[2];
-  using CRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1,
-      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint32_t const& b0, uint32_t const& b1,
-      uint32_t const& c0, uint32_t const& c1)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 "
-      "{%0,  %1},"
-      "{%2,  %3,  %4,  %5},"
-      "{%6,  %7},"
-      "{%8,  %9};\n"
-      : "=r"(d0), "=r"(d1)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "r"(b0),  "r"(b1),
-         "r"(c0),  "r"(c1));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x16_F16F16F16F16_TN without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x8 TN
-struct SM80_16x8x8_F32F16F16F32_TN
-{
-  using DRegisters = float[4];
-  using ARegisters = uint32_t[2];
-  using BRegisters = uint32_t[1];
-  using CRegisters = float[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(float         & d0, float         & d1, float         & d2, float         & d3,
-      uint32_t const& a0, uint32_t const& a1,
-      uint32_t const& b0,
-      float const   & c0, float const   & c1, float const   & c2, float const   & c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5},"
-      "{%6},"
-      "{%7,  %8,  %9,  %10};\n"
-      : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3)
-      :  "r"(a0),  "r"(a1),
-         "r"(b0),
-         "f"(c0),  "f"(c1),  "f"(c2),  "f"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x8_F32F16F16F32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x16 TN
-struct SM80_16x8x16_F32F16F16F32_TN
-{
-  using DRegisters = float[4];
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint32_t[2];
-  using CRegisters = float[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(float         & d0, float         & d1, float         & d2, float         & d3,
-      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint32_t const& b0, uint32_t const& b1,
-      float const   & c0, float const   & c1, float const   & c2, float const   & c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      "{%8,  %9},"
-      "{%10, %11, %12, %13};\n"
-      : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "r"(b0),  "r"(b1),
-         "f"(c0),  "f"(c1),  "f"(c2),  "f"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x16_F32F16F16F32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x8 TN
-struct SM80_16x8x8_F32BF16BF16F32_TN
-{
-  using DRegisters = float[4];
-  using ARegisters = uint32_t[2];
-  using BRegisters = uint32_t[1];
-  using CRegisters = float[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(float         & d0, float         & d1, float         & d2, float         & d3,
-      uint32_t const& a0, uint32_t const& a1,
-      uint32_t const& b0,
-      float const   & c0, float const   & c1, float const   & c2, float const   & c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k8.row.col.f32.bf16.bf16.f32 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5},"
-      "{%6},"
-      "{%7,  %8,  %9,  %10};\n"
-      : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3)
-      :  "r"(a0),  "r"(a1),
-         "r"(b0),
-         "f"(c0),  "f"(c1),  "f"(c2),  "f"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x8_F32BF16BF16F32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x16 TN
-struct SM80_16x8x16_F32BF16BF16F32_TN
-{
-  using DRegisters = float[4];
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint32_t[2];
-  using CRegisters = float[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(float         & d0, float         & d1, float         & d2, float         & d3,
-      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint32_t const& b0, uint32_t const& b1,
-      float const   & c0, float const   & c1, float const   & c2, float const   & c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      "{%8,  %9},"
-      "{%10, %11, %12, %13};\n"
-      : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "r"(b0),  "r"(b1),
-         "f"(c0),  "f"(c1),  "f"(c2),  "f"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x16_F32BF16BF16F32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x4 TN
-struct SM80_16x8x4_F32TF32TF32F32_TN
-{
-  using DRegisters = float[4];
-  using ARegisters = uint32_t[2];
-  using BRegisters = uint32_t[1];
-  using CRegisters = float[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(float         & d0, float         & d1, float         & d2, float         & d3,
-      uint32_t const& a0, uint32_t const& a1,
-      uint32_t const& b0,
-      float const   & c0, float const   & c1, float const   & c2, float const   & c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k4.row.col.f32.tf32.tf32.f32 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5},"
-      "{%6},"
-      "{%7,  %8,  %9,  %10};\n"
-      : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3)
-      :  "r"(a0),  "r"(a1),
-         "r"(b0),
-         "f"(c0),  "f"(c1),  "f"(c2),  "f"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x4_F32TF32TF32F32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x8 TN
-struct SM80_16x8x8_F32TF32TF32F32_TN
-{
-  using DRegisters = float[4];
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint32_t[2];
-  using CRegisters = float[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(float         & d0, float         & d1, float         & d2, float         & d3,
-      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint32_t const& b0, uint32_t const& b1,
-      float const   & c0, float const   & c1, float const   & c2, float const   & c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      "{%8,  %9},"
-      "{%10, %11, %12, %13};\n"
-      : "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "r"(b0),  "r"(b1),
-         "f"(c0),  "f"(c1),  "f"(c2),  "f"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x8_F32TF32TF32F32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 8x8x4 TN
-struct SM80_8x8x4_F64F64F64F64_TN
-{
-  using DRegisters = double[2];
-  using ARegisters = double[1];
-  using BRegisters = double[1];
-  using CRegisters = double[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(double      & d0, double      & d1,
-      double const& a0,
-      double const& b0,
-      double const& c0, double const& c1)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m8n8k4.row.col.f64.f64.f64.f64 "
-      "{%0, %1},"
-      "{%2},"
-      "{%3},"
-      "{%4, %5};\n"
-      : "=d"(d0), "=d"(d1)
-      :  "d"(a0),
-         "d"(b0),
-         "d"(c0),  "d"(c1));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x4_F64F64F64F64_TN without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-// MMA 8x8x4 TN with Planar Complex multiplication
-struct SM80_8x8x4_C64C64C64C64_TN
-{
-  using DRegisters = complex<double>[2];
-  using ARegisters = complex<double>[1];
-  using BRegisters = complex<double>[1];
-  using CRegisters = complex<double>[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(complex<double>      & d0, complex<double>      & d1,
-      complex<double> const& a0,
-      complex<double> const& b0,
-      complex<double> const& c0, complex<double> const& c1)
-  {
-    // Because thrust::complex does not provide a mutable ref
-    double& rd0 = reinterpret_cast<double(&)[2]>(d0)[0];
-    double& id0 = reinterpret_cast<double(&)[2]>(d0)[1];
-    double& rd1 = reinterpret_cast<double(&)[2]>(d1)[0];
-    double& id1 = reinterpret_cast<double(&)[2]>(d1)[1];
-
-    // d.real() =  a.real() * b.real() + c.real();
-    SM80_8x8x4_F64F64F64F64_TN::fma(
-      rd0, rd1,
-      a0.real(),
-      b0.real(),
-      c0.real(), c1.real());
-
-    // d.imag() =  a.imag() * b.real() + c.imag();
-    SM80_8x8x4_F64F64F64F64_TN::fma(
-      id0, id1,
-      a0.imag(),
-      b0.real(),
-      c0.imag(), c1.imag());
-
-    // d.real() = -a.imag() * b.imag() + d.real();
-    SM80_8x8x4_F64F64F64F64_TN::fma(
-      rd0, rd1,
-      -a0.imag(),
-      b0.imag(),
-      d0.real(), d1.real());
-
-    // d.imag() =  a.real() * b.imag() + d.imag();
-    SM80_8x8x4_F64F64F64F64_TN::fma(
-      id0, id1,
-      a0.real(),
-      b0.imag(),
-      d0.imag(), d1.imag());
-  }
-};
-
-// MMA 8x8x4 TN with Gaussian Complex multiplication:
-//    (a + bi)*(c + di)
-//  yields
-//    t0 += a*c
-//    t1 += b*d
-//    t2 += (a+b)*(c+d)
-//  then
-//    re = t0 - t1
-//    im = t2 - t0 - t1
-struct SM80_8x8x4_GC64C64C64GC64_TN
-{
-  struct GaussComplex {
-    double t0, t1, t2;
-
-    CUTE_HOST_DEVICE //constexpr
-    operator complex<double>() const { return complex<double>(t0 - t1, t2 - t0 - t1); }
-
-    CUTE_HOST_DEVICE friend //constexpr
-    complex<double> operator*(GaussComplex const& a, complex<double> const& b) { return static_cast<complex<double>>(a) * b; }
-    CUTE_HOST_DEVICE friend //constexpr
-    complex<double> operator*(complex<double> const& a, GaussComplex const& b) { return b * a; }
-
-    CUTE_HOST_DEVICE friend //constexpr
-    complex<double> operator+(GaussComplex const& a, complex<double> const& b) { return static_cast<complex<double>>(a) + b; }
-    CUTE_HOST_DEVICE friend //constexpr
-    complex<double> operator+(complex<double> const& a, GaussComplex const& b) { return b + a; }
-  };
-
-  using DRegisters = GaussComplex[2];
-  using ARegisters = complex<double>[1];
-  using BRegisters = complex<double>[1];
-  using CRegisters = GaussComplex[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(GaussComplex         & d0, GaussComplex         & d1,
-      complex<double> const& a0,
-      complex<double> const& b0,
-      GaussComplex    const& c0, GaussComplex    const& c1)
-  {
-    SM80_8x8x4_F64F64F64F64_TN::fma(d0.t0, d1.t0,
-                                    a0.real(),
-                                    b0.real(),
-                                    c0.t0, c1.t0);
-    SM80_8x8x4_F64F64F64F64_TN::fma(d0.t1, d1.t1,
-                                    a0.imag(),
-                                    b0.imag(),
-                                    c0.t1, c1.t1);
-    SM80_8x8x4_F64F64F64F64_TN::fma(d0.t2, d1.t2,
-                                    a0.real() + a0.imag(),
-                                    b0.real() + b0.imag(),
-                                    c0.t2, c1.t2);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 8x8x16 TN
-struct SM80_8x8x16_S32S8S8S32_TN
-{
-  using DRegisters = uint32_t[2];
-  using ARegisters = uint32_t[1];
-  using BRegisters = uint32_t[1];
-  using CRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1,
-      uint32_t const& a0,
-      uint32_t const& b0,
-      uint32_t const& c0, uint32_t const& c1)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 "
-      "{%0, %1},"
-      "{%2},"
-      "{%3},"
-      "{%4, %5};\n"
-      : "=r"(d0), "=r"(d1)
-      :  "r"(a0),
-         "r"(b0),
-         "r"(c0),  "r"(c1));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x16_S32S8S8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 8x8x16 TN
-struct SM80_8x8x16_S32S8S8S32_TN_SATURATE
-{
-  using DRegisters = uint32_t[2];
-  using ARegisters = uint32_t[1];
-  using BRegisters = uint32_t[1];
-  using CRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1,
-      uint32_t const& a0,
-      uint32_t const& b0,
-      uint32_t const& c0, uint32_t const& c1)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32.satfinite "
-      "{%0, %1},"
-      "{%2},"
-      "{%3},"
-      "{%4, %5};\n"
-      : "=r"(d0), "=r"(d1)
-      :  "r"(a0),
-         "r"(b0),
-         "r"(c0),  "r"(c1));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x16_S32S8S8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x16 TN
-struct SM80_16x8x16_S32S8S8S32_TN
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[2];
-  using BRegisters = uint32_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1,
-      uint32_t const& b0,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5},"
-      "{%6},"
-      "{%7,  %8,  %9,  %10};\n"
-      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-      :  "r"(a0),  "r"(a1),
-         "r"(b0),
-         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x16_S32S8S8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x16 TN
-struct SM80_16x8x16_S32S8S8S32_TN_SATURATE
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[2];
-  using BRegisters = uint32_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1,
-      uint32_t const& b0,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5},"
-      "{%6},"
-      "{%7,  %8,  %9,  %10};\n"
-      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-      :  "r"(a0),  "r"(a1),
-         "r"(b0),
-         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x16_S32S8S8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x32 TN
-struct SM80_16x8x32_S32S8S8S32_TN
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint32_t[2];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint32_t const& b0, uint32_t const& b1,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      "{%8,  %9},"
-      "{%10, %11, %12, %13};\n"
-      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "r"(b0),  "r"(b1),
-         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32S8S8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x32 TN
-struct SM80_16x8x32_S32S8S8S32_TN_SATURATE
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint32_t[2];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint32_t const& b0, uint32_t const& b1,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      "{%8,  %9},"
-      "{%10, %11, %12, %13};\n"
-      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "r"(b0),  "r"(b1),
-         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32S8S8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 8x8x16 TN
-struct SM80_8x8x16_S32S8U8S32_TN
-{
-  using DRegisters = uint32_t[2];
-  using ARegisters = uint32_t[1];
-  using BRegisters = uint32_t[1];
-  using CRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1,
-      uint32_t const& a0,
-      uint32_t const& b0,
-      uint32_t const& c0, uint32_t const& c1)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m8n8k16.row.col.s32.s8.u8.s32 "
-      "{%0, %1},"
-      "{%2},"
-      "{%3},"
-      "{%4, %5};\n"
-      : "=r"(d0), "=r"(d1)
-      :  "r"(a0),
-         "r"(b0),
-         "r"(c0),  "r"(c1));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x16_S32S8U8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 8x8x16 TN
-struct SM80_8x8x16_S32S8U8S32_TN_SATURATE
-{
-  using DRegisters = uint32_t[2];
-  using ARegisters = uint32_t[1];
-  using BRegisters = uint32_t[1];
-  using CRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1,
-      uint32_t const& a0,
-      uint32_t const& b0,
-      uint32_t const& c0, uint32_t const& c1)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m8n8k16.row.col.s32.s8.u8.s32.satfinite "
-      "{%0, %1},"
-      "{%2},"
-      "{%3},"
-      "{%4, %5};\n"
-      : "=r"(d0), "=r"(d1)
-      :  "r"(a0),
-         "r"(b0),
-         "r"(c0),  "r"(c1));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x16_S32S8U8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x16 TN
-struct SM80_16x8x16_S32S8U8S32_TN
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[2];
-  using BRegisters = uint32_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1,
-      uint32_t const& b0,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k16.row.col.s32.s8.u8.s32 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5},"
-      "{%6},"
-      "{%7,  %8,  %9,  %10};\n"
-      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-      :  "r"(a0),  "r"(a1),
-         "r"(b0),
-         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x16_S32S8U8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x16 TN
-struct SM80_16x8x16_S32S8U8S32_TN_SATURATE
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[2];
-  using BRegisters = uint32_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1,
-      uint32_t const& b0,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k16.row.col.s32.s8.u8.s32.satfinite "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5},"
-      "{%6},"
-      "{%7,  %8,  %9,  %10};\n"
-      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-      :  "r"(a0),  "r"(a1),
-         "r"(b0),
-         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x16_S32S8U8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x32 TN
-struct SM80_16x8x32_S32S8U8S32_TN
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint32_t[2];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint32_t const& b0, uint32_t const& b1,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k32.row.col.s32.s8.u8.s32 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      "{%8,  %9},"
-      "{%10, %11, %12, %13};\n"
-      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "r"(b0),  "r"(b1),
-         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32S8U8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x32 TN
-struct SM80_16x8x32_S32S8U8S32_TN_SATURATE
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint32_t[2];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint32_t const& b0, uint32_t const& b1,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k32.row.col.s32.s8.u8.s32.satfinite "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      "{%8,  %9},"
-      "{%10, %11, %12, %13};\n"
-      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "r"(b0),  "r"(b1),
-         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32S8U8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 8x8x16 TN
-struct SM80_8x8x16_S32U8S8S32_TN
-{
-  using DRegisters = uint32_t[2];
-  using ARegisters = uint32_t[1];
-  using BRegisters = uint32_t[1];
-  using CRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1,
-      uint32_t const& a0,
-      uint32_t const& b0,
-      uint32_t const& c0, uint32_t const& c1)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m8n8k16.row.col.s32.u8.s8.s32 "
-      "{%0, %1},"
-      "{%2},"
-      "{%3},"
-      "{%4, %5};\n"
-      : "=r"(d0), "=r"(d1)
-      :  "r"(a0),
-         "r"(b0),
-         "r"(c0),  "r"(c1));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x16_S32U8S8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 8x8x16 TN
-struct SM80_8x8x16_S32U8S8S32_TN_SATURATE
-{
-  using DRegisters = uint32_t[2];
-  using ARegisters = uint32_t[1];
-  using BRegisters = uint32_t[1];
-  using CRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1,
-      uint32_t const& a0,
-      uint32_t const& b0,
-      uint32_t const& c0, uint32_t const& c1)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m8n8k16.row.col.s32.u8.s8.s32.satfinite "
-      "{%0, %1},"
-      "{%2},"
-      "{%3},"
-      "{%4, %5};\n"
-      : "=r"(d0), "=r"(d1)
-      :  "r"(a0),
-         "r"(b0),
-         "r"(c0),  "r"(c1));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x16_S32U8S8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x16 TN
-struct SM80_16x8x16_S32U8S8S32_TN
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[2];
-  using BRegisters = uint32_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1,
-      uint32_t const& b0,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k16.row.col.s32.u8.s8.s32 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5},"
-      "{%6},"
-      "{%7,  %8,  %9,  %10};\n"
-      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-      :  "r"(a0),  "r"(a1),
-         "r"(b0),
-         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x16_S32U8S8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x16 TN
-struct SM80_16x8x16_S32U8S8S32_TN_SATURATE
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[2];
-  using BRegisters = uint32_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1,
-      uint32_t const& b0,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k16.row.col.s32.u8.s8.s32.satfinite "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5},"
-      "{%6},"
-      "{%7,  %8,  %9,  %10};\n"
-      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-      :  "r"(a0),  "r"(a1),
-         "r"(b0),
-         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x16_S32U8S8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x32 TN
-struct SM80_16x8x32_S32U8S8S32_TN
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint32_t[2];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint32_t const& b0, uint32_t const& b1,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k32.row.col.s32.u8.s8.s32 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      "{%8,  %9},"
-      "{%10, %11, %12, %13};\n"
-      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "r"(b0),  "r"(b1),
-         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32U8S8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x32 TN
-struct SM80_16x8x32_S32U8S8S32_TN_SATURATE
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint32_t[2];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint32_t const& b0, uint32_t const& b1,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k32.row.col.s32.u8.s8.s32.satfinite "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      "{%8,  %9},"
-      "{%10, %11, %12, %13};\n"
-      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "r"(b0),  "r"(b1),
-         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32U8S8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 8x8x16 TN
-struct SM80_8x8x16_S32U8U8S32_TN
-{
-  using DRegisters = uint32_t[2];
-  using ARegisters = uint32_t[1];
-  using BRegisters = uint32_t[1];
-  using CRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1,
-      uint32_t const& a0,
-      uint32_t const& b0,
-      uint32_t const& c0, uint32_t const& c1)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m8n8k16.row.col.s32.u8.u8.s32 "
-      "{%0, %1},"
-      "{%2},"
-      "{%3},"
-      "{%4, %5};\n"
-      : "=r"(d0), "=r"(d1)
-      :  "r"(a0),
-         "r"(b0),
-         "r"(c0),  "r"(c1));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x16_S32U8U8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 8x8x16 TN
-struct SM80_8x8x16_S32U8U8S32_TN_SATURATE
-{
-  using DRegisters = uint32_t[2];
-  using ARegisters = uint32_t[1];
-  using BRegisters = uint32_t[1];
-  using CRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1,
-      uint32_t const& a0,
-      uint32_t const& b0,
-      uint32_t const& c0, uint32_t const& c1)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m8n8k16.row.col.s32.u8.u8.s32.satfinite "
-      "{%0, %1},"
-      "{%2},"
-      "{%3},"
-      "{%4, %5};\n"
-      : "=r"(d0), "=r"(d1)
-      :  "r"(a0),
-         "r"(b0),
-         "r"(c0),  "r"(c1));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x16_S32U8U8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x16 TN
-struct SM80_16x8x16_S32U8U8S32_TN
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[2];
-  using BRegisters = uint32_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1,
-      uint32_t const& b0,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k16.row.col.s32.u8.u8.s32 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5},"
-      "{%6},"
-      "{%7,  %8,  %9,  %10};\n"
-      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-      :  "r"(a0),  "r"(a1),
-         "r"(b0),
-         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x16_S32U8U8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x16 TN
-struct SM80_16x8x16_S32U8U8S32_TN_SATURATE
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[2];
-  using BRegisters = uint32_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1,
-      uint32_t const& b0,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k16.row.col.s32.u8.u8.s32.satfinite "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5},"
-      "{%6},"
-      "{%7,  %8,  %9,  %10};\n"
-      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-      :  "r"(a0),  "r"(a1),
-         "r"(b0),
-         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x16_S32U8U8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x32 TN
-struct SM80_16x8x32_S32U8U8S32_TN
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint32_t[2];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint32_t const& b0, uint32_t const& b1,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k32.row.col.s32.u8.u8.s32 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      "{%8,  %9},"
-      "{%10, %11, %12, %13};\n"
-      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "r"(b0),  "r"(b1),
-         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32U8U8S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x32 TN
-struct SM80_16x8x32_S32U8U8S32_TN_SATURATE
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint32_t[2];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint32_t const& b0, uint32_t const& b1,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k32.row.col.s32.u8.u8.s32.satfinite "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      "{%8,  %9},"
-      "{%10, %11, %12, %13};\n"
-      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "r"(b0),  "r"(b1),
-         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32U8U8S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 8x8x32 TN
-struct SM80_8x8x32_S32S4S4S32_TN
-{
-  using DRegisters = uint32_t[2];
-  using ARegisters = uint32_t[1];
-  using BRegisters = uint32_t[1];
-  using CRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1,
-      uint32_t const& a0,
-      uint32_t const& b0,
-      uint32_t const& c0, uint32_t const& c1)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m8n8k32.row.col.s32.s4.s4.s32 "
-      "{%0, %1},"
-      "{%2},"
-      "{%3},"
-      "{%4, %5};\n"
-      : "=r"(d0), "=r"(d1)
-      :  "r"(a0),
-         "r"(b0),
-         "r"(c0),  "r"(c1));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x32_S32S4S4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 8x8x32 TN
-struct SM80_8x8x32_S32S4S4S32_TN_SATURATE
-{
-  using DRegisters = uint32_t[2];
-  using ARegisters = uint32_t[1];
-  using BRegisters = uint32_t[1];
-  using CRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1,
-      uint32_t const& a0,
-      uint32_t const& b0,
-      uint32_t const& c0, uint32_t const& c1)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m8n8k32.row.col.s32.s4.s4.s32.satfinite "
-      "{%0, %1},"
-      "{%2},"
-      "{%3},"
-      "{%4, %5};\n"
-      : "=r"(d0), "=r"(d1)
-      :  "r"(a0),
-         "r"(b0),
-         "r"(c0),  "r"(c1));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x32_S32S4S4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x32 TN
-struct SM80_16x8x32_S32S4S4S32_TN
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[2];
-  using BRegisters = uint32_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1,
-      uint32_t const& b0,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k32.row.col.s32.s4.s4.s32 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5},"
-      "{%6},"
-      "{%7,  %8,  %9,  %10};\n"
-      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-      :  "r"(a0),  "r"(a1),
-         "r"(b0),
-         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32S4S4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x32 TN
-struct SM80_16x8x32_S32S4S4S32_TN_SATURATE
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[2];
-  using BRegisters = uint32_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1,
-      uint32_t const& b0,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k32.row.col.s32.s4.s4.s32.satfinite "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5},"
-      "{%6},"
-      "{%7,  %8,  %9,  %10};\n"
-      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-      :  "r"(a0),  "r"(a1),
-         "r"(b0),
-         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32S4S4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x64 TN
-struct SM80_16x8x64_S32S4S4S32_TN
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint32_t[2];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint32_t const& b0, uint32_t const& b1,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k64.row.col.s32.s4.s4.s32 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      "{%8,  %9},"
-      "{%10, %11, %12, %13};\n"
-      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "r"(b0),  "r"(b1),
-         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x64_S32S4S4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x64 TN
-struct SM80_16x8x64_S32S4S4S32_TN_SATURATE
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint32_t[2];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint32_t const& b0, uint32_t const& b1,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k64.row.col.s32.s4.s4.s32.satfinite "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      "{%8,  %9},"
-      "{%10, %11, %12, %13};\n"
-      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "r"(b0),  "r"(b1),
-         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x64_S32S4S4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 8x8x32 TN
-struct SM80_8x8x32_S32S4U4S32_TN
-{
-  using DRegisters = uint32_t[2];
-  using ARegisters = uint32_t[1];
-  using BRegisters = uint32_t[1];
-  using CRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1,
-      uint32_t const& a0,
-      uint32_t const& b0,
-      uint32_t const& c0, uint32_t const& c1)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m8n8k32.row.col.s32.s4.u4.s32 "
-      "{%0, %1},"
-      "{%2},"
-      "{%3},"
-      "{%4, %5};\n"
-      : "=r"(d0), "=r"(d1)
-      :  "r"(a0),
-         "r"(b0),
-         "r"(c0),  "r"(c1));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x32_S32S4U4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 8x8x32 TN
-struct SM80_8x8x32_S32S4U4S32_TN_SATURATE
-{
-  using DRegisters = uint32_t[2];
-  using ARegisters = uint32_t[1];
-  using BRegisters = uint32_t[1];
-  using CRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1,
-      uint32_t const& a0,
-      uint32_t const& b0,
-      uint32_t const& c0, uint32_t const& c1)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m8n8k32.row.col.s32.s4.u4.s32.satfinite "
-      "{%0, %1},"
-      "{%2},"
-      "{%3},"
-      "{%4, %5};\n"
-      : "=r"(d0), "=r"(d1)
-      :  "r"(a0),
-         "r"(b0),
-         "r"(c0),  "r"(c1));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x32_S32S4U4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x32 TN
-struct SM80_16x8x32_S32S4U4S32_TN
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[2];
-  using BRegisters = uint32_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1,
-      uint32_t const& b0,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k32.row.col.s32.s4.u4.s32 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5},"
-      "{%6},"
-      "{%7,  %8,  %9,  %10};\n"
-      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-      :  "r"(a0),  "r"(a1),
-         "r"(b0),
-         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32S4U4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x32 TN
-struct SM80_16x8x32_S32S4U4S32_TN_SATURATE
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[2];
-  using BRegisters = uint32_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1,
-      uint32_t const& b0,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k32.row.col.s32.s4.u4.s32.satfinite "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5},"
-      "{%6},"
-      "{%7,  %8,  %9,  %10};\n"
-      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-      :  "r"(a0),  "r"(a1),
-         "r"(b0),
-         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32S4U4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x64 TN
-struct SM80_16x8x64_S32S4U4S32_TN
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint32_t[2];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint32_t const& b0, uint32_t const& b1,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k64.row.col.s32.s4.u4.s32 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      "{%8,  %9},"
-      "{%10, %11, %12, %13};\n"
-      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "r"(b0),  "r"(b1),
-         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x64_S32S4U4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x64 TN
-struct SM80_16x8x64_S32S4U4S32_TN_SATURATE
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint32_t[2];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint32_t const& b0, uint32_t const& b1,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k64.row.col.s32.s4.u4.s32.satfinite "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      "{%8,  %9},"
-      "{%10, %11, %12, %13};\n"
-      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "r"(b0),  "r"(b1),
-         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x64_S32S4U4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 8x8x32 TN
-struct SM80_8x8x32_S32U4S4S32_TN
-{
-  using DRegisters = uint32_t[2];
-  using ARegisters = uint32_t[1];
-  using BRegisters = uint32_t[1];
-  using CRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1,
-      uint32_t const& a0,
-      uint32_t const& b0,
-      uint32_t const& c0, uint32_t const& c1)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m8n8k32.row.col.s32.u4.s4.s32 "
-      "{%0, %1},"
-      "{%2},"
-      "{%3},"
-      "{%4, %5};\n"
-      : "=r"(d0), "=r"(d1)
-      :  "r"(a0),
-         "r"(b0),
-         "r"(c0),  "r"(c1));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x32_S32U4S4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 8x8x32 TN
-struct SM80_8x8x32_S32U4S4S32_TN_SATURATE
-{
-  using DRegisters = uint32_t[2];
-  using ARegisters = uint32_t[1];
-  using BRegisters = uint32_t[1];
-  using CRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1,
-      uint32_t const& a0,
-      uint32_t const& b0,
-      uint32_t const& c0, uint32_t const& c1)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m8n8k32.row.col.s32.u4.s4.s32.satfinite "
-      "{%0, %1},"
-      "{%2},"
-      "{%3},"
-      "{%4, %5};\n"
-      : "=r"(d0), "=r"(d1)
-      :  "r"(a0),
-         "r"(b0),
-         "r"(c0),  "r"(c1));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x32_S32U4S4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x32 TN
-struct SM80_16x8x32_S32U4S4S32_TN
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[2];
-  using BRegisters = uint32_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1,
-      uint32_t const& b0,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k32.row.col.s32.u4.s4.s32 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5},"
-      "{%6},"
-      "{%7,  %8,  %9,  %10};\n"
-      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-      :  "r"(a0),  "r"(a1),
-         "r"(b0),
-         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32U4S4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x32 TN
-struct SM80_16x8x32_S32U4S4S32_TN_SATURATE
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[2];
-  using BRegisters = uint32_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1,
-      uint32_t const& b0,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k32.row.col.s32.u4.s4.s32.satfinite "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5},"
-      "{%6},"
-      "{%7,  %8,  %9,  %10};\n"
-      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-      :  "r"(a0),  "r"(a1),
-         "r"(b0),
-         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32U4S4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x64 TN
-struct SM80_16x8x64_S32U4S4S32_TN
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint32_t[2];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint32_t const& b0, uint32_t const& b1,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k64.row.col.s32.u4.s4.s32 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      "{%8,  %9},"
-      "{%10, %11, %12, %13};\n"
-      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "r"(b0),  "r"(b1),
-         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x64_S32U4S4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x64 TN
-struct SM80_16x8x64_S32U4S4S32_TN_SATURATE
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint32_t[2];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint32_t const& b0, uint32_t const& b1,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k64.row.col.s32.u4.s4.s32.satfinite "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      "{%8,  %9},"
-      "{%10, %11, %12, %13};\n"
-      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "r"(b0),  "r"(b1),
-         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x64_S32U4S4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 8x8x32 TN
-struct SM80_8x8x32_S32U4U4S32_TN
-{
-  using DRegisters = uint32_t[2];
-  using ARegisters = uint32_t[1];
-  using BRegisters = uint32_t[1];
-  using CRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1,
-      uint32_t const& a0,
-      uint32_t const& b0,
-      uint32_t const& c0, uint32_t const& c1)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m8n8k32.row.col.s32.u4.u4.s32 "
-      "{%0, %1},"
-      "{%2},"
-      "{%3},"
-      "{%4, %5};\n"
-      : "=r"(d0), "=r"(d1)
-      :  "r"(a0),
-         "r"(b0),
-         "r"(c0),  "r"(c1));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x32_S32U4U4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 8x8x32 TN
-struct SM80_8x8x32_S32U4U4S32_TN_SATURATE
-{
-  using DRegisters = uint32_t[2];
-  using ARegisters = uint32_t[1];
-  using BRegisters = uint32_t[1];
-  using CRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1,
-      uint32_t const& a0,
-      uint32_t const& b0,
-      uint32_t const& c0, uint32_t const& c1)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m8n8k32.row.col.s32.u4.u4.s32.satfinite "
-      "{%0, %1},"
-      "{%2},"
-      "{%3},"
-      "{%4, %5};\n"
-      : "=r"(d0), "=r"(d1)
-      :  "r"(a0),
-         "r"(b0),
-         "r"(c0),  "r"(c1));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x32_S32U4U4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x32 TN
-struct SM80_16x8x32_S32U4U4S32_TN
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[2];
-  using BRegisters = uint32_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1,
-      uint32_t const& b0,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k32.row.col.s32.u4.u4.s32 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5},"
-      "{%6},"
-      "{%7,  %8,  %9,  %10};\n"
-      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-      :  "r"(a0),  "r"(a1),
-         "r"(b0),
-         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32U4U4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x32 TN
-struct SM80_16x8x32_S32U4U4S32_TN_SATURATE
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[2];
-  using BRegisters = uint32_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1,
-      uint32_t const& b0,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k32.row.col.s32.u4.u4.s32.satfinite "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5},"
-      "{%6},"
-      "{%7,  %8,  %9,  %10};\n"
-      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-      :  "r"(a0),  "r"(a1),
-         "r"(b0),
-         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x32_S32U4U4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x64 TN
-struct SM80_16x8x64_S32U4U4S32_TN
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint32_t[2];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint32_t const& b0, uint32_t const& b1,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k64.row.col.s32.u4.u4.s32 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      "{%8,  %9},"
-      "{%10, %11, %12, %13};\n"
-      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "r"(b0),  "r"(b1),
-         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x64_S32U4U4S32_TN without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x64 TN
-struct SM80_16x8x64_S32U4U4S32_TN_SATURATE
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint32_t[2];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint32_t const& b0, uint32_t const& b1,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k64.row.col.s32.u4.u4.s32.satfinite "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      "{%8,  %9},"
-      "{%10, %11, %12, %13};\n"
-      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "r"(b0),  "r"(b1),
-         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x64_S32U4U4S32_TN_SATURATE without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 8x8x128 TN
-struct SM80_8x8x128_S32U1U1S32_TN_XORPOPC
-{
-  using DRegisters = uint32_t[2];
-  using ARegisters = uint32_t[1];
-  using BRegisters = uint32_t[1];
-  using CRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1,
-      uint32_t const& a0,
-      uint32_t const& b0,
-      uint32_t const& c0, uint32_t const& c1)
-  {
-#if defined(CUTE_ARCH_MMA_B1_XOR_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m8n8k128.row.col.s32.b1.b1.s32.xor.popc "
-      "{%0, %1},"
-      "{%2},"
-      "{%3},"
-      "{%4, %5};\n"
-      : "=r"(d0), "=r"(d1)
-      :  "r"(a0),
-         "r"(b0),
-         "r"(c0),  "r"(c1));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x128_S32U1U1S32_TN_XORPOPC without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x128 TN
-struct SM80_16x8x128_S32U1U1S32_TN_XORPOPC
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[2];
-  using BRegisters = uint32_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1,
-      uint32_t const& b0,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_B1_XOR_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k128.row.col.s32.b1.b1.s32.xor.popc "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5},"
-      "{%6},"
-      "{%7,  %8,  %9,  %10};\n"
-      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-      :  "r"(a0),  "r"(a1),
-         "r"(b0),
-         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x128_S32U1U1S32_TN_XORPOPC without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x256 TN
-struct SM80_16x8x256_S32U1U1S32_TN_XORPOPC
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint32_t[2];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint32_t const& b0, uint32_t const& b1,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_B1_XOR_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k256.row.col.s32.b1.b1.s32.xor.popc "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      "{%8,  %9},"
-      "{%10, %11, %12, %13};\n"
-      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "r"(b0),  "r"(b1),
-         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x256_S32U1U1S32_TN_XORPOPC without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 8x8x128 TN
-struct SM80_8x8x128_S32U1U1S32_TN_ANDPOPC
-{
-  using DRegisters = uint32_t[2];
-  using ARegisters = uint32_t[1];
-  using BRegisters = uint32_t[1];
-  using CRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1,
-      uint32_t const& a0,
-      uint32_t const& b0,
-      uint32_t const& c0, uint32_t const& c1)
-  {
-#if defined(CUTE_ARCH_MMA_B1_AND_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m8n8k128.row.col.s32.b1.b1.s32.and.popc "
-      "{%0, %1},"
-      "{%2},"
-      "{%3},"
-      "{%4, %5};\n"
-      : "=r"(d0), "=r"(d1)
-      :  "r"(a0),
-         "r"(b0),
-         "r"(c0),  "r"(c1));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_8x8x128_S32U1U1S32_TN_ANDPOPC without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x128 TN
-struct SM80_16x8x128_S32U1U1S32_TN_ANDPOPC
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[2];
-  using BRegisters = uint32_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1,
-      uint32_t const& b0,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_B1_AND_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k128.row.col.s32.b1.b1.s32.and.popc "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5},"
-      "{%6},"
-      "{%7,  %8,  %9,  %10};\n"
-      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-      :  "r"(a0),  "r"(a1),
-         "r"(b0),
-         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x128_S32U1U1S32_TN_ANDPOPC without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x256 TN
-struct SM80_16x8x256_S32U1U1S32_TN_ANDPOPC
-{
-  using DRegisters = uint32_t[4];
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint32_t[2];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint32_t const& b0, uint32_t const& b1,
-      uint32_t const& c0, uint32_t const& c1, uint32_t const& c2, uint32_t const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_B1_AND_SM80_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k256.row.col.s32.b1.b1.s32.and.popc "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      "{%8,  %9},"
-      "{%10, %11, %12, %13};\n"
-      : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "r"(b0),  "r"(b1),
-         "r"(c0),  "r"(c1),  "r"(c2),  "r"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM80_16x8x256_S32U1U1S32_TN_ANDPOPC without CUTE_ARCH_MMA_SM80_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/arch/mma_sm90.hpp b/lightllm-kernel/cutlass/include/cute/arch/mma_sm90.hpp
deleted file mode 100755
index 51d34563c..000000000
--- a/lightllm-kernel/cutlass/include/cute/arch/mma_sm90.hpp
+++ /dev/null
@@ -1,9331 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#pragma once
-
-#include <cute/config.hpp>
-#include <cute/arch/mma.hpp>
-
-// Config
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
-#    define CUTE_ARCH_MMA_SM90_ENABLED
-#    define CUTE_ARCH_MMA_F64_SM90_ENABLED
-#endif
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cute {
-
-namespace SM90 {
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x4 TN
-struct MMA_16x8x4_F64F64F64F64_TN
-{
-  using DRegisters = double[4];
-  using ARegisters = double[2];
-  using BRegisters = double[1];
-  using CRegisters = double[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(double      & d0, double      & d1, double      & d2, double      & d3,
-      double const& a0, double const& a1,
-      double const& b0,
-      double const& c0, double const& c1, double const& c2, double const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_F64_SM90_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k4.row.col.f64.f64.f64.f64"
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5},"
-      "{%6},"
-      "{%7,  %8,  %9,  %10};\n"
-      : "=d"(d0), "=d"(d1), "=d"(d2), "=d"(d3)
-      :  "d"(a0),  "d"(a1),
-         "d"(b0),
-         "d"(c0),  "d"(c1),  "d"(c2),  "d"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_16x8x4_F64F64F64F64_TN without CUTE_ARCH_MMA_SM90_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x8 TN
-struct MMA_16x8x8_F64F64F64F64_TN
-{
-  using DRegisters = double[4];
-  using ARegisters = double[4];
-  using BRegisters = double[2];
-  using CRegisters = double[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(double      & d0, double      & d1, double      & d2, double      & d3,
-      double const& a0, double const& a1, double const& a2, double const& a3,
-      double const& b0, double const& b1,
-      double const& c0, double const& c1, double const& c2, double const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_F64_SM90_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k8.row.col.f64.f64.f64.f64"
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      "{%8,  %9},"
-      "{%10, %11, %12, %13};\n"
-      : "=d"(d0), "=d"(d1), "=d"(d2), "=d"(d3)
-      :  "d"(a0),  "d"(a1),  "d"(a2),  "d"(a3),
-         "d"(b0),  "d"(b1),
-         "d"(c0),  "d"(c1),  "d"(c2),  "d"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_16x8x8_F64F64F64F64_TN without CUTE_ARCH_MMA_SM90_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x16 TN
-struct MMA_16x8x16_F64F64F64F64_TN
-{
-  using DRegisters = double[4];
-  using ARegisters = double[8];
-  using BRegisters = double[4];
-  using CRegisters = double[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(double      & d0, double      & d1, double      & d2, double      & d3,
-      double const& a0, double const& a1, double const& a2, double const& a3,
-      double const& a4, double const& a5, double const& a6, double const& a7,
-      double const& b0, double const& b1, double const& b2, double const& b3,
-      double const& c0, double const& c1, double const& c2, double const& c3)
-  {
-#if defined(CUTE_ARCH_MMA_F64_SM90_ENABLED)
-    asm volatile(
-      "mma.sync.aligned.m16n8k16.row.col.f64.f64.f64.f64"
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7,  %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      "{%16, %17, %18, %19};\n"
-      : "=d"(d0), "=d"(d1), "=d"(d2), "=d"(d3)
-      :  "d"(a0),  "d"(a1),  "d"(a2),  "d"(a3),
-         "d"(a4),  "d"(a5),  "d"(a6),  "d"(a7),
-         "d"(b0),  "d"(b1),  "d"(b2),  "d"(b3),
-         "d"(c0),  "d"(c1),  "d"(c2),  "d"(c3));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_16x8x16_F64F64F64F64_TN without CUTE_ARCH_MMA_SM90_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x4 TN
-struct MMA_16x8x4_C64C64C64C64_TN
-{
-  using DRegisters = complex<double>[4];
-  using ARegisters = complex<double>[2];
-  using BRegisters = complex<double>[1];
-  using CRegisters = complex<double>[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(complex<double>      & d0, complex<double>      & d1,
-      complex<double>      & d2, complex<double>      & d3,
-      complex<double> const& a0, complex<double> const& a1,
-      complex<double> const& b0,
-      complex<double> const& c0, complex<double> const& c1,
-      complex<double> const& c2, complex<double> const& c3)
-  {
-    // Because thrust::complex does not provide a mutable ref
-    double& rd0 = reinterpret_cast<double(&)[2]>(d0)[0];
-    double& id0 = reinterpret_cast<double(&)[2]>(d0)[1];
-    double& rd1 = reinterpret_cast<double(&)[2]>(d1)[0];
-    double& id1 = reinterpret_cast<double(&)[2]>(d1)[1];
-    double& rd2 = reinterpret_cast<double(&)[2]>(d2)[0];
-    double& id2 = reinterpret_cast<double(&)[2]>(d2)[1];
-    double& rd3 = reinterpret_cast<double(&)[2]>(d3)[0];
-    double& id3 = reinterpret_cast<double(&)[2]>(d3)[1];
-
-    // d.real() =  a.real() * b.real() + c.real();
-    MMA_16x8x4_F64F64F64F64_TN::fma(
-      rd0, rd1, rd2, rd3,
-      a0.real(), a1.real(),
-      b0.real(),
-      c0.real(), c1.real(), c2.real(), c3.real());
-
-    // d.imag() =  a.imag() * b.real() + c.imag();
-    MMA_16x8x4_F64F64F64F64_TN::fma(
-      id0, id1, id2, id3,
-      a0.imag(), a1.imag(),
-      b0.real(),
-      c0.imag(), c1.imag(), c2.imag(), c3.imag());
-
-    // d.real() = -a.imag() * b.imag() + d.real();
-    MMA_16x8x4_F64F64F64F64_TN::fma(
-      rd0, rd1, rd2, rd3,
-      -a0.imag(), -a1.imag(),
-      b0.imag(),
-      d0.real(), d1.real(), d2.real(), d3.real());
-
-    // d.imag() =  a.real() * b.imag() + d.imag();
-    MMA_16x8x4_F64F64F64F64_TN::fma(
-      id0, id1, id2, id3,
-      a0.real(), a1.real(),
-      b0.imag(),
-      d0.imag(), d1.imag(), d2.imag(), d3.imag());
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x8 TN
-struct MMA_16x8x8_C64C64C64C64_TN
-{
-  using DRegisters = complex<double>[4];
-  using ARegisters = complex<double>[4];
-  using BRegisters = complex<double>[2];
-  using CRegisters = complex<double>[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(complex<double>      & d0, complex<double>      & d1,
-      complex<double>      & d2, complex<double>      & d3,
-      complex<double> const& a0, complex<double> const& a1,
-      complex<double> const& a2, complex<double> const& a3,
-      complex<double> const& b0, complex<double> const& b1,
-      complex<double> const& c0, complex<double> const& c1,
-      complex<double> const& c2, complex<double> const& c3)
-  {
-    // Because thrust::complex does not provide a mutable ref
-    double& rd0 = reinterpret_cast<double(&)[2]>(d0)[0];
-    double& id0 = reinterpret_cast<double(&)[2]>(d0)[1];
-    double& rd1 = reinterpret_cast<double(&)[2]>(d1)[0];
-    double& id1 = reinterpret_cast<double(&)[2]>(d1)[1];
-    double& rd2 = reinterpret_cast<double(&)[2]>(d2)[0];
-    double& id2 = reinterpret_cast<double(&)[2]>(d2)[1];
-    double& rd3 = reinterpret_cast<double(&)[2]>(d3)[0];
-    double& id3 = reinterpret_cast<double(&)[2]>(d3)[1];
-
-    // d.real() =  a.real() * b.real() + c.real();
-    MMA_16x8x8_F64F64F64F64_TN::fma(
-      rd0, rd1, rd2, rd3,
-      a0.real(), a1.real(), a2.real(), a3.real(),
-      b0.real(), b1.real(),
-      c0.real(), c1.real(), c2.real(), c3.real());
-
-    // d.imag() =  a.imag() * b.real() + c.imag();
-    MMA_16x8x8_F64F64F64F64_TN::fma(
-      id0, id1, id2, id3,
-      a0.imag(), a1.imag(), a2.imag(), a3.imag(),
-      b0.real(), b1.real(),
-      c0.imag(), c1.imag(), c2.imag(), c3.imag());
-
-    // d.real() = -a.imag() * b.imag() + d.real();
-    MMA_16x8x8_F64F64F64F64_TN::fma(
-      rd0, rd1, rd2, rd3,
-      -a0.imag(), -a1.imag(), -a2.imag(), -a3.imag(),
-      b0.imag(), b1.imag(),
-      d0.real(), d1.real(), d2.real(), d3.real());
-
-    // d.imag() =  a.real() * b.imag() + d.imag();
-    MMA_16x8x8_F64F64F64F64_TN::fma(
-      id0, id1, id2, id3,
-      a0.real(), a1.real(), a2.real(), a3.real(),
-      b0.imag(), b1.imag(),
-      d0.imag(), d1.imag(), d2.imag(), d3.imag());
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// MMA 16x8x16 TN
-struct MMA_16x8x16_C64C64C64C64_TN
-{
-  using DRegisters = complex<double>[4];
-  using ARegisters = complex<double>[8];
-  using BRegisters = complex<double>[4];
-  using CRegisters = complex<double>[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(complex<double>      & d0, complex<double>      & d1,
-      complex<double>      & d2, complex<double>      & d3,
-      complex<double> const& a0, complex<double> const& a1,
-      complex<double> const& a2, complex<double> const& a3,
-      complex<double> const& a4, complex<double> const& a5,
-      complex<double> const& a6, complex<double> const& a7,
-      complex<double> const& b0, complex<double> const& b1,
-      complex<double> const& b2, complex<double> const& b3,
-      complex<double> const& c0, complex<double> const& c1,
-      complex<double> const& c2, complex<double> const& c3)
-  {
-    // Because thrust::complex does not provide a mutable ref
-    double& rd0 = reinterpret_cast<double(&)[2]>(d0)[0];
-    double& id0 = reinterpret_cast<double(&)[2]>(d0)[1];
-    double& rd1 = reinterpret_cast<double(&)[2]>(d1)[0];
-    double& id1 = reinterpret_cast<double(&)[2]>(d1)[1];
-    double& rd2 = reinterpret_cast<double(&)[2]>(d2)[0];
-    double& id2 = reinterpret_cast<double(&)[2]>(d2)[1];
-    double& rd3 = reinterpret_cast<double(&)[2]>(d3)[0];
-    double& id3 = reinterpret_cast<double(&)[2]>(d3)[1];
-
-    // d.real() =  a.real() * b.real() + c.real();
-    MMA_16x8x16_F64F64F64F64_TN::fma(
-      rd0, rd1, rd2, rd3,
-      a0.real(), a1.real(), a2.real(), a3.real(),
-      a4.real(), a5.real(), a6.real(), a7.real(),
-      b0.real(), b1.real(), b2.real(), b3.real(),
-      c0.real(), c1.real(), c2.real(), c3.real());
-
-    // d.imag() =  a.imag() * b.real() + c.imag();
-    MMA_16x8x16_F64F64F64F64_TN::fma(
-      id0, id1, id2, id3,
-      a0.imag(), a1.imag(), a2.imag(), a3.imag(),
-      a4.imag(), a5.imag(), a6.imag(), a7.imag(),
-      b0.real(), b1.real(), b2.real(), b3.real(),
-      c0.imag(), c1.imag(), c2.imag(), c3.imag());
-
-    // d.real() = -a.imag() * b.imag() + d.real();
-    MMA_16x8x16_F64F64F64F64_TN::fma(
-      rd0, rd1, rd2, rd3,
-      -a0.imag(), -a1.imag(), -a2.imag(), -a3.imag(),
-      -a4.imag(), -a5.imag(), -a6.imag(), -a7.imag(),
-      b0.imag(), b1.imag(), b2.imag(), b3.imag(),
-      d0.real(), d1.real(), d2.real(), d3.real());
-
-    // d.imag() =  a.real() * b.imag() + d.imag();
-    MMA_16x8x16_F64F64F64F64_TN::fma(
-      id0, id1, id2, id3,
-      a0.real(), a1.real(), a2.real(), a3.real(),
-      a4.real(), a5.real(), a6.real(), a7.real(),
-      b0.imag(), b1.imag(), b2.imag(), b3.imag(),
-      d0.imag(), d1.imag(), d2.imag(), d3.imag());
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-}
-
-} // namespace cute
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-#include <cute/arch/mma_sm90_desc.hpp>
-#include <cute/arch/mma_sm90_gmma.hpp>
-#include <cute/arch/mma_sm90_gmma_sparse.hpp>
-#include <cute/layout.hpp>                     // cute::size
-#include <cute/numeric/integral_constant.hpp>  // cute::is_static
-#include <cute/numeric/numeric_types.hpp>      // cute::half_t, cute::float_e4m3_t, cute::tfloat32_t, etc
-#include <cute/util/type_traits.hpp>           // cute::is_same_v
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cute {
-namespace SM90::GMMA {
-
-template <
-  class ElementA,
-  class ElementB,
-  class ElementC,
-  class TileShape_MNK,
-  GMMA::Major MajorA = GMMA::Major::K,
-  GMMA::Major MajorB = GMMA::Major::K,
-  auto... Args                         // e.g. GMMA::ScaleOut::One, [GMMA::ScaleIn::One, GMMA::ScaleIn::One]
-                                       // But most commonly leave empty for defaults
->
-CUTE_HOST_DEVICE constexpr
-auto
-ss_op_selector()
-{
-  static_assert(is_static<TileShape_MNK>::value, "TileShape_MNK must be static.");
-  static_assert(rank(TileShape_MNK{}) == 3, "TileShape_MNK must be rank 3.");
-  static_assert(size<0>(TileShape_MNK{}) % 64 == 0, "Tile_M must be a multiple of 64.");
-  auto Tile_N = size<1>(TileShape_MNK{});
-
-  // F16 accumulator
-  if constexpr (is_same_v<ElementC, half_t>) {
-
-    // Input A: half_t ; Input B: half_t
-    if constexpr (is_same_v<ElementA, half_t> && is_same_v<ElementB, half_t>) {
-      static_assert(size<2>(TileShape_MNK{}) % 16 == 0, "Tile_K must be a multiple of 16.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::MMA_64x256x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::MMA_64x248x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::MMA_64x240x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::MMA_64x232x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::MMA_64x224x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::MMA_64x216x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::MMA_64x208x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::MMA_64x200x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::MMA_64x192x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::MMA_64x184x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::MMA_64x176x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::MMA_64x168x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::MMA_64x160x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::MMA_64x152x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::MMA_64x144x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::MMA_64x136x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::MMA_64x128x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::MMA_64x120x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::MMA_64x112x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::MMA_64x104x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::MMA_64x96x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::MMA_64x88x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::MMA_64x80x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::MMA_64x72x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::MMA_64x64x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::MMA_64x56x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::MMA_64x48x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::MMA_64x40x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::MMA_64x32x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::MMA_64x24x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::MMA_64x16x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::MMA_64x8x16_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: float_e4m3_t ; Input B: float_e4m3_t
-    else if constexpr (is_same_v<ElementA, float_e4m3_t> && is_same_v<ElementB, float_e4m3_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::MMA_64x256x32_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::MMA_64x248x32_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::MMA_64x240x32_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::MMA_64x232x32_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::MMA_64x224x32_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::MMA_64x216x32_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::MMA_64x208x32_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::MMA_64x200x32_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::MMA_64x192x32_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::MMA_64x184x32_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::MMA_64x176x32_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::MMA_64x168x32_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::MMA_64x160x32_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::MMA_64x152x32_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::MMA_64x144x32_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::MMA_64x136x32_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::MMA_64x128x32_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::MMA_64x120x32_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::MMA_64x112x32_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::MMA_64x104x32_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::MMA_64x96x32_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::MMA_64x88x32_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::MMA_64x80x32_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::MMA_64x72x32_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::MMA_64x64x32_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::MMA_64x56x32_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::MMA_64x48x32_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::MMA_64x40x32_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::MMA_64x32x32_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::MMA_64x24x32_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::MMA_64x16x32_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::MMA_64x8x32_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: float_e4m3_t ; Input B: float_e5m2_t
-    else if constexpr (is_same_v<ElementA, float_e4m3_t> && is_same_v<ElementB, float_e5m2_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::MMA_64x256x32_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::MMA_64x248x32_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::MMA_64x240x32_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::MMA_64x232x32_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::MMA_64x224x32_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::MMA_64x216x32_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::MMA_64x208x32_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::MMA_64x200x32_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::MMA_64x192x32_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::MMA_64x184x32_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::MMA_64x176x32_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::MMA_64x168x32_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::MMA_64x160x32_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::MMA_64x152x32_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::MMA_64x144x32_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::MMA_64x136x32_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::MMA_64x128x32_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::MMA_64x120x32_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::MMA_64x112x32_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::MMA_64x104x32_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::MMA_64x96x32_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::MMA_64x88x32_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::MMA_64x80x32_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::MMA_64x72x32_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::MMA_64x64x32_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::MMA_64x56x32_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::MMA_64x48x32_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::MMA_64x40x32_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::MMA_64x32x32_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::MMA_64x24x32_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::MMA_64x16x32_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::MMA_64x8x32_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: float_e5m2_t ; Input B: float_e4m3_t
-    else if constexpr (is_same_v<ElementA, float_e5m2_t> && is_same_v<ElementB, float_e4m3_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::MMA_64x256x32_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::MMA_64x248x32_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::MMA_64x240x32_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::MMA_64x232x32_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::MMA_64x224x32_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::MMA_64x216x32_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::MMA_64x208x32_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::MMA_64x200x32_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::MMA_64x192x32_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::MMA_64x184x32_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::MMA_64x176x32_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::MMA_64x168x32_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::MMA_64x160x32_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::MMA_64x152x32_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::MMA_64x144x32_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::MMA_64x136x32_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::MMA_64x128x32_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::MMA_64x120x32_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::MMA_64x112x32_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::MMA_64x104x32_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::MMA_64x96x32_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::MMA_64x88x32_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::MMA_64x80x32_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::MMA_64x72x32_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::MMA_64x64x32_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::MMA_64x56x32_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::MMA_64x48x32_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::MMA_64x40x32_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::MMA_64x32x32_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::MMA_64x24x32_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::MMA_64x16x32_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::MMA_64x8x32_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: float_e5m2_t ; Input B: float_e5m2_t
-    else if constexpr (is_same_v<ElementA, float_e5m2_t> && is_same_v<ElementB, float_e5m2_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::MMA_64x256x32_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::MMA_64x248x32_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::MMA_64x240x32_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::MMA_64x232x32_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::MMA_64x224x32_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::MMA_64x216x32_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::MMA_64x208x32_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::MMA_64x200x32_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::MMA_64x192x32_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::MMA_64x184x32_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::MMA_64x176x32_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::MMA_64x168x32_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::MMA_64x160x32_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::MMA_64x152x32_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::MMA_64x144x32_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::MMA_64x136x32_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::MMA_64x128x32_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::MMA_64x120x32_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::MMA_64x112x32_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::MMA_64x104x32_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::MMA_64x96x32_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::MMA_64x88x32_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::MMA_64x80x32_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::MMA_64x72x32_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::MMA_64x64x32_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::MMA_64x56x32_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::MMA_64x48x32_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::MMA_64x40x32_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::MMA_64x32x32_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::MMA_64x24x32_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::MMA_64x16x32_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::MMA_64x8x32_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    else {
-      static_assert(sizeof(ElementA) == 0, "No eligible GMMA operator for request configuration.");
-    }
-  }
-
-  // F32 accumulator
-  else if constexpr (is_same_v<ElementC, float>) {
-
-    // Input A: half_t ; Input B: half_t
-    if constexpr (is_same_v<ElementA, half_t> && is_same_v<ElementB, half_t>) {
-      static_assert(size<2>(TileShape_MNK{}) % 16 == 0, "Tile_K must be a multiple of 16.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::MMA_64x256x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::MMA_64x248x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::MMA_64x240x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::MMA_64x232x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::MMA_64x224x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::MMA_64x216x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::MMA_64x208x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::MMA_64x200x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::MMA_64x192x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::MMA_64x184x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::MMA_64x176x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::MMA_64x168x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::MMA_64x160x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::MMA_64x152x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::MMA_64x144x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::MMA_64x136x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::MMA_64x128x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::MMA_64x120x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::MMA_64x112x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::MMA_64x104x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::MMA_64x96x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::MMA_64x88x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::MMA_64x80x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::MMA_64x72x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::MMA_64x64x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::MMA_64x56x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::MMA_64x48x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::MMA_64x40x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::MMA_64x32x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::MMA_64x24x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::MMA_64x16x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::MMA_64x8x16_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: bfloat16_t ; Input B: bfloat16_t
-    else if constexpr (is_same_v<ElementA, bfloat16_t> && is_same_v<ElementB, bfloat16_t>) {
-      static_assert(size<2>(TileShape_MNK{}) % 16 == 0, "Tile_K must be a multiple of 16.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::MMA_64x256x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::MMA_64x248x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::MMA_64x240x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::MMA_64x232x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::MMA_64x224x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::MMA_64x216x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::MMA_64x208x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::MMA_64x200x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::MMA_64x192x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::MMA_64x184x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::MMA_64x176x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::MMA_64x168x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::MMA_64x160x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::MMA_64x152x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::MMA_64x144x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::MMA_64x136x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::MMA_64x128x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::MMA_64x120x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::MMA_64x112x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::MMA_64x104x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::MMA_64x96x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::MMA_64x88x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::MMA_64x80x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::MMA_64x72x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::MMA_64x64x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::MMA_64x56x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::MMA_64x48x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::MMA_64x40x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::MMA_64x32x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::MMA_64x24x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::MMA_64x16x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::MMA_64x8x16_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: tfloat32_t ; Input B: tfloat32_t
-    else if constexpr (is_same_v<ElementA, tfloat32_t> && is_same_v<ElementB, tfloat32_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 8 == 0, "Tile_K must be a multiple of 8.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::MMA_64x256x8_F32TF32TF32_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::MMA_64x248x8_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::MMA_64x240x8_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::MMA_64x232x8_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::MMA_64x224x8_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::MMA_64x216x8_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::MMA_64x208x8_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::MMA_64x200x8_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::MMA_64x192x8_F32TF32TF32_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::MMA_64x184x8_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::MMA_64x176x8_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::MMA_64x168x8_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::MMA_64x160x8_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::MMA_64x152x8_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::MMA_64x144x8_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::MMA_64x136x8_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::MMA_64x128x8_F32TF32TF32_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::MMA_64x120x8_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::MMA_64x112x8_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::MMA_64x104x8_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::MMA_64x96x8_F32TF32TF32_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::MMA_64x88x8_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::MMA_64x80x8_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::MMA_64x72x8_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::MMA_64x64x8_F32TF32TF32_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::MMA_64x56x8_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::MMA_64x48x8_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::MMA_64x40x8_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::MMA_64x32x8_F32TF32TF32_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::MMA_64x24x8_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::MMA_64x16x8_F32TF32TF32_SS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::MMA_64x8x8_F32TF32TF32_SS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: float_e4m3_t ; Input B: float_e4m3_t
-    else if constexpr (is_same_v<ElementA, float_e4m3_t> && is_same_v<ElementB, float_e4m3_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::MMA_64x256x32_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::MMA_64x248x32_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::MMA_64x240x32_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::MMA_64x232x32_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::MMA_64x224x32_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::MMA_64x216x32_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::MMA_64x208x32_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::MMA_64x200x32_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::MMA_64x192x32_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::MMA_64x184x32_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::MMA_64x176x32_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::MMA_64x168x32_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::MMA_64x160x32_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::MMA_64x152x32_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::MMA_64x144x32_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::MMA_64x136x32_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::MMA_64x128x32_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::MMA_64x120x32_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::MMA_64x112x32_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::MMA_64x104x32_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::MMA_64x96x32_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::MMA_64x88x32_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::MMA_64x80x32_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::MMA_64x72x32_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::MMA_64x64x32_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::MMA_64x56x32_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::MMA_64x48x32_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::MMA_64x40x32_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::MMA_64x32x32_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::MMA_64x24x32_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::MMA_64x16x32_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::MMA_64x8x32_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: float_e4m3_t ; Input B: float_e5m2_t
-    else if constexpr (is_same_v<ElementA, float_e4m3_t> && is_same_v<ElementB, float_e5m2_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::MMA_64x256x32_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::MMA_64x248x32_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::MMA_64x240x32_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::MMA_64x232x32_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::MMA_64x224x32_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::MMA_64x216x32_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::MMA_64x208x32_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::MMA_64x200x32_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::MMA_64x192x32_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::MMA_64x184x32_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::MMA_64x176x32_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::MMA_64x168x32_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::MMA_64x160x32_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::MMA_64x152x32_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::MMA_64x144x32_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::MMA_64x136x32_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::MMA_64x128x32_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::MMA_64x120x32_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::MMA_64x112x32_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::MMA_64x104x32_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::MMA_64x96x32_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::MMA_64x88x32_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::MMA_64x80x32_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::MMA_64x72x32_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::MMA_64x64x32_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::MMA_64x56x32_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::MMA_64x48x32_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::MMA_64x40x32_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::MMA_64x32x32_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::MMA_64x24x32_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::MMA_64x16x32_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::MMA_64x8x32_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: float_e5m2_t ; Input B: float_e4m3_t
-    else if constexpr (is_same_v<ElementA, float_e5m2_t> && is_same_v<ElementB, float_e4m3_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::MMA_64x256x32_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::MMA_64x248x32_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::MMA_64x240x32_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::MMA_64x232x32_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::MMA_64x224x32_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::MMA_64x216x32_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::MMA_64x208x32_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::MMA_64x200x32_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::MMA_64x192x32_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::MMA_64x184x32_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::MMA_64x176x32_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::MMA_64x168x32_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::MMA_64x160x32_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::MMA_64x152x32_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::MMA_64x144x32_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::MMA_64x136x32_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::MMA_64x128x32_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::MMA_64x120x32_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::MMA_64x112x32_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::MMA_64x104x32_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::MMA_64x96x32_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::MMA_64x88x32_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::MMA_64x80x32_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::MMA_64x72x32_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::MMA_64x64x32_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::MMA_64x56x32_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::MMA_64x48x32_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::MMA_64x40x32_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::MMA_64x32x32_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::MMA_64x24x32_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::MMA_64x16x32_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::MMA_64x8x32_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: float_e5m2_t ; Input B: float_e5m2_t
-    else if constexpr (is_same_v<ElementA, float_e5m2_t> && is_same_v<ElementB, float_e5m2_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::MMA_64x256x32_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::MMA_64x248x32_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::MMA_64x240x32_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::MMA_64x232x32_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::MMA_64x224x32_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::MMA_64x216x32_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::MMA_64x208x32_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::MMA_64x200x32_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::MMA_64x192x32_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::MMA_64x184x32_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::MMA_64x176x32_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::MMA_64x168x32_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::MMA_64x160x32_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::MMA_64x152x32_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::MMA_64x144x32_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::MMA_64x136x32_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::MMA_64x128x32_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::MMA_64x120x32_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::MMA_64x112x32_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::MMA_64x104x32_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::MMA_64x96x32_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::MMA_64x88x32_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::MMA_64x80x32_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::MMA_64x72x32_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::MMA_64x64x32_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::MMA_64x56x32_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::MMA_64x48x32_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::MMA_64x40x32_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::MMA_64x32x32_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::MMA_64x24x32_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::MMA_64x16x32_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::MMA_64x8x32_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    else {
-      static_assert(sizeof(ElementA) == 0, "No eligible GMMA operator for request configuration.");
-    }
-  }
-
-  // S32 accumulator
-  else if constexpr (is_same_v<ElementC, int32_t>) {
-
-    // Input A: int8_t ; Input B: int8_t
-    if constexpr (is_same_v<ElementA, int8_t> && is_same_v<ElementB, int8_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::MMA_64x256x32_S32S8S8_SS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::MMA_64x240x32_S32S8S8_SS_TN{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::MMA_64x224x32_S32S8S8_SS_TN{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::MMA_64x208x32_S32S8S8_SS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::MMA_64x192x32_S32S8S8_SS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::MMA_64x176x32_S32S8S8_SS_TN{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::MMA_64x160x32_S32S8S8_SS_TN{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::MMA_64x144x32_S32S8S8_SS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::MMA_64x128x32_S32S8S8_SS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::MMA_64x112x32_S32S8S8_SS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::MMA_64x96x32_S32S8S8_SS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::MMA_64x80x32_S32S8S8_SS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::MMA_64x64x32_S32S8S8_SS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::MMA_64x48x32_S32S8S8_SS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::MMA_64x32x32_S32S8S8_SS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::MMA_64x24x32_S32S8S8_SS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::MMA_64x16x32_S32S8S8_SS_TN{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::MMA_64x8x32_S32S8S8_SS_TN{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: int8_t ; Input B: uint8_t
-    else if constexpr (is_same_v<ElementA, int8_t> && is_same_v<ElementB, uint8_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::MMA_64x256x32_S32S8U8_SS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::MMA_64x240x32_S32S8U8_SS_TN{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::MMA_64x224x32_S32S8U8_SS_TN{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::MMA_64x208x32_S32S8U8_SS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::MMA_64x192x32_S32S8U8_SS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::MMA_64x176x32_S32S8U8_SS_TN{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::MMA_64x160x32_S32S8U8_SS_TN{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::MMA_64x144x32_S32S8U8_SS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::MMA_64x128x32_S32S8U8_SS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::MMA_64x112x32_S32S8U8_SS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::MMA_64x96x32_S32S8U8_SS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::MMA_64x80x32_S32S8U8_SS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::MMA_64x64x32_S32S8U8_SS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::MMA_64x48x32_S32S8U8_SS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::MMA_64x32x32_S32S8U8_SS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::MMA_64x24x32_S32S8U8_SS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::MMA_64x16x32_S32S8U8_SS_TN{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::MMA_64x8x32_S32S8U8_SS_TN{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: uint8_t ; Input B: int8_t
-    else if constexpr (is_same_v<ElementA, uint8_t> && is_same_v<ElementB, int8_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::MMA_64x256x32_S32U8S8_SS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::MMA_64x240x32_S32U8S8_SS_TN{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::MMA_64x224x32_S32U8S8_SS_TN{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::MMA_64x208x32_S32U8S8_SS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::MMA_64x192x32_S32U8S8_SS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::MMA_64x176x32_S32U8S8_SS_TN{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::MMA_64x160x32_S32U8S8_SS_TN{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::MMA_64x144x32_S32U8S8_SS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::MMA_64x128x32_S32U8S8_SS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::MMA_64x112x32_S32U8S8_SS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::MMA_64x96x32_S32U8S8_SS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::MMA_64x80x32_S32U8S8_SS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::MMA_64x64x32_S32U8S8_SS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::MMA_64x48x32_S32U8S8_SS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::MMA_64x32x32_S32U8S8_SS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::MMA_64x24x32_S32U8S8_SS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::MMA_64x16x32_S32U8S8_SS_TN{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::MMA_64x8x32_S32U8S8_SS_TN{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: uint8_t ; Input B: uint8_t
-    else if constexpr (is_same_v<ElementA, uint8_t> && is_same_v<ElementB, uint8_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::MMA_64x256x32_S32U8U8_SS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::MMA_64x240x32_S32U8U8_SS_TN{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::MMA_64x224x32_S32U8U8_SS_TN{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::MMA_64x208x32_S32U8U8_SS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::MMA_64x192x32_S32U8U8_SS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::MMA_64x176x32_S32U8U8_SS_TN{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::MMA_64x160x32_S32U8U8_SS_TN{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::MMA_64x144x32_S32U8U8_SS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::MMA_64x128x32_S32U8U8_SS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::MMA_64x112x32_S32U8U8_SS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::MMA_64x96x32_S32U8U8_SS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::MMA_64x80x32_S32U8U8_SS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::MMA_64x64x32_S32U8U8_SS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::MMA_64x48x32_S32U8U8_SS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::MMA_64x32x32_S32U8U8_SS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::MMA_64x24x32_S32U8U8_SS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::MMA_64x16x32_S32U8U8_SS_TN{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::MMA_64x8x32_S32U8U8_SS_TN{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    else {
-      static_assert(sizeof(ElementA) == 0, "No eligible GMMA operator for request configuration.");
-    }
-  }
-
-  // Unknown accumulator type
-  else {
-    static_assert(sizeof(ElementC) == 0, "Unknown ElementC accumulator type.");
-  }
-}
-
-template <
-  class ElementA,
-  class ElementB,
-  class ElementC,
-  class TileShape_MNK,
-  GMMA::Major MajorA = GMMA::Major::K,
-  GMMA::Major MajorB = GMMA::Major::K,
-  auto... Args                         // e.g. GMMA::ScaleOut::One, [GMMA::ScaleIn::One, GMMA::ScaleIn::One]
-                                       // But most commonly leave empty for defaults
->
-CUTE_HOST_DEVICE constexpr
-auto
-ss_op_selector_sparse()
-{
-  static_assert(is_static<TileShape_MNK>::value, "TileShape_MNK must be static.");
-  static_assert(rank(TileShape_MNK{}) == 3, "TileShape_MNK must be rank 3.");
-  static_assert(size<0>(TileShape_MNK{}) % 64 == 0, "Tile_M must be a multiple of 64.");
-  auto Tile_N = size<1>(TileShape_MNK{});
-
-  // F16 accumulator
-  if constexpr (is_same_v<ElementC, half_t>) {
-
-    // Input A: half_t ; Input B: half_t
-    if constexpr (is_same_v<ElementA, half_t> && is_same_v<ElementB, half_t>) {
-      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x256x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x248x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x240x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x232x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x224x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x216x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x208x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x200x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x192x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x184x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x176x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x168x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x160x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x152x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x144x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x136x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x128x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x120x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x112x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x104x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x96x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x88x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x80x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x72x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x64x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x56x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x48x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x40x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x32x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x24x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x16x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x8x32_F16F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: float_e4m3_t ; Input B: float_e4m3_t
-    else if constexpr (is_same_v<ElementA, float_e4m3_t> && is_same_v<ElementB, float_e4m3_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x256x64_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x248x64_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x240x64_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x232x64_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x224x64_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x216x64_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x208x64_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x200x64_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x192x64_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x184x64_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x176x64_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x168x64_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x160x64_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x152x64_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x144x64_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x136x64_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x128x64_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x120x64_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x112x64_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x104x64_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x96x64_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x88x64_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x80x64_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x72x64_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x64x64_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x56x64_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x48x64_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x40x64_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x32x64_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x24x64_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x16x64_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x8x64_F16E4M3E4M3_SS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: float_e4m3_t ; Input B: float_e5m2_t
-    else if constexpr (is_same_v<ElementA, float_e4m3_t> && is_same_v<ElementB, float_e5m2_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x256x64_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x248x64_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x240x64_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x232x64_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x224x64_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x216x64_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x208x64_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x200x64_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x192x64_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x184x64_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x176x64_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x168x64_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x160x64_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x152x64_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x144x64_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x136x64_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x128x64_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x120x64_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x112x64_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x104x64_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x96x64_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x88x64_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x80x64_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x72x64_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x64x64_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x56x64_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x48x64_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x40x64_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x32x64_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x24x64_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x16x64_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x8x64_F16E4M3E5M2_SS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: float_e5m2_t ; Input B: float_e4m3_t
-    else if constexpr (is_same_v<ElementA, float_e5m2_t> && is_same_v<ElementB, float_e4m3_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x256x64_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x248x64_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x240x64_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x232x64_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x224x64_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x216x64_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x208x64_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x200x64_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x192x64_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x184x64_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x176x64_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x168x64_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x160x64_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x152x64_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x144x64_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x136x64_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x128x64_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x120x64_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x112x64_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x104x64_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x96x64_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x88x64_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x80x64_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x72x64_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x64x64_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x56x64_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x48x64_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x40x64_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x32x64_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x24x64_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x16x64_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x8x64_F16E5M2E4M3_SS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: float_e5m2_t ; Input B: float_e5m2_t
-    else if constexpr (is_same_v<ElementA, float_e5m2_t> && is_same_v<ElementB, float_e5m2_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x256x64_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x248x64_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x240x64_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x232x64_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x224x64_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x216x64_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x208x64_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x200x64_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x192x64_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x184x64_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x176x64_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x168x64_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x160x64_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x152x64_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x144x64_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x136x64_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x128x64_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x120x64_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x112x64_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x104x64_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x96x64_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x88x64_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x80x64_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x72x64_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x64x64_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x56x64_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x48x64_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x40x64_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x32x64_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x24x64_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x16x64_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x8x64_F16E5M2E5M2_SS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    else {
-      static_assert(sizeof(ElementA) == 0, "No eligible GMMA operator for request configuration.");
-    }
-  }
-
-  // F32 accumulator
-  else if constexpr (is_same_v<ElementC, float>) {
-
-    // Input A: half_t ; Input B: half_t
-    if constexpr (is_same_v<ElementA, half_t> && is_same_v<ElementB, half_t>) {
-      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x256x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x248x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x240x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x232x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x224x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x216x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x208x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x200x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x192x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x184x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x176x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x168x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x160x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x152x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x144x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x136x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x128x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x120x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x112x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x104x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x96x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x88x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x80x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x72x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x64x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x56x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x48x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x40x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x32x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x24x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x16x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x8x32_F32F16F16_SS<MajorA, MajorB, Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: bfloat16_t ; Input B: bfloat16_t
-    else if constexpr (is_same_v<ElementA, bfloat16_t> && is_same_v<ElementB, bfloat16_t>) {
-      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x256x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x248x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x240x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x232x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x224x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x216x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x208x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x200x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x192x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x184x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x176x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x168x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x160x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x152x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x144x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x136x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x128x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x120x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x112x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x104x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x96x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x88x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x80x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x72x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x64x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x56x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x48x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x40x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x32x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x24x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x16x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x8x32_F32BF16BF16_SS<MajorA, MajorB, Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: tfloat32_t ; Input B: tfloat32_t
-    else if constexpr (is_same_v<ElementA, tfloat32_t> && is_same_v<ElementB, tfloat32_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 16 == 0, "Tile_K must be a multiple of 16.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x256x16_F32TF32TF32_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x248x16_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x240x16_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x232x16_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x224x16_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x216x16_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x208x16_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x200x16_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x192x16_F32TF32TF32_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x184x16_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x176x16_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x168x16_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x160x16_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x152x16_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x144x16_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x136x16_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x128x16_F32TF32TF32_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x120x16_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x112x16_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x104x16_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x96x16_F32TF32TF32_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x88x16_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x80x16_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x72x16_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x64x16_F32TF32TF32_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x56x16_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x48x16_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x40x16_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x32x16_F32TF32TF32_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x24x16_F32TF32TF32_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x16x16_F32TF32TF32_SS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x8x16_F32TF32TF32_SS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: float_e4m3_t ; Input B: float_e4m3_t
-    else if constexpr (is_same_v<ElementA, float_e4m3_t> && is_same_v<ElementB, float_e4m3_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x256x64_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x248x64_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x240x64_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x232x64_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x224x64_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x216x64_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x208x64_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x200x64_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x192x64_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x184x64_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x176x64_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x168x64_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x160x64_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x152x64_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x144x64_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x136x64_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x128x64_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x120x64_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x112x64_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x104x64_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x96x64_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x88x64_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x80x64_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x72x64_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x64x64_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x56x64_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x48x64_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x40x64_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x32x64_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x24x64_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x16x64_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x8x64_F32E4M3E4M3_SS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: float_e4m3_t ; Input B: float_e5m2_t
-    else if constexpr (is_same_v<ElementA, float_e4m3_t> && is_same_v<ElementB, float_e5m2_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x256x64_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x248x64_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x240x64_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x232x64_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x224x64_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x216x64_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x208x64_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x200x64_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x192x64_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x184x64_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x176x64_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x168x64_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x160x64_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x152x64_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x144x64_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x136x64_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x128x64_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x120x64_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x112x64_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x104x64_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x96x64_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x88x64_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x80x64_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x72x64_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x64x64_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x56x64_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x48x64_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x40x64_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x32x64_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x24x64_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x16x64_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x8x64_F32E4M3E5M2_SS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: float_e5m2_t ; Input B: float_e4m3_t
-    else if constexpr (is_same_v<ElementA, float_e5m2_t> && is_same_v<ElementB, float_e4m3_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x256x64_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x248x64_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x240x64_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x232x64_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x224x64_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x216x64_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x208x64_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x200x64_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x192x64_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x184x64_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x176x64_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x168x64_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x160x64_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x152x64_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x144x64_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x136x64_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x128x64_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x120x64_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x112x64_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x104x64_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x96x64_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x88x64_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x80x64_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x72x64_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x64x64_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x56x64_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x48x64_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x40x64_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x32x64_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x24x64_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x16x64_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x8x64_F32E5M2E4M3_SS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: float_e5m2_t ; Input B: float_e5m2_t
-    else if constexpr (is_same_v<ElementA, float_e5m2_t> && is_same_v<ElementB, float_e5m2_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x256x64_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x248x64_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x240x64_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x232x64_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x224x64_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x216x64_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x208x64_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x200x64_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x192x64_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x184x64_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x176x64_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x168x64_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x160x64_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x152x64_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x144x64_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x136x64_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x128x64_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x120x64_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x112x64_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x104x64_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x96x64_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x88x64_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x80x64_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x72x64_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x64x64_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x56x64_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x48x64_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x40x64_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x32x64_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x24x64_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x16x64_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x8x64_F32E5M2E5M2_SS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    else {
-      static_assert(sizeof(ElementA) == 0, "No eligible GMMA operator for request configuration.");
-    }
-  }
-
-  // S32 accumulator
-  else if constexpr (is_same_v<ElementC, int32_t>) {
-
-    // Input A: int8_t ; Input B: int8_t
-    if constexpr (is_same_v<ElementA, int8_t> && is_same_v<ElementB, int8_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8S8_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8S8_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8S8_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8S8_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8S8_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8S8_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8S8_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8S8_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8S8_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8S8_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8S8_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8S8_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8S8_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8S8_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8S8_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8S8_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8S8_SS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8S8_SS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: int8_t ; Input B: uint8_t
-    else if constexpr (is_same_v<ElementA, int8_t> && is_same_v<ElementB, uint8_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8U8_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8U8_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8U8_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8U8_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8U8_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8U8_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8U8_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8U8_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8U8_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8U8_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8U8_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8U8_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8U8_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8U8_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8U8_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8U8_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8U8_SS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8U8_SS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: uint8_t ; Input B: int8_t
-    else if constexpr (is_same_v<ElementA, uint8_t> && is_same_v<ElementB, int8_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8S8_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8S8_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8S8_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8S8_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8S8_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8S8_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8S8_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8S8_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8S8_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8S8_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8S8_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8S8_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8S8_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8S8_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8S8_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8S8_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8S8_SS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8S8_SS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: uint8_t ; Input B: uint8_t
-    else if constexpr (is_same_v<ElementA, uint8_t> && is_same_v<ElementB, uint8_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8U8_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8U8_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8U8_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8U8_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8U8_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8U8_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8U8_SS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8U8_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8U8_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8U8_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8U8_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8U8_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8U8_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8U8_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8U8_SS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8U8_SS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8U8_SS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8U8_SS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    else {
-      static_assert(sizeof(ElementA) == 0, "No eligible GMMA operator for request configuration.");
-    }
-  }
-
-  // Unknown accumulator type
-  else {
-    static_assert(sizeof(ElementC) == 0, "Unknown ElementC accumulator type.");
-  }
-}
-
-template <
-  class ElementA,
-  class ElementB,
-  class ElementC,
-  class TileShape_MNK,
-  GMMA::Major MajorA = GMMA::Major::K,
-  GMMA::Major MajorB = GMMA::Major::K,
-  auto... Args                         // e.g. GMMA::ScaleOut::One, [GMMA::ScaleIn::One, GMMA::ScaleIn::One]
-                                       // But most commonly leave empty for defaults
->
-CUTE_HOST_DEVICE constexpr
-auto
-rs_op_selector()
-{
-  static_assert(is_static<TileShape_MNK>::value, "TileShape_MNK must be static.");
-  static_assert(rank(TileShape_MNK{}) == 3, "TileShape_MNK must be rank 3.");
-  static_assert(size<0>(TileShape_MNK{}) % 64 == 0, "Tile_M must be a multiple of 64.");
-  static_assert(MajorA == GMMA::Major::K, "Register source A operand GMMAs must have K-major A layout.");
-  auto Tile_N = size<1>(TileShape_MNK{});
-
-  // F16 accumulator
-  if constexpr (is_same_v<ElementC, half_t>) {
-
-    // Input A: half_t ; Input B: half_t
-    if constexpr (is_same_v<ElementA, half_t> && is_same_v<ElementB, half_t>) {
-      static_assert(size<2>(TileShape_MNK{}) % 16 == 0, "Tile_K must be a multiple of 16.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::MMA_64x256x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::MMA_64x248x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::MMA_64x240x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::MMA_64x232x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::MMA_64x224x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::MMA_64x216x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::MMA_64x208x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::MMA_64x200x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::MMA_64x192x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::MMA_64x184x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::MMA_64x176x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::MMA_64x168x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::MMA_64x160x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::MMA_64x152x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::MMA_64x144x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::MMA_64x136x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::MMA_64x128x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::MMA_64x120x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::MMA_64x112x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::MMA_64x104x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::MMA_64x96x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::MMA_64x88x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::MMA_64x80x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::MMA_64x72x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::MMA_64x64x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::MMA_64x56x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::MMA_64x48x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::MMA_64x40x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::MMA_64x32x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::MMA_64x24x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::MMA_64x16x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::MMA_64x8x16_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: float_e4m3_t ; Input B: float_e4m3_t
-    else if constexpr (is_same_v<ElementA, float_e4m3_t> && is_same_v<ElementB, float_e4m3_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::MMA_64x256x32_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::MMA_64x248x32_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::MMA_64x240x32_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::MMA_64x232x32_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::MMA_64x224x32_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::MMA_64x216x32_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::MMA_64x208x32_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::MMA_64x200x32_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::MMA_64x192x32_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::MMA_64x184x32_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::MMA_64x176x32_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::MMA_64x168x32_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::MMA_64x160x32_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::MMA_64x152x32_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::MMA_64x144x32_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::MMA_64x136x32_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::MMA_64x128x32_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::MMA_64x120x32_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::MMA_64x112x32_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::MMA_64x104x32_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::MMA_64x96x32_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::MMA_64x88x32_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::MMA_64x80x32_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::MMA_64x72x32_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::MMA_64x64x32_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::MMA_64x56x32_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::MMA_64x48x32_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::MMA_64x40x32_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::MMA_64x32x32_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::MMA_64x24x32_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::MMA_64x16x32_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::MMA_64x8x32_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: float_e4m3_t ; Input B: float_e5m2_t
-    else if constexpr (is_same_v<ElementA, float_e4m3_t> && is_same_v<ElementB, float_e5m2_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::MMA_64x256x32_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::MMA_64x248x32_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::MMA_64x240x32_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::MMA_64x232x32_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::MMA_64x224x32_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::MMA_64x216x32_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::MMA_64x208x32_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::MMA_64x200x32_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::MMA_64x192x32_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::MMA_64x184x32_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::MMA_64x176x32_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::MMA_64x168x32_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::MMA_64x160x32_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::MMA_64x152x32_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::MMA_64x144x32_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::MMA_64x136x32_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::MMA_64x128x32_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::MMA_64x120x32_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::MMA_64x112x32_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::MMA_64x104x32_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::MMA_64x96x32_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::MMA_64x88x32_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::MMA_64x80x32_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::MMA_64x72x32_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::MMA_64x64x32_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::MMA_64x56x32_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::MMA_64x48x32_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::MMA_64x40x32_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::MMA_64x32x32_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::MMA_64x24x32_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::MMA_64x16x32_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::MMA_64x8x32_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: float_e5m2_t ; Input B: float_e4m3_t
-    else if constexpr (is_same_v<ElementA, float_e5m2_t> && is_same_v<ElementB, float_e4m3_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::MMA_64x256x32_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::MMA_64x248x32_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::MMA_64x240x32_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::MMA_64x232x32_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::MMA_64x224x32_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::MMA_64x216x32_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::MMA_64x208x32_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::MMA_64x200x32_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::MMA_64x192x32_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::MMA_64x184x32_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::MMA_64x176x32_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::MMA_64x168x32_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::MMA_64x160x32_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::MMA_64x152x32_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::MMA_64x144x32_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::MMA_64x136x32_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::MMA_64x128x32_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::MMA_64x120x32_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::MMA_64x112x32_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::MMA_64x104x32_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::MMA_64x96x32_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::MMA_64x88x32_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::MMA_64x80x32_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::MMA_64x72x32_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::MMA_64x64x32_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::MMA_64x56x32_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::MMA_64x48x32_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::MMA_64x40x32_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::MMA_64x32x32_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::MMA_64x24x32_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::MMA_64x16x32_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::MMA_64x8x32_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: float_e5m2_t ; Input B: float_e5m2_t
-    else if constexpr (is_same_v<ElementA, float_e5m2_t> && is_same_v<ElementB, float_e5m2_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::MMA_64x256x32_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::MMA_64x248x32_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::MMA_64x240x32_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::MMA_64x232x32_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::MMA_64x224x32_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::MMA_64x216x32_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::MMA_64x208x32_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::MMA_64x200x32_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::MMA_64x192x32_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::MMA_64x184x32_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::MMA_64x176x32_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::MMA_64x168x32_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::MMA_64x160x32_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::MMA_64x152x32_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::MMA_64x144x32_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::MMA_64x136x32_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::MMA_64x128x32_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::MMA_64x120x32_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::MMA_64x112x32_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::MMA_64x104x32_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::MMA_64x96x32_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::MMA_64x88x32_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::MMA_64x80x32_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::MMA_64x72x32_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::MMA_64x64x32_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::MMA_64x56x32_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::MMA_64x48x32_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::MMA_64x40x32_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::MMA_64x32x32_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::MMA_64x24x32_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::MMA_64x16x32_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::MMA_64x8x32_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    else {
-      static_assert(sizeof(ElementA) == 0, "No eligible GMMA operator for request configuration.");
-    }
-  }
-
-  // F32 accumulator
-  else if constexpr (is_same_v<ElementC, float>) {
-
-    // Input A: half_t ; Input B: half_t
-    if constexpr (is_same_v<ElementA, half_t> && is_same_v<ElementB, half_t>) {
-      static_assert(size<2>(TileShape_MNK{}) % 16 == 0, "Tile_K must be a multiple of 16.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::MMA_64x256x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::MMA_64x248x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::MMA_64x240x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::MMA_64x232x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::MMA_64x224x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::MMA_64x216x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::MMA_64x208x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::MMA_64x200x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::MMA_64x192x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::MMA_64x184x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::MMA_64x176x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::MMA_64x168x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::MMA_64x160x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::MMA_64x152x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::MMA_64x144x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::MMA_64x136x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::MMA_64x128x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::MMA_64x120x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::MMA_64x112x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::MMA_64x104x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::MMA_64x96x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::MMA_64x88x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::MMA_64x80x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::MMA_64x72x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::MMA_64x64x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::MMA_64x56x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::MMA_64x48x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::MMA_64x40x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::MMA_64x32x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::MMA_64x24x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::MMA_64x16x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::MMA_64x8x16_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: bfloat16_t ; Input B: bfloat16_t
-    else if constexpr (is_same_v<ElementA, bfloat16_t> && is_same_v<ElementB, bfloat16_t>) {
-      static_assert(size<2>(TileShape_MNK{}) % 16 == 0, "Tile_K must be a multiple of 16.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::MMA_64x256x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::MMA_64x248x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::MMA_64x240x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::MMA_64x232x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::MMA_64x224x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::MMA_64x216x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::MMA_64x208x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::MMA_64x200x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::MMA_64x192x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::MMA_64x184x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::MMA_64x176x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::MMA_64x168x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::MMA_64x160x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::MMA_64x152x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::MMA_64x144x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::MMA_64x136x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::MMA_64x128x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::MMA_64x120x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::MMA_64x112x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::MMA_64x104x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::MMA_64x96x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::MMA_64x88x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::MMA_64x80x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::MMA_64x72x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::MMA_64x64x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::MMA_64x56x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::MMA_64x48x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::MMA_64x40x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::MMA_64x32x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::MMA_64x24x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::MMA_64x16x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::MMA_64x8x16_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: tfloat32_t ; Input B: tfloat32_t
-    else if constexpr (is_same_v<ElementA, tfloat32_t> && is_same_v<ElementB, tfloat32_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 8 == 0, "Tile_K must be a multiple of 8.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::MMA_64x256x8_F32TF32TF32_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::MMA_64x248x8_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::MMA_64x240x8_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::MMA_64x232x8_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::MMA_64x224x8_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::MMA_64x216x8_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::MMA_64x208x8_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::MMA_64x200x8_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::MMA_64x192x8_F32TF32TF32_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::MMA_64x184x8_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::MMA_64x176x8_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::MMA_64x168x8_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::MMA_64x160x8_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::MMA_64x152x8_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::MMA_64x144x8_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::MMA_64x136x8_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::MMA_64x128x8_F32TF32TF32_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::MMA_64x120x8_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::MMA_64x112x8_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::MMA_64x104x8_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::MMA_64x96x8_F32TF32TF32_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::MMA_64x88x8_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::MMA_64x80x8_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::MMA_64x72x8_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::MMA_64x64x8_F32TF32TF32_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::MMA_64x56x8_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::MMA_64x48x8_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::MMA_64x40x8_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::MMA_64x32x8_F32TF32TF32_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::MMA_64x24x8_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::MMA_64x16x8_F32TF32TF32_RS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::MMA_64x8x8_F32TF32TF32_RS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: float_e4m3_t ; Input B: float_e4m3_t
-    else if constexpr (is_same_v<ElementA, float_e4m3_t> && is_same_v<ElementB, float_e4m3_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::MMA_64x256x32_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::MMA_64x248x32_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::MMA_64x240x32_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::MMA_64x232x32_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::MMA_64x224x32_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::MMA_64x216x32_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::MMA_64x208x32_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::MMA_64x200x32_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::MMA_64x192x32_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::MMA_64x184x32_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::MMA_64x176x32_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::MMA_64x168x32_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::MMA_64x160x32_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::MMA_64x152x32_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::MMA_64x144x32_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::MMA_64x136x32_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::MMA_64x128x32_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::MMA_64x120x32_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::MMA_64x112x32_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::MMA_64x104x32_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::MMA_64x96x32_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::MMA_64x88x32_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::MMA_64x80x32_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::MMA_64x72x32_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::MMA_64x64x32_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::MMA_64x56x32_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::MMA_64x48x32_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::MMA_64x40x32_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::MMA_64x32x32_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::MMA_64x24x32_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::MMA_64x16x32_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::MMA_64x8x32_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: float_e4m3_t ; Input B: float_e5m2_t
-    else if constexpr (is_same_v<ElementA, float_e4m3_t> && is_same_v<ElementB, float_e5m2_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::MMA_64x256x32_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::MMA_64x248x32_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::MMA_64x240x32_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::MMA_64x232x32_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::MMA_64x224x32_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::MMA_64x216x32_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::MMA_64x208x32_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::MMA_64x200x32_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::MMA_64x192x32_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::MMA_64x184x32_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::MMA_64x176x32_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::MMA_64x168x32_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::MMA_64x160x32_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::MMA_64x152x32_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::MMA_64x144x32_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::MMA_64x136x32_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::MMA_64x128x32_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::MMA_64x120x32_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::MMA_64x112x32_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::MMA_64x104x32_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::MMA_64x96x32_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::MMA_64x88x32_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::MMA_64x80x32_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::MMA_64x72x32_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::MMA_64x64x32_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::MMA_64x56x32_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::MMA_64x48x32_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::MMA_64x40x32_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::MMA_64x32x32_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::MMA_64x24x32_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::MMA_64x16x32_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::MMA_64x8x32_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: float_e5m2_t ; Input B: float_e4m3_t
-    else if constexpr (is_same_v<ElementA, float_e5m2_t> && is_same_v<ElementB, float_e4m3_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::MMA_64x256x32_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::MMA_64x248x32_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::MMA_64x240x32_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::MMA_64x232x32_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::MMA_64x224x32_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::MMA_64x216x32_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::MMA_64x208x32_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::MMA_64x200x32_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::MMA_64x192x32_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::MMA_64x184x32_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::MMA_64x176x32_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::MMA_64x168x32_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::MMA_64x160x32_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::MMA_64x152x32_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::MMA_64x144x32_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::MMA_64x136x32_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::MMA_64x128x32_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::MMA_64x120x32_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::MMA_64x112x32_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::MMA_64x104x32_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::MMA_64x96x32_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::MMA_64x88x32_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::MMA_64x80x32_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::MMA_64x72x32_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::MMA_64x64x32_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::MMA_64x56x32_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::MMA_64x48x32_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::MMA_64x40x32_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::MMA_64x32x32_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::MMA_64x24x32_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::MMA_64x16x32_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::MMA_64x8x32_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: float_e5m2_t ; Input B: float_e5m2_t
-    else if constexpr (is_same_v<ElementA, float_e5m2_t> && is_same_v<ElementB, float_e5m2_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::MMA_64x256x32_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::MMA_64x248x32_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::MMA_64x240x32_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::MMA_64x232x32_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::MMA_64x224x32_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::MMA_64x216x32_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::MMA_64x208x32_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::MMA_64x200x32_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::MMA_64x192x32_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::MMA_64x184x32_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::MMA_64x176x32_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::MMA_64x168x32_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::MMA_64x160x32_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::MMA_64x152x32_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::MMA_64x144x32_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::MMA_64x136x32_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::MMA_64x128x32_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::MMA_64x120x32_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::MMA_64x112x32_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::MMA_64x104x32_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::MMA_64x96x32_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::MMA_64x88x32_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::MMA_64x80x32_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::MMA_64x72x32_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::MMA_64x64x32_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::MMA_64x56x32_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::MMA_64x48x32_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::MMA_64x40x32_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::MMA_64x32x32_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::MMA_64x24x32_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::MMA_64x16x32_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::MMA_64x8x32_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    else {
-      static_assert(sizeof(ElementA) == 0, "No eligible GMMA operator for request configuration.");
-    }
-  }
-
-  // S32 accumulator
-  else if constexpr (is_same_v<ElementC, int32_t>) {
-
-    // Input A: int8_t ; Input B: int8_t
-    if constexpr (is_same_v<ElementA, int8_t> && is_same_v<ElementB, int8_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::MMA_64x256x32_S32S8S8_RS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::MMA_64x240x32_S32S8S8_RS_TN{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::MMA_64x224x32_S32S8S8_RS_TN{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::MMA_64x208x32_S32S8S8_RS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::MMA_64x192x32_S32S8S8_RS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::MMA_64x176x32_S32S8S8_RS_TN{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::MMA_64x160x32_S32S8S8_RS_TN{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::MMA_64x144x32_S32S8S8_RS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::MMA_64x128x32_S32S8S8_RS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::MMA_64x112x32_S32S8S8_RS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::MMA_64x96x32_S32S8S8_RS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::MMA_64x80x32_S32S8S8_RS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::MMA_64x64x32_S32S8S8_RS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::MMA_64x48x32_S32S8S8_RS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::MMA_64x32x32_S32S8S8_RS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::MMA_64x24x32_S32S8S8_RS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::MMA_64x16x32_S32S8S8_RS_TN{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::MMA_64x8x32_S32S8S8_RS_TN{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: int8_t ; Input B: uint8_t
-    else if constexpr (is_same_v<ElementA, int8_t> && is_same_v<ElementB, uint8_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::MMA_64x256x32_S32S8U8_RS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::MMA_64x240x32_S32S8U8_RS_TN{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::MMA_64x224x32_S32S8U8_RS_TN{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::MMA_64x208x32_S32S8U8_RS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::MMA_64x192x32_S32S8U8_RS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::MMA_64x176x32_S32S8U8_RS_TN{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::MMA_64x160x32_S32S8U8_RS_TN{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::MMA_64x144x32_S32S8U8_RS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::MMA_64x128x32_S32S8U8_RS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::MMA_64x112x32_S32S8U8_RS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::MMA_64x96x32_S32S8U8_RS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::MMA_64x80x32_S32S8U8_RS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::MMA_64x64x32_S32S8U8_RS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::MMA_64x48x32_S32S8U8_RS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::MMA_64x32x32_S32S8U8_RS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::MMA_64x24x32_S32S8U8_RS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::MMA_64x16x32_S32S8U8_RS_TN{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::MMA_64x8x32_S32S8U8_RS_TN{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: uint8_t ; Input B: int8_t
-    else if constexpr (is_same_v<ElementA, uint8_t> && is_same_v<ElementB, int8_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::MMA_64x256x32_S32U8S8_RS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::MMA_64x240x32_S32U8S8_RS_TN{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::MMA_64x224x32_S32U8S8_RS_TN{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::MMA_64x208x32_S32U8S8_RS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::MMA_64x192x32_S32U8S8_RS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::MMA_64x176x32_S32U8S8_RS_TN{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::MMA_64x160x32_S32U8S8_RS_TN{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::MMA_64x144x32_S32U8S8_RS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::MMA_64x128x32_S32U8S8_RS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::MMA_64x112x32_S32U8S8_RS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::MMA_64x96x32_S32U8S8_RS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::MMA_64x80x32_S32U8S8_RS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::MMA_64x64x32_S32U8S8_RS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::MMA_64x48x32_S32U8S8_RS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::MMA_64x32x32_S32U8S8_RS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::MMA_64x24x32_S32U8S8_RS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::MMA_64x16x32_S32U8S8_RS_TN{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::MMA_64x8x32_S32U8S8_RS_TN{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: uint8_t ; Input B: uint8_t
-    else if constexpr (is_same_v<ElementA, uint8_t> && is_same_v<ElementB, uint8_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::MMA_64x256x32_S32U8U8_RS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::MMA_64x240x32_S32U8U8_RS_TN{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::MMA_64x224x32_S32U8U8_RS_TN{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::MMA_64x208x32_S32U8U8_RS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::MMA_64x192x32_S32U8U8_RS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::MMA_64x176x32_S32U8U8_RS_TN{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::MMA_64x160x32_S32U8U8_RS_TN{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::MMA_64x144x32_S32U8U8_RS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::MMA_64x128x32_S32U8U8_RS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::MMA_64x112x32_S32U8U8_RS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::MMA_64x96x32_S32U8U8_RS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::MMA_64x80x32_S32U8U8_RS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::MMA_64x64x32_S32U8U8_RS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::MMA_64x48x32_S32U8U8_RS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::MMA_64x32x32_S32U8U8_RS_TN{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::MMA_64x24x32_S32U8U8_RS_TN{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::MMA_64x16x32_S32U8U8_RS_TN{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::MMA_64x8x32_S32U8U8_RS_TN{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    else {
-      static_assert(sizeof(ElementA) == 0, "No eligible GMMA operator for request configuration.");
-    }
-  }
-
-  // Unknown accumulator type
-  else {
-    static_assert(sizeof(ElementC) == 0, "Unknown ElementC accumulator type.");
-  }
-}
-
-template <
-  class ElementA,
-  class ElementB,
-  class ElementC,
-  class TileShape_MNK,
-  GMMA::Major MajorA = GMMA::Major::K,
-  GMMA::Major MajorB = GMMA::Major::K,
-  auto... Args                         // e.g. GMMA::ScaleOut::One, [GMMA::ScaleIn::One, GMMA::ScaleIn::One]
-                                       // But most commonly leave empty for defaults
->
-CUTE_HOST_DEVICE constexpr
-auto
-rs_op_selector_sparse()
-{
-  static_assert(is_static<TileShape_MNK>::value, "TileShape_MNK must be static.");
-  static_assert(rank(TileShape_MNK{}) == 3, "TileShape_MNK must be rank 3.");
-  static_assert(size<0>(TileShape_MNK{}) % 64 == 0, "Tile_M must be a multiple of 64.");
-  static_assert(MajorA == GMMA::Major::K, "Register source A operand GMMAs must have K-major A layout.");
-  auto Tile_N = size<1>(TileShape_MNK{});
-
-  // F16 accumulator
-  if constexpr (is_same_v<ElementC, half_t>) {
-
-    // Input A: half_t ; Input B: half_t
-    if constexpr (is_same_v<ElementA, half_t> && is_same_v<ElementB, half_t>) {
-      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x256x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x248x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x240x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x232x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x224x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x216x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x208x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x200x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x192x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x184x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x176x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x168x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x160x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x152x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x144x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x136x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x128x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x120x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x112x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x104x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x96x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x88x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x80x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x72x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x64x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x56x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x48x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x40x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x32x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x24x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x16x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x8x32_F16F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: float_e4m3_t ; Input B: float_e4m3_t
-    else if constexpr (is_same_v<ElementA, float_e4m3_t> && is_same_v<ElementB, float_e4m3_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x256x64_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x248x64_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x240x64_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x232x64_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x224x64_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x216x64_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x208x64_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x200x64_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x192x64_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x184x64_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x176x64_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x168x64_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x160x64_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x152x64_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x144x64_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x136x64_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x128x64_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x120x64_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x112x64_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x104x64_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x96x64_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x88x64_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x80x64_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x72x64_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x64x64_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x56x64_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x48x64_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x40x64_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x32x64_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x24x64_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x16x64_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x8x64_F16E4M3E4M3_RS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: float_e4m3_t ; Input B: float_e5m2_t
-    else if constexpr (is_same_v<ElementA, float_e4m3_t> && is_same_v<ElementB, float_e5m2_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x256x64_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x248x64_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x240x64_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x232x64_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x224x64_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x216x64_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x208x64_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x200x64_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x192x64_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x184x64_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x176x64_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x168x64_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x160x64_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x152x64_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x144x64_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x136x64_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x128x64_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x120x64_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x112x64_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x104x64_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x96x64_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x88x64_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x80x64_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x72x64_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x64x64_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x56x64_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x48x64_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x40x64_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x32x64_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x24x64_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x16x64_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x8x64_F16E4M3E5M2_RS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: float_e5m2_t ; Input B: float_e4m3_t
-    else if constexpr (is_same_v<ElementA, float_e5m2_t> && is_same_v<ElementB, float_e4m3_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x256x64_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x248x64_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x240x64_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x232x64_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x224x64_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x216x64_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x208x64_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x200x64_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x192x64_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x184x64_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x176x64_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x168x64_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x160x64_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x152x64_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x144x64_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x136x64_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x128x64_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x120x64_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x112x64_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x104x64_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x96x64_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x88x64_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x80x64_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x72x64_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x64x64_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x56x64_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x48x64_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x40x64_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x32x64_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x24x64_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x16x64_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x8x64_F16E5M2E4M3_RS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: float_e5m2_t ; Input B: float_e5m2_t
-    else if constexpr (is_same_v<ElementA, float_e5m2_t> && is_same_v<ElementB, float_e5m2_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x256x64_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x248x64_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x240x64_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x232x64_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x224x64_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x216x64_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x208x64_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x200x64_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x192x64_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x184x64_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x176x64_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x168x64_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x160x64_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x152x64_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x144x64_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x136x64_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x128x64_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x120x64_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x112x64_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x104x64_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x96x64_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x88x64_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x80x64_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x72x64_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x64x64_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x56x64_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x48x64_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x40x64_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x32x64_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x24x64_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x16x64_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x8x64_F16E5M2E5M2_RS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    else {
-      static_assert(sizeof(ElementA) == 0, "No eligible GMMA operator for request configuration.");
-    }
-  }
-
-  // F32 accumulator
-  else if constexpr (is_same_v<ElementC, float>) {
-
-    // Input A: half_t ; Input B: half_t
-    if constexpr (is_same_v<ElementA, half_t> && is_same_v<ElementB, half_t>) {
-      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x256x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x248x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x240x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x232x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x224x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x216x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x208x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x200x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x192x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x184x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x176x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x168x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x160x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x152x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x144x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x136x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x128x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x120x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x112x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x104x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x96x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x88x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x80x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x72x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x64x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x56x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x48x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x40x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x32x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x24x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x16x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x8x32_F32F16F16_RS<MajorA, MajorB, Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: bfloat16_t ; Input B: bfloat16_t
-    else if constexpr (is_same_v<ElementA, bfloat16_t> && is_same_v<ElementB, bfloat16_t>) {
-      static_assert(size<2>(TileShape_MNK{}) % 32 == 0, "Tile_K must be a multiple of 32.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x256x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x248x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x240x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x232x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x224x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x216x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x208x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x200x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x192x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x184x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x176x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x168x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x160x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x152x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x144x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x136x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x128x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x120x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x112x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x104x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x96x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x88x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x80x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x72x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x64x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x56x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x48x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x40x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x32x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x24x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x16x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x8x32_F32BF16BF16_RS<MajorA, MajorB, Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: tfloat32_t ; Input B: tfloat32_t
-    else if constexpr (is_same_v<ElementA, tfloat32_t> && is_same_v<ElementB, tfloat32_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 16 == 0, "Tile_K must be a multiple of 16.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x256x16_F32TF32TF32_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x248x16_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x240x16_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x232x16_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x224x16_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x216x16_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x208x16_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x200x16_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x192x16_F32TF32TF32_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x184x16_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x176x16_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x168x16_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x160x16_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x152x16_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x144x16_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x136x16_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x128x16_F32TF32TF32_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x120x16_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x112x16_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x104x16_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x96x16_F32TF32TF32_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x88x16_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x80x16_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x72x16_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x64x16_F32TF32TF32_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x56x16_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x48x16_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x40x16_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x32x16_F32TF32TF32_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x24x16_F32TF32TF32_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x16x16_F32TF32TF32_RS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x8x16_F32TF32TF32_RS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: float_e4m3_t ; Input B: float_e4m3_t
-    else if constexpr (is_same_v<ElementA, float_e4m3_t> && is_same_v<ElementB, float_e4m3_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x256x64_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x248x64_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x240x64_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x232x64_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x224x64_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x216x64_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x208x64_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x200x64_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x192x64_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x184x64_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x176x64_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x168x64_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x160x64_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x152x64_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x144x64_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x136x64_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x128x64_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x120x64_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x112x64_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x104x64_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x96x64_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x88x64_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x80x64_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x72x64_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x64x64_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x56x64_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x48x64_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x40x64_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x32x64_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x24x64_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x16x64_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x8x64_F32E4M3E4M3_RS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: float_e4m3_t ; Input B: float_e5m2_t
-    else if constexpr (is_same_v<ElementA, float_e4m3_t> && is_same_v<ElementB, float_e5m2_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x256x64_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x248x64_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x240x64_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x232x64_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x224x64_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x216x64_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x208x64_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x200x64_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x192x64_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x184x64_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x176x64_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x168x64_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x160x64_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x152x64_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x144x64_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x136x64_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x128x64_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x120x64_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x112x64_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x104x64_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x96x64_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x88x64_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x80x64_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x72x64_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x64x64_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x56x64_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x48x64_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x40x64_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x32x64_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x24x64_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x16x64_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x8x64_F32E4M3E5M2_RS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: float_e5m2_t ; Input B: float_e4m3_t
-    else if constexpr (is_same_v<ElementA, float_e5m2_t> && is_same_v<ElementB, float_e4m3_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x256x64_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x248x64_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x240x64_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x232x64_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x224x64_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x216x64_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x208x64_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x200x64_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x192x64_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x184x64_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x176x64_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x168x64_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x160x64_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x152x64_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x144x64_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x136x64_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x128x64_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x120x64_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x112x64_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x104x64_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x96x64_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x88x64_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x80x64_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x72x64_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x64x64_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x56x64_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x48x64_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x40x64_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x32x64_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x24x64_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x16x64_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x8x64_F32E5M2E4M3_RS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: float_e5m2_t ; Input B: float_e5m2_t
-    else if constexpr (is_same_v<ElementA, float_e5m2_t> && is_same_v<ElementB, float_e5m2_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x256x64_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 248 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x248x64_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x240x64_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 232 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x232x64_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x224x64_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 216 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x216x64_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x208x64_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 200 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x200x64_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x192x64_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 184 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x184x64_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x176x64_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 168 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x168x64_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x160x64_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 152 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x152x64_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x144x64_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 136 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x136x64_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x128x64_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 120 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x120x64_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x112x64_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 104 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x104x64_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x96x64_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 88 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x88x64_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x80x64_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 72 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x72x64_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x64x64_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 56 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x56x64_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x48x64_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 40 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x40x64_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x32x64_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x24x64_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x16x64_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x8x64_F32E5M2E5M2_RS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    else {
-      static_assert(sizeof(ElementA) == 0, "No eligible GMMA operator for request configuration.");
-    }
-  }
-
-  // S32 accumulator
-  else if constexpr (is_same_v<ElementC, int32_t>) {
-
-    // Input A: int8_t ; Input B: int8_t
-    if constexpr (is_same_v<ElementA, int8_t> && is_same_v<ElementB, int8_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8S8_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8S8_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8S8_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8S8_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8S8_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8S8_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8S8_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8S8_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8S8_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8S8_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8S8_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8S8_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8S8_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8S8_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8S8_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8S8_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8S8_RS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8S8_RS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: int8_t ; Input B: uint8_t
-    else if constexpr (is_same_v<ElementA, int8_t> && is_same_v<ElementB, uint8_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8U8_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8U8_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8U8_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8U8_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8U8_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8U8_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8U8_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8U8_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8U8_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8U8_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8U8_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8U8_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8U8_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8U8_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8U8_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8U8_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8U8_RS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8U8_RS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: uint8_t ; Input B: int8_t
-    else if constexpr (is_same_v<ElementA, uint8_t> && is_same_v<ElementB, int8_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8S8_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8S8_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8S8_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8S8_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8S8_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8S8_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8S8_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8S8_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8S8_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8S8_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8S8_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8S8_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8S8_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8S8_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8S8_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8S8_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8S8_RS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8S8_RS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    // Input A: uint8_t ; Input B: uint8_t
-    else if constexpr (is_same_v<ElementA, uint8_t> && is_same_v<ElementB, uint8_t>) {
-      static_assert(MajorA == GMMA::Major::K, "MajorA must be GMMA::Major::K for this config.");
-      static_assert(MajorB == GMMA::Major::K, "MajorB must be GMMA::Major::K for this config.");
-      static_assert(size<2>(TileShape_MNK{}) % 64 == 0, "Tile_K must be a multiple of 64.");
-
-      if constexpr (Tile_N % 256 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8U8_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 240 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8U8_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 224 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8U8_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 208 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8U8_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 192 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8U8_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 176 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8U8_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 160 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8U8_RS_TN<Args...>{};
-      }
-#endif
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 144 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8U8_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 128 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8U8_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 112 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8U8_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 96 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8U8_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 80 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8U8_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 64 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8U8_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 48 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8U8_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 32 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8U8_RS_TN<Args...>{};
-      }
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-      else if constexpr (Tile_N % 24 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8U8_RS_TN<Args...>{};
-      }
-#endif
-      else if constexpr (Tile_N % 16 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8U8_RS_TN<Args...>{};
-      }
-      else if constexpr (Tile_N % 8 == 0) {
-        return SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8U8_RS_TN<Args...>{};
-      }
-      else {
-        static_assert(Tile_N % 8 == 0, "Tile_N must be a multiple of 8.");
-      }
-    }
-
-    else {
-      static_assert(sizeof(ElementA) == 0, "No eligible GMMA operator for request configuration.");
-    }
-  }
-
-  // Unknown accumulator type
-  else {
-    static_assert(sizeof(ElementC) == 0, "Unknown ElementC accumulator type.");
-  }
-}
-
-} // end namespace SM90::GMMA
-} // end namespace cute
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cute/arch/mma_sm90_desc.hpp b/lightllm-kernel/cutlass/include/cute/arch/mma_sm90_desc.hpp
deleted file mode 100755
index a53a9748b..000000000
--- a/lightllm-kernel/cutlass/include/cute/arch/mma_sm90_desc.hpp
+++ /dev/null
@@ -1,156 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#pragma once
-
-#include <cute/config.hpp>
-
-#include <cute/arch/mma.hpp>
-
-// Config
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && defined(__CUDA_ARCH_FEAT_SM90_ALL))
-#    define CUTE_ARCH_MMA_SM90A_ENABLED
-#endif
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cute {
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// GMMA Descriptor and utilities
-
-// GMMA enums and utilities
-namespace SM90::GMMA {
-
-enum class LayoutType : uint8_t {
-  INTERLEAVE = 0,
-  B128 = 1,
-  B64 = 2,
-  B32 = 3,
-};
-
-CUTE_HOST_DEVICE char const* to_string(LayoutType const& t) {
-  switch (t) {
-    case LayoutType::INTERLEAVE: return "INTERLEAVE";
-    case LayoutType::B128:       return "B128";
-    case LayoutType::B64:        return "B64";
-    case LayoutType::B32:        return "B32";
-  }
-  return nullptr;
-}
-
-#if !defined(__CUDACC_RTC__)
-// Output operator for all enums in this namespace
-CUTE_HOST std::ostream& operator<<(std::ostream& os, LayoutType const& t) {
-  char const* s = to_string(t);
-  if (s) {
-    std::operator<<(os, s);  // Explicit call to avoid ambiguity
-  } else {
-    os.setstate(std::ios_base::failbit);
-  }
-  return os;
-}
-#endif // !defined(__CUDACC_RTC__)
-
-} // end namespace SM90::GMMA
-
-union GmmaDescriptor
-{
-  CUTE_HOST_DEVICE constexpr
-  GmmaDescriptor() noexcept : desc_(0) {}
-  CUTE_HOST_DEVICE constexpr
-  GmmaDescriptor(uint64_t desc) noexcept : desc_(desc) {}
-  CUTE_HOST_DEVICE constexpr
-  GmmaDescriptor(GmmaDescriptor const& t) noexcept : desc_(t.desc_) {}
-  CUTE_HOST_DEVICE constexpr
-  GmmaDescriptor(GmmaDescriptor && t) noexcept : desc_(t.desc_) {}
-
-  CUTE_HOST_DEVICE constexpr
-  GmmaDescriptor& operator=(GmmaDescriptor const& t) noexcept {
-    desc_ = t.desc_;
-    return *this;
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  GmmaDescriptor& operator=(GmmaDescriptor && t) noexcept {
-    desc_ = t.desc_;
-    return *this;
-  }
-
-  uint64_t desc_;
-  uint32_t reg32_[2];
-  uint16_t reg16_[4];
-
-  // Bitfield implementation avoids the need for shifts in assignment
-  struct {
-    // start_address, bit [0,14), 4LSB not included
-    uint16_t start_address_ : 14, : 2;        // 14 bits [0,14), 2 bits unused
-    // leading dimension byte offset, bit [16,30), 4LSB not included
-    // For N: This is the stride from the first col to the second col of the 8x2 brick in INTERLEAVED
-    //   Unused for all SWIZZLE_* layouts (and assumed to be 1)
-    // For T: This is the stride from the first 8 rows to the next 8 rows.
-    uint16_t leading_byte_offset_ : 14, : 2;  // 14 bits [0,14), 2 bits unused
-    // stride dimension byte offset, bit [32,46), 4LSB not included
-    // For N: This is the stride from the first 8 rows to the next 8 rows.
-    // For T: This is the stride fro mthe first 8 cols to the next 8 cols.
-    uint16_t stride_byte_offset_ : 14, : 2;   // 14 bits [0,14), 2 bits unused
-    // base_offset, bit [49,52)
-    // Valid only for SWIZZLE_128B and SWIZZLE_64B
-    uint8_t : 1, base_offset_ : 3, : 4;       // 1 bit unused, 3 bits [1,4), 4 bits unused
-    // layout type, bit [62,64)
-    // SWIZZLE_NONE = 0, SWIZZLE_32B = 3, SWIZZLE_64B = 2, SWIZZLE_128B = 1
-    uint8_t : 6, layout_type_ : 2;            // 6 bits unused, 2 bits [6,8)
-  } bitfield;
-
-  // Decay to a uint64_t
-  CUTE_HOST_DEVICE constexpr
-  operator uint64_t() const noexcept { return desc_; }
-};
-
-// Printer
-CUTE_HOST_DEVICE void
-print(GmmaDescriptor const& t)
-{
-#if !defined(__CUDACC_RTC__)
-  printf("GmmaDescriptor: 0x%016llx\n",   static_cast<unsigned long long>(t.desc_));
-  printf("  start_addr :  0x%04x\n",      t.bitfield.start_address_);
-  printf("  leading_off:  0x%04x (%d)\n", t.bitfield.leading_byte_offset_, t.bitfield.leading_byte_offset_);
-  printf("  stride_off :  0x%04x (%d)\n", t.bitfield.stride_byte_offset_, t.bitfield.stride_byte_offset_);
-  printf("  base_offset:  0x%01x\n",      t.bitfield.base_offset_);
-  printf("  layout_type:  0x%01x (%s)\n", t.bitfield.layout_type_, to_string(static_cast<SM90::GMMA::LayoutType>(t.bitfield.layout_type_)));
-#endif // !defined(__CUDACC_RTC__)
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cute
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cute/arch/mma_sm90_gmma.hpp b/lightllm-kernel/cutlass/include/cute/arch/mma_sm90_gmma.hpp
deleted file mode 100755
index d809aa4a6..000000000
--- a/lightllm-kernel/cutlass/include/cute/arch/mma_sm90_gmma.hpp
+++ /dev/null
@@ -1,20974 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>                 // CUTE_HOST_DEVICE
-
-#include "cutlass/arch/synclog.hpp"
-
-// Config
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && defined(__CUDA_ARCH_FEAT_SM90_ALL))
-#  define CUTE_ARCH_MMA_SM90A_ENABLED
-#endif
-
-namespace cute {
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Warpgroup sync primitives
-
-CUTE_HOST_DEVICE
-void
-warpgroup_arrive()
-{
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-  cutlass::arch::synclog_emit_warpgroup_arrive(__LINE__);
-  asm volatile ("wgmma.fence.sync.aligned;\n" ::: "memory");
-#else
-  CUTE_INVALID_CONTROL_PATH("Attempting to use wgmma.fence without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-}
-
-template <int N>
-CUTE_HOST_DEVICE
-void
-warpgroup_wait()
-{
-  static_assert(N >= 0 && N <= 7, "WGMMA wait: N must be in range [0, 7]");
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-  cutlass::arch::synclog_emit_warpgroup_wait(__LINE__, N);
-  asm volatile("wgmma.wait_group.sync.aligned %0;\n" :: "n"(N) : "memory");
-#else
-  CUTE_INVALID_CONTROL_PATH("Attempting to use wgmma.wait_group<N> without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-}
-
-// Marks the commit point for one or more sized batch of warpgroup MMAs.
-CUTE_HOST_DEVICE
-void
-warpgroup_commit_batch()
-{
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-  cutlass::arch::synclog_emit_warpgroup_commit_batch(__LINE__);
-  asm volatile("wgmma.commit_group.sync.aligned;\n" ::: "memory");
-#else
-  CUTE_INVALID_CONTROL_PATH("Attempting to use wgmma.commit_group without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-}
-
-CUTE_HOST_DEVICE
-void
-warpgroup_fence_operand(uint32_t& reg) {
-  // MSVC emits a build error for 'asm volatile'
-  // even if it only occurs in a __device__ function.
-  // This prevents the error.
-#if defined(__CUDA_ARCH__)
-  asm volatile("" : "+r"(reg) :: "memory");
-#endif
-}
-
-CUTE_HOST_DEVICE
-void
-warpgroup_fence_operand(float& reg) {
-#if defined(__CUDA_ARCH__)
-  asm volatile("" : "+f"(reg) :: "memory");
-#endif
-}
-
-namespace SM90::GMMA {
-
-enum class Major {
-  K  = 0,
-  MN = 1
-};
-
-enum class ScaleOut {
-  Zero = 0,
-  One  = 1
-};
-
-enum class ScaleIn {
-  Neg = -1,
-  One =  1
-};
-
-enum class SparseSel {
-  Zero = 0,
-  One  = 1
-};
-
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// GMMA PTX definitions:  C = (scaleA * A) * (scaleB * B) + (scaleD * C)
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x8x16_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %4, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k16.f16.f16.f16 "
-      "{%0, %1},"
-      " %2,"
-      " %3,"
-      " p,  %5, %6, %7, %8;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x8x16_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[2];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %7, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k16.f16.f16.f16 "
-      "{%0,  %1},"
-      "{%2,  %3,  %4,  %5},"
-      " %6,"
-      " p,   %8,  %9,  %10;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x16x16_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %6, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " p,   %7,  %8,  %9,  %10;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x16x16_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %9, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " p,   %10, %11, %12;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x32x16_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %10, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " p,   %11, %12, %13, %14;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x32x16_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %13, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " p,   %14, %15, %16;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x64x16_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %18, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " p,   %19, %20, %21, %22;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x64x16_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %21, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " p,   %22, %23, %24;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x96x16_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %26, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      " %24,"
-      " %25,"
-      " p,   %27, %28, %29, %30;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x96x16_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %29, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " p,   %30, %31, %32;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x128x16_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %34, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " p,   %35, %36, %37, %38;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x128x16_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %37, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " p,   %38, %39, %40;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x192x16_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %50, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " p,   %51, %52, %53, %54;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x192x16_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %53, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k16.f16.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " p,    %54,  %55,  %56;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x256x16_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %66, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k16.f16.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " p,    %67,  %68,  %69,  %70;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x256x16_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %69, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k16.f16.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " p,    %70,  %71,  %72;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x8x16_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %6, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k16.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " p,   %7,  %8,  %9,  %10;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x8x16_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[4];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %9, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k16.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " p,   %10, %11, %12;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x16x16_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      float         & d4, float         & d5, float         & d6, float         & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %10, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k16.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " p,   %11, %12, %13, %14;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
-        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x16x16_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[8];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      float         & d4, float         & d5, float         & d6, float         & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %13, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k16.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " p,   %14, %15, %16;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
-        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x32x16_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %18, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k16.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " p,   %19, %20, %21, %22;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x32x16_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[16];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %21, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k16.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " p,   %22, %23, %24;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x64x16_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %34, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k16.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " p,   %35, %36, %37, %38;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x64x16_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[32];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %37, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k16.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " p,   %38, %39, %40;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x96x16_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %50, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k16.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " p,   %51, %52, %53, %54;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x96x16_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[48];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %53, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " p,    %54,  %55,  %56;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x128x16_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %66, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " p,    %67,  %68,  %69,  %70;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x128x16_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[64];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %69, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " p,    %70,  %71,  %72;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x192x16_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      float         & d92, float         & d93, float         & d94, float         & d95,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %98, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      " %96,"
-      " %97,"
-      " p,    %99,  %100, %101, %102;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
-        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x192x16_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[96];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      float         & d92, float         & d93, float         & d94, float         & d95,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %101, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      "{%96,  %97,  %98,  %99},"
-      " %100,"
-      " p,    %102, %103, %104;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
-        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x256x16_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      float         & d124, float         & d125, float         & d126, float         & d127,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %130, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      " %128,"
-      " %129,"
-      " p,    %131, %132, %133, %134;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
-        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x256x16_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[128];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      float         & d124, float         & d125, float         & d126, float         & d127,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %133, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      "{%128, %129, %130, %131},"
-      " %132,"
-      " p,    %134, %135, %136;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
-        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x8x16_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %6, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k16.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " p,   %7,  %8,  %9,  %10;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x8x16_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[4];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %9, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k16.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " p,   %10, %11, %12;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x16x16_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      float         & d4, float         & d5, float         & d6, float         & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %10, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k16.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " p,   %11, %12, %13, %14;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
-        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x16x16_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[8];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      float         & d4, float         & d5, float         & d6, float         & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %13, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k16.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " p,   %14, %15, %16;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
-        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x32x16_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %18, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k16.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " p,   %19, %20, %21, %22;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x32x16_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[16];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %21, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k16.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " p,   %22, %23, %24;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x64x16_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %34, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " p,   %35, %36, %37, %38;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x64x16_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[32];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %37, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " p,   %38, %39, %40;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x96x16_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %50, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k16.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " p,   %51, %52, %53, %54;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x96x16_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[48];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %53, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " p,    %54,  %55,  %56;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x128x16_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %66, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " p,    %67,  %68,  %69,  %70;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x128x16_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[64];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %69, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " p,    %70,  %71,  %72;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x192x16_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      float         & d92, float         & d93, float         & d94, float         & d95,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %98, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      " %96,"
-      " %97,"
-      " p,    %99,  %100, %101, %102;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
-        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x192x16_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[96];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      float         & d92, float         & d93, float         & d94, float         & d95,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %101, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      "{%96,  %97,  %98,  %99},"
-      " %100,"
-      " p,    %102, %103, %104;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
-        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x256x16_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      float         & d124, float         & d125, float         & d126, float         & d127,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %130, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      " %128,"
-      " %129,"
-      " p,    %131, %132, %133, %134;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
-        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x256x16_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[128];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      float         & d124, float         & d125, float         & d126, float         & d127,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %133, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      "{%128, %129, %130, %131},"
-      " %132,"
-      " p,    %134, %135, %136;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
-        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x8x8_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %6, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k8.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " p,   %7,  %8;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x8x8_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %9, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k8.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " p,   %10, %11;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x16x8_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      float         & d4, float         & d5, float         & d6, float         & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %10, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k8.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " p,   %11, %12;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
-        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x16x8_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      float         & d4, float         & d5, float         & d6, float         & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %13, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k8.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " p,   %14, %15;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
-        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x32x8_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %18, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k8.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " p,   %19, %20;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x32x8_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %21, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k8.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " p,   %22, %23;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x64x8_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %34, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k8.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " p,   %35, %36;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x64x8_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %37, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k8.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " p,   %38, %39;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x96x8_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %50, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k8.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " p,   %51, %52;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x96x8_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %53, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " p,    %54,  %55;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x128x8_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %66, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " p,    %67,  %68;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x128x8_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %69, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " p,    %70,  %71;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x192x8_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      float         & d92, float         & d93, float         & d94, float         & d95,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %98, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      " %96,"
-      " %97,"
-      " p,    %99,  %100;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
-        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x192x8_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      float         & d92, float         & d93, float         & d94, float         & d95,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %101, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      "{%96,  %97,  %98,  %99},"
-      " %100,"
-      " p,    %102, %103;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
-        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x256x8_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      float         & d124, float         & d125, float         & d126, float         & d127,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %130, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      " %128,"
-      " %129,"
-      " p,    %131, %132;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
-        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x256x8_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      float         & d124, float         & d125, float         & d126, float         & d127,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %133, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      "{%128, %129, %130, %131},"
-      " %132,"
-      " p,    %134, %135;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
-        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x32 TN S32+=S8*S8
-struct MMA_64x8x32_S32S8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %6, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.s8 "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x32 TN S32+=S8*S8
-struct MMA_64x8x32_S32S8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %6, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.s8.satfinite "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x32 TN S32+=S8*S8
-struct MMA_64x16x32_S32S8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %10, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k32.s32.s8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x32 TN S32+=S8*S8
-struct MMA_64x16x32_S32S8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %10, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k32.s32.s8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x32 TN S32+=S8*S8
-struct MMA_64x32x32_S32S8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %18, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k32.s32.s8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x32 TN S32+=S8*S8
-struct MMA_64x32x32_S32S8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %18, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k32.s32.s8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x32 TN S32+=S8*S8
-struct MMA_64x64x32_S32S8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %34, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k32.s32.s8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x32 TN S32+=S8*S8
-struct MMA_64x64x32_S32S8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %34, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k32.s32.s8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x32 TN S32+=S8*S8
-struct MMA_64x96x32_S32S8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %50, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k32.s32.s8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x32 TN S32+=S8*S8
-struct MMA_64x96x32_S32S8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %50, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k32.s32.s8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x32 TN S32+=S8*S8
-struct MMA_64x128x32_S32S8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %66, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k32.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x32 TN S32+=S8*S8
-struct MMA_64x128x32_S32S8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %66, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k32.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x32 TN S32+=S8*S8
-struct MMA_64x192x32_S32S8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
-      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %98, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k32.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      " %96,"
-      " %97,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
-        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
-        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x32 TN S32+=S8*S8
-struct MMA_64x192x32_S32S8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
-      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %98, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k32.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      " %96,"
-      " %97,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
-        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
-        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x32 TN S32+=S8*S8
-struct MMA_64x256x32_S32S8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
-      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %130, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k32.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      " %128,"
-      " %129,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
-        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
-        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x32 TN S32+=S8*S8
-struct MMA_64x256x32_S32S8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
-      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %130, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k32.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      " %128,"
-      " %129,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
-        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
-        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x32 TN S32+=S8*S8
-struct MMA_64x8x32_S32S8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %9, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.s8 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x32 TN S32+=S8*S8
-struct MMA_64x8x32_S32S8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %9, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.s8.satfinite "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x32 TN S32+=S8*S8
-struct MMA_64x16x32_S32S8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %13, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k32.s32.s8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x32 TN S32+=S8*S8
-struct MMA_64x16x32_S32S8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %13, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k32.s32.s8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x32 TN S32+=S8*S8
-struct MMA_64x32x32_S32S8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %21, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k32.s32.s8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x32 TN S32+=S8*S8
-struct MMA_64x32x32_S32S8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %21, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k32.s32.s8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x32 TN S32+=S8*S8
-struct MMA_64x64x32_S32S8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %37, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k32.s32.s8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x32 TN S32+=S8*S8
-struct MMA_64x64x32_S32S8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %37, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k32.s32.s8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x32 TN S32+=S8*S8
-struct MMA_64x96x32_S32S8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %53, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k32.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x32 TN S32+=S8*S8
-struct MMA_64x96x32_S32S8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %53, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k32.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x32 TN S32+=S8*S8
-struct MMA_64x128x32_S32S8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %69, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k32.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x32 TN S32+=S8*S8
-struct MMA_64x128x32_S32S8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %69, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k32.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x32 TN S32+=S8*S8
-struct MMA_64x192x32_S32S8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
-      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %101, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k32.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      "{%96,  %97,  %98,  %99},"
-      " %100,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
-        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
-        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x32 TN S32+=S8*S8
-struct MMA_64x192x32_S32S8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
-      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %101, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k32.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      "{%96,  %97,  %98,  %99},"
-      " %100,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
-        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
-        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x32 TN S32+=S8*S8
-struct MMA_64x256x32_S32S8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
-      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %133, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k32.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      "{%128, %129, %130, %131},"
-      " %132,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
-        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
-        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x32 TN S32+=S8*S8
-struct MMA_64x256x32_S32S8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
-      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %133, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k32.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      "{%128, %129, %130, %131},"
-      " %132,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
-        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
-        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x32 TN S32+=S8*U8
-struct MMA_64x8x32_S32S8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %6, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.u8 "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x32 TN S32+=S8*U8
-struct MMA_64x8x32_S32S8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %6, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.u8.satfinite "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x32 TN S32+=S8*U8
-struct MMA_64x16x32_S32S8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %10, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k32.s32.s8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x32 TN S32+=S8*U8
-struct MMA_64x16x32_S32S8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %10, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k32.s32.s8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x32 TN S32+=S8*U8
-struct MMA_64x32x32_S32S8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %18, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k32.s32.s8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x32 TN S32+=S8*U8
-struct MMA_64x32x32_S32S8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %18, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k32.s32.s8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x32 TN S32+=S8*U8
-struct MMA_64x64x32_S32S8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %34, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k32.s32.s8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x32 TN S32+=S8*U8
-struct MMA_64x64x32_S32S8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %34, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k32.s32.s8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x32 TN S32+=S8*U8
-struct MMA_64x96x32_S32S8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %50, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k32.s32.s8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x32 TN S32+=S8*U8
-struct MMA_64x96x32_S32S8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %50, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k32.s32.s8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x32 TN S32+=S8*U8
-struct MMA_64x128x32_S32S8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %66, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k32.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x32 TN S32+=S8*U8
-struct MMA_64x128x32_S32S8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %66, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k32.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x32 TN S32+=S8*U8
-struct MMA_64x192x32_S32S8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
-      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %98, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k32.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      " %96,"
-      " %97,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
-        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
-        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x32 TN S32+=S8*U8
-struct MMA_64x192x32_S32S8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
-      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %98, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k32.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      " %96,"
-      " %97,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
-        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
-        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x32 TN S32+=S8*U8
-struct MMA_64x256x32_S32S8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
-      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %130, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k32.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      " %128,"
-      " %129,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
-        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
-        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x32 TN S32+=S8*U8
-struct MMA_64x256x32_S32S8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
-      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %130, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k32.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      " %128,"
-      " %129,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
-        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
-        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x32 TN S32+=S8*U8
-struct MMA_64x8x32_S32S8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %9, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.u8 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x32 TN S32+=S8*U8
-struct MMA_64x8x32_S32S8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %9, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.u8.satfinite "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x32 TN S32+=S8*U8
-struct MMA_64x16x32_S32S8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %13, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k32.s32.s8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x32 TN S32+=S8*U8
-struct MMA_64x16x32_S32S8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %13, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k32.s32.s8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x32 TN S32+=S8*U8
-struct MMA_64x32x32_S32S8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %21, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k32.s32.s8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x32 TN S32+=S8*U8
-struct MMA_64x32x32_S32S8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %21, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k32.s32.s8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x32 TN S32+=S8*U8
-struct MMA_64x64x32_S32S8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %37, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k32.s32.s8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x32 TN S32+=S8*U8
-struct MMA_64x64x32_S32S8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %37, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k32.s32.s8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x32 TN S32+=S8*U8
-struct MMA_64x96x32_S32S8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %53, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k32.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x32 TN S32+=S8*U8
-struct MMA_64x96x32_S32S8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %53, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k32.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x32 TN S32+=S8*U8
-struct MMA_64x128x32_S32S8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %69, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k32.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x32 TN S32+=S8*U8
-struct MMA_64x128x32_S32S8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %69, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k32.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x32 TN S32+=S8*U8
-struct MMA_64x192x32_S32S8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
-      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %101, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k32.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      "{%96,  %97,  %98,  %99},"
-      " %100,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
-        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
-        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x32 TN S32+=S8*U8
-struct MMA_64x192x32_S32S8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
-      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %101, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k32.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      "{%96,  %97,  %98,  %99},"
-      " %100,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
-        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
-        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x32 TN S32+=S8*U8
-struct MMA_64x256x32_S32S8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
-      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %133, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k32.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      "{%128, %129, %130, %131},"
-      " %132,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
-        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
-        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x32 TN S32+=S8*U8
-struct MMA_64x256x32_S32S8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
-      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %133, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k32.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      "{%128, %129, %130, %131},"
-      " %132,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
-        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
-        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x32 TN S32+=U8*S8
-struct MMA_64x8x32_S32U8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %6, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.s8 "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x32 TN S32+=U8*S8
-struct MMA_64x8x32_S32U8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %6, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.s8.satfinite "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x32 TN S32+=U8*S8
-struct MMA_64x16x32_S32U8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %10, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k32.s32.u8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x32 TN S32+=U8*S8
-struct MMA_64x16x32_S32U8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %10, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k32.s32.u8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x32 TN S32+=U8*S8
-struct MMA_64x32x32_S32U8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %18, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k32.s32.u8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x32 TN S32+=U8*S8
-struct MMA_64x32x32_S32U8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %18, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k32.s32.u8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x32 TN S32+=U8*S8
-struct MMA_64x64x32_S32U8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %34, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k32.s32.u8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x32 TN S32+=U8*S8
-struct MMA_64x64x32_S32U8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %34, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k32.s32.u8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x32 TN S32+=U8*S8
-struct MMA_64x96x32_S32U8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %50, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k32.s32.u8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x32 TN S32+=U8*S8
-struct MMA_64x96x32_S32U8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %50, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k32.s32.u8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x32 TN S32+=U8*S8
-struct MMA_64x128x32_S32U8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %66, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k32.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x32 TN S32+=U8*S8
-struct MMA_64x128x32_S32U8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %66, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k32.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x32 TN S32+=U8*S8
-struct MMA_64x192x32_S32U8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
-      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %98, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k32.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      " %96,"
-      " %97,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
-        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
-        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x32 TN S32+=U8*S8
-struct MMA_64x192x32_S32U8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
-      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %98, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k32.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      " %96,"
-      " %97,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
-        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
-        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x32 TN S32+=U8*S8
-struct MMA_64x256x32_S32U8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
-      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %130, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k32.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      " %128,"
-      " %129,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
-        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
-        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x32 TN S32+=U8*S8
-struct MMA_64x256x32_S32U8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
-      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %130, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k32.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      " %128,"
-      " %129,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
-        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
-        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x32 TN S32+=U8*S8
-struct MMA_64x8x32_S32U8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %9, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.s8 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x32 TN S32+=U8*S8
-struct MMA_64x8x32_S32U8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %9, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.s8.satfinite "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x32 TN S32+=U8*S8
-struct MMA_64x16x32_S32U8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %13, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k32.s32.u8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x32 TN S32+=U8*S8
-struct MMA_64x16x32_S32U8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %13, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k32.s32.u8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x32 TN S32+=U8*S8
-struct MMA_64x32x32_S32U8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %21, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k32.s32.u8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x32 TN S32+=U8*S8
-struct MMA_64x32x32_S32U8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %21, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k32.s32.u8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x32 TN S32+=U8*S8
-struct MMA_64x64x32_S32U8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %37, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k32.s32.u8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x32 TN S32+=U8*S8
-struct MMA_64x64x32_S32U8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %37, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k32.s32.u8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x32 TN S32+=U8*S8
-struct MMA_64x96x32_S32U8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %53, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k32.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x32 TN S32+=U8*S8
-struct MMA_64x96x32_S32U8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %53, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k32.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x32 TN S32+=U8*S8
-struct MMA_64x128x32_S32U8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %69, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k32.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x32 TN S32+=U8*S8
-struct MMA_64x128x32_S32U8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %69, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k32.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x32 TN S32+=U8*S8
-struct MMA_64x192x32_S32U8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
-      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %101, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k32.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      "{%96,  %97,  %98,  %99},"
-      " %100,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
-        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
-        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x32 TN S32+=U8*S8
-struct MMA_64x192x32_S32U8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
-      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %101, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k32.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      "{%96,  %97,  %98,  %99},"
-      " %100,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
-        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
-        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x32 TN S32+=U8*S8
-struct MMA_64x256x32_S32U8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
-      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %133, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k32.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      "{%128, %129, %130, %131},"
-      " %132,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
-        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
-        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x32 TN S32+=U8*S8
-struct MMA_64x256x32_S32U8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
-      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %133, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k32.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      "{%128, %129, %130, %131},"
-      " %132,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
-        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
-        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x32 TN S32+=U8*U8
-struct MMA_64x8x32_S32U8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %6, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.u8 "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x32 TN S32+=U8*U8
-struct MMA_64x8x32_S32U8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %6, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.u8.satfinite "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x32 TN S32+=U8*U8
-struct MMA_64x16x32_S32U8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %10, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k32.s32.u8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x32 TN S32+=U8*U8
-struct MMA_64x16x32_S32U8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %10, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k32.s32.u8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x32 TN S32+=U8*U8
-struct MMA_64x32x32_S32U8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %18, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k32.s32.u8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x32 TN S32+=U8*U8
-struct MMA_64x32x32_S32U8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %18, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k32.s32.u8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x32 TN S32+=U8*U8
-struct MMA_64x64x32_S32U8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %34, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k32.s32.u8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x32 TN S32+=U8*U8
-struct MMA_64x64x32_S32U8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %34, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k32.s32.u8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x32 TN S32+=U8*U8
-struct MMA_64x96x32_S32U8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %50, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k32.s32.u8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x32 TN S32+=U8*U8
-struct MMA_64x96x32_S32U8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %50, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k32.s32.u8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x32 TN S32+=U8*U8
-struct MMA_64x128x32_S32U8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %66, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k32.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x32 TN S32+=U8*U8
-struct MMA_64x128x32_S32U8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %66, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k32.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x32 TN S32+=U8*U8
-struct MMA_64x192x32_S32U8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
-      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %98, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k32.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      " %96,"
-      " %97,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
-        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
-        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x32 TN S32+=U8*U8
-struct MMA_64x192x32_S32U8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
-      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %98, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k32.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      " %96,"
-      " %97,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
-        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
-        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x32 TN S32+=U8*U8
-struct MMA_64x256x32_S32U8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
-      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %130, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k32.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      " %128,"
-      " %129,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
-        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
-        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x32 TN S32+=U8*U8
-struct MMA_64x256x32_S32U8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
-      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %130, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k32.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      " %128,"
-      " %129,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
-        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
-        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x32 TN S32+=U8*U8
-struct MMA_64x8x32_S32U8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %9, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.u8 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x32 TN S32+=U8*U8
-struct MMA_64x8x32_S32U8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %9, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.u8.satfinite "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x32 TN S32+=U8*U8
-struct MMA_64x16x32_S32U8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %13, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k32.s32.u8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x32 TN S32+=U8*U8
-struct MMA_64x16x32_S32U8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %13, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k32.s32.u8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x32 TN S32+=U8*U8
-struct MMA_64x32x32_S32U8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %21, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k32.s32.u8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x32 TN S32+=U8*U8
-struct MMA_64x32x32_S32U8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %21, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k32.s32.u8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x32 TN S32+=U8*U8
-struct MMA_64x64x32_S32U8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %37, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k32.s32.u8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x32 TN S32+=U8*U8
-struct MMA_64x64x32_S32U8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %37, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k32.s32.u8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x32 TN S32+=U8*U8
-struct MMA_64x96x32_S32U8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %53, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k32.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x32 TN S32+=U8*U8
-struct MMA_64x96x32_S32U8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %53, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k32.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x32 TN S32+=U8*U8
-struct MMA_64x128x32_S32U8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %69, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k32.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x32 TN S32+=U8*U8
-struct MMA_64x128x32_S32U8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %69, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k32.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x32 TN S32+=U8*U8
-struct MMA_64x192x32_S32U8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
-      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %101, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k32.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      "{%96,  %97,  %98,  %99},"
-      " %100,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
-        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
-        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x32 TN S32+=U8*U8
-struct MMA_64x192x32_S32U8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
-      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %101, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k32.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      "{%96,  %97,  %98,  %99},"
-      " %100,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
-        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
-        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x32 TN S32+=U8*U8
-struct MMA_64x256x32_S32U8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
-      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %133, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k32.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      "{%128, %129, %130, %131},"
-      " %132,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
-        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
-        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x32 TN S32+=U8*U8
-struct MMA_64x256x32_S32U8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
-      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %133, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k32.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      "{%128, %129, %130, %131},"
-      " %132,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
-        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
-        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x8x32_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %4, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k32.f16.e4m3.e4m3 "
-      "{%0, %1},"
-      " %2,"
-      " %3,"
-      " p,  %5, %6;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x8x32_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %7, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k32.f16.e4m3.e4m3 "
-      "{%0,  %1},"
-      "{%2,  %3,  %4,  %5},"
-      " %6,"
-      " p,   %8,  %9;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x8x32_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %6, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k32.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " p,   %7,  %8;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x8x32_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %9, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k32.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " p,   %10, %11;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x16x32_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %6, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " p,   %7,  %8;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x16x32_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %9, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " p,   %10, %11;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x16x32_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      float         & d4, float         & d5, float         & d6, float         & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %10, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k32.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " p,   %11, %12;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
-        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x16x32_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      float         & d4, float         & d5, float         & d6, float         & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %13, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k32.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " p,   %14, %15;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
-        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x32x32_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %10, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " p,   %11, %12;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x32x32_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %13, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " p,   %14, %15;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x32x32_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %18, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k32.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " p,   %19, %20;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x32x32_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %21, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k32.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " p,   %22, %23;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x64x32_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %18, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " p,   %19, %20;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x64x32_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %21, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " p,   %22, %23;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x64x32_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %34, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k32.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " p,   %35, %36;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x64x32_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %37, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k32.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " p,   %38, %39;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x96x32_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %26, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      " %24,"
-      " %25,"
-      " p,   %27, %28;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x96x32_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %29, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " p,   %30, %31;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x96x32_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %50, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k32.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " p,   %51, %52;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x96x32_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %53, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k32.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " p,    %54,  %55;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x128x32_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %34, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " p,   %35, %36;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x128x32_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %37, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " p,   %38, %39;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x128x32_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %66, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k32.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " p,    %67,  %68;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x128x32_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %69, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k32.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " p,    %70,  %71;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x192x32_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %50, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " p,   %51, %52;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x192x32_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %53, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k32.f16.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " p,    %54,  %55;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x192x32_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      float         & d92, float         & d93, float         & d94, float         & d95,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %98, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k32.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      " %96,"
-      " %97,"
-      " p,    %99,  %100;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
-        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x192x32_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      float         & d92, float         & d93, float         & d94, float         & d95,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %101, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k32.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      "{%96,  %97,  %98,  %99},"
-      " %100,"
-      " p,    %102, %103;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
-        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x256x32_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %66, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k32.f16.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " p,    %67,  %68;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x256x32_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %69, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k32.f16.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " p,    %70,  %71;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x256x32_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      float         & d124, float         & d125, float         & d126, float         & d127,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %130, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k32.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      " %128,"
-      " %129,"
-      " p,    %131, %132;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
-        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x256x32_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      float         & d124, float         & d125, float         & d126, float         & d127,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %133, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k32.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      "{%128, %129, %130, %131},"
-      " %132,"
-      " p,    %134, %135;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
-        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x8x32_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %4, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k32.f16.e4m3.e5m2 "
-      "{%0, %1},"
-      " %2,"
-      " %3,"
-      " p,  %5, %6;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x8x32_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %7, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k32.f16.e4m3.e5m2 "
-      "{%0,  %1},"
-      "{%2,  %3,  %4,  %5},"
-      " %6,"
-      " p,   %8,  %9;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x8x32_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %6, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k32.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " p,   %7,  %8;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x8x32_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %9, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k32.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " p,   %10, %11;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x16x32_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %6, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " p,   %7,  %8;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x16x32_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %9, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " p,   %10, %11;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x16x32_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      float         & d4, float         & d5, float         & d6, float         & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %10, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k32.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " p,   %11, %12;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
-        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x16x32_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      float         & d4, float         & d5, float         & d6, float         & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %13, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k32.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " p,   %14, %15;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
-        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x32x32_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %10, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " p,   %11, %12;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x32x32_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %13, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " p,   %14, %15;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x32x32_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %18, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k32.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " p,   %19, %20;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x32x32_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %21, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k32.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " p,   %22, %23;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x64x32_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %18, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " p,   %19, %20;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x64x32_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %21, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " p,   %22, %23;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x64x32_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %34, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k32.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " p,   %35, %36;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x64x32_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %37, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k32.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " p,   %38, %39;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x96x32_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %26, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      " %24,"
-      " %25,"
-      " p,   %27, %28;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x96x32_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %29, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " p,   %30, %31;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x96x32_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %50, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k32.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " p,   %51, %52;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x96x32_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %53, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " p,    %54,  %55;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x128x32_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %34, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " p,   %35, %36;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x128x32_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %37, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " p,   %38, %39;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x128x32_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %66, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " p,    %67,  %68;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x128x32_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %69, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " p,    %70,  %71;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x192x32_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %50, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " p,   %51, %52;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x192x32_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %53, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k32.f16.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " p,    %54,  %55;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x192x32_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      float         & d92, float         & d93, float         & d94, float         & d95,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %98, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      " %96,"
-      " %97,"
-      " p,    %99,  %100;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
-        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x192x32_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      float         & d92, float         & d93, float         & d94, float         & d95,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %101, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      "{%96,  %97,  %98,  %99},"
-      " %100,"
-      " p,    %102, %103;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
-        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x256x32_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %66, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k32.f16.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " p,    %67,  %68;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x256x32_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %69, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k32.f16.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " p,    %70,  %71;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x256x32_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      float         & d124, float         & d125, float         & d126, float         & d127,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %130, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      " %128,"
-      " %129,"
-      " p,    %131, %132;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
-        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x256x32_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      float         & d124, float         & d125, float         & d126, float         & d127,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %133, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      "{%128, %129, %130, %131},"
-      " %132,"
-      " p,    %134, %135;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
-        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x8x32_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %4, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k32.f16.e5m2.e4m3 "
-      "{%0, %1},"
-      " %2,"
-      " %3,"
-      " p,  %5, %6;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x8x32_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %7, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k32.f16.e5m2.e4m3 "
-      "{%0,  %1},"
-      "{%2,  %3,  %4,  %5},"
-      " %6,"
-      " p,   %8,  %9;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x8x32_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %6, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k32.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " p,   %7,  %8;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x8x32_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %9, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k32.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " p,   %10, %11;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x16x32_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %6, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " p,   %7,  %8;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x16x32_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %9, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " p,   %10, %11;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x16x32_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      float         & d4, float         & d5, float         & d6, float         & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %10, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k32.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " p,   %11, %12;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
-        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x16x32_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      float         & d4, float         & d5, float         & d6, float         & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %13, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k32.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " p,   %14, %15;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
-        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x32x32_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %10, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " p,   %11, %12;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x32x32_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %13, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " p,   %14, %15;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x32x32_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %18, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k32.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " p,   %19, %20;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x32x32_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %21, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k32.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " p,   %22, %23;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x64x32_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %18, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " p,   %19, %20;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x64x32_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %21, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " p,   %22, %23;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x64x32_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %34, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k32.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " p,   %35, %36;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x64x32_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %37, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k32.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " p,   %38, %39;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x96x32_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %26, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      " %24,"
-      " %25,"
-      " p,   %27, %28;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x96x32_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %29, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " p,   %30, %31;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x96x32_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %50, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k32.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " p,   %51, %52;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x96x32_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %53, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " p,    %54,  %55;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x128x32_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %34, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " p,   %35, %36;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x128x32_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %37, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " p,   %38, %39;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x128x32_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %66, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " p,    %67,  %68;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x128x32_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %69, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " p,    %70,  %71;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x192x32_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %50, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " p,   %51, %52;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x192x32_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %53, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k32.f16.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " p,    %54,  %55;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x192x32_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      float         & d92, float         & d93, float         & d94, float         & d95,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %98, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      " %96,"
-      " %97,"
-      " p,    %99,  %100;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
-        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x192x32_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      float         & d92, float         & d93, float         & d94, float         & d95,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %101, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      "{%96,  %97,  %98,  %99},"
-      " %100,"
-      " p,    %102, %103;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
-        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x256x32_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %66, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k32.f16.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " p,    %67,  %68;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x256x32_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %69, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k32.f16.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " p,    %70,  %71;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x256x32_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      float         & d124, float         & d125, float         & d126, float         & d127,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %130, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      " %128,"
-      " %129,"
-      " p,    %131, %132;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
-        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x256x32_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      float         & d124, float         & d125, float         & d126, float         & d127,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %133, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      "{%128, %129, %130, %131},"
-      " %132,"
-      " p,    %134, %135;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
-        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x8x32_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %4, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k32.f16.e5m2.e5m2 "
-      "{%0, %1},"
-      " %2,"
-      " %3,"
-      " p,  %5, %6;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x8x32_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %7, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k32.f16.e5m2.e5m2 "
-      "{%0,  %1},"
-      "{%2,  %3,  %4,  %5},"
-      " %6,"
-      " p,   %8,  %9;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x8x32_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %6, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k32.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " p,   %7,  %8;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x8x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x8x32_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %9, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n8k32.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " p,   %10, %11;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x8x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x16x32_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %6, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " p,   %7,  %8;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x16x32_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %9, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " p,   %10, %11;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x16x32_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      float         & d4, float         & d5, float         & d6, float         & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %10, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k32.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " p,   %11, %12;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
-        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x16x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x16x32_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      float         & d4, float         & d5, float         & d6, float         & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %13, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n16k32.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " p,   %14, %15;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
-        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x16x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x32x32_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %10, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " p,   %11, %12;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x32x32_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %13, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " p,   %14, %15;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x32x32_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %18, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k32.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " p,   %19, %20;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x32x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x32x32_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %21, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n32k32.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " p,   %22, %23;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x32x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x64x32_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %18, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " p,   %19, %20;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x64x32_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %21, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " p,   %22, %23;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x64x32_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %34, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k32.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " p,   %35, %36;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x64x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x64x32_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %37, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n64k32.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " p,   %38, %39;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x64x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x96x32_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %26, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      " %24,"
-      " %25,"
-      " p,   %27, %28;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x96x32_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %29, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " p,   %30, %31;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x96x32_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %50, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k32.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " p,   %51, %52;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x96x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x96x32_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %53, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n96k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " p,    %54,  %55;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x96x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x128x32_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %34, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " p,   %35, %36;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x128x32_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %37, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " p,   %38, %39;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x128x32_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %66, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " p,    %67,  %68;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x128x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x128x32_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %69, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n128k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " p,    %70,  %71;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x128x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x192x32_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %50, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " p,   %51, %52;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x192x32_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %53, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k32.f16.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " p,    %54,  %55;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x192x32_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      float         & d92, float         & d93, float         & d94, float         & d95,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %98, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      " %96,"
-      " %97,"
-      " p,    %99,  %100;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
-        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x192x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x192x32_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      float         & d92, float         & d93, float         & d94, float         & d95,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %101, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n192k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      "{%96,  %97,  %98,  %99},"
-      " %100,"
-      " p,    %102, %103;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
-        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x192x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x256x32_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %66, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k32.f16.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " p,    %67,  %68;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x256x32_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %69, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k32.f16.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " p,    %70,  %71;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x256x32_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      float         & d124, float         & d125, float         & d126, float         & d127,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %130, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      " %128,"
-      " %129,"
-      " p,    %131, %132;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
-        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x256x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x256x32_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      float         & d124, float         & d125, float         & d126, float         & d127,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %133, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n256k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      "{%128, %129, %130, %131},"
-      " %132,"
-      " p,    %134, %135;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
-        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x256x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace SM90::GMMA
-
-} // namespace cute
-
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-#include "mma_sm90_gmma_ext.hpp"
-#endif
diff --git a/lightllm-kernel/cutlass/include/cute/arch/mma_sm90_gmma_ext.hpp b/lightllm-kernel/cutlass/include/cute/arch/mma_sm90_gmma_ext.hpp
deleted file mode 100755
index 10a36aff8..000000000
--- a/lightllm-kernel/cutlass/include/cute/arch/mma_sm90_gmma_ext.hpp
+++ /dev/null
@@ -1,56445 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
- 
-#pragma once
-  
-#include <cute/config.hpp>                // CUTE_HOST_DEVICE
-
-#include "cutlass/arch/synclog.hpp"
-
-// Config
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && defined(__CUDA_ARCH_FEAT_SM90_ALL))
-#  define CUTE_ARCH_MMA_SM90A_ENABLED
-#endif
-
-namespace cute {
-
-namespace SM90::GMMA {
-
-// GMMA 64x24x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x24x16_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[6];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %8, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n24k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5},"
-      " %6,"
-      " %7,"
-      " p,   %9,  %10, %11, %12;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x24x16_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[6];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %11, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n24k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5},"
-      "{%6,  %7,  %8,  %9},"
-      " %10,"
-      " p,   %12, %13, %14;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x40x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x40x16_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[10];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %12, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n40k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9},"
-      " %10,"
-      " %11,"
-      " p,   %13, %14, %15, %16;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x40x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x40x16_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[10];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %15, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n40k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9},"
-      "{%10, %11, %12, %13},"
-      " %14,"
-      " p,   %16, %17, %18;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x48x16_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %14, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n48k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      " %12,"
-      " %13,"
-      " p,   %15, %16, %17, %18;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x48x16_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %17, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n48k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " p,   %18, %19, %20;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x56x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x56x16_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[14];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %16, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n56k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13},"
-      " %14,"
-      " %15,"
-      " p,   %17, %18, %19, %20;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x56x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x56x16_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[14];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %19, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n56k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13},"
-      "{%14, %15, %16, %17},"
-      " %18,"
-      " p,   %20, %21, %22;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x72x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x72x16_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[18];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %20, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n72k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17},"
-      " %18,"
-      " %19,"
-      " p,   %21, %22, %23, %24;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x72x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x72x16_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[18];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %23, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n72k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17},"
-      "{%18, %19, %20, %21},"
-      " %22,"
-      " p,   %24, %25, %26;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x80x16_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %22, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n80k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      " %20,"
-      " %21,"
-      " p,   %23, %24, %25, %26;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x80x16_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[20];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %25, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n80k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      "{%20, %21, %22, %23},"
-      " %24,"
-      " p,   %26, %27, %28;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x88x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x88x16_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[22];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %24, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n88k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21},"
-      " %22,"
-      " %23,"
-      " p,   %25, %26, %27, %28;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x88x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x88x16_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[22];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %27, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n88k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21},"
-      "{%22, %23, %24, %25},"
-      " %26,"
-      " p,   %28, %29, %30;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x104x16 F16+=F16*F16, SS form: A and B are both supplied as 64-bit operands (desc_a, desc_b); 26 accumulator registers are updated.
-template <
-  GMMA::Major tnspA,  // forwarded to the asm as an "n" (immediate) operand
-  GMMA::Major tnspB,  // forwarded to the asm as an "n" (immediate) operand
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand; defaults to ScaleIn::One
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // immediate operand; defaults to ScaleIn::One
->
-struct MMA_64x104x16_F16F16F16_SS
-{
-  using DRegisters = void;  // NOTE(review): void presumably means D reuses the C registers -- confirm with the consumer of these aliases
-  using ARegisters = uint64_t[1];  // one slot, matching the single desc_a argument of fma
-  using BRegisters = uint64_t[1];  // one slot, matching the single desc_b argument of fma
-  using CRegisters = uint32_t[26];  // one slot per accumulator argument d00..d25
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,  // bound to an "l" (64-bit) asm input
-      uint64_t const& desc_b,  // bound to an "l" (64-bit) asm input
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 inside the asm
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): helper defined outside this chunk; looks like a logging hook -- confirm
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"  // predicate register scoped to this braced asm block
-      "setp.ne.b32 p, %28, 0;\n"  // p = (scale_D != 0); scale_D is the input listed right after desc_a/desc_b
-      "wgmma.mma_async.sync.aligned.m64n104k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // accumulator operands, bound to d00..d25
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25},"
-      " %26,"  // desc_a
-      " %27,"  // desc_b
-      " p,   %29, %30, %31, %32;\n"  // p, then the four immediates in constraint order: scaleA, scaleB, tnspA, tnspB
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": every accumulator is both read and written
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25)
-      :  "l"(desc_a),  // input operands start here and are numbered after the accumulators
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled in only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x104x16 F16+=F16*F16, RS form: A is supplied in four 32-bit registers (a00..a03), B as a 64-bit operand (desc_b); 26 accumulator registers are updated.
-template <
-  GMMA::Major tnspA,  // only checked by the static_assert below; not passed to the asm
-  GMMA::Major tnspB,  // forwarded to the asm as an "n" (immediate) operand
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand; defaults to ScaleIn::One
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // immediate operand; defaults to ScaleIn::One
->
-struct MMA_64x104x16_F16F16F16_RS
-{
-  using DRegisters = void;  // NOTE(review): void presumably means D reuses the C registers -- confirm with the consumer of these aliases
-  using ARegisters = uint32_t[4];  // one slot per A argument a00..a03
-  using BRegisters = uint64_t[1];  // one slot, matching the single desc_b argument of fma
-  using CRegisters = uint32_t[26];  // one slot per accumulator argument d00..d25
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // bound to "r" (32-bit) asm inputs
-      uint64_t const& desc_b,  // bound to an "l" (64-bit) asm input
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 inside the asm
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): helper defined outside this chunk; looks like a logging hook -- confirm
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"  // predicate register scoped to this braced asm block
-      "setp.ne.b32 p, %31, 0;\n"  // p = (scale_D != 0); scale_D is the input listed right after desc_b
-      "wgmma.mma_async.sync.aligned.m64n104k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // accumulator operands, bound to d00..d25
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25},"
-      "{%26, %27, %28, %29},"  // a00..a03
-      " %30,"  // desc_b
-      " p,   %32, %33, %34;\n"  // p, then the three immediates in constraint order: scaleA, scaleB, tnspB
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": every accumulator is both read and written
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands start here and are numbered after the accumulators
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled in only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x16 F16+=F16*F16, SS form: A and B are both supplied as 64-bit operands (desc_a, desc_b); 28 accumulator registers are updated.
-template <
-  GMMA::Major tnspA,  // forwarded to the asm as an "n" (immediate) operand
-  GMMA::Major tnspB,  // forwarded to the asm as an "n" (immediate) operand
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand; defaults to ScaleIn::One
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // immediate operand; defaults to ScaleIn::One
->
-struct MMA_64x112x16_F16F16F16_SS
-{
-  using DRegisters = void;  // NOTE(review): void presumably means D reuses the C registers -- confirm with the consumer of these aliases
-  using ARegisters = uint64_t[1];  // one slot, matching the single desc_a argument of fma
-  using BRegisters = uint64_t[1];  // one slot, matching the single desc_b argument of fma
-  using CRegisters = uint32_t[28];  // one slot per accumulator argument d00..d27
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,  // bound to an "l" (64-bit) asm input
-      uint64_t const& desc_b,  // bound to an "l" (64-bit) asm input
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 inside the asm
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): helper defined outside this chunk; looks like a logging hook -- confirm
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"  // predicate register scoped to this braced asm block
-      "setp.ne.b32 p, %30, 0;\n"  // p = (scale_D != 0); scale_D is the input listed right after desc_a/desc_b
-      "wgmma.mma_async.sync.aligned.m64n112k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // accumulator operands, bound to d00..d27
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      " %28,"  // desc_a
-      " %29,"  // desc_b
-      " p,   %31, %32, %33, %34;\n"  // p, then the four immediates in constraint order: scaleA, scaleB, tnspA, tnspB
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": every accumulator is both read and written
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
-      :  "l"(desc_a),  // input operands start here and are numbered after the accumulators
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled in only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x16 F16+=F16*F16, RS form: A is supplied in four 32-bit registers (a00..a03), B as a 64-bit operand (desc_b); 28 accumulator registers are updated.
-template <
-  GMMA::Major tnspA,  // only checked by the static_assert below; not passed to the asm
-  GMMA::Major tnspB,  // forwarded to the asm as an "n" (immediate) operand
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand; defaults to ScaleIn::One
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // immediate operand; defaults to ScaleIn::One
->
-struct MMA_64x112x16_F16F16F16_RS
-{
-  using DRegisters = void;  // NOTE(review): void presumably means D reuses the C registers -- confirm with the consumer of these aliases
-  using ARegisters = uint32_t[4];  // one slot per A argument a00..a03
-  using BRegisters = uint64_t[1];  // one slot, matching the single desc_b argument of fma
-  using CRegisters = uint32_t[28];  // one slot per accumulator argument d00..d27
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // bound to "r" (32-bit) asm inputs
-      uint64_t const& desc_b,  // bound to an "l" (64-bit) asm input
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 inside the asm
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): helper defined outside this chunk; looks like a logging hook -- confirm
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"  // predicate register scoped to this braced asm block
-      "setp.ne.b32 p, %33, 0;\n"  // p = (scale_D != 0); scale_D is the input listed right after desc_b
-      "wgmma.mma_async.sync.aligned.m64n112k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // accumulator operands, bound to d00..d27
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      "{%28, %29, %30, %31},"  // a00..a03
-      " %32,"  // desc_b
-      " p,   %34, %35, %36;\n"  // p, then the three immediates in constraint order: scaleA, scaleB, tnspB
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": every accumulator is both read and written
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands start here and are numbered after the accumulators
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled in only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x120x16 F16+=F16*F16, SS form: A and B are both supplied as 64-bit operands (desc_a, desc_b); 30 accumulator registers are updated.
-template <
-  GMMA::Major tnspA,  // forwarded to the asm as an "n" (immediate) operand
-  GMMA::Major tnspB,  // forwarded to the asm as an "n" (immediate) operand
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand; defaults to ScaleIn::One
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // immediate operand; defaults to ScaleIn::One
->
-struct MMA_64x120x16_F16F16F16_SS
-{
-  using DRegisters = void;  // NOTE(review): void presumably means D reuses the C registers -- confirm with the consumer of these aliases
-  using ARegisters = uint64_t[1];  // one slot, matching the single desc_a argument of fma
-  using BRegisters = uint64_t[1];  // one slot, matching the single desc_b argument of fma
-  using CRegisters = uint32_t[30];  // one slot per accumulator argument d00..d29
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,  // bound to an "l" (64-bit) asm input
-      uint64_t const& desc_b,  // bound to an "l" (64-bit) asm input
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 inside the asm
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): helper defined outside this chunk; looks like a logging hook -- confirm
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"  // predicate register scoped to this braced asm block
-      "setp.ne.b32 p, %32, 0;\n"  // p = (scale_D != 0); scale_D is the input listed right after desc_a/desc_b
-      "wgmma.mma_async.sync.aligned.m64n120k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // accumulator operands, bound to d00..d29
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29},"
-      " %30,"  // desc_a
-      " %31,"  // desc_b
-      " p,   %33, %34, %35, %36;\n"  // p, then the four immediates in constraint order: scaleA, scaleB, tnspA, tnspB
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": every accumulator is both read and written
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29)
-      :  "l"(desc_a),  // input operands start here and are numbered after the accumulators
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled in only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x120x16 F16+=F16*F16, RS form: A is supplied in four 32-bit registers (a00..a03), B as a 64-bit operand (desc_b); 30 accumulator registers are updated.
-template <
-  GMMA::Major tnspA,  // only checked by the static_assert below; not passed to the asm
-  GMMA::Major tnspB,  // forwarded to the asm as an "n" (immediate) operand
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand; defaults to ScaleIn::One
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // immediate operand; defaults to ScaleIn::One
->
-struct MMA_64x120x16_F16F16F16_RS
-{
-  using DRegisters = void;  // NOTE(review): void presumably means D reuses the C registers -- confirm with the consumer of these aliases
-  using ARegisters = uint32_t[4];  // one slot per A argument a00..a03
-  using BRegisters = uint64_t[1];  // one slot, matching the single desc_b argument of fma
-  using CRegisters = uint32_t[30];  // one slot per accumulator argument d00..d29
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // bound to "r" (32-bit) asm inputs
-      uint64_t const& desc_b,  // bound to an "l" (64-bit) asm input
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 inside the asm
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): helper defined outside this chunk; looks like a logging hook -- confirm
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"  // predicate register scoped to this braced asm block
-      "setp.ne.b32 p, %35, 0;\n"  // p = (scale_D != 0); scale_D is the input listed right after desc_b
-      "wgmma.mma_async.sync.aligned.m64n120k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // accumulator operands, bound to d00..d29
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29},"
-      "{%30, %31, %32, %33},"  // a00..a03
-      " %34,"  // desc_b
-      " p,   %36, %37, %38;\n"  // p, then the three immediates in constraint order: scaleA, scaleB, tnspB
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": every accumulator is both read and written
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands start here and are numbered after the accumulators
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled in only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x136x16 F16+=F16*F16, SS form: A and B are both supplied as 64-bit operands (desc_a, desc_b); 34 accumulator registers are updated.
-template <
-  GMMA::Major tnspA,  // forwarded to the asm as an "n" (immediate) operand
-  GMMA::Major tnspB,  // forwarded to the asm as an "n" (immediate) operand
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand; defaults to ScaleIn::One
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // immediate operand; defaults to ScaleIn::One
->
-struct MMA_64x136x16_F16F16F16_SS
-{
-  using DRegisters = void;  // NOTE(review): void presumably means D reuses the C registers -- confirm with the consumer of these aliases
-  using ARegisters = uint64_t[1];  // one slot, matching the single desc_a argument of fma
-  using BRegisters = uint64_t[1];  // one slot, matching the single desc_b argument of fma
-  using CRegisters = uint32_t[34];  // one slot per accumulator argument d00..d33
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,  // bound to an "l" (64-bit) asm input
-      uint64_t const& desc_b,  // bound to an "l" (64-bit) asm input
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 inside the asm
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): helper defined outside this chunk; looks like a logging hook -- confirm
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"  // predicate register scoped to this braced asm block
-      "setp.ne.b32 p, %36, 0;\n"  // p = (scale_D != 0); scale_D is the input listed right after desc_a/desc_b
-      "wgmma.mma_async.sync.aligned.m64n136k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // accumulator operands, bound to d00..d33
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33},"
-      " %34,"  // desc_a
-      " %35,"  // desc_b
-      " p,   %37, %38, %39, %40;\n"  // p, then the four immediates in constraint order: scaleA, scaleB, tnspA, tnspB
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": every accumulator is both read and written
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33)
-      :  "l"(desc_a),  // input operands start here and are numbered after the accumulators
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled in only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x136x16 F16+=F16*F16, RS form: A is supplied in four 32-bit registers (a00..a03), B as a 64-bit operand (desc_b); 34 accumulator registers are updated.
-template <
-  GMMA::Major tnspA,  // only checked by the static_assert below; not passed to the asm
-  GMMA::Major tnspB,  // forwarded to the asm as an "n" (immediate) operand
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand; defaults to ScaleIn::One
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // immediate operand; defaults to ScaleIn::One
->
-struct MMA_64x136x16_F16F16F16_RS
-{
-  using DRegisters = void;  // NOTE(review): void presumably means D reuses the C registers -- confirm with the consumer of these aliases
-  using ARegisters = uint32_t[4];  // one slot per A argument a00..a03
-  using BRegisters = uint64_t[1];  // one slot, matching the single desc_b argument of fma
-  using CRegisters = uint32_t[34];  // one slot per accumulator argument d00..d33
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // bound to "r" (32-bit) asm inputs
-      uint64_t const& desc_b,  // bound to an "l" (64-bit) asm input
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 inside the asm
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): helper defined outside this chunk; looks like a logging hook -- confirm
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"  // predicate register scoped to this braced asm block
-      "setp.ne.b32 p, %39, 0;\n"  // p = (scale_D != 0); scale_D is the input listed right after desc_b
-      "wgmma.mma_async.sync.aligned.m64n136k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // accumulator operands, bound to d00..d33
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33},"
-      "{%34, %35, %36, %37},"  // a00..a03
-      " %38,"  // desc_b
-      " p,   %40, %41, %42;\n"  // p, then the three immediates in constraint order: scaleA, scaleB, tnspB
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": every accumulator is both read and written
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands start here and are numbered after the accumulators
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled in only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x16 F16+=F16*F16, SS form: A and B are both supplied as 64-bit operands (desc_a, desc_b); 36 accumulator registers are updated.
-template <
-  GMMA::Major tnspA,  // forwarded to the asm as an "n" (immediate) operand
-  GMMA::Major tnspB,  // forwarded to the asm as an "n" (immediate) operand
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand; defaults to ScaleIn::One
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // immediate operand; defaults to ScaleIn::One
->
-struct MMA_64x144x16_F16F16F16_SS
-{
-  using DRegisters = void;  // NOTE(review): void presumably means D reuses the C registers -- confirm with the consumer of these aliases
-  using ARegisters = uint64_t[1];  // one slot, matching the single desc_a argument of fma
-  using BRegisters = uint64_t[1];  // one slot, matching the single desc_b argument of fma
-  using CRegisters = uint32_t[36];  // one slot per accumulator argument d00..d35
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,  // bound to an "l" (64-bit) asm input
-      uint64_t const& desc_b,  // bound to an "l" (64-bit) asm input
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 inside the asm
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): helper defined outside this chunk; looks like a logging hook -- confirm
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"  // predicate register scoped to this braced asm block
-      "setp.ne.b32 p, %38, 0;\n"  // p = (scale_D != 0); scale_D is the input listed right after desc_a/desc_b
-      "wgmma.mma_async.sync.aligned.m64n144k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // accumulator operands, bound to d00..d35
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      " %36,"  // desc_a
-      " %37,"  // desc_b
-      " p,   %39, %40, %41, %42;\n"  // p, then the four immediates in constraint order: scaleA, scaleB, tnspA, tnspB
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": every accumulator is both read and written
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
-      :  "l"(desc_a),  // input operands start here and are numbered after the accumulators
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled in only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x16 F16+=F16*F16, RS form: A is supplied in four 32-bit registers (a00..a03), B as a 64-bit operand (desc_b); 36 accumulator registers are updated.
-template <
-  GMMA::Major tnspA,  // only checked by the static_assert below; not passed to the asm
-  GMMA::Major tnspB,  // forwarded to the asm as an "n" (immediate) operand
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand; defaults to ScaleIn::One
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // immediate operand; defaults to ScaleIn::One
->
-struct MMA_64x144x16_F16F16F16_RS
-{
-  using DRegisters = void;  // NOTE(review): void presumably means D reuses the C registers -- confirm with the consumer of these aliases
-  using ARegisters = uint32_t[4];  // one slot per A argument a00..a03
-  using BRegisters = uint64_t[1];  // one slot, matching the single desc_b argument of fma
-  using CRegisters = uint32_t[36];  // one slot per accumulator argument d00..d35
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // bound to "r" (32-bit) asm inputs
-      uint64_t const& desc_b,  // bound to an "l" (64-bit) asm input
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 inside the asm
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): helper defined outside this chunk; looks like a logging hook -- confirm
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"  // predicate register scoped to this braced asm block
-      "setp.ne.b32 p, %41, 0;\n"  // p = (scale_D != 0); scale_D is the input listed right after desc_b
-      "wgmma.mma_async.sync.aligned.m64n144k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // accumulator operands, bound to d00..d35
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      "{%36, %37, %38, %39},"  // a00..a03
-      " %40,"  // desc_b
-      " p,   %42, %43, %44;\n"  // p, then the three immediates in constraint order: scaleA, scaleB, tnspB
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": every accumulator is both read and written
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands start here and are numbered after the accumulators
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled in only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x152x16 F16+=F16*F16, SS form: A and B are both supplied as 64-bit operands (desc_a, desc_b); 38 accumulator registers are updated.
-template <
-  GMMA::Major tnspA,  // forwarded to the asm as an "n" (immediate) operand
-  GMMA::Major tnspB,  // forwarded to the asm as an "n" (immediate) operand
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand; defaults to ScaleIn::One
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // immediate operand; defaults to ScaleIn::One
->
-struct MMA_64x152x16_F16F16F16_SS
-{
-  using DRegisters = void;  // NOTE(review): void presumably means D reuses the C registers -- confirm with the consumer of these aliases
-  using ARegisters = uint64_t[1];  // one slot, matching the single desc_a argument of fma
-  using BRegisters = uint64_t[1];  // one slot, matching the single desc_b argument of fma
-  using CRegisters = uint32_t[38];  // one slot per accumulator argument d00..d37
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,  // bound to an "l" (64-bit) asm input
-      uint64_t const& desc_b,  // bound to an "l" (64-bit) asm input
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 inside the asm
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): helper defined outside this chunk; looks like a logging hook -- confirm
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"  // predicate register scoped to this braced asm block
-      "setp.ne.b32 p, %40, 0;\n"  // p = (scale_D != 0); scale_D is the input listed right after desc_a/desc_b
-      "wgmma.mma_async.sync.aligned.m64n152k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // accumulator operands, bound to d00..d37
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37},"
-      " %38,"  // desc_a
-      " %39,"  // desc_b
-      " p,   %41, %42, %43, %44;\n"  // p, then the four immediates in constraint order: scaleA, scaleB, tnspA, tnspB
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": every accumulator is both read and written
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37)
-      :  "l"(desc_a),  // input operands start here and are numbered after the accumulators
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled in only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x152x16 F16+=F16*F16, RS form: A is supplied in four 32-bit registers (a00..a03), B as a 64-bit operand (desc_b); 38 accumulator registers are updated.
-template <
-  GMMA::Major tnspA,  // only checked by the static_assert below; not passed to the asm
-  GMMA::Major tnspB,  // forwarded to the asm as an "n" (immediate) operand
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand; defaults to ScaleIn::One
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // immediate operand; defaults to ScaleIn::One
->
-struct MMA_64x152x16_F16F16F16_RS
-{
-  using DRegisters = void;  // NOTE(review): void presumably means D reuses the C registers -- confirm with the consumer of these aliases
-  using ARegisters = uint32_t[4];  // one slot per A argument a00..a03
-  using BRegisters = uint64_t[1];  // one slot, matching the single desc_b argument of fma
-  using CRegisters = uint32_t[38];  // one slot per accumulator argument d00..d37
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // bound to "r" (32-bit) asm inputs
-      uint64_t const& desc_b,  // bound to an "l" (64-bit) asm input
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 inside the asm
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): helper defined outside this chunk; looks like a logging hook -- confirm
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"  // predicate register scoped to this braced asm block
-      "setp.ne.b32 p, %43, 0;\n"  // p = (scale_D != 0); scale_D is the input listed right after desc_b
-      "wgmma.mma_async.sync.aligned.m64n152k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // accumulator operands, bound to d00..d37
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37},"
-      "{%38, %39, %40, %41},"  // a00..a03
-      " %42,"  // desc_b
-      " p,   %44, %45, %46;\n"  // p, then the three immediates in constraint order: scaleA, scaleB, tnspB
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": every accumulator is both read and written
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // input operands start here and are numbered after the accumulators
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled in only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x16 F16+=F16*F16, SS form: A and B are both supplied as 64-bit operands (desc_a, desc_b); 40 accumulator registers are updated.
-template <
-  GMMA::Major tnspA,  // forwarded to the asm as an "n" (immediate) operand
-  GMMA::Major tnspB,  // forwarded to the asm as an "n" (immediate) operand
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // immediate operand; defaults to ScaleIn::One
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // immediate operand; defaults to ScaleIn::One
->
-struct MMA_64x160x16_F16F16F16_SS
-{
-  using DRegisters = void;  // NOTE(review): void presumably means D reuses the C registers -- confirm with the consumer of these aliases
-  using ARegisters = uint64_t[1];  // one slot, matching the single desc_a argument of fma
-  using BRegisters = uint64_t[1];  // one slot, matching the single desc_b argument of fma
-  using CRegisters = uint32_t[40];  // one slot per accumulator argument d00..d39
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,  // bound to an "l" (64-bit) asm input
-      uint64_t const& desc_b,  // bound to an "l" (64-bit) asm input
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 inside the asm
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // NOTE(review): helper defined outside this chunk; looks like a logging hook -- confirm
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"  // predicate register scoped to this braced asm block
-      "setp.ne.b32 p, %42, 0;\n"  // p = (scale_D != 0); scale_D is the input listed right after desc_a/desc_b
-      "wgmma.mma_async.sync.aligned.m64n160k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // accumulator operands, bound to d00..d39
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      " %40,"  // desc_a
-      " %41,"  // desc_b
-      " p,   %43, %44, %45, %46;\n"  // p, then the four immediates in constraint order: scaleA, scaleB, tnspA, tnspB
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": every accumulator is both read and written
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "l"(desc_a),  // input operands start here and are numbered after the accumulators
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");  // compiled in only when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x160x16_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %45, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n160k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"
-      " %44,"
-      " p,   %46, %47, %48;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x168x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x168x16_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[42];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %44, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n168k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41},"
-      " %42,"
-      " %43,"
-      " p,   %45, %46, %47, %48;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x168x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x168x16_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[42];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %47, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n168k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41},"
-      "{%42, %43, %44, %45},"
-      " %46,"
-      " p,   %48, %49, %50;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x176x16_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %46, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n176k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      " %44,"
-      " %45,"
-      " p,   %47, %48, %49, %50;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x176x16_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[44];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %49, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n176k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      "{%44, %45, %46, %47},"
-      " %48,"
-      " p,   %50, %51, %52;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x184x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x184x16_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[46];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %48, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n184k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45},"
-      " %46,"
-      " %47,"
-      " p,   %49, %50, %51, %52;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x184x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x184x16_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[46];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %51, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n184k16.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45},"
-      "{%46, %47, %48, %49},"
-      " %50,"
-      " p,   %52, %53, %54;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x200x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x200x16_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[50];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %52, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n200k16.f16.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49},"
-      " %50,"
-      " %51,"
-      " p,    %53,  %54,  %55,  %56;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x200x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x200x16_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[50];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %55, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n200k16.f16.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49},"
-      "{%50,  %51,  %52,  %53},"
-      " %54,"
-      " p,    %56,  %57,  %58;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x208x16_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[52];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %54, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n208k16.f16.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      " %52,"
-      " %53,"
-      " p,    %55,  %56,  %57,  %58;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x208x16_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[52];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %57, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n208k16.f16.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      "{%52,  %53,  %54,  %55},"
-      " %56,"
-      " p,    %58,  %59,  %60;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x216x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x216x16_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[54];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %56, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n216k16.f16.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53},"
-      " %54,"
-      " %55,"
-      " p,    %57,  %58,  %59,  %60;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x216x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x216x16_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[54];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %59, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n216k16.f16.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53},"
-      "{%54,  %55,  %56,  %57},"
-      " %58,"
-      " p,    %60,  %61,  %62;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x224x16_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %58, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n224k16.f16.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      " %56,"
-      " %57,"
-      " p,    %59,  %60,  %61,  %62;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x224x16_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %61, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n224k16.f16.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"
-      " %60,"
-      " p,    %62,  %63,  %64;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x232x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x232x16_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[58];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %60, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n232k16.f16.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57},"
-      " %58,"
-      " %59,"
-      " p,    %61,  %62,  %63,  %64;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x232x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x232x16_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[58];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %63, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n232k16.f16.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57},"
-      "{%58,  %59,  %60,  %61},"
-      " %62,"
-      " p,    %64,  %65,  %66;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x240x16_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %62, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n240k16.f16.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      " %60,"
-      " %61,"
-      " p,    %63,  %64,  %65,  %66;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x240x16_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[60];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %65, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n240k16.f16.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      "{%60,  %61,  %62,  %63},"
-      " %64,"
-      " p,    %66,  %67,  %68;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x248x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x248x16_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[62];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %64, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n248k16.f16.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61},"
-      " %62,"
-      " %63,"
-      " p,    %65,  %66,  %67,  %68;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x16_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x248x16 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x248x16_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[62];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %67, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n248k16.f16.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61},"
-      "{%62,  %63,  %64,  %65},"
-      " %66,"
-      " p,    %68,  %69,  %70;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x16_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x24x16_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %14, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n24k16.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      " %12,"
-      " %13,"
-      " p,   %15, %16, %17, %18;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x24x16_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[12];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %17, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n24k16.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " p,   %18, %19, %20;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x40x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x40x16_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %22, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n40k16.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      " %20,"
-      " %21,"
-      " p,   %23, %24, %25, %26;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x40x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x40x16_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[20];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %25, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n40k16.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      "{%20, %21, %22, %23},"
-      " %24,"
-      " p,   %26, %27, %28;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x48x16_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %26, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n48k16.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      " %24,"
-      " %25,"
-      " p,   %27, %28, %29, %30;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x48x16_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[24];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %29, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n48k16.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " p,   %30, %31, %32;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x56x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x56x16_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %30, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n56k16.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      " %28,"
-      " %29,"
-      " p,   %31, %32, %33, %34;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x56x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x56x16_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[28];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %33, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n56k16.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      "{%28, %29, %30, %31},"
-      " %32,"
-      " p,   %34, %35, %36;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x72x16 F32+=F16*F16 -- SS variant: issues one wgmma.mma_async m64n72k16 with A and B both supplied as 64-bit descriptors; the 36 float accumulators are read-write.
-template <
-  GMMA::Major tnspA,  // major-ness selector for A; emitted as an immediate operand
-  GMMA::Major tnspB,  // major-ness selector for B; emitted as an immediate operand
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // input scale for A; emitted as an immediate operand
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // input scale for B; emitted as an immediate operand
->
-struct MMA_64x72x16_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];  // A is a single 64-bit descriptor
-  using BRegisters = uint64_t[1];  // B is a single 64-bit descriptor
-  using CRegisters = float[36];  // one entry per accumulator argument d00..d35 of fma()
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p inside the asm
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the source line and both descriptors
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"  // predicate register declared inside this { } block
-      "setp.ne.b32 p, %38, 0;\n"  // p = (scale_D != 0); %38 is the "r" input that follows the two descriptors
-      "wgmma.mma_async.sync.aligned.m64n72k16.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      " %36,"  // desc_a
-      " %37,"  // desc_b
-      " p,   %39, %40, %41, %42;\n"  // p (scale-d operand, see PTX ISA), then scaleA, scaleB, tnspA, tnspB
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0-%35: accumulators, bound read-write ("+f")
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
-      :  "l"(desc_a),  // %36
-         "l"(desc_b),  // %37
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %38 in a register; %39-%42 are compile-time immediates ("n")
-#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x72x16 F32+=F16*F16 -- RS variant: issues one wgmma.mma_async m64n72k16 with A supplied in four 32-bit registers and B as a 64-bit descriptor; the 36 float accumulators are read-write.
-template <
-  GMMA::Major tnspA,  // must be GMMA::Major::K (enforced by the static_assert below); not passed to the instruction
-  GMMA::Major tnspB,  // major-ness selector for B; emitted as an immediate operand
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // input scale for A; emitted as an immediate operand
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // input scale for B; emitted as an immediate operand
->
-struct MMA_64x72x16_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];  // A is four 32-bit register values (a00..a03)
-  using BRegisters = uint64_t[1];  // B is a single 64-bit descriptor
-  using CRegisters = float[36];  // one entry per accumulator argument d00..d35 of fma()
-
-  static_assert(tnspA == GMMA::Major::K,  // compile-time rejection of any other A layout
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand registers
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p inside the asm
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the source line and the B descriptor
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"  // predicate register declared inside this { } block
-      "setp.ne.b32 p, %41, 0;\n"  // p = (scale_D != 0); %41 is the "r" input that follows desc_b
-      "wgmma.mma_async.sync.aligned.m64n72k16.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      "{%36, %37, %38, %39},"  // a00..a03
-      " %40,"  // desc_b
-      " p,   %42, %43, %44;\n"  // p (scale-d operand, see PTX ISA), then scaleA, scaleB, tnspB -- no tnspA operand in this form
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0-%35: accumulators, bound read-write ("+f")
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %36-%39
-         "l"(desc_b),  // %40
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %41 in a register; %42-%44 are compile-time immediates ("n")
-#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x16 F32+=F16*F16 -- SS variant: issues one wgmma.mma_async m64n80k16 with A and B both supplied as 64-bit descriptors; the 40 float accumulators are read-write.
-template <
-  GMMA::Major tnspA,  // major-ness selector for A; emitted as an immediate operand
-  GMMA::Major tnspB,  // major-ness selector for B; emitted as an immediate operand
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // input scale for A; emitted as an immediate operand
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // input scale for B; emitted as an immediate operand
->
-struct MMA_64x80x16_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];  // A is a single 64-bit descriptor
-  using BRegisters = uint64_t[1];  // B is a single 64-bit descriptor
-  using CRegisters = float[40];  // one entry per accumulator argument d00..d39 of fma()
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p inside the asm
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the source line and both descriptors
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"  // predicate register declared inside this { } block
-      "setp.ne.b32 p, %42, 0;\n"  // p = (scale_D != 0); %42 is the "r" input that follows the two descriptors
-      "wgmma.mma_async.sync.aligned.m64n80k16.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      " %40,"  // desc_a
-      " %41,"  // desc_b
-      " p,   %43, %44, %45, %46;\n"  // p (scale-d operand, see PTX ISA), then scaleA, scaleB, tnspA, tnspB
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0-%39: accumulators, bound read-write ("+f")
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
-      :  "l"(desc_a),  // %40
-         "l"(desc_b),  // %41
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %42 in a register; %43-%46 are compile-time immediates ("n")
-#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x16 F32+=F16*F16 -- RS variant: issues one wgmma.mma_async m64n80k16 with A supplied in four 32-bit registers and B as a 64-bit descriptor; the 40 float accumulators are read-write.
-template <
-  GMMA::Major tnspA,  // must be GMMA::Major::K (enforced by the static_assert below); not passed to the instruction
-  GMMA::Major tnspB,  // major-ness selector for B; emitted as an immediate operand
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // input scale for A; emitted as an immediate operand
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // input scale for B; emitted as an immediate operand
->
-struct MMA_64x80x16_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];  // A is four 32-bit register values (a00..a03)
-  using BRegisters = uint64_t[1];  // B is a single 64-bit descriptor
-  using CRegisters = float[40];  // one entry per accumulator argument d00..d39 of fma()
-
-  static_assert(tnspA == GMMA::Major::K,  // compile-time rejection of any other A layout
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand registers
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p inside the asm
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the source line and the B descriptor
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"  // predicate register declared inside this { } block
-      "setp.ne.b32 p, %45, 0;\n"  // p = (scale_D != 0); %45 is the "r" input that follows desc_b
-      "wgmma.mma_async.sync.aligned.m64n80k16.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"  // a00..a03
-      " %44,"  // desc_b
-      " p,   %46, %47, %48;\n"  // p (scale-d operand, see PTX ISA), then scaleA, scaleB, tnspB -- no tnspA operand in this form
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0-%39: accumulators, bound read-write ("+f")
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %40-%43
-         "l"(desc_b),  // %44
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %45 in a register; %46-%48 are compile-time immediates ("n")
-#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x88x16 F32+=F16*F16 -- SS variant: issues one wgmma.mma_async m64n88k16 with A and B both supplied as 64-bit descriptors; the 44 float accumulators are read-write.
-template <
-  GMMA::Major tnspA,  // major-ness selector for A; emitted as an immediate operand
-  GMMA::Major tnspB,  // major-ness selector for B; emitted as an immediate operand
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // input scale for A; emitted as an immediate operand
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // input scale for B; emitted as an immediate operand
->
-struct MMA_64x88x16_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];  // A is a single 64-bit descriptor
-  using BRegisters = uint64_t[1];  // B is a single 64-bit descriptor
-  using CRegisters = float[44];  // one entry per accumulator argument d00..d43 of fma()
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p inside the asm
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the source line and both descriptors
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"  // predicate register declared inside this { } block
-      "setp.ne.b32 p, %46, 0;\n"  // p = (scale_D != 0); %46 is the "r" input that follows the two descriptors
-      "wgmma.mma_async.sync.aligned.m64n88k16.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      " %44,"  // desc_a
-      " %45,"  // desc_b
-      " p,   %47, %48, %49, %50;\n"  // p (scale-d operand, see PTX ISA), then scaleA, scaleB, tnspA, tnspB
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0-%43: accumulators, bound read-write ("+f")
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
-      :  "l"(desc_a),  // %44
-         "l"(desc_b),  // %45
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %46 in a register; %47-%50 are compile-time immediates ("n")
-#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x88x16 F32+=F16*F16 -- RS variant: issues one wgmma.mma_async m64n88k16 with A supplied in four 32-bit registers and B as a 64-bit descriptor; the 44 float accumulators are read-write.
-template <
-  GMMA::Major tnspA,  // must be GMMA::Major::K (enforced by the static_assert below); not passed to the instruction
-  GMMA::Major tnspB,  // major-ness selector for B; emitted as an immediate operand
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // input scale for A; emitted as an immediate operand
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // input scale for B; emitted as an immediate operand
->
-struct MMA_64x88x16_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];  // A is four 32-bit register values (a00..a03)
-  using BRegisters = uint64_t[1];  // B is a single 64-bit descriptor
-  using CRegisters = float[44];  // one entry per accumulator argument d00..d43 of fma()
-
-  static_assert(tnspA == GMMA::Major::K,  // compile-time rejection of any other A layout
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand registers
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p inside the asm
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the source line and the B descriptor
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"  // predicate register declared inside this { } block
-      "setp.ne.b32 p, %49, 0;\n"  // p = (scale_D != 0); %49 is the "r" input that follows desc_b
-      "wgmma.mma_async.sync.aligned.m64n88k16.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      "{%44, %45, %46, %47},"  // a00..a03
-      " %48,"  // desc_b
-      " p,   %50, %51, %52;\n"  // p (scale-d operand, see PTX ISA), then scaleA, scaleB, tnspB -- no tnspA operand in this form
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0-%43: accumulators, bound read-write ("+f")
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %44-%47
-         "l"(desc_b),  // %48
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %49 in a register; %50-%52 are compile-time immediates ("n")
-#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x104x16 F32+=F16*F16 -- SS variant: issues one wgmma.mma_async m64n104k16 with A and B both supplied as 64-bit descriptors; the 52 float accumulators are read-write.
-template <
-  GMMA::Major tnspA,  // major-ness selector for A; emitted as an immediate operand
-  GMMA::Major tnspB,  // major-ness selector for B; emitted as an immediate operand
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // input scale for A; emitted as an immediate operand
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // input scale for B; emitted as an immediate operand
->
-struct MMA_64x104x16_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];  // A is a single 64-bit descriptor
-  using BRegisters = uint64_t[1];  // B is a single 64-bit descriptor
-  using CRegisters = float[52];  // one entry per accumulator argument d00..d51 of fma()
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p inside the asm
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the source line and both descriptors
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"  // predicate register declared inside this { } block
-      "setp.ne.b32 p, %54, 0;\n"  // p = (scale_D != 0); %54 is the "r" input that follows the two descriptors
-      "wgmma.mma_async.sync.aligned.m64n104k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      " %52,"  // desc_a
-      " %53,"  // desc_b
-      " p,    %55,  %56,  %57,  %58;\n"  // p (scale-d operand, see PTX ISA), then scaleA, scaleB, tnspA, tnspB
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0-%51: accumulators, bound read-write ("+f")
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
-      :  "l"(desc_a),  // %52
-         "l"(desc_b),  // %53
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %54 in a register; %55-%58 are compile-time immediates ("n")
-#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x104x16 F32+=F16*F16 -- RS variant: issues one wgmma.mma_async m64n104k16 with A supplied in four 32-bit registers and B as a 64-bit descriptor; the 52 float accumulators are read-write.
-template <
-  GMMA::Major tnspA,  // must be GMMA::Major::K (enforced by the static_assert below); not passed to the instruction
-  GMMA::Major tnspB,  // major-ness selector for B; emitted as an immediate operand
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // input scale for A; emitted as an immediate operand
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // input scale for B; emitted as an immediate operand
->
-struct MMA_64x104x16_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];  // A is four 32-bit register values (a00..a03)
-  using BRegisters = uint64_t[1];  // B is a single 64-bit descriptor
-  using CRegisters = float[52];  // one entry per accumulator argument d00..d51 of fma()
-
-  static_assert(tnspA == GMMA::Major::K,  // compile-time rejection of any other A layout
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand registers
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p inside the asm
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the source line and the B descriptor
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"  // predicate register declared inside this { } block
-      "setp.ne.b32 p, %57, 0;\n"  // p = (scale_D != 0); %57 is the "r" input that follows desc_b
-      "wgmma.mma_async.sync.aligned.m64n104k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      "{%52,  %53,  %54,  %55},"  // a00..a03
-      " %56,"  // desc_b
-      " p,    %58,  %59,  %60;\n"  // p (scale-d operand, see PTX ISA), then scaleA, scaleB, tnspB -- no tnspA operand in this form
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0-%51: accumulators, bound read-write ("+f")
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %52-%55
-         "l"(desc_b),  // %56
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %57 in a register; %58-%60 are compile-time immediates ("n")
-#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x16 F32+=F16*F16 -- SS variant: issues one wgmma.mma_async m64n112k16 with A and B both supplied as 64-bit descriptors; the 56 float accumulators are read-write.
-template <
-  GMMA::Major tnspA,  // major-ness selector for A; emitted as an immediate operand
-  GMMA::Major tnspB,  // major-ness selector for B; emitted as an immediate operand
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // input scale for A; emitted as an immediate operand
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // input scale for B; emitted as an immediate operand
->
-struct MMA_64x112x16_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];  // A is a single 64-bit descriptor
-  using BRegisters = uint64_t[1];  // B is a single 64-bit descriptor
-  using CRegisters = float[56];  // one entry per accumulator argument d00..d55 of fma()
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p inside the asm
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook (defined elsewhere); receives the source line and both descriptors
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"  // predicate register declared inside this { } block
-      "setp.ne.b32 p, %58, 0;\n"  // p = (scale_D != 0); %58 is the "r" input that follows the two descriptors
-      "wgmma.mma_async.sync.aligned.m64n112k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      " %56,"  // desc_a
-      " %57,"  // desc_b
-      " p,    %59,  %60,  %61,  %62;\n"  // p (scale-d operand, see PTX ISA), then scaleA, scaleB, tnspA, tnspB
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0-%55: accumulators, bound read-write ("+f")
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
-      :  "l"(desc_a),  // %56
-         "l"(desc_b),  // %57
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));  // %58 in a register; %59-%62 are compile-time immediates ("n")
-#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x16 F32+=F16*F16 -- RS variant: issues one wgmma.mma_async m64n112k16 with A supplied in four 32-bit registers and B as a 64-bit descriptor; the 56 float accumulators are read-write.
-template <
-  GMMA::Major tnspA,  // must be GMMA::Major::K (enforced by the static_assert below); not passed to the instruction
-  GMMA::Major tnspB,  // major-ness selector for B; emitted as an immediate operand
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // input scale for A; emitted as an immediate operand
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // input scale for B; emitted as an immediate operand
->
-struct MMA_64x112x16_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];  // A is four 32-bit register values (a00..a03)
-  using BRegisters = uint64_t[1];  // B is a single 64-bit descriptor
-  using CRegisters = float[56];  // one entry per accumulator argument d00..d55 of fma()
-
-  static_assert(tnspA == GMMA::Major::K,  // compile-time rejection of any other A layout
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,  // A operand registers
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; converted to predicate p inside the asm
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (defined elsewhere); receives the source line and the B descriptor
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"  // predicate register declared inside this { } block
-      "setp.ne.b32 p, %61, 0;\n"  // p = (scale_D != 0); %61 is the "r" input that follows desc_b
-      "wgmma.mma_async.sync.aligned.m64n112k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"  // a00..a03
-      " %60,"  // desc_b
-      " p,    %62,  %63,  %64;\n"  // p (scale-d operand, see PTX ISA), then scaleA, scaleB, tnspB -- no tnspA operand in this form
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),  // %0-%55: accumulators, bound read-write ("+f")
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %56-%59
-         "l"(desc_b),  // %60
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));  // %61 in a register; %62-%64 are compile-time immediates ("n")
-#else  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x120x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x120x16_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %62, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n120k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      " %60,"
-      " %61,"
-      " p,    %63,  %64,  %65,  %66;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x120x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x120x16_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[60];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %65, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n120k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      "{%60,  %61,  %62,  %63},"
-      " %64,"
-      " p,    %66,  %67,  %68;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x136x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x136x16_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[68];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %70, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n136k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67},"
-      " %68,"
-      " %69,"
-      " p,    %71,  %72,  %73,  %74;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x136x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x136x16_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[68];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %73, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n136k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67},"
-      "{%68,  %69,  %70,  %71},"
-      " %72,"
-      " p,    %74,  %75,  %76;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x144x16_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %74, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n144k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      " %72,"
-      " %73,"
-      " p,    %75,  %76,  %77,  %78;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x144x16_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[72];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %77, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n144k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      "{%72,  %73,  %74,  %75},"
-      " %76,"
-      " p,    %78,  %79,  %80;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x152x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x152x16_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[76];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %78, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n152k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75},"
-      " %76,"
-      " %77,"
-      " p,    %79,  %80,  %81,  %82;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x152x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x152x16_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[76];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %81, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n152k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75},"
-      "{%76,  %77,  %78,  %79},"
-      " %80,"
-      " p,    %82,  %83,  %84;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x160x16_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %82, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n160k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      " %80,"
-      " %81,"
-      " p,    %83,  %84,  %85,  %86;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x160x16_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[80];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %85, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n160k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      "{%80,  %81,  %82,  %83},"
-      " %84,"
-      " p,    %86,  %87,  %88;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x168x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x168x16_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[84];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %86, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n168k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83},"
-      " %84,"
-      " %85,"
-      " p,    %87,  %88,  %89,  %90;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x168x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x168x16_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[84];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %89, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n168k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83},"
-      "{%84,  %85,  %86,  %87},"
-      " %88,"
-      " p,    %90,  %91,  %92;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x176x16_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %90, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n176k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      " %88,"
-      " %89,"
-      " p,    %91,  %92,  %93,  %94;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x176x16_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[88];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %93, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n176k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      "{%88,  %89,  %90,  %91},"
-      " %92,"
-      " p,    %94,  %95,  %96;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x184x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x184x16_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[92];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %94, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n184k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91},"
-      " %92,"
-      " %93,"
-      " p,    %95,  %96,  %97,  %98;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x184x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x184x16_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[92];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %97, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n184k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91},"
-      "{%92,  %93,  %94,  %95},"
-      " %96,"
-      " p,    %98,  %99,  %100;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x200x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x200x16_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[100];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %102, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n200k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99},"
-      " %100,"
-      " %101,"
-      " p,    %103, %104, %105, %106;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x200x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x200x16_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[100];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %105, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n200k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99},"
-      "{%100, %101, %102, %103},"
-      " %104,"
-      " p,    %106, %107, %108;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x208x16_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %106, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n208k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      " %104,"
-      " %105,"
-      " p,    %107, %108, %109, %110;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x208x16_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[104];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %109, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n208k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      "{%104, %105, %106, %107},"
-      " %108,"
-      " p,    %110, %111, %112;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x216x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x216x16_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[108];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %110, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n216k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107},"
-      " %108,"
-      " %109,"
-      " p,    %111, %112, %113, %114;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x216x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x216x16_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[108];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %113, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n216k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107},"
-      "{%108, %109, %110, %111},"
-      " %112,"
-      " p,    %114, %115, %116;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x224x16_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %114, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n224k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      " %112,"
-      " %113,"
-      " p,    %115, %116, %117, %118;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x224x16_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[112];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %117, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n224k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      "{%112, %113, %114, %115},"
-      " %116,"
-      " p,    %118, %119, %120;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x232x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x232x16_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[116];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %118, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n232k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115},"
-      " %116,"
-      " %117,"
-      " p,    %119, %120, %121, %122;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x232x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x232x16_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[116];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %121, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n232k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115},"
-      "{%116, %117, %118, %119},"
-      " %120,"
-      " p,    %122, %123, %124;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x240x16_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %122, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n240k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      " %120,"
-      " %121,"
-      " p,    %123, %124, %125, %126;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x240x16_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[120];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %125, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n240k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      "{%120, %121, %122, %123},"
-      " %124,"
-      " p,    %126, %127, %128;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x248x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x248x16_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[124];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %126, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n248k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123},"
-      " %124,"
-      " %125,"
-      " p,    %127, %128, %129, %130;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x16_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x248x16 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x248x16_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[124];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %129, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n248k16.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123},"
-      "{%124, %125, %126, %127},"
-      " %128,"
-      " p,    %130, %131, %132;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x16_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x24x16_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %14, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n24k16.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      " %12,"
-      " %13,"
-      " p,   %15, %16, %17, %18;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x24x16_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[12];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %17, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n24k16.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " p,   %18, %19, %20;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x40x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x40x16_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %22, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n40k16.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      " %20,"
-      " %21,"
-      " p,   %23, %24, %25, %26;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x40x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x40x16_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[20];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %25, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n40k16.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      "{%20, %21, %22, %23},"
-      " %24,"
-      " p,   %26, %27, %28;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x48x16_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %26, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n48k16.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      " %24,"
-      " %25,"
-      " p,   %27, %28, %29, %30;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x48x16_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[24];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %29, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n48k16.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " p,   %30, %31, %32;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x56x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x56x16_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %30, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n56k16.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      " %28,"
-      " %29,"
-      " p,   %31, %32, %33, %34;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x56x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x56x16_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[28];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %33, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n56k16.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      "{%28, %29, %30, %31},"
-      " %32,"
-      " p,   %34, %35, %36;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x72x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x72x16_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[36];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %38, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n72k16.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      " %36,"
-      " %37,"
-      " p,   %39, %40, %41, %42;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x72x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x72x16_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[36];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %41, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n72k16.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      "{%36, %37, %38, %39},"
-      " %40,"
-      " p,   %42, %43, %44;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x80x16_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %42, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n80k16.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      " %40,"
-      " %41,"
-      " p,   %43, %44, %45, %46;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x80x16_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[40];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %45, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n80k16.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"
-      " %44,"
-      " p,   %46, %47, %48;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x88x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x88x16_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %46, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n88k16.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      " %44,"
-      " %45,"
-      " p,   %47, %48, %49, %50;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x88x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x88x16_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[44];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %49, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n88k16.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      "{%44, %45, %46, %47},"
-      " %48,"
-      " p,   %50, %51, %52;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x104x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x104x16_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[52];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %54, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n104k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      " %52,"
-      " %53,"
-      " p,    %55,  %56,  %57,  %58;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x104x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x104x16_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[52];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %57, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n104k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      "{%52,  %53,  %54,  %55},"
-      " %56,"
-      " p,    %58,  %59,  %60;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x112x16_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %58, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n112k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      " %56,"
-      " %57,"
-      " p,    %59,  %60,  %61,  %62;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x112x16_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[56];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %61, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n112k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"
-      " %60,"
-      " p,    %62,  %63,  %64;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x120x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x120x16_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %62, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n120k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      " %60,"
-      " %61,"
-      " p,    %63,  %64,  %65,  %66;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x120x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x120x16_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[60];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %65, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n120k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      "{%60,  %61,  %62,  %63},"
-      " %64,"
-      " p,    %66,  %67,  %68;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x136x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x136x16_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[68];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %70, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n136k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67},"
-      " %68,"
-      " %69,"
-      " p,    %71,  %72,  %73,  %74;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x136x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x136x16_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[68];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %73, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n136k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67},"
-      "{%68,  %69,  %70,  %71},"
-      " %72,"
-      " p,    %74,  %75,  %76;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x144x16_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %74, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n144k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      " %72,"
-      " %73,"
-      " p,    %75,  %76,  %77,  %78;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x144x16_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[72];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %77, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n144k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      "{%72,  %73,  %74,  %75},"
-      " %76,"
-      " p,    %78,  %79,  %80;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x152x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x152x16_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[76];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %78, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n152k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75},"
-      " %76,"
-      " %77,"
-      " p,    %79,  %80,  %81,  %82;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x152x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x152x16_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[76];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %81, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n152k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75},"
-      "{%76,  %77,  %78,  %79},"
-      " %80,"
-      " p,    %82,  %83,  %84;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x160x16_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %82, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n160k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      " %80,"
-      " %81,"
-      " p,    %83,  %84,  %85,  %86;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x160x16_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[80];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %85, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n160k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      "{%80,  %81,  %82,  %83},"
-      " %84,"
-      " p,    %86,  %87,  %88;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x168x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x168x16_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[84];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %86, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n168k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83},"
-      " %84,"
-      " %85,"
-      " p,    %87,  %88,  %89,  %90;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x168x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x168x16_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[84];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %89, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n168k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83},"
-      "{%84,  %85,  %86,  %87},"
-      " %88,"
-      " p,    %90,  %91,  %92;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x176x16_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %90, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n176k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      " %88,"
-      " %89,"
-      " p,    %91,  %92,  %93,  %94;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x176x16_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[88];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %93, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n176k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      "{%88,  %89,  %90,  %91},"
-      " %92,"
-      " p,    %94,  %95,  %96;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x184x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x184x16_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[92];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %94, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n184k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91},"
-      " %92,"
-      " %93,"
-      " p,    %95,  %96,  %97,  %98;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x184x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x184x16_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[92];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %97, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n184k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91},"
-      "{%92,  %93,  %94,  %95},"
-      " %96,"
-      " p,    %98,  %99,  %100;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x200x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x200x16_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[100];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %102, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n200k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99},"
-      " %100,"
-      " %101,"
-      " p,    %103, %104, %105, %106;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x200x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x200x16_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[100];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %105, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n200k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99},"
-      "{%100, %101, %102, %103},"
-      " %104,"
-      " p,    %106, %107, %108;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x208x16_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %106, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n208k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      " %104,"
-      " %105,"
-      " p,    %107, %108, %109, %110;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x208x16_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[104];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %109, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n208k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      "{%104, %105, %106, %107},"
-      " %108,"
-      " p,    %110, %111, %112;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x216x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x216x16_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[108];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %110, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n216k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107},"
-      " %108,"
-      " %109,"
-      " p,    %111, %112, %113, %114;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x216x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x216x16_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[108];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %113, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n216k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107},"
-      "{%108, %109, %110, %111},"
-      " %112,"
-      " p,    %114, %115, %116;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x224x16_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %114, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n224k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      " %112,"
-      " %113,"
-      " p,    %115, %116, %117, %118;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x224x16_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[112];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %117, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n224k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      "{%112, %113, %114, %115},"
-      " %116,"
-      " p,    %118, %119, %120;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x232x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x232x16_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[116];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %118, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n232k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115},"
-      " %116,"
-      " %117,"
-      " p,    %119, %120, %121, %122;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x232x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x232x16_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[116];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %121, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n232k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115},"
-      "{%116, %117, %118, %119},"
-      " %120,"
-      " p,    %122, %123, %124;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x240x16_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %122, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n240k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      " %120,"
-      " %121,"
-      " p,    %123, %124, %125, %126;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x240x16_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[120];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %125, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n240k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      "{%120, %121, %122, %123},"
-      " %124,"
-      " p,    %126, %127, %128;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x248x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x248x16_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[124];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %126, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n248k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123},"
-      " %124,"
-      " %125,"
-      " p,    %127, %128, %129, %130;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x16_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x248x16 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x248x16_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[124];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %129, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n248k16.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123},"
-      "{%124, %125, %126, %127},"
-      " %128,"
-      " p,    %130, %131, %132;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x16_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x24x8_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %14, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n24k8.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      " %12,"
-      " %13,"
-      " p,   %15, %16;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x24x8_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %17, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n24k8.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " p,   %18, %19;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x40x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x40x8_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %22, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n40k8.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      " %20,"
-      " %21,"
-      " p,   %23, %24;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x40x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x40x8_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %25, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n40k8.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      "{%20, %21, %22, %23},"
-      " %24,"
-      " p,   %26, %27;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x48x8_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %26, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n48k8.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      " %24,"
-      " %25,"
-      " p,   %27, %28;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x48x8_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %29, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n48k8.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " p,   %30, %31;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x56x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x56x8_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %30, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n56k8.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      " %28,"
-      " %29,"
-      " p,   %31, %32;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x56x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x56x8_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %33, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n56k8.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      "{%28, %29, %30, %31},"
-      " %32,"
-      " p,   %34, %35;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x72x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x72x8_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[36];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %38, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n72k8.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      " %36,"
-      " %37,"
-      " p,   %39, %40;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x72x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x72x8_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[36];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %41, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n72k8.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      "{%36, %37, %38, %39},"
-      " %40,"
-      " p,   %42, %43;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x80x8_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %42, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n80k8.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      " %40,"
-      " %41,"
-      " p,   %43, %44;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x80x8_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %45, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n80k8.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"
-      " %44,"
-      " p,   %46, %47;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x88x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x88x8_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %46, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n88k8.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      " %44,"
-      " %45,"
-      " p,   %47, %48;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x88x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x88x8_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %49, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n88k8.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      "{%44, %45, %46, %47},"
-      " %48,"
-      " p,   %50, %51;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x104x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x104x8_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[52];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %54, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n104k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      " %52,"
-      " %53,"
-      " p,    %55,  %56;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x104x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x104x8_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[52];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %57, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n104k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      "{%52,  %53,  %54,  %55},"
-      " %56,"
-      " p,    %58,  %59;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x112x8_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %58, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n112k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      " %56,"
-      " %57,"
-      " p,    %59,  %60;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x112x8_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %61, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n112k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"
-      " %60,"
-      " p,    %62,  %63;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x120x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x120x8_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %62, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n120k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      " %60,"
-      " %61,"
-      " p,    %63,  %64;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x120x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x120x8_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %65, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n120k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      "{%60,  %61,  %62,  %63},"
-      " %64,"
-      " p,    %66,  %67;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x136x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x136x8_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[68];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %70, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n136k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67},"
-      " %68,"
-      " %69,"
-      " p,    %71,  %72;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x136x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x136x8_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[68];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %73, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n136k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67},"
-      "{%68,  %69,  %70,  %71},"
-      " %72,"
-      " p,    %74,  %75;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x144x8_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %74, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n144k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      " %72,"
-      " %73,"
-      " p,    %75,  %76;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x144x8_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %77, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n144k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      "{%72,  %73,  %74,  %75},"
-      " %76,"
-      " p,    %78,  %79;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x152x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x152x8_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[76];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %78, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n152k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75},"
-      " %76,"
-      " %77,"
-      " p,    %79,  %80;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x152x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x152x8_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[76];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %81, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n152k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75},"
-      "{%76,  %77,  %78,  %79},"
-      " %80,"
-      " p,    %82,  %83;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x160x8_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %82, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n160k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      " %80,"
-      " %81,"
-      " p,    %83,  %84;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x160x8_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %85, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n160k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      "{%80,  %81,  %82,  %83},"
-      " %84,"
-      " p,    %86,  %87;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x168x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x168x8_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[84];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %86, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n168k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83},"
-      " %84,"
-      " %85,"
-      " p,    %87,  %88;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x168x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x168x8_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[84];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %89, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n168k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83},"
-      "{%84,  %85,  %86,  %87},"
-      " %88,"
-      " p,    %90,  %91;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x176x8_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %90, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n176k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      " %88,"
-      " %89,"
-      " p,    %91,  %92;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x176x8_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %93, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n176k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      "{%88,  %89,  %90,  %91},"
-      " %92,"
-      " p,    %94,  %95;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x184x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x184x8_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[92];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %94, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n184k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91},"
-      " %92,"
-      " %93,"
-      " p,    %95,  %96;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x184x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x184x8_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[92];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %97, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n184k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91},"
-      "{%92,  %93,  %94,  %95},"
-      " %96,"
-      " p,    %98,  %99;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x200x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x200x8_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[100];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %102, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n200k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99},"
-      " %100,"
-      " %101,"
-      " p,    %103, %104;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x200x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x200x8_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[100];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %105, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n200k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99},"
-      "{%100, %101, %102, %103},"
-      " %104,"
-      " p,    %106, %107;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x208x8_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %106, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n208k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      " %104,"
-      " %105,"
-      " p,    %107, %108;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x208x8_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %109, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n208k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      "{%104, %105, %106, %107},"
-      " %108,"
-      " p,    %110, %111;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x216x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x216x8_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[108];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %110, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n216k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107},"
-      " %108,"
-      " %109,"
-      " p,    %111, %112;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x216x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x216x8_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[108];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %113, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n216k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107},"
-      "{%108, %109, %110, %111},"
-      " %112,"
-      " p,    %114, %115;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x224x8_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %114, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n224k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      " %112,"
-      " %113,"
-      " p,    %115, %116;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x224x8_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %117, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n224k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      "{%112, %113, %114, %115},"
-      " %116,"
-      " p,    %118, %119;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x232x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x232x8_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[116];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %118, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n232k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115},"
-      " %116,"
-      " %117,"
-      " p,    %119, %120;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x232x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x232x8_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[116];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %121, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n232k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115},"
-      "{%116, %117, %118, %119},"
-      " %120,"
-      " p,    %122, %123;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x240x8_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %122, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n240k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      " %120,"
-      " %121,"
-      " p,    %123, %124;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x240x8_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %125, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n240k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      "{%120, %121, %122, %123},"
-      " %124,"
-      " p,    %126, %127;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x248x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x248x8_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[124];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %126, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n248k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123},"
-      " %124,"
-      " %125,"
-      " p,    %127, %128;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x8_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x248x8 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x248x8_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[124];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %129, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n248k8.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123},"
-      "{%124, %125, %126, %127},"
-      " %128,"
-      " p,    %130, %131;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x8_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x32 TN S32+=S8*S8
-struct MMA_64x24x32_S32S8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %14, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n24k32.s32.s8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      " %12,"
-      " %13,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x32 TN S32+=S8*S8
-struct MMA_64x24x32_S32S8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %14, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n24k32.s32.s8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      " %12,"
-      " %13,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x32 TN S32+=S8*S8
-struct MMA_64x48x32_S32S8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %26, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n48k32.s32.s8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      " %24,"
-      " %25,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x32 TN S32+=S8*S8
-struct MMA_64x48x32_S32S8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %26, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n48k32.s32.s8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      " %24,"
-      " %25,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x32 TN S32+=S8*S8
-struct MMA_64x80x32_S32S8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %42, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n80k32.s32.s8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      " %40,"
-      " %41,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x32 TN S32+=S8*S8
-struct MMA_64x80x32_S32S8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %42, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n80k32.s32.s8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      " %40,"
-      " %41,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x32 TN S32+=S8*S8
-struct MMA_64x112x32_S32S8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %58, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n112k32.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      " %56,"
-      " %57,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x32 TN S32+=S8*S8
-struct MMA_64x112x32_S32S8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %58, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n112k32.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      " %56,"
-      " %57,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x32 TN S32+=S8*S8
-struct MMA_64x144x32_S32S8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %74, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n144k32.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      " %72,"
-      " %73,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x32 TN S32+=S8*S8
-struct MMA_64x144x32_S32S8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %74, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n144k32.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      " %72,"
-      " %73,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x32 TN S32+=S8*S8
-struct MMA_64x160x32_S32S8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %82, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n160k32.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      " %80,"
-      " %81,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x32 TN S32+=S8*S8
-struct MMA_64x160x32_S32S8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %82, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n160k32.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      " %80,"
-      " %81,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x32 TN S32+=S8*S8
-struct MMA_64x176x32_S32S8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %90, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n176k32.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      " %88,"
-      " %89,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x32 TN S32+=S8*S8
-struct MMA_64x176x32_S32S8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %90, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n176k32.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      " %88,"
-      " %89,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x32 TN S32+=S8*S8
-struct MMA_64x208x32_S32S8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %106, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n208k32.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      " %104,"
-      " %105,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x32 TN S32+=S8*S8
-struct MMA_64x208x32_S32S8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %106, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n208k32.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      " %104,"
-      " %105,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x32 TN S32+=S8*S8
-struct MMA_64x224x32_S32S8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %114, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n224k32.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      " %112,"
-      " %113,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x32 TN S32+=S8*S8
-struct MMA_64x224x32_S32S8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %114, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n224k32.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      " %112,"
-      " %113,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x32 TN S32+=S8*S8
-struct MMA_64x240x32_S32S8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %122, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n240k32.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      " %120,"
-      " %121,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x32 TN S32+=S8*S8
-struct MMA_64x240x32_S32S8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %122, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n240k32.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      " %120,"
-      " %121,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x32 TN S32+=S8*S8
-struct MMA_64x24x32_S32S8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %17, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n24k32.s32.s8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x32 TN S32+=S8*S8
-struct MMA_64x24x32_S32S8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %17, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n24k32.s32.s8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x32 TN S32+=S8*S8
-struct MMA_64x48x32_S32S8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %29, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n48k32.s32.s8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x32 TN S32+=S8*S8
-struct MMA_64x48x32_S32S8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %29, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n48k32.s32.s8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x32 TN S32+=S8*S8
-struct MMA_64x80x32_S32S8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %45, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n80k32.s32.s8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"
-      " %44,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x32 TN S32+=S8*S8
-struct MMA_64x80x32_S32S8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %45, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n80k32.s32.s8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"
-      " %44,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x32 TN S32+=S8*S8
-struct MMA_64x112x32_S32S8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %61, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n112k32.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"
-      " %60,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x32 TN S32+=S8*S8
-struct MMA_64x112x32_S32S8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %61, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n112k32.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"
-      " %60,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x32 TN S32+=S8*S8
-struct MMA_64x144x32_S32S8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %77, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n144k32.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      "{%72,  %73,  %74,  %75},"
-      " %76,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x32 TN S32+=S8*S8
-struct MMA_64x144x32_S32S8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %77, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n144k32.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      "{%72,  %73,  %74,  %75},"
-      " %76,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x32 TN S32+=S8*S8
-struct MMA_64x160x32_S32S8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %85, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n160k32.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      "{%80,  %81,  %82,  %83},"
-      " %84,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x32 TN S32+=S8*S8
-struct MMA_64x160x32_S32S8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %85, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n160k32.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      "{%80,  %81,  %82,  %83},"
-      " %84,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x32 TN S32+=S8*S8
-struct MMA_64x176x32_S32S8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %93, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n176k32.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      "{%88,  %89,  %90,  %91},"
-      " %92,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x32 TN S32+=S8*S8
-struct MMA_64x176x32_S32S8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %93, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n176k32.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      "{%88,  %89,  %90,  %91},"
-      " %92,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x32 TN S32+=S8*S8
-struct MMA_64x208x32_S32S8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %109, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n208k32.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      "{%104, %105, %106, %107},"
-      " %108,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x32 TN S32+=S8*S8
-struct MMA_64x208x32_S32S8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %109, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n208k32.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      "{%104, %105, %106, %107},"
-      " %108,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x32 TN S32+=S8*S8
-struct MMA_64x224x32_S32S8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %117, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n224k32.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      "{%112, %113, %114, %115},"
-      " %116,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x32 TN S32+=S8*S8
-struct MMA_64x224x32_S32S8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %117, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n224k32.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      "{%112, %113, %114, %115},"
-      " %116,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x32 TN S32+=S8*S8
-struct MMA_64x240x32_S32S8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %125, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n240k32.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      "{%120, %121, %122, %123},"
-      " %124,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x32 TN S32+=S8*S8
-struct MMA_64x240x32_S32S8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %125, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n240k32.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      "{%120, %121, %122, %123},"
-      " %124,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x32 TN S32+=S8*U8
-struct MMA_64x24x32_S32S8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %14, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n24k32.s32.s8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      " %12,"
-      " %13,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x32 TN S32+=S8*U8
-struct MMA_64x24x32_S32S8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %14, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n24k32.s32.s8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      " %12,"
-      " %13,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x32 TN S32+=S8*U8
-struct MMA_64x48x32_S32S8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %26, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n48k32.s32.s8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      " %24,"
-      " %25,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x32 TN S32+=S8*U8
-struct MMA_64x48x32_S32S8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %26, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n48k32.s32.s8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      " %24,"
-      " %25,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x32 TN S32+=S8*U8
-struct MMA_64x80x32_S32S8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %42, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n80k32.s32.s8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      " %40,"
-      " %41,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x32 TN S32+=S8*U8
-struct MMA_64x80x32_S32S8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %42, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n80k32.s32.s8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      " %40,"
-      " %41,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x32 TN S32+=S8*U8
-struct MMA_64x112x32_S32S8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %58, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n112k32.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      " %56,"
-      " %57,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x32 TN S32+=S8*U8
-struct MMA_64x112x32_S32S8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %58, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n112k32.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      " %56,"
-      " %57,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x32 TN S32+=S8*U8
-struct MMA_64x144x32_S32S8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %74, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n144k32.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      " %72,"
-      " %73,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x32 TN S32+=S8*U8
-struct MMA_64x144x32_S32S8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %74, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n144k32.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      " %72,"
-      " %73,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x32 TN S32+=S8*U8
-struct MMA_64x160x32_S32S8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %82, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n160k32.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      " %80,"
-      " %81,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x32 TN S32+=S8*U8
-struct MMA_64x160x32_S32S8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %82, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n160k32.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      " %80,"
-      " %81,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x32 TN S32+=S8*U8
-struct MMA_64x176x32_S32S8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %90, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n176k32.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      " %88,"
-      " %89,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x32 TN S32+=S8*U8
-struct MMA_64x176x32_S32S8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %90, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n176k32.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      " %88,"
-      " %89,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x32 TN S32+=S8*U8
-struct MMA_64x208x32_S32S8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %106, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n208k32.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      " %104,"
-      " %105,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x32 TN S32+=S8*U8
-struct MMA_64x208x32_S32S8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %106, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n208k32.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      " %104,"
-      " %105,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x32 TN S32+=S8*U8
-struct MMA_64x224x32_S32S8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %114, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n224k32.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      " %112,"
-      " %113,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x32 TN S32+=S8*U8
-struct MMA_64x224x32_S32S8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %114, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n224k32.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      " %112,"
-      " %113,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x32 TN S32+=S8*U8
-struct MMA_64x240x32_S32S8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %122, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n240k32.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      " %120,"
-      " %121,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x32 TN S32+=S8*U8
-struct MMA_64x240x32_S32S8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %122, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n240k32.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      " %120,"
-      " %121,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x32 TN S32+=S8*U8
-struct MMA_64x24x32_S32S8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %17, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n24k32.s32.s8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x32 TN S32+=S8*U8
-struct MMA_64x24x32_S32S8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %17, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n24k32.s32.s8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x32 TN S32+=S8*U8
-struct MMA_64x48x32_S32S8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %29, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n48k32.s32.s8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x32 TN S32+=S8*U8
-struct MMA_64x48x32_S32S8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %29, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n48k32.s32.s8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x32 TN S32+=S8*U8
-struct MMA_64x80x32_S32S8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %45, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n80k32.s32.s8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"
-      " %44,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x32 TN S32+=S8*U8
-struct MMA_64x80x32_S32S8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %45, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n80k32.s32.s8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"
-      " %44,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x32 TN S32+=S8*U8
-struct MMA_64x112x32_S32S8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %61, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n112k32.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"
-      " %60,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x32 TN S32+=S8*U8
-struct MMA_64x112x32_S32S8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %61, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n112k32.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"
-      " %60,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x32 TN S32+=S8*U8
-struct MMA_64x144x32_S32S8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %77, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n144k32.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      "{%72,  %73,  %74,  %75},"
-      " %76,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x32 TN S32+=S8*U8
-struct MMA_64x144x32_S32S8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %77, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n144k32.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      "{%72,  %73,  %74,  %75},"
-      " %76,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x32 TN S32+=S8*U8
-struct MMA_64x160x32_S32S8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %85, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n160k32.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      "{%80,  %81,  %82,  %83},"
-      " %84,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x32 TN S32+=S8*U8
-struct MMA_64x160x32_S32S8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %85, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n160k32.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      "{%80,  %81,  %82,  %83},"
-      " %84,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x32 TN S32+=S8*U8
-struct MMA_64x176x32_S32S8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %93, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n176k32.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      "{%88,  %89,  %90,  %91},"
-      " %92,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x32 TN S32+=S8*U8
-struct MMA_64x176x32_S32S8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %93, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n176k32.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      "{%88,  %89,  %90,  %91},"
-      " %92,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x32 TN S32+=S8*U8
-struct MMA_64x208x32_S32S8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %109, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n208k32.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      "{%104, %105, %106, %107},"
-      " %108,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x32 TN S32+=S8*U8
-struct MMA_64x208x32_S32S8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %109, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n208k32.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      "{%104, %105, %106, %107},"
-      " %108,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x32 TN S32+=S8*U8
-struct MMA_64x224x32_S32S8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %117, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n224k32.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      "{%112, %113, %114, %115},"
-      " %116,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x32 TN S32+=S8*U8
-struct MMA_64x224x32_S32S8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %117, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n224k32.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      "{%112, %113, %114, %115},"
-      " %116,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x32 TN S32+=S8*U8
-struct MMA_64x240x32_S32S8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %125, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n240k32.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      "{%120, %121, %122, %123},"
-      " %124,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x32 TN S32+=S8*U8
-struct MMA_64x240x32_S32S8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %125, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n240k32.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      "{%120, %121, %122, %123},"
-      " %124,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x32 TN S32+=U8*S8
-struct MMA_64x24x32_S32U8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %14, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n24k32.s32.u8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      " %12,"
-      " %13,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x32 TN S32+=U8*S8
-struct MMA_64x24x32_S32U8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %14, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n24k32.s32.u8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      " %12,"
-      " %13,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x32 TN S32+=U8*S8
-struct MMA_64x48x32_S32U8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %26, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n48k32.s32.u8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      " %24,"
-      " %25,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x32 TN S32+=U8*S8
-struct MMA_64x48x32_S32U8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %26, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n48k32.s32.u8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      " %24,"
-      " %25,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x32 TN S32+=U8*S8
-struct MMA_64x80x32_S32U8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %42, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n80k32.s32.u8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      " %40,"
-      " %41,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x32 TN S32+=U8*S8
-struct MMA_64x80x32_S32U8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %42, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n80k32.s32.u8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      " %40,"
-      " %41,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x32 TN S32+=U8*S8
-struct MMA_64x112x32_S32U8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %58, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n112k32.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      " %56,"
-      " %57,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x32 TN S32+=U8*S8
-struct MMA_64x112x32_S32U8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %58, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n112k32.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      " %56,"
-      " %57,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x32 TN S32+=U8*S8
-struct MMA_64x144x32_S32U8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %74, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n144k32.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      " %72,"
-      " %73,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x32 TN S32+=U8*S8
-struct MMA_64x144x32_S32U8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %74, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n144k32.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      " %72,"
-      " %73,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x32 TN S32+=U8*S8
-struct MMA_64x160x32_S32U8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %82, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n160k32.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      " %80,"
-      " %81,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x32 TN S32+=U8*S8
-struct MMA_64x160x32_S32U8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %82, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n160k32.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      " %80,"
-      " %81,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x32 TN S32+=U8*S8
-struct MMA_64x176x32_S32U8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %90, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n176k32.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      " %88,"
-      " %89,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x32 TN S32+=U8*S8
-struct MMA_64x176x32_S32U8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %90, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n176k32.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      " %88,"
-      " %89,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x32 TN S32+=U8*S8
-struct MMA_64x208x32_S32U8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %106, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n208k32.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      " %104,"
-      " %105,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x32 TN S32+=U8*S8
-struct MMA_64x208x32_S32U8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %106, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n208k32.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      " %104,"
-      " %105,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x32 TN S32+=U8*S8
-struct MMA_64x224x32_S32U8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %114, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n224k32.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      " %112,"
-      " %113,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x32 TN S32+=U8*S8
-struct MMA_64x224x32_S32U8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %114, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n224k32.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      " %112,"
-      " %113,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x32 TN S32+=U8*S8
-struct MMA_64x240x32_S32U8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %122, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n240k32.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      " %120,"
-      " %121,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x32 TN S32+=U8*S8
-struct MMA_64x240x32_S32U8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %122, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n240k32.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      " %120,"
-      " %121,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x32 TN S32+=U8*S8
-struct MMA_64x24x32_S32U8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %17, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n24k32.s32.u8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x32 TN S32+=U8*S8
-struct MMA_64x24x32_S32U8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %17, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n24k32.s32.u8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x32 TN S32+=U8*S8
-struct MMA_64x48x32_S32U8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %29, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n48k32.s32.u8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x32 TN S32+=U8*S8
-struct MMA_64x48x32_S32U8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %29, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n48k32.s32.u8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x32 TN S32+=U8*S8
-struct MMA_64x80x32_S32U8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %45, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n80k32.s32.u8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"
-      " %44,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x32 TN S32+=U8*S8, saturating variant: the wgmma emitted below carries the .satfinite qualifier
-struct MMA_64x80x32_S32U8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;  // no separate D operand list; the d00..d39 references below are read-write
-  using ARegisters = uint32_t[4];  // a00..a03, passed to the asm as 32-bit register inputs
-  using BRegisters = uint64_t[1];  // desc_b, passed to the asm as a single 64-bit input
-  using CRegisters = uint32_t[40];  // d00..d39, the 40 accumulator registers
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,  // NOTE(review): presumably the shared-memory matrix descriptor for B - confirm
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to int32_t and tested against 0 to form predicate p
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (declared elsewhere); receives the call-site line and desc_b
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %45, 0;\n"  // operand 45 is int32_t(scale_D); p = (scale_D != 0)
-      "wgmma.mma_async.sync.aligned.m64n80k32.s32.u8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // operands 0..39: accumulators d00..d39
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"  // operands 40..43: a00..a03
-      " %44,"  // operand 44: desc_b
-      " p;\n"  // NOTE(review): presumably the PTX scale-d input (accumulate vs. overwrite) - confirm against the PTX ISA
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write: accumulators are both inputs and outputs
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x32 TN S32+=U8*S8, plain variant: the wgmma emitted below has no .satfinite qualifier
-struct MMA_64x112x32_S32U8S8_RS_TN
-{
-  using DRegisters = void;  // no separate D operand list; the d00..d55 references below are read-write
-  using ARegisters = uint32_t[4];  // a00..a03, passed to the asm as 32-bit register inputs
-  using BRegisters = uint64_t[1];  // desc_b, passed to the asm as a single 64-bit input
-  using CRegisters = uint32_t[56];  // d00..d55, the 56 accumulator registers
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,  // NOTE(review): presumably the shared-memory matrix descriptor for B - confirm
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to int32_t and tested against 0 to form predicate p
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (declared elsewhere); receives the call-site line and desc_b
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %61, 0;\n"  // operand 61 is int32_t(scale_D); p = (scale_D != 0)
-      "wgmma.mma_async.sync.aligned.m64n112k32.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // operands 0..55: accumulators d00..d55
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"  // operands 56..59: a00..a03
-      " %60,"  // operand 60: desc_b
-      " p;\n"  // NOTE(review): presumably the PTX scale-d input (accumulate vs. overwrite) - confirm against the PTX ISA
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write: accumulators are both inputs and outputs
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x32 TN S32+=U8*S8, saturating variant: the wgmma emitted below carries the .satfinite qualifier
-struct MMA_64x112x32_S32U8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;  // no separate D operand list; the d00..d55 references below are read-write
-  using ARegisters = uint32_t[4];  // a00..a03, passed to the asm as 32-bit register inputs
-  using BRegisters = uint64_t[1];  // desc_b, passed to the asm as a single 64-bit input
-  using CRegisters = uint32_t[56];  // d00..d55, the 56 accumulator registers
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,  // NOTE(review): presumably the shared-memory matrix descriptor for B - confirm
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to int32_t and tested against 0 to form predicate p
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (declared elsewhere); receives the call-site line and desc_b
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %61, 0;\n"  // operand 61 is int32_t(scale_D); p = (scale_D != 0)
-      "wgmma.mma_async.sync.aligned.m64n112k32.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // operands 0..55: accumulators d00..d55
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"  // operands 56..59: a00..a03
-      " %60,"  // operand 60: desc_b
-      " p;\n"  // NOTE(review): presumably the PTX scale-d input (accumulate vs. overwrite) - confirm against the PTX ISA
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write: accumulators are both inputs and outputs
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x32 TN S32+=U8*S8, plain variant: the wgmma emitted below has no .satfinite qualifier
-struct MMA_64x144x32_S32U8S8_RS_TN
-{
-  using DRegisters = void;  // no separate D operand list; the d00..d71 references below are read-write
-  using ARegisters = uint32_t[4];  // a00..a03, passed to the asm as 32-bit register inputs
-  using BRegisters = uint64_t[1];  // desc_b, passed to the asm as a single 64-bit input
-  using CRegisters = uint32_t[72];  // d00..d71, the 72 accumulator registers
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,  // NOTE(review): presumably the shared-memory matrix descriptor for B - confirm
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to int32_t and tested against 0 to form predicate p
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (declared elsewhere); receives the call-site line and desc_b
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %77, 0;\n"  // operand 77 is int32_t(scale_D); p = (scale_D != 0)
-      "wgmma.mma_async.sync.aligned.m64n144k32.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // operands 0..71: accumulators d00..d71
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      "{%72,  %73,  %74,  %75},"  // operands 72..75: a00..a03
-      " %76,"  // operand 76: desc_b
-      " p;\n"  // NOTE(review): presumably the PTX scale-d input (accumulate vs. overwrite) - confirm against the PTX ISA
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write: accumulators are both inputs and outputs
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x32 TN S32+=U8*S8, saturating variant: the wgmma emitted below carries the .satfinite qualifier
-struct MMA_64x144x32_S32U8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;  // no separate D operand list; the d00..d71 references below are read-write
-  using ARegisters = uint32_t[4];  // a00..a03, passed to the asm as 32-bit register inputs
-  using BRegisters = uint64_t[1];  // desc_b, passed to the asm as a single 64-bit input
-  using CRegisters = uint32_t[72];  // d00..d71, the 72 accumulator registers
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,  // NOTE(review): presumably the shared-memory matrix descriptor for B - confirm
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to int32_t and tested against 0 to form predicate p
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (declared elsewhere); receives the call-site line and desc_b
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %77, 0;\n"  // operand 77 is int32_t(scale_D); p = (scale_D != 0)
-      "wgmma.mma_async.sync.aligned.m64n144k32.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // operands 0..71: accumulators d00..d71
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      "{%72,  %73,  %74,  %75},"  // operands 72..75: a00..a03
-      " %76,"  // operand 76: desc_b
-      " p;\n"  // NOTE(review): presumably the PTX scale-d input (accumulate vs. overwrite) - confirm against the PTX ISA
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write: accumulators are both inputs and outputs
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x32 TN S32+=U8*S8, plain variant: the wgmma emitted below has no .satfinite qualifier
-struct MMA_64x160x32_S32U8S8_RS_TN
-{
-  using DRegisters = void;  // no separate D operand list; the d00..d79 references below are read-write
-  using ARegisters = uint32_t[4];  // a00..a03, passed to the asm as 32-bit register inputs
-  using BRegisters = uint64_t[1];  // desc_b, passed to the asm as a single 64-bit input
-  using CRegisters = uint32_t[80];  // d00..d79, the 80 accumulator registers
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,  // NOTE(review): presumably the shared-memory matrix descriptor for B - confirm
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to int32_t and tested against 0 to form predicate p
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (declared elsewhere); receives the call-site line and desc_b
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %85, 0;\n"  // operand 85 is int32_t(scale_D); p = (scale_D != 0)
-      "wgmma.mma_async.sync.aligned.m64n160k32.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // operands 0..79: accumulators d00..d79
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      "{%80,  %81,  %82,  %83},"  // operands 80..83: a00..a03
-      " %84,"  // operand 84: desc_b
-      " p;\n"  // NOTE(review): presumably the PTX scale-d input (accumulate vs. overwrite) - confirm against the PTX ISA
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write: accumulators are both inputs and outputs
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x32 TN S32+=U8*S8, saturating variant: the wgmma emitted below carries the .satfinite qualifier
-struct MMA_64x160x32_S32U8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;  // no separate D operand list; the d00..d79 references below are read-write
-  using ARegisters = uint32_t[4];  // a00..a03, passed to the asm as 32-bit register inputs
-  using BRegisters = uint64_t[1];  // desc_b, passed to the asm as a single 64-bit input
-  using CRegisters = uint32_t[80];  // d00..d79, the 80 accumulator registers
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,  // NOTE(review): presumably the shared-memory matrix descriptor for B - confirm
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to int32_t and tested against 0 to form predicate p
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (declared elsewhere); receives the call-site line and desc_b
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %85, 0;\n"  // operand 85 is int32_t(scale_D); p = (scale_D != 0)
-      "wgmma.mma_async.sync.aligned.m64n160k32.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // operands 0..79: accumulators d00..d79
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      "{%80,  %81,  %82,  %83},"  // operands 80..83: a00..a03
-      " %84,"  // operand 84: desc_b
-      " p;\n"  // NOTE(review): presumably the PTX scale-d input (accumulate vs. overwrite) - confirm against the PTX ISA
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write: accumulators are both inputs and outputs
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x32 TN S32+=U8*S8, plain variant: the wgmma emitted below has no .satfinite qualifier
-struct MMA_64x176x32_S32U8S8_RS_TN
-{
-  using DRegisters = void;  // no separate D operand list; the d00..d87 references below are read-write
-  using ARegisters = uint32_t[4];  // a00..a03, passed to the asm as 32-bit register inputs
-  using BRegisters = uint64_t[1];  // desc_b, passed to the asm as a single 64-bit input
-  using CRegisters = uint32_t[88];  // d00..d87, the 88 accumulator registers
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,  // NOTE(review): presumably the shared-memory matrix descriptor for B - confirm
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to int32_t and tested against 0 to form predicate p
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook (declared elsewhere); receives the call-site line and desc_b
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %93, 0;\n"  // operand 93 is int32_t(scale_D); p = (scale_D != 0)
-      "wgmma.mma_async.sync.aligned.m64n176k32.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // operands 0..87: accumulators d00..d87
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      "{%88,  %89,  %90,  %91},"  // operands 88..91: a00..a03
-      " %92,"  // operand 92: desc_b
-      " p;\n"  // NOTE(review): presumably the PTX scale-d input (accumulate vs. overwrite) - confirm against the PTX ISA
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // read-write: accumulators are both inputs and outputs
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // taken when CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x32 TN S32+=U8*S8, saturating variant: wraps the PTX wgmma m64n176k32.s32.u8.s8.satfinite instruction (A in four 32-bit registers, B via the 64-bit desc_b operand).
-struct MMA_64x176x32_S32U8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;  // NOTE(review): presumably means D shares storage with C (accumulate in place) - confirm against the MMA traits
-  using ARegisters = uint32_t[4];  // matches the four a00..a03 arguments of fma()
-  using BRegisters = uint64_t[1];  // matches the single 64-bit desc_b argument of fma()
-  using CRegisters = uint32_t[88];  // matches the 88 accumulator arguments d00..d87 of fma()
-
-  CUTE_HOST_DEVICE static void  // fma(): a*/desc_b are read-only inputs, d* are read-write accumulators, scale_D defaults to ScaleOut::One
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably a debug sync-log hook (defined elsewhere) - confirm
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"  // predicate register p, declared inside this PTX brace scope
-      "setp.ne.b32 p, %93, 0;\n"  // p = (scale_D != 0); %93 is the last input operand
-      "wgmma.mma_async.sync.aligned.m64n176k32.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      "{%88,  %89,  %90,  %91},"  // A operand: a00..a03
-      " %92,"  // B operand: desc_b
-      " p;\n"  // predicate derived from scale_D; NOTE(review): presumably selects accumulate vs. overwrite of D - confirm in the PTX ISA
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // %0..%87: accumulators, read-modify-write ("+r")
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %88..%91: 32-bit inputs
-         "l"(desc_b),  // %92: 64-bit input
-         "r"(int32_t(scale_D)));  // %93: scale_D converted to a 32-bit integer for the setp above
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // reached when the SM90A guard macro is not defined
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x32 TN S32+=U8*S8: wraps the PTX wgmma m64n208k32.s32.u8.s8 instruction (A in four 32-bit registers, B via the 64-bit desc_b operand).
-struct MMA_64x208x32_S32U8S8_RS_TN
-{
-  using DRegisters = void;  // NOTE(review): presumably means D shares storage with C (accumulate in place) - confirm against the MMA traits
-  using ARegisters = uint32_t[4];  // matches the four a000..a003 arguments of fma()
-  using BRegisters = uint64_t[1];  // matches the single 64-bit desc_b argument of fma()
-  using CRegisters = uint32_t[104];  // matches the 104 accumulator arguments d000..d103 of fma()
-
-  CUTE_HOST_DEVICE static void  // fma(): a*/desc_b are read-only inputs, d* are read-write accumulators, scale_D defaults to ScaleOut::One
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably a debug sync-log hook (defined elsewhere) - confirm
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"  // predicate register p, declared inside this PTX brace scope
-      "setp.ne.b32 p, %109, 0;\n"  // p = (scale_D != 0); %109 is the last input operand
-      "wgmma.mma_async.sync.aligned.m64n208k32.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      "{%104, %105, %106, %107},"  // A operand: a000..a003
-      " %108,"  // B operand: desc_b
-      " p;\n"  // predicate derived from scale_D; NOTE(review): presumably selects accumulate vs. overwrite of D - confirm in the PTX ISA
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // %0..%103: accumulators, read-modify-write ("+r")
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %104..%107: 32-bit inputs
-         "l"(desc_b),  // %108: 64-bit input
-         "r"(int32_t(scale_D)));  // %109: scale_D converted to a 32-bit integer for the setp above
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // reached when the SM90A guard macro is not defined
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x32 TN S32+=U8*S8, saturating variant: wraps the PTX wgmma m64n208k32.s32.u8.s8.satfinite instruction (A in four 32-bit registers, B via the 64-bit desc_b operand).
-struct MMA_64x208x32_S32U8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;  // NOTE(review): presumably means D shares storage with C (accumulate in place) - confirm against the MMA traits
-  using ARegisters = uint32_t[4];  // matches the four a000..a003 arguments of fma()
-  using BRegisters = uint64_t[1];  // matches the single 64-bit desc_b argument of fma()
-  using CRegisters = uint32_t[104];  // matches the 104 accumulator arguments d000..d103 of fma()
-
-  CUTE_HOST_DEVICE static void  // fma(): a*/desc_b are read-only inputs, d* are read-write accumulators, scale_D defaults to ScaleOut::One
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably a debug sync-log hook (defined elsewhere) - confirm
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"  // predicate register p, declared inside this PTX brace scope
-      "setp.ne.b32 p, %109, 0;\n"  // p = (scale_D != 0); %109 is the last input operand
-      "wgmma.mma_async.sync.aligned.m64n208k32.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      "{%104, %105, %106, %107},"  // A operand: a000..a003
-      " %108,"  // B operand: desc_b
-      " p;\n"  // predicate derived from scale_D; NOTE(review): presumably selects accumulate vs. overwrite of D - confirm in the PTX ISA
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // %0..%103: accumulators, read-modify-write ("+r")
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %104..%107: 32-bit inputs
-         "l"(desc_b),  // %108: 64-bit input
-         "r"(int32_t(scale_D)));  // %109: scale_D converted to a 32-bit integer for the setp above
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // reached when the SM90A guard macro is not defined
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x32 TN S32+=U8*S8: wraps the PTX wgmma m64n224k32.s32.u8.s8 instruction (A in four 32-bit registers, B via the 64-bit desc_b operand).
-struct MMA_64x224x32_S32U8S8_RS_TN
-{
-  using DRegisters = void;  // NOTE(review): presumably means D shares storage with C (accumulate in place) - confirm against the MMA traits
-  using ARegisters = uint32_t[4];  // matches the four a000..a003 arguments of fma()
-  using BRegisters = uint64_t[1];  // matches the single 64-bit desc_b argument of fma()
-  using CRegisters = uint32_t[112];  // matches the 112 accumulator arguments d000..d111 of fma()
-
-  CUTE_HOST_DEVICE static void  // fma(): a*/desc_b are read-only inputs, d* are read-write accumulators, scale_D defaults to ScaleOut::One
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably a debug sync-log hook (defined elsewhere) - confirm
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"  // predicate register p, declared inside this PTX brace scope
-      "setp.ne.b32 p, %117, 0;\n"  // p = (scale_D != 0); %117 is the last input operand
-      "wgmma.mma_async.sync.aligned.m64n224k32.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      "{%112, %113, %114, %115},"  // A operand: a000..a003
-      " %116,"  // B operand: desc_b
-      " p;\n"  // predicate derived from scale_D; NOTE(review): presumably selects accumulate vs. overwrite of D - confirm in the PTX ISA
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // %0..%111: accumulators, read-modify-write ("+r")
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %112..%115: 32-bit inputs
-         "l"(desc_b),  // %116: 64-bit input
-         "r"(int32_t(scale_D)));  // %117: scale_D converted to a 32-bit integer for the setp above
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");  // reached when the SM90A guard macro is not defined
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x32 TN S32+=U8*S8, saturating variant: wraps the PTX wgmma m64n224k32.s32.u8.s8.satfinite instruction (A in four 32-bit registers, B via the 64-bit desc_b operand).
-struct MMA_64x224x32_S32U8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;  // NOTE(review): presumably means D shares storage with C (accumulate in place) - confirm against the MMA traits
-  using ARegisters = uint32_t[4];  // matches the four a000..a003 arguments of fma()
-  using BRegisters = uint64_t[1];  // matches the single 64-bit desc_b argument of fma()
-  using CRegisters = uint32_t[112];  // matches the 112 accumulator arguments d000..d111 of fma()
-
-  CUTE_HOST_DEVICE static void  // fma(): a*/desc_b are read-only inputs, d* are read-write accumulators, scale_D defaults to ScaleOut::One
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // NOTE(review): presumably a debug sync-log hook (defined elsewhere) - confirm
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"  // predicate register p, declared inside this PTX brace scope
-      "setp.ne.b32 p, %117, 0;\n"  // p = (scale_D != 0); %117 is the last input operand
-      "wgmma.mma_async.sync.aligned.m64n224k32.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      "{%112, %113, %114, %115},"  // A operand: a000..a003
-      " %116,"  // B operand: desc_b
-      " p;\n"  // predicate derived from scale_D; NOTE(review): presumably selects accumulate vs. overwrite of D - confirm in the PTX ISA
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),  // %0..%111: accumulators, read-modify-write ("+r")
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %112..%115: 32-bit inputs
-         "l"(desc_b),  // %116: 64-bit input
-         "r"(int32_t(scale_D)));  // %117: scale_D converted to a 32-bit integer for the setp above
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");  // reached when the SM90A guard macro is not defined
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x32 TN S32+=U8*S8 -- wraps PTX wgmma.mma_async m64n240k32.s32.u8.s8; A is passed as four 32-bit register operands, B as one 64-bit descriptor
-struct MMA_64x240x32_S32U8S8_RS_TN
-{
-  using DRegisters = void;           // no separate D operand list: the d* accumulators are read-write ("+r") operands
-  using ARegisters = uint32_t[4];    // corresponds to a000..a003 of fma()
-  using BRegisters = uint64_t[1];    // corresponds to desc_b of fma()
-  using CRegisters = uint32_t[120];  // corresponds to d000..d119 of fma()
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t below and compared against 0 to form predicate p
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook defined elsewhere; presumably debug logging only -- TODO confirm
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %125, 0;\n"  // %125 = int32_t(scale_D); p is true when it is non-zero
-      "wgmma.mma_async.sync.aligned.m64n240k32.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"  // %0..%119 = d000..d119 (read-write accumulators)
-      "{%120, %121, %122, %123},"  // a000..a003
-      " %124,"  // desc_b
-      " p;\n"  // predicate computed from scale_D above
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: the call is reported through CUTE_INVALID_CONTROL_PATH
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x32 TN S32+=U8*S8 -- wraps PTX wgmma.mma_async m64n240k32.s32.u8.s8 with the .satfinite qualifier; A is passed as four 32-bit register operands, B as one 64-bit descriptor
-struct MMA_64x240x32_S32U8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;           // no separate D operand list: the d* accumulators are read-write ("+r") operands
-  using ARegisters = uint32_t[4];    // corresponds to a000..a003 of fma()
-  using BRegisters = uint64_t[1];    // corresponds to desc_b of fma()
-  using CRegisters = uint32_t[120];  // corresponds to d000..d119 of fma()
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t below and compared against 0 to form predicate p
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook defined elsewhere; presumably debug logging only -- TODO confirm
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %125, 0;\n"  // %125 = int32_t(scale_D); p is true when it is non-zero
-      "wgmma.mma_async.sync.aligned.m64n240k32.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"  // %0..%119 = d000..d119 (read-write accumulators)
-      "{%120, %121, %122, %123},"  // a000..a003
-      " %124,"  // desc_b
-      " p;\n"  // predicate computed from scale_D above
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: the call is reported through CUTE_INVALID_CONTROL_PATH
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x32 TN S32+=U8*U8 -- wraps PTX wgmma.mma_async m64n24k32.s32.u8.u8; A and B are each passed as one 64-bit descriptor
-struct MMA_64x24x32_S32U8U8_SS_TN
-{
-  using DRegisters = void;          // no separate D operand list: the d* accumulators are read-write ("+r") operands
-  using ARegisters = uint64_t[1];   // corresponds to desc_a of fma()
-  using BRegisters = uint64_t[1];   // corresponds to desc_b of fma()
-  using CRegisters = uint32_t[12];  // corresponds to d00..d11 of fma()
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t below and compared against 0 to form predicate p
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook defined elsewhere; presumably debug logging only -- TODO confirm
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %14, 0;\n"  // %14 = int32_t(scale_D); p is true when it is non-zero
-      "wgmma.mma_async.sync.aligned.m64n24k32.s32.u8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"  // %0..%11 = d00..d11 (read-write accumulators)
-      " %12,"  // desc_a
-      " %13,"  // desc_b
-      " p;\n"  // predicate computed from scale_D above
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: the call is reported through CUTE_INVALID_CONTROL_PATH
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x32 TN S32+=U8*U8 -- wraps PTX wgmma.mma_async m64n24k32.s32.u8.u8 with the .satfinite qualifier; A and B are each passed as one 64-bit descriptor
-struct MMA_64x24x32_S32U8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;          // no separate D operand list: the d* accumulators are read-write ("+r") operands
-  using ARegisters = uint64_t[1];   // corresponds to desc_a of fma()
-  using BRegisters = uint64_t[1];   // corresponds to desc_b of fma()
-  using CRegisters = uint32_t[12];  // corresponds to d00..d11 of fma()
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t below and compared against 0 to form predicate p
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook defined elsewhere; presumably debug logging only -- TODO confirm
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %14, 0;\n"  // %14 = int32_t(scale_D); p is true when it is non-zero
-      "wgmma.mma_async.sync.aligned.m64n24k32.s32.u8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"  // %0..%11 = d00..d11 (read-write accumulators)
-      " %12,"  // desc_a
-      " %13,"  // desc_b
-      " p;\n"  // predicate computed from scale_D above
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: the call is reported through CUTE_INVALID_CONTROL_PATH
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x32 TN S32+=U8*U8 -- wraps PTX wgmma.mma_async m64n48k32.s32.u8.u8; A and B are each passed as one 64-bit descriptor
-struct MMA_64x48x32_S32U8U8_SS_TN
-{
-  using DRegisters = void;          // no separate D operand list: the d* accumulators are read-write ("+r") operands
-  using ARegisters = uint64_t[1];   // corresponds to desc_a of fma()
-  using BRegisters = uint64_t[1];   // corresponds to desc_b of fma()
-  using CRegisters = uint32_t[24];  // corresponds to d00..d23 of fma()
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t below and compared against 0 to form predicate p
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook defined elsewhere; presumably debug logging only -- TODO confirm
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %26, 0;\n"  // %26 = int32_t(scale_D); p is true when it is non-zero
-      "wgmma.mma_async.sync.aligned.m64n48k32.s32.u8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"  // %0..%23 = d00..d23 (read-write accumulators)
-      " %24,"  // desc_a
-      " %25,"  // desc_b
-      " p;\n"  // predicate computed from scale_D above
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: the call is reported through CUTE_INVALID_CONTROL_PATH
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x32 TN S32+=U8*U8 -- wraps PTX wgmma.mma_async m64n48k32.s32.u8.u8 with the .satfinite qualifier; A and B are each passed as one 64-bit descriptor
-struct MMA_64x48x32_S32U8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;          // no separate D operand list: the d* accumulators are read-write ("+r") operands
-  using ARegisters = uint64_t[1];   // corresponds to desc_a of fma()
-  using BRegisters = uint64_t[1];   // corresponds to desc_b of fma()
-  using CRegisters = uint32_t[24];  // corresponds to d00..d23 of fma()
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t below and compared against 0 to form predicate p
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook defined elsewhere; presumably debug logging only -- TODO confirm
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %26, 0;\n"  // %26 = int32_t(scale_D); p is true when it is non-zero
-      "wgmma.mma_async.sync.aligned.m64n48k32.s32.u8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"  // %0..%23 = d00..d23 (read-write accumulators)
-      " %24,"  // desc_a
-      " %25,"  // desc_b
-      " p;\n"  // predicate computed from scale_D above
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: the call is reported through CUTE_INVALID_CONTROL_PATH
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x32 TN S32+=U8*U8 -- wraps PTX wgmma.mma_async m64n80k32.s32.u8.u8; A and B are each passed as one 64-bit descriptor
-struct MMA_64x80x32_S32U8U8_SS_TN
-{
-  using DRegisters = void;          // no separate D operand list: the d* accumulators are read-write ("+r") operands
-  using ARegisters = uint64_t[1];   // corresponds to desc_a of fma()
-  using BRegisters = uint64_t[1];   // corresponds to desc_b of fma()
-  using CRegisters = uint32_t[40];  // corresponds to d00..d39 of fma()
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t below and compared against 0 to form predicate p
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook defined elsewhere; presumably debug logging only -- TODO confirm
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %42, 0;\n"  // %42 = int32_t(scale_D); p is true when it is non-zero
-      "wgmma.mma_async.sync.aligned.m64n80k32.s32.u8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"  // %0..%39 = d00..d39 (read-write accumulators)
-      " %40,"  // desc_a
-      " %41,"  // desc_b
-      " p;\n"  // predicate computed from scale_D above
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: the call is reported through CUTE_INVALID_CONTROL_PATH
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x32 TN S32+=U8*U8 -- wraps PTX wgmma.mma_async m64n80k32.s32.u8.u8 with the .satfinite qualifier; A and B are each passed as one 64-bit descriptor
-struct MMA_64x80x32_S32U8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;          // no separate D operand list: the d* accumulators are read-write ("+r") operands
-  using ARegisters = uint64_t[1];   // corresponds to desc_a of fma()
-  using BRegisters = uint64_t[1];   // corresponds to desc_b of fma()
-  using CRegisters = uint32_t[40];  // corresponds to d00..d39 of fma()
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t below and compared against 0 to form predicate p
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook defined elsewhere; presumably debug logging only -- TODO confirm
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %42, 0;\n"  // %42 = int32_t(scale_D); p is true when it is non-zero
-      "wgmma.mma_async.sync.aligned.m64n80k32.s32.u8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"  // %0..%39 = d00..d39 (read-write accumulators)
-      " %40,"  // desc_a
-      " %41,"  // desc_b
-      " p;\n"  // predicate computed from scale_D above
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: the call is reported through CUTE_INVALID_CONTROL_PATH
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x32 TN S32+=U8*U8 -- wraps PTX wgmma.mma_async m64n112k32.s32.u8.u8; A and B are each passed as one 64-bit descriptor
-struct MMA_64x112x32_S32U8U8_SS_TN
-{
-  using DRegisters = void;          // no separate D operand list: the d* accumulators are read-write ("+r") operands
-  using ARegisters = uint64_t[1];   // corresponds to desc_a of fma()
-  using BRegisters = uint64_t[1];   // corresponds to desc_b of fma()
-  using CRegisters = uint32_t[56];  // corresponds to d00..d55 of fma()
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t below and compared against 0 to form predicate p
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook defined elsewhere; presumably debug logging only -- TODO confirm
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %58, 0;\n"  // %58 = int32_t(scale_D); p is true when it is non-zero
-      "wgmma.mma_async.sync.aligned.m64n112k32.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"  // %0..%55 = d00..d55 (read-write accumulators)
-      " %56,"  // desc_a
-      " %57,"  // desc_b
-      " p;\n"  // predicate computed from scale_D above
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: the call is reported through CUTE_INVALID_CONTROL_PATH
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x32 TN S32+=U8*U8 -- wraps PTX wgmma.mma_async m64n112k32.s32.u8.u8 with the .satfinite qualifier; A and B are each passed as one 64-bit descriptor
-struct MMA_64x112x32_S32U8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;          // no separate D operand list: the d* accumulators are read-write ("+r") operands
-  using ARegisters = uint64_t[1];   // corresponds to desc_a of fma()
-  using BRegisters = uint64_t[1];   // corresponds to desc_b of fma()
-  using CRegisters = uint32_t[56];  // corresponds to d00..d55 of fma()
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // cast to int32_t below and compared against 0 to form predicate p
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook defined elsewhere; presumably debug logging only -- TODO confirm
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %58, 0;\n"  // %58 = int32_t(scale_D); p is true when it is non-zero
-      "wgmma.mma_async.sync.aligned.m64n112k32.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"  // %0..%55 = d00..d55 (read-write accumulators)
-      " %56,"  // desc_a
-      " %57,"  // desc_b
-      " p;\n"  // predicate computed from scale_D above
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else  // CUTE_ARCH_MMA_SM90A_ENABLED not defined: the call is reported through CUTE_INVALID_CONTROL_PATH
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x32 TN S32+=U8*U8
-struct MMA_64x144x32_S32U8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %74, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n144k32.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      " %72,"
-      " %73,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x32 TN S32+=U8*U8
-struct MMA_64x144x32_S32U8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %74, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n144k32.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      " %72,"
-      " %73,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x32 TN S32+=U8*U8
-struct MMA_64x160x32_S32U8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %82, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n160k32.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      " %80,"
-      " %81,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x32 TN S32+=U8*U8
-struct MMA_64x160x32_S32U8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %82, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n160k32.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      " %80,"
-      " %81,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x32 TN S32+=U8*U8
-struct MMA_64x176x32_S32U8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %90, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n176k32.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      " %88,"
-      " %89,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x32 TN S32+=U8*U8
-struct MMA_64x176x32_S32U8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %90, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n176k32.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      " %88,"
-      " %89,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x32 TN S32+=U8*U8
-struct MMA_64x208x32_S32U8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %106, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n208k32.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      " %104,"
-      " %105,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x32 TN S32+=U8*U8
-struct MMA_64x208x32_S32U8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %106, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n208k32.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      " %104,"
-      " %105,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x32 TN S32+=U8*U8
-struct MMA_64x224x32_S32U8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %114, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n224k32.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      " %112,"
-      " %113,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x32 TN S32+=U8*U8
-struct MMA_64x224x32_S32U8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %114, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n224k32.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      " %112,"
-      " %113,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x32 TN S32+=U8*U8
-struct MMA_64x240x32_S32U8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %122, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n240k32.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      " %120,"
-      " %121,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x32 TN S32+=U8*U8
-struct MMA_64x240x32_S32U8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %122, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n240k32.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      " %120,"
-      " %121,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x32 TN S32+=U8*U8
-struct MMA_64x24x32_S32U8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %17, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n24k32.s32.u8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x32 TN S32+=U8*U8
-struct MMA_64x24x32_S32U8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %17, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n24k32.s32.u8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x32 TN S32+=U8*U8
-struct MMA_64x48x32_S32U8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %29, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n48k32.s32.u8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x32 TN S32+=U8*U8
-struct MMA_64x48x32_S32U8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %29, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n48k32.s32.u8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x32 TN S32+=U8*U8
-struct MMA_64x80x32_S32U8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %45, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n80k32.s32.u8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"
-      " %44,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x32 TN S32+=U8*U8
-struct MMA_64x80x32_S32U8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %45, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n80k32.s32.u8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"
-      " %44,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x32 TN S32+=U8*U8
-struct MMA_64x112x32_S32U8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %61, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n112k32.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"
-      " %60,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x32 TN S32+=U8*U8
-struct MMA_64x112x32_S32U8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %61, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n112k32.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"
-      " %60,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x32 TN S32+=U8*U8
-struct MMA_64x144x32_S32U8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %77, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n144k32.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      "{%72,  %73,  %74,  %75},"
-      " %76,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x32 TN S32+=U8*U8
-struct MMA_64x144x32_S32U8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %77, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n144k32.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      "{%72,  %73,  %74,  %75},"
-      " %76,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x32 TN S32+=U8*U8
-struct MMA_64x160x32_S32U8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %85, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n160k32.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      "{%80,  %81,  %82,  %83},"
-      " %84,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x32 TN S32+=U8*U8
-struct MMA_64x160x32_S32U8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %85, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n160k32.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      "{%80,  %81,  %82,  %83},"
-      " %84,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x32 TN S32+=U8*U8
-struct MMA_64x176x32_S32U8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %93, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n176k32.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      "{%88,  %89,  %90,  %91},"
-      " %92,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x32 TN S32+=U8*U8
-struct MMA_64x176x32_S32U8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %93, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n176k32.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      "{%88,  %89,  %90,  %91},"
-      " %92,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x32 TN S32+=U8*U8
-struct MMA_64x208x32_S32U8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %109, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n208k32.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      "{%104, %105, %106, %107},"
-      " %108,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x32 TN S32+=U8*U8
-struct MMA_64x208x32_S32U8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %109, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n208k32.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      "{%104, %105, %106, %107},"
-      " %108,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x32 TN S32+=U8*U8
-struct MMA_64x224x32_S32U8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %117, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n224k32.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      "{%112, %113, %114, %115},"
-      " %116,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x32 TN S32+=U8*U8
-struct MMA_64x224x32_S32U8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %117, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n224k32.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      "{%112, %113, %114, %115},"
-      " %116,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x32 TN S32+=U8*U8
-struct MMA_64x240x32_S32U8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %125, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n240k32.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      "{%120, %121, %122, %123},"
-      " %124,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x32 TN S32+=U8*U8
-struct MMA_64x240x32_S32U8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %125, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n240k32.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      "{%120, %121, %122, %123},"
-      " %124,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x24x32_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[6];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %8, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n24k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5},"
-      " %6,"
-      " %7,"
-      " p,   %9,  %10;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x24x32_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[6];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %11, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n24k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5},"
-      "{%6,  %7,  %8,  %9},"
-      " %10,"
-      " p,   %12, %13;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x24x32_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %14, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n24k32.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      " %12,"
-      " %13,"
-      " p,   %15, %16;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x24x32_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %17, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n24k32.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " p,   %18, %19;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x40x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x40x32_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[10];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %12, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n40k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9},"
-      " %10,"
-      " %11,"
-      " p,   %13, %14;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x40x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x40x32_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[10];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %15, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n40k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9},"
-      "{%10, %11, %12, %13},"
-      " %14,"
-      " p,   %16, %17;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x40x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x40x32_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %22, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n40k32.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      " %20,"
-      " %21,"
-      " p,   %23, %24;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x40x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x40x32_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %25, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n40k32.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      "{%20, %21, %22, %23},"
-      " %24,"
-      " p,   %26, %27;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x48x32_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %14, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n48k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      " %12,"
-      " %13,"
-      " p,   %15, %16;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x48x32_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %17, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n48k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " p,   %18, %19;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x48x32_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %26, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n48k32.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      " %24,"
-      " %25,"
-      " p,   %27, %28;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x48x32_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %29, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n48k32.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " p,   %30, %31;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x56x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x56x32_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[14];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %16, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n56k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13},"
-      " %14,"
-      " %15,"
-      " p,   %17, %18;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x56x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x56x32_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[14];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %19, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n56k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13},"
-      "{%14, %15, %16, %17},"
-      " %18,"
-      " p,   %20, %21;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x56x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x56x32_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %30, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n56k32.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      " %28,"
-      " %29,"
-      " p,   %31, %32;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x56x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x56x32_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %33, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n56k32.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      "{%28, %29, %30, %31},"
-      " %32,"
-      " p,   %34, %35;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x72x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x72x32_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[18];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %20, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n72k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17},"
-      " %18,"
-      " %19,"
-      " p,   %21, %22;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x72x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x72x32_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[18];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %23, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n72k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17},"
-      "{%18, %19, %20, %21},"
-      " %22,"
-      " p,   %24, %25;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x72x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x72x32_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[36];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %38, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n72k32.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      " %36,"
-      " %37,"
-      " p,   %39, %40;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x72x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x72x32_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[36];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %41, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n72k32.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      "{%36, %37, %38, %39},"
-      " %40,"
-      " p,   %42, %43;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x80x32_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %22, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n80k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      " %20,"
-      " %21,"
-      " p,   %23, %24;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x80x32_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %25, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n80k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      "{%20, %21, %22, %23},"
-      " %24,"
-      " p,   %26, %27;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x80x32_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %42, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n80k32.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      " %40,"
-      " %41,"
-      " p,   %43, %44;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x80x32_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %45, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n80k32.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"
-      " %44,"
-      " p,   %46, %47;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x88x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x88x32_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[22];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %24, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n88k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21},"
-      " %22,"
-      " %23,"
-      " p,   %25, %26;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x88x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x88x32_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[22];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %27, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n88k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21},"
-      "{%22, %23, %24, %25},"
-      " %26,"
-      " p,   %28, %29;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x88x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x88x32_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %46, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n88k32.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      " %44,"
-      " %45,"
-      " p,   %47, %48;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x88x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x88x32_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %49, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n88k32.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      "{%44, %45, %46, %47},"
-      " %48,"
-      " p,   %50, %51;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x104x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x104x32_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[26];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %28, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n104k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25},"
-      " %26,"
-      " %27,"
-      " p,   %29, %30;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x104x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x104x32_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[26];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %31, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n104k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25},"
-      "{%26, %27, %28, %29},"
-      " %30,"
-      " p,   %32, %33;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x104x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x104x32_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[52];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %54, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n104k32.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      " %52,"
-      " %53,"
-      " p,    %55,  %56;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x104x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x104x32_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[52];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %57, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n104k32.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      "{%52,  %53,  %54,  %55},"
-      " %56,"
-      " p,    %58,  %59;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x112x32_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %30, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n112k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      " %28,"
-      " %29,"
-      " p,   %31, %32;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x112x32_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %33, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n112k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      "{%28, %29, %30, %31},"
-      " %32,"
-      " p,   %34, %35;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x112x32_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %58, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n112k32.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      " %56,"
-      " %57,"
-      " p,    %59,  %60;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x112x32_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %61, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n112k32.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"
-      " %60,"
-      " p,    %62,  %63;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x120x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x120x32_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[30];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %32, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n120k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29},"
-      " %30,"
-      " %31,"
-      " p,   %33, %34;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x120x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x120x32_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[30];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %35, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n120k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29},"
-      "{%30, %31, %32, %33},"
-      " %34,"
-      " p,   %36, %37;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x120x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x120x32_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %62, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n120k32.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      " %60,"
-      " %61,"
-      " p,    %63,  %64;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x120x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x120x32_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %65, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n120k32.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      "{%60,  %61,  %62,  %63},"
-      " %64,"
-      " p,    %66,  %67;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x136x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x136x32_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[34];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %36, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n136k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33},"
-      " %34,"
-      " %35,"
-      " p,   %37, %38;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x136x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x136x32_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[34];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %39, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n136k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33},"
-      "{%34, %35, %36, %37},"
-      " %38,"
-      " p,   %40, %41;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x136x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x136x32_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[68];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %70, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n136k32.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67},"
-      " %68,"
-      " %69,"
-      " p,    %71,  %72;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x136x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x136x32_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[68];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %73, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n136k32.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67},"
-      "{%68,  %69,  %70,  %71},"
-      " %72,"
-      " p,    %74,  %75;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x144x32_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[36];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %38, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n144k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      " %36,"
-      " %37,"
-      " p,   %39, %40;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x144x32_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[36];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %41, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n144k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      "{%36, %37, %38, %39},"
-      " %40,"
-      " p,   %42, %43;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x144x32_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %74, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n144k32.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      " %72,"
-      " %73,"
-      " p,    %75,  %76;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x144x32_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %77, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n144k32.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      "{%72,  %73,  %74,  %75},"
-      " %76,"
-      " p,    %78,  %79;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x152x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x152x32_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[38];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %40, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n152k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37},"
-      " %38,"
-      " %39,"
-      " p,   %41, %42;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x152x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x152x32_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[38];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %43, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n152k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37},"
-      "{%38, %39, %40, %41},"
-      " %42,"
-      " p,   %44, %45;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x152x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x152x32_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[76];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %78, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n152k32.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75},"
-      " %76,"
-      " %77,"
-      " p,    %79,  %80;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x152x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x152x32_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[76];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %81, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n152k32.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75},"
-      "{%76,  %77,  %78,  %79},"
-      " %80,"
-      " p,    %82,  %83;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x160x32_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %42, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n160k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      " %40,"
-      " %41,"
-      " p,   %43, %44;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x160x32_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %45, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n160k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"
-      " %44,"
-      " p,   %46, %47;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x160x32_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %82, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n160k32.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      " %80,"
-      " %81,"
-      " p,    %83,  %84;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x160x32_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %85, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n160k32.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      "{%80,  %81,  %82,  %83},"
-      " %84,"
-      " p,    %86,  %87;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x168x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x168x32_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[42];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %44, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n168k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41},"
-      " %42,"
-      " %43,"
-      " p,   %45, %46;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x168x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x168x32_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[42];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %47, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n168k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41},"
-      "{%42, %43, %44, %45},"
-      " %46,"
-      " p,   %48, %49;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x168x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x168x32_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[84];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %86, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n168k32.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83},"
-      " %84,"
-      " %85,"
-      " p,    %87,  %88;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x168x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x168x32_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[84];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %89, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n168k32.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83},"
-      "{%84,  %85,  %86,  %87},"
-      " %88,"
-      " p,    %90,  %91;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x176x32_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %46, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n176k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      " %44,"
-      " %45,"
-      " p,   %47, %48;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x176x32_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %49, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n176k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      "{%44, %45, %46, %47},"
-      " %48,"
-      " p,   %50, %51;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x176x32_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %90, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n176k32.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      " %88,"
-      " %89,"
-      " p,    %91,  %92;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x176x32_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %93, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n176k32.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      "{%88,  %89,  %90,  %91},"
-      " %92,"
-      " p,    %94,  %95;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x184x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x184x32_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[46];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %48, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n184k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45},"
-      " %46,"
-      " %47,"
-      " p,   %49, %50;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x184x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x184x32_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[46];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %51, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n184k32.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45},"
-      "{%46, %47, %48, %49},"
-      " %50,"
-      " p,   %52, %53;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x184x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x184x32_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[92];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %94, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n184k32.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91},"
-      " %92,"
-      " %93,"
-      " p,    %95,  %96;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x184x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x184x32_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[92];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %97, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n184k32.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91},"
-      "{%92,  %93,  %94,  %95},"
-      " %96,"
-      " p,    %98,  %99;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x200x32 TN F16+=E4M3*E4M3 -- SS form: A and B are both passed as 64-bit descriptors (desc_a, desc_b).
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; becomes asm immediate %53
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value; becomes asm immediate %54
->
-struct MMA_64x200x32_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;  // fma() takes no separate destination arguments
-  using ARegisters = uint64_t[1];  // desc_a
-  using BRegisters = uint64_t[1];  // desc_b
-  using CRegisters = uint32_t[50];  // d00..d49, read-write; presumably two packed F16 values per word -- confirm
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to set predicate p below
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path exists only when this macro is defined
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the source line and both descriptors to the synclog helper
-    asm volatile(  // NOTE(review): no fence/commit/wait is issued in this function -- presumably the caller synchronizes; confirm
-    "{\n"
-      ".reg .pred p;\n"  // declare predicate register p
-      "setp.ne.b32 p, %52, 0;\n"  // p = (scale_D != 0); %52 is int32_t(scale_D)
-      "wgmma.mma_async.sync.aligned.m64n200k32.f16.e4m3.e4m3 "  // shape m64n200k32, f16 result, e4m3 x e4m3 inputs
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49},"  // %0..%49 = d00..d49
-      " %50,"  // desc_a
-      " %51,"  // desc_b
-      " p,    %53,  %54;\n"  // predicate p, then immediates scaleA (%53) and scaleB (%54)
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs: read-write ("+r") 32-bit registers
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49)
-      :  "l"(desc_a),  // %50: 64-bit register input
-         "l"(desc_b),  // %51: 64-bit register input
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %52 register; %53 and %54 immediates ("n")
-#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x200x32 TN F16+=E4M3*E4M3 -- RS form: A is four 32-bit registers (a00..a03), B is a 64-bit descriptor (desc_b).
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; becomes asm immediate %56
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value; becomes asm immediate %57
->
-struct MMA_64x200x32_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;  // fma() takes no separate destination arguments
-  using ARegisters = uint32_t[4];  // a00..a03
-  using BRegisters = uint64_t[1];  // desc_b
-  using CRegisters = uint32_t[50];  // d00..d49, read-write; presumably two packed F16 values per word -- confirm
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to set predicate p below
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path exists only when this macro is defined
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the source line and desc_b to the synclog helper
-    asm volatile(  // NOTE(review): no fence/commit/wait is issued in this function -- presumably the caller synchronizes; confirm
-    "{\n"
-      ".reg .pred p;\n"  // declare predicate register p
-      "setp.ne.b32 p, %55, 0;\n"  // p = (scale_D != 0); %55 is int32_t(scale_D)
-      "wgmma.mma_async.sync.aligned.m64n200k32.f16.e4m3.e4m3 "  // shape m64n200k32, f16 result, e4m3 x e4m3 inputs
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49},"  // %0..%49 = d00..d49
-      "{%50,  %51,  %52,  %53},"  // a00..a03
-      " %54,"  // desc_b
-      " p,    %56,  %57;\n"  // predicate p, then immediates scaleA (%56) and scaleB (%57)
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs: read-write ("+r") 32-bit registers
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %50..%53: 32-bit register inputs
-         "l"(desc_b),  // %54: 64-bit register input
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %55 register; %56 and %57 immediates ("n")
-#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x200x32 TN F32+=E4M3*E4M3 -- SS form: A and B are both passed as 64-bit descriptors (desc_a, desc_b).
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; becomes asm immediate %103
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value; becomes asm immediate %104
->
-struct MMA_64x200x32_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;  // fma() takes no separate destination arguments
-  using ARegisters = uint64_t[1];  // desc_a
-  using BRegisters = uint64_t[1];  // desc_b
-  using CRegisters = float[100];  // d000..d099, read-write
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to set predicate p below
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path exists only when this macro is defined
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the source line and both descriptors to the synclog helper
-    asm volatile(  // NOTE(review): no fence/commit/wait is issued in this function -- presumably the caller synchronizes; confirm
-    "{\n"
-      ".reg .pred p;\n"  // declare predicate register p
-      "setp.ne.b32 p, %102, 0;\n"  // p = (scale_D != 0); %102 is int32_t(scale_D)
-      "wgmma.mma_async.sync.aligned.m64n200k32.f32.e4m3.e4m3 "  // shape m64n200k32, f32 result, e4m3 x e4m3 inputs
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99},"  // %0..%99 = d000..d099
-      " %100,"  // desc_a
-      " %101,"  // desc_b
-      " p,    %103, %104;\n"  // predicate p, then immediates scaleA (%103) and scaleB (%104)
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // outputs: read-write ("+f") float registers
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
-      :  "l"(desc_a),  // %100: 64-bit register input
-         "l"(desc_b),  // %101: 64-bit register input
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %102 register; %103 and %104 immediates ("n")
-#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x200x32 TN F32+=E4M3*E4M3 -- RS form: A is four 32-bit registers (a000..a003), B is a 64-bit descriptor (desc_b).
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; becomes asm immediate %106
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value; becomes asm immediate %107
->
-struct MMA_64x200x32_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;  // fma() takes no separate destination arguments
-  using ARegisters = uint32_t[4];  // a000..a003
-  using BRegisters = uint64_t[1];  // desc_b
-  using CRegisters = float[100];  // d000..d099, read-write
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to set predicate p below
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path exists only when this macro is defined
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the source line and desc_b to the synclog helper
-    asm volatile(  // NOTE(review): no fence/commit/wait is issued in this function -- presumably the caller synchronizes; confirm
-    "{\n"
-      ".reg .pred p;\n"  // declare predicate register p
-      "setp.ne.b32 p, %105, 0;\n"  // p = (scale_D != 0); %105 is int32_t(scale_D)
-      "wgmma.mma_async.sync.aligned.m64n200k32.f32.e4m3.e4m3 "  // shape m64n200k32, f32 result, e4m3 x e4m3 inputs
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99},"  // %0..%99 = d000..d099
-      "{%100, %101, %102, %103},"  // a000..a003
-      " %104,"  // desc_b
-      " p,    %106, %107;\n"  // predicate p, then immediates scaleA (%106) and scaleB (%107)
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // outputs: read-write ("+f") float registers
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %100..%103: 32-bit register inputs
-         "l"(desc_b),  // %104: 64-bit register input
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %105 register; %106 and %107 immediates ("n")
-#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x32 TN F16+=E4M3*E4M3 -- SS form: A and B are both passed as 64-bit descriptors (desc_a, desc_b).
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; becomes asm immediate %55
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value; becomes asm immediate %56
->
-struct MMA_64x208x32_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;  // fma() takes no separate destination arguments
-  using ARegisters = uint64_t[1];  // desc_a
-  using BRegisters = uint64_t[1];  // desc_b
-  using CRegisters = uint32_t[52];  // d00..d51, read-write; presumably two packed F16 values per word -- confirm
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to set predicate p below
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path exists only when this macro is defined
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the source line and both descriptors to the synclog helper
-    asm volatile(  // NOTE(review): no fence/commit/wait is issued in this function -- presumably the caller synchronizes; confirm
-    "{\n"
-      ".reg .pred p;\n"  // declare predicate register p
-      "setp.ne.b32 p, %54, 0;\n"  // p = (scale_D != 0); %54 is int32_t(scale_D)
-      "wgmma.mma_async.sync.aligned.m64n208k32.f16.e4m3.e4m3 "  // shape m64n208k32, f16 result, e4m3 x e4m3 inputs
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"  // %0..%51 = d00..d51
-      " %52,"  // desc_a
-      " %53,"  // desc_b
-      " p,    %55,  %56;\n"  // predicate p, then immediates scaleA (%55) and scaleB (%56)
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs: read-write ("+r") 32-bit registers
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
-      :  "l"(desc_a),  // %52: 64-bit register input
-         "l"(desc_b),  // %53: 64-bit register input
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %54 register; %55 and %56 immediates ("n")
-#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x32 TN F16+=E4M3*E4M3 -- RS form: A is four 32-bit registers (a00..a03), B is a 64-bit descriptor (desc_b).
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; becomes asm immediate %58
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value; becomes asm immediate %59
->
-struct MMA_64x208x32_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;  // fma() takes no separate destination arguments
-  using ARegisters = uint32_t[4];  // a00..a03
-  using BRegisters = uint64_t[1];  // desc_b
-  using CRegisters = uint32_t[52];  // d00..d51, read-write; presumably two packed F16 values per word -- confirm
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to set predicate p below
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path exists only when this macro is defined
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands the source line and desc_b to the synclog helper
-    asm volatile(  // NOTE(review): no fence/commit/wait is issued in this function -- presumably the caller synchronizes; confirm
-    "{\n"
-      ".reg .pred p;\n"  // declare predicate register p
-      "setp.ne.b32 p, %57, 0;\n"  // p = (scale_D != 0); %57 is int32_t(scale_D)
-      "wgmma.mma_async.sync.aligned.m64n208k32.f16.e4m3.e4m3 "  // shape m64n208k32, f16 result, e4m3 x e4m3 inputs
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"  // %0..%51 = d00..d51
-      "{%52,  %53,  %54,  %55},"  // a00..a03
-      " %56,"  // desc_b
-      " p,    %58,  %59;\n"  // predicate p, then immediates scaleA (%58) and scaleB (%59)
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // outputs: read-write ("+r") 32-bit registers
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %52..%55: 32-bit register inputs
-         "l"(desc_b),  // %56: 64-bit register input
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %57 register; %58 and %59 immediates ("n")
-#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x32 TN F32+=E4M3*E4M3 -- SS form: A and B are both passed as 64-bit descriptors (desc_a, desc_b).
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time value; becomes asm immediate %107
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One  // compile-time value; becomes asm immediate %108
->
-struct MMA_64x208x32_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;  // fma() takes no separate destination arguments
-  using ARegisters = uint64_t[1];  // desc_a
-  using BRegisters = uint64_t[1];  // desc_b
-  using CRegisters = float[104];  // d000..d103, read-write
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; tested against 0 to set predicate p below
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the asm path exists only when this macro is defined
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands the source line and both descriptors to the synclog helper
-    asm volatile(  // NOTE(review): no fence/commit/wait is issued in this function -- presumably the caller synchronizes; confirm
-    "{\n"
-      ".reg .pred p;\n"  // declare predicate register p
-      "setp.ne.b32 p, %106, 0;\n"  // p = (scale_D != 0); %106 is int32_t(scale_D)
-      "wgmma.mma_async.sync.aligned.m64n208k32.f32.e4m3.e4m3 "  // shape m64n208k32, f32 result, e4m3 x e4m3 inputs
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"  // %0..%103 = d000..d103
-      " %104,"  // desc_a
-      " %105,"  // desc_b
-      " p,    %107, %108;\n"  // predicate p, then immediates scaleA (%107) and scaleB (%108)
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // outputs: read-write ("+f") float registers
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
-      :  "l"(desc_a),  // %104: 64-bit register input
-         "l"(desc_b),  // %105: 64-bit register input
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %106 register; %107 and %108 immediates ("n")
-#else  // built without CUTE_ARCH_MMA_SM90A_ENABLED
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x208x32_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %109, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n208k32.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      "{%104, %105, %106, %107},"
-      " %108,"
-      " p,    %110, %111;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x216x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x216x32_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[54];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %56, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n216k32.f16.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53},"
-      " %54,"
-      " %55,"
-      " p,    %57,  %58;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x216x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x216x32_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[54];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %59, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n216k32.f16.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53},"
-      "{%54,  %55,  %56,  %57},"
-      " %58,"
-      " p,    %60,  %61;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x216x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x216x32_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[108];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %110, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n216k32.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107},"
-      " %108,"
-      " %109,"
-      " p,    %111, %112;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x216x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x216x32_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[108];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %113, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n216k32.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107},"
-      "{%108, %109, %110, %111},"
-      " %112,"
-      " p,    %114, %115;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x224x32_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %58, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n224k32.f16.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      " %56,"
-      " %57,"
-      " p,    %59,  %60;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x224x32_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %61, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n224k32.f16.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"
-      " %60,"
-      " p,    %62,  %63;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x224x32_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %114, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n224k32.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      " %112,"
-      " %113,"
-      " p,    %115, %116;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x224x32_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %117, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n224k32.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      "{%112, %113, %114, %115},"
-      " %116,"
-      " p,    %118, %119;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x232x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x232x32_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[58];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %60, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n232k32.f16.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57},"
-      " %58,"
-      " %59,"
-      " p,    %61,  %62;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x232x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x232x32_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[58];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %63, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n232k32.f16.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57},"
-      "{%58,  %59,  %60,  %61},"
-      " %62,"
-      " p,    %64,  %65;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x232x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x232x32_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[116];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %118, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n232k32.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115},"
-      " %116,"
-      " %117,"
-      " p,    %119, %120;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x232x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x232x32_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[116];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %121, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n232k32.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115},"
-      "{%116, %117, %118, %119},"
-      " %120,"
-      " p,    %122, %123;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x240x32_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %62, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n240k32.f16.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      " %60,"
-      " %61,"
-      " p,    %63,  %64;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x240x32_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %65, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n240k32.f16.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      "{%60,  %61,  %62,  %63},"
-      " %64,"
-      " p,    %66,  %67;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x240x32_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %122, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n240k32.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      " %120,"
-      " %121,"
-      " p,    %123, %124;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x240x32_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %125, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n240k32.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      "{%120, %121, %122, %123},"
-      " %124,"
-      " p,    %126, %127;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x248x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x248x32_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[62];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %64, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n248k32.f16.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61},"
-      " %62,"
-      " %63,"
-      " p,    %65,  %66;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x248x32 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x248x32_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[62];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %67, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n248k32.f16.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61},"
-      "{%62,  %63,  %64,  %65},"
-      " %66,"
-      " p,    %68,  %69;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x248x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x248x32_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[124];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %126, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n248k32.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123},"
-      " %124,"
-      " %125,"
-      " p,    %127, %128;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x248x32 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x248x32_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[124];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %129, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n248k32.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123},"
-      "{%124, %125, %126, %127},"
-      " %128,"
-      " p,    %130, %131;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x24x32_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[6];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %8, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n24k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5},"
-      " %6,"
-      " %7,"
-      " p,   %9,  %10;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x24x32_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[6];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %11, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n24k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5},"
-      "{%6,  %7,  %8,  %9},"
-      " %10,"
-      " p,   %12, %13;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x24x32_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %14, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n24k32.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      " %12,"
-      " %13,"
-      " p,   %15, %16;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x24x32_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %17, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n24k32.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " p,   %18, %19;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x40x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x40x32_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[10];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %12, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n40k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9},"
-      " %10,"
-      " %11,"
-      " p,   %13, %14;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x40x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x40x32_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[10];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %15, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n40k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9},"
-      "{%10, %11, %12, %13},"
-      " %14,"
-      " p,   %16, %17;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x40x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x40x32_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %22, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n40k32.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      " %20,"
-      " %21,"
-      " p,   %23, %24;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x40x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x40x32_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %25, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n40k32.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      "{%20, %21, %22, %23},"
-      " %24,"
-      " p,   %26, %27;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x48x32_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %14, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n48k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      " %12,"
-      " %13,"
-      " p,   %15, %16;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x48x32_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %17, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n48k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " p,   %18, %19;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x48x32_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %26, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n48k32.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      " %24,"
-      " %25,"
-      " p,   %27, %28;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x48x32_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %29, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n48k32.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " p,   %30, %31;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x56x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x56x32_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[14];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %16, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n56k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13},"
-      " %14,"
-      " %15,"
-      " p,   %17, %18;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x56x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x56x32_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[14];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %19, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n56k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13},"
-      "{%14, %15, %16, %17},"
-      " %18,"
-      " p,   %20, %21;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x56x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x56x32_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %30, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n56k32.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      " %28,"
-      " %29,"
-      " p,   %31, %32;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x56x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x56x32_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %33, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n56k32.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      "{%28, %29, %30, %31},"
-      " %32,"
-      " p,   %34, %35;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x72x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x72x32_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[18];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %20, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n72k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17},"
-      " %18,"
-      " %19,"
-      " p,   %21, %22;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x72x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x72x32_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[18];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %23, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n72k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17},"
-      "{%18, %19, %20, %21},"
-      " %22,"
-      " p,   %24, %25;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x72x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x72x32_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[36];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %38, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n72k32.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      " %36,"
-      " %37,"
-      " p,   %39, %40;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x72x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x72x32_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[36];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %41, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n72k32.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      "{%36, %37, %38, %39},"
-      " %40,"
-      " p,   %42, %43;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x80x32_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %22, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n80k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      " %20,"
-      " %21,"
-      " p,   %23, %24;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x80x32_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %25, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n80k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      "{%20, %21, %22, %23},"
-      " %24,"
-      " p,   %26, %27;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x80x32_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %42, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n80k32.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      " %40,"
-      " %41,"
-      " p,   %43, %44;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x80x32_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %45, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n80k32.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"
-      " %44,"
-      " p,   %46, %47;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x88x32 TN F16+=E4M3*E5M2
-//
-// SS variant: A and B are both supplied as 64-bit descriptors (desc_a, desc_b).
-// Wraps the PTX instruction wgmma.mma_async.sync.aligned.m64n88k32.f16.e4m3.e5m2.
-// The accumulator is held in 22 32-bit registers (d00..d21) that the asm both
-// reads and writes ("+r" constraints).
-// scaleA / scaleB are template parameters handed to the instruction as
-// immediates ("n" constraints); scale_D is a runtime argument.
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x88x32_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[22];
-
-  // Issues the wgmma instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined;
-  // otherwise invokes CUTE_INVALID_CONTROL_PATH with a diagnostic message.
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    // Operand map: %0-%21 = d00..d21, %22 = desc_a, %23 = desc_b,
-    // %24 = scale_D, %25 = scaleA, %26 = scaleB.
-    // Predicate p is set when scale_D != 0 and is passed to wgmma ahead of the
-    // two input-scale immediates.
-    // NOTE(review): per the PTX ISA, p presumably selects whether D is
-    // accumulated into or overwritten -- confirm.
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %24, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n88k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21},"
-      " %22,"
-      " %23,"
-      " p,   %25, %26;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x88x32 TN F16+=E4M3*E5M2
-//
-// RS variant: A is supplied as four 32-bit registers (a00..a03) and B as a
-// 64-bit descriptor (desc_b).
-// Wraps the PTX instruction wgmma.mma_async.sync.aligned.m64n88k32.f16.e4m3.e5m2.
-// The accumulator is held in 22 32-bit registers (d00..d21) that the asm both
-// reads and writes ("+r" constraints).
-// scaleA / scaleB are template parameters handed to the instruction as
-// immediates ("n" constraints); scale_D is a runtime argument.
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x88x32_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[22];
-
-  // Issues the wgmma instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined;
-  // otherwise invokes CUTE_INVALID_CONTROL_PATH with a diagnostic message.
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    // Operand map: %0-%21 = d00..d21, %22-%25 = a00..a03, %26 = desc_b,
-    // %27 = scale_D, %28 = scaleA, %29 = scaleB.
-    // Predicate p is set when scale_D != 0 and is passed to wgmma ahead of the
-    // two input-scale immediates.
-    // NOTE(review): per the PTX ISA, p presumably selects whether D is
-    // accumulated into or overwritten -- confirm.
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %27, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n88k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21},"
-      "{%22, %23, %24, %25},"
-      " %26,"
-      " p,   %28, %29;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x88x32 TN F32+=E4M3*E5M2
-//
-// SS variant: A and B are both supplied as 64-bit descriptors (desc_a, desc_b).
-// Wraps the PTX instruction wgmma.mma_async.sync.aligned.m64n88k32.f32.e4m3.e5m2.
-// The accumulator is held in 44 float registers (d00..d43) that the asm both
-// reads and writes ("+f" constraints).
-// scaleA / scaleB are template parameters handed to the instruction as
-// immediates ("n" constraints); scale_D is a runtime argument.
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x88x32_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[44];
-
-  // Issues the wgmma instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined;
-  // otherwise invokes CUTE_INVALID_CONTROL_PATH with a diagnostic message.
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    // Operand map: %0-%43 = d00..d43, %44 = desc_a, %45 = desc_b,
-    // %46 = scale_D, %47 = scaleA, %48 = scaleB.
-    // Predicate p is set when scale_D != 0 and is passed to wgmma ahead of the
-    // two input-scale immediates.
-    // NOTE(review): per the PTX ISA, p presumably selects whether D is
-    // accumulated into or overwritten -- confirm.
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %46, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n88k32.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      " %44,"
-      " %45,"
-      " p,   %47, %48;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x88x32 TN F32+=E4M3*E5M2
-//
-// RS variant: A is supplied as four 32-bit registers (a00..a03) and B as a
-// 64-bit descriptor (desc_b).
-// Wraps the PTX instruction wgmma.mma_async.sync.aligned.m64n88k32.f32.e4m3.e5m2.
-// The accumulator is held in 44 float registers (d00..d43) that the asm both
-// reads and writes ("+f" constraints).
-// scaleA / scaleB are template parameters handed to the instruction as
-// immediates ("n" constraints); scale_D is a runtime argument.
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x88x32_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[44];
-
-  // Issues the wgmma instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined;
-  // otherwise invokes CUTE_INVALID_CONTROL_PATH with a diagnostic message.
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    // Operand map: %0-%43 = d00..d43, %44-%47 = a00..a03, %48 = desc_b,
-    // %49 = scale_D, %50 = scaleA, %51 = scaleB.
-    // Predicate p is set when scale_D != 0 and is passed to wgmma ahead of the
-    // two input-scale immediates.
-    // NOTE(review): per the PTX ISA, p presumably selects whether D is
-    // accumulated into or overwritten -- confirm.
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %49, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n88k32.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      "{%44, %45, %46, %47},"
-      " %48,"
-      " p,   %50, %51;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x104x32 TN F16+=E4M3*E5M2
-//
-// SS variant: A and B are both supplied as 64-bit descriptors (desc_a, desc_b).
-// Wraps the PTX instruction wgmma.mma_async.sync.aligned.m64n104k32.f16.e4m3.e5m2.
-// The accumulator is held in 26 32-bit registers (d00..d25) that the asm both
-// reads and writes ("+r" constraints).
-// scaleA / scaleB are template parameters handed to the instruction as
-// immediates ("n" constraints); scale_D is a runtime argument.
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x104x32_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[26];
-
-  // Issues the wgmma instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined;
-  // otherwise invokes CUTE_INVALID_CONTROL_PATH with a diagnostic message.
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    // Operand map: %0-%25 = d00..d25, %26 = desc_a, %27 = desc_b,
-    // %28 = scale_D, %29 = scaleA, %30 = scaleB.
-    // Predicate p is set when scale_D != 0 and is passed to wgmma ahead of the
-    // two input-scale immediates.
-    // NOTE(review): per the PTX ISA, p presumably selects whether D is
-    // accumulated into or overwritten -- confirm.
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %28, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n104k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25},"
-      " %26,"
-      " %27,"
-      " p,   %29, %30;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x104x32 TN F16+=E4M3*E5M2
-//
-// RS variant: A is supplied as four 32-bit registers (a00..a03) and B as a
-// 64-bit descriptor (desc_b).
-// Wraps the PTX instruction wgmma.mma_async.sync.aligned.m64n104k32.f16.e4m3.e5m2.
-// The accumulator is held in 26 32-bit registers (d00..d25) that the asm both
-// reads and writes ("+r" constraints).
-// scaleA / scaleB are template parameters handed to the instruction as
-// immediates ("n" constraints); scale_D is a runtime argument.
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x104x32_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[26];
-
-  // Issues the wgmma instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined;
-  // otherwise invokes CUTE_INVALID_CONTROL_PATH with a diagnostic message.
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    // Operand map: %0-%25 = d00..d25, %26-%29 = a00..a03, %30 = desc_b,
-    // %31 = scale_D, %32 = scaleA, %33 = scaleB.
-    // Predicate p is set when scale_D != 0 and is passed to wgmma ahead of the
-    // two input-scale immediates.
-    // NOTE(review): per the PTX ISA, p presumably selects whether D is
-    // accumulated into or overwritten -- confirm.
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %31, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n104k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25},"
-      "{%26, %27, %28, %29},"
-      " %30,"
-      " p,   %32, %33;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x104x32 TN F32+=E4M3*E5M2
-//
-// SS variant: A and B are both supplied as 64-bit descriptors (desc_a, desc_b).
-// Wraps the PTX instruction wgmma.mma_async.sync.aligned.m64n104k32.f32.e4m3.e5m2.
-// The accumulator is held in 52 float registers (d00..d51) that the asm both
-// reads and writes ("+f" constraints).
-// scaleA / scaleB are template parameters handed to the instruction as
-// immediates ("n" constraints); scale_D is a runtime argument.
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x104x32_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[52];
-
-  // Issues the wgmma instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined;
-  // otherwise invokes CUTE_INVALID_CONTROL_PATH with a diagnostic message.
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    // Operand map: %0-%51 = d00..d51, %52 = desc_a, %53 = desc_b,
-    // %54 = scale_D, %55 = scaleA, %56 = scaleB.
-    // Predicate p is set when scale_D != 0 and is passed to wgmma ahead of the
-    // two input-scale immediates.
-    // NOTE(review): per the PTX ISA, p presumably selects whether D is
-    // accumulated into or overwritten -- confirm.
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %54, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n104k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      " %52,"
-      " %53,"
-      " p,    %55,  %56;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x104x32 TN F32+=E4M3*E5M2
-//
-// RS variant: A is supplied as four 32-bit registers (a00..a03) and B as a
-// 64-bit descriptor (desc_b).
-// Wraps the PTX instruction wgmma.mma_async.sync.aligned.m64n104k32.f32.e4m3.e5m2.
-// The accumulator is held in 52 float registers (d00..d51) that the asm both
-// reads and writes ("+f" constraints).
-// scaleA / scaleB are template parameters handed to the instruction as
-// immediates ("n" constraints); scale_D is a runtime argument.
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x104x32_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[52];
-
-  // Issues the wgmma instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined;
-  // otherwise invokes CUTE_INVALID_CONTROL_PATH with a diagnostic message.
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    // Operand map: %0-%51 = d00..d51, %52-%55 = a00..a03, %56 = desc_b,
-    // %57 = scale_D, %58 = scaleA, %59 = scaleB.
-    // Predicate p is set when scale_D != 0 and is passed to wgmma ahead of the
-    // two input-scale immediates.
-    // NOTE(review): per the PTX ISA, p presumably selects whether D is
-    // accumulated into or overwritten -- confirm.
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %57, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n104k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      "{%52,  %53,  %54,  %55},"
-      " %56,"
-      " p,    %58,  %59;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x32 TN F16+=E4M3*E5M2
-//
-// SS variant: A and B are both supplied as 64-bit descriptors (desc_a, desc_b).
-// Wraps the PTX instruction wgmma.mma_async.sync.aligned.m64n112k32.f16.e4m3.e5m2.
-// The accumulator is held in 28 32-bit registers (d00..d27) that the asm both
-// reads and writes ("+r" constraints).
-// scaleA / scaleB are template parameters handed to the instruction as
-// immediates ("n" constraints); scale_D is a runtime argument.
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x112x32_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[28];
-
-  // Issues the wgmma instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined;
-  // otherwise invokes CUTE_INVALID_CONTROL_PATH with a diagnostic message.
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    // Operand map: %0-%27 = d00..d27, %28 = desc_a, %29 = desc_b,
-    // %30 = scale_D, %31 = scaleA, %32 = scaleB.
-    // Predicate p is set when scale_D != 0 and is passed to wgmma ahead of the
-    // two input-scale immediates.
-    // NOTE(review): per the PTX ISA, p presumably selects whether D is
-    // accumulated into or overwritten -- confirm.
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %30, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n112k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      " %28,"
-      " %29,"
-      " p,   %31, %32;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x32 TN F16+=E4M3*E5M2
-//
-// RS variant: A is supplied as four 32-bit registers (a00..a03) and B as a
-// 64-bit descriptor (desc_b).
-// Wraps the PTX instruction wgmma.mma_async.sync.aligned.m64n112k32.f16.e4m3.e5m2.
-// The accumulator is held in 28 32-bit registers (d00..d27) that the asm both
-// reads and writes ("+r" constraints).
-// scaleA / scaleB are template parameters handed to the instruction as
-// immediates ("n" constraints); scale_D is a runtime argument.
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x112x32_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[28];
-
-  // Issues the wgmma instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined;
-  // otherwise invokes CUTE_INVALID_CONTROL_PATH with a diagnostic message.
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    // Operand map: %0-%27 = d00..d27, %28-%31 = a00..a03, %32 = desc_b,
-    // %33 = scale_D, %34 = scaleA, %35 = scaleB.
-    // Predicate p is set when scale_D != 0 and is passed to wgmma ahead of the
-    // two input-scale immediates.
-    // NOTE(review): per the PTX ISA, p presumably selects whether D is
-    // accumulated into or overwritten -- confirm.
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %33, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n112k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      "{%28, %29, %30, %31},"
-      " %32,"
-      " p,   %34, %35;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x32 TN F32+=E4M3*E5M2
-//
-// SS variant: A and B are both supplied as 64-bit descriptors (desc_a, desc_b).
-// Wraps the PTX instruction wgmma.mma_async.sync.aligned.m64n112k32.f32.e4m3.e5m2.
-// The accumulator is held in 56 float registers (d00..d55) that the asm both
-// reads and writes ("+f" constraints).
-// scaleA / scaleB are template parameters handed to the instruction as
-// immediates ("n" constraints); scale_D is a runtime argument.
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x112x32_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[56];
-
-  // Issues the wgmma instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined;
-  // otherwise invokes CUTE_INVALID_CONTROL_PATH with a diagnostic message.
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    // Operand map: %0-%55 = d00..d55, %56 = desc_a, %57 = desc_b,
-    // %58 = scale_D, %59 = scaleA, %60 = scaleB.
-    // Predicate p is set when scale_D != 0 and is passed to wgmma ahead of the
-    // two input-scale immediates.
-    // NOTE(review): per the PTX ISA, p presumably selects whether D is
-    // accumulated into or overwritten -- confirm.
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %58, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n112k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      " %56,"
-      " %57,"
-      " p,    %59,  %60;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x32 TN F32+=E4M3*E5M2
-//
-// RS variant: A is supplied as four 32-bit registers (a00..a03) and B as a
-// 64-bit descriptor (desc_b).
-// Wraps the PTX instruction wgmma.mma_async.sync.aligned.m64n112k32.f32.e4m3.e5m2.
-// The accumulator is held in 56 float registers (d00..d55) that the asm both
-// reads and writes ("+f" constraints).
-// scaleA / scaleB are template parameters handed to the instruction as
-// immediates ("n" constraints); scale_D is a runtime argument.
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x112x32_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[56];
-
-  // Issues the wgmma instruction when CUTE_ARCH_MMA_SM90A_ENABLED is defined;
-  // otherwise invokes CUTE_INVALID_CONTROL_PATH with a diagnostic message.
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    // Operand map: %0-%55 = d00..d55, %56-%59 = a00..a03, %60 = desc_b,
-    // %61 = scale_D, %62 = scaleA, %63 = scaleB.
-    // Predicate p is set when scale_D != 0 and is passed to wgmma ahead of the
-    // two input-scale immediates.
-    // NOTE(review): per the PTX ISA, p presumably selects whether D is
-    // accumulated into or overwritten -- confirm.
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %61, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n112k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"
-      " %60,"
-      " p,    %62,  %63;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x120x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x120x32_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[30];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %32, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n120k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29},"
-      " %30,"
-      " %31,"
-      " p,   %33, %34;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x120x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x120x32_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[30];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %35, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n120k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29},"
-      "{%30, %31, %32, %33},"
-      " %34,"
-      " p,   %36, %37;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x120x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x120x32_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %62, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n120k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      " %60,"
-      " %61,"
-      " p,    %63,  %64;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x120x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x120x32_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %65, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n120k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      "{%60,  %61,  %62,  %63},"
-      " %64,"
-      " p,    %66,  %67;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x136x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x136x32_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[34];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %36, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n136k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33},"
-      " %34,"
-      " %35,"
-      " p,   %37, %38;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x136x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x136x32_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[34];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %39, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n136k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33},"
-      "{%34, %35, %36, %37},"
-      " %38,"
-      " p,   %40, %41;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x136x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x136x32_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[68];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %70, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n136k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67},"
-      " %68,"
-      " %69,"
-      " p,    %71,  %72;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x136x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x136x32_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[68];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %73, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n136k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67},"
-      "{%68,  %69,  %70,  %71},"
-      " %72,"
-      " p,    %74,  %75;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x144x32_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[36];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %38, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n144k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      " %36,"
-      " %37,"
-      " p,   %39, %40;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x144x32_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[36];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %41, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n144k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      "{%36, %37, %38, %39},"
-      " %40,"
-      " p,   %42, %43;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x144x32_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %74, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n144k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      " %72,"
-      " %73,"
-      " p,    %75,  %76;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x144x32_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %77, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n144k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      "{%72,  %73,  %74,  %75},"
-      " %76,"
-      " p,    %78,  %79;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x152x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x152x32_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[38];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %40, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n152k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37},"
-      " %38,"
-      " %39,"
-      " p,   %41, %42;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x152x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x152x32_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[38];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %43, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n152k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37},"
-      "{%38, %39, %40, %41},"
-      " %42,"
-      " p,   %44, %45;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x152x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x152x32_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[76];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %78, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n152k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75},"
-      " %76,"
-      " %77,"
-      " p,    %79,  %80;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x152x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x152x32_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[76];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %81, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n152k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75},"
-      "{%76,  %77,  %78,  %79},"
-      " %80,"
-      " p,    %82,  %83;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x160x32_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %42, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n160k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      " %40,"
-      " %41,"
-      " p,   %43, %44;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x160x32_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %45, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n160k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"
-      " %44,"
-      " p,   %46, %47;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x160x32_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %82, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n160k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      " %80,"
-      " %81,"
-      " p,    %83,  %84;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x160x32_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %85, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n160k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      "{%80,  %81,  %82,  %83},"
-      " %84,"
-      " p,    %86,  %87;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x168x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x168x32_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[42];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %44, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n168k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41},"
-      " %42,"
-      " %43,"
-      " p,   %45, %46;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x168x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x168x32_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[42];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %47, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n168k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41},"
-      "{%42, %43, %44, %45},"
-      " %46,"
-      " p,   %48, %49;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x168x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x168x32_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[84];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %86, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n168k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83},"
-      " %84,"
-      " %85,"
-      " p,    %87,  %88;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x168x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x168x32_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[84];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %89, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n168k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83},"
-      "{%84,  %85,  %86,  %87},"
-      " %88,"
-      " p,    %90,  %91;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x176x32_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %46, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n176k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      " %44,"
-      " %45,"
-      " p,   %47, %48;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x176x32_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %49, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n176k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      "{%44, %45, %46, %47},"
-      " %48,"
-      " p,   %50, %51;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x176x32_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %90, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n176k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      " %88,"
-      " %89,"
-      " p,    %91,  %92;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x176x32_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %93, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n176k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      "{%88,  %89,  %90,  %91},"
-      " %92,"
-      " p,    %94,  %95;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x184x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x184x32_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[46];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %48, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n184k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45},"
-      " %46,"
-      " %47,"
-      " p,   %49, %50;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x184x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x184x32_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[46];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %51, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n184k32.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45},"
-      "{%46, %47, %48, %49},"
-      " %50,"
-      " p,   %52, %53;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x184x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x184x32_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[92];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %94, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n184k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91},"
-      " %92,"
-      " %93,"
-      " p,    %95,  %96;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x184x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x184x32_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[92];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %97, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n184k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91},"
-      "{%92,  %93,  %94,  %95},"
-      " %96,"
-      " p,    %98,  %99;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x200x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x200x32_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[50];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %52, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n200k32.f16.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49},"
-      " %50,"
-      " %51,"
-      " p,    %53,  %54;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x200x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x200x32_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[50];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %55, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n200k32.f16.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49},"
-      "{%50,  %51,  %52,  %53},"
-      " %54,"
-      " p,    %56,  %57;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x200x32 TN F32+=E4M3*E5M2 -- SS form: A and B are each passed as one 64-bit descriptor; wraps a single wgmma.mma_async instruction.
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the asm as an immediate ("n") operand
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // handed to the asm as an immediate ("n") operand
->
-struct MMA_64x200x32_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;         // no separate output list: fma() updates the accumulators in place
-  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor (desc_a)
-  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
-  using CRegisters = float[100];   // 100 float accumulators, matching d000..d099 below
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to int32 and tested against 0 to form predicate p
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %102, 0;\n"  // p = (scale_D != 0); %102 is the scale_D input
-      "wgmma.mma_async.sync.aligned.m64n200k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99},"  // %0..%99: the 100 accumulators
-      " %100,"  // desc_a
-      " %101,"  // desc_b
-      " p,    %103, %104;\n"  // predicate from scale_D, then scaleA and scaleB immediates
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // "+f": each accumulator is both read and written
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
-      :  "l"(desc_a),  // %100
-         "l"(desc_b),  // %101
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %102, %103, %104
-#else  // macro not defined: report misuse instead of emitting the instruction
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x200x32 TN F32+=E4M3*E5M2 -- RS form: A is passed as four 32-bit registers, B as one 64-bit descriptor; wraps a single wgmma.mma_async instruction.
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the asm as an immediate ("n") operand
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // handed to the asm as an immediate ("n") operand
->
-struct MMA_64x200x32_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;         // no separate output list: fma() updates the accumulators in place
-  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a000..a003)
-  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
-  using CRegisters = float[100];   // 100 float accumulators, matching d000..d099 below
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to int32 and tested against 0 to form predicate p
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and the B descriptor
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %105, 0;\n"  // p = (scale_D != 0); %105 is the scale_D input
-      "wgmma.mma_async.sync.aligned.m64n200k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99},"  // %0..%99: the 100 accumulators
-      "{%100, %101, %102, %103},"  // a000..a003
-      " %104,"  // desc_b
-      " p,    %106, %107;\n"  // predicate from scale_D, then scaleA and scaleB immediates
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // "+f": each accumulator is both read and written
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %100..%103
-         "l"(desc_b),  // %104
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %105, %106, %107
-#else  // macro not defined: report misuse instead of emitting the instruction
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x32 TN F16+=E4M3*E5M2 -- SS form: A and B are each passed as one 64-bit descriptor; wraps a single wgmma.mma_async instruction.
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the asm as an immediate ("n") operand
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // handed to the asm as an immediate ("n") operand
->
-struct MMA_64x208x32_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;          // no separate output list: fma() updates the accumulators in place
-  using ARegisters = uint64_t[1];   // A operand: one 64-bit descriptor (desc_a)
-  using BRegisters = uint64_t[1];   // B operand: one 64-bit descriptor (desc_b)
-  using CRegisters = uint32_t[52];  // 52 32-bit accumulator registers (presumably two packed f16 values each -- confirm)
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to int32 and tested against 0 to form predicate p
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %54, 0;\n"  // p = (scale_D != 0); %54 is the scale_D input
-      "wgmma.mma_async.sync.aligned.m64n208k32.f16.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"  // %0..%51: the 52 accumulator registers
-      " %52,"  // desc_a
-      " %53,"  // desc_b
-      " p,    %55,  %56;\n"  // predicate from scale_D, then scaleA and scaleB immediates
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator register is both read and written
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
-      :  "l"(desc_a),  // %52
-         "l"(desc_b),  // %53
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %54, %55, %56
-#else  // macro not defined: report misuse instead of emitting the instruction
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x32 TN F16+=E4M3*E5M2 -- RS form: A is passed as four 32-bit registers, B as one 64-bit descriptor; wraps a single wgmma.mma_async instruction.
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the asm as an immediate ("n") operand
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // handed to the asm as an immediate ("n") operand
->
-struct MMA_64x208x32_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;          // no separate output list: fma() updates the accumulators in place
-  using ARegisters = uint32_t[4];   // A operand: four 32-bit registers (a00..a03)
-  using BRegisters = uint64_t[1];   // B operand: one 64-bit descriptor (desc_b)
-  using CRegisters = uint32_t[52];  // 52 32-bit accumulator registers (presumably two packed f16 values each -- confirm)
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to int32 and tested against 0 to form predicate p
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and the B descriptor
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %57, 0;\n"  // p = (scale_D != 0); %57 is the scale_D input
-      "wgmma.mma_async.sync.aligned.m64n208k32.f16.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"  // %0..%51: the 52 accumulator registers
-      "{%52,  %53,  %54,  %55},"  // a00..a03
-      " %56,"  // desc_b
-      " p,    %58,  %59;\n"  // predicate from scale_D, then scaleA and scaleB immediates
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator register is both read and written
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),  // %52..%55
-         "l"(desc_b),  // %56
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %57, %58, %59
-#else  // macro not defined: report misuse instead of emitting the instruction
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x32 TN F32+=E4M3*E5M2 -- SS form: A and B are each passed as one 64-bit descriptor; wraps a single wgmma.mma_async instruction.
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the asm as an immediate ("n") operand
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // handed to the asm as an immediate ("n") operand
->
-struct MMA_64x208x32_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;         // no separate output list: fma() updates the accumulators in place
-  using ARegisters = uint64_t[1];  // A operand: one 64-bit descriptor (desc_a)
-  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
-  using CRegisters = float[104];   // 104 float accumulators, matching d000..d103 below
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to int32 and tested against 0 to form predicate p
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %106, 0;\n"  // p = (scale_D != 0); %106 is the scale_D input
-      "wgmma.mma_async.sync.aligned.m64n208k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"  // %0..%103: the 104 accumulators
-      " %104,"  // desc_a
-      " %105,"  // desc_b
-      " p,    %107, %108;\n"  // predicate from scale_D, then scaleA and scaleB immediates
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // "+f": each accumulator is both read and written
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
-      :  "l"(desc_a),  // %104
-         "l"(desc_b),  // %105
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %106, %107, %108
-#else  // macro not defined: report misuse instead of emitting the instruction
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x32 TN F32+=E4M3*E5M2 -- RS form: A is passed as four 32-bit registers, B as one 64-bit descriptor; wraps a single wgmma.mma_async instruction.
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the asm as an immediate ("n") operand
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // handed to the asm as an immediate ("n") operand
->
-struct MMA_64x208x32_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;         // no separate output list: fma() updates the accumulators in place
-  using ARegisters = uint32_t[4];  // A operand: four 32-bit registers (a000..a003)
-  using BRegisters = uint64_t[1];  // B operand: one 64-bit descriptor (desc_b)
-  using CRegisters = float[104];   // 104 float accumulators, matching d000..d103 below
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to int32 and tested against 0 to form predicate p
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // synclog hook: receives the source line and the B descriptor
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %109, 0;\n"  // p = (scale_D != 0); %109 is the scale_D input
-      "wgmma.mma_async.sync.aligned.m64n208k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"  // %0..%103: the 104 accumulators
-      "{%104, %105, %106, %107},"  // a000..a003
-      " %108,"  // desc_b
-      " p,    %110, %111;\n"  // predicate from scale_D, then scaleA and scaleB immediates
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // "+f": each accumulator is both read and written
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %104..%107
-         "l"(desc_b),  // %108
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %109, %110, %111
-#else  // macro not defined: report misuse instead of emitting the instruction
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x216x32 TN F16+=E4M3*E5M2 -- SS form: A and B are each passed as one 64-bit descriptor; wraps a single wgmma.mma_async instruction.
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // handed to the asm as an immediate ("n") operand
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One   // handed to the asm as an immediate ("n") operand
->
-struct MMA_64x216x32_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;          // no separate output list: fma() updates the accumulators in place
-  using ARegisters = uint64_t[1];   // A operand: one 64-bit descriptor (desc_a)
-  using BRegisters = uint64_t[1];   // B operand: one 64-bit descriptor (desc_b)
-  using CRegisters = uint32_t[54];  // 54 32-bit accumulator registers (presumably two packed f16 values each -- confirm)
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // converted to int32 and tested against 0 to form predicate p
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)  // the instruction is only emitted when this macro is defined
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // synclog hook: receives the source line and both descriptors
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %56, 0;\n"  // p = (scale_D != 0); %56 is the scale_D input
-      "wgmma.mma_async.sync.aligned.m64n216k32.f16.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53},"  // %0..%53: the 54 accumulator registers
-      " %54,"  // desc_a
-      " %55,"  // desc_b
-      " p,    %57,  %58;\n"  // predicate from scale_D, then scaleA and scaleB immediates
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),  // "+r": each accumulator register is both read and written
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53)
-      :  "l"(desc_a),  // %54
-         "l"(desc_b),  // %55
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %56, %57, %58
-#else  // macro not defined: report misuse instead of emitting the instruction
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x216x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x216x32_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[54];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %59, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n216k32.f16.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53},"
-      "{%54,  %55,  %56,  %57},"
-      " %58,"
-      " p,    %60,  %61;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x216x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x216x32_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[108];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %110, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n216k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107},"
-      " %108,"
-      " %109,"
-      " p,    %111, %112;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x216x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x216x32_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[108];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %113, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n216k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107},"
-      "{%108, %109, %110, %111},"
-      " %112,"
-      " p,    %114, %115;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x224x32_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %58, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n224k32.f16.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      " %56,"
-      " %57,"
-      " p,    %59,  %60;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x224x32_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %61, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n224k32.f16.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"
-      " %60,"
-      " p,    %62,  %63;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x224x32_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %114, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n224k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      " %112,"
-      " %113,"
-      " p,    %115, %116;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x224x32_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %117, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n224k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      "{%112, %113, %114, %115},"
-      " %116,"
-      " p,    %118, %119;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x232x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x232x32_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[58];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %60, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n232k32.f16.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57},"
-      " %58,"
-      " %59,"
-      " p,    %61,  %62;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x232x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x232x32_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[58];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %63, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n232k32.f16.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57},"
-      "{%58,  %59,  %60,  %61},"
-      " %62,"
-      " p,    %64,  %65;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x232x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x232x32_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[116];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %118, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n232k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115},"
-      " %116,"
-      " %117,"
-      " p,    %119, %120;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x232x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x232x32_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[116];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %121, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n232k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115},"
-      "{%116, %117, %118, %119},"
-      " %120,"
-      " p,    %122, %123;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x240x32_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %62, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n240k32.f16.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      " %60,"
-      " %61,"
-      " p,    %63,  %64;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x240x32_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %65, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n240k32.f16.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      "{%60,  %61,  %62,  %63},"
-      " %64,"
-      " p,    %66,  %67;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x240x32_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %122, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n240k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      " %120,"
-      " %121,"
-      " p,    %123, %124;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x240x32_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %125, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n240k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      "{%120, %121, %122, %123},"
-      " %124,"
-      " p,    %126, %127;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x248x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x248x32_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[62];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %64, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n248k32.f16.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61},"
-      " %62,"
-      " %63,"
-      " p,    %65,  %66;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x248x32 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x248x32_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[62];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %67, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n248k32.f16.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61},"
-      "{%62,  %63,  %64,  %65},"
-      " %66,"
-      " p,    %68,  %69;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x248x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x248x32_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[124];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %126, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n248k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123},"
-      " %124,"
-      " %125,"
-      " p,    %127, %128;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x248x32 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x248x32_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[124];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %129, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n248k32.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123},"
-      "{%124, %125, %126, %127},"
-      " %128,"
-      " p,    %130, %131;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x24x32_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[6];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %8, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n24k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5},"
-      " %6,"
-      " %7,"
-      " p,   %9,  %10;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x24x32_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[6];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %11, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n24k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5},"
-      "{%6,  %7,  %8,  %9},"
-      " %10,"
-      " p,   %12, %13;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x24x32_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %14, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n24k32.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      " %12,"
-      " %13,"
-      " p,   %15, %16;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x24x32_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %17, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n24k32.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " p,   %18, %19;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x40x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x40x32_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[10];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %12, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n40k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9},"
-      " %10,"
-      " %11,"
-      " p,   %13, %14;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x40x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x40x32_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[10];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %15, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n40k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9},"
-      "{%10, %11, %12, %13},"
-      " %14,"
-      " p,   %16, %17;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x40x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x40x32_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %22, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n40k32.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      " %20,"
-      " %21,"
-      " p,   %23, %24;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x40x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x40x32_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %25, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n40k32.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      "{%20, %21, %22, %23},"
-      " %24,"
-      " p,   %26, %27;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x48x32_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %14, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n48k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      " %12,"
-      " %13,"
-      " p,   %15, %16;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x48x32_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %17, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n48k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " p,   %18, %19;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x48x32_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %26, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n48k32.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      " %24,"
-      " %25,"
-      " p,   %27, %28;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x48x32_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %29, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n48k32.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " p,   %30, %31;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x56x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x56x32_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[14];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %16, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n56k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13},"
-      " %14,"
-      " %15,"
-      " p,   %17, %18;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x56x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x56x32_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[14];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %19, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n56k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13},"
-      "{%14, %15, %16, %17},"
-      " %18,"
-      " p,   %20, %21;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x56x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x56x32_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %30, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n56k32.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      " %28,"
-      " %29,"
-      " p,   %31, %32;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x56x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x56x32_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %33, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n56k32.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      "{%28, %29, %30, %31},"
-      " %32,"
-      " p,   %34, %35;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x72x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x72x32_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[18];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %20, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n72k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17},"
-      " %18,"
-      " %19,"
-      " p,   %21, %22;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x72x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x72x32_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[18];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %23, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n72k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17},"
-      "{%18, %19, %20, %21},"
-      " %22,"
-      " p,   %24, %25;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x72x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x72x32_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[36];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %38, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n72k32.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      " %36,"
-      " %37,"
-      " p,   %39, %40;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x72x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x72x32_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[36];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %41, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n72k32.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      "{%36, %37, %38, %39},"
-      " %40,"
-      " p,   %42, %43;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x80x32_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %22, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n80k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      " %20,"
-      " %21,"
-      " p,   %23, %24;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x80x32_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %25, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n80k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      "{%20, %21, %22, %23},"
-      " %24,"
-      " p,   %26, %27;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x80x32_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %42, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n80k32.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      " %40,"
-      " %41,"
-      " p,   %43, %44;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x80x32_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %45, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n80k32.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"
-      " %44,"
-      " p,   %46, %47;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x88x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x88x32_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[22];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %24, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n88k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21},"
-      " %22,"
-      " %23,"
-      " p,   %25, %26;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x88x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x88x32_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[22];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %27, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n88k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21},"
-      "{%22, %23, %24, %25},"
-      " %26,"
-      " p,   %28, %29;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x88x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x88x32_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %46, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n88k32.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      " %44,"
-      " %45,"
-      " p,   %47, %48;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x88x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x88x32_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %49, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n88k32.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      "{%44, %45, %46, %47},"
-      " %48,"
-      " p,   %50, %51;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x104x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x104x32_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[26];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %28, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n104k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25},"
-      " %26,"
-      " %27,"
-      " p,   %29, %30;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x104x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x104x32_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[26];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %31, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n104k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25},"
-      "{%26, %27, %28, %29},"
-      " %30,"
-      " p,   %32, %33;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x104x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x104x32_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[52];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %54, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n104k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      " %52,"
-      " %53,"
-      " p,    %55,  %56;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x104x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x104x32_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[52];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %57, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n104k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      "{%52,  %53,  %54,  %55},"
-      " %56,"
-      " p,    %58,  %59;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x112x32_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %30, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n112k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      " %28,"
-      " %29,"
-      " p,   %31, %32;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x112x32_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %33, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n112k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      "{%28, %29, %30, %31},"
-      " %32,"
-      " p,   %34, %35;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x112x32_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %58, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n112k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      " %56,"
-      " %57,"
-      " p,    %59,  %60;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x112x32_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %61, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n112k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"
-      " %60,"
-      " p,    %62,  %63;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x120x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x120x32_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[30];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %32, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n120k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29},"
-      " %30,"
-      " %31,"
-      " p,   %33, %34;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x120x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x120x32_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[30];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %35, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n120k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29},"
-      "{%30, %31, %32, %33},"
-      " %34,"
-      " p,   %36, %37;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x120x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x120x32_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %62, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n120k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      " %60,"
-      " %61,"
-      " p,    %63,  %64;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x120x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x120x32_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %65, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n120k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      "{%60,  %61,  %62,  %63},"
-      " %64,"
-      " p,    %66,  %67;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x136x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x136x32_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[34];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %36, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n136k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33},"
-      " %34,"
-      " %35,"
-      " p,   %37, %38;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x136x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x136x32_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[34];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %39, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n136k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33},"
-      "{%34, %35, %36, %37},"
-      " %38,"
-      " p,   %40, %41;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x136x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x136x32_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[68];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %70, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n136k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67},"
-      " %68,"
-      " %69,"
-      " p,    %71,  %72;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x136x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x136x32_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[68];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %73, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n136k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67},"
-      "{%68,  %69,  %70,  %71},"
-      " %72,"
-      " p,    %74,  %75;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x144x32_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[36];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %38, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n144k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      " %36,"
-      " %37,"
-      " p,   %39, %40;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x144x32_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[36];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %41, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n144k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      "{%36, %37, %38, %39},"
-      " %40,"
-      " p,   %42, %43;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x144x32_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %74, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n144k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      " %72,"
-      " %73,"
-      " p,    %75,  %76;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x144x32_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %77, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n144k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      "{%72,  %73,  %74,  %75},"
-      " %76,"
-      " p,    %78,  %79;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x152x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x152x32_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[38];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %40, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n152k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37},"
-      " %38,"
-      " %39,"
-      " p,   %41, %42;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x152x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x152x32_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[38];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %43, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n152k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37},"
-      "{%38, %39, %40, %41},"
-      " %42,"
-      " p,   %44, %45;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x152x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x152x32_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[76];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %78, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n152k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75},"
-      " %76,"
-      " %77,"
-      " p,    %79,  %80;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x152x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x152x32_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[76];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %81, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n152k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75},"
-      "{%76,  %77,  %78,  %79},"
-      " %80,"
-      " p,    %82,  %83;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x160x32_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %42, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n160k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      " %40,"
-      " %41,"
-      " p,   %43, %44;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x160x32_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %45, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n160k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"
-      " %44,"
-      " p,   %46, %47;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x160x32_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %82, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n160k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      " %80,"
-      " %81,"
-      " p,    %83,  %84;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x160x32_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %85, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n160k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      "{%80,  %81,  %82,  %83},"
-      " %84,"
-      " p,    %86,  %87;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x168x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x168x32_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[42];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %44, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n168k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41},"
-      " %42,"
-      " %43,"
-      " p,   %45, %46;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x168x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x168x32_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[42];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %47, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n168k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41},"
-      "{%42, %43, %44, %45},"
-      " %46,"
-      " p,   %48, %49;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x168x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x168x32_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[84];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %86, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n168k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83},"
-      " %84,"
-      " %85,"
-      " p,    %87,  %88;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x168x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x168x32_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[84];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %89, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n168k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83},"
-      "{%84,  %85,  %86,  %87},"
-      " %88,"
-      " p,    %90,  %91;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x176x32_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %46, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n176k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      " %44,"
-      " %45,"
-      " p,   %47, %48;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x176x32_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %49, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n176k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      "{%44, %45, %46, %47},"
-      " %48,"
-      " p,   %50, %51;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x176x32_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %90, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n176k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      " %88,"
-      " %89,"
-      " p,    %91,  %92;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x176x32_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %93, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n176k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      "{%88,  %89,  %90,  %91},"
-      " %92,"
-      " p,    %94,  %95;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x184x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x184x32_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[46];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %48, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n184k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45},"
-      " %46,"
-      " %47,"
-      " p,   %49, %50;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x184x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x184x32_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[46];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %51, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n184k32.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45},"
-      "{%46, %47, %48, %49},"
-      " %50,"
-      " p,   %52, %53;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x184x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x184x32_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[92];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %94, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n184k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91},"
-      " %92,"
-      " %93,"
-      " p,    %95,  %96;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x184x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x184x32_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[92];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %97, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n184k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91},"
-      "{%92,  %93,  %94,  %95},"
-      " %96,"
-      " p,    %98,  %99;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x200x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x200x32_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[50];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %52, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n200k32.f16.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49},"
-      " %50,"
-      " %51,"
-      " p,    %53,  %54;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x200x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x200x32_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[50];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %55, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n200k32.f16.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49},"
-      "{%50,  %51,  %52,  %53},"
-      " %54,"
-      " p,    %56,  %57;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x200x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x200x32_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[100];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %102, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n200k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99},"
-      " %100,"
-      " %101,"
-      " p,    %103, %104;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x200x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x200x32_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[100];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %105, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n200k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99},"
-      "{%100, %101, %102, %103},"
-      " %104,"
-      " p,    %106, %107;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x208x32_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[52];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %54, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n208k32.f16.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      " %52,"
-      " %53,"
-      " p,    %55,  %56;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x208x32_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[52];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %57, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n208k32.f16.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      "{%52,  %53,  %54,  %55},"
-      " %56,"
-      " p,    %58,  %59;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x208x32_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %106, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n208k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      " %104,"
-      " %105,"
-      " p,    %107, %108;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x208x32_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %109, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n208k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      "{%104, %105, %106, %107},"
-      " %108,"
-      " p,    %110, %111;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x216x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x216x32_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[54];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %56, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n216k32.f16.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53},"
-      " %54,"
-      " %55,"
-      " p,    %57,  %58;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x216x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x216x32_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[54];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %59, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n216k32.f16.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53},"
-      "{%54,  %55,  %56,  %57},"
-      " %58,"
-      " p,    %60,  %61;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x216x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x216x32_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[108];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %110, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n216k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107},"
-      " %108,"
-      " %109,"
-      " p,    %111, %112;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x216x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x216x32_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[108];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %113, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n216k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107},"
-      "{%108, %109, %110, %111},"
-      " %112,"
-      " p,    %114, %115;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x224x32_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %58, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n224k32.f16.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      " %56,"
-      " %57,"
-      " p,    %59,  %60;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x224x32_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %61, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n224k32.f16.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"
-      " %60,"
-      " p,    %62,  %63;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x224x32_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %114, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n224k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      " %112,"
-      " %113,"
-      " p,    %115, %116;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x224x32_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %117, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n224k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      "{%112, %113, %114, %115},"
-      " %116,"
-      " p,    %118, %119;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x232x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x232x32_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[58];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %60, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n232k32.f16.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57},"
-      " %58,"
-      " %59,"
-      " p,    %61,  %62;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x232x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x232x32_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[58];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %63, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n232k32.f16.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57},"
-      "{%58,  %59,  %60,  %61},"
-      " %62,"
-      " p,    %64,  %65;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x232x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x232x32_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[116];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %118, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n232k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115},"
-      " %116,"
-      " %117,"
-      " p,    %119, %120;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x232x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x232x32_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[116];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %121, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n232k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115},"
-      "{%116, %117, %118, %119},"
-      " %120,"
-      " p,    %122, %123;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x240x32_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %62, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n240k32.f16.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      " %60,"
-      " %61,"
-      " p,    %63,  %64;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x240x32_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %65, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n240k32.f16.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      "{%60,  %61,  %62,  %63},"
-      " %64,"
-      " p,    %66,  %67;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x240x32_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %122, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n240k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      " %120,"
-      " %121,"
-      " p,    %123, %124;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x240x32_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %125, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n240k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      "{%120, %121, %122, %123},"
-      " %124,"
-      " p,    %126, %127;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x248x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x248x32_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[62];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %64, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n248k32.f16.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61},"
-      " %62,"
-      " %63,"
-      " p,    %65,  %66;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x248x32 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x248x32_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[62];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %67, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n248k32.f16.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61},"
-      "{%62,  %63,  %64,  %65},"
-      " %66,"
-      " p,    %68,  %69;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x248x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x248x32_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[124];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %126, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n248k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123},"
-      " %124,"
-      " %125,"
-      " p,    %127, %128;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x248x32 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x248x32_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[124];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %129, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n248k32.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123},"
-      "{%124, %125, %126, %127},"
-      " %128,"
-      " p,    %130, %131;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x24x32_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[6];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %8, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n24k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5},"
-      " %6,"
-      " %7,"
-      " p,   %9,  %10;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x24x32_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[6];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %11, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n24k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5},"
-      "{%6,  %7,  %8,  %9},"
-      " %10,"
-      " p,   %12, %13;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x24x32_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %14, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n24k32.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      " %12,"
-      " %13,"
-      " p,   %15, %16;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x24x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x24x32_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %17, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n24k32.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " p,   %18, %19;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x24x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x40x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x40x32_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[10];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %12, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n40k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9},"
-      " %10,"
-      " %11,"
-      " p,   %13, %14;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x40x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x40x32_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[10];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %15, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n40k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9},"
-      "{%10, %11, %12, %13},"
-      " %14,"
-      " p,   %16, %17;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x40x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x40x32_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %22, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n40k32.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      " %20,"
-      " %21,"
-      " p,   %23, %24;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x40x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x40x32_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %25, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n40k32.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      "{%20, %21, %22, %23},"
-      " %24,"
-      " p,   %26, %27;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x40x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x48x32_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %14, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n48k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      " %12,"
-      " %13,"
-      " p,   %15, %16;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x48x32_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %17, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n48k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " p,   %18, %19;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x48x32_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %26, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n48k32.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      " %24,"
-      " %25,"
-      " p,   %27, %28;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x48x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x48x32_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %29, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n48k32.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " p,   %30, %31;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x48x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x56x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x56x32_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[14];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %16, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n56k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13},"
-      " %14,"
-      " %15,"
-      " p,   %17, %18;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x56x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x56x32_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[14];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %19, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n56k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13},"
-      "{%14, %15, %16, %17},"
-      " %18,"
-      " p,   %20, %21;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x56x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x56x32_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %30, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n56k32.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      " %28,"
-      " %29,"
-      " p,   %31, %32;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x56x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x56x32_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %33, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n56k32.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      "{%28, %29, %30, %31},"
-      " %32,"
-      " p,   %34, %35;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x56x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x72x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x72x32_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[18];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %20, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n72k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17},"
-      " %18,"
-      " %19,"
-      " p,   %21, %22;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x72x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x72x32_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[18];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %23, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n72k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17},"
-      "{%18, %19, %20, %21},"
-      " %22,"
-      " p,   %24, %25;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x72x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x72x32_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[36];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %38, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n72k32.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      " %36,"
-      " %37,"
-      " p,   %39, %40;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x72x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x72x32_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[36];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %41, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n72k32.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      "{%36, %37, %38, %39},"
-      " %40,"
-      " p,   %42, %43;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x72x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x80x32_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %22, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n80k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      " %20,"
-      " %21,"
-      " p,   %23, %24;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x80x32_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %25, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n80k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      "{%20, %21, %22, %23},"
-      " %24,"
-      " p,   %26, %27;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x80x32_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %42, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n80k32.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      " %40,"
-      " %41,"
-      " p,   %43, %44;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x80x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x80x32_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %45, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n80k32.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"
-      " %44,"
-      " p,   %46, %47;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x80x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x88x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x88x32_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[22];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %24, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n88k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21},"
-      " %22,"
-      " %23,"
-      " p,   %25, %26;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x88x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x88x32_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[22];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %27, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n88k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21},"
-      "{%22, %23, %24, %25},"
-      " %26,"
-      " p,   %28, %29;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x88x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x88x32_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %46, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n88k32.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      " %44,"
-      " %45,"
-      " p,   %47, %48;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x88x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x88x32_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %49, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n88k32.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      "{%44, %45, %46, %47},"
-      " %48,"
-      " p,   %50, %51;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x88x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x104x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x104x32_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[26];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %28, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n104k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25},"
-      " %26,"
-      " %27,"
-      " p,   %29, %30;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x104x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x104x32_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[26];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %31, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n104k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25},"
-      "{%26, %27, %28, %29},"
-      " %30,"
-      " p,   %32, %33;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x104x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x104x32_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[52];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %54, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n104k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      " %52,"
-      " %53,"
-      " p,    %55,  %56;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x104x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x104x32_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[52];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %57, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n104k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      "{%52,  %53,  %54,  %55},"
-      " %56,"
-      " p,    %58,  %59;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x104x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x112x32_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %30, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n112k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      " %28,"
-      " %29,"
-      " p,   %31, %32;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x112x32_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %33, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n112k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      "{%28, %29, %30, %31},"
-      " %32,"
-      " p,   %34, %35;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x112x32_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %58, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n112k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      " %56,"
-      " %57,"
-      " p,    %59,  %60;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x112x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x112x32_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %61, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n112k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"
-      " %60,"
-      " p,    %62,  %63;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x112x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x120x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x120x32_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[30];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %32, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n120k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29},"
-      " %30,"
-      " %31,"
-      " p,   %33, %34;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x120x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x120x32_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[30];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %35, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n120k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29},"
-      "{%30, %31, %32, %33},"
-      " %34,"
-      " p,   %36, %37;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x120x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x120x32_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %62, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n120k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      " %60,"
-      " %61,"
-      " p,    %63,  %64;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x120x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x120x32_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %65, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n120k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      "{%60,  %61,  %62,  %63},"
-      " %64,"
-      " p,    %66,  %67;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x120x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x136x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x136x32_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[34];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %36, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n136k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33},"
-      " %34,"
-      " %35,"
-      " p,   %37, %38;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x136x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x136x32_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[34];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %39, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n136k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33},"
-      "{%34, %35, %36, %37},"
-      " %38,"
-      " p,   %40, %41;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x136x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x136x32_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[68];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %70, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n136k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67},"
-      " %68,"
-      " %69,"
-      " p,    %71,  %72;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x136x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x136x32_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[68];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %73, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n136k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67},"
-      "{%68,  %69,  %70,  %71},"
-      " %72,"
-      " p,    %74,  %75;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x136x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x144x32_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[36];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %38, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n144k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      " %36,"
-      " %37,"
-      " p,   %39, %40;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x144x32_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[36];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %41, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n144k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      "{%36, %37, %38, %39},"
-      " %40,"
-      " p,   %42, %43;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x144x32_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %74, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n144k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      " %72,"
-      " %73,"
-      " p,    %75,  %76;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x144x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x144x32_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %77, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n144k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      "{%72,  %73,  %74,  %75},"
-      " %76,"
-      " p,    %78,  %79;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x144x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x152x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x152x32_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[38];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %40, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n152k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37},"
-      " %38,"
-      " %39,"
-      " p,   %41, %42;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x152x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x152x32_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[38];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %43, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n152k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37},"
-      "{%38, %39, %40, %41},"
-      " %42,"
-      " p,   %44, %45;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x152x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x152x32_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[76];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %78, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n152k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75},"
-      " %76,"
-      " %77,"
-      " p,    %79,  %80;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x152x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x152x32_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[76];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %81, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n152k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75},"
-      "{%76,  %77,  %78,  %79},"
-      " %80,"
-      " p,    %82,  %83;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x152x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x160x32_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %42, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n160k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      " %40,"
-      " %41,"
-      " p,   %43, %44;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x160x32_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %45, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n160k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"
-      " %44,"
-      " p,   %46, %47;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x160x32_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %82, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n160k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      " %80,"
-      " %81,"
-      " p,    %83,  %84;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x160x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x160x32_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %85, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n160k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      "{%80,  %81,  %82,  %83},"
-      " %84,"
-      " p,    %86,  %87;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x160x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x168x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x168x32_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[42];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %44, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n168k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41},"
-      " %42,"
-      " %43,"
-      " p,   %45, %46;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x168x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x168x32_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[42];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %47, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n168k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41},"
-      "{%42, %43, %44, %45},"
-      " %46,"
-      " p,   %48, %49;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x168x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x168x32_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[84];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %86, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n168k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83},"
-      " %84,"
-      " %85,"
-      " p,    %87,  %88;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x168x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x168x32_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[84];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %89, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n168k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83},"
-      "{%84,  %85,  %86,  %87},"
-      " %88,"
-      " p,    %90,  %91;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x168x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x176x32_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %46, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n176k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      " %44,"
-      " %45,"
-      " p,   %47, %48;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x176x32_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %49, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n176k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      "{%44, %45, %46, %47},"
-      " %48,"
-      " p,   %50, %51;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x176x32_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %90, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n176k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      " %88,"
-      " %89,"
-      " p,    %91,  %92;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x176x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x176x32_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %93, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n176k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      "{%88,  %89,  %90,  %91},"
-      " %92,"
-      " p,    %94,  %95;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x176x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x184x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x184x32_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[46];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %48, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n184k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45},"
-      " %46,"
-      " %47,"
-      " p,   %49, %50;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x184x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x184x32_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[46];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %51, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n184k32.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45},"
-      "{%46, %47, %48, %49},"
-      " %50,"
-      " p,   %52, %53;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x184x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x184x32_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[92];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %94, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n184k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91},"
-      " %92,"
-      " %93,"
-      " p,    %95,  %96;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x184x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x184x32_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[92];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %97, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n184k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91},"
-      "{%92,  %93,  %94,  %95},"
-      " %96,"
-      " p,    %98,  %99;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x184x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x200x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x200x32_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[50];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %52, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n200k32.f16.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49},"
-      " %50,"
-      " %51,"
-      " p,    %53,  %54;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x200x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x200x32_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[50];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %55, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n200k32.f16.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49},"
-      "{%50,  %51,  %52,  %53},"
-      " %54,"
-      " p,    %56,  %57;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x200x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x200x32_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[100];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %102, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n200k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99},"
-      " %100,"
-      " %101,"
-      " p,    %103, %104;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x200x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x200x32_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[100];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %105, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n200k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99},"
-      "{%100, %101, %102, %103},"
-      " %104,"
-      " p,    %106, %107;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x200x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x208x32_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[52];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %54, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n208k32.f16.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      " %52,"
-      " %53,"
-      " p,    %55,  %56;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x208x32_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[52];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %57, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n208k32.f16.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      "{%52,  %53,  %54,  %55},"
-      " %56,"
-      " p,    %58,  %59;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x208x32_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %106, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n208k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      " %104,"
-      " %105,"
-      " p,    %107, %108;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x208x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x208x32_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %109, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n208k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      "{%104, %105, %106, %107},"
-      " %108,"
-      " p,    %110, %111;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x208x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x216x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x216x32_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[54];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %56, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n216k32.f16.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53},"
-      " %54,"
-      " %55,"
-      " p,    %57,  %58;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x216x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x216x32_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[54];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %59, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n216k32.f16.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53},"
-      "{%54,  %55,  %56,  %57},"
-      " %58,"
-      " p,    %60,  %61;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x216x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x216x32_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[108];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %110, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n216k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107},"
-      " %108,"
-      " %109,"
-      " p,    %111, %112;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x216x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x216x32_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[108];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %113, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n216k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107},"
-      "{%108, %109, %110, %111},"
-      " %112,"
-      " p,    %114, %115;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x216x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x224x32_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %58, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n224k32.f16.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      " %56,"
-      " %57,"
-      " p,    %59,  %60;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x224x32_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %61, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n224k32.f16.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"
-      " %60,"
-      " p,    %62,  %63;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x224x32_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %114, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n224k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      " %112,"
-      " %113,"
-      " p,    %115, %116;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x224x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x224x32_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %117, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n224k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      "{%112, %113, %114, %115},"
-      " %116,"
-      " p,    %118, %119;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x224x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x232x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x232x32_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[58];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %60, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n232k32.f16.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57},"
-      " %58,"
-      " %59,"
-      " p,    %61,  %62;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x232x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x232x32_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[58];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %63, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n232k32.f16.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57},"
-      "{%58,  %59,  %60,  %61},"
-      " %62,"
-      " p,    %64,  %65;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x232x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x232x32_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[116];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %118, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n232k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115},"
-      " %116,"
-      " %117,"
-      " p,    %119, %120;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x232x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x232x32_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[116];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %121, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n232k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115},"
-      "{%116, %117, %118, %119},"
-      " %120,"
-      " p,    %122, %123;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x232x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x240x32_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %62, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n240k32.f16.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      " %60,"
-      " %61,"
-      " p,    %63,  %64;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x240x32_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %65, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n240k32.f16.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      "{%60,  %61,  %62,  %63},"
-      " %64,"
-      " p,    %66,  %67;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x240x32_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %122, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n240k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      " %120,"
-      " %121,"
-      " p,    %123, %124;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x240x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x240x32_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %125, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n240k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      "{%120, %121, %122, %123},"
-      " %124,"
-      " p,    %126, %127;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x240x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x248x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x248x32_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[62];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %64, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n248k32.f16.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61},"
-      " %62,"
-      " %63,"
-      " p,    %65,  %66;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x248x32 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x248x32_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[62];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %67, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n248k32.f16.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61},"
-      "{%62,  %63,  %64,  %65},"
-      " %66,"
-      " p,    %68,  %69;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x248x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x248x32_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[124];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %126, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n248k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123},"
-      " %124,"
-      " %125,"
-      " p,    %127, %128;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA 64x248x32 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-struct MMA_64x248x32_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[124];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %129, 0;\n"
-      "wgmma.mma_async.sync.aligned.m64n248k32.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123},"
-      "{%124, %125, %126, %127},"
-      " %128,"
-      " p,    %130, %131;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use MMA_64x248x32_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace SM90::GMMA
-
-} // namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/arch/mma_sm90_gmma_sparse.hpp b/lightllm-kernel/cutlass/include/cute/arch/mma_sm90_gmma_sparse.hpp
deleted file mode 100755
index ecca91b93..000000000
--- a/lightllm-kernel/cutlass/include/cute/arch/mma_sm90_gmma_sparse.hpp
+++ /dev/null
@@ -1,22743 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#pragma once
-
-#include <cute/config.hpp>                 // CUTE_HOST_DEVICE
-#include <cute/arch/mma_sm90_gmma.hpp>     // GMMA::Major, etc.
-
-namespace cute {
-
-namespace SM90::GMMA::SPARSE {
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// GMMA PTX definitions:  C = (scaleA * A) * (scaleB * B) + (scaleD * C)
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x32_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %6, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k32.f16.f16.f16 "
-      "{%0, %1},"
-      " %2,"
-      " %3,"
-      " %4, %5,"
-      " p,  %7, %8, %9, %10;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x32_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[2];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %9, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k32.f16.f16.f16 "
-      "{%0,  %1},"
-      "{%2,  %3,  %4,  %5},"
-      " %6,"
-      " %7, %8,"
-      " p,   %10, %11, %12;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x32_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %8, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " %6, %7,"
-      " p,   %9,  %10, %11, %12;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x32_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %11, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " %9, %10,"
-      " p,   %12, %13, %14;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x32_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %12, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " %10, %11,"
-      " p,   %13, %14, %15, %16;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x32_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %15, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " %13, %14,"
-      " p,   %16, %17, %18;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x32_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %20, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " %18, %19,"
-      " p,   %21, %22, %23, %24;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x32_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %23, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " %21, %22,"
-      " p,   %24, %25, %26;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x32_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %28, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      " %24,"
-      " %25,"
-      " %26, %27,"
-      " p,   %29, %30, %31, %32;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x32_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %31, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " %29, %30,"
-      " p,   %32, %33, %34;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x32_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %36, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " %34, %35,"
-      " p,   %37, %38, %39, %40;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x32_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %39, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " %37, %38,"
-      " p,   %40, %41, %42;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x32_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %52, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " %50, %51,"
-      " p,   %53, %54, %55, %56;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x32_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %55, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k32.f16.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " %53, %54,"
-      " p,    %56,  %57,  %58;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x32_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %68, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k32.f16.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " %66, %67,"
-      " p,    %69,  %70,  %71,  %72;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x32_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %71, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k32.f16.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " %69, %70,"
-      " p,    %72,  %73,  %74;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x32_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %8, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k32.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " %6, %7,"
-      " p,   %9,  %10, %11, %12;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x32_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[4];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %11, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k32.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " %9, %10,"
-      " p,   %12, %13, %14;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x32_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      float         & d4, float         & d5, float         & d6, float         & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %12, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k32.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " %10, %11,"
-      " p,   %13, %14, %15, %16;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
-        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x32_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[8];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      float         & d4, float         & d5, float         & d6, float         & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %15, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k32.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " %13, %14,"
-      " p,   %16, %17, %18;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
-        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x32_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %20, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k32.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " %18, %19,"
-      " p,   %21, %22, %23, %24;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x32_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[16];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %23, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k32.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " %21, %22,"
-      " p,   %24, %25, %26;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x32_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %36, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k32.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " %34, %35,"
-      " p,   %37, %38, %39, %40;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x32_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[32];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %39, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k32.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " %37, %38,"
-      " p,   %40, %41, %42;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x32_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %52, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k32.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " %50, %51,"
-      " p,   %53, %54, %55, %56;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x32_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[48];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %55, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " %53, %54,"
-      " p,    %56,  %57,  %58;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x32_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %68, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " %66, %67,"
-      " p,    %69,  %70,  %71,  %72;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x32_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[64];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %71, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " %69, %70,"
-      " p,    %72,  %73,  %74;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x32_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      float         & d92, float         & d93, float         & d94, float         & d95,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %100, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      " %96,"
-      " %97,"
-      " %98, %99,"
-      " p,    %101, %102, %103, %104;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
-        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x32_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[96];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      float         & d92, float         & d93, float         & d94, float         & d95,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %103, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      "{%96,  %97,  %98,  %99},"
-      " %100,"
-      " %101, %102,"
-      " p,    %104, %105, %106;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
-        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x32_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      float         & d124, float         & d125, float         & d126, float         & d127,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %132, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      " %128,"
-      " %129,"
-      " %130, %131,"
-      " p,    %133, %134, %135, %136;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
-        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x32_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[128];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      float         & d124, float         & d125, float         & d126, float         & d127,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %135, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      "{%128, %129, %130, %131},"
-      " %132,"
-      " %133, %134,"
-      " p,    %136, %137, %138;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
-        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x32_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %8, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k32.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " %6, %7,"
-      " p,   %9,  %10, %11, %12;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x32_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[4];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %11, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k32.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " %9, %10,"
-      " p,   %12, %13, %14;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x32_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      float         & d4, float         & d5, float         & d6, float         & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %12, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k32.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " %10, %11,"
-      " p,   %13, %14, %15, %16;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
-        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x32_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[8];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      float         & d4, float         & d5, float         & d6, float         & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %15, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k32.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " %13, %14,"
-      " p,   %16, %17, %18;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
-        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x32_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %20, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k32.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " %18, %19,"
-      " p,   %21, %22, %23, %24;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x32_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[16];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %23, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k32.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " %21, %22,"
-      " p,   %24, %25, %26;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x32_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %36, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k32.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " %34, %35,"
-      " p,   %37, %38, %39, %40;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x32_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[32];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %39, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k32.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " %37, %38,"
-      " p,   %40, %41, %42;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x32_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %52, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k32.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " %50, %51,"
-      " p,   %53, %54, %55, %56;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x32_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[48];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %55, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " %53, %54,"
-      " p,    %56,  %57,  %58;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x32_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %68, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " %66, %67,"
-      " p,    %69,  %70,  %71,  %72;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x32_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[64];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %71, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " %69, %70,"
-      " p,    %72,  %73,  %74;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x32_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      float         & d92, float         & d93, float         & d94, float         & d95,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %100, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      " %96,"
-      " %97,"
-      " %98, %99,"
-      " p,    %101, %102, %103, %104;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
-        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x32_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[96];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      float         & d92, float         & d93, float         & d94, float         & d95,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %103, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      "{%96,  %97,  %98,  %99},"
-      " %100,"
-      " %101, %102,"
-      " p,    %104, %105, %106;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
-        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x32_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      float         & d124, float         & d125, float         & d126, float         & d127,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %132, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      " %128,"
-      " %129,"
-      " %130, %131,"
-      " p,    %133, %134, %135, %136;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
-        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x32_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[128];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      float         & d124, float         & d125, float         & d126, float         & d127,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %135, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      "{%128, %129, %130, %131},"
-      " %132,"
-      " %133, %134,"
-      " p,    %136, %137, %138;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
-        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x16_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %8, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k16.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " %6, %7,"
-      " p,   %9,  %10;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x16_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %11, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k16.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " %9, %10,"
-      " p,   %12, %13;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x16_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      float         & d4, float         & d5, float         & d6, float         & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %12, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k16.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " %10, %11,"
-      " p,   %13, %14;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
-        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x16_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      float         & d4, float         & d5, float         & d6, float         & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %15, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k16.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " %13, %14,"
-      " p,   %16, %17;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
-        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x16_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %20, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k16.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " %18, %19,"
-      " p,   %21, %22;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x16_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %23, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k16.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " %21, %22,"
-      " p,   %24, %25;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x16_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %36, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k16.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " %34, %35,"
-      " p,   %37, %38;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x16_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %39, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k16.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " %37, %38,"
-      " p,   %40, %41;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x16_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %52, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k16.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " %50, %51,"
-      " p,   %53, %54;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x16_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %55, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " %53, %54,"
-      " p,    %56,  %57;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x16_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %68, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " %66, %67,"
-      " p,    %69,  %70;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x16_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %71, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " %69, %70,"
-      " p,    %72,  %73;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x16_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      float         & d92, float         & d93, float         & d94, float         & d95,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %100, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      " %96,"
-      " %97,"
-      " %98, %99,"
-      " p,    %101, %102;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
-        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x16_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      float         & d92, float         & d93, float         & d94, float         & d95,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %103, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      "{%96,  %97,  %98,  %99},"
-      " %100,"
-      " %101, %102,"
-      " p,    %104, %105;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
-        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x16_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      float         & d124, float         & d125, float         & d126, float         & d127,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %132, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      " %128,"
-      " %129,"
-      " %130, %131,"
-      " p,    %133, %134;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
-        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x16_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      float         & d124, float         & d125, float         & d126, float         & d127,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %135, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      "{%128, %129, %130, %131},"
-      " %132,"
-      " %133, %134,"
-      " p,    %136, %137;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
-        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x64_S32S8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %8, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.s8.s8 "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " %6, %7,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x64_S32S8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %8, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.s8.s8.satfinite "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " %6, %7,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x64_S32S8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %12, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.s8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " %10, %11,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x64_S32S8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %12, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.s8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " %10, %11,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x64_S32S8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %20, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.s8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " %18, %19,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x64_S32S8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %20, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.s8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " %18, %19,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x64_S32S8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %36, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.s8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " %34, %35,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x64_S32S8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %36, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.s8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " %34, %35,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x64_S32S8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %52, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.s8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " %50, %51,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x64_S32S8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %52, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.s8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " %50, %51,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x64_S32S8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %68, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " %66, %67,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x64_S32S8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %68, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " %66, %67,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x64_S32S8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
-      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %100, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      " %96,"
-      " %97,"
-      " %98, %99,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
-        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
-        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x64_S32S8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
-      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %100, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      " %96,"
-      " %97,"
-      " %98, %99,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
-        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
-        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x64_S32S8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
-      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %132, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      " %128,"
-      " %129,"
-      " %130, %131,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
-        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
-        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x64_S32S8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
-      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %132, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      " %128,"
-      " %129,"
-      " %130, %131,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
-        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
-        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x64_S32S8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %11, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.s8.s8 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " %9, %10,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x64_S32S8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %11, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.s8.s8.satfinite "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " %9, %10,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x64_S32S8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %15, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.s8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " %13, %14,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x64_S32S8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %15, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.s8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " %13, %14,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x64_S32S8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %23, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.s8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " %21, %22,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x64_S32S8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %23, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.s8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " %21, %22,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x64_S32S8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %39, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.s8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " %37, %38,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x64_S32S8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %39, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.s8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " %37, %38,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x64_S32S8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %55, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " %53, %54,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x64_S32S8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %55, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " %53, %54,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x64_S32S8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %71, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " %69, %70,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x64_S32S8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %71, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " %69, %70,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x64_S32S8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
-      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %103, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      "{%96,  %97,  %98,  %99},"
-      " %100,"
-      " %101, %102,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
-        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
-        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x64_S32S8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
-      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %103, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      "{%96,  %97,  %98,  %99},"
-      " %100,"
-      " %101, %102,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
-        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
-        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x64_S32S8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
-      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %135, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      "{%128, %129, %130, %131},"
-      " %132,"
-      " %133, %134,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
-        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
-        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x64_S32S8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
-      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %135, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      "{%128, %129, %130, %131},"
-      " %132,"
-      " %133, %134,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
-        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
-        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x64_S32S8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %8, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.s8.u8 "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " %6, %7,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x64_S32S8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %8, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.s8.u8.satfinite "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " %6, %7,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x64_S32S8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %12, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.s8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " %10, %11,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x64_S32S8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %12, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.s8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " %10, %11,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x64_S32S8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %20, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.s8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " %18, %19,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x64_S32S8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %20, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.s8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " %18, %19,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x64_S32S8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %36, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.s8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " %34, %35,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x64_S32S8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %36, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.s8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " %34, %35,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x64_S32S8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %52, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.s8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " %50, %51,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x64_S32S8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %52, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.s8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " %50, %51,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x64_S32S8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %68, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " %66, %67,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x64_S32S8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %68, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " %66, %67,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x64_S32S8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
-      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %100, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      " %96,"
-      " %97,"
-      " %98, %99,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
-        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
-        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x64_S32S8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
-      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %100, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      " %96,"
-      " %97,"
-      " %98, %99,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
-        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
-        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x64_S32S8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
-      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %132, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      " %128,"
-      " %129,"
-      " %130, %131,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
-        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
-        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x64_S32S8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
-      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %132, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      " %128,"
-      " %129,"
-      " %130, %131,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
-        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
-        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x64_S32S8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %11, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.s8.u8 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " %9, %10,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x64_S32S8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %11, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.s8.u8.satfinite "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " %9, %10,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x64_S32S8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %15, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.s8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " %13, %14,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x64_S32S8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %15, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.s8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " %13, %14,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x64_S32S8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %23, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.s8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " %21, %22,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x64_S32S8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %23, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.s8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " %21, %22,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x64_S32S8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %39, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.s8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " %37, %38,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x64_S32S8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %39, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.s8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " %37, %38,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x64_S32S8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %55, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " %53, %54,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x64_S32S8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %55, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " %53, %54,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x64_S32S8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %71, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " %69, %70,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x64_S32S8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %71, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " %69, %70,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x64_S32S8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
-      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %103, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      "{%96,  %97,  %98,  %99},"
-      " %100,"
-      " %101, %102,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
-        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
-        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x64_S32S8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
-      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %103, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      "{%96,  %97,  %98,  %99},"
-      " %100,"
-      " %101, %102,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
-        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
-        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x64_S32S8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
-      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %135, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      "{%128, %129, %130, %131},"
-      " %132,"
-      " %133, %134,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
-        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
-        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x64_S32S8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
-      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %135, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      "{%128, %129, %130, %131},"
-      " %132,"
-      " %133, %134,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
-        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
-        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x64_S32U8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %8, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.u8.s8 "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " %6, %7,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x64_S32U8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %8, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.u8.s8.satfinite "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " %6, %7,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x64_S32U8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %12, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.u8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " %10, %11,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x64_S32U8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %12, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.u8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " %10, %11,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x64_S32U8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %20, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.u8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " %18, %19,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x64_S32U8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %20, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.u8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " %18, %19,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x64_S32U8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %36, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.u8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " %34, %35,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x64_S32U8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %36, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.u8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " %34, %35,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x64_S32U8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %52, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.u8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " %50, %51,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x64_S32U8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %52, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.u8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " %50, %51,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x64_S32U8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %68, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " %66, %67,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x64_S32U8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %68, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " %66, %67,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x64_S32U8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
-      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %100, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      " %96,"
-      " %97,"
-      " %98, %99,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
-        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
-        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x64_S32U8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
-      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %100, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      " %96,"
-      " %97,"
-      " %98, %99,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
-        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
-        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x64_S32U8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
-      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %132, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      " %128,"
-      " %129,"
-      " %130, %131,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
-        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
-        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x64_S32U8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
-      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %132, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      " %128,"
-      " %129,"
-      " %130, %131,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
-        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
-        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x64_S32U8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %11, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.u8.s8 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " %9, %10,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x64_S32U8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %11, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.u8.s8.satfinite "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " %9, %10,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x64_S32U8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %15, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.u8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " %13, %14,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x64_S32U8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %15, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.u8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " %13, %14,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x64_S32U8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %23, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.u8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " %21, %22,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x64_S32U8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %23, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.u8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " %21, %22,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x64_S32U8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %39, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.u8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " %37, %38,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x64_S32U8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %39, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.u8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " %37, %38,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x64_S32U8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %55, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " %53, %54,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x64_S32U8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %55, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " %53, %54,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x64_S32U8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %71, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " %69, %70,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x64_S32U8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %71, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " %69, %70,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x64_S32U8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
-      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %103, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      "{%96,  %97,  %98,  %99},"
-      " %100,"
-      " %101, %102,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
-        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
-        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x64_S32U8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
-      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %103, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      "{%96,  %97,  %98,  %99},"
-      " %100,"
-      " %101, %102,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
-        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
-        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x64_S32U8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
-      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %135, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      "{%128, %129, %130, %131},"
-      " %132,"
-      " %133, %134,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
-        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
-        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x64_S32U8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
-      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %135, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      "{%128, %129, %130, %131},"
-      " %132,"
-      " %133, %134,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
-        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
-        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x64_S32U8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %8, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.u8.u8 "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " %6, %7,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x64_S32U8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %8, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.u8.u8.satfinite "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " %6, %7,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x64_S32U8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %12, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.u8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " %10, %11,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x64_S32U8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %12, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.u8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " %10, %11,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x64_S32U8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %20, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.u8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " %18, %19,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x64_S32U8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %20, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.u8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " %18, %19,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x64_S32U8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %36, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.u8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " %34, %35,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x64_S32U8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %36, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.u8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " %34, %35,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x64_S32U8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %52, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.u8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " %50, %51,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x64_S32U8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %52, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.u8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " %50, %51,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x64_S32U8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %68, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " %66, %67,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x64_S32U8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %68, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " %66, %67,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x64_S32U8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
-      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %100, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      " %96,"
-      " %97,"
-      " %98, %99,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
-        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
-        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x64_S32U8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
-      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %100, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      " %96,"
-      " %97,"
-      " %98, %99,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
-        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
-        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x64_S32U8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
-      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %132, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      " %128,"
-      " %129,"
-      " %130, %131,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
-        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
-        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x64_S32U8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
-      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %132, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      " %128,"
-      " %129,"
-      " %130, %131,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
-        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
-        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x64_S32U8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %11, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.u8.u8 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " %9, %10,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x64_S32U8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %11, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k64.s32.u8.u8.satfinite "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " %9, %10,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x64_S32U8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %15, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.u8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " %13, %14,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x64_S32U8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %15, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k64.s32.u8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " %13, %14,"
-      " p;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x64_S32U8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %23, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.u8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " %21, %22,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x64_S32U8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %23, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k64.s32.u8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " %21, %22,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x64_S32U8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %39, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.u8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " %37, %38,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x64_S32U8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %39, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k64.s32.u8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " %37, %38,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x64_S32U8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %55, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " %53, %54,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x64_S32U8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %55, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k64.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " %53, %54,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x64_S32U8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %71, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " %69, %70,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x64_S32U8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %71, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k64.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " %69, %70,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x64_S32U8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
-      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %103, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      "{%96,  %97,  %98,  %99},"
-      " %100,"
-      " %101, %102,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
-        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
-        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x64_S32U8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t      & d88, uint32_t      & d89, uint32_t      & d90, uint32_t      & d91,
-      uint32_t      & d92, uint32_t      & d93, uint32_t      & d94, uint32_t      & d95,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %103, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k64.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      "{%96,  %97,  %98,  %99},"
-      " %100,"
-      " %101, %102,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87),
-        "+r"(d88), "+r"(d89), "+r"(d90), "+r"(d91),
-        "+r"(d92), "+r"(d93), "+r"(d94), "+r"(d95)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x64_S32U8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
-      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %135, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      "{%128, %129, %130, %131},"
-      " %132,"
-      " %133, %134,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
-        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
-        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x64_S32U8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t      & d120, uint32_t      & d121, uint32_t      & d122, uint32_t      & d123,
-      uint32_t      & d124, uint32_t      & d125, uint32_t      & d126, uint32_t      & d127,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %135, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k64.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      "{%128, %129, %130, %131},"
-      " %132,"
-      " %133, %134,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119),
-        "+r"(d120), "+r"(d121), "+r"(d122), "+r"(d123),
-        "+r"(d124), "+r"(d125), "+r"(d126), "+r"(d127)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x64_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %6, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k64.f16.e4m3.e4m3 "
-      "{%0, %1},"
-      " %2,"
-      " %3,"
-      " %4, %5,"
-      " p,  %7, %8;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x64_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %9, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k64.f16.e4m3.e4m3 "
-      "{%0,  %1},"
-      "{%2,  %3,  %4,  %5},"
-      " %6,"
-      " %7, %8,"
-      " p,   %10, %11;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x64_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %8, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k64.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " %6, %7,"
-      " p,   %9,  %10;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x64_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %11, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k64.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " %9, %10,"
-      " p,   %12, %13;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x64_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %8, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " %6, %7,"
-      " p,   %9,  %10;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x64_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %11, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " %9, %10,"
-      " p,   %12, %13;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x64_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      float         & d4, float         & d5, float         & d6, float         & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %12, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k64.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " %10, %11,"
-      " p,   %13, %14;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
-        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x64_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      float         & d4, float         & d5, float         & d6, float         & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %15, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k64.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " %13, %14,"
-      " p,   %16, %17;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
-        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x64_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %12, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " %10, %11,"
-      " p,   %13, %14;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x64_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %15, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " %13, %14,"
-      " p,   %16, %17;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x64_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %20, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k64.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " %18, %19,"
-      " p,   %21, %22;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x64_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %23, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k64.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " %21, %22,"
-      " p,   %24, %25;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x64_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %20, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " %18, %19,"
-      " p,   %21, %22;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x64_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %23, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " %21, %22,"
-      " p,   %24, %25;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x64_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %36, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k64.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " %34, %35,"
-      " p,   %37, %38;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x64_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %39, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k64.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " %37, %38,"
-      " p,   %40, %41;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x64_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %28, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      " %24,"
-      " %25,"
-      " %26, %27,"
-      " p,   %29, %30;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x64_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %31, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " %29, %30,"
-      " p,   %32, %33;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x64_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %52, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k64.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " %50, %51,"
-      " p,   %53, %54;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x64_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %55, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " %53, %54,"
-      " p,    %56,  %57;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x64_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %36, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " %34, %35,"
-      " p,   %37, %38;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x64_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %39, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " %37, %38,"
-      " p,   %40, %41;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x64_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %68, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " %66, %67,"
-      " p,    %69,  %70;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x64_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %71, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " %69, %70,"
-      " p,    %72,  %73;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x64_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %52, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " %50, %51,"
-      " p,   %53, %54;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x64_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %55, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k64.f16.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " %53, %54,"
-      " p,    %56,  %57;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x64_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      float         & d92, float         & d93, float         & d94, float         & d95,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %100, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      " %96,"
-      " %97,"
-      " %98, %99,"
-      " p,    %101, %102;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
-        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x64_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      float         & d92, float         & d93, float         & d94, float         & d95,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %103, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      "{%96,  %97,  %98,  %99},"
-      " %100,"
-      " %101, %102,"
-      " p,    %104, %105;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
-        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x64_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %68, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k64.f16.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " %66, %67,"
-      " p,    %69,  %70;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x64_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %71, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k64.f16.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " %69, %70,"
-      " p,    %72,  %73;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x64_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      float         & d124, float         & d125, float         & d126, float         & d127,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %132, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      " %128,"
-      " %129,"
-      " %130, %131,"
-      " p,    %133, %134;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
-        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x64_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      float         & d124, float         & d125, float         & d126, float         & d127,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %135, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      "{%128, %129, %130, %131},"
-      " %132,"
-      " %133, %134,"
-      " p,    %136, %137;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
-        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x64_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %6, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k64.f16.e4m3.e5m2 "
-      "{%0, %1},"
-      " %2,"
-      " %3,"
-      " %4, %5,"
-      " p,  %7, %8;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x64_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %9, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k64.f16.e4m3.e5m2 "
-      "{%0,  %1},"
-      "{%2,  %3,  %4,  %5},"
-      " %6,"
-      " %7, %8,"
-      " p,   %10, %11;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x64_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %8, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k64.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " %6, %7,"
-      " p,   %9,  %10;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x64_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %11, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k64.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " %9, %10,"
-      " p,   %12, %13;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x64_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %8, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " %6, %7,"
-      " p,   %9,  %10;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x64_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %11, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " %9, %10,"
-      " p,   %12, %13;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x64_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      float         & d4, float         & d5, float         & d6, float         & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %12, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k64.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " %10, %11,"
-      " p,   %13, %14;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
-        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x64_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      float         & d4, float         & d5, float         & d6, float         & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %15, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k64.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " %13, %14,"
-      " p,   %16, %17;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
-        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x64_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %12, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " %10, %11,"
-      " p,   %13, %14;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x64_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %15, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " %13, %14,"
-      " p,   %16, %17;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x64_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %20, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k64.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " %18, %19,"
-      " p,   %21, %22;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x64_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %23, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k64.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " %21, %22,"
-      " p,   %24, %25;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x64_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %20, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " %18, %19,"
-      " p,   %21, %22;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x64_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %23, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " %21, %22,"
-      " p,   %24, %25;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x64_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %36, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k64.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " %34, %35,"
-      " p,   %37, %38;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x64_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %39, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k64.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " %37, %38,"
-      " p,   %40, %41;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x64_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %28, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      " %24,"
-      " %25,"
-      " %26, %27,"
-      " p,   %29, %30;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x64_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %31, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " %29, %30,"
-      " p,   %32, %33;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x64_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %52, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k64.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " %50, %51,"
-      " p,   %53, %54;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x64_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %55, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " %53, %54,"
-      " p,    %56,  %57;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x64_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %36, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " %34, %35,"
-      " p,   %37, %38;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x64_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %39, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " %37, %38,"
-      " p,   %40, %41;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x64_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %68, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " %66, %67,"
-      " p,    %69,  %70;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x64_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %71, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " %69, %70,"
-      " p,    %72,  %73;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x64_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %52, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " %50, %51,"
-      " p,   %53, %54;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x64_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %55, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k64.f16.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " %53, %54,"
-      " p,    %56,  %57;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x64_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      float         & d92, float         & d93, float         & d94, float         & d95,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %100, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      " %96,"
-      " %97,"
-      " %98, %99,"
-      " p,    %101, %102;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
-        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x64_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      float         & d92, float         & d93, float         & d94, float         & d95,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %103, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      "{%96,  %97,  %98,  %99},"
-      " %100,"
-      " %101, %102,"
-      " p,    %104, %105;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
-        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x64_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %68, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k64.f16.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " %66, %67,"
-      " p,    %69,  %70;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x64_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %71, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k64.f16.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " %69, %70,"
-      " p,    %72,  %73;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x64_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      float         & d124, float         & d125, float         & d126, float         & d127,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %132, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      " %128,"
-      " %129,"
-      " %130, %131,"
-      " p,    %133, %134;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
-        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x64_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      float         & d124, float         & d125, float         & d126, float         & d127,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %135, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      "{%128, %129, %130, %131},"
-      " %132,"
-      " %133, %134,"
-      " p,    %136, %137;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
-        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x64_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %6, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k64.f16.e5m2.e4m3 "
-      "{%0, %1},"
-      " %2,"
-      " %3,"
-      " %4, %5,"
-      " p,  %7, %8;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x64_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %9, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k64.f16.e5m2.e4m3 "
-      "{%0,  %1},"
-      "{%2,  %3,  %4,  %5},"
-      " %6,"
-      " %7, %8,"
-      " p,   %10, %11;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x64_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %8, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k64.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " %6, %7,"
-      " p,   %9,  %10;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x64_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %11, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k64.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " %9, %10,"
-      " p,   %12, %13;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x64_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %8, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " %6, %7,"
-      " p,   %9,  %10;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x64_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %11, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " %9, %10,"
-      " p,   %12, %13;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x64_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      float         & d4, float         & d5, float         & d6, float         & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %12, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k64.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " %10, %11,"
-      " p,   %13, %14;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
-        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x64_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      float         & d4, float         & d5, float         & d6, float         & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %15, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k64.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " %13, %14,"
-      " p,   %16, %17;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
-        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x64_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %12, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " %10, %11,"
-      " p,   %13, %14;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x64_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %15, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " %13, %14,"
-      " p,   %16, %17;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x64_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %20, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k64.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " %18, %19,"
-      " p,   %21, %22;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x64_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %23, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k64.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " %21, %22,"
-      " p,   %24, %25;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x64_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %20, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " %18, %19,"
-      " p,   %21, %22;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x64_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %23, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " %21, %22,"
-      " p,   %24, %25;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x64_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %36, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k64.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " %34, %35,"
-      " p,   %37, %38;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x64_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %39, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k64.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " %37, %38,"
-      " p,   %40, %41;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x64_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %28, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      " %24,"
-      " %25,"
-      " %26, %27,"
-      " p,   %29, %30;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x64_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %31, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " %29, %30,"
-      " p,   %32, %33;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x64_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %52, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k64.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " %50, %51,"
-      " p,   %53, %54;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x64_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %55, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " %53, %54,"
-      " p,    %56,  %57;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x64_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %36, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " %34, %35,"
-      " p,   %37, %38;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x64_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %39, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " %37, %38,"
-      " p,   %40, %41;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x64_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %68, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " %66, %67,"
-      " p,    %69,  %70;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x64_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %71, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " %69, %70,"
-      " p,    %72,  %73;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x64_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %52, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " %50, %51,"
-      " p,   %53, %54;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x64_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %55, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k64.f16.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " %53, %54,"
-      " p,    %56,  %57;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x64_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      float         & d92, float         & d93, float         & d94, float         & d95,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %100, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      " %96,"
-      " %97,"
-      " %98, %99,"
-      " p,    %101, %102;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
-        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x64_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      float         & d92, float         & d93, float         & d94, float         & d95,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %103, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      "{%96,  %97,  %98,  %99},"
-      " %100,"
-      " %101, %102,"
-      " p,    %104, %105;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
-        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x64_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %68, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k64.f16.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " %66, %67,"
-      " p,    %69,  %70;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x64_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %71, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k64.f16.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " %69, %70,"
-      " p,    %72,  %73;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x64_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      float         & d124, float         & d125, float         & d126, float         & d127,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %132, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      " %128,"
-      " %129,"
-      " %130, %131,"
-      " p,    %133, %134;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
-        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x64_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      float         & d124, float         & d125, float         & d126, float         & d127,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %135, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      "{%128, %129, %130, %131},"
-      " %132,"
-      " %133, %134,"
-      " p,    %136, %137;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
-        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x64_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %6, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k64.f16.e5m2.e5m2 "
-      "{%0, %1},"
-      " %2,"
-      " %3,"
-      " %4, %5,"
-      " p,  %7, %8;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x64_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[2];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %9, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k64.f16.e5m2.e5m2 "
-      "{%0,  %1},"
-      "{%2,  %3,  %4,  %5},"
-      " %6,"
-      " %7, %8,"
-      " p,   %10, %11;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x64_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %8, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k64.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " %6, %7,"
-      " p,   %9,  %10;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x8x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x8x64_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %11, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n8k64.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " %9, %10,"
-      " p,   %12, %13;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x8x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x64_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %8, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3},"
-      " %4,"
-      " %5,"
-      " %6, %7,"
-      " p,   %9,  %10;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x64_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[4];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %11, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3},"
-      "{%4,  %5,  %6,  %7},"
-      " %8,"
-      " %9, %10,"
-      " p,   %12, %13;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x64_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      float         & d4, float         & d5, float         & d6, float         & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %12, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k64.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " %10, %11,"
-      " p,   %13, %14;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
-        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x16x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x16x64_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      float         & d0, float         & d1, float         & d2, float         & d3,
-      float         & d4, float         & d5, float         & d6, float         & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %15, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n16k64.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " %13, %14,"
-      " p,   %16, %17;\n"
-    "}\n"
-      : "+f"(d0), "+f"(d1), "+f"(d2), "+f"(d3),
-        "+f"(d4), "+f"(d5), "+f"(d6), "+f"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x16x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x64_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %12, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      " %8,"
-      " %9,"
-      " %10, %11,"
-      " p,   %13, %14;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x64_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[8];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5, uint32_t      & d6, uint32_t      & d7,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %15, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7},"
-      "{%8,  %9,  %10, %11},"
-      " %12,"
-      " %13, %14,"
-      " p,   %16, %17;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5), "+r"(d6), "+r"(d7)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x64_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %20, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k64.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " %18, %19,"
-      " p,   %21, %22;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x32x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x32x64_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %23, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n32k64.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " %21, %22,"
-      " p,   %24, %25;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x32x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x64_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %20, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      " %16,"
-      " %17,"
-      " %18, %19,"
-      " p,   %21, %22;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x64_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[16];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %23, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15},"
-      "{%16, %17, %18, %19},"
-      " %20,"
-      " %21, %22,"
-      " p,   %24, %25;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x64_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %36, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k64.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " %34, %35,"
-      " p,   %37, %38;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x64x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x64x64_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %39, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n64k64.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " %37, %38,"
-      " p,   %40, %41;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x64x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x64_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %28, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      " %24,"
-      " %25,"
-      " %26, %27,"
-      " p,   %29, %30;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x64_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %31, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " %29, %30,"
-      " p,   %32, %33;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x64_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %52, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k64.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " %50, %51,"
-      " p,   %53, %54;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x96x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x96x64_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %55, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n96k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " %53, %54,"
-      " p,    %56,  %57;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x96x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x64_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %36, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      " %32,"
-      " %33,"
-      " %34, %35,"
-      " p,   %37, %38;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x64_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[32];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %39, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31},"
-      "{%32, %33, %34, %35},"
-      " %36,"
-      " %37, %38,"
-      " p,   %40, %41;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x64_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %68, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " %66, %67,"
-      " p,    %69,  %70;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x128x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x128x64_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %71, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n128k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " %69, %70,"
-      " p,    %72,  %73;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x128x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x64_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %52, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45, %46, %47},"
-      " %48,"
-      " %49,"
-      " %50, %51,"
-      " p,   %53, %54;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x64_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[48];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %55, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k64.f16.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47},"
-      "{%48,  %49,  %50,  %51},"
-      " %52,"
-      " %53, %54,"
-      " p,    %56,  %57;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x64_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      float         & d92, float         & d93, float         & d94, float         & d95,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %100, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      " %96,"
-      " %97,"
-      " %98, %99,"
-      " p,    %101, %102;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
-        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x192x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x192x64_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[96];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      float         & d92, float         & d93, float         & d94, float         & d95,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %103, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n192k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95},"
-      "{%96,  %97,  %98,  %99},"
-      " %100,"
-      " %101, %102,"
-      " p,    %104, %105;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91),
-        "+f"(d92), "+f"(d93), "+f"(d94), "+f"(d95)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x192x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x64_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %68, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k64.f16.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      " %64,"
-      " %65,"
-      " %66, %67,"
-      " p,    %69,  %70;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x64_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[64];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %71, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k64.f16.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63},"
-      "{%64,  %65,  %66,  %67},"
-      " %68,"
-      " %69, %70,"
-      " p,    %72,  %73;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x64_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      float         & d124, float         & d125, float         & d126, float         & d127,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %132, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      " %128,"
-      " %129,"
-      " %130, %131,"
-      " p,    %133, %134;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
-        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x256x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x256x64_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[128];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      float         & d124, float         & d125, float         & d126, float         & d127,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %135, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n256k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123, %124, %125, %126, %127},"
-      "{%128, %129, %130, %131},"
-      " %132,"
-      " %133, %134,"
-      " p,    %136, %137;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123),
-        "+f"(d124), "+f"(d125), "+f"(d126), "+f"(d127)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x256x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace SM90::GMMA::SPARSE
-
-} // namespace cute
-
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-#include "mma_sm90_gmma_sparse_ext.hpp"
-#endif
diff --git a/lightllm-kernel/cutlass/include/cute/arch/mma_sm90_gmma_sparse_ext.hpp b/lightllm-kernel/cutlass/include/cute/arch/mma_sm90_gmma_sparse_ext.hpp
deleted file mode 100755
index c224e4034..000000000
--- a/lightllm-kernel/cutlass/include/cute/arch/mma_sm90_gmma_sparse_ext.hpp
+++ /dev/null
@@ -1,60445 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
- 
-#pragma once
-  
-#include <cute/config.hpp>                // CUTE_HOST_DEVICE
-
-#include "cutlass/arch/synclog.hpp"
-
-// Config
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && defined(__CUDA_ARCH_FEAT_SM90_ALL))
-#  define CUTE_ARCH_MMA_SM90A_ENABLED
-#endif
-
-namespace cute {
-
-namespace SM90::GMMA::SPARSE {
-
-// SPARSE GMMA 64x24x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x32_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[6];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %10, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5},"
-      " %6,"
-      " %7,"
-      " %8, %9,"
-      " p,   %11, %12, %13, %14;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x32_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[6];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %13, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5},"
-      "{%6,  %7,  %8,  %9},"
-      " %10,"
-      " %11, %12,"
-      " p,   %14, %15, %16;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x40x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x40x32_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[10];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %14, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n40k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9},"
-      " %10,"
-      " %11,"
-      " %12, %13,"
-      " p,   %15, %16, %17, %18;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x40x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x40x32_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[10];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %17, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n40k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9},"
-      "{%10, %11, %12, %13},"
-      " %14,"
-      " %15, %16,"
-      " p,   %18, %19, %20;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x32_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %16, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      " %12,"
-      " %13,"
-      " %14, %15,"
-      " p,   %17, %18, %19, %20;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x32_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %19, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " %17, %18,"
-      " p,   %20, %21, %22;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x56x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x56x32_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[14];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %18, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n56k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13},"
-      " %14,"
-      " %15,"
-      " %16, %17,"
-      " p,   %19, %20, %21, %22;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x56x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x56x32_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[14];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %21, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n56k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13},"
-      "{%14, %15, %16, %17},"
-      " %18,"
-      " %19, %20,"
-      " p,   %22, %23, %24;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x72x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x72x32_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[18];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %22, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n72k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17},"
-      " %18,"
-      " %19,"
-      " %20, %21,"
-      " p,   %23, %24, %25, %26;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x72x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x72x32_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[18];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %25, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n72k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17},"
-      "{%18, %19, %20, %21},"
-      " %22,"
-      " %23, %24,"
-      " p,   %26, %27, %28;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x32_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %24, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      " %20,"
-      " %21,"
-      " %22, %23,"
-      " p,   %25, %26, %27, %28;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x32_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[20];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %27, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      "{%20, %21, %22, %23},"
-      " %24,"
-      " %25, %26,"
-      " p,   %28, %29, %30;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x88x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x88x32_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[22];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %26, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n88k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21},"
-      " %22,"
-      " %23,"
-      " %24, %25,"
-      " p,   %27, %28, %29, %30;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x88x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x88x32_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[22];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %29, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n88k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21},"
-      "{%22, %23, %24, %25},"
-      " %26,"
-      " %27, %28,"
-      " p,   %30, %31, %32;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x104x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x104x32_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[26];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %30, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n104k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25},"
-      " %26,"
-      " %27,"
-      " %28, %29,"
-      " p,   %31, %32, %33, %34;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x104x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x104x32_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[26];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %33, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n104k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25},"
-      "{%26, %27, %28, %29},"
-      " %30,"
-      " %31, %32,"
-      " p,   %34, %35, %36;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x32_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %32, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      " %28,"
-      " %29,"
-      " %30, %31,"
-      " p,   %33, %34, %35, %36;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x32_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[28];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %35, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      "{%28, %29, %30, %31},"
-      " %32,"
-      " %33, %34,"
-      " p,   %36, %37, %38;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x120x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x120x32_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[30];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %34, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n120k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29},"
-      " %30,"
-      " %31,"
-      " %32, %33,"
-      " p,   %35, %36, %37, %38;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x120x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x120x32_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[30];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %37, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n120k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29},"
-      "{%30, %31, %32, %33},"
-      " %34,"
-      " %35, %36,"
-      " p,   %38, %39, %40;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x136x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x136x32_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[34];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %38, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n136k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33},"
-      " %34,"
-      " %35,"
-      " %36, %37,"
-      " p,   %39, %40, %41, %42;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x136x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x136x32_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[34];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %41, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n136k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33},"
-      "{%34, %35, %36, %37},"
-      " %38,"
-      " %39, %40,"
-      " p,   %42, %43, %44;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x144x32_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[36];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %40, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n144k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      " %36,"
-      " %37,"
-      " %38, %39,"
-      " p,   %41, %42, %43, %44;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x144x32_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[36];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %43, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n144k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      "{%36, %37, %38, %39},"
-      " %40,"
-      " %41, %42,"
-      " p,   %44, %45, %46;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x152x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x152x32_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[38];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %42, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n152k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37},"
-      " %38,"
-      " %39,"
-      " %40, %41,"
-      " p,   %43, %44, %45, %46;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x152x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x152x32_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[38];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %45, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n152k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37},"
-      "{%38, %39, %40, %41},"
-      " %42,"
-      " %43, %44,"
-      " p,   %46, %47, %48;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x32_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %44, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      " %40,"
-      " %41,"
-      " %42, %43,"
-      " p,   %45, %46, %47, %48;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x32_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %47, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"
-      " %44,"
-      " %45, %46,"
-      " p,   %48, %49, %50;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x168x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x168x32_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[42];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %46, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n168k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41},"
-      " %42,"
-      " %43,"
-      " %44, %45,"
-      " p,   %47, %48, %49, %50;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x168x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x168x32_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[42];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %49, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n168k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41},"
-      "{%42, %43, %44, %45},"
-      " %46,"
-      " %47, %48,"
-      " p,   %50, %51, %52;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x32_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %48, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      " %44,"
-      " %45,"
-      " %46, %47,"
-      " p,   %49, %50, %51, %52;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x32_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[44];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %51, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      "{%44, %45, %46, %47},"
-      " %48,"
-      " %49, %50,"
-      " p,   %52, %53, %54;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x184x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x184x32_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[46];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %50, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n184k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45},"
-      " %46,"
-      " %47,"
-      " %48, %49,"
-      " p,   %51, %52, %53, %54;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x184x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x184x32_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[46];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %53, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n184k32.f16.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45},"
-      "{%46, %47, %48, %49},"
-      " %50,"
-      " %51, %52,"
-      " p,   %54, %55, %56;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x200x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x200x32_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[50];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %54, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n200k32.f16.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49},"
-      " %50,"
-      " %51,"
-      " %52, %53,"
-      " p,    %55,  %56,  %57,  %58;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x200x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x200x32_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[50];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %57, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n200k32.f16.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49},"
-      "{%50,  %51,  %52,  %53},"
-      " %54,"
-      " %55, %56,"
-      " p,    %58,  %59,  %60;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x208x32_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[52];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %56, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n208k32.f16.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      " %52,"
-      " %53,"
-      " %54, %55,"
-      " p,    %57,  %58,  %59,  %60;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x208x32_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[52];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %59, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n208k32.f16.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      "{%52,  %53,  %54,  %55},"
-      " %56,"
-      " %57, %58,"
-      " p,    %60,  %61,  %62;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x216x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x216x32_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[54];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %58, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n216k32.f16.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53},"
-      " %54,"
-      " %55,"
-      " %56, %57,"
-      " p,    %59,  %60,  %61,  %62;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x216x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x216x32_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[54];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %61, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n216k32.f16.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53},"
-      "{%54,  %55,  %56,  %57},"
-      " %58,"
-      " %59, %60,"
-      " p,    %62,  %63,  %64;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x32_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %60, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k32.f16.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      " %56,"
-      " %57,"
-      " %58, %59,"
-      " p,    %61,  %62,  %63,  %64;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x32_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %63, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k32.f16.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"
-      " %60,"
-      " %61, %62,"
-      " p,    %64,  %65,  %66;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x232x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x232x32_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[58];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %62, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n232k32.f16.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57},"
-      " %58,"
-      " %59,"
-      " %60, %61,"
-      " p,    %63,  %64,  %65,  %66;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x232x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x232x32_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[58];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %65, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n232k32.f16.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57},"
-      "{%58,  %59,  %60,  %61},"
-      " %62,"
-      " %63, %64,"
-      " p,    %66,  %67,  %68;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x32_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %64, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k32.f16.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      " %60,"
-      " %61,"
-      " %62, %63,"
-      " p,    %65,  %66,  %67,  %68;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x32_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[60];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %67, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k32.f16.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      "{%60,  %61,  %62,  %63},"
-      " %64,"
-      " %65, %66,"
-      " p,    %68,  %69,  %70;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x248x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x248x32_F16F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[62];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %66, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n248k32.f16.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61},"
-      " %62,"
-      " %63,"
-      " %64, %65,"
-      " p,    %67,  %68,  %69,  %70;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x32_F16F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x248x32 F16+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x248x32_F16F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[62];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %69, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n248k32.f16.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61},"
-      "{%62,  %63,  %64,  %65},"
-      " %66,"
-      " %67, %68,"
-      " p,    %70,  %71,  %72;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x32_F16F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x32_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %16, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k32.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      " %12,"
-      " %13,"
-      " %14, %15,"
-      " p,   %17, %18, %19, %20;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x32_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[12];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %19, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k32.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " %17, %18,"
-      " p,   %20, %21, %22;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x40x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x40x32_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %24, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n40k32.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      " %20,"
-      " %21,"
-      " %22, %23,"
-      " p,   %25, %26, %27, %28;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x40x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x40x32_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[20];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %27, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n40k32.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      "{%20, %21, %22, %23},"
-      " %24,"
-      " %25, %26,"
-      " p,   %28, %29, %30;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x32_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %28, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k32.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      " %24,"
-      " %25,"
-      " %26, %27,"
-      " p,   %29, %30, %31, %32;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x32_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[24];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %31, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k32.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " %29, %30,"
-      " p,   %32, %33, %34;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x56x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x56x32_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %32, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n56k32.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      " %28,"
-      " %29,"
-      " %30, %31,"
-      " p,   %33, %34, %35, %36;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x56x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x56x32_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[28];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %35, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n56k32.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      "{%28, %29, %30, %31},"
-      " %32,"
-      " %33, %34,"
-      " p,   %36, %37, %38;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x72x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x72x32_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[36];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %40, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n72k32.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      " %36,"
-      " %37,"
-      " %38, %39,"
-      " p,   %41, %42, %43, %44;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x72x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x72x32_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[36];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %43, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n72k32.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      "{%36, %37, %38, %39},"
-      " %40,"
-      " %41, %42,"
-      " p,   %44, %45, %46;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x32_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %44, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k32.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      " %40,"
-      " %41,"
-      " %42, %43,"
-      " p,   %45, %46, %47, %48;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x32_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[40];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %47, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k32.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"
-      " %44,"
-      " %45, %46,"
-      " p,   %48, %49, %50;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x88x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x88x32_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %48, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n88k32.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      " %44,"
-      " %45,"
-      " %46, %47,"
-      " p,   %49, %50, %51, %52;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x88x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x88x32_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[44];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %51, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n88k32.f32.f16.f16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      "{%44, %45, %46, %47},"
-      " %48,"
-      " %49, %50,"
-      " p,   %52, %53, %54;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x104x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x104x32_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[52];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %56, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n104k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      " %52,"
-      " %53,"
-      " %54, %55,"
-      " p,    %57,  %58,  %59,  %60;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x104x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x104x32_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[52];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %59, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n104k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      "{%52,  %53,  %54,  %55},"
-      " %56,"
-      " %57, %58,"
-      " p,    %60,  %61,  %62;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x32_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %60, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      " %56,"
-      " %57,"
-      " %58, %59,"
-      " p,    %61,  %62,  %63,  %64;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x32_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[56];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %63, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"
-      " %60,"
-      " %61, %62,"
-      " p,    %64,  %65,  %66;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x120x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x120x32_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %64, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n120k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      " %60,"
-      " %61,"
-      " %62, %63,"
-      " p,    %65,  %66,  %67,  %68;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x120x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x120x32_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[60];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %67, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n120k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      "{%60,  %61,  %62,  %63},"
-      " %64,"
-      " %65, %66,"
-      " p,    %68,  %69,  %70;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x136x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x136x32_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[68];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %72, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n136k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67},"
-      " %68,"
-      " %69,"
-      " %70, %71,"
-      " p,    %73,  %74,  %75,  %76;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x136x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x136x32_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[68];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %75, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n136k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67},"
-      "{%68,  %69,  %70,  %71},"
-      " %72,"
-      " %73, %74,"
-      " p,    %76,  %77,  %78;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x144x32_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %76, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n144k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      " %72,"
-      " %73,"
-      " %74, %75,"
-      " p,    %77,  %78,  %79,  %80;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x144x32_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[72];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %79, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n144k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      "{%72,  %73,  %74,  %75},"
-      " %76,"
-      " %77, %78,"
-      " p,    %80,  %81,  %82;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x152x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x152x32_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[76];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %80, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n152k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75},"
-      " %76,"
-      " %77,"
-      " %78, %79,"
-      " p,    %81,  %82,  %83,  %84;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x152x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x152x32_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[76];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %83, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n152k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75},"
-      "{%76,  %77,  %78,  %79},"
-      " %80,"
-      " %81, %82,"
-      " p,    %84,  %85,  %86;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x32_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %84, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      " %80,"
-      " %81,"
-      " %82, %83,"
-      " p,    %85,  %86,  %87,  %88;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x32_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[80];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %87, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      "{%80,  %81,  %82,  %83},"
-      " %84,"
-      " %85, %86,"
-      " p,    %88,  %89,  %90;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x168x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x168x32_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[84];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %88, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n168k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83},"
-      " %84,"
-      " %85,"
-      " %86, %87,"
-      " p,    %89,  %90,  %91,  %92;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x168x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x168x32_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[84];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %91, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n168k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83},"
-      "{%84,  %85,  %86,  %87},"
-      " %88,"
-      " %89, %90,"
-      " p,    %92,  %93,  %94;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x32_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %92, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      " %88,"
-      " %89,"
-      " %90, %91,"
-      " p,    %93,  %94,  %95,  %96;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x32_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[88];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %95, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      "{%88,  %89,  %90,  %91},"
-      " %92,"
-      " %93, %94,"
-      " p,    %96,  %97,  %98;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x184x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x184x32_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[92];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %96, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n184k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91},"
-      " %92,"
-      " %93,"
-      " %94, %95,"
-      " p,    %97,  %98,  %99,  %100;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x184x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x184x32_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[92];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %99, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n184k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91},"
-      "{%92,  %93,  %94,  %95},"
-      " %96,"
-      " %97, %98,"
-      " p,    %100, %101, %102;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x200x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x200x32_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[100];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %104, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n200k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99},"
-      " %100,"
-      " %101,"
-      " %102, %103,"
-      " p,    %105, %106, %107, %108;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x200x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x200x32_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[100];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %107, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n200k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99},"
-      "{%100, %101, %102, %103},"
-      " %104,"
-      " %105, %106,"
-      " p,    %108, %109, %110;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x208x32_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %108, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n208k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      " %104,"
-      " %105,"
-      " %106, %107,"
-      " p,    %109, %110, %111, %112;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x208x32_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[104];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %111, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n208k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      "{%104, %105, %106, %107},"
-      " %108,"
-      " %109, %110,"
-      " p,    %112, %113, %114;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x216x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x216x32_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[108];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %112, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n216k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107},"
-      " %108,"
-      " %109,"
-      " %110, %111,"
-      " p,    %113, %114, %115, %116;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x216x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x216x32_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[108];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %115, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n216k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107},"
-      "{%108, %109, %110, %111},"
-      " %112,"
-      " %113, %114,"
-      " p,    %116, %117, %118;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x32_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %116, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      " %112,"
-      " %113,"
-      " %114, %115,"
-      " p,    %117, %118, %119, %120;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x32_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[112];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %119, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      "{%112, %113, %114, %115},"
-      " %116,"
-      " %117, %118,"
-      " p,    %120, %121, %122;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x232x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x232x32_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[116];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %120, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n232k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115},"
-      " %116,"
-      " %117,"
-      " %118, %119,"
-      " p,    %121, %122, %123, %124;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x232x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x232x32_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[116];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %123, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n232k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115},"
-      "{%116, %117, %118, %119},"
-      " %120,"
-      " %121, %122,"
-      " p,    %124, %125, %126;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x32_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %124, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      " %120,"
-      " %121,"
-      " %122, %123,"
-      " p,    %125, %126, %127, %128;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x32_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[120];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %127, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      "{%120, %121, %122, %123},"
-      " %124,"
-      " %125, %126,"
-      " p,    %128, %129, %130;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x248x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x248x32_F32F16F16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[124];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %128, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n248k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123},"
-      " %124,"
-      " %125,"
-      " %126, %127,"
-      " p,    %129, %130, %131, %132;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x32_F32F16F16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x248x32 F32+=F16*F16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x248x32_F32F16F16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[124];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %131, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n248k32.f32.f16.f16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123},"
-      "{%124, %125, %126, %127},"
-      " %128,"
-      " %129, %130,"
-      " p,    %132, %133, %134;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x32_F32F16F16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x32_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %16, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k32.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      " %12,"
-      " %13,"
-      " %14, %15,"
-      " p,   %17, %18, %19, %20;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x32_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[12];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %19, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k32.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " %17, %18,"
-      " p,   %20, %21, %22;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x40x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x40x32_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %24, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n40k32.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      " %20,"
-      " %21,"
-      " %22, %23,"
-      " p,   %25, %26, %27, %28;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x40x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x40x32_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[20];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %27, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n40k32.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      "{%20, %21, %22, %23},"
-      " %24,"
-      " %25, %26,"
-      " p,   %28, %29, %30;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x32_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %28, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k32.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      " %24,"
-      " %25,"
-      " %26, %27,"
-      " p,   %29, %30, %31, %32;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x32_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[24];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %31, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k32.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " %29, %30,"
-      " p,   %32, %33, %34;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x56x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x56x32_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %32, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n56k32.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      " %28,"
-      " %29,"
-      " %30, %31,"
-      " p,   %33, %34, %35, %36;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x56x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x56x32_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[28];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %35, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n56k32.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      "{%28, %29, %30, %31},"
-      " %32,"
-      " %33, %34,"
-      " p,   %36, %37, %38;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x72x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x72x32_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[36];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %40, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n72k32.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      " %36,"
-      " %37,"
-      " %38, %39,"
-      " p,   %41, %42, %43, %44;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x72x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x72x32_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[36];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %43, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n72k32.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      "{%36, %37, %38, %39},"
-      " %40,"
-      " %41, %42,"
-      " p,   %44, %45, %46;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x32_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %44, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k32.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      " %40,"
-      " %41,"
-      " %42, %43,"
-      " p,   %45, %46, %47, %48;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x32_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[40];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %47, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k32.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"
-      " %44,"
-      " %45, %46,"
-      " p,   %48, %49, %50;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x88x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x88x32_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %48, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n88k32.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      " %44,"
-      " %45,"
-      " %46, %47,"
-      " p,   %49, %50, %51, %52;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x88x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x88x32_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[44];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %51, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n88k32.f32.bf16.bf16 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      "{%44, %45, %46, %47},"
-      " %48,"
-      " %49, %50,"
-      " p,   %52, %53, %54;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x104x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x104x32_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[52];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %56, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n104k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      " %52,"
-      " %53,"
-      " %54, %55,"
-      " p,    %57,  %58,  %59,  %60;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x104x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x104x32_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[52];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %59, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n104k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      "{%52,  %53,  %54,  %55},"
-      " %56,"
-      " %57, %58,"
-      " p,    %60,  %61,  %62;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x32_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %60, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      " %56,"
-      " %57,"
-      " %58, %59,"
-      " p,    %61,  %62,  %63,  %64;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x32_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[56];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %63, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"
-      " %60,"
-      " %61, %62,"
-      " p,    %64,  %65,  %66;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x120x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x120x32_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %64, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n120k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      " %60,"
-      " %61,"
-      " %62, %63,"
-      " p,    %65,  %66,  %67,  %68;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x120x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x120x32_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[60];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %67, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n120k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      "{%60,  %61,  %62,  %63},"
-      " %64,"
-      " %65, %66,"
-      " p,    %68,  %69,  %70;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x136x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x136x32_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[68];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %72, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n136k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67},"
-      " %68,"
-      " %69,"
-      " %70, %71,"
-      " p,    %73,  %74,  %75,  %76;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x136x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x136x32_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[68];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %75, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n136k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67},"
-      "{%68,  %69,  %70,  %71},"
-      " %72,"
-      " %73, %74,"
-      " p,    %76,  %77,  %78;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x144x32_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %76, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n144k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      " %72,"
-      " %73,"
-      " %74, %75,"
-      " p,    %77,  %78,  %79,  %80;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x144x32_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[72];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %79, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n144k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      "{%72,  %73,  %74,  %75},"
-      " %76,"
-      " %77, %78,"
-      " p,    %80,  %81,  %82;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x152x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x152x32_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[76];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %80, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n152k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75},"
-      " %76,"
-      " %77,"
-      " %78, %79,"
-      " p,    %81,  %82,  %83,  %84;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x152x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x152x32_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[76];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %83, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n152k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75},"
-      "{%76,  %77,  %78,  %79},"
-      " %80,"
-      " %81, %82,"
-      " p,    %84,  %85,  %86;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x32_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %84, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      " %80,"
-      " %81,"
-      " %82, %83,"
-      " p,    %85,  %86,  %87,  %88;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x32_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[80];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %87, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      "{%80,  %81,  %82,  %83},"
-      " %84,"
-      " %85, %86,"
-      " p,    %88,  %89,  %90;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x168x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x168x32_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[84];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %88, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n168k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83},"
-      " %84,"
-      " %85,"
-      " %86, %87,"
-      " p,    %89,  %90,  %91,  %92;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x168x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x168x32_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[84];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %91, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n168k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83},"
-      "{%84,  %85,  %86,  %87},"
-      " %88,"
-      " %89, %90,"
-      " p,    %92,  %93,  %94;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x32_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %92, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      " %88,"
-      " %89,"
-      " %90, %91,"
-      " p,    %93,  %94,  %95,  %96;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x32_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[88];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %95, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      "{%88,  %89,  %90,  %91},"
-      " %92,"
-      " %93, %94,"
-      " p,    %96,  %97,  %98;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x184x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x184x32_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[92];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %96, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n184k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91},"
-      " %92,"
-      " %93,"
-      " %94, %95,"
-      " p,    %97,  %98,  %99,  %100;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x184x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x184x32_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[92];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %99, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n184k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91},"
-      "{%92,  %93,  %94,  %95},"
-      " %96,"
-      " %97, %98,"
-      " p,    %100, %101, %102;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x200x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x200x32_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[100];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %104, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n200k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99},"
-      " %100,"
-      " %101,"
-      " %102, %103,"
-      " p,    %105, %106, %107, %108;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x200x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x200x32_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[100];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %107, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n200k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99},"
-      "{%100, %101, %102, %103},"
-      " %104,"
-      " %105, %106,"
-      " p,    %108, %109, %110;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x208x32_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %108, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n208k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      " %104,"
-      " %105,"
-      " %106, %107,"
-      " p,    %109, %110, %111, %112;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x208x32_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[104];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %111, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n208k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      "{%104, %105, %106, %107},"
-      " %108,"
-      " %109, %110,"
-      " p,    %112, %113, %114;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x216x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x216x32_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[108];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %112, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n216k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107},"
-      " %108,"
-      " %109,"
-      " %110, %111,"
-      " p,    %113, %114, %115, %116;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x216x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x216x32_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[108];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %115, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n216k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107},"
-      "{%108, %109, %110, %111},"
-      " %112,"
-      " %113, %114,"
-      " p,    %116, %117, %118;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x32_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %116, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      " %112,"
-      " %113,"
-      " %114, %115,"
-      " p,    %117, %118, %119, %120;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x32_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[112];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %119, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      "{%112, %113, %114, %115},"
-      " %116,"
-      " %117, %118,"
-      " p,    %120, %121, %122;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x232x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x232x32_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[116];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %120, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n232k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115},"
-      " %116,"
-      " %117,"
-      " %118, %119,"
-      " p,    %121, %122, %123, %124;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x232x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x232x32_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[116];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %123, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n232k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115},"
-      "{%116, %117, %118, %119},"
-      " %120,"
-      " %121, %122,"
-      " p,    %124, %125, %126;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x32_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %124, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      " %120,"
-      " %121,"
-      " %122, %123,"
-      " p,    %125, %126, %127, %128;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x32_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[120];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %127, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      "{%120, %121, %122, %123},"
-      " %124,"
-      " %125, %126,"
-      " p,    %128, %129, %130;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x248x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x248x32_F32BF16BF16_SS
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[124];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %128, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n248k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123},"
-      " %124,"
-      " %125,"
-      " %126, %127,"
-      " p,    %129, %130, %131, %132;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspA)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x32_F32BF16BF16_SS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x248x32 F32+=BF16*BF16
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x248x32_F32BF16BF16_RS
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[124];
-
-  static_assert(tnspA == GMMA::Major::K,
-      "Register source operand A must have K major layout.");
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %131, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n248k32.f32.bf16.bf16 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123},"
-      "{%124, %125, %126, %127},"
-      " %128,"
-      " %129, %130,"
-      " p,    %132, %133, %134;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)), "n"(int32_t(tnspB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x32_F32BF16BF16_RS without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x16_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %16, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k16.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      " %12,"
-      " %13,"
-      " %14, %15,"
-      " p,   %17, %18;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x16_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %19, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k16.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " %17, %18,"
-      " p,   %20, %21;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x40x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x40x16_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %24, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n40k16.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      " %20,"
-      " %21,"
-      " %22, %23,"
-      " p,   %25, %26;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x40x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x40x16_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %27, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n40k16.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      "{%20, %21, %22, %23},"
-      " %24,"
-      " %25, %26,"
-      " p,   %28, %29;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x16_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %28, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k16.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      " %24,"
-      " %25,"
-      " %26, %27,"
-      " p,   %29, %30;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x16_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %31, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k16.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " %29, %30,"
-      " p,   %32, %33;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x56x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x56x16_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %32, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n56k16.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      " %28,"
-      " %29,"
-      " %30, %31,"
-      " p,   %33, %34;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x56x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x56x16_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %35, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n56k16.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      "{%28, %29, %30, %31},"
-      " %32,"
-      " %33, %34,"
-      " p,   %36, %37;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x72x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x72x16_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[36];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %40, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n72k16.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      " %36,"
-      " %37,"
-      " %38, %39,"
-      " p,   %41, %42;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x72x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x72x16_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[36];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %43, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n72k16.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      "{%36, %37, %38, %39},"
-      " %40,"
-      " %41, %42,"
-      " p,   %44, %45;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x16_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %44, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k16.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      " %40,"
-      " %41,"
-      " %42, %43,"
-      " p,   %45, %46;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x16_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %47, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k16.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"
-      " %44,"
-      " %45, %46,"
-      " p,   %48, %49;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x88x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x88x16_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %48, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n88k16.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      " %44,"
-      " %45,"
-      " %46, %47,"
-      " p,   %49, %50;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x88x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x88x16_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %51, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n88k16.f32.tf32.tf32 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      "{%44, %45, %46, %47},"
-      " %48,"
-      " %49, %50,"
-      " p,   %52, %53;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x104x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x104x16_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[52];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %56, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n104k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      " %52,"
-      " %53,"
-      " %54, %55,"
-      " p,    %57,  %58;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x104x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x104x16_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[52];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %59, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n104k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      "{%52,  %53,  %54,  %55},"
-      " %56,"
-      " %57, %58,"
-      " p,    %60,  %61;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x16_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %60, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      " %56,"
-      " %57,"
-      " %58, %59,"
-      " p,    %61,  %62;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x16_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %63, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"
-      " %60,"
-      " %61, %62,"
-      " p,    %64,  %65;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x120x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x120x16_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %64, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n120k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      " %60,"
-      " %61,"
-      " %62, %63,"
-      " p,    %65,  %66;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x120x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x120x16_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %67, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n120k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      "{%60,  %61,  %62,  %63},"
-      " %64,"
-      " %65, %66,"
-      " p,    %68,  %69;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x136x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x136x16_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[68];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %72, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n136k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67},"
-      " %68,"
-      " %69,"
-      " %70, %71,"
-      " p,    %73,  %74;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x136x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x136x16_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[68];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %75, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n136k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67},"
-      "{%68,  %69,  %70,  %71},"
-      " %72,"
-      " %73, %74,"
-      " p,    %76,  %77;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x144x16_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %76, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n144k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      " %72,"
-      " %73,"
-      " %74, %75,"
-      " p,    %77,  %78;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x144x16_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %79, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n144k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      "{%72,  %73,  %74,  %75},"
-      " %76,"
-      " %77, %78,"
-      " p,    %80,  %81;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x152x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x152x16_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[76];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %80, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n152k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75},"
-      " %76,"
-      " %77,"
-      " %78, %79,"
-      " p,    %81,  %82;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x152x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x152x16_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[76];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %83, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n152k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75},"
-      "{%76,  %77,  %78,  %79},"
-      " %80,"
-      " %81, %82,"
-      " p,    %84,  %85;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x16_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %84, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      " %80,"
-      " %81,"
-      " %82, %83,"
-      " p,    %85,  %86;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x16_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %87, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      "{%80,  %81,  %82,  %83},"
-      " %84,"
-      " %85, %86,"
-      " p,    %88,  %89;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x168x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x168x16_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[84];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %88, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n168k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83},"
-      " %84,"
-      " %85,"
-      " %86, %87,"
-      " p,    %89,  %90;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x168x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x168x16_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[84];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %91, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n168k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83},"
-      "{%84,  %85,  %86,  %87},"
-      " %88,"
-      " %89, %90,"
-      " p,    %92,  %93;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x16_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %92, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      " %88,"
-      " %89,"
-      " %90, %91,"
-      " p,    %93,  %94;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x16_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %95, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      "{%88,  %89,  %90,  %91},"
-      " %92,"
-      " %93, %94,"
-      " p,    %96,  %97;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x184x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x184x16_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[92];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %96, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n184k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91},"
-      " %92,"
-      " %93,"
-      " %94, %95,"
-      " p,    %97,  %98;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x184x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x184x16_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[92];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %99, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n184k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91},"
-      "{%92,  %93,  %94,  %95},"
-      " %96,"
-      " %97, %98,"
-      " p,    %100, %101;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x200x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x200x16_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[100];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %104, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n200k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99},"
-      " %100,"
-      " %101,"
-      " %102, %103,"
-      " p,    %105, %106;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x200x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x200x16_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[100];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %107, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n200k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99},"
-      "{%100, %101, %102, %103},"
-      " %104,"
-      " %105, %106,"
-      " p,    %108, %109;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x16 TN F32+=TF32*TF32 -- SS variant: A and B are each passed to fma() as one 64-bit descriptor
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; forwarded to the asm as an immediate ("n") operand
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; forwarded to the asm as an immediate ("n") operand
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; forwarded to the asm as an immediate ("n") operand
->
-struct GMMA_64x208x16_F32TF32TF32_SS_TN
-{  // Wraps the PTX instruction wgmma.mma_async.sp.sync.aligned.m64n208k16.f32.tf32.tf32 in a static fma().
-  using DRegisters = void;         // no separate D arguments: fma() updates the accumulators in place
-  using ARegisters = uint64_t[1];  // mirrors the single desc_a argument of fma()
-  using ERegisters = uint32_t[1];  // mirrors the single e argument of fma()
-  using BRegisters = uint64_t[1];  // mirrors the single desc_b argument of fma()
-  using CRegisters = float[104];   // mirrors the 104 accumulator arguments d000..d103
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,  // 64-bit descriptor for A, passed with the "l" constraint
-      uint64_t const& desc_b,  // 64-bit descriptor for B, passed with the "l" constraint
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      uint32_t const& e,  // 32-bit register operand placed right before spsel; presumably the sparsity metadata -- TODO confirm
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p inside the asm
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands this source line and both descriptors to the synclog hook
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"  // predicate register local to this PTX scope
-      "setp.ne.b32 p, %108, 0;\n"  // p = (scale_D != 0); %108 is the scale_D input
-      "wgmma.mma_async.sp.sync.aligned.m64n208k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      " %104,"  // desc_a
-      " %105,"  // desc_b
-      " %106, %107,"  // e, spsel
-      " p,    %109, %110;\n"  // scale_D predicate, scaleA, scaleB
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // %0..%103: accumulators, read-write ("+f")
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
-      :  "l"(desc_a),  // %104
-         "l"(desc_b),  // %105
-         "r"(e), "n"(int32_t(spsel)),  // %106, %107
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %108, %109, %110
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x16 TN F32+=TF32*TF32 -- RS variant: A is passed as four 32-bit registers, B as one 64-bit descriptor
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; forwarded to the asm as an immediate ("n") operand
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; forwarded to the asm as an immediate ("n") operand
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; forwarded to the asm as an immediate ("n") operand
->
-struct GMMA_64x208x16_F32TF32TF32_RS_TN
-{  // Wraps the PTX instruction wgmma.mma_async.sp.sync.aligned.m64n208k16.f32.tf32.tf32 in a static fma().
-  using DRegisters = void;         // no separate D arguments: fma() updates the accumulators in place
-  using ARegisters = uint32_t[4];  // mirrors the four a000..a003 arguments of fma()
-  using ERegisters = uint32_t[1];  // mirrors the single e argument of fma()
-  using BRegisters = uint64_t[1];  // mirrors the single desc_b argument of fma()
-  using CRegisters = float[104];   // mirrors the 104 accumulator arguments d000..d103
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A operand, passed with the "r" constraint
-      uint64_t const& desc_b,  // 64-bit descriptor for B, passed with the "l" constraint
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      uint32_t const& e,  // 32-bit register operand placed right before spsel; presumably the sparsity metadata -- TODO confirm
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p inside the asm
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands this source line and the B descriptor to the synclog hook
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"  // predicate register local to this PTX scope
-      "setp.ne.b32 p, %111, 0;\n"  // p = (scale_D != 0); %111 is the scale_D input
-      "wgmma.mma_async.sp.sync.aligned.m64n208k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      "{%104, %105, %106, %107},"  // a000..a003
-      " %108,"  // desc_b
-      " %109, %110,"  // e, spsel
-      " p,    %112, %113;\n"  // scale_D predicate, scaleA, scaleB
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // %0..%103: accumulators, read-write ("+f")
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %104..%107
-         "l"(desc_b),  // %108
-         "r"(e), "n"(int32_t(spsel)),  // %109, %110
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %111, %112, %113
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x216x16 TN F32+=TF32*TF32 -- SS variant: A and B are each passed to fma() as one 64-bit descriptor
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; forwarded to the asm as an immediate ("n") operand
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; forwarded to the asm as an immediate ("n") operand
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; forwarded to the asm as an immediate ("n") operand
->
-struct GMMA_64x216x16_F32TF32TF32_SS_TN
-{  // Wraps the PTX instruction wgmma.mma_async.sp.sync.aligned.m64n216k16.f32.tf32.tf32 in a static fma().
-  using DRegisters = void;         // no separate D arguments: fma() updates the accumulators in place
-  using ARegisters = uint64_t[1];  // mirrors the single desc_a argument of fma()
-  using ERegisters = uint32_t[1];  // mirrors the single e argument of fma()
-  using BRegisters = uint64_t[1];  // mirrors the single desc_b argument of fma()
-  using CRegisters = float[108];   // mirrors the 108 accumulator arguments d000..d107
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,  // 64-bit descriptor for A, passed with the "l" constraint
-      uint64_t const& desc_b,  // 64-bit descriptor for B, passed with the "l" constraint
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      uint32_t const& e,  // 32-bit register operand placed right before spsel; presumably the sparsity metadata -- TODO confirm
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p inside the asm
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands this source line and both descriptors to the synclog hook
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"  // predicate register local to this PTX scope
-      "setp.ne.b32 p, %112, 0;\n"  // p = (scale_D != 0); %112 is the scale_D input
-      "wgmma.mma_async.sp.sync.aligned.m64n216k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107},"
-      " %108,"  // desc_a
-      " %109,"  // desc_b
-      " %110, %111,"  // e, spsel
-      " p,    %113, %114;\n"  // scale_D predicate, scaleA, scaleB
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // %0..%107: accumulators, read-write ("+f")
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
-      :  "l"(desc_a),  // %108
-         "l"(desc_b),  // %109
-         "r"(e), "n"(int32_t(spsel)),  // %110, %111
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %112, %113, %114
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x216x16 TN F32+=TF32*TF32 -- RS variant: A is passed as four 32-bit registers, B as one 64-bit descriptor
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; forwarded to the asm as an immediate ("n") operand
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; forwarded to the asm as an immediate ("n") operand
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; forwarded to the asm as an immediate ("n") operand
->
-struct GMMA_64x216x16_F32TF32TF32_RS_TN
-{  // Wraps the PTX instruction wgmma.mma_async.sp.sync.aligned.m64n216k16.f32.tf32.tf32 in a static fma().
-  using DRegisters = void;         // no separate D arguments: fma() updates the accumulators in place
-  using ARegisters = uint32_t[4];  // mirrors the four a000..a003 arguments of fma()
-  using ERegisters = uint32_t[1];  // mirrors the single e argument of fma()
-  using BRegisters = uint64_t[1];  // mirrors the single desc_b argument of fma()
-  using CRegisters = float[108];   // mirrors the 108 accumulator arguments d000..d107
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,  // A operand, passed with the "r" constraint
-      uint64_t const& desc_b,  // 64-bit descriptor for B, passed with the "l" constraint
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      uint32_t const& e,  // 32-bit register operand placed right before spsel; presumably the sparsity metadata -- TODO confirm
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p inside the asm
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);  // hands this source line and the B descriptor to the synclog hook
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"  // predicate register local to this PTX scope
-      "setp.ne.b32 p, %115, 0;\n"  // p = (scale_D != 0); %115 is the scale_D input
-      "wgmma.mma_async.sp.sync.aligned.m64n216k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107},"
-      "{%108, %109, %110, %111},"  // a000..a003
-      " %112,"  // desc_b
-      " %113, %114,"  // e, spsel
-      " p,    %116, %117;\n"  // scale_D predicate, scaleA, scaleB
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // %0..%107: accumulators, read-write ("+f")
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),  // %108..%111
-         "l"(desc_b),  // %112
-         "r"(e), "n"(int32_t(spsel)),  // %113, %114
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %115, %116, %117
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x16 TN F32+=TF32*TF32 -- SS variant: A and B are each passed to fma() as one 64-bit descriptor
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // compile-time; forwarded to the asm as an immediate ("n") operand
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // compile-time; forwarded to the asm as an immediate ("n") operand
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // compile-time; forwarded to the asm as an immediate ("n") operand
->
-struct GMMA_64x224x16_F32TF32TF32_SS_TN
-{  // Wraps the PTX instruction wgmma.mma_async.sp.sync.aligned.m64n224k16.f32.tf32.tf32 in a static fma().
-  using DRegisters = void;         // no separate D arguments: fma() updates the accumulators in place
-  using ARegisters = uint64_t[1];  // mirrors the single desc_a argument of fma()
-  using ERegisters = uint32_t[1];  // mirrors the single e argument of fma()
-  using BRegisters = uint64_t[1];  // mirrors the single desc_b argument of fma()
-  using CRegisters = float[112];   // mirrors the 112 accumulator arguments d000..d111
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,  // 64-bit descriptor for A, passed with the "l" constraint
-      uint64_t const& desc_b,  // 64-bit descriptor for B, passed with the "l" constraint
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      uint32_t const& e,  // 32-bit register operand placed right before spsel; presumably the sparsity metadata -- TODO confirm
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; turned into predicate p inside the asm
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);  // hands this source line and both descriptors to the synclog hook
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"  // predicate register local to this PTX scope
-      "setp.ne.b32 p, %116, 0;\n"  // p = (scale_D != 0); %116 is the scale_D input
-      "wgmma.mma_async.sp.sync.aligned.m64n224k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      " %112,"  // desc_a
-      " %113,"  // desc_b
-      " %114, %115,"  // e, spsel
-      " p,    %117, %118;\n"  // scale_D predicate, scaleA, scaleB
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),  // %0..%111: accumulators, read-write ("+f")
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
-      :  "l"(desc_a),  // %112
-         "l"(desc_b),  // %113
-         "r"(e), "n"(int32_t(spsel)),  // %114, %115
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));  // %116, %117, %118
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x16_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %119, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      "{%112, %113, %114, %115},"
-      " %116,"
-      " %117, %118,"
-      " p,    %120, %121;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x232x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x232x16_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[116];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %120, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n232k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115},"
-      " %116,"
-      " %117,"
-      " %118, %119,"
-      " p,    %121, %122;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x232x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x232x16_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[116];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %123, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n232k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115},"
-      "{%116, %117, %118, %119},"
-      " %120,"
-      " %121, %122,"
-      " p,    %124, %125;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x16_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %124, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      " %120,"
-      " %121,"
-      " %122, %123,"
-      " p,    %125, %126;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x16_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %127, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      "{%120, %121, %122, %123},"
-      " %124,"
-      " %125, %126,"
-      " p,    %128, %129;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x248x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x248x16_F32TF32TF32_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[124];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %128, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n248k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123},"
-      " %124,"
-      " %125,"
-      " %126, %127,"
-      " p,    %129, %130;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x16_F32TF32TF32_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x248x16 TN F32+=TF32*TF32
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x248x16_F32TF32TF32_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[124];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %131, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n248k16.f32.tf32.tf32 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123},"
-      "{%124, %125, %126, %127},"
-      " %128,"
-      " %129, %130,"
-      " p,    %132, %133;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x16_F32TF32TF32_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x64_S32S8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %16, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.s8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      " %12,"
-      " %13,"
-      " %14, %15,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x64_S32S8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %16, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.s8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      " %12,"
-      " %13,"
-      " %14, %15,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x64_S32S8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %28, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.s8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      " %24,"
-      " %25,"
-      " %26, %27,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x64_S32S8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %28, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.s8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      " %24,"
-      " %25,"
-      " %26, %27,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x64_S32S8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %44, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.s8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      " %40,"
-      " %41,"
-      " %42, %43,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x64_S32S8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %44, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.s8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      " %40,"
-      " %41,"
-      " %42, %43,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x64_S32S8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %60, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      " %56,"
-      " %57,"
-      " %58, %59,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x64_S32S8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %60, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      " %56,"
-      " %57,"
-      " %58, %59,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x144x64_S32S8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %76, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      " %72,"
-      " %73,"
-      " %74, %75,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x144x64_S32S8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %76, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      " %72,"
-      " %73,"
-      " %74, %75,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x64_S32S8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %84, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      " %80,"
-      " %81,"
-      " %82, %83,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x64_S32S8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %84, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      " %80,"
-      " %81,"
-      " %82, %83,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x64_S32S8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %92, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      " %88,"
-      " %89,"
-      " %90, %91,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x64_S32S8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %92, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      " %88,"
-      " %89,"
-      " %90, %91,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x208x64_S32S8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %108, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      " %104,"
-      " %105,"
-      " %106, %107,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x208x64_S32S8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %108, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      " %104,"
-      " %105,"
-      " %106, %107,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x64_S32S8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %116, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      " %112,"
-      " %113,"
-      " %114, %115,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x64_S32S8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %116, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      " %112,"
-      " %113,"
-      " %114, %115,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x64_S32S8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %124, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      " %120,"
-      " %121,"
-      " %122, %123,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x64_S32S8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %124, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      " %120,"
-      " %121,"
-      " %122, %123,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x64_S32S8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %19, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.s8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " %17, %18,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x64_S32S8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %19, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.s8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " %17, %18,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x64_S32S8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %31, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.s8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " %29, %30,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x64_S32S8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %31, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.s8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " %29, %30,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x64_S32S8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %47, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.s8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"
-      " %44,"
-      " %45, %46,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x64_S32S8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %47, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.s8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"
-      " %44,"
-      " %45, %46,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x64_S32S8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %63, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"
-      " %60,"
-      " %61, %62,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x64_S32S8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %63, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"
-      " %60,"
-      " %61, %62,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x144x64_S32S8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %79, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      "{%72,  %73,  %74,  %75},"
-      " %76,"
-      " %77, %78,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x144x64_S32S8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %79, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      "{%72,  %73,  %74,  %75},"
-      " %76,"
-      " %77, %78,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x64_S32S8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %87, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      "{%80,  %81,  %82,  %83},"
-      " %84,"
-      " %85, %86,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x64_S32S8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %87, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      "{%80,  %81,  %82,  %83},"
-      " %84,"
-      " %85, %86,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x64_S32S8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %95, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      "{%88,  %89,  %90,  %91},"
-      " %92,"
-      " %93, %94,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x64_S32S8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %95, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      "{%88,  %89,  %90,  %91},"
-      " %92,"
-      " %93, %94,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x208x64_S32S8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %111, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      "{%104, %105, %106, %107},"
-      " %108,"
-      " %109, %110,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x208x64_S32S8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %111, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      "{%104, %105, %106, %107},"
-      " %108,"
-      " %109, %110,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x64_S32S8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %119, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      "{%112, %113, %114, %115},"
-      " %116,"
-      " %117, %118,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x64_S32S8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %119, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      "{%112, %113, %114, %115},"
-      " %116,"
-      " %117, %118,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x64_S32S8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %127, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.s8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      "{%120, %121, %122, %123},"
-      " %124,"
-      " %125, %126,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x64 TN S32+=S8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x64_S32S8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %127, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.s8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      "{%120, %121, %122, %123},"
-      " %124,"
-      " %125, %126,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x64_S32S8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %16, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.s8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      " %12,"
-      " %13,"
-      " %14, %15,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x64_S32S8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %16, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.s8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      " %12,"
-      " %13,"
-      " %14, %15,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x64_S32S8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %28, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.s8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      " %24,"
-      " %25,"
-      " %26, %27,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x64_S32S8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %28, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.s8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      " %24,"
-      " %25,"
-      " %26, %27,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x64_S32S8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %44, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.s8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      " %40,"
-      " %41,"
-      " %42, %43,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x64_S32S8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %44, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.s8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      " %40,"
-      " %41,"
-      " %42, %43,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x64_S32S8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %60, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      " %56,"
-      " %57,"
-      " %58, %59,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x64_S32S8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %60, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      " %56,"
-      " %57,"
-      " %58, %59,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x144x64_S32S8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %76, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      " %72,"
-      " %73,"
-      " %74, %75,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x144x64_S32S8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %76, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      " %72,"
-      " %73,"
-      " %74, %75,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x64_S32S8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %84, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      " %80,"
-      " %81,"
-      " %82, %83,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x64_S32S8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %84, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      " %80,"
-      " %81,"
-      " %82, %83,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x64_S32S8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %92, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      " %88,"
-      " %89,"
-      " %90, %91,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x64_S32S8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %92, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      " %88,"
-      " %89,"
-      " %90, %91,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x208x64_S32S8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %108, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      " %104,"
-      " %105,"
-      " %106, %107,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x208x64_S32S8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %108, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      " %104,"
-      " %105,"
-      " %106, %107,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x64_S32S8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %116, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      " %112,"
-      " %113,"
-      " %114, %115,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x64_S32S8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %116, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      " %112,"
-      " %113,"
-      " %114, %115,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x64_S32S8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %124, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      " %120,"
-      " %121,"
-      " %122, %123,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x64_S32S8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %124, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      " %120,"
-      " %121,"
-      " %122, %123,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x64_S32S8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %19, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.s8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " %17, %18,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x64_S32S8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %19, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.s8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " %17, %18,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x64_S32S8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %31, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.s8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " %29, %30,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x64_S32S8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %31, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.s8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " %29, %30,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x64_S32S8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %47, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.s8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"
-      " %44,"
-      " %45, %46,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x64_S32S8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %47, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.s8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"
-      " %44,"
-      " %45, %46,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x64_S32S8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %63, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"
-      " %60,"
-      " %61, %62,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x64_S32S8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %63, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"
-      " %60,"
-      " %61, %62,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x144x64_S32S8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %79, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      "{%72,  %73,  %74,  %75},"
-      " %76,"
-      " %77, %78,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x144x64_S32S8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %79, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      "{%72,  %73,  %74,  %75},"
-      " %76,"
-      " %77, %78,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x64_S32S8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %87, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      "{%80,  %81,  %82,  %83},"
-      " %84,"
-      " %85, %86,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x64_S32S8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %87, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      "{%80,  %81,  %82,  %83},"
-      " %84,"
-      " %85, %86,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x64_S32S8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %95, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      "{%88,  %89,  %90,  %91},"
-      " %92,"
-      " %93, %94,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x64_S32S8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %95, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      "{%88,  %89,  %90,  %91},"
-      " %92,"
-      " %93, %94,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x208x64_S32S8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %111, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      "{%104, %105, %106, %107},"
-      " %108,"
-      " %109, %110,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x208x64_S32S8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %111, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      "{%104, %105, %106, %107},"
-      " %108,"
-      " %109, %110,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x64_S32S8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %119, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      "{%112, %113, %114, %115},"
-      " %116,"
-      " %117, %118,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x64_S32S8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %119, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      "{%112, %113, %114, %115},"
-      " %116,"
-      " %117, %118,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x64_S32S8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %127, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.s8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      "{%120, %121, %122, %123},"
-      " %124,"
-      " %125, %126,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x64 TN S32+=S8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x64_S32S8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %127, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.s8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      "{%120, %121, %122, %123},"
-      " %124,"
-      " %125, %126,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x64_S32U8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %16, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.u8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      " %12,"
-      " %13,"
-      " %14, %15,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x64_S32U8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %16, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.u8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      " %12,"
-      " %13,"
-      " %14, %15,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x64_S32U8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %28, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.u8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      " %24,"
-      " %25,"
-      " %26, %27,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x64_S32U8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %28, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.u8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      " %24,"
-      " %25,"
-      " %26, %27,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x64_S32U8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %44, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.u8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      " %40,"
-      " %41,"
-      " %42, %43,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x64_S32U8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %44, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.u8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      " %40,"
-      " %41,"
-      " %42, %43,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x64_S32U8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %60, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      " %56,"
-      " %57,"
-      " %58, %59,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x64_S32U8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %60, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      " %56,"
-      " %57,"
-      " %58, %59,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x144x64_S32U8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %76, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      " %72,"
-      " %73,"
-      " %74, %75,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x144x64_S32U8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %76, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      " %72,"
-      " %73,"
-      " %74, %75,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x64_S32U8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %84, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      " %80,"
-      " %81,"
-      " %82, %83,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x64_S32U8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %84, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      " %80,"
-      " %81,"
-      " %82, %83,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x64_S32U8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %92, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      " %88,"
-      " %89,"
-      " %90, %91,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x64_S32U8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %92, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      " %88,"
-      " %89,"
-      " %90, %91,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x208x64_S32U8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %108, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      " %104,"
-      " %105,"
-      " %106, %107,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x208x64_S32U8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %108, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      " %104,"
-      " %105,"
-      " %106, %107,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x64_S32U8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %116, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      " %112,"
-      " %113,"
-      " %114, %115,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x64_S32U8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %116, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      " %112,"
-      " %113,"
-      " %114, %115,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x64_S32U8S8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %124, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      " %120,"
-      " %121,"
-      " %122, %123,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8S8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x64_S32U8S8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %124, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      " %120,"
-      " %121,"
-      " %122, %123,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8S8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x64_S32U8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %19, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.u8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " %17, %18,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x64_S32U8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %19, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.u8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " %17, %18,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x64_S32U8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %31, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.u8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " %29, %30,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x64_S32U8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %31, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.u8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " %29, %30,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x64_S32U8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %47, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.u8.s8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"
-      " %44,"
-      " %45, %46,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x64_S32U8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %47, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.u8.s8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"
-      " %44,"
-      " %45, %46,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x64_S32U8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %63, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"
-      " %60,"
-      " %61, %62,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x64_S32U8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %63, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"
-      " %60,"
-      " %61, %62,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x144x64_S32U8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %79, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      "{%72,  %73,  %74,  %75},"
-      " %76,"
-      " %77, %78,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x144x64_S32U8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %79, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      "{%72,  %73,  %74,  %75},"
-      " %76,"
-      " %77, %78,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x64_S32U8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %87, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      "{%80,  %81,  %82,  %83},"
-      " %84,"
-      " %85, %86,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x64_S32U8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %87, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      "{%80,  %81,  %82,  %83},"
-      " %84,"
-      " %85, %86,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x64_S32U8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %95, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      "{%88,  %89,  %90,  %91},"
-      " %92,"
-      " %93, %94,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x64_S32U8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %95, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      "{%88,  %89,  %90,  %91},"
-      " %92,"
-      " %93, %94,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x208x64_S32U8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %111, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      "{%104, %105, %106, %107},"
-      " %108,"
-      " %109, %110,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x208x64_S32U8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %111, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      "{%104, %105, %106, %107},"
-      " %108,"
-      " %109, %110,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x64_S32U8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %119, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      "{%112, %113, %114, %115},"
-      " %116,"
-      " %117, %118,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x64_S32U8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %119, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      "{%112, %113, %114, %115},"
-      " %116,"
-      " %117, %118,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x64_S32U8S8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %127, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.u8.s8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      "{%120, %121, %122, %123},"
-      " %124,"
-      " %125, %126,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8S8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x64 TN S32+=U8*S8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x64_S32U8S8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %127, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.u8.s8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      "{%120, %121, %122, %123},"
-      " %124,"
-      " %125, %126,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8S8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x64_S32U8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %16, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.u8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      " %12,"
-      " %13,"
-      " %14, %15,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x64_S32U8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %16, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.u8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      " %12,"
-      " %13,"
-      " %14, %15,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x64_S32U8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %28, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.u8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      " %24,"
-      " %25,"
-      " %26, %27,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x64_S32U8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %28, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.u8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      " %24,"
-      " %25,"
-      " %26, %27,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x64_S32U8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %44, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.u8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      " %40,"
-      " %41,"
-      " %42, %43,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x64_S32U8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %44, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.u8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      " %40,"
-      " %41,"
-      " %42, %43,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x64_S32U8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %60, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      " %56,"
-      " %57,"
-      " %58, %59,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x64_S32U8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %60, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      " %56,"
-      " %57,"
-      " %58, %59,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x144x64_S32U8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %76, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      " %72,"
-      " %73,"
-      " %74, %75,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x144x64_S32U8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %76, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      " %72,"
-      " %73,"
-      " %74, %75,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x64_S32U8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %84, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      " %80,"
-      " %81,"
-      " %82, %83,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x64_S32U8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %84, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      " %80,"
-      " %81,"
-      " %82, %83,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x64_S32U8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %92, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      " %88,"
-      " %89,"
-      " %90, %91,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x64_S32U8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %92, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      " %88,"
-      " %89,"
-      " %90, %91,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x208x64_S32U8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %108, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      " %104,"
-      " %105,"
-      " %106, %107,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x208x64_S32U8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %108, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      " %104,"
-      " %105,"
-      " %106, %107,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x64_S32U8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %116, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      " %112,"
-      " %113,"
-      " %114, %115,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x64_S32U8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %116, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      " %112,"
-      " %113,"
-      " %114, %115,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x64_S32U8U8_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %124, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      " %120,"
-      " %121,"
-      " %122, %123,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8U8_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x64_S32U8U8_SS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %124, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      " %120,"
-      " %121,"
-      " %122, %123,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8U8_SS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x64_S32U8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %19, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.u8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " %17, %18,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x64_S32U8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %19, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k64.s32.u8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " %17, %18,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x64_S32U8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %31, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.u8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " %29, %30,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x64_S32U8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %31, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k64.s32.u8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " %29, %30,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x64_S32U8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %47, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.u8.u8 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"
-      " %44,"
-      " %45, %46,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x64_S32U8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %47, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k64.s32.u8.u8.satfinite "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"
-      " %44,"
-      " %45, %46,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x64_S32U8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %63, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"
-      " %60,"
-      " %61, %62,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x64_S32U8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %63, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k64.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"
-      " %60,"
-      " %61, %62,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x144x64_S32U8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %79, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      "{%72,  %73,  %74,  %75},"
-      " %76,"
-      " %77, %78,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x144x64_S32U8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %79, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n144k64.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      "{%72,  %73,  %74,  %75},"
-      " %76,"
-      " %77, %78,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x64_S32U8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %87, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      "{%80,  %81,  %82,  %83},"
-      " %84,"
-      " %85, %86,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x64_S32U8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %87, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k64.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      "{%80,  %81,  %82,  %83},"
-      " %84,"
-      " %85, %86,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x64_S32U8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %95, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      "{%88,  %89,  %90,  %91},"
-      " %92,"
-      " %93, %94,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x64_S32U8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61, uint32_t      & d62, uint32_t      & d63,
-      uint32_t      & d64, uint32_t      & d65, uint32_t      & d66, uint32_t      & d67,
-      uint32_t      & d68, uint32_t      & d69, uint32_t      & d70, uint32_t      & d71,
-      uint32_t      & d72, uint32_t      & d73, uint32_t      & d74, uint32_t      & d75,
-      uint32_t      & d76, uint32_t      & d77, uint32_t      & d78, uint32_t      & d79,
-      uint32_t      & d80, uint32_t      & d81, uint32_t      & d82, uint32_t      & d83,
-      uint32_t      & d84, uint32_t      & d85, uint32_t      & d86, uint32_t      & d87,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %95, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k64.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      "{%88,  %89,  %90,  %91},"
-      " %92,"
-      " %93, %94,"
-      " p;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61), "+r"(d62), "+r"(d63),
-        "+r"(d64), "+r"(d65), "+r"(d66), "+r"(d67),
-        "+r"(d68), "+r"(d69), "+r"(d70), "+r"(d71),
-        "+r"(d72), "+r"(d73), "+r"(d74), "+r"(d75),
-        "+r"(d76), "+r"(d77), "+r"(d78), "+r"(d79),
-        "+r"(d80), "+r"(d81), "+r"(d82), "+r"(d83),
-        "+r"(d84), "+r"(d85), "+r"(d86), "+r"(d87)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x208x64_S32U8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %111, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      "{%104, %105, %106, %107},"
-      " %108,"
-      " %109, %110,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x208x64_S32U8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %111, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n208k64.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      "{%104, %105, %106, %107},"
-      " %108,"
-      " %109, %110,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x64_S32U8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %119, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      "{%112, %113, %114, %115},"
-      " %116,"
-      " %117, %118,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x64_S32U8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %119, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k64.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      "{%112, %113, %114, %115},"
-      " %116,"
-      " %117, %118,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x64_S32U8U8_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %127, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.u8.u8 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      "{%120, %121, %122, %123},"
-      " %124,"
-      " %125, %126,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8U8_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x64 TN S32+=U8*U8
-template <
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x64_S32U8U8_RS_TN_SATURATE
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      uint32_t      & d000, uint32_t      & d001, uint32_t      & d002, uint32_t      & d003,
-      uint32_t      & d004, uint32_t      & d005, uint32_t      & d006, uint32_t      & d007,
-      uint32_t      & d008, uint32_t      & d009, uint32_t      & d010, uint32_t      & d011,
-      uint32_t      & d012, uint32_t      & d013, uint32_t      & d014, uint32_t      & d015,
-      uint32_t      & d016, uint32_t      & d017, uint32_t      & d018, uint32_t      & d019,
-      uint32_t      & d020, uint32_t      & d021, uint32_t      & d022, uint32_t      & d023,
-      uint32_t      & d024, uint32_t      & d025, uint32_t      & d026, uint32_t      & d027,
-      uint32_t      & d028, uint32_t      & d029, uint32_t      & d030, uint32_t      & d031,
-      uint32_t      & d032, uint32_t      & d033, uint32_t      & d034, uint32_t      & d035,
-      uint32_t      & d036, uint32_t      & d037, uint32_t      & d038, uint32_t      & d039,
-      uint32_t      & d040, uint32_t      & d041, uint32_t      & d042, uint32_t      & d043,
-      uint32_t      & d044, uint32_t      & d045, uint32_t      & d046, uint32_t      & d047,
-      uint32_t      & d048, uint32_t      & d049, uint32_t      & d050, uint32_t      & d051,
-      uint32_t      & d052, uint32_t      & d053, uint32_t      & d054, uint32_t      & d055,
-      uint32_t      & d056, uint32_t      & d057, uint32_t      & d058, uint32_t      & d059,
-      uint32_t      & d060, uint32_t      & d061, uint32_t      & d062, uint32_t      & d063,
-      uint32_t      & d064, uint32_t      & d065, uint32_t      & d066, uint32_t      & d067,
-      uint32_t      & d068, uint32_t      & d069, uint32_t      & d070, uint32_t      & d071,
-      uint32_t      & d072, uint32_t      & d073, uint32_t      & d074, uint32_t      & d075,
-      uint32_t      & d076, uint32_t      & d077, uint32_t      & d078, uint32_t      & d079,
-      uint32_t      & d080, uint32_t      & d081, uint32_t      & d082, uint32_t      & d083,
-      uint32_t      & d084, uint32_t      & d085, uint32_t      & d086, uint32_t      & d087,
-      uint32_t      & d088, uint32_t      & d089, uint32_t      & d090, uint32_t      & d091,
-      uint32_t      & d092, uint32_t      & d093, uint32_t      & d094, uint32_t      & d095,
-      uint32_t      & d096, uint32_t      & d097, uint32_t      & d098, uint32_t      & d099,
-      uint32_t      & d100, uint32_t      & d101, uint32_t      & d102, uint32_t      & d103,
-      uint32_t      & d104, uint32_t      & d105, uint32_t      & d106, uint32_t      & d107,
-      uint32_t      & d108, uint32_t      & d109, uint32_t      & d110, uint32_t      & d111,
-      uint32_t      & d112, uint32_t      & d113, uint32_t      & d114, uint32_t      & d115,
-      uint32_t      & d116, uint32_t      & d117, uint32_t      & d118, uint32_t      & d119,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %127, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k64.s32.u8.u8.satfinite "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      "{%120, %121, %122, %123},"
-      " %124,"
-      " %125, %126,"
-      " p;\n"
-    "}\n"
-      : "+r"(d000), "+r"(d001), "+r"(d002), "+r"(d003),
-        "+r"(d004), "+r"(d005), "+r"(d006), "+r"(d007),
-        "+r"(d008), "+r"(d009), "+r"(d010), "+r"(d011),
-        "+r"(d012), "+r"(d013), "+r"(d014), "+r"(d015),
-        "+r"(d016), "+r"(d017), "+r"(d018), "+r"(d019),
-        "+r"(d020), "+r"(d021), "+r"(d022), "+r"(d023),
-        "+r"(d024), "+r"(d025), "+r"(d026), "+r"(d027),
-        "+r"(d028), "+r"(d029), "+r"(d030), "+r"(d031),
-        "+r"(d032), "+r"(d033), "+r"(d034), "+r"(d035),
-        "+r"(d036), "+r"(d037), "+r"(d038), "+r"(d039),
-        "+r"(d040), "+r"(d041), "+r"(d042), "+r"(d043),
-        "+r"(d044), "+r"(d045), "+r"(d046), "+r"(d047),
-        "+r"(d048), "+r"(d049), "+r"(d050), "+r"(d051),
-        "+r"(d052), "+r"(d053), "+r"(d054), "+r"(d055),
-        "+r"(d056), "+r"(d057), "+r"(d058), "+r"(d059),
-        "+r"(d060), "+r"(d061), "+r"(d062), "+r"(d063),
-        "+r"(d064), "+r"(d065), "+r"(d066), "+r"(d067),
-        "+r"(d068), "+r"(d069), "+r"(d070), "+r"(d071),
-        "+r"(d072), "+r"(d073), "+r"(d074), "+r"(d075),
-        "+r"(d076), "+r"(d077), "+r"(d078), "+r"(d079),
-        "+r"(d080), "+r"(d081), "+r"(d082), "+r"(d083),
-        "+r"(d084), "+r"(d085), "+r"(d086), "+r"(d087),
-        "+r"(d088), "+r"(d089), "+r"(d090), "+r"(d091),
-        "+r"(d092), "+r"(d093), "+r"(d094), "+r"(d095),
-        "+r"(d096), "+r"(d097), "+r"(d098), "+r"(d099),
-        "+r"(d100), "+r"(d101), "+r"(d102), "+r"(d103),
-        "+r"(d104), "+r"(d105), "+r"(d106), "+r"(d107),
-        "+r"(d108), "+r"(d109), "+r"(d110), "+r"(d111),
-        "+r"(d112), "+r"(d113), "+r"(d114), "+r"(d115),
-        "+r"(d116), "+r"(d117), "+r"(d118), "+r"(d119)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8U8_RS_TN_SATURATE without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x64_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[6];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %10, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5},"
-      " %6,"
-      " %7,"
-      " %8, %9,"
-      " p,   %11, %12;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x64_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[6];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %13, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5},"
-      "{%6,  %7,  %8,  %9},"
-      " %10,"
-      " %11, %12,"
-      " p,   %14, %15;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x64_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %16, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k64.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      " %12,"
-      " %13,"
-      " %14, %15,"
-      " p,   %17, %18;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x64_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %19, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k64.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " %17, %18,"
-      " p,   %20, %21;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x40x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x40x64_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[10];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %14, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n40k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9},"
-      " %10,"
-      " %11,"
-      " %12, %13,"
-      " p,   %15, %16;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x40x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x40x64_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[10];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %17, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n40k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9},"
-      "{%10, %11, %12, %13},"
-      " %14,"
-      " %15, %16,"
-      " p,   %18, %19;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x40x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x40x64_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %24, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n40k64.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      " %20,"
-      " %21,"
-      " %22, %23,"
-      " p,   %25, %26;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x40x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x40x64_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %27, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n40k64.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      "{%20, %21, %22, %23},"
-      " %24,"
-      " %25, %26,"
-      " p,   %28, %29;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x64_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %16, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      " %12,"
-      " %13,"
-      " %14, %15,"
-      " p,   %17, %18;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x64_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %19, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " %17, %18,"
-      " p,   %20, %21;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x64_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %28, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k64.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      " %24,"
-      " %25,"
-      " %26, %27,"
-      " p,   %29, %30;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x64_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %31, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k64.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " %29, %30,"
-      " p,   %32, %33;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x56x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x56x64_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[14];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %18, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n56k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13},"
-      " %14,"
-      " %15,"
-      " %16, %17,"
-      " p,   %19, %20;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x56x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x56x64_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[14];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %21, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n56k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13},"
-      "{%14, %15, %16, %17},"
-      " %18,"
-      " %19, %20,"
-      " p,   %22, %23;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x56x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x56x64_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %32, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n56k64.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      " %28,"
-      " %29,"
-      " %30, %31,"
-      " p,   %33, %34;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x56x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x56x64_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %35, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n56k64.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      "{%28, %29, %30, %31},"
-      " %32,"
-      " %33, %34,"
-      " p,   %36, %37;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x72x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x72x64_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[18];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %22, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n72k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17},"
-      " %18,"
-      " %19,"
-      " %20, %21,"
-      " p,   %23, %24;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x72x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x72x64_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[18];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %25, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n72k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17},"
-      "{%18, %19, %20, %21},"
-      " %22,"
-      " %23, %24,"
-      " p,   %26, %27;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x72x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x72x64_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[36];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %40, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n72k64.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      " %36,"
-      " %37,"
-      " %38, %39,"
-      " p,   %41, %42;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x72x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x72x64_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[36];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %43, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n72k64.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      "{%36, %37, %38, %39},"
-      " %40,"
-      " %41, %42,"
-      " p,   %44, %45;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x64_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %24, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      " %20,"
-      " %21,"
-      " %22, %23,"
-      " p,   %25, %26;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x64_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %27, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      "{%20, %21, %22, %23},"
-      " %24,"
-      " %25, %26,"
-      " p,   %28, %29;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x64_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %44, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k64.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      " %40,"
-      " %41,"
-      " %42, %43,"
-      " p,   %45, %46;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x64_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %47, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k64.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"
-      " %44,"
-      " %45, %46,"
-      " p,   %48, %49;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x88x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x88x64_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[22];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %26, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n88k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21},"
-      " %22,"
-      " %23,"
-      " %24, %25,"
-      " p,   %27, %28;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x88x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x88x64_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[22];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %29, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n88k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21},"
-      "{%22, %23, %24, %25},"
-      " %26,"
-      " %27, %28,"
-      " p,   %30, %31;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x88x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x88x64_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %48, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n88k64.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      " %44,"
-      " %45,"
-      " %46, %47,"
-      " p,   %49, %50;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x88x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x88x64_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %51, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n88k64.f32.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      "{%44, %45, %46, %47},"
-      " %48,"
-      " %49, %50,"
-      " p,   %52, %53;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x104x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x104x64_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[26];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %30, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n104k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25},"
-      " %26,"
-      " %27,"
-      " %28, %29,"
-      " p,   %31, %32;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x104x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x104x64_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[26];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %33, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n104k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25},"
-      "{%26, %27, %28, %29},"
-      " %30,"
-      " %31, %32,"
-      " p,   %34, %35;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x104x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x104x64_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[52];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %56, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n104k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      " %52,"
-      " %53,"
-      " %54, %55,"
-      " p,    %57,  %58;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x104x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x104x64_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[52];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %59, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n104k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      "{%52,  %53,  %54,  %55},"
-      " %56,"
-      " %57, %58,"
-      " p,    %60,  %61;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x64_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %32, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      " %28,"
-      " %29,"
-      " %30, %31,"
-      " p,   %33, %34;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x64_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %35, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      "{%28, %29, %30, %31},"
-      " %32,"
-      " %33, %34,"
-      " p,   %36, %37;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x64_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %60, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      " %56,"
-      " %57,"
-      " %58, %59,"
-      " p,    %61,  %62;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x64_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %63, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"
-      " %60,"
-      " %61, %62,"
-      " p,    %64,  %65;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x120x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x120x64_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[30];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %34, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n120k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29},"
-      " %30,"
-      " %31,"
-      " %32, %33,"
-      " p,   %35, %36;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x120x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x120x64_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[30];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %37, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n120k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29},"
-      "{%30, %31, %32, %33},"
-      " %34,"
-      " %35, %36,"
-      " p,   %38, %39;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x120x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x120x64_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %64, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n120k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      " %60,"
-      " %61,"
-      " %62, %63,"
-      " p,    %65,  %66;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x120x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x120x64_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %67, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n120k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      "{%60,  %61,  %62,  %63},"
-      " %64,"
-      " %65, %66,"
-      " p,    %68,  %69;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x136x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x136x64_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[34];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %38, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n136k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33},"
-      " %34,"
-      " %35,"
-      " %36, %37,"
-      " p,   %39, %40;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x136x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x136x64_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[34];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %41, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n136k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33},"
-      "{%34, %35, %36, %37},"
-      " %38,"
-      " %39, %40,"
-      " p,   %42, %43;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x136x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x136x64_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[68];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %72, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n136k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67},"
-      " %68,"
-      " %69,"
-      " %70, %71,"
-      " p,    %73,  %74;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x136x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x136x64_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[68];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %75, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n136k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67},"
-      "{%68,  %69,  %70,  %71},"
-      " %72,"
-      " %73, %74,"
-      " p,    %76,  %77;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x144x64_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[36];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %40, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n144k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      " %36,"
-      " %37,"
-      " %38, %39,"
-      " p,   %41, %42;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x144x64_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[36];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %43, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n144k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      "{%36, %37, %38, %39},"
-      " %40,"
-      " %41, %42,"
-      " p,   %44, %45;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x144x64_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %76, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n144k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      " %72,"
-      " %73,"
-      " %74, %75,"
-      " p,    %77,  %78;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x144x64_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %79, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n144k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      "{%72,  %73,  %74,  %75},"
-      " %76,"
-      " %77, %78,"
-      " p,    %80,  %81;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x152x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x152x64_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[38];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %42, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n152k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37},"
-      " %38,"
-      " %39,"
-      " %40, %41,"
-      " p,   %43, %44;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x152x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x152x64_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[38];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %45, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n152k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37},"
-      "{%38, %39, %40, %41},"
-      " %42,"
-      " %43, %44,"
-      " p,   %46, %47;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x152x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x152x64_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[76];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %80, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n152k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75},"
-      " %76,"
-      " %77,"
-      " %78, %79,"
-      " p,    %81,  %82;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x152x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x152x64_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[76];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %83, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n152k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75},"
-      "{%76,  %77,  %78,  %79},"
-      " %80,"
-      " %81, %82,"
-      " p,    %84,  %85;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x64_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %44, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      " %40,"
-      " %41,"
-      " %42, %43,"
-      " p,   %45, %46;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x64_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %47, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"
-      " %44,"
-      " %45, %46,"
-      " p,   %48, %49;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x64_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %84, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      " %80,"
-      " %81,"
-      " %82, %83,"
-      " p,    %85,  %86;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x64_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %87, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      "{%80,  %81,  %82,  %83},"
-      " %84,"
-      " %85, %86,"
-      " p,    %88,  %89;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x168x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x168x64_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[42];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %46, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n168k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41},"
-      " %42,"
-      " %43,"
-      " %44, %45,"
-      " p,   %47, %48;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x168x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x168x64_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[42];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %49, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n168k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41},"
-      "{%42, %43, %44, %45},"
-      " %46,"
-      " %47, %48,"
-      " p,   %50, %51;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x168x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x168x64_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[84];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %88, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n168k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83},"
-      " %84,"
-      " %85,"
-      " %86, %87,"
-      " p,    %89,  %90;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x168x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x168x64_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[84];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %91, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n168k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83},"
-      "{%84,  %85,  %86,  %87},"
-      " %88,"
-      " %89, %90,"
-      " p,    %92,  %93;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x64_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %48, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      " %44,"
-      " %45,"
-      " %46, %47,"
-      " p,   %49, %50;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x64_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %51, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      "{%44, %45, %46, %47},"
-      " %48,"
-      " %49, %50,"
-      " p,   %52, %53;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x64_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %92, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      " %88,"
-      " %89,"
-      " %90, %91,"
-      " p,    %93,  %94;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x64_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %95, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      "{%88,  %89,  %90,  %91},"
-      " %92,"
-      " %93, %94,"
-      " p,    %96,  %97;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x184x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x184x64_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[46];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %50, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n184k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45},"
-      " %46,"
-      " %47,"
-      " %48, %49,"
-      " p,   %51, %52;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x184x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x184x64_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[46];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %53, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n184k64.f16.e4m3.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45},"
-      "{%46, %47, %48, %49},"
-      " %50,"
-      " %51, %52,"
-      " p,   %54, %55;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x184x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x184x64_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[92];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %96, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n184k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91},"
-      " %92,"
-      " %93,"
-      " %94, %95,"
-      " p,    %97,  %98;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x184x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x184x64_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[92];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %99, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n184k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91},"
-      "{%92,  %93,  %94,  %95},"
-      " %96,"
-      " %97, %98,"
-      " p,    %100, %101;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x200x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x200x64_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[50];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %54, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n200k64.f16.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49},"
-      " %50,"
-      " %51,"
-      " %52, %53,"
-      " p,    %55,  %56;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x200x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x200x64_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[50];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %57, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n200k64.f16.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49},"
-      "{%50,  %51,  %52,  %53},"
-      " %54,"
-      " %55, %56,"
-      " p,    %58,  %59;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x200x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x200x64_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[100];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %104, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n200k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99},"
-      " %100,"
-      " %101,"
-      " %102, %103,"
-      " p,    %105, %106;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x200x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x200x64_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[100];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %107, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n200k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99},"
-      "{%100, %101, %102, %103},"
-      " %104,"
-      " %105, %106,"
-      " p,    %108, %109;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x208x64_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[52];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %56, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n208k64.f16.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      " %52,"
-      " %53,"
-      " %54, %55,"
-      " p,    %57,  %58;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x208x64_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[52];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %59, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n208k64.f16.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      "{%52,  %53,  %54,  %55},"
-      " %56,"
-      " %57, %58,"
-      " p,    %60,  %61;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x208x64_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %108, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n208k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      " %104,"
-      " %105,"
-      " %106, %107,"
-      " p,    %109, %110;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x208x64_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %111, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n208k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      "{%104, %105, %106, %107},"
-      " %108,"
-      " %109, %110,"
-      " p,    %112, %113;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x216x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x216x64_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[54];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %58, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n216k64.f16.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53},"
-      " %54,"
-      " %55,"
-      " %56, %57,"
-      " p,    %59,  %60;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x216x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x216x64_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[54];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %61, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n216k64.f16.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53},"
-      "{%54,  %55,  %56,  %57},"
-      " %58,"
-      " %59, %60,"
-      " p,    %62,  %63;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x216x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x216x64_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[108];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %112, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n216k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107},"
-      " %108,"
-      " %109,"
-      " %110, %111,"
-      " p,    %113, %114;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x216x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x216x64_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[108];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %115, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n216k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107},"
-      "{%108, %109, %110, %111},"
-      " %112,"
-      " %113, %114,"
-      " p,    %116, %117;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x64_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %60, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k64.f16.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      " %56,"
-      " %57,"
-      " %58, %59,"
-      " p,    %61,  %62;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x64_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %63, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k64.f16.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"
-      " %60,"
-      " %61, %62,"
-      " p,    %64,  %65;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x64_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %116, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      " %112,"
-      " %113,"
-      " %114, %115,"
-      " p,    %117, %118;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x64_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %119, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      "{%112, %113, %114, %115},"
-      " %116,"
-      " %117, %118,"
-      " p,    %120, %121;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x232x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x232x64_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[58];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %62, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n232k64.f16.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57},"
-      " %58,"
-      " %59,"
-      " %60, %61,"
-      " p,    %63,  %64;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x232x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x232x64_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[58];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %65, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n232k64.f16.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57},"
-      "{%58,  %59,  %60,  %61},"
-      " %62,"
-      " %63, %64,"
-      " p,    %66,  %67;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x232x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x232x64_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[116];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %120, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n232k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115},"
-      " %116,"
-      " %117,"
-      " %118, %119,"
-      " p,    %121, %122;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x232x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x232x64_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[116];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %123, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n232k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115},"
-      "{%116, %117, %118, %119},"
-      " %120,"
-      " %121, %122,"
-      " p,    %124, %125;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x64_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %64, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k64.f16.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      " %60,"
-      " %61,"
-      " %62, %63,"
-      " p,    %65,  %66;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x64_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %67, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k64.f16.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      "{%60,  %61,  %62,  %63},"
-      " %64,"
-      " %65, %66,"
-      " p,    %68,  %69;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x64_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %124, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      " %120,"
-      " %121,"
-      " %122, %123,"
-      " p,    %125, %126;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x64_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %127, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      "{%120, %121, %122, %123},"
-      " %124,"
-      " %125, %126,"
-      " p,    %128, %129;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x248x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x248x64_F16E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[62];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %66, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n248k64.f16.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61},"
-      " %62,"
-      " %63,"
-      " %64, %65,"
-      " p,    %67,  %68;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F16E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x248x64 TN F16+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x248x64_F16E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[62];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %69, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n248k64.f16.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61},"
-      "{%62,  %63,  %64,  %65},"
-      " %66,"
-      " %67, %68,"
-      " p,    %70,  %71;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F16E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x248x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x248x64_F32E4M3E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[124];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %128, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n248k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123},"
-      " %124,"
-      " %125,"
-      " %126, %127,"
-      " p,    %129, %130;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F32E4M3E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x248x64 TN F32+=E4M3*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x248x64_F32E4M3E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[124];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %131, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n248k64.f32.e4m3.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123},"
-      "{%124, %125, %126, %127},"
-      " %128,"
-      " %129, %130,"
-      " p,    %132, %133;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F32E4M3E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x64_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[6];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %10, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5},"
-      " %6,"
-      " %7,"
-      " %8, %9,"
-      " p,   %11, %12;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x64_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[6];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %13, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5},"
-      "{%6,  %7,  %8,  %9},"
-      " %10,"
-      " %11, %12,"
-      " p,   %14, %15;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x64_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %16, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k64.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      " %12,"
-      " %13,"
-      " %14, %15,"
-      " p,   %17, %18;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x64_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %19, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k64.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " %17, %18,"
-      " p,   %20, %21;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x40x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x40x64_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[10];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %14, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n40k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9},"
-      " %10,"
-      " %11,"
-      " %12, %13,"
-      " p,   %15, %16;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x40x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x40x64_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[10];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %17, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n40k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9},"
-      "{%10, %11, %12, %13},"
-      " %14,"
-      " %15, %16,"
-      " p,   %18, %19;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x40x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x40x64_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %24, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n40k64.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      " %20,"
-      " %21,"
-      " %22, %23,"
-      " p,   %25, %26;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x40x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x40x64_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %27, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n40k64.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      "{%20, %21, %22, %23},"
-      " %24,"
-      " %25, %26,"
-      " p,   %28, %29;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x64_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %16, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      " %12,"
-      " %13,"
-      " %14, %15,"
-      " p,   %17, %18;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x64_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %19, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " %17, %18,"
-      " p,   %20, %21;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x64_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %28, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k64.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      " %24,"
-      " %25,"
-      " %26, %27,"
-      " p,   %29, %30;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x64_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %31, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k64.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " %29, %30,"
-      " p,   %32, %33;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x56x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x56x64_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[14];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %18, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n56k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13},"
-      " %14,"
-      " %15,"
-      " %16, %17,"
-      " p,   %19, %20;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x56x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x56x64_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[14];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %21, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n56k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13},"
-      "{%14, %15, %16, %17},"
-      " %18,"
-      " %19, %20,"
-      " p,   %22, %23;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x56x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x56x64_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %32, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n56k64.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      " %28,"
-      " %29,"
-      " %30, %31,"
-      " p,   %33, %34;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x56x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x56x64_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %35, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n56k64.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      "{%28, %29, %30, %31},"
-      " %32,"
-      " %33, %34,"
-      " p,   %36, %37;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x72x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x72x64_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[18];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %22, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n72k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17},"
-      " %18,"
-      " %19,"
-      " %20, %21,"
-      " p,   %23, %24;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x72x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x72x64_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[18];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %25, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n72k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17},"
-      "{%18, %19, %20, %21},"
-      " %22,"
-      " %23, %24,"
-      " p,   %26, %27;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x72x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x72x64_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[36];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %40, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n72k64.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      " %36,"
-      " %37,"
-      " %38, %39,"
-      " p,   %41, %42;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x72x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x72x64_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[36];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %43, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n72k64.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      "{%36, %37, %38, %39},"
-      " %40,"
-      " %41, %42,"
-      " p,   %44, %45;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x64_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %24, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      " %20,"
-      " %21,"
-      " %22, %23,"
-      " p,   %25, %26;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x64_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %27, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      "{%20, %21, %22, %23},"
-      " %24,"
-      " %25, %26,"
-      " p,   %28, %29;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x64_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %44, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k64.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      " %40,"
-      " %41,"
-      " %42, %43,"
-      " p,   %45, %46;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x64_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %47, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k64.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"
-      " %44,"
-      " %45, %46,"
-      " p,   %48, %49;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x88x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x88x64_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[22];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %26, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n88k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21},"
-      " %22,"
-      " %23,"
-      " %24, %25,"
-      " p,   %27, %28;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x88x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x88x64_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[22];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %29, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n88k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21},"
-      "{%22, %23, %24, %25},"
-      " %26,"
-      " %27, %28,"
-      " p,   %30, %31;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x88x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x88x64_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %48, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n88k64.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      " %44,"
-      " %45,"
-      " %46, %47,"
-      " p,   %49, %50;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x88x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x88x64_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %51, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n88k64.f32.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      "{%44, %45, %46, %47},"
-      " %48,"
-      " %49, %50,"
-      " p,   %52, %53;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x104x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x104x64_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[26];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %30, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n104k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25},"
-      " %26,"
-      " %27,"
-      " %28, %29,"
-      " p,   %31, %32;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x104x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x104x64_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[26];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %33, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n104k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25},"
-      "{%26, %27, %28, %29},"
-      " %30,"
-      " %31, %32,"
-      " p,   %34, %35;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x104x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x104x64_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[52];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %56, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n104k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      " %52,"
-      " %53,"
-      " %54, %55,"
-      " p,    %57,  %58;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x104x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x104x64_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[52];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %59, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n104k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      "{%52,  %53,  %54,  %55},"
-      " %56,"
-      " %57, %58,"
-      " p,    %60,  %61;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x64_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %32, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      " %28,"
-      " %29,"
-      " %30, %31,"
-      " p,   %33, %34;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x64_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %35, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      "{%28, %29, %30, %31},"
-      " %32,"
-      " %33, %34,"
-      " p,   %36, %37;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x64_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %60, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      " %56,"
-      " %57,"
-      " %58, %59,"
-      " p,    %61,  %62;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x64_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %63, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"
-      " %60,"
-      " %61, %62,"
-      " p,    %64,  %65;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x120x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x120x64_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[30];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %34, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n120k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29},"
-      " %30,"
-      " %31,"
-      " %32, %33,"
-      " p,   %35, %36;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x120x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x120x64_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[30];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %37, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n120k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29},"
-      "{%30, %31, %32, %33},"
-      " %34,"
-      " %35, %36,"
-      " p,   %38, %39;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x120x64 TN F32+=E4M3*E5M2 -- SS variant: A and B are both passed as uint64_t descriptors (desc_a, desc_b)
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the asm as an immediate ("n") operand, right after e
->
-struct GMMA_64x120x64_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;  // no separate D operand: fma() updates the accumulators in place ("+f" operands)
-  using ARegisters = uint64_t[1];  // matches desc_a
-  using ERegisters = uint32_t[1];  // matches e; NOTE(review): presumably the sparsity metadata -- confirm
-  using BRegisters = uint64_t[1];  // matches desc_b
-  using CRegisters = float[60];  // matches the 60 accumulators d00..d59
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %64, 0;\n"  // %64 is scale_D: p = (scale_D != 0)
-      "wgmma.mma_async.sp.sync.aligned.m64n120k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%59: accumulators d00-d59 (read-write)
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      " %60,"  // desc_a
-      " %61,"  // desc_b
-      " %62, %63,"  // e, spsel
-      " p,    %65,  %66;\n"  // p (derived from scale_D), scaleA, scaleB
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x120x64 TN F32+=E4M3*E5M2 -- RS variant: A is passed as four uint32_t values (a00-a03), B as a uint64_t descriptor (desc_b)
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the asm as an immediate ("n") operand, right after e
->
-struct GMMA_64x120x64_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;  // no separate D operand: fma() updates the accumulators in place ("+f" operands)
-  using ARegisters = uint32_t[4];  // matches a00..a03
-  using ERegisters = uint32_t[1];  // matches e; NOTE(review): presumably the sparsity metadata -- confirm
-  using BRegisters = uint64_t[1];  // matches desc_b
-  using CRegisters = float[60];  // matches the 60 accumulators d00..d59
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %67, 0;\n"  // %67 is scale_D: p = (scale_D != 0)
-      "wgmma.mma_async.sp.sync.aligned.m64n120k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%59: accumulators d00-d59 (read-write)
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      "{%60,  %61,  %62,  %63},"  // a00-a03
-      " %64,"  // desc_b
-      " %65, %66,"  // e, spsel
-      " p,    %68,  %69;\n"  // p (derived from scale_D), scaleA, scaleB
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x136x64 TN F16+=E4M3*E5M2 -- SS variant: A and B are both passed as uint64_t descriptors (desc_a, desc_b)
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the asm as an immediate ("n") operand, right after e
->
-struct GMMA_64x136x64_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;  // no separate D operand: fma() updates the accumulators in place ("+r" operands)
-  using ARegisters = uint64_t[1];  // matches desc_a
-  using ERegisters = uint32_t[1];  // matches e; NOTE(review): presumably the sparsity metadata -- confirm
-  using BRegisters = uint64_t[1];  // matches desc_b
-  using CRegisters = uint32_t[34];  // matches the 34 accumulator words d00..d33
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %38, 0;\n"  // %38 is scale_D: p = (scale_D != 0)
-      "wgmma.mma_async.sp.sync.aligned.m64n136k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%33: accumulators d00-d33 (read-write)
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33},"
-      " %34,"  // desc_a
-      " %35,"  // desc_b
-      " %36, %37,"  // e, spsel
-      " p,   %39, %40;\n"  // p (derived from scale_D), scaleA, scaleB
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x136x64 TN F16+=E4M3*E5M2 -- RS variant: A is passed as four uint32_t values (a00-a03), B as a uint64_t descriptor (desc_b)
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the asm as an immediate ("n") operand, right after e
->
-struct GMMA_64x136x64_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;  // no separate D operand: fma() updates the accumulators in place ("+r" operands)
-  using ARegisters = uint32_t[4];  // matches a00..a03
-  using ERegisters = uint32_t[1];  // matches e; NOTE(review): presumably the sparsity metadata -- confirm
-  using BRegisters = uint64_t[1];  // matches desc_b
-  using CRegisters = uint32_t[34];  // matches the 34 accumulator words d00..d33
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %41, 0;\n"  // %41 is scale_D: p = (scale_D != 0)
-      "wgmma.mma_async.sp.sync.aligned.m64n136k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%33: accumulators d00-d33 (read-write)
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33},"
-      "{%34, %35, %36, %37},"  // a00-a03
-      " %38,"  // desc_b
-      " %39, %40,"  // e, spsel
-      " p,   %42, %43;\n"  // p (derived from scale_D), scaleA, scaleB
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x136x64 TN F32+=E4M3*E5M2 -- SS variant: A and B are both passed as uint64_t descriptors (desc_a, desc_b)
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the asm as an immediate ("n") operand, right after e
->
-struct GMMA_64x136x64_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;  // no separate D operand: fma() updates the accumulators in place ("+f" operands)
-  using ARegisters = uint64_t[1];  // matches desc_a
-  using ERegisters = uint32_t[1];  // matches e; NOTE(review): presumably the sparsity metadata -- confirm
-  using BRegisters = uint64_t[1];  // matches desc_b
-  using CRegisters = float[68];  // matches the 68 accumulators d00..d67
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %72, 0;\n"  // %72 is scale_D: p = (scale_D != 0)
-      "wgmma.mma_async.sp.sync.aligned.m64n136k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%67: accumulators d00-d67 (read-write)
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67},"
-      " %68,"  // desc_a
-      " %69,"  // desc_b
-      " %70, %71,"  // e, spsel
-      " p,    %73,  %74;\n"  // p (derived from scale_D), scaleA, scaleB
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x136x64 TN F32+=E4M3*E5M2 -- RS variant: A is passed as four uint32_t values (a00-a03), B as a uint64_t descriptor (desc_b)
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the asm as an immediate ("n") operand, right after e
->
-struct GMMA_64x136x64_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;  // no separate D operand: fma() updates the accumulators in place ("+f" operands)
-  using ARegisters = uint32_t[4];  // matches a00..a03
-  using ERegisters = uint32_t[1];  // matches e; NOTE(review): presumably the sparsity metadata -- confirm
-  using BRegisters = uint64_t[1];  // matches desc_b
-  using CRegisters = float[68];  // matches the 68 accumulators d00..d67
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %75, 0;\n"  // %75 is scale_D: p = (scale_D != 0)
-      "wgmma.mma_async.sp.sync.aligned.m64n136k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%67: accumulators d00-d67 (read-write)
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67},"
-      "{%68,  %69,  %70,  %71},"  // a00-a03
-      " %72,"  // desc_b
-      " %73, %74,"  // e, spsel
-      " p,    %76,  %77;\n"  // p (derived from scale_D), scaleA, scaleB
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x64 TN F16+=E4M3*E5M2 -- SS variant: A and B are both passed as uint64_t descriptors (desc_a, desc_b)
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the asm as an immediate ("n") operand, right after e
->
-struct GMMA_64x144x64_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;  // no separate D operand: fma() updates the accumulators in place ("+r" operands)
-  using ARegisters = uint64_t[1];  // matches desc_a
-  using ERegisters = uint32_t[1];  // matches e; NOTE(review): presumably the sparsity metadata -- confirm
-  using BRegisters = uint64_t[1];  // matches desc_b
-  using CRegisters = uint32_t[36];  // matches the 36 accumulator words d00..d35
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %40, 0;\n"  // %40 is scale_D: p = (scale_D != 0)
-      "wgmma.mma_async.sp.sync.aligned.m64n144k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%35: accumulators d00-d35 (read-write)
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      " %36,"  // desc_a
-      " %37,"  // desc_b
-      " %38, %39,"  // e, spsel
-      " p,   %41, %42;\n"  // p (derived from scale_D), scaleA, scaleB
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x64 TN F16+=E4M3*E5M2 -- RS variant: A is passed as four uint32_t values (a00-a03), B as a uint64_t descriptor (desc_b)
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the asm as an immediate ("n") operand, right after e
->
-struct GMMA_64x144x64_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;  // no separate D operand: fma() updates the accumulators in place ("+r" operands)
-  using ARegisters = uint32_t[4];  // matches a00..a03
-  using ERegisters = uint32_t[1];  // matches e; NOTE(review): presumably the sparsity metadata -- confirm
-  using BRegisters = uint64_t[1];  // matches desc_b
-  using CRegisters = uint32_t[36];  // matches the 36 accumulator words d00..d35
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %43, 0;\n"  // %43 is scale_D: p = (scale_D != 0)
-      "wgmma.mma_async.sp.sync.aligned.m64n144k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "  // %0-%35: accumulators d00-d35 (read-write)
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      "{%36, %37, %38, %39},"  // a00-a03
-      " %40,"  // desc_b
-      " %41, %42,"  // e, spsel
-      " p,   %44, %45;\n"  // p (derived from scale_D), scaleA, scaleB
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x64 TN F32+=E4M3*E5M2 -- SS variant: A and B are both passed as uint64_t descriptors (desc_a, desc_b)
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,  // passed to the asm as an immediate ("n") operand
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero  // passed to the asm as an immediate ("n") operand, right after e
->
-struct GMMA_64x144x64_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;  // no separate D operand: fma() updates the accumulators in place ("+f" operands)
-  using ARegisters = uint64_t[1];  // matches desc_a
-  using ERegisters = uint32_t[1];  // matches e; NOTE(review): presumably the sparsity metadata -- confirm
-  using BRegisters = uint64_t[1];  // matches desc_b
-  using CRegisters = float[72];  // matches the 72 accumulators d00..d71
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)  // runtime value; compared against 0 to form predicate p
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %76, 0;\n"  // %76 is scale_D: p = (scale_D != 0)
-      "wgmma.mma_async.sp.sync.aligned.m64n144k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "  // %0-%71: accumulators d00-d71 (read-write)
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      " %72,"  // desc_a
-      " %73,"  // desc_b
-      " %74, %75,"  // e, spsel
-      " p,    %77,  %78;\n"  // p (derived from scale_D), scaleA, scaleB
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else  // CUTE_ARCH_MMA_SM90A_ENABLED is not defined
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x144x64_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %79, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n144k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      "{%72,  %73,  %74,  %75},"
-      " %76,"
-      " %77, %78,"
-      " p,    %80,  %81;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x152x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x152x64_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[38];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %42, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n152k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37},"
-      " %38,"
-      " %39,"
-      " %40, %41,"
-      " p,   %43, %44;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x152x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x152x64_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[38];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %45, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n152k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37},"
-      "{%38, %39, %40, %41},"
-      " %42,"
-      " %43, %44,"
-      " p,   %46, %47;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x152x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x152x64_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[76];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %80, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n152k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75},"
-      " %76,"
-      " %77,"
-      " %78, %79,"
-      " p,    %81,  %82;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x152x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x152x64_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[76];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %83, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n152k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75},"
-      "{%76,  %77,  %78,  %79},"
-      " %80,"
-      " %81, %82,"
-      " p,    %84,  %85;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x64_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %44, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      " %40,"
-      " %41,"
-      " %42, %43,"
-      " p,   %45, %46;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x64_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %47, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"
-      " %44,"
-      " %45, %46,"
-      " p,   %48, %49;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x64_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %84, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      " %80,"
-      " %81,"
-      " %82, %83,"
-      " p,    %85,  %86;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x64_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %87, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      "{%80,  %81,  %82,  %83},"
-      " %84,"
-      " %85, %86,"
-      " p,    %88,  %89;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x168x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x168x64_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[42];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %46, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n168k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41},"
-      " %42,"
-      " %43,"
-      " %44, %45,"
-      " p,   %47, %48;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x168x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x168x64_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[42];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %49, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n168k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41},"
-      "{%42, %43, %44, %45},"
-      " %46,"
-      " %47, %48,"
-      " p,   %50, %51;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x168x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x168x64_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[84];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %88, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n168k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83},"
-      " %84,"
-      " %85,"
-      " %86, %87,"
-      " p,    %89,  %90;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x168x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x168x64_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[84];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %91, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n168k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83},"
-      "{%84,  %85,  %86,  %87},"
-      " %88,"
-      " %89, %90,"
-      " p,    %92,  %93;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x64_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %48, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      " %44,"
-      " %45,"
-      " %46, %47,"
-      " p,   %49, %50;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x64_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %51, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      "{%44, %45, %46, %47},"
-      " %48,"
-      " %49, %50,"
-      " p,   %52, %53;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x64_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %92, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      " %88,"
-      " %89,"
-      " %90, %91,"
-      " p,    %93,  %94;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x64_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %95, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      "{%88,  %89,  %90,  %91},"
-      " %92,"
-      " %93, %94,"
-      " p,    %96,  %97;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x184x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x184x64_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[46];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %50, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n184k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45},"
-      " %46,"
-      " %47,"
-      " %48, %49,"
-      " p,   %51, %52;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x184x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x184x64_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[46];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %53, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n184k64.f16.e4m3.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45},"
-      "{%46, %47, %48, %49},"
-      " %50,"
-      " %51, %52,"
-      " p,   %54, %55;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x184x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x184x64_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[92];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %96, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n184k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91},"
-      " %92,"
-      " %93,"
-      " %94, %95,"
-      " p,    %97,  %98;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x184x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x184x64_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[92];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %99, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n184k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91},"
-      "{%92,  %93,  %94,  %95},"
-      " %96,"
-      " %97, %98,"
-      " p,    %100, %101;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x200x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x200x64_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[50];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %54, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n200k64.f16.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49},"
-      " %50,"
-      " %51,"
-      " %52, %53,"
-      " p,    %55,  %56;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x200x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x200x64_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[50];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %57, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n200k64.f16.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49},"
-      "{%50,  %51,  %52,  %53},"
-      " %54,"
-      " %55, %56,"
-      " p,    %58,  %59;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x200x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x200x64_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[100];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %104, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n200k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99},"
-      " %100,"
-      " %101,"
-      " %102, %103,"
-      " p,    %105, %106;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x200x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x200x64_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[100];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %107, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n200k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99},"
-      "{%100, %101, %102, %103},"
-      " %104,"
-      " %105, %106,"
-      " p,    %108, %109;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x208x64_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[52];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %56, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n208k64.f16.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      " %52,"
-      " %53,"
-      " %54, %55,"
-      " p,    %57,  %58;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x208x64_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[52];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %59, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n208k64.f16.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      "{%52,  %53,  %54,  %55},"
-      " %56,"
-      " %57, %58,"
-      " p,    %60,  %61;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x208x64_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %108, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n208k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      " %104,"
-      " %105,"
-      " %106, %107,"
-      " p,    %109, %110;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x208x64_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %111, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n208k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      "{%104, %105, %106, %107},"
-      " %108,"
-      " %109, %110,"
-      " p,    %112, %113;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x216x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x216x64_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[54];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %58, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n216k64.f16.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53},"
-      " %54,"
-      " %55,"
-      " %56, %57,"
-      " p,    %59,  %60;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x216x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x216x64_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[54];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %61, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n216k64.f16.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53},"
-      "{%54,  %55,  %56,  %57},"
-      " %58,"
-      " %59, %60,"
-      " p,    %62,  %63;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x216x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x216x64_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[108];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %112, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n216k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107},"
-      " %108,"
-      " %109,"
-      " %110, %111,"
-      " p,    %113, %114;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x216x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x216x64_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[108];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %115, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n216k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107},"
-      "{%108, %109, %110, %111},"
-      " %112,"
-      " %113, %114,"
-      " p,    %116, %117;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x64_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %60, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k64.f16.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      " %56,"
-      " %57,"
-      " %58, %59,"
-      " p,    %61,  %62;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x64_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %63, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k64.f16.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"
-      " %60,"
-      " %61, %62,"
-      " p,    %64,  %65;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x64_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %116, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      " %112,"
-      " %113,"
-      " %114, %115,"
-      " p,    %117, %118;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x64_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %119, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      "{%112, %113, %114, %115},"
-      " %116,"
-      " %117, %118,"
-      " p,    %120, %121;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x232x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x232x64_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[58];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %62, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n232k64.f16.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57},"
-      " %58,"
-      " %59,"
-      " %60, %61,"
-      " p,    %63,  %64;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x232x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x232x64_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[58];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %65, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n232k64.f16.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57},"
-      "{%58,  %59,  %60,  %61},"
-      " %62,"
-      " %63, %64,"
-      " p,    %66,  %67;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x232x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x232x64_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[116];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %120, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n232k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115},"
-      " %116,"
-      " %117,"
-      " %118, %119,"
-      " p,    %121, %122;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x232x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x232x64_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[116];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %123, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n232k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115},"
-      "{%116, %117, %118, %119},"
-      " %120,"
-      " %121, %122,"
-      " p,    %124, %125;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x64_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %64, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k64.f16.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      " %60,"
-      " %61,"
-      " %62, %63,"
-      " p,    %65,  %66;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x64_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %67, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k64.f16.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      "{%60,  %61,  %62,  %63},"
-      " %64,"
-      " %65, %66,"
-      " p,    %68,  %69;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x64_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %124, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      " %120,"
-      " %121,"
-      " %122, %123,"
-      " p,    %125, %126;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x64_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %127, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      "{%120, %121, %122, %123},"
-      " %124,"
-      " %125, %126,"
-      " p,    %128, %129;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x248x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x248x64_F16E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[62];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %66, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n248k64.f16.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61},"
-      " %62,"
-      " %63,"
-      " %64, %65,"
-      " p,    %67,  %68;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F16E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x248x64 TN F16+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x248x64_F16E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[62];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %69, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n248k64.f16.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61},"
-      "{%62,  %63,  %64,  %65},"
-      " %66,"
-      " %67, %68,"
-      " p,    %70,  %71;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F16E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x248x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x248x64_F32E4M3E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[124];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %128, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n248k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123},"
-      " %124,"
-      " %125,"
-      " %126, %127,"
-      " p,    %129, %130;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F32E4M3E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x248x64 TN F32+=E4M3*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x248x64_F32E4M3E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[124];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %131, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n248k64.f32.e4m3.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123},"
-      "{%124, %125, %126, %127},"
-      " %128,"
-      " %129, %130,"
-      " p,    %132, %133;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F32E4M3E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x64_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[6];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %10, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5},"
-      " %6,"
-      " %7,"
-      " %8, %9,"
-      " p,   %11, %12;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x64_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[6];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %13, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5},"
-      "{%6,  %7,  %8,  %9},"
-      " %10,"
-      " %11, %12,"
-      " p,   %14, %15;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x64_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %16, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k64.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      " %12,"
-      " %13,"
-      " %14, %15,"
-      " p,   %17, %18;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x64_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %19, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k64.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " %17, %18,"
-      " p,   %20, %21;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x40x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x40x64_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[10];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %14, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n40k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9},"
-      " %10,"
-      " %11,"
-      " %12, %13,"
-      " p,   %15, %16;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x40x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x40x64_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[10];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %17, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n40k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9},"
-      "{%10, %11, %12, %13},"
-      " %14,"
-      " %15, %16,"
-      " p,   %18, %19;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x40x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x40x64_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %24, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n40k64.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      " %20,"
-      " %21,"
-      " %22, %23,"
-      " p,   %25, %26;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x40x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x40x64_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %27, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n40k64.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      "{%20, %21, %22, %23},"
-      " %24,"
-      " %25, %26,"
-      " p,   %28, %29;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x64_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %16, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      " %12,"
-      " %13,"
-      " %14, %15,"
-      " p,   %17, %18;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x64_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %19, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " %17, %18,"
-      " p,   %20, %21;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x64_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %28, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k64.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      " %24,"
-      " %25,"
-      " %26, %27,"
-      " p,   %29, %30;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x64_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %31, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k64.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " %29, %30,"
-      " p,   %32, %33;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x56x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x56x64_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[14];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %18, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n56k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13},"
-      " %14,"
-      " %15,"
-      " %16, %17,"
-      " p,   %19, %20;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x56x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x56x64_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[14];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %21, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n56k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13},"
-      "{%14, %15, %16, %17},"
-      " %18,"
-      " %19, %20,"
-      " p,   %22, %23;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x56x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x56x64_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %32, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n56k64.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      " %28,"
-      " %29,"
-      " %30, %31,"
-      " p,   %33, %34;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x56x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x56x64_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %35, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n56k64.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      "{%28, %29, %30, %31},"
-      " %32,"
-      " %33, %34,"
-      " p,   %36, %37;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x72x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x72x64_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[18];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %22, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n72k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17},"
-      " %18,"
-      " %19,"
-      " %20, %21,"
-      " p,   %23, %24;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x72x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x72x64_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[18];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %25, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n72k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17},"
-      "{%18, %19, %20, %21},"
-      " %22,"
-      " %23, %24,"
-      " p,   %26, %27;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x72x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x72x64_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[36];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %40, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n72k64.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      " %36,"
-      " %37,"
-      " %38, %39,"
-      " p,   %41, %42;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x72x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x72x64_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[36];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %43, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n72k64.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      "{%36, %37, %38, %39},"
-      " %40,"
-      " %41, %42,"
-      " p,   %44, %45;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x64_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %24, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      " %20,"
-      " %21,"
-      " %22, %23,"
-      " p,   %25, %26;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x64_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %27, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      "{%20, %21, %22, %23},"
-      " %24,"
-      " %25, %26,"
-      " p,   %28, %29;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x64_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %44, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k64.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      " %40,"
-      " %41,"
-      " %42, %43,"
-      " p,   %45, %46;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x64_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %47, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k64.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"
-      " %44,"
-      " %45, %46,"
-      " p,   %48, %49;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x88x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x88x64_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[22];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %26, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n88k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21},"
-      " %22,"
-      " %23,"
-      " %24, %25,"
-      " p,   %27, %28;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x88x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x88x64_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[22];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %29, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n88k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21},"
-      "{%22, %23, %24, %25},"
-      " %26,"
-      " %27, %28,"
-      " p,   %30, %31;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x88x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x88x64_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %48, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n88k64.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      " %44,"
-      " %45,"
-      " %46, %47,"
-      " p,   %49, %50;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x88x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x88x64_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %51, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n88k64.f32.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      "{%44, %45, %46, %47},"
-      " %48,"
-      " %49, %50,"
-      " p,   %52, %53;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x104x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x104x64_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[26];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %30, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n104k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25},"
-      " %26,"
-      " %27,"
-      " %28, %29,"
-      " p,   %31, %32;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x104x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x104x64_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[26];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %33, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n104k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25},"
-      "{%26, %27, %28, %29},"
-      " %30,"
-      " %31, %32,"
-      " p,   %34, %35;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x104x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x104x64_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[52];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %56, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n104k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      " %52,"
-      " %53,"
-      " %54, %55,"
-      " p,    %57,  %58;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x104x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x104x64_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[52];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %59, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n104k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      "{%52,  %53,  %54,  %55},"
-      " %56,"
-      " %57, %58,"
-      " p,    %60,  %61;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x64_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %32, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      " %28,"
-      " %29,"
-      " %30, %31,"
-      " p,   %33, %34;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x64_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %35, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      "{%28, %29, %30, %31},"
-      " %32,"
-      " %33, %34,"
-      " p,   %36, %37;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x64_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %60, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      " %56,"
-      " %57,"
-      " %58, %59,"
-      " p,    %61,  %62;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x64_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %63, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"
-      " %60,"
-      " %61, %62,"
-      " p,    %64,  %65;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x120x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x120x64_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[30];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %34, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n120k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29},"
-      " %30,"
-      " %31,"
-      " %32, %33,"
-      " p,   %35, %36;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x120x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x120x64_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[30];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %37, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n120k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29},"
-      "{%30, %31, %32, %33},"
-      " %34,"
-      " %35, %36,"
-      " p,   %38, %39;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x120x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x120x64_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %64, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n120k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      " %60,"
-      " %61,"
-      " %62, %63,"
-      " p,    %65,  %66;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x120x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x120x64_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %67, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n120k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      "{%60,  %61,  %62,  %63},"
-      " %64,"
-      " %65, %66,"
-      " p,    %68,  %69;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x136x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x136x64_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[34];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %38, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n136k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33},"
-      " %34,"
-      " %35,"
-      " %36, %37,"
-      " p,   %39, %40;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x136x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x136x64_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[34];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %41, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n136k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33},"
-      "{%34, %35, %36, %37},"
-      " %38,"
-      " %39, %40,"
-      " p,   %42, %43;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x136x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x136x64_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[68];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %72, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n136k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67},"
-      " %68,"
-      " %69,"
-      " %70, %71,"
-      " p,    %73,  %74;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x136x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x136x64_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[68];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %75, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n136k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67},"
-      "{%68,  %69,  %70,  %71},"
-      " %72,"
-      " %73, %74,"
-      " p,    %76,  %77;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x144x64_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[36];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %40, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n144k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      " %36,"
-      " %37,"
-      " %38, %39,"
-      " p,   %41, %42;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x144x64_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[36];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %43, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n144k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      "{%36, %37, %38, %39},"
-      " %40,"
-      " %41, %42,"
-      " p,   %44, %45;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x144x64_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %76, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n144k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      " %72,"
-      " %73,"
-      " %74, %75,"
-      " p,    %77,  %78;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x144x64_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %79, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n144k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      "{%72,  %73,  %74,  %75},"
-      " %76,"
-      " %77, %78,"
-      " p,    %80,  %81;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x152x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x152x64_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[38];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %42, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n152k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37},"
-      " %38,"
-      " %39,"
-      " %40, %41,"
-      " p,   %43, %44;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x152x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x152x64_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[38];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %45, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n152k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37},"
-      "{%38, %39, %40, %41},"
-      " %42,"
-      " %43, %44,"
-      " p,   %46, %47;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x152x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x152x64_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[76];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %80, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n152k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75},"
-      " %76,"
-      " %77,"
-      " %78, %79,"
-      " p,    %81,  %82;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x152x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x152x64_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[76];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %83, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n152k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75},"
-      "{%76,  %77,  %78,  %79},"
-      " %80,"
-      " %81, %82,"
-      " p,    %84,  %85;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x64_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %44, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      " %40,"
-      " %41,"
-      " %42, %43,"
-      " p,   %45, %46;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x64_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %47, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"
-      " %44,"
-      " %45, %46,"
-      " p,   %48, %49;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x64_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %84, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      " %80,"
-      " %81,"
-      " %82, %83,"
-      " p,    %85,  %86;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x64_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %87, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      "{%80,  %81,  %82,  %83},"
-      " %84,"
-      " %85, %86,"
-      " p,    %88,  %89;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x168x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x168x64_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[42];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %46, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n168k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41},"
-      " %42,"
-      " %43,"
-      " %44, %45,"
-      " p,   %47, %48;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x168x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x168x64_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[42];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %49, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n168k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41},"
-      "{%42, %43, %44, %45},"
-      " %46,"
-      " %47, %48,"
-      " p,   %50, %51;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x168x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x168x64_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[84];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %88, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n168k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83},"
-      " %84,"
-      " %85,"
-      " %86, %87,"
-      " p,    %89,  %90;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x168x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x168x64_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[84];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %91, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n168k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83},"
-      "{%84,  %85,  %86,  %87},"
-      " %88,"
-      " %89, %90,"
-      " p,    %92,  %93;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x64_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %48, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      " %44,"
-      " %45,"
-      " %46, %47,"
-      " p,   %49, %50;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x64_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %51, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      "{%44, %45, %46, %47},"
-      " %48,"
-      " %49, %50,"
-      " p,   %52, %53;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x64_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %92, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      " %88,"
-      " %89,"
-      " %90, %91,"
-      " p,    %93,  %94;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x64_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %95, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      "{%88,  %89,  %90,  %91},"
-      " %92,"
-      " %93, %94,"
-      " p,    %96,  %97;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x184x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x184x64_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[46];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %50, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n184k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45},"
-      " %46,"
-      " %47,"
-      " %48, %49,"
-      " p,   %51, %52;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x184x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x184x64_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[46];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %53, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n184k64.f16.e5m2.e4m3 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45},"
-      "{%46, %47, %48, %49},"
-      " %50,"
-      " %51, %52,"
-      " p,   %54, %55;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x184x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x184x64_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[92];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %96, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n184k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91},"
-      " %92,"
-      " %93,"
-      " %94, %95,"
-      " p,    %97,  %98;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x184x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x184x64_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[92];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %99, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n184k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91},"
-      "{%92,  %93,  %94,  %95},"
-      " %96,"
-      " %97, %98,"
-      " p,    %100, %101;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x200x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x200x64_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[50];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %54, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n200k64.f16.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49},"
-      " %50,"
-      " %51,"
-      " %52, %53,"
-      " p,    %55,  %56;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x200x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x200x64_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[50];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %57, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n200k64.f16.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49},"
-      "{%50,  %51,  %52,  %53},"
-      " %54,"
-      " %55, %56,"
-      " p,    %58,  %59;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x200x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x200x64_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[100];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %104, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n200k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99},"
-      " %100,"
-      " %101,"
-      " %102, %103,"
-      " p,    %105, %106;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x200x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x200x64_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[100];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %107, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n200k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99},"
-      "{%100, %101, %102, %103},"
-      " %104,"
-      " %105, %106,"
-      " p,    %108, %109;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x208x64_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[52];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %56, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n208k64.f16.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      " %52,"
-      " %53,"
-      " %54, %55,"
-      " p,    %57,  %58;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x208x64_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[52];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %59, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n208k64.f16.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      "{%52,  %53,  %54,  %55},"
-      " %56,"
-      " %57, %58,"
-      " p,    %60,  %61;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x208x64_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %108, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n208k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      " %104,"
-      " %105,"
-      " %106, %107,"
-      " p,    %109, %110;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x208x64_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %111, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n208k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      "{%104, %105, %106, %107},"
-      " %108,"
-      " %109, %110,"
-      " p,    %112, %113;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x216x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x216x64_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[54];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %58, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n216k64.f16.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53},"
-      " %54,"
-      " %55,"
-      " %56, %57,"
-      " p,    %59,  %60;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x216x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x216x64_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[54];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %61, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n216k64.f16.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53},"
-      "{%54,  %55,  %56,  %57},"
-      " %58,"
-      " %59, %60,"
-      " p,    %62,  %63;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x216x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x216x64_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[108];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %112, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n216k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107},"
-      " %108,"
-      " %109,"
-      " %110, %111,"
-      " p,    %113, %114;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x216x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x216x64_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[108];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %115, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n216k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107},"
-      "{%108, %109, %110, %111},"
-      " %112,"
-      " %113, %114,"
-      " p,    %116, %117;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x64_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %60, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k64.f16.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      " %56,"
-      " %57,"
-      " %58, %59,"
-      " p,    %61,  %62;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x64_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %63, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k64.f16.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"
-      " %60,"
-      " %61, %62,"
-      " p,    %64,  %65;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x64_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %116, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      " %112,"
-      " %113,"
-      " %114, %115,"
-      " p,    %117, %118;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x64_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %119, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      "{%112, %113, %114, %115},"
-      " %116,"
-      " %117, %118,"
-      " p,    %120, %121;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x232x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x232x64_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[58];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %62, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n232k64.f16.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57},"
-      " %58,"
-      " %59,"
-      " %60, %61,"
-      " p,    %63,  %64;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x232x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x232x64_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[58];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %65, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n232k64.f16.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57},"
-      "{%58,  %59,  %60,  %61},"
-      " %62,"
-      " %63, %64,"
-      " p,    %66,  %67;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x232x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x232x64_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[116];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %120, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n232k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115},"
-      " %116,"
-      " %117,"
-      " %118, %119,"
-      " p,    %121, %122;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x232x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x232x64_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[116];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %123, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n232k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115},"
-      "{%116, %117, %118, %119},"
-      " %120,"
-      " %121, %122,"
-      " p,    %124, %125;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x64_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %64, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k64.f16.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      " %60,"
-      " %61,"
-      " %62, %63,"
-      " p,    %65,  %66;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x64_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %67, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k64.f16.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      "{%60,  %61,  %62,  %63},"
-      " %64,"
-      " %65, %66,"
-      " p,    %68,  %69;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x64_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %124, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      " %120,"
-      " %121,"
-      " %122, %123,"
-      " p,    %125, %126;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x64_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %127, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      "{%120, %121, %122, %123},"
-      " %124,"
-      " %125, %126,"
-      " p,    %128, %129;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x248x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x248x64_F16E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[62];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %66, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n248k64.f16.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61},"
-      " %62,"
-      " %63,"
-      " %64, %65,"
-      " p,    %67,  %68;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F16E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x248x64 TN F16+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x248x64_F16E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[62];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %69, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n248k64.f16.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61},"
-      "{%62,  %63,  %64,  %65},"
-      " %66,"
-      " %67, %68,"
-      " p,    %70,  %71;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F16E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x248x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x248x64_F32E5M2E4M3_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[124];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %128, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n248k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123},"
-      " %124,"
-      " %125,"
-      " %126, %127,"
-      " p,    %129, %130;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F32E5M2E4M3_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x248x64 TN F32+=E5M2*E4M3
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x248x64_F32E5M2E4M3_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[124];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %131, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n248k64.f32.e5m2.e4m3 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123},"
-      "{%124, %125, %126, %127},"
-      " %128,"
-      " %129, %130,"
-      " p,    %132, %133;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F32E5M2E4M3_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x64_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[6];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %10, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5},"
-      " %6,"
-      " %7,"
-      " %8, %9,"
-      " p,   %11, %12;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x64_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[6];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
-      uint64_t const& desc_b,
-      uint32_t      & d0, uint32_t      & d1, uint32_t      & d2, uint32_t      & d3,
-      uint32_t      & d4, uint32_t      & d5,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %13, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5},"
-      "{%6,  %7,  %8,  %9},"
-      " %10,"
-      " %11, %12,"
-      " p,   %14, %15;\n"
-    "}\n"
-      : "+r"(d0), "+r"(d1), "+r"(d2), "+r"(d3),
-        "+r"(d4), "+r"(d5)
-      :  "r"(a0),  "r"(a1),  "r"(a2),  "r"(a3),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x64_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %16, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k64.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      " %12,"
-      " %13,"
-      " %14, %15,"
-      " p,   %17, %18;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x24x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x24x64_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %19, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n24k64.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " %17, %18,"
-      " p,   %20, %21;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x24x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x40x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x40x64_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[10];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %14, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n40k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9},"
-      " %10,"
-      " %11,"
-      " %12, %13,"
-      " p,   %15, %16;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x40x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x40x64_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[10];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %17, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n40k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9},"
-      "{%10, %11, %12, %13},"
-      " %14,"
-      " %15, %16,"
-      " p,   %18, %19;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x40x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x40x64_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %24, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n40k64.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      " %20,"
-      " %21,"
-      " %22, %23,"
-      " p,   %25, %26;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x40x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x40x64_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %27, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n40k64.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      "{%20, %21, %22, %23},"
-      " %24,"
-      " %25, %26,"
-      " p,   %28, %29;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x40x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x64_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %16, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      " %12,"
-      " %13,"
-      " %14, %15,"
-      " p,   %17, %18;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x64_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[12];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %19, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11},"
-      "{%12, %13, %14, %15},"
-      " %16,"
-      " %17, %18,"
-      " p,   %20, %21;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x64_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %28, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k64.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      " %24,"
-      " %25,"
-      " %26, %27,"
-      " p,   %29, %30;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x48x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x48x64_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[24];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %31, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n48k64.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23},"
-      "{%24, %25, %26, %27},"
-      " %28,"
-      " %29, %30,"
-      " p,   %32, %33;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x48x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x56x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x56x64_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[14];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %18, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n56k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13},"
-      " %14,"
-      " %15,"
-      " %16, %17,"
-      " p,   %19, %20;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x56x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x56x64_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[14];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %21, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n56k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13},"
-      "{%14, %15, %16, %17},"
-      " %18,"
-      " %19, %20,"
-      " p,   %22, %23;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x56x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x56x64_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %32, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n56k64.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      " %28,"
-      " %29,"
-      " %30, %31,"
-      " p,   %33, %34;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x56x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x56x64_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %35, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n56k64.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      "{%28, %29, %30, %31},"
-      " %32,"
-      " %33, %34,"
-      " p,   %36, %37;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x56x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x72x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x72x64_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[18];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %22, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n72k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17},"
-      " %18,"
-      " %19,"
-      " %20, %21,"
-      " p,   %23, %24;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x72x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x72x64_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[18];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %25, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n72k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17},"
-      "{%18, %19, %20, %21},"
-      " %22,"
-      " %23, %24,"
-      " p,   %26, %27;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x72x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x72x64_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[36];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %40, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n72k64.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      " %36,"
-      " %37,"
-      " %38, %39,"
-      " p,   %41, %42;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x72x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x72x64_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[36];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %43, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n72k64.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      "{%36, %37, %38, %39},"
-      " %40,"
-      " %41, %42,"
-      " p,   %44, %45;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x72x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x64_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %24, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      " %20,"
-      " %21,"
-      " %22, %23,"
-      " p,   %25, %26;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x64_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[20];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %27, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19},"
-      "{%20, %21, %22, %23},"
-      " %24,"
-      " %25, %26,"
-      " p,   %28, %29;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x64_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %44, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k64.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      " %40,"
-      " %41,"
-      " %42, %43,"
-      " p,   %45, %46;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x80x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x80x64_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %47, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n80k64.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"
-      " %44,"
-      " %45, %46,"
-      " p,   %48, %49;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x80x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x88x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x88x64_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[22];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %26, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n88k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21},"
-      " %22,"
-      " %23,"
-      " %24, %25,"
-      " p,   %27, %28;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x88x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x88x64_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[22];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %29, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n88k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21},"
-      "{%22, %23, %24, %25},"
-      " %26,"
-      " %27, %28,"
-      " p,   %30, %31;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x88x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x88x64_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %48, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n88k64.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      " %44,"
-      " %45,"
-      " %46, %47,"
-      " p,   %49, %50;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x88x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x88x64_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %51, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n88k64.f32.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      "{%44, %45, %46, %47},"
-      " %48,"
-      " %49, %50,"
-      " p,   %52, %53;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x88x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x104x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x104x64_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[26];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %30, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n104k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25},"
-      " %26,"
-      " %27,"
-      " %28, %29,"
-      " p,   %31, %32;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x104x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x104x64_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[26];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %33, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n104k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25},"
-      "{%26, %27, %28, %29},"
-      " %30,"
-      " %31, %32,"
-      " p,   %34, %35;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x104x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x104x64_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[52];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %56, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n104k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      " %52,"
-      " %53,"
-      " %54, %55,"
-      " p,    %57,  %58;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x104x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x104x64_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[52];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %59, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n104k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      "{%52,  %53,  %54,  %55},"
-      " %56,"
-      " %57, %58,"
-      " p,    %60,  %61;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x104x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x64_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %32, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      " %28,"
-      " %29,"
-      " %30, %31,"
-      " p,   %33, %34;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x64_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[28];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %35, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27},"
-      "{%28, %29, %30, %31},"
-      " %32,"
-      " %33, %34,"
-      " p,   %36, %37;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x64_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %60, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      " %56,"
-      " %57,"
-      " %58, %59,"
-      " p,    %61,  %62;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x112x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x112x64_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %63, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n112k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"
-      " %60,"
-      " %61, %62,"
-      " p,    %64,  %65;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x112x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x120x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x120x64_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[30];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %34, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n120k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29},"
-      " %30,"
-      " %31,"
-      " %32, %33,"
-      " p,   %35, %36;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x120x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x120x64_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[30];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %37, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n120k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29},"
-      "{%30, %31, %32, %33},"
-      " %34,"
-      " %35, %36,"
-      " p,   %38, %39;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x120x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x120x64_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %64, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n120k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      " %60,"
-      " %61,"
-      " %62, %63,"
-      " p,    %65,  %66;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x120x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x120x64_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %67, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n120k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      "{%60,  %61,  %62,  %63},"
-      " %64,"
-      " %65, %66,"
-      " p,    %68,  %69;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x120x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x136x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x136x64_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[34];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %38, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n136k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33},"
-      " %34,"
-      " %35,"
-      " %36, %37,"
-      " p,   %39, %40;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x136x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x136x64_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[34];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %41, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n136k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33},"
-      "{%34, %35, %36, %37},"
-      " %38,"
-      " %39, %40,"
-      " p,   %42, %43;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x136x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x136x64_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[68];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %72, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n136k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67},"
-      " %68,"
-      " %69,"
-      " %70, %71,"
-      " p,    %73,  %74;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x136x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x136x64_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[68];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %75, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n136k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67},"
-      "{%68,  %69,  %70,  %71},"
-      " %72,"
-      " %73, %74,"
-      " p,    %76,  %77;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x136x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x144x64_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[36];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %40, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n144k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      " %36,"
-      " %37,"
-      " %38, %39,"
-      " p,   %41, %42;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x144x64_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[36];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %43, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n144k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35},"
-      "{%36, %37, %38, %39},"
-      " %40,"
-      " %41, %42,"
-      " p,   %44, %45;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x144x64_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %76, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n144k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      " %72,"
-      " %73,"
-      " %74, %75,"
-      " p,    %77,  %78;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x144x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x144x64_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[72];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %79, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n144k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71},"
-      "{%72,  %73,  %74,  %75},"
-      " %76,"
-      " %77, %78,"
-      " p,    %80,  %81;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x144x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x152x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x152x64_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[38];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %42, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n152k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37},"
-      " %38,"
-      " %39,"
-      " %40, %41,"
-      " p,   %43, %44;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x152x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x152x64_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[38];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %45, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n152k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37},"
-      "{%38, %39, %40, %41},"
-      " %42,"
-      " %43, %44,"
-      " p,   %46, %47;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x152x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x152x64_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[76];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %80, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n152k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75},"
-      " %76,"
-      " %77,"
-      " %78, %79,"
-      " p,    %81,  %82;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x152x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x152x64_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[76];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %83, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n152k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75},"
-      "{%76,  %77,  %78,  %79},"
-      " %80,"
-      " %81, %82,"
-      " p,    %84,  %85;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x152x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x64_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %44, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      " %40,"
-      " %41,"
-      " %42, %43,"
-      " p,   %45, %46;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x64_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[40];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %47, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39},"
-      "{%40, %41, %42, %43},"
-      " %44,"
-      " %45, %46,"
-      " p,   %48, %49;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x64_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %84, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      " %80,"
-      " %81,"
-      " %82, %83,"
-      " p,    %85,  %86;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x160x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x160x64_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[80];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %87, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n160k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79},"
-      "{%80,  %81,  %82,  %83},"
-      " %84,"
-      " %85, %86,"
-      " p,    %88,  %89;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x160x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x168x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x168x64_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[42];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %46, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n168k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41},"
-      " %42,"
-      " %43,"
-      " %44, %45,"
-      " p,   %47, %48;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x168x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x168x64_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[42];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %49, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n168k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41},"
-      "{%42, %43, %44, %45},"
-      " %46,"
-      " %47, %48,"
-      " p,   %50, %51;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x168x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x168x64_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[84];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %88, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n168k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83},"
-      " %84,"
-      " %85,"
-      " %86, %87,"
-      " p,    %89,  %90;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x168x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x168x64_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[84];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %91, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n168k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83},"
-      "{%84,  %85,  %86,  %87},"
-      " %88,"
-      " %89, %90,"
-      " p,    %92,  %93;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x168x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x64_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %48, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      " %44,"
-      " %45,"
-      " %46, %47,"
-      " p,   %49, %50;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x64_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[44];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %51, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43},"
-      "{%44, %45, %46, %47},"
-      " %48,"
-      " %49, %50,"
-      " p,   %52, %53;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x64_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %92, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      " %88,"
-      " %89,"
-      " %90, %91,"
-      " p,    %93,  %94;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x176x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x176x64_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[88];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %95, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n176k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87},"
-      "{%88,  %89,  %90,  %91},"
-      " %92,"
-      " %93, %94,"
-      " p,    %96,  %97;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x176x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x184x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x184x64_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[46];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %50, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n184k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45},"
-      " %46,"
-      " %47,"
-      " %48, %49,"
-      " p,   %51, %52;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x184x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x184x64_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[46];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %53, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n184k64.f16.e5m2.e5m2 "
-      "{%0,  %1,  %2,  %3,  %4,  %5,  %6,  %7,  "
-      " %8,  %9,  %10, %11, %12, %13, %14, %15, "
-      " %16, %17, %18, %19, %20, %21, %22, %23, "
-      " %24, %25, %26, %27, %28, %29, %30, %31, "
-      " %32, %33, %34, %35, %36, %37, %38, %39, "
-      " %40, %41, %42, %43, %44, %45},"
-      "{%46, %47, %48, %49},"
-      " %50,"
-      " %51, %52,"
-      " p,   %54, %55;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x184x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x184x64_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[92];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %96, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n184k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91},"
-      " %92,"
-      " %93,"
-      " %94, %95,"
-      " p,    %97,  %98;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x184x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x184x64_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[92];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      float         & d00, float         & d01, float         & d02, float         & d03,
-      float         & d04, float         & d05, float         & d06, float         & d07,
-      float         & d08, float         & d09, float         & d10, float         & d11,
-      float         & d12, float         & d13, float         & d14, float         & d15,
-      float         & d16, float         & d17, float         & d18, float         & d19,
-      float         & d20, float         & d21, float         & d22, float         & d23,
-      float         & d24, float         & d25, float         & d26, float         & d27,
-      float         & d28, float         & d29, float         & d30, float         & d31,
-      float         & d32, float         & d33, float         & d34, float         & d35,
-      float         & d36, float         & d37, float         & d38, float         & d39,
-      float         & d40, float         & d41, float         & d42, float         & d43,
-      float         & d44, float         & d45, float         & d46, float         & d47,
-      float         & d48, float         & d49, float         & d50, float         & d51,
-      float         & d52, float         & d53, float         & d54, float         & d55,
-      float         & d56, float         & d57, float         & d58, float         & d59,
-      float         & d60, float         & d61, float         & d62, float         & d63,
-      float         & d64, float         & d65, float         & d66, float         & d67,
-      float         & d68, float         & d69, float         & d70, float         & d71,
-      float         & d72, float         & d73, float         & d74, float         & d75,
-      float         & d76, float         & d77, float         & d78, float         & d79,
-      float         & d80, float         & d81, float         & d82, float         & d83,
-      float         & d84, float         & d85, float         & d86, float         & d87,
-      float         & d88, float         & d89, float         & d90, float         & d91,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %99, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n184k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91},"
-      "{%92,  %93,  %94,  %95},"
-      " %96,"
-      " %97, %98,"
-      " p,    %100, %101;\n"
-    "}\n"
-      : "+f"(d00), "+f"(d01), "+f"(d02), "+f"(d03),
-        "+f"(d04), "+f"(d05), "+f"(d06), "+f"(d07),
-        "+f"(d08), "+f"(d09), "+f"(d10), "+f"(d11),
-        "+f"(d12), "+f"(d13), "+f"(d14), "+f"(d15),
-        "+f"(d16), "+f"(d17), "+f"(d18), "+f"(d19),
-        "+f"(d20), "+f"(d21), "+f"(d22), "+f"(d23),
-        "+f"(d24), "+f"(d25), "+f"(d26), "+f"(d27),
-        "+f"(d28), "+f"(d29), "+f"(d30), "+f"(d31),
-        "+f"(d32), "+f"(d33), "+f"(d34), "+f"(d35),
-        "+f"(d36), "+f"(d37), "+f"(d38), "+f"(d39),
-        "+f"(d40), "+f"(d41), "+f"(d42), "+f"(d43),
-        "+f"(d44), "+f"(d45), "+f"(d46), "+f"(d47),
-        "+f"(d48), "+f"(d49), "+f"(d50), "+f"(d51),
-        "+f"(d52), "+f"(d53), "+f"(d54), "+f"(d55),
-        "+f"(d56), "+f"(d57), "+f"(d58), "+f"(d59),
-        "+f"(d60), "+f"(d61), "+f"(d62), "+f"(d63),
-        "+f"(d64), "+f"(d65), "+f"(d66), "+f"(d67),
-        "+f"(d68), "+f"(d69), "+f"(d70), "+f"(d71),
-        "+f"(d72), "+f"(d73), "+f"(d74), "+f"(d75),
-        "+f"(d76), "+f"(d77), "+f"(d78), "+f"(d79),
-        "+f"(d80), "+f"(d81), "+f"(d82), "+f"(d83),
-        "+f"(d84), "+f"(d85), "+f"(d86), "+f"(d87),
-        "+f"(d88), "+f"(d89), "+f"(d90), "+f"(d91)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x184x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x200x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x200x64_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[50];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %54, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n200k64.f16.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49},"
-      " %50,"
-      " %51,"
-      " %52, %53,"
-      " p,    %55,  %56;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x200x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x200x64_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[50];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %57, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n200k64.f16.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49},"
-      "{%50,  %51,  %52,  %53},"
-      " %54,"
-      " %55, %56,"
-      " p,    %58,  %59;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x200x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x200x64_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[100];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %104, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n200k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99},"
-      " %100,"
-      " %101,"
-      " %102, %103,"
-      " p,    %105, %106;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x200x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x200x64_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[100];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %107, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n200k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99},"
-      "{%100, %101, %102, %103},"
-      " %104,"
-      " %105, %106,"
-      " p,    %108, %109;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x200x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x208x64_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[52];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %56, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n208k64.f16.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      " %52,"
-      " %53,"
-      " %54, %55,"
-      " p,    %57,  %58;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x208x64_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[52];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %59, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n208k64.f16.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51},"
-      "{%52,  %53,  %54,  %55},"
-      " %56,"
-      " %57, %58,"
-      " p,    %60,  %61;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x208x64_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %108, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n208k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      " %104,"
-      " %105,"
-      " %106, %107,"
-      " p,    %109, %110;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x208x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x208x64_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[104];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %111, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n208k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103},"
-      "{%104, %105, %106, %107},"
-      " %108,"
-      " %109, %110,"
-      " p,    %112, %113;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x208x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x216x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x216x64_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[54];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %58, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n216k64.f16.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53},"
-      " %54,"
-      " %55,"
-      " %56, %57,"
-      " p,    %59,  %60;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x216x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x216x64_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[54];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %61, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n216k64.f16.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53},"
-      "{%54,  %55,  %56,  %57},"
-      " %58,"
-      " %59, %60,"
-      " p,    %62,  %63;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x216x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x216x64_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[108];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %112, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n216k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107},"
-      " %108,"
-      " %109,"
-      " %110, %111,"
-      " p,    %113, %114;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x216x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x216x64_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[108];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %115, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n216k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107},"
-      "{%108, %109, %110, %111},"
-      " %112,"
-      " %113, %114,"
-      " p,    %116, %117;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x216x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x64_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %60, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k64.f16.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      " %56,"
-      " %57,"
-      " %58, %59,"
-      " p,    %61,  %62;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x64_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[56];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %63, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k64.f16.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55},"
-      "{%56,  %57,  %58,  %59},"
-      " %60,"
-      " %61, %62,"
-      " p,    %64,  %65;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x64_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %116, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      " %112,"
-      " %113,"
-      " %114, %115,"
-      " p,    %117, %118;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x224x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x224x64_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[112];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %119, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n224k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111},"
-      "{%112, %113, %114, %115},"
-      " %116,"
-      " %117, %118,"
-      " p,    %120, %121;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x224x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x232x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x232x64_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[58];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %62, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n232k64.f16.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57},"
-      " %58,"
-      " %59,"
-      " %60, %61,"
-      " p,    %63,  %64;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x232x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x232x64_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[58];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %65, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n232k64.f16.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57},"
-      "{%58,  %59,  %60,  %61},"
-      " %62,"
-      " %63, %64,"
-      " p,    %66,  %67;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x232x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x232x64_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[116];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %120, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n232k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115},"
-      " %116,"
-      " %117,"
-      " %118, %119,"
-      " p,    %121, %122;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x232x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x232x64_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[116];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %123, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n232k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115},"
-      "{%116, %117, %118, %119},"
-      " %120,"
-      " %121, %122,"
-      " p,    %124, %125;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x232x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x64_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %64, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k64.f16.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      " %60,"
-      " %61,"
-      " %62, %63,"
-      " p,    %65,  %66;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x64_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[60];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %67, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k64.f16.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59},"
-      "{%60,  %61,  %62,  %63},"
-      " %64,"
-      " %65, %66,"
-      " p,    %68,  %69;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x64_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %124, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      " %120,"
-      " %121,"
-      " %122, %123,"
-      " p,    %125, %126;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x240x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x240x64_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[120];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %127, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n240k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119},"
-      "{%120, %121, %122, %123},"
-      " %124,"
-      " %125, %126,"
-      " p,    %128, %129;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x240x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x248x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x248x64_F16E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[62];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %66, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n248k64.f16.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61},"
-      " %62,"
-      " %63,"
-      " %64, %65,"
-      " p,    %67,  %68;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F16E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x248x64 TN F16+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x248x64_F16E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = uint32_t[62];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a00, uint32_t const& a01, uint32_t const& a02, uint32_t const& a03,
-      uint64_t const& desc_b,
-      uint32_t      & d00, uint32_t      & d01, uint32_t      & d02, uint32_t      & d03,
-      uint32_t      & d04, uint32_t      & d05, uint32_t      & d06, uint32_t      & d07,
-      uint32_t      & d08, uint32_t      & d09, uint32_t      & d10, uint32_t      & d11,
-      uint32_t      & d12, uint32_t      & d13, uint32_t      & d14, uint32_t      & d15,
-      uint32_t      & d16, uint32_t      & d17, uint32_t      & d18, uint32_t      & d19,
-      uint32_t      & d20, uint32_t      & d21, uint32_t      & d22, uint32_t      & d23,
-      uint32_t      & d24, uint32_t      & d25, uint32_t      & d26, uint32_t      & d27,
-      uint32_t      & d28, uint32_t      & d29, uint32_t      & d30, uint32_t      & d31,
-      uint32_t      & d32, uint32_t      & d33, uint32_t      & d34, uint32_t      & d35,
-      uint32_t      & d36, uint32_t      & d37, uint32_t      & d38, uint32_t      & d39,
-      uint32_t      & d40, uint32_t      & d41, uint32_t      & d42, uint32_t      & d43,
-      uint32_t      & d44, uint32_t      & d45, uint32_t      & d46, uint32_t      & d47,
-      uint32_t      & d48, uint32_t      & d49, uint32_t      & d50, uint32_t      & d51,
-      uint32_t      & d52, uint32_t      & d53, uint32_t      & d54, uint32_t      & d55,
-      uint32_t      & d56, uint32_t      & d57, uint32_t      & d58, uint32_t      & d59,
-      uint32_t      & d60, uint32_t      & d61,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %69, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n248k64.f16.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61},"
-      "{%62,  %63,  %64,  %65},"
-      " %66,"
-      " %67, %68,"
-      " p,    %70,  %71;\n"
-    "}\n"
-      : "+r"(d00), "+r"(d01), "+r"(d02), "+r"(d03),
-        "+r"(d04), "+r"(d05), "+r"(d06), "+r"(d07),
-        "+r"(d08), "+r"(d09), "+r"(d10), "+r"(d11),
-        "+r"(d12), "+r"(d13), "+r"(d14), "+r"(d15),
-        "+r"(d16), "+r"(d17), "+r"(d18), "+r"(d19),
-        "+r"(d20), "+r"(d21), "+r"(d22), "+r"(d23),
-        "+r"(d24), "+r"(d25), "+r"(d26), "+r"(d27),
-        "+r"(d28), "+r"(d29), "+r"(d30), "+r"(d31),
-        "+r"(d32), "+r"(d33), "+r"(d34), "+r"(d35),
-        "+r"(d36), "+r"(d37), "+r"(d38), "+r"(d39),
-        "+r"(d40), "+r"(d41), "+r"(d42), "+r"(d43),
-        "+r"(d44), "+r"(d45), "+r"(d46), "+r"(d47),
-        "+r"(d48), "+r"(d49), "+r"(d50), "+r"(d51),
-        "+r"(d52), "+r"(d53), "+r"(d54), "+r"(d55),
-        "+r"(d56), "+r"(d57), "+r"(d58), "+r"(d59),
-        "+r"(d60), "+r"(d61)
-      :  "r"(a00),  "r"(a01),  "r"(a02),  "r"(a03),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F16E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x248x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x248x64_F32E5M2E5M2_SS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint64_t[1];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[124];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint64_t const& desc_a,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_smem_smem(__LINE__, desc_a, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %128, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n248k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123},"
-      " %124,"
-      " %125,"
-      " %126, %127,"
-      " p,    %129, %130;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
-      :  "l"(desc_a),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F32E5M2E5M2_SS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SPARSE GMMA 64x248x64 TN F32+=E5M2*E5M2
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One,
-  GMMA::SparseSel spsel = GMMA::SparseSel::Zero
->
-struct GMMA_64x248x64_F32E5M2E5M2_RS_TN
-{
-  using DRegisters = void;
-  using ARegisters = uint32_t[4];
-  using ERegisters = uint32_t[1];
-  using BRegisters = uint64_t[1];
-  using CRegisters = float[124];
-
-  CUTE_HOST_DEVICE static void
-  fma(uint32_t const& a000, uint32_t const& a001, uint32_t const& a002, uint32_t const& a003,
-      uint64_t const& desc_b,
-      float         & d000, float         & d001, float         & d002, float         & d003,
-      float         & d004, float         & d005, float         & d006, float         & d007,
-      float         & d008, float         & d009, float         & d010, float         & d011,
-      float         & d012, float         & d013, float         & d014, float         & d015,
-      float         & d016, float         & d017, float         & d018, float         & d019,
-      float         & d020, float         & d021, float         & d022, float         & d023,
-      float         & d024, float         & d025, float         & d026, float         & d027,
-      float         & d028, float         & d029, float         & d030, float         & d031,
-      float         & d032, float         & d033, float         & d034, float         & d035,
-      float         & d036, float         & d037, float         & d038, float         & d039,
-      float         & d040, float         & d041, float         & d042, float         & d043,
-      float         & d044, float         & d045, float         & d046, float         & d047,
-      float         & d048, float         & d049, float         & d050, float         & d051,
-      float         & d052, float         & d053, float         & d054, float         & d055,
-      float         & d056, float         & d057, float         & d058, float         & d059,
-      float         & d060, float         & d061, float         & d062, float         & d063,
-      float         & d064, float         & d065, float         & d066, float         & d067,
-      float         & d068, float         & d069, float         & d070, float         & d071,
-      float         & d072, float         & d073, float         & d074, float         & d075,
-      float         & d076, float         & d077, float         & d078, float         & d079,
-      float         & d080, float         & d081, float         & d082, float         & d083,
-      float         & d084, float         & d085, float         & d086, float         & d087,
-      float         & d088, float         & d089, float         & d090, float         & d091,
-      float         & d092, float         & d093, float         & d094, float         & d095,
-      float         & d096, float         & d097, float         & d098, float         & d099,
-      float         & d100, float         & d101, float         & d102, float         & d103,
-      float         & d104, float         & d105, float         & d106, float         & d107,
-      float         & d108, float         & d109, float         & d110, float         & d111,
-      float         & d112, float         & d113, float         & d114, float         & d115,
-      float         & d116, float         & d117, float         & d118, float         & d119,
-      float         & d120, float         & d121, float         & d122, float         & d123,
-      uint32_t const& e,
-      GMMA::ScaleOut const scale_D = GMMA::ScaleOut::One)
-  {
-#if defined(CUTE_ARCH_MMA_SM90A_ENABLED)
-    cutlass::arch::synclog_emit_wgmma_reg_smem(__LINE__, desc_b);
-    asm volatile(
-    "{\n"
-      ".reg .pred p;\n"
-      "setp.ne.b32 p, %131, 0;\n"
-      "wgmma.mma_async.sp.sync.aligned.m64n248k64.f32.e5m2.e5m2 "
-      "{%0,   %1,   %2,   %3,   %4,   %5,   %6,   %7,   "
-      " %8,   %9,   %10,  %11,  %12,  %13,  %14,  %15,  "
-      " %16,  %17,  %18,  %19,  %20,  %21,  %22,  %23,  "
-      " %24,  %25,  %26,  %27,  %28,  %29,  %30,  %31,  "
-      " %32,  %33,  %34,  %35,  %36,  %37,  %38,  %39,  "
-      " %40,  %41,  %42,  %43,  %44,  %45,  %46,  %47,  "
-      " %48,  %49,  %50,  %51,  %52,  %53,  %54,  %55,  "
-      " %56,  %57,  %58,  %59,  %60,  %61,  %62,  %63,  "
-      " %64,  %65,  %66,  %67,  %68,  %69,  %70,  %71,  "
-      " %72,  %73,  %74,  %75,  %76,  %77,  %78,  %79,  "
-      " %80,  %81,  %82,  %83,  %84,  %85,  %86,  %87,  "
-      " %88,  %89,  %90,  %91,  %92,  %93,  %94,  %95,  "
-      " %96,  %97,  %98,  %99,  %100, %101, %102, %103, "
-      " %104, %105, %106, %107, %108, %109, %110, %111, "
-      " %112, %113, %114, %115, %116, %117, %118, %119, "
-      " %120, %121, %122, %123},"
-      "{%124, %125, %126, %127},"
-      " %128,"
-      " %129, %130,"
-      " p,    %132, %133;\n"
-    "}\n"
-      : "+f"(d000), "+f"(d001), "+f"(d002), "+f"(d003),
-        "+f"(d004), "+f"(d005), "+f"(d006), "+f"(d007),
-        "+f"(d008), "+f"(d009), "+f"(d010), "+f"(d011),
-        "+f"(d012), "+f"(d013), "+f"(d014), "+f"(d015),
-        "+f"(d016), "+f"(d017), "+f"(d018), "+f"(d019),
-        "+f"(d020), "+f"(d021), "+f"(d022), "+f"(d023),
-        "+f"(d024), "+f"(d025), "+f"(d026), "+f"(d027),
-        "+f"(d028), "+f"(d029), "+f"(d030), "+f"(d031),
-        "+f"(d032), "+f"(d033), "+f"(d034), "+f"(d035),
-        "+f"(d036), "+f"(d037), "+f"(d038), "+f"(d039),
-        "+f"(d040), "+f"(d041), "+f"(d042), "+f"(d043),
-        "+f"(d044), "+f"(d045), "+f"(d046), "+f"(d047),
-        "+f"(d048), "+f"(d049), "+f"(d050), "+f"(d051),
-        "+f"(d052), "+f"(d053), "+f"(d054), "+f"(d055),
-        "+f"(d056), "+f"(d057), "+f"(d058), "+f"(d059),
-        "+f"(d060), "+f"(d061), "+f"(d062), "+f"(d063),
-        "+f"(d064), "+f"(d065), "+f"(d066), "+f"(d067),
-        "+f"(d068), "+f"(d069), "+f"(d070), "+f"(d071),
-        "+f"(d072), "+f"(d073), "+f"(d074), "+f"(d075),
-        "+f"(d076), "+f"(d077), "+f"(d078), "+f"(d079),
-        "+f"(d080), "+f"(d081), "+f"(d082), "+f"(d083),
-        "+f"(d084), "+f"(d085), "+f"(d086), "+f"(d087),
-        "+f"(d088), "+f"(d089), "+f"(d090), "+f"(d091),
-        "+f"(d092), "+f"(d093), "+f"(d094), "+f"(d095),
-        "+f"(d096), "+f"(d097), "+f"(d098), "+f"(d099),
-        "+f"(d100), "+f"(d101), "+f"(d102), "+f"(d103),
-        "+f"(d104), "+f"(d105), "+f"(d106), "+f"(d107),
-        "+f"(d108), "+f"(d109), "+f"(d110), "+f"(d111),
-        "+f"(d112), "+f"(d113), "+f"(d114), "+f"(d115),
-        "+f"(d116), "+f"(d117), "+f"(d118), "+f"(d119),
-        "+f"(d120), "+f"(d121), "+f"(d122), "+f"(d123)
-      :  "r"(a000),  "r"(a001),  "r"(a002),  "r"(a003),
-         "l"(desc_b),
-         "r"(e), "n"(int32_t(spsel)),
-         "r"(int32_t(scale_D)), "n"(int32_t(scaleA)), "n"(int32_t(scaleB)));
-#else
-    CUTE_INVALID_CONTROL_PATH("Attempting to use SM90::GMMA::SPARSE::GMMA_64x248x64_F32E5M2E5M2_RS_TN without CUTE_ARCH_MMA_SM90A_ENABLED");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace SM90::GMMA::SPARSE
-
-} // namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/arch/util.hpp b/lightllm-kernel/cutlass/include/cute/arch/util.hpp
deleted file mode 100755
index 3749a9c25..000000000
--- a/lightllm-kernel/cutlass/include/cute/arch/util.hpp
+++ /dev/null
@@ -1,320 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>
-#include <cute/numeric/integer_sequence.hpp>
-
-#if defined(__clang__) && defined(__CUDA__)
-  //  __cvta_generic_to_shared was added in Clang 14: https://reviews.llvm.org/D111665
-  #if __clang_major__ >= 14
-    #define CUTE_CLANG_SUPPORTS_CVTA_GENERIC_TO_SHARED 1
-  #endif
-
-  // __nvvm_get_smem_pointer added in Clang 14: https://reviews.llvm.org/D111665
-  // ... but will not work on Windows until Clang 15: https://reviews.llvm.org/D122897
-  #if (!defined(_WIN32) && __clang_major__ >= 14) || __clang_major__ >= 15
-    #define CUTE_CLANG_SUPPORTS_NVVM_GET_SMEM_POINTER 1
-  #endif
-#endif
-
-#if defined(__NVCC__) || defined(__CUDACC_RTC__)
-  // __cvta_generic_to_shared added in CUDA 11+
-  #if __CUDACC_VER_MAJOR__ >= 11
-    #define CUTE_NVCC_SUPPORTS_CVTA_GENERIC_TO_SHARED 1
-  #endif
-
-  // __nvvm_get_smem_pointer added in CUDA 10.2
-  #if __CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2
-    #define CUTE_NVCC_SUPPORTS_NVVM_GET_SMEM_POINTER 1
-  #endif
-#endif
-
-#if CUTE_NVCC_SUPPORTS_CVTA_GENERIC_TO_SHARED || CUTE_CLANG_SUPPORTS_CVTA_GENERIC_TO_SHARED
-  #define CUTE_CVTA_GENERIC_TO_SHARED_SUPPORTED 1
-#endif
-
-#if !defined(CUTE_CVTA_GENERIC_TO_SHARED_ACTIVATED) && CUTE_CVTA_GENERIC_TO_SHARED_SUPPORTED && defined(__CUDA_ARCH__)
-  #define CUTE_CVTA_GENERIC_TO_SHARED_ACTIVATED 1
-#endif
-
-#if CUTE_NVCC_SUPPORTS_NVVM_GET_SMEM_POINTER || CUTE_CLANG_SUPPORTS_NVVM_GET_SMEM_POINTER
-  #define CUTE_NVVM_GET_SMEM_POINTER_SUPPORTED 1
-#endif
-
-#if !defined(CUTE_NVVM_GET_SMEM_POINTER_ACTIVATED) && CUTE_NVVM_GET_SMEM_POINTER_SUPPORTED && defined(__CUDA_ARCH__)
-  #define CUTE_NVVM_GET_SMEM_POINTER_ACTIVATED 1
-#endif
-
-// Clang 14+ provides a declaration of __nvvm_get_smem_pointer, so we only need
-// to provide one for NVCC
-#if CUTE_NVCC_SUPPORTS_NVVM_GET_SMEM_POINTER
-  extern "C" {
-  // This NVVM intrinsic is subject to change in future versions of CUDA.
-  // Clients should not call it directly.
-  CUTE_DEVICE uint32_t __nvvm_get_smem_pointer(void*);
-  }
-#endif
-
-namespace cute
-{
-
-/// CUTE helper to cast SMEM pointer to unsigned
-CUTE_DEVICE
-uint32_t
-cast_smem_ptr_to_uint(void const* const ptr)
-{
-// We prefer to use the new CVTA intrinsics if they are available, otherwise we will fall back to
-// the previous internal intrinsics if they are available.
-#if CUTE_CVTA_GENERIC_TO_SHARED_ACTIVATED
-  //
-  // This NVVM intrinsic converts an address in shared memory to a plain
-  // unsigned integer. This is necessary to pass to shared memory instructions
-  // in inline PTX.
-  //
-  // In CUDA 11 and beyond, this replaces __nvvm_get_smem_pointer()  [only available in 10.2].
-  //
-  //__device__ size_t __cvta_generic_to_shared(void* ptr);
-
-  /// CUTE helper to get SMEM pointer
-  return static_cast<uint32_t>(__cvta_generic_to_shared(ptr));
-
-#elif CUTE_NVVM_GET_SMEM_POINTER_ACTIVATED
-
-  return __nvvm_get_smem_pointer(ptr);
-
-#elif defined(__CUDA_ARCH__)
-
-  uint32_t smem_ptr;
-
-  asm(
-  "{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %1; cvt.u32.u64 %0, smem_ptr; }\n"
-    : "=r"(smem_ptr) : "l"(ptr));
-
-  return smem_ptr;
-
-#else
-
-
-  (void) ptr;
-  printf("ERROR: cast_smem_ptr_to_uint not supported but used.\n");
-  return 0;
-
-#endif
-}
-
-namespace detail {
-
-//
-// Wrapper for MMAOp::fma
-//
-
-template <class MmaOp>
-struct CallFMA {
-  template <class... Args>
-  CUTE_HOST_DEVICE constexpr void
-  operator()(Args&&... args) const {
-    return MmaOp::fma(static_cast<Args&&>(args)...);
-  }
-};
-
-//
-// Wrapper for CopyOp::copy
-//
-
-template <class CopyOp>
-struct CallCOPY {
-  template <class... Args>
-  CUTE_HOST_DEVICE constexpr void
-  operator()(Args&&... args) const {
-    return CopyOp::copy(static_cast<Args&&>(args)...);
-  }
-};
-
-//
-// Utility for exploding pointers/arrays/tensors into functions
-//
-
-template <class Fn,
-          class PtrA, int... I>
-CUTE_HOST_DEVICE constexpr
-void
-explode(Fn fn,
-        PtrA&& a, int_sequence<I...>)
-{
-  return fn(a[I]...);
-}
-
-template <class Fn,
-          class PtrS, int... Is,
-          class PtrD, int... Id>
-CUTE_HOST_DEVICE constexpr
-void
-explode(Fn fn,
-        PtrS&& s, int_sequence<Is...>,
-        PtrD&& d, int_sequence<Id...>)
-{
-  return fn(s[Is]..., d[Id]...);
-}
-
-template <class Fn,
-          class PtrA, int... Ia,
-          class PtrB, int... Ib,
-          class PtrC, int... Ic>
-CUTE_HOST_DEVICE constexpr
-void
-explode(Fn fn,
-        PtrA&& a, int_sequence<Ia...>,
-        PtrB&& b, int_sequence<Ib...>,
-        PtrC&& c, int_sequence<Ic...>)
-{
-  return fn(a[Ia]..., b[Ib]..., c[Ic]...);
-}
-
-template <class Fn,
-          class PtrD, int... Id,
-          class PtrA, int... Ia,
-          class PtrB, int... Ib,
-          class PtrC, int... Ic>
-CUTE_HOST_DEVICE constexpr
-void
-explode(Fn fn,
-        PtrD&& d, int_sequence<Id...>,
-        PtrA&& a, int_sequence<Ia...>,
-        PtrB&& b, int_sequence<Ib...>,
-        PtrC&& c, int_sequence<Ic...>)
-{
-  return fn(d[Id]..., a[Ia]..., b[Ib]..., c[Ic]...);
-}
-
-template <class Fn,
-          class PtrD, int... Id,
-          class PtrA, int... Ia,
-          class PtrB, int... Ib,
-          class PtrC, int... Ic,
-          class PtrE, int... Ie>
-CUTE_HOST_DEVICE constexpr
-void
-explode(Fn fn,
-        PtrD&& d, int_sequence<Id...>,
-        PtrA&& a, int_sequence<Ia...>,
-        PtrB&& b, int_sequence<Ib...>,
-        PtrC&& c, int_sequence<Ic...>,
-        PtrE&& e, int_sequence<Ie...>)
-{
-  return fn(d[Id]..., a[Ia]..., b[Ib]..., c[Ic]..., e[Ie]...);
-}
-
-template <class Fn,
-          class PtrD, int... Id,
-          class PtrA, int... Ia,
-          class PtrB, int... Ib,
-          class PtrC, int... Ic,
-          class PtrE, int... Ie,
-          class PtrF, int... If>
-CUTE_HOST_DEVICE constexpr
-void
-explode(Fn fn,
-        PtrD&& d, int_sequence<Id...>,
-        PtrA&& a, int_sequence<Ia...>,
-        PtrB&& b, int_sequence<Ib...>,
-        PtrC&& c, int_sequence<Ic...>,
-        PtrE&& e, int_sequence<Ie...>,
-        PtrF&& f, int_sequence<If...>)
-{
-  return fn(d[Id]..., a[Ia]..., b[Ib]..., c[Ic]..., e[Ie]..., f[If]...);
-}
-
-template <class Fn,
-          class PtrD, int... Id,
-          class PtrA, int... Ia,
-          class PtrB, int... Ib,
-          class PtrC, int... Ic,
-          class PtrE, int... Ie,
-          class PtrF, int... If,
-          class PtrG, int... Ig>
-CUTE_HOST_DEVICE constexpr
-void
-explode(Fn fn,
-        PtrD&& d, int_sequence<Id...>,
-        PtrA&& a, int_sequence<Ia...>,
-        PtrB&& b, int_sequence<Ib...>,
-        PtrC&& c, int_sequence<Ic...>,
-        PtrE&& e, int_sequence<Ie...>,
-        PtrF&& f, int_sequence<If...>,
-        PtrG&& g, int_sequence<Ig...>)
-{
-  return fn(d[Id]..., a[Ia]..., b[Ib]..., c[Ic]..., e[Ie]..., f[If]..., g[Ig]...);
-}
-
-//
-// Utility for exploding tuples into functions
-//
-
-template <class Fn,
-          class TupleA, int... I>
-CUTE_HOST_DEVICE constexpr
-void
-explode_tuple(Fn fn,
-              TupleA&& a, int_sequence<I...>)
-{
-  return fn(get<I>(a)...);
-}
-
-template <class Fn,
-          class TupleA, int... Ia,
-          class TupleB, int... Ib>
-CUTE_HOST_DEVICE constexpr
-void
-explode_tuple(Fn fn,
-              TupleA&& a, int_sequence<Ia...>,
-              TupleB&& b, int_sequence<Ib...>)
-{
-  return fn(get<Ia>(a)..., get<Ib>(b)...);
-}
-
-template <class Fn,
-          class TupleA, int... Ia,
-          class TupleB, int... Ib,
-          class TupleC, int... Ic>
-CUTE_HOST_DEVICE constexpr
-void
-explode_tuple(Fn fn,
-              TupleA&& a, int_sequence<Ia...>,
-              TupleB&& b, int_sequence<Ib...>,
-              TupleC&& c, int_sequence<Ic...>)
-{
-  return fn(get<Ia>(a)..., get<Ib>(b)..., get<Ic>(c)...);
-}
-
-} // end namespace detail
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/atom/copy_atom.hpp b/lightllm-kernel/cutlass/include/cute/atom/copy_atom.hpp
deleted file mode 100755
index dd6b4e52a..000000000
--- a/lightllm-kernel/cutlass/include/cute/atom/copy_atom.hpp
+++ /dev/null
@@ -1,764 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>                     // CUTE_HOST_DEVICE
-#include <cute/tensor_impl.hpp>                // cute::Tensor
-#include <cute/util/type_traits.hpp>           // cute::__CUTE_REQUIRES
-#include <cute/container/tuple.hpp>            // cute::is_tuple
-#include <cute/numeric/integral_constant.hpp>  // cute::is_constant, cute::is_integral
-#include <cute/atom/copy_traits.hpp>           // cute::Copy_Traits
-#include <cute/atom/mma_atom.hpp>              // cute::TiledMMA
-
-namespace cute
-{
-
-template <class... Args>
-struct Copy_Atom;
-
-template <class CopyOperation, class CopyInternalType>
-struct Copy_Atom<CopyOperation, CopyInternalType> : Copy_Atom<Copy_Traits<CopyOperation>, CopyInternalType>
-{};
-
-template <class... Args, class CopyInternalType>
-struct Copy_Atom<Copy_Traits<Args...>, CopyInternalType>
-  : Copy_Traits<Args...>
-{
-  using Traits = Copy_Traits<Args...>;
-
-  // Bit and Thr layouts from the Copy_Traits
-  using ThrID        = typename Traits::ThrID;
-  using BitLayoutSrc = typename Traits::SrcLayout;
-  using BitLayoutDst = typename Traits::DstLayout;
-  using BitLayoutRef = typename Traits::RefLayout;
-
-  using ValType = CopyInternalType;
-
-  using ValLayoutSrc = decltype(recast_layout<uint1_t, ValType>(BitLayoutSrc{}));
-  using ValLayoutDst = decltype(recast_layout<uint1_t, ValType>(BitLayoutDst{}));
-  using ValLayoutRef = decltype(recast_layout<uint1_t, ValType>(BitLayoutRef{}));
-
-  CUTE_STATIC_ASSERT_V(size<0>(ValLayoutSrc{}) == size(ThrID{}), "CopyOperation is not valid for Src of ValType.");
-  CUTE_STATIC_ASSERT_V(size<0>(ValLayoutDst{}) == size(ThrID{}), "CopyOperation is not valid for Dst of ValType.");
-  CUTE_STATIC_ASSERT_V(size<0>(ValLayoutRef{}) == size(ThrID{}), "CopyOperation is not valid for Ref of ValType.");
-
-  static constexpr int NumValSrc = size<1>(ValLayoutSrc{});
-  static constexpr int NumValDst = size<1>(ValLayoutDst{});
-
-  // Additional Trait parameters/transformations
-  template <class... TraitsArgs>
-  CUTE_HOST_DEVICE
-  auto
-  with(TraitsArgs&&... args) const {
-    auto traits = Traits::with(static_cast<TraitsArgs&&>(args)...);
-    return Copy_Atom<decltype(traits), CopyInternalType>{traits};
-  }
-
-  //
-  // Tensor call interfaces
-  //
-
-  // Check and call instruction, or recurse
-  template <class SEngine, class SLayout,
-            class DEngine, class DLayout>
-  CUTE_HOST_DEVICE
-  void
-  call(Tensor<SEngine,SLayout> const& src,
-       Tensor<DEngine,DLayout>      & dst) const
-  {
-    static_assert(SLayout::rank == 1, "Expected rank-1 src tensor");
-    static_assert(DLayout::rank == 1, "Expected rank-1 dst tensor");
-
-    if constexpr (is_constant<NumValSrc, decltype(size(src))>::value ||
-                  is_constant<NumValDst, decltype(size(dst))>::value) {
-      // Dispatch to unpack to execute instruction
-      return copy_unpack(*this, src, dst);
-    } else
-    if constexpr (is_tuple<decltype(shape(src))>::value &&
-                  is_tuple<decltype(shape(dst))>::value) {
-      // If the size of the src/dst doesn't match the instruction,
-      //   recurse this rank-1 layout by peeling off the mode
-      //   ((A,B,C,...)) -> (A,B,C,...)
-      return copy(*this, tensor<0>(src), tensor<0>(dst));
-    } else {
-      static_assert(dependent_false<SEngine>, "No instruction match and no recursion possible.");
-    }
-  }
-
-  // Accept mutable temporaries
-  template <class SEngine, class SLayout,
-            class DEngine, class DLayout>
-  CUTE_HOST_DEVICE
-  void
-  call(Tensor<SEngine,SLayout> const& src,
-       Tensor<DEngine,DLayout>     && dst) const
-  {
-    return call(src, dst);
-  }
-};
-
-//
-// A tiling of copy atoms
-//
-
-template <class TiledCopy, class ThrIdx>
-struct ThrCopy;
-
-template <class Copy_Atom,
-          class LayoutCopy_TV,  // (tid,vid) -> coord   [Need not be 2D...]
-          class ShapeTiler_MN>  // coord space
-struct TiledCopy : Copy_Atom
-{
-  // Layout information from the CopyAtom
-  using AtomThrID     = typename Copy_Atom::ThrID;        // thrid -> thr_idx
-  using AtomLayoutSrc = typename Copy_Atom::ValLayoutSrc; // (thr,val) -> offset
-  using AtomLayoutDst = typename Copy_Atom::ValLayoutDst; // (thr,val) -> offset
-  using AtomLayoutRef = typename Copy_Atom::ValLayoutRef; // (thr,val) -> offset
-
-  using AtomNumThr = decltype(size<0>(AtomLayoutRef{}));
-  using AtomNumVal = decltype(size<1>(AtomLayoutRef{}));
-
-  // Layout information for the TiledCopy
-  using Tiler_MN       = ShapeTiler_MN;
-  using TiledLayout_TV = LayoutCopy_TV;
-  using TiledNumThr    = decltype(size<0>(TiledLayout_TV{}));
-  using TiledNumVal    = decltype(size<1>(TiledLayout_TV{}));
-
-  CUTE_STATIC_ASSERT_V(TiledNumThr{} % AtomNumThr{} == Int<0>{}, "TiledCopy uses too few thrs for selected CopyAtom");
-  CUTE_STATIC_ASSERT_V(TiledNumVal{} % AtomNumVal{} == Int<0>{}, "TiledCopy uses too few vals for selected CopyAtom");
-
-  // Tile a tensor or a layout from shape
-  //   (M,N,...)
-  // to shape
-  //   ((ThrV,ThrX),FrgV,(RestM,RestN,...))
-  // where
-  //   ThrV:  The threads local to a COPY_ATOM Src.
-  //   ThrX:  The threads tiled across COPY_ATOMs Src.
-  //   FrgV:  The values local to a COPY_ATOM Src.
-  //   RestM: The values tiled in M.
-  //   RestN: The values tiled in N.
-  template <class STensor>
-  CUTE_HOST_DEVICE constexpr static
-  auto
-  tidfrg_S(STensor&& stensor)
-  {
-    CUTE_STATIC_ASSERT_V(rank(stensor) >= rank(Tiler_MN{}), "Rank of tensor to be partitioned too small.");
-
-    // Tile the stensor and compute the (src-thr, src-val) -> (ref-thr, ref-val) layout
-    return tile2thrfrg(zipped_divide(stensor,Tiler_MN{}), right_inverse(AtomLayoutRef{}).compose(AtomLayoutSrc{}));
-  }
-
-  // Tile a tensor or a layout from shape
-  //   (M,N,...)
-  // to shape
-  //   ((ThrV,ThrX),FrgV,(RestM,RestN,...))
-  // where
-  //   ThrV:  The threads local to a COPY_ATOM Dst.
-  //   ThrX:  The threads tiled across COPY_ATOMs Dst.
-  //   FrgV:  The values local to a COPY_ATOM Dst.
-  //   RestM: The values tiled in M.
-  //   RestN: The values tiled in N.
-  template <class DTensor>
-  CUTE_HOST_DEVICE constexpr static
-  auto
-  tidfrg_D(DTensor&& dtensor)
-  {
-    CUTE_STATIC_ASSERT_V(rank(dtensor) >= rank(Tiler_MN{}), "Rank of tensor to be partitioned too small.");
-
-    // Tile the dtensor and compute the (dst-thr, dst-val) -> (ref-thr, ref-val) layout
-    return tile2thrfrg(zipped_divide(dtensor,Tiler_MN{}), right_inverse(AtomLayoutRef{}).compose(AtomLayoutDst{}));
-  }
-
-  // Tile a tensor or a layout from shape
-  //   ((TileM,TileN,...), (RestM,RestN,...))
-  // to shape
-  //   ((ThrV,ThrX),FrgV,(RestM,RestN,...))
-  template <class Tensor, class Ref2TrgLayout>
-  CUTE_HOST_DEVICE constexpr static
-  auto
-  tile2thrfrg(Tensor&& tensor, Ref2TrgLayout const& ref2trg)
-  {
-    // Take the thrs/vals that the atom is interested in
-    // NOTE: Assumes the AtomNumThr are contiguous and identity within TiledThrID
-    auto atom_layout_TV = zipped_divide(TiledLayout_TV{}, make_shape(AtomNumThr{}, AtomNumVal{}));
-    // ((atom_tid,atom_val),(rest_tid,rest_val)) -> (m,n)
-
-    // Transform to the trg layout
-    auto trg_layout_TV = atom_layout_TV.compose(ref2trg, _);
-    // ((trg_tid,trg_val),(rest_tid,rest_val)) -> (m,n)
-
-    // Transform the thrs mode from thrid to thr_idx
-    // NOTE: Assumes the AtomNumThr are contiguous and identity within TiledThrID
-    auto thrval2mn = coalesce(zip(trg_layout_TV), Shape<_1,Shape<_1,_1>>{});
-    // ((trg_tid,rest_tid),(trg_val,rest_val)) -> (m,n)
-
-    /// ==================
-
-    // Transform the tile mode
-    auto tv_tensor = tensor.compose(thrval2mn, _);
-    // ((thrid,val),(RestM,RestN,...))
-
-    // Unfold and return
-    return tv_tensor(make_coord(_,_), _);
-  }
-
-  // retile_S and retile_D assume they are working with the reference layout -- they are the same
-  template <class Tensor>
-  CUTE_HOST_DEVICE constexpr static
-  auto
-  retile(Tensor&& tensor)
-  {
-    constexpr int R = remove_cvref_t<Tensor>::rank;
-    // Assert that AtomLayoutSrc|Dst is identity so we can skip the Ref transformation
-
-    // Assume the first size<0>(tensor) elements are the first val_ids in TiledLayout_TV.
-    // Then, we only need the shape+layout of those size<0>(tensor) elements in TiledLayout_TV
-    //   and that shape is what we gather from the other modes of tensor
-
-    auto V = size<0>(tensor);
-
-    auto frg_layout_mn = upcast<TiledNumThr{} * V>(right_inverse(TiledLayout_TV{}).with_shape(shape(Tiler_MN{})));
-    // (m,n) -> v_idx -- The shape and order of the V inside of TiledLayout_TV
-
-    auto frg_layout_v = zipped_divide(logical_product(make_layout(V), right_inverse(frg_layout_mn)), make_layout(AtomNumVal{}));
-    // (atom_vals,rest_vals) -> (v,m,n)
-
-    /// =======
-
-    // Tile the tensor for TileFrg
-    auto t_tensor = zipped_divide(tensor, prepend(product_each(shape(frg_layout_mn)), V));
-    // ((TileV,TileM,TileN,...),(1,RestM,RestN,...))
-
-    // Transform the tile mode
-    auto v_tensor = t_tensor.compose(frg_layout_v, _);
-    // ((atom_vals,rest_vals),(1,RM,RN,...))
-
-    // Unfold and return
-    return v_tensor(_, append<R>(Int<0>{},_));
-  }
-
-  CUTE_HOST_DEVICE constexpr static
-  auto
-  get_layoutS_TV()
-  {
-    // (M,N) -> (M,N)
-    auto ref_S = make_layout(make_shape(shape(Tiler_MN{}), Int<1>{}));
-    // (thr_idx,val_idx) -> (M,N)
-    return tile2thrfrg(ref_S, right_inverse(AtomLayoutRef{}).compose(AtomLayoutSrc{}))(_,_,Int<0>{});
-  }
-
-  CUTE_HOST_DEVICE constexpr static
-  auto
-  get_layoutS_MN()
-  {
-    // (thr_idx,val_idx) -> (M,N)
-    auto layoutS_TV = get_layoutS_TV();
-    // (M,K) -> (thr_idx,val_idx)
-    auto layoutS_MK = right_inverse(layoutS_TV).with_shape(shape(Tiler_MN{}));
-
-    // athrid = (v,m,k) -> thr_idx
-    auto thrID_S = make_layout(size<0>(TiledLayout_TV{}));
-
-    return cute::make_tuple(layoutS_MK, thrID_S);
-  }
-
-  CUTE_HOST_DEVICE constexpr static
-  auto
-  get_layoutD_TV()
-  {
-    // (M,N) -> (M,N)
-    auto ref_D = make_layout(make_shape(shape(Tiler_MN{}), Int<1>{}));
-    // (thr_idx,val_idx) -> (M,N)
-    return tile2thrfrg(ref_D, right_inverse(AtomLayoutRef{}).compose(AtomLayoutDst{}))(_,_,Int<0>{});
-  }
-
-  CUTE_HOST_DEVICE constexpr static
-  auto
-  get_layoutD_MN()
-  {
-    // (thr_idx,val_idx) -> (M,N)
-    auto layoutD_TV = get_layoutD_TV();
-    // (M,K) -> (thr_idx,val_idx)
-    auto layoutD_MK = right_inverse(layoutD_TV).with_shape(shape(Tiler_MN{}));
-
-    // athrid = (v,m,k) -> thr_idx
-    auto thrID_D = make_layout(size<0>(TiledLayout_TV{}));
-
-    return cute::make_tuple(layoutD_MK, thrID_D);
-  }
-
-  template <class ThrIdx,
-            __CUTE_REQUIRES(is_integral<ThrIdx>::value)>
-  CUTE_HOST_DEVICE static
-  auto
-  get_slice(ThrIdx const& thr_idx)
-  {
-    return ThrCopy<TiledCopy, ThrIdx>(thr_idx);
-  }
-
-  template <class ThrIdx,
-            __CUTE_REQUIRES(is_integral<ThrIdx>::value)>
-  CUTE_HOST_DEVICE  static
-  auto
-  get_thread_slice(ThrIdx const& thr_idx)
-  {
-    return get_slice(thr_idx);
-  }
-};
-
-template <class TiledCopy, class ThrIdx>
-struct ThrCopy
-{
-  ThrIdx thr_idx_;
-
-  CUTE_HOST_DEVICE
-  ThrCopy(ThrIdx const& thr_idx) : thr_idx_(thr_idx) {}
-
-  template <class STensor>
-  CUTE_HOST_DEVICE
-  auto
-  partition_S(STensor&& stensor) const {
-    //static_assert(sizeof(typename remove_cvref_t<STensor>::value_type) == sizeof(typename TiledCopy::ValType),
-    //              "Expected ValType for tiling SrcTensor.");
-    auto thr_tensor = make_tensor(static_cast<STensor&&>(stensor).data(), TiledCopy::tidfrg_S(stensor.layout()));
-    return thr_tensor(thr_idx_, _, repeat<rank_v<STensor>>(_));
-  }
-
-  template <class DTensor>
-  CUTE_HOST_DEVICE
-  auto
-  partition_D(DTensor&& dtensor) const {
-    //static_assert(sizeof(typename remove_cvref_t<DTensor>::value_type) == sizeof(typename TiledCopy::ValType),
-    //              "Expected ValType for tiling DstTensor.");
-    auto thr_tensor = make_tensor(static_cast<DTensor&&>(dtensor).data(), TiledCopy::tidfrg_D(dtensor.layout()));
-    return thr_tensor(thr_idx_, _, repeat<rank_v<DTensor>>(_));
-  }
-
-  template <class STensor>
-  CUTE_HOST_DEVICE static
-  auto
-  retile_S(STensor&& stensor) {
-    // static_assert(sizeof(typename remove_cvref_t<STensor>::value_type) == sizeof(typename TiledCopy::ValType),
-    //               "Expected ValType for tiling SrcTensor.");
-    return make_tensor(static_cast<STensor&&>(stensor).data(), TiledCopy::retile(stensor.layout()));
-  }
-
-  template <class DTensor>
-  CUTE_HOST_DEVICE static
-  auto
-  retile_D(DTensor&& dtensor) {
-    // static_assert(sizeof(typename remove_cvref_t<DTensor>::value_type) == sizeof(typename TiledCopy::ValType),
-    //               "Expected ValType for tiling DstTensor.");
-    return make_tensor(static_cast<DTensor&&>(dtensor).data(), TiledCopy::retile(dtensor.layout()));
-  }
-};
-
-
-template <class... Args,
-          class LayoutCopy_TV,
-          class Tiler>
-CUTE_HOST_DEVICE
-auto
-make_tiled_copy_impl(Copy_Atom<Args...> const& atom,
-                     LayoutCopy_TV      const&,
-                     Tiler              const&)
-{
-  return TiledCopy<Copy_Atom<Args...>, LayoutCopy_TV, Tiler>{atom};
-}
-
-//
-// These tile the Copy_Atom as a whole
-//
-
-template <class... CArgs, class... MArgs>
-CUTE_HOST_DEVICE
-auto
-make_tiled_copy_A(Copy_Atom<CArgs...> const& copy_atom,
-                  TiledMMA<MArgs...>  const& mma)
-{
-  return make_tiled_copy_impl(copy_atom, mma.get_layoutA_TV(), make_shape(tile_size<0>(mma),tile_size<2>(mma)));
-}
-
-template <class... CArgs, class... MArgs>
-CUTE_HOST_DEVICE
-auto
-make_tiled_copy_B(Copy_Atom<CArgs...> const& copy_atom,
-                  TiledMMA<MArgs...>  const& mma)
-{
-  return make_tiled_copy_impl(copy_atom, mma.get_layoutB_TV(), make_shape(tile_size<1>(mma),tile_size<2>(mma)));
-}
-
-template <class... CArgs, class... MArgs>
-CUTE_HOST_DEVICE
-auto
-make_tiled_copy_C(Copy_Atom<CArgs...> const& copy_atom,
-                  TiledMMA<MArgs...>  const& mma)
-{
-  return make_tiled_copy_impl(copy_atom, mma.get_layoutC_TV(), make_shape(tile_size<0>(mma),tile_size<1>(mma)));
-}
-
-// returns the smallest tiled copy that can retile LayoutC_TV
-// for use with pipelined epilogues with subtiled stores
-template <class... CArgs, class... MArgs>
-CUTE_HOST_DEVICE
-auto
-make_tiled_copy_C_atom(Copy_Atom<CArgs...> const& copy_atom,
-                       TiledMMA<MArgs...>  const& mma)
-{
-  // Truncate the V-layout to just the Copy_Atom, keep the V-order
-  auto layoutC_TV = mma.get_layoutC_TV();
-  auto copy_V     = Int<Copy_Atom<CArgs...>::NumValSrc>{};
-  CUTE_STATIC_ASSERT_V(copy_V <= size<1>(layoutC_TV));
-  auto layout_TV  = composition(layoutC_TV, make_layout(make_shape(size<0>(layoutC_TV), copy_V)));
-
-  // Recompute tiler and restride the TV layout for the new tiler
-
-  // Tiler -- Find the active elements in the MMA tensor and generate a tiler to extract them
-  // Convert to the awkward by-mode tiler to preserve the modes of the tiled MMA
-  auto mma_tiler = make_shape(tile_size<0>(mma),tile_size<1>(mma));
-  auto mma_zeros = repeat_like(mma_tiler, Int<0>{});
-
-  auto tiler = transform(make_seq<rank(mma_tiler)>{}, [&](auto i) {
-    return filter(composition(make_layout(mma_tiler, replace<i>(mma_zeros, Int<1>{})), layout_TV));
-  });
-
-  // Layout_TV -- Find the (tid,vid) -> tile coord transformation
-  // Apply the tiler to a reference and transform the codomain
-  // tile_coord -> mma_coord
-  auto tile2mma = composition(make_layout(mma_tiler), tiler);
-
-  // (tid,vid) -> tile_coord
-  auto layout_tv = composition(left_inverse(tile2mma), layout_TV);
-
-  return make_tiled_copy_impl(copy_atom, layout_tv, tiler);
-}
-
-/** Produce a TiledCopy from logical thread and values layouts.
- * The thread and value layouts map coordinates to thr_idx and val_idx.
- *    The product of these layouts is taken to produce the TV layout and the Tiler.
- * Useful when threads and values need very specific mappings onto coordinates
- *    in the target tensors.
- */
-template <class... Args,
-          class ThrLayout,
-          class ValLayout = Layout<_1>>
-CUTE_HOST_DEVICE
-auto
-make_tiled_copy(Copy_Atom<Args...> const& copy_atom,
-                ThrLayout          const& thr_layout = {},     // (m,n) -> thr_idx
-                ValLayout          const& val_layout = {})     // (m,n) -> val_idx
-{
-  // Take the raked_products to compute the Layout_MN
-  // (M,N) -> (thr_idx, val_idx)
-  auto layout_mn = raked_product(thr_layout, val_layout);
-  // (thr_idx, val_idx) -> (M,N)
-  auto layout_tv = right_inverse(layout_mn).with_shape(make_shape(size(thr_layout), size(val_layout)));
-  // Tiler for extracting relevant elements
-  // (M,N) -> tensor coord
-  auto tiler = product_each(shape(layout_mn));
-
-#if 0
-  print("thr_layout: "); print(thr_layout); print("\n");
-  print("val_layout: "); print(val_layout); print("\n");
-  print("layout_mn : "); print(layout_mn);  print("\n");
-  print("layout_tv : "); print(layout_tv);  print("\n");
-  print("tiler     : "); print(tiler);      print("\n");
-#endif
-
-  return make_tiled_copy_impl(copy_atom, layout_tv, tiler);
-}
-
-/** Produce a TiledCopy from thread and value offset maps.
- * The TV Layout maps threads and values to the codomain of the data_layout.
- * It is verified that the intended codomain is valid within data_layout.
- * Useful when threads and values don't care about owning specific coordinates, but
- *   care more about the vector-width and offsets between them.
- */
-template <class... Args, class AtomTVLayout, class DataLayout>
-CUTE_HOST_DEVICE constexpr
-auto
-make_cotiled_copy(Copy_Atom<Args...> const& copy_atom,
-                  AtomTVLayout const& atom_tv_layout,   // atom (thr,val) -> data addr
-                  DataLayout   const& data_layout)      // coord          -> data addr    The target layout
-{
-  static_assert(is_static<AtomTVLayout>::value);
-  static_assert(is_static<DataLayout>::value);
-
-  // data addr -> data coord    Append 1:0 so off-the-ends get the stride-0
-  auto inv_data_layout = make_layout(left_inverse(data_layout), Layout<_1,_0>{});
-
-  // (tid,vid) -> data_coord
-  auto layout_tv_data = composition(inv_data_layout, atom_tv_layout);
-
-  // Check validity
-  CUTE_STATIC_ASSERT_V(coalesce(composition(data_layout, layout<1>(layout_tv_data))) == coalesce(layout<1>(atom_tv_layout)),
-                       "The memory pointed to by AtomTVLayout does not exist in the DataLayout.");
-
-#if 0
-  if (thread0()) {
-    print("data_layout        : "); print(data_layout); print("\n");
-    print("atom_tv_layout     : "); print(atom_tv_layout); print("\n");
-    print("layout_tv_data     : "); print(layout_tv_data); print("\n");
-  }
-#endif
-
-  //
-  // Tiler -- Find the active elements in the DATA tensor and generate a tiler to extract them
-  //
-
-  // Convert to the awkward by-mode tiler to preserve the modes of the tiled DATA
-  auto flat_data_shape = product_each(shape(data_layout));
-  auto flat_data_zeros = repeat<rank(flat_data_shape)>(Int<0>{});
-
-  auto tiler = transform(make_seq<rank(flat_data_shape)>{}, [&](auto i) {
-    return filter(composition(make_layout(flat_data_shape, replace<i>(flat_data_zeros, Int<1>{})), layout_tv_data));
-  });
-
-  //
-  // Layout_TV -- Find the (tid,vid) -> tile coord transformation
-  //
-
-  // Apply the tiler to a reference and transform the codomain
-  // tile_coord -> data_coord
-  auto tile2data = composition(make_layout(flat_data_shape), tiler);
-
-  // (tid,vid) -> tile_coord
-  auto layout_tv = composition(left_inverse(tile2data), layout_tv_data);
-
-#if 0
-  if (thread0()) {
-    print("tiler              : "); print(tiler); print("\n");
-    print("tile2data          : "); print(tile2data); print("\n");
-    print("layout_tv          : "); print(layout_tv); print("\n");
-  }
-#endif
-
-  return make_tiled_copy_impl(copy_atom, layout_tv, tiler);
-}
-
-// Make a TiledCopy out of the copy_atom that matches the Src-Layout of tiled_copy
-template <class... Args,
-          class TiledCopy>
-CUTE_HOST_DEVICE
-auto
-make_tiled_copy_S(Copy_Atom<Args...> const& copy_atom,
-                  TiledCopy          const& tiled_copy)
-{
-  return make_tiled_copy_impl(copy_atom, tiled_copy.get_layoutS_TV(), typename TiledCopy::Tiler_MN{});
-}
-
-// Make a TiledCopy out of the copy_atom that matches the Dst-Layout of tiled_copy
-template <class... Args,
-          class TiledCopy>
-CUTE_HOST_DEVICE
-auto
-make_tiled_copy_D(Copy_Atom<Args...> const& copy_atom,
-                  TiledCopy          const& tiled_copy)
-{
-  return make_tiled_copy_impl(copy_atom, tiled_copy.get_layoutD_TV(), typename TiledCopy::Tiler_MN{});
-}
-
-//
-// Size
-//
-
-// The logical size of a TileCopy
-template <int... I, class... Args>
-CUTE_HOST_DEVICE constexpr
-auto
-tile_size(TiledCopy<Args...> const&)
-{
-  return size<I...>(typename TiledCopy<Args...>::Tiler_MN{});
-}
-
-// The number of threads involved in a TiledCopy
-template <class... Args>
-CUTE_HOST_DEVICE constexpr
-auto
-size(TiledCopy<Args...> const&)
-{
-  return typename TiledCopy<Args...>::TiledNumThr{};
-}
-
-//
-// Display utilities
-//
-
-template <class... Args, class T>
-CUTE_HOST_DEVICE
-void
-print(Copy_Atom<Copy_Traits<Args...>, T> const&)
-{
-  using Atom = Copy_Atom<Copy_Traits<Args...>, T>;
-  print("Copy_Atom\n");
-  print("  ThrID:        "); print(typename Atom::ThrID{});        print("\n");
-  print("  ValLayoutSrc: "); print(typename Atom::ValLayoutSrc{}); print("\n");
-  print("  ValLayoutDst: "); print(typename Atom::ValLayoutDst{}); print("\n");
-  print("  ValLayoutRef: "); print(typename Atom::ValLayoutRef{}); print("\n");
-  print("  ValueType:    "); print(sizeof_bits<typename Atom::ValType>::value); print("b\n");
-}
-
-template <class Atom, class... Args>
-CUTE_HOST_DEVICE
-void
-print(TiledCopy<Atom, Args...> const& copy, char const* pad = "")
-{
-  using Copy = TiledCopy<Atom, Args...>;
-  print("TiledCopy\n");
-  print("  Tiler_MN:       "); print(typename Copy::Tiler_MN{});       print("\n");
-  print("  TiledLayout_TV: "); print(typename Copy::TiledLayout_TV{}); print("\n");
-  print(static_cast<Atom const&>(copy));
-}
-
-template <class TiledCopy, class ThrIdx>
-CUTE_HOST_DEVICE
-void
-print(ThrCopy<TiledCopy, ThrIdx> const& thr_copy)
-{
-  print("ThrCopy\n");
-  print("  ThrIdx: "); print(thr_copy.thr_idx_); print("\n");
-  print(TiledCopy{});
-}
-
-// TiledCopy to LaTeX TikZ
-template <class... Args, class TikzColorFn = TikzColor_TV>
-CUTE_HOST_DEVICE
-auto
-print_latex(TiledCopy<Args...> const& copy,
-            TikzColorFn color = {})              // lambda(thr_idx,val_idx) -> tikz color string
-{
-  auto [layoutS_MN, thrID_S] = copy.get_layoutS_MN();
-  auto [layoutD_MN, thrID_D] = copy.get_layoutD_MN();
-
-  print_latex_copy(layoutS_MN, thrID_S,
-                   layoutD_MN, thrID_D);
-}
-
-// MNK Copy Layout to LaTeX TikZ
-template <class LayoutS, class ThrIDS,
-          class LayoutD, class ThrIDD,
-          class TikzColorFn = TikzColor_TV>
-CUTE_HOST_DEVICE
-void
-print_latex_copy(LayoutS const& S, ThrIDS const& TS,  // (m,n) -> (tid,vid)  and  tid -> thr_idx
-                 LayoutD const& D, ThrIDD const& TD,  // (m,n) -> (tid,vid)  and  tid -> thr_idx
-                 TikzColorFn color = {})              // lambda(thr_idx,val_idx) -> tikz color string
-{
-  CUTE_STATIC_ASSERT_V(rank(S) == Int<2>{});
-  CUTE_STATIC_ASSERT_V(rank(D) == Int<2>{});
-
-  assert(size<0>(S) == size<0>(D));
-  assert(size<1>(S) == size<1>(D));
-
-  // Commented prints
-  printf("%% LayoutS: "); print(S);  printf("\n");
-  printf("%% ThrIDS : "); print(TS); printf("\n");
-  printf("%% LayoutD: "); print(D);  printf("\n");
-  printf("%% ThrIDD : "); print(TD); printf("\n\n");
-
-  // Header
-  printf("\\documentclass[convert]{standalone}\n"
-         "\\usepackage{tikz}\n\n"
-         "\\begin{document}\n"
-         "\\begin{tikzpicture}[x={(0cm,-1cm)},y={(1cm,0cm)},every node/.style={minimum size=1cm, outer sep=0pt}]\n\n");
-
-  // S starting at 0,0
-  for (int i = 0; i < size<0>(S); ++i) {
-    for (int j = 0; j < size<1>(S); ++j) {
-      int thrid   = S(i,j) % size(TS);
-      int val_idx = S(i,j) / size(TS);
-      int thr_idx = TS(thrid);
-
-      printf("\\node[fill=%s] at (%d,%d) {\\shortstack{T%d \\\\ V%d}};\n",
-             color(thr_idx, val_idx),
-             i, j,
-             thr_idx, val_idx);
-    }
-  }
-  // Grid
-  printf("\\draw[color=black,thick,shift={(-0.5,-0.5)}] (%d,%d) grid (%d,%d);\n\n",
-         0, 0, int(size<0>(S)), int(size<1>(S)));
-  // S Labels
-  for (int i =  0, j = -1; i < size<0>(S); ++i) {
-    printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", i, j, i);
-  }
-  for (int i = -1, j =  0; j < size<1>(S); ++j) {
-    printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", i, j, j);
-  }
-
-  // D starting at 0,size<1>(S)+3
-  for (int i = 0; i < size<0>(D); ++i) {
-    for (int j = 0; j < size<1>(D); ++j) {
-      int thrid   = D(i,j) % size(TD);
-      int val_idx = D(i,j) / size(TD);
-      int thr_idx = TD(thrid);
-
-      printf("\\node[fill=%s] at (%d,%d) {\\shortstack{T%d \\\\ V%d}};\n",
-             color(thr_idx, val_idx),
-             i, j + size<1>(S) + 3,
-             thr_idx, val_idx);
-    }
-  }
-  // Grid
-  printf("\\draw[color=black,thick,shift={(-0.5,-0.5)}] (%d,%d) grid (%d,%d);\n\n",
-         0, int(size<1>(S)+3), int(size<0>(D)), int(size<1>(D)+size<1>(S)+3));
-  // D Labels
-  for (int i = 0, j = size<1>(D); i < size<0>(D); ++i) {
-    printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", i, j + size<1>(S) + 3, i);
-  }
-  for (int i = -1, j =         0; j < size<1>(D); ++j) {
-    printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", i, j + size<1>(S) + 3, j);
-  }
-
-  // Footer
-  printf("\\end{tikzpicture}\n"
-         "\\end{document}\n");
-}
-
-} // end namespace cute
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-#include <cute/atom/copy_traits_sm50.hpp>
-#include <cute/atom/copy_traits_sm75.hpp>
-#include <cute/atom/copy_traits_sm80.hpp>
-#include <cute/atom/copy_traits_sm90.hpp>
-
-// Config
-#if (__CUDACC_VER_MAJOR__ >= 12)
-#  define CUTE_COPY_ATOM_TMA_SM90_ENABLED
-#endif
-
-#if defined(CUTE_COPY_ATOM_TMA_SM90_ENABLED)
-#include <cute/atom/copy_traits_sm90_tma.hpp>
-#endif
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cute/atom/copy_traits.hpp b/lightllm-kernel/cutlass/include/cute/atom/copy_traits.hpp
deleted file mode 100755
index bfbeb4ea5..000000000
--- a/lightllm-kernel/cutlass/include/cute/atom/copy_traits.hpp
+++ /dev/null
@@ -1,159 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/arch/copy.hpp>
-
-#include <cute/tensor_impl.hpp>
-
-namespace cute
-{
-
-/**
- * concept Copy_Traits
- * {
- *   using ThrID     =    // Logical thread id (tid) -> tidx
- *
- *   using SrcLayout =    // (Logical src thread id (tid), Logical src value id (vid)) -> bit
- *   using DstLayout =    // (Logical dst thread id (tid), Logical dst value id (vid)) -> bit
- *   using RefLayout =    // (Logical ref thread id (tid), Logical ref value id (vid)) -> bit
- * };
- *
- * The abstract bit ordering of the Copy_Traits (the codomain of SrcLayout, DstLayout, and RefLayout)
- * is arbitrary and only used to construct maps
- *   (ref-tid,ref-vid) -> (src-tid,src-vid)
- *   (ref-tid,ref-vid) -> (dst-tid,dst-vid)
- * in TiledCopy. The Layout_TV in TiledCopy is in accordance with the RefLayout of a Traits, then mapped to
- * the Src or Dst (tid,vid) representation on demand.
- *
- */
-
-template <class CopyOperation, class... CopyOpArgs>
-struct Copy_Traits
-{
-  static_assert(dependent_false<CopyOperation>, "Copy_Traits not implemented for this CopyOperation.");
-};
-
-template <class S, class D>
-struct Copy_Traits<UniversalCopy<S,D>>
-{
-  // Logical thread id to thread idx (one-thread)
-  using ThrID = Layout<_1>;
-
-  // Map from (src-thr,src-val) to bit
-  using SrcLayout = Layout<Shape<_1,Int<sizeof_bits<S>::value>>>;
-  // Map from (dst-thr,dst-val) to bit
-  using DstLayout = Layout<Shape<_1,Int<sizeof_bits<D>::value>>>;
-
-  // Reference map from (thr,val) to bit
-  using RefLayout = SrcLayout;
-};
-
-template <int MaxVecBits>
-struct Copy_Traits<AutoVectorizingCopyWithAssumedAlignment<MaxVecBits>>
-{
-  // Logical thread id to thread idx (one-thread)
-  using ThrID = Layout<_1>;
-
-  // Map from (src-thr,src-val) to bit
-  using SrcLayout = Layout<Shape<_1,_1>, Stride<_0,_0>>;
-  // Map from (dst-thr,dst-val) to bit
-  using DstLayout = Layout<Shape<_1,_1>, Stride<_0,_0>>;
-
-  // Reference map from (thr,val) to bit
-  using RefLayout = SrcLayout;
-};
-
-//
-// Generic copy_unpack for common argument-based Copy_Traits
-//
-
-template <class CopyOp, class... Args,
-          class SEngine, class SLayout,
-          class DEngine, class DLayout>
-CUTE_HOST_DEVICE constexpr
-void
-copy_unpack(Copy_Traits<CopyOp,Args...> const&,
-            Tensor<SEngine,SLayout>     const& src,
-            Tensor<DEngine,DLayout>          & dst)
-{
-  // Specializations can generalize on these checks
-  //static_assert(is_smem<TS>::value, "Expected smem for this Copy_Traits<CopyOp>");
-  //static_assert(is_rmem<TD>::value, "Expected rmem for this Copy_Traits<CopyOp>");
-
-  using RegistersSrc = typename CopyOp::SRegisters;
-  using RegistersDst = typename CopyOp::DRegisters;
-  using RegTypeSrc   = typename remove_extent<RegistersSrc>::type;
-  using RegTypeDst   = typename remove_extent<RegistersDst>::type;
-  constexpr int RegNumSrc = extent<RegistersSrc>::value;
-  constexpr int RegNumDst = extent<RegistersDst>::value;
-
-  Tensor rS = recast<RegTypeSrc>(src);
-  Tensor rD = recast<RegTypeDst>(dst);
-
-  CUTE_STATIC_ASSERT_V(size(rS) == Int<RegNumSrc>{},
-    "Copy_Traits: src failed to vectorize into registers. Layout is incompatible with this CopyOp.");
-  CUTE_STATIC_ASSERT_V(size(rD) == Int<RegNumDst>{},
-    "Copy_Traits: dst failed to vectorize into registers. Layout is incompatible with this CopyOp.");
-
-  detail::explode(detail::CallCOPY<CopyOp>{},
-                  rS, make_int_sequence<RegNumSrc>{},
-                  rD, make_int_sequence<RegNumDst>{});
-}
-
-//
-// Accept mutable temporaries
-//
-
-template <class CopyOp, class... Args,
-          class SEngine, class SLayout,
-          class DEngine, class DLayout>
-CUTE_HOST_DEVICE constexpr
-void
-copy_unpack(Copy_Traits<CopyOp,Args...> const& traits,
-            Tensor<SEngine,SLayout>     const& src,
-            Tensor<DEngine,DLayout>         && dst)
-{
-  copy_unpack(traits, src, dst);
-}
-
-namespace detail {
-
-template <class CopyOp, class = void>
-constexpr bool is_prefetch = false;
-
-template <class CopyOp>
-constexpr bool is_prefetch<CopyOp, void_t<typename CopyOp::PREFETCH>> = is_same_v<CopyOp, typename CopyOp::PREFETCH>;
-
-} // end namespace detail
-
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm50.hpp b/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm50.hpp
deleted file mode 100755
index 7a693805e..000000000
--- a/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm50.hpp
+++ /dev/null
@@ -1,75 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/arch/copy_sm50.hpp>
-#include <cute/atom/copy_traits.hpp>
-
-#include <cute/layout.hpp>
-
-namespace cute
-{
-
-template <>
-struct Copy_Traits<SM50_Shuffle_U32_2x2Trans_XOR1>
-{
-  // Logical thread id to thread idx (one-thread)
-  using ThrID = Layout<_32>;
-
-  // Map from (src-thr,src-val) to bit
-  using SrcLayout = Layout<Shape <_32,_64>,
-                           Stride<_64, _1>>;
-  // Map from (dst-thr,dst-val) to bit
-  using DstLayout = Layout<Shape <Shape < _2,  _16>,Shape <_32,  _2>>,
-                           Stride<Stride<_32, _128>,Stride< _1, _64>>>;
-
-  // Reference map from (thr,val) to bit
-  using RefLayout = SrcLayout;
-};
-
-template <>
-struct Copy_Traits<SM50_Shuffle_U32_2x2Trans_XOR4>
-{
-  // Logical thread id to thread idx (one-thread)
-  using ThrID = Layout<_32>;
- 
-  // Map from (src-thr,src-val) to bit
-  using SrcLayout = Layout<Shape <_32,_64>,
-                           Stride<_64, _1>>;
-  // Map from (dst-thr,dst-val) to bit
-  using DstLayout = Layout<Shape <Shape < _4,  _2,   _4>, Shape<_32,   _2>>,
-                           Stride<Stride<_64, _32, _512>,Stride< _1, _256>>>;
-
-  // Reference map from (thr,val) to bit
-  using RefLayout = SrcLayout;
-};
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm75.hpp b/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm75.hpp
deleted file mode 100755
index 9ad82c617..000000000
--- a/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm75.hpp
+++ /dev/null
@@ -1,143 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/arch/copy_sm75.hpp>
-#include <cute/atom/copy_traits.hpp>
-
-#include <cute/layout.hpp>
-
-namespace cute
-{
-
-template <>
-struct Copy_Traits<SM75_U32x1_LDSM_N>
-{
-  // Logical thread id to thread idx (warp)
-  using ThrID = Layout<_32>;
-
-  // Map from (src-thr,src-val) to bit
-  using SrcLayout = Layout<Shape <Shape <  _8,_4>,_128>,
-                           Stride<Stride<_128,_0>,  _1>>;
-  // Map from (dst-thr,dst-val) to bit
-  using DstLayout = Layout<Shape <_32,_32>,
-                           Stride<_32, _1>>;
-
-  // Reference map from (thr,val) to bit
-  using RefLayout = DstLayout;
-};
-
-template <>
-struct Copy_Traits<SM75_U32x2_LDSM_N>
-{
-  // Logical thread id to thread idx (warp)
-  using ThrID = Layout<_32>;
-
-  // Map from (src-thr,src-val) to bit
-  using SrcLayout = Layout<Shape <Shape < _16,_2>,_128>,
-                           Stride<Stride<_128,_0>,  _1>>;
-  // Map from (dst-thr,dst-val) to bit
-  using DstLayout = Layout<Shape <_32,Shape <_32,   _2>>,
-                           Stride<_32,Stride< _1,_1024>>>;
-
-  // Reference map from (thr,val) to bit
-  using RefLayout = DstLayout;
-};
-
-template <>
-struct Copy_Traits<SM75_U32x4_LDSM_N>
-{
-  // Logical thread id to thread idx (warp)
-  using ThrID = Layout<_32>;
-
-  // Map from (src-thr,src-val) to bit
-  using SrcLayout = Layout<Shape < _32,_128>,
-                           Stride<_128,  _1>>;
-  // Map from (dst-thr,dst-val) to bit
-  using DstLayout = Layout<Shape <_32,Shape <_32,   _4>>,
-                           Stride<_32,Stride< _1,_1024>>>;
-
-  // Reference map from (thr,val) to bit
-  using RefLayout = DstLayout;
-};
-
-template <>
-struct Copy_Traits<SM75_U16x2_LDSM_T>
-{
-  // Logical thread id to thread idx (warp)
-  using ThrID = Layout<_32>;
-
-  // Map from (src-thr,src-val) to bit
-  using SrcLayout = Layout<Shape <Shape <  _8,_4>,_128>,
-                           Stride<Stride<_128,_0>,  _1>>;
-  // Map from (dst-thr,dst-val) to bit
-  using DstLayout = Layout<Shape <Shape <  _4, _8>,Shape <_16,  _2>>,
-                           Stride<Stride<_256,_16>,Stride< _1,_128>>>;
-
-  // Reference map from (thr,val) to bit
-  using RefLayout = DstLayout;
-};
-
-template <>
-struct Copy_Traits<SM75_U16x4_LDSM_T>
-{
-  // Logical thread id to thread idx (warp)
-  using ThrID = Layout<_32>;
-
-  // Map from (src-thr,src-val) to bit
-  using SrcLayout = Layout<Shape <Shape < _16,_2>,_128>,
-                           Stride<Stride<_128,_0>,  _1>>;
-  // Map from (dst-thr,dst-val) to bit
-  using DstLayout = Layout<Shape <Shape <  _4, _8>,Shape <_16,  _2,   _2>>,
-                           Stride<Stride<_256,_16>,Stride< _1,_128,_1024>>>;
-
-  // Reference map from (thr,val) to bit
-  using RefLayout = DstLayout;
-};
-
-template <>
-struct Copy_Traits<SM75_U16x8_LDSM_T>
-{
-  // Logical thread id to thread idx (warp)
-  using ThrID = Layout<_32>;
-
-  // Map from (src-thr,src-val) to bit
-  using SrcLayout = Layout<Shape < _32,_128>,
-                           Stride<_128,  _1>>;
-  // Map from (dst-thr,dst-val) to bit
-  using DstLayout = Layout<Shape <Shape <  _4, _8>,Shape <_16,  _2,   _4>>,
-                           Stride<Stride<_256,_16>,Stride< _1,_128,_1024>>>;
-
-  // Reference map from (thr,val) to bit
-  using RefLayout = DstLayout;
-};
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm80.hpp b/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm80.hpp
deleted file mode 100755
index e5ff0b7b3..000000000
--- a/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm80.hpp
+++ /dev/null
@@ -1,194 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/arch/copy_sm80.hpp>
-#include <cute/atom/copy_traits.hpp>
-
-#include <cute/layout.hpp>
-
-namespace cute
-{
-
-template <class S, class D>
-struct Copy_Traits<SM80_CP_ASYNC_CACHEALWAYS<S,D>>
-{
-  // Logical thread id to thread idx (one-thread)
-  using ThrID = Layout<_1>;
-
-  // Map from (src-thr,src-val) to bit
-  using SrcLayout = Layout<Shape<_1,Int<sizeof_bits<S>::value>>>;
-  // Map from (dst-thr,dst-val) to bit
-  using DstLayout = Layout<Shape<_1,Int<sizeof_bits<D>::value>>>;
-
-  // Reference map from (thr,val) to bit
-  using RefLayout = SrcLayout;
-
-  // Construct a zfill variant with a given predicate value
-  CUTE_HOST_DEVICE constexpr
-  Copy_Traits<SM80_CP_ASYNC_CACHEALWAYS_ZFILL<S,D>>
-  with(bool pred) const {
-    return {pred};
-  }
-};
-
-template <class S, class D>
-struct Copy_Traits<SM80_CP_ASYNC_CACHEGLOBAL<S,D>>
-{
-  // Logical thread id to thread idx (one-thread)
-  using ThrID = Layout<_1>;
-
-  // Map from (src-thr,src-val) to bit
-  using SrcLayout = Layout<Shape<_1,Int<sizeof_bits<S>::value>>>;
-  // Map from (dst-thr,dst-val) to bit
-  using DstLayout = Layout<Shape<_1,Int<sizeof_bits<D>::value>>>;
-
-  // Reference map from (thr,val) to bit
-  using RefLayout = SrcLayout;
-
-  // Construct a zfill variant with a given predicate value
-  CUTE_HOST_DEVICE constexpr
-  Copy_Traits<SM80_CP_ASYNC_CACHEGLOBAL_ZFILL<S,D>>
-  with(bool pred) const {
-    return {pred};
-  }
-};
-
-template <class S, class D>
-struct Copy_Traits<SM80_CP_ASYNC_CACHEALWAYS_ZFILL<S,D>>
-{
-  // Logical thread id to thread idx (one-thread)
-  using ThrID = Layout<_1>;
-
-  // Map from (src-thr,src-val) to bit
-  using SrcLayout = Layout<Shape<_1,Int<sizeof_bits<S>::value>>>;
-  // Map from (dst-thr,dst-val) to bit
-  using DstLayout = Layout<Shape<_1,Int<sizeof_bits<D>::value>>>;
-
-  // Reference map from (thr,val) to bit
-  using RefLayout = SrcLayout;
-
-  // Predicate value that determines whether to load or zfill
-  bool pred = false;
-
-  // Overload copy_unpack for zfill variant to pass the predicate into the op
-  template <class TS, class SLayout,
-            class TD, class DLayout>
-  CUTE_HOST_DEVICE friend constexpr
-  void
-  copy_unpack(Copy_Traits        const& traits,
-              Tensor<TS,SLayout> const& src,
-              Tensor<TD,DLayout>      & dst)
-  {
-    static_assert(is_gmem<TS>::value, "Expected gmem source for cp.async.");
-    static_assert(is_smem<TD>::value, "Expected smem destination for cp.async.");
-
-    Tensor rS = recast<S>(src);
-    Tensor rD = recast<D>(dst);
-
-    CUTE_STATIC_ASSERT_V(size(rS) == Int<1>{},
-      "In CopyAtom, src layout doesn't vectorize into registers. This src layout is incompatible with this tiled copy.");
-    CUTE_STATIC_ASSERT_V(size(rD) == Int<1>{},
-      "In CopyAtom, dst layout doesn't vectorize into registers. This dst layout is incompatible with this tiled copy.");
-
-    SM80_CP_ASYNC_CACHEALWAYS_ZFILL<S,D>::copy(rS[0], rD[0], traits.pred);
-  }
-};
-
-template <class S, class D>
-struct Copy_Traits<SM80_CP_ASYNC_CACHEGLOBAL_ZFILL<S,D>>
-{
-  // Logical thread id to thread idx (one-thread)
-  using ThrID = Layout<_1>;
-
-  // Map from (src-thr,src-val) to bit
-  using SrcLayout = Layout<Shape<_1,Int<sizeof_bits<S>::value>>>;
-  // Map from (dst-thr,dst-val) to bit
-  using DstLayout = Layout<Shape<_1,Int<sizeof_bits<D>::value>>>;
-
-  // Reference map from (thr,val) to bit
-  using RefLayout = SrcLayout;
-
-  // Predicate value that determines whether to load or zfill
-  bool pred = false;
-
-  // Overload copy_unpack for zfill variant to pass the predicate into the op
-  template <class TS, class SLayout,
-            class TD, class DLayout>
-  CUTE_HOST_DEVICE friend constexpr
-  void
-  copy_unpack(Copy_Traits        const& traits,
-              Tensor<TS,SLayout> const& src,
-              Tensor<TD,DLayout>      & dst)
-  {
-    static_assert(is_gmem<TS>::value, "Expected gmem source for cp.async.");
-    static_assert(is_smem<TD>::value, "Expected smem destination for cp.async.");
-
-    Tensor rS = recast<S>(src);
-    Tensor rD = recast<D>(dst);
-
-    CUTE_STATIC_ASSERT_V(size(rS) == Int<1>{},
-      "In CopyAtom, src layout doesn't vectorize into registers. This src layout is incompatible with this tiled copy.");
-    CUTE_STATIC_ASSERT_V(size(rD) == Int<1>{},
-      "In CopyAtom, dst layout doesn't vectorize into registers. This dst layout is incompatible with this tiled copy.");
-
-    SM80_CP_ASYNC_CACHEGLOBAL_ZFILL<S,D>::copy(rS[0], rD[0], traits.pred);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Element copy selector
-template <class SrcTensor, class DstTensor>
-CUTE_HOST_DEVICE constexpr
-auto
-select_elementwise_copy(SrcTensor const&, DstTensor const&)
-{
-  using SrcType = typename SrcTensor::value_type;
-  using DstType = typename DstTensor::value_type;
-
-#if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED)
-  if constexpr (is_gmem<SrcTensor>::value && is_smem<DstTensor>::value &&
-                sizeof(SrcType) == sizeof(DstType) &&
-               (sizeof(SrcType) == 4 || sizeof(SrcType) == 8 || sizeof(SrcType) == 16))
-  {
-    return SM80_CP_ASYNC_CACHEALWAYS<SrcType,DstType>{};
-  } else {
-    return UniversalCopy<SrcType,DstType>{};
-  }
-
-  CUTE_GCC_UNREACHABLE;
-#else
-  return UniversalCopy<SrcType,DstType>{};
-#endif
-}
-
-}
diff --git a/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm90.hpp b/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm90.hpp
deleted file mode 100755
index f9590848a..000000000
--- a/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm90.hpp
+++ /dev/null
@@ -1,132 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/arch/copy_sm90.hpp>
-#include <cute/atom/copy_traits.hpp>
-#include <cute/atom/copy_traits_sm75.hpp>
-
-#include <cute/layout.hpp>
-
-namespace cute
-{
-
-template <>
-struct Copy_Traits<SM90_U32x1_STSM_N>
-{
-  // Logical thread id to thread idx (warp)
-  using ThrID   = Layout<_32>;
-
-  // Map from (src-thr,src-val) to bit
-  using SrcLayout = typename Copy_Traits<SM75_U32x1_LDSM_N>::DstLayout;
-  // Map from (dst-thr,dst-val) to bit
-  using DstLayout = typename Copy_Traits<SM75_U32x1_LDSM_N>::SrcLayout;
-
-  // Reference map from (thr,val) to bit
-  using RefLayout = SrcLayout;
-};
-
-template <>
-struct Copy_Traits<SM90_U32x2_STSM_N>
-{
-  // Logical thread id to thread idx (warp)
-  using ThrID   = Layout<_32>;
-
-  // Map from (src-thr,src-val) to bit
-  using SrcLayout = typename Copy_Traits<SM75_U32x2_LDSM_N>::DstLayout;
-  // Map from (dst-thr,dst-val) to bit
-  using DstLayout = typename Copy_Traits<SM75_U32x2_LDSM_N>::SrcLayout;
-
-  // Reference map from (thr,val) to bit
-  using RefLayout = SrcLayout;
-};
-
-template <>
-struct Copy_Traits<SM90_U32x4_STSM_N>
-{
-  // Logical thread id to thread idx (warp)
-  using ThrID   = Layout<_32>;
-
-  // Map from (src-thr,src-val) to bit
-  using SrcLayout = typename Copy_Traits<SM75_U32x4_LDSM_N>::DstLayout;
-  // Map from (dst-thr,dst-val) to bit
-  using DstLayout = typename Copy_Traits<SM75_U32x4_LDSM_N>::SrcLayout;
-
-  // Reference map from (thr,val) to bit
-  using RefLayout = SrcLayout;
-};
-
-template <>
-struct Copy_Traits<SM90_U16x2_STSM_T>
-{
-  // Logical thread id to thread idx (warp)
-  using ThrID   = Layout<_32>;
-
-  // Map from (src-thr,src-val) to bit
-  using SrcLayout = typename Copy_Traits<SM75_U16x2_LDSM_T>::DstLayout;
-  // Map from (dst-thr,dst-val) to bit
-  using DstLayout = typename Copy_Traits<SM75_U16x2_LDSM_T>::SrcLayout;
-
-  // Reference map from (thr,val) to bit
-  using RefLayout = SrcLayout;
-};
-
-template <>
-struct Copy_Traits<SM90_U16x4_STSM_T>
-{
-  // Logical thread id to thread idx (warp)
-  using ThrID   = Layout<_32>;
-
-  // Map from (src-thr,src-val) to bit
-  using SrcLayout = typename Copy_Traits<SM75_U16x4_LDSM_T>::DstLayout;
-  // Map from (dst-thr,dst-val) to bit
-  using DstLayout = typename Copy_Traits<SM75_U16x4_LDSM_T>::SrcLayout;
-
-  // Reference map from (thr,val) to bit
-  using RefLayout = SrcLayout;
-};
-
-template <>
-struct Copy_Traits<SM90_U16x8_STSM_T>
-{
-  // Logical thread id to thread idx (warp)
-  using ThrID   = Layout<_32>;
-
-  // Map from (src-thr,src-val) to bit
-  using SrcLayout = typename Copy_Traits<SM75_U16x8_LDSM_T>::DstLayout;
-  // Map from (dst-thr,dst-val) to bit
-  using DstLayout = typename Copy_Traits<SM75_U16x8_LDSM_T>::SrcLayout;
-
-  // Reference map from (thr,val) to bit
-  using RefLayout = SrcLayout;
-};
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm90_im2col.hpp b/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm90_im2col.hpp
deleted file mode 100755
index 54f76073b..000000000
--- a/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm90_im2col.hpp
+++ /dev/null
@@ -1,940 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-/*! \file
-  \brief im2col make_tma_copy
-*/
-
-#include "cute/arch/copy_sm90.hpp"
-#include "cute/arch/copy_sm90_desc.hpp"
-#include "cute/tensor.hpp"
-
-#include "cute/algorithm/prefetch.hpp"
-#include "cutlass/fast_math.h"
-#include "cutlass/cuda_host_adapter.hpp"
-
-namespace cute
-{
-
-// Utility for unpacking TMA_LOAD_IM2COL arguments into a CopyOp
-template <class CopyOp>
-struct TMA_LOAD_IM2COL_Unpack
-{
-  /// Copy from src to dst.
-  ///
-  /// @param traits Copy traits created with a TMA descriptor that
-  ///   correctly matches the input tensor and other convolution
-  ///   parameters.
-  ///
-  /// @param src Tile of the im2col-transformed coordinate tensor
-  ///   (result of get_tma_tensor), representing the global-memory
-  ///   tensor from which to load.
-  ///
-  /// @param dst Shared memory tile, into which to load.
-  template <class... Args,
-            class TS, class SLayout,
-            class TD, class DLayout>
-  CUTE_HOST_DEVICE friend constexpr void
-  copy_unpack(Copy_Traits<CopyOp, Args...> const& traits,
-              Tensor<TS,SLayout>           const& src, // tile of the transformed global activation (A) tensor
-              Tensor<TD,DLayout>                & dst) // shared memory tile
-  {
-    auto src_coord_offset = src(Int<0>{});
-    auto src_coord_cwhdn_offset_srt = flatten(src_coord_offset);
-    // Interpret the TMA IM2COL coordinate as  (c, ([w,h,d]), n, ([s,r,t]))
-    CUTE_STATIC_ASSERT_V(rank(src_coord_offset) == _4{});
-    CUTE_STATIC_ASSERT_V(rank<1>(src_coord_offset) == rank<3>(src_coord_offset));
-
-    if constexpr (detail::is_prefetch<CopyOp>) {
-      return detail::explode_tuple(detail::CallCOPY<CopyOp>{},
-                                   traits.opargs_, tuple_seq<decltype(traits.opargs_)>{},
-                                   src_coord_cwhdn_offset_srt, tuple_seq<decltype(src_coord_cwhdn_offset_srt)>{});
-    } else {
-      static_assert(is_smem<TD>::value, "SM90_TMA_LOAD_IM2COL requires the destination be shared memory.");
-      void* dst_ptr = cute::raw_pointer_cast(dst.data());
-      return detail::explode_tuple(detail::CallCOPY<CopyOp>{},
-                                   traits.opargs_, tuple_seq<decltype(traits.opargs_)>{},
-                                   make_tuple(dst_ptr), seq<0>{},
-                                   src_coord_cwhdn_offset_srt, tuple_seq<decltype(src_coord_cwhdn_offset_srt)>{});
-    }
-  }
-};
-
-// Copy_Traits for SM90 im2col TMA load comes in two layers.
-//
-// 1. Copy_Traits<SM90_TMA_LOAD_IM2COL>
-// 2. Copy_Traits<SM90_TMA_LOAD_IM2COL_OP>
-//
-// Copy_Traits<SM90_TMA_LOAD_IM2COL>
-// is the "outer" layer.  It has a TMA descriptor,
-// but no barrier ("tma_mbar"), so it's "nonexecutable."
-// One calls its "with" member function with a barrier,
-// to get an executable "inner"-layer
-// Copy_Traits<SM90_TMA_LOAD_IM2COL_OP> object.
-// That object's "copy_unpack" member function
-// actually invokes im2col TMA load.
-
-struct SM90_TMA_LOAD_IM2COL_OP : SM90_TMA_LOAD_IM2COL {};
-
-/// @brief Non-executable specialization of Copy_Traits for SM90
-///   im2col TMA load, with TMA descriptor but no barrier.
-///
-/// Use `.with(memory_barrier)` to construct an executable version.
-template <class NumBitsPerTMA, class TMATensor>
-struct Copy_Traits<SM90_TMA_LOAD_IM2COL, NumBitsPerTMA, TMATensor>
-{
-  using ThrID = Layout<_1>;
-  // Map from (src-thr,src-val) to bit
-  using SrcLayout = Layout<Shape<_1, NumBitsPerTMA>>;
-  // Map from (dst-thr,dst-val) to bit
-  using DstLayout = Layout<Shape<_1, NumBitsPerTMA>>;
-  // Reference map from (thr,val) to bit
-  using RefLayout = SrcLayout;
-
-  Im2ColTmaDescriptor tma_desc_;
-  TMATensor tma_tensor_;
-
-  CUTE_HOST_DEVICE constexpr
-  Im2ColTmaDescriptor const*
-  get_tma_descriptor() const
-  {
-    return &tma_desc_;
-  }
-
-  template <class GShape>
-  CUTE_HOST_DEVICE constexpr
-  TMATensor const
-  get_tma_tensor(GShape const&) const
-  {
-    return tma_tensor_;
-  }
-
-  /// @brief Get an executable specialization.
-  ///
-  /// Copy_Traits specializations with SM90_TMA_LOAD_IM2COL are not
-  /// directly executable.  Instead, call this "with" member function
-  /// to get an executable specialization.  "Executable" means that
-  /// @c copy_unpack works.
-  ///
-  /// @param tma_mbar Memory barrier for synchronization
-  ///
-  /// @param multicast_mask Multicast mask (unused; only exists
-  ///   for interface compatibility with the actual multicast Copy_Traits)
-  ///
-  /// @return Executable specialization of @c Copy_Traits
-  CUTE_HOST_DEVICE constexpr
-  Copy_Traits<SM90_TMA_LOAD_IM2COL_OP, NumBitsPerTMA>
-  with(uint64_t& tma_mbar, [[maybe_unused]] uint16_t const& multicast_mask = 0) const
-  {
-    return {{}, {&tma_desc_, &tma_mbar}};
-  }
-
-  // Copy_Traits specializations with SM90_TMA_LOAD_IM2COL
-  // are not directly executable.  Instead, call .with
-  // to get an executable specialization.
-  template <class TS, class SLayout,
-            class TD, class DLayout>
-  CUTE_HOST_DEVICE friend constexpr void
-  copy_unpack(Copy_Traits        const& traits,
-              Tensor<TS,SLayout> const& src,
-              Tensor<TD,DLayout>      & dst) = delete;
-};
-
-/// @brief Executable specialization of Copy_Traits for SM90 im2col
-///   TMA load, with TMA descriptor and barrier.
-template <class NumBitsPerTMA>
-struct Copy_Traits<SM90_TMA_LOAD_IM2COL_OP, NumBitsPerTMA>
-     : TMA_LOAD_IM2COL_Unpack<SM90_TMA_LOAD_IM2COL_OP>
-{
-  using ThrID = Layout<_1>;
-  // Map from (src-thr,src-val) to bit
-  using SrcLayout = Layout<Shape<_1, NumBitsPerTMA>>;
-  // Map from (dst-thr,dst-val) to bit
-  using DstLayout = Layout<Shape<_1, NumBitsPerTMA>>;
-  // Reference map from (thr,val) to bit
-  using RefLayout = SrcLayout;
-
-  // SM90_TMA_LOAD_IM2COL arguments
-  tuple<
-  Im2ColTmaDescriptor const*,
-  uint64_t* // smem mbarrier
-  > const opargs_;
-};
-
-template <class NumBitsPerTMA, class... Args>
-struct Copy_Traits<SM90_TMA_LOAD_IM2COL::PREFETCH, NumBitsPerTMA, Args...>
-     : TMA_LOAD_IM2COL_Unpack<SM90_TMA_LOAD_IM2COL::PREFETCH>
-{
-  using ThrID = Layout<_1>;
-  // Map from (src-thr,src-val) to bit
-  using SrcLayout = Layout<Shape<_1, NumBitsPerTMA>>;
-  // Map from (dst-thr,dst-val) to bit
-  using DstLayout = Layout<Shape<_1, NumBitsPerTMA>>;
-  // Reference map from (thr,val) to bit
-  using RefLayout = SrcLayout;
-
-  // SM90_TMA_LOAD_IM2COL::PREFETCH arguments
-  tuple<Im2ColTmaDescriptor const*> const opargs_;
-
-  CUTE_HOST_DEVICE
-  Copy_Traits(Copy_Traits<SM90_TMA_LOAD_IM2COL, NumBitsPerTMA, Args...> const& traits)
-    : opargs_({&traits.tma_desc_}) {}
-};
-
-//////////////////////////////////////////////////////////////////////////////
-///////////////////////////// TMA_LOAD_MULTICAST /////////////////////////////
-//////////////////////////////////////////////////////////////////////////////
-
-struct SM90_TMA_LOAD_IM2COL_MULTICAST_OP : SM90_TMA_LOAD_IM2COL_MULTICAST {};
-
-/// @brief Non-executable specialization of Copy_Traits for SM90
-///   im2col TMA load, with TMA descriptor but no barrier or multicast
-///   mask.
-///
-/// Use `.with(memory_barrier)` to construct an executable version.
-template <class NumBitsPerTMA, class TMATensor>
-struct Copy_Traits<SM90_TMA_LOAD_IM2COL_MULTICAST, NumBitsPerTMA, TMATensor>
-{
-  using ThrID = Layout<_1>;
-  // Map from (src-thr,src-val) to bit
-  using SrcLayout = Layout<Shape<_1, NumBitsPerTMA>>;
-  // Map from (dst-thr,dst-val) to bit
-  using DstLayout = Layout<Shape<_1, NumBitsPerTMA>>;
-  // Reference map from (thr,val) to bit
-  using RefLayout = SrcLayout;
-
-  Im2ColTmaDescriptor tma_desc_;
-  TMATensor tma_tensor_;
-
-  CUTE_HOST_DEVICE constexpr
-  Im2ColTmaDescriptor const*
-  get_tma_descriptor() const {
-    return &tma_desc_;
-  }
-
-  template <class GShape>
-  CUTE_HOST_DEVICE constexpr
-  TMATensor const
-  get_tma_tensor(GShape const&) const
-  {
-    return tma_tensor_;
-  }
-
-  /// @brief Get an executable specialization.
-  ///
-  /// Copy_Traits specializations with SM90_TMA_LOAD_IM2COL_MULTICAST
-  /// are not directly executable.  Instead, call this "with" member
-  /// function to get an executable specialization.  "Executable"
-  /// means that @c copy_unpack works.
-  ///
-  /// @param tma_mbar Memory barrier for synchronization
-  ///
-  /// @param multicast_mask Multicast mask (defaults to a single CTA)
-  ///
-  /// @return Executable specialization of @c Copy_Traits
-  CUTE_HOST_DEVICE constexpr
-  Copy_Traits<SM90_TMA_LOAD_IM2COL_MULTICAST_OP, NumBitsPerTMA>
-  with(uint64_t& tma_mbar, uint16_t const& multicast_mask) const {
-    return {{}, {&tma_desc_, &tma_mbar, multicast_mask}};
-  }
-
-  // Copy_Traits specializations with SM90_TMA_LOAD_IM2COL_MULTICAST
-  // are not directly executable.  Instead, call .with to get an
-  // executable specialization.
-  template <class TS, class SLayout,
-            class TD, class DLayout>
-  CUTE_HOST_DEVICE friend constexpr void
-  copy_unpack(Copy_Traits        const& traits,
-              Tensor<TS,SLayout> const& src,
-              Tensor<TD,DLayout>      & dst) = delete;
-};
-
-/// @brief Executable specialization of Copy_Traits for SM90 multicast
-///   im2col TMA load, with TMA descriptor, barrier, and multicast mask.
-template <class NumBitsPerTMA>
-struct Copy_Traits<SM90_TMA_LOAD_IM2COL_MULTICAST_OP, NumBitsPerTMA>
-     : TMA_LOAD_IM2COL_Unpack<SM90_TMA_LOAD_IM2COL_MULTICAST_OP>
-{
-  using ThrID = Layout<_1>;
-  // Map from (src-thr,src-val) to bit.
-  using SrcLayout = Layout<Shape<_1, NumBitsPerTMA>>;
-  // Map from (dst-thr,dst-val) to bit
-  using DstLayout = Layout<Shape<_1, NumBitsPerTMA>>;
-  // Reference map from (thr,val) to bit
-  using RefLayout = SrcLayout;
-
-  // SM90_TMA_LOAD_IM2COL_MULTICAST arguments
-  tuple<
-  Im2ColTmaDescriptor const*,
-  uint64_t*, // smem mbarrier
-  uint16_t   // multicast mask
-  > const opargs_;
-};
-
-//////////////////////////////////////////////////////////////////////////////
-///////////////////////////// TMA_STORE IM2COL////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////
-
-// The executable SM90_TMA_STORE_IM2COL with tma_desc
-template <class NumBitsPerTMA, class TMATensor>
-struct Copy_Traits<SM90_TMA_STORE_IM2COL, NumBitsPerTMA, TMATensor>
-{
-  using ThrID   = Layout<_1>;
-
-  // Map from (src-thr,src-val) to bit
-  using SrcLayout = Layout<Shape<_1,NumBitsPerTMA>>;
-  // Map from (dst-thr,dst-val) to bit
-  using DstLayout = Layout<Shape<_1,NumBitsPerTMA>>;
-
-  // Reference map from (thr,val) to bit
-  using RefLayout = SrcLayout;
-
-  // SM90_TMA_STORE_IM2COL arguments
-  Im2ColTmaDescriptor tma_desc_;
-  TMATensor tma_tensor_;
-
-  // Return TmaDescriptor/TensorMap
-  CUTE_HOST_DEVICE constexpr
-  Im2ColTmaDescriptor const*
-  get_tma_descriptor() const {
-    return &tma_desc_;
-  }
-
-  template <class GShape>
-  CUTE_HOST_DEVICE constexpr
-  TMATensor const
-  get_tma_tensor(GShape const&) const
-  {
-    return tma_tensor_;
-  }
-
-  // This is the copy_unpack dispatch for this Copy_Traits
-  // Src needs to be a smem tensor
-  // Dst needs to be a gmem tensor with TmaCoordIterator .data()
-  template <class TS, class SLayout,
-            class TD, class DLayout>
-  CUTE_HOST_DEVICE friend constexpr void
-  copy_unpack(Copy_Traits        const& traits,
-              Tensor<TS,SLayout> const& src,
-              Tensor<TD,DLayout>      & dst)
-  {
-    static_assert(is_smem<TS>::value, "Expected smem src for SM90_TMA_STORE_IM2COL");
-
-    void const* const desc_ptr = &(traits.tma_desc_);
-    void const* const src_ptr  = cute::raw_pointer_cast(src.data());
-    auto dst_coord = flatten(take<0,3>(dst(Int<0>{})));
-
-    return detail::explode_tuple(detail::CallCOPY<SM90_TMA_STORE_IM2COL>{},
-                                 make_tuple(desc_ptr, src_ptr), seq<0,1>{},
-                                 dst_coord, tuple_seq<decltype(dst_coord)>{});
-  }
-};
-
-namespace detail {
-
-/// @brief Creates a TMA descriptor for im2col TMA load.
-///
-/// @param tensor_cwhdn Global activation tensor (A matrix of Fprop).
-///   This is the original (not im2col-transformed) tensor in global
-///   memory.
-///
-/// @param slayout Rank 2 (M,K) shared memory layout of the activation
-///   tensor.  Here, K is "GEMM K," not the filter tensor's mode of
-///   the same name.
-//////
-/// @param traversal_stride Traversal strides convolution parameter
-//////
-/// Each of padding_shape, traversal_stride, and dilation_shape is a
-/// tuple whose size is the number of spatial modes (e.g., 3 for a 5-D
-/// convolution).
-///
-/// @return TMA descriptor for im2col TMA load
-template <class EngineA, class LayoutA,
-          class SmemSwizzle, class TMALayout,
-          class LowerCornerStride,
-          class UpperCornerStride,
-          class LowerPaddingStride,
-          class UpperPaddingStride,
-          class TraversalStride,
-          class LowerSRTStride,
-          class DilationStride>
-CUTE_HOST
-auto
-make_im2col_tma_copy_desc(
-    Tensor<EngineA, LayoutA>    const& tensor_cwhdn,       // (C,W,H,D,N)
-    uint32_t                           range_c,            // TILE_C
-    uint32_t                           range_whdn,         // TILE_WHDN
-    SmemSwizzle                 const& smem_swizzle,       // Swizzle
-    TMALayout                   const& tma_layout_vt,      // TMA layout
-    LowerCornerStride           const& lower_corner_whd,   // WHD offset of the "base pointer"
-    UpperCornerStride           const& upper_corner_whd,   // WHD upper corner
-    LowerPaddingStride          const& lower_padding_whd,  // WHD lower padding
-    UpperPaddingStride          const& upper_padding_whd,  // WHD upper padding
-    TraversalStride             const& stride_whd,         // WHD traversal stride
-    LowerSRTStride              const& lower_srt,          // SRT offset of the "base pointer"
-    DilationStride              const& stride_srt,          // SRT stride - dilation
-    TMA::DescriptorAuxParams    const& aux_params = {})
-{
-  static_assert(is_gmem<EngineA>::value, "Tensor must point to GPU global memory.");
-  using value_type = typename EngineA::value_type;
-
-  constexpr uint32_t num_total_modes   = LayoutA::rank;
-  constexpr int      num_spatial_modes = num_total_modes - 2;
-
-  // Gmem starting address
-  void* gmem_address = (void*) raw_pointer_cast(tensor_cwhdn.data());
-
-  // Gmem extents are just the tensor shape
-  cute::array<uint64_t, 5> gmem_prob_shape = {1,1,1,1,1};
-  for_each(make_seq<num_total_modes>{}, [&](auto i) {
-    gmem_prob_shape[i] = static_cast<uint64_t>(shape<i>(tensor_cwhdn));
-  });
-
-  // Gmem strides are byte strides of the activation tensor in CWHDN order
-  cute::array<uint64_t, 5> gmem_prob_stride = {0,0,0,0,0};
-  for_each(make_seq<num_total_modes>{}, [&](auto i) {
-    gmem_prob_stride[i] = sizeof(value_type) * stride<i>(tensor_cwhdn);
-  });
-
-  // Traversal strides are a function of the dilation shape
-  // corresponding to spatial (WHD) modes.
-  cute::array<uint32_t, 5> tma_traversal_strides = {1,1,1,1,1};
-  for_each(make_seq<num_spatial_modes>{}, [&](auto i) {
-    tma_traversal_strides[i+1] = static_cast<uint32_t>(get<i>(stride_whd));
-  });
-
-  cute::array<int32_t, num_spatial_modes> tma_lower_corner{};
-  for_each(make_seq<num_spatial_modes>{}, [&](auto i) {
-    tma_lower_corner[i] = static_cast<int32_t>(get<i>(lower_corner_whd));
-  });
-
-  cute::array<int32_t, num_spatial_modes> tma_upper_corner{};
-  for_each(make_seq<num_spatial_modes>{}, [&](auto i) {
-    tma_upper_corner[i] = static_cast<int32_t>(get<i>(upper_corner_whd));
-  });
-
-  Im2ColTmaDescriptor tma_desc;
-
-#if (__CUDACC_VER_MAJOR__ >= 12)
-
-  CUtensorMapDataType     tma_format      = TMA::to_CUtensorMapDataType<value_type>();
-  CUtensorMapInterleave   tma_interleave  = CU_TENSOR_MAP_INTERLEAVE_NONE;
-  CUtensorMapL2promotion  tma_l2Promotion = to_CUtensorMapL2promotion(aux_params.l2promo_);
-  CUtensorMapFloatOOBfill tma_oob_fill    = to_CUtensorMapFloatOOBfill(aux_params.oobfill_);
-  TMA::SmemSwizzleBits    swizzle_bits    = detail::get_tma_swizzle_bits(smem_swizzle);
-  TMA::SmemSwizzleBase    swizzle_base    = detail::get_tma_swizzle_base(smem_swizzle);
-  CUtensorMapSwizzle      tma_swizzle     = TMA::to_CUtensorMapSwizzle(swizzle_bits, swizzle_base);
-
-  CUresult encode_result = CUTLASS_CUDA_DRIVER_WRAPPER_CALL(cuTensorMapEncodeIm2col)(
-      &tma_desc,
-      tma_format,
-      num_total_modes,
-      gmem_address,
-      gmem_prob_shape.data(),
-      gmem_prob_stride.data() + 1, // gmem_prob_stride[0] implicitly sizeof(value_type)
-      tma_lower_corner.data(),
-      tma_upper_corner.data(),
-      range_c,
-      range_whdn,
-      tma_traversal_strides.data(),
-      tma_interleave,
-      tma_swizzle,
-      tma_l2Promotion,
-      tma_oob_fill);
-
-  // The extra asserts help indicate the error's cause.
-  assert(encode_result != CUDA_ERROR_DEINITIALIZED);
-  assert(encode_result != CUDA_ERROR_NOT_INITIALIZED);
-  assert(encode_result != CUDA_ERROR_INVALID_CONTEXT);
-  assert(encode_result != CUDA_ERROR_INVALID_VALUE);
-  assert(encode_result == CUDA_SUCCESS);
-
-#endif // (__CUDACC_VER_MAJOR__ >= 12)
-  //
-  // Calculate gemm shapes and linearized shapes based on tma layout tiling.
-  //
-
-  // Compute [w, h, d, n]
-  // q/p/z = (w/h/d + (upper_corner_whd - lower_corner_whd - 1)) / stride_whd + 1
-  auto gemm_mn_ = cute::transform(cute::make_seq<num_spatial_modes>{}, [&](auto i) {
-    return (shape<i+1>(tensor_cwhdn) + get<i>(upper_corner_whd) - get<i>(lower_corner_whd) - Int<1>{}) / get<i>(stride_whd) + Int<1>{};
-  });
-  auto gemm_mn = append(gemm_mn_, shape<num_spatial_modes+1>(tensor_cwhdn));
-
-  // Compute [c, s, r, t]
-  // fprop/wgrad, s/r/t = 1 + (upper_padding_whd - upper_corner_whd) / stride_srt
-  // wgrad,       s/r/t = 1 + (lower_padding_whd - lower_corner_whd) / stride_srt
-  auto gemm_k_ = cute::transform(cute::make_seq<num_spatial_modes>{}, [&](auto i) {
-    auto padding_size = conditional_return(get<i>(stride_srt) > Int<0>{},
-                                           get<i>(upper_padding_whd) - get<i>(upper_corner_whd),
-                                           get<i>(lower_corner_whd)  - get<i>(lower_padding_whd));
-    return Int<1>{} + padding_size / get<i>(stride_srt);
-  });
-  auto gemm_k = prepend(gemm_k_, shape<0>(tensor_cwhdn));
-
-  // For fprop/dgrad kernel, gemm_shapes is ((q, p, z, n), (c, s, r, t))
-  // For wgrad kernel, gemm_shapes is ((c, s, r, t), (q, p, z, n))
-  auto gemm_shapes_common = make_shape(
-      transform_leaf(gemm_mn, [](auto s) {
-        return conditional_return(cute::is_static<decltype(s)>{}, s, cutlass::FastDivmod(s));
-      }),
-      gemm_k);
-  auto gemm_shapes = make_shape(
-      basis_get(stride<0,1>(tma_layout_vt), gemm_shapes_common),
-      basis_get(stride<0,0>(tma_layout_vt), gemm_shapes_common));
-
-  // For fprop/dgrad kernel, linearized shapes is (whdn, (c, s, r, t))
-  // For wgrad kernel linearized shapes is ((c, s, r, t), whdn)
-  auto linear_shapes_common = make_shape(size(gemm_mn), gemm_k);
-  auto linear_shapes = make_shape(
-      basis_get(stride<0,1>(tma_layout_vt), linear_shapes_common),
-      basis_get(stride<0,0>(tma_layout_vt), linear_shapes_common));
-
-  //
-  // Calculate gmem basis stride based on tma layout tiling.
-  //
-
-  auto tma_basis_scale = make_shape(Int<1>{}, stride_whd, Int<1>{}, stride_srt);
-  auto tma_basis = elem_scale(tma_basis_scale, make_basis_like(tma_basis_scale));
-
-  auto gbasis_strides_common = make_stride(
-      append(get<1>(tma_basis), get<2>(tma_basis)),
-      prepend(get<3>(tma_basis), get<0>(tma_basis)));    // ((w,h,d,n),(c,s,r,t))
-  auto gbasis_strides = make_stride(
-      basis_get(stride<0,1>(tma_layout_vt), gbasis_strides_common),
-      basis_get(stride<0,0>(tma_layout_vt), gbasis_strides_common));
-
-  //
-  // Create tma tensor
-  //
-
-  auto lower_corner = make_arithmetic_tuple(Int<0>{}, lower_corner_whd, Int<0>{}, lower_srt);
-
-  auto tensor_multimode = make_tensor(ArithmeticTupleIterator(lower_corner), gemm_shapes, gbasis_strides);
-  auto tensor_linear = make_identity_tensor(linear_shapes);
-  auto tma_tensor = make_tensor(tensor_multimode.data(), composition(
-      tensor_multimode.layout(),
-      tensor_linear(Int<0>{}),
-      tensor_linear.layout()));
-
-  return cute::make_tuple(tma_desc, tma_tensor);
-}
-
-template <class CopyOp,
-          class GEngine, class GLayout,
-          class SLayout,
-          class VShape, class VStride,
-          class LowerCornerStride,
-          class UpperCornerStride,
-          class LowerPaddingStride,
-          class UpperPaddingStride,
-          class TraversalStride,
-          class LowerSRTStride,
-          class DilationStride>
-CUTE_HOST_RTC
-auto
-make_tma_atom_im2col(CopyOp,
-                     Tensor<GEngine,GLayout>      const& gtensor,           // Full GMEM Tensor: ((w, h, d, n), c)
-                     SLayout                      const& slayout,           // CTA Tile of SMEM, potentially swizzled
-                     int32_t                      const& num_multicast,     // The number of CTAs involved in multicasting
-                     Layout<VShape,VStride>       const& cta_v_map,         // V: CTA val idx -> gmem mode
-                     LowerCornerStride            const& lower_corner_whd,
-                     UpperCornerStride            const& upper_corner_whd,
-                     LowerPaddingStride           const& lower_padding_whd,
-                     UpperPaddingStride           const& upper_padding_whd,
-                     TraversalStride              const& stride_whd,        // traversal stride
-                     LowerSRTStride               const& lower_srt,
-                     DilationStride               const& stride_srt,        // dilation
-                     TMA::DescriptorAuxParams     const& aux_params = {})
-{
-  //
-  // TMA parameter checking
-  //
-
-  CUTE_STATIC_ASSERT_V(product_each(shape(slayout)) == product_each(shape(cta_v_map)),
-    "TMA requires CTA_Tile and SLayout top-level shape equivalence.");
-
-  //
-  // TMA slayout manipulation
-  //
-
-  // Invert the smem to get the largest contiguous vector in the smem layout
-  auto inv_smem_layout = right_inverse(get_nonswizzle_portion(slayout));
-  // trunc_smem_idx -> trunc_smem_coord
-
-  // Map from smem idx to a gmem mode
-  auto sidx_to_gmode = coalesce(composition(cta_v_map, inv_smem_layout));
-
-#if 0
-  print("g_layout         : "); print(gtensor.layout()); print("\n");
-  print("s_layout         : "); print(slayout); print("\n");
-  print("cta_t_map        : "); print(cta_t_map); print("\n");
-  print("cta_v_map        : "); print(cta_v_map); print("\n");
-  print("inv_smem         : "); print(inv_smem_layout); print("\n");
-  print("sidx_to_gmode    : "); print(sidx_to_gmode); print("\n");
-#endif
-
-  //
-  // TMA gtensor manipulation
-  //
-
-  // Generate a TupleBasis for the gtensor
-  auto glayout_basis = make_identity_layout(product_each(shape(gtensor)));
-
-  // Tile the modes of gtensor with the truncated cta_v_map o inv_smem_layout_trunc
-  auto tma_layout_full = flatten(composition(glayout_basis, sidx_to_gmode));
-
-  // Truncate any incompatibilities -- no starting in the middle of gmodes
-  auto smem_rank = find_if(stride(tma_layout_full), [](auto e) {
-    [[maybe_unused]] auto v = basis_value(e);
-    return not is_constant<1,decltype(v)>{};
-  });
-  static_assert(smem_rank >= 2, "IM2COL expects at least 2 modes of the smem to vectorize with gmem.");
-  // IM2COL uses a maximum of 2 modes
-  constexpr int smem_tma_rank = cute::min(int(smem_rank), 2);
-
-  // Keep only the static-1 basis modes into gmem
-  auto tma_layout_trunc = take<0,smem_tma_rank>(tma_layout_full);
-
-  // Split according to the portion each multicast CTA will be responsible for
-  auto tma_layout_vt = logical_divide(tma_layout_trunc, shape_div(size(tma_layout_trunc), num_multicast));
-
-#if 0
-  print("glayout_basis   : "); print(glayout_basis); print("\n");
-  print("tma_layout_full : "); print(tma_layout_full); print("\n");
-
-  print("tma_layout_trunc: "); print(tma_layout_trunc); print("\n");
-  print("tma_layout_vt   : "); print(tma_layout_vt); print("\n");
-#endif
-
-  auto range_c    = size<0,0>(tma_layout_vt);
-  auto range_whdn = size<0,1>(tma_layout_vt);
-  Tensor gtensor_cwhdn = make_tensor(gtensor.data(),
-                                     flatten(make_layout(make_layout(basis_get(stride<0,0>(tma_layout_vt), gtensor.shape()),
-                                                                     basis_get(stride<0,0>(tma_layout_vt), gtensor.stride())),
-                                                         make_layout(basis_get(stride<0,1>(tma_layout_vt), gtensor.shape()),
-                                                                     basis_get(stride<0,1>(tma_layout_vt), gtensor.stride())))));
-  auto [tma_desc, tma_tensor] = make_im2col_tma_copy_desc(
-      gtensor_cwhdn,
-      range_c,
-      range_whdn,
-      detail::get_swizzle_portion(slayout),
-      tma_layout_vt,
-      lower_corner_whd,
-      upper_corner_whd,
-      lower_padding_whd,
-      upper_padding_whd,
-      stride_whd,
-      lower_srt,
-      stride_srt,
-      aux_params);
-
-  //
-  // Construct the Copy_Traits
-  //
-
-  using T = typename GEngine::value_type;
-  constexpr int num_bits_per_tma = decltype(size(tma_layout_trunc))::value * sizeof(T) * 8;
-
-  using Traits = Copy_Traits<CopyOp, cute::C<num_bits_per_tma>, decltype(tma_tensor)>;
-  using Atom = Copy_Atom<Traits, typename GEngine::value_type>;
-
-#if 0
-  print("num_bits      :  "); print(num_bits_per_tma); print("\n");
-#endif
-
-  Traits tma_traits{tma_desc, tma_tensor};
-
-  // Return the Copy_Atom
-  return Atom{tma_traits};
-}
-
-/// Make a TiledCopy for im2col TMA load.
-///
-/// @param copy_op The copy implementation: either
-///   SM90_TMA_LOAD_IM2COL or SM90_TMA_LOAD_IM2COL_MULTICAST.
-///
-/// @param tensor_cwhdn The global tensor to use for im2col TMA loads.
-///   For Fprop convolutions, this is the activation tensor.  This is
-///   the "original tensor that points to global memory, not the
-///   coordinate (im2col-transformed) tensor.
-///
-/// @param slayout Layout of shared memory tile.
-///
-/// @param stride_whd The traversal strides convolution
-///   parameter.
-///
-/// @return TiledCopy specialization for im2col TMA loads.
-template <class CopyOp,
-          class GEngine, class GLayout,
-          class SLayout,
-          class TShape, class TStride,
-          class VShape, class VStride,
-          class LowerCornerStride,
-          class UpperCornerStride,
-          class LowerPaddingStride,
-          class UpperPaddingStride,
-          class TraversalStride,
-          class LowerSRTStride,
-          class DilationStride>
-CUTE_HOST_RTC
-auto
-make_tma_copy_im2col(CopyOp                       const& copy_op,
-                     Tensor<GEngine,GLayout>      const& gtensor,
-                     SLayout                      const& slayout,
-                     Layout<TShape,TStride>       const& cta_t_map,          // CTA tid -> logical TMA tid
-                     Layout<VShape,VStride>       const& cta_v_map,          // CTA vid -> gmem coord
-                     LowerCornerStride            const& lower_corner_whd,
-                     UpperCornerStride            const& upper_corner_whd,
-                     LowerPaddingStride           const& lower_padding_whd,
-                     UpperPaddingStride           const& upper_padding_whd,
-                     TraversalStride              const& stride_whd,         // traversal stride
-                     LowerSRTStride               const& lower_srt,
-                     DilationStride               const& stride_srt,         // dilation
-                     TMA::DescriptorAuxParams     const& aux_params = {})
-{
-  //
-  // TMA parameter checking
-  //
-
-  CUTE_STATIC_ASSERT_V(size(slayout) % cosize(cta_t_map) == Int<0>{},
-    "Number of active CTAs in TMA must divide domain size of slayout.");
-
-  Copy_Atom atom = make_tma_atom_im2col(copy_op, gtensor, slayout, cosize(cta_t_map), cta_v_map,
-                                        lower_corner_whd, upper_corner_whd, lower_padding_whd,
-                                        upper_padding_whd, stride_whd, lower_srt, stride_srt, aux_params);
-
-  //
-  // Construct the TiledCopy
-  //
-
-  auto cta_tiler = product_each(shape(cta_v_map));
-
-  auto num_elems_per_tma = size<1>(typename decltype(atom)::RefLayout{}) / static_value<sizeof_bits<typename GEngine::value_type>>();
-
-  // smem idx -> smem coord
-  auto inv_smem_layout = right_inverse(get_nonswizzle_portion(slayout));
-  // CTA V -> smem_coord
-  auto layout_v = composition(inv_smem_layout, num_elems_per_tma);
-  // Scale that up to cover all of the smem_coords
-  auto layout_V = tile_to_shape(make_layout(layout_v), size(cta_v_map));
-  // CTA T -> smem idx
-  auto layout_t = make_layout(cosize(cta_t_map), shape_div(num_elems_per_tma, cosize(cta_t_map)));
-  // CTA TID -> smem coord
-  auto layout_T = composition(inv_smem_layout, composition(layout_t, cta_t_map));
-  // Combine with the T mapping
-  [[maybe_unused]] auto layout_TV = make_layout(layout_T, layout_V);
-
-#if 0
-  print("cta_tiler : "); print(cta_tiler); print("\n");
-  print("layout_v : "); print(layout_v); print("\n");
-  print("layout_V : "); print(layout_V); print("\n");
-  print("layout_t : "); print(layout_t); print("\n");
-  print("layout_T : "); print(layout_T); print("\n");
-  print("layout_TV : "); print(layout_TV); print("\n");
-#endif
-
-  return TiledCopy<decltype(atom), decltype(layout_TV), decltype(cta_tiler)>{atom};
-}
-
-/// Make a TiledCopy for im2col TMA with no offsets.
-/// E.g. im2col TMA load for C and im2col TMA store for D.
-template <class CopyOp,
-          class GEngine, class GLayout,
-          class SLayout,
-          class TShape, class TStride,
-          class VShape, class VStride>
-CUTE_HOST_RTC
-auto
-make_tma_copy_im2col(CopyOp                  const& copy_op,
-                     Tensor<GEngine,GLayout> const& gtensor,
-                     SLayout                 const& slayout,
-                     Layout<TShape,TStride>  const& cta_t_map,          // CTA tid -> logical TMA tid
-                     Layout<VShape,VStride>  const& cta_v_map)          // CTA vid -> gmem coord
-{
-  constexpr int num_spatial_modes = rank<0>(GLayout{}) - 1;
-  return make_tma_copy_im2col(copy_op, gtensor, slayout, cta_t_map, cta_v_map,
-                              append<num_spatial_modes>(Stride<_0>{}, Int<0>{}),  // lower_corner_whd
-                              append<num_spatial_modes>(Stride<_0>{}, Int<0>{}),  // upper_corner_whd
-                              append<num_spatial_modes>(Stride<_0>{}, Int<0>{}),  // lower_padding_whd
-                              append<num_spatial_modes>(Stride<_0>{}, Int<0>{}),  // upper_padding_whd
-                              append<num_spatial_modes>(Stride<_1>{}, Int<1>{}),  // stride_whd
-                              append<num_spatial_modes>(Stride<_0>{}, Int<0>{}),  // lower_srt
-                              append<num_spatial_modes>(Stride<_1>{}, Int<1>{})); // stride_srt
-}
-
-} // namespace detail
-
-
-
-template <class CopyOp,
-          class Engine0, class Layout0,
-          class SLayout,
-          class CTATiler,
-          class MulticastSize,
-          class LowerCornerStride,
-          class UpperCornerStride,
-          class LowerPaddingStride,
-          class UpperPaddingStride,
-          class TraversalStride,
-          class LowerSRTStride,
-          class DilationStride>
-CUTE_HOST_RTC
-auto
-make_im2col_tma_copy(CopyOp                   const& copy_op,
-                     Tensor<Engine0, Layout0> const& tensor_cwhdn,
-                     SLayout                  const& slayout,
-                     CTATiler                 const& cta_tiler,
-                     MulticastSize            const& multicast_size,
-                     LowerCornerStride        const& lower_corner_whd,
-                     UpperCornerStride        const& upper_corner_whd,
-                     LowerPaddingStride       const& lower_padding_whd,
-                     UpperPaddingStride       const& upper_padding_whd,
-                     TraversalStride          const& stride_whd,
-                     LowerSRTStride           const& lower_srt,
-                     DilationStride           const& stride_srt)
-{
-  auto cta_v_tile = make_identity_layout(product_each(shape(tensor_cwhdn))).compose(cta_tiler);
-  auto cta_t_tile = make_layout(multicast_size);
-
-  return detail::make_tma_copy_im2col(copy_op, tensor_cwhdn,
-                                      slayout, cta_t_tile, cta_v_tile,
-                                      lower_corner_whd, upper_corner_whd, lower_padding_whd, upper_padding_whd, stride_whd, lower_srt, stride_srt);
-}
-
-// Explicit default for multicast_size
-template <class CopyOp,
-          class Engine0, class Layout0,
-          class SLayout,
-          class CTATiler,
-          class LowerCornerStride,
-          class UpperCornerStride,
-          class LowerPaddingStride,
-          class UpperPaddingStride,
-          class TraversalStride,
-          class LowerSRTStride,
-          class DilationStride>
-CUTE_HOST_RTC
-auto
-make_im2col_tma_copy(CopyOp                   const& copy_op,
-                     Tensor<Engine0, Layout0> const& tensor_cwhdn,
-                     SLayout                  const& slayout,
-                     CTATiler                 const& cta_tiler,
-                     LowerCornerStride        const& lower_corner_whd,
-                     UpperCornerStride        const& upper_corner_whd,
-                     LowerPaddingStride       const& lower_padding_whd,
-                     UpperPaddingStride       const& upper_padding_whd,
-                     TraversalStride          const& stride_whd,
-                     LowerSRTStride           const& lower_srt,
-                     DilationStride           const& stride_srt)
-{
-  return make_im2col_tma_copy(copy_op, tensor_cwhdn, slayout, cta_tiler, Int<1>{},
-                              lower_corner_whd, upper_corner_whd, lower_padding_whd, upper_padding_whd, stride_whd, lower_srt, stride_srt);
-}
-
-// Explicit default for cta_tiler and multicast_size
-template <class CopyOp,
-          class Engine0, class Layout0,
-          class SLayout,
-          class LowerCornerStride,
-          class UpperCornerStride,
-          class LowerPaddingStride,
-          class UpperPaddingStride,
-          class TraversalStride,
-          class LowerSRTStride,
-          class DilationStride>
-CUTE_HOST_RTC
-auto
-make_im2col_tma_copy(CopyOp                   const& copy_op,
-                     Tensor<Engine0, Layout0> const& tensor_cwhdn,
-                     SLayout                  const& slayout,
-                     LowerCornerStride        const& lower_corner_whd,
-                     UpperCornerStride        const& upper_corner_whd,
-                     LowerPaddingStride       const& lower_padding_whd,
-                     UpperPaddingStride       const& upper_padding_whd,
-                     TraversalStride          const& stride_whd,
-                     LowerSRTStride           const& lower_srt,
-                     DilationStride           const& stride_srt)
-{
-  return make_im2col_tma_copy(copy_op, tensor_cwhdn, slayout, product_each(shape(slayout)), Int<1>{},
-                              lower_corner_whd, upper_corner_whd, lower_padding_whd, upper_padding_whd, stride_whd, lower_srt, stride_srt);
-}
-
-// No offsets copy.
-template <class CopyOp,
-          class Engine0, class Layout0,
-          class SLayout,
-          class CTATiler,
-          class MulticastSize>
-CUTE_HOST_RTC
-auto
-make_im2col_tma_copy(CopyOp                   const& copy_op,
-                     Tensor<Engine0, Layout0> const& tensor_cwhdn,
-                     SLayout                  const& slayout,
-                     CTATiler                 const& cta_tiler,
-                     MulticastSize            const& multicast_size)
-{
-  auto cta_v_tile = make_identity_layout(product_each(shape(tensor_cwhdn))).compose(cta_tiler);
-  auto cta_t_tile = make_layout(multicast_size);
-
-  return detail::make_tma_copy_im2col(copy_op, tensor_cwhdn, slayout, cta_t_tile, cta_v_tile);
-}
-
-// Explicit default for multicast_size
-template <class CopyOp,
-          class Engine0, class Layout0,
-          class SLayout,
-          class CTATiler>
-CUTE_HOST_RTC
-auto
-make_im2col_tma_copy(CopyOp                   const& copy_op,
-                     Tensor<Engine0, Layout0> const& tensor_cwhdn,
-                     SLayout                  const& slayout,
-                     CTATiler                 const& cta_tiler)
-{
-  return make_im2col_tma_copy(copy_op, tensor_cwhdn, slayout, cta_tiler, Int<1>{});
-}
-
-// Explicit default for cta_tiler and multicast_size
-template <class CopyOp,
-          class Engine0, class Layout0,
-          class SLayout>
-CUTE_HOST_RTC
-auto
-make_im2col_tma_copy(CopyOp                   const& copy_op,
-                     Tensor<Engine0, Layout0> const& tensor_cwhdn,
-                     SLayout                  const& slayout)
-{
-  return make_im2col_tma_copy(copy_op, tensor_cwhdn, slayout, product_each(shape(slayout)), Int<1>{});
-}
-
-} // namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm90_tma.hpp b/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm90_tma.hpp
deleted file mode 100755
index 3738cc396..000000000
--- a/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm90_tma.hpp
+++ /dev/null
@@ -1,1525 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#if !defined(__CUDACC_RTC__)
-#include <cuda.h>
-#endif
-
-#include <cute/atom/copy_traits_sm90_tma_swizzle.hpp>
-#include <cute/atom/copy_traits.hpp>
-#include <cute/atom/copy_atom.hpp>
-
-#include <cute/algorithm/prefetch.hpp>
-
-#include <cute/numeric/integral_ratio.hpp>
-
-#include <cutlass/cuda_host_adapter.hpp>
-
-namespace cute
-{
-
-template <class GmemTmaBasisStrides_, class TmaGmemBasis_, class TmaSwizzle_>
-struct AuxTmaParams {
-  using GmemStrides  = GmemTmaBasisStrides_;    // Strides for Gmem mode -> Tma coord mode, may be dynamic
-  GmemStrides g_stride_;
-  using TmaGmemBasis = TmaGmemBasis_;           // Layout for Tma box shape -> Gmem mode(s), always static
-  static_assert(is_static<TmaGmemBasis>::value);
-  using TmaSwizzle   = TmaSwizzle_;             // Tma swizzle, always Swizzle<B,M,S>
-  static_assert(is_static<TmaSwizzle>::value);
-};
-
-// Utility for unpacking TMA_LOAD arguments into a CopyOp
-template <class CopyOp>
-struct TMA_LOAD_Unpack
-{
-  template <class... Args,
-            class TS, class SLayout,
-            class TD, class DLayout>
-  CUTE_HOST_DEVICE friend constexpr void
-  copy_unpack(Copy_Traits<CopyOp, Args...> const& traits,
-              Tensor<TS,SLayout>           const& src,
-              Tensor<TD,DLayout>                & dst)
-  {
-    auto src_coord = src.data().coord_;
-    if constexpr (detail::is_prefetch<CopyOp>) {
-      return detail::explode_tuple(detail::CallCOPY<CopyOp>{},
-                                   traits.opargs_, tuple_seq<decltype(traits.opargs_)>{},
-                                   src_coord, tuple_seq<decltype(src_coord)>{});
-    } else {
-      static_assert(is_smem<TD>::value, "SM90_TMA_LOAD requires the destination be shared memory.");
-      void* dst_ptr = cute::raw_pointer_cast(dst.data());
-#if 0
-      auto [c0,c1,c2,c3,c4] = append<5>(src_coord, 0);
-      printf("THR (%d,%d,%d) BLK (%d,%d,%d) TMACRD (%d,%d,%d,%d,%d) SMEMADDR (%p)\n",
-            threadIdx.x, threadIdx.y, threadIdx.z,
-            blockIdx.x, blockIdx.y, blockIdx.z,
-            int32_t(c0), int32_t(c1), int32_t(c2), int32_t(c3), int32_t(c4), dst_ptr);
-#endif
-      return detail::explode_tuple(detail::CallCOPY<CopyOp>{},
-                                   traits.opargs_, tuple_seq<decltype(traits.opargs_)>{},
-                                   make_tuple(dst_ptr), seq<0>{},
-                                   src_coord, tuple_seq<decltype(src_coord)>{});
-    }
-  }
-};
-
-//////////////////////////////////////////////////////////////////////////////
-///////////////////////////// TMA_LOAD ///////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////
-
-struct SM90_TMA_LOAD_OP : SM90_TMA_LOAD {};
-
-// The non-executable SM90_TMA_LOAD with tma_desc and no tma_mbar
-// Use .with(tma_mbar) to construct an executable version
-template <class NumBitsPerTMA, class AuxParams_>
-struct Copy_Traits<SM90_TMA_LOAD, NumBitsPerTMA, AuxParams_>
-{
-  using ThrID     = Layout<_1>;
-  // Map from (src-thr,src-val) to bit
-  using SrcLayout = Layout<Shape<_1,NumBitsPerTMA>>;
-  // Map from (dst-thr,dst-val) to bit
-  using DstLayout = Layout<Shape<_1,NumBitsPerTMA>>;
-  // Reference map from (thr,val) to bit
-  using RefLayout = SrcLayout;
-
-  // SM90_TMA_LOAD arguments
-  TmaDescriptor tma_desc_;
-  using AuxParams = AuxParams_;
-  AuxParams aux_params_;
-
-  // Return TmaDescriptor/TensorMap
-  CUTE_HOST_DEVICE constexpr
-  TmaDescriptor const*
-  get_tma_descriptor() const {
-    return &tma_desc_;
-  }
-
-  // Construct an executable SM90_TMA_LOAD with tma_mbar
-  CUTE_HOST_DEVICE constexpr
-  Copy_Traits<SM90_TMA_LOAD_OP, NumBitsPerTMA>
-  with(
-    uint64_t& tma_mbar,
-    [[maybe_unused]] uint16_t const& multicast_mask = 0,
-    TMA::CacheHintSm90 const& cache_hint = TMA::CacheHintSm90::EVICT_NORMAL) const {
-    // We accept multicast_mask here to keep the API for both atoms consistent
-    return {{}, {&tma_desc_, &tma_mbar, static_cast<uint64_t>(cache_hint)}};
-  }
-
-  // Construct an executable SM90_TMA_LOAD with tma_mbar (temp. overloaded for grouped gemm/ptr array gemm)
-  CUTE_HOST_DEVICE constexpr
-  Copy_Traits<SM90_TMA_LOAD_OP, NumBitsPerTMA>
-  with(
-    TmaDescriptor const* new_tma_desc,
-    uint64_t& tma_mbar,
-    [[maybe_unused]] uint16_t const& multicast_mask = 0,
-    TMA::CacheHintSm90 const& cache_hint = TMA::CacheHintSm90::EVICT_NORMAL) const {
-    // We accept multicast_mask here to keep the API for both atoms consistent
-    return {{}, {new_tma_desc, &tma_mbar, static_cast<uint64_t>(cache_hint)}};
-  }
-
-  // Generate the TMA coord tensor
-  template <class GShape>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  get_tma_tensor(GShape const& g_shape) const {
-    static_assert(is_congruent<decltype(g_shape), decltype(aux_params_.g_stride_)>::value);
-    return make_counting_tensor(make_layout(g_shape, aux_params_.g_stride_));
-  }
-
-  // Don't try to execute a copy with SM90_TMA_LOAD before calling .with()
-  template <class TS, class SLayout,
-            class TD, class DLayout>
-  CUTE_HOST_DEVICE friend constexpr void
-  copy_unpack(Copy_Traits        const& traits,
-              Tensor<TS,SLayout> const& src,
-              Tensor<TD,DLayout>      & dst) = delete;
-};
-
-// The executable SM90_TMA_LOAD with tma_desc and tma_mbar
-template <class NumBitsPerTMA>
-struct Copy_Traits<SM90_TMA_LOAD_OP, NumBitsPerTMA>
-     : TMA_LOAD_Unpack<SM90_TMA_LOAD_OP>
-{
-  using ThrID     = Layout<_1>;
-  // Map from (src-thr,src-val) to bit
-  using SrcLayout = Layout<Shape<_1,NumBitsPerTMA>>;
-  // Map from (dst-thr,dst-val) to bit
-  using DstLayout = Layout<Shape<_1,NumBitsPerTMA>>;
-  // Reference map from (thr,val) to bit
-  using RefLayout = SrcLayout;
-
-  // SM90_TMA_LOAD arguments
-  tuple<
-  TmaDescriptor const*,
-  uint64_t*, // smem mbarrier
-  uint64_t   // cache hint
-  > const opargs_;
-};
-
-// The prefetch for SM90_TMA_LOAD with tma_desc
-template <class NumBitsPerTMA, class... Args>
-struct Copy_Traits<SM90_TMA_LOAD::PREFETCH, NumBitsPerTMA, Args...>
-     : TMA_LOAD_Unpack<SM90_TMA_LOAD::PREFETCH>
-{
-  using ThrID     = Layout<_1>;
-  // Map from (src-thr,src-val) to bit
-  using SrcLayout = Layout<Shape<_1,NumBitsPerTMA>>;
-  // Map from (dst-thr,dst-val) to bit
-  using DstLayout = Layout<Shape<_1,NumBitsPerTMA>>;
-  // Reference map from (thr,val) to bit
-  using RefLayout = SrcLayout;
-
-  // SM90_TMA_LOAD::PREFETCH arguments
-  tuple<TmaDescriptor const*> const opargs_;
-
-  // Construct with any other Traits' TMA Desc
-  template <class... CopyArgs>
-  CUTE_HOST_DEVICE
-  Copy_Traits(Copy_Traits<CopyArgs...> const& traits)
-    : opargs_({&traits.tma_desc_}) {}
-};
-
-//////////////////////////////////////////////////////////////////////////////
-///////////////////////////// TMA_LOAD_MULTICAST /////////////////////////////
-//////////////////////////////////////////////////////////////////////////////
-
-struct SM90_TMA_LOAD_MULTICAST_OP : SM90_TMA_LOAD_MULTICAST {};
-
-// The non-executable SM90_TMA_LOAD_MULTICAST with tma_desc and no tma_mbar
-// Use .with(tma_mbar, multicast_mask) to construct an executable version
-template <class NumBitsPerTMA, class AuxParams_>
-struct Copy_Traits<SM90_TMA_LOAD_MULTICAST, NumBitsPerTMA, AuxParams_>
-{
-  using ThrID     = Layout<_1>;
-  // Map from (src-thr,src-val) to bit
-  using SrcLayout = Layout<Shape<_1,NumBitsPerTMA>>;
-  // Map from (dst-thr,dst-val) to bit
-  using DstLayout = Layout<Shape<_1,NumBitsPerTMA>>;
-  // Reference map from (thr,val) to bit
-  using RefLayout = SrcLayout;
-
-  // SM90_TMA_LOAD_MULTICAST arguments
-  TmaDescriptor tma_desc_;
-  using AuxParams = AuxParams_;
-  AuxParams aux_params_;
-
-  // Return TmaDescriptor/TensorMap
-  CUTE_HOST_DEVICE constexpr
-  TmaDescriptor const*
-  get_tma_descriptor() const {
-    return &tma_desc_;
-  }
-
-  // Construct an executable SM90_TMA_LOAD_MULTICAST with tma_mbar
-  CUTE_HOST_DEVICE constexpr
-  Copy_Traits<SM90_TMA_LOAD_MULTICAST_OP, NumBitsPerTMA>
-  with(
-    uint64_t& tma_load_mbar,
-    uint16_t const& multicast_mask,
-    TMA::CacheHintSm90 const& cache_hint = TMA::CacheHintSm90::EVICT_NORMAL) const {
-    return {{}, {&tma_desc_, &tma_load_mbar, multicast_mask, static_cast<uint64_t>(cache_hint)}};
-  }
-
-  // Construct an executable SM90_TMA_LOAD_MULTICAST_OP with tma_mbar (temp. overloaded for grouped gemm/ptr array gemm)
-  CUTE_HOST_DEVICE constexpr
-  Copy_Traits<SM90_TMA_LOAD_MULTICAST_OP, NumBitsPerTMA>
-  with(
-    TmaDescriptor const* new_tma_desc,
-    uint64_t& tma_load_mbar,
-    uint16_t const& multicast_mask,
-    TMA::CacheHintSm90 const& cache_hint = TMA::CacheHintSm90::EVICT_NORMAL) const {
-    return {{}, {new_tma_desc, &tma_load_mbar, multicast_mask, static_cast<uint64_t>(cache_hint)}};
-  }
-
-  // Generate the TMA coord tensor
-  template <class GShape>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  get_tma_tensor(GShape const& g_shape) const {
-    static_assert(is_congruent<decltype(g_shape), decltype(aux_params_.g_stride_)>::value);
-    return make_counting_tensor(make_layout(g_shape, aux_params_.g_stride_));
-  }
-
-  // Don't try to execute a copy with SM90_TMA_LOAD_MULTICAST before calling .with()
-  template <class TS, class SLayout,
-            class TD, class DLayout>
-  CUTE_HOST_DEVICE friend constexpr void
-  copy_unpack(Copy_Traits        const& traits,
-              Tensor<TS,SLayout> const& src,
-              Tensor<TD,DLayout>      & dst) = delete;
-};
-
-// The executable SM90_TMA_LOAD_MULTICAST with tma_desc and tma_mbar and multicast_mask
-template <class NumBitsPerTMA>
-struct Copy_Traits<SM90_TMA_LOAD_MULTICAST_OP, NumBitsPerTMA>
-     : TMA_LOAD_Unpack<SM90_TMA_LOAD_MULTICAST_OP>
-{
-  using ThrID     = Layout<_1>;
-  // Map from (src-thr,src-val) to bit
-  using SrcLayout = Layout<Shape<_1,NumBitsPerTMA>>;
-  // Map from (dst-thr,dst-val) to bit
-  using DstLayout = Layout<Shape<_1,NumBitsPerTMA>>;
-  // Reference map from (thr,val) to bit
-  using RefLayout = SrcLayout;
-
-  // SM90_TMA_LOAD_MULTICAST arguments
-  tuple<
-  TmaDescriptor const*,
-  uint64_t*, // smem mbarrier
-  uint16_t,  // multicast mask
-  uint64_t   // cache hint
-  > const opargs_;
-};
-
-//////////////////////////////////////////////////////////////////////////////
-///////////////////////////// TMA_STORE //////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////
-
-// Utility for unpacking TMA_STORE arguments into a CopyOp
-template <class CopyOp>
-struct TMA_STORE_Unpack
-{
-  template <class... Args,
-            class TS, class SLayout,
-            class TD, class DLayout>
-  CUTE_HOST_DEVICE friend constexpr void
-  copy_unpack(Copy_Traits<CopyOp, Args...> const& traits,
-              Tensor<TS,SLayout>           const& src,
-              Tensor<TD,DLayout>                & dst)
-  {
-    static_assert(is_smem<TS>::value, "Expected smem src for SM90_TMA_STORE");
-
-    void const* const desc_ptr = traits.tma_desc_;
-    void const* const src_ptr  = cute::raw_pointer_cast(src.data());
-    auto dst_coord = dst.data().coord_;
-#if 0
-    auto [c0,c1,c2,c3,c4] = append<5>(dst_coord, 0);
-    printf("THR (%d,%d,%d) BLK (%d,%d,%d) TMACRD (%d,%d,%d,%d,%d) SMEMADDR (%p)\n",
-           threadIdx.x, threadIdx.y, threadIdx.z,
-           blockIdx.x, blockIdx.y, blockIdx.z,
-           int32_t(c0), int32_t(c1), int32_t(c2), int32_t(c3), int32_t(c4), src_ptr);
-#endif
-    return detail::explode_tuple(detail::CallCOPY<SM90_TMA_STORE>{},
-                                 make_tuple(desc_ptr, src_ptr), seq<0,1>{},
-                                 dst_coord, tuple_seq<decltype(dst_coord)>{});
-  }
-};
-
-struct SM90_TMA_STORE_OP : SM90_TMA_STORE {};
-
-// The executable SM90_TMA_STORE with tma_desc
-template <class NumBitsPerTMA, class AuxParams_>
-struct Copy_Traits<SM90_TMA_STORE, NumBitsPerTMA, AuxParams_>
-{
-  using ThrID     = Layout<_1>;
-  // Map from (src-thr,src-val) to bit
-  using SrcLayout = Layout<Shape<_1,NumBitsPerTMA>>;
-  // Map from (dst-thr,dst-val) to bit
-  using DstLayout = Layout<Shape<_1,NumBitsPerTMA>>;
-  // Reference map from (thr,val) to bit
-  using RefLayout = SrcLayout;
-
-  // SM90_TMA_STORE arguments
-  TmaDescriptor tma_desc_;
-  using AuxParams = AuxParams_;
-  AuxParams aux_params_;
-
-  // Return TmaDescriptor/TensorMap
-  CUTE_HOST_DEVICE constexpr
-  TmaDescriptor const*
-  get_tma_descriptor() const {
-    return &tma_desc_;
-  }
-
-  // Generate the TMA coord tensor
-  template <class GShape>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  get_tma_tensor(GShape const& g_shape) const {
-    static_assert(is_congruent<decltype(g_shape), decltype(aux_params_.g_stride_)>::value);
-    return make_counting_tensor(make_layout(g_shape, aux_params_.g_stride_));
-  }
-
-  template <class TS, class SLayout,
-            class TD, class DLayout>
-  CUTE_HOST_DEVICE friend constexpr void
-  copy_unpack(Copy_Traits        const& traits,
-              Tensor<TS,SLayout> const& src,
-              Tensor<TD,DLayout>      & dst)
-  {
-    static_assert(is_smem<TS>::value, "Expected smem src for SM90_TMA_STORE");
-    //static_assert(is_gmem<TD>::value, "Expected gmem dst for SM90_TMA_STORE");  // TMA spoofed src tensor
-
-    void const* const desc_ptr = &(traits.tma_desc_);
-    void const* const src_ptr  = cute::raw_pointer_cast(src.data());
-    auto dst_coord = dst.data().coord_;
-#if 0
-    auto [c0,c1,c2,c3,c4] = append<5>(dst_coord, 0);
-    printf("THR (%d,%d,%d) BLK (%d,%d,%d) TMACRD (%d,%d,%d,%d,%d) SMEMADDR (%p)\n",
-           threadIdx.x, threadIdx.y, threadIdx.z,
-           blockIdx.x, blockIdx.y, blockIdx.z,
-           int32_t(c0), int32_t(c1), int32_t(c2), int32_t(c3), int32_t(c4), src_ptr);
-#endif
-    return detail::explode_tuple(detail::CallCOPY<SM90_TMA_STORE>{},
-                                 make_tuple(desc_ptr, src_ptr), seq<0,1>{},
-                                 dst_coord, tuple_seq<decltype(dst_coord)>{});
-  }
-
-  // Construct Copy_Traits executable (w/ swapped out TMA descriptor) for SM90_TMA_STORE (for grouped gemm/ptr array gemm)
-  CUTE_HOST_DEVICE constexpr
-  Copy_Traits<SM90_TMA_STORE_OP, NumBitsPerTMA>
-  with(TmaDescriptor const* new_tma_desc) const {
-    return {{}, new_tma_desc};
-  }
-};
-
-// The executable SM90_TMA_STORE with tma_desc
-template <class NumBitsPerTMA>
-struct Copy_Traits<SM90_TMA_STORE_OP, NumBitsPerTMA>
-     : TMA_STORE_Unpack<SM90_TMA_STORE_OP>
-{
-  using ThrID     = Layout<_1>;
-  // Map from (src-thr,src-val) to bit
-  using SrcLayout = Layout<Shape<_1,NumBitsPerTMA>>;
-  // Map from (dst-thr,dst-val) to bit
-  using DstLayout = Layout<Shape<_1,NumBitsPerTMA>>;
-  // Reference map from (thr,val) to bit
-  using RefLayout = SrcLayout;
-
-  // SM90_TMA_STORE arguments
-  TmaDescriptor const* tma_desc_;
-};
-
-//////////////////////////////////////////////////////////////////////////////
-///////////////////////////// TMA_REDUCE_ADD //////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////
-
-// The executable SM90_TMA_REDUCE_ADD with tma_desc
-template <class NumBitsPerTMA, class AuxParams_>
-struct Copy_Traits<SM90_TMA_REDUCE_ADD, NumBitsPerTMA, AuxParams_>
-{
-  using ThrID   = Layout<_1>;
-
-  // Map from (src-thr,src-val) to bit
-  using SrcLayout = Layout<Shape<_1,NumBitsPerTMA>>;
-  // Map from (dst-thr,dst-val) to bit
-  using DstLayout = Layout<Shape<_1,NumBitsPerTMA>>;
-
-  // Reference map from (thr,val) to bit
-  using RefLayout = SrcLayout;
-
-  // SM90_TMA_REDUCE_ADD arguments
-  TmaDescriptor tma_desc_;
-  using AuxParams = AuxParams_;
-  AuxParams aux_params_;
-
-  // Return TmaDescriptor/TensorMap
-  CUTE_HOST_DEVICE constexpr
-  TmaDescriptor const*
-  get_tma_descriptor() const {
-    return &tma_desc_;
-  }
-
-  // Generate the TMA coord tensor
-  template <class GShape>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  get_tma_tensor(GShape const& g_shape) const {
-    static_assert(is_congruent<decltype(g_shape), decltype(aux_params_.g_stride_)>::value);
-    return make_counting_tensor(make_layout(g_shape, aux_params_.g_stride_));
-  }
-
-  template <class Coord, int... Is>
-  CUTE_HOST_DEVICE constexpr
-  void
-  copy_unpack_(void const* const src_ptr,
-               Coord const& dst_coord, seq<Is...>) const
-  {
-#if 0
-    auto [c0,c1,c2,c3,c4] = append<5>(dst_coord, 0);
-    printf("THR (%d,%d,%d) BLK (%d,%d,%d) TMACRD (%d,%d,%d,%d,%d) SMEMADDR (%p)\n",
-           threadIdx.x, threadIdx.y, threadIdx.z,
-           blockIdx.x, blockIdx.y, blockIdx.z,
-           int32_t(c0), int32_t(c1), int32_t(c2), int32_t(c3), int32_t(c4), src_ptr);
-#endif
-
-    SM90_TMA_REDUCE_ADD::copy(&tma_desc_,
-                         src_ptr, get<Is>(dst_coord)...);
-  }
-
-  // This is the copy_unpack dispatch for this Copy_Traits
-  // Src needs to be a smem tensor
-  // Dst needs to be a gmem tensor with TmaCoordIterator .data()
-  template <class TS, class SLayout,
-            class TD, class DLayout>
-  CUTE_HOST_DEVICE friend constexpr
-  void
-  copy_unpack(Copy_Traits        const& traits,
-              Tensor<TS,SLayout> const& src,
-              Tensor<TD,DLayout>      & dst)
-  {
-    static_assert(is_smem<TS>::value, "Expected smem src for SM90_TMA_REDUCE_ADD");
-    //static_assert(is_gmem<TD>::value, "Expected gmem dst for SM90_TMA_REDUCE_ADD");  // TMA spoofed src tensor
-
-    traits.copy_unpack_(cute::raw_pointer_cast(src.data()), dst.data().coord_, tuple_seq<decltype(dst.data().coord_)>{});
-  }
-};
-
-//////////////////////////////////////////////////////////////////////////////
-///////////////////////////// BULK COPY //////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////
-
-template <class NumBitsPerTMA, class... OpArgs>
-struct Copy_Traits<SM90_BULK_COPY_G2S, NumBitsPerTMA, OpArgs...>
-{
-  static_assert(int32_t(NumBitsPerTMA::value / 8) % 16 == 0,
-                "Bulk Copy requires copy vector size align to 16B.");
-
-  using ThrID = Layout<_1>;
-  // Map from (src-thr,src-val) to bit
-  using SrcLayout = Layout<Shape<_1,NumBitsPerTMA>>;
-  // Map from (dst-thr,dst-val) to bit
-  using DstLayout = Layout<Shape<_1,NumBitsPerTMA>>;
-  // Reference map from (thr,val) to bit
-  using RefLayout = SrcLayout;
-
-  // SM90_BULK_COPY_G2S arguments
-  // 0: uint64_t* bulk_load_memory_barrier
-  cute::tuple<OpArgs...> bulk_load_mbar_;
-
-  // Record the memory barrier for the instruction
-  CUTE_HOST_DEVICE constexpr
-  Copy_Traits<SM90_BULK_COPY_G2S, NumBitsPerTMA, uint64_t*>
-  with(uint64_t& bulk_mbar) const {
-    return {{&bulk_mbar}};
-  }
-
-  template <class TS, class SLayout,
-            class TD, class DLayout>
-  CUTE_HOST_DEVICE friend constexpr
-  void
-  copy_unpack(Copy_Traits        const& traits,
-              Tensor<TS,SLayout> const& src,
-              Tensor<TD,DLayout>      & dst)
-  {
-    static_assert(is_same<cute::tuple<OpArgs...>, cute::tuple<uint64_t*>>::value,
-                  "Extra arguments not set. Set .with() before use.");
-    static_assert(is_gmem<TS>::value, "Expected gmem src for SM90_BULK_COPY_G2S");
-    static_assert(is_smem<TD>::value, "Expected smem dst for SM90_BULK_COPY_G2S");
-    SM90_BULK_COPY_G2S::copy(raw_pointer_cast(src.data()), get<0>(traits.bulk_load_mbar_),
-                             raw_pointer_cast(dst.data()), int32_t(NumBitsPerTMA::value / 8));
-  }
-};
-
-template <class NumBitsPerTMA, class... Args>
-struct Copy_Traits<SM90_BULK_COPY_G2S::PREFETCH, NumBitsPerTMA, Args...>
-     : Copy_Traits<SM90_BULK_COPY_G2S, NumBitsPerTMA>
-{
-  template <class... CopyArgs>
-  CUTE_HOST_DEVICE
-  Copy_Traits(Copy_Traits<CopyArgs...> const& traits) {}
-
-  template <class TS, class SLayout,
-            class TD, class DLayout>
-  CUTE_HOST_DEVICE friend constexpr
-  void
-  copy_unpack(Copy_Traits        const& traits,
-              Tensor<TS,SLayout> const& src,
-              Tensor<TD,DLayout>      & dst)
-  {
-    static_assert(is_gmem<TS>::value, "Expected gmem src for SM90_BULK_PREFETCH");
-    SM90_BULK_COPY_G2S::PREFETCH::copy(raw_pointer_cast(src.data()), int32_t(NumBitsPerTMA::value / 8));
-  }
-};
-
-template <class NumBitsPerTMA>
-struct Copy_Traits<SM90_BULK_COPY_S2G, NumBitsPerTMA>
-{
-  static_assert(int32_t(NumBitsPerTMA::value / 8) % 16 == 0,
-                "Bulk Copy requires copy vector size align to 16B.");
-
-  using ThrID = Layout<_1>;
-  // Map from (src-thr,src-val) to bit
-  using SrcLayout = Layout<Shape<_1,NumBitsPerTMA>>;
-  // Map from (dst-thr,dst-val) to bit
-  using DstLayout = Layout<Shape<_1,NumBitsPerTMA>>;
-  // Reference map from (thr,val) to bit
-  using RefLayout = SrcLayout;
-
-  template <class TS, class SLayout,
-            class TD, class DLayout>
-  CUTE_HOST_DEVICE friend constexpr
-  void
-  copy_unpack(Copy_Traits        const& traits,
-              Tensor<TS,SLayout> const& src,
-              Tensor<TD,DLayout>      & dst)
-  {
-    static_assert(is_smem<TS>::value, "Expected smem src for SM90_BULK_COPY_S2G");
-    static_assert(is_gmem<TD>::value, "Expected gmem dst for SM90_BULK_COPY_S2G");
-    SM90_BULK_COPY_S2G::copy(raw_pointer_cast(src.data()), raw_pointer_cast(dst.data()), int32_t(NumBitsPerTMA::value / 8));
-  }
-};
-
-//
-// Placeholder for the bulk copy algorithm's default, auto-vectorizing behavior
-//
-
-template <class... OpArgs>
-struct Copy_Traits<SM90_BULK_COPY_AUTO, OpArgs...>
-{
-  // Logical thread id to thread idx (one-thread)
-  using ThrID = Layout<_1>;
-  // Map from (src-thr,src-val) to bit
-  using SrcLayout = Layout<Shape<_1,_1>, Stride<_0,_0>>;
-  // Map from (dst-thr,dst-val) to bit
-  using DstLayout = Layout<Shape<_1,_1>, Stride<_0,_0>>;
-  // Reference map from (thr,val) to bit
-  using RefLayout = SrcLayout;
-
-  // SM90_UBULK_COPY arguments
-  // 0: uint64_t* bulk_load_memory_barrier [if this is a BULK_LOAD_G2S]
-  cute::tuple<OpArgs...> opargs_;
-
-  // Record the memory barrier for the instruction
-  CUTE_HOST_DEVICE constexpr
-  Copy_Traits<SM90_BULK_COPY_AUTO, uint64_t*>
-  with(uint64_t& bulk_mbar) const {
-    return {{&bulk_mbar}};
-  }
-};
-
-//
-// MAKE_TMA_COPY and related
-//
-
-namespace detail {
-
-// Custom version of coalesce that greedily combines modes only up to size-256
-// Look at each element and the back of the stack (in order of priority)
-// back(NewLayout)  get<I>(OldLayout)
-//      s0:d0           _1:d1     =>  continue
-//      _1:d0           s1:d1     =>  replace_back     s1:d1
-//      s0:d0           s1:s0*d0  =>  replace_back  s0*s1:d0   if s0*s1 <= 256
-//      s0:d0           s1:d1     =>  append           s1:d1
-//
-// @pre OldShape and OldStride are flat
-template <int I, class OldShape, class OldStride, class NewShape, class NewStride>
-CUTE_HOST_DEVICE constexpr
-auto
-coalesce_256_impl(OldShape const& old_shape, OldStride const& old_stride,
-                  NewShape const& new_shape, NewStride const& new_stride)
-{
-  if constexpr (I == rank_v<OldShape>) {
-    // Base case, we're done
-    if constexpr (is_constant<1, NewShape>::value) {
-      return Layout<_1,_0>{};
-    } else {
-      return Layout<NewShape,NewStride>{new_shape,new_stride};
-    }
-  } else if constexpr (is_constant<1, decltype(get<I>(old_shape))>::value) {
-    // shape<I>(layout) == _1, skip it and continue
-    return coalesce_256_impl<I+1>(old_shape, old_stride, new_shape, new_stride);
-  } else if constexpr (is_constant<1, NewShape>::value) {
-    // Replace our shape-1 with anything (Can only happen on input new_shape/new_stride)
-    return coalesce_256_impl<I+1>(old_shape, old_stride, get<I>(old_shape), get<I>(old_stride));
-  } else if constexpr (is_constant<true, decltype(back(new_shape) * back(new_stride) == get<I>(old_stride) &&
-                                                  get<I>(old_shape) * back(new_shape) <= Int<256>{})>::value) {
-    // Merge modes because the shapes and strides match and the merge is 256 or less
-    return coalesce_256_impl<I+1>(old_shape, old_stride,
-                                  replace_back(new_shape, get<I>(old_shape) * back(new_shape)),
-                                  new_stride);
-  } else {
-    // Can't replace or merge, so append a new mode
-    return coalesce_256_impl<I+1>(old_shape, old_stride,
-                                  append(new_shape,  get<I>(old_shape)),
-                                  append(new_stride, get<I>(old_stride)));
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-// Combine all the modes that are possible to combine
-// Does not respect the profile of the layout, but does preserve total size
-template <class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-coalesce_256(Layout<Shape,Stride> const& layout)
-{
-  auto flat_shape  = flatten(layout.shape());
-  auto flat_stride = flatten(layout.stride());
-  return coalesce_256_impl<1>(flat_shape, flat_stride, get<0>(flat_shape), get<0>(flat_stride));
-}
-
-template <class TmaInternalType,
-          class GEngine, class GLayout,
-          class SShape, class SStride,
-          class VShape, class VStride>
-CUTE_HOST_DEVICE constexpr
-auto
-construct_tma_gbasis(Tensor<GEngine,GLayout> const& gtensor,       // The original GMEM Tensor
-                     Layout<SShape,SStride>  const& slayout,       // The layout of SMEM
-                     Layout<VShape,VStride>  const& cta_v_map)     // smem_idx to hier gmode
-{
-  //
-  // TMA parameter checking
-  //
-
-  // CUTE_STATIC_ASSERT_V(product_each(shape(slayout)) == product_each(shape(cta_v_map)),
-  //                      "TMA requires CTA_Tile and SLayout top-level shape equivalence.");
-  CUTE_STATIC_ASSERT_V(size(slayout) == size(cta_v_map),
-                       "TMA requires CTA_Tile and SLayout top-level size equivalence.");
-
-#if 0
-  print("gtensor         : "); print(gtensor); print("\n");
-  print("slayout         : "); print(slayout); print("\n");
-  print("cta_v_map       : "); print(cta_v_map); print("\n");
-#endif
-
-  //
-  // TMA slayout manipulation
-  //
-
-  // Invert the smem to get the largest contiguous vector in the smem layout
-  // smem idx -> smem coord
-  auto inv_smem_layout = right_inverse(get_nonswizzle_portion(slayout));
-
-  // Compose with the V-Map to convert smem coord (CTA val idx) to gmem mode
-  // smem idx -> gmem mode
-  auto sidx2gmode_full = coalesce(composition(cta_v_map, inv_smem_layout));
-
-#if 0
-  print("inv_smem_layout : "); print(inv_smem_layout); print("\n");
-  print("sidx2gmode_full : "); print(sidx2gmode_full); print("\n");
-#endif
-
-  //
-  // TMA gtensor truncation
-  //
-
-  // Truncate any incompatibilities -- no starting in the middle of gmodes
-  auto smem_rank = find_if(stride(sidx2gmode_full), [](auto e) {
-    [[maybe_unused]] auto v = basis_value(e);
-    return not is_constant<1,decltype(v)>{};
-  });
-  static_assert(smem_rank > 0, "Could not find a common tile-gmem vectorization. Does the Tile select out major GMEM modes?");
-
-  // Keep only the static-1 basis modes into gmem
-  auto sidx2gmode = take<0,smem_rank>(sidx2gmode_full);
-
-#if 0
-  print("smem_rank  : "); print(smem_rank); print("\n");
-  print("sidx2gmode : "); print(sidx2gmode); print("\n");
-#endif
-
-  //
-  // TMA gtensor manipulation
-  //
-
-  // The smem vector is the same units as gtensor, so compose first and then recast
-  // tma_val_idx:gmem_strides
-  auto tile_gstride = recast<TmaInternalType>(gtensor.compose(sidx2gmode)).layout();
-  // Coalesce modes up to size-256 (the maximum TMA box extent in units of TmaInternalType)
-  // tma_box_shape:gmem_strides
-  auto tma_gstride  = coalesce_256(tile_gstride);
-
-  // Perform the tiling, recast, and coalesce to the gmem vector again, but with indirections to the gtensor modes
-  auto gbasis = make_identity_layout(shape(gtensor));
-  auto tile_gbasis_tmp = gbasis.compose(sidx2gmode);
-
-  // Instead of the recast (gbasis doesn't have type info), replace the shape with the already-recasted shape
-  // tma_box_shape:gmem_mode
-  auto tile_gbasis = make_layout(shape(tile_gstride), stride(tile_gbasis_tmp));
-
-  // "Coalesce" the tile basis into a compatible shape with the tma_gstride
-  auto tma_gbasis_tile = tile_gbasis.compose(make_layout(wrap(shape(tma_gstride))));
-
-  // Recast the original tensor for shape/stride inspections
-  Tensor gtensor_T = recast<TmaInternalType>(gtensor);
-
-  // Find missing bases that don't appear in tile_gbasis
-  auto tile_gbasis_remaining_stride = filter_tuple(flatten(shape (gtensor_T)), flatten(stride(gtensor_T)),
-                                                   flatten(stride(gbasis)),
-                                                   [&](auto s, auto d, auto e)
-  {
-    if constexpr (is_constant<1, decltype(s)>::value || is_constant<0, decltype(d)>::value) {
-      return cute::tuple<>{};          // If size-1 or stride-0, then don't append
-    } else {
-      using E = decltype(e);
-      auto has_e = any_of(flatten(stride(tma_gbasis_tile)), [] (auto tb) { return tb == E{}; });
-      if constexpr (decltype(has_e)::value) {
-        return cute::tuple<>{};        // If d was found, then don't append
-      } else {
-        return cute::tuple<E>(e);      // Else, this is missing so append
-      }
-    }
-  });
-
-  // Append the remaining basis modes that contribute to the TMA with size-1
-  auto tile_gbasis_remaining_shape = repeat<rank(tile_gbasis_remaining_stride)>(Int<1>{});
-  auto tma_gbasis_full = make_layout(tuple_cat(wrap( shape(tma_gbasis_tile)), wrap(tile_gbasis_remaining_shape )),
-                                     tuple_cat(wrap(stride(tma_gbasis_tile)), wrap(tile_gbasis_remaining_stride)));
-
-  // Group the trailing modes to make this max rank-5 -- TMA rank limitation
-  // tma_box_shape:gmem_mode
-  auto tma_gbasis = group<cute::min(rank(tma_gbasis_full),4),-1>(tma_gbasis_full);
-
-#if 0
-  print("tile_gstride : "); print(tile_gstride); print("\n");
-  print("tma_gstride  : "); print(tma_gstride); print("\n");
-  print("gbasis       : "); print(gbasis); print("\n");
-  print("tile_gbasis  : "); print(tma_gbasis_tile); print("\n");
-  print("tma_gbasis   : "); print(tma_gbasis); print("\n");
-#endif
-
-  return tma_gbasis;
-}
-
-template <class GEngine, class GLayout,
-          class TmaGmemBasisStride,
-          class ShapeT, size_t TmaRank>
-CUTE_HOST_DEVICE constexpr
-void
-fill_tma_gmem_shape_stride(Tensor<GEngine,GLayout>   const& gtensor,           // Gmem Shapes and Strides, in units of TmaInternalType
-                           TmaGmemBasisStride        const& tma_gbasis_stride, // Map Tma mode idx -> Gmem mode(s)
-                           cute::array<ShapeT,   TmaRank> & gmem_prob_shape,   // Tma Shapes, uint32_t or uin64_t
-                           cute::array<uint64_t, TmaRank> & gmem_prob_stride)  // Tma Strides
-{
-  static_assert(is_tuple<TmaGmemBasisStride>::value);
-  static_assert(is_same<uint32_t, ShapeT>::value || is_same<uint64_t, ShapeT>::value);
-
-  using TmaInternalType = typename GEngine::value_type;
-  constexpr int tma_rank = decltype(rank(tma_gbasis_stride))::value;
-  static_assert(TmaRank >= tma_rank);
-
-  auto gmem_shape  =  shape(gtensor);
-  auto gmem_stride = stride(gtensor);
-  // Use the indirections in tma_gbasis_stride into gtensor to construct the tma gmem shapes/strides
-  for_each(make_seq<tma_rank>{}, [&](auto i) {
-    constexpr int tma_i_rank = decltype(rank<i>(tma_gbasis_stride))::value;
-    if constexpr (tma_i_rank == 1) {
-      // Trivial contribution of this gmem mode to this tma mode
-      auto ej = unwrap(get<i>(tma_gbasis_stride));
-      gmem_prob_shape[i]  = basis_get(ej, gmem_shape);
-      gmem_prob_stride[i] = basis_get(ej, gmem_stride);
-    } else {
-      // Apply a recurrence to each gmem mode that contributes to this tma mode
-      for_each(get<i>(tma_gbasis_stride), [&](auto ej) {
-        // Problem shape
-        uint64_t shape_j  = basis_get(ej, gmem_shape);
-        // Problem stride (in bytes)
-        uint64_t stride_j = basis_get(ej, gmem_stride);
-        uint64_t old_stride = gmem_prob_stride[i];
-        gmem_prob_stride[i] = gcd(gmem_prob_stride[i], stride_j);
-
-        if (gmem_prob_stride[i] != 0) {
-          // Recurrence: g_shape = (s_i - 1) * (d_i / gcd_j d_j) + 1
-          gmem_prob_shape[i] = (gmem_prob_shape[i]-1) * (old_stride / gmem_prob_stride[i])
-                             +            (shape_j-1) * (stride_j   / gmem_prob_stride[i])
-                             + 1;
-        } else {
-          gmem_prob_shape[i] = shape_j;
-        }
-      });
-    }
-  });
-}
-
-// Overload for an existing Copy_Traits
-template <class GEngine, class GLayout,
-          class Op, class Bits, class Aux,
-          class ShapeT, size_t TmaRank>
-CUTE_HOST_DEVICE constexpr
-void
-fill_tma_gmem_shape_stride(Copy_Traits<Op,Bits,Aux>  const& tma_traits,
-                           Tensor<GEngine,GLayout>   const& gtensor,           // Gmem Shapes and Strides, value_type = TmaInternalType
-                           cute::array<ShapeT,   TmaRank> & gmem_prob_shape,   // Tma Shapes, uint32_t or uin64_t
-                           cute::array<uint64_t, TmaRank> & gmem_prob_stride)  // Tma Strides
-{
-  return fill_tma_gmem_shape_stride(gtensor, stride(typename Aux::TmaGmemBasis{}),
-                                    gmem_prob_shape, gmem_prob_stride);
-}
-
-// Use a sidx2gmode to read through the GMEM tensor
-//   and construct a TMA Descriptor for the resulting instruction
-// At the same time, construct the Tma Tensor's Stride to generate
-//   the TMA coordinates that the instruction consumes.
-//
-template <class TmaInternalType,
-          class GEngine, class GLayout,
-          class TShape, class TStride,
-          int B, int M, int S>
-CUTE_HOST_RTC
-auto
-make_tma_copy_desc(Tensor<GEngine,GLayout> const& gtensor,         // The original GMEM Tensor
-                   Layout<TShape,TStride>  const& tma_gbasis,      // TMA mode -> GMEM mode mapping
-                   Swizzle<B,M,S>          const& swizzle,         // Swizzle fn on smem_idx
-                   uint32_t                       num_multicast)   // The number of CTAs in multicasting
-{
-  //
-  // TMA desc creation
-  //
-
-  constexpr int tma_dim = decltype(rank(tma_gbasis))::value;
-
-  //
-  // TMA gmem desc info
-  //
-
-  // Recast the original tensor for shape/stride inspections
-  Tensor gtensor_T = recast<TmaInternalType>(gtensor);
-
-  void* gmem_address = (void*) raw_pointer_cast(gtensor_T.data());
-  auto  gmem_layout  = gtensor_T.layout();
-
-  cute::array<uint64_t, 5> gmem_prob_shape  = {1,1,1,1,1};
-  cute::array<uint64_t, 5> gmem_prob_stride = {0,0,0,0,0};
-
-  fill_tma_gmem_shape_stride(gtensor_T, stride(tma_gbasis), gmem_prob_shape, gmem_prob_stride);
-
-  assert((reinterpret_cast<uint64_t>(gmem_address) & 0b1111) == 0);  // Address must be 16B-aligned
-
-  assert(gmem_prob_shape[0] >= (uint64_t(1)));               // Size must be min 1
-  assert(gmem_prob_shape[0] <= (uint64_t(1) << 32));         // Size must be max 2^32
-  assert(gmem_prob_shape[1] >= (uint64_t(1)));               // Size must be min 1
-  assert(gmem_prob_shape[1] <= (uint64_t(1) << 32));         // Size must be max 2^32
-  assert(gmem_prob_shape[2] >= (uint64_t(1)));               // Size must be min 1
-  assert(gmem_prob_shape[2] <= (uint64_t(1) << 32));         // Size must be max 2^32
-  assert(gmem_prob_shape[3] >= (uint64_t(1)));               // Size must be min 1
-  assert(gmem_prob_shape[3] <= (uint64_t(1) << 32));         // Size must be max 2^32
-  assert(gmem_prob_shape[4] >= (uint64_t(1)));               // Size must be min 1
-  assert(gmem_prob_shape[4] <= (uint64_t(1) << 32));         // Size must be max 2^32
-
-  // TMA descriptor does not store the zeroth stride and assumes it is 1 (TmaInternalType element).
-  assert(gmem_prob_stride[0] == 1 && "Majorness of smem doesn't match majorness of gmem");
-
-  // convert strides to byte strides
-  for(uint64_t& stride : gmem_prob_stride) {
-    stride = (stride * sizeof_bits_v<TmaInternalType>) / 8;
-  }
-
-  // Assert the byte strides. Tma Descriptor uses byte strides
-  assert((gmem_prob_stride[1]) < (uint64_t(1) << 40));       // Stride must be max 2^40
-  assert((gmem_prob_stride[1] & 0b1111) == 0);               // Stride must be multiple of 16B (128b)
-  assert((gmem_prob_stride[2]) < (uint64_t(1) << 40));       // Stride must be max 2^40
-  assert((gmem_prob_stride[2] & 0b1111) == 0);               // Stride must be multiple of 16B (128b)
-  assert((gmem_prob_stride[3]) < (uint64_t(1) << 40));       // Stride must be max 2^40
-  assert((gmem_prob_stride[3] & 0b1111) == 0);               // Stride must be multiple of 16B (128b)
-  assert((gmem_prob_stride[4]) < (uint64_t(1) << 40));       // Stride must be max 2^40
-  assert((gmem_prob_stride[4] & 0b1111) == 0);               // Stride must be multiple of 16B (128b)
-
-  //
-  // TMA smem desc info
-  //
-
-  cute::array<uint32_t, 5> smem_box_shape  = {1,1,1,1,1};
-  cute::array<uint32_t, 5> smem_box_stride = {1,1,1,1,1};
-  // The smem box is simply given by the sizes of the modes in tma_gbasis
-  for_each(make_seq<tma_dim>{}, [&](auto i) {
-    smem_box_shape[i] *= size<i>(tma_gbasis);
-  });
-  // Finally, truncate the tma box by the num_multicast
-  for (uint32_t i = tma_dim-1, multicast = num_multicast; multicast > 1; --i) {
-    assert(smem_box_shape[i] % multicast == 0 || multicast % smem_box_shape[i] == 0);
-    uint32_t new_mult = ceil_div(multicast, smem_box_shape[i]);
-    smem_box_shape[i] = ceil_div(smem_box_shape[i], multicast);
-    multicast = new_mult;
-  }
-
-  assert(smem_box_shape[0] >= (uint32_t(1)));                // Size must be min 1
-  assert(smem_box_shape[0] <= (uint32_t(1) << 8));           // Size must be max 2^8 = 256
-  assert(smem_box_shape[1] >= (uint32_t(1)));                // Size must be min 1
-  assert(smem_box_shape[1] <= (uint32_t(1) << 8));           // Size must be max 2^8 = 256
-  assert(smem_box_shape[2] >= (uint32_t(1)));                // Size must be min 1
-  assert(smem_box_shape[2] <= (uint32_t(1) << 8));           // Size must be max 2^8 = 256
-  assert(smem_box_shape[3] >= (uint32_t(1)));                // Size must be min 1
-  assert(smem_box_shape[3] <= (uint32_t(1) << 8));           // Size must be max 2^8 = 256
-  assert(smem_box_shape[4] >= (uint32_t(1)));                // Size must be min 1
-  assert(smem_box_shape[4] <= (uint32_t(1) << 8));           // Size must be max 2^8 = 256
-
-  assert(smem_box_stride[0] >= (uint32_t(1)));               // Stride must be min 1
-  assert(smem_box_stride[0] <= (uint32_t(8)));               // Stride must be max 2^3 = 8
-  assert(smem_box_stride[1] >= (uint32_t(1)));               // Stride must be min 1
-  assert(smem_box_stride[1] <= (uint32_t(8)));               // Stride must be max 2^3 = 8
-  assert(smem_box_stride[2] >= (uint32_t(1)));               // Stride must be min 1
-  assert(smem_box_stride[2] <= (uint32_t(8)));               // Stride must be max 2^3 = 8
-  assert(smem_box_stride[3] >= (uint32_t(1)));               // Stride must be min 1
-  assert(smem_box_stride[3] <= (uint32_t(8)));               // Stride must be max 2^3 = 8
-  assert(smem_box_stride[4] >= (uint32_t(1)));               // Stride must be min 1
-  assert(smem_box_stride[4] <= (uint32_t(8)));               // Stride must be max 2^3 = 8
-
-    //
-    // Construct the descriptor
-    //
-
-    TmaDescriptor tma_desc{};
-
-    //
-    // TMA general info
-    //
-
-  #if (__CUDACC_VER_MAJOR__ >= 12) && !defined(__CUDACC_RTC__)
-
-    CUtensorMapDataType     tma_format      = TMA::to_CUtensorMapDataType<TmaInternalType>();
-    CUtensorMapInterleave   tma_interleave  = CU_TENSOR_MAP_INTERLEAVE_NONE;
-    CUtensorMapL2promotion  tma_l2Promotion = CU_TENSOR_MAP_L2_PROMOTION_L2_128B;
-    CUtensorMapFloatOOBfill tma_oobFill     = CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE;
-
-    // TMA smem swizzle type
-    TMA::SmemSwizzleBits swizzle_bits = get_tma_swizzle_bits(swizzle);
-    TMA::SmemSwizzleBase swizzle_base = get_tma_swizzle_base(swizzle);
-    CUtensorMapSwizzle smem_swizzle = TMA::to_CUtensorMapSwizzle(swizzle_bits, swizzle_base);
-    CUresult result = CUTLASS_CUDA_DRIVER_WRAPPER_CALL(cuTensorMapEncodeTiled)(
-        &tma_desc,
-        tma_format,
-        tma_dim,
-        gmem_address,
-        gmem_prob_shape.data(),
-        gmem_prob_stride.data() + 1,  // gmem_prob_stride[0] implicitly 1
-        smem_box_shape.data(),
-        smem_box_stride.data(),
-        tma_interleave,
-        smem_swizzle,
-        tma_l2Promotion,
-        tma_oobFill);
-
-    if (result != CUDA_SUCCESS) {
-      std::cerr << "TMA Desc Addr:   " << &tma_desc
-                << "\nformat         " << tma_format
-                << "\ndim            " << tma_dim
-                << "\ngmem_address   " << gmem_address
-                << "\nglobalDim      " << gmem_prob_shape
-                << "\nglobalStrides  " << gmem_prob_stride
-                << "\nboxDim         " << smem_box_shape
-                << "\nelementStrides " << smem_box_stride
-                << "\ninterleave     " << tma_interleave
-                << "\nswizzle        " << smem_swizzle
-                << "\nl2Promotion    " << tma_l2Promotion
-                << "\noobFill        " << tma_oobFill << std::endl;
-      std::cerr << "Error: Failed to initialize the TMA descriptor " << result << std::endl;
-      assert(false);
-    }
-
-  #endif // (__CUDACC_VER_MAJOR__ >= 12) && !defined(__CUDACC_RTC__)
-  auto recast_ratio = cute::trait_ratio(sizeof_bits<typename GEngine::value_type>{},
-                                        sizeof_bits<             TmaInternalType>{});
-
-  auto gbasis = make_basis_like(shape(gtensor));
-
-  // Finally, get the inverse permutation of the E<i> bases for the mocked gmem stride
-  auto gmem_tma_basis_stride = transform_leaf(gbasis, [&](auto ei) {
-    auto si = basis_get(ei,  shape(gmem_layout));
-    auto di = basis_get(ei, stride(gmem_layout));
-    if constexpr (is_constant<1, decltype(si)>::value || is_constant<0, decltype(di)>::value) {
-      return Int<0>{};                  // If size-1 or stride-0, return arithmetic identity -- no contribution to the TMA
-    } else {
-      auto tma_gmem_basis_stride = stride(tma_gbasis);
-      // Find j such that E<i> is in stride<j>(tma_gbasis)
-      using EI = decltype(ei);
-      [[maybe_unused]] auto j = find_if(tma_gmem_basis_stride, [&](auto tma_stride_j) { return any_of(tma_stride_j, [&](auto dj) { return dj == EI{}; }); });
-      if constexpr (decltype(j == rank(tma_gmem_basis_stride))::value) {
-        return Int<0>{};               // If not-found, return arithmetic identity -- no contribution to the TMA
-      } else
-      if constexpr (decltype(j == Int<0>{})::value) {
-        auto scale = recast_ratio * basis_get(ei, stride(gtensor));
-        return E<j>{} * scale;         // Return TMA Coord basis -- with a recast scale factor
-      } else
-      if constexpr (decltype(rank<j>(tma_gmem_basis_stride) == Int<1>{})::value) {
-        return E<j>{};                 // Return TMA Coord basis -- known scale of Int<1>{}
-      } else {
-        int32_t scale = ceil_div(int32_t(di * sizeof_bits_v<TmaInternalType> / cute::max(gmem_prob_stride[j], uint64_t{16})), 8);
-        return E<j>{} * scale;         // Return TMA Coord basis -- with a dynamic scale factor
-      }
-    }
-  });
-
-#if 0
-    print("gmem_tma_basis_stride : "); print(gmem_tma_basis_stride); print("\n");
-#endif
-
-  using AuxParams = AuxTmaParams<decltype(gmem_tma_basis_stride),
-                                 decltype(tma_gbasis),
-                                 decltype(swizzle)>;
-  return cute::make_tuple(tma_desc, AuxParams{gmem_tma_basis_stride});
-}
-
-template <class TmaInternalType,
-          class CopyOp,
-          class GEngine, class GLayout,
-          class SLayout,
-          class VShape, class VStride>
-CUTE_HOST_RTC
-auto
-make_tma_copy_atom(CopyOp,
-                   Tensor<GEngine,GLayout> const& gtensor,       // Full GMEM Tensor
-                   SLayout                 const& slayout,       // CTA Tile of SMEM, potentially swizzled
-                   uint32_t                const& num_multicast, // The number of CTAs involved in multicasting
-                   Layout<VShape,VStride>  const& cta_v_map)     // V: CTA val idx -> gmem mode
-{
-  //
-  // TMA truncated layout
-  //
-
-  auto smem_swizzle = get_swizzle_portion(slayout);
-  auto smem_layout  = get_nonswizzle_portion(slayout);
-
-  auto tma_gbasis = detail::construct_tma_gbasis<TmaInternalType>(gtensor, smem_layout, cta_v_map);
-
-  //
-  // Construct the TMA Desc and the strides of the TMA Tensor
-  //
-
-  auto [tma_desc, aux_params] = detail::make_tma_copy_desc<TmaInternalType>(gtensor,
-                                                                            tma_gbasis,
-                                                                            smem_swizzle,
-                                                                            num_multicast);
-
-  //
-  // Construct the Copy_Traits
-  //
-
-  constexpr int num_bits_per_tma = size(tma_gbasis) * sizeof_bits_v<TmaInternalType>;
-  using Traits = Copy_Traits<CopyOp, cute::C<num_bits_per_tma>, decltype(aux_params)>;
-  using Atom   = Copy_Atom<Traits, typename GEngine::value_type>;
-
-  Traits tma_traits{tma_desc, aux_params};
-
-#if 0
-  print("num_bits_per_tma :  "); print(num_bits_per_tma); print("\n");
-  print("g_stride_bases   :  "); print(tma_traits.aux_params_.g_stride_); print("\n");
-#endif
-
-  // Return the Copy_Atom
-  return Atom{tma_traits};
-}
-
-// The "logical TMA tid" is a map from the CTA rank to its logical id
-// within the instruction.  It works like a mask or ordering on the
-// CTAs.  For non-multicast TMA, all CTAs should map to 0.  For
-// multicast TMA of size 4, CTAs will be mapped to {0,1,2,3}.
-template <class TmaInternalType,
-          class CopyOp,
-          class GEngine, class GLayout,
-          class SLayout,
-          class TShape, class TStride,
-          class VShape, class VStride>
-CUTE_HOST_RTC
-auto
-make_tma_copy_tiled(CopyOp                  const& copy_op,
-                    Tensor<GEngine,GLayout> const& gtensor,     // Full GMEM Tensor
-                    SLayout                 const& slayout,     // CTA Tile of SMEM
-                    Layout<TShape,TStride>  const& cta_t_map,   // T: CTA thr idx -> logical TMA tid
-                    Layout<VShape,VStride>  const& cta_v_map)   // V: CTA val idx -> gmem mode
-{
-  Copy_Atom atom = make_tma_copy_atom<TmaInternalType>(copy_op, gtensor, slayout,
-                                                       cosize(cta_t_map), cta_v_map);
-
-  //
-  // Construct the TiledCopy
-  //
-
-  [[maybe_unused]] auto cta_tiler = product_each(shape(cta_v_map));
-
-  auto num_elems_per_tma = size<1>(typename decltype(atom)::RefLayout{}) / static_value<sizeof_bits<typename GEngine::value_type>>();
-
-  // smem idx -> smem coord
-  auto inv_smem_layout = right_inverse(get_nonswizzle_portion(slayout));
-  // CTA V -> smem_coord
-  auto layout_v = composition(inv_smem_layout, num_elems_per_tma);
-  // Scale that up to cover all of the smem_coords
-  auto layout_V = tile_to_shape(make_layout(layout_v), size(cta_v_map));
-  // CTA T -> smem idx
-  auto layout_t = make_layout(cosize(cta_t_map), shape_div(num_elems_per_tma, cosize(cta_t_map)));
-  // CTA TID -> smem coord
-  auto layout_T = composition(inv_smem_layout, composition(layout_t, cta_t_map));
-  // Combine with the T mapping
-  [[maybe_unused]] auto layout_TV = make_layout(layout_T, layout_V);
-
-#if 0
-  print("cta_tiler : "); print(cta_tiler); print("\n");
-  print("layout_v : "); print(layout_v); print("\n");
-  print("layout_V : "); print(layout_V); print("\n");
-  print("layout_t : "); print(layout_t); print("\n");
-  print("layout_T : "); print(layout_T); print("\n");
-  print("layout_TV : "); print(layout_TV); print("\n");
-#endif
-
-  return TiledCopy<decltype(atom), decltype(layout_TV), decltype(cta_tiler)>{atom};
-}
-
-} // end namespace detail
-
-/** Make a CuTe CTA-collective TiledCopy for a TMA operation.
- *
- * @param CopyOp The target copy operation: SM90_TMA_LOAD, SM90_TMA_LOAD_MULTICAST, SM90_TMA_STORE
- * @param gtensor The GMEM Tensor to be involved in the TMA.
- * @param slayout The SMEM Layout to be involved in the TMA.
- * @param cta_tile The CTA-local tile that each CTA will be tiling GMEM with.
- *                 This is often the blk_shape that is used to tile the GMEM for CTAs:
- *                   local_tile(gtensor, blk_shape, blk_coord) -> CTA-local tile of gtensor
- * @param cluster_size When using SM90_TMA_LOAD_MULTICAST, this can be a (static) power-of-2 <= 16
- *                   defining the multicast size (used to further partition the SMEM)
- *                 Else, static-1
- *
- * This code attempts to maximize the TMA box size. It does this by tracing
- * the SMEM "vector" -- the inverse of the smem layout -- to find the largest
- * contiguous array of smem that can be written to/from global memory given
- * the constraints that the TMA instruction imposes.
- *
- * This is accomplished by assigning "basis" strides to the GMEM to track which
- * modes of SMEM map to which modes of GMEM, then reorder the modes of GMEM according
- * to the SMEM vector, and then using those GMEM/SMEM modes to fill in the desc.
- *
- * Examples:
-     using T = float;
-     T* gptr = nullptr;
-
-    {
-    // Simple 2D
-    Tensor gtensor = make_tensor(gptr, make_shape(1024, 256), GenRowMajor{}); // K-Major GMEM
-    auto slayout   = make_layout(make_shape(_64{}, _32{}), GenRowMajor{});    // K-Major SMEM
-    auto tma = make_tma_copy(SM90_TMA_LOAD{}, gtensor, slayout);
-    }
-
-    {
-    // GMMA 2D
-    Tensor gtensor = make_tensor(gptr, make_shape(1024, 256));                                 // MN-Major GMEM
-    auto slayout   = tile_to_shape(GMMA::Layout_MN_SW128_Atom<T>{}, make_shape(_128{},_64{})); // MN-Major Swizzled+Tiled 128x64 SMEM
-    auto tma = make_tma_copy(SM90_TMA_LOAD{}, gtensor, slayout);
-    }
-
-    {
-    // 3D
-    Tensor gtensor = make_tensor(gptr, make_shape(1024, 32, 512), make_stride(64, Int<1>{}, 65536)); // GMEM
-    auto slayout   = make_layout(make_shape(_16{}, _8{}, _2{}), make_stride(_16{}, _1{}, _8{}));     // SMEM w/ same major-mode
-    auto tma = make_tma_copy(SM90_TMA_LOAD{}, gtensor, slayout);
-    }
-
-    {
-    // cuTENSOR 4D
-    auto layout = make_shape(make_shape(32,40),make_shape(make_shape(8,8),656)); // GMEM
-    auto cta_tile    = make_shape(_128{},make_shape(_32{},_2{}));                // GMEM Tiling:
-                                                                                 //   Take 128-elem from m: m0 must divide 128,
-                                                                                 //                         m-last may be predicated
-                                                                                 //   Take 32-elem from k0, 2-elem from k1
-    auto slayout = make_layout(cta_tile);                                        // Col-Major SMEM
-    auto tma = make_tma_copy(SM90_TMA_LOAD{}, gtensor, slayout, cta_tile, Int<1>{});
-    }
- *
- * Check the TMA box size and desc:
-    print("TMA Box size:  "); print(typename decltype(tma)::Tiler_MN{}); print("\n");
-    print("TMA desc     : "); print(tma.tma_desc_); print("\n");
- *
- * Usage:
-     Tensor mA = tma_a.get_tma_tensor(make_shape(M,N));        // (M,N) TMA coord tensor
-     Tensor gA = local_tile(mA, cta_tile, cta_coord);          // (BLK_M,BLK_N) TMA coord tensor for this CTA
-     Tensor sA = make_tensor(make_smem_ptr<T>(sptr), slayout); // (BLK_M,BLK_N) SMEM tensor
-
-     auto cta_tma = tma.get_slice(cta_idx_in_cluster);         // Slice for multicast partitioning
-     Tensor tAgA = cta_tma.partition_S(gA);                    // Partition for src
-     Tensor tAsA = cta_tma.partition_D(sA);                    // Partition for dst
-
-     copy(tma.with(barrier, mcast_mask), tAgA, tAsA);          // copy with supporting TMA params
- */
-template <class TmaInternalType = void,
-          class CopyOp,
-          class GEngine, class GLayout,
-          class SLayout,
-          class CTA_Tiler,
-          class Cluster_Size>
-CUTE_HOST_RTC
-auto
-make_tma_copy(CopyOp                  const& copy_op,
-              Tensor<GEngine,GLayout> const& gtensor,
-              SLayout                 const& slayout,
-              CTA_Tiler               const& cta_tiler,
-              Cluster_Size            const& cluster_size)
-{
-  if constexpr (cute::is_same_v<CopyOp, SM90_TMA_LOAD_IM2COL> ||
-                cute::is_same_v<CopyOp, SM90_TMA_STORE_IM2COL>) {
-    return make_im2col_tma_copy(copy_op,
-                                gtensor,
-                                slayout,
-                                cta_tiler,
-                                cluster_size);
-  } else {
-    auto cta_v_tile = make_identity_layout(shape(gtensor)).compose(cta_tiler);
-    auto cta_t_tile = make_layout(cluster_size);
-    // Prefer TmaInternalType if specified. Fallback to GEngine::value_type
-    using TmaType = conditional_t<is_same<void, TmaInternalType>::value, typename GEngine::value_type, TmaInternalType>;
-    return detail::make_tma_copy_tiled<TmaType>(copy_op,
-                                                gtensor, slayout,
-                                                cta_t_tile, cta_v_tile);
-  }
-}
-
-// Explicit defaulting
-template <class CopyOp,
-          class GEngine, class GLayout,
-          class SLayout>
-CUTE_HOST_RTC
-auto
-make_tma_copy(CopyOp                  const& copy_op,
-              Tensor<GEngine,GLayout> const& gtensor,
-              SLayout                 const& slayout)
-{
-  return make_tma_copy(copy_op, gtensor, slayout, product_each(shape(slayout)), Int<1>{});
-}
-
-// Explicit defaulting
-template <class CopyOp,
-          class GEngine, class GLayout,
-          class SLayout,
-          class Cluster_Size>
-CUTE_HOST_RTC
-auto
-make_tma_copy(CopyOp                  const& copy_op,
-              Tensor<GEngine,GLayout> const& gtensor,
-              SLayout                 const& slayout,
-              Cluster_Size            const& cluster_size)
-{
-  return make_tma_copy(copy_op, gtensor, slayout, product_each(shape(slayout)), cluster_size);
-}
-
-////////////////////////////////////
-// Experimental Make TMA Atom and Partitioner
-///////////////////////////////////
-
-template <class TmaInternalType = void,
-          class CopyOp,
-          class GEngine, class GLayout,
-          class SLayout,
-          class CTA_Tiler,
-          class Cluster_Size = Int<1>>
-CUTE_HOST_RTC
-auto
-make_tma_atom(CopyOp                  const& copy_op,
-              Tensor<GEngine,GLayout> const& gtensor,
-              SLayout                 const& slayout,
-              CTA_Tiler               const& cta_tiler,
-              Cluster_Size            const& cluster_size = {})
-{
-  auto cta_v_tile = make_identity_layout(shape(gtensor)).compose(cta_tiler);
-  // Prefer TmaInternalType if specified. Fallback to GEngine::value_type
-  using TmaType = conditional_t<is_same<void, TmaInternalType>::value, typename GEngine::value_type, TmaInternalType>;
-  return detail::make_tma_copy_atom<TmaType>(copy_op,
-                                             gtensor, slayout,
-                                             size(cluster_size), cta_v_tile);
-}
-
-// The "VectorCopy Partitioner" for TMA
-template <class... Args,
-          class CtaCoord,
-          class TShape, class TStride,
-          class SEngine, class SLayout,
-          class GEngine, class GLayout>
-CUTE_DEVICE
-auto
-tma_partition(Copy_Atom<Args...>      const& copy_atom,
-              CtaCoord                const& cta_coord,
-              Layout<TShape,TStride>  const& cta_layout,  // T: CTA coord -> logical multicast id
-              Tensor<SEngine,SLayout> const& stensor,     // SMEM Tensor (TMATile, Rest...)
-              Tensor<GEngine,GLayout> const& gtensor)     // GMEM Tensor (TMATile, Rest...)
-{
-  CUTE_STATIC_ASSERT_V(size<0>(stensor) == size<0>(gtensor));
-
-  // Invert the smem to get the largest contiguous vector in the smem layout
-  Layout inv_smem_layout = right_inverse(get_nonswizzle_portion(layout<0>(stensor)));
-  // Scale that up to cover all of the smem_coords
-  Layout layout_v = tile_to_shape(make_layout(inv_smem_layout), size<0>(stensor));
-
-  // Factor out the single-instrucion portion
-  Layout tma_layout_v = make_layout(Int<Copy_Atom<Args...>::NumValSrc>{});
-  auto layout_V = make_tile(logical_divide(layout_v, tma_layout_v));
-
-  // Append with _ until we cover all Rest... modes
-  auto glayout_V = append<GLayout::rank>(layout_V, _);
-  auto slayout_V = append<SLayout::rank>(layout_V, _);
-  // Transform tile mode and coalesce
-  Tensor gtensor_v = coalesce(gtensor.compose(glayout_V), Shape<Shape<_1,_1>>{});    // ((TMA,TMA_Iter), Rest...)
-  Tensor stensor_v = coalesce(stensor.compose(slayout_V), Shape<Shape<_1,_1>>{});    // ((TMA,TMA_Iter), Rest...)
-
-#if 0
-  if (thread0()) {
-    print("cta_coord  : "); print(cta_coord); print("\n");
-    print("cta_layout : "); print(cta_layout); print("\n");
-    print("gtensor   : "); print(gtensor); print("\n");
-    print("stensor   : "); print(stensor); print("\n");
-    print("layout_V  : "); print(layout_V); print("\n");
-    print("gtensor_v : "); print(gtensor_v); print("\n");
-    print("stensor_v : "); print(stensor_v); print("\n");
-  }
-#endif
-
-  // Offset inside the TMA-mode for the multicast
-  auto multicast_offset = cta_layout(cta_coord) * (size(tma_layout_v) / cosize(cta_layout));
-  auto multicast_coord  = make_coord(make_coord(multicast_offset, Int<0>{}));
-  auto gcoord = append<GLayout::rank>(multicast_coord, Int<0>{});
-  auto scoord = append<SLayout::rank>(multicast_coord, Int<0>{});
-
-  Tensor gresult = domain_offset(gcoord, gtensor_v);
-  Tensor sresult = domain_offset(scoord, stensor_v);
-
-  return cute::make_tuple(gresult, sresult);
-}
-
-// TMA Multicast Masks Calculation
-template <int Mode, class CtaLayout, class CtaCoord>
-CUTE_HOST_DEVICE constexpr
-auto
-create_tma_multicast_mask(CtaLayout const& cta_layout_vmnk,
-                          CtaCoord  const& cta_coord_vmnk)
-{
-  auto cta_coord_slicer = replace<Mode>(cta_coord_vmnk, _);
-  auto [cta_layout, elected_cta] = slice_and_offset(cta_coord_slicer, cta_layout_vmnk);
-  // Get the instruction code
-  uint16_t mcast_mask = 0;
-  for (int i = 0; i < size(cta_layout); ++i) {
-    mcast_mask |= uint16_t(1) << cta_layout(i);
-  }
-  // Shift by the instruction's elected block rank (dynamic)
-  mcast_mask <<= elected_cta;
-  return mcast_mask;
-}
-
-////////////////////////////////////
-// Make TMA copy A/B/C
-///////////////////////////////////
-
-template <class TmaInternalType = void,
-          class CopyOp,
-          class GEngine, class GLayout,
-          class SLayout,
-          class CTA_Tiler,
-          class Cluster_Size>
-CUTE_HOST_RTC
-auto
-make_tma_copy_A_sm90(CopyOp                  const& copy_op,
-                     Tensor<GEngine,GLayout> const& gtensor,
-                     SLayout                 const& slayout,
-                     CTA_Tiler               const& cta_tiler,
-                     Cluster_Size            const& cluster_size)
-{
-  // Keep only MK modes from MNK
-  auto cta_tiler_mk = remove<1>(cta_tiler);
-
-  // mcast along N mode for this M load, if any
-  auto cluster_size_n = size<1>(cluster_size);
-
-  if constexpr (cute::is_same_v<CopyOp, SM90_TMA_LOAD_IM2COL>) {
-    return make_im2col_tma_copy(copy_op,
-                                gtensor,
-                                slayout,
-                                cta_tiler_mk,
-                                cluster_size_n);
-  } else {
-    auto cta_v_tile = make_identity_layout(shape(gtensor)).compose(cta_tiler_mk);
-    auto cta_t_tile = make_layout(cluster_size_n);
-
-    // Prefer TmaInternalType if specified. Fallback to GEngine::value_type
-    using TmaType = conditional_t<is_same<void, TmaInternalType>::value, typename GEngine::value_type, TmaInternalType>;
-    auto tma_copy = detail::make_tma_copy_tiled<TmaType>(copy_op, gtensor, slayout, cta_t_tile, cta_v_tile);
-    return tma_copy;
-  }
-}
-
-template <class TmaInternalType = void,
-          class CopyOp,
-          class GEngine, class GLayout,
-          class SLayout,
-          class CTA_Tiler,
-          class Cluster_Size>
-CUTE_HOST_RTC
-auto
-make_tma_copy_B_sm90(CopyOp                  const& copy_op,
-                     Tensor<GEngine,GLayout> const& gtensor,
-                     SLayout                 const& slayout,
-                     CTA_Tiler               const& cta_tiler,
-                     Cluster_Size            const& cluster_size)
-{
-  // Keep only NK modes from MNK
-  auto cta_tiler_nk = remove<0>(cta_tiler);
-
-  // mcast along M mode for this N load, if any
-  auto cluster_size_m = size<0>(cluster_size);
-
-  if constexpr (cute::is_same_v<CopyOp, SM90_TMA_LOAD_IM2COL>) {
-    return make_im2col_tma_copy(copy_op,
-                                gtensor,
-                                slayout,
-                                cta_tiler_nk,
-                                cluster_size_m);
-  } else {
-    auto cta_v_tile = make_identity_layout(shape(gtensor)).compose(cta_tiler_nk);
-    auto cta_t_tile = make_layout(cluster_size_m);
-
-    // Prefer TmaInternalType if specified. Fallback to GEngine::value_type
-    using TmaType = conditional_t<is_same<void, TmaInternalType>::value, typename GEngine::value_type, TmaInternalType>;
-    auto tma_copy = detail::make_tma_copy_tiled<TmaType>(copy_op, gtensor, slayout, cta_t_tile, cta_v_tile);
-    return tma_copy;
-  }
-}
-
-template <class TmaInternalType = void,
-          class CopyOp,
-          class GEngine, class GLayout,
-          class SLayout,
-          class CTA_Tiler>
-CUTE_HOST_RTC
-auto
-make_tma_copy_C_sm90(CopyOp                  const& copy_op,
-                     Tensor<GEngine,GLayout> const& gtensor,
-                     SLayout                 const& slayout,
-                     CTA_Tiler               const& cta_tiler)
-{
-  // Keep only MN modes from MNK
-  auto cta_tiler_mn = remove<2>(cta_tiler);
-
-  if constexpr (cute::is_same_v<CopyOp, SM90_TMA_LOAD_IM2COL> ||
-      cute::is_same_v<CopyOp, SM90_TMA_STORE_IM2COL>) {
-    return make_im2col_tma_copy(copy_op,
-                                gtensor,
-                                slayout,
-                                cta_tiler_mn,
-                                _1{});
-  } else {
-    auto cta_v_tile = make_identity_layout(shape(gtensor)).compose(cta_tiler_mn);
-
-    // No multicast, so only 1 CTA involved
-    auto cta_t_map = Layout<_1,_0>{};
-
-    // Prefer TmaInternalType if specified. Fallback to GEngine::value_type
-    using TmaType = conditional_t<is_same<void, TmaInternalType>::value, typename GEngine::value_type, TmaInternalType>;
-    auto tma_copy = detail::make_tma_copy_tiled<TmaType>(copy_op, gtensor, slayout, cta_t_map, cta_v_tile);
-    return tma_copy;
-  }
-}
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm90_tma_swizzle.hpp b/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm90_tma_swizzle.hpp
deleted file mode 100755
index 3286e72b3..000000000
--- a/lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm90_tma_swizzle.hpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-/// @file copy_traits_sm90_tma_swizzle.hpp
-/// @brief Functions for converting swizzle layout to TMA descriptor
-
-#if !defined(__CUDACC_RTC__)
-#include <cuda.h>
-#endif
-
-#include <cute/arch/copy_sm90_desc.hpp>
-#include <cute/swizzle_layout.hpp>
-
-namespace cute::detail {
-
-template <int B, int M, int S>
-CUTE_HOST_DEVICE constexpr
-TMA::SmemSwizzleBits
-get_tma_swizzle_bits(Swizzle<B,M,S>)
-{
-  if constexpr (M == 4) {
-    switch (B) {
-      default:  static_assert(0 <= B && B <= 3, "Expected B = 0,1,2, or 3 when M == 4. Unsupported layout swizzle.");
-      case 3:   return TMA::SmemSwizzleBits::B128;
-      case 2:   return TMA::SmemSwizzleBits::B64;
-      case 1:   return TMA::SmemSwizzleBits::B32;
-      case 0:   return TMA::SmemSwizzleBits::DISABLE;
-    }
-  } else
-  {
-    static_assert(M < 0, "Unsupported layout swizzle.");
-  }
-}
-
-template <class Layout>
-TMA::SmemSwizzleBits
-get_tma_swizzle_bits(Layout const& layout)
-{
-  return get_tma_swizzle_bits(get_swizzle_portion(layout));
-}
-
-template <int B, int M, int S>
-CUTE_HOST_DEVICE constexpr
-TMA::SmemSwizzleBase
-get_tma_swizzle_base(Swizzle<B,M,S>)
-{
-  if constexpr (M == 4) {
-    static_assert(0 <= B && B <= 3, "Expected B = 0,1,2, or 3 when M == 4. Unsupported layout swizzle.");
-    static_assert(S == 3, "Expected S = 3 when M == 4. Unsupported layout swizzle.");
-    return TMA::SmemSwizzleBase::SWIZZLE_BASE_16B;
-  } 
-  else {
-    static_assert(M == 4, "Expected 128b=16B=(2^4)B base swizzle.");
-  }
-}
-
-template <class Layout>
-TMA::SmemSwizzleBase
-get_tma_swizzle_base(Layout const& layout)
-{
-  return get_tma_swizzle_base(get_swizzle_portion(layout));
-}
-
-} // namespace cute::detail
diff --git a/lightllm-kernel/cutlass/include/cute/atom/mma_atom.hpp b/lightllm-kernel/cutlass/include/cute/atom/mma_atom.hpp
deleted file mode 100755
index bf4082743..000000000
--- a/lightllm-kernel/cutlass/include/cute/atom/mma_atom.hpp
+++ /dev/null
@@ -1,1117 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>
-#include <cute/arch/mma.hpp>
-#include <cute/atom/mma_traits.hpp>
-#include <cute/tensor_impl.hpp>
-#include <cute/util/type_traits.hpp>
-
-namespace cute {
-
-template <class... Args>
-struct MMA_Atom;
-
-template <class MMAOperation>
-struct MMA_Atom<MMAOperation> : MMA_Atom<MMA_Traits<MMAOperation>>
-{};
-
-template <class MMAOperation, class... Args>
-struct MMA_Atom<MMA_Traits<MMAOperation, Args...>>
-  : MMA_Traits<MMAOperation, Args...>
-{
-  using MMA_Op = MMAOperation;
-  using Traits = MMA_Traits<MMAOperation, Args...>;
-
-  // Element value types from the MMA_Traits
-  using ValTypeD = typename Traits::ValTypeD;
-  using ValTypeA = typename Traits::ValTypeA;
-  using ValTypeB = typename Traits::ValTypeB;
-  using ValTypeC = typename Traits::ValTypeC;
-
-  // Thr-Val layouts from the MMA_Traits
-  using Shape_MNK  = typename Traits::Shape_MNK;
-  using ThrID      = typename Traits::ThrID;
-  using LayoutC_TV = typename Traits::CLayout;
-  using LayoutA_TV = typename Traits::ALayout;
-  using LayoutB_TV = typename Traits::BLayout;
-
-  // Fragment value types from the MMA_Traits (optional, defaults to Val type)
-  using FrgTypeD = typename detail::FrgTypeC_or_Default<Traits>::type;
-  using FrgTypeA = typename detail::FrgTypeA_or_Default<Traits>::type;
-  using FrgTypeB = typename detail::FrgTypeB_or_Default<Traits>::type;
-  using FrgTypeC = typename detail::FrgTypeC_or_Default<Traits>::type;
-
-  // Additional Trait parameters/transformations
-  template <class... TraitsArgs>
-  CUTE_HOST_DEVICE
-  auto
-  with(TraitsArgs&&... args) const {
-    auto traits = Traits::with(static_cast<TraitsArgs&&>(args)...);
-    return MMA_Atom<decltype(traits)>{traits};
-  }
-
-  //
-  // Tensor call interfaces
-  //
-
-  // Cast, check, and call fma
-  template <class TD, class DLayout,
-            class TA, class ALayout,
-            class TB, class BLayout,
-            class TC, class CLayout>
-  CUTE_HOST_DEVICE constexpr
-  void
-  call(Tensor<TD, DLayout>      & D,
-       Tensor<TA, ALayout> const& A,
-       Tensor<TB, BLayout> const& B,
-       Tensor<TC, CLayout> const& C) const
-  {
-    static_assert(DLayout::rank == 1, "Expected rank-1 D tensor");
-    static_assert(ALayout::rank == 1, "Expected rank-1 A tensor");
-    static_assert(BLayout::rank == 1, "Expected rank-1 B tensor");
-    static_assert(CLayout::rank == 1, "Expected rank-1 C tensor");
-
-    return mma_unpack(static_cast<Traits const&>(*this), D, A, B, C);
-  }
-
-  // Three arguments reproduces C
-  template <class TA, class ALayout,
-            class TB, class BLayout,
-            class TC, class CLayout>
-  CUTE_HOST_DEVICE constexpr
-  void
-  call(Tensor<TA, ALayout> const& A,
-       Tensor<TB, BLayout> const& B,
-       Tensor<TC, CLayout>      & C) const
-  {
-    return call(C, A, B, C);
-  }
-
-  //
-  // make_fragment_A|B|C
-  //   These functions are awkward as they expect already-partitioned tensors
-  //     resulting from a previous call to partition_A|B|C
-  //   The reasoning is that we can inspect the layout of the partitioned data
-  //     and attempt to match it in generated fragment to promote vectorization
-  //     when copying from partition to fragment.
-  //
-
-  template <class CTensor>
-  CUTE_HOST_DEVICE static constexpr
-  auto
-  make_fragment_C(CTensor&& ctensor)
-  {
-    // Check that this tensor is likely already partitioned
-    CUTE_STATIC_ASSERT_V(rank(ctensor) >= Int<3>{});  // VMN
-    CUTE_STATIC_ASSERT_V(size<0>(ctensor) == size<1>(LayoutC_TV{}));
-    // C is a bit special because we are after accumulators here
-    // The input/output type doesn't have to match the accumulator type
-    //static_assert(std::is_same<ValTypeC, typename remove_cvref_t<CTensor>::value_type>::value, "Expecting ValTypeC type");
-
-    // We'll never base the accumulator layout on the input tensor layout, so just return a FrgTypeC tensor
-    return make_tensor<FrgTypeC>(shape(ctensor));
-  }
-
-  template <class ATensor>
-  CUTE_HOST_DEVICE static constexpr
-  auto
-  make_fragment_A(ATensor&& atensor)
-  {
-    // Check that this tensor is likely already partitioned
-    CUTE_STATIC_ASSERT_V(rank(atensor) >= Int<3>{});  // VMK
-    CUTE_STATIC_ASSERT_V(size<0>(atensor) == size<1>(LayoutA_TV{}));
-
-    if constexpr (has_dereference<FrgTypeA>::value) {
-      // If the intended FrgTypeA is a view (of the current tensor), forward the whole
-      static_assert(is_same<ValTypeA, typename remove_cvref_t<ATensor>::value_type>::value
-                      , "Expecting ValTypeA type");
-      return make_tensor<FrgTypeA>(static_cast<ATensor&&>(atensor));
-    } else {
-      // Else, the intended FrgTypeA is a value type, construct a new tensor with a fragment layout
-      return make_fragment_like<FrgTypeA>(atensor);
-    }
-
-    CUTE_GCC_UNREACHABLE;
-  }
-
-  template <class BTensor>
-  CUTE_HOST_DEVICE static constexpr
-  auto
-  make_fragment_B(BTensor&& btensor)
-  {
-    // Check that this tensor is likely already partitioned
-    CUTE_STATIC_ASSERT_V(rank(btensor) >= Int<3>{});  // VNK
-    CUTE_STATIC_ASSERT_V(size<0>(btensor) == size<1>(LayoutB_TV{}));
-
-    if constexpr (has_dereference<FrgTypeB>::value) {
-      // If the intended FrgTypeB is a view (of the current tensor), forward the whole
-      static_assert(is_same<ValTypeB, typename remove_cvref_t<BTensor>::value_type>::value
-                      , "Expecting ValTypeB type");
-      return make_tensor<FrgTypeB>(static_cast<BTensor&&>(btensor));
-    } else {
-      // Else, the intended FrgTypeB is a value type, construct a new tensor with a fragment layout
-      return make_fragment_like<FrgTypeB>(btensor);
-    }
-
-    CUTE_GCC_UNREACHABLE;
-  }
-};
-
-//
-// A tiling of mma atoms
-//
-
-template <class TiledMMA, class ThrCoord>
-struct ThrMMA;
-
-// @tparam MMA_Atom The MMA_Atom to use in the TiledMMA
-// @tparam AtomLayoutMNK The MNK-tiling of the Atom to be performed.
-// @tparam PermuationsMNK Permutations to apply to each MNK-mode before tiling for the Atom.
-template <class MMA_Atom,
-          class AtomLayoutMNK,
-          class PermutationMNK = Tile<Underscore,Underscore,Underscore>>
-struct TiledMMA : MMA_Atom
-{
-  using Atom           = MMA_Atom;
-  using AtomShape_MNK  = typename MMA_Atom::Shape_MNK;
-  using AtomThrID      = typename MMA_Atom::ThrID;
-  using AtomLayoutC_TV = typename MMA_Atom::LayoutC_TV;
-  using AtomLayoutA_TV = typename MMA_Atom::LayoutA_TV;
-  using AtomLayoutB_TV = typename MMA_Atom::LayoutB_TV;
-
-  static_assert(   rank_v<AtomLayoutMNK>  == 3,   "TiledMMA requires rank-3 AtomLayoutMNK");
-  static_assert(   rank_v<PermutationMNK> == 3,   "TiledMMA requires rank-3 PermutationMNK");
-  static_assert( is_tuple<PermutationMNK>::value, "TiledMMA requires independent permutations of MNK.");
-  static_assert(is_static<PermutationMNK>::value, "TiledMMA requires static permutations of MNK.");
-
-  using ThrLayoutVMNK = decltype(tiled_product(AtomThrID{}, AtomLayoutMNK{}));
-  ThrLayoutVMNK thr_layout_vmnk_;
-
-  CUTE_HOST_DEVICE constexpr
-  TiledMMA(MMA_Atom const& mma_atom = {}, AtomLayoutMNK const& thr_layout_mnk = {})
-    : MMA_Atom(mma_atom),
-      thr_layout_vmnk_(tiled_product(AtomThrID{}, thr_layout_mnk)) {}
-
-  CUTE_HOST_DEVICE constexpr auto
-  get_thr_layout_vmnk() const {
-    return thr_layout_vmnk_;
-  }
-
-  // Tile a tensor or a layout from shape
-  //   (M,N,...)
-  // to shape
-  //   ((ThrV,(ThrM,ThrN)),(FrgV,(RestM,RestN,...)))
-  // where
-  //   ThrV:  The threads local to an MMA. layout<0>(ThrLayoutVMNK): ThrV -> thread_idx
-  //   ThrM:  The threads tiled in M.      layout<1>(ThrLayoutVMNK): ThrM -> thread_idx
-  //   ThrN:  The threads tiled in N.      layout<2>(ThrLayoutVMNK): ThrN -> thread_idx
-  //   FrgV:  The values local to an MMA.
-  //   RestM: The values tiled in M.
-  //   RestN: The values tiled in N.
-  template <class CTensor>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  thrfrg_C(CTensor&& ctensor) const
-  {
-    CUTE_STATIC_ASSERT_V(rank(ctensor) >= Int<2>{});
-    // Reorder the tensor for the TiledAtom
-    auto t_tile = make_tile(permutation_mnk<0>(),
-                            permutation_mnk<1>());
-    auto t_tensor = logical_divide(ctensor, t_tile);                 // (PermM,PermN)
-
-    // Tile the tensor for the Atom
-    auto a_tile = make_tile(make_layout(size<0>(AtomShape_MNK{})),
-                            make_layout(size<1>(AtomShape_MNK{})));
-    auto a_tensor = zipped_divide(t_tensor, a_tile);                 // ((AtomM,AtomN),(RestM,RestN))
-
-    // Transform the Atom mode from (M,K) to (Thr,Val)
-    auto tv_tensor = a_tensor.compose(AtomLayoutC_TV{},_);           // ((ThrV,FrgV),(RestM,RestN))
-
-    // Tile the tensor for the C-threads
-    auto thr_tile = make_tile(_,
-                              make_tile(make_layout(size<1>(thr_layout_vmnk_)),
-                                        make_layout(size<2>(thr_layout_vmnk_))));
-    auto thr_tensor = zipped_divide(tv_tensor, thr_tile);            // ((ThrV,(ThrM,ThrN)),(FrgV,(RestM,RestN)))
-
-    return thr_tensor;
-  }
-
-  // Tile a tensor or a layout from shape
-  //   (M,K,...)
-  // to shape
-  //   ((ThrV,(ThrM,ThrK)),(FrgV,(RestM,RestK,...)))
-  // where
-  //   ThrV: The threads local to an MMA. layout<0>(ThrLayoutVMNK): ThrV -> thread_idx
-  //   ThrM: The threads tiled in M.      layout<1>(ThrLayoutVMNK): ThrM -> thread_idx
-  //   ThrK: The threads tiled in K.      layout<3>(ThrLayoutVMNK): ThrK -> thread_idx
-  //   FrgV:  The values local to an MMA.
-  //   RestM: The values tiled in M.
-  //   RestK: The values tiled in K.
-  template <class ATensor>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  thrfrg_A(ATensor&& atensor) const
-  {
-    CUTE_STATIC_ASSERT_V(rank(atensor) >= Int<2>{});
-    // Reorder the tensor for the TiledAtom
-    auto t_tile = make_tile(permutation_mnk<0>(),
-                            permutation_mnk<2>());
-    auto t_tensor = logical_divide(atensor, t_tile);                 // (PermM,PermK)
-
-    // Tile the tensor for the Atom
-    auto a_tile = make_tile(make_layout(size<0>(AtomShape_MNK{})),
-                            make_layout(size<2>(AtomShape_MNK{})));
-    auto a_tensor = zipped_divide(t_tensor, a_tile);                 // ((AtomM,AtomK),(RestM,RestK))
-
-    // Transform the Atom mode from (M,K) to (Thr,Val)
-    auto tv_tensor = a_tensor.compose(AtomLayoutA_TV{},_);           // ((ThrV,FrgV),(RestM,RestK))
-
-    // Tile the tensor for the Thread
-    auto thr_tile = make_tile(_,
-                              make_tile(make_layout(size<1>(thr_layout_vmnk_)),
-                                        make_layout(size<3>(thr_layout_vmnk_))));
-    auto thr_tensor = zipped_divide(tv_tensor, thr_tile);            // ((ThrV,(ThrM,ThrK)),(FrgV,(RestM,RestK)))
-
-    return thr_tensor;
-  }
-
-  // Tile a tensor or a layout from shape
-  //   (N,K,...)
-  // to shape
-  //   ((ThrV,(ThrN,ThrK)),(FrgV,(RestN,RestK,...)))
-  // where
-  //   ThrV: The threads local to an MMA. layout<0>(ThrLayoutVMNK): ThrV -> thread_idx
-  //   ThrN: The threads tiled in N.      layout<2>(ThrLayoutVMNK): ThrN -> thread_idx
-  //   ThrK: The threads tiled in K.      layout<3>(ThrLayoutVMNK): ThrK -> thread_idx
-  //   FrgV:  The values local to an MMA.
-  //   RestN: The values tiled in N.
-  //   RestK: The values tiled in K.
-  template <class BTensor>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  thrfrg_B(BTensor&& btensor) const
-  {
-    CUTE_STATIC_ASSERT_V(rank(btensor) >= Int<2>{});
-    // Reorder the tensor for the TiledAtom
-    auto t_tile = make_tile(permutation_mnk<1>(),
-                            permutation_mnk<2>());
-    auto t_tensor = logical_divide(btensor, t_tile);                 // (PermN,PermK)
-
-    // Tile the tensor for the Atom
-    auto b_tile = make_tile(make_layout(size<1>(AtomShape_MNK{})),
-                            make_layout(size<2>(AtomShape_MNK{})));
-    auto b_tensor = zipped_divide(t_tensor, b_tile);                 // ((AtomN,AtomK),(RestN,RestK))
-
-    // Transform the Atom mode from (M,K) to (Thr,Val)
-    auto tv_tensor = b_tensor.compose(AtomLayoutB_TV{},_);           // ((ThrV,FrgV),(RestN,RestK))
-
-    // Tile the tensor for the Thread
-    auto thr_tile = make_tile(_,
-                              make_tile(make_layout(size<2>(thr_layout_vmnk_)),
-                                        make_layout(size<3>(thr_layout_vmnk_))));
-    auto thr_tensor = zipped_divide(tv_tensor, thr_tile);            // ((ThrV,(ThrN,ThrK)),(FrgV,(RestN,RestK)))
-
-    return thr_tensor;
-  }
-
-  template <class ThrIdx,
-            __CUTE_REQUIRES(is_integral<ThrIdx>::value)>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  get_slice(ThrIdx const& thr_idx) const
-  {
-    auto thr_vmnk = thr_layout_vmnk_.get_flat_coord(thr_idx);
-    return ThrMMA<TiledMMA, decltype(thr_vmnk)>{*this, thr_vmnk};
-  }
-
-  template <class ThrIdx,
-            __CUTE_REQUIRES(is_integral<ThrIdx>::value)>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  get_thread_slice(ThrIdx const& thr_idx) const
-  {
-    return get_slice(thr_idx);
-  }
-
-  //
-  // Utility for printing and visualization
-  //
-
-  // The permutation applied to the MNK-mode data
-  template <int I>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  permutation_mnk() const {
-    static_assert(0 <= I && I < 3);
-    auto perm = get<I>(PermutationMNK{});
-    return conditional_return(is_underscore<decltype(perm)>{}, size<I>(AtomShape_MNK{}) * size<I+1>(get_thr_layout_vmnk()), perm);
-  }
-
-  // The size of the MNK-mode
-  template <int I>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  tile_size_mnk() const {
-    static_assert(0 <= I && I < 3);
-    return size(permutation_mnk<I>());
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  auto
-  get_layoutC_MN() const
-  {
-    // (M,N) -> (M,N)
-    auto ref_C = make_layout(make_shape(tile_size_mnk<0>(), tile_size_mnk<1>()));
-    // (cthrid,val) -> (M,N)
-    auto layoutC_TV = thrfrg_C(ref_C);
-    // (M,N) -> (cthrid,frg)
-    auto layoutC_MN = right_inverse(layoutC_TV).with_shape(shape(ref_C));
-
-    // cthrid = (v,m,n) -> thr_idx
-    auto thrID_C = thr_layout_vmnk_(_,_,_,Int<0>{});
-
-    return cute::make_tuple(layoutC_MN, thrID_C);
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  auto
-  get_layoutC_TV() const
-  {
-    // (M,N) -> (M,N)
-    auto ref_C = make_layout(make_shape(tile_size_mnk<0>(), tile_size_mnk<1>()));
-    // (cthrid,val) -> (M,N)
-    auto layoutC_TV = thrfrg_C(ref_C);
-
-    // thr_idx -> (ThrV,ThrM,ThrN,ThrK)
-    auto thridx_2_thrid = right_inverse(thr_layout_vmnk_);
-
-    // (thr_idx,val) -> (M,N)
-    return layoutC_TV.compose(thridx_2_thrid, _);
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  auto
-  get_layoutA_MK() const
-  {
-    // (M,K) -> (M,K)
-    auto ref_A = make_layout(make_shape(tile_size_mnk<0>(), tile_size_mnk<2>()));
-    // (athrid,val) -> (M,K)
-    auto layoutA_TV = thrfrg_A(ref_A);
-    // (M,K) -> (athrid,frg)
-    auto layoutA_MK = right_inverse(layoutA_TV).with_shape(shape(ref_A));
-
-    // athrid = (v,m,k) -> thr_idx
-    auto thrID_A = thr_layout_vmnk_(_,_,Int<0>{},_);
-
-    return cute::make_tuple(layoutA_MK, thrID_A);
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  auto
-  get_layoutA_TV() const
-  {
-    // (M,K) -> (M,K)
-    auto ref_A = make_layout(make_shape(tile_size_mnk<0>(), tile_size_mnk<2>()));
-    // (athrid,val) -> (M,K)
-    auto layoutA_TV = thrfrg_A(ref_A);
-
-    // (ThrV,(ThrM,ThrK)) -> (ThrV,(ThrM,ThrN,ThrK))
-    auto atile = make_tile(_,
-                           make_tile(make_layout(make_shape (size<1>(thr_layout_vmnk_), size<2>(thr_layout_vmnk_)),
-                                                 make_stride(               Int<1>{} ,                Int<0>{} )),
-                                     _));
-
-    // thr_idx -> (ThrV,ThrM,ThrN,ThrK)
-    auto thridx_2_thrid = right_inverse(thr_layout_vmnk_);
-
-    // (thr_idx,val) -> (M,K)
-    return thrfrg_A(ref_A).compose(atile, _).compose(thridx_2_thrid, _);
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  auto
-  get_layoutB_NK() const
-  {
-    // (N,K) -> (N,K)
-    auto ref_B = make_layout(make_shape(tile_size_mnk<1>(), tile_size_mnk<2>()));
-    // (bthrid,val) -> (N,K)
-    auto layoutB_TV = thrfrg_B(ref_B);
-    // (N,K) -> (bthrid,frg)
-    auto layoutB_NK = right_inverse(layoutB_TV).with_shape(shape(ref_B));
-
-    // bthrid = (v,n,k) -> thr_idx
-    auto thrID_B = thr_layout_vmnk_(_,Int<0>{},_,_);
-
-    return cute::make_tuple(layoutB_NK, thrID_B);
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  auto
-  get_layoutB_TV() const
-  {
-    // (N,K) -> (N,K)
-    auto ref_B = make_layout(make_shape(tile_size_mnk<1>(), tile_size_mnk<2>()));
-    // (bthrid,val) -> (N,K)
-    auto layoutB_TV = thrfrg_B(ref_B);
-
-    // (ThrV,(ThrN,ThrK)) -> (ThrV,(ThrM,ThrN,ThrK))
-    auto btile = make_tile(_,
-                           make_tile(make_layout(make_shape (size<1>(thr_layout_vmnk_), size<2>(thr_layout_vmnk_)),
-                                                 make_stride(               Int<0>{} ,                Int<1>{} )),
-                                     _));
-
-    // thr_idx -> (ThrV,ThrM,ThrN,ThrK)
-    auto thridx_2_thrid = right_inverse(thr_layout_vmnk_);
-
-    // (thr_idx,val) -> (N,K)
-    return thrfrg_B(ref_B).compose(btile, _).compose(thridx_2_thrid, _);
-  }
-};
-
-template <class TiledMMA, class ThrVMNK>
-struct ThrMMA : TiledMMA
-{
-  ThrVMNK thr_vmnk_;
-
-  template <class CTensor>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  partition_C(CTensor&& ctensor) const
-  {
-    auto thr_tensor = make_tensor(static_cast<CTensor&&>(ctensor).data(), this->thrfrg_C(ctensor.layout()));
-
-    auto thr_vmn = make_coord(get<0>(thr_vmnk_), make_coord(get<1>(thr_vmnk_), get<2>(thr_vmnk_)));
-    return thr_tensor(thr_vmn, make_coord(_, repeat<rank<1,1>(thr_tensor)>(_)));
-  }
-
-  template <class ATensor>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  partition_A(ATensor&& atensor) const
-  {
-    auto thr_tensor = make_tensor(static_cast<ATensor&&>(atensor).data(), this->thrfrg_A(atensor.layout()));
-
-    auto thr_vmk = make_coord(get<0>(thr_vmnk_), make_coord(get<1>(thr_vmnk_), get<3>(thr_vmnk_)));
-    return thr_tensor(thr_vmk, make_coord(_, repeat<rank<1,1>(thr_tensor)>(_)));
-  }
-
-  template <class BTensor>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  partition_B(BTensor&& btensor) const
-  {
-    auto thr_tensor = make_tensor(static_cast<BTensor&&>(btensor).data(), this->thrfrg_B(btensor.layout()));
-
-    auto thr_vnk = make_coord(get<0>(thr_vmnk_), make_coord(get<2>(thr_vmnk_), get<3>(thr_vmnk_)));
-    return thr_tensor(thr_vnk, make_coord(_, repeat<rank<1,1>(thr_tensor)>(_)));
-  }
-
-  template <class CTensor>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  partition_fragment_C(CTensor&& ctensor) const
-  {
-    return TiledMMA::make_fragment_C(partition_C(ctensor));
-  }
-
-  template <class ATensor>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  partition_fragment_A(ATensor&& atensor) const
-  {
-    return TiledMMA::make_fragment_A(partition_A(atensor));
-  }
-
-  template <class BTensor>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  partition_fragment_B(BTensor&& btensor) const
-  {
-    return TiledMMA::make_fragment_B(partition_B(btensor));
-  }
-};
-
-//
-// These tile the MMA_Atom as a whole
-//
-
-template <class MMA_Op,
-          class MMAThrLayout = Layout<Shape<_1,_1,_1>>,
-          class Permutations = Tile<Underscore,Underscore,Underscore>>
-CUTE_HOST_DEVICE constexpr
-auto
-make_tiled_mma(MMA_Atom<MMA_Op> const& mma_atom,
-               MMAThrLayout     const& thr_layout   = {},
-               Permutations     const& permutations = {})
-{
-  auto thr_layout_mnk  = append<3>(thr_layout, Layout<_1,_0>{});
-  auto permutation_mnk = append<3>(permutations, _);
-
-  return TiledMMA<MMA_Atom<MMA_Op>,
-                  decltype(thr_layout_mnk),
-                  decltype(permutation_mnk)>{mma_atom, thr_layout_mnk};
-}
-
-template <class MMA_Op,
-          class MMAThrLayout = Layout<Shape<_1,_1,_1>>,
-          class Permutations = Tile<Underscore,Underscore,Underscore>>
-CUTE_HOST_DEVICE constexpr
-auto
-make_tiled_mma(MMA_Op       const&,
-               MMAThrLayout const& thr_layout   = {},
-               Permutations const& permutations = {})
-{
-  // Attempt to wrap in an MMA_Atom<> and forward
-  return make_tiled_mma(MMA_Atom<MMA_Op>{}, thr_layout, permutations);
-}
-
-//
-// partition_fragment_C -- static context
-//
-
-template <class... Args, class Shape_MN>
-CUTE_HOST_DEVICE constexpr
-auto
-partition_shape_C(TiledMMA<Args...> const& mma, Shape_MN const& shape_MN)
-{
-  constexpr int R = rank_v<Shape_MN>;
-  static_assert(R >= 2, "Must have at least rank-2");
-  auto atomMNK = typename TiledMMA<Args...>::AtomShape_MNK{};
-  auto thrVMNK = typename TiledMMA<Args...>::ThrLayoutVMNK{};
-  auto V = shape<1>(typename TiledMMA<Args...>::AtomLayoutC_TV{});
-  auto M = shape_div(size<0>(shape_MN), size<0>(atomMNK) * size<1>(thrVMNK));
-  auto N = shape_div(size<1>(shape_MN), size<1>(atomMNK) * size<2>(thrVMNK));
-  return cute::tuple_cat(make_shape(V,M,N), take<2,R>(shape_MN));
-}
-
-template <class... Args, class Shape_MN>
-CUTE_HOST_DEVICE constexpr
-auto
-partition_fragment_C(TiledMMA<Args...> const& mma, Shape_MN const& shapeMN)
-{
-  return make_tensor<typename TiledMMA<Args...>::FrgTypeC>(partition_shape_C(mma, shapeMN));
-}
-
-// partition_fragment_A and partition_fragment_B often depend on the
-//   layout of A and B and/or the thread_idx that is requesting the partition.
-// For these reasons, they should not be used in a static context.
-// See TiledMMA::get_slice(thr_idx).partition_fragment_A(tensorA) instead.
-
-template <class... Args, class Shape_MK>
-CUTE_HOST_DEVICE constexpr
-auto
-partition_shape_A(TiledMMA<Args...> const& mma, Shape_MK const& shape_MK)
-{
-  constexpr int R = rank_v<Shape_MK>;
-  static_assert(R >= 2, "Must have at least rank-2");
-  auto atomMNK = typename TiledMMA<Args...>::AtomShape_MNK{};
-  auto thrVMNK = typename TiledMMA<Args...>::ThrLayoutVMNK{};
-  auto V = shape<1>(typename TiledMMA<Args...>::AtomLayoutA_TV{});
-  auto M = shape_div(size<0>(shape_MK), size<0>(atomMNK) * size<1>(thrVMNK));
-  auto K = shape_div(size<1>(shape_MK), size<2>(atomMNK) * size<3>(thrVMNK));
-  return cute::tuple_cat(make_shape(V,M,K), take<2,R>(shape_MK));
-}
-
-template <class... Args, class Shape_NK>
-CUTE_HOST_DEVICE constexpr
-auto
-partition_shape_B(TiledMMA<Args...> const& mma, Shape_NK const& shape_NK)
-{
-  constexpr int R = rank_v<Shape_NK>;
-  static_assert(R >= 2, "Must have at least rank-2");
-  auto atomMNK = typename TiledMMA<Args...>::AtomShape_MNK{};
-  auto thrVMNK = typename TiledMMA<Args...>::ThrLayoutVMNK{};
-  auto V = shape<1>(typename TiledMMA<Args...>::AtomLayoutB_TV{});
-  auto N = shape_div(size<0>(shape_NK), size<1>(atomMNK) * size<2>(thrVMNK));
-  auto K = shape_div(size<1>(shape_NK), size<2>(atomMNK) * size<3>(thrVMNK));
-  return cute::tuple_cat(make_shape(V,N,K), take<2,R>(shape_NK));
-}
-
-//
-// Size
-//
-
-template <int I, class... Args>
-CUTE_HOST_DEVICE constexpr
-auto
-tile_size(TiledMMA<Args...> const& mma)
-{
-  return mma.template tile_size_mnk<I>();
-}
-
-template <class... Args>
-CUTE_HOST_DEVICE constexpr
-auto
-tile_shape(TiledMMA<Args...> const& mma)
-{
-  return make_shape(tile_size<0>(mma), tile_size<1>(mma), tile_size<2>(mma));
-}
-
-// Deprecate?
-template <int... I, class... Args>
-CUTE_HOST_DEVICE constexpr
-auto
-size(TiledMMA<Args...> const& mma)
-{
-  return size<I...>(mma.get_thr_layout_vmnk());
-}
-
-// Alias
-template <int... I, class... Args>
-CUTE_HOST_DEVICE constexpr
-auto
-thr_size(TiledMMA<Args...> const& mma)
-{
-  return size<I...>(mma.get_thr_layout_vmnk());
-}
-
-//
-// Display utilities
-//
-
-template <class... Args>
-CUTE_HOST_DEVICE
-void
-print(MMA_Atom<MMA_Traits<Args...>> const&)
-{
-  using Atom = MMA_Atom<MMA_Traits<Args...>>;
-  print("MMA_Atom\n");
-  print("  ThrID:      "); print(typename Atom::ThrID{});      print("\n");
-  print("  Shape_MNK:  "); print(typename Atom::Shape_MNK{});  print("\n");
-  print("  LayoutA_TV: "); print(typename Atom::LayoutA_TV{}); print("\n");
-  print("  LayoutB_TV: "); print(typename Atom::LayoutB_TV{}); print("\n");
-  print("  LayoutC_TV: "); print(typename Atom::LayoutC_TV{}); print("\n");
-}
-
-template <class Atom, class TiledThr, class TiledPerm>
-CUTE_HOST_DEVICE
-void
-print(TiledMMA<Atom, TiledThr, TiledPerm> const& mma)
-{
-  print("TiledMMA\n");
-  print("  ThrLayoutVMNK:  "); print(mma.get_thr_layout_vmnk());  print("\n");
-  print("  PermutationMNK: "); print(TiledPerm{}); print("\n");
-  print(static_cast<Atom const&>(mma));
-}
-
-template <class TiledMMA, class ThrVMNK>
-CUTE_HOST_DEVICE
-void
-print(ThrMMA<TiledMMA, ThrVMNK> const& thr_mma)
-{
-  print("ThrMMA\n");
-  print("  Thr VMNK: "); print(thr_mma.thr_vmnk_); print("\n");
-  print(static_cast<TiledMMA>(thr_mma));
-}
-
-// MMA Atom to LaTeX TikZ
-template <class... Args, class TikzColorFn = TikzColor_TV>
-CUTE_HOST_DEVICE
-void
-print_latex(MMA_Atom<Args...> const& mma_atom,
-            TikzColorFn color = {})             // lambda(thr_idx,val_idx) -> tikz color string
-{
-  print_latex(make_tiled_mma(mma_atom));
-}
-
-// TiledMMA to LaTeX TikZ
-template <class... Args, class TikzColorFn = TikzColor_TV>
-CUTE_HOST_DEVICE
-void
-print_latex(TiledMMA<Args...> const& mma,
-            TikzColorFn color = {})             // lambda(thr_idx,val_idx) -> tikz color string
-{
-  auto layout_and_thrid_C = mma.get_layoutC_MN();
-  auto layoutC_MN = get<0>(layout_and_thrid_C);
-  auto thrID_C    = get<1>(layout_and_thrid_C);
-
-  auto layout_and_thrid_A = mma.get_layoutA_MK();
-  auto layoutA_MK = get<0>(layout_and_thrid_A);
-  auto thrID_A    = get<1>(layout_and_thrid_A);
-
-  auto layout_and_thrid_B = mma.get_layoutB_NK();
-  auto layoutB_NK = get<0>(layout_and_thrid_B);
-  auto thrID_B    = get<1>(layout_and_thrid_B);
-
-  print_latex_mma(layoutC_MN, thrID_C,
-                  layoutA_MK, thrID_A,
-                  layoutB_NK, thrID_B);
-}
-
-// MNK MMA Layout to LaTeX TikZ
-template <class LayoutC, class ThrIDC,
-          class LayoutA, class ThrIDA,
-          class LayoutB, class ThrIDB,
-          class TikzColorFn = TikzColor_TV>
-CUTE_HOST_DEVICE
-void
-print_latex_mma(LayoutC const& C, ThrIDC const& TC,  // (m,n) -> (tid,vid)  and  tid -> thr_idx
-                LayoutA const& A, ThrIDA const& TA,  // (m,k) -> (tid,vid)  and  tid -> thr_idx
-                LayoutB const& B, ThrIDB const& TB,  // (n,k) -> (tid,vid)  and  tid -> thr_idx
-                TikzColorFn color = {})              // lambda(thr_idx,val_idx) -> tikz color string
-{
-  CUTE_STATIC_ASSERT_V(rank(C) == Int<2>{});
-  CUTE_STATIC_ASSERT_V(rank(A) == Int<2>{});
-  CUTE_STATIC_ASSERT_V(rank(B) == Int<2>{});
-
-  assert(size<0>(A) == size<0>(C));
-  assert(size<0>(B) == size<1>(C));
-  assert(size<1>(A) == size<1>(B));
-
-  // Commented prints
-  printf("%% LayoutC: "); print(C);  printf("\n");
-  printf("%% ThrIDC : "); print(TC); printf("\n");
-  printf("%% LayoutA: "); print(A);  printf("\n");
-  printf("%% ThrIDA : "); print(TA); printf("\n");
-  printf("%% LayoutB: "); print(B);  printf("\n");
-  printf("%% ThrIDB : "); print(TB); printf("\n\n");
-  // Header
-  printf("\\documentclass[convert]{standalone}\n"
-         "\\usepackage{tikz}\n\n"
-         "\\begin{document}\n"
-         "\\begin{tikzpicture}[x={(0cm,-1cm)},y={(1cm,0cm)},every node/.style={minimum size=1cm, outer sep=0pt}]\n\n");
-
-  // C starting at 0,0
-  for (int m = 0; m < size<0>(C); ++m) {
-    for (int n = 0; n < size<1>(C); ++n) {
-      int thrid   = C(m,n) % size(TC);
-      int val_idx = C(m,n) / size(TC);
-      int thr_idx = TC(thrid);
-
-      printf("\\node[fill=%s] at (%d,%d) {\\shortstack{T%d \\\\ V%d}};\n",
-             color(thr_idx, val_idx),
-             m, n,
-             thr_idx, val_idx);
-    }
-  }
-  // Grid
-  printf("\\draw[color=black,thick,shift={(-0.5,-0.5)}] (%d,%d) grid (%d,%d);\n\n",
-         0, 0, int(size<0>(C)), int(size<1>(C)));
-
-  // A starting at 0,-size<1>(A)-1
-  for (int m = 0; m < size<0>(A); ++m) {
-    for (int k = 0; k < size<1>(A); ++k) {
-      int thrid   = A(m,k) % size(TA);
-      int val_idx = A(m,k) / size(TA);
-      int thr_idx = TA(thrid);
-
-      printf("\\node[fill=%s] at (%d,%d) {\\shortstack{T%d \\\\ V%d}};\n",
-             color(thr_idx, val_idx),
-             m, k-1-size<1>(A),
-             thr_idx, val_idx);
-    }
-  }
-  // Grid
-  printf("\\draw[color=black,thick,shift={(-0.5,-0.5)}] (%d,%d) grid (%d,%d);\n\n",
-         0, int(-size<1>(A)-1), int(size<0>(A)), -1);
-  // A labels
-  for (int m =  0, k = -1; m < size<0>(A); ++m) {
-    printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", m, k-1-size<1>(A), m);
-  }
-  for (int m = -1, k =  0; k < size<1>(A); ++k) {
-    printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", m, k-1-size<1>(A), k);
-  }
-
-  // B starting at -size<1>(B)-1,0
-  for (int n = 0; n < size<0>(B); ++n) {
-    for (int k = 0; k < size<1>(B); ++k) {
-      int thrid   = B(n,k) % size(TB);
-      int val_idx = B(n,k) / size(TB);
-      int thr_idx = TB(thrid);
-
-      printf("\\node[fill=%s] at (%d,%d) {\\shortstack{T%d \\\\ V%d}};\n",
-             color(thr_idx, val_idx),
-             k-1-size<1>(B), n,
-             thr_idx, val_idx);
-    }
-  }
-  // Grid
-  printf("\\draw[color=black,thick,shift={(-0.5,-0.5)}] (%d,%d) grid (%d,%d);\n\n",
-         int(-size<1>(B)-1), 0, -1, int(size<0>(B)));
-  // B labels
-  for (int n =  0, k = -1; n < size<0>(B); ++n) {
-    printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", k-1-size<1>(B), n, n);
-  }
-  for (int n = -1, k =  0; k < size<1>(B); ++k) {
-    printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", k-1-size<1>(B), n, k);
-  }
-
-  // Footer
-  printf("\\end{tikzpicture}\n"
-         "\\end{document}\n");
-}
-
-// MNK MMA Layout to console printer
-template <class LayoutC, class ThrIDC,
-          class LayoutA, class ThrIDA,
-          class LayoutB, class ThrIDB>
-CUTE_HOST_DEVICE
-void
-print_layout_mma(LayoutC const& C, ThrIDC const& TC,  // (m,n) -> (tid,vid)  and  tid -> thr_idx
-                 LayoutA const& A, ThrIDA const& TA,  // (m,k) -> (tid,vid)  and  tid -> thr_idx
-                 LayoutB const& B, ThrIDB const& TB)  // (n,k) -> (tid,vid)  and  tid -> thr_idx
-{
-  CUTE_STATIC_ASSERT_V(rank(C) == Int<2>{});
-  CUTE_STATIC_ASSERT_V(rank(A) == Int<2>{});
-  CUTE_STATIC_ASSERT_V(rank(B) == Int<2>{});
-
-  assert(size<0>(A) == size<0>(C));
-  assert(size<0>(B) == size<1>(C));
-  assert(size<1>(A) == size<1>(B));
-
-  int a_width = size<1>(A) * 6 + 4;
-
-  // Print out B (white-shifted) k-by-n
-  for (int k = 0; k < size<1>(B); ++k) {
-    // Header
-    printf("%*s", a_width, "");
-    for (int n = 0; n < size<0>(B); ++n) printf("+-----");
-    printf("+\n");
-    // Values
-    printf("%*s", a_width, "");
-    for (int n = 0; n < size<0>(B); ++n) printf("|T%02dV%1d", int(TB(B(n,k) % size(TB))), int(B(n,k) / size(TB)));
-    printf("|\n");
-  }
-  // Footer
-  printf("%*s", a_width, "");
-  for (int n = 0; n < size<0>(B); ++n) printf("+-----");
-  printf("+\n\n");
-
-  // Print out A m-by-k and C m-by-n
-  for (int m = 0; m < size<0>(A); ++m) {
-    // Header
-    for (int k = 0; k < size<1>(A); ++k) printf("+-----");
-    printf("+   ");
-    for (int n = 0; n < size<1>(C); ++n) printf("+-----");
-    printf("+\n");
-    // Values
-    for (int k = 0; k < size<1>(A); ++k) printf("|T%02dV%1d", int(TA(A(m,k) % size(TA))), int(A(m,k) / size(TA)));
-    printf("|   ");
-    for (int n = 0; n < size<1>(C); ++n) printf("|T%02dV%1d", int(TC(C(m,n) % size(TC))), int(C(m,n) / size(TC)));
-    printf("|\n");
-  }
-  // Footer
-  for (int k = 0; k < size<1>(A); ++k) printf("+-----");
-  printf("+   ");
-  for (int n = 0; n < size<1>(C); ++n) printf("+-----");
-  printf("+\n");
-}
-
-// MNK MMA Layout to SVG -- 8-value color coded by thread
-template <class LayoutC, class ThrIDC,
-          class LayoutA, class ThrIDA,
-          class LayoutB, class ThrIDB>
-CUTE_HOST_DEVICE
-void
-print_svg_mma(LayoutC const& C, ThrIDC const& TC,  // (m,n) -> (tid,vid)  and  tid -> thr_idx
-              LayoutA const& A, ThrIDA const& TA,  // (m,k) -> (tid,vid)  and  tid -> thr_idx
-              LayoutB const& B, ThrIDB const& TB)  // (n,k) -> (tid,vid)  and  tid -> thr_idx
-{
-  char const *color_map[8] = {"175,175,255", "175,255,175", "255,255,175",
-                              "255,175,175", "210,210,255", "210,255,210",
-                              "255,255,210", "255,210,210"};
-
-  const int cell_width = 20;
-  const int cell_height = 20;
-
-  const int page_width = (size<1>(A) + size<0>(B) + 2) * cell_width;
-  const int page_height = (size<1>(B) + size<0>(A) + 2) * cell_height;
-
-  // header
-  printf("<svg width=\"100%%\" height=\"100%%\" viewBox=\"0 0 %d %d\" "
-         "preserveAspectRatio=\"xMidYMid meet\" "
-         "xmlns=\"http://www.w3.org/2000/svg\">\n",
-         page_width, page_height);
-
-  // C
-  int c_base_x = (size<1>(A) + 2) * cell_width;
-  int c_base_y = (size<1>(B) + 2) * cell_height;
-  for (int m = 0; m < cute::size<0>(C); ++m) {
-    for (int n = 0; n < cute::size<1>(C); ++n) {
-
-      int thrid = C(m, n) % size(TC);
-      int val_idx = C(m, n) / size(TC);
-      int thr_idx = TC(thrid);
-
-      int x = n * cell_width + c_base_x;
-      int y = m * cell_height + c_base_y;
-
-      int thr_x = x + cell_width / 2;
-      int thr_y = y + cell_height / 4;
-      int val_x = x + cell_width / 2;
-      int val_y = y + cell_height * 3 / 4;
-
-      printf("<rect x=\"%d\" y=\"%d\" width=\"%d\" height=\"%d\" "
-             "fill=\"rgb(%s)\" stroke=\"black\"/>\n",
-             x, y, cell_width, cell_height, color_map[thr_idx % 8]);
-
-      printf("<text x=\"%d\" y=\"%d\" text-anchor=\"middle\" "
-             "alignment-baseline=\"central\" font-size=\"8\">T%d</text>\n",
-             thr_x, thr_y, thr_idx);
-      printf("<text x=\"%d\" y=\"%d\" text-anchor=\"middle\" "
-             "alignment-baseline=\"central\" font-size=\"8\">V%d</text>\n",
-             val_x, val_y, val_idx);
-    }
-  }
-
-  // A
-  int a_base_x = cell_width;
-  int a_base_y = (size<1>(B) + 2) * cell_height;
-  for (int m = 0; m < size<0>(A); ++m) {
-    for (int k = 0; k < size<1>(A); ++k) {
-      int thrid = A(m, k) % size(TA);
-      int val_idx = A(m, k) / size(TA);
-      int thr_idx = TA(thrid);
-
-      int x = k * cell_width + a_base_x;
-      int y = m * cell_height + a_base_y;
-
-      int thr_x = x + cell_width / 2;
-      int thr_y = y + cell_height / 4;
-      int val_x = x + cell_width / 2;
-      int val_y = y + cell_height * 3 / 4;
-
-      printf("<rect x=\"%d\" y=\"%d\" width=\"%d\" height=\"%d\" "
-             "fill=\"rgb(%s)\" stroke=\"black\" />\n",
-             x, y, cell_width, cell_height, color_map[thr_idx % 8]);
-      printf("<text x=\"%d\" y=\"%d\" text-anchor=\"middle\" "
-             "alignment-baseline=\"central\" font-size=\"8\">T%d</text>\n",
-             thr_x, thr_y, thr_idx);
-      printf("<text x=\"%d\" y=\"%d\" text-anchor=\"middle\" "
-             "alignment-baseline=\"central\" font-size=\"8\">V%d</text>\n",
-             val_x, val_y, val_idx);
-    }
-  }
-
-  // B
-  int b_base_x = (size<1>(A) + 2) * cell_width;
-  int b_base_y = cell_height;
-  for (int n = 0; n < size<0>(B); ++n) {
-    for (int k = 0; k < size<1>(B); ++k) {
-      int thrid = B(n, k) % size(TB);
-      int val_idx = B(n, k) / size(TB);
-      int thr_idx = TB(thrid);
-
-      int x = n * cell_width + b_base_x;
-      int y = k * cell_height + b_base_y;
-
-      int thr_x = x + cell_width / 2;
-      int thr_y = y + cell_height / 4;
-      int val_x = x + cell_width / 2;
-      int val_y = y + cell_height * 3 / 4;
-
-      printf("<rect x=\"%d\" y=\"%d\" width=\"%d\" height=\"%d\" "
-             "fill=\"rgb(%s)\" stroke=\"black\" />\n",
-             x, y, cell_width, cell_height, color_map[thr_idx % 8]);
-      printf("<text x=\"%d\" y=\"%d\" text-anchor=\"middle\" "
-             "alignment-baseline=\"central\" font-size=\"8\">T%d</text>\n",
-             thr_x, thr_y, thr_idx);
-      printf("<text x=\"%d\" y=\"%d\" text-anchor=\"middle\" "
-             "alignment-baseline=\"central\" font-size=\"8\">V%d</text>\n",
-             val_x, val_y, val_idx);
-    }
-  }
-
-  // A labels
-  for (int m = 0; m < size<0>(A); ++m) {
-    int x = cell_width / 2;
-    int y = m * cell_height + cell_height / 2 + a_base_y;
-    printf("<text x=\"%d\" y=\"%d\" text-anchor=\"middle\" "
-           "alignment-baseline=\"central\" font-size=\"12\">%d</text>\n",
-           x, y, m);
-  }
-  for (int k = 0; k < size<1>(A); ++k) {
-    int x = cell_width + k * cell_width + cell_width / 2;
-    int y = -cell_height / 2 + a_base_y;
-    printf("<text x=\"%d\" y=\"%d\" text-anchor=\"middle\" "
-           "alignment-baseline=\"central\" font-size=\"12\">%d</text>\n",
-           x, y, k);
-  }
-
-  // B labels
-  for (int n = 0; n < size<0>(B); ++n) {
-    int x = b_base_x + cell_width * n + cell_width / 2;
-    int y = cell_height / 2;
-    printf("<text x=\"%d\" y=\"%d\" text-anchor=\"middle\" "
-           "alignment-baseline=\"central\" font-size=\"12\">%d</text>\n",
-           x, y, n);
-  }
-  for (int k = 0; k < size<1>(B); ++k) {
-    int x = b_base_x - cell_width / 2;
-    int y = cell_height * (k + 1) + cell_height / 2;
-    printf("<text x=\"%d\" y=\"%d\" text-anchor=\"middle\" "
-           "alignment-baseline=\"central\" font-size=\"12\">%d</text>\n",
-           x, y, k);
-  }
-
-  // footer
-  printf("</svg>");
-}
-
-template <class... Args>
-CUTE_HOST_DEVICE
-void
-print_svg(MMA_Atom<Args...> const &mma_atom) {
-  print_svg(make_tiled_mma(mma_atom));
-}
-
-template <class... Args>
-CUTE_HOST_DEVICE
-void
-print_svg(TiledMMA<Args...> const &mma) {
-  auto layout_and_thrid_C = mma.get_layoutC_MN();
-  auto layoutC_MN = get<0>(layout_and_thrid_C);
-  auto thrID_C = get<1>(layout_and_thrid_C);
-
-  auto layout_and_thrid_A = mma.get_layoutA_MK();
-  auto layoutA_MK = get<0>(layout_and_thrid_A);
-  auto thrID_A = get<1>(layout_and_thrid_A);
-
-  auto layout_and_thrid_B = mma.get_layoutB_NK();
-  auto layoutB_NK = get<0>(layout_and_thrid_B);
-  auto thrID_B = get<1>(layout_and_thrid_B);
-
-  print_svg_mma(layoutC_MN, thrID_C, layoutA_MK, thrID_A, layoutB_NK, thrID_B);
-}
-
-} // namespace cute
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-#include <cute/atom/mma_traits_sm61.hpp>
-#include <cute/atom/mma_traits_sm70.hpp>
-#include <cute/atom/mma_traits_sm75.hpp>
-#include <cute/atom/mma_traits_sm80.hpp>
-#include <cute/atom/mma_traits_sm90.hpp>
-#include <cute/atom/mma_traits_sm90_gmma.hpp>
-////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cute/atom/mma_traits.hpp b/lightllm-kernel/cutlass/include/cute/atom/mma_traits.hpp
deleted file mode 100755
index 0994698a8..000000000
--- a/lightllm-kernel/cutlass/include/cute/atom/mma_traits.hpp
+++ /dev/null
@@ -1,189 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/tensor_impl.hpp>  // cute::Tensor
-#include <cute/pointer.hpp>      // cute::is_rmem
-#include <cute/arch/mma.hpp>     // cute::UniversalFMA
-#include <cute/arch/util.hpp>    // cute::detail::explode
-
-namespace cute
-{
-
-/**
- * concept MMA_Traits
- * {
- *   using ValTypeD =  // Logical A-value type
- *   using ValTypeA =  // Logical B-value type
- *   using ValTypeB =  // Logical C-value type
- *   using ValTypeC =  // Logical D-value type    (NOTE: Not used? Assumed == ValTypeD)
- *
- *   using FrgTypeA =  // A-type consumed by MMA  (if ommitted, same as ValTypeA)
- *   using FrgTypeB =  // B_type consumed by MMA  (if ommitted, same as ValTypeB)
- *   using FrgTypeC =  // C_type consumed by MMA  (if ommitted, same as ValTypeC)
- *
- *   using Shape_MNK =    // Logical MxNxK shape of the MMA
- *
- *   using ThrID     =    // Logical thread id (tid) -> tidx
- *
- *   using ALayout =      // (Logical thread id (tid), Logical value id (vid)) -> Flat MK-coord
- *   using BLayout =      // (Logical thread id (tid), Logical value id (vid)) -> Flat NK-coord
- *   using CLayout =      // (Logical thread id (tid), Logical value id (vid)) -> Flat MN-coord
- * };
- */
-
-template <class MMAOperation, class... MMAOpArgs>
-struct MMA_Traits
-{
-  static_assert(sizeof(MMAOperation) == 0, "MMA_Traits not implemented for this MMA_Operation.");
-};
-
-template <class D, class A, class B, class C>
-struct MMA_Traits<UniversalFMA<D,A,B,C>>
-{
-  using ValTypeD = D;
-  using ValTypeA = A;
-  using ValTypeB = B;
-  using ValTypeC = C;
-
-  // Logical shape of the MMA
-  using Shape_MNK = Shape<_1,_1,_1>;
-
-  // Logical thread id (tid) -> tidx
-  using ThrID   = Layout<_1>;
-
-  // (Logical thread id (tid), Logical value id (vid)) -> coord
-
-  // (tid,vid) -> (m,k)
-  using ALayout = Layout<Shape<_1,_1>>;
-  // (tid,vid) -> (n,k)
-  using BLayout = Layout<Shape<_1,_1>>;
-  // (tid,vid) -> (m,n)
-  using CLayout = Layout<Shape<_1,_1>>;
-};
-
-// Extract an MMA_Op from an MMA_Traits
-template <class MMA_Traits>
-struct MMA_Op {};
-
-template <class MMA_Op_Arg, class... Args>
-struct MMA_Op<MMA_Traits<MMA_Op_Arg, Args...>> {
-  using type = MMA_Op_Arg;
-};
-
-//
-// Generic mma_unpack for any MMA_Traits
-//
-
-template <class AnyMMATraits,
-          class TD, class DLayout,
-          class TA, class ALayout,
-          class TB, class BLayout,
-          class TC, class CLayout>
-CUTE_HOST_DEVICE constexpr
-void
-mma_unpack(AnyMMATraits        const& traits,
-           Tensor<TD, DLayout>      & D,
-           Tensor<TA, ALayout> const& A,
-           Tensor<TB, BLayout> const& B,
-           Tensor<TC, CLayout> const& C)
-{
-  static_assert(is_rmem<TD>::value, "Expected registers in MMA_Atom::call");
-  static_assert(is_rmem<TA>::value, "Expected registers in MMA_Atom::call");
-  static_assert(is_rmem<TB>::value, "Expected registers in MMA_Atom::call");
-  static_assert(is_rmem<TC>::value, "Expected registers in MMA_Atom::call");
-
-  // Register value types from the MMA_Operation register arrays
-  using MMA_Op   = typename MMA_Op<AnyMMATraits>::type;
-  using RegTypeD = typename remove_extent<typename MMA_Op::DRegisters>::type;
-  using RegTypeA = typename remove_extent<typename MMA_Op::ARegisters>::type;
-  using RegTypeB = typename remove_extent<typename MMA_Op::BRegisters>::type;
-  using RegTypeC = typename remove_extent<typename MMA_Op::CRegisters>::type;
-
-  Tensor rA = recast<RegTypeA>(A);
-  Tensor rB = recast<RegTypeB>(B);
-  Tensor rD = recast<RegTypeD>(D);
-  Tensor rC = recast<RegTypeC>(C);
-
-  constexpr int RegNumD = extent<typename MMA_Op::DRegisters>::value;
-  constexpr int RegNumA = extent<typename MMA_Op::ARegisters>::value;
-  constexpr int RegNumB = extent<typename MMA_Op::BRegisters>::value;
-  constexpr int RegNumC = extent<typename MMA_Op::CRegisters>::value;
-
-  CUTE_STATIC_ASSERT_V(size(rA) == Int<RegNumA>{});
-  CUTE_STATIC_ASSERT_V(size(rB) == Int<RegNumB>{});
-  CUTE_STATIC_ASSERT_V(size(rD) == Int<RegNumD>{});
-  CUTE_STATIC_ASSERT_V(size(rC) == Int<RegNumC>{});
-
-  detail::explode(MMA_Op::fma,
-                  rD, make_int_sequence<RegNumD>{},
-                  rA, make_int_sequence<RegNumA>{},
-                  rB, make_int_sequence<RegNumB>{},
-                  rC, make_int_sequence<RegNumC>{});
-}
-
-// Accept mutable temporaries
-template <class AnyMMATraits,
-          class TD, class DLayout,
-          class TA, class ALayout,
-          class TB, class BLayout,
-          class TC, class CLayout>
-CUTE_HOST_DEVICE constexpr
-void
-mma_unpack(AnyMMATraits        const& traits,
-           Tensor<TD, DLayout>     && D,
-           Tensor<TA, ALayout> const& A,
-           Tensor<TB, BLayout> const& B,
-           Tensor<TC, CLayout> const& C)
-{
-  mma_unpack(traits, D, A, B, C);
-}
-
-namespace detail {
-
-template <class X, class = void>
-struct FrgTypeA_or_Default { using type = typename X::ValTypeA; };
-template <class X>
-struct FrgTypeA_or_Default<X,void_t<typename X::FrgTypeA>> { using type = typename X::FrgTypeA; };
-
-template <class X, class = void>
-struct FrgTypeB_or_Default { using type = typename X::ValTypeB; };
-template <class X>
-struct FrgTypeB_or_Default<X,void_t<typename X::FrgTypeB>> { using type = typename X::FrgTypeB; };
-
-template <class X, class = void>
-struct FrgTypeC_or_Default { using type = typename X::ValTypeC; };
-template <class X>
-struct FrgTypeC_or_Default<X,void_t<typename X::FrgTypeC>> { using type = typename X::FrgTypeC; };
-
-} // end namespace detail
-
-} // namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm61.hpp b/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm61.hpp
deleted file mode 100755
index f72a63940..000000000
--- a/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm61.hpp
+++ /dev/null
@@ -1,73 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/arch/mma_sm61.hpp>
-
-#include <cute/atom/mma_traits.hpp>
-#include <cute/layout.hpp>
-
-namespace cute
-{
-
-template <>
-struct MMA_Traits<SM61_DP4A>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using Shape_MNK = Shape<_1,_1,_4>;
-  using ThrID   = Layout<_1>;
-  using ALayout = Layout<Shape<_1,_4>>;
-  using BLayout = Layout<Shape<_1,_4>>;
-  using CLayout = Layout<Shape<_1,_1>>;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <>
-struct MMA_Traits<SM61_DP2A>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int16_t;
-  using ValTypeB = int16_t;
-  using ValTypeC = int32_t;
-
-  using Shape_MNK = Shape<_1,_1,_2>;
-  using ThrID   = Layout<_1>;
-  using ALayout = Layout<Shape<_1,_2>>;
-  using BLayout = Layout<Shape<_1,_2>>;
-  using CLayout = Layout<Shape<_1,_1>>;
-};
-
-} // namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm70.hpp b/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm70.hpp
deleted file mode 100755
index f0702a961..000000000
--- a/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm70.hpp
+++ /dev/null
@@ -1,198 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/arch/mma_sm70.hpp>
-
-#include <cute/atom/mma_traits.hpp>
-#include <cute/layout.hpp>
-
-namespace cute
-{
-
-namespace {
-
-// Logical thread id to thread idx (quadpair)
-using SM70_QuadPair = Layout<Shape <_4, _2>,
-                             Stride<_1,_16>>;
-// (T8,V4) -> (M8,K4)
-using SM70_8x4_Row  = Layout<Shape <_8,_4>,
-                             Stride<_1,_8>>;
-// (T8,V4) -> (M8,K4)
-using SM70_8x4_Col  = Layout<Shape <Shape <_4,_2>,_4>,
-                             Stride<Stride<_8,_4>,_1>>;
-// (T8,V8) -> (M8,N8)
-using SM70_8x8_16b  = Layout<Shape <_8,_8>,
-                             Stride<_1,_8>>;
-// (T8,V8) -> (M8,N8)
-using SM70_8x8_32b  = Layout<Shape <Shape <_2, _2,_2>,Shape <_2,_2, _2>>,
-                             Stride<Stride<_1,_16,_4>,Stride<_8,_2,_32>>>;
-
-} 
-
-///////////////////////////////////////////////////////////////////////////////
-
-template <>
-struct MMA_Traits<SM70_8x8x4_F16F16F16F16_TN>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using Shape_MNK = Shape<_8,_8,_4>;
-  using ThrID   = SM70_QuadPair;
-  using ALayout = SM70_8x4_Row;
-  using BLayout = SM70_8x4_Row;
-  using CLayout = SM70_8x8_16b;
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-template <>
-struct MMA_Traits<SM70_8x8x4_F16F16F16F16_NT>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using Shape_MNK = Shape<_8,_8,_4>;
-  using ThrID   = SM70_QuadPair;
-  using ALayout = SM70_8x4_Col;
-  using BLayout = SM70_8x4_Col;
-  using CLayout = SM70_8x8_16b;
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-template <>
-struct MMA_Traits<SM70_8x8x4_F16F16F16F16_NN>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using Shape_MNK = Shape<_8,_8,_4>;
-  using ThrID   = SM70_QuadPair;
-  using ALayout = SM70_8x4_Col;
-  using BLayout = SM70_8x4_Row;
-  using CLayout = SM70_8x8_16b;
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-template <>
-struct MMA_Traits<SM70_8x8x4_F16F16F16F16_TT>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using Shape_MNK = Shape<_8,_8,_4>;
-  using ThrID   = SM70_QuadPair;
-  using ALayout = SM70_8x4_Row;
-  using BLayout = SM70_8x4_Col;
-  using CLayout = SM70_8x8_16b;
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-template <>
-struct MMA_Traits<SM70_8x8x4_F32F16F16F32_TN>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using Shape_MNK = Shape<_8,_8,_4>;
-  using ThrID   = SM70_QuadPair;
-  using ALayout = SM70_8x4_Row;
-  using BLayout = SM70_8x4_Row;
-  using CLayout = SM70_8x8_32b;
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-template <>
-struct MMA_Traits<SM70_8x8x4_F32F16F16F32_NT>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using Shape_MNK = Shape<_8,_8,_4>;
-  using ThrID   = SM70_QuadPair;
-  using ALayout = SM70_8x4_Col;
-  using BLayout = SM70_8x4_Col;
-  using CLayout = SM70_8x8_32b;
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-template <>
-struct MMA_Traits<SM70_8x8x4_F32F16F16F32_NN>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using Shape_MNK = Shape<_8,_8,_4>;
-  using ThrID   = SM70_QuadPair;
-  using ALayout = SM70_8x4_Col;
-  using BLayout = SM70_8x4_Row;
-  using CLayout = SM70_8x8_32b;
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-template <>
-struct MMA_Traits<SM70_8x8x4_F32F16F16F32_TT>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using Shape_MNK = Shape<_8,_8,_4>;
-  using ThrID   = SM70_QuadPair;
-  using ALayout = SM70_8x4_Row;
-  using BLayout = SM70_8x4_Col;
-  using CLayout = SM70_8x8_32b;
-};
-
-///////////////////////////////////////////////////////////////////////////////
-} // namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm75.hpp b/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm75.hpp
deleted file mode 100755
index 1d3f51961..000000000
--- a/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm75.hpp
+++ /dev/null
@@ -1,81 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/arch/mma_sm75.hpp>
-
-#include <cute/atom/mma_traits.hpp>
-#include <cute/layout.hpp>
-
-namespace cute
-{
-
-template <>
-struct MMA_Traits<SM75_16x8x8_F32F16F16F32_TN>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using Shape_MNK = Shape<_16,_8,_8>;
-  using ThrID   = Layout<_32>;
-  using ALayout = Layout<Shape <Shape < _4,_8>,Shape < _2,_2>>,
-                         Stride<Stride<_32,_1>,Stride<_16,_8>>>;
-  using BLayout = Layout<Shape <Shape < _4,_8>,_2>,
-                         Stride<Stride<_16,_1>,_8>>;
-  using CLayout = Layout<Shape <Shape < _4,_8>,Shape < _2,_2>>,
-                         Stride<Stride<_32,_1>,Stride<_16,_8>>>;
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-template <>
-struct MMA_Traits<SM75_8x8x16_S32S8S8S32_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using Shape_MNK = Shape<_8,_8,_16>;
-  using ThrID   = Layout<_32>;
-  using ALayout = Layout<Shape <Shape < _4,_8>,_4>,
-                         Stride<Stride<_32,_1>,_8>>;
-  using BLayout = Layout<Shape <Shape < _4,_8>,_4>,
-                         Stride<Stride<_32,_1>,_8>>;
-  using CLayout = Layout<Shape <Shape < _4,_8>,_2>,
-                         Stride<Stride<_16,_1>,_8>>;
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-} // namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm80.hpp b/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm80.hpp
deleted file mode 100755
index 706b10d88..000000000
--- a/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm80.hpp
+++ /dev/null
@@ -1,489 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/arch/mma_sm80.hpp>
-#include <cute/atom/mma_traits.hpp>
-#include <cute/layout.hpp>
-#include <cute/numeric/numeric_types.hpp>
-
-namespace cute
-{
-
-namespace {
-
-// (T32,V1) -> (M8,N8)
-using SM80_8x4      = Layout<Shape <Shape < _4,_8>,_1>,
-                             Stride<Stride< _8,_1>,_0>>;
-// (T32,V2) -> (M8,N8)
-using SM80_8x8_Row  = Layout<Shape <Shape < _4,_8>,_2>,
-                             Stride<Stride<_16,_1>,_8>>;
-// (T32,V4) -> (M8,N16)
-using SM80_8x16_Row = Layout<Shape <Shape < _4,_8>,_4>,
-                             Stride<Stride<_32,_1>,_8>>;
-// (T32,V4) -> (M16,N8)
-using SM80_16x8_Row = Layout<Shape <Shape < _4,_8>,Shape < _2,_2>>,
-                             Stride<Stride<_32,_1>,Stride<_16,_8>>>;
-
-}
-
-///////////////////////////////////////////////////////////////////////////////
-//////////////////////// fp16 = fp16 * fp16 + fp16 ////////////////////////////
-///////////////////////////////////////////////////////////////////////////////
-
-template <>
-struct MMA_Traits<SM80_16x8x8_F16F16F16F16_TN>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using Shape_MNK = Shape<_16,_8,_8>;
-  using ThrID   = Layout<_32>;
-  using ALayout = SM80_16x8_Row;
-  using BLayout = SM80_8x8_Row;
-  using CLayout = SM80_16x8_Row;
-};
-
-template <>
-struct MMA_Traits<SM80_16x8x16_F16F16F16F16_TN>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using Shape_MNK = Shape<_16,_8,_16>;
-  using ThrID   = Layout<_32>;
-  using ALayout = Layout<Shape <Shape < _4,_8>,Shape < _2,_2,  _2>>,
-                         Stride<Stride<_32,_1>,Stride<_16,_8,_128>>>;
-  using BLayout = Layout<Shape <Shape < _4,_8>,Shape <_2, _2>>,
-                         Stride<Stride<_16,_1>,Stride<_8,_64>>>;
-  using CLayout = SM80_16x8_Row;
-};
-
-///////////////////////////////////////////////////////////////////////////////
-//////////////////////// fp32 = fp16 * fp16 + fp32 ////////////////////////////
-///////////////////////////////////////////////////////////////////////////////
-
-template <>
-struct MMA_Traits<SM80_16x8x8_F32F16F16F32_TN>
-     : MMA_Traits<SM80_16x8x8_F16F16F16F16_TN>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-};
-
-template <>
-struct MMA_Traits<SM80_16x8x16_F32F16F16F32_TN>
-     : MMA_Traits<SM80_16x8x16_F16F16F16F16_TN>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-};
-
-///////////////////////////////////////////////////////////////////////////////
-//////////////////////// fp32 = bf16 * bf16 + fp32 ////////////////////////////
-///////////////////////////////////////////////////////////////////////////////
-
-template <>
-struct MMA_Traits<SM80_16x8x8_F32BF16BF16F32_TN>
-     : MMA_Traits<SM80_16x8x8_F16F16F16F16_TN>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-};
-
-template <>
-struct MMA_Traits<SM80_16x8x16_F32BF16BF16F32_TN>
-     : MMA_Traits<SM80_16x8x16_F16F16F16F16_TN>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-};
-
-///////////////////////////////////////////////////////////////////////////////
-//////////////////////// fp32 = tf32 * tf32 + fp32 ////////////////////////////
-///////////////////////////////////////////////////////////////////////////////
-
-template <>
-struct MMA_Traits<SM80_16x8x4_F32TF32TF32F32_TN>
-{
-  using ValTypeD = float;
-  using ValTypeA = cutlass::tfloat32_t;
-  using ValTypeB = cutlass::tfloat32_t;
-  using ValTypeC = float;
-
-  using Shape_MNK = Shape<_16,_8,_4>;
-  using ThrID   = Layout<_32>;
-  using ALayout = Layout<Shape <Shape < _4,_8>,_2>,
-                         Stride<Stride<_16,_1>,_8>>;
-  using BLayout = SM80_8x4;
-  using CLayout = SM80_16x8_Row;
-};
-
-template <>
-struct MMA_Traits<SM80_16x8x8_F32TF32TF32F32_TN>
-{
-  using ValTypeD = float;
-  using ValTypeA = cutlass::tfloat32_t;
-  using ValTypeB = cutlass::tfloat32_t;
-  using ValTypeC = float;
-
-  using Shape_MNK = Shape<_16,_8,_8>;
-  using ThrID   = Layout<_32>;
-  using ALayout = Layout<Shape <Shape < _4,_8>,Shape <_2, _2>>,
-                         Stride<Stride<_16,_1>,Stride<_8,_64>>>;
-  using BLayout = Layout<Shape <Shape <_4,_8>, _2>,
-                         Stride<Stride<_8,_1>,_32>>;
-  using CLayout = SM80_16x8_Row;
-};
-
-///////////////////////////////////////////////////////////////////////////////
-//////////////////////// fp64 = fp64 * fp64 + fp64 ////////////////////////////
-///////////////////////////////////////////////////////////////////////////////
-
-template <>
-struct MMA_Traits<SM80_8x8x4_F64F64F64F64_TN>
-{
-  using ValTypeD = double;
-  using ValTypeA = double;
-  using ValTypeB = double;
-  using ValTypeC = double;
-
-  using Shape_MNK = Shape<_8,_8,_4>;
-  using ThrID   = Layout<_32>;
-  using ALayout = SM80_8x4;
-  using BLayout = SM80_8x4;
-  using CLayout = SM80_8x8_Row;
-};
-
-// Custom complex fp64 MMA composed of 4 fp64 MMAs -- same layouts
-template <>
-struct MMA_Traits<SM80_8x8x4_C64C64C64C64_TN>
-     : MMA_Traits<SM80_8x8x4_F64F64F64F64_TN>
-{
-  using ValTypeD = complex<double>;
-  using ValTypeA = complex<double>;
-  using ValTypeB = complex<double>;
-  using ValTypeC = complex<double>;
-};
-
-// Custom complex fp64 MMA composed of 3 fp64 MMAs -- same layouts
-template <>
-struct MMA_Traits<SM80_8x8x4_GC64C64C64GC64_TN>
-     : MMA_Traits<SM80_8x8x4_F64F64F64F64_TN>
-{
-  using ValTypeD = typename SM80_8x8x4_GC64C64C64GC64_TN::GaussComplex;
-  using ValTypeA = complex<double>;
-  using ValTypeB = complex<double>;
-  using ValTypeC = typename SM80_8x8x4_GC64C64C64GC64_TN::GaussComplex;
-};
-
-///////////////////////////////////////////////////////////////////////////////
-/////////////////////////// s32 = s8 * s8 + s32 ///////////////////////////////
-///////////////////////////////////////////////////////////////////////////////
-
-template <>
-struct MMA_Traits<SM80_8x8x16_S32S8S8S32_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using Shape_MNK = Shape<_8,_8,_16>;
-  using ThrID   = Layout<_32>;
-  using ALayout = SM80_8x16_Row;
-  using BLayout = SM80_8x16_Row;
-  using CLayout = SM80_8x8_Row;
-};
-
-template <>
-struct MMA_Traits<SM80_8x8x16_S32S8S8S32_TN_SATURATE>
-     : MMA_Traits<SM80_8x8x16_S32S8S8S32_TN> {};
-
-template <>
-struct MMA_Traits<SM80_16x8x16_S32S8S8S32_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using Shape_MNK = Shape<_16,_8,_16>;
-  using ThrID   = Layout<_32>;
-  using ALayout = Layout<Shape <Shape < _4,_8>,Shape < _4,_2>>,
-                         Stride<Stride<_64,_1>,Stride<_16,_8>>>;
-  using BLayout = SM80_8x16_Row;
-  using CLayout = SM80_16x8_Row;
-};
-
-template <>
-struct MMA_Traits<SM80_16x8x16_S32S8S8S32_TN_SATURATE>
-     : MMA_Traits<SM80_16x8x16_S32S8S8S32_TN> {};
-
-template <>
-struct MMA_Traits<SM80_16x8x32_S32S8S8S32_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using Shape_MNK = Shape<_16,_8,_32>;
-  using ThrID   = Layout<_32>;
-  using ALayout = Layout<Shape <Shape < _4,_8>,Shape < _4,_2,  _2>>,
-                         Stride<Stride<_64,_1>,Stride<_16,_8,_256>>>;
-  using BLayout = Layout<Shape <Shape < _4,_8>, Shape <_4,  _2>>,
-                         Stride<Stride<_32,_1>, Stride<_8,_128>>>;
-  using CLayout = SM80_16x8_Row;
-};
-
-template <>
-struct MMA_Traits<SM80_16x8x32_S32S8S8S32_TN_SATURATE>
-     : MMA_Traits<SM80_16x8x32_S32S8S8S32_TN> {};
-
-///////////////////////////////////////////////////////////////////////////////
-/////////////////////////// s32 = s8 * u8 + s32 ///////////////////////////////
-///////////////////////////////////////////////////////////////////////////////
-
-template <>
-struct MMA_Traits<SM80_8x8x16_S32S8U8S32_TN>
-     : MMA_Traits<SM80_8x8x16_S32S8S8S32_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-};
-
-template <>
-struct MMA_Traits<SM80_8x8x16_S32S8U8S32_TN_SATURATE>
-     : MMA_Traits<SM80_8x8x16_S32S8U8S32_TN> {};
-
-template <>
-struct MMA_Traits<SM80_16x8x16_S32S8U8S32_TN>
-     : MMA_Traits<SM80_16x8x16_S32S8S8S32_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-};
-
-template <>
-struct MMA_Traits<SM80_16x8x16_S32S8U8S32_TN_SATURATE>
-     : MMA_Traits<SM80_16x8x16_S32S8U8S32_TN> {};
-
-template <>
-struct MMA_Traits<SM80_16x8x32_S32S8U8S32_TN>
-     : MMA_Traits<SM80_16x8x32_S32S8S8S32_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-};
-
-template <>
-struct MMA_Traits<SM80_16x8x32_S32S8U8S32_TN_SATURATE>
-     : MMA_Traits<SM80_16x8x32_S32S8U8S32_TN> {};
-
-///////////////////////////////////////////////////////////////////////////////
-/////////////////////////// s32 = u8 * s8 + s32 ///////////////////////////////
-///////////////////////////////////////////////////////////////////////////////
-
-template <>
-struct MMA_Traits<SM80_8x8x16_S32U8S8S32_TN>
-     : MMA_Traits<SM80_8x8x16_S32S8S8S32_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-};
-
-template <>
-struct MMA_Traits<SM80_8x8x16_S32U8S8S32_TN_SATURATE>
-     : MMA_Traits<SM80_8x8x16_S32U8S8S32_TN> {};
-
-template <>
-struct MMA_Traits<SM80_16x8x16_S32U8S8S32_TN>
-     : MMA_Traits<SM80_16x8x16_S32S8S8S32_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-};
-
-template <>
-struct MMA_Traits<SM80_16x8x16_S32U8S8S32_TN_SATURATE>
-     : MMA_Traits<SM80_16x8x16_S32U8S8S32_TN> {};
-
-template <>
-struct MMA_Traits<SM80_16x8x32_S32U8S8S32_TN>
-     : MMA_Traits<SM80_16x8x32_S32S8S8S32_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-};
-
-template <>
-struct MMA_Traits<SM80_16x8x32_S32U8S8S32_TN_SATURATE>
-     : MMA_Traits<SM80_16x8x32_S32U8S8S32_TN> {};
-
-///////////////////////////////////////////////////////////////////////////////
-/////////////////////////// s32 = u8 * u8 + s32 ///////////////////////////////
-///////////////////////////////////////////////////////////////////////////////
-
-template <>
-struct MMA_Traits<SM80_8x8x16_S32U8U8S32_TN>
-     : MMA_Traits<SM80_8x8x16_S32S8S8S32_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-};
-
-template <>
-struct MMA_Traits<SM80_8x8x16_S32U8U8S32_TN_SATURATE>
-     : MMA_Traits<SM80_8x8x16_S32U8U8S32_TN> {};
-
-template <>
-struct MMA_Traits<SM80_16x8x16_S32U8U8S32_TN>
-     : MMA_Traits<SM80_16x8x16_S32S8S8S32_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-};
-
-template <>
-struct MMA_Traits<SM80_16x8x16_S32U8U8S32_TN_SATURATE>
-     : MMA_Traits<SM80_16x8x16_S32U8U8S32_TN> {};
-
-template <>
-struct MMA_Traits<SM80_16x8x32_S32U8U8S32_TN>
-     : MMA_Traits<SM80_16x8x32_S32S8S8S32_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-};
-
-template <>
-struct MMA_Traits<SM80_16x8x32_S32U8U8S32_TN_SATURATE>
-     : MMA_Traits<SM80_16x8x32_S32U8U8S32_TN> {};
-
-///////////////////////////////////////////////////////////////////////////////
-/////////////////////////// s32 = b1 ^ b1 + s32 ///////////////////////////////
-///////////////////////////////////////////////////////////////////////////////
-
-template <>
-struct MMA_Traits<SM80_16x8x256_S32U1U1S32_TN_XORPOPC>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = cute::uint1b_t;
-  using ValTypeB = cute::uint1b_t;
-  using ValTypeC = int32_t;
-
-  using Shape_MNK = Shape<_16,_8,_256>;
-  using ThrID   = Layout<_32>;
-  using ALayout = Layout<Shape<Shape<_4,_8>,Shape<_32,_2,_2>>,
-                       Stride<Stride<_512,_1>,Stride<_16,_8,_2048>>>;
-  using BLayout = Layout<Shape<Shape <_4,_8>,Shape<_32,_2>>,
-                         Stride<Stride<_256,_1>,Stride< _8,_1024>>>;
-  using CLayout = SM80_16x8_Row;
-};
-
-template <>
-struct MMA_Traits<SM80_16x8x256_S32U1U1S32_TN_ANDPOPC>
-      :MMA_Traits<SM80_16x8x256_S32U1U1S32_TN_XORPOPC> {};
-
-template<>
-struct MMA_Traits<SM80_8x8x128_S32U1U1S32_TN_XORPOPC>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = cute::uint1b_t;
-  using ValTypeB = cute::uint1b_t;
-  using ValTypeC = int32_t;
-
-  using Shape_MNK = Shape<_8,_8,_128>;
-  using ThrID   = Layout<_32>;
-  using ALayout = Layout<Shape<Shape<_4,_8>,_32>,
-                       Stride<Stride<_256,_1>,_8>>;
-  using BLayout = Layout<Shape<Shape<_4,_8>,_32>,
-                         Stride<Stride<_256,_1>,_8>>;
-  using CLayout = SM80_8x8_Row;
-};
-
-template <>
-struct MMA_Traits<SM80_8x8x128_S32U1U1S32_TN_ANDPOPC>
-      :MMA_Traits<SM80_8x8x128_S32U1U1S32_TN_XORPOPC> {};
-
-template<>
-struct MMA_Traits<SM80_16x8x128_S32U1U1S32_TN_XORPOPC>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = cute::uint1b_t;
-  using ValTypeB = cute::uint1b_t;
-  using ValTypeC = int32_t;
-  
-  using Shape_MNK = Shape<_16,_8,_128>;
-  using ThrID   = Layout<_32>;
-  using ALayout = Layout<Shape<Shape<_4,_8>,Shape<_32,_2>>,
-                       Stride<Stride<_512,_1>,Stride<Stride<_16,_8>>>>;
-  using BLayout = Layout<Shape <Shape<_4,_8>,_32>,
-                         Stride<Stride<_256,_1>,_8>>;
-  using CLayout = SM80_16x8_Row;
-};
-
-template <>
-struct MMA_Traits<SM80_16x8x128_S32U1U1S32_TN_ANDPOPC>
-      :MMA_Traits<SM80_16x8x128_S32U1U1S32_TN_XORPOPC> {};
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90.hpp b/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90.hpp
deleted file mode 100755
index b2ced3f87..000000000
--- a/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90.hpp
+++ /dev/null
@@ -1,144 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/arch/mma_sm90.hpp>
-#include <cute/atom/mma_traits.hpp>
-
-#include <cute/layout.hpp>
-
-namespace cute {
-
-///////////////////////////////////////////////////////////////////////////////
-//////////////////////// fp64 = fp64 * fp64 + fp64 ////////////////////////////
-///////////////////////////////////////////////////////////////////////////////
-
-using SM90_16x8x4_F64F64F64F64_TN = SM90::MMA_16x8x4_F64F64F64F64_TN;
-
-template <>
-struct MMA_Traits<SM90_16x8x4_F64F64F64F64_TN>
-{
-  using ValTypeD = double;
-  using ValTypeA = double;
-  using ValTypeB = double;
-  using ValTypeC = double;
-
-  using Shape_MNK = Shape<_16,_8,_4>;
-  using ThrID =  Layout<_32>;
-  using ALayout = Layout<Shape <Shape < _4,_8>,_2>,
-                         Stride<Stride<_16,_1>,_8>>;
-  using BLayout = Layout<Shape <Shape < _4,_8>,_1>,
-                         Stride<Stride< _8,_1>,_0>>;
-  using CLayout = Layout<Shape <Shape < _4,_8>,Shape < _2,_2>>,
-                         Stride<Stride<_32,_1>,Stride<_16,_8>>>;
-};
-
-using SM90_16x8x8_F64F64F64F64_TN = SM90::MMA_16x8x8_F64F64F64F64_TN;
-
-template <>
-struct MMA_Traits<SM90_16x8x8_F64F64F64F64_TN>
-{
-  using ValTypeD = double;
-  using ValTypeA = double;
-  using ValTypeB = double;
-  using ValTypeC = double;
-
-  using Shape_MNK = Shape<_16,_8,_8>;
-  using ThrID   = Layout<_32>;
-  using ALayout = Layout<Shape <Shape < _4,_8>,Shape <_2, _2>>,
-                         Stride<Stride<_16,_1>,Stride<_8,_64>>>;
-  using BLayout = Layout<Shape <Shape < _4,_8>, _2>,
-                         Stride<Stride< _8,_1>,_32>>;
-  using CLayout = Layout<Shape <Shape < _4,_8>,Shape < _2,_2>>,
-                         Stride<Stride<_32,_1>,Stride<_16,_8>>>;
-};
-
-using SM90_16x8x16_F64F64F64F64_TN = SM90::MMA_16x8x16_F64F64F64F64_TN;
-
-template <>
-struct MMA_Traits<SM90_16x8x16_F64F64F64F64_TN>
-{
-  using ValTypeD = double;
-  using ValTypeA = double;
-  using ValTypeB = double;
-  using ValTypeC = double;
-
-  using Shape_MNK = Shape<_16,_8,_16>;
-  using ThrID   = Layout<_32>;
-  using ALayout = Layout<Shape <Shape < _4,_8>,Shape <_2, _4>>,
-                         Stride<Stride<_16,_1>,Stride<_8,_64>>>;
-  using BLayout = Layout<Shape <Shape < _4,_8>, _4>,
-                         Stride<Stride< _8,_1>,_32>>;
-  using CLayout = Layout<Shape <Shape < _4,_8>,Shape < _2,_2>>,
-                         Stride<Stride<_32,_1>,Stride<_16,_8>>>;
-};
-
-///////////////////////////////////////////////////////////////////////////////////
-//////////////////////// cfp64 = cfp64 * cfp64 + cfp64 ////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////
-
-using SM90_16x8x4_C64C64C64C64_TN  = SM90::MMA_16x8x4_C64C64C64C64_TN;
-
-template <>
-struct MMA_Traits<SM90_16x8x4_C64C64C64C64_TN>
-     : MMA_Traits<SM90_16x8x4_F64F64F64F64_TN>
-{
-  using ValTypeD = complex<double>;
-  using ValTypeA = complex<double>;
-  using ValTypeB = complex<double>;
-  using ValTypeC = complex<double>;
-};
-
-using SM90_16x8x8_C64C64C64C64_TN  = SM90::MMA_16x8x8_C64C64C64C64_TN;
-
-template <>
-struct MMA_Traits<SM90_16x8x8_C64C64C64C64_TN>
-     : MMA_Traits<SM90_16x8x8_F64F64F64F64_TN>
-{
-  using ValTypeD = complex<double>;
-  using ValTypeA = complex<double>;
-  using ValTypeB = complex<double>;
-  using ValTypeC = complex<double>;
-};
-
-using SM90_16x8x16_C64C64C64C64_TN = SM90::MMA_16x8x16_C64C64C64C64_TN;
-
-template <>
-struct MMA_Traits<SM90_16x8x16_C64C64C64C64_TN>
-     : MMA_Traits<SM90_16x8x16_F64F64F64F64_TN>
-{
-  using ValTypeD = complex<double>;
-  using ValTypeA = complex<double>;
-  using ValTypeB = complex<double>;
-  using ValTypeC = complex<double>;
-};
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90_gmma.hpp b/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90_gmma.hpp
deleted file mode 100755
index b02f5b3af..000000000
--- a/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90_gmma.hpp
+++ /dev/null
@@ -1,8999 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/pointer_flagged.hpp>            // cute::smem_ptr_flag
-#include <cute/pointer_sparse.hpp>             // cute::smem_sparse_ptr_flag
-#include <cute/swizzle.hpp>                    // cute::Swizzle
-#include <cute/tensor_impl.hpp>                // cute::Tensor
-#include <cute/arch/mma_sm90_desc.hpp>         // cute::LayoutType
-#include <cute/arch/mma_sm90_gmma.hpp>         // cute::SM90_64x8x16_F16F16F16_SS, etc
-#include <cute/atom/mma_traits.hpp>            // cute::MMA_Traits
-#include <cute/layout_composed.hpp>            // cute::ComposedLayout
-#include <cute/numeric/integral_constant.hpp>  // cute::is_static
-
-namespace cute {
-
-// Fence between the async destination accumulators of GMMA & source for their dependent use
-template <class Engine, class Layout>
-CUTE_HOST_DEVICE
-void
-warpgroup_fence_operand(Tensor<Engine, Layout>& frg) {
-  CUTE_STATIC_ASSERT(is_static<Layout>::value);
-  if constexpr (is_same_v<typename Engine::value_type, float>) {
-    auto f32_frg = recast<float>(frg);
-    CUTE_UNROLL
-    for (int i = 0; i < size(f32_frg); ++i) {
-      warpgroup_fence_operand(f32_frg(i));
-    }
-  }
-  else {
-    CUTE_STATIC_ASSERT(is_rmem<Engine>::value);
-    auto u32_frg = recast<uint32_t>(frg);
-    CUTE_UNROLL
-    for (int i = 0; i < size(u32_frg); ++i) {
-      warpgroup_fence_operand(u32_frg(i));
-    }
-  }
-}
-
-namespace SM90::GMMA {
-
-///////////////////////////////////////////
-// Common layouts for GMMA Shared Memory //
-///////////////////////////////////////////
-
-// M|N-major GMMA layouts in units of bits
-using Layout_MN_INTER_Atom_Bits = ComposedLayout<Swizzle<0,4,3>, smem_ptr_flag, Layout<Shape< _128,_8>,Stride<_1, _128>>>;
-using Layout_MN_SW32_Atom_Bits  = ComposedLayout<Swizzle<1,4,3>, smem_ptr_flag, Layout<Shape< _256,_8>,Stride<_1, _256>>>;
-using Layout_MN_SW64_Atom_Bits  = ComposedLayout<Swizzle<2,4,3>, smem_ptr_flag, Layout<Shape< _512,_8>,Stride<_1, _512>>>;
-using Layout_MN_SW128_Atom_Bits = ComposedLayout<Swizzle<3,4,3>, smem_ptr_flag, Layout<Shape<_1024,_8>,Stride<_1,_1024>>>;
-
-// K-major GMMA layouts in units of bits
-using Layout_K_INTER_Atom_Bits  = ComposedLayout<Swizzle<0,4,3>, smem_ptr_flag, Layout<Shape<_8, _128>,Stride< _128,_1>>>;
-using Layout_K_SW32_Atom_Bits   = ComposedLayout<Swizzle<1,4,3>, smem_ptr_flag, Layout<Shape<_8, _256>,Stride< _256,_1>>>;
-using Layout_K_SW64_Atom_Bits   = ComposedLayout<Swizzle<2,4,3>, smem_ptr_flag, Layout<Shape<_8, _512>,Stride< _512,_1>>>;
-using Layout_K_SW128_Atom_Bits  = ComposedLayout<Swizzle<3,4,3>, smem_ptr_flag, Layout<Shape<_8,_1024>,Stride<_1024,_1>>>;
-
-// M|N-major layouts in units of Type
-template <class Type>
-using Layout_MN_INTER_Atom = decltype(upcast<sizeof_bits<Type>::value>(Layout_MN_INTER_Atom_Bits{}));
-template <class Type>
-using Layout_MN_SW32_Atom  = decltype(upcast<sizeof_bits<Type>::value>(Layout_MN_SW32_Atom_Bits{}));
-template <class Type>
-using Layout_MN_SW64_Atom  = decltype(upcast<sizeof_bits<Type>::value>(Layout_MN_SW64_Atom_Bits{}));
-template <class Type>
-using Layout_MN_SW128_Atom = decltype(upcast<sizeof_bits<Type>::value>(Layout_MN_SW128_Atom_Bits{}));
-
-// K-major layouts in units of Type
-template <class Type>
-using Layout_K_INTER_Atom = decltype(upcast<sizeof_bits<Type>::value>(Layout_K_INTER_Atom_Bits{}));
-template <class Type>
-using Layout_K_SW32_Atom  = decltype(upcast<sizeof_bits<Type>::value>(Layout_K_SW32_Atom_Bits{}));
-template <class Type>
-using Layout_K_SW64_Atom  = decltype(upcast<sizeof_bits<Type>::value>(Layout_K_SW64_Atom_Bits{}));
-template <class Type>
-using Layout_K_SW128_Atom = decltype(upcast<sizeof_bits<Type>::value>(Layout_K_SW128_Atom_Bits{}));
-
-// With GMMA::Major param
-template <class Type, Major tnsp>
-using Layout_INTER_Atom = typename conditional<tnsp == Major::MN,
-                                               Layout_MN_INTER_Atom<Type>,
-                                               Layout_K_INTER_Atom<Type>>::type;
-template <class Type, Major tnsp>
-using Layout_SW32_Atom = typename conditional<tnsp == Major::MN,
-                                              Layout_MN_SW32_Atom<Type>,
-                                              Layout_K_SW32_Atom<Type>>::type;
-template <class Type, Major tnsp>
-using Layout_SW64_Atom = typename conditional<tnsp == Major::MN,
-                                              Layout_MN_SW64_Atom<Type>,
-                                              Layout_K_SW64_Atom<Type>>::type;
-template <class Type, Major tnsp>
-using Layout_SW128_Atom = typename conditional<tnsp == Major::MN,
-                                               Layout_MN_SW128_Atom<Type>,
-                                               Layout_K_SW128_Atom<Type>>::type;
-
-//
-// Tensor (position-dependent swizzle) to LayoutType utility
-//
-
-template <class Engine, class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-LayoutType
-layout_type(Tensor<Engine, Layout<Shape,Stride>> const&)
-{
-  static_assert(is_same<uint128_t, typename Engine::value_type>::value,
-                "Expected uint128_t type in LayoutType conversion.");
-
-  using Swizzle = get_swizzle_t<Engine>;
-  constexpr int B = Swizzle::num_bits;
-  constexpr int M = Swizzle::num_base;
-  constexpr int S = Swizzle::num_shft;
-
-  static_assert(M == 4,           "Unsupported layout swizzle");
-  static_assert(0 <= B && B <= 3, "Unsupported layout swizzle");
-  static_assert(S == 3,           "Unsupported layout swizzle");
-
-  switch (B) {
-    case 0: return LayoutType::INTERLEAVE;
-    case 1: return LayoutType::B32;
-    case 2: return LayoutType::B64;
-    case 3: return LayoutType::B128;
-  }
-  return LayoutType::INTERLEAVE;  // ERROR
-}
-
-///////////////////////////////////////////////////////////////////////////////
-// Construction method for GMMA Descriptors
-///////////////////////////////////////////////////////////////////////////////
-
-/**
-* ///////////////////////////////
-* // make_gmma_desc<Major::MN> //
-* ///////////////////////////////
-* Each GmmaDescriptor Major-MN describes a canonical layout of the form
-*
-* LayoutType::INTERLEAVE   : Swizzle<0,4,3> o smem_ptr o ((T,1,m),(8,k)):((1,T,SBO),(1T,LBO))
-* LayoutType::B32          : Swizzle<1,4,3> o smem_ptr o ((T,2,m),(8,k)):((1,T,LBO),(2T,SBO))
-* LayoutType::B64          : Swizzle<2,4,3> o smem_ptr o ((T,4,m),(8,k)):((1,T,LBO),(4T,SBO))
-* LayoutType::B128         : Swizzle<3,4,3> o smem_ptr o ((T,8,m),(8,k)):((1,T,LBO),(8T,SBO))
-*
-* where
-*   T  : sizeof(uint128_t) / sizeof(value_type)
-*   m  : integer in [1,16] corresponding to GMMA shape
-*   k  : integer in [1,32] corresponding to GMMA shape
-*   SBO: stride byte offset
-*   LBO: leading byte offset
-*
-* See GMMA::Layout_MN_XXX_Atom<value_type> for building canonical GmmaDescriptor Major-MN layouts.
-* For example,
-*   auto smem_layout = tile_to_shape(Layout_MN_SW128_Atom<value_type>{}, Shape<_128,_64>{});
-* is guaranteed to be accepted by make_gmma_desc<Major::MN> for appropriate value_type.
-*
-* //////////////////////////////
-* // make_gmma_desc<Major::K> //
-* //////////////////////////////
-* Each GmmaDescriptor Major-K describes a canonical layout of the form
-*
-* LayoutType::INTERLEAVE : Swizzle<0,4,3> o smem_ptr o ((8,m),(T,2)):((1T,SBO),(1,LBO))
-* LayoutType::B32        : Swizzle<1,4,3> o smem_ptr o ((8,m),(T,2)):((2T,SBO),(1, T ))
-* LayoutType::B64        : Swizzle<2,4,3> o smem_ptr o ((8,m),(T,2)):((4T,SBO),(1, T ))
-* LayoutType::B128       : Swizzle<3,4,3> o smem_ptr o ((8,m),(T,2)):((8T,SBO),(1, T ))
-*
-* See GMMA::Layout_K_XXX_Atom<value_type> for building canonical GmmaDescriptor Major-K layouts.
-* For example,
-*   auto smem_layout = tile_to_shape(Layout_K_SW128_Atom<value_type>{}, Shape<_128,_64>{});
-* is guaranteed to be accepted by make_gmma_desc<Major::K> for appropriate value_type.
-*/
-template <Major MajorMode, class TEngine, class TLayout>
-CUTE_HOST_DEVICE constexpr
-GmmaDescriptor
-make_gmma_desc(Tensor<TEngine,TLayout> const& tensor)
-{
-  static_assert(is_smem<TEngine>::value, "GMMA Descriptors can only be constructed on smem.");
-  static_assert(TLayout::rank == 2, "GMMA Descriptors can only be constructed on rank-2 tensors.");
-  using value_type = typename TEngine::value_type;
-
-  Tensor u128_tensor = recast<uint128_t const>(tensor);
-
-  // Result
-  GmmaDescriptor desc;
-
-  // Layout type
-  constexpr LayoutType LAYOUT_TYPE = layout_type(u128_tensor);
-  desc.bitfield.layout_type_ = uint8_t(LAYOUT_TYPE);
-
-  // Start address (4LSB not included)
-  uint32_t start_address = cast_smem_ptr_to_uint(raw_pointer_cast(u128_tensor.data()));
-  desc.bitfield.start_address_ = static_cast<uint16_t>(start_address >> 4);
-
-  constexpr uint8_t base_offset = 0;
-  desc.bitfield.base_offset_ = base_offset;
-
-  // LayoutType meta
-  constexpr int W = LAYOUT_TYPE == LayoutType::INTERLEAVE ? 1 :
-                    LAYOUT_TYPE == LayoutType::B32        ? 2 :
-                    LAYOUT_TYPE == LayoutType::B64        ? 4 :
-                    LAYOUT_TYPE == LayoutType::B128       ? 8 : -1;
-
-  if constexpr (MajorMode == Major::MN)
-  {
-    /* In units of uint128_t, each GmmaDescriptor Major-MN describes a canonical layout of the form
-     *
-     * LayoutType::INTERLEAVE         : Swizzle<0,4,3> o smem_ptr o ((1,n),(8,k)):((X,SBO),(1,LBO))
-     * LayoutType::B32                : Swizzle<1,4,3> o smem_ptr o ((2,n),(8,k)):((1,LBO),(2,SBO))
-     * LayoutType::B64                : Swizzle<2,4,3> o smem_ptr o ((4,n),(8,k)):((1,LBO),(4,SBO))
-     * LayoutType::B128               : Swizzle<3,4,3> o smem_ptr o ((8,n),(8,k)):((1,LBO),(8,SBO))
-     */
-    static_assert(size<1>(u128_tensor) == Int<(256 / cute::sizeof_bits<value_type>::value)>{} || // A and B in dense MMA
-                  size<1>(u128_tensor) == Int<(128 / cute::sizeof_bits<value_type>::value)>{} || // A in sparse MMA
-                  size<1>(u128_tensor) == Int<(512 / cute::sizeof_bits<value_type>::value)>{},   // B in sparse MMA
-                         "Not a canonical GMMA_MN Layout: Expected K-size 256/sizeof_bits<T> for dense or (128|512)/sizeof_bits<T> for sparse.");
-
-    // Construct the canonical GMMA T Layout with shape ((W,n),(8,2))
-    Layout canonical_layout = logical_divide(layout(u128_tensor), make_tile(Layout<Int<W>,_1>{}, Layout<Int<8>,_1>{}));
-
-    // Check ranks of canonical
-    CUTE_STATIC_ASSERT_V(rank<0>(canonical_layout) == Int<2>{}, "Not a canonical GMMA_MN Layout: No flat offset mode");
-    CUTE_STATIC_ASSERT_V(rank<1>(canonical_layout) == Int<2>{}, "Not a canonical GMMA_MN Layout: No flat offset mode");
-    // Check canonical mode strides
-    constexpr uint32_t stride_00 = stride<0,0>(canonical_layout);
-    constexpr uint32_t expected_stride_00 = LAYOUT_TYPE == LayoutType::INTERLEAVE ? stride<0,0>(canonical_layout) : 1;
-    static_assert(stride_00 == expected_stride_00, "Not a canonical GMMA_MN Layout: Expected stride failure.");
-    constexpr uint32_t stride_10 = stride<1,0>(canonical_layout);
-    constexpr uint32_t expected_stride_10 = W;
-    static_assert(stride_10 == expected_stride_10, "Not a canonical GMMA_MN Layout: Expected stride failure.");
-
-    // stride dimension byte offset and leading dimension byte offset (4LSB not included == uint128_t units)
-    constexpr uint32_t stride_01 = stride<0,1>(canonical_layout);
-    constexpr uint32_t stride_11 = stride<1,1>(canonical_layout);
-
-    desc.bitfield.stride_byte_offset_  = (LAYOUT_TYPE == LayoutType::INTERLEAVE) ? stride_01 : stride_11;
-    desc.bitfield.leading_byte_offset_ = (LAYOUT_TYPE == LayoutType::INTERLEAVE) ? stride_11 : stride_01;
-  }
-  else if constexpr (MajorMode == Major::K)
-  {
-    /* In units of uint128_t, each GmmaDescriptor Major-K describes a canonical layout of the form
-     *
-     * LayoutType::INTERLEAVE    : Swizzle<0,4,3> o smem_ptr o ((8,n),2):((1,SBO),LBO)
-     * LayoutType::B32           : Swizzle<1,4,3> o smem_ptr o ((8,n),2):((2,SBO),1)
-     * LayoutType::B64           : Swizzle<2,4,3> o smem_ptr o ((8,n),2):((4,SBO),1)
-     * LayoutType::B128          : Swizzle<3,4,3> o smem_ptr o ((8,n),2):((8,SBO),1)
-     */
-    CUTE_STATIC_ASSERT_V(size<0>(u128_tensor) % Int<8>{} == Int<0>{},          // N|M size
-                         "Not a canonical GMMA_K Layout: Expected MN-size multiple of 8.");
-    CUTE_STATIC_ASSERT_V(size<1>(u128_tensor) == Int<2>{} || size<1>(u128_tensor) == Int<4>{},      // K   size
-                         "Not a canonical GMMA_K Layout: Expected K-size 2 for dense or 4 for sparse (in units of uint128_t).");
-
-    // Construct the canonical GMMA N Layout with shape ((8,n),(2,1))
-    Layout canonical_layout = logical_divide(layout(u128_tensor), make_tile(Layout<_8,_1>{}, Layout<_2,_1>{}));
-
-    // Check ranks of canonical
-    CUTE_STATIC_ASSERT_V(rank<0>(canonical_layout) == Int<2>{}, "Not a canonical GMMA_K Layout: No flat offset mode");
-    CUTE_STATIC_ASSERT_V(rank<1>(canonical_layout) == Int<2>{}, "Not a canonical GMMA_K Layout: No flat offset mode");
-    // Check canonical mode strides
-    constexpr uint32_t stride_00 = stride<0,0>(canonical_layout);
-    constexpr uint32_t expected_stride_00 = W;
-    static_assert(stride_00 == expected_stride_00, "Not a canonical GMMA_K Layout: Expected stride failure.");
-    constexpr uint32_t stride_10 = stride<1,0>(canonical_layout);
-    constexpr uint32_t expected_stride_10 = (LAYOUT_TYPE == LayoutType::INTERLEAVE) ? stride<1,0>(canonical_layout) : 1;
-    static_assert(stride_10 == expected_stride_10, "Not a canonical GMMA_K Layout: Expected stride failure.");
-
-    // stride dimension byte offset and leading dimension byte offset (4LSB not included == uint128_t units)
-    constexpr uint32_t stride_01 = stride<0,1>(canonical_layout);
-
-    desc.bitfield.stride_byte_offset_  = stride_01;
-    desc.bitfield.leading_byte_offset_ = stride_10;
-  } else {
-    static_assert(MajorMode != Major::MN && MajorMode != Major::K, "Unrecognized MajorMode!");
-  }
-
-#if 0
-  // DEBUG and SANITY
-  assert((start_address & 0b0000001111) == 0); // Must be 16B aligned (4LSB are 0) no negotiation
-  assert((start_address & 0b1110000000) == 0); // Assert base_offset is 0, generalize later
-  if (thread0()) {
-    print("smem_desc input     tensor: "); print(tensor.data()); print(" o "); print(tensor.layout()); print("\n");
-    print("smem_desc uint128_t tensor: "); print(u128_tensor.data()); print(" o "); print(u128_tensor.layout()); print("\n");
-    //print("     desc canonical layout: "); print(canonical_layout); print("\n");
-    print(desc);
-  }
-#endif
-
-  return desc;
-}
-
-///////////////////////////////////////////////////////////////////////////////
-// Higher level GMMA Descriptor utilities
-///////////////////////////////////////////////////////////////////////////////
-
-struct DescriptorIterator
-{
-  using reference    = GmmaDescriptor;
-  using element_type = GmmaDescriptor;
-  using value_type   = GmmaDescriptor;
-
-  GmmaDescriptor desc_;
-
-  // Dereference returns the GmmaDescriptor
-  CUTE_HOST_DEVICE constexpr
-  reference operator*() const { return desc_; }
-
-  // Advance and return a new GmmaDescriptor
-  template <class Index>
-  CUTE_HOST_DEVICE constexpr
-  reference operator[](Index const& i) const { return *(*this + i); }
-
-  // Return an advanced iterator
-  template <class Index>
-  CUTE_HOST_DEVICE constexpr
-  DescriptorIterator operator+(Index const& offset) const
-  {
-    return { GmmaDescriptor{desc_ + uint64_t(offset)} };
-  }
-};
-
-template <class T>
-CUTE_HOST_DEVICE constexpr
-GmmaDescriptor
-raw_pointer_cast(DescriptorIterator const& ptr) {
-  return ptr.desc_;
-}
-
-// Recast a DescriptorIterator Tensor to uint64_t, it's RegType in mma_unpack
-template <class NewT>
-CUTE_HOST_DEVICE constexpr
-DescriptorIterator
-recast_ptr(DescriptorIterator const& iter) {
-  static_assert(is_same<NewT, uint64_t>::value, "Can only cast GmmaDescriptorIterator to uint64_t.");
-  return iter;  // Do nothing, it will still dereference to GmmaDescriptor and decay to uint64_t
-}
-
-CUTE_HOST_DEVICE void
-print(DescriptorIterator) {
-  printf("GMMA::DescriptorIterator");
-}
-
-// The GMMA Traits below have custom fragment type flags for their smem desc tensors.
-// These flags specialize a MakeTensor customization point to correctly make the fragment that is desired.
-template <Major>
-struct smem_desc : DescriptorIterator {};
-
-} // end namespace SM90::GMMA
-
-// Customization point for creating a GMMA::smem_desc Tensor
-template <SM90::GMMA::Major MajorMode>
-struct MakeTensor<SM90::GMMA::smem_desc<MajorMode>>
-{
-  template <class TEngine, class TLayout>
-  CUTE_HOST_DEVICE constexpr auto
-  operator()(Tensor<TEngine,TLayout> const& smem_tensor)
-  {
-    static_assert(is_smem<TEngine>::value, "Expected SMEM Tensor to construct a GMMA Desc Tensor");
-    return make_tensor(SM90::GMMA::DescriptorIterator{SM90::GMMA::make_gmma_desc<MajorMode>(tensor<0>(smem_tensor))},
-                       replace<0>(recast<uint128_t const>(smem_tensor).layout(), Layout<_1,_0>{}));
-  }
-};
-
-///////////////////////////////////////////////////////////////////////////////
-//////////////////////////// MMA_TRAITS ///////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////
-
-namespace SM90::GMMA {
-
-//
-// Specialized mma_unpack implementation for SM90 GMMA instructions
-//
-
-template <class MMA_Op, class... MMA_Args,
-          class TD, class DLayout,
-          class TA, class ALayout,
-          class TB, class BLayout,
-          class TC, class CLayout>
-CUTE_HOST_DEVICE constexpr
-void
-mma_unpack(MMA_Traits<MMA_Op, MMA_Args...> const& traits,
-           Tensor<TD, DLayout>      & D,
-           Tensor<TA, ALayout> const& A,
-           Tensor<TB, BLayout> const& B,
-           Tensor<TC, CLayout> const& C)
-{
-  static_assert(is_rmem<TD>::value, "Expected registers in MMA_Atom::call");
-  static_assert(is_rmem<TA>::value, "Expected registers in MMA_Atom::call");
-  static_assert(is_rmem<TB>::value, "Expected registers in MMA_Atom::call");
-  static_assert(is_rmem<TC>::value, "Expected registers in MMA_Atom::call");
-
-  // Register value types from the MMA_Operation register arrays
-  using RegTypeA = typename remove_extent<typename MMA_Op::ARegisters>::type;
-  using RegTypeB = typename remove_extent<typename MMA_Op::BRegisters>::type;
-  using RegTypeC = typename remove_extent<typename MMA_Op::CRegisters>::type;
-
-  // SM90 GMMA take three arguments rather than four, try to assert C and D are aliased
-  static_assert(is_same<typename TD::value_type, typename TC::value_type>::value, "GMMA C and D value_type must match.");
-  static_assert(is_same<DLayout, CLayout>::value, "GMMA C and D layouts must match.");
-  // assert((void*)&C == (void*)&D);
-
-  Tensor rA = recast<RegTypeA>(A);
-  Tensor rB = recast<RegTypeB>(B);
-  Tensor rC = recast<RegTypeC>(D);  // NOTE: D and C are same, so use mutable D
-
-  constexpr int RegNumA = extent<typename MMA_Op::ARegisters>::value;
-  constexpr int RegNumB = extent<typename MMA_Op::BRegisters>::value;
-  constexpr int RegNumC = extent<typename MMA_Op::CRegisters>::value;
-
-  CUTE_STATIC_ASSERT_V(size(rA) == Int<RegNumA>{});
-  CUTE_STATIC_ASSERT_V(size(rB) == Int<RegNumB>{});
-  CUTE_STATIC_ASSERT_V(size(rC) == Int<RegNumC>{});
-
-  detail::explode(MMA_Op::fma,
-                  rA, make_int_sequence<RegNumA>{},
-                  rB, make_int_sequence<RegNumB>{},
-                  rC, make_int_sequence<RegNumC>{},
-                  &(traits.accumulate_), seq<0>{});
-}
-
-// Accumulator layouts
-template<int N>
-using CLayout_64xN   = Layout<Shape <Shape <  _4,_8, _4>,Shape < _2,_2,Int<N/8>>>,
-                              Stride<Stride<_128,_1,_16>,Stride<_64,_8,   _512>>>;
-
-using CLayout_64x8   = CLayout_64xN<  8>;
-using CLayout_64x16  = CLayout_64xN< 16>;
-using CLayout_64x32  = CLayout_64xN< 32>;
-using CLayout_64x64  = CLayout_64xN< 64>;
-using CLayout_64x96  = CLayout_64xN< 96>;
-using CLayout_64x128 = CLayout_64xN<128>;
-using CLayout_64x192 = CLayout_64xN<192>;
-using CLayout_64x256 = CLayout_64xN<256>;
-
-// Register source layout for 32-bit value types
-using ALayout_64x8   = Layout<Shape <Shape <  _4,_8, _4>,Shape <    _2,  _2>>,
-                              Stride<Stride< _64,_1,_16>,Stride<    _8,_256>>>;
-
-// Register source layout for 16-bit (sparse 32-bit) value types
-using ALayout_64x16  = Layout<Shape <Shape <  _4,_8, _4>,Shape < _2,_2,  _2>>,
-                              Stride<Stride<_128,_1,_16>,Stride<_64,_8,_512>>>;
-
-// Register source layout for 8-bit (sparse 16-bit) value types
-using ALayout_64x32  = Layout<Shape <Shape <  _4,_8, _4>,Shape < _4,_2,   _2>>,
-                              Stride<Stride<_256,_1,_16>,Stride<_64,_8,_1024>>>;
-
-// Register source layout for sparse 8-bit value types
-using ALayout_64x64  = Layout<Shape <Shape <  _4,_8, _4>,Shape < _8,_2,   _2>>,
-                              Stride<Stride<_512,_1,_16>,Stride<_64,_8,_2048>>>;
-
-// Shared memory source layouts for any value type
-template <int M, int K>
-using ABLayout       = Layout<Shape <_128,Shape <Int<M>,Int<K>>>,
-                              Stride<  _0,Stride<    _1,Int<M>>>>;
-
-} // end namespace SM90::GMMA
-
-using namespace SM90;
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x8x16_F16F16F16_SS = SM90::GMMA::MMA_64x8x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x8x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_8,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<  8, 16>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x8x16_F16F16F16_RS = SM90::GMMA::MMA_64x8x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x8x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_8,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<  8, 16>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x16x16_F16F16F16_SS = SM90::GMMA::MMA_64x16x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x16x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_16,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout< 16, 16>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x16x16_F16F16F16_RS = SM90::GMMA::MMA_64x16x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x16x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_16,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout< 16, 16>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x32x16_F16F16F16_SS = SM90::GMMA::MMA_64x32x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x32x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_32,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout< 32, 16>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x32x16_F16F16F16_RS = SM90::GMMA::MMA_64x32x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x32x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_32,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout< 32, 16>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x64x16_F16F16F16_SS = SM90::GMMA::MMA_64x64x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x64x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_64,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout< 64, 16>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x64x16_F16F16F16_RS = SM90::GMMA::MMA_64x64x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x64x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_64,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout< 64, 16>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x96x16_F16F16F16_SS = SM90::GMMA::MMA_64x96x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x96x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_96,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout< 96, 16>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x96x16_F16F16F16_RS = SM90::GMMA::MMA_64x96x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x96x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_96,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout< 96, 16>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x128x16_F16F16F16_SS = SM90::GMMA::MMA_64x128x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x128x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_128,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<128, 16>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x128x16_F16F16F16_RS = SM90::GMMA::MMA_64x128x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x128x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_128,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<128, 16>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x192x16_F16F16F16_SS = SM90::GMMA::MMA_64x192x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x192x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_192,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<192, 16>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x192x16_F16F16F16_RS = SM90::GMMA::MMA_64x192x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x192x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_192,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<192, 16>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x256x16_F16F16F16_SS = SM90::GMMA::MMA_64x256x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x256x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_256,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<256, 16>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x256x16_F16F16F16_RS = SM90::GMMA::MMA_64x256x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x256x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_256,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<256, 16>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x8x16_F32F16F16_SS = SM90::GMMA::MMA_64x8x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x8x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_8,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<  8, 16>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x8x16_F32F16F16_RS = SM90::GMMA::MMA_64x8x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x8x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_8,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<  8, 16>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x16x16_F32F16F16_SS = SM90::GMMA::MMA_64x16x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x16x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_16,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout< 16, 16>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x16x16_F32F16F16_RS = SM90::GMMA::MMA_64x16x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x16x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_16,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout< 16, 16>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x32x16_F32F16F16_SS = SM90::GMMA::MMA_64x32x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x32x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_32,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout< 32, 16>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x32x16_F32F16F16_RS = SM90::GMMA::MMA_64x32x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x32x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_32,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout< 32, 16>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x64x16_F32F16F16_SS = SM90::GMMA::MMA_64x64x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x64x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_64,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout< 64, 16>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x64x16_F32F16F16_RS = SM90::GMMA::MMA_64x64x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x64x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_64,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout< 64, 16>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x96x16_F32F16F16_SS = SM90::GMMA::MMA_64x96x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x96x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_96,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout< 96, 16>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x96x16_F32F16F16_RS = SM90::GMMA::MMA_64x96x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x96x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_96,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout< 96, 16>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x128x16_F32F16F16_SS = SM90::GMMA::MMA_64x128x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x128x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_128,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<128, 16>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x128x16_F32F16F16_RS = SM90::GMMA::MMA_64x128x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x128x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_128,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<128, 16>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x192x16_F32F16F16_SS = SM90::GMMA::MMA_64x192x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x192x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_192,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<192, 16>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x192x16_F32F16F16_RS = SM90::GMMA::MMA_64x192x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x192x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_192,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<192, 16>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x256x16_F32F16F16_SS = SM90::GMMA::MMA_64x256x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x256x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_256,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<256, 16>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x256x16_F32F16F16_RS = SM90::GMMA::MMA_64x256x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x256x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_256,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<256, 16>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x8x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x8x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x8x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_8,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<  8, 16>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x8x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x8x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x8x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_8,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<  8, 16>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x16x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x16x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x16x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_16,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout< 16, 16>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x16x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x16x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x16x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_16,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout< 16, 16>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x32x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x32x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x32x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_32,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout< 32, 16>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x32x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x32x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x32x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_32,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout< 32, 16>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x64x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x64x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x64x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_64,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout< 64, 16>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x64x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x64x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x64x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_64,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout< 64, 16>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x96x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x96x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x96x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_96,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout< 96, 16>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x96x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x96x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x96x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_96,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout< 96, 16>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x128x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x128x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x128x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_128,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<128, 16>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x128x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x128x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x128x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_128,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<128, 16>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x192x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x192x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x192x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_192,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<192, 16>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x192x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x192x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x192x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_192,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<192, 16>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x256x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x256x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x256x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_256,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<256, 16>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x256x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x256x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x256x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_256,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<256, 16>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x8x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x8x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x8x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64,  8>;
-  using BLayout = GMMA::ABLayout<  8,  8>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x8x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x8x8_F32TF32TF32_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x8x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x8;
-  using BLayout = GMMA::ABLayout<  8,  8>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x16x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x16x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x16x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64,  8>;
-  using BLayout = GMMA::ABLayout< 16,  8>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x16x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x16x8_F32TF32TF32_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x16x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x8;
-  using BLayout = GMMA::ABLayout< 16,  8>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x32x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x32x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x32x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64,  8>;
-  using BLayout = GMMA::ABLayout< 32,  8>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x32x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x32x8_F32TF32TF32_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x32x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x8;
-  using BLayout = GMMA::ABLayout< 32,  8>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x64x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x64x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x64x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64,  8>;
-  using BLayout = GMMA::ABLayout< 64,  8>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x64x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x64x8_F32TF32TF32_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x64x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x8;
-  using BLayout = GMMA::ABLayout< 64,  8>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x96x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x96x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x96x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64,  8>;
-  using BLayout = GMMA::ABLayout< 96,  8>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x96x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x96x8_F32TF32TF32_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x96x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x8;
-  using BLayout = GMMA::ABLayout< 96,  8>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x128x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x128x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x128x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64,  8>;
-  using BLayout = GMMA::ABLayout<128,  8>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x128x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x128x8_F32TF32TF32_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x128x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x8;
-  using BLayout = GMMA::ABLayout<128,  8>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x192x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x192x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x192x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64,  8>;
-  using BLayout = GMMA::ABLayout<192,  8>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x192x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x192x8_F32TF32TF32_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x192x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x8;
-  using BLayout = GMMA::ABLayout<192,  8>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x256x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x256x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x256x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64,  8>;
-  using BLayout = GMMA::ABLayout<256,  8>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x256x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x256x8_F32TF32TF32_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x256x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x8;
-  using BLayout = GMMA::ABLayout<256,  8>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x8x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x8x32_S32S8S8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x8x32_S32S8S8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<  8, 32>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x8x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x8x32_S32S8S8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x8x32_S32S8S8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<  8, 32>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x16x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x16x32_S32S8S8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x16x32_S32S8S8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 16, 32>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x16x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x16x32_S32S8S8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x16x32_S32S8S8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 16, 32>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x32x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x32x32_S32S8S8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x32x32_S32S8S8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 32, 32>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x32x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x32x32_S32S8S8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x32x32_S32S8S8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 32, 32>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x64x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x64x32_S32S8S8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x64x32_S32S8S8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 64, 32>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x64x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x64x32_S32S8S8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x64x32_S32S8S8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 64, 32>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x96x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x96x32_S32S8S8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x96x32_S32S8S8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 96, 32>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x96x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x96x32_S32S8S8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x96x32_S32S8S8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 96, 32>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x128x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x128x32_S32S8S8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x128x32_S32S8S8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<128, 32>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x128x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x128x32_S32S8S8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x128x32_S32S8S8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<128, 32>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x192x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x192x32_S32S8S8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x192x32_S32S8S8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<192, 32>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x192x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x192x32_S32S8S8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x192x32_S32S8S8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<192, 32>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x256x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x256x32_S32S8S8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x256x32_S32S8S8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<256, 32>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x256x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x256x32_S32S8S8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x256x32_S32S8S8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<256, 32>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x8x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x8x32_S32S8S8_RS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x8x32_S32S8S8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<  8, 32>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x8x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x8x32_S32S8S8_RS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x8x32_S32S8S8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<  8, 32>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x16x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x16x32_S32S8S8_RS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x16x32_S32S8S8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 16, 32>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x16x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x16x32_S32S8S8_RS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x16x32_S32S8S8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 16, 32>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x32x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x32x32_S32S8S8_RS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x32x32_S32S8S8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 32, 32>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x32x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x32x32_S32S8S8_RS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x32x32_S32S8S8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 32, 32>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x64x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x64x32_S32S8S8_RS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x64x32_S32S8S8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 64, 32>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x64x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x64x32_S32S8S8_RS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x64x32_S32S8S8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 64, 32>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x96x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x96x32_S32S8S8_RS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x96x32_S32S8S8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 96, 32>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x96x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x96x32_S32S8S8_RS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x96x32_S32S8S8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 96, 32>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x128x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x128x32_S32S8S8_RS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x128x32_S32S8S8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<128, 32>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x128x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x128x32_S32S8S8_RS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x128x32_S32S8S8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<128, 32>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x192x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x192x32_S32S8S8_RS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x192x32_S32S8S8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<192, 32>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x192x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x192x32_S32S8S8_RS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x192x32_S32S8S8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<192, 32>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x256x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x256x32_S32S8S8_RS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x256x32_S32S8S8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<256, 32>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x256x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x256x32_S32S8S8_RS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x256x32_S32S8S8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<256, 32>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x8x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x8x32_S32S8U8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x8x32_S32S8U8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<  8, 32>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x8x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x8x32_S32S8U8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x8x32_S32S8U8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<  8, 32>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x16x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x16x32_S32S8U8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x16x32_S32S8U8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 16, 32>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x16x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x16x32_S32S8U8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x16x32_S32S8U8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 16, 32>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x32x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x32x32_S32S8U8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x32x32_S32S8U8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 32, 32>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x32x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x32x32_S32S8U8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x32x32_S32S8U8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 32, 32>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x64x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x64x32_S32S8U8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x64x32_S32S8U8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 64, 32>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x64x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x64x32_S32S8U8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x64x32_S32S8U8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 64, 32>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x96x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x96x32_S32S8U8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x96x32_S32S8U8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 96, 32>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x96x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x96x32_S32S8U8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x96x32_S32S8U8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 96, 32>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x128x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x128x32_S32S8U8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x128x32_S32S8U8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<128, 32>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x128x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x128x32_S32S8U8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x128x32_S32S8U8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<128, 32>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x192x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x192x32_S32S8U8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x192x32_S32S8U8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<192, 32>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x192x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x192x32_S32S8U8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x192x32_S32S8U8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<192, 32>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x256x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x256x32_S32S8U8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x256x32_S32S8U8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<256, 32>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x256x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x256x32_S32S8U8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x256x32_S32S8U8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<256, 32>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x8x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x8x32_S32S8U8_RS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x8x32_S32S8U8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<  8, 32>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x8x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x8x32_S32S8U8_RS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x8x32_S32S8U8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<  8, 32>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x16x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x16x32_S32S8U8_RS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x16x32_S32S8U8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 16, 32>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x16x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x16x32_S32S8U8_RS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x16x32_S32S8U8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 16, 32>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x32x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x32x32_S32S8U8_RS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x32x32_S32S8U8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 32, 32>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x32x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x32x32_S32S8U8_RS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x32x32_S32S8U8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 32, 32>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x64x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x64x32_S32S8U8_RS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x64x32_S32S8U8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 64, 32>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x64x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x64x32_S32S8U8_RS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x64x32_S32S8U8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 64, 32>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x96x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x96x32_S32S8U8_RS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x96x32_S32S8U8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 96, 32>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x96x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x96x32_S32S8U8_RS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x96x32_S32S8U8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 96, 32>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x128x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x128x32_S32S8U8_RS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x128x32_S32S8U8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<128, 32>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x128x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x128x32_S32S8U8_RS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x128x32_S32S8U8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<128, 32>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x192x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x192x32_S32S8U8_RS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x192x32_S32S8U8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<192, 32>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x192x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x192x32_S32S8U8_RS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x192x32_S32S8U8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<192, 32>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x256x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x256x32_S32S8U8_RS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x256x32_S32S8U8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<256, 32>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x256x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x256x32_S32S8U8_RS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x256x32_S32S8U8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<256, 32>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x8x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x8x32_S32U8S8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x8x32_S32U8S8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<  8, 32>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x8x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x8x32_S32U8S8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x8x32_S32U8S8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<  8, 32>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x16x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x16x32_S32U8S8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x16x32_S32U8S8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 16, 32>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x16x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x16x32_S32U8S8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x16x32_S32U8S8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 16, 32>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x32x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x32x32_S32U8S8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x32x32_S32U8S8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 32, 32>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x32x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x32x32_S32U8S8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x32x32_S32U8S8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 32, 32>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x64x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x64x32_S32U8S8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x64x32_S32U8S8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 64, 32>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x64x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x64x32_S32U8S8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x64x32_S32U8S8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 64, 32>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x96x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x96x32_S32U8S8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x96x32_S32U8S8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 96, 32>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x96x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x96x32_S32U8S8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x96x32_S32U8S8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 96, 32>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x128x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x128x32_S32U8S8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x128x32_S32U8S8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<128, 32>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x128x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x128x32_S32U8S8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x128x32_S32U8S8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<128, 32>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x192x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x192x32_S32U8S8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x192x32_S32U8S8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<192, 32>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x192x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x192x32_S32U8S8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x192x32_S32U8S8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<192, 32>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x256x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x256x32_S32U8S8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x256x32_S32U8S8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<256, 32>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x256x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x256x32_S32U8S8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x256x32_S32U8S8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<256, 32>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x8x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x8x32_S32U8S8_RS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x8x32_S32U8S8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<  8, 32>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x8x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x8x32_S32U8S8_RS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x8x32_S32U8S8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<  8, 32>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x16x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x16x32_S32U8S8_RS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x16x32_S32U8S8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 16, 32>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x16x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x16x32_S32U8S8_RS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x16x32_S32U8S8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 16, 32>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x32x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x32x32_S32U8S8_RS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x32x32_S32U8S8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 32, 32>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x32x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x32x32_S32U8S8_RS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x32x32_S32U8S8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 32, 32>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x64x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x64x32_S32U8S8_RS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x64x32_S32U8S8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 64, 32>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x64x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x64x32_S32U8S8_RS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x64x32_S32U8S8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 64, 32>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x96x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x96x32_S32U8S8_RS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x96x32_S32U8S8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 96, 32>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x96x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x96x32_S32U8S8_RS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x96x32_S32U8S8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 96, 32>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x128x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x128x32_S32U8S8_RS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x128x32_S32U8S8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<128, 32>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x128x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x128x32_S32U8S8_RS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x128x32_S32U8S8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<128, 32>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x192x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x192x32_S32U8S8_RS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x192x32_S32U8S8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<192, 32>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x192x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x192x32_S32U8S8_RS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x192x32_S32U8S8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<192, 32>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x256x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x256x32_S32U8S8_RS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x256x32_S32U8S8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<256, 32>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x256x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x256x32_S32U8S8_RS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x256x32_S32U8S8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<256, 32>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x8x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x8x32_S32U8U8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x8x32_S32U8U8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<  8, 32>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x8x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x8x32_S32U8U8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x8x32_S32U8U8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<  8, 32>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x16x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x16x32_S32U8U8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x16x32_S32U8U8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 16, 32>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x16x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x16x32_S32U8U8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x16x32_S32U8U8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 16, 32>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x32x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x32x32_S32U8U8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x32x32_S32U8U8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 32, 32>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x32x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x32x32_S32U8U8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x32x32_S32U8U8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 32, 32>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x64x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x64x32_S32U8U8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x64x32_S32U8U8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 64, 32>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x64x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x64x32_S32U8U8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x64x32_S32U8U8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 64, 32>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x96x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x96x32_S32U8U8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x96x32_S32U8U8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 96, 32>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x96x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x96x32_S32U8U8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x96x32_S32U8U8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 96, 32>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x128x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x128x32_S32U8U8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x128x32_S32U8U8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<128, 32>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x128x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x128x32_S32U8U8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x128x32_S32U8U8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<128, 32>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x192x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x192x32_S32U8U8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x192x32_S32U8U8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<192, 32>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x192x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x192x32_S32U8U8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x192x32_S32U8U8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<192, 32>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x256x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x256x32_S32U8U8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x256x32_S32U8U8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<256, 32>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x256x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x256x32_S32U8U8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x256x32_S32U8U8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<256, 32>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x8x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x8x32_S32U8U8_RS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x8x32_S32U8U8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<  8, 32>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x8x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x8x32_S32U8U8_RS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x8x32_S32U8U8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<  8, 32>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x16x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x16x32_S32U8U8_RS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x16x32_S32U8U8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 16, 32>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x16x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x16x32_S32U8U8_RS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x16x32_S32U8U8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 16, 32>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x32x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x32x32_S32U8U8_RS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x32x32_S32U8U8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 32, 32>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x32x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x32x32_S32U8U8_RS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x32x32_S32U8U8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 32, 32>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x64x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x64x32_S32U8U8_RS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x64x32_S32U8U8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 64, 32>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x64x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x64x32_S32U8U8_RS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x64x32_S32U8U8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 64, 32>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x96x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x96x32_S32U8U8_RS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x96x32_S32U8U8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 96, 32>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x96x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x96x32_S32U8U8_RS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x96x32_S32U8U8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 96, 32>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x128x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x128x32_S32U8U8_RS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x128x32_S32U8U8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<128, 32>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x128x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x128x32_S32U8U8_RS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x128x32_S32U8U8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<128, 32>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x192x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x192x32_S32U8U8_RS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x192x32_S32U8U8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<192, 32>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x192x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x192x32_S32U8U8_RS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x192x32_S32U8U8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<192, 32>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x256x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x256x32_S32U8U8_RS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x256x32_S32U8U8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<256, 32>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x256x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x256x32_S32U8U8_RS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x256x32_S32U8U8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<256, 32>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x8x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x8x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x8x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<  8, 32>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x8x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x8x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x8x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<  8, 32>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x8x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x8x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x8x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<  8, 32>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x8x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x8x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x8x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<  8, 32>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x16x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x16x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x16x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 16, 32>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x16x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x16x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x16x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 16, 32>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x16x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x16x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x16x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 16, 32>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x16x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x16x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x16x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 16, 32>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x32x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x32x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x32x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 32, 32>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x32x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x32x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x32x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 32, 32>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x32x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x32x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x32x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 32, 32>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x32x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x32x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x32x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 32, 32>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x64x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x64x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x64x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 64, 32>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x64x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x64x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x64x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 64, 32>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x64x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x64x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x64x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 64, 32>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x64x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x64x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x64x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 64, 32>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x96x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x96x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x96x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 96, 32>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x96x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x96x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x96x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 96, 32>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x96x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x96x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x96x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 96, 32>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x96x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x96x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x96x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 96, 32>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x128x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x128x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x128x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<128, 32>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x128x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x128x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x128x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<128, 32>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x128x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x128x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x128x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<128, 32>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x128x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x128x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x128x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<128, 32>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x192x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x192x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x192x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<192, 32>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x192x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x192x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x192x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<192, 32>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x192x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x192x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x192x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<192, 32>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x192x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x192x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x192x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<192, 32>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x256x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x256x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x256x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<256, 32>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x256x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x256x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x256x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<256, 32>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x256x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x256x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x256x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<256, 32>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x256x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x256x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x256x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<256, 32>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x8x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x8x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x8x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<  8, 32>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x8x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x8x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x8x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<  8, 32>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x8x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x8x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x8x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<  8, 32>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x8x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x8x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x8x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<  8, 32>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x16x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x16x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x16x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 16, 32>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x16x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x16x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x16x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 16, 32>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x16x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x16x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x16x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 16, 32>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x16x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x16x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x16x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 16, 32>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x32x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x32x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x32x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 32, 32>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x32x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x32x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x32x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 32, 32>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x32x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x32x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x32x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 32, 32>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x32x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x32x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x32x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 32, 32>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x64x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x64x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x64x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 64, 32>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x64x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x64x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x64x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 64, 32>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x64x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x64x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x64x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 64, 32>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x64x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x64x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x64x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 64, 32>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x96x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x96x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x96x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 96, 32>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x96x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x96x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x96x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 96, 32>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x96x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x96x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x96x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 96, 32>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x96x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x96x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x96x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 96, 32>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x128x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x128x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x128x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<128, 32>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x128x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x128x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x128x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<128, 32>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x128x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x128x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x128x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<128, 32>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x128x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x128x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x128x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<128, 32>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x192x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x192x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x192x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<192, 32>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x192x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x192x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x192x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<192, 32>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x192x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x192x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x192x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<192, 32>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x192x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x192x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x192x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<192, 32>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x256x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x256x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x256x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<256, 32>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x256x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x256x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x256x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<256, 32>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x256x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x256x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x256x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<256, 32>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x256x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x256x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x256x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<256, 32>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x8x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x8x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x8x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<  8, 32>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x8x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x8x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x8x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<  8, 32>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x8x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x8x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x8x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<  8, 32>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x8x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x8x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x8x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<  8, 32>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x16x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x16x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x16x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 16, 32>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x16x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x16x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x16x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 16, 32>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x16x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x16x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x16x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 16, 32>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x16x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x16x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x16x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 16, 32>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x32x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x32x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x32x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 32, 32>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x32x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x32x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x32x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 32, 32>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x32x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x32x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x32x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 32, 32>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x32x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x32x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x32x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 32, 32>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x64x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x64x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x64x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 64, 32>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x64x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x64x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x64x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 64, 32>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x64x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x64x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x64x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 64, 32>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x64x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x64x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x64x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 64, 32>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x96x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x96x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x96x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 96, 32>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x96x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x96x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x96x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 96, 32>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x96x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x96x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x96x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 96, 32>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x96x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x96x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x96x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 96, 32>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x128x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x128x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x128x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<128, 32>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x128x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x128x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x128x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<128, 32>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x128x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x128x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x128x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<128, 32>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x128x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x128x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x128x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<128, 32>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x192x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x192x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x192x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<192, 32>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x192x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x192x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x192x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<192, 32>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x192x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x192x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x192x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<192, 32>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x192x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x192x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x192x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<192, 32>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x256x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x256x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x256x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<256, 32>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x256x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x256x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x256x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<256, 32>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x256x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x256x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x256x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<256, 32>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x256x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x256x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x256x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<256, 32>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x8x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x8x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x8x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<  8, 32>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x8x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x8x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x8x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<  8, 32>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x8x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x8x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x8x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<  8, 32>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x8x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x8x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x8x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<  8, 32>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x16x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x16x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x16x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 16, 32>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x16x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x16x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x16x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 16, 32>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x16x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x16x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x16x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 16, 32>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x16x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x16x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x16x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 16, 32>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x32x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x32x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x32x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 32, 32>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x32x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x32x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x32x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 32, 32>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x32x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x32x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x32x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 32, 32>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x32x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x32x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x32x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 32, 32>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x64x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x64x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x64x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 64, 32>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x64x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x64x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x64x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 64, 32>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x64x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x64x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x64x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 64, 32>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x64x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x64x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x64x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 64, 32>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x96x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x96x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x96x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 96, 32>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x96x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x96x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x96x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 96, 32>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x96x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x96x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x96x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 96, 32>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x96x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x96x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x96x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 96, 32>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x128x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x128x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x128x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<128, 32>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x128x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x128x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x128x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<128, 32>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x128x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x128x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x128x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<128, 32>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x128x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x128x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x128x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<128, 32>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x192x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x192x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x192x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<192, 32>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x192x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x192x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x192x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<192, 32>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x192x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x192x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x192x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<192, 32>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x192x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x192x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x192x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<192, 32>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x256x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x256x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x256x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<256, 32>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x256x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x256x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x256x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<256, 32>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x256x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x256x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x256x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<256, 32>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x256x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x256x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x256x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<256, 32>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // end namespace cute
-
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-#include "mma_traits_sm90_gmma_ext.hpp"
-#endif
diff --git a/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90_gmma_ext.hpp b/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90_gmma_ext.hpp
deleted file mode 100755
index 15e2412c8..000000000
--- a/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90_gmma_ext.hpp
+++ /dev/null
@@ -1,20116 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
- 
-#pragma once
-  
-#include <cute/arch/mma_sm90.hpp>
-#include <cute/atom/mma_traits.hpp>
-
-namespace cute {
-
-namespace SM90::GMMA {
-
-using CLayout_64x24  = CLayout_64xN< 24>;
-using CLayout_64x40  = CLayout_64xN< 40>;
-using CLayout_64x48  = CLayout_64xN< 48>;
-using CLayout_64x56  = CLayout_64xN< 56>;
-using CLayout_64x72  = CLayout_64xN< 72>;
-using CLayout_64x80  = CLayout_64xN< 80>;
-using CLayout_64x88  = CLayout_64xN< 88>;
-using CLayout_64x104 = CLayout_64xN<104>;
-using CLayout_64x112 = CLayout_64xN<112>;
-using CLayout_64x120 = CLayout_64xN<120>;
-using CLayout_64x136 = CLayout_64xN<136>;
-using CLayout_64x144 = CLayout_64xN<144>;
-using CLayout_64x152 = CLayout_64xN<152>;
-using CLayout_64x160 = CLayout_64xN<160>;
-using CLayout_64x168 = CLayout_64xN<168>;
-using CLayout_64x176 = CLayout_64xN<176>;
-using CLayout_64x184 = CLayout_64xN<184>;
-using CLayout_64x200 = CLayout_64xN<200>;
-using CLayout_64x208 = CLayout_64xN<208>;
-using CLayout_64x216 = CLayout_64xN<216>;
-using CLayout_64x224 = CLayout_64xN<224>;
-using CLayout_64x232 = CLayout_64xN<232>;
-using CLayout_64x240 = CLayout_64xN<240>;
-using CLayout_64x248 = CLayout_64xN<248>;
-
-}
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x24x16_F16F16F16_SS = SM90::GMMA::MMA_64x24x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x24x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_24,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout< 24, 16>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x24x16_F16F16F16_RS = SM90::GMMA::MMA_64x24x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x24x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_24,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout< 24, 16>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x40x16_F16F16F16_SS = SM90::GMMA::MMA_64x40x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x40x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_40,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout< 40, 16>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x40x16_F16F16F16_RS = SM90::GMMA::MMA_64x40x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x40x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_40,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout< 40, 16>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x48x16_F16F16F16_SS = SM90::GMMA::MMA_64x48x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x48x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_48,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout< 48, 16>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x48x16_F16F16F16_RS = SM90::GMMA::MMA_64x48x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x48x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_48,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout< 48, 16>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x56x16_F16F16F16_SS = SM90::GMMA::MMA_64x56x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x56x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_56,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout< 56, 16>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x56x16_F16F16F16_RS = SM90::GMMA::MMA_64x56x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x56x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_56,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout< 56, 16>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x72x16_F16F16F16_SS = SM90::GMMA::MMA_64x72x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x72x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_72,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout< 72, 16>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x72x16_F16F16F16_RS = SM90::GMMA::MMA_64x72x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x72x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_72,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout< 72, 16>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x80x16_F16F16F16_SS = SM90::GMMA::MMA_64x80x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x80x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_80,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout< 80, 16>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x80x16_F16F16F16_RS = SM90::GMMA::MMA_64x80x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x80x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_80,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout< 80, 16>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x88x16_F16F16F16_SS = SM90::GMMA::MMA_64x88x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x88x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_88,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout< 88, 16>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x88x16_F16F16F16_RS = SM90::GMMA::MMA_64x88x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x88x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_88,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout< 88, 16>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x104x16_F16F16F16_SS = SM90::GMMA::MMA_64x104x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x104x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_104,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<104, 16>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x104x16_F16F16F16_RS = SM90::GMMA::MMA_64x104x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x104x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_104,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<104, 16>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x112x16_F16F16F16_SS = SM90::GMMA::MMA_64x112x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x112x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_112,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<112, 16>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x112x16_F16F16F16_RS = SM90::GMMA::MMA_64x112x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x112x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_112,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<112, 16>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x120x16_F16F16F16_SS = SM90::GMMA::MMA_64x120x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x120x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_120,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<120, 16>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x120x16_F16F16F16_RS = SM90::GMMA::MMA_64x120x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x120x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_120,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<120, 16>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x136x16_F16F16F16_SS = SM90::GMMA::MMA_64x136x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x136x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_136,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<136, 16>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x136x16_F16F16F16_RS = SM90::GMMA::MMA_64x136x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x136x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_136,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<136, 16>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x144x16_F16F16F16_SS = SM90::GMMA::MMA_64x144x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x144x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_144,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<144, 16>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x144x16_F16F16F16_RS = SM90::GMMA::MMA_64x144x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x144x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_144,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<144, 16>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x152x16_F16F16F16_SS = SM90::GMMA::MMA_64x152x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x152x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_152,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<152, 16>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x152x16_F16F16F16_RS = SM90::GMMA::MMA_64x152x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x152x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_152,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<152, 16>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x160x16_F16F16F16_SS = SM90::GMMA::MMA_64x160x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x160x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_160,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<160, 16>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x160x16_F16F16F16_RS = SM90::GMMA::MMA_64x160x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x160x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_160,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<160, 16>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x168x16_F16F16F16_SS = SM90::GMMA::MMA_64x168x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x168x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_168,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<168, 16>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x168x16_F16F16F16_RS = SM90::GMMA::MMA_64x168x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x168x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_168,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<168, 16>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x176x16_F16F16F16_SS = SM90::GMMA::MMA_64x176x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x176x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_176,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<176, 16>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x176x16_F16F16F16_RS = SM90::GMMA::MMA_64x176x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x176x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_176,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<176, 16>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x184x16_F16F16F16_SS = SM90::GMMA::MMA_64x184x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x184x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_184,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<184, 16>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x184x16_F16F16F16_RS = SM90::GMMA::MMA_64x184x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x184x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_184,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<184, 16>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x200x16_F16F16F16_SS = SM90::GMMA::MMA_64x200x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x200x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_200,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<200, 16>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x200x16_F16F16F16_RS = SM90::GMMA::MMA_64x200x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x200x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_200,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<200, 16>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x208x16_F16F16F16_SS = SM90::GMMA::MMA_64x208x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x208x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_208,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<208, 16>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x208x16_F16F16F16_RS = SM90::GMMA::MMA_64x208x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x208x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_208,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<208, 16>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x216x16_F16F16F16_SS = SM90::GMMA::MMA_64x216x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x216x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_216,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<216, 16>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x216x16_F16F16F16_RS = SM90::GMMA::MMA_64x216x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x216x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_216,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<216, 16>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x224x16_F16F16F16_SS = SM90::GMMA::MMA_64x224x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x224x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_224,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<224, 16>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x224x16_F16F16F16_RS = SM90::GMMA::MMA_64x224x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x224x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_224,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<224, 16>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x232x16_F16F16F16_SS = SM90::GMMA::MMA_64x232x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x232x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_232,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<232, 16>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x232x16_F16F16F16_RS = SM90::GMMA::MMA_64x232x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x232x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_232,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<232, 16>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x240x16_F16F16F16_SS = SM90::GMMA::MMA_64x240x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x240x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_240,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<240, 16>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x240x16_F16F16F16_RS = SM90::GMMA::MMA_64x240x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x240x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_240,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<240, 16>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x248x16_F16F16F16_SS = SM90::GMMA::MMA_64x248x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x248x16_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_248,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<248, 16>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x248x16_F16F16F16_RS = SM90::GMMA::MMA_64x248x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x248x16_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_248,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<248, 16>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x24x16_F32F16F16_SS = SM90::GMMA::MMA_64x24x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x24x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_24,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout< 24, 16>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x24x16_F32F16F16_RS = SM90::GMMA::MMA_64x24x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x24x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_24,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout< 24, 16>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x40x16_F32F16F16_SS = SM90::GMMA::MMA_64x40x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x40x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_40,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout< 40, 16>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x40x16_F32F16F16_RS = SM90::GMMA::MMA_64x40x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x40x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_40,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout< 40, 16>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x48x16_F32F16F16_SS = SM90::GMMA::MMA_64x48x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x48x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_48,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout< 48, 16>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x48x16_F32F16F16_RS = SM90::GMMA::MMA_64x48x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x48x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_48,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout< 48, 16>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x56x16_F32F16F16_SS = SM90::GMMA::MMA_64x56x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x56x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_56,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout< 56, 16>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x56x16_F32F16F16_RS = SM90::GMMA::MMA_64x56x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x56x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_56,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout< 56, 16>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x72x16_F32F16F16_SS = SM90::GMMA::MMA_64x72x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x72x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_72,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout< 72, 16>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x72x16_F32F16F16_RS = SM90::GMMA::MMA_64x72x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x72x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_72,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout< 72, 16>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x80x16_F32F16F16_SS = SM90::GMMA::MMA_64x80x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x80x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_80,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout< 80, 16>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x80x16_F32F16F16_RS = SM90::GMMA::MMA_64x80x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x80x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_80,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout< 80, 16>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x88x16_F32F16F16_SS = SM90::GMMA::MMA_64x88x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x88x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_88,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout< 88, 16>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x88x16_F32F16F16_RS = SM90::GMMA::MMA_64x88x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x88x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_88,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout< 88, 16>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x104x16_F32F16F16_SS = SM90::GMMA::MMA_64x104x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x104x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_104,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<104, 16>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x104x16_F32F16F16_RS = SM90::GMMA::MMA_64x104x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x104x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_104,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<104, 16>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x112x16_F32F16F16_SS = SM90::GMMA::MMA_64x112x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x112x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_112,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<112, 16>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x112x16_F32F16F16_RS = SM90::GMMA::MMA_64x112x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x112x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_112,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<112, 16>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x120x16_F32F16F16_SS = SM90::GMMA::MMA_64x120x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x120x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_120,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<120, 16>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x120x16_F32F16F16_RS = SM90::GMMA::MMA_64x120x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x120x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_120,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<120, 16>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x136x16_F32F16F16_SS = SM90::GMMA::MMA_64x136x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x136x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_136,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<136, 16>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x136x16_F32F16F16_RS = SM90::GMMA::MMA_64x136x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x136x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_136,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<136, 16>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x144x16_F32F16F16_SS = SM90::GMMA::MMA_64x144x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x144x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_144,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<144, 16>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x144x16_F32F16F16_RS = SM90::GMMA::MMA_64x144x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x144x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_144,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<144, 16>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x152x16_F32F16F16_SS = SM90::GMMA::MMA_64x152x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x152x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_152,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<152, 16>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x152x16_F32F16F16_RS = SM90::GMMA::MMA_64x152x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x152x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_152,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<152, 16>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x160x16_F32F16F16_SS = SM90::GMMA::MMA_64x160x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x160x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_160,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<160, 16>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x160x16_F32F16F16_RS = SM90::GMMA::MMA_64x160x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x160x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_160,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<160, 16>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x168x16_F32F16F16_SS = SM90::GMMA::MMA_64x168x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x168x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_168,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<168, 16>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x168x16_F32F16F16_RS = SM90::GMMA::MMA_64x168x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x168x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_168,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<168, 16>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x176x16_F32F16F16_SS = SM90::GMMA::MMA_64x176x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x176x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_176,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<176, 16>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x176x16_F32F16F16_RS = SM90::GMMA::MMA_64x176x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x176x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_176,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<176, 16>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x184x16_F32F16F16_SS = SM90::GMMA::MMA_64x184x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x184x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_184,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<184, 16>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x184x16_F32F16F16_RS = SM90::GMMA::MMA_64x184x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x184x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_184,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<184, 16>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x200x16_F32F16F16_SS = SM90::GMMA::MMA_64x200x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x200x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_200,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<200, 16>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x200x16_F32F16F16_RS = SM90::GMMA::MMA_64x200x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x200x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_200,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<200, 16>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x208x16_F32F16F16_SS = SM90::GMMA::MMA_64x208x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x208x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_208,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<208, 16>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x208x16_F32F16F16_RS = SM90::GMMA::MMA_64x208x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x208x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_208,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<208, 16>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x216x16_F32F16F16_SS = SM90::GMMA::MMA_64x216x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x216x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_216,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<216, 16>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x216x16_F32F16F16_RS = SM90::GMMA::MMA_64x216x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x216x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_216,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<216, 16>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x224x16_F32F16F16_SS = SM90::GMMA::MMA_64x224x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x224x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_224,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<224, 16>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x224x16_F32F16F16_RS = SM90::GMMA::MMA_64x224x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x224x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_224,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<224, 16>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x232x16_F32F16F16_SS = SM90::GMMA::MMA_64x232x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x232x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_232,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<232, 16>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x232x16_F32F16F16_RS = SM90::GMMA::MMA_64x232x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x232x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_232,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<232, 16>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x240x16_F32F16F16_SS = SM90::GMMA::MMA_64x240x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x240x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_240,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<240, 16>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x240x16_F32F16F16_RS = SM90::GMMA::MMA_64x240x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x240x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_240,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<240, 16>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x248x16_F32F16F16_SS = SM90::GMMA::MMA_64x248x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x248x16_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_248,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<248, 16>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x248x16_F32F16F16_RS = SM90::GMMA::MMA_64x248x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x248x16_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = half_t;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_248,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<248, 16>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x24x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x24x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x24x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_24,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout< 24, 16>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x24x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x24x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x24x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_24,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout< 24, 16>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x40x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x40x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x40x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_40,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout< 40, 16>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x40x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x40x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x40x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_40,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout< 40, 16>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x48x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x48x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x48x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_48,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout< 48, 16>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x48x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x48x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x48x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_48,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout< 48, 16>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x56x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x56x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x56x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_56,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout< 56, 16>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x56x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x56x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x56x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_56,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout< 56, 16>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x72x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x72x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x72x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_72,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout< 72, 16>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x72x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x72x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x72x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_72,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout< 72, 16>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x80x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x80x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x80x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_80,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout< 80, 16>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x80x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x80x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x80x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_80,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout< 80, 16>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x88x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x88x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x88x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_88,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout< 88, 16>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x88x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x88x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x88x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_88,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout< 88, 16>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x104x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x104x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x104x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_104,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<104, 16>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x104x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x104x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x104x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_104,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<104, 16>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x112x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x112x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x112x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_112,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<112, 16>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x112x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x112x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x112x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_112,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<112, 16>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x120x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x120x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x120x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_120,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<120, 16>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x120x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x120x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x120x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_120,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<120, 16>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x136x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x136x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x136x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_136,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<136, 16>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x136x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x136x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x136x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_136,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<136, 16>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x144x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x144x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x144x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_144,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<144, 16>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x144x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x144x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x144x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_144,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<144, 16>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x152x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x152x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x152x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_152,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<152, 16>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x152x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x152x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x152x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_152,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<152, 16>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x160x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x160x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x160x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_160,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<160, 16>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x160x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x160x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x160x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_160,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<160, 16>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x168x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x168x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x168x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_168,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<168, 16>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x168x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x168x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x168x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_168,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<168, 16>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x176x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x176x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x176x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_176,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<176, 16>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x176x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x176x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x176x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_176,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<176, 16>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x184x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x184x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x184x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_184,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<184, 16>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x184x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x184x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x184x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_184,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<184, 16>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x200x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x200x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x200x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_200,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<200, 16>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x200x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x200x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x200x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_200,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<200, 16>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x208x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x208x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x208x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_208,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<208, 16>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x208x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x208x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x208x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_208,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<208, 16>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x216x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x216x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x216x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_216,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<216, 16>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x216x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x216x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x216x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_216,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<216, 16>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x224x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x224x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x224x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_224,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<224, 16>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x224x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x224x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x224x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_224,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<224, 16>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x232x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x232x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x232x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_232,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<232, 16>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x232x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x232x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x232x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_232,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<232, 16>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x240x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x240x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x240x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_240,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<240, 16>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x240x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x240x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x240x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_240,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<240, 16>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x248x16_F32BF16BF16_SS = SM90::GMMA::MMA_64x248x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>;
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x248x16_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_248,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using BLayout = GMMA::ABLayout<248, 16>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::Major tnspA,
-  GMMA::Major tnspB,
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x248x16_F32BF16BF16_RS = SM90::GMMA::MMA_64x248x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>; 
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x248x16_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = bfloat16_t;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_248,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using BLayout = GMMA::ABLayout<248, 16>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x24x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x24x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x24x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64,  8>;
-  using BLayout = GMMA::ABLayout< 24,  8>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x24x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x24x8_F32TF32TF32_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x24x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x8;
-  using BLayout = GMMA::ABLayout< 24,  8>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x40x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x40x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x40x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_40,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64,  8>;
-  using BLayout = GMMA::ABLayout< 40,  8>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x40x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x40x8_F32TF32TF32_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x40x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_40,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x8;
-  using BLayout = GMMA::ABLayout< 40,  8>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x48x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x48x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x48x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64,  8>;
-  using BLayout = GMMA::ABLayout< 48,  8>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x48x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x48x8_F32TF32TF32_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x48x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x8;
-  using BLayout = GMMA::ABLayout< 48,  8>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x56x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x56x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x56x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_56,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64,  8>;
-  using BLayout = GMMA::ABLayout< 56,  8>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x56x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x56x8_F32TF32TF32_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x56x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_56,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x8;
-  using BLayout = GMMA::ABLayout< 56,  8>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x72x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x72x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x72x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_72,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64,  8>;
-  using BLayout = GMMA::ABLayout< 72,  8>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x72x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x72x8_F32TF32TF32_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x72x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_72,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x8;
-  using BLayout = GMMA::ABLayout< 72,  8>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x80x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x80x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x80x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64,  8>;
-  using BLayout = GMMA::ABLayout< 80,  8>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x80x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x80x8_F32TF32TF32_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x80x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x8;
-  using BLayout = GMMA::ABLayout< 80,  8>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x88x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x88x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x88x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_88,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64,  8>;
-  using BLayout = GMMA::ABLayout< 88,  8>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x88x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x88x8_F32TF32TF32_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x88x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_88,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x8;
-  using BLayout = GMMA::ABLayout< 88,  8>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x104x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x104x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x104x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_104,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64,  8>;
-  using BLayout = GMMA::ABLayout<104,  8>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x104x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x104x8_F32TF32TF32_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x104x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_104,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x8;
-  using BLayout = GMMA::ABLayout<104,  8>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x112x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x112x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x112x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64,  8>;
-  using BLayout = GMMA::ABLayout<112,  8>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x112x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x112x8_F32TF32TF32_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x112x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x8;
-  using BLayout = GMMA::ABLayout<112,  8>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x120x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x120x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x120x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_120,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64,  8>;
-  using BLayout = GMMA::ABLayout<120,  8>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x120x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x120x8_F32TF32TF32_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x120x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_120,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x8;
-  using BLayout = GMMA::ABLayout<120,  8>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x136x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x136x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x136x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_136,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64,  8>;
-  using BLayout = GMMA::ABLayout<136,  8>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x136x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x136x8_F32TF32TF32_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x136x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_136,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x8;
-  using BLayout = GMMA::ABLayout<136,  8>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x144x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x144x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x144x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64,  8>;
-  using BLayout = GMMA::ABLayout<144,  8>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x144x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x144x8_F32TF32TF32_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x144x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x8;
-  using BLayout = GMMA::ABLayout<144,  8>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x152x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x152x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x152x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_152,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64,  8>;
-  using BLayout = GMMA::ABLayout<152,  8>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x152x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x152x8_F32TF32TF32_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x152x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_152,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x8;
-  using BLayout = GMMA::ABLayout<152,  8>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x160x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x160x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x160x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64,  8>;
-  using BLayout = GMMA::ABLayout<160,  8>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x160x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x160x8_F32TF32TF32_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x160x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x8;
-  using BLayout = GMMA::ABLayout<160,  8>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x168x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x168x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x168x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_168,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64,  8>;
-  using BLayout = GMMA::ABLayout<168,  8>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x168x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x168x8_F32TF32TF32_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x168x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_168,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x8;
-  using BLayout = GMMA::ABLayout<168,  8>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x176x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x176x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x176x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64,  8>;
-  using BLayout = GMMA::ABLayout<176,  8>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x176x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x176x8_F32TF32TF32_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x176x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x8;
-  using BLayout = GMMA::ABLayout<176,  8>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x184x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x184x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x184x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_184,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64,  8>;
-  using BLayout = GMMA::ABLayout<184,  8>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x184x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x184x8_F32TF32TF32_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x184x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_184,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x8;
-  using BLayout = GMMA::ABLayout<184,  8>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x200x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x200x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x200x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_200,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64,  8>;
-  using BLayout = GMMA::ABLayout<200,  8>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x200x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x200x8_F32TF32TF32_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x200x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_200,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x8;
-  using BLayout = GMMA::ABLayout<200,  8>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x208x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x208x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x208x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64,  8>;
-  using BLayout = GMMA::ABLayout<208,  8>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x208x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x208x8_F32TF32TF32_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x208x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x8;
-  using BLayout = GMMA::ABLayout<208,  8>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x216x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x216x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x216x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_216,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64,  8>;
-  using BLayout = GMMA::ABLayout<216,  8>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x216x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x216x8_F32TF32TF32_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x216x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_216,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x8;
-  using BLayout = GMMA::ABLayout<216,  8>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x224x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x224x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x224x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64,  8>;
-  using BLayout = GMMA::ABLayout<224,  8>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x224x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x224x8_F32TF32TF32_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x224x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x8;
-  using BLayout = GMMA::ABLayout<224,  8>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x232x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x232x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x232x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_232,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64,  8>;
-  using BLayout = GMMA::ABLayout<232,  8>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x232x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x232x8_F32TF32TF32_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x232x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_232,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x8;
-  using BLayout = GMMA::ABLayout<232,  8>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x240x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x240x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x240x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64,  8>;
-  using BLayout = GMMA::ABLayout<240,  8>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x240x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x240x8_F32TF32TF32_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x240x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x8;
-  using BLayout = GMMA::ABLayout<240,  8>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x248x8_F32TF32TF32_SS_TN = SM90::GMMA::MMA_64x248x8_F32TF32TF32_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x248x8_F32TF32TF32_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_248,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64,  8>;
-  using BLayout = GMMA::ABLayout<248,  8>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x248x8_F32TF32TF32_RS_TN = SM90::GMMA::MMA_64x248x8_F32TF32TF32_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x248x8_F32TF32TF32_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = tfloat32_t;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_248,_8>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x8;
-  using BLayout = GMMA::ABLayout<248,  8>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x24x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x24x32_S32S8S8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x24x32_S32S8S8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 24, 32>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x24x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x24x32_S32S8S8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x24x32_S32S8S8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 24, 32>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x48x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x48x32_S32S8S8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x48x32_S32S8S8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 48, 32>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x48x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x48x32_S32S8S8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x48x32_S32S8S8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 48, 32>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x80x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x80x32_S32S8S8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x80x32_S32S8S8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 80, 32>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x80x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x80x32_S32S8S8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x80x32_S32S8S8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 80, 32>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x112x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x112x32_S32S8S8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x112x32_S32S8S8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<112, 32>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x112x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x112x32_S32S8S8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x112x32_S32S8S8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<112, 32>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x144x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x144x32_S32S8S8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x144x32_S32S8S8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<144, 32>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x144x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x144x32_S32S8S8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x144x32_S32S8S8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<144, 32>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x160x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x160x32_S32S8S8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x160x32_S32S8S8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<160, 32>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x160x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x160x32_S32S8S8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x160x32_S32S8S8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<160, 32>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x176x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x176x32_S32S8S8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x176x32_S32S8S8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<176, 32>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x176x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x176x32_S32S8S8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x176x32_S32S8S8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<176, 32>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x208x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x208x32_S32S8S8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x208x32_S32S8S8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<208, 32>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x208x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x208x32_S32S8S8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x208x32_S32S8S8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<208, 32>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x224x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x224x32_S32S8S8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x224x32_S32S8S8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<224, 32>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x224x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x224x32_S32S8S8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x224x32_S32S8S8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<224, 32>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x240x32_S32S8S8_SS_TN = SM90::GMMA::MMA_64x240x32_S32S8S8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x240x32_S32S8S8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<240, 32>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x240x32_S32S8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x240x32_S32S8S8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x240x32_S32S8S8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<240, 32>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x24x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x24x32_S32S8S8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x24x32_S32S8S8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 24, 32>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x24x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x24x32_S32S8S8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x24x32_S32S8S8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 24, 32>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x48x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x48x32_S32S8S8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x48x32_S32S8S8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 48, 32>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x48x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x48x32_S32S8S8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x48x32_S32S8S8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 48, 32>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x80x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x80x32_S32S8S8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x80x32_S32S8S8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 80, 32>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x80x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x80x32_S32S8S8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x80x32_S32S8S8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 80, 32>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x112x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x112x32_S32S8S8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x112x32_S32S8S8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<112, 32>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x112x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x112x32_S32S8S8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x112x32_S32S8S8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<112, 32>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x144x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x144x32_S32S8S8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x144x32_S32S8S8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<144, 32>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x144x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x144x32_S32S8S8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x144x32_S32S8S8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<144, 32>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x160x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x160x32_S32S8S8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x160x32_S32S8S8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<160, 32>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x160x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x160x32_S32S8S8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x160x32_S32S8S8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<160, 32>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x176x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x176x32_S32S8S8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x176x32_S32S8S8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<176, 32>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x176x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x176x32_S32S8S8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x176x32_S32S8S8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<176, 32>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x208x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x208x32_S32S8S8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x208x32_S32S8S8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<208, 32>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x208x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x208x32_S32S8S8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x208x32_S32S8S8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<208, 32>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x224x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x224x32_S32S8S8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x224x32_S32S8S8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<224, 32>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x224x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x224x32_S32S8S8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x224x32_S32S8S8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<224, 32>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x240x32_S32S8S8_RS_TN = SM90::GMMA::MMA_64x240x32_S32S8S8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x240x32_S32S8S8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<240, 32>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x240x32_S32S8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x240x32_S32S8S8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x240x32_S32S8S8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<240, 32>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x24x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x24x32_S32S8U8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x24x32_S32S8U8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 24, 32>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x24x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x24x32_S32S8U8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x24x32_S32S8U8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 24, 32>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x48x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x48x32_S32S8U8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x48x32_S32S8U8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 48, 32>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x48x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x48x32_S32S8U8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x48x32_S32S8U8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 48, 32>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x80x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x80x32_S32S8U8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x80x32_S32S8U8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 80, 32>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x80x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x80x32_S32S8U8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x80x32_S32S8U8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 80, 32>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x112x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x112x32_S32S8U8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x112x32_S32S8U8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<112, 32>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x112x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x112x32_S32S8U8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x112x32_S32S8U8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<112, 32>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x144x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x144x32_S32S8U8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x144x32_S32S8U8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<144, 32>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x144x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x144x32_S32S8U8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x144x32_S32S8U8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<144, 32>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x160x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x160x32_S32S8U8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x160x32_S32S8U8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<160, 32>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x160x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x160x32_S32S8U8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x160x32_S32S8U8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<160, 32>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x176x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x176x32_S32S8U8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x176x32_S32S8U8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<176, 32>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x176x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x176x32_S32S8U8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x176x32_S32S8U8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<176, 32>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x208x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x208x32_S32S8U8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x208x32_S32S8U8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<208, 32>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x208x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x208x32_S32S8U8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x208x32_S32S8U8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<208, 32>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x224x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x224x32_S32S8U8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x224x32_S32S8U8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<224, 32>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x224x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x224x32_S32S8U8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x224x32_S32S8U8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<224, 32>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x240x32_S32S8U8_SS_TN = SM90::GMMA::MMA_64x240x32_S32S8U8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x240x32_S32S8U8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<240, 32>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x240x32_S32S8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x240x32_S32S8U8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x240x32_S32S8U8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<240, 32>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x24x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x24x32_S32S8U8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x24x32_S32S8U8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 24, 32>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x24x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x24x32_S32S8U8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x24x32_S32S8U8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 24, 32>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x48x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x48x32_S32S8U8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x48x32_S32S8U8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 48, 32>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x48x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x48x32_S32S8U8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x48x32_S32S8U8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 48, 32>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x80x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x80x32_S32S8U8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x80x32_S32S8U8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 80, 32>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x80x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x80x32_S32S8U8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x80x32_S32S8U8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 80, 32>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x112x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x112x32_S32S8U8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x112x32_S32S8U8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<112, 32>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x112x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x112x32_S32S8U8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x112x32_S32S8U8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<112, 32>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x144x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x144x32_S32S8U8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x144x32_S32S8U8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<144, 32>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x144x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x144x32_S32S8U8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x144x32_S32S8U8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<144, 32>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x160x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x160x32_S32S8U8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x160x32_S32S8U8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<160, 32>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x160x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x160x32_S32S8U8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x160x32_S32S8U8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<160, 32>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x176x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x176x32_S32S8U8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x176x32_S32S8U8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<176, 32>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x176x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x176x32_S32S8U8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x176x32_S32S8U8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<176, 32>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x208x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x208x32_S32S8U8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x208x32_S32S8U8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<208, 32>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x208x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x208x32_S32S8U8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x208x32_S32S8U8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<208, 32>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x224x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x224x32_S32S8U8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x224x32_S32S8U8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<224, 32>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x224x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x224x32_S32S8U8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x224x32_S32S8U8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<224, 32>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x240x32_S32S8U8_RS_TN = SM90::GMMA::MMA_64x240x32_S32S8U8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x240x32_S32S8U8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<240, 32>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x240x32_S32S8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x240x32_S32S8U8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x240x32_S32S8U8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = int8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<240, 32>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x24x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x24x32_S32U8S8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x24x32_S32U8S8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 24, 32>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x24x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x24x32_S32U8S8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x24x32_S32U8S8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 24, 32>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x48x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x48x32_S32U8S8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x48x32_S32U8S8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 48, 32>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x48x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x48x32_S32U8S8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x48x32_S32U8S8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 48, 32>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x80x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x80x32_S32U8S8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x80x32_S32U8S8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 80, 32>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x80x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x80x32_S32U8S8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x80x32_S32U8S8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 80, 32>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x112x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x112x32_S32U8S8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x112x32_S32U8S8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<112, 32>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x112x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x112x32_S32U8S8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x112x32_S32U8S8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<112, 32>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x144x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x144x32_S32U8S8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x144x32_S32U8S8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<144, 32>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x144x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x144x32_S32U8S8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x144x32_S32U8S8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<144, 32>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x160x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x160x32_S32U8S8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x160x32_S32U8S8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<160, 32>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x160x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x160x32_S32U8S8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x160x32_S32U8S8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<160, 32>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x176x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x176x32_S32U8S8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x176x32_S32U8S8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<176, 32>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x176x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x176x32_S32U8S8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x176x32_S32U8S8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<176, 32>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x208x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x208x32_S32U8S8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x208x32_S32U8S8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<208, 32>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x208x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x208x32_S32U8S8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x208x32_S32U8S8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<208, 32>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x224x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x224x32_S32U8S8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x224x32_S32U8S8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<224, 32>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x224x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x224x32_S32U8S8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x224x32_S32U8S8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<224, 32>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x240x32_S32U8S8_SS_TN = SM90::GMMA::MMA_64x240x32_S32U8S8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x240x32_S32U8S8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<240, 32>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x240x32_S32U8S8_SS_TN_SATURATE = SM90::GMMA::MMA_64x240x32_S32U8S8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x240x32_S32U8S8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<240, 32>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x24x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x24x32_S32U8S8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x24x32_S32U8S8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 24, 32>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x24x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x24x32_S32U8S8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x24x32_S32U8S8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 24, 32>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x48x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x48x32_S32U8S8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x48x32_S32U8S8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 48, 32>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x48x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x48x32_S32U8S8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x48x32_S32U8S8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 48, 32>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x80x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x80x32_S32U8S8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x80x32_S32U8S8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 80, 32>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x80x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x80x32_S32U8S8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x80x32_S32U8S8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 80, 32>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x112x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x112x32_S32U8S8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x112x32_S32U8S8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<112, 32>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x112x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x112x32_S32U8S8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x112x32_S32U8S8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<112, 32>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x144x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x144x32_S32U8S8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x144x32_S32U8S8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<144, 32>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x144x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x144x32_S32U8S8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x144x32_S32U8S8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<144, 32>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x160x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x160x32_S32U8S8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x160x32_S32U8S8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<160, 32>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x160x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x160x32_S32U8S8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x160x32_S32U8S8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<160, 32>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x176x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x176x32_S32U8S8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x176x32_S32U8S8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<176, 32>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x176x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x176x32_S32U8S8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x176x32_S32U8S8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<176, 32>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x208x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x208x32_S32U8S8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x208x32_S32U8S8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<208, 32>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x208x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x208x32_S32U8S8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x208x32_S32U8S8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<208, 32>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x224x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x224x32_S32U8S8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x224x32_S32U8S8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<224, 32>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x224x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x224x32_S32U8S8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x224x32_S32U8S8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<224, 32>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x240x32_S32U8S8_RS_TN = SM90::GMMA::MMA_64x240x32_S32U8S8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x240x32_S32U8S8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<240, 32>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x240x32_S32U8S8_RS_TN_SATURATE = SM90::GMMA::MMA_64x240x32_S32U8S8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x240x32_S32U8S8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<240, 32>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x24x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x24x32_S32U8U8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x24x32_S32U8U8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 24, 32>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x24x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x24x32_S32U8U8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x24x32_S32U8U8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 24, 32>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x48x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x48x32_S32U8U8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x48x32_S32U8U8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 48, 32>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x48x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x48x32_S32U8U8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x48x32_S32U8U8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 48, 32>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x80x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x80x32_S32U8U8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x80x32_S32U8U8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 80, 32>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x80x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x80x32_S32U8U8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x80x32_S32U8U8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 80, 32>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x112x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x112x32_S32U8U8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x112x32_S32U8U8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<112, 32>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x112x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x112x32_S32U8U8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x112x32_S32U8U8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<112, 32>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x144x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x144x32_S32U8U8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x144x32_S32U8U8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<144, 32>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x144x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x144x32_S32U8U8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x144x32_S32U8U8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<144, 32>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x160x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x160x32_S32U8U8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x160x32_S32U8U8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<160, 32>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x160x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x160x32_S32U8U8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x160x32_S32U8U8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<160, 32>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x176x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x176x32_S32U8U8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x176x32_S32U8U8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<176, 32>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x176x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x176x32_S32U8U8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x176x32_S32U8U8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<176, 32>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x208x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x208x32_S32U8U8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x208x32_S32U8U8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<208, 32>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x208x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x208x32_S32U8U8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x208x32_S32U8U8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<208, 32>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x224x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x224x32_S32U8U8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x224x32_S32U8U8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<224, 32>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x224x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x224x32_S32U8U8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x224x32_S32U8U8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<224, 32>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x240x32_S32U8U8_SS_TN = SM90::GMMA::MMA_64x240x32_S32U8U8_SS_TN;
-
-template <>
-struct MMA_Traits<SM90_64x240x32_S32U8U8_SS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<240, 32>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x240x32_S32U8U8_SS_TN_SATURATE = SM90::GMMA::MMA_64x240x32_S32U8U8_SS_TN_SATURATE;
-
-template <>
-struct MMA_Traits<SM90_64x240x32_S32U8U8_SS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<240, 32>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x24x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x24x32_S32U8U8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x24x32_S32U8U8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 24, 32>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x24x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x24x32_S32U8U8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x24x32_S32U8U8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 24, 32>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x48x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x48x32_S32U8U8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x48x32_S32U8U8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 48, 32>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x48x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x48x32_S32U8U8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x48x32_S32U8U8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 48, 32>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x80x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x80x32_S32U8U8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x80x32_S32U8U8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 80, 32>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x80x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x80x32_S32U8U8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x80x32_S32U8U8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 80, 32>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x112x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x112x32_S32U8U8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x112x32_S32U8U8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<112, 32>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x112x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x112x32_S32U8U8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x112x32_S32U8U8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<112, 32>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x144x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x144x32_S32U8U8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x144x32_S32U8U8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<144, 32>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x144x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x144x32_S32U8U8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x144x32_S32U8U8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<144, 32>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x160x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x160x32_S32U8U8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x160x32_S32U8U8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<160, 32>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x160x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x160x32_S32U8U8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x160x32_S32U8U8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<160, 32>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x176x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x176x32_S32U8U8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x176x32_S32U8U8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<176, 32>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x176x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x176x32_S32U8U8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x176x32_S32U8U8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<176, 32>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x208x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x208x32_S32U8U8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x208x32_S32U8U8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<208, 32>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x208x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x208x32_S32U8U8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x208x32_S32U8U8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<208, 32>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x224x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x224x32_S32U8U8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x224x32_S32U8U8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<224, 32>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x224x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x224x32_S32U8U8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x224x32_S32U8U8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<224, 32>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x240x32_S32U8U8_RS_TN = SM90::GMMA::MMA_64x240x32_S32U8U8_RS_TN; 
-
-template <>
-struct MMA_Traits<SM90_64x240x32_S32U8U8_RS_TN>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<240, 32>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-using SM90_64x240x32_S32U8U8_RS_TN_SATURATE = SM90::GMMA::MMA_64x240x32_S32U8U8_RS_TN_SATURATE; 
-
-template <>
-struct MMA_Traits<SM90_64x240x32_S32U8U8_RS_TN_SATURATE>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = uint8_t;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<240, 32>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x24x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x24x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x24x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 24, 32>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x24x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x24x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x24x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 24, 32>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x24x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x24x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x24x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 24, 32>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x24x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x24x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x24x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 24, 32>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x40x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x40x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x40x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_40,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 40, 32>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x40x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x40x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x40x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_40,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 40, 32>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x40x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x40x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x40x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_40,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 40, 32>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x40x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x40x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x40x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_40,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 40, 32>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x48x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x48x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x48x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 48, 32>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x48x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x48x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x48x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 48, 32>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x48x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x48x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x48x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 48, 32>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x48x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x48x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x48x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 48, 32>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x56x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x56x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x56x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_56,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 56, 32>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x56x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x56x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x56x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_56,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 56, 32>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x56x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x56x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x56x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_56,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 56, 32>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x56x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x56x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x56x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_56,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 56, 32>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x72x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x72x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x72x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_72,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 72, 32>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x72x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x72x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x72x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_72,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 72, 32>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x72x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x72x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x72x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_72,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 72, 32>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x72x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x72x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x72x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_72,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 72, 32>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x80x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x80x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x80x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 80, 32>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x80x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x80x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x80x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 80, 32>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x80x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x80x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x80x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 80, 32>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x80x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x80x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x80x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 80, 32>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x88x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x88x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x88x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_88,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 88, 32>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x88x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x88x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x88x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_88,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 88, 32>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x88x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x88x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x88x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_88,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 88, 32>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x88x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x88x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x88x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_88,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 88, 32>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x104x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x104x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x104x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_104,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<104, 32>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x104x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x104x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x104x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_104,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<104, 32>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x104x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x104x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x104x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_104,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<104, 32>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x104x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x104x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x104x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_104,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<104, 32>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x112x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x112x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x112x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<112, 32>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x112x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x112x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x112x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<112, 32>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x112x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x112x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x112x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<112, 32>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x112x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x112x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x112x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<112, 32>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x120x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x120x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x120x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_120,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<120, 32>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x120x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x120x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x120x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_120,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<120, 32>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x120x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x120x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x120x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_120,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<120, 32>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x120x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x120x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x120x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_120,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<120, 32>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x136x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x136x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x136x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_136,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<136, 32>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x136x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x136x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x136x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_136,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<136, 32>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x136x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x136x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x136x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_136,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<136, 32>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x136x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x136x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x136x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_136,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<136, 32>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x144x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x144x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x144x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<144, 32>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x144x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x144x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x144x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<144, 32>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x144x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x144x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x144x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<144, 32>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x144x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x144x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x144x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<144, 32>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x152x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x152x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x152x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_152,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<152, 32>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x152x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x152x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x152x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_152,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<152, 32>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x152x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x152x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x152x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_152,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<152, 32>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x152x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x152x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x152x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_152,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<152, 32>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x160x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x160x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x160x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<160, 32>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x160x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x160x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x160x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<160, 32>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x160x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x160x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x160x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<160, 32>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x160x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x160x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x160x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<160, 32>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x168x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x168x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x168x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_168,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<168, 32>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x168x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x168x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x168x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_168,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<168, 32>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x168x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x168x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x168x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_168,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<168, 32>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x168x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x168x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x168x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_168,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<168, 32>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x176x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x176x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x176x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<176, 32>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x176x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x176x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x176x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<176, 32>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x176x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x176x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x176x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<176, 32>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x176x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x176x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x176x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<176, 32>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x184x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x184x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x184x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_184,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<184, 32>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x184x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x184x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x184x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_184,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<184, 32>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x184x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x184x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x184x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_184,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<184, 32>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x184x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x184x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x184x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_184,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<184, 32>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x200x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x200x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x200x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_200,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<200, 32>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x200x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x200x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x200x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_200,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<200, 32>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x200x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x200x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x200x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_200,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<200, 32>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x200x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x200x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x200x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_200,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<200, 32>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x208x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x208x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x208x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<208, 32>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x208x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x208x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x208x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<208, 32>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x208x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x208x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x208x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<208, 32>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x208x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x208x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x208x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<208, 32>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x216x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x216x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x216x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_216,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<216, 32>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x216x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x216x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x216x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_216,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<216, 32>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x216x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x216x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x216x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_216,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<216, 32>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x216x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x216x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x216x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_216,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<216, 32>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x224x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x224x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x224x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<224, 32>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x224x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x224x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x224x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<224, 32>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x224x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x224x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x224x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<224, 32>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x224x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x224x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x224x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<224, 32>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x232x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x232x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x232x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_232,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<232, 32>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x232x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x232x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x232x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_232,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<232, 32>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x232x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x232x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x232x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_232,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<232, 32>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x232x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x232x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x232x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_232,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<232, 32>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x240x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x240x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x240x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<240, 32>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x240x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x240x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x240x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<240, 32>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x240x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x240x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x240x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<240, 32>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x240x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x240x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x240x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<240, 32>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x248x32_F16E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x248x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x248x32_F16E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_248,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<248, 32>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x248x32_F16E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x248x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x248x32_F16E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_248,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<248, 32>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x248x32_F32E4M3E4M3_SS_TN = SM90::GMMA::MMA_64x248x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x248x32_F32E4M3E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_248,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<248, 32>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x248x32_F32E4M3E4M3_RS_TN = SM90::GMMA::MMA_64x248x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x248x32_F32E4M3E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_248,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<248, 32>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x24x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x24x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x24x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 24, 32>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x24x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x24x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x24x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 24, 32>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x24x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x24x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x24x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 24, 32>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x24x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x24x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x24x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 24, 32>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x40x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x40x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x40x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_40,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 40, 32>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x40x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x40x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x40x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_40,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 40, 32>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x40x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x40x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x40x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_40,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 40, 32>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x40x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x40x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x40x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_40,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 40, 32>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x48x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x48x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x48x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 48, 32>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x48x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x48x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x48x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 48, 32>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x48x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x48x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x48x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 48, 32>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x48x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x48x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x48x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 48, 32>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x56x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x56x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x56x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_56,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 56, 32>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x56x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x56x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x56x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_56,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 56, 32>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x56x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x56x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x56x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_56,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 56, 32>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x56x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x56x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x56x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_56,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 56, 32>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x72x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x72x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x72x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_72,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 72, 32>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x72x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x72x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x72x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_72,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 72, 32>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x72x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x72x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x72x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_72,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 72, 32>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x72x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x72x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x72x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_72,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 72, 32>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x80x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x80x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x80x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 80, 32>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x80x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x80x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x80x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 80, 32>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x80x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x80x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x80x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 80, 32>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x80x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x80x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x80x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 80, 32>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x88x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x88x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x88x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_88,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 88, 32>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x88x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x88x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x88x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_88,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 88, 32>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x88x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x88x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x88x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_88,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 88, 32>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x88x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x88x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x88x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_88,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 88, 32>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x104x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x104x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x104x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_104,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<104, 32>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x104x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x104x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x104x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_104,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<104, 32>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x104x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x104x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x104x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_104,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<104, 32>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x104x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x104x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x104x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_104,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<104, 32>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x112x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x112x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x112x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<112, 32>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x112x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x112x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x112x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<112, 32>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x112x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x112x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x112x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<112, 32>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x112x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x112x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x112x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<112, 32>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x120x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x120x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x120x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_120,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<120, 32>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x120x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x120x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x120x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_120,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<120, 32>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x120x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x120x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x120x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_120,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<120, 32>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x120x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x120x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x120x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_120,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<120, 32>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x136x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x136x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x136x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_136,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<136, 32>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x136x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x136x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x136x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_136,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<136, 32>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x136x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x136x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x136x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_136,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<136, 32>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x136x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x136x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x136x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_136,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<136, 32>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x144x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x144x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x144x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<144, 32>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x144x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x144x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x144x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<144, 32>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x144x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x144x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x144x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<144, 32>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x144x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x144x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x144x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<144, 32>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x152x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x152x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x152x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_152,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<152, 32>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x152x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x152x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x152x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_152,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<152, 32>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x152x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x152x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x152x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_152,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<152, 32>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x152x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x152x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x152x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_152,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<152, 32>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x160x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x160x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x160x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<160, 32>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x160x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x160x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x160x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<160, 32>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x160x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x160x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x160x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<160, 32>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x160x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x160x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x160x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<160, 32>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x168x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x168x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x168x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_168,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<168, 32>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x168x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x168x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x168x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_168,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<168, 32>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x168x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x168x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x168x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_168,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<168, 32>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x168x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x168x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x168x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_168,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<168, 32>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x176x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x176x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x176x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<176, 32>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x176x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x176x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x176x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<176, 32>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x176x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x176x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x176x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<176, 32>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x176x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x176x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x176x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<176, 32>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x184x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x184x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x184x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_184,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<184, 32>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x184x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x184x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x184x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_184,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<184, 32>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x184x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x184x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x184x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_184,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<184, 32>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x184x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x184x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x184x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_184,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<184, 32>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x200x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x200x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x200x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_200,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<200, 32>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x200x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x200x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x200x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_200,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<200, 32>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x200x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x200x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x200x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_200,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<200, 32>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x200x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x200x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x200x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_200,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<200, 32>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x208x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x208x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x208x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<208, 32>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x208x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x208x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x208x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<208, 32>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x208x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x208x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x208x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<208, 32>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x208x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x208x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x208x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<208, 32>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x216x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x216x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x216x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_216,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<216, 32>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x216x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x216x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x216x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_216,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<216, 32>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x216x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x216x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x216x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_216,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<216, 32>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x216x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x216x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x216x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_216,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<216, 32>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x224x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x224x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x224x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<224, 32>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x224x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x224x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x224x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<224, 32>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x224x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x224x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x224x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<224, 32>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x224x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x224x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x224x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<224, 32>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x232x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x232x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x232x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_232,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<232, 32>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x232x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x232x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x232x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_232,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<232, 32>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x232x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x232x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x232x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_232,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<232, 32>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x232x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x232x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x232x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_232,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<232, 32>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x240x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x240x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x240x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<240, 32>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x240x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x240x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x240x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<240, 32>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x240x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x240x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x240x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<240, 32>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x240x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x240x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x240x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<240, 32>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x248x32_F16E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x248x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x248x32_F16E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_248,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<248, 32>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x248x32_F16E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x248x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x248x32_F16E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_248,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<248, 32>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x248x32_F32E4M3E5M2_SS_TN = SM90::GMMA::MMA_64x248x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x248x32_F32E4M3E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_248,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<248, 32>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x248x32_F32E4M3E5M2_RS_TN = SM90::GMMA::MMA_64x248x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x248x32_F32E4M3E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e4m3_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_248,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<248, 32>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x24x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x24x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x24x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 24, 32>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x24x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x24x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x24x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 24, 32>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x24x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x24x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x24x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 24, 32>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x24x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x24x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x24x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 24, 32>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x40x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x40x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x40x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_40,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 40, 32>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x40x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x40x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x40x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_40,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 40, 32>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x40x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x40x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x40x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_40,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 40, 32>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x40x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x40x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x40x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_40,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 40, 32>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x48x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x48x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x48x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 48, 32>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x48x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x48x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x48x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 48, 32>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x48x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x48x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x48x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 48, 32>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x48x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x48x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x48x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 48, 32>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x56x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x56x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x56x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_56,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 56, 32>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x56x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x56x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x56x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_56,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 56, 32>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x56x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x56x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x56x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_56,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 56, 32>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x56x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x56x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x56x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_56,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 56, 32>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x72x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x72x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x72x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_72,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 72, 32>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x72x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x72x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x72x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_72,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 72, 32>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x72x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x72x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x72x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_72,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 72, 32>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x72x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x72x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x72x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_72,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 72, 32>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x80x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x80x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x80x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 80, 32>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x80x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x80x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x80x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 80, 32>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x80x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x80x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x80x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 80, 32>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x80x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x80x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x80x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 80, 32>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x88x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x88x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x88x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_88,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 88, 32>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x88x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x88x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x88x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_88,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 88, 32>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x88x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x88x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x88x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_88,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 88, 32>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x88x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x88x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x88x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_88,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 88, 32>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x104x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x104x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x104x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_104,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<104, 32>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x104x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x104x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x104x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_104,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<104, 32>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x104x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x104x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x104x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_104,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<104, 32>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x104x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x104x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x104x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_104,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<104, 32>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x112x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x112x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x112x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<112, 32>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x112x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x112x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x112x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<112, 32>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x112x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x112x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x112x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<112, 32>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x112x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x112x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x112x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<112, 32>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x120x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x120x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x120x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_120,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<120, 32>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x120x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x120x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x120x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_120,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<120, 32>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x120x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x120x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x120x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_120,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<120, 32>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x120x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x120x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x120x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_120,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<120, 32>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x136x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x136x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x136x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_136,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<136, 32>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x136x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x136x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x136x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_136,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<136, 32>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x136x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x136x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x136x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_136,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<136, 32>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x136x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x136x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x136x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_136,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<136, 32>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x144x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x144x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x144x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<144, 32>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x144x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x144x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x144x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<144, 32>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x144x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x144x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x144x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<144, 32>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x144x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x144x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x144x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<144, 32>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x152x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x152x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x152x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_152,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<152, 32>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x152x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x152x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x152x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_152,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<152, 32>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x152x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x152x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x152x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_152,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<152, 32>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x152x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x152x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x152x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_152,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<152, 32>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x160x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x160x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x160x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<160, 32>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x160x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x160x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x160x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<160, 32>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x160x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x160x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x160x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<160, 32>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x160x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x160x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x160x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<160, 32>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x168x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x168x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x168x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_168,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<168, 32>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x168x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x168x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x168x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_168,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<168, 32>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x168x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x168x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x168x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_168,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<168, 32>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x168x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x168x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x168x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_168,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<168, 32>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x176x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x176x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x176x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<176, 32>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x176x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x176x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x176x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<176, 32>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x176x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x176x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x176x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<176, 32>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x176x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x176x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x176x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<176, 32>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x184x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x184x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x184x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_184,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<184, 32>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x184x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x184x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x184x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_184,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<184, 32>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x184x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x184x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x184x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_184,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<184, 32>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x184x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x184x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x184x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_184,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<184, 32>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x200x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x200x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x200x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_200,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<200, 32>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x200x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x200x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x200x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_200,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<200, 32>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x200x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x200x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x200x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_200,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<200, 32>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x200x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x200x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x200x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_200,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<200, 32>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x208x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x208x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x208x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<208, 32>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x208x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x208x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x208x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<208, 32>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x208x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x208x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x208x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<208, 32>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x208x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x208x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x208x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<208, 32>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x216x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x216x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x216x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_216,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<216, 32>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x216x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x216x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x216x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_216,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<216, 32>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x216x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x216x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x216x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_216,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<216, 32>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x216x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x216x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x216x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_216,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<216, 32>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x224x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x224x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x224x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<224, 32>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x224x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x224x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x224x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<224, 32>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x224x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x224x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x224x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<224, 32>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x224x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x224x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x224x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<224, 32>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x232x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x232x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x232x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_232,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<232, 32>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x232x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x232x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x232x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_232,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<232, 32>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x232x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x232x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x232x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_232,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<232, 32>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x232x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x232x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x232x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_232,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<232, 32>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x240x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x240x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x240x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<240, 32>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x240x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x240x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x240x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<240, 32>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x240x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x240x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x240x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<240, 32>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x240x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x240x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x240x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<240, 32>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x248x32_F16E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x248x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x248x32_F16E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_248,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<248, 32>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x248x32_F16E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x248x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x248x32_F16E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_248,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<248, 32>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x248x32_F32E5M2E4M3_SS_TN = SM90::GMMA::MMA_64x248x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x248x32_F32E5M2E4M3_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_248,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<248, 32>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x248x32_F32E5M2E4M3_RS_TN = SM90::GMMA::MMA_64x248x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x248x32_F32E5M2E4M3_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_248,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<248, 32>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x24x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x24x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x24x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 24, 32>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x24x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x24x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x24x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 24, 32>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x24x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x24x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x24x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 24, 32>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x24x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x24x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x24x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 24, 32>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x40x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x40x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x40x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_40,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 40, 32>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x40x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x40x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x40x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_40,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 40, 32>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x40x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x40x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x40x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_40,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 40, 32>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x40x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x40x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x40x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_40,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 40, 32>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x48x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x48x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x48x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 48, 32>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x48x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x48x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x48x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 48, 32>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x48x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x48x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x48x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 48, 32>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x48x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x48x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x48x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 48, 32>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x56x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x56x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x56x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_56,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 56, 32>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x56x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x56x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x56x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_56,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 56, 32>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x56x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x56x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x56x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_56,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 56, 32>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x56x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x56x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x56x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_56,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 56, 32>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x72x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x72x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x72x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_72,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 72, 32>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x72x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x72x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x72x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_72,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 72, 32>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x72x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x72x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x72x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_72,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 72, 32>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x72x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x72x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x72x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_72,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 72, 32>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x80x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x80x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x80x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 80, 32>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x80x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x80x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x80x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 80, 32>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x80x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x80x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x80x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 80, 32>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x80x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x80x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x80x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 80, 32>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x88x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x88x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x88x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_88,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 88, 32>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x88x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x88x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x88x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_88,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 88, 32>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x88x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x88x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x88x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_88,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout< 88, 32>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x88x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x88x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x88x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_88,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout< 88, 32>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x104x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x104x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x104x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_104,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<104, 32>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x104x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x104x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x104x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_104,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<104, 32>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x104x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x104x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x104x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_104,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<104, 32>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x104x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x104x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x104x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_104,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<104, 32>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x112x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x112x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x112x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<112, 32>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x112x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x112x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x112x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<112, 32>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x112x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x112x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x112x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<112, 32>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x112x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x112x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x112x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<112, 32>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x120x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x120x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x120x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_120,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<120, 32>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x120x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x120x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x120x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_120,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<120, 32>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x120x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x120x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x120x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_120,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<120, 32>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x120x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x120x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x120x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_120,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<120, 32>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x136x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x136x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x136x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_136,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<136, 32>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x136x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x136x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x136x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_136,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<136, 32>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x136x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x136x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x136x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_136,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<136, 32>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x136x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x136x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x136x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_136,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<136, 32>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x144x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x144x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x144x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<144, 32>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x144x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x144x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x144x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<144, 32>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x144x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x144x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x144x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<144, 32>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x144x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x144x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x144x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<144, 32>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x152x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x152x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x152x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_152,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<152, 32>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x152x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x152x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x152x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_152,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<152, 32>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x152x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x152x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x152x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_152,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<152, 32>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x152x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x152x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x152x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_152,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<152, 32>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x160x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x160x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x160x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<160, 32>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x160x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x160x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x160x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<160, 32>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x160x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x160x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x160x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<160, 32>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x160x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x160x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x160x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<160, 32>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x168x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x168x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x168x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_168,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<168, 32>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x168x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x168x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x168x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_168,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<168, 32>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x168x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x168x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x168x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_168,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<168, 32>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x168x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x168x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x168x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_168,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<168, 32>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x176x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x176x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x176x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<176, 32>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x176x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x176x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x176x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<176, 32>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x176x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x176x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x176x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<176, 32>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x176x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x176x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x176x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<176, 32>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x184x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x184x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x184x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_184,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<184, 32>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x184x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x184x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x184x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_184,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<184, 32>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x184x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x184x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x184x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_184,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<184, 32>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x184x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x184x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x184x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_184,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<184, 32>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x200x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x200x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x200x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_200,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<200, 32>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x200x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x200x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x200x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_200,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<200, 32>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x200x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x200x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x200x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_200,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<200, 32>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x200x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x200x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x200x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_200,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<200, 32>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x208x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x208x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x208x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<208, 32>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x208x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x208x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x208x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<208, 32>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x208x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x208x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x208x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<208, 32>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x208x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x208x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x208x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<208, 32>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x216x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x216x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x216x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_216,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<216, 32>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x216x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x216x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x216x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_216,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<216, 32>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x216x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x216x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x216x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_216,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<216, 32>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x216x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x216x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x216x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_216,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<216, 32>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x224x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x224x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x224x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<224, 32>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x224x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x224x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x224x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<224, 32>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x224x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x224x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x224x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<224, 32>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x224x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x224x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x224x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<224, 32>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x232x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x232x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x232x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_232,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<232, 32>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x232x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x232x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x232x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_232,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<232, 32>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x232x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x232x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x232x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_232,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<232, 32>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x232x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x232x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x232x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_232,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<232, 32>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x240x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x240x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x240x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<240, 32>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x240x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x240x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x240x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<240, 32>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x240x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x240x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x240x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<240, 32>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x240x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x240x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x240x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<240, 32>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x248x32_F16E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x248x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x248x32_F16E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_248,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<248, 32>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x248x32_F16E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x248x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x248x32_F16E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_248,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<248, 32>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x248x32_F32E5M2E5M2_SS_TN = SM90::GMMA::MMA_64x248x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>;
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x248x32_F32E5M2E5M2_SS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_248,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using BLayout = GMMA::ABLayout<248, 32>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  GMMA::ScaleIn  scaleA = GMMA::ScaleIn::One,
-  GMMA::ScaleIn  scaleB = GMMA::ScaleIn::One
->
-using SM90_64x248x32_F32E5M2E5M2_RS_TN = SM90::GMMA::MMA_64x248x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>; 
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB>
-struct MMA_Traits<SM90_64x248x32_F32E5M2E5M2_RS_TN<scaleA, scaleB>>
-{
-  using ValTypeD = float;
-  using ValTypeA = float_e5m2_t;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_248,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using BLayout = GMMA::ABLayout<248, 32>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90_gmma_sparse.hpp b/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90_gmma_sparse.hpp
deleted file mode 100755
index 27c41ad33..000000000
--- a/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90_gmma_sparse.hpp
+++ /dev/null
@@ -1,7738 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#pragma once
-
-#include <cute/pointer_sparse.hpp>             // cute::smem_sparse_ptr_flag
-#include <cute/swizzle.hpp>                    // cute::Swizzle
-#include <cute/tensor_impl.hpp>                // cute::Tensor
-#include <cute/arch/mma_sm90_desc.hpp>         // cute::LayoutType
-#include <cute/arch/mma_sm90_gmma_sparse.hpp>  // cute::SM90::SPARSE::GMMA_64x8x32_F16F16F16_SS, etc
-#include <cute/atom/mma_traits_sm90_gmma.hpp>  // cute::GMMA::Layout_*
-#include <cute/atom/mma_traits.hpp>            // cute::MMA_Traits
-#include <cute/layout_composed.hpp>            // cute::ComposedLayout
-#include <cute/numeric/integral_constant.hpp>  // cute::is_static
-
-namespace cute {
-
-namespace SM90::GMMA {
-
-///////////////////////////////////////////
-// Common layouts for GMMA Shared Memory //
-///////////////////////////////////////////
-
-// M|N-major layouts in units of Type and sparsity factor S
-template <class Type, int S>
-using Layout_MN_INTER_SpAtom = ComposedLayout<Swizzle<0,4,3>, smem_sparse_ptr_flag_bits<S,sizeof_bits_v<Type>>,
-                                              decltype(blocked_product(Layout<Shape<_1,Int<S>>>{}, Layout_MN_INTER_Atom<Type>{}.layout_b()))>;
-template <class Type, int S>
-using Layout_MN_SW32_SpAtom  = ComposedLayout<Swizzle<1,4,3>, smem_sparse_ptr_flag_bits<S,sizeof_bits_v<Type>>,
-                                              decltype(blocked_product(Layout<Shape<_1,Int<S>>>{}, Layout_MN_SW32_Atom<Type>{}.layout_b()))>;
-template <class Type, int S>
-using Layout_MN_SW64_SpAtom  = ComposedLayout<Swizzle<2,4,3>, smem_sparse_ptr_flag_bits<S,sizeof_bits_v<Type>>,
-                                              decltype(blocked_product(Layout<Shape<_1,Int<S>>>{}, Layout_MN_SW64_Atom<Type>{}.layout_b()))>;
-template <class Type, int S>
-using Layout_MN_SW128_SpAtom = ComposedLayout<Swizzle<3,4,3>, smem_sparse_ptr_flag_bits<S,sizeof_bits_v<Type>>,
-                                              decltype(blocked_product(Layout<Shape<_1,Int<S>>>{}, Layout_MN_SW128_Atom<Type>{}.layout_b()))>;
-
-// K-major layouts in units of Type and sparsity factor S
-template <class Type, int S>
-using Layout_K_INTER_SpAtom = ComposedLayout<Swizzle<0,4,3>, smem_sparse_ptr_flag_bits<S,sizeof_bits_v<Type>>,
-                                              decltype(blocked_product(Layout<Shape<_1,Int<S>>>{}, Layout_K_INTER_Atom<Type>{}.layout_b()))>;
-template <class Type, int S>
-using Layout_K_SW32_SpAtom  = ComposedLayout<Swizzle<1,4,3>, smem_sparse_ptr_flag_bits<S,sizeof_bits_v<Type>>,
-                                              decltype(blocked_product(Layout<Shape<_1,Int<S>>>{}, Layout_K_SW32_Atom<Type>{}.layout_b()))>;
-template <class Type, int S>
-using Layout_K_SW64_SpAtom  = ComposedLayout<Swizzle<2,4,3>, smem_sparse_ptr_flag_bits<S,sizeof_bits_v<Type>>,
-                                              decltype(blocked_product(Layout<Shape<_1,Int<S>>>{}, Layout_K_SW64_Atom<Type>{}.layout_b()))>;
-template <class Type, int S>
-using Layout_K_SW128_SpAtom = ComposedLayout<Swizzle<3,4,3>, smem_sparse_ptr_flag_bits<S,sizeof_bits_v<Type>>,
-                                              decltype(blocked_product(Layout<Shape<_1,Int<S>>>{}, Layout_K_SW128_Atom<Type>{}.layout_b()))>;
-
-// With GMMA::Major param
-template <class Type, int S, GMMA::Major tnsp>
-using Layout_INTER_SpAtom = typename conditional<tnsp == GMMA::Major::MN,
-                                                 Layout_MN_INTER_SpAtom<Type,S>,
-                                                 Layout_K_INTER_SpAtom<Type,S>>::type;
-template <class Type, int S, GMMA::Major tnsp>
-using Layout_SW32_SpAtom = typename conditional<tnsp == GMMA::Major::MN,
-                                                Layout_MN_SW32_SpAtom<Type,S>,
-                                                Layout_K_SW32_SpAtom<Type,S>>::type;
-template <class Type, int S, GMMA::Major tnsp>
-using Layout_SW64_SpAtom = typename conditional<tnsp == GMMA::Major::MN,
-                                                Layout_MN_SW64_SpAtom<Type,S>,
-                                                Layout_K_SW64_SpAtom<Type,S>>::type;
-template <class Type, int S, GMMA::Major tnsp>
-using Layout_SW128_SpAtom = typename conditional<tnsp == GMMA::Major::MN,
-                                                 Layout_MN_SW128_SpAtom<Type,S>,
-                                                 Layout_K_SW128_SpAtom<Type,S>>::type;
-
-///////////////////////////////////////////////////////////////////////////////
-// Higher level GMMA Descriptor utilities
-///////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major>
-struct sparse_smem_desc : DescriptorIterator {};
-
-} // end namespace SM90::GMMA
-
-// Customization point for creating a cute::GMMAsparse_smem_desc Tensor
-template <SM90::GMMA::Major MajorMode>
-struct MakeTensor<SM90::GMMA::sparse_smem_desc<MajorMode>>
-{
-  // Note that this is the exact same as cute::GMMAsmem_desc above, plus additional static checks.
-  template <class TEngine, class TLayout>
-  CUTE_HOST_DEVICE constexpr auto
-  operator()(Tensor<TEngine,TLayout> const& smem_tensor)
-  {
-    static_assert(is_smem<TEngine>::value, "Expected SMEM Tensor to construct a GMMA Desc Tensor");
-    static_assert(is_sparse<typename TEngine::value_type>::value, "Expected sparse value_type.");
-    static_assert(is_sparse_ptr<TEngine>::value, "Expected sparse iter.");
-    return make_tensor(SM90::GMMA::DescriptorIterator{SM90::GMMA::make_gmma_desc<MajorMode>(tensor<0>(smem_tensor))},
-                       replace<0>(recast<uint128_t const>(smem_tensor).layout(), Layout<_1,_0>{}));
-  }
-};
-
-///////////////////////////////////////////////////////////////////////////////
-//////////////////////////// MMA_TRAITS ///////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////
-
-namespace SM90::GMMA {
-
-// Metadata layouts
-using ELayout_64x64  = Layout<Shape <Shape <_2,   _2,_8, _4>, Shape <_32>>, 
-                              Stride<Stride<_8,_2048,_1,_16>, Stride<_64>>>;
-
-using ELayout_64x32  = Layout<Shape <Shape <   _2,_2,_8, _4>, Shape <_16,_2>>, 
-                              Stride<Stride<_1024,_0,_1,_16>, Stride<_64,_8>>>;
-
-using ELayout_64x16  = Layout<Shape <Shape <  _2,_2,_8, _4>, Shape < _8,_2>>, 
-                              Stride<Stride<_512,_0,_1,_16>, Stride<_64,_8>>>;
-
-} // namespace SM90::GMMA
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace SM90::GMMA::SPARSE {
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <class MMAOp,
-          class TD, class DLayout,
-          class TA, class ALayout,
-          class TB, class BLayout,
-          class TC, class CLayout>
-CUTE_HOST_DEVICE constexpr void
-mma_unpack(MMA_Traits<MMAOp>   const& traits,
-           Tensor<TD, DLayout>      & D,
-           Tensor<TA, ALayout> const& A_zipped,
-           Tensor<TB, BLayout> const& B,
-           Tensor<TC, CLayout> const& C)
-{
-  static_assert(is_rmem_v<TD>, "Expected registers in MMA_Atom::call");
-  static_assert(is_rmem_v<TA>, "Expected registers in MMA_Atom::call");
-  static_assert(is_rmem_v<TB>, "Expected registers in MMA_Atom::call");
-  static_assert(is_rmem_v<TC>, "Expected registers in MMA_Atom::call");
-
-  using DRegisters = typename MMAOp::DRegisters;
-  using ARegisters = typename MMAOp::ARegisters;
-  using ERegisters = typename MMAOp::ERegisters;
-  using BRegisters = typename MMAOp::BRegisters;
-  using CRegisters = typename MMAOp::CRegisters;
-
-  // Register value types from the MMAOp register arrays
-  using RegTypeD   = typename remove_extent<DRegisters>::type;
-  using RegTypeA   = typename remove_extent<ARegisters>::type;
-  using RegTypeE   = typename remove_extent<ERegisters>::type;
-  using RegTypeB   = typename remove_extent<BRegisters>::type;
-  using RegTypeC   = typename remove_extent<CRegisters>::type;
-
-  constexpr int RegNumA = extent<ARegisters>::value;
-  constexpr int RegNumE = extent<ERegisters>::value;
-  constexpr int RegNumB = extent<BRegisters>::value;
-  constexpr int RegNumC = extent<CRegisters>::value;
-
-  auto [A, E] = unzip_tensor(A_zipped);
-  Tensor rA   = recast<RegTypeA>(A);
-  Tensor rE   = recast<RegTypeE>(E);
-  Tensor rB   = recast<RegTypeB>(B);
-
-  CUTE_STATIC_ASSERT_V(size(rA) == Int<RegNumA>{});
-  CUTE_STATIC_ASSERT_V(size(rE) == Int<RegNumE>{});
-  CUTE_STATIC_ASSERT_V(size(rB) == Int<RegNumB>{});
-
-  static_assert(is_same<RegTypeD, void>::value, "GMMA DRegisters must have void type.");
-  static_assert(is_same<typename TD::value_type, typename TC::value_type>::value, "GMMA C and D value_type must match.");
-  static_assert(is_same<DLayout, CLayout>::value, "GMMA C and D layouts must match.");
-
-  Tensor rC = recast<RegTypeC>(D);  // NOTE: D and C are same, so use mutable D
-
-  CUTE_STATIC_ASSERT_V(size(rC) == Int<RegNumC>{});
-
-  detail::explode(MMAOp::fma,
-                  rA, make_int_sequence<RegNumA>{},
-                  rB, make_int_sequence<RegNumB>{},
-                  rC, make_int_sequence<RegNumC>{},
-                  rE, make_int_sequence<RegNumE>{},
-                  &(traits.accumulate_), seq<0>{});
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace SM90::SPARSE
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_8,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<  8, 32>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_8,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<  8, 32>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_16,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 16, 32>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_16,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 16, 32>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_32,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 32, 32>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_32,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 32, 32>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_64,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 64, 32>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_64,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 64, 32>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_96,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 96, 32>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_96,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 96, 32>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_128,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<128, 32>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_128,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<128, 32>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_192,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<192, 32>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_192,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<192, 32>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_256,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<256, 32>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_256,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<256, 32>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_8,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<  8, 32>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_8,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<  8, 32>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_16,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 16, 32>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_16,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 16, 32>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_32,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 32, 32>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_32,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 32, 32>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_64,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 64, 32>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_64,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 64, 32>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_96,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 96, 32>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_96,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 96, 32>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_128,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<128, 32>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_128,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<128, 32>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_192,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<192, 32>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_192,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<192, 32>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_256,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<256, 32>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_256,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<256, 32>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_8,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<  8, 32>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_8,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<  8, 32>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_16,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 16, 32>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_16,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 16, 32>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_32,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 32, 32>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_32,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 32, 32>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_64,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 64, 32>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_64,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 64, 32>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_96,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 96, 32>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_96,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 96, 32>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_128,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<128, 32>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_128,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<128, 32>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_192,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<192, 32>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_192,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<192, 32>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_256,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<256, 32>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_256,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<256, 32>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<  8, 16>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<  8, 16>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout< 16, 16>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout< 16, 16>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout< 32, 16>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout< 32, 16>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout< 64, 16>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout< 64, 16>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout< 96, 16>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout< 96, 16>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<128, 16>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<128, 16>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<192, 16>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<192, 16>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<256, 16>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<256, 16>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8S8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<  8, 64>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8S8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<  8, 64>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8S8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 16, 64>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8S8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 16, 64>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8S8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 32, 64>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8S8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 32, 64>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8S8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 64, 64>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8S8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 64, 64>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8S8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 96, 64>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8S8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 96, 64>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8S8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<128, 64>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8S8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<128, 64>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8S8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<192, 64>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8S8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<192, 64>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8S8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<256, 64>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8S8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<256, 64>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8S8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<  8, 64>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8S8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<  8, 64>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8S8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 16, 64>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8S8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 16, 64>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8S8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 32, 64>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8S8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 32, 64>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8S8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 64, 64>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8S8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 64, 64>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8S8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 96, 64>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8S8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 96, 64>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8S8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<128, 64>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8S8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<128, 64>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8S8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<192, 64>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8S8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<192, 64>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8S8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<256, 64>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8S8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<256, 64>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8U8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<  8, 64>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8U8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<  8, 64>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8U8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 16, 64>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8U8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 16, 64>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8U8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 32, 64>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8U8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 32, 64>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8U8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 64, 64>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8U8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 64, 64>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8U8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 96, 64>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8U8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 96, 64>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8U8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<128, 64>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8U8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<128, 64>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8U8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<192, 64>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8U8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<192, 64>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8U8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<256, 64>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8U8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<256, 64>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8U8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<  8, 64>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_S32S8U8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<  8, 64>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8U8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 16, 64>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_S32S8U8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 16, 64>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8U8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 32, 64>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_S32S8U8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 32, 64>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8U8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 64, 64>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_S32S8U8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 64, 64>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8U8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 96, 64>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_S32S8U8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 96, 64>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8U8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<128, 64>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_S32S8U8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<128, 64>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8U8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<192, 64>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_S32S8U8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<192, 64>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8U8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<256, 64>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_S32S8U8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<256, 64>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8S8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<  8, 64>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8S8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<  8, 64>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8S8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 16, 64>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8S8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 16, 64>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8S8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 32, 64>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8S8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 32, 64>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8S8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 64, 64>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8S8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 64, 64>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8S8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 96, 64>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8S8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 96, 64>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8S8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<128, 64>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8S8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<128, 64>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8S8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<192, 64>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8S8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<192, 64>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8S8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<256, 64>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8S8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<256, 64>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8S8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<  8, 64>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8S8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<  8, 64>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8S8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 16, 64>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8S8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 16, 64>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8S8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 32, 64>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8S8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 32, 64>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8S8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 64, 64>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8S8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 64, 64>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8S8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 96, 64>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8S8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 96, 64>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8S8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<128, 64>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8S8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<128, 64>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8S8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<192, 64>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8S8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<192, 64>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8S8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<256, 64>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8S8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<256, 64>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8U8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<  8, 64>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8U8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<  8, 64>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8U8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 16, 64>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8U8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 16, 64>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8U8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 32, 64>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8U8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 32, 64>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8U8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 64, 64>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8U8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 64, 64>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8U8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 96, 64>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8U8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 96, 64>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8U8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<128, 64>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8U8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<128, 64>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8U8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<192, 64>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8U8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<192, 64>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8U8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<256, 64>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8U8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<256, 64>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8U8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<  8, 64>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_S32U8U8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<  8, 64>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8U8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 16, 64>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_S32U8U8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 16, 64>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8U8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 32, 64>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_S32U8U8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 32, 64>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8U8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 64, 64>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_S32U8U8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 64, 64>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8U8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 96, 64>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_S32U8U8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 96, 64>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8U8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<128, 64>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_S32U8U8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<128, 64>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8U8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<192, 64>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_S32U8U8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<192, 64>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8U8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<256, 64>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_S32U8U8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<256, 64>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<  8, 64>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<  8, 64>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<  8, 64>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<  8, 64>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 16, 64>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 16, 64>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 16, 64>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 16, 64>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 32, 64>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 32, 64>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 32, 64>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 32, 64>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 64, 64>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 64, 64>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 64, 64>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 64, 64>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 96, 64>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 96, 64>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 96, 64>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 96, 64>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<128, 64>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<128, 64>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<128, 64>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<128, 64>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<192, 64>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<192, 64>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<192, 64>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<192, 64>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<256, 64>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<256, 64>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<256, 64>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<256, 64>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<  8, 64>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<  8, 64>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<  8, 64>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<  8, 64>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 16, 64>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 16, 64>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 16, 64>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 16, 64>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 32, 64>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 32, 64>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 32, 64>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 32, 64>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 64, 64>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 64, 64>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 64, 64>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 64, 64>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 96, 64>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 96, 64>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 96, 64>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 96, 64>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<128, 64>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<128, 64>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<128, 64>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<128, 64>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<192, 64>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<192, 64>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<192, 64>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<192, 64>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<256, 64>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<256, 64>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<256, 64>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<256, 64>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<  8, 64>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<  8, 64>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<  8, 64>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<  8, 64>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 16, 64>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 16, 64>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 16, 64>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 16, 64>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 32, 64>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 32, 64>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 32, 64>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 32, 64>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 64, 64>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 64, 64>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 64, 64>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 64, 64>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 96, 64>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 96, 64>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 96, 64>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 96, 64>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<128, 64>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<128, 64>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<128, 64>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<128, 64>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<192, 64>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<192, 64>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<192, 64>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<192, 64>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<256, 64>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<256, 64>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<256, 64>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<256, 64>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<  8, 64>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<  8, 64>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<  8, 64>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x8x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_8,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<  8, 64>;
-  using CLayout = GMMA::CLayout_64x8;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 16, 64>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 16, 64>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 16, 64>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x16x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_16,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 16, 64>;
-  using CLayout = GMMA::CLayout_64x16;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 32, 64>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 32, 64>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 32, 64>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x32x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_32,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 32, 64>;
-  using CLayout = GMMA::CLayout_64x32;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 64, 64>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 64, 64>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 64, 64>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x64x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_64,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 64, 64>;
-  using CLayout = GMMA::CLayout_64x64;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 96, 64>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 96, 64>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 96, 64>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x96x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_96,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 96, 64>;
-  using CLayout = GMMA::CLayout_64x96;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<128, 64>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<128, 64>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<128, 64>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x128x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_128,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<128, 64>;
-  using CLayout = GMMA::CLayout_64x128;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<192, 64>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<192, 64>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<192, 64>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x192x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_192,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<192, 64>;
-  using CLayout = GMMA::CLayout_64x192;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<256, 64>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<256, 64>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<256, 64>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x256x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_256,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<256, 64>;
-  using CLayout = GMMA::CLayout_64x256;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // end namespace cute
-
-#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
-#include "mma_traits_sm90_gmma_sparse_ext.hpp"
-#endif
\ No newline at end of file
diff --git a/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90_gmma_sparse_ext.hpp b/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90_gmma_sparse_ext.hpp
deleted file mode 100755
index 3680b7e13..000000000
--- a/lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90_gmma_sparse_ext.hpp
+++ /dev/null
@@ -1,17335 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
- 
-#pragma once
-  
-#include <cute/arch/mma_sm90.hpp>
-#include <cute/atom/mma_traits.hpp>
-
-namespace cute {
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_24,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 24, 32>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_24,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 24, 32>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_40,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 40, 32>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_40,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 40, 32>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_48,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 48, 32>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_48,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 48, 32>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_56,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 56, 32>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_56,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 56, 32>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_72,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 72, 32>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_72,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 72, 32>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_80,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 80, 32>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_80,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 80, 32>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_88,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 88, 32>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_88,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 88, 32>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_104,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<104, 32>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_104,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<104, 32>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_112,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<112, 32>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_112,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<112, 32>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_120,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<120, 32>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_120,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<120, 32>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_136,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<136, 32>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_136,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<136, 32>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_144,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<144, 32>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_144,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<144, 32>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_152,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<152, 32>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_152,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<152, 32>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_160,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<160, 32>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_160,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<160, 32>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_168,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<168, 32>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_168,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<168, 32>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_176,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<176, 32>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_176,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<176, 32>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_184,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<184, 32>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_184,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<184, 32>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_200,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<200, 32>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_200,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<200, 32>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_208,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<208, 32>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_208,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<208, 32>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_216,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<216, 32>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_216,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<216, 32>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_224,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<224, 32>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_224,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<224, 32>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_232,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<232, 32>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_232,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<232, 32>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_240,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<240, 32>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_240,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<240, 32>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x32_F16F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_248,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<248, 32>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x32_F16F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_248,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<248, 32>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_24,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 24, 32>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_24,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 24, 32>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_40,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 40, 32>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_40,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 40, 32>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_48,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 48, 32>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_48,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 48, 32>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_56,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 56, 32>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_56,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 56, 32>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_72,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 72, 32>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_72,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 72, 32>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_80,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 80, 32>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_80,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 80, 32>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_88,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 88, 32>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_88,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 88, 32>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_104,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<104, 32>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_104,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<104, 32>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_112,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<112, 32>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_112,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<112, 32>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_120,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<120, 32>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_120,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<120, 32>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_136,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<136, 32>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_136,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<136, 32>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_144,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<144, 32>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_144,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<144, 32>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_152,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<152, 32>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_152,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<152, 32>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_160,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<160, 32>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_160,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<160, 32>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_168,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<168, 32>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_168,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<168, 32>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_176,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<176, 32>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_176,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<176, 32>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_184,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<184, 32>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_184,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<184, 32>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_200,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<200, 32>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_200,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<200, 32>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_208,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<208, 32>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_208,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<208, 32>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_216,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<216, 32>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_216,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<216, 32>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_224,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<224, 32>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_224,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<224, 32>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_232,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<232, 32>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_232,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<232, 32>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_240,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<240, 32>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_240,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<240, 32>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x32_F32F16F16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_248,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<248, 32>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x32_F32F16F16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, half_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = half_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_248,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<248, 32>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_24,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 24, 32>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_24,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 24, 32>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_40,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 40, 32>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_40,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 40, 32>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_48,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 48, 32>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_48,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 48, 32>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_56,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 56, 32>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_56,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 56, 32>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_72,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 72, 32>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_72,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 72, 32>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_80,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 80, 32>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_80,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 80, 32>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_88,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 88, 32>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_88,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout< 88, 32>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_104,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<104, 32>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_104,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<104, 32>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_112,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<112, 32>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_112,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<112, 32>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_120,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<120, 32>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_120,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<120, 32>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_136,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<136, 32>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_136,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<136, 32>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_144,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<144, 32>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_144,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<144, 32>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_152,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<152, 32>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_152,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<152, 32>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_160,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<160, 32>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_160,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<160, 32>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_168,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<168, 32>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_168,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<168, 32>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_176,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<176, 32>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_176,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<176, 32>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_184,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<184, 32>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_184,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<184, 32>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_200,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<200, 32>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_200,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<200, 32>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_208,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<208, 32>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_208,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<208, 32>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_216,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<216, 32>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_216,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<216, 32>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_224,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<224, 32>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_224,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<224, 32>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_232,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<232, 32>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_232,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<232, 32>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_240,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<240, 32>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_240,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<240, 32>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x32_F32BF16BF16_SS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<tnspA>;
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_248,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 32>;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<248, 32>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::Major tnspA, GMMA::Major tnspB, GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x32_F32BF16BF16_RS<tnspA, tnspB, scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, bfloat16_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = bfloat16_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<tnspB>;
-
-  using Shape_MNK = Shape<_64,_248,_32>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x32;
-  using ELayout = GMMA::ELayout_64x32;
-  using BLayout = GMMA::ABLayout<248, 32>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout< 24, 16>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout< 24, 16>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_40,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout< 40, 16>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_40,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout< 40, 16>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout< 48, 16>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout< 48, 16>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_56,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout< 56, 16>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_56,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout< 56, 16>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_72,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout< 72, 16>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_72,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout< 72, 16>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout< 80, 16>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout< 80, 16>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_88,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout< 88, 16>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_88,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout< 88, 16>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_104,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<104, 16>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_104,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<104, 16>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<112, 16>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<112, 16>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_120,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<120, 16>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_120,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<120, 16>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_136,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<136, 16>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_136,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<136, 16>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<144, 16>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<144, 16>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_152,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<152, 16>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_152,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<152, 16>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<160, 16>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<160, 16>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_168,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<168, 16>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_168,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<168, 16>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<176, 16>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<176, 16>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_184,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<184, 16>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_184,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<184, 16>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_200,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<200, 16>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_200,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<200, 16>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<208, 16>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<208, 16>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_216,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<216, 16>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_216,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<216, 16>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<224, 16>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<224, 16>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_232,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<232, 16>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_232,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<232, 16>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<240, 16>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<240, 16>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x16_F32TF32TF32_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_248,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 16>;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<248, 16>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x16_F32TF32TF32_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, tfloat32_t>;
-  using ValTypeE = sparse_elem<4, uint8_t>;
-  using ValTypeB = tfloat32_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_248,_16>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x16;
-  using ELayout = GMMA::ELayout_64x16;
-  using BLayout = GMMA::ABLayout<248, 16>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8S8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 24, 64>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8S8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 24, 64>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8S8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 48, 64>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8S8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 48, 64>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8S8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 80, 64>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8S8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 80, 64>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8S8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<112, 64>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8S8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<112, 64>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8S8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<144, 64>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8S8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<144, 64>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8S8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<160, 64>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8S8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<160, 64>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8S8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<176, 64>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8S8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<176, 64>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8S8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<208, 64>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8S8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<208, 64>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8S8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<224, 64>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8S8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<224, 64>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8S8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<240, 64>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8S8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<240, 64>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8S8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 24, 64>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8S8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 24, 64>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8S8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 48, 64>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8S8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 48, 64>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8S8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 80, 64>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8S8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 80, 64>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8S8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<112, 64>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8S8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<112, 64>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8S8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<144, 64>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8S8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<144, 64>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8S8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<160, 64>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8S8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<160, 64>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8S8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<176, 64>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8S8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<176, 64>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8S8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<208, 64>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8S8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<208, 64>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8S8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<224, 64>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8S8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<224, 64>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8S8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<240, 64>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8S8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<240, 64>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8U8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 24, 64>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8U8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 24, 64>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8U8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 48, 64>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8U8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 48, 64>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8U8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 80, 64>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8U8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 80, 64>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8U8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<112, 64>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8U8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<112, 64>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8U8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<144, 64>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8U8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<144, 64>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8U8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<160, 64>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8U8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<160, 64>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8U8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<176, 64>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8U8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<176, 64>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8U8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<208, 64>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8U8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<208, 64>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8U8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<224, 64>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8U8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<224, 64>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8U8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<240, 64>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8U8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<240, 64>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8U8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 24, 64>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_S32S8U8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 24, 64>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8U8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 48, 64>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_S32S8U8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 48, 64>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8U8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 80, 64>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_S32S8U8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 80, 64>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8U8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<112, 64>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_S32S8U8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<112, 64>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8U8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<144, 64>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_S32S8U8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<144, 64>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8U8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<160, 64>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_S32S8U8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<160, 64>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8U8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<176, 64>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_S32S8U8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<176, 64>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8U8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<208, 64>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_S32S8U8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<208, 64>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8U8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<224, 64>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_S32S8U8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<224, 64>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8U8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<240, 64>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_S32S8U8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, int8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<240, 64>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8S8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 24, 64>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8S8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 24, 64>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8S8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 48, 64>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8S8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 48, 64>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8S8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 80, 64>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8S8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 80, 64>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8S8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<112, 64>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8S8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<112, 64>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8S8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<144, 64>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8S8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<144, 64>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8S8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<160, 64>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8S8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<160, 64>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8S8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<176, 64>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8S8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<176, 64>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8S8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<208, 64>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8S8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<208, 64>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8S8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<224, 64>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8S8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<224, 64>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8S8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<240, 64>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8S8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<240, 64>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8S8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 24, 64>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8S8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 24, 64>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8S8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 48, 64>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8S8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 48, 64>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8S8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 80, 64>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8S8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 80, 64>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8S8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<112, 64>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8S8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<112, 64>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8S8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<144, 64>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8S8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<144, 64>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8S8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<160, 64>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8S8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<160, 64>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8S8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<176, 64>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8S8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<176, 64>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8S8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<208, 64>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8S8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<208, 64>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8S8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<224, 64>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8S8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<224, 64>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8S8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<240, 64>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8S8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = int8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<240, 64>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8U8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 24, 64>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8U8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 24, 64>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8U8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 48, 64>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8U8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 48, 64>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8U8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 80, 64>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8U8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 80, 64>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8U8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<112, 64>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8U8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<112, 64>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8U8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<144, 64>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8U8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<144, 64>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8U8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<160, 64>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8U8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<160, 64>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8U8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<176, 64>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8U8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<176, 64>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8U8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<208, 64>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8U8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<208, 64>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8U8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<224, 64>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8U8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<224, 64>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8U8_SS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<240, 64>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8U8_SS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<240, 64>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8U8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 24, 64>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_S32U8U8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 24, 64>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8U8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 48, 64>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_S32U8U8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 48, 64>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8U8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 80, 64>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_S32U8U8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 80, 64>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8U8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<112, 64>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_S32U8U8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<112, 64>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8U8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<144, 64>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_S32U8U8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<144, 64>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8U8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<160, 64>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_S32U8U8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<160, 64>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8U8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<176, 64>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_S32U8U8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<176, 64>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8U8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<208, 64>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_S32U8U8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<208, 64>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8U8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<224, 64>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_S32U8U8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<224, 64>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8U8_RS_TN<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<240, 64>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_S32U8U8_RS_TN_SATURATE<spsel>>
-{
-  using ValTypeD = int32_t;
-  using ValTypeA = sparse_elem<2, uint8_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = uint8_t;
-  using ValTypeC = int32_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<240, 64>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 24, 64>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 24, 64>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 24, 64>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 24, 64>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_40,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 40, 64>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_40,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 40, 64>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_40,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 40, 64>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_40,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 40, 64>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 48, 64>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 48, 64>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 48, 64>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 48, 64>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_56,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 56, 64>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_56,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 56, 64>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_56,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 56, 64>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_56,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 56, 64>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_72,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 72, 64>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_72,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 72, 64>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_72,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 72, 64>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_72,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 72, 64>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 80, 64>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 80, 64>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 80, 64>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 80, 64>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_88,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 88, 64>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_88,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 88, 64>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_88,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 88, 64>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_88,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 88, 64>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_104,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<104, 64>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_104,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<104, 64>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_104,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<104, 64>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_104,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<104, 64>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<112, 64>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<112, 64>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<112, 64>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<112, 64>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_120,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<120, 64>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_120,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<120, 64>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_120,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<120, 64>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_120,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<120, 64>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_136,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<136, 64>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_136,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<136, 64>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_136,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<136, 64>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_136,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<136, 64>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<144, 64>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<144, 64>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<144, 64>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<144, 64>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_152,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<152, 64>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_152,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<152, 64>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_152,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<152, 64>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_152,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<152, 64>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<160, 64>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<160, 64>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<160, 64>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<160, 64>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_168,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<168, 64>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_168,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<168, 64>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_168,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<168, 64>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_168,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<168, 64>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<176, 64>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<176, 64>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<176, 64>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<176, 64>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_184,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<184, 64>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_184,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<184, 64>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_184,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<184, 64>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_184,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<184, 64>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_200,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<200, 64>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_200,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<200, 64>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_200,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<200, 64>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_200,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<200, 64>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<208, 64>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<208, 64>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<208, 64>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<208, 64>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_216,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<216, 64>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_216,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<216, 64>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_216,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<216, 64>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_216,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<216, 64>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<224, 64>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<224, 64>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<224, 64>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<224, 64>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_232,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<232, 64>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_232,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<232, 64>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_232,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<232, 64>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_232,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<232, 64>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<240, 64>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<240, 64>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<240, 64>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<240, 64>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x64_F16E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_248,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<248, 64>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x64_F16E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_248,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<248, 64>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x64_F32E4M3E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_248,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<248, 64>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x64_F32E4M3E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_248,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<248, 64>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 24, 64>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 24, 64>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 24, 64>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 24, 64>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_40,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 40, 64>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_40,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 40, 64>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_40,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 40, 64>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_40,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 40, 64>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 48, 64>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 48, 64>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 48, 64>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 48, 64>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_56,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 56, 64>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_56,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 56, 64>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_56,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 56, 64>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_56,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 56, 64>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_72,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 72, 64>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_72,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 72, 64>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_72,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 72, 64>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_72,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 72, 64>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 80, 64>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 80, 64>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 80, 64>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 80, 64>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_88,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 88, 64>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_88,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 88, 64>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_88,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 88, 64>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_88,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 88, 64>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_104,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<104, 64>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_104,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<104, 64>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_104,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<104, 64>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_104,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<104, 64>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<112, 64>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<112, 64>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<112, 64>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<112, 64>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_120,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<120, 64>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_120,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<120, 64>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_120,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<120, 64>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_120,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<120, 64>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_136,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<136, 64>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_136,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<136, 64>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_136,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<136, 64>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_136,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<136, 64>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<144, 64>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<144, 64>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<144, 64>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<144, 64>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_152,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<152, 64>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_152,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<152, 64>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_152,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<152, 64>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_152,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<152, 64>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<160, 64>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<160, 64>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<160, 64>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<160, 64>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_168,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<168, 64>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_168,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<168, 64>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_168,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<168, 64>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_168,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<168, 64>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<176, 64>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<176, 64>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<176, 64>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<176, 64>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_184,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<184, 64>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_184,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<184, 64>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_184,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<184, 64>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_184,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<184, 64>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_200,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<200, 64>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_200,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<200, 64>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_200,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<200, 64>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_200,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<200, 64>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<208, 64>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<208, 64>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<208, 64>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<208, 64>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_216,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<216, 64>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_216,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<216, 64>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_216,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<216, 64>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_216,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<216, 64>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<224, 64>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<224, 64>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<224, 64>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<224, 64>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_232,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<232, 64>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_232,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<232, 64>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_232,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<232, 64>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_232,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<232, 64>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<240, 64>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<240, 64>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<240, 64>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<240, 64>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x64_F16E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_248,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<248, 64>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x64_F16E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_248,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<248, 64>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x64_F32E4M3E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_248,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<248, 64>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x64_F32E4M3E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e4m3_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_248,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<248, 64>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 24, 64>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 24, 64>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 24, 64>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 24, 64>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_40,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 40, 64>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_40,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 40, 64>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_40,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 40, 64>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_40,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 40, 64>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 48, 64>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 48, 64>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 48, 64>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 48, 64>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_56,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 56, 64>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_56,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 56, 64>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_56,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 56, 64>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_56,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 56, 64>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_72,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 72, 64>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_72,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 72, 64>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_72,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 72, 64>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_72,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 72, 64>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 80, 64>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 80, 64>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 80, 64>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 80, 64>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_88,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 88, 64>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_88,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 88, 64>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_88,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 88, 64>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_88,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 88, 64>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_104,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<104, 64>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_104,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<104, 64>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_104,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<104, 64>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_104,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<104, 64>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<112, 64>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<112, 64>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<112, 64>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<112, 64>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_120,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<120, 64>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_120,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<120, 64>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_120,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<120, 64>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_120,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<120, 64>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_136,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<136, 64>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_136,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<136, 64>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_136,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<136, 64>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_136,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<136, 64>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<144, 64>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<144, 64>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<144, 64>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<144, 64>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_152,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<152, 64>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_152,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<152, 64>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_152,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<152, 64>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_152,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<152, 64>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<160, 64>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<160, 64>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<160, 64>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<160, 64>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_168,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<168, 64>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_168,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<168, 64>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_168,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<168, 64>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_168,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<168, 64>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<176, 64>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<176, 64>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<176, 64>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<176, 64>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_184,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<184, 64>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_184,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<184, 64>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_184,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<184, 64>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_184,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<184, 64>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_200,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<200, 64>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_200,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<200, 64>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_200,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<200, 64>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_200,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<200, 64>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<208, 64>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<208, 64>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<208, 64>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<208, 64>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_216,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<216, 64>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_216,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<216, 64>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_216,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<216, 64>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_216,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<216, 64>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<224, 64>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<224, 64>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<224, 64>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<224, 64>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_232,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<232, 64>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_232,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<232, 64>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_232,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<232, 64>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_232,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<232, 64>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<240, 64>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<240, 64>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<240, 64>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<240, 64>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x64_F16E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_248,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<248, 64>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x64_F16E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_248,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<248, 64>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x64_F32E5M2E4M3_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_248,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<248, 64>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x64_F32E5M2E4M3_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e4m3_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_248,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<248, 64>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 24, 64>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 24, 64>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 24, 64>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x24x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_24,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 24, 64>;
-  using CLayout = GMMA::CLayout_64x24;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_40,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 40, 64>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_40,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 40, 64>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_40,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 40, 64>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x40x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_40,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 40, 64>;
-  using CLayout = GMMA::CLayout_64x40;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 48, 64>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 48, 64>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 48, 64>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x48x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_48,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 48, 64>;
-  using CLayout = GMMA::CLayout_64x48;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_56,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 56, 64>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_56,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 56, 64>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_56,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 56, 64>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x56x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_56,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 56, 64>;
-  using CLayout = GMMA::CLayout_64x56;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_72,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 72, 64>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_72,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 72, 64>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_72,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 72, 64>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x72x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_72,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 72, 64>;
-  using CLayout = GMMA::CLayout_64x72;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 80, 64>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 80, 64>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 80, 64>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x80x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_80,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 80, 64>;
-  using CLayout = GMMA::CLayout_64x80;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_88,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 88, 64>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_88,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 88, 64>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_88,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 88, 64>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x88x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_88,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout< 88, 64>;
-  using CLayout = GMMA::CLayout_64x88;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_104,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<104, 64>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_104,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<104, 64>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_104,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<104, 64>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x104x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_104,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<104, 64>;
-  using CLayout = GMMA::CLayout_64x104;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<112, 64>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<112, 64>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<112, 64>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x112x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_112,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<112, 64>;
-  using CLayout = GMMA::CLayout_64x112;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_120,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<120, 64>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_120,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<120, 64>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_120,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<120, 64>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x120x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_120,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<120, 64>;
-  using CLayout = GMMA::CLayout_64x120;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_136,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<136, 64>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_136,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<136, 64>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_136,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<136, 64>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x136x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_136,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<136, 64>;
-  using CLayout = GMMA::CLayout_64x136;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<144, 64>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<144, 64>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<144, 64>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x144x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_144,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<144, 64>;
-  using CLayout = GMMA::CLayout_64x144;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_152,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<152, 64>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_152,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<152, 64>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_152,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<152, 64>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x152x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_152,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<152, 64>;
-  using CLayout = GMMA::CLayout_64x152;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<160, 64>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<160, 64>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<160, 64>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x160x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_160,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<160, 64>;
-  using CLayout = GMMA::CLayout_64x160;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_168,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<168, 64>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_168,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<168, 64>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_168,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<168, 64>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x168x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_168,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<168, 64>;
-  using CLayout = GMMA::CLayout_64x168;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<176, 64>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<176, 64>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<176, 64>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x176x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_176,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<176, 64>;
-  using CLayout = GMMA::CLayout_64x176;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_184,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<184, 64>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_184,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<184, 64>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_184,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<184, 64>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x184x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_184,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<184, 64>;
-  using CLayout = GMMA::CLayout_64x184;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_200,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<200, 64>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_200,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<200, 64>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_200,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<200, 64>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x200x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_200,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<200, 64>;
-  using CLayout = GMMA::CLayout_64x200;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<208, 64>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<208, 64>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<208, 64>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x208x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_208,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<208, 64>;
-  using CLayout = GMMA::CLayout_64x208;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_216,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<216, 64>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_216,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<216, 64>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_216,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<216, 64>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x216x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_216,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<216, 64>;
-  using CLayout = GMMA::CLayout_64x216;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<224, 64>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<224, 64>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<224, 64>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x224x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_224,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<224, 64>;
-  using CLayout = GMMA::CLayout_64x224;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_232,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<232, 64>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_232,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<232, 64>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_232,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<232, 64>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x232x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_232,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<232, 64>;
-  using CLayout = GMMA::CLayout_64x232;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<240, 64>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<240, 64>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<240, 64>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x240x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_240,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<240, 64>;
-  using CLayout = GMMA::CLayout_64x240;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x64_F16E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_248,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<248, 64>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x64_F16E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = half_t;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = half_t;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_248,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<248, 64>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x64_F32E5M2E5M2_SS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeA = GMMA::smem_desc<GMMA::Major::K>;
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_248,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ABLayout< 64, 64>;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<248, 64>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <GMMA::ScaleIn scaleA, GMMA::ScaleIn scaleB, GMMA::SparseSel spsel>
-struct MMA_Traits<SM90::GMMA::SPARSE::GMMA_64x248x64_F32E5M2E5M2_RS_TN<scaleA, scaleB, spsel>>
-{
-  using ValTypeD = float;
-  using ValTypeA = sparse_elem<2, float_e5m2_t>;
-  using ValTypeE = sparse_elem<8, uint8_t>;
-  using ValTypeB = float_e5m2_t;
-  using ValTypeC = float;
-
-  using FrgTypeB = GMMA::smem_desc<GMMA::Major::K>;
-
-  using Shape_MNK = Shape<_64,_248,_64>;
-  using ThrID   = Layout<_128>;
-  using ALayout = GMMA::ALayout_64x64;
-  using ELayout = GMMA::ELayout_64x64;
-  using BLayout = GMMA::ABLayout<248, 64>;
-  using CLayout = GMMA::CLayout_64x248;
-
-  GMMA::ScaleOut accumulate_ = GMMA::ScaleOut::One;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/config.hpp b/lightllm-kernel/cutlass/include/cute/config.hpp
deleted file mode 100755
index b5cfcf47d..000000000
--- a/lightllm-kernel/cutlass/include/cute/config.hpp
+++ /dev/null
@@ -1,149 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
-#  define CUTE_HOST_DEVICE __forceinline__ __host__ __device__
-#  define CUTE_DEVICE      __forceinline__          __device__
-#  define CUTE_HOST        __forceinline__ __host__
-#else
-#  define CUTE_HOST_DEVICE inline
-#  define CUTE_DEVICE      inline
-#  define CUTE_HOST        inline
-#endif // CUTE_HOST_DEVICE, CUTE_DEVICE
-
-#if defined(__CUDACC_RTC__)
-#  define CUTE_HOST_RTC CUTE_HOST_DEVICE
-#else
-#  define CUTE_HOST_RTC CUTE_HOST
-#endif
-
-#if !defined(__CUDACC_RTC__) && !defined(__clang__) && \
-  (defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA))
-#  define CUTE_UNROLL    #pragma unroll
-#  define CUTE_NO_UNROLL #pragma unroll 1
-#elif defined(__CUDACC_RTC__) || defined(__clang__)
-#  define CUTE_UNROLL    _Pragma("unroll")
-#  define CUTE_NO_UNROLL _Pragma("unroll 1")
-#else
-#  define CUTE_UNROLL
-#  define CUTE_NO_UNROLL
-#endif // CUTE_UNROLL
-
-#if defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA)
-#  define CUTE_INLINE_CONSTANT                 static const __device__
-#else
-#  define CUTE_INLINE_CONSTANT                 static constexpr
-#endif
-
-// __grid_constant__ was introduced in CUDA 11.7.
-#if ((__CUDACC_VER_MAJOR__ >= 12) || ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 7)))
-#  define CUTE_GRID_CONSTANT_SUPPORTED
-#endif
-
-// __grid_constant__ can be enabled only on SM70+.
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700))
-#  define CUTE_GRID_CONSTANT_ENABLED
-#endif
-
-#if ! defined(CUTE_GRID_CONSTANT)
-#  if defined(CUTE_GRID_CONSTANT_SUPPORTED) && defined(CUTE_GRID_CONSTANT_ENABLED)
-#    define CUTE_GRID_CONSTANT __grid_constant__
-#  else
-#    define CUTE_GRID_CONSTANT
-#  endif
-#endif
-
-// Some versions of GCC < 11 have trouble deducing that a
-// function with "auto" return type and all of its returns in an "if
-// constexpr ... else" statement must actually return.  Thus, GCC
-// emits spurious "missing return statement" build warnings.
-// Developers can suppress these warnings by using the
-// CUTE_GCC_UNREACHABLE macro, which must be followed by a semicolon.
-// It's harmless to use the macro for other GCC versions or other
-// compilers, but it has no effect.
-#if ! defined(CUTE_GCC_UNREACHABLE)
-#  if defined(__GNUC__)
-#    define CUTE_GCC_UNREACHABLE __builtin_unreachable()
-#  else
-#    define CUTE_GCC_UNREACHABLE
-#  endif
-#endif
-
-#if defined(_MSC_VER)
-// Provides support for alternative operators 'and', 'or', and 'not'
-#  include <iso646.h>
-#endif // _MSC_VER
-
-#if defined(__CUDACC_RTC__)
-#  define CUTE_STL_NAMESPACE cuda::std
-#  define CUTE_STL_NAMESPACE_IS_CUDA_STD
-#else
-#  define CUTE_STL_NAMESPACE std
-#endif
-
-//
-// Assertion helpers
-//
-
-#if defined(__CUDACC_RTC__)
-#  include <cuda/std/cassert>
-#else
-#  include <cassert>
-#endif
-
-#define CUTE_STATIC_V(x)            decltype(x)::value
-
-#define CUTE_STATIC_ASSERT          static_assert
-#define CUTE_STATIC_ASSERT_V(x,...) static_assert(decltype(x)::value, ##__VA_ARGS__)
-
-// Fail and print a message. Typically used for notification of a compiler misconfiguration.
-#if defined(__CUDA_ARCH__)
-#  define CUTE_INVALID_CONTROL_PATH(x) assert(0 && x); printf(x); __brkpt()
-#else
-#  define CUTE_INVALID_CONTROL_PATH(x) assert(0 && x); printf(x)
-#endif
-
-//
-// IO
-//
-
-#if !defined(__CUDACC_RTC__)
-#  include <cstdio>
-#  include <iostream>
-#  include <iomanip>
-#endif
-
-//
-// Debugging utilities
-//
-
-#include <cute/util/debug.hpp>
diff --git a/lightllm-kernel/cutlass/include/cute/container/alignment.hpp b/lightllm-kernel/cutlass/include/cute/container/alignment.hpp
deleted file mode 100755
index 52e4cbadd..000000000
--- a/lightllm-kernel/cutlass/include/cute/container/alignment.hpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>
-
-#include <cute/numeric/numeric_types.hpp>
-#include <cute/numeric/math.hpp>
-
-namespace cute
-{
-
-// Test if a pointer is aligned to N bytes
-template <int N>
-CUTE_HOST_DEVICE constexpr
-bool
-is_byte_aligned(void const* const ptr)
-{
-  static_assert(has_single_bit(N), "N must be a power of 2 in alignment check");
-  return (reinterpret_cast<uintptr_t>(ptr) & (N-1)) == 0;
-}
-
-#if defined(__CUDACC__)
-#  define CUTE_ALIGNAS(n) __align__(n)
-#else
-#  define CUTE_ALIGNAS(n) alignas(n)
-#endif
-
-template <size_t Alignment, class Child = void>
-struct aligned_struct {};
-
-template <class Child> struct CUTE_ALIGNAS(  1) aligned_struct<  1, Child> {};
-template <class Child> struct CUTE_ALIGNAS(  2) aligned_struct<  2, Child> {};
-template <class Child> struct CUTE_ALIGNAS(  4) aligned_struct<  4, Child> {};
-template <class Child> struct CUTE_ALIGNAS(  8) aligned_struct<  8, Child> {};
-template <class Child> struct CUTE_ALIGNAS( 16) aligned_struct< 16, Child> {};
-template <class Child> struct CUTE_ALIGNAS( 32) aligned_struct< 32, Child> {};
-template <class Child> struct CUTE_ALIGNAS( 64) aligned_struct< 64, Child> {};
-template <class Child> struct CUTE_ALIGNAS(128) aligned_struct<128, Child> {};
-template <class Child> struct CUTE_ALIGNAS(256) aligned_struct<256, Child> {};
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/container/array.hpp b/lightllm-kernel/cutlass/include/cute/container/array.hpp
deleted file mode 100755
index 9cdcf5f4c..000000000
--- a/lightllm-kernel/cutlass/include/cute/container/array.hpp
+++ /dev/null
@@ -1,492 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>
-
-#include <cute/numeric/integral_constant.hpp>
-#include <cute/util/type_traits.hpp>
-
-namespace cute
-{
-
-template <class T, size_t N>
-struct array
-{
-  using element_type = T;
-  using value_type = remove_cv_t<T>;
-  using size_type = size_t;
-  using difference_type = ptrdiff_t;
-  using reference = element_type&;
-  using const_reference = const element_type&;
-  using pointer = element_type*;
-  using const_pointer = const element_type*;
-  using iterator = pointer;
-  using const_iterator = const_pointer;
-
-  CUTE_HOST_DEVICE constexpr
-  reference operator[](size_type pos)
-  {
-    return begin()[pos];
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  const_reference operator[](size_type pos) const
-  {
-    return begin()[pos];
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  reference front()
-  {
-    return *begin();
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  const_reference front() const
-  {
-    return *begin();
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  reference back()
-  {
-    // return *rbegin();
-    return operator[](N-1);
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  const_reference back() const
-  {
-    // return *rbegin();
-    return operator[](N-1);
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  T* data()
-  {
-    return __elems_;
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  T const* data() const
-  {
-    return __elems_;
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  iterator begin()
-  {
-    return data();
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  const_iterator begin() const
-  {
-    return data();
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  const_iterator cbegin()
-  {
-    return begin();
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  const_iterator cbegin() const
-  {
-    return begin();
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  iterator end()
-  {
-    return data() + size();
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  const_iterator end() const
-  {
-    return data() + size();
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  const_iterator cend()
-  {
-    return end();
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  const_iterator cend() const
-  {
-    return end();
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  bool empty() const
-  {
-    return size() == 0;
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  size_type size() const
-  {
-    return N;
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  size_type max_size() const
-  {
-    return size();
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  void fill(const T& value)
-  {
-    for (auto& e : *this) {
-      e = value;
-    }
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  void clear()
-  {
-    fill(T(0));
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  void swap(array& other)
-  {
-    using CUTE_STL_NAMESPACE::swap;
-    for (size_type i = 0; i < size(); ++i) {
-      swap((*this)[i], other[i]);
-    }
-  }
-
-  element_type __elems_[N];
-};
-
-
-template <class T>
-struct array<T, 0>
-{
-  using element_type = T;
-  using value_type = remove_cv_t<T>;
-  using size_type = size_t;
-  using difference_type = ptrdiff_t;
-  using reference = element_type&;
-  using const_reference = const element_type&;
-  using pointer = element_type*;
-  using const_pointer = const element_type*;
-  using const_iterator = const_pointer;
-  using iterator = pointer;
-
-  CUTE_HOST_DEVICE constexpr
-  reference operator[](size_type pos)
-  {
-    return begin()[pos];
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  const_reference operator[](size_type pos) const
-  {
-    return begin()[pos];
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  reference front()
-  {
-    return *begin();
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  const_reference front() const
-  {
-    return *begin();
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  reference back()
-  {
-    return *begin();
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  const_reference back() const
-  {
-    return *begin();
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  T* data()
-  {
-    return nullptr;
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  T const* data() const
-  {
-    return nullptr;
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  iterator begin()
-  {
-    return nullptr;
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  const_iterator begin() const
-  {
-    return nullptr;
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  const_iterator cbegin()
-  {
-    return nullptr;
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  const_iterator cbegin() const
-  {
-    return nullptr;
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  iterator end()
-  {
-    return nullptr;
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  const_iterator end() const
-  {
-    return nullptr;
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  const_iterator cend()
-  {
-    return nullptr;
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  const_iterator cend() const
-  {
-    return nullptr;
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  bool empty() const
-  {
-    return true;
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  size_type size() const
-  {
-    return 0;
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  size_type max_size() const
-  {
-    return 0;
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  void fill(const T& value)
-  {}
-
-  CUTE_HOST_DEVICE constexpr
-  void clear()
-  {}
-
-  CUTE_HOST_DEVICE constexpr
-  void swap(array& other)
-  {}
-};
-
-template <class T, size_t N>
-CUTE_HOST_DEVICE constexpr
-bool operator==(array<T,N> const& lhs, array<T,N> const& rhs)
-{
-  for (size_t i = 0; i < N; ++i) {
-    if (lhs[i] != rhs[i]) {
-      return false;
-    }
-  }
-  return true;
-}
-
-template <class T, size_t N>
-CUTE_HOST_DEVICE constexpr
-void clear(array<T,N>& a)
-{
-  a.fill(T(0));
-}
-
-template <class T, size_t N>
-CUTE_HOST_DEVICE constexpr
-void fill(array<T,N>& a, T const& value)
-{
-  a.fill(value);
-}
-
-template <class T, size_t N>
-CUTE_HOST_DEVICE constexpr
-void swap(array<T,N>& a, array<T,N>& b)
-{
-  a.swap(b);
-}
-
-/// @return A cute::array of the elements of @c t in reverse order.
-template <class T, size_t N>
-CUTE_HOST_DEVICE constexpr
-cute::array<T,N> reverse(cute::array<T,N> const& t) 
-{
-  if constexpr (N == 0u) {
-    return t;
-  } else {
-    cute::array<T,N> t_r{};
-    for (size_t k = 0; k < N; ++k) {
-      t_r[k] = t[N - k - 1];
-    }
-    return t_r;
-  }
-}
-
-} // end cute
-
-
-//
-// Specialize tuple-related functionality for cute::array
-//
-
-#if defined(__CUDACC_RTC__)
-#include <cuda/std/tuple>
-#else
-#include <tuple>
-#endif
-
-namespace cute
-{
-
-template <size_t I, class T, size_t N>
-CUTE_HOST_DEVICE constexpr
-T& get(array<T,N>& a)
-{
-  static_assert(I < N, "Index out of range");
-  return a[I];
-}
-
-template <size_t I, class T, size_t N>
-CUTE_HOST_DEVICE constexpr
-T const& get(array<T,N> const& a)
-{
-  static_assert(I < N, "Index out of range");
-  return a[I];
-}
-
-template <size_t I, class T, size_t N>
-CUTE_HOST_DEVICE constexpr
-T&& get(array<T,N>&& a)
-{
-  static_assert(I < N, "Index out of range");
-  return cute::move(a[I]);
-}
-
-} // end namespace cute
-
-namespace CUTE_STL_NAMESPACE
-{
-
-template <class T, size_t N>
-struct tuple_size<cute::array<T,N>>
-    : CUTE_STL_NAMESPACE::integral_constant<size_t, N>
-{};
-
-template <size_t I, class T, size_t N>
-struct tuple_element<I, cute::array<T,N>>
-{
-  using type = T;
-};
-
-template <class T, size_t N>
-struct tuple_size<cute::array<T,N> const>
-    : CUTE_STL_NAMESPACE::integral_constant<size_t, N>
-{};
-
-template <size_t I, class T, size_t N>
-struct tuple_element<I, cute::array<T,N> const>
-{
-  using type = T;
-};
-
-} // end namespace CUTE_STL_NAMESPACE
-
-#ifdef CUTE_STL_NAMESPACE_IS_CUDA_STD
-namespace std
-{
-
-#if defined(__CUDACC_RTC__)
-template <class... _Tp>
-struct tuple_size;
-
-template <size_t _Ip, class... _Tp>
-struct tuple_element;
-#endif
-
-template <class T, size_t N>
-struct tuple_size<cute::array<T,N>>
-    : CUTE_STL_NAMESPACE::integral_constant<size_t, N>
-{};
-
-template <size_t I, class T, size_t N>
-struct tuple_element<I, cute::array<T,N>>
-{
-  using type = T;
-};
-
-template <class T, size_t N>
-struct tuple_size<cute::array<T,N> const>
-    : CUTE_STL_NAMESPACE::integral_constant<size_t, N>
-{};
-
-template <size_t I, class T, size_t N>
-struct tuple_element<I, cute::array<T,N> const>
-{
-  using type = T;
-};
-
-} // end namespace std
-#endif // CUTE_STL_NAMESPACE_IS_CUDA_STD
diff --git a/lightllm-kernel/cutlass/include/cute/container/array_aligned.hpp b/lightllm-kernel/cutlass/include/cute/container/array_aligned.hpp
deleted file mode 100755
index a9d14a1a2..000000000
--- a/lightllm-kernel/cutlass/include/cute/container/array_aligned.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/container/alignment.hpp>  // CUTE_ALIGNAS
-#include <cute/container/array.hpp>      // cute::array
-
-namespace cute
-{
-
-template <class T, size_t N, size_t Alignment = 16>
-struct CUTE_ALIGNAS(Alignment) array_aligned : cute::array<T,N> {};
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/container/array_subbyte.hpp b/lightllm-kernel/cutlass/include/cute/container/array_subbyte.hpp
deleted file mode 100755
index 57db56aba..000000000
--- a/lightllm-kernel/cutlass/include/cute/container/array_subbyte.hpp
+++ /dev/null
@@ -1,643 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Statically sized array of elements that accommodates subbyte trivial types
-           in a packed storage.
-*/
-
-#pragma once
-
-#include <cute/config.hpp>
-
-#include <cute/numeric/numeric_types.hpp>
-#include <cute/numeric/integral_constant.hpp>
-
-namespace cute
-{
-//
-// Underlying subbyte storage type
-//
-template <class T>
-using subbyte_storage_type_t = conditional_t<(cute::sizeof_bits_v<T> <=   8), uint8_t,
-                               conditional_t<(cute::sizeof_bits_v<T> <=  16), uint16_t,
-                               conditional_t<(cute::sizeof_bits_v<T> <=  32), uint32_t,
-                               conditional_t<(cute::sizeof_bits_v<T> <=  64), uint64_t,
-                               conditional_t<(cute::sizeof_bits_v<T> <= 128), uint128_t,
-                               T>>>>>;
-
-template <class T> struct subbyte_iterator;
-template <class, class> struct swizzle_ptr;
-
-//
-// subbyte_reference
-//   Proxy object for sub-byte element references
-//
-template <class T>
-struct subbyte_reference
-{
-  // Iterator Element type (const or non-const)
-  using element_type = T;
-  // Iterator Value type without type qualifier.
-  using value_type   = remove_cv_t<T>;
-  // Storage type (const or non-const)
-  using storage_type = conditional_t<(is_const_v<T>), subbyte_storage_type_t<T> const, subbyte_storage_type_t<T>>;
-
-  static_assert(sizeof_bits_v<storage_type> % 8 == 0, "Storage type is not supported");
-
-  static_assert(sizeof_bits_v<element_type> <= sizeof_bits_v<storage_type>,
-                "Size of Element must not be greater than Storage.");
-
-private:
-
-  // Bitmask for covering one item
-  static constexpr storage_type BitMask = storage_type(storage_type(-1) >> (sizeof_bits_v<storage_type> - sizeof_bits_v<element_type>));
-  // Flag for fast branching on straddled elements
-  static constexpr bool is_storage_unaligned = ((sizeof_bits_v<storage_type> % sizeof_bits_v<element_type>) != 0);
-
-  friend struct subbyte_iterator<T>;
-
-  // Pointer to storage element
-  storage_type* ptr_ = nullptr;
-
-  // Bit index of value_type starting position within storage_type element.
-  // RI: 0 <= idx_ < sizeof_bit<storage_type>
-  uint8_t idx_ = 0;
-
-  // Ctor
-  template <class PointerType>
-  CUTE_HOST_DEVICE constexpr
-  subbyte_reference(PointerType* ptr, uint8_t idx = 0) : ptr_(reinterpret_cast<storage_type*>(ptr)), idx_(idx) {}
-
-public:
-
-  // Copy Ctor
-  CUTE_HOST_DEVICE constexpr
-  subbyte_reference(subbyte_reference const& other) {
-    *this = element_type(other);
-  }
-
-  // Copy Assignment
-  CUTE_HOST_DEVICE constexpr
-  subbyte_reference& operator=(subbyte_reference const& other) {
-    return *this = element_type(other);
-  }
-
-  // Assignment
-  template <class T_ = element_type>
-  CUTE_HOST_DEVICE constexpr
-  enable_if_t<!is_const_v<T_>, subbyte_reference&> operator=(element_type x)
-  {
-    static_assert(is_same_v<T_, element_type>, "Do not specify template arguments!");
-    storage_type item = (reinterpret_cast<storage_type const&>(x) & BitMask);
-
-    // Update the current storage element
-    storage_type bit_mask_0 = storage_type(BitMask << idx_);
-    ptr_[0] = storage_type((ptr_[0] & ~bit_mask_0) | (item << idx_));
-
-    // If value_type is unaligned with storage_type (static) and this is a straddled value (dynamic)
-    if (is_storage_unaligned && idx_ + sizeof_bits_v<value_type> > sizeof_bits_v<storage_type>) {
-      uint8_t straddle_bits = uint8_t(sizeof_bits_v<storage_type> - idx_);
-      storage_type bit_mask_1 = storage_type(BitMask >> straddle_bits);
-      // Update the next storage element
-      ptr_[1] = storage_type((ptr_[1] & ~bit_mask_1) | (item >> straddle_bits));
-    }
-
-    return *this;
-  }
-
-  // Comparison of referenced values
-  CUTE_HOST_DEVICE constexpr friend
-  bool operator==(subbyte_reference const& x, subbyte_reference const& y) { return x.get() == y.get(); }
-  CUTE_HOST_DEVICE constexpr friend
-  bool operator!=(subbyte_reference const& x, subbyte_reference const& y) { return x.get() != y.get(); }
-  CUTE_HOST_DEVICE constexpr friend
-  bool operator< (subbyte_reference const& x, subbyte_reference const& y) { return x.get() <  y.get(); }
-  CUTE_HOST_DEVICE constexpr friend
-  bool operator> (subbyte_reference const& x, subbyte_reference const& y) { return x.get() >  y.get(); }
-  CUTE_HOST_DEVICE constexpr friend
-  bool operator<=(subbyte_reference const& x, subbyte_reference const& y) { return x.get() <= y.get(); }
-  CUTE_HOST_DEVICE constexpr friend
-  bool operator>=(subbyte_reference const& x, subbyte_reference const& y) { return x.get() >= y.get(); }
-
-  // Value
-  CUTE_HOST_DEVICE
-  element_type get() const
-  {
-    if constexpr (is_same_v<bool, value_type>) {      // Extract to bool -- potentially faster impl
-      return bool((*ptr_) & (BitMask << idx_));
-    } else {                                          // Extract to element_type
-      // Extract from the current storage element
-      auto item = storage_type((ptr_[0] >> idx_) & BitMask);
-
-      // If value_type is unaligned with storage_type (static) and this is a straddled value (dynamic)
-      if (is_storage_unaligned && idx_ + sizeof_bits_v<value_type> > sizeof_bits_v<storage_type>) {
-        uint8_t straddle_bits = uint8_t(sizeof_bits_v<storage_type> - idx_);
-        storage_type bit_mask_1 = storage_type(BitMask >> straddle_bits);
-        // Extract from the next storage element
-        item |= storage_type((ptr_[1] & bit_mask_1) << straddle_bits);
-      }
-
-      return reinterpret_cast<element_type&>(item);
-    }
-  }
-
-  // Extract to type element_type
-  CUTE_HOST_DEVICE constexpr
-  operator element_type() const {
-    return get();
-  }
-
-  // Address
-  CUTE_HOST_DEVICE
-  subbyte_iterator<T> operator&() const {
-    return {ptr_, idx_};
-  }
-};
-
-template <class T>
-CUTE_HOST_DEVICE
-void
-print(subbyte_reference<T> ref) {
-  cute::print(ref.get());
-}
-
-template <class T>
-CUTE_HOST_DEVICE
-void
-pretty_print(subbyte_reference<T> ref) {
-  cute::pretty_print(ref.get());
-}
-
-//
-// subbyte_iterator
-//   Random-access iterator over subbyte references
-//
-template <class T>
-struct subbyte_iterator
-{
-  // Iterator Element type (const or non-const)
-  using element_type = T;
-  // Iterator Value type without type qualifier.
-  using value_type   = remove_cv_t<T>;
-  // Storage type (const or non-const)
-  using storage_type = conditional_t<(is_const_v<T>), subbyte_storage_type_t<T> const, subbyte_storage_type_t<T>>;
-  // Reference proxy type
-  using reference = subbyte_reference<element_type>;
-
-  static_assert(sizeof_bits_v<storage_type> % 8 == 0, "Storage type is not supported");
-
-  static_assert(sizeof_bits_v<element_type> <= sizeof_bits_v<storage_type>,
-                "Size of Element must not be greater than Storage.");
-
-private:
-
-  template <class, class> friend struct swizzle_ptr;
-  template <class U> friend CUTE_HOST_DEVICE constexpr U* raw_pointer_cast(subbyte_iterator<U> const&);
-  template <class N, class U> friend CUTE_HOST_DEVICE constexpr auto recast_ptr(subbyte_iterator<U> const&);
-  template <class U> friend CUTE_HOST_DEVICE void print(subbyte_iterator<U> const&);
-
-  // Pointer to storage element
-  storage_type* ptr_;
-
-  // Bit index of value_type starting position within storage_type element.
-  // RI: 0 <= idx_ < sizeof_bit<storage_type>
-  uint8_t idx_;
-
-public:
-
-  // Default Ctor
-  CUTE_HOST_DEVICE constexpr
-  subbyte_iterator() : ptr_{nullptr}, idx_{0} {};
-
-  // Ctor
-  template <class PointerType>
-  CUTE_HOST_DEVICE constexpr
-  subbyte_iterator(PointerType* ptr, uint8_t idx = 0) : ptr_(reinterpret_cast<storage_type*>(ptr)), idx_(idx) { }
-
-  CUTE_HOST_DEVICE constexpr
-  reference operator*() const {
-    return reference(ptr_, idx_);
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  subbyte_iterator& operator+=(uint64_t k) {
-    k = sizeof_bits_v<value_type> * k + idx_;
-    ptr_ += k / sizeof_bits_v<storage_type>;
-    idx_  = k % sizeof_bits_v<storage_type>;
-    return *this;
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  subbyte_iterator operator+(uint64_t k) const {
-    return subbyte_iterator(ptr_, idx_) += k;
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  reference operator[](uint64_t k) const {
-    return *(*this + k);
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  subbyte_iterator& operator++() {
-    idx_ += sizeof_bits_v<value_type>;
-    if (idx_ >= sizeof_bits_v<storage_type>) {
-      ++ptr_;
-      idx_ -= sizeof_bits_v<storage_type>;
-    }
-    return *this;
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  subbyte_iterator operator++(int) {
-    subbyte_iterator ret(*this);
-    ++(*this);
-    return ret;
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  subbyte_iterator& operator--() {
-    if (idx_ >= sizeof_bits_v<value_type>) {
-      idx_ -= sizeof_bits_v<value_type>;
-    } else {
-      --ptr_;
-      idx_ += sizeof_bits_v<storage_type> - sizeof_bits_v<value_type>;
-    }
-    return *this;
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  subbyte_iterator operator--(int) {
-    subbyte_iterator ret(*this);
-    --(*this);
-    return ret;
-  }
-
-  CUTE_HOST_DEVICE constexpr friend
-  bool operator==(subbyte_iterator const& x, subbyte_iterator const& y) {
-    return x.ptr_ == y.ptr_ && x.idx_ == y.idx_;
-  }
-  CUTE_HOST_DEVICE constexpr friend
-  bool operator!=(subbyte_iterator const& x, subbyte_iterator const& y) { return !(x == y); }
-  CUTE_HOST_DEVICE constexpr friend
-  bool operator< (subbyte_iterator const& x, subbyte_iterator const& y) {
-    return x.ptr_ < y.ptr_ || (x.ptr_ == y.ptr_ && x.idx_ < y.idx_);
-  }
-  CUTE_HOST_DEVICE constexpr friend
-  bool operator<=(subbyte_iterator const& x, subbyte_iterator const& y) { return !(y <  x); }
-  CUTE_HOST_DEVICE constexpr friend
-  bool operator> (subbyte_iterator const& x, subbyte_iterator const& y) { return  (y <  x); }
-  CUTE_HOST_DEVICE constexpr friend
-  bool operator>=(subbyte_iterator const& x, subbyte_iterator const& y) { return !(x <  y); }
-};
-
-// Conversion to raw pointer with loss of subbyte index
-template <class T>
-CUTE_HOST_DEVICE constexpr
-T*
-raw_pointer_cast(subbyte_iterator<T> const& x) {
-  assert(x.idx_ == 0);
-  return reinterpret_cast<T*>(x.ptr_);
-}
-
-// Conversion to NewT_ with possible loss of subbyte index
-template <class NewT_, class T>
-CUTE_HOST_DEVICE constexpr
-auto
-recast_ptr(subbyte_iterator<T> const& x) {
-  using NewT = conditional_t<(is_const_v<T>), NewT_ const, NewT_>;
-  if constexpr (cute::is_subbyte_v<NewT>) {       // Making subbyte_iter, preserve the subbyte idx
-    return subbyte_iterator<NewT>(x.ptr_, x.idx_);
-  } else {                                       // Not subbyte, assume/assert subbyte idx 0
-    return reinterpret_cast<NewT*>(raw_pointer_cast(x));
-  }
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <class T>
-CUTE_HOST_DEVICE void
-print(subbyte_iterator<T> const& x) {
-  printf("subptr[%db](%p.%u)", int(sizeof_bits_v<T>), x.ptr_, x.idx_);
-}
-
-template <class T>
-CUTE_HOST_DEVICE void
-print(subbyte_reference<T> const& x) {
-  print(x.get());
-}
-//
-// array_subbyte
-//   Statically sized array for non-byte-aligned data types
-//
-template <class T, size_t N>
-struct array_subbyte
-{
-  using element_type    = T;
-  using value_type      = remove_cv_t<T>;
-  using pointer         = element_type*;
-  using const_pointer   = element_type const*;
-
-  using size_type       = size_t;
-  using difference_type = ptrdiff_t;
-
-  //
-  // References
-  //
-  using reference       = subbyte_reference<element_type>;
-  using const_reference = subbyte_reference<element_type const>;
-
-  //
-  // Iterators
-  //
-  using iterator        = subbyte_iterator<element_type>;
-  using const_iterator  = subbyte_iterator<element_type const>;
-
-  // Storage type (const or non-const)
-  using storage_type = conditional_t<(is_const_v<T>), subbyte_storage_type_t<T> const, subbyte_storage_type_t<T>>;
-
-  static_assert(sizeof_bits_v<storage_type> % 8 == 0, "Storage type is not supported");
-
-private:
-
-  // Number of storage elements, ceil_div
-  static constexpr size_type StorageElements = (N * sizeof_bits_v<value_type> + sizeof_bits_v<storage_type> - 1) / sizeof_bits_v<storage_type>;
-
-  // Internal storage
-  storage_type storage[StorageElements];
-
-public:
-
-  CUTE_HOST_DEVICE constexpr
-  size_type size() const {
-    return N;
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  size_type max_size() const {
-    return N;
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  bool empty() const {
-    return !N;
-  }
-
-  // Efficient clear method
-  CUTE_HOST_DEVICE constexpr
-  void clear() {
-    CUTE_UNROLL
-    for (size_type i = 0; i < StorageElements; ++i) {
-      storage[i] = storage_type(0);
-    }
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  void fill(T const& value) {
-    CUTE_UNROLL
-    for (size_type i = 0; i < N; ++i) {
-      at(i) = value;
-    }
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  reference at(size_type pos) {
-    return iterator(storage)[pos];
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  const_reference at(size_type pos) const {
-    return const_iterator(storage)[pos];
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  reference operator[](size_type pos) {
-    return at(pos);
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  const_reference operator[](size_type pos) const {
-    return at(pos);
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  reference front() {
-    return at(0);
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  const_reference front() const {
-    return at(0);
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  reference back() {
-    return at(N-1);
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  const_reference back() const {
-    return at(N-1);
-  }
-
-  // In analogy to std::vector<bool>::data(), these functions are deleted to prevent bugs.
-  // Instead, prefer
-  //   auto* data = raw_pointer_cast(my_subbyte_array.begin());
-  // where the type of auto* is implementation-defined and
-  // with the knowledge that [data, data + my_subbyte_array.size()) may not be a valid range.
-  CUTE_HOST_DEVICE constexpr
-  pointer data() = delete;
-
-  CUTE_HOST_DEVICE constexpr
-  const_pointer data() const = delete;
-
-  CUTE_HOST_DEVICE constexpr
-  iterator begin() {
-    return iterator(storage);
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  const_iterator begin() const {
-    return const_iterator(storage);
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  const_iterator cbegin() const {
-    return begin();
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  iterator end() {
-    return iterator(storage) + N;
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  const_iterator end() const {
-    return const_iterator(storage) + N;
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  const_iterator cend() const {
-    return end();
-  }
-
-  //
-  // Comparison operators
-  //
-
-};
-
-//
-// Operators
-//
-
-template <class T, size_t N>
-CUTE_HOST_DEVICE constexpr
-void clear(array_subbyte<T,N>& a)
-{
-  a.clear();
-}
-
-template <class T, size_t N>
-CUTE_HOST_DEVICE constexpr
-void fill(array_subbyte<T,N>& a, T const& value)
-{
-  a.fill(value);
-}
-
-} // namespace cute
-
-//
-// Specialize tuple-related functionality for cute::array_subbyte
-//
-
-#if defined(__CUDACC_RTC__)
-#include <cuda/std/tuple>
-#else
-#include <tuple>
-#endif
-
-namespace cute
-{
-
-template <size_t I, class T, size_t N>
-CUTE_HOST_DEVICE constexpr
-T& get(array_subbyte<T,N>& a)
-{
-  static_assert(I < N, "Index out of range");
-  return a[I];
-}
-
-template <size_t I, class T, size_t N>
-CUTE_HOST_DEVICE constexpr
-T const& get(array_subbyte<T,N> const& a)
-{
-  static_assert(I < N, "Index out of range");
-  return a[I];
-}
-
-template <size_t I, class T, size_t N>
-CUTE_HOST_DEVICE constexpr
-T&& get(array_subbyte<T,N>&& a)
-{
-  static_assert(I < N, "Index out of range");
-  return cute::move(a[I]);
-}
-
-} // end namespace cute
-
-namespace CUTE_STL_NAMESPACE
-{
-
-template <class T>
-struct is_reference<cute::subbyte_reference<T>>
-    : CUTE_STL_NAMESPACE::true_type
-{};
-
-
-template <class T, size_t N>
-struct tuple_size<cute::array_subbyte<T,N>>
-    : CUTE_STL_NAMESPACE::integral_constant<size_t, N>
-{};
-
-template <size_t I, class T, size_t N>
-struct tuple_element<I, cute::array_subbyte<T,N>>
-{
-  using type = T;
-};
-
-template <class T, size_t N>
-struct tuple_size<const cute::array_subbyte<T,N>>
-    : CUTE_STL_NAMESPACE::integral_constant<size_t, N>
-{};
-
-template <size_t I, class T, size_t N>
-struct tuple_element<I, const cute::array_subbyte<T,N>>
-{
-  using type = T;
-};
-
-} // end namespace CUTE_STL_NAMESPACE
-
-#ifdef CUTE_STL_NAMESPACE_IS_CUDA_STD
-namespace std
-{
-
-#if defined(__CUDACC_RTC__)
-template <class... _Tp>
-struct tuple_size;
-
-template <size_t _Ip, class... _Tp>
-struct tuple_element;
-#endif
-
-template <class T, size_t N>
-struct tuple_size<cute::array_subbyte<T,N>>
-    : CUTE_STL_NAMESPACE::integral_constant<size_t, N>
-{};
-
-template <size_t I, class T, size_t N>
-struct tuple_element<I, cute::array_subbyte<T,N>>
-{
-  using type = T;
-};
-
-template <class T, size_t N>
-struct tuple_size<const cute::array_subbyte<T,N>>
-    : CUTE_STL_NAMESPACE::integral_constant<size_t, N>
-{};
-
-template <size_t I, class T, size_t N>
-struct tuple_element<I, const cute::array_subbyte<T,N>>
-{
-  using type = T;
-};
-
-} // end namespace std
-#endif // CUTE_STL_NAMESPACE_IS_CUDA_STD
diff --git a/lightllm-kernel/cutlass/include/cute/container/bit_field.hpp b/lightllm-kernel/cutlass/include/cute/container/bit_field.hpp
deleted file mode 100755
index d7fac42a5..000000000
--- a/lightllm-kernel/cutlass/include/cute/container/bit_field.hpp
+++ /dev/null
@@ -1,133 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Portable bit field that supports byte and word straddling that can
-           be used in unions to bit-wise define parameters.
-*/
-
-#pragma once
-
-#include <cute/config.hpp>                  // CUTE_HOST_DEVICE
-#include <cute/numeric/numeric_types.hpp>   // uint_bit_t
-#include <cute/util/type_traits.hpp>        // cute::is_same
-
-namespace cute
-{
-
-class dummy_type {};
-
-template <uint32_t BitStart, uint32_t NumBits, class OtherValueType = dummy_type>
-struct bit_field
-{
-  static_assert(0 < NumBits && NumBits <= 64, "bit_fields with more than 64 bits are not supported.");
-
-  // value_type: Use the smallest value type that fits NumBits
-  static constexpr uint32_t value_type_bits = (NumBits <=  8) ?  8 :
-                                              (NumBits <= 16) ? 16 :
-                                              (NumBits <= 32) ? 32 : 64;
-  using value_type   = cute::uint_bit_t<value_type_bits>;
-  // storage_type: Use the smallest storage_type that avoids boundary crossing
-  static constexpr uint32_t storage_type_bits = (BitStart /  8 == (BitStart + NumBits - 1) /  8) ?  8 :
-                                                (BitStart / 16 == (BitStart + NumBits - 1) / 16) ? 16 :
-                                                (BitStart / 32 == (BitStart + NumBits - 1) / 32) ? 32 : 64;
-  using storage_type = cute::uint_bit_t<storage_type_bits>;
-
-  static_assert(sizeof(OtherValueType) == sizeof(value_type) || is_same<OtherValueType,dummy_type>::value,
-                "sizeof(OtherValueType) must be same as sizeof(value_type).");
-
-  // Number of storage values needed: ceil_div(BitStart + NumBits, storage_type_bits)
-  static constexpr uint32_t N      = (BitStart + NumBits + storage_type_bits - 1) / storage_type_bits;
-  // Index of storage value for BitStart
-  static constexpr uint32_t idx    = BitStart / storage_type_bits;
-  // Bit of data_[idx] for BitStart
-  static constexpr uint32_t bit_lo = BitStart % storage_type_bits;
-  // Number of bits in data_[idx] used for NumBits if straddling, else 0
-  static constexpr uint32_t bit_hi = (idx + 1 < N) ? (storage_type_bits - bit_lo) : 0;
-
-public:
-
-  // NumBits mask
-  static constexpr value_type   mask    = value_type(uint64_t(-1) >> (64u - NumBits));
-  // NumBits mask for BitStart
-  static constexpr storage_type mask_lo = storage_type(mask) << bit_lo;
-  // NumBits mask for leftover bits in data_[idx+1] if straddling, else 0
-  static constexpr storage_type mask_hi = (idx + 1 < N) ? (storage_type(mask) >> bit_hi) : 0;
-
-  storage_type data_[N];
-
-  // Get value
-  CUTE_HOST_DEVICE constexpr
-  value_type get() const {
-    storage_type result = (data_[idx] & mask_lo) >> bit_lo;
-    if constexpr (bit_hi != 0) {
-      result |= (data_[idx+1] & mask_hi) << bit_hi;
-    }
-    return static_cast<value_type>(result);
-  }
-
-  // Set value
-  CUTE_HOST_DEVICE constexpr
-  void set(value_type x) {
-    storage_type item = static_cast<storage_type>(x & mask);
-    data_[idx] = static_cast<storage_type>((data_[idx] & ~mask_lo) | (item << bit_lo));
-    if constexpr (bit_hi != 0) {
-      data_[idx+1] = static_cast<storage_type>((data_[idx+1] & ~mask_hi) | (item >> bit_hi));
-    }
-  }
-
-  // Assign value
-  CUTE_HOST_DEVICE constexpr
-  bit_field& operator=(value_type x) {
-    set(x);
-    return *this;
-  }
-
-  // Cast to value
-  CUTE_HOST_DEVICE constexpr
-  operator value_type () const {
-    return get();
-  }
-
-  // Assign OtherValueType
-  CUTE_HOST_DEVICE constexpr
-  bit_field& operator=(OtherValueType x) {
-    return *this = *reinterpret_cast<value_type*>(&x);
-  }
-
-  // Cast to OtherValueType
-  CUTE_HOST_DEVICE constexpr
-  operator OtherValueType () const {
-    value_type x = get();
-    return *reinterpret_cast<OtherValueType*>(&x);
-  }
-};
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/container/cuda_types.hpp b/lightllm-kernel/cutlass/include/cute/container/cuda_types.hpp
deleted file mode 100755
index fbc314e54..000000000
--- a/lightllm-kernel/cutlass/include/cute/container/cuda_types.hpp
+++ /dev/null
@@ -1,183 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>                     // CUTE_HOST_DEVICE, CUTE_GCC_UNREACHABLE
-#include <cute/numeric/integral_constant.hpp>  // cute::integral_constant
-
-namespace cute
-{
-
-//
-// dim3
-//
-
-using dim3 = ::dim3;
-
-// MSVC doesn't define its C++ version macro to match
-// its C++ language version.  This means that when
-// building with MSVC, dim3 isn't constexpr-friendly.
-template <size_t I>
-CUTE_HOST_DEVICE
-#if ! defined(_MSC_VER)
-constexpr
-#endif
-uint32_t& get(dim3& a)
-{
-  static_assert(I < 3, "Index out of range");
-  if constexpr (I == 0) {
-    return a.x;
-  } else if constexpr (I == 1) {
-    return a.y;
-  } else if constexpr (I == 2) {
-    return a.z;
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <size_t I>
-CUTE_HOST_DEVICE
-#if ! defined(_MSC_VER)
-constexpr
-#endif
-uint32_t const& get(dim3 const& a)
-{
-  static_assert(I < 3, "Index out of range");
-  if constexpr (I == 0) {
-    return a.x;
-  } else if constexpr (I == 1) {
-    return a.y;
-  } else if constexpr (I == 2) {
-    return a.z;
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <size_t I>
-CUTE_HOST_DEVICE
-#if ! defined(_MSC_VER)
-constexpr
-#endif
-uint32_t&& get(dim3&& a)
-{
-  static_assert(I < 3, "Index out of range");
-  if constexpr (I == 0) {
-    return cute::move(a.x);
-  } else if constexpr (I == 1) {
-    return cute::move(a.y);
-  } else if constexpr (I == 2) {
-    return cute::move(a.z);
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-// Specialize cute::tuple-traits for external types
-template <>
-struct tuple_size<dim3>
-    : integral_constant<size_t, 3>
-{};
-
-template <size_t I>
-struct tuple_element<I, dim3>
-{
-  using type = uint32_t;
-};
-
-//
-// uint3
-//
-
-using uint3 = ::uint3;
-
-template <size_t I>
-CUTE_HOST_DEVICE constexpr
-uint32_t& get(uint3& a)
-{
-  static_assert(I < 3, "Index out of range");
-  if constexpr (I == 0) {
-    return a.x;
-  } else if constexpr (I == 1) {
-    return a.y;
-  } else if constexpr (I == 2) {
-    return a.z;
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <size_t I>
-CUTE_HOST_DEVICE constexpr
-uint32_t const& get(uint3 const& a)
-{
-  static_assert(I < 3, "Index out of range");
-  if constexpr (I == 0) {
-    return a.x;
-  } else if constexpr (I == 1) {
-    return a.y;
-  } else if constexpr (I == 2) {
-    return a.z;
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <size_t I>
-CUTE_HOST_DEVICE constexpr
-uint32_t&& get(uint3&& a)
-{
-  static_assert(I < 3, "Index out of range");
-  if constexpr (I == 0) {
-    return cute::move(a.x);
-  } else if constexpr (I == 1) {
-    return cute::move(a.y);
-  } else if constexpr (I == 2) {
-    return cute::move(a.z);
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-// Specialize cute::tuple-traits for external types
-template <>
-struct tuple_size<uint3>
-    : integral_constant<size_t, 3>
-{};
-
-template <size_t I>
-struct tuple_element<I, uint3>
-{
-  using type = uint32_t;
-};
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/container/packed_tuple.hpp b/lightllm-kernel/cutlass/include/cute/container/packed_tuple.hpp
deleted file mode 100755
index c20df2c23..000000000
--- a/lightllm-kernel/cutlass/include/cute/container/packed_tuple.hpp
+++ /dev/null
@@ -1,254 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>
-#include <cute/util/type_traits.hpp>
-#include <cute/numeric/integral_constant.hpp>
-#include <cute/container/type_list.hpp>
-
-namespace cute {
-
-namespace detail {
-
-// Empty Structure Optimization
-template <bool IsFirstEmpty, bool IsRestEmpty, class... T>
-struct ESO;
-
-template <class First, class... Rest>
-static constexpr bool is_first_empty_v = cute::is_empty<First>::value;
-template <class First, class... Rest>
-static constexpr bool is_rest_empty_v  = (cute::is_empty<Rest>::value && ...);
-
-template <class... T>
-using ESO_t = ESO<is_first_empty_v<T...>, is_rest_empty_v<T...>, T...>;
-
-// Empty First and Empty Rest...
-template <class First, class... Rest>
-struct ESO<true, true, First, Rest...> {
-  CUTE_HOST_DEVICE constexpr
-  ESO() {}
-
-  CUTE_HOST_DEVICE constexpr
-  ESO(First const&, Rest const&...) {}
-};
-
-// NonEmpty First and Empty Rest...
-template <class First, class... Rest>
-struct ESO<false, true, First, Rest...> {
-  CUTE_HOST_DEVICE constexpr
-  ESO() : first_{} {}
-
-  CUTE_HOST_DEVICE constexpr
-  ESO(First const& first, Rest const&...) : first_{first} {}
-
-  First first_;
-};
-
-// Empty First and NonEmpty Rest...
-template <class First, class... Rest>
-struct ESO<true, false, First, Rest...> {
-  CUTE_HOST_DEVICE constexpr
-  ESO() : rest_{} {}
-
-  CUTE_HOST_DEVICE constexpr
-  ESO(First const&, Rest const&... rest) : rest_{rest...} {}
-
-  ESO_t<Rest...> rest_;
-};
-
-// NonEmpty T and NonEmpty Rest...
-template <class First, class... Rest>
-struct ESO<false, false, First, Rest...> {
-  CUTE_HOST_DEVICE constexpr
-  ESO() : first_{}, rest_{} {}
-
-  CUTE_HOST_DEVICE constexpr
-  ESO(First const& first, Rest const&... rest) : first_{first}, rest_{rest...} {}
-
-  First first_;
-  ESO_t<Rest...> rest_;
-};
-
-// Get Nth value from ESO
-template <size_t N, class T, class... Rest, bool F, bool R>
-CUTE_HOST_DEVICE constexpr decltype(auto) getv(ESO<F, R, T, Rest...> const& s) {
-  if constexpr (N == 0) {
-    if constexpr (F) { return T{}; }
-    else             { return static_cast<T const&>(s.first_); }
-  } else {
-    if constexpr (R) { return cute::tuple_element_t<N-1, cute::type_list<Rest...>>{}; }
-    else             { return getv<N-1>(s.rest_); }
-  }
-}
-
-template <size_t N, class T, class... Rest, bool F, bool R>
-CUTE_HOST_DEVICE constexpr decltype(auto) getv(ESO<F, R, T, Rest...>& s) {
-  if constexpr (N == 0) {
-    if constexpr (F) { return T{}; }
-    else             { return static_cast<T&>(s.first_); }
-  } else {
-    if constexpr (R) { return cute::tuple_element_t<N-1, cute::type_list<Rest...>>{}; }
-    else             { return getv<N-1>(s.rest_); }
-  }
-}
-
-template <size_t N, class T, class... Rest, bool F, bool R>
-CUTE_HOST_DEVICE constexpr decltype(auto) getv(ESO<F, R, T, Rest...>&& s) {
-  if constexpr (N == 0) {
-    if constexpr (F) { return T{}; }
-    else             { return static_cast<T&&>(s.first_); }
-  } else {
-    if constexpr (R) { return cute::tuple_element_t<N-1, cute::type_list<Rest...>>{}; }
-    else             { return getv<N-1>(static_cast<ESO_t<Rest...>&&>(s.rest_)); }
-  }
-}
-
-// findt: Implementation detail of cute::find.
-// If X is the first template argument of the tuple, findt returns C<N>.
-
-template <class X, size_t N,
-  bool IsFirstEmpty, bool IsRestEmpty, class First, class... Rest>
-CUTE_HOST_DEVICE constexpr
-auto
-findt(ESO<IsFirstEmpty, IsRestEmpty, First, Rest...> const& t) noexcept
-{
-  if constexpr (cute::is_same_v<X, First>) {
-    return C<N>{};
-  }
-  else {
-    static_assert(sizeof...(Rest) != 0,
-      "The type does not appear in the argument list of the tuple.");
-    if constexpr (IsRestEmpty) {
-      // The rest is empty, so creating an instance of it is cheap.
-      return cute::detail::findt<X, N+1>(ESO_t<Rest...>{});
-    }
-    else {
-      return cute::detail::findt<X, N+1>(t.rest_);
-    }
-  }
-}
-
-} // end namespace detail
-
-// packed_tuple<T...> is a tuple type that is a standard-layout type
-// whenever all of its template arguments are standard layout types:
-//   (cute::is_standard_layout_v<T> && ...) implies (cute::is_standard_layout_v<packed_tuple<T...>>)
-
-template <class... T>
-struct packed_tuple : detail::ESO_t<T...>
-{
-  CUTE_HOST_DEVICE constexpr
-  packed_tuple() {}
-
-  CUTE_HOST_DEVICE constexpr
-  packed_tuple(T const&... ts)
-    : detail::ESO_t<T...>(ts...)
-  {}
-};
-
-template <>
-struct packed_tuple<> {};
-
-template <size_t I, class... T>
-CUTE_HOST_DEVICE constexpr
-decltype(auto)
-get(packed_tuple<T...> const& t) {
-  static_assert(I < sizeof...(T), "Index out of range");
-  return detail::getv<I>(t);
-}
-
-template <size_t I, class... T>
-CUTE_HOST_DEVICE constexpr
-decltype(auto)
-get(packed_tuple<T...>& t) {
-  static_assert(I < sizeof...(T), "Index out of range");
-  return detail::getv<I>(t);
-}
-
-template <size_t I, class... T>
-CUTE_HOST_DEVICE constexpr
-decltype(auto)
-get(packed_tuple<T...>&& t) {
-  static_assert(I < sizeof...(T), "Index out of range");
-  return detail::getv<I>(static_cast<detail::ESO_t<T...>&&>(t));
-}
-
-template <class... T>
-CUTE_HOST_DEVICE constexpr
-packed_tuple<T...>
-make_packed_tuple(T const&... t)
-{
-  return {t...};
-}
-
-// Returns the position of type X (as a static integer) in the tuple
-// type's argument list.  X must be unique in the argument list.
-template <class X, class... T>
-CUTE_HOST_DEVICE constexpr
-auto
-find(packed_tuple<T...> const& t) noexcept
-{
-  return detail::findt<X, 0>(t);
-}
-
-} // end namespace cute
-
-namespace CUTE_STL_NAMESPACE
-{
-
-template <class... T>
-struct tuple_size<cute::packed_tuple<T...>>
-    : CUTE_STL_NAMESPACE::integral_constant<size_t, sizeof...(T)>
-{};
-
-template <size_t I, class... T>
-struct tuple_element<I, cute::packed_tuple<T...>>
-    : CUTE_STL_NAMESPACE::tuple_element<I, CUTE_STL_NAMESPACE::tuple<T...>>
-{};
-
-} // end namespace CUTE_STL_NAMESPACE
-
-#ifdef CUTE_STL_NAMESPACE_IS_CUDA_STD
-namespace std {
-
-template <class ... T>
-struct tuple_size<cute::packed_tuple<T...>>
-  : CUTE_STL_NAMESPACE::integral_constant<size_t, sizeof...(T)>
-{};
-
-template <size_t I, class ... T>
-struct tuple_element<I, cute::packed_tuple<T...>>
-  : CUTE_STL_NAMESPACE::tuple_element<I, cute::packed_tuple<T...>>
-{};
-
-} // end namespace std
-#endif // CUTE_STL_NAMESPACE_IS_CUDA_STD
diff --git a/lightllm-kernel/cutlass/include/cute/container/tuple.hpp b/lightllm-kernel/cutlass/include/cute/container/tuple.hpp
deleted file mode 100755
index 3123a68d8..000000000
--- a/lightllm-kernel/cutlass/include/cute/container/tuple.hpp
+++ /dev/null
@@ -1,744 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>
-#include <cute/util/type_traits.hpp>
-#include <cute/numeric/integral_constant.hpp>  // cute::true_type, cute::false_type
-#include <cute/numeric/integer_sequence.hpp>
-
-#include <cute/container/cuda_types.hpp>
-#include <cute/container/type_list.hpp>
-#if defined(CUTLASS_USE_PACKED_TUPLE)
-#  include <cute/container/packed_tuple.hpp>
-#endif
-
-//#include <cute/container/array.hpp>            // Advanced optimizations
-
-// cute::tuple is like std::tuple, with two differences.
-//
-// 1. It works on both host and device.
-// 2. Its template arguments must be semiregular types.
-//
-// Semiregular types are default constructible and copyable.
-// They include "value types" like int or float,
-// but do _not_ include references like int& or float&.
-// (See std::tie for an example of a tuple of references.)
-//
-// If the template arguments of cute::tuple are all empty types (in
-// the sense of std::is_empty_v), then the cute::tuple is also an
-// empty type.  Furthermore, if CUTLASS_USE_PACKED_TUPLE is defined,
-// cute::tuple is always a standard-layout type if all of its template
-// arguments are standard-layout types.
-
-namespace cute
-{
-
-#if defined(CUTLASS_USE_PACKED_TUPLE)
-
-template<class... T>
-using tuple = packed_tuple<T...>;
-
-#else
-
-namespace detail
-{
-
-// This is simplified over the implementations in std::, cuda::std::, and thrust:: by ignoring much of
-// the conversion SFINAE, special overloading, and avoiding cvref template types.
-//
-// Over standard-conforming tuple implementations, this appears to accelerate compilation times by over 3x.
-
-// EBO stands for "empty base optimization."
-// We use this technique to ensure that cute::tuple
-// doesn't need to waste space storing any template arguments
-// of cute::tuple that have no data (like integral_constant).
-// Otherwise, cute::tuple would need to spend at least 1 byte
-// for each of its template arguments.
-//
-// This is one way in which cute::tuple differs from std::tuple.
-// Empty types in the template argument list are not even constructed,
-// and do not have unique element addresses.  In fact, they are not
-// even members of the tuple or stored in any way.  Calling `get`
-// constructs and returns an instance of an empty type on demand.
-//
-// EBO always "holds" a single value of type T.
-// N is like an array index that TupleBase uses
-// to access the desired tuple element.
-template <size_t N, class T, bool IsEmpty = is_empty<T>::value>
-struct EBO;
-
-template <class T, size_t N, bool B>
-CUTE_HOST_DEVICE constexpr C<N> findt(EBO<N, T, B> const&)
-{ return {}; }
-
-// Specialization for types T that have no data;
-// the "static tuple leaf."  Valid T here include
-// integral_constant<U, Value>, Int<Value>,
-// and any other semiregular type
-// for which std::is_empty_v<T> is true.
-template <size_t N, class T>
-struct EBO<N, T, true>
-{
-  CUTE_HOST_DEVICE constexpr
-  EBO() {}
-
-  CUTE_HOST_DEVICE constexpr
-  EBO(T const&) {}
-};
-
-template <size_t N, class T>
-CUTE_HOST_DEVICE constexpr T getv(EBO<N, T, true> const&)
-{ return {}; }
-
-// Specialization for types T that are not empty;
-// the "dynamic tuple leaf."  Valid T here include int,
-// any other integral or floating-point type,
-// or any semiregular type for which std::is_empty_v<T> is false.
-template <size_t N, class T>
-struct EBO<N, T, false>
-{
-  CUTE_HOST_DEVICE constexpr
-  EBO() : t_{} {}
-
-  CUTE_HOST_DEVICE constexpr
-  EBO(T const& t) : t_{t} {}
-
-  T t_;
-};
-
-template <size_t N, class T>
-CUTE_HOST_DEVICE constexpr T const& getv(EBO<N, T, false> const& x)
-{ return x.t_; }
-
-template <size_t N, class T>
-CUTE_HOST_DEVICE constexpr T& getv(EBO<N, T, false>& x)
-{ return x.t_; }
-
-template <size_t N, class T>
-CUTE_HOST_DEVICE constexpr T&& getv(EBO<N, T, false>&& x)
-{ return cute::move(x.t_); }
-
-template <class IdxSeq, class... T>
-struct TupleBase;
-
-// Base class of cute::tuple binds each element to an index
-// by inheriting from EBO<i, t> for each (i, t) in (I..., T...).
-// The storage (for nonempty t) lives in the base classes.
-template <size_t... I, class... T>
-struct TupleBase<index_sequence<I...>, T...>
-    : EBO<I,T>...
-{
-  CUTE_HOST_DEVICE constexpr
-  TupleBase() {}
-
-  CUTE_HOST_DEVICE constexpr
-  TupleBase(T const&... t) : EBO<I,T>(t)... {}
-};
-
-} // end namespace detail
-
-// Attempting to use the following commented-out alias
-// in the declaration of `struct tuple` causes MSVC 2022 build errors.
-//
-//template <class... T>
-//using TupleBase = detail::TupleBase<make_index_sequence<sizeof...(T)>, T...>;
-
-// This is the actual cute::tuple class.
-// The storage (if any) lives in TupleBase's EBO base classes.
-//
-// Inheriting from the above alias TupleBase
-// causes MSVC 2022 build errors when assigning one tuple to another:
-// In summary: this is verbose as a work-around for MSVC build errors.
-template <class... T>
-struct tuple : detail::TupleBase<make_index_sequence<sizeof...(T)>, T...>
-{
-  CUTE_HOST_DEVICE constexpr
-  tuple() {}
-
-  CUTE_HOST_DEVICE constexpr
-  tuple(T const&... t) : detail::TupleBase<make_index_sequence<sizeof...(T)>, T...>(t...) {}
-};
-
-template <>
-struct tuple<>
-{};
-
-//
-// get for cute::tuple (just like std::get for std::tuple)
-//
-
-template <size_t I, class... T>
-CUTE_HOST_DEVICE constexpr
-decltype(auto)
-get(tuple<T...> const& t) noexcept
-{
-  static_assert(I < sizeof...(T), "Index out of range");
-  return detail::getv<I>(t);
-}
-
-template <size_t I, class... T>
-CUTE_HOST_DEVICE constexpr
-decltype(auto)
-get(tuple<T...>& t) noexcept
-{
-  static_assert(I < sizeof...(T), "Index out of range");
-  return detail::getv<I>(t);
-}
-
-template <size_t I, class... T>
-CUTE_HOST_DEVICE constexpr
-decltype(auto)
-get(tuple<T...>&& t) noexcept
-{
-  static_assert(I < sizeof...(T), "Index out of range");
-  return detail::getv<I>(static_cast<tuple<T...>&&>(t));
-}
-
-//
-// find a type X within a cute::tuple
-//   Requires X to be unique in tuple
-//   Returns a static integer
-//
-
-template <class X, class... T>
-CUTE_HOST_DEVICE constexpr
-auto
-find(tuple<T...> const& t) noexcept
-{
-  return detail::findt<X>(t);
-}
-
-#endif // CUTLASS_USE_PACKED_TUPLE
-
-//
-// Custom is_tuple trait simply checks the existence of tuple_size
-//      and assumes std::get<I>(.), std::tuple_element<I,.>
-//
-namespace detail {
-
-template <class T>
-auto has_tuple_size( T*) -> bool_constant<(0 <= tuple_size<T>::value)>;
-auto has_tuple_size(...) -> false_type;
-
-} // end namespace detail
-
-template <class T>
-struct is_tuple : decltype(detail::has_tuple_size((T*)0)) {};
-
-template<typename T>
-constexpr bool is_tuple_v = cute::is_tuple<T>::value;
-
-//
-// make_tuple (value-based implementation)
-//
-
-template <class... T>
-CUTE_HOST_DEVICE constexpr
-tuple<T...>
-make_tuple(T const&... t)
-{
-  return {t...};
-}
-
-//
-// tuple_cat concatenates multiple cute::tuple into a single cute::tuple,
-// just like std::tuple_cat for std::tuple.
-//
-
-#if 0
-// Original implementation
-
-namespace detail {
-
-template <class T0, class T1,
-          size_t... I0, size_t... I1>
-CUTE_HOST_DEVICE constexpr
-auto
-tuple_cat(T0 const& t0, T1 const& t1,
-          index_sequence<I0...>, index_sequence<I1...>)
-{
-  return cute::make_tuple(get<I0>(t0)..., get<I1>(t1)...);
-}
-
-} // end namespace detail
-
-CUTE_HOST_DEVICE constexpr
-tuple<>
-tuple_cat()
-{
-  return {};
-}
-
-template <class Tuple,
-          __CUTE_REQUIRES(is_tuple<Tuple>::value)>
-CUTE_HOST_DEVICE constexpr
-Tuple const&
-tuple_cat(Tuple const& t)
-{
-  return t;
-}
-
-template <class T0, class T1>
-CUTE_HOST_DEVICE constexpr
-auto
-tuple_cat(T0 const& t0, T1 const& t1)
-{
-  return detail::tuple_cat(t0, t1,
-                           make_index_sequence<tuple_size<T0>::value>{},
-                           make_index_sequence<tuple_size<T1>::value>{});
-}
-
-template <class T0, class T1, class T2, class... Ts>
-CUTE_HOST_DEVICE constexpr
-auto
-tuple_cat(T0 const& t0, T1 const& t1, T2 const& t2, Ts const&... ts)
-{
-  return cute::tuple_cat(cute::tuple_cat(t0,t1),t2,ts...);
-}
-#endif
-
-#if 1
-// Extended implementation
-
-namespace detail {
-
-template <class T0, class T1,
-          size_t... I0, size_t... I1>
-CUTE_HOST_DEVICE constexpr
-auto
-tuple_cat(T0 const& t0, T1 const& t1,
-          index_sequence<I0...>, index_sequence<I1...>)
-{
-  return cute::make_tuple(get<I0>(t0)..., get<I1>(t1)...);
-}
-
-template <class T0, class T1, class T2,
-          size_t... I0, size_t... I1, size_t... I2>
-CUTE_HOST_DEVICE constexpr
-auto
-tuple_cat(T0 const& t0, T1 const& t1, T2 const& t2,
-          index_sequence<I0...>, index_sequence<I1...>, index_sequence<I2...>)
-{
-  return cute::make_tuple(get<I0>(t0)..., get<I1>(t1)..., get<I2>(t2)...);
-}
-
-template <class T0, class T1, class T2, class T3,
-          size_t... I0, size_t... I1, size_t... I2, size_t... I3>
-CUTE_HOST_DEVICE constexpr
-auto
-tuple_cat(T0 const& t0, T1 const& t1, T2 const& t2, T3 const& t3,
-          index_sequence<I0...>, index_sequence<I1...>, index_sequence<I2...>, index_sequence<I3...>)
-{
-  return cute::make_tuple(get<I0>(t0)..., get<I1>(t1)..., get<I2>(t2)..., get<I3>(t3)...);
-}
-
-template <class T0, class T1, class T2, class T3, class T4,
-          size_t... I0, size_t... I1, size_t... I2, size_t... I3, size_t... I4>
-CUTE_HOST_DEVICE constexpr
-auto
-tuple_cat(T0 const& t0, T1 const& t1, T2 const& t2, T3 const& t3, T4 const& t4,
-          index_sequence<I0...>, index_sequence<I1...>, index_sequence<I2...>, index_sequence<I3...>, index_sequence<I4...>)
-{
-  return cute::make_tuple(get<I0>(t0)..., get<I1>(t1)..., get<I2>(t2)..., get<I3>(t3)..., get<I4>(t4)...);
-}
-
-template <class T0, class T1>
-struct tuple_cat_static;
-
-template <class... T0s, class... T1s>
-struct tuple_cat_static<tuple<T0s...>, tuple<T1s...>> {
-  using type = tuple<T0s..., T1s...>;
-};
-
-} // end namespace detail
-
-CUTE_HOST_DEVICE constexpr
-tuple<>
-tuple_cat()
-{
-  return {};
-}
-
-template <class Tuple,
-          __CUTE_REQUIRES(is_tuple<Tuple>::value)>
-CUTE_HOST_DEVICE constexpr
-Tuple const&
-tuple_cat(Tuple const& t)
-{
-  return t;
-}
-
-template <class T0, class T1>
-CUTE_HOST_DEVICE constexpr
-auto
-tuple_cat(T0 const& t0, T1 const& t1)
-{
-  if constexpr (is_static<T0>::value && is_static<T1>::value &&
-		is_tuple<T0>::value && is_tuple<T1>::value) {
-    return typename detail::tuple_cat_static<T0, T1>::type{};
-  } else {
-    return detail::tuple_cat(t0, t1,
-                           make_index_sequence<tuple_size<T0>::value>{},
-                           make_index_sequence<tuple_size<T1>::value>{});
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <class T0, class T1, class T2>
-CUTE_HOST_DEVICE constexpr
-auto
-tuple_cat(T0 const& t0, T1 const& t1, T2 const& t2)
-{
-  return detail::tuple_cat(t0, t1, t2,
-                           make_index_sequence<tuple_size<T0>::value>{},
-                           make_index_sequence<tuple_size<T1>::value>{},
-                           make_index_sequence<tuple_size<T2>::value>{});
-}
-
-template <class T0, class T1, class T2, class T3>
-CUTE_HOST_DEVICE constexpr
-auto
-tuple_cat(T0 const& t0, T1 const& t1, T2 const& t2, T3 const& t3)
-{
-  return detail::tuple_cat(t0, t1, t2, t3,
-                           make_index_sequence<tuple_size<T0>::value>{},
-                           make_index_sequence<tuple_size<T1>::value>{},
-                           make_index_sequence<tuple_size<T2>::value>{},
-                           make_index_sequence<tuple_size<T3>::value>{});
-}
-
-template <class T0, class T1, class T2, class T3, class T4>
-CUTE_HOST_DEVICE constexpr
-auto
-tuple_cat(T0 const& t0, T1 const& t1, T2 const& t2, T3 const& t3, T4 const& t4)
-{
-  return detail::tuple_cat(t0, t1, t2, t3, t4,
-                           make_index_sequence<tuple_size<T0>::value>{},
-                           make_index_sequence<tuple_size<T1>::value>{},
-                           make_index_sequence<tuple_size<T2>::value>{},
-                           make_index_sequence<tuple_size<T3>::value>{},
-                           make_index_sequence<tuple_size<T4>::value>{});
-}
-
-template <class T0, class T1, class T2, class T3, class T4, class T5, class... Ts>
-CUTE_HOST_DEVICE constexpr
-auto
-tuple_cat(T0 const& t0, T1 const& t1, T2 const& t2, T3 const& t3, T4 const& t4, T5 const& t5, Ts const&... ts)
-{
-  return cute::tuple_cat(cute::tuple_cat(t0,t1,t2,t3,t4), cute::tuple_cat(t5, ts...));
-}
-#endif
-
-#if 0
-// Outer-Inner indexing trick to concat all tuples at once
-
-namespace detail {
-
-template <size_t... Ns>
-struct tuple_cat_helper
-{
-  static constexpr cute::array<size_t,sizeof...(Ns)> ns = {Ns...};
-
-  static constexpr size_t total_size() {
-    size_t sum = 0;
-    for (size_t n : ns) sum += n;
-    return sum;
-  }
-  static constexpr size_t total_size_ = total_size();
-
-  static constexpr auto values() {
-    cute::array<size_t[2],total_size_> outer_inner = {};
-
-    size_t idx = 0;
-    for (size_t i = 0; i < ns.size(); ++i) {
-      for (size_t j = 0; j < ns[i]; ++j, ++idx) {
-        outer_inner[idx][0] = i;
-        outer_inner[idx][1] = j;
-      }
-    }
-    return outer_inner;
-  }
-  static constexpr auto outer_inner_ = values();
-
-  using total_sequence = make_index_sequence<total_size_>;
-};
-
-template <class Helper, class Tuple, size_t... I>
-CUTE_HOST_DEVICE constexpr
-auto
-tuple_cat(Tuple const& t, index_sequence<I...>)
-{
-  return cute::make_tuple(get<Helper::outer_inner_[I][1]>(get<Helper::outer_inner_[I][0]>(t))...);
-}
-
-template <class T0, class T1,
-          size_t... I0, size_t... I1>
-CUTE_HOST_DEVICE constexpr
-auto
-tuple_cat(T0 const& t0, T1 const& t1,
-          index_sequence<I0...>, index_sequence<I1...>)
-{
-  return cute::make_tuple(get<I0>(t0)..., get<I1>(t1)...);
-}
-
-} // end namespace detail
-
-CUTE_HOST_DEVICE constexpr
-tuple<>
-tuple_cat()
-{
-  return {};
-}
-
-template <class Tuple,
-          __CUTE_REQUIRES(is_tuple<Tuple>::value)>
-CUTE_HOST_DEVICE constexpr
-Tuple const&
-tuple_cat(Tuple const& t)
-{
-  return t;
-}
-
-template <class T0, class T1>
-CUTE_HOST_DEVICE constexpr
-auto
-tuple_cat(T0 const& t0, T1 const& t1)
-{
-  return detail::tuple_cat(t0, t1,
-                           make_index_sequence<tuple_size<T0>::value>{},
-                           make_index_sequence<tuple_size<T1>::value>{});
-}
-
-template <class... Tuples>
-CUTE_HOST_DEVICE constexpr
-auto
-tuple_cat(Tuples const&... ts)
-{
-  using Helper = detail::tuple_cat_helper<tuple_size<Tuples>::value...>;
-  return detail::tuple_cat<Helper>(cute::make_tuple(ts...), typename Helper::total_sequence{});
-}
-#endif
-
-//
-// Equality operators
-//
-
-namespace detail {
-
-template <class TupleA, class TupleB, size_t... I>
-CUTE_HOST_DEVICE constexpr
-auto
-equal_impl(TupleA const& a, TupleB const& b, index_sequence<I...>)
-{
-  return (cute::true_type{} && ... && (get<I>(a) == get<I>(b)));
-}
-
-} // end namespace detail
-
-template <class TupleT, class TupleU,
-          __CUTE_REQUIRES(is_tuple<TupleT>::value && is_tuple<TupleU>::value)>
-CUTE_HOST_DEVICE constexpr
-auto
-operator==(TupleT const& t, TupleU const& u)
-{
-  if constexpr (tuple_size<TupleT>::value == tuple_size<TupleU>::value) {
-    return detail::equal_impl(t, u, make_index_sequence<tuple_size<TupleT>::value>{});
-  } else {
-    return cute::false_type{};
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <class TupleT, class TupleU,
-          __CUTE_REQUIRES(is_tuple<TupleT>::value ^ is_tuple<TupleU>::value)>
-CUTE_HOST_DEVICE constexpr
-auto
-operator==(TupleT const& t, TupleU const& u)
-{
-  return cute::false_type{};
-}
-
-template <class TupleT, class TupleU,
-          __CUTE_REQUIRES(is_tuple<TupleT>::value && is_tuple<TupleU>::value)>
-CUTE_HOST_DEVICE constexpr
-auto
-operator!=(TupleT const& t, TupleU const& u)
-{
-  return !(t == u);
-}
-
-template <class TupleT, class TupleU,
-          __CUTE_REQUIRES(is_tuple<TupleT>::value ^ is_tuple<TupleU>::value)>
-CUTE_HOST_DEVICE constexpr
-auto
-operator!=(TupleT const& t, TupleU const& u)
-{
-  return cute::true_type{};
-}
-
-//
-// Comparison operators
-//
-
-//
-// There are many ways to compare tuple of elements and because CuTe is built
-//   on parameterizing layouts of coordinates, some comparisons are appropriate
-//   only in certain cases.
-//  -- lexicographical comparison [reverse, reflected, revref]
-//  -- colexicographical comparison [reverse, reflected, revref]
-//  -- element-wise comparison [any,all]
-// This can be very confusing. To avoid errors in selecting the appropriate
-//   comparison, op<|op<=|op>|op>= are *not* implemented for cute::tuple.
-//
-// That said, see int_tuple for more explicitly named common comparison ops.
-//
-
-//
-// Display utilities
-//
-
-namespace detail {
-
-template <class Tuple, size_t... Is>
-CUTE_HOST_DEVICE void print_tuple(Tuple const& t, index_sequence<Is...>, char s = '(', char e = ')')
-{
-  using cute::print;
-  if (sizeof...(Is) == 0) {
-    print(s);
-  } else {
-    ((void(print(Is == 0 ? s : ',')), void(print(get<Is>(t)))), ...);
-  }
-  print(e);
-}
-
-#if !defined(__CUDACC_RTC__)
-template <class Tuple, std::size_t... Is>
-CUTE_HOST std::ostream& print_tuple_os(std::ostream& os, Tuple const& t, index_sequence<Is...>, char s = '(', char e = ')')
-{
-  if (sizeof...(Is) == 0) {
-    os << s;
-  } else {
-    (void(os << (Is == 0 ? s : ',') << get<Is>(t)), ...);
-  }
-  return os << e;
-}
-#endif // !defined(__CUDACC_RTC__)
-
-} // end namespace detail
-
-template <class Tuple,
-          __CUTE_REQUIRES(is_tuple<Tuple>::value)>
-CUTE_HOST_DEVICE void print(Tuple const& t)
-{
-  return detail::print_tuple(t, make_index_sequence<tuple_size<Tuple>::value>{});
-}
-
-#if !defined(__CUDACC_RTC__)
-template <class Tuple,
-          __CUTE_REQUIRES(is_tuple<Tuple>::value)>
-CUTE_HOST std::ostream& operator<<(std::ostream& os, Tuple const& t)
-{
-  return detail::print_tuple_os(os, t, make_index_sequence<tuple_size<Tuple>::value>{});
-}
-#endif // !defined(__CUDACC_RTC__)
-
-} // end namespace cute
-
-#if ! defined(CUTLASS_USE_PACKED_TUPLE)
-
-namespace CUTE_STL_NAMESPACE
-{
-
-template <class... T>
-struct tuple_size<cute::tuple<T...>>
-    : CUTE_STL_NAMESPACE::integral_constant<size_t, sizeof...(T)>
-{};
-
-template <size_t I, class... T>
-struct tuple_element<I, cute::tuple<T...>>
-    : CUTE_STL_NAMESPACE::tuple_element<I, CUTE_STL_NAMESPACE::tuple<T...>>
-{};
-
-template <class... T>
-struct tuple_size<const cute::tuple<T...>>
-    : CUTE_STL_NAMESPACE::integral_constant<size_t, sizeof...(T)>
-{};
-
-template <size_t I, class... T>
-struct tuple_element<I, const cute::tuple<T...>>
-    : CUTE_STL_NAMESPACE::tuple_element<I, const CUTE_STL_NAMESPACE::tuple<T...>>
-{};
-
-} // end namespace CUTE_STL_NAMESPACE
-
-//
-// std compatibility
-//
-
-#ifdef CUTE_STL_NAMESPACE_IS_CUDA_STD
-namespace std
-{
-
-#if defined(__CUDACC_RTC__)
-template <class... _Tp>
-struct tuple_size;
-
-template <size_t _Ip, class... _Tp>
-struct tuple_element;
-#endif
-
-template <class... T>
-struct tuple_size<cute::tuple<T...>>
-    : CUTE_STL_NAMESPACE::integral_constant<size_t, sizeof...(T)>
-{};
-
-template <size_t I, class... T>
-struct tuple_element<I, cute::tuple<T...>>
-    : CUTE_STL_NAMESPACE::tuple_element<I, CUTE_STL_NAMESPACE::tuple<T...>>
-{};
-
-template <class... T>
-struct tuple_size<const cute::tuple<T...>>
-    : CUTE_STL_NAMESPACE::integral_constant<size_t, sizeof...(T)>
-{};
-
-template <size_t I, class... T>
-struct tuple_element<I, const cute::tuple<T...>>
-    : CUTE_STL_NAMESPACE::tuple_element<I, const CUTE_STL_NAMESPACE::tuple<T...>>
-{};
-
-} // end namespace std
-#endif // CUTE_STL_NAMESPACE_IS_CUDA_STD
-
-#endif // CUTLASS_USE_PACKED_TUPLE
diff --git a/lightllm-kernel/cutlass/include/cute/container/type_list.hpp b/lightllm-kernel/cutlass/include/cute/container/type_list.hpp
deleted file mode 100755
index a15f2c1c1..000000000
--- a/lightllm-kernel/cutlass/include/cute/container/type_list.hpp
+++ /dev/null
@@ -1,124 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>            // CUTE_HOST_DEVICE, CUTE_STL_NAMESPACE
-
-namespace cute
-{
-
-template <class... T>
-struct type_list {};
-
-// get<I> for type_list<T...>
-//   requires tuple_element_t<I,type_list<T...>> to have std::is_default_constructible
-template <size_t I, class... T>
-CUTE_HOST_DEVICE constexpr
-CUTE_STL_NAMESPACE::tuple_element_t<I, type_list<T...>>
-get(type_list<T...> const& t) noexcept {
-  return {};
-}
-
-} // end namespace cute
-
-//
-// Specialize tuple-related functionality for cute::type_list
-//
-
-#if defined(__CUDACC_RTC__)
-#include <cuda/std/tuple>
-#else
-#include <tuple>
-#endif
-
-namespace CUTE_STL_NAMESPACE
-{
-
-template <class... T>
-struct tuple_size<cute::type_list<T...>>
-    : CUTE_STL_NAMESPACE::integral_constant<size_t, sizeof...(T)>
-{};
-
-template <size_t I, class... T>
-struct tuple_element<I, cute::type_list<T...>>
-{
-  using type = typename CUTE_STL_NAMESPACE::tuple_element<I, CUTE_STL_NAMESPACE::tuple<T...>>::type;
-};
-
-template <class... T>
-struct tuple_size<const cute::type_list<T...>>
-    : CUTE_STL_NAMESPACE::integral_constant<size_t, sizeof...(T)>
-{};
-
-template <size_t I, class... T>
-struct tuple_element<I, const cute::type_list<T...>>
-{
-  using type = typename CUTE_STL_NAMESPACE::tuple_element<I, CUTE_STL_NAMESPACE::tuple<T...>>::type;
-};
-
-} // end namespace std
-
-#ifdef CUTE_STL_NAMESPACE_IS_CUDA_STD
-namespace std
-{
-
-#if defined(__CUDACC_RTC__)
-template <class... _Tp>
-struct tuple_size;
-
-template <size_t _Ip, class... _Tp>
-struct tuple_element;
-#endif
-
-template <class... T>
-struct tuple_size<cute::type_list<T...>>
-    : CUTE_STL_NAMESPACE::integral_constant<size_t, sizeof...(T)>
-{};
-
-template <size_t I, class... T>
-struct tuple_element<I, cute::type_list<T...>>
-{
-  using type = typename CUTE_STL_NAMESPACE::tuple_element<I, CUTE_STL_NAMESPACE::tuple<T...>>::type;
-};
-
-template <class... T>
-struct tuple_size<const cute::type_list<T...>>
-    : CUTE_STL_NAMESPACE::integral_constant<size_t, sizeof...(T)>
-{};
-
-template <size_t I, class... T>
-struct tuple_element<I, const cute::type_list<T...>>
-{
-  using type = typename CUTE_STL_NAMESPACE::tuple_element<I, CUTE_STL_NAMESPACE::tuple<T...>>::type;
-};
-
-} // end namespace std
-#endif // CUTE_STL_NAMESPACE_IS_CUDA_STD
diff --git a/lightllm-kernel/cutlass/include/cute/int_tuple.hpp b/lightllm-kernel/cutlass/include/cute/int_tuple.hpp
deleted file mode 100755
index 95d06bbdd..000000000
--- a/lightllm-kernel/cutlass/include/cute/int_tuple.hpp
+++ /dev/null
@@ -1,864 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>                      // CUTE_HOST_DEVICE
-#include <cute/container/array.hpp>             // cute::array
-#include <cute/container/tuple.hpp>             // cute::is_tuple
-#include <cute/numeric/integral_constant.hpp>   // cute::Int
-#include <cute/algorithm/tuple_algorithms.hpp>  // cute::transform
-
-/** IntTuple is an integer or a tuple of IntTuples.
- * This file holds utilities for working with IntTuples,
- * but does not hold a concrete concept or class of IntTuple.
- */
-
-namespace cute
-{
-
-// Implementation of get<0>(Integral).
-//   Even though is_tuple<Integral> is false and tuple_size<Integral> doesn't compile,
-//   CuTe defines rank(Integral) as 1, so it's useful for get<0>(Integral) to return its input
-template <size_t I, class T, __CUTE_REQUIRES(cute::is_integral<cute::remove_cvref_t<T>>::value)>
-CUTE_HOST_DEVICE constexpr
-decltype(auto)
-get(T&& t) noexcept
-{
-  static_assert(I == 0, "Index out of range");
-  return static_cast<T&&>(t);
-}
-
-// Custom recursive get for anything that implements get<I>(.) (for a single integer I).
-template <size_t I0, size_t I1, size_t... Is, class T>
-CUTE_HOST_DEVICE constexpr
-decltype(auto)
-get(T&& t) noexcept
-{
-  return get<I1, Is...>(get<I0>(static_cast<T&&>(t)));
-}
-
-//
-// rank
-//
-
-template <int... Is, class IntTuple>
-CUTE_HOST_DEVICE constexpr
-auto
-rank(IntTuple const& t)
-{
-  if constexpr (sizeof...(Is) == 0) {
-    if constexpr (is_tuple<IntTuple>::value) {
-      return Int<tuple_size<IntTuple>::value>{};
-    } else {
-      return Int<1>{};
-    }
-  } else {
-    return rank(get<Is...>(t));
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <class IntTuple>
-using rank_t = decltype(rank(declval<IntTuple>()));
-
-template <class IntTuple>
-static constexpr auto rank_v = rank_t<IntTuple>::value;
-
-//
-// shape
-//
-
-template <class IntTuple>
-CUTE_HOST_DEVICE constexpr
-auto
-shape(IntTuple const& s)
-{
-  if constexpr (is_tuple<IntTuple>::value) {
-    return transform(s, [](auto const& a) { return shape(a); });
-  } else {
-    return s;
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <int I, int... Is, class IntTuple>
-CUTE_HOST_DEVICE constexpr
-auto
-shape(IntTuple const& s)
-{
-  if constexpr (is_tuple<IntTuple>::value) {
-    return shape<Is...>(get<I>(s));
-  } else {
-    return get<I,Is...>(shape(s));
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-//
-// max
-//
-
-template <class T0, class... Ts>
-CUTE_HOST_DEVICE constexpr
-auto
-max(T0 const& t0, Ts const&... ts)
-{
-  if constexpr (is_tuple<T0>::value) {
-    return cute::max(cute::apply(t0, [](auto const&... a){ return cute::max(a...); }), ts...);
-  } else if constexpr (sizeof...(Ts) == 0) {
-    return t0;
-  } else {
-    return cute::max(t0, cute::max(ts...));
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-//
-// min
-//
-
-template <class T0, class... Ts>
-CUTE_HOST_DEVICE constexpr
-auto
-min(T0 const& t0, Ts const&... ts)
-{
-  if constexpr (is_tuple<T0>::value) {
-    return cute::min(cute::apply(t0, [](auto const&... a){ return cute::min(a...); }), ts...);
-  } else if constexpr (sizeof...(Ts) == 0) {
-    return t0;
-  } else {
-    return cute::min(t0, cute::min(ts...));
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-//
-// gcd
-//
-
-template <class T0, class... Ts>
-CUTE_HOST_DEVICE constexpr
-auto
-gcd(T0 const& t0, Ts const&... ts)
-{
-  if constexpr (is_tuple<T0>::value) {
-    return cute::gcd(cute::apply(t0, [](auto const&... a){ return cute::gcd(a...); }), ts...);
-  } else if constexpr (sizeof...(Ts) == 0) {
-    return t0;
-  } else {
-    return cute::gcd(t0, cute::gcd(ts...));
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-//
-// depth
-//
-
-template <int... Is, class IntTuple>
-CUTE_HOST_DEVICE constexpr
-auto
-depth(IntTuple const& t)
-{
-  if constexpr (sizeof...(Is) == 0) {
-    if constexpr (is_tuple<IntTuple>::value) {
-      return Int<1>{} + cute::apply(t, [](auto const&... v){ return cute::max(depth(v)...); });
-    } else {
-      return Int<0>{};
-    }
-  } else {
-    return depth(get<Is...>(t));
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <class Tuple>
-using depth_t = decltype(depth(declval<Tuple>()));
-
-template <class Tuple>
-static constexpr auto depth_v = depth_t<Tuple>::value;
-
-//
-// product
-//
-
-// Implementation of product as a function object
-struct Product
-{
-  template <class IntTuple>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  operator()(IntTuple const& a) const
-  {
-    if constexpr (is_tuple<IntTuple>::value) {
-      if constexpr (tuple_size<IntTuple>::value == 0) {
-        return Int<1>{};
-      } else {
-        return cute::transform_apply(a, Product{}, multiplies_unary_lfold{});
-      }
-    } else if constexpr (cute::is_integral<IntTuple>::value) {
-      return a;
-    }
-
-    CUTE_GCC_UNREACHABLE;
-  }
-};
-// Callable product function object
-CUTE_INLINE_CONSTANT Product product;
-
-// Return a rank(t) tuple @a result such that get<i>(@a result) = product(get<i>(@a t))
-template <class Tuple>
-CUTE_HOST_DEVICE constexpr
-auto
-product_each(Tuple const& t)
-{
-  return transform(wrap(t), product);
-}
-
-// Take the product of Tuple at the leaves of TupleG
-template <class Tuple, class TupleG>
-CUTE_HOST_DEVICE constexpr
-auto
-product_like(Tuple const& tuple, TupleG const& guide)
-{
-  return transform_leaf(guide, tuple, [](auto const& g, auto const& t) { return product(t); });
-}
-
-// Return the product of elements in a mode
-template <int... Is, class IntTuple>
-CUTE_HOST_DEVICE constexpr
-auto
-size(IntTuple const& a)
-{
-  if constexpr (sizeof...(Is) == 0) {
-    return product(a);
-  } else {
-    return size(get<Is...>(a));
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <class IntTuple>
-static constexpr auto size_v = decltype(size(declval<IntTuple>()))::value;
-
-//
-// sum
-//
-
-template <class IntTuple>
-CUTE_HOST_DEVICE constexpr
-auto
-sum(IntTuple const& a)
-{
-  if constexpr (is_tuple<IntTuple>::value) {
-    return cute::apply(a, [](auto const&... v){ return (Int<0>{} + ... + sum(v)); });
-  } else {
-    return a;
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-//
-// inner_product
-//
-
-template <class IntTupleA, class IntTupleB>
-CUTE_HOST_DEVICE constexpr
-auto
-inner_product(IntTupleA const& a, IntTupleB const& b)
-{
-  if constexpr (is_tuple<IntTupleA>::value && is_tuple<IntTupleB>::value) {
-    static_assert(tuple_size<IntTupleA>::value == tuple_size<IntTupleB>::value, "Mismatched ranks");
-    return transform_apply(a, b, [](auto const& x, auto const& y) { return inner_product(x,y); },
-                                 [](auto const&... v) { return (Int<0>{} + ... + v); });
-  } else {
-    return a * b;
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-//
-// ceil_div
-//
-
-template <class IntTupleA, class IntTupleB>
-CUTE_HOST_DEVICE constexpr
-auto
-ceil_div(IntTupleA const& a, IntTupleB const& b)
-{
-  if constexpr (is_tuple<IntTupleA>::value) {
-    if constexpr (is_tuple<IntTupleB>::value) {  // tuple tuple
-      static_assert(tuple_size<IntTupleA>::value >= tuple_size<IntTupleB>::value, "Mismatched ranks");
-      constexpr int R = tuple_size<IntTupleA>::value;        // Missing ranks in TupleB are implicitly 1
-      return transform(a, append<R>(b,Int<1>{}), [](auto const& x, auto const& y) { return ceil_div(x,y); });
-    } else {                                     // tuple int
-      auto const [result, rest] = fold(a, cute::make_tuple(cute::make_tuple(), b),
-        [] (auto const& init, auto const& ai) {
-          return cute::make_tuple(append(get<0>(init), ceil_div(ai, get<1>(init))), ceil_div(get<1>(init), ai));
-        });
-      return result;
-    }
-  } else
-  if constexpr (is_tuple<IntTupleB>::value) {    // int tuple
-    return ceil_div(a, product(b));
-  } else {
-    return (a + b - Int<1>{}) / b;
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-//
-// round_up
-//   Round @a a up to the nearest multiple of @a b.
-//   For negative numbers, rounds away from zero.
-//
-
-template <class IntTupleA, class IntTupleB>
-CUTE_HOST_DEVICE constexpr
-auto
-round_up(IntTupleA const& a, IntTupleB const& b)
-{
-  if constexpr (is_tuple<IntTupleA>::value && is_tuple<IntTupleB>::value) {
-    static_assert(tuple_size<IntTupleA>::value >= tuple_size<IntTupleB>::value, "Mismatched ranks");
-    constexpr int R = tuple_size<IntTupleA>::value;        // Missing ranks in TupleB are implicitly 1
-    return transform(a, append<R>(b,Int<1>{}), [](auto const& x, auto const& y) { return round_up(x,y); });
-  } else {
-    return ((a + b - Int<1>{}) / b) * b;
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-/** Division for Shapes
- * Case Tuple Tuple:
- *   Perform shape_div element-wise
- * Case Tuple Int:
- *   Fold the division of b across each element of a
- *   Example: shape_div((4,5,6),40) -> shape_div((1,5,6),10) -> shape_div((1,1,6),2) -> (1,1,3)
- * Case Int Tuple:
- *   Return shape_div(a, product(b))
- * Case Int Int:
- *   Enforce the divisibility condition a % b == 0 || b % a == 0 when possible
- *   Return a / b with rounding away from 0 (that is, 1 or -1 when a < b)
- */
-template <class IntTupleA, class IntTupleB>
-CUTE_HOST_DEVICE constexpr
-auto
-shape_div(IntTupleA const& a, IntTupleB const& b)
-{
-  if constexpr (is_tuple<IntTupleA>::value) {
-    if constexpr (is_tuple<IntTupleB>::value) {  // tuple tuple
-      static_assert(tuple_size<IntTupleA>::value == tuple_size<IntTupleB>::value, "Mismatched ranks");
-      return transform(a, b, [](auto const& x, auto const& y) { return shape_div(x,y); });
-    } else {                                     // tuple int
-      auto const [result, rest] = fold(a, cute::make_tuple(cute::make_tuple(), b),
-        [] (auto const& init, auto const& ai) {
-          return cute::make_tuple(append(get<0>(init), shape_div(ai, get<1>(init))), shape_div(get<1>(init), ai));
-        });
-      return result;
-    }
-  } else
-  if constexpr (is_tuple<IntTupleB>::value) {    // int tuple
-    return shape_div(a, product(b));
-  } else
-  if constexpr (is_static<IntTupleA>::value && is_static<IntTupleB>::value) {
-    static_assert(IntTupleA::value % IntTupleB::value == 0 || IntTupleB::value % IntTupleA::value == 0, "Static shape_div failure");
-    return C<shape_div(IntTupleA::value, IntTupleB::value)>{};
-  } else {                                       // int int
-    //assert(a % b == 0 || b % a == 0);          // Waive dynamic assertion
-    return a / b != 0 ? a / b : signum(a) * signum(b);  // Division with rounding away from zero
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-/** Minimum for Shapes
- */
-template <class IntTupleA, class IntTupleB>
-CUTE_HOST_DEVICE constexpr
-auto
-shape_min(IntTupleA const& a, IntTupleB const& b)
-{
-  if constexpr (is_tuple<IntTupleA>::value || is_tuple<IntTupleB>::value) {
-    static_assert(dependent_false<IntTupleA>, "Not implemented.");
-  } else
-  if constexpr (is_constant<1, IntTupleA>::value || is_constant<1, IntTupleB>::value) {
-    return Int<1>{};            // _1 is less than all other shapes, preserve static
-  } else {
-    return cute::min(a, b);
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-/** Return a tuple the same profile as A scaled by corresponding elements in B
- */
-template <class A, class B>
-CUTE_HOST_DEVICE constexpr
-auto
-elem_scale(A const& a, B const& b)
-{
-  if constexpr (is_tuple<A>::value) {
-    return transform(a, b, [](auto const& x, auto const& y) { return elem_scale(x,y); });
-  } else {
-    return a * product(b);
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-/** Test if two IntTuple have the same profile (hierarchical rank division)
- */
-template <class IntTupleA, class IntTupleB>
-CUTE_HOST_DEVICE constexpr
-auto
-congruent(IntTupleA const& a, IntTupleB const& b)
-{
-  return bool_constant<is_same<decltype(repeat_like(shape(a),_0{})),
-                               decltype(repeat_like(shape(b),_0{}))>::value>{};
-}
-
-template <class A, class B>
-using is_congruent = decltype(congruent(declval<A>(), declval<B>()));
-
-/** Test if two IntTuple have the similar profiles up to Shape A (hierarchical rank division)
- * weakly_congruent is a partial order on A and B: A <= B
- */
-template <class IntTupleA, class IntTupleB>
-CUTE_HOST_DEVICE constexpr
-auto
-weakly_congruent(IntTupleA const& a, IntTupleB const& b)
-{
-  if constexpr (is_tuple<IntTupleA>::value && is_tuple<IntTupleB>::value) {
-    if constexpr (tuple_size<IntTupleA>::value != tuple_size<IntTupleB>::value) {
-      return false_type{};
-    } else {
-      return transform_apply(a, b, [](auto const& x, auto const& y) { return weakly_congruent(x,y); },
-                                   [](auto const&... z) { return (true_type{} && ... && z); });
-    }
-  } else if constexpr (is_integral<IntTupleA>::value) {
-    return true_type{};
-  } else if constexpr (is_integral<IntTupleB>::value) {
-    return false_type{};
-  } else {
-    return weakly_congruent(shape(a), shape(b));
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <class A, class B>
-using is_weakly_congruent = decltype(weakly_congruent(declval<A>(), declval<B>()));
-
-/** Test if Shape A is compatible with Shape B:
- *    the size of A and B are the same, and
- *    any coordinate into A can also be used as a coordinate into B
- * Equivalently, the size of Shape B is the same as Shape A at each terminal of Shape A.
- * compatible is a partial order on A and B: A <= B
- */
-template <class IntTupleA, class IntTupleB>
-CUTE_HOST_DEVICE constexpr
-auto
-compatible(IntTupleA const& a, IntTupleB const& b)
-{
-  if constexpr (is_tuple<IntTupleA>::value && is_tuple<IntTupleB>::value) {
-    if constexpr (tuple_size<IntTupleA>::value != tuple_size<IntTupleB>::value) {
-      return false_type{};
-    } else {
-      return transform_apply(a, b, [](auto const& x, auto const& y) { return compatible(x,y); },
-                                   [](auto const&... z) { return (true_type{} && ... && z); });
-    }
-  } else if constexpr (is_integral<IntTupleA>::value) {
-    return a == size(b);
-  } else if constexpr (is_integral<IntTupleB>::value) {
-    return false_type{};
-  } else {
-    return compatible(shape(a), shape(b));
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <class A, class B>
-using is_compatible = decltype(compatible(declval<A>(), declval<B>()));
-
-/** Test if Shape A is evenly divided by Tiler B
- * @returns Static or dynamic boolean
- * @post if result is true_type, then
- *       size(a) == logical_divide(make_layout(shape(a)),b) will always compile
- *       and result in true_type.
- */
-template <class Shape, class Tiler>
-CUTE_HOST_DEVICE constexpr
-auto
-evenly_divides(Shape const& a, Tiler const& b)
-{
-  if constexpr (is_tuple<Tiler>::value) {
-    if constexpr (rank_v<Tiler> > rank_v<Shape>) {
-      return false_type{};
-    } else {
-      return transform_apply(b, a, [](auto const& x, auto const& y) { return evenly_divides(y,x); },
-                                   [](auto const&... z) { return (true_type{} && ... && z); });
-    }
-  } else {
-    return size(a) == size(b) * size(ceil_div(shape(a), b));
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-/** Replace the elements of Tuple B that are paired with an Int<0> with an Int<1>
- */
-template <class IntTupleA, class IntTupleB>
-CUTE_HOST_DEVICE constexpr
-auto
-filter_zeros(IntTupleA const& a, IntTupleB const& b)
-{
-  if constexpr (is_tuple<IntTupleA>::value) {
-    return transform(a, b, [](auto const& x, auto const& y) { return filter_zeros(x,y); });
-  } else if constexpr (is_constant<0, IntTupleA>::value) {
-    return repeat_like(b, Int<1>{});
-  } else {
-    return b;
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <class Tuple>
-CUTE_HOST_DEVICE constexpr
-auto
-filter_zeros(Tuple const& t)
-{
-  return filter_zeros(t, t);
-}
-
-//
-// Converters and constructors with arrays and params
-//
-
-/** Make an IntTuple of rank N from an Indexable array.
- * Access elements up to a dynamic index n, then use init (requires compatible types)
- * Consider cute::take<B,E> if all indexing is known to be valid
- * \code
- *   std::vector<int> a = {6,3,4};
- *   auto tup = make_int_tuple<5>(a, a.size(), 0)            // (6,3,4,0,0)
- * \endcode
- */
-template <int N, class Indexable, class T>
-CUTE_HOST_DEVICE constexpr
-auto
-make_int_tuple(Indexable const& t, int n, T const& init)
-{
-  static_assert(N > 0);
-  if constexpr (N == 1) {
-    return 0 < n ? t[0] : init;
-  } else {
-    return transform(make_seq<N>{}, [&](auto i) { return i < n ? t[i] : init; });
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-/** Fill the dynamic values of a Tuple with values from another Tuple
- * \code
- *   auto params = make_tuple(6,3,4);
- *   cute::tuple<Int<1>, cute::tuple<int, int, Int<3>>, int, Int<2>> result;
- *   fill_int_tuple_from(result, params);                    // (_1,(6,3,_3),4,_2)
- * \endcode
- */
-template <class Tuple, class TupleV>
-CUTE_HOST_DEVICE constexpr
-auto
-fill_int_tuple_from(Tuple& result, TupleV const& vals)
-{
-  return fold(result, vals, [](auto const& init, auto&& r) {
-    if constexpr (is_static<remove_cvref_t<decltype(r)>>::value) {       // Skip static elements of result
-      return init;
-    } else if constexpr (is_tuple<remove_cvref_t<decltype(r)>>::value) { // Recurse into tuples
-      return fill_int_tuple_from(r, init);
-    } else {                                                             // Assign and consume arg
-      static_assert(tuple_size<remove_cvref_t<decltype(init)>>::value > 0, "Not enough values to fill with!");
-      r = get<0>(init);
-      return remove<0>(init);
-    }
-
-    CUTE_GCC_UNREACHABLE;
-  });
-}
-
-/** Make a "Tuple" by filling in the dynamic values in order from the arguments
- * \code
- *   using result_t = cute::tuple<Int<1>, cute::tuple<int, int, Int<3>>, int, Int<2>>;
- *   auto result = make_int_tuple_from<result_t>(6,3,4);     // (_1,(6,3,_3),4,_2)
- * \endcode
- */
-template <class Tuple, class... Ts>
-CUTE_HOST_DEVICE constexpr
-Tuple
-make_int_tuple_from(Ts const&... ts)
-{
-  Tuple result = Tuple{};
-  fill_int_tuple_from(result, cute::make_tuple(ts...));
-  return result;
-}
-
-/** Convert a tuple to a flat homogeneous array of type T
- * \code
- *   auto tup = cute::make_tuple(Int<1>{}, cute::make_tuple(6,3,Int<3>{}),4,Int<2>{});
- *   cute::array<uint64_t,6> result = to_array<uint64_t>(tup);   // [1,6,3,3,4,2]
- * \endcode
- */
-template <class T = int64_t, class IntTuple>
-CUTE_HOST_DEVICE constexpr
-auto
-to_array(IntTuple const& t)
-{
-  auto flat_t = flatten_to_tuple(t);
-  constexpr int N = tuple_size<decltype(flat_t)>::value;
-  cute::array<T,N> result;
-  for_each(make_seq<N>{}, [&] (auto i) { result[i] = get<i>(flat_t); });
-  return result;
-}
-
-//
-// Comparison operators
-//
-
-//
-// There are many ways to compare tuple of elements and because CuTe is built
-//   on parameterizing layouts of coordinates, some comparisons are appropriate
-//   only in certain cases.
-//  -- lexicographical comparison [reverse, reflected, revref]   : Correct for coords in RowMajor Layout
-//  -- colexicographical comparison [reverse, reflected, revref] : Correct for coords in ColMajor Layout
-//  -- element-wise comparison [any,all]                         :
-// This can be very confusing. To avoid errors in selecting the appropriate
-//   comparison, op<|op<=|op>|op>= are *not* implemented for cute::tuple.
-//
-// When actually desiring to order coordinates, the user should map them to
-//   their indices within the Layout they came from:
-//      e.g.  layoutX(coordA) < layoutX(coordB)
-// That said, we implement the three most common ways to compare tuples below.
-//   These are implemented with slighly more explicit names than op<.
-//
-
-template <class IntTupleA, class IntTupleB>
-CUTE_HOST_DEVICE constexpr
-auto
-lex_less(IntTupleA const& a, IntTupleB const& b);
-
-template <class IntTupleA, class IntTupleB>
-CUTE_HOST_DEVICE constexpr
-auto
-colex_less(IntTupleA const& a, IntTupleB const& b);
-
-template <class IntTupleA, class IntTupleB>
-CUTE_HOST_DEVICE constexpr
-auto
-elem_less(IntTupleA const& a, IntTupleB const& b);
-
-namespace detail {
-
-template <size_t I, class TupleA, class TupleB>
-CUTE_HOST_DEVICE constexpr
-auto
-lex_less_impl(TupleA const& a, TupleB const& b)
-{
-  if constexpr (I == tuple_size<TupleB>::value) {
-    return cute::false_type{};    // Terminal: TupleB is exhausted
-  } else if constexpr (I == tuple_size<TupleA>::value) {
-    return cute::true_type{};     // Terminal: TupleA is exhausted, TupleB is not exhausted
-  } else {
-    return lex_less(get<I>(a), get<I>(b)) || (get<I>(a) == get<I>(b) && lex_less_impl<I+1>(a,b));
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <size_t I, class TupleA, class TupleB>
-CUTE_HOST_DEVICE constexpr
-auto
-colex_less_impl(TupleA const& a, TupleB const& b)
-{
-  if constexpr (I == tuple_size<TupleB>::value) {
-    return cute::false_type{};    // Terminal: TupleB is exhausted
-  } else if constexpr (I == tuple_size<TupleA>::value) {
-    return cute::true_type{};     // Terminal: TupleA is exhausted, TupleB is not exhausted
-  } else {
-    constexpr size_t A = tuple_size<TupleA>::value - 1 - I;
-    constexpr size_t B = tuple_size<TupleB>::value - 1 - I;
-    return colex_less(get<A>(a), get<B>(b)) || (get<A>(a) == get<B>(b) && colex_less_impl<I+1>(a,b));
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <size_t I, class TupleA, class TupleB>
-CUTE_HOST_DEVICE constexpr
-auto
-elem_less_impl(TupleA const& a, TupleB const& b)
-{
-  if constexpr (I == tuple_size<TupleA>::value) {
-    return cute::true_type{};     // Terminal: TupleA is exhausted
-  } else if constexpr (I == tuple_size<TupleB>::value) {
-    return cute::false_type{};    // Terminal: TupleA is not exhausted, TupleB is exhausted
-  } else {
-    return elem_less(get<I>(a), get<I>(b)) && elem_less_impl<I+1>(a,b);
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-} // end namespace detail
-
-// Lexicographical comparison
-
-template <class IntTupleA, class IntTupleB>
-CUTE_HOST_DEVICE constexpr
-auto
-lex_less(IntTupleA const& a, IntTupleB const& b)
-{
-  if constexpr (is_tuple<IntTupleA>::value && is_tuple<IntTupleB>::value) {
-    return detail::lex_less_impl<0>(a, b);
-  } else {
-    return a < b;
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <class T, class U>
-CUTE_HOST_DEVICE constexpr
-auto
-lex_leq(T const& t, U const& u) {
-  return !lex_less(u, t);
-}
-
-template <class T, class U>
-CUTE_HOST_DEVICE constexpr
-auto
-lex_gtr(T const& t, U const& u) {
-  return lex_less(u, t);
-}
-
-template <class T, class U>
-CUTE_HOST_DEVICE constexpr
-auto
-lex_geq(T const& t, U const& u) {
-  return !lex_less(t, u);
-}
-
-// Colexicographical comparison
-
-template <class IntTupleA, class IntTupleB>
-CUTE_HOST_DEVICE constexpr
-auto
-colex_less(IntTupleA const& a, IntTupleB const& b)
-{
-  if constexpr (is_tuple<IntTupleA>::value && is_tuple<IntTupleB>::value) {
-    return detail::colex_less_impl<0>(a, b);
-  } else {
-    return a < b;
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <class T, class U>
-CUTE_HOST_DEVICE constexpr
-auto
-colex_leq(T const& t, U const& u) {
-  return !colex_less(u, t);
-}
-
-template <class T, class U>
-CUTE_HOST_DEVICE constexpr
-auto
-colex_gtr(T const& t, U const& u) {
-  return colex_less(u, t);
-}
-
-template <class T, class U>
-CUTE_HOST_DEVICE constexpr
-auto
-colex_geq(T const& t, U const& u) {
-  return !colex_less(t, u);
-}
-
-// Elementwise [all] comparison
-
-template <class IntTupleA, class IntTupleB>
-CUTE_HOST_DEVICE constexpr
-auto
-elem_less(IntTupleA const& a, IntTupleB const& b)
-{
-  if constexpr (is_tuple<IntTupleA>::value && is_tuple<IntTupleB>::value) {
-    return detail::elem_less_impl<0>(a, b);
-  } else {
-    return a < b;
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <class T, class U>
-CUTE_HOST_DEVICE constexpr
-auto
-elem_leq(T const& t, U const& u) {
-  return !elem_less(u, t);
-}
-
-template <class T, class U>
-CUTE_HOST_DEVICE constexpr
-auto
-elem_gtr(T const& t, U const& u) {
-  return elem_less(u, t);
-}
-
-template <class T, class U>
-CUTE_HOST_DEVICE constexpr
-auto
-elem_geq(T const& t, U const& u) {
-  return !elem_less(t, u);
-}
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/layout.hpp b/lightllm-kernel/cutlass/include/cute/layout.hpp
deleted file mode 100755
index bc1b54efb..000000000
--- a/lightllm-kernel/cutlass/include/cute/layout.hpp
+++ /dev/null
@@ -1,2058 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>
-#include <cute/int_tuple.hpp>
-#include <cute/stride.hpp>
-#include <cute/underscore.hpp>
-#include <cute/numeric/arithmetic_tuple.hpp>
-#include <cute/numeric/integral_constant.hpp>
-#include <cute/numeric/integral_ratio.hpp>
-#include <cute/numeric/numeric_types.hpp>  // cute::sizeof_bits
-
-namespace cute
-{
-
-// Aliases
-
-template <class... Shapes>
-using Shape = cute::tuple<Shapes...>;
-
-template <class... Strides>
-using Stride = cute::tuple<Strides...>;
-
-template <class... Strides>
-using Step = cute::tuple<Strides...>;
-
-template <class... Coords>
-using Coord = cute::tuple<Coords...>;
-
-template <class... Layouts>
-using Tile = cute::tuple<Layouts...>;
-
-template <class... Ts>
-CUTE_HOST_DEVICE constexpr
-Shape<Ts...>
-make_shape(Ts const&... t) {
-  return {t...};
-}
-template <class... Ts>
-CUTE_HOST_DEVICE constexpr
-Stride<Ts...>
-make_stride(Ts const&... t) {
-  return {t...};
-}
-template <class... Ts>
-CUTE_HOST_DEVICE constexpr
-Step<Ts...>
-make_step(Ts const&... t) {
-  return {t...};
-}
-template <class... Ts>
-CUTE_HOST_DEVICE constexpr
-Coord<Ts...>
-make_coord(Ts const&... t) {
-  return {t...};
-}
-template <class... Ts>
-CUTE_HOST_DEVICE constexpr
-Tile<Ts...>
-make_tile(Ts const&... t)
-{
-  return {t...};
-}
-
-//
-// Layout
-//
-
-template <class Shape, class Stride = LayoutLeft::Apply<Shape> >
-struct Layout
-    : private cute::tuple<Shape, Stride>   // EBO for static layouts
-{
-  // Expensive in compilation time...
-  //static_assert(is_congruent<Shape, Stride>::value, "Shape and Stride must be congruent");
-
-  // NOTE: This defaults static Shapes/Strides correctly, but not dynamic
-  CUTE_HOST_DEVICE constexpr
-  Layout(Shape  const& shape  = {}, Stride const& stride = {})
-      : cute::tuple<Shape, Stride>(shape, stride)
-  {}
-
-  //
-  // Accessors
-  //
-
-  static constexpr int rank  = rank_v<Shape>;
-
-  CUTE_HOST_DEVICE constexpr
-  decltype(auto)
-  layout() {
-    return *this;
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  decltype(auto)
-  layout() const {
-    return *this;
-  }
-
-  template <int... I>
-  CUTE_HOST_DEVICE constexpr
-  decltype(auto)
-  shape() {
-    return get<0,I...>(static_cast<cute::tuple<Shape, Stride>&>(*this));
-  }
-
-  template <int... I>
-  CUTE_HOST_DEVICE constexpr
-  decltype(auto)
-  shape() const {
-    return get<0,I...>(static_cast<cute::tuple<Shape, Stride> const&>(*this));
-  }
-
-  template <int... I>
-  CUTE_HOST_DEVICE constexpr
-  decltype(auto)
-  stride() {
-    return get<1,I...>(static_cast<cute::tuple<Shape, Stride>&>(*this));
-  }
-
-  template <int... I>
-  CUTE_HOST_DEVICE constexpr
-  decltype(auto)
-  stride() const {
-    return get<1,I...>(static_cast<cute::tuple<Shape, Stride> const&>(*this));
-  }
-
-  //
-  // Mappings
-  //
-
-  // Map a logical coordinate to a linear index (Coord has no Underscore slice operators)
-  // OR
-  // Slice the layout and return the sublayout (Coord has an Underscore slice op)
-  template <class Coord>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  operator()(Coord const& coord) const {
-    if constexpr (has_underscore<Coord>::value) {
-      return slice(coord, *this);
-    } else {
-      return crd2idx(coord, shape(), stride());
-    }
-
-    CUTE_GCC_UNREACHABLE;
-  }
-
-  // Convenience function for multi-dimensional coordinates
-  template <class Coord0, class Coord1, class... Coords>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  operator()(Coord0 const& c0, Coord1 const& c1, Coords const&... cs) const {
-    return operator()(make_coord(c0,c1,cs...));
-  }
-
-  //
-  // Compose
-  //
-
-  template <class OtherLayout>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  compose(OtherLayout const& other) const {
-    return composition(*this, other);
-  }
-
-  template <class... Layouts>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  compose(Layouts const&... layouts) const {
-    return composition(*this, make_tile(layouts...));
-  }
-
-  template <class OtherShape>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  with_shape(OtherShape const& shape) const {
-    return composition(*this, make_layout(shape));
-  }
-
-  template <class... Shapes>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  with_shape(Shapes const&... shapes) const {
-    return composition(*this, make_layout(make_shape(shapes...)));
-  }
-
-  //
-  // Tile
-  //
-
-  template <class OtherLayout>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  tile(OtherLayout const& other) const {
-    return tiled_divide(*this, other);
-  }
-
-  template <class... Layouts>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  tile(Layouts const&... layouts) const {
-    return tiled_divide(*this, make_tile(layouts...));
-  }
-
-  //
-  // Utility
-  //
-
-  //
-  // Index to Coordinate
-  //
-
-  // NOTE: Only valid for compact layouts
-
-  // Return the (hierarchical) ND logical coordinate corresponding to the linear index
-  // @post crd2idx(@a result, shape(), stride()) == idx
-  // @post congruent(@a result, shape())
-  template <class IInt,
-            __CUTE_REQUIRES(is_integral<IInt>::value)>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  get_hier_coord(IInt const& idx) const {
-    return cute::idx2crd(idx, shape(), stride());
-  }
-
-  // Return the (flat) ND logical coordinate corresponding to the linear index
-  // @post crd2idx(@a result, shape(), stride()) == idx
-  // @post rank(@a result) == rank(shape()) && depth(@a result) == 1
-  template <class IInt,
-            __CUTE_REQUIRES(is_integral<IInt>::value)>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  get_flat_coord(IInt const& idx) const {
-    return cute::crd2crd(this->get_hier_coord(idx), shape(), repeat<rank>(Int<1>{}));
-  }
-
-  // Return the generalized column-major 1D logical coordinate corresponding to the linear index
-  // @post crd2idx(@a result, shape(), stride()) == idx
-  // @post is_integral<decltype(@a result)>::value
-  template <class IInt,
-            __CUTE_REQUIRES(is_integral<IInt>::value)>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  get_1d_coord(IInt const& idx) const {
-    return cute::crd2idx(this->get_hier_coord(idx), shape());
-  }
-
-  //
-  // Coordinate to Coordinate
-  //
-
-#if 0
-  // Return the (hierarchical) ND logical coordinate corresponding to the linear index
-  // @post congruent(@a result, shape())
-  template <class Coord>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  crd_2_hier_coord(Coord const& crd) const {
-    return cute::crd2crd(crd, shape(), shape());
-  }
-
-  // Return the (flat) ND logical coordinate corresponding to the linear index
-  // @post rank(@a result) == rank(shape()) && depth(@a result) == 1
-  template <class Coord>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  crd_2_flat_coord(Coord const& crd) const {
-    return cute::crd2crd(crd, shape(), product_each(shape()));
-  }
-
-  // Return the generalized column-major 1D logical coordinate corresponding to the linear index
-  // @post is_integral<decltype(@a result)>::value
-  template <class Coord>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  crd_2_1d_coord(Coord const& crd) const {
-    //return cute::crd2crd(crd, shape(), product(shape()));
-    return cute::crd2idx(crd, shape());
-  }
-#endif
-};
-
-// Equality, return a static or dynamic boolean
-template <class ShapeA, class StrideA,
-          class ShapeB, class StrideB>
-CUTE_HOST_DEVICE constexpr
-auto
-operator==(Layout<ShapeA,StrideA> const& layoutA, Layout<ShapeB,StrideB> const& layoutB)
-{
-  return layoutA.shape() == layoutB.shape() && layoutA.stride() == layoutB.stride();
-}
-
-template <class Layout>
-struct is_layout : false_type {};
-template <class Shape, class Stride>
-struct is_layout<Layout<Shape,Stride>> : true_type {};
-
-//
-// Layout construction
-//
-
-template <class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-make_layout(Shape const& shape, Stride const& stride)
-{
-  static_assert(is_tuple<Shape >::value || is_integral<Shape >::value);
-  static_assert(is_tuple<Stride>::value || is_integral<Stride>::value);
-  return Layout<Shape,Stride>(shape, stride);
-}
-
-template <class Shape>
-CUTE_HOST_DEVICE constexpr
-auto
-make_layout(Shape const& shape)
-{
-  static_assert(is_tuple<Shape >::value || is_integral<Shape >::value);
-  return make_layout(shape, compact_major<LayoutLeft>(shape));
-}
-
-//
-// Convenience tags for common layouts
-//
-
-template <class Shape>
-CUTE_HOST_DEVICE constexpr
-auto
-make_layout(Shape const& shape, LayoutLeft)
-{
-  return make_layout(shape, compact_major<LayoutLeft>(shape));
-}
-
-template <class Shape>
-CUTE_HOST_DEVICE constexpr
-auto
-make_layout(Shape const& shape, LayoutRight)
-{
-  return make_layout(shape, compact_major<LayoutRight>(shape));
-}
-
-//
-// Construct a layout from multiple layouts by concatenation
-//
-
-// One argument overload
-template <class Shape0, class Stride0>
-CUTE_HOST_DEVICE constexpr
-auto
-make_layout(Layout<Shape0,Stride0> const& layout0)
-{
-  return make_layout(make_shape (layout0.shape() ),
-                     make_stride(layout0.stride()));
-}
-
-// Two argument overload
-template <class Shape0, class Stride0,
-          class Shape1, class Stride1>
-CUTE_HOST_DEVICE constexpr
-auto
-make_layout(Layout<Shape0,Stride0> const& layout0,
-            Layout<Shape1,Stride1> const& layout1)
-{
-  return make_layout(make_shape (layout0.shape() , layout1.shape() ),
-                     make_stride(layout0.stride(), layout1.stride()));
-}
-
-// Var argument overload
-template <class Shape0, class Stride0,
-          class Shape1, class Stride1,
-          class... Shapes, class... Strides>
-CUTE_HOST_DEVICE constexpr
-auto
-make_layout(Layout<Shape0,Stride0> const& layout0,
-            Layout<Shape1,Stride1> const& layout1,
-            Layout<Shapes,Strides> const&... layouts)
-{
-  return make_layout(make_shape (layout0.shape() , layout1.shape() , layouts.shape()... ),
-                     make_stride(layout0.stride(), layout1.stride(), layouts.stride()...));
-}
-
-//
-// Advanced Layout constructions
-//
-
-// Make a compact layout with shape @a shape and strides following the order induced by @a order.
-// Dynamic values in @a order are ignored, considered large, and considered ordered from left to right.
-// Example:
-//   make_ordered_layout(Shape<_2,_2,_2,_2>{}, Step<_0,_2,_3,_1>{})
-//     ->  (_2,_2,_2,_2):(_1,_4,_8,_2)
-//   make_ordered_layout(make_shape(2,3,4,5), make_step(Int<2>{}, 67, 42, Int<50>{}))
-//     -> (2,3,4,5):(_1,10,30,2)
-template <class Shape, class Order>
-CUTE_HOST_DEVICE constexpr
-auto
-make_ordered_layout(Shape const& shape, Order const& order)
-{
-  return make_layout(shape, compact_order(shape, order));
-}
-
-// Make a compact layout with the same shape as @a layout
-//   and strides following the order induced by @a layout.stride().
-// Static-0 strides in the input @a layout are preserved in the output.
-// Example:
-//   make_layout_like(Layout<Shape<_2,_2,_2,_2>, Stride<_0,_2,_4,_1>>{})
-//     ->  (_2,_2,_2,_2):(_0,_2,_4,_1)
-//   make_layout_like(make_layout(make_shape(2,3,4,5), make_stride(Int<0>{},42,Int<1>{},Int<0>{})))
-//     -> (2,3,4,5):(_0,4,_1,_0)
-template <class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-make_layout_like(Layout<Shape,Stride> const& layout)
-{
-  return make_layout(layout.shape(),
-                     compact_order(filter_zeros(layout.stride(), layout.shape()), layout.stride()));
-}
-
-// Make a compact layout with the same shape as @a layout
-//   and strides following the order induced by @a layout.stride(),
-//   except mode-0 is always stride-1 and generated column-major.
-// The 0th mode is commonly used for MMA_Atoms or Copy_Atoms so this
-//   generates the 0th mode with LayoutLeft (preserving stride-0s) regardless of the reference layout
-template <class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-make_fragment_like(Layout<Shape,Stride> const& layout)
-{
-  constexpr int R = Layout<Shape,Stride>::rank;
-  if constexpr (R > 1 && is_static<Shape>::value) {
-    return tiled_product(make_layout(get<0>(layout.shape()),
-                                     compact_major<LayoutLeft>(filter_zeros(get<0>(layout.stride()), get<0>(layout.shape())))),
-                         make_ordered_layout(take<1,R>(layout.shape()), take<1,R>(layout.stride())));
-  } else {
-    return make_layout(layout.shape());
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <class Shape,
-          __CUTE_REQUIRES(is_tuple<Shape>::value || is_integral<Shape>::value)>
-CUTE_HOST_DEVICE constexpr
-auto
-make_fragment_like(Shape const& shape)
-{
-  return make_layout(shape);
-}
-
-//
-// Make an identity layout that maps a coordinate to itself
-//
-
-template <class Shape>
-CUTE_HOST_DEVICE constexpr
-auto
-make_identity_layout(Shape const& shape)
-{
-  return make_layout(shape, make_basis_like(shape));
-}
-
-//
-// Operations to manipulate Layouts like a tuple of pairs
-//
-
-// Return the Is...th sublayout.
-// For Is... = <I0,I1,...,IN>, equivalent to get<IN>(...get<I1>(get<I0>(layout)))
-template <size_t... Is, class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-get(Layout<Shape,Stride> const& layout)
-{
-  return make_layout(get<Is...>(layout.shape()),
-                     get<Is...>(layout.stride()));
-}
-
-// Return a new layout with only the modes in the range [B,E)
-template <int B, int E, class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-take(Layout<Shape,Stride> const& layout)
-{
-  static_assert(B < E, "take: empty range error");
-  static_assert(0 <= B && E <= Layout<Shape,Stride>::rank, "take: range out of bounds");
-  return make_layout(take<B,E>(layout.shape()),
-                     take<B,E>(layout.stride()));
-}
-
-// Return a new layout with only the modes Is... = <I0,I1,...,IN>
-template <int... Is, class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-select(Layout<Shape,Stride> const& layout)
-{
-  return make_layout(select<Is...>(layout.shape()),
-                     select<Is...>(layout.stride()));
-}
-
-// Return a layout with depth at most 1
-template <class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-flatten(Layout<Shape,Stride> const& layout)
-{
-  return make_layout(flatten(layout.shape()),
-                     flatten(layout.stride()));
-}
-
-// Return a layout whose profile is congruent to TargetProfile
-// @pre Input layout is flat, flatten(@a layout) == @a layout
-// @pre Input layout can be folded to profile, rank(@a layout) == rank(flatten(@a target_profile))
-// @post congruent(@a result, @a target_profile)
-template <class Shape, class Stride, class TargetProfile>
-CUTE_HOST_DEVICE constexpr
-auto
-unflatten(Layout<Shape,Stride> const& layout, TargetProfile const& target_profile)
-{
-  return make_layout(unflatten(layout.shape(),  target_profile),
-                     unflatten(layout.stride(), target_profile));
-}
-
-//
-// Utilities
-//
-
-// Return the sublayout of mode I...
-template <int... Is, class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-decltype(auto)
-layout(Layout<Shape,Stride> const& layout)
-{
-  if constexpr (sizeof...(Is) == 0) {
-    return layout;
-  } else {
-    return get<Is...>(layout);
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-// Return the shape of a mode
-template <int... Is, class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-decltype(auto)
-shape(Layout<Shape,Stride>& layout)
-{
-  return layout.template shape<Is...>();
-}
-
-template <int... Is, class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-decltype(auto)
-shape(Layout<Shape,Stride> const& layout)
-{
-  return layout.template shape<Is...>();
-}
-
-// Return the stride of a mode
-template <int... Is, class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-decltype(auto)
-stride(Layout<Shape,Stride>& layout)
-{
-  return layout.template stride<Is...>();
-}
-
-template <int... Is, class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-decltype(auto)
-stride(Layout<Shape,Stride> const& layout)
-{
-  return layout.template stride<Is...>();
-}
-
-// Return the number of elements in a mode
-template <int... Is, class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-size(Layout<Shape,Stride> const& layout)
-{
-  return size(shape<Is...>(layout));
-}
-
-// Return the number of modes
-template <int... Is, class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-rank(Layout<Shape,Stride> const& layout)
-{
-  return rank(shape<Is...>(layout));
-}
-
-// Return the depth of the layout
-template <int... Is, class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-depth(Layout<Shape,Stride> const& layout)
-{
-  return depth(shape<Is...>(layout));
-}
-
-// Return the codomain shape of a mode
-// @post size(coshape(@a a)) == cosize(@a a)
-// @return C Coordinate with smallest elements such that
-//           @a elem_less(sub_layout(c), C) for all c < size(@a sub_layout)
-//           where sub_layout = get<Is...>(layout).
-template <int... Is, class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-coshape(Layout<Shape,Stride> const& layout)
-{
-  // Protect against negative strides
-  auto abs_sub_layout = make_layout(shape<Is...>(layout),
-                                    transform_leaf(stride<Is...>(layout), abs_fn{}));
-  auto co_coord = as_arithmetic_tuple(abs_sub_layout(size(abs_sub_layout) - Int<1>{}));
-  return co_coord + repeat_like(co_coord, Int<1>{});
-}
-
-// Return the codomain size of a mode
-// @return M smallest integer such that
-//           @a sub_layout(c) < M for all c < size(@a sub_layout)
-//           where sub_layout = get<Is...>(layout).
-template <int... Is, class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-cosize(Layout<Shape,Stride> const& layout)
-{
-  return size(coshape<Is...>(layout));
-}
-
-template <class Layout>
-using cosize_t = decltype(cosize(declval<Layout>()));
-
-template <class Layout>
-static constexpr auto cosize_v = cosize_t<Layout>::value;
-
-// With crd2idx(coord, shape), makes sense to have crd2idx(coord, Layout) as well
-template <class Coord, class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-crd2idx(Coord const& c, Layout<Shape,Stride> const& layout)
-{
-  return crd2idx(c, layout.shape(), layout.stride());
-}
-
-//
-// Slice and Dice a layout
-//
-
-template <class Coord, class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-slice(Coord const& c, Layout<Shape,Stride> const& layout)
-{
-  return make_layout(slice(c, layout.shape()),
-                     slice(c, layout.stride()));
-}
-
-template <class Coord, class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-slice_and_offset(Coord const& c, Layout<Shape,Stride> const& layout)
-{
-  return cute::make_tuple(slice(c, layout), crd2idx(c, layout));
-}
-
-template <class Coord, class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-dice(Coord const& c, Layout<Shape,Stride> const& layout)
-{
-  return make_layout(dice(c, layout.shape()),
-                     dice(c, layout.stride()));
-}
-
-// Compute a pointer offset and (potentially modified) layout from a coordinate
-// This exists so it can be overloaded for ComposedLayout
-template <class Coord, class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-domain_offset(Coord const& coord, Layout<Shape,Stride> const& layout)
-{
-  return cute::make_tuple(layout, layout(coord));
-}
-
-//
-// Transform the modes of a layout
-//
-
-namespace detail {
-
-template <class Tuple, class F, int... I>
-CUTE_HOST_DEVICE constexpr
-auto
-transform_layout(Tuple const& t, F&& f, seq<I...>)
-{
-  return make_layout(f(get<I>(t))...);
-}
-
-template <class Tuple0, class Tuple1, class F, int... I, int... I0, int... I1>
-CUTE_HOST_DEVICE constexpr
-auto
-transform_layout(Tuple0 const& t0, Tuple1 const& t1, F&& f, seq<I...>, seq<I0...>, seq<I1...>)
-{
-  return make_layout(f(get<I>(t0),get<I>(t1))..., get<I0>(t0)..., get<I1>(t1)...);
-}
-
-} // end namespace detail
-
-template <class Tuple, class F>
-CUTE_HOST_DEVICE constexpr
-auto
-transform_layout(Tuple const& t, F&& f)
-{
-  return detail::transform_layout(t, f, make_seq<decltype(rank(t))::value>{});
-}
-
-template <class Tuple0, class Tuple1, class F>
-CUTE_HOST_DEVICE constexpr
-auto
-transform_layout(Tuple0 const& t0, Tuple1 const& t1, F&& f)
-{
-  constexpr int R0 = decltype(rank(t0))::value;
-  constexpr int R1 = decltype(rank(t1))::value;
-  constexpr int R  = (R0 < R1) ? R0 : R1;
-  return detail::transform_layout(t0, t1, f, make_seq<R>{}, make_range<R,R0>{}, make_range<R,R1>{});
-}
-
-//
-// Coalesce and Filter
-//
-
-namespace detail {
-
-// Look at each element and the front of the stack (in order of priority)
-// front(NewLayout)  get<I>(Layout)
-//      s0:d0           _1:d1     =>  continue
-//      _1:d0           s1:d1     =>  replace_front    s1:d1
-//      s0:s1*d1        s1:d1     =>  replace_front s0*s1:d1
-//      s0:d0           s1:d1     =>  prepend          s1:d1
-//
-// @pre OldShape and OldStride are flat
-template <int I, class OldShape, class OldStride, class NewShape, class NewStride>
-CUTE_HOST_DEVICE constexpr
-auto
-bw_coalesce(OldShape const& old_shape, OldStride const& old_stride,
-            NewShape const& new_shape, NewStride const& new_stride)
-{
-  if constexpr (I == -1) {
-    // Base case, we're done
-    if constexpr (is_constant<1, NewShape>::value) {
-      return Layout<_1,_0>{};
-    } else {
-      return Layout<NewShape,NewStride>{new_shape,new_stride};
-    }
-  } else if constexpr (is_constant<1, decltype(get<I>(old_shape))>::value) {
-    // shape<I>(layout) == _1, skip it and continue
-    return bw_coalesce<I-1>(old_shape, old_stride, new_shape, new_stride);
-  } else if constexpr (is_constant<1, NewShape>::value) {
-    // Replace our shape-1 with anything (Can only happen on input new_shape/new_stride)
-    return bw_coalesce<I-1>(old_shape, old_stride, get<I>(old_shape), get<I>(old_stride));
-  } else if constexpr (is_static<decltype(get<0>(new_shape))>::value &&
-                       is_constant<true, decltype(get<I>(old_shape) * get<I>(old_stride) == get<0>(new_stride))>::value) {
-    // Merge modes because the shapes and strides match
-    return bw_coalesce<I-1>(old_shape, old_stride,
-                            replace_front(new_shape,  get<I>(old_shape) * get<0>(new_shape)),
-                            replace_front(new_stride, get<I>(old_stride)));
-  } else {
-    // Can't replace or merge, so prepend a new mode
-    return bw_coalesce<I-1>(old_shape, old_stride,
-                            prepend(new_shape,  get<I>(old_shape)),
-                            prepend(new_stride, get<I>(old_stride)));
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-// cute::coalesce promises to not change the Layout as a function from integers to codomain.
-// It accomplishes this inside of the Layout's domain, but not always outside of the domain.
-//   Example: (_4,_1):(_1,_0) coalesces to _4:_1.
-// detail::coalesce_x preserves the Layout function inside its domain and outside.
-//
-// @post depth(@a result) <= 1
-// @post for all i, 0 <= i, @a layout(i) == @a result(i)
-template <class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-coalesce_x(Layout<Shape,Stride> const& layout)
-{
-  auto flat_shape  = flatten(layout.shape());
-  auto flat_stride = flatten(layout.stride());
-
-  constexpr int R = decltype(rank(flat_shape))::value;
-  if constexpr (is_constant<1, decltype(get<R-1>(flat_shape))>::value) {
-    return detail::bw_coalesce<R-2>(flat_shape, flat_stride,             Int<2>{}, get<R-1>(flat_stride));
-  } else {
-    return detail::bw_coalesce<R-2>(flat_shape, flat_stride, get<R-1>(flat_shape), get<R-1>(flat_stride));
-  }
-}
-
-// Apply coalesce_x at the terminals of trg_profile
-template <class Shape, class Stride, class IntTuple>
-CUTE_HOST_DEVICE constexpr
-auto
-coalesce_x(Layout<Shape,Stride> const& layout, IntTuple const& trg_profile)
-{
-  if constexpr (is_tuple<IntTuple>::value) {
-    static_assert(tuple_size<IntTuple>::value <= Layout<Shape,Stride>::rank);
-    return cute::transform_layout(layout, trg_profile, [](auto const& l, auto const& t) { return coalesce_x(l,t); });
-  } else {
-    return coalesce_x(layout);
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-} // end namespace detail
-
-// "Simplify" the layout by combining modes that are possible to combine
-// Does not respect the shape of the layout, but does preserve total size
-// @post size(@a result) == size(@a layout)
-// @post depth(@a result) <= 1
-// @post for all i, 0 <= i < size(@a layout), @a layout(i) == @a result(i)
-template <class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-coalesce(Layout<Shape,Stride> const& layout)
-{
-  auto flat_shape  = flatten(layout.shape());
-  auto flat_stride = flatten(layout.stride());
-
-  constexpr int R = decltype(rank(flat_shape))::value;
-  return detail::bw_coalesce<R-2>(flat_shape, flat_stride, get<R-1>(flat_shape), get<R-1>(flat_stride));
-}
-
-// Apply coalesce at the terminals of trg_profile
-template <class Shape, class Stride, class IntTuple>
-CUTE_HOST_DEVICE constexpr
-auto
-coalesce(Layout<Shape,Stride> const& layout, IntTuple const& trg_profile)
-{
-  if constexpr (is_tuple<IntTuple>::value) {
-    static_assert(tuple_size<IntTuple>::value <= Layout<Shape,Stride>::rank);
-    return transform_layout(layout, trg_profile, [](auto const& l, auto const& t) { return coalesce(l,t); });
-  } else {
-    return coalesce(layout);
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-// Combine static and dynamic modes of a shape.
-// @post size(@a result) == size(@a shape)
-// @post depth(@a result) <= 1
-template <class Shape>
-CUTE_HOST_DEVICE constexpr
-auto
-coalesce(Shape const& shape)
-{
-  static_assert(is_integral<Shape>::value || is_tuple<Shape>::value);
-
-  return cute::fold_first(flatten(shape), [](auto const& init, auto const& a) {
-    if constexpr (is_static<decltype(back(init))>::value == is_static<decltype(a)>::value) {
-      return replace_back(init, back(init) * a);  // Both static or both dynamic, coalesce and replace
-    } else {
-      return append(init, a);                     // Can't coalesce, so append
-    }
-  });
-}
-
-// Replace the modes in layout that have a 0-stride with a 1-size
-template <class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-filter_zeros(Layout<Shape,Stride> const& layout)
-{
-  return make_layout(filter_zeros(layout.stride(), layout.shape()), layout.stride());
-}
-
-// Replace the modes in layout that correspond to a 0 at the terminals of trg_profile with a 1-size
-template <class Shape, class Stride, class IntTuple>
-CUTE_HOST_DEVICE constexpr
-auto
-filter_zeros(Layout<Shape,Stride> const& layout, IntTuple const& trg_profile)
-{
-  return make_layout(filter_zeros(trg_profile, layout.shape()), layout.stride());
-}
-
-// Remove all of the 0-strides and 1-sizes
-// Return 1-shape if empty
-template <class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-filter(Layout<Shape,Stride> const& layout)
-{
-  return coalesce(filter_zeros(layout));
-}
-
-// Apply filter at the terminals of trg_profile
-template <class Shape, class Stride, class IntTuple>
-CUTE_HOST_DEVICE constexpr
-auto
-filter(Layout<Shape,Stride> const& layout, IntTuple const& trg_profile)
-{
-  if constexpr (is_tuple<IntTuple>::value) {
-    static_assert(tuple_size<IntTuple>::value <= Layout<Shape,Stride>::rank);
-    return transform_layout(layout, trg_profile, [](auto const& l, auto const& t) { return filter(l,t); });
-  } else {
-    return filter(layout);
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-//
-// Append, Prepend, Replace
-//
-
-template <int N, class ShapeA, class StrideA, class ShapeX = _1, class StrideX = _0>
-CUTE_HOST_DEVICE constexpr
-auto
-append(Layout<ShapeA,StrideA> const& layout,
-       Layout<ShapeX,StrideX> const& x = {})
-{
-  return make_layout(append<N>(layout.shape(),  x.shape()),
-                     append<N>(layout.stride(), x.stride()));
-}
-
-template <class ShapeA, class StrideA, class ShapeX = _1, class StrideX = _0>
-CUTE_HOST_DEVICE constexpr
-auto
-append(Layout<ShapeA,StrideA> const& layout,
-       Layout<ShapeX,StrideX> const& x = {})
-{
-  return make_layout(append(layout.shape(),  x.shape()),
-                     append(layout.stride(), x.stride()));
-}
-
-template <int N, class ShapeA, class StrideA, class ShapeX = _1, class StrideX = _0>
-CUTE_HOST_DEVICE constexpr
-auto
-prepend(Layout<ShapeA,StrideA> const& layout,
-        Layout<ShapeX,StrideX> const& x = {})
-{
-  return make_layout(prepend<N>(layout.shape(),  x.shape()),
-                     prepend<N>(layout.stride(), x.stride()));
-}
-
-template <class ShapeA, class StrideA, class ShapeX = _1, class StrideX = _0>
-CUTE_HOST_DEVICE constexpr
-auto
-prepend(Layout<ShapeA,StrideA> const& layout,
-        Layout<ShapeX,StrideX> const& x = {})
-{
-  return make_layout(prepend(layout.shape(),  x.shape()),
-                     prepend(layout.stride(), x.stride()));
-}
-
-template <int N, class ShapeA, class StrideA, class ShapeX, class StrideX>
-CUTE_HOST_DEVICE constexpr
-auto
-replace(Layout<ShapeA,StrideA> const& layout,
-        Layout<ShapeX,StrideX> const& x)
-{
-  return make_layout(replace<N>(layout.shape(),  x.shape()),
-                     replace<N>(layout.stride(), x.stride()));
-}
-
-template <int B, int E, class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-group(Layout<Shape,Stride> const& layout)
-{
-  return make_layout(group<B,E>(layout.shape()),
-                     group<B,E>(layout.stride()));
-}
-
-//
-// Composition of two layouts: lhs o rhs
-// @post compatible(rhs, result)
-// @post result(c) = lhs(rhs(c))
-//         for all c in the domain of rhs
-//
-
-namespace detail {
-
-template <class LShape, class LStride,
-          class RShape, class RStride>
-CUTE_HOST_DEVICE constexpr
-auto
-composition_impl(LShape const& lhs_shape, LStride const& lhs_stride,
-                 RShape const& rhs_shape, RStride const& rhs_stride)
-{
-  if constexpr (is_tuple<RShape>::value) {
-    // Apply the right-distributivity of Layout composition
-    return transform_layout(rhs_shape, rhs_stride, [&](auto const& s, auto const& d) {
-      return composition_impl(lhs_shape, lhs_stride, s, d);
-    });
-  } else
-  if constexpr (is_scaled_basis<RStride>::value) {
-    // Special case for a ScaledBasis stride
-    return composition_impl(basis_get(rhs_stride, lhs_shape), basis_get(rhs_stride, lhs_stride),
-                            rhs_shape, basis_value(rhs_stride));
-  } else
-  if constexpr (is_constant<0, RStride>::value) {
-    // Special case shortcut for any static stride-0
-    return Layout<RShape, RStride>{rhs_shape, rhs_stride};
-  } else
-  if constexpr (is_integral<decltype(lhs_shape)>::value) {
-    // Special case shortcut for any integral LShape
-    return Layout{rhs_shape, rhs_stride * lhs_stride};
-  } else
-  if constexpr (is_constant<1, RStride>::value) {
-    // Special case shortcut for any static stride-1
-    constexpr int R  = rank_v<LShape>;
-    auto result_shape_0  = take<0,R-1>(lhs_shape);
-
-    // Mod out the rhs_shape from the lhs_shape
-    auto const [result_shape_1, rest_shape]  = fold(result_shape_0, cute::make_tuple(cute::make_tuple(), rhs_shape),
-      [] (auto const& init, auto const& si) {
-        return cute::make_tuple(append(get<0>(init), shape_min(abs(si), get<1>(init))), shape_div(get<1>(init), abs(si)));
-      });
-
-    // Jump into coalesce and append (rest_shape, get<R-1>(lhs_stride))
-    return detail::bw_coalesce<R-2>(result_shape_1, lhs_stride, rest_shape, get<R-1>(lhs_stride));
-  } else {
-    // General case: integral RShape and RStride, tuple LShape and LStride
-    constexpr int R  = rank_v<LShape>;
-    auto result_shape_0  = take<0,R-1>(lhs_shape);
-    auto result_stride_0 = take<0,R-1>(lhs_stride);
-
-    // Divide out the rhs_stride from the lhs_shape
-    auto const [result_shape_1, rest_stride] = fold(result_shape_0, cute::make_tuple(cute::make_tuple(), rhs_stride),
-      [] (auto const& init, auto const& di) {
-        return cute::make_tuple(append(get<0>(init), shape_div(di, get<1>(init))), shape_div(get<1>(init), di));
-      });
-
-    // Apply any lhs_shape changes to the stride
-    auto result_stride_1 = elem_scale(result_stride_0, shape_div(result_shape_0, result_shape_1));
-
-    // Mod out the rhs_shape from the lhs_shape
-    auto const [result_shape_2, rest_shape] = fold(result_shape_1, cute::make_tuple(cute::make_tuple(), rhs_shape),
-      [] (auto const& init, auto const& si) {
-        return cute::make_tuple(append(get<0>(init), shape_min(abs(si), get<1>(init))), shape_div(get<1>(init), abs(si)));
-      });
-
-    // Jump into coalesce and append (rest_shape, rest_stride * get<R-1>(lhs_stride))
-    return detail::bw_coalesce<R-2>(result_shape_2, result_stride_1, rest_shape, rest_stride * get<R-1>(lhs_stride));
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-} // end namespace detail
-
-template <class LShape, class LStride,
-          class RShape, class RStride>
-CUTE_HOST_DEVICE constexpr
-auto
-composition(Layout<LShape,LStride> const& lhs,
-            Layout<RShape,RStride> const& rhs)
-{
-  auto coprofile = repeat_like(decltype(coshape(rhs)){}, Int<0>{});
-  auto flat_lhs = detail::coalesce_x(lhs, coprofile);
-  return detail::composition_impl(flat_lhs.shape(), flat_lhs.stride(), rhs.shape(), rhs.stride());
-}
-
-template <class LShape, class LStride, class Tiler>
-CUTE_HOST_DEVICE constexpr
-auto
-composition(Layout<LShape,LStride> const& lhs,
-            Tiler                  const& rhs)
-{
-  if constexpr (is_tuple<Tiler>::value) {
-    static_assert(tuple_size<Tiler>::value <= Layout<LShape,LStride>::rank);
-    // Drop any modes of lhs that aren't hit by rhs
-    return detail::transform_layout(lhs, rhs, [](auto const& l, auto const& r) { return composition(l,r); }, make_seq<tuple_size<Tiler>::value>{}, seq<>{}, seq<>{});
-  } else if constexpr (is_underscore<Tiler>::value) {
-    return lhs;
-  } else if constexpr (is_integral<Tiler>::value) {
-    auto flat_lhs = detail::coalesce_x(lhs);
-    return detail::composition_impl(flat_lhs.shape(), flat_lhs.stride(), rhs, Int<1>{});
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-//
-// Complement
-//
-// Build the complement of a layout.
-// @post size(@a result) >= @a cosize_hi / size(filter(@a layout)));
-// @post For all i in [1,size(@a result)),
-//           @a result(i) < @a result(i-1)
-//           For all j in [0, size(@a layout)),
-//               @a result(i) != @a layout(j)
-//
-
-namespace detail {
-
-// @pre @a layout has been filtered (flattened and no stride-0 or size-1 modes).
-template <class Shape, class Stride, class CoTarget>
-CUTE_HOST_DEVICE constexpr
-auto
-complement(Shape const& shape, Stride const& stride, CoTarget const& cotarget)
-{
-  if constexpr (is_constant<0, Stride>::value) {
-    // Special case for irreducible rank-1 stride-0 layout
-    return make_layout(coalesce(cotarget));
-  } else {
-    // General case
-    constexpr int R = rank_v<Shape>;
-    static_assert(R == 1 || is_static<Stride>::value,
-                  "Dynamic-stride complement only for rank-1 layouts");
-
-    // Should just be a sort and a fold...
-    // Then we could even handle dynamic strides (but they would destroy all static strides)
-    auto [shape_, stride_, result_shape_, result_stride] =
-      fold(make_seq<R-1>{},
-           cute::make_tuple(shape, stride, cute::make_tuple(), cute::make_tuple(Int<1>{})),
-           [](auto const& init, auto i)
-           {
-              auto [shape, stride, result_shape, result_stride] = init;
-              auto min_stride = cute::min(stride);
-              auto min_idx    = cute::find(stride, min_stride);
-              auto new_shape  = min_stride / get<i>(result_stride);
-              auto new_stride = min_stride * get<min_idx>(shape);
-              static_assert(not is_constant<0, decltype(new_shape)>::value, "Non-injective Layout detected in complement.");
-
-              return cute::make_tuple(remove<min_idx>(shape),              // Remove the min_idx from shape
-                                      remove<min_idx>(stride),             // Remove the min_idx from stride
-                                      append(result_shape , new_shape ),   // new shape  = min_stride / last_stride
-                                      append(result_stride, new_stride));  // new stride = min_stride * curr_shape
-            });
-
-    // Append the last shape mode
-    auto new_shape    = get<0>(stride_) / get<R-1>(result_stride);         // new shape  = min_stride / last_stride
-    static_assert(not is_constant<0, decltype(new_shape)>::value, "Non-injective Layout detected in complement.");
-    auto result_shape = append(result_shape_, new_shape);
-
-    // Compute the rest_shape and rest_stride
-    auto new_stride  = get<0>(stride_) * get<0>(shape_);                   // new stride = min_stride * curr_shape
-    auto rest_shape  = coalesce(ceil_div(cotarget, new_stride));
-    auto rest_stride = compact_major<LayoutLeft>(rest_shape, new_stride);
-
-    // Coalesce and append (rest_shape, rest_stride)
-    return coalesce(make_layout(make_shape (result_shape , rest_shape ),
-                                make_stride(result_stride, rest_stride)));
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-} // end namespace detail
-
-template <class Shape, class Stride, class CoTarget>
-CUTE_HOST_DEVICE constexpr
-auto
-complement(Layout<Shape,Stride> const& layout, CoTarget const& cotarget)
-{
-  auto filter_layout = filter(layout);
-  return detail::complement(filter_layout.shape(), filter_layout.stride(), shape(cotarget));
-}
-
-template <class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-complement(Layout<Shape,Stride> const& layout)
-{
-  auto filter_layout = filter(layout);
-  return detail::complement(filter_layout.shape(), filter_layout.stride(), cosize(filter_layout));
-}
-
-//
-// Right-Inverse and Left-Inverse
-//
-
-namespace detail {
-
-template <int NextStride, class Shape, class Stride, int... Is>
-CUTE_HOST_DEVICE constexpr
-auto
-inverse_seq(Shape const& shape, Stride const& stride, seq<Is...>)
-{
-  auto next_I = cute::find_if(stride, [](auto a) { return is_constant<NextStride, decltype(a)>{}; });
-
-  if constexpr (next_I == decltype(rank(stride))::value) {
-    // If not found, return current seq
-    return seq<Is...>{};
-  } else {
-    // auto next_stride = get<next_I>(shape) * get<next_I>(stride);
-    // NOTE: Needed for g++-7
-    using next_stride = decltype(get<next_I>(shape) * get<next_I>(stride));
-
-    if constexpr (is_static<next_stride>::value && !is_constant<NextStride, next_stride>::value) {
-      // If next_stride is static and unique, then continue
-      return inverse_seq<next_stride::value>(shape, stride, seq<Is..., next_I>{});
-    } else {
-      // Else return current seq + next_I
-      return seq<Is..., next_I>{};
-    }
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-} // end namespace detail
-
-//
-// Build the right-inverse of a layout
-// @pre is_static<Layout>
-// @result A layout @a result such that
-//    @a layout(@a result(i)) == i for all i < size(@a result)
-// @result A layout @a result such that
-//    composition(@a layout, @a result) is identical to make_layout(shape(result))
-//
-
-template <class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-right_inverse(Layout<Shape,Stride> const& layout)
-{
-  auto flat_layout = coalesce(layout);
-  auto astride = transform_leaf(flat_layout.stride(), abs_fn{});
-
-  // Find Int<1>{}, the starting stride, and follow the strides to gen inverse_seq
-  [[maybe_unused]] auto iseq = detail::inverse_seq<1>(flat_layout.shape(), astride, seq<>{});
-
-  if constexpr (iseq.size() == 0) {
-    return Layout<_1,_0>{};     // Empty case, nothing found
-  } else {
-    // Generate the corresponding new strides and construct
-    auto rstride = compact_major<LayoutLeft>(flat_layout.shape());
-    return make_layout(unwrap(transform(iseq, [&](auto i) { return shape<i>(flat_layout); })),
-                       unwrap(transform(iseq, [&](auto i) { return signum(stride<i>(flat_layout)) * get<i>(rstride); })));
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-CUTE_HOST_DEVICE constexpr
-auto
-right_inverse(Underscore const& _)
-{
-  return _;
-}
-
-//
-// Build the left-inverse of a layout
-// @pre is_static<Layout>
-// @pre @a layout is an injective function
-// @result A layout @a result such that
-//    @a result(@a layout(i)) == i for all i < size(@a layout)
-// @result A layout @a result such that
-//    composition(@a result, @a layout) is identical to make_layout(shape(layout))
-//
-
-template <class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-left_inverse(Layout<Shape,Stride> const& layout)
-{
-  return right_inverse(make_layout(layout, complement(layout)));
-}
-
-CUTE_HOST_DEVICE constexpr
-auto
-left_inverse(Underscore const& _)
-{
-  return _;
-}
-
-//
-// Max Common Layout
-//
-
-/* Return a layout that points to the maximum number of contiguous elements
- * that logically correspond in the layouts of @a a and @a b.
- *
- * @returns Layout R
- * @post For all 0 <= i < size(R), a(R(i)) == i and b(R(i)) == i
- */
-template <class ShapeA, class StrideA,
-          class ShapeB, class StrideB>
-CUTE_HOST_DEVICE constexpr
-auto
-max_common_layout(Layout<ShapeA,StrideA> const& a,
-                  Layout<ShapeB,StrideB> const& b)
-{
-  Layout inv_b  = right_inverse(b);
-  Layout common = coalesce(composition(a, inv_b));
-
-  // Keep only the static identity component of the common layout
-  if constexpr (is_static<decltype(shape<0>(common))>::value &&
-                is_constant<1, decltype(stride<0>(common))>::value) {
-    // Truncate to the size of the contiguous vector (static stride-1 mode)
-    return composition(inv_b, layout<0>(common));
-  } else {
-    return Layout<_1,_0>{};
-  }
-}
-
-/* Return Int<N> such that N is the maximum number of contiguous elements
- * that logically correspond in the layouts of @a a and @a b.
- *
- * @returns Int<N> with N >= 1
- * @post For all 0 <= n < N, a(b.get_1d_coord(n)) == n
- *       (NOTE: Problems with negative strides/coords in this post-condition)
- */
-template <class ShapeA, class StrideA,
-          class ShapeB, class StrideB>
-CUTE_HOST_DEVICE constexpr
-auto
-max_common_vector(Layout<ShapeA,StrideA> const& a,
-                  Layout<ShapeB,StrideB> const& b)
-{
-  Layout common = coalesce(composition(a, right_inverse(b)));
-
-  // Keep only the static identity component of the common layout
-  if constexpr (is_static<decltype(shape<0>(common))>::value &&
-                is_constant<1, decltype(stride<0>(common))>::value) {
-    // Truncate to the size of the contiguous vector (static stride-1 mode)
-    return shape<0>(common);
-  } else {
-    return Int<1>{};
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-/* Return a layout that distributes ShapeB over ShapeA.
- *
- * @returns Layout result
- * @post evenly_divides(@a b, size(@a result))
- * @post evenly_divides(@a a, @a result)
- * @post For all i,j in [0,size(@a result)) with i < j, @a result(i) < @a result(j). Surjective and Ordered.
- * @post composition(make_layout(shape(@a a)), @a result) is admissible
- * \code
- *   // Note that 6 does not divide this shape
- *   Layout layoutA = Layout<Shape<Int<15>,Int<14>>>{};
- *
- *   // Want to tile any 6 elements and don't care where they come from
- *   Layout dist = domain_distribute(layoutA, Int<6>{});   // (_3,_2):(_1,_15)
- *
- *   // Not guaranteed to find all 6 though...
- *   CUTE_STATIC_ASSERT_V(Int<6>{} == size(dist));
- *
- *   Layout result = zipped_divide(layoutA, dist);         // (_6,Rest)
- * \endcode
- */
-template <class ShapeA, class ShapeB>
-CUTE_HOST_DEVICE constexpr
-auto
-domain_distribute(ShapeA const& a, ShapeB const& b)
-{
-  static_assert(is_integral<ShapeB>::value);
-  static_assert(is_static<ShapeB>::value);
-
-  auto flat_shape_a = flatten(shape(a));
-
-  static_assert(is_static<decltype(flat_shape_a)>::value);
-
-  // Compute the shape of the result
-  auto [result_shape, b_rest] = cute::fold(flat_shape_a, cute::make_tuple(cute::tuple<>{}, size(b)), [](auto init, auto a_) {
-    auto [result, b_] = init;
-    auto gcd_ = gcd(a_, b_);
-    return cute::make_tuple(append(result, gcd_), b_ / gcd_);
-  });
-
-  // Compute the stride of the result
-  auto result_stride = compact_major<LayoutLeft>(flat_shape_a);
-
-  return coalesce(make_layout(result_shape, result_stride));
-}
-
-//
-// Kernel (Nullspace) of a Layout
-//
-
-namespace detail {
-
-template <int NextI, class Stride, int... Is>
-CUTE_HOST_DEVICE constexpr
-auto
-nullspace_seq(Stride const& stride, seq<Is...>)
-{
-  if constexpr (NextI == rank_v<Stride>) {
-    return seq<Is...>{};
-  } else
-  if constexpr (is_constant<0, decltype(get<NextI>(stride))>::value) {
-    return detail::nullspace_seq<NextI+1>(stride, seq<Is..., NextI>{});
-  } else {
-    return detail::nullspace_seq<NextI+1>(stride, seq<Is...>{});
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-} // end namespace detail
-
-//
-// Build the nullspace of a layout
-// @result A layout @a result such that
-//    size(@a result) == size(@a layout) / size(filter(@a layout))
-//    @a layout(@a result(i)) == 0 for all i < size(@a result)
-//
-
-template <class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-nullspace(Layout<Shape,Stride> const& layout)
-{
-  auto flat_layout = flatten(layout);
-
-  auto iseq = detail::nullspace_seq<0>(flat_layout.stride(), seq<>{});
-
-  if constexpr (iseq.size() == 0) {
-    return Layout<_1,_0>{};     // Empty case, nothing found
-  } else {
-    // Generate the corresponding new strides and construct
-    auto rstride = compact_major<LayoutLeft>(flat_layout.shape());
-    return make_layout(unwrap(transform(iseq, [&](auto i) { return shape<i>(flat_layout); })),
-                       unwrap(transform(iseq, [&](auto i) { return get<i>(rstride); })));
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-//
-// Zip
-//
-
-template <class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-zip(Layout<Shape,Stride> const& layout)
-{
-  return make_layout(zip(layout.shape()),
-                     zip(layout.stride()));
-}
-
-template <class TShape, class TStride,
-          class UShape, class UStride>
-CUTE_HOST_DEVICE constexpr
-auto
-zip(Layout<TShape,TStride> const& layoutA,
-    Layout<UShape,UStride> const& layoutB)
-{
-  return make_layout(zip(layoutA.shape(),  layoutB.shape()),
-                     zip(layoutA.stride(), layoutB.stride()));
-}
-
-//
-// Tile unzip
-//   Logical product and logical divide (on layouts) produce rank-2 results by design.
-//   Follow the profile of @a tile and zip the rank-2 modes located at the terminals into
-//   their own mode.
-//
-
-template <class LShape, class LStride, class Tiler>
-CUTE_HOST_DEVICE constexpr
-auto
-tile_unzip(Layout<LShape,LStride> const& layout,
-           Tiler                  const& tiler)
-{
-  return make_layout(zip2_by(layout.shape(),  tiler),
-                     zip2_by(layout.stride(), tiler));
-}
-
-//
-// Logical divide
-//
-
-template <class LShape, class LStride,
-          class TShape, class TStride>
-CUTE_HOST_DEVICE constexpr
-auto
-logical_divide(Layout<LShape,LStride> const& layout,
-               Layout<TShape,TStride> const& tiler)
-{
-  return composition(layout, make_layout(tiler, complement(tiler, shape(layout))));
-}
-
-template <class LShape, class LStride, class Tiler>
-CUTE_HOST_DEVICE constexpr
-auto
-logical_divide(Layout<LShape,LStride> const& layout,
-               Tiler                  const& tiler)
-{
-  if constexpr (is_tuple<Tiler>::value) {
-    static_assert(tuple_size<Tiler>::value <= Layout<LShape,LStride>::rank, "logical_divide: Too many modes in tiler.");
-    return transform_layout(layout, tiler, [](auto const& l, auto const& t) { return logical_divide(l,t); });
-  } else if constexpr (is_underscore<Tiler>::value) {
-    return layout;
-  } else if constexpr (is_integral<Tiler>::value) {
-    return logical_divide(layout, make_layout(tiler));
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-// Generalization of ceil_div for Layout lhs
-//   is effectively the "rest mode" of logical_divide.
-// Occurs in the calculation of gridDim, for example, for generalized tilers
-// Example:
-//   dim3 gridDim(size(ceil_div(problem_shape_M, cta_tiler_M)),
-//                size(ceil_div(problem_shape_N, cta_tiler_N)));
-// This does not consider compositional acceptance, so it may be the case that
-//   ceil_div produces a result while logical_divide (and friends) do not.
-template <class Target, class TShape, class TStride>
-CUTE_HOST_DEVICE constexpr
-auto
-ceil_div(Target                 const& target,
-         Layout<TShape,TStride> const& tiler)
-{
-  return shape(complement(tiler, shape(target)));
-}
-
-//
-// Convenience operator
-//   that produces layouts like ((BLK_A,BLK_B,...),(a,b,...,x,y))
-//   by gathering the tile modes and residuals into a rank-2 result.
-//
-
-template <class LShape, class LStride,
-          class Tiler>
-CUTE_HOST_DEVICE constexpr
-auto
-zipped_divide(Layout<LShape,LStride> const& layout,
-              Tiler                  const& tiler)
-{
-  return tile_unzip(logical_divide(layout, tiler), tiler);
-}
-
-// Same as zipped_divide, but unpacks the second mode: ((BLK_A,BLK_B,...),a,b,...,x,y)
-template <class LShape, class LStride,
-          class Tiler>
-CUTE_HOST_DEVICE constexpr
-auto
-tiled_divide(Layout<LShape,LStride> const& layout,
-             Tiler                  const& tiler)
-{
-  auto result = zipped_divide(layout, tiler);
-
-  auto R1 = rank<1>(result);
-  return result(_, repeat<R1>(_));
-}
-
-// Same as zipped_divide, but unpacks both modes: (BLK_A,BLK_B,...,a,b,...,x,y)
-template <class LShape, class LStride,
-          class Tiler>
-CUTE_HOST_DEVICE constexpr
-auto
-flat_divide(Layout<LShape,LStride> const& layout,
-            Tiler                  const& tiler)
-{
-  auto result = zipped_divide(layout, tiler);
-
-  auto R0 = rank<0>(result);
-  auto R1 = rank<1>(result);
-  return result(repeat<R0>(_), repeat<R1>(_));
-}
-
-//
-// Logical product
-//
-
-template <class LShape, class LStride,
-          class TShape, class TStride>
-CUTE_HOST_DEVICE constexpr
-auto
-logical_product(Layout<LShape,LStride> const& block,
-                Layout<TShape,TStride> const& tiler)
-{
-  return make_layout(block, composition(complement(block, size(block)*cosize(tiler)), tiler));
-}
-
-template <class LShape, class LStride, class Tiler>
-CUTE_HOST_DEVICE constexpr
-auto
-logical_product(Layout<LShape,LStride> const& block,
-                Tiler                  const& tiler)
-{
-  if constexpr (is_tuple<Tiler>::value) {
-    static_assert(tuple_size<Tiler>::value <= Layout<LShape,LStride>::rank, "logical_product: Too many modes in tiler.");
-    return transform_layout(block, tiler, [](auto const& l, auto const& t) { return logical_product(l,t); });
-  } else if constexpr (is_underscore<Tiler>::value) {
-    return block;
-  } else if constexpr (is_integral<Tiler>::value) {
-    return logical_product(block, make_layout(tiler));
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-//
-// Convenience operator
-//   that produces layouts like ((BLK_A,BLK_B,...),(a,b,...,x,y))
-//   by gathering the block modes and products into a rank-2 result.
-//
-
-template <class LShape, class LStride,
-          class Tiler>
-CUTE_HOST_DEVICE constexpr
-auto
-zipped_product(Layout<LShape,LStride> const& block,
-               Tiler                  const& tiler)
-{
-  return tile_unzip(logical_product(block, tiler), tiler);
-}
-
-// Same as zipped_product, but unpacks the second mode: ((BLK_A,BLK_B,...),a,b,...,x,y)
-template <class LShape, class LStride,
-          class Tiler>
-CUTE_HOST_DEVICE constexpr
-auto
-tiled_product(Layout<LShape,LStride> const& block,
-              Tiler                  const& tiler)
-{
-  auto result = zipped_product(block, tiler);
-
-  auto R1 = rank<1>(result);
-  return result(_, repeat<R1>(_));
-}
-
-// Same as zipped_product, but unpacks both modes: (BLK_A,BLK_B,...,a,b,...,x,y)
-template <class LShape, class LStride,
-          class Tiler>
-CUTE_HOST_DEVICE constexpr
-auto
-flat_product(Layout<LShape,LStride> const& block,
-             Tiler                  const& tiler)
-{
-  auto result = zipped_product(block, tiler);
-
-  auto R0 = rank<0>(result);
-  auto R1 = rank<1>(result);
-  return result(repeat<R0>(_), repeat<R1>(_));
-}
-
-//
-// Rank-sensitive products
-//
-
-// blocked_product -- Reproduce a block over a tiler.
-// Think of every element of "tiler" as a "block"
-//   and return the layout of the resulting structure.
-// @post rank(@a result) == cute::max(rank(@a block), rank(@a tiler))
-template <class TShape, class TStride,
-          class UShape, class UStride>
-CUTE_HOST_DEVICE constexpr
-auto
-blocked_product(Layout<TShape,TStride> const& block,
-                Layout<UShape,UStride> const& tiler)
-{
-  constexpr int R = cute::max(rank_v<TShape>, rank_v<UShape>);
-
-  auto result = logical_product(append<R>(block), append<R>(tiler));
-
-  return coalesce(zip(get<0>(result), get<1>(result)), tuple_repeat<R>(Int<1>{}));
-}
-
-// raked_product -- Reproduce a block over a tiler with block-interleaving.
-// Think of every element of "tiler" as a "block", interleave those blocks,
-//   and return the layout of the resulting structure.
-// @post rank(@a result) == cute::max(rank(@a block), rank(@a tiler))
-template <class TShape, class TStride,
-          class UShape, class UStride>
-CUTE_HOST_DEVICE constexpr
-auto
-raked_product(Layout<TShape,TStride> const& block,
-              Layout<UShape,UStride> const& tiler)
-{
-  constexpr int R = cute::max(rank_v<TShape>, rank_v<UShape>);
-
-  auto result = logical_product(append<R>(block), append<R>(tiler));
-
-  return coalesce(zip(get<1>(result), get<0>(result)), tuple_repeat<R>(Int<1>{}));
-}
-
-// tile_to_shape -- Perform a product of a layout so that the result matches a target shape.
-// This is similar to blocked_product, but specifies the result shape instead of the
-//   product shape, which is more convenient in certain circumstances.
-// @param block The layout to repeat
-// @param trg_shape The target shape of the result
-// @param ord_shape The order of the modes of @a trg_shape to tile @a layout with.
-//                  Defaults to GenColMajor, so @a layout will repeat
-//                    across the first mode first, the second mode second, etc
-//                  E.g. Step<_2,_1,_3> will cause @a layout to repeat
-//                    across the second mode first, the first mode second, and the third mode last.
-// @pre rank(@a block) <= rank(@a trg_shape)
-// @post compatible(@a trg_shape, shape(@a result))
-template <class Shape, class Stride,
-          class TrgShape, class ModeOrder = LayoutLeft>
-CUTE_HOST_DEVICE constexpr
-auto
-tile_to_shape(Layout<Shape,Stride> const& block,
-              TrgShape             const& trg_shape,
-              ModeOrder            const& ord_shape = {})
-{
-  CUTE_STATIC_ASSERT_V(rank(block) <= rank(trg_shape), "Rank of layout must be <= rank of target shape.");
-  constexpr int R = rank_v<TrgShape>;
-
-  auto padded_block = append<R>(block);
-
-  auto block_shape  = product_each(shape(padded_block));
-  auto target_shape = product_each(shape(trg_shape));
-
-  // Assert proper division
-  if constexpr (is_static<decltype(target_shape)>::value) {
-    CUTE_STATIC_ASSERT_V(evenly_divides(target_shape, block_shape),
-                         "tile_to_shape: block shape does not divide the target shape.");
-  }
-
-  auto product_shape = ceil_div(target_shape, block_shape);
-
-  return coalesce(blocked_product(padded_block, make_ordered_layout(product_shape, ord_shape)), product_shape);
-}
-
-//
-// Upcast
-//   For stride-1 mode, divide size by N. Divide all other strides by N.
-//
-
-template <int N, class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-upcast(Shape const& shape, Stride const& stride)
-{
-  if constexpr (is_tuple<Shape>::value) {                  // tuple stride
-    return transform_layout(shape, stride, [](auto const& s, auto const& d) { return upcast<N>(s,d); });
-  } else if constexpr (is_constant<0, Stride>::value) {    // static-0 stride
-    return Layout<Shape,Stride>{shape,stride};
-  } else if constexpr (is_static<Stride>::value) {         // static stride
-    return make_layout(shape_div(shape,  shape_div(Int<N>{}, abs(stride))),
-                       shape_div(stride, Int<N>{}));
-  } else {                                                 // dynamic stride
-    // assume dynamic strides are larger than N and divisible
-    // assert(stride % N == 0);
-    return make_layout(shape, safe_div(stride, Int<N>{}));
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <int N, class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-upcast(Layout<Shape,Stride> const& layout)
-{
-  return upcast<N>(layout.shape(), layout.stride());
-}
-
-//
-// Downcast
-//   For stride-1 mode, multiply size by N. Multiply all other strides by N.
-//
-
-template <int N, class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-downcast(Shape const& shape, Stride const& stride)
-{
-  if constexpr (is_tuple<Shape>::value) {
-    return transform_layout(shape, stride, [](auto const& s, auto const& d) { return downcast<N>(s,d); });
-  } else if constexpr (is_constant<1, Stride>::value || is_constant<-1, Stride>::value) {
-    return make_layout(shape * Int<N>{}, stride);
-  } else {
-    return make_layout(shape, stride * Int<N>{});
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <int N, class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-downcast(Layout<Shape,Stride> const& layout)
-{
-  CUTE_STATIC_ASSERT(has_int1<Stride>::value, "Downcast requires adjacent elements");
-  return downcast<N>(layout.shape(), layout.stride());
-}
-
-//
-// Recast
-//
-
-template <class OldType, class NewType,
-          class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-recast_layout(Layout<Shape,Stride> const& layout)
-{
-  using scale = decltype(trait_ratio(sizeof_bits<NewType>{}, sizeof_bits<OldType>{}));
-  if constexpr (scale::num == 1 && scale::den == 1) {
-    return layout;
-  }
-  else if constexpr (scale::num == 1) {
-    return downcast<scale::den>(layout);
-  }
-  else if constexpr (scale::den == 1) {
-    return upcast<scale::num>(layout);
-  }
-  else {
-    static_assert(dependent_false<scale>, "Recast not supported.");
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-// Determine the maximum alignment of a Layout.
-// The maximum alignment is the largest N for which upcast<N>(layout) will compile.
-//   upcast<N>(layout) compiles when the static shapes and strides pass divisibility checks.
-//   Therefore, upcast<M>(layout) will also compile for all divisors M of N.
-// Note that this only considers the static shapes and strides of the Layout
-//   in symmetry with upcast<N> only checking against static shapes and strides and assuming all
-//   dynamic shapes and strides are large and multiples of N.
-template <class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-max_alignment(Layout<Shape,Stride> const& layout)
-{
-  auto flat_layout   = coalesce(layout);
-  auto static_shape  = transform( shape(flat_layout), [](auto s){ return conditional_return<is_static<decltype(s)>::value>(s, Int<1>{}); });
-  auto static_stride = transform(stride(flat_layout), [](auto d){ return conditional_return<is_static<decltype(d)>::value>(d, Int<0>{}); });
-  auto filter_layout = make_layout(static_shape, static_stride);
-  auto permuted = logical_divide(filter_layout, right_inverse(filter_layout));
-  return gcd(size<0>(permuted), stride<1>(permuted));
-}
-
-//
-// Display utilities
-//
-
-template <class Shape, class Stride>
-CUTE_HOST_DEVICE void print(Layout<Shape,Stride> const& layout)
-{
-  print(layout.shape()); print(":"); print(layout.stride());
-}
-
-#if !defined(__CUDACC_RTC__)
-template <class Shape, class Stride>
-CUTE_HOST std::ostream& operator<<(std::ostream& os, Layout<Shape,Stride> const& layout)
-{
-  return os << shape(layout) << ":" << stride(layout);
-}
-#endif
-
-// Generic 2D Layout to console table
-template <class Layout>
-CUTE_HOST_DEVICE
-void
-print_layout(Layout const& layout)  // (m,n) -> idx
-{
-  CUTE_STATIC_ASSERT_V(rank(layout) == Int<2>{});
-
-  int idx_width = num_digits(cosize(layout)) + 2;
-  const char* delim = "+-----------------------";
-
-  print(layout); print("\n");
-
-  // Column indices
-  print("    ");
-  for (int n = 0; n < size<1>(layout); ++n) { printf("  %*d ", idx_width-2, n); }
-  printf("\n");
-
-  // Print out A m-by-n
-  for (int m = 0; m < size<0>(layout); ++m) {
-    // Header
-    print("    ");
-    for (int n = 0; n < size<1>(layout); ++n) { printf("%.*s", idx_width+1, delim); }
-    printf("+\n");
-    // Values
-    printf("%2d  ", m);  // Row indices
-    for (int n = 0; n < size<1>(layout); ++n) { printf("| %*d ", idx_width-2, int(layout(m,n))); }
-    printf("|\n");
-  }
-  // Footer
-  print("    ");
-  for (int n = 0; n < size<1>(layout); ++n) { printf("%.*s", idx_width+1, delim); }
-  printf("+\n");
-}
-
-// Generic ThrVal 2D Layout to console table
-template <class Layout, class ThrID>
-CUTE_HOST_DEVICE
-void
-print_layout(Layout const& layout, ThrID const& thrid)  // (m,n) -> (tid,vid)  and  tid -> thr_idx
-{
-  CUTE_STATIC_ASSERT_V(rank(layout) == Int<2>{});
-
-  print(layout); print("\n");
-  print(thrid);  print("\n");
-
-  // Print out m-by-n
-  for (int m = 0; m < size<0>(layout); ++m) {
-    // Header
-    for (int n = 0; n < size<1>(layout); ++n) printf("+------");
-    printf("+\n");
-    // Values
-    for (int n = 0; n < size<1>(layout); ++n) printf("|%03d-%02d", int(thrid(layout(m,n) % size(thrid))), int(layout(m,n) / size(thrid)));
-    printf("|\n");
-  }
-  // Footer
-  for (int n = 0; n < size<1>(layout); ++n) printf("+------");
-  printf("+\n");
-}
-
-struct TikzColor_White {
-  CUTE_HOST_DEVICE char const*
-  operator()(int idx) const {
-    return "white";
-  }
-};
-
-struct TikzColor_BWx8 {
-  CUTE_HOST_DEVICE char const*
-  operator()(int idx) const {
-    static char const* color_map[8] = {"black!00", "black!40", "black!20", "black!60",
-                                       "black!10", "black!50", "black!30", "black!70"};
-    return color_map[idx % 8];
-  }
-};
-
-struct TikzColor_TV {
-  CUTE_HOST_DEVICE char const*
-  operator()(int tid, int vid) const {
-    static char const* color_map[8] = {"{rgb,255:red,175;green,175;blue,255}",
-                                       "{rgb,255:red,175;green,255;blue,175}",
-                                       "{rgb,255:red,255;green,255;blue,175}",
-                                       "{rgb,255:red,255;green,175;blue,175}",
-                                       "{rgb,255:red,210;green,210;blue,255}",
-                                       "{rgb,255:red,210;green,255;blue,210}",
-                                       "{rgb,255:red,255;green,255;blue,210}",
-                                       "{rgb,255:red,255;green,210;blue,210}"};
-    return color_map[tid % 8];
-  }
-};
-
-// Generic 2D Layout to LaTeX printer
-template <class LayoutA, class TikzColorFn = TikzColor_BWx8>
-CUTE_HOST_DEVICE
-void
-print_latex(LayoutA const& layout_a,   // (m,n) -> idx
-            TikzColorFn color = {})    // lambda(idx) -> tikz color string
-{
-  CUTE_STATIC_ASSERT_V(rank(layout_a) <= Int<2>{});
-  auto layout = append<2>(layout_a, Layout<_1,_0>{});
-
-  // Commented print(layout)
-  printf("%% Layout: "); print(layout); printf("\n");
-  // Header
-  printf("\\documentclass[convert]{standalone}\n"
-         "\\usepackage{tikz}\n\n"
-         "\\begin{document}\n"
-         "\\begin{tikzpicture}[x={(0cm,-1cm)},y={(1cm,0cm)},every node/.style={minimum size=1cm, outer sep=0pt}]\n\n");
-
-  // Layout
-  for (int i = 0; i < size<0>(layout); ++i) {
-    for (int j = 0; j < size<1>(layout); ++j) {
-      int idx = layout(i,j);
-      printf("\\node[fill=%s] at (%d,%d) {%d};\n",
-             color(idx), i, j, idx);
-    }
-  }
-  // Grid
-  printf("\\draw[color=black,thick,shift={(-0.5,-0.5)}] (0,0) grid (%d,%d);\n\n",
-         int(size<0>(layout)), int(size<1>(layout)));
-  // Labels
-  for (int i =  0, j = -1; i < size<0>(layout); ++i) {
-    printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", i, j, i);
-  }
-  for (int i = -1, j =  0; j < size<1>(layout); ++j) {
-    printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", i, j, j);
-  }
-
-  // Footer
-  printf("\\end{tikzpicture}\n"
-         "\\end{document}\n");
-}
-
-// Generic ThrVal 2D Layout to LaTeX TikZ
-template <class Layout, class ThrID, class TikzColorFn = TikzColor_TV>
-CUTE_HOST_DEVICE
-void
-print_latex(Layout const& layout,    // (m,n) -> (tid,vid)
-            ThrID  const& thr,       // tid -> thr_idx
-            TikzColorFn color = {})  // lambda(thr_idx,val_idx) -> tikz color string
-{
-  CUTE_STATIC_ASSERT_V(rank(layout) == Int<2>{});
-
-  // Commented prints
-  printf("%% Layout: "); print(layout); printf("\n");
-  printf("%% ThrID : "); print(thr);  printf("\n");
-  // Header
-  printf("\\documentclass[convert]{standalone}\n"
-         "\\usepackage{tikz}\n\n"
-         "\\begin{document}\n"
-         "\\begin{tikzpicture}[x={(0cm,-1cm)},y={(1cm,0cm)},every node/.style={minimum size=1cm, outer sep=0pt}]\n\n");
-
-  // Layout
-  for (int i = 0; i < size<0>(layout); ++i) {
-    for (int j = 0; j < size<1>(layout); ++j) {
-      int thrid   = layout(i,j) % size(thr);
-      int val_idx = layout(i,j) / size(thr);
-      int thr_idx = thr(thrid);
-
-      printf("\\node[fill=%s] at (%d,%d) {\\shortstack{T%d \\\\ V%d}};\n",
-             color(thr_idx, val_idx),
-             i, j,
-             thr_idx, val_idx);
-    }
-  }
-  // Grid
-  printf("\\draw[color=black,thick,shift={(-0.5,-0.5)}] (0,0) grid (%d,%d);\n\n",
-         int(size<0>(layout)), int(size<1>(layout)));
-  // Labels
-  for (int i = 0, j = -1; i < size<0>(layout); ++i) {
-    printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", i, j, i);
-  }
-  for (int j = 0, i = -1; j < size<1>(layout); ++j) {
-    printf("\\node at (%d,%d) {\\Large{\\texttt{%d}}};\n", i, j, j);
-  }
-
-  // Footer
-  printf("\\end{tikzpicture}\n"
-         "\\end{document}\n");
-}
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/layout_composed.hpp b/lightllm-kernel/cutlass/include/cute/layout_composed.hpp
deleted file mode 100755
index 3e5f83627..000000000
--- a/lightllm-kernel/cutlass/include/cute/layout_composed.hpp
+++ /dev/null
@@ -1,652 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>                     // CUTE_HOST_DEVICE, CUTE_GCC_UNREACHABLE
-#include <cute/layout.hpp>                     // cute::tuple
-#include <cute/numeric/integral_constant.hpp>  // cute::true_type, cute::false_type, cute::Int
-
-/* This implements a ComposedLayout of the form
- *   LayoutA o Offset o LayoutB
- * and is useful in cases where composition() does not or cannot apply to LayoutA and LayoutB.
- * For example, when the "divisibility condition" in shape_div is violated in composition(LayoutA, LayoutB).
- *
- * This ComposedLayout provides similar functionality to Layout including tiling, partitioning,
- * coordinate-to-index mapping and layout manipulations, but is not considered a "normal" layout.
- * For example, this layout provides shape() and size() functions, but does not provide stride() functions.
- * Mostly, the similar functionality is accomplished by applying each operation to LayoutB only
- * as LayoutB defines the domain.
- */
-
-namespace cute
-{
-
-// A Layout of non-trivially composable functions: F o I o L
-template <class LayoutA, class Offset, class LayoutB>
-struct ComposedLayout : private cute::tuple<LayoutA, Offset, LayoutB>  // EBO for static layouts
-{
-  CUTE_HOST_DEVICE constexpr
-  ComposedLayout(LayoutA const& layoutA = {},
-                 Offset  const& offset  = {},
-                 LayoutB const& layoutB = {})
-      : cute::tuple<LayoutA, Offset, LayoutB>(layoutA, offset, layoutB)
-  {}
-
-  //
-  // Accessors
-  //
-
-  static constexpr int rank  = LayoutB::rank;
-
-  CUTE_HOST_DEVICE constexpr
-  decltype(auto)
-  layout_a() const {
-    return get<0>(static_cast<cute::tuple<LayoutA, Offset, LayoutB> const&>(*this));
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  decltype(auto)
-  offset() const {
-    return get<1>(static_cast<cute::tuple<LayoutA, Offset, LayoutB> const&>(*this));
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  decltype(auto)
-  layout_b() const {
-    return get<2>(static_cast<cute::tuple<LayoutA, Offset, LayoutB> const&>(*this));
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  decltype(auto)
-  layout() const {
-    return *this;
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  decltype(auto)
-  shape() const {
-    return layout_b().shape();
-  }
-
-  // Doesn't really make sense to ask for the strides of this "layout"
-  CUTE_HOST_DEVICE constexpr
-  decltype(auto)
-  stride() const = delete;
-
-  //
-  // Mappings
-  //
-
-  // Map a logical coordinate to a linear index (Coord has no Underscore slice operators)
-  // OR
-  // Slice the layout and return the sublayout (Coord has an Underscore slice op)
-  template <class Coord>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  operator()(Coord const& coord) const {
-    if constexpr (has_underscore<Coord>::value) {
-      return slice(coord, *this);
-    } else {
-      return layout_a()(offset() + layout_b()(coord));    // (A o O o B)(c)
-    }
-
-    CUTE_GCC_UNREACHABLE;
-  }
-
-  // Convenience function for multi-dimensional coordinates
-  template <class Coord0, class Coord1, class... Coords>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  operator()(Coord0 const& c0, Coord1 const& c1, Coords const&... cs) const {
-    return operator()(make_coord(c0,c1,cs...));
-  }
-
-  //
-  // Compose
-  //
-
-  template <class OtherLayout>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  compose(OtherLayout const& other) const {
-    return composition(*this, other);
-  }
-
-  template <class... Layouts>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  compose(Layouts const&... layouts) const {
-    return composition(*this, make_tile(layouts...));
-  }
-
-  template <class OtherShape>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  with_shape(OtherShape const& shape) const {
-    return composition(*this, make_layout(shape));
-  }
-
-  template <class... Shapes>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  with_shape(Shapes const&... shapes) const {
-    return composition(*this, make_layout(make_shape(shapes...)));
-  }
-
-  //
-  // Tile
-  //
-
-  template <class OtherLayout>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  tile(OtherLayout const& other) const {
-    return tiled_divide(*this, other);
-  }
-
-  template <class... Layouts>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  tile(Layouts const&... layouts) const {
-    return tiled_divide(*this, make_tile(layouts...));
-  }
-
-  // Equality, return a static or dynamic boolean
-  template <class... Args>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  operator==(ComposedLayout<Args...> const& other) const {
-    return this->layout_a() == other.layout_a() &&
-           this->layout_b() == other.layout_b() &&
-           this->offset()   == other.offset();
-  }
-};
-
-template <class A, class O, class B>
-struct is_layout<ComposedLayout<A,O,B>> : true_type {};
-
-template <class T>
-struct is_composed_layout : false_type {};
-template <class A, class O, class B>
-struct is_composed_layout<ComposedLayout<A,O,B>> : true_type {};
-
-//
-// Constructors
-//
-
-template <class LayoutA, class Offset, class LayoutB>
-CUTE_HOST_DEVICE constexpr
-auto
-make_composed_layout(LayoutA const& layoutA,
-                     Offset  const& offset,
-                     LayoutB const& layoutB)
-{
-  return ComposedLayout<LayoutA, Offset, LayoutB>{layoutA, offset, layoutB};
-}
-
-//
-// Utilities
-//
-
-// Return the layout of a mode
-template <int... Is, class A, class O, class B>
-CUTE_HOST_DEVICE constexpr
-decltype(auto)
-layout(ComposedLayout<A,O,B> const& clayout)
-{
-  return composition(clayout.layout_a(), clayout.offset(), layout<Is...>(clayout.layout_b()));
-}
-
-// Return the shape of a mode
-template <int... Is, class A, class O, class B>
-CUTE_HOST_DEVICE constexpr
-decltype(auto)
-shape(ComposedLayout<A,O,B> const& layout)
-{
-  return shape<Is...>(layout.layout_b());
-}
-
-// Doesn't make sense to directly ask for the strides of this "layout"
-template <int... Is, class Fn, class O, class Layout>
-CUTE_HOST_DEVICE constexpr
-decltype(auto)
-stride(ComposedLayout<Fn,O,Layout> const& layout) = delete;
-
-// Return the number of elements in a mode
-template <int... Is, class A, class O, class B>
-CUTE_HOST_DEVICE constexpr
-decltype(auto)
-size(ComposedLayout<A,O,B> const& layout)
-{
-  return size<Is...>(layout.layout_b());
-}
-
-// Return the number of modes
-template <int... Is, class A, class O, class B>
-CUTE_HOST_DEVICE constexpr
-auto
-rank(ComposedLayout<A,O,B> const& layout)
-{
-  return rank<Is...>(layout.layout_b());
-}
-
-// Return the depth of the layout
-template <int... Is, class A, class O, class B>
-CUTE_HOST_DEVICE constexpr
-auto
-depth(ComposedLayout<A,O,B> const& layout)
-{
-  return depth<Is...>(layout.layout_b());
-}
-
-// Return the codomain size of a mode
-template <int... Is, class A, class O, class B>
-CUTE_HOST_DEVICE constexpr
-auto
-cosize(ComposedLayout<A,O,B> const& layout)
-{
-  return cosize<Is...>(layout.layout_b());
-}
-
-//
-// Operations to manipulate Layouts like a tuple of pairs
-//
-
-template <size_t I, class A, class O, class B>
-CUTE_HOST_DEVICE constexpr
-auto
-get(ComposedLayout<A,O,B> const& a)
-{
-  return composition(a.layout_a(), a.offset(), get<I>(a.layout_b()));
-}
-
-template <int Begin, int End, class A, class O, class B>
-CUTE_HOST_DEVICE constexpr
-auto
-take(ComposedLayout<A,O,B> const& a)
-{
-  return composition(a.layout_a(), a.offset(), take<Begin,End>(a.layout_b()));
-}
-
-template <class A, class O, class B>
-CUTE_HOST_DEVICE constexpr
-auto
-flatten(ComposedLayout<A,O,B> const& a)
-{
-  return composition(a.layout_a(), a.offset(), flatten(a.layout_b()));
-}
-
-template <int N, class A, class O, class B, class X>
-CUTE_HOST_DEVICE constexpr
-auto
-append(ComposedLayout<A,O,B> const& a, X const& x)
-{
-  return composition(a.layout_a(), a.offset(), append<N>(a.layout_b(), x));
-}
-
-template <int Begin, int End, class A, class O, class B>
-CUTE_HOST_DEVICE constexpr
-auto
-group(ComposedLayout<A,O,B> const& a)
-{
-  return composition(a.layout_a(), a.offset(), group<Begin,End>(a.layout_b()));
-}
-
-//
-// Slice a ComposedLayout
-//
-
-template <class Coord, class A, class O, class B>
-CUTE_HOST_DEVICE constexpr
-auto
-slice_and_offset(Coord const& coord, ComposedLayout<A,O,B> const& layout)
-{
-  auto [slice, offset] = slice_and_offset(coord, layout.layout_b());
-  return cute::make_tuple(ComposedLayout{layout.layout_a(), layout.offset() + offset, slice}, Int<0>{});
-}
-
-template <class Coord, class A, class O, class B>
-CUTE_HOST_DEVICE constexpr
-auto
-slice(Coord const& coord, ComposedLayout<A,O,B> const& layout)
-{
-  return get<0>(slice_and_offset(coord, layout));
-}
-
-// Compute a pointer offset and (potentially modified) layout from a coordinate
-// For composed layout tensors the offset is accumulated in the layout itself while pointer is not updated
-template <class Coord, class A, class O, class B>
-CUTE_HOST_DEVICE constexpr
-auto
-domain_offset(Coord const& coord, ComposedLayout<A,O,B> const& layout)
-{
-  return cute::make_tuple(ComposedLayout{layout.layout_a(), layout.offset() + layout.layout_b()(coord), layout.layout_b()}, Int<0>{});
-}
-
-//
-// composition
-//
-
-template <class LayoutA,
-          class Offset,
-          class LayoutB>
-CUTE_HOST_DEVICE constexpr
-auto
-composition(LayoutA const& layoutA,
-            Offset  const& offset,
-            LayoutB const& layoutB)
-{
-  return ComposedLayout<LayoutA, Offset, LayoutB>{layoutA, offset, layoutB};
-}
-
-template <class A, class O, class B, class Tiler>
-CUTE_HOST_DEVICE constexpr
-auto
-composition(ComposedLayout<A,O,B> const& a,
-            Tiler                 const& b)
-{
-  return composition(a.layout_a(), a.offset(), composition(a.layout_b(), b));
-}
-
-template <class ShapeA, class StrideA,
-          class A, class O, class B>
-CUTE_HOST_DEVICE constexpr
-auto
-composition(Layout<ShapeA,StrideA> const& a,
-            ComposedLayout<A,O,B>  const& b)
-{
-  CUTE_STATIC_ASSERT_V(b.offset() == Int<0>{}, "Require offset == 0.");
-
-  return composition(composition(a, b.layout_a()), b.layout_b());
-}
-
-//
-// complement
-//
-
-template <class A, class O, class B, class CoTarget>
-CUTE_HOST_DEVICE constexpr
-auto
-complement(ComposedLayout<A,O,B> const& layout, CoTarget const& cotarget)
-{
-  return complement(layout.layout_b(), cotarget);
-}
-
-template <class A, class O, class B>
-CUTE_HOST_DEVICE constexpr
-auto
-complement(ComposedLayout<A,O,B> const& layout)
-{
-  return complement(layout, cosize(layout));
-}
-
-//
-// inverse
-//
-
-template <class A, class O, class B>
-CUTE_HOST_DEVICE constexpr
-auto
-right_inverse(ComposedLayout<A,O,B> const& layout)
-{
-  return composition(right_inverse(layout.layout_b()), right_inverse(layout.offset()), right_inverse(layout.layout_a()));
-}
-
-template <class A, class O, class B>
-CUTE_HOST_DEVICE constexpr
-auto
-left_inverse(ComposedLayout<A,O,B> const& layout)
-{
-  return composition(left_inverse(layout.layout_b()), left_inverse(layout.offset()), left_inverse(layout.layout_a()));
-}
-
-//
-// Other operations
-//
-
-template <class A, class O, class B>
-CUTE_HOST_DEVICE constexpr
-auto
-zip(ComposedLayout<A,O,B> const& a)
-{
-  return composition(a.layout_a(), a.offset(), zip(a.layout_b()));
-}
-
-// Partitions
-
-template <class A, class O, class B, class Tiler>
-CUTE_HOST_DEVICE constexpr
-auto
-logical_divide(ComposedLayout<A,O,B> const& a,
-               Tiler                 const& b)
-{
-  return composition(a.layout_a(), a.offset(), logical_divide(a.layout_b(), b));
-}
-
-template <class A, class O, class B, class Tiler>
-CUTE_HOST_DEVICE constexpr
-auto
-tile_unzip(ComposedLayout<A,O,B> const& a,
-           Tiler                 const& b)
-{
-  return composition(a.layout_a(), a.offset(), tile_unzip(a.layout_b(), b));
-}
-
-template <class A, class O, class B, class Tiler>
-CUTE_HOST_DEVICE constexpr
-auto
-tiled_divide(ComposedLayout<A,O,B> const& a,
-             Tiler                 const& b)
-{
-  return composition(a.layout_a(), a.offset(), tiled_divide(a.layout_b(), b));
-}
-
-template <class A, class O, class B, class Tiler>
-CUTE_HOST_DEVICE constexpr
-auto
-zipped_divide(ComposedLayout<A,O,B> const& a,
-              Tiler                 const& b)
-{
-  return composition(a.layout_a(), a.offset(), zipped_divide(a.layout_b(), b));
-}
-
-template <class A, class O, class B, class Tiler>
-CUTE_HOST_DEVICE constexpr
-auto
-flat_divide(ComposedLayout<A,O,B> const& a,
-            Tiler                 const& b)
-{
-  return composition(a.layout_a(), a.offset(), flat_divide(a.layout_b(), b));
-}
-
-template <class A, class O, class B, class Tiler>
-CUTE_HOST_DEVICE constexpr
-auto
-logical_product(ComposedLayout<A,O,B> const& a,
-                Tiler                 const& b)
-{
-  return composition(a.layout_a(), a.offset(), logical_product(a.layout_b(), b));
-}
-
-template <class A, class O, class B, class Tiler>
-CUTE_HOST_DEVICE constexpr
-auto
-zipped_product(ComposedLayout<A,O,B> const& a,
-               Tiler                 const& b)
-{
-  return composition(a.layout_a(), a.offset(), zipped_product(a.layout_b(), b));
-}
-
-template <class A, class O, class B, class Tiler>
-CUTE_HOST_DEVICE constexpr
-auto
-tiled_product(ComposedLayout<A,O,B> const& a,
-              Tiler                 const& b)
-{
-  return composition(a.layout_a(), a.offset(), tiled_product(a.layout_b(), b));
-}
-
-template <class A, class O, class B, class Tiler>
-CUTE_HOST_DEVICE constexpr
-auto
-flat_product(ComposedLayout<A,O,B> const& a,
-             Tiler                 const& b)
-{
-  return composition(a.layout_a(), a.offset(), flat_product(a.layout_b(), b));
-}
-
-template <class A, class O, class B, class Tiler>
-CUTE_HOST_DEVICE constexpr
-auto
-blocked_product(ComposedLayout<A,O,B> const& a,
-                Tiler                 const& b)
-{
-  return composition(a.layout_a(), a.offset(), blocked_product(a.layout_b(), b));
-}
-
-template <class A, class O, class B, class Tiler>
-CUTE_HOST_DEVICE constexpr
-auto
-raked_product(ComposedLayout<A,O,B> const& a,
-              Tiler                 const& b)
-{
-  return composition(a.layout_a(), a.offset(), raked_product(a.layout_b(), b));
-}
-
-template <class A, class O, class B,
-          class Shape, class ModeOrder = GenColMajor>
-CUTE_HOST_DEVICE constexpr
-auto
-tile_to_shape(ComposedLayout<A,O,B> const& layout,
-              Shape                 const& trg_shape,
-              ModeOrder             const& ord_shape = {})
-{
-  return composition(layout.layout_a(), layout.offset(), tile_to_shape(layout.layout_b(), trg_shape, ord_shape));
-}
-
-template <class A, class O, class B,
-          class Shape>
-CUTE_HOST_DEVICE constexpr
-auto
-filter(ComposedLayout<A,O,B> const& layout, Shape const& trg_profile)
-{
-  return composition(layout.layout_a(), layout.offset(), filter(layout.layout_b(), trg_profile));
-}
-
-template <class A, class O, class B>
-CUTE_HOST_DEVICE constexpr
-auto
-coalesce(ComposedLayout<A,O,B> const& layout)
-{
-  return composition(layout.layout_a(), layout.offset(), coalesce(layout.layout_b()));
-}
-
-template <class A, class O, class B, class Shape>
-CUTE_HOST_DEVICE constexpr
-auto
-coalesce(ComposedLayout<A,O,B> const& layout, Shape const& trg_profile)
-{
-  return composition(layout.layout_a(), layout.offset(), coalesce(layout.layout_b(), trg_profile));
-}
-
-
-//
-// Upcast and Downcast
-//
-
-template <int N, class A, class O, class B>
-CUTE_HOST_DEVICE constexpr
-auto
-upcast(ComposedLayout<A,O,B> const& layout)
-{
-  return composition(upcast<N>(layout.layout_a()), upcast<N>(layout.offset()), upcast<N>(layout.layout_b()));
-}
-
-template <int N, class A, class O, class B>
-CUTE_HOST_DEVICE constexpr
-auto
-downcast(ComposedLayout<A,O,B> const& layout)
-{
-  return composition(downcast<N>(layout.layout_a()), downcast<N>(layout.offset()), downcast<N>(layout.layout_b()));
-}
-
-
-template <class OldType, class NewType,
-          class A, class O, class B>
-CUTE_HOST_DEVICE constexpr
-auto
-recast_layout(ComposedLayout<A,O,B> const& layout)
-{
-  using scale = decltype(trait_ratio(sizeof_bits<NewType>{}, sizeof_bits<OldType>{}));
-  if constexpr (scale::num == 1 && scale::den == 1) {
-    return layout;
-  }
-  else if constexpr (scale::num == 1) {
-    return downcast<scale::den>(layout);
-  }
-  else if constexpr (scale::den == 1) {
-    return upcast<scale::num>(layout);
-  }
-  else {
-    static_assert(dependent_false<scale>, "Recast not supported.");
-  }
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <class A, class O, class B>
-CUTE_HOST_DEVICE constexpr
-auto
-max_alignment(ComposedLayout<A,O,B> const& layout)
-{
-  // Do not attempt for general ComposedLayouts
-  //return gcd(max_alignment(layout.layout_a()), max_alignment(layout.offset()), max_alignment(layout.layout_b()));
-  return Int<1>{};
-}
-
-//
-// Display utilities
-//
-
-template <class A, class O, class B>
-CUTE_HOST_DEVICE void print(ComposedLayout<A,O,B> const& layout)
-{
-  print(layout.layout_a()); print(" o "); print(layout.offset()); print(" o "); print(layout.layout_b());
-}
-
-#if !defined(__CUDACC_RTC__)
-template <class A, class O, class B>
-CUTE_HOST std::ostream& operator<<(std::ostream& os, ComposedLayout<A,O,B> const& layout)
-{
-  return os << layout.layout_a() << " o " << layout.offset() << " o " << layout.layout_b();
-}
-#endif
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/numeric/arithmetic_tuple.hpp b/lightllm-kernel/cutlass/include/cute/numeric/arithmetic_tuple.hpp
deleted file mode 100755
index 2e4690571..000000000
--- a/lightllm-kernel/cutlass/include/cute/numeric/arithmetic_tuple.hpp
+++ /dev/null
@@ -1,556 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>
-
-#include <cute/container/tuple.hpp>
-#include <cute/numeric/integral_constant.hpp>
-#include <cute/algorithm/functional.hpp>
-#include <cute/algorithm/tuple_algorithms.hpp>
-#include <cute/util/type_traits.hpp>
-
-namespace cute
-{
-
-template <class... T>
-struct ArithmeticTuple : tuple<T...>
-{
-  template <class... U>
-  CUTE_HOST_DEVICE constexpr
-  ArithmeticTuple(ArithmeticTuple<U...> const& u)
-    : tuple<T...>(static_cast<tuple<U...> const&>(u)) {}
-
-  template <class... U>
-  CUTE_HOST_DEVICE constexpr
-  ArithmeticTuple(tuple<U...> const& u)
-    : tuple<T...>(u) {}
-
-  template <class... U>
-  CUTE_HOST_DEVICE constexpr
-  ArithmeticTuple(U const&... u)
-    : tuple<T...>(u...) {}
-};
-
-template <class... T>
-struct is_tuple<ArithmeticTuple<T...>> : true_type {};
-
-template <class... Ts>
-struct is_flat<ArithmeticTuple<Ts...>> : is_flat<tuple<Ts...>> {};
-
-template <class... T>
-CUTE_HOST_DEVICE constexpr
-auto
-make_arithmetic_tuple(T const&... t) {
-  return ArithmeticTuple<T...>(t...);
-}
-
-template <class T>
-CUTE_HOST_DEVICE constexpr
-auto
-as_arithmetic_tuple(T const& t) {
-  if constexpr (is_tuple<T>::value) {
-    return detail::tapply(t, [](auto const& x){ return as_arithmetic_tuple(x); },
-                          [](auto const&... a){ return make_arithmetic_tuple(a...); },
-                          tuple_seq<T>{});
-  } else {
-    return t;
-  }
-}
-
-//
-// Numeric operators
-//
-
-// Addition
-template <class... T, class... U>
-CUTE_HOST_DEVICE constexpr
-auto
-operator+(ArithmeticTuple<T...> const& t, ArithmeticTuple<U...> const& u) {
-  constexpr int R = cute::max(int(sizeof...(T)), int(sizeof...(U)));
-  return transform_apply(append<R>(t,Int<0>{}), append<R>(u,Int<0>{}), plus{}, [](auto const&... a){ return make_arithmetic_tuple(a...); });
-}
-
-template <class... T, class... U>
-CUTE_HOST_DEVICE constexpr
-auto
-operator+(ArithmeticTuple<T...> const& t, tuple<U...> const& u) {
-  return t + ArithmeticTuple<U...>(u);
-}
-
-template <class... T, class... U>
-CUTE_HOST_DEVICE constexpr
-auto
-operator+(tuple<T...> const& t, ArithmeticTuple<U...> const& u) {
-  return ArithmeticTuple<T...>(t) + u;
-}
-
-// Subtraction
-template <class... T, class... U>
-CUTE_HOST_DEVICE constexpr
-auto
-operator-(ArithmeticTuple<T...> const& t, ArithmeticTuple<U...> const& u) {
-  constexpr int R = cute::max(int(sizeof...(T)), int(sizeof...(U)));
-  return transform_apply(append<R>(t,Int<0>{}), append<R>(u,Int<0>{}), minus{}, [](auto const&... a){ return make_arithmetic_tuple(a...); });
-}
-
-template <class... T, class... U>
-CUTE_HOST_DEVICE constexpr
-auto
-operator-(ArithmeticTuple<T...> const& t, tuple<U...> const& u) {
-  return t - ArithmeticTuple<U...>(u);
-}
-
-template <class... T, class... U>
-CUTE_HOST_DEVICE constexpr
-auto
-operator-(tuple<T...> const& t, ArithmeticTuple<U...> const& u) {
-  return ArithmeticTuple<T...>(t) - u;
-}
-
-// Negation
-template <class... T>
-CUTE_HOST_DEVICE constexpr
-auto
-operator-(ArithmeticTuple<T...> const& t) {
-  return transform_apply(t, negate{}, [](auto const&... a){ return make_arithmetic_tuple(a...); });
-}
-
-//
-// Special cases
-//
-
-template <auto t, class... U>
-CUTE_HOST_DEVICE constexpr
-ArithmeticTuple<U...> const&
-operator+(C<t>, ArithmeticTuple<U...> const& u) {
-  static_assert(t == 0, "Arithmetic tuple op+ error!");
-  return u;
-}
-
-template <class... T, auto u>
-CUTE_HOST_DEVICE constexpr
-ArithmeticTuple<T...> const&
-operator+(ArithmeticTuple<T...> const& t, C<u>) {
-  static_assert(u == 0, "Arithmetic tuple op+ error!");
-  return t;
-}
-
-template <auto t, class... U>
-CUTE_HOST_DEVICE constexpr
-ArithmeticTuple<U...> const&
-operator-(C<t>, ArithmeticTuple<U...> const& u) {
-  static_assert(t == 0, "Arithmetic tuple op- error!");
-  return -u;
-}
-
-template <class... T, auto u>
-CUTE_HOST_DEVICE constexpr
-ArithmeticTuple<T...> const&
-operator-(ArithmeticTuple<T...> const& t, C<u>) {
-  static_assert(u == 0, "Arithmetic tuple op- error!");
-  return t;
-}
-
-//
-// ArithmeticTupleIterator
-//
-
-template <class ArithTuple>
-struct ArithmeticTupleIterator
-{
-  using value_type   = ArithTuple;
-  using element_type = ArithTuple;
-  using reference    = ArithTuple;
-
-  ArithTuple coord_;
-
-  CUTE_HOST_DEVICE constexpr
-  ArithmeticTupleIterator(ArithTuple const& coord = {}) : coord_(coord) {}
-
-  CUTE_HOST_DEVICE constexpr
-  ArithTuple operator*() const { return coord_; }
-
-  template <class Coord>
-  CUTE_HOST_DEVICE constexpr
-  auto operator[](Coord const& c) const { return *(*this + c); }
-
-  template <class Coord>
-  CUTE_HOST_DEVICE constexpr
-  auto operator+(Coord const& c) const {
-    return ArithmeticTupleIterator<remove_cvref_t<decltype(coord_ + c)>>(coord_ + c);
-  }
-};
-
-template <class Tuple>
-CUTE_HOST_DEVICE constexpr
-auto
-make_inttuple_iter(Tuple const& t) {
-  return ArithmeticTupleIterator(as_arithmetic_tuple(t));
-}
-
-template <class T0, class T1, class... Ts>
-CUTE_HOST_DEVICE constexpr
-auto
-make_inttuple_iter(T0 const& t0, T1 const& t1, Ts const&... ts) {
-  return make_inttuple_iter(cute::make_tuple(t0, t1, ts...));
-}
-
-//
-// ArithmeticTuple "basis" elements
-//   A ScaledBasis<T,N> is a (at least) rank-N+1 ArithmeticTuple:
-//      (_0,_0,...,T,_0,...)
-//   with value T in the Nth mode
-
-template <class T, int N>
-struct ScaledBasis : private tuple<T>
-{
-  CUTE_HOST_DEVICE constexpr
-  ScaledBasis(T const& t = {}) : tuple<T>(t) {}
-
-  CUTE_HOST_DEVICE constexpr
-  decltype(auto) value()       { return get<0>(static_cast<tuple<T>      &>(*this)); }
-  CUTE_HOST_DEVICE constexpr
-  decltype(auto) value() const { return get<0>(static_cast<tuple<T> const&>(*this)); }
-
-  CUTE_HOST_DEVICE static constexpr
-  auto mode() { return Int<N>{}; }
-};
-
-template <class T>
-struct is_scaled_basis : false_type {};
-template <class T, int N>
-struct is_scaled_basis<ScaledBasis<T,N>> : true_type {};
-
-template <class T, int N>
-struct is_integral<ScaledBasis<T,N>> : true_type {};
-
-// Get the scalar T out of a ScaledBasis
-template <class SB>
-CUTE_HOST_DEVICE constexpr auto
-basis_value(SB const& e)
-{
-  if constexpr (is_scaled_basis<SB>::value) {
-    return basis_value(e.value());
-  } else {
-    return e;
-  }
-  CUTE_GCC_UNREACHABLE;
-}
-
-// Apply the N... pack to another Tuple
-template <class SB, class Tuple>
-CUTE_HOST_DEVICE decltype(auto)
-basis_get(SB const& e, Tuple&& t)
-{
-  if constexpr (is_scaled_basis<SB>::value) {
-    return basis_get(e.value(), get<SB::mode()>(static_cast<Tuple&&>(t)));
-  } else {
-    return static_cast<Tuple&&>(t);
-  }
-  CUTE_GCC_UNREACHABLE;
-}
-
-namespace detail {
-
-template <class T, int... I>
-CUTE_HOST_DEVICE constexpr
-auto
-to_atuple_i(T const& t, seq<I...>) {
-  return make_arithmetic_tuple((void(I),Int<0>{})..., t);
-}
-
-} // end namespace detail
-
-// Turn a ScaledBases<T,N> into a rank-N+1 ArithmeticTuple
-//    with N prefix 0s:  (_0,_0,...N...,_0,T)
-template <class T, int N>
-CUTE_HOST_DEVICE constexpr
-auto
-as_arithmetic_tuple(ScaledBasis<T,N> const& t) {
-  return detail::to_atuple_i(as_arithmetic_tuple(t.value()), make_seq<N>{});
-}
-
-namespace detail {
-
-template <int... Ns>
-struct Basis;
-
-template <>
-struct Basis<> {
-  using type = Int<1>;
-};
-
-template <int N, int... Ns>
-struct Basis<N,Ns...> {
-  using type = ScaledBasis<typename Basis<Ns...>::type, N>;
-};
-
-} // end namespace detail
-
-// Shortcut for writing ScaledBasis<ScaledBasis<ScaledBasis<Int<1>, N0>, N1>, ...>
-// E<>    := _1
-// E<0>   := (_1,_0,_0,...)
-// E<1>   := (_0,_1,_0,...)
-// E<0,0> := ((_1,_0,_0,...),_0,_0,...)
-// E<0,1> := ((_0,_1,_0,...),_0,_0,...)
-// E<1,0> := (_0,(_1,_0,_0,...),_0,...)
-// E<1,1> := (_0,(_0,_1,_0,...),_0,...)
-template <int... N>
-using E = typename detail::Basis<N...>::type;
-
-template <class Shape>
-CUTE_HOST_DEVICE constexpr
-auto
-make_basis_like(Shape const& shape)
-{
-  if constexpr (is_integral<Shape>::value) {
-    return Int<1>{};
-  } else {
-    // Generate bases for each rank of shape
-    return transform(tuple_seq<Shape>{}, shape, [](auto I, auto si) {
-      // Generate bases for each rank of si and add an i on front
-      using I_type = decltype(I);
-      return transform_leaf(make_basis_like(si), [](auto e) {
-        // MSVC has trouble capturing variables as constexpr,
-        // so that they can be used as template arguments.
-        // This is exactly what the code needs to do with i, unfortunately.
-        // The work-around is to define i inside the inner lambda,
-        // by using just the type from the enclosing scope.
-        constexpr int i = I_type::value;
-        return ScaledBasis<decltype(e), i>{};
-      });
-    });
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-//
-// Arithmetic
-//
-
-template <class T, int M, class U>
-CUTE_HOST_DEVICE constexpr
-auto
-safe_div(ScaledBasis<T,M> const& b, U const& u)
-{
-  auto t = safe_div(b.value(), u);
-  return ScaledBasis<decltype(t),M>{t};
-}
-
-template <class T, int M, class U>
-CUTE_HOST_DEVICE constexpr
-auto
-shape_div(ScaledBasis<T,M> const& b, U const& u)
-{
-  auto t = shape_div(b.value(), u);
-  return ScaledBasis<decltype(t),M>{t};
-}
-
-// Equality
-template <class T, int N, class U, int M>
-CUTE_HOST_DEVICE constexpr
-auto
-operator==(ScaledBasis<T,N> const& t, ScaledBasis<U,M> const& u) {
-  return bool_constant<M == N>{} && t.value() == u.value();
-}
-
-// Not equal to anything else
-template <class T, int N, class U>
-CUTE_HOST_DEVICE constexpr
-false_type
-operator==(ScaledBasis<T,N> const&, U const&) {
-  return {};
-}
-
-template <class T, class U, int M>
-CUTE_HOST_DEVICE constexpr
-false_type
-operator==(T const&, ScaledBasis<U,M> const&) {
-  return {};
-}
-
-// Abs
-template <class T, int N>
-CUTE_HOST_DEVICE constexpr
-auto
-abs(ScaledBasis<T,N> const& e) {
-  return ScaledBasis<decltype(abs(e.value())),N>{abs(e.value())};
-}
-
-// Multiplication
-template <class A, class T, int N>
-CUTE_HOST_DEVICE constexpr
-auto
-operator*(A const& a, ScaledBasis<T,N> const& e) {
-  auto r = a * e.value();
-  return ScaledBasis<decltype(r),N>{r};
-}
-
-template <class T, int N, class B>
-CUTE_HOST_DEVICE constexpr
-auto
-operator*(ScaledBasis<T,N> const& e, B const& b) {
-  auto r = e.value() * b;
-  return ScaledBasis<decltype(r),N>{r};
-}
-
-// Addition
-template <class T, int N, class U, int M>
-CUTE_HOST_DEVICE constexpr
-auto
-operator+(ScaledBasis<T,N> const& t, ScaledBasis<U,M> const& u) {
-  return as_arithmetic_tuple(t) + as_arithmetic_tuple(u);
-}
-
-template <class T, int N, class... U>
-CUTE_HOST_DEVICE constexpr
-auto
-operator+(ScaledBasis<T,N> const& t, ArithmeticTuple<U...> const& u) {
-  return as_arithmetic_tuple(t) + u;
-}
-
-template <class... T, class U, int M>
-CUTE_HOST_DEVICE constexpr
-auto
-operator+(ArithmeticTuple<T...> const& t, ScaledBasis<U,M> const& u) {
-  return t + as_arithmetic_tuple(u);
-}
-
-template <auto t, class U, int M>
-CUTE_HOST_DEVICE constexpr
-auto
-operator+(C<t>, ScaledBasis<U,M> const& u) {
-  static_assert(t == 0, "ScaledBasis op+ error!");
-  return u;
-}
-
-template <class T, int N, auto u>
-CUTE_HOST_DEVICE constexpr
-auto
-operator+(ScaledBasis<T,N> const& t, C<u>) {
-  static_assert(u == 0, "ScaledBasis op+ error!");
-  return t;
-}
-
-//
-// Display utilities
-//
-
-template <class ArithTuple>
-CUTE_HOST_DEVICE void print(ArithmeticTupleIterator<ArithTuple> const& iter)
-{
-  printf("ArithTuple"); print(iter.coord_);
-}
-
-template <class T, int N>
-CUTE_HOST_DEVICE void print(ScaledBasis<T,N> const& e)
-{
-  print(e.value()); printf("@%d", N);
-}
-
-#if !defined(__CUDACC_RTC__)
-template <class ArithTuple>
-CUTE_HOST std::ostream& operator<<(std::ostream& os, ArithmeticTupleIterator<ArithTuple> const& iter)
-{
-  return os << "ArithTuple" << iter.coord_;
-}
-
-template <class T, int N>
-CUTE_HOST std::ostream& operator<<(std::ostream& os, ScaledBasis<T,N> const& e)
-{
-  return os << e.value() << "@" << N;
-}
-#endif
-
-} // end namespace cute
-
-
-namespace CUTE_STL_NAMESPACE
-{
-
-template <class... T>
-struct tuple_size<cute::ArithmeticTuple<T...>>
-  : CUTE_STL_NAMESPACE::integral_constant<size_t, sizeof...(T)>
-{};
-
-template <size_t I, class... T>
-struct tuple_element<I, cute::ArithmeticTuple<T...>>
-  : CUTE_STL_NAMESPACE::tuple_element<I, CUTE_STL_NAMESPACE::tuple<T...>>
-{};
-
-template <class... T>
-struct tuple_size<const cute::ArithmeticTuple<T...>>
-  : CUTE_STL_NAMESPACE::integral_constant<size_t, sizeof...(T)>
-{};
-
-template <size_t I, class... T>
-struct tuple_element<I, const cute::ArithmeticTuple<T...>>
-  : CUTE_STL_NAMESPACE::tuple_element<I, const CUTE_STL_NAMESPACE::tuple<T...>>
-{};
-
-} // end namespace CUTE_STL_NAMESPACE
-
-#ifdef CUTE_STL_NAMESPACE_IS_CUDA_STD
-namespace std
-{
-
-#if defined(__CUDACC_RTC__)
-template <class... _Tp>
-struct tuple_size;
-
-template <size_t _Ip, class... _Tp>
-struct tuple_element;
-#endif
-
-template <class... T>
-struct tuple_size<cute::ArithmeticTuple<T...>>
-  : CUTE_STL_NAMESPACE::integral_constant<size_t, sizeof...(T)>
-{};
-
-template <size_t I, class... T>
-struct tuple_element<I, cute::ArithmeticTuple<T...>>
-  : CUTE_STL_NAMESPACE::tuple_element<I, CUTE_STL_NAMESPACE::tuple<T...>>
-{};
-
-template <class... T>
-struct tuple_size<const cute::ArithmeticTuple<T...>>
-  : CUTE_STL_NAMESPACE::integral_constant<size_t, sizeof...(T)>
-{};
-
-template <size_t I, class... T>
-struct tuple_element<I, const cute::ArithmeticTuple<T...>>
-  : CUTE_STL_NAMESPACE::tuple_element<I, const CUTE_STL_NAMESPACE::tuple<T...>>
-{};
-
-} // end namespace std
-#endif // CUTE_STL_NAMESPACE_IS_CUDA_STD
diff --git a/lightllm-kernel/cutlass/include/cute/numeric/complex.hpp b/lightllm-kernel/cutlass/include/cute/numeric/complex.hpp
deleted file mode 100755
index 7dd9ea5bf..000000000
--- a/lightllm-kernel/cutlass/include/cute/numeric/complex.hpp
+++ /dev/null
@@ -1,76 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>    // CUTE_HOST_DEVICE
-
-#include <cutlass/complex.h>  // cutlass::complexm, cutlass::real, cutlass::imag, cutlass::is_complex
-
-namespace cute
-{
-
-using cutlass::complex;
-using cutlass::is_complex;
-using cutlass::RealType;
-using cutlass::real;
-using cutlass::imag;
-using cutlass::conj;
-
-template <class T>
-static constexpr auto is_complex_v = is_complex<T>::value;
-
-/// Fused multiply-add for complex numbers
-template <class D, class A, class B, class C>
-CUTE_HOST_DEVICE constexpr
-void
-fma(complex<D>      & d,
-    complex<A> const& a,
-    complex<B> const& b,
-    complex<C> const& c)
-{
-  fma(d.real(),  a.real(), b.real(), c.real());
-  fma(d.imag(),  a.real(), b.imag(), c.imag());
-  fma(d.real(), -a.imag(), b.imag(), d.real());
-  fma(d.imag(),  a.imag(), b.real(), d.imag());
-}
-
-/// Fused multiply-add for triplets
-template <class A, class B, class C>
-CUTE_HOST_DEVICE constexpr
-void
-fma(complex<A> const& a,
-    complex<B> const& b,
-    complex<C>      & c)
-{
-  return fma(c, a, b, c);
-}
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/numeric/int.hpp b/lightllm-kernel/cutlass/include/cute/numeric/int.hpp
deleted file mode 100755
index 571b3e3ed..000000000
--- a/lightllm-kernel/cutlass/include/cute/numeric/int.hpp
+++ /dev/null
@@ -1,106 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#if defined(__CUDACC_RTC__)
-#include <cuda/std/cstdint>
-#else
-#include <cstdint>
-#endif
-
-#include <cute/config.hpp>          // CUTE_STL_NAMESPACE
-
-#include <cutlass/numeric_types.h>  // cutlass::int2b_t, cutlass::int4b_t
-
-namespace cute
-{
-
-//
-// Signed integers
-//
-
-using int2_t  = cutlass::int2b_t;
-using int4_t  = cutlass::int4b_t;
-using CUTE_STL_NAMESPACE::int8_t;
-using CUTE_STL_NAMESPACE::int16_t;
-using CUTE_STL_NAMESPACE::int32_t;
-using CUTE_STL_NAMESPACE::int64_t;
-
-template <int N> struct int_bit;
-template <> struct int_bit<  2>  { using type = int2_t; };
-template <> struct int_bit<  4>  { using type = int4_t; };
-template <> struct int_bit<  8>  { using type = int8_t;  };
-template <> struct int_bit< 16>  { using type = int16_t; };
-template <> struct int_bit< 32>  { using type = int32_t; };
-template <> struct int_bit< 64>  { using type = int64_t; };
-
-template <int N>
-using int_bit_t = typename int_bit<N>::type;
-
-template <int N>
-using int_byte = int_bit<8*N>;
-
-template <int N>
-using int_byte_t = typename int_byte<N>::type;
-
-//
-// Unsigned integers
-//
-
-using uint1_t   = cutlass::uint1b_t;
-using uint2_t   = cutlass::uint2b_t;
-using uint4_t   = cutlass::uint4b_t;
-using CUTE_STL_NAMESPACE::uint8_t;
-using CUTE_STL_NAMESPACE::uint16_t;
-using CUTE_STL_NAMESPACE::uint32_t;
-using CUTE_STL_NAMESPACE::uint64_t;
-using cutlass::uint128_t;
-
-template <int N> struct uint_bit;
-template <> struct uint_bit<  1> { using type = uint1_t; };
-template <> struct uint_bit<  2> { using type = uint2_t; };
-template <> struct uint_bit<  4> { using type = uint4_t; };
-template <> struct uint_bit<  8> { using type = uint8_t;  };
-template <> struct uint_bit< 16> { using type = uint16_t; };
-template <> struct uint_bit< 32> { using type = uint32_t; };
-template <> struct uint_bit< 64> { using type = uint64_t; };
-template <> struct uint_bit<128> { using type = cutlass::uint128_t; };
-
-template <int N>
-using uint_bit_t = typename uint_bit<N>::type;
-
-template <int N>
-using uint_byte = uint_bit<8*N>;
-
-template <int N>
-using uint_byte_t = typename uint_byte<N>::type;
-
-} // namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/numeric/integer_sequence.hpp b/lightllm-kernel/cutlass/include/cute/numeric/integer_sequence.hpp
deleted file mode 100755
index 608017958..000000000
--- a/lightllm-kernel/cutlass/include/cute/numeric/integer_sequence.hpp
+++ /dev/null
@@ -1,151 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>
-#include <cute/util/type_traits.hpp>
-#include <cute/numeric/integral_constant.hpp>
-
-namespace cute
-{
-
-using CUTE_STL_NAMESPACE::integer_sequence;
-using CUTE_STL_NAMESPACE::make_integer_sequence;
-
-namespace detail {
-
-template <class T, class S, T Begin>
-struct range_impl;
-
-template <class T, T... N, T Begin>
-struct range_impl<T, integer_sequence<T, N...>, Begin> {
-  using type = integer_sequence<T, N+Begin...>;
-};
-
-template <class S>
-struct reverse_impl;
-
-template <class T, T... N>
-struct reverse_impl<integer_sequence<T, N...>> {
-  using type = integer_sequence<T, sizeof...(N)-1-N...>;
-};
-
-} // end namespace detail
-
-template <class T, T Begin, T End>
-using make_integer_range = typename detail::range_impl<
-    T,
-    make_integer_sequence<T, (End-Begin > 0) ? (End-Begin) : 0>,
-    Begin>::type;
-
-template <class T, T N>
-using make_integer_sequence_reverse = typename detail::reverse_impl<
-    make_integer_sequence<T, N>>::type;
-
-//
-// Common aliases
-//
-
-// int_sequence
-
-template <int... Ints>
-using int_sequence = integer_sequence<int, Ints...>;
-
-template <int N>
-using make_int_sequence = make_integer_sequence<int, N>;
-
-template <int N>
-using make_int_rsequence = make_integer_sequence_reverse<int, N>;
-
-template <int Begin, int End>
-using make_int_range = make_integer_range<int, Begin, End>;
-
-// index_sequence
-
-template <size_t... Ints>
-using index_sequence = integer_sequence<size_t, Ints...>;
-
-template <size_t N>
-using make_index_sequence = make_integer_sequence<size_t, N>;
-
-template <size_t N>
-using make_index_rsequence = make_integer_sequence_reverse<size_t, N>;
-
-template <size_t Begin, size_t End>
-using make_index_range = make_integer_range<size_t, Begin, End>;
-
-//
-// Shortcuts
-//
-
-template <int... Ints>
-using seq = int_sequence<Ints...>;
-
-template <int N>
-using make_seq = make_int_sequence<N>;
-
-template <int N>
-using make_rseq = make_int_rsequence<N>;
-
-template <int Min, int Max>
-using make_range = make_int_range<Min, Max>;
-
-template <class Tuple>
-using tuple_seq = make_seq<tuple_size<remove_cvref_t<Tuple>>::value>;
-
-template <class Tuple>
-using tuple_rseq = make_rseq<tuple_size<remove_cvref_t<Tuple>>::value>;
-
-//
-// Specialize cute::tuple-traits for std::integer_sequence
-//
-
-template <class T, T... Ints>
-struct tuple_size<integer_sequence<T, Ints...>>
-    : cute::integral_constant<size_t, sizeof...(Ints)>
-{};
-
-template <size_t I, class T, T... Is>
-struct tuple_element<I, integer_sequence<T, Is...>>
-{
-  constexpr static T idx[sizeof...(Is)] = {Is...};
-  using type = cute::integral_constant<T, idx[I]>;
-};
-
-template <size_t I, class T, T... Ints>
-CUTE_HOST_DEVICE constexpr
-tuple_element_t<I, integer_sequence<T, Ints...>>
-get(integer_sequence<T, Ints...>) {
-  static_assert(I < sizeof...(Ints), "Index out of range");
-  return {};
-}
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/numeric/integral_constant.hpp b/lightllm-kernel/cutlass/include/cute/numeric/integral_constant.hpp
deleted file mode 100755
index 3a8d036ee..000000000
--- a/lightllm-kernel/cutlass/include/cute/numeric/integral_constant.hpp
+++ /dev/null
@@ -1,517 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/numeric/math.hpp>      // cute::max, etc
-#include <cute/util/print.hpp>        // cute::print
-#include <cute/util/type_traits.hpp>  // __CUTE_REQUIRES, cute::is_std_integral
-
-namespace cute
-{
-
-// A constant value: short name and type-deduction for fast compilation
-template <auto v>
-struct C {
-  using type = C<v>;
-  static constexpr auto value = v;
-  using value_type = decltype(v);
-  CUTE_HOST_DEVICE constexpr operator   value_type() const noexcept { return value; }
-  CUTE_HOST_DEVICE constexpr value_type operator()() const noexcept { return value; }
-};
-
-// Deprecate
-template <class T, T v>
-using constant = C<v>;
-
-template <bool b>
-using bool_constant = C<b>;
-
-using true_type  = bool_constant<true>;
-using false_type = bool_constant<false>;
-
-// A more std:: conforming integral_constant that enforces type but interops with C<v>
-template <class T, T v>
-struct integral_constant : C<v> {
-  using type = integral_constant<T,v>;
-  static constexpr T value = v;
-  using value_type = T;
-  // Disambiguate C<v>::operator value_type()
-  //CUTE_HOST_DEVICE constexpr operator   value_type() const noexcept { return value; }
-  CUTE_HOST_DEVICE constexpr value_type operator()() const noexcept { return value; }
-};
-
-//
-// Traits
-//
-
-// Use cute::is_std_integral<T> to match built-in integral types (int, int64_t, unsigned, etc)
-// Use cute::is_integral<T> to match both built-in integral types AND static integral types.
-
-template <class T>
-struct is_integral : bool_constant<is_std_integral<T>::value> {};
-template <auto v>
-struct is_integral<C<v>                  > : true_type {};
-template <class T, T v>
-struct is_integral<integral_constant<T,v>> : true_type {};
-
-// Register FastDivmod as the integral type
-template<>
-struct is_integral<cutlass::FastDivmod> : true_type {};
-
-// is_static detects if an (abstract) value is defined completely by its type (no members)
-template <class T>
-struct is_static : bool_constant<is_empty<remove_cvref_t<T>>::value> {};
-
-template <class T>
-constexpr bool is_static_v = is_static<T>::value;
-
-// is_constant detects if a type is a static integral type and if v is equal to a value
-
-template <auto n, class T>
-struct is_constant : false_type {};
-template <auto n, class T>
-struct is_constant<n, T const > : is_constant<n,T> {};
-template <auto n, class T>
-struct is_constant<n, T const&> : is_constant<n,T> {};
-template <auto n, class T>
-struct is_constant<n, T      &> : is_constant<n,T> {};
-template <auto n, class T>
-struct is_constant<n, T     &&> : is_constant<n,T> {};
-template <auto n, auto v>
-struct is_constant<n, C<v>                  > : bool_constant<v == n> {};
-template <auto n, class T, T v>
-struct is_constant<n, integral_constant<T,v>> : bool_constant<v == n> {};
-
-//
-// Specializations
-//
-
-template <int v>
-using Int = C<v>;
-
-using _m32    = Int<-32>;
-using _m24    = Int<-24>;
-using _m16    = Int<-16>;
-using _m12    = Int<-12>;
-using _m10    = Int<-10>;
-using _m9     = Int<-9>;
-using _m8     = Int<-8>;
-using _m7     = Int<-7>;
-using _m6     = Int<-6>;
-using _m5     = Int<-5>;
-using _m4     = Int<-4>;
-using _m3     = Int<-3>;
-using _m2     = Int<-2>;
-using _m1     = Int<-1>;
-using _0      = Int<0>;
-using _1      = Int<1>;
-using _2      = Int<2>;
-using _3      = Int<3>;
-using _4      = Int<4>;
-using _5      = Int<5>;
-using _6      = Int<6>;
-using _7      = Int<7>;
-using _8      = Int<8>;
-using _9      = Int<9>;
-using _10     = Int<10>;
-using _12     = Int<12>;
-using _16     = Int<16>;
-using _24     = Int<24>;
-using _32     = Int<32>;
-using _40     = Int<40>;
-using _48     = Int<48>;
-using _56     = Int<56>;
-using _64     = Int<64>;
-using _72     = Int<72>;
-using _80     = Int<80>;
-using _88     = Int<88>;
-using _96     = Int<96>;
-using _104    = Int<104>;
-using _112    = Int<112>;
-using _120    = Int<120>;
-using _128    = Int<128>;
-using _136    = Int<136>;
-using _144    = Int<144>;
-using _152    = Int<152>;
-using _160    = Int<160>;
-using _168    = Int<168>;
-using _176    = Int<176>;
-using _184    = Int<184>;
-using _192    = Int<192>;
-using _200    = Int<200>;
-using _208    = Int<208>;
-using _216    = Int<216>;
-using _224    = Int<224>;
-using _232    = Int<232>;
-using _240    = Int<240>;
-using _248    = Int<248>;
-using _256    = Int<256>;
-using _384    = Int<384>;
-using _512    = Int<512>;
-using _768    = Int<768>;
-using _1024   = Int<1024>;
-using _2048   = Int<2048>;
-using _4096   = Int<4096>;
-using _8192   = Int<8192>;
-using _16384  = Int<16384>;
-using _32768  = Int<32768>;
-using _65536  = Int<65536>;
-using _131072 = Int<131072>;
-using _262144 = Int<262144>;
-using _524288 = Int<524288>;
-
-/***************/
-/** Operators **/
-/***************/
-
-#define CUTE_LEFT_UNARY_OP(OP)                                       \
-  template <auto t>                                                  \
-  CUTE_HOST_DEVICE constexpr                                         \
-  C<(OP t)> operator OP (C<t>) {                                     \
-    return {};                                                       \
-  }
-#define CUTE_RIGHT_UNARY_OP(OP)                                      \
-  template <auto t>                                                  \
-  CUTE_HOST_DEVICE constexpr                                         \
-  C<(t OP)> operator OP (C<t>) {                                     \
-    return {};                                                       \
-  }
-#define CUTE_BINARY_OP(OP)                                           \
-  template <auto t, auto u>                                          \
-  CUTE_HOST_DEVICE constexpr                                         \
-  C<(t OP u)> operator OP (C<t>, C<u>) {                             \
-    return {};                                                       \
-  }
-
-CUTE_LEFT_UNARY_OP(+);
-CUTE_LEFT_UNARY_OP(-);
-CUTE_LEFT_UNARY_OP(~);
-CUTE_LEFT_UNARY_OP(!);
-CUTE_LEFT_UNARY_OP(*);
-
-CUTE_BINARY_OP( +);
-CUTE_BINARY_OP( -);
-CUTE_BINARY_OP( *);
-CUTE_BINARY_OP( /);
-CUTE_BINARY_OP( %);
-CUTE_BINARY_OP( &);
-CUTE_BINARY_OP( |);
-CUTE_BINARY_OP( ^);
-CUTE_BINARY_OP(<<);
-CUTE_BINARY_OP(>>);
-
-CUTE_BINARY_OP(&&);
-CUTE_BINARY_OP(||);
-
-CUTE_BINARY_OP(==);
-CUTE_BINARY_OP(!=);
-CUTE_BINARY_OP( >);
-CUTE_BINARY_OP( <);
-CUTE_BINARY_OP(>=);
-CUTE_BINARY_OP(<=);
-
-#undef CUTE_BINARY_OP
-#undef CUTE_LEFT_UNARY_OP
-#undef CUTE_RIGHT_UNARY_OP
-
-//
-// Mixed static-dynamic special cases
-//
-
-template <auto t, class U,
-          __CUTE_REQUIRES(is_std_integral<U>::value && t == 0)>
-CUTE_HOST_DEVICE constexpr
-C<0>
-operator*(C<t>, U) {
-  return {};
-}
-
-template <class U, auto t,
-          __CUTE_REQUIRES(is_std_integral<U>::value && t == 0)>
-CUTE_HOST_DEVICE constexpr
-C<0>
-operator*(U, C<t>) {
-  return {};
-}
-
-template <auto t, class U,
-          __CUTE_REQUIRES(is_std_integral<U>::value && t == 0)>
-CUTE_HOST_DEVICE constexpr
-C<0>
-operator/(C<t>, U) {
-  return {};
-}
-
-template <class U, auto t,
-          __CUTE_REQUIRES(is_std_integral<U>::value && (t == 1 || t == -1))>
-CUTE_HOST_DEVICE constexpr
-C<0>
-operator%(U, C<t>) {
-  return {};
-}
-
-template <auto t, class U,
-          __CUTE_REQUIRES(is_std_integral<U>::value && t == 0)>
-CUTE_HOST_DEVICE constexpr
-C<0>
-operator%(C<t>, U) {
-  return {};
-}
-
-template <auto t, class U,
-          __CUTE_REQUIRES(is_std_integral<U>::value && t == 0)>
-CUTE_HOST_DEVICE constexpr
-C<0>
-operator&(C<t>, U) {
-  return {};
-}
-
-template <class U, auto t,
-          __CUTE_REQUIRES(is_std_integral<U>::value && t == 0)>
-CUTE_HOST_DEVICE constexpr
-C<0>
-operator&(U, C<t>) {
-  return {};
-}
-
-template <auto t, class U,
-          __CUTE_REQUIRES(is_std_integral<U>::value && !bool(t))>
-CUTE_HOST_DEVICE constexpr
-C<false>
-operator&&(C<t>, U) {
-  return {};
-}
-
-template <auto t, class U,
-          __CUTE_REQUIRES(is_std_integral<U>::value && !bool(t))>
-CUTE_HOST_DEVICE constexpr
-C<false>
-operator&&(U, C<t>) {
-  return {};
-}
-
-template <class U, auto t,
-          __CUTE_REQUIRES(is_std_integral<U>::value && bool(t))>
-CUTE_HOST_DEVICE constexpr
-C<true>
-operator||(C<t>, U) {
-  return {};
-}
-
-template <class U, auto t,
-          __CUTE_REQUIRES(is_std_integral<U>::value && bool(t))>
-CUTE_HOST_DEVICE constexpr
-C<true>
-operator||(U, C<t>) {
-  return {};
-}
-
-//
-// Named functions from math.hpp
-//
-
-#define CUTE_NAMED_UNARY_FN(OP)                                      \
-  template <auto t>                                                  \
-  CUTE_HOST_DEVICE constexpr                                         \
-  C<OP(t)> OP (C<t>) {                                               \
-    return {};                                                       \
-  }
-#define CUTE_NAMED_BINARY_FN(OP)                                     \
-  template <auto t, auto u>                                          \
-  CUTE_HOST_DEVICE constexpr                                         \
-  C<OP(t,u)> OP (C<t>, C<u>) {                                       \
-    return {};                                                       \
-  }                                                                  \
-  template <auto t, class U,                                         \
-            __CUTE_REQUIRES(is_std_integral<U>::value)>              \
-  CUTE_HOST_DEVICE constexpr                                         \
-  auto OP (C<t>, U u) {                                              \
-    return OP(t,u);                                                  \
-  }                                                                  \
-  template <class T, auto u,                                         \
-            __CUTE_REQUIRES(is_std_integral<T>::value)>              \
-  CUTE_HOST_DEVICE constexpr                                         \
-  auto OP (T t, C<u>) {                                              \
-    return OP(t,u);                                                  \
-  }
-
-CUTE_NAMED_UNARY_FN(abs);
-CUTE_NAMED_UNARY_FN(signum);
-CUTE_NAMED_UNARY_FN(has_single_bit);
-
-CUTE_NAMED_BINARY_FN(max);
-CUTE_NAMED_BINARY_FN(min);
-CUTE_NAMED_BINARY_FN(shiftl);
-CUTE_NAMED_BINARY_FN(shiftr);
-CUTE_NAMED_BINARY_FN(gcd);
-CUTE_NAMED_BINARY_FN(lcm);
-
-#undef CUTE_NAMED_UNARY_FN
-#undef CUTE_NAMED_BINARY_FN
-
-//
-// Other functions
-//
-
-template <auto t, auto u>
-CUTE_HOST_DEVICE constexpr
-C<t / u>
-safe_div(C<t>, C<u>) {
-  static_assert(t % u == 0, "Static safe_div requires t % u == 0");
-  return {};
-}
-
-template <auto t, class U,
-          __CUTE_REQUIRES(is_std_integral<U>::value)>
-CUTE_HOST_DEVICE constexpr
-auto
-safe_div(C<t>, U u) {
-  return t / u;
-}
-
-template <class T, auto u,
-          __CUTE_REQUIRES(is_std_integral<T>::value)>
-CUTE_HOST_DEVICE constexpr
-auto
-safe_div(T t, C<u>) {
-  return t / u;
-}
-
-template <class TrueType, class FalseType>
-CUTE_HOST_DEVICE constexpr
-decltype(auto)
-conditional_return(true_type, TrueType&& t, FalseType&&) {
-  return static_cast<TrueType&&>(t);
-}
-
-template <class TrueType, class FalseType>
-CUTE_HOST_DEVICE constexpr
-decltype(auto)
-conditional_return(false_type, TrueType&&, FalseType&& f) {
-  return static_cast<FalseType&&>(f);
-}
-
-template <auto v>
-CUTE_HOST_DEVICE constexpr
-auto
-conditional_return(bool b, C<v> const&, C<v> const&) {
-  return C<v>{};
-}
-
-template <auto v, auto u>
-CUTE_HOST_DEVICE constexpr
-auto
-conditional_return(bool b, C<v> const&, C<u> const&) {
-  return b ? v : u;
-}
-
-// TrueType and FalseType must have a common type
-template <class TrueType, class FalseType>
-CUTE_HOST_DEVICE constexpr
-auto
-conditional_return(bool b, TrueType const& t, FalseType const& f) {
-  return b ? t : f;
-}
-
-// TrueType and FalseType don't require a common type
-template <bool b, class TrueType, class FalseType>
-CUTE_HOST_DEVICE constexpr
-auto
-conditional_return(TrueType const& t, FalseType const& f) {
-  if constexpr (b) {
-    return t;
-  } else {
-    return f;
-  }
-}
-
-template <class Trait>
-CUTE_HOST_DEVICE constexpr
-auto
-static_value()
-{
-  if constexpr (is_std_integral<decltype(Trait::value)>::value) {
-    return Int<Trait::value>{};
-  } else {
-    return Trait::value;
-  }
-  CUTE_GCC_UNREACHABLE;
-}
-
-//
-// Display utilities
-//
-
-template <auto Value>
-CUTE_HOST_DEVICE void print(C<Value>) {
-  printf("_");
-  ::cute::print(Value);
-}
-
-#if !defined(__CUDACC_RTC__)
-template <auto t>
-CUTE_HOST std::ostream& operator<<(std::ostream& os, C<t> const&) {
-  return os << "_" << t;
-}
-#endif
-
-
-namespace detail {
-
-// parse_int_digits takes a variadic number of digits and converts them into an int
-template <class... Ts>
-constexpr uint64_t parse_int_digits(uint64_t result, int digit, Ts... digits)
-{
-  if constexpr (sizeof...(Ts) == 0) {
-    return 10 * result + digit;
-  } else {
-    return parse_int_digits(10 * result + digit, digits...);
-  }
-}
-
-} // end namespace detail
-
-
-// This user-defined literal operator allows cute::constant written as literals. For example,
-//
-//    auto var = 32_c;
-//
-//  var has type cute::constant<int,32>.
-//
-template <char... digits>
-constexpr cute::constant<int,detail::parse_int_digits(0, (digits - '0')...)> operator "" _c()
-{
-  static_assert((('0' <= digits && digits <= '9') && ...),
-                "Expected 0 <= digit <= 9 for each digit of the integer.");
-  return {};
-}
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/numeric/integral_ratio.hpp b/lightllm-kernel/cutlass/include/cute/numeric/integral_ratio.hpp
deleted file mode 100755
index 1b1432533..000000000
--- a/lightllm-kernel/cutlass/include/cute/numeric/integral_ratio.hpp
+++ /dev/null
@@ -1,264 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>                     // CUTE_HOST_DEVICE
-#include <cute/numeric/integral_constant.hpp>  // cute::false_type, cute::true_type
-#include <cute/numeric/math.hpp>               // cute::signum
-#include <cute/util/type_traits.hpp>           // __CUTE_REQUIRES
-
-namespace cute
-{
-
-/** Compile-time rational arithmetic type.
- * Like cute::C for std::integral_constant, cute::R for std::ratio has a short name
- *   for error messages and compile times.
- * The static data members @a num and @a den represent the reduced numerator and denominator
- *   of the rational value. Thus, two cute::R types with different @a n or @a d are distinct types
- *   even if they represent the same rational value.
- * A cute::R exposes the reduced canonical type via its ::type member.
- *   That is, cute::R<3,6>::type is cute::R<1,2> and cute::R<6,3>::type is cute::C<2>.
- * A cute::R<n,d>::value can be used much like any other trait::value. It can be involved in
- *   arithmetic expressions (according to the operator-overloads for cute::C and cute::R,
- *   though these may be incomplete) but with a potential rational value rather than an integral value.
- */
-template <auto n, auto d>
-class R {
-  static_assert(d != 0);
-  static constexpr auto an  = abs(n);
-  static constexpr auto ad  = abs(d);
-  static constexpr auto g   = gcd(an, ad);
-
- public:
-  static constexpr auto num = signum(n) * signum(d) * an / g;
-  static constexpr auto den =                         ad / g;
-  // RI: den >= 1 && gcd(abs(num),den) == 1
-  using type = typename conditional<num == 0 || den == 1, C<num>, R<num,den>>::type;
-};
-
-template <class T>
-struct is_ratio : false_type {};
-template <auto n, auto d>
-struct is_ratio<R<n,d>> : true_type {};
-
-template <auto a, auto b>
-CUTE_HOST_DEVICE constexpr
-typename R<a,b>::type
-ratio(C<a>, C<b>) {
-  return {};
-}
-
-template <auto a, auto b, auto c>
-CUTE_HOST_DEVICE constexpr
-typename R<a*c,b>::type
-ratio(C<a>, R<b,c>) {
-  return {};
-}
-
-template <auto a, auto b, auto c>
-CUTE_HOST_DEVICE constexpr
-typename R<b,a*c>::type
-ratio(R<b,c>, C<a>) {
-  return {};
-}
-
-template <auto a, auto b, auto c, auto d>
-CUTE_HOST_DEVICE constexpr
-typename R<a*d,b*c>::type
-ratio(R<a,b>, R<c,d>) {
-  return {};
-}
-
-//
-// Non-reduced ratio implementations
-//
-
-template <auto a, auto b>
-CUTE_HOST_DEVICE constexpr
-R<a,b>
-nratio(C<a>, C<b>) {
-  return {};
-}
-
-template <auto a, auto b, auto c>
-CUTE_HOST_DEVICE constexpr
-R<a*c,b>
-nratio(C<a>, R<b,c>) {
-  return {};
-}
-
-template <auto a, auto b, auto c>
-CUTE_HOST_DEVICE constexpr
-R<b,a*c>
-nratio(R<b,c>, C<a>) {
-  return {};
-}
-
-template <auto a, auto b, auto c, auto d>
-CUTE_HOST_DEVICE constexpr
-R<a*d,b*c>
-nratio(R<a,b>, R<c,d>) {
-  return {};
-}
-
-//
-// Operators
-//
-
-template <auto a, auto b, auto x, auto y>
-CUTE_HOST_DEVICE constexpr
-typename R<a*x,b*y>::type
-operator*(R<a,b>, R<x,y>) {
-  return {};
-}
-
-template <auto a, auto b, auto c>
-CUTE_HOST_DEVICE constexpr
-typename R<a*c,b>::type
-operator*(R<a,b>, C<c>) {
-  return {};
-}
-
-template <auto c, auto a, auto b>
-CUTE_HOST_DEVICE constexpr
-typename R<a*c,b>::type
-operator*(C<c>, R<a,b>) {
-  return {};
-}
-
-template <auto c, auto a, auto b>
-CUTE_HOST_DEVICE constexpr
-typename R<c*b,a>::type
-operator/(C<c>, R<a,b>) {
-  return {};
-}
-
-// Product with dynamic type needs to produce an integer...
-template <class C, auto a, auto b,
-          __CUTE_REQUIRES(cute::is_std_integral<C>::value)>
-CUTE_HOST_DEVICE constexpr
-auto
-operator*(C const& c, R<a,b>) {
-  return c * R<a,b>::num / R<a,b>::den;
-}
-
-// Product with dynamic type needs to produce an integer...
-template <auto a, auto b, class C,
-          __CUTE_REQUIRES(cute::is_std_integral<C>::value)>
-CUTE_HOST_DEVICE constexpr
-auto
-operator*(R<a,b>, C const& c) {
-  return c * R<a,b>::num / R<a,b>::den;
-}
-
-template <auto a, auto b, auto x, auto y>
-CUTE_HOST_DEVICE constexpr
-typename R<a*y+b*x, b*y>::type
-operator+(R<a,b>, R<x,y>) {
-  return {};
-}
-
-template <auto a, auto b, auto c>
-CUTE_HOST_DEVICE constexpr
-typename R<a+c*b,b>::type
-operator+(R<a,b>, C<c>) {
-  return {};
-}
-
-template <auto c, auto a, auto b>
-CUTE_HOST_DEVICE constexpr
-typename R<a+c*b,b>::type
-operator+(C<c>, R<a,b>) {
-  return {};
-}
-
-template <auto a, auto b, auto x, auto y>
-CUTE_HOST_DEVICE constexpr
-bool_constant<R<a,b>::num == R<x,y>::num && R<a,b>::den == R<x,y>::den>
-operator==(R<a,b>, R<x,y>) {
-  return {};
-}
-
-template <auto a, auto b, auto c>
-CUTE_HOST_DEVICE constexpr
-bool_constant<R<a,b>::num == c && R<a,b>::den == 1>
-operator==(R<a,b>, C<c>) {
-  return {};
-}
-
-template <auto c, auto a, auto b>
-CUTE_HOST_DEVICE constexpr
-bool_constant<R<a,b>::num == c && R<a,b>::den == 1>
-operator==(C<c>, R<a,b>) {
-  return {};
-}
-
-template <auto a, auto b>
-CUTE_HOST_DEVICE constexpr
-typename R<abs(a),abs(b)>::type
-abs(R<a,b>) {
-  return {};
-}
-
-template <auto a, auto b>
-CUTE_HOST_DEVICE constexpr
-int32_t
-log_2(R<a,b>) {
-  static_assert(R<a,b>::num > 0);
-  static_assert(R<a,b>::den > 0);
-  return log_2(static_cast<uint32_t>(R<a,b>::num)) - log_2(static_cast<uint32_t>(R<a,b>::den));
-}
-
-// @return A non-reduced ratio cute::R of the Trait0::value / Trait1::value
-template <class Trait0, class Trait1>
-CUTE_HOST_DEVICE constexpr
-auto
-trait_ratio(Trait0, Trait1) {
-  return nratio(static_value<Trait0>(), static_value<Trait1>());
-}
-
-//
-// Display utilities
-//
-
-template <auto a, auto b>
-CUTE_HOST_DEVICE void print(R<a,b>) {
-  print(C<a>{}); print("/"); print(C<b>{});
-}
-
-#if !defined(__CUDACC_RTC__)
-template <auto a, auto b>
-CUTE_HOST std::ostream& operator<<(std::ostream& os, R<a,b>) {
-  return os << "_" << C<a>{} << "/" << C<b>{};
-}
-#endif
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/numeric/math.hpp b/lightllm-kernel/cutlass/include/cute/numeric/math.hpp
deleted file mode 100755
index e493a3a95..000000000
--- a/lightllm-kernel/cutlass/include/cute/numeric/math.hpp
+++ /dev/null
@@ -1,356 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>            // CUTE_HOST_DEVICE
-#include <cute/util/type_traits.hpp>  // __CUTE_REQUIRES
-
-#include <cutlass/fast_math.h>
-
-namespace cute
-{
-
-//
-// Common Operations
-//
-
-template <class T, class U,
-          __CUTE_REQUIRES(is_arithmetic<T>::value &&
-                          is_arithmetic<U>::value)>
-CUTE_HOST_DEVICE constexpr
-auto
-max(T const& t, U const& u) {
-  return t < u ? u : t;
-}
-
-template <class T, class U,
-          __CUTE_REQUIRES(is_arithmetic<T>::value &&
-                          is_arithmetic<U>::value)>
-CUTE_HOST_DEVICE constexpr
-auto
-min(T const& t, U const& u) {
-  return t < u ? t : u;
-}
-
-template <class T,
-          __CUTE_REQUIRES(is_arithmetic<T>::value)>
-CUTE_HOST_DEVICE constexpr
-auto
-abs(T const& t) {
-  if constexpr (is_signed<T>::value) {
-    return t < T(0) ? -t : t;
-  } else {
-    return t;
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-// Returns 1 if x > 0, -1 if x < 0, and 0 if x is zero.
-template <class T,
-          __CUTE_REQUIRES(is_arithmetic<T>::value)>
-CUTE_HOST_DEVICE constexpr
-int
-signum(T const& x) {
-  if constexpr (is_signed<T>::value) {
-    return (T(0) < x) - (x < T(0));
-  } else {
-    return T(0) < x;
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-//
-// C++17 <numeric> operations
-//
-
-// Greatest common divisor of two positive integers
-template <class T, class U,
-          __CUTE_REQUIRES(is_std_integral<T>::value &&
-                          is_std_integral<U>::value)>
-CUTE_HOST_DEVICE constexpr
-cute::common_type_t<T, U>
-gcd(T t, U u) {
-  while (true) {
-    if (t == 0) { return u; }
-    u %= t;
-    if (u == 0) { return t; }
-    t %= u;
-  }
-}
-
-// Least common multiple of two positive integers
-template <class T, class U,
-          __CUTE_REQUIRES(is_std_integral<T>::value &&
-                          is_std_integral<U>::value)>
-CUTE_HOST_DEVICE constexpr
-cute::common_type_t<T, U>
-lcm(T const& t, U const& u) {
-  return (t / gcd(t,u)) * u;
-}
-
-//
-// C++20 <bit> operations
-//
-
-// Checks if a number is an integral power of two
-template <class T>
-CUTE_HOST_DEVICE constexpr
-bool
-has_single_bit(T x) {
-  return x != 0 && (x & (x - 1)) == 0;
-}
-
-// Smallest number of bits needed to represent the given value
-//   For x == 0, this is 0
-//   For x != 0, this is 1 + floor(log2(x))
-// bit_width( 0b0000 ) = 0
-// bit_width( 0b0001 ) = 1
-// bit_width( 0b0010 ) = 2
-// bit_width( 0b0011 ) = 2
-// bit_width( 0b0100 ) = 3
-// bit_width( 0b0101 ) = 3
-// bit_width( 0b0110 ) = 3
-// bit_width( 0b0111 ) = 3
-template <class T>
-CUTE_HOST_DEVICE constexpr
-int
-bit_width(T x) {
-  static_assert(is_unsigned<T>::value, "Only to be used for unsigned types.");
-  constexpr int N = (numeric_limits<T>::digits == 64 ? 6 :
-                    (numeric_limits<T>::digits == 32 ? 5 :
-                    (numeric_limits<T>::digits == 16 ? 4 :
-                    (numeric_limits<T>::digits ==  8 ? 3 : (assert(false),0)))));
-  T r = 0;
-  for (int i = N - 1; i >= 0; --i) {
-    T shift = (x > ((T(1) << (T(1) << i))-1)) << i;
-    x >>= shift;
-    r  |= shift;
-  }
-  return r + (x != 0);
-}
-
-// Smallest integral power of two not less than the given value
-// bit_ceil( 0b00000000 ) = 0b00000001
-// bit_ceil( 0b00000001 ) = 0b00000001
-// bit_ceil( 0b00000010 ) = 0b00000010
-// bit_ceil( 0b00000011 ) = 0b00000100
-// bit_ceil( 0b00000100 ) = 0b00000100
-// bit_ceil( 0b00000101 ) = 0b00001000
-// bit_ceil( 0b00000110 ) = 0b00001000
-// bit_ceil( 0b00000111 ) = 0b00001000
-// bit_ceil( 0b00001000 ) = 0b00001000
-// bit_ceil( 0b00001001 ) = 0b00010000
-template <class T>
-CUTE_HOST_DEVICE constexpr
-T
-bit_ceil(T x) {
-  return x == 0 ? T(1) : (T(1) << bit_width(x - 1));
-}
-
-// Largest integral power of two not greater than the given value
-// bit_floor( 0b00000000 ) = 0b00000000
-// bit_floor( 0b00000001 ) = 0b00000001
-// bit_floor( 0b00000010 ) = 0b00000010
-// bit_floor( 0b00000011 ) = 0b00000010
-// bit_floor( 0b00000100 ) = 0b00000100
-// bit_floor( 0b00000101 ) = 0b00000100
-// bit_floor( 0b00000110 ) = 0b00000100
-// bit_floor( 0b00000111 ) = 0b00000100
-// bit_floor( 0b00001000 ) = 0b00001000
-// bit_floor( 0b00001001 ) = 0b00001000
-template <class T>
-CUTE_HOST_DEVICE constexpr
-T
-bit_floor(T x) {
-  return x == 0 ? 0 : (T(1) << (bit_width(x) - 1));
-}
-
-template <class T>
-CUTE_HOST_DEVICE constexpr T rotl(T x, int s);
-template <class T>
-CUTE_HOST_DEVICE constexpr T rotr(T x, int s);
-
-// Computes the result of circular bitwise left-rotation
-template <class T>
-CUTE_HOST_DEVICE constexpr
-T
-rotl(T x, int s) {
-  constexpr int N = numeric_limits<T>::digits;
-  return static_cast<T>(s == 0 ? x : s > 0 ? (x << s) | (x >> (N - s)) : rotr(x, -s));
-}
-
-// Computes the result of circular bitwise right-rotation
-template <class T>
-CUTE_HOST_DEVICE constexpr
-T
-rotr(T x, int s) {
-  constexpr int N = numeric_limits<T>::digits;
-  return static_cast<T>(s == 0 ? x : s > 0 ? (x >> s) | (x << (N - s)) : rotl(x, -s));
-}
-
-// Counts the number of consecutive 0 bits, starting from the most significant bit
-// countl_zero( 0b00000000 ) = 8
-// countl_zero( 0b11111111 ) = 0
-// countl_zero( 0b00011100 ) = 3
-template <class T>
-CUTE_HOST_DEVICE constexpr
-int
-countl_zero(T x) {
-  return numeric_limits<T>::digits - bit_width(x);
-}
-
-// Counts the number of consecutive 1 bits, starting from the most significant bit
-// countl_one( 0b00000000 ) = 0
-// countl_one( 0b11111111 ) = 8
-// countl_one( 0b11100011 ) = 3
-template <class T>
-CUTE_HOST_DEVICE constexpr
-int
-countl_one(T x) {
-  return countl_zero(~x);
-}
-
-// Counts the number of consecutive 0 bits, starting from the least significant bit
-// countr_zero( 0b00000000 ) = 8
-// countr_zero( 0b11111111 ) = 0
-// countr_zero( 0b00011100 ) = 2
-template <class T>
-CUTE_HOST_DEVICE constexpr
-int
-countr_zero(T x) {
-  return x == 0 ? numeric_limits<T>::digits : bit_width(T(x & T(-x))) - 1;  // bit_width of the LSB
-}
-
-// Counts the number of consecutive 1 bits, starting from the least significant bit
-// countr_one( 0b00000000 ) = 0
-// countr_one( 0b11111111 ) = 8
-// countr_one( 0b11100011 ) = 2
-template <class T>
-CUTE_HOST_DEVICE constexpr
-int
-countr_one(T x) {
-  return countr_zero(~x);
-}
-
-// Counts the number of 1 bits in an unsigned integer
-// popcount( 0b00000000 ) = 0
-// popcount( 0b11111111 ) = 8
-// popcount( 0b00011101 ) = 4
-template <class T>
-CUTE_HOST_DEVICE constexpr
-int
-popcount(T x) {
-  int c = 0;
-  while (x) {
-    ++c;
-    x &= x - 1; // clear the least significant bit set
-  }
-  return c;
-}
-
-//
-// Custom operations
-//
-
-// Computes the result of bitwise left-shift
-template <class T>
-CUTE_HOST_DEVICE constexpr
-auto
-shiftl(T x, int s) {
-  return s >= 0 ? (x << s) : (x >> -s);
-}
-
-// Computes the result of bitwise right-shift
-template <class T>
-CUTE_HOST_DEVICE constexpr
-auto
-shiftr(T x, int s) {
-  return s >= 0 ? (x >> s) : (x << -s);
-}
-
-// Safe divide
-// @pre t % u == 0
-// @result t / u
-template <class T, class U,
-          __CUTE_REQUIRES(is_std_integral<T>::value &&
-                          is_std_integral<U>::value)>
-CUTE_HOST_DEVICE constexpr
-auto
-safe_div(T const& t, U const& u) {
-  //assert(t % u == 0);
-  return t / u;
-}
-
-/**
- * log2 computation
- */
-
-template <class T>
-CUTE_HOST_DEVICE constexpr
-int32_t
-log_2(T x) {
-  assert(x > 0);
-  static_assert(is_unsigned<T>::value, "Only to be used for unsigned integral types.");
-  return static_cast<int32_t>(bit_width(x)) - 1;
-}
-
-template <class IntDiv, class IntMod>
-struct DivModReturnType {
-  IntDiv div_;
-  IntMod mod_;
-  CUTE_HOST_DEVICE constexpr
-  DivModReturnType(IntDiv const& div, IntMod const& mod) : div_(div), mod_(mod) {}
-};
-
-// General divmod
-template <class CInt0, class CInt1>
-CUTE_HOST_DEVICE constexpr
-auto
-divmod(CInt0 const& a, CInt1 const& b) {
-  return DivModReturnType{a / b, a % b};
-}
-
-// Specialized function with fastDivmod input
-template <class CInt>
-CUTE_HOST_DEVICE constexpr
-auto
-divmod(CInt const& a, cutlass::FastDivmod const& b) {
-  using val_div_type = typename cutlass::FastDivmod::value_div_type;
-  using val_mod_type = typename cutlass::FastDivmod::value_mod_type;
-  val_div_type div = 0;
-  val_mod_type mod = 0;
-  b(div, mod, a);
-  return DivModReturnType{div, mod};
-}
-
-} // namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/numeric/numeric_types.hpp b/lightllm-kernel/cutlass/include/cute/numeric/numeric_types.hpp
deleted file mode 100755
index 07444331f..000000000
--- a/lightllm-kernel/cutlass/include/cute/numeric/numeric_types.hpp
+++ /dev/null
@@ -1,135 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>          // CUTE_HOST_DEVICE
-#include <cute/numeric/int.hpp>     // cute::int2_t, cute::int4_t, etc
-
-#include <cutlass/numeric_size.h>   // cutlass::sizeof_bits
-#include <cutlass/numeric_types.h>  // cutlass::float_e4m3_t, cutlass::float_e5m2_t, etc
-
-namespace cute {
-
-template <typename T>
-struct sizeof_bits : public cutlass::sizeof_bits<T> {};
-
-// DO NOT change auto to int, sizeof_bits<sparse_elem> use integral_ratio instead of int 
-template <class T>
-static constexpr auto sizeof_bits_v = sizeof_bits<T>::value;
-
-using cutlass::bits_to_bytes;
-
-using cutlass::is_subbyte;
-
-template <class T>
-static constexpr auto is_subbyte_v = is_subbyte<T>::value;
-
-using cutlass::half_t;
-using cutlass::bfloat16_t;
-
-using cutlass::tfloat32_t;
-
-// Umbrella floating-point 8-bit data type : type_erased_dynamic_float8_t
-// This umbrella datatype can be enabled when a user provides a specific
-// datatype in runtime argument list.
-using cutlass::type_erased_dynamic_float8_t;
-using cutlass::float_e4m3_t;
-using cutlass::float_e5m2_t;
-
-using cutlass::uint1b_t;
-using cutlass::int2b_t;
-using cutlass::uint2b_t;
-using cutlass::int4b_t;
-using cutlass::uint4b_t;
-using cutlass::bin1_t;
-
-
-//
-// Print utility
-//
-
-CUTE_HOST_DEVICE
-void
-print(half_t a) {
-  printf("%f", static_cast<float>(a));
-}
-
-CUTE_HOST_DEVICE
-void
-print(bfloat16_t a) {
-  printf("%f", static_cast<float>(a));
-}
-
-
-CUTE_HOST_DEVICE
-void
-print(tfloat32_t a) {
-  printf("%f", static_cast<float>(a));
-}
-
-CUTE_HOST_DEVICE
-void
-print(float_e4m3_t a) {
-  printf("%f", static_cast<float>(a));
-}
-
-CUTE_HOST_DEVICE
-void
-print(float_e5m2_t a) {
-  printf("%f", static_cast<float>(a));
-}
-
-CUTE_HOST_DEVICE void
-pretty_print(bfloat16_t v) {
-  printf("%*.2f", 8, float(v));
-}
-
-CUTE_HOST_DEVICE void
-pretty_print(half_t v) {
-  printf("%*.2f", 8, float(v));
-}
-
-CUTE_HOST_DEVICE void
-pretty_print(tfloat32_t v) {
-  printf("%*.2e", 10, static_cast<float>(v));
-}
-
-CUTE_HOST_DEVICE void
-pretty_print(float_e4m3_t t) {
-  printf("%*.2f", 8, static_cast<float>(t));
-}
-
-CUTE_HOST_DEVICE void
-pretty_print(float_e5m2_t t) {
-  printf("%*.2f", 8, static_cast<float>(t));
-}
-
-} // namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/numeric/real.hpp b/lightllm-kernel/cutlass/include/cute/numeric/real.hpp
deleted file mode 100755
index 4ce58dfa1..000000000
--- a/lightllm-kernel/cutlass/include/cute/numeric/real.hpp
+++ /dev/null
@@ -1,74 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>
-
-namespace cute
-{
-
-/// Generic add
-template <class C, class A, class B>
-CUTE_HOST_DEVICE constexpr
-void
-add(C& c, A const& a, B const& b)
-{
-  c = a + b;
-}
-
-/// Generic multiply
-template <class C, class A, class B>
-CUTE_HOST_DEVICE constexpr
-void
-mul(C& c, A const& a, B const& b)
-{
-  c = a * b;
-}
-
-/// Generic fused multiply-add
-template <class D, class A, class B, class C>
-CUTE_HOST_DEVICE constexpr
-void
-fma(D& d, A const& a, B const& b, C const& c)
-{
-  d = a * b + c;
-}
-
-/// Fused multiply-add for triplets
-template <class A, class B, class C>
-CUTE_HOST_DEVICE constexpr
-void
-fma(A const& a, B const& b, C& c)
-{
-  return fma(c, a, b, c);
-}
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/pointer.hpp b/lightllm-kernel/cutlass/include/cute/pointer.hpp
deleted file mode 100755
index 4cfa129cc..000000000
--- a/lightllm-kernel/cutlass/include/cute/pointer.hpp
+++ /dev/null
@@ -1,322 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>                     // CUTE_HOST_DEVICE
-#include <cute/pointer_base.hpp>               // cute::iter_adaptor
-#include <cute/pointer_sparse.hpp>
-#include <cute/container/array_subbyte.hpp>    // cute::subbyte_iterator
-#include <cute/numeric/integral_constant.hpp>  // cute::true_type, cute::false_type
-#include <cute/numeric/numeric_types.hpp>      // sizeof_bits
-
-namespace cute
-{
-
-//
-// recast_ptr<T> -- Create an iterator over values of type T.
-// For most types this will simply be T*, but certain types require more care.
-// Subbyte Types: uint2_t, uint4_t, etc
-//   Requires construction of a subbyte_iterator<T> in order to properly
-//   resolve each element in byte-addressed memory.
-// Sparse Types: sparse_elem<int S, class T>
-//   A type that holds one physical element meant to represent S number of logical elements.
-//   Requires construction of a sparse_ptr that emulates access to the S logical elements.
-//
-
-template <class NewT>
-CUTE_HOST_DEVICE constexpr
-auto
-recast_ptr(void* ptr)
-{
-  if constexpr (is_sparse<NewT>::value) {
-    constexpr int sparsity = NewT::sparsity;
-    NewT* p = reinterpret_cast<NewT*>(ptr);
-    return make_sparse_ptr<sparsity>(p);
-  } else
-  if constexpr (cute::is_subbyte_v<NewT>) {
-    return subbyte_iterator<NewT>(ptr);
-  } else {
-    return reinterpret_cast<NewT*>(ptr);
-  }
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <class NewT>
-CUTE_HOST_DEVICE constexpr
-auto
-recast_ptr(void const* ptr)
-{
-  if constexpr (is_sparse<NewT>::value) {
-    constexpr int sparsity = NewT::sparsity;
-    NewT const* p = reinterpret_cast<NewT const*>(ptr);
-    return make_sparse_ptr<sparsity>(p);
-  } else
-  if constexpr (cute::is_subbyte_v<NewT>) {
-    return subbyte_iterator<NewT const>(ptr);
-  } else {
-    return reinterpret_cast<NewT const*>(ptr);
-  }
-  CUTE_GCC_UNREACHABLE;
-}
-
-// Disambiguate nullptr
-template <class NewT>
-CUTE_HOST_DEVICE constexpr
-auto
-recast_ptr(decltype(nullptr)) {   // nullptr_t
-  return recast_ptr<NewT>(static_cast<NewT*>(nullptr));
-}
-
-//
-// gmem_ptr
-//
-
-template <class P>
-struct gmem_ptr : iter_adaptor<P, gmem_ptr<P>> {
-  using iter_adaptor<P, gmem_ptr<P>>::iter_adaptor;
-};
-
-template <class T, class = void>
-struct is_gmem : false_type {};
-template <class P>                     // Found the gmem
-struct is_gmem<gmem_ptr<P>> : true_type {};
-template <class P>                     // Recurse on ::iterator, if possible
-struct is_gmem<P, void_t<typename P::iterator>> : is_gmem<typename P::iterator> {};
-template <class P>
-constexpr bool is_gmem_v = is_gmem<P>::value;
-
-// Idempotent gmem tag on an iterator
-template <class Iterator>
-CUTE_HOST_DEVICE constexpr
-auto
-make_gmem_ptr(Iterator iter) {
-  if constexpr (is_gmem<Iterator>::value) {
-    return iter;
-  } else {
-    return gmem_ptr<Iterator>{iter};
-  }
-  CUTE_GCC_UNREACHABLE;
-}
-
-// Explicitly typed construction from a raw pointer
-template <class T>
-CUTE_HOST_DEVICE constexpr
-auto
-make_gmem_ptr(void* ptr) {
-  return make_gmem_ptr(recast_ptr<T>(ptr));
-}
-
-// Explicitly typed construction from a raw pointer
-template <class T>
-CUTE_HOST_DEVICE constexpr
-auto
-make_gmem_ptr(void const* ptr) {
-  return make_gmem_ptr(recast_ptr<T const>(ptr));
-}
-
-// nullptr_t overload for make_gmem_ptr<float>(nullptr) disambiguation
-template <class T>
-CUTE_HOST_DEVICE constexpr
-auto
-make_gmem_ptr(decltype(nullptr)) { // nullptr_t
-  return make_gmem_ptr(recast_ptr<T>(nullptr));
-}
-
-// The gmem tag is invariant over type-recast
-template <class NewT, class P>
-CUTE_HOST_DEVICE constexpr
-auto
-recast_ptr(gmem_ptr<P> const& ptr) {
-  return make_gmem_ptr(recast_ptr<NewT>(ptr.get()));
-}
-
-//
-// smem_ptr
-//
-
-template <class P>
-struct smem_ptr : iter_adaptor<P, smem_ptr<P>> {
-  using iter_adaptor<P, smem_ptr<P>>::iter_adaptor;
-};
-
-template <class T, class = void>
-struct is_smem : false_type {};
-template <class P>                     // Found the smem
-struct is_smem<smem_ptr<P>> : true_type {};
-template <class P>                     // Recurse on ::iterator, if possible
-struct is_smem<P, void_t<typename P::iterator>> : is_smem<typename P::iterator> {};
-template <class P>
-constexpr bool is_smem_v = is_smem<P>::value;
-
-// Idempotent smem tag on an iterator
-template <class Iterator>
-CUTE_HOST_DEVICE constexpr
-auto
-make_smem_ptr(Iterator iter) {
-  if constexpr (is_smem<Iterator>::value) {
-    return iter;
-  } else {
-    return smem_ptr<Iterator>{iter};
-  }
-  CUTE_GCC_UNREACHABLE;
-}
-
-// Make a smem swizzle pointer, common operation
-template <class Iterator, class Swizzle>
-CUTE_HOST_DEVICE constexpr
-auto
-make_smem_ptr(Iterator ptr, Swizzle sw)
-{
-  return make_swizzle_ptr(make_smem_ptr(ptr), sw);
-}
-
-// Explicitly typed construction from a raw pointer
-template <class T>
-CUTE_HOST_DEVICE constexpr
-auto
-make_smem_ptr(void* ptr) {
-  return make_smem_ptr(recast_ptr<T>(ptr));
-}
-
-// Explicitly typed construction from a raw pointer
-template <class T>
-CUTE_HOST_DEVICE constexpr
-auto
-make_smem_ptr(void const* ptr) {
-  return make_smem_ptr(recast_ptr<T const>(ptr));
-}
-
-// The smem tag is invariant over type-recast
-template <class NewT, class P>
-CUTE_HOST_DEVICE constexpr
-auto
-recast_ptr(smem_ptr<P> const& ptr) {
-  return make_smem_ptr(recast_ptr<NewT>(ptr.get()));
-}
-
-//
-// rmem_ptr
-//
-
-template <class P>
-struct rmem_ptr : iter_adaptor<P, rmem_ptr<P>> {
-  using iter_adaptor<P, rmem_ptr<P>>::iter_adaptor;
-};
-
-// Anything that is not gmem or smem is rmem
-template <class T, class = void>
-struct is_rmem : bool_constant<not (is_gmem<T>::value || is_smem<T>::value)> {};
-template <class P>
-struct is_rmem<rmem_ptr<P>> : true_type {};
-template <class P>
-constexpr bool is_rmem_v = is_rmem<P>::value;
-
-// Idempotent rmem tag on an iterator
-template <class Iterator>
-CUTE_HOST_DEVICE constexpr
-auto
-make_rmem_ptr(Iterator iter) {
-  if constexpr (is_rmem<Iterator>::value) {
-    return iter;
-  } else {
-    return rmem_ptr<Iterator>{iter};
-  }
-  CUTE_GCC_UNREACHABLE;
-}
-
-// Explicitly typed construction from a raw pointer
-template <class T>
-CUTE_HOST_DEVICE constexpr
-auto
-make_rmem_ptr(void* ptr) {
-  return make_rmem_ptr(recast_ptr<T>(ptr));
-}
-
-// Explicitly typed construction from a raw pointer
-template <class T>
-CUTE_HOST_DEVICE constexpr
-auto
-make_rmem_ptr(void const* ptr) {
-  return make_rmem_ptr(recast_ptr<T const>(ptr));
-}
-
-// The rmem tag is invariant over type-recast
-template <class NewT, class P>
-CUTE_HOST_DEVICE constexpr
-auto
-recast_ptr(rmem_ptr<P> const& ptr) {
-  return make_rmem_ptr(recast_ptr<NewT>(ptr.get()));
-}
-
-//
-// Display utilities
-//
-
-template <class T>
-CUTE_HOST_DEVICE void print(gmem_ptr<T> ptr)
-{
-  printf("gmem_"); print(ptr.get());
-}
-
-template <class T>
-CUTE_HOST_DEVICE void print(smem_ptr<T> ptr)
-{
-  printf("smem_"); print(ptr.get());
-}
-
-template <class T>
-CUTE_HOST_DEVICE void print(rmem_ptr<T> ptr)
-{
-  printf("rmem_"); print(ptr.get());
-}
-
-#if !defined(__CUDACC_RTC__)
-template <class T>
-CUTE_HOST std::ostream& operator<<(std::ostream& os, gmem_ptr<T> ptr)
-{
-  return os << "gmem_[" << int(sizeof_bits<iter_value_t<T>>::value) << "b]";
-}
-
-template <class T>
-CUTE_HOST std::ostream& operator<<(std::ostream& os, smem_ptr<T> ptr)
-{
-  return os << "smem_[" << int(sizeof_bits<iter_value_t<T>>::value) << "b]";
-}
-
-template <class T>
-CUTE_HOST std::ostream& operator<<(std::ostream& os, rmem_ptr<T> ptr)
-{
-  return os << "rmem_[" << int(sizeof_bits<iter_value_t<T>>::value) << "b]";
-}
-
-#endif // !defined(__CUDACC_RTC__)
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/pointer_base.hpp b/lightllm-kernel/cutlass/include/cute/pointer_base.hpp
deleted file mode 100755
index 90ca0ceb6..000000000
--- a/lightllm-kernel/cutlass/include/cute/pointer_base.hpp
+++ /dev/null
@@ -1,246 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>                 // CUTE_HOST_DEVICE
-#include <cute/numeric/numeric_types.hpp>  // cute::sizeof_bits
-#include <cute/util/type_traits.hpp>       // cute::declval, cute::void_t, etc
-
-namespace cute
-{
-
-//
-// C++20 <iterator> iterator_traits
-//
-
-namespace detail {
-// Default reference type of an iterator
-template <class T, class = void>
-struct iter_ref { using type = decltype(*declval<T&>()); };
-// Prefer to propagate ::reference
-template <class T>
-struct iter_ref<T,void_t<typename T::reference>> { using type = typename T::reference; };
-} // end namespace detail
-
-template <class T>
-using iter_reference = detail::iter_ref<T>;
-template <class T>
-using iter_reference_t = typename iter_reference<T>::type;
-
-namespace detail {
-// Default element_type of an iterator
-template <class T, class = void>
-struct iter_e { using type = remove_reference_t<typename iter_ref<T>::type>; };
-// Prefer to propagate ::element_type
-template <class T>
-struct iter_e<T,void_t<typename T::element_type>> { using type = typename T::element_type; };
-} // end namespace detail
-
-template <class T>
-using iter_element = detail::iter_e<T>;
-template <class T>
-using iter_element_t = typename iter_element<T>::type;
-
-namespace detail {
-// Default value_type of an iterator
-template <class T, class = void>
-struct iter_v { using type = remove_cv_t<typename iter_e<T>::type>; };
-// Prefer to propagate ::value_type
-template <class T>
-struct iter_v<T,void_t<typename T::value_type>> { using type = typename T::value_type; };
-} // end namespace detail
-
-template <class T>
-using iter_value = detail::iter_v<T>;
-template <class T>
-using iter_value_t = typename iter_value<T>::type;
-
-template <class Iterator>
-struct iterator_traits {
-  using reference    = iter_reference_t<Iterator>;
-  using element_type = iter_element_t<Iterator>;
-  using value_type   = iter_value_t<Iterator>;
-};
-
-//
-// has_dereference to determine if a type is an iterator concept
-//
-
-namespace detail {
-template <class T, class = void>
-struct has_dereference : CUTE_STL_NAMESPACE::false_type {};
-template <class T>
-struct has_dereference<T, void_t<decltype(*declval<T&>())>> : CUTE_STL_NAMESPACE::true_type {};
-} // end namespace detail
-
-template <class T>
-using has_dereference = detail::has_dereference<T>;
-
-//
-// raw_pointer_cast
-//
-
-template <class T>
-CUTE_HOST_DEVICE constexpr
-T*
-raw_pointer_cast(T* ptr) {
-  return ptr;
-}
-
-//
-// A very simplified iterator adaptor.
-// Derived classed may override methods, but be careful to reproduce interfaces exactly.
-// Clients should never have an instance of this class. Do not write methods that take this as a param.
-//
-
-template <class Iterator, class DerivedType>
-struct iter_adaptor
-{
-  using iterator     = Iterator;
-  using reference    = typename iterator_traits<iterator>::reference;
-  using element_type = typename iterator_traits<iterator>::element_type;
-  using value_type   = typename iterator_traits<iterator>::value_type;
-
-  iterator ptr_;
-
-  CUTE_HOST_DEVICE constexpr
-  iter_adaptor(iterator ptr = {}) : ptr_(ptr) {}
-
-  CUTE_HOST_DEVICE constexpr
-  reference operator*() const { return *ptr_; }
-
-  template <class Index>
-  CUTE_HOST_DEVICE constexpr
-  reference operator[](Index const& i) const { return ptr_[i]; }
-
-  template <class Index>
-  CUTE_HOST_DEVICE constexpr
-  DerivedType operator+(Index const& i) const { return {ptr_ + i}; }
-
-  CUTE_HOST_DEVICE constexpr
-  iterator get() const { return ptr_; }
-
-  CUTE_HOST_DEVICE constexpr
-  friend bool operator==(DerivedType const& x, DerivedType const& y) { return x.ptr_ == y.ptr_; }
-  CUTE_HOST_DEVICE constexpr
-  friend bool operator!=(DerivedType const& x, DerivedType const& y) { return x.ptr_ != y.ptr_; }
-  CUTE_HOST_DEVICE constexpr
-  friend bool operator< (DerivedType const& x, DerivedType const& y) { return x.ptr_ <  y.ptr_; }
-  CUTE_HOST_DEVICE constexpr
-  friend bool operator<=(DerivedType const& x, DerivedType const& y) { return x.ptr_ <= y.ptr_; }
-  CUTE_HOST_DEVICE constexpr
-  friend bool operator> (DerivedType const& x, DerivedType const& y) { return x.ptr_ >  y.ptr_; }
-  CUTE_HOST_DEVICE constexpr
-  friend bool operator>=(DerivedType const& x, DerivedType const& y) { return x.ptr_ >= y.ptr_; }
-};
-
-template <class I, class D>
-CUTE_HOST_DEVICE constexpr
-auto
-raw_pointer_cast(iter_adaptor<I,D> const& x) {
-  return raw_pointer_cast(x.ptr_);
-}
-
-//
-// counting iterator -- quick and dirty
-//
-
-template <class T = int>
-struct counting_iterator
-{
-  using index_type = T;
-  using value_type = T;
-  using reference  = T;
-
-  index_type n_;
-
-  CUTE_HOST_DEVICE constexpr
-  counting_iterator(index_type n = 0) : n_(n) {}
-
-  CUTE_HOST_DEVICE constexpr
-  index_type operator*() const { return n_; }
-
-  CUTE_HOST_DEVICE constexpr
-  index_type operator[](index_type i) const { return n_ + i; }
-
-  CUTE_HOST_DEVICE constexpr
-  counting_iterator operator+(index_type i) const { return {n_ + i}; }
-  CUTE_HOST_DEVICE constexpr
-  counting_iterator& operator++() { ++n_; return *this; }
-  CUTE_HOST_DEVICE constexpr
-  counting_iterator operator++(int) { counting_iterator ret = *this; ++n_; return ret; }
-
-  CUTE_HOST_DEVICE constexpr
-  friend bool operator==(counting_iterator const& x, counting_iterator const& y) { return x.n_ == y.n_; }
-  CUTE_HOST_DEVICE constexpr
-  friend bool operator!=(counting_iterator const& x, counting_iterator const& y) { return x.n_ != y.n_; }
-  CUTE_HOST_DEVICE constexpr
-  friend bool operator< (counting_iterator const& x, counting_iterator const& y) { return x.n_ <  y.n_; }
-  CUTE_HOST_DEVICE constexpr
-  friend bool operator<=(counting_iterator const& x, counting_iterator const& y) { return x.n_ <= y.n_; }
-  CUTE_HOST_DEVICE constexpr
-  friend bool operator> (counting_iterator const& x, counting_iterator const& y) { return x.n_ >  y.n_; }
-  CUTE_HOST_DEVICE constexpr
-  friend bool operator>=(counting_iterator const& x, counting_iterator const& y) { return x.n_ >= y.n_; }
-};
-
-template <class T>
-CUTE_HOST_DEVICE constexpr
-T
-raw_pointer_cast(counting_iterator<T> const& x) {
-  return x.n_;
-}
-
-//
-// Display utilities
-//
-
-template <class T>
-CUTE_HOST_DEVICE void print(T const* const ptr)
-{
-  printf("ptr["); print(sizeof_bits<T>::value); printf("b](%p)", ptr);
-}
-
-template <class T>
-CUTE_HOST_DEVICE void print(counting_iterator<T> ptr)
-{
-  printf("counting_iter("); print(ptr.n_); printf(")");
-}
-
-#if !defined(__CUDACC_RTC__)
-template <class T>
-CUTE_HOST std::ostream& operator<<(std::ostream& os, counting_iterator<T> ptr)
-{
-  return os << "counting_iter(" << ptr.n_ << ")";
-}
-#endif // !defined(__CUDACC_RTC__)
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/pointer_flagged.hpp b/lightllm-kernel/cutlass/include/cute/pointer_flagged.hpp
deleted file mode 100755
index eb8d7e452..000000000
--- a/lightllm-kernel/cutlass/include/cute/pointer_flagged.hpp
+++ /dev/null
@@ -1,199 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>                     // CUTE_HOST_DEVICE
-#include <cute/layout_composed.hpp>            // cute::ComposedLayout
-#include <cute/pointer.hpp>                    // cute::make_smem_ptr
-#include <cute/pointer_sparse.hpp>             // cute::is_sparse
-#include <cute/pointer_swizzle.hpp>            // cute::make_swizzle_ptr
-#include <cute/arch/util.hpp>                  // cute::cast_smem_ptr_to_uint
-#include <cute/numeric/integral_constant.hpp>  // cute::Int
-
-namespace cute
-{
-
-//
-// Stand-in Swizzle Layout
-//   A model of a nullptr smem_ptr<T> with B == sizeof_bits<T>::value
-//   That represents an unset pointer. This is a placeholder type that is waiting for an smem_ptr
-//
-
-template <int Bits>
-struct smem_ptr_flag_bits : Int<0> {};
-
-using smem_ptr_flag = smem_ptr_flag_bits<1>;
-
-// A flagged construction method to transform ComposedLayout
-// Make a swizzle pointer tensor and check that the intended type size matches
-template <class Iterator, class SwizzleFn, int B, class Layout>
-CUTE_HOST_DEVICE constexpr
-auto
-make_tensor(Iterator const& ptr,
-            ComposedLayout<SwizzleFn,smem_ptr_flag_bits<B>,Layout> const& layout)
-{
-  static_assert(is_smem<Iterator>::value, "Expected smem.");
-  static_assert(B == sizeof_bits<iter_value_t<Iterator>>::value, "Expected a B-bit pointer type.");
-  return make_tensor(make_smem_ptr(ptr.get(), layout.layout_a()),
-                     layout.layout_b());
-}
-
-// NOTE: To preserve smem_ptr_flag_bits under recast ops
-template <int N, class SwizzleFn, int B, class Layout>
-CUTE_HOST_DEVICE constexpr
-auto
-upcast(ComposedLayout<SwizzleFn,smem_ptr_flag_bits<B>,Layout> const& layout)
-{
-  return composition(layout.layout_a(), smem_ptr_flag_bits<B*N>{}, upcast<N>(layout.layout_b()));
-}
-
-template <int N, class SwizzleFn, int B, class Layout>
-CUTE_HOST_DEVICE constexpr
-auto
-downcast(ComposedLayout<SwizzleFn,smem_ptr_flag_bits<B>,Layout> const& layout)
-{
-  return composition(layout.layout_a(), smem_ptr_flag_bits<B/N>{}, downcast<N>(layout.layout_b()));
-}
-
-//
-// Conversion with swizzle_layout
-//
-
-template <class SwizzleFn, int B, class Layout>
-CUTE_HOST_DEVICE
-auto
-as_position_independent_swizzle_layout(ComposedLayout<SwizzleFn,smem_ptr_flag_bits<B>,Layout> const& layout)
-{
-  return composition(recast_layout<uint8_t,uint_bit_t<B>>(layout.layout_a()), Int<0>{}, layout.layout_b());
-}
-
-template <class Tensor>
-CUTE_HOST_DEVICE
-auto
-as_position_independent_swizzle_tensor(Tensor&& tensor)
-{
-  static_assert(is_smem<remove_cvref_t<Tensor>>::value, "Expected smem tensor.");
-  using SwizzleFn = get_swizzle_t<remove_cvref_t<Tensor>>;
-  if constexpr (SwizzleFn::num_bits == 0) {
-    return tensor;
-  } else {
-#if !defined(NDEBUG)
-    {
-    uint32_t address = cast_smem_ptr_to_uint(raw_pointer_cast(static_cast<Tensor&&>(tensor).data()));
-    uint32_t mask    = ((uint32_t(1) << SwizzleFn::num_base) - 1) | SwizzleFn::swizzle_code;
-    assert((address & mask) == 0);  // Alignment to the Base, Z, and Y of Swizzle
-    }
-#endif
-    using T = typename remove_cvref_t<Tensor>::value_type;
-    // Recast swizzle from acting on byte-addressed pointers to elements of type-T
-    auto new_swizzle = recast_layout<uint8_t, T>(SwizzleFn{});
-    // Strip off everything and create a new smem_ptr for type-T
-    auto new_ptr = make_smem_ptr<T>(raw_pointer_cast(static_cast<Tensor&&>(tensor).data()));
-    return make_tensor(new_ptr, composition(new_swizzle, Int<0>{}, tensor.layout()));
-  }
-  CUTE_GCC_UNREACHABLE;
-}
-
-// A model of a nullptr sparse_ptr<S, smem_ptr<T>> with B == sizeof_bits<T>::value
-// That represents an unset pointer. This is a placeholder type that is waiting for an smem_ptr
-template <int Sparsity, int Bits>
-struct smem_sparse_ptr_flag_bits : Int<0> {};
-
-template <int Sparsity>
-using smem_sparse_ptr_flag = smem_sparse_ptr_flag_bits<Sparsity, 1>;
-
-// A flagged construction method to transform ComposedLayout
-// Make a swizzle pointer tensor and check that the intended type size matches
-template <class Iterator, class SwizzleFn, int S, int B, class Layout>
-CUTE_HOST_DEVICE constexpr
-auto
-make_tensor(Iterator const& ptr,
-            ComposedLayout<SwizzleFn,smem_sparse_ptr_flag_bits<S,B>,Layout> const& layout)
-{
-  static_assert(is_smem<Iterator>::value, "Expected smem.");
-  static_assert(is_sparse_ptr<Iterator>::value, "Expected sparse iter");
-  static_assert(is_sparse<iter_value_t<Iterator>>::value, "Expected sparse elem");
-  static_assert(S == iter_value_t<Iterator>::sparsity, "Expected sparsity S");
-  static_assert(B == sizeof_bits<typename iter_value_t<Iterator>::raw_type>::value, "Expected B-bit pointer type");
-  return make_tensor(make_swizzle_ptr(ptr, layout.layout_a()), layout.layout_b());
-}
-
-// NOTE: To preserve smem_ptr_flag_bits under recast ops
-template <int N, class SwizzleFn, int S, int B, class Layout>
-CUTE_HOST_DEVICE constexpr
-auto
-upcast(ComposedLayout<SwizzleFn,smem_sparse_ptr_flag_bits<S,B>,Layout> const& layout)
-{
-  static_assert(dependent_false<SwizzleFn>, "Not implemented for safety");
-}
-
-template <int N, class SwizzleFn, int S, int B, class Layout>
-CUTE_HOST_DEVICE constexpr
-auto
-downcast(ComposedLayout<SwizzleFn,smem_sparse_ptr_flag_bits<S,B>,Layout> const& layout)
-{
-  static_assert(dependent_false<SwizzleFn>, "Not implemented for safety");
-}
-
-//
-// Display utilities
-//
-
-// Capture and cast smem_ptr_flag Layouts to offset-0 layouts
-template <class SwizzleFn, int B, class Layout>
-CUTE_HOST_DEVICE
-void
-print_layout(ComposedLayout<SwizzleFn,smem_ptr_flag_bits<B>,Layout> const& layout)
-{
-  print_layout(as_position_independent_swizzle_layout(layout));
-}
-
-template <class SwizzleFn, int B, class Layout>
-CUTE_HOST_DEVICE
-void
-print_latex(ComposedLayout<SwizzleFn,smem_ptr_flag_bits<B>,Layout> const& layout)
-{
-  print_latex(as_position_independent_swizzle_layout(layout));
-}
-
-template <int B>
-CUTE_HOST_DEVICE void print(smem_ptr_flag_bits<B> ptr)
-{
-  printf("smem_ptr[%db](unset)", B);
-}
-
-template <int S, int B>
-CUTE_HOST_DEVICE void print(smem_sparse_ptr_flag_bits<S,B>)
-{
-  printf("smem_sparse<%d>_ptr[%db](unset)", S, B);
-}
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/pointer_sparse.hpp b/lightllm-kernel/cutlass/include/cute/pointer_sparse.hpp
deleted file mode 100755
index ccae45865..000000000
--- a/lightllm-kernel/cutlass/include/cute/pointer_sparse.hpp
+++ /dev/null
@@ -1,172 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#pragma once
-
-#include <cute/config.hpp>                     // CUTE_HOST_DEVICE
-#include <cute/pointer_base.hpp>               // cute::iter_adaptor
-#include <cute/numeric/integral_constant.hpp>  // cute::false_type, cute::true_type
-#include <cute/numeric/integral_ratio.hpp>     // cute::ratio
-
-namespace cute
-{
-
-// A data type that holds one physical element meant to represent Sparsity number of logical elements
-// This class is purposely not compatible with anything -- know what you're doing if you attempt to use it
-template <int Sparsity, class T>
-struct sparse_elem
-{
-  static constexpr int sparsity = Sparsity;
-  using raw_type = T;
-  T elem_;
-
-  CUTE_HOST_DEVICE constexpr
-  explicit sparse_elem(T const& elem = {}) : elem_(elem) {}
-
-  CUTE_HOST_DEVICE constexpr friend bool operator==(sparse_elem const& a, sparse_elem const& b) { return a.elem_ == b.elem_; }
-  CUTE_HOST_DEVICE constexpr friend bool operator!=(sparse_elem const& a, sparse_elem const& b) { return a.elem_ != b.elem_; }
-  CUTE_HOST_DEVICE constexpr friend bool operator< (sparse_elem const& a, sparse_elem const& b) { return a.elem_ <  b.elem_; }
-  CUTE_HOST_DEVICE constexpr friend bool operator<=(sparse_elem const& a, sparse_elem const& b) { return a.elem_ <= b.elem_; }
-  CUTE_HOST_DEVICE constexpr friend bool operator> (sparse_elem const& a, sparse_elem const& b) { return a.elem_ >  b.elem_; }
-  CUTE_HOST_DEVICE constexpr friend bool operator>=(sparse_elem const& a, sparse_elem const& b) { return a.elem_ >= b.elem_; }
-};
-
-template <class T>
-struct is_sparse : false_type {};
-template <class T>
-struct is_sparse<T const> : is_sparse<T> {};
-template <int S, class T>
-struct is_sparse<sparse_elem<S,T>> : true_type {};
-template<class T>
-static constexpr auto is_sparse_v = is_sparse<T>::value;
-
-// Overload sizeof_bits for sparse_elem.
-//   Much like subbyte element types, this is the effective number of bits in a sparse_elem
-//   rather than actual physical bits that may be used in storing one. Also like subbyte element
-//   types, modified iterators are required to properly index and access sparse_elems.
-//
-//   Defining sizeof_bits like this makes reasonable expressions like N * sizeof_bits_v<E> meaningful
-//   even when E is subbyte or sparse. However, this also means that sparse_elem can rather easily be
-//   confused with subbyte elements and special care should be taken with each.
-template <int S, class T>
-struct sizeof_bits<sparse_elem<S,T>> {
-  // Simple implementation that conforms to sizeof_bits
-  //static constexpr auto value = sizeof_bits<T>::value / S;
-  //static_assert(value != 0, "sizeof_bits=0 detected. Sparsity is larger than width.");
-  //static_assert((sizeof_bits<T>::value % S) == 0, "Width needs to be a multiple of sparsity.")
-
-  // Interesting experiment that allows any sparsity level to be used by potentially presenting
-  // an integral_ratio rather than size_t. This is valid in most integer expressions as well.
-  static constexpr auto value = cute::ratio(cute::Int<cute::sizeof_bits_v<T>>{}, cute::Int<S>{});
-};
-
-//
-// sparse_ptr
-//
-
-template <class T, class = void>
-struct is_sparse_ptr : false_type {};
-template <class T>
-struct is_sparse_ptr<T, void_t<typename T::iterator>> : is_sparse_ptr<typename T::iterator> {};
-
-template <int Sparsity, class Iterator>
-struct sparse_ptr : iter_adaptor<Iterator, sparse_ptr<Sparsity, Iterator>>
-{
-  using reference    = typename iterator_traits<Iterator>::reference;
-  using element_type = typename iterator_traits<Iterator>::element_type;
-  using value_type   = typename iterator_traits<Iterator>::value_type;
-
-  // Sanity, for now
-  static_assert(is_sparse<value_type>::value, "Enforce sparse value-type");
-  static_assert(Sparsity == iter_value_t<Iterator>::sparsity, "Enforce sparsity S");
-  static_assert(not is_sparse_ptr<Iterator>::value, "Enforce sparse singleton");
-
-  template <class Index>
-  CUTE_HOST_DEVICE constexpr
-  sparse_ptr operator+(Index const& i) const {
-    // Only allow offset by multiples of the sparsity factor,
-    // else the misalignments become a bug. E.g. (sparse_ptr<8,I>{} + 7) + 7
-    // Motivation for subsparse_iterator or generalization of subbyte_iterator?
-    assert(i % Sparsity == 0);
-    return {this->get() + i / Sparsity};
-  }
-
-  template <class Index>
-  CUTE_HOST_DEVICE constexpr
-  reference operator[](Index const& i) const {
-    // Allow offset by any value and dereference.
-    // Not implemented in terms of sparse_ptr::op+()
-    return *(this->get() + i / Sparsity);
-  }
-};
-
-template <int S, class I>
-struct is_sparse_ptr<sparse_ptr<S,I>> : true_type {};
-
-template <int Sparsity, class Iter>
-CUTE_HOST_DEVICE constexpr
-auto
-make_sparse_ptr(Iter const& iter) {
-  if constexpr (Sparsity == 1) {
-    return iter;
-  } else {
-    return sparse_ptr<Sparsity, Iter>{iter};
-  }
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <class NewT, int S, class Iter>
-CUTE_HOST_DEVICE constexpr
-auto
-recast_ptr(sparse_ptr<S,Iter> const& ptr) {
-  static_assert(not is_sparse<NewT>::value);
-  return recast_ptr<NewT>(ptr.get());
-}
-
-//
-// Display utilities
-//
-
-template <int S, class Iter>
-CUTE_HOST_DEVICE void print(sparse_ptr<S,Iter> ptr)
-{
-  printf("sparse<%d>_", S); print(ptr.get());
-}
-
-#if !defined(__CUDACC_RTC__)
-template <int S, class Iter>
-CUTE_HOST std::ostream& operator<<(std::ostream& os, sparse_ptr<S,Iter> ptr)
-{
-  return os << "sparse<" << S << ">_" << ptr.get();
-}
-#endif
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/pointer_swizzle.hpp b/lightllm-kernel/cutlass/include/cute/pointer_swizzle.hpp
deleted file mode 100755
index 720b9b124..000000000
--- a/lightllm-kernel/cutlass/include/cute/pointer_swizzle.hpp
+++ /dev/null
@@ -1,168 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>                   // CUTE_HOST_DEVICE
-#include <cute/pointer_base.hpp>             // cute::iter_adaptor
-#include <cute/swizzle.hpp>                  // cute::Swizzle, cute::get_swizzle primary template
-#include <cute/util/type_traits.hpp>         // cute::iterator_traits
-#include <cute/container/array_subbyte.hpp>  // cute::subbyte_iterator
-
-/* This implements a swizzle pointer of the form
- *   InvolutionFn o PtrAdd
- * where the InvolutionFn need not be linear.
- *
- * This differs subtly from swizzle_layout because the smem pointer is used
- * as the offset. That means that swizzle_layout will implement position-independent
- * swizzle layouts, while swizzle_ptr implements position-dependent swizzle tensors.
- * Arch chose to design hardware with position-dependent swizzles.
- *
- * For clarity:
- *   NormalLayout  : DeRef <- PtrAdd <- [Layout]
- *   ComposedLayout: DeRef <- PtrAdd <- [Swizzle <- OffsetAdd <- Layout]
- *   SwizzlePtr    : [DeRef <- Swizzle <- PtrAdd] <- Layout
- *
- * Furthermore, for known swizzles, this pointer attempts to decay itself
- *    to a normal-pointer with a new layout containing dynamic or static strides.
- * This is possible by determining the subdomain of the InvolutionFn
- *    that is identity and testing if the Layout's codomain is contained
- *    within it.
- */
-
-namespace cute
-{
-
-// concept SwizzleFn {
-//   CUTE_HOST_DEVICE constexpr static uint apply(uint);
-// }
-// See Swizzle<B,M,S> in swizzle.hpp for common swizzle-functions.
-
-template <class SwizzleFn, class Iterator>
-struct swizzle_ptr : iter_adaptor<Iterator,swizzle_ptr<SwizzleFn,Iterator>>
-{
-  using iterator     = Iterator;
-  using reference    = typename iterator_traits<iterator>::reference;
-  using element_type = typename iterator_traits<iterator>::element_type;
-  using value_type   = typename iterator_traits<iterator>::value_type;
-
-  using iter_adaptor<Iterator,swizzle_ptr<SwizzleFn,Iterator>>::iter_adaptor;
-
-  template <class Iter>
-  CUTE_HOST_DEVICE constexpr static
-  Iter apply_swizzle(Iter ptr) {
-    return {apply_swizzle(ptr.get())};
-  }
-
-  template <class T>
-  CUTE_HOST_DEVICE constexpr static
-  T* apply_swizzle(T* ptr) {
-    return reinterpret_cast<T*>(SwizzleFn::apply(reinterpret_cast<uintptr_t>(ptr)));
-  }
-
-  template <class T>
-  CUTE_HOST_DEVICE constexpr static
-  subbyte_iterator<T> apply_swizzle(subbyte_iterator<T> ptr) {
-    return {apply_swizzle(ptr.ptr_), ptr.idx_};
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  reference operator*() const {
-    return *apply_swizzle(this->get());
-  }
-
-  template <class Int>
-  CUTE_HOST_DEVICE constexpr
-  reference operator[](Int const& i) const {
-    return *apply_swizzle(this->get() + i);
-  }
-};
-
-//
-// Helper Function
-//
-template <class SwizzleFn, class P>                   // Found the SwizzleFn
-struct get_swizzle<swizzle_ptr<SwizzleFn,P>> { using type = SwizzleFn; };
-template <class T>                                    // Recurse into anything with a ::iterator
-struct get_swizzle<T, void_t<typename T::iterator>> : get_swizzle<typename T::iterator> {};
-
-template <class Iterator, class SwizzleFn>
-CUTE_HOST_DEVICE constexpr
-swizzle_ptr<SwizzleFn,Iterator>
-make_swizzle_ptr(Iterator ptr, SwizzleFn) {
-  return {ptr};
-}
-
-// Swizzle-0 specialization for immediate decay
-template <class Iterator, int M, int S>
-CUTE_HOST_DEVICE constexpr
-Iterator
-make_swizzle_ptr(Iterator ptr, Swizzle<0,M,S>) {
-  return ptr;
-}
-
-//
-// Recast
-//
-
-template <class SwizzleFn, class P>
-CUTE_HOST_DEVICE constexpr
-auto
-raw_pointer_cast(swizzle_ptr<SwizzleFn,P> const& ptr) {
-  return raw_pointer_cast(ptr.get());
-}
-
-// SwizzleFn operates on the pointer address, so it doesn't care about the type
-template <class NewT, class SwizzleFn, class P>
-CUTE_HOST_DEVICE constexpr
-auto
-recast_ptr(swizzle_ptr<SwizzleFn,P> const& ptr) {
-  return make_swizzle_ptr(recast_ptr<NewT>(ptr.get()), SwizzleFn{});
-}
-
-//
-// Display utilities
-//
-
-template <class SwizzleFn, class P>
-CUTE_HOST_DEVICE void print(swizzle_ptr<SwizzleFn,P> ptr)
-{
-  print(SwizzleFn{}); printf("_"); print(ptr.get());
-}
-
-#if !defined(__CUDACC_RTC__)
-template <class SwizzleFn, class P>
-CUTE_HOST std::ostream& operator<<(std::ostream& os, swizzle_ptr<SwizzleFn,P> ptr)
-{
-  return os << SwizzleFn{} << "_" << ptr.get();
-}
-#endif
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/stride.hpp b/lightllm-kernel/cutlass/include/cute/stride.hpp
deleted file mode 100755
index f2d31f4e3..000000000
--- a/lightllm-kernel/cutlass/include/cute/stride.hpp
+++ /dev/null
@@ -1,598 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>                     // CUTE_HOST_DEVICE
-#include <cute/util/type_traits.hpp>           // cute::__CUTE_REQUIRES
-#include <cute/container/tuple.hpp>            // cute::is_tuple
-#include <cute/numeric/integral_constant.hpp>  // cute::is_integral
-#include <cute/numeric/integer_sequence.hpp>   // cute::seq
-#include <cute/numeric/math.hpp>               // cute::divmod
-#include <cute/numeric/arithmetic_tuple.hpp>   // cute::basis_get
-#include <cute/algorithm/functional.hpp>       // cute::identity
-#include <cute/algorithm/tuple_algorithms.hpp> // cute::fold
-#include <cute/int_tuple.hpp>                  // cute::is_congruent
-
-namespace cute
-{
-
-/** crd2idx(c,s,d) maps a coordinate within <Shape,Stride> to an index
- *
- * This is computed as follows:
- *  [coord, shape, and stride are all integers => step forward by stride]
- * op(c, s, d)             => c * d
- *  [coord is integer, shape and stride are tuple => divmod coord for each mode]
- * op(c, (s,S), (d,D))     => op(c % prod(s), s, d) + op(c / prod(s), (S), (D))
- *  [coord, shape, and stride are all tuples => consider each mode independently]
- * op((c,C), (s,S), (d,D)) => op(c, s, d) + op((C), (S), (D))
- */
-template <class Coord, class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-crd2idx(Coord  const& coord,
-        Shape  const& shape,
-        Stride const& stride);
-
-namespace detail {
-
-template <class Coord, class Shape, class Stride, int... Is>
-CUTE_HOST_DEVICE constexpr
-auto
-crd2idx_ttt(Coord  const& coord,
-            Shape  const& shape,
-            Stride const& stride, seq<Is...>)
-{
-  return (... + crd2idx(get<Is>(coord), get<Is>(shape), get<Is>(stride)));
-}
-
-template <class CInt, class STuple, class DTuple, int I0, int... Is>
-CUTE_HOST_DEVICE constexpr
-auto
-crd2idx_itt(CInt   const& coord,
-            STuple const& shape,
-            DTuple const& stride, seq<I0,Is...>)
-{
-  if constexpr (sizeof...(Is) == 0) {  // Avoid recursion and mod on single/last iter
-    return crd2idx(coord, get<I0>(shape), get<I0>(stride));
-  } else if constexpr (is_constant<0, CInt>::value) {
-    return crd2idx(_0{}, get<I0>(shape), get<I0>(stride))
-         + (_0{} + ... + crd2idx(_0{}, get<Is>(shape), get<Is>(stride)));
-  } else {                             // General case
-    auto [div, mod] = divmod(coord, product(get<I0>(shape)));
-    return crd2idx(mod, get<I0>(shape), get<I0>(stride))
-         + crd2idx_itt(div, shape, stride, seq<Is...>{});
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-} // end namespace detail
-
-template <class Coord, class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-crd2idx(Coord  const& coord,
-        Shape  const& shape,
-        Stride const& stride)
-{
-  if constexpr (is_tuple<Coord>::value) {
-    if constexpr (is_tuple<Shape>::value) {      // tuple tuple tuple
-      static_assert(tuple_size<Coord>::value == tuple_size< Shape>::value, "Mismatched Ranks");
-      static_assert(tuple_size<Coord>::value == tuple_size<Stride>::value, "Mismatched Ranks");
-      return detail::crd2idx_ttt(coord, shape, stride, tuple_seq<Coord>{});
-    } else {                                     // tuple "int" "int"
-      static_assert(sizeof(Coord) == 0, "Invalid parameters");
-    }
-  } else {
-    if constexpr (is_tuple<Shape>::value) {      // "int" tuple tuple
-      static_assert(tuple_size<Shape>::value == tuple_size<Stride>::value, "Mismatched Ranks");
-      return detail::crd2idx_itt(coord, shape, stride, tuple_seq<Shape>{});
-    } else {                                     // "int" "int" "int"
-      return coord * stride;
-    }
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-namespace detail {
-
-template <class CTuple, class STuple, int I0, int... Is>
-CUTE_HOST_DEVICE constexpr
-auto
-crd2idx_horner(CTuple const& coord,
-               STuple const& shape, seq<I0,Is...>)
-{
-  if constexpr (sizeof...(Is) == 0) {  // No recursion on single/last iter
-    return get<I0>(coord);
-  } else {                             // General case
-    return get<I0>(coord) + get<I0>(shape) * crd2idx_horner(coord, shape, seq<Is...>{});
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-} // end namespace detail
-
-/** crd2idx(c,s) maps a coordinate within Shape to an index
- * via a colexicographical enumeration of coordinates in Shape.
- * i = c0 + s0 * (c1 + s1 * (c2 + s2 * ...))
- */
-template <class Coord, class Shape>
-CUTE_HOST_DEVICE constexpr
-auto
-crd2idx(Coord const& coord,
-        Shape const& shape)
-{
-  if constexpr (is_integral<Coord>::value) {  // Coord is already an index
-    return coord;
-  } else if constexpr (is_integral<Shape>::value) {
-    static_assert(dependent_false<Shape>, "Invalid parameters");
-  } else {                                    // Make congruent, flatten, and apply Horner's method
-    static_assert(tuple_size<Coord>::value == tuple_size<Shape>::value, "Mismatched Ranks");
-    auto flat_coord = flatten(coord);
-    auto flat_shape = flatten(product_like(shape, coord));
-    return detail::crd2idx_horner(flat_coord, flat_shape, tuple_seq<decltype(flat_shape)>{});
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-/** idx2crd(i,s,d) splits an index into a coordinate within <Shape,Stride>.
- *
- * This is computed as follows:
- *  [index, shape, and stride are all integers => determine 1D coord]
- * op(i, s, d)             => (i / d) % s
- *  [index is integer, shape and stride are tuple => determine component for each mode]
- * op(i, (s,S), (d,D))     => (op(i, s, d), op(i, S, D)...)
- *  [index, shape, and stride are all tuples => consider each mode independently]
- * op((i,I), (s,S), (d,D)) => (op(i, s, d), op((I), (S), (D)))
- *
- * NOTE: This only works for compact shape+stride layouts. A more general version would
- *       apply to all surjective layouts
- */
-template <class Index, class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-idx2crd(Index  const& idx,
-        Shape  const& shape,
-        Stride const& stride)
-{
-  if constexpr (is_tuple<Index>::value) {
-    if constexpr (is_tuple<Shape>::value) {      // tuple tuple tuple
-      static_assert(tuple_size<Index>::value == tuple_size< Shape>::value, "Mismatched Ranks");
-      static_assert(tuple_size<Index>::value == tuple_size<Stride>::value, "Mismatched Ranks");
-      return transform(idx, shape, stride, [](auto const& i, auto const& s, auto const& d){ return idx2crd(i,s,d); });
-    } else {                                     // tuple "int" "int"
-      static_assert(sizeof(Index) == 0, "Invalid parameters");
-    }
-  } else {
-    if constexpr (is_tuple<Shape>::value) {
-      if constexpr (is_tuple<Stride>::value) {   // "int" tuple tuple
-        static_assert(tuple_size<Shape>::value == tuple_size<Stride>::value, "Mismatched Ranks");
-        return transform(shape, stride, [&](auto const& s, auto const& d){ return idx2crd(idx,s,d); });
-      } else {                                   // "int" tuple "int"
-        return transform(shape, compact_col_major(shape, stride), [&](auto const& s, auto const& d){ return idx2crd(idx,s,d); });
-      }
-    } else {                                     // "int" "int" "int"
-      if constexpr (is_constant<1, Shape>::value) {
-        // Skip potential stride-0 division
-        return Int<0>{};
-      } else {
-        return (idx / stride) % shape;
-      }
-    }
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-/** idx2crd(i,s) splits an index into a coordinate within Shape
- * via a colexicographical enumeration of coordinates in Shape.
- * c0 = (idx / 1) % s0
- * c1 = (idx / s0) % s1
- * c2 = (idx / (s0 * s1)) % s2
- * ...
- */
-template <class Index, class Shape>
-CUTE_HOST_DEVICE constexpr
-auto
-idx2crd(Index const& idx,
-        Shape const& shape)
-{
-  if constexpr (is_tuple<Index>::value) {
-    if constexpr (is_tuple<Shape>::value) {      // tuple tuple
-      static_assert(tuple_size<Index>::value == tuple_size<Shape>::value, "Mismatched Ranks");
-      return transform(idx, shape, [](auto const& i, auto const& s) { return idx2crd(i,s); });
-    } else {                                     // tuple "int"
-      static_assert(sizeof(Index) == 0, "Invalid parameters");
-    }
-  } else {
-    if constexpr (is_tuple<Shape>::value) {      // "int" tuple
-      return transform_leaf(as_arithmetic_tuple(crd2idx(idx, shape, make_basis_like(shape))), identity{});
-    } else {                                     // "int" "int"
-      return idx;
-    }
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-//
-// crd2crd
-//
-
-template <class Coord, class SShape, class DShape>
-CUTE_HOST_DEVICE constexpr
-auto
-crd2crd(Coord  const& coord,
-        SShape const& src_shape,
-        DShape const& dst_shape)
-{
-  if constexpr (is_tuple<Coord>::value && is_tuple<SShape>::value && is_tuple<DShape>::value) {
-    static_assert(tuple_size<Coord>::value == tuple_size<SShape>::value, "Mismatched Ranks");
-    static_assert(tuple_size<Coord>::value == tuple_size<DShape>::value, "Mismatched Ranks");
-    return transform(coord, src_shape, dst_shape, [](auto const& c, auto const& s, auto const& d) { return crd2crd(c,s,d); });
-  } else {
-    // assert(size(src_shape) == size(dst_shape))
-    return idx2crd(crd2idx(coord, src_shape), dst_shape);
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-//
-// Compact Major
-//
-
-// Tags for common layouts and dispatching
-struct LayoutLeft;               // Col-major layout mapping; leftmost extent has stride 1
-using GenColMajor = LayoutLeft;  // Alias
-
-struct LayoutRight;              // Row-major layout mapping; rightmost extent has stride 1
-using GenRowMajor = LayoutRight; // Alias
-
-namespace detail {
-
-// For GCC8.5 -- Use of lambdas in unevaluated contexts. Instead use function objects.
-template <class Major>
-struct CompactLambda;
-
-// @pre is_integral<Current>
-// Return (result, current * product(shape)) to enable recurrence
-template <class Major, class Shape, class Current>
-CUTE_HOST_DEVICE constexpr
-auto
-compact(Shape   const& shape,
-        Current const& current)
-{
-  if constexpr (is_tuple<Shape>::value) { // Shape::tuple Current::int
-    using Lambda = CompactLambda<Major>;                  // Append or Prepend
-    using Seq    = typename Lambda::template seq<Shape>;  // Seq or RSeq
-    return cute::detail::fold(shape, cute::make_tuple(cute::make_tuple(), current), Lambda{}, Seq{});
-  } else {                                // Shape::int Current::int
-    if constexpr (is_constant<1, Shape>::value) {
-      return cute::make_tuple(Int<0>{}, current); // If current is dynamic, this could save a reg
-    } else {
-      return cute::make_tuple(current, current * shape);
-    }
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-// For GCC8.5 -- Specialization LayoutLeft
-template <>
-struct CompactLambda<LayoutLeft>
-{
-  template <class Init, class Shape>
-  CUTE_HOST_DEVICE constexpr auto
-  operator()(Init const& init, Shape const& si) {
-    auto result = detail::compact<LayoutLeft>(si, get<1>(init));
-    return cute::make_tuple(append(get<0>(init), get<0>(result)), get<1>(result));  // Append
-  }
-
-  template <class Shape>
-  using seq = tuple_seq<Shape>;                                                     // Seq
-};
-
-// For GCC8.5 -- Specialization LayoutRight
-template <>
-struct CompactLambda<LayoutRight>
-{
-  template <class Init, class Shape>
-  CUTE_HOST_DEVICE constexpr auto
-  operator()(Init const& init, Shape const& si) {
-    auto result = detail::compact<LayoutRight>(si, get<1>(init));
-    return cute::make_tuple(prepend(get<0>(init), get<0>(result)), get<1>(result));  // Prepend
-  }
-
-  template <class Shape>
-  using seq = tuple_rseq<Shape>;                                                     // RSeq
-};
-
-} // end namespace detail
-
-template <class Major, class Shape, class Current = Int<1>,
-          __CUTE_REQUIRES(is_tuple<Shape>::value || is_integral<Shape>::value)>
-CUTE_HOST_DEVICE constexpr
-auto
-compact_major(Shape   const& shape,
-              Current const& current = {})
-{
-  if constexpr (is_tuple<Current>::value) {    // Shape::tuple Current::tuple
-    static_assert(is_tuple<Shape>::value, "Invalid parameters");
-    static_assert(tuple_size<Shape>::value == tuple_size<Current>::value, "Mismatched Ranks");
-    // Recurse to apply to the terminals of current
-    return transform(shape, current, [&](auto const& s, auto const& c){ return compact_major<Major>(s,c); });
-  } else {
-    return get<0>(detail::compact<Major>(shape, current));
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-//
-// Compact Col Major
-//
-
-struct LayoutLeft {
-  template <class Shape>
-  using Apply = decltype(compact_major<LayoutLeft>(declval<Shape>()));
-};
-
-template <class Shape, class Current = Int<1>>
-CUTE_HOST_DEVICE constexpr
-auto
-compact_col_major(Shape   const& shape,
-                  Current const& current = {})
-{
-  return compact_major<LayoutLeft>(shape, current);
-}
-
-//
-// Compact Row Major
-//
-
-struct LayoutRight {
-  template <class Shape>
-  using Apply = decltype(compact_major<LayoutRight>(declval<Shape>()));
-};
-
-template <class Shape, class Current = Int<1>>
-CUTE_HOST_DEVICE constexpr
-auto
-compact_row_major(Shape   const& shape,
-                  Current const& current = {})
-{
-  return compact_major<LayoutRight>(shape, current);
-}
-
-//
-// Compact Order -- compute a compact stride based on an ordering of the modes
-//
-
-namespace detail {
-
-// @pre weakly_congruent(order, shape)
-// @pre is_congruent<RefShape, RefOrder>
-// @pre is_static<Order>
-// @pre is_static<RefOrder>
-template <class Shape, class Order, class RefShape, class RefOrder>
-CUTE_HOST_DEVICE constexpr
-auto
-compact_order(Shape const& shape, Order const& order,
-              RefShape const& ref_shape, RefOrder const& ref_order)
-{
-  if constexpr (is_tuple<Order>::value) {
-    static_assert(tuple_size<Shape>::value == tuple_size<Order>::value, "Need equal rank of shape and order");
-    return transform(shape, order, [&](auto const& s, auto const& o) { return compact_order(s, o, ref_shape, ref_order); });
-  } else {
-    // Compute the starting stride for this shape by accumulating all shapes corresponding to lesser orders
-    auto stride_start = product(transform(ref_shape, ref_order,
-                                          [&](auto const& s, auto const& o) {
-                                            return conditional_return(o < order, s, Int<1>{});
-                                          }));
-    return compact_col_major(shape, stride_start);
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-} // end namespace detail
-
-template <class Shape, class Order>
-CUTE_HOST_DEVICE constexpr
-auto
-compact_order(Shape const& shape, Order const& order)
-{
-  auto ref_shape = flatten_to_tuple(product_like(shape, order));
-
-  auto flat_order = flatten_to_tuple(order);
-  // Find the largest static element of order
-  auto max_order = cute::fold(flat_order, Int<0>{}, [](auto v, auto order) {
-    if constexpr (is_constant<true, decltype(v < order)>::value) {
-      return order;
-    } else {
-      return v;
-    }
-
-    CUTE_GCC_UNREACHABLE;
-  });
-  // Replace any dynamic elements within order with large-static elements
-  auto max_seq = make_range<max_order+1, max_order+1+rank(flat_order)>{};
-  auto ref_order = cute::transform(max_seq, flat_order, [](auto seq_v, auto order) {
-    if constexpr (is_static<decltype(order)>::value) {
-      return order;
-    } else {
-      return seq_v;
-    }
-
-    CUTE_GCC_UNREACHABLE;
-  });
-
-  auto new_order = unflatten(ref_order, order);
-
-  return detail::compact_order(shape, new_order, ref_shape, ref_order);
-}
-
-template <class Shape>
-CUTE_HOST_DEVICE constexpr
-auto
-compact_order(Shape const& shape, GenColMajor const& major)
-{
-  return compact_major<LayoutLeft>(shape);
-}
-
-template <class Shape>
-CUTE_HOST_DEVICE constexpr
-auto
-compact_order(Shape const& shape, GenRowMajor const& major)
-{
-  return compact_major<LayoutRight>(shape);
-}
-
-//
-// Coordinate iterator
-//
-
-namespace detail {
-
-template <class Coord, class Shape, class Order>
-CUTE_HOST_DEVICE constexpr
-void
-increment(Coord& coord, Shape const& shape, Order const& order)
-{
-  ++basis_get(get<0>(order), coord);
-  cute::for_each(make_range<1, tuple_size<Order>::value>{}, [&](auto i){
-    if (basis_get(get<i-1>(order), coord) == basis_get(get<i-1>(order), shape)) {
-      basis_get(get<i-1>(order), coord) = 0;
-      ++basis_get(get<i>(order), coord);
-    }
-  });
-}
-
-/** Increment a (dynamic) coord colexicographically within a shape
- * @pre is_congruent<Coord,Shape>::value
- * \code
- *   auto shape = make_shape(1,2,make_shape(2,3),3);
- *   auto coord = repeat_like(shape, 0);
- *
- *   for (int i = 0; i < size(shape); ++i) {
- *     std::cout << i << ": " << coord << std::endl;
- *     increment(coord, shape);
- *   }
- * \endcode
- */
-template <class Coord, class Shape>
-CUTE_HOST_DEVICE constexpr
-void
-increment(Coord& coord, Shape const& shape)
-{
-  increment(coord, shape, flatten_to_tuple(make_basis_like(shape)));
-}
-
-} // end namespace detail
-
-struct ForwardCoordIteratorSentinel
-{};
-
-// A forward iterator for a starting coordinate in a shape's domain, and a shape.
-// The starting coordinate may be zero but need not necessarily be.
-template <class Coord, class Shape, class Order>
-struct ForwardCoordIterator
-{
-  static_assert(is_congruent<Coord, Shape>::value);
-
-  CUTE_HOST_DEVICE constexpr
-  Coord const& operator*() const { return coord; }
-  CUTE_HOST_DEVICE constexpr
-  ForwardCoordIterator& operator++() { detail::increment(coord, shape, Order{}); return *this; }
-  // Sentinel for the end of the implied range
-  CUTE_HOST_DEVICE constexpr
-  bool operator==(ForwardCoordIteratorSentinel const&) const { return basis_get(back(Order{}), coord) == basis_get(back(Order{}), shape); }
-  CUTE_HOST_DEVICE constexpr
-  bool operator!=(ForwardCoordIteratorSentinel const&) const { return basis_get(back(Order{}), coord) != basis_get(back(Order{}), shape); }
-  // NOTE: These are expensive, avoid use
-  CUTE_HOST_DEVICE constexpr
-  bool operator==(ForwardCoordIterator const& other) const { return coord == other.coord; }
-  CUTE_HOST_DEVICE constexpr
-  bool operator!=(ForwardCoordIterator const& other) const { return coord != other.coord; }
-
-  Coord coord;
-  Shape const& shape;
-};
-
-// A forward iterator for a coordinate that starts from a provided coordinate and increments in a prescribed order
-template <class Order, class Shape, class Coord>
-CUTE_HOST_DEVICE constexpr
-auto
-make_coord_iterator(Coord const& coord, Shape const& shape)
-{
-  static_assert(is_congruent<Coord, Shape>::value);
-  static_assert(is_congruent<Order, Coord>::value);
-  static_assert(is_congruent<Order, Shape>::value);
-  auto flat_order  = flatten_to_tuple(Order{});
-  auto inv_order   = transform(make_seq<rank(flat_order)>{}, [&](auto i){ return find(flat_order, i); });
-  auto basis_order = transform_leaf(inv_order, [&](auto i) { return get<i>(flatten_to_tuple(make_basis_like(shape))); });
-  return ForwardCoordIterator<Coord,Shape,decltype(basis_order)>{coord,shape};
-}
-
-// A forward iterator for a coordinate that starts from a provided coordinate and increments colex
-template <class Shape, class Coord>
-CUTE_HOST_DEVICE constexpr
-auto
-make_coord_iterator(Coord const& coord, Shape const& shape)
-{
-  static_assert(is_congruent<Coord, Shape>::value);
-  auto basis_order = flatten_to_tuple(make_basis_like(shape));
-  return ForwardCoordIterator<Coord,Shape,decltype(basis_order)>{coord,shape};
-}
-
-// A forward iterator for a coordinate that starts from zero and increments in a prescribed order
-template <class Order, class Shape>
-CUTE_HOST_DEVICE constexpr
-auto
-make_coord_iterator(Shape const& shape)
-{
-  return make_coord_iterator<Order>(repeat_like(shape, int(0)), shape);
-}
-
-// A forward iterator for a coordinate that starts from zero and increments colex
-template <class Shape>
-CUTE_HOST_DEVICE constexpr
-auto
-make_coord_iterator(Shape const& shape)
-{
-  return make_coord_iterator(repeat_like(shape, int(0)), shape);
-}
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/swizzle.hpp b/lightllm-kernel/cutlass/include/cute/swizzle.hpp
deleted file mode 100755
index 52abf856d..000000000
--- a/lightllm-kernel/cutlass/include/cute/swizzle.hpp
+++ /dev/null
@@ -1,498 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>                      // CUTE_HOST_DEVICE
-#include <cute/container/tuple.hpp>             // cute::is_tuple
-#include <cute/numeric/integral_constant.hpp>   // cute::constant
-#include <cute/numeric/math.hpp>                // cute::max, cute::min
-#include <cute/algorithm/tuple_algorithms.hpp>  // cute::transform_apply
-
-namespace cute
-{
-
-// A generic Swizzle functor
-/* 0bxxxxxxxxxxxxxxxYYYxxxxxxxZZZxxxx
- *                               ^--^ MBase is the number of least-sig bits to keep constant
- *                  ^-^       ^-^     BBits is the number of bits in the mask
- *                    ^---------^     SShift is the distance to shift the YYY mask
- *                                       (pos shifts YYY to the right, neg shifts YYY to the left)
- *
- * e.g. Given
- * 0bxxxxxxxxxxxxxxxxYYxxxxxxxxxZZxxx
- * the result is
- * 0bxxxxxxxxxxxxxxxxYYxxxxxxxxxAAxxx where AA = ZZ xor YY
- */
-template <int BBits, int MBase, int SShift = BBits>
-struct Swizzle
-{
-  static constexpr int num_bits = BBits;
-  static constexpr int num_base = MBase;
-  static constexpr int num_shft = SShift;
-
-  static_assert(num_base >= 0,             "MBase must be positive.");
-  static_assert(num_bits >= 0,             "BBits must be positive.");
-  static_assert(abs(num_shft) >= num_bits, "abs(SShift) must be more than BBits.");
-
-  // using 'int' type here to avoid unintentially casting to unsigned... unsure.
-  using bit_msk = cute::constant<int, (1 << num_bits) - 1>;
-  using yyy_msk = cute::constant<int, bit_msk{} << (num_base + max(0,num_shft))>;
-  using zzz_msk = cute::constant<int, bit_msk{} << (num_base - min(0,num_shft))>;
-  using msk_sft = cute::constant<int, num_shft>;
-
-  static constexpr uint32_t swizzle_code = uint32_t(yyy_msk{} | zzz_msk{});
-
-  template <class Offset>
-  CUTE_HOST_DEVICE constexpr static
-  auto
-  apply(Offset const& offset)
-  {
-    return offset ^ shiftr(offset & yyy_msk{}, msk_sft{});   // ZZZ ^= YYY
-  }
-
-  template <class Offset>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  operator()(Offset const& offset) const
-  {
-    return apply(offset);
-  }
-
-  template <int B, int M, int S>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  operator==(Swizzle<B,M,S> const&) const
-  {
-    return B == BBits && M == MBase && S == SShift;
-  }
-};
-
-//
-// make_swizzle<0b1000, 0b0100>()         ->  Swizzle<1,2,1>
-// make_swizzle<0b11000000, 0b00000110>() ->  Swizzle<2,1,5>
-//
-
-template <uint32_t Y, uint32_t Z>
-CUTE_HOST_DEVICE constexpr
-auto
-make_swizzle()
-{
-  constexpr uint32_t BZ = popcount(Y);                    // Number of swizzle bits
-  constexpr uint32_t BY = popcount(Z);                    // Number of swizzle bits
-  static_assert(BZ == BY, "Number of bits in Y and Z don't match");
-  constexpr uint32_t TZ_Y = countr_zero(Y);               // Number of trailing zeros in Y
-  constexpr uint32_t TZ_Z = countr_zero(Z);               // Number of trailing zeros in Z
-  constexpr uint32_t M = cute::min(TZ_Y, TZ_Z) % 32;
-  constexpr  int32_t S = int32_t(TZ_Y) - int32_t(TZ_Z);   // Difference in trailing zeros
-  static_assert((Y | Z) == Swizzle<BZ,M,S>::swizzle_code, "Something went wrong.");
-  return Swizzle<BZ,M,S>{};
-}
-
-template <int B0, int M0, int S0,
-          int B1, int M1, int S1>
-CUTE_HOST_DEVICE constexpr
-auto
-composition(Swizzle<B0,M0,S0>, Swizzle<B1,M1,S1>)
-{
-  static_assert(S0 == S1, "Can only merge swizzles of the same shift.");
-  constexpr uint32_t Y = Swizzle<B0,M0,S0>::yyy_msk::value ^ Swizzle<B1,M1,S1>::yyy_msk::value;
-  constexpr uint32_t Z = Swizzle<B0,M0,S0>::zzz_msk::value ^ Swizzle<B1,M1,S1>::zzz_msk::value;
-  return make_swizzle<Y,Z>();
-
-  //return ComposedFn<Swizzle<B0,M0,S0>, Swizzle<B1,M1,S1>>{};
-}
-
-//
-// Utility for slicing and swizzle "offsets"
-//
-
-// For swizzle functions, it is often needed to keep track of which bits are
-//   consumed and which bits are free. Furthermore, it is useful to know whether
-// each of these bits is known statically or dynamically.
-
-// MixedBits is an 32-bit unsigned integer class where some bits are known statically
-//   and some bits are known dynamically. These sets of bits are disjoint and it is
-//   known statically which bits are known dynamically.
-
-// MixedBits can only be manipulated through bitwise operations
-
-// Abstract value:  StaticInt | (dynamic_int_ & StaticFlags)
-template <uint32_t StaticInt,
-          uint32_t StaticFlags>    // 0: static, 1: dynamic
-struct MixedBits
-{
-  // Representation invariants
-  static_assert(StaticFlags != 0, "Should be at least one dynamic bit in MixedBits.");
-  static_assert((StaticInt & StaticFlags) == 0, "No static/dynamic overlap allowed in MixedBits.");
-
-  uint32_t dynamic_int_;
-  // assert((dynamic_int_ & ~StaticFlags) == 0);
-
-  CUTE_HOST_DEVICE constexpr operator uint32_t() const noexcept { return StaticInt | dynamic_int_; }
-};
-
-// Return a value representing (C<s>{} | (d & C<f>)) potentially using MixedBits to track s and f.
-// This maker does allow ((s & f) != 0) and enforces the MixedBits invariant before creation.
-template <auto s, class DynamicType, auto f>
-CUTE_HOST_DEVICE constexpr
-auto
-make_mixed_bits(C<s>, DynamicType const& d, C<f>)
-{
-  static_assert(is_integral<DynamicType>::value);
-  constexpr uint32_t new_f = uint32_t(f) & ~uint32_t(s);        // StaticBits take precedence, M<0,f>{d} | C<s>{}
-  if constexpr (new_f == 0 || is_static<DynamicType>::value) {
-    return C<s>{} | (d & C<new_f>{});                           // Just return a static int
-  } else {
-    return MixedBits<s, new_f>{uint32_t(d) & new_f};            // MixedBits
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-//
-// Operators
-//
-
-// Equality
-template <uint32_t S0, uint32_t F0, auto S1>
-CUTE_HOST_DEVICE constexpr
-auto
-operator==(MixedBits<S0,F0> const& m, C<S1>)
-{
-  return (S0 == (uint32_t(S1) & ~F0)) && (m.dynamic_int_ == (uint32_t(S1) & F0));
-}
-
-template <uint32_t S0, uint32_t F0, auto S1>
-CUTE_HOST_DEVICE constexpr
-auto
-operator==(C<S1> s, MixedBits<S0,F0> const& m)
-{
-  return m == s;
-}
-
-// Bitwise AND
-template <uint32_t S0, uint32_t F0,
-          uint32_t S1, uint32_t F1>
-CUTE_HOST_DEVICE constexpr
-auto
-operator&(MixedBits<S0,F0> const& m0, MixedBits<S1,F1> const& m1)
-{
-  // Truth table for (S0,D0,F0) & (S1,D1,F1) -> (S,D,F)
-  //   S0D0F0  | 0X0 | 001 | 011 | 1X0 |
-  // S1D1F1
-  //  0X0      | 0X0 | 0X0 | 0X0 | 0X0 |
-  //  001      | 0X0 | 001 | 001 | 001 |
-  //  011      | 0X0 | 001 | 011 | 011 |
-  //  1X0      | 0X0 | 001 | 011 | 1X0 |
-
-  return make_mixed_bits(C<S0 & S1>{},
-                         //(S0 | m0.dynamic_int_) & (S1 | m1.dynamic_int_),
-                         ((S1 & F0) & m0.dynamic_int_) | ((S0 & F1) & m1.dynamic_int_) | (m0.dynamic_int_ & m1.dynamic_int_),
-                         C<(S1 & F0) | (S0 & F1) | (F0 & F1)>{});
-}
-
-template <uint32_t S0, uint32_t F0, auto S1>
-CUTE_HOST_DEVICE constexpr
-auto
-operator&(MixedBits<S0,F0> const& m, C<S1>)
-{
-  return make_mixed_bits(C<S0 & uint32_t(S1)>{},
-                         m.dynamic_int_,
-                         C<F0 & uint32_t(S1)>{});
-}
-
-template <uint32_t S0, uint32_t F0, auto S1>
-CUTE_HOST_DEVICE constexpr
-auto
-operator&(C<S1> s, MixedBits<S0,F0> const& m)
-{
-  return m & s;
-}
-
-// Bitwise OR
-template <uint32_t S0, uint32_t F0,
-          uint32_t S1, uint32_t F1>
-CUTE_HOST_DEVICE constexpr
-auto
-operator|(MixedBits<S0,F0> const& m0, MixedBits<S1,F1> const& m1)
-{
-  // Truth table for (S0,D0,F0) | (S1,D1,F1) -> (S,D,F)
-  //   S0D0F0 | 0X0 | 001 | 011 | 1X0 |
-  // S1D1F1
-  //  0X0     | 0X0 | 001 | 011 | 1X0 |
-  //  001     | 001 | 001 | 011 | 1X0 |
-  //  011     | 011 | 011 | 011 | 1X0 |
-  //  1X0     | 1X0 | 1X0 | 1X0 | 1X0 |
-
-  return make_mixed_bits(C<S0 | S1>{},
-                         ((~S1 & F0) & m0.dynamic_int_) | ((~S0 & F1) & m1.dynamic_int_),
-                         C<(~S0 & F1) | (~S1 & F0)>{});
-}
-
-template <uint32_t S0, uint32_t F0, auto S1>
-CUTE_HOST_DEVICE constexpr
-auto
-operator|(MixedBits<S0,F0> const& m, C<S1>)
-{
-  return make_mixed_bits(C<S0 |  uint32_t(S1)>{},
-                         m.dynamic_int_,
-                         C<F0 & ~uint32_t(S1)>{});
-}
-
-template <uint32_t S0, uint32_t F0, auto S1>
-CUTE_HOST_DEVICE constexpr
-auto
-operator|(C<S1> s, MixedBits<S0,F0> const& m)
-{
-  return m | s;
-}
-
-// Bitwise XOR
-template <uint32_t S0, uint32_t F0,
-          uint32_t S1, uint32_t F1>
-CUTE_HOST_DEVICE constexpr
-auto
-operator^(MixedBits<S0,F0> const& m0, MixedBits<S1,F1> const& m1)
-{
-  // Truth table for (S0,D0,F0) ^ (S1,D1,F1) -> (S,D,F)
-  //   S0D0F0 | 0X0 | 001 | 011 | 1X0 |
-  // S1D1F1
-  //  0X0     | 0X0 | 001 | 011 | 1X0 |
-  //  001     | 001 | 001 | 011 | 011 |
-  //  011     | 011 | 011 | 001 | 001 |
-  //  1X0     | 1X0 | 011 | 001 | 0X0 |
-
-  return make_mixed_bits(C<(~S0 & S1 & ~F0) | (S0 & ~S1 & ~F1)>{},
-                         (S0 | m0.dynamic_int_) ^ (S1 | m1.dynamic_int_),
-                         C<F0 | F1>{});
-}
-
-template <uint32_t S0, uint32_t F0, auto S1>
-CUTE_HOST_DEVICE constexpr
-auto
-operator^(MixedBits<S0,F0> const& m, C<S1>)
-{
-  return make_mixed_bits(C<(~S0 & uint32_t(S1) & ~F0) | (S0 & ~uint32_t(S1))>{},
-                         (S0 | m.dynamic_int_) ^ uint32_t(S1),
-                         C<F0>{});
-}
-
-template <uint32_t S0, uint32_t F0, auto S1>
-CUTE_HOST_DEVICE constexpr
-auto
-operator^(C<S1> s, MixedBits<S0,F0> const& m)
-{
-  return m ^ s;
-}
-
-template <uint32_t S0, uint32_t F0, auto S1>
-CUTE_HOST_DEVICE constexpr
-auto
-operator<<(MixedBits<S0,F0> const& m, C<S1>)
-{
-  return make_mixed_bits(C<(S0 << S1)>{},
-                         m.dynamic_int_ << S1,
-                         C<(F0 << S1)>{});
-}
-
-template <uint32_t S0, uint32_t F0, auto S1>
-CUTE_HOST_DEVICE constexpr
-auto
-operator>>(MixedBits<S0,F0> const& m, C<S1>)
-{
-  return make_mixed_bits(C<(S0 >> S1)>{},
-                         m.dynamic_int_ >> S1,
-                         C<(F0 >> S1)>{});
-}
-
-template <uint32_t S0, uint32_t F0, auto S1>
-CUTE_HOST_DEVICE constexpr
-auto
-shiftl(MixedBits<S0,F0> const& m, C<S1> s)
-{
-  if constexpr (S1 >= 0) {
-    return m << s;
-  } else {
-    return m >> -s;
-  }
-}
-
-template <uint32_t S0, uint32_t F0, auto S1>
-CUTE_HOST_DEVICE constexpr
-auto
-shiftr(MixedBits<S0,F0> const& m, C<S1> s)
-{
-  if constexpr (S1 >= 0) {
-    return m >> s;
-  } else {
-    return m << -s;
-  }
-}
-
-//
-// Upcast and Downcast
-//
-
-template <uint32_t S0, uint32_t F0, auto S1>
-CUTE_HOST_DEVICE constexpr
-auto
-safe_div(MixedBits<S0,F0> const& m, C<S1> s)
-{
-  static_assert(has_single_bit(uint32_t(S1)), "Only divide MixedBits by powers of two.");
-  return make_mixed_bits(safe_div(C<S0>{}, s),
-                         safe_div(m.dynamic_int_, s),
-                         safe_div(C<F0>{}, s));
-}
-
-template <uint32_t N, uint32_t S0, uint32_t F0>
-CUTE_HOST_DEVICE constexpr
-auto
-upcast(MixedBits<S0,F0> const& m)
-{
-  static_assert(has_single_bit(N), "Only divide MixedBits by powers of two.");
-  return safe_div(m, C<N>{});
-}
-
-template <uint32_t N, class T, __CUTE_REQUIRES(cute::is_integral<T>::value)>
-CUTE_HOST_DEVICE constexpr
-auto
-upcast(T const& m)
-{
-  return safe_div(m, C<N>{});
-}
-
-template <uint32_t N, uint32_t S0, uint32_t F0>
-CUTE_HOST_DEVICE constexpr
-auto
-downcast(MixedBits<S0,F0> const& m)
-{
-  static_assert(has_single_bit(N), "Only scale MixedBits by powers of two.");
-  return make_mixed_bits(C<S0 * N>{},
-                         m.dynamic_int_ * N,
-                         C<F0 * N>{});
-}
-
-template <uint32_t N, class T, __CUTE_REQUIRES(cute::is_integral<T>::value)>
-CUTE_HOST_DEVICE constexpr
-auto
-downcast(T const& m)
-{
-  return m * C<N>{};
-}
-
-template <uint32_t S0, uint32_t F0>
-CUTE_HOST_DEVICE constexpr
-auto
-max_alignment(MixedBits<S0,F0> const&)
-{
-  return C<uint32_t(1) << countr_zero(S0 | F0)>{};
-}
-
-template <auto v>
-CUTE_HOST_DEVICE constexpr
-C<v>
-max_alignment(C<v> const& c)
-{
-  return c;
-}
-
-//
-// Convert a Pow2Layout+Coord to a MixedBits
-//
-
-template <class Shape, class Stride, class Coord>
-CUTE_HOST_DEVICE constexpr
-auto
-to_mixed_bits(Shape const& shape, Stride const& stride, Coord const& coord)
-{
-  if constexpr (is_tuple<Shape>::value && is_tuple<Stride>::value && is_tuple<Coord>::value) {
-    static_assert(tuple_size<Shape>::value == tuple_size<Stride>::value, "Mismatched ranks");
-    static_assert(tuple_size<Shape>::value == tuple_size<Coord >::value, "Mismatched ranks");
-    return transform_apply(shape, stride, coord, [](auto const& s, auto const& d, auto const& c) { return to_mixed_bits(s,d,c); },
-                                                 [](auto const&... a) { return (a ^ ...); });
-  } else if constexpr (is_integral<Shape>::value && is_integral<Stride>::value && is_integral<Coord>::value) {
-    static_assert(decltype(shape*stride)::value == 0 || has_single_bit(decltype(shape*stride)::value), "Requires pow2 shape*stride.");
-    return make_mixed_bits(Int<0>{}, coord * stride, (shape - Int<1>{}) * stride);
-  } else {
-    static_assert(is_integral<Shape>::value && is_integral<Stride>::value && is_integral<Coord>::value, "Either Shape, Stride, and Coord must be all tuples, or they must be all integral (in the sense of cute::is_integral).");
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <class Layout, class Coord>
-CUTE_HOST_DEVICE constexpr
-auto
-to_mixed_bits(Layout const& layout, Coord const& coord)
-{
-  return to_mixed_bits(layout.shape(), layout.stride(), idx2crd(coord, layout.shape()));
-}
-
-//
-// Display utilities
-//
-
-template <int B, int M, int S>
-CUTE_HOST_DEVICE void print(Swizzle<B,M,S> const&)
-{
-  printf("Sw<%d,%d,%d>", B, M, S);
-}
-
-template <uint32_t S, uint32_t F>
-CUTE_HOST_DEVICE void print(MixedBits<S,F> const& m)
-{
-  printf("M_%u|(%u&%u)=%u", S, m.dynamic_int_, F, uint32_t(m));
-}
-
-#if !defined(__CUDACC_RTC__)
-template <int B, int M, int S>
-CUTE_HOST std::ostream& operator<<(std::ostream& os, Swizzle<B,M,S> const&)
-{
-  return os << "Sw<" << B << "," << M << "," << S << ">";
-}
-
-template <uint32_t S, class D, uint32_t F>
-CUTE_HOST std::ostream& operator<<(std::ostream& os, MixedBits<S,F> const& m)
-{
-  return os << "M_" << S << "|(" << m.dynamic_int_ << "&" << F << ")=" << uint32_t(m);
-}
-#endif // !defined(__CUDACC_RTC__)
-
-//
-// Helper Function
-//
-template <class T, class = void>                      // Default No-Swizzle
-struct get_swizzle { using type = Swizzle<0,4,3>; };
-
-template <class T>
-using get_swizzle_t = typename get_swizzle<T>::type;
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/swizzle_layout.hpp b/lightllm-kernel/cutlass/include/cute/swizzle_layout.hpp
deleted file mode 100755
index 1324360eb..000000000
--- a/lightllm-kernel/cutlass/include/cute/swizzle_layout.hpp
+++ /dev/null
@@ -1,584 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>           // CUTE_HOST_DEVICE
-#include <cute/layout.hpp>           // cute::Layout
-#include <cute/layout_composed.hpp>  // cute::ComposedLayout
-#include <cute/swizzle.hpp>          // cute::Swizzle, cute::get_swizzle primary template
-
-/* Specialized functionality for a ComposedLayout of the form
- *   InvolutionFn o Offset o LayoutB
- * where the InvolutionFn is a Swizzle<B,M,S> and is not linear (hence the need for the Offset).
- *
- * Because these are specializations for core functions of ComposedLayout, these Swizzle Layouts
- * provide similar functionality to Layout including tiling, partitioning,
- * coordinate-to-index mapping and layout manipulations, but are not considered "normal" layouts.
- * For example, these provide shape() and size() functions, but do not provide stride() functions.
- *
- * Furthermore, each of these specializations uses Swizzle<>-specific knowledge in its implementation and
- * attempts to decay itself to a normal-layout with dynamic or static strides when certain slicing conditions
- * are met. This is possible by determining the subdomain of the Swizzle<> function that is identity and
- * testing if LayoutB's codomain is contained within it. In general, MizedBits is used as the Offset to track
- * statically-vs-dynamically known bits in the Offset to improve the decay to static or dynamic normal layouts.
- */
-
-namespace cute
-{
-
-//
-// Helper Function
-//
-template <int B, int M, int S, class Offset, class LayoutB>
-struct get_swizzle<ComposedLayout<Swizzle<B,M,S>,Offset,LayoutB>> { using type = Swizzle<B,M,S>; };
-
-//
-// Constructors
-//
-
-template <int B, int M, int S>
-CUTE_HOST_DEVICE constexpr
-auto
-make_layout(Swizzle<B,M,S> const& sxor)
-{
-  return composition(sxor, Layout<Int<M+B+abs(S)>,Int<1>>{});
-}
-
-namespace detail {
-
-template <int B, int M, int S, class OldShape, class OldStride, class NewShape, class NewStride>
-CUTE_HOST_DEVICE constexpr
-auto
-transfer_swizzle(Layout<OldShape,OldStride> const& old_layout,
-                 Layout<NewShape,NewStride> const& new_layout)
-{
-  // Our goal is to determine a new swizzle for the strides in new_layout for consistent vectorizations
-
-  // This is accomplished by identifying
-  //  S o L  :=:  S? o L*
-  // We identify the "active" portion of S by computing (P o L)(c*) where P is a projection generated by S
-  // Then that active identifier is transformed through the layouts:
-  //  L*(L[(P o L)(c*)])
-  // which is a new swizzle identifier for S?, the new swizzle
-
-  // Projections of the swizzle layout for composition, P
-  auto swizzle_only_zy = make_layout(make_shape (Int<(1 << M)>{}, Int<(1 << B)>{}, Int<(1 << (abs(S)-B))>{}, Int<(1 <<  B        )>{}, Int<1>{}),
-                                     make_stride(       Int<0>{}, Int<(1 << M)>{},                 Int<0>{}, Int<(1 << (M+abs(S)))>{}, Int<0>{}));
-
-  // Compose with the tile to get the swizzle projection, P o L  [The Z and Y contributing portions of L]
-  auto layout_only_zy       = composition(swizzle_only_zy, old_layout);
-  // Transform the end coordinate to get the active bits of the swizzle, (P o L)(c*)
-  auto swizzle_active_bits  = layout_only_zy(size(layout_only_zy)-Int<1>{});
-
-  // Get the Z bit and the Y bits -- keep only those that are active in Z *and* Y
-  auto zzz_msk = typename Swizzle<B,M,S>::zzz_msk{};
-  auto yyy_msk = typename Swizzle<B,M,S>::yyy_msk{};
-  auto msk_sft = typename Swizzle<B,M,S>::msk_sft{};
-  auto active_Z = swizzle_active_bits & shiftr(swizzle_active_bits,  msk_sft) & zzz_msk;
-  auto active_Y = swizzle_active_bits & shiftr(swizzle_active_bits, -msk_sft) & yyy_msk;
-
-  // Pass the identifiers through the old layout and new layout to make a new swizzle identifier, L*(L[(P o L)(c*)])
-  auto new_active_Z = new_layout(old_layout.get_1d_coord(active_Z));
-  auto new_active_Y = new_layout(old_layout.get_1d_coord(active_Y));
-
-  // Use this new swizzle identifier to construct the new swizzle for new_layout
-  //   (this also makes sure it's a "valid" swizzle that Swizzle can represent)
-  return composition(make_swizzle<new_active_Y,new_active_Z>(), new_layout);
-}
-
-} // end namespace detail
-
-template <int B, int M, int S, class Offset, class Layout>
-CUTE_HOST_DEVICE constexpr
-auto
-make_fragment_like(ComposedLayout<Swizzle<B,M,S>,Offset,Layout> const& layout)
-{
-  return make_fragment_like(layout.layout_b());
-}
-
-//
-// Utilities
-//
-
-namespace detail {
-
-// Get just the Swizzle part of a composed layout.
-template <int B, int M, int S, class Offset, class LayoutB>
-CUTE_HOST_DEVICE constexpr
-auto
-get_swizzle_portion(ComposedLayout<Swizzle<B,M,S>,Offset,LayoutB>)
-{
-  return Swizzle<B,M,S>{};
-}
-
-// A non-swizzled layout's "Swizzle part" is the identity swizzle.
-template <class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-get_swizzle_portion(Layout<Shape,Stride>)
-{
-  return Swizzle<0,4,3>{};
-}
-
-// Get the "non-swizzle" part of a composed layout,
-// which is the underlying (non-composed) Layout.
-template <int B, int M, int S, class Offset, class LayoutB>
-CUTE_HOST_DEVICE constexpr
-auto
-get_nonswizzle_portion(ComposedLayout<Swizzle<B,M,S>,Offset,LayoutB> const& slayout)
-{
-  return slayout.layout_b();
-}
-
-// The non-swizzle part of a non-swizzled layout is just the Layout.
-template <class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-get_nonswizzle_portion(Layout<Shape,Stride> const& slayout)
-{
-  return slayout;
-}
-
-} // namespace detail
-
-//
-// Slice a Swizzled ComposedLayout
-//
-
-namespace detail {
-
-template <class IntZ, class IntY, class Offset, int... I>
-CUTE_HOST_DEVICE constexpr
-auto
-make_swizzle_strides(true_type,
-                     IntZ   const& Z,
-                     IntY   const& Y,
-                     Offset const& offset,
-                     int_sequence<I...>)
-{
-  // Below is an optimized/compressed version of:
-  //return cute::make_tuple((swizzle(offset + Z*Int<(1 << I)>{}) - swizzle(offset))...);
-  // with knowledge of Swizzle, I... ranges for each B bits,
-  //    and the layout won't slice along z-bits that are already set
-
-  // y\z  0   1
-  //   0  Z  DC
-  //   1 -Z  DC
-
-  return cute::make_tuple(conditional_return((offset & (Y << Int<I>{})) == Int<0>{}, Z * Int<(1 << I)>{}, -Z * Int<(1 << I)>{})...);
-}
-
-template <class IntZ, class IntY, class Offset, int... I>
-CUTE_HOST_DEVICE constexpr
-auto
-make_swizzle_strides(false_type,
-                     IntZ   const& Z,
-                     IntY   const& Y,
-                     Offset const& offset,
-                     int_sequence<I...>)
-{
-  // Below is an optimized/compressed version of:
-  //return cute::make_tuple((swizzle(offset + Y*Int<(1 << I)>{}) - swizzle(offset))...);
-  // with knowledge of Swizzle, I... ranges for each B bits,
-  //    and the layout won't slice along y-bits that are already set
-
-  // y\z  0   1
-  //   0 Y+Z Y-Z
-  //   1 DC  DC
-
-  return cute::make_tuple(conditional_return((offset & (Z << Int<I>{})) == Int<0>{}, (Y+Z) * Int<(1 << I)>{}, (Y-Z) * Int<(1 << I)>{})...);
-}
-
-} // end namespace detail
-
-template <class Coord, int B, int M, int S, class Offset, class Layout>
-CUTE_HOST_DEVICE constexpr
-auto
-slice_and_offset(Coord const& coord, ComposedLayout<Swizzle<B,M,S>,Offset,Layout> const& layout)
-{
-  if constexpr (all_underscore<Coord>::value) {
-    // Skip the expensive/complicated attempt to decay to a normal layout and just reshape
-    return cute::make_tuple(composition(layout.layout_a(), layout.offset(), slice(coord, layout.layout_b())), Int<0>{});
-  } else {
-
-    // Projections of the swizzle layout for composition
-    auto sw = make_layout(make_shape(Int<(1 << M)>{}, Int<(1 << B)>{}, Int<(1 << (abs(S)-B))>{}, Int<(1 << B)>{}, Int<1>{}));
-
-    auto swizzle_anti_zy = make_layout(shape(sw),
-                                       make_stride(stride<0>(sw),      Int<0>{}, stride<2>(sw),      Int<0>{}, size(sw)));
-    auto swizzle_only_zy = make_layout(shape(sw),
-                                       make_stride(     Int<0>{}, stride<1>(sw),      Int<0>{}, stride<3>(sw), Int<0>{}));
-
-    // The portion of the layout that is not yet consumed
-    auto sliced_layout = slice(coord, layout.layout_b());
-
-    // The portion of the layout that we are consuming now
-    auto diced_layout = dice(coord, layout.layout_b());
-    auto diced_coord  = dice(coord, coord);
-
-    auto diced_layout_anti_zy = composition(swizzle_anti_zy, diced_layout);
-    auto diced_layout_only_zy = composition(swizzle_only_zy, diced_layout);
-
-    // New swizzle and offset
-    auto swizzle = layout.layout_a();
-    // offset_only_zy interacts with swizzle and gets accumulated with layout.offset()
-    //   being careful about the static/dynamic contributions from diced_layout and diced_coord
-    auto offset_only_zy = layout.offset() ^ to_mixed_bits(diced_layout_only_zy, diced_coord);
-    // offset_anti_zy always gets passed through, no interaction with swizzle
-    auto offset_anti_zy = diced_layout_anti_zy(diced_coord);
-
-    // If Layout's codomain hits on         Y AND Z, then it's not reducible
-    // If Layout's codomain hits on         Y XOR Z, then it's dynamic-normal
-    // If Layout's codomain hits on neither Y NOR Z, then it's static-normal
-
-    // If the sliced_layout hits two bits that are swizzled together, then don't attempt to decay
-
-    // Compose with the layout to get the swizzle projection, P o L  [The Z and Y contributing portions of L]
-    //   (this also tests that shape/stride of layout compose with swizzle)
-    auto sliced_layout_only_zy = composition(swizzle_only_zy, sliced_layout);
-    // Transform the end coordinate to get the active bits of the swizzle, (P o L)(c*)
-    [[maybe_unused]] auto swizzle_active_bits = sliced_layout_only_zy(size(sliced_layout_only_zy)-Int<1>{});
-
-    // Determine if any active bits collide under the swizzle for potential decay
-    if constexpr (is_constant<0, decltype(not (swizzle_active_bits & ~swizzle(swizzle_active_bits)))>::value)
-    { // Hits on Y AND Z, so it's not reducible
-      return cute::make_tuple(composition(swizzle, offset_only_zy, sliced_layout), offset_anti_zy);
-    } else
-    { // Misses on Y or Z, so it's static-normal or dynamic-normal
-
-      // Lowest bit of the Z and Y masks
-      auto Z = typename Swizzle<B,M,S>::zzz_msk{} & -typename Swizzle<B,M,S>::zzz_msk{};
-      auto Y = typename Swizzle<B,M,S>::yyy_msk{} & -typename Swizzle<B,M,S>::yyy_msk{};
-      auto stride_lo = detail::make_swizzle_strides(Z < Y, Z, Y, offset_only_zy, make_int_sequence<B>{});
-      auto stride_hi = detail::make_swizzle_strides(Z > Y, Z, Y, offset_only_zy, make_int_sequence<B>{});
-
-      // Construct a (dynamic) layout that we can perform the composition with
-      auto swizzle_layout = make_layout(make_shape (Int<(1 << M)>{}, repeat<B>(Int<2>{}), Int<(1 << (abs(S)-B))>{}, repeat<B>(Int<2>{}), Int<                  1>{}),
-                                        make_stride(Int<       1>{},           stride_lo, Int<(1 <<      (M+B))>{},          stride_hi , Int<(1 << (M+B+abs(S)))>{}));
-
-      // Decay to a normal layout with offset
-      return cute::make_tuple(composition(swizzle_layout, sliced_layout),
-                              swizzle(offset_only_zy) + offset_anti_zy);
-    }
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-//
-// composition
-//
-
-// Ignore identity case
-template <int M, int S,
-          class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-composition(Swizzle<0,M,S> const&,
-            Int<0> const&,
-            Layout<Shape,Stride> const& layout)
-{
-  return layout;
-}
-
-template <int B, int M, int S,
-          class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-composition(Swizzle<B,M,S> const& sxor,
-            Layout<Shape,Stride> const& layout)
-{
-  return composition(sxor, Int<0>{}, layout);
-}
-
-template <class ShapeA, class StrideA,
-          int B, int M, int S>
-CUTE_HOST_DEVICE constexpr
-auto
-composition(Layout<ShapeA,StrideA> const& a,
-            Swizzle<B,M,S>         const& b)
-{
-  // Get the Z bits and the Y bits
-  auto active_Y = a(typename Swizzle<B,M,S>::yyy_msk{});
-  auto active_Z = a(typename Swizzle<B,M,S>::zzz_msk{});
-
-  // Works in simple cases... but could be greatly generalized
-
-  return composition(make_swizzle<active_Y,active_Z>(), a);
-}
-
-//
-// inverse
-//
-
-// Specialization to attempt to pass-through the Swizzle back to the left -- Needed?
-template <int B, int M, int S, class Offset, class Layout>
-CUTE_HOST_DEVICE constexpr
-auto
-right_inverse(ComposedLayout<Swizzle<B,M,S>,Offset,Layout> const& layout)
-{
-  if constexpr (is_constant<0, Offset>::value) {
-    return composition(right_inverse(layout.layout_b()), layout.layout_a());
-  } else {
-    return composition(right_inverse(layout.layout_b()), right_inverse(layout.offset()), right_inverse(layout.layout_a()));
-  }
-}
-
-// Specialization to attempt to pass-through the Swizzle back to the left -- Needed?
-template <int B, int M, int S, class Offset, class Layout>
-CUTE_HOST_DEVICE constexpr
-auto
-left_inverse(ComposedLayout<Swizzle<B,M,S>,Offset,Layout> const& layout)
-{
-  if constexpr (is_constant<0, Offset>::value) {
-    return composition(left_inverse(layout.layout_b()), layout.layout_a());
-  } else {
-    return composition(left_inverse(layout.layout_b()), left_inverse(layout.offset()), left_inverse(layout.layout_a()));
-  }
-}
-
-template <int B, int M, int S>
-CUTE_HOST_DEVICE constexpr
-Swizzle<B,M,S>
-right_inverse(Swizzle<B,M,S> const& sw)
-{
-  return sw;
-}
-
-template <int B, int M, int S>
-CUTE_HOST_DEVICE constexpr
-Swizzle<B,M,S>
-left_inverse(Swizzle<B,M,S> const& sw)
-{
-  return sw;
-}
-
-// Kludge -- Probably want an OffsetFn<T> here instead
-template <class T, __CUTE_REQUIRES(is_integral<T>::value)>
-CUTE_HOST_DEVICE constexpr
-auto
-right_inverse(T const& t)
-{
-  return -t;
-}
-
-// Kludge -- Probably want an OffsetFn<T> here instead
-template <class T, __CUTE_REQUIRES(is_integral<T>::value)>
-CUTE_HOST_DEVICE constexpr
-auto
-left_inverse(T const& t)
-{
-  return -t;
-}
-
-//
-// Upcast and Downcast
-//
-
-template <int N, int B, int M, int S>
-CUTE_HOST_DEVICE constexpr
-auto
-upcast(Swizzle<B,M,S> const& swizzle)
-{
-  static_assert(has_single_bit(N), "N must be a power of two");
-  constexpr int log2_n = bit_width(uint32_t(N)) - 1;
-  constexpr int NewM   = M - log2_n;
-  if constexpr (NewM >= 0) {
-    return Swizzle<B,NewM,S>{};
-  } else {
-    return Swizzle<cute::max(B+NewM,0), 0, S>{};
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <int N, int B, int M, int S>
-CUTE_HOST_DEVICE constexpr
-auto
-downcast(Swizzle<B,M,S> const& swizzle)
-{
-  static_assert(has_single_bit(N), "N must be a power of two");
-  constexpr int log2_n = bit_width(uint32_t(N)) - 1;
-  return Swizzle<B,(M + log2_n),S>{};
-}
-
-template <class OldType, class NewType,
-          int B, int M, int S>
-CUTE_HOST_DEVICE constexpr
-auto
-recast_layout(Swizzle<B,M,S> const& swizzle)
-{
-  using scale = decltype(trait_ratio(sizeof_bits<NewType>{}, sizeof_bits<OldType>{}));
-  if constexpr (scale::num == 1 && scale::den == 1) {
-    return swizzle;
-  }
-  else if constexpr (scale::num == 1) {
-    return downcast<scale::den>(swizzle);
-  }
-  else if constexpr (scale::den == 1) {
-    return upcast<scale::num>(swizzle);
-  }
-  else {
-    static_assert(dependent_false<scale>, "Recast not supported.");
-  }
-  CUTE_GCC_UNREACHABLE;
-}
-
-template <int B, int M, int S>
-CUTE_HOST_DEVICE constexpr
-auto
-max_alignment(Swizzle<B,M,S> const&)
-{
-  return Int<1 << M>{};
-}
-
-template <int B, int M, int S, class Offset, class LayoutB>
-CUTE_HOST_DEVICE constexpr
-auto
-max_alignment(ComposedLayout<Swizzle<B,M,S>,Offset,LayoutB> const& layout)
-{
-  return gcd(max_alignment(layout.layout_a()),
-             max_alignment(layout.offset()),
-             max_alignment(layout.layout_b()));
-}
-
-//
-// Other operations
-//
-
-template <int B, int M, int S, class Offset, class LayoutB, class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-max_common_layout(ComposedLayout<Swizzle<B,M,S>,Offset,LayoutB> const& a,
-                  Layout<Shape,Stride>                          const& b)
-{
-  auto common = max_common_layout(a.layout_b(), b);
-  auto base = Int<(1 << M)>{};
-  if constexpr (base < size(common)) {
-    return common.compose(base);       // Truncate common to size base
-  } else {
-    return common;
-  }
-}
-
-template <class Shape, class Stride, int B, int M, int S, class Offset, class LayoutB>
-CUTE_HOST_DEVICE constexpr
-auto
-max_common_layout(Layout<Shape,Stride>                          const& a,
-                  ComposedLayout<Swizzle<B,M,S>,Offset,LayoutB> const& b)
-{
-  return max_common_layout(b, a);
-}
-
-template <int B, int M, int S, class Offset, class LayoutB, class Shape, class Stride>
-CUTE_HOST_DEVICE constexpr
-auto
-max_common_vector(ComposedLayout<Swizzle<B,M,S>,Offset,LayoutB> const& a,
-                  Layout<Shape,Stride>                          const& b)
-{
-  // This assumes that Offset is in the YZ domain of the Swizzle...
-  return cute::min(max_common_vector(a.layout_b(), b), Int<(1 << M)>{});
-}
-
-template <class Shape, class Stride, int B, int M, int S, class Offset, class LayoutB>
-CUTE_HOST_DEVICE constexpr
-auto
-max_common_vector(Layout<Shape,Stride>                          const& a,
-                  ComposedLayout<Swizzle<B,M,S>,Offset,LayoutB> const& b)
-{
-  return max_common_vector(b, a);
-}
-
-template <int B0, int M0, int S0, class Offset0, class LayoutB0,
-          int B1, int M1, int S1, class Offset1, class LayoutB1>
-CUTE_HOST_DEVICE constexpr
-auto
-max_common_vector(ComposedLayout<Swizzle<B0,M0,S0>,Offset0,LayoutB0> const& a,
-                  ComposedLayout<Swizzle<B1,M1,S1>,Offset1,LayoutB1> const& b)
-{
-  // Typical impl is composition(a, right_inverse(b))
-  // so this is  Sw0 o B0 o rinv(Sw1 o B1) = Sw0 o B0 o rinv(B1) o Sw1
-  auto vec = max_common_vector(a.layout_b(), b.layout_b());
-
-  // This assumes that Offset is in the YZ domain of the Swizzle...
-  if constexpr (Swizzle<B0,M0,S0>{} == Swizzle<B1,M1,S1>{}) {
-    return vec;
-  } else {
-    return cute::min(vec, Int<(1 << M0)>{}, Int<(1 << M1)>{});
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-///////////////////////////////////////////////////////////////////////////////
-// ComposedLayout as second argument is often more difficult...
-
-template <class Shape, class Stride,
-          int B, int M, int S, class Offset, class LayoutT>
-CUTE_HOST_DEVICE constexpr
-auto
-logical_product(Layout<Shape,Stride>                          const& layout,
-                ComposedLayout<Swizzle<B,M,S>,Offset,LayoutT> const& tiler)
-{
-  CUTE_STATIC_ASSERT_V(tiler.offset() == Int<0>{}, "Require Swizzle offset == 0.");
-  // The new layout -- if swizzle wasn't an issue, this is the result
-  //   our goal is to determine a new swizzle for these strides
-  auto new_layout = logical_product(layout, tiler.layout_b());
-
-  // This is accomplished by identifying
-  //  S o L  :=:  S? o L*
-  // We identify the "active" portion of S by computing (P o L)(c*) where P is a projection generated by S
-  // Then that active identifier is transformed through the layouts:
-  //  L*(L[(P o L)(c*)])
-  // which is a new swizzle identifier for S?, the new swizzle
-
-  // Projections of the swizzle layout for composition, P
-  auto swizzle_only_zy = make_layout(make_shape (Int<(1 << M)>{}, Int<(1 << B)>{}, Int<(1 << (abs(S)-B))>{}, Int<(1 <<  B        )>{}, Int<1>{}),
-                                     make_stride(       Int<0>{}, Int<(1 << M)>{},                 Int<0>{}, Int<(1 << (M+abs(S)))>{}, Int<0>{}));
-
-  // Compose with the tiler to get the swizzle projection, P o L  [The Z and Y contributing portions of L]
-  auto layout_only_zy       = composition(swizzle_only_zy, tiler.layout_b());
-  // Transform the end coordinate to get the active bits of the swizzle, (P o L)(c*)
-  auto swizzle_active_bits  = layout_only_zy(size(layout_only_zy)-Int<1>{});
-  // Get the Z bit and the Y bits
-  auto active_Z = swizzle_active_bits & typename Swizzle<B,M,S>::zzz_msk{};
-  auto active_Y = swizzle_active_bits & typename Swizzle<B,M,S>::yyy_msk{};
-
-  // Pass the identifiers through the old layout and new layout to make a new swizzle identifier, L*(L[(P o L)(c*)])
-  auto new_active_Z = new_layout(Int<0>{}, tiler.layout_b()[active_Z]);
-  auto new_active_Y = new_layout(Int<0>{}, tiler.layout_b()[active_Y]);
-
-  // Use this new swizzle identifier to construxt the new swizzle for new_layout
-  //   (this also makes sure it's a "valid" swizzle that Swizzle can represent)
-  return composition(make_swizzle<new_active_Y,new_active_Z>(), new_layout);
-}
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/tensor.hpp b/lightllm-kernel/cutlass/include/cute/tensor.hpp
deleted file mode 100755
index 3f3335b63..000000000
--- a/lightllm-kernel/cutlass/include/cute/tensor.hpp
+++ /dev/null
@@ -1,58 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/tensor_impl.hpp>
-
-//
-// Extended Engines
-//
-
-#include <cute/pointer_swizzle.hpp>
-#include <cute/pointer_sparse.hpp>
-#include <cute/pointer_flagged.hpp>
-#include <cute/tensor_zip.hpp>
-
-//
-// Tensor Algorithms
-//
-
-#include <cute/algorithm/tensor_algorithms.hpp>
-#include <cute/algorithm/fill.hpp>
-#include <cute/algorithm/clear.hpp>
-#include <cute/algorithm/copy.hpp>
-#include <cute/algorithm/prefetch.hpp>
-#include <cute/algorithm/axpby.hpp>
-#include <cute/algorithm/gemm.hpp>
-
-#include <cute/algorithm/cooperative_copy.hpp>
-#include <cute/algorithm/cooperative_gemm.hpp>
-
diff --git a/lightllm-kernel/cutlass/include/cute/tensor_impl.hpp b/lightllm-kernel/cutlass/include/cute/tensor_impl.hpp
deleted file mode 100755
index 61eefc506..000000000
--- a/lightllm-kernel/cutlass/include/cute/tensor_impl.hpp
+++ /dev/null
@@ -1,1193 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief This file contains the definition of Tensor as well as classes/functions most closely associated with it.
-
-    For backwards-compatibility, "tensor.hpp" is the "entrypoint" header for a collection of classes and utilities
-    that are adjacent to Tensor, e.g. fill(). Whereas this file contains the actual definition of Tensor and
-    a small set of functions central to its usage.
-
-    Within the CUTLASS codebase, favor not including "tensor.hpp" wherever possible; instead include "tensor_impl.hpp"
-    along with other specific headers that you need. This helps to avoid circular includes and to reduce build time.
-*/
-
-#pragma once
-
-#include <cute/config.hpp>                     // CUTE_HOST_DEVICE
-#include <cute/layout.hpp>                     // cute::Shape
-#include <cute/layout_composed.hpp>            // cute::is_composed_layout
-#include <cute/pointer.hpp>                    // cute::recast_ptr
-#include <cute/pointer_base.hpp>               // cute::iterator_traits
-#include <cute/container/array_aligned.hpp>    // cute::array_aligned
-#include <cute/container/array_subbyte.hpp>    // cute::array_subbyte
-#include <cute/container/tuple.hpp>            // cute::tuple
-#include <cute/numeric/integral_constant.hpp>  // cute::is_integral
-#include <cute/util/type_traits.hpp>           // __CUTE_REQUIRES
-
-namespace cute
-{
-
-//
-// Engine -- owning or non-owning data store
-//
-
-// concept Engine {
-//   using iterator     = ;
-//   using value_type   = ;
-//   using element_type = ;
-//   using reference    = ;
-//   iterator begin();
-// };
-
-template <class T, size_t N>
-struct ArrayEngine
-{
-  using Storage = typename conditional<(sizeof_bits<T>::value % 8 == 0),
-                                       array_aligned<T,N>,
-                                       array_subbyte<T,N>>::type;
-  using iterator     = typename Storage::iterator;
-  using reference    = typename iterator_traits<iterator>::reference;
-  using element_type = typename iterator_traits<iterator>::element_type;
-  using value_type   = typename iterator_traits<iterator>::value_type;
-  Storage storage_;
-
-  CUTE_HOST_DEVICE constexpr auto begin() const { return storage_.begin(); }
-  CUTE_HOST_DEVICE constexpr auto begin()       { return storage_.begin(); }
-};
-
-// Specialization for sparse_elem<S,T> tensor allocation/iteration
-template <int S, class T, size_t N>
-struct ArrayEngine<sparse_elem<S,T>, N>
-{
-  static_assert(N % S == 0, "Expected a multiple of the sparsity.");
-  using value_type   = sparse_elem<S,T>;
-  using Storage      = typename conditional<(sizeof_bits<T>::value % 8 == 0),
-                                            array_aligned<T,N/S>,
-                                            array_subbyte<T,N/S>>::type;
-  using iterator     = sparse_ptr<S,sparse_elem<S,T>*>;
-  using reference    = typename iterator_traits<iterator>::reference;
-  using element_type = typename iterator_traits<iterator>::element_type;
-  Storage storage_;
-
-  CUTE_HOST_DEVICE constexpr auto begin() const { return recast_ptr<value_type>(storage_.begin()); }
-  CUTE_HOST_DEVICE constexpr auto begin()       { return recast_ptr<value_type>(storage_.begin()); }
-};
-
-template <class Iterator>
-struct ViewEngine
-{
-  using iterator     = Iterator;
-  using reference    = typename iterator_traits<iterator>::reference;
-  using element_type = typename iterator_traits<iterator>::element_type;
-  using value_type   = typename iterator_traits<iterator>::value_type;
-  iterator storage_;
-
-  CUTE_HOST_DEVICE constexpr iterator const& begin() const { return storage_; }
-  CUTE_HOST_DEVICE constexpr iterator      & begin()       { return storage_; }
-};
-
-template <class Iterator>
-struct ConstViewEngine
-{
-  using iterator     = Iterator;
-  using reference    = typename iterator_traits<iterator>::reference;
-  using element_type = typename iterator_traits<iterator>::element_type;
-  using value_type   = typename iterator_traits<iterator>::value_type;
-  iterator storage_;
-
-  CUTE_HOST_DEVICE constexpr iterator const& begin() const { return storage_; }
-};
-
-//
-// Tensor
-//
-
-template <class Engine, class Layout>
-struct Tensor
-{
-  using iterator     = typename Engine::iterator;
-  using value_type   = typename Engine::value_type;
-  using element_type = typename Engine::element_type;
-  using reference    = typename Engine::reference;
-
-  using engine_type  = Engine;
-  using layout_type  = Layout;
-
-  CUTE_HOST_DEVICE constexpr
-  Tensor() {}
-
-  CUTE_HOST_DEVICE constexpr
-  Tensor(Engine const& engine, Layout const& layout)
-      : rep_(layout, engine) {
-  }
-
-  //
-  // Accessors
-  //
-
-  static constexpr int rank  = Layout::rank;
-
-  CUTE_HOST_DEVICE constexpr
-  decltype(auto)
-  tensor() const {
-    return *this;
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  decltype(auto)
-  engine() const {
-    return get<1>(rep_);
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  decltype(auto)
-  engine() {
-    return get<1>(rep_);
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  decltype(auto)
-  data() const {
-    return engine().begin();
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  decltype(auto)
-  data() {
-    return engine().begin();
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  decltype(auto)
-  layout() const {
-    return get<0>(rep_);
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  decltype(auto)
-  shape() const {
-    return layout().shape();
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  auto
-  size() const {
-    return cute::size(shape());
-  }
-
-  CUTE_HOST_DEVICE constexpr
-  decltype(auto)
-  stride() const {
-    return layout().stride();
-  }
-
-  //
-  // Indexing op() and op[]
-  //
-
-  // Index into this tensor like an array by computing the offset via layout()
-  template <class Coord>
-  CUTE_HOST_DEVICE constexpr
-  decltype(auto)
-  operator[](Coord const& coord) {
-    return data()[layout()(coord)];
-  }
-
-  template <class Coord>
-  CUTE_HOST_DEVICE constexpr
-  decltype(auto)
-  operator[](Coord const& coord) const {
-    return data()[layout()(coord)];
-  }
-
-  template <class Coord>
-  CUTE_HOST_DEVICE constexpr
-  decltype(auto)
-  operator()(Coord const& coord) {
-    if constexpr (has_underscore<Coord>::value) {
-      auto const& [sliced_layout,offset] = slice_and_offset(coord, layout());
-      return make_tensor(data() + offset, sliced_layout);
-    } else {
-      return data()[layout()(coord)];
-    }
-
-    CUTE_GCC_UNREACHABLE;
-  }
-
-  template <class Coord>
-  CUTE_HOST_DEVICE constexpr
-  decltype(auto)
-  operator()(Coord const& coord) const {
-    if constexpr (has_underscore<Coord>::value) {
-      auto const& [sliced_layout,offset] = slice_and_offset(coord, layout());
-      return make_tensor(data() + offset, sliced_layout);
-    } else {
-      return data()[layout()(coord)];
-    }
-
-    CUTE_GCC_UNREACHABLE;
-  }
-
-  // op() convenience function for multi-dimensional coordinates
-  template <class Coord0, class Coord1, class... Coords>
-  CUTE_HOST_DEVICE constexpr
-  decltype(auto)
-  operator()(Coord0 const& c0, Coord1 const& c1, Coords const&... cs) {
-    return operator()(make_coord(c0,c1,cs...));
-  }
-
-  template <class Coord0, class Coord1, class... Coords>
-  CUTE_HOST_DEVICE constexpr
-  decltype(auto)
-  operator()(Coord0 const& c0, Coord1 const& c1, Coords const&... cs) const {
-    return operator()(make_coord(c0,c1,cs...));
-  }
-
-  //
-  // Compose
-  //
-
-  template <class... Layouts>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  compose(Layouts const&... layouts) {
-    return make_tensor(data(), layout().compose(layouts...));
-  }
-
-  template <class... Layouts>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  compose(Layouts const&... layouts) const {
-    return make_tensor(data(), layout().compose(layouts...));
-  }
-
-  //
-  // Tile
-  //
-
-  template <class... Layouts>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  tile(Layouts const&... layouts) {
-    return make_tensor(data(), layout().tile(layouts...));
-  }
-
-  template <class... Layouts>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  tile(Layouts const&... layouts) const {
-    return make_tensor(data(), layout().tile(layouts...));
-  }
-
-  //
-  // Utility
-  //
-
-  template <class Int,
-            __CUTE_REQUIRES(is_integral<Int>::value)>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  get_1d_coord(Int const& linear_idx) const {
-    return layout().get_1d_coord(linear_idx);
-  }
-
-  template <class Int,
-            __CUTE_REQUIRES(is_integral<Int>::value)>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  get_hier_coord(Int const& linear_idx) const {
-    return layout().get_hier_coord(linear_idx);
-  }
-
-  template <class Int,
-            __CUTE_REQUIRES(is_integral<Int>::value)>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  get_flat_coord(Int const& linear_idx) const {
-    return layout().get_flat_coord(linear_idx);
-  }
-
-  cute::tuple<layout_type, engine_type> rep_;
-};
-
-template <class T>
-struct is_tensor : false_type {};
-template <class Engine, class Layout>
-struct is_tensor<Tensor<Engine,Layout>> : true_type {};
-template <class T>
-constexpr bool is_tensor_v = is_tensor<T>::value;
-
-// Customization point for creation of owning and non-owning Tensors
-template <class T>
-struct MakeTensor
-{
-  template <class Arg0, class... Args>
-  CUTE_HOST_DEVICE constexpr auto
-  operator()(Arg0 const& arg0, Args const&... args) const
-  {
-    if constexpr (has_dereference<Arg0>::value) {
-      // Construct a non-owning Tensor
-      using Engine = ViewEngine<Arg0>;
-      if constexpr (sizeof...(Args) == 1 && (is_layout<Args>::value && ...)) {
-        // Forward a Layout
-        return Tensor{Engine{arg0}, args...};
-      } else {
-        // Construct a Layout from Args
-        return Tensor{Engine{arg0}, make_layout(args...)};
-      }
-    } else {
-      // Construct an owning Tensor
-      static_assert((is_static<Arg0>::value && ... && is_static<Args>::value),
-                    "Dynamic owning tensors not supported");
-      if constexpr (sizeof...(Args) == 0 && is_layout<Arg0>::value) {
-        // Forward a Layout
-        using Layout = Arg0;
-        using Engine = ArrayEngine<T, cosize_v<Layout>>;
-        return Tensor<Engine,Layout>();
-      } else {
-        // Construct a Layout from Args
-        using Layout = decltype(make_layout(arg0, args...));
-        using Engine = ArrayEngine<T, cosize_v<Layout>>;
-        return Tensor<Engine,Layout>();
-      }
-    }
-  }
-};
-
-//
-// make_tensor
-//
-
-// Make an owning Tensor that will allocate a static array
-// e.g. make_tensor<float>(Int<12>{})
-template <class T, class... Args>
-CUTE_HOST_DEVICE constexpr
-auto
-make_tensor(Args const&... args)
-{
-  static_assert((not has_dereference<Args>::value && ...), "Expected layout args... in make_tensor<T>(args...)");
-  return MakeTensor<T>{}(args...);
-}
-
-// Make a non-owning Tensor that will use a pointer (view)
-// e.g. make_tensor(vec.data(), 12)
-template <class Iterator, class... Args>
-CUTE_HOST_DEVICE constexpr
-auto
-make_tensor(Iterator const& iter, Args const&... args)
-{
-  static_assert(has_dereference<Iterator>::value, "Expected iterator iter in make_tensor(iter, args...)");
-  static_assert((not has_dereference<Args>::value && ...), "Expected layout args... in make_tensor(iter, args...)");
-  return MakeTensor<Iterator>{}(iter, args...);
-}
-
-//
-// make_tensor_like
-//   Make a register tensor the same type and shape and (if possible) order as another tensor
-//
-
-template <class NewT, class Layout>
-CUTE_HOST_DEVICE constexpr
-auto
-make_tensor_like(Layout const& layout)
-{
-  return make_tensor<NewT>(make_layout_like(layout));
-}
-
-template <class NewT, class Engine, class Layout>
-CUTE_HOST_DEVICE constexpr
-auto
-make_tensor_like(Tensor<Engine,Layout> const& tensor)
-{
-  return make_tensor_like<NewT>(tensor.layout());
-}
-
-template <class Engine, class Layout>
-CUTE_HOST_DEVICE constexpr
-auto
-make_tensor_like(Tensor<Engine,Layout> const& tensor)
-{
-  return make_tensor_like<typename Engine::value_type>(tensor.layout());
-}
-
-//
-// make_fragment_like
-//   Make a tensor the same shape and (if possible) order as another tensor, with special
-//   consideration of the 0th mode. The 0th mode is commonly used for MMA_Atoms or Copy_Atoms
-//   so this allocates the 0th mode with LayoutLeft regardless of the reference layout.
-//
-
-template <class NewT, class Layout>
-CUTE_HOST_DEVICE constexpr
-auto
-make_fragment_like(Layout const& layout)
-{
-  return make_tensor<NewT>(make_fragment_like(layout));
-}
-
-template <class NewT, class Engine, class Layout>
-CUTE_HOST_DEVICE constexpr
-auto
-make_fragment_like(Tensor<Engine,Layout> const& tensor)
-{
-  return make_fragment_like<NewT>(tensor.layout());
-}
-
-template <class Engine, class Layout>
-CUTE_HOST_DEVICE constexpr
-auto
-make_fragment_like(Tensor<Engine,Layout> const& tensor)
-{
-  return make_fragment_like<typename Engine::value_type>(tensor.layout());
-}
-
-//
-// make_counting_tensor
-//   Make a tensor from a layout by binding it to a counting iter with 0-offset of the same profile as the codomain.
-//
-
-template <class Layout, __CUTE_REQUIRES(is_layout<Layout>::value)>
-CUTE_HOST_DEVICE constexpr
-auto
-make_counting_tensor(Layout const& layout)
-{
-  return make_tensor(make_inttuple_iter(repeat_like(coshape(layout), Int<0>{})), layout);
-}
-
-//
-// make_identity_tensor
-//   Make a tensor that maps coordinates within a shape to themselves.
-//
-
-template <class Shape>
-CUTE_HOST_DEVICE constexpr
-auto
-make_identity_tensor(Shape const& shape)
-{
-  return make_counting_tensor(make_identity_layout(shape));
-}
-
-//
-// Utilities
-//
-
-// Return the subtensor of a mode
-template <int... Is, class Tensor>
-CUTE_HOST_DEVICE constexpr
-auto
-tensor(Tensor&& tensor)
-{
-  if constexpr (sizeof...(Is) == 0) {
-    return tensor;
-  } else {
-    return make_tensor(tensor.data(), get<Is...>(tensor.layout()));
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-// Return the layout of a mode
-template <int... Is, class Engine, class Layout>
-CUTE_HOST_DEVICE constexpr
-auto
-layout(Tensor<Engine,Layout> const& tensor)
-{
-  return layout<Is...>(tensor.layout());
-}
-
-// Return the shape of a mode
-template <int... Is, class Engine, class Layout>
-CUTE_HOST_DEVICE constexpr
-auto
-shape(Tensor<Engine,Layout> const& tensor)
-{
-  return shape<Is...>(tensor.layout());
-}
-
-// Return the stride of a mode
-template <int... Is, class Engine, class Layout>
-CUTE_HOST_DEVICE constexpr
-auto
-stride(Tensor<Engine,Layout> const& tensor)
-{
-  return stride<Is...>(tensor.layout());
-}
-
-// Return the number of elements in a mode
-template <int... Is, class Engine, class Layout>
-CUTE_HOST_DEVICE constexpr
-auto
-size(Tensor<Engine,Layout> const& tensor)
-{
-  return size<Is...>(tensor.layout());
-}
-
-// Return the rank of a mode
-template <int... Is, class Engine, class Layout>
-CUTE_HOST_DEVICE constexpr
-auto
-rank(Tensor<Engine,Layout> const& tensor)
-{
-  return rank<Is...>(tensor.layout());
-}
-
-// Return the depth of a mode
-template <int... Is, class Engine, class Layout>
-CUTE_HOST_DEVICE constexpr
-auto
-depth(Tensor<Engine, Layout> const& tensor)
-{
-  return depth<Is...>(tensor.layout());
-}
-
-//
-// Operations to manipulate Tensors like a Layout or IntTuple
-//   These are implemented with explicit modifier overloads because these
-//   methods likely also have a general IntTuple overload that can shadow.
-//
-
-template <class Engine, class Layout>
-CUTE_HOST_DEVICE constexpr
-auto
-flatten(Tensor<Engine,Layout> const& tensor) {
-  return make_tensor(tensor.data(), flatten(tensor.layout()));
-}
-
-template <class Engine, class Layout>
-CUTE_HOST_DEVICE constexpr
-auto
-flatten(Tensor<Engine,Layout>& tensor) {
-  return make_tensor(tensor.data(), flatten(tensor.layout()));
-}
-
-template <class Engine, class Layout>
-CUTE_HOST_DEVICE constexpr
-auto
-flatten(Tensor<Engine,Layout>&& tensor) {
-  return make_tensor(tensor.data(), flatten(tensor.layout()));
-}
-
-template <class Engine, class Layout, class Profile = Int<1>>
-CUTE_HOST_DEVICE constexpr
-auto
-coalesce(Tensor<Engine,Layout> const& tensor, Profile const& profile = {}) {
-  return make_tensor(tensor.data(), coalesce(tensor.layout(), profile));
-}
-
-template <class Engine, class Layout, class Profile = Int<1>>
-CUTE_HOST_DEVICE constexpr
-auto
-coalesce(Tensor<Engine,Layout>& tensor, Profile const& profile = {}) {
-  return make_tensor(tensor.data(), coalesce(tensor.layout(), profile));
-}
-
-template <class Engine, class Layout, class Profile = Int<1>>
-CUTE_HOST_DEVICE constexpr
-auto
-coalesce(Tensor<Engine,Layout>&& tensor, Profile const& profile = {}) {
-  return make_tensor(tensor.data(), coalesce(tensor.layout(), profile));
-}
-
-// Replace the modes in layout that have a 0-stride with a 1-size
-template <class Engine, class Layout>
-CUTE_HOST_DEVICE constexpr
-auto
-filter_zeros(Tensor<Engine,Layout> const& tensor) {
-  return make_tensor(tensor.data(), filter_zeros(tensor.layout()));
-}
-
-template <class Engine, class Layout>
-CUTE_HOST_DEVICE constexpr
-auto
-filter_zeros(Tensor<Engine,Layout>& tensor) {
-  return make_tensor(tensor.data(), filter_zeros(tensor.layout()));
-}
-
-template <class Engine, class Layout>
-CUTE_HOST_DEVICE constexpr
-auto
-filter_zeros(Tensor<Engine,Layout>&& tensor) {
-  return make_tensor(tensor.data(), filter_zeros(tensor.layout()));
-}
-
-template <class Engine, class Layout, class Profile>
-CUTE_HOST_DEVICE constexpr
-auto
-filter_zeros(Tensor<Engine,Layout> const& tensor, Profile const& profile)
-{
-  return make_tensor(tensor.data(), filter_zeros(tensor.layout(), profile));
-}
-
-template <class Engine, class Layout, class Profile>
-CUTE_HOST_DEVICE constexpr
-auto
-filter_zeros(Tensor<Engine,Layout>& tensor, Profile const& profile)
-{
-  return make_tensor(tensor.data(), filter_zeros(tensor.layout(), profile));
-}
-
-template <class Engine, class Layout, class Profile>
-CUTE_HOST_DEVICE constexpr
-auto
-filter_zeros(Tensor<Engine,Layout>&& tensor, Profile const& profile)
-{
-  return make_tensor(tensor.data(), filter_zeros(tensor.layout(), profile));
-}
-
-// Remove all of the 0-strides and 1-sizes
-template <class Engine, class Layout>
-CUTE_HOST_DEVICE constexpr
-auto
-filter(Tensor<Engine,Layout> const& tensor) {
-  return make_tensor(tensor.data(), filter(tensor.layout()));
-}
-
-template <class Engine, class Layout>
-CUTE_HOST_DEVICE constexpr
-auto
-filter(Tensor<Engine,Layout>& tensor) {
-  return make_tensor(tensor.data(), filter(tensor.layout()));
-}
-
-template <class Engine, class Layout>
-CUTE_HOST_DEVICE constexpr
-auto
-filter(Tensor<Engine,Layout>&& tensor) {
-  return make_tensor(tensor.data(), filter(tensor.layout()));
-}
-
-// Group the modes [B,E) into a single mode
-// e.g. group<2,4>(make_tensor<int>(Layout<Shape<_1,_2,_3,_4,_5,_6>>{}))
-//      => make_tensor<int>(Layout<Shape<_1,_2,Shape<_3,_4>,_5,_6>>{})
-template <int B, int E, class Engine, class Layout>
-CUTE_HOST_DEVICE constexpr
-auto
-group_modes(Tensor<Engine,Layout> const& tensor) {
-  return make_tensor(tensor.data(), group<B,E>(tensor.layout()));
-}
-
-template <int B, int E, class Engine, class Layout>
-CUTE_HOST_DEVICE constexpr
-auto
-group_modes(Tensor<Engine,Layout>& tensor) {
-  return make_tensor(tensor.data(), group<B,E>(tensor.layout()));
-}
-
-template <int B, int E, class Engine, class Layout>
-CUTE_HOST_DEVICE constexpr
-auto
-group_modes(Tensor<Engine,Layout>&& tensor) {
-  return make_tensor(tensor.data(), group<B,E>(tensor.layout()));
-}
-
-// Return the subtensor of a range of modes
-template <int B, int E, class Engine, class Layout>
-CUTE_HOST_DEVICE constexpr
-auto
-take(Tensor<Engine,Layout> const& tensor) {
-  return make_tensor(tensor.data(), take<B,E>(tensor.layout()));
-}
-
-template <int B, int E, class Engine, class Layout>
-CUTE_HOST_DEVICE constexpr
-auto
-take(Tensor<Engine,Layout>& tensor) {
-  return make_tensor(tensor.data(), take<B,E>(tensor.layout()));
-}
-
-template <int B, int E, class Engine, class Layout>
-CUTE_HOST_DEVICE constexpr
-auto
-take(Tensor<Engine,Layout>&& tensor) {
-  return make_tensor(tensor.data(), take<B,E>(tensor.layout()));
-}
-
-// Return a tensor with the same shape as input but offset by a given coordinate
-template <class Coord, class Tensor,
-          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
-CUTE_HOST_DEVICE constexpr
-auto
-domain_offset(Coord const& coord, Tensor&& tensor)
-{
-  auto [layout, ptr_offset] = domain_offset(coord, tensor.layout());
-  return make_tensor(static_cast<Tensor&&>(tensor).data() + ptr_offset, layout);
-}
-
-//
-// Recast
-//
-
-// NOTE: This is very dangerous to do
-//   -- doesn't check dynamic integer divisibility
-//   -- doesn't check alignment
-
-template <class NewType, class Tensor>
-CUTE_HOST_DEVICE constexpr
-auto
-recast(Tensor&& tensor)
-{
-  using OldType = typename remove_cvref_t<Tensor>::value_type;
-  auto old_layout = tensor.layout();
-  auto new_layout = recast_layout<OldType,NewType>(old_layout);
-
-  // If this is an upcast of a normal Layout with static negative strides, then offset as well
-  if constexpr (sizeof(OldType) < sizeof(NewType) && not is_composed_layout<decltype(old_layout)>::value) {
-    auto shape_diff = transform(flatten(old_layout.shape()), flatten(new_layout.shape()), minus{});
-    auto extent_diff = transform(shape_diff, flatten(old_layout.stride()), multiplies{});
-    auto offset = fold(extent_diff, Int<0>{}, [](auto const& i, auto const& a) { return i + cute::min(a,Int<0>{}); });
-
-    return make_tensor(recast_ptr<NewType>(static_cast<Tensor&&>(tensor).data() + offset), new_layout);
-  } else {
-    return make_tensor(recast_ptr<NewType>(static_cast<Tensor&&>(tensor).data()         ), new_layout);
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-//
-// max_common_vector
-//
-
-/* Return Int<N> such that N is the maximum number of contiguous elements
- * that logically correspond in the tensors of @a a and @a b. This is,
- * the number of elements that could reasonably be vectorized into a single load/store.
- *
- * @returns Int<N> with N >= 0
- *
- * A return value of Int<0> indicates that no such conclusion can be made and no
- * vectorization should be attempted.
- *
- * Note that the return value does NOT include alignment concerns such as the pointer value and
- * the divisbility of dynamic strides.
- */
-template <class SrcEngine, class SrcLayout,
-          class DstEngine, class DstLayout>
-CUTE_HOST_DEVICE constexpr
-auto
-max_common_vector(Tensor<SrcEngine,SrcLayout> const& a,
-                  Tensor<DstEngine,DstLayout> const& b)
-{
-  using SrcType = typename SrcEngine::value_type;
-  using SrcRef  = typename SrcEngine::reference;
-  using DstType = typename DstEngine::value_type;
-  using DstRef  = typename DstEngine::reference;
-
-  // Determine if vectorization candidates at all
-  if constexpr (// Should be the same value_types, else the copy is also performing a cast
-                cute::is_same<SrcType, DstType>::value &&
-                // The types should be trivially copyable so that vectorization is valid
-                is_trivially_copyable<SrcType>::value &&
-                is_trivially_copyable<DstType>::value &&
-                // Should be load/storing real data, rather than implicit iterators or such
-                is_reference<SrcRef>::value &&
-                is_reference<DstRef>::value)
-  {
-    return max_common_vector(a.layout(), b.layout());
-  } else {
-    return Int<0>{};
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-/* Return a layout that points to the maximum number of contiguous elements
- * that logically correspond in the tensors of @a a and @a b. This is,
- * the elements that could reasonably be "vectorized" into a single load/store.
- *
- * @returns Layout R such that composition(a.layout(), R) and composition(b.layout(), R)
- *          are both identity Layouts.
- *
- * Note that the returned layout does NOT include alignment concerns such as the pointer value and
- * the divisbility of dynamic strides.
- */
-template <class SrcEngine, class SrcLayout,
-          class DstEngine, class DstLayout>
-CUTE_HOST_DEVICE constexpr
-auto
-max_common_layout(Tensor<SrcEngine,SrcLayout> const& a,
-                  Tensor<DstEngine,DstLayout> const& b)
-{
-  using SrcType = typename SrcEngine::value_type;
-  using SrcRef  = typename SrcEngine::reference;
-  using DstType = typename DstEngine::value_type;
-  using DstRef  = typename DstEngine::reference;
-
-  // Determine if vectorization candidates at all
-  if constexpr (// Should be the same value_types, else the copy is also performing a cast
-                cute::is_same<SrcType, DstType>::value &&
-                // The types should be trivially copyable so that vectorization is valid
-                is_trivially_copyable<SrcType>::value &&
-                is_trivially_copyable<DstType>::value &&
-                // Should be load/storing real data, rather than implicit iterators or such
-                is_reference<SrcRef>::value &&
-                is_reference<DstRef>::value)
-  {
-    return max_common_layout(a.layout(), b.layout());
-  } else {
-    return Layout<_1,_0>{};
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-//
-// Key algebraic operations -- Composition, Divide, and Product
-//
-
-// Apply a Tiler to the Tensor via composition.
-template <class Tensor, class Tiler,
-          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
-CUTE_HOST_DEVICE constexpr
-auto
-composition(Tensor    && tensor,
-            Tiler const& tiler)   // Layout or Tile<Layout...> or Shape
-{
-  return make_tensor(static_cast<Tensor&&>(tensor).data(),
-                     composition(tensor.layout(), tiler));
-}
-
-// Apply a Tiler to the Tensor.
-//
-// Consider a Tensor with shape (A,B,x,y)
-// And a Tiler that is:
-//
-// * A Layout with shape (BLK_A,BLK_B)
-// ** Result Tensor shape ((BLK_A,BLK_B),Rest).
-// ** That is, the Tensor and Tile are treated as 1D for the tiling.
-// ** See logical_divide(Layout,Layout)
-//
-// * A Tile<Layout...> with shape <BLK_A,BLK_B>
-// ** Result Tensor shape ((BLK_A,a),(BLK_B,b),x,y).
-// ** Each mode of the Tile<Layout...> is applied to the corresponding mode of the Tensor.
-// ** See logical_divide(Layout,Tuple)
-//
-// * A Shape (BLK_A,BLK_B)
-// ** Result Tensor shape ((BLK_A,a),(BLK_B,b),x,y).
-// ** Equivalent to applying Tile<BLK_A:_1,BLK_B:_1>.
-// ** See logical_divide(Layout,Tuple) and logical_divide(Layout,Int)
-//
-// Note that the Tile<Layout...>/Shape Tilers must be weakly_congruent to the Tensor
-template <class Tensor, class Tiler,
-          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
-CUTE_HOST_DEVICE constexpr
-auto
-logical_divide(Tensor    && tensor,
-               Tiler const& tiler)   // Layout or Tile<Layout...> or Shape
-{
-  return make_tensor(static_cast<Tensor&&>(tensor).data(),
-                     logical_divide(tensor.layout(), tiler));
-}
-
-// zipped_divide is logical_divide with Tiler modes and Rest modes gathered together: (Tiler,Rest)
-// When Tiler is Layout, this has no effect as logical_divide results in the same.
-// When Tiler is Tile<Layout...> or Shape, this zips modes into standard form ((BLK_A,BLK_B),(a,b,x,y))
-template <class Tensor, class Tiler,
-          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
-CUTE_HOST_DEVICE constexpr
-auto
-zipped_divide(Tensor    && tensor,
-              Tiler const& tiler)    // Layout or Tile<Layout...> or Shape
-{
-  return make_tensor(static_cast<Tensor&&>(tensor).data(),
-                     zipped_divide(tensor.layout(), tiler));
-}
-
-// tiled_divide is zipped_divide with the second output mode flattened ((BLK_A,BLK_B),a,b,x,y)
-template <class Tensor, class Tiler,
-          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
-CUTE_HOST_DEVICE constexpr
-auto
-tiled_divide(Tensor    && tensor,
-             Tiler const& tiler)     // Layout or Tile<Layout...> or Shape
-{
-  return make_tensor(static_cast<Tensor&&>(tensor).data(),
-                     tiled_divide(tensor.layout(), tiler));
-}
-
-// flat_divide is zipped_divide with the both modes flattened (BLK_A,BLK_B,a,b,x,y)
-template <class Tensor, class Tiler,
-          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
-CUTE_HOST_DEVICE constexpr
-auto
-flat_divide(Tensor    && tensor,
-            Tiler const& tiler)      // Layout or Tile<Layout...> or Shape
-{
-  return make_tensor(static_cast<Tensor&&>(tensor).data(),
-                     flat_divide(tensor.layout(), tiler));
-}
-
-// logical_product on a Tensor doesn't make sense since it often increases cosize
-//   though this might make sense for creating Tensors with broadcasted (stride-0) modes
-
-//
-// Tensor partitioning utilities
-//
-
-// Apply a Tiler to the Tensor, then slice out one of those tiles by slicing into the "Rest" modes.
-// With an inner_partition, you get everything that's inside the Tiler. Everything that the Tiler is pointing to.
-// Split the modes of tensor according to the Tiler
-//   zipped_divide returns something like ((BLK_A,BLK_B,...),(a,b,...,x,y))
-// Then slice into the second mode (the "Rest" mode) with Coord
-template <class Tensor, class Tiler, class Coord,
-          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
-CUTE_HOST_DEVICE constexpr
-auto
-inner_partition(Tensor    && tensor,
-                Tiler const& tiler,
-                Coord const& coord)
-{
-  auto tensor_tiled = zipped_divide(static_cast<Tensor&&>(tensor), tiler);
-  constexpr int R0 = decltype(rank<0>(tensor_tiled))::value;
-
-  // The coord slices into the second mode (the "rest" mode), flatten the first
-  if constexpr (is_tuple<Coord>::value) {
-    // Append trailing modes if coord is tuple
-    constexpr int R1 = decltype(rank<1>(tensor_tiled))::value;
-    return tensor_tiled(repeat<R0>(_), append<R1>(coord,_));
-  } else {
-    // Flat indexing if coord is not tuple
-    return tensor_tiled(repeat<R0>(_), coord);
-  }
-}
-
-// Apply a Tiler to the Tensor, then slice out the remainder by slicing into the "Tile" modes.
-// With an outer_partition, you get everything that's outside the Tiler. The layout of the Tile in the Tensor.
-// Split the modes of tensor according to the Tiler
-//   zipped_divide returns something like ((BLK_A,BLK_B,...),(a,b,...,x,y))
-// Then slice into the first mode (the "Tile" mode) with Coord
-template <class Tensor, class Tiler, class Coord,
-          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
-CUTE_HOST_DEVICE constexpr
-auto
-outer_partition(Tensor    && tensor,
-                Tiler const& tiler,
-                Coord const& coord)
-{
-  auto tensor_tiled = zipped_divide(static_cast<Tensor&&>(tensor), tiler);
-  constexpr int R1 = decltype(rank<1>(tensor_tiled))::value;
-
-  // The coord slices into the first mode (the "tile" mode), flatten the second
-  if constexpr (is_tuple<Coord>::value) {
-    // Append trailing modes if coord is tuple
-    constexpr int R0 = decltype(rank<0>(tensor_tiled))::value;
-    return tensor_tiled(append<R0>(coord,_), repeat<R1>(_));
-  } else {
-    // Flat indexing if coord is not tuple
-    return tensor_tiled(coord, repeat<R1>(_));
-  }
-}
-
-// Tile a tensor according to @a tiler and use @a coord to index into the remainder, keeping the tile.
-// This is typical at the CTA level where tiles of data are extracted:
-//   Tensor data = ...                                                                         // (  M,  N)
-//   Tensor cta_data = local_tile(data, Shape<_32,_64>{}, make_coord(blockIdx.x,blockIdx.y));  // (_32,_64)
-template <class Tensor, class Tiler, class Coord,
-          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
-CUTE_HOST_DEVICE constexpr
-auto
-local_tile(Tensor    && tensor,
-           Tiler const& tiler,   // tiler to apply
-           Coord const& coord)   // coord to slice into "remainder"
-{
-  return inner_partition(static_cast<Tensor&&>(tensor),
-                         tiler,
-                         coord);
-}
-
-// Same as above, but with a projection parameter to strip out unwanted tiling modes for convenience
-//   when using projections of the same tiler.
-// This is typical at the CTA level where tiles of data are extracted as projections:
-//   Tensor dataA = ...                                                        // (M,K)
-//   Tensor dataB = ...                                                        // (N,K)
-//   Tensor dataC = ...                                                        // (M,N)
-//   auto cta_tiler = Shape<_32, _64, _4>{};
-//   auto cta_coord = make_coord(blockIdx.x, blockIdx.y, _);
-//   Tensor ctaA = local_tile(dataA, cta_tiler, cta_coord, Step<_1, X,_1>{});  // (_32,_4,k)
-//   Tensor ctaB = local_tile(dataA, cta_tiler, cta_coord, Step< X,_1,_1>{});  // (_64,_4,k)
-//   Tensor ctaC = local_tile(dataA, cta_tiler, cta_coord, Step<_1,_1, X>{});  // (_32,_64)
-template <class Tensor, class Tiler, class Coord, class Proj,
-          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
-CUTE_HOST_DEVICE
-auto
-local_tile(Tensor    && tensor,
-           Tiler const& tiler,   // tiler to apply
-           Coord const& coord,   // coord to slice into "remainder"
-           Proj  const& proj)    // projection to apply to tiler and coord
-{
-  return local_tile(static_cast<Tensor&&>(tensor),
-                    dice(proj, tiler),
-                    dice(proj, coord));
-}
-
-// Tile a tensor according to the flat shape of a layout that provides the coordinate of the target index.
-// This is typical at the Thread level where data is partitioned across repeated patterns of threads:
-//   Tensor data = ...                                                            // (_16,_64)
-//   Tensor thr_data = local_partition(data, Layout<Shape<_2,_16>>{}, thr_idx);   // ( _8, _4)
-template <class Tensor, class LShape, class LStride, class Index,
-          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
-CUTE_HOST_DEVICE
-auto
-local_partition(Tensor                     && tensor,
-                Layout<LShape,LStride> const& tile,    // coord -> index
-                Index                  const& index)   // index to slice for
-{
-  static_assert(is_integral<Index>::value);
-  return outer_partition(static_cast<Tensor&&>(tensor),
-                         product_each(shape(tile)),
-                         tile.get_flat_coord(index));
-}
-
-// Same as above, but with a projection parameter to strip out unwanted tiling modes for convenience
-//   when using projections of the same tiler.
-// This is typical at the Thread level where data is partitioned across projected layouts of threads:
-//   Tensor dataA = ...                                                            // (M,K)
-//   Tensor dataB = ...                                                            // (N,K)
-//   Tensor dataC = ...                                                            // (M,N)
-//   auto thr_layout = Layout<Shape<_2,_16,_1>, Stride<_16,_1,_0>>{};
-//   Tensor thrA = local_partition(dataA, thr_layout, thr_idx, Step<_1, X,_1>{});  // (M/2,K/1)
-//   Tensor thrB = local_partition(dataB, thr_layout, thr_idx, Step< X,_1,_1>{});  // (N/16,K/1)
-//   Tensor thrC = local_partition(dataC, thr_layout, thr_idx, Step<_1,_1, X>{});  // (M/2,N/16)
-template <class Tensor, class LShape, class LStride, class Index, class Projection,
-          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
-CUTE_HOST_DEVICE
-auto
-local_partition(Tensor                     && tensor,
-                Layout<LShape,LStride> const& tile,   // coord -> index
-                Index                  const& index,  // index to slice for
-                Projection             const& proj)
-{
-  return local_partition(static_cast<Tensor&&>(tensor),
-                         dice(proj, tile),
-                         index);
-}
-
-//
-// Display utilities
-//
-
-template <class Engine, class Layout>
-CUTE_HOST_DEVICE void print(Tensor<Engine,Layout> const& tensor)
-{
-  print(tensor.data()); print(" o "); print(tensor.layout());
-}
-
-template <class Engine, class Layout>
-CUTE_HOST_DEVICE void print_tensor(Tensor<Engine,Layout> const& tensor, bool print_type = true)
-{
-  if (print_type) {
-    print(tensor); print(":\n");
-  }
-
-  if constexpr (Layout::rank == 1)
-  {
-    for (int m = 0; m < size(tensor); ++m) {
-      pretty_print(tensor(m));
-      printf("\n");
-    }
-  } else
-  if constexpr (Layout::rank == 2)
-  {
-    for (int m = 0; m < size<0>(tensor); ++m) {
-      for (int n = 0; n < size<1>(tensor); ++n) {
-        pretty_print(tensor(m,n));
-      }
-      printf("\n");
-    }
-  } else
-  if constexpr (Layout::rank == 3)
-  {
-    print_tensor(tensor(_,_,0), false);
-    for (int k = 1; k < size<2>(tensor); ++k) {
-      for (int i = 0; i < 5*size<1>(tensor); ++i) { print("-"); } print("\n");
-      print_tensor(tensor(_,_,k), false);
-    }
-  } else
-  if constexpr (Layout::rank == 4)
-  {
-    print_tensor(tensor(_,_,_,0), false);
-    for (int p = 1; p < size<3>(tensor); ++p) {
-      for (int i = 0; i < 5*size<1>(tensor); ++i) { print("="); } print("\n");
-      print_tensor(tensor(_,_,_,p), false);
-    }
-  }
-}
-
-#if !defined(__CUDACC_RTC__)
-template <class Engine, class Layout>
-CUTE_HOST std::ostream& print_tensor_os(std::ostream& os, Tensor<Engine,Layout> const& tensor)
-{
-  int digits = 9;
-
-  if constexpr (Layout::rank == 1)
-  {
-    for (int m = 0; m < size(tensor); ++m) {
-      os << std::setw(digits) << tensor(m) << std::endl;
-    }
-  } else
-  if constexpr (Layout::rank == 2)
-  {
-    for (int m = 0; m < size<0>(tensor); ++m) {
-      for (int n = 0; n < size<1>(tensor); ++n) {
-        os << std::setw(digits) << tensor(m,n);
-      }
-      os << std::endl;
-    }
-  } else
-  if constexpr (Layout::rank == 3)
-  {
-    print_tensor_os(os, tensor(_,_,0));
-    for (int k = 1; k < size<2>(tensor); ++k) {
-      for (int i = 0; i < digits*size<1>(tensor); ++i) { os << "-"; } os << std::endl;
-      print_tensor_os(os, tensor(_,_,k));
-    }
-  } else
-  if constexpr (Layout::rank == 4)
-  {
-    print_tensor_os(os, tensor(_,_,_,0));
-    for (int p = 1; p < size<3>(tensor); ++p) {
-      for (int i = 0; i < digits*size<1>(tensor); ++i) { os << "="; } os << std::endl;
-      print_tensor_os(os, tensor(_,_,_,p));
-    }
-  }
-
-  return os;
-}
-
-template <class Engine, class Layout>
-CUTE_HOST std::ostream& operator<<(std::ostream& os, Tensor<Engine,Layout> const& tensor)
-{
-  os << tensor.layout() << std::endl;
-  return print_tensor_os(os, tensor);
-}
-#endif // !defined(__CUDACC_RTC__)
-
-} // end namespace cute
-
diff --git a/lightllm-kernel/cutlass/include/cute/tensor_predicate.hpp b/lightllm-kernel/cutlass/include/cute/tensor_predicate.hpp
deleted file mode 100755
index 9c8a2ba61..000000000
--- a/lightllm-kernel/cutlass/include/cute/tensor_predicate.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>                    // CUTE_HOST_DEVICE
-#include <cute/numeric/integral_constant.hpp> // cute::true_type
-
-namespace cute
-{
-
-template <class T>
-struct ConstantTensor
-{
-  template <class... Coords>
-  CUTE_HOST_DEVICE constexpr
-  T const&
-  operator()(Coords const&...) const {
-    return val_;
-  }
-
-  T val_;
-};
-
-struct TrivialPredTensor
-{
-  template <class... Coords>
-  CUTE_HOST_DEVICE constexpr
-  true_type
-  operator()(Coords const&...) const {
-    return {};
-  }
-};
-
-template <class Fn>
-struct FunctionPredTensor
-{
-  CUTE_HOST_DEVICE constexpr
-  FunctionPredTensor(Fn const& fn) : fn_(fn) {}
-
-  template <class... Coords>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  operator()(Coords const&... coords) const {
-    return fn_(coords...);
-  }
-
-  Fn const& fn_;
-};
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/tensor_zip.hpp b/lightllm-kernel/cutlass/include/cute/tensor_zip.hpp
deleted file mode 100755
index 6d70ffc84..000000000
--- a/lightllm-kernel/cutlass/include/cute/tensor_zip.hpp
+++ /dev/null
@@ -1,243 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#pragma once
-
-#include <cute/config.hpp>           // CUTE_HOST_DEVICE
-#include <cute/tensor_impl.hpp>      // cute::Tensor
-#include <cute/container/tuple.hpp>  // cute::tuple
-
-namespace cute
-{
-
-// A tuple of Iterators that can be offset asymmetrically
-// Note that this only accepts op+(tuple<Index...>) and op[tuple<Index...>]
-//   where each iterator will be offset by its respective index only.
-// READ-ONLY for now until cute::tuple can be constructed with references.
-template <class... Iters>
-struct ZipIterator
-{
-  using value_type   = cute::tuple<iter_value_t<Iters>...>;
-  using element_type = cute::tuple<iter_element_t<Iters>...>;
-  // NOTE: cute::tuple does not support constructions with references at the moment.
-  //       Consider fixes and/or an implementation of std::forward_as_tuple.
-  //       For now, use a cute::tuple of value_types instead, which makes this Iterator READ-ONLY.
-  //using reference    = cute::tuple<iter_reference_t<Iters>...>;
-  using reference  = value_type;
-
-  ZipIterator() = delete;
-
-  CUTE_HOST_DEVICE constexpr
-  ZipIterator(Iters... iters)
-    : iters_(iters...)
-  {}
-
-  CUTE_HOST_DEVICE constexpr
-  ZipIterator(cute::tuple<Iters...> const& iters)
-    : iters_(iters)
-  {}
-
-  CUTE_HOST_DEVICE constexpr
-  reference operator*() const {
-    return cute::apply(iters_, [](auto&&... args) { return reference(*args...); });
-  }
-
-  template <class... Index>
-  CUTE_HOST_DEVICE constexpr
-  ZipIterator operator+(cute::tuple<Index...> const& idxs) const {
-    static_assert(sizeof...(Index) == sizeof...(Iters), "Expect same number of offsets as iterators.");
-    return cute::transform(iters_, idxs, [](auto&& iter, auto&& idx) { return iter + idx; });
-  }
-
-  template <class... Index>
-  CUTE_HOST_DEVICE constexpr
-  reference operator[](cute::tuple<Index...> const& idxs) const {
-    return *(*this + idxs);
-  }
-
-  cute::tuple<Iters...> iters_;
-};
-
-//------------------------------------------------------------------------------
-// type traits
-
-template <class... Iters>
-struct is_rmem<ZipIterator<Iters...>> : conjunction<is_rmem<Iters>...> {};
-template <class... Iters>
-struct is_smem<ZipIterator<Iters...>> : conjunction<is_smem<Iters>...> {};
-template <class... Iters>
-struct is_gmem<ZipIterator<Iters...>> : conjunction<is_gmem<Iters>...> {};
-// A tuple of Layouts that operates on each Layout symmetrically
-// The Layouts need to have compatible shapes and ranks.
-// The ZipLayout presents the intersection of the domain of its component Layouts.
-//   E.g. all Layouts accept 1D coords and ZipLayout does as well.
-// The ZipLayout returns the union of the codomain of its component Layouts.
-//   E.g. all Layouts return an integer so ZipLayout returns a tuple of integers.
-template <class... Layouts>
-struct ZipLayout
-{
-  static constexpr int rank = (int(0) | ... | Layouts::rank);
-
-  static_assert((is_layout<Layouts>::value && ...), "All template parameters must be layouts");
-  static_assert(((Layouts::rank == rank) && ...),   "All layouts must have the same rank");
-
-  CUTE_HOST_DEVICE constexpr
-  ZipLayout(Layouts const&... layouts)
-    : layouts_(layouts...)
-  {}
-
-  CUTE_HOST_DEVICE constexpr
-  ZipLayout(cute::tuple<Layouts...> const& layouts)
-    : layouts_(layouts)
-  {}
-
-  template <class Coord>
-  CUTE_HOST_DEVICE constexpr
-  auto
-  operator()(Coord const& coord) const {
-    if constexpr (has_underscore<Coord>::value) {
-      return ZipLayout(cute::transform(layouts_, [&] (auto layout) { return layout(coord); }));
-    } else {
-      return cute::transform(layouts_, [&] (auto layout) { return layout(coord); });
-    }
-
-    CUTE_GCC_UNREACHABLE;
-  }
-
-  // op() convenience function for multi-dimensional coordinates
-  template <class Coord0, class Coord1, class... Coords>
-  CUTE_HOST_DEVICE constexpr
-  decltype(auto)
-  operator()(Coord0 const& c0, Coord1 const& c1, Coords const&... cs) const {
-    return operator()(make_coord(c0,c1,cs...));
-  }
-
-  cute::tuple<Layouts...> layouts_;
-};
-
-template <class... Layouts>
-struct is_layout<ZipLayout<Layouts...>> : true_type {};
-
-//
-// make_zip_tensor and unzip_tensor
-//
-
-template <class... Engines, class... Layouts>
-CUTE_HOST_DEVICE constexpr
-auto
-make_zip_tensor(Tensor<Engines,Layouts> const&... tensors)
-{
-  return make_tensor(ZipIterator(tensors.data()...),
-                     ZipLayout(tensors.layout()...));
-}
-
-template <class Engine, class Layout>
-CUTE_HOST_DEVICE constexpr
-auto
-unzip_tensor(Tensor<Engine,Layout> const& tensor)
-{
-  return cute::transform(tensor.data().iters_, tensor.layout().layouts_,
-                         [](auto iter, auto layout) { return make_tensor(iter, layout); });
-}
-
-//
-// Utilities
-//
-
-template <int... Is, class... Layouts>
-CUTE_HOST_DEVICE constexpr
-auto
-rank(ZipLayout<Layouts...> const& layouts)
-{
-  return rank<Is...>(get<0>(layouts.layouts_));
-}
-
-template <int... Is, class... Layouts>
-CUTE_HOST_DEVICE constexpr
-auto
-size(ZipLayout<Layouts...> const& layouts)
-{
-  return size<Is...>(get<0>(layouts.layouts_));
-}
-
-//
-// Manipulation
-//
-
-// Extend each component layout to rank-N by appending Layout @a x.
-template <int N, class... Layouts, class ShapeX = _1, class StrideX = _0>
-CUTE_HOST_DEVICE constexpr
-auto
-append(ZipLayout<Layouts...>  const& layouts,
-       Layout<ShapeX,StrideX> const& x = {})
-{
-  return ZipLayout(cute::transform(layouts.layouts_, [&](auto t){ return append<N>(t, x); }));
-}
-
-// Extend each component layout to rank-N by prepending Layout @a x.
-template <int N, class... Layouts, class ShapeX = _1, class StrideX = _0>
-CUTE_HOST_DEVICE constexpr
-auto
-prepend(ZipLayout<Layouts...>  const& layouts,
-        Layout<ShapeX,StrideX> const& x = {})
-{
-  return ZipLayout(cute::transform(layouts.layouts_, [&](auto t){ return prepend<N>(t, x); }));
-}
-
-template <class... Layouts, class Tiler>
-CUTE_HOST_DEVICE constexpr
-auto
-logical_divide(ZipLayout<Layouts...> const& layouts,
-               Tiler                 const& tiler)
-{
-  return ZipLayout(cute::transform(layouts.layouts_, [&](auto t){ return logical_divide(t, tiler); }));
-}
-
-template <class... Layouts, class Tiler>
-CUTE_HOST_DEVICE constexpr
-auto
-zipped_divide(ZipLayout<Layouts...> const& layouts,
-              Tiler                 const& tiler)
-{
-  return ZipLayout(cute::transform(layouts.layouts_, [&](auto t){ return zipped_divide(t, tiler); }));
-}
-
-// Return <SlicedZipLayout, ZipOffsets> by calling slice_and_offset and all component layouts.
-template <class Coord, class... Layouts>
-CUTE_HOST_DEVICE constexpr
-auto
-slice_and_offset(Coord const& c, ZipLayout<Layouts...> const& layouts)
-{
-  auto result = cute::zip(cute::transform(layouts.layouts_, [&c](auto const& layout) { return slice_and_offset(c, layout); }));
-  return cute::make_tuple(ZipLayout(get<0>(result)), get<1>(result));
-}
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/underscore.hpp b/lightllm-kernel/cutlass/include/cute/underscore.hpp
deleted file mode 100755
index e9d80fe5b..000000000
--- a/lightllm-kernel/cutlass/include/cute/underscore.hpp
+++ /dev/null
@@ -1,194 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>                     // CUTE_INLINE_CONSTANT, CUTE_HOST_DEVICE
-#include <cute/container/tuple.hpp>            // cute::is_tuple
-#include <cute/numeric/integral_constant.hpp>  // cute::false_type, cute::true_type
-
-namespace cute
-{
-
-// For slicing
-struct Underscore : Int<0> {};
-
-CUTE_INLINE_CONSTANT Underscore _;
-
-// Convenient alias
-using X = Underscore;
-
-// Treat Underscore as an integral like integral_constant
-template <>
-struct is_integral<Underscore> : true_type {};
-
-template <class T>
-struct is_underscore : false_type {};
-template <>
-struct is_underscore<Underscore> : true_type {};
-
-// Tuple trait for detecting static member element
-template <class Tuple, class Elem, class Enable = void>
-struct has_elem : false_type {};
-template <class Elem>
-struct has_elem<Elem, Elem> : true_type {};
-template <class Tuple, class Elem>
-struct has_elem<Tuple, Elem, enable_if_t<is_tuple<Tuple>::value> >
-    : has_elem<Tuple, Elem, tuple_seq<Tuple> > {};
-template <class Tuple, class Elem, int... Is>
-struct has_elem<Tuple, Elem, seq<Is...>>
-    : disjunction<has_elem<tuple_element_t<Is, Tuple>, Elem>...> {};
-
-// Tuple trait for detecting static member element
-template <class Tuple, class Elem, class Enable = void>
-struct all_elem : false_type {};
-template <class Elem>
-struct all_elem<Elem, Elem> : true_type {};
-template <class Tuple, class Elem>
-struct all_elem<Tuple, Elem, enable_if_t<is_tuple<Tuple>::value> >
-    : all_elem<Tuple, Elem, tuple_seq<Tuple> > {};
-template <class Tuple, class Elem, int... Is>
-struct all_elem<Tuple, Elem, seq<Is...>>
-    : conjunction<all_elem<tuple_element_t<Is, Tuple>, Elem>...> {};
-
-// Tuple trait for detecting Underscore member
-template <class Tuple>
-using has_underscore = has_elem<Tuple, Underscore>;
-
-template <class Tuple>
-using all_underscore = all_elem<Tuple, Underscore>;
-
-template <class Tuple>
-using has_int1 = has_elem<Tuple, Int<1>>;
-
-template <class Tuple>
-using has_int0 = has_elem<Tuple, Int<0>>;
-
-//
-// Slice keeps only the elements of Tuple B that are paired with an Underscore
-//
-
-namespace detail {
-
-template <class A, class B>
-CUTE_HOST_DEVICE constexpr
-auto
-lift_slice(A const& a, B const& b)
-{
-  if constexpr (is_tuple<A>::value) {
-    static_assert(tuple_size<A>::value == tuple_size<B>::value, "Mismatched Ranks");
-    return filter_tuple(a, b, [](auto const& x, auto const& y) { return lift_slice(x,y); });
-  } else if constexpr (is_underscore<A>::value) {
-    return cute::tuple<B>{b};
-  } else {
-    return cute::tuple<>{};
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-} // end namespace detail
-
-// Entry point overrides the lifting so that slice(_,b) == b
-template <class A, class B>
-CUTE_HOST_DEVICE constexpr
-auto
-slice(A const& a, B const& b)
-{
-  if constexpr (is_tuple<A>::value) {
-    static_assert(tuple_size<A>::value == tuple_size<B>::value, "Mismatched Ranks");
-    return filter_tuple(a, b, [](auto const& x, auto const& y) { return detail::lift_slice(x,y); });
-  } else if constexpr (is_underscore<A>::value) {
-    return b;
-  } else {
-    return cute::tuple<>{};
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-//
-// Dice keeps only the elements of Tuple B that are paired with an Int
-//
-
-namespace detail {
-
-template <class A, class B>
-CUTE_HOST_DEVICE constexpr
-auto
-lift_dice(A const& a, B const& b)
-{
-  if constexpr (is_tuple<A>::value) {
-    static_assert(tuple_size<A>::value == tuple_size<B>::value, "Mismatched Ranks");
-    return filter_tuple(a, b, [](auto const& x, auto const& y) { return lift_dice(x,y); });
-  } else if constexpr (is_underscore<A>::value) {
-    return cute::tuple<>{};
-  } else {
-    return cute::tuple<B>{b};
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-} // end namespace detail
-
-// Entry point overrides the lifting so that dice(1,b) == b
-template <class A, class B>
-CUTE_HOST_DEVICE constexpr
-auto
-dice(A const& a, B const& b)
-{
-  if constexpr (is_tuple<A>::value) {
-    static_assert(tuple_size<A>::value == tuple_size<B>::value, "Mismatched Ranks");
-    return filter_tuple(a, b, [](auto const& x, auto const& y) { return detail::lift_dice(x,y); });
-  } else if constexpr (is_underscore<A>::value) {
-    return cute::tuple<>{};
-  } else {
-    return b;
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-//
-// Display utilities
-//
-
-CUTE_HOST_DEVICE void print(Underscore const&) {
-  printf("_");
-}
-
-#if !defined(__CUDACC_RTC__)
-CUTE_HOST std::ostream& operator<<(std::ostream& os, Underscore const&) {
-  return os << "_";
-}
-#endif
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/util/debug.hpp b/lightllm-kernel/cutlass/include/cute/util/debug.hpp
deleted file mode 100755
index 86da7cae9..000000000
--- a/lightllm-kernel/cutlass/include/cute/util/debug.hpp
+++ /dev/null
@@ -1,164 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-/**
- * \file
- * \brief Debugging and logging functionality
- */
-
-#include <cuda_runtime_api.h>
-
-#include <cute/config.hpp>
-
-namespace cute
-{
-
-/******************************************************************************
- * Debug and logging macros
- ******************************************************************************/
-
-/**
- * Formats and prints the given message to stdout
- */
-#if !defined(CUTE_LOG)
-#  if !defined(__CUDA_ARCH__)
-#    define CUTE_LOG(format, ...) printf(format, __VA_ARGS__)
-#  else
-#    define CUTE_LOG(format, ...)                                \
-        printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, \
-               blockIdx.x,  blockIdx.y,  blockIdx.z,             \
-               threadIdx.x, threadIdx.y, threadIdx.z,            \
-               __VA_ARGS__);
-#  endif
-#endif
-
-/**
- * Formats and prints the given message to stdout only if DEBUG is defined
- */
-#if !defined(CUTE_LOG_DEBUG)
-#  ifdef DEBUG
-#    define CUTE_LOG_DEBUG(format, ...) CUTE_LOG(format, __VA_ARGS__)
-#  else
-#    define CUTE_LOG_DEBUG(format, ...)
-#  endif
-#endif
-
-/**
- * \brief Perror macro with exit
- */
-#if !defined(CUTE_ERROR_EXIT)
-#  define CUTE_ERROR_EXIT(e)                                         \
-      do {                                                           \
-        cudaError_t code = (e);                                      \
-        if (code != cudaSuccess) {                                   \
-          fprintf(stderr, "<%s:%d> %s:\n    %s: %s\n",               \
-                  __FILE__, __LINE__, #e,                            \
-                  cudaGetErrorName(code), cudaGetErrorString(code)); \
-          fflush(stderr);                                            \
-          exit(1);                                                   \
-        }                                                            \
-      } while (0)
-#endif
-
-#if !defined(CUTE_CHECK_LAST)
-#  define CUTE_CHECK_LAST() CUTE_ERROR_EXIT(cudaPeekAtLastError()); CUTE_ERROR_EXIT(cudaDeviceSynchronize())
-#endif
-
-#if !defined(CUTE_CHECK_ERROR)
-#  define CUTE_CHECK_ERROR(e) CUTE_ERROR_EXIT(e)
-#endif
-
-// A dummy function that uses compilation failure to print a type
-template <class... T>
-CUTE_HOST_DEVICE void
-print_type() {
-  static_assert(sizeof...(T) < 0, "Printing type T.");
-}
-
-template <class... T>
-CUTE_HOST_DEVICE void
-print_type(T&&...) {
-  static_assert(sizeof...(T) < 0, "Printing type T.");
-}
-
-//
-// Device-specific helpers
-//
-// e.g.
-// if (thread0()) print(...);
-// if (block0()) print(...);
-// if (thread(42)) print(...);
-
-CUTE_HOST_DEVICE
-bool
-block([[maybe_unused]] int bid)
-{
-#if defined(__CUDA_ARCH__)
-  return blockIdx.x + blockIdx.y*gridDim.x + blockIdx.z*gridDim.x*gridDim.y == bid;
-#else
-  return true;
-#endif
-}
-
-CUTE_HOST_DEVICE
-bool
-thread([[maybe_unused]] int tid, [[maybe_unused]] int bid)
-{
-#if defined(__CUDA_ARCH__)
-  return (threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.x*blockDim.y == tid) && block(bid);
-#else
-  return true;
-#endif
-}
-
-CUTE_HOST_DEVICE
-bool
-thread(int tid)
-{
-  return thread(tid,0);
-}
-
-CUTE_HOST_DEVICE
-bool
-thread0()
-{
-  return thread(0,0);
-}
-
-CUTE_HOST_DEVICE
-bool
-block0()
-{
-  return block(0);
-}
-
-}  // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/util/print.hpp b/lightllm-kernel/cutlass/include/cute/util/print.hpp
deleted file mode 100755
index dbd658169..000000000
--- a/lightllm-kernel/cutlass/include/cute/util/print.hpp
+++ /dev/null
@@ -1,261 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/config.hpp>           // CUTE_HOST_DEVICE
-#include <cute/util/type_traits.hpp> // cute::is_valid
-#include <cute/numeric/numeric_types.hpp> 
-
-//
-// CUDA compatible print and printf
-//
-
-namespace cute
-{
-
-CUTE_HOST_DEVICE
-int
-num_digits(int x)
-{
-  return (x < 10 ? 1 :
-          (x < 100 ? 2 :
-           (x < 1000 ? 3 :
-            (x < 10000 ? 4 :
-             (x < 100000 ? 5 :
-              (x < 1000000 ? 6 :
-               (x < 10000000 ? 7 :
-                (x < 100000000 ? 8 :
-                 (x < 1000000000 ? 9 :
-                  10)))))))));
-}
-
-//
-// print dispatcher
-//
-
-CUTE_HOST_DEVICE
-void
-print(char c) {
-  printf("%c", c);
-}
-
-CUTE_HOST_DEVICE
-void
-print(signed char a) {
-  printf("%d", static_cast<int>(a));
-}
-
-CUTE_HOST_DEVICE
-void
-print(unsigned char a) {
-  printf("%u", static_cast<unsigned int>(a));
-}
-
-CUTE_HOST_DEVICE
-void
-print(short a) {
-  printf("%hd", a);
-}
-
-CUTE_HOST_DEVICE
-void
-print(unsigned short a) {
-  printf("%hu", a);
-}
-
-CUTE_HOST_DEVICE
-void
-print(int a) {
-  printf("%d", a);
-}
-
-CUTE_HOST_DEVICE
-void
-print(uint1b_t a) {
-  printf("%d", int(a));
-}
-
-CUTE_HOST_DEVICE
-void
-print(int2b_t a) {
-  printf("%d", int(a));
-}
-
-CUTE_HOST_DEVICE
-void
-print(uint2b_t a) {
-  printf("%d", int(a));
-}
-
-CUTE_HOST_DEVICE
-void
-print(int4b_t a) {
-  printf("%d", int(a));
-}
-
-CUTE_HOST_DEVICE
-void
-print(uint4b_t a) {
-  printf("%d", int(a));
-}
-
-CUTE_HOST_DEVICE
-void
-print(bin1_t a) {
-  printf("%d", int(a));
-}
-
-CUTE_HOST_DEVICE
-void
-print(unsigned int a) {
-  printf("%u", a);
-}
-
-CUTE_HOST_DEVICE
-void
-print(long a) {
-  printf("%ld", a);
-}
-
-CUTE_HOST_DEVICE
-void
-print(unsigned long a) {
-  printf("%lu", a);
-}
-
-CUTE_HOST_DEVICE
-void
-print(long long a) {
-  printf("%lld", a);
-}
-
-CUTE_HOST_DEVICE
-void
-print(unsigned long long a) {
-  printf("%llu", a);
-}
-
-CUTE_HOST_DEVICE
-void
-print(float a) {
-  printf("%f", a);
-}
-
-CUTE_HOST_DEVICE
-void
-print(double a) {
-  printf("%f", a);
-}
-
-template <class... T>
-CUTE_HOST_DEVICE
-void
-print(char const* format, T const&... t) {
-  printf(format, t...);
-}
-
-CUTE_HOST_DEVICE
-void
-print(char const* format) {
-  printf("%s", format);
-}
-
-//
-// pretty printing
-//
-
-CUTE_HOST_DEVICE void
-pretty_print(uint1b_t a) {
-  printf("%*d", 3, int(a));
-}
-
-CUTE_HOST_DEVICE void
-pretty_print(int2b_t a) {
-  printf("%*d", 5, int(a));
-}
-
-CUTE_HOST_DEVICE void
-pretty_print(uint2b_t a) {
-  printf("%*d", 5, int(a));
-}
-
-CUTE_HOST_DEVICE void
-pretty_print(int4b_t a) {
-  printf("%*d", 5, int(a));
-}
-
-CUTE_HOST_DEVICE void
-pretty_print(uint4b_t a) {
-  printf("%*d", 5, int(a));
-}
-
-CUTE_HOST_DEVICE void
-pretty_print(bool v) {
-  printf("%*d", 3, int(v));
-}
-
-CUTE_HOST_DEVICE void
-pretty_print(int32_t v) {
-  printf("%*d", 5, v);
-}
-
-CUTE_HOST_DEVICE void
-pretty_print(uint32_t v) {
-  printf("%*d", 5, v);
-}
-
-CUTE_HOST_DEVICE void
-pretty_print(int64_t v) {
-  printf("%*lld", 5, static_cast<long long>(v));
-}
-
-CUTE_HOST_DEVICE void
-pretty_print(uint64_t v) {
-  printf("%*llu", 5, static_cast<unsigned long long>(v));
-}
-
-CUTE_HOST_DEVICE void
-pretty_print(float v) {
-  printf("%*.2e", 10, v);
-}
-
-CUTE_HOST_DEVICE void
-pretty_print(double v) {
-  printf("%*.3e", 11, v);
-}
-
-template <class T>
-CUTE_HOST_DEVICE void
-pretty_print(T t) {
-  printf("  "); print(t);
-}
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cute/util/type_traits.hpp b/lightllm-kernel/cutlass/include/cute/util/type_traits.hpp
deleted file mode 100755
index e663b569c..000000000
--- a/lightllm-kernel/cutlass/include/cute/util/type_traits.hpp
+++ /dev/null
@@ -1,292 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#if defined(__CUDACC_RTC__)
-#include <cuda/std/type_traits>
-#include <cuda/std/utility>
-#include <cuda/std/cstddef>
-#include <cuda/std/cstdint>
-#include <cuda/std/limits>
-#else
-#include <type_traits>
-#include <utility>      // tuple_size, tuple_element
-#include <cstddef>      // ptrdiff_t
-#include <cstdint>      // uintptr_t
-#include <limits>       // numeric_limits
-#endif
-
-#include <cute/config.hpp> // CUTE_STL_NAMESPACE
-
-namespace cute
-{
-  using CUTE_STL_NAMESPACE::enable_if;
-  using CUTE_STL_NAMESPACE::enable_if_t;
-}
-
-#define __CUTE_REQUIRES(...)   typename cute::enable_if<(__VA_ARGS__)>::type* = nullptr
-#define __CUTE_REQUIRES_V(...) typename cute::enable_if<decltype((__VA_ARGS__))::value>::type* = nullptr
-
-namespace cute
-{
-
-// <type_traits>
-using CUTE_STL_NAMESPACE::conjunction;
-using CUTE_STL_NAMESPACE::conjunction_v;
-
-using CUTE_STL_NAMESPACE::disjunction;
-using CUTE_STL_NAMESPACE::disjunction_v;
-
-using CUTE_STL_NAMESPACE::negation;
-using CUTE_STL_NAMESPACE::negation_v;
-
-using CUTE_STL_NAMESPACE::void_t;
-using CUTE_STL_NAMESPACE::is_void_v;
-
-using CUTE_STL_NAMESPACE::is_base_of;
-using CUTE_STL_NAMESPACE::is_base_of_v;
-
-using CUTE_STL_NAMESPACE::is_const;
-using CUTE_STL_NAMESPACE::is_const_v;
-using CUTE_STL_NAMESPACE::is_volatile;
-using CUTE_STL_NAMESPACE::is_volatile_v;
-
-// Defined in cute/numeric/integral_constant.hpp
-// using CUTE_STL_NAMESPACE::true_type;
-// using CUTE_STL_NAMESPACE::false_type;
-
-using CUTE_STL_NAMESPACE::conditional;
-using CUTE_STL_NAMESPACE::conditional_t;
-
-using CUTE_STL_NAMESPACE::add_const_t;
-
-using CUTE_STL_NAMESPACE::remove_const_t;
-using CUTE_STL_NAMESPACE::remove_cv_t;
-using CUTE_STL_NAMESPACE::remove_reference_t;
-
-using CUTE_STL_NAMESPACE::extent;
-using CUTE_STL_NAMESPACE::remove_extent;
-
-using CUTE_STL_NAMESPACE::decay;
-using CUTE_STL_NAMESPACE::decay_t;
-
-using CUTE_STL_NAMESPACE::is_lvalue_reference;
-using CUTE_STL_NAMESPACE::is_lvalue_reference_v;
-
-using CUTE_STL_NAMESPACE::is_reference;
-using CUTE_STL_NAMESPACE::is_trivially_copyable;
-
-using CUTE_STL_NAMESPACE::is_convertible;
-using CUTE_STL_NAMESPACE::is_convertible_v;
-
-using CUTE_STL_NAMESPACE::is_same;
-using CUTE_STL_NAMESPACE::is_same_v;
-
-using CUTE_STL_NAMESPACE::is_constructible;
-using CUTE_STL_NAMESPACE::is_constructible_v;
-using CUTE_STL_NAMESPACE::is_default_constructible;
-using CUTE_STL_NAMESPACE::is_default_constructible_v;
-using CUTE_STL_NAMESPACE::is_standard_layout;
-using CUTE_STL_NAMESPACE::is_standard_layout_v;
-
-using CUTE_STL_NAMESPACE::is_arithmetic;
-using CUTE_STL_NAMESPACE::is_unsigned;
-using CUTE_STL_NAMESPACE::is_unsigned_v;
-using CUTE_STL_NAMESPACE::is_signed;
-using CUTE_STL_NAMESPACE::is_signed_v;
-
-using CUTE_STL_NAMESPACE::make_signed;
-using CUTE_STL_NAMESPACE::make_signed_t;
-
-// using CUTE_STL_NAMESPACE::is_integral;
-template <class T>
-using is_std_integral = CUTE_STL_NAMESPACE::is_integral<T>;
-
-using CUTE_STL_NAMESPACE::is_empty;
-using CUTE_STL_NAMESPACE::is_empty_v;
-
-using CUTE_STL_NAMESPACE::invoke_result_t;
-
-using CUTE_STL_NAMESPACE::common_type;
-using CUTE_STL_NAMESPACE::common_type_t;
-
-using CUTE_STL_NAMESPACE::remove_pointer;
-using CUTE_STL_NAMESPACE::remove_pointer_t;
-
-using CUTE_STL_NAMESPACE::alignment_of;
-using CUTE_STL_NAMESPACE::alignment_of_v;
-
-// <utility>
-using CUTE_STL_NAMESPACE::declval;
-
-template <class T>
-constexpr T&& forward(remove_reference_t<T>& t) noexcept
-{
-  return static_cast<T&&>(t);
-}
-
-template <class T>
-constexpr T&& forward(remove_reference_t<T>&& t) noexcept
-{
-  static_assert(! is_lvalue_reference_v<T>, "T cannot be an lvalue reference (e.g., U&).");
-  return static_cast<T&&>(t);
-}
-
-template <class T>
-constexpr remove_reference_t<T>&& move(T&& t) noexcept
-{
-  return static_cast<remove_reference_t<T>&&>(t);
-}
-
-// <limits>
-using CUTE_STL_NAMESPACE::numeric_limits;
-
-// <cstddef>
-using CUTE_STL_NAMESPACE::ptrdiff_t;
-
-// <cstdint>
-using CUTE_STL_NAMESPACE::uintptr_t;
-
-// C++20
-// using std::remove_cvref;
-template <class T>
-struct remove_cvref {
-  using type = remove_cv_t<remove_reference_t<T>>;
-};
-
-// C++20
-// using std::remove_cvref_t;
-template <class T>
-using remove_cvref_t = typename remove_cvref<T>::type;
-
-//
-// dependent_false
-//
-// @brief An always-false value that depends on one or more template parameters.
-// See
-// https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2019/p1830r1.pdf
-// https://github.com/cplusplus/papers/issues/572
-// https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/p2593r0.html
-template <class... Args>
-inline constexpr bool dependent_false = false;
-
-//
-// tuple_size, tuple_element
-//
-// @brief CuTe-local tuple-traits to prevent conflicts with other libraries.
-// For cute:: types, we specialize std::tuple-traits, which is explicitly allowed.
-//   cute::tuple, cute::array, cute::array_subbyte, etc
-// But CuTe wants to treat some external types as tuples as well. For those,
-// we specialize cute::tuple-traits to avoid polluting external traits.
-//   dim3, uint3, etc
-
-template <class T, class = void>
-struct tuple_size;
-
-template <class T>
-struct tuple_size<T,void_t<typename CUTE_STL_NAMESPACE::tuple_size<T>::type>> : CUTE_STL_NAMESPACE::integral_constant<size_t, CUTE_STL_NAMESPACE::tuple_size<T>::value> {};
-
-// S =  : std::integral_constant<std::size_t, std::tuple_size<T>::value> {};
-
-template <class T>
-constexpr size_t tuple_size_v = tuple_size<T>::value;
-
-template <size_t I, class T, class = void>
-struct tuple_element;
-
-template <size_t I, class T>
-struct tuple_element<I,T,void_t<typename CUTE_STL_NAMESPACE::tuple_element<I,T>::type>> : CUTE_STL_NAMESPACE::tuple_element<I,T> {};
-
-template <size_t I, class T>
-using tuple_element_t = typename tuple_element<I,T>::type;
-
-//
-// is_valid
-//
-
-namespace detail {
-
-template <class F, class... Args, class = decltype(declval<F&&>()(declval<Args&&>()...))>
-CUTE_HOST_DEVICE constexpr auto
-is_valid_impl(int) { return CUTE_STL_NAMESPACE::true_type{}; }
-
-template <class F, class... Args>
-CUTE_HOST_DEVICE constexpr auto
-is_valid_impl(...) { return CUTE_STL_NAMESPACE::false_type{}; }
-
-template <class F>
-struct is_valid_fn {
-  template <class... Args>
-  CUTE_HOST_DEVICE constexpr auto
-  operator()(Args&&...) const { return is_valid_impl<F, Args&&...>(int{}); }
-};
-
-} // end namespace detail
-
-template <class F>
-CUTE_HOST_DEVICE constexpr auto
-is_valid(F&&) {
-  return detail::is_valid_fn<F&&>{};
-}
-
-template <class F, class... Args>
-CUTE_HOST_DEVICE constexpr auto
-is_valid(F&&, Args&&...) {
-  return detail::is_valid_impl<F&&, Args&&...>(int{});
-}
-
-template <bool B, template<class...> class True, template<class...> class False>
-struct conditional_template {
-  template <class... U>
-  using type = True<U...>;
-};
-
-template <template<class...> class True, template<class...> class False>
-struct conditional_template<false, True, False> {
-  template <class... U>
-  using type = False<U...>;
-};
-
-//
-// is_any_of
-//
-
-// Member `value` is true if and only if T is same as (is_same_v) at least one of the types in Us
-template <class T, class... Us>
-struct is_any_of {
-  constexpr static bool value = (... || CUTE_STL_NAMESPACE::is_same_v<T, Us>);
-};
-
-// Is true if and only if T is same as (is_same_v) at least one of the types in Us
-template <class T, class... Us>
-inline constexpr bool is_any_of_v = is_any_of<T, Us...>::value;
-
-} // end namespace cute
diff --git a/lightllm-kernel/cutlass/include/cutlass/aligned_buffer.h b/lightllm-kernel/cutlass/include/cutlass/aligned_buffer.h
deleted file mode 100755
index 0d2bb2904..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/aligned_buffer.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief AlignedBuffer is a container for trivially copyable elements suitable for use in
-      unions and shared memory.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-
-namespace cutlass {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Modifies semantics of cutlass::Array<> to provide guaranteed alignment. 
-template <
-  typename T,
-  int N,
-  int Align = 16
->
-struct AlignedBuffer {
-  
-  /// Internal storage type
-  using Storage = uint8_t;
-
-  /// Number of logical elements held in buffer
-  static int const kCount = N;
-
-  /// Alignment requirement in bytes
-  static int const kAlign = Align;
-
-  /// Number of storage elements
-  static int const kBytes = 
-    (sizeof_bits<T>::value * N + 7) / 8;
-
-private:
-
-  /// Internal storage
-  alignas(Align) Storage storage[kBytes];
-
-public:
-
-  //
-  // C++ standard members
-  //
-
-  typedef T value_type;
-  typedef size_t size_type;
-  typedef ptrdiff_t difference_type;
-  typedef value_type *pointer;
-  typedef value_type const * const_pointer;
-
-  using Array = Array<T, N>;
-  using reference = typename Array::reference;
-  using const_reference = typename Array::const_reference;
-
-public:
-
-  CUTLASS_HOST_DEVICE
-  pointer data() {
-    return reinterpret_cast<pointer>(storage); 
-  }
-
-  CUTLASS_HOST_DEVICE
-  const_pointer data() const {
-    return reinterpret_cast<pointer>(storage); 
-  }
-  
-  CUTLASS_HOST_DEVICE
-  Storage * raw_data() {
-    return storage;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Storage const * raw_data() const {
-    return storage;
-  }
-
-
-  CUTLASS_HOST_DEVICE
-  constexpr bool empty() const {
-    return !kCount;
-  }
-
-  CUTLASS_HOST_DEVICE
-  constexpr size_type size() const {
-    return kCount;
-  }
-
-  CUTLASS_HOST_DEVICE
-  constexpr size_type max_size() const {
-    return kCount;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/arch.h b/lightllm-kernel/cutlass/include/cutlass/arch/arch.h
deleted file mode 100755
index 36d4676bd..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/arch/arch.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines tags for architecture-specific configurations.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace arch {
-
-#if defined(__NVCC__) || defined(__CUDACC_RTC__) || (defined(__clang__) && defined(__CUDA__))
-
-/// Computes laneId within a warp
-CUTLASS_DEVICE
-int LaneId() {
-  int ret;
-  asm ("mov.u32 %0, %%laneid;" : "=r"(ret) : );
-  return ret;
-}
-
-/// Computes SM number the thread is running on
-CUTLASS_DEVICE
-int SmId() {
-  int ret;
-  asm ("mov.u32 %0, %%smid;" : "=r"(ret) : );
-  return ret;
-}
-
-#endif
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-struct Sm50 {
-  static int const kMinComputeCapability = 50;
-}; 
-struct Sm60 {
-  static int const kMinComputeCapability = 60;
-}; 
-struct Sm61 {
-  static int const kMinComputeCapability = 61;
-};
-struct Sm70 {
-  static int const kMinComputeCapability = 70;
-};
-struct Sm72 {
-  static int const kMinComputeCapability = 72;
-};
-struct Sm75 {
-  static int const kMinComputeCapability = 75;
-};
-struct Sm80 {
-  static int const kMinComputeCapability = 80; 
-};
-struct Sm86 {
-  static int const kMinComputeCapability = 86;
-};
-struct Sm89 {
-  static int const kMinComputeCapability = 89;
-};
-struct Sm90 {
-  static int const kMinComputeCapability = 90; 
-};
-
-/// Triggers a breakpoint on the device
-CUTLASS_DEVICE
-void device_breakpoint() {
-#if defined(__CUDA_ARCH__)
-  asm volatile ("  brkpt;\n");
-#endif
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace arch
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/barrier.h b/lightllm-kernel/cutlass/include/cutlass/arch/barrier.h
deleted file mode 100755
index c96897324..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/arch/barrier.h
+++ /dev/null
@@ -1,630 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Barrier Operations on SM90+
-*/
-
-#pragma once
-
-#include <cutlass/arch/memory_sm75.h>
-#include <cute/arch/cluster_sm90.hpp>
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && (__CUDACC_VER_MAJOR__ >= 12)
-#define CUDA_BARRIER_ENABLED 1
-#else
-#define CUDA_BARRIER_ENABLED 0
-#endif
-
-namespace cutlass {
-/// @brief
-namespace arch {
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Enumerates the reserved named barriers to avoid potential conflicts
-// This enum class specifies the NamedBarriers reserved by CUTLASS.
-enum class ReservedNamedBarriers { 
-  EpilogueBarrier = 1,
-  TransposeBarrier = 2,
-  TransformBarrier = 3,
-  StreamkBarrier0 = 4,
-  StreamkBarrier1 = 5
-  , FirstUserBarrier = StreamkBarrier1 + 1
-};
-
-
-class NamedBarrier {
-
-  // Data Members:
-
-  // Range = [1 , NUM_THREADS_PER_CTA]
-  // Range % warp-size (i.e 32) == 0
-  uint32_t const num_threads_;
-
-  // Range : [0, 15]
-  // Note that should be set to the final barrier ID, including ReserveNamedBarrierCount should be considered
-  uint32_t const id_;
-
- public:
-
-  // Constructor for CUTLASS developers:
-  // effective barrier ID starts from 0
-  CUTLASS_DEVICE
-  NamedBarrier(uint32_t num_threads, ReservedNamedBarriers reserved_named_barriers)
-      : num_threads_(num_threads), id_(static_cast<uint32_t>(reserved_named_barriers)) {}
-
-  // Constructor for CUTLASS users:
-  // effective barrier ID starts from ReservedNamedBarrierCount
-  CUTLASS_DEVICE
-  NamedBarrier(uint32_t num_threads, uint32_t id = 0)
-      : num_threads_(num_threads), id_(id + ReservedNamedBarrierCount) {
-    CUTLASS_ASSERT(id + ReservedNamedBarrierCount <= HardwareMaxNumNamedBarriers && "Effective barrier_id should not exceed 16.");
-  }
-
-  CUTLASS_DEVICE
-  void arrive_and_wait() const {
-    // Note: The value of id_ is already the final barrier id (set correctly in the constructor).
-    NamedBarrier::arrive_and_wait_internal(num_threads_, id_);
-  }
-
-  CUTLASS_DEVICE
-  void arrive_and_wait_unaligned() const {
-    // Note: The value of id_ is already the final barrier id (set correctly in the constructor).
-    NamedBarrier::arrive_and_wait_internal_unaligned(num_threads_, id_);
-  }
-
-  CUTLASS_DEVICE
-  void arrive() const {
-    // Note: The value of id_ is already the final barrier id (set correctly in the constructor).
-    NamedBarrier::arrive_internal(num_threads_, id_);
-  }
-
-  CUTLASS_DEVICE
-  void arrive_unaligned() const {
-    // Note: The value of id_ is already the final barrier id (set correctly in the constructor).
-    NamedBarrier::arrive_internal_unaligned(num_threads_, id_);
-  }
-
-  CUTLASS_DEVICE
-  void sync() const {
-    NamedBarrier::arrive_and_wait();
-  }
-
-  //  Static variants
-
-  // Calling interface for CUTLASS users: 
-  // effective barrier ID starts from ReservedNamedBarrierCount
-  CUTLASS_DEVICE
-  static void arrive_and_wait(uint32_t num_threads, uint32_t barrier_id) {
-    arrive_and_wait_internal(num_threads, barrier_id + ReservedNamedBarrierCount);
-  }
-
-  // Calling interface for CUTLASS developers: 
-  // effective barrier ID starts from 0
-  CUTLASS_DEVICE
-  static void arrive_and_wait(uint32_t num_threads, ReservedNamedBarriers reserved_named_barriers) {
-    arrive_and_wait_internal(num_threads, static_cast<int>(reserved_named_barriers));
-  }
-
-  // Calling interface for CUTLASS users: 
-  // effective barrier ID starts from ReservedNamedBarrierCount
-  CUTLASS_DEVICE
-  static void arrive(uint32_t num_threads, uint32_t barrier_id) {
-    arrive_internal(num_threads, barrier_id + ReservedNamedBarrierCount);
-  }
-
-  // Calling interface for CUTLASS developers: 
-  // effective barrier ID starts from 0
-  CUTLASS_DEVICE
-  static void arrive(uint32_t num_threads, ReservedNamedBarriers reserved_named_barriers) {
-    arrive_internal(num_threads, static_cast<int>(reserved_named_barriers));
-  }
-
-  // Calling interface for CUTLASS users: 
-  // effective barrier ID starts from ReservedNamedBarrierCount
-  CUTLASS_DEVICE
-  static void sync(uint32_t num_threads, uint32_t barrier_id) {
-    sync_internal(num_threads, barrier_id + ReservedNamedBarrierCount);
-  }
-
-  // Calling interface for CUTLASS developers: 
-  // effective barrier ID starts from 0
-  CUTLASS_DEVICE
-  static void sync(uint32_t num_threads, ReservedNamedBarriers reserved_named_barriers) {
-    sync_internal(num_threads, static_cast<int>(reserved_named_barriers));
-  }
-
-
- private:
-  CUTLASS_DEVICE
-  static void arrive_and_wait_internal(uint32_t num_threads, uint32_t barrier_id) {
-#if CUDA_BARRIER_ENABLED
-    asm volatile("bar.sync %0, %1;" : : "r"(barrier_id), "r"(num_threads));
-    cutlass::arch::synclog_emit_named_barrier_arrive_and_wait(__LINE__, num_threads, barrier_id);
-#elif defined(__CUDA_ARCH__)
-    asm volatile ("brkpt;\n" ::);
-#endif
-  }
-
-  CUTLASS_DEVICE
-  static void arrive_and_wait_internal_unaligned(uint32_t num_threads, uint32_t barrier_id) {
-#if CUDA_BARRIER_ENABLED
-    asm volatile("barrier.sync %0, %1;" : : "r"(barrier_id), "r"(num_threads));
-    cutlass::arch::synclog_emit_named_barrier_arrive_and_wait(__LINE__, num_threads, barrier_id);
-#elif defined(__CUDA_ARCH__)
-    asm volatile ("brkpt;\n" ::);
-#endif
-  }
-
-  CUTLASS_DEVICE
-  static void arrive_internal(uint32_t num_threads, uint32_t barrier_id) {
-#if CUDA_BARRIER_ENABLED
-    cutlass::arch::synclog_emit_named_barrier_arrive(__LINE__, num_threads, barrier_id);
-    asm volatile("bar.arrive %0, %1;" : : "r"(barrier_id), "r"(num_threads));
-#elif defined(__CUDA_ARCH__)
-    asm volatile ("brkpt;\n" ::);
-#endif
-  }
-
-  CUTLASS_DEVICE
-  static void arrive_internal_unaligned(uint32_t num_threads, uint32_t barrier_id) {
-#if CUDA_BARRIER_ENABLED
-    cutlass::arch::synclog_emit_named_barrier_arrive(__LINE__, num_threads, barrier_id);
-    asm volatile("barrier.arrive %0, %1;" : : "r"(barrier_id), "r"(num_threads));
-#elif defined(__CUDA_ARCH__)
-    asm volatile ("brkpt;\n" ::);
-#endif
-  }
-
-  CUTLASS_DEVICE
-  static void sync_internal(uint32_t num_threads, uint32_t barrier_id) {
-    NamedBarrier::arrive_and_wait_internal(num_threads, barrier_id);
-  }
-
- public:
-  // Currently we reserve 8 NamedBarriers for CUTLASS' own use cases, 
-  // while leaving the renaming for general users.
-  static const uint32_t ReservedNamedBarrierCount = static_cast<uint32_t>(ReservedNamedBarriers::FirstUserBarrier);
-  static const uint32_t HardwareMaxNumNamedBarriers = 16;
-
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Hopper introduces a new cluster-wide barrier which handle with Cluster-wide arrive-wait behaviour.
-// This is an extension to the Ampere arrive-wait barriers
-// Note : Ampere arrive-wait Barriers have a larger max-arrive count (2^30) than Hopper arrive-wait Barriers (2^20).
-struct ClusterBarrier {
-
-  using ValueType = uint64_t;
-
-protected:
-  // Can never be initialized - can only be aliased to smem
-  ValueType barrier_;
-
-public:
-
-  CUTLASS_DEVICE
-  ClusterBarrier() = delete;
-
-  CUTLASS_DEVICE
-  void init(uint32_t arrive_count) const {
-    ClusterBarrier::init(&this->barrier_, arrive_count);
-  }
-
-  CUTLASS_DEVICE
-  bool test_wait(uint32_t phase, uint32_t pred=true) const {
-    return ClusterBarrier::test_wait(&this->barrier_, phase, pred);
-  }
-
-  CUTLASS_DEVICE
-  bool try_wait(uint32_t phase) const {
-    return ClusterBarrier::try_wait(&this->barrier_, phase);
-  }
-
-  CUTLASS_DEVICE
-  void wait(uint32_t phase) const {
-    ClusterBarrier::wait(&this->barrier_, phase);
-  }
-
-  // Barrier arrive on local smem
-  CUTLASS_DEVICE
-  void arrive() const {
-    ClusterBarrier::arrive(&this->barrier_);
-  }
-
-  // Remote SMEM arrive with a perdicate (usually done to pick the thread doing the arrive)
-  CUTLASS_DEVICE
-  void arrive(uint32_t cta_id, uint32_t pred = true ) const {
-    ClusterBarrier::arrive(&this->barrier_, cta_id, pred);
-  }
-
-  //
-  //  Static Versions
-  //
-  CUTLASS_DEVICE
-  static void init(ValueType const* smem_ptr, uint32_t arrive_count) {
-#if CUDA_BARRIER_ENABLED
-    uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr);
-    asm volatile(
-        "{\n\t"
-        "mbarrier.init.shared::cta.b64 [%1], %0; \n"
-        "}"
-        :
-        : "r"(arrive_count), "r"(smem_addr));
-    cutlass::arch::synclog_emit_cluster_barrier_init(__LINE__, smem_addr, arrive_count);
-#elif defined(__CUDA_ARCH__)
-    asm volatile ("brkpt;\n" ::);
-#endif
-  }
-
-  // Static version of wait - in case we don't want to burn a register
-  CUTLASS_DEVICE
-  static void wait(ValueType const* smem_ptr, uint32_t phase) {
-#if CUDA_BARRIER_ENABLED
-    uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr);
-    cutlass::arch::synclog_emit_cluster_barrier_wait(__LINE__, smem_addr, phase);
-    // Arbitrarily large timer value after which try-wait expires and re-tries.
-    uint32_t ticks = 0x989680;
-    asm volatile(
-        "{\n\t"
-        ".reg .pred       P1; \n\t"
-        "LAB_WAIT: \n\t"
-        "mbarrier.try_wait.parity.shared::cta.b64 P1, [%0], %1, %2; \n\t"
-        "@P1 bra DONE; \n\t"
-        "bra     LAB_WAIT; \n\t"
-        "DONE: \n\t"
-        "}"
-        :
-        : "r"(smem_addr), "r"(phase), "r"(ticks));
-
-#elif defined(__CUDA_ARCH__)
-    asm volatile ("brkpt;\n" ::);
-#endif
-  }
-
-  CUTLASS_DEVICE
-  static bool test_wait(ValueType const* smem_ptr, uint32_t phase, uint32_t pred) {
-#if CUDA_BARRIER_ENABLED
-    uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr);
-    cutlass::arch::synclog_emit_cluster_barrier_test_wait(__LINE__, smem_addr, phase, pred);
-    uint32_t waitComplete;
-
-    asm volatile(
-        "{\n\t"
-        ".reg .pred P1; \n\t"
-        ".reg .pred P2; \n\t"
-        "setp.eq.u32 P2, %3, 1;\n\t"
-        "@P2 mbarrier.test_wait.parity.shared::cta.b64 P1, [%1], %2; \n\t"
-        "selp.b32 %0, 1, 0, P1; \n\t"
-        "}"
-        : "=r"(waitComplete)
-        : "r"(smem_addr), "r"(phase), "r"(pred));
-
-    return static_cast<bool>(waitComplete);
-#elif defined(__CUDA_ARCH__)
-    asm volatile ("brkpt;\n" ::);
-#endif
-    return 0;
-  }
-
-  CUTLASS_DEVICE
-  static bool try_wait(ValueType const* smem_ptr, uint32_t phase) {
-#if CUDA_BARRIER_ENABLED
-    uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr);
-    cutlass::arch::synclog_emit_cluster_barrier_try_wait(__LINE__, smem_addr, phase);
-    uint32_t waitComplete;
-
-    asm volatile(
-        "{\n\t"
-        ".reg .pred P1; \n\t"
-        "mbarrier.try_wait.parity.shared::cta.b64 P1, [%1], %2; \n\t"
-        "selp.b32 %0, 1, 0, P1; \n\t"
-        "}"
-        : "=r"(waitComplete)
-        : "r"(smem_addr), "r"(phase));
-
-    return static_cast<bool>(waitComplete);
-#elif defined(__CUDA_ARCH__)
-    asm volatile ("brkpt;\n" ::);
-#endif
-    return 0;
-  }
-
-  // Static Predicated version of the above - in case we know the address.
-  CUTLASS_DEVICE
-  static void arrive(ValueType const* smem_ptr, uint32_t cta_id, uint32_t pred) {
-#if CUDA_BARRIER_ENABLED
-    uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr);
-    if (pred) {
-      asm volatile(
-          "{\n\t"
-          ".reg .b32 remAddr32;\n\t"
-          "mapa.shared::cluster.u32  remAddr32, %0, %1;\n\t"
-          "mbarrier.arrive.shared::cluster.b64  _, [remAddr32];\n\t"
-          "}"
-          :
-          : "r"(smem_addr), "r"(cta_id));
-    }
-
-    cutlass::arch::synclog_emit_cluster_barrier_arrive_cluster(__LINE__, smem_addr, cta_id, pred);
-#elif defined(__CUDA_ARCH__)
-    asm volatile ("brkpt;\n" ::);
-#endif
-  }
-
-  // Barrier arrive on local smem
-  CUTLASS_DEVICE
-  static void arrive(ValueType const* smem_ptr) {
-#if CUDA_BARRIER_ENABLED
-    uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr);
-    asm volatile(
-        "{\n\t"
-        "mbarrier.arrive.shared::cta.b64 _, [%0];\n\t"
-        "}"
-        :
-        : "r"(smem_addr));
-    cutlass::arch::synclog_emit_cluster_barrier_arrive(__LINE__, smem_addr);
-#elif defined(__CUDA_ARCH__)
-    asm volatile ("brkpt;\n" ::);
-#endif
-  }
-
-  CUTLASS_DEVICE
-  static void invalidate(ValueType const* smem_ptr) {
-#if CUDA_BARRIER_ENABLED
-    uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr);
-    asm volatile(
-        "{\n\t"
-        "mbarrier.inval.shared::cta.b64 [%0]; \n\t"
-        "}"
-        :
-        : "r"(smem_addr));
-#elif defined(__CUDA_ARCH__)
-    asm volatile ("brkpt;\n" ::);
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SM90 also introduces a new type of cluster-barrier which supports sync.
-// not just based on Arrive Count, but also transaction count (in bytes)
-struct ClusterTransactionBarrier : public ClusterBarrier {
-
-  CUTLASS_DEVICE
-  ClusterTransactionBarrier() = delete;
-
-  // Performs an arrive operation + expected transaction bytes increment
-  CUTLASS_DEVICE
-  void arrive_and_expect_tx(uint32_t transaction_bytes) const {
-    ClusterTransactionBarrier::arrive_and_expect_tx(&this->barrier_, transaction_bytes);
-  }
-
-  // Performs an arrive operation + expected transaction bytes increment
-  CUTLASS_DEVICE
-  void arrive_and_expect_tx(uint32_t transaction_bytes, uint32_t cta_id, uint32_t pred = 1u) const {
-    ClusterTransactionBarrier::arrive_and_expect_tx(&this->barrier_, transaction_bytes , cta_id, pred);
-  }
-
-  // Performs an expected transaction bytes increment without doing an arrive operation
-  CUTLASS_DEVICE
-  void expect_transaction(uint32_t transaction_bytes) const {
-    ClusterTransactionBarrier::expect_transaction(&this->barrier_, transaction_bytes);
-  }
-
-  // Performs an expected transaction bytes decrement without doing an arrive operation
-  CUTLASS_DEVICE
-  void complete_transaction(uint32_t transaction_bytes, uint32_t pred = 1) const {
-    uint32_t cta_rank = cute::block_rank_in_cluster();
-    ClusterTransactionBarrier::complete_transaction(&this->barrier_, cta_rank, transaction_bytes, pred);
-  }
-
-  // Performs an expected transaction bytes decrement without doing an arrive operation
-  CUTLASS_DEVICE
-  void complete_transaction(uint32_t dst_cta_id, uint32_t transaction_bytes, uint32_t pred) const {
-    ClusterTransactionBarrier::complete_transaction(&this->barrier_, dst_cta_id, transaction_bytes, pred);
-  }
-
-  //
-  //  Static Versions
-  //
-
-  // Performs an arrive operation + expected transaction bytes increment
-  CUTLASS_DEVICE
-  static void arrive_and_expect_tx(ValueType const* smem_ptr, uint32_t transaction_bytes) {
-#if CUDA_BARRIER_ENABLED
-    uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr);
-    asm volatile(
-        "{\n\t"
-        "mbarrier.arrive.expect_tx.shared::cta.b64 _, [%1], %0; \n\t"
-        "}"
-        :
-        : "r"(transaction_bytes), "r"(smem_addr));
-    cutlass::arch::synclog_emit_cluster_transaction_barrier_arrive_and_expect_tx(__LINE__, smem_addr, transaction_bytes);
-#elif defined(__CUDA_ARCH__)
-    asm volatile ("brkpt;\n" ::);
-#endif
-  }
-
-  // Performs an arrive operation + expected transaction bytes increment for a remote cta_id in a Cluster
-  CUTLASS_DEVICE
-  static void arrive_and_expect_tx(
-      ValueType const* smem_ptr, uint32_t transaction_bytes, uint32_t cta_id, uint32_t pred) {
-#if CUDA_BARRIER_ENABLED
-    uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr);
-    asm volatile(
-        "{\n\t"
-        ".reg .pred p;\n\t"
-        ".reg .b32 remAddr32;\n\t"
-        "setp.eq.u32 p, %2, 1;\n\t"
-        "@p mapa.shared::cluster.u32  remAddr32, %0, %1;\n\t"
-        "@p mbarrier.arrive.expect_tx.shared::cluster.b64  _, [remAddr32], %3;\n\t"
-        "}"
-        :
-        : "r"(smem_addr), "r"(cta_id), "r"(pred), "r"(transaction_bytes));
-#elif defined(__CUDA_ARCH__)
-    asm volatile ("brkpt;\n" ::);
-#endif
-  }
-
-  // Performs an expected transaction bytes increment without doing an arrive operation
-  CUTLASS_DEVICE
-  static void expect_transaction(ValueType const* smem_ptr, uint32_t transaction_bytes) {
-#if CUDA_BARRIER_ENABLED
-    uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr);
-    asm volatile(
-        "{\n\t"
-        "mbarrier.expect_tx.shared::cta.b64 [%1], %0; \n\t"
-        "}"
-        :
-        : "r"(transaction_bytes), "r"(smem_addr));
-    cutlass::arch::synclog_emit_cluster_transaction_barrier_expect_transaction(__LINE__, smem_addr, transaction_bytes);
-#elif defined(__CUDA_ARCH__)
-    asm volatile ("brkpt;\n" ::);
-#endif
-  }
-
-  // Performs an expected transaction bytes decrement without doing an arrive operation
-  CUTLASS_DEVICE
-  static void complete_transaction(
-      ValueType const* smem_ptr, uint32_t dst_cta_id, uint32_t transaction_bytes, uint32_t pred = 1) {
-#if CUDA_BARRIER_ENABLED
-    uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr);
-    smem_addr = cute::set_block_rank(smem_addr, dst_cta_id);
-    asm volatile(
-        "{\n\t"
-        ".reg .pred p;\n\t"
-        "setp.eq.u32 p, %2, 1;\n\t"
-        "@p mbarrier.complete_tx.shared::cluster.relaxed.cluster.b64   [%1], %0;"
-        "}"
-        :
-        : "r"(transaction_bytes), "r"(smem_addr), "r"(pred));
-    cutlass::arch::synclog_emit_cluster_transaction_barrier_complete_transaction(__LINE__, smem_addr, dst_cta_id, transaction_bytes, pred);
-#elif defined(__CUDA_ARCH__)
-    asm volatile ("brkpt;\n" ::);
-#endif
-  }
-
-  //
-  // DEPRECATED APIs
-  //
-  [[deprecated("Use arrive_and_expect_tx instead")]] CUTLASS_DEVICE
-  void arrive_and_reset_bytes(uint32_t transaction_bytes) const {
-    arrive_and_expect_tx(transaction_bytes);
-  }
-  [[deprecated("Use arrive_and_expect_tx instead")]] CUTLASS_DEVICE
-  void arrive_and_reset_bytes(uint32_t transaction_bytes, uint32_t cta_id) const {
-    arrive_and_expect_tx(transaction_bytes, cta_id);
-  }
-  [[deprecated("Use expect_transaction instead")]] CUTLASS_DEVICE
-  void reset_bytes(uint32_t transaction_bytes) const {
-    expect_transaction(transaction_bytes);
-  }
-  [[deprecated("Use complete_transaction instead")]] CUTLASS_DEVICE
-  void commit(uint32_t transaction_bytes, uint32_t pred = 1) const {
-    complete_transaction(transaction_bytes, pred);
-  }
-  [[deprecated("Use complete_transaction instead")]] CUTLASS_DEVICE
-  void commit(uint32_t dst_cta_id, uint32_t transaction_bytes, uint32_t pred) const {
-    complete_transaction(dst_cta_id, transaction_bytes, pred);
-  }
-  [[deprecated("Use arrive_and_expect_tx instead")]] CUTLASS_DEVICE
-  static void arrive_and_reset_bytes(ValueType const* smem_ptr, uint32_t transaction_bytes) {
-    arrive_and_expect_tx(smem_ptr, transaction_bytes);
-  }
-  [[deprecated("Use arrive_and_expect_tx instead")]] CUTLASS_DEVICE
-  static void arrive_and_reset_bytes(ValueType const* smem_ptr, uint32_t transaction_bytes, uint32_t cta_id, uint32_t pred) {
-    arrive_and_expect_tx(smem_ptr, transaction_bytes, cta_id, pred);
-  }
-  [[deprecated("Use expect_transaction instead")]] CUTLASS_DEVICE
-  static void reset_bytes(ValueType const* smem_ptr, uint32_t transaction_bytes) {
-    expect_transaction(smem_ptr, transaction_bytes);
-  }
-  [[deprecated("Use complete_transaction instead")]] CUTLASS_DEVICE
-  static void commit(ValueType const* smem_ptr, uint32_t dst_cta_id, uint32_t transaction_bytes, uint32_t pred = 1) {
-    complete_transaction(smem_ptr, dst_cta_id, transaction_bytes, pred);
-  }
-};
-
-// Helps with visibility of barrier init operations across warps / cta / cluster
-// Available as a separate function so as to batch inits across barriers and fence once
-// Note : It must be composed with an appropriate sync instruction with the right scope
-// to ensure visibility eg. __syncthreads() or a cluster_arrive() + cluster_wait()
-CUTLASS_DEVICE
-void fence_barrier_init() {
-#if CUDA_BARRIER_ENABLED
-  cutlass::arch::synclog_emit_fence_barrier_init(__LINE__);
-  asm volatile(
-      "{\n\t"
-      "fence.mbarrier_init.release.cluster; \n"
-      "}"
-      ::);
-#elif defined(__CUDA_ARCH__)
-  asm volatile ("brkpt;\n" ::);
-#endif
-}
-
-// Issue a shared memory fence for async operations
-CUTLASS_DEVICE
-void fence_view_async_shared() {
-#if CUDA_BARRIER_ENABLED
-    cutlass::arch::synclog_emit_fence_view_async_shared(__LINE__);
-    asm volatile (
-        "{\n\t"
-        "fence.proxy.async.shared::cta; \n"
-        "}"
-        ::);
-#elif defined(__CUDA_ARCH__)
-  asm volatile ("brkpt;\n" ::);
-#endif
-}
-
-// Arrive on completion of in-flight cp.async operations issued by the calling thread 
-CUTLASS_DEVICE
-void cpasync_barrier_arrive(uint64_t const* smem_ptr) {
-#if CUDA_BARRIER_ENABLED
-  uint32_t smem_addr = cute::cast_smem_ptr_to_uint(smem_ptr);
-  asm volatile(
-    "{\n\t"
-    "cp.async.mbarrier.arrive.shared::cta.b64 [%0];\n\t"
-    "}"
-    :
-    : "r"(smem_addr));
-  cutlass::arch::synclog_emit_cpasync_barrier_arrive(__LINE__, smem_addr);
-#elif defined(__CUDA_ARCH__)
-  asm volatile ("brkpt;\n" ::);
-#endif
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-}  // end namespace arch
-}  // end namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/cache_operation.h b/lightllm-kernel/cutlass/include/cutlass/arch/cache_operation.h
deleted file mode 100755
index 9d2344bf3..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/arch/cache_operation.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Directives related to cache operations
-*/
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-namespace cutlass {
-namespace arch {
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Controls PTX cache operations
-struct CacheOperation {
-  enum Kind {
-    /// Cache at all levels - accessed again
-    Always,
-    /// Cache at global level
-    Global,
-    /// Streaming - likely to be accessed once
-    Streaming,
-    /// Indicates the line will not be used again
-    LastUse,
-    /// Don't cache, and fetch again
-    Volatile,
-    /// Write back at all coherent levels
-    WriteBack,
-    /// Write through to system memory
-    WriteThrough
-  };
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace arch
-}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/config.h b/lightllm-kernel/cutlass/include/cutlass/arch/config.h
deleted file mode 100755
index b0f750063..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/arch/config.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Definitions for architecture macros
-*/
-
-#pragma once
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SM90
-#if (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 0))
-  #define CUTLASS_ARCH_MMA_SM90_SUPPORTED 1
-  #if (!defined(CUTLASS_ARCH_MMA_SM90_ENABLED) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 900)
-    #define CUTLASS_ARCH_MMA_SM90_ENABLED 1
-
-    #if (!defined(CUTLASS_ARCH_MMA_SM90A_ENABLED) && defined(__CUDA_ARCH_FEAT_SM90_ALL))
-      #define CUTLASS_ARCH_MMA_SM90A_ENABLED 1
-    #endif
-  #endif
-#endif
-
-#if (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 2)
-  #define CUTLASS_ARCH_MMA_SPARSE_SM90_SUPPORTED
-#endif
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SM90 Modifiable
-#if (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 3))
-  #define CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90_SUPPORTED 1
-  #if (!defined(CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90_ENABLED) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 900)
-    #define CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90_ENABLED 1
-
-    #if (!defined(CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90A_ENABLED) && defined(__CUDA_ARCH_FEAT_SM90_ALL))
-      #define CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90A_ENABLED 1
-    #endif
-  #endif
-#endif
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// SM90 F64
-#if (__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 8))
-  #define CUTLASS_ARCH_MMA_SM90_F64_MMA_SUPPORTED 1
-  #if (!defined(CUTLASS_ARCH_MMA_SM90_F64_MMA_ENABLED) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900)
-    #define CUTLASS_ARCH_MMA_SM90_F64_MMA_ENABLED 1
-  #endif
-#endif
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/grid_dependency_control.h b/lightllm-kernel/cutlass/include/cutlass/arch/grid_dependency_control.h
deleted file mode 100755
index 14ef19749..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/arch/grid_dependency_control.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
- 
-/*! \file
-    \brief Grid dependent control (GDC) helpers for programmatic dependent launches (PDL).
-*/
-
-#pragma once
-
-#include "cute/arch/cluster_sm90.hpp"
-#include "cutlass/arch/barrier.h"
-#include "cutlass/conv/dispatch_policy.hpp"
-#include "cutlass/gemm/dispatch_policy.hpp"
-
-#ifndef CUTLASS_GDC_ENABLED
-  #if (defined(CUTLASS_ENABLE_GDC_FOR_SM90) && \
-     __CUDACC_VER_MAJOR__ >= 12 && \
-     defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL))
-    #define CUTLASS_GDC_ENABLED
-  #endif
-#endif
-
-namespace cutlass {
-namespace arch {
-
-// Issuing the launch_dependents instruction hints a dependent kernel to launch earlier
-// launch_dependents doesn't impact the functionality but the performance:
-// Launching a dependent kernel too early can compete with current kernels,
-// while launching too late can lead to a long latency.
-CUTLASS_DEVICE
-void launch_dependent_grids() {
-#if (defined(CUTLASS_GDC_ENABLED))
-  asm volatile("griddepcontrol.launch_dependents;");
-#endif
-}
-
-// Issuing the griddepcontrol.wait instruction enforces no global memory access
-// prior to this istruction. This ensures the correctness of global memory access
-// when launching a dependent kernel earlier.
-CUTLASS_DEVICE
-void wait_on_dependent_grids() {
-#if (defined(CUTLASS_GDC_ENABLED))
-  asm volatile("griddepcontrol.wait;");
-#endif
-}
-
-// Enable kernel-level query regarding whether the GDC feature is turned on
-#if (defined(CUTLASS_GDC_ENABLED))
-static constexpr bool IsGdcGloballyEnabled = true;
-#else
-static constexpr bool IsGdcGloballyEnabled = false;
-#endif
-
-
-} // namespace arch
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/memory.h b/lightllm-kernel/cutlass/include/cutlass/arch/memory.h
deleted file mode 100755
index db9ad7397..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/arch/memory.h
+++ /dev/null
@@ -1,602 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Architecture-specific operators on memory
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/arch/cache_operation.h"
-#include "cutlass/platform/platform.h"
-
-namespace cutlass {
-namespace arch {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Fragment type to store loaded data
-    typename AccessType,
-    /// The bytes of loading
-    int LoadBytes,
-    /// Cache operation
-    CacheOperation::Kind cache_op = CacheOperation::Always
-    >
-struct global_load;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Specializations
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-#if (((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 4)) || \
-     (__CUDACC_VER_MAJOR__ > 11)) &&                                  \
-    defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750)
-  #define CUTLASS_ENABLE_L2_PREFETCH 1
-#else
-  #define CUTLASS_ENABLE_L2_PREFETCH 0
-#endif
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// The redundant mov PTX instruction is used to enforce the compiler to
-// keep the initializing code before ld.global
-template <typename AccessType>
-struct global_load<AccessType,
-                   32,
-                   CacheOperation::Always
-                  > {
-  CUTLASS_DEVICE
-  global_load(AccessType &D, void const *ptr, bool pred_guard) {
-  uint4 *data = reinterpret_cast<uint4 *>(&D);
-
-    asm volatile(
-        "{\n"
-        "  .reg .pred p;\n"
-        "  setp.ne.b32 p, %9, 0;\n"
-        "  mov.b32 %0, %10;\n"
-        "  mov.b32 %1, %11;\n"
-        "  mov.b32 %2, %12;\n"
-        "  mov.b32 %3, %13;\n"
-        "  mov.b32 %4, %14;\n"
-        "  mov.b32 %5, %15;\n"
-        "  mov.b32 %6, %16;\n"
-        "  mov.b32 %7, %17;\n"
-#if CUTLASS_ENABLE_L2_PREFETCH
-        "  @p ld.global.L2::128B.v4.u32 {%0, %1, %2, %3}, [%8];\n"
-        "  @p ld.global.L2::128B.v4.u32 {%4, %5, %6, %7}, [%18];\n"
-#else
-        "  @p ld.global.v4.u32 {%0, %1, %2, %3}, [%8];\n"
-        "  @p ld.global.v4.u32 {%4, %5, %6, %7}, [%18];\n"
-#endif
-        "}\n"
-        : "=r"(data[0].x), "=r"(data[0].y), "=r"(data[0].z), "=r"(data[0].w),
-          "=r"(data[1].x), "=r"(data[1].y), "=r"(data[1].z), "=r"(data[1].w)
-        : "l"(ptr), "r"((int)pred_guard), "r"(data[0].x), "r"(data[0].y),
-          "r"(data[0].z), "r"(data[0].w), "r"(data[1].x), "r"(data[1].y),
-          "r"(data[1].z), "r"(data[1].w), "l"(((uint8_t *)ptr) + 16));
-  }
-};
-
-template <typename AccessType>
-struct global_load<AccessType,
-                   32,
-                   CacheOperation::LastUse
-                  > {
-  CUTLASS_DEVICE
-  global_load(AccessType &D, void const *ptr, bool pred_guard) {
-  uint4 *data = reinterpret_cast<uint4 *>(&D);
-
-    asm volatile(
-        "{\n"
-        "  .reg .pred p;\n"
-        "  setp.ne.b32 p, %9, 0;\n"
-        "  mov.b32 %0, %10;\n"
-        "  mov.b32 %1, %11;\n"
-        "  mov.b32 %2, %12;\n"
-        "  mov.b32 %3, %13;\n"
-        "  mov.b32 %4, %14;\n"
-        "  mov.b32 %5, %15;\n"
-        "  mov.b32 %6, %16;\n"
-        "  mov.b32 %7, %17;\n"
-        "  @p ld.global.lu.v4.u32 {%0, %1, %2, %3}, [%8];\n"
-        "  @p ld.global.lu.v4.u32 {%4, %5, %6, %7}, [%18];\n"
-        "}\n"
-        : "=r"(data[0].x), "=r"(data[0].y), "=r"(data[0].z), "=r"(data[0].w),
-          "=r"(data[1].x), "=r"(data[1].y), "=r"(data[1].z), "=r"(data[1].w)
-        : "l"(ptr), "r"((int)pred_guard), "r"(data[0].x), "r"(data[0].y),
-          "r"(data[0].z), "r"(data[0].w), "r"(data[1].x), "r"(data[1].y),
-          "r"(data[1].z), "r"(data[1].w), "l"(((uint8_t *)ptr) + 16));
-  }
-};
-
-template <typename AccessType>
-struct global_load<AccessType,
-                   16,
-                   CacheOperation::Always
-                  > {
-  CUTLASS_DEVICE
-  global_load(AccessType &D, void const *ptr, bool pred_guard) {
-  uint4 &data = reinterpret_cast<uint4 &>(D);
-    asm volatile(
-        "{\n"
-        "  .reg .pred p;\n"
-        "  setp.ne.b32 p, %5, 0;\n"
-        "  mov.b32 %0, %6;\n"
-        "  mov.b32 %1, %7;\n"
-        "  mov.b32 %2, %8;\n"
-        "  mov.b32 %3, %9;\n"
-#if CUTLASS_ENABLE_L2_PREFETCH
-        "  @p ld.global.L2::128B.v4.u32 {%0, %1, %2, %3}, [%4];\n"
-#else
-        "  @p ld.global.v4.u32 {%0, %1, %2, %3}, [%4];\n"
-#endif
-        "}\n"
-        : "=r"(data.x), "=r"(data.y), "=r"(data.z), "=r"(data.w)
-        : "l"(ptr), "r"((int)pred_guard), "r"(data.x), "r"(data.y), "r"(data.z), "r"(data.w));
-  }
-};
-
-template <typename AccessType>
-struct global_load<AccessType,
-                   16,
-                   CacheOperation::LastUse
-                  > {
-  CUTLASS_DEVICE
-  global_load(AccessType &D, void const *ptr, bool pred_guard) {
-  uint4 &data = reinterpret_cast<uint4 &>(D);
-    asm volatile(
-        "{\n"
-        "  .reg .pred p;\n"
-        "  setp.ne.b32 p, %5, 0;\n"
-        "  mov.b32 %0, %6;\n"
-        "  mov.b32 %1, %7;\n"
-        "  mov.b32 %2, %8;\n"
-        "  mov.b32 %3, %9;\n"
-        "  @p ld.global.lu.v4.u32 {%0, %1, %2, %3}, [%4];\n"
-        "}\n"
-        : "=r"(data.x), "=r"(data.y), "=r"(data.z), "=r"(data.w)
-        : "l"(ptr), "r"((int)pred_guard), "r"(data.x), "r"(data.y), "r"(data.z), "r"(data.w));
-  }
-};
-
-template <typename AccessType>
-struct global_load<AccessType,
-                   8,
-                   CacheOperation::Always
-                  > {
-  CUTLASS_DEVICE
-  global_load(AccessType &D, void const *ptr, bool pred_guard) {
-  uint2 &data = reinterpret_cast<uint2 &>(D);
-
-    asm volatile(
-        "{\n"
-        "  .reg .pred p;\n"
-        "  setp.ne.b32 p, %3, 0;\n"
-        "  mov.b32 %0, %4;\n"
-        "  mov.b32 %1, %5;\n"
-#if CUTLASS_ENABLE_L2_PREFETCH
-        "  @p ld.global.L2::128B.v2.u32 {%0, %1}, [%2];\n"
-#else
-        "  @p ld.global.v2.u32 {%0, %1}, [%2];\n"
-#endif
-        "}\n"
-        : "=r"(data.x), "=r"(data.y)
-        : "l"(ptr), "r"((int)pred_guard), "r"(data.x), "r"(data.y));
-  }
-};
-
-template <typename AccessType>
-struct global_load<AccessType,
-                   8,
-                   CacheOperation::LastUse
-                  > {
-  CUTLASS_DEVICE
-  global_load(AccessType &D, void const *ptr, bool pred_guard) {
-  uint2 &data = reinterpret_cast<uint2 &>(D);
-
-    asm volatile(
-        "{\n"
-        "  .reg .pred p;\n"
-        "  setp.ne.b32 p, %3, 0;\n"
-        "  mov.b32 %0, %4;\n"
-        "  mov.b32 %1, %5;\n"
-        "  @p ld.global.lu.v2.u32 {%0, %1}, [%2];\n"
-        "}\n"
-        : "=r"(data.x), "=r"(data.y)
-        : "l"(ptr), "r"((int)pred_guard), "r"(data.x), "r"(data.y));
-  }
-};
-
-template <typename AccessType>
-struct global_load<AccessType,
-                   4,
-                   CacheOperation::Always
-                  > {
-  CUTLASS_DEVICE
-  global_load(AccessType &D, void const *ptr, bool pred_guard) {
-  unsigned &data = reinterpret_cast<unsigned &>(D);
-
-    asm volatile(
-        "{\n"
-        "  .reg .pred p;\n"
-        "  setp.ne.b32 p, %2, 0;\n"
-        "  mov.b32 %0, %3;\n"
-#if CUTLASS_ENABLE_L2_PREFETCH
-        "  @p ld.global.L2::128B.u32 %0, [%1];\n"
-#else
-        "  @p ld.global.u32 %0, [%1];\n"
-#endif
-        "}\n"
-        : "=r"(data)
-        : "l"(ptr), "r"((int)pred_guard), "r"(data));
-  }
-};
-
-template <typename AccessType>
-struct global_load<AccessType,
-                   4,
-                   CacheOperation::LastUse
-                  > {
-  CUTLASS_DEVICE
-  global_load(AccessType &D, void const *ptr, bool pred_guard) {
-  unsigned &data = reinterpret_cast<unsigned &>(D);
-
-    asm volatile(
-        "{\n"
-        "  .reg .pred p;\n"
-        "  setp.ne.b32 p, %2, 0;\n"
-        "  mov.b32 %0, %3;\n"
-        "  @p ld.global.lu.u32 %0, [%1];\n"
-        "}\n"
-        : "=r"(data)
-        : "l"(ptr), "r"((int)pred_guard), "r"(data));
-  }
-};
-
-template <typename AccessType>
-struct global_load<AccessType,
-                   2,
-                   CacheOperation::Always
-                  > {
-  CUTLASS_DEVICE
-  global_load(AccessType &D, void const *ptr, bool pred_guard) {
-  uint16_t &data = reinterpret_cast<uint16_t &>(D);
-
-    asm volatile(
-        "{\n"
-        "  .reg .pred p;\n"
-        "  setp.ne.b32 p, %2, 0;\n"
-        "  mov.b16 %0, %3;\n"
-#if CUTLASS_ENABLE_L2_PREFETCH
-        "  @p ld.global.L2::128B.u16 %0, [%1];\n"
-#else
-        "  @p ld.global.u16 %0, [%1];\n"
-#endif
-        "}\n"
-        : "=h"(data)
-        : "l"(ptr), "r"((int)pred_guard), "h"(data));
-  }
-};
-
-template <typename AccessType>
-struct global_load<AccessType,
-                   2,
-                   CacheOperation::LastUse
-                  > {
-  CUTLASS_DEVICE
-  global_load(AccessType &D, void const *ptr, bool pred_guard) {
-  uint16_t &data = reinterpret_cast<uint16_t &>(D);
-
-    asm volatile(
-        "{\n"
-        "  .reg .pred p;\n"
-        "  setp.ne.b32 p, %2, 0;\n"
-        "  mov.b16 %0, %3;\n"
-        "  @p ld.global.lu.u16 %0, [%1];\n"
-        "}\n"
-        : "=h"(data)
-        : "l"(ptr), "r"((int)pred_guard), "h"(data));
-  }
-};
-
-template <typename AccessType>
-struct global_load<AccessType,
-                   1,
-                   CacheOperation::Always
-                  > {
-  CUTLASS_DEVICE
-  global_load(AccessType &D, void const *ptr, bool pred_guard) {
-    if (pred_guard) D = *(reinterpret_cast<AccessType const *>(ptr));
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Fragment type to store data
-    typename AccessType,
-    /// The bytes of storing
-    int StoreBytes
-    >
-struct global_store;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Specializations
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-template <typename AccessType>
-struct global_store<AccessType, 64> {
-  CUTLASS_DEVICE
-  global_store(AccessType const &D, void *ptr, bool pred_guard) {
-  uint4 const *data = reinterpret_cast<uint4 const *>(&D);
-
-  asm volatile(
-      "{\n"
-      "  .reg .pred p;\n"
-      "  setp.ne.b32 p, %5, 0;\n"
-      "  @p st.global.v4.u32 [%0], {%1, %2, %3, %4};\n"
-      "  @p st.global.v4.u32 [%6], {%7, %8, %9, %10};\n"
-      "  @p st.global.v4.u32 [%11], {%12, %13, %14, %15};\n"
-      "  @p st.global.v4.u32 [%16], {%17, %18, %19, %20};\n"
-      "}\n"
-      :
-      : "l"(ptr), "r"(data[0].x), "r"(data[0].y), "r"(data[0].z),
-        "r"(data[0].w), "r"((int)pred_guard), "l"(((uint8_t *)ptr) + 16),
-        "r"(data[1].x), "r"(data[1].y), "r"(data[1].z), "r"(data[1].w), 
-        "l"(((uint8_t *)ptr) + 32),
-        "r"(data[2].x), "r"(data[2].y), "r"(data[2].z), "r"(data[2].w),
-        "l"(((uint8_t *)ptr) + 48),
-        "r"(data[3].x), "r"(data[3].y), "r"(data[3].z), "r"(data[3].w));
-  }
-};
-
-
-template <typename AccessType>
-struct global_store<AccessType, 32> {
-  CUTLASS_DEVICE
-  global_store(AccessType const &D, void *ptr, bool pred_guard) {
-  uint4 const *data = reinterpret_cast<uint4 const *>(&D);
-
-  asm volatile(
-      "{\n"
-      "  .reg .pred p;\n"
-      "  setp.ne.b32 p, %5, 0;\n"
-      "  @p st.global.v4.u32 [%0], {%1, %2, %3, %4};\n"
-      "  @p st.global.v4.u32 [%6], {%7, %8, %9, %10};\n"
-      "}\n"
-      :
-      : "l"(ptr), "r"(data[0].x), "r"(data[0].y), "r"(data[0].z),
-        "r"(data[0].w), "r"((int)pred_guard), "l"(((uint8_t *)ptr) + 16),
-        "r"(data[1].x), "r"(data[1].y), "r"(data[1].z), "r"(data[1].w));
-  }
-};
-
-template <typename AccessType>
-struct global_store<AccessType, 16> {
-  CUTLASS_DEVICE
-  global_store(AccessType const &D, void *ptr, bool pred_guard) {
-  uint4 const &data = reinterpret_cast<uint4 const &>(D);
-  asm volatile(
-      "{\n"
-      "  .reg .pred p;\n"
-      "  setp.ne.b32 p, %5, 0;\n"
-      "  @p st.global.v4.u32 [%0], {%1, %2, %3, %4};\n"
-      "}\n"
-      :
-      : "l"(ptr), "r"(data.x), "r"(data.y), "r"(data.z), "r"(data.w), "r"((int)pred_guard));
-  }
-};
-
-template <typename AccessType>
-struct global_store<AccessType, 8> {
-  CUTLASS_DEVICE
-  global_store(AccessType const &D, void *ptr, bool pred_guard) {
-  uint2 const &data = reinterpret_cast<uint2 const &>(D);
-  asm volatile(
-      "{\n"
-      "  .reg .pred p;\n"
-      "  setp.ne.b32 p, %3, 0;\n"
-      "  @p st.global.v2.u32 [%0], {%1, %2};\n"
-      "}\n"
-      :
-      : "l"(ptr), "r"(data.x), "r"(data.y), "r"((int)pred_guard));
-  }
-};
-
-template <typename AccessType>
-struct global_store<AccessType, 4> {
-  CUTLASS_DEVICE
-  global_store(AccessType const &D, void *ptr, bool pred_guard) {
-  uint32_t const &data = reinterpret_cast<uint32_t const &>(D);
-  asm volatile(
-      "{\n"
-      "  .reg .pred p;\n"
-      "  setp.ne.b32 p, %2, 0;\n"
-      "  @p st.global.u32 [%0], %1;\n"
-      "}\n"
-      :
-      : "l"(ptr), "r"(data), "r"((int)pred_guard));
-  }
-};
-
-template <typename AccessType>
-struct global_store<AccessType, 2> {
-  CUTLASS_DEVICE
-  global_store(AccessType const &D, void *ptr, bool pred_guard) {
-  uint16_t const &data = reinterpret_cast<uint16_t const &>(D);
-  asm volatile(
-      "{\n"
-      "  .reg .pred p;\n"
-      "  setp.ne.b32 p, %2, 0;\n"
-      "  @p st.global.u16 [%0], %1;\n"
-      "}\n"
-      :
-      : "l"(ptr), "h"(data), "r"((int)pred_guard));
-  }
-};
-
-template <typename AccessType>
-struct global_store<AccessType, 1> {
-  CUTLASS_DEVICE
-  global_store(AccessType const &D, void *ptr, bool pred_guard) {
-    if (pred_guard) *(reinterpret_cast<AccessType *>(ptr)) = D;
-  }
-};
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// ld.shared
-template <int Bytes>
-CUTLASS_DEVICE
-void shared_load(void *dst, uint32_t ptr);
-
-/// ld.shared - 16b
-template <>
-CUTLASS_DEVICE
-void shared_load<2>(void *dst, uint32_t ptr) {
-  asm volatile("ld.shared.u16 %0, [%1];\n"
-    : "=h"(*reinterpret_cast<uint16_t *>(dst))
-    : "r"(ptr));
-}
-
-/// ld.shared - 32b
-template <>
-CUTLASS_DEVICE
-void shared_load<4>(void *dst, uint32_t ptr) {
-  asm volatile("ld.shared.u32 %0, [%1];\n"
-    : "=r"(*reinterpret_cast<uint32_t *>(dst))
-    : "r"(ptr));
-}
-
-/// ld.shared - 64b
-template <>
-CUTLASS_DEVICE
-void shared_load<8>(void *dst, uint32_t ptr) {
-  uint2 *dst_u64 = reinterpret_cast<uint2 *>(dst);
-  asm volatile("ld.shared.v2.u32 {%0, %1}, [%2];\n"
-    :
-      "=r"(dst_u64->x),
-      "=r"(dst_u64->y)
-    : "r"(ptr));
-}
-
-/// ld.shared - 128b
-template <>
-CUTLASS_DEVICE
-void shared_load<16>(void *dst, uint32_t ptr) {
-  uint4 *dst_u128 = reinterpret_cast<uint4 *>(dst);
-  asm volatile("ld.shared.v4.u32 {%0, %1, %2, %3}, [%4];\n"
-    :
-      "=r"(dst_u128->x),
-      "=r"(dst_u128->y),
-      "=r"(dst_u128->z),
-      "=r"(dst_u128->w)
-    : "r"(ptr));
-}
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// st.shared
-template <int Bytes>
-CUTLASS_DEVICE
-void shared_store(uint32_t ptr, void const *src);
-
-/// st.shared - 16b
-template <>
-CUTLASS_DEVICE
-void shared_store<2>(uint32_t ptr, void const *src) {
-  asm volatile("st.shared.u16 [%0], %1;\n"
-    : :
-    "r"(ptr),
-    "h"(*reinterpret_cast<uint16_t const *>(src))
-  );
-}
-
-/// st.shared - 32b
-template <>
-CUTLASS_DEVICE
-void shared_store<4>(uint32_t ptr, void const *src) {
-  asm volatile("st.shared.u32 [%0], %1;\n"
-    : :
-    "r"(ptr),
-    "r"(*reinterpret_cast<uint32_t const  *>(src))
-  );
-}
-
-/// st.shared - 64b
-template <>
-CUTLASS_DEVICE
-void shared_store<8>(uint32_t ptr, void const *src) {
-  uint2 const *dst_u64 = reinterpret_cast<uint2 const *>(src);
-  asm volatile("st.shared.v2.u32 [%0], {%1, %2};\n"
-    : :
-      "r"(ptr),
-      "r"(dst_u64->x),
-      "r"(dst_u64->y)
-    );
-}
-
-/// st.shared - 128b
-template <>
-CUTLASS_DEVICE
-void shared_store<16>(uint32_t ptr, void const *src) {
-  uint4 const *dst_u128 = reinterpret_cast<uint4 const *>(src);
-  asm volatile("st.shared.v4.u32 [%0], {%1, %2, %3, %4};\n"
-    : :
-      "r"(ptr),
-      "r"(dst_u128->x),
-      "r"(dst_u128->y),
-      "r"(dst_u128->z),
-      "r"(dst_u128->w)
-    );
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace arch
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-#include "cutlass/arch/memory_sm75.h"
-#include "cutlass/arch/memory_sm80.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/memory_sm75.h b/lightllm-kernel/cutlass/include/cutlass/arch/memory_sm75.h
deleted file mode 100755
index 6b487a737..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/arch/memory_sm75.h
+++ /dev/null
@@ -1,269 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Architecture-specific operators on memory added for SM75
-*/
-
-#pragma once
-
-#include "cutlass/array.h"
-#include "cutlass/layout/matrix.h"
-#include "cute/arch/copy_sm75.hpp"
-#include "cute/arch/util.hpp"
-
-namespace cutlass {
-namespace arch {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  /// Layout of destination matrix (column-major implies transpose)
-  typename Layout,
-  /// .x1, .x2, or .x4
-  int MatrixCount
->
-inline __device__ void ldsm(Array<unsigned, MatrixCount> & D, void const* ptr);
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Determine the appropriate way to target PTX's "ldmatrix" instruction.
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// CUTLASS helper to get SMEM pointer
-inline __device__ unsigned cutlass_get_smem_pointer(void *ptr) {
-  return cute::cast_smem_ptr_to_uint(ptr);
-}
-
-/// CUTLASS helper to get SMEM pointer
-inline __device__ unsigned cutlass_get_smem_pointer(void const *ptr) {
-  return cutlass_get_smem_pointer(const_cast<void *>(ptr));
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <>
-inline __device__ void ldsm<layout::RowMajor, 1>(
-    Array<unsigned, 1> & D,
-    void const* ptr) {
-
-  #if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED)
-
-    unsigned addr = cutlass_get_smem_pointer(ptr);
-
-    int x;
-    asm volatile ("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];" : "=r"(x) : "r"(addr));
-    reinterpret_cast<int &>(D) = x;
-
-  #else
-
-    CUTLASS_UNUSED(D);
-    CUTLASS_UNUSED(ptr);
-    CUTLASS_NOT_IMPLEMENTED();
-
-  #endif
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <>
-inline __device__ void ldsm<layout::RowMajor, 2>(
-    Array<unsigned, 2> & D,
-    void const* ptr) {
-
-  #if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED)
-
-    unsigned addr = cutlass_get_smem_pointer(ptr);
-
-    int x, y;
-    asm volatile ("ldmatrix.sync.aligned.x2.m8n8.shared.b16 {%0, %1}, [%2];" : "=r"(x), "=r"(y) : "r"(addr));
-    reinterpret_cast<int2 &>(D) = make_int2(x, y);
-
-  #else
-
-    CUTLASS_UNUSED(D);
-    CUTLASS_UNUSED(ptr);
-    CUTLASS_NOT_IMPLEMENTED();
-
-  #endif
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <>
-inline __device__ void ldsm<layout::RowMajor, 4>(
-    Array<unsigned, 4> & D,
-    void const* ptr) {
-
-  #if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED)
-
-    unsigned addr = cutlass_get_smem_pointer(ptr);
-
-    int x, y, z, w;
-    asm volatile ("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];" : "=r"(x), "=r"(y), "=r"(z), "=r"(w) : "r"(addr));
-    reinterpret_cast<int4 &>(D) = make_int4(x, y, z, w);
-
-  #else
-
-    CUTLASS_UNUSED(D);
-    CUTLASS_UNUSED(ptr);
-    CUTLASS_NOT_IMPLEMENTED();
-
-  #endif
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Transpose on 16b granularity
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <>
-inline __device__ void ldsm<layout::ColumnMajor, 1>(
-    Array<unsigned, 1> & D,
-    void const* ptr) {
-
-  #if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED)
-
-    unsigned addr = cutlass_get_smem_pointer(ptr);
-
-    int x;
-    asm volatile ("ldmatrix.sync.aligned.x1.trans.m8n8.shared.b16 {%0}, [%1];" : "=r"(x) : "r"(addr));
-    reinterpret_cast<int &>(D) = x;
-
-  #else
-
-    CUTLASS_UNUSED(D);
-    CUTLASS_UNUSED(ptr);
-    CUTLASS_NOT_IMPLEMENTED();
-
-  #endif
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <>
-inline __device__ void ldsm<layout::ColumnMajor, 2>(
-    Array<unsigned, 2> & D,
-    void const* ptr) {
-
-  #if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED)
-
-    unsigned addr = cutlass_get_smem_pointer(ptr);
-
-    int x, y;
-    asm volatile ("ldmatrix.sync.aligned.x2.trans.m8n8.shared.b16 {%0, %1}, [%2];" : "=r"(x), "=r"(y) : "r"(addr));
-    reinterpret_cast<int2 &>(D) = make_int2(x, y);
-
-  #else
-
-    CUTLASS_UNUSED(D);
-    CUTLASS_UNUSED(ptr);
-    CUTLASS_NOT_IMPLEMENTED();
-
-  #endif
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <>
-inline __device__ void ldsm<layout::ColumnMajor, 4>(
-    Array<unsigned, 4> & D,
-    void const* ptr) {
-
-  #if defined(CUTE_ARCH_LDSM_SM75_ACTIVATED)
-
-    unsigned addr = cutlass_get_smem_pointer(ptr);
-
-    int x, y, z, w;
-    asm volatile ("ldmatrix.sync.aligned.x4.trans.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];" : "=r"(x), "=r"(y), "=r"(z), "=r"(w) : "r"(addr));
-    reinterpret_cast<int4 &>(D) = make_int4(x, y, z, w);
-
-  #else
-
-    CUTLASS_UNUSED(D);
-    CUTLASS_UNUSED(ptr);
-    CUTLASS_NOT_IMPLEMENTED();
-
-  #endif
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename AccessType, int Bytes>
-struct shared_load_op {
-  CUTLASS_DEVICE
-  shared_load_op(AccessType &D, void const *ptr) {
-    D = *reinterpret_cast<AccessType const *>(ptr);  
-  }
-};
-
-template <typename AccessType>
-CUTLASS_DEVICE void shared_load(AccessType &D, void const *ptr) {
-  shared_load_op<AccessType, int(sizeof(AccessType))>(D, ptr);
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename AccessType>
-struct shared_load_op<AccessType, 16> {
-  CUTLASS_DEVICE
-  shared_load_op(AccessType &D, void const *ptr) {
-    unsigned addr = cutlass_get_smem_pointer(ptr);
-
-    uint4 v;
-    asm volatile ("ld.shared.v4.b32 {%0, %1, %2, %3}, [%4];" : 
-      "=r"(v.x), "=r"(v.y), "=r"(v.z), "=r"(v.w) : "r"(addr));
-
-    D = reinterpret_cast<AccessType const &>(v);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename AccessType>
-struct shared_load_op<AccessType, 8> {
-  CUTLASS_DEVICE
-  shared_load_op(AccessType &D, void const *ptr) {
-    unsigned addr = cutlass_get_smem_pointer(ptr);
-
-    uint2 v;
-    asm volatile ("ld.shared.v2.b32 {%0, %1}, [%2];" : 
-      "=r"(v.x), "=r"(v.y) : "r"(addr));
-
-    D = reinterpret_cast<AccessType const &>(v);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace arch
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/memory_sm80.h b/lightllm-kernel/cutlass/include/cutlass/arch/memory_sm80.h
deleted file mode 100755
index cb0ba4b54..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/arch/memory_sm80.h
+++ /dev/null
@@ -1,472 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Architecture-specific operators on memory added for SM80
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/complex.h"
-#include "cutlass/arch/memory.h"
-#include "cutlass/arch/memory_sm75.h"
-#include "cutlass/arch/cache_operation.h"
-
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-  #define CUDA_CP_ASYNC_ACTIVATED 1
-#else
-  #define CUDA_CP_ASYNC_ACTIVATED 0
-#endif
-
-namespace cutlass {
-namespace arch {
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Initiates an asynchronous copy from global memory to shared memory.
-///
-/// cp.async
-///
-template <
-    /// Size of the access in bytes
-    int SizeInBytes,
-    /// Cache operation
-    CacheOperation::Kind cache_op = CacheOperation::Always>
-struct cp_async;
-
-/// Initiates an asynchronous copy from global memory to shared memory. Rather than predicate
-/// the entire transfer, zeros are written to SMEM if the guard predicate is false.
-///
-/// cp.async
-///
-template <
-    /// Size of the access in bytes
-    int SizeInBytes,
-    /// Cache operation
-    CacheOperation::Kind cache_op = CacheOperation::Always>
-struct cp_async_zfill;
-
-/// Initiates an asynchronous copy from global memory to shared memory. Rather than predicate
-/// the entire transfer, nans (0x7eff) are written to SMEM if the guard predicate is false.
-///
-/// cp.async
-///
-template <
-    /// Size of the access in bytes
-    int SizeInBytes,
-    /// Cache operation
-    CacheOperation::Kind cache_op = CacheOperation::Always>
-struct cp_async_nan;
-
-/// Either 0 or 1 are written to SMEM based on input element type
-/// Used for diagonal elements of triangular matrix of BLAS3 functions
-///
-/// st.shared
-///
-template <
-   /// Type of Element
-   typename Element,
-   /// If the data is for a Hermitian matrix diagonal
-   bool IsHermitianData = false>
-struct cp_async_diag;
-
-static const uint32_t OOB_NAN_F16 = 0x7eff;
-static const uint32_t OOB_NAN_F16x2 = ((OOB_NAN_F16 << 16) | OOB_NAN_F16);
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization
-template <
-    /// Size of the access in bytes
-    int SizeInBytes>
-struct cp_async<SizeInBytes, CacheOperation::Always> {
-
-  /// Copy
-  CUTLASS_DEVICE
-  cp_async(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
-    #if CUDA_CP_ASYNC_ACTIVATED
-
-      // Make sure the size is supported.
-      static_assert((SizeInBytes == 4 || SizeInBytes == 8 || SizeInBytes == 16),
-                "Size is not supported");
-
-      unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);
-
-      asm volatile(
-          "{\n"
-          "  .reg .pred p;\n"
-          "  setp.ne.b32 p, %0, 0;\n"
-#if CUTLASS_ENABLE_L2_PREFETCH
-          "  @p cp.async.ca.shared.global.L2::128B [%1], [%2], %3;\n"
-#else
-          "  @p cp.async.ca.shared.global [%1], [%2], %3;\n"
-#endif
-          "}\n" ::"r"((int)pred_guard),
-          "r"(smem_int_ptr), "l"(global_ptr), "n"(SizeInBytes));
-
-    #else
-      using AccessType  = Array<uint8_t, SizeInBytes>;
-
-      if (pred_guard) {
-        *static_cast<AccessType *>(smem_ptr) = *static_cast<AccessType const *>(global_ptr);
-      }
-    #endif
-  }
-};
-
-/// Partial specialization
-template <
-    /// Size of the access in bytes
-    int SizeInBytes>
-struct cp_async_zfill<SizeInBytes, CacheOperation::Always> {
-
-  /// Copy with zero fill
-  CUTLASS_DEVICE
-  cp_async_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard) {
-    #if CUDA_CP_ASYNC_ACTIVATED
-
-      // Make sure the size is supported.
-      static_assert((SizeInBytes == 4 || SizeInBytes == 8 || SizeInBytes == 16),
-                "Size is not supported");
-
-      unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);
-      int src_in_bytes = (pred_guard ? SizeInBytes : 0);
-
-      asm volatile(
-#if CUTLASS_ENABLE_L2_PREFETCH
-        "cp.async.ca.shared.global.L2::128B [%0], [%1], %2, %3;\n" ::"r"(smem_int_ptr),
-#else
-        "cp.async.ca.shared.global [%0], [%1], %2, %3;\n" ::"r"(smem_int_ptr),
-#endif
-        "l"(global_ptr), "n"(SizeInBytes), "r"(src_in_bytes));
-
-    #else
-      using AccessType  = Array<uint8_t, SizeInBytes>;
-
-      if (pred_guard) {
-        *static_cast<AccessType *>(smem_ptr) = *static_cast<AccessType const *>(global_ptr);
-      }
-      else {
-        AccessType zeros;
-        zeros.clear();
-        *static_cast<AccessType *>(smem_ptr) = zeros;
-      }
-    #endif
-  }
-};
-
-/// Partial specialization
-template <>
-struct cp_async_nan<16, CacheOperation::Always> {
-  static int const kSizeInBytes = 16;
-
-  /// Copy with nan fill
-  CUTLASS_DEVICE
-  cp_async_nan(void *smem_ptr, void const *global_ptr, bool pred_guard) {
-    #if CUDA_CP_ASYNC_ACTIVATED
-
-      static __constant__ uint4 OOB_NAN_F16x8 = {OOB_NAN_F16x2, OOB_NAN_F16x2,
-                                                 OOB_NAN_F16x2, OOB_NAN_F16x2};
-
-      unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);
-
-      asm volatile(
-          "{\n"
-          "  .reg .pred p;\n"
-          "  setp.ne.b32 p, %0, 0;\n"
-#if CUTLASS_ENABLE_L2_PREFETCH
-          "  @p cp.async.ca.shared.global.L2::128B [%1], [%2], %3;\n"
-#else
-          "  @p cp.async.ca.shared.global [%1], [%2], %3;\n"
-#endif
-          "  @!p st.shared.v4.u32 [%1], {%4, %5, %6, %7};\n"
-          "}\n"
-          :
-          : "r"((int)pred_guard), "r"(smem_int_ptr), "l"(global_ptr),
-            "n"(kSizeInBytes), "r"(OOB_NAN_F16x8.x), "r"(OOB_NAN_F16x8.y), "r"(OOB_NAN_F16x8.z),
-            "r"(OOB_NAN_F16x8.w));
-
-    #else
-
-      CUTLASS_UNUSED(smem_ptr);
-      CUTLASS_UNUSED(global_ptr);
-      CUTLASS_UNUSED(pred_guard);
-      CUTLASS_NOT_IMPLEMENTED();
-
-    #endif
-  }
-};
-
-/// Partial specialization to write one (1)
-template<typename Element_>
-struct cp_async_diag <Element_, false> {
-  using Element = Element_;
-
-  CUTLASS_DEVICE
-  cp_async_diag(void *smem_ptr) {
-    #if CUDA_CP_ASYNC_ACTIVATED
-
-      /// Values for the diagonal elements of the triangular input matrix
-      static __constant__ uint2 DIAG_DATA_DOUBLE_ONE = {0x3ff00000, 0x00000000};
-      static __constant__ uint1 DIAG_DATA_FLOAT_ONE = {0x3f800000};
-      static __constant__ uint1 DIAG_DATA_ZERO = {0x00000000};
-
-      unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);
-
-      if (platform::is_same<Element, complex<double>>::value) {
-        asm volatile("st.shared.v4.u32 [%0], {%1, %2, %3, %4};\n"
-                      : :
-                      "r"(smem_int_ptr), "r"(DIAG_DATA_DOUBLE_ONE.y), "r"(DIAG_DATA_DOUBLE_ONE.x),
-                      "r"(DIAG_DATA_ZERO.x), "r"(DIAG_DATA_ZERO.x));
-      } else if (platform::is_same<Element, complex<float>>::value) {
-        asm volatile("st.shared.v2.u32 [%0], {%1, %2};\n"
-                      : :
-                      "r"(smem_int_ptr), "r"(DIAG_DATA_FLOAT_ONE.x), "r"(DIAG_DATA_ZERO.x));
-      } else if (platform::is_same<Element, double>::value) {
-        asm volatile("st.shared.v2.u32 [%0], {%1, %2};\n"
-                      : :
-                      "r"(smem_int_ptr), "r"(DIAG_DATA_DOUBLE_ONE.y),"r"(DIAG_DATA_DOUBLE_ONE.x));
-      } else if (platform::is_same<Element, float>::value) {
-        asm volatile("st.shared.u32 [%0], %1;\n"
-                      : :
-                      "r"(smem_int_ptr), "r"(DIAG_DATA_FLOAT_ONE.x));
-      } else {
-        CUTLASS_UNUSED(smem_int_ptr);
-        CUTLASS_NOT_IMPLEMENTED();
-      }
-      
-    #else
-
-      CUTLASS_UNUSED(smem_ptr);
-      CUTLASS_NOT_IMPLEMENTED();
-
-    #endif
-  }
-};
-
-/// Partial specialization to write zero for the imaginary part of Hermitian data
-template<typename Element_>
-struct cp_async_diag <Element_, true> {
-  using Element = Element_;
-
-  CUTLASS_DEVICE
-  cp_async_diag(void *smem_ptr) {
-    #if CUDA_CP_ASYNC_ACTIVATED
-
-      /// Values for the diagonal elements of the triangular input matrix
-      static __constant__ uint1 DIAG_DATA_ZERO = {0x00000000};
-
-      unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);
-
-      if (platform::is_same<Element, complex<double>>::value) {
-        asm volatile("st.shared.v2.u32 [%0], {%1, %2};\n"
-                      : :
-                      "r"(smem_int_ptr), "r"(DIAG_DATA_ZERO.x), "r"(DIAG_DATA_ZERO.x));
-      } else if (platform::is_same<Element, complex<float>>::value) {
-        asm volatile("st.shared.u32 [%0], %1;\n"
-                      : :
-                      "r"(smem_int_ptr), "r"(DIAG_DATA_ZERO.x));
-      } else {
-        CUTLASS_UNUSED(smem_int_ptr);
-        CUTLASS_NOT_IMPLEMENTED();
-      }
-      
-    #else
-
-      CUTLASS_UNUSED(smem_ptr);
-      CUTLASS_NOT_IMPLEMENTED();
-
-    #endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization
-template <
-    /// Size of the access in bytes
-    int SizeInBytes>
-struct cp_async<SizeInBytes, CacheOperation::Global> {
-
-  /// Copy
-  CUTLASS_DEVICE
-  cp_async(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
-    #if CUDA_CP_ASYNC_ACTIVATED
-
-      static_assert(SizeInBytes == 16,
-        "cp.async only supports CacheOperation::Global when access size is 16B.");
-
-      unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);
-      cutlass::arch::synclog_emit_cp_async(__LINE__, smem_int_ptr, global_ptr, pred_guard, SizeInBytes);
-
-      asm volatile(
-          "{\n"
-          "  .reg .pred p;\n"
-          "  setp.ne.b32 p, %0, 0;\n"
-#if CUTLASS_ENABLE_L2_PREFETCH
-          "  @p cp.async.cg.shared.global.L2::128B [%1], [%2], %3;\n"
-#else
-          "  @p cp.async.cg.shared.global [%1], [%2], %3;\n"
-#endif
-          "}\n" ::"r"((int)pred_guard),
-          "r"(smem_int_ptr), "l"(global_ptr), "n"(SizeInBytes));
-
-    #else
-      using AccessType  = Array<uint8_t, SizeInBytes>;
-
-      if (pred_guard) {
-        *static_cast<AccessType *>(smem_ptr) = *static_cast<AccessType const *>(global_ptr);
-      }
-    #endif
-  }
-};
-
-/// Partial specialization
-template <
-    /// Size of the access in bytes
-    int SizeInBytes>
-struct cp_async_zfill<SizeInBytes, CacheOperation::Global> {
-
-  /// Copy with zero fill
-  CUTLASS_DEVICE
-  cp_async_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
-    #if CUDA_CP_ASYNC_ACTIVATED
-
-      static_assert(SizeInBytes == 16,
-        "cp.async only supports CacheOperation::Global when access size is 16B.");
-
-      unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);
-      int src_in_bytes = (pred_guard ? SizeInBytes : 0);
-      cutlass::arch::synclog_emit_cp_async_zfill(__LINE__, smem_int_ptr, global_ptr, pred_guard, SizeInBytes);
-
-      asm volatile(
-#if CUTLASS_ENABLE_L2_PREFETCH
-        "cp.async.cg.shared.global.L2::128B [%0], [%1], %2, %3;\n" ::"r"(smem_int_ptr),
-#else
-        "cp.async.cg.shared.global [%0], [%1], %2, %3;\n" ::"r"(smem_int_ptr),
-#endif
-        "l"(global_ptr), "n"(SizeInBytes), "r"(src_in_bytes));
-
-    #else
-      using AccessType  = Array<uint8_t, SizeInBytes>;
-
-      if (pred_guard) {
-        *static_cast<AccessType *>(smem_ptr) = *static_cast<AccessType const *>(global_ptr);
-      }
-      else {
-        AccessType zeros;
-        zeros.clear();
-        *static_cast<AccessType *>(smem_ptr) = zeros;
-      }
-    #endif
-  }
-};
-
-/// Partial specialization
-template <>
-struct cp_async_nan<16, CacheOperation::Global> {
-  static int const kSizeInBytes = 16;
-
-  /// Copy with nan fill
-  CUTLASS_DEVICE
-  cp_async_nan(void *smem_ptr, void const *global_ptr, bool pred_guard) {
-    #if CUDA_CP_ASYNC_ACTIVATED
-
-      static __constant__ uint4 OOB_NAN_F16x8 = {OOB_NAN_F16x2, OOB_NAN_F16x2,
-                                                 OOB_NAN_F16x2, OOB_NAN_F16x2};
-
-      unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);
-      cutlass::arch::synclog_emit_cp_async_nan(__LINE__, smem_int_ptr, global_ptr, pred_guard);
-
-      asm volatile(
-          "{\n"
-          "  .reg .pred p;\n"
-          "  setp.ne.b32 p, %0, 0;\n"
-#if CUTLASS_ENABLE_L2_PREFETCH
-          "  @p cp.async.cg.shared.global.L2::128B [%1], [%2], %3;\n"
-#else
-          "  @p cp.async.cg.shared.global [%1], [%2], %3;\n"
-#endif
-          "  @!p st.shared.v4.u32 [%1], {%4, %5, %6, %7};\n"
-          "}\n"
-          :
-          : "r"((int)pred_guard), "r"(smem_int_ptr), "l"(global_ptr),
-            "n"(kSizeInBytes), "r"(OOB_NAN_F16x8.x), "r"(OOB_NAN_F16x8.y), "r"(OOB_NAN_F16x8.z),
-            "r"(OOB_NAN_F16x8.w));
-
-    #else
-
-      CUTLASS_UNUSED(smem_ptr);
-      CUTLASS_UNUSED(global_ptr);
-      CUTLASS_UNUSED(pred_guard);
-      CUTLASS_NOT_IMPLEMENTED();
-
-    #endif
-  }
-};
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Establishes an ordering w.r.t previously issued cp.async instructions. Does not block.
-CUTLASS_DEVICE
-void cp_async_fence() {
-  #if CUDA_CP_ASYNC_ACTIVATED
-  asm volatile("cp.async.commit_group;\n" ::);
-  cutlass::arch::synclog_emit_cp_async_fence(__LINE__);
-  #endif
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Blocks until all but <N> previous cp.async.commit_group operations have committed.
-template <int N>
-CUTLASS_DEVICE void cp_async_wait() {
-  #if CUDA_CP_ASYNC_ACTIVATED
-  asm volatile("cp.async.wait_group %0;\n" ::"n"(N));
-  cutlass::arch::synclog_emit_cp_async_wait(__LINE__, N);
-  #endif
-}
-
-/// Blocks until all previous cp.async.commit_group operations have committed.
-template <>
-CUTLASS_DEVICE void cp_async_wait<0>() {
-  #if CUDA_CP_ASYNC_ACTIVATED
-  asm volatile("cp.async.wait_all;\n" ::);
-  cutlass::arch::synclog_emit_cp_async_wait_all(__LINE__);
-  #endif
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace arch
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/mma.h b/lightllm-kernel/cutlass/include/cutlass/arch/mma.h
deleted file mode 100755
index 007ba19be..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/arch/mma.h
+++ /dev/null
@@ -1,269 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates exposing architecture support for multiply-add operations
-*/
-
-#pragma once
-
-#include "cutlass/array.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/functional.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/arch/arch.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace arch {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Tag indicating the operation implied by MMA.
-struct OpMultiplyAdd {};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Tag indicating the result is saturated to MAX_FLOAT|MIN_FLOAT or MAX_INT|MIN_INT
-struct OpMultiplyAddSaturate {};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Tag indicating the input is converted to a narrower type (BF16)
-struct OpMultiplyAddFastBF16 {};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Tag indicating the input is converted to a narrower type (F16)
-struct OpMultiplyAddFastF16 {};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Tag indicating the input data types are mixed and the narrower type is 
-/// upcasted to the wider type
-struct OpMultiplyAddMixedInputUpcast {};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Tag indicating the input is converted to 2 (big and small) TF32 components
-//  Perform 3xTF32 or 4xTF32 for every F32 output element
-struct OpMultiplyAddFastF32 {};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Tag indicating the input is converted to 2 (big and small) TF32 components
-//  Perform 3xTF32 or 4xTF32 for every complex<F32> output element
-struct OpMultiplyAddComplexFastF32 {};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Tag indicating that staged accumulation is not to be used. This is valid only for SM89
-/// FP8 kernels.
-struct OpMultiplyAddFastAccum;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Tag indicating the complex multiply-add operation
-struct OpMultiplyAddComplex {};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Tag indicating the gaussian complex multiply-add operation
-struct OpMultiplyAddGaussianComplex {};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Tag indicating the inner product is defined by (XOR, POPC)
-struct OpXorPopc {};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Tag indicating the inner product is defined by (AND, POPC)
-struct OpAndPopc {};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Tag classifying math operators as thread-level operations.
-struct OpClassSimt {};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Tag classifying operators as Tensor Core operations.
-struct OpClassTensorOp {};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Tag classifying operators as WMMA Tensor Core operations
-struct OpClassWmmaTensorOp {};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Tag classifying operators as Tensor Core with structure sparse operations.
-struct OpClassSparseTensorOp {};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation
-template <
-  /// Size of the matrix product (concept: GemmShape)
-  typename Shape_,
-  /// Number of threads participating
-  int kThreads_,
-  /// Data type of A elements
-  typename ElementA,
-  /// Layout of A matrix (concept: MatrixLayout)
-  typename LayoutA,
-  /// Data type of B elements
-  typename ElementB,
-  /// Layout of B matrix (concept: MatrixLayout)
-  typename LayoutB,
-  /// Element type of C matrix
-  typename ElementC,
-  /// Layout of C matrix (concept: MatrixLayout)
-  typename LayoutC,
-  /// Inner product operator
-  typename Operator
->
-struct Mma;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation - specialized for 1x1x1x1 matrix multiply operation
-template <
-  /// Data type of A elements
-  typename ElementA,
-  /// Layout of A matrix (concept: MatrixLayout)
-  typename LayoutA,
-  /// Data type of B elements
-  typename ElementB,
-  /// Layout of B matrix (concept: MatrixLayout)
-  typename LayoutB,
-  /// Element type of C matrix
-  typename ElementC_,
-  /// Layout of C matrix (concept: MatrixLayout)
-  typename LayoutC,
-  /// Inner product operator
-  typename Operator_
->
-struct Mma<gemm::GemmShape<1, 1, 1>, 1, ElementA, LayoutA, ElementB, LayoutB, ElementC_, LayoutC, Operator_> {
-
-  using Shape = gemm::GemmShape<1, 1, 1>;
-  using Operator = Operator_;
-  using ElementC = ElementC_;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    Array<ElementC, 1> &d,
-    Array<ElementA, 1> const &a,
-    Array<ElementB, 1> const &b,
-    Array<ElementC, 1> const &c
-  ) {
-
-    multiply_add<ElementA, ElementB, ElementC> op;
-
-    d[0] = op(a[0], b[0], c[0]);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Specifies internal data type for computation
-struct SPFormatType {
-  enum Kind {
-    Thread
-  };
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation
-template <
-  /// Size of the matrix product (concept: GemmShape)
-  typename Shape_,
-  /// Number of threads participating
-  int kThreads_,
-  /// Data type of A elements
-  typename ElementA,
-  /// Layout of A matrix (concept: MatrixLayout)
-  typename LayoutA,
-  /// Data type of B elements
-  typename ElementB,
-  /// Layout of B matrix (concept: MatrixLayout)
-  typename LayoutB,
-  /// Element type of C matrix
-  typename ElementC,
-  /// Layout of C matrix (concept: MatrixLayout)
-  typename LayoutC,
-  /// Inner product operator
-  typename Operator,
-  /// Specifies meta data format
-  SPFormatType::Kind SPFormat = SPFormatType::Thread
->
-struct SparseMma;
-
-} // namespace arch
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-//
-// Specializations for each compute capability
-//
-
-#include "cutlass/arch/mma_sm50.h"
-#include "cutlass/arch/mma_sm60.h"
-#include "cutlass/arch/mma_sm61.h"
-#include "cutlass/arch/mma_sm70.h"
-#include "cutlass/arch/mma_sm75.h"
-#include "cutlass/arch/mma_sm80.h"
-#include "cutlass/arch/mma_sparse_sm80.h"
-#include "cutlass/arch/mma_sm89.h"
-#include "cutlass/arch/mma_sparse_sm89.h"
-#include "cutlass/arch/mma_sm90.h"
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace arch {
-namespace detail {
-/// Helper for determining whether staged accumulation should be used for a given operator
-template <typename Operator>
-struct UseStagedAccumulation {
-  static bool const value = platform::is_same<typename Operator::MathOperator, OpMultiplyAddFastF32>::value ||
-                            platform::is_same<typename Operator::MathOperator, OpMultiplyAddComplexFastF32>::value ||
-                            is_sm89_staged_policy_v<Operator>;
-};
-} // namespace detail
-} // namespace arch
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm50.h b/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm50.h
deleted file mode 100755
index 98ff18bea..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm50.h
+++ /dev/null
@@ -1,432 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Matrix multiply
-*/
-
-#pragma once
-
-#include "cutlass/arch/mma.h"
-#include "cutlass/complex.h"
-#include "cutlass/quaternion.h"
-#include "cutlass/functional.h"
-
-#include "cutlass/layout/matrix.h"
-#include "cutlass/gemm/gemm.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace arch {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation
-template <
-  /// Layout of A matrix
-  typename LayoutA,
-  /// Layout of B matrix
-  typename LayoutB,
-  /// Layout of C matrix
-  typename LayoutC
->
-struct Mma<gemm::GemmShape<1, 1, 1>, 1, float, LayoutA, float, LayoutB, float, LayoutC, OpMultiplyAdd> {
-
-  using Shape = gemm::GemmShape<1, 1, 1>;
-  using Operator = OpMultiplyAdd;
-  using ElementC = float;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    Array<float, 1> &d,
-    Array<float, 1> const &a,
-    Array<float, 1> const &b,
-    Array<float, 1> const &c
-  ) {
-    d[0] = a[0] * b[0] + c[0];
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation
-template <
-  /// Layout of A matrix
-  typename LayoutA,
-  /// Layout of B matrix
-  typename LayoutB,
-  /// Layout of C matrix
-  typename LayoutC
->
-struct Mma<gemm::GemmShape<1, 1, 1>, 1, double, LayoutA, double, LayoutB, double, LayoutC, OpMultiplyAdd> {
-
-  using Shape = gemm::GemmShape<1, 1, 1>;
-  using Operator = OpMultiplyAdd;
-  using ElementC = double;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    Array<double, 1> &d,
-    Array<double, 1> const &a,
-    Array<double, 1> const &b,
-    Array<double, 1> const &c
-  ) {
-
-    d[0] = a[0] * b[0] + c[0];
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation
-template <
-  /// Layout of A matrix
-  typename LayoutA,
-  /// Layout of B matrix
-  typename LayoutB,
-  /// Layout of C matrix
-  typename LayoutC
->
-struct Mma<gemm::GemmShape<1, 1, 1>, 1, int, LayoutA, int, LayoutB, int, LayoutC, OpMultiplyAdd> {
-
-  using Shape = gemm::GemmShape<1, 1, 1>;
-  using Operator = OpMultiplyAdd;
-  using ElementC = int;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    Array<int, 1> &d,
-    Array<int, 1> const &a,
-    Array<int, 1> const &b,
-    Array<int, 1> const &c
-  ) {
-
-    d[0] = a[0] * b[0] + c[0];
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation
-template <
-  /// Layout of A matrix
-  typename LayoutA,
-  /// Layout of B matrix
-  typename LayoutB,
-  /// Layout of C matrix
-  typename LayoutC
->
-struct Mma<
-  gemm::GemmShape<1, 1, 1>,
-  1,
-  complex<float>,
-  LayoutA,
-  complex<float>,
-  LayoutB,
-  complex<float>,
-  LayoutC,
-  OpMultiplyAdd> {
-
-  using Shape = gemm::GemmShape<1, 1, 1>;
-  using Operator = OpMultiplyAddComplex;
-  using ElementC = complex<float>;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    Array<complex<float>, 1> &d,
-    Array<complex<float>, 1> const &a,
-    Array<complex<float>, 1> const &b,
-    Array<complex<float>, 1> const &c
-  ) {
-
-    d[0].real() = a[0].real() * b[0].real() + c[0].real();
-    d[0].imag() = a[0].imag() * b[0].real() + c[0].imag();
-    d[0].real() = -a[0].imag() * b[0].imag() + d[0].real();
-    d[0].imag() = a[0].real() * b[0].imag() + d[0].imag();
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation
-template <
-  /// Layout of A matrix
-  typename LayoutA,
-  /// Layout of B matrix
-  typename LayoutB,
-  /// Layout of C matrix
-  typename LayoutC
->
-struct Mma<
-  gemm::GemmShape<1, 1, 1>,
-  1,
-  complex<float>,
-  LayoutA,
-  float,
-  LayoutB,
-  complex<float>,
-  LayoutC,
-  OpMultiplyAdd> {
-
-  using Shape = gemm::GemmShape<1, 1, 1>;
-  using Operator = OpMultiplyAddComplex;
-  using ElementC = complex<float>;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    Array<complex<float>, 1> &d,
-    Array<complex<float>, 1> const &a,
-    Array<float, 1> const &b,
-    Array<complex<float>, 1> const &c
-  ) {
-
-    d[0].real() = a[0].real() * b[0] + c[0].real();
-    d[0].imag() = a[0].imag() * b[0] + c[0].imag();
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation
-template <
-  /// Layout of A matrix
-  typename LayoutA,
-  /// Layout of B matrix
-  typename LayoutB,
-  /// Layout of C matrix
-  typename LayoutC
->
-struct Mma<
-  gemm::GemmShape<1, 1, 1>,
-  1,
-  float,
-  LayoutA,
-  complex<float>,
-  LayoutB,
-  complex<float>,
-  LayoutC,
-  OpMultiplyAdd> {
-
-  using Shape = gemm::GemmShape<1, 1, 1>;
-  using Operator = OpMultiplyAddComplex;
-  using ElementC = complex<float>;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    Array<complex<float>, 1> &d,
-    Array<float, 1> const &a,
-    Array<complex<float>, 1> const &b,
-    Array<complex<float>, 1> const &c
-  ) {
-
-    d[0].real() = a[0] * b[0].real() + c[0].real();
-    d[0].imag() = a[0] * b[0].imag() + d[0].imag();
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation
-template <
-  /// Layout of A matrix
-  typename LayoutA,
-  /// Layout of B matrix
-  typename LayoutB,
-  /// Layout of C matrix
-  typename LayoutC
->
-struct Mma<
-  gemm::GemmShape<1, 1, 1>,
-  1,
-  complex<double>,
-  LayoutA,
-  complex<double>,
-  LayoutB,
-  complex<double>,
-  LayoutC,
-  OpMultiplyAdd> {
-
-  using Shape = gemm::GemmShape<1, 1, 1>;
-  using Operator = OpMultiplyAddComplex;
-  using ElementC = complex<double>;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    Array<complex<double>, 1> &d,
-    Array<complex<double>, 1> const &a,
-    Array<complex<double>, 1> const &b,
-    Array<complex<double>, 1> const &c
-  ) {
-
-    d[0].real() = a[0].real() * b[0].real() + c[0].real();
-    d[0].imag() = a[0].imag() * b[0].real() + c[0].imag();
-    d[0].real() = -a[0].imag() * b[0].imag() + d[0].real();
-    d[0].imag() = a[0].real() * b[0].imag() + d[0].imag();
-  }
-};
-
-/// Matrix multiply-add operation
-template <
-  /// Layout of A matrix
-  typename LayoutA,
-  /// Layout of B matrix
-  typename LayoutB,
-  /// Layout of C matrix
-  typename LayoutC
->
-struct Mma<
-  gemm::GemmShape<1, 1, 1>,
-  1,
-  complex<double>,
-  LayoutA,
-  double,
-  LayoutB,
-  complex<double>,
-  LayoutC,
-  OpMultiplyAdd> {
-
-  using Shape = gemm::GemmShape<1, 1, 1>;
-  using Operator = OpMultiplyAddComplex;
-  using ElementC = complex<double>;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    Array<complex<double>, 1> &d,
-    Array<complex<double>, 1> const &a,
-    Array<double, 1> const &b,
-    Array<complex<double>, 1> const &c
-  ) {
-
-    d[0].real() = a[0].real() * b[0] + c[0].real();
-    d[0].imag() = a[0].imag() * b[0] + c[0].imag();
-  }
-};
-
-/// Matrix multiply-add operation
-template <
-  /// Layout of A matrix
-  typename LayoutA,
-  /// Layout of B matrix
-  typename LayoutB,
-  /// Layout of C matrix
-  typename LayoutC
->
-struct Mma<
-  gemm::GemmShape<1, 1, 1>,
-  1,
-  double,
-  LayoutA,
-  complex<double>,
-  LayoutB,
-  complex<double>,
-  LayoutC,
-  OpMultiplyAdd> {
-
-  using Shape = gemm::GemmShape<1, 1, 1>;
-  using Operator = OpMultiplyAddComplex;
-  using ElementC = complex<double>;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    Array<complex<double>, 1> &d,
-    Array<double, 1> const &a,
-    Array<complex<double>, 1> const &b,
-    Array<complex<double>, 1> const &c
-  ) {
-
-    d[0].real() = a[0] * b[0].real() + c[0].real();
-    d[0].imag() = a[0] * b[0].imag() + d[0].imag();
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation
-template <
-  /// Layout of A matrix
-  typename LayoutA,
-  /// Layout of B matrix
-  typename LayoutB,
-  /// Layout of C matrix
-  typename LayoutC
->
-struct Mma<gemm::GemmShape<1, 1, 1>, 1, half_t, LayoutA, half_t, LayoutB, float, LayoutC, OpMultiplyAdd> {
-
-  using Shape = gemm::GemmShape<1, 1, 1>;
-  using Operator = OpMultiplyAdd;
-  using ElementC = float;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    Array<float, 1> &d,
-    Array<half_t, 1> const &a,
-    Array<half_t, 1> const &b,
-    Array<float, 1> const &c
-  ) {
-    d[0] = float(a[0]) * float(b[0]) + c[0];
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation for Quaternions
-template <
-  /// Layout of A matrix
-  typename LayoutA,
-  /// Layout of B matrix
-  typename LayoutB,
-  /// Layout of C matrix
-  typename LayoutC
->
-struct Mma<gemm::GemmShape<1, 1, 1>, 1, Quaternion<float>, LayoutA, Quaternion<float>, LayoutB, Quaternion<float>, LayoutC, OpMultiplyAdd> {
-
-  using Shape = gemm::GemmShape<1, 1, 1>;
-  using Operator = OpMultiplyAdd;
-  using Element = Quaternion<float>;
-  using ElementC = Element;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    Array<Element, 1> &d,
-    Array<Element, 1> const &a,
-    Array<Element, 1> const &b,
-    Array<Element, 1> const &c
-  ) {
-    multiply_add<Element, Element, Element> op;
-    d[0] = op(a[0], b[0], c[0]);
-  }
-
-};
-
-}
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm60.h b/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm60.h
deleted file mode 100755
index 3e3c71ef3..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm60.h
+++ /dev/null
@@ -1,252 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Matrix multiply
-*/
-
-#pragma once
-
-#include <cuda_fp16.h>
-
-#include "cutlass/arch/mma.h"
-
-#include "cutlass/layout/matrix.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace arch {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation
-template <typename LayoutA, typename LayoutB, typename LayoutC>
-struct Mma<
-  gemm::GemmShape<2,1,1>,
-  1,
-  half_t,
-  LayoutA,
-  half_t,
-  LayoutB,
-  half_t,
-  LayoutC,
-  OpMultiplyAdd> {
-
-  using Shape = gemm::GemmShape<2, 1, 1>;
-  using Operator = OpMultiplyAdd;
-  using ElementC = half_t;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    Array<half_t, 2> &d,
-    Array<half_t, 2> const &a,
-    Array<half_t, 1> const &b,
-    Array<half_t, 2> const &c
-  ) {
-
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600))
-
-    __half2 const & A = reinterpret_cast<__half2 const &>(a);
-    __half2 B = __half2half2(reinterpret_cast<__half const &>(b));
-    __half2 const & C = reinterpret_cast<__half2 const &>(c);
-
-    __half2 D = __hfma2(A, B, C);
-
-    d = reinterpret_cast<Array<half_t, 2> &>(D);
-
-#else
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 2; ++i) {
-      d[i] = a[i] * b[0] + c[i];
-    }
-#endif
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation
-template <typename LayoutA, typename LayoutB>
-struct Mma<
-  gemm::GemmShape<1,2,1>,
-  1,
-  half_t,
-  LayoutA,
-  half_t,
-  LayoutB,
-  half_t,
-  layout::RowMajor,
-  OpMultiplyAdd> {
-
-  using Shape = gemm::GemmShape<1, 2, 1>;
-  using Operator = OpMultiplyAdd;
-  using ElementC = half_t;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    Array<half_t, 2> &d,
-    Array<half_t, 1> const &a,
-    Array<half_t, 2> const &b,
-    Array<half_t, 2> const &c
-  ) {
-
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600))
-
-    __half2 const & A = __half2half2(reinterpret_cast<__half const &>(a));
-    __half2 B = reinterpret_cast<__half2 const &>(b);
-    __half2 const & C = reinterpret_cast<__half2 const &>(c);
-
-    __half2 D = __hfma2(A, B, C);
-
-    d = reinterpret_cast<Array<half_t, 2> &>(D);
-
-#else
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 2; ++i) {
-      d[i] = a[0] * b[i] + c[i];
-    }
-#endif
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation
-template <>
-struct Mma <
-  gemm::GemmShape<2, 2, 1>,
-  1,
-  half_t,
-  layout::ColumnMajor,
-  half_t,
-  layout::RowMajor,
-  half_t,
-  layout::ColumnMajor,
-  OpMultiplyAdd> {
-
-  using Shape = gemm::GemmShape<2, 2, 1>;
-  using Operator = OpMultiplyAdd;
-  using ElementC = half_t;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    Array<half_t, 4> &d,
-    Array<half_t, 2> const &a,
-    Array<half_t, 2> const &b,
-    Array<half_t, 4> const &c
-  ) {
-
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600))
-
-    __half2 const & A = reinterpret_cast<__half2 const &>(a);
-    __half2 Blo = __low2half2(reinterpret_cast<__half2 const &>(b));
-    __half2 Bhi = __high2half2(reinterpret_cast<__half2 const &>(b));
-
-    __half2 const *C = reinterpret_cast<__half2 const *>(&c);
-
-    __half2 Dlo = __hfma2(A, Blo, C[0]);
-    __half2 Dhi = __hfma2(A, Bhi, C[1]);
-
-    Array<half_t, 2> * D = reinterpret_cast<Array<half_t, 2> *>(&d);
-
-    D[0] = reinterpret_cast<Array<half_t, 2> const &>(Dlo);
-    D[1] = reinterpret_cast<Array<half_t, 2> const &>(Dhi);
-
-#else
-    CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < 2; ++j) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < 2; ++i) {
-        d[i + 2 * j] = a[i] * b[j] + c[i + 2 * j];
-      }
-    }
-#endif
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation
-template <>
-struct Mma<
-  gemm::GemmShape<2, 2, 1>,
-  1,
-  half_t,
-  layout::ColumnMajor,
-  half_t,
-  layout::RowMajor,
-  half_t,
-  layout::RowMajor,
-  OpMultiplyAdd> {
-
-  using Shape = gemm::GemmShape<2, 2, 1>;
-  using Operator = OpMultiplyAdd;
-  using ElementC = half_t;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    Array<half_t, 4> &d,
-    Array<half_t, 2> const &a,
-    Array<half_t, 2> const &b,
-    Array<half_t, 4> const &c
-  ) {
-
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600))
-
-    __half2 Alo = __low2half2(reinterpret_cast<__half2 const &>(a));
-    __half2 Ahi = __high2half2(reinterpret_cast<__half2 const &>(a));
-    __half2 const & B = reinterpret_cast<__half2 const &>(b);
-
-    __half2 const *C = reinterpret_cast<__half2 const *>(&c);
-
-    __half2 Dlo = __hfma2(Alo, B, C[0]);
-    __half2 Dhi = __hfma2(Ahi, B, C[0]);
-
-    Array<half_t, 2> * D = reinterpret_cast<Array<half_t, 2> *>(&d);
-
-    D[0] = reinterpret_cast<Array<half_t, 2> &>(Dlo);
-    D[1] = reinterpret_cast<Array<half_t, 2> &>(Dhi);
-#else
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 2; ++i) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < 2; ++j) {
-        d[i * 2 + j] = a[i] * b[j] + c[i * 2 + j];
-      }
-    }
-#endif
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}
-}
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm61.h b/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm61.h
deleted file mode 100755
index 82a5aa728..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm61.h
+++ /dev/null
@@ -1,142 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Matrix multiply
-*/
-
-#pragma once
-
-#include "cutlass/layout/matrix.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace arch {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation
-template <typename LayoutA, typename LayoutB, typename LayoutC>
-struct Mma<
-  gemm::GemmShape<1,1,4>,
-  1,
-  int8_t,
-  LayoutA,
-  int8_t,
-  LayoutB,
-  int,
-  LayoutC,
-  OpMultiplyAdd> {
-
-  using Shape = gemm::GemmShape<1, 1, 4>;
-  using Operator = OpMultiplyAdd;
-  using ElementC = int;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    Array<int, 1> &d,
-    Array<int8_t, 4> const &a,
-    Array<int8_t, 4> const &b,
-    Array<int, 1> const &c
-  ) {
-
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 610))
-
-    unsigned const &A = reinterpret_cast<unsigned const &>(a);
-    unsigned const &B = reinterpret_cast<unsigned const &>(b);
-
-    asm volatile("dp4a.s32.s32 %0, %1, %2, %3;"
-                 : "=r"(d[0])
-                 : "r"(A), "r"(B), "r"(c[0]));
-
-#else
-
-    d[0] = c[0];
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int k = 0; k < 4; ++k) {
-      d[0] += a[k] * b[k];
-    }
-
-#endif
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation
-template <typename LayoutC>
-struct Mma<
-  gemm::GemmShape<1, 1, 2>,
-  1,
-  int16_t,
-  layout::RowMajor,
-  int16_t,
-  layout::ColumnMajor,
-  int,
-  LayoutC,
-  OpMultiplyAdd> {
-
-  using Shape = gemm::GemmShape<1, 1, 2>;
-  using Operator = OpMultiplyAdd;
-  using ElementC = int;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    Array<int, 1> &d,
-    Array<int16_t, 2> const &a,
-    Array<int16_t, 2> const &b,
-    Array<int, 1> const &c
-  ) {
-
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 610))
-
-    unsigned const &A = reinterpret_cast<unsigned const &>(a);
-    unsigned const &B = reinterpret_cast<unsigned const &>(b);
-
-    asm volatile("dp2a.s32.s32 %0, %1, %2, %3;"
-                 : "=r"(d[0])
-                 : "r"(A), "r"(B), "r"(c[0]));
-#else
-    d[0] = c[0];
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int k = 0; k < 2; ++k) {
-      d[0] += a[k] * b[k];
-    }
-#endif
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}
-}
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm70.h b/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm70.h
deleted file mode 100755
index 6471de8a8..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm70.h
+++ /dev/null
@@ -1,665 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Matrix multiply
-*/
-#pragma once
-
-#if defined(__CUDACC_RTC__)
-#include <cuda/std/cassert>
-#else
-#include <assert.h>
-#endif
-
-#include "mma.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/numeric_types.h"
-
-#if ((__CUDACC_VER_MAJOR__ > 10) || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))
-#define CUTLASS_ARCH_MMA_SM70_SUPPORTED
-#endif
-
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700))
-
-#if ((__CUDACC_VER_MAJOR__ > 10) || (__CUDACC_VER_MAJOR__ == 10 &&__CUDACC_VER_MINOR__ >= 1))
-#define CUTLASS_ARCH_MMA_SM70_ENABLED
-#endif
-
-#endif
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace arch {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Matrix multiply accumulate 884 - FP16 accumulation
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation: F16 = F16 * F16 + F16
-template <>
-struct Mma<
-  gemm::GemmShape<8,8,4>,
-  8,
-  half_t,
-  layout::ColumnMajor,
-  half_t,
-  layout::ColumnMajor,
-  half_t,
-  layout::RowMajor,
-  OpMultiplyAdd> {
-
-  using Shape = gemm::GemmShape<8, 8, 4>;
-
-  using ElementA = half_t;
-  using LayoutA = layout::ColumnMajor;
-  using FragmentA = Array<half_t, 4>;
-
-  using ElementB = half_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<half_t, 4>;
-
-  using ElementC = half_t;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<half_t, 8>;
-
-  using Operator = OpMultiplyAdd;
-  using ArchTag = arch::Sm70;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c
-  ) {
-
-#if defined(CUTLASS_ARCH_MMA_SM70_ENABLED)
-
-    unsigned const *A = reinterpret_cast<unsigned const *>(&a);
-    unsigned const *B = reinterpret_cast<unsigned const *>(&b);
-    unsigned const *C = reinterpret_cast<unsigned const *>(&c);
-    unsigned *D = reinterpret_cast<unsigned *>(&d);
-
-    asm volatile("mma.sync.aligned.m8n8k4.col.col.f16.f16.f16.f16 {%0,%1,%2,%3}, {%4,%5}, {%6,%7}, {%8,%9,%10,%11};\n"
-      : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
-      : "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(B[1]), "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])
-    );
-
-#else
-    assert(0);
-    #if defined(__CUDA_ARCH__)
-    asm volatile ("brkpt;\n" ::);
-    #endif
-#endif
-  }
-};
-
-/// Matrix multiply-add operation: F16 = F16 * F16 + F16
-template <>
-struct Mma<
-  gemm::GemmShape<8, 8, 4>,
-  8,
-  half_t,
-  layout::ColumnMajor,
-  half_t,
-  layout::RowMajor,
-  half_t,
-  layout::RowMajor,
-  OpMultiplyAdd> {
-
-  using Shape = gemm::GemmShape<8, 8, 4>;
-
-  using ElementA = half_t;
-  using LayoutA = layout::ColumnMajor;
-  using FragmentA = Array<half_t, 4>;
-
-  using ElementB = half_t;
-  using LayoutB = layout::RowMajor;
-  using FragmentB = Array<half_t, 4>;
-
-  using ElementC = half_t;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<half_t, 8>;
-
-  using Operator = OpMultiplyAdd;
-  using ArchTag = arch::Sm70;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c
-  ) {
-
-#if defined(CUTLASS_ARCH_MMA_SM70_ENABLED)
-
-    unsigned const *A = reinterpret_cast<unsigned const *>(&a);
-    unsigned const *B = reinterpret_cast<unsigned const *>(&b);
-    unsigned const *C = reinterpret_cast<unsigned const *>(&c);
-    unsigned *D = reinterpret_cast<unsigned *>(&d);
-
-    asm volatile("mma.sync.aligned.m8n8k4.col.row.f16.f16.f16.f16 {%0,%1,%2,%3}, {%4,%5}, {%6,%7}, {%8,%9,%10,%11};\n"
-      : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
-      : "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(B[1]), "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])
-    );
-
-#else
-    assert(0);
-    #if defined(__CUDA_ARCH__)
-    asm volatile ("brkpt;\n" ::);
-    #endif
-#endif
-  }
-};
-
-/// Matrix multiply-add operation: F16 = F16 * F16 + F16
-template <>
-struct Mma<
-  gemm::GemmShape<8, 8, 4>,
-  8,
-  half_t,
-  layout::RowMajor,
-  half_t,
-  layout::ColumnMajor,
-  half_t,
-  layout::RowMajor,
-  OpMultiplyAdd> {
-
-  using Shape = gemm::GemmShape<8, 8, 4>;
-
-  using ElementA = half_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<half_t, 4>;
-
-  using ElementB = half_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<half_t, 4>;
-
-  using ElementC = half_t;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<half_t, 8>;
-
-  using Operator = OpMultiplyAdd;
-  using ArchTag = arch::Sm70;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c
-  ) {
-
-#if defined(CUTLASS_ARCH_MMA_SM70_ENABLED)
-
-    unsigned const *A = reinterpret_cast<unsigned const *>(&a);
-    unsigned const *B = reinterpret_cast<unsigned const *>(&b);
-    unsigned const *C = reinterpret_cast<unsigned const *>(&c);
-    unsigned *D = reinterpret_cast<unsigned *>(&d);
-
-    asm volatile("mma.sync.aligned.m8n8k4.row.col.f16.f16.f16.f16 {%0,%1,%2,%3}, {%4,%5}, {%6,%7}, {%8,%9,%10,%11};\n"
-      : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
-      : "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(B[1]), "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])
-    );
-
-#else
-    assert(0);
-    #if defined(__CUDA_ARCH__)
-    asm volatile ("brkpt;\n" ::);
-    #endif
-#endif
-  }
-};
-
-/// Matrix multiply-add operation: F16 = F16 * F16 + F16
-template <>
-struct Mma<
-  gemm::GemmShape<8, 8, 4>,
-  8,
-  half_t,
-  layout::RowMajor,
-  half_t,
-  layout::RowMajor,
-  half_t,
-  layout::RowMajor,
-  OpMultiplyAdd> {
-
-  using Shape = gemm::GemmShape<8, 8, 4>;
-
-  using ElementA = half_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<half_t, 4>;
-
-  using ElementB = half_t;
-  using LayoutB = layout::RowMajor;
-  using FragmentB = Array<half_t, 4>;
-
-  using ElementC = half_t;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<half_t, 8>;
-
-  using Operator = OpMultiplyAdd;
-  using ArchTag = arch::Sm70;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c
-  ) {
-
-#if defined(CUTLASS_ARCH_MMA_SM70_ENABLED)
-
-    unsigned const *A = reinterpret_cast<unsigned const *>(&a);
-    unsigned const *B = reinterpret_cast<unsigned const *>(&b);
-    unsigned const *C = reinterpret_cast<unsigned const *>(&c);
-    unsigned *D = reinterpret_cast<unsigned *>(&d);
-
-    asm volatile("mma.sync.aligned.m8n8k4.row.row.f16.f16.f16.f16 {%0,%1,%2,%3}, {%4,%5}, {%6,%7}, {%8,%9,%10,%11};\n"
-      : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
-      : "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(B[1]), "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])
-    );
-
-#else
-    assert(0);
-    #if defined(__CUDA_ARCH__)
-    asm volatile ("brkpt;\n" ::);
-    #endif
-#endif
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Matrix multiply accumulate 884 - FP32 accumulation
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation: F32 = F16 * F16 + F32
-template <>
-struct Mma<
-  gemm::GemmShape<8, 8, 4>,
-  8,
-  half_t,
-  layout::ColumnMajor,
-  half_t,
-  layout::ColumnMajor,
-  float,
-  layout::RowMajor,
-  OpMultiplyAdd> {
-
-  using Shape = gemm::GemmShape<8, 8, 4>;
-
-  using ElementA = half_t;
-  using LayoutA = layout::ColumnMajor;
-  using FragmentA = Array<half_t, 4>;
-
-  using ElementB = half_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<half_t, 4>;
-
-  using ElementC = float;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<float, 8>;
-
-  using Operator = OpMultiplyAdd;
-  using ArchTag = arch::Sm70;
-
-  /// Multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c
-  ) {
-
-#if defined(CUTLASS_ARCH_MMA_SM70_ENABLED)
-
-  unsigned const *A = reinterpret_cast<unsigned const *>(&a);
-  unsigned const *B = reinterpret_cast<unsigned const *>(&b);
-  float const *C = reinterpret_cast<float const *>(&c);
-  float *D = reinterpret_cast<float *>(&d);
-
-  asm volatile("mma.sync.aligned.m8n8k4.col.col.f32.f16.f16.f32 {%0,%1,%2,%3,%4,%5,%6,%7}, {%8,%9}, {%10,%11}, "
-      "{%12,%13,%14,%15,%16,%17,%18,%19};\n"
-      : "=f"(D[0]),
-        "=f"(D[1]),
-        "=f"(D[2]),
-        "=f"(D[3]),
-        "=f"(D[4]),
-        "=f"(D[5]),
-        "=f"(D[6]),
-        "=f"(D[7])
-      : "r"(A[0]),
-        "r"(A[1]),
-        "r"(B[0]),
-        "r"(B[1]),
-        "f"(C[0]),
-        "f"(C[1]),
-        "f"(C[2]),
-        "f"(C[3]),
-        "f"(C[4]),
-        "f"(C[5]),
-        "f"(C[6]),
-        "f"(C[7])
-  );
-
-#else
-    assert(0);
-    #if defined(__CUDA_ARCH__)
-    asm volatile ("brkpt;\n" ::);
-    #endif
-#endif
-  }
-};
-
-/// Matrix multiply-add operation: F32 = F16 * F16 + F32
-template <>
-struct Mma<
-  gemm::GemmShape<8, 8, 4>,
-  8,
-  half_t,
-  layout::ColumnMajor,
-  half_t,
-  layout::RowMajor,
-  float,
-  layout::RowMajor,
-  OpMultiplyAdd> {
-
-  using Shape = gemm::GemmShape<8, 8, 4>;
-
-  using ElementA = half_t;
-  using LayoutA = layout::ColumnMajor;
-  using FragmentA = Array<half_t, 4>;
-
-  using ElementB = half_t;
-  using LayoutB = layout::RowMajor;
-  using FragmentB = Array<half_t, 4>;
-
-  using ElementC = float;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<float, 8>;
-
-  using Operator = OpMultiplyAdd;
-  using ArchTag = arch::Sm70;
-
-  /// Multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c
-  ) {
-
-#if defined(CUTLASS_ARCH_MMA_SM70_ENABLED)
-
-  unsigned const *A = reinterpret_cast<unsigned const *>(&a);
-  unsigned const *B = reinterpret_cast<unsigned const *>(&b);
-  float const *C = reinterpret_cast<float const *>(&c);
-  float *D = reinterpret_cast<float *>(&d);
-
-  asm volatile("mma.sync.aligned.m8n8k4.col.row.f32.f16.f16.f32 {%0,%1,%2,%3,%4,%5,%6,%7}, {%8,%9}, {%10,%11}, "
-      "{%12,%13,%14,%15,%16,%17,%18,%19};\n"
-      : "=f"(D[0]),
-        "=f"(D[1]),
-        "=f"(D[2]),
-        "=f"(D[3]),
-        "=f"(D[4]),
-        "=f"(D[5]),
-        "=f"(D[6]),
-        "=f"(D[7])
-      : "r"(A[0]),
-        "r"(A[1]),
-        "r"(B[0]),
-        "r"(B[1]),
-        "f"(C[0]),
-        "f"(C[1]),
-        "f"(C[2]),
-        "f"(C[3]),
-        "f"(C[4]),
-        "f"(C[5]),
-        "f"(C[6]),
-        "f"(C[7])
-  );
-
-#else
-    assert(0);
-    #if defined(__CUDA_ARCH__)
-    asm volatile ("brkpt;\n" ::);
-    #endif
-#endif
-  }
-};
-
-/// Matrix multiply-add operation: F32 = F16 * F16 + F32
-template <>
-struct Mma<
-  gemm::GemmShape<8, 8, 4>,
-  8,
-  half_t,
-  layout::RowMajor,
-  half_t,
-  layout::ColumnMajor,
-  float,
-  layout::RowMajor,
-  OpMultiplyAdd> {
-
-  using Shape = gemm::GemmShape<8, 8, 4>;
-
-  using ElementA = half_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<half_t, 4>;
-
-  using ElementB = half_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<half_t, 4>;
-
-  using ElementC = float;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<float, 8>;
-
-  using Operator = OpMultiplyAdd;
-  using ArchTag = arch::Sm70;
-
-  /// Multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c
-  ) {
-
-#if defined(CUTLASS_ARCH_MMA_SM70_ENABLED)
-
-  unsigned const *A = reinterpret_cast<unsigned const *>(&a);
-  unsigned const *B = reinterpret_cast<unsigned const *>(&b);
-  float const *C = reinterpret_cast<float const *>(&c);
-  float *D = reinterpret_cast<float *>(&d);
-
-  asm volatile("mma.sync.aligned.m8n8k4.row.col.f32.f16.f16.f32 {%0,%1,%2,%3,%4,%5,%6,%7}, {%8,%9}, {%10,%11}, "
-      "{%12,%13,%14,%15,%16,%17,%18,%19};\n"
-      : "=f"(D[0]),
-        "=f"(D[1]),
-        "=f"(D[2]),
-        "=f"(D[3]),
-        "=f"(D[4]),
-        "=f"(D[5]),
-        "=f"(D[6]),
-        "=f"(D[7])
-      : "r"(A[0]),
-        "r"(A[1]),
-        "r"(B[0]),
-        "r"(B[1]),
-        "f"(C[0]),
-        "f"(C[1]),
-        "f"(C[2]),
-        "f"(C[3]),
-        "f"(C[4]),
-        "f"(C[5]),
-        "f"(C[6]),
-        "f"(C[7])
-  );
-
-#else
-    assert(0);
-    #if defined(__CUDA_ARCH__)
-    asm volatile ("brkpt;\n" ::);
-    #endif
-#endif
-  }
-};
-
-/// Matrix multiply-add operation: F32 = F16 * F16 + F32
-template <>
-struct Mma<
-  gemm::GemmShape<8, 8, 4>,
-  8,
-  half_t,
-  layout::RowMajor,
-  half_t,
-  layout::RowMajor,
-  float,
-  layout::RowMajor,
-  OpMultiplyAdd> {
-
-  using Shape = gemm::GemmShape<8, 8, 4>;
-
-  using ElementA = half_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<half_t, 4>;
-
-  using ElementB = half_t;
-  using LayoutB = layout::RowMajor;
-  using FragmentB = Array<half_t, 4>;
-
-  using ElementC = float;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<float, 8>;
-
-  using Operator = OpMultiplyAdd;
-  using ArchTag = arch::Sm70;
-
-  /// Multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c
-  ) {
-
-#if defined(CUTLASS_ARCH_MMA_SM70_ENABLED)
-
-  unsigned const *A = reinterpret_cast<unsigned const *>(&a);
-  unsigned const *B = reinterpret_cast<unsigned const *>(&b);
-  float const *C = reinterpret_cast<float const *>(&c);
-  float *D = reinterpret_cast<float *>(&d);
-
-  asm volatile("mma.sync.aligned.m8n8k4.row.row.f32.f16.f16.f32 {%0,%1,%2,%3,%4,%5,%6,%7}, {%8,%9}, {%10,%11}, "
-      "{%12,%13,%14,%15,%16,%17,%18,%19};\n"
-      : "=f"(D[0]),
-        "=f"(D[1]),
-        "=f"(D[2]),
-        "=f"(D[3]),
-        "=f"(D[4]),
-        "=f"(D[5]),
-        "=f"(D[6]),
-        "=f"(D[7])
-      : "r"(A[0]),
-        "r"(A[1]),
-        "r"(B[0]),
-        "r"(B[1]),
-        "f"(C[0]),
-        "f"(C[1]),
-        "f"(C[2]),
-        "f"(C[3]),
-        "f"(C[4]),
-        "f"(C[5]),
-        "f"(C[6]),
-        "f"(C[7])
-  );
-
-#else
-    assert(0);
-    #if defined(__CUDA_ARCH__)
-    asm volatile ("brkpt;\n" ::);
-    #endif
-#endif
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation specialized for the entire warp
-template <
-  typename LayoutA,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename Operator
->
-struct Mma<
-  gemm::GemmShape<16, 16, 4>,
-  32,
-  half_t,
-  LayoutA,
-  half_t,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  Operator
-> : 
-  public Mma<
-    gemm::GemmShape<8, 8, 4>, 
-    8, 
-    half_t, 
-    LayoutA, 
-    half_t, 
-    LayoutB,
-    ElementC, 
-    LayoutC, 
-    Operator> {
-
-  using Shape = gemm::GemmShape<16, 16, 4>;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace arch
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm75.h b/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm75.h
deleted file mode 100755
index 6cced190e..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm75.h
+++ /dev/null
@@ -1,793 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Matrix multiply for SM75
-*/
-
-#pragma once
-
-#if defined(__CUDACC_RTC__)
-#include <cuda/std/cassert>
-#else
-#include <assert.h>
-#endif
-
-#include "cutlass/arch/wmma.h"
-
-#if defined(CUTLASS_ARCH_WMMA_ENABLED)
-// CUDA Toolkit includes for nvcuda::wmma needed for binarized matrix multiply.
-#include <mma.h>
-#include "cutlass/wmma_array.h"
-#endif
-
-// CUTLASS includes
-#include "cutlass/arch/mma.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/numeric_types.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-#if ((__CUDACC_VER_MAJOR__ > 10) || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))
-
-#define CUTLASS_ARCH_MMA_SM75_SUPPORTED 1
-
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750))
-#define CUTLASS_ARCH_MMA_SM75_ENABLED
-#endif
-#endif
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace arch {
-
-////////////////////////////////////////////////////////////////////////////////
-//
-// Matrix Multiply 1688 - FP16 accumulation
-//
-////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation - F16 = F16 * F16 + F16
-template <>
-struct Mma<
-  gemm::GemmShape<16, 8, 8>,
-  32,
-  half_t,
-  layout::RowMajor,
-  half_t,
-  layout::ColumnMajor,
-  half_t,
-  layout::RowMajor,
-  OpMultiplyAdd> {
-
-  using Shape = gemm::GemmShape<16, 8, 8>;
-
-  using ElementA = half_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<half_t, 4>;
-  
-  using ElementB = half_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<half_t, 2>;
-
-  using ElementC = half_t;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<half_t, 4>;
-
-  using Operator = OpMultiplyAdd;
-  using ArchTag = arch::Sm75;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c
-  ) const {
-
-#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)
-
-  unsigned const *A = reinterpret_cast<unsigned const *>(&a);
-  unsigned const *B = reinterpret_cast<unsigned const *>(&b);
-  unsigned const *C = reinterpret_cast<unsigned const *>(&c);
-  unsigned *D = reinterpret_cast<unsigned *>(&d);
-
-  asm volatile(
-    "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 {%0,%1}, {%2,%3}, {%4}, {%5,%6};\n"
-      : "=r"(D[0]), "=r"(D[1])
-      : "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(C[0]), "r"(C[1]));
-
-#else
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_UNUSED(d);
-    CUTLASS_NOT_IMPLEMENTED();
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-//
-// Matrix Multiply 1688 - FP32 accumulation
-//
-////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation: F32 = F16 * F16 + F32
-template <>
-struct Mma<
-  gemm::GemmShape<16, 8, 8>,
-  32,
-  half_t,
-  layout::RowMajor,
-  half_t,
-  layout::ColumnMajor,
-  float,
-  layout::RowMajor,
-  OpMultiplyAdd> {
-
-  using Shape = gemm::GemmShape<16, 8, 8>;
-
-  using ElementA = half_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<half_t, 4>;
-
-  using ElementB = half_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<half_t, 2>;
-
-  using ElementC = float;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<float, 4>;
-
-  using Operator = OpMultiplyAdd;
-  using ArchTag = arch::Sm75;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
-                  FragmentC const &c) const {
-
-#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)
-
-  unsigned const *A = reinterpret_cast<unsigned const *>(&a);
-  unsigned const *B = reinterpret_cast<unsigned const *>(&b);
-  float const *C = reinterpret_cast<float const *>(&c);
-  float *D = reinterpret_cast<float *>(&d);
-
-  asm volatile("mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
-      : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
-      : 
-        "r"(A[0]), "r"(A[1]), 
-        "r"(B[0]), 
-        "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])
-  );
-
-#else
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_UNUSED(d);
-    CUTLASS_NOT_IMPLEMENTED();
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-//
-// Integer matrix multiply  (8b) with SATURATE
-//
-////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation: S32 = S8 * S8 + S32
-template <>
-struct Mma<
-  gemm::GemmShape<8, 8, 16>,
-  32,
-  int8_t,
-  layout::RowMajor,
-  int8_t,
-  layout::ColumnMajor,
-  int,
-  layout::RowMajor,
-  OpMultiplyAddSaturate> {
-
-  using Shape = gemm::GemmShape<8, 8, 16>;
-
-  using ElementA = int8_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<int8_t, 4>;
-
-  using ElementB = int8_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<int8_t, 4>;
-
-  using ElementC = int;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<int, 2>;
-
-  using Operator = OpMultiplyAddSaturate;
-  using ArchTag = arch::Sm75;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c
-  ) const {
-
-#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)
-
-  unsigned const & A = reinterpret_cast<unsigned const &>(a);
-  unsigned const & B = reinterpret_cast<unsigned const &>(b);
-
-  int const *C = reinterpret_cast<int const *>(&c);
-  int *D = reinterpret_cast<int *>(&d);
-
-  asm volatile("mma.sync.aligned.m8n8k16.row.col.satfinite.s32.s8.s8.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
-      : "=r"(D[0]), "=r"(D[1])
-      : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));
-#else
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_UNUSED(d);
-    CUTLASS_NOT_IMPLEMENTED();
-#endif
-  }
-};
-
-/// Matrix multiply-add operation: S32 = U8 * S8 + S32
-template <>
-struct Mma<
-  gemm::GemmShape<8, 8, 16>,
-  32,
-  uint8_t,
-  layout::RowMajor,
-  int8_t,
-  layout::ColumnMajor,
-  int,
-  layout::RowMajor,
-  OpMultiplyAddSaturate> {
-
-  using Shape = gemm::GemmShape<8, 8, 16>;
-
-  using ElementA = uint8_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<uint8_t, 4>;
-
-  using ElementB = int8_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<int8_t, 4>;
-
-  using ElementC = int;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<int, 2>;
-
-  using Operator = OpMultiplyAddSaturate;
-  using ArchTag = arch::Sm75;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c
-  ) const {
-
-#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)
-
-  unsigned const & A = reinterpret_cast<unsigned const &>(a);
-  unsigned const & B = reinterpret_cast<unsigned const &>(b);
-
-  int const *C = reinterpret_cast<int const *>(&c);
-  int *D = reinterpret_cast<int *>(&d);
-
-  asm volatile("mma.sync.aligned.m8n8k16.row.col.satfinite.s32.u8.s8.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
-      : "=r"(D[0]), "=r"(D[1])
-      : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));
-#else
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_UNUSED(d);
-    CUTLASS_NOT_IMPLEMENTED();
-#endif
-  }
-};
-
-/// Matrix multiply-add operation: S32 = S8 * U8 + S32
-template <>
-struct Mma<
-  gemm::GemmShape<8, 8, 16>,
-  32,
-  int8_t,
-  layout::RowMajor,
-  uint8_t,
-  layout::ColumnMajor,
-  int,
-  layout::RowMajor,
-  OpMultiplyAddSaturate> {
-
-  using Shape = gemm::GemmShape<8, 8, 16>;
-
-  using ElementA = int8_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<int8_t, 4>;
-
-  using ElementB = uint8_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<uint8_t, 4>;
-
-  using ElementC = int;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<int, 2>;
-
-  using Operator = OpMultiplyAddSaturate;
-  using ArchTag = arch::Sm75;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c
-  ) const {
-
-#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)
-
-  unsigned const & A = reinterpret_cast<unsigned const &>(a);
-  unsigned const & B = reinterpret_cast<unsigned const &>(b);
-
-  int const *C = reinterpret_cast<int const *>(&c);
-  int *D = reinterpret_cast<int *>(&d);
-
-  asm volatile("mma.sync.aligned.m8n8k16.row.col.satfinite.s32.s8.u8.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
-      : "=r"(D[0]), "=r"(D[1])
-      : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));
-#else
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_UNUSED(d);
-    CUTLASS_NOT_IMPLEMENTED();
-#endif
-  }
-};
-
-/// Matrix multiply-add operation: S32 = U8 * U8 + S32
-template <>
-struct Mma<
-  gemm::GemmShape<8, 8, 16>,
-  32,
-  uint8_t,
-  layout::RowMajor,
-  uint8_t,
-  layout::ColumnMajor,
-  int,
-  layout::RowMajor,
-  OpMultiplyAddSaturate> {
-
-  using Shape = gemm::GemmShape<8, 8, 16>;
-
-  using ElementA = uint8_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<uint8_t, 4>;
-
-  using ElementB = uint8_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<uint8_t, 4>;
-
-  using ElementC = int;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<int, 2>;
-
-  using Operator = OpMultiplyAddSaturate;
-  using ArchTag = arch::Sm75;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c
-  ) const {
-
-#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)
-
-  unsigned const & A = reinterpret_cast<unsigned const &>(a);
-  unsigned const & B = reinterpret_cast<unsigned const &>(b);
-
-  int const *C = reinterpret_cast<int const *>(&c);
-  int *D = reinterpret_cast<int *>(&d);
-
-  asm volatile("mma.sync.aligned.m8n8k16.row.col.satfinite.s32.u8.u8.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
-      : "=r"(D[0]), "=r"(D[1])
-      : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));
-#else
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_UNUSED(d);
-    CUTLASS_NOT_IMPLEMENTED();
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-//
-// Integer matrix multiply  (4b) - SATURATE
-//
-////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation: S32 = S4 * S4 + S32
-template <>
-struct Mma<
-  gemm::GemmShape<8, 8, 32>,
-  32,
-  int4b_t,
-  layout::RowMajor,
-  int4b_t,
-  layout::ColumnMajor,
-  int,
-  layout::RowMajor,
-  OpMultiplyAddSaturate> {
-
-  using Shape = gemm::GemmShape<8, 8, 32>;
-
-  using ElementA = int4b_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<int4b_t, 8>;
-
-  using ElementB = int4b_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<int4b_t, 8>;
-
-  using ElementC = int;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<int, 2>;
-
-  using Operator = OpMultiplyAddSaturate;
-  using ArchTag = arch::Sm75;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c
-  ) const {
-
-#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)
-
-  unsigned const & A = reinterpret_cast<unsigned const &>(a);
-  unsigned const & B = reinterpret_cast<unsigned const &>(b);
-
-  int const *C = reinterpret_cast<int const *>(&c);
-  int *D = reinterpret_cast<int *>(&d);
-
-  asm volatile("mma.sync.aligned.m8n8k32.row.col.satfinite.s32.s4.s4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
-      : "=r"(D[0]), "=r"(D[1])
-      : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));
-#else
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_UNUSED(d);
-    CUTLASS_NOT_IMPLEMENTED();
-#endif
-  }
-};
-
-/// Matrix multiply-add operation: S32 = U4 * S4 + S32
-template <>
-struct Mma<
-  gemm::GemmShape<8, 8, 32>,
-  32,
-  uint4b_t,
-  layout::RowMajor,
-  int4b_t,
-  layout::ColumnMajor,
-  int,
-  layout::RowMajor,
-  OpMultiplyAddSaturate> {
-
-  using Shape = gemm::GemmShape<8, 8, 32>;
-
-  using ElementA = uint4b_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<uint4b_t, 8>;
-
-  using ElementB = int4b_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<int4b_t, 8>;
-
-  using ElementC = int;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<int, 2>;
-
-  using Operator = OpMultiplyAddSaturate;
-  using ArchTag = arch::Sm75;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c
-  ) const {
-
-#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)
-
-  unsigned const & A = reinterpret_cast<unsigned const &>(a);
-  unsigned const & B = reinterpret_cast<unsigned const &>(b);
-
-  int const *C = reinterpret_cast<int const *>(&c);
-  int *D = reinterpret_cast<int *>(&d);
-
-  asm volatile("mma.sync.aligned.m8n8k32.row.col.satfinite.s32.u4.s4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
-      : "=r"(D[0]), "=r"(D[1])
-      : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));
-#else
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_UNUSED(d);
-    CUTLASS_NOT_IMPLEMENTED();
-#endif
-  }
-};
-
-/// Matrix multiply-add operation: S32 = S4 * U4 + S32
-template <>
-struct Mma<
-  gemm::GemmShape<8, 8, 32>,
-  32,
-  int4b_t,
-  layout::RowMajor,
-  uint4b_t,
-  layout::ColumnMajor,
-  int,
-  layout::RowMajor,
-  OpMultiplyAddSaturate> {
-
-  using Shape = gemm::GemmShape<8, 8, 32>;
-
-  using ElementA = int4b_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<int4b_t, 8>;
-
-  using ElementB = uint4b_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<uint4b_t, 8>;
-
-  using ElementC = int;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<int, 2>;
-
-  using Operator = OpMultiplyAddSaturate;
-  using ArchTag = arch::Sm75;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c
-  ) const {
-
-#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)
-
-  unsigned const & A = reinterpret_cast<unsigned const &>(a);
-  unsigned const & B = reinterpret_cast<unsigned const &>(b);
-
-  int const *C = reinterpret_cast<int const *>(&c);
-  int *D = reinterpret_cast<int *>(&d);
-
-  asm volatile("mma.sync.aligned.m8n8k32.row.col.satfinite.s32.s4.u4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
-      : "=r"(D[0]), "=r"(D[1])
-      : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));
-#else
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_UNUSED(d);
-    CUTLASS_NOT_IMPLEMENTED();
-#endif
-  }
-};
-
-/// Matrix multiply-add operation: S32 = U4 * U4 + S32
-template <>
-struct Mma<
-  gemm::GemmShape<8, 8, 32>,
-  32,
-  uint4b_t,
-  layout::RowMajor,
-  uint4b_t,
-  layout::ColumnMajor,
-  int,
-  layout::RowMajor,
-  OpMultiplyAddSaturate> {
-
-  using Shape = gemm::GemmShape<8, 8, 32>;
-
-  using ElementA = uint4b_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<uint4b_t, 8>;
-
-  using ElementB = uint4b_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<uint4b_t, 8>;
-
-  using ElementC = int;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<int, 2>;
-
-  using Operator = OpMultiplyAddSaturate;
-  using ArchTag = arch::Sm75;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c
-  ) const {
-
-#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)
-
-  unsigned const & A = reinterpret_cast<unsigned const &>(a);
-  unsigned const & B = reinterpret_cast<unsigned const &>(b);
-
-  int const *C = reinterpret_cast<int const *>(&c);
-  int *D = reinterpret_cast<int *>(&d);
-
-  asm volatile("mma.sync.aligned.m8n8k32.row.col.satfinite.s32.u4.u4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
-      : "=r"(D[0]), "=r"(D[1])
-      : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));
-#else
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_UNUSED(d);
-    CUTLASS_NOT_IMPLEMENTED();
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-//
-// b1 ^ b1 + s32 => s32
-//
-////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation
-template <>
-struct Mma<
-  gemm::GemmShape<8,8,128>,
-  32,
-  uint1b_t,
-  layout::RowMajor,
-  uint1b_t,
-  layout::ColumnMajor,
-  int,
-  layout::RowMajor,
-  OpXorPopc> {
-
-  using Shape = gemm::GemmShape<8,8,128>;
-
-  using ElementA = uint1b_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<uint1b_t, 32>;
-
-  using ElementB = uint1b_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<uint1b_t, 32>;
-
-  using ElementC = int;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<int, 2>;
-
-  using Operator = OpXorPopc;
-  using ArchTag = arch::Sm75;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c
-  ) const {
-
-#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)
-
-#if defined(CUTLASS_ARCH_WMMA_ENABLED)
-  using WmmaFragmentA = nvcuda::wmma::fragment<
-          nvcuda::wmma::matrix_a,
-          Shape::kM,
-          Shape::kN,
-          Shape::kK,
-          nvcuda::wmma::experimental::precision::b1,
-          nvcuda::wmma::row_major>;
-
-  using WmmaFragmentB = nvcuda::wmma::fragment<
-          nvcuda::wmma::matrix_b,
-          Shape::kM,
-          Shape::kN,
-          Shape::kK,
-          nvcuda::wmma::experimental::precision::b1,
-          nvcuda::wmma::col_major>;
-
-  using WmmaFragmentC = nvcuda::wmma::fragment<
-          nvcuda::wmma::accumulator,
-          Shape::kM,
-          Shape::kN,
-          Shape::kK,
-          int>;
-  
-  WmmaFragmentA const & A = reinterpret_cast<WmmaFragmentA const &>(a);
-  WmmaFragmentB const & B = reinterpret_cast<WmmaFragmentB const &>(b);
-
-  WmmaFragmentC const & C = reinterpret_cast<WmmaFragmentC const &>(c);
-  WmmaFragmentC & D = reinterpret_cast<WmmaFragmentC &>(d);
-
-  nvcuda::wmma::bmma_sync(D, A, B, C, nvcuda::wmma::experimental::bmmaBitOpXOR, 
-                                          nvcuda::wmma::experimental::bmmaAccumulateOpPOPC);
-
-#else
-
-  CUTLASS_UNUSED(a);
-  CUTLASS_UNUSED(b);
-  CUTLASS_UNUSED(c);
-  CUTLASS_UNUSED(d);
-  CUTLASS_NOT_IMPLEMENTED(); // WMMA must be supported to issue binary matrix multiply-accumulate instructions.
-
-#endif // defined(CUTLASS_ARCH_WMMA_ENABLED)
-
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace arch
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm80.h b/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm80.h
deleted file mode 100755
index f990c1ac2..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm80.h
+++ /dev/null
@@ -1,1505 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Matrix multiply
-*/
-
-#pragma once
-
-#if defined(__CUDACC_RTC__)
-#include <cuda/std/cassert>
-#else
-#include <assert.h>
-#endif
-
-#include "cutlass/cutlass.h"
-#include "mma.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/numeric_types.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-#if ((__CUDACC_VER_MAJOR__ > 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))
-
-#define CUTLASS_ARCH_MMA_SM80_SUPPORTED 1
-
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
-#define CUTLASS_ARCH_MMA_SM80_ENABLED
-
-#if (__CUDA_ARCH__ <= 900)
-#define CUTLASS_ARCH_MMA_B1_AND_SM80_ENABLED
-#endif
-#if (__CUDA_ARCH__ <= 890)
-#define CUTLASS_ARCH_MMA_B1_XOR_SM80_ENABLED
-#endif
-
-#endif
-
-#endif
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace arch {
-
-////////////////////////////////////////////////////////////////////////////////
-//
-// Matrix Multiply 1688 - Float BF16, FP32 accumulation
-//
-////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation - F32 = bf16 * bf16 + F32
-template <>
-struct Mma<
-  gemm::GemmShape<16, 8, 8>,
-  32,
-  bfloat16_t,
-  layout::RowMajor,
-  bfloat16_t,
-  layout::ColumnMajor,
-  float,
-  layout::RowMajor,
-  OpMultiplyAdd> {
-
-  using Shape = gemm::GemmShape<16, 8, 8>;
-
-  using ElementA = bfloat16_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<bfloat16_t, 4>;
-
-  using ElementB = bfloat16_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<bfloat16_t, 2>;
-
-  using ElementC = float;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<float, 4>;
-
-  using Operator = OpMultiplyAdd;
-  using ArchTag = arch::Sm80;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
-                  FragmentC const &c) const {
-
-#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
-
-  uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-  uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
-  float const *C = reinterpret_cast<float const *>(&c);
-  float *D = reinterpret_cast<float *>(&d);
-
-  asm(
-      "mma.sync.aligned.m16n8k8.row.col.f32.bf16.bf16.f32 "
-      "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
-      : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
-      : 
-        "r"(A[0]), "r"(A[1]), 
-        "r"(B[0]), 
-        "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])
-  );
-
-#else
-
-    CUTLASS_UNUSED(d);
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_NOT_IMPLEMENTED();
-
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-//
-// Matrix Multiply 1684 - Float TF32
-//
-////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation: F32 = tf32 * tf32 + F32
-template <>
-struct Mma<
-  gemm::GemmShape<16, 8, 4>,
-  32,
-  tfloat32_t,
-  layout::RowMajor,
-  tfloat32_t,
-  layout::ColumnMajor,
-  float,
-  layout::RowMajor,
-  OpMultiplyAdd> {
-
-  using Shape = gemm::GemmShape<16, 8, 4>;
-
-  using ElementA = tfloat32_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<tfloat32_t, 2>;
-
-  using ElementB = tfloat32_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<tfloat32_t, 1>;
-
-  using ElementC = float;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<float, 4>;
-
-  using Operator = OpMultiplyAdd;
-  using ArchTag = arch::Sm80;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c
-  ) const {
-
-#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
-
-  uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-  uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
-  float const *C = reinterpret_cast<float const *>(&c);
-  float *D = reinterpret_cast<float *>(&d);
-
-  asm volatile(
-      "mma.sync.aligned.m16n8k4.row.col.f32.tf32.tf32.f32 {%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
-      : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
-      : 
-        "r"(A[0]), "r"(A[1]), 
-        "r"(B[0]), 
-        "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])
-  );
-
-#else
-
-    CUTLASS_UNUSED(d);
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_NOT_IMPLEMENTED();
-
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-//
-// Matrix Multiply 1688 - Float TF32
-//
-////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation: F32 = tf32 * tf32 + F32
-template <>
-struct Mma<gemm::GemmShape<16, 8, 8>, 32, tfloat32_t, layout::RowMajor,
-           tfloat32_t, layout::ColumnMajor, float, layout::RowMajor,
-           OpMultiplyAdd> {
-  using Shape = gemm::GemmShape<16, 8, 8>;
-
-  using ElementA = tfloat32_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<tfloat32_t, 4>;
-
-  using ElementB = tfloat32_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<tfloat32_t, 2>;
-
-  using ElementC = float;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<float, 4>;
-
-  using Operator = OpMultiplyAdd;
-  using ArchTag = arch::Sm80;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
-                  FragmentC const &c) const {
-
-#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
-
-    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
-    float const *C = reinterpret_cast<float const *>(&c);
-    float *D = reinterpret_cast<float *>(&d);
-
-    asm volatile(
-        "mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 "
-        "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-        : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
-        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
-          "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]));
-
-#else
-
-    CUTLASS_UNUSED(d);
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_NOT_IMPLEMENTED();
-
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-//
-// Matrix Multiply 16816
-//
-////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation: F16 = F16 * F16 + F16
-template <>
-struct Mma<
-  gemm::GemmShape<16, 8, 16>,
-  32,
-  half_t,
-  layout::RowMajor,
-  half_t,
-  layout::ColumnMajor,
-  half_t,
-  layout::RowMajor,
-  OpMultiplyAdd> {
-
-  using Shape = gemm::GemmShape<16, 8, 16>;
-
-  using ElementA = half_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<half_t, 8>;
-
-  using ElementB = half_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<half_t, 4>;
-
-  using ElementC = half_t;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<half_t, 4>;
-
-  using Operator = OpMultiplyAdd;
-  using ArchTag = arch::Sm80;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
-                  FragmentC const &c) const {
-
-#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
-
-  uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-  uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
-  uint32_t const *C = reinterpret_cast<uint32_t const *>(&c);
-  uint32_t *D = reinterpret_cast<uint32_t *>(&d);
-
-  asm volatile("mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%0,%1}, {%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n"
-      : "=r"(D[0]), "=r"(D[1])
-      : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]),
-        "r"(B[0]), "r"(B[1]),
-        "r"(C[0]), "r"(C[1])
-  );
-
-#else
-
-    CUTLASS_UNUSED(d);
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_NOT_IMPLEMENTED();
-
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation: F32 = bf16 * bf16 + F32
-template <>
-struct Mma<
-  gemm::GemmShape<16, 8, 16>,
-  32,
-  bfloat16_t,
-  layout::RowMajor,
-  bfloat16_t,
-  layout::ColumnMajor,
-  float,
-  layout::RowMajor,
-  OpMultiplyAdd> {
-
-  using Shape = gemm::GemmShape<16, 8, 16>;
-
-  using ElementA = bfloat16_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<bfloat16_t, 8>;
-
-  using ElementB = bfloat16_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<bfloat16_t, 4>;
-
-  using ElementC = float;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<float, 4>;
-
-  using Operator = OpMultiplyAdd;
-  using ArchTag = arch::Sm80;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c
-  ) const {
-
-#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
-
-    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
-    float const *C = reinterpret_cast<float const *>(&c);
-    float *D = reinterpret_cast<float *>(&d);
-
-    asm volatile(
-        "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
-        "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-        : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
-        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
-          "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]));
-
-#else
-
-    CUTLASS_UNUSED(d);
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_NOT_IMPLEMENTED();
-
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation: F32 = F16 * F16 + F32
-template <>
-struct Mma<
-  gemm::GemmShape<16, 8, 16>,
-  32,
-  half_t,
-  layout::RowMajor,
-  half_t,
-  layout::ColumnMajor,
-  float,
-  layout::RowMajor,
-  OpMultiplyAdd> {
-
-  using Shape = gemm::GemmShape<16, 8, 16>;
-
-  using ElementA = half_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<half_t, 8>;
-
-  using ElementB = half_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<half_t, 4>;
-
-  using ElementC = float;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<float, 4>;
-
-  using Operator = OpMultiplyAdd;
-  using ArchTag = arch::Sm80;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c
-  ) const {
-
-#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
-
-    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
-    float const *C = reinterpret_cast<float const *>(&c);
-    float *D = reinterpret_cast<float *>(&d);
-
-    asm volatile(
-        "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32  {%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, "
-        "{%10,%11,%12,%13};\n"
-        : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
-        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
-          "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]));
-
-#else
-
-    CUTLASS_UNUSED(d);
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_NOT_IMPLEMENTED();
-
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-//
-// Matrix Multiply 884 - F64
-//
-////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation: F64 = F64 * F64 + F64
-template <>
-struct Mma<
-  gemm::GemmShape<8,8,4>,
-  32,
-  double,
-  layout::RowMajor,
-  double,
-  layout::ColumnMajor,
-  double,
-  layout::RowMajor,
-  OpMultiplyAdd> {
-
-  using Shape = gemm::GemmShape<8,8,4>;
-
-  using ElementA = double;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<double, 1>;
-
-  using ElementB = double;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<double, 1>;
-
-  using ElementC = double;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<double, 2>;
-
-  using Operator = OpMultiplyAdd;
-
-  using ArchTag = arch::Sm80;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
-                  FragmentC const &c) const {
-
-#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
-
-  double const & A = reinterpret_cast<double const &>(a);
-  double const & B = reinterpret_cast<double const &>(b);
-
-  double const *C = reinterpret_cast<double const *>(&c);
-  double *D = reinterpret_cast<double *>(&d);
-
-  asm volatile("mma.sync.aligned.m8n8k4.row.col.f64.f64.f64.f64 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
-      : "=d"(D[0]), "=d"(D[1])
-      : "d"(A), "d"(B), "d"(C[0]), "d"(C[1]));
-
-#else
-
-    CUTLASS_UNUSED(d);
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_NOT_IMPLEMENTED();
-    
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-//
-// Matrix Multiply 16816 - S8 input, S32 accumulation - SATURATE
-//
-////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation: S32 = S8 * S8 + S32
-template <>
-struct Mma<
-  gemm::GemmShape<16,8,16>,
-  32,
-  int8_t,
-  layout::RowMajor,
-  int8_t,
-  layout::ColumnMajor,
-  int,
-  layout::RowMajor,
-  OpMultiplyAddSaturate> {
-
-  using Shape = gemm::GemmShape<16,8,16>;
-
-  using ElementA = int8_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<int8_t, 8>;
-
-  using ElementB = int8_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<int8_t, 4>;
-
-  using ElementC = int;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<int, 4>;
-
-  using Operator = OpMultiplyAddSaturate;
-  using ArchTag = arch::Sm80;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c
-  ) const {
-
-#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
-
-    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-    uint32_t const &B = reinterpret_cast<uint32_t const &>(b);
-
-    int const *C = reinterpret_cast<int const *>(&c);
-    int *D = reinterpret_cast<int *>(&d);
-
-    asm volatile(
-        "mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite {%0,%1,%2,%3}, {%4,%5}, "
-        "{%6}, {%7,%8,%9,%10};\n"
-        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
-        : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]),
-          "r"(C[3]));
-
-#else
-    assert(0);
-#endif
-  }
-};
-
-/// Matrix multiply-add operation: S32 = U8 * S8 + S32
-template <>
-struct Mma<
-  gemm::GemmShape<16,8,16>,
-  32,
-  uint8_t,
-  layout::RowMajor,
-  int8_t,
-  layout::ColumnMajor,
-  int,
-  layout::RowMajor,
-  OpMultiplyAddSaturate> {
-
-  using Shape = gemm::GemmShape<16,8,16>;
-
-  using ElementA = uint8_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<uint8_t, 8>;
-
-  using ElementB = int8_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<int8_t, 4>;
-
-  using ElementC = int;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<int, 4>;
-
-  using Operator = OpMultiplyAddSaturate;
-  using ArchTag = arch::Sm80;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c
-  ) const {
-
-#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
-
-    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-    uint32_t const &B = reinterpret_cast<uint32_t const &>(b);
-
-    int const *C = reinterpret_cast<int const *>(&c);
-    int *D = reinterpret_cast<int *>(&d);
-
-    asm volatile(
-        "mma.sync.aligned.m16n8k16.row.col.s32.u8.s8.s32.satfinite {%0,%1,%2,%3}, {%4,%5}, "
-        "{%6}, {%7,%8,%9,%10};\n"
-        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
-        : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]),
-          "r"(C[3]));
-
-#else
-    assert(0);
-#endif
-  }
-};
-
-/// Matrix multiply-add operation: S32 = S8 * U8 + S32
-template <>
-struct Mma<
-  gemm::GemmShape<16,8,16>,
-  32,
-  int8_t,
-  layout::RowMajor,
-  uint8_t,
-  layout::ColumnMajor,
-  int,
-  layout::RowMajor,
-  OpMultiplyAddSaturate> {
-
-  using Shape = gemm::GemmShape<16,8,16>;
-
-  using ElementA = int8_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<int8_t, 8>;
-
-  using ElementB = uint8_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<uint8_t, 4>;
-
-  using ElementC = int;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<int, 4>;
-
-  using Operator = OpMultiplyAddSaturate;
-  using ArchTag = arch::Sm80;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c
-  ) const {
-
-#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
-
-    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-    uint32_t const &B = reinterpret_cast<uint32_t const &>(b);
-
-    int const *C = reinterpret_cast<int const *>(&c);
-    int *D = reinterpret_cast<int *>(&d);
-
-    asm volatile(
-        "mma.sync.aligned.m16n8k16.row.col.s32.s8.u8.s32.satfinite {%0,%1,%2,%3}, {%4,%5}, "
-        "{%6}, {%7,%8,%9,%10};\n"
-        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
-        : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]),
-          "r"(C[3]));
-    
-#else
-    assert(0);
-#endif
-  }
-};
-
-/// Matrix multiply-add operation: S32 = U8 * U8 + S32
-template <>
-struct Mma<
-  gemm::GemmShape<16,8,16>,
-  32,
-  uint8_t,
-  layout::RowMajor,
-  uint8_t,
-  layout::ColumnMajor,
-  int,
-  layout::RowMajor,
-  OpMultiplyAddSaturate> {
-
-  using Shape = gemm::GemmShape<16,8,16>;
-
-  using ElementA = uint8_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<uint8_t, 8>;
-
-  using ElementB = uint8_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<uint8_t, 4>;
-
-  using ElementC = int;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<int, 4>;
-
-  using Operator = OpMultiplyAddSaturate;
-  using ArchTag = arch::Sm80;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c
-  ) const {
-
-#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
-
-    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-    uint32_t const &B = reinterpret_cast<uint32_t const &>(b);
-
-    int const *C = reinterpret_cast<int const *>(&c);
-    int *D = reinterpret_cast<int *>(&d);
-
-    asm volatile(
-        "mma.sync.aligned.m16n8k16.row.col.s32.u8.u8.s32.satfinite {%0,%1,%2,%3}, {%4,%5}, "
-        "{%6}, {%7,%8,%9,%10};\n"
-        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
-        : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]),
-          "r"(C[3]));
-
-#else
-    assert(0);
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-//
-// Matrix Multiply 16832 - S8 input, S32 accumulation - SATURATE
-//
-////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation: S32 = S8 * S8 + S32
-template <>
-struct Mma<
-  gemm::GemmShape<16,8,32>,
-  32,
-  int8_t,
-  layout::RowMajor,
-  int8_t,
-  layout::ColumnMajor,
-  int,
-  layout::RowMajor,
-  OpMultiplyAddSaturate> {
-
-  using Shape = gemm::GemmShape<16,8,32>;
-
-  using ElementA = int8_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<int8_t, 16>;
-
-  using ElementB = int8_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<int8_t, 8>;
-
-  using ElementC = int;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<int, 4>;
-
-  using Operator = OpMultiplyAddSaturate;
-  using ArchTag = arch::Sm80;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c
-  ) const {
-
-#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
-
-  uint32_t const * A = reinterpret_cast<uint32_t const *>(&a);
-  uint32_t const * B = reinterpret_cast<uint32_t const *>(&b);
-
-  int const *C = reinterpret_cast<int const *>(&c);
-  int *D = reinterpret_cast<int *>(&d);
-
-  asm volatile(
-      "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite {%0,%1,%2,%3}, "
-      "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-      : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
-      : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
-        "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
-
-#else
-    assert(0);
-#endif
-  }
-};
-
-/// Matrix multiply-add operation: S32 = U8 * S8 + S32
-template <>
-struct Mma<
-  gemm::GemmShape<16,8,32>,
-  32,
-  uint8_t,
-  layout::RowMajor,
-  int8_t,
-  layout::ColumnMajor,
-  int,
-  layout::RowMajor,
-  OpMultiplyAddSaturate> {
-
-  using Shape = gemm::GemmShape<16,8,32>;
-
-  using ElementA = uint8_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<uint8_t, 16>;
-
-  using ElementB = int8_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<int8_t, 8>;
-
-  using ElementC = int;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<int, 4>;
-
-  using Operator = OpMultiplyAddSaturate;
-  using ArchTag = arch::Sm80;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c
-  ) const {
-
-#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
-
-    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
-
-    int const *C = reinterpret_cast<int const *>(&c);
-    int *D = reinterpret_cast<int *>(&d);
-
-    asm volatile(
-        "mma.sync.aligned.m16n8k32.row.col.s32.u8.s8.s32.satfinite {%0,%1,%2,%3}, "
-        "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
-        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
-          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
-
-#else
-    assert(0);
-#endif
-  }
-};
-
-/// Matrix multiply-add operation: S32 = S8 * U8 + S32
-template <>
-struct Mma<
-  gemm::GemmShape<16,8,32>,
-  32,
-  int8_t,
-  layout::RowMajor,
-  uint8_t,
-  layout::ColumnMajor,
-  int,
-  layout::RowMajor,
-  OpMultiplyAddSaturate> {
-
-  using Shape = gemm::GemmShape<16,8,32>;
-
-  using ElementA = int8_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<int8_t, 16>;
-
-  using ElementB = uint8_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<uint8_t, 8>;
-
-  using ElementC = int;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<int, 4>;
-
-  using Operator = OpMultiplyAddSaturate;
-  using ArchTag = arch::Sm80;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c
-  ) const {
-
-#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
-
-    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
-
-    int const *C = reinterpret_cast<int const *>(&c);
-    int *D = reinterpret_cast<int *>(&d);
-
-    asm volatile(
-        "mma.sync.aligned.m16n8k32.row.col.s32.s8.u8.s32.satfinite {%0,%1,%2,%3}, "
-        "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
-        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
-          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
-
-#else
-    assert(0);
-#endif
-  }
-};
-
-/// Matrix multiply-add operation: S32 = U8 * U8 + S32
-template <>
-struct Mma<
-  gemm::GemmShape<16,8,32>,
-  32,
-  uint8_t,
-  layout::RowMajor,
-  uint8_t,
-  layout::ColumnMajor,
-  int,
-  layout::RowMajor,
-  OpMultiplyAddSaturate> {
-
-  using Shape = gemm::GemmShape<16,8,32>;
-
-  using ElementA = uint8_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<uint8_t, 16>;
-
-  using ElementB = uint8_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<uint8_t, 8>;
-
-  using ElementC = int;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<int, 4>;
-
-  using Operator = OpMultiplyAddSaturate;
-  using ArchTag = arch::Sm80;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c
-  ) const {
-
-#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
-
-    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
-
-    int const *C = reinterpret_cast<int const *>(&c);
-    int *D = reinterpret_cast<int *>(&d);
-
-    asm volatile(
-        "mma.sync.aligned.m16n8k32.row.col.s32.u8.u8.s32.satfinite {%0,%1,%2,%3}, "
-        "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
-        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
-          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
-
-#else
-    assert(0);
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-//
-// Matrix Multiply 16864 - S4 input, S32 accumulation - SATURATE
-//
-////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation: S32 = S4 * S4 + S32
-template <>
-struct Mma<
-  gemm::GemmShape<16, 8, 64>,
-  32,
-  cutlass::int4b_t,
-  layout::RowMajor,
-  cutlass::int4b_t,
-  layout::ColumnMajor,
-  int,
-  layout::RowMajor,
-  OpMultiplyAddSaturate> {
-
-  using Shape = gemm::GemmShape<16, 8, 64>;
-
-  using ElementA = cutlass::int4b_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<cutlass::int4b_t, 32>;
-
-  using ElementB = cutlass::int4b_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<cutlass::int4b_t, 16>;
-
-  using ElementC = int;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<int, 4>;
-
-  using Operator = OpMultiplyAddSaturate;
-  using ArchTag = arch::Sm80;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c
-  ) const {
-
-#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
-
-  uint32_t const * A = reinterpret_cast<uint32_t const *>(&a);
-  uint32_t const * B = reinterpret_cast<uint32_t const *>(&b);
-
-  int const *C = reinterpret_cast<int const *>(&c);
-  int *D = reinterpret_cast<int *>(&d);
-
-  asm volatile(
-      "mma.sync.aligned.m16n8k64.row.col.s32.s4.s4.s32.satfinite {%0,%1,%2,%3}, "
-      "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-      : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
-      : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
-        "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
-
-#else
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_UNUSED(d);
-    assert(0);
-#endif
-  }
-};
-
-/// Matrix multiply-add operation: S32 = U4 * S4 + S32
-template <>
-struct Mma<
-  gemm::GemmShape<16, 8, 64>,
-  32,
-  cutlass::uint4b_t,
-  layout::RowMajor,
-  cutlass::int4b_t,
-  layout::ColumnMajor,
-  int,
-  layout::RowMajor,
-  OpMultiplyAddSaturate> {
-
-  using Shape = gemm::GemmShape<16, 8, 64>;
-
-  using ElementA = cutlass::uint4b_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<cutlass::uint4b_t, 32>;
-
-  using ElementB = cutlass::int4b_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<cutlass::int4b_t, 16>;
-
-  using ElementC = int;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<int, 4>;
-
-  using Operator = OpMultiplyAddSaturate;
-  using ArchTag = arch::Sm80;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c
-  ) const {
-
-#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
-
-    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
-
-    int const *C = reinterpret_cast<int const *>(&c);
-    int *D = reinterpret_cast<int *>(&d);
-
-    asm volatile(
-        "mma.sync.aligned.m16n8k64.row.col.s32.u4.s4.s32.satfinite {%0,%1,%2,%3}, "
-        "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
-        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
-          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
-
-#else
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_UNUSED(d);
-    assert(0);
-#endif
-  }
-};
-
-/// Matrix multiply-add operation: S32 = S4 * U4 + S32
-template <>
-struct Mma<
-  gemm::GemmShape<16, 8, 64>,
-  32,
-  cutlass::int4b_t,
-  layout::RowMajor,
-  cutlass::uint4b_t,
-  layout::ColumnMajor,
-  int,
-  layout::RowMajor,
-  OpMultiplyAddSaturate> {
-
-  using Shape = gemm::GemmShape<16, 8, 64>;
-
-  using ElementA = cutlass::int4b_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<cutlass::int4b_t, 32>;
-
-  using ElementB = cutlass::uint4b_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<cutlass::uint4b_t, 16>;
-
-  using ElementC = int;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<int, 4>;
-
-  using Operator = OpMultiplyAddSaturate;
-  using ArchTag = arch::Sm80;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c
-  ) const {
-
-#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
-
-    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
-
-    int const *C = reinterpret_cast<int const *>(&c);
-    int *D = reinterpret_cast<int *>(&d);
-
-    asm volatile(
-        "mma.sync.aligned.m16n8k64.row.col.s32.s4.u4.s32.satfinite {%0,%1,%2,%3}, "
-        "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
-        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
-          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
-
-#else
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_UNUSED(d);
-    assert(0);
-#endif
-  }
-};
-
-/// Matrix multiply-add operation: S32 = U4 * U4 + S32
-template <>
-struct Mma<
-  gemm::GemmShape<16, 8, 64>,
-  32,
-  cutlass::uint4b_t,
-  layout::RowMajor,
-  cutlass::uint4b_t,
-  layout::ColumnMajor,
-  int,
-  layout::RowMajor,
-  OpMultiplyAddSaturate> {
-
-  using Shape = gemm::GemmShape<16, 8, 64>;
-
-  using ElementA = cutlass::uint4b_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<cutlass::uint4b_t, 32>;
-
-  using ElementB = cutlass::uint4b_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<cutlass::uint4b_t, 16>;
-
-  using ElementC = int;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<int, 4>;
-
-  using Operator = OpMultiplyAddSaturate;
-  using ArchTag = arch::Sm80;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c
-  ) const {
-
-#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
-
-    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
-
-    int const *C = reinterpret_cast<int const *>(&c);
-    int *D = reinterpret_cast<int *>(&d);
-
-    asm volatile(
-        "mma.sync.aligned.m16n8k64.row.col.s32.u4.u4.s32.satfinite {%0,%1,%2,%3}, "
-        "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
-        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
-          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
-
-#else
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_UNUSED(d);
-    assert(0);
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-//
-// Matrix Multiply 168256 - B1 input, S32 accumulation - AND,POPC
-//
-////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation: S32 = B1 & B1 + S32
-template <>
-struct Mma<
-  gemm::GemmShape<16,8,256>,
-  32,
-  cutlass::uint1b_t,
-  layout::RowMajor,
-  cutlass::uint1b_t,
-  layout::ColumnMajor,
-  int32_t,
-  layout::RowMajor,
-  OpAndPopc> {
-
-  using Shape = gemm::GemmShape<16,8,256>;
-
-  using ElementA = cutlass::uint1b_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<cutlass::uint1b_t, 128>;
-
-  using ElementB = cutlass::uint1b_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<cutlass::uint1b_t, 64>;
-
-  using ElementC = int32_t;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<int32_t, 4>;
-
-  using Operator = OpAndPopc;
-  using ArchTag = arch::Sm80;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c
-  ) const {
-
-#if defined(CUTLASS_ARCH_MMA_B1_AND_SM80_ENABLED)
-
-    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
-
-    int const *C = reinterpret_cast<int const *>(&c);
-    int *D = reinterpret_cast<int *>(&d);
-
-    asm volatile(
-        "mma.sync.aligned.m16n8k256.row.col.s32.b1.b1.s32.and.popc {%0,%1,%2,%3}, "
-        "{%4,%5,%6,%7}, "
-        "{%8,%9}, {%10,%11,%12,%13};\n"
-        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
-        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
-          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
-
-#else
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_UNUSED(d);
-    assert(0);
-#endif
-  }
-};
-
-/// Matrix multiply-add operation: S32 = B1 & B1 + S32
-template <>
-struct Mma<
-  gemm::GemmShape<16,8,256>,
-  32,
-  cutlass::uint1b_t,
-  layout::RowMajor,
-  cutlass::uint1b_t,
-  layout::ColumnMajor,
-  int,
-  layout::RowMajor,
-  OpMultiplyAdd> {
-
-  using Shape = gemm::GemmShape<16,8,256>;
-
-  using ElementA = cutlass::uint1b_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<cutlass::uint1b_t, 128>;
-
-  using ElementB = cutlass::uint1b_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<cutlass::uint1b_t, 64>;
-
-  using ElementC = int32_t;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<int32_t, 4>;
-
-  using Operator = OpMultiplyAdd;
-  using ArchTag = arch::Sm80;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c
-  ) const {
-
-#if defined(CUTLASS_ARCH_MMA_B1_AND_SM80_ENABLED)
-
-    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
-
-    int const *C = reinterpret_cast<int const *>(&c);
-    int *D = reinterpret_cast<int *>(&d);
-
-    asm volatile(
-        "mma.sync.aligned.m16n8k256.row.col.s32.b1.b1.s32.and.popc {%0,%1,%2,%3}, "
-        "{%4,%5,%6,%7}, "
-        "{%8,%9}, {%10,%11,%12,%13};\n"
-        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
-        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
-          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
-
-#else
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_UNUSED(d);
-    assert(0);
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-//
-// Matrix Multiply 168256 - B1 input, S32 accumulation - XOR,POPC
-//
-////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation: S32 = B1 & B1 + S32
-template <>
-struct Mma<
-  gemm::GemmShape<16,8,256>,
-  32,
-  cutlass::uint1b_t,
-  layout::RowMajor,
-  cutlass::uint1b_t,
-  layout::ColumnMajor,
-  int,
-  layout::RowMajor,
-  OpXorPopc> {
-
-  using Shape = gemm::GemmShape<16,8,256>;
-
-  using ElementA = cutlass::uint1b_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<cutlass::uint1b_t, 128>;
-
-  using ElementB = cutlass::uint1b_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<cutlass::uint1b_t, 64>;
-
-  using ElementC = int;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<int, 4>;
-
-  using Operator = OpXorPopc;
-  using ArchTag = arch::Sm80;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c
-  ) const {
-
-#if defined(CUTLASS_ARCH_MMA_B1_XOR_SM80_ENABLED)
-
-    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
-
-    int const *C = reinterpret_cast<int const *>(&c);
-    int *D = reinterpret_cast<int *>(&d);
-
-    asm volatile(
-        "mma.sync.aligned.m16n8k256.row.col.s32.b1.b1.s32.xor.popc {%0,%1,%2,%3}, "
-        "{%4,%5,%6,%7}, "
-        "{%8,%9}, {%10,%11,%12,%13};\n"
-        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
-        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
-          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
-
-#else
-    
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_UNUSED(d);
-    assert(0);
-
-#endif // defined(CUTLASS_ARCH_MMA_B1_XOR_SM80_ENABLED)
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace arch
-} // namespace cutlass
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm89.h b/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm89.h
deleted file mode 100755
index fe4b7eb7e..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm89.h
+++ /dev/null
@@ -1,367 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Matrix multiply-accumulate specialzied for SM89
-*/
-
-#pragma once
-
-#if defined(__CUDACC_RTC__)
-#include <cuda/std/cassert>
-#else
-#include <assert.h>
-#endif
-
-#include "cutlass/cutlass.h"
-#include "mma.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/numeric_types.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-#if (__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 4)
-
-#  define CUTLASS_ARCH_MMA_SM89_SUPPORTED 1
-#endif
-
-#if defined(CUTLASS_ARCH_MMA_SM89_SUPPORTED) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 890)
-#  define CUTLASS_ARCH_MMA_SM89_ENABLED
-#endif
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace arch {
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace detail {
-
-// Whether the Mma uses as SM89 staged accumulation policy
-template <class Operator>
-static constexpr bool is_sm89_staged_policy_v =
-  (
-    // ElementA must be FP8
-    platform::is_same<typename Operator::ElementA, cutlass::float_e4m3_t>::value ||
-    platform::is_same<typename Operator::ElementA, cutlass::float_e5m2_t>::value
-  ) &&
-  (
-    // ElementB must be FP8
-    platform::is_same<typename Operator::ElementB, cutlass::float_e4m3_t>::value ||
-    platform::is_same<typename Operator::ElementB, cutlass::float_e5m2_t>::value
-  ) &&
-  (
-    // The instruction shape must be 16x8x32
-    Operator::ArchMmaOperator::Shape::kM == 16 &&
-    Operator::ArchMmaOperator::Shape::kN == 8 &&
-    Operator::ArchMmaOperator::Shape::kK == 32
-  ) &&
-  (
-    // The operator must be OpMultiplyAdd (default)
-    platform::is_same<typename Operator::MathOperator, OpMultiplyAdd>::value
-  );
-} // namespace detail
-
-////////////////////////////////////////////////////////////////////////////////
-
-////////////////////////////////////////////////////////////////////////////////
-//
-// Matrix Multiply 16832 - Float {E4M3, E5M2}, FP32 accumulation
-//
-////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation - F32 = fe4m3 * fe4m3 + F32
-template <typename Operator_>
-struct Mma<
-  gemm::GemmShape<16, 8, 32>,
-  32,
-  cutlass::float_e4m3_t,
-  layout::RowMajor,
-  cutlass::float_e4m3_t,
-  layout::ColumnMajor,
-  float,
-  layout::RowMajor,
-  Operator_> {
-  static_assert(platform::is_same<Operator_, OpMultiplyAdd>::value ||
-                platform::is_same<Operator_, OpMultiplyAddFastAccum>::value,
-                "Invalid operator for SM89 FP8 instruction");
-
-  using Shape = gemm::GemmShape<16, 8, 32>;
-
-  using ElementA = cutlass::float_e4m3_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<ElementA, 16>;
-
-  using ElementB = cutlass::float_e4m3_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<ElementB, 8>;
-
-  using ElementC = float;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<float, 4>;
-
-  using Operator = Operator_;
-  using ArchTag = arch::Sm89;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
-                  FragmentC const &c) const {
-
-#if defined(CUTLASS_ARCH_MMA_SM89_ENABLED)
-
-  uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-  uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
-  float const *C = reinterpret_cast<float const *>(&c);
-  float *D = reinterpret_cast<float *>(&d);
-
-  asm(
-      "mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32 "
-      "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-      : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
-      :
-        "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]),
-        "r"(B[0]), "r"(B[1]),
-        "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])
-  );
-
-#else
-
-    CUTLASS_UNUSED(d);
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_NOT_IMPLEMENTED();
-
-#endif
-  }
-};
-
-/// Matrix multiply-add operation - F32 = fe4m3 * fe5m2 + F32
-template <typename Operator_>
-struct Mma<
-  gemm::GemmShape<16, 8, 32>,
-  32,
-  cutlass::float_e4m3_t,
-  layout::RowMajor,
-  cutlass::float_e5m2_t,
-  layout::ColumnMajor,
-  float,
-  layout::RowMajor,
-  Operator_> {
-  static_assert(platform::is_same<Operator_, OpMultiplyAdd>::value ||
-                platform::is_same<Operator_, OpMultiplyAddFastAccum>::value,
-                "Invalid operator for SM89 FP8 instruction");
-
-  using Shape = gemm::GemmShape<16, 8, 32>;
-
-  using ElementA = cutlass::float_e4m3_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<ElementA, 16>;
-
-  using ElementB = cutlass::float_e5m2_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<ElementB, 8>;
-
-  using ElementC = float;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<float, 4>;
-
-  using Operator = Operator_;
-  using ArchTag = arch::Sm89;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
-                  FragmentC const &c) const {
-
-#if defined(CUTLASS_ARCH_MMA_SM89_ENABLED)
-
-  uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-  uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
-  float const *C = reinterpret_cast<float const *>(&c);
-  float *D = reinterpret_cast<float *>(&d);
-
-  asm(
-      "mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e5m2.f32 "
-      "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-      : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
-      :
-        "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]),
-        "r"(B[0]), "r"(B[1]),
-        "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])
-  );
-
-#else
-
-    CUTLASS_UNUSED(d);
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_NOT_IMPLEMENTED();
-
-#endif
-  }
-};
-
-/// Matrix multiply-add operation - F32 = fe5m2 * fe4m3 + F32
-template <typename Operator_>
-struct Mma<
-  gemm::GemmShape<16, 8, 32>,
-  32,
-  cutlass::float_e5m2_t,
-  layout::RowMajor,
-  cutlass::float_e4m3_t,
-  layout::ColumnMajor,
-  float,
-  layout::RowMajor,
-  Operator_> {
-  static_assert(platform::is_same<Operator_, OpMultiplyAdd>::value ||
-                platform::is_same<Operator_, OpMultiplyAddFastAccum>::value,
-                "Invalid operator for SM89 FP8 instruction");
-
-  using Shape = gemm::GemmShape<16, 8, 32>;
-
-  using ElementA = cutlass::float_e5m2_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<ElementA, 16>;
-
-  using ElementB = cutlass::float_e4m3_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<ElementB, 8>;
-
-  using ElementC = float;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<float, 4>;
-
-  using Operator = Operator_;
-  using ArchTag = arch::Sm89;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
-                  FragmentC const &c) const {
-
-#if defined(CUTLASS_ARCH_MMA_SM89_ENABLED)
-
-  uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-  uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
-  float const *C = reinterpret_cast<float const *>(&c);
-  float *D = reinterpret_cast<float *>(&d);
-
-  asm(
-      "mma.sync.aligned.m16n8k32.row.col.f32.e5m2.e4m3.f32 "
-      "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-      : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
-      :
-        "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]),
-        "r"(B[0]), "r"(B[1]),
-        "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])
-  );
-
-#else
-
-    CUTLASS_UNUSED(d);
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_NOT_IMPLEMENTED();
-
-#endif
-  }
-};
-
-/// Matrix multiply-add operation - F32 = fe5m2 * fe5m2 + F32
-template <typename Operator_>
-struct Mma<
-  gemm::GemmShape<16, 8, 32>,
-  32,
-  cutlass::float_e5m2_t,
-  layout::RowMajor,
-  cutlass::float_e5m2_t,
-  layout::ColumnMajor,
-  float,
-  layout::RowMajor,
-  Operator_> {
-  static_assert(platform::is_same<Operator_, OpMultiplyAdd>::value ||
-                platform::is_same<Operator_, OpMultiplyAddFastAccum>::value,
-                "Invalid operator for SM89 FP8 instruction");
-
-  using Shape = gemm::GemmShape<16, 8, 32>;
-
-  using ElementA = cutlass::float_e5m2_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<ElementA, 16>;
-
-  using ElementB = cutlass::float_e5m2_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<ElementB, 8>;
-
-  using ElementC = float;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<float, 4>;
-
-  using Operator = Operator_;
-  using ArchTag = arch::Sm89;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
-                  FragmentC const &c) const {
-
-#if defined(CUTLASS_ARCH_MMA_SM89_ENABLED)
-
-  uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-  uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
-  float const *C = reinterpret_cast<float const *>(&c);
-  float *D = reinterpret_cast<float *>(&d);
-
-  asm(
-      "mma.sync.aligned.m16n8k32.row.col.f32.e5m2.e5m2.f32 "
-      "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-      : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
-      :
-        "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]),
-        "r"(B[0]), "r"(B[1]),
-        "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])
-  );
-
-#else
-
-    CUTLASS_UNUSED(d);
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_NOT_IMPLEMENTED();
-
-#endif
-  }
-};
-
-} // namespace arch
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm90.h b/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm90.h
deleted file mode 100755
index 1183ee5e0..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/arch/mma_sm90.h
+++ /dev/null
@@ -1,245 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Matrix multiply
-*/
-
-#pragma once
-
-#if defined(__CUDACC_RTC__)
-#include <cuda/std/cassert>
-#else
-#include <assert.h>
-#endif
-
-#include "mma.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/config.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace arch {
-
-////////////////////////////////////////////////////////////////////////////////
-/// Matrix Multiply-Add 16x8x4 fp64
-////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation: F64 = F64 * F64 + F64
-template <>
-struct Mma<
-  gemm::GemmShape<16,8,4>,
-  32,
-  double,
-  layout::RowMajor,
-  double,
-  layout::ColumnMajor,
-  double,
-  layout::RowMajor,
-  OpMultiplyAdd> {
-
-  using Shape = gemm::GemmShape<16,8,4>;
-
-  using ElementA = double;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<double, 2>;
-
-  using ElementB = double;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<double, 1>;
-
-  using ElementC = double;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<double, 4>;
-
-  using Operator = OpMultiplyAdd;
-
-  using ArchTag = arch::Sm90;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
-                  FragmentC const &c) const {
-
-#if defined(CUTLASS_ARCH_MMA_SM90_F64_MMA_ENABLED)
-
-  double const *A = reinterpret_cast<double const *>(&a);
-  double const *B = reinterpret_cast<double const *>(&b);
-
-  double const *C = reinterpret_cast<double const *>(&c);
-  double *D = reinterpret_cast<double *>(&d);
-
-  asm volatile("mma.sync.aligned.m16n8k4.row.col.f64.f64.f64.f64.rn {%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n"
-      : "=d"(D[0]), "=d"(D[1]), "=d"(D[2]), "=d"(D[3])
-      : "d"(A[0]), "d"(A[1]),
-        "d"(B[0]),
-        "d"(C[0]), "d"(C[1]), "d"(C[2]), "d"(C[3]));
-
-#else
-    CUTLASS_UNUSED(d);
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_NOT_IMPLEMENTED();
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-/// Matrix Multiply-Add 16x8x8 fp64
-////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation: F64 = F64 * F64 + F64
-template <>
-struct Mma<
-  gemm::GemmShape<16,8,8>,
-  32,
-  double,
-  layout::RowMajor,
-  double,
-  layout::ColumnMajor,
-  double,
-  layout::RowMajor,
-  OpMultiplyAdd> {
-
-  using Shape = gemm::GemmShape<16,8,8>;
-
-  using ElementA = double;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<double, 4>;
-
-  using ElementB = double;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<double, 2>;
-
-  using ElementC = double;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<double, 4>;
-
-  using Operator = OpMultiplyAdd;
-
-  using ArchTag = arch::Sm90;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
-                  FragmentC const &c) const {
-
-#if defined(CUTLASS_ARCH_MMA_SM90_F64_MMA_ENABLED)
-
-  double const *A = reinterpret_cast<double const *>(&a);
-  double const *B = reinterpret_cast<double const *>(&b);
-
-  double const *C = reinterpret_cast<double const *>(&c);
-  double *D = reinterpret_cast<double *>(&d);
-
-  asm volatile("mma.sync.aligned.m16n8k8.row.col.f64.f64.f64.f64 {%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n"
-      : "=d"(D[0]), "=d"(d[1]), "=d"(d[2]), "=d"(d[3])
-      : "d"(A[0]), "d"(A[1]), "d"(A[2]), "d"(A[3]),
-        "d"(B[0]), "d"(B[1]),
-        "d"(C[0]), "d"(C[1]), "d"(C[2]), "d"(C[3]));
-
-#else
-
-    CUTLASS_UNUSED(d);
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_NOT_IMPLEMENTED();
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-/// Matrix Multiply-Add 16x8x16 fp64
-////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation: F64 = F64 * F64 + F64
-template <>
-struct Mma<
-  gemm::GemmShape<16,8,16>,
-  32,
-  double,
-  layout::RowMajor,
-  double,
-  layout::ColumnMajor,
-  double,
-  layout::RowMajor,
-  OpMultiplyAdd> {
-
-  using Shape = gemm::GemmShape<16,8,16>;
-
-  using ElementA = double;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<double, 8>;
-
-  using ElementB = double;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<double, 4>;
-
-  using ElementC = double;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<double, 4>;
-
-  using Operator = OpMultiplyAdd;
-
-  using ArchTag = arch::Sm90;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
-                  FragmentC const &c) const {
-    
-#if defined(CUTLASS_ARCH_MMA_SM90_F64_MMA_ENABLED)
-
-  double const *A = reinterpret_cast<double const *>(&a);
-  double const *B = reinterpret_cast<double const *>(&b);
-
-  double const *C = reinterpret_cast<double const *>(&c);
-  double *D = reinterpret_cast<double *>(&d);
-
-  asm volatile("mma.sync.aligned.m16n8k16.row.col.f64.f64.f64.f64 {%0, %1, %2, %3}, {%4, %5, %6, %7, %8, %9, %10, %11}, {%12, %13, %14, %15}, {%16, %17, %18, %19};\n"
-      : "=d"(D[0]), "=d"(D[1]), "=d"(D[2]), "=d"(D[3])
-      : "d"(A[0]), "d"(A[2]), "d"(A[2]), "d"(A[3]), "d"(A[4]), "d"(A[5]), "d"(A[6]), "d"(A[7]),
-        "d"(B[0]), "d"(B[1]), "d"(B[2]), "d"(B[3]), 
-        "d"(C[0]), "d"(C[1]), "d"(C[2]), "d"(C[3]));
-
-#else
-    CUTLASS_NOT_IMPLEMENTED();
-#endif
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace arch
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/mma_sparse_sm80.h b/lightllm-kernel/cutlass/include/cutlass/arch/mma_sparse_sm80.h
deleted file mode 100755
index 7041d04dd..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/arch/mma_sparse_sm80.h
+++ /dev/null
@@ -1,1238 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Sparse matrix multiply accumulate for SM80
-*/
-
-#pragma once
-
-#if defined(__CUDACC_RTC__)
-#include <cuda/std/cassert>
-#else
-#include <assert.h>
-#endif
-
-#include "mma.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/numeric_types.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-#if ((__CUDACC_VER_MAJOR__ > 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 1))
-
-#define CUTLASS_ARCH_SPARSE_MMA_SM80_SUPPORTED 1
-
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
-#define CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED
-#endif
-
-#endif
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace arch {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-////////////////////////////////////////////////////////////////////////////////
-//
-// Sparse Matrix Multiply 16832
-//
-////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation: F16 = F16 * F16 + F16
-template <>
-struct SparseMma<
-  gemm::GemmShape<16, 8, 32>,
-  32,
-  half_t,
-  layout::RowMajor,
-  half_t,
-  layout::ColumnMajor,
-  half_t,
-  layout::RowMajor,
-  OpMultiplyAdd,
-  SPFormatType::Thread
-> {
-
-  using Shape = gemm::GemmShape<16, 8, 32>;
-
-  using ElementA = half_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<half_t, 8>;
-
-  using ElementB = half_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<half_t, 8>;
-
-  using ElementC = half_t;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<half_t, 4>;
-
-  using FragmentE = uint32_t;
-
-  using Operator = OpMultiplyAdd;
-  using ArchTag = arch::Sm80;
-
-  static int const kSparse = 2;
-
-  static int const kMetaSizeInBits = 2;
-
-  static int const kMaxID2 = 2;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
-                  FragmentC const &c, uint32_t const &E, int const id2) const {
-
-#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED)
-
-  uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-  uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
-  uint32_t const *C = reinterpret_cast<uint32_t const *>(&c);
-  uint32_t *D = reinterpret_cast<uint32_t *>(&d);
-
-#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5))
-  if (id2 == 0) {
-    asm volatile(
-        "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f16.f16.f16.f16 {%0,%1}, "
-        "{%2,%3,%4,%5}, {%6,%7,%8,%9}, {%10,%11}, %12, 0x0;\n"
-        : "=r"(D[0]), "=r"(D[1])
-        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
-          "r"(B[2]), "r"(B[3]), "r"(C[0]), "r"(C[1]), "r"(E));
-  }
-  else if (id2 == 1) {
-    asm volatile(
-        "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f16.f16.f16.f16 {%0,%1}, "
-        "{%2,%3,%4,%5}, {%6,%7,%8,%9}, {%10,%11}, %12, 0x1;\n"
-        : "=r"(D[0]), "=r"(D[1])
-        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
-          "r"(B[2]), "r"(B[3]), "r"(C[0]), "r"(C[1]), "r"(E));
-  }
-  else {
-    assert(0);
-  }
-#else
-  if (id2 == 0) {
-    asm volatile(
-        "mma.sp.sync.aligned.m16n8k32.row.col.f16.f16.f16.f16 {%0,%1}, "
-        "{%2,%3,%4,%5}, {%6,%7,%8,%9}, {%10,%11}, %12, 0x0;\n"
-        : "=r"(D[0]), "=r"(D[1])
-        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
-          "r"(B[2]), "r"(B[3]), "r"(C[0]), "r"(C[1]), "r"(E));
-  }
-  else if (id2 == 1) {
-    asm volatile(
-        "mma.sp.sync.aligned.m16n8k32.row.col.f16.f16.f16.f16 {%0,%1}, "
-        "{%2,%3,%4,%5}, {%6,%7,%8,%9}, {%10,%11}, %12, 0x1;\n"
-        : "=r"(D[0]), "=r"(D[1])
-        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
-          "r"(B[2]), "r"(B[3]), "r"(C[0]), "r"(C[1]), "r"(E));
-  }
-  else {
-    assert(0);
-  }
-#endif
-
-#else
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_UNUSED(d);
-    assert(0);
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation: F32 = F16 * F16 + F32
-template <>
-struct SparseMma<
-  gemm::GemmShape<16, 8, 32>,
-  32,
-  half_t,
-  layout::RowMajor,
-  half_t,
-  layout::ColumnMajor,
-  float,
-  layout::RowMajor,
-  OpMultiplyAdd,
-  SPFormatType::Thread
-  > {
-
-  using Shape = gemm::GemmShape<16, 8, 32>;
-
-  using ElementA = half_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<half_t, 8>;
-
-  using ElementB = half_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<half_t, 8>;
-
-  using ElementC = float;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<float, 4>;
-
-  using FragmentE = uint32_t;
-
-  using Operator = OpMultiplyAdd;
-  using ArchTag = arch::Sm80;
-
-  static int const kSparse = 2;
-
-  static int const kMetaSizeInBits = 2;
-
-  static int const kMaxID2 = 2;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
-                  FragmentC const &c, uint32_t const &E, int const id2) const {
-
-#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED)
-
-  uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-  uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
-  float const *C = reinterpret_cast<float const *>(&c);
-  float *D = reinterpret_cast<float *>(&d);
-
-#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5))
-  if (id2 == 0) {
-    asm volatile(
-        "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, "
-        "{%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
-        : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
-        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
-          "r"(B[2]), "r"(B[3]), "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]),
-          "r"(E));
-  }
-  else if (id2 == 1) {
-    asm volatile(
-        "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, "
-        "{%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x1;\n"
-        : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
-        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
-          "r"(B[2]), "r"(B[3]), "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]),
-          "r"(E));
-  }
-  else {
-    assert(0);
-  }
-#else
-  if (id2 == 0) {
-    asm volatile(
-        "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, "
-        "{%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
-        : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
-        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
-          "r"(B[2]), "r"(B[3]), "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]),
-          "r"(E));
-  }
-  else if (id2 == 1) {
-    asm volatile(
-        "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, "
-        "{%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x1;\n"
-        : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
-        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
-          "r"(B[2]), "r"(B[3]), "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]),
-          "r"(E));
-  }
-  else {
-    assert(0);
-  }
-
-#endif
-
-#else
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_UNUSED(d);
-    assert(0);
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-//
-// Sparse Matrix Multiply 16832 - Float BF16, FP32 accumulation 
-//
-////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation: F32 = bf16 * bf16 + F32
-template <>
-struct SparseMma<gemm::GemmShape<16, 8, 32>, 32, bfloat16_t, layout::RowMajor,
-           bfloat16_t, layout::ColumnMajor, float, layout::RowMajor,
-           OpMultiplyAdd, SPFormatType::Thread> {
-  using Shape = gemm::GemmShape<16, 8, 32>;
-
-  using ElementA = bfloat16_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<bfloat16_t, 8>;
-
-  using ElementB = bfloat16_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<bfloat16_t, 8>;
-
-  using ElementC = float;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<float, 4>;
-
-  using FragmentE = uint32_t;
-
-  using Operator = OpMultiplyAdd;
-  using ArchTag = arch::Sm80;
-
-  static int const kSparse = 2;
-
-  static int const kMetaSizeInBits = 2;
-
-  static int const kMaxID2 = 2;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
-                  FragmentC const &c, uint32_t const &E, int const id2) const {
-
-#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED)
-
-    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
-    float const *C = reinterpret_cast<float const *>(&c);
-    float *D = reinterpret_cast<float *>(&d);
-
-#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5))
-    if (id2 == 0) {
-      asm volatile(
-          "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.bf16.bf16.f32 "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
-          : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
-          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), 
-            "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E));
-    } else if (id2 == 1) {
-      asm volatile(
-          "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.bf16.bf16.f32 "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x1;\n"
-          : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
-          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), 
-            "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E));
-    } else {
-      assert(0);
-    }
-#else
-    if (id2 == 0) {
-      asm volatile(
-          "mma.sp.sync.aligned.m16n8k32.row.col.f32.bf16.bf16.f32 "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
-          : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
-          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), 
-            "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E));
-    } else if (id2 == 1) {
-      asm volatile(
-          "mma.sp.sync.aligned.m16n8k32.row.col.f32.bf16.bf16.f32 "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x1;\n"
-          : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
-          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), 
-            "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E));
-    } else {
-      assert(0);
-    }
-#endif
-
-#else
-
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_UNUSED(d);
-    assert(0);
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-//
-// Sparse Matrix Multiply 16816 - Float TF32
-//
-////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation: F32 = tf32 * tf32 + F32
-template <>
-struct SparseMma<gemm::GemmShape<16, 8, 16>, 32, tfloat32_t, layout::RowMajor,
-           tfloat32_t, layout::ColumnMajor, float, layout::RowMajor,
-           OpMultiplyAdd, SPFormatType::Thread> {
-  using Shape = gemm::GemmShape<16, 8, 16>;
-
-  using ElementA = tfloat32_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<tfloat32_t, 4>;
-
-  using ElementB = tfloat32_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<tfloat32_t, 4>;
-
-  using ElementC = float;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<float, 4>;
-
-  using FragmentE = uint32_t;
-
-  using Operator = OpMultiplyAdd;
-  using ArchTag = arch::Sm80;
-
-  static int const kSparse = 2;
-
-  static int const kMetaSizeInBits = 4;
-
-  static int const kMaxID2 = 2;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
-                  FragmentC const &c, uint32_t const &E, int const id2) const {
-
-#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED)
-
-    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
-    float const *C = reinterpret_cast<float const *>(&c);
-    float *D = reinterpret_cast<float *>(&d);
-
-#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5))
-    if (id2 == 0) {
-      asm volatile(
-          "mma.sp::ordered_metadata.sync.aligned.m16n8k16.row.col.f32.tf32.tf32.f32 "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
-          : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
-          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), 
-            "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E));
-    } else if (id2 == 1) {
-      asm volatile(
-          "mma.sp::ordered_metadata.sync.aligned.m16n8k16.row.col.f32.tf32.tf32.f32 "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x1;\n"
-          : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
-          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), 
-            "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E));
-    } else {
-      assert(0);
-    }
-#else
-    if (id2 == 0) {
-      asm volatile(
-          "mma.sp.sync.aligned.m16n8k16.row.col.f32.tf32.tf32.f32 "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
-          : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
-          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), 
-            "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E));
-    } else if (id2 == 1) {
-      asm volatile(
-          "mma.sp.sync.aligned.m16n8k16.row.col.f32.tf32.tf32.f32 "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x1;\n"
-          : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
-          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), 
-            "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E));
-    } else {
-      assert(0);
-    }
-#endif
-
-#else
-
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_UNUSED(d);
-    assert(0);
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-//
-// Sparse Matrix Multiply 16864 - S8 input, S32 accumulation - SATURATE
-//
-////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation: S32 = S8 * S8 + S32
-template <>
-struct SparseMma<
-  gemm::GemmShape<16,8,64>,
-  32,
-  int8_t,
-  layout::RowMajor,
-  int8_t,
-  layout::ColumnMajor,
-  int,
-  layout::RowMajor,
-  OpMultiplyAddSaturate,
-  SPFormatType::Thread> {
-
-  using Shape = gemm::GemmShape<16,8,64>;
-
-  using ElementA = int8_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<int8_t, 16>;
-
-  using ElementB = int8_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<int8_t, 16>;
-
-  using ElementC = int;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<int, 4>;
-
-  using FragmentE = uint32_t;
-
-  using Operator = OpMultiplyAddSaturate;
-  using ArchTag = arch::Sm80;
-
-  static int const kSparse = 2;
-
-  static int const kMetaSizeInBits = 2;
-
-  static int const kMaxID2 = 1;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c,
-    uint32_t const &E,
-    int const id2
-  ) const {
-
-#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED)
-
-    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
-
-    int const *C = reinterpret_cast<int const *>(&c);
-    int *D = reinterpret_cast<int *>(&d);
-
-#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5))
-    if (id2 == 0) {
-      asm volatile(
-          "mma.sp::ordered_metadata.sync.aligned.m16n8k64.row.col.s32.s8.s8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
-          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
-          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
-          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
-            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
-    } else {
-      assert(0);
-    }
-#else
-    if (id2 == 0) {
-      asm volatile(
-          "mma.sp.sync.aligned.m16n8k64.row.col.s32.s8.s8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
-          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
-          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
-          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
-            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
-    } else {
-      assert(0);
-    }
-#endif
-
-#else
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_UNUSED(d);
-    assert(0);
-#endif
-  }
-};
-
-/// Matrix multiply-add operation: S32 = S8 * U8 + S32
-template <>
-struct SparseMma<
-  gemm::GemmShape<16,8,64>,
-  32,
-  int8_t,
-  layout::RowMajor,
-  uint8_t,
-  layout::ColumnMajor,
-  int,
-  layout::RowMajor,
-  OpMultiplyAddSaturate,
-  SPFormatType::Thread> {
-
-  using Shape = gemm::GemmShape<16,8,64>;
-
-  using ElementA = int8_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<int8_t, 16>;
-
-  using ElementB = uint8_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<uint8_t, 16>;
-
-  using ElementC = int;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<int, 4>;
-
-  using FragmentE = uint32_t;
-
-  using Operator = OpMultiplyAddSaturate;
-  using ArchTag = arch::Sm80;
-
-  static int const kSparse = 2;
-
-  static int const kMetaSizeInBits = 2;
-
-  static int const kMaxID2 = 1;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c,
-    uint32_t const &E,
-    int const id2
-  ) const {
-
-#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED)
-
-    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
-
-    int const *C = reinterpret_cast<int const *>(&c);
-    int *D = reinterpret_cast<int *>(&d);
-
-#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5))
-    if (id2 == 0) {
-      asm volatile(
-          "mma.sp::ordered_metadata.sync.aligned.m16n8k64.row.col.s32.s8.u8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
-          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
-          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
-          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
-            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
-    } else {
-      assert(0);
-    }
-#else
-    if (id2 == 0) {
-      asm volatile(
-          "mma.sp.sync.aligned.m16n8k64.row.col.s32.s8.u8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
-          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
-          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
-          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
-            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
-    } else {
-      assert(0);
-    }
-#endif
-
-#else
-
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_UNUSED(d);
-    assert(0);
-#endif
-  }
-};
-
-/// Matrix multiply-add operation: S32 = U8 * S8 + S32
-template <>
-struct SparseMma<
-  gemm::GemmShape<16,8,64>,
-  32,
-  uint8_t,
-  layout::RowMajor,
-  int8_t,
-  layout::ColumnMajor,
-  int,
-  layout::RowMajor,
-  OpMultiplyAddSaturate,
-  SPFormatType::Thread> {
-
-  using Shape = gemm::GemmShape<16,8,64>;
-
-  using ElementA = uint8_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<uint8_t, 16>;
-
-  using ElementB = int8_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<int8_t, 16>;
-
-  using ElementC = int;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<int, 4>;
-
-  using FragmentE = uint32_t;
-
-  using Operator = OpMultiplyAddSaturate;
-  using ArchTag = arch::Sm80;
-
-  static int const kSparse = 2;
-
-  static int const kMetaSizeInBits = 2;
-
-  static int const kMaxID2 = 1;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c,
-    uint32_t const &E,
-    int const id2
-  ) const {
-
-#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED)
-
-    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
-
-    int const *C = reinterpret_cast<int const *>(&c);
-    int *D = reinterpret_cast<int *>(&d);
-
-#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5))
-    if (id2 == 0) {
-      asm volatile(
-          "mma.sp::ordered_metadata.sync.aligned.m16n8k64.row.col.s32.u8.s8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
-          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
-          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
-          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
-            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
-    } else {
-      assert(0);
-    }
-#else
-    if (id2 == 0) {
-      asm volatile(
-          "mma.sp.sync.aligned.m16n8k64.row.col.s32.u8.s8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
-          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
-          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
-          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
-            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
-    } else {
-      assert(0);
-    }
-#endif
-
-#else
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_UNUSED(d);
-    assert(0);
-#endif
-  }
-};
-
-/// Matrix multiply-add operation: S32 = U8 * U8 + S32
-template <>
-struct SparseMma<
-  gemm::GemmShape<16,8,64>,
-  32,
-  uint8_t,
-  layout::RowMajor,
-  uint8_t,
-  layout::ColumnMajor,
-  int,
-  layout::RowMajor,
-  OpMultiplyAddSaturate,
-  SPFormatType::Thread> {
-
-  using Shape = gemm::GemmShape<16,8,64>;
-
-  using ElementA = uint8_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<uint8_t, 16>;
-
-  using ElementB = uint8_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<uint8_t, 16>;
-
-  using ElementC = int;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<int, 4>;
-
-  using FragmentE = uint32_t;
-
-  using Operator = OpMultiplyAddSaturate;
-  using ArchTag = arch::Sm80;
-
-  static int const kSparse = 2;
-
-  static int const kMetaSizeInBits = 2;
-
-  static int const kMaxID2 = 1;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c,
-    uint32_t const &E,
-    int const id2
-  ) const {
-
-#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED)
-
-    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
-
-    int const *C = reinterpret_cast<int const *>(&c);
-    int *D = reinterpret_cast<int *>(&d);
-
-#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5))
-    if (id2 == 0) {
-      asm volatile(
-          "mma.sp::ordered_metadata.sync.aligned.m16n8k64.row.col.s32.u8.u8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
-          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
-          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
-          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
-            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
-    } else {
-      assert(0);
-    }
-#else
-    if (id2 == 0) {
-      asm volatile(
-          "mma.sp.sync.aligned.m16n8k64.row.col.s32.u8.u8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
-          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
-          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
-          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
-            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
-    } else {
-      assert(0);
-    }
-#endif
-
-#else
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_UNUSED(d);
-    assert(0);
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-//
-// Sparse Matrix Multiply 168128 - S4 input, S32 accumulation - SATURATE
-//
-////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation: S32 = S4 * S4 + S32
-template <>
-struct SparseMma<
-  gemm::GemmShape<16,8,128>,
-  32,
-  cutlass::int4b_t,
-  layout::RowMajor,
-  cutlass::int4b_t,
-  layout::ColumnMajor,
-  int,
-  layout::RowMajor,
-  OpMultiplyAddSaturate,
-  SPFormatType::Thread> {
-
-  using Shape = gemm::GemmShape<16,8,128>;
-
-  using ElementA = cutlass::int4b_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<cutlass::int4b_t, 32>;
-
-  using ElementB = cutlass::int4b_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<cutlass::int4b_t, 32>;
-
-  using ElementC = int;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<int, 4>;
-
-  using FragmentE = uint32_t;
-
-  using Operator = OpMultiplyAddSaturate;
-  using ArchTag = arch::Sm80;
-
-  static int const kSparse = 2;
-
-  static int const kMetaSizeInBits = 2;
-
-  static int const kMaxID2 = 1;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c,
-    uint32_t const &E,
-    int const id2
-  ) const {
-
-#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED)
-
-    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
-
-    int const *C = reinterpret_cast<int const *>(&c);
-    int *D = reinterpret_cast<int *>(&d);
-
-#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5))
-    if (id2 == 0) {
-      asm volatile(
-          "mma.sp::ordered_metadata.sync.aligned.m16n8k128.row.col.s32.s4.s4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
-          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
-          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
-          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
-            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
-    } else {
-      assert(0);
-    }
-#else
-    if (id2 == 0) {
-      asm volatile(
-          "mma.sp.sync.aligned.m16n8k128.row.col.s32.s4.s4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
-          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
-          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
-          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
-            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
-    } else {
-      assert(0);
-    }
-#endif
-
-#else
-
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_UNUSED(d);
-    assert(0);
-#endif
-  }
-};
-
-/// Matrix multiply-add operation: S32 = S4 * U4 + S32
-template <>
-struct SparseMma<
-  gemm::GemmShape<16,8,128>,
-  32,
-  cutlass::int4b_t,
-  layout::RowMajor,
-  cutlass::uint4b_t,
-  layout::ColumnMajor,
-  int,
-  layout::RowMajor,
-  OpMultiplyAddSaturate,
-  SPFormatType::Thread> {
-
-  using Shape = gemm::GemmShape<16,8,128>;
-
-  using ElementA = cutlass::int4b_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<cutlass::int4b_t, 32>;
-
-  using ElementB = cutlass::uint4b_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<cutlass::uint4b_t, 32>;
-
-  using ElementC = int;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<int, 4>;
-
-  using FragmentE = uint32_t;
-
-  using Operator = OpMultiplyAddSaturate;
-  using ArchTag = arch::Sm80;
-
-  static int const kSparse = 2;
-
-  static int const kMetaSizeInBits = 2;
-
-  static int const kMaxID2 = 1;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c,
-    uint32_t const &E,
-    int const id2
-  ) const {
-
-#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED)
-
-    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
-
-    int const *C = reinterpret_cast<int const *>(&c);
-    int *D = reinterpret_cast<int *>(&d);
-
-#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5))
-    if (id2 == 0) {
-      asm volatile(
-          "mma.sp::ordered_metadata.sync.aligned.m16n8k128.row.col.s32.s4.u4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
-          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
-          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
-          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
-            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
-    } else {
-      assert(0);
-    }
-#else
-    if (id2 == 0) {
-      asm volatile(
-          "mma.sp.sync.aligned.m16n8k128.row.col.s32.s4.u4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
-          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
-          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
-          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
-            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
-    } else {
-      assert(0);
-    }
-#endif
-
-#else
-
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_UNUSED(d);
-    assert(0);
-#endif
-  }
-};
-
-/// Matrix multiply-add operation: S32 = U4 * S4 + S32
-template <>
-struct SparseMma<
-  gemm::GemmShape<16,8,128>,
-  32,
-  cutlass::uint4b_t,
-  layout::RowMajor,
-  cutlass::int4b_t,
-  layout::ColumnMajor,
-  int,
-  layout::RowMajor,
-  OpMultiplyAddSaturate,
-  SPFormatType::Thread> {
-
-  using Shape = gemm::GemmShape<16,8,128>;
-
-  using ElementA = cutlass::uint4b_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<cutlass::uint4b_t, 32>;
-
-  using ElementB = cutlass::int4b_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<cutlass::int4b_t, 32>;
-
-  using ElementC = int;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<int, 4>;
-
-  using FragmentE = uint32_t;
-
-  using Operator = OpMultiplyAddSaturate;
-  using ArchTag = arch::Sm80;
-
-  static int const kSparse = 2;
-
-  static int const kMetaSizeInBits = 2;
-
-  static int const kMaxID2 = 1;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c,
-    uint32_t const &E,
-    int const id2
-  ) const {
-
-#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED)
-
-    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
-
-    int const *C = reinterpret_cast<int const *>(&c);
-    int *D = reinterpret_cast<int *>(&d);
-
-#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5))
-    if (id2 == 0) {
-      asm volatile(
-          "mma.sp::ordered_metadata.sync.aligned.m16n8k128.row.col.s32.u4.s4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
-          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
-          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
-          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
-            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
-    } else {
-      assert(0);
-    }
-#else
-    if (id2 == 0) {
-      asm volatile(
-          "mma.sp.sync.aligned.m16n8k128.row.col.s32.u4.s4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
-          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
-          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
-          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
-            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
-    } else {
-      assert(0);
-    }
-#endif
-
-#else
-
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_UNUSED(d);
-    assert(0);
-#endif
-  }
-};
-
-/// Matrix multiply-add operation: S32 = U4 * U4 + S32
-template <>
-struct SparseMma<
-  gemm::GemmShape<16,8,128>,
-  32,
-  cutlass::uint4b_t,
-  layout::RowMajor,
-  cutlass::uint4b_t,
-  layout::ColumnMajor,
-  int,
-  layout::RowMajor,
-  OpMultiplyAddSaturate,
-  SPFormatType::Thread> {
-
-  using Shape = gemm::GemmShape<16,8,128>;
-
-  using ElementA = cutlass::uint4b_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<cutlass::uint4b_t, 32>;
-
-  using ElementB = cutlass::uint4b_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<cutlass::uint4b_t, 32>;
-
-  using ElementC = int;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<int, 4>;
-
-  using FragmentE = uint32_t;
-
-  using Operator = OpMultiplyAddSaturate;
-  using ArchTag = arch::Sm80;
-
-  static int const kSparse = 2;
-
-  static int const kMetaSizeInBits = 2;
-
-  static int const kMaxID2 = 1;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c,
-    uint32_t const &E,
-    int const id2
-  ) const {
-
-#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED)
-
-    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
-
-    int const *C = reinterpret_cast<int const *>(&c);
-    int *D = reinterpret_cast<int *>(&d);
-
-#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5))
-    if (id2 == 0) {
-      asm volatile(
-          "mma.sp::ordered_metadata.sync.aligned.m16n8k128.row.col.s32.u4.u4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
-          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
-          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
-          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
-            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
-    } else {
-      assert(0);
-    }
-#else
-    if (id2 == 0) {
-      asm volatile(
-          "mma.sp.sync.aligned.m16n8k128.row.col.s32.u4.u4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
-          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
-          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
-          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
-            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
-    } else {
-      assert(0);
-    }
-#endif
-
-#else
-
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_UNUSED(d);
-    assert(0);
-#endif
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace arch
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/mma_sparse_sm89.h b/lightllm-kernel/cutlass/include/cutlass/arch/mma_sparse_sm89.h
deleted file mode 100755
index c092df768..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/arch/mma_sparse_sm89.h
+++ /dev/null
@@ -1,409 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Sparse matrix multiply accumulate for SM89
-*/
-
-#pragma once
-
-#if defined(__CUDACC_RTC__)
-#include <cuda/std/cassert>
-#else
-#include <assert.h>
-#endif
-
-#include "mma.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/numeric_types.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-#if (__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 4)
-
-#  define CUTLASS_ARCH_SPARSE_MMA_SM89_SUPPORTED 1
-#endif
-
-#if defined(CUTLASS_ARCH_SPARSE_MMA_SM89_SUPPORTED) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 890)
-#  define CUTLASS_ARCH_SPARSE_MMA_SM89_ENABLED
-#endif
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace arch {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation: F32 = fe4m3 * fe4m3 + F32
-template <typename Operator_>
-struct SparseMma<
-  gemm::GemmShape<16,8,64>,
-  32,
-  cutlass::float_e4m3_t,
-  layout::RowMajor,
-  cutlass::float_e4m3_t,
-  layout::ColumnMajor,
-  float,
-  layout::RowMajor,
-  Operator_,
-  SPFormatType::Thread> {
-
-  static_assert(platform::is_same<Operator_, OpMultiplyAdd>::value ||
-                platform::is_same<Operator_, OpMultiplyAddFastAccum>::value,
-                "Invalid operator for SM89 FP8 instruction");
-
-  using Shape = gemm::GemmShape<16,8,64>;
-
-  using ElementA = cutlass::float_e4m3_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<ElementA, 16>;
-
-  using ElementB = cutlass::float_e4m3_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<ElementB, 16>;
-
-  using ElementC = float;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<ElementC, 4>;
-
-  using FragmentE = uint32_t;
-
-  using Operator = Operator_;
-  using ArchTag = arch::Sm89;
-
-  static int const kSparse = 2;
-
-  static int const kMetaSizeInBits = 2;
-
-  static int const kMaxID2 = 1;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c,
-    uint32_t const &E,
-    int const id2
-  ) const {
-
-#if defined(CUTLASS_ARCH_SPARSE_MMA_SM89_ENABLED)
-
-    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
-
-    float const *C = reinterpret_cast<float const *>(&c);
-    float *D = reinterpret_cast<float *>(&d);
-
-      if (id2 == 0) {
-        asm volatile(
-            "mma.sp.sync.aligned.m16n8k64.row.col.f32.e4m3.e4m3.f32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
-            "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
-            : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
-            : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
-              "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E));
-      }
-      else {
-        assert(0);
-      }
-#else
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_UNUSED(d);
-    assert(0);
-#endif
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation: F32 = fe4m3 * fe5m2 + F32
-template <typename Operator_>
-struct SparseMma<
-  gemm::GemmShape<16,8,64>,
-  32,
-  cutlass::float_e4m3_t,
-  layout::RowMajor,
-  cutlass::float_e5m2_t,
-  layout::ColumnMajor,
-  float,
-  layout::RowMajor,
-  Operator_,
-  SPFormatType::Thread> {
-
-  static_assert(platform::is_same<Operator_, OpMultiplyAdd>::value ||
-                platform::is_same<Operator_, OpMultiplyAddFastAccum>::value,
-                "Invalid operator for SM89 FP8 instruction");
-
-  using Shape = gemm::GemmShape<16,8,64>;
-
-  using ElementA = cutlass::float_e4m3_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<ElementA, 16>;
-
-  using ElementB = cutlass::float_e5m2_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<ElementB, 16>;
-
-  using ElementC = float;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<ElementC, 4>;
-
-  using FragmentE = uint32_t;
-
-  using Operator = Operator_;
-  using ArchTag = arch::Sm89;
-
-  static int const kSparse = 2;
-
-  static int const kMetaSizeInBits = 2;
-
-  static int const kMaxID2 = 1;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c,
-    uint32_t const &E,
-    int const id2
-  ) const {
-
-#if defined(CUTLASS_ARCH_SPARSE_MMA_SM89_ENABLED)
-
-    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
-
-    float const *C = reinterpret_cast<float const *>(&c);
-    float *D = reinterpret_cast<float *>(&d);
-
-      if (id2 == 0) {
-        asm volatile(
-            "mma.sp.sync.aligned.m16n8k64.row.col.f32.e4m3.e5m2.f32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
-            "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
-            : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
-            : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
-              "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E));
-      }
-      else {
-        assert(0);
-      }
-#else
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_UNUSED(d);
-    assert(0);
-#endif
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation: F32 = fe5m2 * fe4m3 + F32
-template <typename Operator_>
-struct SparseMma<
-  gemm::GemmShape<16,8,64>,
-  32,
-  cutlass::float_e5m2_t,
-  layout::RowMajor,
-  cutlass::float_e4m3_t,
-  layout::ColumnMajor,
-  float,
-  layout::RowMajor,
-  Operator_,
-  SPFormatType::Thread> {
-
-  static_assert(platform::is_same<Operator_, OpMultiplyAdd>::value ||
-                platform::is_same<Operator_, OpMultiplyAddFastAccum>::value,
-                "Invalid operator for SM89 FP8 instruction");
-
-  using Shape = gemm::GemmShape<16,8,64>;
-
-  using ElementA = cutlass::float_e5m2_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<ElementA, 16>;
-
-  using ElementB = cutlass::float_e4m3_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<ElementB, 16>;
-
-  using ElementC = float;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<ElementC, 4>;
-
-  using FragmentE = uint32_t;
-
-  using Operator = Operator_;
-  using ArchTag = arch::Sm89;
-
-  static int const kSparse = 2;
-
-  static int const kMetaSizeInBits = 2;
-
-  static int const kMaxID2 = 1;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c,
-    uint32_t const &E,
-    int const id2
-  ) const {
-
-#if defined(CUTLASS_ARCH_SPARSE_MMA_SM89_ENABLED)
-
-    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
-
-    float const *C = reinterpret_cast<float const *>(&c);
-    float *D = reinterpret_cast<float *>(&d);
-
-      if (id2 == 0) {
-        asm volatile(
-            "mma.sp.sync.aligned.m16n8k64.row.col.f32.e5m2.e4m3.f32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
-            "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
-            : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
-            : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
-              "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E));
-      }
-      else {
-        assert(0);
-      }
-#else
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_UNUSED(d);
-    assert(0);
-#endif
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Matrix multiply-add operation: F32 = fe5m2 * fe5m2 + F32
-template <typename Operator_>
-struct SparseMma<
-  gemm::GemmShape<16,8,64>,
-  32,
-  cutlass::float_e5m2_t,
-  layout::RowMajor,
-  cutlass::float_e5m2_t,
-  layout::ColumnMajor,
-  float,
-  layout::RowMajor,
-  Operator_,
-  SPFormatType::Thread> {
-
-  static_assert(platform::is_same<Operator_, OpMultiplyAdd>::value ||
-                platform::is_same<Operator_, OpMultiplyAddFastAccum>::value,
-                "Invalid operator for SM89 FP8 instruction");
-
-  using Shape = gemm::GemmShape<16,8,64>;
-
-  using ElementA = cutlass::float_e5m2_t;
-  using LayoutA = layout::RowMajor;
-  using FragmentA = Array<ElementA, 16>;
-
-  using ElementB = cutlass::float_e5m2_t;
-  using LayoutB = layout::ColumnMajor;
-  using FragmentB = Array<ElementB, 16>;
-
-  using ElementC = float;
-  using LayoutC = layout::RowMajor;
-  using FragmentC = Array<ElementC, 4>;
-
-  using FragmentE = uint32_t;
-
-  using Operator = Operator_;
-  using ArchTag = arch::Sm89;
-
-  static int const kSparse = 2;
-
-  static int const kMetaSizeInBits = 2;
-
-  static int const kMaxID2 = 1;
-
-  /// Computes multiply-add
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC &d,
-    FragmentA const &a,
-    FragmentB const &b,
-    FragmentC const &c,
-    uint32_t const &E,
-    int const id2
-  ) const {
-
-#if defined(CUTLASS_ARCH_SPARSE_MMA_SM89_ENABLED)
-
-    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
-    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
-
-    float const *C = reinterpret_cast<float const *>(&c);
-    float *D = reinterpret_cast<float *>(&d);
-
-      if (id2 == 0) {
-        asm volatile(
-            "mma.sp.sync.aligned.m16n8k64.row.col.f32.e5m2.e5m2.f32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
-            "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
-            : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
-            : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
-              "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E));
-      }
-      else {
-        assert(0);
-      }
-#else
-    CUTLASS_UNUSED(a);
-    CUTLASS_UNUSED(b);
-    CUTLASS_UNUSED(c);
-    CUTLASS_UNUSED(d);
-    assert(0);
-#endif
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace arch
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/reg_reconfig.h b/lightllm-kernel/cutlass/include/cutlass/arch/reg_reconfig.h
deleted file mode 100755
index d2b434453..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/arch/reg_reconfig.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-  \brief PTX for CTA Reconfiguration
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#ifndef CUDA_CTA_RECONFIG_ACTIVATED
-  #if (__CUDACC_VER_MAJOR__ >= 12 && \
-    defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL))
-    #define CUDA_CTA_RECONFIG_ACTIVATED 1
-  #endif
-#endif
-
-namespace cutlass {
-namespace arch {
-
-template<uint32_t RegCount>
-CUTLASS_DEVICE
-void warpgroup_reg_alloc(){
-#if CUDA_CTA_RECONFIG_ACTIVATED
-  asm volatile( "setmaxnreg.inc.sync.aligned.u32 %0;\n" : : "n"(RegCount) );
-#endif
-}
-
-template<uint32_t RegCount>
-CUTLASS_DEVICE
-void warpgroup_reg_dealloc(){
-#if CUDA_CTA_RECONFIG_ACTIVATED
-  asm volatile( "setmaxnreg.dec.sync.aligned.u32 %0;\n" : : "n"(RegCount) );
-#endif
-}
-
-} // namespace arch
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/simd.h b/lightllm-kernel/cutlass/include/cutlass/arch/simd.h
deleted file mode 100755
index 3104746e5..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/arch/simd.h
+++ /dev/null
@@ -1,125 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates exposing SIMD operators
-*/
-
-#pragma once
-
-#include "../array.h"
-#include "../numeric_types.h"
-
-namespace cutlass {
-namespace arch {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-//
-// Element-wise operators
-//
-
-CUTLASS_HOST_DEVICE
-template <typename T, int N>
-Array<T, N> operator*(Array<T, N> const &a, Array<T, N> const &b) {
-  Array<T, N> d;
-  CUTLASS_PRAGMA_UNROLL
-  for (int i = 0; i < N; ++i) {
-    d[i] = a[i] * b[i];
-  }
-  return d;
-}
-
-CUTLASS_HOST_DEVICE
-template <typename T, int N>
-Array<T, N> operator+(Array<T, N> const &a, Array<T, N> const &b) {
-  Array<T, N> d;
-  CUTLASS_PRAGMA_UNROLL
-  for (int i = 0; i < N; ++i) {
-    d[i] = a[i] + b[i];
-  }
-  return d;
-}
-
-CUTLASS_HOST_DEVICE
-template <typename T, int N>
-Array<T, N> operator-(Array<T, N> const &a, Array<T, N> const &b) {
-  Array<T, N> d;
-  CUTLASS_PRAGMA_UNROLL
-  for (int i = 0; i < N; ++i) {
-    d[i] = a[i] - b[i];
-  }
-  return d;
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-//
-// Multiply-accumulate operators
-//
-
-CUTLASS_HOST_DEVICE
-template <typename T, int N>
-Array<T, N> mac(Array<T, N> const &a, Array<T, N> const &b, Array<T, N> const &c) {
-  Array<T, N> d;
-  CUTLASS_PRAGMA_UNROLL
-  for (int i = 0; i < N; ++i) {
-    d[i] = a[i] * b[i] + c[i];
-  }
-  return d;
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-//
-// Dot product operator
-//
-
-CUTLASS_HOST_DEVICE
-template <typename Element, typename Accumulator, int N>
-Accumulator dot(Array<T, N> const &a, Array<T, N> const &b, Accumulator accum) {
-  CUTLASS_PRAGMA_UNROLL
-  for (int i = 0; i < N; ++i) {
-    accum += a[i] * b[i];
-  }
-  return accum;
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace arch
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-#include "simd_sm60.h"
-#include "simd_sm61.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/simd_sm60.h b/lightllm-kernel/cutlass/include/cutlass/arch/simd_sm60.h
deleted file mode 100755
index 6e1ef2044..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/arch/simd_sm60.h
+++ /dev/null
@@ -1,104 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates exposing SIMD operators for SM60
-*/
-
-#pragma once
-
-#include "simd.h"
-
-namespace cutlass {
-namespace arch {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-//
-// Element-wise operators - specialized for half_t x 2
-//
-
-CUTLASS_HOST_DEVICE
-template <>
-Array<half_t, 2> operator*(Array<half_t, 2> const &a, Array<half_t, 2> const &b) {
-  Array<half_t, 2> d;
-
-  return d;
-}
-
-CUTLASS_HOST_DEVICE
-template <>
-Array<half_t, 2> operator+(AArray<half_t, 2> const &a, Array<half_t, 2> const &b) {
-  Array<half_t, 2> d;
-
-  return d;
-}
-
-CUTLASS_HOST_DEVICE
-template <>
-Array<half_t, 2> operator-(Array<half_t, 2> const &a, Array<half_t, 2> const &b) {
-  Array<T, N> d;
-
-  return d;
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Multiply-accumulate operators - specialized for half_t x 2
-CUTLASS_HOST_DEVICE
-template <>
-Array<half_t, 2> mac(Array<half_t, 2> const &a, Array<half_t, 2> const &b, Array<half_t, 2> const &c) {
-  Array<half_t, 2> d;
-
-  return d;
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Dot product operator - specialized for half_t <- (half_t * half_t) x 2 + half_t
-CUTLASS_HOST_DEVICE
-template <>
-half_t dot(Array<half_t, 2> const &a, Array<half_t, 2> const &b, half_t accum) {
-
-  return accum;
-}
-
-/// Dot product operator - specialized for float <- (half_t * half_t) x 2 + float
-CUTLASS_HOST_DEVICE
-template <>
-float dot(Array<half_t, 2> const &a, Array<half_t, 2> const &b, float accum) {
-
-  return accum;
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace arch
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/simd_sm61.h b/lightllm-kernel/cutlass/include/cutlass/arch/simd_sm61.h
deleted file mode 100755
index b783c943e..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/arch/simd_sm61.h
+++ /dev/null
@@ -1,147 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates exposing SIMD operators for SM61
-*/
-
-#pragma once
-
-#include "simd.h"
-
-namespace cutlass {
-namespace arch {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Dot product operator - specialized for int32_t <- (int8_t * int8_t) x 4 + int32_t
-CUTLASS_HOST_DEVICE
-template <>
-int32_t dot(Array<int8_t, 4> const &a, Array<int8_t, 4> const &b, int32_t accum) {
-
-  return accum;
-}
-
-/// Dot product operator - specialized for int32_t <- (uint8_t * int8_t) x 4 + int32_t
-CUTLASS_HOST_DEVICE
-template <>
-int32_t dot(Array<uint8_t, 4> const &a, Array<int8_t, 4> const &b, int32_t accum) {
-
-  return accum;
-}
-
-/// Dot product operator - specialized for int32_t <- (int8_t * uint8_t) x 4 + int32_t
-CUTLASS_HOST_DEVICE
-template <>
-int32_t dot(Array<int8_t, 4> const &a, Array<uint8_t, 4> const &b, int32_t accum) {
-
-  return accum;
-}
-
-/// Dot product operator - specialized for int32_t <- (uint8_t * uint8_t) x 4 + int32_t
-CUTLASS_HOST_DEVICE
-template <>
-int32_t dot(Array<uint8_t, 4> const &a, Array<uint8_t, 4> const &b, int32_t accum) {
-
-  return accum;
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Dot product operator - specialized for int32_t <- (int16_t * int8_t) x 2 + int32_t
-CUTLASS_HOST_DEVICE
-template <>
-int32_t dot(Array<int16_t, 2> const &a, Array<int8_t, 2> const &b, int32_t accum) {
-
-  return accum;
-}
-
-/// Dot product operator - specialized for int32_t <- (uint16_t * int8_t) x 2 + int32_t
-CUTLASS_HOST_DEVICE
-template <>
-int32_t dot(Array<uint16_t, 2> const &a, Array<int8_t, 2> const &b, int32_t accum) {
-
-  return accum;
-}
-
-/// Dot product operator - specialized for int32_t <- (int16_t * int8_t) x 2 + int32_t
-CUTLASS_HOST_DEVICE
-template <>
-int32_t dot(Array<int16_t, 2> const &a, Array<uint8_t, 2> const &b, int32_t accum) {
-
-  return accum;
-}
-
-/// Dot product operator - specialized for int32_t <- (uint16_t * int8_t) x 2 + int32_t
-CUTLASS_HOST_DEVICE
-template <>
-int32_t dot(Array<uint16_t, 2> const &a, Array<uint8_t, 2> const &b, int32_t accum) {
-
-  return accum;
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Dot product operator - specialized for int32_t <- (int16_t * int16_t) x 2 + int32_t
-CUTLASS_HOST_DEVICE
-template <>
-int32_t dot(Array<int16_t, 2> const &a, Array<int16_t, 2> const &b, int32_t accum) {
-
-  return accum;
-}
-
-/// Dot product operator - specialized for int32_t <- (uint16_t * int16_t) x 2 + int32_t
-CUTLASS_HOST_DEVICE
-template <>
-int32_t dot(Array<uint16_t, 2> const &a, Array<int16_t, 2> const &b, int32_t accum) {
-
-  return accum;
-}
-
-/// Dot product operator - specialized for int32_t <- (int16_t * int16_t) x 2 + int32_t
-CUTLASS_HOST_DEVICE
-template <>
-int32_t dot(Array<int16_t, 2> const &a, Array<uint16_t, 2> const &b, int32_t accum) {
-
-  return accum;
-}
-
-/// Dot product operator - specialized for int32_t <- (uint16_t * int16_t) x 2 + int32_t
-CUTLASS_HOST_DEVICE
-template <>
-int32_t dot(Array<uint16_t, 2> const &a, Array<uint16_t, 2> const &b, int32_t accum) {
-
-  return accum;
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace arch
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/synclog.hpp b/lightllm-kernel/cutlass/include/cutlass/arch/synclog.hpp
deleted file mode 100755
index ea683859a..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/arch/synclog.hpp
+++ /dev/null
@@ -1,1324 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Synchronization event logging for race condition debugging.
-*/
-
-#pragma once
-
-#include "cutlass/detail/helper_macros.hpp"
-
-#if defined(__CUDACC_RTC__)
-#include <cuda/std/cstdint>
-#else
-#include <cstdint>
-#endif
-
-#if !defined(__CUDACC_RTC__)
-#include <mutex>
-#include <vector>
-#endif
-
-namespace cutlass {
-namespace arch {
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-#if defined(CUTLASS_ENABLE_SYNCLOG)
-
-constexpr uint32_t synclog_cap = 1 << 26;
-
-inline std::mutex synclog_mutex;
-inline std::vector<uint32_t*> synclog_buf_list;
-#if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
-inline __device__ uint32_t* synclog_buf;
-#endif
-
-CUTLASS_DEVICE
-uint32_t* synclog_alloc(uint32_t n) {
-  #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
-  uint32_t* buf = synclog_buf;
-  if (buf == nullptr) return nullptr;
-  uint32_t last = atomicAdd(&buf[0], n);
-  if (last + n < synclog_cap) return buf + last + 1;
-  if (last >= synclog_cap) atomicAdd(&buf[0], -n);
-  #endif
-  return nullptr;
-}
-
-CUTLASS_DEVICE
-void synclog_emit_prefix(uint32_t* to, uint32_t header, uint32_t line) {
-  #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
-  uint64_t time64;
-  asm volatile (
-    "mov.u64 %0, %%globaltimer;\n"
-    : "=l"(time64) :
-  );
-  to[0] = header;
-  to[1] = line;
-  to[2] = time64;
-  to[3] = time64 >> 32;
-  to[4] = threadIdx.x;
-  to[5] = threadIdx.y;
-  to[6] = threadIdx.z;
-  to[7] = blockIdx.x;
-  to[8] = blockIdx.y;
-  to[9] = blockIdx.z;
-  #endif
-}
-
-constexpr uint32_t synclog_header_none = 0;
-constexpr uint32_t synclog_length_prefix = 1 + 1 + 2 + 3 + 3;
-
-constexpr bool     synclog_enable_syncthreads = true;
-constexpr uint32_t synclog_header_syncthreads = 1;
-constexpr uint32_t synclog_length_syncthreads = synclog_length_prefix + 0;
-
-constexpr bool     synclog_enable_syncwarp = true;
-constexpr uint32_t synclog_header_syncwarp = 2;
-constexpr uint32_t synclog_length_syncwarp = synclog_length_prefix + 0;
-
-constexpr bool     synclog_enable_named_barrier_arrive_and_wait = true;
-constexpr uint32_t synclog_header_named_barrier_arrive_and_wait = 3;
-constexpr uint32_t synclog_length_named_barrier_arrive_and_wait = synclog_length_prefix + 2;
-
-constexpr bool     synclog_enable_named_barrier_arrive = true;
-constexpr uint32_t synclog_header_named_barrier_arrive = 4;
-constexpr uint32_t synclog_length_named_barrier_arrive = synclog_length_prefix + 2;
-
-constexpr bool     synclog_enable_cluster_barrier_init = true;
-constexpr uint32_t synclog_header_cluster_barrier_init = 5;
-constexpr uint32_t synclog_length_cluster_barrier_init = synclog_length_prefix + 2;
-
-constexpr bool     synclog_enable_cluster_barrier_wait = true;
-constexpr uint32_t synclog_header_cluster_barrier_wait = 6;
-constexpr uint32_t synclog_length_cluster_barrier_wait = synclog_length_prefix + 4;
-
-constexpr bool     synclog_enable_cluster_barrier_test_wait = true;
-constexpr uint32_t synclog_header_cluster_barrier_test_wait = 7;
-constexpr uint32_t synclog_length_cluster_barrier_test_wait = synclog_length_prefix + 5;
-
-constexpr bool     synclog_enable_cluster_barrier_try_wait = true;
-constexpr uint32_t synclog_header_cluster_barrier_try_wait = 8;
-constexpr uint32_t synclog_length_cluster_barrier_try_wait = synclog_length_prefix + 4;
-
-constexpr bool     synclog_enable_cluster_barrier_arrive_cluster = true;
-constexpr uint32_t synclog_header_cluster_barrier_arrive_cluster = 9;
-constexpr uint32_t synclog_length_cluster_barrier_arrive_cluster = synclog_length_prefix + 5;
-
-constexpr bool     synclog_enable_cluster_barrier_arrive = true;
-constexpr uint32_t synclog_header_cluster_barrier_arrive = 10;
-constexpr uint32_t synclog_length_cluster_barrier_arrive = synclog_length_prefix + 3;
-
-constexpr bool     synclog_enable_cluster_barrier_invalidate = true;
-constexpr uint32_t synclog_header_cluster_barrier_invalidate = 11;
-constexpr uint32_t synclog_length_cluster_barrier_invalidate = synclog_length_prefix + 3;
-
-constexpr bool     synclog_enable_cluster_transaction_barrier_arrive_and_expect_tx = true;
-constexpr uint32_t synclog_header_cluster_transaction_barrier_arrive_and_expect_tx = 12;
-constexpr uint32_t synclog_length_cluster_transaction_barrier_arrive_and_expect_tx = synclog_length_prefix + 4;
-
-constexpr bool     synclog_enable_cluster_transaction_barrier_arrive_and_expect_tx_cluster = true;
-constexpr uint32_t synclog_header_cluster_transaction_barrier_arrive_and_expect_tx_cluster = 13;
-constexpr uint32_t synclog_length_cluster_transaction_barrier_arrive_and_expect_tx_cluster = synclog_length_prefix + 6;
-
-constexpr bool     synclog_enable_cluster_transaction_barrier_expect_transaction = true;
-constexpr uint32_t synclog_header_cluster_transaction_barrier_expect_transaction = 14;
-constexpr uint32_t synclog_length_cluster_transaction_barrier_expect_transaction = synclog_length_prefix + 4;
-
-constexpr bool     synclog_enable_cluster_transaction_barrier_complete_transaction = true;
-constexpr uint32_t synclog_header_cluster_transaction_barrier_complete_transaction = 15;
-constexpr uint32_t synclog_length_cluster_transaction_barrier_complete_transaction = synclog_length_prefix + 6;
-
-constexpr bool     synclog_enable_fence_barrier_init = true;
-constexpr uint32_t synclog_header_fence_barrier_init = 16;
-constexpr uint32_t synclog_length_fence_barrier_init = synclog_length_prefix + 0;
-
-constexpr bool     synclog_enable_fence_view_async_shared = true;
-constexpr uint32_t synclog_header_fence_view_async_shared = 17;
-constexpr uint32_t synclog_length_fence_view_async_shared = synclog_length_prefix + 0;
-
-constexpr bool     synclog_enable_cp_async_wait = true;
-constexpr uint32_t synclog_header_cp_async_wait = 18;
-constexpr uint32_t synclog_length_cp_async_wait = synclog_length_prefix + 1;
-
-constexpr bool     synclog_enable_cp_async_wait_all = true;
-constexpr uint32_t synclog_header_cp_async_wait_all = 19;
-constexpr uint32_t synclog_length_cp_async_wait_all = synclog_length_prefix + 0;
-
-constexpr bool     synclog_enable_cp_async_fence = true;
-constexpr uint32_t synclog_header_cp_async_fence = 20;
-constexpr uint32_t synclog_length_cp_async_fence = synclog_length_prefix + 0;
-
-constexpr bool     synclog_enable_cp_async_nan = true;
-constexpr uint32_t synclog_header_cp_async_nan = 21;
-constexpr uint32_t synclog_length_cp_async_nan = synclog_length_prefix + 4;
-
-constexpr bool     synclog_enable_cp_async_zfill = true;
-constexpr uint32_t synclog_header_cp_async_zfill = 22;
-constexpr uint32_t synclog_length_cp_async_zfill = synclog_length_prefix + 5;
-
-constexpr bool     synclog_enable_cp_async = true;
-constexpr uint32_t synclog_header_cp_async = 23;
-constexpr uint32_t synclog_length_cp_async = synclog_length_prefix + 5;
-
-constexpr bool     synclog_enable_tma_load = true;
-constexpr uint32_t synclog_header_tma_load = 24;
-constexpr uint32_t synclog_length_tma_load = synclog_length_prefix + 4;
-
-constexpr bool     synclog_enable_tma_store = true;
-constexpr uint32_t synclog_header_tma_store = 25;
-constexpr uint32_t synclog_length_tma_store = synclog_length_prefix + 3;
-
-constexpr bool     synclog_enable_tma_store_arrive = true;
-constexpr uint32_t synclog_header_tma_store_arrive = 26;
-constexpr uint32_t synclog_length_tma_store_arrive = synclog_length_prefix + 0;
-
-constexpr bool     synclog_enable_tma_store_wait = true;
-constexpr uint32_t synclog_header_tma_store_wait = 27;
-constexpr uint32_t synclog_length_tma_store_wait = synclog_length_prefix + 1;
-
-constexpr bool     synclog_enable_warpgroup_arrive = true;
-constexpr uint32_t synclog_header_warpgroup_arrive = 28;
-constexpr uint32_t synclog_length_warpgroup_arrive = synclog_length_prefix + 0;
-
-constexpr bool     synclog_enable_warpgroup_wait = true;
-constexpr uint32_t synclog_header_warpgroup_wait = 29;
-constexpr uint32_t synclog_length_warpgroup_wait = synclog_length_prefix + 1;
-
-constexpr bool     synclog_enable_warpgroup_commit_batch = true;
-constexpr uint32_t synclog_header_warpgroup_commit_batch = 30;
-constexpr uint32_t synclog_length_warpgroup_commit_batch = synclog_length_prefix + 0;
-
-constexpr bool     synclog_enable_wgmma_reg_smem = true;
-constexpr uint32_t synclog_header_wgmma_reg_smem = 31;
-constexpr uint32_t synclog_length_wgmma_reg_smem = synclog_length_prefix + 2;
-
-constexpr bool     synclog_enable_wgmma_smem_smem = true;
-constexpr uint32_t synclog_header_wgmma_smem_smem = 32;
-constexpr uint32_t synclog_length_wgmma_smem_smem = synclog_length_prefix + 4;
-
-constexpr bool     synclog_enable_cpasync_barrier_arrive = true;
-constexpr uint32_t synclog_header_cpasync_barrier_arrive = 33;
-constexpr uint32_t synclog_length_cpasync_barrier_arrive = synclog_length_prefix + 3;
-
-CUTLASS_DEVICE
-bool synclog_condition_emit() {
-  #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
-  return threadIdx.x%NumThreadsPerWarp == 0 && threadIdx.y == 0 && threadIdx.z == 0 &&
-    blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0;
-  #else
-  return 0;
-  #endif
-}
-
-CUTLASS_DEVICE
-bool synclog_condition_print() {
-  #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
-  return threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0 &&
-    blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0;
-  #else
-  return false;
-  #endif
-}
-
-CUTLASS_DEVICE
-void synclog_print_prefix(char const* header, uint32_t at) {
-  #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
-  uint32_t line = synclog_buf[at + 1];
-  uint32_t timeLo = synclog_buf[at + 2];
-  uint32_t timeHi = synclog_buf[at + 3];
-  uint32_t threadIdxX = synclog_buf[at + 4];
-  uint32_t threadIdxY = synclog_buf[at + 5];
-  uint32_t threadIdxZ = synclog_buf[at + 6];
-  uint32_t blockIdxX = synclog_buf[at + 7];
-  uint32_t blockIdxY = synclog_buf[at + 8];
-  uint32_t blockIdxZ = synclog_buf[at + 9];
-  printf(
-    "%s line=%u time=%lu thread=%u,%u,%u block=%u,%u,%u ",
-    header, line,
-    (uint64_t)timeHi << 32 | timeLo,
-    threadIdxX, threadIdxY, threadIdxZ,
-    blockIdxX, blockIdxY, blockIdxZ
-  );
-  #endif
-}
-
-CUTLASS_DEVICE
-uint64_t synclog_mbarrier_bits(uint32_t smem_addr) {
-  uint64_t bits = 0;
-  asm volatile (
-    "mbarrier.inval.shared::cta.b64 [%1];\n"
-    "ld.shared::cta.b64 %0, [%1];\n"
-    : "=l"(bits) : "r"(smem_addr)
-  );
-  return bits;
-}
-
-CUTLASS_DEVICE
-void synclog_print_wgmma_desc(char const* str, uint32_t lo, uint32_t hi, char const* sep) {
-  CUTLASS_UNUSED(hi);
-  uint32_t smem_int_ptr = (lo & ((1 << 14) - 1)) << 4;
-  printf("%s_smem_int_ptr=%u%s", str, smem_int_ptr, sep);
-}
-
-#endif // defined(CUTLASS_ENABLE_SYNCLOG)
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-inline void synclog_setup() {
-  #if defined(CUTLASS_ENABLE_SYNCLOG)
-  #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
-  std::scoped_lock lock(synclog_mutex);
-  auto fail = [] () {
-    fprintf(stderr, "synclog_setup() failed\n");
-    std::terminate();
-  };
-  int orig_device = 0;
-  if (cudaGetDevice(&orig_device) != cudaSuccess) {
-    fail();
-  }
-  int device_count = 0;
-  if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
-    fail();
-  }
-  if (synclog_buf_list.size() == 0) {
-    for (int device = 0; device < device_count; device++) {
-      uint32_t* buf = 0;
-      if (cudaSetDevice(device) != cudaSuccess ||
-        cudaMalloc(&buf, synclog_cap * sizeof(uint32_t)) != cudaSuccess) {
-        fail();
-      }
-      synclog_buf_list.push_back(buf);
-    }
-  }
-  for (int device = 0; device < device_count; device++) {
-    uint32_t* buf = synclog_buf_list.at(device);
-    if (cudaSetDevice(device) != cudaSuccess ||
-      cudaMemset(buf, 0, synclog_cap * sizeof(uint32_t)) != cudaSuccess ||
-      cudaMemcpyToSymbol(synclog_buf, &buf, sizeof(buf)) != cudaSuccess) {
-      fail();
-    }
-  }
-  if (cudaSetDevice(orig_device) != cudaSuccess) {
-    fail();
-  }
-  #endif
-  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
-}
-
-CUTLASS_DEVICE
-void synclog_emit_syncthreads(uint32_t line) {
-  #if defined(CUTLASS_ENABLE_SYNCLOG)
-  if constexpr (!synclog_enable_syncthreads) return;
-  if (!synclog_condition_emit()) return;
-  uint32_t* to = synclog_alloc(synclog_length_syncthreads);
-  if (to == nullptr) return;
-  synclog_emit_prefix(to, synclog_header_syncthreads, line);
-  #else
-  CUTLASS_UNUSED(line);
-  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
-}
-
-CUTLASS_DEVICE
-void synclog_emit_syncwarp(uint32_t line) {
-  #if defined(CUTLASS_ENABLE_SYNCLOG)
-  if constexpr (!synclog_enable_syncwarp) return;
-  if (!synclog_condition_emit()) return;
-  uint32_t* to = synclog_alloc(synclog_length_syncwarp);
-  if (to == nullptr) return;
-  synclog_emit_prefix(to, synclog_header_syncwarp, line);
-  #else
-  CUTLASS_UNUSED(line);
-  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
-}
-
-CUTLASS_DEVICE
-void synclog_emit_named_barrier_arrive_and_wait(
-  uint32_t line,
-  uint32_t num_threads,
-  uint32_t barrier_id) {
-  #if defined(CUTLASS_ENABLE_SYNCLOG)
-  if constexpr (!synclog_enable_named_barrier_arrive_and_wait) return;
-  if (!synclog_condition_emit()) return;
-  uint32_t* to = synclog_alloc(synclog_length_named_barrier_arrive_and_wait);
-  if (to == nullptr) return;
-  synclog_emit_prefix(to, synclog_header_named_barrier_arrive_and_wait, line);
-  to[synclog_length_prefix + 0] = num_threads;
-  to[synclog_length_prefix + 1] = barrier_id;
-  #else
-  CUTLASS_UNUSED(line);
-  CUTLASS_UNUSED(num_threads);
-  CUTLASS_UNUSED(barrier_id);
-  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
-}
-
-CUTLASS_DEVICE
-void synclog_emit_named_barrier_arrive(
-  uint32_t line,
-  uint32_t num_threads,
-  uint32_t barrier_id) {
-  #if defined(CUTLASS_ENABLE_SYNCLOG)
-  if constexpr (!synclog_enable_named_barrier_arrive) return;
-  if (!synclog_condition_emit()) return;
-  uint32_t* to = synclog_alloc(synclog_length_named_barrier_arrive);
-  if (to == nullptr) return;
-  synclog_emit_prefix(to, synclog_header_named_barrier_arrive, line);
-  to[synclog_length_prefix + 0] = num_threads;
-  to[synclog_length_prefix + 1] = barrier_id;
-  #else
-  CUTLASS_UNUSED(line);
-  CUTLASS_UNUSED(num_threads);
-  CUTLASS_UNUSED(barrier_id);
-  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
-}
-
-CUTLASS_DEVICE
-void synclog_emit_cluster_barrier_init(
-  uint32_t line,
-  uint32_t smem_addr,
-  uint32_t arrive_count) {
-  #if defined(CUTLASS_ENABLE_SYNCLOG)
-  if constexpr (!synclog_enable_cluster_barrier_init) return;
-  if (!synclog_condition_emit()) return;
-  uint32_t* to = synclog_alloc(synclog_length_cluster_barrier_init);
-  if (to == nullptr) return;
-  synclog_emit_prefix(to, synclog_header_cluster_barrier_init, line);
-  to[synclog_length_prefix + 0] = smem_addr;
-  to[synclog_length_prefix + 1] = arrive_count;
-  #else
-  CUTLASS_UNUSED(line);
-  CUTLASS_UNUSED(smem_addr);
-  CUTLASS_UNUSED(arrive_count);
-  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
-}
-
-CUTLASS_DEVICE
-void synclog_emit_cluster_barrier_wait(
-  uint32_t line,
-  uint32_t smem_addr,
-  uint32_t phase) {
-  #if defined(CUTLASS_ENABLE_SYNCLOG)
-  if constexpr (!synclog_enable_cluster_barrier_wait) return;
-  if (!synclog_condition_emit()) return;
-  uint64_t bits = synclog_mbarrier_bits(smem_addr);
-  uint32_t* to = synclog_alloc(synclog_length_cluster_barrier_wait);
-  if (to == nullptr) return;
-  synclog_emit_prefix(to, synclog_header_cluster_barrier_wait, line);
-  to[synclog_length_prefix + 0] = smem_addr;
-  to[synclog_length_prefix + 1] = phase;
-  to[synclog_length_prefix + 2] = bits;
-  to[synclog_length_prefix + 3] = bits >> 32;
-  #else
-  CUTLASS_UNUSED(line);
-  CUTLASS_UNUSED(smem_addr);
-  CUTLASS_UNUSED(phase);
-  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
-}
-
-CUTLASS_DEVICE
-void synclog_emit_cluster_barrier_test_wait(
-  uint32_t line,
-  uint32_t smem_addr,
-  uint32_t phase,
-  uint32_t pred) {
-  #if defined(CUTLASS_ENABLE_SYNCLOG)
-  if constexpr (!synclog_enable_cluster_barrier_test_wait) return;
-  if (!synclog_condition_emit()) return;
-  uint64_t bits = synclog_mbarrier_bits(smem_addr);
-  uint32_t* to = synclog_alloc(synclog_length_cluster_barrier_test_wait);
-  if (to == nullptr) return;
-  synclog_emit_prefix(to, synclog_header_cluster_barrier_test_wait, line);
-  to[synclog_length_prefix + 0] = smem_addr;
-  to[synclog_length_prefix + 1] = phase;
-  to[synclog_length_prefix + 2] = pred;
-  to[synclog_length_prefix + 3] = bits;
-  to[synclog_length_prefix + 4] = bits >> 32;
-  #else
-  CUTLASS_UNUSED(line);
-  CUTLASS_UNUSED(smem_addr);
-  CUTLASS_UNUSED(phase);
-  CUTLASS_UNUSED(pred);
-  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
-}
-
-CUTLASS_DEVICE
-void synclog_emit_cluster_barrier_try_wait(
-  uint32_t line,
-  uint32_t smem_addr,
-  uint32_t phase) {
-  #if defined(CUTLASS_ENABLE_SYNCLOG)
-  if constexpr (!synclog_enable_cluster_barrier_try_wait) return;
-  if (!synclog_condition_emit()) return;
-  uint64_t bits = synclog_mbarrier_bits(smem_addr);
-  uint32_t* to = synclog_alloc(synclog_length_cluster_barrier_try_wait);
-  if (to == nullptr) return;
-  synclog_emit_prefix(to, synclog_header_cluster_barrier_try_wait, line);
-  to[synclog_length_prefix + 0] = smem_addr;
-  to[synclog_length_prefix + 1] = phase;
-  to[synclog_length_prefix + 2] = bits;
-  to[synclog_length_prefix + 3] = bits >> 32;
-  #else
-  CUTLASS_UNUSED(line);
-  CUTLASS_UNUSED(smem_addr);
-  CUTLASS_UNUSED(phase);  
-  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
-}
-
-CUTLASS_DEVICE
-void synclog_emit_cluster_barrier_arrive_cluster(
-  uint32_t line,
-  uint32_t smem_addr,
-  uint32_t cta_id,
-  uint32_t pred) {
-  #if defined(CUTLASS_ENABLE_SYNCLOG)
-  if constexpr (!synclog_enable_cluster_barrier_arrive_cluster) return;
-  if (!synclog_condition_emit()) return;
-  uint64_t bits = synclog_mbarrier_bits(smem_addr);
-  uint32_t* to = synclog_alloc(synclog_length_cluster_barrier_arrive_cluster);
-  if (to == nullptr) return;
-  synclog_emit_prefix(to, synclog_header_cluster_barrier_arrive_cluster, line);
-  to[synclog_length_prefix + 0] = smem_addr;
-  to[synclog_length_prefix + 1] = cta_id;
-  to[synclog_length_prefix + 2] = pred;
-  to[synclog_length_prefix + 3] = bits;
-  to[synclog_length_prefix + 4] = bits >> 32;
-  #else
-  CUTLASS_UNUSED(line);
-  CUTLASS_UNUSED(smem_addr);
-  CUTLASS_UNUSED(cta_id);
-  CUTLASS_UNUSED(pred);
-  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
-}
-
-CUTLASS_DEVICE
-void synclog_emit_cluster_barrier_arrive(
-  uint32_t line,
-  uint32_t smem_addr) {
-  #if defined(CUTLASS_ENABLE_SYNCLOG)
-  if constexpr (!synclog_enable_cluster_barrier_arrive) return;
-  if (!synclog_condition_emit()) return;
-  uint64_t bits = synclog_mbarrier_bits(smem_addr);
-  uint32_t* to = synclog_alloc(synclog_length_cluster_barrier_arrive);
-  if (to == nullptr) return;
-  synclog_emit_prefix(to, synclog_header_cluster_barrier_arrive, line);
-  to[synclog_length_prefix + 0] = smem_addr;
-  to[synclog_length_prefix + 1] = bits;
-  to[synclog_length_prefix + 2] = bits >> 32;
-  #else
-  CUTLASS_UNUSED(line);
-  CUTLASS_UNUSED(smem_addr);
-  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
-}
-
-CUTLASS_DEVICE
-void synclog_emit_cluster_barrier_invalidate(
-  uint32_t line,
-  uint32_t smem_addr) {
-  #if defined(CUTLASS_ENABLE_SYNCLOG)
-  if constexpr (!synclog_enable_cluster_barrier_invalidate) return;
-  if (!synclog_condition_emit()) return;
-  uint64_t bits = synclog_mbarrier_bits(smem_addr);
-  uint32_t* to = synclog_alloc(synclog_length_cluster_barrier_invalidate);
-  if (to == nullptr) return;
-  synclog_emit_prefix(to, synclog_header_cluster_barrier_invalidate, line);
-  to[synclog_length_prefix + 0] = smem_addr;
-  to[synclog_length_prefix + 1] = bits;
-  to[synclog_length_prefix + 2] = bits >> 32;
-  #else
-  CUTLASS_UNUSED(line);
-  CUTLASS_UNUSED(smem_addr);
-  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
-}
-
-CUTLASS_DEVICE
-void synclog_emit_cluster_transaction_barrier_arrive_and_expect_tx(
-  uint32_t line,
-  uint32_t smem_addr,
-  uint32_t transaction_bytes) {
-  #if defined(CUTLASS_ENABLE_SYNCLOG)
-  if constexpr (!synclog_enable_cluster_transaction_barrier_arrive_and_expect_tx) return;
-  if (!synclog_condition_emit()) return;
-  uint64_t bits = synclog_mbarrier_bits(smem_addr);
-  uint32_t* to = synclog_alloc(synclog_length_cluster_transaction_barrier_arrive_and_expect_tx);
-  if (to == nullptr) return;
-  synclog_emit_prefix(to, synclog_header_cluster_transaction_barrier_arrive_and_expect_tx, line);
-  to[synclog_length_prefix + 0] = smem_addr;
-  to[synclog_length_prefix + 1] = transaction_bytes;
-  to[synclog_length_prefix + 2] = bits;
-  to[synclog_length_prefix + 3] = bits >> 32;
-  #else
-  CUTLASS_UNUSED(line);
-  CUTLASS_UNUSED(smem_addr);
-  CUTLASS_UNUSED(transaction_bytes);
-  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
-}
-
-CUTLASS_DEVICE
-void synclog_emit_cluster_transaction_barrier_arrive_and_expect_tx_cluster(
-  uint32_t line,
-  uint32_t smem_addr,
-  uint32_t transaction_bytes,
-  uint32_t cta_id,
-  uint32_t pred) {
-  #if defined(CUTLASS_ENABLE_SYNCLOG)
-  if constexpr (!synclog_enable_cluster_transaction_barrier_arrive_and_expect_tx_cluster) return;
-  if (!synclog_condition_emit()) return;
-  uint64_t bits = synclog_mbarrier_bits(smem_addr);
-  uint32_t* to = synclog_alloc(synclog_length_cluster_transaction_barrier_arrive_and_expect_tx_cluster);
-  if (to == nullptr) return;
-  synclog_emit_prefix(to, synclog_header_cluster_transaction_barrier_arrive_and_expect_tx_cluster, line);
-  to[synclog_length_prefix + 0] = smem_addr;
-  to[synclog_length_prefix + 1] = transaction_bytes;
-  to[synclog_length_prefix + 2] = cta_id;
-  to[synclog_length_prefix + 3] = pred;
-  to[synclog_length_prefix + 4] = bits;
-  to[synclog_length_prefix + 5] = bits >> 32;
-  #else
-  CUTLASS_UNUSED(line);
-  CUTLASS_UNUSED(smem_addr);
-  CUTLASS_UNUSED(transaction_bytes);
-  CUTLASS_UNUSED(cta_id);
-  CUTLASS_UNUSED(pred);
-  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
-}
-
-CUTLASS_DEVICE
-void synclog_emit_cluster_transaction_barrier_expect_transaction(
-  uint32_t line,
-  uint32_t smem_addr,
-  uint32_t transaction_bytes) {
-  #if defined(CUTLASS_ENABLE_SYNCLOG)
-  if constexpr (!synclog_enable_cluster_transaction_barrier_expect_transaction) return;
-  if (!synclog_condition_emit()) return;
-  uint64_t bits = synclog_mbarrier_bits(smem_addr);
-  uint32_t* to = synclog_alloc(synclog_length_cluster_transaction_barrier_expect_transaction);
-  if (to == nullptr) return;
-  synclog_emit_prefix(to, synclog_header_cluster_transaction_barrier_expect_transaction, line);
-  to[synclog_length_prefix + 0] = smem_addr;
-  to[synclog_length_prefix + 1] = transaction_bytes;
-  to[synclog_length_prefix + 2] = bits;
-  to[synclog_length_prefix + 2] = bits >> 32;
-  #else
-  CUTLASS_UNUSED(line);
-  CUTLASS_UNUSED(smem_addr);
-  CUTLASS_UNUSED(transaction_bytes);
-  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
-}
-
-CUTLASS_DEVICE
-void synclog_emit_cluster_transaction_barrier_complete_transaction(
-  uint32_t line,
-  uint32_t smem_addr,
-  uint32_t dst_cta_id,
-  uint32_t transaction_bytes,
-  uint32_t pred) {
-  #if defined(CUTLASS_ENABLE_SYNCLOG)
-  if constexpr (!synclog_enable_cluster_transaction_barrier_complete_transaction) return;
-  if (!synclog_condition_emit()) return;
-  uint64_t bits = synclog_mbarrier_bits(smem_addr);
-  uint32_t* to = synclog_alloc(synclog_length_cluster_transaction_barrier_complete_transaction);
-  if (to == nullptr) return;
-  synclog_emit_prefix(to, synclog_header_cluster_transaction_barrier_complete_transaction, line);
-  to[synclog_length_prefix + 0] = smem_addr;
-  to[synclog_length_prefix + 1] = dst_cta_id;
-  to[synclog_length_prefix + 2] = transaction_bytes;
-  to[synclog_length_prefix + 3] = pred;
-  to[synclog_length_prefix + 4] = bits;
-  to[synclog_length_prefix + 5] = bits >> 32;
-  #else
-  CUTLASS_UNUSED(line);
-  CUTLASS_UNUSED(smem_addr);
-  CUTLASS_UNUSED(dst_cta_id);
-  CUTLASS_UNUSED(transaction_bytes);
-  CUTLASS_UNUSED(pred);
-  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
-}
-
-CUTLASS_DEVICE
-void synclog_emit_fence_barrier_init(uint32_t line) {
-  #if defined(CUTLASS_ENABLE_SYNCLOG)
-  if constexpr (!synclog_enable_fence_barrier_init) return;
-  if (!synclog_condition_emit()) return;
-  uint32_t* to = synclog_alloc(synclog_length_fence_barrier_init);
-  if (to == nullptr) return;
-  synclog_emit_prefix(to, synclog_header_fence_barrier_init, line);
-  #else
-  CUTLASS_UNUSED(line);
-  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
-}
-
-CUTLASS_DEVICE
-void synclog_emit_fence_view_async_shared(uint32_t line) {
-  #if defined(CUTLASS_ENABLE_SYNCLOG)
-  if constexpr (!synclog_enable_fence_view_async_shared) return;
-  if (!synclog_condition_emit()) return;
-  uint32_t* to = synclog_alloc(synclog_length_fence_view_async_shared);
-  if (to == nullptr) return;
-  synclog_emit_prefix(to, synclog_header_fence_view_async_shared, line);
-  #else
-  CUTLASS_UNUSED(line);
-  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
-}
-
-CUTLASS_DEVICE
-void synclog_emit_cp_async_wait(
-  uint32_t line,
-  uint32_t n) {
-  #if defined(CUTLASS_ENABLE_SYNCLOG)
-  if constexpr (!synclog_enable_cp_async_wait) return;
-  if (!synclog_condition_emit()) return;
-  uint32_t* to = synclog_alloc(synclog_length_cp_async_wait);
-  if (to == nullptr) return;
-  synclog_emit_prefix(to, synclog_header_cp_async_wait, line);
-  to[synclog_length_prefix + 0] = n;
-  #else
-  CUTLASS_UNUSED(line);
-  CUTLASS_UNUSED(n);
-  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
-}
-
-CUTLASS_DEVICE
-void synclog_emit_cp_async_wait_all(uint32_t line) {
-  #if defined(CUTLASS_ENABLE_SYNCLOG)
-  if constexpr (!synclog_enable_cp_async_wait_all) return;
-  if (!synclog_condition_emit()) return;
-  uint32_t* to = synclog_alloc(synclog_length_cp_async_wait_all);
-  if (to == nullptr) return;
-  synclog_emit_prefix(to, synclog_header_cp_async_wait_all, line);
-  #else
-  CUTLASS_UNUSED(line);
-  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
-}
-
-CUTLASS_DEVICE
-void synclog_emit_cp_async_fence(uint32_t line) {
-  #if defined(CUTLASS_ENABLE_SYNCLOG)
-  if constexpr (!synclog_enable_cp_async_fence) return;
-  if (!synclog_condition_emit()) return;
-  uint32_t* to = synclog_alloc(synclog_length_cp_async_fence);
-  if (to == nullptr) return;
-  synclog_emit_prefix(to, synclog_header_cp_async_fence, line);
-  #else
-  CUTLASS_UNUSED(line);
-  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
-}
-
-CUTLASS_DEVICE
-void synclog_emit_cp_async_nan(
-  uint32_t line,
-  uint32_t smem_addr,
-  const void* gmem_ptr,
-  uint32_t pred) {
-  #if defined(CUTLASS_ENABLE_SYNCLOG)
-  if constexpr (!synclog_enable_cp_async_nan) return;
-  if (!synclog_condition_emit()) return;
-  uint32_t* to = synclog_alloc(synclog_length_cp_async_nan);
-  if (to == nullptr) return;
-  synclog_emit_prefix(to, synclog_header_cp_async_nan, line);
-  to[synclog_length_prefix + 0] = smem_addr;
-  to[synclog_length_prefix + 1] = (uint32_t)((uint64_t)gmem_ptr);
-  to[synclog_length_prefix + 2] = (uint32_t)((uint64_t)gmem_ptr >> 32);
-  to[synclog_length_prefix + 3] = pred;
-  #else
-  CUTLASS_UNUSED(line);
-  CUTLASS_UNUSED(smem_addr);
-  CUTLASS_UNUSED(gmem_ptr);
-  CUTLASS_UNUSED(pred);
-  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
-}
-
-CUTLASS_DEVICE
-void synclog_emit_cp_async_zfill(
-  uint32_t line,
-  uint32_t smem_addr,
-  const void* gmem_ptr,
-  uint32_t pred,
-  uint32_t size) {
-  #if defined(CUTLASS_ENABLE_SYNCLOG)
-  if constexpr (!synclog_enable_cp_async_zfill) return;
-  if (!synclog_condition_emit()) return;
-  uint32_t* to = synclog_alloc(synclog_length_cp_async_zfill);
-  if (to == nullptr) return;
-  synclog_emit_prefix(to, synclog_header_cp_async_zfill, line);
-  to[synclog_length_prefix + 0] = smem_addr;
-  to[synclog_length_prefix + 1] = (uint32_t)((uint64_t)gmem_ptr);
-  to[synclog_length_prefix + 2] = (uint32_t)((uint64_t)gmem_ptr >> 32);
-  to[synclog_length_prefix + 3] = pred;
-  to[synclog_length_prefix + 4] = size;
-  #else
-  CUTLASS_UNUSED(line);
-  CUTLASS_UNUSED(smem_addr);
-  CUTLASS_UNUSED(gmem_ptr);
-  CUTLASS_UNUSED(pred);
-  CUTLASS_UNUSED(size);
-  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
-}
-
-CUTLASS_DEVICE
-void synclog_emit_cp_async(
-  uint32_t line,
-  uint32_t smem_addr,
-  const void* gmem_ptr,
-  uint32_t pred,
-  uint32_t size) {
-  #if defined(CUTLASS_ENABLE_SYNCLOG)
-  if constexpr (!synclog_enable_cp_async) return;
-  if (!synclog_condition_emit()) return;
-  uint32_t* to = synclog_alloc(synclog_length_cp_async);
-  if (to == nullptr) return;
-  synclog_emit_prefix(to, synclog_header_cp_async, line);
-  to[synclog_length_prefix + 0] = smem_addr;
-  to[synclog_length_prefix + 1] = (uint32_t)((uint64_t)gmem_ptr);
-  to[synclog_length_prefix + 2] = (uint32_t)((uint64_t)gmem_ptr >> 32);
-  to[synclog_length_prefix + 3] = pred;
-  to[synclog_length_prefix + 4] = size;
-  #else
-  CUTLASS_UNUSED(line);
-  CUTLASS_UNUSED(smem_addr);
-  CUTLASS_UNUSED(gmem_ptr);
-  CUTLASS_UNUSED(pred);
-  CUTLASS_UNUSED(size);
-  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
-}
-
-CUTLASS_DEVICE
-void synclog_emit_tma_load(
-  uint32_t line,
-  uint64_t gmem_int_desc,
-  uint32_t smem_int_mbar,
-  uint32_t smem_int_ptr) {
-  #if defined(CUTLASS_ENABLE_SYNCLOG)
-  if constexpr (!synclog_enable_tma_load) return;
-  if (!synclog_condition_emit()) return;
-  uint32_t* to = synclog_alloc(synclog_length_tma_load);
-  if (to == nullptr) return;
-  synclog_emit_prefix(to, synclog_header_tma_load, line);
-  to[synclog_length_prefix + 0] = (uint32_t)((uint64_t)gmem_int_desc);
-  to[synclog_length_prefix + 1] = (uint32_t)((uint64_t)gmem_int_desc >> 32);
-  to[synclog_length_prefix + 2] = smem_int_mbar;
-  to[synclog_length_prefix + 3] = smem_int_ptr;
-  #else
-  CUTLASS_UNUSED(line);
-  CUTLASS_UNUSED(gmem_int_desc);
-  CUTLASS_UNUSED(smem_int_mbar);
-  CUTLASS_UNUSED(smem_int_ptr);
-  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
-}
-
-CUTLASS_DEVICE
-void synclog_emit_tma_store(
-  uint32_t line,
-  uint64_t gmem_int_desc,
-  uint32_t smem_int_ptr) {
-  #if defined(CUTLASS_ENABLE_SYNCLOG)
-  if constexpr (!synclog_enable_tma_store) return;
-  if (!synclog_condition_emit()) return;
-  uint32_t* to = synclog_alloc(synclog_length_tma_store);
-  if (to == nullptr) return;
-  synclog_emit_prefix(to, synclog_header_tma_store, line);
-  to[synclog_length_prefix + 0] = (uint32_t)((uint64_t)gmem_int_desc);
-  to[synclog_length_prefix + 1] = (uint32_t)((uint64_t)gmem_int_desc >> 32);
-  to[synclog_length_prefix + 2] = smem_int_ptr;
-  #else
-  CUTLASS_UNUSED(line);
-  CUTLASS_UNUSED(gmem_int_desc);
-  CUTLASS_UNUSED(smem_int_ptr);
-  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
-}
-
-CUTLASS_DEVICE
-void synclog_emit_tma_store_arrive(uint32_t line) {
-  #if defined(CUTLASS_ENABLE_SYNCLOG)
-  if constexpr (!synclog_enable_tma_store_arrive) return;
-  if (!synclog_condition_emit()) return;
-  uint32_t* to = synclog_alloc(synclog_length_tma_store_arrive);
-  if (to == nullptr) return;
-  synclog_emit_prefix(to, synclog_header_tma_store_arrive, line);
-  #else
-  CUTLASS_UNUSED(line);
-  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
-}
-
-CUTLASS_DEVICE
-void synclog_emit_tma_store_wait(
-  uint32_t line,
-  uint32_t count) {
-  #if defined(CUTLASS_ENABLE_SYNCLOG)
-  if constexpr (!synclog_enable_tma_store_wait) return;
-  if (!synclog_condition_emit()) return;
-  uint32_t* to = synclog_alloc(synclog_length_tma_store_wait);
-  if (to == nullptr) return;
-  synclog_emit_prefix(to, synclog_header_tma_store_wait, line);
-  to[synclog_length_prefix + 0] = count;
-  #else
-  CUTLASS_UNUSED(line);
-  CUTLASS_UNUSED(count);
-  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
-}
-
-CUTLASS_DEVICE
-void synclog_emit_warpgroup_arrive(
-  uint32_t line) {
-  #if defined(CUTLASS_ENABLE_SYNCLOG)
-  if constexpr (!synclog_enable_warpgroup_arrive) return;
-  if (!synclog_condition_emit()) return;
-  uint32_t* to = synclog_alloc(synclog_length_warpgroup_arrive);
-  if (to == nullptr) return;
-  synclog_emit_prefix(to, synclog_header_warpgroup_arrive, line);
-  #else
-  CUTLASS_UNUSED(line);
-  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
-}
-
-CUTLASS_DEVICE
-void synclog_emit_warpgroup_wait(
-  uint32_t line,
-  uint32_t n) {
-  #if defined(CUTLASS_ENABLE_SYNCLOG)
-  if constexpr (!synclog_enable_warpgroup_wait) return;
-  if (!synclog_condition_emit()) return;
-  uint32_t* to = synclog_alloc(synclog_length_warpgroup_wait);
-  if (to == nullptr) return;
-  synclog_emit_prefix(to, synclog_header_warpgroup_wait, line);
-  to[synclog_length_prefix + 0] = n;
-  #else
-  CUTLASS_UNUSED(line);
-  CUTLASS_UNUSED(n);
-  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
-}
-
-CUTLASS_DEVICE
-void synclog_emit_warpgroup_commit_batch(
-  uint32_t line) {
-  #if defined(CUTLASS_ENABLE_SYNCLOG)
-  if constexpr (!synclog_enable_warpgroup_commit_batch) return;
-  if (!synclog_condition_emit()) return;
-  uint32_t* to = synclog_alloc(synclog_length_warpgroup_commit_batch);
-  if (to == nullptr) return;
-  synclog_emit_prefix(to, synclog_header_warpgroup_commit_batch, line);
-  #else
-  CUTLASS_UNUSED(line);
-  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
-}
-
-CUTLASS_DEVICE
-void synclog_emit_wgmma_reg_smem(
-  uint32_t line,
-  uint64_t desc_b) {
-  #if defined(CUTLASS_ENABLE_SYNCLOG)
-  if constexpr (!synclog_enable_wgmma_reg_smem) return;
-  if (!synclog_condition_emit()) return;
-  uint32_t* to = synclog_alloc(synclog_length_wgmma_reg_smem);
-  if (to == nullptr) return;
-  synclog_emit_prefix(to, synclog_header_wgmma_reg_smem, line);
-  to[synclog_length_prefix + 0] = desc_b;
-  to[synclog_length_prefix + 1] = desc_b >> 32;
-  #else
-  CUTLASS_UNUSED(line);
-  CUTLASS_UNUSED(desc_b);
-  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
-}
-
-CUTLASS_DEVICE
-void synclog_emit_wgmma_smem_smem(
-  uint32_t line,
-  uint64_t desc_a,
-  uint64_t desc_b) {
-  #if defined(CUTLASS_ENABLE_SYNCLOG)
-  if constexpr (!synclog_enable_wgmma_smem_smem) return;
-  if (!synclog_condition_emit()) return;
-  uint32_t* to = synclog_alloc(synclog_length_wgmma_smem_smem);
-  if (to == nullptr) return;
-  synclog_emit_prefix(to, synclog_header_wgmma_smem_smem, line);
-  to[synclog_length_prefix + 0] = desc_a;
-  to[synclog_length_prefix + 1] = desc_a >> 32;
-  to[synclog_length_prefix + 2] = desc_b;
-  to[synclog_length_prefix + 3] = desc_b >> 32;
-  #else
-  CUTLASS_UNUSED(line);
-  CUTLASS_UNUSED(desc_a);
-  CUTLASS_UNUSED(desc_b);
-  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
-}
-
-CUTLASS_DEVICE
-void synclog_emit_cpasync_barrier_arrive(
-  uint32_t line,
-  uint32_t smem_addr) {
-  #if defined(CUTLASS_ENABLE_SYNCLOG)
-  if constexpr (!synclog_enable_cpasync_barrier_arrive) return;
-  if (!synclog_condition_emit()) return;
-  uint64_t bits = synclog_mbarrier_bits(smem_addr);
-  uint32_t* to = synclog_alloc(synclog_length_cpasync_barrier_arrive);
-  if (to == nullptr) return;
-  synclog_emit_prefix(to, synclog_header_cpasync_barrier_arrive, line);
-  to[synclog_length_prefix + 0] = smem_addr;
-  to[synclog_length_prefix + 1] = bits;
-  to[synclog_length_prefix + 2] = bits >> 32;
-  #else
-  CUTLASS_UNUSED(line);
-  CUTLASS_UNUSED(smem_addr);
-  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
-}
-
-#if !defined(CUTLASS_ENABLE_SYNCLOG)
-CUTLASS_DEVICE
-#elif defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
-static __attribute__((__noinline__)) __device__
-#else
-static __attribute__((__noinline__))
-#endif
-void synclog_print() {
-  #if defined(CUTLASS_ENABLE_SYNCLOG)
-  #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
-  if (synclog_buf == nullptr || !synclog_condition_print()) {
-    return;
-  }
-  printf("synclog start\n");
-  for (uint32_t at = 1; at < synclog_cap; ) {
-    uint32_t header = synclog_buf[at];
-    if (header == synclog_header_none) {
-      break;
-    }
-    printf("synclog at %u: ", at);
-    if constexpr (synclog_enable_syncthreads) {
-      if (header == synclog_header_syncthreads) {
-        synclog_print_prefix("syncthreads", at);
-        at += synclog_length_syncthreads;
-        printf("\n");
-        continue;
-      }
-    }
-    if constexpr (synclog_enable_syncwarp) {
-      if (header == synclog_header_syncwarp) {
-        synclog_print_prefix("syncwarp", at);
-        at += synclog_length_syncwarp;
-        printf("\n");
-        continue;
-      }
-    }
-    if constexpr (synclog_enable_named_barrier_arrive_and_wait) {
-      if (header == synclog_header_named_barrier_arrive_and_wait) {
-        synclog_print_prefix("named_barrier_arrive_and_wait", at);
-        at += synclog_length_named_barrier_arrive_and_wait;
-        printf("num_threads=%u barrier_id=%u\n", synclog_buf[at-2], synclog_buf[at-1]);
-        continue;
-      }
-    }
-    if constexpr (synclog_enable_named_barrier_arrive) {
-      if (header == synclog_header_named_barrier_arrive) {
-        synclog_print_prefix("named_barrier_arrive", at);
-        at += synclog_length_named_barrier_arrive;
-        printf("num_threads=%u barrier_id=%u\n", synclog_buf[at-2], synclog_buf[at-1]);
-        continue;
-      }
-    }
-    if constexpr (synclog_enable_cluster_barrier_init) {
-      if (header == synclog_header_cluster_barrier_init) {
-        synclog_print_prefix("cluster_barrier_init", at);
-        at += synclog_length_cluster_barrier_init;
-        printf("smem_addr=%u arrive_count=%u\n", synclog_buf[at-2], synclog_buf[at-1]);
-        continue;
-      }
-    }
-    if constexpr (synclog_enable_cluster_barrier_wait) {
-      if (header == synclog_header_cluster_barrier_wait) {
-        synclog_print_prefix("cluster_barrier_wait", at);
-        at += synclog_length_cluster_barrier_wait;
-        printf("smem_addr=%u phase=%u", synclog_buf[at-4], synclog_buf[at-3]);
-        continue;
-      }
-    }
-    if constexpr (synclog_enable_cluster_barrier_test_wait) {
-      if (header == synclog_header_cluster_barrier_test_wait) {
-        synclog_print_prefix("cluster_barrier_test_wait", at);
-        at += synclog_length_cluster_barrier_test_wait;
-        printf("smem_addr=%u phase=%u pred=%u", synclog_buf[at-5], synclog_buf[at-4], synclog_buf[at-3]);
-        continue;
-      }
-    }
-    if constexpr (synclog_enable_cluster_barrier_try_wait) {
-      if (header == synclog_header_cluster_barrier_try_wait) {
-        synclog_print_prefix("cluster_barrier_try_wait", at);
-        at += synclog_length_cluster_barrier_try_wait;
-        printf("smem_addr=%u phase=%u", synclog_buf[at-4], synclog_buf[at-3]);
-        continue;
-      }
-    }
-    if constexpr (synclog_enable_cluster_barrier_arrive_cluster) {
-      if (header == synclog_header_cluster_barrier_arrive_cluster) {
-        synclog_print_prefix("cluster_barrier_arrive_cluster", at);
-        at += synclog_length_cluster_barrier_arrive_cluster;
-        printf("smem_addr=%u cta_id=%u pred=%u", synclog_buf[at-5], synclog_buf[at-4], synclog_buf[at-3]);
-        continue;
-      }
-    }
-    if constexpr (synclog_enable_cluster_barrier_arrive) {
-      if (header == synclog_header_cluster_barrier_arrive) {
-        synclog_print_prefix("cluster_barrier_arrive", at);
-        at += synclog_length_cluster_barrier_arrive;
-        printf("smem_addr=%u", synclog_buf[at-3]);
-        continue;
-      }
-    }
-    if constexpr (synclog_enable_cluster_barrier_invalidate) {
-      if (header == synclog_header_cluster_barrier_invalidate) {
-        synclog_print_prefix("cluster_barrier_invalidate", at);
-        at += synclog_length_cluster_barrier_invalidate;
-        printf("smem_addr=%u", synclog_buf[at-3]);
-        continue;
-      }
-    }
-    if constexpr (synclog_enable_cluster_transaction_barrier_arrive_and_expect_tx) {
-      if (header == synclog_header_cluster_transaction_barrier_arrive_and_expect_tx) {
-        synclog_print_prefix("cluster_transaction_barrier_arrive_and_expect_tx", at);
-        at += synclog_length_cluster_transaction_barrier_arrive_and_expect_tx;
-        printf("smem_addr=%u transaction_bytes=%u", synclog_buf[at-4], synclog_buf[at-3]);
-        continue;
-      }
-    }
-    if constexpr (synclog_enable_cluster_transaction_barrier_arrive_and_expect_tx_cluster) {
-      if (header == synclog_header_cluster_transaction_barrier_arrive_and_expect_tx_cluster) {
-        synclog_print_prefix("cluster_transaction_barrier_arrive_and_expect_tx_cluster", at);
-        at += synclog_length_cluster_transaction_barrier_arrive_and_expect_tx_cluster;
-        printf("smem_addr=%u transaction_bytes=%u cta_id=%u pred=%u", synclog_buf[at-6], synclog_buf[at-5], synclog_buf[at-4], synclog_buf[at-3]);
-        continue;
-      }
-    }
-    if constexpr (synclog_enable_cluster_transaction_barrier_expect_transaction) {
-      if (header == synclog_header_cluster_transaction_barrier_expect_transaction) {
-        synclog_print_prefix("cluster_transaction_barrier_expect_transaction", at);
-        at += synclog_length_cluster_transaction_barrier_expect_transaction;
-        printf("smem_addr=%u transaction_bytes=%u", synclog_buf[at-4], synclog_buf[at-3]);
-        continue;
-      }
-    }
-    if constexpr (synclog_enable_cluster_transaction_barrier_complete_transaction) {
-      if (header == synclog_header_cluster_transaction_barrier_complete_transaction) {
-        synclog_print_prefix("cluster_transaction_barrier_complete_transaction", at);
-        at += synclog_length_cluster_transaction_barrier_complete_transaction;
-        printf("smem_addr=%u dst_cta_id=%u transaction_bytes=%u pred=%u", synclog_buf[at-6], synclog_buf[at-5], synclog_buf[at-4], synclog_buf[at-3]);
-        continue;
-      }
-    }
-    if constexpr (synclog_enable_fence_barrier_init) {
-      if (header == synclog_header_fence_barrier_init) {
-        synclog_print_prefix("fence_barrier_init", at);
-        at += synclog_length_fence_barrier_init;
-        printf("\n");
-        continue;
-      }
-    }
-    if constexpr (synclog_enable_fence_view_async_shared) {
-      if (header == synclog_header_fence_view_async_shared) {
-        synclog_print_prefix("fence_view_async_shared", at);
-        at += synclog_length_fence_view_async_shared;
-        printf("\n");
-        continue;
-      }
-    }
-    if constexpr (synclog_enable_cp_async_wait) {
-      if (header == synclog_header_cp_async_wait) {
-        synclog_print_prefix("cp_async_wait", at);
-        at += synclog_length_cp_async_wait;
-        printf("n=%u\n", synclog_buf[at-1]);
-        continue;
-      }
-    }
-    if constexpr (synclog_enable_cp_async_wait_all) {
-      if (header == synclog_header_cp_async_wait_all) {
-        synclog_print_prefix("cp_async_wait_all", at);
-        at += synclog_length_cp_async_wait_all;
-        printf("\n");
-        continue;
-      }
-    }
-    if constexpr (synclog_enable_cp_async_fence) {
-      if (header == synclog_header_cp_async_fence) {
-        synclog_print_prefix("cp_async_fence", at);
-        at += synclog_length_cp_async_fence;
-        printf("\n");
-        continue;
-      }
-    }
-    if constexpr (synclog_enable_cp_async_nan) {
-      if (header == synclog_header_cp_async_nan) {
-        synclog_print_prefix("cp_async_nan", at);
-        at += synclog_length_cp_async_nan;
-        uint64_t gmem_addr = synclog_buf[at-3];
-        gmem_addr += (uint64_t)synclog_buf[at-2] << 32;
-        printf("smem_addr=%u gmem_addr=%llu pred=%u\n", synclog_buf[at-4], gmem_addr, synclog_buf[at-1]);
-        continue;
-      }
-    }
-    if constexpr (synclog_enable_cp_async_zfill) {
-      if (header == synclog_header_cp_async_zfill) {
-        synclog_print_prefix("cp_async_zfill", at);
-        at += synclog_length_cp_async_zfill;
-        uint64_t gmem_addr = synclog_buf[at-4];
-        gmem_addr += (uint64_t)synclog_buf[at-3] << 32;
-        printf("smem_addr=%u gmem_addr=%llu pred=%u size=%u\n", synclog_buf[at-5], gmem_addr, synclog_buf[at-2], synclog_buf[at-1]);
-        continue;
-      }
-    }
-    if constexpr (synclog_enable_cp_async) {
-      if (header == synclog_header_cp_async) {
-        synclog_print_prefix("cp_async", at);
-        at += synclog_length_cp_async;
-        uint64_t gmem_addr = synclog_buf[at-4];
-        gmem_addr += (uint64_t)synclog_buf[at-3] << 32;
-        printf("smem_addr=%u gmem_addr=%llu pred=%u size=%u\n", synclog_buf[at-5], gmem_addr, synclog_buf[at-2], synclog_buf[at-1]);
-        continue;
-      }
-    }
-    if constexpr (synclog_enable_tma_load) {
-      if (header == synclog_header_tma_load) {
-        synclog_print_prefix("tma_load", at);
-        at += synclog_length_tma_load;
-        uint64_t gmem_int_desc = synclog_buf[at-4];
-        gmem_int_desc += (uint64_t)synclog_buf[at-3] << 32;
-        printf("gmem_int_desc=%llu smem_int_mbar=%u smem_int_ptr=%u\n", gmem_int_desc, synclog_buf[at-2], synclog_buf[at-1]);
-        continue;
-      }
-    }
-    if constexpr (synclog_enable_tma_store) {
-      if (header == synclog_header_tma_store) {
-        synclog_print_prefix("tma_store", at);
-        at += synclog_length_tma_store;
-        uint64_t gmem_int_desc = synclog_buf[at-3];
-        gmem_int_desc += (uint64_t)synclog_buf[at-2] << 32;
-        printf("gmem_int_desc=%llu smem_int_ptr=%u\n", gmem_int_desc, synclog_buf[at-1]);
-        continue;
-      }
-    }
-    if constexpr (synclog_enable_tma_store_arrive) {
-      if (header == synclog_header_tma_store_arrive) {
-        synclog_print_prefix("tma_store_arrive", at);
-        at += synclog_length_tma_store_arrive;
-        printf("\n");
-        continue;
-      }
-    }
-    if constexpr (synclog_enable_tma_store_wait) {
-      if (header == synclog_header_tma_store_wait) {
-        synclog_print_prefix("tma_store_wait", at);
-        at += synclog_length_tma_store_wait;
-        printf("count=%u\n", synclog_buf[at-1]);
-        continue;
-      }
-    }
-    if constexpr (synclog_enable_warpgroup_arrive) {
-      if (header == synclog_header_warpgroup_arrive) {
-        synclog_print_prefix("warpgroup_arrive", at);
-        at += synclog_length_warpgroup_arrive;
-        printf("\n");
-        continue;
-      }
-    }
-    if constexpr (synclog_enable_warpgroup_wait) {
-      if (header == synclog_header_warpgroup_wait) {
-        synclog_print_prefix("warpgroup_wait", at);
-        at += synclog_length_warpgroup_wait;
-        printf("n=%u\n", synclog_buf[at-1]);
-        continue;
-      }
-    }
-    if constexpr (synclog_enable_warpgroup_commit_batch) {
-      if (header == synclog_header_warpgroup_commit_batch) {
-        synclog_print_prefix("warpgroup_commit_batch", at);
-        at += synclog_length_warpgroup_commit_batch;
-        printf("\n");
-        continue;
-      }
-    }
-    if constexpr (synclog_enable_wgmma_reg_smem) {
-      if (header == synclog_header_wgmma_reg_smem) {
-        synclog_print_prefix("wgmma_reg_smem", at);
-        at += synclog_length_wgmma_reg_smem;
-        synclog_print_wgmma_desc("desc_b", synclog_buf[at-2], synclog_buf[at-1], "");
-        printf("\n");
-        continue;
-      }
-    }
-    if constexpr (synclog_enable_wgmma_smem_smem) {
-      if (header == synclog_header_wgmma_smem_smem) {
-        synclog_print_prefix("wgmma_smem_smem", at);
-        at += synclog_length_wgmma_smem_smem;
-        synclog_print_wgmma_desc("desc_a", synclog_buf[at-4], synclog_buf[at-3], " ");
-        synclog_print_wgmma_desc("desc_b", synclog_buf[at-2], synclog_buf[at-1], "");
-        printf("\n");
-        continue;
-      }
-    }
-    if constexpr (synclog_enable_cpasync_barrier_arrive) {
-      if (header == synclog_header_cpasync_barrier_arrive) {
-        synclog_print_prefix("cpasync_barrier_arrive", at);
-        at += synclog_length_cpasync_barrier_arrive;
-        printf("smem_addr=%u", synclog_buf[at-3]);
-        continue;
-      }
-    }
-    asm volatile ("brkpt;\n" ::);
-  }
-  if (synclog_buf[0] >= synclog_cap) {
-    printf(
-      "synclog was truncated (exceeded capacity of %lu bytes)\n",
-      (synclog_cap - 1) * sizeof(uint32_t)
-    );
-  }
-  printf("synclog end\n");
-  #endif
-  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-#if defined(CUTLASS_ENABLE_SYNCLOG)
-#undef __syncthreads
-#define __syncthreads() do {\
-  cutlass::arch::synclog_emit_syncthreads(__LINE__);\
-  __syncthreads();\
-} while (0)
-#endif // defined(CUTLASS_ENABLE_SYNCLOG)
-
-#if defined(CUTLASS_ENABLE_SYNCLOG)
-#undef __syncwarp
-#define __syncwarp(...) do {\
-  cutlass::arch::synclog_emit_syncwarp(__LINE__);\
-  __syncwarp(__VA_ARGS__);\
-} while (0)
-#endif // defined(CUTLASS_ENABLE_SYNCLOG)
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace arch
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/wmma.h b/lightllm-kernel/cutlass/include/cutlass/arch/wmma.h
deleted file mode 100755
index 720895f38..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/arch/wmma.h
+++ /dev/null
@@ -1,223 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates exposing architecture support for warp matrix multiply-add (WMMA) operations
-*/
-
-#pragma once
-
-// CUTLASS WMMA does not support clang at present.
-#if !(defined(__clang__) && defined(__CUDA__))
-
-#if (__CUDACC_VER_MAJOR__ >= 9)
-#if (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))
-#define CUTLASS_ARCH_WMMA_ENABLED
-#define CUTLASS_ARCH_WMMA_SM70_ENABLED
-#endif
-#endif
-
-#if (__CUDACC_VER_MAJOR__ >= 10)
-#if (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 720))
-#define CUTLASS_ARCH_INTEGER_MATRIX_MULTIPLY_ENABLED
-#define CUTLASS_ARCH_WMMA_SM72_ENABLED
-#endif
-#endif
-
-#if (__CUDACC_VER_MAJOR__ >= 10)
-#if (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 750))
-#define CUTLASS_SUBBYTE_INTEGER_MATRIX_MULTIPLY_ENABLED
-#define CUTLASS_ARCH_WMMA_SM75_ENABLED
-#endif
-#endif
-
-#endif //!(defined(__clang__) && defined(__CUDA__))
-
-#if defined(CUTLASS_ARCH_WMMA_ENABLED)
-
-#include <mma.h>
-#include "cutlass/arch/mma.h"
-#include "cutlass/array.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/gemm/gemm.h"
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace arch {
-
-////////////////////////////////////////////////////////////////////////////////////////////////
-/// Statically maps cutlass data types => nvcuda::wmma data types
-/////////////////////////////////////////////////////////////////////////////////////////////////
-template <typename Type_>
-struct CutlassToWmmaDataType{
-  using Type = Type_;
-};
-
-/// Statically maps cutlass::half_t => __half
-template<>
-struct CutlassToWmmaDataType<cutlass::half_t> {
-  using Type = __half;
-};
-
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) && (__CUDACC_VER_MAJOR__ >= 11)
-template<>
-struct CutlassToWmmaDataType<cutlass::bfloat16_t> {
-  using Type = __nv_bfloat16;
-};
-#endif
-
-/// Statically maps int8_t => char
-template<>
-struct CutlassToWmmaDataType<int8_t> {
-  using Type = signed char;
-};
-
-/// Statically maps uint8_t => char
-template<>
-struct CutlassToWmmaDataType<uint8_t> {
-  using Type = unsigned char;
-};
-
-/// Statically maps int32_t => int
-template<>
-struct CutlassToWmmaDataType<int32_t> {
-  using Type = int;
-};
-
-#if defined(CUTLASS_SUBBYTE_INTEGER_MATRIX_MULTIPLY_ENABLED)
-/// Statically maps cutlass::int4b_t => experimental::precision::s4
-template<>
-struct CutlassToWmmaDataType<cutlass::int4b_t> {
-  using Type = nvcuda::wmma::experimental::precision::s4;
-};
-
-/// Statically maps cutlass::uint4b_t => experimental::precision::s4
-template<>
-struct CutlassToWmmaDataType<cutlass::uint4b_t> {
-  using Type = nvcuda::wmma::experimental::precision::u4;
-};
-
-/// Statically maps cutlass::uint1b_t => experimental::precision::b1
-template<>
-struct CutlassToWmmaDataType<cutlass::uint1b_t> {
-  using Type = nvcuda::wmma::experimental::precision::b1;
-};
-#endif
-
-////////////////////////////////////////////////////////////////////////////////////////////////
-/// Statically maps cutlass::layout => nvcuda::wmma layout tags
-////////////////////////////////////////////////////////////////////////////////////////////////
-template <typename Layout_>
-struct CutlassToWmmaLayout {
-};
-
-/// Statically maps cutlass::layout::RowMajor => nvcuda::wmma::row_major layout tags
-template <>
-struct CutlassToWmmaLayout<cutlass::layout::RowMajor> {
-  using Layout = nvcuda::wmma::row_major;
-  static nvcuda::wmma::layout_t const value = nvcuda::wmma::layout_t::mem_row_major;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////
-/// Statically maps cutlass::layout::RowMajor => nvcuda::wmma::row_major layout tags
-////////////////////////////////////////////////////////////////////////////////////////////////
-template <>
-struct CutlassToWmmaLayout<cutlass::layout::ColumnMajor> {
-  using Layout = nvcuda::wmma::col_major;
-  static nvcuda::wmma::layout_t const value = nvcuda::wmma::layout_t::mem_col_major;
-};
-////////////////////////////////////////////////////////////////////////////////////////////////
-
-////////////////////////////////////////////////////////////////////////////////////////////////
-/// Statically maps nvcuda::wmma data types => cutlass data types
-/////////////////////////////////////////////////////////////////////////////////////////////////
-template <typename Type_>
-struct WmmaToCutlassDataType{
-  using Type = Type_;
-};
-
-/// Statically maps __half => cutlass::half_t
-template<>
-struct WmmaToCutlassDataType<__half> {
-  using Type = cutlass::half_t;
-};
-
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) && (__CUDACC_VER_MAJOR__ >= 11)
-template<>
-struct WmmaToCutlassDataType<__nv_bfloat16> {
-  using Type = cutlass::bfloat16_t;
-};
-#endif
-
-////////////////////////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// WMMA template structure defines nvcuda::wmma::fragments and static assertion chaeks
-// for a specific template paramterized data type (Element[A|B|C]), layout (Layout[A|B|C]), 
-// and native wmma size (Shape)
-/////////////////////////////////////////////////////////////////////////////////////////////////
-template <  
-  typename Shape_,                                   ///< Size of the matrix product (concept: GemmShape)
-  typename ElementA_,                                ///< Data type of A elements 
-  typename LayoutA_,                                 ///< Layout of A matrix (concept: MatrixLayout)  
-  typename ElementB_,                                ///< Data type of B elements
-  typename LayoutB_,                                 ///< Layout of B matrix (concept: MatrixLayout)  
-  typename ElementC_,                                ///< Element type of C matrix  
-  typename LayoutC_,                                 /// Layout of C matrix (concept: MatrixLayout)
-  typename Operator_ = cutlass::arch::OpMultiplyAdd   ///< Inner product operator (multiply-add, xor.popc)
->
-struct Wmma;
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace arch
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-//
-// Specializations for each compute capability
-//
-#ifdef CUTLASS_ARCH_WMMA_SM70_ENABLED
-#include "cutlass/arch/wmma_sm70.h"
-#endif
-
-#ifdef CUTLASS_ARCH_WMMA_SM72_ENABLED
-#include "cutlass/arch/wmma_sm72.h"
-#endif
-
-#ifdef CUTLASS_ARCH_WMMA_SM75_ENABLED
-#include "cutlass/arch/wmma_sm75.h"
-#endif
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-#endif //CUTLASS_ARCH_WMMA_ENABLED
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/wmma_sm70.h b/lightllm-kernel/cutlass/include/cutlass/arch/wmma_sm70.h
deleted file mode 100755
index 19fda4f85..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/arch/wmma_sm70.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Matrix multiply
-*/
-
-#pragma once
-
-#if defined(__CUDACC_RTC__)
-#include <cuda/std/cassert>
-#else
-#include <assert.h>
-#endif
-#include "cutlass/layout/matrix.h"
-
-////////////////////////////////////////////////////////////////////////////////
-namespace cutlass {
-namespace arch {
-
-
-////////////////////////////////////////////////////////////////////////////////
-//
-// WMMA template structure defines nvcuda::wmma::fragments and static assert for
-// wmma native instruction sizes supported for half
-//
-////////////////////////////////////////////////////////////////////////////////
-template <
-typename Shape_, 
-typename LayoutA_, 
-typename LayoutB_,
-typename ElementC_,
-typename LayoutC_>
-struct Wmma<
-  Shape_,                                   ///< Size of the matrix product (concept: GemmShape)
-  cutlass::half_t,                          ///< ElementA
-  LayoutA_,                                 ///< LayoutA
-  cutlass::half_t,                          ///< ElementB
-  LayoutB_,                                 ///< LayoutB
-  ElementC_,                                ///< ElementC
-  LayoutC_,                                 ///< LayoutC
-  cutlass::arch::OpMultiplyAdd              ///< Operator (multiply-add, xor.popc)
-> {
-
-#if defined(CUTLASS_ARCH_WMMA_SM70_ENABLED)
-  using Shape = Shape_;
-  using ElementA = cutlass::half_t;
-  using LayoutA = LayoutA_;
-  using ElementB = cutlass::half_t;
-  using LayoutB = LayoutB_;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using Operator = cutlass::arch::OpMultiplyAdd;
-  using ArchTag = arch::Sm70;
-
-  // check supported wmma shape for the given multiplicand data types
-  static_assert(
-    platform::is_same<cutlass::gemm::GemmShape<16, 16, 16>, Shape>::value ||
-    platform::is_same<cutlass::gemm::GemmShape< 8, 32, 16>, Shape>::value ||
-    platform::is_same<cutlass::gemm::GemmShape<32,  8, 16>, Shape>::value,
-    "Supported list of wmma operator shape for f16 multiplicands are: 16x16x16, 8x32x16, and 32x8x16");
-
-  // check supported wmma output data type for the given multiplicand data types
-  static_assert(
-    platform::is_same<cutlass::half_t, ElementC>::value || platform::is_same<float, ElementC>::value,
-    "Supported of wmma output data type for f16 multiplicands are: f16 and f32");
-
-  // Wmma Fragment
-  using FragmentA = nvcuda::wmma::fragment<
-          nvcuda::wmma::matrix_a,
-          Shape::kM,
-          Shape::kN,
-          Shape::kK,
-          typename CutlassToWmmaDataType<ElementA>::Type,
-          typename CutlassToWmmaLayout<LayoutA>::Layout>;
-
-  using FragmentB = nvcuda::wmma::fragment<
-          nvcuda::wmma::matrix_b,
-          Shape::kM,
-          Shape::kN,
-          Shape::kK,
-          typename CutlassToWmmaDataType<ElementB>::Type,
-          typename CutlassToWmmaLayout<LayoutB>::Layout>;
-
-  using FragmentC = nvcuda::wmma::fragment<
-          nvcuda::wmma::accumulator,
-          Shape::kM,
-          Shape::kN,
-          Shape::kK,
-          typename CutlassToWmmaDataType<ElementC>::Type>;
-
-  /// Performs a nvcuda::wmma matrix multiply-accumulate operation
-  CUTLASS_DEVICE
-  void operator()(
-    FragmentC &D, 
-    FragmentA const &A, 
-    FragmentB const &B, 
-    FragmentC const &C) const {
-    
-      nvcuda::wmma::mma_sync(D, A, B, C);
-  }
-#else
-    static_assert(false, "wmma.mma.sync for floating point multiplicands is avialable only for SM70 and beyond");
-#endif
-
-};
-
-} // namespace arch
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/wmma_sm72.h b/lightllm-kernel/cutlass/include/cutlass/arch/wmma_sm72.h
deleted file mode 100755
index 4a2689058..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/arch/wmma_sm72.h
+++ /dev/null
@@ -1,210 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Matrix multiply
-*/
-
-#pragma once
-
-#if defined(__CUDACC_RTC__)
-#include <cuda/std/cassert>
-#else
-#include <assert.h>
-#endif
-#include "cutlass/layout/matrix.h"
-
-////////////////////////////////////////////////////////////////////////////////
-namespace cutlass {
-namespace arch {
-
-////////////////////////////////////////////////////////////////////////////////
-//
-// WMMA template structure defines nvcuda::wmma::fragments and static assert for
-// wmma native instruction sizes supported for int8_t
-//
-////////////////////////////////////////////////////////////////////////////////
-template <
-typename Shape_, 
-typename LayoutA_, 
-typename LayoutB_,
-typename LayoutC_>
-struct Wmma<
-  Shape_,                                   ///< Size of the matrix product (concept: GemmShape)
-  int8_t,                                   ///< ElementA
-  LayoutA_,                                 ///< LayoutA
-  int8_t,                                   ///< ElementB
-  LayoutB_,                                 ///< LayoutB
-  int32_t,                                  ///< ElementC
-  LayoutC_,                                 ///< LayoutC
-  cutlass::arch::OpMultiplyAdd              ///< Operator (multiply-add, xor.popc)
-> {
-#if defined(CUTLASS_ARCH_WMMA_SM72_ENABLED)
-  using Shape = Shape_;
-  using ElementA = int8_t;
-  using LayoutA = LayoutA_;
-  using ElementB = int8_t;
-  using LayoutB = LayoutB_;
-  using ElementC = int32_t;
-  using LayoutC = LayoutC_;
-  using Operator = cutlass::arch::OpMultiplyAdd;
-  using ArchTag = arch::Sm72;
-
-  // check supported wmma shape for the given multiplicand data types
-  static_assert(
-    platform::is_same<cutlass::gemm::GemmShape<16, 16, 16>, Shape>::value ||
-    platform::is_same<cutlass::gemm::GemmShape< 8, 32, 16>, Shape>::value ||
-    platform::is_same<cutlass::gemm::GemmShape<32,  8, 16>, Shape>::value,
-    "Supported list of wmma operator shape for s8 multiplicands are: 16x16x16, 8x32x16, and 32x8x16");
-
-
-  // Wmma Fragment
-  using FragmentA = nvcuda::wmma::fragment<
-          nvcuda::wmma::matrix_a,
-          Shape::kM,
-          Shape::kN,
-          Shape::kK,
-          typename CutlassToWmmaDataType<ElementA>::Type,
-          typename CutlassToWmmaLayout<LayoutA>::Layout>;
-
-  using FragmentB = nvcuda::wmma::fragment<
-          nvcuda::wmma::matrix_b,
-          Shape::kM,
-          Shape::kN,
-          Shape::kK,
-          typename CutlassToWmmaDataType<ElementB>::Type,
-          typename CutlassToWmmaLayout<LayoutB>::Layout>;
-
-  using FragmentC = nvcuda::wmma::fragment<
-          nvcuda::wmma::accumulator,
-          Shape::kM,
-          Shape::kN,
-          Shape::kK,
-          typename CutlassToWmmaDataType<ElementC>::Type>;
-
-  /// Performs a nvcuda::wmma matrix multiply-accumulate operation
-  CUTLASS_DEVICE
-  void operator()(
-    FragmentC &D, 
-    FragmentA const &A, 
-    FragmentB const &B, 
-    FragmentC const &C) const {
-
-      nvcuda::wmma::mma_sync(D, A, B, C);
-  }
-
-#else
-    static_assert(false, "wmma.mma.sync interger type multiplicands is avialable only for SM72 and beyond");
-#endif
-
-};
-
-////////////////////////////////////////////////////////////////////////////////
-//
-// WMMA template structure defines nvcuda::wmma::fragments and static assert for
-// wmma native instruction sizes supported for uint8_t
-//
-////////////////////////////////////////////////////////////////////////////////
-template <
-typename Shape_, 
-typename LayoutA_, 
-typename LayoutB_,
-typename LayoutC_>
-struct Wmma<
-  Shape_,                                   ///< Size of the matrix product (concept: GemmShape)
-  uint8_t,                                  ///< ElementA
-  LayoutA_,                                 ///< LayoutA
-  uint8_t,                                  ///< ElementB
-  LayoutB_,                                 ///< LayoutB
-  int32_t,                                  ///< ElementC
-  LayoutC_,                                 ///< LayoutC
-  cutlass::arch::OpMultiplyAdd              ///< Operator (multiply-add, xor.popc)
-> {
-#if defined(CUTLASS_ARCH_WMMA_SM72_ENABLED)
-  using Shape = Shape_;
-  using ElementA = uint8_t;
-  using LayoutA = LayoutA_;
-  using ElementB = uint8_t;
-  using LayoutB = LayoutB_;
-  using ElementC = int32_t;
-  using LayoutC = LayoutC_;
-  using Operator = cutlass::arch::OpMultiplyAdd;
-  using ArchTag = arch::Sm72;
-
-  // check supported wmma shape for the given multiplicand data types
-  static_assert(
-    platform::is_same<cutlass::gemm::GemmShape<16, 16, 16>, Shape>::value ||
-    platform::is_same<cutlass::gemm::GemmShape< 8, 32, 16>, Shape>::value ||
-    platform::is_same<cutlass::gemm::GemmShape<32,  8, 16>, Shape>::value,
-    "Supported list of wmma operator shape for u8 multiplicands are: 16x16x16, 8x32x16, and 32x8x16");
-
-  // Wmma Fragment
-  using FragmentA = nvcuda::wmma::fragment<
-          nvcuda::wmma::matrix_a,
-          Shape::kM,
-          Shape::kN,
-          Shape::kK,
-          typename CutlassToWmmaDataType<ElementA>::Type,
-          typename CutlassToWmmaLayout<LayoutA>::Layout>;
-
-  using FragmentB = nvcuda::wmma::fragment<
-          nvcuda::wmma::matrix_b,
-          Shape::kM,
-          Shape::kN,
-          Shape::kK,
-          typename CutlassToWmmaDataType<ElementB>::Type,
-          typename CutlassToWmmaLayout<LayoutB>::Layout>;
-
-  using FragmentC = nvcuda::wmma::fragment<
-          nvcuda::wmma::accumulator,
-          Shape::kM,
-          Shape::kN,
-          Shape::kK,
-          typename CutlassToWmmaDataType<ElementC>::Type>;
-  
-  /// Performs a nvcuda::wmma matrix multiply-accumulate operation
-  CUTLASS_DEVICE
-  void operator()(
-    FragmentC &D, 
-    FragmentA const &A, 
-    FragmentB const &B, 
-    FragmentC const &C) const {
-
-      nvcuda::wmma::mma_sync(D, A, B, C);
-  }
-  
-#else
-    static_assert(false, "wmma.mma.sync interger type multiplicands is avialable only for SM72 and beyond");
-#endif
-
-};
-
-} // namespace arch
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/arch/wmma_sm75.h b/lightllm-kernel/cutlass/include/cutlass/arch/wmma_sm75.h
deleted file mode 100755
index 4663e95c7..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/arch/wmma_sm75.h
+++ /dev/null
@@ -1,207 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Matrix multiply
-*/
-
-#pragma once
-
-#if defined(__CUDACC_RTC__)
-#include <cuda/std/cassert>
-#else
-#include <assert.h>
-#endif
-#include "cutlass/layout/matrix.h"
-
-////////////////////////////////////////////////////////////////////////////////
-namespace cutlass {
-namespace arch {
-
-////////////////////////////////////////////////////////////////////////////////
-//
-// WMMA template structure defines nvcuda::wmma::fragments and static assert for
-// wmma native instruction sizes supported for cutlass::int4b_t (experimental::s4).
-//
-////////////////////////////////////////////////////////////////////////////////
-template <
-typename Shape_, 
-typename LayoutA_, 
-typename LayoutB_,
-typename LayoutC_>
-struct Wmma<
-  Shape_,                                   ///< Size of the matrix product (concept: GemmShape)
-  cutlass::int4b_t,                         ///< ElementA
-  LayoutA_,                                 ///< LayoutA
-  cutlass::int4b_t,                         ///< ElementB
-  LayoutB_,                                 ///< LayoutB
-  int32_t,                                  ///< ElementC
-  LayoutC_,                                 ///< LayoutC
-  cutlass::arch::OpMultiplyAdd              ///< Operator (multiply-add, xor.popc)
-> {
-#if defined(CUTLASS_ARCH_WMMA_SM75_ENABLED)
-  using Shape = Shape_;
-  using ElementA = cutlass::int4b_t;
-  using LayoutA = LayoutA_;
-  using ElementB = cutlass::int4b_t;
-  using LayoutB = LayoutB_;
-  using ElementC = int32_t;
-  using LayoutC = LayoutC_;
-  using Operator = cutlass::arch::OpMultiplyAdd;
-  using ArchTag = arch::Sm75;
-
-  // check supported wmma shape for the given multiplicand data types
-  static_assert(
-    platform::is_same<cutlass::gemm::GemmShape<8, 8, 32>, Shape>::value,
-    "Supported list of wmma operator shape for s8 multiplicands is: 8x8x32");
-
-
-  // Wmma Fragment
-  using FragmentA = nvcuda::wmma::fragment<
-          nvcuda::wmma::matrix_a,
-          Shape::kM,
-          Shape::kN,
-          Shape::kK,
-          typename CutlassToWmmaDataType<ElementA>::Type,
-          typename CutlassToWmmaLayout<LayoutA>::Layout>;
-
-  using FragmentB = nvcuda::wmma::fragment<
-          nvcuda::wmma::matrix_b,
-          Shape::kM,
-          Shape::kN,
-          Shape::kK,
-          typename CutlassToWmmaDataType<ElementB>::Type,
-          typename CutlassToWmmaLayout<LayoutB>::Layout>;
-
-  using FragmentC = nvcuda::wmma::fragment<
-          nvcuda::wmma::accumulator,
-          Shape::kM,
-          Shape::kN,
-          Shape::kK,
-          typename CutlassToWmmaDataType<ElementC>::Type>;
-
-  /// Performs a nvcuda::wmma matrix multiply-accumulate operation
-  CUTLASS_DEVICE
-  void operator()(
-    FragmentC &D, 
-    FragmentA const &A, 
-    FragmentB const &B, 
-    FragmentC const &C) const {
-      nvcuda::wmma::mma_sync(D, A, B, C);
-
-  }
-
-#else
-    static_assert(false, "wmma.mma.sync interger type multiplicands is avialable only for SM75 and beyond");
-#endif
-
-};
-
-////////////////////////////////////////////////////////////////////////////////
-//
-// WMMA template structure defines nvcuda::wmma::fragments and static assert for
-// wmma native instruction sizes supported for cutlass::uint1b_t (experimental::b1).
-//
-////////////////////////////////////////////////////////////////////////////////
-template <
-typename Shape_, 
-typename LayoutA_, 
-typename LayoutB_,
-typename LayoutC_>
-struct Wmma<
-  Shape_,                                   ///< Size of the matrix product (concept: GemmShape)
-  cutlass::uint1b_t,                        ///< ElementA
-  LayoutA_,                                 ///< LayoutA
-  cutlass::uint1b_t,                        ///< ElementB
-  LayoutB_,                                 ///< LayoutB
-  int32_t,                                  ///< ElementC
-  LayoutC_,                                 ///< LayoutC
-  cutlass::arch::OpXorPopc                  ///< Operator (multiply-add, xor.popc)
-> {
-#if defined(CUTLASS_ARCH_WMMA_SM75_ENABLED)
-  using Shape = Shape_;
-  using ElementA = cutlass::uint1b_t;
-  using LayoutA = LayoutA_;
-  using ElementB = cutlass::uint1b_t;
-  using LayoutB = LayoutB_;
-  using ElementC = int32_t;
-  using LayoutC = LayoutC_;
-  using Operator = cutlass::arch::OpXorPopc;
-  using ArchTag = arch::Sm75;
-
-  // check supported wmma shape for the given multiplicand data types
-  static_assert(
-    platform::is_same<cutlass::gemm::GemmShape<8, 8, 128>, Shape>::value,
-    "Supported list of wmma operator shape for b1 multiplicands is: 8x8x128");
-
-
-  // Wmma Fragment
-  using FragmentA = nvcuda::wmma::fragment<
-          nvcuda::wmma::matrix_a,
-          Shape::kM,
-          Shape::kN,
-          Shape::kK,
-          typename CutlassToWmmaDataType<ElementA>::Type,
-          typename CutlassToWmmaLayout<LayoutA>::Layout>;
-
-  using FragmentB = nvcuda::wmma::fragment<
-          nvcuda::wmma::matrix_b,
-          Shape::kM,
-          Shape::kN,
-          Shape::kK,
-          typename CutlassToWmmaDataType<ElementB>::Type,
-          typename CutlassToWmmaLayout<LayoutB>::Layout>;
-
-  using FragmentC = nvcuda::wmma::fragment<
-          nvcuda::wmma::accumulator,
-          Shape::kM,
-          Shape::kN,
-          Shape::kK,
-          typename CutlassToWmmaDataType<ElementC>::Type>;
-  
-  /// Performs a nvcuda::wmma matrix multiply-accumulate operation
-  CUTLASS_DEVICE
-  void operator()(
-    FragmentC &D, 
-    FragmentA const &A, 
-    FragmentB const &B, 
-    FragmentC const &C) const {
-      nvcuda::wmma::bmma_sync(D, A, B, C, nvcuda::wmma::experimental::bmmaBitOpXOR, 
-                                          nvcuda::wmma::experimental::bmmaAccumulateOpPOPC);
-  }
-
-#else
-    static_assert(false, "wmma.mma.sync interger type multiplicands is avialable only for SM75 and beyond");
-#endif
-
-};
-
-} // namespace arch
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/array.h b/lightllm-kernel/cutlass/include/cutlass/array.h
deleted file mode 100755
index 62e946949..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/array.h
+++ /dev/null
@@ -1,2614 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Statically sized array of elements that accommodates all CUTLASS-supported numeric types
-           and is safe to use in a union.
-*/
-
-#pragma once
-#include "cutlass/cutlass.h"
-#include "cutlass/functional.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/platform/platform.h"
-namespace cutlass {
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Statically sized array for any data type
-template <
-  typename T,
-  int N,
-  bool RegisterSized = sizeof_bits<T>::value >= 32
->
-struct Array;
-
-namespace detail {
-
-template<class T>
-struct is_Array : platform::false_type {};
-
-template <
-  typename T,
-  int N,
-  bool RegisterSized
->
-struct is_Array<Array<T, N, RegisterSized> > : platform::true_type {};
-
-template<typename T>
-constexpr bool is_Array_v = is_Array<T>::value;
-
-} // namespace detail
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines the size of an Array<> in bits
-template <typename T, int N, bool RegisterSized>
-struct sizeof_bits<Array<T, N, RegisterSized> > {
-  static constexpr int value = sizeof(Array<T, N, RegisterSized>) * 8;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Returns true if the argument is a power of 2
-CUTLASS_HOST_DEVICE
-constexpr bool ispow2(unsigned x) {
-  return x && (!(x & (x - 1)));
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Returns the largest power of two not greater than the argument.
-CUTLASS_HOST_DEVICE
-constexpr unsigned floor_pow_2(unsigned x) {
-  return (x == 0 || ispow2(x)) ? x : ((floor_pow_2(x >> 1)) << 1);
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Statically sized array for any data type
-template <
-  typename T,
-  int N
->
-struct Array<T, N, true> {
-
-  /// Storage type
-  using Storage = T;
-
-  /// Element type
-  using Element = T;
-
-  /// Number of storage elements
-  //static std::size_t const kStorageElements = N;
-  static constexpr size_t kStorageElements = N;
-
-  /// Number of logical elements
-  static constexpr size_t kElements = N;
-
-  //
-  // C++ standard members
-  //
-
-  typedef T value_type;
-  typedef size_t size_type;
-  typedef ptrdiff_t difference_type;
-  typedef value_type &reference;
-  typedef value_type const & const_reference;
-  typedef value_type *pointer;
-  typedef value_type const * const_pointer;
-
-  //
-  // Iterators
-  //
-
-  /// Bidirectional iterator over elements
-  class iterator {
-
-    /// Pointer to object
-    T *ptr_;
-
-  public:
-
-    CUTLASS_HOST_DEVICE
-    iterator(): ptr_(nullptr) { }
-
-    CUTLASS_HOST_DEVICE
-    iterator(T *_ptr): ptr_(_ptr) { }
-
-    CUTLASS_HOST_DEVICE
-    iterator &operator++() {
-      ++ptr_;
-      return *this;
-    }
-
-    CUTLASS_HOST_DEVICE
-    iterator &operator--() {
-      --ptr_;
-      return *this;
-    }
-
-    CUTLASS_HOST_DEVICE
-    iterator operator++(int) {
-      iterator ret(*this);
-      ++ptr_;
-      return ret;
-    }
-
-    CUTLASS_HOST_DEVICE
-    iterator operator--(int) {
-      iterator ret(*this);
-      --ptr_;
-      return ret;
-    }
-
-    CUTLASS_HOST_DEVICE
-    T &operator*() const {
-      return *ptr_;
-    }
-
-    CUTLASS_HOST_DEVICE
-    bool operator==(iterator const &other) const {
-      return ptr_ == other.ptr_;
-    }
-
-    CUTLASS_HOST_DEVICE
-    bool operator!=(iterator const &other) const {
-      return ptr_ != other.ptr_;
-    }
-  };
-
-  /// Bidirectional constant iterator over elements
-  class const_iterator {
-
-    /// Pointer to object
-    const T *ptr_;
-
-  public:
-
-    CUTLASS_HOST_DEVICE
-    const_iterator(): ptr_(nullptr) { }
-
-    CUTLASS_HOST_DEVICE
-    const_iterator(T const *_ptr): ptr_(_ptr) { }
-
-    CUTLASS_HOST_DEVICE
-    const_iterator &operator++() {
-      ++ptr_;
-      return *this;
-    }
-
-    CUTLASS_HOST_DEVICE
-    const_iterator &operator--() {
-      --ptr_;
-      return *this;
-    }
-
-    CUTLASS_HOST_DEVICE
-    const_iterator operator++(int) {
-      const_iterator ret(*this);
-      ++ptr_;
-      return ret;
-    }
-
-    CUTLASS_HOST_DEVICE
-    const_iterator operator--(int) {
-      const_iterator ret(*this);
-      --ptr_;
-      return ret;
-    }
-
-    CUTLASS_HOST_DEVICE
-    T const &operator*() const {
-      return *ptr_;
-    }
-
-    CUTLASS_HOST_DEVICE
-    bool operator==(const_iterator const &other) const {
-      return ptr_ == other.ptr_;
-    }
-
-    CUTLASS_HOST_DEVICE
-    bool operator!=(const_iterator const &other) const {
-      return ptr_ != other.ptr_;
-    }
-  };
-
-  /// Bidirectional iterator over elements
-  class reverse_iterator {
-
-    /// Pointer to object
-    T *ptr_;
-
-  public:
-
-    CUTLASS_HOST_DEVICE
-    reverse_iterator(): ptr_(nullptr) { }
-
-    CUTLASS_HOST_DEVICE
-    reverse_iterator(T *_ptr): ptr_(_ptr) { }
-
-    CUTLASS_HOST_DEVICE
-    reverse_iterator &operator++() {
-      --ptr_;
-      return *this;
-    }
-
-    CUTLASS_HOST_DEVICE
-    reverse_iterator &operator--() {
-      ++ptr_;
-      return *this;
-    }
-
-    CUTLASS_HOST_DEVICE
-    reverse_iterator operator++(int) {
-      iterator ret(*this);
-      --ptr_;
-      return ret;
-    }
-
-    CUTLASS_HOST_DEVICE
-    reverse_iterator operator--(int) {
-      iterator ret(*this);
-      ++ptr_;
-      return ret;
-    }
-
-    CUTLASS_HOST_DEVICE
-    T &operator*() const {
-      return *(ptr_ - 1);
-    }
-
-    CUTLASS_HOST_DEVICE
-    bool operator==(reverse_iterator const &other) const {
-      return ptr_ == other.ptr_;
-    }
-
-    CUTLASS_HOST_DEVICE
-    bool operator!=(reverse_iterator const &other) const {
-      return ptr_ != other.ptr_;
-    }
-  };
-
-  /// Bidirectional constant iterator over elements
-  class const_reverse_iterator {
-
-    /// Pointer to object
-    T const *ptr_;
-
-  public:
-
-    CUTLASS_HOST_DEVICE
-    const_reverse_iterator(): ptr_(nullptr) { }
-
-    CUTLASS_HOST_DEVICE
-    const_reverse_iterator(T const *_ptr): ptr_(_ptr) { }
-
-    CUTLASS_HOST_DEVICE
-    const_reverse_iterator &operator++() {
-      --ptr_;
-      return *this;
-    }
-
-    CUTLASS_HOST_DEVICE
-    const_reverse_iterator &operator--() {
-      ++ptr_;
-      return *this;
-    }
-
-    CUTLASS_HOST_DEVICE
-    const_reverse_iterator operator++(int) {
-      const_reverse_iterator ret(*this);
-      --ptr_;
-      return ret;
-    }
-
-    CUTLASS_HOST_DEVICE
-    const_reverse_iterator operator--(int) {
-      const_reverse_iterator ret(*this);
-      ++ptr_;
-      return ret;
-    }
-
-    CUTLASS_HOST_DEVICE
-    T const &operator*() const {
-      return *(ptr_ - 1);
-    }
-
-    CUTLASS_HOST_DEVICE
-    bool operator==(const_iterator const &other) const {
-      return ptr_ == other.ptr_;
-    }
-
-    CUTLASS_HOST_DEVICE
-    bool operator!=(const_iterator const &other) const {
-      return ptr_ != other.ptr_;
-    }
-  };
-
-  /// Internal storage
-  Storage storage[kElements];
-
-  /// Efficient clear method
-  CUTLASS_HOST_DEVICE
-  void clear() {
-    fill(T(0));
-  }
-
-  CUTLASS_HOST_DEVICE
-  reference at(size_type pos) {
-    return reinterpret_cast<reference>(storage[pos]);
-  }
-
-  CUTLASS_HOST_DEVICE
-  const_reference at(size_type pos) const {
-    return reinterpret_cast<const_reference>(storage[pos]);
-  }
-
-  CUTLASS_HOST_DEVICE
-  reference operator[](size_type pos) {
-    return reinterpret_cast<reference>(storage[pos]);
-  }
-
-  CUTLASS_HOST_DEVICE
-  const_reference operator[](size_type pos) const {
-    return reinterpret_cast<const_reference>(storage[pos]);
-  }
-
-  CUTLASS_HOST_DEVICE
-  reference front() {
-    return reinterpret_cast<reference>(storage[0]);
-  }
-
-  CUTLASS_HOST_DEVICE
-  const_reference front() const {
-    return reinterpret_cast<const_reference>(storage[0]);
-  }
-
-  CUTLASS_HOST_DEVICE
-  reference back() {
-    return reinterpret_cast<reference>(storage[kStorageElements - 1]);
-  }
-
-  CUTLASS_HOST_DEVICE
-  const_reference back() const {
-    return reinterpret_cast<const_reference>(storage[kStorageElements - 1]);
-  }
-
-  CUTLASS_HOST_DEVICE
-  pointer data() {
-    return reinterpret_cast<pointer>(storage);
-  }
-
-  CUTLASS_HOST_DEVICE
-  const_pointer data() const {
-    return reinterpret_cast<const_pointer>(storage);
-  }
-  
-  CUTLASS_HOST_DEVICE
-  pointer raw_data() {
-    return reinterpret_cast<pointer>(storage);
-  }
-
-  CUTLASS_HOST_DEVICE
-  const_pointer raw_data() const {
-    return reinterpret_cast<const_pointer>(storage);
-  }
-
-
-  CUTLASS_HOST_DEVICE
-  constexpr bool empty() const {
-    return !kElements;
-  }
-
-  CUTLASS_HOST_DEVICE
-  constexpr size_type size() const {
-    return kElements;
-  }
-
-  CUTLASS_HOST_DEVICE
-  constexpr size_type max_size() const {
-    return kElements;
-  }
-
-  CUTLASS_HOST_DEVICE
-  void fill(T const &value) {
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < int(kElements); ++i) {
-      storage[i] = static_cast<Storage>(value);
-    }
-  }
-
-  CUTLASS_HOST_DEVICE
-  iterator begin() {
-    return iterator(storage);
-  }
-
-  CUTLASS_HOST_DEVICE
-  const_iterator begin() const {
-    return cbegin();
-  }
-
-  CUTLASS_HOST_DEVICE
-  const_iterator cbegin() const {
-    return const_iterator(storage);
-  }
-
-  CUTLASS_HOST_DEVICE
-  iterator end() {
-    return iterator(reinterpret_cast<pointer>(storage + kStorageElements));
-  }
-
-  CUTLASS_HOST_DEVICE
-  const_iterator end() const {
-    return cend();
-  }
-
-  CUTLASS_HOST_DEVICE
-  const_iterator cend() const {
-    return const_iterator(reinterpret_cast<const_pointer>(storage + kStorageElements));
-  }
-
-  CUTLASS_HOST_DEVICE
-  reverse_iterator rbegin() {
-    return reverse_iterator(reinterpret_cast<pointer>(storage + kStorageElements));
-  }
-
-  CUTLASS_HOST_DEVICE
-  const_reverse_iterator rbegin() const {
-    return crbegin();
-  }
-
-  CUTLASS_HOST_DEVICE
-  const_reverse_iterator crbegin() const {
-    return const_reverse_iterator(reinterpret_cast<const_pointer>(storage + kStorageElements));
-  }
-
-  CUTLASS_HOST_DEVICE
-  reverse_iterator rend() {
-    return reverse_iterator(reinterpret_cast<pointer>(storage));
-  }
-
-  CUTLASS_HOST_DEVICE
-  const_reverse_iterator rend() const {
-    return crend();
-  }
-
-  CUTLASS_HOST_DEVICE
-  const_reverse_iterator crend() const {
-    return const_reverse_iterator(reinterpret_cast<const_pointer>(storage));
-  }
-
-  //
-  // Comparison operators
-  //
-
-};
-
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Factories
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename Element>
-CUTLASS_HOST_DEVICE
-Array<Element, 1> make_Array(Element x) {
-  return {x};
-}
-
-template <typename Element>
-CUTLASS_HOST_DEVICE
-Array<Element, 2> make_Array(Element x, Element y) {
-  return {x,y};
-}
-
-template <typename Element>
-CUTLASS_HOST_DEVICE
-Array<Element, 3> make_Array(Element x, Element y, Element z) {
-  return {x,y,z};
-}
-
-template <typename Element>
-CUTLASS_HOST_DEVICE
-Array<Element, 4> make_Array(Element x, Element y, Element z, Element w) {
-  return {x,y,z,w};
-}
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// functional.h numeric specializations
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename T, int N>
-struct absolute_value_op< Array<T, N> > {
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const &lhs) const {
-
-    Array<T, N> result;
-    absolute_value_op<T> scalar_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = scalar_op(lhs[i]);
-    }
-
-    return result;
-  }
-};
-
-template <typename T, int N>
-struct plus<Array<T, N>> {
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const &lhs, Array<T, N> const &rhs) const {
-
-    Array<T, N> result;
-    plus<T> scalar_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = scalar_op(lhs[i], rhs[i]);
-    }
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const &lhs, T const &scalar) const {
-
-    Array<T, N> result;
-    plus<T> scalar_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = scalar_op(lhs[i], scalar);
-    }
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()( T const &scalar, Array<T, N> const &rhs) const {
-
-    Array<T, N> result;
-    plus<T> scalar_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = scalar_op(scalar, rhs[i]);
-    }
-
-    return result;
-  }
-};
-template <typename T, int N>
-struct minus<Array<T, N>> {
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const &lhs, Array<T, N> const &rhs) const {
-
-    Array<T, N> result;
-    minus<T> scalar_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = scalar_op(lhs[i], rhs[i]);
-    }
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const &lhs, T const &scalar) const {
-
-    Array<T, N> result;
-    minus<T> scalar_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = scalar_op(lhs[i], scalar);
-    }
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()( T const &scalar, Array<T, N> const &rhs) const {
-
-    Array<T, N> result;
-    minus<T> scalar_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = scalar_op(scalar, rhs[i]);
-    }
-
-    return result;
-  }
-};
-
-template <typename T, int N>
-struct multiplies<Array<T, N>> {
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const &lhs, Array<T, N> const &rhs) const {
-
-    Array<T, N> result;
-    multiplies<T> scalar_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = scalar_op(lhs[i], rhs[i]);
-    }
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const &lhs, T const &scalar) const {
-
-    Array<T, N> result;
-    multiplies<T> scalar_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = scalar_op(lhs[i], scalar);
-    }
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()( T const &scalar, Array<T, N> const &rhs) const {
-
-    Array<T, N> result;
-    multiplies<T> scalar_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = scalar_op(scalar, rhs[i]);
-    }
-
-    return result;
-  }
-};
-
-template <typename T, int N, bool PropogateNaN>
-struct maximum_absolute_value_reduction<Array<T, N>, PropogateNaN> {
-
-  CUTLASS_HOST_DEVICE
-  T operator() (T const& scalar, Array<T, N> const& rhs) const {
-
-    T result = scalar;
-    maximum_absolute_value_reduction<T, PropogateNaN> scalar_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result = scalar_op(result, rhs[i]);
-    }
-
-    return result;
-  }
-};
-
-template <typename T, int N>
-struct scale<Array<T, N>> {
-  T const scaling_factor_;
-
-  CUTLASS_HOST_DEVICE
-  scale(T scaling_factor) : scaling_factor_(scaling_factor) {
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const & rhs) const {
-    Array<T, N> result;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = rhs[i] * scaling_factor_;
-    }
-
-    return result;
-  }
-};
-
-template <typename T, int N>
-struct divides<Array<T, N>> {
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const &lhs, Array<T, N> const &rhs) const {
-
-    Array<T, N> result;
-    divides<T> scalar_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = scalar_op(lhs[i], rhs[i]);
-    }
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const &lhs, T const &scalar) const {
-
-    Array<T, N> result;
-    divides<T> scalar_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = scalar_op(lhs[i], scalar);
-    }
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()( T const &scalar, Array<T, N> const &rhs) const {
-
-    Array<T, N> result;
-    divides<T> scalar_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = scalar_op(scalar, rhs[i]);
-    }
-
-    return result;
-  }
-};
-
-template <typename T, int N>
-struct reciprocal_approximate<Array<T, N>> {
-  
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const &lhs) const {
-
-    Array<T, N> result;
-    reciprocal_approximate<T> scalar_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = scalar_op(lhs[i]);
-    }
-
-    return result;
-  }
-};
-
-template <typename T, int N>
-struct reciprocal_approximate_ftz<Array<T, N>> {
-  
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const &lhs) const {
-
-    Array<T, N> result;
-    reciprocal_approximate_ftz<T> scalar_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = scalar_op(lhs[i]);
-    }
-
-    return result;
-  }
-};
-
-template <typename T, int N, bool PropagateNaN>
-struct maximum<Array<T, N>, PropagateNaN> {
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const &lhs, Array<T, N> const &rhs) const {
-
-    Array<T, N> result;
-    maximum<T, PropagateNaN> scalar_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = scalar_op(lhs[i], rhs[i]);
-    }
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const &lhs, T const &scalar) const {
-
-    Array<T, N> result;
-    maximum<T, PropagateNaN> scalar_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = scalar_op(lhs[i], scalar);
-    }
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(T const &scalar, Array<T, N> const &rhs) const {
-
-    Array<T, N> result;
-    maximum<T, PropagateNaN> scalar_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = scalar_op(scalar, rhs[i]);
-    }
-
-    return result;
-  }
-};
-
-template <typename T, int N, bool PropagateNaN>
-struct minimum<Array<T, N>, PropagateNaN> {
-
-  CUTLASS_HOST_DEVICE
-  static T scalar_op(T const &lhs, T const &rhs) {
-    return (rhs < lhs ? rhs : lhs);
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const &lhs, Array<T, N> const &rhs) const {
-
-    Array<T, N> result;
-    minimum<T, PropagateNaN> scalar_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = scalar_op(lhs[i], rhs[i]);
-    }
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const &lhs, T const &scalar) const {
-
-    Array<T, N> result;
-    minimum<T, PropagateNaN> scalar_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = scalar_op(lhs[i], scalar);
-    }
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(T const &scalar, Array<T, N> const &rhs) const {
-
-    Array<T, N> result;
-    minimum<T, PropagateNaN> scalar_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = scalar_op(scalar, rhs[i]);
-    }
-
-    return result;
-  }
-};
-
-template <typename T, int N>
-struct minimum_with_nan_propagation<Array<T, N>> : minimum<Array<T, N>, true> 
-{};
-
-template <typename T, int N>
-struct negate<Array<T, N>> {
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const &lhs) const {
-
-    Array<T, N> result;
-    negate<T> scalar_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = scalar_op(lhs[i]);
-    }
-
-    return result;
-  }
-};
-
-/// Fused multiply-add
-template <typename T, int N>
-struct multiply_add<Array<T, N>, Array<T, N>, Array<T, N>> {
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const &a, Array<T, N> const &b, Array<T, N> const &c) const {
-
-    Array<T, N> result;
-    multiply_add<T> scalar_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = scalar_op(a[i], b[i], c[i]);
-    }
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const &a, T const &scalar, Array<T, N> const &c) const {
-
-    Array<T, N> result;
-    multiply_add<T> scalar_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = scalar_op(a[i], scalar, c[i]);
-    }
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(T const &scalar, Array<T, N> const &b, Array<T, N> const &c) const {
-
-    Array<T, N> result;
-    multiply_add<T> scalar_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = scalar_op(scalar, b[i], c[i]);
-    }
-
-    return result;
-  }
-};
-
-/// Fused square-and-plus
-template <typename T, int N>
-struct square_and_plus<Array<T, N>> {
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const &lhs, Array<T, N> const &rhs) const {
-    multiply_add<Array<T, N>, Array<T, N>, Array<T, N>> ma_op;
-    return ma_op(rhs, rhs, lhs);
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const &lhs, T const &rhs) const {
-    plus<Array<T, N>> plus_op;
-    multiplies<T> multiplies_op;
-    return plus_op(multiplies_op(rhs, rhs), lhs);
-  }
-};
-
-/// Inverse-square-root
-template <typename T, int N>
-struct inverse_square_root<Array<T, N>> {
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const &a) const {
-    Array<T, N> result;
-    inverse_square_root<T> scalar_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = scalar_op(a[i]);
-    }
-    return result;
-  }
-};
-
-template <int N>
-struct inverse_square_root<Array<half_t, N>> {
-  CUTLASS_HOST_DEVICE
-  Array<half_t, N> operator()(Array<half_t, N> const & a) const {
-    Array<half_t, N> result;
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-
-    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
-    __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 2; ++i) {
-      result_ptr[i] = h2rsqrt(a_ptr[i]);
-    }
-
-    if constexpr (N % 2) {
-      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a);
-      __half d_residual = hrsqrt(a_residual_ptr[N - 1]);
-      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
-    }
-
-    #else
-
-    inverse_square_root<half_t> scalar_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = scalar_op(a[i]);
-    }
-
-    #endif
-
-    return result;
-  }
-};
-
-/// Fused multiply-add-relu0
-template <typename T, int N>
-struct multiply_add_relu0<Array<T, N>, Array<T, N>, Array<T, N>> {
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const &a, Array<T, N> const &b, Array<T, N> const &c) const {
-
-    Array<T, N> result;
-    multiply_add<T> scalar_op;
-    maximum<T> mx;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = mx(scalar_op(a[i], b[i], c[i]), T(0));
-    }
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const &a, T const &scalar, Array<T, N> const &c) const {
-
-    Array<T, N> result;
-    multiply_add<T> scalar_op;
-    maximum<T> mx;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = mx(scalar_op(a[i], scalar, c[i]), T(0));
-    }
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(T const &scalar, Array<T, N> const &b, Array<T, N> const &c) const {
-
-    Array<T, N> result;
-    multiply_add<T> scalar_op;
-    maximum<T> mx;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = mx(scalar_op(scalar, b[i], c[i]), T(0));
-    }
-
-    return result;
-  }
-};
-
-
-template <typename T, int N>
-struct conjugate<Array<T, N> >  {
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const &a) const {
-
-    conjugate<T> conj_op;
-
-    Array<T, N> ca;
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      ca[i] = conj_op(a[i]);
-    }
-    return ca;
-  }
-};
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// functional.h numeric specializations targeting SIMD instructions in device code.
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <int N>
-struct plus<Array<half_t, N>> {
-  CUTLASS_HOST_DEVICE
-  Array<half_t, N> operator()(Array<half_t, N> const & lhs, Array<half_t, N> const &rhs) const {
-    Array<half_t, N> result;
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-
-    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
-    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
-    __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 2; ++i) {
-      result_ptr[i] = __hadd2(lhs_ptr[i], rhs_ptr[i]);
-    }
-
-    if constexpr (N % 2) {
-      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
-      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
-      __half d_residual = __hadd(a_residual_ptr[N - 1], b_residual_ptr[N - 1]);
-
-      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
-    }
-
-    #else
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = lhs[i] + rhs[i];
-    }
-    #endif
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<half_t, N> operator()(half_t const & lhs, Array<half_t, N> const &rhs) const {
-    Array<half_t, N> result;
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-
-    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
-    __half2 lhs_pair = __half2half2(reinterpret_cast<__half const &>(lhs));
-    __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 2; ++i) {
-      result_ptr[i] = __hadd2(lhs_pair, rhs_ptr[i]);
-    }
-
-    if constexpr (N % 2) {
-      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
-      __half d_residual = __hadd(reinterpret_cast<__half const &>(lhs), b_residual_ptr[N - 1]);
-
-      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
-    }
-
-    #else
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = lhs + rhs[i];
-    }
-    #endif
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<half_t, N> operator()(Array<half_t, N> const & lhs, half_t const &rhs) const {
-    Array<half_t, N> result;
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-
-    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
-    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
-    __half2 rhs_pair = __half2half2(reinterpret_cast<__half const &>(rhs));
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 2; ++i) {
-      result_ptr[i] = __hadd2(lhs_ptr[i], rhs_pair);
-    }
-
-    if constexpr (N % 2) {
-      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
-      __half d_residual = __hadd(a_residual_ptr[N - 1], reinterpret_cast<__half const &>(rhs));
-
-      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
-    }
-
-    #else
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = lhs[i] + rhs;
-    }
-    #endif
-
-    return result;
-  }
-};
-
-template <int N>
-struct minus<Array<half_t, N>> {
-  CUTLASS_HOST_DEVICE
-  Array<half_t, N> operator()(Array<half_t, N> const & lhs, Array<half_t, N> const &rhs) const {
-    Array<half_t, N> result;
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-
-    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
-    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
-    __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 2; ++i) {
-      result_ptr[i] = __hsub2(lhs_ptr[i], rhs_ptr[i]);
-    }
-
-    if constexpr (N % 2) {
-      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
-      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
-      __half d_residual = __hsub(a_residual_ptr[N - 1], b_residual_ptr[N - 1]);
-
-      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
-    }
-
-    #else
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = lhs[i] - rhs[i];
-    }
-    #endif
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<half_t, N> operator()(half_t const & lhs, Array<half_t, N> const &rhs) const {
-    Array<half_t, N> result;
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-
-    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
-    __half2 lhs_pair = __half2half2(reinterpret_cast<__half const &>(lhs));
-    __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 2; ++i) {
-      result_ptr[i] = __hsub2(lhs_pair, rhs_ptr[i]);
-    }
-
-    if constexpr (N % 2) {
-      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
-      __half d_residual = __hsub(reinterpret_cast<__half const &>(lhs), b_residual_ptr[N - 1]);
-
-      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
-    }
-
-    #else
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = lhs - rhs[i];
-    }
-    #endif
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<half_t, N> operator()(Array<half_t, N> const & lhs, half_t const &rhs) const {
-    Array<half_t, N> result;
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-
-    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
-    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
-    __half2 rhs_pair = __half2half2(reinterpret_cast<__half const &>(rhs));
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 2; ++i) {
-      result_ptr[i] = __hsub2(lhs_ptr[i], rhs_pair);
-    }
-
-    if constexpr (N % 2) {
-      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
-      __half d_residual = __hsub(a_residual_ptr[N - 1], reinterpret_cast<__half const &>(rhs));
-
-      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
-    }
-
-    #else
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = lhs[i] - rhs;
-    }
-    #endif
-
-    return result;
-  }
-};
-
-template <int N>
-struct multiplies<Array<half_t, N>> {
-  CUTLASS_HOST_DEVICE
-  Array<half_t, N> operator()(Array<half_t, N> const & lhs, Array<half_t, N> const &rhs) const {
-    Array<half_t, N> result;
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-
-    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
-    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
-    __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 2; ++i) {
-      result_ptr[i] = __hmul2(lhs_ptr[i], rhs_ptr[i]);
-    }
-
-    if constexpr (N % 2) {
-      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
-      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
-      __half d_residual = __hmul(a_residual_ptr[N - 1], b_residual_ptr[N - 1]);
-
-      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
-    }
-
-    #else
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = lhs[i] * rhs[i];
-    }
-    #endif
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<half_t, N> operator()(half_t const & lhs, Array<half_t, N> const &rhs) const {
-    Array<half_t, N> result;
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-
-    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
-    __half2 lhs_pair = __half2half2(reinterpret_cast<__half const &>(lhs));
-    __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 2; ++i) {
-      result_ptr[i] = __hmul2(lhs_pair, rhs_ptr[i]);
-    }
-
-    if constexpr (N % 2) {
-      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
-
-      __half d_residual = __hmul(
-        reinterpret_cast<__half const &>(lhs),
-        b_residual_ptr[N - 1]);
-
-      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
-    }
-
-    #else
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = lhs * rhs[i];
-    }
-    #endif
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<half_t, N> operator()(Array<half_t, N> const & lhs, half_t const &rhs) const {
-    Array<half_t, N> result;
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-
-    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
-    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
-    __half2 rhs_pair = __half2half2(reinterpret_cast<__half const &>(rhs));
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 2; ++i) {
-      result_ptr[i] = __hmul2(lhs_ptr[i], rhs_pair);
-    }
-
-    if constexpr (N % 2) {
-      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
-
-      __half d_residual = __hmul(
-        a_residual_ptr[N - 1],
-        reinterpret_cast<__half const &>(rhs));
-
-      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
-    }
-
-    #else
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = lhs[i] * rhs;
-    }
-    #endif
-
-    return result;
-  }
-};
-
-template <int N>
-struct divides<Array<half_t, N>> {
-  CUTLASS_HOST_DEVICE
-  Array<half_t, N> operator()(Array<half_t, N> const & lhs, Array<half_t, N> const &rhs) const {
-    Array<half_t, N> result;
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-
-    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
-    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
-    __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 2; ++i) {
-      result_ptr[i] = __h2div(lhs_ptr[i], rhs_ptr[i]);
-    }
-
-    if constexpr (N % 2) {
-      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
-      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
-
-      __half d_residual = __hdiv(
-        a_residual_ptr[N - 1],
-        b_residual_ptr[N - 1]);
-
-      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
-    }
-
-    #else
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = lhs[i] / rhs[i];
-    }
-    #endif
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<half_t, N> operator()(half_t const & lhs, Array<half_t, N> const &rhs) const {
-    Array<half_t, N> result;
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-
-    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
-    __half2 lhs_pair = __half2half2(reinterpret_cast<__half const &>(lhs));
-    __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 2; ++i) {
-      result_ptr[i] = __h2div(lhs_pair, rhs_ptr[i]);
-    }
-
-    if constexpr (N % 2) {
-      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
-
-      __half d_residual = __hdiv(
-        reinterpret_cast<__half const &>(lhs),
-        b_residual_ptr[N - 1]);
-
-      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
-    }
-
-    #else
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = lhs / rhs[i];
-    }
-    #endif
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<half_t, N> operator()(Array<half_t, N> const & lhs, half_t const &rhs) const {
-    Array<half_t, N> result;
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-
-    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
-    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
-    __half2 rhs_pair = __half2half2(reinterpret_cast<__half const &>(rhs));
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 2; ++i) {
-      result_ptr[i] = __h2div(lhs_ptr[i], rhs_pair);
-    }
-
-    if constexpr (N % 2) {
-      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
-
-      __half d_residual = __hdiv(
-        a_residual_ptr[N - 1],
-        reinterpret_cast<__half const &>(rhs));
-
-      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
-    }
-
-    #else
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = lhs[i] / rhs;
-    }
-    #endif
-
-    return result;
-  }
-};
-
-template <int N>
-struct negate<Array<half_t, N>> {
-  CUTLASS_HOST_DEVICE
-  Array<half_t, N> operator()(Array<half_t, N> const & lhs) const {
-    Array<half_t, N> result;
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-
-    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
-    __half2 const *source_ptr = reinterpret_cast<__half2 const *>(&lhs);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 2; ++i) {
-      result_ptr[i] = __hneg2(source_ptr[i]);
-    }
-
-    if constexpr (N % 2) {
-      half_t x = -lhs[N - 1];
-      __half lhs_val = reinterpret_cast<__half const &>(x);
-      result[N - 1] = reinterpret_cast<half_t const &>(lhs_val);
-    }
-
-    #else
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = -lhs[i];
-    }
-    #endif
-
-    return result;
-  }
-};
-
-/// Fused multiply-add
-template <int N>
-struct multiply_add<Array<half_t, N>, Array<half_t, N>, Array<half_t, N>> {
-
-  CUTLASS_HOST_DEVICE
-  Array<half_t, N> operator()(
-    Array<half_t, N> const &a,
-    Array<half_t, N> const &b,
-    Array<half_t, N> const &c) const {
-
-    Array<half_t, N> result;
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-
-    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
-    __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a);
-    __half2 const *b_ptr = reinterpret_cast<__half2 const *>(&b);
-    __half2 const *c_ptr = reinterpret_cast<__half2 const *>(&c);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 2; ++i) {
-      result_ptr[i] = __hfma2(a_ptr[i], b_ptr[i], c_ptr[i]);
-    }
-
-    if constexpr (N % 2) {
-
-      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a);
-      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&b);
-      __half const *c_residual_ptr = reinterpret_cast<__half const *>(&c);
-
-      __half d_residual = __hfma(
-        a_residual_ptr[N - 1],
-        b_residual_ptr[N - 1],
-        c_residual_ptr[N - 1]);
-
-      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
-    }
-
-    #else
-
-    multiply_add<half_t> op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = op(a[i], b[i], c[i]);
-    }
-    #endif
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<half_t, N> operator()(
-    half_t const &a,
-    Array<half_t, N> const &b,
-    Array<half_t, N> const &c) const {
-
-    Array<half_t, N> result;
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-
-    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
-    __half2 a_pair = __half2half2(reinterpret_cast<__half const &>(a));
-    __half2 const *b_ptr = reinterpret_cast<__half2 const *>(&b);
-    __half2 const *c_ptr = reinterpret_cast<__half2 const *>(&c);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 2; ++i) {
-      result_ptr[i] = __hfma2(a_pair, b_ptr[i], c_ptr[i]);
-    }
-
-    if constexpr (N % 2) {
-
-      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&b);
-      __half const *c_residual_ptr = reinterpret_cast<__half const *>(&c);
-      __half d_residual = __hfma(
-        reinterpret_cast<__half const &>(a),
-        b_residual_ptr[N - 1],
-        c_residual_ptr[N - 1]);
-
-      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
-    }
-
-    #else
-
-    multiply_add<half_t> op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = op(a, b[i], c[i]);
-    }
-    #endif
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<half_t, N> operator()(
-    Array<half_t, N> const &a,
-    half_t const &b,
-    Array<half_t, N> const &c) const {
-
-    Array<half_t, N> result;
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-
-    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
-    __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a);
-    __half2 b_pair = __half2half2(reinterpret_cast<__half const &>(b));
-    __half2 const *c_ptr = reinterpret_cast<__half2 const *>(&c);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 2; ++i) {
-      result_ptr[i] = __hfma2(a_ptr[i], b_pair, c_ptr[i]);
-    }
-
-    if constexpr (N % 2) {
-
-      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a);
-      __half const *c_residual_ptr = reinterpret_cast<__half const *>(&c);
-
-      __half d_residual = __hfma(
-        a_residual_ptr[N - 1],
-        reinterpret_cast<__half const &>(b),
-        c_residual_ptr[N - 1]);
-
-      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
-    }
-
-    #else
-
-    multiply_add<half_t> op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = op(a[i], b, c[i]);
-    }
-    #endif
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<half_t, N> operator()(
-    Array<half_t, N> const &a,
-    Array<half_t, N> const &b,
-    half_t const &c) const {
-
-    Array<half_t, N> result;
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-
-    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
-    __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a);
-    __half2 const *b_ptr = reinterpret_cast<__half2 const *>(&b);
-    __half2 c_pair = __half2half2(reinterpret_cast<__half const &>(c));
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 2; ++i) {
-      result_ptr[i] = __hfma2(a_ptr[i], b_ptr[i], c_pair);
-    }
-
-    if constexpr (N % 2) {
-
-      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a);
-      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&b);
-
-      __half d_residual = __hfma(
-        a_residual_ptr[N - 1],
-        b_residual_ptr[N - 1],
-        reinterpret_cast<__half const &>(c));
-
-      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
-    }
-
-    #else
-
-    multiply_add<half_t> op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = op(a[i], b[i], c);
-    }
-    #endif
-
-    return result;
-  }
-};
-
-/// Fused multiply-add-relu0
-template <int N>
-struct multiply_add_relu0<Array<half_t, N>, Array<half_t, N>, Array<half_t, N>> {
-
-  CUTLASS_HOST_DEVICE
-  Array<half_t, N> operator()(
-    Array<half_t, N> const &a,
-    Array<half_t, N> const &b,
-    Array<half_t, N> const &c) const {
-
-    Array<half_t, N> result;
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-
-    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
-    __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a);
-    __half2 const *b_ptr = reinterpret_cast<__half2 const *>(&b);
-    __half2 const *c_ptr = reinterpret_cast<__half2 const *>(&c);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 2; ++i) {
-      result_ptr[i] = __hfma2_relu(a_ptr[i], b_ptr[i], c_ptr[i]);
-    }
-
-    if constexpr (N % 2) {
-
-      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a);
-      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&b);
-      __half const *c_residual_ptr = reinterpret_cast<__half const *>(&c);
-
-      __half d_residual = __hfma_relu(
-        a_residual_ptr[N - 1],
-        b_residual_ptr[N - 1],
-        c_residual_ptr[N - 1]);
-
-      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
-    }
-
-    #else
-
-    multiply_add<half_t> op;
-    maximum<half_t> mx;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = mx(op(a[i], b[i], c[i]), (half_t)0);
-    }
-    #endif
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<half_t, N> operator()(
-    half_t const &a,
-    Array<half_t, N> const &b,
-    Array<half_t, N> const &c) const {
-
-    Array<half_t, N> result;
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-
-    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
-    __half2 a_pair = __half2half2(reinterpret_cast<__half const &>(a));
-    __half2 const *b_ptr = reinterpret_cast<__half2 const *>(&b);
-    __half2 const *c_ptr = reinterpret_cast<__half2 const *>(&c);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 2; ++i) {
-      result_ptr[i] = __hfma2_relu(a_pair, b_ptr[i], c_ptr[i]);
-    }
-
-    if constexpr (N % 2) {
-
-      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&b);
-      __half const *c_residual_ptr = reinterpret_cast<__half const *>(&c);
-      __half d_residual = __hfma_relu(
-        reinterpret_cast<__half const &>(a),
-        b_residual_ptr[N - 1],
-        c_residual_ptr[N - 1]);
-
-      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
-    }
-
-    #else
-
-    multiply_add<half_t> op;
-    maximum<half_t> mx;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = mx(op(a, b[i], c[i]), half_t(0));
-    }
-    #endif
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<half_t, N> operator()(
-    Array<half_t, N> const &a,
-    half_t const &b,
-    Array<half_t, N> const &c) const {
-
-    Array<half_t, N> result;
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-
-    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
-    __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a);
-    __half2 b_pair = __half2half2(reinterpret_cast<__half const &>(b));
-    __half2 const *c_ptr = reinterpret_cast<__half2 const *>(&c);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 2; ++i) {
-      result_ptr[i] = __hfma2_relu(a_ptr[i], b_pair, c_ptr[i]);
-    }
-
-    if constexpr (N % 2) {
-
-      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a);
-      __half const *c_residual_ptr = reinterpret_cast<__half const *>(&c);
-
-      __half d_residual = __hfma_relu(
-        a_residual_ptr[N - 1],
-        reinterpret_cast<__half const &>(b),
-        c_residual_ptr[N - 1]);
-
-      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
-    }
-
-    #else
-
-    multiply_add<half_t> op;
-    maximum<half_t> mx;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = mx(op(a[i], b, c[i]), half_t(0));
-    }
-    #endif
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<half_t, N> operator()(
-    Array<half_t, N> const &a,
-    Array<half_t, N> const &b,
-    half_t const &c) const {
-
-    Array<half_t, N> result;
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-
-    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
-    __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a);
-    __half2 const *b_ptr = reinterpret_cast<__half2 const *>(&b);
-    __half2 c_pair = __half2half2(reinterpret_cast<__half const &>(c));
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 2; ++i) {
-      result_ptr[i] = __hfma2_relu(a_ptr[i], b_ptr[i], c_pair);
-    }
-
-    if constexpr (N % 2) {
-
-      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a);
-      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&b);
-
-      __half d_residual = __hfma_relu(
-        a_residual_ptr[N - 1],
-        b_residual_ptr[N - 1],
-        reinterpret_cast<__half const &>(c));
-
-      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
-    }
-
-    #else
-
-    multiply_add<half_t> op;
-    maximum<half_t> mx;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = mx(op(a[i], b[i], c), half_t(0));
-    }
-    #endif
-
-    return result;
-  }
-};
-
-template <int N, bool PropagateNaN>
-struct minimum<Array<half_t, N>, PropagateNaN> {
-  CUTLASS_HOST_DEVICE
-  Array<half_t, N> operator()(Array<half_t, N> const & lhs, Array<half_t, N> const &rhs) const {
-    Array<half_t, N> result;
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-
-    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
-    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
-    __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 2; ++i) {
-      result_ptr[i] = PropagateNaN ? __hmin2_nan(lhs_ptr[i], rhs_ptr[i])
-                                   : __hmin2(lhs_ptr[i], rhs_ptr[i]);
-    }
-
-    if constexpr (N % 2) {
-      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
-      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
-
-      __half d_residual = PropagateNaN ? __hmin_nan(a_residual_ptr[N - 1], b_residual_ptr[N - 1])
-                                       : __hmin(a_residual_ptr[N - 1], b_residual_ptr[N - 1]);
-
-      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
-    }
-
-    #else
-
-    minimum<half_t,PropagateNaN> mn;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = mn(lhs[i],rhs[i]);
-    }
-    #endif
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<half_t, N> operator()(half_t const & lhs, Array<half_t, N> const &rhs) const {
-    Array<half_t, N> result;
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-
-    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
-    __half2 lhs_pair = __half2half2(reinterpret_cast<__half const &>(lhs));
-    __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 2; ++i) {
-      result_ptr[i] = PropagateNaN ? __hmin2_nan(lhs_pair, rhs_ptr[i])
-                                   : __hmin2(lhs_pair, rhs_ptr[i]);
-    }
-
-    if constexpr (N % 2) {
-      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
-
-      __half d_residual = PropagateNaN ? __hmin_nan(reinterpret_cast<__half const &>(lhs), b_residual_ptr[N - 1])
-                                       : __hmin(reinterpret_cast<__half const &>(lhs), b_residual_ptr[N - 1]);
-
-      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
-    }
-
-    #else
-
-    minimum<half_t,PropagateNaN> mn;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = mn(lhs, rhs[i]);
-    }
-    #endif
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<half_t, N> operator()(Array<half_t, N> const & lhs, half_t const &rhs) const {
-    Array<half_t, N> result;
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-
-    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
-    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
-    __half2 rhs_pair = __half2half2(reinterpret_cast<__half const &>(rhs));
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 2; ++i) {
-      result_ptr[i] = PropagateNaN ? __hmin2_nan(lhs_ptr[i], rhs_pair)
-                                   : __hmin2(lhs_ptr[i], rhs_pair);
-    }
-
-    if constexpr (N % 2) {
-      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
-
-      __half d_residual = PropagateNaN ? __hmin_nan(a_residual_ptr[N - 1], reinterpret_cast<__half const &>(rhs))
-                                       : __hmin(a_residual_ptr[N - 1], reinterpret_cast<__half const &>(rhs));
-
-      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
-    }
-
-    #else
-
-    minimum<half_t, PropagateNaN> mn;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = mn(lhs[i], rhs);
-    }
-    #endif
-
-    return result;
-  }
-};
-
-template <int N, bool PropagateNaN>
-struct maximum<Array<half_t, N>, PropagateNaN> {
-  CUTLASS_HOST_DEVICE
-  Array<half_t, N> operator()(Array<half_t, N> const & lhs, Array<half_t, N> const &rhs) const {
-    Array<half_t, N> result;
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-
-    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
-    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
-    __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 2; ++i) {
-      result_ptr[i] = PropagateNaN ? __hmax2_nan(lhs_ptr[i], rhs_ptr[i])
-                                   : __hmax2(lhs_ptr[i], rhs_ptr[i]);
-    }
-
-    if constexpr (N % 2) {
-      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
-      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
-
-      __half d_residual = PropagateNaN ? __hmax(a_residual_ptr[N - 1], b_residual_ptr[N - 1])
-                                       : __hmax_nan(a_residual_ptr[N - 1], b_residual_ptr[N - 1]);
-
-      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
-    }
-
-    #else
-
-    maximum<half_t,PropagateNaN> mx;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = mx(lhs[i], rhs[i]);
-    }
-    #endif
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<half_t, N> operator()(half_t const & lhs, Array<half_t, N> const &rhs) const {
-    Array<half_t, N> result;
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-
-    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
-    __half2 lhs_pair = __half2half2(reinterpret_cast<__half const &>(lhs));
-    __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 2; ++i) {
-      result_ptr[i] = PropagateNaN ? __hmax2_nan(lhs_pair, rhs_ptr[i])
-                                   : __hmax2(lhs_pair, rhs_ptr[i]);
-    }
-
-    if constexpr (N % 2) {
-      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
-
-      __half d_residual = PropagateNaN ? __hmax_nan(reinterpret_cast<__half const &>(lhs), b_residual_ptr[N - 1])
-                                       : __hmax(reinterpret_cast<__half const &>(lhs), b_residual_ptr[N - 1]);
-
-      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
-    }
-
-    #else
-
-    maximum<half_t,PropagateNaN> mx;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = mx(lhs, rhs[i]);
-    }
-    #endif
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<half_t, N> operator()(Array<half_t, N> const & lhs, half_t const &rhs) const {
-    Array<half_t, N> result;
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-
-    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
-    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
-    __half2 rhs_pair = __half2half2(reinterpret_cast<__half const &>(rhs));
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 2; ++i) {
-      result_ptr[i] = PropagateNaN ? __hmax2_nan(lhs_ptr[i], rhs_pair)
-                                   : __hmax2(lhs_ptr[i], rhs_pair);
-    }
-
-    if constexpr (N % 2) {
-      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
-
-      __half d_residual = PropagateNaN ? __hmax_nan(a_residual_ptr[N - 1], reinterpret_cast<__half const &>(rhs))
-                                       : __hmax(a_residual_ptr[N - 1], reinterpret_cast<__half const &>(rhs));
-
-      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
-    }
-
-    #else
-
-    maximum<half_t,PropagateNaN> mx;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = mx(lhs[i], rhs);
-    }
-    #endif
-
-    return result;
-  }
-};
-
-/// Fused multiply-add
-template <int N>
-struct multiply_add<Array<bfloat16_t, N>, Array<bfloat16_t, N>, Array<bfloat16_t, N>> {
-
-  CUTLASS_HOST_DEVICE
-  Array<bfloat16_t, N> operator()(
-    Array<bfloat16_t, N> const &a,
-    Array<bfloat16_t, N> const &b,
-    Array<bfloat16_t, N> const &c) const {
-
-    Array<bfloat16_t, N> result;
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-
-    unsigned *result_ptr = reinterpret_cast<unsigned *>(&result);
-    unsigned const *a_ptr = reinterpret_cast<unsigned const *>(&a);
-    unsigned const *b_ptr = reinterpret_cast<unsigned const *>(&b);
-    unsigned const *c_ptr = reinterpret_cast<unsigned const *>(&c);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 2; ++i) {
-      asm ("fma.rn.bf16x2 %0, %1, %2, %3;\n"
-        : "=r"(result_ptr[i])
-        : "r"(a_ptr[i]), "r"(b_ptr[i]), "r"(c_ptr[i])
-      );
-    }
-
-    if constexpr (N % 2) {
-
-      uint16_t *result_ptr = reinterpret_cast<uint16_t *>(&result);
-      uint16_t const *a_residual_ptr = reinterpret_cast<uint16_t const *>(&a);
-      uint16_t const *b_residual_ptr = reinterpret_cast<uint16_t const *>(&b);
-      uint16_t const *c_residual_ptr = reinterpret_cast<uint16_t const *>(&c);
-
-      asm ("fma.rn.bf16 %0, %1, %2, %3;\n"
-        : "=h"(result_ptr[N - 1])
-        : "h"(a_residual_ptr[N - 1]), "h"(b_residual_ptr[N - 1]), "h"(c_residual_ptr[N - 1])
-      );
-    }
-
-    #else
-
-    multiply_add<bfloat16_t> op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = op(a[i], b[i], c[i]);
-    }
-    #endif
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<bfloat16_t, N> operator()(
-    bfloat16_t const &a,
-    Array<bfloat16_t, N> const &b,
-    Array<bfloat16_t, N> const &c) const {
-
-    Array<bfloat16_t, N> result;
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-
-    unsigned *result_ptr = reinterpret_cast<unsigned *>(&result);
-
-    unsigned const *b_ptr = reinterpret_cast<unsigned const *>(&b);
-    unsigned const *c_ptr = reinterpret_cast<unsigned const *>(&c);
-
-    unsigned a_packed = static_cast<unsigned>(a.raw());
-    a_packed = (a_packed | (a_packed << 16));
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 2; ++i) {
-      asm ("fma.rn.bf16x2 %0, %1, %2, %3;\n"
-        : "=r"(result_ptr[i])
-        : "r"(a_packed), "r"(b_ptr[i]), "r"(c_ptr[i])
-      );
-    }
-
-    if constexpr (N % 2) {
-
-      uint16_t *result_ptr = reinterpret_cast<uint16_t *>(&result);
-      uint16_t const *a_residual_ptr = reinterpret_cast<uint16_t const *>(&a);
-      uint16_t const *b_residual_ptr = reinterpret_cast<uint16_t const *>(&b);
-      uint16_t const *c_residual_ptr = reinterpret_cast<uint16_t const *>(&c);
-
-      asm ("fma.rn.bf16 %0, %1, %2, %3;\n"
-        : "=h"(result_ptr[N - 1])
-        : "h"(a_residual_ptr[0]), "h"(b_residual_ptr[N - 1]), "h"(c_residual_ptr[N - 1])
-      );
-    }
-
-    #else
-
-    multiply_add<bfloat16_t> op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = op(a, b[i], c[i]);
-    }
-    #endif
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<bfloat16_t, N> operator()(
-    Array<bfloat16_t, N> const &a,
-    bfloat16_t const &b,
-    Array<bfloat16_t, N> const &c) const {
-
-    Array<bfloat16_t, N> result;
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-
-    unsigned *result_ptr = reinterpret_cast<unsigned *>(&result);
-
-    unsigned const *a_ptr = reinterpret_cast<unsigned const *>(&a);
-    unsigned const *c_ptr = reinterpret_cast<unsigned const *>(&c);
-
-    unsigned b_packed = static_cast<unsigned>(b.raw());
-    b_packed = (b_packed | (b_packed << 16));
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 2; ++i) {
-      asm ("fma.rn.bf16x2 %0, %1, %2, %3;\n"
-        : "=r"(result_ptr[i])
-        : "r"(a_ptr[i]), "r"(b_packed), "r"(c_ptr[i])
-      );
-    }
-
-    if constexpr (N % 2) {
-
-      uint16_t *result_ptr = reinterpret_cast<uint16_t *>(&result);
-      uint16_t const *a_residual_ptr = reinterpret_cast<uint16_t const *>(&a);
-      uint16_t const *b_residual_ptr = reinterpret_cast<uint16_t const *>(&b);
-      uint16_t const *c_residual_ptr = reinterpret_cast<uint16_t const *>(&c);
-
-      asm ("fma.rn.bf16 %0, %1, %2, %3;\n"
-        : "=h"(result_ptr[N - 1])
-        : "h"(a_residual_ptr[N - 1]), "h"(b_residual_ptr[0]), "h"(c_residual_ptr[N - 1])
-      );
-    }
-
-    #else
-
-    multiply_add<bfloat16_t> op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = op(a[i], b, c[i]);
-    }
-    #endif
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<bfloat16_t, N> operator()(
-    Array<bfloat16_t, N> const &a,
-    Array<bfloat16_t, N> const &b,
-    bfloat16_t const &c) const {
-
-    Array<bfloat16_t, N> result;
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-
-    unsigned *result_ptr = reinterpret_cast<unsigned *>(&result);
-
-    unsigned const *a_ptr = reinterpret_cast<unsigned const *>(&a);
-    unsigned const *b_ptr = reinterpret_cast<unsigned const *>(&b);
-
-    unsigned c_packed = static_cast<unsigned>(c.raw());
-    c_packed = (c_packed | (c_packed << 16));
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 2; ++i) {
-      asm ("fma.rn.bf16x2 %0, %1, %2, %3;\n"
-        : "=r"(result_ptr[i])
-        : "r"(a_ptr[i]), "r"(b_ptr[i]), "r"(c_packed)
-      );
-    }
-
-    if constexpr (N % 2) {
-
-      uint16_t *result_ptr = reinterpret_cast<uint16_t *>(&result);
-      uint16_t const *a_residual_ptr = reinterpret_cast<uint16_t const *>(&a);
-      uint16_t const *b_residual_ptr = reinterpret_cast<uint16_t const *>(&b);
-      uint16_t const *c_residual_ptr = reinterpret_cast<uint16_t const *>(&c);
-
-      asm ("fma.rn.bf16 %0, %1, %2, %3;\n"
-        : "=h"(result_ptr[N - 1])
-        : "h"(a_residual_ptr[N - 1]), "h"(b_residual_ptr[N - 1]), "h"(c_residual_ptr[0])
-      );
-    }
-
-    #else
-
-    multiply_add<bfloat16_t> op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = op(a[i], b[i], c);
-    }
-    #endif
-
-    return result;
-  }
-};
-
-
-/// bit_and
-template <int N>
-struct bit_and<Array<uint1b_t, N>> {
-  CUTLASS_HOST_DEVICE
-  Array<uint1b_t, N> operator()(Array<uint1b_t, N> const &a, Array<uint1b_t, N> const &b) const {
-    using ArrayType = Array<uint1b_t, N>;
-    using Storage = typename ArrayType::Storage;
-    ArrayType result;
-
-    Storage *result_data = result.raw_data();
-    Storage const *a_data = a.raw_data();
-    Storage const *b_data = b.raw_data();
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < ArrayType::kStorageElements; ++i) {
-      result_data[i] = (a_data[i] & b_data[i]);
-    }
-
-    return result;
-  }
-};
-
-
-/// bit_or
-template <int N>
-struct bit_or<Array<uint1b_t, N>> {
-  CUTLASS_HOST_DEVICE
-  Array<uint1b_t, N> operator()(Array<uint1b_t, N> const &a, Array<uint1b_t, N> const &b) const {
-    using ArrayType = Array<uint1b_t, N>;
-    using Storage = typename ArrayType::Storage;
-    ArrayType result;
-
-    Storage *result_data = result.raw_data();
-    Storage const *a_data = a.raw_data();
-    Storage const *b_data = b.raw_data();
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < ArrayType::kStorageElements; ++i) {
-      result_data[i] = (a_data[i] | b_data[i]);
-    }
-
-    return result;
-  }
-};
-
-
-/// bit_not
-template <int N>
-struct bit_not<Array<uint1b_t, N>> {
-  CUTLASS_HOST_DEVICE
-  Array<uint1b_t, N> operator()(Array<uint1b_t, N> const &a) const {
-    using ArrayType = Array<uint1b_t, N>;
-    using Storage = typename ArrayType::Storage;
-    ArrayType result;
-
-    Storage *result_data = result.raw_data();
-    Storage const *a_data = a.raw_data();
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < ArrayType::kStorageElements; ++i) {
-      result_data[i] = (~a_data[i]);
-    }
-
-    return result;
-  }
-};
-
-
-/// bit_xor
-template <int N>
-struct bit_xor<Array<uint1b_t, N>> {
-  CUTLASS_HOST_DEVICE
-  Array<uint1b_t, N> operator()(Array<uint1b_t, N> const &a, Array<uint1b_t, N> const &b) const {
-    using ArrayType = Array<uint1b_t, N>;
-    using Storage = typename ArrayType::Storage;
-    ArrayType result;
-
-    Storage *result_data = result.raw_data();
-    Storage const *a_data = a.raw_data();
-    Storage const *b_data = b.raw_data();
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < ArrayType::kStorageElements; ++i) {
-      result_data[i] = (a_data[i] ^ b_data[i]);
-    }
-
-    return result;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Operator overloads
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename T, int N>
-CUTLASS_HOST_DEVICE
-Array<T, N> operator+(Array<T, N> const &lhs, Array<T, N> const &rhs) {
-  plus<Array<T, N>> op;
-  return op(lhs, rhs);
-}
-
-template <typename T, int N>
-CUTLASS_HOST_DEVICE
-Array<T, N> operator+(T const &lhs, Array<T, N> const &rhs) {
-  plus<Array<T, N>> op;
-  return op(lhs, rhs);
-}
-
-template <typename T, int N>
-CUTLASS_HOST_DEVICE
-Array<T, N> operator+(Array<T, N> const &lhs, T const &rhs) {
-  plus<Array<T, N>> op;
-  return op(lhs, rhs);
-}
-
-template <typename T, int N>
-CUTLASS_HOST_DEVICE
-Array<T, N> operator-(Array<T, N> const &lhs, Array<T, N> const &rhs) {
-  minus<Array<T, N>> op;
-  return op(lhs, rhs);
-}
-
-template <typename T, int N>
-CUTLASS_HOST_DEVICE
-Array<T, N> operator-(Array<T, N> const &lhs) {
-  negate<Array<T, N>> op;
-  return op(lhs);
-}
-
-template <typename T, int N>
-CUTLASS_HOST_DEVICE
-Array<T, N> operator*(Array<T, N> const &lhs, Array<T, N> const &rhs) {
-  multiplies<Array<T, N>> op;
-  return op(lhs, rhs);
-}
-
-template <typename T, int N>
-CUTLASS_HOST_DEVICE
-Array<T, N> operator*(T lhs, Array<T, N> const &rhs) {
-  multiplies<Array<T, N>> op;
-  return op(lhs, rhs);
-}
-
-template <typename T, int N>
-CUTLASS_HOST_DEVICE
-Array<T, N> operator*(Array<T, N> const &lhs, T rhs) {
-  multiplies<Array<T, N>> op;
-  return op(lhs, rhs);
-}
-
-template <typename T, int N>
-CUTLASS_HOST_DEVICE
-Array<T, N> operator/(Array<T, N> const &lhs, Array<T, N> const &rhs) {
-  divides<Array<T, N>> op;
-  return op(lhs, rhs);
-}
-
-template <typename T, int N>
-CUTLASS_HOST_DEVICE
-Array<T, N> fma(Array<T, N> const &a, Array<T, N> const &b, Array<T, N> const &c) {
-  multiply_add<Array<T, N>> op;
-  return op(a, b, c);
-}
-
-template <typename T, int N>
-CUTLASS_HOST_DEVICE
-Array<T, N> fma(T a, Array<T, N> const &b, Array<T, N> const &c) {
-  multiply_add<Array<T, N>> op;
-  return op(a, b, c);
-}
-
-template <typename T, int N>
-CUTLASS_HOST_DEVICE
-Array<T, N> fma(Array<T, N> const &a, T b, Array<T, N> const &c) {
-  multiply_add<Array<T, N>> op;
-  return op(a, b, c);
-}
-
-template <typename T, int N>
-CUTLASS_HOST_DEVICE
-Array<T, N> fma(Array<T, N> const &a, Array<T, N> const &b, T c) {
-  multiply_add<Array<T, N>> op;
-  return op(a, b, c);
-}
-
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-#include "cutlass/array_subbyte.h"
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// AlignedArray
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Aligned array type
-template <
-  /// Element type
-  typename T,
-  /// Number of elements in the array
-  int N,
-  /// Alignment requirement in bytes
-  int Alignment = ( sizeof_bits<T>::value * N + 7 ) / 8
->
-class alignas(Alignment) AlignedArray: public Array<T, N> {
-public:
-
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/array_planar_complex.h b/lightllm-kernel/cutlass/include/cutlass/array_planar_complex.h
deleted file mode 100755
index 2dd8aa84e..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/array_planar_complex.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing warp-level matrix multiply-accumulate operations.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Array holding planar complex elements
-template <typename Element_, int N>
-struct ArrayPlanarComplex {
-
-  /// Underlying real element
-  using Element = Element_;
-
-  /// Number of logical elements
-  static constexpr size_t kElements = N;
-
-  /// Underlying Fragment of real-valued elemenets
-  using ArrayReal = cutlass::Array<Element, N>;
-
-public:
-  /// Fragment of real-valued elements representing the real part
-  ArrayReal real;
-
-  /// Fragment of real-valued elements representing the imaginary part
-  ArrayReal imag;
-
-public:
-  /// Sets the array to zero efficiently
-  CUTLASS_HOST_DEVICE
-  void clear() {
-    real.clear();
-    imag.clear();
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Helper to deduce template arguments
-template <typename Element, int N>
-CUTLASS_HOST_DEVICE
-ArrayPlanarComplex<Element, N> 
-make_ArrayPlanarComplex(Array<Element, N> const &real, Array<Element, N> const &imag) {
-  return ArrayPlanarComplex<Element, N>{real, imag};
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/array_subbyte.h b/lightllm-kernel/cutlass/include/cutlass/array_subbyte.h
deleted file mode 100755
index eb77a9310..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/array_subbyte.h
+++ /dev/null
@@ -1,559 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Statically sized array of elements that accommodates all CUTLASS-supported numeric types
-           and is safe to use in a union.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/platform/platform.h"
-
-namespace cutlass {
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Statically sized array for any data type
-template <
-  typename T,
-  int N
->
-struct Array<T, N, false> {
-  static constexpr int kSizeBits = sizeof_bits<T>::value * N;
-
-  /// Storage type
-  using Storage = typename platform::conditional<
-    ((kSizeBits % 32) != 0),
-    typename platform::conditional<
-      ((kSizeBits % 16) != 0),
-      uint8_t,
-      uint16_t
-    >::type,
-    uint32_t
-  >::type;
-
-  /// Element type
-  using Element = T;
-
-  /// Number of logical elements per stored object
-  static constexpr int kElementsPerStoredItem = int(sizeof(Storage) * 8) / sizeof_bits<T>::value;
-
-  /// Number of storage elements
-  static constexpr size_t kStorageElements = (N + kElementsPerStoredItem - 1) / kElementsPerStoredItem;
-
-  /// Number of logical elements
-  static constexpr size_t kElements = N;
-
-  /// Bitmask for covering one item
-  static constexpr Storage kMask = ((Storage(1) << sizeof_bits<T>::value) - 1);
-
-  //
-  // C++ standard members with pointer types removed
-  //
-
-  typedef T value_type;
-  typedef size_t size_type;
-  typedef ptrdiff_t difference_type;
-  typedef value_type *pointer;
-  typedef value_type const *const_pointer;
-
-  //
-  // References
-  //
-
-  /// Reference object inserts or extracts sub-byte items
-  class reference {
-    /// Pointer to storage element
-    Storage *ptr_{nullptr};
-
-    /// Index into elements packed into Storage object
-    int idx_{0};
-
-  public:
-
-    reference() = default;
-
-    /// Ctor
-    CUTLASS_HOST_DEVICE
-    reference(Storage *ptr, int idx = 0): ptr_(ptr), idx_(idx) { }
-
-    /// Assignment
-    CUTLASS_HOST_DEVICE
-    reference &operator=(T x) {
-    // `*ptr_ & kUpdateMask` will read ptr_ before write to it
-    // This means code pattern like
-    //
-    // ```cpp
-    // Array<half_t, N> result;
-    // result[0] = xxx;
-    // ```
-    // 
-    // Will leads to compiler warning on use of unintialized member variable. Although we know
-    //      this read of uninitialized member variable is harmeless.
-
-#if defined(__clang__)
-#  pragma clang diagnostic push
-#  pragma clang diagnostic ignored "-Wuninitialized"
-#elif defined(__GNUC__)
-#  pragma GCC diagnostic push
-#  pragma GCC diagnostic ignored "-Wuninitialized"
-#  pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
-#endif
-
-      Storage item = (reinterpret_cast<Storage const &>(x) & kMask);
-
-      Storage kUpdateMask = Storage(~(kMask << (idx_ * sizeof_bits<T>::value)));
-
-      *ptr_ = Storage(((*ptr_ & kUpdateMask) | (item << idx_ * sizeof_bits<T>::value)));
-
-#if defined(__clang__)
-#  pragma clang diagnostic pop
-#elif defined(__GNUC__)
-#  pragma GCC diagnostic pop
-#endif
-
-      return *this;
-    }
-
-    CUTLASS_HOST_DEVICE
-    T get() const {
-      Storage item = Storage((*ptr_ >> (idx_ * sizeof_bits<T>::value)) & kMask);
-      return reinterpret_cast<T const &>(item);
-    }
-
-    /// Extract
-    CUTLASS_HOST_DEVICE
-    operator T() const {
-      return get();
-    }
-
-    /// Explicit cast to int
-    CUTLASS_HOST_DEVICE
-    explicit operator int() const {
-      return int(get());
-    }
-
-    /// Explicit cast to float
-    CUTLASS_HOST_DEVICE
-    explicit operator float() const {
-      return float(get());
-    }
-  };
-
-  /// Reference object extracts sub-byte items
-  class const_reference {
-
-    /// Pointer to storage element
-    Storage const *ptr_{nullptr};
-
-    /// Index into elements packed into Storage object
-    int idx_{0};
-
-  public:
-
-    const_reference() = default;
-
-    /// Ctor
-    CUTLASS_HOST_DEVICE
-    const_reference(Storage const *ptr, int idx = 0): ptr_(ptr), idx_(idx) { }
-
-    CUTLASS_HOST_DEVICE
-    const T get() const {
-      Storage item = (*ptr_ >> (idx_ * sizeof_bits<T>::value)) & kMask;
-      return reinterpret_cast<T const &>(item);
-    }
-
-    /// Extract
-    CUTLASS_HOST_DEVICE
-    operator T() const {
-      Storage item = Storage(Storage(*ptr_ >> Storage(idx_ * sizeof_bits<T>::value)) & kMask);
-      return reinterpret_cast<T const &>(item);
-    }
-
-    /// Explicit cast to int
-    CUTLASS_HOST_DEVICE
-    explicit operator int() const {
-      return int(get());
-    }
-
-    /// Explicit cast to float
-    CUTLASS_HOST_DEVICE
-    explicit operator float() const {
-      return float(get());
-    }
-  };
-
-  //
-  // Iterators
-  //
-
-  /// Bidirectional iterator over elements
-  class iterator {
-
-    /// Pointer to storage element
-    Storage *ptr_{nullptr};
-
-    /// Index into elements packed into Storage object
-    int idx_{0};
-
-  public:
-
-    iterator() = default;
-
-    CUTLASS_HOST_DEVICE
-    iterator(Storage *ptr, int idx = 0): ptr_(ptr), idx_(idx) { }
-
-    CUTLASS_HOST_DEVICE
-    iterator &operator++() {
-      ++idx_;
-      if (idx_ == kElementsPerStoredItem) {
-        ++ptr_;
-        idx_ = 0;
-      }
-      return *this;
-    }
-
-    CUTLASS_HOST_DEVICE
-    iterator &operator--() {
-      if (!idx_) {
-        --ptr_;
-        idx_ = kElementsPerStoredItem - 1;
-      }
-      else {
-        --idx_;
-      }
-      return *this;
-    }
-
-    CUTLASS_HOST_DEVICE
-    iterator operator++(int) {
-      iterator ret(*this);
-      ++idx_;
-      if (idx_ == kElementsPerStoredItem) {
-        ++ptr_;
-        idx_ = 0;
-      }
-      return ret;
-    }
-
-    CUTLASS_HOST_DEVICE
-    iterator operator--(int) {
-      iterator ret(*this);
-      if (!idx_) {
-        --ptr_;
-        idx_ = kElementsPerStoredItem - 1;
-      }
-      else {
-        --idx_;
-      }
-      return ret;
-    }
-
-    CUTLASS_HOST_DEVICE
-    reference operator*() const {
-      return reference(ptr_, idx_);
-    }
-
-    CUTLASS_HOST_DEVICE
-    bool operator==(iterator const &other) const {
-      return ptr_ == other.ptr_ && idx_ == other.idx_;
-    }
-
-    CUTLASS_HOST_DEVICE
-    bool operator!=(iterator const &other) const {
-      return !(*this == other);
-    }
-  };
-
-  /// Bidirectional constant iterator over elements
-  class const_iterator {
-
-    /// Pointer to storage element
-    Storage const *ptr_{nullptr};
-
-    /// Index into elements packed into Storage object
-    int idx_{0};
-
-  public:
-
-    const_iterator() = default;
-
-    CUTLASS_HOST_DEVICE
-    const_iterator(Storage const *ptr, int idx = 0): ptr_(ptr), idx_(idx) { }
-
-    CUTLASS_HOST_DEVICE
-    iterator &operator++() {
-      ++idx_;
-      if (idx_ == kElementsPerStoredItem) {
-        ++ptr_;
-        idx_ = 0;
-      }
-      return *this;
-    }
-
-    CUTLASS_HOST_DEVICE
-    iterator &operator--() {
-      if (!idx_) {
-        --ptr_;
-        idx_ = kElementsPerStoredItem - 1;
-      }
-      else {
-        --idx_;
-      }
-      return *this;
-    }
-
-    CUTLASS_HOST_DEVICE
-    iterator operator++(int) {
-      iterator ret(*this);
-      ++idx_;
-      if (idx_ == kElementsPerStoredItem) {
-        ++ptr_;
-        idx_ = 0;
-      }
-      return ret;
-    }
-
-    CUTLASS_HOST_DEVICE
-    iterator operator--(int) {
-      iterator ret(*this);
-      if (!idx_) {
-        --ptr_;
-        idx_ = kElementsPerStoredItem - 1;
-      }
-      else {
-        --idx_;
-      }
-      return ret;
-    }
-
-    CUTLASS_HOST_DEVICE
-    const_reference operator*() const {
-      return const_reference(ptr_, idx_);
-    }
-
-    CUTLASS_HOST_DEVICE
-    bool operator==(iterator const &other) const {
-      return ptr_ == other.ptr_ && idx_ == other.idx_;
-    }
-
-    CUTLASS_HOST_DEVICE
-    bool operator!=(iterator const &other) const {
-      return !(*this == other);
-    }
-  };
-
-  /// Bidirectional iterator over elements
-  class reverse_iterator {
-
-    /// Pointer to storage element
-    Storage *ptr_{nullptr};
-
-    /// Index into elements packed into Storage object
-    int idx_{0};
-
-  public:
-
-    reverse_iterator() = default;
-
-    CUTLASS_HOST_DEVICE
-    reverse_iterator(Storage *ptr, int idx = 0): ptr_(ptr), idx_(idx) { }
-  };
-
-  /// Bidirectional constant iterator over elements
-  class const_reverse_iterator {
-
-    /// Pointer to storage element
-    Storage const *ptr_{nullptr};
-
-    /// Index into elements packed into Storage object
-    int idx_{0};
-
-  public:
-
-    const_reverse_iterator() = default;
-
-    CUTLASS_HOST_DEVICE
-    const_reverse_iterator(Storage const *ptr, int idx = 0): ptr_(ptr), idx_(idx) { }
-  };
-
-  /// Efficient clear method
-  CUTLASS_HOST_DEVICE
-  void clear() {
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < int(kStorageElements); ++i) {
-      storage[i] = Storage(0);
-    }
-  }
-
-  CUTLASS_HOST_DEVICE
-  reference at(size_type pos) {
-    return reference(storage + pos / kElementsPerStoredItem, pos % kElementsPerStoredItem);
-  }
-
-  CUTLASS_HOST_DEVICE
-  const_reference at(size_type pos) const {
-    return const_reference(storage + pos / kElementsPerStoredItem, pos % kElementsPerStoredItem);
-  }
-
-  CUTLASS_HOST_DEVICE
-  reference operator[](size_type pos) {
-    return at(pos);
-  }
-
-  CUTLASS_HOST_DEVICE
-  const_reference operator[](size_type pos) const {
-    return at(pos);
-  }
-
-  CUTLASS_HOST_DEVICE
-  reference front() {
-    return at(0);
-  }
-
-  CUTLASS_HOST_DEVICE
-  const_reference front() const {
-    return at(0);
-  }
-
-  CUTLASS_HOST_DEVICE
-  reference back() {
-    return reference(storage + kStorageElements - 1, kElementsPerStoredItem - 1);
-  }
-
-  CUTLASS_HOST_DEVICE
-  const_reference back() const {
-    return const_reference(storage + kStorageElements - 1, kElementsPerStoredItem - 1);
-  }
-
-  CUTLASS_HOST_DEVICE
-  pointer data() {
-    return reinterpret_cast<pointer>(storage);
-  }
-
-  CUTLASS_HOST_DEVICE
-  const_pointer data() const {
-    return reinterpret_cast<const_pointer>(storage);
-  }
-  
-  CUTLASS_HOST_DEVICE
-  Storage * raw_data() {
-    return storage;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Storage const * raw_data() const {
-    return storage;
-  }
-
-  CUTLASS_HOST_DEVICE
-  constexpr bool empty() const {
-    return !kElements;
-  }
-
-  CUTLASS_HOST_DEVICE
-  constexpr size_type size() const {
-    return kElements;
-  }
-
-  CUTLASS_HOST_DEVICE
-  constexpr size_type max_size() const {
-    return kElements;
-  }
-
-  CUTLASS_HOST_DEVICE
-  void fill(T const &value) {
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kElementsPerStoredItem; ++i) {
-      reference ref(storage, i);
-      ref = value;
-    }
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 1; i < kStorageElements; ++i) {
-      storage[i] = storage[0];
-    }
-  }
-
-  CUTLASS_HOST_DEVICE
-  iterator begin() {
-    return iterator(storage);
-  }
-
-  CUTLASS_HOST_DEVICE
-  const_iterator cbegin() const {
-    return const_iterator(storage);
-  }
-
-  CUTLASS_HOST_DEVICE
-  iterator end() {
-    return iterator(storage + kStorageElements);
-  }
-
-  CUTLASS_HOST_DEVICE
-  const_iterator cend() const {
-    return const_iterator(storage + kStorageElements);
-  }
-
-  CUTLASS_HOST_DEVICE
-  reverse_iterator rbegin() {
-    return reverse_iterator(storage + kStorageElements);
-  }
-
-  CUTLASS_HOST_DEVICE
-  const_reverse_iterator crbegin() const {
-    return const_reverse_iterator(storage + kStorageElements);
-  }
-
-  CUTLASS_HOST_DEVICE
-  reverse_iterator rend() {
-    return reverse_iterator(storage);
-  }
-
-  CUTLASS_HOST_DEVICE
-  const_reverse_iterator crend() const {
-    return const_reverse_iterator(storage);
-  }
-
-private:
-  /// Internal storage
-  Storage storage[kStorageElements];
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/barrier.h b/lightllm-kernel/cutlass/include/cutlass/barrier.h
deleted file mode 100755
index 6f2373b6d..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/barrier.h
+++ /dev/null
@@ -1,377 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Implementation of a CTA-wide barrier for inter-CTA synchronization.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/arch/barrier.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-
-namespace detail {
-
-//
-// Utilities for abstracting synchronization methods for barriers
-//
-
-struct SyncthreadsSync {
-  CUTLASS_DEVICE
-  static void sync() {
-    __syncthreads();
-  }
-};
-
-struct SyncwarpSync {
-  CUTLASS_DEVICE
-  static void sync() {
-    __syncwarp();
-  }
-};
-
-template <
-  int ThreadCount,
-  int BarrierId
->
-struct NamedBarrierSync {
-  CUTLASS_DEVICE
-  static void sync() {
-    cutlass::arch::NamedBarrier::sync(ThreadCount, static_cast<arch::ReservedNamedBarriers>(BarrierId));
-  }
-};
-
-} // namepspace detail
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Group or CTA-wide semaphore for inter-CTA synchronization.
-template <class Sync>
-struct GenericBarrier {
-
-public:
-
-  /// Flag type
-  using T = int;
-
-  /// Initial flag value
-  static const T INIT = 0;
-
-
-protected:
-
-  /// Load flag, as a strong acquire operation (int specialization)
-  CUTLASS_DEVICE
-  static int ld_acquire(int *ptr)
-  {
-    int state = 0;
-
-#if (__CUDA_ARCH__ >= 700)
-    /// SM70 and newer use memory consistency qualifiers
-
-    // Acquire pattern using acquire modifier
-    asm volatile ("ld.global.acquire.gpu.b32 %0, [%1];\n" : "=r"(state) : "l"(ptr));
-
-#else
-    asm volatile ("ld.cg.global.b32 %0, [%1];\n" : "=r"(state) : "l"(ptr));
-#endif // (__CUDA_ARCH__ >= 700)
-
-    return state;
-  }
-
-
-  /// Reduce into flag, with release pattern (int specialization)
-  CUTLASS_DEVICE
-  static void red_release(int *ptr, int val)
-  {
-#if (__CUDA_ARCH__ >= 700)
-    /// SM70 and newer use memory consistency qualifiers
-
-    // Release pattern using acq_rel fence + relaxed modifier.  (The fence also releases data
-    // that was weakly-written by other threads prior to the last syncthreads)
-    asm volatile ("fence.acq_rel.gpu;\n");
-    asm volatile ("red.relaxed.gpu.global.add.s32 [%0], %1;\n" : : "l"(ptr), "r"(val));
-
-#else
-    __threadfence();
-    atomicAdd(ptr, val);
-#endif // (__CUDA_ARCH__ >= 700)
-  }
-
-
-public:
-
-  /// Uses thread[0] to wait for at least the specified count of signals on the given flag counter
-  CUTLASS_DEVICE
-  static void wait_lt(void *lock_ptr, int thread_idx, int flag_idx, int count)
-  {
-    T *flag_ptr = reinterpret_cast<T*>(lock_ptr) + flag_idx;
-
-    if (thread_idx == 0)
-    {
-        // Spin-loop
-        #pragma unroll 1
-        while(ld_acquire(flag_ptr) < count) {}
-    }
-
-    Sync::sync();
-  }
-
-  /// Uses thread[0] to wait for at least the specified count of signals on the given flag counter
-  CUTLASS_DEVICE
-  static void wait_eq(void *lock_ptr, int thread_idx, int flag_idx, T val = 1)
-  {
-    T *flag_ptr = reinterpret_cast<T*>(lock_ptr) + flag_idx;
-
-    if (thread_idx == 0)
-    {
-        // Spin-loop
-        #pragma unroll 1
-        while(ld_acquire(flag_ptr) != val) {}
-    }
-    Sync::sync();
-  }
-
-  /// Uses thread[0] to wait for the specified count of signals on the given flag counter
-  CUTLASS_DEVICE
-  static void wait_eq_reset(void *lock_ptr, int thread_idx, int flag_idx, T val = 1) {
-    T *flag_ptr = reinterpret_cast<T*>(lock_ptr) + flag_idx;
-
-    if (thread_idx == 0)
-    {
-        // Spin-loop
-        #pragma unroll 1
-        while(atomicCAS(flag_ptr, val, 0) != val) {}
-    }
-
-    Sync::sync();
-  }
-
-  /// Increment the arrival count for a flag
-  CUTLASS_DEVICE
-  static void arrive_inc(void *lock_ptr, int thread_idx, int flag_idx, int val = 1)
-  {
-    T* flag_ptr = reinterpret_cast<T*>(lock_ptr) + flag_idx;
-
-    Sync::sync();
-
-    if (thread_idx == 0)
-    {
-      red_release(flag_ptr, val);
-    }
-  }
-
-
-  /// Increment the arrival counts for a range of flags
-  CUTLASS_DEVICE
-  static void arrive_range_inc(void *lock_ptr, int thread_idx, int first_flag_idx, int count = 1, int val = 1)
-  {
-    int flag_idx = first_flag_idx + thread_idx;
-    T* flag_ptr = reinterpret_cast<T*>(lock_ptr) + flag_idx;
-
-    // Barrier to make sure all other threads in group have written their data
-    Sync::sync();
-
-    // Select threads increment their flags
-    if (thread_idx < count) {
-      red_release(flag_ptr, val);
-    }
-  }
-};
-
-using Barrier = GenericBarrier<detail::SyncthreadsSync>;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/** Structure for managing multiple NamedBarriers to be used by different warp groups, allowing
- * runtime index values to be used to call into named barriers with compile-time-constant IDs.
- *
- * @param ThreadCount_ Number of threads that will wait on a NamedBarrier with a given ID
- * @param Offset Value added to the ID passed in by the user to determine the NamedBarrier ID to call into
- * @param MaxNumNamedBarriers The maximum number of unique barrier IDs that will be requested on this type
-**/
-template <
-  uint32_t ThreadCount_,
-  uint32_t Offset = 0,
-  uint32_t MaxNumNamedBarriers = 16
->
-struct NamedBarrierManager {
-
-  static_assert(MaxNumNamedBarriers <= arch::NamedBarrier::HardwareMaxNumNamedBarriers);
-  static_assert(MaxNumNamedBarriers + Offset <= arch::NamedBarrier::HardwareMaxNumNamedBarriers, "Barrier IDs cannot exceed 15");
-
-  // Number of threads participating in the barrier
-  static constexpr uint32_t ThreadCount = ThreadCount_;
-
-  template <uint32_t BarrierId>
-  using BarrierSync = cutlass::GenericBarrier<cutlass::detail::NamedBarrierSync<ThreadCount, BarrierId>>;
-
-  // Underlying type used by all barriers for synchronization. Does not depend on
-  // template parameter BarrierId, so passing in 0 suffices.
-  using T = typename BarrierSync<0>::T;
-
-  using IntegerSequence = cute::make_integer_sequence<uint32_t, MaxNumNamedBarriers>;
-
-  CUTLASS_DEVICE
-  static
-  void wait_lt(uint32_t idx, void *lock_ptr, int thread_idx, int flag_idx, int count) {
-    wait_lt_helper(idx, lock_ptr, thread_idx, flag_idx, count, IntegerSequence{});
-  }
-
-  CUTLASS_DEVICE
-  static void
-  wait_eq(uint32_t idx, void *lock_ptr, int thread_idx, int flag_idx, T val = 1) {
-    wait_eq_helper<false>(idx, lock_ptr, thread_idx, flag_idx, val, IntegerSequence{});
-  }
-
-  CUTLASS_DEVICE
-  static void
-  wait_eq_reset(uint32_t idx, void *lock_ptr, int thread_idx, int flag_idx, T val = 1) {
-    wait_eq_helper<true>(idx, lock_ptr, thread_idx, flag_idx, val, IntegerSequence{});
-  }
-
-  CUTLASS_DEVICE
-  static void
-  arrive_inc(uint32_t idx, void *lock_ptr, int thread_idx, int flag_idx, int val = 1) {
-    arrive_inc_helper(idx, lock_ptr, thread_idx, flag_idx, val, IntegerSequence{});
-  }
-
-  CUTLASS_DEVICE
-  static void
-  arrive_range_inc(uint32_t idx, void *lock_ptr, int thread_idx, int first_flag_idx, int count = 1, int val = 1) {
-    arrive_range_inc_helper(idx, lock_ptr, thread_idx, first_flag_idx, count, val, IntegerSequence{});
-  }
-
-private:
-  CUTLASS_DEVICE
-  static void
-  check_barrier_in_range([[maybe_unused]] uint32_t idx) {
-    assert((idx < MaxNumNamedBarriers) && "Index exceeds barrier count");
-  }
-
-  template <uint32_t... Idx>
-  CUTLASS_DEVICE
-  static void
-  wait_lt_helper(uint32_t idx, void *lock_ptr, int thread_idx, int flag_idx, int count, cute::integer_sequence<uint32_t, Idx...>) {
-    check_barrier_in_range(idx);
-    ((Idx == idx && (BarrierSync<Idx + Offset>::wait_lt(lock_ptr, thread_idx, flag_idx, count), true)) || ...);
-  }
-
-  template <bool Reset, uint32_t... Idx>
-  CUTLASS_DEVICE
-  static void
-  wait_eq_helper(uint32_t idx, void *lock_ptr, int thread_idx, int flag_idx, T val, cute::integer_sequence<uint32_t, Idx...>) {
-    check_barrier_in_range(idx);
-    if constexpr (Reset) {
-      ((Idx == idx && (BarrierSync<Idx + Offset>::wait_eq_reset(lock_ptr, thread_idx, flag_idx, val), true)) || ...);
-    }
-    else {
-      ((Idx == idx && (BarrierSync<Idx + Offset>::wait_eq(lock_ptr, thread_idx, flag_idx, val), true)) || ...);
-    }
-  }
-
-  template <uint32_t... Idx>
-  CUTLASS_DEVICE
-  static void
-  arrive_inc_helper(uint32_t idx, void *lock_ptr, int thread_idx, int flag_idx, int val, cute::integer_sequence<uint32_t, Idx...>) {
-    check_barrier_in_range(idx);
-    ((Idx == idx && (BarrierSync<Idx + Offset>::arrive_inc(lock_ptr, thread_idx, flag_idx, val), true)) || ...);
-  }
-
-  template <uint32_t... Idx>
-  CUTLASS_DEVICE
-  static void
-  arrive_range_inc_helper(uint32_t idx, void *lock_ptr, int thread_idx, int first_flag_idx, int count, int val, cute::integer_sequence<uint32_t, Idx...>) {
-    check_barrier_in_range(idx);
-    ((Idx == idx && (BarrierSync<Idx + Offset>::arrive_range_inc(lock_ptr, thread_idx, first_flag_idx, count, val), true)) || ...);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/** Structure for synchronizing via contiguous barriers (e.g., __syncwarp, __syncthreads)
- *  via an API that mirrors that of NamedBarrierManager
- *
- * @param Synchronizer Synchronization helper exposing a `sync()` method to perform synchronization
-**/
-template <
-  class Synchronizer,
-  uint32_t ThreadCount_
->
-struct SyncManager {
-
-  // Number of threads participating in the barrier
-  static constexpr uint32_t ThreadCount = ThreadCount_;
-
-  using BarrierSync = cutlass::GenericBarrier<Synchronizer>;
-
-  // Underlying type used by all barriers for synchronization.
-  using T = typename BarrierSync::T;
-
-  CUTLASS_DEVICE
-  static
-  void wait_lt(uint32_t, void *lock_ptr, int thread_idx, int flag_idx, int count) {
-    BarrierSync::wait_lt(lock_ptr, thread_idx, flag_idx, count);
-  }
-
-  CUTLASS_DEVICE
-  static void
-  wait_eq(uint32_t, void *lock_ptr, int thread_idx, int flag_idx, T val = 1) {
-    BarrierSync::wait_eq(lock_ptr, thread_idx, flag_idx, val);
-  }
-
-  CUTLASS_DEVICE
-  static void
-  wait_eq_reset(uint32_t, void *lock_ptr, int thread_idx, int flag_idx, T val = 1) {
-    BarrierSync::wait_eq_reset(lock_ptr, thread_idx, flag_idx, val);
-  }
-
-  CUTLASS_DEVICE
-  static void
-  arrive_inc(uint32_t, void *lock_ptr, int thread_idx, int flag_idx, int val = 1) {
-    BarrierSync::arrive_inc(lock_ptr, thread_idx, flag_idx, val);
-  }
-
-  CUTLASS_DEVICE
-  static void
-  arrive_range_inc(uint32_t idx, void *lock_ptr, int thread_idx, int first_flag_idx, int count = 1, int val = 1) {
-    BarrierSync::arrive_range_inc(lock_ptr, thread_idx, first_flag_idx, count, val);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/bfloat16.h b/lightllm-kernel/cutlass/include/cutlass/bfloat16.h
deleted file mode 100755
index 5af6d3ab8..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/bfloat16.h
+++ /dev/null
@@ -1,679 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*!
-    \file
-    \brief Defines a proxy class for storing non-standard 16-bit floating point values with
-          8 bits of exponent and 7 bit of mantissa.
-*/
-
-#pragma once
-
-#if defined(__CUDACC_RTC__)
-#include "cutlass/floating_point_nvrtc.h"
-#else
-#include <cmath>
-#include <limits>
-#include <cstdint>
-#include <cstring>
-#endif
-
-#include <cuda_bf16.h>
-#include "cutlass/cutlass.h"
-#include "cutlass/platform/platform.h"
-
-namespace cutlass {
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Floating-point type with 8 bits of exponent and 7 bits of mantissa.
-struct alignas(2) bfloat16_t {
-
-  //
-  // Data members
-  //
-
-  /// Storage type
-  uint16_t storage;
-
-  //
-  // Methods
-  //
-
-  /// Constructs from an unsigned short
-  CUTLASS_HOST_DEVICE
-  static bfloat16_t bitcast(uint16_t x) {
-    bfloat16_t h;
-    h.storage = x;
-    return h;
-  }
-
-private:
-  struct from_32_bit_integer_t {};
-  static constexpr from_32_bit_integer_t from_32_bit_integer{};
-
-  template<class T>
-  CUTLASS_HOST_DEVICE
-  explicit bfloat16_t(from_32_bit_integer_t, T x) {
-    static_assert(cutlass::platform::is_integral<T>::value && sizeof(T) == 4, "Requires 32-bit integer");
-
-    float flt = static_cast<float>(x);
-    uint32_t bits;
-
-    #if defined(__CUDA_ARCH__)
-    bits = reinterpret_cast<uint32_t &>(flt);
-    #else
-    std::memcpy(&bits, &flt, sizeof(bits));
-    #endif
-
-    storage = uint16_t(bits >> 16);
-  }
-
-public:
-  /// Default constructor
-  bfloat16_t() = default;
-
-  /// Reinterpret cast from CUDA's __nv_bfloat16 type
-  CUTLASS_HOST_DEVICE
-  explicit bfloat16_t(__nv_bfloat16 const & x) {
-    #if defined(__CUDA_ARCH__)
-    storage = reinterpret_cast<uint16_t const &>(x);
-    #else
-    __nv_bfloat16_raw raw(x);
-    std::memcpy(&storage, &raw.x, sizeof(storage));
-    #endif
-  }
-
-  /// Floating-point conversion - round toward nearest
-  CUTLASS_HOST_DEVICE
-  explicit bfloat16_t(float x) {
-
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) && (__CUDACC_VER_MAJOR__ >= 11)
-
-    asm("cvt.rn.bf16.f32 %0, %1;\n" : "=h"(storage) : "f"(x));
-
-    #else
-    uint32_t bits;
-
-    #if defined(__CUDA_ARCH__)
-    bits = reinterpret_cast<uint32_t &>(x);
-    #else
-    std::memcpy(&bits, &x, sizeof(bits));
-    #endif
-
-    if ((bits & 0x7f800000) != 0x7f800000) {
-
-      bool mantissa_bit = ((bits & (1 << 16)) != 0);
-      bool round_bit = ((bits & (1 << 15)) != 0);
-      bool sticky_bit = ((bits & ((1 << 15) - 1)) != 0);
-      
-      if ((round_bit && sticky_bit) || (round_bit && mantissa_bit)) {
-        bits += uint32_t(1 << 16);
-      }
-    }
-    else if (bits & ~0xff800000) {
-      bits = 0x7fffffff;
-    }
-
-    storage = uint16_t((bits >> 16) & 0xffff);
-    #endif
-  }
-
-  /// Floating-point conversion - round toward nearest
-  CUTLASS_HOST_DEVICE
-  explicit bfloat16_t(double x): bfloat16_t(float(x)) {
-
-  }
-
-  /// Integer conversion - round toward nearest
-  CUTLASS_HOST_DEVICE
-  explicit bfloat16_t(int x) : bfloat16_t(from_32_bit_integer, x) {}
-
-  CUTLASS_HOST_DEVICE
-  explicit bfloat16_t(uint32_t x) : bfloat16_t(from_32_bit_integer, x) {}
-
-  /// Converts to float
-  CUTLASS_HOST_DEVICE
-  operator float() const {
-    unsigned bits = (unsigned(storage) << 16);
-    #if defined(__CUDA_ARCH__)
-    return reinterpret_cast<float const &>(bits);
-    #else
-    float flt;
-    std::memcpy(&flt, &bits, sizeof(flt));
-    return flt;
-    #endif
-  }
-
-  /// Converts to float
-  CUTLASS_HOST_DEVICE
-  explicit operator double() const {
-    return double(float(*this));
-  }
-
-  /// Converts to int
-  CUTLASS_HOST_DEVICE
-  explicit operator int() const {
-    return int(float(*this));
-  }
-
-  /// Casts to bool
-  CUTLASS_HOST_DEVICE
-  explicit operator bool() const {
-    return (float(*this) != 0.0f);
-  }
-
-  /// Bitcasts to CUDA's bf16 type
-  CUTLASS_DEVICE
-  __nv_bfloat16 to_nv_bfloat16() const {
-    return reinterpret_cast<__nv_bfloat16 const &>(storage);
-  }
-
-  /// Obtains raw bits
-  CUTLASS_HOST_DEVICE
-  uint16_t raw() const {
-    return storage;
-  }
-    /// Returns the sign bit
-  CUTLASS_HOST_DEVICE
-  bool signbit() const {
-    return ((raw() & 0x8000) != 0);
-  }
-
-  /// Returns the biased exponent
-  CUTLASS_HOST_DEVICE
-  int exponent_biased() const {
-    return int((raw() >> 7) & 0x0ff);
-  }
-
-  /// Returns the unbiased exponent
-  CUTLASS_HOST_DEVICE
-  int exponent() const {
-    return exponent_biased() - 127;
-  }
-
-  /// Returns the mantissa
-  CUTLASS_HOST_DEVICE
-  int mantissa() const {
-    return int(raw() & 0x7f);
-  }
-};
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-CUTLASS_HOST_DEVICE
-bool signbit(cutlass::bfloat16_t const& h) {
-  return h.signbit();
-}
-
-CUTLASS_HOST_DEVICE
-cutlass::bfloat16_t abs(cutlass::bfloat16_t const& h) {
-  return cutlass::bfloat16_t::bitcast(h.raw() & 0x7fff);
-}
-
-CUTLASS_HOST_DEVICE
-bool isnan(cutlass::bfloat16_t const& h) {
-  return (h.exponent_biased() == 0x0ff) && h.mantissa();
-}
-
-CUTLASS_HOST_DEVICE
-bool isfinite(cutlass::bfloat16_t const& h) {
-  return (h.exponent_biased() != 0x0ff);
-}
-
-CUTLASS_HOST_DEVICE
-cutlass::bfloat16_t nan_bf16(const char*) {
-  // NVIDIA canonical NaN
-  return cutlass::bfloat16_t::bitcast(0x7fff);
-}
-
-CUTLASS_HOST_DEVICE
-bool isinf(cutlass::bfloat16_t const& h) {
-  return (h.exponent_biased() == 0x0ff) && !h.mantissa();
-}
-
-CUTLASS_HOST_DEVICE
-bool isnormal(cutlass::bfloat16_t const& h) {
-  return h.exponent_biased() && h.exponent_biased() != 0x0ff;
-}
-
-CUTLASS_HOST_DEVICE
-int fpclassify(cutlass::bfloat16_t const& h) {
-  int exp = h.exponent_biased();
-  int mantissa = h.mantissa();
-  if (exp == 0x0ff) {
-    if (mantissa) {
-      return FP_NAN;
-    }
-    else {
-      return FP_INFINITE;
-    }
-  }
-  else if (!exp) {
-    if (mantissa) {
-      return FP_SUBNORMAL;
-    }
-    else {
-      return FP_ZERO;
-    }
-  }
-  return FP_NORMAL;
-}
-
-CUTLASS_HOST_DEVICE
-cutlass::bfloat16_t sqrt(cutlass::bfloat16_t const& h) {
-#if defined(__CUDACC_RTC__)
-  return cutlass::bfloat16_t(sqrtf(float(h)));
-#else
-  return cutlass::bfloat16_t(std::sqrt(float(h)));
-#endif
-}
-
-CUTLASS_HOST_DEVICE
-bfloat16_t copysign(bfloat16_t const& a, bfloat16_t const& b) {
-
-  uint16_t a_bits;
-  uint16_t b_bits;
-
-  #if defined(__CUDA_ARCH__)
-  a_bits = reinterpret_cast<uint16_t const &>(a);
-  b_bits = reinterpret_cast<uint16_t const &>(b);
-  #else
-  std::memcpy(&a_bits, &a, sizeof(a_bits));
-  std::memcpy(&b_bits, &b, sizeof(b_bits));
-  #endif
-
-  uint16_t a_mag = (a_bits & 0x7fff);  
-  uint16_t b_sign = (b_bits & 0x8000);
-  uint16_t result = (a_mag | b_sign);
-
-  return bfloat16_t::bitcast(result);
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Standard Library operations and definitions
-//
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-#if !defined(__CUDACC_RTC__)
-namespace std {
-
-/// Numeric limits
-template <>
-struct numeric_limits<cutlass::bfloat16_t> {
-  static bool const is_specialized = true;
-  static bool const is_signed = true;
-  static bool const is_integer = false;
-  static bool const is_exact = false;
-  static bool const has_infinity = true;
-  static bool const has_quiet_NaN = true;
-  static bool const has_signaling_NaN = false;
-  static std::float_denorm_style const has_denorm = std::denorm_present;
-  static bool const has_denorm_loss = true;
-  static std::float_round_style const round_style = std::round_to_nearest;
-  static bool const is_iec559 = false;
-  static bool const is_bounded = true;
-  static bool const is_modulo = false;
-  static int const digits = 7;
-
-  /// Least positive value
-  CUTLASS_HOST_DEVICE
-  static cutlass::bfloat16_t min() { return cutlass::bfloat16_t::bitcast(0x01); }
-
-  /// Minimum finite value
-  CUTLASS_HOST_DEVICE
-  static cutlass::bfloat16_t lowest() { return cutlass::bfloat16_t::bitcast(0xff7f); }
-
-  /// Maximum finite value
-  CUTLASS_HOST_DEVICE
-  static cutlass::bfloat16_t max() { return cutlass::bfloat16_t::bitcast(0x7f7f); }
-
-  /// Returns smallest finite value
-  CUTLASS_HOST_DEVICE
-  static cutlass::bfloat16_t epsilon() { return cutlass::bfloat16_t::bitcast(0x1000); }
-
-  /// Returns smallest finite value
-  CUTLASS_HOST_DEVICE
-  static cutlass::bfloat16_t round_error() { return cutlass::bfloat16_t(0.5f); }
-
-  /// Returns smallest finite value
-  CUTLASS_HOST_DEVICE
-  static cutlass::bfloat16_t infinity() { return cutlass::bfloat16_t::bitcast(0x7f80); }
-
-  /// Returns smallest finite value
-  CUTLASS_HOST_DEVICE
-  static cutlass::bfloat16_t quiet_NaN() { return cutlass::bfloat16_t::bitcast(0x7fff); }
-
-  /// Returns smallest finite value
-  CUTLASS_HOST_DEVICE
-  static cutlass::bfloat16_t signaling_NaN() { return cutlass::bfloat16_t::bitcast(0x7fff); }
-
-  /// Returns smallest finite value
-  CUTLASS_HOST_DEVICE
-  static cutlass::bfloat16_t denorm_min() { return cutlass::bfloat16_t::bitcast(0x1); }
-};
-
-} // namespace std
-#endif
-
-namespace cutlass {
-namespace platform {
-
-/// Forward Declaration
-template <class T>
-struct numeric_limits;
-
-/// Numeric limits
-template <>
-struct numeric_limits<cutlass::bfloat16_t> {
-  static bool const is_specialized = true;
-  static bool const is_signed = true;
-  static bool const is_integer = false;
-  static bool const is_exact = false;
-  static bool const has_infinity = true;
-  static bool const has_quiet_NaN = true;
-  static bool const has_signaling_NaN = false;
-#if !defined(__CUDACC_RTC__)
-  static std::float_denorm_style const has_denorm = std::denorm_present;
-#endif
-  static bool const has_denorm_loss = true;
-#if !defined(__CUDACC_RTC__)
-  static std::float_round_style const round_style = std::round_to_nearest;
-#endif
-  static bool const is_iec559 = false;
-  static bool const is_bounded = true;
-  static bool const is_modulo = false;
-  static int const digits = 7;
-
-  /// Least positive value
-  CUTLASS_HOST_DEVICE
-  static cutlass::bfloat16_t min() { return cutlass::bfloat16_t::bitcast(0x01); }
-
-  /// Minimum finite value
-  CUTLASS_HOST_DEVICE
-  static cutlass::bfloat16_t lowest() { return cutlass::bfloat16_t::bitcast(0xff7f); }
-
-  /// Maximum finite value
-  CUTLASS_HOST_DEVICE
-  static cutlass::bfloat16_t max() { return cutlass::bfloat16_t::bitcast(0x7f7f); }
-
-  /// Returns smallest finite value
-  CUTLASS_HOST_DEVICE
-  static cutlass::bfloat16_t epsilon() { return cutlass::bfloat16_t::bitcast(0x1000); }
-
-  /// Returns smallest finite value
-  CUTLASS_HOST_DEVICE
-  static cutlass::bfloat16_t round_error() { return cutlass::bfloat16_t(0.5f); }
-
-  /// Returns smallest finite value
-  CUTLASS_HOST_DEVICE
-  static cutlass::bfloat16_t infinity() { return cutlass::bfloat16_t::bitcast(0x7f80); }
-
-  /// Returns smallest finite value
-  CUTLASS_HOST_DEVICE
-  static cutlass::bfloat16_t quiet_NaN() { return cutlass::bfloat16_t::bitcast(0x7fff); }
-
-  /// Returns smallest finite value
-  CUTLASS_HOST_DEVICE
-  static cutlass::bfloat16_t signaling_NaN() { return cutlass::bfloat16_t::bitcast(0x7fff); }
-
-  /// Returns smallest finite value
-  CUTLASS_HOST_DEVICE
-  static cutlass::bfloat16_t denorm_min() { return cutlass::bfloat16_t::bitcast(0x1); }
-};
-
-} // namespace platform
-} // namespace cutlass
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Arithmetic operators
-//
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-CUTLASS_HOST_DEVICE
-bool operator==(bfloat16_t const& lhs, bfloat16_t const& rhs) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-  return __heq(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16());
-#else
-  return float(lhs) == float(rhs);
-#endif
-}
-
-CUTLASS_HOST_DEVICE
-bool operator!=(bfloat16_t const& lhs, bfloat16_t const& rhs) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-  return __hne(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16());
-#else
-  return float(lhs) != float(rhs);
-#endif
-}
-
-CUTLASS_HOST_DEVICE
-bool operator<(bfloat16_t const& lhs, bfloat16_t const& rhs) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-  return __hlt(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16());
-#else
-  return float(lhs) < float(rhs);
-#endif
-}
-
-CUTLASS_HOST_DEVICE
-bool operator<=(bfloat16_t const& lhs, bfloat16_t const& rhs) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-  return __hle(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16());
-#else
-  return float(lhs) <= float(rhs);
-#endif
-}
-
-CUTLASS_HOST_DEVICE
-bool operator>(bfloat16_t const& lhs, bfloat16_t const& rhs) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-  return __hgt(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16());
-#else
-  return float(lhs) > float(rhs);
-#endif
-}
-
-CUTLASS_HOST_DEVICE
-bool operator>=(bfloat16_t const& lhs, bfloat16_t const& rhs) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-  return __hge(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16());
-#else
-  return float(lhs) >= float(rhs);
-#endif
-}
-
-CUTLASS_HOST_DEVICE
-bfloat16_t operator+(bfloat16_t const& lhs, bfloat16_t const& rhs) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-  return bfloat16_t(__hadd(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16()));
-#else
-  return bfloat16_t(float(lhs) + float(rhs));
-#endif
-}
-
-CUTLASS_HOST_DEVICE
-bfloat16_t operator-(bfloat16_t const& lhs) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-  return bfloat16_t(__hneg(lhs.to_nv_bfloat16()));
-#else
-  return bfloat16_t(-float(lhs));
-#endif
-}
-
-CUTLASS_HOST_DEVICE
-bfloat16_t operator-(bfloat16_t const& lhs, bfloat16_t const& rhs) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-  return bfloat16_t(__hsub(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16()));
-#else
-  return bfloat16_t(float(lhs) - float(rhs));
-#endif
-}
-
-CUTLASS_HOST_DEVICE
-bfloat16_t operator*(bfloat16_t const& lhs, bfloat16_t const& rhs) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-  return bfloat16_t(__hmul(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16()));
-#else
-  return bfloat16_t(float(lhs) * float(rhs));
-#endif
-}
-
-CUTLASS_HOST_DEVICE
-bfloat16_t operator/(bfloat16_t const& lhs, bfloat16_t const& rhs) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-  return bfloat16_t(__hdiv(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16()));
-#else
-  return bfloat16_t(float(lhs) / float(rhs));
-#endif
-}
-
-CUTLASS_HOST_DEVICE
-bfloat16_t& operator+=(bfloat16_t & lhs, bfloat16_t const& rhs) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-  lhs = bfloat16_t(__hadd(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16()));
-#else
-  lhs = bfloat16_t(float(lhs) + float(rhs));
-#endif
-  return lhs;
-}
-
-CUTLASS_HOST_DEVICE
-bfloat16_t& operator-=(bfloat16_t & lhs, bfloat16_t const& rhs) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-  lhs = bfloat16_t(__hsub(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16()));
-#else
-  lhs = bfloat16_t(float(lhs) - float(rhs));
-#endif
-  return lhs;
-}
-
-CUTLASS_HOST_DEVICE
-bfloat16_t& operator*=(bfloat16_t & lhs, bfloat16_t const& rhs) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-  lhs = bfloat16_t(__hmul(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16()));
-#else
-  lhs = bfloat16_t(float(lhs) * float(rhs));
-#endif
-  return lhs;
-}
-
-CUTLASS_HOST_DEVICE
-bfloat16_t& operator/=(bfloat16_t & lhs, bfloat16_t const& rhs) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-  lhs = bfloat16_t(__hdiv(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16()));
-#else
-  lhs = bfloat16_t(float(lhs) / float(rhs));
-#endif
-  return lhs;
-}
-
-CUTLASS_HOST_DEVICE
-bfloat16_t& operator++(bfloat16_t & lhs) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-  lhs = bfloat16_t(__hadd(lhs.to_nv_bfloat16(), bfloat16_t(1.0f).to_nv_bfloat16()));
-#else
-  float tmp(lhs);
-  ++tmp;
-  lhs = bfloat16_t(tmp);
-#endif
-  return lhs;
-}
-
-CUTLASS_HOST_DEVICE
-bfloat16_t& operator--(bfloat16_t & lhs) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-  lhs = bfloat16_t(__hsub(lhs.to_nv_bfloat16(), bfloat16_t(1.0f).to_nv_bfloat16()));
-#else
-  float tmp(lhs);
-  --tmp;
-  lhs = bfloat16_t(tmp);
-#endif
-  return lhs;
-}
-
-CUTLASS_HOST_DEVICE
-bfloat16_t operator++(bfloat16_t & lhs, int) {
-  bfloat16_t ret(lhs);
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-  lhs = bfloat16_t(__hadd(lhs.to_nv_bfloat16(), bfloat16_t(1.0f).to_nv_bfloat16()));
-#else
-  float tmp(lhs);
-  tmp++;
-  lhs = bfloat16_t(tmp);
-#endif
-  return ret;
-}
-
-CUTLASS_HOST_DEVICE
-bfloat16_t operator--(bfloat16_t & lhs, int) {
-  bfloat16_t ret(lhs);
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-  lhs = bfloat16_t(__hsub(lhs.to_nv_bfloat16(), bfloat16_t(1.0f).to_nv_bfloat16()));
-#else
-  float tmp(lhs);
-  tmp--;
-  lhs = bfloat16_t(tmp);
-#endif
-  return ret;
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-//
-// User-defined literals
-//
-
-CUTLASS_HOST_DEVICE
-cutlass::bfloat16_t operator "" _bf16(long double x) {
-  return cutlass::bfloat16_t(float(x));
-}
-
-CUTLASS_HOST_DEVICE
-cutlass::bfloat16_t operator "" _bf16(unsigned long long int x) {
-  return cutlass::bfloat16_t(int(x));
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/blas3.h b/lightllm-kernel/cutlass/include/cutlass/blas3.h
deleted file mode 100755
index ee5587d1c..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/blas3.h
+++ /dev/null
@@ -1,143 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-  \brief Basic include for CUTLASS BLAS3/HPC code.
-    
-  
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/blas3_types.h"
-#include "cutlass/coord.h"
-#include "cutlass/complex.h"
-#include "cutlass/functional.h"
-#include "cutlass/numeric_types.h"
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines FillMode inversions
-template <FillMode kFillMode>
-struct InvertFillMode;
-
-/// Invert FillMode lower to upper
-template <>
-struct InvertFillMode<FillMode::kLower> {
-  static FillMode const mode = FillMode::kUpper;
-};
-
-/// Invert FillMode upper to lower
-template <>
-struct InvertFillMode<FillMode::kUpper> {
-  static FillMode const mode = FillMode::kLower;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Defines SideMode inversions
-template <SideMode kSideMode>
-struct InvertSideMode;
-
-/// Invert SideMode left to right
-template <>
-struct InvertSideMode<SideMode::kLeft> {
-  static SideMode const mode = SideMode::kRight;
-};
-
-/// Invert SideMode right to left
-template <>
-struct InvertSideMode<SideMode::kRight> {
-  static SideMode const mode = SideMode::kLeft;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Defines correct compare operation for Triangular matrix boundary
-template <FillMode kFillMode, DiagType kDiagType = DiagType::kNonUnit>
-struct TrMatrixCompareOp {
-  using Index = int32_t;
-  using Type = typename platform::conditional<
-                        (kFillMode == FillMode::kLower), 
-                        greater_equal<Index>, 
-                        less_equal<Index>>::type;
-};
-
-template <FillMode kFillMode>
-struct TrMatrixCompareOp <kFillMode, DiagType::kUnit> {
-   using Index = int32_t;
-   using Type = typename platform::conditional<
-                        (kFillMode == FillMode::kLower), 
-                        greater_equal<Index>, 
-                        less_equal<Index>>::type;
-};
-
-template <FillMode kFillMode>
-struct TrMatrixCompareOp <kFillMode, DiagType::kZero> {
-   using Index = int32_t;
-   using Type = typename platform::conditional<
-                        (kFillMode == FillMode::kLower), 
-                        greater<Index>, 
-                        less<Index>>::type;
-};
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Returns precision in terms of bits (based on datatype) to fill tensors with.
-// Defaults to 5 bits of mantissa for TF32 and FP32 (with implicit round-offs).
-// Also defines acceptable mantissa result variance/error.
-template <typename Element>
-struct MantissaInBits {
-  static int constexpr bits = 5;
-  static double constexpr error = 1.0e-7;
-};
-
-// Full precision is supported for FP64
-template <>
-struct MantissaInBits<double> {
-  static int constexpr bits = 30;
-  static double constexpr error = 1.0e-15;
-};
-
-template <>
-struct MantissaInBits<cutlass::complex<double>> {
-  static int constexpr bits = 30;
-  static double constexpr error = 1.0e-15;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/blas3_types.h b/lightllm-kernel/cutlass/include/cutlass/blas3_types.h
deleted file mode 100755
index 653b93b77..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/blas3_types.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#pragma once
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Enumerated type describing the type of kernel (based on input or output matrices).
-enum class BlasMode {
-  kGemm,
-  kSymmetric,
-  kHermitian,
-  kTriangular,
-  kInvalid
-};
-
-/// Enumerated type describing the fill mode for matrices for BLAS functions.
-enum class FillMode {
-  kFull,              /// The entire tensor is covered.
-  kLower,             /// The 'lower' part of a tensor is covered including diagonal
-  kUpper,             /// The 'upper' part of a tensor is covered including diaognal
-  kDiagonal,          /// Only diagonal elements are covered.
-  kNone,              /// No element is covered.
-  kInvalid
-};
-
-/// Enumerated type describing the diagonal property of matrices for BLAS functions.
-enum class DiagType {
-  kNonUnit,
-  kUnit,
-  kZero, // Only used internally for computing SYMM/HEMM
-  kInvalid
-}; 
-
-/// Enumerated type describing the side dense matrix is in matrix equation for BLAS functions.
-enum class SideMode {
-  kLeft,
-  kRight,
-  kInvalid
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/block_striped.h b/lightllm-kernel/cutlass/include/cutlass/block_striped.h
deleted file mode 100755
index 09f3fb04f..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/block_striped.h
+++ /dev/null
@@ -1,267 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Utilities for performing block-striped access (load, store, reduce) of trivially-copyable,
-    statically-sized array types to global memory.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/wmma_array.h"
-#include "cutlass/functional.h"
-#include "cutlass/complex.h"
-
-namespace cutlass {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// AccessWidth
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Computes the maximal power-of-two that evenly divides the size of T, capped at Limit
-template <
-  typename T,
-  int Limit>
-struct AccessWidth
-{
-  // Inductive case
-  template <
-      int ObjectBytes,        /// Size of T in bytes
-      int AlignBytes,         /// Template induction variable
-      bool IsAligned  =       /// Whether ObjectBytes is an even multiple of AlignBytes
-        ((AlignBytes <= Limit) &&  (ObjectBytes % AlignBytes == 0))>
-  struct Detail
-  {
-      static const int value = Detail<ObjectBytes, AlignBytes * 2>::value;
-  };
-
-  // Base case (ObjectBytes is not an even multiple of AlignBytes)
-  template <
-      int ObjectBytes,        /// Size of T in bytes
-      int AlignBytes>         /// Template induction variable
-  struct Detail<ObjectBytes, AlignBytes, false>
-  {
-      static const int value = AlignBytes / 2;
-  };
-
-  /// The maximal power-of-two that evenly divides the size of T
-  static const int value = Detail<
-    (int) sizeof(T),
-    1>::value;
-};
-
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// StripedAccessType
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// ReinterpretCast type for striping a trivially-copyable type in global memory
-/// (Default specialization.  Striping granularity is type T.)
-template <
-    typename T,           /// Data type
-    int TransferBytes =   /// Data access width (16 byte max for global memory access on current architectures)
-      AccessWidth<T, 16>::value>
-struct alignas(TransferBytes) StripedAccessType : public T
-{};
-
-
-/// ReinterpretCast type for striping a trivially-copyable type in global memory
-/// (Specialization for cutlass::Array<T>.  Striping granularity is a multiple of T.)
-template <
-    typename T,           /// Array element type
-    int N,                /// Number of elements in array
-    bool RegisterSized,   /// T is register-sized
-    int TransferBytes>    /// Data access width
-struct StripedAccessType<
-    Array<T, N, RegisterSized>,
-    TransferBytes>
-: public AlignedArray<
-            T,                                                  // Element type of StripedAccessType
-            __NV_STD_MAX(1, TransferBytes / (int) sizeof(T)),   // Number of elements T in StripedAccessType
-            TransferBytes>                                      // Alignment of StripedAccessType
-{};
-
-
-#if defined(CUTLASS_ARCH_WMMA_ENABLED)
-
-/// ReinterpretCast type for striping a trivially-copyable type in global memory
-/// (Specialization for cutlass::WmmaFragmentArray<T>.  Striping granularity is a multiple of T.)
-template<
-    typename Use,
-    int m,
-    int n,
-    int k,
-    typename ElementT,
-    typename Layout,
-    int kFragments,
-    int TransferBytes>
-struct StripedAccessType<
-    WmmaFragmentArray<nvcuda::wmma::fragment<Use, m, n, k, ElementT, Layout>, kFragments>,
-    TransferBytes>
-: public AlignedArray<
-            ElementT,
-            __NV_STD_MAX(1, TransferBytes / (int) sizeof(ElementT)),
-            TransferBytes>
-{};
-
-#endif // if defined(CUTLASS_ARCH_WMMA_ENABLED)
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// BlockStriped
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Utility for performing block-striped access (load, store) of trivially-copyable,
-/// statically-sized array types to global memory
-template <
-  int BlockThreads,
-  typename ArrayT,
-  typename AccessT = StripedAccessType<ArrayT> >
-struct BlockStriped
-{
-  /// Number of striped accesses
-  static const int kStripes = int(sizeof(ArrayT) / sizeof(AccessT));
-  static_assert(kStripes > 0, "AccessT type must be smaller than or equal to ArrayT type");
-
-  /// Load
-  CUTLASS_DEVICE
-  static void load(ArrayT &data, ArrayT *ptr, int thread_idx)
-  {
-    AccessT *access_input = reinterpret_cast<AccessT*>(ptr);
-    AccessT *access_data = reinterpret_cast<AccessT*>(&data);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kStripes; ++i) {
-      access_data[i] = access_input[(BlockThreads * i) + thread_idx];
-    }
-  }
-
-  /// Load & Add
-  CUTLASS_DEVICE
-  static void load_add(ArrayT &data, ArrayT *ptr, int thread_idx)
-  {
-    AccessT *access_input = reinterpret_cast<AccessT*>(ptr);
-    AccessT *access_data = reinterpret_cast<AccessT*>(&data);
-
-    plus<AccessT> add;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kStripes; ++i)
-    {
-      access_data[i] = add(access_data[i], access_input[(BlockThreads * i) + thread_idx]);
-    }
-  }
-
-  /// Store
-  CUTLASS_DEVICE
-  static void store(ArrayT *ptr, const ArrayT &data, int thread_idx)
-  {
-    AccessT *access_output = reinterpret_cast<AccessT*>(ptr);
-    const AccessT *access_data = reinterpret_cast<const AccessT*>(&data);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kStripes; ++i) {
-      access_output[(BlockThreads * i) + thread_idx] = access_data[i];
-    }
-  }
-
-};
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// BlockStripedReduce
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-/// Utility for performing block-striped access (load, store, reduce) of trivially-copyable,
-/// statically-sized array types to global memory.
-/// (Default specialization)
-template <
-  int BlockThreads,
-  typename ArrayT,
-  typename ElementT = typename StripedAccessType<ArrayT>::Element>
-struct BlockStripedReduce :
-  BlockStriped<
-    BlockThreads,
-    ArrayT,
-    ElementT>
-{
-  /// Reduce
-  CUTLASS_DEVICE
-  static void reduce(ArrayT *ptr, const ArrayT &data, int thread_idx)
-  {
-    cutlass::atomic_add<ElementT> reduce;
-    ElementT *access_output = reinterpret_cast<ElementT*>(ptr);
-    const ElementT *access_data = reinterpret_cast<const ElementT*>(&data);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < BlockStripedReduce::kStripes; ++i) {
-      reduce(access_output + (BlockThreads * i) + thread_idx, access_data[i]);
-    }
-  }
-};
-
-
-/// Utility for performing block-striped access (load, store, reduce) of trivially-copyable,
-/// statically-sized array types to global memory.
-/// (Specialization for half_t.  Uses half2 vectorized-reduction.)
-template <
-  int BlockThreads,
-  typename ArrayT>
-struct BlockStripedReduce<BlockThreads, ArrayT, half_t> :
-  BlockStriped<
-    BlockThreads,
-    ArrayT,
-    half2>
-{
-  static_assert(BlockStripedReduce::kStripes % 2 == 0, "Array of half must be even number in length");
-
-  /// Reduce
-  CUTLASS_DEVICE
-  static void reduce(ArrayT *ptr, const ArrayT &data, int thread_idx)
-  {
-    cutlass::atomic_add<half2> reduce;
-    half2 *access_output = reinterpret_cast<half2*>(ptr);
-    const half2 *access_data = reinterpret_cast<const half2*>(&data);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < BlockStripedReduce::kStripes; ++i)
-    {
-      reduce(access_output + (BlockThreads * i) + thread_idx, access_data[i]);
-    }
-  }
-};
-
-
-} // namespace cutlass
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/cluster_launch.hpp b/lightllm-kernel/cutlass/include/cutlass/cluster_launch.hpp
deleted file mode 100755
index a0fa22b6b..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/cluster_launch.hpp
+++ /dev/null
@@ -1,275 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief PTX for TMA Tensor Memory Access operators on memory added for SM90
-*/
-
-#pragma once
-
-#include <cuda_runtime_api.h>
-#include "cutlass/cutlass.h"
-#include "cutlass/trace.h"
-#if defined(__CUDACC_RTC__)
-#include <cuda/std/type_traits>
-#else
-#include <type_traits>
-#include <cstdio>
-#endif
-
-#if ((__CUDACC_VER_MAJOR__ >= 12) || ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 8)))
-#  define CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED
-#endif
-
-namespace cutlass {
-
-#ifndef NDEBUG
-#define Return_Status(cudaError_t_status)            \
-  if (cudaError_t_status != cudaSuccess) {           \
-    fprintf(stderr,                                  \
-            "[ ERROR: CUDA Runtime ] %s:%d: %s\n",   \
-            __FILE__,                                \
-            __LINE__,                                \
-            cudaGetErrorString(cudaError_t_status)); \
-    return Status::kInvalid;                         \
-  } else {                                           \
-    return Status::kSuccess;                         \
-  }
-#else
-#define Return_Status(cudaError_t_status)          \
-  if (cudaError_t_status != cudaSuccess) {         \
-    return Status::kInvalid;                       \
-  } else {                                         \
-    return Status::kSuccess;                       \
-  }
-#endif
-
-struct ClusterLauncher {
-  constexpr static int MaxClusterSize = 32;
-
-  // Check for hardware compatibility
-  static inline CUTLASS_HOST
-  Status check_cluster_dims(dim3 grid, dim3 cluster) {
-    if (((cluster.x * cluster.y * cluster.z) <= MaxClusterSize) &&
-        (grid.x % cluster.x == 0) && (grid.y % cluster.y == 0) && (grid.z % cluster.z == 0)) {
-      return Status::kSuccess;
-    }
-    else {
-      CUTLASS_TRACE_HOST("ClusterLauncher: Invalid cluster configuration -- aborting launch.");
-      return Status::kInvalid;
-    }
-  }
-
-  static inline CUTLASS_HOST
-  Status
-#if defined(CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED)
-  init(void const* kernel_function)
-#else
-  init(void const* /* kernel_function */)
-#endif
-  {
-#if defined(CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED)
-#if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1)
-    if (kernel_function == nullptr) {
-      CUTLASS_TRACE_HOST("kernel_function is null");
-      return Status::kInvalid;
-    }
-    CUTLASS_TRACE_HOST("Checking previous error state before calling cudaFuncSetAttribute");
-    cudaError_t prevStatus = cudaGetLastError();
-    if (prevStatus != cudaSuccess) {
-      fprintf(stderr,
-              "[ ERROR: CUDA Runtime ] %s:%d: %s\n",
-              __FILE__,
-              __LINE__,
-              cudaGetErrorString(prevStatus));
-      return Status::kInvalid;
-    }
-    CUTLASS_TRACE_HOST("Calling cudaFuncSetAttribute");
-#endif
-    // This attribute was added in CUDA 11.8.
-    cudaError_t status =
-        cudaFuncSetAttribute(
-          kernel_function, cudaFuncAttributeNonPortableClusterSizeAllowed, 1);
-    Return_Status(status);
-#else
-    return Status::kInvalid;
-#endif
-  }
-
-  // This is the method we expect to use going forward
-  static inline CUTLASS_HOST
-  Status launch(
-      dim3 const grid_dims,
-      dim3 const cluster_dims,
-      dim3 const block_dims,
-      size_t const smem_size,
-      cudaStream_t cuda_stream,
-      void const* kernel,
-      void** kernel_params,
-      bool launch_with_pdl = false) {
-#if defined(CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED)
-    if (check_cluster_dims(grid_dims, cluster_dims) != Status::kSuccess) {
-      CUTLASS_TRACE_HOST("ClusterLauncher: check_cluster_dims() failed. Aborting.");
-      return Status::kInvalid;
-    }
-
-    auto init_status = init(kernel);
-    if (init_status != Status::kSuccess) {
-      CUTLASS_TRACE_HOST("ClusterLauncher: init(kernel) failed with status " << int(init_status) << ". Aborting.");
-      return Status::kInvalid;
-    }
-
-    cudaLaunchConfig_t launch_config;
-    launch_config.gridDim = {grid_dims.x, grid_dims.y, grid_dims.z};
-    launch_config.blockDim = {block_dims.x, block_dims.y, block_dims.z};
-    launch_config.dynamicSmemBytes = smem_size;
-    launch_config.stream = cuda_stream;
-
-    cudaLaunchAttribute launch_attribute[2];
-
-    launch_attribute[0].id = cudaLaunchAttributeClusterDimension;
-    launch_attribute[0].val.clusterDim.x = cluster_dims.x;
-    launch_attribute[0].val.clusterDim.y = cluster_dims.y;
-    launch_attribute[0].val.clusterDim.z = cluster_dims.z;
-
-    launch_attribute[1].id = cudaLaunchAttributeProgrammaticStreamSerialization;
-    launch_attribute[1].val.programmaticStreamSerializationAllowed = 1;
-
-    launch_config.numAttrs = launch_with_pdl ? 2 : 1;
-
-    launch_config.attrs = launch_attribute;
-
-    CUTLASS_TRACE_HOST("ClusterLauncher: Launching GPC_CLUSTER_GRID GridDims = "
-        "(" << grid_dims.x << ", " << grid_dims.y << ", " << grid_dims.z << "), "
-        "And ClusterDims = "
-        "(" << cluster_dims.x << ", " << cluster_dims.y << ", " << cluster_dims.z << ")\n");
-
-    cutlass::arch::synclog_setup();
-    cudaError_t status = cudaLaunchKernelExC(&launch_config, kernel, kernel_params);
-    Return_Status(status);
-#else
-    CUTLASS_TRACE_HOST("ClusterLauncher: CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED not defined! Aborting cluster launch.");
-    return Status::kInvalid;
-#endif
-  }
-
-};
-
-namespace detail {
-
-template<class Arg>
-void* checked_addressof(Arg&& arg) {
-  static_assert(! std::is_rvalue_reference_v<Arg> || ! std::is_const_v<Arg>, "You cannot take the address of a const rvalue reference (const T&&).");
-  // We use std::addressof to ensure we get the address,
-  // in case the type has an overloaded operator&.
-  // Note that this precludes `const T&&` references.
-  return const_cast<void*>(reinterpret_cast<void const*>(std::addressof(arg)));
-}
-
-} // namespace detail
-
-//! Parameters for launch_on_cluster (see below).
-struct ClusterLaunchParams {
-  //! Grid dimensions
-  dim3 grid_dims{1, 1, 1};
-
-  //! Block dimensions
-  dim3 block_dims{1, 1, 1};
-
-  //! Cluster dimensions
-  dim3 cluster_dims{1, 1, 1};
-
-  //! Number of bytes required for the kernel's shared memory.
-  int smem_size_in_bytes = 0;
-
-  //! CUDA stream on which to launch the kernel.
-  cudaStream_t cuda_stream = nullptr;
-};
-
-/// @brief Launch the kernel on the stream using cluster launch.
-///
-/// @param params Cluster launch parameters (see above).
-/// @param kernel_ptr Pointer to the kernel function (see example).
-/// @param args Zero or more arguments to pass to the kernel.
-///
-/// @tparam Args Types of the arguments passed to the kernel.
-///   Don't specify this/these template argument(s) explicitly.
-///
-/// @return Status::Success on success, else an error code.
-///
-/// @code
-/// template<class SharedMemoryType, class A, class B, class C>
-/// __global__ void kernel(A a, B b, C c);
-///
-/// X x = get_x();
-/// Y y = get_y();
-/// Z z = get_z();
-///
-/// void const* kernel_ptr =
-///   const_cast<void const*>(reinterpret_cast<void*>(
-///     &kernel<SharedMemory, X, Y, Z>));
-/// auto status = launch_kernel_on_cluster(
-///   {grid_dims, block_dims, cluster_dims, sizeof(SharedMemory)},
-///   kernel_ptr, x, y, z);
-/// @endcode
-template<class ... Args>
-CUTLASS_HOST cutlass::Status
-launch_kernel_on_cluster(const ClusterLaunchParams& params,
-  void const* kernel_ptr,
-  Args&& ... args)
-{
-  // Unfortunately, we find ourselves needing to pass in
-  // the parameters as an array of raw pointers.
-  if constexpr (sizeof...(Args) == 0) {
-    return cutlass::ClusterLauncher::launch(
-      params.grid_dims,
-      params.cluster_dims,
-      params.block_dims,
-      params.smem_size_in_bytes,
-      params.cuda_stream,
-      kernel_ptr, nullptr);
-  }
-  else {
-    void* kernel_params[sizeof...(Args)] = {
-      detail::checked_addressof(std::forward<Args>(args))...
-    };
-    return cutlass::ClusterLauncher::launch(
-      params.grid_dims,
-      params.cluster_dims,
-      params.block_dims,
-      params.smem_size_in_bytes,
-      params.cuda_stream,
-      kernel_ptr,
-      kernel_params);
-  }
-}
-
-}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/complex.h b/lightllm-kernel/cutlass/include/cutlass/complex.h
deleted file mode 100755
index 6d0bf31df..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/complex.h
+++ /dev/null
@@ -1,823 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#pragma once
-
-#include <cuComplex.h>
-
-#include <cuda_fp16.h>
-
-#if defined(__CUDACC_RTC__)
-#include <cuda/std/cstdint>
-#else
-#include <cstdint>
-#endif
-
-#include "cutlass/cutlass.h"
-#include "cutlass/functional.h"
-#include "cutlass/platform/platform.h"
-#include "cutlass/real.h"
-
-#include "cutlass/numeric_types.h"
-
-#include "cutlass/fast_math.h"
-
-#if !defined(__CUDACC_RTC__)
-#include <iosfwd>
-#endif
-
-namespace cutlass {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Enumeraed type describing a transformation on a complex value.
-enum class ComplexTransform {
-  kNone,
-  kConjugate
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Defines ComplexTransform inversions
-template <ComplexTransform kTransform>
-struct InvertComplexTransform;
-
-/// Invert ComplexTransform from kNone to kConjugate
-template <>
-struct InvertComplexTransform<ComplexTransform::kNone> {
-  static ComplexTransform const transform = ComplexTransform::kConjugate;
-};
-
-/// Invert ComplexTransform from kConjugate to kNone
-template <>
-struct InvertComplexTransform<ComplexTransform::kConjugate> {
-  static ComplexTransform const transform = ComplexTransform::kNone;
-};
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////////////////////////
-
-//
-// Accessors for CUDA complex types
-//
-
-#if !defined(__CUDACC_RTC__)
-/// Returns the real part of the complex number
-CUTLASS_HOST_DEVICE
-float const &real(cuFloatComplex const &z) { return z.x; }
-
-/// Returns the real part of the complex number
-CUTLASS_HOST_DEVICE
-float &real(cuFloatComplex &z) { return z.x; }
-
-/// Returns the real part of the complex number
-CUTLASS_HOST_DEVICE
-double const &real(cuDoubleComplex const &z) { return z.x; }
-
-/// Returns the real part of the complex number
-CUTLASS_HOST_DEVICE
-double &real(cuDoubleComplex &z) { return z.x; }
-
-/// Returns the imaginary part of the complex number
-CUTLASS_HOST_DEVICE
-float const &imag(cuFloatComplex const &z) { return z.y; }
-
-/// Returns the imaginary part of the complex number
-CUTLASS_HOST_DEVICE
-float &imag(cuFloatComplex &z) { return z.y; }
-
-/// Returns the imaginary part of the complex number
-CUTLASS_HOST_DEVICE
-double const &imag(cuDoubleComplex const &z) { return z.y; }
-
-/// Returns the imaginary part of the complex number
-CUTLASS_HOST_DEVICE
-double &imag(cuDoubleComplex &z) { return z.y; }
-
-// Returns the conjugate of the complex number
-CUTLASS_HOST_DEVICE cuFloatComplex
-conj(cuFloatComplex const& z) {
-  return make_cuFloatComplex(z.x, -z.y);
-}
-
-// Returns the conjugate of the complex number
-CUTLASS_HOST_DEVICE cuDoubleComplex
-conj(cuDoubleComplex const& z) {
-  return make_cuDoubleComplex(z.x, -z.y);
-}
-#endif
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Class for representing and manipulating complex numbers with conversions from built-in CUDA
-/// complex types.
-
-template <typename T>
-class complex
-{
- public:
-  /// Type alias for scalar type
-  using value_type = T;
-
- private:
-  //
-  // Data members
-  //
-
-  /// Real part
-  T _real;
-
-  /// Imaginary part
-  T _imag;
-
- public:
-
-//
-// Methods
-//
-
-  /// Default constructor
-  complex() = default;
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  complex(T r) : _real(r), _imag(T(0)) {}
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  complex(T r, T i) : _real(r), _imag(i) {}
-
-  /// Constructor
-  template<typename A>
-  CUTLASS_HOST_DEVICE
-  complex(complex<A> const &z) : _real(static_cast<T>(z.real())), _imag(static_cast<T>(z.imag())) {}
-
-
-  #if !defined(__CUDACC_RTC__)
-  /// Conversion from cuFloatComplex
-  CUTLASS_HOST_DEVICE
-  complex(cuFloatComplex const &z) : _real(static_cast<T>(cuCrealf(z))), _imag(static_cast<T>(cuCimagf(z))) {}
-
-  /// Conversion from cuDoubleComplex
-  CUTLASS_HOST_DEVICE
-  complex(cuDoubleComplex const &z) : _real(static_cast<T>(cuCreal(z))), _imag(static_cast<T>(cuCimag(z))) {}
-  #endif
-
-  /// Equality operator
-  CUTLASS_HOST_DEVICE bool operator==(complex<T> const &rhs) const {
-    return this->real() == rhs.real() && this->imag() == rhs.imag();
-  }
-
-  /// Inequality operator
-  CUTLASS_HOST_DEVICE bool operator!=(complex<T> const &rhs) const {
-    return !(*this == rhs);
-  }
-
-  /// Addition
-    template <typename A>
-  CUTLASS_HOST_DEVICE complex<T> operator+(complex<A> const &rhs) const {
-    return complex<T>(this->real() + rhs.real(), this->imag() + rhs.imag());
-  }
-
-  /// Reduction into memory address.  Components may update out of order.
-  template <typename OtherT>
-  CUTLASS_DEVICE void red(complex<OtherT> *ptr) const {
-    static_assert(platform::is_same<T, OtherT>::value, "Component type must match");
-    cutlass::atomic_add<T> reduce;
-    reduce(&ptr->_real, _real);
-    reduce(&ptr->_imag, _imag);
-  }
-
-  /// Reduction into memory address.  Components may update out of order.  (Half specialization)
-  CUTLASS_DEVICE void red(complex<half_t> *ptr) const {
-    static_assert(platform::is_same<T, half_t>::value, "Component type must match");
-    half2 *h2_ptr = reinterpret_cast<half2*>(ptr);
-    half2 h2_data = reinterpret_cast<half2&>(*this);
-    cutlass::atomic_add<half2> reduce;
-    reduce(h2_ptr, h2_data);
-  }
-
-  /// Subtraction
-    template <typename A>
-  CUTLASS_HOST_DEVICE complex<T> operator-(complex<A> const &rhs) const {
-    return complex<T>(this->real() - rhs.real(), this->imag() - rhs.imag());
-  }
-
-  /// Multiplication
-    template <typename A>
-  CUTLASS_HOST_DEVICE complex<T> operator*(complex<A> const &rhs) const {
-    return complex<T>(this->real() * rhs.real() - this->imag() * rhs.imag(),
-                      this->real() * rhs.imag() + this->imag() * rhs.real());
-  }
-
-  /// Scalar Multiplication
-    template <typename A>
-  CUTLASS_HOST_DEVICE complex<T> operator*(A const &s) const {
-    return complex<T>(this->real() * s, this->imag() * s);
-  }
-
-  /// Division
-    template <typename A>
-  CUTLASS_HOST_DEVICE complex<T> operator/(complex<A> const &rhs) const {
-    T d = T(rhs.real() * rhs.real() + rhs.imag() * rhs.imag());
-
-    return complex<T>(
-      (real() * rhs.real() + imag() * rhs.imag()) / d,
-      (imag() * rhs.real() - real() * rhs.imag()) / d
-    );
-  }
-
-  /// Scalar Division
-    template <typename A>
-  CUTLASS_HOST_DEVICE complex<T> operator/(A const &s) const {
-    return complex<T>(this->real() / s, this->imag() / s);
-  }
-
-  /// Addition
-    template <typename A>
-  CUTLASS_HOST_DEVICE complex<T> &operator+=(complex<A> const &rhs) {
-      *this = *this + rhs;
-      return *this;
-  }
-
-  /// Subtraction
-  template <typename A>
-  CUTLASS_HOST_DEVICE complex<T> &operator-=(complex<A> const &rhs) {
-      *this = *this - rhs;
-      return *this;
-  }
-
-  /// Multiplication
-  template <typename A>
-  CUTLASS_HOST_DEVICE complex<T> &operator*=(complex<A> const &rhs) {
-      *this = *this * rhs;
-      return *this;
-  }
-
-  /// Scalar multiplication
-  template <typename A>
-  CUTLASS_HOST_DEVICE complex<T> &operator*=(A s) {
-      *this = *this * s;
-      return *this;
-  }
-
-  /// Division
-  template <typename A>
-  CUTLASS_HOST_DEVICE complex<T> &operator/=(complex<A> const &rhs) {
-      *this = *this / rhs;
-      return *this;
-  }
-
-  /// Accesses the real part of the complex number
-  CUTLASS_HOST_DEVICE
-  T const &real() const { return _real; }
-
-  /// Accesses the real part of the complex number
-  CUTLASS_HOST_DEVICE
-  T &real() { return _real; }
-
-  /// Accesses the imaginary part of the complex number
-  CUTLASS_HOST_DEVICE
-  T const &imag() const { return _imag; }
-
-  /// Accesses the imaginary part of the complex number
-  CUTLASS_HOST_DEVICE
-  T &imag() { return _imag; }
-
-  /// Set the real part of the complex number
-  CUTLASS_HOST_DEVICE
-  void real(T real) { _real = real; }
-
-  /// Set the imaginary part of the complex number
-  CUTLASS_HOST_DEVICE
-  void imag(T imag) { _imag = imag; }
-
-  #if !defined(__CUDACC_RTC__)
-  /// Converts to cuFloatComplex
-  CUTLASS_HOST_DEVICE
-  explicit operator cuFloatComplex() const { return make_cuFloatComplex(float(real()), float(imag())); }
-
-  /// Converts to cuDoubleComplex
-  CUTLASS_HOST_DEVICE
-  explicit operator cuDoubleComplex() const { return make_cuDoubleComplex(real(), imag()); }
-  #endif
-};
-
-// Complex conjugate
-template<class T>
-CUTLASS_HOST_DEVICE complex<T> conj(complex<T> const& z) {
-  return {z.real(), -z.imag()};
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-//
-// Accessors for complex template
-//
-
-// Nonmember real and imag need to work for non-complex numbers too.
-// That means cutlass::complex, std::complex, cuda::std::complex, and
-// any user-defined complex number type that looks like std::complex.
-// It's reasonable to assume that a "complex number type" has
-// zero-argument real() and imag() member functions returning
-// non-void.  While cuFloatComplex and cuDoubleComplex lack those
-// member functions, one-argument nonmember real and imag overloads
-// for those types are defined above.
-
-namespace detail {
-
-template <typename T, typename Enable = void>
-struct has_zero_argument_real_member_function :
-  cutlass::platform::false_type
-{};
-
-template <typename T>
-struct has_zero_argument_real_member_function<T,
-  cutlass::platform::enable_if_t<
-    ! cutlass::platform::is_void_v<
-      decltype(cutlass::platform::declval<T>().real())
-    >
-  >
-> : cutlass::platform::true_type
-{};
-
-template <typename T>
-constexpr bool has_zero_argument_real_member_function_v =
-  has_zero_argument_real_member_function<T>::value;
-
-template <typename T, typename Enable = void>
-struct has_zero_argument_imag_member_function :
-  cutlass::platform::false_type
-{};
-
-template <typename T>
-struct has_zero_argument_imag_member_function<T,
-  cutlass::platform::enable_if_t<
-    ! cutlass::platform::is_void_v<
-      decltype(cutlass::platform::declval<T>().imag())
-    >
-  >
-> : cutlass::platform::true_type
-{};
-
-template <typename T>
-constexpr bool has_zero_argument_imag_member_function_v =
-  has_zero_argument_imag_member_function<T>::value;
-
-} // namespace detail
-
-template<typename T>
-CUTLASS_HOST_DEVICE auto real(T z) {
-  if constexpr (detail::has_zero_argument_real_member_function_v<T>) {
-    return z.real();
-  } else {
-    return z;
-  }
-}
-  
-template<typename T>
-CUTLASS_HOST_DEVICE auto imag(T z) {
-  if constexpr (detail::has_zero_argument_imag_member_function_v<T>) {
-    return z.imag();
-  } else {
-    // Imaginary part of a non-complex input has the same type as the
-    // input, and its value is zero.  CUTLASS assumes in this case
-    // that value-initializing T is well-formed and results in zero.
-    return T{};
-  }
-}
-  
-//
-// Output operators
-//
-
-#if !defined(__CUDACC_RTC__)
-template <typename T>
-std::ostream &operator<<(std::ostream &out, complex<T> const &z) {
-  T _r = real(z);
-  T _i = imag(z);
-
-  if (bool(_i)) {
-    return out << _r << "+i" << _i;
-  }
-  return out << _r;
-}
-#endif
-
-//
-// Non-member operators defined for complex types
-//
-
-
-//
-// Non-member functions defined for complex numbers
-//
-
-// abs returns the magnitude of the complex number.
-
-CUTLASS_HOST_DEVICE float abs(complex<float> const &z) {
-  return ::hypot(z.real(), z.imag());
-}
-
-CUTLASS_HOST_DEVICE double abs(complex<double> const &z) {
-  return ::hypot(z.real(), z.imag());
-}
-
-// In theory, it would make sense to add a complex<long double>
-// specialization of abs here, since hypot works for long double too.
-// In practice, long double doesn't have a portable number of bits or
-// behavior, so users who care about higher-precision floating-point
-// computation should probably insist on an actual FP128 type.
-
-template <typename T>
-CUTLASS_HOST_DEVICE T abs(complex<T> const &z) {
-  // cutlass::complex permits all kinds of T, including types that
-  // don't have NaN.  For a generic floating-point type with Inf
-  // and/or NaN, LAPACK's DLAPY2 algorithm would make sense, as it
-  // would handle issues like avoiding unwarranted overflow if
-  // z.real() or z.imag() is slightly bigger than the square root of
-  // the max finite number.  That could be a future improvement; for
-  // now, the code just uses the naive algorithm.
-  //
-  // Use the "swap two-step" idiom so that argument-dependent lookup
-  // can find any CUTLASS-specific overloads.
-  using cutlass::sqrt;
-  return sqrt(z.real() * z.real() + z.imag() * z.imag());
-}
-
-/// Returns the magnitude of the complex number
-template <typename T>
-CUTLASS_HOST_DEVICE T arg(complex<T> const &z) {
-  return atan2(imag(z), real(z));
-}
-
-/// Returns the squared magnitude of a real number
-template <typename T>
-CUTLASS_HOST_DEVICE T norm(T const &z) {
-    return z * z;
-}
-
-/// Returns the squared magnitude of a real number
-template <>
-CUTLASS_HOST_DEVICE int8_t norm(int8_t const &z) {
-    return static_cast<int8_t>(z * z);
-}
-
-/// Returns the squared magnitude of a complex number
-template <typename T>
-CUTLASS_HOST_DEVICE double norm(complex<T> const &z) {
-  return real(z) * real(z) + imag(z) * imag(z);
-}
-
-/// Norm-accumulate calculation
-template <typename T, typename R>
-CUTLASS_HOST_DEVICE R norm_accumulate(T const &x, R const & accumulator) {
-  return accumulator + static_cast<R>(x) * static_cast<R>(x);
-}
-
-/// Norm accumulate specialized for complex types
-template <typename T, typename R>
-CUTLASS_HOST_DEVICE R norm_accumulate(complex<T> const &z, R const &accumulator) {
-  return accumulator + static_cast<R>(real(z)) * static_cast<R>(real(z)) +
-    static_cast<R>(imag(z)) * static_cast<R>(imag(z));
-}
-
-namespace detail {
-  
-template<class T>
-CUTLASS_HOST_DEVICE T conj_impl(T const& z, cutlass::platform::true_type) {
-  return conj(z);
-}
-
-template<class T>
-CUTLASS_HOST_DEVICE T conj_impl(T const& z, cutlass::platform::false_type) {
-  return z;
-}
-
-template<class T>
-CUTLASS_HOST_DEVICE T conj_impl(T const& z) {
-  constexpr bool use_unqualified_conj =
-    ! cutlass::platform::is_arithmetic_v<T> &&
-    ! detail::has_cutlass_conj_v<T> &&
-    detail::has_unqualified_conj_v<T>;
-  return conj_impl(z, cutlass::platform::bool_constant<use_unqualified_conj>{});
-}
-  
-} // namespace detail
-
-// Return the complex conjugate of the input.
-//
-// This MUST be a function and not a function object, because it may
-// be common practice for downstream types to define specifically
-// cutlass::conj overloads, instead of overloads in their namespace.
-//
-// As a result of this being a function and not a function object,
-// CUTLASS code needs to declare "using cutlass::conj;" in scope and
-// then call this function unqualified, just like std::swap.
-//
-// If an overload already exists for cutlass::conj(T), that overload
-// will be called instead of this one.  Otherwise:
-//
-// 1. for arithmetic types, return z;
-//
-// 2. for types where (namespace-unqualified) conj(z) is well formed
-//    and cutlass::conj(z) is NOT well formed, return conj(z); and,
-//
-// 3. for everything else, return z.
-//
-// Regarding (1), the C++ Standard Library makes std::conj always
-// return std::complex, even for (noncomplex) arithmetic types.
-// cutlass::conj(T t) needs to return type T.  This follows the
-// convention of linear algebra software like the BLAS, where
-// "conjugate transpose" means the same thing as "transpose" for a
-// matrix of noncomplex numbers.
-//
-// Case (2) covers std::complex, cuda::std::complex, and non-Standard
-// (including user-defined) complex number types (for which "conj(z)"
-// is findable via argument-dependent lookup, but does not live in the
-// cutlass namespace).  It excludes cutlass::conj(z) in order to
-// prevent infinite recursion.
-//
-// Case (3) covers non-Standard non-complex number types.
-template<class T>
-CUTLASS_HOST_DEVICE T conj(T const& z) {
-  return detail::conj_impl(z);
-}
-
-/// Projects the complex number z onto the Riemann sphere
-template <typename T>
-CUTLASS_HOST_DEVICE complex<T> proj(complex<T> const &z) {
-  T d = real(z) * real(z) + imag(z) * imag(z) + T(1);
-  return complex<T>((T(2) * real(z)) / d, (T(2) * imag(z)) / d);
-}
-
-/// Returns a complex number with magnitude r and phase theta
-template <typename T>
-CUTLASS_HOST_DEVICE complex<T> polar(T const &r, T const &theta = T()) {
-  return complex<T>(r * cos(theta), r * sin(theta));
-}
-
-/// Computes the complex exponential of z.
-template <typename T>
-CUTLASS_HOST_DEVICE complex<T> exp(complex<T> const &z) {
-  return complex<T>(fast_exp(real(z)) * fast_cos(imag(z)), fast_exp(real(z)) * fast_sin(imag(z)));
-}
-
-/// Computes the log of z
-template <typename T>
-CUTLASS_HOST_DEVICE complex<T> log(complex<T> const &z) {
-  return complex<T>(log(abs(z)), arg(z));
-}
-
-/// Computes the log base 10 of z
-template <typename T>
-CUTLASS_HOST_DEVICE complex<T> log10(complex<T> const &z) {
-  return log(z) / T(log(T(10)));
-}
-
-/// Computes the square root of complex number z
-template <typename T>
-CUTLASS_HOST_DEVICE complex<T> sqrt(complex<T> const &z) {
-  return sqrt(T(2)) / T(2) *
-         complex<T>(sqrt(sqrt(norm(z)) + real(z)),
-                    (imag(z) < 0 ? T(-1) : T(1)) * sqrt(sqrt(norm(z)) - real(z)));
-}
-
-/// Computes the cosine of complex z.
-template <typename T>
-CUTLASS_HOST_DEVICE complex<T> cos(complex<T> const &z) {
-  return (exp(z) + exp(-z)) / T(2);
-}
-
-/// Computes the sin of complex z.
-template <typename T>
-CUTLASS_HOST_DEVICE complex<T> sin(complex<T> const &z) {
-  return (exp(-z) - exp(z)) * complex<T>(T(0), T(1) / T(2));
-}
-
-/// Comparison
-template <typename T>
-CUTLASS_HOST_DEVICE bool operator<(complex<T> const &lhs, complex<T> const &rhs) {
-  return true;
-}
-
-//////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for complex-valued type.
-template <typename T>
-struct RealType< complex<T> >
-{
-  using Type = T;
-
-  /// Number of elements
-  static int const kExtent = 2;
-
-  CUTLASS_HOST_DEVICE
-  static complex<T> from_real(double x) {
-    return complex<T>(static_cast<T>(x));
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <>
-CUTLASS_HOST_DEVICE
-cutlass::complex<half_t> from_real<cutlass::complex<half_t> >(double r) {
-  return cutlass::complex<half_t>(half_t(r));
-}
-
-template <>
-CUTLASS_HOST_DEVICE
-cutlass::complex<float> from_real<cutlass::complex<float> >(double r) {
-  return cutlass::complex<float>(float(r));
-}
-
-template <>
-CUTLASS_HOST_DEVICE
-cutlass::complex<double> from_real<cutlass::complex<double> >(double r) {
-  return cutlass::complex<double>(r);
-}
-
-//////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename T>
-struct is_complex {
-  static bool const value = false;
-};
-
-template <typename T>
-struct is_complex<complex<T>> {
-  static bool const value = true;
-};
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// functional.h numeric specializations
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Squares with optional conversion
-template <typename T, typename Output>
-struct magnitude_squared<complex<T>, Output> {
-  CUTLASS_HOST_DEVICE
-  Output operator()(complex<T> lhs) const {
-    multiplies<Output> mul_op;
-
-    Output y_r = Output(lhs.real());
-    Output y_i = Output(lhs.imag());
-
-    return mul_op(y_r, y_r) + mul_op(y_i, y_i);
-  }
-};
-
-/// Fused multiply-add
-template <typename T>
-struct multiply_add<complex<T>, complex<T>, complex<T>> {
-  CUTLASS_HOST_DEVICE
-  complex<T> operator()(
-    complex<T> const &a,
-    complex<T> const &b,
-    complex<T> const &c) const {
-
-    T real = c.real();
-    T imag = c.imag();
-
-    real += a.real() * b.real();
-    real += -a.imag() * b.imag();
-    imag += a.real() * b.imag();
-    imag += a.imag () * b.real();
-
-    return complex<T>{
-      real,
-      imag
-    };
-  }
-};
-
-/// Fused multiply-add
-template <typename T>
-struct multiply_add<complex<T>, T, complex<T>> {
-  CUTLASS_HOST_DEVICE
-  complex<T> operator()(
-    complex<T> const &a,
-    T const &b,
-    complex<T> const &c) const {
-
-    T real = c.real();
-    T imag = c.imag();
-
-    real += a.real() * b;
-    imag += a.imag () * b;
-
-    return complex<T>{
-      real,
-      imag
-    };
-  }
-};
-
-/// Fused multiply-add
-template <typename T>
-struct multiply_add<T, complex<T>, complex<T>> {
-  CUTLASS_HOST_DEVICE
-  complex<T> operator()(
-    T const &a,
-    complex<T> const &b,
-    complex<T> const &c) const {
-
-    T real = c.real();
-    T imag = c.imag();
-
-    real += a * b.real();
-    imag += a * b.imag();
-
-    return complex<T>{
-      real,
-      imag
-    };
-  }
-};
-
-/// Conjugate
-template <typename T>
-struct conjugate<complex<T>>  {
-  CUTLASS_HOST_DEVICE
-  complex<T> operator()(complex<T> const &a) const {
-    // Invoke the complex<T> overload specifically, rather than
-    // wasting the compiler's effort on overload resolution.
-    return cutlass::conj(a);
-  }
-};
-
-#if ! defined(__CUDACC_RTC__)
-template <>
-struct conjugate<cuFloatComplex>  {
-  CUTLASS_HOST_DEVICE
-  cuFloatComplex operator()(cuFloatComplex const& z) const {
-    return make_cuFloatComplex(z.x, -z.y);
-  }
-};
-
-template <>
-struct conjugate<cuDoubleComplex>  {
-  CUTLASS_HOST_DEVICE
-  cuDoubleComplex operator()(cuDoubleComplex const& z) const {
-    return make_cuDoubleComplex(z.x, -z.y);
-  }
-};
-#endif
-  
-/// Computes the square of a difference with optional conversion
-template <typename T, typename Output>
-struct magnitude_squared_difference<complex<T>, Output> {
-  CUTLASS_HOST_DEVICE
-  Output operator()(complex<T> lhs, complex<T> rhs) const {
-    multiplies<Output> mul_op;
-
-    Output y_r = Output(lhs.real()) - Output(rhs.real());
-    Output y_i = Output(lhs.imag()) - Output(rhs.imag());
-
-    return mul_op(y_r, y_r) + mul_op(y_i, y_i);
-  }
-};
-
-/// Reduces value into the data pointed to by ptr (complex<T> specialization)
-template <typename T>
-struct atomic_add<complex<T>> {
-  CUTLASS_DEVICE
-  void operator()(complex<T> *ptr, const complex<T> &data)
-  {
-    data.red(ptr);
-  }
-};
-
-
-//////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace cutlass
-
-//////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/constants.h b/lightllm-kernel/cutlass/include/cutlass/constants.h
deleted file mode 100755
index 49d96045a..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/constants.h
+++ /dev/null
@@ -1,1239 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *                                                                                                  
- **************************************************************************************************/
-
-/* \file 
-  \brief Boost-style constant definitions for floating-point types.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-
-#include "cutlass/complex.h"
-
-///////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace constants {
-
-///////////////////////////////////////////////////////////////////////////////////
-
-//
-// Primary templates
-//
-
-/// Returns 1, the multiplicative identity element
-template <typename T> CUTLASS_HOST_DEVICE T one();
-
-/// Returns 0, the additive identity element
-template <typename T> CUTLASS_HOST_DEVICE T zero();
-
-/// Returns 2
-template <typename T> CUTLASS_HOST_DEVICE T two();
-
-/// Returns pi, approximately 3.141
-template <typename T> CUTLASS_HOST_DEVICE T pi();
-
-/// Returns 2 * pi
-template <typename T> CUTLASS_HOST_DEVICE T two_pi();
-
-/// Returns pi / 2
-template <typename T> CUTLASS_HOST_DEVICE T half_pi();
-
-/// Returns sqrt(pi)
-template <typename T> CUTLASS_HOST_DEVICE T root_pi();
-
-/// Returns sqrt(pi / 2)
-template <typename T> CUTLASS_HOST_DEVICE T root_half_pi();
-
-/// Returns sqrt(2 * pi)
-template <typename T> CUTLASS_HOST_DEVICE T root_two_pi();
-
-/// Returns sqrt(ln(4))
-template <typename T> CUTLASS_HOST_DEVICE T root_ln_four();
-
-/// Returns e, approximately 2.718...
-template <typename T> CUTLASS_HOST_DEVICE T e();
-
-/// Returns (1/2)
-template <typename T> CUTLASS_HOST_DEVICE T half();
-
-/// Returns sqrt(2), approximately 1.414...
-template <typename T> CUTLASS_HOST_DEVICE T root_two();
-
-/// Returns sqrt(2)/2, approximately 0.707...
-template <typename T> CUTLASS_HOST_DEVICE T half_root_two();
-
-/// Returns ln(2), approximately 0.693...
-template <typename T> CUTLASS_HOST_DEVICE T ln_two();
-
-/// Returns ln(ln(2)), approximately -0.3665...
-template <typename T> CUTLASS_HOST_DEVICE T ln_ln_two();
-
-/// Returns 1/3, approximately 0.333...
-template <typename T> CUTLASS_HOST_DEVICE T third();
-
-/// Returns 2/3, approximately 0.666...
-template <typename T> CUTLASS_HOST_DEVICE T twothirds();
-
-/// Returns pi - 3, approximately 0.1416...
-template <typename T> CUTLASS_HOST_DEVICE T pi_minus_three();
-
-/// Returns 4 - pi, approximately 0.858...
-template <typename T> CUTLASS_HOST_DEVICE T four_minus_pi();
-
-
-/////////////////////////////////////////////////////////////////////////////////////
-
-// Specialization for double
-
-/// Returns 1, the multiplicative identity element  (specialization for double)
-template <> CUTLASS_HOST_DEVICE double one<double>() {
-  uint64_t bits = 0x3ff0000000000000ull;
-  return reinterpret_cast<double const &>(bits);
-}
-
-/// Returns 1, the multiplicative identity element  (specialization for complex<double>)
-template <> CUTLASS_HOST_DEVICE complex<double> one< complex<double> >() {
-  return complex<double>(one<double>(), double());
-}
-
-/// Returns 0, the additive identity element  (specialization for double)
-template <> CUTLASS_HOST_DEVICE double zero<double>() {
-  uint64_t bits = 0x0ull;
-  return reinterpret_cast<double const &>(bits);
-}
-
-/// Returns 0, the additive identity element  (specialization for complex<double>)
-template <> CUTLASS_HOST_DEVICE complex<double> zero< complex<double> >() {
-  return complex<double>(zero<double>(), double());
-}
-
-/// Returns 2  (specialization for double)
-template <> CUTLASS_HOST_DEVICE double two<double>() {
-  uint64_t bits = 0x4000000000000000ull;
-  return reinterpret_cast<double const &>(bits);
-}
-
-/// Returns 2  (specialization for complex<double>)
-template <> CUTLASS_HOST_DEVICE complex<double> two< complex<double> >() {
-  return complex<double>(two<double>(), double());
-}
-
-/// Returns pi, approximately 3.141  (specialization for double)
-template <> CUTLASS_HOST_DEVICE double pi<double>() {
-  uint64_t bits = 0x400921fb54442d18ull;
-  return reinterpret_cast<double const &>(bits);
-}
-
-/// Returns pi, approximately 3.141  (specialization for complex<double>)
-template <> CUTLASS_HOST_DEVICE complex<double> pi< complex<double> >() {
-  return complex<double>(pi<double>(), double());
-}
-
-/// Returns 2 * pi  (specialization for double)
-template <> CUTLASS_HOST_DEVICE double two_pi<double>() {
-  uint64_t bits = 0x401921fb54442d18ull;
-  return reinterpret_cast<double const &>(bits);
-}
-
-/// Returns 2 * pi  (specialization for complex<double>)
-template <> CUTLASS_HOST_DEVICE complex<double> two_pi< complex<double> >() {
-  return complex<double>(two_pi<double>(), double());
-}
-
-/// Returns pi / 2  (specialization for double)
-template <> CUTLASS_HOST_DEVICE double half_pi<double>() {
-  uint64_t bits = 0x3ff921fb54442d18ull;
-  return reinterpret_cast<double const &>(bits);
-}
-
-/// Returns pi / 2  (specialization for complex<double>)
-template <> CUTLASS_HOST_DEVICE complex<double> half_pi< complex<double> >() {
-  return complex<double>(half_pi<double>(), double());
-}
-
-/// Returns sqrt(pi)  (specialization for double)
-template <> CUTLASS_HOST_DEVICE double root_pi<double>() {
-  uint64_t bits = 0x3ffc5bf891b4ef6aull;
-  return reinterpret_cast<double const &>(bits);
-}
-
-/// Returns sqrt(pi)  (specialization for complex<double>)
-template <> CUTLASS_HOST_DEVICE complex<double> root_pi< complex<double> >() {
-  return complex<double>(root_pi<double>(), double());
-}
-
-/// Returns sqrt(pi / 2)  (specialization for double)
-template <> CUTLASS_HOST_DEVICE double root_half_pi<double>() {
-  uint64_t bits = 0x3ff40d931ff62705ull;
-  return reinterpret_cast<double const &>(bits);
-}
-
-/// Returns sqrt(pi / 2)  (specialization for complex<double>)
-template <> CUTLASS_HOST_DEVICE complex<double> root_half_pi< complex<double> >() {
-  return complex<double>(root_half_pi<double>(), double());
-}
-
-/// Returns sqrt(2 * pi)  (specialization for double)
-template <> CUTLASS_HOST_DEVICE double root_two_pi<double>() {
-  uint64_t bits = 0x40040d931ff62705ull;
-  return reinterpret_cast<double const &>(bits);
-}
-
-/// Returns sqrt(2 * pi)  (specialization for complex<double>)
-template <> CUTLASS_HOST_DEVICE complex<double> root_two_pi< complex<double> >() {
-  return complex<double>(root_two_pi<double>(), double());
-}
-
-/// Returns sqrt(ln(4))  (specialization for double)
-template <> CUTLASS_HOST_DEVICE double root_ln_four<double>() {
-  uint64_t bits = 0x3ff2d6abe44afc43ull;
-  return reinterpret_cast<double const &>(bits);
-}
-
-/// Returns sqrt(ln(4))  (specialization for complex<double>)
-template <> CUTLASS_HOST_DEVICE complex<double> root_ln_four< complex<double> >() {
-  return complex<double>(root_ln_four<double>(), double());
-}
-
-/// Returns e, approximately 2.718...  (specialization for double)
-template <> CUTLASS_HOST_DEVICE double e<double>() {
-  uint64_t bits = 0x4005bf0a8b145769ull;
-  return reinterpret_cast<double const &>(bits);
-}
-
-/// Returns e, approximately 2.718...  (specialization for complex<double>)
-template <> CUTLASS_HOST_DEVICE complex<double> e< complex<double> >() {
-  return complex<double>(e<double>(), double());
-}
-
-/// Returns (1/2)  (specialization for double)
-template <> CUTLASS_HOST_DEVICE double half<double>() {
-  uint64_t bits = 0x3fe0000000000000ull;
-  return reinterpret_cast<double const &>(bits);
-}
-
-/// Returns (1/2)  (specialization for complex<double>)
-template <> CUTLASS_HOST_DEVICE complex<double> half< complex<double> >() {
-  return complex<double>(half<double>(), double());
-}
-
-/// Returns sqrt(2), approximately 1.414...  (specialization for double)
-template <> CUTLASS_HOST_DEVICE double root_two<double>() {
-  uint64_t bits = 0x3ff6a09e667f3bcdull;
-  return reinterpret_cast<double const &>(bits);
-}
-
-/// Returns sqrt(2), approximately 1.414...  (specialization for complex<double>)
-template <> CUTLASS_HOST_DEVICE complex<double> root_two< complex<double> >() {
-  return complex<double>(root_two<double>(), double());
-}
-
-/// Returns sqrt(2)/2, approximately 0.707...  (specialization for double)
-template <> CUTLASS_HOST_DEVICE double half_root_two<double>() {
-  uint64_t bits = 0x3fe6a09e667f3bcdull;
-  return reinterpret_cast<double const &>(bits);
-}
-
-/// Returns sqrt(2)/2, approximately 0.707...  (specialization for complex<double>)
-template <> CUTLASS_HOST_DEVICE complex<double> half_root_two< complex<double> >() {
-  return complex<double>(half_root_two<double>(), double());
-}
-
-/// Returns ln(2), approximately 0.693...  (specialization for double)
-template <> CUTLASS_HOST_DEVICE double ln_two<double>() {
-  uint64_t bits = 0x3fe62e42fefa39efull;
-  return reinterpret_cast<double const &>(bits);
-}
-
-/// Returns ln(2), approximately 0.693...  (specialization for complex<double>)
-template <> CUTLASS_HOST_DEVICE complex<double> ln_two< complex<double> >() {
-  return complex<double>(ln_two<double>(), double());
-}
-
-/// Returns ln(ln(2)), approximately -0.3665...  (specialization for double)
-template <> CUTLASS_HOST_DEVICE double ln_ln_two<double>() {
-  uint64_t bits = 0xbfd774f29bdd6b9full;
-  return reinterpret_cast<double const &>(bits);
-}
-
-/// Returns ln(ln(2)), approximately -0.3665...  (specialization for complex<double>)
-template <> CUTLASS_HOST_DEVICE complex<double> ln_ln_two< complex<double> >() {
-  return complex<double>(ln_ln_two<double>(), double());
-}
-
-/// Returns 1/3, approximately 0.333...  (specialization for double)
-template <> CUTLASS_HOST_DEVICE double third<double>() {
-  uint64_t bits = 0x3fd5555555555555ull;
-  return reinterpret_cast<double const &>(bits);
-}
-
-/// Returns 1/3, approximately 0.333...  (specialization for complex<double>)
-template <> CUTLASS_HOST_DEVICE complex<double> third< complex<double> >() {
-  return complex<double>(third<double>(), double());
-}
-
-/// Returns 2/3, approximately 0.666...  (specialization for double)
-template <> CUTLASS_HOST_DEVICE double twothirds<double>() {
-  uint64_t bits = 0x3fe5555555555555ull;
-  return reinterpret_cast<double const &>(bits);
-}
-
-/// Returns 2/3, approximately 0.666...  (specialization for complex<double>)
-template <> CUTLASS_HOST_DEVICE complex<double> twothirds< complex<double> >() {
-  return complex<double>(twothirds<double>(), double());
-}
-
-/// Returns pi - 3, approximately 0.1416...  (specialization for double)
-template <> CUTLASS_HOST_DEVICE double pi_minus_three<double>() {
-  uint64_t bits = 0x3fc21fb54442d180ull;
-  return reinterpret_cast<double const &>(bits);
-}
-
-/// Returns pi - 3, approximately 0.1416...  (specialization for complex<double>)
-template <> CUTLASS_HOST_DEVICE complex<double> pi_minus_three< complex<double> >() {
-  return complex<double>(pi_minus_three<double>(), double());
-}
-
-/// Returns 4 - pi, approximately 0.858...  (specialization for double)
-template <> CUTLASS_HOST_DEVICE double four_minus_pi<double>() {
-  uint64_t bits = 0x3feb7812aeef4ba0ull;
-  return reinterpret_cast<double const &>(bits);
-}
-
-/// Returns 4 - pi, approximately 0.858...  (specialization for complex<double>)
-template <> CUTLASS_HOST_DEVICE complex<double> four_minus_pi< complex<double> >() {
-  return complex<double>(four_minus_pi<double>(), double());
-}
-
-/////////////////////////////////////////////////////////////////////////////////////
-
-// Specialization for float
-
-/// Returns 1, the multiplicative identity element  (specialization for float)
-template <> CUTLASS_HOST_DEVICE float one<float>() {
-  uint32_t bits = 0x3f800000u;
-  return reinterpret_cast<float const &>(bits);
-}
-
-/// Returns 1, the multiplicative identity element  (specialization for complex<float>)
-template <> CUTLASS_HOST_DEVICE complex<float> one< complex<float> >() {
-  return complex<float>(one<float>(), float());
-}
-
-/// Returns 0, the additive identity element  (specialization for float)
-template <> CUTLASS_HOST_DEVICE float zero<float>() {
-  uint32_t bits = 0x0u;
-  return reinterpret_cast<float const &>(bits);
-}
-
-/// Returns 0, the additive identity element  (specialization for complex<float>)
-template <> CUTLASS_HOST_DEVICE complex<float> zero< complex<float> >() {
-  return complex<float>(zero<float>(), float());
-}
-
-/// Returns 2  (specialization for float)
-template <> CUTLASS_HOST_DEVICE float two<float>() {
-  uint32_t bits = 0x40000000u;
-  return reinterpret_cast<float const &>(bits);
-}
-
-/// Returns 2  (specialization for complex<float>)
-template <> CUTLASS_HOST_DEVICE complex<float> two< complex<float> >() {
-  return complex<float>(two<float>(), float());
-}
-
-/// Returns pi, approximately 3.141  (specialization for float)
-template <> CUTLASS_HOST_DEVICE float pi<float>() {
-  uint32_t bits = 0x40490fdbu;
-  return reinterpret_cast<float const &>(bits);
-}
-
-/// Returns pi, approximately 3.141  (specialization for complex<float>)
-template <> CUTLASS_HOST_DEVICE complex<float> pi< complex<float> >() {
-  return complex<float>(pi<float>(), float());
-}
-
-/// Returns 2 * pi  (specialization for float)
-template <> CUTLASS_HOST_DEVICE float two_pi<float>() {
-  uint32_t bits = 0x40c90fdbu;
-  return reinterpret_cast<float const &>(bits);
-}
-
-/// Returns 2 * pi  (specialization for complex<float>)
-template <> CUTLASS_HOST_DEVICE complex<float> two_pi< complex<float> >() {
-  return complex<float>(two_pi<float>(), float());
-}
-
-/// Returns pi / 2  (specialization for float)
-template <> CUTLASS_HOST_DEVICE float half_pi<float>() {
-  uint32_t bits = 0x3fc90fdbu;
-  return reinterpret_cast<float const &>(bits);
-}
-
-/// Returns pi / 2  (specialization for complex<float>)
-template <> CUTLASS_HOST_DEVICE complex<float> half_pi< complex<float> >() {
-  return complex<float>(half_pi<float>(), float());
-}
-
-/// Returns sqrt(pi)  (specialization for float)
-template <> CUTLASS_HOST_DEVICE float root_pi<float>() {
-  uint32_t bits = 0x3fe2dfc5u;
-  return reinterpret_cast<float const &>(bits);
-}
-
-/// Returns sqrt(pi)  (specialization for complex<float>)
-template <> CUTLASS_HOST_DEVICE complex<float> root_pi< complex<float> >() {
-  return complex<float>(root_pi<float>(), float());
-}
-
-/// Returns sqrt(pi / 2)  (specialization for float)
-template <> CUTLASS_HOST_DEVICE float root_half_pi<float>() {
-  uint32_t bits = 0x3fa06c99u;
-  return reinterpret_cast<float const &>(bits);
-}
-
-/// Returns sqrt(pi / 2)  (specialization for complex<float>)
-template <> CUTLASS_HOST_DEVICE complex<float> root_half_pi< complex<float> >() {
-  return complex<float>(root_half_pi<float>(), float());
-}
-
-/// Returns sqrt(2 * pi)  (specialization for float)
-template <> CUTLASS_HOST_DEVICE float root_two_pi<float>() {
-  uint32_t bits = 0x40206c99u;
-  return reinterpret_cast<float const &>(bits);
-}
-
-/// Returns sqrt(2 * pi)  (specialization for complex<float>)
-template <> CUTLASS_HOST_DEVICE complex<float> root_two_pi< complex<float> >() {
-  return complex<float>(root_two_pi<float>(), float());
-}
-
-/// Returns sqrt(ln(4))  (specialization for float)
-template <> CUTLASS_HOST_DEVICE float root_ln_four<float>() {
-  uint32_t bits = 0x3f96b55fu;
-  return reinterpret_cast<float const &>(bits);
-}
-
-/// Returns sqrt(ln(4))  (specialization for complex<float>)
-template <> CUTLASS_HOST_DEVICE complex<float> root_ln_four< complex<float> >() {
-  return complex<float>(root_ln_four<float>(), float());
-}
-
-/// Returns e, approximately 2.718...  (specialization for float)
-template <> CUTLASS_HOST_DEVICE float e<float>() {
-  uint32_t bits = 0x402df854u;
-  return reinterpret_cast<float const &>(bits);
-}
-
-/// Returns e, approximately 2.718...  (specialization for complex<float>)
-template <> CUTLASS_HOST_DEVICE complex<float> e< complex<float> >() {
-  return complex<float>(e<float>(), float());
-}
-
-/// Returns (1/2)  (specialization for float)
-template <> CUTLASS_HOST_DEVICE float half<float>() {
-  uint32_t bits = 0x3f000000u;
-  return reinterpret_cast<float const &>(bits);
-}
-
-/// Returns (1/2)  (specialization for complex<float>)
-template <> CUTLASS_HOST_DEVICE complex<float> half< complex<float> >() {
-  return complex<float>(half<float>(), float());
-}
-
-/// Returns sqrt(2), approximately 1.414...  (specialization for float)
-template <> CUTLASS_HOST_DEVICE float root_two<float>() {
-  uint32_t bits = 0x3fb504f3u;
-  return reinterpret_cast<float const &>(bits);
-}
-
-/// Returns sqrt(2), approximately 1.414...  (specialization for complex<float>)
-template <> CUTLASS_HOST_DEVICE complex<float> root_two< complex<float> >() {
-  return complex<float>(root_two<float>(), float());
-}
-
-/// Returns sqrt(2)/2, approximately 0.707...  (specialization for float)
-template <> CUTLASS_HOST_DEVICE float half_root_two<float>() {
-  uint32_t bits = 0x3f3504f3u;
-  return reinterpret_cast<float const &>(bits);
-}
-
-/// Returns sqrt(2)/2, approximately 0.707...  (specialization for complex<float>)
-template <> CUTLASS_HOST_DEVICE complex<float> half_root_two< complex<float> >() {
-  return complex<float>(half_root_two<float>(), float());
-}
-
-/// Returns ln(2), approximately 0.693...  (specialization for float)
-template <> CUTLASS_HOST_DEVICE float ln_two<float>() {
-  uint32_t bits = 0x3f317218u;
-  return reinterpret_cast<float const &>(bits);
-}
-
-/// Returns ln(2), approximately 0.693...  (specialization for complex<float>)
-template <> CUTLASS_HOST_DEVICE complex<float> ln_two< complex<float> >() {
-  return complex<float>(ln_two<float>(), float());
-}
-
-/// Returns ln(ln(2)), approximately -0.3665...  (specialization for float)
-template <> CUTLASS_HOST_DEVICE float ln_ln_two<float>() {
-  uint32_t bits = 0xbebba795u;
-  return reinterpret_cast<float const &>(bits);
-}
-
-/// Returns ln(ln(2)), approximately -0.3665...  (specialization for complex<float>)
-template <> CUTLASS_HOST_DEVICE complex<float> ln_ln_two< complex<float> >() {
-  return complex<float>(ln_ln_two<float>(), float());
-}
-
-/// Returns 1/3, approximately 0.333...  (specialization for float)
-template <> CUTLASS_HOST_DEVICE float third<float>() {
-  uint32_t bits = 0x3eaaaaabu;
-  return reinterpret_cast<float const &>(bits);
-}
-
-/// Returns 1/3, approximately 0.333...  (specialization for complex<float>)
-template <> CUTLASS_HOST_DEVICE complex<float> third< complex<float> >() {
-  return complex<float>(third<float>(), float());
-}
-
-/// Returns 2/3, approximately 0.666...  (specialization for float)
-template <> CUTLASS_HOST_DEVICE float twothirds<float>() {
-  uint32_t bits = 0x3f2aaaabu;
-  return reinterpret_cast<float const &>(bits);
-}
-
-/// Returns 2/3, approximately 0.666...  (specialization for complex<float>)
-template <> CUTLASS_HOST_DEVICE complex<float> twothirds< complex<float> >() {
-  return complex<float>(twothirds<float>(), float());
-}
-
-/// Returns pi - 3, approximately 0.1416...  (specialization for float)
-template <> CUTLASS_HOST_DEVICE float pi_minus_three<float>() {
-  uint32_t bits = 0x3e10fdaau;
-  return reinterpret_cast<float const &>(bits);
-}
-
-/// Returns pi - 3, approximately 0.1416...  (specialization for complex<float>)
-template <> CUTLASS_HOST_DEVICE complex<float> pi_minus_three< complex<float> >() {
-  return complex<float>(pi_minus_three<float>(), float());
-}
-
-/// Returns 4 - pi, approximately 0.858...  (specialization for float)
-template <> CUTLASS_HOST_DEVICE float four_minus_pi<float>() {
-  uint32_t bits = 0x3f5bc095u;
-  return reinterpret_cast<float const &>(bits);
-}
-
-/// Returns 4 - pi, approximately 0.858...  (specialization for complex<float>)
-template <> CUTLASS_HOST_DEVICE complex<float> four_minus_pi< complex<float> >() {
-  return complex<float>(four_minus_pi<float>(), float());
-}
-
-/////////////////////////////////////////////////////////////////////////////////////
-
-// Specialization for tfloat32_t
-
-/// Returns 1, the multiplicative identity element  (specialization for tfloat32_t)
-template <> CUTLASS_HOST_DEVICE tfloat32_t one<tfloat32_t>() {
-  uint32_t bits = 0x3f801000u;
-  return reinterpret_cast<tfloat32_t const &>(bits);
-}
-
-/// Returns 1, the multiplicative identity element  (specialization for complex<tfloat32_t>)
-template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> one< complex<tfloat32_t> >() {
-  return complex<tfloat32_t>(one<tfloat32_t>(), tfloat32_t());
-}
-
-/// Returns 0, the additive identity element  (specialization for tfloat32_t)
-template <> CUTLASS_HOST_DEVICE tfloat32_t zero<tfloat32_t>() {
-  uint32_t bits = 0x1000u;
-  return reinterpret_cast<tfloat32_t const &>(bits);
-}
-
-/// Returns 0, the additive identity element  (specialization for complex<tfloat32_t>)
-template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> zero< complex<tfloat32_t> >() {
-  return complex<tfloat32_t>(zero<tfloat32_t>(), tfloat32_t());
-}
-
-/// Returns 2  (specialization for tfloat32_t)
-template <> CUTLASS_HOST_DEVICE tfloat32_t two<tfloat32_t>() {
-  uint32_t bits = 0x40001000u;
-  return reinterpret_cast<tfloat32_t const &>(bits);
-}
-
-/// Returns 2  (specialization for complex<tfloat32_t>)
-template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> two< complex<tfloat32_t> >() {
-  return complex<tfloat32_t>(two<tfloat32_t>(), tfloat32_t());
-}
-
-/// Returns pi, approximately 3.141  (specialization for tfloat32_t)
-template <> CUTLASS_HOST_DEVICE tfloat32_t pi<tfloat32_t>() {
-  uint32_t bits = 0x40491fdbu;
-  return reinterpret_cast<tfloat32_t const &>(bits);
-}
-
-/// Returns pi, approximately 3.141  (specialization for complex<tfloat32_t>)
-template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> pi< complex<tfloat32_t> >() {
-  return complex<tfloat32_t>(pi<tfloat32_t>(), tfloat32_t());
-}
-
-/// Returns 2 * pi  (specialization for tfloat32_t)
-template <> CUTLASS_HOST_DEVICE tfloat32_t two_pi<tfloat32_t>() {
-  uint32_t bits = 0x40c91fdbu;
-  return reinterpret_cast<tfloat32_t const &>(bits);
-}
-
-/// Returns 2 * pi  (specialization for complex<tfloat32_t>)
-template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> two_pi< complex<tfloat32_t> >() {
-  return complex<tfloat32_t>(two_pi<tfloat32_t>(), tfloat32_t());
-}
-
-/// Returns pi / 2  (specialization for tfloat32_t)
-template <> CUTLASS_HOST_DEVICE tfloat32_t half_pi<tfloat32_t>() {
-  uint32_t bits = 0x3fc91fdbu;
-  return reinterpret_cast<tfloat32_t const &>(bits);
-}
-
-/// Returns pi / 2  (specialization for complex<tfloat32_t>)
-template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> half_pi< complex<tfloat32_t> >() {
-  return complex<tfloat32_t>(half_pi<tfloat32_t>(), tfloat32_t());
-}
-
-/// Returns sqrt(pi)  (specialization for tfloat32_t)
-template <> CUTLASS_HOST_DEVICE tfloat32_t root_pi<tfloat32_t>() {
-  uint32_t bits = 0x3fe2efc5u;
-  return reinterpret_cast<tfloat32_t const &>(bits);
-}
-
-/// Returns sqrt(pi)  (specialization for complex<tfloat32_t>)
-template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> root_pi< complex<tfloat32_t> >() {
-  return complex<tfloat32_t>(root_pi<tfloat32_t>(), tfloat32_t());
-}
-
-/// Returns sqrt(pi / 2)  (specialization for tfloat32_t)
-template <> CUTLASS_HOST_DEVICE tfloat32_t root_half_pi<tfloat32_t>() {
-  uint32_t bits = 0x3fa07c99u;
-  return reinterpret_cast<tfloat32_t const &>(bits);
-}
-
-/// Returns sqrt(pi / 2)  (specialization for complex<tfloat32_t>)
-template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> root_half_pi< complex<tfloat32_t> >() {
-  return complex<tfloat32_t>(root_half_pi<tfloat32_t>(), tfloat32_t());
-}
-
-/// Returns sqrt(2 * pi)  (specialization for tfloat32_t)
-template <> CUTLASS_HOST_DEVICE tfloat32_t root_two_pi<tfloat32_t>() {
-  uint32_t bits = 0x40207c99u;
-  return reinterpret_cast<tfloat32_t const &>(bits);
-}
-
-/// Returns sqrt(2 * pi)  (specialization for complex<tfloat32_t>)
-template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> root_two_pi< complex<tfloat32_t> >() {
-  return complex<tfloat32_t>(root_two_pi<tfloat32_t>(), tfloat32_t());
-}
-
-/// Returns sqrt(ln(4))  (specialization for tfloat32_t)
-template <> CUTLASS_HOST_DEVICE tfloat32_t root_ln_four<tfloat32_t>() {
-  uint32_t bits = 0x3f96c55fu;
-  return reinterpret_cast<tfloat32_t const &>(bits);
-}
-
-/// Returns sqrt(ln(4))  (specialization for complex<tfloat32_t>)
-template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> root_ln_four< complex<tfloat32_t> >() {
-  return complex<tfloat32_t>(root_ln_four<tfloat32_t>(), tfloat32_t());
-}
-
-/// Returns e, approximately 2.718...  (specialization for tfloat32_t)
-template <> CUTLASS_HOST_DEVICE tfloat32_t e<tfloat32_t>() {
-  uint32_t bits = 0x402e0854u;
-  return reinterpret_cast<tfloat32_t const &>(bits);
-}
-
-/// Returns e, approximately 2.718...  (specialization for complex<tfloat32_t>)
-template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> e< complex<tfloat32_t> >() {
-  return complex<tfloat32_t>(e<tfloat32_t>(), tfloat32_t());
-}
-
-/// Returns (1/2)  (specialization for tfloat32_t)
-template <> CUTLASS_HOST_DEVICE tfloat32_t half<tfloat32_t>() {
-  uint32_t bits = 0x3f001000u;
-  return reinterpret_cast<tfloat32_t const &>(bits);
-}
-
-/// Returns (1/2)  (specialization for complex<tfloat32_t>)
-template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> half< complex<tfloat32_t> >() {
-  return complex<tfloat32_t>(half<tfloat32_t>(), tfloat32_t());
-}
-
-/// Returns sqrt(2), approximately 1.414...  (specialization for tfloat32_t)
-template <> CUTLASS_HOST_DEVICE tfloat32_t root_two<tfloat32_t>() {
-  uint32_t bits = 0x3fb514f3u;
-  return reinterpret_cast<tfloat32_t const &>(bits);
-}
-
-/// Returns sqrt(2), approximately 1.414...  (specialization for complex<tfloat32_t>)
-template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> root_two< complex<tfloat32_t> >() {
-  return complex<tfloat32_t>(root_two<tfloat32_t>(), tfloat32_t());
-}
-
-/// Returns sqrt(2)/2, approximately 0.707...  (specialization for tfloat32_t)
-template <> CUTLASS_HOST_DEVICE tfloat32_t half_root_two<tfloat32_t>() {
-  uint32_t bits = 0x3f3514f3u;
-  return reinterpret_cast<tfloat32_t const &>(bits);
-}
-
-/// Returns sqrt(2)/2, approximately 0.707...  (specialization for complex<tfloat32_t>)
-template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> half_root_two< complex<tfloat32_t> >() {
-  return complex<tfloat32_t>(half_root_two<tfloat32_t>(), tfloat32_t());
-}
-
-/// Returns ln(2), approximately 0.693...  (specialization for tfloat32_t)
-template <> CUTLASS_HOST_DEVICE tfloat32_t ln_two<tfloat32_t>() {
-  uint32_t bits = 0x3f318218u;
-  return reinterpret_cast<tfloat32_t const &>(bits);
-}
-
-/// Returns ln(2), approximately 0.693...  (specialization for complex<tfloat32_t>)
-template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> ln_two< complex<tfloat32_t> >() {
-  return complex<tfloat32_t>(ln_two<tfloat32_t>(), tfloat32_t());
-}
-
-/// Returns ln(ln(2)), approximately -0.3665...  (specialization for tfloat32_t)
-template <> CUTLASS_HOST_DEVICE tfloat32_t ln_ln_two<tfloat32_t>() {
-  uint32_t bits = 0xbebbb795u;
-  return reinterpret_cast<tfloat32_t const &>(bits);
-}
-
-/// Returns ln(ln(2)), approximately -0.3665...  (specialization for complex<tfloat32_t>)
-template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> ln_ln_two< complex<tfloat32_t> >() {
-  return complex<tfloat32_t>(ln_ln_two<tfloat32_t>(), tfloat32_t());
-}
-
-/// Returns 1/3, approximately 0.333...  (specialization for tfloat32_t)
-template <> CUTLASS_HOST_DEVICE tfloat32_t third<tfloat32_t>() {
-  uint32_t bits = 0x3eaabaabu;
-  return reinterpret_cast<tfloat32_t const &>(bits);
-}
-
-/// Returns 1/3, approximately 0.333...  (specialization for complex<tfloat32_t>)
-template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> third< complex<tfloat32_t> >() {
-  return complex<tfloat32_t>(third<tfloat32_t>(), tfloat32_t());
-}
-
-/// Returns 2/3, approximately 0.666...  (specialization for tfloat32_t)
-template <> CUTLASS_HOST_DEVICE tfloat32_t twothirds<tfloat32_t>() {
-  uint32_t bits = 0x3f2abaabu;
-  return reinterpret_cast<tfloat32_t const &>(bits);
-}
-
-/// Returns 2/3, approximately 0.666...  (specialization for complex<tfloat32_t>)
-template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> twothirds< complex<tfloat32_t> >() {
-  return complex<tfloat32_t>(twothirds<tfloat32_t>(), tfloat32_t());
-}
-
-/// Returns pi - 3, approximately 0.1416...  (specialization for tfloat32_t)
-template <> CUTLASS_HOST_DEVICE tfloat32_t pi_minus_three<tfloat32_t>() {
-  uint32_t bits = 0x3e110daau;
-  return reinterpret_cast<tfloat32_t const &>(bits);
-}
-
-/// Returns pi - 3, approximately 0.1416...  (specialization for complex<tfloat32_t>)
-template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> pi_minus_three< complex<tfloat32_t> >() {
-  return complex<tfloat32_t>(pi_minus_three<tfloat32_t>(), tfloat32_t());
-}
-
-/// Returns 4 - pi, approximately 0.858...  (specialization for tfloat32_t)
-template <> CUTLASS_HOST_DEVICE tfloat32_t four_minus_pi<tfloat32_t>() {
-  uint32_t bits = 0x3f5bd095u;
-  return reinterpret_cast<tfloat32_t const &>(bits);
-}
-
-/// Returns 4 - pi, approximately 0.858...  (specialization for complex<tfloat32_t>)
-template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> four_minus_pi< complex<tfloat32_t> >() {
-  return complex<tfloat32_t>(four_minus_pi<tfloat32_t>(), tfloat32_t());
-}
-
-/////////////////////////////////////////////////////////////////////////////////////
-
-// Specialization for half_t
-
-/// Returns 1, the multiplicative identity element  (specialization for half_t)
-template <> CUTLASS_HOST_DEVICE half_t one<half_t>() {
-  uint16_t bits = 0x3c00u;
-  return reinterpret_cast<half_t const &>(bits);
-}
-
-/// Returns 1, the multiplicative identity element  (specialization for complex<half_t>)
-template <> CUTLASS_HOST_DEVICE complex<half_t> one< complex<half_t> >() {
-  return complex<half_t>(one<half_t>(), half_t());
-}
-
-/// Returns 0, the additive identity element  (specialization for half_t)
-template <> CUTLASS_HOST_DEVICE half_t zero<half_t>() {
-  uint16_t bits = 0x0u;
-  return reinterpret_cast<half_t const &>(bits);
-}
-
-/// Returns 0, the additive identity element  (specialization for complex<half_t>)
-template <> CUTLASS_HOST_DEVICE complex<half_t> zero< complex<half_t> >() {
-  return complex<half_t>(zero<half_t>(), half_t());
-}
-
-/// Returns 2  (specialization for half_t)
-template <> CUTLASS_HOST_DEVICE half_t two<half_t>() {
-  uint16_t bits = 0x4000u;
-  return reinterpret_cast<half_t const &>(bits);
-}
-
-/// Returns 2  (specialization for complex<half_t>)
-template <> CUTLASS_HOST_DEVICE complex<half_t> two< complex<half_t> >() {
-  return complex<half_t>(two<half_t>(), half_t());
-}
-
-/// Returns pi, approximately 3.141  (specialization for half_t)
-template <> CUTLASS_HOST_DEVICE half_t pi<half_t>() {
-  uint16_t bits = 0x4248u;
-  return reinterpret_cast<half_t const &>(bits);
-}
-
-/// Returns pi, approximately 3.141  (specialization for complex<half_t>)
-template <> CUTLASS_HOST_DEVICE complex<half_t> pi< complex<half_t> >() {
-  return complex<half_t>(pi<half_t>(), half_t());
-}
-
-/// Returns 2 * pi  (specialization for half_t)
-template <> CUTLASS_HOST_DEVICE half_t two_pi<half_t>() {
-  uint16_t bits = 0x4648u;
-  return reinterpret_cast<half_t const &>(bits);
-}
-
-/// Returns 2 * pi  (specialization for complex<half_t>)
-template <> CUTLASS_HOST_DEVICE complex<half_t> two_pi< complex<half_t> >() {
-  return complex<half_t>(two_pi<half_t>(), half_t());
-}
-
-/// Returns pi / 2  (specialization for half_t)
-template <> CUTLASS_HOST_DEVICE half_t half_pi<half_t>() {
-  uint16_t bits = 0x3e48u;
-  return reinterpret_cast<half_t const &>(bits);
-}
-
-/// Returns pi / 2  (specialization for complex<half_t>)
-template <> CUTLASS_HOST_DEVICE complex<half_t> half_pi< complex<half_t> >() {
-  return complex<half_t>(half_pi<half_t>(), half_t());
-}
-
-/// Returns sqrt(pi)  (specialization for half_t)
-template <> CUTLASS_HOST_DEVICE half_t root_pi<half_t>() {
-  uint16_t bits = 0x3f17u;
-  return reinterpret_cast<half_t const &>(bits);
-}
-
-/// Returns sqrt(pi)  (specialization for complex<half_t>)
-template <> CUTLASS_HOST_DEVICE complex<half_t> root_pi< complex<half_t> >() {
-  return complex<half_t>(root_pi<half_t>(), half_t());
-}
-
-/// Returns sqrt(pi / 2)  (specialization for half_t)
-template <> CUTLASS_HOST_DEVICE half_t root_half_pi<half_t>() {
-  uint16_t bits = 0x3d03u;
-  return reinterpret_cast<half_t const &>(bits);
-}
-
-/// Returns sqrt(pi / 2)  (specialization for complex<half_t>)
-template <> CUTLASS_HOST_DEVICE complex<half_t> root_half_pi< complex<half_t> >() {
-  return complex<half_t>(root_half_pi<half_t>(), half_t());
-}
-
-/// Returns sqrt(2 * pi)  (specialization for half_t)
-template <> CUTLASS_HOST_DEVICE half_t root_two_pi<half_t>() {
-  uint16_t bits = 0x4103u;
-  return reinterpret_cast<half_t const &>(bits);
-}
-
-/// Returns sqrt(2 * pi)  (specialization for complex<half_t>)
-template <> CUTLASS_HOST_DEVICE complex<half_t> root_two_pi< complex<half_t> >() {
-  return complex<half_t>(root_two_pi<half_t>(), half_t());
-}
-
-/// Returns sqrt(ln(4))  (specialization for half_t)
-template <> CUTLASS_HOST_DEVICE half_t root_ln_four<half_t>() {
-  uint16_t bits = 0x3cb6u;
-  return reinterpret_cast<half_t const &>(bits);
-}
-
-/// Returns sqrt(ln(4))  (specialization for complex<half_t>)
-template <> CUTLASS_HOST_DEVICE complex<half_t> root_ln_four< complex<half_t> >() {
-  return complex<half_t>(root_ln_four<half_t>(), half_t());
-}
-
-/// Returns e, approximately 2.718...  (specialization for half_t)
-template <> CUTLASS_HOST_DEVICE half_t e<half_t>() {
-  uint16_t bits = 0x4170u;
-  return reinterpret_cast<half_t const &>(bits);
-}
-
-/// Returns e, approximately 2.718...  (specialization for complex<half_t>)
-template <> CUTLASS_HOST_DEVICE complex<half_t> e< complex<half_t> >() {
-  return complex<half_t>(e<half_t>(), half_t());
-}
-
-/// Returns (1/2)  (specialization for half_t)
-template <> CUTLASS_HOST_DEVICE half_t half<half_t>() {
-  uint16_t bits = 0x3800u;
-  return reinterpret_cast<half_t const &>(bits);
-}
-
-/// Returns (1/2)  (specialization for complex<half_t>)
-template <> CUTLASS_HOST_DEVICE complex<half_t> half< complex<half_t> >() {
-  return complex<half_t>(half<half_t>(), half_t());
-}
-
-/// Returns sqrt(2), approximately 1.414...  (specialization for half_t)
-template <> CUTLASS_HOST_DEVICE half_t root_two<half_t>() {
-  uint16_t bits = 0x3da8u;
-  return reinterpret_cast<half_t const &>(bits);
-}
-
-/// Returns sqrt(2), approximately 1.414...  (specialization for complex<half_t>)
-template <> CUTLASS_HOST_DEVICE complex<half_t> root_two< complex<half_t> >() {
-  return complex<half_t>(root_two<half_t>(), half_t());
-}
-
-/// Returns sqrt(2)/2, approximately 0.707...  (specialization for half_t)
-template <> CUTLASS_HOST_DEVICE half_t half_root_two<half_t>() {
-  uint16_t bits = 0x39a8u;
-  return reinterpret_cast<half_t const &>(bits);
-}
-
-/// Returns sqrt(2)/2, approximately 0.707...  (specialization for complex<half_t>)
-template <> CUTLASS_HOST_DEVICE complex<half_t> half_root_two< complex<half_t> >() {
-  return complex<half_t>(half_root_two<half_t>(), half_t());
-}
-
-/// Returns ln(2), approximately 0.693...  (specialization for half_t)
-template <> CUTLASS_HOST_DEVICE half_t ln_two<half_t>() {
-  uint16_t bits = 0x398cu;
-  return reinterpret_cast<half_t const &>(bits);
-}
-
-/// Returns ln(2), approximately 0.693...  (specialization for complex<half_t>)
-template <> CUTLASS_HOST_DEVICE complex<half_t> ln_two< complex<half_t> >() {
-  return complex<half_t>(ln_two<half_t>(), half_t());
-}
-
-/// Returns ln(ln(2)), approximately -0.3665...  (specialization for half_t)
-template <> CUTLASS_HOST_DEVICE half_t ln_ln_two<half_t>() {
-  uint16_t bits = 0xb5ddu;
-  return reinterpret_cast<half_t const &>(bits);
-}
-
-/// Returns ln(ln(2)), approximately -0.3665...  (specialization for complex<half_t>)
-template <> CUTLASS_HOST_DEVICE complex<half_t> ln_ln_two< complex<half_t> >() {
-  return complex<half_t>(ln_ln_two<half_t>(), half_t());
-}
-
-/// Returns 1/3, approximately 0.333...  (specialization for half_t)
-template <> CUTLASS_HOST_DEVICE half_t third<half_t>() {
-  uint16_t bits = 0x3555u;
-  return reinterpret_cast<half_t const &>(bits);
-}
-
-/// Returns 1/3, approximately 0.333...  (specialization for complex<half_t>)
-template <> CUTLASS_HOST_DEVICE complex<half_t> third< complex<half_t> >() {
-  return complex<half_t>(third<half_t>(), half_t());
-}
-
-/// Returns 2/3, approximately 0.666...  (specialization for half_t)
-template <> CUTLASS_HOST_DEVICE half_t twothirds<half_t>() {
-  uint16_t bits = 0x3955u;
-  return reinterpret_cast<half_t const &>(bits);
-}
-
-/// Returns 2/3, approximately 0.666...  (specialization for complex<half_t>)
-template <> CUTLASS_HOST_DEVICE complex<half_t> twothirds< complex<half_t> >() {
-  return complex<half_t>(twothirds<half_t>(), half_t());
-}
-
-/// Returns pi - 3, approximately 0.1416...  (specialization for half_t)
-template <> CUTLASS_HOST_DEVICE half_t pi_minus_three<half_t>() {
-  uint16_t bits = 0x3088u;
-  return reinterpret_cast<half_t const &>(bits);
-}
-
-/// Returns pi - 3, approximately 0.1416...  (specialization for complex<half_t>)
-template <> CUTLASS_HOST_DEVICE complex<half_t> pi_minus_three< complex<half_t> >() {
-  return complex<half_t>(pi_minus_three<half_t>(), half_t());
-}
-
-/// Returns 4 - pi, approximately 0.858...  (specialization for half_t)
-template <> CUTLASS_HOST_DEVICE half_t four_minus_pi<half_t>() {
-  uint16_t bits = 0x3adeu;
-  return reinterpret_cast<half_t const &>(bits);
-}
-
-/// Returns 4 - pi, approximately 0.858...  (specialization for complex<half_t>)
-template <> CUTLASS_HOST_DEVICE complex<half_t> four_minus_pi< complex<half_t> >() {
-  return complex<half_t>(four_minus_pi<half_t>(), half_t());
-}
-
-/////////////////////////////////////////////////////////////////////////////////////
-
-// Specialization for bfloat16_t
-
-/// Returns 1, the multiplicative identity element  (specialization for bfloat16_t)
-template <> CUTLASS_HOST_DEVICE bfloat16_t one<bfloat16_t>() {
-  uint16_t bits = 0x3f80u;
-  return reinterpret_cast<bfloat16_t const &>(bits);
-}
-
-/// Returns 1, the multiplicative identity element  (specialization for complex<bfloat16_t>)
-template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> one< complex<bfloat16_t> >() {
-  return complex<bfloat16_t>(one<bfloat16_t>(), bfloat16_t());
-}
-
-/// Returns 0, the additive identity element  (specialization for bfloat16_t)
-template <> CUTLASS_HOST_DEVICE bfloat16_t zero<bfloat16_t>() {
-  uint16_t bits = 0x0u;
-  return reinterpret_cast<bfloat16_t const &>(bits);
-}
-
-/// Returns 0, the additive identity element  (specialization for complex<bfloat16_t>)
-template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> zero< complex<bfloat16_t> >() {
-  return complex<bfloat16_t>(zero<bfloat16_t>(), bfloat16_t());
-}
-
-/// Returns 2  (specialization for bfloat16_t)
-template <> CUTLASS_HOST_DEVICE bfloat16_t two<bfloat16_t>() {
-  uint16_t bits = 0x4000u;
-  return reinterpret_cast<bfloat16_t const &>(bits);
-}
-
-/// Returns 2  (specialization for complex<bfloat16_t>)
-template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> two< complex<bfloat16_t> >() {
-  return complex<bfloat16_t>(two<bfloat16_t>(), bfloat16_t());
-}
-
-/// Returns pi, approximately 3.141  (specialization for bfloat16_t)
-template <> CUTLASS_HOST_DEVICE bfloat16_t pi<bfloat16_t>() {
-  uint16_t bits = 0x4049u;
-  return reinterpret_cast<bfloat16_t const &>(bits);
-}
-
-/// Returns pi, approximately 3.141  (specialization for complex<bfloat16_t>)
-template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> pi< complex<bfloat16_t> >() {
-  return complex<bfloat16_t>(pi<bfloat16_t>(), bfloat16_t());
-}
-
-/// Returns 2 * pi  (specialization for bfloat16_t)
-template <> CUTLASS_HOST_DEVICE bfloat16_t two_pi<bfloat16_t>() {
-  uint16_t bits = 0x40c9u;
-  return reinterpret_cast<bfloat16_t const &>(bits);
-}
-
-/// Returns 2 * pi  (specialization for complex<bfloat16_t>)
-template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> two_pi< complex<bfloat16_t> >() {
-  return complex<bfloat16_t>(two_pi<bfloat16_t>(), bfloat16_t());
-}
-
-/// Returns pi / 2  (specialization for bfloat16_t)
-template <> CUTLASS_HOST_DEVICE bfloat16_t half_pi<bfloat16_t>() {
-  uint16_t bits = 0x3fc9u;
-  return reinterpret_cast<bfloat16_t const &>(bits);
-}
-
-/// Returns pi / 2  (specialization for complex<bfloat16_t>)
-template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> half_pi< complex<bfloat16_t> >() {
-  return complex<bfloat16_t>(half_pi<bfloat16_t>(), bfloat16_t());
-}
-
-/// Returns sqrt(pi)  (specialization for bfloat16_t)
-template <> CUTLASS_HOST_DEVICE bfloat16_t root_pi<bfloat16_t>() {
-  uint16_t bits = 0x3fe3u;
-  return reinterpret_cast<bfloat16_t const &>(bits);
-}
-
-/// Returns sqrt(pi)  (specialization for complex<bfloat16_t>)
-template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> root_pi< complex<bfloat16_t> >() {
-  return complex<bfloat16_t>(root_pi<bfloat16_t>(), bfloat16_t());
-}
-
-/// Returns sqrt(pi / 2)  (specialization for bfloat16_t)
-template <> CUTLASS_HOST_DEVICE bfloat16_t root_half_pi<bfloat16_t>() {
-  uint16_t bits = 0x3fa0u;
-  return reinterpret_cast<bfloat16_t const &>(bits);
-}
-
-/// Returns sqrt(pi / 2)  (specialization for complex<bfloat16_t>)
-template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> root_half_pi< complex<bfloat16_t> >() {
-  return complex<bfloat16_t>(root_half_pi<bfloat16_t>(), bfloat16_t());
-}
-
-/// Returns sqrt(2 * pi)  (specialization for bfloat16_t)
-template <> CUTLASS_HOST_DEVICE bfloat16_t root_two_pi<bfloat16_t>() {
-  uint16_t bits = 0x4020u;
-  return reinterpret_cast<bfloat16_t const &>(bits);
-}
-
-/// Returns sqrt(2 * pi)  (specialization for complex<bfloat16_t>)
-template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> root_two_pi< complex<bfloat16_t> >() {
-  return complex<bfloat16_t>(root_two_pi<bfloat16_t>(), bfloat16_t());
-}
-
-/// Returns sqrt(ln(4))  (specialization for bfloat16_t)
-template <> CUTLASS_HOST_DEVICE bfloat16_t root_ln_four<bfloat16_t>() {
-  uint16_t bits = 0x3f97u;
-  return reinterpret_cast<bfloat16_t const &>(bits);
-}
-
-/// Returns sqrt(ln(4))  (specialization for complex<bfloat16_t>)
-template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> root_ln_four< complex<bfloat16_t> >() {
-  return complex<bfloat16_t>(root_ln_four<bfloat16_t>(), bfloat16_t());
-}
-
-/// Returns e, approximately 2.718...  (specialization for bfloat16_t)
-template <> CUTLASS_HOST_DEVICE bfloat16_t e<bfloat16_t>() {
-  uint16_t bits = 0x402eu;
-  return reinterpret_cast<bfloat16_t const &>(bits);
-}
-
-/// Returns e, approximately 2.718...  (specialization for complex<bfloat16_t>)
-template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> e< complex<bfloat16_t> >() {
-  return complex<bfloat16_t>(e<bfloat16_t>(), bfloat16_t());
-}
-
-/// Returns (1/2)  (specialization for bfloat16_t)
-template <> CUTLASS_HOST_DEVICE bfloat16_t half<bfloat16_t>() {
-  uint16_t bits = 0x3f00u;
-  return reinterpret_cast<bfloat16_t const &>(bits);
-}
-
-/// Returns (1/2)  (specialization for complex<bfloat16_t>)
-template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> half< complex<bfloat16_t> >() {
-  return complex<bfloat16_t>(half<bfloat16_t>(), bfloat16_t());
-}
-
-/// Returns sqrt(2), approximately 1.414...  (specialization for bfloat16_t)
-template <> CUTLASS_HOST_DEVICE bfloat16_t root_two<bfloat16_t>() {
-  uint16_t bits = 0x3fb5u;
-  return reinterpret_cast<bfloat16_t const &>(bits);
-}
-
-/// Returns sqrt(2), approximately 1.414...  (specialization for complex<bfloat16_t>)
-template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> root_two< complex<bfloat16_t> >() {
-  return complex<bfloat16_t>(root_two<bfloat16_t>(), bfloat16_t());
-}
-
-/// Returns sqrt(2)/2, approximately 0.707...  (specialization for bfloat16_t)
-template <> CUTLASS_HOST_DEVICE bfloat16_t half_root_two<bfloat16_t>() {
-  uint16_t bits = 0x3f35u;
-  return reinterpret_cast<bfloat16_t const &>(bits);
-}
-
-/// Returns sqrt(2)/2, approximately 0.707...  (specialization for complex<bfloat16_t>)
-template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> half_root_two< complex<bfloat16_t> >() {
-  return complex<bfloat16_t>(half_root_two<bfloat16_t>(), bfloat16_t());
-}
-
-/// Returns ln(2), approximately 0.693...  (specialization for bfloat16_t)
-template <> CUTLASS_HOST_DEVICE bfloat16_t ln_two<bfloat16_t>() {
-  uint16_t bits = 0x3f31u;
-  return reinterpret_cast<bfloat16_t const &>(bits);
-}
-
-/// Returns ln(2), approximately 0.693...  (specialization for complex<bfloat16_t>)
-template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> ln_two< complex<bfloat16_t> >() {
-  return complex<bfloat16_t>(ln_two<bfloat16_t>(), bfloat16_t());
-}
-
-/// Returns ln(ln(2)), approximately -0.3665...  (specialization for bfloat16_t)
-template <> CUTLASS_HOST_DEVICE bfloat16_t ln_ln_two<bfloat16_t>() {
-  uint16_t bits = 0xbebcu;
-  return reinterpret_cast<bfloat16_t const &>(bits);
-}
-
-/// Returns ln(ln(2)), approximately -0.3665...  (specialization for complex<bfloat16_t>)
-template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> ln_ln_two< complex<bfloat16_t> >() {
-  return complex<bfloat16_t>(ln_ln_two<bfloat16_t>(), bfloat16_t());
-}
-
-/// Returns 1/3, approximately 0.333...  (specialization for bfloat16_t)
-template <> CUTLASS_HOST_DEVICE bfloat16_t third<bfloat16_t>() {
-  uint16_t bits = 0x3eabu;
-  return reinterpret_cast<bfloat16_t const &>(bits);
-}
-
-/// Returns 1/3, approximately 0.333...  (specialization for complex<bfloat16_t>)
-template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> third< complex<bfloat16_t> >() {
-  return complex<bfloat16_t>(third<bfloat16_t>(), bfloat16_t());
-}
-
-/// Returns 2/3, approximately 0.666...  (specialization for bfloat16_t)
-template <> CUTLASS_HOST_DEVICE bfloat16_t twothirds<bfloat16_t>() {
-  uint16_t bits = 0x3f2bu;
-  return reinterpret_cast<bfloat16_t const &>(bits);
-}
-
-/// Returns 2/3, approximately 0.666...  (specialization for complex<bfloat16_t>)
-template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> twothirds< complex<bfloat16_t> >() {
-  return complex<bfloat16_t>(twothirds<bfloat16_t>(), bfloat16_t());
-}
-
-/// Returns pi - 3, approximately 0.1416...  (specialization for bfloat16_t)
-template <> CUTLASS_HOST_DEVICE bfloat16_t pi_minus_three<bfloat16_t>() {
-  uint16_t bits = 0x3e11u;
-  return reinterpret_cast<bfloat16_t const &>(bits);
-}
-
-/// Returns pi - 3, approximately 0.1416...  (specialization for complex<bfloat16_t>)
-template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> pi_minus_three< complex<bfloat16_t> >() {
-  return complex<bfloat16_t>(pi_minus_three<bfloat16_t>(), bfloat16_t());
-}
-
-/// Returns 4 - pi, approximately 0.858...  (specialization for bfloat16_t)
-template <> CUTLASS_HOST_DEVICE bfloat16_t four_minus_pi<bfloat16_t>() {
-  uint16_t bits = 0x3f5cu;
-  return reinterpret_cast<bfloat16_t const &>(bits);
-}
-
-/// Returns 4 - pi, approximately 0.858...  (specialization for complex<bfloat16_t>)
-template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> four_minus_pi< complex<bfloat16_t> >() {
-  return complex<bfloat16_t>(four_minus_pi<bfloat16_t>(), bfloat16_t());
-}
-///////////////////////////////////////////////////////////////////////////////////
-
-} // namespace constants
-} // namespace cutlass
-
-///////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/collective/builders/sm90_common.inl b/lightllm-kernel/cutlass/include/cutlass/conv/collective/builders/sm90_common.inl
deleted file mode 100755
index 526db83ed..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/collective/builders/sm90_common.inl
+++ /dev/null
@@ -1,96 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/layout/tensor.h"
-#include "cutlass/arch/mma.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/dispatch_policy.hpp"
-#include "cutlass/detail/layout.hpp"
-#include "cutlass/gemm/collective/builders/sm90_common.inl"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::conv::collective::detail {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Maps a rank-1 cute::Shape<> representing the cluster shape on to the IM2COL TMA atom that should be used with it
-template <class UnimodalClusterShape>
-constexpr auto
-sm90_cluster_shape_to_im2col_tma_atom(UnimodalClusterShape unimodal_cluster_shape) {
-  static_assert(cute::rank(unimodal_cluster_shape) == 1,
-    "Use this function to figure out TMA for each mode individually.");
-
-  if constexpr (cute::size(unimodal_cluster_shape) == 1) {
-    return cute::SM90_TMA_LOAD_IM2COL{};
-  }
-  else {
-    return cute::SM90_TMA_LOAD_IM2COL_MULTICAST{};
-  }
-}
-
-// Collective tile traits struct that serves as a type list containing a tensor's mem layouts and atoms for the
-template<
-  class GmemTiledCopy_,
-  class SmemLayout_,
-  class SmemCopyAtom_ = void
->
-struct Sm90ImplicitGemmTileTraits {
-  using GmemTiledCopy = GmemTiledCopy_;
-  using SmemLayout = SmemLayout_;
-  using SmemCopyAtom = SmemCopyAtom_;
-};
-
-// Accepts a cutlass::layout::Tensor tag and computes the corresponding spatial dimension count
-template <class GmemLayoutTagA, class GmemLayoutTagB>
-constexpr int
-gmem_layout_tags_to_spatial_dims() {
-  static_assert(cute::is_same_v<GmemLayoutTagA, GmemLayoutTagB>);
-  if constexpr      (cute::is_same_v<GmemLayoutTagA, cutlass::layout::TensorNWC>) {
-    return 1;
-  }
-  else if constexpr (cute::is_same_v<GmemLayoutTagA, cutlass::layout::TensorNHWC>) {
-    return 2;
-  }
-  else if constexpr (cute::is_same_v<GmemLayoutTagA, cutlass::layout::TensorNDHWC>) {
-    return 3;
-  }
-  else {
-    static_assert(cutlass::detail::dependent_false<GmemLayoutTagA>);
-  }
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::conv::collective::detail
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/collective/builders/sm90_gmma_builder.inl b/lightllm-kernel/cutlass/include/cutlass/conv/collective/builders/sm90_gmma_builder.inl
deleted file mode 100755
index a08209efb..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/collective/builders/sm90_gmma_builder.inl
+++ /dev/null
@@ -1,257 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/conv/collective/builders/sm90_common.inl"
-
-// SM90 Collective Builders should be used only starting CUDA 12.0
-#if (__CUDACC_VER_MAJOR__ >= 12)
-#define CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
-#endif
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::conv::collective {
-using namespace cute;
-
-namespace detail {
-
-// Returns the maximum number of smem tiles that can be used with a given smem capacity, or overrides with manual count. 
-template<int CapacityBytes, class ElementA, class ElementB, class TileShapeMNK, int stages>
-constexpr int
-compute_stage_count_or_override(StageCount<stages> stage_count) {
-  return stages;
-}
-
-// Returns the maximum number of smem tiles that can be used with a given smem capacity, or overrides with manual count. 
-template<int CapacityBytes, class ElementA, class ElementB, class TileShapeMNK, int stages>
-constexpr int
-compute_stage_count_or_override(cute::Int<stages> stage_count) {
-  return stages;
-}
-
-// Returns the maximum number of smem tiles that can be used with a given smem capacity, or overrides with manual count. 
-template<int CapacityBytes, class ElementA, class ElementB, class TileShapeMNK, int carveout_bytes>
-constexpr int
-compute_stage_count_or_override(StageCountAutoCarveout<carveout_bytes> stage_count) {
-  constexpr auto mainloop_pipeline_bytes = sizeof(typename cutlass::PipelineTmaAsync<1>::SharedStorage);
-  constexpr auto a_bits = cute::sizeof_bits_v<ElementA>;
-  constexpr auto b_bits = cute::sizeof_bits_v<ElementB>;
-  constexpr int stage_bytes =
-    cutlass::bits_to_bytes(a_bits * size<0>(TileShapeMNK{}) * size<2>(TileShapeMNK{})) +
-    cutlass::bits_to_bytes(b_bits * size<1>(TileShapeMNK{}) * size<2>(TileShapeMNK{})) +
-    static_cast<int>(mainloop_pipeline_bytes);
-
-  return (CapacityBytes - carveout_bytes) / stage_bytes;
-}
-
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA_TMA_WS_SS_FPROP
-template <
-  conv::Operator ConvOp,
-  class ElementA,
-  class GmemLayoutA,
-  int AlignmentA,
-  class ElementB,
-  class GmemLayoutB,
-  int AlignmentB,
-  class ElementAccumulator,
-  class TileShape_MNK,
-  class ClusterShape_MNK,
-  class StageCountType,
-  class KernelScheduleType
->
-struct CollectiveBuilder<
-    arch::Sm90,
-    arch::OpClassTensorOp,
-    ConvOp,
-    ElementA,
-    GmemLayoutA,
-    AlignmentA,
-    ElementB,
-    GmemLayoutB,
-    AlignmentB,
-    ElementAccumulator,
-    TileShape_MNK,
-    ClusterShape_MNK,
-    StageCountType,
-    KernelScheduleType,
-    cute::enable_if_t<cute::is_same_v<KernelScheduleType, KernelImplicitTmaWarpSpecializedSm90> ||
-                      cute::is_same_v<KernelScheduleType, KernelImplicitTmaWarpSpecializedSm90Cooperative> ||
-                      cute::is_same_v<KernelScheduleType, KernelImplicitTmaWarpSpecializedSm90Pingpong>>
-> {
-  static_assert(is_static<TileShape_MNK>::value);
-  static_assert(is_static<ClusterShape_MNK>::value);
-#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
-  static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n");
-#endif
-  static_assert(cutlass::gemm::collective::detail::is_aligned<ElementA, AlignmentA, ElementB, AlignmentB, cutlass::gemm::collective::detail::tma_alignment_bytes>(),
-                "Should meet TMA alignment requirement\n");
-
-  // For fp32 types, map to tf32 MMA value type
-  using ElementAMma = cute::conditional_t<cute::is_same_v<ElementA, float>, tfloat32_t, ElementA>;
-  using ElementBMma = cute::conditional_t<cute::is_same_v<ElementB, float>, tfloat32_t, ElementB>;
-
-  // For fprop, majorA = K,  major B = K;
-  // For wgrad, majorA = MN, major B = MN;
-  // For dgrad, majorA = K,  major B = MN;
-  static constexpr cute::GMMA::Major GmmaMajorA =
-    (ConvOp == conv::Operator::kWgrad) ? cute::GMMA::Major::MN : cute::GMMA::Major::K;
-  static constexpr cute::GMMA::Major GmmaMajorB =
-    (ConvOp == conv::Operator::kFprop) ? cute::GMMA::Major::K : cute::GMMA::Major::MN;
-
-  using AtomLayoutMNK = cute::conditional_t<cute::is_same_v<KernelScheduleType, KernelImplicitTmaWarpSpecializedSm90Cooperative>,
-      Layout<Shape<_2,_1,_1>>, Layout<Shape<_1,_1,_1>>>;
-
-  using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector<
-      ElementAMma, ElementBMma, ElementAccumulator, TileShape_MNK, GmmaMajorA, GmmaMajorB>(), AtomLayoutMNK{}));
-
-  // For wgrad kernel, tensor A uses tma tiled mode and tensor B uses tma im2col mode.
-  using GmemTiledCopyA = cute::conditional_t<ConvOp == conv::Operator::kWgrad,
-      decltype(cutlass::gemm::collective::detail::sm90_cluster_shape_to_tma_atom(cute::shape<1>(ClusterShape_MNK{}))),
-      decltype(cutlass::conv::collective::detail::sm90_cluster_shape_to_im2col_tma_atom(cute::shape<1>(ClusterShape_MNK{})))>;
-  using GmemTiledCopyB = cute::conditional_t<ConvOp == conv::Operator::kWgrad,
-      decltype(cutlass::conv::collective::detail::sm90_cluster_shape_to_im2col_tma_atom(cute::shape<0>(ClusterShape_MNK{}))),
-      decltype(cutlass::gemm::collective::detail::sm90_cluster_shape_to_tma_atom(cute::shape<0>(ClusterShape_MNK{})))>;
-
-  using SmemLayoutAtomA = decltype(cutlass::gemm::collective::detail::ss_smem_selector<
-      GmmaMajorA, ElementAMma, decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
-  using SmemLayoutAtomB = decltype(cutlass::gemm::collective::detail::ss_smem_selector<
-      GmmaMajorB, ElementBMma, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
-
-  static constexpr int PipelineStages = detail::compute_stage_count_or_override<cutlass::gemm::collective::detail::sm90_smem_capacity_bytes,
-      ElementAMma, ElementBMma, TileShape_MNK>(StageCountType{});
-
-  using SmemLayoutA = decltype(tile_to_shape(
-      SmemLayoutAtomA{},
-      make_shape(shape<0>(TileShape_MNK{}), shape<2>(TileShape_MNK{}), Int<PipelineStages>{}),
-      Step<_2,_1,_3>{}));
-  using SmemLayoutB = decltype(tile_to_shape(
-      SmemLayoutAtomB{},
-      make_shape(shape<1>(TileShape_MNK{}), shape<2>(TileShape_MNK{}), Int<PipelineStages>{}),
-      Step<_2,_1,_3>{}));
-
-  constexpr static int NumSpatialDimensions = cutlass::conv::collective::detail::gmem_layout_tags_to_spatial_dims<GmemLayoutA, GmemLayoutB>();
-
-  using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedImplicitGemm<
-      ConvOp, PipelineStages, NumSpatialDimensions, ClusterShape_MNK, KernelScheduleType>;
-
-  using CollectiveOp = CollectiveConv<
-      DispatchPolicy,
-      TileShape_MNK,
-      ElementA,
-      ElementB,
-      TiledMma,
-      detail::Sm90ImplicitGemmTileTraits<GmemTiledCopyA, SmemLayoutA>,
-      detail::Sm90ImplicitGemmTileTraits<GmemTiledCopyB, SmemLayoutB>
-    >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA auto kernel schedule
-template <
-  conv::Operator ConvOp,
-  class ElementA,
-  class GmemLayoutA,
-  int AlignmentA,
-  class ElementB,
-  class GmemLayoutB,
-  int AlignmentB,
-  class ElementAccumulator,
-  class TileShape_MNK,
-  class ClusterShape_MNK,
-  class StageCountType,
-  class KernelScheduleType
->
-struct CollectiveBuilder<
-    arch::Sm90,
-    arch::OpClassTensorOp,
-    ConvOp,
-    ElementA,
-    GmemLayoutA,
-    AlignmentA,
-    ElementB,
-    GmemLayoutB,
-    AlignmentB,
-    ElementAccumulator,
-    TileShape_MNK,
-    ClusterShape_MNK,
-    StageCountType,
-    KernelScheduleType,
-    cute::enable_if_t<cute::is_same_v<KernelScheduleType, KernelScheduleAuto>>
-> {
-  static_assert(is_static<TileShape_MNK>::value);
-  static_assert(is_static<ClusterShape_MNK>::value);
-#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
-  static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n");
-#endif
-
-/*
-#if ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 1)))
-  // Cooperative schedule performs best for CUDA Toolkits with version >= 12.1
-
-  // For TileShape_M == 64, choosing KernelTmaWarpSpecialized as the KernelSchedule
-  // Since KernelTmaWarpSpecializedCooperative requires TileShape_M to be at least 128
-  using KernelWarpSpecializedSchedule = cute::conditional_t<size<0>(TileShape_MNK{}) == Int<64>{},
-      KernelImplicitTmaWarpSpecializedSm90PingPong, KernelImplicitTmaWarpSpecializedSm90Cooperative>;
-#else
-  using KernelWarpSpecializedSchedule = KernelImplicitTmaWarpSpecializedSm90;
-#endif
-*/
-  using KernelWarpSpecializedSchedule = KernelImplicitTmaWarpSpecializedSm90;
-
-  using CollectiveOp = typename CollectiveBuilder<
-      arch::Sm90,
-      arch::OpClassTensorOp,
-      ConvOp,
-      ElementA,
-      GmemLayoutA,
-      AlignmentA,
-      ElementB,
-      GmemLayoutB,
-      AlignmentB,
-      ElementAccumulator,
-      TileShape_MNK,
-      ClusterShape_MNK,
-      StageCountType,
-      KernelWarpSpecializedSchedule
-    >::CollectiveOp;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::conv::collective
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/collective/collective_builder.hpp b/lightllm-kernel/cutlass/include/cutlass/conv/collective/collective_builder.hpp
deleted file mode 100755
index 9d6a16c0d..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/collective/collective_builder.hpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/detail/dependent_false.hpp"
-#include "cutlass/conv/collective/collective_conv.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::conv::collective {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Used to specify stage counts or dispatch to automatic computation of stage count
-template<int num_stages>
-struct StageCount {
-  static constexpr int value = num_stages;
-
-  StageCount() = default;
-  explicit StageCount(cute::Int<num_stages>) {}
-};
-
-template<int carveout_bytes>
-struct StageCountAutoCarveout {
-  static constexpr int bytes = carveout_bytes;
-
-  StageCountAutoCarveout() = default;
-  explicit StageCountAutoCarveout(cute::Int<carveout_bytes>) {}
-};
-
-// Used to automatically let the builder pick the kernel schedule.
-// Can be overridden with kernel schedule tags in cutlass/conv/dispatch_policy.hpp
-struct KernelScheduleAuto {};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  class ArchTag,
-  class OpClass,
-  conv::Operator,
-  class ElementA,
-  class GmemLayoutA,
-  int AlignmentA,
-  class ElementB,
-  class GmemLayoutB,
-  int AlignmentB,
-  class ElementAccumulator,
-  class TileShape_MNK,
-  class ClusterShape_MNK,
-  class StageCountType,
-  class KernelScheduleType,
-  class Enable = void
->
-struct CollectiveBuilder {
-  static_assert(cutlass::detail::dependent_false<ElementA>, "Could not build a collective for given parameters.");
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::conv::collective
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-#include "builders/sm90_gmma_builder.inl"
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/collective/collective_conv.hpp b/lightllm-kernel/cutlass/include/cutlass/conv/collective/collective_conv.hpp
deleted file mode 100755
index d187b5ece..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/collective/collective_conv.hpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/detail/dependent_false.hpp"
-#include "cutlass/conv/collective/detail.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::conv::collective {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  class DispatchPolicy,
-  class TileShape,
-  class ElementA,
-  class ElementB,
-  class TiledMma,
-  class TileTraitsA,
-  class TileTraitsB
->
-struct CollectiveConv {
-  static_assert(cutlass::detail::dependent_false<ElementA>, "Could not find a mainloop specialization.");
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::conv::collective
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-#include "sm90_implicit_gemm_gmma_ss_warpspecialized.hpp"
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/collective/detail.hpp b/lightllm-kernel/cutlass/include/cutlass/conv/collective/detail.hpp
deleted file mode 100755
index ac272c8e2..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/collective/detail.hpp
+++ /dev/null
@@ -1,254 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/conv/convnd_problem_shape.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::conv::collective::detail {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Construct the stride types for conv collectives based on the dispatch policy, strides 64b by default
-template <class DispatchPolicy>
-constexpr auto
-sm90_dispatch_policy_to_stride_A() {
-  if constexpr (DispatchPolicy::ConvOp == conv::Operator::kFprop) {
-    // Maps to modes ((w,n), C)
-    if constexpr (DispatchPolicy::NumSpatialDimensions == 1) {
-      return cute::Stride<cute::Stride<int64_t, int64_t>,
-                          cute::Int<1>>{};
-    }
-    // Maps to modes ((w,h,n), C)
-    else if constexpr (DispatchPolicy::NumSpatialDimensions == 2) {
-      return cute::Stride<cute::Stride<int64_t, int64_t, int64_t>,
-                          cute::Int<1>>{};
-    }
-    // Maps to modes ((w,h,d,n), C)
-    else if constexpr (DispatchPolicy::NumSpatialDimensions == 3) {
-      return cute::Stride<cute::Stride<int64_t, int64_t, int64_t, int64_t>,
-                          cute::Int<1>>{};
-    }
-    // error dims assert
-    else {
-      static_assert(cutlass::detail::dependent_false<DispatchPolicy>, "Unsupported spatial dim count.");
-    }
-  }
-  else if constexpr (DispatchPolicy::ConvOp == conv::Operator::kWgrad) {
-    // Maps to modes (k, nq/npq/nzpq)
-    if constexpr (DispatchPolicy::NumSpatialDimensions == 1 ||
-                  DispatchPolicy::NumSpatialDimensions == 2 ||
-                  DispatchPolicy::NumSpatialDimensions == 3) {
-      return cute::Stride<cute::Int<1>, int64_t>{};
-    }
-    // error dims assert
-    else {
-      static_assert(cutlass::detail::dependent_false<DispatchPolicy>, "Unsupported spatial dim count.");
-    }
-  }
-  else if constexpr (DispatchPolicy::ConvOp == conv::Operator::kDgrad) {
-    // Maps to modes ((q,n), K)
-    if constexpr (DispatchPolicy::NumSpatialDimensions == 1) {
-      return cute::Stride<cute::Stride<int64_t, int64_t>,
-                          cute::Int<1>>{};
-    }
-    // Maps to modes ((q,p,n), K)
-    else if constexpr (DispatchPolicy::NumSpatialDimensions == 2) {
-      return cute::Stride<cute::Stride<int64_t, int64_t, int64_t>,
-                          cute::Int<1>>{};
-    }
-    // Maps to modes ((q,p,z,n), K)
-    else if constexpr (DispatchPolicy::NumSpatialDimensions == 3) {
-      return cute::Stride<cute::Stride<int64_t, int64_t, int64_t, int64_t>,
-                          cute::Int<1>>{};
-    }
-    // error dims assert
-    else {
-      static_assert(cutlass::detail::dependent_false<DispatchPolicy>, "Unsupported spatial dim count.");
-    }
-  }
-  else {
-    static_assert(cutlass::detail::dependent_false<DispatchPolicy>, "Unsupported ConvOp.");
-  }
-}
-
-// Construct the stirde types for conv collectives based on the dispatch policy, strides 64b by default
-template <class DispatchPolicy>
-constexpr auto
-sm90_dispatch_policy_to_stride_B() {
-  if constexpr (DispatchPolicy::ConvOp == conv::Operator::kFprop) {
-    // Maps to modes (k, (C,s))
-    if constexpr      (DispatchPolicy::NumSpatialDimensions == 1) {
-      return cute::Stride<int64_t, cute::Stride<cute::Int<1>, int64_t>>{};
-    }
-    // Maps to modes (k, (C,s,r))
-    else if constexpr (DispatchPolicy::NumSpatialDimensions == 2) {
-      return cute::Stride<int64_t, cute::Stride<cute::Int<1>, int64_t, int64_t>>{};
-    }
-    // Maps to modes (k, (C,s,r,t))
-    else if constexpr (DispatchPolicy::NumSpatialDimensions == 3) {
-      return cute::Stride<int64_t, cute::Stride<cute::Int<1>, int64_t, int64_t, int64_t>>{};
-    }
-    // error dims assert
-    else {
-      static_assert(cutlass::detail::dependent_false<DispatchPolicy>, "Unsupported spatial dim count.");
-    }
-  }
-  else if constexpr (DispatchPolicy::ConvOp == conv::Operator::kWgrad) {
-    // Maps to modes (C, (w,n))
-    if constexpr (DispatchPolicy::NumSpatialDimensions == 1) {
-      return cute::Stride<cute::Int<1>,
-                          cute::Stride<int64_t, int64_t>>{};
-    }
-    // Maps to modes (C, (w,h,n))
-    else if constexpr (DispatchPolicy::NumSpatialDimensions == 2) {
-      return cute::Stride<cute::Int<1>,
-                          cute::Stride<int64_t, int64_t, int64_t>>{};
-    }
-    // Maps to modes (C, (w,h,d,n))
-    else if constexpr (DispatchPolicy::NumSpatialDimensions == 3) {
-      return cute::Stride<cute::Int<1>,
-                          cute::Stride<int64_t, int64_t, int64_t, int64_t>>{};
-    }
-    // error dims assert
-    else {
-      static_assert(cutlass::detail::dependent_false<DispatchPolicy>, "Unsupported spatial dim count.");
-    }
-  }
-  else if constexpr (DispatchPolicy::ConvOp == conv::Operator::kDgrad) {
-    // Maps to modes (C, (k,s))
-    if constexpr      (DispatchPolicy::NumSpatialDimensions == 1) {
-      return cute::Stride<cute::Int<1>, cute::Stride<int64_t, int64_t>>{};
-    }
-    // Maps to modes (C, (k,s,r))
-    else if constexpr (DispatchPolicy::NumSpatialDimensions == 2) {
-      return cute::Stride<cute::Int<1>, cute::Stride<int64_t, int64_t, int64_t>>{};
-    }
-    // Maps to modes (C, (k,s,r,t))
-    else if constexpr (DispatchPolicy::NumSpatialDimensions == 3) {
-      return cute::Stride<cute::Int<1>, cute::Stride<int64_t, int64_t, int64_t, int64_t>>{};
-    }
-    // error dims assert
-    else {
-      static_assert(cutlass::detail::dependent_false<DispatchPolicy>, "Unsupported spatial dim count.");
-    }
-  }
-  else {
-    static_assert(cutlass::detail::dependent_false<DispatchPolicy>, "Unsupported ConvOp.");
-  }
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Compute the lower/near corner, returning it as a cute::array in [W,H,D] order
-template <conv::Operator ConvOp, int NumSpatialDimensions>
-CUTLASS_HOST_DEVICE
-constexpr auto
-compute_lower_corner_whd(ConvProblemShape<ConvOp, NumSpatialDimensions> const& problem_shape) {
-  using cute::for_each;
-  using cute::make_seq;
-
-  cute::array<int, NumSpatialDimensions> lower{};
-  if constexpr (ConvOp == conv::Operator::kFprop ||
-                ConvOp == conv::Operator::kWgrad) {
-    for_each(make_seq<NumSpatialDimensions>{}, [&](auto i) {
-      lower[NumSpatialDimensions-1-i] = -1 * problem_shape.lower_padding[i];
-    });
-  }
-  else if constexpr (ConvOp == conv::Operator::kDgrad) {
-    for_each(make_seq<NumSpatialDimensions>{}, [&](auto i) {
-      lower[NumSpatialDimensions-1-i] = problem_shape.lower_padding[i] -
-        (problem_shape.shape_B[i+1] - 1) * problem_shape.dilation[i];
-    });
-  }
-  return lower;
-}
-
-// Computes the upper/far corner, returning it as a cute::array in [W,H,D] order
-template <conv::Operator ConvOp, int NumSpatialDimensions>
-CUTLASS_HOST_DEVICE
-constexpr auto
-compute_upper_corner_whd(ConvProblemShape<ConvOp, NumSpatialDimensions> const& problem_shape) {
-  using cute::for_each;
-  using cute::make_seq;
-
-  cute::array<int, NumSpatialDimensions> upper{};
-  if constexpr (ConvOp == conv::Operator::kFprop) {
-    for_each(make_seq<NumSpatialDimensions>{}, [&](auto i) {
-      upper[NumSpatialDimensions-1-i] = problem_shape.upper_padding[i] -
-        (problem_shape.shape_B[i+1] - 1) * problem_shape.dilation[i];
-    });
-  }
-  else if constexpr (ConvOp == conv::Operator::kWgrad) {
-    for_each(make_seq<NumSpatialDimensions>{}, [&](auto i) {
-      upper[NumSpatialDimensions-1-i] = problem_shape.upper_padding[i] -
-        (problem_shape.shape_C[i+1] - 1) * problem_shape.dilation[i];
-    });
-  }
-  else if constexpr (ConvOp == conv::Operator::kDgrad) {
-    for_each(make_seq<NumSpatialDimensions>{}, [&](auto i) {
-      upper[NumSpatialDimensions-1-i] = problem_shape.lower_padding[i] -
-        (problem_shape.shape_B[i+1] - 1) * problem_shape.dilation[i] + problem_shape.shape_C[i+1] - problem_shape.shape_A[i+1];
-    });
-  }
-  return upper;
-}
-
-// Compute the lower/near corner of (t,r,s), returning it as a cute::array in [S,R,T] order
-template <conv::Operator ConvOp, int NumSpatialDimensions>
-CUTLASS_HOST_DEVICE
-constexpr auto
-compute_lower_srt(ConvProblemShape<ConvOp, NumSpatialDimensions> const& problem_shape) {
-  using cute::for_each;
-  using cute::make_seq;
-
-  cute::array<int, NumSpatialDimensions> lower{};
-  if constexpr (ConvOp == conv::Operator::kFprop ||
-                ConvOp == conv::Operator::kWgrad) {
-    for_each(make_seq<NumSpatialDimensions>{}, [&](auto i) {
-      lower[NumSpatialDimensions-1-i] = 0;
-    });
-  }
-  else if constexpr (ConvOp == conv::Operator::kDgrad) {
-    for_each(make_seq<NumSpatialDimensions>{}, [&](auto i) {
-      lower[NumSpatialDimensions-1-i] = (problem_shape.shape_B[i+1] - 1) * problem_shape.dilation[i];
-    });
-  }
-  return lower;
-}
-
-template <class CopyOp> struct is_im2col_load { static constexpr bool value = false; };
-template <> struct is_im2col_load<SM90_TMA_LOAD_IM2COL          > { static constexpr bool value = true; };
-template <> struct is_im2col_load<SM90_TMA_LOAD_IM2COL_MULTICAST> { static constexpr bool value = true; };
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::conv::collective::detail
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/collective/sm90_implicit_gemm_gmma_ss_warpspecialized.hpp b/lightllm-kernel/cutlass/include/cutlass/conv/collective/sm90_implicit_gemm_gmma_ss_warpspecialized.hpp
deleted file mode 100755
index 78862b0a0..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/collective/sm90_implicit_gemm_gmma_ss_warpspecialized.hpp
+++ /dev/null
@@ -1,663 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cute/tensor_predicate.hpp"
-#include "cute/arch/cluster_sm90.hpp"
-#include "cute/arch/copy_sm90.hpp"
-#include "cute/atom/mma_atom.hpp"
-#include "cute/atom/copy_traits_sm90_im2col.hpp"
-#include "cute/numeric/arithmetic_tuple.hpp"
-#include "cute/algorithm/functional.hpp"
-#include "cute/algorithm/gemm.hpp"
-
-#include "cutlass/conv/detail.hpp"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/dispatch_policy.hpp"
-#include "cutlass/pipeline/pipeline.hpp"
-#include "cutlass/util/packed_stride.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::conv::collective {
-using namespace cute;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  conv::Operator ConvOp,
-  int Stages,
-  int NumSpatialDims,
-  class ClusterShape,
-  class KernelSchedule,
-  int PipelineAsyncMmaStages,
-  class TileShape_,
-  class ElementA_,
-  class ElementB_,
-  class TiledMma_,
-  class TileTraitsA_,
-  class TileTraitsB_>
-struct CollectiveConv<
-    MainloopSm90TmaGmmaWarpSpecializedImplicitGemm<
-        ConvOp, Stages, NumSpatialDims, ClusterShape, KernelSchedule, PipelineAsyncMmaStages>,
-    TileShape_,
-    ElementA_,
-    ElementB_,
-    TiledMma_,
-    TileTraitsA_,
-    TileTraitsB_>
-{
-  //
-  // Type Aliases
-  //
-  using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedImplicitGemm<
-      ConvOp, Stages, NumSpatialDims, ClusterShape, KernelSchedule, PipelineAsyncMmaStages>;
-  using TileShape = TileShape_;
-  using ElementA = ElementA_;
-  using ElementB = ElementB_;
-  using TiledMma = TiledMma_;
-  using ElementAccumulator = typename TiledMma::ValTypeC;
-  using GmemTiledCopyA = typename TileTraitsA_::GmemTiledCopy;
-  using GmemTiledCopyB = typename TileTraitsB_::GmemTiledCopy;
-  using SmemLayoutA = typename TileTraitsA_::SmemLayout;
-  using SmemLayoutB = typename TileTraitsB_::SmemLayout;
-  using ArchTag = typename DispatchPolicy::ArchTag;
-  static constexpr int NumSpatialDimensions = DispatchPolicy::NumSpatialDimensions;
-  static constexpr int NumTensorDimensions = NumSpatialDimensions + 2;
-  // Deduce the kernel-facing stride tuple types based on the dispatch policy
-  // (which is a function of the number of spatial dimensions, the algorithm, etc.)
-  using StrideA = decltype(detail::sm90_dispatch_policy_to_stride_A<DispatchPolicy>());
-  using StrideB = decltype(detail::sm90_dispatch_policy_to_stride_B<DispatchPolicy>());
-
-  using MainloopPipeline = cutlass::PipelineTmaAsync<DispatchPolicy::Stages>;
-
-  using PipelineParams = typename MainloopPipeline::Params;
-  using PipelineState  = typename cutlass::PipelineState<DispatchPolicy::Stages>;
-  
-  using ProblemShape = ConvProblemShape<ConvOp, NumSpatialDimensions>;
-
-  // TODO: move pipeline mode tiling into the collective setup phase instead
-  static_assert(rank(SmemLayoutA{}) == 3, "SmemLayout must be rank 3 (M/N, K, PIPE)");
-  static_assert((size<0>(TileShape{}) == size<0>(SmemLayoutA{})), "SmemLayout must be compatible with the tile shape.");
-  static_assert((size<2>(TileShape{}) == size<1>(SmemLayoutA{})), "SmemLayout must be compatible with the tile shape.");
-
-  static_assert(rank(SmemLayoutB{}) == 3, "SmemLayout must be rank 3 (M/N, K, PIPE)");
-  static_assert((size<1>(TileShape{}) == size<0>(SmemLayoutB{})), "SmemLayout must be compatible with the tile shape.");
-  static_assert((size<2>(TileShape{}) == size<1>(SmemLayoutB{})), "SmemLayout must be compatible with the tile shape.");
-
-  static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 1 or more.");
-  static_assert(cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeA>::value &&
-                cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeB>::value,
-                "MMA atom must source both A and B operand from smem_desc for this mainloop.");
-
-  // The tma load mode of wgrad is tiled for tensor A and im2col for tensor B while the tma load mode of fprop and dgrad
-  // kernel is im2col for tensor A and tiled for tensor B.
-  static_assert((ConvOp == conv::Operator::kWgrad
-             && (cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>))
-             || (ConvOp != conv::Operator::kWgrad
-             && (cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_IM2COL> || cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_IM2COL_MULTICAST>)),
-      "GmemTiledCopyA - invalid SM90 TMA copy atom specified.");
-  static_assert((ConvOp == conv::Operator::kWgrad
-             && (cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_IM2COL> || cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_IM2COL_MULTICAST>))
-             || (ConvOp != conv::Operator::kWgrad
-             && (cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>)),
-      "GmemTiledCopyB - invalid SM90 TMA copy atom specified.");
-
-  static constexpr bool is_im2col_A = detail::is_im2col_load<GmemTiledCopyA>::value;
-  static constexpr bool is_im2col_B = detail::is_im2col_load<GmemTiledCopyB>::value;
-
-  // TMA converts f32 input to tf32 when copying from GMEM to SMEM
-  // For all other types, cast to size equivalent uint type to avoid any rounding by TMA.
-  static constexpr bool ConvertF32toTF32A = cute::is_same_v<float, ElementA>;
-  static constexpr bool ConvertF32toTF32B = cute::is_same_v<float, ElementB>;
-  using InternalElementA = cute::conditional_t<ConvertF32toTF32A, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementA>>>;
-  using InternalElementB = cute::conditional_t<ConvertF32toTF32B, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementB>>>;
-
-  struct SharedStorage
-  {
-    struct TensorStorage : cute::aligned_struct<128, _0> {
-      cute::array_aligned<typename TiledMma::ValTypeA, cute::cosize_v<SmemLayoutA>> smem_A;
-      cute::array_aligned<typename TiledMma::ValTypeB, cute::cosize_v<SmemLayoutB>> smem_B;
-    } tensors;
-
-    using PipelineStorage = typename MainloopPipeline::SharedStorage;
-    PipelineStorage pipeline;
-  };
-  using TensorStorage = typename SharedStorage::TensorStorage;
-  using PipelineStorage = typename SharedStorage::PipelineStorage;
-
-  static constexpr int K_PIPE_MAX = DispatchPolicy::Stages;
-  static constexpr int K_PIPE_MMAS = DispatchPolicy::PipelineAsyncMmaStages;
-  static constexpr uint32_t TmaTransactionBytes =
-      (size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast<uint32_t>(sizeof(InternalElementA)))+
-      (size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast<uint32_t>(sizeof(InternalElementB)));
-
-  // Host side kernel arguments
-  struct Arguments {
-    ElementA const* ptr_A{nullptr};
-    ElementB const* ptr_B{nullptr};
-  };
-
-private:
-  // Note that for fprop and dgrad kernel, the tma load mode is im2col for tensor A and tiled for
-  // tensor B while for wgrad kernel, the tma load mode is tiled for tensor A and im2col for tensor
-  // B since operand A, B is swapped.
-  // Get tma_load_a instantce.
-  template <class TensorA>
-  static constexpr auto
-  get_tma_load_a_instance(TensorA const& tensor_a, ProblemShape const& problem_shape) {
-    if constexpr (is_im2col_A) {
-      // compute the upper and lower corners based on the conv padding
-      auto lower_corner_whd = detail::compute_lower_corner_whd(problem_shape);
-      auto upper_corner_whd = detail::compute_upper_corner_whd(problem_shape);
-      auto lower_srt = detail::compute_lower_srt(problem_shape);
-
-      // The calculation of gbasis strides for dgrad kernel needs perform negate for dilation values.
-      cute::array<int32_t, NumSpatialDimensions> stride_srt{};
-      for (int i = 0; i < NumSpatialDimensions; ++i) {
-        stride_srt[i] = ConvOp == conv::Operator::kDgrad ?
-            -problem_shape.dilation[NumSpatialDimensions-1-i] :
-            problem_shape.dilation[NumSpatialDimensions-1-i];
-      }
-  
-      return make_im2col_tma_copy(
-          GmemTiledCopyA{},
-          tensor_a,
-          SmemLayoutA{}(_,_,_0{}),
-          product_each(shape(SmemLayoutA{}(_,_,_0{}))),
-          size<1>(ClusterShape{}),
-          shape(lower_corner_whd),
-          shape(upper_corner_whd),
-          cute::reverse(shape(problem_shape.lower_padding)),
-          cute::reverse(shape(problem_shape.upper_padding)),
-          cute::reverse(shape(problem_shape.traversal_stride)),
-          shape(lower_srt),
-          shape(stride_srt));
-    }
-    // TMA tiled mode for tensor A in wgrad kernel.
-    else {
-      return make_tma_copy(
-          GmemTiledCopyA{},
-          tensor_a,
-          SmemLayoutA{}(_,_,_0{}),
-          make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})),
-          size<1>(ClusterShape{}));
-    }
-  }
-
-  // Get tma_load_b instantce.
-  template <class TensorB>
-  static constexpr auto
-  get_tma_load_b_instance(TensorB const& tensor_b, ProblemShape const& problem_shape) {
-    // TMA im2col mode for tensor B in wgrad kernel.
-    if constexpr (is_im2col_B) {
-      // compute the upper and lower corners based on the conv padding
-      auto lower_corner_whd = detail::compute_lower_corner_whd(problem_shape);
-      auto upper_corner_whd = detail::compute_upper_corner_whd(problem_shape);
-      auto lower_srt = detail::compute_lower_srt(problem_shape);
-  
-      return make_im2col_tma_copy(
-          GmemTiledCopyB{},
-          tensor_b,
-          SmemLayoutB{}(_,_,_0{}),
-          product_each(shape(SmemLayoutB{}(_,_,_0{}))),
-          size<0>(ClusterShape{}),
-          shape(lower_corner_whd),
-          shape(upper_corner_whd),
-          cute::reverse(shape(problem_shape.lower_padding)),
-          cute::reverse(shape(problem_shape.upper_padding)),
-          cute::reverse(shape(problem_shape.traversal_stride)),
-          shape(lower_srt),
-          cute::reverse(shape(problem_shape.dilation)));
-    }
-    else {
-      return make_tma_copy(
-          GmemTiledCopyB{},
-          tensor_b,
-          SmemLayoutB{}(_,_,_0{}),
-          make_shape(shape<1>(TileShape{}), shape<2>(TileShape{})),
-          size<0>(ClusterShape{}));
-    }
-  }
-
-public:
-
-  // Performs im2col transformations on the input of type ConvProblemShape
-  static constexpr auto
-  get_problem_shape_MNKL(ProblemShape const& problem_shape) {
-
-    if constexpr (is_im2col_A || is_im2col_B) {
-      // transformation + im2col linearization
-      return cutlass::conv::detail::get_linearized_problem_shape_MNKL(problem_shape);
-    }
-    else {
-      // transformation
-      return cutlass::conv::detail::get_transformed_problem_shape_MNKL(problem_shape);
-    }
-  }
-
-  // Device side kernel params
-  struct Params {
-    using _Submode = decltype(take<0,NumTensorDimensions-1>(typename ProblemShape::TensorExtent{}));
-
-    // Assumption: StrideA is congruent with Problem_MK
-    // Select TMA load type according to convolution operator.
-    using TensorShapeA = cute::conditional_t<ConvOp == conv::Operator::kWgrad,
-        decltype(repeat_like(StrideA{}, int32_t(0))),
-        decltype(make_shape(_Submode{}, int(0)))>;
-
-    using TensorShapeB = cute::conditional_t<ConvOp == conv::Operator::kWgrad,
-        decltype(make_shape(int(0), _Submode{})),
-        decltype(repeat_like(StrideB{}, int32_t(0)))>;
-
-    using TMA_A = decltype(get_tma_load_a_instance(
-        make_tensor(
-            make_gmem_ptr(static_cast<InternalElementA const*>(nullptr)),
-            make_layout(TensorShapeA{}, StrideA{})),
-        ConvProblemShape<ConvOp, NumSpatialDimensions>{}));
-
-    using TMA_B = decltype(get_tma_load_b_instance(
-        make_tensor(
-            make_gmem_ptr(static_cast<InternalElementB const*>(nullptr)),
-            make_layout(TensorShapeB{}, StrideB{})),
-        ConvProblemShape<ConvOp, NumSpatialDimensions>{}));
-
-    // Members
-    TMA_A tma_load_a;
-    TMA_B tma_load_b;
-    uint32_t tma_transaction_bytes = TmaTransactionBytes;
-  };
-
-  //
-  // Methods
-  //
-
-  // Lowers the host side user facing arguments to the kernel facing lauch params
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    (void) workspace;
-    // from the flat problem shape arrays of ConvProblemShape<ConvOp, N>, create a rank-3 MNK problem shape tuple
-    // tma desc creation depends on the original untransformed domain.
-
-    // A extents.
-    auto shape_A_orig = problem_shape.get_shape_A();
-    // B extents.
-    auto shape_B_orig = problem_shape.get_shape_B();
-
-    // Fill inferred cute strides from flat stride arrays
-    auto dA = make_cute_packed_stride(StrideA{}, problem_shape.stride_A, ConvOp);
-    auto dB = make_cute_packed_stride(StrideB{}, problem_shape.stride_B, ConvOp);
-
-    auto ptr_A = reinterpret_cast<InternalElementA const*>(args.ptr_A);
-    auto ptr_B = reinterpret_cast<InternalElementB const*>(args.ptr_B);
-
-    Tensor tensor_a = make_tensor(make_gmem_ptr(ptr_A), make_layout(shape_A_orig, dA));
-    Tensor tensor_b = make_tensor(make_gmem_ptr(ptr_B), make_layout(shape_B_orig, dB));
-
-    auto tma_load_a = get_tma_load_a_instance(tensor_a, problem_shape);
-    auto tma_load_b = get_tma_load_b_instance(tensor_b, problem_shape);
-
-    return {
-      tma_load_a,
-      tma_load_b,
-      TmaTransactionBytes
-    };
-  }
-  
-  template <class ProblemShape>
-  static bool
-  can_implement(
-      ProblemShape const& problem_shape,
-      Arguments const& args) {
-    // Activation and Filter channel mode extents much match
-    bool implementable = true;
-    // channel mode is major
-    implementable &= problem_shape.stride_A[NumTensorDimensions-1] == 1;
-    implementable &= problem_shape.stride_B[NumTensorDimensions-1] == 1;
-
-    constexpr int tma_alignment_bits = 128;
-    // A extents.
-    auto shape_A_orig = problem_shape.get_shape_A();
-    // B extents.
-    auto shape_B_orig = problem_shape.get_shape_B();
-    constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
-    implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_A>(shape_A_orig, StrideA{});
-    constexpr int min_tma_aligned_elements_B = tma_alignment_bits / cutlass::sizeof_bits<ElementB>::value;
-    implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_B>(shape_B_orig, StrideB{});
-
-    if (!implementable) {
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
-      return false;
-    }
-
-    // Check valid padding values for TMA_LOAD_IM2COL
-    constexpr int padding_limit = (ProblemShape::RankS == 1) ? 65536 : (ProblemShape::RankS == 2 ? 256 : 16);
-    for (int i = 0; i < problem_shape.RankS; ++i) {
-      implementable = implementable && problem_shape.lower_padding[i] <= padding_limit && problem_shape.lower_padding[i] >= 0;
-      implementable = implementable && problem_shape.upper_padding[i] <= padding_limit && problem_shape.upper_padding[i] >= 0;
-    }
-
-    if (!implementable) {
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Padding values don't meet requirements for TMA LOAD IM2COL.\n");
-      return false;
-    }
-
-    if (problem_shape.groups > 1) {
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: This kernel does not support conv groups > 1.\n");
-      return false;
-    }
-
-    return true;
-  }
-
-  /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance
-  CUTLASS_DEVICE
-  static void prefetch_tma_descriptors(Params const& mainloop_params) {
-    cute::prefetch_tma_descriptor(mainloop_params.tma_load_a.get_tma_descriptor());
-    cute::prefetch_tma_descriptor(mainloop_params.tma_load_b.get_tma_descriptor());
-  }
-
-  /// Set up the data needed by this collective for load and mma.
-  /// Returns a tuple of tensors. The collective and the kernel layer have the contract
-  /// Returned tuple must contain at least two elements, with the first two elements being:
-  /// gA_mk - The tma tensor, A after a local tile so it has shape  (BLK_M,BLK_K,m,k)
-  /// gB_nk - The tma tensor, B after a local tile so it has shape  (BLK_N,BLK_K,n,k)
-  /// The rest of the tensors can be specified as needed by this collective.
-  /// The dimensions of gA_mk and gA_nk do not contain L to maintain consistency with 
-  /// StrideA and StrideB set up for TMA 
-  template <class ProblemShapeMNKL>
-  CUTLASS_DEVICE auto
-  load_init(ProblemShapeMNKL const& problem_shape_MNKL, Params const& mainloop_params){
-  //load_init(ProblemShapeMNKL const& problem_shape_MNKL, Params const& mainloop_params) const {
-    using X = Underscore;
-    // Separate out problem shape for convenience
-    auto [M, N, K, L] = problem_shape_MNKL;
-
-    // TMA requires special handling of strides to deal with coord codomain mapping
-    // Represent the full tensors -- get these from TMA
-    Tensor mA_mk = mainloop_params.tma_load_a.get_tma_tensor(make_shape(M,K));                            // (m,k)
-    Tensor mB_nk = mainloop_params.tma_load_b.get_tma_tensor(make_shape(N,K));                            // (n,k)
-
-    // Make tiled views, defer the slice
-    Tensor gA_mk = local_tile(mA_mk, TileShape{}, make_coord(_,_,_), Step<_1, X,_1>{});        // (BLK_M,BLK_K,m,k)
-    Tensor gB_nk = local_tile(mB_nk, TileShape{}, make_coord(_,_,_), Step< X,_1,_1>{});        // (BLK_N,BLK_K,n,k)
-
-    return cute::make_tuple(gA_mk, gB_nk);
-  }
-
-  /// Perform a collective-scoped matrix multiply-accumulate
-  /// Producer Perspective
-  template <
-    class TensorA, class TensorB,
-    class KTileIterator, class BlockCoord
-  >
-  CUTLASS_DEVICE void
-  load(
-      Params const& mainloop_params,
-      MainloopPipeline pipeline,
-      PipelineState smem_pipe_producer_state,
-      cute::tuple<TensorA, TensorB> const& load_inputs,
-      BlockCoord const& blk_coord,
-      KTileIterator k_tile_iter, int k_tile_count,
-      int thread_idx,
-      uint32_t block_rank_in_cluster,
-      TensorStorage& shared_tensors) {
-
-    int lane_predicate = cute::elect_one_sync();
-    if (lane_predicate) {
-      Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});        // (BLK_M,BLK_K,PIPE)
-      Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});        // (BLK_N,BLK_K,PIPE)
-
-      //
-      // Prepare the TMA loads for A and B
-      //
-      constexpr uint32_t cluster_shape_x = get<0>(ClusterShape());
-
-      uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x};
-      auto block_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y);
-      auto block_tma_b = mainloop_params.tma_load_b.get_slice(cluster_local_block_id.x);
-
-      auto [gA_mk, gB_nk] = load_inputs;
-
-      // Partition the inputs based on the current block coordinates.
-      auto [m_coord, n_coord, k_coord, l_coord] = blk_coord;
-
-      Tensor gA = gA_mk(_,_,m_coord,_);                                                     // (BLK_M,BLK_K,k)
-      Tensor gB = gB_nk(_,_,n_coord,_);                                                     // (BLK_N,BLK_K,k)
-
-      // Applies the mapping from block_tma_a
-      Tensor tAgA = block_tma_a.partition_S(gA);                                                 // (TMA,TMA_M,TMA_K,k)
-      Tensor tAsA = block_tma_a.partition_D(sA);                                              // (TMA,TMA_M,TMA_K,PIPE)
-
-      Tensor tBgB = block_tma_b.partition_S(gB);                                                 // (TMA,TMA_N,TMA_K,k)
-      Tensor tBsB = block_tma_b.partition_D(sB);                                              // (TMA,TMA_N,TMA_K,PIPE)
-
-      uint16_t mcast_mask_a = 0;
-      uint16_t mcast_mask_b = 0;
-
-      // Issue TmaLoads
-      // Maps the tile -> block, value
-      if constexpr (cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_IM2COL_MULTICAST> ||
-                    cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>) {
-        auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{}; // (m,n) -> block_id
-        for (int n = 0; n < size<1>(block_layout); ++n) {
-          mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{}));
-        }
-      }
-
-      if constexpr (cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_IM2COL_MULTICAST> ||
-                    cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>) {
-        auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{}; // (m,n) -> block_id
-        for (int m = 0; m < size<0>(block_layout); ++m) {
-          mcast_mask_b |= (uint16_t(1) << block_layout(m,cluster_local_block_id.y,Int<0>{}));
-        }
-      }
-
-      // Mainloop
-      CUTLASS_PRAGMA_NO_UNROLL
-      for ( ; k_tile_count > 0; --k_tile_count) {
-        // LOCK smem_pipe_producer_state for _writing_
-        pipeline.producer_acquire(smem_pipe_producer_state);
-
-        //
-        // Copy gmem to smem for *k_tile_iter
-        //
-
-        using BarrierType = typename MainloopPipeline::ProducerBarrierType;
-        BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_producer_state);
-
-        int write_stage = smem_pipe_producer_state.index();
-
-        copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage));
-        copy(mainloop_params.tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage));
-        ++k_tile_iter;
-
-        // Advance smem_pipe_producer_state
-        ++smem_pipe_producer_state;
-      }
-    }
-  }
-
-  /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster
-  CUTLASS_DEVICE void
-  load_tail(MainloopPipeline pipeline, PipelineState smem_pipe_producer_state) {
-    int lane_predicate = cute::elect_one_sync();
-
-    // Issue the epilogue waits
-    if (lane_predicate) {
-      /* This helps avoid early exit of blocks in Cluster
-       * Waits for all stages to either be released (all 
-       * Consumer UNLOCKs), or if the stage was never used
-       * then would just be acquired since the phase was 
-       * still inverted from make_producer_start_state
-       */
-      pipeline.producer_tail(smem_pipe_producer_state);
-    }
-  }
-
-  /// Perform a collective-scoped matrix multiply-accumulate
-  /// Consumer Perspective
-  template <class FrgTensorC>
-  CUTLASS_DEVICE void
-  mma(MainloopPipeline pipeline,
-      PipelineState smem_pipe_consumer_state,
-      FrgTensorC& accum,
-      int k_tile_count,
-      int thread_idx,
-      TensorStorage& shared_tensors,
-      Params const& mainloop_params) {
-    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
-
-    Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});          // (BLK_M,BLK_K,PIPE)
-    Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});          // (BLK_N,BLK_K,PIPE)
-
-    //
-    // Define C accumulators and A/B partitioning
-    //
-
-    TiledMma tiled_mma;
-    auto thread_mma = tiled_mma.get_thread_slice(thread_idx);
-
-    Tensor tCsA = thread_mma.partition_A(sA);                                                 // (MMA,MMA_M,MMA_K,PIPE)
-    Tensor tCsB = thread_mma.partition_B(sB);                                                 // (MMA,MMA_N,MMA_K,PIPE)
-
-    // Allocate "fragments/descriptors"
-    Tensor tCrA = thread_mma.make_fragment_A(tCsA);                                           // (MMA,MMA_M,MMA_K,PIPE)
-    Tensor tCrB = thread_mma.make_fragment_B(tCsB);                                           // (MMA,MMA_N,MMA_K,PIPE)
-
-    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum));                                                         // M
-    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum));                                                         // N
-    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB));                                                          // K
-    CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB));                                                       // PIPE
-    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));                                         // PIPE
-    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));                                         // PIPE
-
-    //
-    // PIPELINED MAIN LOOP
-    //
-    static_assert((0 <= K_PIPE_MMAS) && (K_PIPE_MMAS <  K_PIPE_MAX),
-        "ERROR : Incorrect number of MMAs in flight");
-
-    // We release buffers to producer warps(dma load) with some mmas in flight
-    PipelineState smem_pipe_release = smem_pipe_consumer_state;
-
-    // Prologue GMMAs
-    int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
-
-    tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
-
-    warpgroup_fence_operand(accum);
-    CUTLASS_PRAGMA_UNROLL
-    for (int k_tile_prologue = prologue_mma_count; k_tile_prologue > 0; --k_tile_prologue) {
-      // WAIT on smem_pipe_consumer_state until its data are available (phase bit flips from rdPhaseBit value)
-      pipeline.consumer_wait(smem_pipe_consumer_state);
-
-      int read_stage = smem_pipe_consumer_state.index();
-      warpgroup_arrive();
-      // Unroll the K mode manually to set scale D to 1
-      CUTLASS_PRAGMA_UNROLL
-      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
-        // (V,M,K) x (V,N,K) => (V,M,N)
-        cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accum);
-        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
-      }
-
-      warpgroup_commit_batch();
-
-      ++smem_pipe_consumer_state;
-    }
-
-    warpgroup_fence_operand(accum);
-    // Mainloop GMMAs
-    k_tile_count -= prologue_mma_count;
-
-    CUTLASS_PRAGMA_NO_UNROLL
-    for ( ; k_tile_count > 0; --k_tile_count) {
-      // WAIT on smem_pipe_consumer_state until its data are available (phase bit flips from rdPhaseBit value)
-      pipeline.consumer_wait(smem_pipe_consumer_state);
-
-      //
-      // Compute on k_tile
-      //
-
-      int read_stage = smem_pipe_consumer_state.index();
-      warpgroup_fence_operand(accum);
-      warpgroup_arrive();
-      // Unroll the K mode manually to set scale D to 1
-      CUTLASS_PRAGMA_UNROLL
-      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
-        // (V,M) x (V,N) => (V,M,N)
-        cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accum);
-        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
-      }
-      warpgroup_commit_batch();
-
-      /// Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure smem_pipe_producer_state is consumed
-      warpgroup_wait<K_PIPE_MMAS>();
-      warpgroup_fence_operand(accum);
-
-      // UNLOCK smem_pipe_release, done _computing_ on it
-      pipeline.consumer_release(smem_pipe_release);
-
-      // Advance smem_pipe_consumer_state and smem_pipe_release
-      ++smem_pipe_consumer_state;
-      ++smem_pipe_release;
-    }
-
-    warpgroup_fence_operand(accum);
-  }
-
-  /// Perform a Consumer Epilogue to release all buffers
-  CUTLASS_DEVICE void
-  mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) {
-    // Prologue GMMAs
-    int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
-    k_tile_count -= prologue_mma_count;
-
-    smem_pipe_release.advance(k_tile_count);
-    
-    // Wait on all GMMAs to complete
-    warpgroup_wait<0>();
-
-    for (int count = 0; count < prologue_mma_count; ++count) {
-      pipeline.consumer_release(smem_pipe_release);                 // UNLOCK smem_pipe_release, done _computing_ on it
-      ++smem_pipe_release;
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::conv::collective
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/conv2d_problem_size.h b/lightllm-kernel/cutlass/include/cutlass/conv/conv2d_problem_size.h
deleted file mode 100755
index d2e895299..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/conv2d_problem_size.h
+++ /dev/null
@@ -1,654 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief This file contains definitions and utility functions for describing convolution problem sizes.
-
-  Conv2dProblem desciption:
-    activation (NHWC), 
-    filter (KRSC), 
-    output (NPQK), 
-    pading (pad_h, pad_w),
-    stride (stride_h, stride_w),
-    dilation (dilation_h, dilation_w).
-    
-  Free functions to map:
-    Map tensor extents (Conv2d -> ImplicitGemm)      : implicit_gemm_tensor_[a|b|c]_extent(ConvolutionOperator)
-    Map tensor sizes (Conv2d -> ImplicitGemm)        : implicit_gemm_tensor_[a|b|c]_size(ConvolutionOperator)
-    Map tensor problem sizes (Conv2d -> ImplicitGemm): implicit_gemm_problem_size(ConvolutionOperator)
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/tensor_coord.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/gemm/gemm_enumerated_types.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/functional.h"
-
-namespace cutlass {
-namespace conv {
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Problem size structure
-struct Conv2dProblemSize {
-
-  // Conv2d strictly problem size parameters
-  int N, H, W, C, P, Q, K, R, S;
-  int pad_h, pad_w;
-  int stride_h, stride_w;
-  int dilation_h, dilation_w;
-  Mode mode;
-
-  // Conv2d implementation-related parameters 
-  int split_k_slices;
-  int groups;
-
-  //
-  // Methods
-  //
-
-public:
-  CUTLASS_HOST_DEVICE
-  Conv2dProblemSize():
-    N(0), H(0), W(0), C(0), P(0), Q(0), K(0), R(0), S(0),
-    pad_h(0), pad_w(0), stride_h(1), stride_w(1), dilation_h(1), dilation_w(1),
-    mode(Mode::kConvolution), split_k_slices(1), groups(1) { }
- 
-  /// Constructor for default padding, stride, dilation, and split-K
-  CUTLASS_HOST_DEVICE
-  Conv2dProblemSize(
-    int N,
-    int H,
-    int W,
-    int C,
-    int P,
-    int Q,
-    int K,
-    int R,
-    int S,
-    Mode mode
-  ): 
-    N(N), H(H), W(W), C(C), P(P), Q(Q), K(K), R(R), S(S),
-    pad_h(R / 2), pad_w(S / 2), stride_h(1), stride_w(1), dilation_h(1), dilation_w(1),
-    mode(mode), split_k_slices(1), groups (1) { }
-  
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  Conv2dProblemSize(
-    int N,
-    int H,
-    int W,
-    int C,
-    int K,
-    int R,
-    int S,
-    int P,
-    int Q,
-    int pad_h,
-    int pad_w,
-    int stride_h,
-    int stride_w,
-    int dilation_h,
-    int dilation_w,
-    Mode mode,
-    int split_k_slices = 1,
-    int groups = 1
-  ):
-    N(N), H(H), W(W), C(C), P(P), Q(Q), K(K), R(R), S(S),
-    pad_h(pad_h), pad_w(pad_w), stride_h(stride_h), stride_w(stride_w),
-    dilation_h(dilation_h), dilation_w(dilation_w), 
-    mode(mode), split_k_slices(split_k_slices), groups (groups) { }
-
-  /// Constructs convolution problem size from cutlass Tensor4DCoord and MatrixCoord 
-  // set user-defined output size and sets P and Q (include all data members in ctor)
-  CUTLASS_HOST_DEVICE
-  Conv2dProblemSize(
-    cutlass::Tensor4DCoord input_size,    // NHWC
-    cutlass::Tensor4DCoord filter_size,   // KRSC
-    cutlass::Tensor4DCoord padding,       // pad_h, _, pad_w, _
-    cutlass::MatrixCoord stride,          // stride_h, stride_w
-    cutlass::MatrixCoord dilation,        // dilation_h, dilation_w
-    cutlass::Tensor4DCoord output_size,   // NPQK
-    cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation,
-    int split_k_slices = 1,
-    int groups = 1
-  ):
-    N(input_size.n()), H(input_size.h()), W(input_size.w()), C(input_size.c()),
-    P(output_size.h()), Q(output_size.w()),
-    K(filter_size.n()), R(filter_size.h()), S(filter_size.w()),
-    pad_h(padding[0]), pad_w(padding[2]),
-    stride_h(stride.row()), stride_w(stride.column()),
-    dilation_h(dilation.row()), dilation_w(dilation.column()),
-    mode(mode), split_k_slices(split_k_slices), groups(groups) {}
-
-  /// Constructs convolution problem size from cutlass Tensor4DCoord and MatrixCoord 
-  // computes output size and sets P and Q (skip output from ctor arguments)
-  CUTLASS_HOST_DEVICE  
-  Conv2dProblemSize(
-    cutlass::Tensor4DCoord input_size,   // NHWC
-    cutlass::Tensor4DCoord filter_size,  // KRSC
-    cutlass::Tensor4DCoord padding,      // pad_h, upper_pad_h, pad_w, upper_pad_w
-    cutlass::MatrixCoord stride,         // stride_h, stride_w
-    cutlass::MatrixCoord dilation,       // dilation_h, dilation_w
-    cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation,
-    int split_k_slices = 1,
-    int groups = 1
-  ):
-    N(input_size.n()), H(input_size.h()), W(input_size.w()), C(input_size.c()),
-    K(filter_size.n()), R(filter_size.h()), S(filter_size.w()),
-    pad_h(padding[0]), pad_w(padding[2]),
-    stride_h(stride.row()), stride_w(stride.column()),
-    dilation_h(dilation.row()), dilation_w(dilation.column()),
-    mode(mode), split_k_slices(split_k_slices), groups(groups) {
-      // set output P and Q
-      P = ((H + pad_h + padding[1] - R * dilation_h) / stride_h) + 1;
-      Q = ((W + pad_w + padding[3] - S * dilation_w) / stride_w) + 1;
-    }
-
-  /// Constructs convolution problem size from cutlass Tensor4DCoord and MatrixCoord 
-  // set user-defined output size and sets P and Q (skip padding, striding, and dilation)
-  CUTLASS_HOST_DEVICE
-  Conv2dProblemSize(
-    cutlass::Tensor4DCoord input_size,    // NHWC
-    cutlass::Tensor4DCoord filter_size,   // KRSC
-    cutlass::Tensor4DCoord output_size,   // NPQK
-    cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation,
-    int split_k_slices = 1,
-    int groups = 1
-  ):
-    N(input_size.n()), H(input_size.h()), W(input_size.w()), C(input_size.c()),
-    P(output_size.h()), Q(output_size.w()),
-    K(filter_size.n()), R(filter_size.h()), S(filter_size.w()),
-    pad_h(R / 2), pad_w(S / 2), stride_h(1), stride_w(1),
-    dilation_h(1), dilation_w(1),
-    mode(mode), split_k_slices(split_k_slices), groups(groups) {}
-
-  // Reset covolution mode in the problem
-  CUTLASS_HOST_DEVICE
-  Conv2dProblemSize reset_mode(cutlass::conv::Mode mode_) {
-    Conv2dProblemSize tmp(*this);
-    tmp.mode = mode_; 
-    return tmp; 
-  }
-
-  // Reset covolution mode in the problem
-  CUTLASS_HOST_DEVICE
-  Conv2dProblemSize reset_split_k_slices(int split_k_slices_) {
-    Conv2dProblemSize tmp(*this);
-    tmp.split_k_slices = split_k_slices_; 
-    return tmp; 
-  }
-
-  /// Equality operator (ignores mode and split_k_slice)
-  CUTLASS_HOST_DEVICE
-  bool operator==(Conv2dProblemSize const &conv) const {
-    return (
-      (N == conv.N) && (H == conv.H) && (W == conv.W) && (C == conv.C) &&
-      (K == conv.K) && (R == conv.R) && (S == conv.S) &&
-      (P == conv.P) && (Q == conv.Q) &&
-      (pad_h == conv.pad_h) && (pad_w == conv.pad_w) &&
-      (stride_h == conv.stride_h) && (stride_w == conv.stride_w) &&
-      (dilation_h == conv.dilation_h) && (dilation_w == conv.dilation_w)
-    );  
-  }
-
-  /// Inequality operator
-  CUTLASS_HOST_DEVICE
-  bool operator!=(Conv2dProblemSize const &rhs) const {
-    return !(*this == rhs);
-  }
-
-  /// Returns activation extent as Tensor4DCoord
-  CUTLASS_HOST_DEVICE
-  cutlass::Tensor4DCoord activation_extent() const {
-
-    return cutlass::Tensor4DCoord ({N, H, W, C});
-  }
-
-  /// Returns filter extent as Tensor4DCoord
-  CUTLASS_HOST_DEVICE
-  cutlass::Tensor4DCoord filter_extent(bool is_deconv = false) const {
-
-    return is_deconv ? cutlass::Tensor4DCoord ({C, R, S, K / groups})
-        : cutlass::Tensor4DCoord ({K, R, S, C / groups});
-  }
-
-  /// Returns output extent as Tensor4DCoord
-  CUTLASS_HOST_DEVICE
-  cutlass::Tensor4DCoord output_extent() const {
-
-    return cutlass::Tensor4DCoord ({N, P, Q, K});
-  }
-
-  /// Returns activation size in number of elements
-  CUTLASS_HOST_DEVICE
-  int64_t activation_size() const {
-
-    return (N * H * W * C);
-  }
-
-  /// Returns filter size in number of elements
-  CUTLASS_HOST_DEVICE
-  int64_t filter_size() const {
-
-    return (K * R * S * C / groups);
-  }
-
-  /// Returns output size in number of elements
-  CUTLASS_HOST_DEVICE
-  int64_t output_size() const {
-
-    return (N * P * Q * K);
-  }
-  
-  /// Returns padding as Tensor4DCoord
-  CUTLASS_HOST_DEVICE
-  cutlass::Tensor4DCoord padding() const {
-
-    return cutlass::Tensor4DCoord ({pad_h, pad_h, pad_w, pad_w});
-  }
-
-  /// Returns stride as MatrixCoord
-  CUTLASS_HOST_DEVICE
-  cutlass::MatrixCoord stride() const {
-
-    return cutlass::MatrixCoord ({stride_h, stride_w});
-  }
-
-  /// Returns dilation as MatrixCoord
-  CUTLASS_HOST_DEVICE
-  cutlass::MatrixCoord dilation() const {
-
-    return cutlass::MatrixCoord ({dilation_h, dilation_w});
-  }
-
-  /////////////////////////////////////////////////////////////////
-  //        Methods used for strided dgrad implementation
-  /////////////////////////////////////////////////////////////////
-  /// Number of filter r positions to accumulate in gemm-k dim
-  CUTLASS_HOST_DEVICE
-  int num_gemm_k_filter_r(int r) const {
-    return ((R - r + stride_h - 1) / stride_h);
-  }
-
-  /// Number of filter s positions to accumulate in gemm-k dim
-  CUTLASS_HOST_DEVICE
-  int num_gemm_k_filter_s(int s) const {
-    return ((S - s + stride_w - 1) / stride_w);
-  }
-
-  /// Number of filter positions to accumulate in gemm-k dim
-  CUTLASS_HOST_DEVICE
-  int num_gemm_k_filter_positions(int r, int s) const {
-    return num_gemm_k_filter_r(r) * num_gemm_k_filter_s(s);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-//                                  ImplicitGemm helper functions                                 //
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Determine the problem size of the implicit GEMM operation
-CUTLASS_HOST_DEVICE
-cutlass::gemm::GemmCoord implicit_gemm_problem_size(
-  Operator conv_operator, 
-  Conv2dProblemSize const &problem_size) {
-  // Compute problem size
-  switch (conv_operator) {
-  case Operator::kFprop:
-    return gemm::GemmCoord(
-      problem_size.N * problem_size.P * problem_size.Q,
-      problem_size.K,
-      problem_size.R * problem_size.S * problem_size.C / problem_size.groups
-    );
-  case Operator::kDeconv:
-  case Operator::kDgrad:
-    return gemm::GemmCoord(
-      problem_size.N * problem_size.H * problem_size.W,
-      problem_size.C,
-      problem_size.R * problem_size.S * problem_size.K
-    );
-  case Operator::kWgrad:
-    return gemm::GemmCoord(
-      problem_size.K,
-      problem_size.R * problem_size.S * problem_size.C,
-      problem_size.N * problem_size.P * problem_size.Q
-    );
-  default:
-    break;
-  }
-  return gemm::GemmCoord();
-}
-
-// Determine the number of gemm_k iterations for conv2d problem using implicit gemm algorithm
-CUTLASS_HOST_DEVICE
-int implicit_gemm_k_iterations(
-  Operator conv_operator, 
-  int threadblock_K, 
-  Conv2dProblemSize const &problem_size,
-  IteratorAlgorithm algorithm = IteratorAlgorithm::kAnalytic,
-  GroupMode group_mode = GroupMode::kNone,
-  int threadblock_N = 0) {
-
-  int iterations = 0;
-
-  if (group_mode == GroupMode::kNone) {
-
-    if (algorithm == IteratorAlgorithm::kFixedChannels) {
-
-      int positions_per_iteration = threadblock_K / problem_size.C;
-      switch (conv_operator) {
-      case Operator::kFprop:
-        iterations = (problem_size.R * problem_size.S + positions_per_iteration - 1 ) / positions_per_iteration;
-        break;
-
-      default:
-        break;
-      }
-    }
-    else if (algorithm == IteratorAlgorithm::kFewChannels) {
-
-      switch (conv_operator) {
-      case Operator::kFprop:
-        iterations = (problem_size.R * problem_size.S * problem_size.C + threadblock_K - 1 ) / threadblock_K;
-        break;
-
-      default:
-        break;
-      }
-    }
-    else {
-      int elements_per_split_k_slice = 0;
-
-      switch (conv_operator) {
-      case Operator::kFprop:
-        elements_per_split_k_slice = (problem_size.C + problem_size.split_k_slices - 1) / problem_size.split_k_slices;
-        iterations = problem_size.R * problem_size.S * ((elements_per_split_k_slice + threadblock_K - 1) / threadblock_K);
-        break;
-
-      case Operator::kDeconv:
-      case Operator::kDgrad:
-        elements_per_split_k_slice = (problem_size.K + problem_size.split_k_slices - 1) / problem_size.split_k_slices;
-        iterations = problem_size.R * problem_size.S * ((elements_per_split_k_slice + threadblock_K - 1) / threadblock_K);
-        break;
-
-      case Operator::kWgrad:
-        elements_per_split_k_slice = (problem_size.N * problem_size.P * problem_size.Q + problem_size.split_k_slices - 1) / problem_size.split_k_slices;
-        iterations = (elements_per_split_k_slice + threadblock_K - 1) / threadblock_K;
-        break;
-
-      default:
-        break;
-      }
-    }
-
-  } else if (group_mode == GroupMode::kDepthwise) {
-    int channels_per_cta = threadblock_N;
-
-    if (algorithm == IteratorAlgorithm::kAnalytic) {
-      switch (conv_operator) {
-        case Operator::kFprop:
-          iterations = problem_size.R * problem_size.S *
-                       ((channels_per_cta + threadblock_K - 1) / threadblock_K);
-          break;
-
-        default:
-          break;
-      }
-    }
-  } else {  // Group conv
-
-    int channels_per_group = problem_size.C / problem_size.groups;
-    int k_per_group = problem_size.K / problem_size.groups;
-
-    if (algorithm == IteratorAlgorithm::kAnalytic) {
-      switch (conv_operator) {
-        case Operator::kFprop:
-          iterations = problem_size.R * problem_size.S * ((channels_per_group + threadblock_K - 1) / threadblock_K);
-          // In group conv, if k_per_group < threadblock_N, one Threadblock will calculate multiple groups
-          if (problem_size.groups != 1) {
-            if (k_per_group < threadblock_N) {
-              iterations *= threadblock_N / k_per_group;
-            }
-          }
-          break;
-
-        default:
-          break;
-      }
-    } else if (algorithm == IteratorAlgorithm::kOptimized) {
-      // Current optimized iterator only support GroupMode::kSingleGroup
-      if (group_mode == GroupMode::kSingleGroup) {
-        switch (conv_operator) {
-          case Operator::kFprop:
-            iterations = problem_size.R * problem_size.S * ((channels_per_group + threadblock_K - 1) / threadblock_K);
-            break;
-
-          default:
-            break;
-        }
-      }
-    }
-
-  }
-
-  return iterations;
-}
-
-
-template <int N = 1, int Output_P = 1, int Output_Q = 1>
-CUTLASS_HOST_DEVICE
-int depthwise_gemm_k_iterations(
-  Operator conv_operator, 
-  int threadblock_K, 
-  Conv2dProblemSize const &problem_size,
-  IteratorAlgorithm algorithm = IteratorAlgorithm::kAnalytic,
-  GroupMode group_mode = GroupMode::kNone,
-  int threadblock_N = 0) {
-
-    int n =  problem_size.N;
-    int p = (problem_size.P + Output_P - 1) /  Output_P;
-    int q = (problem_size.Q + Output_Q - 1) /  Output_Q;
-
-    int iterations = (n * p * q + problem_size.split_k_slices - 1) / problem_size.split_k_slices;
-    return iterations;
-}
-
-
-CUTLASS_HOST_DEVICE
-int implicit_gemm_k_iterations_per_channel(
-    Operator conv_operator,
-    Conv2dProblemSize const &problem_size,
-    IteratorAlgorithm algorithm = IteratorAlgorithm::kAnalytic) {
-
-  int iterations = 0; //0 means not applicable
-  if (algorithm == IteratorAlgorithm::kAnalytic || algorithm == IteratorAlgorithm::kOptimized) {
-    switch (conv_operator) {
-      case Operator::kFprop:
-        iterations = problem_size.R * problem_size.S;
-        break;
-
-      case Operator::kDeconv:
-      case Operator::kDgrad:
-        iterations = problem_size.R * problem_size.S;
-        break;
-
-      default:
-        break;
-    }
-  }
-  return iterations;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-//  Mapping function (ImplicitGemm A, B, C -> Conv Activation, Filter, Output)
-////////////////////////////////////////////////////////////////////////////////
-/// Returns ImplicitGemm tensor A extent as Tensor4DCoord
-CUTLASS_HOST_DEVICE
-cutlass::Tensor4DCoord implicit_gemm_tensor_a_extent(
-  Operator conv_operator,
-  Conv2dProblemSize const &problem_size) {
-  switch (conv_operator) {
-    case cutlass::conv::Operator::kFprop: return problem_size.activation_extent();
-    case cutlass::conv::Operator::kDeconv:
-    case cutlass::conv::Operator::kDgrad: return problem_size.output_extent();
-    case cutlass::conv::Operator::kWgrad: return problem_size.output_extent();
-    default : break;
-  }
-  return cutlass::Tensor4DCoord();
-}
-
-/// Returns ImplicitGemm tensor B extent as Tensor4DCoord
-CUTLASS_HOST_DEVICE
-cutlass::Tensor4DCoord implicit_gemm_tensor_b_extent(
-  Operator conv_operator,
-  Conv2dProblemSize const &problem_size) {
-  switch (conv_operator) {
-    case cutlass::conv::Operator::kFprop: return problem_size.filter_extent();
-    case cutlass::conv::Operator::kDeconv: return problem_size.filter_extent(true);
-    case cutlass::conv::Operator::kDgrad: return problem_size.filter_extent();
-    case cutlass::conv::Operator::kWgrad: return problem_size.activation_extent();
-    default : break;
-  }
-  return cutlass::Tensor4DCoord();
-}
-
-/// Returns ImplicitGemm tensor C extent as Tensor4DCoord
-CUTLASS_HOST_DEVICE
-cutlass::Tensor4DCoord implicit_gemm_tensor_c_extent(
-  Operator conv_operator,
-  Conv2dProblemSize const &problem_size) {
-  switch (conv_operator) {
-    case cutlass::conv::Operator::kFprop: return problem_size.output_extent();
-    case cutlass::conv::Operator::kDeconv:
-    case cutlass::conv::Operator::kDgrad: return problem_size.activation_extent();
-    case cutlass::conv::Operator::kWgrad: return problem_size.filter_extent();
-    default : break;
-  }
-  return cutlass::Tensor4DCoord();
-}
-
-/// Returns ImplicitGemm tensor A size in number of elements
-CUTLASS_HOST_DEVICE
-int64_t implicit_gemm_tensor_a_size(
-  Operator conv_operator,
-  Conv2dProblemSize const &problem_size) {
-  switch (conv_operator) {
-    case cutlass::conv::Operator::kFprop: return problem_size.activation_size();
-    case cutlass::conv::Operator::kDeconv:
-    case cutlass::conv::Operator::kDgrad: return problem_size.output_size();
-    case cutlass::conv::Operator::kWgrad: return problem_size.output_size();
-    default : break;
-  }
-  return 0;
-}
-
-/// Returns ImplicitGemm tensor B size in number of elements
-CUTLASS_HOST_DEVICE
-int64_t implicit_gemm_tensor_b_size(
-  Operator conv_operator,
-  Conv2dProblemSize const &problem_size) {
-  switch (conv_operator) {
-    case cutlass::conv::Operator::kFprop: return problem_size.filter_size();
-    case cutlass::conv::Operator::kDeconv:
-    case cutlass::conv::Operator::kDgrad: return problem_size.filter_size();
-    case cutlass::conv::Operator::kWgrad: return problem_size.activation_size();
-    default : break;
-  }
-  return 0;
-}
-
-/// Returns ImplicitGemm tensor C size in number of elements
-CUTLASS_HOST_DEVICE
-int64_t implicit_gemm_tensor_c_size(
-  Operator conv_operator,
-  Conv2dProblemSize const &problem_size) {
-  switch (conv_operator) {
-    case cutlass::conv::Operator::kFprop: return problem_size.output_size();
-    case cutlass::conv::Operator::kDeconv:
-    case cutlass::conv::Operator::kDgrad: return problem_size.activation_size();
-    case cutlass::conv::Operator::kWgrad: return problem_size.filter_size();
-    default : break;
-  }
-  return 0;
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-//                                  Strided dgrad helper functions                                 //
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Returns number of CTAs tile M to cover valid MMAs per starting filter postion
-CUTLASS_HOST_DEVICE
-int strided_dgrad_tile_m_per_filter(
-  Conv2dProblemSize const &problem_size,
-  int tile_size_m) {
-
-  // Compute NHW rows in Dx output that needs MMA per starting filter position
-  int rows_h_per_filter = (problem_size.H + problem_size.stride_h - 1) / problem_size.stride_h;
-  int rows_w_per_filter = (problem_size.W + problem_size.stride_w - 1) / problem_size.stride_w;
-  int rows_nhw_per_filter = problem_size.N * rows_h_per_filter * rows_w_per_filter;
-
-  // Number of CTAs tile M to cover valid MMAs per starting filter postion
-  int tile_m_per_filter = (rows_nhw_per_filter + tile_size_m - 1) / tile_size_m;
-
-  return tile_m_per_filter;
-}
-
-// Computes starting Dx coord (h, w) for given starting filter postion
-CUTLASS_HOST_DEVICE
-void strided_dgrad_starting_coords(
-  Conv2dProblemSize const &problem_size,
-  FastDivmod const &stride_h_divmod, FastDivmod const &stride_w_divmod,
-  int r, int s,
-  int &start_h, int &start_w) {
-
-  // function locals for remainder by fast divmod
-  int pad_h_rem_, pad_w_rem_;
-
-  // start_h  = std::abs(problem_size.stride_h - ((problem_size.pad_h % problem_size.stride_h) - r)) % problem_size.stride_h;
-  stride_h_divmod.divmod(pad_h_rem_, problem_size.pad_h);
-  int r_ = absolute_value(problem_size.stride_h - (pad_h_rem_ - r));
-  stride_h_divmod.divmod(start_h, r_);
-
-  //start_w  = std::abs(problem_size.stride_w - ((problem_size.pad_w % problem_size.stride_w) - s)) % problem_size.stride_w;
-  stride_w_divmod.divmod(pad_w_rem_, problem_size.pad_w);
-  int s_ = absolute_value(problem_size.stride_w - (pad_w_rem_ - s));
-  stride_w_divmod.divmod(start_w, s_);
-}
-
-} // namespace conv
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/conv3d_problem_size.h b/lightllm-kernel/cutlass/include/cutlass/conv/conv3d_problem_size.h
deleted file mode 100755
index 9a9514f2d..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/conv3d_problem_size.h
+++ /dev/null
@@ -1,513 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief This file contains definitions and utility functions for describing convolution problem sizes.
-
-  Conv3dProblem desciption:
-    activation (NDHWC), 
-    filter (KTRSC), 
-    output (NZPQK), 
-    pading (pad_d, pad_h, pad_w), 
-    stride (stride_d, stride_h, stride_w), 
-    dilation (dilation_d, dilation_h, dilation_w).
-  
-  Free functions to map:
-    Map tensor extents (Conv3d -> ImplicitGemm)      : implicit_gemm_tensor_[a|b|c]_extent(ConvolutionOperator)
-    Map tensor sizes (Conv3d -> ImplicitGemm)        : implicit_gemm_tensor_[a|b|c]_size(ConvolutionOperator)
-    Map tensor problem sizes (Conv3d -> ImplicitGemm): implicit_gemm_problem_size(ConvolutionOperator)  
-*/
-
-#pragma once
-
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-
-namespace cutlass {
-namespace conv {
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Problem size structure
-struct Conv3dProblemSize : public Conv2dProblemSize {
-  //
-  // Type definitions
-  //
-
-  // 3D coordinate for padding, stride, and dilation in (d, h, w) dimensions
-  using Coord3D = Coord<3>;
-
-  //
-  // Data members
-  //
-
-  // Conv3d strictly problem size parameters
-  int D, T, Z;    // input depth, filter depth, output depth
-  int pad_d;      // padding in depth dimension
-  int stride_d;   // stride in depth dimension
-  int dilation_d; // dilation in depth dimension
-
-  //
-  // Methods
-  //
-public:
-  CUTLASS_HOST_DEVICE
-  Conv3dProblemSize(): 
-    Conv2dProblemSize(),
-    D(0), T(0), Z(0), 
-    pad_d(0),
-    stride_d(1), 
-    dilation_d(1) { }
- 
-  /// Constructor for default padding, stride, dilation, and split-K
-  CUTLASS_HOST_DEVICE
-  Conv3dProblemSize(
-    int N,
-    int D,
-    int H,
-    int W,
-    int C,
-    int Z,
-    int P,
-    int Q,
-    int K,
-    int T,
-    int R,
-    int S,
-    Mode mode
-  ):
-    Conv2dProblemSize(N, H, W, C, P, Q, K, R, S, mode),
-    D(D), T(T), Z(Z), 
-    pad_d(T / 2), stride_d(1), dilation_d(1) { }
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  Conv3dProblemSize(
-    int N,
-    int D,
-    int H,
-    int W,
-    int C,
-    int K,
-    int T,
-    int R,
-    int S,
-    int Z,
-    int P,
-    int Q,
-    int pad_d,
-    int pad_h,
-    int pad_w,
-    int stride_d,
-    int stride_h,
-    int stride_w,
-    int dilation_d,
-    int dilation_h,
-    int dilation_w,
-    Mode mode,
-    int split_k_slices = 1,
-    int groups = 1
-  ):
-    Conv2dProblemSize(
-    N, H, W, C, K, R, S, P, Q, 
-    pad_h, pad_w, 
-    stride_h, stride_w, 
-    dilation_h, dilation_w,
-    mode, split_k_slices, groups),
-    D(D), T(T), Z(Z), 
-    pad_d(pad_d), stride_d(stride_d), dilation_d(dilation_d) { }
-
-  /// Constructs convolution problem size from cutlass Tensor5DCoord and Coord3D 
-  // set *user-defined* output size and sets Z, P, and Q (include all data members in ctor)
-  CUTLASS_HOST_DEVICE
-  Conv3dProblemSize(
-    cutlass::Tensor5DCoord input_size,    // NDHWC
-    cutlass::Tensor5DCoord filter_size,   // KTRSC
-    Coord3D padding,                      // pad_d, pad_h, pad_w
-    Coord3D stride,                       // stride_d, stride_h, stride_w
-    Coord3D dilation,                     // dilation_d, dilation_h, dilation_w
-    cutlass::Tensor5DCoord output_size,   // NZPQK
-    cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation,
-    int split_k_slices = 1,
-    int groups = 1
-  ):
-    Conv2dProblemSize(
-      {input_size.n(), input_size.h(), input_size.w(), input_size.c()},
-      {filter_size.n(), filter_size.h(), filter_size.w(), filter_size.c()},
-      {padding[1], padding[1], padding[2], padding[2]},
-      {stride[1], stride[2]},
-      {dilation[1], dilation[2]},
-      {output_size.n(), output_size.h(), output_size.w(), output_size.c()},
-      mode, split_k_slices, groups),
-    D(input_size.d()), T(filter_size.d()), Z(output_size.d()),
-    pad_d(padding[0]), stride_d(stride[0]), dilation_d(dilation[0]) { }
-
-  /// Constructs convolution problem size from cutlass Tensor5DCoord and Coord3D 
-  // *computes* output size and sets Z, P and Q (include all data members in ctor)
-  CUTLASS_HOST_DEVICE
-  Conv3dProblemSize(
-    cutlass::Tensor5DCoord input_size,    // NDHWC
-    cutlass::Tensor5DCoord filter_size,   // KTRSC
-    Coord3D padding,                      // pad_d, pad_h, pad_w
-    Coord3D stride,                       // stride_d, stride_h, stride_w
-    Coord3D dilation,                     // dilation_d, dilation_h, dilation_w
-    cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation,
-    int split_k_slices = 1,
-    int groups = 1
-  ):
-    Conv2dProblemSize(
-      {input_size.n(), input_size.h(), input_size.w(), input_size.c()},
-      {filter_size.n(), filter_size.h(), filter_size.w(), filter_size.c()},
-      {padding[1], padding[1], padding[2], padding[2]},
-      {stride[1], stride[2]},
-      {dilation[1], dilation[2]},
-      mode, split_k_slices, groups),
-    D(input_size.d()), T(filter_size.d()),
-    pad_d(padding[0]), stride_d(stride[0]), dilation_d(dilation[0])
-    {
-      // set output Z
-      Z = ((D + pad_d * 2 - T * dilation_d) / stride_d) + 1;
-    }
-
-  /// Constructs convolution problem size from cutlass Tensor5DCoord, Coord3D
-  // *computes* output size and sets Z, P and Q (include all data members in ctor)
-  CUTLASS_HOST_DEVICE
-  Conv3dProblemSize(
-    cutlass::Tensor5DCoord input_size,    // NDHWC
-    cutlass::Tensor5DCoord filter_size,   // KTRSC
-    CUTLASS_STL_NAMESPACE::tuple<Coord3D, Coord3D> padding, // Coord3D {pad_d, pad_h, pad_w} & Coord3D {far pad_d, pad_h, pad_w} to calculate o/p/q
-    Coord3D stride,                       // stride_d, stride_h, stride_w
-    Coord3D dilation,                     // dilation_d, dilation_h, dilation_w
-    cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation,
-    int split_k_slices = 1,
-    int groups = 1
-  ):
-    Conv2dProblemSize(
-      {input_size.n(), input_size.h(), input_size.w(), input_size.c()},
-      {filter_size.n(), filter_size.h(), filter_size.w(), filter_size.c()},
-      {CUTLASS_STL_NAMESPACE::get<0>(padding)[1], CUTLASS_STL_NAMESPACE::get<1>(padding)[1],
-       CUTLASS_STL_NAMESPACE::get<0>(padding)[2], CUTLASS_STL_NAMESPACE::get<1>(padding)[2]},
-      {stride[1], stride[2]},
-      {dilation[1], dilation[2]},
-      mode, split_k_slices, groups),
-    D(input_size.d()), T(filter_size.d()),
-    pad_d(CUTLASS_STL_NAMESPACE::get<0>(padding)[0]), stride_d(stride[0]), dilation_d(dilation[0])
-    {
-      // set output Z
-      Z = ((D + pad_d + CUTLASS_STL_NAMESPACE::get<1>(padding)[0] - T * dilation_d) / stride_d) + 1;
-    }
-
-  /// Equality operator (ignores mode and split_k_slice)
-  CUTLASS_HOST_DEVICE
-  bool operator==(Conv3dProblemSize const &conv) const {
-    return (
-      (N == conv.N) && (D == conv.D) && (H == conv.H) && (W == conv.W) && (C == conv.C) &&
-      (K == conv.K) && (T == conv.T) && (R == conv.R) && (S == conv.S) &&
-      (Z == conv.Z) &&(P == conv.P) && (Q == conv.Q) &&
-      (pad_d == conv.pad_d) && (pad_h == conv.pad_h) && (pad_w == conv.pad_w) &&
-      (stride_d == conv.stride_d) && (stride_h == conv.stride_h) && (stride_w == conv.stride_w) &&
-      (dilation_d == conv.dilation_d) && (dilation_h == conv.dilation_h) && (dilation_w == conv.dilation_w)
-    );  
-  }
-
-  /// Inequality operator
-  CUTLASS_HOST_DEVICE
-  bool operator!=(Conv3dProblemSize const &rhs) const {
-    return !(*this == rhs);
-  }
-
-  // Reset covolution mode in the problem
-  CUTLASS_HOST_DEVICE
-  Conv3dProblemSize reset_mode(cutlass::conv::Mode mode_) {
-    Conv3dProblemSize tmp(*this);
-    tmp.mode = mode_; 
-    return tmp; 
-  }
-
-  // Reset covolution mode in the problem
-  CUTLASS_HOST_DEVICE
-  Conv3dProblemSize reset_split_k_slices(int split_k_slices_) {
-    Conv3dProblemSize tmp(*this);
-    tmp.split_k_slices = split_k_slices_; 
-    return tmp; 
-  }
-  
-  /// Returns activation extent as Tensor5DCoord
-  CUTLASS_HOST_DEVICE
-  cutlass::Tensor5DCoord activation_extent() const {
-
-    return cutlass::Tensor5DCoord ({N, D, H, W, C});
-  }
-
-  /// Returns filter extent as Tensor5DCoord
-  CUTLASS_HOST_DEVICE
-  cutlass::Tensor5DCoord filter_extent(bool is_deconv = false) const {
-
-    return is_deconv ? cutlass::Tensor5DCoord ({C, T, R, S, K})
-        : cutlass::Tensor5DCoord ({K, T, R, S, C});
-  }
-
-  /// Returns output extent as Tensor5DCoord
-  CUTLASS_HOST_DEVICE
-  cutlass::Tensor5DCoord output_extent() const {
-
-    return cutlass::Tensor5DCoord ({N, Z, P, Q, K});
-  }
-
-  /// Returns activation size in number of elements
-  CUTLASS_HOST_DEVICE
-  int64_t activation_size() const {
-
-    return (N * D * H * W * C);
-  }
-
-  /// Returns filter size in number of elements
-  CUTLASS_HOST_DEVICE
-  int64_t filter_size() const {
-
-    return (K * T * R * S * C);
-  }
-
-  /// Returns output size in number of elements
-  CUTLASS_HOST_DEVICE
-  int64_t output_size() const {
-
-    return (N * Z * P * Q * K);
-  }
-
-  /// Returns padding as Coord3D
-  CUTLASS_HOST_DEVICE
-  Coord3D padding() const {
-
-    return Coord3D ({pad_d, pad_h, pad_w});
-  }
-
-  /// Returns stride as MatrixCoord
-  CUTLASS_HOST_DEVICE
-  Coord3D stride() const {
-
-    return Coord3D ({stride_d, stride_h, stride_w});
-  }
-
-  /// Returns dilation as MatrixCoord
-  CUTLASS_HOST_DEVICE
-  Coord3D dilation() const {
-
-    return Coord3D ({dilation_d, dilation_h, dilation_w});
-  }
-
-};
-
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-//                                  ImplicitGemm helper functions                                 //
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Determine the problem size of the implicit GEMM operation
-CUTLASS_HOST_DEVICE
-cutlass::gemm::GemmCoord implicit_gemm_problem_size(
-  Operator conv_operator, 
-  Conv3dProblemSize const &problem_size) {
-  // Compute problem size
-  switch (conv_operator) {
-  case Operator::kFprop:
-    return gemm::GemmCoord(
-      problem_size.N * problem_size.Z * problem_size.P * problem_size.Q,
-      problem_size.K,
-      problem_size.T * problem_size.R * problem_size.S * problem_size.C
-    );
-  case Operator::kDeconv:
-  case Operator::kDgrad:
-    return gemm::GemmCoord(
-      problem_size.N * problem_size.D * problem_size.H * problem_size.W,
-      problem_size.C,
-      problem_size.T * problem_size.R * problem_size.S * problem_size.K
-    );
-  case Operator::kWgrad:
-    return gemm::GemmCoord(
-      problem_size.K,
-      problem_size.T * problem_size.R * problem_size.S * problem_size.C,
-      problem_size.N * problem_size.Z * problem_size.P * problem_size.Q
-    );
-  default:
-    break;
-  }
-  return gemm::GemmCoord();
-}
-
-// Determine the number of gemm_k iterations for conv2d problem using implicit gemm algorithm
-CUTLASS_HOST_DEVICE
-int implicit_gemm_k_iterations(
-  Operator conv_operator, 
-  int threadblock_K, 
-  Conv3dProblemSize const &problem_size,
-  IteratorAlgorithm algorithm = IteratorAlgorithm::kAnalytic,
-  GroupMode group_mode = GroupMode::kNone,
-  int threadblock_N = 0) {
-
-  int iterations = 0;
-  int elements_per_split_k_slice = 0;
-  if (group_mode == GroupMode::kNone) {
-    switch (conv_operator) {
-      case Operator::kFprop:
-        elements_per_split_k_slice = (problem_size.C + problem_size.split_k_slices - 1) / problem_size.split_k_slices;
-        iterations = problem_size.T * problem_size.R * problem_size.S * ((elements_per_split_k_slice + threadblock_K - 1) / threadblock_K);
-        break;
-
-      case Operator::kDeconv:
-      case Operator::kDgrad:
-        elements_per_split_k_slice =  (problem_size.K + problem_size.split_k_slices - 1) / problem_size.split_k_slices;
-        iterations = problem_size.T * problem_size.R * problem_size.S * ((elements_per_split_k_slice + threadblock_K - 1) / threadblock_K);
-        break;
-    
-      case Operator::kWgrad:
-        elements_per_split_k_slice = (problem_size.N * problem_size.Z * problem_size.P * problem_size.Q + problem_size.split_k_slices - 1) / problem_size.split_k_slices;
-        iterations = (elements_per_split_k_slice + threadblock_K - 1) / threadblock_K;
-        break;
-    
-      default:
-        break;
-    }
-  } else if (group_mode == GroupMode::kDepthwise) {
-    int channels_per_cta = threadblock_N;
-
-    if (algorithm == IteratorAlgorithm::kAnalytic) {
-      switch (conv_operator) {
-        case Operator::kFprop:
-          iterations = problem_size.T * problem_size.R * problem_size.S *
-                       ((channels_per_cta + threadblock_K - 1) / threadblock_K);
-          break;
-
-        default:
-          break;
-      }
-    }
-  }
-
-  return iterations;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-//  Mapping function (ImplicitGemm A, B, C -> Conv Activation, Filter, Output)
-////////////////////////////////////////////////////////////////////////////////
-/// Returns ImplicitGemm tensor A extent as Tensor5DCoord
-CUTLASS_HOST_DEVICE
-cutlass::Tensor5DCoord implicit_gemm_tensor_a_extent(
-  Operator conv_operator,
-  Conv3dProblemSize const &problem_size) {
-  switch (conv_operator) {
-    case cutlass::conv::Operator::kFprop: return problem_size.activation_extent();
-    case cutlass::conv::Operator::kDeconv:
-    case cutlass::conv::Operator::kDgrad: return problem_size.output_extent();
-    case cutlass::conv::Operator::kWgrad: return problem_size.output_extent();
-    default : break;
-  }
-  return cutlass::Tensor5DCoord();
-}
-
-/// Returns ImplicitGemm tensor B extent as Tensor5DCoord
-CUTLASS_HOST_DEVICE
-cutlass::Tensor5DCoord implicit_gemm_tensor_b_extent(
-  Operator conv_operator,
-  Conv3dProblemSize const &problem_size) {
-  switch (conv_operator) {
-    case cutlass::conv::Operator::kFprop: return problem_size.filter_extent();
-    case cutlass::conv::Operator::kDeconv: return problem_size.filter_extent(true);
-    case cutlass::conv::Operator::kDgrad: return problem_size.filter_extent();
-    case cutlass::conv::Operator::kWgrad: return problem_size.activation_extent();
-    default : break;
-  }
-  return cutlass::Tensor5DCoord();
-}
-
-/// Returns ImplicitGemm tensor C extent as Tensor5DCoord
-CUTLASS_HOST_DEVICE
-cutlass::Tensor5DCoord implicit_gemm_tensor_c_extent(
-  Operator conv_operator,
-  Conv3dProblemSize const &problem_size) {
-  switch (conv_operator) {
-    case cutlass::conv::Operator::kFprop: return problem_size.output_extent();
-    case cutlass::conv::Operator::kDeconv:
-    case cutlass::conv::Operator::kDgrad: return problem_size.activation_extent();
-    case cutlass::conv::Operator::kWgrad: return problem_size.filter_extent();
-    default : break;
-  }
-  return cutlass::Tensor5DCoord();
-}
-
-/// Returns ImplicitGemm tensor A size in number of elements
-CUTLASS_HOST_DEVICE
-int64_t implicit_gemm_tensor_a_size(
-  Operator conv_operator,
-  Conv3dProblemSize const &problem_size) {
-  switch (conv_operator) {
-    case cutlass::conv::Operator::kFprop: return problem_size.activation_size();
-    case cutlass::conv::Operator::kDeconv:
-    case cutlass::conv::Operator::kDgrad: return problem_size.output_size();
-    case cutlass::conv::Operator::kWgrad: return problem_size.output_size();
-    default : break;
-  }
-  return 0;
-}
-
-/// Returns ImplicitGemm tensor B size in number of elements
-CUTLASS_HOST_DEVICE
-int64_t implicit_gemm_tensor_b_size(
-  Operator conv_operator,
-  Conv3dProblemSize const &problem_size) {
-  switch (conv_operator) {
-    case cutlass::conv::Operator::kFprop: return problem_size.filter_size();
-    case cutlass::conv::Operator::kDeconv:
-    case cutlass::conv::Operator::kDgrad: return problem_size.filter_size();
-    case cutlass::conv::Operator::kWgrad: return problem_size.activation_size();
-    default : break;
-  }
-  return 0;
-}
-
-/// Returns ImplicitGemm tensor C size in number of elements
-CUTLASS_HOST_DEVICE
-int64_t implicit_gemm_tensor_c_size(
-  Operator conv_operator,
-  Conv3dProblemSize const &problem_size) {
-  switch (conv_operator) {
-    case cutlass::conv::Operator::kFprop: return problem_size.output_size();
-    case cutlass::conv::Operator::kDeconv:
-    case cutlass::conv::Operator::kDgrad: return problem_size.activation_size();
-    case cutlass::conv::Operator::kWgrad: return problem_size.filter_size();
-    default : break;
-  }
-  return 0;
-}
-
-} // namespace conv
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/convnd_problem_shape.hpp b/lightllm-kernel/cutlass/include/cutlass/conv/convnd_problem_shape.hpp
deleted file mode 100755
index ffcc547fb..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/convnd_problem_shape.hpp
+++ /dev/null
@@ -1,561 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief This file contains definitions and utility functions for describing convolution problem shapes.
-*/
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/tensor_coord.h"
-#include "cutlass/conv/convolution.h"
-
-#include "cute/container/array.hpp"
-
-#if ! defined(__CUDACC_RTC__)
-#include <initializer_list>
-#endif
-
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::conv {
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Implements the user facing argument for all CUTLASS 3.x convolutions in a rank agnostic fashion.
-// All tensors are flat and by default treated as layout right (NDHWC, KTRSC, NZPQK)
-// Supports asymmetric padding, traversal strides, dilations, and all conv algorithm types.
-template <
-  conv::Operator ConvOp_,
-  int NumSpatialDimensions_
->
-struct ConvProblemShape {
-  //
-  // Alias types for members
-  //
-
-  static constexpr int RankS = NumSpatialDimensions_;
-  static constexpr int RankT = NumSpatialDimensions_ + 2;
-  static constexpr conv::Operator ConvOp = ConvOp_;
-  static constexpr int NumSpatialDimensions = NumSpatialDimensions_;
-  using SpatialExtent = cute::array<int, RankS>;
-  using TensorExtent  = cute::array<int, RankT>;
-  using TensorStride  = cute::array<int64_t, RankT>;
-  using ShapePadding = SpatialExtent;
-  using TraversalStride = SpatialExtent;
-  using ShapeDilation = SpatialExtent;
-  using Corner = SpatialExtent;
-
-  //
-  // Members
-  //
-  cutlass::conv::Mode mode{};
-  TensorExtent shape_A{};
-  TensorStride stride_A{};
-  TensorExtent shape_B{};
-  TensorStride stride_B{};
-  TensorExtent shape_C{};
-  TensorStride stride_C{};
-
-  // asymmetric padding, both upper and lower padding must be >= 0
-  ShapePadding lower_padding{};
-  ShapePadding upper_padding{};
-  TraversalStride traversal_stride{};
-  ShapeDilation dilation{};
-  int groups = 1;
-
-  //
-  // Methods
-  //
-
-  ConvProblemShape() = default;
-
-  // Constructor accepts user facing arguments and computes to stores the corners as its internal state
-  ConvProblemShape(
-      conv::Mode mode,                                                     // convolution/cross-correlation
-      TensorExtent shape_act,                                              // [n,d,h,w,c]
-      TensorStride stride_act,                                             // [n,d,h,w,c]
-      TensorExtent shape_flt,                                              // [k,t,r,s,c]
-      TensorStride stride_flt,                                             // [k,t,r,s,c]
-      ShapePadding lower_padding,                                          // [pad_d, pad_h, pad_w]
-      ShapePadding upper_padding,                                          // [pad_d, pad_h, pad_w]
-      TraversalStride tstride,                                             // [stride_d, stride_h, stride_w]
-      ShapeDilation dilation,                                              // [dilation_d, dilation_h, dilation_w]
-      int groups)
-      : mode(mode)
-      , lower_padding(lower_padding)
-      , upper_padding(upper_padding)
-      , traversal_stride(tstride)
-      , dilation(dilation)
-      , groups(groups) {
-
-    auto [shape_xformed_act, stride_xformed_act] = calculate_xformed_act(shape_act, shape_flt);
-    set_shape_stride_ABC(shape_act, stride_act, shape_flt, stride_flt, shape_xformed_act, stride_xformed_act);
-  }
-
-  // Allow user input of xformed activation stride to support non-packed strides.
-  ConvProblemShape(
-      conv::Mode mode,                                                     // convolution/cross-correlation
-      TensorExtent shape_act,                                              // [n,d,h,w,c]
-      TensorStride stride_act,                                             // [n,d,h,w,c]
-      TensorExtent shape_flt,                                              // [k,t,r,s,c]
-      TensorStride stride_flt,                                             // [k,t,r,s,c]
-      TensorStride stride_xformed_act,                                     // [n,z,p,q,k]
-      ShapePadding lower_padding,                                          // [pad_d, pad_h, pad_w]
-      ShapePadding upper_padding,                                          // [pad_d, pad_h, pad_w]
-      TraversalStride tstride,                                             // [stride_d, stride_h, stride_w]
-      ShapeDilation dilation,                                              // [dilation_d, dilation_h, dilation_w]
-      int groups)
-      : mode(mode)
-      , lower_padding(lower_padding)
-      , upper_padding(upper_padding)
-      , traversal_stride(tstride)
-      , dilation(dilation)
-      , groups(groups) {
-
-    CUTLASS_ASSERT(stride_act[RankT - 1] == 1);
-    CUTLASS_ASSERT(stride_flt[RankT - 1] == 1);
-    CUTLASS_ASSERT(stride_xformed_act[RankT - 1] == 1);
-
-    auto stride_act_packed = packed_stride_right_major(shape_act);
-    auto stride_flt_packed = packed_stride_right_major(shape_flt);
-    auto [shape_xformed_act, stride_xformed_act_packed] = calculate_xformed_act(shape_act, shape_flt);
-
-    CUTLASS_PRAGMA_UNROLL
-    for(int i = 0; i < RankT - 1; ++i) {
-      CUTLASS_ASSERT(stride_act[i] >= stride_act_packed[i]);
-      CUTLASS_ASSERT(stride_flt[i] >= stride_flt_packed[i]);
-      CUTLASS_ASSERT(stride_xformed_act[i] >= stride_xformed_act_packed[i]);
-    }
-
-    set_shape_stride_ABC(shape_act, stride_act, shape_flt, stride_flt, shape_xformed_act, stride_xformed_act);
-  }
-
-  // Constructor accepts user facing arguments and presume packed tensor strides in canonical (CWHDN) order.
-  ConvProblemShape(
-      conv::Mode mode,
-      TensorExtent shape_act,
-      TensorExtent shape_flt,
-      ShapePadding lower_padding,
-      ShapePadding upper_padding,
-      TraversalStride tstride,
-      ShapeDilation dilation,
-      int groups)
-      : ConvProblemShape(
-        mode,
-        shape_act,
-        packed_stride_right_major(shape_act),
-        shape_flt,
-        packed_stride_right_major(shape_flt),
-        lower_padding,
-        upper_padding,
-        tstride,
-        dilation,
-        groups) {
-    }
-
-#if ! defined(__CUDACC_RTC__)
-  // Constructor accepts user facing arguments and computes to stores the corners as its internal state
-  ConvProblemShape(
-      conv::Mode                     mode,
-      std::initializer_list<int>     shape_act_,
-      std::initializer_list<int64_t> stride_act_,
-      std::initializer_list<int>     shape_flt_,
-      std::initializer_list<int64_t> stride_flt_,
-      std::initializer_list<int>     lower_padding_,
-      std::initializer_list<int>     upper_padding_,
-      std::initializer_list<int>     traversal_stride_,
-      std::initializer_list<int>     dilation_,
-      int groups)
-      : mode(mode)
-      , groups(groups) {
-
-    TensorExtent shape_act{};
-    TensorStride stride_act{};
-    TensorExtent shape_flt{};
-    TensorStride stride_flt{};
-
-    assert(shape_act_.size() == shape_act.size());
-    assert(stride_act_.size() == stride_act.size());
-    assert(shape_flt_.size() == shape_flt.size());
-    assert(stride_flt_.size() == stride_flt.size());
-    assert(lower_padding_.size() == lower_padding.size());
-    assert(upper_padding_.size() == upper_padding.size());
-    assert(traversal_stride_.size() == traversal_stride.size());
-    assert(dilation_.size() == dilation.size());
-
-    std::copy(shape_act_.begin(), shape_act_.end(), shape_act.begin());
-    std::copy(stride_act_.begin(), stride_act_.end(), stride_act.begin());
-    std::copy(shape_flt_.begin(), shape_flt_.end(), shape_flt.begin());
-    std::copy(stride_flt_.begin(), stride_flt_.end(), stride_flt.begin());
-    std::copy(lower_padding_.begin(), lower_padding_.end(), lower_padding.begin());
-    std::copy(upper_padding_.begin(), upper_padding_.end(), upper_padding.begin());
-    std::copy(traversal_stride_.begin(), traversal_stride_.end(), traversal_stride.begin());
-    std::copy(dilation_.begin(), dilation_.end(), dilation.begin());
-
-    auto [shape_xformed_act, stride_xformed_act] = calculate_xformed_act(shape_act, shape_flt);
-    set_shape_stride_ABC(shape_act, stride_act, shape_flt, stride_flt, shape_xformed_act, stride_xformed_act);
-  }
-
-  // Allow user input of xformed activation stride to support non-packed strides.
-  ConvProblemShape(
-      conv::Mode                     mode,
-      std::initializer_list<int>     shape_act_,
-      std::initializer_list<int64_t> stride_act_,
-      std::initializer_list<int>     shape_flt_,
-      std::initializer_list<int64_t> stride_flt_,
-      std::initializer_list<int64_t> stride_xformed_act_,
-      std::initializer_list<int>     lower_padding_,
-      std::initializer_list<int>     upper_padding_,
-      std::initializer_list<int>     traversal_stride_,
-      std::initializer_list<int>     dilation_,
-      int groups)
-      : mode(mode)
-      , groups(groups) {
-    TensorExtent shape_act{};
-    TensorStride stride_act{};
-    TensorExtent shape_flt{};
-    TensorStride stride_flt{};
-    TensorStride stride_xformed_act{};
-
-    std::copy(shape_act_.begin(), shape_act_.end(), shape_act.begin());
-    std::copy(stride_act_.begin(), stride_act_.end(), stride_act.begin());
-    std::copy(shape_flt_.begin(), shape_flt_.end(), shape_flt.begin());
-    std::copy(stride_flt_.begin(), stride_flt_.end(), stride_flt.begin());
-    std::copy(stride_xformed_act_.begin(), stride_xformed_act_.end(), stride_xformed_act.begin());
-    std::copy(lower_padding_.begin(), lower_padding_.end(), lower_padding.begin());
-    std::copy(upper_padding_.begin(), upper_padding_.end(), upper_padding.begin());
-    std::copy(traversal_stride_.begin(), traversal_stride_.end(), traversal_stride.begin());
-    std::copy(dilation_.begin(), dilation_.end(), dilation.begin());
-
-    CUTLASS_ASSERT(stride_act[RankT - 1] == 1);
-    CUTLASS_ASSERT(stride_flt[RankT - 1] == 1);
-    CUTLASS_ASSERT(stride_xformed_act[RankT - 1] == 1);
-
-    auto stride_act_packed = packed_stride_right_major(shape_act);
-    auto stride_flt_packed = packed_stride_right_major(shape_flt);
-    auto [shape_xformed_act, stride_xformed_act_packed] = calculate_xformed_act(shape_act, shape_flt);
-
-    CUTLASS_PRAGMA_UNROLL
-    for(int i = 0; i < RankT - 1; ++i) {
-      CUTLASS_ASSERT(stride_act[i] >= stride_act_packed[i]);
-      CUTLASS_ASSERT(stride_flt[i] >= stride_flt_packed[i]);
-      CUTLASS_ASSERT(stride_xformed_act[i] >= stride_xformed_act_packed[i]);
-    }
-
-    set_shape_stride_ABC(shape_act, stride_act, shape_flt, stride_flt, shape_xformed_act, stride_xformed_act);
-  }
-
-  // Constructor accepts user facing arguments and computes to stores the corners as its internal state
-  ConvProblemShape(
-      conv::Mode                     mode,
-      std::initializer_list<int>     shape_act_,
-      std::initializer_list<int>     shape_flt_,
-      std::initializer_list<int>     lower_padding_,
-      std::initializer_list<int>     upper_padding_,
-      std::initializer_list<int>     traversal_stride_,
-      std::initializer_list<int>     dilation_,
-      int groups)
-      : mode(mode)
-      , groups(groups) {
-    TensorExtent shape_act{};
-    TensorStride stride_act{};
-    TensorExtent shape_flt{};
-    TensorStride stride_flt{};
-
-    assert(shape_act_.size() == shape_act.size());
-    assert(shape_flt_.size() == shape_flt.size());
-    assert(lower_padding_.size() == lower_padding.size());
-    assert(upper_padding_.size() == upper_padding.size());
-    assert(traversal_stride_.size() == traversal_stride.size());
-    assert(dilation_.size() == dilation.size());
-
-    std::copy(shape_act_.begin(), shape_act_.end(), shape_act.begin());
-    std::copy(shape_flt_.begin(), shape_flt_.end(), shape_flt.begin());
-    std::copy(lower_padding_.begin(), lower_padding_.end(), lower_padding.begin());
-    std::copy(upper_padding_.begin(), upper_padding_.end(), upper_padding.begin());
-    std::copy(traversal_stride_.begin(), traversal_stride_.end(), traversal_stride.begin());
-    std::copy(dilation_.begin(), dilation_.end(), dilation.begin());
-    stride_act = packed_stride_right_major(shape_act);
-    stride_flt = packed_stride_right_major(shape_flt);
-
-    auto [shape_xformed_act, stride_xformed_act] = calculate_xformed_act(shape_act, shape_flt);
-    set_shape_stride_ABC(shape_act, stride_act, shape_flt, stride_flt, shape_xformed_act, stride_xformed_act);
-  }
-#endif // not defined(__CUDACC_RTC__)
-
-  // Set shape and stride of tensor A/B/C according to following table:
-  // |              | Fprop  | Dgrad  | Wgrad |
-  // | ------       | ------ | ------ | ------|
-  // |   ShapeA     | NDHWC  | NZPQK  | NZPQK |
-  // |   ShapeB     | KTRSC  | KTRSC  | NDHWC |
-  // |   ShapeC     | NZPQK  | NDHWC  | KTRSC |
-  //
-  CUTLASS_HOST_DEVICE
-  constexpr void
-  set_shape_stride_ABC(
-    TensorExtent shape_act,
-    TensorStride stride_act,
-    TensorExtent shape_flt,
-    TensorStride stride_flt,
-    TensorExtent shape_xformed_act,
-    TensorStride stride_xformed_act) {
-
-    if constexpr (ConvOp == cutlass::conv::Operator::kFprop) {
-      shape_A = shape_act;
-      stride_A = stride_act;
-      shape_B = shape_flt;
-      stride_B = stride_flt;
-      shape_C = shape_xformed_act;
-      stride_C = stride_xformed_act;
-    }
-    else if constexpr (ConvOp == cutlass::conv::Operator::kDgrad) {
-      shape_A = shape_xformed_act;
-      stride_A = stride_xformed_act;
-      shape_B = shape_flt;
-      stride_B = stride_flt;
-      shape_C = shape_act;
-      stride_C = stride_act;
-    }
-    else if constexpr (ConvOp == cutlass::conv::Operator::kWgrad) {
-      shape_A = shape_xformed_act;
-      stride_A = stride_xformed_act;
-      shape_B = shape_act;
-      stride_B = stride_act;
-      shape_C = shape_flt;
-      stride_C = stride_flt;
-    }
-  }
-
-  // Get A extents.
-  // fprop: A extents array contains [N,D,H,W,C]. Turn that into ((W,H,D,N), (C))
-  // dgrad: A extents array contains [N,Z,P,Q,K]. Turn that into ((Q,P,Z,N), (K))
-  // wgrad: A extents array contains [N,Z,P,Q,K]. Turn that into ((K), (Q,P,Z,N))
-  CUTLASS_HOST_DEVICE
-  constexpr auto
-  get_shape_A() const {
-    using cute::make_shape;
-    using cute::take;
-
-    if constexpr (ConvOp == conv::Operator::kFprop ||
-                  ConvOp == conv::Operator::kDgrad) {
-      return make_shape(
-        cute::reverse(take<0, RankT - 1>(shape_A)),
-        shape_A[RankT - 1]);
-    }
-    // For wgrad kernel, we need to linearize NZPQ for tensor A
-    else if constexpr (ConvOp == conv::Operator::kWgrad) {
-      return make_shape(
-        shape_A[RankT - 1],
-        cute::product(take<0, RankT - 1>(shape_A)));
-    }
-  }
-
-  // Get B extents.
-  // fprop: B extents array contains [K,T,R,S,C]. Turn that into ((K), (C,S,R,T))
-  // dgrad: B extents array contains [K,T,R,S,C]. Turn that into ((C), (K,S,R,T))
-  // wgrad: B extents array contains [N,D,H,W,C]. Turn that into ((C), (W,H,D,N))
-  CUTLASS_HOST_DEVICE
-  constexpr auto
-  get_shape_B() const {
-    using cute::make_shape;
-    using cute::reverse;
-    using cute::take;
-
-    if constexpr (ConvOp == conv::Operator::kFprop) {
-      return make_shape(
-        shape_B[0],
-        reverse(take<1, RankT>(shape_B)));
-    }
-    else if constexpr (ConvOp == conv::Operator::kWgrad) {
-      return make_shape(
-        shape_B[RankT - 1],
-        reverse(take<0, RankT - 1>(shape_B)));
-    }
-    else if constexpr (ConvOp == conv::Operator::kDgrad) {
-      // shape_B: [K,T,R,S,C], return: [(C),(K,S,R,T)]
-      return make_shape(
-        shape_B[RankT - 1],
-        cute::insert<0>(
-          reverse(take<1, RankT - 1>(shape_B)),
-          shape_B[0]));
-    }
-  }
-
-  // Get C extents.
-  // fprop: C extents array contains [N,Z,P,Q,K]. Turn that into ((Q,P,Z,N), (K))
-  // dgrad: C extents array contains [N,D,H,W,C]. Turn that into ((W,H,D,N), (C))
-  // wgrad: C extents array contains [K,T,R,S,C]. Turn that into ((K), (C,S,R,T))
-  CUTLASS_HOST_DEVICE
-  constexpr auto
-  get_shape_C() const {
-    using cute::make_shape;
-    using cute::reverse;
-    using cute::take;
-
-    if constexpr (ConvOp == conv::Operator::kFprop ||
-                  ConvOp == conv::Operator::kDgrad) {
-      return make_shape(
-        reverse(take<0, RankT - 1>(shape_C)),
-        shape_C[RankT - 1]);
-    }
-    else if constexpr (ConvOp == conv::Operator::kWgrad) {
-      return make_shape(
-        shape_C[0],
-        reverse(take<1, RankT>(shape_C)));
-    }
-  }
-
-  // Static method that returns the canonical strides of tensors (layouts are right major and compact)
-  CUTLASS_HOST_DEVICE
-  static constexpr TensorStride
-  packed_stride_right_major(TensorExtent const& extents) {
-    TensorStride strides{};
-    strides[RankT-1] = 1;
-    cute::for_each(cute::make_rseq<RankT-1>{}, [&](auto i) {
-      strides[i] = extents[i+1] * strides[i+1];
-    });
-    return strides;
-  }
-
-  // Static method that returns the packed logical size of any TensorExtent
-  CUTLASS_HOST_DEVICE
-  static constexpr size_t
-  size(TensorExtent const& extents) {
-    size_t size = 1;
-    cute::for_each(cute::make_seq<RankT>{}, [&](auto i) {
-      size *= extents[i];
-    });
-    return size;
-  }
-
-  CUTLASS_HOST_DEVICE
-  constexpr size_t
-  size_A() const {
-    return shape_A[0] * stride_A[0];
-  }
-
-  CUTLASS_HOST_DEVICE
-  constexpr size_t
-  size_B() const {
-    return shape_B[0] * stride_B[0];
-  }
-
-  CUTLASS_HOST_DEVICE
-  constexpr size_t
-  size_C() const {
-    return shape_C[0] * stride_C[0];
-  }
-
-  // Equality operator
-  CUTLASS_HOST_DEVICE
-  bool operator==(ConvProblemShape<ConvOp, NumSpatialDimensions> const& rhs) const {
-    using cute::for_each;
-    using cute::make_seq;
-
-    bool is_equal = true;
-
-    // Compare all tensor extents
-    for_each(make_seq<RankT>{}, [&](auto i) {
-      is_equal = is_equal
-          && (shape_A[i] == rhs.shape_A[i])
-          && (shape_B[i] == rhs.shape_B[i]);
-    });
-
-    // Compare all spatial extents
-    for_each(make_seq<RankS>{}, [&](auto i) {
-      is_equal = is_equal
-          && (lower_padding[i] == rhs.lower_padding[i])
-          && (upper_padding[i] == rhs.upper_padding[i])
-          && (traversal_stride[i] == rhs.traversal_stride[i])
-          && (dilation[i] == rhs.dilation[i]);
-    });
-
-    return is_equal;
-  }
-
-  /// Inequality operator
-  CUTLASS_HOST_DEVICE
-  bool operator!=(ConvProblemShape<ConvOp, NumSpatialDimensions> const &rhs) const {
-    return !(*this == rhs);
-  }
-
-private:
-  CUTLASS_HOST_DEVICE
-  constexpr auto
-  calculate_xformed_act(TensorExtent shape_act, TensorExtent shape_flt) {
-    TensorExtent shape_xformed_act{};
-    // calculate n,z,p,q,k.
-    // a helper lambda to compute a single spatial extent of the nzpqk tensor
-    auto nzpqk_extent = [](int act_ext, int filter_ext, int pad_total, int dilation, int tstride) {
-      return 1 + (act_ext + pad_total - ((filter_ext -1) * dilation + 1)) / tstride;
-    };
-
-    shape_xformed_act[0] = shape_act[0]; // Activation N extent
-    cute::for_each(cute::make_seq<RankS>{}, [&](auto i) {
-      shape_xformed_act[i+1] = nzpqk_extent(
-          shape_act[i+1], shape_flt[i+1], upper_padding[i] + lower_padding[i], dilation[i], traversal_stride[i]);
-      });
-    shape_xformed_act[RankT-1] = shape_flt[0]; // Filter K extent
-
-    TensorStride stride_xformed_act = packed_stride_right_major(shape_xformed_act);
-
-    return cute::make_tuple(shape_xformed_act, stride_xformed_act);
-  }
-};
-
-template<
-  conv::Operator ConvOp,
-  int SpatialDim
->
-void print(ConvProblemShape<ConvOp, SpatialDim> const& problem) {
-  printf("ConvProblemShape with %d spatial dimensions implementing cutlass::conv::Operator::%d\n",
-      SpatialDim, int(ConvOp));
-  printf("\tTensorA: ");
-      cute::print(problem.shape_A); printf(":");
-      cute::print(problem.stride_A); printf("\n");
-  printf("\tTensorB: ");
-      cute::print(problem.shape_B); printf(":");
-      cute::print(problem.stride_B); printf("\n");
-  printf("\tTensorC: ");
-      cute::print(problem.shape_C); printf(":");
-      cute::print(problem.stride_C); printf("\n");
-  printf("\tLower padding:     "); print(problem.lower_padding);       printf("\n");
-  printf("\tUpper padding:     "); print(problem.upper_padding);       printf("\n");
-  printf("\tTraversal strides: "); print(problem.traversal_stride);    printf("\n");
-  printf("\tDilation:          "); print(problem.dilation);            printf("\n");
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::conv
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/convolution.h b/lightllm-kernel/cutlass/include/cutlass/conv/convolution.h
deleted file mode 100755
index 243ee269d..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/convolution.h
+++ /dev/null
@@ -1,194 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief
-
-This file contains definitions and utility functions for describing convolution problem sizes in terms of
-activation (NHWC), filter (KRSC), output (NPQK), padding (pad_h, pad_w), stride (stride_h, stride_w), and
-dilation (dilation_h, dilation_w).  Furthermore, it defines helper functions to map CUTLASS's implicit gemm
-tensor extents, sizes, and data types to that of the convolution's extents, sizes, and data types.
-
-                        * Mapping convolutions to Gemm computation *
-
-Cutlass implements convolutions with the Implicit Gemm algorithm.  This algorithm performs a gemm
-(general matrix-matrix multiply) on the convolution tensors Activation, Filter, and Output.
-The underlying gemm operation follows the standard gemm definition:
-
-                                     C = A * B + C
-
-                               A and B are input matrices
-                            C is source and output matrix
-
-
-For the three convolutional operators (Fprop, Dgrad, Wgrad), ImplicitGemm matrices A, B, and C are mapped
-to convolution tensors Activation, Filter and Output as described in the table below.
-
-        ___________________________________________________________________________
-         ConvolutionalOperator |        A        |      B         |       C
-        ___________________________________________________________________________
-        |                      |                 |                |               |
-        |       Fprop          |    Activation   |    Filter      |     Output    |
-        |       Dgrad          |     Output      |    Filter      |   Activation  |
-        |       Wgrad          |     Output      |  Activation    |     Filter    |
-        ___________________________________________________________________________
-
-In convolution codebase, DO NOT mix using (A, B, C) with (Activation, Filter, Output).
-
-For example, it's confusing and error prone to document a convolution class or function
-as operating on "A, B, Output."  Instead, use the mapping functions below,
-and adhere to using either A, B, C or Activation, Filter, Output.
-
-Map elements' data types (ImplicitGemm -> Conv): GemmToConvElementMap
-Map elements' data types (Conv -> ImplicitGemm): ConvToGemmElementMap
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/tensor_coord.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/gemm/gemm_enumerated_types.h"
-#include "cutlass/matrix_coord.h"
-
-namespace cutlass {
-namespace conv {
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Convolutional operator
-enum class Operator {
-  kFprop,
-  kDgrad,
-  kWgrad,
-  kDeconv
-};
-
-/// Distinguishes convolution from cross correlation
-enum class Mode {
-  kCrossCorrelation,
-  kConvolution
-};
-
-/// Selects among several implementation variants trading off performance with simplicity
-enum class IteratorAlgorithm {
-  kAnalytic,      ///< functionally correct in all cases but lower performance
-  kOptimized,     ///< optimized for R <= 32, S <= 32 and unity-stride dgrad
-  kFixedChannels, ///< Analytic algorithm optimized for fixed channel count (C == AccessSize)
-  kFewChannels,   ///< Analytic algorithm optimized for few channels (C divisible by AccessSize)
-  kFixedStrideDilation ///< Optimized for fixed stride and dilation
-};
-
-/// Distinguishes among partial specializations that accelerate certain problems where convolution
-/// stride is unit.
-enum class StrideSupport {
-  kStrided,       ///< arbitrary convolution stride
-  kUnity,         ///< unit convolution stride
-  kFixed          ///< fixed convolution stride
-};
-
-/// Identifies split-K mode
-enum class SplitKMode {
-  kNone,
-  kSerial,
-  kParallel
-};
-
-/// Identifies group mode
-enum class GroupMode {
-  kNone,
-  kSingleGroup,   ///< One CTA calculates one group or less
-  kMultipleGroup, ///< One CTA calculates multiple groups
-  kDepthwise      ///< One CTA calculates cta_n groups (problem_size.C == problem_size.K == problem_size.groups)
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Shape of a tensor
-template <
-  int N = 1,
-  int H = 1,
-  int W = 1,
-  int C = 1
->
-struct TensorNHWCShape {
-  static int const kN = N;
-  static int const kH = H;
-  static int const kW = W;
-  static int const kC = C;
-
-  static int const kHW = H * W;
-  static int const kNHW = N * kHW;
-  static int const kNHWC = N * H * W * C;
-
-  static int const kCount = kNHWC;
-
-  //
-  // Static member functions
-  //
-
-  /// Returns a Coord object
-  CUTLASS_HOST_DEVICE
-  static Coord<4> toCoord() {
-    return make_Coord(kN, kH, kW, kC);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Shape of a conv2d stride, which controls how the filter convolves around the input volume
-template <
-  /// Stride in horizontal direction
-  int u = 1,
-  /// Stride in vertical direction
-  int v = 1
->
-struct Stride2D {
-  static int const kU = u;
-  static int const kV = v;
-
-  //
-  // Static member functions
-  //
-
-  /// Returns a Coord object
-  CUTLASS_HOST_DEVICE
-  static Coord<2> toCoord() {
-    return make_Coord(kU, kV);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace conv
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/detail.hpp b/lightllm-kernel/cutlass/include/cutlass/conv/detail.hpp
deleted file mode 100755
index 3e4173569..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/detail.hpp
+++ /dev/null
@@ -1,137 +0,0 @@
-
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/conv/convnd_problem_shape.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::conv::detail {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-  // Helper function to get the problem shape
-template <typename T, class ProblemShape>
-auto get_problem_shape_MNKL_helper(ProblemShape const& problem_shape, cute::true_type) {
-  return T::get_problem_shape_MNKL(problem_shape);
-}
-
-template <typename T, class ProblemShape>
-ProblemShape get_problem_shape_MNKL_helper(ProblemShape const& problem_shape, cute::false_type) {
-  return problem_shape;
-}
-
-// Get problem shape MNKL according to following table:
-// |               |   Fprop   |   Dgrad         |   Wgrad   |
-// |   ----        | --------- | --------        | --------  |
-// |   Shape_M     | (Q,P,Z,N) | (W/V,H/U,D/O,N) | (K)       |
-// |   Shape_N     | (K)       | (C)             | (C,S,R,T) |
-// |   Shape_K     | (C,S,R,T) | (K,S,R,T)       | (Q,P,Z,N) |
-// |   Shape_L     | _1        | (V,U,O)         | _1        |
-
-template <class ProblemShape>
-CUTLASS_HOST_DEVICE
-constexpr auto
-get_transformed_problem_shape_MNKL(ProblemShape const& problem_shape) {
-  return problem_shape;
-}
-
-
-template <conv::Operator ConvOp, int SpatialDim>
-CUTLASS_HOST_DEVICE
-constexpr auto
-get_transformed_problem_shape_MNKL(ConvProblemShape<ConvOp, SpatialDim> const& problem_shape) {
-  using cute::insert;
-  using cute::make_shape;
-  using cute::reverse;
-  using cute::take;
-
-  constexpr int RankT = SpatialDim + 2;
-
-  if constexpr (ConvOp == conv::Operator::kWgrad) {
-    auto M_xformed = problem_shape.shape_C[0];
-    auto N_xformed = reverse(take<1, RankT>(problem_shape.shape_C));
-    auto K_xformed = reverse(take<0, RankT - 1>(problem_shape.shape_A));
-    auto L_xformed = cute::Int<1>{};
-
-    return make_shape(M_xformed, N_xformed, K_xformed, L_xformed);
-  }
-  else if constexpr (ConvOp == conv::Operator::kFprop){
-    auto M_xformed = reverse(take<0, RankT - 1>(problem_shape.shape_C));
-    auto N_xformed = problem_shape.shape_C[RankT - 1];
-    auto K_xformed = reverse(take<1, RankT>(problem_shape.shape_B));
-    auto L_xformed = cute::Int<1>{};
-
-    return make_shape(M_xformed, N_xformed, K_xformed, L_xformed);
-  }
-  else if constexpr (ConvOp == conv::Operator::kDgrad) {
-    auto L_xformed = reverse(problem_shape.traversal_stride); // (V,U,O)
-    auto M_xformed = ceil_div(reverse(take<0,RankT - 1>(problem_shape.shape_C)), L_xformed);
-    auto N_xformed = problem_shape.shape_C[RankT - 1];
-    // shape_B: [K,T,R,S,C], K_xformed: [K,S,R,T]
-    auto K_xformed = insert<0>(
-                (reverse(take<1,RankT - 1>(problem_shape.shape_B))),
-                problem_shape.shape_B[0]);
-
-    return make_shape(M_xformed, N_xformed, K_xformed, L_xformed);
-  }
-}
-
-// Assuming im2col linearization
-// Get problem shape MNKL according to following table:
-// |               |   Fprop   |   Dgrad               |   Wgrad   |
-// |   ----        | --------- | --------              | --------  |
-// |   Shape_M     | (Q*P*Z*N) | ([W/V]*[H/U]*[D/O]*N) | (K)       |
-// |   Shape_N     | (K)       | (C)                   | (C,S,R,T) |
-// |   Shape_K     | (C,S,R,T) | (K,S,R,T)             | (Q*P*Z*N) |
-// |   Shape_L     | _1        | (V*U*O)               | _1        |
-template <conv::Operator ConvOp, int SpatialDim>
-CUTLASS_HOST_DEVICE
-constexpr auto
-get_linearized_problem_shape_MNKL(ConvProblemShape<ConvOp, SpatialDim> const& problem_shape) {
-
-  auto [M, N, K, L] = get_transformed_problem_shape_MNKL(problem_shape);
-
-  if constexpr (ConvOp == conv::Operator::kFprop || ConvOp == conv::Operator::kDgrad) {
-    return cute::make_shape(cute::product(M), N, K, cute::product(L));
-  }
-  else if constexpr (ConvOp == conv::Operator::kWgrad) {
-    return cute::make_shape(M, N, cute::product(K), L);
-  }
-
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::conv::detail
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/device/conv_universal_adapter.hpp b/lightllm-kernel/cutlass/include/cutlass/conv/device/conv_universal_adapter.hpp
deleted file mode 100755
index 193f8d885..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/device/conv_universal_adapter.hpp
+++ /dev/null
@@ -1,421 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-// common
-#include "cutlass/arch/mma.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/arch/mma.h"
-#include "cutlass/trace.h"
-#include "cutlass/cluster_launch.hpp"
-#include "cutlass/device_kernel.h"
-
-#include "cutlass/conv/kernel/conv_universal.hpp"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/detail/layout.hpp"
-#include "cutlass/cuda_host_adapter.hpp"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::conv::device {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/*!
-  ConvUniversalAdapter is a stateful, reusable handle built around a kernel
-  of type cutlass::conv::kernel::ConvUniversal.
-
-  It manages the lifetime of the underlying `kernel::Params` struct, and exposes APIs
-  to create it from the host facing arguments. For power users, static methods
-  are exposed that bypass the stateful methods or args->params lowering.
-*/
-template <class ConvKernel_>
-class ConvUniversalAdapter
-{
-public:
-  using ConvKernel = GetUnderlyingKernel_t<ConvKernel_>;
-  using TileShape = typename ConvKernel::TileShape;
-  using ElementA = typename ConvKernel::ElementA;
-  using ElementB = typename ConvKernel::ElementB;
-  using ElementC = typename ConvKernel::ElementC;
-  using ElementD = typename ConvKernel::ElementD;
-  using ElementAccumulator = typename ConvKernel::TiledMma::ValTypeC;
-  using DispatchPolicy = typename ConvKernel::DispatchPolicy;
-  using CollectiveMainloop = typename ConvKernel::CollectiveMainloop;
-  using CollectiveEpilogue = typename ConvKernel::CollectiveEpilogue;
-
-  static bool const kEnableCudaHostAdapter = CUTLASS_ENABLE_CUDA_HOST_ADAPTER;
-
-  // Tease out meta-information about the conv algorithm
-  static constexpr conv::Operator kConvolutionalOperator = DispatchPolicy::ConvOp;
-  static constexpr int NumSpatialDimensions = CollectiveMainloop::NumSpatialDimensions;
-
-  // If our TiledMMA's instruction thread layout size is larger than 1, we know its a tensorop!
-  using OperatorClass = cute::conditional_t<
-      (cute::size(typename ConvKernel::TiledMma::AtomThrID{}) > 1),
-      cutlass::arch::OpClassTensorOp, cutlass::arch::OpClassSimt>;
-
-  using ArchTag = typename ConvKernel::ArchTag;
-
-  // Assume TiledMma's ShapeMNK is the same as 2.x's ThreadblockShape
-  using ThreadblockShape = cutlass::gemm::GemmShape<
-      cute::size<0>(TileShape{}),
-      cute::size<1>(TileShape{}),
-      cute::size<2>(TileShape{})>;
-
-  using ClusterShape = cutlass::gemm::GemmShape<
-      cute::size<0>(typename ConvKernel::DispatchPolicy::ClusterShape{}),
-      cute::size<1>(typename ConvKernel::DispatchPolicy::ClusterShape{}),
-      cute::size<2>(typename ConvKernel::DispatchPolicy::ClusterShape{})>;
-
-  // Instruction shape is easy too, since we get that directly from our TiledMma's atom shape
-  using InstructionShape = cutlass::gemm::GemmShape<
-      cute::size<0>(typename CollectiveMainloop::TiledMma::AtomShape_MNK{}),
-      cute::size<1>(typename CollectiveMainloop::TiledMma::AtomShape_MNK{}),
-      cute::size<2>(typename CollectiveMainloop::TiledMma::AtomShape_MNK{})>;
-
-  // Legacy: provide a correct warp count, but no reliable warp shape
-  static int const kThreadCount = ConvKernel::MaxThreadsPerBlock;
-
-  // Warp shape is not a primary API type in 3.x
-  // But we can best approximate it by inspecting the TiledMma
-  // For this, we make the assumption that we always have 4 warps along M, and rest along N, none along K
-  // We also always round up the warp count to 4 if the tiled mma is smaller than 128 threads
-  static constexpr int WarpsInMma = cute::max(4, CUTE_STATIC_V(cute::size(typename ConvKernel::TiledMma{})) / 32);
-  static constexpr int WarpsInMmaM = 4;
-  static constexpr int WarpsInMmaN = cute::ceil_div(WarpsInMma, WarpsInMmaM);
-  using WarpCount = cutlass::gemm::GemmShape<WarpsInMmaM, WarpsInMmaN, 1>;
-  using WarpShape = cutlass::gemm::GemmShape<
-      CUTE_STATIC_V(cute::tile_size<0>(typename CollectiveMainloop::TiledMma{})) / WarpsInMmaM,
-      CUTE_STATIC_V(cute::tile_size<1>(typename CollectiveMainloop::TiledMma{})) / WarpsInMmaN,
-      CUTE_STATIC_V(cute::tile_size<2>(typename CollectiveMainloop::TiledMma{}))>;
-
-  static int constexpr kStages = CollectiveMainloop::DispatchPolicy::Stages;
-
-  // Inspect TiledCopy for A and B to compute the alignment size
-  static int constexpr kAlignmentA = cutlass::detail::get_alignment_count_from_gmem_tiled_copy<
-      typename CollectiveMainloop::GmemTiledCopyA, ElementA>();
-  static int constexpr kAlignmentB = cutlass::detail::get_alignment_count_from_gmem_tiled_copy<
-      typename CollectiveMainloop::GmemTiledCopyB, ElementB>();
-  static int constexpr kAlignmentC = cutlass::detail::get_alignment_count_from_gmem_tiled_copy<
-      typename CollectiveEpilogue::GmemTiledCopyC, ElementC>();
-  static int constexpr kAlignmentD = cutlass::detail::get_alignment_count_from_gmem_tiled_copy<
-      typename CollectiveEpilogue::GmemTiledCopyD, ElementD>();
-
-  using EpilogueOutputOp = typename CollectiveEpilogue::ThreadEpilogueOp;
-
-  /// Argument structure: User API
-  using Arguments = typename ConvKernel::Arguments;
-  /// Argument structure: Kernel API
-  using Params = typename ConvKernel::Params;
-
-private:
-
-  /// Kernel API parameters object
-  Params params_;
-
-public:
-
-  /// Access the Params structure
-  Params const& params() const {
-    return params_;
-  }
-
-  /// Determines whether the conv can execute the given problem.
-  static Status
-  can_implement(Arguments const& args) {
-    if (ConvKernel::can_implement(args)) {
-      return Status::kSuccess;
-    }
-    else {
-      return Status::kInvalid;
-    }
-  }
-
-  /// Gets the workspace size
-  static size_t
-  get_workspace_size(Arguments const& args) {
-    size_t workspace_bytes = 0;
-    CUTLASS_TRACE_HOST("  workspace_bytes: " << workspace_bytes);
-
-    workspace_bytes += ConvKernel::get_workspace_size(args);
-    return workspace_bytes;
-  }
-
-  /// Computes the grid shape
-  static dim3
-  get_grid_shape(Arguments const& args, void* workspace = nullptr) {
-    auto tmp_params = ConvKernel::to_underlying_arguments(args, workspace);
-    return ConvKernel::get_grid_shape(tmp_params);
-  }
-
-  /// Computes the grid shape
-  static dim3
-  get_grid_shape(Params const& params) {
-    return ConvKernel::get_grid_shape(params);
-  }
-
-  /// Computes the maximum number of active blocks per multiprocessor
-  static int maximum_active_blocks(int /* smem_capacity */ = -1) {
-    CUTLASS_TRACE_HOST("ConvUniversal::maximum_active_blocks()");
-    int max_active_blocks = -1;
-    int smem_size = ConvKernel::SharedStorageSize;
-
-    // first, account for dynamic smem capacity if needed
-    cudaError_t result;
-    if (smem_size >= (48 << 10)) {
-      CUTLASS_TRACE_HOST("  Setting smem size to " << smem_size);
-      result = cudaFuncSetAttribute(
-          device_kernel<ConvKernel>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize,
-          smem_size);
-      if (cudaSuccess != result) {
-        result = cudaGetLastError(); // to clear the error bit
-        CUTLASS_TRACE_HOST(
-          "  cudaFuncSetAttribute() returned error: "
-          << cudaGetErrorString(result));
-        return -1;
-      }
-    }
-
-    // query occupancy after setting smem size
-    result = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-        &max_active_blocks,
-        device_kernel<ConvKernel>,
-        ConvKernel::MaxThreadsPerBlock,
-        smem_size);
-
-    if (cudaSuccess != result) {
-      result = cudaGetLastError(); // to clear the error bit
-      CUTLASS_TRACE_HOST(
-        "  cudaOccupancyMaxActiveBlocksPerMultiprocessor() returned error: "
-        << cudaGetErrorString(result));
-      return -1;
-    }
-
-    CUTLASS_TRACE_HOST("  max_active_blocks: " << max_active_blocks);
-    return max_active_blocks;
-  }
-
-  /// Initializes conv state from arguments.
-  Status
-  initialize(
-    Arguments const& args,
-    void* workspace = nullptr,
-    cudaStream_t stream = nullptr,
-    CudaHostAdapter *cuda_adapter = nullptr) {
-
-    CUTLASS_TRACE_HOST("ConvUniversal::initialize() - workspace "
-      << workspace << ", stream: " << (stream ? "non-null" : "null"));
-
-    // Initialize the workspace
-    Status status = ConvKernel::initialize_workspace(args, workspace, stream, cuda_adapter);
-    if (status != Status::kSuccess) {
-      return status;
-    }
-
-    // Initialize the Params structure
-    params_ = ConvKernel::to_underlying_arguments(args, workspace);
-
-    // Don't set the function attributes - require the CudaHostAdapter to set it.
-    if constexpr (kEnableCudaHostAdapter) {
-      CUTLASS_ASSERT(cuda_adapter);
-      return Status::kSuccess;
-    }
-    else {
-      // account for dynamic smem capacity if needed
-      int smem_size = ConvKernel::SharedStorageSize;
-      if (smem_size >= (48 << 10)) {
-        CUTLASS_TRACE_HOST("  Setting smem size to " << smem_size);
-        cudaError_t result = cudaFuncSetAttribute(
-            device_kernel<ConvKernel>,
-            cudaFuncAttributeMaxDynamicSharedMemorySize,
-            smem_size);
-        if (cudaSuccess != result) {
-          result = cudaGetLastError(); // to clear the error bit
-          CUTLASS_TRACE_HOST("  cudaFuncSetAttribute() returned error: " << cudaGetErrorString(result));
-          return Status::kErrorInternal;
-        }
-      }
-    }
-    return Status::kSuccess;
-  }
-
-  /// Update API is preserved in 3.0, but does not guarantee a lightweight update of params.
-  Status
-  update(Arguments const& args, void* workspace = nullptr) {
-    CUTLASS_TRACE_HOST("ConvUniversal()::update() - workspace: " << workspace);
-
-    size_t workspace_bytes = get_workspace_size(args);
-    if (workspace_bytes > 0 && nullptr == workspace) {
-      return Status::kErrorWorkspaceNull;
-    }
-
-    params_ = ConvKernel::to_underlying_arguments(args, workspace);
-    return Status::kSuccess;
-  }
-
-  /// Primary run() entry point API that is static allowing users to create and manage their own params.
-  /// Supplied params struct must be construct by calling ConvKernel::to_underling_arguments()
-  static Status
-  run(Params& params, cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr, int32_t kernel_index = 0) {
-    CUTLASS_TRACE_HOST("ConvUniversal::run()");
-    dim3 const block = ConvKernel::get_block_shape();
-    dim3 const grid = get_grid_shape(params);
-
-    // configure smem size and carveout
-    int smem_size = ConvKernel::SharedStorageSize;
-
-    Status launch_result;
-    // Use extended launch API only for mainloops that use it
-    if constexpr (ConvKernel::ArchTag::kMinComputeCapability >= 90) {
-      [[maybe_unused]] constexpr bool is_static_1x1x1 =
-        cute::is_static_v<typename ConvKernel::DispatchPolicy::ClusterShape> and
-        cute::size(typename ConvKernel::DispatchPolicy::ClusterShape{}) == 1;
-      dim3 cluster(cute::size<0>(typename ConvKernel::DispatchPolicy::ClusterShape{}),
-                   cute::size<1>(typename ConvKernel::DispatchPolicy::ClusterShape{}),
-                   cute::size<2>(typename ConvKernel::DispatchPolicy::ClusterShape{}));
-      void* kernel_params[] = {&params};
-      if constexpr (kEnableCudaHostAdapter) {
-        //
-        // Use the cuda host adapter
-        //
-        CUTLASS_ASSERT(cuda_adapter);
-        if (cuda_adapter) {
-
-          launch_result = cuda_adapter->launch(grid,
-                                               cluster, 
-                                               block, 
-                                               smem_size, 
-                                               stream, 
-                                               kernel_params,
-                                               kernel_index);
-        }
-        else {
-          return Status::kErrorInternal;
-        }
-      }
-      else {
-        CUTLASS_ASSERT(cuda_adapter == nullptr);
-        void const* kernel = (void const*) device_kernel<ConvKernel>;
-        if constexpr (ConvKernel::ArchTag::kMinComputeCapability == 90) {
-          if constexpr (is_static_1x1x1) {
-            device_kernel<ConvKernel><<<grid, block, smem_size, stream>>>(params);
-            launch_result = Status::kSuccess;
-          }
-          else {
-            launch_result = ClusterLauncher::launch(
-                grid, cluster, block, smem_size, stream, kernel, kernel_params);
-          }
-        }
-      }
-    }
-    else {
-      launch_result = Status::kSuccess;
-
-      if constexpr (kEnableCudaHostAdapter) {
-        CUTLASS_ASSERT(cuda_adapter);
-        if (cuda_adapter) {
-          void* kernel_params[] = {&params};
-
-          launch_result = cuda_adapter->launch(
-              grid, block, smem_size, stream, kernel_params, 0
-              );
-
-        }
-        else {
-          return Status::kErrorInternal;
-        }
-      }
-      else {
-        CUTLASS_ASSERT(cuda_adapter == nullptr);
-        device_kernel<ConvKernel><<<grid, block, smem_size, stream>>>(params);
-      }
-    }
-
-    cudaError_t result = cudaGetLastError();
-    if (cudaSuccess == result && Status::kSuccess == launch_result) {
-      return Status::kSuccess;
-    }
-    else {
-      CUTLASS_TRACE_HOST("  Kernel launch failed. Reason: " << result);
-      return Status::kErrorInternal;
-    }
-  }
-
-  //
-  // Non-static launch overloads that first create and set the internal params struct of this kernel handle.
-  //
-
-  /// Launches the kernel after first constructing Params internal state from supplied arguments.
-  Status
-  run(
-    Arguments const& args,
-    void* workspace = nullptr,
-    cudaStream_t stream = nullptr,
-    CudaHostAdapter *cuda_adapter = nullptr,
-    int32_t kernel_index = 0
-  ) {
-    Status status = initialize(args, workspace, stream, cuda_adapter);
-    if (Status::kSuccess == status) {
-      status = run(params_, stream, cuda_adapter, kernel_index);
-    }
-    return status;
-  }
-
-  /// Launches the kernel after first constructing Params internal state from supplied arguments.
-  Status
-  operator()(
-    Arguments const& args,
-    void* workspace = nullptr,
-    cudaStream_t stream = nullptr,
-    CudaHostAdapter *cuda_adapter = nullptr) {
-    return run(args, workspace, stream, cuda_adapter);
-  }
-
-  /// Overload that allows a user to re-launch the same kernel without updating internal params struct.
-  Status
-  run(cudaStream_t stream = nullptr) {
-    return run(params_, stream);
-  }
-
-  /// Overload that allows a user to re-launch the same kernel without updating internal params struct.
-  Status
-  operator()(cudaStream_t stream = nullptr) {
-    return run(params_, stream);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::conv::device
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/device/direct_convolution.h b/lightllm-kernel/cutlass/include/cutlass/conv/device/direct_convolution.h
deleted file mode 100755
index 43ab94b5f..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/device/direct_convolution.h
+++ /dev/null
@@ -1,270 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/* \file
-   \brief Template for device-level Depthwise Convolution
-*/
-
-#pragma once
-
-#include <limits>
-
-#include "cutlass/cutlass.h"
-#include "cutlass/device_kernel.h"
-#include "cutlass/conv/convolution.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace device {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template<typename DirectConvolutionKernel_>
-class DirectConvolution {
-public:
-
-  using UnderlyingKernel = DirectConvolutionKernel_;
-
-  using ElementA = typename UnderlyingKernel::ElementA;
-  using LayoutA = typename UnderlyingKernel::LayoutA;
-  using ElementB = typename UnderlyingKernel::ElementB;
-  using LayoutB = typename UnderlyingKernel::LayoutB;
-  using ElementC = typename UnderlyingKernel::ElementC;
-  using LayoutC = typename UnderlyingKernel::LayoutC;
-  using ElementAccumulator = typename UnderlyingKernel::ElementAccumulator;
-  using ElementCompute = typename UnderlyingKernel::ElementCompute;
-  using OperatorClass = typename UnderlyingKernel::OperatorClass;
-  using ArchTag = typename UnderlyingKernel::ArchTag;
-  using ThreadblockShape = typename UnderlyingKernel::ThreadblockShape;
-  using WarpShape = typename UnderlyingKernel::WarpShape;
-  using InstructionShape = typename UnderlyingKernel::InstructionShape;
-  using ThreadblockSwizzle = typename UnderlyingKernel::ThreadblockSwizzle;
-  using EpilogueOutputOp = typename UnderlyingKernel::EpilogueOutputOp;
-  static int const kStages = UnderlyingKernel::kStages;
-  static int const kConvDim = UnderlyingKernel::kConvDim;
-  using WarpMmaOperator = typename UnderlyingKernel::WarpMmaOperator;
-  using ArchMmaOperator = typename UnderlyingKernel::ArchMmaOperator;
-  using MathOperator = typename UnderlyingKernel::MathOperator; 
-
-  static cutlass::conv::Operator const kConvolutionalOperator = UnderlyingKernel::kConvolutionalOperator;
-  static cutlass::conv::IteratorAlgorithm const kIteratorAlgorithm = UnderlyingKernel::kIteratorAlgorithm;
-  static cutlass::conv::StrideSupport const kStrideSupport = UnderlyingKernel::kStrideSupport;
-  static cutlass::conv::GroupMode const kGroupMode = UnderlyingKernel::kGroupMode;
-
-  static int const kWarpCount = 
-    (ThreadblockShape::kM / WarpShape::kM) * 
-    (ThreadblockShape::kN / WarpShape::kN) *
-    (ThreadblockShape::kK / WarpShape::kK);
-
-  /// Argument structure
-  using Arguments = typename UnderlyingKernel::Arguments;
-
-  using ReorderKernel = typename UnderlyingKernel::ReorderKernel;
-
- private:
-
-  /// Kernel parameters object
-  typename UnderlyingKernel::Params params_;
-
-public:
-
-  /// Constructs Implicit GEMM
-  DirectConvolution() { }
-
-  /// Determines whether the Implicit GEMM can execute the given problem.
-  static Status can_implement(Arguments const &args) {
-
-    // dispatch to iterators
-    Status status = UnderlyingKernel::Mma::IteratorA::can_implement(args.problem_size);
-    if (Status::kSuccess != status) {
-      return status;
-    }
-
-    status = UnderlyingKernel::Mma::IteratorB::can_implement(args.problem_size);
-    if (Status::kSuccess != status) {
-      return status;
-    }
-
-    if (kGroupMode != conv::GroupMode::kDepthwise) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    // C and K should be multiple of groups
-    if (args.problem_size.K != args.problem_size.groups &&
-      args.problem_size.C != args.problem_size.groups) {
-      return Status::kErrorInvalidProblem;
-    }
-    
-
-    static int const kAlignmentC = UnderlyingKernel::Epilogue::OutputTileIterator::kElementsPerAccess;
-    if (kConvolutionalOperator == conv::Operator::kFprop) {
-      if (args.problem_size.K % kAlignmentC)
-        return Status::kErrorMisalignedOperand;
-    } else if (kConvolutionalOperator == conv::Operator::kDgrad) {
-       if (args.problem_size.C % kAlignmentC)
-        return Status::kErrorMisalignedOperand;
-    } else if (kConvolutionalOperator == conv::Operator::kWgrad) {
-       if (args.problem_size.C % kAlignmentC)
-        return Status::kErrorMisalignedOperand;
-    }
-
-    // Determine grid shape
-    ThreadblockSwizzle threadblock_swizzle;
-
-    dim3 grid = threadblock_swizzle.get_grid_shape(
-      threadblock_swizzle.get_tiled_shape(
-        kConvolutionalOperator,
-        args.problem_size,
-        {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-        args.problem_size.split_k_slices));
-
-    if (!(grid.y <= std::numeric_limits<uint16_t>::max() &&
-          grid.z <= std::numeric_limits<uint16_t>::max())) {
-
-      return Status::kErrorInvalidProblem;
-    }
-
-    return Status::kSuccess;
-  }
-
-  /// Gets the workspace size
-  static size_t get_workspace_size(Arguments const &args) {  
-    return 0;
-  }
-
-  /// Initializes GEMM state from arguments.
-  Status initialize(
-    Arguments const &args, 
-    void *workspace = nullptr, 
-    cudaStream_t stream = nullptr) {
-    
-    // initialize the params structure from the arguments
-    params_ = typename UnderlyingKernel::Params(
-    	args,
-    	static_cast<int *>(workspace)
-    );
-    
-    int smem_size = int(sizeof(typename UnderlyingKernel::SharedStorage));
-
-    if (smem_size >= (48 << 10)) {
-      cudaError_t result = cudaFuncSetAttribute(cutlass::Kernel<UnderlyingKernel>,
-                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
-                                    smem_size);
-
-      if (result != cudaSuccess) {
-        return Status::kErrorInternal;
-      }
-    }
-    
-    return Status::kSuccess;
-  }
-
-  /// Initializes GEMM state from arguments.
-  Status update(Arguments const &args, void *workspace = nullptr) {
-
-    // update the params structure from the arguments
-    params_.ptr_A = args.ref_A.data();
-    params_.ptr_B = args.ref_B.data();
-    params_.ptr_C = args.ref_C.data();
-    params_.ptr_D = args.ref_D.data();
-    params_.output_op = args.output_op;
-    params_.ptr_reordered_B = args.ref_reordered_B.data();
-    params_.semaphore = static_cast<int *>(workspace);
-
-    return Status::kSuccess;
-  }
-
-  /// Runs the kernel using initialized state.
-  Status run(cudaStream_t stream = nullptr) {
-
-    // Launch reorder kernel
-    if (params_.ptr_reordered_B != nullptr) {
-      dim3 grid = ReorderKernel::get_grid_shape(params_);
-      dim3 block = ReorderKernel::get_block_shape();
-
-      cutlass::arch::synclog_setup();
-      cutlass::Kernel<ReorderKernel><<<grid, block, 0, stream>>>(params_);
-    }
-
-    // Launch main kernel
-    ThreadblockSwizzle threadblock_swizzle;
-
-    dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
-    dim3 block(32 * kWarpCount, 1, 1);
-
-    // Dynamic SMEM size based on input params.
-    int smem_size = int(params_.get_smem_size());
-
-    // Make sure we can use that much shared memory.
-    cudaError_t status = 
-        cudaFuncSetAttribute(cutlass::Kernel<UnderlyingKernel>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size);
-    if (status != cudaSuccess)
-      return Status::kErrorInternal;
-
-    cutlass::arch::synclog_setup();
-    cutlass::Kernel<UnderlyingKernel><<<grid, block, smem_size, stream>>>(params_);
-
-    cudaError_t result = cudaGetLastError();
-
-    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(cudaStream_t stream = nullptr) {
-    return run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(
-    Arguments const &args, 
-    void *workspace = nullptr, 
-    cudaStream_t stream = nullptr) {
-    
-    Status status = initialize(args, workspace, stream);
-    
-    if (status == Status::kSuccess) {
-      status = run(stream);
-    }
-
-    return status;
-  }
-
-  int get_smem_size() { return int(params_.get_smem_size()); }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}
-}
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/device/implicit_gemm_convolution.h b/lightllm-kernel/cutlass/include/cutlass/conv/device/implicit_gemm_convolution.h
deleted file mode 100755
index a1cb06e98..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/device/implicit_gemm_convolution.h
+++ /dev/null
@@ -1,361 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/* \file
-   \brief Template for device-level Implicit GEMM Convolution
-*/
-
-#pragma once
-
-#include <limits>
-
-#include "cutlass/cutlass.h"
-#include "cutlass/device_kernel.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/cuda_host_adapter.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace device {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template<typename ImplicitGemmKernel_>
-class ImplicitGemmConvolution {
-public:
-
-  using UnderlyingKernel = GetUnderlyingKernel_t<ImplicitGemmKernel_>;
-
-  using ElementA = typename UnderlyingKernel::ElementA;
-  using LayoutA = typename UnderlyingKernel::LayoutA;
-  using ElementB = typename UnderlyingKernel::ElementB;
-  using LayoutB = typename UnderlyingKernel::LayoutB;
-  using ElementC = typename UnderlyingKernel::ElementC;
-  using LayoutC = typename UnderlyingKernel::LayoutC;
-  using ElementAccumulator = typename UnderlyingKernel::ElementAccumulator;
-  using ElementCompute = typename UnderlyingKernel::ElementCompute;
-  using OperatorClass = typename UnderlyingKernel::OperatorClass;
-  using ArchTag = typename UnderlyingKernel::ArchTag;
-  using ThreadblockShape = typename UnderlyingKernel::ThreadblockShape;
-  using WarpShape = typename UnderlyingKernel::WarpShape;
-  using InstructionShape = typename UnderlyingKernel::InstructionShape;
-  using ThreadblockSwizzle = typename UnderlyingKernel::ThreadblockSwizzle;
-  using EpilogueOutputOp = typename UnderlyingKernel::EpilogueOutputOp;
-  static int const kStages = UnderlyingKernel::kStages;
-  static int const kConvDim = UnderlyingKernel::kConvDim;
-  using WarpMmaOperator = typename UnderlyingKernel::WarpMmaOperator;
-  using ArchMmaOperator = typename UnderlyingKernel::ArchMmaOperator;
-  using MathOperator = typename UnderlyingKernel::MathOperator; 
-
-  static cutlass::conv::Operator const kConvolutionalOperator = UnderlyingKernel::kConvolutionalOperator;
-  static cutlass::conv::IteratorAlgorithm const kIteratorAlgorithm = UnderlyingKernel::kIteratorAlgorithm;
-  static cutlass::conv::StrideSupport const kStrideSupport = UnderlyingKernel::kStrideSupport;
-  static cutlass::conv::GroupMode const kGroupMode = UnderlyingKernel::kGroupMode;
-
-  static bool const kEnableCudaHostAdapter = CUTLASS_ENABLE_CUDA_HOST_ADAPTER;
-
-  static int const kWarpCount = 
-    (ThreadblockShape::kM / WarpShape::kM) * 
-    (ThreadblockShape::kN / WarpShape::kN) *
-    (ThreadblockShape::kK / WarpShape::kK);
-
-  /// Argument structure
-  using Arguments = typename UnderlyingKernel::Arguments;
-
-private:
-
-  /// Kernel parameters object
-  typename UnderlyingKernel::Params params_;
-
-public:
-
-  /// Constructs Implicit GEMM
-  ImplicitGemmConvolution() { }
-
-  /// Determines whether the Implicit GEMM can execute the given problem.
-  static Status can_implement(Arguments const &args) {
-    // dispatch to iterators
-    Status status = UnderlyingKernel::Mma::IteratorA::can_implement(args.problem_size);
-    if (Status::kSuccess != status) {
-      return status;
-    }
-
-    status = UnderlyingKernel::Mma::IteratorB::can_implement(args.problem_size);
-    if (Status::kSuccess != status) {
-      return status;
-    }
-
-    // check group conv constraint
-    if (args.problem_size.groups != 1) {
-      if (kGroupMode == conv::GroupMode::kNone) {
-        return Status::kErrorInvalidProblem;
-      } 
-
-      // C and K should be multiple of groups
-      if (args.problem_size.K % args.problem_size.groups ||
-        args.problem_size.C % args.problem_size.groups) {
-        return Status::kErrorInvalidProblem;
-      }
-
-      // split-k is not supported
-      if (args.problem_size.split_k_slices != 1) {
-        return Status::kErrorInvalidProblem;
-      }
-
-      int k_per_group = args.problem_size.K / args.problem_size.groups;
-      // k_per_group should be multiple of ThreadblockShape N, one CTA calculate one group
-      if (kGroupMode == conv::GroupMode::kSingleGroup && k_per_group % ThreadblockShape::kN) {
-        return Status::kErrorInvalidProblem;
-      }
-      // ThreadblockShape::kN should be divisible by k_per_group, one CTA calculate multiple groups
-      if (kGroupMode == conv::GroupMode::kMultipleGroup && ThreadblockShape::kN % k_per_group) {
-        return Status::kErrorInvalidProblem;
-      }
-
-      // current optimized iterator algo only supports SingleGroup mode
-      if (kIteratorAlgorithm == IteratorAlgorithm::kOptimized &&
-        kGroupMode != conv::GroupMode::kSingleGroup) {
-        return Status::kErrorInvalidProblem;
-      }
-    }
-
-    static int const kAlignmentC = UnderlyingKernel::Epilogue::OutputTileIterator::kElementsPerAccess;
-    if (kConvolutionalOperator == conv::Operator::kFprop) {
-      if (args.problem_size.K % kAlignmentC)
-        return Status::kErrorMisalignedOperand;
-    } else if (kConvolutionalOperator == conv::Operator::kDgrad || kConvolutionalOperator == conv::Operator::kDeconv) {
-       if (args.problem_size.C % kAlignmentC)
-        return Status::kErrorMisalignedOperand;
-    } else if (kConvolutionalOperator == conv::Operator::kWgrad) {
-       if (args.problem_size.C % kAlignmentC)
-        return Status::kErrorMisalignedOperand;
-    }
-
-    // check for unsupported problem sizes for strided dgrad / deconv implementation
-    if ((kConvolutionalOperator == conv::Operator::kDgrad || kConvolutionalOperator == conv::Operator::kDeconv) &&
-      kStrideSupport == conv::StrideSupport::kStrided) {
-      // split-k (serial or parallel) is not supported for strided dgrad / deconv
-      if(args.problem_size.split_k_slices > 1 && (args.problem_size.stride().at(args.problem_size.stride().max_dim_index()) > 1)) {
-        return Status::kErrorNotSupported;
-      }
-
-      // dilation > {1x1} is not supported for strided dgrad / deconv
-      if(args.problem_size.dilation_h > 1 || args.problem_size.dilation_w > 1) {
-        return Status::kErrorNotSupported;
-      }
-    }
-
-    // Determine grid shape
-    ThreadblockSwizzle threadblock_swizzle;
-
-    dim3 grid = threadblock_swizzle.get_grid_shape(
-      threadblock_swizzle.get_tiled_shape(
-        kConvolutionalOperator,
-        args.problem_size,
-        {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-        args.problem_size.split_k_slices));
-
-    if (!(grid.y <= std::numeric_limits<uint16_t>::max() &&
-          grid.z <= std::numeric_limits<uint16_t>::max())) {
-
-      return Status::kErrorInvalidProblem;
-    }
-
-    return Status::kSuccess;
-  }
-
-  /// Gets the workspace size
-  static size_t get_workspace_size(Arguments const &args) {
-  
-    size_t workspace_bytes = 0;
-
-    // Determine grid shape
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
-        kConvolutionalOperator,
-        args.problem_size,
-        {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-        args.problem_size.split_k_slices);
-
-    if(args.split_k_mode == SplitKMode::kParallel) {
-
-      // Split-K parallel: CTAs in k-dimension write the partial results in a temporary workspace.
-      // The user needs to call a reduction operator to optain the final output tensor
-      workspace_bytes = 
-        sizeof(ElementAccumulator) *
-        size_t(cutlass::conv::implicit_gemm_tensor_c_size(kConvolutionalOperator, args.problem_size)) *
-        size_t(grid_tiled_shape.k());
-    }
-
-    else if(args.split_k_mode == SplitKMode::kSerial && args.problem_size.split_k_slices > 1) {
-
-      // Split-K serial: The user workspace is used to store semaphore and serialize writing the 
-      // final reduced output to user's output tensor
-      workspace_bytes = sizeof(int) * size_t(grid_tiled_shape.m()) * size_t(grid_tiled_shape.n());
-    }
-
-    return workspace_bytes;
-  }
-
-  /// Initializes GEMM state from arguments.
-  Status initialize(
-    Arguments const &args, 
-    void *workspace = nullptr, 
-    cudaStream_t stream = nullptr,
-    CudaHostAdapter *cuda_adapter = nullptr) {
-   
-    if (args.problem_size.split_k_slices > 1) {
-
-      if (!workspace) {
-        return Status::kErrorWorkspaceNull;
-      }
-
-      cudaError_t status = cudaMemsetAsync(workspace, 0, get_workspace_size(args), stream);
-
-      if (status != cudaSuccess) {
-        return Status::kErrorInternal;
-      }
-    }
-
-    // initialize the params structure from the arguments
-    params_ = typename UnderlyingKernel::Params(
-    	args,
-    	static_cast<int *>(workspace)
-    );
-
-    if constexpr (kEnableCudaHostAdapter) {
-      CUTLASS_ASSERT(cuda_adapter);
-      return Status::kSuccess;
-    }
-    else {
-      int smem_size = int(sizeof(typename UnderlyingKernel::SharedStorage));
-  
-      if (smem_size >= (48 << 10)) {
-        cudaError_t result = cudaFuncSetAttribute(cutlass::Kernel<UnderlyingKernel>,
-                                      cudaFuncAttributeMaxDynamicSharedMemorySize,
-                                      smem_size);
-  
-        if (result != cudaSuccess) {
-          return Status::kErrorInternal;
-        }
-      }
-    }
-    
-    return Status::kSuccess;
-  }
-
-  /// Initializes GEMM state from arguments.
-  Status update(Arguments const &args, void *workspace = nullptr) {
-
-    // update the params structure from the arguments
-    params_.ptr_A = args.ref_A.data();
-    params_.ptr_B = args.ref_B.data();
-    params_.ptr_C = args.ref_C.data();
-    params_.ptr_D = args.ref_D.data();
-    params_.output_op = args.output_op;
-    params_.semaphore = static_cast<int *>(workspace);
-
-    return Status::kSuccess;
-  }
-
-  /// Runs the kernel using initialized state.
-  Status run(cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr, int32_t kernel_index = 0) {
-
-
-    ThreadblockSwizzle threadblock_swizzle;
-
-    dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
-    dim3 block(32 * kWarpCount, 1, 1);
-
-    int smem_size = int(sizeof(typename UnderlyingKernel::SharedStorage));
-    cutlass::Status launch_result = cutlass::Status::kSuccess ;
-
-    if constexpr (kEnableCudaHostAdapter) {
-        //
-        // Use the cuda host adapter
-        //
-        CUTLASS_ASSERT(cuda_adapter);
-        if (cuda_adapter) {
-
-          void* kernel_params[] = {&params_};
-          launch_result = cuda_adapter->launch(
-              grid, dim3(1,1,1), block, smem_size, stream, kernel_params, kernel_index
-              );
-        }
-        else {
-          launch_result = Status::kErrorInternal;
-        }
-    }
-    else {
-      cutlass::arch::synclog_setup();
-      cutlass::Kernel<UnderlyingKernel><<<grid, block, smem_size, stream>>>(params_);      
-    }
-
-    cudaError_t result = cudaGetLastError();
-    if (cudaSuccess == result && Status::kSuccess == launch_result) {
-      return Status::kSuccess;
-    }
-    else {
-      CUTLASS_TRACE_HOST("  Kernel launch failed. Reason: " << result);
-      return Status::kErrorInternal;
-    }
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr, int32_t kernel_index = 0) {
-    return run(stream, cuda_adapter, kernel_index);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(
-    Arguments const &args, 
-    void *workspace = nullptr, 
-    cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr, int32_t kernel_index = 0) {
-    
-    Status status = initialize(args, workspace, stream, cuda_adapter);
-    
-    if (status == Status::kSuccess) {
-      status = run(stream, cuda_adapter, kernel_index);
-    }
-
-    return status;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}
-}
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/device/implicit_gemm_convolution_fusion.h b/lightllm-kernel/cutlass/include/cutlass/conv/device/implicit_gemm_convolution_fusion.h
deleted file mode 100755
index 265156cc5..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/device/implicit_gemm_convolution_fusion.h
+++ /dev/null
@@ -1,269 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/* \file
-   \brief Template for device-level fused activation's scale+bias+relu and Implicit GEMM Convolution
-*/
-
-#pragma once
-
-#include <limits>
-
-#include "cutlass/cutlass.h"
-#include "cutlass/device_kernel.h"
-#include "cutlass/conv/convolution.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace device {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template<typename ImplicitGemmFusionKernel_>
-class ImplicitGemmConvolutionFusion {
-public:
-
-  using ImplicitGemmFusionKernel = ImplicitGemmFusionKernel_;
-
-  using ElementA = typename ImplicitGemmFusionKernel::ElementA;
-  using LayoutA = typename ImplicitGemmFusionKernel::LayoutA;
-  using ElementB = typename ImplicitGemmFusionKernel::ElementB;
-  using LayoutB = typename ImplicitGemmFusionKernel::LayoutB;
-
-//  using ElementScaleBias = typename ImplicitGemmFusionKernel::ElementScaleBias;
-//  using LayoutScaleBias = typename ImplicitGemmFusionKernel::LayoutScaleBias;
-
-  using ElementC = typename ImplicitGemmFusionKernel::ElementC;
-  using LayoutC = typename ImplicitGemmFusionKernel::LayoutC;
-  using ElementAccumulator = typename ImplicitGemmFusionKernel::ElementAccumulator;
-  using ElementCompute = typename ImplicitGemmFusionKernel::ElementCompute;
-  using OperatorClass = typename ImplicitGemmFusionKernel::OperatorClass;
-  using ArchTag = typename ImplicitGemmFusionKernel::ArchTag;
-  using ThreadblockShape = typename ImplicitGemmFusionKernel::ThreadblockShape;
-  using WarpShape = typename ImplicitGemmFusionKernel::WarpShape;
-  using InstructionShape = typename ImplicitGemmFusionKernel::InstructionShape;
-  using ThreadblockSwizzle = typename ImplicitGemmFusionKernel::ThreadblockSwizzle;
-  using EpilogueOutputOp = typename ImplicitGemmFusionKernel::EpilogueOutputOp;
-  static int const kStages = ImplicitGemmFusionKernel::kStages;
-  static int const kConvDim = ImplicitGemmFusionKernel::kConvDim;
-  using WarpMmaOperator = typename ImplicitGemmFusionKernel::WarpMmaOperator;
-  using ArchMmaOperator = typename ImplicitGemmFusionKernel::ArchMmaOperator;
-  using MathOperator = typename ImplicitGemmFusionKernel::MathOperator; 
-
-  static cutlass::conv::Operator const kConvolutionalOperator = ImplicitGemmFusionKernel::kConvolutionalOperator;
-  static cutlass::conv::IteratorAlgorithm const kIteratorAlgorithm = ImplicitGemmFusionKernel::kIteratorAlgorithm;
-
-  static int const kWarpCount = 
-    (ThreadblockShape::kM / WarpShape::kM) * 
-    (ThreadblockShape::kN / WarpShape::kN) *
-    (ThreadblockShape::kK / WarpShape::kK);
-
-  /// Argument structure
-  using Arguments = typename ImplicitGemmFusionKernel::Arguments;
-
-private:
-
-  /// Kernel parameters object
-  typename ImplicitGemmFusionKernel::Params params_;
-
-public:
-
-  /// Constructs Implicit GEMM
-  ImplicitGemmConvolutionFusion() { }
-
-  /// Determines whether the Implicit GEMM can execute the given problem.
-  static Status can_implement(Arguments const &args) {
-
-    // dispatch to iterators
-    Status status = ImplicitGemmFusionKernel::Mma::IteratorA::can_implement(args.problem_size);
-    if (Status::kSuccess != status) {
-      return status;
-    }
-
-    status = ImplicitGemmFusionKernel::Mma::IteratorB::can_implement(args.problem_size);
-    if (Status::kSuccess != status) {
-      return status;
-    }
-
-    // Determine grid shape
-    ThreadblockSwizzle threadblock_swizzle;
-
-    dim3 grid = threadblock_swizzle.get_grid_shape(
-      threadblock_swizzle.get_tiled_shape(
-        cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size),
-        {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-        args.problem_size.split_k_slices));
-
-    if (!(grid.y <= std::numeric_limits<uint16_t>::max() &&
-          grid.z <= std::numeric_limits<uint16_t>::max())) {
-
-      return Status::kErrorInvalidProblem;
-    }
-
-    return Status::kSuccess;
-  }
-
-  /// Gets the workspace size
-  static size_t get_workspace_size(Arguments const &args) {
-  
-    size_t workspace_bytes = 0;
-
-    // Determine grid shape
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
-        cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size),
-        {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-        args.problem_size.split_k_slices);
-
-    if(args.split_k_mode == SplitKMode::kParallel) {
-
-      // Split-K parallel: CTAs in k-dimension write the partial results in a temporary workspace.
-      // The user needs to call a reduction operator to optain the final output tensor
-      workspace_bytes = 
-        sizeof(ElementAccumulator) *
-        size_t(cutlass::conv::implicit_gemm_tensor_c_size(kConvolutionalOperator, args.problem_size)) *
-        size_t(grid_tiled_shape.k());
-    }
-
-    else if(args.split_k_mode == SplitKMode::kSerial && args.problem_size.split_k_slices > 1) {
-
-      // Split-K serial: The user workspace is used to store semaphore and serialize writing the 
-      // final reduced output to user's output tensor
-      workspace_bytes = sizeof(int) * size_t(grid_tiled_shape.m()) * size_t(grid_tiled_shape.n());
-    }
-
-    return workspace_bytes;
-  }
-
-  /// Initializes GEMM state from arguments.
-  Status initialize(
-    Arguments const &args, 
-    void *workspace = nullptr, 
-    cudaStream_t stream = nullptr) {
-   
-    if (args.problem_size.split_k_slices > 1) {
-
-      if (!workspace) {
-        return Status::kErrorWorkspaceNull;
-      }
-
-      cudaError_t status = cudaMemsetAsync(workspace, 0, get_workspace_size(args), stream);
-
-      if (status != cudaSuccess) {
-        return Status::kErrorInternal;
-      }
-    }
-
-    // initialize the params structure from the arguments
-    params_ = typename ImplicitGemmFusionKernel::Params(
-    	args,
-    	static_cast<int *>(workspace)
-    );
-    
-    int smem_size = int(sizeof(typename ImplicitGemmFusionKernel::SharedStorage));
-
-    if (smem_size >= (48 << 10)) {
-      cudaError_t result = cudaFuncSetAttribute(cutlass::Kernel<ImplicitGemmFusionKernel>,
-                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
-                                    smem_size);
-
-      if (result != cudaSuccess) {
-        return Status::kErrorInternal;
-      }
-    }
-    
-    return Status::kSuccess;
-  }
-
-  /// Initializes Impicit GEMM state from arguments.
-  Status update(Arguments const &args, void *workspace = nullptr) {
-
-    // update the params structure from the arguments
-    params_.ptr_A = args.ref_A.data();
-    params_.ptr_B = args.ref_B.data();
-    params_.ptr_scale = args.ref_A_scale.data();
-    params_.ptr_bias = args.ref_A_bias.data();
-    params_.ptr_C = args.ref_C.data();
-    params_.ptr_D = args.ref_D.data();
-    params_.output_op = args.output_op;
-    params_.semaphore = static_cast<int *>(workspace);
-
-    return Status::kSuccess;
-  }
-
-  /// Runs the kernel using initialized state.
-  Status run(cudaStream_t stream = nullptr) {
-
-    ThreadblockSwizzle threadblock_swizzle;
-
-    dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
-    dim3 block(32 * kWarpCount, 1, 1);
-
-    int smem_size = int(sizeof(typename ImplicitGemmFusionKernel::SharedStorage));
-
-    cutlass::arch::synclog_setup();
-    cutlass::Kernel<ImplicitGemmFusionKernel><<<grid, block, smem_size, stream>>>(params_);
-
-    cudaError_t result = cudaGetLastError();
-
-    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(cudaStream_t stream = nullptr) {
-    return run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(
-    Arguments const &args, 
-    void *workspace = nullptr, 
-    cudaStream_t stream = nullptr) {
-    
-    Status status = initialize(args, workspace, stream);
-    
-    if (status == Status::kSuccess) {
-      status = run(stream);
-    }
-
-    return status;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}
-}
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/dispatch_policy.hpp b/lightllm-kernel/cutlass/include/cutlass/conv/dispatch_policy.hpp
deleted file mode 100755
index b8b5eb2bf..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/dispatch_policy.hpp
+++ /dev/null
@@ -1,90 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/conv/convolution.h"
-#include "cutlass/epilogue/thread/activation.h"
-#include "cutlass/arch/arch.h"
-
-#include "cute/layout.hpp"
-#include "cute/numeric/integral_constant.hpp"
-
-#include "cutlass/gemm/dispatch_policy.hpp"
-
-//////////////////////////////////////////////////////////////////////////////
-
-//////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::conv {
-
-//////////////////////////////////////////////////////////////////////////////
-
-//
-// Policies for categorical dispatch of mainloop against kernel grid schedules
-//
-struct KernelImplicitTmaWarpSpecializedSm90 : cutlass::gemm::KernelTmaWarpSpecialized { };
-struct KernelImplicitTmaWarpSpecializedSm90Cooperative { };
-struct KernelImplicitTmaWarpSpecializedSm90Pingpong { };
-
-//
-// Collective Mainloop Policies
-//
-
-// n-buffer in smem (Hopper TMA), pipelined with Hopper GMMA and TMA, static schedule between TMA and GMMA
-// for fprop
-template<
-  conv::Operator ConvOp_,
-  int Stages_,
-  int NumSpatialDimensions_,
-  class ClusterShape_ = cute::Shape<cute::C<1>,cute::C<1>,cute::C<1>>,
-  class KernelSchedule = KernelImplicitTmaWarpSpecializedSm90,
-  int PipelineAsyncMmaStages_ = 1
->
-struct MainloopSm90TmaGmmaWarpSpecializedImplicitGemm {
-  static constexpr int Stages = Stages_;
-  static constexpr int NumSpatialDimensions = NumSpatialDimensions_;
-  static constexpr Operator ConvOp = ConvOp_;
-  static constexpr int PipelineAsyncMmaStages = PipelineAsyncMmaStages_;
-  using ClusterShape = ClusterShape_;
-  using ArchTag = arch::Sm90;
-  using Schedule = KernelSchedule;
-
-  static_assert(NumSpatialDimensions >= 1);
-  static_assert(! (cute::is_same_v<KernelSchedule,KernelImplicitTmaWarpSpecializedSm90Cooperative> ||
-                   cute::is_same_v<KernelSchedule,KernelImplicitTmaWarpSpecializedSm90Pingpong>),
-    "Persistent schedules not support for conv yet.");
-};
-
-//////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::conv 
-
-//////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/conv_universal.hpp b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/conv_universal.hpp
deleted file mode 100755
index 23ccea2f8..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/conv_universal.hpp
+++ /dev/null
@@ -1,65 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/conv/convnd_problem_shape.hpp"
-#include "cutlass/detail/dependent_false.hpp"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::conv::kernel {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/*
- * Stateless universal device CONV kernel type that treats CONV as
- * a composition of a collective mainloop and a collective epilogue.
-**/
-template <
-  class ProblemShape_,
-  class CollectiveMainloop_,
-  class CollectiveEpilogue_,
-  class TileSchedulerTag_ = void,
-  class Enable = void
->
-class ConvUniversal {
-  static_assert(cutlass::detail::dependent_false<Enable>,
-      "Could not find a valid specialization at the kernel layer to dispatch against.");
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::conv::kernel
-
-////////////////////////////////////////////////////////////////////////////////
-
-#include "cutlass/conv/kernel/sm90_implicit_gemm_tma_warpspecialized.hpp"
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d.h
deleted file mode 100755
index 79bedb2c8..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d.h
+++ /dev/null
@@ -1,322 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief
-      Default kernel-level implicit GEMM convolution definitions for threadblock-scoped epilogue.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/threadblock/default_mma.h"
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-#include "cutlass/conv/threadblock/threadblock_swizzle.h"
-#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
-#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
-#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
-#include "cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h"
-#include "cutlass/epilogue/threadblock/default_epilogue_with_reduction.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/threadblock/conv2d_tile_iterator.h"
-#include "cutlass/conv/threadblock/implicit_gemm_pipelined.h"
-#include "cutlass/conv/threadblock/implicit_gemm_multistage.h"
-#include "cutlass/conv/threadblock/implicit_gemm_fprop_fusion_multistage.h"
-#include "cutlass/conv/threadblock/implicit_gemm_wgrad_fusion_multistage.h"
-#include "cutlass/conv/kernel/implicit_gemm_convolution.h"
-#include "cutlass/conv/kernel/implicit_gemm_convolution_fusion.h"
-#include "cutlass/conv/kernel/implicit_gemm_convolution_strided_dgrad.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace detail {
-
-template <
-  typename ArchTag,
-  typename Shape,
-  typename WarpMmaTensorOp,
-  int PartitionsK,
-  typename OutputOp
->
-struct DefaultConvEpilogue {
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
-    Shape,
-    WarpMmaTensorOp,
-    PartitionsK,
-    OutputOp,
-    OutputOp::kCount
-  >::Epilogue;
-};
-
-template <
-  typename Shape,
-  typename WarpMmaTensorOp,
-  int PartitionsK,
-  typename OutputOp
->
-struct DefaultConvEpilogue<
-  arch::Sm70,
-  Shape,
-  WarpMmaTensorOp,
-  PartitionsK,
-  OutputOp
-> {
-
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueVoltaTensorOp<
-    Shape,
-    WarpMmaTensorOp,
-    PartitionsK,
-    OutputOp,
-    OutputOp::kCount
-  >::Epilogue;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-template <
-  typename ArchTag,
-  typename Shape,
-  typename WarpMmaSimt,
-  typename ElementOutput,
-  typename ElementTensor,
-  typename ElementVector,
-  typename OutputOp,
-  int ElementsPerAccess,
-  typename PermuteDLayout = layout::NoPermute,
-  conv::StrideSupport StrideSupport = conv::StrideSupport::kUnity,
-  int Rank = 4
->
-struct DefaultConvEpilogueWithBroadcastSimt {
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueWithBroadcastSimt<
-    Shape,
-    WarpMmaSimt,
-    ElementOutput,
-    ElementTensor,
-    ElementVector,
-    OutputOp,
-    ElementsPerAccess,
-    false,
-    PermuteDLayout,
-    StrideSupport,
-    Rank
-  >::Epilogue;
-};
-
-template <
-  typename ArchTag,
-  typename Shape,
-  typename WarpMmaSimt,
-  typename ElementOutput,
-  typename ElementTensor,
-  typename ElementVector,
-  typename OutputOp,
-  int ElementsPerAccess
->
-struct DefaultConvEpilogueWithBroadcastSimtStridedDgrad {
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueWithBroadcastSimtStridedDgrad<
-    Shape,
-    WarpMmaSimt,
-    ElementOutput,
-    ElementTensor,
-    ElementVector,
-    OutputOp,
-    ElementsPerAccess
-  >::Epilogue;
-};
-
-template <
-  typename ArchTag,
-  typename Shape,
-  typename WarpMmaTensorOp,
-  int PartitionsK,
-  typename ElementOutput,
-  typename ElementTensor,
-  typename ElementVector,
-  typename OutputOp,
-  int ElementsPerAccess
->
-struct DefaultConvEpilogueWithBroadcastTensorOp {
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueWithBroadcastTensorOp<
-    Shape,
-    WarpMmaTensorOp,
-    PartitionsK,
-    ElementOutput,
-    ElementTensor,
-    ElementVector,
-    OutputOp,
-    ElementsPerAccess
-  >::Epilogue;
-};
-
-template <
-  typename Shape,
-  typename WarpMmaTensorOp,
-  int PartitionsK,
-  typename ElementOutput,
-  typename ElementTensor,
-  typename ElementVector,
-  typename OutputOp,
-  int ElementsPerAccess
->
-struct DefaultConvEpilogueWithBroadcastTensorOp<
-  arch::Sm70,
-  Shape,
-  WarpMmaTensorOp,
-  PartitionsK,
-  ElementOutput,
-  ElementTensor,
-  ElementVector,
-  OutputOp,
-  ElementsPerAccess
-  > {
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueWithBroadcastVoltaTensorOp<
-    Shape,
-    WarpMmaTensorOp,
-    PartitionsK,
-    ElementOutput,
-    ElementTensor,
-    ElementVector,
-    OutputOp,
-    ElementsPerAccess
-  >::Epilogue;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename ArchTag,
-  typename Shape,
-  typename WarpMmaTensorOp,
-  int PartitionsK,
-  typename ElementOutput,
-  typename OutputOp,
-  typename ReductionOp,
-  int ElementsPerAccess
->
-struct DefaultConvEpilogueWithReductionTensorOp {
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueWithReductionTensorOp<
-    Shape,
-    WarpMmaTensorOp,
-    PartitionsK,
-    ElementOutput,
-    OutputOp,
-    ReductionOp,
-    ElementsPerAccess
-  >::Epilogue;
-};
-
-template <
-  typename Shape,
-  typename WarpMmaTensorOp,
-  int PartitionsK,
-  typename ElementOutput,
-  typename OutputOp,
-  typename ReductionOp,
-  int ElementsPerAccess
->
-struct DefaultConvEpilogueWithReductionTensorOp<
-  arch::Sm70,
-  Shape,
-  WarpMmaTensorOp,
-  PartitionsK,
-  ElementOutput,
-  OutputOp,
-  ReductionOp,
-  ElementsPerAccess
-  > {
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueWithReductionVoltaTensorOp<
-    Shape,
-    WarpMmaTensorOp,
-    PartitionsK,
-    ElementOutput,
-    OutputOp,
-    ReductionOp,
-    ElementsPerAccess
-  >::Epilogue;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Defaults for strided Dgrad
-template <
-  typename ArchTag,
-  typename Shape,
-  typename WarpMmaTensorOp,
-  int PartitionsK,
-  typename OutputOp
->
-struct DefaultConvEpilogueStridedDgrad {
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOpStridedDgrad<
-    Shape,
-    WarpMmaTensorOp,
-    PartitionsK,
-    OutputOp,
-    OutputOp::kCount
-  >::Epilogue;
-};
-
-template <
-  typename Shape,
-  typename WarpMmaTensorOp,
-  int PartitionsK,
-  typename OutputOp
->
-struct DefaultConvEpilogueStridedDgrad<
-  arch::Sm70,
-  Shape,
-  WarpMmaTensorOp,
-  PartitionsK,
-  OutputOp
-> {
-
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueVoltaTensorOpStridedDgrad<
-    Shape,
-    WarpMmaTensorOp,
-    PartitionsK,
-    OutputOp,
-    OutputOp::kCount
-  >::Epilogue;
-};
-
-} // namespace detail
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_dgrad.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_dgrad.h
deleted file mode 100755
index c5a8b1315..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_dgrad.h
+++ /dev/null
@@ -1,1927 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-    Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped 
-      matrix multiply-add with the appropriate threadblock-scoped epilogue.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/conv/kernel/default_conv2d.h"
-
-#include "cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h"
-#include "cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h" 
-#include "cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h"
-#include "cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h"
-#include "cutlass/conv/threadblock/conv2d_tile_iterator.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Defines a kernel for Conv2dDgrad
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename OperatorClass,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,
-  conv::StrideSupport StrideSupport = StrideSupport::kStrided,
-  /// Access granularity of A matrix in units of elements
-  int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value,
-  /// Access granularity of B matrix in units of elements
-  int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value
-> struct DefaultConv2dDgrad;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//                               OpClassTensorOp convolutions 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dDgrad specialization for Analytic IteratorAlgorithm Dgrad Strided and
-// multistage pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultConv2dDgrad <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic,
-  StrideSupport::kStrided,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA,
-      ThreadMapA,
-      StrideSupport::kStrided,
-      AccessTypeA
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB,
-      ThreadMapB,
-      StrideSupport::kStrided,
-      AccessTypeB
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  static cutlass::arch::CacheOperation::Kind const CacheOpB =
-      ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
-          ? cutlass::arch::CacheOperation::Global
-          : cutlass::arch::CacheOperation::Always;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    CacheOpB,
-    MmaPolicy,
-    Stages 
-  >;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOpStridedDgrad<
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDgrad
-  >;
-};
-
-/// Defines a kernel for Conv2dDgrad specialization for Analytic IteratorAlgorithm Dgrad Strided
-// and 2 stage pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultConv2dDgrad <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic,
-  StrideSupport::kStrided,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
-  using IteratorA =
-    cutlass::conv::threadblock::TileIteratorStridedDgrad<
-      cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA,
-        ThreadMapA,
-        StrideSupport::kStrided,
-        AccessTypeA 
-      >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
-  using IteratorB =
-    cutlass::conv::threadblock::TileIteratorStridedDgrad<
-      cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB,
-        ThreadMapB,
-        StrideSupport::kStrided,
-        AccessTypeB 
-      >
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  // Define the epilogue
-  using Epilogue = typename detail::DefaultConvEpilogueStridedDgrad<
-    ArchTag,
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    EpilogueOutputOp
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDgrad
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dDgrad specialization for Analytic IteratorAlgorithm Dgrad Unity Strided
-// and multistage pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultConv2dDgrad <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic,
-  StrideSupport::kUnity,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA,
-      ThreadMapA,
-      StrideSupport::kUnity,
-      AccessTypeA
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB,
-      ThreadMapB,
-      StrideSupport::kUnity,
-      AccessTypeB
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  static cutlass::arch::CacheOperation::Kind const CacheOpB =
-      ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
-          ? cutlass::arch::CacheOperation::Global
-          : cutlass::arch::CacheOperation::Always;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    CacheOpB,
-    MmaPolicy,
-    Stages 
-  >;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDgrad
-  >;
-};
-
-/// Defines a kernel for Conv2dDgrad specialization for Analytic IteratorAlgorithm Dgrad Unity
-// 2 stage pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultConv2dDgrad <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic,
-  StrideSupport::kUnity,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
-  using IteratorA =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA,
-        ThreadMapA,
-        StrideSupport::kUnity,
-        AccessTypeA
-      >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
-  using IteratorB =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB,
-        ThreadMapB,
-        StrideSupport::kUnity,
-        AccessTypeB
-      >
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  // Define the epilogue
-  using Epilogue = typename detail::DefaultConvEpilogue<
-    ArchTag,
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    EpilogueOutputOp
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDgrad
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dDgrad specialization for optimized IteratorAlgorithm Dgrad Unity Strided
-// and multistage pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultConv2dDgrad <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized,
-  StrideSupport::kUnity,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA,
-      ThreadMapA,
-      StrideSupport::kUnity,
-      AccessTypeA
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB,
-      ThreadMapB,
-      StrideSupport::kUnity,
-      AccessTypeB
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  static cutlass::arch::CacheOperation::Kind const CacheOpB =
-      ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
-          ? cutlass::arch::CacheOperation::Global
-          : cutlass::arch::CacheOperation::Always;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    CacheOpB,
-    MmaPolicy,
-    Stages 
-  >;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDgrad
-  >;
-};
-
-/// Defines a kernel for Conv2dDgrad specialization for Optimized IteratorAlgorithm Dgrad Strided and
-// multistage pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultConv2dDgrad <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized,
-  StrideSupport::kStrided,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA,
-      ThreadMapA,
-      StrideSupport::kStrided,
-      AccessTypeA
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB,
-      ThreadMapB,
-      StrideSupport::kStrided,
-      AccessTypeB
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  static cutlass::arch::CacheOperation::Kind const CacheOpB =
-      ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
-          ? cutlass::arch::CacheOperation::Global
-          : cutlass::arch::CacheOperation::Always;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    CacheOpB,
-    MmaPolicy,
-    Stages 
-  >;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOpStridedDgrad<
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDgrad
-  >;
-};
-
-/// Defines a kernel for Conv2dDgrad specialization for Optimized IteratorAlgorithm Dgrad Strided
-// and 2 stage pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultConv2dDgrad <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized,
-  StrideSupport::kStrided,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
-  using IteratorA =
-    cutlass::conv::threadblock::TileIteratorStridedDgrad<
-      cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA,
-        ThreadMapA,
-        StrideSupport::kStrided,
-        AccessTypeA
-      >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
-  using IteratorB =
-    cutlass::conv::threadblock::TileIteratorStridedDgrad<
-      cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB,
-        ThreadMapB,
-        StrideSupport::kStrided,
-        AccessTypeB
-      >
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  // Define the epilogue
-  using Epilogue = typename detail::DefaultConvEpilogueStridedDgrad<
-    ArchTag,
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    EpilogueOutputOp
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDgrad
-  >;
-};
-
-/// Defines a kernel for Conv2dDgrad specialization for Optimized IteratorAlgorithm Dgrad Unity
-// 2 stage pipeline
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultConv2dDgrad <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized,
-  StrideSupport::kUnity,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
-  using IteratorA =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA,
-        ThreadMapA,
-        StrideSupport::kUnity,
-        AccessTypeA
-      >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
-  using IteratorB =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB,
-        ThreadMapB,
-        StrideSupport::kUnity,
-        AccessTypeB
-      >
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  // Define the epilogue
-  using Epilogue = typename detail::DefaultConvEpilogue<
-    ArchTag,
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    EpilogueOutputOp
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDgrad
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//                            OpClassSimt convolutions 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Defines a kernel for Conv2dDgrad specialization for Analytic IteratorAlgorithm, 
-/// multi-stage pipeline, and FFMA-based mainloop for SM80
-
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultConv2dDgrad <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic,
-  conv::StrideSupport::kUnity,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA,
-      ThreadMapA,
-      conv::StrideSupport::kUnity
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB,
-      ThreadMapB,
-      conv::StrideSupport::kUnity
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    arch::CacheOperation::Always,
-    MmaPolicy,
-    Stages 
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDgrad
-  >;
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultConv2dDgrad <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic,
-  conv::StrideSupport::kStrided,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA,
-      ThreadMapA,
-      conv::StrideSupport::kStrided
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB,
-      ThreadMapB,
-      conv::StrideSupport::kStrided
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    arch::CacheOperation::Always,
-    MmaPolicy,
-    Stages 
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimtStridedDgrad<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDgrad
-  >;
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dDgrad specialization for Optimized IteratorAlgorithm, 
-/// multi-stage pipeline, and FFMA-based mainloop for SM80
-
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultConv2dDgrad <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized,
-  StrideSupport::kUnity,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA,
-      ThreadMapA,
-      StrideSupport::kUnity
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB,
-      ThreadMapB,
-      StrideSupport::kUnity
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    arch::CacheOperation::Always,
-    MmaPolicy,
-    Stages 
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDgrad
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultConv2dDgrad <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized,
-  conv::StrideSupport::kStrided,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA,
-      ThreadMapA,
-      conv::StrideSupport::kStrided
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB,
-      ThreadMapB,
-      conv::StrideSupport::kStrided
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    arch::CacheOperation::Always,
-    MmaPolicy,
-    Stages 
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimtStridedDgrad<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDgrad
-  >;
-
-};
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dDgrad specialization for Analytic IteratorAlgorithm, 
-/// 2 stage pipeline, and FFMA-based mainloop for SM50
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultConv2dDgrad <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic,
-  conv::StrideSupport::kUnity,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA,
-        ThreadMapA,
-        conv::StrideSupport::kUnity
-      >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB,
-        ThreadMapB,
-        conv::StrideSupport::kUnity
-      >
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDgrad
-  >;
-
-};
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultConv2dDgrad <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic,
-  conv::StrideSupport::kStrided,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::TileIteratorStridedDgrad<
-      cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA,
-        ThreadMapA,
-        conv::StrideSupport::kStrided
-      >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::TileIteratorStridedDgrad<
-      cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB,
-        ThreadMapB,
-        conv::StrideSupport::kStrided
-      >
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimtStridedDgrad<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDgrad
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dDgrad specialization for Optimized IteratorAlgorithm, 
-/// 2 stage pipeline, and FFMA-based mainloop for SM50
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultConv2dDgrad <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized,
-  StrideSupport::kUnity,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA,
-        ThreadMapA,
-        StrideSupport::kUnity
-      >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB,
-        ThreadMapB,
-        StrideSupport::kUnity
-      >
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDgrad
-  >;
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultConv2dDgrad <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized,
-  conv::StrideSupport::kStrided,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::TileIteratorStridedDgrad<
-      cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA,
-        ThreadMapA,
-        conv::StrideSupport::kStrided
-      >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::TileIteratorStridedDgrad<
-      cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB,
-        ThreadMapB,
-        conv::StrideSupport::kStrided
-      >
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimtStridedDgrad<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDgrad
-  >;
-
-};
-
-} // namespace kernel
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop.h
deleted file mode 100755
index 9fbd97e58..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop.h
+++ /dev/null
@@ -1,2007 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-    Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped 
-      matrix multiply-add with the appropriate threadblock-scoped epilogue.  
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/conv/kernel/default_conv2d.h"
-
-#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h"
-#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h"
-#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h"
-#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h"
-
-#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h"
-#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h"
-#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_fixed_channels.h"
-#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_few_channels.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Defines a kernel for Conv2dFprop
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename OperatorClass,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,
-  conv::StrideSupport StrideSupport = StrideSupport::kUnity,
-  /// Access granularity of A matrix in units of elements
-  int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value,
-  /// Access granularity of B matrix in units of elements
-  int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value
-> struct DefaultConv2dFprop;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//                         OpClassTensorOp convolutions 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and multistage 
-/// pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::StrideSupport StrideSupport, 
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultConv2dFprop <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic,
-  StrideSupport,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA, LayoutA,
-      ThreadMapA,
-      AccessTypeA
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB, LayoutB,
-      ThreadMapB,
-      AccessTypeB
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  static cutlass::arch::CacheOperation::Kind const CacheOpB =
-      ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
-          ? cutlass::arch::CacheOperation::Global
-          : cutlass::arch::CacheOperation::Always;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    CacheOpB,
-    MmaPolicy,
-    Stages 
-  >;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and multistage
-/// pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::StrideSupport StrideSupport,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultConv2dFprop <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kFixedChannels,
-  StrideSupport,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorFixedChannels<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA, LayoutA,
-      ThreadMapA,
-      AccessTypeA
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorFixedChannels<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB, LayoutB,
-      ThreadMapB,
-      AccessTypeB
-    >;
-
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  static cutlass::arch::CacheOperation::Kind const CacheOpB =
-      ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
-          ? cutlass::arch::CacheOperation::Global
-          : cutlass::arch::CacheOperation::Always;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    CacheOpB,
-    MmaPolicy,
-    Stages
-  >;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and two stage
-/// pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag,
-  conv::StrideSupport StrideSupport,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultConv2dFprop <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kFixedChannels,
-  StrideSupport,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
-  using IteratorA =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorFixedChannels<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA, LayoutA,
-        ThreadMapA,
-        AccessTypeA
-      >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
-  using IteratorB =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorFixedChannels<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB, LayoutB,
-        ThreadMapB,
-        AccessTypeB
-      >
-    >;
-
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and multistage
-/// pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::StrideSupport StrideSupport,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultConv2dFprop <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kFewChannels,
-  StrideSupport,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorFewChannels<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA, LayoutA,
-      ThreadMapA,
-      AccessTypeA
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorFewChannels<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB, LayoutB,
-      ThreadMapB,
-      AccessTypeB
-    >;
-
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  static cutlass::arch::CacheOperation::Kind const CacheOpB =
-      ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
-          ? cutlass::arch::CacheOperation::Global
-          : cutlass::arch::CacheOperation::Always;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    CacheOpB,
-    MmaPolicy,
-    Stages
-  >;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop
-  >;
-};
-
-/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and multistage
-/// pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag,
-  conv::StrideSupport StrideSupport,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultConv2dFprop <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kFewChannels,
-  StrideSupport,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
-  using IteratorA =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorFewChannels<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA, LayoutA,
-        ThreadMapA,
-        AccessTypeA
-      >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
-  using IteratorB =
-
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorFewChannels<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB, LayoutB,
-        ThreadMapB,
-        AccessTypeB
-      >
-    >;
-
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  static cutlass::arch::CacheOperation::Kind const CacheOpB =
-      ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
-          ? cutlass::arch::CacheOperation::Global
-          : cutlass::arch::CacheOperation::Always;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and multistage 
-/// pipeline with interleaved layout.
-template <
-  typename ElementA,
-  typename ElementB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::StrideSupport StrideSupport,
-  int AlignmentA,
-  int AlignmentB,
-  int InterleavedK
->
-struct DefaultConv2dFprop <
-  ElementA,
-  layout::TensorNCxHWx<InterleavedK>,
-  ElementB,
-  layout::TensorCxRSKx<InterleavedK>,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic,
-  StrideSupport,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
-      ElementB, layout::RowMajorInterleaved<InterleavedK>, 
-      ElementAccumulator, LayoutC, arch::OpClassTensorOp,
-      Stages, MathOperatorTag, true>;
-
-  // Define iterators over tiles from the A operand
-  // Note GEMM shared memory threadmap is used here because conv global memory
-  // layout needs to be mapped to fprop which is similar to the crosswise
-  // layout which is used by the interleaved GEMM shared memory threadmap.
-  // The Interleaved GEMM global memory layout is similar to the congruous
-  // layout.
-  using ThreadMapA = typename MmaCore::SmemThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA, layout::TensorNCxHWx<InterleavedK>,
-      ThreadMapA
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  // Note GEMM shared memory threadmap is used here because conv global memory
-  // layout needs to be mapped to fprop which is similar to the crosswise
-  // layout which is used by the interleaved GEMM shared memory threadmap.
-  // The Interleaved GEMM global memory layout is similar to the congruous
-  // layout.
-  using ThreadMapB = typename MmaCore::SmemThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB, layout::TensorCxRSKx<InterleavedK>,
-      ThreadMapB
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    arch::CacheOperation::Global,
-    MmaPolicy,
-    Stages 
-  >;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue<
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount,
-    InterleavedK
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm
-/// and 2 stage pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag,
-  conv::StrideSupport StrideSupport,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultConv2dFprop <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic,
-  StrideSupport,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
-  using IteratorA =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA, LayoutA,
-        ThreadMapA,
-        AccessTypeA
-      >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
-  using IteratorB =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB, LayoutB,
-        ThreadMapB,
-        AccessTypeB
-      >
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  // Define the epilogue
-  using Epilogue = typename detail::DefaultConvEpilogue<
-    ArchTag,
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    EpilogueOutputOp
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and 2 stage 
-/// pipeline with interleaved layout.
-template <
-  typename ElementA,
-  typename ElementB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag,
-  conv::StrideSupport StrideSupport,
-  int AlignmentA,
-  int AlignmentB,
-  int InterleavedK
->
-struct DefaultConv2dFprop <
-  ElementA,
-  layout::TensorNCxHWx<InterleavedK>,
-  ElementB,
-  layout::TensorCxRSKx<InterleavedK>,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic,
-  StrideSupport,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
-      ElementB, layout::RowMajorInterleaved<InterleavedK>, 
-      ElementAccumulator, LayoutC, arch::OpClassTensorOp,
-      2, MathOperatorTag, true>;
-
-  // Define iterators over tiles from the A operand
-  // Note GEMM shared memory threadmap is used here because conv global memory
-  // layout needs to be mapped to fprop which is similar to the crosswise
-  // layout which is used by the interleaved GEMM shared memory threadmap.
-  // The Interleaved GEMM global memory layout is similar to the congruous
-  // layout.
-  using ThreadMapA = typename MmaCore::SmemThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA, layout::TensorNCxHWx<InterleavedK>,
-        ThreadMapA
-      >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  // Note GEMM shared memory threadmap is used here because conv global memory
-  // layout needs to be mapped to fprop which is similar to the crosswise
-  // layout which is used by the interleaved GEMM shared memory threadmap.
-  // The Interleaved GEMM global memory layout is similar to the congruous
-  // layout.
-  using ThreadMapB = typename MmaCore::SmemThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB, layout::TensorCxRSKx<InterleavedK>,
-        ThreadMapB
-      >
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue<
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount,
-    InterleavedK
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dFprop specialization for Optimzed IteratorAlgorithm and 
-/// multistage pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::StrideSupport StrideSupport,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultConv2dFprop <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized,
-  StrideSupport,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-    ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-    ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-    Stages, MathOperatorTag
-  >;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA,
-      LayoutA,
-      ThreadMapA,
-      AccessTypeA
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand 
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB,
-      LayoutB,
-      ThreadMapB,
-      AccessTypeB
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  static cutlass::arch::CacheOperation::Kind const CacheOpB =
-      ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
-          ? cutlass::arch::CacheOperation::Global
-          : cutlass::arch::CacheOperation::Always;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    CacheOpB,
-    MmaPolicy,
-    Stages 
-  >;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount,
-    false,
-    layout::NoPermute,
-    StrideSupport,
-    4
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dFprop specialization for Optimzed IteratorAlgorithm and 
-// multistage pipeline with interleaved layout.
-template <
-  typename ElementA,
-  typename ElementB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::StrideSupport StrideSupport,
-  int AlignmentA,
-  int AlignmentB,
-  int InterleavedK
->
-struct DefaultConv2dFprop <
-  ElementA,
-  layout::TensorNCxHWx<InterleavedK>,
-  ElementB,
-  layout::TensorCxRSKx<InterleavedK>,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized,
-  StrideSupport,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-    ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
-    ElementB, layout::RowMajorInterleaved<InterleavedK>, ElementAccumulator, LayoutC, arch::OpClassTensorOp,
-    Stages, MathOperatorTag, true
-  >;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::SmemThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA,
-      layout::TensorNCxHWx<InterleavedK>,
-      ThreadMapA
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand 
-  using ThreadMapB = typename MmaCore::SmemThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB,
-      layout::TensorCxRSKx<InterleavedK>,
-      ThreadMapB
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    arch::CacheOperation::Global,
-    MmaPolicy,
-    Stages 
-  >;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue<
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount,
-    InterleavedK
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm
-/// and 2 stage pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag,
-  conv::StrideSupport StrideSupport,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultConv2dFprop <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized,
-  StrideSupport,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
-  using IteratorA =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA,
-        LayoutA,
-        ThreadMapA,
-        AccessTypeA 
-      >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
-  using IteratorB =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB,
-        LayoutB,
-        ThreadMapB,
-        AccessTypeB
-      >
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  // Define the epilogue
-  using Epilogue = typename detail::DefaultConvEpilogue<
-    ArchTag,
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    EpilogueOutputOp
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm and 2 stage 
-/// pipeline with interleaved layout.
-template <
-  typename ElementA,
-  typename ElementB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag,
-  conv::StrideSupport StrideSupport,
-  int AlignmentA,
-  int AlignmentB,
-  int InterleavedK
->
-struct DefaultConv2dFprop <
-  ElementA,
-  layout::TensorNCxHWx<InterleavedK>,
-  ElementB,
-  layout::TensorCxRSKx<InterleavedK>,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized,
-  StrideSupport,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
-      ElementB, layout::RowMajorInterleaved<InterleavedK>, 
-      ElementAccumulator, LayoutC, arch::OpClassTensorOp,
-      2, MathOperatorTag, true>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::SmemThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA, layout::TensorNCxHWx<InterleavedK>,
-        ThreadMapA
-      >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::SmemThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB, layout::TensorCxRSKx<InterleavedK>,
-        ThreadMapB
-      >
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue<
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount,
-    InterleavedK
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//                            OpClassSimt convolutions
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm, 
-/// multi-stage pipeline, and FFMA-based mainloop for SM80
-
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::StrideSupport StrideSupport,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultConv2dFprop <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic,
-  StrideSupport,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA, LayoutA,
-      ThreadMapA
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB, LayoutB,
-      ThreadMapB
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    arch::CacheOperation::Always,
-    MmaPolicy,
-    Stages 
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount,
-    false,
-    layout::NoPermute,
-    StrideSupport,
-    4
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop
-  >;
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm, 
-/// multi-stage pipeline, and FFMA-based mainloop for SM80
-
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::StrideSupport StrideSupport,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultConv2dFprop <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized,
-  StrideSupport,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA,
-      LayoutA,
-      ThreadMapA
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB,
-      LayoutB,
-      ThreadMapB
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    arch::CacheOperation::Always,
-    MmaPolicy,
-    Stages 
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount,
-    false,
-    layout::NoPermute,
-    StrideSupport,
-    4
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm, 
-/// 2 stage pipeline, and FFMA-based mainloop for SM50
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag,
-  conv::StrideSupport StrideSupport,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultConv2dFprop <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic,
-  StrideSupport,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA, LayoutA,
-        ThreadMapA
-      >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB, LayoutB,
-        ThreadMapB
-      >
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount,
-    false,
-    layout::NoPermute,
-    StrideSupport,
-    4
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop
-  >;
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm, 
-/// 2 stage pipeline, and FFMA-based mainloop for SM50
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag,
-  conv::StrideSupport StrideSupport,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultConv2dFprop <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized,
-  StrideSupport,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA,
-        LayoutA,
-        ThreadMapA
-      >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB,
-        LayoutB,
-        ThreadMapB
-      >
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount,
-    false,
-    layout::NoPermute,
-    StrideSupport,
-    4
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop
-  >;
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_fusion.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_fusion.h
deleted file mode 100755
index 8589ace02..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_fusion.h
+++ /dev/null
@@ -1,357 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief
-   Default kernel-level fused activation's scale+bias+relu and implicit GEMM convolution
-   definitions that combine threadblock-scoped matrix multiply-add with the
-   appropriate threadblock-scoped epilogue.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/conv/kernel/default_conv2d.h"
-
-#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h"
-#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h"
-#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h"
-#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h"
-#include "cutlass/conv/threadblock/predicated_scale_bias_vector_access_iterator.h"
-#include "cutlass/transform/threadblock/regular_scale_bias_vector_access_iterator.h"
-#include "cutlass/gemm/warp/scale_bias_tile_iterator.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Defines a kernel for fused batch norm and Conv2dFprop
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementScaleBias,
-  typename LayoutScaleBias,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename OperatorClass,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,
-  conv::StrideSupport StrideSupport = StrideSupport::kUnity
-> struct DefaultConv2dFpropFusion;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//                         OpClassTensorOp convolutions 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and multistage 
-/// pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementScaleBias,
-  typename LayoutScaleBias,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag
->
-struct DefaultConv2dFpropFusion <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementScaleBias,
-  LayoutScaleBias,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA, LayoutA,
-      ThreadMapA
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB, LayoutB,
-      ThreadMapB
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  /// Define iterators over tiles from scale/bias vectors
-  using IteratorScaleBias =
-      cutlass::conv::threadblock::PredicatedScaleBiasVectorAccessIterator<
-          cutlass::MatrixShape<1, ThreadblockShape::kK>, ElementScaleBias,
-          LayoutScaleBias>;
-
-  using SmemIteratorScaleBias =
-      cutlass::transform::threadblock::RegularScaleBiasVectorAccessIterator<
-          cutlass::MatrixShape<1, ThreadblockShape::kK>, ElementScaleBias,
-          LayoutScaleBias>;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  static int const kThreadCount = 32;
-
-  // Warp-level iterators to load scale and bias vectors
-  using WarpIteratorScaleBias = cutlass::gemm::warp::ScaleBiasTileIterator<
-      MatrixShape<WarpShape::kM, WarpShape::kK>, ElementScaleBias,
-      LayoutScaleBias, MatrixShape<InstructionShape::kM, InstructionShape::kK>,
-      typename WarpMmaTensorOp::IteratorA::Base::Policy, kThreadCount,
-      MmaCore::WarpCount::kK>;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmFpropFusionMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    arch::CacheOperation::Global,
-    IteratorScaleBias,
-    SmemIteratorScaleBias,
-    arch::CacheOperation::Always,
-    MmaPolicy,
-    WarpIteratorScaleBias,
-    Stages 
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    1,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionFusion<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dFprop specialization for Optimzed IteratorAlgorithm and 
-/// multistage pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementScaleBias,
-  typename LayoutScaleBias,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag
->
-struct DefaultConv2dFpropFusion <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementScaleBias,
-  LayoutScaleBias,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-    ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-    ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-    Stages, MathOperatorTag
-  >;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA,
-      LayoutA,
-      ThreadMapA
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand 
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB,
-      LayoutB,
-      ThreadMapB
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  /// Define iterators over tiles from scale/bias vectors
-  using IteratorScaleBias =
-      cutlass::conv::threadblock::PredicatedScaleBiasVectorAccessIterator<
-          cutlass::MatrixShape<1, ThreadblockShape::kK>, ElementScaleBias,
-          LayoutScaleBias>;
-
-  using SmemIteratorScaleBias =
-      cutlass::transform::threadblock::RegularScaleBiasVectorAccessIterator<
-          cutlass::MatrixShape<1, ThreadblockShape::kK>, ElementScaleBias,
-          LayoutScaleBias>;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  static int const kThreadCount = 32;
-
-  // Warp-level iterators to load scale and bias vectors
-  using WarpIteratorScaleBias = cutlass::gemm::warp::ScaleBiasTileIterator<
-      MatrixShape<WarpShape::kM, WarpShape::kK>, ElementScaleBias,
-      LayoutScaleBias, MatrixShape<InstructionShape::kM, InstructionShape::kK>,
-      typename WarpMmaTensorOp::IteratorA::Base::Policy, kThreadCount,
-      MmaCore::WarpCount::kK>;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmFpropFusionMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    arch::CacheOperation::Global,
-    IteratorScaleBias,
-    SmemIteratorScaleBias,
-    arch::CacheOperation::Always,
-    MmaPolicy,
-    WarpIteratorScaleBias,
-    Stages 
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    1,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionFusion<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_with_absmax.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_with_absmax.h
deleted file mode 100755
index 76bc12886..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_with_absmax.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-  \brief Defines a default configuration for convolution with absolute maximum calculation.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/conv/kernel/default_conv2d_fprop.h"
-#include "cutlass/conv/kernel/implicit_gemm_convolution_with_absmax.h"
-
-#include "cutlass/epilogue/threadblock/default_epilogue_with_absmax.h"
-#include "cutlass/epilogue/threadblock/epilogue_with_absmax.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename OperatorClass,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,
-  conv::StrideSupport StrideSupport = StrideSupport::kUnity,
-  /// Access granularity of A matrix in units of elements
-  int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value,
-  /// Access granularity of B matrix in units of elements
-  int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value
->
-struct DefaultConv2dFpropWithAbsMax {
-
-  using ImplicitGemmBase = typename DefaultConv2dFprop<
-    ElementA, LayoutA,
-    ElementB, LayoutB,
-    ElementC, LayoutC,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    MathOperatorTag,
-    IteratorAlgorithm,
-    StrideSupport,
-    AlignmentA,
-    AlignmentB
-  >::Kernel;
-
-  // Define epilogue
-  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWithAbsMax<
-    typename ImplicitGemmBase::Epilogue::Shape,
-    typename ImplicitGemmBase::Epilogue::WarpMmaOperator,
-    ImplicitGemmBase::Epilogue::kPartitionsK,
-    ElementC,
-    typename EpilogueOutputOp::ElementAuxOutput,
-    ElementC,
-    EpilogueOutputOp,
-    ImplicitGemmBase::Epilogue::kElementsPerAccess
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithAbsMax<
-    typename ImplicitGemmBase::Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace kernel
-}  // namespace conv
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_with_broadcast.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_with_broadcast.h
deleted file mode 100755
index 0825789ce..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_with_broadcast.h
+++ /dev/null
@@ -1,221 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-  \brief 
-    Defines a GEMM with Broadcast based on an existing UniversalGemm kernel.
-
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/conv/kernel/default_conv2d_fprop.h"
-#include "cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h"
-
-#include "cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h"
-#include "cutlass/epilogue/threadblock/epilogue_with_broadcast.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename OperatorClass,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,
-  conv::StrideSupport StrideSupport = StrideSupport::kUnity,
-  /// Access granularity of A matrix in units of elements
-  int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value,
-  /// Access granularity of B matrix in units of elements
-  int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value
->
-struct DefaultConv2dFpropWithBroadcast {
-
-  using ImplicitGemmBase = typename DefaultConv2dFprop<
-    ElementA, LayoutA,
-    ElementB, LayoutB,
-    ElementC, LayoutC,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    MathOperatorTag,
-    IteratorAlgorithm,
-    StrideSupport,
-    AlignmentA,
-    AlignmentB
-  >::Kernel;
-
-  // Define epilogue
-  using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastTensorOp<
-    ArchTag,
-    typename ImplicitGemmBase::Epilogue::Shape,
-    typename ImplicitGemmBase::Epilogue::WarpMmaOperator,
-    ImplicitGemmBase::Epilogue::kPartitionsK,
-    ElementC,
-    typename EpilogueOutputOp::ElementT,
-    typename EpilogueOutputOp::ElementVector,
-    EpilogueOutputOp,
-    ImplicitGemmBase::Epilogue::kElementsPerAccess
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue<
-    typename ImplicitGemmBase::Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//                            OpClassSimt convolutions
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm,
-/// multi-stage pipeline, and FFMA-based mainloop for SM80
-
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::IteratorAlgorithm IteratorAlgorithm,
-  conv::StrideSupport StrideSupport,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultConv2dFpropWithBroadcast <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm,
-  StrideSupport,
-  AlignmentA,
-  AlignmentB
-> {
-
-  using ImplicitGemmBase = typename DefaultConv2dFprop<
-    ElementA, LayoutA,
-    ElementB, LayoutB,
-    ElementC, LayoutC,
-    ElementAccumulator,
-    arch::OpClassSimt,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    MathOperatorTag,
-    IteratorAlgorithm,
-    StrideSupport,
-    AlignmentA,
-    AlignmentB
-  >::Kernel;
-
-  // Define epilogue
-  using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastSimt<
-    ArchTag,
-    typename ImplicitGemmBase::Epilogue::Shape,
-    typename ImplicitGemmBase::Epilogue::WarpMmaOperator,
-    ElementC,
-    typename EpilogueOutputOp::ElementT,
-    typename EpilogueOutputOp::ElementVector,
-    EpilogueOutputOp,
-    ImplicitGemmBase::Epilogue::kElementsPerAccess
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue<
-    typename ImplicitGemmBase::Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace kernel
-}  // namespace conv
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_with_reduction.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_with_reduction.h
deleted file mode 100755
index e6e8a8220..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_with_reduction.h
+++ /dev/null
@@ -1,130 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-  \brief 
-    Defines a GEMM with Reduction based on an existing UniversalGemm kernel.
-
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/conv/kernel/default_conv2d_fprop.h"
-#include "cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h"
-
-#include "cutlass/epilogue/threadblock/default_epilogue_with_reduction.h"
-#include "cutlass/epilogue/threadblock/epilogue_with_reduction.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename OperatorClass,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename EpilogueReductionOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,
-  conv::StrideSupport StrideSupport = StrideSupport::kUnity,
-  /// Access granularity of A matrix in units of elements
-  int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value,
-  /// Access granularity of B matrix in units of elements
-  int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value
->
-struct DefaultConv2dFpropWithReduction {
-
-  using ImplicitGemmBase = typename DefaultConv2dFprop<
-    ElementA, LayoutA,
-    ElementB, LayoutB,
-    ElementC, LayoutC,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    MathOperatorTag,
-    IteratorAlgorithm,
-    StrideSupport,
-    AlignmentA,
-    AlignmentB
-  >::Kernel;
-
-  // Define epilogue
-  using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithReductionTensorOp<
-    ArchTag,
-    typename ImplicitGemmBase::Epilogue::Shape,
-    typename ImplicitGemmBase::Epilogue::WarpMmaOperator,
-    ImplicitGemmBase::Epilogue::kPartitionsK,
-    ElementC,
-    EpilogueOutputOp,
-    EpilogueReductionOp,
-    ImplicitGemmBase::Epilogue::kElementsPerAccess
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue<
-    typename ImplicitGemmBase::Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace kernel
-}  // namespace conv
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_group_fprop.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_group_fprop.h
deleted file mode 100755
index e2deaf6fe..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_group_fprop.h
+++ /dev/null
@@ -1,622 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-    Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped 
-      matrix multiply-add with the appropriate threadblock-scoped epilogue.  
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/conv/kernel/default_conv2d.h"
-
-#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h"
-#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h"
-#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h"
-#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h"
-
-#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h"
-#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h"
-#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_fixed_channels.h"
-#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_few_channels.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Defines a kernel for Conv2dGroupFprop
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename OperatorClass,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::GroupMode GroupMode,
-  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,
-  conv::StrideSupport StrideSupport = StrideSupport::kUnity,
-  /// Access granularity of A matrix in units of elements
-  int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value,
-  /// Access granularity of B matrix in units of elements
-  int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value
-> struct DefaultConv2dGroupFprop;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//                         OpClassTensorOp convolutions 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dGroupFprop specialization for Analytic IteratorAlgorithm and multistage 
-/// pipeline that supports all GroupMode.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::GroupMode GroupMode,
-  conv::StrideSupport StrideSupport, 
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultConv2dGroupFprop <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  GroupMode,
-  IteratorAlgorithm::kAnalytic,
-  StrideSupport,
-  AlignmentA,
-  AlignmentB
-> {
-
-  static_assert(platform::is_same<LayoutA, cutlass::layout::TensorNHWC>::value,
-    "Current group conv only support NHWC layout");
-  static_assert(platform::is_same<LayoutB, cutlass::layout::TensorNHWC>::value,
-    "Current group conv only support NHWC layout");
-  static_assert(platform::is_same<LayoutC, cutlass::layout::TensorNHWC>::value,
-    "Current group conv only support NHWC layout");
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA, LayoutA,
-      ThreadMapA,
-      AccessTypeA,
-      GroupMode
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB, LayoutB,
-      ThreadMapB,
-      AccessTypeB,
-      GroupMode
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  static cutlass::arch::CacheOperation::Kind const CacheOpB =
-      ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
-          ? cutlass::arch::CacheOperation::Global
-          : cutlass::arch::CacheOperation::Always;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    CacheOpB,
-    MmaPolicy,
-    Stages 
-  >;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop,
-    Conv2dProblemSize,
-    GroupMode
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dGroupFprop specialization for Analytic IteratorAlgorithm and
-/// 2 stage pipeline that supports all GroupMode.
-
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag,
-  conv::GroupMode GroupMode,
-  conv::StrideSupport StrideSupport,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultConv2dGroupFprop <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  GroupMode,
-  IteratorAlgorithm::kAnalytic,
-  StrideSupport,
-  AlignmentA,
-  AlignmentB
-> {
-
-  static_assert(platform::is_same<LayoutA, cutlass::layout::TensorNHWC>::value,
-    "Current group conv only support NHWC layout");
-  static_assert(platform::is_same<LayoutB, cutlass::layout::TensorNHWC>::value,
-    "Current group conv only support NHWC layout");
-  static_assert(platform::is_same<LayoutC, cutlass::layout::TensorNHWC>::value,
-    "Current group conv only support NHWC layout");
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
-  using IteratorA =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA, LayoutA,
-        ThreadMapA,
-        AccessTypeA,
-        GroupMode
-      >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
-  using IteratorB =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB, LayoutB,
-        ThreadMapB,
-        AccessTypeB,
-        GroupMode
-      >
-    >;
-
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  // Define the epilogue
-  using Epilogue = typename detail::DefaultConvEpilogue<
-    ArchTag,
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    EpilogueOutputOp
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop,
-    Conv2dProblemSize,
-    GroupMode
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dGroupFprop specialization for Optimized IteratorAlgorithm and multistage
-/// pipeline that supports GroupMode::kSingleGroup.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::StrideSupport StrideSupport,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultConv2dGroupFprop <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  GroupMode::kSingleGroup,
-  IteratorAlgorithm::kOptimized,
-  StrideSupport,
-  AlignmentA,
-  AlignmentB
-> {
-
-  static_assert(platform::is_same<LayoutA, cutlass::layout::TensorNHWC>::value,
-    "Current group conv only support NHWC layout");
-  static_assert(platform::is_same<LayoutB, cutlass::layout::TensorNHWC>::value,
-    "Current group conv only support NHWC layout");
-  static_assert(platform::is_same<LayoutC, cutlass::layout::TensorNHWC>::value,
-    "Current group conv only support NHWC layout");
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA, LayoutA,
-      ThreadMapA,
-      AccessTypeA
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB, LayoutB,
-      ThreadMapB,
-      AccessTypeB
-    >;
-
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  static cutlass::arch::CacheOperation::Kind const CacheOpB =
-      ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
-          ? cutlass::arch::CacheOperation::Global
-          : cutlass::arch::CacheOperation::Always;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    CacheOpB,
-    MmaPolicy,
-    Stages
-  >;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop,
-    Conv2dProblemSize,
-    GroupMode::kSingleGroup
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dGroupFprop specialization for Optimized IteratorAlgorithm and
-/// 2 stage pipeline that supports GroupMode::kSingleGroup.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag,
-  conv::StrideSupport StrideSupport,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultConv2dGroupFprop <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  GroupMode::kSingleGroup,
-  IteratorAlgorithm::kOptimized,
-  StrideSupport,
-  AlignmentA,
-  AlignmentB
-> {
-
-  static_assert(platform::is_same<LayoutA, cutlass::layout::TensorNHWC>::value,
-    "Current group conv only support NHWC layout");
-  static_assert(platform::is_same<LayoutB, cutlass::layout::TensorNHWC>::value,
-    "Current group conv only support NHWC layout");
-  static_assert(platform::is_same<LayoutC, cutlass::layout::TensorNHWC>::value,
-    "Current group conv only support NHWC layout");
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-    ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-    ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-    2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
-  using IteratorA =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA,
-        LayoutA,
-        ThreadMapA,
-        AccessTypeA
-      >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
-  using IteratorB =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB,
-        LayoutB,
-        ThreadMapB,
-        AccessTypeB
-      >
-    >;
-
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  // Define the epilogue
-  using Epilogue = typename detail::DefaultConvEpilogue<
-    ArchTag,
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    EpilogueOutputOp
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop,
-    Conv2dProblemSize,
-    GroupMode::kSingleGroup
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_wgrad.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_wgrad.h
deleted file mode 100755
index d0e52dfe3..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_wgrad.h
+++ /dev/null
@@ -1,1011 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-    Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped 
-      matrix multiply-add with the appropriate threadblock-scoped epilogue.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/conv/kernel/default_conv2d.h"
-
-#include "cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h"
-#include "cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h"
-#include "cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h"
-#include "cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h"
-#include "cutlass/conv/threadblock/conv2d_tile_iterator.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dWgrad
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename OperatorClass,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,
-  conv::StrideSupport StrideSupport = StrideSupport::kStrided,
-  /// Access granularity of A matrix in units of elements
-  int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value,
-  /// Access granularity of B matrix in units of elements
-  int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value
-> struct DefaultConv2dWgrad;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//                          OpClassTensorOp convolutions
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dWgrad specialization for Analytic IteratorAlgorithm and multistage 
-// pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename OperatorClass,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::StrideSupport StrideSupport,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultConv2dWgrad <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  OperatorClass,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic,
-  StrideSupport,
-  AlignmentA,
-  AlignmentB
->  {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA,
-      ThreadMapA,
-      AccessTypeA
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB,
-      ThreadMapB,
-      AccessTypeB
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    arch::CacheOperation::Always,
-    MmaPolicy,
-    Stages 
-  >;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kWgrad
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dWgrad specialization for Analytic IteratorAlgorithm and two 
-// pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename OperatorClass,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag,
-  conv::StrideSupport StrideSupport,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultConv2dWgrad <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  OperatorClass,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic,
-  StrideSupport,
-  AlignmentA,
-  AlignmentB
->  {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
-  using IteratorA =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA,
-        ThreadMapA,
-        AccessTypeA
-      >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
-  using IteratorB =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB,
-        ThreadMapB,
-        AccessTypeB
-      >
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  // Define the epilogue
-  using Epilogue = typename detail::DefaultConvEpilogue<
-    ArchTag,
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    EpilogueOutputOp
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kWgrad
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dWgrad specialization for Optimized IteratorAlgorithm and multistage 
-// pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename OperatorClass,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::StrideSupport StrideSupport,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultConv2dWgrad <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  OperatorClass,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized,
-  StrideSupport,
-  AlignmentA,
-  AlignmentB
->  {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA,
-      ThreadMapA,
-      AccessTypeA
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB,
-      ThreadMapB,
-      AccessTypeB
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    arch::CacheOperation::Always,
-    MmaPolicy,
-    Stages 
-  >;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kWgrad
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dWgrad specialization for Optimized IteratorAlgorithm and two 
-// pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename OperatorClass,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag,
-  conv::StrideSupport StrideSupport,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultConv2dWgrad <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  OperatorClass,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized,
-  StrideSupport,
-  AlignmentA,
-  AlignmentB
->  {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
-  using IteratorA =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA,
-        ThreadMapA,
-        AccessTypeA
-      >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
-  using IteratorB =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB,
-        ThreadMapB,
-        AccessTypeB
-      >
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  // Define the epilogue
-  using Epilogue = typename detail::DefaultConvEpilogue<
-    ArchTag,
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    EpilogueOutputOp
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kWgrad
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//                         OpClassSimt convolutions
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Defines a kernel for Conv2dWgrad specialization for Analytic IteratorAlgorithm, 
-/// multi-stage pipeline, and FFMA-based mainloop for SM80
-
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::StrideSupport StrideSupport,
-  int AccessTypeA,
-  int AccessTypeB
->
-struct DefaultConv2dWgrad <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic,
-  StrideSupport,
-  AccessTypeA,
-  AccessTypeB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA,
-      ThreadMapA
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB,
-      ThreadMapB
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    arch::CacheOperation::Always,
-    MmaPolicy,
-    Stages 
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kWgrad
-  >;
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dWgrad specialization for Optimized IteratorAlgorithm, 
-/// multi-stage pipeline, and FFMA-based mainloop for SM80
-
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::StrideSupport StrideSupport,
-  int AccessTypeA,
-  int AccessTypeB
->
-struct DefaultConv2dWgrad <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized,
-  StrideSupport,
-  AccessTypeA,
-  AccessTypeB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA,
-      ThreadMapA
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB,
-      ThreadMapB
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    arch::CacheOperation::Always,
-    MmaPolicy,
-    Stages 
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kWgrad
-  >;
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dWgrad specialization for Analytic IteratorAlgorithm, 
-/// 2 stage pipeline, and FFMA-based mainloop for SM50
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag,
-  conv::StrideSupport StrideSupport,
-  int AccessTypeA,
-  int AccessTypeB
->
-struct DefaultConv2dWgrad <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic,
-  StrideSupport,
-  AccessTypeA,
-  AccessTypeB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA,
-        ThreadMapA
-      >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB,
-        ThreadMapB
-      >
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kWgrad
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dWgrad specialization for Optimized IteratorAlgorithm, 
-/// 2 stage pipeline, and FFMA-based mainloop for SM50
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag,
-  conv::StrideSupport StrideSupport,
-  int AccessTypeA,
-  int AccessTypeB
->
-struct DefaultConv2dWgrad <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized,
-  StrideSupport,
-  AccessTypeA,
-  AccessTypeB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA,
-        ThreadMapA
-      >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB,
-        ThreadMapB
-      >
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kWgrad
-  >;
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_wgrad_fusion.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_wgrad_fusion.h
deleted file mode 100755
index 110e07db9..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_wgrad_fusion.h
+++ /dev/null
@@ -1,325 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief 
-    Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped 
-      matrix multiply-add with the appropriate threadblock-scoped epilogue.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/conv/kernel/default_conv2d.h"
-
-#include "cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h"
-#include "cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h"
-#include "cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h"
-#include "cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h"
-#include "cutlass/conv/threadblock/conv2d_tile_iterator.h"
-#include "cutlass/conv/threadblock/predicated_scale_bias_vector_iterator.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dWgrad
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementScaleBias,
-  typename LayoutScaleBias,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename OperatorClass,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,
-  conv::StrideSupport StrideSupport = StrideSupport::kStrided
-> struct DefaultConv2dWgradFusion;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//                          OpClassTensorOp convolutions
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dWgrad specialization for Analytic IteratorAlgorithm and multistage 
-// pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementScaleBias,
-  typename LayoutScaleBias,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename OperatorClass,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag
->
-struct DefaultConv2dWgradFusion <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementScaleBias,
-  LayoutScaleBias,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  OperatorClass,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic
->  {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA,
-      ThreadMapA
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB,
-      ThreadMapB
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  /// Define iterators over tiles from scale/bias vectors
-  using IteratorScaleBias =
-      cutlass::conv::threadblock::PredicatedScaleBiasVectorIterator<
-          cutlass::MatrixShape<1, WarpShape::kN>,
-          ElementScaleBias,
-          LayoutScaleBias>;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmWgradFusionMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    arch::CacheOperation::Always,
-    IteratorScaleBias,
-    MmaPolicy,
-    Stages 
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    1,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionFusion<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kWgrad
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dWgrad specialization for Optimized IteratorAlgorithm and multistage 
-// pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementScaleBias,
-  typename LayoutScaleBias,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename OperatorClass,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag
->
-struct DefaultConv2dWgradFusion <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementScaleBias,
-  LayoutScaleBias,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  OperatorClass,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized
->  {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA,
-      ThreadMapA
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB,
-      ThreadMapB
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  /// Define iterators over tiles from scale/bias vectors
-  using IteratorScaleBias =
-      cutlass::conv::threadblock::PredicatedScaleBiasVectorIterator<
-          cutlass::MatrixShape<1, WarpShape::kN>,
-          ElementScaleBias,
-          LayoutScaleBias>;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmWgradFusionMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    arch::CacheOperation::Always,
-    IteratorScaleBias,
-    MmaPolicy,
-    Stages 
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    1,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionFusion<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kWgrad
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_dgrad.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_dgrad.h
deleted file mode 100755
index cb50ba49b..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_dgrad.h
+++ /dev/null
@@ -1,736 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-    Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped 
-      matrix multiply-add with the appropriate threadblock-scoped epilogue.  
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/conv/kernel/default_conv2d.h"
-
-#include "cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h"
-#include "cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_optimized.h"
-
-#include "cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h"
-#include "cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h"
-#include "cutlass/conv/threadblock/conv2d_tile_iterator.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Defines a kernel for Conv3dDgrad
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename OperatorClass,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,
-  conv::StrideSupport StrideSupport = StrideSupport::kStrided
-> struct DefaultConv3dDgrad;
-
-/// Defines a kernel for Conv3dDgrad specialization for Analytic IteratorAlgorithm Dgrad Strided
-// and multistage pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename OperatorClass,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag
->
-struct DefaultConv3dDgrad <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  OperatorClass,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic,
-  StrideSupport::kStrided
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA,
-      ThreadMapA,
-      StrideSupport::kStrided
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv3dDgradFilterTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB,
-      ThreadMapB
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    arch::CacheOperation::Global,
-    MmaPolicy,
-    Stages 
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    1,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDgrad,
-    Conv3dProblemSize
-  >;
-};
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv3dDgrad specialization for Optimized IteratorAlgorithm Dgrad Strided
-// and multistage pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename OperatorClass,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag
->
-struct DefaultConv3dDgrad <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  OperatorClass,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized,
-  StrideSupport::kUnity
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA,
-      ThreadMapA,
-      StrideSupport::kUnity
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-
-  using IteratorB =
-    cutlass::conv::threadblock::Conv3dDgradFilterTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB,
-      ThreadMapB
-    >;
-
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    arch::CacheOperation::Global,
-    MmaPolicy,
-    Stages 
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    1,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDgrad,
-    Conv3dProblemSize
-  >;
-};
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//                            OpClassSimt convolutions 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag
->
-struct DefaultConv3dDgrad <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic,
-  conv::StrideSupport::kStrided
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA,
-      ThreadMapA,
-      conv::StrideSupport::kStrided
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv3dDgradFilterTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB,
-      ThreadMapB
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    arch::CacheOperation::Always,
-    MmaPolicy,
-    Stages 
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDgrad,
-    Conv3dProblemSize
-  >;
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv3dDgrad specialization for Optimized IteratorAlgorithm, 
-/// multi-stage pipeline, and FFMA-based mainloop for SM80
-
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag
->
-struct DefaultConv3dDgrad <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized,
-  StrideSupport::kUnity
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA,
-      ThreadMapA,
-      StrideSupport::kUnity
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv3dDgradFilterTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB,
-      ThreadMapB
-      // ThreadMapB,
-      // StrideSupport::kUnity
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    arch::CacheOperation::Always,
-    MmaPolicy,
-    Stages 
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDgrad,
-    Conv3dProblemSize
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag
->
-struct DefaultConv3dDgrad <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic,
-  conv::StrideSupport::kStrided
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    // cutlass::conv::threadblock::TileIteratorStridedDgrad<
-      cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA,
-        ThreadMapA,
-        conv::StrideSupport::kStrided
-      // >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    // cutlass::conv::threadblock::TileIteratorStridedDgrad<
-      cutlass::conv::threadblock::Conv3dDgradFilterTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB,
-        ThreadMapB
-      // >
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDgrad,
-    Conv3dProblemSize
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv3dDgrad specialization for Optimized IteratorAlgorithm, 
-/// 2 stage pipeline, and FFMA-based mainloop for SM50
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag
->
-struct DefaultConv3dDgrad <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized,
-  StrideSupport::kUnity
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    // cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA,
-        ThreadMapA,
-        StrideSupport::kUnity
-      // >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    // cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv3dDgradFilterTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB,
-        ThreadMapB
-        // ThreadMapB,
-        // StrideSupport::kUnity
-      // >
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDgrad,
-    Conv3dProblemSize
-  >;
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_fprop.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_fprop.h
deleted file mode 100755
index 41fdd64a5..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_fprop.h
+++ /dev/null
@@ -1,981 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-    Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped 
-      matrix multiply-add with the appropriate threadblock-scoped epilogue.    
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/conv/kernel/default_conv2d.h"
-
-#include "cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h"
-#include "cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h"
-
-
-#include "cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h"
-#include "cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Defines a kernel for Conv3dFprop
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename OperatorClass,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,
-  conv::StrideSupport StrideSupport = StrideSupport::kUnity
-> struct DefaultConv3dFprop;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv3dFprop specialization for Analytic Iterator Algorithm
-/// and 2 stage pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag,
-  conv::StrideSupport StrideSupport
->
-struct DefaultConv3dFprop <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic,
-  StrideSupport
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA,
-        ThreadMapA
-      >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB,
-        ThreadMapB
-      >
-    >;
-
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename detail::DefaultConvEpilogue<
-    ArchTag,
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    1,
-    EpilogueOutputOp
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop,
-    Conv3dProblemSize
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv3dFprop specialization for Analytic IteratorAlgorithm and multistage
-// pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::StrideSupport StrideSupport
->
-struct DefaultConv3dFprop <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic,
-  StrideSupport
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA,
-      ThreadMapA
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB,
-      ThreadMapB
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    arch::CacheOperation::Global,
-    MmaPolicy,
-    Stages 
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    1,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop,
-    Conv3dProblemSize
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv3dFprop specialization for Optimized Iterator Algorithm
-/// and 2 stage pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag,
-  conv::StrideSupport StrideSupport
->
-struct DefaultConv3dFprop <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized,
-  StrideSupport
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA,
-        LayoutA,
-        ThreadMapA
-      >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB,
-        LayoutB,
-        ThreadMapB
-      >
-    >;
-
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename detail::DefaultConvEpilogue<
-    ArchTag,
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    1,
-    EpilogueOutputOp
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop,
-    Conv3dProblemSize
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv3dFprop specialization for Optimized IteratorAlgorithm and multistage
-// pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::StrideSupport StrideSupport
->
-struct DefaultConv3dFprop <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized,
-  StrideSupport
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA,
-      LayoutA,
-      ThreadMapA
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB; 
-
-  using IteratorB =
-    cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB,
-      LayoutB,
-      ThreadMapB
-    >;
-
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    arch::CacheOperation::Global,
-    MmaPolicy,
-    Stages 
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    1,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount,
-    false,
-    layout::NoPermute,
-    StrideSupport,
-    5
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop,
-    Conv3dProblemSize
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//                            OpClassSimt convolutions
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Defines a kernel for Conv3dFprop specialization for Analytic IteratorAlgorithm, 
-/// multi-stage pipeline, and FFMA-based mainloop for SM80
-
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::StrideSupport StrideSupport
->
-struct DefaultConv3dFprop <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic,
-  StrideSupport
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA,
-      ThreadMapA
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB,
-      ThreadMapB
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    arch::CacheOperation::Always,
-    MmaPolicy,
-    Stages 
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount,
-    false,
-    layout::NoPermute,
-    StrideSupport,
-    5
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop,
-    Conv3dProblemSize
-  >;
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv3dFprop specialization for Optimized IteratorAlgorithm, 
-/// multi-stage pipeline, and FFMA-based mainloop for SM80
-
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::StrideSupport StrideSupport
->
-struct DefaultConv3dFprop <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized,
-  StrideSupport
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA,
-      LayoutA,
-      ThreadMapA
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB,
-      LayoutB,
-      ThreadMapB
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    arch::CacheOperation::Always,
-    MmaPolicy,
-    Stages 
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount,
-    false,
-    layout::NoPermute,
-    StrideSupport,
-    5
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop,
-    Conv3dProblemSize
-  >;
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv3dFprop specialization for Analytic IteratorAlgorithm, 
-/// 2 stage pipeline, and FFMA-based mainloop for SM50
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag,
-  conv::StrideSupport StrideSupport
->
-struct DefaultConv3dFprop <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic,
-  StrideSupport
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA,
-        ThreadMapA
-      >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB,
-        ThreadMapB
-      >
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount,
-    false,
-    layout::NoPermute,
-    StrideSupport,
-    5
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop,
-    Conv3dProblemSize
-  >;
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv3dFprop specialization for Optimized IteratorAlgorithm, 
-/// 2 stage pipeline, and FFMA-based mainloop for SM50
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag,
-  conv::StrideSupport StrideSupport
->
-struct DefaultConv3dFprop <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized,
-  StrideSupport
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA,
-        LayoutA,
-        ThreadMapA
-      >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB,
-        LayoutB,
-        ThreadMapB
-      >
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount,
-    false,
-    layout::NoPermute,
-    StrideSupport,
-    5
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop,
-    Conv3dProblemSize
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_fprop_fusion.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_fprop_fusion.h
deleted file mode 100755
index d0457d572..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_fprop_fusion.h
+++ /dev/null
@@ -1,360 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief
-   Default kernel-level fused activation's scale+bias+relu and implicit GEMM convolution
-   definitions that combine threadblock-scoped matrix multiply-add with the
-   appropriate threadblock-scoped epilogue.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/conv/kernel/default_conv2d.h"
-
-#include "cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h"
-#include "cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h"
-#include "cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h"
-#include "cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h"
-#include "cutlass/conv/threadblock/predicated_scale_bias_vector_access_iterator.h"
-#include "cutlass/transform/threadblock/regular_scale_bias_vector_access_iterator.h"
-#include "cutlass/gemm/warp/scale_bias_tile_iterator.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Defines a kernel for fused batch norm and Conv3dFprop
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementScaleBias,
-  typename LayoutScaleBias,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename OperatorClass,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,
-  conv::StrideSupport StrideSupport = StrideSupport::kUnity
-> struct DefaultConv3dFpropFusion;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//                         OpClassTensorOp convolutions 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv3dFprop specialzation for Analytic IteratorAlgorithm and multistage 
-/// pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementScaleBias,
-  typename LayoutScaleBias,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag
->
-struct DefaultConv3dFpropFusion <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementScaleBias,
-  LayoutScaleBias,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA,
-      ThreadMapA
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB,
-      ThreadMapB
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  /// Define iterators over tiles from scale/bias vectors
-  using IteratorScaleBias =
-      cutlass::conv::threadblock::PredicatedScaleBiasVectorAccessIterator<
-          cutlass::MatrixShape<1, ThreadblockShape::kK>, ElementScaleBias,
-          LayoutScaleBias>;
-
-  using SmemIteratorScaleBias =
-      cutlass::transform::threadblock::RegularScaleBiasVectorAccessIterator<
-          cutlass::MatrixShape<1, ThreadblockShape::kK>, ElementScaleBias,
-          LayoutScaleBias>;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  static int const kThreadCount = 32;
-
-  // Warp-level iterators to load scale and bias vectors
-  using WarpIteratorScaleBias = cutlass::gemm::warp::ScaleBiasTileIterator<
-      MatrixShape<WarpShape::kM, WarpShape::kK>, ElementScaleBias,
-      LayoutScaleBias, MatrixShape<InstructionShape::kM, InstructionShape::kK>,
-      typename WarpMmaTensorOp::IteratorA::Base::Policy, kThreadCount,
-      MmaCore::WarpCount::kK>;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmFpropFusionMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    arch::CacheOperation::Global,
-    IteratorScaleBias,
-    SmemIteratorScaleBias,
-    arch::CacheOperation::Always,
-    MmaPolicy,
-    WarpIteratorScaleBias,
-    Stages 
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    1,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionFusion<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop,
-    Conv3dProblemSize
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv3dFprop specialzation for Optimzed IteratorAlgorithm and 
-/// multistage pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementScaleBias,
-  typename LayoutScaleBias,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag
->
-struct DefaultConv3dFpropFusion <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementScaleBias,
-  LayoutScaleBias,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-    ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-    ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-    Stages, MathOperatorTag
-  >;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA,
-      LayoutA,
-      ThreadMapA
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand 
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB,
-      LayoutB,
-      ThreadMapB
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  /// Define iterators over tiles from scale/bias vectors
-  using IteratorScaleBias =
-      cutlass::conv::threadblock::PredicatedScaleBiasVectorAccessIterator<
-          cutlass::MatrixShape<1, ThreadblockShape::kK>, ElementScaleBias,
-          LayoutScaleBias>;
-
-  using SmemIteratorScaleBias =
-      cutlass::transform::threadblock::RegularScaleBiasVectorAccessIterator<
-          cutlass::MatrixShape<1, ThreadblockShape::kK>, ElementScaleBias,
-          LayoutScaleBias>;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  static int const kThreadCount = 32;
-
-  // Warp-level iterators to load scale and bias vectors
-  using WarpIteratorScaleBias = cutlass::gemm::warp::ScaleBiasTileIterator<
-      MatrixShape<WarpShape::kM, WarpShape::kK>, ElementScaleBias,
-      LayoutScaleBias, MatrixShape<InstructionShape::kM, InstructionShape::kK>,
-      typename WarpMmaTensorOp::IteratorA::Base::Policy, kThreadCount,
-      MmaCore::WarpCount::kK>;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmFpropFusionMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    arch::CacheOperation::Global,
-    IteratorScaleBias,
-    SmemIteratorScaleBias,
-    arch::CacheOperation::Always,
-    MmaPolicy,
-    WarpIteratorScaleBias,
-    Stages 
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    1,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionFusion<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop,
-    Conv3dProblemSize
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_fprop_with_broadcast.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_fprop_with_broadcast.h
deleted file mode 100755
index 0fc291e60..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_fprop_with_broadcast.h
+++ /dev/null
@@ -1,222 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-  \brief 
-    Defines a GEMM with Broadcast based on an existing UniversalGemm kernel.
-
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/conv/kernel/default_conv3d_fprop.h"
-#include "cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h"
-
-#include "cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h"
-#include "cutlass/epilogue/threadblock/epilogue_with_broadcast.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename OperatorClass,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,
-  conv::StrideSupport StrideSupport = StrideSupport::kUnity,
-  /// Access granularity of A matrix in units of elements
-  int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value,
-  /// Access granularity of B matrix in units of elements
-  int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value
->
-struct DefaultConv3dFpropWithBroadcast {
-
-  using ImplicitGemmBase = typename DefaultConv3dFprop<
-    ElementA, LayoutA,
-    ElementB, LayoutB,
-    ElementC, LayoutC,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    MathOperatorTag,
-    IteratorAlgorithm,
-    StrideSupport
-  >::Kernel;
-
-  // Define epilogue
-  using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastTensorOp<
-    ArchTag,
-    typename ImplicitGemmBase::Epilogue::Shape,
-    typename ImplicitGemmBase::Epilogue::WarpMmaOperator,
-    ImplicitGemmBase::Epilogue::kPartitionsK,
-    ElementC,
-    typename EpilogueOutputOp::ElementT,
-    typename EpilogueOutputOp::ElementVector,
-    EpilogueOutputOp,
-    ImplicitGemmBase::Epilogue::kElementsPerAccess
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue<
-    typename ImplicitGemmBase::Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop,
-    Conv3dProblemSize
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//                            OpClassSimt convolutions
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Defines a kernel for Conv3dFprop specialization for Analytic IteratorAlgorithm,
-/// multi-stage pipeline, and FFMA-based mainloop for SM80
-
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::IteratorAlgorithm IteratorAlgorithm,
-  conv::StrideSupport StrideSupport,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultConv3dFpropWithBroadcast <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm,
-  StrideSupport,
-  AlignmentA,
-  AlignmentB
-> {
-
-  using ImplicitGemmBase = typename DefaultConv3dFprop<
-    ElementA, LayoutA,
-    ElementB, LayoutB,
-    ElementC, LayoutC,
-    ElementAccumulator,
-    arch::OpClassSimt,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    MathOperatorTag,
-    IteratorAlgorithm,
-    StrideSupport
-  >::Kernel;
-
-  // Define epilogue
-  using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastSimt<
-    ArchTag,
-    typename ImplicitGemmBase::Epilogue::Shape,
-    typename ImplicitGemmBase::Epilogue::WarpMmaOperator,
-    ElementC,
-    typename EpilogueOutputOp::ElementT,
-    typename EpilogueOutputOp::ElementVector,
-    EpilogueOutputOp,
-    ImplicitGemmBase::Epilogue::kElementsPerAccess,
-    layout::NoPermute,
-    StrideSupport,
-    5
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue<
-    typename ImplicitGemmBase::Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop,
-    Conv3dProblemSize
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace kernel
-}  // namespace conv
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_wgrad.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_wgrad.h
deleted file mode 100755
index 4ed5e0c1b..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_wgrad.h
+++ /dev/null
@@ -1,936 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-    Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped 
-      matrix multiply-add with the appropriate threadblock-scoped epilogue.  
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/conv/kernel/default_conv2d.h"
-
-#include "cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h"
-#include "cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h"
-#include "cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h"
-#include "cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv3dWgrad
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename OperatorClass,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,
-  conv::StrideSupport StrideSupport = StrideSupport::kStrided
-> struct DefaultConv3dWgrad;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv3dWgrad specialization for Analytic IteratorAlgorithm and multistage 
-// pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename OperatorClass,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag
->
-struct DefaultConv3dWgrad <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  OperatorClass,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic
->  {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv3dWgradOutputGradientTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA,
-      ThreadMapA
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv3dWgradActivationTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB,
-      ThreadMapB
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    arch::CacheOperation::Always,
-    MmaPolicy,
-    Stages 
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    1,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kWgrad,
-    Conv3dProblemSize
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Defines a kernel for Conv3dWgrad specialization for Analytic IteratorAlgorithm and two 
-// pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename OperatorClass,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag
->
-struct DefaultConv3dWgrad <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  OperatorClass,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic
->  {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv3dWgradOutputGradientTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA,
-        ThreadMapA
-      >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv3dWgradActivationTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB,
-        ThreadMapB
-      >
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename detail::DefaultConvEpilogue<
-    ArchTag,
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    1,
-    EpilogueOutputOp
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kWgrad,
-    Conv3dProblemSize
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv3dWgrad specialization for Optimized IteratorAlgorithm and multistage 
-// pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename OperatorClass,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag
->
-struct DefaultConv3dWgrad <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  OperatorClass,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized
->  {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv3dWgradOutputGradientTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA,
-      ThreadMapA
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv3dWgradActivationTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB,
-      ThreadMapB
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    arch::CacheOperation::Always,
-    MmaPolicy,
-    Stages 
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    1,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kWgrad,
-    Conv3dProblemSize
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Defines a kernel for Conv3dWgrad specialization for Optimized IteratorAlgorithm and two 
-// pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename OperatorClass,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag
->
-struct DefaultConv3dWgrad <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  OperatorClass,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized
->  {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv3dWgradOutputGradientTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA,
-        ThreadMapA
-      >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv3dWgradActivationTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB,
-        ThreadMapB
-      >
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename detail::DefaultConvEpilogue<
-    ArchTag,
-    ThreadblockShape,
-    WarpMmaTensorOp,
-    1,
-    EpilogueOutputOp
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kWgrad,
-    Conv3dProblemSize
-  >;
-};
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//                         OpClassSimt convolutions
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Defines a kernel for Conv3dWgrad specialization for Analytic IteratorAlgorithm, 
-/// multi-stage pipeline, and FFMA-based mainloop for SM80
-
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag
->
-struct DefaultConv3dWgrad <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv3dWgradOutputGradientTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA,
-      ThreadMapA
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv3dWgradActivationTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB,
-      ThreadMapB
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    arch::CacheOperation::Always,
-    MmaPolicy,
-    Stages 
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kWgrad,
-    Conv3dProblemSize
-  >;
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv3dWgrad specialization for Optimized IteratorAlgorithm, 
-/// multi-stage pipeline, and FFMA-based mainloop for SM80
-
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag
->
-struct DefaultConv3dWgrad <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv3dWgradOutputGradientTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA,
-      ThreadMapA
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv3dWgradActivationTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB,
-      ThreadMapB
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    arch::CacheOperation::Always,
-    MmaPolicy,
-    Stages 
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kWgrad,
-    Conv3dProblemSize
-  >;
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv3dWgrad specialization for Analytic IteratorAlgorithm, 
-/// 2 stage pipeline, and FFMA-based mainloop for SM50
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag
->
-struct DefaultConv3dWgrad <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv3dWgradOutputGradientTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA,
-        ThreadMapA
-      >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv3dWgradActivationTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB,
-        ThreadMapB
-      >
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kWgrad,
-    Conv3dProblemSize
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv3dWgrad specialization for Optimized IteratorAlgorithm, 
-/// 2 stage pipeline, and FFMA-based mainloop for SM50
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag
->
-struct DefaultConv3dWgrad <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor,
-      ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv3dWgradOutputGradientTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA,
-        ThreadMapA
-      >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv3dWgradActivationTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB,
-        ThreadMapB
-      >
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kWgrad,
-    Conv3dProblemSize
-  >;
-
-};
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_deconv2d.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_deconv2d.h
deleted file mode 100755
index 4db152cd7..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_deconv2d.h
+++ /dev/null
@@ -1,999 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-    Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped 
-      matrix multiply-add with the appropriate threadblock-scoped epilogue.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/conv/kernel/default_conv2d.h"
-
-#include "cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h"
-#include "cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h" 
-#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h"
-#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h"
-#include "cutlass/conv/threadblock/conv2d_tile_iterator.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Defines a kernel for Deconv2d
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename OperatorClass,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,
-  conv::StrideSupport StrideSupport = StrideSupport::kStrided,
-  /// Access granularity of A matrix in units of elements
-  int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value,
-  /// Access granularity of B matrix in units of elements
-  int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value
-> struct DefaultDeconv2d;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//                            OpClassSimt convolutions 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Defines a kernel for Deconv2d specialization for Analytic IteratorAlgorithm, 
-/// multi-stage pipeline, and FFMA-based mainloop for SM80
-
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultDeconv2d <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic,
-  conv::StrideSupport::kUnity,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA,
-      ThreadMapA,
-      conv::StrideSupport::kUnity
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB, LayoutB,
-      ThreadMapB,
-      cutlass::AlignedArray<ElementB, ThreadMapB::kElementsPerAccess>,
-      conv::GroupMode::kNone,
-      true /*IsDeconv*/
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    arch::CacheOperation::Always,
-    MmaPolicy,
-    Stages 
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount,
-    false,
-    layout::NoPermute,
-    StrideSupport::kStrided,
-    4
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDeconv
-  >;
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultDeconv2d <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic,
-  conv::StrideSupport::kStrided,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA,
-      ThreadMapA,
-      conv::StrideSupport::kStrided
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB, LayoutB,
-      ThreadMapB,
-      cutlass::AlignedArray<ElementB, ThreadMapB::kElementsPerAccess>,
-      conv::GroupMode::kNone,
-      true /*IsDeconv*/
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    arch::CacheOperation::Always,
-    MmaPolicy,
-    Stages 
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimtStridedDgrad<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDeconv
-  >;
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Deconv2d specialization for Optimized IteratorAlgorithm, 
-/// multi-stage pipeline, and FFMA-based mainloop for SM80
-
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultDeconv2d <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized,
-  StrideSupport::kUnity,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA,
-      ThreadMapA,
-      StrideSupport::kUnity
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB, LayoutB,
-      ThreadMapB,
-      cutlass::AlignedArray<ElementB, ThreadMapB::kElementsPerAccess>,
-      true /*IsDeconv*/
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    arch::CacheOperation::Always,
-    MmaPolicy,
-    Stages 
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount,
-    false,
-    layout::NoPermute,
-    StrideSupport::kStrided,
-    4
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDeconv
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultDeconv2d <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized,
-  conv::StrideSupport::kStrided,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA,
-      ThreadMapA,
-      conv::StrideSupport::kStrided
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB, LayoutB,
-      ThreadMapB,
-      cutlass::AlignedArray<ElementB, ThreadMapB::kElementsPerAccess>,
-      true /*IsDeconv*/
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    arch::CacheOperation::Always,
-    MmaPolicy,
-    Stages 
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimtStridedDgrad<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDeconv
-  >;
-
-};
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Deconv2d specialization for Analytic IteratorAlgorithm, 
-/// 2 stage pipeline, and FFMA-based mainloop for SM50
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultDeconv2d <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic,
-  conv::StrideSupport::kUnity,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA,
-        ThreadMapA,
-        conv::StrideSupport::kUnity
-      >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB, LayoutB,
-        ThreadMapB,
-        cutlass::AlignedArray<ElementB, ThreadMapB::kElementsPerAccess>,
-        conv::GroupMode::kNone,
-        true /*IsDeconv*/
-      >
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount,
-    false,
-    layout::NoPermute,
-    StrideSupport::kStrided,
-    4
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDeconv
-  >;
-
-};
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultDeconv2d <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic,
-  conv::StrideSupport::kStrided,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::TileIteratorStridedDgrad<
-      cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA,
-        ThreadMapA,
-        conv::StrideSupport::kStrided
-      >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::TileIteratorStridedDgrad<
-      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB, LayoutB,
-        ThreadMapB,
-        cutlass::AlignedArray<ElementB, ThreadMapB::kElementsPerAccess>,
-        conv::GroupMode::kNone,
-        true /*IsDeconv*/
-      >
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimtStridedDgrad<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDeconv
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Deconv2d specialization for Optimized IteratorAlgorithm, 
-/// 2 stage pipeline, and FFMA-based mainloop for SM50
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultDeconv2d <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized,
-  StrideSupport::kUnity,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA,
-        ThreadMapA,
-        StrideSupport::kUnity
-      >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB, LayoutB,
-        ThreadMapB,
-        cutlass::AlignedArray<ElementB, ThreadMapB::kElementsPerAccess>,
-        true /*IsDeconv*/
-      >
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount,
-    false,
-    layout::NoPermute,
-    StrideSupport::kStrided,
-    4
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDeconv
-  >;
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultDeconv2d <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized,
-  conv::StrideSupport::kStrided,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::TileIteratorStridedDgrad<
-      cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA,
-        ThreadMapA,
-        conv::StrideSupport::kStrided
-      >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::TileIteratorStridedDgrad<
-      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB, LayoutB,
-        ThreadMapB,
-        cutlass::AlignedArray<ElementB, ThreadMapB::kElementsPerAccess>,
-        true /*IsDeconv*/
-      >
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimtStridedDgrad<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDeconv
-  >;
-
-};
-
-} // namespace kernel
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_deconv2d_with_broadcast.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_deconv2d_with_broadcast.h
deleted file mode 100755
index d11432ed3..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_deconv2d_with_broadcast.h
+++ /dev/null
@@ -1,305 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-  \brief 
-    Defines a GEMM with Broadcast based on an existing UniversalGemm kernel.
-
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/conv/kernel/default_deconv2d.h"
-#include "cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h"
-
-#include "cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h"
-#include "cutlass/epilogue/threadblock/epilogue_with_broadcast.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename OperatorClass,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,
-  conv::StrideSupport StrideSupport = StrideSupport::kStrided,
-  /// Access granularity of A matrix in units of elements
-  int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value,
-  /// Access granularity of B matrix in units of elements
-  int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value
->
-struct DefaultDeconv2dWithBroadcast {
-
-  using ImplicitGemmBase = typename DefaultDeconv2d<
-    ElementA, LayoutA,
-    ElementB, LayoutB,
-    ElementC, LayoutC,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    MathOperatorTag,
-    IteratorAlgorithm,
-    StrideSupport,
-    AlignmentA,
-    AlignmentB
-  >::Kernel;
-
-  // Define epilogue
-  using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastTensorOp<
-    ArchTag,
-    typename ImplicitGemmBase::Epilogue::Shape,
-    typename ImplicitGemmBase::Epilogue::WarpMmaOperator,
-    ImplicitGemmBase::Epilogue::kPartitionsK,
-    ElementC,
-    typename EpilogueOutputOp::ElementT,
-    typename EpilogueOutputOp::ElementVector,
-    EpilogueOutputOp,
-    ImplicitGemmBase::Epilogue::kElementsPerAccess
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue<
-    typename ImplicitGemmBase::Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDeconv
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//                            OpClassSimt convolutions
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Defines a kernel for Deconv2d specialization,
-/// multi-stage pipeline, and FFMA-based mainloop for SM80
-
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::IteratorAlgorithm IteratorAlgorithm,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultDeconv2dWithBroadcast <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm,
-  conv::StrideSupport::kUnity,
-  AlignmentA,
-  AlignmentB
-> {
-
-  using ImplicitGemmBase = typename DefaultDeconv2d<
-    ElementA, LayoutA,
-    ElementB, LayoutB,
-    ElementC, LayoutC,
-    ElementAccumulator,
-    arch::OpClassSimt,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    MathOperatorTag,
-    IteratorAlgorithm,
-    conv::StrideSupport::kUnity,
-    AlignmentA,
-    AlignmentB
-  >::Kernel;
-
-  // Define epilogue
-  using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastSimt<
-    ArchTag,
-    typename ImplicitGemmBase::Epilogue::Shape,
-    typename ImplicitGemmBase::Epilogue::WarpMmaOperator,
-    ElementC,
-    typename EpilogueOutputOp::ElementT,
-    typename EpilogueOutputOp::ElementVector,
-    EpilogueOutputOp,
-    ImplicitGemmBase::Epilogue::kElementsPerAccess
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue<
-    typename ImplicitGemmBase::Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDeconv
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::IteratorAlgorithm IteratorAlgorithm,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultDeconv2dWithBroadcast <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm,
-  conv::StrideSupport::kStrided,
-  AlignmentA,
-  AlignmentB
-> {
-
-  using ImplicitGemmBase = typename DefaultDeconv2d<
-    ElementA, LayoutA,
-    ElementB, LayoutB,
-    ElementC, LayoutC,
-    ElementAccumulator,
-    arch::OpClassSimt,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    MathOperatorTag,
-    IteratorAlgorithm,
-    conv::StrideSupport::kStrided,
-    AlignmentA,
-    AlignmentB
-  >::Kernel;
-
-  // Define epilogue
-  using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastSimtStridedDgrad<
-    ArchTag,
-    typename ImplicitGemmBase::Epilogue::Shape,
-    typename ImplicitGemmBase::Epilogue::WarpMmaOperator,
-    ElementC,
-    typename EpilogueOutputOp::ElementT,
-    typename EpilogueOutputOp::ElementVector,
-    EpilogueOutputOp,
-    ImplicitGemmBase::Epilogue::kElementsPerAccess
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue<
-    typename ImplicitGemmBase::Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDeconv
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-}  // namespace kernel
-}  // namespace conv
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_deconv3d.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_deconv3d.h
deleted file mode 100755
index 70800c7af..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_deconv3d.h
+++ /dev/null
@@ -1,541 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-    Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped 
-      matrix multiply-add with the appropriate threadblock-scoped epilogue.  
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/conv/kernel/default_conv2d.h"
-
-#include "cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h"
-#include "cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h"
-
-#include "cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h"
-#include "cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h"
-#include "cutlass/conv/threadblock/conv2d_tile_iterator.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Defines a kernel for Deconv3d
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename OperatorClass,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,
-  conv::StrideSupport StrideSupport = StrideSupport::kStrided
-> struct DefaultDeconv3d;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//                            OpClassSimt convolutions 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag
->
-struct DefaultDeconv3d <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic,
-  conv::StrideSupport::kStrided
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA,
-      ThreadMapA,
-      conv::StrideSupport::kStrided
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB,
-      ThreadMapB,
-      true /*IsDeconv*/
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    arch::CacheOperation::Always,
-    MmaPolicy,
-    Stages 
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount,
-    false,
-    layout::NoPermute,
-    StrideSupport::kStrided,
-    5
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDeconv,
-    Conv3dProblemSize
-  >;
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Deconv3d specialization for Optimized IteratorAlgorithm, 
-/// multi-stage pipeline, and FFMA-based mainloop for SM80
-
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag
->
-struct DefaultDeconv3d <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized,
-  StrideSupport::kUnity
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-      ElementA,
-      ThreadMapA,
-      StrideSupport::kUnity
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-      ElementB,
-      LayoutB,
-      ThreadMapB,
-      true /*IsDeconv*/
-      // ThreadMapB,
-      // StrideSupport::kUnity
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmMultistage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    arch::CacheOperation::Always,
-    IteratorB,
-    SmemIteratorB,
-    arch::CacheOperation::Always,
-    MmaPolicy,
-    Stages 
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount,
-    false,
-    layout::NoPermute,
-    StrideSupport::kStrided,
-    5
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDeconv,
-    Conv3dProblemSize
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag
->
-struct DefaultDeconv3d <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic,
-  conv::StrideSupport::kStrided
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    // cutlass::conv::threadblock::TileIteratorStridedDgrad<
-      cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA,
-        ThreadMapA,
-        conv::StrideSupport::kStrided
-      // >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    // cutlass::conv::threadblock::TileIteratorStridedDgrad<
-      cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB,
-        ThreadMapB,
-        true /*IsDeconv*/
-      // >
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount,
-    false,
-    layout::NoPermute,
-    StrideSupport::kStrided,
-    5
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDeconv,
-    Conv3dProblemSize
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Deconv3d specialization for Optimized IteratorAlgorithm, 
-/// 2 stage pipeline, and FFMA-based mainloop for SM50
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag
->
-struct DefaultDeconv3d <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized,
-  StrideSupport::kUnity
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    // cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA,
-        ThreadMapA,
-        StrideSupport::kUnity
-      // >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using IteratorB =
-    // cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB,
-        LayoutB,
-        ThreadMapB,
-        true /*IsDeconv*/
-        // ThreadMapB,
-        // StrideSupport::kUnity
-      // >
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::ImplicitGemmPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount,
-    false,
-    layout::NoPermute,
-    StrideSupport::kStrided,
-    5
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDeconv,
-    Conv3dProblemSize
-  >;
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_deconv3d_with_broadcast.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_deconv3d_with_broadcast.h
deleted file mode 100755
index affe7a06f..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_deconv3d_with_broadcast.h
+++ /dev/null
@@ -1,309 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-  \brief 
-    Defines a GEMM with Broadcast based on an existing UniversalGemm kernel.
-
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/conv/kernel/default_deconv3d.h"
-#include "cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h"
-
-#include "cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h"
-#include "cutlass/epilogue/threadblock/epilogue_with_broadcast.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename OperatorClass,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,
-  conv::StrideSupport StrideSupport = StrideSupport::kStrided,
-  /// Access granularity of A matrix in units of elements
-  int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value,
-  /// Access granularity of B matrix in units of elements
-  int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value
->
-struct DefaultDeconv3dWithBroadcast {
-
-  using ImplicitGemmBase = typename DefaultDeconv3d<
-    ElementA, LayoutA,
-    ElementB, LayoutB,
-    ElementC, LayoutC,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    MathOperatorTag,
-    IteratorAlgorithm,
-    StrideSupport
-  >::Kernel;
-
-  // Define epilogue
-  using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastTensorOp<
-    ArchTag,
-    typename ImplicitGemmBase::Epilogue::Shape,
-    typename ImplicitGemmBase::Epilogue::WarpMmaOperator,
-    ImplicitGemmBase::Epilogue::kPartitionsK,
-    ElementC,
-    typename EpilogueOutputOp::ElementT,
-    typename EpilogueOutputOp::ElementVector,
-    EpilogueOutputOp,
-    ImplicitGemmBase::Epilogue::kElementsPerAccess
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue<
-    typename ImplicitGemmBase::Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDeconv,
-    Conv3dProblemSize
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//                            OpClassSimt convolutions
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Defines a kernel for Deconv3d specialization for Analytic IteratorAlgorithm,
-/// multi-stage pipeline, and FFMA-based mainloop for SM80
-
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::IteratorAlgorithm IteratorAlgorithm,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultDeconv3dWithBroadcast <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm,
-  conv::StrideSupport::kUnity,
-  AlignmentA,
-  AlignmentB
-> {
-
-  using ImplicitGemmBase = typename DefaultDeconv3d<
-    ElementA, LayoutA,
-    ElementB, LayoutB,
-    ElementC, LayoutC,
-    ElementAccumulator,
-    arch::OpClassSimt,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    MathOperatorTag,
-    IteratorAlgorithm,
-    conv::StrideSupport::kUnity
-  >::Kernel;
-
-  // Define epilogue
-  using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastSimt<
-    ArchTag,
-    typename ImplicitGemmBase::Epilogue::Shape,
-    typename ImplicitGemmBase::Epilogue::WarpMmaOperator,
-    ElementC,
-    typename EpilogueOutputOp::ElementT,
-    typename EpilogueOutputOp::ElementVector,
-    EpilogueOutputOp,
-    ImplicitGemmBase::Epilogue::kElementsPerAccess,
-    layout::NoPermute,
-    StrideSupport::kStrided,
-    5
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue<
-    typename ImplicitGemmBase::Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDeconv,
-    Conv3dProblemSize
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::IteratorAlgorithm IteratorAlgorithm,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultDeconv3dWithBroadcast <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm,
-  conv::StrideSupport::kStrided,
-  AlignmentA,
-  AlignmentB
-> {
-
-  using ImplicitGemmBase = typename DefaultDeconv3d<
-    ElementA, LayoutA,
-    ElementB, LayoutB,
-    ElementC, LayoutC,
-    ElementAccumulator,
-    arch::OpClassSimt,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    MathOperatorTag,
-    IteratorAlgorithm,
-    conv::StrideSupport::kStrided
-  >::Kernel;
-
-  // Define epilogue
-  using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastSimt<
-    ArchTag,
-    typename ImplicitGemmBase::Epilogue::Shape,
-    typename ImplicitGemmBase::Epilogue::WarpMmaOperator,
-    ElementC,
-    typename EpilogueOutputOp::ElementT,
-    typename EpilogueOutputOp::ElementVector,
-    EpilogueOutputOp,
-    ImplicitGemmBase::Epilogue::kElementsPerAccess,
-    layout::NoPermute,
-    StrideSupport::kStrided,
-    5
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue<
-    typename ImplicitGemmBase::Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kDeconv,
-    Conv3dProblemSize
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace kernel
-}  // namespace conv
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_depthwise_fprop.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_depthwise_fprop.h
deleted file mode 100755
index aa4f2c359..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_depthwise_fprop.h
+++ /dev/null
@@ -1,588 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-    Default kernel-level Depthwise implicit GEMM convolution definitions combine threadblock-scoped 
-      matrix multiply-add with the appropriate threadblock-scoped epilogue.  
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/conv/kernel/default_conv2d.h"
-#include "cutlass/conv/kernel/direct_convolution.h"
-
-#include "cutlass/conv/threadblock/depthwise_mma_core_with_lane_access_size.h"
-
-#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h"
-#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h"
-#include "cutlass/conv/threadblock/depthwise_fprop_pipelined.h"
-
-// Direct Conv Related Header files
-#include "cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_optimized.h"
-#include "cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_fixed_stride_dilation.h"
-
-#include "cutlass/conv/threadblock/depthwise_fprop_filter_tile_access_iterator_direct_conv_optimized.h"
-#include "cutlass/conv/threadblock/depthwise_fprop_direct_conv_multistage.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Defines a kernel for DepthwiseFprop
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename OperatorClass,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kAnalytic,
-  conv::StrideSupport StrideSupport = StrideSupport::kUnity,
-  /// Access granularity of A matrix in units of elements
-  int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value,
-  /// Access granularity of B matrix in units of elements
-  int AlignmentB = cutlass::sizeof_bits<ElementB>::value / cutlass::sizeof_bits<ElementB>::value
-> struct DefaultDepthwiseFprop;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Defines a kernel for DepthwiseFprop with direct convolution algorithm
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename OperatorClass,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename ThreadBlockOutputShape,
-  typename FilterShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kAnalytic,
-  conv::StrideSupport StrideSupport = StrideSupport::kUnity,
-  // MatrixShape<Height, Width>
-  typename StrideShape = cutlass::MatrixShape<-1, -1>,
-  // MatrixShape< Height, Width> 
-  typename DilationShape =  cutlass::MatrixShape<-1, -1>, 
-  /// Access granularity of A matrix in units of elements
-  int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value,
-  /// Access granularity of B matrix in units of elements
-  int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value
-> struct DefaultDepthwiseDirect2dConvFprop;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//                            OpClassSimt convolutions
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Defines a kernel for Depthwise specialization for Analytic IteratorAlgorithm
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag,
-  conv::StrideSupport StrideSupport,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultDepthwiseFprop <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag, //   cutlass::arch::OpMultiplyAdd
-  IteratorAlgorithm::kAnalytic,
-  StrideSupport,
-  AlignmentA,
-  AlignmentB
-> {
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::conv::threadblock::DepthwiseMmaCoreWithLaneAccessSize<
-      ThreadblockShape,
-      WarpShape,
-      InstructionShape,
-      ElementA,
-      layout::RowMajor,
-      ElementB,
-      layout::ColumnMajor,
-      ElementAccumulator,
-      layout::RowMajor,
-      arch::OpClassSimt,
-      128,
-      sizeof_bits<ElementB>::value,
-      2,
-      MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-        ElementA, LayoutA,
-        ThreadMapA
-      >
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
-  using IteratorB =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-        ElementB, LayoutB,
-        ThreadMapB,
-        AccessTypeB,
-        cutlass::conv::GroupMode::kDepthwise
-      >
-    >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-
-  // Define the Mma
-  using Mma = threadblock::DepthwiseFpropPipelined<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    IteratorB,
-    SmemIteratorB,
-    ElementC,
-    LayoutC,
-    MmaPolicy
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
-    ThreadblockShape,
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop,
-    Conv2dProblemSize,
-    cutlass::conv::GroupMode::kDepthwise
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Defines a kernel for Depthwise specialization for direct 2d conv implementation, 
-/// multiple stage pipeline, and SIMT-based mainloop
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename ThreadBlockOutputShape,
-  typename FilterShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::StrideSupport StrideSupport,
-  typename StrideShape,
-  typename DilationShape,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultDepthwiseDirect2dConvFprop <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  ThreadBlockOutputShape,
-  FilterShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized,
-  StrideSupport,
-  StrideShape,
-  DilationShape,
-  AlignmentA,
-  AlignmentB
-> {
-  // One warp handles the entrie groups per cta.
-  static_assert(ThreadblockShape::kN == WarpShape::kN,
-                "ThreadblockShape::kN should be same as WarpShape::kN ");
-  static_assert(ThreadblockShape::kK == FilterShape::kCount && WarpShape::kK == FilterShape::kCount,
-                "ThreadblockShape::kK and WarpShape::kK should be same as filter size");
-  static_assert(ThreadblockShape::kM % WarpShape::kM == 0,
-                "ThreadblockShape::kM must be divisible by WarpShape shape::kM");
-  static_assert(ThreadBlockOutputShape::kN, "ThreadBlockOutputShape::kN should be 1");
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::conv::threadblock::DepthwiseDirectConvMmaCoreWithLaneAccessSize<
-      ThreadblockShape,
-      ThreadBlockOutputShape,
-      FilterShape,
-      WarpShape,
-      InstructionShape,
-      ElementA,
-      layout::RowMajor,
-      ElementB,
-      layout::ColumnMajor,
-      ElementAccumulator,
-      layout::RowMajor,
-      arch::OpClassSimt,
-      128,
-      128,
-      Stages,
-      MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::DepthwiseFpropActivationDirect2dConvTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape::kM,ThreadblockShape::kN>, // < outputShape:KMNK, groups per cta>
-      ThreadBlockOutputShape,
-      ElementA, LayoutA,
-      ThreadMapA
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
-  using IteratorB =
-      cutlass::conv::threadblock::DepthwiseFpropFilterDirectConvTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape::kN, FilterShape::kCount>,
-        ElementB, LayoutB,
-        ThreadMapB
-      >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-  using ThreadOutputShape = typename MmaCore::ThreadOutputShape;
-  static cutlass::arch::CacheOperation::Kind const CacheOpA =
-      ((sizeof_bits<ElementA>::value * AlignmentA) == 128)
-          ? cutlass::arch::CacheOperation::Global
-          : cutlass::arch::CacheOperation::Always;
-
-  static cutlass::arch::CacheOperation::Kind const CacheOpB =
-      ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
-          ? cutlass::arch::CacheOperation::Global
-          : cutlass::arch::CacheOperation::Always;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultDirectConvEpilogueSimt<
-    ThreadblockShape, // < outputShape:KMNK, groups per cta>
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount,
-    ThreadOutputShape,
-    ThreadBlockOutputShape
-  >::Epilogue;
-
-  // Define the Mma
-  using Mma = threadblock::DepthwiseFpropDirectConvMultipleStage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    CacheOpA,
-    IteratorB,
-    SmemIteratorB,
-    CacheOpB,
-    MmaPolicy,
-    Stages,
-    Epilogue
-  >;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::DirectConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop,
-    Conv2dProblemSize,
-    cutlass::conv::GroupMode::kDepthwise,
-    ThreadBlockOutputShape
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Defines a kernel for Depthwise specialization for direct 2d conv implementation, 
-/// multiple stage pipeline, and SIMT-based mainloop
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape,
-  typename ThreadBlockOutputShape,
-  typename FilterShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename EpilogueOutputOp,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  conv::StrideSupport StrideSupport,
-  typename StrideShape,
-  typename DilationShape,
-  int AlignmentA,
-  int AlignmentB
->
-struct DefaultDepthwiseDirect2dConvFprop <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassSimt,
-  ArchTag,
-  ThreadblockShape,
-  ThreadBlockOutputShape,
-  FilterShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kFixedStrideDilation,
-  StrideSupport,
-  StrideShape,
-  DilationShape,
-  AlignmentA,
-  AlignmentB
-> {
-
-
-
-  // One warp handles the entrie groups per cta.
-  static_assert(ThreadblockShape::kN == WarpShape::kN,
-                "ThreadblockShape::kN should be same as WarpShape::kN ");
-  static_assert(ThreadblockShape::kK == FilterShape::kCount && WarpShape::kK == FilterShape::kCount,
-                "ThreadblockShape::kK and WarpShape::kK should be same as filter size");
-  static_assert(ThreadblockShape::kM % WarpShape::kM == 0,
-                "ThreadblockShape::kM must be divisible by WarpShape shape::kM");
-  static_assert(ThreadBlockOutputShape::kN, "ThreadBlockOutputShape::kN should be 1");
-
-  static_assert(StrideShape::kRow >= 0 && StrideShape::kColumn >= 0, "Stride should be fixed");
-  static_assert(DilationShape::kRow >= 0 && DilationShape::kColumn >= 0, "Stride should be fixed");
-
-  // Activations loaded by threadblock
-  static int const ActivationShapeH = (ThreadBlockOutputShape::kH - 1) * StrideShape::kRow +
-                             (FilterShape::kRow - 1) * DilationShape::kRow + 1;
-
-  static int const ActivationShapeW = (ThreadBlockOutputShape::kW - 1) * StrideShape::kColumn +
-                             (FilterShape::kColumn - 1) * DilationShape::kColumn + 1;
-
-  using ActivationShape =
-      cutlass::conv::TensorNHWCShape<1, ActivationShapeH, ActivationShapeW, ThreadblockShape::kN >;
-
-  // Define the core components from GEMM
-  using MmaCore = typename cutlass::conv::threadblock::DepthwiseDirectConvMmaCoreWithLaneAccessSize<
-      ThreadblockShape,
-      ThreadBlockOutputShape,
-      FilterShape,
-      WarpShape,
-      InstructionShape,
-      ElementA,
-      layout::RowMajor,
-      ElementB,
-      layout::ColumnMajor,
-      ElementAccumulator,
-      layout::RowMajor,
-      arch::OpClassSimt,
-      128,
-      128,
-      Stages,
-      MathOperatorTag,
-      IteratorAlgorithm::kFixedStrideDilation,
-      StrideShape,
-      DilationShape,
-      ActivationShape>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using IteratorA =
-    cutlass::conv::threadblock::DepthwiseFpropActivationDirect2dConvTileAccessIteratorFixedStrideDilation<
-      cutlass::MatrixShape<ThreadblockShape::kM,ThreadblockShape::kN>, // < outputShape:KMNK, groups per cta>
-      ThreadBlockOutputShape,
-      StrideShape,
-      DilationShape,
-      ActivationShape,
-      ElementA, LayoutA,
-      ThreadMapA
-    >;
-
-  using SmemIteratorA = typename MmaCore::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
-  using IteratorB =
-      cutlass::conv::threadblock::DepthwiseFpropFilterDirectConvTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape::kN, FilterShape::kCount>,
-        ElementB, LayoutB,
-        ThreadMapB
-      >;
-  
-  using SmemIteratorB = typename MmaCore::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
-  using MmaPolicy = typename MmaCore::MmaPolicy;
-  using ThreadOutputShape = typename MmaCore::ThreadOutputShape;
-  static cutlass::arch::CacheOperation::Kind const CacheOpA =
-      ((sizeof_bits<ElementA>::value * AlignmentA) == 128)
-          ? cutlass::arch::CacheOperation::Global
-          : cutlass::arch::CacheOperation::Always;
-
-  static cutlass::arch::CacheOperation::Kind const CacheOpB =
-      ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
-          ? cutlass::arch::CacheOperation::Global
-          : cutlass::arch::CacheOperation::Always;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultDirectConvEpilogueSimt<
-    ThreadblockShape, // < outputShape:KMNK, groups per cta>
-    WarpMmaSimtOp,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount,
-    ThreadOutputShape,
-    ThreadBlockOutputShape
-  >::Epilogue;
-
-  // Define the Mma
-  using Mma = threadblock::DepthwiseFpropDirectConvMultipleStage<
-    ThreadblockShape,
-    IteratorA,
-    SmemIteratorA,
-    CacheOpA,
-    IteratorB,
-    SmemIteratorB,
-    CacheOpB,
-    MmaPolicy,
-    Stages,
-    Epilogue,
-    IteratorAlgorithm::kFixedStrideDilation
-  >;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::DirectConvolution<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop,
-    Conv2dProblemSize,
-    cutlass::conv::GroupMode::kDepthwise,
-    ThreadBlockOutputShape
-  >;
-};
-
-} // namespace kernel
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/direct_convolution.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/direct_convolution.h
deleted file mode 100755
index 5e4299564..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/direct_convolution.h
+++ /dev/null
@@ -1,505 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a multi-staged Depthwise Convolution kernel.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/array.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/semaphore.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-#include "cutlass/conv/conv3d_problem_size.h"
-#include "cutlass/epilogue/threadblock/output_iterator_parameter.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Parameters structure
-template <typename Mma_,                 ///! Threadblock-scoped matrix multiply-accumulate
-          typename Epilogue_,            ///! Epilogue
-          typename ThreadblockSwizzle_,  ///! Threadblock swizzling function
-          conv::Operator ConvOperator,   ///! Convolutional operator (Fprop, Dgrad, Wgrad)
-          typename Arguments_,           ///! Kernel Arguments
-          typename ConvOutputIteratorParameter_, ///! Output Iterator Params
-          typename ConvProblemSize_ = Conv2dProblemSize,  ///! Convolutional operator on 2D or 3D problem
-          conv::GroupMode GroupMode_ = conv::GroupMode::kNone,  ///! Group mode
-          typename ThreadBlockOutputShape_ = cutlass::conv::TensorNHWCShape<1, 1, 1, 1> >  ///! OutputShape per ThreadBlock
-struct DirectConvolutionParams {
-  using Mma = Mma_;
-  using Epilogue = Epilogue_;
-  using EpilogueOutputOp = typename Epilogue::OutputOp;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  using ThreadBlockOutputShape = ThreadBlockOutputShape_;
-  static Operator const kConvolutionalOperator = ConvOperator;
-  using ConvProblemSize = ConvProblemSize_;
-  using Arguments = Arguments_;
-  using ConvOutputIteratorParameter = ConvOutputIteratorParameter_;
-
-  using ThreadblockShape = typename Mma::Shape;
-  static IteratorAlgorithm const kIteratorAlgorithm = Mma::IteratorA::kIteratorAlgorithm;
-  static conv::GroupMode const kGroupMode = GroupMode_;
-  static int const kStages = Mma::kStages;
-
-  ConvProblemSize problem_size;
-  cutlass::gemm::GemmCoord grid_tiled_shape;
-  gemm::GemmCoord implicit_gemm_problem_size;
-  int swizzle_log_tile;
-  int smem_size_;
-
-  int gemm_k_iterations;
-  int gemm_k_iterations_per_channel;
-  typename Mma::IteratorA::Params iterator_A;
-  typename Mma::IteratorA::Element const *ptr_A;
-  typename Mma::IteratorB::Params iterator_B;
-  typename Mma::IteratorB::Element const *ptr_B;
-  typename Mma::IteratorB::Element *ptr_reordered_B;
-  typename Epilogue::OutputTileIterator::Params iterator_C;
-  typename Epilogue::OutputTileIterator::Element *ptr_C;
-  typename Epilogue::OutputTileIterator::Params iterator_D;
-  typename Epilogue::OutputTileIterator::Element *ptr_D;
-  typename EpilogueOutputOp::Params output_op;
-  int *semaphore;
-  SplitKMode split_k_mode;
-  int split_k_slices;
-
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  DirectConvolutionParams() : swizzle_log_tile(0), gemm_k_iterations(0) {}
-
-  ///
-  CUTLASS_HOST_DEVICE
-  DirectConvolutionParams(Arguments const &args, int *semaphore = nullptr)
-      : problem_size(args.problem_size),
-        implicit_gemm_problem_size(
-            cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size)),
-        iterator_A(Mma::IteratorA::getParams(args.problem_size, args.ref_A.layout())),
-        ptr_A(args.ref_A.data()),
-        iterator_B(Mma::IteratorB::getParams(args.problem_size, args.ref_B.layout())),
-        ptr_B(args.ref_B.data()),
-        ptr_reordered_B(args.ref_reordered_B.data()),
-        iterator_C(ConvOutputIteratorParameter::layout(args.ref_C), args.problem_size),
-        ptr_C(args.ref_C.data()),
-        iterator_D(ConvOutputIteratorParameter::layout(args.ref_D), args.problem_size),
-        ptr_D(args.ref_D.data()),
-        output_op(args.output_op),
-        semaphore(semaphore),
-        split_k_mode(args.split_k_mode),
-        split_k_slices(args.problem_size.split_k_slices) {
-    gemm_k_iterations =
-        depthwise_gemm_k_iterations<ThreadBlockOutputShape::kN,
-                                    ThreadBlockOutputShape::kH,
-                                    ThreadBlockOutputShape::kW>(kConvolutionalOperator,
-                                                                ThreadblockShape::kK,
-                                                                args.problem_size,
-                                                                kIteratorAlgorithm,
-                                                                kGroupMode,
-                                                                ThreadblockShape::kN);
-
-    gemm_k_iterations_per_channel = implicit_gemm_k_iterations_per_channel(
-        kConvolutionalOperator, args.problem_size, kIteratorAlgorithm);
-
-    ThreadblockSwizzle threadblock_swizzle;
-
-    grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
-        kConvolutionalOperator,
-        problem_size,
-        {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-        args.problem_size.split_k_slices);
-
-    swizzle_log_tile = threadblock_swizzle.get_log_tile(grid_tiled_shape);
-
-    // Dynamic SMEM usage because stride and dilation are runtime params.
-    smem_size_ = (max(iterator_A.activation_size, int(sizeof(typename Epilogue::SharedStorage))) * kStages + iterator_B.filter_size);
-  }
-
-  CUTLASS_HOST_DEVICE
-  int get_smem_size() {
-    // Dynamic Smem Size
-    return smem_size_;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-template <typename Params_, typename ElementB_>
-struct ReorderKernel {
-  using Params = Params_;
-  using ElementB = ElementB_;
-
-  union SharedStorage {};
-
-  static unsigned int const kReorderKernelThreadPerCTA = 128;
-
-  CUTLASS_HOST_DEVICE
-  ReorderKernel() {}
-
-  CUTLASS_HOST_DEVICE
-  static dim3 get_grid_shape(Params const &params) {
-    return dim3{static_cast<unsigned int>(
-                    (params.problem_size.filter_size() + kReorderKernelThreadPerCTA - 1) /
-                    kReorderKernelThreadPerCTA),
-                1,
-                1};
-  }
-
-  CUTLASS_HOST_DEVICE
-  static dim3 get_block_shape() { return dim3{kReorderKernelThreadPerCTA, 1, 1}; }
-
-  CUTLASS_HOST_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-    int64_t m = static_cast<int64_t>(params.problem_size.groups);
-    int64_t n = static_cast<int64_t>(params.problem_size.filter_size() / params.problem_size.K);
-    const ElementB *src_with_type = static_cast<const ElementB *>(params.ptr_B);
-    ElementB *dst_with_type = static_cast<ElementB *>(params.ptr_reordered_B);
-
-    int64_t linear_index = blockIdx.x * kReorderKernelThreadPerCTA + threadIdx.x;
-    int64_t index_m = linear_index / n;
-    int64_t index_n = linear_index % n;
-    int64_t new_linear_index = index_m + index_n * m;
-
-    if (linear_index < m * n) {
-      dst_with_type[new_linear_index] = src_with_type[linear_index];
-    }
-    return;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Mma_,                                  ///! Threadblock-scoped matrix multiply-accumulate 
-  typename Epilogue_,                             ///! Epilogue
-  typename ThreadblockSwizzle_,                   ///! Threadblock swizzling function
-  conv::Operator ConvOperator,                    ///! Convolutional operator (Fprop, Dgrad, Wgrad)
-  typename ConvProblemSize_ = Conv2dProblemSize,  ///! Convolutional operator on 2D or 3D problem
-  conv::GroupMode GroupMode_ = conv::GroupMode::kNone,    ///! Group mode
-  typename ThreadBlockOutputShape_ = cutlass::conv::TensorNHWCShape<1, 1, 1, 1>
->
-struct DirectConvolution {
-
-  using Mma = Mma_;
-  using Epilogue = Epilogue_;
-  using EpilogueOutputOp = typename Epilogue::OutputOp;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  using ThreadBlockOutputShape = ThreadBlockOutputShape_;
-  static Operator const kConvolutionalOperator = ConvOperator;
-
-  using ElementA = typename Mma::IteratorA::Element;
-  using LayoutA = typename Mma::IteratorA::Layout;
-  using ElementB = typename Mma::IteratorB::Element;
-  using LayoutB = typename Mma::IteratorB::Layout;
-  using ElementC = typename EpilogueOutputOp::ElementOutput;
-
-  /// Set output tensor C layout
-  using LayoutC = LayoutA;
-
-  using ElementAccumulator = typename EpilogueOutputOp::ElementAccumulator;
-  using ElementCompute = typename EpilogueOutputOp::ElementCompute;
-
-  using WarpMmaOperator = typename Mma::Policy::Operator;
-
-  using ArchMmaOperator = typename WarpMmaOperator::ArchMmaOperator;
-  using MathOperator = typename ArchMmaOperator::Operator;
-  
-  using OperatorClass = typename WarpMmaOperator::OperatorClass;
-  using ArchTag = typename WarpMmaOperator::ArchTag;
-
-  using ThreadblockShape = typename Mma::Shape;
-  using WarpShape = typename WarpMmaOperator::Shape;
-  using InstructionShape = typename cutlass::gemm::GemmShape<1, 1, 1>;
-
-  static int const kStages = Mma::kStages;
-  static IteratorAlgorithm const kIteratorAlgorithm = Mma::IteratorA::kIteratorAlgorithm; 
-  static StrideSupport const kStrideSupport = Mma::IteratorA::kStrideSupport;
-
-  /// Warp count (concept: GemmShape)
-  using WarpCount = typename Mma::WarpCount;
-  static int const kThreadCount = 32 * WarpCount::kCount;
-
-  using TensorRefA = typename Mma::IteratorA::TensorRef;
-  using TensorRefB = typename Mma::IteratorB::TensorRef;
-  using TensorRefC = cutlass::TensorRef<ElementC, LayoutC>;
-
-  /// Check iterator A and B convolution dimension are the same and 
-  // set device::ImplicitGemmConvolution::kConvDim
-  static_assert(Mma::IteratorA::kConvDim == Mma::IteratorB::kConvDim, 
-    "Convolution on different different dimensions is not supported");
-  static int const kConvDim = Mma::IteratorA::kConvDim;
-
-  /// Conv dimension and problem size structure (Conv2d or Conv3d)
-  using ConvProblemSize = ConvProblemSize_;
-
-  static conv::GroupMode const kGroupMode = GroupMode_;
-
-
-  //
-  //
-  //
-  using ConvOutputIteratorParameter = epilogue::threadblock::ConvOutputIteratorParameter<
-    LayoutC,
-    typename Epilogue::OutputTileIterator::Layout, 
-    TensorRefC,
-    ConvOperator,
-    ConvProblemSize
-    >;
-
-
-  /// Argument structure
-  struct Arguments {
-
-    //
-    // Data members
-    //
-
-    ConvProblemSize problem_size;
-    TensorRefA ref_A;
-    TensorRefB ref_B;
-    TensorRefB ref_reordered_B;
-    TensorRefC ref_C;
-    TensorRefC ref_D;
-    typename EpilogueOutputOp::Params output_op;
-    SplitKMode split_k_mode;
-
-    //
-    // Methods
-    //
-
-    /// Default ctor
-    CUTLASS_HOST_DEVICE
-    Arguments() { }
-   
-    CUTLASS_HOST_DEVICE 
-    Arguments(
-      ConvProblemSize const & problem_size
-    ):
-      problem_size(problem_size) { }
-
-    CUTLASS_HOST_DEVICE
-    Arguments(
-      ConvProblemSize const & problem_size,
-      TensorRefA const & ref_A,
-      TensorRefB const & ref_B,
-      TensorRefC const & ref_C,
-      TensorRefC const & ref_D,
-      typename EpilogueOutputOp::Params const & output_op,
-      TensorRefB const & ref_reordered_B = nullptr,
-      SplitKMode const & split_k_mode = SplitKMode::kSerial
-    ):
-      problem_size(problem_size),
-      ref_A(ref_A),
-      ref_B(ref_B),
-      ref_C(ref_C),
-      ref_D(ref_D),
-      output_op(output_op),
-      ref_reordered_B(ref_reordered_B),
-      split_k_mode(split_k_mode)
-    {
-
-    }
-
-  };
-
-  using Params =
-      typename cutlass::conv::kernel::DirectConvolutionParams<Mma,
-                                                              Epilogue,
-                                                              ThreadblockSwizzle,
-                                                              kConvolutionalOperator,
-                                                              Arguments,
-                                                              ConvOutputIteratorParameter,
-                                                              ConvProblemSize,
-                                                              kGroupMode,
-                                                              ThreadBlockOutputShape>;
-
-  using ReorderKernel = typename cutlass::conv::kernel::ReorderKernel<Params, ElementB>;
-
-  /// Shared memory storage structure
-  union SharedStorage {
-    typename Mma::SharedStorage main_loop;
-    typename Epilogue::SharedStorage epilogue;
-  };
-
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  DirectConvolution() { } 
-
-  /// Executes one ImplicitGEMM
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-
-    // Compute threadblock location
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord threadblock_tile_idx =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    // Early exit if threadblock is out of range
-    if (params.grid_tiled_shape.m() <= threadblock_tile_idx.m() ||
-      params.grid_tiled_shape.n() <= threadblock_tile_idx.n()) {
-
-      return;
-    }
-
-    // Compute position within threadblock
-    int thread_idx = threadIdx.x;
-    int iterator_column_offset = 0;
-    int filter_row_offset = 0;
-    if (kGroupMode != GroupMode::kNone) {
-      if (kGroupMode == GroupMode::kDepthwise) {
-        iterator_column_offset += threadblock_tile_idx.n() * Mma::Shape::kN;
-      }
-    } 
-
-    // Construct iterators to A and B operands
-    typename Mma::IteratorA iterator_A(
-      params.iterator_A,
-      params.problem_size,
-      params.ptr_A,
-      thread_idx,
-      MatrixCoord(
-        threadblock_tile_idx.m() + threadblock_tile_idx.k(),
-        iterator_column_offset
-      )
-    );
-    
-    typename Mma::IteratorB iterator_B(
-      params.iterator_B,
-      params.problem_size,
-      params.ptr_reordered_B,
-      thread_idx,
-      MatrixCoord(
-        filter_row_offset,
-        iterator_column_offset
-      )
-    );
-
-    // Broadcast the warp_id computed by lane 0 to ensure dependent code
-    // is compiled as warp-uniform.
-    int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
-    int lane_idx = threadIdx.x % 32;
-
-    //
-    // Main loop
-    //
-
-    // Construct thread-scoped matrix multiply
-    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
-
-    typename Mma::FragmentC accumulators;
-
-    accumulators.clear();
-
-    //
-    // Epilogue
-    //
-
-    EpilogueOutputOp output_op(params.output_op);
-    
-    // Compute logical position within grid
-    threadblock_tile_idx =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-
-    MatrixCoord threadblock_offset(
-      threadblock_tile_idx.m() + threadblock_tile_idx.k(),
-      threadblock_tile_idx.n() * Mma::Shape::kN
-    );
-
-    // Tile iterator writing to destination tensor
-    typename Epilogue::OutputTileIterator iterator_D(
-      params.iterator_D,
-      params.ptr_D,
-      ConvOutputIteratorParameter::extent(params.problem_size),
-      thread_idx,
-      threadblock_offset
-    );
-    
-    // Tile iterator reading from source accumulator tensor
-    typename Epilogue::OutputTileIterator iterator_C(
-      params.iterator_C,
-      params.ptr_C,
-      ConvOutputIteratorParameter::extent(params.problem_size),
-      thread_idx,
-      threadblock_offset
-    );
-
-
-    // Construct the epilogue
-    Epilogue epilogue(
-      shared_storage.epilogue, 
-      thread_idx, 
-      warp_idx, 
-      lane_idx);
-
-
-    // Compute threadblock-scoped matrix multiply-add
-    // Epilogue is fused in the mainloop
-    mma(params.gemm_k_iterations,
-        accumulators,
-        iterator_A,
-        params.iterator_A,
-        iterator_B,
-        params.iterator_B,
-        accumulators,
-        epilogue,
-        output_op,
-        iterator_D,
-        iterator_C,
-        params.split_k_slices);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution.h
deleted file mode 100755
index b1e0b477a..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution.h
+++ /dev/null
@@ -1,455 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a pipelined Implicit GEMM kernel.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/array.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/semaphore.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-#include "cutlass/conv/conv3d_problem_size.h"
-#include "cutlass/epilogue/threadblock/output_iterator_parameter.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Mma_,                                  ///! Threadblock-scoped matrix multiply-accumulate 
-  typename Epilogue_,                             ///! Epilogue
-  typename ThreadblockSwizzle_,                   ///! Threadblock swizzling function
-  conv::Operator ConvOperator,                    ///! Convolutional operator (Fprop, Dgrad, Wgrad, Deconv)
-  typename ConvProblemSize_ = Conv2dProblemSize,  ///! Convolutional operator on 2D or 3D problem
-  conv::GroupMode GroupMode_ = conv::GroupMode::kNone    ///! Group mode
->
-struct ImplicitGemmConvolution {
-
-  using Mma = Mma_;
-  using Epilogue = Epilogue_;
-  using EpilogueOutputOp = typename Epilogue::OutputOp;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  static Operator const kConvolutionalOperator = ConvOperator;
-
-  using ElementA = typename Mma::IteratorA::Element;
-  using LayoutA = typename Mma::IteratorA::Layout;
-  using ElementB = typename Mma::IteratorB::Element;
-  using LayoutB = typename Mma::IteratorB::Layout;
-  using ElementC = typename EpilogueOutputOp::ElementOutput;
-
-  /// Set output tensor C layout
-  using LayoutC = LayoutA;
-
-  using ElementAccumulator = typename EpilogueOutputOp::ElementAccumulator;
-  using ElementCompute = typename EpilogueOutputOp::ElementCompute;
-
-  using WarpMmaOperator = typename Mma::Policy::Operator;
-
-  using ArchMmaOperator = typename WarpMmaOperator::ArchMmaOperator;
-  using MathOperator = typename ArchMmaOperator::Operator;
-  
-  using OperatorClass = typename WarpMmaOperator::OperatorClass;
-  using ArchTag = typename WarpMmaOperator::ArchTag;
-
-  using ThreadblockShape = typename Mma::Shape;
-  using WarpShape = typename WarpMmaOperator::Shape;
-  using InstructionShape = typename ArchMmaOperator::Shape;
-
-  static int const kStages = Mma::kStages;
-  static IteratorAlgorithm const kIteratorAlgorithm = Mma::IteratorA::kIteratorAlgorithm; 
-  static StrideSupport const kStrideSupport = Mma::IteratorA::kStrideSupport;
-
-  /// Warp count (concept: GemmShape)
-  using WarpCount = typename Mma::WarpCount;
-  static int const kThreadCount = 32 * WarpCount::kCount;
-
-  using TensorRefA = typename Mma::IteratorA::TensorRef;
-  using TensorRefB = typename Mma::IteratorB::TensorRef;
-  using TensorRefC = cutlass::TensorRef<ElementC, LayoutC>;
-
-  /// Check iterator A and B convolution dimension are the same and 
-  // set device::ImplicitGemmConvolution::kConvDim
-  static_assert(Mma::IteratorA::kConvDim == Mma::IteratorB::kConvDim, 
-    "Convolution on different different dimensions is not supported");
-  static int const kConvDim = Mma::IteratorA::kConvDim;
-
-  /// Conv dimension and problem size structure (Conv2d or Conv3d)
-  using ConvProblemSize = ConvProblemSize_;
-
-  static conv::GroupMode const kGroupMode = GroupMode_;
-
-  /// Wgrad C stride idx for implicit gemm algorithm 
-  // Conv2d row-major matrix C (KxRSC) 
-  // Conv3d row-major matrix C (KxTRSC)
-  static int const kWgradCStrideIdx = 
-    platform::is_same<LayoutC, cutlass::layout::TensorNHWC>::value ? 2 : 3;
-
-  /// This chooses the appropriate stride element of the C tensor.
-  static int const kTensorCStrideIdx = 
-    (kConvolutionalOperator == conv::Operator::kWgrad ? kWgradCStrideIdx : 0);
-
-  //
-  //
-  //
-  using ConvOutputIteratorParameter = epilogue::threadblock::ConvOutputIteratorParameter<
-    LayoutC,
-    typename Epilogue::OutputTileIterator::Layout, 
-    TensorRefC,
-    ConvOperator,
-    ConvProblemSize
-    >;
-
-  /// Argument structure
-  struct Arguments {
-
-    //
-    // Data members
-    //
-
-    ConvProblemSize problem_size;
-    TensorRefA ref_A;
-    TensorRefB ref_B;
-    TensorRefC ref_C;
-    TensorRefC ref_D;
-    typename EpilogueOutputOp::Params output_op;
-    SplitKMode split_k_mode;
-
-    //
-    // Methods
-    //
-
-    /// Default ctor
-    CUTLASS_HOST_DEVICE
-    Arguments() { }
-   
-    CUTLASS_HOST_DEVICE 
-    Arguments(
-      ConvProblemSize const & problem_size
-    ):
-      problem_size(problem_size) { }
-
-    CUTLASS_HOST_DEVICE
-    Arguments(
-      ConvProblemSize const & problem_size,
-      TensorRefA const & ref_A,
-      TensorRefB const & ref_B,
-      TensorRefC const & ref_C,
-      TensorRefC const & ref_D,
-      typename EpilogueOutputOp::Params const & output_op,
-      SplitKMode const & split_k_mode = SplitKMode::kSerial
-    ):
-      problem_size(problem_size),
-      ref_A(ref_A),
-      ref_B(ref_B),
-      ref_C(ref_C),
-      ref_D(ref_D),
-      output_op(output_op),
-      split_k_mode(split_k_mode)
-    {
-
-    }
-
-  };
-
-  /// Parameters structure
-  struct Params {
-    ConvProblemSize problem_size;
-    cutlass::gemm::GemmCoord grid_tiled_shape;
-    gemm::GemmCoord implicit_gemm_problem_size;
-    int swizzle_log_tile;
-
-    int gemm_k_iterations;
-    int gemm_k_iterations_per_channel;
-    typename Mma::IteratorA::Params iterator_A;
-    typename Mma::IteratorA::Element const *ptr_A;
-    typename Mma::IteratorB::Params iterator_B;
-    typename Mma::IteratorB::Element const *ptr_B;
-    typename Epilogue::OutputTileIterator::Params iterator_C;
-    typename Epilogue::OutputTileIterator::Element *ptr_C;
-    typename Epilogue::OutputTileIterator::Params iterator_D;
-    typename Epilogue::OutputTileIterator::Element *ptr_D;
-    typename EpilogueOutputOp::Params output_op;
-    int *semaphore;
-    SplitKMode split_k_mode;
-
-    //
-    // Methods
-    //
-
-    CUTLASS_HOST_DEVICE
-    Params(): swizzle_log_tile(0), gemm_k_iterations(0) { }
-
-    /// 
-    CUTLASS_HOST_DEVICE
-    Params(
-      Arguments const &args,
-      int *semaphore = nullptr
-    ):
-      problem_size(args.problem_size),
-      implicit_gemm_problem_size(cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size)),
-      iterator_A(Mma::IteratorA::getParams(args.problem_size, args.ref_A.layout())),
-      ptr_A(args.ref_A.data()),
-      iterator_B(args.problem_size, args.ref_B.layout()),
-      ptr_B(args.ref_B.data()),
-      iterator_C(ConvOutputIteratorParameter::layout(args.ref_C), implicit_gemm_tensor_c_extent(kConvolutionalOperator, args.problem_size)),
-      ptr_C(args.ref_C.data()),
-      iterator_D(ConvOutputIteratorParameter::layout(args.ref_D), implicit_gemm_tensor_c_extent(kConvolutionalOperator, args.problem_size)),
-      ptr_D(args.ref_D.data()),
-      output_op(args.output_op),
-      semaphore(semaphore),
-      split_k_mode(args.split_k_mode)
-    {
-      gemm_k_iterations = implicit_gemm_k_iterations(
-        kConvolutionalOperator,
-        ThreadblockShape::kK,
-        args.problem_size,
-        kIteratorAlgorithm,
-        kGroupMode,
-        ThreadblockShape::kN);
-
-      gemm_k_iterations_per_channel = implicit_gemm_k_iterations_per_channel(
-          kConvolutionalOperator, args.problem_size, kIteratorAlgorithm);
-
-      ThreadblockSwizzle threadblock_swizzle;
-
-      grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
-        implicit_gemm_problem_size,
-        {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-        args.problem_size.split_k_slices);
-
-      swizzle_log_tile = threadblock_swizzle.get_log_tile(grid_tiled_shape);
-    }
-  };
-
-  /// Shared memory storage structure
-  union SharedStorage {
-    typename Mma::SharedStorage main_loop;
-    typename Epilogue::SharedStorage epilogue;
-  };
-
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  ImplicitGemmConvolution() { } 
-
-  /// Executes one ImplicitGEMM
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-
-    // Compute threadblock location
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord threadblock_tile_idx =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    // Early exit if CTA is out of range
-    if (params.grid_tiled_shape.m() <= threadblock_tile_idx.m() ||
-      params.grid_tiled_shape.n() <= threadblock_tile_idx.n()) {
-
-      return;
-    }
-
-    // Compute position within threadblock
-    int thread_idx = threadIdx.x;
-    int iterator_A_column_offset = threadblock_tile_idx.k() * Mma::Shape::kK;
-    if (kGroupMode != GroupMode::kNone) {
-      if (kGroupMode != GroupMode::kDepthwise) {
-        int k_per_group = params.problem_size.K / params.problem_size.groups;
-        int group_idx = threadblock_tile_idx.n() * Mma::Shape::kN / k_per_group;
-        int channels_per_group = params.problem_size.C / params.problem_size.groups;
-        iterator_A_column_offset += group_idx * channels_per_group;
-      } else {
-        iterator_A_column_offset += threadblock_tile_idx.n() * Mma::Shape::kN;
-      }
-    } 
-
-    // Construct iterators to A and B operands
-    typename Mma::IteratorA iterator_A(
-      params.iterator_A,
-      params.problem_size,
-      params.ptr_A,
-      thread_idx,
-      MatrixCoord(
-        threadblock_tile_idx.m() * Mma::Shape::kM,
-        iterator_A_column_offset
-      )
-    );
-    
-    typename Mma::IteratorB iterator_B(
-      params.iterator_B,
-      params.problem_size,
-      params.ptr_B,
-      thread_idx,
-      MatrixCoord(
-        threadblock_tile_idx.k() * Mma::Shape::kK,
-        threadblock_tile_idx.n() * Mma::Shape::kN
-      )
-    );
-
-    // Broadcast the warp_id computed by lane 0 to ensure dependent code
-    // is compiled as warp-uniform.
-    int warp_idx = canonical_warp_idx_sync();
-    int lane_idx = threadIdx.x % 32;
-
-    //
-    // Main loop
-    //
-
-    // Construct thread-scoped matrix multiply
-    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
-
-    typename Mma::FragmentC accumulators;
-
-    accumulators.clear();
-
-    // Compute threadblock-scoped matrix multiply-add
-    mma(params.gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators, params.gemm_k_iterations_per_channel);
-
-    //
-    // Epilogue
-    //
-
-    EpilogueOutputOp output_op(params.output_op);
-
-    // Construct the semaphore.
-    int block_idx = threadblock_tile_idx.m() + threadblock_tile_idx.n() * params.grid_tiled_shape.m();
-
-    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
-    
-    // Compute logical position within grid
-    threadblock_tile_idx =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    // If performing a reduction via split-K, fetch the initial synchronization
-    if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) {
-        
-      // Fetch the synchronization lock initially but do not block.
-      semaphore.fetch();
-
-      // Indicate which position in a serial reduction the output operator is currently updating
-      output_op.set_k_partition(threadblock_tile_idx.k(), params.grid_tiled_shape.k());
-    }
-
-    MatrixCoord threadblock_offset(
-      threadblock_tile_idx.m() * Mma::Shape::kM,
-      threadblock_tile_idx.n() * Mma::Shape::kN
-    );
-
-    // Tile iterator writing to destination tensor
-    typename Epilogue::OutputTileIterator iterator_D(
-      params.iterator_D,
-      params.ptr_D,
-      ConvOutputIteratorParameter::extent(params.problem_size),
-      thread_idx,
-      threadblock_offset
-    );
-    
-    // Tile iterator reading from source accumulator tensor
-    typename Epilogue::OutputTileIterator iterator_C(
-      params.iterator_C,
-      params.ptr_C,
-      ConvOutputIteratorParameter::extent(params.problem_size),
-      thread_idx,
-      threadblock_offset
-    );
-
-    // Construct the epilogue
-    Epilogue epilogue(
-      shared_storage.epilogue, 
-      thread_idx, 
-      warp_idx, 
-      lane_idx);
-
-    // Wait on the semaphore - this latency may have been covered by iterator construction
-    if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) {
-        
-      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
-      if (threadblock_tile_idx.k()) {
-        iterator_C = iterator_D;
-      }
-
-      semaphore.wait(threadblock_tile_idx.k());
-
-    }
-    // Each split-k-slice writes to a unique tensor location
-    else if (params.split_k_mode == SplitKMode::kParallel) {
-      iterator_D.add_pointer_offset(threadblock_tile_idx.k() * 
-        cutlass::conv::implicit_gemm_tensor_c_size(ConvOperator, params.problem_size));
-    }
-
-    // Run efficient epilogue
-    epilogue(output_op, iterator_D, accumulators, iterator_C);
-  
-    //
-    // Release the semaphore
-    //
-
-    if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { 
-
-      int lock = 0;
-      if (params.grid_tiled_shape.k() == threadblock_tile_idx.k() + 1) {
-
-        // The final threadblock resets the semaphore for subsequent grids.
-        lock = 0;
-      }
-      else {
-        // Otherwise, the semaphore is incremented
-        lock = threadblock_tile_idx.k() + 1;
-      }
-      
-      semaphore.release(lock);
-    }
-  } 
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_fusion.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_fusion.h
deleted file mode 100755
index 74ecae401..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_fusion.h
+++ /dev/null
@@ -1,461 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a pipelined fused activation's scale+bias+relu and Implicit GEMM kernel.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/array.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/semaphore.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-#include "cutlass/conv/conv3d_problem_size.h"
-#include "cutlass/epilogue/threadblock/output_iterator_parameter.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Mma_,                                  ///! Threadblock-scoped matrix multiply-accumulate 
-  typename Epilogue_,                             ///! Epilogue
-  typename ThreadblockSwizzle_,                   ///! Threadblock swizzling function
-  conv::Operator ConvOperator,                    ///! Convolutional operator (Fprop, Dgrad, Wgrad)
-  typename ConvProblemSize_ = Conv2dProblemSize   ///! Convolutional operator on 2D or 3D problem
->
-struct ImplicitGemmConvolutionFusion {
-
-  using Mma = Mma_;
-  using Epilogue = Epilogue_;
-  using EpilogueOutputOp = typename Epilogue::OutputOp;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  static Operator const kConvolutionalOperator = ConvOperator;
-
-  using ElementA = typename Mma::IteratorA::Element;
-  using LayoutA = typename Mma::IteratorA::Layout;
-  using ElementB = typename Mma::IteratorB::Element;
-  using LayoutB = typename Mma::IteratorB::Layout;
-
-  using ElementScaleBias = typename Mma::IteratorScaleBias::Element;
-  using LayoutScaleBias = typename Mma::IteratorScaleBias::Layout;
-
-  using ElementC = typename EpilogueOutputOp::ElementOutput;
-  using LayoutC = LayoutA;
-
-  using ElementAccumulator = typename EpilogueOutputOp::ElementAccumulator;
-  using ElementCompute = typename EpilogueOutputOp::ElementCompute;
-
-  using WarpMmaOperator = typename Mma::Policy::Operator;
-
-  using ArchMmaOperator = typename WarpMmaOperator::ArchMmaOperator;
-  using MathOperator = typename ArchMmaOperator::Operator;
-  
-  using OperatorClass = typename WarpMmaOperator::OperatorClass;
-  using ArchTag = typename WarpMmaOperator::ArchTag;
-
-  using ThreadblockShape = typename Mma::Shape;
-  using WarpShape = typename WarpMmaOperator::Shape;
-  using InstructionShape = typename ArchMmaOperator::Shape;
-
-  static int const kStages = Mma::kStages;
-  static IteratorAlgorithm const kIteratorAlgorithm = Mma::IteratorA::kIteratorAlgorithm; 
- 
-  /// Warp count (concept: GemmShape)
-  using WarpCount = typename Mma::WarpCount;
-  static int const kThreadCount = 32 * WarpCount::kCount;
-
-  using TensorRefA = typename Mma::IteratorA::TensorRef;
-  using TensorRefB = typename Mma::IteratorB::TensorRef;
-  using TensorRefScaleBias = typename Mma::IteratorScaleBias::TensorRef;
-  using TensorRefC = cutlass::TensorRef<ElementC, LayoutC>;
-
-  /// Check iterator A and B convolution dimension are the same and 
-  // set device::ImplicitGemmConvolution::kConvDim
-  static_assert(Mma::IteratorA::kConvDim == Mma::IteratorB::kConvDim, 
-    "Convolution on different different dimensions is not supported");
-  static int const kConvDim = Mma::IteratorA::kConvDim;
-
-  /// Conv dimension and problem size structure (Conv2d or Conv3d)
-  using ConvProblemSize = ConvProblemSize_;
-
-  static conv::GroupMode const kGroupMode = conv::GroupMode::kNone;
-
-  /// Wgrad C stride idx for implicit gemm algorithm 
-  // Conv2d row-major matrix C (KxRSC) 
-  // Conv3d row-major matrix C (KxTRSC)
-  static int const kWgradCStrideIdx = 
-    platform::is_same<LayoutC, cutlass::layout::TensorNHWC>::value ? 2 : 3;
-
-  /// This chooses the appropriate stride element of the C tensor.
-  static int const kTensorCStrideIdx = 
-    (kConvolutionalOperator == conv::Operator::kWgrad ? kWgradCStrideIdx : 0);
-
-  //
-  //
-  //
-  using ConvOutputIteratorParameter = epilogue::threadblock::ConvOutputIteratorParameter<
-    LayoutC,
-    typename Epilogue::OutputTileIterator::Layout, 
-    TensorRefC,
-    ConvOperator,
-    ConvProblemSize
-    >;
-
-  /// Argument structure
-  struct Arguments {
-
-    //
-    // Data members
-    //
-
-    ConvProblemSize problem_size;
-    TensorRefA ref_A;
-    TensorRefB ref_B;
-    TensorRefScaleBias ref_scale;
-    TensorRefScaleBias ref_bias;
-    TensorRefC ref_C;
-    TensorRefC ref_D;
-    typename EpilogueOutputOp::Params output_op;
-    SplitKMode split_k_mode;
-
-    //
-    // Methods
-    //
-
-    /// Default ctor
-    CUTLASS_HOST_DEVICE
-    Arguments() { }
-   
-    CUTLASS_HOST_DEVICE 
-    Arguments(
-      ConvProblemSize const & problem_size
-    ):
-      problem_size(problem_size) { }
-
-    CUTLASS_HOST_DEVICE
-    Arguments(
-      ConvProblemSize const & problem_size,
-      TensorRefA const & ref_A,
-      TensorRefB const & ref_B,
-      TensorRefScaleBias const & ref_scale,
-      TensorRefScaleBias const & ref_bias,
-      TensorRefC const & ref_C,
-      TensorRefC const & ref_D,
-      typename EpilogueOutputOp::Params const & output_op,
-      SplitKMode const & split_k_mode = SplitKMode::kSerial
-    ):
-      problem_size(problem_size),
-      ref_A(ref_A),
-      ref_B(ref_B),
-      ref_scale(ref_scale),
-      ref_bias(ref_bias),
-      ref_C(ref_C),
-      ref_D(ref_D),
-      output_op(output_op),
-      split_k_mode(split_k_mode)
-    {
-
-    }
-
-  };
-
-  /// Parameters structure
-  struct Params {
-    ConvProblemSize problem_size{};
-    cutlass::gemm::GemmCoord grid_tiled_shape{};
-    gemm::GemmCoord implicit_gemm_problem_size{};
-    int swizzle_log_tile{0};
-    int gemm_k_iterations{0};
-    typename Mma::IteratorA::Params iterator_A{};
-    typename Mma::IteratorA::Element const *ptr_A = nullptr;
-    typename Mma::IteratorB::Params iterator_B{};
-    typename Mma::IteratorB::Element const *ptr_B = nullptr;
-    typename Mma::IteratorScaleBias::Params iterator_scale_bias{};
-    typename Mma::IteratorScaleBias::Element const *ptr_scale = nullptr;
-    typename Mma::IteratorScaleBias::Element const *ptr_bias = nullptr;
-    typename Epilogue::OutputTileIterator::Params iterator_C {};
-    typename Epilogue::OutputTileIterator::Element *ptr_C = nullptr;
-    typename Epilogue::OutputTileIterator::Params iterator_D {};
-    typename Epilogue::OutputTileIterator::Element *ptr_D = nullptr;
-    typename EpilogueOutputOp::Params output_op {};
-    int *semaphore = nullptr;
-    SplitKMode split_k_mode {};
-
-    //
-    // Methods
-    //
-    Params() = default;
-
-    /// 
-    CUTLASS_HOST_DEVICE
-    Params(
-      Arguments const &args,
-      int *semaphore = nullptr
-    ):
-      problem_size(args.problem_size),
-      implicit_gemm_problem_size(cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size)),
-      iterator_A(Mma::IteratorA::getParams(args.problem_size, args.ref_A.layout())),
-      ptr_A(args.ref_A.data()),
-      iterator_B(args.problem_size, args.ref_B.layout()),
-      ptr_B(args.ref_B.data()),
-      iterator_scale_bias(args.problem_size, args.ref_scale.layout()),
-      ptr_scale(args.ref_scale.data()),
-      ptr_bias(args.ref_bias.data()),
-      iterator_C(ConvOutputIteratorParameter::layout(args.ref_C)),
-      ptr_C(args.ref_C.data()),
-      iterator_D(ConvOutputIteratorParameter::layout(args.ref_D)),
-      ptr_D(args.ref_D.data()),
-      output_op(args.output_op),
-      semaphore(semaphore),
-      split_k_mode(args.split_k_mode)
-    {
-      gemm_k_iterations = implicit_gemm_k_iterations(kConvolutionalOperator, ThreadblockShape::kK, args.problem_size);
-
-      ThreadblockSwizzle threadblock_swizzle;
-
-      grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
-        implicit_gemm_problem_size,
-        {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-        args.problem_size.split_k_slices);
-
-      swizzle_log_tile = threadblock_swizzle.get_log_tile(grid_tiled_shape);
-    }
-  };
-
-  /// Shared memory storage structure
-  union SharedStorage {
-    typename Mma::SharedStorage main_loop;
-    typename Epilogue::SharedStorage epilogue;
-  };
-
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  ImplicitGemmConvolutionFusion() { } 
-
-  /// Executes one ImplicitGEMM
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-
-    // Compute threadblock location
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord threadblock_tile_idx =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    // Early exit if CTA is out of range
-    if (params.grid_tiled_shape.m() <= threadblock_tile_idx.m() ||
-      params.grid_tiled_shape.n() <= threadblock_tile_idx.n()) {
-
-      return;
-    }
-
-    // Compute position within threadblock
-    int thread_idx = threadIdx.x;
-
-    // Construct iterators to A operand
-    typename Mma::IteratorA iterator_A(
-      params.iterator_A,
-      params.problem_size,
-      params.ptr_A,
-      thread_idx,
-      MatrixCoord(
-        threadblock_tile_idx.m() * Mma::Shape::kM,
-        threadblock_tile_idx.k() * Mma::Shape::kK
-      )
-    );
-    
-    // Construct iterators to B operand
-    typename Mma::IteratorB iterator_B(
-      params.iterator_B,
-      params.problem_size,
-      params.ptr_B,
-      thread_idx,
-      MatrixCoord(
-        threadblock_tile_idx.k() * Mma::Shape::kK,
-        threadblock_tile_idx.n() * Mma::Shape::kN
-      )
-    );
- 
-    // Construct iterators to A scale/bias vector
-    typename Mma::IteratorScaleBias iterator_scale_bias(
-      params.iterator_scale_bias,
-      params.problem_size,
-      params.ptr_scale,
-      params.ptr_bias,
-      thread_idx,
-      MatrixCoord(
-        0, (kConvolutionalOperator == conv::Operator::kFprop) ?
-                  (threadblock_tile_idx.k() * Mma::Shape::kK) :
-                  // Wgrad
-                  (threadblock_tile_idx.n() * Mma::Shape::kN)
-      )
-    );
-
-    // Broadcast the warp_id computed by lane 0 to ensure dependent code
-    // is compiled as warp-uniform.
-    int warp_idx = canonical_warp_idx_sync();
-    int lane_idx = threadIdx.x % 32;
-
-    //
-    // Main loop
-    //
-
-    // Construct thread-scoped matrix multiply
-    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
-
-    typename Mma::FragmentC accumulators;
-
-    accumulators.clear();
-
-    // Compute threadblock-scoped matrix multiply-add
-    mma(params.gemm_k_iterations, accumulators, iterator_A,
-        iterator_B, iterator_scale_bias, accumulators);
-
-    //
-    // Epilogue
-    //
-
-    EpilogueOutputOp output_op(params.output_op);
-
-    // Construct the semaphore.
-    int block_idx = threadblock_tile_idx.m() + threadblock_tile_idx.n() * params.grid_tiled_shape.m();
-
-    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
-    
-    // Compute logical position within grid
-    threadblock_tile_idx =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    // If performing a reduction via split-K, fetch the initial synchronization
-    if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) {
-        
-      // Fetch the synchronization lock initially but do not block.
-      semaphore.fetch();
-
-      // Indicate which position in a serial reduction the output operator is currently updating
-      output_op.set_k_partition(threadblock_tile_idx.k(), params.grid_tiled_shape.k());
-    }
-
-    MatrixCoord threadblock_offset(
-      threadblock_tile_idx.m() * Mma::Shape::kM,
-      threadblock_tile_idx.n() * Mma::Shape::kN
-    );
-
-    // Tile iterator writing to destination tensor
-    typename Epilogue::OutputTileIterator iterator_D(
-      params.iterator_D,
-      params.ptr_D,
-      ConvOutputIteratorParameter::extent(params.problem_size),
-      thread_idx,
-      threadblock_offset
-    );
-    
-    // Tile iterator reading from source accumulator tensor
-    typename Epilogue::OutputTileIterator iterator_C(
-      params.iterator_C,
-      params.ptr_C,
-      ConvOutputIteratorParameter::extent(params.problem_size),
-      thread_idx,
-      threadblock_offset
-    );
-
-    // Construct the epilogue
-    Epilogue epilogue(
-      shared_storage.epilogue, 
-      thread_idx, 
-      warp_idx, 
-      lane_idx);
-
-    // Wait on the semaphore - this latency may have been covered by iterator construction
-    if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) {
-        
-      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
-      if (threadblock_tile_idx.k()) {
-        iterator_C = iterator_D;
-      }
-
-      semaphore.wait(threadblock_tile_idx.k());
-
-    }
-    // Each split-k-slice writes to a unique tensor location
-    else if (params.split_k_mode == SplitKMode::kParallel) {
-      iterator_D.add_pointer_offset(threadblock_tile_idx.k() * 
-        cutlass::conv::implicit_gemm_tensor_c_size(ConvOperator, params.problem_size));
-    }
-
-    // Run efficient epilogue
-    epilogue(output_op, iterator_D, accumulators, iterator_C);
-  
-    //
-    // Release the semaphore
-    //
-
-    if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { 
-
-      int lock = 0;
-      if (params.grid_tiled_shape.k() == threadblock_tile_idx.k() + 1) {
-
-        // The final threadblock resets the semaphore for subsequent grids.
-        lock = 0;
-      }
-      else {
-        // Otherwise, the semaphore is incremented
-        lock = threadblock_tile_idx.k() + 1;
-      }
-      
-      semaphore.release(lock);
-    }
-  } 
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_strided_dgrad.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_strided_dgrad.h
deleted file mode 100755
index bf00f90ba..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_strided_dgrad.h
+++ /dev/null
@@ -1,492 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a pipelined Implicit GEMM kernel.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/array.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/semaphore.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-#include "cutlass/conv/conv3d_problem_size.h"
-#include "cutlass/epilogue/threadblock/output_iterator_parameter.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Mma_,                                  ///! Threadblock-scoped matrix multiply-accumulate 
-  typename Epilogue_,                             ///! Epilogue
-  typename ThreadblockSwizzle_,                   ///! Threadblock swizzling function
-  conv::Operator ConvOperator,                    ///! Convolutional operator (Fprop, Dgrad, Wgrad)
-  typename ConvProblemSize_ = Conv2dProblemSize   ///! Convolutional operator on 2D or 3D problem
->
-struct ImplicitGemmConvolutionStridedDgrad {
-
-  using Mma = Mma_;
-  using Epilogue = Epilogue_;
-  using EpilogueOutputOp = typename Epilogue::OutputOp;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  static Operator const kConvolutionalOperator = ConvOperator;
-
-  using ElementA = typename Mma::IteratorA::Element;
-  using LayoutA = typename Mma::IteratorA::Layout;
-  using ElementB = typename Mma::IteratorB::Element;
-  using LayoutB = typename Mma::IteratorB::Layout;
-  using ElementC = typename EpilogueOutputOp::ElementOutput;
-
-  /// Set output tensor C layout
-  using LayoutC = LayoutA;
-
-  using ElementAccumulator = typename EpilogueOutputOp::ElementAccumulator;
-  using ElementCompute = typename EpilogueOutputOp::ElementCompute;
-
-  using WarpMmaOperator = typename Mma::Policy::Operator;
-
-  using ArchMmaOperator = typename WarpMmaOperator::ArchMmaOperator;
-  using MathOperator = typename ArchMmaOperator::Operator;
-  
-  using OperatorClass = typename WarpMmaOperator::OperatorClass;
-  using ArchTag = typename WarpMmaOperator::ArchTag;
-
-  using ThreadblockShape = typename Mma::Shape;
-  using WarpShape = typename WarpMmaOperator::Shape;
-  using InstructionShape = typename ArchMmaOperator::Shape;
-
-  static int const kStages = Mma::kStages;
-  static IteratorAlgorithm const kIteratorAlgorithm = Mma::IteratorA::kIteratorAlgorithm; 
-  static StrideSupport const kStrideSupport = Mma::IteratorA::kStrideSupport;
-  
-  /// Warp count (concept: GemmShape)
-  using WarpCount = typename Mma::WarpCount;
-  static int const kThreadCount = 32 * WarpCount::kCount;
-
-  using TensorRefA = typename Mma::IteratorA::TensorRef;
-  using TensorRefB = typename Mma::IteratorB::TensorRef;
-  using TensorRefC = cutlass::TensorRef<ElementC, LayoutC>;
-
-  /// Check iterator A and B convolution dimension are the same and 
-  // set device::ImplicitGemmConvolution::kConvDim
-  static_assert(Mma::IteratorA::kConvDim == Mma::IteratorB::kConvDim, 
-    "Convolution on different different dimensions is not supported");
-  static int const kConvDim = Mma::IteratorA::kConvDim;
-
-  /// Conv dimension and problem size structure (Conv2d or Conv3d)
-  using ConvProblemSize = ConvProblemSize_;
-
-  static conv::GroupMode const kGroupMode = conv::GroupMode::kNone;
-
-  /// Wgrad C stride idx for implicit gemm algorithm 
-  // Conv2d row-major matrix C (KxRSC) 
-  // Conv3d row-major matrix C (KxTRSC)
-  static int const kWgradCStrideIdx = 
-    platform::is_same<LayoutC, cutlass::layout::TensorNHWC>::value ? 2 : 3;
-
-  /// This chooses the appropriate stride element of the C tensor.
-  static int const kTensorCStrideIdx = 
-    (kConvolutionalOperator == conv::Operator::kWgrad ? kWgradCStrideIdx : 0);
-
-  // Strided dgrad uses a specialized threadblock swizzle for functionality and performance
-  static_assert((platform::is_same<ThreadblockSwizzle,
-                      threadblock::StridedDgradHorizontalThreadblockSwizzle>::value) ||
-                (platform::is_same<ThreadblockSwizzle,
-                      threadblock::StridedDgradIdentityThreadblockSwizzle<1>>::value) ||
-                (platform::is_same<ThreadblockSwizzle,
-                      threadblock::StridedDgradIdentityThreadblockSwizzle<4>>::value) ||
-                (platform::is_same<ThreadblockSwizzle,
-                      threadblock::StridedDgradIdentityThreadblockSwizzle<8>>::value),
-    "Needs ThreadblockSwizzle type specialized for strided dgrad");
-
-  //
-  //
-  //
-  using ConvOutputIteratorParameter = epilogue::threadblock::ConvOutputIteratorParameter<
-    LayoutC,
-    typename Epilogue::OutputTileIterator::Layout, 
-    TensorRefC,
-    ConvOperator,
-    ConvProblemSize
-    >;
-
-  /// Argument structure
-  struct Arguments {
-
-    //
-    // Data members
-    //
-
-    ConvProblemSize problem_size{};
-    TensorRefA ref_A{};
-    TensorRefB ref_B{};
-    TensorRefC ref_C{};
-    TensorRefC ref_D{};
-    typename EpilogueOutputOp::Params output_op{};
-    SplitKMode split_k_mode{};
-
-    //
-    // Methods
-    //
-
-    /// Default ctor
-    Arguments() = default;
-   
-    CUTLASS_HOST_DEVICE 
-    Arguments(
-      ConvProblemSize const & problem_size
-    ):
-      problem_size(problem_size) { }
-
-    CUTLASS_HOST_DEVICE
-    Arguments(
-      ConvProblemSize const & problem_size,
-      TensorRefA const & ref_A,
-      TensorRefB const & ref_B,
-      TensorRefC const & ref_C,
-      TensorRefC const & ref_D,
-      typename EpilogueOutputOp::Params const & output_op,
-      SplitKMode const & split_k_mode = SplitKMode::kSerial
-    ):
-      problem_size(problem_size),
-      ref_A(ref_A),
-      ref_B(ref_B),
-      ref_C(ref_C),
-      ref_D(ref_D),
-      output_op(output_op),
-      split_k_mode(split_k_mode)
-    {
-
-    }
-
-  };
-
-  /// Parameters structure
-  struct Params {
-    ConvProblemSize problem_size{};
-    cutlass::gemm::GemmCoord grid_tiled_shape{};
-    int swizzle_log_tile{0};
-    FastDivmod stride_h_divmod{};
-    FastDivmod stride_w_divmod{};
-    int gemm_k_iterations{0};
-    typename Mma::IteratorA::Params iterator_A{};
-    typename Mma::IteratorA::Element const *ptr_A = nullptr;
-    typename Mma::IteratorB::Params iterator_B{};
-    typename Mma::IteratorB::Element const *ptr_B = nullptr;
-    typename Epilogue::OutputTileIterator::Params iterator_C{};
-    typename Epilogue::OutputTileIterator::Element *ptr_C = nullptr;
-    typename Epilogue::OutputTileIterator::Params iterator_D{};
-    typename Epilogue::OutputTileIterator::Element *ptr_D = nullptr;
-    typename EpilogueOutputOp::Params output_op {};
-    int *semaphore = nullptr;
-    SplitKMode split_k_mode {};
-
-    //
-    // Methods
-    //
-    Params() = default;
-
-    /// 
-    CUTLASS_HOST_DEVICE
-    Params(
-      Arguments const &args,
-      int *semaphore = nullptr
-    ):
-      problem_size(args.problem_size),
-      stride_h_divmod(args.problem_size.stride_h),
-      stride_w_divmod(args.problem_size.stride_w),
-      iterator_A(Mma::IteratorA::getParams(args.problem_size, args.ref_A.layout())),
-      ptr_A(args.ref_A.data()),
-      iterator_B(args.problem_size, args.ref_B.layout()),
-      ptr_B(args.ref_B.data()),
-      iterator_C(ConvOutputIteratorParameter::layout(args.ref_C), args.problem_size, ThreadblockShape::kM),
-      ptr_C(args.ref_C.data()),
-      iterator_D(ConvOutputIteratorParameter::layout(args.ref_D), args.problem_size, ThreadblockShape::kM),
-      ptr_D(args.ref_D.data()),
-      output_op(args.output_op),
-      semaphore(semaphore),
-      split_k_mode(args.split_k_mode)
-    {
-      gemm_k_iterations = implicit_gemm_k_iterations(kConvolutionalOperator, ThreadblockShape::kK, args.problem_size);
-
-      ThreadblockSwizzle threadblock_swizzle;
-
-      grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
-        kConvolutionalOperator,
-        args.problem_size,
-        {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-        args.problem_size.split_k_slices);
-      
-      swizzle_log_tile = threadblock_swizzle.get_log_tile(grid_tiled_shape);
-    }
-  };
-
-  /// Shared memory storage structure
-  union SharedStorage {
-    typename Mma::SharedStorage main_loop;
-    typename Epilogue::SharedStorage epilogue;
-  };
-  
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  ImplicitGemmConvolutionStridedDgrad() { } 
-
-  /// Executes one ImplicitGEMM
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-
-    // Compute threadblock location
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord threadblock_tile_idx =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    // Early exit if CTA is out of range
-    if (params.grid_tiled_shape.m() <= threadblock_tile_idx.m() ||
-      params.grid_tiled_shape.n() <= threadblock_tile_idx.n()) {
-
-      return;
-    }
-
-    // Compute position within threadblock
-    int thread_idx = threadIdx.x;
-
-    // Compute starting filter position for strided dgrad
-    int tile_m_per_filter = strided_dgrad_tile_m_per_filter(params.problem_size, 
-                                                            ThreadblockShape::kM);
-    int filter_tile_m = (threadblock_tile_idx.m() / tile_m_per_filter);
-    
-
-    // The subsequent fast_divmod() operations are equivalent to the following logical computation:
-    //
-    // int start_r = filter_tile_m / (params.problem_size.stride_w);
-    // int start_s = filter_tile_m % (params.problem_size.stride_w);
-
-    int start_r, start_s;
-    params.stride_w_divmod(start_r, start_s, filter_tile_m);
-
-    int filter_r = start_r;
-    int filter_s = start_s;
-
-    if (params.problem_size.mode == Mode::kConvolution) {
-      filter_r = (params.problem_size.R - 1 - filter_r);
-      filter_s = (params.problem_size.S - 1 - filter_s);
-    }
-
-    // Starting h, w positions for filter position in gemm_k=0
-    int start_h, start_w;
-    strided_dgrad_starting_coords(
-      params.problem_size,
-      params.stride_h_divmod, params.stride_w_divmod,
-      filter_r, filter_s,
-      start_h, start_w);
-
-    if (start_h >= params.problem_size.H || start_w >= params.problem_size.W) {
-      return;
-    }
-
-    typename Mma::FragmentC accumulators;
-
-    accumulators.clear();
-
-    // Broadcast the warp_id computed by lane 0 to ensure dependent code
-    // is compiled as warp-uniform.
-    int warp_idx = canonical_warp_idx_sync();
-    int lane_idx = threadIdx.x % 32;
-
-    // Check if CTA contributes valid MMA (Dy * w) and accumulator will be non-zero after MMA
-    if (start_r < params.problem_size.R && start_s < params.problem_size.S) {
-      // Scale gemm_k_iterations for strided dgrad
-      int gemm_k_iterations = (params.gemm_k_iterations / (params.problem_size.R * params.problem_size.S)
-                              ) * params.problem_size.num_gemm_k_filter_positions(start_r, start_s);
-      
-      // Construct iterators to A and B operands
-      typename Mma::IteratorA iterator_A(
-        params.iterator_A,
-        params.problem_size,
-        params.ptr_A,
-        thread_idx,
-        params.stride_h_divmod, params.stride_w_divmod,
-        start_r, start_s,
-        MatrixCoord(
-          threadblock_tile_idx.m() * Mma::Shape::kM,
-          threadblock_tile_idx.k() * Mma::Shape::kK
-        ) 
-      );
-      
-      typename Mma::IteratorB iterator_B(
-        params.iterator_B,
-        params.problem_size,
-        params.ptr_B,
-        thread_idx,
-        start_r, start_s,
-        MatrixCoord(
-          threadblock_tile_idx.k() * Mma::Shape::kK,
-          threadblock_tile_idx.n() * Mma::Shape::kN
-        )
-      );
-
-      //
-      // Main loop
-      //
-
-      // Construct thread-scoped matrix multiply
-      Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
-
-      // Compute threadblock-scoped matrix multiply-add
-      mma(gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators);
-    }
-
-    //
-    // Epilogue
-    //
-
-    EpilogueOutputOp output_op(params.output_op);
-
-    // Construct the semaphore.
-    int block_idx = threadblock_tile_idx.m() + threadblock_tile_idx.n() * params.grid_tiled_shape.m();
-    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
-
-    // Compute logical position within grid
-    threadblock_tile_idx =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    // If performing a reduction via split-K, fetch the initial synchronization
-    if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) {
-
-      // Fetch the synchronization lock initially but do not block.
-      semaphore.fetch();
-
-      // Indicate which position in a serial reduction the output operator is currently updating
-      output_op.set_k_partition(threadblock_tile_idx.k(), params.grid_tiled_shape.k());
-    }
-
-    MatrixCoord threadblock_offset(
-      threadblock_tile_idx.m() * Mma::Shape::kM,
-      threadblock_tile_idx.n() * Mma::Shape::kN
-    );
-
-    // Tile iterator writing to destination tensor
-    typename Epilogue::OutputTileIterator iterator_D(
-      params.iterator_D,
-      params.ptr_D,
-      ConvOutputIteratorParameter::extent(params.problem_size),
-      thread_idx,
-      params.stride_h_divmod, params.stride_w_divmod,
-      start_r, start_s,
-      threadblock_offset
-    );
-
-    // Construct the epilogue
-    Epilogue epilogue(
-      shared_storage.epilogue,
-      thread_idx,
-      warp_idx,
-      lane_idx);
-
-    if (output_op.is_source_needed())
-    {
-      // Tile iterator reading from source accumulator tensor
-      typename Epilogue::OutputTileIterator iterator_C(
-        params.iterator_C,
-        params.ptr_C,
-        ConvOutputIteratorParameter::extent(params.problem_size),
-        thread_idx,
-        params.stride_h_divmod, params.stride_w_divmod,
-        start_r, start_s,
-        threadblock_offset);
-
-      // Wait on the semaphore - this latency may have been covered by iterator construction
-      if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) {
-
-        // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
-        if (threadblock_tile_idx.k()) {
-          iterator_C = iterator_D;
-        }
-
-        semaphore.wait(threadblock_tile_idx.k());
-      }
-
-      // Run epilogue with addend source iterator
-      epilogue(output_op, iterator_D, accumulators, iterator_C);
-    }
-    else
-    {
-      // Run epilogue without addend source iterator
-      epilogue(output_op, iterator_D, accumulators);
-    }
-
-    //
-    // Release the semaphore
-    //
-
-    if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) {
-
-      int lock = 0;
-      if (params.grid_tiled_shape.k() == threadblock_tile_idx.k() + 1) {
-
-        // The final threadblock resets the semaphore for subsequent grids.
-        lock = 0;
-      }
-      else {
-        // Otherwise, the semaphore is incremented
-        lock = threadblock_tile_idx.k() + 1;
-      }
-
-      semaphore.release(lock);
-    }
-
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_with_absmax.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_with_absmax.h
deleted file mode 100755
index b05fd2d3e..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_with_absmax.h
+++ /dev/null
@@ -1,494 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Convolution kernel with an epilogue that computes the absolute maximum value of the output
-    and a pre-activation-function auxiliary output. The auxiliary output is also (optionally)
-    stored to global memory.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/array.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/semaphore.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-#include "cutlass/conv/conv3d_problem_size.h"
-#include "cutlass/epilogue/threadblock/output_iterator_parameter.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Mma_,                                  ///! Threadblock-scoped matrix multiply-accumulate 
-  typename Epilogue_,                             ///! Epilogue
-  typename ThreadblockSwizzle_,                   ///! Threadblock swizzling function
-  conv::Operator ConvOperator,                    ///! Convolutional operator (Fprop, Dgrad, Wgrad)
-  typename ConvProblemSize_ = Conv2dProblemSize   ///! Convolutional operator on 2D or 3D problem
->
-struct ImplicitGemmConvolutionWithAbsMax {
-
-  using Mma = Mma_;
-  using Epilogue = Epilogue_;
-  using EpilogueOutputOp = typename Epilogue::OutputOp;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  static Operator const kConvolutionalOperator = ConvOperator;
-
-  using ElementA = typename Mma::IteratorA::Element;
-  using LayoutA = typename Mma::IteratorA::Layout;
-  using ElementB = typename Mma::IteratorB::Element;
-  using LayoutB = typename Mma::IteratorB::Layout;
-  using ElementC = typename EpilogueOutputOp::ElementOutput;
-
-  /// Set output tensor C layout
-  using LayoutC = LayoutA;
-
-  using ElementAccumulator = typename EpilogueOutputOp::ElementAccumulator;
-  using ElementCompute = typename EpilogueOutputOp::ElementCompute;
-
-  using WarpMmaOperator = typename Mma::Policy::Operator;
-
-  using ArchMmaOperator = typename WarpMmaOperator::ArchMmaOperator;
-  using MathOperator = typename ArchMmaOperator::Operator;
-  
-  using OperatorClass = typename WarpMmaOperator::OperatorClass;
-  using ArchTag = typename WarpMmaOperator::ArchTag;
-
-  using ThreadblockShape = typename Mma::Shape;
-  using WarpShape = typename WarpMmaOperator::Shape;
-  using InstructionShape = typename ArchMmaOperator::Shape;
-
-  static int const kStages = Mma::kStages;
-  static IteratorAlgorithm const kIteratorAlgorithm = Mma::IteratorA::kIteratorAlgorithm; 
-  static StrideSupport const kStrideSupport = Mma::IteratorA::kStrideSupport;
-
-  /// Warp count (concept: GemmShape)
-  using WarpCount = typename Mma::WarpCount;
-  static int const kThreadCount = 32 * WarpCount::kCount;
-
-  using TensorRefA = typename Mma::IteratorA::TensorRef;
-  using TensorRefB = typename Mma::IteratorB::TensorRef;
-  using TensorRefC = cutlass::TensorRef<ElementC, LayoutC>;
-  using TensorRefAux = cutlass::TensorRef<typename EpilogueOutputOp::ElementAuxOutput, LayoutC>;
-
-  /// Check iterator A and B convolution dimension are the same and 
-  // set device::ImplicitGemmConvolution::kConvDim
-  static_assert(Mma::IteratorA::kConvDim == Mma::IteratorB::kConvDim, 
-    "Convolution on different different dimensions is not supported");
-  static int const kConvDim = Mma::IteratorA::kConvDim;
-
-  /// Conv dimension and problem size structure (Conv2d or Conv3d)
-  using ConvProblemSize = ConvProblemSize_;
-
-  static conv::GroupMode const kGroupMode = conv::GroupMode::kNone;
-
-  /// Wgrad C stride idx for implicit gemm algorithm 
-  // Conv2d row-major matrix C (KxRSC) 
-  // Conv3d row-major matrix C (KxTRSC)
-  static int const kWgradCStrideIdx = 
-    platform::is_same<LayoutC, cutlass::layout::TensorNHWC>::value ? 2 : 3;
-
-  /// This chooses the appropriate stride element of the C tensor.
-  static int const kTensorCStrideIdx = 
-    (kConvolutionalOperator == conv::Operator::kWgrad ? kWgradCStrideIdx : 0);
-
-  //
-  //
-  //
-  using ConvOutputIteratorParameter = epilogue::threadblock::ConvOutputIteratorParameter<
-    LayoutC,
-    typename Epilogue::OutputTileIterator::Layout, 
-    TensorRefC,
-    ConvOperator,
-    ConvProblemSize
-    >;
-
-  /// Argument structure
-  struct Arguments {
-
-    //
-    // Data members
-    //
-
-    ConvProblemSize problem_size;
-    TensorRefA ref_A;
-    TensorRefB ref_B;
-    TensorRefC ref_C;
-    TensorRefC ref_D;
-    TensorRefC ref_Aux;
-
-    typename EpilogueOutputOp::Params output_op;
-    SplitKMode split_k_mode;
-
-    void * ptr_Vector;
-
-    typename LayoutC::Stride::Index ldr;
-
-    //
-    // Methods
-    //
-
-    /// Default ctor
-    CUTLASS_HOST_DEVICE
-    Arguments() { }
-   
-    CUTLASS_HOST_DEVICE 
-    Arguments(
-      ConvProblemSize const & problem_size
-    ):
-      problem_size(problem_size) { }
-
-    CUTLASS_HOST_DEVICE
-    Arguments(
-      ConvProblemSize const & problem_size,
-      TensorRefA const & ref_A,
-      TensorRefB const & ref_B,
-      TensorRefC const & ref_C,
-      TensorRefC const & ref_D,
-      TensorRefAux const & ref_Aux,
-      typename EpilogueOutputOp::Params const & output_op,
-      SplitKMode const & split_k_mode = SplitKMode::kSerial,
-      void * ptr_Vector = nullptr,
-      typename LayoutC::Stride::Index ldr = 0
-    ):
-      problem_size(problem_size),
-      ref_A(ref_A),
-      ref_B(ref_B),
-      ref_C(ref_C),
-      ref_D(ref_D),
-      ref_Aux(ref_Aux),
-      output_op(output_op),
-      split_k_mode(split_k_mode),
-      ptr_Vector(ptr_Vector),
-      ldr(ldr)
-    {
-
-    }
-
-  };
-
-  /// Parameters structure
-  struct Params {
-    ConvProblemSize problem_size;
-    cutlass::gemm::GemmCoord grid_tiled_shape;
-    gemm::GemmCoord implicit_gemm_problem_size;
-    int swizzle_log_tile;
-
-    int gemm_k_iterations;
-    typename Mma::IteratorA::Params iterator_A;
-    typename Mma::IteratorA::Element const *ptr_A;
-    typename Mma::IteratorB::Params iterator_B;
-    typename Mma::IteratorB::Element const *ptr_B;
-    typename Epilogue::OutputTileIterator::Params iterator_C;
-    typename Epilogue::OutputTileIterator::Element *ptr_C;
-    typename Epilogue::OutputTileIterator::Params iterator_D;
-    typename Epilogue::OutputTileIterator::Element *ptr_D;
-    typename Epilogue::AuxOutputTileIterator::Params iterator_Aux;
-    typename Epilogue::AuxOutputTileIterator::Element *ptr_Aux;
-    typename EpilogueOutputOp::Params output_op;
-    int *semaphore;
-    SplitKMode split_k_mode;
-
-    void * ptr_Vector;
-    typename LayoutC::Stride::Index ldr;
-
-    //
-    // Methods
-    //
-
-    CUTLASS_HOST_DEVICE
-    Params():
-      swizzle_log_tile(0), 
-      gemm_k_iterations(0),
-      ptr_Vector(nullptr),
-      ldr(0)
-    { }
-
-    /// 
-    CUTLASS_HOST_DEVICE
-    Params(
-      Arguments const &args,
-      int *semaphore = nullptr
-    ):
-      problem_size(args.problem_size),
-      implicit_gemm_problem_size(cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size)),
-      iterator_A(Mma::IteratorA::getParams(args.problem_size, args.ref_A.layout())),
-      ptr_A(args.ref_A.data()),
-      iterator_B(args.problem_size, args.ref_B.layout()),
-      ptr_B(args.ref_B.data()),
-      iterator_C(ConvOutputIteratorParameter::layout(args.ref_C)),
-      ptr_C(args.ref_C.data()),
-      iterator_D(ConvOutputIteratorParameter::layout(args.ref_D)),
-      ptr_D(args.ref_D.data()),
-      iterator_Aux(ConvOutputIteratorParameter::layout(args.ref_Aux)),
-      ptr_Aux(args.ref_Aux.data()),
-      output_op(args.output_op),
-      semaphore(semaphore),
-      split_k_mode(args.split_k_mode),
-      ptr_Vector(args.ptr_Vector), 
-      ldr(args.ldr)
-
-    {
-      gemm_k_iterations = implicit_gemm_k_iterations(kConvolutionalOperator, ThreadblockShape::kK, args.problem_size);
-
-      ThreadblockSwizzle threadblock_swizzle;
-
-      grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
-        implicit_gemm_problem_size,
-        {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-        args.problem_size.split_k_slices);
-
-      swizzle_log_tile = threadblock_swizzle.get_log_tile(grid_tiled_shape);
-    }
-  };
-
-  /// Shared memory storage structure
-  union SharedStorage {
-    typename Mma::SharedStorage main_loop;
-    typename Epilogue::SharedStorage epilogue;
-  };
-
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  ImplicitGemmConvolutionWithAbsMax() { } 
-
-  /// Executes one ImplicitGEMM
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-
-    // Compute threadblock location
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord threadblock_tile_idx =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    // Early exit if CTA is out of range
-    if (params.grid_tiled_shape.m() <= threadblock_tile_idx.m() ||
-      params.grid_tiled_shape.n() <= threadblock_tile_idx.n()) {
-
-      return;
-    }
-
-    // Compute position within threadblock
-    int thread_idx = threadIdx.x;
-
-    // Construct iterators to A and B operands
-    typename Mma::IteratorA iterator_A(
-      params.iterator_A,
-      params.problem_size,
-      params.ptr_A,
-      thread_idx,
-      MatrixCoord(
-        threadblock_tile_idx.m() * Mma::Shape::kM,
-        threadblock_tile_idx.k() * Mma::Shape::kK
-      )
-    );
-    
-    typename Mma::IteratorB iterator_B(
-      params.iterator_B,
-      params.problem_size,
-      params.ptr_B,
-      thread_idx,
-      MatrixCoord(
-        threadblock_tile_idx.k() * Mma::Shape::kK,
-        threadblock_tile_idx.n() * Mma::Shape::kN
-      )
-    );
-
-    // Broadcast the warp_id computed by lane 0 to ensure dependent code
-    // is compiled as warp-uniform.
-    int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
-    int lane_idx = threadIdx.x % 32;
-
-    //
-    // Main loop
-    //
-
-    // Construct thread-scoped matrix multiply
-    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
-
-    typename Mma::FragmentC accumulators;
-
-    accumulators.clear();
-
-    // Compute threadblock-scoped matrix multiply-add
-    mma(params.gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators);
-
-    //
-    // Epilogue
-    //
-
-    EpilogueOutputOp output_op(params.output_op);
-
-    // Construct the semaphore.
-    int block_idx = threadblock_tile_idx.m() + threadblock_tile_idx.n() * params.grid_tiled_shape.m();
-
-    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
-    
-    // Compute logical position within grid
-    threadblock_tile_idx =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    // If performing a reduction via split-K, fetch the initial synchronization
-    if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) {
-        
-      // Fetch the synchronization lock initially but do not block.
-      semaphore.fetch();
-
-      // Indicate which position in a serial reduction the output operator is currently updating
-      output_op.set_k_partition(threadblock_tile_idx.k(), params.grid_tiled_shape.k());
-    }
-
-    MatrixCoord threadblock_offset(
-      threadblock_tile_idx.m() * Mma::Shape::kM,
-      threadblock_tile_idx.n() * Mma::Shape::kN
-    );
-
-    // Tile iterator writing to destination tensor
-    typename Epilogue::OutputTileIterator iterator_D(
-      params.iterator_D,
-      params.ptr_D,
-      ConvOutputIteratorParameter::extent(params.problem_size),
-      thread_idx,
-      threadblock_offset
-    );
-
-    // Tile iterator writing to auxiliary tensor.
-    typename Epilogue::AuxOutputTileIterator iterator_Aux(
-      params.iterator_Aux,
-      params.ptr_Aux,
-      ConvOutputIteratorParameter::extent(params.problem_size),
-      thread_idx,
-      threadblock_offset
-    );
-
-    // Tile iterator reading from source accumulator tensor
-    typename Epilogue::OutputTileIterator iterator_C(
-      params.iterator_C,
-      params.ptr_C,
-      ConvOutputIteratorParameter::extent(params.problem_size),
-      thread_idx,
-      threadblock_offset
-    );
-
-    // Define the reduction output pointer and move to the appropriate place
-    typename Epilogue::ElementVector *ptr_Vector = 
-      static_cast<typename Epilogue::ElementVector *>(params.ptr_Vector);
-
-
-    // Construct the epilogue
-    Epilogue epilogue(
-      shared_storage.epilogue, 
-      thread_idx, 
-      warp_idx, 
-      lane_idx);
-
-    // Move to appropriate location for this output tile
-    if (ptr_Vector) {
-      ptr_Vector += threadblock_offset.column() + threadblock_tile_idx.m() * params.ldr;
-    }
-
-    // Wait on the semaphore - this latency may have been covered by iterator construction
-    if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) {
-        
-      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
-      if (threadblock_tile_idx.k()) {
-        iterator_C = iterator_D;
-      }
-
-      semaphore.wait(threadblock_tile_idx.k());
-
-    }
-    // Each split-k-slice writes to a unique tensor location
-    else if (params.split_k_mode == SplitKMode::kParallel) {
-      iterator_D.add_pointer_offset(threadblock_tile_idx.k() * 
-        cutlass::conv::implicit_gemm_tensor_c_size(ConvOperator, params.problem_size));
-    }
-
-    // Execute the epilogue operator to update the destination tensor.
-    epilogue(output_op,
-             // Only the final block uses Vector
-             ((params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) &&
-              (params.grid_tiled_shape.k() != threadblock_tile_idx.k() + 1))
-                 ? nullptr
-                 : ptr_Vector,
-             iterator_D,
-             accumulators,
-             iterator_C,
-             iterator_Aux,
-             ConvOutputIteratorParameter::extent(params.problem_size),
-             threadblock_offset);
-  
-    //
-    // Release the semaphore
-    //
-
-    if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { 
-
-      int lock = 0;
-      if (params.grid_tiled_shape.k() == threadblock_tile_idx.k() + 1) {
-
-        // The final threadblock resets the semaphore for subsequent grids.
-        lock = 0;
-      }
-      else {
-        // Otherwise, the semaphore is incremented
-        lock = threadblock_tile_idx.k() + 1;
-      }
-      
-      semaphore.release(lock);
-    }
-  } 
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h
deleted file mode 100755
index 1f27e0686..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h
+++ /dev/null
@@ -1,499 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a pipelined Implicit GEMM kernel.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/array.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/semaphore.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-#include "cutlass/conv/conv3d_problem_size.h"
-#include "cutlass/epilogue/threadblock/output_iterator_parameter.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Mma_,                                  ///! Threadblock-scoped matrix multiply-accumulate 
-  typename Epilogue_,                             ///! Epilogue
-  typename ThreadblockSwizzle_,                   ///! Threadblock swizzling function
-  conv::Operator ConvOperator,                    ///! Convolutional operator (Fprop, Dgrad, Wgrad, Deconv)
-  typename ConvProblemSize_ = Conv2dProblemSize   ///! Convolutional operator on 2D or 3D problem
->
-struct ImplicitGemmConvolutionWithFusedEpilogue {
-
-  using Mma = Mma_;
-  using Epilogue = Epilogue_;
-  using EpilogueOutputOp = typename Epilogue::OutputOp;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  static Operator const kConvolutionalOperator = ConvOperator;
-
-  using ElementA = typename Mma::IteratorA::Element;
-  using LayoutA = typename Mma::IteratorA::Layout;
-  using ElementB = typename Mma::IteratorB::Element;
-  using LayoutB = typename Mma::IteratorB::Layout;
-  using ElementC = typename EpilogueOutputOp::ElementOutput;
-
-  /// Set output tensor C layout
-  using LayoutC = LayoutA;
-
-  using ElementAccumulator = typename EpilogueOutputOp::ElementAccumulator;
-  using ElementCompute = typename EpilogueOutputOp::ElementCompute;
-
-  using WarpMmaOperator = typename Mma::Policy::Operator;
-
-  using ArchMmaOperator = typename WarpMmaOperator::ArchMmaOperator;
-  using MathOperator = typename ArchMmaOperator::Operator;
-  
-  using OperatorClass = typename WarpMmaOperator::OperatorClass;
-  using ArchTag = typename WarpMmaOperator::ArchTag;
-
-  using ThreadblockShape = typename Mma::Shape;
-  using WarpShape = typename WarpMmaOperator::Shape;
-  using InstructionShape = typename ArchMmaOperator::Shape;
-
-  static int const kStages = Mma::kStages;
-  static IteratorAlgorithm const kIteratorAlgorithm = Mma::IteratorA::kIteratorAlgorithm; 
-  static StrideSupport const kStrideSupport = Mma::IteratorA::kStrideSupport;
-
-  /// Warp count (concept: GemmShape)
-  using WarpCount = typename Mma::WarpCount;
-  static int const kThreadCount = 32 * WarpCount::kCount;
-
-  using TensorRefA = typename Mma::IteratorA::TensorRef;
-  using TensorRefB = typename Mma::IteratorB::TensorRef;
-  using TensorRefC = cutlass::TensorRef<ElementC, LayoutC>;
-
-  /// Check iterator A and B convolution dimension are the same and 
-  // set device::ImplicitGemmConvolution::kConvDim
-  static_assert(Mma::IteratorA::kConvDim == Mma::IteratorB::kConvDim, 
-    "Convolution on different different dimensions is not supported");
-  static int const kConvDim = Mma::IteratorA::kConvDim;
-
-  /// Conv dimension and problem size structure (Conv2d or Conv3d)
-  using ConvProblemSize = ConvProblemSize_;
-
-  static conv::GroupMode const kGroupMode = conv::GroupMode::kNone;
-
-  /// Wgrad C stride idx for implicit gemm algorithm 
-  // Conv2d row-major matrix C (KxRSC) 
-  // Conv3d row-major matrix C (KxTRSC)
-  static int const kWgradCStrideIdx = 
-    platform::is_same<LayoutC, cutlass::layout::TensorNHWC>::value ? 2 : 3;
-
-  /// This chooses the appropriate stride element of the C tensor.
-  static int const kTensorCStrideIdx = 
-    (kConvolutionalOperator == conv::Operator::kWgrad ? kWgradCStrideIdx : 0);
-
-  //
-  //
-  //
-  using ConvOutputIteratorParameter = epilogue::threadblock::ConvOutputIteratorParameter<
-    LayoutC,
-    typename Epilogue::OutputTileIterator::Layout, 
-    TensorRefC,
-    ConvOperator,
-    ConvProblemSize
-    >;
-
-  /// Argument structure
-  struct Arguments {
-
-    //
-    // Data members
-    //
-
-    ConvProblemSize problem_size;
-    TensorRefA ref_A;
-    TensorRefB ref_B;
-    TensorRefC ref_C;
-    TensorRefC ref_D;
-
-    typename EpilogueOutputOp::Params output_op;
-    SplitKMode split_k_mode;
-
-    void * ptr_Vector;
-    void * ptr_Tensor;
-
-    typename LayoutC::Stride::Index ldr;
-    typename LayoutC::Stride::Index ldt;
-
-    //
-    // Methods
-    //
-
-    /// Default ctor
-    CUTLASS_HOST_DEVICE
-    Arguments() { }
-   
-    CUTLASS_HOST_DEVICE 
-    Arguments(
-      ConvProblemSize const & problem_size
-    ):
-      problem_size(problem_size) { }
-
-    CUTLASS_HOST_DEVICE
-    Arguments(
-      ConvProblemSize const & problem_size,
-      TensorRefA const & ref_A,
-      TensorRefB const & ref_B,
-      TensorRefC const & ref_C,
-      TensorRefC const & ref_D,
-      typename EpilogueOutputOp::Params const & output_op,
-      SplitKMode const & split_k_mode = SplitKMode::kSerial,
-      void * ptr_Vector = nullptr,
-      void * ptr_Tensor = nullptr,
-      typename LayoutC::Stride::Index ldr = 0,
-      typename LayoutC::Stride::Index ldt = 0
-    ):
-      problem_size(problem_size),
-      ref_A(ref_A),
-      ref_B(ref_B),
-      ref_C(ref_C),
-      ref_D(ref_D),
-      output_op(output_op),
-      split_k_mode(split_k_mode),
-      ptr_Vector(ptr_Vector),
-      ptr_Tensor(ptr_Tensor),
-      ldr(ldr),
-      ldt(ldt)
-    {
-
-    }
-
-  };
-
-  /// Parameters structure
-  struct Params {
-    ConvProblemSize problem_size;
-    cutlass::gemm::GemmCoord grid_tiled_shape;
-    gemm::GemmCoord implicit_gemm_problem_size;
-    int swizzle_log_tile;
-
-    int gemm_k_iterations;
-    typename Mma::IteratorA::Params iterator_A;
-    typename Mma::IteratorA::Element const *ptr_A;
-    typename Mma::IteratorB::Params iterator_B;
-    typename Mma::IteratorB::Element const *ptr_B;
-    typename Epilogue::OutputTileIterator::Params iterator_C;
-    typename Epilogue::OutputTileIterator::Element *ptr_C;
-    typename Epilogue::OutputTileIterator::Params iterator_D;
-    typename Epilogue::OutputTileIterator::Element *ptr_D;
-    typename EpilogueOutputOp::Params output_op;
-    int *semaphore;
-    SplitKMode split_k_mode;
-
-    typename Epilogue::TensorTileIterator::Params params_Tensor;
-    void * ptr_Vector;
-    typename LayoutC::Stride::Index ldr;
-    void * ptr_Tensor;
-
-    //
-    // Methods
-    //
-
-    CUTLASS_HOST_DEVICE
-    Params():
-      swizzle_log_tile(0), 
-      gemm_k_iterations(0),
-      ptr_Vector(nullptr),
-      ldr(0),
-      ptr_Tensor(nullptr)
-    { }
-
-    /// 
-    CUTLASS_HOST_DEVICE
-    Params(
-      Arguments const &args,
-      int *semaphore = nullptr
-    ):
-      problem_size(args.problem_size),
-      implicit_gemm_problem_size(cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size)),
-      iterator_A(Mma::IteratorA::getParams(args.problem_size, args.ref_A.layout())),
-      ptr_A(args.ref_A.data()),
-      iterator_B(args.problem_size, args.ref_B.layout()),
-      ptr_B(args.ref_B.data()),
-      iterator_C(ConvOutputIteratorParameter::layout(args.ref_C), implicit_gemm_tensor_c_extent(kConvolutionalOperator, args.problem_size)),
-      ptr_C(args.ref_C.data()),
-      iterator_D(ConvOutputIteratorParameter::layout(args.ref_D), implicit_gemm_tensor_c_extent(kConvolutionalOperator, args.problem_size)),
-      ptr_D(args.ref_D.data()),
-      output_op(args.output_op),
-      semaphore(semaphore),
-      split_k_mode(args.split_k_mode),
-      params_Tensor(args.ldt),
-      ptr_Vector(args.ptr_Vector), 
-      ldr(args.ldr),
-      ptr_Tensor(args.ptr_Tensor)
-
-    {
-      gemm_k_iterations = implicit_gemm_k_iterations(kConvolutionalOperator, ThreadblockShape::kK, args.problem_size);
-
-      ThreadblockSwizzle threadblock_swizzle;
-
-      grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
-        implicit_gemm_problem_size,
-        {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-        args.problem_size.split_k_slices);
-
-      swizzle_log_tile = threadblock_swizzle.get_log_tile(grid_tiled_shape);
-    }
-  };
-
-  /// Shared memory storage structure
-  union SharedStorage {
-    typename Mma::SharedStorage main_loop;
-    typename Epilogue::SharedStorage epilogue;
-  };
-
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  ImplicitGemmConvolutionWithFusedEpilogue() { } 
-
-  /// Executes one ImplicitGEMM
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-
-    // Compute threadblock location
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord threadblock_tile_idx =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    // Early exit if CTA is out of range
-    if (params.grid_tiled_shape.m() <= threadblock_tile_idx.m() ||
-      params.grid_tiled_shape.n() <= threadblock_tile_idx.n()) {
-
-      return;
-    }
-
-    // Compute position within threadblock
-    int thread_idx = threadIdx.x;
-
-    // Construct iterators to A and B operands
-    typename Mma::IteratorA iterator_A(
-      params.iterator_A,
-      params.problem_size,
-      params.ptr_A,
-      thread_idx,
-      MatrixCoord(
-        threadblock_tile_idx.m() * Mma::Shape::kM,
-        threadblock_tile_idx.k() * Mma::Shape::kK
-      )
-    );
-    
-    typename Mma::IteratorB iterator_B(
-      params.iterator_B,
-      params.problem_size,
-      params.ptr_B,
-      thread_idx,
-      MatrixCoord(
-        threadblock_tile_idx.k() * Mma::Shape::kK,
-        threadblock_tile_idx.n() * Mma::Shape::kN
-      )
-    );
-
-    // Broadcast the warp_id computed by lane 0 to ensure dependent code
-    // is compiled as warp-uniform.
-    int warp_idx = canonical_warp_idx_sync();
-    int lane_idx = threadIdx.x % 32;
-
-    //
-    // Main loop
-    //
-
-    // Construct thread-scoped matrix multiply
-    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
-
-    typename Mma::FragmentC accumulators;
-
-    accumulators.clear();
-
-    // Compute threadblock-scoped matrix multiply-add
-    mma(params.gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators);
-
-    //
-    // Epilogue
-    //
-
-    EpilogueOutputOp output_op(params.output_op);
-
-    // Construct the semaphore.
-    int block_idx = threadblock_tile_idx.m() + threadblock_tile_idx.n() * params.grid_tiled_shape.m();
-
-    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
-    
-    // Compute logical position within grid
-    threadblock_tile_idx =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    // If performing a reduction via split-K, fetch the initial synchronization
-    if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) {
-        
-      // Fetch the synchronization lock initially but do not block.
-      semaphore.fetch();
-
-      // Indicate which position in a serial reduction the output operator is currently updating
-      output_op.set_k_partition(threadblock_tile_idx.k(), params.grid_tiled_shape.k());
-    }
-
-    MatrixCoord threadblock_offset(
-      threadblock_tile_idx.m() * Mma::Shape::kM,
-      threadblock_tile_idx.n() * Mma::Shape::kN
-    );
-
-    // Tile iterator writing to destination tensor
-    typename Epilogue::OutputTileIterator iterator_D(
-      params.iterator_D,
-      params.ptr_D,
-      ConvOutputIteratorParameter::extent(params.problem_size),
-      thread_idx,
-      threadblock_offset
-    );
-    
-    // Tile iterator reading from source accumulator tensor
-    typename Epilogue::OutputTileIterator iterator_C(
-      params.iterator_C,
-      params.ptr_C,
-      ConvOutputIteratorParameter::extent(params.problem_size),
-      thread_idx,
-      threadblock_offset
-    );
-
-    typename Epilogue::ElementTensor *ptr_Tensor = 
-      static_cast<typename Epilogue::ElementTensor *>(params.ptr_Tensor);
-
-    // Define the reduction output pointer and move to the appropriate place
-    typename Epilogue::ElementVector *ptr_Vector = 
-      static_cast<typename Epilogue::ElementVector *>(params.ptr_Vector);
-
-    // Additional tensor to load from
-    typename Epilogue::TensorTileIterator tensor_iterator(
-        params.params_Tensor,
-        // Only the final block outputs Tensor
-        ((params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) &&
-         (params.grid_tiled_shape.k() != threadblock_tile_idx.k() + 1))
-            ? nullptr
-            : ptr_Tensor,
-        ConvOutputIteratorParameter::extent(params.problem_size),
-        thread_idx,
-        threadblock_offset);
-
-    // Construct the epilogue
-    Epilogue epilogue(
-      shared_storage.epilogue, 
-      thread_idx, 
-      warp_idx, 
-      lane_idx);
-
-    // Move to appropriate location for this output tile
-    if (ptr_Vector) {
-      ptr_Vector += threadblock_offset.column() + threadblock_tile_idx.m() * params.ldr;
-    }
-
-    // Wait on the semaphore - this latency may have been covered by iterator construction
-    if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) {
-        
-      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
-      if (threadblock_tile_idx.k()) {
-        iterator_C = iterator_D;
-      }
-
-      semaphore.wait(threadblock_tile_idx.k());
-
-    }
-    // Each split-k-slice writes to a unique tensor location
-    else if (params.split_k_mode == SplitKMode::kParallel) {
-      iterator_D.add_pointer_offset(threadblock_tile_idx.k() * 
-        cutlass::conv::implicit_gemm_tensor_c_size(ConvOperator, params.problem_size));
-    }
-
-    // Execute the epilogue operator to update the destination tensor.
-    epilogue(output_op,
-             // Only the final block uses Vector
-             ((params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) &&
-              (params.grid_tiled_shape.k() != threadblock_tile_idx.k() + 1))
-                 ? nullptr
-                 : ptr_Vector,
-             iterator_D,
-             accumulators,
-             iterator_C,
-             tensor_iterator,
-            ConvOutputIteratorParameter::extent(params.problem_size),
-             threadblock_offset);
-  
-    //
-    // Release the semaphore
-    //
-
-    if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { 
-
-      int lock = 0;
-      if (params.grid_tiled_shape.k() == threadblock_tile_idx.k() + 1) {
-
-        // The final threadblock resets the semaphore for subsequent grids.
-        lock = 0;
-      }
-      else {
-        // Otherwise, the semaphore is incremented
-        lock = threadblock_tile_idx.k() + 1;
-      }
-      
-      semaphore.release(lock);
-    }
-  } 
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/sm90_implicit_gemm_tma_warpspecialized.hpp b/lightllm-kernel/cutlass/include/cutlass/conv/kernel/sm90_implicit_gemm_tma_warpspecialized.hpp
deleted file mode 100755
index 657ac6b3e..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/kernel/sm90_implicit_gemm_tma_warpspecialized.hpp
+++ /dev/null
@@ -1,76 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/kernel_hardware_info.hpp"
-
-#include "cute/tensor.hpp"
-#include "cute/arch/cluster_sm90.hpp"
-
-#include "cutlass/conv/detail.hpp"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/dispatch_policy.hpp"
-#include "cutlass/gemm/dispatch_policy.hpp"
-#include "cutlass/pipeline/sm90_pipeline.hpp"
-#include "cutlass/gemm/kernel/gemm_universal.hpp"
-#include "cutlass/gemm/kernel/tile_scheduler.hpp"
-
-///////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::conv::kernel {
-
-///////////////////////////////////////////////////////////////////////////////
-
-template <
-  class ProblemShape_,
-  class CollectiveMainloop_,
-  class CollectiveEpilogue_,
-  class TileScheduler_
->
-class ConvUniversal<
-  ProblemShape_,
-  CollectiveMainloop_,
-  CollectiveEpilogue_,
-  TileScheduler_,
-  cute::enable_if_t<cute::is_base_of_v<KernelImplicitTmaWarpSpecializedSm90, typename CollectiveMainloop_::DispatchPolicy::Schedule>>
-> : public cutlass::gemm::kernel::GemmUniversal< 
-  ProblemShape_, 
-  CollectiveMainloop_, 
-  CollectiveEpilogue_, 
-  TileScheduler_
->
-{};
-///////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::conv::kernel
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/thread/depthwise_mma.h b/lightllm-kernel/cutlass/include/cutlass/conv/thread/depthwise_mma.h
deleted file mode 100755
index 37ece7927..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/thread/depthwise_mma.h
+++ /dev/null
@@ -1,325 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates exposing architecture support for depthwise convolution
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/arch/mma.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/thread/mma.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace thread {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// MMA operation
-template <
-  /// Size of the matrix product (concept: GemmShape)
-  typename Shape_,
-  /// Number of threads participating
-  int kThreads_,
-  /// Data type of A elements
-  typename ElementA,
-  /// Data type of B elements
-  typename ElementB,
-  /// Element type of C matrix
-  typename ElementC,
-  /// Inner product operator
-  typename Operator
->
-struct ElementwiseInnerProduct;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// General implementation
-template <
-    /// Size of the matrix product (concept: GemmShape)
-    typename Shape_,
-    /// Data type of A elements
-    typename ElementA_,
-    /// Data type of B elements
-    typename ElementB_,
-    /// Element type of C matrix
-    typename ElementC_>
-struct ElementwiseInnerProduct<Shape_, 1, ElementA_, ElementB_, ElementC_, arch::OpMultiplyAdd> {
-  using Shape = Shape_;
-  using Operator = arch::OpMultiplyAdd;
-  using ElementC = ElementC_;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(Array<ElementC_, Shape::kN> &d,
-                  Array<ElementA_, Shape::kN> const &a,
-                  Array<ElementB_, Shape::kN> const &b,
-                  Array<ElementC_, Shape::kN> const &c) {
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < Shape::kN; ++i) {
-      d[i] = a[i] * b[i] + c[i];
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Specialization of half_t
-template <>
-struct ElementwiseInnerProduct<
-  gemm::GemmShape<2, 2, 1>,
-  1,
-  half_t,
-  half_t,
-  half_t,
-  arch::OpMultiplyAdd> {
-
-  using Shape = gemm::GemmShape<2, 2, 1>;
-  using Operator =  arch::OpMultiplyAdd;
-  using ElementC = half_t;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    Array<half_t, 2> &d,
-    Array<half_t, 2> const &a,
-    Array<half_t, 2> const &b,
-    Array<half_t, 2> const &c
-  ) {
-
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600))
-
-    __half2 const & A = reinterpret_cast<__half2 const &>(a);
-    __half2 const & B = reinterpret_cast<__half2 const &>(b);
-    __half2 const & C = reinterpret_cast<__half2 const &>(c);
-
-    __half2 tmp_D = __hfma2(A, B, C);
-
-    d = reinterpret_cast<Array<half_t, 2> const &>(tmp_D);
-
-#else
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 2; ++i) {
-      d[i] = a[i] * b[i] + c[i];
-    }
-#endif
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Structure to compute the matrix product
-template <
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  typename Shape,
-  /// Data type of A elements
-  typename ElementA,
-  /// Data type of B elements
-  typename ElementB,
-  /// Element type of C matrix
-  typename ElementC,
-  /// Concept: arch::OpMultiplyAdd or arch::Mma<>
-  typename Operator = arch::OpMultiplyAdd,
-  /// Used for partial specialization
-  typename Enable = bool
->
-struct DepthwiseDirectConvElementwiseInnerProduct;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Gemplate that handles all packed matrix layouts
-template <
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  typename Shape_,
-  /// Data type of A elements
-  typename ElementA_,
-  /// Data type of B elements
-  typename ElementB_,
-  /// Element type of C matrix
-  typename ElementC_,
-  /// Operator used to compute GEMM
-  typename Operator_
->
-struct DepthwiseDirectConvElementwiseInnerProductGeneric {
-
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-
-  /// Data type of operand A
-  using ElementA = ElementA_;
-
-  /// Data type of operand B
-  using ElementB = ElementB_;
-
-  /// Element type of operand C
-  using ElementC = ElementC_;
-
-  /// Underlying mathematical operator
-  using Operator = Operator_;
-
-  /// A operand storage
-  using FragmentA = Array<ElementA, Shape::kMN>;
-
-  /// B operand storage
-  using FragmentB = Array<ElementB, Shape::kN>;
-
-  /// C operand storage
-  using FragmentC = Array<ElementC, Shape::kMN>;
-
-  /// Instruction
-  using MmaOp = cutlass::conv::thread::ElementwiseInnerProduct<
-    gemm::GemmShape<Shape::kN, Shape::kN, 1>,
-    1,
-    ElementA,
-    ElementB,
-    ElementC,
-    Operator>;
-
-
-  //
-  // Methods
-  //
-
-  /// Computes a matrix product D = A * B + C
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC & D,
-    FragmentA const & A,
-    FragmentB const & B,
-    FragmentC const & C) {
-    Array<ElementC, Shape::kN> *ptr_D = reinterpret_cast<Array<ElementC, Shape::kN> *>(&D);
-    Array<ElementA, Shape::kN> const *ptr_A =
-        reinterpret_cast<Array<ElementA, Shape::kN> const *>(&A);
-    Array<ElementB, Shape::kN> const *ptr_B =
-        reinterpret_cast<Array<ElementB, Shape::kN> const *>(&B);
-
-    MmaOp mma_op;
-
-    // Copy accumulators
-    D = C;
-
-    // Compute matrix product
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < Shape::kN / MmaOp::Shape::kN; ++n) {
-        CUTLASS_PRAGMA_UNROLL
-        for (int m = 0; m < Shape::kM; ++m) {
-
-          Array<ElementC, MmaOp::Shape::kN> tmpD = ptr_D[m * Shape::kN / MmaOp::Shape::kN + n];
-          Array<ElementA, MmaOp::Shape::kN> tmpA = ptr_A[m * Shape::kN / MmaOp::Shape::kN + n];
-          Array<ElementB, MmaOp::Shape::kN> tmpB = ptr_B[n];
-
-          mma_op(tmpD, tmpA, tmpB, tmpD);
-
-          ptr_D[m * Shape::kN / MmaOp::Shape::kN + n] = tmpD;
-
-        }
-      }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Structure to compute the matrix product
-template <
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  typename Shape_,
-    /// Data type of A elements
-  typename ElementA_,
-  /// Data type of B elements
-  typename ElementB_,
-  /// Element type of C matrix
-  typename ElementC_
->
-struct DepthwiseDirectConvElementwiseInnerProduct<
-  Shape_,
-  ElementA_,
-  ElementB_,
-  ElementC_,
-  arch::OpMultiplyAdd
-  > {
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-
-  /// Data type of operand A
-  using ElementA = ElementA_;
-
-  /// Data type of operand B
-  using ElementB = ElementB_;
-
-  /// Element type of operand C
-  using ElementC = ElementC_;
-
-  /// Underlying mathematical operator
-  using Operator = arch::OpMultiplyAdd;
-
-  /// A operand storage
-  using FragmentA =
-      Array<ElementA, Shape::kMN>;  // output_tile_size per thread * groups_per_thread
-
-  /// B operand storage
-  using FragmentB = Array<ElementB, Shape::kN>;  // 1 * groups_per_thread
-
-  /// C operand storage
-  using FragmentC =
-      Array<ElementC, Shape::kMN>;  // output_tile_size per thread * groups_per_thread
-
-  static bool const use_optimized = 0;
-
-  using ArchMmaOperator =  DepthwiseDirectConvElementwiseInnerProductGeneric<Shape,
-                                                        ElementA,
-                                                        ElementB,
-                                                        ElementC,
-                                                        Operator>;
-
-  //
-  // Methods
-  //
-
-  /// Computes a matrix product D = A * B + C
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC & D,
-    FragmentA const & A,
-    FragmentB const & B,
-    FragmentC const & C) {
-
-    ArchMmaOperator mma;
-
-    mma(D, A, B, C);
-
-  }
-};
-
-} // namespace thread
-} // namespace conv
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h
deleted file mode 100755
index 978c14feb..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h
+++ /dev/null
@@ -1,485 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) 
-    matrix from memory.
-
-    This iterator assumes TensorNHWC layout of tensors in Global Memory.
-
-    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
-    backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-#include "cutlass/conv/threadblock/conv2d_params.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Shape_,
-  typename Element_,
-  typename ThreadMap_,
-  conv::StrideSupport StrideSupport_ = conv::StrideSupport::kUnity,
-  typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess>
->
-class Conv2dDgradFilterTileAccessIteratorAnalytic;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Conv2dDgradFilterTileAccessIteratorAnalytic strided dgrad needs special handling to skip MMAs
-// on non-contributing w positions
-template <
-  typename Shape_,
-  typename Element_,
-  typename ThreadMap_,
-  typename AccessType_
->
-class Conv2dDgradFilterTileAccessIteratorAnalytic <
-  Shape_,
-  Element_,
-  ThreadMap_,
-  conv::StrideSupport::kStrided,
-  AccessType_
-> {
-public:
-  
-  //
-  // Types
-  //
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::TensorNHWC;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-  using TensorRef = cutlass::TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic;
-  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
-  static int const kConvDim = 2;
-  using ConvProblemSize = typename conv::Conv2dProblemSize;
-
-  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
-  
-  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), 
-    "Vectors implied by the thread map must be divisible by the access type.");
-
-  static_assert(sizeof_bits<Element>::value >= 8, 
-    "DGRAD requires elements of size 8b or larger.");
-  
-  //
-  // Parameters structure
-  //
-  
-  using Params = Conv2dAnalyticParams<Layout>;
-
-private:
-
-  Params const &params_;
-  Conv2dProblemSize const &problem_size_;
-  LongIndex iteration_contiguous_;
-  LongIndex iteration_strided_;
-  LongIndex iteration_vector_;
-  char const *pointer_;
-
-  // For a fixed filter position (r,s) find and fill offset_k_, offset_c_ in strided and contiguous dimension 
-  int filter_r_;
-  int filter_s_;
-  int start_r_;
-  int start_s_;
-  int offset_k_[ThreadMap::Iterations::kStrided]; 
-  int offset_c_[ThreadMap::Iterations::kContiguous];
-
-public:
-
-  CUTLASS_HOST_DEVICE
-  Conv2dDgradFilterTileAccessIteratorAnalytic(
-    Params const &params, 
-    Conv2dProblemSize const &problem_size,
-    Element const *ptr,
-    int thread_idx,
-    int start_r, int start_s,
-    MatrixCoord const &threadblock_offset = MatrixCoord()
-  ):
-    params_(params), 
-    problem_size_(problem_size), 
-    pointer_(reinterpret_cast<char const *>(ptr)), 
-    filter_r_(start_r),
-    filter_s_(start_s),
-    start_r_(start_r),
-    start_s_(start_s) {
-
-    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-      offset_c_[c] = threadblock_offset.column() + thread_coord.contiguous() 
-        + c * ThreadMap::Delta::kContiguous;
-    }
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      offset_k_[s] = 
-        threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
-    }
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(Index index) {
-    iteration_vector_ = index % kAccessesPerVector;
-    int residual_access = index / kAccessesPerVector;
-    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
-  }
-
-  CUTLASS_HOST_DEVICE
-  void advance() {
-    // Moves filter_s
-    filter_s_ += problem_size_.stride_w;
-    if (filter_s_ < problem_size_.S) {
-      return;
-    }
-    // Restore filter_s
-    filter_s_ = start_s_;
-
-    // Move filter_r 
-    filter_r_ += problem_size_.stride_h;
-    if (filter_r_ < problem_size_.R) {
-      return;
-    }
-    // Restore filter_r
-    filter_r_ = start_r_;
-    
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      offset_k_[s] += Shape::kRow * problem_size_.split_k_slices;
-    }
-  }
-
-  /// Returns the coordinate in the filter tensor w that is currently pointed to
-  /// by the iterator.
-  CUTLASS_HOST_DEVICE
-  TensorCoord at() const {
-
-    int k = offset_k_[iteration_strided_];
-    int c = offset_c_[iteration_contiguous_] + iteration_vector_ * AccessType::kElements;
-    
-    return TensorCoord(k, filter_r_, filter_s_, c);
-  }
-
-  /// Returns true if the current coordinate is within the filter tensor w
-  CUTLASS_HOST_DEVICE
-  bool valid() const {
-
-    TensorCoord coord = at();
-
-    return coord.n() < problem_size_.K && coord.c() < problem_size_.C;
-  }
-
-  /// Returns a pointer to the vector starting at the current coordinate
-  CUTLASS_HOST_DEVICE
-  AccessType const *get() const {
-
-    TensorCoord coord = at();
-    LongIndex offset = params_.layout(coord);
-
-    return reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
-
-  }
-
-  /// Increments to the next memory access
-  CUTLASS_HOST_DEVICE
-  Conv2dDgradFilterTileAccessIteratorAnalytic &operator++() {
-    ++iteration_vector_;
-    if (iteration_vector_ < kAccessesPerVector) {
-      return *this;
-    }
-    iteration_vector_ = 0;
-
-    ++iteration_contiguous_;
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
-      return *this;
-    }
-    iteration_contiguous_ = 0;
-
-    ++iteration_strided_;
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      return *this;
-    }
-    iteration_strided_ = 0;
- 
-    return *this;
-  }
-
-  /// Determines whether the Implicit GEMM can execute the given problem.
-  CUTLASS_HOST_DEVICE
-  static Status can_implement(Conv2dProblemSize const &problem_size) {
-
-    // check alignment constraint on iterator's contiguous dimension
-    if (problem_size.C % AccessType::kElements) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    return Status::kSuccess;
-  }
-};
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Conv2dDgradFilterTileAccessIteratorAnalytic unity strided dgrad is more performant for  dgrad
-// on problem sizes with stride = {1x1}
-template <
-  typename Shape_,
-  typename Element_,
-  typename ThreadMap_,
-  typename AccessType_
->
-class Conv2dDgradFilterTileAccessIteratorAnalytic <
-  Shape_,
-  Element_,
-  ThreadMap_,
-  conv::StrideSupport::kUnity,
-  AccessType_
->{
-public:
-  
-  //
-  // Types
-  //
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::TensorNHWC;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-  using TensorRef = cutlass::TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic;
-  static StrideSupport const kStrideSupport = conv::StrideSupport::kUnity;
-  static int const kConvDim = 2;
-  using ConvProblemSize = typename conv::Conv2dProblemSize;
- 
-  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
-  
-  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), 
-    "Vectors implied by the thread map must be divisible by the access type.");
- 
-  static_assert(sizeof_bits<Element>::value >= 8, 
-    "DGRAD requires elements of size 8b or larger.");
-  
-  //
-  // Parameters structure
-  //
-  
-  using Params = Conv2dAnalyticParams<Layout>;
-
-private:
-
-  Params const &params_;
-  Conv2dProblemSize const &problem_size_;
-  LongIndex iteration_contiguous_;
-  LongIndex iteration_strided_;
-  LongIndex iteration_vector_;
-  char const *pointer_;
-
-  // For a fixed filter position (r,s) find and fill offset_k_, offset_c_ in strided and contiguous dimension 
-  int filter_r_;
-  int filter_s_;
-  int offset_k_[ThreadMap::Iterations::kStrided]; 
-  int offset_c_[ThreadMap::Iterations::kContiguous];
-
-public:
-
-  CUTLASS_HOST_DEVICE
-  Conv2dDgradFilterTileAccessIteratorAnalytic(
-    Params const &params, 
-    Conv2dProblemSize const &problem_size,
-    Element const *ptr,
-    int thread_idx,
-    MatrixCoord const &threadblock_offset = MatrixCoord()
-  ):
-    params_(params), 
-    problem_size_(problem_size), 
-    pointer_(reinterpret_cast<char const *>(ptr)), 
-    filter_r_(0),
-    filter_s_(0) {
-
-    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-      offset_c_[c] = threadblock_offset.column() + thread_coord.contiguous() 
-        + c * ThreadMap::Delta::kContiguous;
-    }
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      offset_k_[s] = 
-        threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
-    }
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(Index index) {
-    iteration_vector_ = index % kAccessesPerVector;
-    int residual_access = index / kAccessesPerVector;
-    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
-  }
-
-  CUTLASS_HOST_DEVICE
-  void advance() {
-    // moves to the next tile
-    ++filter_s_;
-    if (filter_s_ < problem_size_.S) {
-      return;
-    }
-    filter_s_ = 0;
-    ++filter_r_;
-    if (filter_r_ < problem_size_.R) {
-      return;
-    }
-    filter_r_ = 0;
-    
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      offset_k_[s] += Shape::kRow * problem_size_.split_k_slices;
-    }
-  }
-
-  /// Returns the coordinate in the filter tensor w that is currently pointed to
-  /// by the iterator.
-  CUTLASS_HOST_DEVICE
-  TensorCoord at() const {
-
-    int k = offset_k_[iteration_strided_];
-    int c = offset_c_[iteration_contiguous_] + iteration_vector_ * AccessType::kElements;
-
-    return TensorCoord(k, filter_r_, filter_s_, c);
-  }
-
-  /// Returns true if the current coordinate is within the filter tensor w
-  CUTLASS_HOST_DEVICE
-  bool valid() const {
-
-    TensorCoord coord = at();
-
-    return coord.n() < problem_size_.K && coord.c() < problem_size_.C;
-  }
-
-  /// Returns a pointer to the vector starting at the current coordinate
-  CUTLASS_HOST_DEVICE
-  AccessType const *get() const {
-
-    TensorCoord coord = at();
-    LongIndex offset = params_.layout(coord);
-
-    return reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
-  }
-
-  /// Increments to the next memory access
-  CUTLASS_HOST_DEVICE
-  Conv2dDgradFilterTileAccessIteratorAnalytic &operator++() {
-    ++iteration_vector_;
-    if (iteration_vector_ < kAccessesPerVector) {
-      return *this;
-    }
-    iteration_vector_ = 0;
-
-    ++iteration_contiguous_;
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
-      return *this;
-    }
-    iteration_contiguous_ = 0;
-    ++iteration_strided_;
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      return *this;
-    }
-    iteration_strided_ = 0;
- 
-    return *this;
-  }
-
-  /// Determines whether the Implicit GEMM can execute the given problem.
-  CUTLASS_HOST_DEVICE
-  static Status can_implement(Conv2dProblemSize const &problem_size) {
-
-    // check alignment constraint on iterator's contiguous dimension
-    if (problem_size.C % AccessType::kElements) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    return Status::kSuccess;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h
deleted file mode 100755
index 6fb1cb18e..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h
+++ /dev/null
@@ -1,619 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) 
-    matrix from memory.
-
-    This iterator assumes TensorNHWC layout of tensors in Global Memory.
-
-    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
-    backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-
-#include "cutlass/conv/threadblock/conv2d_params.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Shape_,
-  typename Element_,
-  typename ThreadMap_,
-  conv::StrideSupport StrideSupport_ = conv::StrideSupport::kUnity,
-  typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess>
->
-class Conv2dDgradFilterTileAccessIteratorOptimized;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Conv2dDgradFilterTileAccessIteratorOptimized unity strided dgrad is more performant for  dgrad
-// on problem sizes with stride = {1x1}
-template <
-  typename Shape_,
-  typename Element_,
-  typename ThreadMap_,
-  typename AccessType_
->
-class Conv2dDgradFilterTileAccessIteratorOptimized <
-  Shape_,
-  Element_,
-  ThreadMap_,
-  conv::StrideSupport::kStrided,
-  AccessType_
-  > {
-public:
-  
-  //
-  // Types
-  //
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::TensorNHWC;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-  using TensorRef = cutlass::TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized;
-  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
-  static int const kConvDim = 2;
-  using ConvProblemSize = typename conv::Conv2dProblemSize;
- 
-  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
-  
-  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), 
-    "Vectors implied by the thread map must be divisible by the access type.");
- 
-  //
-  // Parameters structure
-  //
-
-  struct Params : Conv2dStridedDgradFilterIteratorOptimizedParams {
-
-    //
-    // Methods
-    //
-    CUTLASS_HOST_DEVICE
-    Params() { }
-
-    CUTLASS_HOST_DEVICE
-    Params(Conv2dStridedDgradFilterIteratorOptimizedParams const &base): 
-      Conv2dStridedDgradFilterIteratorOptimizedParams(base) { }
-      
-    CUTLASS_HOST_DEVICE
-    Params(
-      Conv2dProblemSize const &problem_size, 
-      Layout const &layout
-    ):
-      Conv2dStridedDgradFilterIteratorOptimizedParams(
-        problem_size,
-        layout,
-        sizeof_bits<Element>::value,
-        {Shape::kRow, Shape::kColumn},
-        ThreadMap::kThreads,
-        ThreadMap::kElementsPerAccess,
-        {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided},
-        {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided}
-      ) { }
-
-  };
-
-private:
-
-  Conv2dStridedDgradFilterIteratorOptimizedParams const &params_;
-  Conv2dProblemSize const &problem_size_;
-  LongIndex iteration_contiguous_;
-  LongIndex iteration_strided_;
-  LongIndex iteration_vector_;
-  char const *pointer_;
-
-  uint32_t predicates_[kAccessesPerVector];
-  int filter_k_;
-  int filter_r_;
-  int filter_s_;
-
-  int start_r_;
-  int start_s_;
-
-  int64_t reset_bytes_s_;
-  int64_t reset_bytes_r_;
-
-  //
-  // Assertions
-  //
-
-  // We map predicates into bits packed in this uint32_t container
-  static_assert(ThreadMap::Iterations::kStrided *
-    ThreadMap::Iterations::kContiguous < sizeof(predicates_) * 8,
-    "Currently, the number of loads per iteration is limited by the size of the predicates container.");
-
-public:
-
-  CUTLASS_HOST_DEVICE
-  Conv2dDgradFilterTileAccessIteratorOptimized(
-    Conv2dStridedDgradFilterIteratorOptimizedParams const &params,
-    Conv2dProblemSize const &problem_size,
-    Element const *ptr,
-    int thread_idx,
-    int start_r, int start_s,
-    MatrixCoord const &threadblock_offset = MatrixCoord()
-  ):
-    params_(params), 
-    problem_size_(problem_size),
-    pointer_(reinterpret_cast<char const *>(ptr)),
-    predicates_{0},
-    filter_r_(start_r),
-    filter_s_(start_s),
-    start_r_(start_r),
-    start_s_(start_s) {
-
-    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
-
-    filter_k_ = threadblock_offset.row() + thread_coord.strided();
-    Index column = threadblock_offset.column() + thread_coord.contiguous();
-
-    reset_bytes_s_ = (problem_size_.num_gemm_k_filter_s(start_s_) - 1) * params_.inc_next[0];
-    reset_bytes_r_ = reset_bytes_s_ +
-                      (problem_size_.num_gemm_k_filter_r(start_r_) - 1) * params_.inc_next[1];
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-
-        int filter_k = filter_k_ + s * ThreadMap::Delta::kStrided;
-        int filter_c = column + c * ThreadMap::Delta::kContiguous;
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < kAccessesPerVector; ++v) {
-
-          uint32_t pred = ((filter_k < problem_size_.K && (filter_c + v * AccessType::kElements) < problem_size_.C) ? 1u : 0);
-  
-          int pred_idx = c + s * ThreadMap::Iterations::kContiguous;
-          
-          predicates_[v] |= (pred << pred_idx);
-        }
-      }
-    }
-
-    TensorCoord coord{filter_k_, filter_r_, filter_s_, column};
-
-    pointer_ += params_.layout(coord) * sizeof_bits<Element>::value / 8;
-
-    set_iteration_index(0);
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(Index index) {
-    iteration_vector_ = index % kAccessesPerVector;
-    int residual_access = index / kAccessesPerVector;
-    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-
-    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
-  }
-
-  CUTLASS_DEVICE
-  void advance() {
-
-    int next_idx = 0;
-    LongIndex reset_bytes = params_.reset_bytes;
-
-    // Move filter_s by stride_w
-    filter_s_ +=  problem_size_.stride_w;
-    if (filter_s_ >= problem_size_.S) {
-      
-      // Restore filter_s
-      filter_s_ = start_s_;
-
-      // Move filter_r by stride_h
-      filter_r_ += problem_size_.stride_h;
-#if 0
-      bool check = (filter_r_ < problem_size_.R);
-
-      filter_r_ = check ? filter_r_ : start_r_;
-      next_idx = check ? 1 : 2;
-      reset_bytes += (check ? reset_bytes_s_ : reset_bytes_r_);
-#else
-    asm volatile(
-        "{\n\t"
-        " .reg .pred %%p;\n\t"
-        " .reg .s64 t1;\n\t"
-        " setp.lt.s32 %%p, %3, %4;\n\t"
-        " selp.s32 %0, %3, %5, %%p;\n\t"
-        " selp.s32 %1, 1, 2, %%p;\n\t"
-        " selp.s64 t1, %6, %7, %%p;\n\t"
-        " add.s64 %2, %8, t1;\n\t"
-        "}\n"
-        : "=r"(filter_r_), "=r"(next_idx), "=l"(reset_bytes)
-        : "r"(filter_r_), "r"(problem_size_.R), "r"(start_r_),
-          "l"(reset_bytes_s_), "l"(reset_bytes_r_), "l"(reset_bytes));
-#endif
-    }
-
-    // offset pointers by offset_bytes
-    pointer_ += (params_.inc_next[next_idx] - reset_bytes);
-
-    if (next_idx == 2) {
-      filter_k_ += params_.filter_k_delta;
-    }
-
-    // Clear predicates if needed
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      if (filter_k_ + s * ThreadMap::Delta::kStrided >= problem_size_.K) {
-        uint32_t kClearMask = ((1u << ThreadMap::Iterations::kContiguous) - 1) << (s * ThreadMap::Iterations::kContiguous);
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < kAccessesPerVector; ++v) {
-          predicates_[v] = (predicates_[v] & (~kClearMask));
-        }
-      }
-    }
-  }
-
-  /// Returns true if the current coordinate is within the filter tensor W
-  CUTLASS_HOST_DEVICE
-  bool valid() {
-    LongIndex pred_idx = iteration_contiguous_ + iteration_strided_ * ThreadMap::Iterations::kContiguous;
-    return (predicates_[iteration_vector_] & (1u << pred_idx));
-  }
-
-  /// Returns a pointer to the vector starting at the current coordinate
-  CUTLASS_HOST_DEVICE
-  AccessType const *get() const {
-    return reinterpret_cast<AccessType const *>(pointer_ + 
-      iteration_contiguous_ * ThreadMap::Delta::kContiguous * sizeof_bits<Element>::value / 8) + iteration_vector_;
-  }
-
-  /// Increments to the next memory access
-  CUTLASS_HOST_DEVICE
-  Conv2dDgradFilterTileAccessIteratorOptimized &operator++() {
-    ++iteration_vector_;
-    if (iteration_vector_ < kAccessesPerVector) {
-      return *this;
-    }
-    iteration_vector_ = 0;
-
-    ++iteration_contiguous_;
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
-      return *this;
-    }
-    iteration_contiguous_ = 0;
-    
-    ++iteration_strided_;
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-
-      // Move to the next K coordinate within the tile
-      pointer_ += params_.inc_next_strided;
-
-      return *this;
-    }
-    iteration_strided_ = 0;
- 
-    return *this;
-  }
-
-  /// Determines whether the Implicit GEMM can execute the given problem.
-  CUTLASS_HOST_DEVICE
-  static Status can_implement(Conv2dProblemSize const &problem_size) {
-
-    // check alignment constraint on iterator's contiguous dimension
-    if (problem_size.C % AccessType::kElements) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    return Status::kSuccess;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Conv2dDgradFilterTileAccessIteratorOptimized unity strided dgrad is more performant for  dgrad
-// on problem sizes with stride = {1x1}
-template <
-  typename Shape_,
-  typename Element_,
-  typename ThreadMap_,
-  typename AccessType_
->
-class Conv2dDgradFilterTileAccessIteratorOptimized <
-  Shape_,
-  Element_,
-  ThreadMap_,
-  conv::StrideSupport::kUnity,
-  AccessType_
-  > {
-public:
-  
-  //
-  // Types
-  //
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::TensorNHWC;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-  using TensorRef = cutlass::TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized;
-  static StrideSupport const kStrideSupport = conv::StrideSupport::kUnity;
-  static int const kConvDim = 2;
-  using ConvProblemSize = typename conv::Conv2dProblemSize;
- 
-  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
-  
-  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), 
-    "Vectors implied by the thread map must be divisible by the access type.");
- 
-  //
-  // Parameters structure
-  //
-
-  struct Params : Conv2dDgradFilterIteratorOptimizedParams {
-
-    //
-    // Methods
-    //
-    CUTLASS_HOST_DEVICE
-    Params() { }
-
-    CUTLASS_HOST_DEVICE
-    Params(Conv2dDgradFilterIteratorOptimizedParams const &base): 
-      Conv2dDgradFilterIteratorOptimizedParams(base) { }
-      
-    CUTLASS_HOST_DEVICE
-    Params(
-      Conv2dProblemSize const &problem_size, 
-      Layout const &layout
-    ):
-      Conv2dDgradFilterIteratorOptimizedParams(
-        problem_size,
-        layout,
-        sizeof_bits<Element>::value,
-        {Shape::kRow, Shape::kColumn},
-        ThreadMap::kThreads,
-        ThreadMap::kElementsPerAccess,
-        {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided},
-        {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided}
-      ) { }
-
-  };
-
-private:
-
-  Conv2dDgradFilterIteratorOptimizedParams const &params_;
-  Conv2dProblemSize const &problem_size_;
-  LongIndex iteration_contiguous_;
-  LongIndex iteration_strided_;
-  LongIndex iteration_vector_;
-  char const *pointer_;
-
-  uint32_t predicates_[kAccessesPerVector];
-  int filter_rs_;
-  int filter_k_;
-
-  //
-  // Assertions
-  //
-
-  // We map predicates into bits packed in this uint32_t container
-  static_assert(ThreadMap::Iterations::kStrided *
-    ThreadMap::Iterations::kContiguous < sizeof(predicates_) * 8,
-    "Currently, the number of loads per iteration is limited by the size of the predicates container.");
-
-public:
-
-  CUTLASS_HOST_DEVICE
-  Conv2dDgradFilterTileAccessIteratorOptimized(
-    Conv2dDgradFilterIteratorOptimizedParams const &params,
-    Conv2dProblemSize const &problem_size,
-    Element const *ptr,
-    int thread_idx,
-    MatrixCoord const &threadblock_offset = MatrixCoord()
-  ):
-    params_(params), 
-    problem_size_(problem_size),
-    pointer_(reinterpret_cast<char const *>(ptr)),
-    predicates_{0},
-    filter_rs_(0),
-    filter_k_(0) {
-
-    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
-
-    filter_k_ = threadblock_offset.row() + thread_coord.strided();
-    Index column = threadblock_offset.column() + thread_coord.contiguous();
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-
-        int filter_k = filter_k_ + s * ThreadMap::Delta::kStrided;
-        int filter_c = column + c * ThreadMap::Delta::kContiguous;
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < kAccessesPerVector; ++v) {
-
-          uint32_t pred = ((filter_k < problem_size_.K && (filter_c + v * AccessType::kElements) < problem_size_.C) ? 1u : 0);
-  
-          int pred_idx = c + s * ThreadMap::Iterations::kContiguous;
-          
-          predicates_[v] |= (pred << pred_idx);
-        }
-      }
-    }
-
-    pointer_ += (
-      filter_k_ * params.layout.stride()[2] + column
-    ) * sizeof_bits<Element>::value / 8;
-
-    set_iteration_index(0);
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(Index index) {
-    iteration_vector_ = index % kAccessesPerVector;
-    int residual_access = index / kAccessesPerVector;
-    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-
-    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
-  }
-
-  CUTLASS_HOST_DEVICE
-  void advance() {
-
-    LongIndex next = params_.inc_next_rs;
-
-    // moves to the next tile
-    ++filter_rs_;
-    if (filter_rs_ == params_.RS) {
-
-      filter_rs_ = 0;
-      next = params_.inc_next_k;
-      filter_k_ += params_.filter_k_delta;
-    }
-
-    // Clear predicates if needed
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      if (filter_k_ + s * ThreadMap::Delta::kStrided >= problem_size_.K) {
-        uint32_t kClearMask = ((1u << ThreadMap::Iterations::kContiguous) - 1) << (s * ThreadMap::Iterations::kContiguous); 
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < kAccessesPerVector; ++v) {
-          predicates_[v] = (predicates_[v] & (~kClearMask));
-        }
-      }
-    }
-      
-    pointer_ += next;
-  }
-
-  /// Returns true if the current coordinate is within the filter tensor W
-  CUTLASS_HOST_DEVICE
-  bool valid() {
-    LongIndex pred_idx = iteration_contiguous_ + iteration_strided_ * ThreadMap::Iterations::kContiguous;
-    return (predicates_[iteration_vector_] & (1u << pred_idx));
-  }
-
-  /// Returns a pointer to the vector starting at the current coordinate
-  CUTLASS_HOST_DEVICE
-  AccessType const *get() const {
-    return reinterpret_cast<AccessType const *>(pointer_ + 
-      iteration_contiguous_ * ThreadMap::Delta::kContiguous * sizeof_bits<Element>::value / 8) + iteration_vector_;
-  }
-
-  /// Increments to the next memory access
-  CUTLASS_HOST_DEVICE
-  Conv2dDgradFilterTileAccessIteratorOptimized &operator++() {
-    ++iteration_vector_;
-    if (iteration_vector_ < kAccessesPerVector) {
-      return *this;
-    }
-    iteration_vector_ = 0;
-
-    ++iteration_contiguous_;
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
-      return *this;
-    }
-    iteration_contiguous_ = 0;
-    
-    ++iteration_strided_;
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-
-      // Move to the next K coordinate within the tile
-      pointer_ += params_.inc_next_strided;
-
-      return *this;
-    }
-    iteration_strided_ = 0;
- 
-    return *this;
-  }
-
-  /// Determines whether the Implicit GEMM can execute the given problem.
-  CUTLASS_HOST_DEVICE
-  static Status can_implement(Conv2dProblemSize const &problem_size) {
-
-    // check alignment constraint on iterator's contiguous dimension
-    if (problem_size.C % AccessType::kElements) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    return Status::kSuccess;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h
deleted file mode 100755
index 1de41f3f7..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h
+++ /dev/null
@@ -1,606 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile) 
-    matrix from memory.
-
-    This iterator assumes TensorNHWC layout of tensors in Global Memory.
-
-    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
-    backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/functional.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-#include "cutlass/conv/threadblock/conv2d_params.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-template <
-  typename Shape_,
-  typename Element_,
-  typename ThreadMap_,
-  conv::StrideSupport StrideSupport_ = conv::StrideSupport::kStrided,
-  typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess>
->
-class Conv2dDgradOutputGradientTileAccessIteratorAnalytic;
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Conv2dDgradOutputGradientTileAccessIteratorAnalytic strided dgrad needs special handling using
-// unscaled coordinations
-template <
-  typename Shape_,
-  typename Element_,
-  typename ThreadMap_,
-  typename AccessType_
->
-class Conv2dDgradOutputGradientTileAccessIteratorAnalytic <
-  Shape_,
-  Element_,
-  ThreadMap_,
-  conv::StrideSupport::kStrided,
-  AccessType_
-> {
-public:
-
-  //
-  // Types
-  //
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::TensorNHWC;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-  using TensorRef = cutlass::TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic;
-  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
-  static int const kConvDim = 2;
-  using ConvProblemSize = typename conv::Conv2dProblemSize;
- 
-  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
-  
-  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), 
-    "Vectors implied by the thread map must be divisible by the access type.");
- 
-  static_assert(sizeof_bits<Element>::value >= 8,
-    "DGRAD requires elements of size 8b or greater.");
- 
-  //
-  // Simpligying assertions
-  //
-
-  static_assert(ThreadMap::Iterations::kContiguous == 1,
-    "Require Iterations::kContiguous == 1");
-
-  //
-  // Parameters structure
-  //
-
-  using Params = Conv2dDgradOutputGradientTileAccessIteratorAnalyticParams;
-
-private:
-
-  Params const &params_;
-  Conv2dProblemSize const &problem_size_;
-  LongIndex iteration_contiguous_;
-  LongIndex iteration_strided_;
-  LongIndex iteration_vector_;
-  char const *pointer_;
-
-  int filter_k_;
-  int filter_r_;
-  int filter_s_;
-  int start_r_;
-  int start_s_;
-
-  int offset_n_[ThreadMap::Iterations::kStrided];
-  int offset_p_[ThreadMap::Iterations::kStrided];
-  int offset_q_[ThreadMap::Iterations::kStrided];
-
-public:
-
-  CUTLASS_HOST_DEVICE
-  Conv2dDgradOutputGradientTileAccessIteratorAnalytic(
-    Params const &params, 
-    Conv2dProblemSize const &problem_size,
-    Element const *ptr,
-    int thread_idx,
-    FastDivmod const &stride_h_divmod, FastDivmod const &stride_w_divmod,
-    int start_r, int start_s,
-    MatrixCoord const &threadblock_offset = MatrixCoord()     // threadblock offset - units are whole CTA tiles
-  ):
-    params_(params), 
-    problem_size_(problem_size), 
-    pointer_(reinterpret_cast<char const *>(ptr)), 
-    filter_k_(0),
-    filter_r_(start_r),
-    filter_s_(start_s),
-    start_r_(start_r),
-    start_s_(start_s) {
-
-    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
-
-    filter_k_ = threadblock_offset.column() + thread_coord.contiguous();
-
-    int filter_r = filter_r_;
-    int filter_s = filter_s_;
-
-    if (problem_size_.mode == Mode::kConvolution) {
-      filter_r = (problem_size_.R - 1 - filter_r);
-      filter_s = (problem_size_.S - 1 - filter_s);
-    }
-
-    // Starting h, w positions for filter position in gemm_k=0
-    int start_h, start_w;
-    strided_dgrad_starting_coords(
-      problem_size_, 
-      stride_h_divmod, stride_w_divmod, 
-      filter_r, filter_s, 
-      start_h, start_w);
-
-    // Effective P and Q for filter position required for remapping NHW rows
-    int P = (problem_size_.H - start_h + problem_size_.stride_h - 1) / problem_size_.stride_h;
-    int Q = (problem_size_.W - start_w + problem_size_.stride_w - 1) / problem_size_.stride_w;
-
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      int offset_npq = (threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided) % params_.tiled_rows_per_filter;
-
-      // (STEP 1) [reorder NHW rows to start with same filter positions]
-      offset_n_[s] = offset_npq / (P * Q);
-      int residual = offset_npq % (P * Q);
-
-      int p = (residual / Q);
-      int q = (residual % Q);
-
-      int mapped_h = (start_h + p * problem_size_.stride_h);
-      int mapped_w = (start_w + q * problem_size_.stride_w);
-      
-      // Access (p, q) coordinates for Dy tensor and a filter position in gemm_k=0
-      // note that (h + pad_h - filter_r) and (w + pad_w - filter_s) are divisible 
-      // by stride_h and stride_w
-      offset_p_[s] = (mapped_h + problem_size_.pad_h - filter_r) / problem_size_.stride_h;
-      offset_q_[s] = (mapped_w + problem_size_.pad_w - filter_s) / problem_size_.stride_w;
-    }
-  }
-
-  CUTLASS_HOST_DEVICE
-  static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) {
-    return Params(problem_size, 
-                  layout,
-                  sizeof_bits<Element>::value,
-                  {Shape::kRow, Shape::kColumn});
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(Index index) {
-    iteration_vector_ = index % kAccessesPerVector;
-    int residual_access = index / kAccessesPerVector;
-    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
-  }
-
-  CUTLASS_HOST_DEVICE
-  void advance() {
-
-    // Move filter_s by stride_w
-    filter_s_ +=  problem_size_.stride_w;
-    if (filter_s_ < problem_size_.S) {
-      return;
-    }
-
-    // Restore filter_s 
-    filter_s_ = start_s_;
-
-    // Move filter_r by stride_h
-    filter_r_ +=  problem_size_.stride_h;
-    if (filter_r_ < problem_size_.R) {
-      return;
-    }
-
-    // Restore filter_r 
-    filter_r_ = start_r_;
-
-    // Move filter_k
-    filter_k_ += Shape_::kColumn * problem_size_.split_k_slices;
-  }
-
-  /// Returns the coordinate in the output tensor Dy that is currently pointed to
-  /// by the iterator.
-  CUTLASS_HOST_DEVICE
-  TensorCoord at() const {
-    int n = offset_n_[iteration_strided_];
-    int p = offset_p_[iteration_strided_]; 
-    int q = offset_q_[iteration_strided_];
-    
-    int conv_sign = (problem_size_.mode == Mode::kConvolution ? 1 : -1);
-
-    p += (conv_sign * (filter_r_ / problem_size_.stride_h));
-    q += (conv_sign * (filter_s_ / problem_size_.stride_w));
-
-    int k = filter_k_ + iteration_vector_ * AccessType::kElements; 
-
-    return TensorCoord(
-      n, 
-      p, 
-      q, 
-      k);
-  }
-
-
-  /// Returns true if the current coordinate is within the output tensor Dy
-  CUTLASS_HOST_DEVICE
-  bool valid() const {
-
-    TensorCoord coord = at();
-
-    return 
-      coord.n() < problem_size_.N &&
-      coord.h() >= 0 && coord.h() < problem_size_.P &&
-      coord.w() >= 0 && coord.w() < problem_size_.Q &&
-      coord.c() < problem_size_.K;
-  }
-
-  /// Returns a pointer to the vector starting at the current coordinate
-  CUTLASS_HOST_DEVICE
-  AccessType const *get() const {
-
-    TensorCoord coord = at();
-    LongIndex offset = params_.layout(coord);
-
-    return reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
-  }
-
-  /// Increments to the next memory access
-  CUTLASS_HOST_DEVICE
-  Conv2dDgradOutputGradientTileAccessIteratorAnalytic &operator++() {
-    ++iteration_vector_;
-    if (iteration_vector_ < kAccessesPerVector) {
-      return *this;
-    }
-    iteration_vector_ = 0;
-
-    ++iteration_contiguous_;
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
-      return *this;
-    }
-    iteration_contiguous_ = 0;
-
-    ++iteration_strided_;
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      return *this;
-    }
-    iteration_strided_ = 0;
- 
-    return *this;
-  }
-
-  /// Determines whether the Implicit GEMM can execute the given problem.
-  CUTLASS_HOST_DEVICE
-  static Status can_implement(Conv2dProblemSize const &problem_size) {
-
-    // check alignment constraint on iterator's contiguous dimension
-    if (problem_size.K % AccessType::kElements) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    return Status::kSuccess;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Conv2dDgradOutputGradientTileAccessIteratorAnalytic for unity strides can be optimized by 
-// eliminating modulo arithmetic to compute unscaled coordinates 
-template <
-  typename Shape_,
-  typename Element_,
-  typename ThreadMap_,
-  typename AccessType_
->
-class Conv2dDgradOutputGradientTileAccessIteratorAnalytic < 
-  Shape_,
-  Element_,
-  ThreadMap_,
-  conv::StrideSupport::kUnity,
-  AccessType_
-> {
-public:
-
-  //
-  // Types
-  //
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::TensorNHWC;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-  using TensorRef = cutlass::TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic;
-  static StrideSupport const kStrideSupport = conv::StrideSupport::kUnity;
-  static int const kConvDim = 2;
-  using ConvProblemSize = typename conv::Conv2dProblemSize;
- 
-  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
-  
-  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), 
-    "Vectors implied by the thread map must be divisible by the access type.");
- 
-  static_assert(sizeof_bits<Element>::value >= 8,
-    "DGRAD requires elements of size 8b or greater.");
- 
-  //
-  // Simpligying assertions
-  //
-
-  static_assert(ThreadMap::Iterations::kContiguous == 1,
-    "Require Iterations::kContiguous == 1");
-
-  //
-  // Parameters structure
-  //
-
-  struct Params {
-
-    Layout layout;
-
-    //
-    // Methods
-    //
-    CUTLASS_HOST_DEVICE
-    Params() { }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      Conv2dProblemSize const &problem_size, 
-      Layout const &layout
-    ): layout(layout) {
-
-    }
-  };
-
-private:
-
-  Params const &params_;
-  Conv2dProblemSize const &problem_size_;
-  LongIndex iteration_contiguous_;
-  LongIndex iteration_strided_;
-  LongIndex iteration_vector_;
-  char const *pointer_;
-
-  int filter_k_;
-  int filter_r_;
-  int filter_s_;
-
-  int offset_n_[ThreadMap::Iterations::kStrided];
-  int offset_w_[ThreadMap::Iterations::kStrided];
-  int offset_h_[ThreadMap::Iterations::kStrided];
-
-public:
-
-  CUTLASS_HOST_DEVICE
-  Conv2dDgradOutputGradientTileAccessIteratorAnalytic(
-    Params const &params, 
-    Conv2dProblemSize const &problem_size,
-    Element const *ptr,
-    int thread_idx,
-    MatrixCoord const &threadblock_offset = MatrixCoord()     // threadblock offset - units are whole CTA tiles
-  ):
-    params_(params), 
-    problem_size_(problem_size), 
-    pointer_(reinterpret_cast<char const *>(ptr)), 
-    filter_k_(0), 
-    filter_r_(0), 
-    filter_s_(0) {
-
-    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
-
-    filter_k_ = threadblock_offset.column() + thread_coord.contiguous();
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      int offset_nhw = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
-
-      offset_n_[s] = offset_nhw / (problem_size_.H * problem_size_.W);
-      int residual = offset_nhw % (problem_size_.H * problem_size_.W);
-
-      offset_h_[s] = residual / problem_size_.W;
-      offset_w_[s] = residual % problem_size_.W;
-    }
-  }
-
-  CUTLASS_HOST_DEVICE
-  static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) {
-    return Params(problem_size, layout);
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(Index index) {
-    iteration_vector_ = index % kAccessesPerVector;
-    int residual_access = index / kAccessesPerVector;
-    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
-  }
-
-  CUTLASS_HOST_DEVICE
-  void advance() {
-    // move to the next tile
-    ++filter_s_;
-    if (filter_s_ < problem_size_.S) {
-      return;
-    }
-    filter_s_  = 0;
-    ++filter_r_;
-    if (filter_r_ < problem_size_.R) {
-      return;
-    }
-    filter_r_ = 0;
-
-    filter_k_ += Shape_::kColumn * problem_size_.split_k_slices;
-  }
-
-  /// Returns the coordinate in the output tensor Dy that is currently pointed to
-  /// by the iterator.
-  CUTLASS_HOST_DEVICE
-  TensorCoord at() const {
-
-    int n = offset_n_[iteration_strided_];
-    int h = offset_h_[iteration_strided_];
-    int w = offset_w_[iteration_strided_];
-
-    int r = filter_r_;
-    int s = filter_s_;
-
-    if (problem_size_.mode == Mode::kConvolution) {
-      r = (problem_size_.R - 1 - r);
-      s = (problem_size_.S - 1 - s);
-    }
-
-    int p = (h + problem_size_.pad_h - r * problem_size_.dilation_h) / problem_size_.stride_h;
-    int q = (w + problem_size_.pad_w - s * problem_size_.dilation_w) / problem_size_.stride_w;
-
-    int k = filter_k_ + iteration_vector_ * AccessType::kElements;
-
-    return TensorCoord(n, p, q, k);
-  }
-
-  /// Returns true if the current coordinate is within the output tensor Dy
-  CUTLASS_HOST_DEVICE
-  bool valid() const {
-
-    TensorCoord coord = at();
-
-    return coord.n() < problem_size_.N &&
-      coord.h() >= 0 && coord.h() < problem_size_.P &&
-      coord.w() >= 0 && coord.w() < problem_size_.Q &&
-      coord.c() < problem_size_.K;
-  }
-
-  /// Returns a pointer to the vector starting at the current coordinate
-  CUTLASS_HOST_DEVICE
-  AccessType const *get() const {
-
-    TensorCoord coord = at();
-    LongIndex offset = params_.layout(coord);
-
-    return reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
-  }
-
-  /// Increments to the next memory access
-  CUTLASS_HOST_DEVICE
-  Conv2dDgradOutputGradientTileAccessIteratorAnalytic &operator++() {
-    ++iteration_vector_;
-    if (iteration_vector_ < kAccessesPerVector) {
-      return *this;
-    }
-    iteration_vector_ = 0;
-
-    ++iteration_contiguous_;
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
-      return *this;
-    }
-    iteration_contiguous_ = 0;
-    ++iteration_strided_;
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      return *this;
-    }
-    iteration_strided_ = 0;
- 
-    return *this;
-  }
-
-  /// Determines whether the Implicit GEMM can execute the given problem.
-  CUTLASS_HOST_DEVICE
-  static Status can_implement(Conv2dProblemSize const &problem_size) {
-
-    // Conv2dDgradFilterTileAccessIteratorAnalytic unity stride specialization 
-    // only supports (stride_h, stride_w) = (1, 1)
-    if (problem_size.stride() != MatrixCoord({1, 1})) {
-      return Status::kErrorNotSupported;
-    }
-
-    // check alignment constraint on iterator's contiguous dimension
-    if (problem_size.K % AccessType::kElements) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    return Status::kSuccess;
-  }
-  
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h
deleted file mode 100755
index ffa13c934..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h
+++ /dev/null
@@ -1,821 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile) 
-    matrix from memory.
-
-    This iterator assumes TensorNHWC layout of tensors in Global Memory.
-
-    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
-    backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
-*/
-
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-#include "cutlass/conv/threadblock/conv2d_params.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Shape_,
-  typename Element_,
-  typename ThreadMap_,
-  conv::StrideSupport StrideSupport_ = conv::StrideSupport::kUnity,
-  typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess>
->
-class Conv2dDgradOutputGradientTileAccessIteratorOptimized;
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Conv2dDgradOutputGradientTileAccessIteratorOptimized strided dgrad needs special handling 
-// to skip MMAs (Dx = Dy * w) on invalid filter positions
-/////////////////////////////////////////////////////////////////////////////////////////////////
-template <
-  typename Shape_,
-  typename Element_,
-  typename ThreadMap_,
-  typename AccessType_
->
-class Conv2dDgradOutputGradientTileAccessIteratorOptimized <
-  Shape_,
-  Element_,
-  ThreadMap_,
-  conv::StrideSupport::kStrided,
-  AccessType_
-> {
-public:
-
-  //
-  // Types
-  //
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::TensorNHWC;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-  using TensorRef = cutlass::TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized;
-  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
-  static int const kConvDim = 2;
-  using ConvProblemSize = typename conv::Conv2dProblemSize;
- 
-  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
-  
-  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), 
-    "Vectors implied by the thread map must be divisible by the access type.");
- 
-  using Mask = uint64_t;
-  
-  static_assert(sizeof_bits<Element>::value >= 8,
-    "DGRAD requires elements of size 8b or greater.");
- 
-  //
-  // Simpligying assertions
-  //
-
-  static_assert(ThreadMap::Iterations::kContiguous == 1,
-    "Require Iterations::kContiguous == 1");
-
-  //
-  // Parameters structure
-  //
-
-  using Params = Conv2dStridedDgradOutputGradientIteratorOptimizedParams;
-
-private:
-
-  Params const &params_;
-  Conv2dProblemSize const &problem_size_;
-  LongIndex iteration_contiguous_;
-  LongIndex iteration_strided_;
-  LongIndex iteration_vector_;
-  
-  // One pointer per access
-  char const *pointer_[ThreadMap::Iterations::kStrided];
-  
-  int filter_k_;
-  int filter_r_;
-  int filter_s_;
-  int start_r_;
-  int start_s_;
-  int64_t reset_bytes_s_;
-  int64_t reset_bytes_r_;
-
-  Index masks_[ThreadMap::Iterations::kStrided][kAccessesPerVector][2];
-
-public:
-
-  CUTLASS_HOST_DEVICE
-  Conv2dDgradOutputGradientTileAccessIteratorOptimized(
-    Params const &params, 
-    Conv2dProblemSize const &problem_size,
-    Element const *ptr,
-    int thread_idx,
-    FastDivmod const &stride_h_divmod, FastDivmod const &stride_w_divmod,
-    int start_r, int start_s,
-    MatrixCoord const &threadblock_offset = MatrixCoord()     // threadblock offset - units are whole CTA tiles
-  ):
-    params_(params), 
-    problem_size_(problem_size), 
-    filter_k_(0),
-    filter_r_(start_r),
-    filter_s_(start_s),
-    start_r_(start_r),
-    start_s_(start_s) {
-
-    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
-
-    filter_k_ = threadblock_offset.column() + thread_coord.contiguous();
-
-    reset_bytes_s_ = (problem_size_.num_gemm_k_filter_s(start_s_) - 1) * params_.inc_next[0];
-
-    reset_bytes_r_ = (problem_size_.num_gemm_k_filter_s(start_s_) - 1) * params_.inc_next[0] +
-                      (problem_size_.num_gemm_k_filter_r(start_r_) - 1) * params_.inc_next[1];
-
-    int offset_n[ThreadMap::Iterations::kStrided];
-    int offset_p[ThreadMap::Iterations::kStrided];
-    int offset_q[ThreadMap::Iterations::kStrided];
-
-    int filter_r = filter_r_;
-    int filter_s = filter_s_;
-
-    if (problem_size_.mode == Mode::kConvolution) {
-      filter_r = (problem_size_.R - 1 - filter_r);
-      filter_s = (problem_size_.S - 1 - filter_s);
-    }
-
-    // Starting h, w positions for filter position in gemm_k=0
-    int start_h, start_w;
-    strided_dgrad_starting_coords(
-      problem_size_, 
-      stride_h_divmod, stride_w_divmod, 
-      filter_r, filter_s, 
-      start_h, start_w);
-
-
-    // Effective starting P and Q for filter position required for remapping NHW rows
-    int P = (problem_size_.H - start_h + problem_size_.stride_h - 1) / problem_size_.stride_h;
-    int Q = (problem_size_.W - start_w + problem_size_.stride_w - 1) / problem_size_.stride_w;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-
-      pointer_[s] = reinterpret_cast<char const *>(ptr);      
-
-      int offset_npq = (threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided) % params_.tiled_rows_per_filter;
-
-      // (STEP 1) [reorder NHW rows to start with same filter positions]
-      offset_n[s] = offset_npq / (P * Q);
-      int residual = offset_npq % (P * Q);
-
-      int p = (residual / Q);
-      int q = (residual % Q);
-
-      int mapped_h = (start_h + p * problem_size_.stride_h);
-      int mapped_w = (start_w + q * problem_size_.stride_w);
-      
-      // Access (p, q) coordinates for Dy tensor for filter position in gemm_k=0
-      // note that (h + pad_h - filter_r) and (w + pad_w - filter_s) are ensured to be 
-      // divisible by stride_h and stride_w
-      offset_p[s] = (mapped_h + problem_size_.pad_h - filter_r) / problem_size_.stride_h;
-      offset_q[s] = (mapped_w + problem_size_.pad_w - filter_s) / problem_size_.stride_w;
-
-      // Initialize pointers for gemm_k=0
-      TensorCoord coord{offset_n[s], offset_p[s], offset_q[s], filter_k_};
-
-      pointer_[s] += params_.layout(coord) * sizeof_bits<Element>::value / 8;
-    }
-
-    //
-    // Precompute mask predicates
-    //
-    clear_mask();
-
-    CUTLASS_PRAGMA_NO_UNROLL
-    for (int r = start_r; r < problem_size_.R; r += problem_size_.stride_h) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) {
-
-        int p = offset_p[s_idx] ;
-
-        p += (params_.conv_sign * (r / problem_size_.stride_h));
-
-        bool pred = (offset_n[s_idx] < problem_size_.N && p >= 0 && p < problem_size_.P);
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
-          masks_[s_idx][v_idx][0] |= (pred << r);
-        }
-      }
-    }
-
-    CUTLASS_PRAGMA_NO_UNROLL
-    for(int s = start_s; s < problem_size_.S; s += problem_size_.stride_w) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) {
-
-        int q = offset_q[s_idx];
-        q += (params_.conv_sign * (s / problem_size_.stride_w));
-
-        bool pred = (q >=0 && q < problem_size_.Q);
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
-          masks_[s_idx][v_idx][1] |= (pred << s);
-        }
-      }
-    }
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
-      clear_mask(v_idx, (filter_k_ + v_idx * AccessType::kElements) >= problem_size.K);
-    }
-
-    set_iteration_index(0);
-  }
-
-  CUTLASS_HOST_DEVICE
-  static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) {
-    return Params(problem_size, 
-                  layout,
-                  sizeof_bits<Element>::value,
-                  {Shape::kRow, Shape::kColumn});
-  }
-
-private:
-
-  /// Adds a pointer offset in units of element
-  CUTLASS_HOST_DEVICE
-  void add_byte_offset_(LongIndex byte_offset, LongIndex byte_reset = 0) {
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      pointer_[s] += byte_offset - byte_reset;
-    }
-  }
-
-public:
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(Index index) {
-    iteration_vector_ = index % kAccessesPerVector;
-    int residual_access = index / kAccessesPerVector;
-    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    add_byte_offset_(pointer_offset * sizeof_bits<Element>::value / 8);
-  }
-
-  CUTLASS_DEVICE
-  void advance() {
-
-    int next_idx = 0;
-    int64_t reset_bytes = 0;
-
-    // Move filter_s by stride_w
-    filter_s_ +=  problem_size_.stride_w;
-    if (filter_s_ >= problem_size_.S) {
-      
-      // Restore filter_s
-      filter_s_ = start_s_;
-
-      // Move filter_r by stride_h
-      filter_r_ += problem_size_.stride_h;
-#if 0
-      if (filter_r_ < problem_size_.R) {
-
-        next_idx = 1;
-
-        // Restore bytes in q coordinate (Mma in filter s dimension)
-        reset_bytes = reset_bytes_s_;
-
-      } else {
-
-        // Restore filter_r
-        filter_r_ = start_r_;
-
-        next_idx = 2;
-
-        // Restore bytes in p and q coordinate (Mma in filter s and r dimension)
-        reset_bytes = reset_bytes_r_;
-      }
-#else
-      asm volatile(
-          "{\n\t"
-          " .reg .pred %%p;\n\t"
-          " setp.lt.s32 %%p, %3, %4;\n\t"
-          " selp.s32 %0, %3, %5, %%p;\n\t"
-          " selp.s32 %1, 1, 2, %%p;\n\t"
-          " selp.s64 %2, %6, %7, %%p;\n\t"
-          "}\n"
-          : "=r"(filter_r_), "=r"(next_idx), "=l"(reset_bytes)
-          : "r"(filter_r_), "r"(problem_size_.R), "r"(start_r_),
-            "l"(reset_bytes_s_), "l"(reset_bytes_r_));
-#endif
-    }
-
-    // offset pointers by offset_bytes
-    add_byte_offset_(params_.inc_next[next_idx] - reset_bytes);
-
-    if (next_idx == 2) {  
-      filter_k_ += params_.filter_k_delta;
-    }
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
-      clear_mask(v_idx, (filter_k_ + v_idx * AccessType::kElements) >= problem_size_.K);
-    }
-  }
-
-  /// Clears the predicates
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool clear = true) {
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int v = 0; v < kAccessesPerVector; ++v) {
-        masks_[s][v][0] = clear ? Mask(0) : masks_[s][v][0];
-        masks_[s][v][1] = clear ? Mask(0) : masks_[s][v][1];
-      }
-    }
-  }
-
-  /// Clears the predicates
-  CUTLASS_HOST_DEVICE
-  void clear_mask(int v, bool clear = true) {
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      masks_[s][v][0] = clear ? Mask(0) : masks_[s][v][0];
-      masks_[s][v][1] = clear ? Mask(0) : masks_[s][v][1];
-    }
-  }
-
-  /// Returns true if the current coordinate is within the output tensor Dy
-  CUTLASS_HOST_DEVICE
-  bool valid() const {
-    return 
-      (masks_[iteration_strided_][iteration_vector_][0] & (Index(1) << filter_r_)) &&
-      (masks_[iteration_strided_][iteration_vector_][1] & (Index(1) << filter_s_));
-  }
-
-  /// Returns a pointer to the vector starting at the current coordinate
-  CUTLASS_HOST_DEVICE
-  AccessType const *get() const {
-
-    return reinterpret_cast<AccessType const *>(pointer_[iteration_strided_]) + iteration_vector_;
-  }
-
-  /// Increments to the next memory access
-  CUTLASS_HOST_DEVICE
-  Conv2dDgradOutputGradientTileAccessIteratorOptimized &operator++() {
-    ++iteration_vector_;
-    if (iteration_vector_ < kAccessesPerVector) {
-      return *this;
-    }
-    iteration_vector_ = 0;
-
-    ++iteration_contiguous_;
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
-      return *this;
-    }
-    iteration_contiguous_ = 0;
-    ++iteration_strided_;
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      return *this;
-    }
-    iteration_strided_ = 0;
- 
-    return *this;
-  }
-
-  /// Determines whether the Implicit GEMM can execute the given problem.
-  CUTLASS_HOST_DEVICE
-  static Status can_implement(Conv2dProblemSize const &problem_size) {
-
-    // check alignment constraint on iterator's contiguous dimension
-    if (problem_size.K % AccessType::kElements) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    // Limit on filter size
-    if (problem_size.R > 32 || problem_size.S > 32) {
-      return Status::kErrorNotSupported;
-    }
-    
-    return Status::kSuccess;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Conv2dDgradOutputGradientTileAccessIteratorOptimized unity stride dgrad is optimized for dgrad
-// with problem stride = {1x1}
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Shape_,
-  typename Element_,
-  typename ThreadMap_,
-  typename AccessType_
->
-class Conv2dDgradOutputGradientTileAccessIteratorOptimized <
-  Shape_,
-  Element_,
-  ThreadMap_,
-  conv::StrideSupport::kUnity,
-  AccessType_
-> {
-public:
-  
-  //
-  // Types
-  //
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::TensorNHWC;
-  using TensorCoord = typename Layout::TensorCoord;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-  using TensorRef = cutlass::TensorRef<Element, Layout>;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized;
-  static StrideSupport const kStrideSupport = conv::StrideSupport::kUnity;
-  static int const kConvDim = 2;
-  using ConvProblemSize = typename conv::Conv2dProblemSize;
- 
-  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
-  
-  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), 
-    "Vectors implied by the thread map must be divisible by the access type.");
- 
-  using Mask = uint64_t;
-
-  //
-  // Simplifying assertions
-  //
-  static_assert(ThreadMap::Iterations::kContiguous == 1,
-    "Require Iterations::kContiguous == 1");
-
-  //
-  // Parameters structure
-  //
-
-  using Params = Conv2dDgradOutputGradientIteratorOptimizedParams;
-
-private:
-
-  Conv2dDgradOutputGradientIteratorOptimizedParams const &params_;
-  Conv2dProblemSize const &problem_size_;
-  LongIndex iteration_contiguous_;
-  LongIndex iteration_strided_;
-  LongIndex iteration_vector_;
-
-  // One pointer per access
-  char const *pointer_[ThreadMap::Iterations::kStrided];
-
-  // current filter position (r, s)
-  int filter_r_;
-  int filter_s_;
-  int filter_k_;
-
-  Index masks_[ThreadMap::Iterations::kStrided][kAccessesPerVector][2];
-
-public:
-
-  CUTLASS_HOST_DEVICE
-  Conv2dDgradOutputGradientTileAccessIteratorOptimized(
-    Conv2dDgradOutputGradientIteratorOptimizedParams const &params,
-    Conv2dProblemSize const &problem_size,
-    Element const *ptr,
-    int thread_idx,
-    MatrixCoord const &threadblock_offset = MatrixCoord()       // tile index - units are threadblock-scoped tiles
-  ):
-    params_(params), 
-    problem_size_(problem_size),
-    filter_k_(0), 
-    filter_r_(0), 
-    filter_s_(0) {
-
-    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
-
-    filter_k_ = threadblock_offset.column() + thread_coord.contiguous();
-
-    int offset_n[ThreadMap::Iterations::kStrided];
-    int offset_h[ThreadMap::Iterations::kStrided];
-    int offset_w[ThreadMap::Iterations::kStrided];
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-
-      pointer_[s] = reinterpret_cast<char const *>(ptr);
- 
-      int offset_nhw = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
-
-      // The subseqnet fast_divmod() operations are equivalent to the following logical computation:
-      //
-      //
-      //  offset_n[s] = offset_nhw / (problem_size_.H * problem_size_.W);
-      //  int residual = offset_nhw % (problem_size_.H * problem_size_.W);
-      //
-      //  offset_h[s] = residual / problem_size_.W;
-      //  offset_w[s] = residual % problem_size_.W;
-      //
-
-      int residual;
-
-      params_.hw_divmod(offset_n[s], residual, offset_nhw);
-      params_.w_divmod(offset_h[s], offset_w[s], residual);
-
-      TensorCoord coord = at_(offset_n[s], offset_h[s], offset_w[s], 0, 0);
-
-      pointer_[s] += params_.layout(coord) * sizeof_bits<Element>::value / 8;
-    }
-
-    clear_mask();
-
-    CUTLASS_PRAGMA_NO_UNROLL
-    for (int r = 0; r < problem_size_.R; ++r) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) {
-
-        int r_ = r;
-        if (problem_size_.mode == Mode::kConvolution) {
-          r_ = problem_size_.R - 1 - r;
-        }
-
-        int p = offset_h[s_idx] + problem_size_.pad_h - r_ * problem_size_.dilation_h;
-
-        bool pred = (offset_n[s_idx] < problem_size_.N && p >= 0 && p < problem_size_.P);
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
-          masks_[s_idx][v_idx][0] |= (pred << r);
-        }
-      }
-    }
-
-    CUTLASS_PRAGMA_NO_UNROLL
-    for (int s = 0; s < problem_size_.S; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) {
-
-        int s_ = s;
-        if (problem_size_.mode == Mode::kConvolution) {
-          s_ = problem_size_.S - 1 - s;
-        }
-
-        int q = offset_w[s_idx] + problem_size_.pad_w - s_ * problem_size_.dilation_w;
-
-        bool pred = (q >= 0 && q < problem_size_.Q);
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
-          masks_[s_idx][v_idx][1] |= (pred << s);
-        }
-      }
-    }
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
-      clear_mask(v_idx, filter_k_ + v_idx * AccessType::kElements >= problem_size.K);
-    }
-
-    set_iteration_index(0);
-  }
-
-  CUTLASS_HOST_DEVICE
-  static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) {
-    return Params(problem_size,
-                  layout,
-                  sizeof_bits<Element>::value,
-                  {Shape::kRow, Shape::kColumn},
-                  ThreadMap::kThreads,
-                  ThreadMap::kElementsPerAccess,
-                  {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided},
-                  {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided});
-  }
-
-private:
-
-  /// Returns the coordinate in the output gradient tensor dy that is correspoinding to 
-  // activation nhw and filter position k, r, s
-  CUTLASS_HOST_DEVICE
-  TensorCoord at_(int n, int h, int w, int r, int s) const {
-
-    if (problem_size_.mode == Mode::kConvolution) {
-      r = problem_size_.R - 1 - r;
-      s = problem_size_.S - 1 - s;
-    }
-
-    int p = h + problem_size_.pad_h - r * problem_size_.dilation_h;
-    int q = w + problem_size_.pad_w - s * problem_size_.dilation_w;
-
-    return TensorCoord(n, p, q, filter_k_);
-  }
-  
-  /// Adds a pointer offset in units of element
-  CUTLASS_HOST_DEVICE
-  void add_byte_offset_(LongIndex byte_offset) {
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      pointer_[s] += byte_offset;
-    }
-  }
-  
-public:
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(Index index) {
-    iteration_vector_ = index % kAccessesPerVector;
-    int residual_access = index / kAccessesPerVector;
-    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    add_byte_offset_(pointer_offset * sizeof_bits<Element>::value / 8);
-  }
-
-  CUTLASS_HOST_DEVICE
-  void advance() { 
-
-    int next_idx = 0;
- 
-    // moves to the next tile
-    ++filter_s_;
-    if (filter_s_ == problem_size_.S) {
-      filter_s_ = 0;
-      ++filter_r_;
- 
-      if (filter_r_ < problem_size_.R) {
-        next_idx = 1;
-      }
-      else {
-        filter_r_ = 0;
-        next_idx = 2;
-      }
-    }
-    
-    add_byte_offset_(params_.inc_next[next_idx]);
-      
-    if (next_idx == 2) {  
-      filter_k_ += params_.filter_k_delta;
-    }
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
-      clear_mask(v_idx, (filter_k_ + v_idx * AccessType::kElements) >= problem_size_.K);
-    }
-  }
-
-  /// Clears the predicates
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool clear = true) {
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int v = 0; v < kAccessesPerVector; ++v) {
-        masks_[s][v][0] = clear ? Mask(0) : masks_[s][v][0];
-        masks_[s][v][1] = clear ? Mask(0) : masks_[s][v][1];
-      }
-    }
-  }
-
-  /// Clears the predicates
-  CUTLASS_HOST_DEVICE
-  void clear_mask(int v, bool clear = true) {
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      masks_[s][v][0] = clear ? Mask(0) : masks_[s][v][0];
-      masks_[s][v][1] = clear ? Mask(0) : masks_[s][v][1];
-    }
-  }
-
-  CUTLASS_HOST_DEVICE
-  bool valid() {
-
-    return 
-      (masks_[iteration_strided_][iteration_vector_][0] & (Index(1) << filter_r_)) &&
-      (masks_[iteration_strided_][iteration_vector_][1] & (Index(1) << filter_s_));
-  }
-
-  /// Returns a pointer to the vector starting at the current coordinate
-  CUTLASS_HOST_DEVICE
-  AccessType const *get() const {
-
-    return reinterpret_cast<AccessType const *>(pointer_[iteration_strided_]) + iteration_vector_;
-  }
-
-  /// Increments to the next memory access
-  CUTLASS_HOST_DEVICE
-  Conv2dDgradOutputGradientTileAccessIteratorOptimized &operator++() {
-    ++iteration_vector_;
-    if (iteration_vector_ < kAccessesPerVector) {
-      return *this;
-    }
-    iteration_vector_ = 0;
-
-    ++iteration_contiguous_;
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
-      return *this;
-    }
-    iteration_contiguous_ = 0;
-
-    ++iteration_strided_;
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      return *this;
-    }
-    iteration_strided_ = 0;
- 
-    return *this;
-  }
-
-  /// Determines whether the Implicit GEMM can execute the given problem.
-  CUTLASS_HOST_DEVICE
-  static Status can_implement(Conv2dProblemSize const &problem_size) {
-
-    // This is specialized for unit stride
-    if (problem_size.stride() != MatrixCoord({1, 1})) {
-      return Status::kErrorNotSupported;
-    }
-
-    // check alignment constraint on iterator's contiguous dimension
-    if (problem_size.K % AccessType::kElements) {
-      return Status::kErrorNotSupported;
-    }
-
-    // Limit on filter size
-    if (problem_size.R > 32 || problem_size.S > 32) {
-      return Status::kErrorNotSupported;
-    }
-    return Status::kSuccess;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h
deleted file mode 100755
index 9317ea0cd..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h
+++ /dev/null
@@ -1,332 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing loading of convolution tiles mapped to GEMM A (activation tile) 
-    matrix from memory.
-
-    This iterator assumes TensorNHWC or TensorNCxHWx<Interleave> layout of tensors in Global Memory.
-
-    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
-    backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-#include "cutlass/conv/threadblock/conv2d_params.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Shape_,
-  typename Element_,
-  typename Layout_,
-  typename ThreadMap_,
-  typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess>,
-  conv::GroupMode GroupMode_ = conv::GroupMode::kNone
->
-class Conv2dFpropActivationTileAccessIteratorAnalytic {
-public:
-  
-  //
-  // Types
-  //
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = Layout_;
-  using TensorCoord = typename Layout::TensorCoord;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-  using TensorRef = cutlass::TensorRef<Element, Layout>;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic;
-  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
-  static int const kConvDim = 2;
-  using ConvProblemSize = typename conv::Conv2dProblemSize;
-  static conv::GroupMode const kGroupMode = GroupMode_;
- 
-  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
-  
-  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), 
-    "Vectors implied by the thread map must be divisible by the access type.");
- 
-  //
-  // Simplifying assertions
-  //
-  static_assert(ThreadMap::Iterations::kContiguous == 1,
-    "Require Iterations::kContiguous == 1");
-
-  //
-  // Parameters structure
-  //
-
-  using Params = Conv2dAnalyticParams<Layout>;
-
-private:
-
-  Params const &params_;
-  Conv2dProblemSize const &problem_size_;
-  LongIndex iteration_contiguous_;
-  LongIndex iteration_strided_;
-  LongIndex iteration_vector_;
-  char const *pointer_;
-
-  int filter_c_;
-  int filter_r_;
-  int filter_s_;
-  int filter_c_init_;
-  int group_idx_offset_;
-  int channels_per_group_;
-  int crs_cnt_;
-  int crs_per_group_;
-
-  int offset_n_[ThreadMap::Iterations::kStrided];
-  int offset_p_[ThreadMap::Iterations::kStrided];
-  int offset_q_[ThreadMap::Iterations::kStrided];
-
-public:
-
-  CUTLASS_HOST_DEVICE
-  Conv2dFpropActivationTileAccessIteratorAnalytic(
-    Params const &params, 
-    Conv2dProblemSize const &problem_size,
-    Element const *ptr,
-    int thread_idx,
-    MatrixCoord const &threadblock_offset = MatrixCoord()       // tile index - units are threadblock-scoped tiles
-  ):
-    params_(params), 
-    problem_size_(problem_size), 
-    pointer_(reinterpret_cast<char const *>(ptr)), 
-    crs_cnt_(0),
-    group_idx_offset_(0),
-    filter_c_(0), 
-    filter_r_(0), 
-    filter_s_(0) {
-
-    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
-
-    filter_c_ = threadblock_offset.column() + thread_coord.contiguous();
-
-    if (kGroupMode != conv::GroupMode::kNone) {
-      filter_c_init_ = filter_c_;
-      channels_per_group_ = problem_size_.C / problem_size_.groups;
-      crs_per_group_ = problem_size_.S * problem_size_.R * ((channels_per_group_ + Shape::kColumn - 1) / Shape::kColumn);
-    }
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      int offset_npq = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
-    
-      offset_n_[s] = offset_npq / (problem_size_.P * problem_size_.Q);
-      int residual = offset_npq % (problem_size_.P * problem_size_.Q);
-
-      offset_p_[s] = residual / problem_size_.Q;
-      offset_q_[s] = residual % problem_size_.Q;
-    }
-
-    set_iteration_index(0);
-  }
-
-  CUTLASS_HOST_DEVICE
-  static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) {
-    return Params(problem_size, layout);
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(Index index) {
-    iteration_vector_ = index % kAccessesPerVector;
-    int residual_access = index / kAccessesPerVector;
-    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
-  }
-
-  CUTLASS_HOST_DEVICE
-  void advance() {
-    // moves to the next tile
-    if (kGroupMode != conv::GroupMode::kNone) {
-      ++crs_cnt_;
-    }
-
-    ++filter_s_;
-    if (filter_s_ < problem_size_.S) {
-      return;
-    }
-    filter_s_ = 0;
-    ++filter_r_;
-    if (filter_r_ < problem_size_.R) {
-      return;
-    }
-    filter_r_ = 0;
-
-    if (kGroupMode == conv::GroupMode::kNone) {
-      filter_c_ += Shape::kColumn * problem_size_.split_k_slices;
-    } else {
-      if (crs_cnt_ == crs_per_group_) {
-        // moves to next group
-        crs_cnt_ = 0;
-        ++group_idx_offset_;
-        filter_c_ = group_idx_offset_ * channels_per_group_ + filter_c_init_;
-      } else {
-        filter_c_ += Shape::kColumn * problem_size_.split_k_slices;
-      }
-    }
-  }
-
-  /// Returns the coordinate in the activations tensor X that is currently pointed to
-  /// by the iterator.
-  CUTLASS_HOST_DEVICE
-  TensorCoord at() const {
-    int n = offset_n_[iteration_strided_];
-    int p = offset_p_[iteration_strided_];
-    int q = offset_q_[iteration_strided_];
-
-    int r = filter_r_;
-    int s = filter_s_;
-
-    if (problem_size_.mode == Mode::kConvolution) {
-      r = (problem_size_.R - 1 - filter_r_);
-      s = (problem_size_.S - 1 - filter_s_);
-    }
-
-    int h = p * problem_size_.stride_h - problem_size_.pad_h + r * problem_size_.dilation_h;
-    int w = q * problem_size_.stride_w - problem_size_.pad_w + s * problem_size_.dilation_w;
-
-    int c = filter_c_ + iteration_vector_ * AccessType::kElements; 
-
-    return TensorCoord(n, h, w, c);
-  }
-
-  /// Returns true if the current coordinate is within the activations tensor X
-  CUTLASS_HOST_DEVICE
-  bool valid() const {
-
-    TensorCoord coord = at();
-
-    return coord.n() < problem_size_.N &&
-      coord.h() >= 0 && coord.h() < problem_size_.H &&
-      coord.w() >= 0 && coord.w() < problem_size_.W &&
-      coord.c() < problem_size_.C;
-  }
-
-  /// Returns a pointer to the vector starting at the current coordinate
-  CUTLASS_HOST_DEVICE
-  AccessType const *get() const {
-
-    TensorCoord coord = at();
-    LongIndex offset = params_.layout(coord);
-    
-    AccessType const *ptr = reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
-
-    return ptr;
-  }
-
-  /// Increments to the next memory access
-  CUTLASS_HOST_DEVICE
-  Conv2dFpropActivationTileAccessIteratorAnalytic &operator++() {
-    ++iteration_vector_;
-    if (iteration_vector_ < kAccessesPerVector) {
-      return *this;
-    }
-    iteration_vector_ = 0;
-
-    ++iteration_contiguous_;
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
-      return *this;
-    }
-    iteration_contiguous_ = 0;
-
-    ++iteration_strided_;
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      return *this;
-    }
-    iteration_strided_ = 0;
- 
-    return *this;
-  }
-
-  /// Determines whether the Implicit GEMM can execute the given problem.
-  CUTLASS_HOST_DEVICE
-  static Status can_implement(Conv2dProblemSize const &problem_size) {
-
-    // check alignment constraint on iterator's contiguous dimension
-    if ((problem_size.C / problem_size.groups) % AccessType::kElements) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    if (platform::is_same<Layout, layout::TensorNCxHWx<32>>::value) {
-      if (problem_size.C % 32) {
-        return Status::kErrorInvalidProblem;
-      }
-    }
-
-    if (platform::is_same<Layout, layout::TensorNCxHWx<64>>::value) {
-      if (problem_size.C % 64) {
-        return Status::kErrorInvalidProblem;
-      }
-    }
-
-    return Status::kSuccess;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h
deleted file mode 100755
index 5a4489c01..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h
+++ /dev/null
@@ -1,360 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing loading of convolution tiles mapped to GEMM A (activation tile)
-    matrix from memory.
-
-    This iterator assumes TensorNHWC or TensorNCxHWx<Interleave> layout of tensors in Global Memory.
-
-    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
-    backward data gradient (Dgrad), and backward weight gradient (Wgrad).
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-#include "cutlass/conv/threadblock/conv2d_params.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Shape_,
-  typename Element_,
-  typename Layout_,
-  typename ThreadMap_,
-  typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess>
->
-class Conv2dFpropActivationTileAccessIteratorFewChannels {
-public:
-
-  //
-  // Types
-  //
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = Layout_;
-  using TensorCoord = typename Layout::TensorCoord;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-  using TensorRef = cutlass::TensorRef<Element, Layout>;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kFewChannels;
-  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
-  static int const kConvDim = 2;
-  using ConvProblemSize = typename conv::Conv2dProblemSize;
-
-  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
-  static int const kPositionsPerTile = Shape::kColumn;
-
-  static int const kAccessesPerVector = kElementsPerAccess / AccessType::kElements;
-
-  static bool const kUseFastDivmodPrologue = true;
-  static bool const kUseFastDivmodMainloop = true;
-
-  static int const kStrideH = 0;
-  static int const kStrideW = 0;
-  static int const kDilationH = 0;
-  static int const kDilationW = 0;
-
-  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
-    "Vectors implied by the thread map must be divisible by the access type.");
-
-  //
-  // Simplifying assertions
-  //
-  static_assert(ThreadMap::Iterations::kContiguous == 1,
-    "Require Iterations::kContiguous == 1");
-
-  //
-  // Parameters structure
-  //
-
-  using Params = Conv2dFewChannelsParams<Layout>;
-
-private:
-
-  Params const &params_;
-  Conv2dProblemSize const &problem_size_;
-  LongIndex iteration_contiguous_;
-  LongIndex iteration_strided_;
-  LongIndex iteration_vector_;
-  char const *pointer_;
-
-  int rsc_index_;
-  int offset_n_[ThreadMap::Iterations::kStrided];
-  int offset_p_[ThreadMap::Iterations::kStrided];
-  int offset_q_[ThreadMap::Iterations::kStrided];
-
-public:
-
-  CUTLASS_HOST_DEVICE
-  Conv2dFpropActivationTileAccessIteratorFewChannels(
-    Params const &params,
-    Conv2dProblemSize const &problem_size,
-    Element const *ptr,
-    int thread_idx,
-    MatrixCoord const &threadblock_offset = MatrixCoord()       // tile index - units are threadblock-scoped tiles
-  ):
-    params_(params),
-    problem_size_(problem_size),
-    pointer_(reinterpret_cast<char const *>(ptr)),
-    rsc_index_(0) {
-
-    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
-
-    rsc_index_ = (threadblock_offset.column() + thread_coord.contiguous());
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      int offset_npq = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
-
-      if (kUseFastDivmodPrologue) {
-        int residual = params_.divmod_Q.divmod(offset_q_[s], offset_npq);
-        offset_n_[s] = params_.divmod_P.divmod(offset_p_[s], residual);
-      }
-      else {
-        offset_n_[s] = offset_npq / (problem_size_.P * problem_size_.Q);
-        int residual = offset_npq % (problem_size_.P * problem_size_.Q);
-
-        offset_p_[s] = residual / problem_size_.Q;
-        offset_q_[s] = residual % problem_size_.Q;
-      }
-    }
-
-    set_iteration_index(0);
-  }
-
-  CUTLASS_HOST_DEVICE
-  static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) {
-    return Params(problem_size, layout);
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(Index index) {
-    iteration_vector_ = index % kAccessesPerVector;
-    int residual_access = index / kAccessesPerVector;
-    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
-  }
-
-  CUTLASS_HOST_DEVICE
-  void advance() {
-
-    rsc_index_ += kPositionsPerTile * problem_size_.split_k_slices;
-  }
-
-  /// Returns the coordinate in the activations tensor X that is currently pointed to
-  /// by the iterator.
-  CUTLASS_HOST_DEVICE
-  TensorCoord at() const {
-    int n = offset_n_[iteration_strided_];
-    int p = offset_p_[iteration_strided_];
-    int q = offset_q_[iteration_strided_];
-
-    int rsc_index = rsc_index_ + iteration_vector_ * AccessType::kElements;
-
-    int r = 0;
-    int s = 0;
-    int c = 0;
-
-    if (kUseFastDivmodMainloop) {
-      int rs_index = params_.divmod_C.divmod(c, rsc_index);
-      r = params_.divmod_S.divmod(s, rs_index);
-    }
-    else {
-      c = (rsc_index % problem_size_.C);
-
-      int rs_index = (rsc_index / problem_size_.C);
-      s = (rs_index % problem_size_.S);
-      r = (rs_index / problem_size_.S);
-    }
-
-    if (problem_size_.mode == Mode::kConvolution) {
-      r = (problem_size_.R - 1 - r);
-      s = (problem_size_.S - 1 - s);
-    }
-
-    int stride_h = kStrideH;
-    if (!kStrideH) {
-      stride_h = problem_size_.stride_h;
-    }
-
-    int stride_w = kStrideW;
-    if (!kStrideW) {
-      stride_w = problem_size_.stride_w;
-    }
-
-    int dilation_h = kDilationH;
-    if (!kDilationH) {
-      dilation_h = problem_size_.dilation_h;
-    }
-
-    int dilation_w = kDilationW;
-    if (!kDilationW) {
-      dilation_w = problem_size_.dilation_w;
-    }
-
-    int h = p * stride_h - problem_size_.pad_h + r * dilation_h;
-    int w = q * stride_w - problem_size_.pad_w + s * dilation_w;
-
-    return TensorCoord(n, h, w, c);
-  }
-
-  /// Returns true if the current coordinate is within the activations tensor X
-  CUTLASS_HOST_DEVICE
-  bool valid() const {
-
-    TensorCoord coord = at();
-
-    bool in_bounds =
-      coord.n() < problem_size_.N &&
-      coord.h() >= 0 && coord.h() < problem_size_.H &&
-      coord.w() >= 0 && coord.w() < problem_size_.W &&
-      coord.c() < problem_size_.C;
-
-    return in_bounds;
-  }
-
-  /// Returns a pointer to the vector starting at the current coordinate
-  CUTLASS_HOST_DEVICE
-  AccessType const *get() const {
-
-    TensorCoord coord = at();
-
-    int32_t offset =
-      coord.n() * params_.stride_n +
-      coord.h() * params_.stride_h +
-      coord.w() * params_.stride_w +
-      coord.c();
-
-    AccessType const *ptr = reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
-
-    return ptr;
-  }
-
-  /// Increments to the next memory access
-  CUTLASS_HOST_DEVICE
-  Conv2dFpropActivationTileAccessIteratorFewChannels &operator++() {
-    ++iteration_vector_;
-    if (iteration_vector_ < kAccessesPerVector) {
-      return *this;
-    }
-    iteration_vector_ = 0;
-
-    ++iteration_contiguous_;
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
-      return *this;
-    }
-    iteration_contiguous_ = 0;
-
-    ++iteration_strided_;
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      return *this;
-    }
-    iteration_strided_ = 0;
-
-    return *this;
-  }
-
-  /// Determines whether the Implicit GEMM can execute the given problem.
-  CUTLASS_HOST_DEVICE
-  static Status can_implement(Conv2dProblemSize const &problem_size) {
-
-    // check alignment constraint on iterator's contiguous dimension
-    if (problem_size.C % AccessType::kElements) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    if (kDilationH && problem_size.dilation_h != kDilationH) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    if (kDilationW && problem_size.dilation_w != kDilationW) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    if (kStrideH && problem_size.stride_h != kStrideH) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    if (kStrideW && problem_size.stride_w != kStrideW) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    if (platform::is_same<Layout, layout::TensorNCxHWx<32>>::value) {
-      if (problem_size.C % 32) {
-        return Status::kErrorInvalidProblem;
-      }
-    }
-
-    if (platform::is_same<Layout, layout::TensorNCxHWx<64>>::value) {
-      if (problem_size.C % 64) {
-        return Status::kErrorInvalidProblem;
-      }
-    }
-
-    return Status::kSuccess;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h
deleted file mode 100755
index 3f1f2bc14..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h
+++ /dev/null
@@ -1,353 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing loading of convolution tiles mapped to GEMM A (activation tile)
-    matrix from memory.
-
-    This iterator assumes TensorNHWC or TensorNCxHWx<Interleave> layout of tensors in Global Memory.
-
-    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
-    backward data gradient (Dgrad), and backward weight gradient (Wgrad).
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-#include "cutlass/conv/threadblock/conv2d_params.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Shape_,
-  typename Element_,
-  typename Layout_,
-  typename ThreadMap_,
-  typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess>
->
-class Conv2dFpropActivationTileAccessIteratorFixedChannels {
-public:
-
-  //
-  // Types
-  //
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = Layout_;
-  using TensorCoord = typename Layout::TensorCoord;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-  using TensorRef = cutlass::TensorRef<Element, Layout>;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kFixedChannels;
-  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
-  static int const kConvDim = 2;
-  using ConvProblemSize = typename conv::Conv2dProblemSize;
-
-  static int const kFilterPositionsPerTile = Shape::kColumn / AccessType::kElements;
-
-  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
-
-  static bool const kUseFastDivmodPrologue = true;
-  static bool const kUseFastDivmodMainloop = true;
-
-  static int const kStrideH = 0;
-  static int const kStrideW = 0;
-  static int const kDilationH = 0;
-  static int const kDilationW = 0;
-
-  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
-    "Vectors implied by the thread map must be divisible by the access type.");
-
-  //
-  // Simplifying assertions
-  //
-  static_assert(ThreadMap::Iterations::kContiguous == 1,
-    "Require Iterations::kContiguous == 1");
-
-  //
-  // Parameters structure
-  //
-
-  using Params = Conv2dFewChannelsParams<Layout>;
-
-private:
-
-  Params const &params_;
-  Conv2dProblemSize const &problem_size_;
-  LongIndex iteration_contiguous_;
-  LongIndex iteration_strided_;
-  LongIndex iteration_vector_;
-  char const *pointer_;
-
-  int rs_index_;
-  int offset_n_[ThreadMap::Iterations::kStrided];
-  int offset_p_[ThreadMap::Iterations::kStrided];
-  int offset_q_[ThreadMap::Iterations::kStrided];
-
-public:
-
-  CUTLASS_HOST_DEVICE
-  Conv2dFpropActivationTileAccessIteratorFixedChannels(
-    Params const &params,
-    Conv2dProblemSize const &problem_size,
-    Element const *ptr,
-    int thread_idx,
-    MatrixCoord const &threadblock_offset = MatrixCoord()       // tile index - units are threadblock-scoped tiles
-  ):
-    params_(params),
-    problem_size_(problem_size),
-    pointer_(reinterpret_cast<char const *>(ptr)),
-    rs_index_(0) {
-
-    //
-    // This requires problem_size.C == AccessType::kElements
-    //
-
-    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
-
-    rs_index_ = (threadblock_offset.column() + thread_coord.contiguous()) / AccessType::kElements;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      int offset_npq = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
-
-      if (kUseFastDivmodPrologue) {
-        int residual = params_.divmod_Q.divmod(offset_q_[s], offset_npq);
-        offset_n_[s] = params_.divmod_P.divmod(offset_p_[s], residual);
-      }
-      else {
-        offset_n_[s] = offset_npq / (problem_size_.P * problem_size_.Q);
-        int residual = offset_npq % (problem_size_.P * problem_size_.Q);
-
-        offset_p_[s] = residual / problem_size_.Q;
-        offset_q_[s] = residual % problem_size_.Q;
-      }
-    }
-
-    set_iteration_index(0);
-  }
-
-  CUTLASS_HOST_DEVICE
-  static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) {
-    return Params(problem_size, layout);
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(Index index) {
-    iteration_vector_ = index % kAccessesPerVector;
-    int residual_access = index / kAccessesPerVector;
-    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
-  }
-
-  CUTLASS_HOST_DEVICE
-  void advance() {
-
-    rs_index_ += kFilterPositionsPerTile * problem_size_.split_k_slices;
-  }
-
-  /// Returns the coordinate in the activations tensor X that is currently pointed to
-  /// by the iterator.
-  CUTLASS_HOST_DEVICE
-  TensorCoord at() const {
-    int n = offset_n_[iteration_strided_];
-    int p = offset_p_[iteration_strided_];
-    int q = offset_q_[iteration_strided_];
-
-    int rs_index = rs_index_ + iteration_vector_;
-
-    int r = 0;
-    int s = 0;
-
-    if (kUseFastDivmodMainloop) {
-      r = params_.divmod_S.divmod(s, rs_index);
-    }
-    else {
-      s = (rs_index % problem_size_.S);
-      r = (rs_index / problem_size_.S);
-    }
-
-    if (problem_size_.mode == Mode::kConvolution) {
-      r = (problem_size_.R - 1 - r);
-      s = (problem_size_.S - 1 - s);
-    }
-
-    int stride_h = kStrideH;
-    if (!kStrideH) {
-      stride_h = problem_size_.stride_h;
-    }
-
-    int stride_w = kStrideW;
-    if (!kStrideW) {
-      stride_w = problem_size_.stride_w;
-    }
-
-    int dilation_h = kDilationH;
-    if (!kDilationH) {
-      dilation_h = problem_size_.dilation_h;
-    }
-
-    int dilation_w = kDilationW;
-    if (!kDilationW) {
-      dilation_w = problem_size_.dilation_w;
-    }
-
-    int h = p * stride_h - problem_size_.pad_h + r * dilation_h;
-    int w = q * stride_w - problem_size_.pad_w + s * dilation_w;
-
-    return TensorCoord(n, h, w, 0);
-  }
-
-  /// Returns true if the current coordinate is within the activations tensor X
-  CUTLASS_HOST_DEVICE
-  bool valid() const {
-
-    TensorCoord coord = at();
-
-    return coord.n() < problem_size_.N &&
-      coord.h() >= 0 && coord.h() < problem_size_.H &&
-      coord.w() >= 0 && coord.w() < problem_size_.W;
-  }
-
-  /// Returns a pointer to the vector starting at the current coordinate
-  CUTLASS_HOST_DEVICE
-  AccessType const *get() const {
-
-    TensorCoord coord = at();
-
-    int32_t offset =
-      coord.n() * params_.stride_n +
-      coord.h() * params_.stride_h +
-      coord.w() * params_.stride_w + coord.c();
-
-    AccessType const *ptr = reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
-
-    return ptr;
-  }
-
-  /// Increments to the next memory access
-  CUTLASS_HOST_DEVICE
-  Conv2dFpropActivationTileAccessIteratorFixedChannels &operator++() {
-    ++iteration_vector_;
-    if (iteration_vector_ < kAccessesPerVector) {
-      return *this;
-    }
-    iteration_vector_ = 0;
-
-    ++iteration_contiguous_;
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
-      return *this;
-    }
-    iteration_contiguous_ = 0;
-
-    ++iteration_strided_;
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      return *this;
-    }
-    iteration_strided_ = 0;
-
-    return *this;
-  }
-
-  /// Determines whether the Implicit GEMM can execute the given problem.
-  CUTLASS_HOST_DEVICE
-  static Status can_implement(Conv2dProblemSize const &problem_size) {
-
-    // check alignment constraint on iterator's contiguous dimension
-    if (problem_size.C != AccessType::kElements) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    if (kDilationH && problem_size.dilation_h != kDilationH) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    if (kDilationW && problem_size.dilation_w != kDilationW) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    if (kStrideH && problem_size.stride_h != kStrideH) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    if (kStrideW && problem_size.stride_w != kStrideW) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    if (platform::is_same<Layout, layout::TensorNCxHWx<32>>::value) {
-      if (problem_size.C % 32) {
-        return Status::kErrorInvalidProblem;
-      }
-    }
-
-    if (platform::is_same<Layout, layout::TensorNCxHWx<64>>::value) {
-      if (problem_size.C % 64) {
-        return Status::kErrorInvalidProblem;
-      }
-    }
-
-    return Status::kSuccess;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h
deleted file mode 100755
index 243d724b3..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h
+++ /dev/null
@@ -1,422 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing loading of convolution tiles mapped to GEMM A (activation tile) 
-    matrix from memory.
-
-    This iterator assumes TensorNHWC or TensorNCxHWx<Interleave> layout of tensors in Global Memory.
-    
-    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
-    backward data gradient (Dgrad), and backward weight gradient (Wgrad).
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-#include "cutlass/conv/threadblock/conv2d_params.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Shape_,
-  typename Element_,
-  typename Layout_,
-  typename ThreadMap_,
-  typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess>
->
-class Conv2dFpropActivationTileAccessIteratorOptimized {
-public:
-  
-  //
-  // Types
-  //
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = Layout_;
-  using TensorCoord = typename Layout::TensorCoord;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-  using TensorRef = cutlass::TensorRef<Element, Layout>;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized;
-  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
-  static int const kConvDim = 2;
-  using ConvProblemSize = typename conv::Conv2dProblemSize;
-  
-  using Mask = uint64_t;
-
-  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
-  
-  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), 
-    "Vectors implied by the thread map must be divisible by the access type.");
-
-  //
-  // Simplifying assertions
-  //
-  static_assert(ThreadMap::Iterations::kContiguous == 1,
-    "Require Iterations::kContiguous == 1");
-
-  //
-  // Parameters structure
-  //
-
-  using Params = Conv2dFpropActivationIteratorOptimizedParams<Layout>;
-
-private:
-
-  Params const &params_;
-  Conv2dProblemSize const &problem_size_;
-  LongIndex iteration_contiguous_;
-  LongIndex iteration_strided_;
-  LongIndex iteration_vector_;
-
-  // One pointer per access
-  char const *pointer_[ThreadMap::Iterations::kStrided];
-
-  // current filter position (r, s)
-  int filter_r_;
-  int filter_s_;
-  int filter_c_;
-
-  Index masks_[ThreadMap::Iterations::kStrided][kAccessesPerVector][2];
-
-public:
-
-  CUTLASS_HOST_DEVICE
-  Conv2dFpropActivationTileAccessIteratorOptimized(
-    Params const &params,
-    Conv2dProblemSize const &problem_size,
-    Element const *ptr,
-    int thread_idx,
-    MatrixCoord const &threadblock_offset = MatrixCoord()       // tile index - units are threadblock-scoped tiles
-  ):
-    params_(params), 
-    problem_size_(problem_size),
-    filter_c_(0), 
-    filter_r_(0), 
-    filter_s_(0) {
-
-    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
-
-    filter_c_ = threadblock_offset.column() + thread_coord.contiguous();
-
-    int offset_n[ThreadMap::Iterations::kStrided];
-    int offset_p[ThreadMap::Iterations::kStrided];
-    int offset_q[ThreadMap::Iterations::kStrided];
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-
-      pointer_[s] = reinterpret_cast<char const *>(ptr);
- 
-      int offset_npq = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
-
-      // The subseqnet fast_divmod() operations are equivalent to the following logical computation:
-      //
-      //
-      //  offset_n[s] = offset_npq / (problem_size_.P * problem_size_.Q);
-      //  int residual = offset_npq % (problem_size_.P * problem_size_.Q);
-      //
-      //  offset_p[s] = residual / problem_size_.Q;
-      //  offset_q[s] = residual % problem_size_.Q;
-      //
-
-      int residual;
-
-      params.pq_divmod(offset_n[s], residual, offset_npq);
-      params.q_divmod(offset_p[s], offset_q[s], residual);
-
-      TensorCoord coord = at_(offset_n[s], offset_p[s], offset_q[s], 0, 0);
-
-      pointer_[s] += params_.layout(coord) * sizeof_bits<Element>::value / 8;
-    }
-
-    clear_mask();
-
-    CUTLASS_PRAGMA_NO_UNROLL
-    for (int r = 0; r < problem_size_.R; ++r) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) {
-
-        int r_ = r;
-        if (problem_size_.mode == Mode::kConvolution) {
-          r_ = problem_size_.R - 1 - r;
-        }
-
-        int h = offset_p[s_idx] * problem_size_.stride_h - problem_size_.pad_h + r_ * problem_size_.dilation_h;
-
-        bool pred = (offset_n[s_idx] < problem_size_.N && h >= 0 && h < problem_size_.H);
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
-          masks_[s_idx][v_idx][0] |= (pred << r);
-        }
-      }
-    }
-
-    CUTLASS_PRAGMA_NO_UNROLL
-    for (int s = 0; s < problem_size_.S; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) {
-
-        int s_ = s;
-        if (problem_size_.mode == Mode::kConvolution) {
-          s_ = problem_size_.S - 1 - s;
-        }
-
-        int w = offset_q[s_idx] * problem_size_.stride_w - problem_size_.pad_w + s_ * problem_size_.dilation_w;
-
-        bool pred = (w >= 0 && w < problem_size_.W);
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
-          masks_[s_idx][v_idx][1] |= (pred << s);
-        }
-      }
-    }
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
-      clear_mask(v_idx, filter_c_ + v_idx * AccessType::kElements >= problem_size_.C);
-    }
-
-    set_iteration_index(0);
-  }
-
-  CUTLASS_HOST_DEVICE
-  static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) {
-    return Params(problem_size,
-                  layout,
-                  sizeof_bits<Element>::value,
-                  {Shape::kRow, Shape::kColumn},
-                  ThreadMap::kThreads,
-                  ThreadMap::kElementsPerAccess,
-                  {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided},
-                  {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided});
-  }
-
-private:
-
-  /// Returns the coordinate in the activations tensor X that is correspoinding to 
-  // output npq and filter position r, s
-  CUTLASS_HOST_DEVICE
-  TensorCoord at_(int n, int p, int q, int r, int s) const {
-
-    if (problem_size_.mode == Mode::kConvolution) {
-      r = problem_size_.R - 1 - r;
-      s = problem_size_.S - 1 - s;
-    }
-
-    int h = p * problem_size_.stride_h - problem_size_.pad_h + r * problem_size_.dilation_h;
-    int w = q * problem_size_.stride_w - problem_size_.pad_w + s * problem_size_.dilation_w;
-
-    return TensorCoord(n, h, w, filter_c_);
-  }
-  
-  /// Adds a pointer offset in units of element
-  CUTLASS_HOST_DEVICE
-  void add_byte_offset_(LongIndex byte_offset) {
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      pointer_[s] += byte_offset;
-    }
-  }
-
-public:
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(Index index) {
-    iteration_vector_ = index % kAccessesPerVector;
-    int residual_access = index / kAccessesPerVector;
-
-    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    add_byte_offset_(pointer_offset * sizeof_bits<Element>::value / 8);
-  }
-
-  CUTLASS_HOST_DEVICE
-  void advance() { 
-
-    int next_idx = 0;
- 
-    // moves to the next tile
-    ++filter_s_;
-    if (filter_s_ == problem_size_.S) {
-      filter_s_ = 0;
-      ++filter_r_;
- 
-      if (filter_r_ < problem_size_.R) {
-        next_idx = 1;
-      }
-      else {
-        filter_r_ = 0;
-        next_idx = 2;
-      }
-    }
-    
-    add_byte_offset_(params_.inc_next[next_idx]);
-      
-    if (next_idx == 2) {  
-      filter_c_ += params_.filter_c_delta;
-    }
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
-      clear_mask(v_idx, filter_c_ + v_idx * AccessType::kElements >= problem_size_.C);
-    }
-  }
-   
-  /// Clears the predicates
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool clear = true) {
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int v = 0; v < kAccessesPerVector; ++v) {
-        masks_[s][v][0] = clear ? 0 : masks_[s][v][0];
-        masks_[s][v][1] = clear ? 0 : masks_[s][v][1];
-      }
-    }
-  } 
-   
-  /// Clears the predicates
-  CUTLASS_HOST_DEVICE
-  void clear_mask(int v, bool clear = true) {
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      masks_[s][v][0] = clear ? 0 : masks_[s][v][0];
-      masks_[s][v][1] = clear ? 0 : masks_[s][v][1];
-    }
-  }
-
-  CUTLASS_HOST_DEVICE
-  bool valid() {
-
-    return 
-      (masks_[iteration_strided_][iteration_vector_][0] & (Index(1) << filter_r_)) &&
-      (masks_[iteration_strided_][iteration_vector_][1] & (Index(1) << filter_s_));
-  }
-
-  /// Returns a pointer to the vector starting at the current coordinate
-  CUTLASS_HOST_DEVICE
-  AccessType const *get() const {
-
-    return reinterpret_cast<AccessType const *>(pointer_[iteration_strided_]) + iteration_vector_;
-  }
-
-  /// Increments to the next memory access
-  CUTLASS_HOST_DEVICE
-  Conv2dFpropActivationTileAccessIteratorOptimized &operator++() {
-
-    ++iteration_vector_;
-    if (iteration_vector_ < kAccessesPerVector) {
-      return *this;
-    }
-    iteration_vector_ = 0;
-
-    ++iteration_contiguous_;
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
-      return *this;
-    }
-    iteration_contiguous_ = 0;
-
-    ++iteration_strided_;
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      return *this;
-    }
-    iteration_strided_ = 0;
- 
-    return *this;
-  }
-
-  /// Determines whether the Implicit GEMM can execute the given problem.
-  CUTLASS_HOST_DEVICE
-  static Status can_implement(Conv2dProblemSize const &problem_size) {
-
-    // check alignment constraint on iterator's contiguous dimension
-    if ((problem_size.C / problem_size.groups) % AccessType::kElements) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    if (platform::is_same<Layout, layout::TensorNCxHWx<32>>::value) {
-      if (problem_size.C % 32) {
-        return Status::kErrorInvalidProblem;
-      }
-    }
-
-    if (platform::is_same<Layout, layout::TensorNCxHWx<64>>::value) {
-      if (problem_size.C % 64) {
-        return Status::kErrorInvalidProblem;
-      }
-    }
-
-    // Conv2dFpropActivationTileAccessIteratorOptimized has constraint on filter positions 
-    // due to the number of mask bits.
-    if (problem_size.R > 32 || problem_size.S > 32) {
-      return Status::kErrorNotSupported;
-    }
-    return Status::kSuccess;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h
deleted file mode 100755
index 1725db5af..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h
+++ /dev/null
@@ -1,330 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) 
-    matrix from memory.
-
-    This iterator assumes TensorNHWC or TensorCxRSKx<Interleave> layout of tensors in Global Memory.
-
-    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
-    backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-#include "cutlass/conv/threadblock/conv2d_params.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Shape_,
-  typename Element_,
-  typename Layout_,
-  typename ThreadMap_,
-  typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess>,
-  conv::GroupMode GroupMode_ = conv::GroupMode::kNone,
-  bool IsDeconv_ = false
->
-class Conv2dFpropFilterTileAccessIteratorAnalytic {
-public:
-  
-  //
-  // Types
-  //
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = Layout_;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-  using TensorRef = cutlass::TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  static bool const IsDeconv = IsDeconv_;
-  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic;
-  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
-  static int const kConvDim = 2;
-  using ConvProblemSize = typename conv::Conv2dProblemSize;
-  static conv::GroupMode const kGroupMode = GroupMode_;
- 
-  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
-  
-  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), 
-    "Vectors implied by the thread map must be divisible by the access type.");
-
-  //
-  // Simplifying assertions
-  //
-  static_assert(ThreadMap::Iterations::kContiguous == 1,
-    "Require Iterations::kContiguous == 1");
-
-  //
-  // Parameters structure
-  //
-
-  using Params = Conv2dAnalyticParams<Layout>;
-
-private:
-
-  Params const &params_;
-  Conv2dProblemSize const &problem_size_;
-  LongIndex iteration_contiguous_;
-  LongIndex iteration_strided_;
-  LongIndex iteration_vector_;
-  char const *pointer_;
-
-  int filter_r_;
-  int filter_s_;
-  int filter_c_;
-  int filter_c_init_;
-  int crs_cnt_;
-  int crs_per_group_;  
-  int group_idx_offset_c_;
-  int channels_per_group_;
-
-  int offset_k_[ThreadMap::Iterations::kStrided];
-  int group_idx_offset_k_[ThreadMap::Iterations::kStrided];
-
-public:
-
-  CUTLASS_HOST_DEVICE
-  Conv2dFpropFilterTileAccessIteratorAnalytic(
-    Params const &params, 
-    Conv2dProblemSize const &problem_size,
-    Element const *ptr,
-    int thread_idx,
-    MatrixCoord const &threadblock_offset = MatrixCoord()
-  ):
-    params_(params), 
-    problem_size_(problem_size), 
-    pointer_(reinterpret_cast<char const *>(ptr)), 
-    crs_cnt_(0),
-    group_idx_offset_c_(0),
-    filter_r_(0),
-    filter_s_(0),
-    filter_c_(0) {
-
-    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
-
-    filter_c_ = threadblock_offset.row() + thread_coord.contiguous();
-
-    auto input_channels = (IsDeconv ? problem_size_.K : problem_size_.C);
-    auto output_channels = (IsDeconv ? problem_size_.C : problem_size_.K);
-
-    if (kGroupMode != conv::GroupMode::kNone) {
-      filter_c_init_ = filter_c_;
-      if (kGroupMode == conv::GroupMode::kDepthwise){
-        channels_per_group_ = 1;
-        crs_per_group_ = problem_size_.S * problem_size_.R;
-      } else {
-        channels_per_group_ = input_channels / problem_size_.groups;
-        crs_per_group_ = problem_size_.S * problem_size_.R * ((channels_per_group_ + Shape::kRow - 1) / Shape::kRow);
-      }
-    }
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      offset_k_[s] = threadblock_offset.column() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
-      if (kGroupMode != conv::GroupMode::kNone && kGroupMode != conv::GroupMode::kDepthwise) {
-        group_idx_offset_k_[s] = (thread_coord.strided() + s * ThreadMap::Delta::kStrided) / (output_channels / problem_size_.groups);
-      }
-    }
-
-    set_iteration_index(0);
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(Index index) {
-    iteration_vector_ = index % kAccessesPerVector;
-    int residual_access = index / kAccessesPerVector;
-    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    pointer_ += pointer_offset * 8 / sizeof_bits<Element>::value;
-  }
-
-  CUTLASS_HOST_DEVICE
-  void advance() {
-    // moves to the next tile
-    if (kGroupMode != conv::GroupMode::kNone) {
-      ++crs_cnt_;
-    }
-
-    ++filter_s_;
-    if (filter_s_ < problem_size_.S) {
-      return;
-    }
-    filter_s_ = 0;
-    
-    ++filter_r_;
-    if (filter_r_ < problem_size_.R) {
-      return;
-    }
-    filter_r_ = 0;
-
-    if (kGroupMode == conv::GroupMode::kNone) {
-      filter_c_ += Shape::kRow * problem_size_.split_k_slices;
-    } else {
-      if (crs_cnt_ == crs_per_group_) {
-        crs_cnt_ = 0;
-        filter_c_ = filter_c_init_;
-        if (kGroupMode != conv::GroupMode::kDepthwise) {
-          // moves to next group
-          ++group_idx_offset_c_;
-        }
-      } else {
-        filter_c_ += Shape::kRow * problem_size_.split_k_slices;
-      }
-    }
-  }
-
-  /// Returns the coordinate in the filter tensor W that is currently pointed to
-  /// by the iterator.
-  CUTLASS_HOST_DEVICE
-  TensorCoord at() const {
-
-    int k = offset_k_[iteration_strided_];
-    int c = filter_c_ + iteration_vector_ * AccessType::kElements;
-
-    return TensorCoord(k, filter_r_, filter_s_, c);
-  }
-
-  /// Returns true if the current coordinate is within the activations tensor W
-  CUTLASS_HOST_DEVICE
-  bool valid() const {
-
-    TensorCoord coord = at();
-
-    auto input_channels = (IsDeconv ? problem_size_.K : problem_size_.C);
-    auto output_channels = (IsDeconv ? problem_size_.C : problem_size_.K);
-
-    if (kGroupMode == conv::GroupMode::kNone) {
-      return coord.n() < output_channels && coord.c() < input_channels;
-    } else if (kGroupMode == conv::GroupMode::kDepthwise) {
-      return coord.n() < output_channels && coord.c() < 1; // channels_per_group_ is always equal to ONE.
-    } else {
-      return coord.n() < output_channels && coord.c() < channels_per_group_ &&
-             group_idx_offset_c_ == group_idx_offset_k_[iteration_strided_];
-    }
-  }
-
-  /// Returns a pointer to the vector starting at the current coordinate
-  CUTLASS_HOST_DEVICE
-  AccessType const *get() const {
-
-    TensorCoord coord = at();
-    LongIndex offset = params_.layout(coord);
-    
-    return reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
-  }
-
-  /// Increments to the next memory access
-  CUTLASS_HOST_DEVICE
-  Conv2dFpropFilterTileAccessIteratorAnalytic &operator++() {
-    ++iteration_vector_;
-    if (iteration_vector_ < kAccessesPerVector) {
-      return *this;
-    }
-    iteration_vector_ = 0;
-
-    ++iteration_contiguous_;
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
-      return *this;
-    }
-    iteration_contiguous_ = 0;
-    
-    ++iteration_strided_;
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      return *this;
-    }
-    iteration_strided_ = 0;
- 
-    return *this;
-  }
-
-  /// Determines whether the Implicit GEMM can execute the given problem.
-  CUTLASS_HOST_DEVICE
-  static Status can_implement(Conv2dProblemSize const &problem_size) {
-
-    auto input_channels = (IsDeconv ? problem_size.K : problem_size.C);
-    auto output_channels = (IsDeconv ? problem_size.C : problem_size.K);
-
-    // check alignment constraint on iterator's contiguous dimension
-    if ((input_channels / problem_size.groups) % AccessType::kElements) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    if (platform::is_same<Layout, layout::TensorCxRSKx<32>>::value) {
-      if (output_channels % 32) {
-        return Status::kErrorInvalidProblem;
-      }
-    }
-
-    if (platform::is_same<Layout, layout::TensorCxRSKx<64>>::value) {
-      if (output_channels % 64) {
-        return Status::kErrorInvalidProblem;
-      }
-    }
-
-    return Status::kSuccess;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_few_channels.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_few_channels.h
deleted file mode 100755
index a1291aa01..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_few_channels.h
+++ /dev/null
@@ -1,289 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile)
-    matrix from memory.
-
-    This iterator assumes TensorNHWC or TensorCxRSKx<Interleave> layout of tensors in Global Memory.
-
-    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
-    backward data gradient (Dgrad), and backward weight gradient (Wgrad).
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-#include "cutlass/conv/threadblock/conv2d_params.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Shape_,
-  typename Element_,
-  typename Layout_,
-  typename ThreadMap_,
-  typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess>
->
-class Conv2dFpropFilterTileAccessIteratorFewChannels {
-public:
-
-  //
-  // Types
-  //
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = Layout_;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-  using TensorRef = cutlass::TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kFewChannels;
-  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
-  static int const kConvDim = 2;
-  using ConvProblemSize = typename conv::Conv2dProblemSize;
-
-  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
-  static int const kPositionsPerTile = Shape::kRow;
-
-  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
-
-  static bool const kUseFastDivmodPrologue = true;
-  static bool const kUseFastDivmodMainloop = true;
-
-  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
-    "Vectors implied by the thread map must be divisible by the access type.");
-
-  //
-  // Simplifying assertions
-  //
-  static_assert(ThreadMap::Iterations::kContiguous == 1,
-    "Require Iterations::kContiguous == 1");
-
-  //
-  // Parameters structure
-  //
-
-  using Params = Conv2dFewChannelsParams<Layout>;
-
-private:
-
-  Params const &params_;
-  Conv2dProblemSize const &problem_size_;
-  LongIndex iteration_contiguous_;
-  LongIndex iteration_strided_;
-  LongIndex iteration_vector_;
-  char const *pointer_;
-
-  int rsc_index_;
-
-  int offset_k_[ThreadMap::Iterations::kStrided];
-
-public:
-
-  CUTLASS_HOST_DEVICE
-  Conv2dFpropFilterTileAccessIteratorFewChannels(
-    Params const &params,
-    Conv2dProblemSize const &problem_size,
-    Element const *ptr,
-    int thread_idx,
-    MatrixCoord const &threadblock_offset = MatrixCoord()
-  ):
-    params_(params),
-    problem_size_(problem_size),
-    pointer_(reinterpret_cast<char const *>(ptr)),
-    rsc_index_(0) {
-
-    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
-
-    rsc_index_ = (threadblock_offset.row() + thread_coord.contiguous());
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      offset_k_[s] = threadblock_offset.column() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
-    }
-
-    set_iteration_index(0);
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(Index index) {
-    iteration_vector_ = index % kAccessesPerVector;
-    int residual_access = index / kAccessesPerVector;
-    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    pointer_ += pointer_offset * 8 / sizeof_bits<Element>::value;
-  }
-
-  CUTLASS_HOST_DEVICE
-  void advance() {
-    // moves to the next tile
-    rsc_index_ += kPositionsPerTile * problem_size_.split_k_slices;
-  }
-
-  /// Returns the coordinate in the filter tensor W that is currently pointed to
-  /// by the iterator.
-  CUTLASS_HOST_DEVICE
-  TensorCoord at() const {
-
-    int rsc_index = rsc_index_ + iteration_vector_ * AccessType::kElements;
-
-    int c = 0;
-    int s = 0;
-    int r = 0;
-
-    if (kUseFastDivmodMainloop) {
-      int rs_index = params_.divmod_C.divmod(c, rsc_index);
-      r = params_.divmod_S.divmod(s, rs_index);
-    }
-    else {
-      c = (rsc_index % problem_size_.C);
-      int rs_index = (rsc_index / problem_size_.C);
-
-      s = (rs_index % problem_size_.S);
-      r = (rs_index / problem_size_.S);
-    }
-
-    int k = offset_k_[iteration_strided_];
-
-    return TensorCoord(k, r, s, c);
-  }
-
-  /// Returns true if the current coordinate is within the activations tensor W
-  CUTLASS_HOST_DEVICE
-  bool valid() const {
-
-    TensorCoord coord = at();
-
-    bool in_bounds =
-      coord.n() < problem_size_.K &&
-      coord.h() >= 0 &&
-      coord.h() < problem_size_.R &&
-      coord.c() < problem_size_.C;
-
-    return in_bounds;
-  }
-
-  /// Returns a pointer to the vector starting at the current coordinate
-  CUTLASS_HOST_DEVICE
-  AccessType const *get() const {
-
-    TensorCoord coord = at();
-
-    int32_t offset =
-      coord.n() * params_.stride_n +
-      coord.h() * params_.stride_h +
-      coord.w() * params_.stride_w +
-      coord.c();
-
-    return reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
-  }
-
-  /// Increments to the next memory access
-  CUTLASS_HOST_DEVICE
-  Conv2dFpropFilterTileAccessIteratorFewChannels &operator++() {
-    ++iteration_vector_;
-    if (iteration_vector_ < kAccessesPerVector) {
-      return *this;
-    }
-    iteration_vector_ = 0;
-
-    ++iteration_contiguous_;
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
-      return *this;
-    }
-    iteration_contiguous_ = 0;
-
-    ++iteration_strided_;
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      return *this;
-    }
-    iteration_strided_ = 0;
-
-    return *this;
-  }
-
-  /// Determines whether the Implicit GEMM can execute the given problem.
-  CUTLASS_HOST_DEVICE
-  static Status can_implement(Conv2dProblemSize const &problem_size) {
-
-    // check alignment constraint on iterator's contiguous dimension
-    if (problem_size.C % AccessType::kElements) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    if (platform::is_same<Layout, layout::TensorCxRSKx<32>>::value) {
-      if (problem_size.K % 32) {
-        return Status::kErrorInvalidProblem;
-      }
-    }
-
-    if (platform::is_same<Layout, layout::TensorCxRSKx<64>>::value) {
-      if (problem_size.K % 64) {
-        return Status::kErrorInvalidProblem;
-      }
-    }
-
-    return Status::kSuccess;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_fixed_channels.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_fixed_channels.h
deleted file mode 100755
index e90d50174..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_fixed_channels.h
+++ /dev/null
@@ -1,275 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile)
-    matrix from memory.
-
-    This iterator assumes TensorNHWC or TensorCxRSKx<Interleave> layout of tensors in Global Memory.
-
-    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
-    backward data gradient (Dgrad), and backward weight gradient (Wgrad).
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-#include "cutlass/conv/threadblock/conv2d_params.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Shape_,
-  typename Element_,
-  typename Layout_,
-  typename ThreadMap_,
-  typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess>
->
-class Conv2dFpropFilterTileAccessIteratorFixedChannels {
-public:
-
-  //
-  // Types
-  //
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = Layout_;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-  using TensorRef = cutlass::TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kFixedChannels;
-  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
-  static int const kConvDim = 2;
-  using ConvProblemSize = typename conv::Conv2dProblemSize;
-
-  static int const kFilterPositionsPerTile = Shape::kRow / AccessType::kElements;
-  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
-
-  static bool const kUseFastDivmodPrologue = true;
-  static bool const kUseFastDivmodMainloop = true;
-
-  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
-    "Vectors implied by the thread map must be divisible by the access type.");
-
-  //
-  // Simplifying assertions
-  //
-  static_assert(ThreadMap::Iterations::kContiguous == 1,
-    "Require Iterations::kContiguous == 1");
-
-  //
-  // Parameters structure
-  //
-
-  using Params = Conv2dFewChannelsParams<Layout>;
-
-private:
-
-  Params const &params_;
-  Conv2dProblemSize const &problem_size_;
-  LongIndex iteration_contiguous_;
-  LongIndex iteration_strided_;
-  LongIndex iteration_vector_;
-  char const *pointer_;
-
-  int rs_index_;
-
-  int offset_k_[ThreadMap::Iterations::kStrided];
-
-public:
-
-  CUTLASS_HOST_DEVICE
-  Conv2dFpropFilterTileAccessIteratorFixedChannels(
-    Params const &params,
-    Conv2dProblemSize const &problem_size,
-    Element const *ptr,
-    int thread_idx,
-    MatrixCoord const &threadblock_offset = MatrixCoord()
-  ):
-    params_(params),
-    problem_size_(problem_size),
-    pointer_(reinterpret_cast<char const *>(ptr)),
-    rs_index_(0) {
-
-    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
-
-    rs_index_ = (threadblock_offset.row() + thread_coord.contiguous()) / AccessType::kElements;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      offset_k_[s] = threadblock_offset.column() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
-    }
-
-    set_iteration_index(0);
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(Index index) {
-    iteration_vector_ = index % kAccessesPerVector;
-    int residual_access = index / kAccessesPerVector;
-    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    pointer_ += pointer_offset * 8 / sizeof_bits<Element>::value;
-  }
-
-  CUTLASS_HOST_DEVICE
-  void advance() {
-    // moves to the next tile
-    rs_index_ += kFilterPositionsPerTile * problem_size_.split_k_slices;
-  }
-
-  /// Returns the coordinate in the filter tensor W that is currently pointed to
-  /// by the iterator.
-  CUTLASS_HOST_DEVICE
-  TensorCoord at() const {
-
-    int rs_index = rs_index_ + iteration_vector_;
-
-    int r = 0;
-    int s = 0;
-
-    if (kUseFastDivmodMainloop) {
-      r = params_.divmod_S.divmod(s, rs_index);
-    }
-    else {
-      s = (rs_index % problem_size_.S);
-      r = (rs_index / problem_size_.S);
-    }
-
-    int k = offset_k_[iteration_strided_];
-
-    return TensorCoord(k, r, s, 0);
-  }
-
-  /// Returns true if the current coordinate is within the activations tensor W
-  CUTLASS_HOST_DEVICE
-  bool valid() const {
-
-    TensorCoord coord = at();
-
-    return coord.n() < problem_size_.K && coord.h() >= 0 && coord.h() < problem_size_.R;
-  }
-
-  /// Returns a pointer to the vector starting at the current coordinate
-  CUTLASS_HOST_DEVICE
-  AccessType const *get() const {
-
-    TensorCoord coord = at();
-
-    int32_t offset =
-      coord.n() * params_.stride_n +
-      coord.h() * params_.stride_h +
-      coord.w() * params_.stride_w + coord.c();
-
-    return reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
-  }
-
-  /// Increments to the next memory access
-  CUTLASS_HOST_DEVICE
-  Conv2dFpropFilterTileAccessIteratorFixedChannels &operator++() {
-    ++iteration_vector_;
-    if (iteration_vector_ < kAccessesPerVector) {
-      return *this;
-    }
-    iteration_vector_ = 0;
-
-    ++iteration_contiguous_;
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
-      return *this;
-    }
-    iteration_contiguous_ = 0;
-
-    ++iteration_strided_;
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      return *this;
-    }
-    iteration_strided_ = 0;
-
-    return *this;
-  }
-
-  /// Determines whether the Implicit GEMM can execute the given problem.
-  CUTLASS_HOST_DEVICE
-  static Status can_implement(Conv2dProblemSize const &problem_size) {
-
-    // check alignment constraint on iterator's contiguous dimension
-    if (problem_size.C != AccessType::kElements) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    if (platform::is_same<Layout, layout::TensorCxRSKx<32>>::value) {
-      if (problem_size.K % 32) {
-        return Status::kErrorInvalidProblem;
-      }
-    }
-
-    if (platform::is_same<Layout, layout::TensorCxRSKx<64>>::value) {
-      if (problem_size.K % 64) {
-        return Status::kErrorInvalidProblem;
-      }
-    }
-
-    return Status::kSuccess;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h
deleted file mode 100755
index 4c2343c32..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h
+++ /dev/null
@@ -1,322 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) 
-    matrix from memory.
-
-    This iterator assumes TensorNHWC or TensorCxRSKx<Interleave> layout of tensors in Global Memory.
-    
-    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
-    backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-
-#include "cutlass/conv/threadblock/conv2d_params.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Shape_,
-  typename Element_,
-  typename Layout_,
-  typename ThreadMap_,
-  typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess>,
-  bool IsDeconv_ = false
->
-class Conv2dFpropFilterTileAccessIteratorOptimized{
-public:
-  
-  //
-  // Types
-  //
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = Layout_;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-  using TensorRef = cutlass::TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  static bool const IsDeconv = IsDeconv_;
-  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized;
-  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
-  static int const kConvDim = 2;
-  using ConvProblemSize = typename conv::Conv2dProblemSize;
- 
-  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
-  
-  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), 
-    "Vectors implied by the thread map must be divisible by the access type.");
- 
-  //
-  // Simplifying assertions
-  //
-  static_assert(ThreadMap::Iterations::kContiguous == 1,
-    "Require Iterations::kContiguous == 1");
-
-  //
-  // Parameters structure
-  //
-
-  struct Params : Conv2dFpropFilterIteratorOptimizedParams<Layout> {
-
-    CUTLASS_HOST_DEVICE
-    Params() { }
-    
-    CUTLASS_HOST_DEVICE
-    Params(Conv2dFpropFilterIteratorOptimizedParams<Layout> const &base): 
-      Conv2dFpropFilterIteratorOptimizedParams<Layout>(base) { }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      Conv2dProblemSize const &problem_size,
-      Layout const &layout
-    ):
-      Conv2dFpropFilterIteratorOptimizedParams<Layout>(
-        problem_size,
-        layout,
-        sizeof_bits<Element>::value,
-        {Shape::kRow, Shape::kColumn},
-        ThreadMap::kThreads,
-        ThreadMap::kElementsPerAccess,
-        {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided},
-        {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided}
-      ) {
-
-    }
-  };
-
-private:
-
-  Conv2dFpropFilterIteratorOptimizedParams<Layout> const &params_;
-  Conv2dProblemSize const &problem_size_;
-  LongIndex iteration_contiguous_;
-  LongIndex iteration_strided_;
-  LongIndex iteration_vector_;
-  char const *pointer_;
-
-  uint32_t predicates_[kAccessesPerVector];
-  int filter_rs_;
-  int filter_c_;
-  int channels_per_group_;
-
-  //
-  // Assertions
-  //
-
-  // We map predicates into bits packed in this uint32_t container
-  static_assert(ThreadMap::Iterations::kStrided < sizeof(predicates_) * 8,
-    "Currently, the number of loads per iteration is limited by the size of the predicates container.");
-
-public:
-
-  CUTLASS_HOST_DEVICE
-  Conv2dFpropFilterTileAccessIteratorOptimized(
-    Conv2dFpropFilterIteratorOptimizedParams<Layout> const &params,
-    Conv2dProblemSize const &problem_size,
-    Element const *ptr,
-    int thread_idx,
-    MatrixCoord const &threadblock_offset = MatrixCoord()
-  ):
-    params_(params), 
-    problem_size_(problem_size),
-    pointer_(reinterpret_cast<char const *>(ptr)),
-    predicates_{0},
-    filter_rs_(0),
-    filter_c_(0) {
-
-    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
-
-    filter_c_ = threadblock_offset.row() + thread_coord.contiguous();
-    Index column = threadblock_offset.column() + thread_coord.strided();
-    channels_per_group_ = (IsDeconv ? problem_size_.K : problem_size_.C) / problem_size_.groups;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      uint32_t pred = ((column + s * ThreadMap::Delta::kStrided < (IsDeconv ? problem_size_.C : problem_size_.K)) ? 1u : 0);
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
-        predicates_[v_idx] |= (pred << s);
-      }
-    }
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
-      clear_mask(v_idx, filter_c_ + v_idx * AccessType::kElements >= channels_per_group_);
-    }
-
-    pointer_ += (
-      params_.layout({filter_c_, column}) 
-    ) * sizeof_bits<Element>::value / 8;
-
-    set_iteration_index(0);
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(Index index) {
-    iteration_vector_ = index % kAccessesPerVector;
-    int residual_access = index / kAccessesPerVector;
-    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
-  }
-
-  CUTLASS_HOST_DEVICE
-  void advance() {
-
-    LongIndex next = params_.inc_next_rs;
-
-    // moves to the next tile
-    ++filter_rs_;
-    if (filter_rs_ == params_.RS) {
-
-      filter_rs_ = 0;
-      next = params_.inc_next_c;
-      filter_c_ += params_.filter_c_delta;
-    }
- 
-    CUTLASS_PRAGMA_UNROLL
-    for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
-      clear_mask(v_idx, filter_c_ + v_idx * AccessType::kElements >= channels_per_group_);
-    }
-      
-    pointer_ += next;
-  }
-
-  /// Clears the predicates
-  CUTLASS_HOST_DEVICE
-  void clear_mask(int v, bool clear = true) {
-    predicates_[v] = clear ? 0u : predicates_[v];
-  }
-
-  /// Returns true if the current coordinate is within the filter tensor W
-  CUTLASS_HOST_DEVICE
-  bool valid() {
-    return (predicates_[iteration_vector_] & (1u << iteration_strided_));
-  }
-
-  /// Returns a pointer to the vector starting at the current coordinate
-  CUTLASS_HOST_DEVICE
-  AccessType const *get() const {
-    return reinterpret_cast<AccessType const *>(pointer_) + iteration_vector_;
-  }
-
-  /// Increments to the next memory access
-  CUTLASS_HOST_DEVICE
-  Conv2dFpropFilterTileAccessIteratorOptimized &operator++() {
-    ++iteration_vector_;
-    if (iteration_vector_ < kAccessesPerVector) {
-      return *this;
-    }
-    iteration_vector_ = 0;
-
-    ++iteration_contiguous_;
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
-      return *this;
-    }
-    iteration_contiguous_ = 0;
-    
-    ++iteration_strided_;
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-
-      // Move to the next K coordinate within the tile
-      pointer_ += params_.inc_next_k;
-
-      return *this;
-    }
-    iteration_strided_ = 0;
- 
-    return *this;
-  }
-
-  /// Determines whether the Implicit GEMM can execute the given problem.
-  CUTLASS_HOST_DEVICE
-  static Status can_implement(Conv2dProblemSize const &problem_size) {
-
-    auto input_channels = (IsDeconv ? problem_size.K : problem_size.C);
-    auto output_channels = (IsDeconv ? problem_size.C : problem_size.K);
-
-    // check alignment constraint on iterator's contiguous dimension
-    if ((input_channels / problem_size.groups) % AccessType::kElements) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    if (platform::is_same<Layout, layout::TensorCxRSKx<32>>::value) {
-      if (output_channels % 32) {
-        return Status::kErrorInvalidProblem;
-      }
-    }
-
-    if (platform::is_same<Layout, layout::TensorCxRSKx<64>>::value) {
-      if (output_channels % 64) {
-        return Status::kErrorInvalidProblem;
-      }
-    }
-
-    return Status::kSuccess;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_params.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_params.h
deleted file mode 100755
index d34bc9faf..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_params.h
+++ /dev/null
@@ -1,893 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! 
-  \file 
-  \brief Extracts the host-params objects into non-template code.
-*/
-
-#pragma once
-
-#define TRACE_CONV_PARAMS_INITIALIZERS_ENABLED 0
-
-#include "cutlass/cutlass.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-
-#if TRACE_CONV_PARAMS_INITIALIZERS_ENABLED
-#include <fstream>
-#endif
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Params structure used for all Conv2d analytic tile iterators
-template< typename Layout_ = layout::TensorNHWC >
-struct Conv2dAnalyticParams {
-
-  using Layout = Layout_;
-
-  Layout layout;
-
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  Conv2dAnalyticParams() { }
-
-  CUTLASS_HOST_DEVICE
-  Conv2dAnalyticParams(
-    Conv2dProblemSize const &,  // unused; placeholder to match other Params interfaces.
-    Layout const &layout
-  ): layout(layout) {
-
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Params structure used for all Conv2d analytic tile iterators
-template< typename Layout_ = layout::TensorNHWC >
-struct Conv2dFewChannelsParams {
-
-  using Layout = Layout_;
-
-
-  int32_t stride_w;
-  int32_t stride_h;
-  int32_t stride_n;
-
-  FastDivmod divmod_P;
-  FastDivmod divmod_Q;
-  FastDivmod divmod_S;
-  FastDivmod divmod_C;
-
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  Conv2dFewChannelsParams() { }
-
-  CUTLASS_HOST_DEVICE
-  Conv2dFewChannelsParams(
-    Conv2dProblemSize const &problem_size,  // unused; placeholder to match other Params interfaces.
-    Layout const &layout
-  ):
-    stride_w(int32_t(layout.stride()[0])),
-    stride_h(int32_t(layout.stride()[1])),
-    stride_n(int32_t(layout.stride()[2])),
-    divmod_P(problem_size.P),
-    divmod_Q(problem_size.Q),
-    divmod_S(problem_size.S),
-    divmod_C(problem_size.C)
-  {
-
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Parameters structure used for Conv2dDgradOutputGradientTileAccessIteratorAnalyticParams
-struct Conv2dDgradOutputGradientTileAccessIteratorAnalyticParams {
-  
-  using Layout = layout::TensorNHWC;
-
-  Layout layout;
-  int tiled_rows_per_filter;
-
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  Conv2dDgradOutputGradientTileAccessIteratorAnalyticParams() { }
-
-  CUTLASS_HOST_DEVICE
-  Conv2dDgradOutputGradientTileAccessIteratorAnalyticParams(
-    Conv2dProblemSize const &problem_size,
-    Layout const &layout,                            ///< layout object
-    int element_size_bits,                           ///< size of each element in bits
-    MatrixCoord threadblock_shape
-  ): layout(layout) {
-    
-    int tile_m_per_filter = strided_dgrad_tile_m_per_filter(problem_size, threadblock_shape.row());
-  
-    tiled_rows_per_filter = tile_m_per_filter * threadblock_shape.row();
-    
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-#if TRACE_CONV_PARAMS_INITIALIZERS_ENABLED
-
-CUTLASS_HOST_DEVICE
-void TraceIteratorParams(
-  char const *conv_operator, 
-  char const *operand,
-  int element_size_bits,
-  MatrixCoord threadblock_shape,
-  int thread_count,
-  int access_size,
-  layout::PitchLinearCoord threadmap_iterations,
-  layout::PitchLinearCoord threadmap_delta
-) {
- 
-#if !defined(__CUDA_ARCH__)
-
-  char const *fname = "conv_iterator_params.csv";
-
-  std::ifstream test(fname);
-  bool file_exists = test.is_open();
-
-  if (file_exists) {
-    test.close();
-  }
- 
-  std::ofstream trace("conv_iterator_params.csv", std::ofstream::app);
-
-  if (!file_exists) {
-    trace 
-      << "Operator,Operand,ElementSize,CtaRows,CtaColumns,ThreadCount,AccessSize,"
-      << "IterationsContiguous,IterationsStrided,DeltaContiguous,DeltaStrided\n";
-  }
-
-  trace << conv_operator << "," << operand << "," << element_size_bits << "," 
-    << threadblock_shape.row() << "," << threadblock_shape.column()
-    << "," << thread_count << "," << access_size 
-    << "," << threadmap_iterations.contiguous() << "," << threadmap_iterations.strided()
-    << "," << threadmap_delta.contiguous() << "," << threadmap_delta.strided() << "\n";
-#endif
-}
-
-#define TRACE_CONV_INITIALIZERS(conv_op, operand, element_size, cta_shape, thread_count, access_size, iterations, delta) \
-  TraceIteratorParams(conv_op, operand, element_size, cta_shape, thread_count, access_size, iterations, delta);
-
-#else
-
-#define TRACE_CONV_INITIALIZERS(conv_op, operand, element_size, cta_shape, thread_count, access_size, iterations, delta) {}
-
-#endif
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Parameters structure used for Conv2dFpropActivationTileIteratorOptimized
-template< typename Layout_ = layout::TensorNHWC >
-struct Conv2dFpropActivationIteratorOptimizedParams;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Parameters structure used for Conv2dFpropActivationTileIteratorOptimized
-template<>
-struct Conv2dFpropActivationIteratorOptimizedParams<layout::TensorNHWC> {
-  
-  using Layout = layout::TensorNHWC;
-
-  Layout layout;
-
-  int64_t inc_next[3];    // {next S, next R, next C}
-  int filter_c_delta;     // number of logical elements to add to filter_c_
-  int PQ;                 // product of P*Q
-
-  FastDivmod pq_divmod;
-  FastDivmod q_divmod;
-
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  Conv2dFpropActivationIteratorOptimizedParams() { }
-
-  CUTLASS_HOST_DEVICE
-  Conv2dFpropActivationIteratorOptimizedParams(
-    Conv2dProblemSize const &problem_size,
-    Layout const &layout,                             ///< layout object
-    int element_size_bits,                            ///< size of each element in bits
-    MatrixCoord threadblock_shape,
-    int thread_count,
-    int access_size,
-    layout::PitchLinearCoord threadmap_iterations,
-    layout::PitchLinearCoord threadmap_delta
-  ): 
-    layout(layout), 
-    PQ(problem_size.P * problem_size.Q), 
-    pq_divmod(PQ), 
-    q_divmod(problem_size.Q) {
-
-    TRACE_CONV_INITIALIZERS("conv2d_fprop", "activation", 
-      element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);
-
-    int conv_sign = (problem_size.mode == Mode::kConvolution ? -1 : 1);
-
-    // next S
-    inc_next[0] = conv_sign * (
-      int64_t(layout.stride()[0]) * problem_size.dilation_w
-    ) * element_size_bits / 8;
-
-    // next R
-    inc_next[1] = conv_sign * (
-        int64_t(layout.stride()[1]) * problem_size.dilation_h
-        - (problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w
-      ) * element_size_bits / 8;
-
-    // next C
-    inc_next[2] = (
-        threadblock_shape.column() * problem_size.split_k_slices
-        - conv_sign * int64_t(problem_size.R - 1) * layout.stride()[1] * problem_size.dilation_h
-        - conv_sign * int64_t(problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w
-      ) * element_size_bits / 8;
-
-    // logical offset added to internal channel counter - units are elements, not bytes
-    filter_c_delta = threadblock_shape.column() * problem_size.split_k_slices;
-  }
-
-#if ENABLE_CONV2D_PARAMS_PRINT
-  /// Prints internal state.
-  CUTLASS_HOST_DEVICE
-  void print() {
-    auto stride = layout.stride();
-    printf(
-      "Conv2dFpropActivationIteratorOptimizedParams:\n"
-      "  layout(w: %d, h: %d, n: %d)\n"
-      "  inc_next[%ld, %ld, %ld]\n"
-      "  filter_c_delta(%d) - PQ(%d)\n"
-      "  pq_divmod(divisor: %d, multiplier: %u, shift_right: %u)\n"
-      "  q_divmod(divisor: %d, multiplier: %u, shift_right: %u)\n",
-      stride[0], stride[1], stride[2],
-      inc_next[0], inc_next[1], inc_next[2],
-      filter_c_delta,
-      PQ,
-      pq_divmod.divisor,
-      pq_divmod.multiplier,
-      pq_divmod.shift_right,
-      q_divmod.divisor,
-      q_divmod.multiplier,
-      q_divmod.shift_right
-    );
-  }
-#endif  
-};
-
-/// Parameters structure used for Conv2dFpropActivationTileIteratorOptimized
-template <int Interleaved_>
-struct Conv2dFpropActivationIteratorOptimizedParams<layout::TensorNCxHWx<Interleaved_>> {
-  static int const kInterleaved = Interleaved_;
- 
-  using Layout = layout::TensorNCxHWx<kInterleaved>;
-
-  Layout layout;
-
-  int64_t inc_next[3];    // {next S, next R, next C}
-  int filter_c_delta;     // number of logical elements to add to filter_c_
-  int PQ;                 // product of P*Q
-
-  FastDivmod pq_divmod;
-  FastDivmod q_divmod;
-
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  Conv2dFpropActivationIteratorOptimizedParams() { }
-
-  CUTLASS_HOST_DEVICE
-  Conv2dFpropActivationIteratorOptimizedParams(
-    Conv2dProblemSize const &problem_size,
-    Layout const &layout,                             ///< layout object
-    int element_size_bits,                            ///< size of each element in bits
-    MatrixCoord threadblock_shape,
-    int thread_count,
-    int access_size,
-    layout::PitchLinearCoord threadmap_iterations,
-    layout::PitchLinearCoord threadmap_delta
-  ): 
-    layout(layout), PQ(problem_size.P * problem_size.Q), pq_divmod(PQ), q_divmod(problem_size.Q) {
-
-    TRACE_CONV_INITIALIZERS("conv2d_fprop", "activation", 
-      element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);
-
-    int conv_sign = (problem_size.mode == Mode::kConvolution ? -1 : 1);
-
-    // next S
-    inc_next[0] = conv_sign * (kInterleaved * problem_size.dilation_w) * element_size_bits / 8;
-
-    // next R
-    inc_next[1] = conv_sign * (
-        int64_t(layout.stride()[0]) * problem_size.dilation_h
-        - (problem_size.S - 1) * kInterleaved * problem_size.dilation_w
-      ) * element_size_bits / 8;
-
-    // next C
-    inc_next[2] = (
-        threadblock_shape.column() * problem_size.split_k_slices / kInterleaved * int64_t(layout.stride()[1])
-        - conv_sign * int64_t(problem_size.R - 1) * layout.stride()[0] * problem_size.dilation_h
-        - conv_sign * int64_t(problem_size.S - 1) * kInterleaved * problem_size.dilation_w
-      ) * element_size_bits / 8;
-
-    // logical offset added to internal channel counter - units are elements, not bytes
-    filter_c_delta = threadblock_shape.column() * problem_size.split_k_slices;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template< typename Layout_ = layout::TensorNHWC >
-struct Conv2dFpropFilterIteratorOptimizedParams;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template<>
-struct Conv2dFpropFilterIteratorOptimizedParams<layout::TensorNHWC>
-{
-
-  using Layout = layout::TensorNHWC;
-
-  Layout layout;
-  int RS;
-  int filter_c_delta;
-
-  int64_t inc_next_k;         // offset in units of bytes to next K position
-  int64_t inc_next_rs;        // offset in units of bytes to next RS position
-  int64_t inc_next_c;         // offset in units of bytes to next C position
-
-  //
-  // Methods
-  //
-  CUTLASS_HOST_DEVICE
-  Conv2dFpropFilterIteratorOptimizedParams() { }
-
-  CUTLASS_HOST_DEVICE
-  Conv2dFpropFilterIteratorOptimizedParams(
-    Conv2dProblemSize const &problem_size,
-    Layout const &layout,
-    int element_size_bits,                        ///< size of each element in bits
-    MatrixCoord threadblock_shape,
-    int thread_count,
-    int access_size,
-    layout::PitchLinearCoord threadmap_iterations,
-    layout::PitchLinearCoord threadmap_delta
-  ): 
-    layout(layout) {
-    
-    TRACE_CONV_INITIALIZERS("conv2d_fprop", "filter", 
-      element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);
-
-    RS = problem_size.R * problem_size.S;
-
-    inc_next_k = (int64_t(layout.stride()[2]) * threadmap_delta.strided() * element_size_bits) / 8;
-
-    inc_next_rs =
-      ( int64_t(layout.stride()[0])
-        - int64_t(layout.stride()[2]) * (threadmap_iterations.strided() - 1) * threadmap_delta.strided()
-      ) * element_size_bits / 8;
-
-    inc_next_c =
-      (
-        threadblock_shape.row() * problem_size.split_k_slices
-        - int64_t(RS - 1) * layout.stride()[0]
-        - int64_t(threadmap_iterations.strided() - 1) * threadmap_delta.strided() * layout.stride()[2]
-      ) * element_size_bits / 8;
-
-    filter_c_delta = threadblock_shape.row() * problem_size.split_k_slices;
-  }
-
-#if ENABLE_CONV2D_PARAMS_PRINT
-  /// Prints internal state.
-  CUTLASS_HOST_DEVICE
-  void print() {
-    auto stride = layout.stride();
-    printf(
-      "Conv2dFpropFilterIteratorOptimizedParams:\n"
-      "  layout[%d, %d, %d]\n"
-      "  RS(%d), filter_c_delta(%d), inc_next(k: %ld, rs: %ld, c: %ld)\n",
-      stride[0], stride[1], stride[2],
-      RS,
-      filter_c_delta,
-      inc_next_k, inc_next_rs, inc_next_c
-    );
-  }
-#endif
-};
-
-template<int Interleaved_>
-struct Conv2dFpropFilterIteratorOptimizedParams<layout::TensorCxRSKx<Interleaved_>>
-{
-  static int const kInterleaved = Interleaved_;
-  using Layout = layout::TensorCxRSKx<kInterleaved>;
-
-  Layout layout;
-  int RS;
-  int filter_c_delta;
-
-  int64_t inc_next_k;         // offset in units of bytes to next K position
-  int64_t inc_next_rs;        // offset in units of bytes to next RS position
-  int64_t inc_next_c;         // offset in units of bytes to next C position
-
-  //
-  // Methods
-  //
-  CUTLASS_HOST_DEVICE
-  Conv2dFpropFilterIteratorOptimizedParams() { }
-
-  CUTLASS_HOST_DEVICE
-  Conv2dFpropFilterIteratorOptimizedParams(
-    Conv2dProblemSize const &problem_size,
-    Layout const &layout,
-    int element_size_bits,                        ///< size of each element in bits
-    MatrixCoord threadblock_shape,
-    int thread_count,
-    int access_size,
-    layout::PitchLinearCoord threadmap_iterations,
-    layout::PitchLinearCoord threadmap_delta
-  ): 
-    layout(layout) {
-    
-    TRACE_CONV_INITIALIZERS("conv2d_fprop", "filter", 
-      element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);
-
-    RS = problem_size.R * problem_size.S;
-
-    inc_next_k = (kInterleaved * threadmap_delta.strided() * element_size_bits) / 8;
-
-    inc_next_rs =
-      (  int64_t(layout.stride()[0])
-        - kInterleaved * (threadmap_iterations.strided() - 1) * threadmap_delta.strided()
-      ) * element_size_bits / 8;
-
-    inc_next_c =
-      (
-        threadblock_shape.row() * problem_size.split_k_slices / kInterleaved * int64_t(layout.stride()[2])
-        - int64_t(RS - 1) * layout.stride()[0]
-        - int64_t(threadmap_iterations.strided() - 1) * threadmap_delta.strided() * kInterleaved 
-      ) * element_size_bits / 8;
-
-    filter_c_delta = threadblock_shape.row() * problem_size.split_k_slices;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Dgrad Optimized Dy params (layout::TensorNHWC)
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Parameters object for Conv2d DGRAD OutputGradient (dy) iterator
-struct Conv2dDgradOutputGradientIteratorOptimizedParams {
-
-  using Layout = layout::TensorNHWC;
-
-  Layout layout;
-
-  int64_t inc_next[3];    // {next S, next R, next K}
-
-  int filter_k_delta;     // number of logical elements to add to filter_k_
-
-  int HW;                  // product of H*W
-
-  FastDivmod hw_divmod;
-  FastDivmod w_divmod;
-
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  Conv2dDgradOutputGradientIteratorOptimizedParams() { }
-
-  CUTLASS_HOST_DEVICE
-  Conv2dDgradOutputGradientIteratorOptimizedParams(
-    Conv2dProblemSize const &problem_size,
-    Layout const &layout,
-    int element_size_bits,                        ///< size of each element in bits
-    MatrixCoord threadblock_shape,
-    int thread_count,
-    int access_size,
-    layout::PitchLinearCoord threadmap_iterations,
-    layout::PitchLinearCoord threadmap_delta
-  ): 
-    layout(layout), 
-    HW(problem_size.H *problem_size.W), 
-    hw_divmod(HW), 
-    w_divmod(problem_size.W) {
-    
-    TRACE_CONV_INITIALIZERS("conv2d_dgrad", "output_gradient", 
-      element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);
-
-    int conv_sign = (problem_size.mode == Mode::kConvolution ? 1 : -1);
-
-    // next S
-    inc_next[0] = conv_sign * (
-      (int64_t)layout.stride()[0] * problem_size.dilation_w
-    ) * element_size_bits / 8;
-
-    // next R
-    inc_next[1] = conv_sign * (
-        (int64_t)layout.stride()[1] * problem_size.dilation_h
-        - (problem_size.S - 1) * (int64_t)layout.stride()[0] * problem_size.dilation_w
-      ) * element_size_bits / 8;
-
-    // next K
-    inc_next[2] = (
-        threadblock_shape.column() * problem_size.split_k_slices
-        - conv_sign * (problem_size.R - 1) * (int64_t)layout.stride()[1] * problem_size.dilation_h
-        - conv_sign * (problem_size.S - 1) * (int64_t)layout.stride()[0] * problem_size.dilation_w
-      ) * element_size_bits / 8;
-
-    // logical offset added to internal channel counter - units are elements, not bytes
-    filter_k_delta = threadblock_shape.column() * problem_size.split_k_slices;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Strided Dgrad Optimized Dy params (layout::TensorNHWC)
-/////////////////////////////////////////////////////////////////////////////////////////////////
-struct Conv2dStridedDgradOutputGradientIteratorOptimizedParams {
-  
-  using Layout = layout::TensorNHWC;
-
-  Layout layout;
-  
-  int64_t inc_next[3];    // {next S, next R, next K}
-
-  int filter_k_delta;     // number of logical elements to add to filter_k_
-
-  int tiled_rows_per_filter;
-
-  int conv_sign;
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  Conv2dStridedDgradOutputGradientIteratorOptimizedParams() { }
-
-  CUTLASS_HOST_DEVICE
-  Conv2dStridedDgradOutputGradientIteratorOptimizedParams(
-    Conv2dProblemSize const &problem_size,
-    Layout const &layout,                            ///< layout object
-    int element_size_bits,                           ///< size of each element in bits
-    MatrixCoord threadblock_shape
-  ): layout(layout) {
-    
-    int tile_m_per_filter = strided_dgrad_tile_m_per_filter(problem_size, threadblock_shape.row());
-  
-    tiled_rows_per_filter = tile_m_per_filter * threadblock_shape.row();
-
-    conv_sign = (problem_size.mode == Mode::kConvolution ? 1 : -1);
-
-    // next S
-    inc_next[0] = conv_sign * (
-      (int64_t)layout.stride()[0] * problem_size.dilation_w
-    ) * element_size_bits / 8;
-
-    // next R
-    inc_next[1] = conv_sign * (
-        (int64_t)layout.stride()[1] * problem_size.dilation_h
-      ) * element_size_bits / 8;
-
-    // next K
-    inc_next[2] = (
-        threadblock_shape.column() * problem_size.split_k_slices
-      ) * element_size_bits / 8;
-
-    // logical offset added to internal channel counter - units are elements, not bytes
-    filter_k_delta = threadblock_shape.column() * problem_size.split_k_slices;
-  }
-};
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-////////////////////////////////////////////////////////////////////////////////////////////////
-// Dgrad Optimized w params (layout::TensorNHWC)
-/////////////////////////////////////////////////////////////////////////////////////////////////
-struct Conv2dDgradFilterIteratorOptimizedParams {
-
-  using Layout = layout::TensorNHWC;
-
-  Layout layout;
-  int RS;
-  int filter_k_delta;
-
-  int64_t inc_next_strided;   // offset in units of bytes to next K coordinate within tile
-  int64_t inc_next_rs;        // offset in units of bytes to next RS position
-  int64_t inc_next_k;         // offset in units of bytes to next K position in subsequent tile
-
-  //
-  // Methods
-  //
-  CUTLASS_HOST_DEVICE
-  Conv2dDgradFilterIteratorOptimizedParams() { }
-
-  CUTLASS_HOST_DEVICE
-  Conv2dDgradFilterIteratorOptimizedParams(
-    Conv2dProblemSize const &problem_size,
-    Layout const &layout,    
-    int element_size_bits,                        ///< size of each element in bits
-    MatrixCoord threadblock_shape,
-    int thread_count,
-    int access_size, 
-    layout::PitchLinearCoord threadmap_iterations,
-    layout::PitchLinearCoord threadmap_delta
-  ): 
-    layout(layout), RS(problem_size.R * problem_size.S) {
-
-    TRACE_CONV_INITIALIZERS("conv2d_dgrad", "filter", 
-      element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);
-
-    inc_next_strided = ((int64_t)layout.stride()[2] * threadmap_delta.strided() * element_size_bits) / 8;
-
-    inc_next_rs =
-      ( (int64_t)layout.stride()[0]
-        - (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * (int64_t)layout.stride()[2]
-      ) * element_size_bits / 8;
-
-    inc_next_k =
-      (
-        threadblock_shape.row() * problem_size.split_k_slices * (int64_t)layout.stride()[2]
-        - (problem_size.R * problem_size.S - 1) * (int64_t)layout.stride()[0]
-        - (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * (int64_t)layout.stride()[2]
-      ) * element_size_bits / 8;
-
-    filter_k_delta = threadblock_shape.row() * problem_size.split_k_slices;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-////////////////////////////////////////////////////////////////////////////////////////////////
-// StridedDgrad Optimized w params (layout::TensorNHWC)
-/////////////////////////////////////////////////////////////////////////////////////////////////
-struct Conv2dStridedDgradFilterIteratorOptimizedParams {
-
-  using Layout = layout::TensorNHWC;
-
-  Layout layout;
-  int RS;
-  int filter_k_delta;
-
-  int64_t inc_next_strided;   // offset in units of bytes to next K coordinate within tile
-  int64_t inc_next[3];        // {next S, next R, next K}
-  int64_t reset_bytes;        // offset in units of bytes to move back the pointer 
-  //
-  // Methods
-  //
-  CUTLASS_HOST_DEVICE
-  Conv2dStridedDgradFilterIteratorOptimizedParams() { }
-
-  CUTLASS_HOST_DEVICE
-  Conv2dStridedDgradFilterIteratorOptimizedParams(
-    Conv2dProblemSize const &problem_size,
-    Layout const &layout,    
-    int element_size_bits,                        ///< size of each element in bits
-    MatrixCoord threadblock_shape,
-    int thread_count,
-    int access_size, 
-    layout::PitchLinearCoord threadmap_iterations,
-    layout::PitchLinearCoord threadmap_delta
-  ): 
-    layout(layout), RS(problem_size.R * problem_size.S) {
-
-    TRACE_CONV_INITIALIZERS("conv2d_dgrad", "filter", 
-      element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);
-
-    inc_next_strided = (layout.stride()[2] * threadmap_delta.strided() * element_size_bits) / 8;
-
-    // next S
-    inc_next[0] =
-      ( (int64_t)layout.stride()[0] * problem_size.stride_w
-        //- (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * layout.stride()[2]
-      ) * element_size_bits / 8;
-
-    // next R
-    inc_next[1] =
-      ( (int64_t)layout.stride()[1] * problem_size.stride_h
-        //- (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * layout.stride()[2]
-      ) * element_size_bits / 8;
-
-    // next K
-    inc_next[2] =
-      (
-        threadblock_shape.row() * problem_size.split_k_slices * (int64_t)layout.stride()[2]
-        //- (problem_size.R * problem_size.S - 1) * layout.stride()[0]
-        //- (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * layout.stride()[2]
-      ) * element_size_bits / 8;
-
-    // offset in units of bytes to move the pointer in backward direction
-    reset_bytes = (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * (int64_t)layout.stride()[2]
-            * element_size_bits / 8;
-
-    filter_k_delta = threadblock_shape.row() * problem_size.split_k_slices;
-  }
-};
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Parameters object for Conv2d WGRAD Output Gradient (dy) iterator
-struct Conv2dWgradOutputGradientIteratorOptimizedParams {
-
-  using Layout = layout::TensorNHWC;
-
-  Layout layout;
-
-  int NPQ;                      // precomputd product of N*P*Q for clearing predicates
-
-  FastDivmod pq_divmod;
-  FastDivmod q_divmod;
-
-  int64_t offset_next_strided;    // offset in units of bytes to next npq coordinate within tile
-  int64_t offset_next_contiguous; // offset in units of bytes to next k coordinate within tile
-  int64_t inc_next_npq;           // offset in units of bytes to next npq position in subsequent tile
-
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  Conv2dWgradOutputGradientIteratorOptimizedParams() { }
-
-  CUTLASS_HOST_DEVICE
-  Conv2dWgradOutputGradientIteratorOptimizedParams(
-    Conv2dProblemSize const &problem_size,
-    Layout const &layout,    
-    int element_size_bits,                        ///< size of each element in bits
-    MatrixCoord threadblock_shape,
-    int thread_count,
-    int access_size,
-    layout::PitchLinearCoord threadmap_iterations,
-    layout::PitchLinearCoord threadmap_delta
-  ):
-    layout(layout),
-    NPQ(problem_size.N * problem_size.P * problem_size.Q),
-    pq_divmod(problem_size.P * problem_size.Q),
-    q_divmod(problem_size.Q) {
-    
-    TRACE_CONV_INITIALIZERS("conv2d_wgrad", "output_gradient", 
-      element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);
-
-    // Incremental offsets in unites of bytes (number of elements) * sizeof_bits<Element>::value / 8
-    offset_next_strided = (threadmap_delta.strided() * (int64_t)layout.stride()[0])
-                        * element_size_bits / 8;
-
-    offset_next_contiguous = (threadmap_delta.contiguous())
-                            * element_size_bits / 8;
-
-    inc_next_npq = (threadblock_shape.column() * problem_size.split_k_slices * (int64_t)layout.stride()[0])
-                      * element_size_bits / 8;
-  }
-};
-
-struct Conv2dWgradActivationIteratorOptimizedParams {
-
-  using Layout = layout::TensorNHWC;
-
-  Layout layout;
-
-  FastDivmod sc_divmod;
-  FastDivmod pq_divmod;
-  FastDivmod q_divmod;
-  FastDivmod c_divmod;
-  FastDivmod s_divmod;
-  int small_channel_conv_s_offset;
-
-  //
-  // Methods
-  //
-  CUTLASS_HOST_DEVICE
-  Conv2dWgradActivationIteratorOptimizedParams() { }
-
-  CUTLASS_HOST_DEVICE
-  Conv2dWgradActivationIteratorOptimizedParams(
-    Conv2dProblemSize const &problem_size,
-    Layout const &layout
-  ):
-    layout(layout),
-    sc_divmod(problem_size.S * problem_size.C),
-    pq_divmod(problem_size.P * problem_size.Q),
-    q_divmod(problem_size.Q),
-    c_divmod(problem_size.C),
-    s_divmod(problem_size.S * problem_size.dilation_w),
-    small_channel_conv_s_offset((problem_size.S - 1) * problem_size.dilation_w - problem_size.pad_w) {
-  }
-
-  CUTLASS_HOST_DEVICE
-  Conv2dWgradActivationIteratorOptimizedParams(
-    Conv2dProblemSize const &problem_size,
-    Layout const &layout,
-    int element_size_bits,                        ///< size of each element in bits
-    MatrixCoord threadblock_shape,
-    int thread_count,
-    int access_size,
-    layout::PitchLinearCoord threadmap_iterations,
-    layout::PitchLinearCoord threadmap_delta
-  ):
-    Conv2dWgradActivationIteratorOptimizedParams(
-      problem_size,
-      layout
-    ) { 
-    
-      TRACE_CONV_INITIALIZERS("conv2d_wgrad", "activation", 
-        element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);
-    }
-};
-
-struct PredicatedScaleBiasVectorAccessIteratorParams {
-  public:
-    /// Default ctor
-    CUTLASS_HOST_DEVICE
-    PredicatedScaleBiasVectorAccessIteratorParams() { }
-
-    // Default ctor
-    CUTLASS_HOST_DEVICE
-    PredicatedScaleBiasVectorAccessIteratorParams(
-      Conv2dProblemSize const &problem_size,
-      layout::PitchLinear const &layout) {}
-
-    // Default ctor
-    CUTLASS_HOST_DEVICE
-    PredicatedScaleBiasVectorAccessIteratorParams(
-      Conv2dProblemSize const &problem_size,
-      layout::RowMajor const &layout) {}
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_tile_iterator.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_tile_iterator.h
deleted file mode 100755
index 17f4594ba..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_tile_iterator.h
+++ /dev/null
@@ -1,337 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template wraps the tile access iterator concept to load whole tiles from tensors in
-      memory used for implicit GEMM convolution.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename TileAccessIterator_>
-class TileIterator {
-public:
-  using TileAccessIterator = TileAccessIterator_;
-
-  using Shape = typename TileAccessIterator::Shape;
-  using Element = typename TileAccessIterator::Element;
-  using Layout = typename TileAccessIterator::Layout;
-  using TensorCoord = typename Layout::TensorCoord;
-  using ThreadMap = typename TileAccessIterator::ThreadMap;
-  using AccessType = typename TileAccessIterator::AccessType;
-  using TensorRef = typename TileAccessIterator::TensorRef;
-  using Index = typename TileAccessIterator::Index;
-  using LongIndex = typename TileAccessIterator::LongIndex;
-  static IteratorAlgorithm const kIteratorAlgorithm = TileAccessIterator::kIteratorAlgorithm;
-  static StrideSupport const kStrideSupport = TileAccessIterator::kStrideSupport;
-  using Params = typename TileAccessIterator::Params;
-  static int const kConvDim = TileAccessIterator::kConvDim;
-  using ConvProblemSize = typename TileAccessIterator::ConvProblemSize;
-  static int const kAccessesPerVector = TileAccessIterator::kAccessesPerVector;
-
-  /// Fragment object to be loaded or stored
-  using Fragment = cutlass::Array<
-    Element, 
-    ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
-
-private:
-
-  /// Internal state
-  TileAccessIterator tile_access_iterator_;
-
-public:
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  TileIterator(
-    Params const &params,
-    ConvProblemSize const &problem_size,
-    Element const *ptr,
-    int thread_idx,
-    MatrixCoord const &threadblock_offset = MatrixCoord()
-  ):
-    tile_access_iterator_(params, problem_size, ptr, thread_idx, threadblock_offset) { }
-
-  CUTLASS_HOST_DEVICE
-  static Params getParams(ConvProblemSize const &problem_size, Layout const &layout) {
-    return TileAccessIterator::getParams(problem_size, layout);
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(Index index) {
-    tile_access_iterator_.set_iteration_index(index);
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    tile_access_iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  TileIterator &operator++() {
-    tile_access_iterator_.advance();
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  TileIterator operator++(int) {
-    TileIterator self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-
-    frag.clear();
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < kAccessesPerVector; ++v) {
-
-          int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
-
-          cutlass::arch::global_load<
-            AccessType,
-            sizeof(AccessType)
-          >(
-            frag_ptr[idx],
-            tile_access_iterator_.get() + pointer_offset,
-            tile_access_iterator_.valid()
-          );
-  
-          ++tile_access_iterator_;
-        }
-      }
-    }
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) {
-    tile_access_iterator_.set_iteration_index(0);
-    load_with_pointer_offset(frag, 0);
-  }
-
-  CUTLASS_DEVICE
-  void advance() {
-    tile_access_iterator_.advance();
-  }
-
-  /// Determines whether the Implicit GEMM can execute the given problem.
-  CUTLASS_HOST_DEVICE
-  static Status can_implement(ConvProblemSize const &problem_size) {
-
-    // dispatch to iterator implementation
-    return TileAccessIterator::can_implement(problem_size);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Strided Dgrad Tile Iterator
-template <typename TileAccessIterator_>
-class TileIteratorStridedDgrad {
-public:
-  using TileAccessIterator = TileAccessIterator_;
-
-  using Shape = typename TileAccessIterator::Shape;
-  using Element = typename TileAccessIterator::Element;
-  using Layout = typename TileAccessIterator::Layout;
-  using TensorCoord = typename Layout::TensorCoord;
-  using ThreadMap = typename TileAccessIterator::ThreadMap;
-  using AccessType = typename TileAccessIterator::AccessType;
-  using TensorRef = typename TileAccessIterator::TensorRef;
-  using Index = typename TileAccessIterator::Index;
-  using LongIndex = typename TileAccessIterator::LongIndex;
-  static IteratorAlgorithm const kIteratorAlgorithm = TileAccessIterator::kIteratorAlgorithm;
-  static StrideSupport const kStrideSupport = TileAccessIterator::kStrideSupport;
-  using Params = typename TileAccessIterator::Params;
-  static int const kConvDim = TileAccessIterator::kConvDim;
-  using ConvProblemSize = typename TileAccessIterator::ConvProblemSize;
-
-  /// Fragment object to be loaded or stored
-  using Fragment = cutlass::Array<
-    Element, 
-    ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
-
-private:
-
-  /// Internal state
-  TileAccessIterator tile_access_iterator_;
-
-public:
-
-  /// Constructor (output gradient (Dy) OperandA ctor)
-  CUTLASS_HOST_DEVICE
-  TileIteratorStridedDgrad(
-    Params const &params,
-    ConvProblemSize const &problem_size,
-    Element const *ptr,
-    int thread_idx,
-    FastDivmod const &stride_h_divmod, FastDivmod const &stride_w_divmod,
-    int start_r, int start_s,
-    MatrixCoord const &threadblock_offset = MatrixCoord()
-  ):
-    tile_access_iterator_(
-      params, 
-      problem_size, 
-      ptr, 
-      thread_idx, 
-      stride_h_divmod, stride_w_divmod, 
-      start_r, start_s, 
-      threadblock_offset) { }
-
-  /// Constructor (filter (w) OperandB ctor)
-  CUTLASS_HOST_DEVICE
-  TileIteratorStridedDgrad(
-    Params const &params,
-    ConvProblemSize const &problem_size,
-    Element const *ptr,
-    int thread_idx,
-    int start_r, int start_s,
-    MatrixCoord const &threadblock_offset = MatrixCoord()
-  ):
-    tile_access_iterator_(params, 
-      problem_size, 
-      ptr, 
-      thread_idx, 
-      start_r, start_s, 
-      threadblock_offset) { }
-
-  CUTLASS_HOST_DEVICE
-  static Params getParams(ConvProblemSize const &problem_size, Layout const &layout) {
-    return TileAccessIterator::getParams(problem_size, layout);
-  }
-
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    tile_access_iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  TileIteratorStridedDgrad &operator++() {
-    tile_access_iterator_.advance();
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  TileIteratorStridedDgrad operator++(int) {
-    TileIteratorStridedDgrad self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-
-    frag.clear();
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-
-        cutlass::arch::global_load<
-          AccessType,
-          sizeof(AccessType)
-        >(
-          frag_ptr[c + s * ThreadMap::Iterations::kContiguous],
-          tile_access_iterator_.get() + pointer_offset,
-          tile_access_iterator_.valid()
-        );
-
-        ++tile_access_iterator_;
-      }
-    }
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) {
-    tile_access_iterator_.set_iteration_index(0);
-    load_with_pointer_offset(frag, 0);
-  }
-
-  CUTLASS_DEVICE
-  void advance() {
-    tile_access_iterator_.advance();
-  }
-
-  /// Determines whether the Implicit GEMM can execute the given problem.
-  CUTLASS_HOST_DEVICE
-  static Status can_implement(ConvProblemSize const &problem_size) {
-
-    // dispatch to iterator implementation
-    return TileAccessIterator::can_implement(problem_size);
-  }
-};
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h
deleted file mode 100755
index 3e3a4f155..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h
+++ /dev/null
@@ -1,285 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing loading of convolution tiles mapped to GEMM B (activation tile) 
-    matrix from memory.
-
-    This iterator assumes TensorNHWC layout of tensors in Global Memory.
-
-    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
-    backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-#include "cutlass/conv/threadblock/conv2d_params.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Shape_,
-  typename Element_,
-  typename ThreadMap_,
-  typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess>
->
-class Conv2dWgradActivationTileAccessIteratorAnalytic {
-public:
-
-  //
-  // Types
-  //
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::TensorNHWC;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-  using TensorRef = cutlass::TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic;
-  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
-  static int const kConvDim = 2;
-  using ConvProblemSize = typename conv::Conv2dProblemSize;
- 
-  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
-  
-  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), 
-    "Vectors implied by the thread map must be divisible by the access type.");
- 
-  static_assert(sizeof_bits<Element>::value >= 8,
-    "WGRAD requires elements of size 8b or greater.");
-
-  //
-  // Parameters structure
-  //
-
-  using Params = Conv2dAnalyticParams<Layout>;
-
-private:
-
-  Params const &params_;
-  Conv2dProblemSize const &problem_size_;
-  LongIndex iteration_contiguous_;
-  LongIndex iteration_strided_;
-  LongIndex iteration_vector_;
-  char const *pointer_;
-
-  // Filter postion (r,s,c) in contiguous dimension stays constant for each gemm_iteration_k
-  int filter_r_[ThreadMap::Iterations::kContiguous];
-  int filter_s_[ThreadMap::Iterations::kContiguous];
-  int filter_c_[ThreadMap::Iterations::kContiguous];
-
-  int offset_npq_[ThreadMap::Iterations::kStrided];
-
-public:
-
-  CUTLASS_HOST_DEVICE
-  Conv2dWgradActivationTileAccessIteratorAnalytic(
-    Params const &params, 
-    Conv2dProblemSize const &problem_size,
-    Element const *ptr,
-    int thread_idx,
-    MatrixCoord const &threadblock_offset = MatrixCoord()
-  ):
-    params_(params), 
-    problem_size_(problem_size), 
-    pointer_(reinterpret_cast<char const *>(ptr))
-  {
-
-    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
-    
-    // initialize r,s,c filter position for every contiguous iteration
-    CUTLASS_PRAGMA_UNROLL
-    for(int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-
-      int rsc_offset = threadblock_offset.column() + thread_coord.contiguous()
-                        + c * ThreadMap::Delta::kContiguous;
-
-      filter_r_[c] = rsc_offset / (problem_size_.S * problem_size_.C);
-      int residual = rsc_offset % (problem_size_.S * problem_size_.C);
-
-      filter_s_[c] = residual / problem_size_.C;
-      filter_c_[c] = residual % problem_size_.C;
-    }
-
-    // initialize n, p, q offset for every strided iteration
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-    
-      offset_npq_[s] = threadblock_offset.row() + thread_coord.strided() 
-                      + s * ThreadMap::Delta::kStrided;   
-    }
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(Index index) {
-    iteration_vector_ = index % kAccessesPerVector;
-    int residual_access = index / kAccessesPerVector;
-    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
-  }
-
-  CUTLASS_HOST_DEVICE
-  void advance() {
-    
-    // moves to the next GEMM-K offset (offset_npq_) in GEMM-B by a CTA-K tile
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      offset_npq_[s] += Shape::kRow * problem_size_.split_k_slices;
-    }
-  }
-
-  /// Returns the coordinate in the activation tensor x that is currently pointed to
-  /// by the iterator.
-  CUTLASS_HOST_DEVICE
-  TensorCoord at() const {
-    int r, s, c;
-
-    if (kAccessesPerVector == 1) {
-      /// One 128b aligned access fetching more than one element
-      c = filter_c_[iteration_contiguous_];
-      r = filter_r_[iteration_contiguous_];
-      s = filter_s_[iteration_contiguous_];
-    }  
-    else {
-      /// Multiple access to support non-128b alignment in contiguous dimension
-      c = (filter_c_[iteration_contiguous_] + iteration_vector_ * AccessType::kElements) % problem_size_.C;
-      int wrap_c = (filter_c_[iteration_contiguous_] + iteration_vector_ * AccessType::kElements) / problem_size_.C;
-      s = (filter_s_[iteration_contiguous_] + wrap_c) % problem_size_.S;
-      int wrap_s = (filter_s_[iteration_contiguous_] + wrap_c) / problem_size_.S;
-      r = filter_r_[iteration_contiguous_] + wrap_s;
-    } 
-
-    if (problem_size_.mode == Mode::kConvolution) {
-      r = (problem_size_.R - 1 - r);
-      s = (problem_size_.S - 1 - s);
-    }
-
-    int n = offset_npq_[iteration_strided_] / (problem_size_.P * problem_size_.Q);
-    int residual = offset_npq_[iteration_strided_] % (problem_size_.P * problem_size_.Q);
-    
-    int p = residual / problem_size_.Q;
-    int q = residual % problem_size_.Q;
-   
-    int h = p * problem_size_.stride_h - problem_size_.pad_h + r * problem_size_.dilation_h;
-    int w = q * problem_size_.stride_w - problem_size_.pad_w + s * problem_size_.dilation_w;
- 
-    return TensorCoord(n, h, w, c);
-  }
-
-  /// Returns true if the current coordinate is within the activation tensor x
-  CUTLASS_HOST_DEVICE
-  bool valid() const {
-    TensorCoord coord = at();
-
-    return coord.n() < problem_size_.N &&
-      coord.h() >= 0 && coord.h() < problem_size_.H &&
-      coord.w() >= 0 && coord.w() < problem_size_.W;
-  }
-
-  /// Returns a pointer to the vector starting at the current coordinate
-  CUTLASS_HOST_DEVICE
-  AccessType const *get() const {
-
-    TensorCoord coord = at();
-    LongIndex offset = params_.layout(coord);
-
-    return reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
-  }
-
-  /// Increments to the next memory access
-  CUTLASS_HOST_DEVICE
-  Conv2dWgradActivationTileAccessIteratorAnalytic &operator++() {
-    ++iteration_vector_;
-    if (iteration_vector_ < kAccessesPerVector) {
-      return *this;
-    }
-    iteration_vector_ = 0;
-
-    ++iteration_contiguous_;
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
-      return *this;
-    }
-    iteration_contiguous_ = 0;
-    ++iteration_strided_;
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      return *this;
-    }
-    iteration_strided_ = 0;
- 
-    return *this;
-  }
-
-  /// Determines whether the Implicit GEMM can execute the given problem.
-  CUTLASS_HOST_DEVICE
-  static Status can_implement(Conv2dProblemSize const &problem_size) {
-
-    // check alignment constraint on iterator's contiguous dimension
-    if (problem_size.C % AccessType::kElements) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    return Status::kSuccess;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h
deleted file mode 100755
index 8cbcc3d9f..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h
+++ /dev/null
@@ -1,321 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing loading of convolution tiles mapped to GEMM B (activation tile) 
-    matrix from memory.
-
-    This iterator assumes TensorNHWC layout of tensors in Global Memory.
-
-    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
-    backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Shape_,
-  typename Element_,
-  typename ThreadMap_,
-  typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess>
->
-class Conv2dWgradActivationTileAccessIteratorOptimized {
-public:
-
-  //
-  // Types
-  //
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::TensorNHWC;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-  using TensorRef = cutlass::TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized;
-  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
-  static int const kConvDim = 2;
-  using ConvProblemSize = typename conv::Conv2dProblemSize;
- 
-  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
-  
-  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), 
-    "Vectors implied by the thread map must be divisible by the access type.");
- 
-  static_assert(sizeof_bits<Element>::value >= 8,
-    "WGRAD requires elements of size 8b or greater.");
-
-  //
-  // Parameters structure
-  //
-
-  using Params = Conv2dWgradActivationIteratorOptimizedParams;
-
-private:
-
-  Conv2dWgradActivationIteratorOptimizedParams const &params_;
-  Conv2dProblemSize const &problem_size_;
-  LongIndex iteration_contiguous_;
-  LongIndex iteration_strided_;
-  LongIndex iteration_vector_;
-  char const *pointer_;
-
-  // Precomputed effective filter postion (r,s) in contiguous dimension stays constant for each gemm_iteration_k
-  // required for npq -> nhw translation
-  int precomputed_filter_r_[ThreadMap::Iterations::kContiguous];
-  int precomputed_filter_s_[ThreadMap::Iterations::kContiguous];
-
-  // Channel dimension in contiguous dimension stays constant for each gemm_iteration_k
-  int filter_c_[ThreadMap::Iterations::kContiguous];
-
-  int offset_npq_[ThreadMap::Iterations::kStrided];
-
-public:
-
-  CUTLASS_HOST_DEVICE
-  Conv2dWgradActivationTileAccessIteratorOptimized(
-    Conv2dWgradActivationIteratorOptimizedParams const &params, 
-    Conv2dProblemSize const &problem_size,
-    Element const *ptr,
-    int thread_idx,
-    MatrixCoord const &threadblock_offset = MatrixCoord()
-  ):
-    params_(params), 
-    problem_size_(problem_size), 
-    pointer_(reinterpret_cast<char const *>(ptr))
-  {
-
-    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
-    
-    // initialize r,s,c filter position for every contiguous iteration
-    CUTLASS_PRAGMA_UNROLL
-    for(int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-
-      int rsc_offset = threadblock_offset.column() + thread_coord.contiguous()
-                        + c * ThreadMap::Delta::kContiguous;
-
-      // The subseqnet fast_divmod() operations are equivalent to the following logical computation:
-      //
-      //
-      // filter_r_[c] = rsc_offset / (problem_size_.S * problem_size_.C);
-      // int residual = rsc_offset % (problem_size_.S * problem_size_.C);
-      //
-      // filter_s_[c] = residual / problem_size_.C;
-      // filter_c_[c] = residual % problem_size_.C;
-
-      int residual;
-      params_.sc_divmod(precomputed_filter_r_[c], residual, rsc_offset);
-      params_.c_divmod(precomputed_filter_s_[c], filter_c_[c], residual);
-
-      int r = precomputed_filter_r_[c];
-      int s = precomputed_filter_s_[c];
-
-      if (problem_size_.mode == Mode::kConvolution) {
-        r = (problem_size_.R - 1 - r);
-        s = (problem_size_.S - 1 - s);
-      }
-
-      precomputed_filter_r_[c] =  -problem_size_.pad_h + r * problem_size_.dilation_h;
-      precomputed_filter_s_[c] =  -problem_size_.pad_w + s * problem_size_.dilation_w;
-    }
-
-    // initialize n, p, q offset for every strided iteration
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-    
-      offset_npq_[s] = threadblock_offset.row() + thread_coord.strided() 
-                      + s * ThreadMap::Delta::kStrided;   
-    }
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(Index index) {
-    iteration_vector_ = index % kAccessesPerVector;
-    int residual_access = index / kAccessesPerVector;
-    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
-  }
-
-  CUTLASS_HOST_DEVICE
-  void advance() {
-    
-    // moves to the next GEMM-K offset (offset_npq_) in GEMM-B by a CTA-K tile
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      offset_npq_[s] += Shape::kRow * problem_size_.split_k_slices;
-    }
-  }
-
-  /// Returns the coordinate in the activation tensor x that is currently pointed to
-  /// by the iterator.
-  CUTLASS_HOST_DEVICE
-  TensorCoord at() const {
-    int r = precomputed_filter_r_[iteration_contiguous_];
-    int s = precomputed_filter_s_[iteration_contiguous_];
-    int c = filter_c_[iteration_contiguous_];
-
-    if (kAccessesPerVector > 1) {
-      // This code section is only to support non-128b alignment
-      // Multiple access to support non-128b alignment in contiguous dimension
-      int wrap_c;
-      params_.c_divmod(wrap_c, c, c + iteration_vector_ * AccessType::kElements);
-
-      if (problem_size_.mode == Mode::kConvolution) {
-        s -= (problem_size_.dilation_w * wrap_c);
-        
-        int wrap_s;
-        params_.s_divmod(wrap_s, s, params_.small_channel_conv_s_offset - s);
-        s = params_.small_channel_conv_s_offset - s;
-
-        r -= (problem_size_.dilation_h * wrap_s);
-
-      } else {
-        s += (problem_size_.dilation_w * wrap_c);
-
-        int wrap_s;
-        params_.s_divmod(wrap_s, s, s + problem_size_.pad_w);
-        s -= problem_size_.pad_w;
-
-        r += (problem_size_.dilation_h * wrap_s);
-      }
-    }
-
-    // The subseqnet fast_divmod() operations are equivalent to the following logical computation:
-    //
-    //
-    // int n = offset_npq_[iteration_strided_] / (problem_size_.P * problem_size_.Q);
-    // int residual = offset_npq_[iteration_strided_] % (problem_size_.P * problem_size_.Q);
-    //
-    // int p = residual / problem_size_.Q;
-    // int q = residual % problem_size_.Q;
-
-    int residual, n, p, q;
-    
-    params_.pq_divmod(n, residual, offset_npq_[iteration_strided_]);
-    params_.q_divmod(p, q, residual);
-
-    int h = p * problem_size_.stride_h + r;
-    int w = q * problem_size_.stride_w + s;
-
-    return TensorCoord(n, h, w, c);
-  }
-
-  /// Returns true if the current coordinate is within the activation tensor x
-  CUTLASS_HOST_DEVICE
-  bool valid() const {
-    TensorCoord coord = at();
-
-    return coord.n() < problem_size_.N &&
-      coord.h() >= 0 && coord.h() < problem_size_.H &&
-      coord.w() >= 0 && coord.w() < problem_size_.W;
-  }
-
-  /// Returns a pointer to the vector starting at the current coordinate
-  CUTLASS_HOST_DEVICE
-  AccessType const *get() const {
-
-    TensorCoord coord = at();
-    LongIndex offset = params_.layout(coord);
-
-    return reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
-  }
-
-  /// Increments to the next memory access
-  CUTLASS_HOST_DEVICE
-  Conv2dWgradActivationTileAccessIteratorOptimized &operator++() {
-    ++iteration_vector_;
-    if (iteration_vector_ < kAccessesPerVector) {
-      return *this;
-    }
-    iteration_vector_ = 0;
-
-    ++iteration_contiguous_;
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
-      return *this;
-    }
-    iteration_contiguous_ = 0;
-    ++iteration_strided_;
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      return *this;
-    }
-    iteration_strided_ = 0;
- 
-    return *this;
-  }
-
-  /// Determines whether the Implicit GEMM can execute the given problem.
-  CUTLASS_HOST_DEVICE
-  static Status can_implement(Conv2dProblemSize const &problem_size) {
-
-    // check alignment constraint on iterator's contiguous dimension
-    if (problem_size.C % AccessType::kElements) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    return Status::kSuccess;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h
deleted file mode 100755
index 793649dbe..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h
+++ /dev/null
@@ -1,260 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile) 
-    matrix from memory.
-
-    This iterator assumes TensorNHWC layout of tensors in Global Memory.
-
-    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
-    backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv3d_problem_size.h"
-#include "cutlass/conv/threadblock/conv2d_params.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Shape_,
-  typename Element_,
-  typename ThreadMap_,
-  typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess>
->
-class Conv2dWgradOutputGradientTileAccessIteratorAnalytic {
-public:
-
-  //
-  // Types
-  //
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::TensorNHWC;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-  using TensorRef = cutlass::TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic;
-  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
-  static int const kConvDim = 2;
-  using ConvProblemSize = typename conv::Conv2dProblemSize;
-
-  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
-  
-  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), 
-    "Vectors implied by the thread map must be divisible by the access type.");
-
-  static_assert(sizeof_bits<Element>::value >= 8,
-    "WGRAD requires elements of size 8b or greater.");
-
-  //
-  // Parameters structure
-  //
-
-  using Params = Conv2dAnalyticParams<Layout>;
-
-private:
-
-  Params const &params_;
-  Conv2dProblemSize const &problem_size_;
-  LongIndex iteration_contiguous_;
-  LongIndex iteration_strided_;
-  LongIndex iteration_vector_;
-  char const *pointer_;
-
-  int filter_k_[ThreadMap::Iterations::kContiguous];
-
-  int offset_npq_[ThreadMap::Iterations::kStrided];
-
-public:
-
-  CUTLASS_HOST_DEVICE
-  Conv2dWgradOutputGradientTileAccessIteratorAnalytic(
-    Params const &params, 
-    Conv2dProblemSize const &problem_size,
-    Element const *ptr,
-    int thread_idx,
-    MatrixCoord const &threadblock_offset = MatrixCoord()
-  ):
-    params_(params), 
-    problem_size_(problem_size), 
-    pointer_(reinterpret_cast<char const *>(ptr)) {
-
-    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
-    
-    // initialize filter_k for every contiguous iteration
-    CUTLASS_PRAGMA_UNROLL
-    for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-      filter_k_[c] = threadblock_offset.row() + thread_coord.contiguous() 
-                        + c * ThreadMap::Delta::kContiguous;
-    }
-
-    // initialize n, p, q offset for every strided iteration
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      offset_npq_[s] = threadblock_offset.column() + thread_coord.strided() 
-                      + s * ThreadMap::Delta::kStrided;  
-      
-    }
-  }
-
-  CUTLASS_HOST_DEVICE
-  static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) {
-    return Params(problem_size, layout);
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(Index index) {
-    iteration_vector_ = index % kAccessesPerVector;
-    int residual_access = index / kAccessesPerVector;
-    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
-  }
-
-  CUTLASS_HOST_DEVICE
-  void advance() {
-    // moves to the next GEMM-K offset (offset_npq_) in GEMM-A by a CTA-K tile
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      offset_npq_[s] += Shape::kColumn * problem_size_.split_k_slices;
-    }
-  }
-
-  /// Returns the coordinate in the output gradient tensor Dy that is currently pointed to
-  /// by the iterator.
-  CUTLASS_HOST_DEVICE
-  TensorCoord at() const {
-
-    int npq = offset_npq_[iteration_strided_];
-
-    int n = npq / (problem_size_.P * problem_size_.Q);
-    int residual = npq % (problem_size_.P * problem_size_.Q);
-
-    int p = residual / problem_size_.Q;
-    int q = residual % problem_size_.Q;
-
-    int k = filter_k_[iteration_contiguous_] + iteration_vector_ * AccessType::kElements;
-
-    return TensorCoord(n, p, q, k);
-  }
-
-
-  /// Returns true if the current coordinate is within the output gradient tensor Dy
-  CUTLASS_HOST_DEVICE
-  bool valid() const {
-    TensorCoord coord = at();
-
-    return coord.n() < problem_size_.N &&
-      coord.h() < problem_size_.P &&
-      coord.w() < problem_size_.Q &&
-      coord.c() < problem_size_.K;
-  }
-
-  /// Returns a pointer to the vector starting at the current coordinate
-  CUTLASS_HOST_DEVICE
-  AccessType const *get() const {
-
-    TensorCoord coord = at();
-    LongIndex offset = params_.layout(coord);
-
-    return reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
-  }
-
-  /// Increments to the next memory access
-  CUTLASS_HOST_DEVICE
-  Conv2dWgradOutputGradientTileAccessIteratorAnalytic &operator++() {
-    ++iteration_vector_;
-    if (iteration_vector_ < kAccessesPerVector) {
-      return *this;
-    }
-    iteration_vector_ = 0;
-
-    ++iteration_contiguous_;
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
-      return *this;
-    }
-    iteration_contiguous_ = 0;
-    ++iteration_strided_;
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      return *this;
-    }
-    iteration_strided_ = 0;
- 
-    return *this;
-  }
-
-  /// Determines whether the Implicit GEMM can execute the given problem.
-  CUTLASS_HOST_DEVICE
-  static Status can_implement(Conv2dProblemSize const &problem_size) {
-
-    // check alignment constraint on iterator's contiguous dimension
-    if (problem_size.K % AccessType::kElements) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    return Status::kSuccess;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h
deleted file mode 100755
index 07233d892..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h
+++ /dev/null
@@ -1,310 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile) 
-    matrix from memory.
-
-    This iterator assumes TensorNHWC layout of tensors in Global Memory.
-
-    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
-    backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Shape_,
-  typename Element_,
-  typename ThreadMap_,
-  typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess>
->
-class Conv2dWgradOutputGradientTileAccessIteratorOptimized {
-public:
-
-  //
-  // Types
-  //
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::TensorNHWC;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-  using TensorRef = cutlass::TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized;
-  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
-  static int const kConvDim = 2;
-  using ConvProblemSize = typename conv::Conv2dProblemSize;
-
-  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
-  
-  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), 
-    "Vectors implied by the thread map must be divisible by the access type.");
-
-  static_assert(sizeof_bits<Element>::value >= 8,
-    "WGRAD requires elements of size 8b or greater.");
-
-  //
-  // Parameters structure
-  //
-
-  using Params = Conv2dWgradOutputGradientIteratorOptimizedParams;
-
-private:
-
-  Conv2dWgradOutputGradientIteratorOptimizedParams const &params_;
-  Conv2dProblemSize const &problem_size_;
-  LongIndex iteration_contiguous_;
-  LongIndex iteration_strided_;
-  LongIndex iteration_vector_;
-  char const *pointer_;
-
-  uint32_t predicates_[kAccessesPerVector];
-  int filter_k_;
-  int offset_npq_;
-
-public:
-
-  CUTLASS_HOST_DEVICE
-  Conv2dWgradOutputGradientTileAccessIteratorOptimized(
-    Conv2dWgradOutputGradientIteratorOptimizedParams const &params,
-    Conv2dProblemSize const &problem_size,
-    Element const *ptr,
-    int thread_idx,
-    MatrixCoord const &threadblock_offset = MatrixCoord()
-  ):
-    params_(params), 
-    problem_size_(problem_size), 
-    pointer_(reinterpret_cast<char const *>(ptr)),
-    predicates_{0},
-    filter_k_(0),
-    offset_npq_(0) {
-
-    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
-
-    filter_k_ = threadblock_offset.row() + thread_coord.contiguous();
-    offset_npq_ = threadblock_offset.column() + thread_coord.strided();
-    
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-
-        int filter_k = filter_k_ + c * ThreadMap::Delta::kContiguous;
-        int offset_npq = offset_npq_ + s * ThreadMap::Delta::kStrided;
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < kAccessesPerVector; ++v) {
-          bool predicate = valid_(at_(offset_npq, filter_k + v * AccessType::kElements));
-  
-          uint32_t pred = (predicate ? 1u : 0);
-  
-          int pred_idx = c + s * ThreadMap::Iterations::kContiguous;
- 
-          predicates_[v] |= (pred << pred_idx);
-        }
-      }
-    }
-
-    // Offset pointer to (iteration_strided_, iteration_contiguous_) = (0, 0) 
-    pointer_ += (
-      offset_npq_ * params.layout.stride()[0] + filter_k_
-    ) * sizeof_bits<Element>::value / 8;
-
-    set_iteration_index(0);
-  }
-
-  CUTLASS_HOST_DEVICE
-  static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) {
-    return Params(problem_size,
-                  layout,
-                  sizeof_bits<Element>::value,
-                  {Shape::kRow, Shape::kColumn},
-                  ThreadMap::kThreads,
-                  ThreadMap::kElementsPerAccess,
-                  {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided},
-                  {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided});
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(Index index) {
-    iteration_vector_ = index % kAccessesPerVector;
-    int residual_access = index / kAccessesPerVector;
-    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
-  }
-
-  CUTLASS_HOST_DEVICE
-  void advance() {
-    // moves to the next GEMM-K offset (offset_npq_) in GEMM-A by a CTA-K tile
-    offset_npq_ += Shape::kColumn * problem_size_.split_k_slices;
-
-    // Clear predicates if needed
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      if (offset_npq_ + s * ThreadMap::Delta::kStrided >= params_.NPQ) {
-        uint32_t kClearMask = ((1u << ThreadMap::Iterations::kContiguous) - 1) << (s * ThreadMap::Iterations::kContiguous); 
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < kAccessesPerVector; ++v) {
-          predicates_[v] = (predicates_[v] & (~kClearMask));
-        }
-      }
-    }
-
-    pointer_ += params_.inc_next_npq; 
-  }
-
-private:
-  /// Returns the coordinate in the output gradient tensor Dy that is pointed to
-  /// by offset_npq and k.
-  CUTLASS_HOST_DEVICE
-  TensorCoord at_(int offset_npq, int k) const {
-
-    // The subsequent fast_divmod() operations are equivalent to the following logical computation:
-    //
-    //
-    // int npq = offset_npq;
-    // int n = npq / (problem_size_.P * problem_size_.Q);
-    // int residual = npq % (problem_size_.P * problem_size_.Q);
-    // 
-    // int p = residual / problem_size_.Q;
-    // int q = residual % problem_size_.Q;
-    
-    int residual, n, p, q;
-    
-    params_.pq_divmod(n, residual, offset_npq);
-    params_.q_divmod(p, q, residual);
-
-    return TensorCoord(n, p, q, k);
-  }
-  
-  /// Returns true if the coord is within the output gradient tensor Dy
-  CUTLASS_HOST_DEVICE
-  bool valid_(TensorCoord coord) const {
-
-    return coord.n() < problem_size_.N &&
-      coord.c() < problem_size_.K;
-  }
-
-public:
-
-  /// Returns true if the current coordinate is within the output gradient tensor Dy
-  CUTLASS_HOST_DEVICE
-  bool valid() const {
-
-    LongIndex pred_idx = iteration_contiguous_ + iteration_strided_ * ThreadMap::Iterations::kContiguous;
-    return (predicates_[iteration_vector_] & (1u << pred_idx));
-  }
-
-  /// Returns a pointer to the vector starting at the current coordinate
-  CUTLASS_HOST_DEVICE
-  AccessType const *get() const {
-
-    return reinterpret_cast<AccessType const *>(
-      pointer_ +
-      iteration_strided_ * params_.offset_next_strided + 
-      iteration_contiguous_ * params_.offset_next_contiguous
-    ) + iteration_vector_;
-  }
-
-  /// Increments to the next memory access
-  CUTLASS_HOST_DEVICE
-  Conv2dWgradOutputGradientTileAccessIteratorOptimized &operator++() {
-    ++iteration_vector_;
-    if (iteration_vector_ < kAccessesPerVector) {
-      return *this;
-    }
-    iteration_vector_ = 0;
-
-    ++iteration_contiguous_;
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
-      return *this;
-    }
-    iteration_contiguous_ = 0;
-    ++iteration_strided_;
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      return *this;
-    }
-    iteration_strided_ = 0;
- 
-    return *this;
-  }
-
-  /// Determines whether the Implicit GEMM can execute the given problem.
-  CUTLASS_HOST_DEVICE
-  static Status can_implement(Conv2dProblemSize const &problem_size) {
-
-    // check alignment constraint on iterator's contiguous dimension
-    if (problem_size.K % AccessType::kElements) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    return Status::kSuccess;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h
deleted file mode 100755
index 943ab88cf..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h
+++ /dev/null
@@ -1,268 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) 
-    matrix from memory.
-
-    This iterator assumes TensorNDHWC layout of tensors in Global Memory.
-
-    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
-    backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv3d_problem_size.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Shape_,
-  typename Element_,
-  typename ThreadMap_
->
-class Conv3dDgradFilterTileAccessIteratorAnalytic {
-public:
-  
-  //
-  // Types
-  //
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::TensorNDHWC;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
-  using TensorRef = cutlass::TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic;
-  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
-  static int const kConvDim = 3;
-  using ConvProblemSize = typename conv::Conv3dProblemSize;
-  static int const kAccessesPerVector = 1;
-  
-  static_assert(sizeof_bits<Element>::value >= 8, 
-    "DGRAD requires elements of size 8b or larger.");
-  
-  //
-  // Parameters structure
-  //
-
-  struct Params {
-
-    Layout layout;
-
-    //
-    // Methods
-    //
-    CUTLASS_HOST_DEVICE
-    Params() { }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      Conv3dProblemSize const &problem_size, 
-      Layout const &layout
-    ): layout(layout) {
-
-    }
-  };
-
-private:
-
-  Params const &params_;
-  Conv3dProblemSize const &problem_size_;
-  LongIndex iteration_contiguous_;
-  LongIndex iteration_strided_;
-  char const *pointer_;
-
-  // For a fixed filter position (t,r,s) find and fill offset_k_, offset_c_ in strided and contiguous dimension 
-  int filter_t_;
-  int filter_r_;
-  int filter_s_;
-  int offset_k_[ThreadMap::Iterations::kStrided]; 
-  int offset_c_[ThreadMap::Iterations::kContiguous];
-
-public:
-
-  CUTLASS_HOST_DEVICE
-  Conv3dDgradFilterTileAccessIteratorAnalytic(
-    Params const &params, 
-    Conv3dProblemSize const &problem_size,
-    Element const *ptr,
-    int thread_idx,
-    MatrixCoord const &threadblock_offset = MatrixCoord()
-  ):
-    params_(params), 
-    problem_size_(problem_size), 
-    pointer_(reinterpret_cast<char const *>(ptr)),
-    filter_t_(0), 
-    filter_r_(0),
-    filter_s_(0) {
-
-    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-      offset_c_[c] = threadblock_offset.column() + thread_coord.contiguous() 
-        + c * ThreadMap::Delta::kContiguous;
-    }
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      offset_k_[s] = 
-        threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
-    }
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(Index index) {
-    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
-  }
-
-  CUTLASS_HOST_DEVICE
-  void advance() {
-    // moves to the next tile
-    ++filter_s_;
-    if (filter_s_ < problem_size_.S) {
-      return;
-    }
-    filter_s_ = 0;
-    ++filter_r_;
-    if (filter_r_ < problem_size_.R) {
-      return;
-    }
-    filter_r_ = 0;
-     ++filter_t_;
-    if (filter_t_ < problem_size_.T) {
-      return;
-    }
-    filter_t_ = 0;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      offset_k_[s] += Shape::kRow * problem_size_.split_k_slices;
-    }
-  }
-
-  /// Returns the coordinate in the filter tensor w that is currently pointed to
-  /// by the iterator.
-  CUTLASS_HOST_DEVICE
-  TensorCoord at() const {
-
-    int c = offset_c_[iteration_contiguous_];
-    int k = offset_k_[iteration_strided_];
-
-    return TensorCoord(k, filter_t_, filter_r_, filter_s_, c);
-  }
-
-  /// Returns true if the current coordinate is within the filter tensor w
-  CUTLASS_HOST_DEVICE
-  bool valid() const {
-
-    TensorCoord coord = at();
-
-    return coord.n() < problem_size_.K && coord.c() < problem_size_.C;
-  }
-
-  /// Returns a pointer to the vector starting at the current coordinate
-  CUTLASS_HOST_DEVICE
-  AccessType const *get() const {
-
-    TensorCoord coord = at();
-    LongIndex offset = params_.layout(coord);
-
-    return reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
-
-  }
-
-  /// Increments to the next memory access
-  CUTLASS_HOST_DEVICE
-  Conv3dDgradFilterTileAccessIteratorAnalytic &operator++() {
-    ++iteration_contiguous_;
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
-      return *this;
-    }
-    iteration_contiguous_ = 0;
-    ++iteration_strided_;
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      return *this;
-    }
-    iteration_strided_ = 0;
- 
-    return *this;
-  }
-
-  /// Determines whether the Implicit GEMM can execute the given problem.
-  CUTLASS_HOST_DEVICE
-  static Status can_implement(Conv3dProblemSize const &problem_size) {
-
-    // check alignment constraint on iterator's contiguous dimension
-    if (problem_size.C % AccessType::kElements) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    return Status::kSuccess;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_optimized.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_optimized.h
deleted file mode 100755
index 2d5837dd3..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_optimized.h
+++ /dev/null
@@ -1,289 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) 
-    matrix from memory.
-
-    This iterator assumes TensorNHWC layout of tensors in Global Memory.
-
-    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
-    backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv3d_problem_size.h"
-
-#include "cutlass/conv/threadblock/conv3d_params.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Shape_,
-  typename Element_,
-  typename ThreadMap_,
-  conv::StrideSupport StrideSupport_ = conv::StrideSupport::kUnity
->
-class Conv3dDgradFilterTileAccessIteratorOptimized {
-public:
-  
-  //
-  // Types
-  //
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::TensorNDHWC;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
-  using TensorRef = cutlass::TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized;
-  static StrideSupport const kStrideSupport = StrideSupport_;
-  static int const kConvDim = 3;
-  using ConvProblemSize = typename conv::Conv3dProblemSize;
-  static int const kAccessesPerVector = 1;
-  
-  //
-  // Parameters structure
-  //
-
-  struct Params : Conv3dDgradFilterIteratorOptimizedParams {
-
-    //
-    // Methods
-    //
-    CUTLASS_HOST_DEVICE
-    Params() { }
-
-    CUTLASS_HOST_DEVICE
-    Params(Conv3dDgradFilterIteratorOptimizedParams const &base): 
-      Conv3dDgradFilterIteratorOptimizedParams(base) { }
-      
-    CUTLASS_HOST_DEVICE
-    Params(
-      Conv3dProblemSize const &problem_size, 
-      Layout const &layout
-    ):
-      Conv3dDgradFilterIteratorOptimizedParams(
-        problem_size,
-        layout,
-        sizeof_bits<Element>::value,
-        {Shape::kRow, Shape::kColumn},
-        ThreadMap::kThreads,
-        ThreadMap::kElementsPerAccess,
-        {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided},
-        {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided}
-      ) { }
-
-  };
-
-private:
-
-  Conv3dDgradFilterIteratorOptimizedParams const &params_;
-  Conv3dProblemSize const &problem_size_;
-  LongIndex iteration_contiguous_;
-  LongIndex iteration_strided_;
-  char const *pointer_;
-
-  uint32_t predicates_;
-  int filter_trs_;
-  int filter_k_;
-
-  //
-  // Assertions
-  //
-
-  // We map predicates into bits packed in this uint32_t container
-  static_assert(ThreadMap::Iterations::kStrided *
-    ThreadMap::Iterations::kContiguous < sizeof(predicates_) * 8,
-    "Currently, the number of loads per iteration is limited by the size of the predicates container.");
-
-public:
-
-  CUTLASS_HOST_DEVICE
-  Conv3dDgradFilterTileAccessIteratorOptimized(
-    Conv3dDgradFilterIteratorOptimizedParams const &params,
-    Conv3dProblemSize const &problem_size,
-    Element const *ptr,
-    int thread_idx,
-    MatrixCoord const &threadblock_offset = MatrixCoord()
-  ):
-    params_(params), 
-    problem_size_(problem_size),
-    pointer_(reinterpret_cast<char const *>(ptr)),
-    predicates_(0),
-    filter_trs_(0),
-    filter_k_(0) {
-
-    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
-
-    filter_k_ = threadblock_offset.row() + thread_coord.strided();
-    Index column = threadblock_offset.column() + thread_coord.contiguous();
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-
-        int filter_k = filter_k_ + s * ThreadMap::Delta::kStrided;
-        int filter_c = column + c * ThreadMap::Delta::kContiguous;
-
-        uint32_t pred = ((filter_k < problem_size_.K && filter_c < problem_size_.C) ? 1u : 0);
-
-        int pred_idx = c + s * ThreadMap::Iterations::kContiguous;
-        
-        predicates_ |= (pred << pred_idx);
-      }
-    }
-
-    pointer_ += (
-      filter_k_ * params.layout.stride()[3] + column
-    ) * sizeof_bits<Element>::value / 8;
-
-    set_iteration_index(0);
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(Index index) {
-    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-
-    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
-  }
-
-  CUTLASS_HOST_DEVICE
-  void advance() {
-
-    LongIndex next = params_.inc_next_trs;
-
-    // moves to the next tile
-    ++filter_trs_;
-    if (filter_trs_ == params_.TRS) {
-
-      filter_trs_ = 0;
-      next = params_.inc_next_k;
-      filter_k_ += params_.filter_k_delta;
-    }
-
-    // Clear predicates if needed
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      if (filter_k_ + s * ThreadMap::Delta::kStrided >= problem_size_.K) {
-        uint32_t kClearMask = ((1u << ThreadMap::Iterations::kContiguous) - 1) << (s * ThreadMap::Iterations::kContiguous);
-
-        predicates_ = (predicates_ & (~kClearMask));
-      }
-    }
-      
-    pointer_ += next;
-  }
-
-  /// Returns true if the current coordinate is within the filter tensor W
-  CUTLASS_HOST_DEVICE
-  bool valid() {
-    LongIndex pred_idx = iteration_contiguous_ + iteration_strided_ * ThreadMap::Iterations::kContiguous;
-    return (predicates_ & (1u << pred_idx));
-  }
-
-  /// Returns a pointer to the vector starting at the current coordinate
-  CUTLASS_HOST_DEVICE
-  AccessType const *get() const {
-    return reinterpret_cast<AccessType const *>(pointer_ + 
-      iteration_contiguous_ * ThreadMap::Delta::kContiguous * sizeof_bits<Element>::value / 8);
-  }
-
-  /// Increments to the next memory access
-  CUTLASS_HOST_DEVICE
-  Conv3dDgradFilterTileAccessIteratorOptimized &operator++() {
-    ++iteration_contiguous_;
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
-      return *this;
-    }
-    iteration_contiguous_ = 0;
-    
-    ++iteration_strided_;
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-
-      // Move to the next K coordinate within the tile
-      pointer_ += params_.inc_next_strided;
-
-      return *this;
-    }
-    iteration_strided_ = 0;
- 
-    return *this;
-  }
-
-  /// Determines whether the Implicit GEMM can execute the given problem.
-  CUTLASS_HOST_DEVICE
-  static Status can_implement(Conv3dProblemSize const &problem_size) {
-
-    // check alignment constraint on iterator's contiguous dimension
-    if (problem_size.C % AccessType::kElements) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    return Status::kSuccess;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h
deleted file mode 100755
index 30b7f2fcf..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h
+++ /dev/null
@@ -1,343 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile) 
-    matrix from memory.
-
-    This iterator assumes TensorNDHWC layout of tensors in Global Memory.
-
-    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
-    backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv3d_problem_size.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-template <
-  typename Shape_,
-  typename Element_,
-  typename ThreadMap_,
-  conv::StrideSupport StrideSupport_ = conv::StrideSupport::kStrided
->
-class Conv3dDgradOutputGradientTileAccessIteratorAnalytic;
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Conv3dDgradOutputGradientTileAccessIteratorAnalytic strided dgrad needs special handling using
-// unscaled coordinations
-template <
-  typename Shape_,
-  typename Element_,
-  typename ThreadMap_
->
-class Conv3dDgradOutputGradientTileAccessIteratorAnalytic <
-  Shape_,
-  Element_,
-  ThreadMap_,
-  conv::StrideSupport::kStrided
-> {
-public:
-
-  //
-  // Types
-  //
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::TensorNDHWC;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
-  using TensorRef = cutlass::TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic;
-  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
-  static int const kConvDim = 3;
-  using ConvProblemSize = typename conv::Conv3dProblemSize;
-  static int const kAccessesPerVector = 1;
-  
-  static_assert(sizeof_bits<Element>::value >= 8,
-    "DGRAD requires elements of size 8b or greater.");
- 
-  //
-  // Simpligying assertions
-  //
-
-  static_assert(ThreadMap::Iterations::kContiguous == 1,
-    "Require Iterations::kContiguous == 1");
-
-  //
-  // Parameters structure
-  //
-
-  struct Params {
-
-    Layout layout;
-
-    //
-    // Methods
-    //
-    CUTLASS_HOST_DEVICE
-    Params() { }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ConvProblemSize const &problem_size, 
-      Layout const &layout
-    ): layout(layout) {
-
-    }
-  };
-
-private:
-
-  Params const &params_;
-  ConvProblemSize const &problem_size_;
-  LongIndex iteration_contiguous_;
-  LongIndex iteration_strided_;
-  char const *pointer_;
-
-  int filter_k_;
-  int filter_t_;
-  int filter_r_;
-  int filter_s_;
-
-  int offset_n_[ThreadMap::Iterations::kStrided];
-  int offset_d_[ThreadMap::Iterations::kStrided];
-  int offset_w_[ThreadMap::Iterations::kStrided];
-  int offset_h_[ThreadMap::Iterations::kStrided];
-  
-private:
-
-  /// Returns the coordinate in the output tensor Dy that is currently pointed to
-  /// by the iterator but DOES NOT scale by the convolution stride. This is needed
-  /// to compute predicates in the valid() method. The return value of the public at()
-  /// method is correctly scaled.
-  CUTLASS_HOST_DEVICE
-  TensorCoord unscaled_at_() const {
-    int n = offset_n_[iteration_strided_];
-    int d = offset_d_[iteration_strided_];
-    int h = offset_h_[iteration_strided_];
-    int w = offset_w_[iteration_strided_];
-
-    int t = filter_t_;
-    int r = filter_r_;
-    int s = filter_s_;
-
-    if (problem_size_.mode == Mode::kConvolution) {
-      t = (problem_size_.T - 1 - t);
-      r = (problem_size_.R - 1 - r);
-      s = (problem_size_.S - 1 - s);
-    }
-
-    int z = (d + problem_size_.pad_d - t * problem_size_.dilation_d);
-    int p = (h + problem_size_.pad_h - r * problem_size_.dilation_h);
-    int q = (w + problem_size_.pad_w - s * problem_size_.dilation_w);
-
-    return TensorCoord(n, z, p, q, filter_k_);
-  }
-
-public:
-
-  CUTLASS_HOST_DEVICE
-  Conv3dDgradOutputGradientTileAccessIteratorAnalytic(
-    Params const &params, 
-    ConvProblemSize const &problem_size,
-    Element const *ptr,
-    int thread_idx,
-    MatrixCoord const &threadblock_offset = MatrixCoord()     // threadblock offset - units are whole CTA tiles
-  ):
-    params_(params), 
-    problem_size_(problem_size), 
-    pointer_(reinterpret_cast<char const *>(ptr)), 
-    filter_k_(0), 
-    filter_t_(0),
-    filter_r_(0), 
-    filter_s_(0) {
-
-    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
-
-    filter_k_ = threadblock_offset.column() + thread_coord.contiguous();
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      int offset_ndhw = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
-
-      offset_n_[s] = offset_ndhw / (problem_size_.D * problem_size_.H * problem_size_.W);
-      int residual = offset_ndhw % (problem_size_.D * problem_size_.H * problem_size_.W);
-
-      offset_d_[s] = residual / (problem_size_.H * problem_size_.W);
-      residual     = residual % (problem_size_.H * problem_size_.W);
-
-      offset_h_[s] = residual / problem_size_.W;
-      offset_w_[s] = residual % problem_size_.W;
-    }
-  }
-
-  CUTLASS_HOST_DEVICE
-  static Params getParams(Conv3dProblemSize const &problem_size, Layout const &layout) {
-    return Params(problem_size, layout);
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(Index index) {
-    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
-  }
-
-  CUTLASS_HOST_DEVICE
-  void advance() {
-    // move to the next tile
-    ++filter_s_;
-    if (filter_s_ < problem_size_.S) {
-      return;
-    }
-    filter_s_  = 0;
-    ++filter_r_;
-    if (filter_r_ < problem_size_.R) {
-      return;
-    }
-    filter_r_ = 0;
-    ++filter_t_;
-    if (filter_t_ < problem_size_.T) {
-      return;
-    }
-    filter_t_ = 0;
-
-    filter_k_ += Shape_::kColumn * problem_size_.split_k_slices;
-  }
-
-  /// Returns the coordinate in the output tensor Dy that is currently pointed to
-  /// by the iterator.
-  CUTLASS_HOST_DEVICE
-  TensorCoord at() const {
-
-    TensorCoord coord = unscaled_at_();
-
-    return TensorCoord(
-      coord.n(), 
-      coord.d() / problem_size_.stride_d, 
-      coord.h() / problem_size_.stride_h, 
-      coord.w() / problem_size_.stride_w, 
-      coord.c());
-  }
-
-
-  /// Returns true if the current coordinate is within the output tensor Dy
-  CUTLASS_HOST_DEVICE
-  bool valid() const {
-
-    TensorCoord unscaled_coord = unscaled_at_();
-    TensorCoord coord = at();
-
-    return 
-      !(unscaled_coord.d() % problem_size_.stride_d) &&
-      !(unscaled_coord.h() % problem_size_.stride_h) && 
-      !(unscaled_coord.w() % problem_size_.stride_w) &&
-      coord.n() < problem_size_.N &&
-      coord.d() >= 0 && coord.d() < problem_size_.Z &&
-      coord.h() >= 0 && coord.h() < problem_size_.P &&
-      coord.w() >= 0 && coord.w() < problem_size_.Q &&
-      coord.c() < problem_size_.K;
-  }
-
-  /// Returns a pointer to the vector starting at the current coordinate
-  CUTLASS_HOST_DEVICE
-  AccessType const *get() const {
-
-    TensorCoord coord = at();
-    LongIndex offset = params_.layout(coord);
-
-    return reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
-  }
-
-  /// Increments to the next memory access
-  CUTLASS_HOST_DEVICE
-  Conv3dDgradOutputGradientTileAccessIteratorAnalytic &operator++() {
-    ++iteration_contiguous_;
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
-      return *this;
-    }
-    iteration_contiguous_ = 0;
-    ++iteration_strided_;
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      return *this;
-    }
-    iteration_strided_ = 0;
- 
-    return *this;
-  }
-
-  /// Determines whether the Implicit GEMM can execute the given problem.
-  CUTLASS_HOST_DEVICE
-  static Status can_implement(ConvProblemSize const &problem_size) {
-
-    // check alignment constraint on iterator's contiguous dimension
-    if (problem_size.K % AccessType::kElements) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    return Status::kSuccess;
-  }
-  
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h
deleted file mode 100755
index 5a53c8cbd..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h
+++ /dev/null
@@ -1,489 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile) 
-    matrix from memory.
-
-    This iterator assumes TensorNDHWC layout of tensors in Global Memory.
-
-    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
-    backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv3d_problem_size.h"
-#include "cutlass/conv/threadblock/conv3d_params.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Shape_,
-  typename Element_,
-  typename ThreadMap_,
-  conv::StrideSupport StrideSupport_ = conv::StrideSupport::kUnity
->
-class Conv3dDgradOutputGradientTileAccessIteratorOptimized {
-public:
-
-  static_assert(StrideSupport_ == conv::StrideSupport::kUnity,
-    "Only unit-stride dgrad is supported at this time.");
-
-  //
-  // Types
-  //
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::TensorNDHWC;
-  using TensorCoord = typename Layout::TensorCoord;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
-  using TensorRef = cutlass::TensorRef<Element, Layout>;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized;
-  static StrideSupport const kStrideSupport = conv::StrideSupport::kUnity;
-  static int const kConvDim = 3;
-  using ConvProblemSize = typename conv::Conv3dProblemSize;
-  using Coord3D = Coord<3>;
-  static int const kAccessesPerVector = 1;
-  using Mask = uint64_t;
-
-  //
-  // Simplifying assertions
-  //
-  static_assert(ThreadMap::Iterations::kContiguous == 1,
-    "Require Iterations::kContiguous == 1");
-
-  //
-  // Parameters structure
-  //
-
-  using Params = Conv3dDgradOutputGradientIteratorOptimizedParams;
-
-private:
-
-  Params const &params_;
-  ConvProblemSize const &problem_size_;
-  LongIndex iteration_contiguous_;
-  LongIndex iteration_strided_;
-
-
-  // One pointer per access
-  char const *pointer_[ThreadMap::Iterations::kStrided];
-
-  // current filter position (t, r, s)
-  int filter_t_;
-  int filter_r_;
-  int filter_s_;
-  int filter_k_;
-
-  Index masks_[ThreadMap::Iterations::kStrided][3];
-
-public:
-
-  CUTLASS_HOST_DEVICE
-  Conv3dDgradOutputGradientTileAccessIteratorOptimized(
-    Params const &params,
-    ConvProblemSize const &problem_size,
-    Element const *ptr,
-    int thread_idx,
-    MatrixCoord const &threadblock_offset = MatrixCoord()       // tile index - units are threadblock-scoped tiles
-  ):
-    params_(params), 
-    problem_size_(problem_size),
-    filter_k_(0), 
-    filter_t_(0),
-    filter_r_(0), 
-    filter_s_(0) {
-
-    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
-
-    filter_k_ = threadblock_offset.column() + thread_coord.contiguous();
-
-    int offset_n[ThreadMap::Iterations::kStrided];
-    int offset_d[ThreadMap::Iterations::kStrided];
-    int offset_h[ThreadMap::Iterations::kStrided];
-    int offset_w[ThreadMap::Iterations::kStrided];
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-
-      pointer_[s] = reinterpret_cast<char const *>(ptr);
- 
-      int offset_ndhw = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
-
-      // The subseqnet fast_divmod() operations are equivalent to the following logical computation:
-      //
-      //
-      //  offset_n[s] = offset_ndhw / (problem_size_.D * problem_size_.H * problem_size_.W);
-      //  int residual = offset_ndhw % (problem_size_.D * problem_size_.H * problem_size_.W);
-      //
-      //
-      //  offset_d[s] = residual / (problem_size_.H * problem_size_.W);
-      //  residual    = residual % (problem_size_.H * problem_size_.W);
-      //
-      //  offset_h[s] = residual / problem_size_.W;
-      //  offset_w[s] = residual % problem_size_.W;
-      //
-
-      int residual;
-
-      // input: (ndhw offset) output: (n offset and resudial (dhw offset))
-      params_.dhw_divmod(offset_n[s], residual, offset_ndhw);
-      // input: (dhw offset) output: (d offset and resudial (hw))
-      params_.hw_divmod(offset_d[s], residual, residual);
-      // input: (hw offset) output: (h offset and resudial (w offset))
-      params_.w_divmod(offset_h[s], offset_w[s], residual);
-
-      TensorCoord coord = at_(offset_n[s], offset_d[s], offset_h[s], offset_w[s], 0, 0, 0);
-
-      pointer_[s] += params_.layout(coord) * sizeof_bits<Element>::value / 8;
-    }
-
-    clear_mask();
-
-    CUTLASS_PRAGMA_NO_UNROLL
-    for (int t = 0; t < problem_size_.T; ++t) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) {
-
-        int t_ = t;
-        if (problem_size_.mode == Mode::kConvolution) {
-          t_ = problem_size_.T - 1 - t;
-        }
-
-        int z = offset_d[s_idx] + problem_size_.pad_d - t_ * problem_size_.dilation_d;
-
-        bool pred = (offset_n[s_idx] < problem_size_.N && z >= 0 && z < problem_size_.Z);
-        masks_[s_idx][0] |= (pred << t);
-      }
-    }
-
-    CUTLASS_PRAGMA_NO_UNROLL
-    for (int r = 0; r < problem_size_.R; ++r) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) {
-
-        int r_ = r;
-        if (problem_size_.mode == Mode::kConvolution) {
-          r_ = problem_size_.R - 1 - r;
-        }
-
-        int p = offset_h[s_idx] + problem_size_.pad_h - r_ * problem_size_.dilation_h;
-
-        bool pred = (p >= 0 && p < problem_size_.P);
-        masks_[s_idx][1] |= (pred << r);
-      }
-    }
-
-    CUTLASS_PRAGMA_NO_UNROLL
-    for (int s = 0; s < problem_size_.S; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) {
-
-        int s_ = s;
-        if (problem_size_.mode == Mode::kConvolution) {
-          s_ = problem_size_.S - 1 - s;
-        }
-
-        int q = offset_w[s_idx] + problem_size_.pad_w - s_ * problem_size_.dilation_w;
-
-        bool pred = (q >= 0 && q < problem_size_.Q);
-        masks_[s_idx][2] |= (pred << s);
-      }
-    }
-
-    if (filter_k_ >= problem_size.K) {
-      clear_mask();
-    }
-
-    set_iteration_index(0);
-
-  }
-
-  CUTLASS_HOST_DEVICE
-  static Params getParams(Conv3dProblemSize const &problem_size, Layout const &layout) {
-    return Params(problem_size,
-                  layout,
-                  sizeof_bits<Element>::value,
-                  {Shape::kRow, Shape::kColumn},
-                  ThreadMap::kThreads,
-                  ThreadMap::kElementsPerAccess,
-                  {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided},
-                  {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided});
-  }
-
-private:
-
-
-  /// Returns the coordinate in the output gradient tensor dy that is correspoinding to 
-  // activation ndhw and filter position k, t, r, s
-  CUTLASS_HOST_DEVICE
-  TensorCoord at_(int n, int d, int h, int w, int t, int r, int s) const {
-
-    if (problem_size_.mode == Mode::kConvolution) {
-      t = problem_size_.T - 1 - t;
-      r = problem_size_.R - 1 - r;
-      s = problem_size_.S - 1 - s;
-    }
-
-    int z = d + problem_size_.pad_d - t * problem_size_.dilation_d;
-    int p = h + problem_size_.pad_h - r * problem_size_.dilation_h;
-    int q = w + problem_size_.pad_w - s * problem_size_.dilation_w;
-
-    return TensorCoord(n, z, p, q, filter_k_);
-  }
-
-
-  /// Adds a pointer offset in units of element
-  CUTLASS_HOST_DEVICE
-  void add_byte_offset_(LongIndex byte_offset) {
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      pointer_[s] += byte_offset;
-    }
-  }
-
-  /// Clears the predicates
-  CUTLASS_HOST_DEVICE
-  void clear_mask_(bool clear) {
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-
-      // We are using inline PTX assembly here to avoid an CUDA C++ compilation
-      // artifact in which control flow instructions are generated. Instead, our
-      // intent is to predicate the mov instructions.
-      #if defined(__CUDA_ARCH__)
-      asm volatile(
-          "{\n"
-          "  .reg .pred p;\n"
-          "  .reg .u32  m;"
-          "  mov.u32 m, %2;"
-          "  setp.ne.b32 p, %1, 0;\n"
-          "  @p mov.u32 m, 0;\n"
-          "  mov.u32 %0, m;\n"
-          "}\n" 
-        :
-          "=r"(masks_[s][0])
-       : 
-          "r"((int)clear),
-          "r"(masks_[s][0])
-      );
-      asm volatile(
-          "{\n"
-          "  .reg .pred p;\n"
-          "  .reg .u32  m;"
-          "  mov.u32 m, %2;"
-          "  setp.ne.b32 p, %1, 0;\n"
-          "  @p mov.u32 m, 0;\n"
-          "  mov.u32 %0, m;\n"
-          "}\n" 
-        :
-          "=r"(masks_[s][1])
-       : 
-          "r"((int)clear),
-          "r"(masks_[s][1])
-      );
-      asm volatile(
-          "{\n"
-          "  .reg .pred p;\n"
-          "  .reg .u32  m;"
-          "  mov.u32 m, %2;"
-          "  setp.ne.b32 p, %1, 0;\n"
-          "  @p mov.u32 m, 0;\n"
-          "  mov.u32 %0, m;\n"
-          "}\n" 
-        :
-          "=r"(masks_[s][2])
-       : 
-          "r"((int)clear),
-          "r"(masks_[s][2])
-      );
-      #else
-        if (clear) {
-          masks_[s][0] = 0;
-          masks_[s][1] = 0;
-          masks_[s][2] = 0;
-        }
-      #endif
-    }
-  }
-
-public:
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(Index index) {
-    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    add_byte_offset_(pointer_offset * sizeof_bits<Element>::value / 8);
-  }
-
-
-  CUTLASS_HOST_DEVICE
-  void advance() { 
-
-    int next_idx = 0;
-
-    // moves to the next tile
-    ++filter_s_;
-    if (filter_s_ == problem_size_.S) {
-      
-      filter_s_ = 0;
-      ++filter_r_;
-      next_idx = 1;
-
-      if (filter_r_ == problem_size_.R) {
-        filter_r_ = 0;
-        ++filter_t_;
-
-        if (filter_t_ < problem_size_.T) {
-          next_idx = 2;
-        } 
-        else {
-          filter_t_ = 0;
-          next_idx = 3;
-        } 
-      }
-    }
-
-    add_byte_offset_(params_.inc_next[next_idx]);
-      
-    if (next_idx == 3) {  
-      filter_k_ += params_.filter_k_delta;
-    }
-
-    clear_mask_(filter_k_ >= problem_size_.K);
-  }
-
-
-  /// Clears the predicates
-  CUTLASS_HOST_DEVICE
-  void clear_mask() {
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      masks_[s][0] = Mask(0);
-      masks_[s][1] = Mask(0);
-      masks_[s][2] = Mask(0);
-    }
-  }
-
-  CUTLASS_HOST_DEVICE
-  bool valid() {
-
-    return 
-      (masks_[iteration_strided_][0] & (Index(1) << filter_t_)) &&
-      (masks_[iteration_strided_][1] & (Index(1) << filter_r_)) &&
-      (masks_[iteration_strided_][2] & (Index(1) << filter_s_));
-  }
-
-  /// Returns a pointer to the vector starting at the current coordinate
-  CUTLASS_HOST_DEVICE
-  AccessType const *get() const {
-
-    return reinterpret_cast<AccessType const *>(pointer_[iteration_strided_]);
-  }
-
-  /// Increments to the next memory access
-  CUTLASS_HOST_DEVICE
-  Conv3dDgradOutputGradientTileAccessIteratorOptimized &operator++() {
-
-    ++iteration_contiguous_;
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
-      return *this;
-    }
-    iteration_contiguous_ = 0;
-
-    ++iteration_strided_;
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      return *this;
-    }
-    iteration_strided_ = 0;
- 
-    return *this;
-  }
-
-  /// Determines whether the Implicit GEMM can execute the given problem.
-  CUTLASS_HOST_DEVICE
-  static Status can_implement(ConvProblemSize const &problem_size) {
-
-    // This is specialized for unit stride
-    if (problem_size.stride() != Coord3D({1, 1, 1})) {
-      return Status::kErrorNotSupported;
-    }
-
-    // check alignment constraint on iterator's contiguous dimension
-    if (problem_size.K % AccessType::kElements) {
-      return Status::kErrorNotSupported;
-    }
-
-    // Limit on filter size
-    if (problem_size.T > 32 || problem_size.R > 32 || problem_size.S > 32) {
-      return Status::kErrorNotSupported;
-    }
-    return Status::kSuccess;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h
deleted file mode 100755
index f0f9a86a3..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h
+++ /dev/null
@@ -1,291 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing loading of convolution tiles mapped to GEMM A (activation tile) 
-    matrix from memory.
-
-    This iterator assumes TensorNDHWC layout of tensors in Global Memory.
-
-    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
-    backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv3d_problem_size.h"
-#include "cutlass/conv/threadblock/conv3d_params.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Shape_,
-  typename Element_,
-  typename ThreadMap_
->
-class Conv3dFpropActivationTileAccessIteratorAnalytic {
-public:
-  
-  //
-  // Types
-  //
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::TensorNDHWC;
-  using TensorCoord = typename Layout::TensorCoord;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
-  using TensorRef = cutlass::TensorRef<Element, Layout>;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic;
-  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
-  static int const kConvDim = 3;
-  using ConvProblemSize = typename conv::Conv3dProblemSize;
-  static int const kAccessesPerVector = 1;
-  
-  //
-  // Simplifying assertions
-  //
-  static_assert(ThreadMap::Iterations::kContiguous == 1,
-    "Require Iterations::kContiguous == 1");
-
-  //
-  // Parameters structure
-  //
-
-  using Params = Conv3dAnalyticParams<Layout>;
-
-private:
-
-  Params const &params_;
-  ConvProblemSize const &problem_size_;
-  LongIndex iteration_contiguous_;
-  LongIndex iteration_strided_;
-  char const *pointer_;
-
-  int filter_t_;
-  int filter_r_;
-  int filter_s_;
-  int filter_c_;
-
-  int offset_n_[ThreadMap::Iterations::kStrided];
-  int offset_z_[ThreadMap::Iterations::kStrided];
-  int offset_p_[ThreadMap::Iterations::kStrided];
-  int offset_q_[ThreadMap::Iterations::kStrided];
-
-public:
-
-  CUTLASS_HOST_DEVICE
-  Conv3dFpropActivationTileAccessIteratorAnalytic(
-    Params const &params, 
-    ConvProblemSize const &problem_size,
-    Element const *ptr,
-    int thread_idx,
-    MatrixCoord const &threadblock_offset = MatrixCoord()       // tile index - units are threadblock-scoped tiles
-  ):
-    params_(params), 
-    problem_size_(problem_size), 
-    pointer_(reinterpret_cast<char const *>(ptr)), 
-    filter_t_(0),
-    filter_r_(0), 
-    filter_s_(0),
-    filter_c_(0) {
-
-    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
-
-    filter_c_ = threadblock_offset.column() + thread_coord.contiguous();
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      int offset_nzpq = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
-    
-      offset_n_[s] = offset_nzpq / (problem_size_.Z * problem_size_.P * problem_size_.Q);
-      int residual = offset_nzpq % (problem_size_.Z * problem_size_.P * problem_size_.Q);
-
-      offset_z_[s] = residual / (problem_size_.P * problem_size_.Q);
-      residual     = residual % (problem_size_.P * problem_size_.Q);
-
-      offset_p_[s] = residual / problem_size_.Q;
-      offset_q_[s] = residual % problem_size_.Q;
-    }
-
-    set_iteration_index(0);
-  }
-
-  CUTLASS_HOST_DEVICE
-  static Params getParams(Conv3dProblemSize const &problem_size, Layout const &layout) {
-    return Params(problem_size, layout);
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(Index index) {
-    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
-  }
-
-  CUTLASS_HOST_DEVICE
-  void advance() {
-    // moves to the next tile
-    ++filter_s_;
-    if (filter_s_ < problem_size_.S) {
-      return;
-    }
-    filter_s_ = 0;
-    ++filter_r_;
-    if (filter_r_ < problem_size_.R) {
-      return;
-    }
-    filter_r_ = 0;
-    ++filter_t_;
-    if (filter_t_ < problem_size_.T) {
-      return;
-    }
-    filter_t_ = 0;
-
-    filter_c_ += Shape::kColumn * problem_size_.split_k_slices;
-  }
-
-  /// Returns the coordinate in the activations tensor X that is currently pointed to
-  /// by the iterator.
-  CUTLASS_HOST_DEVICE
-  TensorCoord at() const {
-    int n = offset_n_[iteration_strided_];
-    int z = offset_z_[iteration_strided_];
-    int p = offset_p_[iteration_strided_];
-    int q = offset_q_[iteration_strided_];
-
-    int t = filter_t_;
-    int r = filter_r_;
-    int s = filter_s_;
-
-    if (problem_size_.mode == Mode::kConvolution) {
-      t = (problem_size_.T - 1 - filter_t_);
-      r = (problem_size_.R - 1 - filter_r_);
-      s = (problem_size_.S - 1 - filter_s_);
-    }
-
-    int d = z * problem_size_.stride_d - problem_size_.pad_d + t * problem_size_.dilation_d;
-    int h = p * problem_size_.stride_h - problem_size_.pad_h + r * problem_size_.dilation_h;
-    int w = q * problem_size_.stride_w - problem_size_.pad_w + s * problem_size_.dilation_w;
-
-    return TensorCoord(n, d, h, w, filter_c_);
-  }
-
-  /// Returns true if the current coordinate is within the activations tensor X
-  CUTLASS_HOST_DEVICE
-  bool valid() const {
-
-    TensorCoord coord = at();
-
-    return coord.n() < problem_size_.N &&
-      coord.d() >= 0 && coord.d() < problem_size_.D &&
-      coord.h() >= 0 && coord.h() < problem_size_.H &&
-      coord.w() >= 0 && coord.w() < problem_size_.W &&
-      coord.c() < problem_size_.C;
-  }
-
-  /// Returns a pointer to the vector starting at the current coordinate
-  CUTLASS_HOST_DEVICE
-  AccessType const *get() const {
-
-    TensorCoord coord = at();
-    LongIndex offset = params_.layout(coord);
-    
-    AccessType const *ptr = reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
-
-    return ptr;
-  }
-
-  /// Increments to the next memory access
-  CUTLASS_HOST_DEVICE
-  Conv3dFpropActivationTileAccessIteratorAnalytic &operator++() {
-    ++iteration_contiguous_;
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
-      return *this;
-    }
-    iteration_contiguous_ = 0;
-
-    ++iteration_strided_;
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      return *this;
-    }
-    iteration_strided_ = 0;
- 
-    return *this;
-  }
-
-  /// Determines whether the Implicit GEMM can execute the given problem.
-  CUTLASS_HOST_DEVICE
-  static Status can_implement(ConvProblemSize const &problem_size) {
-
-    // check alignment constraint on iterator's contiguous dimension
-    if (problem_size.C % AccessType::kElements) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    return Status::kSuccess;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h
deleted file mode 100755
index 78b270eb9..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h
+++ /dev/null
@@ -1,478 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing loading of convolution tiles mapped to GEMM A (activation tile) 
-    matrix from memory.
-
-    This iterator assumes TensorNDHWC layout of tensors in Global Memory.
-    
-    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
-    backward data gradient (Dgrad), and backward weight gradient (Wgrad).
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv3d_problem_size.h"
-#include "cutlass/conv/threadblock/conv3d_params.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Shape_,
-  typename Element_,
-  typename Layout_,
-  typename ThreadMap_
->
-class Conv3dFpropActivationTileAccessIteratorOptimized {
-public:
-
-  //
-  // Types
-  //
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = Layout_;
-  using TensorCoord = typename Layout::TensorCoord;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
-  using TensorRef = cutlass::TensorRef<Element, Layout>;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized;
-  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
-  static int const kConvDim = 3;
-  using ConvProblemSize = typename conv::Conv3dProblemSize;
-  static int const kAccessesPerVector = 1;  
-  using Mask = uint64_t;
-
-  //
-  // Simplifying assertions
-  //
-  static_assert(ThreadMap::Iterations::kContiguous == 1,
-    "Require Iterations::kContiguous == 1");
-
-  //
-  // Parameters structure
-  //
-
-  using Params = Conv3dFpropActivationIteratorOptimizedParams<Layout>;
-
-private:
-
-  Conv3dFpropActivationIteratorOptimizedParams<Layout> const &params_;
-  Conv3dProblemSize const &problem_size_;
-  LongIndex iteration_contiguous_;
-  LongIndex iteration_strided_;
-
-  // One pointer per access
-  char const *pointer_[ThreadMap::Iterations::kStrided];
-
-  // current filter position (t, r, s)
-  int filter_t_;
-  int filter_r_;
-  int filter_s_;
-  int filter_c_;
-
-  // mask for t, r, and s
-  Index masks_[ThreadMap::Iterations::kStrided][3];
-
-public:
-
-  CUTLASS_HOST_DEVICE
-  Conv3dFpropActivationTileAccessIteratorOptimized(
-    Conv3dFpropActivationIteratorOptimizedParams<Layout> const &params,
-    Conv3dProblemSize const &problem_size,
-    Element const *ptr,
-    int thread_idx,
-    MatrixCoord const &threadblock_offset = MatrixCoord()       // tile index - units are threadblock-scoped tiles
-  ) :    
-  params_(params), 
-  problem_size_(problem_size),
-  filter_t_(0), 
-  filter_r_(0), 
-  filter_s_(0),
-  filter_c_(0) {
-
-    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
-
-    filter_c_ = threadblock_offset.column() + thread_coord.contiguous();
-
-    int offset_n[ThreadMap::Iterations::kStrided];
-    int offset_z[ThreadMap::Iterations::kStrided];
-    int offset_p[ThreadMap::Iterations::kStrided];
-    int offset_q[ThreadMap::Iterations::kStrided];
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-
-      pointer_[s] = reinterpret_cast<char const *>(ptr);
- 
-      int offset_nzpq = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
-
-      // The subseqnet fast_divmod() operations are equivalent to the following logical computation:
-      //
-      //
-      //  offset_n[s] = offset_nzpq / (problem_size_.Z * problem_size_.P * problem_size_.Q);
-      //  int residual = offset_nzpq % (problem_size_.Z * problem_size_.P * problem_size_.Q);
-      //
-      //  offset_z[s] = residual / (problem_size_.P * problem_size_.Q);
-      //  residual = residual % (problem_size_.P * problem_size_.Q);
-      //
-      //  offset_p[s] = residual / problem_size_.Q;
-      //  offset_q[s] = residual % problem_size_.Q;
-      //
-
-      int residual;
-
-      // input: (nzpq offset) output: (n offset and resudial (zpq offset))
-      params.zpq_divmod(offset_n[s], residual, offset_nzpq);
-      // input: (zpq offset) output: (z offset and resudial (pq))
-      params.pq_divmod(offset_z[s], residual, residual);
-      // input: (pq offset) output: (p offset and resudial (q offset))
-      params.q_divmod(offset_p[s], offset_q[s], residual);
-
-      TensorCoord coord = at_(offset_n[s], offset_z[s], offset_p[s], offset_q[s], 0, 0, 0);
-
-      pointer_[s] += params_.layout(coord) * sizeof_bits<Element>::value / 8;
-    }
-
-    clear_mask();
-
-    // mask predicates for filter position T
-    CUTLASS_PRAGMA_NO_UNROLL
-    for (int t = 0; t < problem_size_.T; ++t) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) {
-
-        int t_ = t;
-        if (problem_size_.mode == Mode::kConvolution) {
-          t_ = problem_size_.T - 1 - t;
-        }
-
-        int d = offset_z[s_idx] * problem_size_.stride_d - problem_size_.pad_d + t_ * problem_size_.dilation_d;
-
-        bool pred = (offset_n[s_idx] < problem_size_.N && d >= 0 && d < problem_size_.D);
-        masks_[s_idx][0] |= (pred << t);
-      }
-    }   
-
-    // mask predicates for filter position R
-    CUTLASS_PRAGMA_NO_UNROLL
-    for (int r = 0; r < problem_size_.R; ++r) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) {
-
-        int r_ = r;
-        if (problem_size_.mode == Mode::kConvolution) {
-          r_ = problem_size_.R - 1 - r;
-        }
-
-        int h = offset_p[s_idx] * problem_size_.stride_h - problem_size_.pad_h + r_ * problem_size_.dilation_h;
-
-        bool pred = (h >= 0 && h < problem_size_.H);
-        masks_[s_idx][1] |= (pred << r);
-      }
-    }  
-
-    // mask predicates for filter position S
-    CUTLASS_PRAGMA_NO_UNROLL
-    for (int s = 0; s < problem_size_.S; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) {
-
-        int s_ = s;
-        if (problem_size_.mode == Mode::kConvolution) {
-          s_ = problem_size_.S - 1 - s;
-        }
-
-        int w = offset_q[s_idx] * problem_size_.stride_w - problem_size_.pad_w + s_ * problem_size_.dilation_w;
-
-        bool pred = (w >= 0 && w < problem_size_.W);
-        masks_[s_idx][2] |= (pred << s);
-      }
-    }
-
-    if (filter_c_ >= problem_size.C) {
-      clear_mask();
-    }
-
-    set_iteration_index(0);
-  }
-
-  CUTLASS_HOST_DEVICE
-  static Params getParams(Conv3dProblemSize const &problem_size, Layout const &layout) {
-    return Params(problem_size,
-                  layout,
-                  sizeof_bits<Element>::value,
-                  {Shape::kRow, Shape::kColumn},
-                  ThreadMap::kThreads,
-                  ThreadMap::kElementsPerAccess,
-                  {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided},
-                  {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided});
-  }
-
-private:
-
-  /// Returns the coordinate in the activations tensor X that is correspoinding to 
-  // output nzpq and filter position t, r, s
-  CUTLASS_HOST_DEVICE
-  TensorCoord at_(int n, int z, int p, int q, int t, int r, int s) const {
-
-    if (problem_size_.mode == Mode::kConvolution) {
-      t = problem_size_.T - 1 - t;
-      r = problem_size_.R - 1 - r;
-      s = problem_size_.S - 1 - s;
-    }
-
-    int d = z * problem_size_.stride_d - problem_size_.pad_d + t * problem_size_.dilation_d;
-    int h = p * problem_size_.stride_h - problem_size_.pad_h + r * problem_size_.dilation_h;
-    int w = q * problem_size_.stride_w - problem_size_.pad_w + s * problem_size_.dilation_w;
-
-    return TensorCoord(n, d, h, w, filter_c_);
-  }
-
-  /// Adds a pointer offset in units of element
-  CUTLASS_HOST_DEVICE
-  void add_byte_offset_(LongIndex byte_offset) {
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      pointer_[s] += byte_offset;
-    }
-  }
-
-
-  /// Clears the predicates
-  CUTLASS_HOST_DEVICE
-  void clear_mask_(bool clear) {
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-
-      // We are using inline PTX assembly here to avoid an CUDA C++ compilation
-      // artifact in which control flow instructions are generated. Instead, our
-      // intent is to predicate the mov instructions.
-      #if defined(__CUDA_ARCH__)
-      asm volatile(
-          "{\n"
-          "  .reg .pred p;\n"
-          "  .reg .u32  m;"
-          "  mov.u32 m, %2;"
-          "  setp.ne.b32 p, %1, 0;\n"
-          "  @p mov.u32 m, 0;\n"
-          "  mov.u32 %0, m;\n"
-          "}\n" 
-        :
-          "=r"(masks_[s][0])
-       : 
-          "r"((int)clear),
-          "r"(masks_[s][0])
-      );
-      asm volatile(
-          "{\n"
-          "  .reg .pred p;\n"
-          "  .reg .u32  m;"
-          "  mov.u32 m, %2;"
-          "  setp.ne.b32 p, %1, 0;\n"
-          "  @p mov.u32 m, 0;\n"
-          "  mov.u32 %0, m;\n"
-          "}\n" 
-        :
-          "=r"(masks_[s][1])
-       : 
-          "r"((int)clear),
-          "r"(masks_[s][1])
-      );
-      asm volatile(
-          "{\n"
-          "  .reg .pred p;\n"
-          "  .reg .u32  m;"
-          "  mov.u32 m, %2;"
-          "  setp.ne.b32 p, %1, 0;\n"
-          "  @p mov.u32 m, 0;\n"
-          "  mov.u32 %0, m;\n"
-          "}\n" 
-        :
-          "=r"(masks_[s][2])
-       : 
-          "r"((int)clear),
-          "r"(masks_[s][2])
-      );
-      #else
-        if (clear) {
-          masks_[s][0] = 0;
-          masks_[s][1] = 0;
-          masks_[s][2] = 0;
-        }
-      #endif
-    }
-  }
-
-public:
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(Index index) {
-    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    add_byte_offset_(pointer_offset * sizeof_bits<Element>::value / 8);
-  }
-
-  CUTLASS_HOST_DEVICE
-  void advance() { 
-
-    int next_idx = 0;
- 
-    // moves to the next tile
-    ++filter_s_;
-    if (filter_s_ == problem_size_.S) {
-      
-      filter_s_ = 0;
-      ++filter_r_;
-      next_idx = 1;
-
-      if (filter_r_ == problem_size_.R) {
-        filter_r_ = 0;
-        ++filter_t_;
-
-        if (filter_t_ < problem_size_.T) {
-          next_idx = 2;
-        } 
-        else {
-          filter_t_ = 0;
-          next_idx = 3;
-        } 
-      }
-    }
-
-    add_byte_offset_(params_.inc_next[next_idx]);
-      
-    if (next_idx == 3) {  
-      filter_c_ += params_.filter_c_delta;
-    }
-
-    clear_mask_(filter_c_ >= problem_size_.C);
-  }
-
-  /// Clears the predicates
-  CUTLASS_HOST_DEVICE
-  void clear_mask() {
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      masks_[s][0] = Mask(0);
-      masks_[s][1] = Mask(0);
-      masks_[s][2] = Mask(0);
-    }
-  }
-
-  CUTLASS_HOST_DEVICE
-  bool valid() {
-
-    return 
-      (masks_[iteration_strided_][0] & (Index(1) << filter_t_)) &&
-      (masks_[iteration_strided_][1] & (Index(1) << filter_r_)) &&
-      (masks_[iteration_strided_][2] & (Index(1) << filter_s_));
-  }
-
-  /// Returns a pointer to the vector starting at the current coordinate
-  CUTLASS_HOST_DEVICE
-  AccessType const *get() const {
-
-    return reinterpret_cast<AccessType const *>(pointer_[iteration_strided_]);
-  }
-
-  /// Increments to the next memory access
-  CUTLASS_HOST_DEVICE
-  Conv3dFpropActivationTileAccessIteratorOptimized &operator++() {
-
-    ++iteration_contiguous_;
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
-      return *this;
-    }
-    iteration_contiguous_ = 0;
-
-    ++iteration_strided_;
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      return *this;
-    }
-    iteration_strided_ = 0;
- 
-    return *this;
-  }
-
-  /// Determines whether the Implicit GEMM can execute the given problem.
-  CUTLASS_HOST_DEVICE
-  static Status can_implement(Conv3dProblemSize const &problem_size) {
-
-    // check alignment constraint on iterator's contiguous dimension
-    if (problem_size.C % AccessType::kElements) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    // Conv3dFpropActivationTileAccessIteratorOptimized has constraint on filter positions 
-    // due to the number of mask bits.
-    if (problem_size.T > 32 || problem_size.R > 32 || problem_size.S > 32) {
-      return Status::kErrorNotSupported;
-    }
-    return Status::kSuccess;
-  }
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h
deleted file mode 100755
index 9f04adc40..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h
+++ /dev/null
@@ -1,259 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) 
-    matrix from memory.
-
-    This iterator assumes TensorNDHWC layout of tensors in Global Memory.
-
-    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
-    backward data gradient (Dgrad), and backward weight gradient (Wgrad).
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv3d_problem_size.h"
-#include "cutlass/conv/threadblock/conv3d_params.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Shape_,
-  typename Element_,
-  typename ThreadMap_,
-  bool IsDeconv_ = false
->
-class Conv3dFpropFilterTileAccessIteratorAnalytic {
-public:
-  
-  //
-  // Types
-  //
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::TensorNDHWC;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
-  using TensorRef = cutlass::TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  static bool const IsDeconv = IsDeconv_;
-  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic;
-  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
-  static int const kConvDim = 3;
-  using ConvProblemSize = typename conv::Conv3dProblemSize;
-  static int const kAccessesPerVector = 1;
-  
-  //
-  // Simplifying assertions
-  //
-  static_assert(ThreadMap::Iterations::kContiguous == 1,
-    "Require Iterations::kContiguous == 1");
-
-  //
-  // Parameters structure
-  //
-
-  using Params = Conv3dAnalyticParams<Layout>;
-
-private:
-
-  Params const &params_;
-  ConvProblemSize const &problem_size_;
-  LongIndex iteration_contiguous_;
-  LongIndex iteration_strided_;
-  char const *pointer_;
-
-  int filter_t_;
-  int filter_r_;
-  int filter_s_;
-  int filter_c_;
-
-  int offset_k_[ThreadMap::Iterations::kStrided];
-
-public:
-
-  CUTLASS_HOST_DEVICE
-  Conv3dFpropFilterTileAccessIteratorAnalytic(
-    Params const &params, 
-    ConvProblemSize const &problem_size,
-    Element const *ptr,
-    int thread_idx,
-    MatrixCoord const &threadblock_offset = MatrixCoord()
-  ):
-    params_(params), 
-    problem_size_(problem_size), 
-    pointer_(reinterpret_cast<char const *>(ptr)),
-    filter_t_(0),
-    filter_r_(0),
-    filter_s_(0),
-    filter_c_(0) {
-
-    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
-
-    filter_c_ = threadblock_offset.row() + thread_coord.contiguous();
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      offset_k_[s] = threadblock_offset.column() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
-    }
-
-    set_iteration_index(0);
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(Index index) {
-    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    pointer_ += pointer_offset * 8 / sizeof_bits<Element>::value;
-  }
-
-  CUTLASS_HOST_DEVICE
-  void advance() {
-    // moves to the next tile
-    ++filter_s_;
-    if (filter_s_ < problem_size_.S) {
-      return;
-    }
-    filter_s_ = 0;
-    
-    ++filter_r_;
-    if (filter_r_ < problem_size_.R) {
-      return;
-    }
-    filter_r_ = 0;
-
-    ++filter_t_;
-    if (filter_t_ < problem_size_.T) {
-      return;
-    }
-    filter_t_ = 0;
-
-    filter_c_ += Shape::kRow * problem_size_.split_k_slices;
-  }
-
-  /// Returns the coordinate in the filter tensor W that is currently pointed to
-  /// by the iterator.
-  CUTLASS_HOST_DEVICE
-  TensorCoord at() const {
-
-    int k = offset_k_[iteration_strided_];
-
-    return TensorCoord(k, filter_t_, filter_r_, filter_s_, filter_c_);
-  }
-
-  /// Returns true if the current coordinate is within the activations tensor W
-  CUTLASS_HOST_DEVICE
-  bool valid() const {
-
-    TensorCoord coord = at();
-
-    auto input_channels = (IsDeconv ? problem_size_.K : problem_size_.C);
-    auto output_channels = (IsDeconv ? problem_size_.C : problem_size_.K);
-
-    return coord.n() < output_channels &&
-      coord.c() < input_channels;
-  }
-
-  /// Returns a pointer to the vector starting at the current coordinate
-  CUTLASS_HOST_DEVICE
-  AccessType const *get() const {
-
-    TensorCoord coord = at();
-    LongIndex offset = params_.layout(coord);
-    
-    return reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
-  }
-
-  /// Increments to the next memory access
-  CUTLASS_HOST_DEVICE
-  Conv3dFpropFilterTileAccessIteratorAnalytic &operator++() {
-    ++iteration_contiguous_;
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
-      return *this;
-    }
-    iteration_contiguous_ = 0;
-    
-    ++iteration_strided_;
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      return *this;
-    }
-    iteration_strided_ = 0;
- 
-    return *this;
-  }
-
-  /// Determines whether the Implicit GEMM can execute the given problem.
-  CUTLASS_HOST_DEVICE
-  static Status can_implement(ConvProblemSize const &problem_size) {
-    auto input_channels = (IsDeconv ? problem_size.K : problem_size.C);
-    auto output_channels = (IsDeconv ? problem_size.C : problem_size.K);
-    // check alignment constraint on iterator's contiguous dimension
-    if (input_channels % AccessType::kElements) {
-      return Status::kErrorInvalidProblem;
-    }
-    return Status::kSuccess;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h
deleted file mode 100755
index efe34497f..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h
+++ /dev/null
@@ -1,279 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) 
-    matrix from memory.
-
-    This iterator assumes TensorNHWC or TensorCxRSKx<Interleave> layout of tensors in Global Memory.
-    
-    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
-    backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv3d_problem_size.h"
-
-#include "cutlass/conv/threadblock/conv3d_params.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Shape_,
-  typename Element_,
-  typename Layout_,
-  typename ThreadMap_,
-  bool IsDeconv_ = false
->
-class Conv3dFpropFilterTileAccessIteratorOptimized{
-public:
-  
-  //
-  // Types
-  //
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = Layout_;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
-  using TensorRef = cutlass::TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  static bool const IsDeconv = IsDeconv_;
-  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized;
-  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
-  static int const kConvDim = 3;
-  using ConvProblemSize = typename conv::Conv3dProblemSize;
-  static int const kAccessesPerVector = 1;
-  
-  //
-  // Simplifying assertions
-  //
-  static_assert(ThreadMap::Iterations::kContiguous == 1,
-    "Require Iterations::kContiguous == 1");
-
-  //
-  // Parameters structure
-  //
-
-  struct Params : Conv3dFpropFilterIteratorOptimizedParams<Layout> {
-
-    CUTLASS_HOST_DEVICE
-    Params() { }
-    
-    CUTLASS_HOST_DEVICE
-    Params(Conv3dFpropFilterIteratorOptimizedParams<Layout> const &base): 
-      Conv3dFpropFilterIteratorOptimizedParams<Layout>(base) { }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      Conv3dProblemSize const &problem_size,
-      Layout const &layout
-    ):
-      Conv3dFpropFilterIteratorOptimizedParams<Layout>(
-        problem_size,
-        layout,
-        sizeof_bits<Element>::value,
-        {Shape::kRow, Shape::kColumn},
-        ThreadMap::kThreads,
-        ThreadMap::kElementsPerAccess,
-        {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided},
-        {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided}
-      ) {
-
-    }
-  };
-
-private:
-
-  Conv3dFpropFilterIteratorOptimizedParams<Layout> const &params_;
-  Conv3dProblemSize const &problem_size_;
-  LongIndex iteration_contiguous_;
-  LongIndex iteration_strided_;
-  char const *pointer_;
-
-  uint32_t predicates_;
-  int filter_trs_;
-  int filter_c_;
-
-  //
-  // Assertions
-  //
-
-  // We map predicates into bits packed in this uint32_t container
-  static_assert(ThreadMap::Iterations::kStrided < sizeof(predicates_) * 8,
-    "Currently, the number of loads per iteration is limited by the size of the predicates container.");
-
-public:
-
-  CUTLASS_HOST_DEVICE
-  Conv3dFpropFilterTileAccessIteratorOptimized(
-    Conv3dFpropFilterIteratorOptimizedParams<Layout> const &params,
-    Conv3dProblemSize const &problem_size,
-    Element const *ptr,
-    int thread_idx,
-    MatrixCoord const &threadblock_offset = MatrixCoord()
-  ):
-    params_(params), 
-    problem_size_(problem_size),
-    pointer_(reinterpret_cast<char const *>(ptr)),
-    predicates_{0},
-    filter_trs_(0),
-    filter_c_(0) {
-
-    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
-
-    filter_c_ = threadblock_offset.row() + thread_coord.contiguous();
-    Index column = threadblock_offset.column() + thread_coord.strided();
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      uint32_t pred = ((column + s * ThreadMap::Delta::kStrided < (IsDeconv ? problem_size_.C : problem_size_.K)) ? 1u : 0);
-      predicates_ |= (pred << s);
-    }
-
-    if (filter_c_ >= (IsDeconv ? problem_size_.K : problem_size_.C)) {
-      predicates_ = 0u;
-    }
-
-    pointer_ += (
-      params_.layout({filter_c_, column}) 
-    ) * sizeof_bits<Element>::value / 8;
-
-    set_iteration_index(0);
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(Index index) {
-    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
-  }
-
-  CUTLASS_HOST_DEVICE
-  void advance() {
-
-    LongIndex next = params_.inc_next_trs;
-
-    // moves to the next tile
-    ++filter_trs_;
-    if (filter_trs_ == params_.TRS) {
-
-      filter_trs_ = 0;
-      next = params_.inc_next_c;
-      filter_c_ += params_.filter_c_delta;
-    }
-      
-    if (filter_c_ >= (IsDeconv ? problem_size_.K : problem_size_.C)) {
-      predicates_ = 0;
-    }
-      
-    pointer_ += next;
-  }
-
-  /// Returns true if the current coordinate is within the filter tensor W
-  CUTLASS_HOST_DEVICE
-  bool valid() {
-    return (predicates_ & (1u << iteration_strided_));
-  }
-
-  /// Returns a pointer to the vector starting at the current coordinate
-  CUTLASS_HOST_DEVICE
-  AccessType const *get() const {
-    return reinterpret_cast<AccessType const *>(pointer_);
-  }
-
-  /// Increments to the next memory access
-  CUTLASS_HOST_DEVICE
-  Conv3dFpropFilterTileAccessIteratorOptimized &operator++() {
-    ++iteration_contiguous_;
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
-      return *this;
-    }
-    iteration_contiguous_ = 0;
-    
-    ++iteration_strided_;
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-
-      // Move to the next K coordinate within the tile
-      pointer_ += params_.inc_next_k;
-
-      return *this;
-    }
-    iteration_strided_ = 0;
- 
-    return *this;
-  }
-
-  /// Determines whether the Implicit GEMM can execute the given problem.
-  CUTLASS_HOST_DEVICE
-  static Status can_implement(Conv3dProblemSize const &problem_size) {
-    auto input_channels = (IsDeconv ? problem_size.K : problem_size.C);
-
-    // check alignment constraint on iterator's contiguous dimension
-    if (input_channels % AccessType::kElements) {
-      return Status::kErrorInvalidProblem;
-    }
-    return Status::kSuccess;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_params.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_params.h
deleted file mode 100755
index ac422b8f0..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_params.h
+++ /dev/null
@@ -1,508 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! 
-  \file 
-  \brief Extracts the host-params objects into non-template code.
-*/
-
-#pragma once
-
-#define TRACE_CONV_PARAMS_INITIALIZERS_ENABLED 0
-
-#include "cutlass/cutlass.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/threadblock/conv2d_params.h"
-#include "cutlass/conv/conv3d_problem_size.h"
-
-#if TRACE_CONV_PARAMS_INITIALIZERS_ENABLED
-#include <fstream>
-#endif
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Params structure used for all Conv3d analytic tile iterators
-template< typename Layout_ = layout::TensorNDHWC >
-struct Conv3dAnalyticParams {
-
-  using Layout = Layout_;
-
-  Layout layout;
-
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  Conv3dAnalyticParams() { }
-
-  CUTLASS_HOST_DEVICE
-  Conv3dAnalyticParams(
-    Conv3dProblemSize const &,  // unused; placeholder to match other Params interfaces.
-    Layout const &layout
-  ): layout(layout) {
-
-  }
-};
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Parameters structure used for Conv3dFpropActivationTileIteratorOptimized
-template< typename Layout_ = layout::TensorNDHWC >
-struct Conv3dFpropActivationIteratorOptimizedParams;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Parameters structure used for Conv3dFpropActivationTileIteratorOptimized
-template<>
-struct Conv3dFpropActivationIteratorOptimizedParams<layout::TensorNDHWC> {
-  
-  using Layout = layout::TensorNDHWC;
-
-  Layout layout;
-
-  int64_t inc_next[4];    // {next S, next R, next T, next C}
-  int filter_c_delta;     // number of logical elements to add to filter_c_
-  int ZPQ;                // product of Z*P*Q
-  int PQ;                 // product of P*Q
-
-  FastDivmod zpq_divmod;
-  FastDivmod pq_divmod;
-  FastDivmod q_divmod;
-
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  Conv3dFpropActivationIteratorOptimizedParams() { }
-
-  CUTLASS_HOST_DEVICE
-  Conv3dFpropActivationIteratorOptimizedParams(
-    Conv3dProblemSize const &problem_size,
-    Layout const &layout,                             ///< layout object
-    int element_size_bits,                            ///< size of each element in bits
-    MatrixCoord threadblock_shape,
-    int thread_count,
-    int access_size,
-    layout::PitchLinearCoord threadmap_iterations,
-    layout::PitchLinearCoord threadmap_delta
-  ): 
-    layout(layout), 
-    PQ(problem_size.P * problem_size.Q),
-    ZPQ(problem_size.Z * problem_size.P * problem_size.Q),  
-    zpq_divmod(ZPQ),
-    pq_divmod(PQ), 
-    q_divmod(problem_size.Q) {
-
-    TRACE_CONV_INITIALIZERS("conv3d_fprop", "activation", 
-      element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);
-  
-
-    int conv_sign = (problem_size.mode == Mode::kConvolution ? -1 : 1);
-
-    // next S
-    inc_next[0] = conv_sign * (
-      int64_t(layout.stride()[0]) * problem_size.dilation_w
-    ) * element_size_bits / 8;
-
-    // next R
-    inc_next[1] = conv_sign * (
-        int64_t(layout.stride()[1]) * problem_size.dilation_h
-        - (problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w
-      ) * element_size_bits / 8;
-
-    // next T
-    inc_next[2] = conv_sign * (
-      int64_t(layout.stride()[2]) * problem_size.dilation_d
-      - (problem_size.R - 1) * layout.stride()[1] * problem_size.dilation_h
-      - (problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w
-      ) * element_size_bits / 8;
-
-    // next C
-    inc_next[3] = (
-        threadblock_shape.column() * problem_size.split_k_slices
-        - conv_sign * int64_t(problem_size.T - 1) * layout.stride()[2] * problem_size.dilation_d
-        - conv_sign * int64_t(problem_size.R - 1) * layout.stride()[1] * problem_size.dilation_h
-        - conv_sign * int64_t(problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w
-      ) * element_size_bits / 8;
-
-    // logical offset added to internal channel counter - units are elements, not bytes
-    filter_c_delta = threadblock_shape.column() * problem_size.split_k_slices;
-  }
-};
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-template< typename Layout_ = layout::TensorNDHWC >
-struct Conv3dFpropFilterIteratorOptimizedParams;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template<>
-struct Conv3dFpropFilterIteratorOptimizedParams<layout::TensorNDHWC>
-{
-
-  using Layout = layout::TensorNDHWC;
-
-  Layout layout;
-  int TRS;
-  int filter_c_delta;
-
-  int64_t inc_next_k;         // offset in units of bytes to next K position
-  int64_t inc_next_trs;        // offset in units of bytes to next TRS position
-  int64_t inc_next_c;         // offset in units of bytes to next C position
-
-  //
-  // Methods
-  //
-  CUTLASS_HOST_DEVICE
-  Conv3dFpropFilterIteratorOptimizedParams() { }
-
-  CUTLASS_HOST_DEVICE
-  Conv3dFpropFilterIteratorOptimizedParams(
-    Conv3dProblemSize const &problem_size,
-    Layout const &layout,
-    int element_size_bits,                        ///< size of each element in bits
-    MatrixCoord threadblock_shape,
-    int thread_count,
-    int access_size,
-    layout::PitchLinearCoord threadmap_iterations,
-    layout::PitchLinearCoord threadmap_delta
-  ): 
-    layout(layout) {
-    
-    TRACE_CONV_INITIALIZERS("conv3d_fprop", "filter", 
-      element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);
-
-    TRS = problem_size.T * problem_size.R * problem_size.S;
-
-    inc_next_k = (int64_t(layout.stride()[3]) * threadmap_delta.strided() * element_size_bits) / 8;
-
-    inc_next_trs =
-      ( int64_t(layout.stride()[0])
-        - int64_t(layout.stride()[3]) * (threadmap_iterations.strided() - 1) * threadmap_delta.strided()
-      ) * element_size_bits / 8;
-
-    inc_next_c =
-      (
-        threadblock_shape.row() * problem_size.split_k_slices
-        - int64_t(TRS - 1) * layout.stride()[0]
-        - int64_t(threadmap_iterations.strided() - 1) * threadmap_delta.strided() * layout.stride()[3]
-      ) * element_size_bits / 8;
-
-    filter_c_delta = threadblock_shape.row() * problem_size.split_k_slices;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Parameters object for Conv3d DGRAD OutputGradient (dy) iterator
-struct Conv3dDgradOutputGradientIteratorOptimizedParams {
-
-  using Layout = layout::TensorNDHWC;
-
-  Layout layout;
-
-  int64_t inc_next[4];    // {next S, next R, next T, next K}
-  int filter_k_delta;     // number of logical elements to add to filter_k_
-
-  FastDivmod dhw_divmod;
-  FastDivmod hw_divmod;
-  FastDivmod w_divmod;
-
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  Conv3dDgradOutputGradientIteratorOptimizedParams() { }
-
-  CUTLASS_HOST_DEVICE
-  Conv3dDgradOutputGradientIteratorOptimizedParams(
-    Conv3dProblemSize const &problem_size,
-    Layout const &layout,                             ///< layout object
-    int element_size_bits,                            ///< size of each element in bits
-    MatrixCoord threadblock_shape,
-    int thread_count,
-    int access_size,
-    layout::PitchLinearCoord threadmap_iterations,
-    layout::PitchLinearCoord threadmap_delta
-  ): 
-    layout(layout), 
-    dhw_divmod(problem_size.D * problem_size.H * problem_size.W),
-    hw_divmod(problem_size.H * problem_size.W), 
-    w_divmod(problem_size.W) {
-
-    TRACE_CONV_INITIALIZERS("conv3d_dgrad", "output_gradient", 
-      element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);
-
-    int conv_sign = (problem_size.mode == Mode::kConvolution ? 1 : -1);
-
-    // next S
-    inc_next[0] = conv_sign * (
-      int64_t(layout.stride()[0]) * problem_size.dilation_w
-    ) * element_size_bits / 8;
-
-    // next R
-    inc_next[1] = conv_sign * (
-        int64_t(layout.stride()[1]) * problem_size.dilation_h
-        - (problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w
-      ) * element_size_bits / 8;
-
-    // next T
-    inc_next[2] = conv_sign * (
-      int64_t(layout.stride()[2]) * problem_size.dilation_d
-      - (problem_size.R - 1) * layout.stride()[1] * problem_size.dilation_h
-      - (problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w
-      ) * element_size_bits / 8;
-
-    // next K
-    inc_next[3] = (
-        threadblock_shape.column() * problem_size.split_k_slices
-        - conv_sign * int64_t(problem_size.T - 1) * layout.stride()[2] * problem_size.dilation_d
-        - conv_sign * int64_t(problem_size.R - 1) * layout.stride()[1] * problem_size.dilation_h
-        - conv_sign * int64_t(problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w
-      ) * element_size_bits / 8;
-
-    // logical offset added to internal channel counter - units are elements, not bytes
-    filter_k_delta = threadblock_shape.column() * problem_size.split_k_slices;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Parameters object for Conv2d DGRAD Filter (w) iterator
-struct Conv3dDgradFilterIteratorOptimizedParams {
-
-  using Layout = layout::TensorNDHWC;
-
-  Layout layout;
-  int TRS;
-  int filter_k_delta;
-
-  int64_t inc_next_strided;   // offset in units of bytes to next K coordinate within tile
-  int64_t inc_next_trs;       // offset in units of bytes to next TRS position
-  int64_t inc_next_k;         // offset in units of bytes to next K position in subsequent tile
-
-  //
-  // Methods
-  //
-  CUTLASS_HOST_DEVICE
-  Conv3dDgradFilterIteratorOptimizedParams() { }
-
-  CUTLASS_HOST_DEVICE
-  Conv3dDgradFilterIteratorOptimizedParams(
-    Conv3dProblemSize const &problem_size,
-    Layout const &layout,    
-    int element_size_bits,                        ///< size of each element in bits
-    MatrixCoord threadblock_shape,
-    int thread_count,
-    int access_size, 
-    layout::PitchLinearCoord threadmap_iterations,
-    layout::PitchLinearCoord threadmap_delta
-  ): 
-    layout(layout), TRS(problem_size.T * problem_size.R * problem_size.S) {
-
-    TRACE_CONV_INITIALIZERS("conv3d_dgrad", "filter", 
-      element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);
-
-    inc_next_strided = ((int64_t)layout.stride()[3] * threadmap_delta.strided() * element_size_bits) / 8;
-
-    inc_next_trs =
-      ( (int64_t)layout.stride()[0]
-        - (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * (int64_t)layout.stride()[3]
-      ) * element_size_bits / 8;
-
-    inc_next_k =
-      (
-        threadblock_shape.row() * problem_size.split_k_slices * (int64_t)layout.stride()[3]
-        - (problem_size.T * problem_size.R * problem_size.S - 1) * (int64_t)layout.stride()[0]
-        - (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * (int64_t)layout.stride()[3]
-      ) * element_size_bits / 8;
-
-    filter_k_delta = threadblock_shape.row() * problem_size.split_k_slices;
-  }
-};
-
-/// Parameters object for Conv3d WGRAD OutputGradient iterator
-struct Conv3dWgradOutputGradientIteratorOptimizedParams {
-
-  using Layout = layout::TensorNDHWC;
-  using LongIndex = typename Layout::LongIndex;
-
-  Layout layout;
-
-  int NZPQ;                // precomputd product of N*Z*P*Q for clearing predicates
-  int ZPQ;                 // product of Z*P*Q
-  unsigned zpq_mul;        // precomputed quantities for fast computation of div/% by ZPQ
-  unsigned zpq_shr;        //    in device code.
-
-  int PQ;                  // product of P*Q
-  unsigned pq_mul;         // precomputed quantities for fast computation of div/% by PQ
-  unsigned pq_shr;         //    in device code.
-
-  unsigned q_mul;          // precomputed quantities for fast computation of div/% by Q
-  unsigned q_shr;          //    in device code.
-
-  LongIndex offset_next_strided;     // offset in units of bytes to next nzpq coordinate within tile
-  LongIndex offset_next_contiguous;  // offset in units of bytes to next k coordinate within tile
-  LongIndex inc_next_nzpq;           // offset in units of bytes to next nzpq position in subsequent tile
-
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  Conv3dWgradOutputGradientIteratorOptimizedParams() { }
-
-  CUTLASS_HOST_DEVICE
-  Conv3dWgradOutputGradientIteratorOptimizedParams(
-    Conv3dProblemSize const &problem_size,
-    Layout const &layout,    
-    int element_size_bits,
-    MatrixCoord threadblock_shape,
-    int thread_count,
-    int access_size, 
-    layout::PitchLinearCoord threadmap_iterations,
-    layout::PitchLinearCoord threadmap_delta
-  ): layout(layout) {
-
-  TRACE_CONV_INITIALIZERS("conv3d_wgrad", "output_gradient", 
-    element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);
-
-  // Incremental offsets in unites of bytes (number of elements) * element_size_bits / 8
-  offset_next_strided = (threadmap_delta.strided() * (int64_t)layout.stride()[0])
-                      * element_size_bits / 8;
-
-  offset_next_contiguous = (threadmap_delta.contiguous()) 
-                          * element_size_bits / 8;
-
-  inc_next_nzpq = (threadblock_shape.column() * problem_size.split_k_slices * (int64_t)layout.stride()[0])
-                    * element_size_bits / 8;
-
-  // Precompute several quantities for fast modulo arithmetic.
-  NZPQ = problem_size.N * problem_size.Z * problem_size.P * problem_size.Q;
-  ZPQ = problem_size.Z * problem_size.P * problem_size.Q;
-  find_divisor(zpq_mul, zpq_shr, ZPQ);
-
-  PQ = problem_size.P * problem_size.Q;
-  find_divisor(pq_mul, pq_shr, PQ);
-
-  find_divisor(q_mul, q_shr, problem_size.Q);
-
-  }
-};
-
-/// Parameters object for Conv3d WGRAD Activation Tile Access Iterator
-struct Conv3dWgradActivationIteratorOptimizedParams {
-
-  using Layout = layout::TensorNDHWC;
-
-  Layout layout;
-
-  int RSC;                 // product of R*S*C
-  unsigned rsc_mul;        // precomputed quantities for fast computation of div/% by RSC
-  unsigned rsc_shr;        //    in device code.
-
-  int SC;                  // product of S*C
-  unsigned sc_mul;         // precomputed quantities for fast computation of div/% by SC
-  unsigned sc_shr;         //    in device code.
-
-  unsigned c_mul;          // precomputed quantities for fast computation of div/% by C
-  unsigned c_shr;          //    in device code.
-
-  int ZPQ;                 // product of Z*P*Q
-  unsigned zpq_mul;        // precomputed quantities for fast computation of div/% by ZPQ
-  unsigned zpq_shr;        //    in device code.
-
-  int PQ;                  // product of P*Q
-  unsigned pq_mul;         // precomputed quantities for fast computation of div/% by PQ
-  unsigned pq_shr;         //    in device code.
-
-  unsigned q_mul;          // precomputed quantities for fast computation of div/% by Q
-  unsigned q_shr;          //    in device code.
-
-  //
-  // Methods
-  //
-  CUTLASS_HOST_DEVICE
-  Conv3dWgradActivationIteratorOptimizedParams() { }
-
-  CUTLASS_HOST_DEVICE
-  Conv3dWgradActivationIteratorOptimizedParams(
-    Conv3dProblemSize const &problem_size,
-    Layout const &layout,    
-    int element_size_bits,
-    MatrixCoord threadblock_shape,
-    int thread_count,
-    int access_size, 
-    layout::PitchLinearCoord threadmap_iterations,
-    layout::PitchLinearCoord threadmap_delta
-  ): layout(layout) {
-
-  TRACE_CONV_INITIALIZERS("conv3d_wgrad", "activation", 
-    element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);
-
-  // Precompute several quantities for fast modulo arithmetic.
-  RSC = problem_size.R * problem_size.S * problem_size.C;
-  find_divisor(rsc_mul, rsc_shr, RSC);
-
-  SC = problem_size.S * problem_size.C;
-  find_divisor(sc_mul, sc_shr, SC);
-      
-  find_divisor(c_mul, c_shr, problem_size.C);
-
-  ZPQ = problem_size.Z * problem_size.P * problem_size.Q;
-  find_divisor(zpq_mul, zpq_shr, ZPQ);
-
-  PQ = problem_size.P * problem_size.Q;
-  find_divisor(pq_mul, pq_shr, PQ);
-
-  find_divisor(q_mul, q_shr, problem_size.Q);
-
-  }
-};
-
-} // namespace threadblock
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h
deleted file mode 100755
index cc8faea70..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h
+++ /dev/null
@@ -1,289 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing loading of convolution tiles mapped to GEMM B (activation tile) 
-    matrix from memory.
-
-    This iterator assumes TensorNDHWC layout of tensors in Global Memory.
-
-    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
-    backward data gradient (Dgrad), and backward weight gradient (Wgrad).
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv3d_problem_size.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Shape_,
-  typename Element_,
-  typename ThreadMap_
->
-class Conv3dWgradActivationTileAccessIteratorAnalytic {
-public:
-
-  //
-  // Types
-  //
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::TensorNDHWC;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
-  using TensorRef = cutlass::TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic;
-  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
-  static int const kConvDim = 3;
-  using ConvProblemSize = typename conv::Conv3dProblemSize;
-  
-  static int const kAccessesPerVector = 1;
- 
-  static_assert(sizeof_bits<Element>::value >= 8,
-    "WGRAD requires elements of size 8b or greater.");
-
-  //
-  // Parameters structure
-  //
-
-  struct Params {
-
-    Layout layout;
-
-    //
-    // Methods
-    //
-    CUTLASS_HOST_DEVICE
-    Params() { }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      Conv3dProblemSize const &problem_size, 
-      Layout const &layout
-    ): layout(layout) {
-
-    }
-  };
-
-private:
-
-  Params const &params_;
-  Conv3dProblemSize const &problem_size_;
-  LongIndex iteration_contiguous_;
-  LongIndex iteration_strided_;
-  char const *pointer_;
-
-  // Filter postion (t,r,s,c) in contiguous dimension stays constant for each gemm_iteration_k
-  int filter_t_[ThreadMap::Iterations::kContiguous];
-  int filter_r_[ThreadMap::Iterations::kContiguous];
-  int filter_s_[ThreadMap::Iterations::kContiguous];
-  int filter_c_[ThreadMap::Iterations::kContiguous];
-
-  int offset_nzpq_[ThreadMap::Iterations::kStrided];
-
-public:
-
-  CUTLASS_HOST_DEVICE
-  Conv3dWgradActivationTileAccessIteratorAnalytic(
-    Params const &params, 
-    Conv3dProblemSize const &problem_size,
-    Element const *ptr,
-    int thread_idx,
-    MatrixCoord const &threadblock_offset = MatrixCoord()
-  ):
-    params_(params), 
-    problem_size_(problem_size), 
-    pointer_(reinterpret_cast<char const *>(ptr)) {
-
-    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
-    
-    // initialize t,r,s,c filter position for every contiguous iteration
-    CUTLASS_PRAGMA_UNROLL
-    for(int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-
-      int trsc_offset = threadblock_offset.column() + thread_coord.contiguous()
-                        + c * ThreadMap::Delta::kContiguous;
-
-      filter_t_[c] = trsc_offset / (problem_size_.R * problem_size_.S * problem_size_.C);
-      int residual = trsc_offset % (problem_size_.R * problem_size_.S * problem_size_.C);
-
-      filter_r_[c] = residual / (problem_size_.S * problem_size_.C);
-      residual = residual % (problem_size_.S * problem_size_.C);
-
-      filter_s_[c] = residual / problem_size_.C;
-      filter_c_[c] = residual % problem_size_.C;
-
-    }
-
-    // initialize n, z, p, q offset for every strided iteration
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-    
-      offset_nzpq_[s] = threadblock_offset.row() + thread_coord.strided() 
-                      + s * ThreadMap::Delta::kStrided;   
-    }
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(Index index) {
-    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
-  }
-
-  CUTLASS_HOST_DEVICE
-  void advance() {
-    
-    // moves to the next GEMM-K offset (offset_nzpq_) in GEMM-B by a CTA-K tile
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      offset_nzpq_[s] += Shape::kRow * problem_size_.split_k_slices;
-    }
-  }
-
-  /// Returns the coordinate in the activation tensor x that is currently pointed to
-  /// by the iterator.
-  CUTLASS_HOST_DEVICE
-  TensorCoord at() const {
-
-    int t = filter_t_[iteration_contiguous_];
-    int r = filter_r_[iteration_contiguous_];
-    int s = filter_s_[iteration_contiguous_];
-
-    if (problem_size_.mode == Mode::kConvolution) {
-      t = (problem_size_.T - 1 - t);
-      r = (problem_size_.R - 1 - r);
-      s = (problem_size_.S - 1 - s);
-    }
-
-    int n = offset_nzpq_[iteration_strided_] / (problem_size_.Z * problem_size_.P * problem_size_.Q);
-    int residual = offset_nzpq_[iteration_strided_] % (problem_size_.Z * problem_size_.P * problem_size_.Q);
-
-    int z = residual / (problem_size_.P * problem_size_.Q);
-    residual = residual % (problem_size_.P * problem_size_.Q);
-
-    int p = residual / problem_size_.Q;
-    int q = residual % problem_size_.Q;
- 
-    int d = z * problem_size_.stride_d - problem_size_.pad_d + t * problem_size_.dilation_d;
-    int h = p * problem_size_.stride_h - problem_size_.pad_h + r * problem_size_.dilation_h;
-    int w = q * problem_size_.stride_w - problem_size_.pad_w + s * problem_size_.dilation_w;
-
-    return TensorCoord(n, d, h, w, filter_c_[iteration_contiguous_]);
-  }
-
-  /// Returns true if the current coordinate is within the activation tensor x
-  CUTLASS_HOST_DEVICE
-  bool valid() const {
-    TensorCoord coord = at();
-
-    return coord.n() < problem_size_.N &&
-      coord.d() >= 0 && coord.d() < problem_size_.D &&
-      coord.h() >= 0 && coord.h() < problem_size_.H &&
-      coord.w() >= 0 && coord.w() < problem_size_.W &&
-      coord.c() < problem_size_.C;
-  }
-
-  /// Returns a pointer to the vector starting at the current coordinate
-  CUTLASS_DEVICE
-  AccessType const *get() const {
-
-    TensorCoord coord = at();
-    LongIndex offset = params_.layout(coord);
-
-    return reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
-  }
-
-  /// Increments to the next memory access
-  CUTLASS_HOST_DEVICE
-  Conv3dWgradActivationTileAccessIteratorAnalytic &operator++() {
-    ++iteration_contiguous_;
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
-      return *this;
-    }
-    iteration_contiguous_ = 0;
-    ++iteration_strided_;
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      return *this;
-    }
-    iteration_strided_ = 0;
- 
-    return *this;
-  }
-
-  /// Determines whether the Implicit GEMM can execute the given problem.
-  CUTLASS_HOST_DEVICE
-  static Status can_implement(Conv3dProblemSize const &problem_size) {
-
-    // check alignment constraint on iterator's contiguous dimension
-    if (problem_size.C % AccessType::kElements) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    return Status::kSuccess;
-  }
-  
-};
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h
deleted file mode 100755
index 2b10d207f..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h
+++ /dev/null
@@ -1,319 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing loading of convolution tiles mapped to GEMM B (activation tile) 
-    matrix from memory.
-
-    This iterator assumes TensorNDHWC layout of tensors in Global Memory.
-
-    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
-    backward data gradient (Dgrad), and backward weight gradient (Wgrad).
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv3d_problem_size.h"
-#include "cutlass/conv/threadblock/conv3d_params.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Shape_,
-  typename Element_,
-  typename ThreadMap_
->
-class Conv3dWgradActivationTileAccessIteratorOptimized {
-public:
-
-  //
-  // Types
-  //
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::TensorNDHWC;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
-  using TensorRef = cutlass::TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized;
-  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
-  static int const kConvDim = 3;
-  using ConvProblemSize = typename conv::Conv3dProblemSize;
-  static int const kAccessesPerVector = 1;
-  static_assert(sizeof_bits<Element>::value >= 8,
-    "WGRAD requires elements of size 8b or greater.");
-
-  //
-  // Parameters structure
-  //
-
-  struct Params : Conv3dWgradActivationIteratorOptimizedParams {
-    //
-    // Methods
-    //
-    CUTLASS_HOST_DEVICE
-    Params() {}
-
-    CUTLASS_HOST_DEVICE
-    Params(Conv3dWgradActivationIteratorOptimizedParams const &base)
-          : Conv3dWgradActivationIteratorOptimizedParams(base) {}
-
-    CUTLASS_HOST_DEVICE
-    Params(Conv3dProblemSize const &problem_size, Layout const &layout)
-          : Conv3dWgradActivationIteratorOptimizedParams(
-          problem_size,
-          layout,
-          sizeof_bits<Element>::value,
-          {Shape::kRow, Shape::kColumn},
-          ThreadMap::kThreads,
-          ThreadMap::kElementsPerAccess,
-          {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided},
-          {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided}) {}
-  };
-
-private:
-
-  Params const &params_;
-  Conv3dProblemSize const &problem_size_;
-  LongIndex iteration_contiguous_;
-  LongIndex iteration_strided_;
-  char const *pointer_;
-
-  // Precomputed effective filter postion (t,r,s) in contiguous dimension stays constant for each gemm_iteration_k
-  // required for nzpq -> ndhw translation
-  int precomputed_filter_t_[ThreadMap::Iterations::kContiguous];
-  int precomputed_filter_r_[ThreadMap::Iterations::kContiguous];
-  int precomputed_filter_s_[ThreadMap::Iterations::kContiguous];
-
-  // Channel dimension in contiguous dimension stays constant for each gemm_iteration_k
-  int filter_c_[ThreadMap::Iterations::kContiguous];
-
-  int offset_nzpq_[ThreadMap::Iterations::kStrided];
-
-public:
-
-  CUTLASS_HOST_DEVICE
-  Conv3dWgradActivationTileAccessIteratorOptimized(
-    Params const &params, 
-    Conv3dProblemSize const &problem_size,
-    Element const *ptr,
-    int thread_idx,
-    MatrixCoord const &threadblock_offset = MatrixCoord()
-  ):
-    params_(params), 
-    problem_size_(problem_size), 
-    pointer_(reinterpret_cast<char const *>(ptr)) {
-
-    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
-    
-    // initialize t,r,s,c filter position for every contiguous iteration
-    CUTLASS_PRAGMA_UNROLL
-    for(int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-
-      int trsc_offset = threadblock_offset.column() + thread_coord.contiguous()
-                        + c * ThreadMap::Delta::kContiguous;
-
-      // The subseqnet fast_divmod() operations are equivalent to the following logical computation:
-      //
-      // 
-      // filter_t_[c] = trsc_offset / (problem_size_.R * problem_size_.S * problem_size_.C);
-      // int residual = trsc_offset % (problem_size_.R * problem_size_.S * problem_size_.C);
-      //
-      // filter_r_[c] = residual / (problem_size_.S * problem_size_.C);
-      // residual = residual % (problem_size_.S * problem_size_.C);
-      //
-      // filter_s_[c] = residual / problem_size_.C;
-      // filter_c_[c] = residual % problem_size_.C;
-
-      int residual;
-      fast_divmod(precomputed_filter_t_[c], residual, trsc_offset, params_.RSC, params_.rsc_mul, params_.rsc_shr);
-      fast_divmod(precomputed_filter_r_[c], residual, residual, params_.SC, params_.sc_mul, params_.sc_shr);
-      fast_divmod(precomputed_filter_s_[c], filter_c_[c], residual, problem_size_.C, params_.c_mul, params_.c_shr);
-
-      int t = precomputed_filter_t_[c];
-      int r = precomputed_filter_r_[c];
-      int s = precomputed_filter_s_[c];
-
-      if (problem_size_.mode == Mode::kConvolution) {
-        t = (problem_size_.T - 1 - t);
-        r = (problem_size_.R - 1 - r);
-        s = (problem_size_.S - 1 - s);
-      }
-      
-      // efective t,r,s for every contiguous dimension
-      precomputed_filter_t_[c] = - problem_size_.pad_d + t * problem_size_.dilation_d;
-      precomputed_filter_r_[c] = - problem_size_.pad_h + r * problem_size_.dilation_h;
-      precomputed_filter_s_[c] = - problem_size_.pad_w + s * problem_size_.dilation_w;
-
-
-    }
-
-    // initialize n, z, p, q offset for every strided iteration
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-    
-      offset_nzpq_[s] = threadblock_offset.row() + thread_coord.strided() 
-                      + s * ThreadMap::Delta::kStrided;   
-    }
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(Index index) {
-    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
-  }
-
-  CUTLASS_HOST_DEVICE
-  void advance() {
-    
-    // moves to the next GEMM-K offset (offset_nzpq_) in GEMM-B by a CTA-K tile
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      offset_nzpq_[s] += Shape::kRow * problem_size_.split_k_slices;
-    }
-  }
-
-  /// Returns the coordinate in the activation tensor x that is currently pointed to
-  /// by the iterator.
-
-  CUTLASS_HOST_DEVICE
-  TensorCoord at() const {
-
-    // The subseqnet fast_divmod() operations are equivalent to the following logical computation:
-    //
-    //
-    // int n = offset_nzpq_[iteration_strided_] / (problem_size_.Z * problem_size_.P * problem_size_.Q);
-    // int residual = offset_nzpq_[iteration_strided_] % (problem_size_.Z * problem_size_.P * problem_size_.Q);
-    // 
-    // int z = residual / (problem_size_.P * problem_size_.Q);
-    // residual = residual % (problem_size_.P * problem_size_.Q);
-    // 
-    // int p = residual / problem_size_.Q;
-    // int q = residual % problem_size_.Q;
-
-    int residual, n, z, p, q;
-    fast_divmod(n, residual, offset_nzpq_[iteration_strided_], params_.ZPQ, params_.zpq_mul, params_.zpq_shr);
-    fast_divmod(z, residual, residual, params_.PQ, params_.pq_mul, params_.pq_shr);
-    fast_divmod(p, q, residual, problem_size_.Q, params_.q_mul, params_.q_shr);
- 
-    int d = z * problem_size_.stride_d + precomputed_filter_t_[iteration_contiguous_];
-    int h = p * problem_size_.stride_h + precomputed_filter_r_[iteration_contiguous_];
-    int w = q * problem_size_.stride_w + precomputed_filter_s_[iteration_contiguous_];
-
-    return TensorCoord(n, d, h, w, filter_c_[iteration_contiguous_]);
-  }
-
-  /// Returns true if the current coordinate is within the activation tensor x
-  CUTLASS_HOST_DEVICE
-  bool valid() const {
-    TensorCoord coord = at();
-
-    return coord.n() < problem_size_.N &&
-      coord.d() >= 0 && coord.d() < problem_size_.D &&
-      coord.h() >= 0 && coord.h() < problem_size_.H &&
-      coord.w() >= 0 && coord.w() < problem_size_.W &&
-      coord.c() < problem_size_.C;
-  }
-
-  /// Returns a pointer to the vector starting at the current coordinate
-  CUTLASS_DEVICE
-  AccessType const *get() const {
-
-    TensorCoord coord = at();
-    LongIndex offset = params_.layout(coord);
-
-    return reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
-  }
-
-  /// Increments to the next memory access
-  CUTLASS_HOST_DEVICE
-  Conv3dWgradActivationTileAccessIteratorOptimized &operator++() {
-    ++iteration_contiguous_;
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
-      return *this;
-    }
-    iteration_contiguous_ = 0;
-    ++iteration_strided_;
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      return *this;
-    }
-    iteration_strided_ = 0;
- 
-    return *this;
-  }
-
-  /// Determines whether the Implicit GEMM can execute the given problem.
-  CUTLASS_HOST_DEVICE
-  static Status can_implement(Conv3dProblemSize const &problem_size) {
-
-    // check alignment constraint on iterator's contiguous dimension
-    if (problem_size.C % AccessType::kElements) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    return Status::kSuccess;
-  }
-  
-};
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h
deleted file mode 100755
index be9d4fb7a..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h
+++ /dev/null
@@ -1,267 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile) 
-    matrix from memory.
-
-    This iterator assumes TensorNDHWC layout of tensors in Global Memory.
-
-    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
-    backward data gradient (Dgrad), and backward weight gradient (Wgrad).
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv3d_problem_size.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Shape_,
-  typename Element_,
-  typename ThreadMap_
->
-class Conv3dWgradOutputGradientTileAccessIteratorAnalytic {
-public:
-
-  //
-  // Types
-  //
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::TensorNDHWC;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
-  using TensorRef = cutlass::TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic;
-  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
-  static int const kConvDim = 3;
-  using ConvProblemSize = typename conv::Conv3dProblemSize;
-  static int const kAccessesPerVector = 1;
-  static_assert(sizeof_bits<Element>::value >= 8,
-    "WGRAD requires elements of size 8b or greater.");
-
-  //
-  // Parameters structure
-  //
-
-  struct Params {
-
-    Layout layout;
-
-    //
-    // Methods
-    //
-
-    CUTLASS_HOST_DEVICE
-    Params() { }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      Conv3dProblemSize const &problem_size, 
-      Layout const &layout
-    ): layout(layout) {
-
-    }
-  };
-
-private:
-
-  Params const &params_;
-  Conv3dProblemSize const &problem_size_;
-  LongIndex iteration_contiguous_;
-  LongIndex iteration_strided_;
-  char const *pointer_;
-    
-  int filter_k_[ThreadMap::Iterations::kContiguous];
-
-  int offset_nzpq_[ThreadMap::Iterations::kStrided];
-
-public:
-
-  CUTLASS_HOST_DEVICE
-  Conv3dWgradOutputGradientTileAccessIteratorAnalytic(
-    Params const &params, 
-    Conv3dProblemSize const &problem_size,
-    Element const *ptr,
-    int thread_idx,
-    MatrixCoord const &threadblock_offset = MatrixCoord()
-  ):
-    params_(params), 
-    problem_size_(problem_size), 
-    pointer_(reinterpret_cast<char const *>(ptr)) {
-
-
-    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
-    
-    // initialize filter_k for every contiguous iteration
-    CUTLASS_PRAGMA_UNROLL
-    for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-      filter_k_[c] = threadblock_offset.row() + thread_coord.contiguous() 
-                        + c * ThreadMap::Delta::kContiguous;
-    }
-
-    // initialize n, p, q offset for every strided iteration
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      offset_nzpq_[s] = threadblock_offset.column() + thread_coord.strided() 
-                      + s * ThreadMap::Delta::kStrided;  
-      
-    }
-  }
-
-  CUTLASS_HOST_DEVICE
-  static Params getParams(Conv3dProblemSize const &problem_size, Layout const &layout) {
-    return Params(problem_size, layout);
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(Index index) {
-    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
-  }
-
-  CUTLASS_HOST_DEVICE
-  void advance() {
-    // moves to the next GEMM-K offset (offset_nzpq_) in GEMM-A by a CTA-K tile
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      offset_nzpq_[s] += Shape::kColumn * problem_size_.split_k_slices;
-    }
-  }
-
-  /// Returns the coordinate in the output gradient tensor Dy that is currently pointed to
-  /// by the iterator.
-  CUTLASS_HOST_DEVICE
-  TensorCoord at() const {
-
-    int nzpq = offset_nzpq_[iteration_strided_];
-
-    int n = nzpq / (problem_size_.Z * problem_size_.P * problem_size_.Q);
-    int residual = nzpq % (problem_size_.Z * problem_size_.P * problem_size_.Q);
-
-    int z = residual / (problem_size_.P * problem_size_.Q);
-    residual = residual % (problem_size_.P * problem_size_.Q);
-
-    int p = residual / problem_size_.Q;
-    int q = residual % problem_size_.Q;
-
-    return TensorCoord(n, z, p, q, filter_k_[iteration_contiguous_]);
-  }
-
-
-  /// Returns true if the current coordinate is within the output gradient tensor Dy
-  CUTLASS_HOST_DEVICE
-  bool valid() const {
-    TensorCoord coord = at();
-
-    return coord.n() < problem_size_.N &&
-      coord.d() < problem_size_.Z &&
-      coord.h() < problem_size_.P &&
-      coord.w() < problem_size_.Q &&
-      coord.c() < problem_size_.K;
-  }
-
-  /// Returns a pointer to the vector starting at the current coordinate
-  CUTLASS_HOST_DEVICE
-  AccessType const *get() const {
-
-    TensorCoord coord = at();
-    LongIndex offset = params_.layout(coord);
-
-    return reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
-  }
-
-  /// Increments to the next memory access
-  CUTLASS_HOST_DEVICE
-  Conv3dWgradOutputGradientTileAccessIteratorAnalytic &operator++() {
-    ++iteration_contiguous_;
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
-      return *this;
-    }
-    iteration_contiguous_ = 0;
-    ++iteration_strided_;
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      return *this;
-    }
-    iteration_strided_ = 0;
- 
-    return *this;
-  }
-
-  /// Determines whether the Implicit GEMM can execute the given problem.
-  CUTLASS_HOST_DEVICE
-  static Status can_implement(Conv3dProblemSize const &problem_size) {
-
-    // check alignment constraint on iterator's contiguous dimension
-    if (problem_size.K % AccessType::kElements) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    return Status::kSuccess;
-  }
-
-};
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h
deleted file mode 100755
index 0ef145f19..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h
+++ /dev/null
@@ -1,310 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile) 
-    matrix from memory.
-
-    This iterator assumes TensorNDHWC layout of tensors in Global Memory.
-
-    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
-    backward data gradient (Dgrad), and backward weight gradient (Wgrad).
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv3d_problem_size.h"
-#include "cutlass/conv/threadblock/conv3d_params.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Shape_,
-  typename Element_,
-  typename ThreadMap_
->
-class Conv3dWgradOutputGradientTileAccessIteratorOptimized {
-public:
-
-  //
-  // Types
-  //
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::TensorNDHWC;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
-  using TensorRef = cutlass::TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized;
-  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
-  static int const kConvDim = 3;
-  using ConvProblemSize = typename conv::Conv3dProblemSize;
-  static int const kAccessesPerVector = 1;  
-  static_assert(sizeof_bits<Element>::value >= 8,
-    "WGRAD requires elements of size 8b or greater.");
-
-  //
-  // Parameters structure
-  //
-
-  struct Params : Conv3dWgradOutputGradientIteratorOptimizedParams {
-    //
-    // Methods
-    //
-    CUTLASS_HOST_DEVICE
-    Params() {}
-
-    CUTLASS_HOST_DEVICE
-    Params(Conv3dWgradOutputGradientIteratorOptimizedParams const &base)
-          : Conv3dWgradOutputGradientIteratorOptimizedParams(base) {}
-
-    CUTLASS_HOST_DEVICE
-    Params(Conv3dProblemSize const &problem_size, Layout const &layout)
-          : Conv3dWgradOutputGradientIteratorOptimizedParams(
-            problem_size,
-            layout,
-            sizeof_bits<Element>::value,
-            {Shape::kRow, Shape::kColumn},
-            ThreadMap::kThreads,
-            ThreadMap::kElementsPerAccess,
-            {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided},
-            {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided}) {}
-    };
-
-private:
-
-  Params const &params_;
-  Conv3dProblemSize const &problem_size_;
-  LongIndex iteration_contiguous_;
-  LongIndex iteration_strided_;
-  char const *pointer_;
-    
-  uint32_t predicates_;
-  int filter_k_;
-  int offset_nzpq_;
-
-public:
-
-  CUTLASS_HOST_DEVICE
-  Conv3dWgradOutputGradientTileAccessIteratorOptimized(
-    Params const &params, 
-    Conv3dProblemSize const &problem_size,
-    Element const *ptr,
-    int thread_idx,
-    MatrixCoord const &threadblock_offset = MatrixCoord()
-  ):
-    params_(params), 
-    problem_size_(problem_size),
-    pointer_(reinterpret_cast<char const *>(ptr)),
-    predicates_(0),
-    filter_k_(0),
-    offset_nzpq_(0) {
-
-
-    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
-
-    filter_k_ = threadblock_offset.row() + thread_coord.contiguous();
-    offset_nzpq_ = threadblock_offset.column() + thread_coord.strided();
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-
-        int filter_k = filter_k_ + c * ThreadMap::Delta::kContiguous;
-        int offset_nzpq = offset_nzpq_ + s * ThreadMap::Delta::kStrided;
-
-        bool predicate = valid_(at_(offset_nzpq, filter_k));
-
-        uint32_t pred = (predicate ? 1u : 0);
-
-        int pred_idx = c + s * ThreadMap::Iterations::kContiguous;
-        
-        predicates_ |= (pred << pred_idx);
-      }
-    }
-
-    // Offset pointer to (iteration_strided_, iteration_contiguous_) = (0, 0) 
-    pointer_ += (
-      offset_nzpq_ * params.layout.stride()[0] + filter_k_
-    ) * sizeof_bits<Element>::value / 8;
-
-    set_iteration_index(0);
-  }
-
-  CUTLASS_HOST_DEVICE
-  static Params getParams(Conv3dProblemSize const &problem_size, Layout const &layout) {
-    return Params(problem_size, layout);
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(Index index) {
-    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
-  }
-
-  CUTLASS_HOST_DEVICE
-  void advance() {
-    // moves to the next GEMM-K offset (offset_npq_) in GEMM-A by a CTA-K tile
-    offset_nzpq_ += Shape::kColumn * problem_size_.split_k_slices;
-
-    // Clear predicates if needed
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      if (offset_nzpq_ + s * ThreadMap::Delta::kStrided >= params_.NZPQ) {
-        uint32_t kClearMask = ((1u << ThreadMap::Iterations::kContiguous) - 1) << (s * ThreadMap::Iterations::kContiguous); 
-        predicates_ = (predicates_ & (~kClearMask));
-      }
-    }
-    pointer_ += params_.inc_next_nzpq; 
-  }
-
-private:
-  /// Returns the coordinate in the output gradient tensor Dy that is (offset_nzpq, k) pointed to
-  /// by the iterator.
-  CUTLASS_HOST_DEVICE
-  TensorCoord at_(int offset_nzpq, int k) const {
-
-    // The subseqnet fast_divmod() operations are equivalent to the following logical computation:
-    //
-    //
-    // int nzpq = offset_nzpq_;
-    // int n = nzpq / (problem_size_.Z * problem_size_.P * problem_size_.Q);
-    // int residual = nzpq % (problem_size_.Z * problem_size_.P * problem_size_.Q);
-    //
-    // int z = residual / (problem_size_.P * problem_size_.Q);
-    // residual = residual % (problem_size_.P * problem_size_.Q);
-    //
-    // int p = residual / problem_size_.Q;
-    // int q = residual % problem_size_.Q;
-
-    int residual, n, z, p, q;
-    fast_divmod(n, residual, offset_nzpq, params_.ZPQ, params_.zpq_mul, params_.zpq_shr);
-    fast_divmod(z, residual, residual, params_.PQ, params_.pq_mul, params_.pq_shr);
-    fast_divmod(p, q, residual, problem_size_.Q, params_.q_mul, params_.q_shr);
-
-    return TensorCoord(n, z, p, q, k);
-  }
-
-  /// Returns true if the coord is within the output gradient tensor Dy
-  CUTLASS_HOST_DEVICE
-  bool valid_(TensorCoord coord) const {
-
-    return coord.n() < problem_size_.N &&
-      coord.c() < problem_size_.K;
-  }
-
-public:
-
-  /// Returns true if the current coordinate is within the output gradient tensor Dy
-  CUTLASS_HOST_DEVICE
-  bool valid() const {
-
-    LongIndex pred_idx = iteration_contiguous_ + iteration_strided_ * ThreadMap::Iterations::kContiguous;
-    return (predicates_ & (1u << pred_idx));
-  }
-
-  /// Returns a pointer to the vector starting at the current coordinate
-  CUTLASS_HOST_DEVICE
-  AccessType const *get() const {
-    
-    return reinterpret_cast<AccessType const *>(
-      pointer_ +
-      iteration_strided_ * params_.offset_next_strided + 
-      iteration_contiguous_ * params_.offset_next_contiguous
-    );
-
-  }
-
-  /// Increments to the next memory access
-  CUTLASS_HOST_DEVICE
-  Conv3dWgradOutputGradientTileAccessIteratorOptimized &operator++() {
-    ++iteration_contiguous_;
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
-      return *this;
-    }
-    iteration_contiguous_ = 0;
-    ++iteration_strided_;
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      return *this;
-    }
-    iteration_strided_ = 0;
- 
-    return *this;
-  }
-
-  /// Determines whether the Implicit GEMM can execute the given problem.
-  CUTLASS_HOST_DEVICE
-  static Status can_implement(Conv3dProblemSize const &problem_size) {
-
-    // check alignment constraint on iterator's contiguous dimension
-    if (problem_size.K % AccessType::kElements) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    return Status::kSuccess;
-  }
-
-};
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_direct_conv_params.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_direct_conv_params.h
deleted file mode 100755
index 802318349..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_direct_conv_params.h
+++ /dev/null
@@ -1,230 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! 
-  \file 
-  \brief Extracts the host-params objects into non-template code.
-*/
-
-#pragma once
-
-#define TRACE_CONV_PARAMS_INITIALIZERS_ENABLED 0
-
-#include "cutlass/cutlass.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-
-#if TRACE_CONV_PARAMS_INITIALIZERS_ENABLED
-#include <fstream>
-#endif
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Parameters structure used for DepthwiseFpropActivationDirect2dConvTileAccessIteratorOptimized
-template<typename Layout_ = layout::TensorNHWC >
-struct Depthwise2dFpropDirectConvParams;
-
-/// Parameters structure used for DepthwiseFpropActivationDirect2dConvTileAccessIteratorFixedStrideDilation
-template<typename Layout_ = layout::TensorNHWC >
-struct Depthwise2dFpropDirectConvActivationIteratorFixedStrideDilationParams;
-
-/// Parameters structure used for DepthwiseFpropFilterDirectConvTileAccessIteratorOptimized
-template<typename Layout_ = layout::TensorNHWC >
-struct Depthwise2dFpropDirectConvFilterIteratorParams;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Parameters structure used for DepthwiseFpropActivationDirect2dConvTileAccessIteratorOptimized
-template<>
-struct Depthwise2dFpropDirectConvParams<layout::TensorNHWC> {
-  
-  using Layout = layout::TensorNHWC;
-
-  Layout layout;
-
-  int32_t activation_tile_h;
-  int32_t activation_tile_w;
-  int32_t activation_tile_hw;
-  FastDivmod activation_tile_w_divmod;
-  
-  int filter[2];
-  int stride[2];
-  int dilation[2];
-  int inc_next[2];
-  FastDivmod pq_divmod;
-  FastDivmod q_divmod;
-
-  int activation_load_count;
-  int activation_storage_elements;
-  int activation_size;
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  Depthwise2dFpropDirectConvParams() { }
-
-  CUTLASS_HOST_DEVICE
-  Depthwise2dFpropDirectConvParams(
-      Conv2dProblemSize const &problem_size,
-      Layout const &layout,             ///< layout object
-      MatrixCoord threadblock_shape,    ///< CTA threadblock Shape
-      Layout::TensorCoord threadblock_output_shape,  ///< Output tile Shape per threadblock
-      const int element_size_bits,      ///< bits of activation element
-      const int thread_count,           ///< threads per threadblock
-      const int thread_count_contiguous, ///< number of threads for continuous dimension
-      const int element_per_load)       ///< element per each load
-      : layout(layout) {
-          
-    filter[0] = problem_size.S;
-    filter[1] = problem_size.R;
-    
-    stride[0] =  problem_size.stride_w;
-    stride[1] =  problem_size.stride_h;
-
-    dilation[0] = problem_size.dilation_w;
-    dilation[1] = problem_size.dilation_h;
-
-    // Compute activation_tile size per threadblock because stride and dilation are runtime params.
-    activation_tile_h = (threadblock_output_shape.h() - 1) * problem_size.stride_h +
-                        (problem_size.R - 1) * problem_size.dilation_h + 1;
-    activation_tile_w = (threadblock_output_shape.w() - 1) * problem_size.stride_w +
-                        (problem_size.S - 1) * problem_size.dilation_w + 1;
-    activation_tile_hw = activation_tile_h * activation_tile_w;
-
-    activation_tile_w_divmod = FastDivmod(activation_tile_w);
-
-    /// Below two values could not be templatized because the stride and dilation are runtime params
-    activation_load_count = (thread_count_contiguous * activation_tile_hw + (thread_count - 1)) / thread_count;
-    activation_storage_elements = activation_load_count * element_per_load * thread_count;
-    activation_size =  activation_storage_elements * element_size_bits / 8;
-
-    // Fastdivmod for output P, Q
-    int tiles_p =
-        (problem_size.P + (threadblock_output_shape.h() - 1)) / (threadblock_output_shape.h());
-    int tiles_q = (problem_size.Q + (threadblock_output_shape.w() - 1)) /
-                  (threadblock_output_shape.w());
-
-    pq_divmod = FastDivmod(tiles_p * tiles_q);
-    q_divmod = FastDivmod(tiles_q);
-
-    // next S
-    inc_next[0] = problem_size.dilation_w;
-    // next R
-    inc_next[1] = (activation_tile_w * problem_size.dilation_h - (problem_size.S - 1) * problem_size.dilation_w);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Parameters structure used for DepthwiseFpropActivationDirect2dConvTileAccessIteratorFixedStrideDilation
-template <>
-struct Depthwise2dFpropDirectConvActivationIteratorFixedStrideDilationParams<layout::TensorNHWC> {
-  using Layout = layout::TensorNHWC;
-
-  Layout layout;
-
-  FastDivmod pq_divmod;
-  FastDivmod q_divmod;
-
-  int activation_size;
-
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  Depthwise2dFpropDirectConvActivationIteratorFixedStrideDilationParams() {}
-
-  CUTLASS_HOST_DEVICE
-  Depthwise2dFpropDirectConvActivationIteratorFixedStrideDilationParams(
-      Conv2dProblemSize const &problem_size,
-      Layout const &layout,                          ///< Layout object
-      MatrixCoord threadblock_shape,                 ///< Threadblock Shape
-      Layout::TensorCoord threadblock_output_shape,  ///< Output tile Shape per threadblock
-      const int activation_size_                     ///< Activation size loaded by iterator
-      )
-      : layout(layout),
-        activation_size(activation_size_) {
-    // Fastdivmod for output P, Q
-    int tiles_p =
-        (problem_size.P + (threadblock_output_shape.h() - 1)) / (threadblock_output_shape.h());
-    int tiles_q =
-        (problem_size.Q + (threadblock_output_shape.w() - 1)) / (threadblock_output_shape.w());
-
-    pq_divmod = FastDivmod(tiles_p * tiles_q);
-    q_divmod = FastDivmod(tiles_q);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Parameters structure used for DepthwiseFpropFilterDirectConvTileAccessIteratorOptimized
-template <>
-struct Depthwise2dFpropDirectConvFilterIteratorParams<layout::TensorNHWC> {
-  using Layout = layout::TensorNHWC;
-
-  Layout layout;
-
-  int filter_size;
-
-  bool is_convolution;
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  Depthwise2dFpropDirectConvFilterIteratorParams() {}
-
-  CUTLASS_HOST_DEVICE
-  Depthwise2dFpropDirectConvFilterIteratorParams(
-      Conv2dProblemSize const &problem_size,
-      Layout const &layout,           ///< Layout object
-      MatrixCoord threadblock_shape,  ///< Threadblock Shape
-      const int filter_size_)         ///< Filter size loaded by iterator
-      : layout(layout),
-        filter_size(filter_size_),
-        is_convolution(problem_size.mode == Mode::kConvolution){}
-};
-
-}  // namespace threadblock
-}  // namespace conv
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_fixed_stride_dilation.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_fixed_stride_dilation.h
deleted file mode 100755
index 192d96105..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_fixed_stride_dilation.h
+++ /dev/null
@@ -1,314 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing loading of convolution tiles mapped to GEMM A (activation tile)
-    matrix from memory.
-
-    This iterator assumes TensorNHWC layout of tensors in Global Memory.
-*/
-
-#pragma once
-
-#include "cutlass/array.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/threadblock/depthwise_direct_conv_params.h"
-#include "cutlass/coord.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename Shape_,
-          typename OutputTileShape_,
-          typename StrideShape_,
-          typename DilationShape_,
-          typename ActivationShape_,
-          typename Element_,
-          typename Layout_,
-          typename ThreadMap_,
-          typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess> >
-class DepthwiseFpropActivationDirect2dConvTileAccessIteratorFixedStrideDilation {
- public:
-  //
-  // Types
-  //
-
-  using Shape = Shape_;
-  using OutputTileShape = OutputTileShape_;
-  using Element = Element_;
-  using Layout = Layout_;
-  using TensorCoord = typename Layout::TensorCoord;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-  using TensorRef = cutlass::TensorRef<Element, Layout>;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized;
-  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
-  static int const kConvDim = 2;
-  using ConvProblemSize = typename conv::Conv2dProblemSize;
-
-  // Compilation value of stride , dialtion and activation shape
-  using StrideShape = StrideShape_;
-  using DilationShape = DilationShape_;
-  using ActivationShape = ActivationShape_;
-
-
-  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
-  static int const kActivationSize = ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess * ThreadMap::kThreads *
-           sizeof_bits<Element>::value / 8;
-
-
-  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
-                "Vectors implied by the thread map must be divisible by the access type.");
-
-  //
-  // Simplifying assertions
-  //
-  static_assert(ThreadMap::Iterations::kContiguous == 1, "Require Iterations::kContiguous == 1");
-  
-  static_assert(OutputTileShape::kN == 1, "Require OutputTileShape::kN == 1");
-  static_assert(OutputTileShape::kC == Shape::kColumn, "Require OutputTile shape == channels per threadblock");
-
-  //
-  // Parameters structure
-  //
-
-  using Params = Depthwise2dFpropDirectConvActivationIteratorFixedStrideDilationParams<Layout>;
-
- private:
-  Conv2dProblemSize const &problem_size_;
-  Params const &params_;
-  char const *pointer_;
-
-  // Base channels for current threadblock
-  int base_c_;
-  // Base activation index for current threadblock
-  int offset_intial_npq_;
-  // Base activation coord for current threadblock
-  TensorCoord activatioin_base_;
-  // Intial thread positioin
-  int offset_initial_hwc_;
-  // Overall load instruction per thread.
-  int iterator_load_;
-  // thread loading position.
-  int iterator_hwc_;
-  // activation N is inside the Tensor or not
-  bool valid_n_;
-
- public:
-
-
-  CUTLASS_HOST_DEVICE
-  DepthwiseFpropActivationDirect2dConvTileAccessIteratorFixedStrideDilation(
-      Params const &params,
-      Conv2dProblemSize const &problem_size,
-      Element const *ptr,
-      int thread_idx,
-      MatrixCoord const &threadblock_offset =
-          MatrixCoord()
-      )
-      : params_(params),
-        problem_size_(problem_size),
-        pointer_(reinterpret_cast<char const *>(ptr)),
-        offset_intial_npq_(threadblock_offset.row()),
-        offset_initial_hwc_(thread_idx),
-        iterator_load_(0) {
-    
-    base_c_ = threadblock_offset.column();
-
-    set_iteration_index(0);
-
-    set_activation_coord(offset_intial_npq_);
-
-  }
-
-  CUTLASS_HOST_DEVICE
-  void set_activation_coord(int offset_npq) {
-    int offset_inital_n, offset_inital_p, offset_inital_q;
-    int residual;
-
-    params_.pq_divmod(offset_inital_n, residual, offset_npq);
-    params_.q_divmod(offset_inital_p, offset_inital_q, residual);
-
-    int base_n = offset_inital_n;
-
-    int base_h =
-        offset_inital_p * OutputTileShape::kH * StrideShape::kRow - problem_size_.pad_h;
-
-    int base_w =
-        offset_inital_q * OutputTileShape::kW * StrideShape::kColumn - problem_size_.pad_w;
-
-    activatioin_base_ = TensorCoord(base_n, base_h, base_w, base_c_);
-
-    valid_n_ = activatioin_base_.n() < problem_size_.N;
-  }
-
-  CUTLASS_HOST_DEVICE
-  static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) {
-    return Params(
-        problem_size,
-        layout,
-        {Shape::kRow, Shape::kColumn},
-        {OutputTileShape::kN, OutputTileShape::kH, OutputTileShape::kW, OutputTileShape::kC},
-        kActivationSize);
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(Index index) {
-    iterator_hwc_ = offset_initial_hwc_ + index * ThreadMap::kThreads;
-    iterator_load_ = index;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
-  }
-
-  CUTLASS_HOST_DEVICE
-  void advance() {
-    // Go to next threadblock
-    offset_intial_npq_ += problem_size_.split_k_slices;
-
-    set_iteration_index(0);
-
-    set_activation_coord(offset_intial_npq_);
-  }
-
-  /// Returns the coordinate in the activations tensor X that is currently pointed to
-  /// by the iterator.
-  CUTLASS_HOST_DEVICE
-  TensorCoord at() const {
-    int c = iterator_hwc_ %  ThreadMap::Detail::ShapeVec::kContiguous ;
-    int next = iterator_hwc_ /  ThreadMap::Detail::ShapeVec::kContiguous ;
-    int h = next / ActivationShape::kW;
-    int w = next % ActivationShape::kW;
-
-    c = c * AccessType::kElements;
-
-    return activatioin_base_ + TensorCoord(0, h, w, c);
-  }
-
-  /// Returns true if the current coordinate is within the activations tensor X
-  CUTLASS_HOST_DEVICE
-  bool valid() const {
-    TensorCoord coord = at();
-    bool valid_c = coord.c() < problem_size_.C;
-    bool valid_h = coord.h() >= 0 && coord.h() < problem_size_.H;
-    bool valid_w = coord.w() >= 0 && coord.w() < problem_size_.W;
-    return valid_n_ ? valid_c & valid_h & valid_w : 0;
-  }
-
-  /// Returns a pointer to the vector starting at the current coordinate
-  CUTLASS_HOST_DEVICE
-  AccessType const *get() const {
-    TensorCoord coord = at();
-    LongIndex offset = params_.layout(coord);
-
-    AccessType const *ptr =
-        reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
-
-    return ptr;
-  }
-
-  /// Increments to the next memory access
-  CUTLASS_HOST_DEVICE
-  DepthwiseFpropActivationDirect2dConvTileAccessIteratorFixedStrideDilation &operator++() {
-
-    ++iterator_load_;
-    iterator_hwc_ += ThreadMap::kThreads;
-
-    if (iterator_load_ < ThreadMap::Iterations::kCount) {
-       return *this;
-    }
-    
-    iterator_load_ = 0;
-    iterator_hwc_ = offset_initial_hwc_;
-
-    return *this;
-  }
-
-  /// Determines the activation size loaded by iterator
-  CUTLASS_HOST_DEVICE
-  int get_load_size() {
-    return kActivationSize;
-  }
-
-  /// Determines the iterations needed
-  CUTLASS_HOST_DEVICE
-  int get_iteration_num() {
-    return ThreadMap::Iterations::kCount;
-  }
-
-  /// Determines whether the Depthwise fprop can execute the given problem.
-  CUTLASS_HOST_DEVICE
-  static Status can_implement(Conv2dProblemSize const &problem_size) {
-
-    // check stride and dilation constraint
-    if (problem_size.stride_h != StrideShape::kRow || problem_size.stride_w != StrideShape::kColumn) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    if (problem_size.dilation_h != DilationShape::kRow || problem_size.dilation_w != DilationShape::kColumn) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    // check alignment constraint on iterator's contiguous dimension
-    if (problem_size.C % AccessType::kElements) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    return Status::kSuccess;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace conv
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_optimized.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_optimized.h
deleted file mode 100755
index a858a23f9..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_optimized.h
+++ /dev/null
@@ -1,291 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing loading of convolution tiles mapped to GEMM A (activation tile)
-    matrix from memory.
-
-    This iterator assumes TensorNHWC layout of tensors in Global Memory.
-*/
-
-#pragma once
-
-#include "cutlass/array.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/threadblock/depthwise_direct_conv_params.h"
-#include "cutlass/coord.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename Shape_,
-          typename OutputTileShape_,
-          typename Element_,
-          typename Layout_,
-          typename ThreadMap_,
-          typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess> >
-class DepthwiseFpropActivationDirect2dConvTileAccessIteratorOptimized {
- public:
-  //
-  // Types
-  //
-
-  using Shape = Shape_;
-  using OutputTileShape = OutputTileShape_;
-  using Element = Element_;
-  using Layout = Layout_;
-  using TensorCoord = typename Layout::TensorCoord;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-  using TensorRef = cutlass::TensorRef<Element, Layout>;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized;
-  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
-  static int const kConvDim = 2;
-  using ConvProblemSize = typename conv::Conv2dProblemSize;
-
-  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
-
-  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
-                "Vectors implied by the thread map must be divisible by the access type.");
-
-  //
-  // Simplifying assertions
-  //
-  static_assert(ThreadMap::Iterations::kContiguous == 1, "Require Iterations::kContiguous == 1");
-  
-  static_assert(OutputTileShape::kN == 1, "Require OutputTileShape::kN == 1");
-  static_assert(OutputTileShape::kC == Shape::kColumn, "Require OutputTile shape == channels per threadblock");
-
-  //
-  // Parameters structure
-  //
-
-  using Params = Depthwise2dFpropDirectConvParams<Layout>;
-
- private:
-  Conv2dProblemSize const &problem_size_;
-  Params const &params_;
-  char const *pointer_;
-
-  // Base channels for current threadblock
-  int base_c_;
-  // Base activation index for current threadblock
-  int offset_intial_npq_;
-  // Base activation coord for current threadblock
-  TensorCoord activatioin_base_;
-  // Intial thread positioin
-  int offset_initial_hwc_;
-  // Overall load instruction per thread.
-  int iterator_load_;
-  // thread loading position.
-  int iterator_hwc_;
-  // Number of loads for activations tensor X.
-  const int number_of_loads_;
-
- public:
-
-
-  CUTLASS_HOST_DEVICE
-  DepthwiseFpropActivationDirect2dConvTileAccessIteratorOptimized(
-      Params const &params,
-      Conv2dProblemSize const &problem_size,
-      Element const *ptr,
-      int thread_idx,
-      MatrixCoord const &threadblock_offset =
-          MatrixCoord()
-      )
-      : params_(params),
-        problem_size_(problem_size),
-        pointer_(reinterpret_cast<char const *>(ptr)),
-        offset_intial_npq_(threadblock_offset.row()),
-        offset_initial_hwc_(thread_idx),
-        iterator_load_(0),
-        number_of_loads_(params.activation_load_count) {
-    
-    base_c_ = threadblock_offset.column();
-
-    set_activation_coord(offset_intial_npq_);
-
-    set_iteration_index(0);
-  }
-
-  CUTLASS_HOST_DEVICE
-  void set_activation_coord(int offset_npq) {
-    int offset_inital_n, offset_inital_p, offset_inital_q;
-    int residual;
-
-    params_.pq_divmod(offset_inital_n, residual, offset_npq);
-    params_.q_divmod(offset_inital_p, offset_inital_q, residual);
-
-    int base_n = offset_inital_n;
-
-    int base_h =
-        offset_inital_p * OutputTileShape::kH * problem_size_.stride_h - problem_size_.pad_h;
-
-    int base_w =
-        offset_inital_q * OutputTileShape::kW * problem_size_.stride_w - problem_size_.pad_w;
-
-    activatioin_base_ = TensorCoord(base_n, base_h, base_w, base_c_);
-  }
-
-  CUTLASS_HOST_DEVICE
-  static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) {
-    return Params(
-        problem_size,
-        layout,
-        {Shape::kRow, Shape::kColumn},
-        {OutputTileShape::kN, OutputTileShape::kH, OutputTileShape::kW, OutputTileShape::kC},
-        sizeof_bits<Element>::value,
-        ThreadMap::kThreads,
-        ThreadMap::Detail::ShapeVec::kContiguous,
-        ThreadMap::kElementsPerAccess);
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(Index index) {
-    iterator_hwc_ = offset_initial_hwc_ + index * ThreadMap::kThreads;
-    iterator_load_ = index;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
-  }
-
-  CUTLASS_HOST_DEVICE
-  void advance() {
-    // Go to next threadblock
-    offset_intial_npq_ += problem_size_.split_k_slices;
-
-    set_activation_coord(offset_intial_npq_);
-  }
-
-  /// Returns the coordinate in the activations tensor X that is currently pointed to
-  /// by the iterator.
-  CUTLASS_HOST_DEVICE
-  TensorCoord at() const {
-    
-    int c = iterator_hwc_ %  ThreadMap::Detail::ShapeVec::kContiguous ;
-    int next = iterator_hwc_ /  ThreadMap::Detail::ShapeVec::kContiguous ;
-    int h, w;
-    params_.activation_tile_w_divmod(h, w, next) ;
-
-    c = c * AccessType::kElements;
-
-    return activatioin_base_ + TensorCoord(0, h, w, c);
-  }
-
-  /// Returns true if the current coordinate is within the activations tensor X
-  CUTLASS_HOST_DEVICE
-  bool valid() const {
-    TensorCoord coord = at();
-
-    return coord.n() < problem_size_.N && coord.h() >= 0 && coord.h() < problem_size_.H &&
-           coord.w() >= 0 && coord.w() < problem_size_.W && coord.c() < problem_size_.C;
-  }
-
-  /// Returns a pointer to the vector starting at the current coordinate
-  CUTLASS_HOST_DEVICE
-  AccessType const *get() const {
-    TensorCoord coord = at();
-    LongIndex offset = params_.layout(coord);
-
-    AccessType const *ptr =
-        reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
-
-    return ptr;
-  }
-
-  /// Increments to the next memory access
-  CUTLASS_HOST_DEVICE
-  DepthwiseFpropActivationDirect2dConvTileAccessIteratorOptimized &operator++() {
-
-    ++iterator_load_;
-    iterator_hwc_ += ThreadMap::kThreads;
-
-    if (iterator_load_ < number_of_loads_) {
-       return *this;
-    }
-    
-    iterator_load_ = 0;
-    iterator_hwc_ = offset_initial_hwc_;
-
-    return *this;
-  }
-
-  /// Determines the activation size loaded by iterator
-  CUTLASS_HOST_DEVICE
-  int get_load_size() {
-    return params_.activation_size;
-  }
-
-  /// Determines the iterations needed
-  CUTLASS_HOST_DEVICE
-  int get_iteration_num() {
-    return number_of_loads_;
-  }
-
-  /// Determines whether the Depthwise fprop can execute the given problem.
-  CUTLASS_HOST_DEVICE
-  static Status can_implement(Conv2dProblemSize const &problem_size) {
-    // check alignment constraint on iterator's contiguous dimension
-    if (problem_size.C % AccessType::kElements) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    return Status::kSuccess;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace conv
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_direct_conv_multistage.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_direct_conv_multistage.h
deleted file mode 100755
index 50aeee006..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_direct_conv_multistage.h
+++ /dev/null
@@ -1,551 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a multistage threadblock-scoped Implicit GEMM Convolution kernel.
-*/
-
-#pragma once
-
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/arch/memory.h"
-#include "cutlass/array.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/cache_operation.h"
-#include "cutlass/conv/threadblock/depthwise_mma_base.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Structure to compute the matrix product targeting CUDA cores and SIMT math
-/// instructions.
-template <
-    /// Size of the Gemm problem - concept: gemm::GemmShape<>
-    typename Shape_,
-    /// Iterates over tiles of A operand in global memory
-    //  (concept: ReadableTileIterator | ForwardTileIterator |
-    //  MaskedTileIterator)
-    typename IteratorA_,
-    /// Iterates over tiles of A operand in shared memory
-    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-    typename SmemIteratorA_,
-    /// Cache operation for operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Iterates over tiles of B operand in global memory
-    //  (concept: ReadableTileIterator | ForwardTileIterator |
-    //  MaskedTileIterator)
-    typename IteratorB_,
-    /// Iterates over tiles of B operand in shared memory
-    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-    typename SmemIteratorB_,
-    /// Cache operation for operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB,
-    /// Policy describing tuning details (concept: MmaPolicy)
-    typename Policy_,
-    /// Number of stages,
-    int Stages,
-    /// Epilogue stores the data into global memory
-    typename Epilogue_,
-    /// iterator implementation variants
-    conv::IteratorAlgorithm IteratorAlgorithm_ = conv::IteratorAlgorithm::kOptimized,
-    /// Used for partial specialization
-    typename Enable = bool>
-class DepthwiseFpropDirectConvMultipleStage :
-   public DepthwiseDirectConvMmaBase<Shape_, Policy_, Stages> {
-public:
-  ///< Base class
-  using Base = DepthwiseDirectConvMmaBase<Shape_, Policy_, Stages>;
-  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-  ///< Iterates over tiles of A operand in global memory
-  using IteratorA = IteratorA_;
-  ///< Iterates over tiles of B operand in global memory
-  using IteratorB = IteratorB_;
-  ///< Policy describing tuning details
-  using Policy = Policy_;
-
-  using Epilogue = Epilogue_;
-
-  using SmemIteratorA = SmemIteratorA_;
-  using SmemIteratorB = SmemIteratorB_;
-
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
-
-  static conv::IteratorAlgorithm const kItertorAlgorithm = IteratorAlgorithm_;
-
-  //
-  // Dependent types
-  //
-
-  /// Fragment of accumulator tile
-
-  using ElementC = typename Policy::Operator::ElementC;
-  using FragmentC = typename Policy::Operator::FragmentC;
-
-  /// Warp-level Mma
-  using Operator = typename Policy::Operator;
-  
-  /// Internal structure exposed for introspection.
-  struct Detail {
-
-    /// Number of cp.async instructions to load one stage of operand A
-    static int const AsyncCopyIterationsPerStageA =
-        IteratorA::ThreadMap::Iterations::kCount;
-
-    /// Number of cp.async instructions to load one stage of operand B
-    static int const AsyncCopyIterationsPerStageB =
-        IteratorB::ThreadMap::Iterations::kCount;
-
-    /// Number of stages
-    static int const kStages = Stages;
-
-    /// Number of cp.async instructions to load on group of operand B
-    static int const kAccessesPerGroupB = 
-        (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
-  };
-
- private:
-
-  using WarpLoadedFragmentA = typename Operator::FragmentA;
-  using WarpLoadedFragmentB = typename Operator::FragmentB;
-  using WarpTransformedFragmentA = typename Operator::TransformedFragmentA;
-  using WarpTransformedFragmentB = typename Operator::TransformedFragmentB;
-
- private:
-
-  //
-  // Data members
-  //
-
-  /// Iterator to write threadblock-scoped tile of A operand to shared memory
-  SmemIteratorA smem_iterator_A_;
-
-  /// Iterator to write threadblock-scoped tile of B operand to shared memory
-  SmemIteratorB smem_iterator_B_;
-
-public:
-
-  /// Construct from tensor references
-  CUTLASS_DEVICE
-  DepthwiseFpropDirectConvMultipleStage(
-      ///< Shared storage needed for internal use by threadblock-scoped GEMM
-      typename Base::SharedStorage &shared_storage,
-      ///< ID within the threadblock
-      int thread_idx,
-      ///< ID of warp
-      int warp_idx,
-      ///< ID of each thread within a warp
-      int lane_idx
-    ):
-      Base(shared_storage, thread_idx, warp_idx, lane_idx),
-      smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
-      smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx)
-  {
-    // Compute warp location within threadblock tile by mapping the warp_id to
-    // three coordinates:
-    //   _m: the warp's position within the threadblock along the M dimension
-    //   _n: the warp's position within the threadblock along the N dimension
-    //   _k: the warp's position within the threadblock along the K dimension
-
-    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
-    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
-
-    int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
-    int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
-
-    // Add per-warp offsets in units of warp-level tiles
-    this->warp_tile_iterator_A_.add_tile_offset(
-        {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
-    this->warp_tile_iterator_B_.add_tile_offset(
-        {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
-  }
-
-  CUTLASS_DEVICE
-  void copy_tiles_and_advance(IteratorA &iterator_A,
-                              IteratorB &iterator_B,
-                              int group_start_A = 0,
-                              int group_start_B = 0) {
-    if (kItertorAlgorithm == conv::IteratorAlgorithm::kFixedStrideDilation) {
-      // Number of iterators is a static value.
-      iterator_A.set_iteration_index(group_start_A * IteratorA::kAccessesPerVector);
-      this->smem_iterator_A_.set_iteration_index(group_start_A);
-
-      // Async Copy for operand A
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) {
-        typename IteratorA::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorA::AccessType *>(this->smem_iterator_A_.get());
-
-        int const kSrcBytes = sizeof_bits<typename IteratorA::Element>::value *
-                              IteratorA::ThreadMap::kElementsPerAccess /
-                              IteratorA::kAccessesPerVector / 8;
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
-          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
-              dst_ptr + v, iterator_A.get(), iterator_A.valid());
-
-          ++iterator_A;
-        }
-        ++this->smem_iterator_A_;
-      }
-    } else {
-      // Number of iterators is a runtime value.
-      iterator_A.set_iteration_index(group_start_A * IteratorA::kAccessesPerVector);
-      this->smem_iterator_A_.set_iteration_index(group_start_A);
-
-      // Async Copy for operand A
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < iterator_A.get_iteration_num(); ++j) {
-        typename IteratorA::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorA::AccessType *>(this->smem_iterator_A_.get());
-
-        int const kSrcBytes = sizeof_bits<typename IteratorA::Element>::value *
-                              IteratorA::ThreadMap::kElementsPerAccess /
-                              IteratorA::kAccessesPerVector / 8;
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
-          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
-              dst_ptr + v, iterator_A.get(), iterator_A.valid());
-
-          ++iterator_A;
-        }
-        ++this->smem_iterator_A_;
-      }
-    }
-  }
-
-  /// Perform a threadblock-scoped matrix multiply-accumulate
-  CUTLASS_DEVICE
-  void operator()(
-      ///< problem size of GEMM
-      int gemm_k_iterations,
-      ///< destination accumulator tile
-      FragmentC &accum,
-      ///< iterator over A operand in global memory
-      IteratorA &iterator_A,
-      ///< Params of global memory iterator
-      typename IteratorA::Params const &iterator_a_params,
-      ///< iterator over B operand in global memory
-      IteratorB &iterator_B,
-      ///< Params of global memory iterator
-      typename IteratorB::Params const &iterator_b_params,
-      ///< initial value of accumulator
-      FragmentC const &src_accum,
-      /// Epilogue
-      Epilogue &epilogue, 
-      ///< Output operator
-      typename Epilogue::OutputOp const &output_op, 
-      ///< Tile iterator for destination 
-      typename Epilogue::OutputTileIterator &destination_iterator,
-      ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles)
-      typename Epilogue::OutputTileIterator &source_iterator,
-
-      int split_k_slices = 1
-      ) {
-
-    //
-    // Prologue
-    //
-
-    // Issue several complete stages
-    CUTLASS_PRAGMA_UNROLL
-    for (int stage = 0; stage < Base::kStages - 1; ++stage, --gemm_k_iterations) {
-
-      if (stage == 0) {
-        iterator_B.set_iteration_index(0);
-        this->smem_iterator_B_.set_iteration_index(0);
-
-        // Async Copy for operand B
-        CUTLASS_PRAGMA_UNROLL
-        for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) {
-          typename IteratorB::AccessType *dst_ptr =
-              reinterpret_cast<typename IteratorB::AccessType *>(this->smem_iterator_B_.get());
-
-          CUTLASS_PRAGMA_UNROLL
-          for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
-            int const kSrcBytes = sizeof_bits<typename IteratorB::Element>::value *
-                                  IteratorB::ThreadMap::kElementsPerAccess /
-                                  IteratorB::kAccessesPerVector / 8;
-
-            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
-                dst_ptr + v, iterator_B.get(), iterator_B.valid());
-            
-            ++iterator_B;
-          }
-
-          ++this->smem_iterator_B_;
-        }
-      }
-
-      if(kItertorAlgorithm == conv::IteratorAlgorithm::kFixedStrideDilation){
-        // Number of iterators is compilation static.
-        iterator_A.set_iteration_index(0);
-        this->smem_iterator_A_.set_iteration_index(0);
-
-        // Async Copy for operand A
-        CUTLASS_PRAGMA_UNROLL
-        for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) {
-          typename IteratorA::AccessType *dst_ptr =
-              reinterpret_cast<typename IteratorA::AccessType *>(this->smem_iterator_A_.get());
-
-          CUTLASS_PRAGMA_UNROLL
-          for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
-            int const kSrcBytes = sizeof_bits<typename IteratorA::Element>::value *
-                                  IteratorA::ThreadMap::kElementsPerAccess /
-                                  IteratorA::kAccessesPerVector / 8;
-
-            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
-                dst_ptr + v, iterator_A.get(), iterator_A.valid());
-          
-            ++iterator_A;
-          }
-
-          ++this->smem_iterator_A_;
-        }
-
-      } else {
-        // Number of iterators is a runtime value.
-        iterator_A.set_iteration_index(0);
-        this->smem_iterator_A_.set_iteration_num(iterator_A.get_iteration_num());
-        this->smem_iterator_A_.set_iteration_index(0);
-
-
-        // Async Copy for operand A
-        CUTLASS_PRAGMA_UNROLL
-        for (int j = 0; j < iterator_A.get_iteration_num(); ++j) {
-          typename IteratorA::AccessType *dst_ptr =
-              reinterpret_cast<typename IteratorA::AccessType *>(this->smem_iterator_A_.get());
-
-          CUTLASS_PRAGMA_UNROLL
-          for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
-            int const kSrcBytes = sizeof_bits<typename IteratorA::Element>::value *
-                                  IteratorA::ThreadMap::kElementsPerAccess /
-                                  IteratorA::kAccessesPerVector / 8;
-
-            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
-                dst_ptr + v, iterator_A.get(), iterator_A.valid());
-
-            ++iterator_A;
-          }
-
-          ++this->smem_iterator_A_;
-        }
-      }
-
-      // Move to the next stage
-      iterator_A.advance();
-
-      this->smem_iterator_A_.add_tile_offset({1, 0});
-
-      // Inserts a fence to group cp.async instructions into stages.
-      cutlass::arch::cp_async_fence();
-    }
-
-    /////////////////////////////////////////////////////////////////////////////
-    // Waits until kStages-2 stages have committed. 
-    cutlass::arch::cp_async_wait<Base::kStages - 2>();
-    __syncthreads();
-
-    // Pair of fragments used to overlap shared memory loads and math
-    // instructions
-    WarpLoadedFragmentA warp_loaded_frag_A[2];
-    WarpLoadedFragmentB warp_loaded_frag_B[2];
-    WarpTransformedFragmentA warp_transformed_frag_A[2];
-    WarpTransformedFragmentB warp_transformed_frag_B[2];
-
-    Operator warp_mma;
-
-    this->warp_tile_iterator_A_.set_kgroup_index(0);
-    this->warp_tile_iterator_B_.set_kgroup_index(0);
-
-    this->warp_tile_iterator_A_.setup_initial_status(iterator_a_params);
-
-
-    this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]);
-    this->warp_tile_iterator_B_.load(warp_loaded_frag_B[0]);
-
-    ++this->warp_tile_iterator_A_;
-    ++this->warp_tile_iterator_B_;
-
-    int smem_write_stage_idx = Base::kStages - 1;
-    int smem_read_stage_idx = 0;
-
-    warp_mma.transform(warp_transformed_frag_A[0], warp_transformed_frag_B[0],
-                       warp_loaded_frag_A[0], warp_loaded_frag_B[0]);
-
-    //
-    // Mainloop
-    //
-
-    unsigned int iterations = 0;
-    constexpr int inner_loop_iterations = round_up(Base::kWarpGemmIterations, 2);
-
-    CUTLASS_GEMM_LOOP
-    for (; gemm_k_iterations > (-Base::kStages + 1);) {   // Each iteration is a cta tile.
-
-      accum.clear();
-    
-      //
-      // Loop over GEMM K dimension
-      //
-
-      // Computes a warp-level GEMM on data held in shared memory
-      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int warp_mma_k = 0; warp_mma_k < inner_loop_iterations; ++warp_mma_k) {
-        if (Base::kWarpGemmIterations % 2 == 0 || warp_mma_k + 1 != Base::kWarpGemmIterations) {
-          // Load warp-level tiles from shared memory, wrapping to k offset if
-          // this is the last group as the case may be.
-
-          this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Shape::kK);
-          this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Shape::kK);
-
-          this->warp_tile_iterator_A_.load(warp_loaded_frag_A[(warp_mma_k + 1) % 2]);
-          this->warp_tile_iterator_B_.load(warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
-
-          ++this->warp_tile_iterator_A_;
-          ++this->warp_tile_iterator_B_;
-        }
-
-        if (warp_mma_k > 0)
-          warp_mma.transform(warp_transformed_frag_A[warp_mma_k % 2],
-                              warp_transformed_frag_B[warp_mma_k % 2],
-                              warp_loaded_frag_A[warp_mma_k % 2],
-                              warp_loaded_frag_B[warp_mma_k % 2]);
-
-        // Issue global->shared copies for the next stage
-        int group_start_iteration_A, group_start_iteration_B;
-
-        if (warp_mma_k == 0) {
-          group_start_iteration_A = 0;
-          group_start_iteration_B = 0;
-          copy_tiles_and_advance(
-              iterator_A, iterator_B, group_start_iteration_A, group_start_iteration_B);
-        }
-
-        if (warp_mma_k < Base::kWarpGemmIterations) {
-          warp_mma(
-            accum, 
-            warp_transformed_frag_A[warp_mma_k % 2],
-            warp_transformed_frag_B[warp_mma_k % 2], 
-            accum
-          );
-        }
-
-        if (warp_mma_k + 1 == inner_loop_iterations)
-          warp_mma.transform(warp_transformed_frag_A[(warp_mma_k + 1) % 2],
-                              warp_transformed_frag_B[(warp_mma_k + 1) % 2],
-                              warp_loaded_frag_A[(warp_mma_k + 1) % 2],
-                              warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
-
-        if (warp_mma_k + 2 == inner_loop_iterations) {
-          // Inserts a fence to group cp.async instructions into stages.
-          cutlass::arch::cp_async_fence();
-
-          // Waits until kStages-2 stages of cp.async have committed
-          arch::cp_async_wait<Base::kStages - 2>();
-          __syncthreads();
-
-          // Move to the next cta
-          iterator_A.advance();
-
-          this->smem_iterator_A_.add_tile_offset({1, 0});
-
-          // Add negative offsets to return iterators to the 'start' of the
-          // circular buffer in shared memory
-          if (smem_write_stage_idx == (Base::kStages - 1)) {
-            this->smem_iterator_A_.add_tile_offset({-Base::kStages, 0});
-   
-            smem_write_stage_idx = 0;
-          } else {
-            ++smem_write_stage_idx;
-          }
-
-          if (smem_read_stage_idx == (Base::kStages - 1)) {
-            this->warp_tile_iterator_A_.advance(- (Base::kStages-1) * iterator_A.get_load_size());
-            smem_read_stage_idx = 0;
-          } else {
-            this->warp_tile_iterator_A_.advance(iterator_A.get_load_size());
-            ++smem_read_stage_idx;
-          }
-
-          if (kItertorAlgorithm == conv::IteratorAlgorithm::kFixedStrideDilation) {
-            this->warp_tile_iterator_A_.setup_initial_status(iterator_a_params);
-          }
-
-          // goback to start position. B has no multiple stage
-          this->warp_tile_iterator_B_.add_tile_offset({-Policy::kPartitionsK * Shape::kK, 0});
-
-          --gemm_k_iterations;
-        }
-      }
-
-      //
-      // Epilogue
-      //
-      int32_t smem_base_offset = iterator_B.get_load_size() + (iterations % Base::kStages) * iterator_A.get_load_size();
-
-      destination_iterator.set_tile_index(iterations * split_k_slices);
-      
-      source_iterator.set_tile_index(iterations * split_k_slices);
-    
-      epilogue(output_op, destination_iterator, accum, source_iterator, smem_base_offset);
-
-      ++iterations;
-    }
-
-    // Insert fence and wait for all outstanding cp.async operations to commit.
-    cutlass::arch::cp_async_fence();
-    cutlass::arch::cp_async_wait<0>();
-    __syncthreads();
-
-  }
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace gemm
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_filter_tile_access_iterator_direct_conv_optimized.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_filter_tile_access_iterator_direct_conv_optimized.h
deleted file mode 100755
index 52d604e43..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_filter_tile_access_iterator_direct_conv_optimized.h
+++ /dev/null
@@ -1,261 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) 
-    matrix from memory.
-
-    This iterator assumes TensorNHWC layout of tensors in Global Memory.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-#include "cutlass/conv/threadblock/conv2d_params.h"
-#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-template <typename Shape_,
-          typename Element_,
-          typename Layout_,
-          typename ThreadMap_,
-          typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess> >
-class DepthwiseFpropFilterDirectConvTileAccessIteratorOptimized {
-public:   
-  //
-  // Types
-  //
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = Layout_;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-  using TensorRef = cutlass::TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized;
-  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
-  static int const kConvDim = 2;
-  using ConvProblemSize = typename conv::Conv2dProblemSize;
- 
-  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
-  
-  static int const kFilterSize = ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess * ThreadMap::kThreads *
-           sizeof_bits<Element>::value / 8;
-
-  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), 
-    "Vectors implied by the thread map must be divisible by the access type.");
- 
-  //
-  // Simplifying assertions
-  //
-  static_assert(ThreadMap::Iterations::kContiguous == 1,
-    "Require Iterations::kContiguous == 1");
-
-  //
-  // Parameters structure
-  //
-  using Params = Depthwise2dFpropDirectConvFilterIteratorParams<Layout>;
-
- protected:
-
-  Conv2dProblemSize const &problem_size_;
-  Params const &params_;
-  LongIndex iteration_contiguous_;
-  LongIndex iteration_strided_;
-  LongIndex iteration_vector_;
-  char const *pointer_;
-
-  int filter_k_;
-  int offset_trs_[ThreadMap::Iterations::kStrided];
-
-public:
-
-
-
-  CUTLASS_HOST_DEVICE
-  DepthwiseFpropFilterDirectConvTileAccessIteratorOptimized(
-    Params const &params, 
-    Conv2dProblemSize const &problem_size,
-    Element const *ptr,
-    int thread_idx,
-    MatrixCoord const &threadblock_offset = MatrixCoord()
-  ):
-    params_(params), 
-    problem_size_(problem_size), 
-    pointer_(reinterpret_cast<char const *>(ptr)), 
-    filter_k_(0) {
-
-    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
-
-    filter_k_ = threadblock_offset.column() + thread_coord.contiguous();
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      offset_trs_[s] = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
-    }
-
-    set_iteration_index(0);
-  }
-
-  CUTLASS_HOST_DEVICE
-  static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) {
-      return Params(problem_size, layout, {Shape::kRow, Shape::kColumn}, kFilterSize);
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(Index index) {
-    iteration_vector_ = index % kAccessesPerVector;
-    int residual_access = index / kAccessesPerVector;
-    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    pointer_ += pointer_offset * 8 / sizeof_bits<Element>::value;
-  }
-
-  CUTLASS_HOST_DEVICE
-  void advance() {
-    // Do nothing because the filter is persistent in the SMEM
-  }
-
-  /// Returns the coordinate in the filter tensor W that is currently pointed to
-  /// by the iterator.
-  CUTLASS_HOST_DEVICE
-  TensorCoord at() const {
-
-    int k = filter_k_ + iteration_vector_ * AccessType::kElements;
-    int trs =  offset_trs_[iteration_strided_];
-
-    return TensorCoord(k, trs, 0 , 0);  // As a 2D-matrix
-  }
-
-  /// Returns true if the current coordinate is within the activations tensor W
-  CUTLASS_HOST_DEVICE
-  bool valid() const {
-
-    TensorCoord coord = at();
-
-    return coord.n() < problem_size_.K &&
-            coord.h() < Shape::kColumn;
-  }
-
-  /// Returns a pointer to the vector starting at the current coordinate
-  CUTLASS_HOST_DEVICE
-  AccessType const *get() const {
-    TensorCoord coord = at();
-    int64_t offset = coord.n();
-    if (params_.is_convolution) {
-      offset += (Shape::kColumn - coord.h() - 1)* problem_size_.K;
-    } else {
-      offset += coord.h() * problem_size_.K;
-    }
-
-    return reinterpret_cast<AccessType const *>(pointer_ +
-                                                offset * sizeof_bits<Element>::value / 8);
-  }
-
-  /// Increments to the next memory access
-  CUTLASS_HOST_DEVICE
-  DepthwiseFpropFilterDirectConvTileAccessIteratorOptimized &operator++() {
-    ++iteration_vector_;
-    if (iteration_vector_ < kAccessesPerVector) {
-      return *this;
-    }
-    iteration_vector_ = 0;
-
-    ++iteration_contiguous_;
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
-      return *this;
-    }
-    iteration_contiguous_ = 0;
-    
-    ++iteration_strided_;
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      return *this;
-    }
-    iteration_strided_ = 0;
- 
-    return *this;
-  }
-
-  /// Determines the filter size loaded by iterator
-  CUTLASS_HOST_DEVICE
-  int get_load_size() {
-    return kFilterSize;
-  }
-
-  /// Determines whether the Implicit GEMM can execute the given problem.
-  CUTLASS_HOST_DEVICE
-  static Status can_implement(Conv2dProblemSize const &problem_size) {
-
-    // check alignment constraint on iterator's contiguous dimension
-    if (problem_size.K % AccessType::kElements) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    // check whether runtime filter size is same as templated filter size.
-    if ((problem_size.R * problem_size.S) != Shape::kColumn) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    return Status::kSuccess;
-  }
-};
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace conv
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_pipelined.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_pipelined.h
deleted file mode 100755
index c2825fa60..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_pipelined.h
+++ /dev/null
@@ -1,336 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/numeric_conversion.h"
-
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/threadblock/mma_base.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions.
-template <
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  typename Shape_,
-  /// Iterates over tiles of A operand in global memory 
-  //  (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
-  typename IteratorA_,
-  /// Iterates over tiles of A operand in shared memory
-  /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-  typename SmemIteratorA_,
-  /// Iterates over tiles of B operand in global memory
-  //  (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
-  typename IteratorB_,
-  /// Iterates over tiles of B operand in shared memory
-  /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-  typename SmemIteratorB_,
-  /// Data type of accumulator matrix
-  typename ElementC_,
-  /// Data type of accumulator matrix
-  typename LayoutC_,
-  /// Policy describing tuning details (concept: MmaPolicy)
-  typename Policy_,
-  /// Transformation applied to A operand
-  typename TransformA_ = NumericArrayConverter<
-    typename SmemIteratorA_::Element, 
-    typename IteratorA_::Element, 
-    IteratorA_::Fragment::kElements>,
-  ///
-  /// Transformation applied to A operand
-  typename TransformB_ = NumericArrayConverter<
-    typename SmemIteratorB_::Element, 
-    typename IteratorB_::Element, 
-    IteratorB_::Fragment::kElements>,
-  /// Used for partial specialization
-  typename Enable = bool
->
-class DepthwiseFpropPipelined : public gemm::threadblock::MmaBase<Shape_, Policy_, 2> {
-public:
-
-  ///< Base class
-  using Base = gemm::threadblock::MmaBase<Shape_, Policy_, 2>;
-
-  using Shape = Shape_;             ///< Size of the Gemm problem - concept: gemm::GemmShape<>
-  using IteratorA = IteratorA_;     ///< Iterates over tiles of A operand in global memory
-  using IteratorB = IteratorB_;     ///< Iterates over tiles of B operand in global memory
-  using ElementC = ElementC_;       ///< Data type of accumulator matrix
-  using LayoutC = LayoutC_;         ///< Layout of accumulator matrix
-  using Policy = Policy_;           ///< Policy describing tuning details
-
-  using SmemIteratorA = SmemIteratorA_;
-  using SmemIteratorB = SmemIteratorB_;
-
-  using TransformA = TransformA_;
-  using TransformB = TransformB_;
-
-  //
-  // Dependent types
-  //
-
-  /// Fragment of operand A loaded from global memory
-  using FragmentA = typename IteratorA::Fragment;
-
-  /// Fragment of operand B loaded from global memory
-  using FragmentB = typename IteratorB::Fragment;
-
-  /// Fragment of accumulator tile
-  using FragmentC = typename Policy::Operator::FragmentC;
-
-  /// Warp-level Mma
-  using Operator = typename Policy::Operator;
-
-  /// Obtain the arch tag from the warp-level operator
-  using ArchTag = typename Policy::Operator::ArchTag;
-
-  /// Complex transform on A operand
-  static ComplexTransform const kTransformA = Operator::kTransformA;
-
-  /// Complex transform on B operand
-  static ComplexTransform const kTransformB = Operator::kTransformB;
-
-  // staticaly assert kStages for MmaPipelined is two (Double-buffered pipeline)
-  static_assert((Base::kStages==2), "MmaPipelined requires kStages set to value 2");
-
-private:
-
-  using WarpFragmentA = typename Operator::FragmentA;
-  using WarpFragmentB = typename Operator::FragmentB;
-
-protected:
-
-  /// Iterator to write threadblock-scoped tile of A operand to shared memory
-  SmemIteratorA smem_iterator_A_;
-
-  /// Iterator to write threadblock-scoped tile of B operand to shared memory
-  SmemIteratorB smem_iterator_B_;
-
-public:
-
-  /// Construct from tensor references
-  CUTLASS_DEVICE
-  DepthwiseFpropPipelined(
-    typename Base::SharedStorage &shared_storage,       ///< Shared storage needed for internal use by threadblock-scoped GEMM
-    int thread_idx,                                     ///< ID within the threadblock
-    int warp_idx,                                       ///< ID of warp
-    int lane_idx                                        ///< ID of each thread within a warp
-  ):
-    Base(shared_storage, thread_idx, warp_idx, lane_idx),
-    smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
-    smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) {
-
-    // Compute warp location within threadblock tile by mapping the warp_id to
-    // three coordinates:
-    //   _m: the warp's position within the threadblock along the M dimension
-    //   _n: the warp's position within the threadblock along the N dimension
-    //   _k: the warp's position within the threadblock along the K dimension
-
-    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
-    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
-
-    int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
-    int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
-
-    // Add per-warp offsets in units of warp-level tiles
-    this->warp_tile_iterator_A_.add_tile_offset({warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
-    this->warp_tile_iterator_B_.add_tile_offset({Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
-  }
-
-  /// Perform a threadblock-scoped matrix multiply-accumulate
-  CUTLASS_DEVICE
-  void operator()(
-    int gemm_k_iterations,                            ///< number of iterations of the mainloop
-    FragmentC &accum,                                 ///< destination accumulator tile
-    IteratorA iterator_A,                             ///< iterator over A operand in global memory
-    IteratorB iterator_B,                             ///< iterator over B operand in global memory
-    FragmentC const &src_accum,                       ///< source accumulator tile
-    int gemm_k_iterations_per_channel = 0,            ///< number of iterations per channel
-    TransformA transform_A = TransformA(),            ///< transformation applied to A fragment
-    TransformB transform_B = TransformB()) {          ///< transformation applied to B fragment
-
-    //
-    // Prologue
-    //
-
-    // Perform accumulation in the 'd' output operand
-    accum = src_accum;
-
-    FragmentA tb_frag_A;
-    FragmentB tb_frag_B;
-
-    tb_frag_A.clear();
-    tb_frag_B.clear();
-
-    // The last kblock is loaded in the prolog
-    iterator_A.load(tb_frag_A);
-    iterator_B.load(tb_frag_B);
-
-    ++iterator_A;
-    ++iterator_B;
-
-    this->smem_iterator_A_.store(transform_A(tb_frag_A));
-    this->smem_iterator_B_.store(transform_B(tb_frag_B));
-
-    ++this->smem_iterator_A_;
-    ++this->smem_iterator_B_;
-
-    __syncthreads();
-
-    // Pair of fragments used to overlap shared memory loads and math instructions
-    WarpFragmentA warp_frag_A[2];
-    WarpFragmentB warp_frag_B[2];
-
-    this->warp_tile_iterator_A_.set_kgroup_index(0);
-    this->warp_tile_iterator_B_.set_kgroup_index(0);
-
-    this->warp_tile_iterator_A_.load(warp_frag_A[0]);
-    this->warp_tile_iterator_B_.load(warp_frag_B[0]);
-
-    ++this->warp_tile_iterator_A_;
-    ++this->warp_tile_iterator_B_;
-
-    Operator warp_mma;
-
-    int smem_write_stage_idx = 1;
-    // Depthwise specific
-    int channel_start_index = 0;
-    int rs_plane_idx = 0;
-
-    // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing 
-    // shared memory loads (which have the tightest latency requirement).
-
-    //
-    // Mainloop
-    //
-
-    // Note: The main loop does not support Base::kWarpGemmIterations == 2.
-    CUTLASS_GEMM_LOOP
-    for (; gemm_k_iterations > 0; --gemm_k_iterations) {
-      //
-      // Loop over GEMM K dimension
-      //
-
-      if(rs_plane_idx == gemm_k_iterations_per_channel - 1){
-        // Reset interation index.
-        iterator_B.set_iteration_index(0);
-      }
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) {
-
-        // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group
-        // as the case may be.
-
-        if (warp_mma_k == Base::kWarpGemmIterations - 1) {
-
-          // Write fragments to shared memory
-          this->smem_iterator_A_.store(transform_A(tb_frag_A));
-
-          this->smem_iterator_B_.store(transform_B(tb_frag_B));
-
-          __syncthreads();
-          
-          if(rs_plane_idx == gemm_k_iterations_per_channel - 1){
-            // Move to next set of filter groups.
-            channel_start_index += Base::kWarpGemmIterations;
-          }
-
-          ++this->smem_iterator_A_;
-          ++this->smem_iterator_B_;
-
-          // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
-          if (smem_write_stage_idx == 1) {
-            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
-            this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
-          }
-          else {
-            this->warp_tile_iterator_A_.add_tile_offset(
-                {0, -Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations});
-            this->warp_tile_iterator_B_.add_tile_offset(
-                {-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations,
-                 0});
-          }
-
-          smem_write_stage_idx ^= 1;
-        }
-
-        this->warp_tile_iterator_A_.set_kgroup_index(channel_start_index + (warp_mma_k + 1) % Base::kWarpGemmIterations);
-        this->warp_tile_iterator_B_.set_kgroup_index(channel_start_index + (warp_mma_k + 1) % Base::kWarpGemmIterations);
-        
-        this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k + 1) % 2]);
-        this->warp_tile_iterator_B_.load(warp_frag_B[(warp_mma_k + 1) % 2]);
-
-        ++this->warp_tile_iterator_A_;
-        ++this->warp_tile_iterator_B_;
-
-        if (warp_mma_k == 0) {
-
-          iterator_A.load(tb_frag_A);
-          iterator_B.load(tb_frag_B);
-    
-          ++iterator_A;
-          ++iterator_B;
-        }
-
-        warp_mma(accum, warp_frag_A[warp_mma_k % 2],
-                 warp_frag_B[warp_mma_k % 2], accum);
-      }
-
-      rs_plane_idx = (rs_plane_idx == gemm_k_iterations_per_channel - 1) ? 0: (rs_plane_idx + 1);
-
-    }
-
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_mma_base.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_mma_base.h
deleted file mode 100755
index 967587be0..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_mma_base.h
+++ /dev/null
@@ -1,229 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a directconv threadblock-scoped Depthwise kernel.
-*/
-
-#pragma once
-
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/arch/memory.h"
-#include "cutlass/array.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/numeric_types.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Policy object describing MmaTensorOp
-template <
-    /// Warp-level GEMM operator (concept: gemm::warp::Mma)
-    typename Operator_,
-    /// Padding used for A operand in shared memory (concept: MatrixShape)
-    typename SmemPaddingA_,
-    /// Padding used for B operand in shared memory (concept: MatrixShape)
-    typename SmemPaddingB_,
-    ///
-    typename ThreadMapA_,
-    ///
-    typename ThreadMapB_,
-    /// Number of partitions of K dimension of GEMM
-    int PartitionsK = 1>
-struct DepthwiseDirectConvMmaPolicy {
-  /// Warp-level GEMM operator (concept: gemm::warp::MmaTensorOp or gemm::warp::MmaSimt)
-  using Operator = Operator_;
-
-  /// Padding used for A operand in shared memory
-  using SmemPaddingA = SmemPaddingA_;
-
-  /// Padding used for B operand in shared memory
-  using SmemPaddingB = SmemPaddingB_;
-
-  using ThreadMapA = ThreadMapA_;
-  using ThreadMapB = ThreadMapB_;
-
-  /// Number of partitions of K dimension
-  static int const kPartitionsK = PartitionsK;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Structure to compute the matrix product targeting CUDA cores and SIMT math
-/// instructions.
-template <
-    /// Size of the Gemm problem - concept: gemm::GemmShape<>
-    typename Shape_,
-    /// Policy describing tuning details (concept: MmaPolicy)
-    typename Policy_,
-    /// Number of stages,
-    int Stages,
-    /// Used for partial specialization
-    typename Enable = bool>
-class DepthwiseDirectConvMmaBase {
- public:
-  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-
-  ///< Policy describing tuning details
-  using Policy = Policy_;
-
-  //
-  // Dependent types
-  //
-
-  /// Warp-level Mma
-  using Operator = typename Policy::Operator;
-
-  /// Shape describing the overall GEMM computed from shared memory
-  /// by each warp.
-  using WarpGemm = typename Policy::Operator::Shape;
-
-  /// Shape describing the number of warps filling the CTA
-  using WarpCount = cutlass::gemm::
-      GemmShape<Shape::kM / WarpGemm::kM, Shape::kN / WarpGemm::kN, Shape::kK / WarpGemm::kK>;
-
-  /// Number of warp-level GEMM oeprations
-  /// kWarpGemmIterations could be even and odd. 
-  static int const kWarpGemmIterations = (WarpGemm::kK / Operator::Policy::MmaShape::kK);
-
-  /// Number of stages
-  static int const kStages = Stages;
-
-  /// Tensor reference to the A operand
-  using TensorRefA = TensorRef<typename Operator::ElementA, typename Operator::LayoutA>;
-
-  /// Tensor reference to the B operand
-  using TensorRefB = TensorRef<typename Operator::ElementB, typename Operator::LayoutB>;
-
-  static_assert(kWarpGemmIterations > 1,
-                "The pipelined structure requires at least two warp-level "
-                "GEMM operations.");
-
-  //
-  // Nested structs
-  //
-
-  /// Shared storage object needed by threadblock-scoped GEMM
-  class SharedStorage {
-   public:
-    //
-    // Type definitions
-    //
-
-    /// Shape of the A matrix operand in shared memory
-    using ShapeA = MatrixShape<1,  // Not determined at compile-time :(
-                               Shape::kN + Policy::SmemPaddingA::kRow>;
-
-    /// Shape of the B matrix operand in shared memory
-    using ShapeB = MatrixShape<Policy::ThreadMapB::StorageShape::kStrided +
-                                   Policy::SmemPaddingB::kRow,  // filter_rs_size
-                               Policy::ThreadMapB::StorageShape::kContiguous +
-                                   Policy::SmemPaddingB::kColumn>;  // Tile N = 64?
-
-   public:
-    //
-    // Data members
-    //
-
-    // Let persistent B matrix in front of dynamic matrix A
-    /// Buffer for B operand
-    AlignedBuffer<typename Operator::ElementB, ShapeB::kCount> operand_B;
-
-    /// Buffer for A operand
-    /// Not be determined at compile-time -- Just to get a Smem start address.
-    AlignedBuffer<typename Operator::ElementA, 1> operand_A;  
-   public:
-    //
-    // Methods
-    //
-
-    /// Returns a layout object for the A matrix
-    CUTLASS_DEVICE
-    static typename Operator::LayoutA LayoutA() {
-      return Operator::LayoutA::packed({ShapeA::kRow, ShapeA::kColumn});
-    }
-
-    /// Returns a layout object for the B matrix
-    CUTLASS_HOST_DEVICE
-    static typename Operator::LayoutB LayoutB() {
-      return Operator::LayoutB::packed({ShapeB::kRow, ShapeB::kColumn});
-    }
-
-    /// Returns a TensorRef to the A operand
-    CUTLASS_HOST_DEVICE
-    TensorRefA operand_A_ref() { return TensorRefA{operand_A.data(), LayoutA()}; }
-
-    /// Returns a TensorRef to the B operand
-    CUTLASS_HOST_DEVICE
-    TensorRefB operand_B_ref() { return TensorRefB{operand_B.data(), LayoutB()}; }
-  };
-
- protected:
-  //
-  // Data members
-  //
-
-  /// Iterator to load a warp-scoped tile of A operand from shared memory
-  typename Operator::IteratorA warp_tile_iterator_A_;
-
-  /// Iterator to load a warp-scoped tile of B operand from shared memory
-  typename Operator::IteratorB warp_tile_iterator_B_;
-
- public:
-  /// Construct from tensor references
-  CUTLASS_DEVICE
-  DepthwiseDirectConvMmaBase(
-      ///< Shared storage needed for internal use by threadblock-scoped GEMM
-      SharedStorage &shared_storage,
-      ///< ID within the threadblock
-      int thread_idx,
-      ///< ID of warp
-      int warp_idx,
-      ///< ID of each thread within a warp
-      int lane_idx)
-      : warp_tile_iterator_A_(shared_storage.operand_A_ref(), lane_idx),
-        warp_tile_iterator_B_(shared_storage.operand_B_ref(), lane_idx) {}
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace conv
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_mma_core_with_lane_access_size.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_mma_core_with_lane_access_size.h
deleted file mode 100755
index de84180f3..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_mma_core_with_lane_access_size.h
+++ /dev/null
@@ -1,952 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines basic properties needed by CTA-level GEMMs assuming expectations about data
-      layout of the global memory fragments, data types, and internal tile sizes.
-
-      Partial specializations for threadblock::Mma operations targeting depthwise related simt instructions.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-
-#include "cutlass/gemm/warp/mma.h"
-
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/warp/mma_depthwise_simt.h"
-
-#include "cutlass/gemm/threadblock/mma_pipelined.h"
-#include "cutlass/gemm/threadblock/mma_singlestage.h"
-
-#include "cutlass/gemm/threadblock/mma_base.h"
-#include "cutlass/conv/threadblock/depthwise_mma_base.h"
-
-#include "cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear_direct_conv.h"
-
-#include "cutlass/arch/cache_operation.h" 
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-namespace detail {
-//
-// Convert a WarpShapeM which is the whole tile of elements into the number of elements (2D) held by
-// each partitions within warp. 
-// The goal is for each thread's tile of elements to be as square as
-// possible for performance (4x4 will be faster than 2x8).
-template<int WarpShapeM,  // The number of elements (1D) contained in the entire warp
-         int WarpNumThreadsM> // The number of partitions within the warp
-struct SimtWarpShape {
-  // kP * kQ * WarpNumThreadsM = WarpShapeM
-  // If needed, enable more specializations.
-};
-template <>
-struct SimtWarpShape<4, 4> {
-  static constexpr int kP = 1;
-  static constexpr int kQ = 1;
-};
-
-template <>
-struct SimtWarpShape<4, 2> {
-  static constexpr int kP = 2;
-  static constexpr int kQ = 1;
-};
-
-template <>
-struct SimtWarpShape<4, 1> {
-  static constexpr int kP = 2;
-  static constexpr int kQ = 2;
-};
-
-template <>
-struct SimtWarpShape<8, 1> {
-  static constexpr int kP = 2;
-  static constexpr int kQ = 4;
-};
-template <>
-struct SimtWarpShape<8, 2> {
-  static constexpr int kP = 2;
-  static constexpr int kQ = 2;
-};
-template <>
-struct SimtWarpShape<8, 4> {
-  static constexpr int kP = 1;
-  static constexpr int kQ = 2;
-};
-
-template <>
-struct SimtWarpShape<16, 1> {
-  static constexpr int kP = 4;
-  static constexpr int kQ = 4;
-};
-template <>
-struct SimtWarpShape<16, 2> {
-  static constexpr int kP = 2;
-  static constexpr int kQ = 4;
-};
-template <>
-struct SimtWarpShape<16, 4> {
-  static constexpr int kP = 2;
-  static constexpr int kQ = 2;
-};
-
-template <int WarpNumThreadsM>
-struct SimtWarpShape<25, WarpNumThreadsM> {
-  static_assert(WarpNumThreadsM == 1, "WarpShapeM could not be evenly splited by threads");
-  static constexpr int kP = 5;
-  static constexpr int kQ = 5;
-};
-
-template <>
-struct SimtWarpShape<32, 1> {
-  static constexpr int kP = 4;
-  static constexpr int kQ = 8;
-};
-
-template <>
-struct SimtWarpShape<32, 2> {
-  static constexpr int kP = 4;
-  static constexpr int kQ = 4;
-};
-
-template <>
-struct SimtWarpShape<32, 4> {
-  static constexpr int kP = 2;
-  static constexpr int kQ = 4;
-};
-
-}  // namespace detail
-
-template <
-    /// Shape of threadblock-scoped matrix multiply operator
-    typename Shape,
-    /// Shape of warp-level matrix multiply operator
-    typename WarpShape,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape,
-    /// Element data type of A operand
-    typename ElementA,
-    /// Layout of operand A
-    typename LayoutA,
-    /// Element data type of B operand
-    typename ElementB,
-    /// Layout of operand B
-    typename LayoutB,
-    /// Data type of accumulator
-    typename ElementC,
-    /// Layout of accumulator
-    typename LayoutC,
-    /// Indicates type of math operator (arch::OpClassSimt or arch::OpClassTensorOp)
-    typename OperatorClass,
-    /// Size of a warp-scoped per thread access
-    int kLaneAccessSizeA_ = 0,
-    /// Size of a warp-scoped per thread access 
-    int kLaneAccessSizeB_ = 0,
-    /// Number of stages
-    int Stages = 2,
-    /// Operation performed by MMA
-    typename Operator = typename platform::conditional<
-        (platform::is_same<OperatorClass,
-                           cutlass::arch::OpClassTensorOp>::value) &&
-            (platform::is_same<ElementA, int8_t>::value ||
-             platform::is_same<ElementA, int4b_t>::value ||
-             platform::is_same<ElementA, uint8_t>::value ||
-             platform::is_same<ElementA, uint4b_t>::value),
-        cutlass::arch::OpMultiplyAddSaturate,
-        cutlass::arch::OpMultiplyAdd>::type,
-    /// Store the accumulators in row major or column major.  Row major is used
-    /// when output layout is interleaved.
-    bool AccumulatorsInRowMajor = false,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA =
-        cutlass::arch::CacheOperation::Global,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB =
-        cutlass::arch::CacheOperation::Global,
-    /// per-element transformation for elements of A
-    ComplexTransform TransformA = ComplexTransform::kNone,
-    /// per-element transformation for elements of B
-    ComplexTransform TransformB = ComplexTransform::kNone,
-    bool IsComplex = false // (is_complex<ElementA>::value || is_complex<ElementB>::value)
->
-struct DepthwiseMmaCoreWithLaneAccessSize;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Shape of threadblock-scoped matrix multiply operator
-    typename Shape,
-    /// Shape of threadblock-scoped output tile 
-    typename ThreadBlockOutputShape,
-    /// Shape of filter shape per threadblock
-    typename FilterShape,
-    /// Shape of warp-level matrix multiply operator
-    typename WarpShape,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape,
-    /// Element data type of A operand
-    typename ElementA,
-    /// Layout of operand A
-    typename LayoutA,
-    /// Element data type of B operand
-    typename ElementB,
-    /// Layout of operand B
-    typename LayoutB,
-    /// Data type of accumulator
-    typename ElementC,
-    /// Layout of accumulator
-    typename LayoutC,
-    /// Indicates type of math operator (arch::OpClassSimt or arch::OpClassTensorOp)
-    typename OperatorClass,
-    /// Size of a warp-scoped per thread access
-    int kLaneAccessSizeA_ = 0,
-    /// Size of a warp-scoped per thread access 
-    int kLaneAccessSizeB_ = 0,
-    /// Number of stages
-    int Stages = 2,
-    /// Operation performed by MMA
-    typename Operator = typename platform::conditional<
-        (platform::is_same<OperatorClass,
-                           cutlass::arch::OpClassTensorOp>::value) &&
-            (platform::is_same<ElementA, int8_t>::value ||
-             platform::is_same<ElementA, int4b_t>::value ||
-             platform::is_same<ElementA, uint8_t>::value ||
-             platform::is_same<ElementA, uint4b_t>::value),
-        cutlass::arch::OpMultiplyAddSaturate,
-        cutlass::arch::OpMultiplyAdd>::type,
-    /// Iterator algo type
-    conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kAnalytic,
-    /// Stride ( MatrixShape<Height, Width> )
-    typename StrideShape = cutlass::MatrixShape<-1, -1>,   
-    /// Dilation ( MatrixShape<Height, Width> )
-    typename DilationShape =  cutlass::MatrixShape<-1, -1>,
-    /// Activation Shape loaded by threadblock
-    typename ActivationShape = cutlass::conv::TensorNHWCShape<-1,-1,-1,-1>,
-    /// Store the accumulators in row major or column major.  Row major is used
-    /// when output layout is interleaved.
-    bool AccumulatorsInRowMajor = false,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA =
-        cutlass::arch::CacheOperation::Global,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB =
-        cutlass::arch::CacheOperation::Global,
-    /// per-element transformation for elements of A
-    ComplexTransform TransformA = ComplexTransform::kNone,
-    /// per-element transformation for elements of B
-    ComplexTransform TransformB = ComplexTransform::kNone,
-    bool IsComplex = false // (is_complex<ElementA>::value || is_complex<ElementB>::value)
->
-struct DepthwiseDirectConvMmaCoreWithLaneAccessSize;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Shape of threadblock-scoped matrix multiply operator
-    typename Shape,
-    /// Shape of warp-level matrix multiply operator
-    typename WarpShape,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape,
-    /// Element data type of A operand
-    typename ElementA,
-    /// Layout of operand A
-    typename LayoutA,
-    /// Element data type of B operand
-    typename ElementB,
-    /// Layout of operand B
-    typename LayoutB,
-    /// Data type of accumulator
-    typename ElementC,
-    /// Layout of accumulator
-    typename LayoutC,
-    /// Indicates type of math operator (arch::OpClassSimt or arch::OpClassTensorOp)
-    typename OperatorClass,
-    /// Number of stages
-    int Stages,
-    /// Operation performed by MMA
-    typename Operator,
-    /// Store the accumulators in row major or column major.  Row major is used
-    /// when output layout is interleaved.
-    bool AccumulatorsInRowMajor,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB,
-    /// per-element transformation for elements of A
-    ComplexTransform TransformA,
-    /// per-element transformation for elements of B
-    ComplexTransform TransformB,
-    bool IsComplex
->
-struct DepthwiseMmaCoreWithLaneAccessSize<
-    Shape, WarpShape, InstructionShape,
-    ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
-    OperatorClass, -1, -1, Stages, Operator, AccumulatorsInRowMajor,
-    CacheOpA, CacheOpB, TransformA, TransformB, IsComplex
-> : cutlass::gemm::threadblock::DefaultMmaCore<
-    Shape, WarpShape, InstructionShape,
-    ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
-    OperatorClass, Stages, Operator, AccumulatorsInRowMajor,
-    CacheOpA, CacheOpB, TransformA, TransformB, IsComplex
-> {};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: row-major
-///   B: column-major
-///   Operator: simt class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Size of a warp-scoped per thread access (a value of -1 indicates the default)
-    int kLaneAccessSizeA_,
-    /// Size of a warp-scoped per thread access (a value of -1 indicates the default)
-    int kLaneAccessSizeB_,
-    /// Operation performed by GEMM
-    typename Operator_>
-struct DepthwiseMmaCoreWithLaneAccessSize<Shape_,
-                                        WarpShape_,
-                                        cutlass::gemm::GemmShape<1, 1, 1>,
-                                        ElementA_,
-                                        layout::RowMajor,
-                                        ElementB_,
-                                        layout::ColumnMajor,
-                                        ElementC_,
-                                        LayoutC_,
-                                        arch::OpClassSimt,
-                                        kLaneAccessSizeA_,
-                                        kLaneAccessSizeB_,
-                                        2,
-                                        Operator_> : public cutlass::gemm::threadblock::DefaultMmaCore<Shape_,
-                                                                           WarpShape_,
-                                                                           cutlass::gemm::GemmShape<1, 1, 1>,
-                                                                           ElementA_,
-                                                                           layout::RowMajor,
-                                                                           ElementB_,
-                                                                           layout::ColumnMajor,
-                                                                           ElementC_,
-                                                                           LayoutC_,
-                                                                           arch::OpClassSimt,
-                                                                           2,
-                                                                           Operator_> {
-  using Base = cutlass::gemm::threadblock::DefaultMmaCore<Shape_,
-                              WarpShape_,
-                              cutlass::gemm::GemmShape<1, 1, 1>,
-                              ElementA_,
-                              layout::RowMajor,
-                              ElementB_,
-                              layout::ColumnMajor,
-                              ElementC_,
-                              LayoutC_,
-                              arch::OpClassSimt,
-                              2,
-                              Operator_>;
-
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
-  using ElementA = ElementA_;
-  using LayoutA = layout::RowMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::ColumnMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using OperatorClass = arch::OpClassSimt;
-
-  static int const kLaneAccessSizeA = kLaneAccessSizeA_;
-  static int const kLaneAccessSizeB = kLaneAccessSizeB_;
-
-  // Divisility requirements
-  static_assert( kLaneAccessSizeA > 0 && kLaneAccessSizeB > 0,
-    "Size of a warp-scoped per thread access should be larger then ZERO" );
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  /// Number of warps present
-  using WarpCount = typename Base::WarpCount;
-
-  // Divisility requirements
-  static_assert(
-    !(Shape::kM % WarpShape::kM) &&
-    !(Shape::kN % WarpShape::kN),
-    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
-  );
-
-  /// Number of threads per warp
-  static int const kWarpSize = cutlass::gemm::warp::WarpSize<arch::OpClassSimt>::value;
-
-  static int const kElementsPerAccess = 1;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::ColumnMajor;
-  using SmemLayoutB = layout::RowMajor;
-
-  //
-  // Iterators to write to shared memory are same as base class
-  //
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level op
-  static const int WarpNumThreadsM = cutlass::gemm::threadblock::detail::simt_get_warp_threads_m<WarpShape>(); 
-  static const int WarpNumThreadsN = kWarpSize / WarpNumThreadsM;
-  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
-  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
-  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
-      "WarpShape must be divisible by ThreadTile shape.");
-  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
-  static const int numElementsA = kLaneAccessSizeA / sizeof_bits<ElementA>::value;
-  static const int numElementsB = kLaneAccessSizeB / sizeof_bits<ElementB>::value;
-  static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM);
-  static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN);
-
-  static int const kPaddingM = cutlass::gemm::threadblock::detail::simt_transpose_padding(kWarpSize, Shape::kK, sizeof_bits<ElementA>::value);
-  static int const kPaddingN = cutlass::gemm::threadblock::detail::simt_transpose_padding(kWarpSize, Shape::kK, sizeof_bits<ElementB>::value);
-
-  static_assert(!(kPaddingM % LaneM) && !(kPaddingN % LaneN),
-                "Padding must be divisible by Lane");
-
-  // these should have max of thread tile also
-  using LaneMmaShape = cutlass::gemm::GemmShape<
-      LaneM,
-      LaneN,
-      1>;
-  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
-      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
-      cutlass::layout::RowMajorInterleaved<LaneLayout>,         // LaneLayout
-      LaneMmaShape
-  >;
-
-  using MmaWarpSimt = cutlass::conv::warp::MmaDepthwiseSimt<
-      WarpShape,      /// Size of the Gemm problem - concept: gemm::GemmShape<>
-      ElementA,       /// Data type of A elements
-      SmemLayoutA,    /// Layout of A matrix (concept: MatrixLayout)
-      ElementB,       /// Data type of B elements
-      SmemLayoutB,    /// Layout of B matrix (concept: MatrixLayout)
-      ElementC,       /// Element type of C matrix
-      LayoutC,        /// Layout of C matrix (concept: MatrixLayout)
-      Policy          /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy)
-  >;
-
-  /// Policy used to define MmaPipelined 
-  using MmaPolicy = cutlass::gemm::threadblock::MmaPolicy<
-    MmaWarpSimt,
-    MatrixShape<kPaddingM, 0>,    // skew for A matrix to avoid SMEM bank conflicts
-    MatrixShape<0, kPaddingN>,    // skew for B matrix to avoid SMEM bank conflicts
-    WarpCount::kK
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: row-major
-///   B: row-major
-///   Operator: simt class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of threadblock-scoped output tile (concept: TensorNHWCShape)
-    typename ThreadBlockOutputShape_,
-    /// Shape of filter shape per threadblock
-    typename FilterShape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Size of a warp-scoped per thread access
-    int kLaneAccessSizeA_,
-    /// Number of stages
-    int Stages_,
-    /// Operation performed by GEMM
-    typename Operator_>
-struct DepthwiseDirectConvMmaCoreWithLaneAccessSize<Shape_,
-                                                    ThreadBlockOutputShape_,
-                                                    FilterShape_,
-                                                    WarpShape_,
-                                                    cutlass::gemm::GemmShape<1, 1, 1>,
-                                                    ElementA_,
-                                                    layout::RowMajor,
-                                                    ElementB_,
-                                                    layout::ColumnMajor,
-                                                    ElementC_,
-                                                    LayoutC_,
-                                                    arch::OpClassSimt,
-                                                    kLaneAccessSizeA_,
-                                                    128,
-                                                    Stages_,
-                                                    Operator_> {
-  using Shape = Shape_;
-  using FilterShape = FilterShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
-  using ElementA = ElementA_;
-  using LayoutA = layout::RowMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::ColumnMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using OperatorClass = arch::OpClassSimt;
-
-  static int const kLaneAccessSizeB = 128;
-
-  // Divisility requirements
-  static_assert( kLaneAccessSizeB > 0,
-    "Size of a warp-scoped per thread access should be larger then ZERO" );
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  /// Number of warps present
-  using WarpCount = cutlass::gemm::GemmShape<
-    Shape::kM / WarpShape::kM,
-    Shape::kN / WarpShape::kN,
-    1
-  >;
-
-  // Divisility requirements
-  static_assert(
-    !(Shape::kM % WarpShape::kM) &&
-    !(Shape::kN % WarpShape::kN),
-    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
-  );
-
-  /// Number of threads per warp
-  static int const kWarpSize = cutlass::gemm::warp::WarpSize<arch::OpClassSimt>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-  
-  // For Gmem load
-  static int const kElementsPerAccessA = 128 / sizeof_bits<ElementA>::value;
-  static int const kElementsPerAccessB = 128 / sizeof_bits<ElementB>::value;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::RowMajor;
-  using SmemLayoutB = layout::RowMajor;
-
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kN, 1>, // Set kStrided = 1 because activation shape is runtime value.
-    kThreads,
-    kElementsPerAccessA
-  >;
-
-  /// ThreadMap of iterator A
-  using SmemThreadMapA = IteratorThreadMapA;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileAccessIteratorDirectConv<
-    MatrixShape<1, Shape::kN>, // set kRow is 1 because it is a runtime value
-    ElementA, 
-    SmemLayoutA,
-    0,
-    SmemThreadMapA, // was IteratorThreadMapA
-    true  // Dynamic iterations.
-  >;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kN, FilterShape::kCount>,
-    kThreads,
-    kElementsPerAccessB
-  >;
-
-  /// Transpose the ThreadMap of iterator B
-  using SmemThreadMapB = IteratorThreadMapB;
-
-  /// Shared memory iterator to B operand                                                  
-  using SmemIteratorB = transform::threadblock::RegularTileAccessIteratorDirectConv<
-    MatrixShape<FilterShape::kCount, Shape::kN>,
-    ElementB, 
-    SmemLayoutB,
-    0,
-    SmemThreadMapB, // was IteratorThreadMapB
-    false   // static iterations.
-  >;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-  // Groups per threads
-  // Fp32: 2 groups
-  // Fp16: 2 groups
-  static const int GroupsPerThread = sizeof(ElementB) > 1 ? 2 : 4;
-  // Define the warp-level op  
-  static const int WarpNumThreadsN = cutlass::const_min(WarpShape::kN / GroupsPerThread, kWarpSize);
-  static const int WarpNumThreadsM = kWarpSize / WarpNumThreadsN; 
-
-  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
-      "WarpShape must be divisible by ThreadTile shape.");
-
-  // Get output P, Q per thread
-  static const int TileP = cutlass::conv::threadblock::detail::SimtWarpShape<WarpShape::kM, WarpNumThreadsM>::kP;
-  static const int TileQ = cutlass::conv::threadblock::detail::SimtWarpShape<WarpShape::kM, WarpNumThreadsM>::kQ;
-
-  static const int LaneLayout = 1;
-  static const int numElementsB = kLaneAccessSizeB / sizeof_bits<ElementB>::value;
-  static const int LaneN = cutlass::const_min(numElementsB, WarpShape::kN / WarpNumThreadsN);
-  
-  // Define the output tile computed by each thread
-  using ThreadOutputShape = cutlass::conv::TensorNHWCShape<1, TileP, TileQ, LaneN>;
-
-  // Fetch the channel with same access size
-  static const int LaneM = LaneN;
-
-  // No paddings
-  static int const kPaddingM = 0;
-  static int const kPaddingN = 0;
-
-  static_assert(!(kPaddingM % LaneM) && !(kPaddingN % LaneN),
-                "Padding must be divisible by Lane");
-
-  // these should have max of thread tile also
-  using LaneMmaShape = cutlass::gemm::GemmShape<
-      LaneM,
-      LaneN,
-      1>;
-  
-  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
-      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
-      cutlass::layout::RowMajorInterleaved<LaneLayout>,         // LaneLayout
-      LaneMmaShape
-  >;
-
-  using MmaWarpSimt = cutlass::conv::warp::MmaDepthwiseDirectConvSimt<
-      WarpShape,      /// Size of the Gemm problem - concept: gemm::GemmShape<>
-      FilterShape,    /// Shape of filter shape per threadblock - concept: gemm::GemmShape<Depth, Height, Width>
-      ThreadOutputShape, /// Size of the output tile computed by thread - concept: conv::TensorNHWCShape<>
-      ThreadBlockOutputShape_, /// Size of the output tile computed by threadblock - concept: conv::TensorNHWCShape<>
-      ElementA,       /// Data type of A elements
-      SmemLayoutA,    /// Layout of A matrix (concept: MatrixLayout)
-      ElementB,       /// Data type of B elements
-      SmemLayoutB,    /// Layout of B matrix (concept: MatrixLayout)
-      ElementC,       /// Element type of C matrix
-      LayoutC,        /// Layout of C matrix (concept: MatrixLayout)
-      Policy          /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy)
-  >;
-
-  /// Policy used to define MmaPipelined 
-  using MmaPolicy = cutlass::conv::threadblock::DepthwiseDirectConvMmaPolicy<
-    MmaWarpSimt,
-    MatrixShape<kPaddingM, 0>,    // skew for A matrix to avoid SMEM bank conflicts
-    MatrixShape<0, kPaddingN>,    // skew for B matrix to avoid SMEM bank conflicts
-    IteratorThreadMapA,
-    IteratorThreadMapB,
-    WarpCount::kK
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: row-major
-///   B: row-major
-///   Operator: simt class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of threadblock-scoped output tile (concept: TensorNHWCShape)
-    typename ThreadBlockOutputShape_,
-    /// Shape of filter shape per threadblock
-    typename FilterShape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Size of a warp-scoped per thread access
-    int kLaneAccessSizeA_,
-    /// Number of stages
-    int Stages_,
-    /// Operation performed by GEMM
-    typename Operator_,
-    /// Stride ( MatrixShape<Height, Width> )
-    typename StrideShape_,   
-    /// Dilation ( MatrixShape<Height, Width> )
-    typename DilationShape_,
-    /// Activation Shape loaded by threadblock
-    typename ActivationShape_>
-struct DepthwiseDirectConvMmaCoreWithLaneAccessSize<Shape_,
-                                                    ThreadBlockOutputShape_,
-                                                    FilterShape_,
-                                                    WarpShape_,
-                                                    cutlass::gemm::GemmShape<1, 1, 1>,
-                                                    ElementA_,
-                                                    layout::RowMajor,
-                                                    ElementB_,
-                                                    layout::ColumnMajor,
-                                                    ElementC_,
-                                                    LayoutC_,
-                                                    arch::OpClassSimt,
-                                                    kLaneAccessSizeA_,
-                                                    128,
-                                                    Stages_,
-                                                    Operator_,
-                                                    IteratorAlgorithm::kFixedStrideDilation,
-                                                    StrideShape_,
-                                                    DilationShape_,
-                                                    ActivationShape_> {
-  using Shape = Shape_;
-  using FilterShape = FilterShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
-  using ElementA = ElementA_;
-  using LayoutA = layout::RowMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::ColumnMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using OperatorClass = arch::OpClassSimt;
-  using StrideShape = StrideShape_;
-  using DilationShape = DilationShape_; 
-  using ThreadBlockOutputShape = ThreadBlockOutputShape_;
-  using ActivationShape = ActivationShape_;
-
-  static int const kLaneAccessSizeB = 128;
-
-  // Divisility requirements
-  static_assert( kLaneAccessSizeB > 0,
-    "Size of a warp-scoped per thread access should be larger then ZERO" );
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  /// Number of warps present
-  using WarpCount = cutlass::gemm::GemmShape<
-    Shape::kM / WarpShape::kM,
-    Shape::kN / WarpShape::kN,
-    1
-  >;
-
-  // Divisility requirements
-  static_assert(
-    !(Shape::kM % WarpShape::kM) &&
-    !(Shape::kN % WarpShape::kN),
-    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
-  );
-
-  /// Number of threads per warp
-  static int const kWarpSize = cutlass::gemm::warp::WarpSize<arch::OpClassSimt>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-  
-  // For Gmem load
-  static int const kElementsPerAccessA = 128 / sizeof_bits<ElementA>::value;
-  static int const kElementsPerAccessB = 128 / sizeof_bits<ElementB>::value;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::RowMajor;
-  using SmemLayoutB = layout::RowMajor;
-
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<ActivationShape::kC, ActivationShape::kNHW>,
-    kThreads,
-    kElementsPerAccessA
-  >;
-
-  /// ThreadMap of iterator A
-  using SmemThreadMapA = IteratorThreadMapA;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileAccessIteratorDirectConv<
-    MatrixShape<ActivationShape::kNHW, ActivationShape::kC>,
-    ElementA,
-    SmemLayoutA,
-    0,
-    SmemThreadMapA, // was IteratorThreadMapA
-    false  // static iterations.
-  >;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kN, FilterShape::kCount>,
-    kThreads,
-    kElementsPerAccessB
-  >;
-
-  /// Transpose the ThreadMap of iterator B
-  using SmemThreadMapB = IteratorThreadMapB;
-
-  /// Shared memory iterator to B operand                                                  
-  using SmemIteratorB = transform::threadblock::RegularTileAccessIteratorDirectConv<
-    MatrixShape<FilterShape::kCount, Shape::kN>,
-    ElementB, 
-    SmemLayoutB,
-    0,
-    SmemThreadMapB, // was IteratorThreadMapB
-    false   // static iterations.
-  >;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-  // Groups per threads
-  // Fp32: 2 groups
-  // Fp16: 2 groups
-  static const int GroupsPerThread = sizeof(ElementB) > 1 ? 2 : 4;
-  // Define the warp-level op  
-  static const int WarpNumThreadsN = cutlass::const_min(WarpShape::kN / GroupsPerThread, kWarpSize);
-  static const int WarpNumThreadsM = kWarpSize / WarpNumThreadsN; 
-
-  static const int TileP = cutlass::conv::threadblock::detail::SimtWarpShape<WarpShape::kM, WarpNumThreadsM>::kP;
-  static const int TileQ = cutlass::conv::threadblock::detail::SimtWarpShape<WarpShape::kM, WarpNumThreadsM>::kQ;
-
-  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
-      "WarpShape must be divisible by ThreadTile shape.");
-
-  static const int LaneLayout = 1;
-  static const int numElementsB = kLaneAccessSizeB / sizeof_bits<ElementB>::value;
-  static const int LaneN = cutlass::const_min(numElementsB, WarpShape::kN / WarpNumThreadsN);
-  
-  // Define the output tile computed by each thread
-  using ThreadOutputShape = cutlass::conv::TensorNHWCShape<1, TileP, TileQ, LaneN>;
-
-  // Fetch the channel with same access size
-  static const int LaneM = LaneN;
-
-  // No paddings
-  static int const kPaddingM = 0;
-  static int const kPaddingN = 0;
-
-  static_assert(!(kPaddingM % LaneM) && !(kPaddingN % LaneN),
-                "Padding must be divisible by Lane");
-
-  // these should have max of thread tile also
-  using LaneMmaShape = cutlass::gemm::GemmShape<
-      LaneM,
-      LaneN,
-      1>;
-  
-  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
-      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
-      cutlass::layout::RowMajorInterleaved<LaneLayout>,         // LaneLayout
-      LaneMmaShape
-  >;
-
-  using MmaWarpSimt = cutlass::conv::warp::MmaDepthwiseDirectConvSimt<
-      WarpShape,      /// Size of the Gemm problem - concept: gemm::GemmShape<>
-      FilterShape,    /// Shape of filter shape per threadblock - concept: gemm::GemmShape<Depth, Height, Width>
-      ThreadOutputShape, /// Size of the output tile computed by thread - concept: conv::TensorNHWCShape<>
-      ThreadBlockOutputShape, /// Size of the output tile computed by threadblock - concept: conv::TensorNHWCShape<>
-      ElementA,       /// Data type of A elements
-      SmemLayoutA,    /// Layout of A matrix (concept: MatrixLayout)
-      ElementB,       /// Data type of B elements
-      SmemLayoutB,    /// Layout of B matrix (concept: MatrixLayout)
-      ElementC,       /// Element type of C matrix
-      LayoutC,        /// Layout of C matrix (concept: MatrixLayout)
-      Policy,          /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy)
-      IteratorAlgorithm::kFixedStrideDilation, /// Iterator algo type
-      StrideShape,   /// Stride ( MatrixShape<Height, Width> )
-      DilationShape,  /// Dilation ( MatrixShape<Height, Width> )
-      ActivationShape /// Activation Shape loaded by threadblock
-  >;
-
-  /// Policy used to define MmaPipelined 
-  using MmaPolicy = cutlass::conv::threadblock::DepthwiseDirectConvMmaPolicy<
-    MmaWarpSimt,
-    MatrixShape<kPaddingM, 0>,    // skew for A matrix to avoid SMEM bank conflicts
-    MatrixShape<0, kPaddingN>,    // skew for B matrix to avoid SMEM bank conflicts
-    IteratorThreadMapA,
-    IteratorThreadMapB,
-    WarpCount::kK
-  >;
-};
-} // namespace threadblock
-} // namespace conv
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/implicit_gemm_fprop_fusion_multistage.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/implicit_gemm_fprop_fusion_multistage.h
deleted file mode 100755
index 3bee07d0a..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/implicit_gemm_fprop_fusion_multistage.h
+++ /dev/null
@@ -1,802 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a multistage threadblock-scoped fused activation's 
-   scale+bias+relu and Implicit GEMM Convolution kernel.
-
-   The original implicit gemm will store out-of-bound data as zeroes in the
-   shared memory because zeros into the tensor core, zeroes out of the tensor
-   cores.  The result is remained the same.   When fusing scale+bias+relu
-   into the mainloop, it is no longer true because
-
-     0 x scale + bias = bias
-
-   which is no longer always 0.  So, instead of storing zeroes, this fused
-   kernel stores the out-of-bound data as a special NaN (0x7eff), when applying
-   scale+bias+relu, the code is like
-
-     if (data == 0x7eff)
-       data = 0;
-     else
-       data = scale+bias+relu(data, scale, bias);
-
-  See include/cutlass/conv/warp/scale_bias_relu_transformation.h for the 
-  elementwise computation.  See include/cutlass/arch/memory_sm80.h for nan fill.
-*/
-
-#pragma once
-
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/arch/memory.h"
-#include "cutlass/array.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/cache_operation.h"
-#include "cutlass/gemm/gemm.h"
-
-#include "cutlass/gemm/warp/scale_bias_tile_iterator.h"
-#include "cutlass/conv/warp/scale_bias_relu_transform.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/// Structure to compute the matrix product targeting CUDA cores and SIMT math
-/// instructions.
-template <
-    /// Size of the Gemm problem - concept: gemm::GemmShape<>
-    typename Shape_,
-    /// Element type of scale and bias vectors 
-    typename ElementScaleBias_,
-    /// Layout of scale and bias vectors
-    typename LayoutScaleBias_,
-    /// Policy describing tuning details (concept: MmaPolicy)
-    typename Policy_,
-    /// WarpIterator to load Scale or Bias vector from the shared memory
-    typename WarpIteratorScaleBias_,
-    /// Number of stages,
-    int Stages,
-    /// Used for partial specialization
-    typename Enable = bool>
-class MmaFpropFusionBase {
- public:
-  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-
-  ///< Element type of scale and bias vectors 
-  using ElementScaleBias = ElementScaleBias_;
-
-  /// Layout of scale and bias vectors
-  using LayoutScaleBias = LayoutScaleBias_;
-
-  ///< Policy describing tuning details
-  using Policy = Policy_;
-
-  ///< WarpIterator to load Scale or Bias vector from the shared memory
-  using WarpIteratorScaleBias = WarpIteratorScaleBias_;
-
-  //
-  // Dependent types
-  //
-
-  /// Warp-level Mma
-  using Operator = typename Policy::Operator;
-
-  /// Shape describing the overall GEMM computed from shared memory
-  /// by each warp.
-  using WarpGemm = typename Policy::Operator::Shape;
-
-  /// Shape describing the number of warps filling the CTA
-  using WarpCount = cutlass::gemm::GemmShape<Shape::kM / WarpGemm::kM,
-                                             Shape::kN / WarpGemm::kN,
-                                             Shape::kK / WarpGemm::kK>;
-
-  /// Number of warp-level GEMM oeprations
-  static int const kWarpGemmIterations =
-      (WarpGemm::kK / Operator::Policy::MmaShape::kK);
-
-  /// Number of stages
-  static int const kStages = Stages;
-
-  /// Tensor reference to the A operand
-  using TensorRefA = TensorRef<typename Operator::ElementA, typename Operator::LayoutA>;
-
-  /// Tensor reference to the scale and bias vectors
-  using TensorRefScaleBias = TensorRef<ElementScaleBias, LayoutScaleBias>;
-
-  /// Tensor reference to the B operand
-  using TensorRefB = TensorRef<typename Operator::ElementB, typename Operator::LayoutB>;
-
-  static_assert(kWarpGemmIterations > 1,
-                "The pipelined structure requires at least two warp-level "
-                "GEMM operations.");
-
-  static_assert((kWarpGemmIterations % 2) == 0,
-                "Inner loop iteration must be an even number.");
-
-  //
-  // Nested structs
-  //
-
-  /// Shared storage object needed by threadblock-scoped GEMM
-  class SharedStorage {
-   public:
-    //
-    // Type definitions
-    //
-
-    /// Shape of the A matrix operand in shared memory
-    using ShapeA = MatrixShape<Shape::kM + Policy::SmemPaddingA::kRow,
-                               Shape::kK * kStages +
-                                   Policy::SmemPaddingA::kColumn>;
-
-    /// Shape of the A scale and bias vectors in shared memory
-    using ShapeScaleBias =
-        MatrixShape<1 + Policy::SmemPaddingA::kRow,
-                    2 * Shape::kK * kStages + Policy::SmemPaddingA::kColumn>;
-
-    /// Shape of the B matrix operand in shared memory
-    using ShapeB =
-        MatrixShape<Shape::kK * kStages + Policy::SmemPaddingB::kRow,
-                    Shape::kN + Policy::SmemPaddingB::kColumn>;
-
-   public:
-    //
-    // Data members
-    //
-
-    /// Buffer for A operand
-    AlignedBuffer<typename Operator::ElementA, ShapeA::kCount> operand_A;
-
-    /// Buffer for B operand
-    AlignedBuffer<typename Operator::ElementB, ShapeB::kCount> operand_B;
-
-    /// Buffer for A operand Scale and Bias
-    AlignedBuffer<ElementScaleBias, ShapeScaleBias::kCount> operand_A_scale_bias;
-
-   public:
-
-    //
-    // Methods
-    //
-
-    /// Returns a layout object for the A matrix
-    CUTLASS_DEVICE
-    static typename Operator::LayoutA LayoutA() {
-      return Operator::LayoutA::packed({ShapeA::kRow, ShapeA::kColumn});
-    }
-
-    /// Returns a layout object for the B matrix
-    CUTLASS_HOST_DEVICE
-    static typename Operator::LayoutB LayoutB() {
-      return Operator::LayoutB::packed({ShapeB::kRow, ShapeB::kColumn});
-    }
-
-    /// Returns a layout object for the A scale and bias vectors
-    CUTLASS_DEVICE
-    static LayoutScaleBias LayoutScaleBias() {
-      return LayoutScaleBias::packed(
-          {ShapeScaleBias::kRow, ShapeScaleBias::kColumn});
-    }
-
-    /// Returns a TensorRef to the A operand
-    CUTLASS_HOST_DEVICE
-    TensorRefA operand_A_ref() {
-      return TensorRefA{operand_A.data(), LayoutA()};
-    }
-
-    /// Returns a TensorRef to the B operand
-    CUTLASS_HOST_DEVICE
-    TensorRefB operand_B_ref() {
-      return TensorRefB{operand_B.data(), LayoutB()};
-    }
-
-    /// Returns a TensorRef to the A operand Scale vector
-    CUTLASS_HOST_DEVICE
-    TensorRefScaleBias operand_A_scale_bias_ref() {
-      return TensorRefScaleBias{operand_A_scale_bias.data(), LayoutScaleBias()};
-    }
-  };
-
- protected:
-
-  //
-  // Data members
-  //
-
-  /// Iterator to load a warp-scoped tile of A operand from shared memory
-  typename Operator::IteratorA warp_tile_iterator_A_;
-
-  /// Iterator to load a warp-scoped tile of A operand scale and bias vector
-  /// from shared memory
-  WarpIteratorScaleBias warp_tile_iterator_A_scale_bias_;
-
-  /// Iterator to load a warp-scoped tile of B operand from shared memory
-  typename Operator::IteratorB warp_tile_iterator_B_;
-
-public:
-
-  /// Construct from tensor references
-  CUTLASS_DEVICE
-  MmaFpropFusionBase(
-      ///< Shared storage needed for internal use by threadblock-scoped GEMM
-      SharedStorage &shared_storage,
-      ///< ID within the threadblock
-      int thread_idx,
-      ///< ID of warp
-      int warp_idx,
-      ///< ID of each thread within a warp
-      int lane_idx)
-      : warp_tile_iterator_A_(shared_storage.operand_A_ref(), lane_idx),
-        warp_tile_iterator_A_scale_bias_(
-            shared_storage.operand_A_scale_bias_ref(), lane_idx),
-        warp_tile_iterator_B_(shared_storage.operand_B_ref(), lane_idx) {}
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Structure to compute the matrix product targeting CUDA cores and SIMT math
-/// instructions.
-template <
-    /// Size of the Gemm problem - concept: gemm::GemmShape<>
-    typename Shape_,
-    /// Iterates over tiles of A operand in global memory
-    //  (concept: ReadableTileIterator | ForwardTileIterator |
-    //  MaskedTileIterator)
-    typename IteratorA_,
-    /// Iterates over tiles of A operand in shared memory
-    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-    typename SmemIteratorA_,
-    /// Cache operation for operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Iterates over tiles of B operand in global memory
-    //  (concept: ReadableTileIterator | ForwardTileIterator |
-    //  MaskedTileIterator)
-    typename IteratorB_,
-    /// Iterates over tiles of B operand in shared memory
-    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-    typename SmemIteratorB_,
-    /// Cache operation for operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB,
-    /// Iterates over vectors of scale and bias vector in global memory
-    //  (concept: ReadableTileIterator | ForwardTileIterator |
-    //  MaskedTileIterator)
-    typename IteratorScaleBias_,
-    /// Iterates over vectors of scale and bias vector in shared memory
-    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-    typename SmemIteratorScaleBias_,
-    /// Cache operation for scale/bias operand 
-    cutlass::arch::CacheOperation::Kind CacheOpScaleBias,
-    /// Policy describing tuning details (concept: MmaPolicy)
-    typename Policy_,
-    /// WarpIterator to load Scale or Bias vector from the shared memory
-    typename WarpIteratorScaleBias_,
-    /// Number of stages,
-    int Stages,
-    /// Used for partial specialization
-    typename Enable = bool>
-class ImplicitGemmFpropFusionMultistage
-    : public MmaFpropFusionBase<Shape_, typename IteratorScaleBias_::Element,
-                       typename IteratorScaleBias_::Layout, Policy_,
-                       WarpIteratorScaleBias_, Stages> {
- public:
-  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-  ///< Iterates over tiles of A operand in global memory
-  using IteratorA = IteratorA_;
-  ///< Iterates over tiles of B operand in global memory
-  using IteratorB = IteratorB_;
-  ///< Iterates over tiles of the scale and bias vectors in global memory
-  using IteratorScaleBias = IteratorScaleBias_;
-  ///< WarpIterator to load Scale or Bias vector from the shared memory
-  using WarpIteratorScaleBias = WarpIteratorScaleBias_;
-  ///< Policy describing tuning details
-  using Policy = Policy_;
-  ///< Base class
-  using Base = MmaFpropFusionBase<Shape_, typename IteratorScaleBias::Element,
-                         typename IteratorScaleBias::Layout, Policy,
-                         WarpIteratorScaleBias, Stages>;
-
-  using SmemIteratorA = SmemIteratorA_;
-  using SmemIteratorB = SmemIteratorB_;
-  using SmemIteratorScaleBias = SmemIteratorScaleBias_;
-
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpScaleBias =
-      CacheOpScaleBias;
-
-  //
-  // Dependent types
-  //
-
-  /// Fragment of accumulator tile
-
-  using ElementC = typename Policy::Operator::ElementC;
-  using FragmentC = typename Policy::Operator::FragmentC;
-
-  /// Warp-level Mma
-  using Operator = typename Policy::Operator;
-  
-  /// Internal structure exposed for introspection.
-  struct Detail {
-
-    static_assert(Base::kWarpGemmIterations > 1,
-                  "The pipelined structure requires at least two warp-level "
-                  "GEMM operations.");
-
-    /// Number of cp.async instructions to load one stage of operand A
-    static int const AsyncCopyIterationsPerStageA =
-        IteratorA::ThreadMap::Iterations::kCount;
-
-    /// Number of cp.async instructions to load one stage of operand B
-    static int const AsyncCopyIterationsPerStageB =
-        IteratorB::ThreadMap::Iterations::kCount;
-
-    /// Number of stages
-    static int const kStages = Stages;
-
-    /// Number of cp.async instructions to load on group of operand A
-    static int const kAccessesPerGroupA =
-        (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
-
-    /// Number of cp.async instructions to load on group of operand B
-    static int const kAccessesPerGroupB =
-        (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
-  };
-
- private:
-
-  using WarpLoadedFragmentA = typename Operator::FragmentA;
-  using WarpLoadedFragmentB = typename Operator::FragmentB;
-  using WarpLoadedFragmentScaleBias =
-      typename WarpIteratorScaleBias::Fragment;
-
-  using WarpTransformedFragmentA = typename Operator::TransformedFragmentA;
-  using WarpTransformedFragmentB = typename Operator::TransformedFragmentB;
-
- private:
-
-  //
-  // Data members
-  //
-
-  /// Iterator to write threadblock-scoped tile of A operand to shared memory
-  SmemIteratorA smem_iterator_A_;
-
-  /// Iterator to write threadblock-scoped tile of A operand scale vector to shared memory
-  SmemIteratorScaleBias smem_iterator_A_scale_bias_;
-
-  /// Iterator to write threadblock-scoped tile of B operand to shared memory
-  SmemIteratorB smem_iterator_B_;
-  
-public:
-
-  /// Construct from tensor references
-  CUTLASS_DEVICE
-  ImplicitGemmFpropFusionMultistage(
-      ///< Shared storage needed for internal use by threadblock-scoped GEMM
-      typename Base::SharedStorage &shared_storage,
-      ///< ID within the threadblock
-      int thread_idx,
-      ///< ID of warp
-      int warp_idx,
-      ///< ID of each thread within a warp
-      int lane_idx)
-      : Base(shared_storage, thread_idx, warp_idx, lane_idx),
-        smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
-        smem_iterator_A_scale_bias_(shared_storage.operand_A_scale_bias_ref(),
-                                    thread_idx),
-        smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) {
-    // Compute warp location within threadblock tile by mapping the warp_id to
-    // three coordinates:
-    //   _m: the warp's position within the threadblock along the M dimension
-    //   _n: the warp's position within the threadblock along the N dimension
-    //   _k: the warp's position within the threadblock along the K dimension
-
-    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
-    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
-
-    int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
-    int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
-
-    // Add per-warp offsets in units of warp-level tiles
-    this->warp_tile_iterator_A_.add_tile_offset(
-        {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
-    this->warp_tile_iterator_A_scale_bias_.add_tile_offset(
-        {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
-    this->warp_tile_iterator_B_.add_tile_offset(
-        {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
-  }
-
-  CUTLASS_DEVICE
-  void copy_tiles_and_advance(IteratorA &iterator_A,
-                              IteratorScaleBias &iterator_A_scale_bias,
-                              IteratorB &iterator_B, int group_start_A = 0,
-                              int group_start_B = 0) {
-    iterator_A.set_iteration_index(group_start_A);
-    this->smem_iterator_A_.set_iteration_index(group_start_A);
-      
-    // Async Copy for operand A
-    CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) {
-
-      if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) {
-        typename IteratorA::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorA::AccessType *>(
-                this->smem_iterator_A_.get());
-
-        int const kSrcBytes = sizeof_bits<typename IteratorA::Element>::value *
-                              IteratorA::ThreadMap::kElementsPerAccess / 8;
-
-        // Uses nan fill for out of bound data
-        cutlass::arch::cp_async_nan<kSrcBytes, kCacheOpA>(
-            dst_ptr, iterator_A.get(), iterator_A.valid());
-
-        ++iterator_A;
-
-        ++this->smem_iterator_A_;
-      }
-    }
-
-    // Async Copy for operand A scale and bias vector.  Scale and bias vectors
-    // are small.  One iteration is enough.
-    if (group_start_A == 0) {
-      typename IteratorScaleBias::AccessType *dst_ptr =
-          reinterpret_cast<typename IteratorScaleBias::AccessType *>(
-              this->smem_iterator_A_scale_bias_.get());
-
-      int const kSrcBytes =
-          sizeof_bits<typename IteratorScaleBias::Element>::value *
-          IteratorScaleBias::kElementsPerAccess / 8;
-
-      cutlass::arch::cp_async<kSrcBytes, kCacheOpScaleBias>(
-          dst_ptr, iterator_A_scale_bias.get(), iterator_A_scale_bias.valid());
-    }
-
-    iterator_B.set_iteration_index(group_start_B);
-
-    this->smem_iterator_B_.set_iteration_index(group_start_B);
-    
-    // Async Copy for operand B
-    CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) {
-      if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) {
-        typename IteratorB::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorB::AccessType *>(
-                this->smem_iterator_B_.get());
-        
-        int const kSrcBytes = sizeof_bits<typename IteratorB::Element>::value *
-                              IteratorB::ThreadMap::kElementsPerAccess / 8;
-
-        cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
-                dst_ptr, iterator_B.get(), iterator_B.valid());
-
-        ++iterator_B;
-        ++this->smem_iterator_B_;
-      }
-    }
-  }
-
-  /// Perform a threadblock-scoped matrix multiply-accumulate
-  CUTLASS_DEVICE
-  void operator()(
-      ///< problem size of GEMM
-      int gemm_k_iterations,
-      ///< destination accumulator tile
-      FragmentC &accum,
-      ///< iterator over A operand in global memory
-      IteratorA iterator_A,
-      ///< iterator over B operand in global memory
-      IteratorB iterator_B,
-      ///< iterator over scale and bias vectors in global memory
-      IteratorScaleBias iterator_A_scale_bias,
-      ///< initial value of accumulator
-      FragmentC const &src_accum,
-      ///< number of iterations per channel
-      int gemm_k_iterations_per_channel = 0,  
-      ///< Imaginary strides used for planar-complex only - ignored here
-      int64_t imag_stride_A = 0,
-      int64_t imag_stride_B = 0) {
-
-    //
-    // Prologue
-    //
-
-    // Issue several complete stages
-    CUTLASS_PRAGMA_UNROLL
-    for (int stage = 0; stage < Base::kStages - 1;
-         ++stage, --gemm_k_iterations) {
-
-      iterator_A.set_iteration_index(0);
-      this->smem_iterator_A_.set_iteration_index(0);
-
-      // Async Copy for operand A
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) {
-        typename IteratorA::AccessType *dst_ptr =
-          reinterpret_cast<typename IteratorA::AccessType *>(
-            this->smem_iterator_A_.get());
-
-        int const kSrcBytes =
-            sizeof_bits<typename IteratorA::Element>::value *
-            IteratorA::ThreadMap::kElementsPerAccess / 8;
-        
-        // Uses Nan fill for out of bound data
-        cutlass::arch::cp_async_nan<kSrcBytes, kCacheOpA>(
-            dst_ptr, iterator_A.get(), iterator_A.valid());
-
-        ++iterator_A;
-        ++this->smem_iterator_A_;
-      }
-
-      // Async Copy for operand A scale and bias vectors.  Scale and bias
-      // vectors are small.  One iteration is enough.
-      {
-        typename IteratorScaleBias::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorScaleBias::AccessType *>(
-                this->smem_iterator_A_scale_bias_.get());
-
-        int const kSrcBytes =
-            sizeof_bits<typename IteratorScaleBias::Element>::value *
-            IteratorScaleBias::kElementsPerAccess / 8;
-
-        cutlass::arch::cp_async<kSrcBytes, kCacheOpScaleBias>(
-            dst_ptr, iterator_A_scale_bias.get(), iterator_A_scale_bias.valid());
-      }
-
-      iterator_B.set_iteration_index(0);
-      this->smem_iterator_B_.set_iteration_index(0);
-
-      // Async Copy for operand B
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) {
-        typename IteratorB::AccessType *dst_ptr =
-          reinterpret_cast<typename IteratorB::AccessType *>(
-              this->smem_iterator_B_.get());
-
-        int const kSrcBytes =
-            sizeof_bits<typename IteratorB::Element>::value *
-            IteratorB::ThreadMap::kElementsPerAccess / 8;
-
-        cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
-            dst_ptr, iterator_B.get(), iterator_B.valid());
-
-        ++iterator_B;
-        ++this->smem_iterator_B_;
-      }
-
-      // Move to the next stage
-      iterator_A.advance();
-      iterator_A_scale_bias.advance();
-      iterator_B.advance();
-
-      this->smem_iterator_A_.add_tile_offset({0, 1});
-      this->smem_iterator_A_scale_bias_.add_tile_offset({0, 1});
-      this->smem_iterator_B_.add_tile_offset({1, 0});
-
-      // Inserts a fence to group cp.async instructions into stages.
-      cutlass::arch::cp_async_fence();
-    }
-
-    // Perform accumulation in the 'd' output operand
-    accum = src_accum;
-
-    // Waits until kStages-2 stages have committed. 
-    cutlass::arch::cp_async_wait<Base::kStages - 2>();
-    __syncthreads();
-
-    // Pair of fragments used to overlap shared memory loads and math
-    // instructions
-    WarpLoadedFragmentA warp_loaded_frag_A[2];
-    WarpLoadedFragmentB warp_loaded_frag_B[2];
-    WarpLoadedFragmentScaleBias warp_loaded_frag_A_scale_bias[2];
-    WarpTransformedFragmentA warp_transformed_frag_A[2];
-    WarpTransformedFragmentB warp_transformed_frag_B[2];
-
-    Operator warp_mma;
-    cutlass::conv::warp::FpropScaleBiasReluTransform<WarpTransformedFragmentA,
-                                            WarpLoadedFragmentScaleBias>
-        elementwise_transform;
-
-    this->warp_tile_iterator_A_.set_kgroup_index(0);
-    this->warp_tile_iterator_A_scale_bias_.set_kgroup_index(0);
-    this->warp_tile_iterator_B_.set_kgroup_index(0);
-
-    this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]);
-    this->warp_tile_iterator_A_scale_bias_.load(
-        warp_loaded_frag_A_scale_bias[0]);
-    this->warp_tile_iterator_B_.load(warp_loaded_frag_B[0]);
-
-    ++this->warp_tile_iterator_A_;
-    ++this->warp_tile_iterator_A_scale_bias_;
-    ++this->warp_tile_iterator_B_;
-
-    // Start issuing the first group of the next stage outside of the mainloop
-    copy_tiles_and_advance(iterator_A, iterator_A_scale_bias, iterator_B);
-
-    int smem_write_stage_idx = Base::kStages - 1;
-    int smem_read_stage_idx = 0;
-
-    warp_mma.transform(warp_transformed_frag_A[0], warp_transformed_frag_B[0],
-                       warp_loaded_frag_A[0], warp_loaded_frag_B[0]);
-
-    elementwise_transform(warp_transformed_frag_A[0],
-                         warp_loaded_frag_A_scale_bias[0]);
-
-    //
-    // Mainloop
-    //
-
-    CUTLASS_GEMM_LOOP
-    for (; gemm_k_iterations > (-Base::kStages + 1);) {
-      //
-      // Loop over GEMM K dimension
-      //
-
-      // Computes a warp-level GEMM on data held in shared memory
-      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
-      CUTLASS_PRAGMA_UNROLL
-      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations;
-           ++warp_mma_k) {
-
-        // Load warp-level tiles from shared memory, wrapping to k offset if
-        // this is the last group as the case may be.
-        this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
-        this->warp_tile_iterator_A_scale_bias_.set_kgroup_index(
-            (warp_mma_k + 1) % Base::kWarpGemmIterations);
-        this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
-
-        this->warp_tile_iterator_A_.load(warp_loaded_frag_A[(warp_mma_k + 1) % 2]);
-        this->warp_tile_iterator_A_scale_bias_.load(
-            warp_loaded_frag_A_scale_bias[(warp_mma_k + 1) % 2]);
-        this->warp_tile_iterator_B_.load(warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
-
-        ++this->warp_tile_iterator_A_;
-        ++this->warp_tile_iterator_A_scale_bias_;
-        ++this->warp_tile_iterator_B_;
-
-        if (warp_mma_k > 0) {
-          warp_mma.transform(warp_transformed_frag_A[warp_mma_k % 2],
-                             warp_transformed_frag_B[warp_mma_k % 2],
-                             warp_loaded_frag_A[warp_mma_k % 2],
-                             warp_loaded_frag_B[warp_mma_k % 2]);
-
-          elementwise_transform(warp_transformed_frag_A[warp_mma_k % 2],
-                               warp_loaded_frag_A_scale_bias[warp_mma_k % 2]);
-        }
-
-        warp_mma(
-                 accum, 
-                 warp_transformed_frag_A[warp_mma_k % 2],
-                 warp_transformed_frag_B[warp_mma_k % 2],
-                 accum
-                );
-
-        // Issue global->shared copies for the next stage
-        int group_start_iteration_A, group_start_iteration_B;
-
-        if (warp_mma_k + 1 == Base::kWarpGemmIterations) {
-          group_start_iteration_A = 0;
-          group_start_iteration_B = 0;
-        } else {
-          group_start_iteration_A =
-              (warp_mma_k + 1) * Detail::kAccessesPerGroupA;
-          group_start_iteration_B =
-              (warp_mma_k + 1) * Detail::kAccessesPerGroupB;
-        }
-
-        copy_tiles_and_advance(iterator_A, iterator_A_scale_bias, iterator_B,
-                               group_start_iteration_A,
-                               group_start_iteration_B);
-
-
-        if (warp_mma_k + 1 == Base::kWarpGemmIterations) {
-          warp_mma.transform(warp_transformed_frag_A[(warp_mma_k + 1) % 2],
-                             warp_transformed_frag_B[(warp_mma_k + 1) % 2],
-                             warp_loaded_frag_A[(warp_mma_k + 1) % 2],
-                             warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
-
-          elementwise_transform(
-              warp_transformed_frag_A[(warp_mma_k + 1) % 2],
-              warp_loaded_frag_A_scale_bias[(warp_mma_k + 1) % 2]);
-        }
-
-        if (warp_mma_k + 2 == Base::kWarpGemmIterations) {
-          // Inserts a fence to group cp.async instructions into stages.
-          cutlass::arch::cp_async_fence();
-
-          // Waits until kStages-2 stages of cp.async have committed
-          arch::cp_async_wait<Base::kStages - 2>();
-          __syncthreads();
-
-          // Move to the next stage
-          iterator_A.advance();
-          iterator_A_scale_bias.advance();
-          iterator_B.advance();
-
-          this->smem_iterator_A_.add_tile_offset({0, 1});
-          this->smem_iterator_A_scale_bias_.add_tile_offset({0, 1});
-          this->smem_iterator_B_.add_tile_offset({1, 0});
-
-          // Add negative offsets to return iterators to the 'start' of the
-          // circular buffer in shared memory
-          if (smem_write_stage_idx == (Base::kStages - 1)) {
-            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
-            this->smem_iterator_A_scale_bias_.add_tile_offset(
-                {0, -Base::kStages});
-            this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
-            smem_write_stage_idx = 0;
-          } else {
-            ++smem_write_stage_idx;
-          }
-
-          if (smem_read_stage_idx == (Base::kStages - 1)) {
-            this->warp_tile_iterator_A_.add_tile_offset(
-                {0, -Base::kStages * Policy::kPartitionsK *
-                        Base::kWarpGemmIterations});
-            this->warp_tile_iterator_A_scale_bias_.add_tile_offset(
-                {0, -Base::kStages * Policy::kPartitionsK *
-                        Base::kWarpGemmIterations});
-            this->warp_tile_iterator_B_.add_tile_offset(
-                {-Base::kStages * Policy::kPartitionsK *
-                     Base::kWarpGemmIterations,
-                 0});
-            smem_read_stage_idx = 0;
-          } else {
-            ++smem_read_stage_idx;
-          }
-
-          --gemm_k_iterations;
-        }
-      }
-
-    }
-
-    // Insert fence and wait for all outstanding cp.async operations to commit.
-    cutlass::arch::cp_async_fence();
-    cutlass::arch::cp_async_wait<0>();
-    __syncthreads();
-
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace gemm
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/implicit_gemm_multistage.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/implicit_gemm_multistage.h
deleted file mode 100755
index eea7743a4..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/implicit_gemm_multistage.h
+++ /dev/null
@@ -1,539 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a multistage threadblock-scoped Implicit GEMM Convolution kernel.
-*/
-
-#pragma once
-
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/arch/memory.h"
-#include "cutlass/array.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/cache_operation.h"
-#include "cutlass/gemm/threadblock/mma_base.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Structure to compute the matrix product targeting CUDA cores and SIMT math
-/// instructions.
-template <
-    /// Size of the Gemm problem - concept: gemm::GemmShape<>
-    typename Shape_,
-    /// Iterates over tiles of A operand in global memory
-    //  (concept: ReadableTileIterator | ForwardTileIterator |
-    //  MaskedTileIterator)
-    typename IteratorA_,
-    /// Iterates over tiles of A operand in shared memory
-    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-    typename SmemIteratorA_,
-    /// Cache operation for operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Iterates over tiles of B operand in global memory
-    //  (concept: ReadableTileIterator | ForwardTileIterator |
-    //  MaskedTileIterator)
-    typename IteratorB_,
-    /// Iterates over tiles of B operand in shared memory
-    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-    typename SmemIteratorB_,
-    /// Cache operation for operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB,
-    /// Policy describing tuning details (concept: MmaPolicy)
-    typename Policy_,
-    /// Number of stages,
-    int Stages,
-    /// Used for partial specialization
-    typename Enable = bool>
-class ImplicitGemmMultistage : 
-  public gemm::threadblock::MmaBase<Shape_, Policy_, Stages> {
-public:
-  ///< Base class
-  using Base = gemm::threadblock::MmaBase<Shape_, Policy_, Stages>;
-  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-  ///< Iterates over tiles of A operand in global memory
-  using IteratorA = IteratorA_;
-  ///< Iterates over tiles of B operand in global memory
-  using IteratorB = IteratorB_;
-  ///< Policy describing tuning details
-  using Policy = Policy_;
-
-  using SmemIteratorA = SmemIteratorA_;
-  using SmemIteratorB = SmemIteratorB_;
-
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
-
-  //
-  // Dependent types
-  //
-
-  /// Fragment of accumulator tile
-
-  using ElementC = typename Policy::Operator::ElementC;
-  using FragmentC = typename Policy::Operator::FragmentC;
-
-  /// Warp-level Mma
-  using Operator = typename Policy::Operator;
-  
-  /// Internal structure exposed for introspection.
-  struct Detail {
-
-    /// Number of cp.async instructions to load one stage of operand A
-    static int const AsyncCopyIterationsPerStageA =
-        IteratorA::ThreadMap::Iterations::kCount;
-
-    /// Number of cp.async instructions to load one stage of operand B
-    static int const AsyncCopyIterationsPerStageB =
-        IteratorB::ThreadMap::Iterations::kCount;
-
-    /// Number of stages
-    static int const kStages = Stages;
-
-    /// Number of cp.async instructions to load on group of operand A
-    static int const kAccessesPerGroupA =
-        (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
-
-    /// Number of cp.async instructions to load on group of operand B
-    static int const kAccessesPerGroupB =
-        (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
-
-    // Optional staged-accumulation (e.g., tf32x3 kernels) for improved numerical
-    // accuracy, where each mainloop iteration first accumulates into a temporary
-    // set of freshly-cleared accumulators, which are subsequently added to the
-    // final accumulator set.
-    static bool const kStagedAccumulation = arch::detail::UseStagedAccumulation<Operator>::value;
-  };
-
- private:
-
-  using WarpLoadedFragmentA = typename Operator::FragmentA;
-  using WarpLoadedFragmentB = typename Operator::FragmentB;
-  using WarpTransformedFragmentA = typename Operator::TransformedFragmentA;
-  using WarpTransformedFragmentB = typename Operator::TransformedFragmentB;
-
- private:
-
-  //
-  // Data members
-  //
-
-  /// Iterator to write threadblock-scoped tile of A operand to shared memory
-  SmemIteratorA smem_iterator_A_;
-
-  /// Iterator to write threadblock-scoped tile of B operand to shared memory
-  SmemIteratorB smem_iterator_B_;
-
-public:
-
-  /// Construct from tensor references
-  CUTLASS_DEVICE
-  ImplicitGemmMultistage(
-      ///< Shared storage needed for internal use by threadblock-scoped GEMM
-      typename Base::SharedStorage &shared_storage,
-      ///< ID within the threadblock
-      int thread_idx,
-      ///< ID of warp
-      int warp_idx,
-      ///< ID of each thread within a warp
-      int lane_idx
-    ):
-      Base(shared_storage, thread_idx, warp_idx, lane_idx),
-      smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
-      smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx)
-  {
-    // Compute warp location within threadblock tile by mapping the warp_id to
-    // three coordinates:
-    //   _m: the warp's position within the threadblock along the M dimension
-    //   _n: the warp's position within the threadblock along the N dimension
-    //   _k: the warp's position within the threadblock along the K dimension
-
-    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
-    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
-
-    int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
-    int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
-
-    // Add per-warp offsets in units of warp-level tiles
-    this->warp_tile_iterator_A_.add_tile_offset(
-        {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
-    this->warp_tile_iterator_B_.add_tile_offset(
-        {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
-  }
-
-  CUTLASS_DEVICE
-  void copy_tiles_and_advance(
-    IteratorA &iterator_A, IteratorB &iterator_B,
-    int group_start_A = 0, int group_start_B = 0) {
-
-    iterator_A.set_iteration_index(group_start_A *
-                                   IteratorA::kAccessesPerVector);
-    this->smem_iterator_A_.set_iteration_index(group_start_A);
-      
-    // Async Copy for operand A
-    CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) {
-
-      if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) {
-        typename IteratorA::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorA::AccessType *>(
-                this->smem_iterator_A_.get());
-
-        int const kSrcBytes = sizeof_bits<typename IteratorA::Element>::value *
-                              IteratorA::ThreadMap::kElementsPerAccess /
-                              IteratorA::kAccessesPerVector / 8;
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
-          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
-                  dst_ptr + v, iterator_A.get(), iterator_A.valid());
-
-          ++iterator_A;
-        }
-
-        ++this->smem_iterator_A_;
-      }
-    }
-
-    iterator_B.set_iteration_index(group_start_B *
-                                   IteratorB::kAccessesPerVector);
-
-    this->smem_iterator_B_.set_iteration_index(group_start_B);
-    
-    // Async Copy for operand B
-    CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) {
-      if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) {
-        typename IteratorB::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorB::AccessType *>(
-                this->smem_iterator_B_.get());
-        
-        int const kSrcBytes = sizeof_bits<typename IteratorB::Element>::value *
-                              IteratorB::ThreadMap::kElementsPerAccess /
-                              IteratorB::kAccessesPerVector / 8;
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
-          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
-                  dst_ptr + v, iterator_B.get(), iterator_B.valid());
-
-          ++iterator_B;
-        }
-        ++this->smem_iterator_B_;
-      }
-    }
-  }
-
-  /// Perform a threadblock-scoped matrix multiply-accumulate
-  CUTLASS_DEVICE
-  void operator()(
-      ///< problem size of GEMM
-      int gemm_k_iterations,
-      ///< destination accumulator tile
-      FragmentC &accum,
-      ///< iterator over A operand in global memory
-      IteratorA iterator_A,
-      ///< iterator over B operand in global memory
-      IteratorB iterator_B,
-      ///< initial value of accumulator
-      FragmentC const &src_accum,
-      ///< number of iterations per channel
-      int gemm_k_iterations_per_channel = 0,
-      ///< Imaginary strides used for planar-complex only - ignored here
-      int64_t imag_stride_A = 0,
-      int64_t imag_stride_B = 0) {
-
-    //
-    // Prologue
-    //
-
-    // Issue several complete stages
-    CUTLASS_PRAGMA_UNROLL
-    for (int stage = 0; stage < Base::kStages - 1;
-         ++stage, --gemm_k_iterations) {
-
-      iterator_A.set_iteration_index(0);
-      this->smem_iterator_A_.set_iteration_index(0);
-
-      // Async Copy for operand A
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) {
-        typename IteratorA::AccessType *dst_ptr =
-          reinterpret_cast<typename IteratorA::AccessType *>(
-            this->smem_iterator_A_.get());
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
-          int const kSrcBytes =
-            sizeof_bits<typename IteratorA::Element>::value *
-            IteratorA::ThreadMap::kElementsPerAccess /
-            IteratorA::kAccessesPerVector / 8;
-
-          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
-            dst_ptr + v, iterator_A.get(), iterator_A.valid());
-
-          ++iterator_A;
-        }
-
-        ++this->smem_iterator_A_;
-      }
-
-      iterator_B.set_iteration_index(0);
-      this->smem_iterator_B_.set_iteration_index(0);
-
-      // Async Copy for operand B
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) {
-        typename IteratorB::AccessType *dst_ptr =
-          reinterpret_cast<typename IteratorB::AccessType *>(
-              this->smem_iterator_B_.get());
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
-          int const kSrcBytes =
-              sizeof_bits<typename IteratorB::Element>::value *
-              IteratorB::ThreadMap::kElementsPerAccess /
-              IteratorB::kAccessesPerVector / 8;
-
-          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
-              dst_ptr + v, iterator_B.get(), iterator_B.valid());
-  
-          ++iterator_B;
-        }
-
-        ++this->smem_iterator_B_;
-      }
-
-      // Move to the next stage
-      iterator_A.advance();
-      iterator_B.advance();
-
-      this->smem_iterator_A_.add_tile_offset({0, 1});
-      this->smem_iterator_B_.add_tile_offset({1, 0});
-
-      // Inserts a fence to group cp.async instructions into stages.
-      cutlass::arch::cp_async_fence();
-    }
-
-    // Perform accumulation in the 'd' output operand
-    accum = src_accum;
-
-    // Waits until kStages-2 stages have committed. 
-    cutlass::arch::cp_async_wait<Base::kStages - 2>();
-    __syncthreads();
-
-    // Pair of fragments used to overlap shared memory loads and math
-    // instructions
-    WarpLoadedFragmentA warp_loaded_frag_A[2];
-    WarpLoadedFragmentB warp_loaded_frag_B[2];
-    WarpTransformedFragmentA warp_transformed_frag_A[2];
-    WarpTransformedFragmentB warp_transformed_frag_B[2];
-
-    Operator warp_mma;
-
-    this->warp_tile_iterator_A_.set_kgroup_index(0);
-    this->warp_tile_iterator_B_.set_kgroup_index(0);
-
-    this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]);
-    this->warp_tile_iterator_B_.load(warp_loaded_frag_B[0]);
-
-    ++this->warp_tile_iterator_A_;
-    ++this->warp_tile_iterator_B_;
-
-    // Start issuing the first group of the next stage outside of the mainloop
-    copy_tiles_and_advance(iterator_A, iterator_B);
-
-    int smem_write_stage_idx = Base::kStages - 1;
-    int smem_read_stage_idx = 0;
-
-    warp_mma.transform(warp_transformed_frag_A[0], warp_transformed_frag_B[0],
-                       warp_loaded_frag_A[0], warp_loaded_frag_B[0]);
-
-    // tf32x3 kernels use staging accumulation. warp_mma uses a temporary
-    // accumulator and this temporary accumulator is added to the final
-    // accumulator once in every mainloop iteration.
-    plus<FragmentC> plus_accum;
-
-    FragmentC tmp_accum;
-
-    if (Detail::kStagedAccumulation) {
-      tmp_accum.clear();
-    }
-
-    //
-    // Mainloop
-    //
-
-    CUTLASS_GEMM_LOOP
-    for (; gemm_k_iterations > (-Base::kStages + 1);) {
-      //
-      // Loop over GEMM K dimension
-      //
-
-      // Computes a warp-level GEMM on data held in shared memory
-      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
-      CUTLASS_PRAGMA_UNROLL
-      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations;
-           ++warp_mma_k) {
-
-        // Load warp-level tiles from shared memory, wrapping to k offset if
-        // this is the last group as the case may be.
-
-        this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
-        this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
-
-        this->warp_tile_iterator_A_.load(warp_loaded_frag_A[(warp_mma_k + 1) % 2]);
-        this->warp_tile_iterator_B_.load(warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
-
-        ++this->warp_tile_iterator_A_;
-        ++this->warp_tile_iterator_B_;
-
-        if (warp_mma_k > 0)
-          warp_mma.transform(warp_transformed_frag_A[warp_mma_k % 2],
-                             warp_transformed_frag_B[warp_mma_k % 2],
-                             warp_loaded_frag_A[warp_mma_k % 2],
-                             warp_loaded_frag_B[warp_mma_k % 2]);
-
-        // Issue global->shared copies for the next stage
-        int group_start_iteration_A, group_start_iteration_B;
-
-        if (warp_mma_k + 1 == Base::kWarpGemmIterations) {
-          group_start_iteration_A = 0;
-          group_start_iteration_B = 0;
-        } else {
-          group_start_iteration_A =
-              (warp_mma_k + 1) * Detail::kAccessesPerGroupA;
-          group_start_iteration_B =
-              (warp_mma_k + 1) * Detail::kAccessesPerGroupB;
-        }
-
-        copy_tiles_and_advance(iterator_A, iterator_B, group_start_iteration_A,
-                               group_start_iteration_B);
-
-        if (Detail::kStagedAccumulation) {
-          warp_mma(
-            tmp_accum, 
-            warp_transformed_frag_A[warp_mma_k % 2],
-            warp_transformed_frag_B[warp_mma_k % 2], 
-            tmp_accum
-          );
-
-          if (warp_mma_k == 0) {
-            accum = plus_accum(accum, tmp_accum);
-            tmp_accum.clear();
-          }
-        } else {
-          warp_mma(
-            accum, 
-            warp_transformed_frag_A[warp_mma_k % 2],
-            warp_transformed_frag_B[warp_mma_k % 2], 
-            accum
-          );
-        }
-
-        if (warp_mma_k + 1 == Base::kWarpGemmIterations)
-          warp_mma.transform(warp_transformed_frag_A[(warp_mma_k + 1) % 2],
-                             warp_transformed_frag_B[(warp_mma_k + 1) % 2],
-                             warp_loaded_frag_A[(warp_mma_k + 1) % 2],
-                             warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
-
-        if (warp_mma_k + 2 == Base::kWarpGemmIterations) {
-          // Inserts a fence to group cp.async instructions into stages.
-          cutlass::arch::cp_async_fence();
-
-          // Waits until kStages-2 stages of cp.async have committed
-          arch::cp_async_wait<Base::kStages - 2>();
-          __syncthreads();
-
-          // Move to the next stage
-          iterator_A.advance();
-          iterator_B.advance();
-
-          this->smem_iterator_A_.add_tile_offset({0, 1});
-          this->smem_iterator_B_.add_tile_offset({1, 0});
-
-          // Add negative offsets to return iterators to the 'start' of the
-          // circular buffer in shared memory
-          if (smem_write_stage_idx == (Base::kStages - 1)) {
-            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
-            this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
-            smem_write_stage_idx = 0;
-          } else {
-            ++smem_write_stage_idx;
-          }
-
-          if (smem_read_stage_idx == (Base::kStages - 1)) {
-            this->warp_tile_iterator_A_.add_tile_offset(
-                {0, -Base::kStages * Policy::kPartitionsK *
-                        Base::kWarpGemmIterations});
-            this->warp_tile_iterator_B_.add_tile_offset(
-                {-Base::kStages * Policy::kPartitionsK *
-                     Base::kWarpGemmIterations,
-                 0});
-            smem_read_stage_idx = 0;
-          } else {
-            ++smem_read_stage_idx;
-          }
-
-          --gemm_k_iterations;
-        }
-      }
-
-    }
-
-    if (Detail::kStagedAccumulation) {
-      accum = plus_accum(accum, tmp_accum); 
-    }
-  
-    // Insert fence and wait for all outstanding cp.async operations to commit.
-    cutlass::arch::cp_async_fence();
-    cutlass::arch::cp_async_wait<0>();
-    __syncthreads();
-
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace gemm
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h
deleted file mode 100755
index 79bcb78aa..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h
+++ /dev/null
@@ -1,320 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/numeric_conversion.h"
-
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/threadblock/mma_base.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions.
-template <
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  typename Shape_,
-  /// Iterates over tiles of A operand in global memory 
-  //  (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
-  typename IteratorA_,
-  /// Iterates over tiles of A operand in shared memory
-  /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-  typename SmemIteratorA_,
-  /// Iterates over tiles of B operand in global memory
-  //  (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
-  typename IteratorB_,
-  /// Iterates over tiles of B operand in shared memory
-  /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-  typename SmemIteratorB_,
-  /// Data type of accumulator matrix
-  typename ElementC_,
-  /// Data type of accumulator matrix
-  typename LayoutC_,
-  /// Policy describing tuning details (concept: MmaPolicy)
-  typename Policy_,
-  /// Transformation applied to A operand
-  typename TransformA_ = NumericArrayConverter<
-    typename SmemIteratorA_::Element, 
-    typename IteratorA_::Element, 
-    IteratorA_::Fragment::kElements>,
-  ///
-  /// Transformation applied to A operand
-  typename TransformB_ = NumericArrayConverter<
-    typename SmemIteratorB_::Element, 
-    typename IteratorB_::Element, 
-    IteratorB_::Fragment::kElements>,
-  /// Used for partial specialization
-  typename Enable = bool
->
-class ImplicitGemmPipelined : public gemm::threadblock::MmaBase<Shape_, Policy_, 2> {
-public:
-
-  ///< Base class
-  using Base = gemm::threadblock::MmaBase<Shape_, Policy_, 2>;
-
-  using Shape = Shape_;             ///< Size of the Gemm problem - concept: gemm::GemmShape<>
-  using IteratorA = IteratorA_;     ///< Iterates over tiles of A operand in global memory
-  using IteratorB = IteratorB_;     ///< Iterates over tiles of B operand in global memory
-  using ElementC = ElementC_;       ///< Data type of accumulator matrix
-  using LayoutC = LayoutC_;         ///< Layout of accumulator matrix
-  using Policy = Policy_;           ///< Policy describing tuning details
-
-  using SmemIteratorA = SmemIteratorA_;
-  using SmemIteratorB = SmemIteratorB_;
-
-  using TransformA = TransformA_;
-  using TransformB = TransformB_;
-
-  //
-  // Dependent types
-  //
-
-  /// Fragment of operand A loaded from global memory
-  using FragmentA = typename IteratorA::Fragment;
-
-  /// Fragment of operand B loaded from global memory
-  using FragmentB = typename IteratorB::Fragment;
-
-  /// Fragment of accumulator tile
-  using FragmentC = typename Policy::Operator::FragmentC;
-
-  /// Warp-level Mma
-  using Operator = typename Policy::Operator;
-
-  /// Obtain the arch tag from the warp-level operator
-  using ArchTag = typename Policy::Operator::ArchTag;
-
-  /// Complex transform on A operand
-  static ComplexTransform const kTransformA = Operator::kTransformA;
-
-  /// Complex transform on B operand
-  static ComplexTransform const kTransformB = Operator::kTransformB;
-
-  // staticaly assert kStages for MmaPipelined is two (Double-buffered pipeline)
-  static_assert((Base::kStages==2), "MmaPipelined requires kStages set to value 2");
-
-private:
-
-  using WarpFragmentA = typename Operator::FragmentA;
-  using WarpFragmentB = typename Operator::FragmentB;
-
-protected:
-
-  /// Iterator to write threadblock-scoped tile of A operand to shared memory
-  SmemIteratorA smem_iterator_A_;
-
-  /// Iterator to write threadblock-scoped tile of B operand to shared memory
-  SmemIteratorB smem_iterator_B_;
-
-public:
-
-  /// Construct from tensor references
-  CUTLASS_DEVICE
-  ImplicitGemmPipelined(
-    typename Base::SharedStorage &shared_storage,       ///< Shared storage needed for internal use by threadblock-scoped GEMM
-    int thread_idx,                                     ///< ID within the threadblock
-    int warp_idx,                                       ///< ID of warp
-    int lane_idx                                        ///< ID of each thread within a warp
-  ):
-    Base(shared_storage, thread_idx, warp_idx, lane_idx),
-    smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
-    smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) {
-
-    // Compute warp location within threadblock tile by mapping the warp_id to
-    // three coordinates:
-    //   _m: the warp's position within the threadblock along the M dimension
-    //   _n: the warp's position within the threadblock along the N dimension
-    //   _k: the warp's position within the threadblock along the K dimension
-
-    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
-    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
-
-    int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
-    int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
-
-    // Add per-warp offsets in units of warp-level tiles
-    this->warp_tile_iterator_A_.add_tile_offset({warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
-    this->warp_tile_iterator_B_.add_tile_offset({Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
-  }
-
-  /// Perform a threadblock-scoped matrix multiply-accumulate
-  CUTLASS_DEVICE
-  void operator()(
-    int gemm_k_iterations,                            ///< number of iterations of the mainloop
-    FragmentC &accum,                                 ///< destination accumulator tile
-    IteratorA iterator_A,                             ///< iterator over A operand in global memory
-    IteratorB iterator_B,                             ///< iterator over B operand in global memory
-    FragmentC const &src_accum,                       ///< source accumulator tile
-    int gemm_k_iterations_per_channel = 0,             ///< number of iterations per channel
-    TransformA transform_A = TransformA(),            ///< transformation applied to A fragment
-    TransformB transform_B = TransformB()) {          ///< transformation applied to B fragment
-
-    //
-    // Prologue
-    //
-
-    // Perform accumulation in the 'd' output operand
-    accum = src_accum;
-
-    FragmentA tb_frag_A;
-    FragmentB tb_frag_B;
-
-    tb_frag_A.clear();
-    tb_frag_B.clear();
-
-    // The last kblock is loaded in the prolog
-    iterator_A.load(tb_frag_A);
-    iterator_B.load(tb_frag_B);
-
-    ++iterator_A;
-    ++iterator_B;
-
-    this->smem_iterator_A_.store(transform_A(tb_frag_A));
-    this->smem_iterator_B_.store(transform_B(tb_frag_B));
-
-    ++this->smem_iterator_A_;
-    ++this->smem_iterator_B_;
-
-    __syncthreads();
-
-    // Pair of fragments used to overlap shared memory loads and math instructions
-    WarpFragmentA warp_frag_A[2];
-    WarpFragmentB warp_frag_B[2];
-
-    this->warp_tile_iterator_A_.set_kgroup_index(0);
-    this->warp_tile_iterator_B_.set_kgroup_index(0);
-
-    this->warp_tile_iterator_A_.load(warp_frag_A[0]);
-    this->warp_tile_iterator_B_.load(warp_frag_B[0]);
-
-    ++this->warp_tile_iterator_A_;
-    ++this->warp_tile_iterator_B_;
-
-    Operator warp_mma;
-
-    int smem_write_stage_idx = 1;
-
-    // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing 
-    // shared memory loads (which have the tightest latency requirement).
-
-    //
-    // Mainloop
-    //
-
-    // Note: The main loop does not support Base::kWarpGemmIterations == 2.
-    CUTLASS_GEMM_LOOP
-    for (; gemm_k_iterations > 0; --gemm_k_iterations) {
-      //
-      // Loop over GEMM K dimension
-      //
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) {
-
-        // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group
-        // as the case may be.
-
-        if (warp_mma_k == Base::kWarpGemmIterations - 1) {
-
-          // Write fragments to shared memory
-          this->smem_iterator_A_.store(transform_A(tb_frag_A));
-
-          this->smem_iterator_B_.store(transform_B(tb_frag_B));
-
-          __syncthreads();
-          
-          ++this->smem_iterator_A_;
-          ++this->smem_iterator_B_;
-
-          // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
-          if (smem_write_stage_idx == 1) {
-            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
-            this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
-          }
-          else {
-            this->warp_tile_iterator_A_.add_tile_offset(
-                {0, -Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations});
-            this->warp_tile_iterator_B_.add_tile_offset(
-                {-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations,
-                 0});
-          }
-
-          smem_write_stage_idx ^= 1;
-        }
-
-        this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
-        this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
-        
-        this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k + 1) % 2]);
-        this->warp_tile_iterator_B_.load(warp_frag_B[(warp_mma_k + 1) % 2]);
-
-        ++this->warp_tile_iterator_A_;
-        ++this->warp_tile_iterator_B_;
-
-        if (warp_mma_k == 0) {
-
-          iterator_A.load(tb_frag_A);
-          iterator_B.load(tb_frag_B);
-    
-          ++iterator_A;
-          ++iterator_B;
-        }
-
-        warp_mma(accum, warp_frag_A[warp_mma_k % 2],
-                 warp_frag_B[warp_mma_k % 2], accum);
-      }
-    }
-
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/implicit_gemm_wgrad_fusion_multistage.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/implicit_gemm_wgrad_fusion_multistage.h
deleted file mode 100755
index 1ec0c61dd..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/implicit_gemm_wgrad_fusion_multistage.h
+++ /dev/null
@@ -1,729 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a multistage threadblock-scoped fused activation's scale+bias+relu and
-   Implicit GEMM Convolution kernel.
-
-   The original implicit gemm will store out-of-bound data as zeroes in the
-   shared memory because zeros into the tensor core, zeroes out of the tensor
-   cores.  The result is remained the same.   When fusing scale+bias+relu
-   into the mainloop, it is no longer true because
-
-     0 x scale + bias = bias
-
-   which is no longer always 0.  So, instead of storing zeroes, this fused
-   kernel stores the out-of-bound data as a special NaN (0x7eff), when applying
-   scale+bias+relu, the code is like
-
-     if (data == 0x7eff)
-       data = 0;
-     else
-       data = scale+bias+relu(data, scale, bias);
-
-  The biggest difference compared with the fused Fprop and scale+bias+relu is
-  that scale and bias are loop invariant in Wgrad so that they only needs to 
-  be loaded once before the mainloop.
-
-  See include/cutlass/conv/warp/scale_bias_relu_transformation.h for the 
-  elementwise computation.  See include/cutlass/arch/memory_sm80.h for nan fill.
-
-
-*/
-
-#pragma once
-
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/arch/memory.h"
-#include "cutlass/array.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/cache_operation.h"
-#include "cutlass/gemm/gemm.h"
-
-#include "cutlass/gemm/warp/scale_bias_tile_iterator.h"
-#include "cutlass/conv/warp/scale_bias_relu_transform.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/// Structure to compute the matrix product targeting CUDA cores and SIMT math
-/// instructions.
-template <
-    /// Size of the Gemm problem - concept: gemm::GemmShape<>
-    typename Shape_,
-    /// Element type of scale and bias vectors 
-    typename ElementScaleBias_,
-    /// Layout of scale and bias vectors
-    typename LayoutScaleBias_,
-    /// Element type of scale and bias vectors 
-    /// Policy describing tuning details (concept: MmaPolicy)
-    typename Policy_,
-    /// Number of stages,
-    int Stages,
-    /// Used for partial specialization
-    typename Enable = bool>
-class MmaWgradFusionBase {
- public:
-  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-
-  ///< Element type of scale and bias vectors 
-  using ElementScaleBias = ElementScaleBias_;
-
-  /// Layout of scale and bias vectors
-  using LayoutScaleBias = LayoutScaleBias_;
-
-  ///< Policy describing tuning details
-  using Policy = Policy_;
-
-  //
-  // Dependent types
-  //
-
-  /// Warp-level Mma
-  using Operator = typename Policy::Operator;
-
-  /// Shape describing the overall GEMM computed from shared memory
-  /// by each warp.
-  using WarpGemm = typename Policy::Operator::Shape;
-
-  /// Shape describing the number of warps filling the CTA
-  using WarpCount = cutlass::gemm::GemmShape<Shape::kM / WarpGemm::kM,
-                                             Shape::kN / WarpGemm::kN,
-                                             Shape::kK / WarpGemm::kK>;
-
-  /// Number of warp-level GEMM oeprations
-  static int const kWarpGemmIterations =
-      (WarpGemm::kK / Operator::Policy::MmaShape::kK);
-
-  /// Number of stages
-  static int const kStages = Stages;
-
-  /// Tensor reference to the A operand
-  using TensorRefA = TensorRef<typename Operator::ElementA, typename Operator::LayoutA>;
-
-  /// Tensor reference to the B operand
-  using TensorRefB = TensorRef<typename Operator::ElementB, typename Operator::LayoutB>;
-
-  static_assert(kWarpGemmIterations > 1,
-                "The pipelined structure requires at least two warp-level "
-                "GEMM operations.");
-
-  static_assert((kWarpGemmIterations % 2) == 0,
-                "Inner loop iteration must be an even number.");
-
-  //
-  // Nested structs
-  //
-
-  /// Shared storage object needed by threadblock-scoped GEMM
-  class SharedStorage {
-   public:
-    //
-    // Type definitions
-    //
-
-    /// Shape of the A matrix operand in shared memory
-    using ShapeA = MatrixShape<Shape::kM + Policy::SmemPaddingA::kRow,
-                               Shape::kK * kStages +
-                                   Policy::SmemPaddingA::kColumn>;
-
-    /// Shape of the B matrix operand in shared memory
-    using ShapeB =
-        MatrixShape<Shape::kK * kStages + Policy::SmemPaddingB::kRow,
-                    Shape::kN + Policy::SmemPaddingB::kColumn>;
-
-   public:
-    //
-    // Data members
-    //
-
-    /// Buffer for A operand
-    AlignedBuffer<typename Operator::ElementA, ShapeA::kCount> operand_A;
-
-    /// Buffer for B operand
-    AlignedBuffer<typename Operator::ElementB, ShapeB::kCount> operand_B;
-
-   public:
-
-    //
-    // Methods
-    //
-
-    /// Returns a layout object for the A matrix
-    CUTLASS_DEVICE
-    static typename Operator::LayoutA LayoutA() {
-      return Operator::LayoutA::packed({ShapeA::kRow, ShapeA::kColumn});
-    }
-
-    /// Returns a layout object for the B matrix
-    CUTLASS_HOST_DEVICE
-    static typename Operator::LayoutB LayoutB() {
-      return Operator::LayoutB::packed({ShapeB::kRow, ShapeB::kColumn});
-    }
-
-    /// Returns a TensorRef to the A operand
-    CUTLASS_HOST_DEVICE
-    TensorRefA operand_A_ref() {
-      return TensorRefA{operand_A.data(), LayoutA()};
-    }
-
-    /// Returns a TensorRef to the B operand
-    CUTLASS_HOST_DEVICE
-    TensorRefB operand_B_ref() {
-      return TensorRefB{operand_B.data(), LayoutB()};
-    }
-  };
-
- protected:
-
-  //
-  // Data members
-  //
-
-  /// Iterator to load a warp-scoped tile of A operand from shared memory
-  typename Operator::IteratorA warp_tile_iterator_A_;
-
-  /// Iterator to load a warp-scoped tile of B operand from shared memory
-  typename Operator::IteratorB warp_tile_iterator_B_;
-
-public:
-
-  /// Construct from tensor references
-  CUTLASS_DEVICE
-  MmaWgradFusionBase(
-      ///< Shared storage needed for internal use by threadblock-scoped GEMM
-      SharedStorage &shared_storage,
-      ///< ID within the threadblock
-      int thread_idx,
-      ///< ID of warp
-      int warp_idx,
-      ///< ID of each thread within a warp
-      int lane_idx)
-      : warp_tile_iterator_A_(shared_storage.operand_A_ref(), lane_idx),
-        warp_tile_iterator_B_(shared_storage.operand_B_ref(), lane_idx) {}
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-/// Structure to compute the matrix product targeting CUDA cores and SIMT math
-/// instructions.
-template <
-    /// Size of the Gemm problem - concept: gemm::GemmShape<>
-    typename Shape_,
-    /// Iterates over tiles of A operand in global memory
-    //  (concept: ReadableTileIterator | ForwardTileIterator |
-    //  MaskedTileIterator)
-    typename IteratorA_,
-    /// Iterates over tiles of A operand in shared memory
-    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-    typename SmemIteratorA_,
-    /// Cache operation for operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Iterates over tiles of B operand in global memory
-    //  (concept: ReadableTileIterator | ForwardTileIterator |
-    //  MaskedTileIterator)
-    typename IteratorB_,
-    /// Iterates over tiles of B operand in shared memory
-    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-    typename SmemIteratorB_,
-    /// Cache operation for operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB,
-    /// Iterates over vectors of scale and bias vector in global memory
-    //  (concept: ReadableTileIterator | ForwardTileIterator |
-    //  MaskedTileIterator)
-    typename IteratorScaleBias_,
-    /// Iterates over vectors of scale and bias vector i
-    /// Policy describing tuning details (concept: MmaPolicy)
-    typename Policy_,
-    /// Number of stages,
-    int Stages,
-    /// Used for partial specialization
-    typename Enable = bool>
-class ImplicitGemmWgradFusionMultistage
-    : public MmaWgradFusionBase<Shape_, typename IteratorScaleBias_::Element,
-                       typename IteratorScaleBias_::Layout, Policy_, Stages> {
- public:
-  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-  ///< Iterates over tiles of A operand in global memory
-  using IteratorA = IteratorA_;
-  ///< Iterates over tiles of B operand in global memory
-  using IteratorB = IteratorB_;
-  ///< Iterates over tiles of the scale and bias vectors in global memory
-  using IteratorScaleBias = IteratorScaleBias_;
-  ///< Policy describing tuning details
-  using Policy = Policy_;
-  ///< Base class
-  using Base = MmaWgradFusionBase<Shape_, typename IteratorScaleBias::Element,
-                         typename IteratorScaleBias::Layout, Policy_, Stages>;
-
-  using SmemIteratorA = SmemIteratorA_;
-  using SmemIteratorB = SmemIteratorB_;
-
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
-
-  //
-  // Dependent types
-  //
-
-  /// Fragment of accumulator tile
-
-  using ElementC = typename Policy::Operator::ElementC;
-  using FragmentC = typename Policy::Operator::FragmentC;
-
-  /// Warp-level Mma
-  using Operator = typename Policy::Operator;
-  
-  /// Internal structure exposed for introspection.
-  struct Detail {
-
-    /// Number of cp.async instructions to load one stage of operand A
-    static int const AsyncCopyIterationsPerStageA =
-        IteratorA::ThreadMap::Iterations::kCount;
-
-    /// Number of cp.async instructions to load one stage of operand B
-    static int const AsyncCopyIterationsPerStageB =
-        IteratorB::ThreadMap::Iterations::kCount;
-
-    /// Number of stages
-    static int const kStages = Stages;
-
-    /// Number of cp.async instructions to load on group of operand A
-    static int const kAccessesPerGroupA =
-        (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
-
-    /// Number of cp.async instructions to load on group of operand B
-    static int const kAccessesPerGroupB =
-        (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
-
-    static int const kBBufferSize =
-        ((sizeof(typename Operator::ElementC) == 4) &&
-         ((platform::is_same<typename Operator::Policy::Operator::ElementA,
-                             typename Operator::ElementA>::value &&
-           platform::is_same<typename Operator::Policy::Operator::ElementB,
-                             typename Operator::ElementB>::value)) &&
-         (Operator::Shape::kM >= 64 && Operator::Shape::kN >= 64))
-            ? 1
-            : 2;
-  };
-
- private:
-
-  using WarpLoadedFragmentA = typename Operator::FragmentA;
-  using WarpLoadedFragmentB = typename Operator::FragmentB;
-  using WarpLoadedFragmentScaleBias = typename IteratorScaleBias::Fragment;
-
-  using WarpTransformedFragmentA = typename Operator::TransformedFragmentA;
-  using WarpTransformedFragmentB = typename Operator::TransformedFragmentB;
-
- private:
-
-  //
-  // Data members
-  //
-
-  /// Iterator to write threadblock-scoped tile of A operand to shared memory
-  SmemIteratorA smem_iterator_A_;
-
-  /// Iterator to write threadblock-scoped tile of B operand to shared memory
-  SmemIteratorB smem_iterator_B_;
-
-  int warp_idx_m_;
-
-  int warp_idx_n_;
-  
-public:
-
-  /// Construct from tensor references
-  CUTLASS_DEVICE
-  ImplicitGemmWgradFusionMultistage(
-      ///< Shared storage needed for internal use by threadblock-scoped GEMM
-      typename Base::SharedStorage &shared_storage,
-      ///< ID within the threadblock
-      int thread_idx,
-      ///< ID of warp
-      int warp_idx,
-      ///< ID of each thread within a warp
-      int lane_idx)
-      : Base(shared_storage, thread_idx, warp_idx, lane_idx),
-        smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
-        smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) {
-
-    // Compute warp location within threadblock tile by mapping the warp_id to
-    // three coordinates:
-    //   _m: the warp's position within the threadblock along the M dimension
-    //   _n: the warp's position within the threadblock along the N dimension
-    //   _k: the warp's position within the threadblock along the K dimension
-
-    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
-    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
-
-    warp_idx_m_ = warp_idx_mn % Base::WarpCount::kM;
-    warp_idx_n_ = warp_idx_mn / Base::WarpCount::kM;
-
-    // Add per-warp offsets in units of warp-level tiles
-    this->warp_tile_iterator_A_.add_tile_offset(
-        {warp_idx_m_, Base::kWarpGemmIterations * warp_idx_k});
-    this->warp_tile_iterator_B_.add_tile_offset(
-        {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n_});
-  }
-
-  CUTLASS_DEVICE
-  void copy_tiles_and_advance(IteratorA &iterator_A,
-                              IteratorB &iterator_B,
-                              int group_start_A = 0, int group_start_B = 0) {
-
-    iterator_A.set_iteration_index(group_start_A);
-    this->smem_iterator_A_.set_iteration_index(group_start_A);
-      
-    // Async Copy for operand A
-    CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) {
-
-      if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) {
-        typename IteratorA::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorA::AccessType *>(
-                this->smem_iterator_A_.get());
-
-        int const kSrcBytes = sizeof_bits<typename IteratorA::Element>::value *
-                              IteratorA::ThreadMap::kElementsPerAccess / 8;
-
-        cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
-            dst_ptr, iterator_A.get(), iterator_A.valid());
-
-        ++iterator_A;
-
-        ++this->smem_iterator_A_;
-      }
-    }
-
-    iterator_B.set_iteration_index(group_start_B);
-
-    this->smem_iterator_B_.set_iteration_index(group_start_B);
-    
-    // Async Copy for operand B
-    CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) {
-      if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) {
-        typename IteratorB::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorB::AccessType *>(
-                this->smem_iterator_B_.get());
-        
-        int const kSrcBytes = sizeof_bits<typename IteratorB::Element>::value *
-                              IteratorB::ThreadMap::kElementsPerAccess / 8;
-
-        // Uses nan fill for out of bound data
-        cutlass::arch::cp_async_nan<kSrcBytes, kCacheOpB>(
-                dst_ptr, iterator_B.get(), iterator_B.valid());
-
-        ++iterator_B;
-        ++this->smem_iterator_B_;
-      }
-    }
-  }
-
-  /// Perform a threadblock-scoped matrix multiply-accumulate
-  CUTLASS_DEVICE
-  void operator()(
-      ///< problem size of GEMM
-      int gemm_k_iterations,
-      ///< destination accumulator tile
-      FragmentC &accum,
-      ///< iterator over A operand in global memory
-      IteratorA iterator_A,
-      ///< iterator over B operand in global memory
-      IteratorB iterator_B,
-      ///< iterator over scale and bias vectors in global memory
-      IteratorScaleBias iterator_B_scale_bias,
-      ///< initial value of accumulator
-      FragmentC const &src_accum,
-      ///< number of iterations per channel
-      int gemm_k_iterations_per_channel = 0, 
-      ///< Imaginary strides used for planar-complex only - ignored here
-      int64_t imag_stride_A = 0,
-      int64_t imag_stride_B = 0) {
-
-    //
-    // Prologue
-    //
-
-    WarpLoadedFragmentScaleBias warp_loaded_frag_B_scale_bias;
-    iterator_B_scale_bias.add_tile_offset({0, warp_idx_n_});
-    iterator_B_scale_bias.load(warp_loaded_frag_B_scale_bias);
-
-    // Issue several complete stages
-    CUTLASS_PRAGMA_UNROLL
-    for (int stage = 0; stage < Base::kStages - 1;
-         ++stage, --gemm_k_iterations) {
-
-      iterator_A.set_iteration_index(0);
-      this->smem_iterator_A_.set_iteration_index(0);
-
-      // Async Copy for operand A
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) {
-        typename IteratorA::AccessType *dst_ptr =
-          reinterpret_cast<typename IteratorA::AccessType *>(
-            this->smem_iterator_A_.get());
-
-        int const kSrcBytes =
-            sizeof_bits<typename IteratorA::Element>::value *
-            IteratorA::ThreadMap::kElementsPerAccess / 8;
-        
-        cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
-            dst_ptr, iterator_A.get(), iterator_A.valid());
-
-        ++iterator_A;
-        ++this->smem_iterator_A_;
-      }
-
-      iterator_B.set_iteration_index(0);
-      this->smem_iterator_B_.set_iteration_index(0);
-
-      // Async Copy for operand B
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) {
-        typename IteratorB::AccessType *dst_ptr =
-          reinterpret_cast<typename IteratorB::AccessType *>(
-              this->smem_iterator_B_.get());
-
-        int const kSrcBytes =
-            sizeof_bits<typename IteratorB::Element>::value *
-            IteratorB::ThreadMap::kElementsPerAccess / 8;
-
-        // Uses Nan fill for out of bound data
-        cutlass::arch::cp_async_nan<kSrcBytes, kCacheOpB>(
-            dst_ptr, iterator_B.get(), iterator_B.valid());
-
-        ++iterator_B;
-        ++this->smem_iterator_B_;
-      }
-
-      // Move to the next stage
-      iterator_A.advance();
-      iterator_B.advance();
-
-      this->smem_iterator_A_.add_tile_offset({0, 1});
-      this->smem_iterator_B_.add_tile_offset({1, 0});
-
-      // Inserts a fence to group cp.async instructions into stages.
-      cutlass::arch::cp_async_fence();
-    }
-
-    // Perform accumulation in the 'd' output operand
-    accum = src_accum;
-
-    // Waits until kStages-2 stages have committed. 
-    cutlass::arch::cp_async_wait<Base::kStages - 2>();
-    __syncthreads();
-
-    // Pair of fragments used to overlap shared memory loads and math
-    // instructions
-    WarpLoadedFragmentA warp_loaded_frag_A[Detail::kBBufferSize];
-    WarpLoadedFragmentB warp_loaded_frag_B[2];
-    WarpTransformedFragmentA warp_transformed_frag_A[Detail::kBBufferSize];
-    WarpTransformedFragmentB warp_transformed_frag_B[2];
-
-    Operator warp_mma;
-    cutlass::conv::warp::WgradScaleBiasReluTransform<WarpTransformedFragmentB,
-                                            WarpLoadedFragmentScaleBias>
-        elementwise_transform;
-
-    this->warp_tile_iterator_A_.set_kgroup_index(0);
-    this->warp_tile_iterator_B_.set_kgroup_index(0);
-
-    this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]);
-    this->warp_tile_iterator_B_.load(warp_loaded_frag_B[0]);
-
-    ++this->warp_tile_iterator_A_;
-    ++this->warp_tile_iterator_B_;
-
-    // Start issuing the first group of the next stage outside of the mainloop
-    copy_tiles_and_advance(iterator_A, iterator_B);
-
-    int smem_write_stage_idx = Base::kStages - 1;
-    int smem_read_stage_idx = 0;
-
-    warp_mma.transform(warp_transformed_frag_A[0], warp_transformed_frag_B[0],
-                       warp_loaded_frag_A[0], warp_loaded_frag_B[0]);
-
-    elementwise_transform(warp_transformed_frag_B[0],
-                         warp_loaded_frag_B_scale_bias);
-
-    //
-    // Mainloop
-    //
-
-    CUTLASS_GEMM_LOOP
-    for (; gemm_k_iterations > (-Base::kStages + 1);) {
-      //
-      // Loop over GEMM K dimension
-      //
-
-      // Computes a warp-level GEMM on data held in shared memory
-      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
-      CUTLASS_PRAGMA_UNROLL
-      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations;
-           ++warp_mma_k) {
-
-        // Load warp-level tiles from shared memory, wrapping to k offset if
-        // this is the last group as the case may be.
-
-        if (Detail::kBBufferSize == 2) {
-          this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
-          this->warp_tile_iterator_A_.load(warp_loaded_frag_A[(warp_mma_k + 1) % Detail::kBBufferSize]);
-          ++this->warp_tile_iterator_A_;
-        }
-
-        this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
-        this->warp_tile_iterator_B_.load(warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
-
-        ++this->warp_tile_iterator_B_;
-
-        if (warp_mma_k > 0) {
-          warp_mma.transform(warp_transformed_frag_A[warp_mma_k % Detail::kBBufferSize],
-                             warp_transformed_frag_B[warp_mma_k % 2],
-                             warp_loaded_frag_A[warp_mma_k % Detail::kBBufferSize],
-                             warp_loaded_frag_B[warp_mma_k % 2]);
-
-          elementwise_transform(warp_transformed_frag_B[warp_mma_k % 2],
-                               warp_loaded_frag_B_scale_bias);
-        }
-
-        warp_mma(
-                 accum, 
-                 warp_transformed_frag_A[warp_mma_k % Detail::kBBufferSize],
-                 warp_transformed_frag_B[warp_mma_k % 2],
-                 accum
-                );
-
-        if (Detail::kBBufferSize == 1) {
-          this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
-          this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]);
-          ++this->warp_tile_iterator_A_;
-  
-        }
-
-        if (warp_mma_k + 1 == Base::kWarpGemmIterations) {
-          warp_mma.transform(warp_transformed_frag_A[(warp_mma_k + 1) % Detail::kBBufferSize],
-                             warp_transformed_frag_B[(warp_mma_k + 1) % 2],
-                             warp_loaded_frag_A[(warp_mma_k + 1) % Detail::kBBufferSize],
-                             warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
-
-          elementwise_transform(
-              warp_transformed_frag_B[(warp_mma_k + 1) % 2],
-              warp_loaded_frag_B_scale_bias);
-        }
-
-        // Issue global->shared copies for the next stage
-        int group_start_iteration_A, group_start_iteration_B;
-
-        if (warp_mma_k + 1 == Base::kWarpGemmIterations) {
-          group_start_iteration_A = 0;
-          group_start_iteration_B = 0;
-        } else {
-          group_start_iteration_A =
-              (warp_mma_k + 1) * Detail::kAccessesPerGroupA;
-          group_start_iteration_B =
-              (warp_mma_k + 1) * Detail::kAccessesPerGroupB;
-        }
-
-        copy_tiles_and_advance(iterator_A, iterator_B,
-                               group_start_iteration_A,
-                               group_start_iteration_B);
-
-        if (warp_mma_k + 2 == Base::kWarpGemmIterations) {
-          // Inserts a fence to group cp.async instructions into stages.
-          cutlass::arch::cp_async_fence();
-
-          // Waits until kStages-2 stages of cp.async have committed
-          arch::cp_async_wait<Base::kStages - 2>();
-          __syncthreads();
-
-          // Move to the next stage
-          iterator_A.advance();
-          iterator_B.advance();
-
-          this->smem_iterator_A_.add_tile_offset({0, 1});
-          this->smem_iterator_B_.add_tile_offset({1, 0});
-
-          // Add negative offsets to return iterators to the 'start' of the
-          // circular buffer in shared memory
-          if (smem_write_stage_idx == (Base::kStages - 1)) {
-            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
-            this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
-            smem_write_stage_idx = 0;
-          } else {
-            ++smem_write_stage_idx;
-          }
-
-          if (smem_read_stage_idx == (Base::kStages - 1)) {
-            this->warp_tile_iterator_A_.add_tile_offset(
-                {0, -Base::kStages * Policy::kPartitionsK *
-                        Base::kWarpGemmIterations});
-            this->warp_tile_iterator_B_.add_tile_offset(
-                {-Base::kStages * Policy::kPartitionsK *
-                     Base::kWarpGemmIterations,
-                 0});
-            smem_read_stage_idx = 0;
-          } else {
-            ++smem_read_stage_idx;
-          }
-
-          --gemm_k_iterations;
-        }
-      }
-
-    }
-
-    // Insert fence and wait for all outstanding cp.async operations to commit.
-    cutlass::arch::cp_async_fence();
-    cutlass::arch::cp_async_wait<0>();
-    __syncthreads();
-
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace gemm
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/predicated_scale_bias_vector_access_iterator.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/predicated_scale_bias_vector_access_iterator.h
deleted file mode 100755
index bfe9a3981..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/predicated_scale_bias_vector_access_iterator.h
+++ /dev/null
@@ -1,470 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Templates calculating the address and predicates to the load of scale and bias vectors.
-
-    This iterator uses masks to guard out-of-bounds accesses.
-
-    A precomputed "Params" object minimizes the amount of state that must be
-   stored in registers, and integer addition is used to advance the pointer
-   through memory.
-*/
-
-#pragma once
-
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/conv/threadblock/conv2d_params.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// PredicatedScaleBiasVectorAccessIterator
-///
-template <typename ThreadblockShape,
-          typename Element,
-          typename Layout>
-class PredicatedScaleBiasVectorAccessIterator;
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileAccessIterator for fprop pitch-linear data.
-///
-template <typename ThreadblockShape_, typename Element_>
-class PredicatedScaleBiasVectorAccessIterator<ThreadblockShape_,
-                                              Element_,
-                                              layout::PitchLinear> {
- public:
-
-  using ThreadblockShape = ThreadblockShape_;
-  using Element = Element_;
-  using Layout = layout::PitchLinear;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ConstPointer = const Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  static int const kElementsPerAccess = 128 / sizeof_bits<Element>::value;
-  static int const kThreads = ThreadblockShape::kContiguous / kElementsPerAccess;
-
-  using AccessType = AlignedArray<Element, kElementsPerAccess>;
-
-  using Params = PredicatedScaleBiasVectorAccessIteratorParams;
-
- private:
-  /// Internal pointer type permits fast address arithmetic
-  using BytePointer = char *;
-
- private:
-  //
-  // Data members
-  //
-
-  /// Parameters object with precomputed internal state
-  Params const &params_;
-
-  /// Internal pointer to first access of tile
-  BytePointer pointer_;
-
-  int problem_size_trs;
-  int problem_size_c;
-  int filter_trs_;
-
-  TensorCoord thread_offset_;
-
- public:
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedScaleBiasVectorAccessIterator(
-      /// Precomputed parameters object
-      Params const &params,
-      /// Extent of tensor
-      Conv2dProblemSize const &problem_size,
-      /// Pointer to the start of the scale vector
-      ConstPointer scale_pointer,
-      /// Pointer to the start of the bias vector
-      ConstPointer bias_pointer,
-      /// ID of each participating thread
-      int thread_id,
-      /// Initial offset of threadblock
-      TensorCoord const &threadblock_offset)
-      : params_(params),
-        problem_size_trs(problem_size.R * problem_size.S),
-        problem_size_c(problem_size.C),
-        filter_trs_(0) {
-    pointer_ = (thread_id < kThreads)
-                   ? reinterpret_cast<BytePointer>(
-                         const_cast<NonConstPointer>(scale_pointer))
-                   : reinterpret_cast<BytePointer>(
-                         const_cast<NonConstPointer>(bias_pointer));
-
-    // Per-thread offset in logical coordinates of tensor
-    int thread_base = (thread_id < kThreads) ? 0 : kThreads;
-
-    thread_offset_ =
-        threadblock_offset +
-        TensorCoord((thread_id - thread_base) * kElementsPerAccess, 0);
-
-    set_iteration_index(0);
-  }
-
-  CUTLASS_HOST_DEVICE
-  PredicatedScaleBiasVectorAccessIterator(
-      /// Precomputed parameters object
-      Params const &params,
-      /// Extent of tensor
-      Conv3dProblemSize const &problem_size,
-      /// Pointer to the start of the scale vector
-      ConstPointer scale_pointer,
-      /// Pointer to the start of the bias vector
-      ConstPointer bias_pointer,
-      /// ID of each participating thread
-      int thread_id,
-      /// Initial offset of threadblock
-      TensorCoord const &threadblock_offset)
-      : params_(params),
-        problem_size_trs(problem_size.T * problem_size.R * problem_size.S),
-        problem_size_c(problem_size.C),
-        filter_trs_(0) {
-    pointer_ = (thread_id < kThreads)
-                   ? reinterpret_cast<BytePointer>(
-                         const_cast<NonConstPointer>(scale_pointer))
-                   : reinterpret_cast<BytePointer>(
-                         const_cast<NonConstPointer>(bias_pointer));
-
-    // Per-thread offset in logical coordinates of tensor
-    int thread_base = (thread_id < kThreads) ? 0 : kThreads;
-
-    thread_offset_ =
-        threadblock_offset +
-        TensorCoord((thread_id - thread_base) * kElementsPerAccess, 0);
-
-    set_iteration_index(0);
-  }
-
-  /// Construct a PredicatedTileAccessIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  PredicatedScaleBiasVectorAccessIterator(
-      /// Precomputed parameters object
-      Params const &params,
-      /// Extent of tensor
-      Conv2dProblemSize const &problem_size,
-      /// Pointer to start of scale vector
-      ConstPointer scale_pointer,
-      /// Pointer to start of scale vector
-      ConstPointer bias_pointer,
-      ///< ID of each participating thread
-      int thread_id)
-      : PredicatedScaleBiasVectorAccessIterator(params, problem_size,
-                                                scale_pointer, bias_pointer,
-                                                thread_id, make_Coord(0, 0)) {}
-
-  CUTLASS_HOST_DEVICE
-  PredicatedScaleBiasVectorAccessIterator(
-      /// Precomputed parameters object
-      Params const &params,
-      /// Extent of tensor
-      Conv3dProblemSize const &problem_size,
-      /// Pointer to start of scale vector
-      ConstPointer scale_pointer,
-      /// Pointer to start of scale vector
-      ConstPointer bias_pointer,
-      ///< ID of each participating thread
-      int thread_id)
-      : PredicatedScaleBiasVectorAccessIterator(params, problem_size,
-                                                scale_pointer, bias_pointer,
-                                                thread_id, make_Coord(0, 0)) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) {}
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole threadblock tiles
-  CUTLASS_DEVICE
-  void add_tile_offset(
-      TensorCoord const &tile_offset) {
-    thread_offset_ =
-        thread_offset_ +
-        TensorCoord(ThreadblockShape::kContiguous * tile_offset.contiguous(), 0);
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-
-    return reinterpret_cast<AccessType *>(
-        pointer_ +
-        (thread_offset_.contiguous() * sizeof_bits<Element>::value / 8));
-  }
-
-  /// Increment and return an instance to self.
-  CUTLASS_HOST_DEVICE
-  PredicatedScaleBiasVectorAccessIterator &operator++() {
-    return *this;
-  }
-
-  /// Increment and return an instance to self.
-  CUTLASS_HOST_DEVICE
-  void advance() {
-    // moves to the next tile
-    ++filter_trs_;
-    if (filter_trs_ == problem_size_trs) {
-      filter_trs_ = 0;
-      add_tile_offset(TensorCoord(1, 0));
-    }
-  }
-
-  /// Increment and return an instance to self.
-  CUTLASS_DEVICE
-  PredicatedScaleBiasVectorAccessIterator operator++(int) {
-    PredicatedScaleBiasVectorAccessIterator self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Returns whether access is valid or not
-  CUTLASS_HOST_DEVICE
-  bool valid() {
-    uint32_t enabled = 0;
-
-#if defined(_MSC_VER) || (__CUDACC_VER_MAJOR__ < 11)
-    enabled = threadIdx.x < kThreads * 2;
-#else
-    asm volatile(
-        "{\n"
-        "  .reg .u32 tid_reg;\n"
-        "  .reg .pred p;\n"
-        "  mov.u32 tid_reg, %%tid.x;\n"
-        "  setp.lt.u32 p, tid_reg, %1;\n"
-        "  selp.u32 %0, 1, 0, p;\n"
-        "}\n" : "+r"(enabled) :"n"(kThreads * 2));
-#endif
-
-    return ((thread_offset_.contiguous() < problem_size_c) && enabled);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileAccessIterator for row-major data.
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <typename ThreadblockShape_,
-          typename Element_>
-class PredicatedScaleBiasVectorAccessIterator<ThreadblockShape_,
-                                        Element_,
-                                        layout::RowMajor> {
- public:
-
-  using ThreadblockShape = ThreadblockShape_;
-  using Element = Element_;
-  using Layout = layout::RowMajor;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ConstPointer = const Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  using UnderlyingIterator = PredicatedScaleBiasVectorAccessIterator<
-      layout::PitchLinearShape<ThreadblockShape::kColumn, ThreadblockShape::kRow>,
-      Element,
-      layout::PitchLinear>;
-
-  using AccessType = typename UnderlyingIterator::AccessType;
-  static int const kElementsPerAccess = UnderlyingIterator::kElementsPerAccess;
-
-  using Params = PredicatedScaleBiasVectorAccessIteratorParams;
-
- private:
-  //
-  // Data members
-  //
-
-  /// Underlying pitch-linear tile iterator
-  UnderlyingIterator iterator_;
-
- public:
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedScaleBiasVectorAccessIterator(
-      ///< Precomputed parameters object
-      Params const &params,
-      ///< Extent of tensor
-      Conv2dProblemSize const &problem_size,
-      ///< Pointer to the start of the scale vector
-      ConstPointer scale_pointer,
-      ///< Pointer to the start of the bias vector
-      ConstPointer bias_pointer,
-      ///< ID of each participating thread
-      int thread_id,
-      ///< Initial offset of threadblock
-      TensorCoord const &threadblock_offset)
-      : iterator_(params, problem_size, scale_pointer, bias_pointer,
-                  thread_id,
-                  layout::PitchLinearCoord(threadblock_offset.column(),
-                                           threadblock_offset.row())) {}
-
-  CUTLASS_HOST_DEVICE
-  PredicatedScaleBiasVectorAccessIterator(
-      ///< Precomputed parameters object
-      Params const &params,
-      ///< Extent of tensor
-      Conv3dProblemSize const &problem_size,
-      ///< Pointer to the start of the scale vector
-      ConstPointer scale_pointer,
-      ///< Pointer to the start of the bias vector
-      ConstPointer bias_pointer,
-      ///< ID of each participating thread
-      int thread_id,
-      ///< Initial offset of threadblock
-      TensorCoord const &threadblock_offset)
-      : iterator_(params, problem_size, scale_pointer, bias_pointer,
-                  thread_id,
-                  layout::PitchLinearCoord(threadblock_offset.column(),
-                                           threadblock_offset.row())) {}
-
-  /// Construct a PredicatedTileAccessIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  PredicatedScaleBiasVectorAccessIterator(
-      Params const &params,                   ///< Precomputed parameters object
-      Conv2dProblemSize const &problem_size,  ///< Extent of tensor
-      ConstPointer scale_pointer,  ///< Pointer to the start of the scale vector
-      ConstPointer bias_pointer,   ///< Pointer to the start of the bias vector
-      int thread_id                ///< ID of each participating thread
-      )
-      : PredicatedScaleBiasVectorAccessIterator(params, problem_size,
-                                                scale_pointer, bias_pointer,
-                                                thread_id, make_Coord(0, 0)) {}
-
-  CUTLASS_HOST_DEVICE
-  PredicatedScaleBiasVectorAccessIterator(
-      Params const &params,                   ///< Precomputed parameters object
-      Conv3dProblemSize const &problem_size,  ///< Extent of tensor
-      ConstPointer scale_pointer,  ///< Pointer to the start of the scale vector
-      ConstPointer bias_pointer,   ///< Pointer to the start of the bias vector
-      int thread_id                ///< ID of each participating thread
-      )
-      : PredicatedScaleBiasVectorAccessIterator(params, problem_size,
-                                                scale_pointer, bias_pointer,
-                                                thread_id, make_Coord(0, 0)) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole
-  /// threadblock tiles
-  CUTLASS_HOST_DEVICE
-  void add_tile_offset(TensorCoord const &tile_offset) {
-    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return reinterpret_cast<AccessType *>(iterator_.get());
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedScaleBiasVectorAccessIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedScaleBiasVectorAccessIterator operator++(int) {
-    PredicatedScaleBiasVectorAccessIterator self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Increment and return an instance to self.
-  CUTLASS_HOST_DEVICE
-  void advance() {
-    iterator_.advance();
-  }
-
-  /// Returns whether access is valid or not
-  CUTLASS_HOST_DEVICE
-  bool valid() {
-    return iterator_.valid();
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace conv 
-}  // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/predicated_scale_bias_vector_iterator.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/predicated_scale_bias_vector_iterator.h
deleted file mode 100755
index 24f0de4c2..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/predicated_scale_bias_vector_iterator.h
+++ /dev/null
@@ -1,371 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Templates calculating the address and predicates to the load of scale and bias vectors.
-
-    This iterator uses masks to guard out-of-bounds accesses.
-
-    A precomputed "Params" object minimizes the amount of state that must be
-   stored in registers, and integer addition is used to advance the pointer
-   through memory.
-*/
-
-#pragma once
-
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// PredicatedScaleBiasVectorIterator
-///
-template <typename WarpShape,
-          typename Element,
-          typename Layout>
-class PredicatedScaleBiasVectorIterator;
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileIterator for wgrad pitch-linear data.
-///
-template <typename WarpShape_, typename Element_>
-class PredicatedScaleBiasVectorIterator<WarpShape_,
-                                        Element_,
-                                        layout::PitchLinear> {
- public:
-
-  using WarpShape = WarpShape_;
-  using Element = Element_;
-  using Layout = layout::PitchLinear;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ConstPointer = const Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  static int const kElementsPerAccess = 1;
-
-  using AccessType = AlignedArray<Element, kElementsPerAccess>;
-
-  static int const kIterations = WarpShape::kContiguous / 8;
-
-  /// Fragment object to be loaded or stored
-  using Fragment = cutlass::Array<__half2, 2 * kIterations * kElementsPerAccess>;
-
-  /// Parameters object is precomputed state and is host-constructible
-  using Params = Conv2dWgradActivationIteratorOptimizedParams;
-
- private:
-  //
-  // Data members
-  //
-
-  /// Parameters object with precomputed internal state
-  Params const &params_;
-
-  /// Internal pointer to first access of tile
-  ConstPointer scale_pointer_;
-  ConstPointer bias_pointer_;
-
-  /// Size of tensor
-  Conv2dProblemSize problem_size_;
-
-  int32_t thread_offset_;
-
-  // Channel dimension in contiguous dimension stays constant for each gemm_iteration_k
-  int32_t filter_c_[kIterations];
-
- public:
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedScaleBiasVectorIterator(
-      /// Precomputed parameters object
-      Params const &params,
-      /// Extent of tensor
-      Conv2dProblemSize const &problem_size,
-      /// Pointer to the start of the scale vector
-      ConstPointer scale_pointer,
-      /// Pointer to the start of the bias vector
-      ConstPointer bias_pointer,
-      /// ID of each participating thread
-      int thread_id,
-      /// Initial offset of threadblock
-      TensorCoord const &threadblock_offset)
-      : params_(params),
-        problem_size_(problem_size),
-        scale_pointer_(scale_pointer),
-        bias_pointer_(bias_pointer) {
-
-    thread_offset_ = threadblock_offset.contiguous() + (thread_id % 32) / 4;
-  }
-
-  /// Construct a PredicatedTileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  PredicatedScaleBiasVectorIterator(
-      /// Precomputed parameters object
-      Params const &params,
-      /// Extent of tensor
-      Conv2dProblemSize const &problem_size,
-      /// Pointer to start of scale vector
-      ConstPointer scale_pointer,
-      /// Pointer to start of scale vector
-      ConstPointer bias_pointer,
-      ///< ID of each participating thread
-      int thread_id)
-      : PredicatedScaleBiasVectorIterator(params, problem_size,
-                                          scale_pointer, bias_pointer,
-                                          thread_id, make_Coord(0, 0)) {}
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole warp tiles
-  CUTLASS_DEVICE
-  void add_tile_offset(
-      TensorCoord const &tile_offset) {
-
-    thread_offset_ += (WarpShape::kContiguous * tile_offset.contiguous());
-
-    CUTLASS_PRAGMA_UNROLL
-    for(int c = 0; c < kIterations; ++c) {
-      int rsc_offset = thread_offset_ + c * 8;
-
-      int residual, tmp;
-      params_.sc_divmod(tmp, residual, rsc_offset);
-      params_.c_divmod(tmp, filter_c_[c], residual);
-    }
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-
-    frag.fill(__float2half2_rn(0.0f));
-    __half2 *frag_ptr = reinterpret_cast<__half2 *>(&frag);
-
-    // load scale
-    CUTLASS_PRAGMA_UNROLL
-    for (int c = 0; c < kIterations; ++c) {
-
-      cutlass::arch::global_load<
-        __half,
-        sizeof(AccessType)
-      >(
-        frag_ptr[c * 2].x,
-        scale_pointer_ + filter_c_[c],
-        true
-      );
-    }
-
-    // load bias
-    CUTLASS_PRAGMA_UNROLL
-    for (int c = 0; c < kIterations; ++c) {
-
-      cutlass::arch::global_load<
-        __half,
-        sizeof(AccessType)
-      >(
-        frag_ptr[c * 2 + 1].x,
-        bias_pointer_ + filter_c_[c],
-        true 
-      );
-    }
-
-    // duplicate scale
-    CUTLASS_PRAGMA_UNROLL
-    for (int c = 0; c < kIterations; ++c) {
-      frag_ptr[c * 2].y = frag_ptr[c * 2].x;
-    }
-
-    // duplicate bias
-    CUTLASS_PRAGMA_UNROLL
-    for (int c = 0; c < kIterations; ++c) {
-      frag_ptr[c * 2 + 1].y = frag_ptr[c * 2 + 1].x;
-    }
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) {
-    load_with_pointer_offset(frag, 0);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileIterator for row-major data.
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <typename WarpShape_,
-          typename Element_>
-class PredicatedScaleBiasVectorIterator<WarpShape_,
-                                        Element_,
-                                        layout::RowMajor> {
- public:
-
-  using WarpShape = WarpShape_;
-  using Element = Element_;
-  using Layout = layout::RowMajor;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ConstPointer = const Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  using UnderlyingIterator = PredicatedScaleBiasVectorIterator<
-      layout::PitchLinearShape<WarpShape::kColumn, WarpShape::kRow>,
-      Element,
-      layout::PitchLinear>;
-
-  using AccessType = typename UnderlyingIterator::AccessType;
-  static int const kElementsPerAccess = UnderlyingIterator::kElementsPerAccess;
-  using Fragment = typename UnderlyingIterator::Fragment;
-
-  /// Parameters object is precomputed state and is host-constructible
-  class Params {
-   private:
-    friend PredicatedScaleBiasVectorIterator;
-
-    /// Parameters object
-    typename UnderlyingIterator::Params params_;
-
-   public:
-
-    /// Default ctor
-    CUTLASS_HOST_DEVICE
-    Params() { }
-
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Conv2dProblemSize const &problem_size, Layout const &layout)
-        : params_(problem_size, layout::TensorNHWC(0, 0, 0)){};
-  };
-
- private:
-  //
-  // Data members
-  //
-
-  /// Underlying pitch-linear tile iterator
-  UnderlyingIterator iterator_;
-
- public:
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedScaleBiasVectorIterator(
-      ///< Precomputed parameters object
-      Params const &params,
-      ///< Extent of tensor
-      Conv2dProblemSize const &problem_size,
-      ///< Pointer to the start of the scale vector
-      ConstPointer scale_pointer,
-      ///< Pointer to the start of the bias vector
-      ConstPointer bias_pointer,
-      ///< ID of each participating thread
-      int thread_id,
-      ///< Initial offset of threadblock
-      TensorCoord const &threadblock_offset)
-      : iterator_(params.params_, problem_size, scale_pointer, bias_pointer,
-                  thread_id,
-                  layout::PitchLinearCoord(threadblock_offset.column(),
-                                           threadblock_offset.row())) {}
-
-  /// Construct a PredicatedTileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  PredicatedScaleBiasVectorIterator(
-      Params const &params,                   ///< Precomputed parameters object
-      Conv2dProblemSize const &problem_size,  ///< Extent of tensor
-      ConstPointer scale_pointer,  ///< Pointer to the start of the scale vector
-      ConstPointer bias_pointer,   ///< Pointer to the start of the bias vector
-      int thread_id                ///< ID of each participating thread
-      )
-      : PredicatedScaleBiasVectorIterator(params, problem_size,
-                                          scale_pointer, bias_pointer,
-                                          thread_id, make_Coord(0, 0)) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole
-  /// threadblock tiles
-  CUTLASS_HOST_DEVICE
-  void add_tile_offset(TensorCoord const &tile_offset) {
-    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) {
-    iterator_.load(frag);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace conv 
-}  // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/threadblock_swizzle.h b/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/threadblock_swizzle.h
deleted file mode 100755
index 67418e689..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/threadblock/threadblock_swizzle.h
+++ /dev/null
@@ -1,193 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Implements several possible threadblock-swizzling functions mapping blockIdx to 
-      Convolution problems.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/platform/platform.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-CUTLASS_HOST_DEVICE
-static int get_strided_dgrad_tile_m(
-  cutlass::conv::Conv2dProblemSize const &problem_size,
-  int tile_size_m) {
-
-  // CTAs in M dimension per starting filter position
-  int tile_m_per_filter = strided_dgrad_tile_m_per_filter(problem_size, tile_size_m);
-
-  // Inflate number of CTAs in M dimension to cover every strating filter position even those that
-  // may fall out of valid MMA (Dy * w) but are needed to apply epilogue (beta * Dx_source) 
-  // and point-wise fusion
-  int tile_m = tile_m_per_filter * int(problem_size.stride().product());
-
-  // There is a possible performance optimization here that leads up to 2x speeds than the current 
-  // CUTLASS strided dgrad performance for stride > filter, i.e., stride={2x2} and filter={1x1})
-  //
-  // * Optimization * 
-  // Only launch CTAs in M dimension which contribute to a row in Dx output
-  // 
-  // 
-  // * Constraints *
-  // (A) stride <= filter, for example, stride={2x2} and filter={3x3}: 
-  //       - (A.1): There are no constraints for this case and the optimization does 
-  //                affect this case functionality or performance. 
-  // (B) stride > filter, for example, stride={2x2} and filter={1x1}: 
-  //       - (B.1): Dx output tensor should be zero initialized
-  //       - (B.2): The kernel epilogue cannot apply beta. Thus, beta should be zero 
-
-  return tile_m;
-}
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Threadblock swizzling function for strided dgrad convolution
-struct StridedDgradHorizontalThreadblockSwizzle : 
-  public gemm::threadblock::GemmHorizontalThreadblockSwizzle {
-
-  using Base = gemm::threadblock::GemmHorizontalThreadblockSwizzle;
-
-  CUTLASS_HOST_DEVICE
-  StridedDgradHorizontalThreadblockSwizzle() { }
-
-  /// Returns the shape of the problem in units of logical tiles
-  /// For ImplicitGemmConvolution Conv2d problem size: conv_operator(NPQK, NHWC, KRSC)
-  CUTLASS_HOST_DEVICE
-  static gemm::GemmCoord get_tiled_shape(
-    cutlass::conv::Operator conv_operator,
-    cutlass::conv::Conv2dProblemSize const &problem_size,
-    gemm::GemmCoord tile_size,
-    int split_k_slices) {
-
-    gemm::GemmCoord implicit_gemm_problem_size = 
-    cutlass::conv::implicit_gemm_problem_size(conv_operator, problem_size);
-
-    // compute number of tiles in m dimension
-    int tile_m = get_strided_dgrad_tile_m(problem_size, tile_size.m());
-
-    // compute number of tiles in n dimension 
-    int tile_n = (implicit_gemm_problem_size.n() + tile_size.n() - 1) / tile_size.n();
-
-    return gemm::GemmCoord(
-      tile_m,
-      tile_n,
-      split_k_slices);
-  }
-
-  /// Returns the shape of the problem in units of logical tiles
-  /// For GEMM problem size (MxNxK) (Do not use base class get_tiled_shape())
-  private:
-    using Base::get_tiled_shape;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Threadblock swizzling function for strided dgrad convolution
-template <int N = 1>
-struct StridedDgradIdentityThreadblockSwizzle : 
-  public gemm::threadblock::GemmIdentityThreadblockSwizzle<N> {
-
-  using Base = gemm::threadblock::GemmIdentityThreadblockSwizzle<N>;
-
-  CUTLASS_HOST_DEVICE
-  StridedDgradIdentityThreadblockSwizzle() { }
-
-  /// Returns the shape of the problem in units of logical tiles
-  /// For ImplicitGemmConvolution Conv2d problem size: conv_operator(NPQK, NHWC, KRSC)
-  CUTLASS_HOST_DEVICE
-  static gemm::GemmCoord get_tiled_shape(
-    cutlass::conv::Operator conv_operator,
-    cutlass::conv::Conv2dProblemSize const &problem_size,
-    gemm::GemmCoord tile_size,
-    int split_k_slices) {
-
-    gemm::GemmCoord implicit_gemm_problem_size = 
-    cutlass::conv::implicit_gemm_problem_size(conv_operator, problem_size);
-
-    // compute number of tiles in m dimension
-    int tile_m = get_strided_dgrad_tile_m(problem_size, tile_size.m());
-
-    // compute number of tiles in n dimension 
-    int tile_n = (implicit_gemm_problem_size.n() + tile_size.n() - 1) / tile_size.n();
-
-    return gemm::GemmCoord(
-      tile_m,
-      tile_n,
-      split_k_slices);
-  }
-
-  /// Returns the shape of the problem in units of logical tiles
-  /// For GEMM problem size (MxNxK) (Do not use base class get_tiled_shape())
-  private:
-    using Base::get_tiled_shape;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Threadblock swizzling function for GEMMs
-template <int N = 1, int Output_N = 1, int Output_P = 1, int Output_Q = 1>
-struct DepthwiseDirect2dConvIdentityThreadblockSwizzle
-    : public gemm::threadblock::GemmIdentityThreadblockSwizzle<N> {
-  CUTLASS_HOST_DEVICE
-  DepthwiseDirect2dConvIdentityThreadblockSwizzle() {}
-
-  /// Returns the shape of the problem in units of logical tiles
-  CUTLASS_HOST_DEVICE
-  static gemm::GemmCoord get_tiled_shape(cutlass::conv::Operator conv_operator,
-                            cutlass::conv::Conv2dProblemSize const &problem_size,
-                            gemm::GemmCoord tile_size,
-                            int split_k_slices) {
-        
-    gemm::GemmCoord implicit_gemm_problem_size =
-        cutlass::conv::implicit_gemm_problem_size(conv_operator, problem_size);
-
-    return gemm::GemmCoord(1,
-                     (implicit_gemm_problem_size.n() + tile_size.n() - 1) / tile_size.n(),
-                     split_k_slices);
-  }
-};
-
-} // namespace threadblock
-} // namespace conv
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/warp/mma_depthwise_simt.h b/lightllm-kernel/cutlass/include/cutlass/conv/warp/mma_depthwise_simt.h
deleted file mode 100755
index ed385df03..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/warp/mma_depthwise_simt.h
+++ /dev/null
@@ -1,380 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing warp-level matrix multiply-accumulate operations.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/warp/mma.h"
-
-#include "cutlass/gemm/thread/mma.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/thread/depthwise_mma.h"
-
-
-#include "cutlass/gemm/warp/mma_simt_tile_iterator.h"
-#include "cutlass/gemm/warp/mma_simt_policy.h"
-
-#include "cutlass/gemm/warp/mma_simt.h"
-#include "cutlass/conv/warp/mma_depthwise_simt_tile_iterator.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace warp {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions.
-template <
-    /// Size of the Gemm problem - concept: gemm::GemmShape<>
-    typename Shape_,
-    /// Data type of A elements
-    typename ElementA_,
-    /// Layout of A matrix (concept: MatrixLayout)
-    typename LayoutA_,
-    /// Data type of B elements
-    typename ElementB_,
-    /// Layout of B matrix (concept: MatrixLayout)
-    typename LayoutB_,
-    /// Element type of C matrix
-    typename ElementC_,
-    /// Layout of C matrix (concept: MatrixLayout)
-    typename LayoutC_,
-    /// Shape of the warp in units of thread (concept: MmaSimtPolicy)
-    typename Policy_,
-    /// Number of partitions along K dimension
-    int PartitionsK = 1,
-    /// Complex transformation on operand A
-    ComplexTransform TransformA = ComplexTransform::kNone,
-    /// Complex transformation on operand B
-    ComplexTransform TransformB = ComplexTransform::kNone,
-    /// Used for partial specialization
-    typename Enable = bool>
-class MmaDepthwiseSimt
-    : public cutlass::gemm::warp::
-          MmaSimt<Shape_, ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, Policy_> {
-  using Base = cutlass::gemm::warp::
-      MmaSimt<Shape_, ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, Policy_>;
-      
-public:
-  /// Shape of warp-level matrix operation (concept: GemmShape)
-  using Shape = Shape_;
-
-  /// Data type of multiplicand A
-  using ElementA = ElementA_;
-
-  /// Layout of multiplicand A
-  using LayoutA = LayoutA_;
-
-  /// Data type of multiplicand B
-  using ElementB = ElementB_;
-
-  /// Layout of multiplicand B
-  using LayoutB = LayoutB_;
-
-  /// Data type of accumulator matrix C
-  using ElementC = ElementC_;
-
-  /// Layout of accumulator matrix C
-  using LayoutC = LayoutC_;
-
-  /// Shape of the warp in units of thread (concept: MmaLanePolicySimt)
-  using Policy = Policy_;
-
-  /// Indicates class of matrix operator
-  using OperatorClass = arch::OpClassSimt;
-
-  /// Hard-coded for now
-  using ArchTag = arch::Sm50;
-
-  /// Complex transform on A operand
-  static ComplexTransform const kTransformA = TransformA;
-
-  /// Complex transform on B operand
-  static ComplexTransform const kTransformB = TransformB;
-
-public:
-
-  /// Iterates over the B operand in memory
-  using IteratorB = cutlass::conv::warp::DepthwiseMmaSimtTileIterator<
-    MatrixShape<Policy::LaneMmaShape::kK, Shape::kN>,
-    cutlass::gemm::Operand::kB,
-    ElementB,
-    LayoutB,
-    Policy,
-    PartitionsK,
-    Shape::kK
-  >;
-
-  /// Storage for B tile
-  using FragmentB = typename IteratorB::Fragment;
-
-  /// Storage for transformed A tile
-  using TransformedFragmentB = FragmentB;
-
-public:
-
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_DEVICE
-  MmaDepthwiseSimt():Base() {}
-};
-
-/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions.
-template <
-    /// Size of the Gemm problem - concept: gemm::GemmShape<>
-    typename Shape_,
-    /// Shape of filter shape per threadblock - concept: gemm::GemmShape<Depth, Height, Width>
-    typename FilterShape_,
-    /// Shape of the output tile computed by thread- concept: conv::TensorNHWCShape<>
-    typename ThreadOutputShape_,
-    /// Shape of the output tile computed by threadblock - concept: conv::TensorNHWCShape<>
-    typename ThreadBlockOutputShape_,
-    /// Data type of A elements
-    typename ElementA_,
-    /// Layout of A matrix (concept: MatrixLayout)
-    typename LayoutA_,
-    /// Data type of B elements
-    typename ElementB_,
-    /// Layout of B matrix (concept: MatrixLayout)
-    typename LayoutB_,
-    /// Element type of C matrix
-    typename ElementC_,
-    /// Layout of C matrix (concept: MatrixLayout)
-    typename LayoutC_,
-    /// Shape of the warp in units of thread (concept: MmaSimtPolicy)
-    typename Policy_,
-    /// Iterator algo type
-    conv::IteratorAlgorithm IteratorAlgorithm_ = IteratorAlgorithm::kAnalytic,
-    /// Stride ( MatrixShape<Height, Width> )
-    typename StrideShape_ = cutlass::MatrixShape<-1, -1>,   
-    /// Dilation ( MatrixShape<Height, Width> )
-    typename DilationShape_ =  cutlass::MatrixShape<-1, -1>,
-    /// Activation Shape loaded by threadblock
-    typename ActivationShape_ = cutlass::conv::TensorNHWCShape<-1,-1,-1,-1>,
-    /// Number of partitions along K dimension
-    int PartitionsK = 1,
-    /// Complex transformation on operand A
-    ComplexTransform TransformA = ComplexTransform::kNone,
-    /// Complex transformation on operand B
-    ComplexTransform TransformB = ComplexTransform::kNone,
-    /// Used for partial specialization
-    typename Enable = bool>
-class MmaDepthwiseDirectConvSimt {
- public:
-  /// Shape of warp-level matrix operation (concept: GemmShape)
-  using Shape = Shape_;
-
-  /// Shape of filter shape per threadblock - concept: gemm::GemmShape<Depth, Height, Width>
-  using FilterShape = FilterShape_;
-
-  /// Shape of the output tile computed by thread- concept: conv::TensorNHWCShape<>
-  using ThreadOutputShape = ThreadOutputShape_;
-
-  /// Shape of the output tile computed by threadblock - concept: conv::TensorNHWCShape<>
-  using ThreadBlockOutputShape = ThreadBlockOutputShape_;
-
-  /// Data type of multiplicand A
-  using ElementA = ElementA_;
-
-  /// Layout of multiplicand A
-  using LayoutA = LayoutA_;
-
-  /// Data type of multiplicand B
-  using ElementB = ElementB_;
-
-  /// Layout of multiplicand B
-  using LayoutB = LayoutB_;
-
-  /// Data type of accumulator matrix C
-  using ElementC = ElementC_;
-
-  /// Layout of accumulator matrix C
-  using LayoutC = LayoutC_;
-
-  /// Shape of the warp in units of thread (concept: MmaLanePolicySimt)
-  using Policy = Policy_;
-
-  /// Iterator algo type
-  static conv::IteratorAlgorithm const IteratorAlgorithm = IteratorAlgorithm_;
-
-  /// Stride ( MatrixShape<Height, Width> )
-  using StrideShape = StrideShape_; 
-
-  /// Dilation ( MatrixShape<Height, Width> )
-  using DilationShape = DilationShape_;
-  
-  /// Activation Shape loaded by threadblock
-  using ActivationShape = ActivationShape_;
-
-  /// Indicates class of matrix operator
-  using OperatorClass = arch::OpClassSimt;
-
-  /// Hard-coded for now
-  using ArchTag = arch::Sm50;
-
-  /// Complex transform on A operand
-  static ComplexTransform const kTransformA = TransformA;
-
-  /// Complex transform on B operand
-  static ComplexTransform const kTransformB = TransformB;
-
-  static constexpr bool use_dp4a = (platform::is_same< layout::ColumnMajorInterleaved<4>, LayoutA>::value || 
-                                    platform::is_same< layout::RowMajorInterleaved<4>, LayoutA >::value) && 
-                                    platform::is_same< ElementA, int8_t >::value && 
-                                    platform::is_same< ElementB, int8_t >::value;
-
-  using dp4a_type = typename platform::conditional< use_dp4a , int8_t, bool >::type;
-
-  /// Thread-level matrix multiply accumulate operator
-  using ThreadMma = cutlass::conv::thread::DepthwiseDirectConvElementwiseInnerProduct<
-    cutlass::gemm::GemmShape<
-      Shape::kM / Policy::WarpShape::kRow,    // number of output pixels proccessed per thread
-      Shape::kN / Policy::WarpShape::kColumn, // number of channels proccessed per thread
-      1>,
-    ElementA,
-    ElementB,
-    ElementC,
-    arch::OpMultiplyAdd,
-    dp4a_type
-  >;
-
-  /// Underlying matrix multiply operator (concept: arch::Mma)
-  using ArchMmaOperator = typename ThreadMma::ArchMmaOperator;
-
-  /// Indicates math operator 
-  using MathOperator = typename ArchMmaOperator::Operator;
-  
-  /// Shape of the underlying instruction
-  using InstructionShape = cutlass::gemm::GemmShape<1,1,use_dp4a ? 4 : 1>;
-
-public:
-
-  /// Iterates over the A operand in memory
-  using IteratorA = cutlass::conv::warp::DepthwiseDirect2dConvSimtTileIterator<
-    MatrixShape<Shape::kM, Shape::kN>, // <output tile=(P*Q), output channels> per warp
-    FilterShape,
-    ThreadOutputShape,
-    ThreadBlockOutputShape,
-    cutlass::gemm::Operand::kA,
-    ElementA,
-    Policy,
-    IteratorAlgorithm,
-    StrideShape,
-    DilationShape,
-    ActivationShape,
-    PartitionsK,
-    Shape::kK
-  >;
-
-  /// Storage for A tile
-  using FragmentA = typename IteratorA::Fragment;
-
-  /// Storage for transformed A tile
-  using TransformedFragmentA = FragmentA;
-
-  /// Iterates over the B operand in memory
-  using IteratorB = cutlass::gemm::warp::MmaSimtTileIterator<
-    MatrixShape<1, Shape::kN>,
-    cutlass::gemm::Operand::kB,
-    ElementB,
-    LayoutB,
-    Policy,
-    PartitionsK,
-    Shape::kK
-  >;
-
-  /// Storage for B tile
-  using FragmentB = typename IteratorB::Fragment;
-
-  /// Storage for transformed A tile
-  using TransformedFragmentB = FragmentB;
-
-  /// Iterates over the C operand in memory
-  using IteratorC = cutlass::gemm::warp::MmaSimtTileIterator<
-    MatrixShape<Shape::kM, Shape::kN>,
-    cutlass::gemm::Operand::kC,
-    ElementC,
-    LayoutC,
-    Policy
-  >;
-
-  /// Storage for C tile
-  using FragmentC = typename ThreadMma::FragmentC;
-
-public:
-
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_DEVICE
-  MmaDepthwiseDirectConvSimt() {}
-
-  /// Performs a warp-level matrix multiply-accumulate operation
-  CUTLASS_DEVICE
-  void operator()(
-    FragmentC &d, 
-    FragmentA a, 
-    FragmentB b, 
-    FragmentC const &c, int group_idx = 0) const {
-
-    ThreadMma mma;
-
-    mma(d, a, b, c);
-  }
-
-  /// Transform the mma operands to the required types
-  CUTLASS_DEVICE
-  void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B,
-                 FragmentA const &A, FragmentB const &B) const {
-    dst_A = A;
-    dst_B = B;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace conv
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/warp/mma_depthwise_simt_tile_iterator.h b/lightllm-kernel/cutlass/include/cutlass/conv/warp/mma_depthwise_simt_tile_iterator.h
deleted file mode 100755
index 26d9638ba..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/warp/mma_depthwise_simt_tile_iterator.h
+++ /dev/null
@@ -1,862 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Describes the lane policy used by warp-level matrix multiply operators targeting SIMT
-      instructions
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/matrix_shape.h"
-
-#include "cutlass/conv/convolution.h"
-
-#include "cutlass/arch/memory_sm75.h"
-
-#include "cutlass/layout/matrix.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/warp/mma_simt_policy.h"
-#include "cutlass/gemm/warp/mma_simt_tile_iterator.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace warp {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Iterates over operands to warp-level matrix multiply operations targeting SIMT instructions
-///
-/// concept: MutableRandomAccessContiguousTileIteratorConcept
-///
-template <
-  /// Size of the matrix to load (concept: MatrixShape)
-  typename Shape_,
-  /// Operand identity
-  cutlass::gemm::Operand Operand,
-  /// Data type of A elements
-  typename Element_,
-  /// Layout of operand
-  typename Layout_,
-  /// Shape of the warp in units of thread (concept: MmaSimtPolicy)
-  typename Policy_,
-  /// Number of partitions along K dimension - used in sliced-K
-  int PartitionsK = 1,
-  /// Group Size along kPartition - used in sliced-K
-  int PartitionGroupSize = 1
->
-class DepthwiseMmaSimtTileIterator;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization for B operands of row-major layouts
-///
-/// Concept: MutableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Data type of A elements
-    typename Element_,
-    /// Shape of the warp in units of thread (concept: MmaSimtPolicy)
-    typename Policy_,
-    /// Number of partitions along K dimension
-    int PartitionsK,
-    /// Group Size along kPartition - used in sliced-K
-    int PartitionGroupSize>
-class DepthwiseMmaSimtTileIterator<Shape_,
-                                   cutlass::gemm::Operand::kB,
-                                   Element_,
-                                   layout::RowMajor,
-                                   Policy_,
-                                   PartitionsK,
-                                   PartitionGroupSize>
-    : public cutlass::gemm::warp::MmaSimtTileIterator<Shape_,
-                                               cutlass::gemm::Operand::kB,
-                                               Element_,
-                                               layout::RowMajor,
-                                               Policy_,
-                                               PartitionsK,
-                                               PartitionGroupSize> {
-
-  using Base = cutlass::gemm::warp::MmaSimtTileIterator<Shape_,
-                                               cutlass::gemm::Operand::kB,
-                                               Element_,
-                                               layout::RowMajor,
-                                               Policy_,
-                                               PartitionsK,
-                                               PartitionGroupSize>;
- public:
-  /// Shape of tile to load (concept: MatrixShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static cutlass::gemm::Operand const kOperand = cutlass::gemm::Operand::kB;
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of policy
-  using Layout = layout::RowMajor;
-
-  /// Decomposition of elements among threads
-  using Policy = Policy_;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = typename Base::TensorRef;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Thread-level shape of a fragment
-  using ThreadShape = typename Base::ThreadShape;
-
-  /// Number of individual loads
-  using Iterations =  typename Base::Iterations;
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = typename Base::Fragment;
-
-  static_assert(Policy::LaneMmaShape::kN == 1, "Each thread should be 1 element per LDS along the k-dim");
-  
-private:
-
-  MatrixCoord lane_offset_;
-  int channel_idx_;
-  int base_channel_idx_;
-  int warps_n_;
-
- public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  DepthwiseMmaSimtTileIterator():Base() { }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  DepthwiseMmaSimtTileIterator(
-    TensorRef ref, 
-    int lane_id
-  ) : Base(ref, lane_id) {
-
-    // compute offset based on thread ID and lane layout
-    typename Policy::LaneLayout lane_layout = Policy::get_lane_layout();
-
-    warps_n_ = -1;
-    channel_idx_ = 0;
-    base_channel_idx_ = 0;
-    lane_offset_ = lane_layout.inverse(lane_id) * MatrixCoord(0, Policy::LaneMmaShape::kN);
-  }
-  
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  DepthwiseMmaSimtTileIterator &add_tile_offset(TensorCoord const &coord) {
-
-    if(warps_n_ == -1){
-        warps_n_ = coord.column();
-    }
-    
-    Base::add_tile_offset(coord);
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator. (vector loads)
-  CUTLASS_HOST_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
-    Array<Element, Policy::LaneMmaShape::kN> *dst_ptr =
-        reinterpret_cast<Array<Element, Policy::LaneMmaShape::kN> *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int k = 0; k < Iterations::kRow; ++k) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < Iterations::kColumn; ++n) {
-
-        void const *ptr = this->ref_.data() +
-                          this->ref_.offset({-(channel_idx_ - base_channel_idx_),
-                                             n * Policy::WarpShape::kColumn}) +
-                          pointer_offset / Policy::LaneMmaShape::kN;
-
-        // Base_k of a warp +  Base_k of current threads.
-        int thread_k_base_idx =
-            warps_n_ * Shape::kColumn / Policy::LaneMmaShape::kN + lane_offset_.column();
-
-        if (channel_idx_ + k == thread_k_base_idx + n * Policy::WarpShape::kColumn) {
-          // Depthwise kernel would only do computation when channel == k.
-          // Loads an element when the current computation channel == the k corresponding to this thread.
-          arch::shared_load(dst_ptr[n + k * Iterations::kColumn], ptr);
-        } else {
-          // Reduce SMEM load
-          dst_ptr[n + k * Iterations::kColumn].fill(Element(0));
-        }
-      }
-    }
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-    load_with_pointer_offset(frag, 0);
-  }
-  
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    if(k_group % PartitionGroupSize == 0 && k_group != 0){
-      base_channel_idx_ = k_group;
-    }
-    channel_idx_ = k_group;
-  }
-};
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Size of filter (concept: gemm::GemmShape<Depth, Height, Width>)
-    typename FilterShape_,
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename ThreadOutputShape_,
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename ThreadBlockOutputShape_,
-    /// Operand identity
-    cutlass::gemm::Operand Operand,
-    /// Data type of A elements
-    typename Element_,
-    /// Shape of the warp in units of thread (concept: MmaSimtPolicy)
-    typename Policy_,
-    /// Iterator algo type
-    conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kAnalytic,
-    /// Stride ( MatrixShape<Height, Width> )
-    typename StrideShape = cutlass::MatrixShape<-1, -1>,   
-    /// Dilation ( MatrixShape<Height, Width> )
-    typename DilationShape =  cutlass::MatrixShape<-1, -1>,
-    /// Activation Shape loaded by threadblock
-    typename ActivationShape = cutlass::conv::TensorNHWCShape<-1,-1,-1,-1>,
-    /// Number of partitions along K dimension - used in sliced-K
-    int PartitionsK = 1,
-    /// Group Size along kPartition - used in sliced-K
-    int PartitionGroupSize = 1>
-class DepthwiseDirect2dConvSimtTileIterator;
-
-
-/// Specialization for A operands of row-major layouts
-///
-/// Concept: MutableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Size of filter (concept: gemm::GemmShape<Depth, Height, Width>)
-    typename FilterShape_,
-    /// Size of the matrix to load (concept: TensorNHWC)
-    typename ThreadOutputShape_,
-    /// Size of the matrix to load (concept: TensorNHWC)
-    typename ThreadBlockOutputShape_,
-    /// Data type of A elements
-    typename Element_,
-    /// Shape of the warp in units of thread (concept: MmaSimtPolicy)
-    typename Policy_,
-    /// Iterator algo type
-    conv::IteratorAlgorithm IteratorAlgorithm,
-    /// Stride ( MatrixShape<Height, Width> )
-    typename StrideShape,   
-    /// Dilation ( MatrixShape<Height, Width> )
-    typename DilationShape,
-    /// Activation Shape loaded by threadblock
-    typename ActivationShape,
-    /// Number of partitions along K dimension - used in sliced-K
-    int PartitionsK,
-    /// Group Size along kPartition - used in sliced-K
-    int PartitionGroupSize>
-class DepthwiseDirect2dConvSimtTileIterator<Shape_,
-                                            FilterShape_,
-                                            ThreadOutputShape_,
-                                            ThreadBlockOutputShape_,
-                                            cutlass::gemm::Operand::kA,
-                                            Element_,
-                                            Policy_,
-                                            IteratorAlgorithm,
-                                            StrideShape,   
-                                            DilationShape,
-                                            ActivationShape,
-                                            PartitionsK,
-                                            PartitionGroupSize> {
- public:
-  /// Shape of tile to load (concept: MatrixShape)
-  using Shape = Shape_;
-
-  /// Shape of filter (concept: gemm::GemmShape<Depth, Height, Width>)
-  using FilterShape = FilterShape_;
-
-  /// Shape of tile to load (concept: TensorNHWC)
-  using ThreadOutputShape = ThreadOutputShape_;
-
-  /// Shape of tile to load (concept: TensorNHWC)
-  using ThreadBlockOutputShape = ThreadBlockOutputShape_;
-
-  /// Operand tag
-  static cutlass::gemm::Operand const kOperand = cutlass::gemm::Operand::kA;
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of policy
-  using Layout = layout::RowMajor;
-
-  /// Decomposition of elements among threads
-  using Policy = Policy_;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  //
-  // Derived quantities
-  //
-
-  static_assert(!(Shape::kRow % Policy::WarpShape::kRow), 
-    "The warp-level GEMM M size must be divisible by the number of threads arranged along the M dimension.");
-
-  static_assert(Shape::kRow > 0, "Shape::kRow must be greater than zero.");
-  static_assert(Shape::kColumn > 0, "Shape::kColumn must be greater than zero.");
-  static_assert(Policy::WarpShape::kRow > 0, "Policy::WarpShape::kRow must be greater than zero.");
-  static_assert(Shape::kRow / Policy::WarpShape::kRow > 0, "Shape::kRow / Policy::WarpShape::kRow must be greater than zero.");
-
-// Thread-level shape of a fragment
-  using ThreadShape = MatrixShape<
-    ThreadOutputShape::kNHW, // Output tile shape Computed by current threads
-    ThreadOutputShape::kC
-  >;
-
-  static_assert(!(ThreadShape::kColumn % Policy::LaneMmaShape::kN), 
-    "Thread-level GEMM must be divisible by Policy::LaneMmaShape.");
-
-  /// Number of individual loads
-  using Iterations = MatrixShape<
-    ThreadShape::kRow,
-    ThreadShape::kColumn / Policy::LaneMmaShape::kN
-  >;
-
-  using ThreadTileCount = MatrixShape<
-    ThreadBlockOutputShape::kH / ThreadOutputShape::kH,
-    ThreadBlockOutputShape::kW / ThreadOutputShape::kW
-  >;
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = Array<Element, ThreadShape::kCount>;
-
-protected:
-
-  /// Internal reference
-  cutlass::TensorRef<Array<Element, Policy::LaneMmaShape::kN>, layout::RowMajor> ref_;
-
-  int activation_offset[ThreadOutputShape::kH][ThreadOutputShape::kW][Iterations::kColumn];
-  int iterator_r_;
-  int iterator_s_;
-  int iterator_offset_;
-
-  int inc_next_s_ ;
-  int inc_next_r_ ;
-  
-  MatrixCoord lane_offset_;
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  DepthwiseDirect2dConvSimtTileIterator() { }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  DepthwiseDirect2dConvSimtTileIterator(
-    TensorRef ref, 
-    int lane_id
-  ) {
-
-    // compute offset based on thread ID and lane layout
-    typename Policy::LaneLayout lane_layout = Policy::get_lane_layout();
-
-    // Set channel offset
-    lane_offset_ = lane_layout.inverse(lane_id) * MatrixCoord(0, Policy::LaneMmaShape::kN);
-
-    ref.add_coord_offset(lane_offset_);
-
-    ref_.reset(reinterpret_cast<Array<Element, Policy::LaneMmaShape::kN> *>(ref.data()),
-               ref.stride(0) / Policy::LaneMmaShape::kN);
-
-    iterator_r_ = 0;
-    iterator_s_ = 0;
-    iterator_offset_ = 0;
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  DepthwiseDirect2dConvSimtTileIterator &add_pointer_offset(LongIndex offset) {
-    ref_.add_pointer_offset(offset);
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  template<typename Params>
-  CUTLASS_HOST_DEVICE
-  void setup_initial_status(Params const& params)  {
-  
-    inc_next_s_ = params.inc_next[0];
-    inc_next_r_ = params.inc_next[1];
-
-    // Get base HW offset of current threads
-    int threadgroup = threadIdx.x / (ThreadBlockOutputShape::kC / ThreadOutputShape::kC);
-    int base_p_ =
-        (threadgroup / (ThreadTileCount::kColumn)) * ThreadOutputShape::kH;
-    int base_q_ =
-        (threadgroup % (ThreadTileCount::kColumn)) * ThreadOutputShape::kW;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int p = 0; p < ThreadOutputShape::kH; ++p) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int q = 0; q < ThreadOutputShape::kW; ++q) {
-        CUTLASS_PRAGMA_UNROLL
-        for (int col = 0; col < Iterations::kColumn; ++col) {
-          int base_w = (base_q_ + q) * params.stride[0];
-          int base_h = (base_p_ + p) * params.stride[1];
-
-          int offset = base_h * params.activation_tile_w + base_w;
-          activation_offset[p][q][col] = offset;
-        }
-      }
-    }
-  }
-
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  DepthwiseDirect2dConvSimtTileIterator &add_tile_offset(TensorCoord const &coord) {
-    // Set warp row and col start
-    lane_offset_ = MatrixCoord({lane_offset_.row() + coord.row() * Shape::kRow, lane_offset_.column()});
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  void advance(int32_t pointer_offset) {
-    ref_.reset(ref_.data() + pointer_offset / sizeof(Element) / Policy::LaneMmaShape::kN);
-    iterator_s_ = 0;
-    iterator_r_ = 0;
-    iterator_offset_ = 0;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  DepthwiseDirect2dConvSimtTileIterator &operator++() {
-    ++iterator_s_;
-    if (iterator_s_ < FilterShape::kColumn) {
-      iterator_offset_ += inc_next_s_;
-
-      return *this;
-    }
-
-    iterator_s_ = 0;
-
-    ++iterator_r_;
-    if (iterator_r_ < FilterShape::kRow) {
-      iterator_offset_ += inc_next_r_;
-      return *this;
-    }
-
-    iterator_r_ = 0;
-    iterator_offset_ = 0;
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  DepthwiseDirect2dConvSimtTileIterator & operator--() {
-    // Do nothing
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator. (vector loads)
-  CUTLASS_HOST_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
-
-    Array<Element, Policy::LaneMmaShape::kN> *dst_ptr = 
-      reinterpret_cast<Array<Element, Policy::LaneMmaShape::kN> *>(&frag);
-
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int p = 0; p < ThreadOutputShape::kH; ++p) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int q = 0; q < ThreadOutputShape::kW; ++q) {
-        CUTLASS_PRAGMA_UNROLL
-        for (int n = 0; n < Iterations::kColumn; ++n) {
-          void const *ptr = ref_.data() +
-                            ref_.offset({activation_offset[p][q][n] + (iterator_offset_),
-                                         n * Policy::WarpShape::kColumn}) +
-                            pointer_offset / Policy::LaneMmaShape::kN;
-          arch::shared_load(dst_ptr[n + q + p * ThreadOutputShape::kW], ptr);
-        }
-      }
-    }
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-    load_with_pointer_offset(frag, 0);
-  }
-  
-  /// Stores a fragment to memory at the location pointed to by the iterator
-  CUTLASS_HOST_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const {
-    // Do nothing at present.
-  }
-
-  /// Stores a fragment to memory at the location pointed to by the iterator
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag, Index pointer_offset) const {
-    store_with_pointer_offset(frag, 0);
-  }
-
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    // no operation here
-  }
-};
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/// Specialization for A operands of row-major layouts
-///
-/// Concept: MutableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Size of filter (concept: gemm::GemmShape<Depth, Height, Width>)
-    typename FilterShape_,
-    /// Size of the matrix to load (concept: TensorNHWC)
-    typename ThreadOutputShape_,
-    /// Size of the matrix to load (concept: TensorNHWC)
-    typename ThreadBlockOutputShape_,
-    /// Data type of A elements
-    typename Element_,
-    /// Shape of the warp in units of thread (concept: MmaSimtPolicy)
-    typename Policy_,
-    /// Stride ( MatrixShape<Height, Width> )
-    typename StrideShape_,
-    /// Dilation ( MatrixShape<Height, Width> )
-    typename DilationShape_,
-    /// Activation Shape loaded by threadblock
-    typename ActivationShape_,
-    /// Number of partitions along K dimension - used in sliced-K
-    int PartitionsK,
-    /// Group Size along kPartition - used in sliced-K
-    int PartitionGroupSize>
-class DepthwiseDirect2dConvSimtTileIterator<Shape_,
-                                            FilterShape_,
-                                            ThreadOutputShape_,
-                                            ThreadBlockOutputShape_,
-                                            cutlass::gemm::Operand::kA,
-                                            Element_,
-                                            Policy_,
-                                            IteratorAlgorithm::kFixedStrideDilation,
-                                            StrideShape_,
-                                            DilationShape_,
-                                            ActivationShape_,
-                                            PartitionsK,
-                                            PartitionGroupSize> {
- public:
-  /// Shape of tile to load (concept: MatrixShape)
-  using Shape = Shape_;
-
-  /// Shape of filter (concept: gemm::GemmShape<Depth, Height, Width>)
-  using FilterShape = FilterShape_;
-
-  /// Shape of tile to load (concept: TensorNHWC)
-  using ThreadOutputShape = ThreadOutputShape_;
-
-  /// Shape of tile to load (concept: TensorNHWC)
-  using ThreadBlockOutputShape = ThreadBlockOutputShape_;
-
-  /// Stride ( MatrixShape<Height, Width> )
-  using StrideShape = StrideShape_;
-
-  /// Dilation ( MatrixShape<Height, Width> )
-  using DilationShape = DilationShape_;
-
-  /// Activation Shape loaded by threadblock
-  using ActivationShape = ActivationShape_;
-
-  /// Operand tag
-  static cutlass::gemm::Operand const kOperand = cutlass::gemm::Operand::kA;
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of policy
-  using Layout = layout::RowMajor;
-
-  /// Decomposition of elements among threads
-  using Policy = Policy_;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  //
-  // Derived quantities
-  //
-
-  static_assert(!(Shape::kRow % Policy::WarpShape::kRow),
-                "The warp-level GEMM M size must be divisible by the number of threads arranged "
-                "along the M dimension.");
-
-  static_assert(Shape::kRow > 0, "Shape::kRow must be greater than zero.");
-  static_assert(Shape::kColumn > 0, "Shape::kColumn must be greater than zero.");
-  static_assert(Policy::WarpShape::kRow > 0, "Policy::WarpShape::kRow must be greater than zero.");
-  static_assert(Shape::kRow / Policy::WarpShape::kRow > 0,
-                "Shape::kRow / Policy::WarpShape::kRow must be greater than zero.");
-
-  // Activations loaded by threadblock
-  static int const ThreadActivationShapeH = (ThreadOutputShape::kH - 1) * StrideShape::kRow +
-                                            (FilterShape::kRow - 1) * DilationShape::kRow + 1;
-
-  static int const ThreadActivationShapeW = (ThreadOutputShape::kW - 1) * StrideShape::kColumn +
-                                            (FilterShape::kColumn - 1) * DilationShape::kColumn + 1;
-
-  using ThreadActivationShape = cutlass::conv::
-      TensorNHWCShape<1, ThreadActivationShapeH, ThreadActivationShapeW, ThreadOutputShape::kC>;
-
-  // Thread-level shape of a fragment
-  using ThreadShape =
-      MatrixShape<ThreadOutputShape::kNHW,
-                  ThreadOutputShape::kC>;
-
-  static_assert(!(ThreadShape::kColumn % Policy::LaneMmaShape::kN),
-                "Thread-level GEMM must be divisible by Policy::LaneMmaShape.");
-
-  /// Number of individual loads
-  using Iterations =
-      MatrixShape<ThreadShape::kRow, ThreadShape::kColumn / Policy::LaneMmaShape::kN>;
-
-  using ThreadTileCount = MatrixShape<ThreadBlockOutputShape::kH / ThreadOutputShape::kH,
-                                      ThreadBlockOutputShape::kW / ThreadOutputShape::kW>;
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = Array<Element, ThreadShape::kCount>;
-
- protected:
-  /// Internal reference
-  cutlass::TensorRef<Array<Element, Policy::LaneMmaShape::kN>, layout::RowMajor> ref_;
-
-  Array<Element, Policy::LaneMmaShape::kN>
-      activation[ThreadActivationShape::kH][ThreadActivationShape::kW][Iterations::kColumn];
-  int iterator_r_;
-  int iterator_s_;
-
-
-  MatrixCoord lane_offset_;
-
- public:
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  DepthwiseDirect2dConvSimtTileIterator() {}
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  DepthwiseDirect2dConvSimtTileIterator(TensorRef ref, int lane_id) {
-    // compute offset based on thread ID and lane layout
-    typename Policy::LaneLayout lane_layout = Policy::get_lane_layout();
-
-    // Set channel offset
-    lane_offset_ = lane_layout.inverse(lane_id) * MatrixCoord(0, Policy::LaneMmaShape::kN);
-
-    ref.add_coord_offset(lane_offset_);
-
-    ref_.reset(reinterpret_cast<Array<Element, Policy::LaneMmaShape::kN> *>(ref.data()),
-               ref.stride(0) / Policy::LaneMmaShape::kN);
-
-    iterator_r_ = 0;
-    iterator_s_ = 0;
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  DepthwiseDirect2dConvSimtTileIterator &add_pointer_offset(LongIndex offset) {
-    ref_.add_pointer_offset(offset);
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  template <typename Params>
-  CUTLASS_HOST_DEVICE void setup_initial_status(
-      Params const &params) {
-
-    // Get base HW offset of current threads
-    int threadgroup = threadIdx.x / (ThreadBlockOutputShape::kC / ThreadOutputShape::kC);
-    int base_h =
-        (threadgroup / (ThreadTileCount::kColumn)) * ThreadOutputShape::kH * StrideShape::kRow;
-    int base_w =
-        (threadgroup % (ThreadTileCount::kColumn)) * ThreadOutputShape::kW * StrideShape::kColumn;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int h = 0; h < ThreadActivationShape::kH; ++h) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int w = 0; w < ThreadActivationShape::kW; ++w) {
-        CUTLASS_PRAGMA_UNROLL
-        for (int col = 0; col < Iterations::kColumn; ++col) {
-          int offset = (base_h + h) * ActivationShape::kW + (base_w + w);
-
-          void const *ptr = ref_.data() + ref_.offset({offset, col * Policy::WarpShape::kColumn});
-          arch::shared_load(activation[h][w][col], ptr);
-        }
-      }
-    }
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  DepthwiseDirect2dConvSimtTileIterator &add_tile_offset(TensorCoord const &coord) {
-    // Set warp row and col start
-    lane_offset_ =
-        MatrixCoord({lane_offset_.row() + coord.row() * Shape::kRow, lane_offset_.column()});
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  void advance(int32_t pointer_offset) {
-    ref_.reset(ref_.data() + pointer_offset / sizeof(Element) / Policy::LaneMmaShape::kN);
-    iterator_s_ = 0;
-    iterator_r_ = 0;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  DepthwiseDirect2dConvSimtTileIterator &operator++() {
-    ++iterator_s_;
-    if (iterator_s_ < FilterShape::kColumn) {
-      return *this;
-    }
-
-    iterator_s_ = 0;
-
-    ++iterator_r_;
-    if (iterator_r_ < FilterShape::kRow) {
-      return *this;
-    }
-
-    iterator_r_ = 0;
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  DepthwiseDirect2dConvSimtTileIterator &operator--() {
-    // Do nothing
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator. (vector loads)
-  CUTLASS_HOST_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
-    Array<Element, Policy::LaneMmaShape::kN> *dst_ptr =
-        reinterpret_cast<Array<Element, Policy::LaneMmaShape::kN> *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int p = 0; p < ThreadOutputShape::kH; ++p) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int q = 0; q < ThreadOutputShape::kW; ++q) {
-        CUTLASS_PRAGMA_UNROLL
-        for (int n = 0; n < Iterations::kColumn; ++n) {
-          const int h = p * StrideShape::kRow + iterator_r_ * DilationShape::kRow;
-          const int w = q * StrideShape::kColumn + iterator_s_ * DilationShape::kColumn;
-
-          dst_ptr[n + q + p * ThreadOutputShape::kW] = activation[h][w][n];
-        }
-      }
-    }
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const { load_with_pointer_offset(frag, 0); }
-
-  /// Stores a fragment to memory at the location pointed to by the iterator
-  CUTLASS_HOST_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const {
-    // Do nothing at present.
-  }
-
-  /// Stores a fragment to memory at the location pointed to by the iterator
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag, Index pointer_offset) const {
-    store_with_pointer_offset(frag, 0);
-  }
-
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    // no operation here
-  }
-};
-
-} // namespace warp
-} // namespace conv
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/conv/warp/scale_bias_relu_transform.h b/lightllm-kernel/cutlass/include/cutlass/conv/warp/scale_bias_relu_transform.h
deleted file mode 100755
index 4da31ab81..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/conv/warp/scale_bias_relu_transform.h
+++ /dev/null
@@ -1,221 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing warp-level per channel scale+bias+relu before
-   matrix multiply-accumulate operations targeting Tensor Cores.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/platform/platform.h"
-
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-
-#include "cutlass/arch/memory_sm75.h"
-#include "cutlass/arch/mma_sm75.h" 
-#include "cutlass/arch/mma_sm80.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/warp/mma.h"
-
-#include "cutlass/gemm/warp/mma_tensor_op_policy.h"
-
-#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h"
-#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace conv {
-namespace warp {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename FragmentActivations, typename FragmentScaleBias>
-struct FpropScaleBiasReluTransform {
-
-  using T = typename FragmentActivations::Element;
-
-  static int const NumActivations = FragmentActivations::kElements;
-  static int const NumScaleBias = FragmentScaleBias::kElements;
-  static int const MmaElements = 2;
-  // One element has one scale and one bias
-  static int const MmaScaleBiasPair = 2;
-  // 16816 has 2 columns
-  static int const MmaCols = 2;
-
-  using MmaOperand = Array<T, MmaElements>;
-  using ScaleBiasOperand = Array<T, MmaElements * MmaScaleBiasPair>;
-
-  CUTLASS_DEVICE
-  void transform(MmaOperand &activations, ScaleBiasOperand const &scale_bias) {
-
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
-    uint32_t *ptr_activations = reinterpret_cast<uint32_t *>(&activations);
-    uint32_t const *ptr_scale_bias = reinterpret_cast<uint32_t const *>(&scale_bias);
-
-    // Apply per channel scale+bias+relu if the data is not a special NaN
-    // (0x7eff).  If it is a special NaN (0x7eff), hard code the output to 0.
-
-    // We assumes the pair of FP16 are either both inbound or both out-of-bound.
-    // It requires C to be an even number.
-    asm volatile(
-        "{\n\t"
-        " .reg .pred %%p;\n\t"
-        " .reg .b32 t1;\n\t"
-        " setp.eq.u32 %%p, %2, %4;\n\t"
-        " fma.rn.f16x2.relu t1, %1, %2, %3;\n"
-        " selp.u32 %0, 0, t1, %%p;\n\t"
-        "}\n"
-        : "=r"(ptr_activations[0])
-        : "r"(ptr_scale_bias[0]), "r"(ptr_activations[0]),
-          "r"(ptr_scale_bias[1]), "n"(cutlass::arch::OOB_NAN_F16x2));
-#else
-    assert(0);
-#endif
-  }
-
-  CUTLASS_DEVICE
-  void operator()(FragmentActivations &activations,
-                  FragmentScaleBias const &scale_bias) {
-    MmaOperand *ptr_activations = reinterpret_cast<MmaOperand *>(&activations);
-    ScaleBiasOperand const *ptr_scale_bias =
-        reinterpret_cast<ScaleBiasOperand const *>(&scale_bias);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < (NumActivations / MmaElements); ++i) {
-      transform(ptr_activations[i], ptr_scale_bias[(i / MmaScaleBiasPair) % MmaCols]);
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename FragmentActivations, typename FragmentScaleBias>
-struct WgradScaleBiasReluTransform {
-
-  using T = typename FragmentActivations::Element;
-
-  static int const NumActivations = FragmentActivations::kElements;
-  static int const NumScaleBias = FragmentScaleBias::kElements;
-  static int const MmaElements = 2;
-  // One element has one scale and one bias
-  static int const MmaScaleBiasPair = 2;
-  // 16816 has 2 rows
-  static int const MmaRows = 2;
-
-  using MmaOperand = Array<T, MmaElements>;
-  using ScaleBiasOperand = Array<__half2, MmaScaleBiasPair>;
-
-  CUTLASS_DEVICE
-  void transform(MmaOperand &activations, ScaleBiasOperand const &scale_bias) {
-
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
-
-    __half2 *ptr_activations = reinterpret_cast<__half2 *>(&activations);
-    uint32_t const *ptr_scale_bias = reinterpret_cast<uint32_t const *>(&scale_bias);
-
-#if 1 
-    // CUDA + PTX version
-
-    bool h1_oob = (reinterpret_cast<uint16_t &>(ptr_activations[0].x) == cutlass::arch::OOB_NAN_F16);
-    bool h2_oob = (reinterpret_cast<uint16_t &>(ptr_activations[0].y) == cutlass::arch::OOB_NAN_F16);
-
-    // Apply per channel scale+bias+relu if the data is not a special NaN
-    // (0x7eff).  If it is a special NaN (0x7eff), hard code the output to 0.
-
-    // We cannot gurantee that the pair of F16 are both in bound or both 
-    // out-of-bound because C x R x S can be an odd number.
-    asm volatile(
-        "{\n\t"
-        " fma.rn.f16x2.relu %0, %1, %2, %3;\n"
-        "}"
-        : "=r"(reinterpret_cast<uint32_t &>(ptr_activations[0]))
-        : "r"(ptr_scale_bias[0]), "r"(reinterpret_cast<uint32_t &>(ptr_activations[0])),
-          "r"(ptr_scale_bias[1]));
-
-    reinterpret_cast<uint32_t &>(ptr_activations[0]) = h1_oob ?
-            (reinterpret_cast<uint32_t &>(ptr_activations[0]) & 0xffff0000) :
-            reinterpret_cast<uint32_t &>(ptr_activations[0]);
-
-    reinterpret_cast<uint32_t &>(ptr_activations[0]) = h2_oob ?
-            (reinterpret_cast<uint32_t &>(ptr_activations[0]) & 0xffff) :
-            reinterpret_cast<uint32_t &>(ptr_activations[0]);
-#else
-    // pure PTX version
-
-    // Apply per channel scale+bias+relu if the data is not a special NaN
-    // (0x7eff).  If it is a special NaN (0x7eff), hard code the output to 0.
-    asm volatile(
-        "{\n"
-        " .reg .b16 t1, t2;\n"
-        " .reg .b32 t3, t4, t5, t6;\n"
-        " .reg .pred p1, p2;\n"
-        " mov.b32 {t1, t2}, %2;\n"
-        " setp.eq.s16 p1, t1, %4;\n"
-        " setp.eq.s16 p2, t2, %4;\n"
-        " fma.rn.f16x2.relu t3, %1, %2, %3;\n"
-        " and.b32 t4, t3, %5;\n"
-        " selp.b32 t5, t4, t3, p1;\n"
-        " and.b32 t6, t5, %6;\n"
-        " selp.b32 %0, t6, t5, p2;\n"
-        "}\n"
-        : "=r"(reinterpret_cast<uint32_t &>(ptr_activations[0]))
-        : "r"(ptr_scale_bias[0]), "r"(reinterpret_cast<uint32_t &>(ptr_activations[0])),
-          "r"(ptr_scale_bias[1]), "n"(cutlass::arch::OOB_NAN_F16), "n"(0xffff0000), "n"(0x0000ffff));
-#endif
-#else
-    assert(0);
-#endif
-  }
-
-  CUTLASS_DEVICE
-  void operator()(FragmentActivations &activations,
-                  FragmentScaleBias const &scale_bias) {
-    MmaOperand *ptr_activations = reinterpret_cast<MmaOperand *>(&activations);
-    ScaleBiasOperand const *ptr_scale_bias =
-        reinterpret_cast<ScaleBiasOperand const *>(&scale_bias);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < (NumActivations / MmaElements); ++i) {
-      transform(ptr_activations[i], ptr_scale_bias[(i / MmaRows)]);
-    }
-  }
-};
-} // namespace warp
-} // namespace conv 
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/coord.h b/lightllm-kernel/cutlass/include/cutlass/coord.h
deleted file mode 100755
index d778046c2..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/coord.h
+++ /dev/null
@@ -1,480 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief A Coord is a coordinate of arbitrary rank into a tensor or matrix
-*/
-
-#pragma once
-
-#if defined(__CUDACC_RTC__)
-#include <cuda/std/cstdint>
-#else
-#include <stdint.h>
-#endif
-
-#include "cutlass/cutlass.h"
-
-namespace cutlass {
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Statically-sized array specifying Coords within a tensor
-template <
-  int Rank_,                          ///< Logical rank of coordinate
-  typename Index_ = int,              ///< Index type used for each dimension
-  typename LongIndex_ = int64_t       ///< Long index type used for linear offsets
->
-struct Coord {
-
-public:
-
-  //
-  // Type and constant definitions
-  //
-
-  /// Number of elements in Coord
-  static int const kRank = Rank_;
-
-  /// Index type used to store elements
-  using Index = Index_;
-
-  /// Type used to represent linear offsets
-  using LongIndex = LongIndex_;
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Indices
-  Index idx[kRank];
-
-public:
-
-  //
-  // Methods
-  //
-
-  /// Default ctor initializes uniformly
-  CUTLASS_HOST_DEVICE
-  explicit Coord(Index value = Index(0)) {
-    for (int i = 0; i < kRank; ++i) {
-      idx[i] = value;
-    }
-  }
-
-  /// Constructs from an array of integers
-  CUTLASS_HOST_DEVICE
-  Coord(Index const (&_idx)[kRank]) {
-    for (int i = 0; i < kRank; ++i) {
-      idx[i] = _idx[i];
-    }
-  }
-
-  /// Constructs from some other Coord
-  template <int R, typename I, typename L>
-  CUTLASS_HOST_DEVICE
-  Coord(Coord<R, I, L> other) {
-    for (int i = 0; i < kRank; ++i) {
-      idx[i] = other[i];
-    }
-  }
-
-  /// Returns a slice of the Coord which may be larger or smaller in rank
-  /// than this.
-  template <int Slice>
-  CUTLASS_HOST_DEVICE
-  Coord<Slice, Index, LongIndex> slice(int start = 0, Index identity = 0) const {
-    Coord<Slice, Index, LongIndex> result;
-    for (int i = 0; i < Slice; ++i) {
-      if (i + start < kRank) {
-        result[i] = idx[i + start];
-      }
-      else {
-        result[i] = identity;
-      }
-    }
-    return result;
-  }
-
-  /// Returns the index of the dimension with least value
-  CUTLASS_HOST_DEVICE
-  int min_dim_index() const {
-    int i = 0;
-    for (int j = 1; j < kRank; ++j) {
-      if (idx[j] < idx[i]) {
-        i = j;
-      }
-    }
-    return i;
-  }
-
-  /// Returns the index of the dimension with greatest value
-  CUTLASS_HOST_DEVICE
-  int max_dim_index() const {
-    int i = 0;
-    for (int j = 1; j < kRank; ++j) {
-      if (idx[j] > idx[i]) {
-        i = j;
-      }
-    }
-    return i;
-  }
-
-  /// Returns true if Coord is non-zero.
-  CUTLASS_HOST_DEVICE
-  explicit operator bool() const {
-    for (int i = 0; i < kRank; ++i) {
-      if (idx[i]) {
-        return true;
-      }
-    }
-    return false;
-  }
-
-  /// Returns true if Coord is uniformly zero.
-  CUTLASS_HOST_DEVICE
-  bool operator!() const {
-    for (int i = 0; i < kRank; ++i) {
-      if (idx[i]) {
-        return false;
-      }
-    }
-    return true;
-  }
-
-  /// Element-wise addition
-  CUTLASS_HOST_DEVICE
-  Coord operator+(Coord const& b) const {
-    Coord c;
-    for (int i = 0; i < kRank; ++i) {
-      c.idx[i] = idx[i] + b.idx[i];
-    }
-    return c;
-  }
-
-  /// Element-wise subtraction
-  CUTLASS_HOST_DEVICE
-  Coord operator-(Coord const& b) const {
-    Coord c;
-    for (int i = 0; i < kRank; ++i) {
-      c.idx[i] = idx[i] - b.idx[i];
-    }
-    return c;
-  }
-
-  /// Element-wise multiplication
-  CUTLASS_HOST_DEVICE
-  Coord operator*(Coord const& b) const {
-    Coord c;
-    for (int i = 0; i < kRank; ++i) {
-      c.idx[i] = idx[i] * b.idx[i];
-    }
-    return c;
-  }
-
-  /// Element-wise division
-  CUTLASS_HOST_DEVICE
-  Coord operator/(Coord const& b) const {
-    Coord c;
-    for (int i = 0; i < kRank; ++i) {
-      c.idx[i] = idx[i] / b.idx[i];
-    }
-    return c;
-  }
-
-  /// In-place addition
-  CUTLASS_HOST_DEVICE
-  Coord& operator+=(Coord const& b) {
-    for (int i = 0; i < kRank; ++i) {
-      idx[i] += b.idx[i];
-    }
-    return *this;
-  }
-
-  /// In-place subtraction
-  CUTLASS_HOST_DEVICE
-  Coord& operator-=(Coord const& b) {
-    for (int i = 0; i < kRank; ++i) {
-      idx[i] -= b.idx[i];
-    }
-    return *this;
-  }
-
-  /// In-place multiplication
-  CUTLASS_HOST_DEVICE
-  Coord& operator*=(Coord const& b) {
-    for (int i = 0; i < kRank; ++i) {
-      idx[i] *= b.idx[i];
-    }
-    return *this;
-  }
-
-  /// In-place division
-  CUTLASS_HOST_DEVICE
-  Coord& operator/=(Coord const& b) {
-    for (int i = 0; i < kRank; ++i) {
-      idx[i] /= b.idx[i];
-    }
-    return *this;
-  }
-
-  /// Member access operator
-  CUTLASS_HOST_DEVICE Index& operator[](int dim) { return idx[dim]; }
-
-  /// Member access operator
-  CUTLASS_HOST_DEVICE Index const& operator[](int dim) const { return idx[dim]; }
-
-  /// Computes the dot product with anotherCoord object
-  CUTLASS_HOST_DEVICE
-  LongIndex dot(Coord const& b, LongIndex sum = LongIndex(0)) const {
-    for (int i = 0; i < kRank; ++i) {
-      sum += idx[i] * b.idx[i];
-    }
-    return sum;
-  }
-
-  /// Gets the index of a given Coord element
-  template <int Dim>
-  CUTLASS_HOST_DEVICE Index& at() {
-    return idx[Dim];
-  }
-
-  /// Access via index; may limit unrolling potential
-  CUTLASS_HOST_DEVICE
-  Index& at(int dim) { return idx[dim]; }
-
-  /// Gets the index of a given Coord element
-  template <int Dim>
-  CUTLASS_HOST_DEVICE Index const& at() const {
-    return idx[Dim];
-  }
-
-  /// Access via index; may limit unrolling potential
-  CUTLASS_HOST_DEVICE
-  Index const& at(int dim) const { return idx[dim]; }
-
-  /// Determines if two Coord<> objects are equal
-  CUTLASS_HOST_DEVICE
-  bool operator==(Coord const& b) const {
-    bool equal = true;
-    for (int i = 0; equal && i < kRank; ++i) {
-      equal = (idx[i] == b.idx[i]);
-    }
-    return equal;
-  }
-
-  /// Not equal
-  CUTLASS_HOST_DEVICE
-  bool operator!=(Coord const& b) const { return !(*this == b); }
-
-  /// Clamps a coordinate to a range specified by maximum and minimum values
-  CUTLASS_HOST_DEVICE
-  Coord& clamp(Coord const& max, Coord const& min = Coord()) {
-    for (int i = 0; i < kRank; ++i) {
-      idx[i] = __NV_STD_MAX(__NV_STD_MIN(idx[i], max.idx[i]), min.idx[i]);
-    }
-    return *this;
-  }
-
-  /// Returns the sum of all elements
-  CUTLASS_HOST_DEVICE
-  Index sum() const {
-    Index sum_(idx[0]);
-    for (int i = 1; i < kRank; ++i) {
-      sum_ += idx[i];
-    }
-    return sum_;
-  }
-
-  /// Returns the product of all elements
-  CUTLASS_HOST_DEVICE
-  LongIndex product() const {
-    LongIndex product_(idx[0]);
-    for (int i = 1; i < kRank; ++i) {
-      product_ *= idx[i];
-    }
-    return product_;
-  }
-
-  /// Less than operator
-  CUTLASS_HOST_DEVICE
-  bool operator<(Coord const &b) const {
-    for (int i = 0; i < kRank; ++i) {
-      if (!(idx[i] < b[i])) {
-        return false;
-      }
-    }
-    return true;
-  }
-
-  /// Less than or equals operator
-  CUTLASS_HOST_DEVICE
-  bool operator<=(Coord const &b) const {
-    for (int i = 0; i < kRank; ++i) {
-      if (!(idx[i] <= b[i])) {
-        return false;
-      }
-    }
-    return true;
-  }
-
-  /// Greater than operator
-  CUTLASS_HOST_DEVICE
-  bool operator>(Coord const &b) const {
-    return !(*this <= b);
-  }
-
-  /// Greater than or equals operator
-  CUTLASS_HOST_DEVICE
-  bool operator>=(Coord const &b) const {
-    return !(*this < b);
-  }
-};
-
-} // namespace cutlass 
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-
-
-/// Scalar multiplication
-template <int Rank, typename Index>
-CUTLASS_HOST_DEVICE
-Coord<Rank, Index> operator*(Index s, Coord<Rank, Index> coord) {
-  CUTLASS_PRAGMA_UNROLL
-  for (int i = 0; i < Rank; ++i) {
-    coord[i] *= s;
-  }
-  return coord;
-}
-
-/// Scalar multiplication
-template <int Rank, typename Index>
-CUTLASS_HOST_DEVICE
-Coord<Rank, Index> operator*(Coord<Rank, Index> coord, Index s) {
-  CUTLASS_PRAGMA_UNROLL
-  for (int i = 0; i < Rank; ++i) {
-    coord[i] *= s;
-  }
-  return coord;
-}
-
-/// Scalar division
-template <int Rank, typename Index>
-CUTLASS_HOST_DEVICE
-Coord<Rank, Index> operator/(Index s, Coord<Rank, Index> coord) {
-  CUTLASS_PRAGMA_UNROLL
-  for (int i = 0; i < Rank; ++i) {
-    coord[i] = s / coord[i];
-  }
-  return coord;
-}
-
-/// Scalar division
-template <int Rank, typename Index>
-CUTLASS_HOST_DEVICE
-Coord<Rank, Index> operator/(Coord<Rank, Index> coord, Index s) {
-  CUTLASS_PRAGMA_UNROLL
-  for (int i = 0; i < Rank; ++i) {
-    coord[i] /= s;
-  }
-  return coord;
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Integer-valued make_Coord
-//
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Helper to make a 2-element coordinate
-template <typename T> 
-CUTLASS_HOST_DEVICE
-Coord<1, T> make_Coord(T _0) {
-  T values[1] = {_0};
-  return Coord<1, T>(values);
-}
-
-/// Helper to make a 2-element coordinate
-template <typename T> 
-CUTLASS_HOST_DEVICE
-Coord<2, T> make_Coord(T _0, T _1) {
-  T values[2] = {_0, _1};
-  return Coord<2, T>(values);
-}
-
-/// Helper to make a 3-element coordinate
-template <typename T> 
-CUTLASS_HOST_DEVICE
-Coord<3, T> make_Coord(T _0, T _1, T _2) {
-  T values[3] = {_0, _1, _2};
-  return Coord<3, T>(values);
-}
-
-/// Helper to make a 4-element coordinate
-template <typename T> 
-CUTLASS_HOST_DEVICE
-Coord<4, T> make_Coord(T _0, T _1, T _2, T _3) {
-  T values[4] = {_0, _1, _2, _3};
-  return Coord<4, T>(values);
-}
-
-/// Helper to make a 5-element coordinate
-template <typename T> 
-CUTLASS_HOST_DEVICE
-Coord<5, T> make_Coord(T _0, T _1, T _2, T _3, T _4) {
-  T values[5] = {_0, _1, _2, _3, _4};
-  return Coord<5, T>(values);
-}
-
-/// Helper to make a 1-element coordinate
-template <int N, typename T> 
-CUTLASS_HOST_DEVICE
-Coord<N, T>make_Coord_with_padding(T _0) {
-  Coord<N, T> coord;
-
-  CUTLASS_PRAGMA_UNROLL
-  for (int i = N - 1; i > 0; --i) {
-    coord[i] = 0;
-  }
-
-  coord[0] = _0;
-
-  return coord;
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace cutlass
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/core_io.h b/lightllm-kernel/cutlass/include/cutlass/core_io.h
deleted file mode 100755
index 40ae22246..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/core_io.h
+++ /dev/null
@@ -1,286 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Helpers for printing cutlass/core objects
-*/
-#pragma once
-
-#include <iostream>
-#include <typeinfo>
-
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix.h"
-#include "cutlass/quaternion.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/gemm/gemm_enumerated_types.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-#include "cutlass/conv/conv3d_problem_size.h"
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Output operator for CUDA built-in dim3 type
-inline std::ostream &operator<<(std::ostream &out, dim3 d) {
-  return out << d.x << ", " << d.y << ", " << d.z;
-}
-
-/// Output operator for CUDA built-in error type
-inline std::ostream &operator<<(std::ostream &out, cudaError_t error) {
-  return out << cudaGetErrorString(error);
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-//                    stream operators for cutlass namespace                                     //
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename Element, int Rank>
-inline
-std::ostream& operator<<(std::ostream& out, Array<Element, Rank> const& v) {
-  for (int i = 0; i < Rank; ++i) {
-    out << (i ? ", " : "") << v[i];
-  }
-  return out;
-}
-
-template <int Rank>
-inline
-std::ostream& operator<<(std::ostream& out, Coord<Rank> const& coord) {
-  for (int i = 0; i < Rank; ++i) {
-    out << (i ? ", " : "") << coord[i];
-  }
-  return out;
-}
-
-inline
-std::istream & operator>>(std::istream &stream, half_t &x) {
-  float tmp;
-  stream >> tmp;
-  x = static_cast<cutlass::half_t>(tmp);
-  return stream;
-}
-
-inline
-std::ostream & operator<<(std::ostream &out, half_t const &x) {
-  return out << float(x);
-}
-
-inline
-std::ostream & operator<<(std::ostream &out, bfloat16_t const &x) {
-  return out << float(x);
-}
-
-inline
-std::ostream & operator<<(std::ostream &out, tfloat32_t const &x) {
-  return out << float(x);
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Helper to enable formatted printing of CUTLASS scalar types to an ostream
-template <typename T>
-struct ScalarIO {
-
-  /// Value to print
-  T value;
-
-  /// Default ctor
-  ScalarIO() { }
-
-  /// Constructs from a value
-  ScalarIO(T value): value(value) {}
-};
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Default printing to ostream
-template <typename T>
-inline std::ostream &operator<<(std::ostream &out, ScalarIO<T> const &scalar) {
-  return out << scalar.value;
-}
-
-/// Printing to ostream of int8_t as integer rather than character
-template <>
-inline std::ostream &operator<<(std::ostream &out, ScalarIO<int8_t> const &scalar) {
-  return out << int(scalar.value);
-}
-
-/// Printing to ostream of uint8_t as integer rather than character
-template <>
-inline std::ostream &operator<<(std::ostream &out, ScalarIO<uint8_t> const &scalar) {
-  return out << unsigned(scalar.value);
-}
-
-
-/// Default printing to ostream for MatrixShape
-template <int Row, int Column>
-inline
-std::ostream & operator<<(std::ostream &out, MatrixShape<Row, Column> const &matrix_shape) {
-  out << "cutlass::MatrixShape::(kRow, kColumn) {"
-    << cutlass::MatrixShape<Row,Column>::kRow <<","
-    << cutlass::MatrixShape<Row,Column>::kColumn <<"}";
-  return out;
-}
-
-
-/// Prints matrix to ostream
-template <typename Element, int Rows, int Columns>
-std::ostream & operator<<(std::ostream &out, Matrix<Element, Rows, Columns> const &rhs) {
-
-  for (int i = 0; i < Rows; ++i) {
-    for (int j = 0; j < Columns; ++j) {
-      ScalarIO<Element> element(rhs.at(i, j));
-      out << (j ? ", " : "") << element;
-    }
-    out << "\\n";
-  }
-
-  return out;
-}
-
-template <typename T>
-std::ostream &operator<<(std::ostream &out, Quaternion<T> const &rhs) {
-
-  out << ScalarIO<T>(rhs.w()) << " ";
-  if (rhs.x() >= 0) {
-    out << "+";
-  }
-
-  out << ScalarIO<T>(rhs.x()) << "*i ";
-  if (rhs.y() >= 0) {
-    out << "+";
-  }
-
-  out << ScalarIO<T>(rhs.y()) << "*j ";
-  if (rhs.z() >= 0) {
-    out << "+";
-  }
-
-  out << ScalarIO<T>(rhs.z()) << "*k";
-
-  return out;
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-//                         stream operators for cutlass::gemm namespace                          //
-///////////////////////////////////////////////////////////////////////////////////////////////////
-namespace gemm {
-
-/// Default printing to ostream for GemmShape
-template <int M, int N, int K>
-inline
-std::ostream & operator<<(std::ostream &out, GemmShape<M,N,K> const &gemm_shape) {
-  out << "cutlass::gemm::GemmShape::(kM, kN, kK) {"
-    << cutlass::gemm::GemmShape<M,N,K>::kM <<","
-    << cutlass::gemm::GemmShape<M,N,K>::kN <<","
-    << cutlass::gemm::GemmShape<M,N,K>::kK << "}";
-  return out;
-}
-
-/// Default printing to ostream for GemmCoord
-inline
-std::ostream & operator<<(std::ostream &out, GemmCoord const &gemm_coord) {
-  out << "cutlass::gemm::GemmCoord {"
-    << gemm_coord.m() <<","
-    << gemm_coord.n() <<","
-    << gemm_coord.k() << "}";
-  return out;
-}
-
-} //namespace gemm
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-//                       stream operators for cutlass namespace                          //
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Default printing to ostream for PitchLinearShape
-template < int Contiguous, int Strided>
-inline
-std::ostream & operator<<(std::ostream &out, PitchLinearShape<Contiguous, Strided> const &pitch_linear_shape) {
-  out << "cutlass::PitchLinearShape:(kContiguous, kStrided) {"
-    << cutlass::layout::PitchLinearShape<Contiguous,Strided>::kContiguous <<","
-    << cutlass::layout::PitchLinearShape<Contiguous,Strided>::kStrided <<"}";
-  return out;
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-//                         stream operators for cutlass::conv namespace                          //
-///////////////////////////////////////////////////////////////////////////////////////////////////
-namespace conv {
-/// Default printing to ostream for Conv2dProblemSize
-inline
-std::ostream& operator<<(std::ostream& out, Conv2dProblemSize const& problem) {
-  out << "NHWC: (" << problem.N << ", " << problem.H << ", " << problem.W << ", " << problem.C << ")" << std::endl
-      << "KRSC: (" << problem.K << ", " << problem.R << ", " << problem.S << ", " << problem.C / problem.groups << ")" << std::endl
-      << "NPQK: (" << problem.N << ", " << problem.P << ", " << problem.Q << ", " << problem.K << ")" << std::endl
-      << "groups: (" << problem.groups << ")" << std::endl
-      << "Pad_h, Pad_w: (" << problem.pad_h << ", " << problem.pad_w << ")" << std::endl
-      << "Stride_h, Stride_w: (" << problem.stride_h << ", " << problem.stride_w << ")" << std::endl
-      << "Dilation_h, Dilation_w: (" << problem.dilation_h << ", " << problem.dilation_w << ")" << std::endl
-      << "split_k_slices: (" << problem.split_k_slices << ")" << std::endl
-      << "mode: (" << ((problem.mode==conv::Mode::kConvolution) ? "conv" : "xcross") << ")";
-
-  return out;
-}
-
-
-/// Default printing to ostream for Conv3dProblemSize
-inline
-std::ostream& operator<<(std::ostream& out, Conv3dProblemSize const& problem) {
-  out << "NDHWC: (" << problem.N << ", " << problem.D << ", " << problem.H << ", " << problem.W << ", " << problem.C << ")" << std::endl
-      << "KTRSC: (" << problem.K << ", " << problem.T << ", " << problem.R << ", " << problem.S << ", " << problem.C << ")" << std::endl
-      << "NZPQK: (" << problem.N << ", " << problem.Z << ", " << problem.P << ", " << problem.Q << ", " << problem.K << ")" << std::endl
-      << "pad_d, pad_h, pad_w: ("  << problem.pad_d << ", " << problem.pad_h << ", " << problem.pad_w << ")" << std::endl
-      << "stride_d, stride_h, stride_w: ("  << problem.stride_d << ", " << problem.stride_h << ", " << problem.stride_w << ")" << std::endl
-      << "dilation_d, dilation_h, dilation_w: ("  << problem.dilation_d << ", " << problem.dilation_h << ", " << problem.dilation_w << ")" << std::endl
-      << "split_k_slices: (" << problem.split_k_slices << ") " << std::endl
-      << "mode: (" << ((problem.mode==conv::Mode::kConvolution) ? "conv" : "xcross") << ")";
-
-  return out;
-}
-
-} // namespace conv
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass
-///////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/cuda_host_adapter.hpp b/lightllm-kernel/cutlass/include/cutlass/cuda_host_adapter.hpp
deleted file mode 100755
index 1c8f56a65..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/cuda_host_adapter.hpp
+++ /dev/null
@@ -1,407 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Interface betweeen a CUTLASS device-wide operator and CUDA.
-*/
-
-#pragma once
-
-#include <cuda_runtime_api.h>
-#include "cutlass/cutlass.h"
-#include "cutlass/trace.h"
-
-#include "cutlass/platform/platform.h"
-#if ! defined(__CUDACC_RTC__)
-#include <cstdio>
-#endif
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// NVRTC doesn't need definitions for these host classes
-
-#if ((__CUDACC_VER_MAJOR__ >= 12) ||                               \
-    ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 8))) \
-    && !defined(__CUDACC_RTC__)
-#define CUDA_HOST_ADAPTER_LAUNCH_ATTRIBUTES_ENABLED
-#endif
-
-#if ((__CUDACC_VER_MAJOR__ >= 12) && !defined(__CUDACC_RTC__))
-#define CUDA_HOST_ADAPTER_TENSORMAP_ENABLED
-#endif
-
-// Include <cuda.h> for CUDA Driver API calls if any of these capabilities are enabled.
-#if defined(CUDA_HOST_ADAPTER_LAUNCH_ATTRIBUTES_ENABLED) ||        \
-    defined(CUDA_HOST_ADAPTER_TENSORMAP_ENABLED)
-
-#include <cuda.h>
-
-#endif // defined(CUDA_HOST_ADAPTER_LAUNCH_ATTRIBUTES_ENABLED) ||
-       // defined(CUDA_HOST_ADAPTER_TENSORMAP_ENABLED)
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-//
-// Macro-level guard for CUDA Host Adapter
-//
-#if !defined(CUTLASS_ENABLE_CUDA_HOST_ADAPTER)
-#define CUTLASS_ENABLE_CUDA_HOST_ADAPTER false
-#endif
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-#if !defined(__CUDACC_RTC__)
-
-#include <cudaTypedefs.h>
-#include <driver_types.h>
-
-#define CUTLASS_CUDA_DRIVER_STRINGIFY(tok) #tok
-
-#if defined(CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL)
-
-#define CUTLASS_CUDA_DRIVER_WRAPPER_DECL(func, ver) \
-  template <typename... Args>                       \
-  CUresult call_##func(Args... args) {              \
-    return func(args...);                           \
-  }
-
-#else // defined(CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL)
-
-#if (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 5)
-
-#define CUTLASS_CUDA_DRIVER_WRAPPER_DECL(func, ver)             \
-  template <typename... Args>                                   \
-  CUresult call_##func(Args... args) {                          \
-    cudaDriverEntryPointQueryResult cuda_status;                \
-    void* pfn = nullptr;                                        \
-    cudaError_t cuda_err = cudaGetDriverEntryPointByVersion(    \
-        CUTLASS_CUDA_DRIVER_STRINGIFY(func),                    \
-        &pfn, ver,                                              \
-        cudaEnableDefault,                                      \
-        &cuda_status);                                          \
-    if (cuda_status != cudaDriverEntryPointSuccess ||           \
-        cuda_err != cudaSuccess) {                              \
-      return CUDA_ERROR_UNKNOWN;                                \
-    }                                                           \
-    return reinterpret_cast<PFN_##func##_v##ver>(pfn)(args...); \
-  }
-
-#else
-
-#define CUTLASS_CUDA_DRIVER_WRAPPER_DECL(func, ver)             \
-  template <typename... Args>                                   \
-  CUresult call_##func(Args... args) {                          \
-    cudaDriverEntryPointQueryResult cuda_status;                \
-    void* pfn = nullptr;                                        \
-    cudaError_t cuda_err = cudaGetDriverEntryPoint(             \
-        CUTLASS_CUDA_DRIVER_STRINGIFY(func),                    \
-        &pfn,                                                   \
-        cudaEnableDefault,                                      \
-        &cuda_status);                                          \
-    if (cuda_status != cudaDriverEntryPointSuccess ||           \
-        cuda_err != cudaSuccess) {                              \
-      return CUDA_ERROR_UNKNOWN;                                \
-    }                                                           \
-    return reinterpret_cast<PFN_##func>(pfn)(args...);          \
-  }
-
-#endif // (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 5)
-
-#endif // defined(CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL)
-
-#if (__CUDACC_VER_MAJOR__ >= 12)
-CUTLASS_CUDA_DRIVER_WRAPPER_DECL(cuTensorMapEncodeTiled, 12000);
-CUTLASS_CUDA_DRIVER_WRAPPER_DECL(cuTensorMapEncodeIm2col, 12000);
-#endif
-
-#undef CUTLASS_CUDA_DRIVER_STRINGIFY
-
-#define CUTLASS_CUDA_DRIVER_WRAPPER_CALL(func) cutlass::call_##func
-
-#endif // !defined(__CUDACC_RTC__)
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// This class manages runtime CUlaunchAttribute that can be supplied to CudaHostAdapter
-/// CudaHostLaunchAttributes will be an empty struct in earlier CTK where CUlaunchAttribute
-/// is not introduced.
-struct CudaHostLaunchAttributes {
-
-#if defined(CUDA_HOST_ADAPTER_LAUNCH_ATTRIBUTES_ENABLED)
-
-  /// Reasonable maximum launch attributes that are commonly applied
-  static constexpr int32_t kMaximumAttributeCount = 5;
-
-  /// Launch attributes
-  CUlaunchAttribute launch_attributes[kMaximumAttributeCount];
-  int32_t      attribute_count = 0;
-
-  CUTLASS_HOST_DEVICE
-  CudaHostLaunchAttributes(CUlaunchAttribute *launch_attributes_ = nullptr,
-                           int32_t attribute_count_ = 0) {
-    CUTLASS_ASSERT(attribute_count_ >= 0 && attribute_count_ < kMaximumAttributeCount);
-    for (int32_t i = 0; i < attribute_count_ && i < kMaximumAttributeCount; ++i) {
-      launch_attributes[i] = launch_attributes_[i];
-    }
-    attribute_count = attribute_count_;
-  }
-
-  CUTLASS_HOST_DEVICE
-  CUlaunchAttribute const* data() const {
-    return launch_attributes;
-  }
-
-  CUTLASS_HOST_DEVICE
-  size_t size() const {
-    return attribute_count;
-  }
-  
-#endif // (CUDA_HOST_ADAPTER_LAUNCH_ATTRIBUTES_ENABLED)
-
-};
-
-
-/// This class defines an object which abstracts interactions between the CUTLASS device-wide GEMM and
-/// CUDA. The intention is to enable CUTLASS to be used with both the CUDA Runtime API and CUDA Driver API.
-struct CudaHostAdapter {
-
-  /// Limit the number of kernels
-  static constexpr int32_t kMaximumKernelCount = 4;
-
-  /// Maximum cluster size
-  static constexpr int MaxClusterSize = 32;
-
-  //
-  // Data members
-  //
-
-  /// Handles
-  void        *kernel_handles[kMaximumKernelCount];
-  int32_t      kernel_count = 0;
-
-  CudaHostLaunchAttributes launch_attributes;
-
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CudaHostAdapter() = default;
-
-  /// Dtor
-  virtual ~CudaHostAdapter() = default;
-
-  /// Copy Ctor
-  CUTLASS_HOST_DEVICE
-  CudaHostAdapter(const CudaHostAdapter & rhs)
-      : kernel_count(rhs.kernel_count),
-        launch_attributes(rhs.launch_attributes) {
-    CUTLASS_ASSERT(rhs.kernel_count >= 0 && rhs.kernel_count < kMaximumKernelCount);
-
-    for (int32_t i = 0; i < rhs.kernel_count && i < kMaximumKernelCount; ++i) {
-      kernel_handles[i] = rhs.kernel_handles[i];
-    }
-  }
-
-  /// Copy Assignment
-  CUTLASS_HOST_DEVICE
-  CudaHostAdapter& operator=(const CudaHostAdapter & rhs) {
-    CUTLASS_ASSERT(rhs.kernel_count >= 0 && rhs.kernel_count < kMaximumKernelCount);
-    for (int32_t i = 0; i < rhs.kernel_count && i < kMaximumKernelCount; ++i) {
-      kernel_handles[i] = rhs.kernel_handles[i];
-    }
-    kernel_count = rhs.kernel_count;
-
-    launch_attributes = rhs.launch_attributes;
-
-    return *this;
-  }
-
-
-  /// Move ctor
-  CUTLASS_HOST_DEVICE
-  CudaHostAdapter(CudaHostAdapter && rhs)
-      : kernel_count(rhs.kernel_count),
-        launch_attributes(std::move(rhs.launch_attributes)) {
-    CUTLASS_ASSERT(rhs.kernel_count >= 0 && rhs.kernel_count < kMaximumKernelCount);
-
-    for (int32_t i = 0; i < rhs.kernel_count && i < kMaximumKernelCount; ++i) {
-      kernel_handles[i] = rhs.kernel_handles[i];
-    }
-  }
-
-  // / Move assignment
-  CUTLASS_HOST_DEVICE 
-  CudaHostAdapter& operator=(CudaHostAdapter && rhs) {
-    CUTLASS_ASSERT(rhs.kernel_count >= 0 && rhs.kernel_count < kMaximumKernelCount);
-    for (int32_t i = 0; i < rhs.kernel_count && i < kMaximumKernelCount; ++i) {
-      kernel_handles[i] = rhs.kernel_handles[i];
-    }
-    kernel_count = rhs.kernel_count;
-    launch_attributes = std::move(rhs.launch_attributes);
-    return *this;
-  }
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  CudaHostAdapter(void **kernel_handles_, 
-                  int32_t kernel_count_,
-                  CudaHostLaunchAttributes const &launch_attributes_ = { })
-      : kernel_count(kernel_count_),
-        launch_attributes(launch_attributes_) {
-    CUTLASS_ASSERT(kernel_count >= 0 && kernel_count < kMaximumKernelCount);
-
-    for (int32_t i = 0; i < kernel_count && i < kMaximumKernelCount; ++i) {
-      kernel_handles[i] = kernel_handles_[i];
-    }
-  }
-
-  /// Returns true if the CudaHostAdapter is empty (kernel_count == 0)
-  CUTLASS_HOST_DEVICE 
-  bool empty() const { return !kernel_count; }
-
-  /// Returns kernel_count
-  CUTLASS_HOST_DEVICE
-  size_t size() const { return static_cast<size_t>(kernel_count); }
-
-  /// Queries the occupancy of a kernel
-  virtual Status query_occupancy(
-    int32_t *device_sms, 
-    int32_t *sm_occupancy,
-    int32_t kernel_index,
-    int32_t thread_count,
-    int32_t smem_size) const = 0;
- 
-  /// Launches a kernel without using Threadblock Clusters. 
-  virtual Status launch(
-    dim3 const grid_dims,
-    dim3 const block_dims,
-    size_t const smem_size,
-    cudaStream_t cuda_stream,
-    void** kernel_params,
-    int32_t kernel_index) const = 0;
-
-  /// Launches a kernel using the CUDA Extensible Launch API and Threadblock Clusters.
-  virtual Status launch(
-    dim3 const grid_dims,
-    dim3 const cluster_dims,
-    dim3 const block_dims,
-    size_t const smem_size,
-    cudaStream_t cuda_stream,
-    void** kernel_params,
-    int32_t kernel_index) const = 0;
-
-#if defined(CUDA_HOST_ADAPTER_TENSORMAP_ENABLED)
-
-  /// Create a tensor map descriptor object representing im2col memory region.
-  virtual CUresult tensorMapEncodeIm2col (
-    CUtensorMap* tensorMap,
-    CUtensorMapDataType tensorDataType,
-    cuuint32_t tensorRank,
-    void* globalAddress,
-    const cuuint64_t* globalDim,
-    const cuuint64_t* globalStrides,
-    const int* pixelBoxLowerCorner,
-    const int* pixelBoxUpperCorner,
-    cuuint32_t channelsPerPixel,
-    cuuint32_t pixelsPerColumn,
-    const cuuint32_t* elementStrides,
-    CUtensorMapInterleave interleave,
-    CUtensorMapSwizzle swizzle,
-    CUtensorMapL2promotion l2Promotion,
-    CUtensorMapFloatOOBfill oobFill) const = 0;
-
-  /// Create a tensor map descriptor object representing tiled memory region.
-  virtual CUresult tensorMapEncodeTiled (
-    CUtensorMap* tensorMap,
-    CUtensorMapDataType tensorDataType,
-    cuuint32_t tensorRank,
-    void* globalAddress,
-    const cuuint64_t* globalDim,
-    const cuuint64_t* globalStrides,
-    const cuuint32_t* boxDim,
-    const cuuint32_t* elementStrides,
-    CUtensorMapInterleave interleave,
-    CUtensorMapSwizzle swizzle,
-    CUtensorMapL2promotion l2Promotion,
-    CUtensorMapFloatOOBfill oobFill) const = 0;
-
-  /// Modify an existing tensor map descriptor with an updated global address.
-  virtual CUresult tensorMapReplaceAddress(
-    CUtensorMap* tensorMap,
-    void* globalAddress)  const = 0;
-
-#endif // defined(CUDA_HOST_ADAPTER_TENSORMAP_ENABLED)
-
-protected:
-
-  /**
-   * Fills a buffer in Global Memory with a byte sequence copied from host memory.
-   * This function can be overriden to dispatch to the appropriate cuMemsetD*Async API
-  */
-  virtual Status memsetDeviceImpl(
-    void* destination, ///< Device memory pointer to be filled
-    void const* fill_value, ///< Value to be filled in the buffer
-    size_t fill_size, ///< Size of the data type to be used for filling the buffer
-    size_t count, ///< Number of elements of size fill_size
-    cudaStream_t stream) const = 0;
-
-public:
-
-  /// Fills a buffer in Global Memory with a byte sequence copied from host memory
-  template<class FillValueType>
-  CUTLASS_HOST_DEVICE
-  Status memsetDevice(
-      void* destination,
-      FillValueType fill_value, 
-      size_t count,
-      cudaStream_t stream) const {
-    return this->memsetDeviceImpl(
-      destination,
-      &fill_value,
-      sizeof(FillValueType),
-      count,
-      stream);
-  }
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/cutlass.h b/lightllm-kernel/cutlass/include/cutlass/cutlass.h
deleted file mode 100755
index e12616a20..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/cutlass.h
+++ /dev/null
@@ -1,160 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Basic include for CUTLASS.
-*/
-
-#pragma once
-
-#include "cutlass/arch/synclog.hpp"
-#include "cutlass/detail/helper_macros.hpp"
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-
-/// Status code returned by CUTLASS operations
-enum class Status {
-  kSuccess,                    ///< Operation was successful.
-  kErrorMisalignedOperand,     ///< operands fail alignment requirements.
-  kErrorInvalidDataType,       ///< DataType fails requirement.
-  kErrorInvalidLayout,         ///< Layout fails alignment requirement.
-  kErrorInvalidProblem,        ///< Specified problem size is not supported by operator.
-  kErrorNotSupported,          ///< Operation is not supported on current device.
-  kErrorWorkspaceNull,         ///< The given workspace is null when it is required to be non-null.
-  kErrorInternal,              ///< An error within CUTLASS occurred.
-  kErrorArchMismatch,          ///< CUTLASS runs on a device that it was not compiled for.
-  kErrorInsufficientDriver,    ///< CUTLASS runs with a driver that is too old.
-  kErrorMemoryAllocation,      ///< Kernel launch failed due to insufficient device memory.
-  kInvalid                     ///< Status is unspecified.
-};
-
-/// Convert cutlass status to status strings
-CUTLASS_HOST_DEVICE
-static char const* cutlassGetStatusString(cutlass::Status status) {
-  switch (status) {
-    case cutlass::Status::kSuccess:
-      return "Success";
-    case cutlass::Status::kErrorMisalignedOperand:
-      return "Error Misaligned Operand";
-    case cutlass::Status::kErrorInvalidDataType:
-      return "Error Invalid Data Type";
-    case cutlass::Status::kErrorInvalidLayout:
-      return "Error Invalid Layout";
-    case cutlass::Status::kErrorInvalidProblem:
-      return "Error Invalid Problem";
-    case cutlass::Status::kErrorNotSupported:
-      return "Error Not Supported";
-    case cutlass::Status::kErrorWorkspaceNull:
-      return "Error Workspace Null";
-    case cutlass::Status::kErrorInternal:
-      return "Error Internal";
-    case cutlass::Status::kErrorInsufficientDriver:
-      return "Error Insufficient Driver";
-    case cutlass::Status::kErrorArchMismatch:
-      return "Error Architecture Mismatch";
-    case cutlass::Status::kErrorMemoryAllocation:
-      return "Error Memory Allocation failed";
-    case cutlass::Status::kInvalid: break;
-  }
-
-  return "Invalid status";
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-static const int NumThreadsPerWarp = 32;
-static const int NumThreadsPerWarpGroup = 128;
-static const int NumWarpsPerWarpGroup = NumThreadsPerWarpGroup / NumThreadsPerWarp;
-static const int NumThreadsPerHalfWarp = NumThreadsPerWarp / 2;
-static const int NumThreadsPerQuad = 4;
-static const int NumThreadsPerQuadPair = NumThreadsPerQuad * 2;
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Helper function to return true when called by thread 0 of threadblock 0.
-CUTLASS_HOST_DEVICE bool thread0() {
-  #if defined(__CUDA_ARCH__)
-    return (!threadIdx.x && !threadIdx.y && !threadIdx.z) && (!blockIdx.x && !blockIdx.y && !blockIdx.z);
-  #else
-    return false;
-  #endif
-}
-
-/// Returns a lane index in the warp. The threads in warp may not be convergent
-CUTLASS_DEVICE
-int canonical_lane_idx() { 
-  #if defined(__CUDA_ARCH__)
-    return threadIdx.x % NumThreadsPerWarp;
-  #else
-    return 0;
-  #endif
-}
-
-/// Returns a warp-uniform value indicating the canonical warp index of the calling threads.
-/// Threads within the warp must be converged.
-CUTLASS_DEVICE
-int canonical_warp_idx_sync() { 
-  #if defined(__CUDA_ARCH__)
-    return __shfl_sync(0xffffffff, threadIdx.x / NumThreadsPerWarp, 0);
-  #else
-    return 0;
-  #endif
-}
-
-/// Returns a warp index in the CTA. The threads in warp may not be convergent
-/// As it doesn't sync the warp, it faster and allows forward progress
-CUTLASS_DEVICE
-int canonical_warp_idx() { 
-  #if defined(__CUDA_ARCH__)
-    return threadIdx.x / NumThreadsPerWarp;
-  #else
-    return 0;
-  #endif
-}
-
-/// Returns a warp-uniform value indicating the canonical warp group index of the calling threads.
-/// Threads within the warp must be converged.
-CUTLASS_DEVICE
-int canonical_warp_group_idx() {
-  #if defined(__CUDA_ARCH__)
-    return __shfl_sync(0xffffffff, threadIdx.x / NumThreadsPerWarpGroup, 0);
-  #else
-    return 0;
-  #endif
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/detail/collective.hpp b/lightllm-kernel/cutlass/include/cutlass/detail/collective.hpp
deleted file mode 100755
index a4b288e7c..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/detail/collective.hpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cute/container/tuple.hpp"
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm::collective {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace detail {
-
-template <size_t I, class Tuple>
-struct deduce_mixed_width_dtype {
-static_assert(I >= 0u && I <= 2u, "Valid indices are 0, 1, and 2, which represent Operand, Scale, and Bias, respectively.");
-
-private:
-  using underlying_tuple = cute::conditional_t<cute::is_tuple<Tuple>::value, Tuple, cute::tuple<Tuple>>;
-  static constexpr size_t valid_index = cute::min(I, cute::tuple_size_v<underlying_tuple> - 1);
-
-public:
-  using type = cute::conditional_t<(I < cute::tuple_size_v<underlying_tuple>), 
-                                    cute::tuple_element_t<valid_index, underlying_tuple>,
-                                    void>;
-};
-
-template <size_t I, class Tuple>
-using deduce_mixed_width_dtype_t = typename deduce_mixed_width_dtype<I, Tuple>::type;
-
-} // namespace detail
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::gemm::collective
diff --git a/lightllm-kernel/cutlass/include/cutlass/detail/dependent_false.hpp b/lightllm-kernel/cutlass/include/cutlass/detail/dependent_false.hpp
deleted file mode 100755
index 76e52d2bf..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/detail/dependent_false.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::detail {
-
-/// @brief A bool constant that depends on one or more template parameters.
-///
-/// For more detailed documentation and use cases,
-/// please see `dependent_false` below.
-template <bool Value, class... Args>
-inline constexpr bool dependent_bool_value = Value;
-
-/// @brief An always-false value that depends on one or more template parameters.
-///
-/// This exists because `static_assert(false);` always fails,
-/// even if it occurs in the `else` branch of an `if constexpr`.
-/// The following example shows how to use `dependent_false` in that case.
-///
-/// @code
-/// template<class T>
-/// void foo (T t)
-/// {
-///     if constexpr (std::is_integral_v<T>) {
-///         do_integer_stuff(t);
-///     }
-///     else if constexpr (std::is_floating_point_v<T>) {
-///         do_floating_point_stuff(t);
-///     }
-///     else {
-///         static_assert(dependent_false<T>, "T must be "
-///             "an integral or floating-point type.");
-///     }
-/// }
-/// @endcode
-///
-/// This implements the C++ Standard Library proposal P1830R1.
-///
-/// https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2019/p1830r1.pdf
-///
-/// That proposal is under review as of 2022/12/05.
-/// The following link shows P1830's current review status.
-///
-/// https://github.com/cplusplus/papers/issues/572
-///
-/// P2593R0 proposes an alternate solution to this problem,
-/// that would change the C++ language itself.
-///
-/// https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/p2593r0.html
-///
-/// For headers in this library, however, we only consider library solutions
-/// as work-arounds for future C++ features.
-template <class... Args>
-inline constexpr bool dependent_false = dependent_bool_value<false, Args...>;
-
-}  // end namespace cutlass::detail
diff --git a/lightllm-kernel/cutlass/include/cutlass/detail/helper_macros.hpp b/lightllm-kernel/cutlass/include/cutlass/detail/helper_macros.hpp
deleted file mode 100755
index 4cd895f14..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/detail/helper_macros.hpp
+++ /dev/null
@@ -1,205 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Helper macros for the CUTLASS library
-*/
-
-#pragma once
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-#ifdef CUTLASS_NAMESPACE
-#define concat_tok(a, b) a ## b
-#define mkcutlassnamespace(pre, ns) concat_tok(pre, ns)
-#define cutlass mkcutlassnamespace(cutlass_, CUTLASS_NAMESPACE)
-#endif
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-#if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
-#define CUTLASS_HOST_DEVICE __forceinline__ __device__ __host__
-#define CUTLASS_DEVICE __forceinline__ __device__
-#elif defined(__CUDACC_RTC__)
-#define CUTLASS_HOST_DEVICE __forceinline__ __device__
-#define CUTLASS_DEVICE __forceinline__ __device__
-#else
-#define CUTLASS_HOST_DEVICE inline
-#define CUTLASS_DEVICE inline
-#endif
-
-#define CUTLASS_HOST __host__
-#define CUTLASS_GLOBAL __global__ static
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template<typename T>
-CUTLASS_HOST_DEVICE void __CUTLASS_UNUSED(T const &) 
-{ }
-
-#if defined(__GNUC__)
-  #define CUTLASS_UNUSED(expr) __CUTLASS_UNUSED(expr)
-#else
-  #define CUTLASS_UNUSED(expr) do { ; } while (&expr != &expr)
-#endif
-
-#ifdef _MSC_VER
-// Provides support for alternative operators 'and', 'or', and 'not'
-#include <iso646.h>
-#endif // _MSC_VER
-
-#if !defined(__CUDACC_RTC__)
-#include <assert.h>
-#endif
-
-#if defined(__CUDA_ARCH__)
-  #if defined(_MSC_VER)
-    #define CUTLASS_NOT_IMPLEMENTED() { printf("%s not implemented\n", __FUNCSIG__); asm volatile ("brkpt;\n"); }
-  #else
-    #define CUTLASS_NOT_IMPLEMENTED() { printf("%s not implemented\n", __PRETTY_FUNCTION__); asm volatile ("brkpt;\n"); }
-  #endif
-#else
-  #if defined(_MSC_VER)
-    #define CUTLASS_NOT_IMPLEMENTED() assert(0 && __FUNCSIG__)
-  #else
-    #define CUTLASS_NOT_IMPLEMENTED() assert(0 && __PRETTY_FUNCTION__)
-  #endif
-#endif
-
-// CUTLASS_CMATH_NAMESPACE is the namespace where code can find
-// <cmath> functions like isnan and log.  Such functions are in
-// the std namespace in host code, but in the global namespace
-// in device code.
-//
-// The intended use case for this macro is in "using" declarations
-// for making argument-dependent lookup (ADL) work in generic code.
-// For example, if T is cutlass::half_t, the following code will
-// invoke cutlass::isnan(half_t).  If T is float, it will invoke
-// std::isnan on host and ::isnan on device.  (CUTLASS's support
-// for NVRTC prevents it from using things in the std namespace
-// in device code.)  Correct use of "using" declarations can help
-// avoid unexpected implicit conversions, like from half_t to float.
-//
-// template<class T>
-// bool foo(T x) {
-//   using CUTLASS_CMATH_NAMESPACE :: isnan;
-//   return isnan(x);
-// }
-//
-// Without this macro, one would need to write the following.
-//
-// template<class T>
-// bool foo(T x) {
-// #if defined(__CUDA_ARCH__)
-//   using ::isnan;
-// #else
-//   using std::isnan;
-// #endif
-//   return isnan(x);
-// }
-
-#if defined(__CUDA_ARCH__)
-#  define CUTLASS_CMATH_NAMESPACE
-#else
-#  define CUTLASS_CMATH_NAMESPACE std
-#endif
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-
-
-#ifndef CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED
-#define CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED 0
-#endif
-
-
-// CUDA 10.1 introduces the mma instruction
-#if !defined(CUTLASS_ENABLE_TENSOR_CORE_MMA)
-#define CUTLASS_ENABLE_TENSOR_CORE_MMA 0
-#endif
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-#define CUTLASS_ASSERT(x) assert(x)
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// CUTLASS_PRAGMA_(UNROLL|NO_UNROLL) optimization directives for the CUDA compiler.
-#if defined(__CUDA_ARCH__) && !defined(__INTELLISENSE__)
-  #if defined(__CUDACC_RTC__) || (defined(__clang__) && defined(__CUDA__))
-    #define CUTLASS_PRAGMA_UNROLL _Pragma("unroll")
-    #define CUTLASS_PRAGMA_NO_UNROLL _Pragma("unroll 1")
-  #else
-    #define CUTLASS_PRAGMA_UNROLL #pragma unroll
-    #define CUTLASS_PRAGMA_NO_UNROLL #pragma unroll 1
-  #endif
-
-  #define CUTLASS_GEMM_LOOP CUTLASS_PRAGMA_NO_UNROLL
-
-#else
-
-    #define CUTLASS_PRAGMA_UNROLL
-    #define CUTLASS_PRAGMA_NO_UNROLL
-    #define CUTLASS_GEMM_LOOP
-
-#endif
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-#if !defined(__CUDACC_RTC__)
-#define CUTLASS_THREAD_LOCAL thread_local
-#else
-#define CUTLASS_THREAD_LOCAL
-#endif
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-#if defined(_MSVC_LANG)
-#  define CUTLASS_CPLUSPLUS _MSVC_LANG
-#else
-#  define CUTLASS_CPLUSPLUS __cplusplus
-#endif
-
-#if (201700L <= CUTLASS_CPLUSPLUS)
-#define CUTLASS_CONSTEXPR_IF_CXX17 constexpr
-#define CUTLASS_CXX17_OR_LATER 1
-#else
-#define CUTLASS_CONSTEXPR_IF_CXX17
-#define CUTLASS_CXX17_OR_LATER 0
-#endif
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-}; // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/detail/layout.hpp b/lightllm-kernel/cutlass/include/cutlass/detail/layout.hpp
deleted file mode 100755
index cbed61f68..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/detail/layout.hpp
+++ /dev/null
@@ -1,406 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cute/layout.hpp"
-#include "cute/pointer_sparse.hpp"       // cute::is_sparse
-#include "cute/swizzle.hpp"              // cute::Swizzle
-#include "cute/swizzle_layout.hpp"       // cute::detail::get_swizzle_portion
-#include "cute/util/type_traits.hpp"
-#include "cute/arch/copy_sm90_tma.hpp"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/detail/collective.hpp"
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::detail {
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// For each cutlass::layout, provides its corresponding cute stride types, 64b by default
-
-template <class L>
-struct TagToStrideA {
-  using type = L;
-};
-
-// Maps to modes [M, K, L]
-template <>
-struct TagToStrideA<layout::RowMajor> {
-  using type = cute::Stride<int64_t, cute::Int<1>, int64_t>;
-  using tag = layout::RowMajor;
-};
-
-// Maps to modes [M, K, L]
-template <>
-struct TagToStrideA<layout::ColumnMajor> {
-  using type = cute::Stride<cute::Int<1>, int64_t, int64_t>;
-  using tag = layout::ColumnMajor;
-};
-
-template <class L>
-struct TagToStrideB {
-  using type = L;
-};
-
-// Maps to modes [N, K, L]
-template <>
-struct TagToStrideB<layout::RowMajor> {
-  using type = cute::Stride<cute::Int<1>, int64_t, int64_t>;
-  using tag = layout::RowMajor;
-};
-
-// Maps to modes [N, K, L]
-template <>
-struct TagToStrideB<layout::ColumnMajor> {
-  using type = cute::Stride<int64_t, cute::Int<1>, int64_t>;
-  using tag = layout::ColumnMajor;
-};
-
-// For each cutlass::layout *, provides its corresponding cute stride types, 64b by default
-// Used by pointer array and grouped gemm
-// Maps to modes [M, K, L]
-template <>
-struct TagToStrideA<layout::RowMajor *> {
-  using UnderlyingType = cute::Stride<int64_t, cute::Int<1>, cute::Int<0>>;
-  using type = UnderlyingType*;
-  using tag = layout::RowMajor;
-};
-
-// Maps to modes [M, K, L]
-template <>
-struct TagToStrideA<layout::ColumnMajor *> {
-  using UnderlyingType = cute::Stride<cute::Int<1>, int64_t, cute::Int<0>>;
-  using type = UnderlyingType*;
-  using tag = layout::ColumnMajor;
-};
-
-// Maps to modes [N, K, L]
-template <>
-struct TagToStrideB<layout::RowMajor *> {
-  using UnderlyingType = cute::Stride<cute::Int<1>, int64_t, cute::Int<0>>;
-  using type = UnderlyingType*;
-  using tag = layout::RowMajor;
-};
-
-// Maps to modes [N, K, L]
-template <>
-struct TagToStrideB<layout::ColumnMajor *> {
-  using UnderlyingType = cute::Stride<int64_t, cute::Int<1>, cute::Int<0>>;
-  using type = UnderlyingType*;
-  using tag = layout::ColumnMajor;
-};
-
-// Maps to modes [M, N, L]
-template <class LayoutTag>
-struct TagToStrideC : TagToStrideA<LayoutTag> { };
-
-// Conv: Maps to modes ((P,N), C, _0) for compatiblity with GEMM epilogues expecting a batch mode stride
-template <>
-struct TagToStrideC<cutlass::layout::TensorNWC> {
-  using type = cute::Stride<cute::Stride<int64_t, int64_t>, cute::Int<1>, cute::Int<0>>;
-};
-
-// Conv: Maps to modes ((P,Q,N), C, _0) for compatiblity with GEMM epilogues expecting a batch mode stride
-template <>
-struct TagToStrideC<cutlass::layout::TensorNHWC> {
-  using type = cute::Stride<cute::Stride<int64_t, int64_t, int64_t>, cute::Int<1>, cute::Int<0>>;
-};
-
-// Conv: Maps to modes ((P,Q,Z,N), C, _0) for compatiblity with GEMM epilogues expecting a batch mode stride
-template <>
-struct TagToStrideC<cutlass::layout::TensorNDHWC> {
-  using type = cute::Stride<cute::Stride<int64_t, int64_t, int64_t, int64_t>, cute::Int<1>, cute::Int<0>>;
-};
-
-// Conv: Maps to modes (K, (C,S), _0) for compatiblity with GEMM epilogues expecting a batch mode stride
-template <>
-struct TagToStrideC<cutlass::layout::TensorKCS> {
-  using type = cute::Stride<int64_t, cute::Stride<cute::Int<1>, int64_t>, cute::Int<0>>;
-};
-
-// Conv: Maps to modes (K, (C,S,R), _0) for compatiblity with GEMM epilogues expecting a batch mode stride
-template <>
-struct TagToStrideC<cutlass::layout::TensorKCSR> {
-  using type = cute::Stride<int64_t, cute::Stride<cute::Int<1>, int64_t, int64_t>, cute::Int<0>>;
-};
-
-// Conv: Maps to modes (K, (C,S,R,T), _0) for compatiblity with GEMM epilogues expecting a batch mode stride
-template <>
-struct TagToStrideC<cutlass::layout::TensorKCSRT> {
-  using type = cute::Stride<int64_t, cute::Stride<cute::Int<1>, int64_t, int64_t, int64_t>, cute::Int<0>>;
-};
-
-// Conv: Maps to modes ((C,S), K, _0) for compatiblity with GEMM epilogues expecting a batch mode stride
-template <>
-struct TagToStrideC<cutlass::layout::TensorCSK> {
-  using type = cute::Stride<cute::Stride<cute::Int<1>, int64_t>, int64_t, cute::Int<0>>;
-};
-
-// Conv: Maps to modes ((C,S,R), K, _0) for compatiblity with GEMM epilogues expecting a batch mode stride
-template <>
-struct TagToStrideC<cutlass::layout::TensorCSRK> {
-  using type = cute::Stride<cute::Stride<cute::Int<1>, int64_t, int64_t>, int64_t, cute::Int<0>>;
-};
-
-// Conv: Maps to modes ((C,S,R,T), K, _0) for compatiblity with GEMM epilogues expecting a batch mode stride
-template <>
-struct TagToStrideC<cutlass::layout::TensorCSRTK> {
-  using type = cute::Stride<cute::Stride<cute::Int<1>, int64_t, int64_t, int64_t>, int64_t, cute::Int<0>>;
-};
-
-// Convenience aliases
-template<class LayoutTag>
-using TagToStrideA_t = typename TagToStrideA<LayoutTag>::type;
-
-template<class LayoutTag>
-using TagToStrideB_t = typename TagToStrideB<LayoutTag>::type;
-
-template<class LayoutTag>
-using TagToStrideC_t = typename TagToStrideC<LayoutTag>::type;
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// For 2.x compatibility APIs, provide stride->layout tag mappers
-
-template<int ModeIndex, class Stride>
-constexpr bool
-is_major(Stride = {}) {
-  // Account for stride types with and without batch mode and batch modes with static zero stride
-  return cute::is_constant<1, decltype(cute::front(cute::get<ModeIndex>(cute::remove_pointer_t<Stride>{})))>::value;
-}
-
-template<int ModeIndex, class Shape, class Stride>
-constexpr bool
-is_major(cute::Layout<Shape,Stride> = {}) {
-  return is_major<ModeIndex>(Stride{});
-}
-
-// Note : This method can be used for deducing the Layout Tag of A, C, D Matrices
-template<class StrideA>
-constexpr
-auto
-stride_to_layout_tag_A() {
-  using InternalStrideA = cute::remove_pointer_t<StrideA>;
-  if constexpr (cute::is_layout<InternalStrideA>::value) {
-    return stride_to_layout_tag_A<decltype(cute::stride(InternalStrideA{}))>();
-  }
-  else if constexpr (is_major<0, StrideA>()) { // M major
-    return layout::ColumnMajor{};
-  }
-  // Specialize for sparse layout
-  else if constexpr (cute::get<0>(InternalStrideA{}) == cute::_2{} && 
-                     cute::rank(cute::get<1>(InternalStrideA{})) == 2 && 
-                     cute::is_same_v<cute::_1, cute::remove_cvref_t<decltype(cute::get<1,0>(InternalStrideA{}))>>) {
-    return layout::ColumnMajor{};
-  }
-  else { // K major
-    return layout::RowMajor{};
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template<class StrideB>
-constexpr
-auto
-stride_to_layout_tag_B() {
-  using InternalStrideB = cute::remove_pointer_t<StrideB>;
-  if constexpr (cute::is_layout<InternalStrideB>::value) {
-    return stride_to_layout_tag_B<decltype(cute::stride(InternalStrideB{}))>();
-  }
-  else if constexpr (is_major<0, StrideB>()) { // N major
-    return layout::RowMajor{};
-  }
-  else { // K major
-    return layout::ColumnMajor{};
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-template<class StrideC>
-constexpr
-auto
-stride_to_layout_tag_C() {
-  using InternalStrideC = cute::remove_pointer_t<StrideC>;
-  if constexpr (cute::is_layout<InternalStrideC>::value) {
-    return stride_to_layout_tag_C<decltype(cute::stride(InternalStrideC{}))>();
-  }
-  else if constexpr (is_major<0, StrideC>()) { // M major
-    return layout::ColumnMajor{};
-  }
-  else { // N major
-    return layout::RowMajor{};
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-// Utilities to map Stride back on to their corresponding layout tags
-template <class S>
-struct StrideToLayoutTagA {
-  using type = decltype(detail::stride_to_layout_tag_A<S>());
-};
-
-template <class S>
-struct StrideToLayoutTagB {
-  using type = decltype(detail::stride_to_layout_tag_B<S>());
-};
-
-template <class S>
-struct StrideToLayoutTagC {
-  using type = decltype(detail::stride_to_layout_tag_C<S>());
-};
-
-// Convenience aliases
-template<class S>
-using StrideToLayoutTagA_t = typename StrideToLayoutTagA<S>::type;
-
-template<class S>
-using StrideToLayoutTagB_t = typename StrideToLayoutTagB<S>::type;
-
-template<class S>
-using StrideToLayoutTagC_t = typename StrideToLayoutTagC<S>::type;
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Inspects a tiled copy and whether its copy engine is TMA or not
-template<class GmemTiledCopy>
-constexpr bool is_tma_copy_engine() {
-  if constexpr (cute::is_void_v<GmemTiledCopy>) {
-    return false;
-  }
-  else {
-   if constexpr (   cute::is_base_of_v<cute::SM90_TMA_LOAD,                         GmemTiledCopy>
-                  || cute::is_base_of_v<cute::SM90_TMA_LOAD_MULTICAST,              GmemTiledCopy>
-                  || cute::is_base_of_v<cute::SM90_TMA_LOAD_IM2COL,                 GmemTiledCopy>
-                  || cute::is_base_of_v<cute::SM90_TMA_LOAD_IM2COL_MULTICAST,       GmemTiledCopy>
-                  || cute::is_base_of_v<cute::SM90_TMA_STORE,                       GmemTiledCopy>
-                  || cute::is_base_of_v<cute::SM90_TMA_STORE_IM2COL,                GmemTiledCopy>
-                  ) {
-      return true;
-    }
-  }
-  return false;
-}
-
-template <class X, class = void>
-struct RawDtype { using type = X; };
-
-template <class X>
-struct RawDtype<X,cute::void_t<typename X::raw_type>> { using type = typename X::raw_type; };
-
-
-// Inspects a TiledCopy and returns its alignment in terms of element count
-template <class GmemTiledCopy, class Element, class ElementMma = Element>
-constexpr int
-get_alignment_count_from_gmem_tiled_copy() {
-
-  if constexpr (cute::is_void_v<GmemTiledCopy>) {
-    return 1;
-  }
-
-  // Account for ElementC = void kernels
-  else if constexpr (cute::is_void_v<Element>) {
-    return 0;
-  }
-
-  else {
-    // For TMA tiled copies, we know the alignment has to be 128 bits
-    if constexpr (is_tma_copy_engine<GmemTiledCopy>()) {
-      // For sparse MMA, alignment in logical elements is increased by sparsity factor
-      if constexpr (cute::is_sparse_v<ElementMma>) {
-        return 128 / sizeof_bits<Element>::value * ElementMma::sparsity;
-      }
-      return 128 / sizeof_bits<Element>::value;
-    }
-    else {
-      // For non-TMA tiled copies, TiledCopy holds the alignment count directly in its TiledShape_MN
-      return GmemTiledCopy::NumValSrc;
-    }
-  }
-}
-
-// Return alignment bit requirements for the GEMM inputs.
-template <
-  class ElementType
->
-constexpr int
-get_input_alignment_bits() {
-  return 128;
-}
-
-// Return alignment bit requirements for the GEMM outputs.
-template <class ElementType>
-constexpr int
-get_output_alignment_bits() {
-  return 128;
-}
-
-// Check if tensor layout satisfies a given major alignment
-template<int Alignment, class Shape, class Stride>
-CUTLASS_HOST_DEVICE constexpr
-bool
-check_alignment(cute::Layout<Shape,Stride> const& layout) {
-  // Condition: shape must divide by Alignment without rounding
-  bool shape_check = cute::size(layout.shape()) == Alignment * cute::size(cute::upcast<Alignment>(layout));
-  // Condition: every dynamic stride must be a multiple of Alignment
-  bool stride_check = cute::all_of(cute::flatten(layout.stride()), [](auto s){ return cute::is_static<decltype(s)>::value || (s % Alignment == 0); });
-  return shape_check && stride_check;
-}
-
-// Check if tensor layout satisfies a given major alignment
-template<int Alignment, class Shape, class Stride>
-CUTLASS_HOST_DEVICE constexpr
-bool
-check_alignment(Shape const& shape, Stride const& stride) {
-  return check_alignment<Alignment>(cute::make_layout(shape, stride));
-}
-
-template<int B, int M, int S>
-CUTLASS_HOST_DEVICE constexpr
-size_t
-alignment_for_swizzle(cute::Swizzle<B, M, S>) {
-  static_assert(B >= 0 and M >= 0);
-  return size_t(1) << size_t(B + M + cute::abs(S));
-}
-
-template<class Layout>
-CUTLASS_HOST_DEVICE constexpr
-size_t
-alignment_for_swizzle(Layout layout) {
-  return alignment_for_swizzle(cute::detail::get_swizzle_portion(layout));
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::detail
diff --git a/lightllm-kernel/cutlass/include/cutlass/detail/mma.hpp b/lightllm-kernel/cutlass/include/cutlass/detail/mma.hpp
deleted file mode 100755
index 0e491b9c4..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/detail/mma.hpp
+++ /dev/null
@@ -1,71 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/arch/mma.h"
-#include "cute/layout.hpp"
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::detail {
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <class TiledMma, class = void>
-struct IsSparseTensorOp : cute::false_type { };
-
-// TiledMma for sparse must have ValTypeE
-template <class TiledMma>
-struct IsSparseTensorOp<TiledMma, cute::void_t<typename TiledMma::ValTypeE>>
-    : cute::true_type { };
-
-// The following metafunction is used to extract the OperatorClass from a cutlass 3.x kernel.
-template <class TiledMma>
-struct get_operator_class {
-  static constexpr bool is_sparse_op = IsSparseTensorOp<TiledMma>::value;
-  static constexpr bool is_tensor_op = cute::size<0>(typename TiledMma::AtomShape_MNK{}) >= 8;
-  using type = cute::conditional_t<
-                is_tensor_op, 
-                cute::conditional_t<
-                  is_sparse_op,
-                  cutlass::arch::OpClassSparseTensorOp,
-                    cutlass::arch::OpClassTensorOp
-                  >,
-                cutlass::arch::OpClassSimt
-                >;
-};
-
-template <class T>
-using get_operator_class_t = typename get_operator_class<T>::type;
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::detail
diff --git a/lightllm-kernel/cutlass/include/cutlass/device_kernel.h b/lightllm-kernel/cutlass/include/cutlass/device_kernel.h
deleted file mode 100755
index 7af5d96cf..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/device_kernel.h
+++ /dev/null
@@ -1,125 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for generic CUTLASS kernel.
-*/
-
-#pragma once
-
-// __grid_constant__ was introduced in CUDA 11.7.
-#if ((__CUDACC_VER_MAJOR__ >= 12) || ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 7)))
-#  define CUTLASS_GRID_CONSTANT_SUPPORTED
-#endif
-
-// __grid_constant__ can be enabled only on SM70+
-#if defined(CUTLASS_GRID_CONSTANT_SUPPORTED) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700)
-#  define CUTLASS_GRID_CONSTANT_ENABLED
-#endif
-
-#if ! defined(CUTLASS_GRID_CONSTANT)
-#  if defined(CUTLASS_GRID_CONSTANT_ENABLED)
-#    define CUTLASS_GRID_CONSTANT __grid_constant__
-#  else
-#    define CUTLASS_GRID_CONSTANT
-#  endif
-#endif
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-
-template <typename T>   struct Type2Type  {  using type=T;                    };
-// using the simple type to replace the complex type to reduce this symbol size
-template <typename  T>                                                                        struct GetUnderlyingKernel                              : public Type2Type<T>               {};
-template <uint64_t shader_guid, unsigned index, template <uint64_t, unsigned> class Wrapper > struct GetUnderlyingKernel<Wrapper<shader_guid,index>>  : public Wrapper<shader_guid,index> {};
-template <typename  T>                                                                        using  GetUnderlyingKernel_t                            = typename GetUnderlyingKernel<T>::type;
-
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Generic CUTLASS kernel template.
-template <typename Operator>
-CUTLASS_GLOBAL
-void Kernel(typename Operator::Params params) {
-  // Dynamic shared memory base pointer
-  extern __shared__ int SharedStorageBase[];
-  // Declare pointer to dynamic shared memory.
-  typename Operator::SharedStorage *shared_storage =
-      reinterpret_cast<typename Operator::SharedStorage *>(SharedStorageBase);
-
-  Operator op;
-
-  op(params, *shared_storage);
-  cutlass::arch::synclog_print();
-}
-
-
-/// Generic CUTLASS kernel template.
-template <typename Operator>
-CUTLASS_GLOBAL
-void Kernel2(typename Operator::Params params) {
-  // Dynamic shared memory base pointer
-  extern __shared__ int SharedStorageBase[];
-  // Declare pointer to dynamic shared memory.
-  typename Operator::SharedStorage *shared_storage =
-      reinterpret_cast<typename Operator::SharedStorage *>(SharedStorageBase);
-
-  Operator::invoke(params, *shared_storage);
-  cutlass::arch::synclog_print();
-
-}
-
-
-////////////////////////////////////////////////////////////////////////////////
-//
-// 3.0 specific launch
-//
-////////////////////////////////////////////////////////////////////////////////
-
-/// Generic CUTLASS kernel template.
-template <typename Operator>
-CUTLASS_GLOBAL
-#ifdef __CUDACC__
-// Enclosing this in __CUDACC__ suppresses MSVC warnings.
-__launch_bounds__(Operator::MaxThreadsPerBlock, Operator::MinBlocksPerMultiprocessor)
-#endif // __CUDACC__
-void device_kernel(CUTLASS_GRID_CONSTANT typename Operator::Params const params)
-{
-  // Dynamic shared memory base pointer
-  extern __shared__ char smem[];
-  Operator op;
-  op(params, smem);
-  cutlass::arch::synclog_print();
-
-}
-
-////////////////////////////////////////////////////////////////////////////////
-} /// namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/builders/sm90_builder.inl b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/builders/sm90_builder.inl
deleted file mode 100755
index 759591b5d..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/builders/sm90_builder.inl
+++ /dev/null
@@ -1,812 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cute/atom/mma_traits_sm90.hpp"
-#include "cute/atom/mma_traits_sm90_gmma.hpp"
-#include "cute/atom/copy_traits_sm90.hpp"
-
-#include "cutlass/detail/dependent_false.hpp"
-#include "cutlass/detail/layout.hpp"
-#include "cutlass/gemm/collective/builders/sm90_common.inl"
-#include "cutlass/epilogue/dispatch_policy.hpp"
-#include "cutlass/epilogue/collective/collective_epilogue.hpp"
-#include "cutlass/epilogue/collective/builders/sm90_common.inl"
-#include "cutlass/epilogue/thread/linear_combination.h"
-#include "cutlass/epilogue/thread/linear_combination_generic.h"
-#include "cutlass/epilogue/thread/linear_combination_bias_elementwise.h"
-#include "cutlass/epilogue/fusion/callbacks.hpp"
-#include "cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp"
-
-#if defined(__CUDACC_RTC__)
-#include <cuda/std/type_traits>
-#else
-#include <type_traits>
-#endif
-
-///////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::epilogue::collective {
-
-///////////////////////////////////////////////////////////////////////////////
-
-namespace detail {
-
-// Returns the parameterized dispatch policy for the TMA epilogue
-template<class TileShapeMNK, class EpilogueTileMN, class ElementC, class ElementD, class Schedule>
-constexpr auto
-sm90_get_tma_dispatch_policy() {
-  using namespace cute;
-
-  constexpr int EpiTiles = size(shape_div(take<0,2>(TileShapeMNK{}), EpilogueTileMN{}));
-  constexpr int FragmentSize = size(EpilogueTileMN{}) / (detail::sm90_is_cooperative_v<Schedule> ? 256 : 128);
-  // 8b residuals load fast and consume little smem, so the perf cost of waiting on stores to finish outweighs the cost of extra allocation
-  constexpr bool ReuseSmem = (sizeof_bits_v<ElementC> == sizeof_bits_v<ElementD>) && (sizeof_bits_v<ElementD> > 8);
-  // TMA store delay performs worse with residual loads and compilicates tensormap updates for Ptr-Array GEMMs
-  constexpr bool DelayTmaStore = is_void_v<ElementC> && !detail::sm90_is_ptr_array_tma_v<Schedule>;
-  constexpr int StagesD = cute::min(EpiTiles, 2);
-  constexpr int StagesC = ReuseSmem ? cute::max(cute::min(EpiTiles, 4), StagesD+1)
-                                    : cute::min(EpiTiles, 4);
-
-  if constexpr (detail::sm90_is_ptr_array_tma_v<Schedule>) {
-      return Sm90PtrArrayTmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmem, 
-                                            DelayTmaStore, Schedule::NumEpilogueWarpGroups>{};
-  } 
-  else {
-    return Sm90TmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmem, DelayTmaStore>{};
-  }
-}
-
-// Returns the smem layout atom to be used for C or D matrix
-template<class GmemStrideType, class Element, class EpilogueTile_MN>
-constexpr auto
-sm90_get_epilogue_smem_swizzle_layout_atom() {
-  using namespace cute;
-
-  // ColMajor C/D (M-major)
-  if constexpr (cutlass::gemm::detail::is_major<0>(GmemStrideType{})) {
-    return cutlass::gemm::collective::detail::ss_smem_selector<
-      cute::GMMA::Major::MN, Element, decltype(get<0>(EpilogueTile_MN{})), decltype(get<1>(EpilogueTile_MN{}))
-    >();
-  }
-  // RowMajor C/D (N-major)
-  else if constexpr (cutlass::gemm::detail::is_major<1>(GmemStrideType{})) {
-    return cutlass::gemm::collective::detail::ss_smem_selector<
-      cute::GMMA::Major::K , Element, decltype(get<0>(EpilogueTile_MN{})), decltype(get<1>(EpilogueTile_MN{}))
-    >();
-  }
-  else {
-    static_assert(cutlass::detail::dependent_false<GmemStrideType>, "Unsupported gmem layout.");
-  }
-}
-
-// Attempts to compute a reasonable epilogue tile based on block tile shape or allows the user to provide one.
-template <class ElementD, class EpilogueTileType, class Schedule, class TileShape_MNK>
-constexpr auto
-sm90_compute_tile_shape_or_override() {
-  if constexpr (cute::is_same_v<EpilogueTileType, EpilogueTileAuto>) {
-    auto epi_tile = [&] () {
-      if constexpr (detail::sm90_is_cooperative_v<Schedule>) {
-        auto tile_m = cute::min(_128{}, size<0>(TileShape_MNK{}));
-        auto tile_n = cute::min(_32{}, size<1>(TileShape_MNK{}));
-        return make_shape(tile_m, tile_n);
-      }
-      else if constexpr (detail::sm90_is_warp_specialized_v<Schedule>) {
-        constexpr int N_perf = sizeof_bits_v<ElementD> == 8 ? 64 : 32;
-        auto tile_m = cute::min(_64{}, size<0>(TileShape_MNK{}));
-        auto tile_n = cute::min(Int<N_perf>{}, size<1>(TileShape_MNK{}));
-        return make_shape(tile_m, tile_n);
-      }
-      else {
-        static_assert(cutlass::detail::dependent_false<Schedule>, "Unsupported schedule.");
-      }
-    }();
-
-    return cute::transform(epi_tile, seq<0,1>{},
-      [] (auto epi_tiler, auto I) {
-        auto cta_tiler = make_layout(get<I>(TileShape_MNK{}));
-        // This is a multimodal CTA tiler, transform before returning
-        if constexpr (depth(cta_tiler) > 0) {
-          // This is an implicit multimodal tiler, match profile and return
-          if constexpr (tuple_size_v<decltype(shape(cta_tiler))> == 1) {
-            return make_tile(epi_tiler);
-          }
-          // This is an explicit multimodal tiler, compose out epi tiler
-          else {
-            return composition(cta_tiler, epi_tiler);
-          }
-        }
-        // This is a flat CTA tiler, no need for transformation
-        else {
-          return epi_tiler;
-        }
-      });
-  }
-  else if constexpr (cute::is_tuple<EpilogueTileType>::value) {
-    EpilogueTileType epi_tile;
-    constexpr int M = size<0>(shape(epi_tile));
-    constexpr int N = size<1>(shape(epi_tile));
-
-    static_assert(!is_layout<EpilogueTileType>::value, "EpilogueTile must be a cute::Tile or cute::Shape");
-    static_assert(M ==  64 && detail::sm90_is_warp_specialized_v<Schedule> ||
-                  M == 128 && detail::sm90_is_cooperative_v<Schedule>, "Unsupported tile shape");
-    static_assert(N % 16 == 0, "Unsupported tile shape");
-
-    return epi_tile;
-  }
-  else {
-    static_assert(cutlass::detail::dependent_false<EpilogueTileType>, "Invalid type for EpilogueTileType.");
-  }
-}
-
-// callbacks builder with TMA aux out
-template <
-  int StagesC,
-  int StagesD,
-  int FragmentSize,
-  bool ReuseSmemC,
-  bool DelayTmaStore,
-  class FusionOp,
-  class TileShape_MNK,
-  class EpilogueTile_MN,
-  class ElementAccumulator
->
-struct CallbacksBuilder<
-  Sm90TmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmemC, DelayTmaStore>,
-  FusionOp,
-  TileShape_MNK,
-  EpilogueTile_MN,
-  ElementAccumulator,
-  cute::enable_if_t<(FusionOp::IsAuxOutSupported ^ FusionOp::IsAuxInSupported) // only one aux tensor
-              && not cute::is_subbyte_v<typename FusionOp::ElementAux>>
-> {
-  using GmemStrideTypeAux = gemm::TagToStrideC_t<typename FusionOp::GmemLayoutTagAux>;
-  using SmemLayoutAtomAux = decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom<
-    GmemStrideTypeAux, typename FusionOp::ElementAux, EpilogueTile_MN>());
-  using CopyOpR2S = decltype(detail::sm90_get_smem_store_op_for_accumulator<
-    GmemStrideTypeAux, typename FusionOp::ElementAux>());
-  using CopyOpS2R = decltype(detail::sm90_get_smem_load_op_for_source<
-    GmemStrideTypeAux, typename FusionOp::ElementAux>());
-  using SmemCopyOpAux = cute::conditional_t<FusionOp::IsAuxOutSupported, CopyOpR2S, CopyOpS2R>;
-
-  using Callbacks = fusion::FusionCallbacks<
-    Sm90TmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmemC, DelayTmaStore>,
-    FusionOp, TileShape_MNK, EpilogueTile_MN,
-    SmemLayoutAtomAux, SmemCopyOpAux
-  >;
-};
-
-template <
-  int StagesC,
-  int StagesD,
-  int FragmentSize,
-  bool ReuseSmemC,
-  bool DelayTmaStore,
-  class FusionOp,
-  class TileShape_MNK,
-  class EpilogueTile_MN,
-  class ElementAccumulator
->
-struct CallbacksBuilder<
-  Sm90TmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmemC, DelayTmaStore>,
-  FusionOp,
-  TileShape_MNK,
-  EpilogueTile_MN,
-  ElementAccumulator,
-  cute::enable_if_t<(FusionOp::IsAuxOutSupported ^ FusionOp::IsAuxInSupported) // only one aux tensor
-              && sizeof_bits_v<typename FusionOp::ElementAux> == 1>
-> {
-  using Callbacks = fusion::FusionCallbacks<
-    Sm90TmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmemC, DelayTmaStore>,
-    FusionOp, TileShape_MNK, EpilogueTile_MN,
-    Layout<_1,_0>, DefaultCopy // aux bit tensor doesn't use smem
-  >;
-};
-
-// Helper for building TMA warp-specialized collective epilogues, specialized by
-// the fusion operation performed and the dispatch policy to use.
-template <
-  class TileShape_MNK,
-  class EpilogueTile_MN,
-  class ElementAccumulator,
-  class ElementCompute,
-  class ElementC_,
-  class GmemLayoutTagC_,
-  int AlignmentC,
-  class ElementD_,
-  class GmemLayoutTagD,
-  int AlignmentD,
-  class FusionOpOrCallbacks,
-  class DispatchPolicy
->
-struct Sm90TmaBuilderImpl {
-  // Passing void D disables destination store + smem allocation
-  using ElementD = cute::conditional_t<cute::is_void_v<ElementD_>,
-                     fusion::get_element_aux_t<FusionOpOrCallbacks>, ElementD_>;
-
-  // Passing void C disables source load + smem allocation
-  using ElementC = cute::conditional_t<cute::is_void_v<ElementC_>,ElementD,ElementC_>; // prevents void ref breakages
-  using GmemLayoutTagC = cute::conditional_t<cute::is_void_v<ElementC_>,GmemLayoutTagD,GmemLayoutTagC_>;
-
-  using GmemStrideTypeC = cutlass::detail::TagToStrideC_t<GmemLayoutTagC>;
-  using GmemStrideTypeD = cutlass::detail::TagToStrideC_t<GmemLayoutTagD>;
-  
-  using UnderlyingGmemStrideTypeC = cute::remove_pointer_t<GmemStrideTypeC>;
-  using UnderlyingGmemStrideTypeD = cute::remove_pointer_t<GmemStrideTypeD>;
-
-  using CopyOpS2G = cute::conditional_t<detail::is_im2col_mode<GmemLayoutTagD>,
-      SM90_TMA_STORE_IM2COL,
-      SM90_TMA_STORE
-    >;
-  using CopyOpG2S = cute::conditional_t<detail::is_im2col_mode<GmemLayoutTagC>,
-      SM90_TMA_LOAD_IM2COL,
-      SM90_TMA_LOAD
-    >;
-
-  // Get the smallest tiled copy we can use to retile the accumulators
-  using CopyAtomC = Copy_Atom<SM90_U32x4_STSM_N, cutlass::half_t>;
-  // Get register to register tiled copy that happen before shared memory store.
-  // Apply void as no register transform op needed currently.
-  using CopyOpR2R = void;
-
-  // TMA builder allows for passing callbacks directly, which is either a fusion::FusionCallbacks
-  // instance or a direct visitor implementation, e.g. fusion::Sm90LinearCombination
-  using FusionCallbacks = 
-    typename CallbacksBuilder<
-      DispatchPolicy,
-      FusionOpOrCallbacks,
-      TileShape_MNK,
-      EpilogueTile_MN,
-      ElementAccumulator
-    >::Callbacks;
-
-  using CollectiveOp = cutlass::epilogue::collective::CollectiveEpilogue<
-      DispatchPolicy,
-      TileShape_MNK,
-      EpilogueTile_MN,
-      ElementC_, // Need to pass void through to expose via GemmUniversal
-      GmemStrideTypeC,
-      ElementD_,
-      GmemStrideTypeD,
-      FusionCallbacks,
-      CopyOpG2S,
-      decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom<UnderlyingGmemStrideTypeC, ElementC, EpilogueTile_MN>()),
-      decltype(detail::sm90_get_smem_load_op_for_source<UnderlyingGmemStrideTypeC, ElementC>()),
-      CopyOpS2G,
-      decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom<UnderlyingGmemStrideTypeD, ElementD, EpilogueTile_MN>()),
-      decltype(detail::sm90_get_smem_store_op_for_accumulator<UnderlyingGmemStrideTypeD, ElementD>()),
-      CopyAtomC,
-      CopyOpR2R
-    >;
-};
-
-///////////////////////////////////////////////////////////////////////////////
-// Descriptor classes for defining EVT nodes
-// Some of the epilogue visitor nodes require non-intuitive template arguments
-// such as CopyOpS2R for AuxLoad node. Traditionaly, these are resolved by the
-// builder classes. Here we provide a set of descriptor classes that resolve
-// these template arguments from more intuitive types such as Stride, Layout
-
-// Get TileShape, EpilogueTile, Dispatch Policy, StagesC, and STagesD
-template<
-  typename TileShape_MNK,
-  typename EpilogueTileType, 
-  typename ElementC,
-  typename ElementD,
-  typename Schedule
->
-struct EpilogueDescriptor {
-  using TileShape = TileShape_MNK;
-  using EpilogueTile = 
-    decltype(
-      detail::sm90_compute_tile_shape_or_override<
-        ElementD, EpilogueTileType, Schedule, TileShape_MNK
-      >()
-    );
-  using DispatchPolicy = 
-    decltype(
-      detail::sm90_get_tma_dispatch_policy<
-        TileShape_MNK, EpilogueTile, 
-        ElementC, ElementD, Schedule
-      >()
-    );
-  constexpr static int StagesC = DispatchPolicy::StagesC;
-  constexpr static int StagesD = DispatchPolicy::StagesD;
-};
-
-// Get Stride, SmemLayout, and CopyOpS2R for AuxLoad node
-template<
-  typename EpilogueDescriptor,
-  typename StrideOrLayoutTag,
-  typename ElementAux
->
-struct AuxLoadDescriptor {
-  constexpr static int Stages = EpilogueDescriptor::StagesC;
-  using EpilogueTile = typename EpilogueDescriptor::EpilogueTile;
-  using Element = ElementAux;
-  using Stride = cutlass::detail::TagToStrideC_t<StrideOrLayoutTag>;
-  using SmemLayoutAtom =
-    decltype(
-      detail::sm90_get_epilogue_smem_swizzle_layout_atom<
-        Stride, ElementAux, typename EpilogueDescriptor::EpilogueTile
-      >()
-    );
-  using CopyOpS2R =
-    decltype(detail::sm90_get_smem_load_op_for_source<Stride, ElementAux>());
-};
-
-// Get Stride, SmemLayout, and CopyOpS2R for AuxStore node
-template<
-  typename EpilogueDescriptor,
-  typename StrideOrLayoutTag,
-  typename ElementAux
->
-struct AuxStoreDescriptor {
-  constexpr static int Stages = EpilogueDescriptor::StagesD;
-  using EpilogueTile = typename EpilogueDescriptor::EpilogueTile;
-  using Element = ElementAux;
-  using Stride = cutlass::detail::TagToStrideC_t<StrideOrLayoutTag>;
-  using SmemLayoutAtom =
-    decltype(
-      detail::sm90_get_epilogue_smem_swizzle_layout_atom<
-        Stride, ElementAux, typename EpilogueDescriptor::EpilogueTile
-      >()
-    );
-  using CopyOpR2S =
-    decltype(detail::sm90_get_smem_store_op_for_accumulator<Stride, ElementAux>());
-};
-
-} // namespace detail
-
-///////////////////////////////////////////////////////////////////////////////
-
-// No-smem builder
-template <
-  class OpClass,
-  class TileShape_MNK,
-  class ClusterShape_MNK,
-  class EpilogueTileType,
-  class ElementAccumulator,
-  class ElementCompute,
-  class ElementC_,
-  class GmemLayoutTagC_,
-  int AlignmentC,
-  class ElementD,
-  class GmemLayoutTagD,
-  int AlignmentD,
-  class Schedule,
-  FloatRoundStyle RoundStyle
->
-struct CollectiveBuilder<
-    arch::Sm90,
-    OpClass,
-    TileShape_MNK,
-    ClusterShape_MNK,
-    EpilogueTileType,
-    ElementAccumulator,
-    ElementCompute,
-    ElementC_,
-    GmemLayoutTagC_,
-    AlignmentC,
-    ElementD,
-    GmemLayoutTagD,
-    AlignmentD,
-    Schedule,
-    fusion::LinearCombination<ElementD,ElementCompute,ElementC_,ElementCompute,RoundStyle>,
-    cute::enable_if_t<cute::is_same_v<Schedule, NoSmemWarpSpecialized> ||
-                      cute::is_same_v<Schedule, PtrArrayNoSmemWarpSpecialized> >> {
-
-  // Passing void C disables source load
-  using ElementC = cute::conditional_t<cute::is_void_v<ElementC_>,
-      ElementD, ElementC_>; // prevents cute breakages
-  using GmemLayoutTagC = cute::conditional_t<cute::is_void_v<ElementC_>,
-      GmemLayoutTagD, GmemLayoutTagC_>;
-  static constexpr thread::ScaleType::Kind ScaleType = cute::is_void_v<ElementC_> ?
-      thread::ScaleType::OnlyAlphaScaling : thread::ScaleType::Default;
-
-  static constexpr int FragmentSize = 1;
-  using ThreadOp = thread::LinearCombination<
-    ElementD, FragmentSize, ElementAccumulator, ElementCompute,
-    ScaleType, RoundStyle, ElementC>;
-
-  using CollectiveOp = cute::conditional_t<
-    cute::is_same_v<Schedule, NoSmemWarpSpecialized>,
-    cutlass::epilogue::collective::detail::Sm90TmaWarpSpecializedAdapter<
-      cutlass::epilogue::collective::DefaultEpilogue<
-        cutlass::detail::TagToStrideC_t<GmemLayoutTagC>,
-        cutlass::detail::TagToStrideC_t<GmemLayoutTagD>,
-        ThreadOp,
-        cutlass::gemm::EpilogueDefault>>,
-    // Epilogue for Ptr-Array and Grouped Gemm
-    cutlass::epilogue::collective::detail::Sm90TmaWarpSpecializedAdapter<
-      cutlass::epilogue::collective::DefaultEpilogueArray<
-        cutlass::detail::TagToStrideC_t<GmemLayoutTagC>,
-        cutlass::detail::TagToStrideC_t<GmemLayoutTagD>,
-        ThreadOp,
-        Schedule>>
-    >;
-};
-
-// Tma warp-specialized builder
-template <
-  class OpClass,
-  class TileShape_MNK,
-  class ClusterShape_MNK,
-  class EpilogueTileType,
-  class ElementAccumulator,
-  class ElementCompute,
-  class ElementC,
-  class GmemLayoutTagC,
-  int AlignmentC,
-  class ElementD_,
-  class GmemLayoutTagD,
-  int AlignmentD,
-  class Schedule,
-  class FusionOperation
->
-struct CollectiveBuilder<
-    arch::Sm90,
-    OpClass,
-    TileShape_MNK,
-    ClusterShape_MNK,
-    EpilogueTileType,
-    ElementAccumulator,
-    ElementCompute,
-    ElementC,
-    GmemLayoutTagC,
-    AlignmentC,
-    ElementD_,
-    GmemLayoutTagD,
-    AlignmentD,
-    Schedule,
-    FusionOperation,
-    cute::enable_if_t<cute::is_same_v<Schedule, TmaWarpSpecialized> ||
-                      cute::is_same_v<Schedule, TmaWarpSpecializedCooperative> ||
-                      detail::sm90_is_ptr_array_tma_v<Schedule>>> {
-private:
-  using ElementD = cute::conditional_t<cute::is_void_v<ElementD_>,
-                     fusion::get_element_aux_t<FusionOperation>, ElementD_>;
-  using EpilogueTile_MN =
-    decltype(detail::sm90_compute_tile_shape_or_override<ElementD, EpilogueTileType, Schedule, TileShape_MNK>());
-  using DispatchPolicy =
-    decltype(detail::sm90_get_tma_dispatch_policy<TileShape_MNK,EpilogueTile_MN,ElementC,ElementD,Schedule>());
-
-public:
-  using CollectiveOp =
-    typename detail::Sm90TmaBuilderImpl<
-      TileShape_MNK,
-      EpilogueTile_MN,
-      ElementAccumulator,
-      ElementCompute,
-      ElementC,
-      GmemLayoutTagC,
-      AlignmentC,
-      ElementD_,
-      GmemLayoutTagD,
-      AlignmentD,
-      FusionOperation,
-      DispatchPolicy
-    >::CollectiveOp;
-};
-
-// Auto builder
-template <
-  class OpClass,
-  class TileShape_MNK,
-  class ClusterShape_MNK,
-  class EpilogueTileType,
-  class ElementAccumulator,
-  class ElementCompute,
-  class ElementC,
-  class GmemLayoutTagC,
-  int AlignmentC,
-  class ElementD,
-  class GmemLayoutTagD,
-  int AlignmentD,
-  class FusionOperation
->
-struct CollectiveBuilder<
-    arch::Sm90,
-    OpClass,
-    TileShape_MNK,
-    ClusterShape_MNK,
-    EpilogueTileType,
-    ElementAccumulator,
-    ElementCompute,
-    ElementC,
-    GmemLayoutTagC,
-    AlignmentC,
-    ElementD,
-    GmemLayoutTagD,
-    AlignmentD,
-    EpilogueScheduleAuto,
-    FusionOperation,
-    void> {
-private:
-  static_assert(cute::is_same_v<FusionOperation, fusion::LinearCombination<ElementD,ElementCompute,ElementC,ElementCompute>>,
-                "Auto schedule doesn't support fusion. Use one of the TmaWarpSpecialized schedules instead.");
-
-  // Pick No-Smem epilogue as the Auto Epilogue Schedule (Auto schedules do not guarantee best performance) 
-  // since TMA epilogues are not compatible with non-TMA non-WS mainloops
-  using EpilogueSchedule = NoSmemWarpSpecialized;
-  using _CollectiveBuilder = CollectiveBuilder<
-    arch::Sm90,
-    OpClass,
-    TileShape_MNK,
-    ClusterShape_MNK,
-    EpilogueTileType,
-    ElementAccumulator,
-    ElementCompute,
-    ElementC,
-    GmemLayoutTagC,
-    AlignmentC,
-    ElementD,
-    GmemLayoutTagD,
-    AlignmentD,
-    EpilogueSchedule,
-    FusionOperation
-  >;
-
-public:
-  using CollectiveOp = typename _CollectiveBuilder::CollectiveOp;
-};
-
-// DEPRECATED Tma warp-specialized builder for elementwise fusion
-template <
-  class OpClass,
-  class TileShape_MNK,
-  class ClusterShape_MNK,
-  class EpilogueTileType,
-  class ElementAccumulator,
-  class ElementCompute,
-  class ElementC,
-  class GmemLayoutTagC,
-  int AlignmentC,
-  class ElementD,
-  class GmemLayoutTagD,
-  int AlignmentD,
-  class Schedule,
-  class UnusedFusionOp
->
-struct [[deprecated("Use TmaWarpSpecialized with fusion::LinCombEltAct instead")]]
-CollectiveBuilder<
-    arch::Sm90,
-    OpClass,
-    TileShape_MNK,
-    ClusterShape_MNK,
-    EpilogueTileType,
-    ElementAccumulator,
-    ElementCompute,
-    ElementC,
-    GmemLayoutTagC,
-    AlignmentC,
-    ElementD,
-    GmemLayoutTagD,
-    AlignmentD,
-    Schedule,
-    UnusedFusionOp,
-    cute::enable_if_t<cute::is_base_of_v<TmaWarpSpecializedElementwiseBase, Schedule> ||
-                      cute::is_base_of_v<TmaWarpSpecializedCooperativeElementwiseBase, Schedule> >> {
-private:
-  using FusionOp =
-    fusion::LinCombEltAct<Schedule::template ActivationFunctor, ElementD, ElementCompute, ElementC, ElementCompute, Schedule::Round>;
-  using ImplSchedule =
-    cute::conditional_t<cute::is_base_of_v<TmaWarpSpecializedElementwiseBase, Schedule>,
-      TmaWarpSpecialized, TmaWarpSpecializedCooperative>;
-
-public:
-  using CollectiveOp =
-    typename CollectiveBuilder<
-      arch::Sm90,
-      OpClass,
-      TileShape_MNK,
-      ClusterShape_MNK,
-      EpilogueTileType,
-      ElementAccumulator,
-      ElementCompute,
-      ElementC,
-      GmemLayoutTagC,
-      AlignmentC,
-      ElementD,
-      GmemLayoutTagD,
-      AlignmentD,
-      ImplSchedule,
-      FusionOp
-    >::CollectiveOp;
-};
-
-// DEPRECATED Tma warp-specialized builder for bias + elementwise fusion
-template <
-  class OpClass,
-  class TileShape_MNK,
-  class ClusterShape_MNK,
-  class EpilogueTileType,
-  class ElementAccumulator,
-  class ElementCompute,
-  class ElementC_,
-  class GmemLayoutTagC_,
-  int AlignmentC,
-  class ElementD,
-  class GmemLayoutTagD,
-  int AlignmentD,
-  class Schedule,
-  class UnusedFusionOp
->
-struct [[deprecated("Use TmaWarpSpecialized with fusion::LinCombPerRowBiasEltAct or fusion::LinCombPerRowBiasEltActAux instead")]]
-CollectiveBuilder<
-    arch::Sm90,
-    OpClass,
-    TileShape_MNK,
-    ClusterShape_MNK,
-    EpilogueTileType,
-    ElementAccumulator,
-    ElementCompute,
-    ElementC_,
-    GmemLayoutTagC_,
-    AlignmentC,
-    ElementD,
-    GmemLayoutTagD,
-    AlignmentD,
-    Schedule,
-    UnusedFusionOp,
-    cute::enable_if_t<cute::is_base_of_v<TmaWarpSpecializedBiasElementwiseBase, Schedule> ||
-                      cute::is_base_of_v<TmaWarpSpecializedCooperativeBiasElementwiseBase, Schedule> >> {
-private:
-  using EpilogueTile_MN = decltype(detail::sm90_compute_tile_shape_or_override<
-    ElementD, EpilogueTileType, Schedule, TileShape_MNK>());
-  // MSVC doesn't seem to be able to deduce DispatchPolicy correctly if it's
-  // defined as decltype of a detail::sm90_get_tma_dispatch_policy call.
-  // Instead, we paste in the contents of that function.  A natural refactoring
-  // would be to create a type alias in the detail namespace.
-  using DispatchPolicy = Sm90TmaWarpSpecialized<
-    /* StagesC = */ size(shape_div(take<0, 2>(TileShape_MNK{}), EpilogueTile_MN{})),
-    /* StagesD = */ 2,
-    /* FragmentSize = */ size(EpilogueTile_MN{}) / (detail::sm90_is_cooperative_v<Schedule> ? 256 : 128),
-    /* ReuseSmemC = */ sizeof_bits_v<ElementC_> == sizeof_bits_v<ElementD>,
-    false
-  >;
-
-  using GmemStrideTypeAux = gemm::TagToStrideC_t<GmemLayoutTagD>;
-  using SmemLayoutAtomAux = decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom<
-    GmemStrideTypeAux, typename Schedule::ElementT, EpilogueTile_MN>());
-  using SmemCopyOpAux = decltype(detail::sm90_get_smem_store_op_for_accumulator<
-    GmemStrideTypeAux, typename Schedule::ElementT>());
-  using FusionOperationAux = fusion::LinCombPerRowBiasEltActAux<
-    GmemLayoutTagD, Schedule::template ActivationFunctor, ElementD, ElementCompute,
-    typename Schedule::ElementT, typename Schedule::ElementBias, ElementC_, ElementCompute
-  >;
-  using FusionCallbacksAux = fusion::FusionCallbacks<
-    DispatchPolicy, FusionOperationAux, TileShape_MNK, EpilogueTile_MN, SmemLayoutAtomAux, SmemCopyOpAux
-  >;
-
-  using FusionOperationNoAux = fusion::LinCombPerRowBiasEltAct<
-    Schedule::template ActivationFunctor, ElementD, ElementCompute,
-    typename Schedule::ElementBias, ElementC_, ElementCompute
-  >;
-  using FusionCallbacksNoAux = fusion::FusionCallbacks<
-    DispatchPolicy, FusionOperationNoAux, TileShape_MNK, EpilogueTile_MN
-  >;
-
-  using ElementC = cute::conditional_t<cute::is_void_v<ElementC_>,ElementD,ElementC_>; // prevents void ref breakages
-  using GmemLayoutTagC = cute::conditional_t<cute::is_void_v<ElementC_>,GmemLayoutTagD,GmemLayoutTagC_>;
-
-  using GmemStrideTypeC = gemm::TagToStrideC_t<GmemLayoutTagC>;
-  using GmemStrideTypeD = gemm::TagToStrideC_t<GmemLayoutTagD>;
-
-  // Get the smallest tiled copy we can use to retile the accumulators
-  using CopyAtomC = Copy_Atom<SM90_U32x4_STSM_N, cutlass::half_t>;
-  // Get register to register tiled copy that happen before shared memory store.
-  // Apply void as no register transform op needed.
-  using CopyOpR2R = void;
-
-public:
-  using CollectiveOp = cutlass::epilogue::collective::Sm90EpilogueTmaWarpSpecializedBiasElementwise<
-      DispatchPolicy::StagesC,
-      DispatchPolicy::StagesD,
-      DispatchPolicy::FragmentSize,
-      TileShape_MNK,
-      EpilogueTile_MN,
-      ElementC_, // Need to pass void through to expose via GemmUniversal
-      GmemStrideTypeC,
-      ElementD,
-      GmemStrideTypeD,
-      cute::conditional_t<Schedule::StoreT, FusionCallbacksAux, FusionCallbacksNoAux>,
-      SM90_TMA_LOAD,
-      decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom<GmemStrideTypeC, ElementC, EpilogueTile_MN>()),
-      decltype(detail::sm90_get_smem_load_op_for_source<GmemStrideTypeC, ElementC>()),
-      SM90_TMA_STORE,
-      decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom<GmemStrideTypeD, ElementD, EpilogueTile_MN>()),
-      decltype(detail::sm90_get_smem_store_op_for_accumulator<GmemStrideTypeD, ElementD>()),
-      CopyAtomC,
-      CopyOpR2R
-    >;
-};
-
-// CollectiveBuilder that transposed epilogue below is used for sm90 gmma RS TT kernels
-// since swapping NNN kernels input matrix and transposing its output at the same time then
-// we can get TTN kernel.
-template <
-  class OpClass,
-  class TileShape_MNK,
-  class ClusterShape_MNK,
-  class EpilogueTileType,
-  class ElementAccumulator,
-  class ElementCompute,
-  class ElementC_,
-  class GmemLayoutTagC_,
-  int AlignmentC,
-  class ElementD,
-  class GmemLayoutTagD,
-  int AlignmentD,
-  FloatRoundStyle RoundStyle
->
-struct CollectiveBuilder<
-    arch::Sm90,
-    OpClass,
-    TileShape_MNK,
-    ClusterShape_MNK,
-    EpilogueTileType,
-    ElementAccumulator,
-    ElementCompute,
-    ElementC_,
-    GmemLayoutTagC_,
-    AlignmentC,
-    ElementD,
-    GmemLayoutTagD,
-    AlignmentD,
-    cutlass::gemm::EpilogueTransposed,
-    fusion::LinearCombination<ElementD,ElementCompute,ElementC_,ElementCompute,RoundStyle>,
-    void> {
-  // Passing void C disables source load
-  using ElementC = cute::conditional_t<cute::is_void_v<ElementC_>,
-      ElementD, ElementC_>; // prevents cute breakages
-  using GmemLayoutTagC = cute::conditional_t<cute::is_void_v<ElementC_>,
-      GmemLayoutTagD, GmemLayoutTagC_>;
-  static constexpr thread::ScaleType::Kind ScaleType = cute::is_void_v<ElementC_> ?
-      thread::ScaleType::OnlyAlphaScaling : thread::ScaleType::Default;
-
-  static constexpr int FragmentSize = 1;
-  using ThreadOp = thread::LinearCombination<
-    ElementD, FragmentSize, ElementAccumulator, ElementCompute,
-    ScaleType, RoundStyle, ElementC>;
-
-  using CollectiveOp = cutlass::epilogue::collective::detail::Sm90TmaWarpSpecializedAdapter<
-    cutlass::epilogue::collective::DefaultEpilogue<
-      cutlass::detail::TagToStrideC_t<GmemLayoutTagC>,
-      cutlass::detail::TagToStrideC_t<GmemLayoutTagD>,
-      ThreadOp,
-      cutlass::gemm::EpilogueTransposed>
-    >;
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::epilogue::collective
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/builders/sm90_common.inl b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/builders/sm90_common.inl
deleted file mode 100755
index cd2639c5d..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/builders/sm90_common.inl
+++ /dev/null
@@ -1,80 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-///////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::epilogue::collective::detail {
-
-///////////////////////////////////////////////////////////////////////////////
-
-// Selects the largest vectorized smem store atom available
-template <class GmemStrideTypeD, class ElementD>
-constexpr auto
-sm90_get_smem_store_op_for_accumulator() {
-  using namespace cute;
-
-  if constexpr (sizeof(ElementD) == 2 && size<0>(GmemStrideTypeD{}) == 1) {
-    return SM90_U16x8_STSM_T{};
-  }
-  else if constexpr (sizeof(ElementD) == 2 && size<1>(GmemStrideTypeD{}) == 1) {
-    return SM90_U32x4_STSM_N{};
-  }
-  else {
-    // auto-vectorizing store
-    return AutoVectorizingCopyWithAssumedAlignment{};
-  }
-}
-
-// Selects the largest vectorized smem load atom available
-template <class GmemStrideTypeC, class ElementC>
-constexpr auto
-sm90_get_smem_load_op_for_source() {
-  using namespace cute;
-
-  // Reuse the logic from smem store selector
-  using SmemStoreOp = decltype(sm90_get_smem_store_op_for_accumulator<GmemStrideTypeC, ElementC>());
-
-  if constexpr (cute::is_same_v<SmemStoreOp, SM90_U16x8_STSM_T>) {
-    return SM75_U16x8_LDSM_T{};
-  }
-  else if constexpr (cute::is_same_v<SmemStoreOp, SM90_U32x4_STSM_N>) {
-    return SM75_U32x4_LDSM_N{};
-  }
-  else {
-    // auto-vectorizing load
-    return AutoVectorizingCopyWithAssumedAlignment<128>{};
-  }
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::epilogue::collective::detail
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/collective_builder.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/collective_builder.hpp
deleted file mode 100755
index d54cd0a8f..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/collective_builder.hpp
+++ /dev/null
@@ -1,120 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/arch/copy.hpp>         // cute::DefaultCopy
-#include <cute/util/type_traits.hpp>  // cute::is_base_of_v
-
-#include "cutlass/detail/dependent_false.hpp"
-#include "cutlass/epilogue/fusion/callbacks.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::epilogue::collective {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Used to specify epilogue subtile shape or dispatch to automatic computation of subtile shape
-struct EpilogueTileAuto {};
-
-// Used to let the builder pick the epilogue schedule automatically.
-// Can be overridden with kernel schedule tags in cutlass/gemm/dispatch_policy.hpp
-struct EpilogueScheduleAuto {};
-struct EpilogueIm2ColScheduleAuto {};
-
-template <
-  class ArchTag,
-  class OpClass,
-  class TileShape_MNK,
-  class ClusterShape_MNK,
-  class EpilogueTileType,
-  class ElementAccumulator,
-  class ElementCompute,
-  class ElementC,
-  class GmemLayoutTagC,
-  int AlignmentC,
-  class ElementD,
-  class GmemLayoutTagD,
-  int AlignmentD,
-  class EpilogueScheduleType,
-  class FusionOpOrCallbacks = cutlass::epilogue::fusion::LinearCombination<ElementD,ElementCompute,ElementC,ElementCompute>,
-  class Enable = void
->
-struct CollectiveBuilder {
-  static_assert(cutlass::detail::dependent_false<ArchTag>,
-      "Could not build a collective epilogue for given parameters.");
-};
-
-// helper sub-builder for epilogue fusion callbacks (for internal use by CollectiveBuilder only)
-namespace detail {
-
-// callbacks builder with operation tag
-template<
-  class DispatchPolicy,
-  class FusionOp,
-  class TileShape_MNK,
-  class EpilogueTile_MN,
-  class ElementAccumulator,
-  class = void
->
-struct CallbacksBuilder {
-  using Callbacks = fusion::FusionCallbacks<DispatchPolicy, FusionOp, TileShape_MNK, EpilogueTile_MN>;
-};
-
-// callbacks builder with callbacks passthrough
-template <
-  class DispatchPolicy,
-  class FusionCallbacks,
-  class TileShape_MNK,
-  class EpilogueTile_MN,
-  class ElementAccumulator
->
-struct CallbacksBuilder<
-  DispatchPolicy,
-  FusionCallbacks,
-  TileShape_MNK,
-  EpilogueTile_MN,
-  ElementAccumulator,
-  cute::enable_if_t<not cute::is_base_of_v<fusion::FusionOperation, FusionCallbacks>>
-> {
-  using Callbacks = FusionCallbacks;
-};
-
-} // namespace detail
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::epilogue::collective
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-#include "builders/sm90_builder.inl"
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/collective_epilogue.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/collective_epilogue.hpp
deleted file mode 100755
index 8fb1a9588..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/collective_epilogue.hpp
+++ /dev/null
@@ -1,71 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cutlass/detail/dependent_false.hpp>
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::epilogue::collective {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  class DispatchPolicy,
-  class... Args
->
-class CollectiveEpilogue {
-  static_assert(cutlass::detail::dependent_false<DispatchPolicy>, "Could not find an epilogue specialization.");
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::epilogue::collective
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-#include "detail.hpp"
-
-//
-// Gemm
-//
-#include "default_epilogue.hpp"
-#include "default_epilogue_array.hpp"
-#include "epilogue_tensor_broadcast.hpp"
-#include "sm70_epilogue_vectorized.hpp"
-#include "sm70_epilogue_vectorized_array.hpp"
-#include "sm90_epilogue_tma_warpspecialized.hpp"
-#include "sm90_epilogue_tma_warpspecialized_bias_elementwise.hpp"
-#include "sm90_epilogue_array_tma_warpspecialized.hpp"
-//
-// Conv
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/default_epilogue.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/default_epilogue.hpp
deleted file mode 100755
index cd4a6ccdd..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/default_epilogue.hpp
+++ /dev/null
@@ -1,242 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Functor performing elementwise operations used by epilogues.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/dispatch_policy.hpp"
-#include "cutlass/epilogue/collective/detail.hpp"
-
-#include "cute/tensor.hpp"
-#include "cute/numeric/numeric_types.hpp"
-#include "cutlass/cuda_host_adapter.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace collective {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Applies an element wise operation to all elements within the fragment
-/// and writes them out to destination storage.
-template <
-  class StrideC_,
-  class StrideD_,
-  class ThreadEpilogueOp_,
-  class EpilogueSchedule_
->
-class DefaultEpilogue {
-public:
-  //
-  // Type Aliases
-  //
-  using EpilogueSchedule = EpilogueSchedule_;
-  using DispatchPolicy = EpilogueSchedule_;
-
-  // derived types of output thread level operator
-  using ThreadEpilogueOp = ThreadEpilogueOp_;
-  using ElementOutput = typename ThreadEpilogueOp::ElementOutput;
-  using ElementAccumulator = typename ThreadEpilogueOp::ElementAccumulator;
-  using ElementCompute = typename ThreadEpilogueOp::ElementCompute;
-  using ElementScalar = ElementCompute;
-  using ElementC = typename ThreadEpilogueOp::ElementC;
-  using StrideC = StrideC_;
-  using ElementD = typename ThreadEpilogueOp::ElementD;
-  using StrideD = StrideD_;
-
-  using GmemTiledCopyC = void;
-  using GmemTiledCopyD = void;
-
-  static const int kOutputAlignment = ThreadEpilogueOp::kCount;
-  using AlignmentType = typename cute::uint_bit<sizeof_bits<ElementOutput>::value * kOutputAlignment>::type;
-
-  static_assert(cute::rank(StrideC{}) == 3, "StrideCD must be rank-3: [M, N, L]");
-  static_assert(cute::rank(StrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]");
-
-  struct SharedStorage { };
-
-  using TensorStorage = SharedStorage;
-
-  // Host side epilogue arguments
-  struct Arguments {
-    typename ThreadEpilogueOp::Params thread{};
-    ElementC const* ptr_C = nullptr;
-    StrideC dC{};
-    ElementD* ptr_D = nullptr;
-    StrideD dD{};
-  };
-
-  // Device side epilogue params
-  using Params = Arguments;
-
-  //
-  // Methods
-  //
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(
-      [[maybe_unused]] ProblemShape const& _,
-      Arguments const& args,
-      [[maybe_unused]] void* workspace) {
-    return args;
-  }
-
-  template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
-    return 0;
-  }
-
-  template <class ProblemShape>
-  static cutlass::Status
-  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-    return cutlass::Status::kSuccess;
-  }
-
-  template<class ProblemShape>
-  static bool
-  can_implement(
-      [[maybe_unused]] ProblemShape const& problem_shape,
-      [[maybe_unused]] Arguments const& args) {
-    return true;
-  }
-
-  // Note: SharedStorage is unused for DefaultEpilogue
-  CUTLASS_HOST_DEVICE
-  DefaultEpilogue(Params const& params_, SharedStorage const& shared_storage = SharedStorage())
-      : params(params_), epilogue_op(params_.thread) { }
-
-  CUTLASS_DEVICE
-  bool
-  is_source_needed() {
-    return epilogue_op.is_source_needed();
-  }
-
-  template<
-    class ProblemShapeMNKL,
-    class BlockShapeMNK,
-    class BlockCoordMNKL,
-    class FrgEngine, class FrgLayout,
-    class TiledMma,
-    class ResidueMNK
-  >
-  CUTLASS_HOST_DEVICE void
-  operator()(
-      ProblemShapeMNKL problem_shape_mnkl,
-      BlockShapeMNK blk_shape_MNK,
-      BlockCoordMNKL blk_coord_mnkl,
-      cute::Tensor<FrgEngine, FrgLayout> const& accumulators,
-      TiledMma tiled_mma,
-      ResidueMNK residue_mnk,
-      int thread_idx,
-      [[maybe_unused]] char* smem_buf)
-  {
-    using namespace cute;
-    using X = Underscore;
-
-    static_assert(cute::rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4");
-    static_assert(is_static<BlockShapeMNK>::value, "ThreadBlock tile shape must be static");
-    static_assert(cute::rank(BlockShapeMNK{}) == 3, "BlockShapeMNK must be rank 3");
-    static_assert(cute::rank(BlockCoordMNKL{}) == 4, "BlockCoordMNKL must be rank 3");
-
-    // Separate out problem shape for convenience
-    auto M = get<0>(problem_shape_mnkl);
-    auto N = get<1>(problem_shape_mnkl);
-    auto L = get<3>(problem_shape_mnkl);
-
-    auto stride_c = detail::get_epilogue_stride<EpilogueSchedule>(params.dC);
-    auto stride_d = detail::get_epilogue_stride<EpilogueSchedule>(params.dD);
-
-    // Represent the full output tensor
-    Tensor mC_mnl = make_tensor(make_gmem_ptr(params.ptr_C), make_shape(M,N,L), stride_c);                 // (m,n,l)
-    Tensor mD_mnl = make_tensor(make_gmem_ptr(params.ptr_D), make_shape(M,N,L), stride_d);                 // (m,n,l)
-    Tensor gC_mnl = local_tile(mC_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{});    // (BLK_M,BLK_N,m,n,l)
-    Tensor gD_mnl = local_tile(mD_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{});    // (BLK_M,BLK_N,m,n,l)
-
-    // Slice to get the tile this CTA is responsible for
-    auto [m_coord, n_coord, k_coord, l_coord] = blk_coord_mnkl;
-    Tensor gC = gC_mnl(_,_,m_coord,n_coord,l_coord);                                                 // (BLK_M,BLK_N)
-    Tensor gD = gD_mnl(_,_,m_coord,n_coord,l_coord);                                                 // (BLK_M,BLK_N)
-
-    // Partition source and destination tiles to match the accumulator partitioning
-    auto thr_mma = tiled_mma.get_thread_slice(thread_idx);
-    Tensor tCgD = thr_mma.partition_C(gD);                                       // (VEC,THR_M,THR_N)
-    Tensor tCgC = thr_mma.partition_C(gC);                                       // (VEC,THR_M,THR_N)
-
-    static_assert(is_static<FrgLayout>::value, "Accumulator layout must be static");
-    CUTE_STATIC_ASSERT_V(size(tCgC) == size(tCgD),
-        "Source and destination must have the same number of elements.");
-    CUTE_STATIC_ASSERT_V(size(tCgD) == size(accumulators),
-        "Accumulator count must have the same destination element count.");
-
-    // Make an identity coordinate tensor for predicating our output MN tile
-    auto cD = make_identity_tensor(make_shape(unwrap(shape<0>(gD)), unwrap(shape<1>(gD))));
-    Tensor tCcD = thr_mma.partition_C(cD);
-
-    // source is needed
-    if (epilogue_op.is_source_needed()) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < size(accumulators); ++i) {
-        if (elem_less(tCcD(i), make_coord(get<0>(residue_mnk), get<1>(residue_mnk)))) {
-          tCgD(i) = epilogue_op(accumulators(i), tCgC(i));
-        }
-      }
-    }
-    // source is not needed, avoid load
-    else {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < size(accumulators); ++i) {
-        if (elem_less(tCcD(i), make_coord(get<0>(residue_mnk), get<1>(residue_mnk)))) {
-          tCgD(i) = epilogue_op(accumulators(i));
-        }
-      }
-    }
-  }
-
-private:
-  Params params;
-  ThreadEpilogueOp epilogue_op;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace collective
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/default_epilogue_array.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/default_epilogue_array.hpp
deleted file mode 100755
index 0f6f32931..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/default_epilogue_array.hpp
+++ /dev/null
@@ -1,273 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Functor performing elementwise operations used by epilogues.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/dispatch_policy.hpp"
-#include "cutlass/epilogue/collective/detail.hpp"
-
-#include "cute/tensor.hpp"
-#include "cute/numeric/numeric_types.hpp"
-#include "cutlass/trace.h"
-
-#include "cutlass/cuda_host_adapter.hpp"
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace collective {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Applies an element wise operation to all elements within the fragment
-// and writes them out to destination storage.
-template <
-  class StrideC_,
-  class StrideD_,
-  class ThreadEpilogueOp_,
-  class EpilogueSchedule_
->
-class DefaultEpilogueArray {
-public:
-  //
-  // Type Aliases
-  //
-  using EpilogueSchedule = EpilogueSchedule_;
-  using DispatchPolicy = EpilogueSchedule_;
-  
-  // derived types of output thread level operator
-  using ThreadEpilogueOp = ThreadEpilogueOp_;
-  using ElementOutput = typename ThreadEpilogueOp::ElementOutput;
-  using ElementAccumulator = typename ThreadEpilogueOp::ElementAccumulator;
-  using ElementCompute = typename ThreadEpilogueOp::ElementCompute;
-  using ElementScalar = ElementCompute;
-  using ElementC = typename ThreadEpilogueOp::ElementC;
-  using StrideC = StrideC_;
-  using InternalStrideC = cute::remove_pointer_t<StrideC>;
-  using ElementD = typename ThreadEpilogueOp::ElementD;
-  using StrideD = StrideD_;
-  using InternalStrideD = cute::remove_pointer_t<StrideD>;
-
-  using GmemTiledCopyC = void;
-  using GmemTiledCopyD = void;
-
-  static const int kOutputAlignment = ThreadEpilogueOp::kCount;
-  using AlignmentType = typename cute::uint_bit<sizeof_bits<ElementOutput>::value * kOutputAlignment>::type;
-
-  static_assert(cute::is_same_v<EpilogueSchedule, PtrArrayNoSmemWarpSpecialized> || cute::is_same_v<EpilogueSchedule, PtrArrayDefault>, "Incompatible epilogue schedule.");
-  static_assert(rank(InternalStrideC{}) == 3, "StrideCD must be rank-3: [M, N, L]");
-  static_assert(rank(InternalStrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]");
-
-  struct SharedStorage { };
-
-  using TensorMapStorage = SharedStorage;
-
-  // Host side epilogue arguments
-  struct Arguments {
-    typename ThreadEpilogueOp::Params thread{};
-    ElementC const** ptr_C = nullptr;
-    StrideC dC{};
-    ElementD** ptr_D = nullptr;
-    StrideD dD{};
-  };
-
-  // Device side epilogue params
-  using Params = Arguments;
-
-  //
-  // Methods
-  //
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(
-      ProblemShape const&,
-      Arguments const& args,
-      [[maybe_unused]] void* workspace) {
-    return args;
-  }
-
-  template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args, int sm_count) {
-    return 0;
-  }
-
-  template <class ProblemShape>
-  static cutlass::Status
-  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-    return cutlass::Status::kSuccess;
-  }
-
-  template<class ProblemShape>
-  static bool
-  can_implement(
-      [[maybe_unused]] ProblemShape const& problem_shape,
-      [[maybe_unused]] Arguments const& args) {
-    return true;
-  }
-
-  CUTLASS_HOST_DEVICE
-  DefaultEpilogueArray(Params const& params_)
-      : params(params_) { }
-
-  CUTLASS_DEVICE
-  bool
-  is_source_needed() {
-    // For Ptr-Array or Grouped Gemm we cannot determine if source is needed based on first beta.
-    return true;
-  }
-
-  template<
-    class ProblemShapeMNKL,
-    class BlockShapeMNK,
-    class BlockCoordMNKL,
-    class FrgEngine, class FrgLayout,
-    class TiledMma,
-    class ResidueMNK
-  >
-  CUTLASS_HOST_DEVICE void
-  operator()(
-      ProblemShapeMNKL problem_shape_mnkl,
-      BlockShapeMNK blk_shape_MNK,
-      BlockCoordMNKL blk_coord_mnkl,
-      cute::Tensor<FrgEngine, FrgLayout> const& accumulators,
-      TiledMma tiled_mma,
-      ResidueMNK residue_mnk,
-      int thread_idx,
-      [[maybe_unused]] char* smem_buf)
-  {
-    using namespace cute;
-    using X = Underscore;
-
-    static_assert(rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4");
-    static_assert(is_static<BlockShapeMNK>::value, "ThreadBlock tile shape must be static");
-    static_assert(rank(BlockShapeMNK{}) == 3, "BlockShapeMNK must be rank 3");
-    static_assert(rank(BlockCoordMNKL{}) == 4, "BlockCoordMNKL must be rank 3");
-
-    // Separate out problem shape for convenience
-    auto M = get<0>(problem_shape_mnkl);
-    auto N = get<1>(problem_shape_mnkl);
-    auto L = get<3>(problem_shape_mnkl);
-    // Batches are managed by using appropriate pointers to C and D matrices
-    const int32_t mock_L = 1;
-    const int32_t mock_l_coord = 0;
-    // Slice to get the tile this CTA is responsible for
-    auto [m_coord, n_coord, k_coord, l_coord] = blk_coord_mnkl;
-
-    // If scalar alpha/beta are provided, i.e., same alpha/beta applies to all batches/groups.
-    // If pointers to alpha/beta are provided, i.e., alpha/beta can differ between batches/groups,
-    // we get the correct alpha/beta values for the current batch/group using group index.
-    ThreadEpilogueOp epilogue_op = ThreadEpilogueOp(params.thread, l_coord);
-
-    if (epilogue_op.is_source_needed() && params.dC == nullptr) {
-      // Beta value is non-zero while pointer to C is a nullptr
-      assert(0);
-    }
-
-    InternalStrideC stride_c;
-    InternalStrideD stride_d;
-    if constexpr (!cute::is_same_v<InternalStrideC, StrideC>) {
-      // If grouped gemm
-      if (epilogue_op.is_source_needed()) {
-        stride_c = detail::get_epilogue_stride<EpilogueSchedule>(params.dC[l_coord]);
-      }
-      stride_d = detail::get_epilogue_stride<EpilogueSchedule>(params.dD[l_coord]);
-    }
-    else {
-      stride_c = detail::get_epilogue_stride<EpilogueSchedule>(params.dC);
-      stride_d = detail::get_epilogue_stride<EpilogueSchedule>(params.dD);
-    }
-
-    // Represent the full output tensor
-    ElementC const* ptr_C_l = nullptr;
-    if (epilogue_op.is_source_needed()) {
-      ptr_C_l = params.ptr_C[l_coord];
-    }
-    Tensor mC_mnl = make_tensor(make_gmem_ptr(ptr_C_l), make_shape(M,N,mock_L), stride_c);      // (m,n,l)
-    Tensor mD_mnl = make_tensor(make_gmem_ptr(params.ptr_D[l_coord]), make_shape(M,N,mock_L), stride_d);      // (m,n,l)
-    Tensor gC_mnl = local_tile(mC_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{});    // (BLK_M,BLK_N,m,n,l)
-    Tensor gD_mnl = local_tile(mD_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{});    // (BLK_M,BLK_N,m,n,l)
-
-    Tensor gC = gC_mnl(_,_,m_coord,n_coord, mock_l_coord);                                                 // (BLK_M,BLK_N)
-    Tensor gD = gD_mnl(_,_,m_coord,n_coord, mock_l_coord);                                                 // (BLK_M,BLK_N)
-
-    // Partition source and destination tiles to match the accumulator partitioning
-    auto thr_mma = tiled_mma.get_thread_slice(thread_idx);
-    Tensor tCgD = thr_mma.partition_C(gD);                                       // (VEC,THR_M,THR_N)
-    Tensor tCgC = thr_mma.partition_C(gC);                                       // (VEC,THR_M,THR_N)
-
-    static_assert(is_static<FrgLayout>::value, "Accumulator layout must be static");
-    CUTE_STATIC_ASSERT_V(size(tCgC) == size(tCgD),
-        "Source and destination must have the same number of elements.");
-    CUTE_STATIC_ASSERT_V(size(tCgD) == size(accumulators),
-        "Accumulator count must have the same destination element count.");
-
-    // Make an identity coordinate tensor for predicating our output MN tile
-    auto cD = make_identity_tensor(make_shape(unwrap(shape<0>(gD)), unwrap(shape<1>(gD))));
-    Tensor tCcD = thr_mma.partition_C(cD);
-
-    // source is needed
-    if (epilogue_op.is_source_needed()) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < size(accumulators); ++i) {
-        if (elem_less(tCcD(i), make_coord(get<0>(residue_mnk), get<1>(residue_mnk)))) {
-          tCgD(i) = epilogue_op(accumulators(i), tCgC(i));
-        }
-      }
-    }
-    // source is not needed, avoid load
-    else {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < size(accumulators); ++i) {
-        if (elem_less(tCcD(i), make_coord(get<0>(residue_mnk), get<1>(residue_mnk)))) {
-          tCgD(i) = epilogue_op(accumulators(i));
-        }
-      }
-    }
-  }
-
-private:
-  Params params;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace collective
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/detail.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/detail.hpp
deleted file mode 100755
index 6c0368e09..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/detail.hpp
+++ /dev/null
@@ -1,491 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/pipeline/pipeline.hpp"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/dispatch_policy.hpp"
-#include "cutlass/epilogue/dispatch_policy.hpp"
-
-#include "cute/tensor.hpp"
-#include "cute/numeric/numeric_types.hpp"
-#include "cute/util/type_traits.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace collective {
-
-namespace detail {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <class Stride>
-constexpr bool
-is_m_major() {
-  return cutlass::gemm::detail::is_major<0,Stride>();
-}
-
-template <class Stride>
-constexpr bool
-is_n_major() {
-  return cutlass::gemm::detail::is_major<1,Stride>();
-}
-
-template <class Stride>
-constexpr bool
-is_im2col() {
-  return cute::is_same_v<Stride, cutlass::detail::TagToStrideC_t<cutlass::layout::TensorNWC>>
-      || cute::is_same_v<Stride, cutlass::detail::TagToStrideC_t<cutlass::layout::TensorNHWC>>
-      || cute::is_same_v<Stride, cutlass::detail::TagToStrideC_t<cutlass::layout::TensorNDHWC>>;
-}
-
-template<class Schedule>
-struct sm90_is_ptr_array_tma : cute::false_type {};
-
-template<>
-struct sm90_is_ptr_array_tma<PtrArrayTmaWarpSpecializedCooperative> : cute::true_type {};
-
-template<>
-struct sm90_is_ptr_array_tma<PtrArrayTmaWarpSpecializedPingpong> : cute::true_type {};
-
-template<>
-struct sm90_is_ptr_array_tma<PtrArrayTmaWarpSpecialized> : cute::true_type {};
-
-template<class Schedule>
-static constexpr bool sm90_is_ptr_array_tma_v = sm90_is_ptr_array_tma<Schedule>::value;
-
-template<class Schedule>
-struct sm90_is_ptr_array_tma_cooperative : cute::false_type {};
-
-template<>
-struct sm90_is_ptr_array_tma_cooperative<PtrArrayTmaWarpSpecializedCooperative> : cute::true_type {};
-
-template<class Schedule>
-static constexpr bool sm90_is_ptr_array_tma_cooperative_v = sm90_is_ptr_array_tma_cooperative<Schedule>::value;
-
-template<class Schedule>
-struct sm90_is_ptr_array_tma_pingpong : cute::false_type {};
-
-template<>
-struct sm90_is_ptr_array_tma_pingpong<PtrArrayTmaWarpSpecializedPingpong> : cute::true_type {};
-
-template<class Schedule>
-static constexpr bool sm90_is_ptr_array_tma_pingpong_v = sm90_is_ptr_array_tma_pingpong<Schedule>::value;
-
-template<class DispatchPolicy>
-struct sm90_is_ptr_array_tma_dispatch_policy : cute::false_type {};
-
-template<
-  int StagesC,
-  int StagesD,
-  int FragmentSize,
-  bool ReuseSmemC,
-  bool DelayTmaStore,
-  int NumEpilogueWarpGroups
->
-struct sm90_is_ptr_array_tma_dispatch_policy<
-    Sm90PtrArrayTmaWarpSpecialized<StagesC, 
-                                   StagesD, 
-                                   FragmentSize,
-                                   ReuseSmemC, 
-                                   DelayTmaStore, 
-                                   NumEpilogueWarpGroups>> 
-    : cute::true_type {};
-
-template<class DispatchPolicy>
-static constexpr bool sm90_is_ptr_array_tma_dispatch_policy_v = sm90_is_ptr_array_tma_dispatch_policy<DispatchPolicy>::value;
-
-using cutlass::atomic_maximum;
-
-template <class T>
-static constexpr int elements_per_access_v = cutlass::sizeof_bits<uint32_t>::value / cutlass::sizeof_bits<T>::value;
-
-template <class EpilogueSchedule>
-static constexpr bool sm90_is_cooperative_v =
-  cute::is_base_of_v<cutlass::epilogue::TmaWarpSpecializedCooperative, EpilogueSchedule> ||
-  sm90_is_ptr_array_tma_cooperative_v<EpilogueSchedule>;
-
-template <class EpilogueSchedule>
-static constexpr bool sm90_is_warp_specialized_v =
-  (!sm90_is_ptr_array_tma_cooperative_v<EpilogueSchedule> && sm90_is_ptr_array_tma_v<EpilogueSchedule>) ||
-  cute::is_base_of_v<cutlass::epilogue::TmaWarpSpecialized, EpilogueSchedule>;
-
-template <class GmemLayoutTag>
-static constexpr bool is_im2col_mode =
-  cute::is_same_v<GmemLayoutTag, cutlass::layout::TensorNWC> ||
-  cute::is_same_v<GmemLayoutTag, cutlass::layout::TensorNHWC> ||
-  cute::is_same_v<GmemLayoutTag, cutlass::layout::TensorNDHWC>;
-
-template <class T>
-struct EmptyStorage {
-  CUTLASS_HOST_DEVICE
-  T* data() { return nullptr; }
-};
-
-template<class EpilogueSchedule, class Stride>
-CUTLASS_HOST_DEVICE
-auto get_epilogue_stride(Stride stride){
-  if constexpr (cute::is_base_of_v<cutlass::gemm::EpilogueTransposed, EpilogueSchedule>) {
-    return cute::make_stride(cute::get<1>(stride), cute::get<0>(stride), cute::get<2>(stride));
-  }
-  else {
-    return stride;
-  }
-}
-
-template <typename ThreadEpilogueOp, typename = void>
-struct IsThreadEpilogueOpWithBias { 
-  static constexpr bool value = false; 
-  using type = typename ThreadEpilogueOp::ElementCompute; 
-};
-
-template <typename ThreadEpilogueOp>
-struct IsThreadEpilogueOpWithBias <ThreadEpilogueOp, cute::void_t<typename ThreadEpilogueOp::ElementBias>> { 
-  static constexpr bool value = true; 
-  using type = typename ThreadEpilogueOp::ElementBias; 
-};
-
-template <typename ThreadEpilogueOp, typename = void>
-struct IsThreadEpilogueOpWithPerChannelScaling {
-  static constexpr bool value = false;
-};
-
-template <typename ThreadEpilogueOp>
-struct IsThreadEpilogueOpWithPerChannelScaling <ThreadEpilogueOp, cute::enable_if_t<ThreadEpilogueOp::IsPerChannelScalingSupported>> {
-  static constexpr bool value = true;
-};
-
-template <typename ThreadEpilogueOp, typename = void>
-struct IsThreadEpilogueOpWithActivation {
-  static constexpr bool value = false;
-  using type = void;
-};
-
-template <typename ThreadEpilogueOp>
-struct IsThreadEpilogueOpWithActivation <ThreadEpilogueOp, cute::enable_if_t<ThreadEpilogueOp::IsEltActSupported>> {
-  static constexpr bool value = true;
-  using type = typename ThreadEpilogueOp::ActivationFn;
-};
-
-template <typename ThreadEpilogueOp, typename = void>
-struct IsThreadEpilogueOpWithElementwiseArguments : cute::false_type {};
-
-template <typename ThreadEpilogueOp>
-struct IsThreadEpilogueOpWithElementwiseArguments<
-        ThreadEpilogueOp,
-        cute::void_t<typename ThreadEpilogueOp::ElementwiseOp::Arguments>> : cute::true_type {};
-
-// Wrapper class to use operator-style epilogues in sm90 TMA warp-specialized kernels
-template <class EpilogueOp>
-class Sm90TmaWarpSpecializedAdapter : public EpilogueOp {
-public:
-  using GmemTiledCopyC = void;
-  using GmemTiledCopyD = void;
-
-  using LoadPipeline = cutlass::PipelineTransactionAsync<0>;
-  using LoadPipelineState = cutlass::PipelineState<0>;
-  constexpr static uint32_t TmaTransactionBytes = 0;
-  constexpr static bool RequiresTransactionBytes = false;
-
-  using StorePipeline = cutlass::PipelineTmaStore<0>;
-  using StorePipelineState = cutlass::PipelineState<0>;
-
-  using TensorStorage = typename EpilogueOp::SharedStorage;
-  using TensorMapStorage = typename EpilogueOp::SharedStorage;
-  using PipelineStorage = typename LoadPipeline::SharedStorage;
-
-  template<class CtaTileMNK>
-  CUTLASS_HOST_DEVICE
-  static constexpr int
-  get_load_pipe_increment(CtaTileMNK) {
-    return 1;
-  }
-
-  template<class CtaTileMNK>
-  CUTLASS_HOST_DEVICE
-  static constexpr int
-  get_store_pipe_increment(CtaTileMNK) {
-    return 1;
-  }
-
-  CUTLASS_DEVICE
-  static void prefetch_tma_descriptors([[maybe_unused]] typename EpilogueOp::Params const&) {
-  }
-
-  // ctor inheritance
-  using EpilogueOp::EpilogueOp;
-
-  CUTLASS_HOST_DEVICE
-  Sm90TmaWarpSpecializedAdapter(
-      typename EpilogueOp::Params const& params,
-      [[maybe_unused]] TensorStorage& shared_tensors)
-    : EpilogueOp(params) { }
-
-  CUTLASS_DEVICE
-  bool
-  is_producer_load_needed() const {
-    return false;
-  }
-
-  CUTLASS_DEVICE auto
-  load_init(
-    [[maybe_unused]] typename EpilogueOp::Params const& params,
-    [[maybe_unused]] TensorMapStorage& shared_tensormaps,
-    [[maybe_unused]] int32_t sm_count,
-    [[maybe_unused]] int32_t sm_idx) {
-    return cute::make_tuple(nullptr);
-  }
-
-  template<
-    class ProblemShapeMNKL,
-    class CtaTileMNK,
-    class CtaCoordMNKL,
-    class TiledMma
-  >
-  CUTLASS_DEVICE auto
-  load(
-      [[maybe_unused]] LoadPipeline load_pipeline,
-      LoadPipelineState load_pipe_producer_state,
-      [[maybe_unused]] ProblemShapeMNKL problem_shape_mnkl,
-      [[maybe_unused]] CtaTileMNK cta_tile_mnk,
-      [[maybe_unused]] CtaCoordMNKL cta_coord_mnkl,
-      [[maybe_unused]] TiledMma tiled_mma,
-      [[maybe_unused]] int thread_idx,
-      [[maybe_unused]] TensorStorage& shared_tensors,
-      [[maybe_unused]] int subtile_idx=-1)
-  {
-    return load_pipe_producer_state;
-  }
-
-  template<
-    class ProblemShapeMNKL,
-    class TileShapeMNK,
-    class TileCoordMNKL,
-    class TiledMma,
-    class TensorMapC
-  >
-  CUTLASS_DEVICE auto
-  load(
-      [[maybe_unused]] LoadPipeline load_pipeline,
-      LoadPipelineState load_pipe_producer_state,
-      [[maybe_unused]] ProblemShapeMNKL problem_shape_mnkl,
-      [[maybe_unused]] TileShapeMNK tile_shape_MNK,
-      [[maybe_unused]] TileCoordMNKL tile_coord_mnkl,
-      [[maybe_unused]] TiledMma tiled_mma,
-      [[maybe_unused]] int thread_idx,
-      [[maybe_unused]] TensorStorage& shared_tensors,
-      [[maybe_unused]] TensorMapC const& load_tensormap,
-      [[maybe_unused]] int subtile_idx=-1,
-      [[maybe_unused]] bool wait = false)
-  {
-    return load_pipe_producer_state;
-  }
-
-  CUTLASS_DEVICE auto
-  load_tail(
-      [[maybe_unused]] LoadPipeline load_pipeline,
-      LoadPipelineState load_pipe_producer_state)
-  {
-    return load_pipe_producer_state;
-  }
-
-  CUTLASS_DEVICE auto
-  store_init(
-    [[maybe_unused]] typename EpilogueOp::Params const& params,
-    [[maybe_unused]] TensorMapStorage& shared_tensormaps,
-    [[maybe_unused]] int32_t sm_count,
-    [[maybe_unused]] int32_t sm_idx,
-    [[maybe_unused]] int32_t warp_group_idx) {
-    return cute::make_tuple(nullptr);
-  }
-
-  template<
-    class ProblemShapeMNKL,
-    class CtaTileMNK,
-    class CtaCoordMNKL,
-    class AccEngine, class AccLayout,
-    class TiledMma
-  >
-  CUTLASS_DEVICE auto
-  store(
-      [[maybe_unused]] LoadPipeline load_pipeline,
-      LoadPipelineState load_pipe_consumer_state,
-      [[maybe_unused]] StorePipeline store_pipeline,
-      StorePipelineState store_pipe_producer_state,
-      ProblemShapeMNKL problem_shape_mnkl,
-      CtaTileMNK cta_tile_mnk,
-      CtaCoordMNKL cta_coord_mnkl,
-      cute::Tensor<AccEngine,AccLayout> accumulators,
-      TiledMma tiled_mma,
-      int thread_idx,
-      TensorStorage& shared_tensors,
-      int subtile_index = -1)
-  {
-    constexpr int BLK_M_RANK = cute::rank<0>(cta_tile_mnk);
-    auto m_max_coord = unwrap(cute::transform(make_seq<BLK_M_RANK>{}, [&](auto i) {
-        return get<0,i>(problem_shape_mnkl) - get<0,i>(cta_tile_mnk) * get<0,i>(cta_coord_mnkl);
-      }));
-
-    constexpr int BLK_N_RANK = cute::rank<1>(cta_tile_mnk);
-    auto n_max_coord = unwrap(cute::transform(make_seq<BLK_N_RANK>{}, [&](auto i) {
-        return get<1,i>(problem_shape_mnkl) - get<1,i>(cta_tile_mnk) * get<1,i>(cta_coord_mnkl);
-      }));
-
-    auto residue_mnk = make_tuple(m_max_coord, n_max_coord, Int<0>{});
-
-    (*this)(
-        problem_shape_mnkl,
-        cta_tile_mnk,
-        cta_coord_mnkl,
-        accumulators,
-        tiled_mma,
-        residue_mnk,
-        thread_idx,
-        reinterpret_cast<char*>(&shared_tensors));
-
-    return cute::make_tuple(load_pipe_consumer_state, store_pipe_producer_state);
-  }
-
-  template<
-    class ProblemShapeMNKL,
-    class TileShapeMNK,
-    class TileCoordMNKL,
-    class AccEngine, class AccLayout,
-    class TiledMma,
-    class TensorMapD
-  >
-  CUTLASS_DEVICE auto
-  store(
-      [[maybe_unused]] LoadPipeline load_pipeline,
-      LoadPipelineState load_pipe_consumer_state,
-      [[maybe_unused]] StorePipeline store_pipeline,
-      StorePipelineState store_pipe_producer_state,
-      ProblemShapeMNKL problem_shape_mnkl,
-      TileShapeMNK tile_shape_MNK,
-      TileCoordMNKL tile_coord_mnkl,
-      cute::Tensor<AccEngine,AccLayout> accumulators,
-      TiledMma tiled_mma,
-      int thread_idx,
-      TensorStorage& shared_tensors,
-      [[maybe_unused]] TensorMapD const& store_tensormap,
-      int subtile_index = -1)
-  {
-    constexpr int BLK_M_RANK = cute::rank<0>(tile_shape_MNK);
-    auto m_max_coord = unwrap(cute::transform(make_seq<BLK_M_RANK>{}, [&](auto i) {
-        return get<0,i>(problem_shape_mnkl) - get<0,i>(tile_shape_MNK) * get<0,i>(tile_coord_mnkl);
-      }));
-
-    constexpr int BLK_N_RANK = cute::rank<1>(tile_shape_MNK);
-    auto n_max_coord = unwrap(cute::transform(make_seq<BLK_N_RANK>{}, [&](auto i) {
-        return get<1,i>(problem_shape_mnkl) - get<1,i>(tile_shape_MNK) * get<1,i>(tile_coord_mnkl);
-      }));
-
-    auto residue_mnk = make_tuple(m_max_coord, n_max_coord, Int<0>{});
-
-    (*this)(
-        problem_shape_mnkl,
-        tile_shape_MNK,
-        tile_coord_mnkl,
-        accumulators,
-        tiled_mma,
-        residue_mnk,
-        thread_idx,
-        reinterpret_cast<char*>(&shared_tensors));
-
-    return cute::make_tuple(load_pipe_consumer_state, store_pipe_producer_state);
-  }
-
-  CUTLASS_DEVICE auto
-  store_tail(
-      [[maybe_unused]] LoadPipeline load_pipeline,
-      LoadPipelineState load_pipe_consumer_state,
-      [[maybe_unused]] StorePipeline store_pipeline,
-      StorePipelineState store_pipe_producer_state) {
-    return cute::make_tuple(load_pipe_consumer_state, store_pipe_producer_state);
-  }
-
-  // Dummy methods to perform different parts of TMA/Tensormap modifications
-
-  template <bool IsLoad,
-            class ProblemShapeMNKL>
-  CUTLASS_DEVICE
-  void
-  tensormaps_perform_update(
-      [[maybe_unused]] TensorMapStorage& shared_tensormaps,
-      [[maybe_unused]] typename EpilogueOp::Params const& params,
-      [[maybe_unused]] cute::TmaDescriptor const* tensormap,
-      [[maybe_unused]] ProblemShapeMNKL problem_shape,
-      [[maybe_unused]] int32_t next_batch,
-      [[maybe_unused]] int32_t warp_group_idx) { }
-
-  template <bool IsLoad>
-  CUTLASS_DEVICE
-  void
-  tensormaps_cp_fence_release(
-      [[maybe_unused]] TensorMapStorage& shared_tensormaps,
-      [[maybe_unused]] cute::TmaDescriptor const* tensormap,
-      [[maybe_unused]] int32_t warp_group_idx) { }
-
-  template <bool IsLoad>
-  CUTLASS_DEVICE
-  void
-  tensormaps_fence_acquire([[maybe_unused]] cute::TmaDescriptor const* tensormap) { }
-};
-
-// SFINAE helpers for detecting beta/beta_ptr in EVT arguments.
-template <class Arguments, class = void>
-struct has_beta {
-  static constexpr bool value = false;
-};
-
-template <class Arguments>
-struct has_beta<Arguments, cute::void_t<decltype(Arguments{}.thread.beta)>> {
-  static constexpr bool value = true;
-};
-
-template <class Arguments, class = void>
-struct has_beta_ptr {
-  static constexpr bool value = false;
-};
-
-template <class Arguments>
-struct has_beta_ptr<Arguments, cute::void_t<decltype(Arguments{}.thread.beta_ptr)>> {
-  static constexpr bool value = true;
-};
-
-} // namespace detail
-} // namespace collective
-} // namespace epilogue
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/epilogue_tensor_broadcast.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/epilogue_tensor_broadcast.hpp
deleted file mode 100755
index 48833ecf1..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/epilogue_tensor_broadcast.hpp
+++ /dev/null
@@ -1,271 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-  \brief Functor for performing tensor-tensor broadacasts atop existing epilogues.
-
-  Concretely, the opeartion performed is the following:
-    UnaryOp(
-        BinaryOp1(
-            BinaryOp0(
-                Activation((alpha * A @ B) + bias),
-                beta * C0
-            ),
-            beta * C1
-        )
-    )
-
-    where:
-        - C0 and C1 have the same extents as the output
-        - BinaryOp0 and BinaryOp1 perform elementwise binary operations
-        - UnaryOp is an elementwise operation
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/epilogue/collective/detail.hpp"
-
-#include "cute/tensor.hpp"
-#include "cutlass/cuda_host_adapter.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace collective {
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Collective epilogue that applies elementwise tensor-tensor operations atop other epilogues
-///
-template <
-  class StrideC_,
-  class StrideD_,
-  class ThreadEpilogueOp_,
-  class EpilogueSchedule_,
-  bool PerColumnBias_ = false
->
-class EpilogueTensorBroadcast {
-public:
-  //
-  // Type Aliases
-  //
-  using EpilogueSchedule = EpilogueSchedule_;
-
-  // derived types of output thread level operator
-  using ThreadEpilogueOp = ThreadEpilogueOp_;
-  using ElementOutput = typename ThreadEpilogueOp::ElementOutput;
-  using ElementAccumulator = typename ThreadEpilogueOp::ElementAccumulator;
-  using ElementCompute = typename ThreadEpilogueOp::ElementCompute;
-  using ElementScalar = ElementCompute;
-  using ElementBias = typename ThreadEpilogueOp::ElementBias;
-  using ElementC = typename ThreadEpilogueOp::ElementC;
-  using StrideC = StrideC_;
-  using ElementD = typename ThreadEpilogueOp::ElementD;
-  using StrideD = StrideD_;
-  using ActivationFunctor = typename ThreadEpilogueOp::ActivationFunctor;
-
-  static_assert(cute::rank(StrideC{}) == 3, "StrideCD must be rank-3: [M, N, L]");
-  static_assert(cute::rank(StrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]");
-
-  static constexpr int kOutputAlignment = ThreadEpilogueOp::kCount;
-  using AlignmentType = typename cute::uint_bit<sizeof_bits<ElementOutput>::value * kOutputAlignment>::type;
-
-  static constexpr bool IsBinaryOp0Enabled = ThreadEpilogueOp::IsBinaryOp0Enabled;
-  static constexpr bool IsBinaryOp1Enabled = ThreadEpilogueOp::IsBinaryOp1Enabled;
-  static constexpr bool IsUnaryOpEnabled = ThreadEpilogueOp::IsUnaryOpEnabled;
-
-  static constexpr bool PerColumnBias = PerColumnBias_;
-  using BiasStride = typename cute::conditional_t<PerColumnBias, Stride<_0, _1, _0>, Stride<_1, _0, _0>>;
-
-  struct SharedStorage { };
-
-  // Host side epilogue arguments
-  struct Arguments {
-    typename ThreadEpilogueOp::Params thread{};
-    StrideC dC{};
-    ElementD* ptr_D = nullptr;
-    StrideD dD{};
-    ElementBias* ptr_Bias = nullptr;
-    ElementC* ptr_C0 = nullptr;
-    ElementC* ptr_C1 = nullptr;
-  };
-
-  // Device side epilogue params
-  using Params = Arguments;
-
-  //
-  // Methods
-  //
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(
-      [[maybe_unused]] ProblemShape const& _,
-      Arguments const& args,
-      [[maybe_unused]] void* workspace) {
-    return args;
-  }
-
-  template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
-    return 0;
-  }
-
-  template <class ProblemShape>
-  static cutlass::Status
-  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-    return cutlass::Status::kSuccess;
-  }
-
-  template <class ProblemShape>
-  static bool
-  can_implement(
-      [[maybe_unused]] ProblemShape const& problem_shape,
-      [[maybe_unused]] Arguments const& args) {
-    return true;
-  }
-
-  CUTLASS_HOST_DEVICE
-  EpilogueTensorBroadcast(Params const& params_)
-      : params(params_), epilogue_op(params_.thread) { }
-
-  CUTLASS_DEVICE
-  bool
-  is_source_needed() {
-    return epilogue_op.is_source0_needed() || epilogue_op.is_source1_needed();
-  }
-
-  template<
-    class ProblemShapeMNKL,
-    class BlockShapeMNK,
-    class BlockCoordMNKL,
-    class FrgEngine, class FrgLayout,
-    class TiledMma,
-    class ResidueMNK
-  >
-  CUTLASS_HOST_DEVICE void
-  operator()(
-      ProblemShapeMNKL problem_shape_mnkl,
-      BlockShapeMNK blk_shape_MNK,
-      BlockCoordMNKL blk_coord_mnkl,
-      cute::Tensor<FrgEngine, FrgLayout> const& accumulators,
-      TiledMma tiled_mma,
-      ResidueMNK residue_mnk,
-      int thread_idx,
-      [[maybe_unused]] char* smem_buf)
-  {
-    using namespace cute;
-    using X = Underscore;
-
-    static_assert(cute::rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4");
-    static_assert(is_static<BlockShapeMNK>::value, "ThreadBlock tile shape must be static");
-    static_assert(cute::rank(BlockShapeMNK{}) == 3, "BlockShapeMNK must be rank 3");
-    static_assert(cute::rank(BlockCoordMNKL{}) == 4, "BlockCoordMNKL must be rank 4");
-
-    // Separate out problem shape for convenience
-    auto M = get<0>(problem_shape_mnkl);
-    auto N = get<1>(problem_shape_mnkl);
-    auto L = get<3>(problem_shape_mnkl);
-
-    auto stride_c    = detail::get_epilogue_stride<EpilogueSchedule>(params.dC);
-    auto stride_d    = detail::get_epilogue_stride<EpilogueSchedule>(params.dD);
-    auto stride_bias = detail::get_epilogue_stride<EpilogueSchedule>(BiasStride{});
-
-    // Represent the full output tensor
-    Tensor mC0_mnl = make_tensor(make_gmem_ptr(params.ptr_C0), make_shape(M,N,L), stride_c);                   // (m,n,l)
-    Tensor mC1_mnl = make_tensor(make_gmem_ptr(params.ptr_C1), make_shape(M,N,L), stride_c);                   // (m,n,l)
-    Tensor mD_mnl = make_tensor(make_gmem_ptr(params.ptr_D), make_shape(M,N,L), stride_d);                     // (m,n,l)
-    Tensor mBias_mnl = make_tensor(make_gmem_ptr(params.ptr_Bias), make_shape(M,N,L), stride_bias);            // (m,n,l)
-
-    Tensor gC0_mnl = local_tile(mC0_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{});      // (BLK_M,BLK_N,m,n,l)
-    Tensor gC1_mnl = local_tile(mC1_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{});      // (BLK_M,BLK_N,m,n,l)
-
-    Tensor gD_mnl = local_tile(mD_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{});        // (BLK_M,BLK_N,m,n,l)
-    Tensor gBias_mnl = local_tile(mBias_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{});  // (BLK_M,BLK_N,m,n,l)
-
-    // Slice to get the tile this thread block is responsible for
-    auto [m_coord, n_coord, k_coord, l_coord] = blk_coord_mnkl;
-    Tensor gC0 = gC0_mnl(_,_,m_coord,n_coord,l_coord);                                                   // (BLK_M,BLK_N)
-    Tensor gC1 = gC1_mnl(_,_,m_coord,n_coord,l_coord);                                                   // (BLK_M,BLK_N)
-    Tensor gD = gD_mnl(_,_,m_coord,n_coord,l_coord);                                                     // (BLK_M,BLK_N)
-    Tensor gBias = gBias_mnl(_,_,m_coord,n_coord,l_coord);                                               // (BLK_M,BLK_N)
-
-    // Partition source and destination tiles to match the accumulator partitioning
-    auto thr_mma = tiled_mma.get_thread_slice(thread_idx);
-    Tensor tCgD = thr_mma.partition_C(gD);                                                           // (VEC,THR_M,THR_N)
-    Tensor tCgC0 = thr_mma.partition_C(gC0);                                                         // (VEC,THR_M,THR_N)
-    Tensor tCgC1 = thr_mma.partition_C(gC1);                                                         // (VEC,THR_M,THR_N)
-    Tensor tCgBias = thr_mma.partition_C(gBias);                                                     // (VEC,THR_M,THR_N)
-
-    static_assert(is_static<FrgLayout>::value,
-        "Accumulator layout must be static");
-    CUTE_STATIC_ASSERT_V(size(tCgC0) == size(tCgD),
-        "Source and destination must have the same number of elements.");
-    CUTE_STATIC_ASSERT_V(size(tCgC1) == size(tCgD),
-        "Source and destination must have the same number of elements.");
-    CUTE_STATIC_ASSERT_V(size(tCgD) == size(accumulators),
-        "Accumulator count must have the same destination element count.");
-    CUTE_STATIC_ASSERT_V(size(tCgBias) == size(accumulators),
-        "Accumulator count must have the same destination element count.");
-
-    auto cD = make_identity_tensor(make_shape(unwrap(shape<0>(gD)), unwrap(shape<1>(gD))));
-    Tensor tCcD = thr_mma.partition_C(cD);
-
-    bool bias_needed = params.ptr_Bias != nullptr;
-    bool c0_needed = (params.ptr_C0 != nullptr) && epilogue_op.is_source0_needed();
-    bool c1_needed = (params.ptr_C1 != nullptr) && epilogue_op.is_source1_needed();
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < size(accumulators); ++i) {
-      if (elem_less(tCcD(i), make_coord(get<0>(residue_mnk), get<1>(residue_mnk)))) {
-        ElementBias bias = bias_needed ? tCgBias(i) : ElementBias(0);
-        ElementC c0 = c0_needed ? tCgC0(i) : ElementC(0);
-        ElementC c1 = c1_needed ? tCgC1(i) : ElementC(0);
-
-        tCgD(i) = epilogue_op(accumulators(i), c0, c1, bias);
-      }
-    }
-  }
-
-private:
-  Params params;
-  ThreadEpilogueOp epilogue_op;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace collective
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp
deleted file mode 100755
index a8083dab1..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp
+++ /dev/null
@@ -1,549 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Functor performing elementwise operations used by epilogues.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cute/tensor.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace collective {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  class StrideC,
-  class StrideD,
-  class ThreadEpilogueOp,
-  class SmemLayout,
-  class CopyAtomR2S,
-  class TiledCopyS2R,
-  class CopyAtomR2G,
-  class EpilogueScheduleType = EpilogueSimtVectorized,
-  class Enable = void
->
-class Epilogue {
-  static_assert(cute::is_same_v<EpilogueScheduleType, EpilogueSimtVectorized> ||
-                cute::is_same_v<EpilogueScheduleType, EpiloguePtrArraySimtVectorized>, 
-                "Could not find an epilogue specialization.");
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Epilogue Vectorized
-/// Applies an element wise operation to all elements within the fragment
-/// and writes it out to destination storage.
-///
-/// Ways to generalize this:
-/// - CTA tile shape
-/// - vectorization requirements (GMEM)
-/// - vectoriz(able) transform()
-///
-template <
-  class StrideC_,
-  class StrideD_,
-  class ThreadEpilogueOp_,
-  class SmemLayout_,
-  class CopyAtomR2S_,
-  class TiledCopyS2R_,
-  class CopyAtomR2G_,
-  class EpilogueScheduleType_
->
-class Epilogue<
-        StrideC_,
-        StrideD_,
-        ThreadEpilogueOp_,
-        SmemLayout_,
-        CopyAtomR2S_,
-        TiledCopyS2R_,
-        CopyAtomR2G_,
-        EpilogueScheduleType_,
-        cute::enable_if_t<
-          cute::is_same_v<EpilogueScheduleType_, EpilogueSimtVectorized>
-        >
-      > {
-public:
-  //
-  // Type Aliases
-  //
-  // derived types of output thread level operator
-  using ThreadEpilogueOp = ThreadEpilogueOp_;
-  using ElementAccumulator = typename ThreadEpilogueOp::ElementAccumulator;
-  using ElementCompute = typename ThreadEpilogueOp::ElementCompute;
-  using ElementScalar = ElementCompute;
-  using ElementOutput = typename ThreadEpilogueOp::ElementOutput;
-  using ElementC = typename ThreadEpilogueOp::ElementC;
-  using StrideC = StrideC_;
-  using ElementD = typename ThreadEpilogueOp::ElementD;
-  using StrideD = StrideD_;
-  using ElementBias = typename detail::IsThreadEpilogueOpWithBias<ThreadEpilogueOp>::type;
-  using SmemLayout   = SmemLayout_;
-  using CopyAtomR2S  = CopyAtomR2S_;
-  using TiledCopyS2R = TiledCopyS2R_;
-  using CopyAtomR2G  = CopyAtomR2G_;
-
-  using GmemTiledCopyC = void;
-  using GmemTiledCopyD = CopyAtomR2G;
-
-  static constexpr bool IsEpilogueBiasSupported = detail::IsThreadEpilogueOpWithBias<ThreadEpilogueOp>::value;
-  using StrideBias = cute::conditional_t<detail::is_m_major<StrideD>(), Stride<_1,_0,int64_t>, Stride<_0,_1,int64_t>>;
-
-  static_assert(cute::rank(StrideC{}) == 3, "StrideCD must be rank-3: [M, N, L]");
-  static_assert(cute::rank(StrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]");
-
-  struct SharedStorage
-  {
-    cute::array_aligned<ElementAccumulator, cute::cosize_v<SmemLayout>> smem_epilogue;
-  };
-
-  static constexpr bool IsActHasArgs = detail::IsThreadEpilogueOpWithElementwiseArguments<ThreadEpilogueOp>::value;
-
-  // Host side epilogue arguments
-  template<class ThreadEpiOp, class = void>
-  struct ThreadEpilogueOpArguments {
-    ElementScalar alpha{0};
-    ElementScalar beta{0};
-    ElementScalar const* alpha_ptr = nullptr;
-    ElementScalar const* beta_ptr = nullptr;
-    ElementBias const* bias_ptr = nullptr;
-    StrideBias dBias{};
-  };  
-
-  template<class ThreadEpiOp>
-  struct ThreadEpilogueOpArguments<
-          ThreadEpiOp,
-          cute::enable_if_t<detail::IsThreadEpilogueOpWithElementwiseArguments<ThreadEpiOp>::value>> {
-    ElementScalar alpha{0};
-    ElementScalar beta{0};
-    ElementScalar const* alpha_ptr = nullptr;
-    ElementScalar const* beta_ptr = nullptr;
-    ElementBias const* bias_ptr = nullptr;
-    StrideBias dBias{};
-    typename ThreadEpiOp::ElementwiseArguments activation{};
-  };
-
-  struct Arguments {
-    ThreadEpilogueOpArguments<ThreadEpilogueOp> thread{};
-    using StrideBias = decltype(thread.dBias);
-    ElementC const* ptr_C = nullptr;
-    StrideC dC{};
-    ElementD* ptr_D = nullptr;
-    StrideD dD{};
-  };
-
-  // Device side epilogue params
-  template<class ThreadEpiOp, class = void>
-  struct ParamsType {
-    typename ThreadEpiOp::Params thread{};
-    ElementC const* ptr_C = nullptr;
-    StrideC dC{};
-    ElementD* ptr_D = nullptr;
-    StrideD dD{};
-    ElementBias const* ptr_Bias = nullptr;
-    StrideBias dBias{};
-  };
-
-  template<class ThreadEpiOp>
-  struct ParamsType<
-          ThreadEpiOp,
-          cute::enable_if_t<detail::IsThreadEpilogueOpWithElementwiseArguments<ThreadEpiOp>::value>> {
-    typename ThreadEpiOp::Params thread{};
-    typename ThreadEpiOp::ElementwiseArguments activation{};
-    ElementC const* ptr_C = nullptr;
-    StrideC dC{};
-    ElementD* ptr_D = nullptr;
-    StrideD dD{};
-    ElementBias const* ptr_Bias = nullptr;
-    StrideBias dBias{};
-  };
-
-  using Params = ParamsType<ThreadEpilogueOp>;
-
-  //
-  // Methods
-  //
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(
-      [[maybe_unused]] ProblemShape const& _,
-      Arguments const& args,
-      [[maybe_unused]] void* workspace) { 
-    typename ThreadEpilogueOp::Params thread_op_args;
-    thread_op_args.alpha = args.thread.alpha;
-    thread_op_args.beta = args.thread.beta;
-    thread_op_args.alpha_ptr = args.thread.alpha_ptr;
-    thread_op_args.beta_ptr = args.thread.beta_ptr;
-
-    if constexpr (IsActHasArgs) {
-      return {
-        thread_op_args,
-        args.thread.activation,
-        args.ptr_C,
-        args.dC,
-        args.ptr_D,
-        args.dD,
-        args.thread.bias_ptr,
-        args.thread.dBias
-      };
-    }
-    else {
-      return {
-        thread_op_args,
-        args.ptr_C,
-        args.dC,
-        args.ptr_D,
-        args.dD,
-        args.thread.bias_ptr,
-        args.thread.dBias
-      };
-    }
-  }
-
-  template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
-    return 0;
-  }
-
-  template <class ProblemShape>
-  static cutlass::Status
-  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-    return cutlass::Status::kSuccess;
-  }
-
-  template <class ProblemShape>
-  static bool
-  can_implement(
-      [[maybe_unused]] ProblemShape const& problem_shape,
-      [[maybe_unused]] Arguments const& args) {
-    return true;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Epilogue(Params const& params_)
-      : params(params_), epilogue_op(params_.thread) { }
-
-  CUTLASS_DEVICE
-  bool
-  is_source_needed() {
-    return epilogue_op.is_source_needed();
-  }
-
-  template<
-    class ProblemShapeMNKL,
-    class BlockShapeMNK,
-    class BlockCoordMNKL,
-    class FrgEngine, class FrgLayout,
-    class TiledMma,
-    class ResidueMNK
-  >
-  CUTLASS_DEVICE void
-  operator()(
-      ProblemShapeMNKL problem_shape_mnkl,
-      BlockShapeMNK blk_shape_MNK,
-      BlockCoordMNKL blk_coord_mnkl,
-      cute::Tensor<FrgEngine,FrgLayout> const& accumulators,                   // (MMA,MMA_M,MMA_N)
-      TiledMma tiled_mma,
-      ResidueMNK residue_mnk,
-      int thread_idx,
-      char* smem_buf) {
-    using namespace cute;
-    using X = Underscore;
-
-    static_assert(cute::rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4");
-    static_assert(is_static<BlockShapeMNK>::value, "ThreadBlock tile shape must be static");
-    static_assert(cute::rank(BlockShapeMNK{}) == 3, "BlockShapeMNK must be rank 3");
-    static_assert(cute::rank(BlockCoordMNKL{}) == 4, "BlockCoordMNKL must be rank 3");
-
-    // synchronizing function for smem reads/writes
-#if CUDA_BARRIER_ENABLED
-    auto synchronize = [] () { cutlass::arch::NamedBarrier::sync(typename TiledCopyS2R::TiledNumThr{}, cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); };
-#else
-    auto synchronize = [] () { __syncthreads(); };
-#endif
-
-    // Separate out problem shape for convenience
-    auto M = get<0>(problem_shape_mnkl);
-    auto N = get<1>(problem_shape_mnkl);
-    auto L = get<3>(problem_shape_mnkl);
-
-    // Represent the full output tensor
-    Tensor mC_mnl = make_tensor(make_gmem_ptr(params.ptr_C), make_shape(M,N,L), params.dC);             //             (m,n,l)
-    Tensor mD_mnl = make_tensor(make_gmem_ptr(params.ptr_D), make_shape(M,N,L), params.dD);             //             (m,n,l)
-    Tensor mBias_mnl = make_tensor(make_gmem_ptr(params.ptr_Bias), make_shape(M,N,L), params.dBias);    //             (m,n,l)
-
-    Tensor gC_mnl = local_tile(mC_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{});             // (BLK_M,BLK_N,m,n,l)
-    Tensor gD_mnl = local_tile(mD_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{});             // (BLK_M,BLK_N,m,n,l)
-    Tensor gBias_mnl = local_tile(mBias_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{});       // (BLK_M,BLK_N,m,n,l)
-
-    // Slice to get the tile this CTA is responsible for
-    auto [m_coord, n_coord, k_coord, l_coord] = blk_coord_mnkl;
-    Tensor gC = gC_mnl(_,_,m_coord,n_coord,l_coord);                                                   // (BLK_M,BLK_N)
-    Tensor gD = gD_mnl(_,_,m_coord,n_coord,l_coord);                                                   // (BLK_M,BLK_N)
-    Tensor gBias = gBias_mnl(_,_,m_coord,n_coord,l_coord);                                             // (BLK_M,BLK_N)
-  
-    // Construct a tensor in SMEM that we can partition for rearranging data
-    SharedStorage& storage = *reinterpret_cast<SharedStorage*>(smem_buf);
-    Tensor sAcc = make_tensor(make_smem_ptr(storage.smem_epilogue.data()), SmemLayout{});            // (SMEM_M,SMEM_N)
-
-    // Partition sAcc to match the accumulator partitioning
-    auto tiled_r2s = make_tiled_copy_C(CopyAtomR2S{}, tiled_mma);
-    auto thread_r2s     = tiled_r2s.get_thread_slice(thread_idx);
-    Tensor tRS_rAcc = thread_r2s.retile_S(accumulators);                              // ((Atom,AtomNum), MMA_M, MMA_N)
-    Tensor tRS_sAcc = thread_r2s.partition_D(sAcc);                                   // ((Atom,AtomNum),PIPE_M,PIPE_N)
-
-    // Tile gD and gC by the shape of SmemLayout first
-    auto tile  = make_shape(size<0>(sAcc), size<1>(sAcc));
-    Tensor gCt = flat_divide(gC, tile);                                                // (SMEM_M,SMEM_N,TILE_M,TILE_N)
-    Tensor gDt = flat_divide(gD, tile);                                                // (SMEM_M,SMEM_N,TILE_M,TILE_N)
-    Tensor gBiast = flat_divide(gBias, tile);                                          // (SMEM_M,SMEM_N,TILE_M,TILE_N)
-
-    // Partition sAcc, gC, and gD for the output
-    auto tiled_s2r = TiledCopyS2R{};
-    auto thread_s2r     = tiled_s2r.get_thread_slice(thread_idx);
-    Tensor tSR_sAcc = thread_s2r.partition_S(sAcc);                      //               ((Atom,AtomNum),ATOM_M,ATOM_N)
-    Tensor tSR_gC = thread_s2r.partition_D(gCt);                         // ((Atom,AtomNum),ATOM_M,ATOM_N,TILE_M,TILE_N)
-    Tensor tSR_gD = thread_s2r.partition_D(gDt);                         // ((Atom,AtomNum),ATOM_M,ATOM_N,TILE_M,TILE_N)
-    Tensor tSR_gBias = thread_s2r.partition_D(gBiast);                   // ((Atom,AtomNum),ATOM_M,ATOM_N,TILE_M,TILE_N)
-
-    // Allocate intermediate registers on the dst tensors
-    Tensor tSR_rAcc = make_tensor<ElementAccumulator>(take<0,3>(shape(tSR_gC)));       // ((Atom,AtomNum),ATOM_M,ATOM_N)
-    Tensor tSR_rC = make_tensor<ElementC>(shape(tSR_rAcc));                            // ((Atom,AtomNum),ATOM_M,ATOM_N)
-    Tensor tSR_rD = make_tensor<ElementD>(shape(tSR_rAcc));                            // ((Atom,AtomNum),ATOM_M,ATOM_N)
-    Tensor tSR_rBias = make_tensor_like(tSR_gBias);                      // ((Atom,AtomNum),ATOM_M,ATOM_N,TILE_M,TILE_N)
-
-    // Repeat the D-partitioning for coordinates and predication
-    Tensor cD   = make_identity_tensor(make_shape(size<0>(gD),size<1>(gD)));           // (BLK_M,BLK_N) -> (blk_m,blk_n)
-    Tensor cDt  = flat_divide(cD, tile);                                 //                (SMEM_M,SMEM_N,TILE_M,TILE_N)
-    Tensor tSR_cD = thread_s2r.partition_D(cDt);                         // ((Atom,AtomNum),ATOM_M,ATOM_N,TILE_M,TILE_N)
-
-    CUTE_STATIC_ASSERT(size<1>(tRS_rAcc) % size<3>(tSR_gC) == 0);  // TILE_M divides MMA_M
-    CUTE_STATIC_ASSERT(size<2>(tRS_rAcc) % size<4>(tSR_gC) == 0);  // TILE_N divides MMA_N
-
-#if 0
-    if (thread_idx == 0 && m_coord == 0 && n_coord == 0) {
-      print("aC   : "); print(accumulators.layout()); print("\n");
-      print("gC   : "); print(gC.layout()); print("\n");
-      print("gD   : "); print(gD.layout()); print("\n");
-      print("gBias   : "); print(gBias.layout()); print("\n");
-      print("sAcc   : "); print(sAcc.layout()); print("\n");
-      print("\n");
-      print("tRS_sAcc : "); print(tRS_sAcc.layout()); print("\n");
-      print("tRS_rAcc : "); print(tRS_rAcc.layout()); print("\n");
-      print("\n");
-      print("gDt  : "); print(gDt.layout()); print("\n");
-      print("tSR_sAcc : "); print(tSR_sAcc.layout()); print("\n");
-      print("tSR_rAcc : "); print(tSR_rAcc.layout()); print("\n");
-      print("\n");
-      print("tSR_rC : "); print(tSR_rC.layout()); print("\n");
-      print("tSR_rD : "); print(tSR_rD.layout()); print("\n");
-      print("tSR_gC : "); print(tSR_gC.layout()); print("\n");
-      print("tSR_gD : "); print(tSR_gD.layout()); print("\n");
-      print("\n");
-      print("gBiast  : "); print(gBiast.layout()); print("\n");
-      print("tSR_gBias  : "); print(tSR_gBias.layout()); print("\n");
-      print("tSR_rBias  : "); print(tSR_rBias.layout()); print("\n");
-    }
-#endif
-
-    if constexpr (IsEpilogueBiasSupported) {
-      if (params.ptr_Bias) {
-        // Filter so we don't issue redundant copies over stride-0 modes
-        // (only works if 0-strides are in same location, which is by construction)
-        Tensor tSR_gBias_flt = filter_zeros(tSR_gBias);
-        Tensor tSR_rBias_flt = filter_zeros(tSR_rBias);
-        Tensor tSR_cD_flt = filter_zeros(tSR_cD, tSR_gBias.stride());
-
-        // Step 0. Copy Bias from GMEM to fragment
-        auto pred_fn = [&] (auto const&... coords) { return elem_less(tSR_cD_flt(coords...), take<0, 2>(residue_mnk)); };
-        copy_if(pred_fn, tSR_gBias_flt, tSR_rBias_flt);    
-      }
-    }
-
-    // For each tiling needed for SmemLayout to cover shape(gD)
-    CUTLASS_PRAGMA_UNROLL
-    for (int step_m = 0; step_m < size<2>(cDt); ++step_m) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int step_n = 0; step_n < size<3>(cDt); ++step_n) {
-        // Step 1. Copy to SMEM
-        CUTLASS_PRAGMA_UNROLL
-        for (int pipe_m = 0; pipe_m < size<1>(tRS_sAcc); ++pipe_m) {
-          CUTLASS_PRAGMA_UNROLL
-          for (int pipe_n = 0; pipe_n < size<2>(tRS_sAcc); ++pipe_n) {
-            int mma_m = step_m * size<1>(tRS_sAcc) + pipe_m;
-            int mma_n = step_n * size<2>(tRS_sAcc) + pipe_n;
-
-            copy(tiled_r2s, tRS_rAcc(_,mma_m,mma_n), tRS_sAcc(_,pipe_m,pipe_n));
-          }
-        }
-
-        // Step 2. Wait for SMEM writes to complete
-        synchronize();
-
-        // Step 3. Copy from SMEM into a fragment
-        copy(tiled_s2r, tSR_sAcc, tSR_rAcc);
-
-        // Step 4. Wait for SMEM reads to complete
-        synchronize();
-
-        Tensor tSR_gDmn = tSR_gD(_,_,_,step_m,step_n);
-        Tensor tSR_cDmn = tSR_cD(_,_,_,step_m,step_n);
-
-        if constexpr (IsEpilogueBiasSupported) {
-          Tensor tSR_rBiasmn = tSR_rBias(_,_,_,step_m,step_n);
-
-          if (epilogue_op.is_source_needed()) {
-            // source is needed
-            Tensor tSR_gCmn = tSR_gC(_,_,_,step_m,step_n);
-
-            // Step 5. Copy C from GMEM to a fragment
-            CUTLASS_PRAGMA_UNROLL
-            for (int m = 0; m < size<1>(tSR_gDmn); ++m) {
-              CUTLASS_PRAGMA_UNROLL
-              for (int n = 0; n < size<2>(tSR_gDmn); ++n) {
-                // Predication
-                if (elem_less(tSR_cDmn(0,m,n), take<0,2>(residue_mnk))) {
-                  CUTLASS_PRAGMA_UNROLL
-                  for (int i = 0; i < size<0>(tSR_rAcc); ++i) {
-                    tSR_rC(i,m,n) = tSR_gCmn(i,m,n);
-                  }
-                }
-              }
-            }
-
-            // Step 6. Elementwise operation with conversion
-            CUTLASS_PRAGMA_UNROLL
-            for (int i = 0; i < size(tSR_rAcc); ++i) {
-              if constexpr (IsActHasArgs) {
-                epilogue_op(tSR_rD(i), tSR_rD(i), tSR_rAcc(i), tSR_rC(i), tSR_rBiasmn(i), params.activation);
-              } else {
-                epilogue_op(tSR_rD(i), tSR_rD(i), tSR_rAcc(i), tSR_rC(i), tSR_rBiasmn(i));
-              }
-            }
-          }
-          else {
-            // source is not needed, avoid load and lift compute
-
-            // Step 5. Elementwise operation with conversion
-            CUTLASS_PRAGMA_UNROLL
-            for (int i = 0; i < size(tSR_rAcc); ++i) {
-              if constexpr (IsActHasArgs) {
-                epilogue_op(tSR_rD(i), tSR_rD(i), tSR_rAcc(i), tSR_rBiasmn(i), params.activation);
-              } else {
-                epilogue_op(tSR_rD(i), tSR_rD(i), tSR_rAcc(i), tSR_rBiasmn(i));
-              }
-            }
-          }
-
-          CUTLASS_PRAGMA_UNROLL
-          for (int m = 0; m < size<1>(tSR_gDmn); ++m) {
-            CUTLASS_PRAGMA_UNROLL
-            for (int n = 0; n < size<2>(tSR_gDmn); ++n) {
-              // Predication
-              if (elem_less(tSR_cDmn(0,m,n), take<0,2>(residue_mnk))) {
-                // The Last Step. Copy to GMEM
-                copy(CopyAtomR2G{}, tSR_rD(_,m,n), tSR_gDmn(_,m,n));
-              }
-            }
-          }
-        } else {
-          if (epilogue_op.is_source_needed()) {
-            // source is needed
-            Tensor tSR_gCmn = tSR_gC(_,_,_,step_m,step_n);
-
-            // Step 5. Copy C from GMEM to a fragment
-            CUTLASS_PRAGMA_UNROLL
-            for (int m = 0; m < size<1>(tSR_gDmn); ++m) {
-              CUTLASS_PRAGMA_UNROLL
-              for (int n = 0; n < size<2>(tSR_gDmn); ++n) {
-                // Predication
-                if (elem_less(tSR_cDmn(0,m,n), take<0,2>(residue_mnk))) {
-                  CUTLASS_PRAGMA_UNROLL
-                  for (int i = 0; i < size<0>(tSR_rAcc); ++i) {
-                    tSR_rC(i,m,n) = tSR_gCmn(i,m,n);
-                  }
-                }
-              }
-            }
-
-            // Step 6. Elementwise operation with conversion
-            CUTLASS_PRAGMA_UNROLL
-            for (int i = 0; i < size(tSR_rAcc); ++i) {
-              tSR_rD(i) = epilogue_op(tSR_rAcc(i), tSR_rC(i));
-            }
-          }
-          else {
-            // source is not needed, avoid load and lift compute
-
-            // Step 5. Elementwise operation with conversion
-            CUTLASS_PRAGMA_UNROLL
-            for (int i = 0; i < size(tSR_rAcc); ++i) {
-              tSR_rD(i) = epilogue_op(tSR_rAcc(i));
-            }
-          }
-
-          CUTLASS_PRAGMA_UNROLL
-          for (int m = 0; m < size<1>(tSR_gDmn); ++m) {
-            CUTLASS_PRAGMA_UNROLL
-            for (int n = 0; n < size<2>(tSR_gDmn); ++n) {
-              // Predication
-              if (elem_less(tSR_cDmn(0,m,n), take<0,2>(residue_mnk))) {
-                // The Last Step. Copy to GMEM
-                copy(CopyAtomR2G{}, tSR_rD(_,m,n), tSR_gDmn(_,m,n));
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-
-private:
-  Params params;
-  ThreadEpilogueOp epilogue_op;
-};
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace collective
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm70_epilogue_vectorized_array.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm70_epilogue_vectorized_array.hpp
deleted file mode 100755
index 8a70370b2..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm70_epilogue_vectorized_array.hpp
+++ /dev/null
@@ -1,412 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Functor performing elementwise operations used by epilogues.
-*/
-
-#pragma once
-
-#include "cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace collective {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Ptr Array Epilogue Vectorized
-/// Applies an element wise operation to all elements within the fragment
-/// and writes it out to destination storage.
-///
-/// Ways to generalize this:
-/// - CTA tile shape
-/// - vectorization requirements (GMEM)
-/// - vectoriz(able) transform()
-///
-template <
-  class StrideC_,
-  class StrideD_,
-  class ThreadEpilogueOp_,
-  class SmemLayout_,
-  class CopyAtomR2S_,
-  class TiledCopyS2R_,
-  class CopyAtomR2G_,
-  class EpilogueScheduleType_
->
-class Epilogue<
-        StrideC_,
-        StrideD_,
-        ThreadEpilogueOp_,
-        SmemLayout_,
-        CopyAtomR2S_,
-        TiledCopyS2R_,
-        CopyAtomR2G_,
-        EpilogueScheduleType_,
-        cute::enable_if_t<
-          cute::is_same_v<EpilogueScheduleType_, EpiloguePtrArraySimtVectorized>
-        >
-      > {
-public:
-  //
-  // Type Aliases
-  //
-  // derived types of output thread level operator
-  using ThreadEpilogueOp = ThreadEpilogueOp_;
-  using ElementAccumulator = typename ThreadEpilogueOp::ElementAccumulator;
-  using ElementCompute = typename ThreadEpilogueOp::ElementCompute;
-  using ElementScalar = ElementCompute;
-  using ElementOutput = typename ThreadEpilogueOp::ElementOutput;
-  using ElementC = typename ThreadEpilogueOp::ElementC;
-  using StrideC = StrideC_;
-  using InternalStrideC = cute::remove_pointer_t<StrideC>;
-  using ElementD = typename ThreadEpilogueOp::ElementD;
-  using StrideD = StrideD_;
-  using InternalStrideD = cute::remove_pointer_t<StrideD>;
-
-  using SmemLayout   = SmemLayout_;
-  using CopyAtomR2S  = CopyAtomR2S_;
-  using TiledCopyS2R = TiledCopyS2R_;
-  using CopyAtomR2G  = CopyAtomR2G_;
-
-  using GmemTiledCopyC = TiledCopyS2R;
-  using GmemTiledCopyD = TiledCopyS2R;
-
-  static const int kOutputAlignment = ThreadEpilogueOp::kCount;
-
-  using AlignmentType = typename cute::uint_bit<sizeof_bits<ElementOutput>::value * kOutputAlignment>::type;
-
-  static_assert(cute::rank(InternalStrideC{}) == 3, "StrideCD must be rank-3: [M, N, L]");
-  static_assert(cute::rank(InternalStrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]");
-
-  struct SharedStorage
-  {
-    cute::array_aligned<ElementAccumulator, cute::cosize_v<SmemLayout>> smem_epilogue;
-  };
-
-  using TensorMapStorage = SharedStorage;
-
-  // Host side epilogue arguments
-  struct Arguments {
-    typename ThreadEpilogueOp::Params thread{};
-    ElementC const** ptr_C = nullptr;
-    StrideC dC{};
-    ElementD** ptr_D = nullptr;
-    StrideD dD{};
-  };
-
-  // Device side epilogue params
-  using Params = Arguments;
-
-  //
-  // Methods
-  //
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(
-      ProblemShape const&,
-      Arguments const& args,
-      [[maybe_unused]] void* workspace) {
-    return args;
-  }
-
-  template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args, int sm_count) {
-    return 0;
-  }
-
-  template <class ProblemShape>
-  static cutlass::Status
-  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-    return cutlass::Status::kSuccess;
-  }
-
-  template <class ProblemShape>
-  static bool
-  can_implement(
-      [[maybe_unused]] ProblemShape const& problem_shape,
-      [[maybe_unused]] Arguments const& args) {
-    return true;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Epilogue(Params const& params_)
-      : params(params_) { }
-
-  CUTLASS_DEVICE
-  bool
-  is_source_needed() {
-    // For Ptr-Array or Grouped Gemm we cannot determine if source is needed based on first beta.
-    return true;
-  }
-
-  template<
-    class ProblemShapeMNKL,
-    class BlockShapeMNK,
-    class BlockCoordMNKL,
-    class FrgEngine, class FrgLayout,
-    class TiledMma,
-    class ResidueMNK
-  >
-  CUTLASS_DEVICE void
-  operator()(
-      ProblemShapeMNKL problem_shape_mnkl,
-      BlockShapeMNK blk_shape_MNK,
-      BlockCoordMNKL blk_coord_mnkl,
-      cute::Tensor<FrgEngine,FrgLayout> const& accumulators,                   // (MMA,MMA_M,MMA_N)
-      TiledMma tiled_mma,
-      ResidueMNK residue_mnk,
-      int thread_idx,
-      char* smem_buf) {
-    using namespace cute;
-    using X = Underscore;
-
-    static_assert(cute::rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4");
-    static_assert(is_static<BlockShapeMNK>::value, "ThreadBlock tile shape must be static");
-    static_assert(cute::rank(BlockShapeMNK{}) == 3, "BlockShapeMNK must be rank 3");
-    static_assert(cute::rank(BlockCoordMNKL{}) == 4, "BlockCoordMNKL must be rank 3");
-
-    // synchronizing function for smem reads/writes
-#if CUDA_BARRIER_ENABLED
-    auto synchronize = [] () { cutlass::arch::NamedBarrier::sync(typename TiledCopyS2R::TiledNumThr{}, cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); };
-#else
-    auto synchronize = [] () { __syncthreads(); };
-#endif
-
-    // Separate out problem shape for convenience
-    auto M = get<0>(problem_shape_mnkl);
-    auto N = get<1>(problem_shape_mnkl);
-    auto L = get<3>(problem_shape_mnkl);
-    // Batches are managed by using appropriate pointers to C and D matrices
-    const int32_t mock_L = 1;
-    const int32_t mock_l_coord = 0;
-    // Slice to get the tile this CTA is responsible for
-    auto [m_coord, n_coord, k_coord, l_coord] = blk_coord_mnkl;
-
-    // If scalar alpha/beta are provided, i.e., same alpha/beta applies to all batches/groups.
-    // If pointers to alpha/beta are provided, i.e., alpha/beta can differ between batches/groups,
-    // we get the correct alpha/beta values for the current batch/group using group index.
-    ThreadEpilogueOp epilogue_op = ThreadEpilogueOp(params.thread, l_coord);
-
-    if (epilogue_op.is_source_needed() && params.dC == nullptr) {
-      // Beta value is non-zero while pointer to C is a nullptr
-      assert(0);
-    }
-
-    InternalStrideC stride_c;
-    InternalStrideD stride_d;
-    if constexpr (!cute::is_same_v<InternalStrideC, StrideC>) {
-      // If grouped gemm
-      if (epilogue_op.is_source_needed()) {
-        stride_c = params.dC[l_coord];
-      }
-      stride_d = params.dD[l_coord];
-    }
-    else {
-      stride_c = params.dC;
-      stride_d = params.dD;
-    }
-
-    // Represent the full output tensor
-    ElementC const* ptr_C_l = nullptr;
-    if (epilogue_op.is_source_needed()) {
-      ptr_C_l = params.ptr_C[l_coord];
-    }
-    Tensor mC_mnl = make_tensor(make_gmem_ptr(ptr_C_l), make_shape(M,N,mock_L), stride_c);      //             (m,n,l)
-    Tensor mD_mnl = make_tensor(make_gmem_ptr(params.ptr_D[l_coord]), make_shape(M,N,mock_L), stride_d);      //             (m,n,l)
-    Tensor gC_mnl = local_tile(mC_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{});      // (BLK_M,BLK_N,m,n,l)
-    Tensor gD_mnl = local_tile(mD_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{});      // (BLK_M,BLK_N,m,n,l)
-
-    Tensor gC = gC_mnl(_,_,m_coord,n_coord,mock_l_coord);                                                   // (BLK_M,BLK_N)
-    Tensor gD = gD_mnl(_,_,m_coord,n_coord,mock_l_coord);                                                   // (BLK_M,BLK_N)
-
-    // Construct a tensor in SMEM that we can partition for rearranging data
-    SharedStorage& storage = *reinterpret_cast<SharedStorage*>(smem_buf);
-    Tensor sAcc = make_tensor(make_smem_ptr(storage.smem_epilogue.data()), SmemLayout{});            // (SMEM_M,SMEM_N)
-
-    // Partition sAcc to match the accumulator partitioning
-    auto tiled_r2s = make_tiled_copy_C(CopyAtomR2S{}, tiled_mma);
-    auto thread_r2s     = tiled_r2s.get_thread_slice(thread_idx);
-    Tensor tRS_rAcc = thread_r2s.retile_S(accumulators);                              // ((Atom,AtomNum), MMA_M, MMA_N)
-    Tensor tRS_sAcc = thread_r2s.partition_D(sAcc);                                   // ((Atom,AtomNum),PIPE_M,PIPE_N)
-
-    // Tile gD and gC by the shape of SmemLayout first
-    auto tile  = make_shape(size<0>(sAcc), size<1>(sAcc));
-    Tensor gCt = flat_divide(gC, tile);                                                // (SMEM_M,SMEM_N,TILE_M,TILE_N)
-    Tensor gDt = flat_divide(gD, tile);                                                // (SMEM_M,SMEM_N,TILE_M,TILE_N)
-
-    // Partition sAcc, gC, and gD for the output
-    auto tiled_s2r = TiledCopyS2R{};
-    auto thread_s2r     = tiled_s2r.get_thread_slice(thread_idx);
-    Tensor tSR_sAcc = thread_s2r.partition_S(sAcc);                      //               ((Atom,AtomNum),ATOM_M,ATOM_N)
-    Tensor tSR_gC = thread_s2r.partition_D(gCt);                         // ((Atom,AtomNum),ATOM_M,ATOM_N,TILE_M,TILE_N)
-    Tensor tSR_gD = thread_s2r.partition_D(gDt);                         // ((Atom,AtomNum),ATOM_M,ATOM_N,TILE_M,TILE_N)
-
-    // Allocate intermediate registers on the dst tensors
-    Tensor tSR_rAcc = make_tensor<ElementAccumulator>(take<0,3>(shape(tSR_gC)));       // ((Atom,AtomNum),ATOM_M,ATOM_N)
-    Tensor tSR_rD = make_tensor<ElementOutput>(shape(tSR_rAcc));                       // ((Atom,AtomNum),ATOM_M,ATOM_N)
-
-    // Repeat the D-partitioning for coordinates and predication
-    Tensor cD   = make_identity_tensor(make_shape(size<0>(gD),size<1>(gD)));           // (BLK_M,BLK_N) -> (blk_m,blk_n)
-    Tensor cDt  = flat_divide(cD, tile);                                 //                (SMEM_M,SMEM_N,TILE_M,TILE_N)
-    Tensor tSR_cD = thread_s2r.partition_D(cDt);                         // ((Atom,AtomNum),ATOM_M,ATOM_N,TILE_M,TILE_N)
-
-    CUTE_STATIC_ASSERT(size<1>(tRS_rAcc) % size<3>(tSR_gC) == 0);  // TILE_M divides MMA_M
-    CUTE_STATIC_ASSERT(size<2>(tRS_rAcc) % size<4>(tSR_gC) == 0);  // TILE_N divides MMA_N
-
-#if 0
-    if (thread_idx == 0 && m_coord == 0 && n_coord == 0) {
-      print("aC   : "); print(accumulators.layout()); print("\n");
-      print("gC   : "); print(gC.layout()); print("\n");
-      print("gD   : "); print(gD.layout()); print("\n");
-      print("sAcc   : "); print(sAcc.layout()); print("\n");
-      print("\n");
-      print("tRS_sAcc : "); print(tRS_sAcc.layout()); print("\n");
-      print("tRS_rAcc : "); print(tRS_rAcc.layout()); print("\n");
-      print("\n");
-      print("gDt  : "); print(gDt.layout()); print("\n");
-      print("tSR_sAcc : "); print(tSR_sAcc.layout()); print("\n");
-      print("tSR_rAcc : "); print(tSR_rAcc.layout()); print("\n");
-      print("\n");
-      print("tSR_rD : "); print(tSR_rD.layout()); print("\n");
-      print("tSR_gC : "); print(tSR_gC.layout()); print("\n");
-      print("tSR_gD : "); print(tSR_gD.layout()); print("\n");
-      print("\n");
-    }
-#endif
-
-    // For each tiling needed for SmemLayout to cover shape(gD)
-    CUTLASS_PRAGMA_UNROLL
-    for (int step_m = 0; step_m < size<2>(cDt); ++step_m) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int step_n = 0; step_n < size<3>(cDt); ++step_n) {
-        // Step 1. Copy to SMEM
-        CUTLASS_PRAGMA_UNROLL
-        for (int pipe_m = 0; pipe_m < size<1>(tRS_sAcc); ++pipe_m) {
-          CUTLASS_PRAGMA_UNROLL
-          for (int pipe_n = 0; pipe_n < size<2>(tRS_sAcc); ++pipe_n) {
-            int mma_m = step_m * size<1>(tRS_sAcc) + pipe_m;
-            int mma_n = step_n * size<2>(tRS_sAcc) + pipe_n;
-
-            copy(tiled_r2s, tRS_rAcc(_,mma_m,mma_n), tRS_sAcc(_,pipe_m,pipe_n));
-          }
-        }
-
-        // Step 2. Wait for SMEM writes to complete
-        synchronize();
-
-        // Step 3. Copy from SMEM into a fragment
-        copy(tiled_s2r, tSR_sAcc, tSR_rAcc);
-
-        // Step 4. Wait for SMEM reads to complete
-        synchronize();
-
-        Tensor tSR_gDmn = tSR_gD(_,_,_,step_m,step_n);
-        Tensor tSR_cDmn = tSR_cD(_,_,_,step_m,step_n);
-
-        if (epilogue_op.is_source_needed()) {
-          // source is needed
-          Tensor tSR_gCmn = tSR_gC(_,_,_,step_m,step_n);
-
-          Tensor tSR_rCmn = make_tensor<ElementC>(shape(tSR_gCmn));                     // ((Atom,AtomNum),ATOM_M,ATOM_N)
-
-          // Step 5. Copy C from GMEM to a fragment
-          CUTLASS_PRAGMA_UNROLL
-          for (int m = 0; m < size<1>(tSR_gDmn); ++m) {
-            CUTLASS_PRAGMA_UNROLL
-            for (int n = 0; n < size<2>(tSR_gDmn); ++n) {
-              // Predication
-              if (elem_less(tSR_cDmn(0,m,n), take<0,2>(residue_mnk))) {
-                CUTLASS_PRAGMA_UNROLL
-                for (int i = 0; i < size<0>(tSR_rAcc); ++i) {
-                  tSR_rCmn(i,m,n) = tSR_gCmn(i,m,n);
-                }
-              }
-            }
-          }
-
-          CUTLASS_PRAGMA_UNROLL
-          for (int m = 0; m < size<1>(tSR_gDmn); ++m) {
-            CUTLASS_PRAGMA_UNROLL
-            for (int n = 0; n < size<2>(tSR_gDmn); ++n) {
-              // Predication
-              if (elem_less(tSR_cDmn(0,m,n), take<0,2>(residue_mnk))) {
-                // Step 6. Elementwise operation with conversion
-                CUTLASS_PRAGMA_UNROLL
-                for (int i = 0; i < size<0>(tSR_rAcc); ++i) {
-                  tSR_rD(i,m,n) = epilogue_op(tSR_rAcc(i,m,n), tSR_rCmn(i,m,n));
-                }
-                // Step 7. Copy to GMEM
-                copy(CopyAtomR2G{}, tSR_rD(_,m,n), tSR_gDmn(_,m,n));
-              }
-            }
-          }
-        }
-        else {
-          // source is not needed, avoid load and lift compute
-
-          // Step 5. Elementwise operation with conversion
-          CUTLASS_PRAGMA_UNROLL
-          for (int i = 0; i < size(tSR_rAcc); ++i) {
-            tSR_rD(i) = epilogue_op(tSR_rAcc(i));
-          }
-
-          CUTLASS_PRAGMA_UNROLL
-          for (int m = 0; m < size<1>(tSR_gDmn); ++m) {
-            CUTLASS_PRAGMA_UNROLL
-            for (int n = 0; n < size<2>(tSR_gDmn); ++n) {
-              // Predication
-              if (elem_less(tSR_cDmn(0,m,n), take<0,2>(residue_mnk))) {
-                // Step 6. Copy to GMEM
-                copy(CopyAtomR2G{}, tSR_rD(_,m,n), tSR_gDmn(_,m,n));
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-
-private:
-  Params params;
-};
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace collective
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp
deleted file mode 100755
index 84b6e14ee..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp
+++ /dev/null
@@ -1,1191 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Functor performing elementwise operations used by epilogues.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/arch/barrier.h"
-#include "cutlass/epilogue/dispatch_policy.hpp"
-#include "cutlass/epilogue/collective/detail.hpp"
-#include "cutlass/epilogue/thread/scale_type.h"
-#include "cutlass/epilogue/fusion/callbacks.hpp"
-#include "cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp"
-#include "cutlass/detail/collective.hpp"
-#include "cutlass/detail/layout.hpp"
-#include "cutlass/trace.h"
-#include "cutlass/cuda_host_adapter.hpp"
-
-#include "cute/tensor.hpp"
-#include "cute/atom/copy_traits_sm90_tma.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace collective {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  int StagesC_,
-  int StagesD_,
-  int FragmentSize_,
-  bool ReuseSmemC_,
-  bool DelayTmaStore_,
-  int NumEpilogueWarpGroups_,
-  class CtaTileMNK_,   //     (CTA_M,CTA_N,CTA_K)
-  class EpilogueTile_, // (EPI_TILE_M,EPI_TILE_N)
-  class ElementC_,
-  class StrideC_,
-  class ElementD_,
-  class StrideD_,
-  class FusionCallbacks_,
-  class CopyOpG2S_,
-  class SmemLayoutAtomC_,
-  class CopyOpS2R_,
-  class CopyOpS2G_,
-  class SmemLayoutAtomD_,
-  class CopyOpR2S_,
-  class CopyAtomC_,
-  class CopyOpR2R_
->
-class CollectiveEpilogue<
-    Sm90PtrArrayTmaWarpSpecialized<StagesC_,
-                                   StagesD_,
-                                   FragmentSize_,
-                                   ReuseSmemC_,
-                                   DelayTmaStore_,
-                                   NumEpilogueWarpGroups_
-                                  >,
-    CtaTileMNK_,
-    EpilogueTile_,
-    ElementC_,
-    StrideC_,
-    ElementD_,
-    StrideD_,
-    FusionCallbacks_,
-    CopyOpG2S_,
-    SmemLayoutAtomC_,
-    CopyOpS2R_,
-    CopyOpS2G_,
-    SmemLayoutAtomD_,
-    CopyOpR2S_,
-    CopyAtomC_,
-    CopyOpR2R_
-> {
-public:
-  //
-  // Type Aliases
-  //
-  using DispatchPolicy = Sm90PtrArrayTmaWarpSpecialized<StagesC_,
-                                                        StagesD_,
-                                                        FragmentSize_,
-                                                        ReuseSmemC_,
-                                                        DelayTmaStore_, 
-                                                        NumEpilogueWarpGroups_
-                                                       >;
-  using CtaTileMNK = CtaTileMNK_;
-  using EpilogueTile = EpilogueTile_;
-  using FusionCallbacks = FusionCallbacks_;
-  using ElementC = ElementC_;
-  using StrideC = StrideC_;
-  using InternalStrideC = cute::remove_pointer_t<StrideC>;
-  using ElementD = ElementD_;
-  using StrideD = StrideD_;
-  using InternalStrideD = cute::remove_pointer_t<StrideD>;
-  using CopyOpG2S = CopyOpG2S_;
-  using SmemLayoutAtomC = SmemLayoutAtomC_;
-  using CopyOpS2R = CopyOpS2R_;
-  using CopyOpS2G = CopyOpS2G_;
-  using SmemLayoutAtomD = SmemLayoutAtomD_;
-  using CopyOpR2S = CopyOpR2S_;
-  using CopyAtomC = CopyAtomC_;
-  using CopyOpR2R = CopyOpR2R_;
-
-  using ThreadEpilogueOp = typename epilogue::fusion::FusionCallbacksTraits<FusionCallbacks>::Operation;
-  using GmemTiledCopyC = CopyOpG2S;
-  using GmemTiledCopyD = CopyOpS2G;
-
-  static_assert(!is_layout<EpilogueTile>::value && is_tuple<EpilogueTile>::value, "EpilogueTile must be a cute::Tile or cute::Shape");
-  static_assert(cute::rank(CtaTileMNK{}) == 3, "CtaTileMNK must be rank-3: [CTA_M, CTA_N, CTA_K]");
-  static_assert(cute::rank(EpilogueTile{}) == 2, "EpilogueTile must be rank-2: [EPI_TILE_M, EPI_TILE_N]");
-  static_assert(size<0>(CtaTileMNK{}) % size<0>(shape(EpilogueTile{})) == 0, "EPI_TILE_M must divide CTA_M");
-  static_assert(size<1>(CtaTileMNK{}) % size<1>(shape(EpilogueTile{})) == 0, "EPI_TILE_N must divide CTA_N");
-  static_assert(cute::rank(InternalStrideC{}) == 3, "StrideC must be rank-3: [M, N, L]");
-  static_assert(cute::rank(InternalStrideD{}) == 3, "StrideD must be rank-3: [M, N, L]");
-
-private:
-  constexpr static bool is_source_supported = not cute::is_void_v<ElementC>;
-  constexpr static bool is_destination_supported = not cute::is_void_v<ElementD>;
-  using NonVoidElementD = cute::conditional_t<not is_destination_supported,fusion::get_element_aux_t<FusionCallbacks>, ElementD>;
-  static_assert(not cute::is_void_v<NonVoidElementD>, "SmemElementD is void");
-  using NonVoidElementC = cute::conditional_t<not is_source_supported,NonVoidElementD,ElementC>; // prevents void ref breakages
-
-  using SmemElementC = typename cutlass::detail::get_unpacked_element_type<NonVoidElementC>::type;
-  using SmemElementD = typename cutlass::detail::get_unpacked_element_type<NonVoidElementD>::type;
-
-  constexpr static int StagesC = StagesC_;
-  constexpr static int StagesD = StagesD_;
-  constexpr static bool ReuseSmemC = ReuseSmemC_ and is_destination_supported;
-  constexpr static bool DelayTmaStore = DelayTmaStore_;
-
-  constexpr static bool is_m_major_C = detail::is_m_major<InternalStrideC>();
-  constexpr static bool is_m_major_D = detail::is_m_major<InternalStrideD>();
-
-  constexpr static bool is_im2col_C = cute::is_same_v<CopyOpG2S, SM90_TMA_LOAD_IM2COL>;
-  constexpr static bool is_im2col_D = cute::is_same_v<CopyOpS2G, SM90_TMA_STORE_IM2COL>;
-
-  // Check if register transformation is needed before copying register to shared memory.
-  constexpr static bool IsUseR2R = !cute::is_void_v<CopyOpR2R>;
-
-  using SmemLayoutC = decltype(tile_to_shape(
-      SmemLayoutAtomC{},
-      make_shape(size<0>(EpilogueTile{}), size<1>(EpilogueTile{}), Int<StagesC>{}),
-      cute::conditional_t<is_m_major_C, Step<_2,_1,_3>, Step<_1,_2,_3>>{} ));
-  using SmemLayoutD = decltype(tile_to_shape(
-      SmemLayoutAtomD{},
-      make_shape(size<0>(EpilogueTile{}), size<1>(EpilogueTile{}), Int<ReuseSmemC ? StagesC : StagesD>{}),
-      cute::conditional_t<is_m_major_D, Step<_2,_1,_3>, Step<_1,_2,_3>>{} ));
-
-  constexpr static bool support_smem_reuse = is_source_supported && is_destination_supported && StagesD <= StagesC
-                                            && cosize(take<0,2>(SmemLayoutC{})) == cosize(take<0,2>(SmemLayoutD{}));
-  static_assert(not (ReuseSmemC && not support_smem_reuse), "Smem reuse requirements not met");
-
-  constexpr static size_t SmemAlignmentD = cutlass::detail::alignment_for_swizzle(SmemLayoutD{});
-  constexpr static size_t SmemAlignmentC = cutlass::detail::alignment_for_swizzle(SmemLayoutC{});
-  constexpr static size_t MaxSmemAlignment = cute::max(SmemAlignmentC, SmemAlignmentD);
-
-  using SmemArrayTypeC = cute::ArrayEngine<SmemElementC, cosize_v<SmemLayoutC>>;
-  using SmemArrayTypeD = cute::ArrayEngine<SmemElementD, cosize_v<SmemLayoutD>>;
-
-  using EmptyType = cute::tuple<>;
-  using SmemCStorage = cute::conditional_t<is_source_supported and (not ReuseSmemC),
-                         SmemArrayTypeC,
-                         EmptyType>;
-  using SmemDStorage = cute::conditional_t<is_destination_supported,
-                         SmemArrayTypeD,
-                         EmptyType>;
-
-  struct CollectiveStorageWithC {
-    alignas(SmemAlignmentC) ArrayEngine<SmemElementC, cosize_v<SmemLayoutC>> smem_C;
-    alignas(SmemAlignmentD) ArrayEngine<SmemElementD, cosize_v<SmemLayoutD>> smem_D;
-  };
-
-  union CollectiveStorageWithoutC {
-    cute::array<SmemElementC, 0> smem_C;
-    alignas(SmemAlignmentD) ArrayEngine<SmemElementD, cosize_v<SmemLayoutD>> smem_D;
-  };
-
-  union CollectiveStorageReuseC {
-    alignas(MaxSmemAlignment) ArrayEngine<SmemElementC, cosize_v<SmemLayoutC>> smem_C;
-    alignas(MaxSmemAlignment) ArrayEngine<SmemElementD, cosize_v<SmemLayoutD>> smem_D;
-  };
-
-public:
-  // TMA pipeline for loading C
-  using LoadPipeline = cutlass::PipelineTransactionAsync<StagesC>;
-  using LoadPipelineState = cutlass::PipelineState<StagesC>;
-  constexpr static uint32_t TmaTransactionBytes =
-    (size(take<0,2>(SmemLayoutC{})) * static_cast<uint32_t>(sizeof_bits<SmemElementC>::value)) / 8;
-  constexpr static bool RequiresTransactionBytes = true;
-
-  constexpr static int NumEpilogueWarpGroups = NumEpilogueWarpGroups_;
-
-  // TMA pipeline for storing D
-  using StorePipeline = cute::conditional_t<ReuseSmemC,
-                          cutlass::PipelineTmaStore<StagesC, StagesD-1>,
-                          cutlass::PipelineTmaStore<StagesD>>;
-  using StorePipelineState = cutlass::PipelineState<ReuseSmemC ? StagesC : StagesD>;
-
-  struct SharedStorage {
-    struct TensorStorage {
-      using CollectiveStorage = cute::conditional_t<not is_source_supported, CollectiveStorageWithoutC,
-                                  cute::conditional_t<ReuseSmemC, CollectiveStorageReuseC, CollectiveStorageWithC>>;
-      CollectiveStorage collective;
-
-      using FusionStorage = typename FusionCallbacks::SharedStorage;
-      FusionStorage thread;
-    } tensors;
-
-    struct TensorMapStorage : cute::aligned_struct<128, _0> {
-      cute::TmaDescriptor smem_tensormap_C;
-      cute::array<cute::TmaDescriptor, NumEpilogueWarpGroups> smem_tensormap_D;
-    } tensormaps;
-
-    using PipelineStorage = typename LoadPipeline::SharedStorage;
-    PipelineStorage pipeline;
-  };
-  using TensorStorage = typename SharedStorage::TensorStorage;
-  using TensorMapStorage = typename SharedStorage::TensorMapStorage;
-  using PipelineStorage = typename SharedStorage::PipelineStorage;
-
-  static constexpr bool IsGroupedGemmKernel = !cute::is_same_v<InternalStrideC, StrideC>;
-
-  // Host side epilogue arguments
-  struct Arguments {
-    typename FusionCallbacks::Arguments thread{};
-    ElementC const** ptr_C = nullptr;
-    StrideC dC;
-    ElementD ** ptr_D = nullptr;
-    StrideD dD;
-  };
-
-  // Device side epilogue params
-  struct Params {
-    using TMA_C = decltype(make_tma_copy(
-        CopyOpG2S{},
-        make_tensor(make_gmem_ptr(static_cast<NonVoidElementC const*>(nullptr)),
-            repeat_like(InternalStrideC{}, int32_t(0)), InternalStrideC{}),
-        take<0,2>(SmemLayoutC{}),
-        EpilogueTile{},
-        _1{}));
-
-    using TMA_D = decltype(make_tma_copy(
-        CopyOpS2G{},
-        make_tensor(make_gmem_ptr(static_cast<NonVoidElementD const*>(nullptr)),
-            repeat_like(InternalStrideD{}, int32_t(0)), InternalStrideD{}),
-        take<0,2>(SmemLayoutD{}),
-        EpilogueTile{},
-        _1{}));
-
-    typename FusionCallbacks::Params thread{};
-    TMA_C tma_load_c;
-    TMA_D tma_store_d;
-    cute::TmaDescriptor* tensormaps;
-    ElementC const** ptr_C;
-    StrideC dC;
-    ElementD** ptr_D;
-    StrideD dD;
-    uint32_t tma_transaction_bytes = TmaTransactionBytes;
-  };
-
-  //
-  // Methods
-  //
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(
-      ProblemShape const& problem_shape,
-      Arguments const& args,
-      [[maybe_unused]] void* workspace) {
-    // These tensor shapes (only applicable for grouped gemm) and pointers are only used to create tensormap/tma desc.
-    // These will be replaced with correct values before the initial tma load.
-    auto init_shape = repeat_like(append<4>(typename ProblemShape::UnderlyingProblemShape{}, 1), int32_t(1));
-    auto init_M = get<0>(init_shape);
-    auto init_N = get<1>(init_shape);
-    auto init_L = get<3>(init_shape);
-
-    static_assert(!is_im2col_C and !is_im2col_D, "Im2Col not supported on C or D");
-
-    InternalStrideC stride_c;
-    InternalStrideD stride_d;
-    if constexpr (IsGroupedGemmKernel) {
-      // Strides for Grouped Gemm will be replaced prior to the first access regardless.
-      stride_c = InternalStrideC{};
-      stride_d = InternalStrideD{};
-    } 
-    else {
-      // Tensor shapes for Ptr-Array are initialized correctly only here.
-      auto problem_shape_MNKL = append<4>(problem_shape.get_host_problem_shape(0), 1);
-      init_M = get<0>(problem_shape_MNKL);
-      init_N = get<1>(problem_shape_MNKL);
-      init_L = get<3>(problem_shape_MNKL);
-
-      stride_c = args.dC;
-      stride_d = args.dD;
-    }
-
-    uint32_t transaction_bytes = TmaTransactionBytes;
-    typename Params::TMA_C tma_load_c = {};
-    if constexpr (is_source_supported) {
-      ElementC const* ptr_C_first_batch = reinterpret_cast<ElementC const*>(args.ptr_C); 
-      Tensor tensor_c = make_tensor(ptr_C_first_batch, make_layout(make_shape(init_M,init_N,init_L), append<3>(stride_c, _0{})));
-      tma_load_c = make_tma_copy(
-          CopyOpG2S{},
-          tensor_c,
-          take<0,2>(SmemLayoutC{}),
-          EpilogueTile{},
-          _1{});
-    }
-
-    typename Params::TMA_D tma_store_d;
-    if constexpr (is_destination_supported) {
-      ElementD const* ptr_D_first_batch = reinterpret_cast<ElementD const*>(args.ptr_D);
-      Tensor tensor_d = make_tensor(ptr_D_first_batch, make_layout(make_shape(init_M,init_N,init_L), append<3>(stride_d, _0{})));
-      tma_store_d = make_tma_copy(
-          CopyOpS2G{},
-          tensor_d,
-          take<0,2>(SmemLayoutD{}),
-          EpilogueTile{},
-          _1{});
-    }
-
-    auto fusion_workspace = static_cast<char*>(workspace);
-    auto fusion_workspace_size = FusionCallbacks::get_workspace_size(problem_shape, args.thread);
-    auto tma_descriptor_workspace = reinterpret_cast<cute::TmaDescriptor*>(
-                                      static_cast<char*>(workspace) + fusion_workspace_size);
-
-    return {
-      FusionCallbacks::to_underlying_arguments(problem_shape, args.thread, fusion_workspace),
-      tma_load_c,
-      tma_store_d,
-      tma_descriptor_workspace,
-      args.ptr_C,
-      args.dC,
-      args.ptr_D,
-      args.dD,
-      transaction_bytes,
-    };
-  }
-
-  template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args, int sm_count) {
-    
-    constexpr uint32_t NumInputTensors = NumEpilogueWarpGroups + (cute::is_void_v<ElementC> ? 0 : 1);
-    auto descriptors_shape = cute::make_shape(sm_count, Int<NumInputTensors>{});
-    constexpr size_t SizeOfCuTensorMap = sizeof(cute::TmaDescriptor);
-
-    // Allocate gmem space for input tensormaps per each SM, A tensormap copies followed by B tensormap copies
-    return (size(descriptors_shape) * SizeOfCuTensorMap) + FusionCallbacks::get_workspace_size(problem_shape, args.thread);
-  }
-
-  template <class ProblemShape>
-  static cutlass::Status
-  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-    return FusionCallbacks::initialize_workspace(problem_shape, args.thread, workspace, stream, cuda_adapter);
-  }
-
-  template <class ProblemShape>
-  static bool
-  can_implement(
-      ProblemShape problem_shape,
-      [[maybe_unused]] Arguments const& args) {
-
-    bool implementable = true;
-    bool fusion_implementable = true;
-
-    if (problem_shape.is_host_problem_shape_available()) {
-      for (int i = 0; i < problem_shape.groups(); ++i) {
-        auto problem_shape_MNKL = append<4>(problem_shape.get_host_problem_shape(i), 1);
-        auto [M,N,K,L] = problem_shape_MNKL;
-
-        if constexpr (is_destination_supported) {
-          constexpr int tma_alignment_bits_D = cutlass::detail::get_output_alignment_bits<ElementD>();
-          constexpr int min_tma_aligned_elements_D = tma_alignment_bits_D / cutlass::sizeof_bits<ElementD>::value;
-          implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_D>(cute::make_shape(M,N,L), InternalStrideD{});
-        }
-
-        if constexpr (not cute::is_void_v<ElementC>) {
-          constexpr int tma_alignment_bits_C = cutlass::detail::get_input_alignment_bits<ElementC>();
-          constexpr int min_tma_aligned_elements_C = tma_alignment_bits_C / cutlass::sizeof_bits<ElementC>::value;
-          implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_C>(cute::make_shape(M,N,L), InternalStrideC{});
-        }
-
-        fusion_implementable = fusion_implementable && FusionCallbacks::can_implement(problem_shape_MNKL, args.thread);
-      }
-    }
-    else {
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Ignoring check to can implement because host problem shape is not available.\n");
-    }
-
-    if (!implementable) {
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
-    }
-
-    if (!fusion_implementable) {
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Problem Size doesn't meet the minimum requirements for FusionCallbacks.\n");
-    }
-
-    bool beta_implementable = true;
-
-    if constexpr (cute::is_void_v<ElementC>) {
-      if constexpr (detail::has_beta<Arguments>::value) {
-        beta_implementable = args.thread.beta == 0.0;
-      }
-      if constexpr (detail::has_beta_ptr<Arguments>::value) {
-        beta_implementable = beta_implementable && args.thread.beta_ptr == nullptr;
-      }
-    }
-
-    if (!beta_implementable) {
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Beta/beta pointer was set, but epilogue is sourceless (void-C).\n");
-    }
-
-    return implementable && fusion_implementable && beta_implementable;
-  }
-
-  template<class TileShapeMNK>
-  CUTLASS_HOST_DEVICE
-  static constexpr int
-  get_load_pipe_increment(TileShapeMNK tile_shape_MNK) {
-    // Compute number of epilogue subtiles
-    return size<1>(zipped_divide(make_layout(take<0,2>(tile_shape_MNK)), EpilogueTile{}));
-  }
-
-  template<class TileShapeMNK>
-  CUTLASS_HOST_DEVICE
-  static constexpr int
-  get_store_pipe_increment(TileShapeMNK tile_shape_MNK) {
-    return get_load_pipe_increment(tile_shape_MNK);
-  }
-
-  CUTLASS_HOST_DEVICE
-  CollectiveEpilogue(Params const& params_, TensorStorage& shared_tensors)
-      : params(params_), fusion_callbacks(params_.thread, shared_tensors.thread) {}
-
-  CUTLASS_DEVICE
-  bool
-  is_producer_load_needed() const {
-    return fusion_callbacks.is_producer_load_needed();
-  }
-
-  CUTLASS_DEVICE auto
-  load_init(
-      Params const& params,
-      TensorMapStorage& shared_tensormaps,
-      int32_t sm_count,
-      int32_t sm_idx) {
-    // Initialize tma for loading
-    constexpr bool IsLoad = true;
-    auto load_tensormaps = tensormaps_init<IsLoad>(params, shared_tensormaps, sm_count, sm_idx, 0);
-    return load_tensormaps;
-  }
-
-  template<
-    class ProblemShapeMNKL,
-    class TileShapeMNK,
-    class TileCoordMNKL,
-    class TiledMma,
-    class TensorMapC,
-    __CUTE_REQUIRES(std::is_pointer_v<TensorMapC>)
-  >
-  CUTLASS_DEVICE auto
-  load(
-      LoadPipeline load_pipeline,
-      LoadPipelineState load_pipe_producer_state,
-      ProblemShapeMNKL problem_shape_mnkl,
-      TileShapeMNK tile_shape_MNK,
-      TileCoordMNKL tile_coord_mnkl,
-      TiledMma tiled_mma,
-      int thread_idx,
-      TensorStorage& shared_tensors,
-      TensorMapC const& load_tensormap,
-      int subtile_idx=-1,
-      bool wait_until_load_finishes = false) {
-    using namespace cute;
-
-    // Indexing variables
-    auto [M, N, K, L] = problem_shape_mnkl;
-    auto [m_coord, n_coord, k_coord, l_coord] = tile_coord_mnkl;
-
-    static_assert(!is_im2col_D, "Do not support im2col");
-
-    auto coord_shape = append<3>(make_shape(m_coord, n_coord), Int<0>{});
-
-    // Represent the full source tensor, slice to get the tile this CTA is currently responsible for
-    Tensor mC_mn = params.tma_load_c.get_tma_tensor(append<3>(make_shape(M,N), Int<1>{}));             //       (M,N,L)
-    Tensor mC = coalesce(mC_mn, take<0,2>(CtaTileMNK{}));
-    Tensor gC = local_tile(mC, take<0,2>(CtaTileMNK{}), coord_shape);                                  // (CTA_M,CTA_N)
-
-    // Apply epilogue subtile, get matching smem tensor
-    auto ptr_sC = shared_tensors.collective.smem_C.begin();
-    Tensor gC_epi = flat_divide(gC, EpilogueTile{});                             // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N)
-    Tensor sC_epi = make_tensor(make_smem_ptr(ptr_sC), SmemLayoutC{});           //      (EPI_TILE_M,EPI_TILE_N,PIPE_C)
-
-    // Prepare the thread(b)lock's (G)mem to (S)mem TMA tiled copy (bGS_)
-    ThrCopy thrblk_g2s = params.tma_load_c.get_slice(Int<0>{});
-    Tensor bGS_gC = thrblk_g2s.partition_S(gC_epi);                                    // (G2S,G2S_M,G2S_N,EPI_M,EPI_N)
-    Tensor bGS_sC = thrblk_g2s.partition_D(sC_epi);                                    // (G2S,G2S_M,G2S_N,PIPE_C)
-
-    // Get the fusion callbacks for the producer load warp
-    auto pld_args = cutlass::epilogue::fusion::detail::ProducerLoadArgs{
-                      problem_shape_mnkl,
-                      CtaTileMNK{},
-                      tile_coord_mnkl,
-                      tiled_mma,
-                      EpilogueTile{},
-                      thread_idx
-                    };
-    auto pld_callbacks = fusion_callbacks.get_producer_load_callbacks(pld_args);
-    bool is_C_load_needed = is_source_supported && fusion_callbacks.is_C_load_needed();
-
-    LoadPipelineState last_load_producer_state = load_pipe_producer_state;
-
-    // Predication for TMA load (one thread issues TMA load)
-    bool issue_tma_load = cute::elect_one_sync();
-
-    // Pre-loop fusion callback entry point
-    pld_callbacks.begin();
-
-    LoadPipelineState prior_state = load_pipe_producer_state;
-
-    bool did_load = false;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int epi_n = 0; epi_n < size<3>(gC_epi); ++epi_n) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int epi_m = 0; epi_m < size<2>(gC_epi); ++epi_m) {
-        if (subtile_idx != -1 && (epi_n * static_cast<int>(size<2>(gC_epi)) + epi_m) != subtile_idx) {
-          continue;
-        }
-
-        // Acquire the lock for this stage
-        constexpr uint16_t mcast_mask = 0;
-        uint64_t* tma_barrier = load_pipeline.producer_get_barrier(load_pipe_producer_state);
-
-        load_pipeline.producer_acquire(load_pipe_producer_state);
-
-        // Loop fusion callback entry point
-        pld_callbacks.step(tma_barrier, epi_m, epi_n, load_pipe_producer_state.count(), issue_tma_load);
-
-        // Execute the TMA load for C if needed
-        if (is_C_load_needed) {
-          if (issue_tma_load) {
-            copy(params.tma_load_c.with(load_tensormap, *tma_barrier, mcast_mask),
-                bGS_gC(_,_,_,epi_m,epi_n), bGS_sC(_,_,_,load_pipe_producer_state.index()));
-            load_pipeline.producer_expect_transaction(load_pipe_producer_state);
-          }
-          last_load_producer_state = load_pipe_producer_state;
-          did_load = true;
-        }
-
-        // Commit TMA loads for this stage and release the lock
-        load_pipeline.producer_commit(load_pipe_producer_state);
-        ++load_pipe_producer_state;
-      }
-    }
-
-    // Post-loop fusion callback entry point
-    pld_callbacks.end();
-
-    if (wait_until_load_finishes && did_load) {
-      typename CollectiveEpilogue::LoadPipelineState epi_load_pipe_tma_consumer_state =
-        {last_load_producer_state.index(), !last_load_producer_state.phase(), last_load_producer_state.count()};
-      load_pipeline.consumer_wait(epi_load_pipe_tma_consumer_state);
-    }
-
-    return load_pipe_producer_state;
-  }
-
-  CUTLASS_DEVICE auto
-  load_tail(
-      LoadPipeline load_pipeline,
-      LoadPipelineState load_pipe_producer_state) {
-
-    if (!fusion_callbacks.is_producer_load_needed()) {
-      return load_pipe_producer_state; 
-    }
-
-    bool issue_tma_load = cute::elect_one_sync();
-    if (issue_tma_load) {
-      load_pipeline.producer_tail(load_pipe_producer_state);
-    }
-
-    return load_pipe_producer_state;
-  }
-
-  template<
-    class ProblemShapeMNKL,
-    class TileShapeMNK,
-    class TileCoordMNKL,
-    class AccEngine, class AccLayout,
-    class TiledMma,
-    class TensorMapD
-  >
-  CUTLASS_DEVICE auto
-  store(
-      LoadPipeline load_pipeline,
-      LoadPipelineState load_pipe_consumer_state,
-      StorePipeline store_pipeline,
-      StorePipelineState store_pipe_producer_state,
-      ProblemShapeMNKL problem_shape_mnkl,
-      TileShapeMNK tile_shape_MNK,
-      TileCoordMNKL tile_coord_mnkl,
-      cute::Tensor<AccEngine,AccLayout> accumulators,
-      TiledMma tiled_mma,
-      int thread_idx,
-      TensorStorage& shared_tensors,
-      TensorMapD const& store_tensormap,
-      int subtile_idx=-1) {
-
-    using namespace cute;
-    using ElementAccumulator = typename AccEngine::value_type;
-    using ElementCompute_ = typename epilogue::fusion::FusionCallbacksTraits<FusionCallbacks>::ElementCompute;
-    using ElementCompute = cute::conditional_t<cute::is_void_v<ElementCompute_>,ElementAccumulator,ElementCompute_>;
-
-    static_assert(is_rmem<AccEngine>::value, "Accumulator must be RF resident.");
-    static_assert(rank(AccLayout{}) == 3, "Accumulator must be MMA-partitioned: (MMA,MMA_M,MMA_N)");
-    static_assert(rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4");
-    static_assert(is_static<TileShapeMNK>::value, "TileShapeMNK must be static");
-    static_assert(rank(TileShapeMNK{}) == 3, "TileShapeMNK must be rank 3");
-    static_assert(rank(TileCoordMNKL{}) == 4, "TileCoordMNKL must be rank 4");
-
-    // Indexing variables
-    auto [M, N, K, L] = problem_shape_mnkl;
-    auto [m_coord, n_coord, k_coord, l_coord] = tile_coord_mnkl;
-
-
-    static_assert(!is_im2col_D, "Do not support im2col");
-
-    auto coord_shape = append<3>(make_shape(m_coord, n_coord), Int<0>{});
-
-    // Represent the full output tensor, slice to get the tile this CTA is responsible for
-    Tensor mD_mn = params.tma_store_d.get_tma_tensor(append<3>(make_shape(M,N), Int<1>{}));            //       (M,N,L)
-
-    Tensor mD = coalesce(mD_mn, take<0,2>(CtaTileMNK{}));
-    Tensor gD = local_tile(mD, take<0,2>(CtaTileMNK{}), coord_shape);                                  // (CTA_M,CTA_N)
-
-    // Apply epilogue subtiling
-    Tensor gD_epi = flat_divide(gD, EpilogueTile{});                             // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N)
-
-    // Construct the corresponding pipelined smem tensors
-    auto ptr_sC = shared_tensors.collective.smem_C.begin();
-    auto ptr_sD = shared_tensors.collective.smem_D.begin();
-    Tensor sC_epi = cute::as_position_independent_swizzle_tensor(
-                      make_tensor(make_smem_ptr(ptr_sC), SmemLayoutC{}));             // (EPI_TILE_M,EPI_TILE_N,PIPE_C)
-    Tensor sD_epi = cute::as_position_independent_swizzle_tensor(
-                      make_tensor(make_smem_ptr(ptr_sD), SmemLayoutD{}));             // (EPI_TILE_M,EPI_TILE_N,PIPE_D)
-
-    TiledCopy tiled_copy_C_atom = make_tiled_copy_C_atom(CopyAtomC{}, tiled_mma);
-
-    // (t)hread-partition for (r)egister to (r)egister copy (tRR_)
-    TiledCopy tiled_r2r = [&]() {
-      if constexpr (IsUseR2R) {
-        return make_tiled_copy_S(Copy_Atom<CopyOpR2R, ElementCompute>{}, tiled_copy_C_atom);
-      }
-      else {
-        return make_tiled_copy_S(Copy_Atom<AutoVectorizingCopyWithAssumedAlignment<128>,
-          ElementCompute>{}, tiled_copy_C_atom);
-      }
-    }();
-    ThrCopy thread_r2r = tiled_r2r.get_slice(thread_idx);
-
-    // (t)hread-partition for (r)egister to (s)mem copy (tRS_)
-    TiledCopy tiled_r2s = [&]() {
-      if constexpr (IsUseR2R) {
-        return make_tiled_copy_D(Copy_Atom<CopyOpR2S,SmemElementD>{}, tiled_r2r);
-      }
-      else {
-        return make_tiled_copy_S(Copy_Atom<CopyOpR2S,SmemElementD>{}, tiled_copy_C_atom);
-      }
-    }();
-    ThrCopy thread_r2s = tiled_r2s.get_slice(thread_idx);
-    Tensor tRS_rAcc = thread_r2s.retile_S(accumulators);                                   // ((R2S,R2S_V),MMA_M,MMA_N)
-    Tensor tRS_sD   = thread_r2s.partition_D(sD_epi);                                       // (R2S,R2S_M,R2S_N,PIPE_D)
-
-    auto mma_tile_m = size<0>(TileShapeMNK{}) / size<1>(tRS_rAcc);
-    auto mma_tile_n = size<1>(TileShapeMNK{}) / size<2>(tRS_rAcc);
-    auto epi_tile_m = size<0>(EpilogueTile{});
-    auto epi_tile_n = size<1>(EpilogueTile{});
-
-    // Allocate D registers
-    Layout tRS_rD_layout = make_layout(take<0,3>(shape(thread_r2s.partition_S(sD_epi))));
-    Tensor tRS_rD = make_tensor<SmemElementD>(tRS_rD_layout);                                          // (R2S,R2S_M,R2S_N)
-
-    // Vectorized fragment view
-    constexpr int FragmentSize = DispatchPolicy::FragmentSize;
-    Tensor tRS_rAcc_frg = recast<Array<ElementAccumulator, FragmentSize>>(tRS_rAcc);
-    Tensor tRS_rD_frg   = recast<Array<SmemElementD      , FragmentSize>>(tRS_rD);
-    CUTE_STATIC_ASSERT(size<0>(tRS_rAcc) % FragmentSize == 0, "Fragment size does not vectorize properly");
-
-    // (t)hread-partition for (s)mem to (r)egister copy (tSR_)
-    TiledCopy tiled_s2r = make_tiled_copy_S(Copy_Atom<CopyOpS2R, SmemElementC>{}, tiled_copy_C_atom);
-    ThrCopy thread_s2r = tiled_s2r.get_slice(thread_idx);
-    Tensor tSR_sC        = thread_s2r.partition_S(sC_epi);                                  // (S2R,S2R_M,S2R_N,PIPE_C)
-    Layout tSR_rC_layout = thread_s2r.retile_D(tRS_rD).layout();                            // (S2R,S2R_M,S2R_N)
-
-    // Allocate C registers
-    // If C smem load is a non-vectorized dst(i) = src(i) then we can allocate C registers directly in the compute type
-    // to eliminate some redundant pack+unpack instruction sequences for sub-word types
-    constexpr bool IsDirectS2R = cute::is_same_v<CopyOpS2R, AutoVectorizingCopyWithAssumedAlignment<128>>
-                                && decltype(max_common_vector(tSR_rC_layout, tSR_sC.layout()))::value <= 1;
-    using RegisterElementC = cute::conditional_t<IsDirectS2R, ElementCompute, SmemElementC>;
-    Tensor tRS_rC = make_tensor<RegisterElementC>(tRS_rD_layout);                                  // (R2S,R2S_M,R2S_N)
-    Tensor tSR_rC = thread_s2r.retile_D(tRS_rC);                                                   // (S2R,S2R_M,S2R_N)
-
-    // thread(b)lock-partition for (s)mem to (g)mem copy (bSG_)
-    ThrCopy thrblk_s2g = params.tma_store_d.get_slice(Int<0>{});
-    Tensor bSG_sD = thrblk_s2g.partition_S(sD_epi);                                    // (S2G,S2G_M,S2G_N,PIPE_D)
-    Tensor bSG_gD = thrblk_s2g.partition_D(gD_epi);                                    // (S2G,S2G_M,S2G_N,EPI_M,EPI_N)
-
-    // OOB predication for tile quantization "residue"
-    // Absolute coordinate tensors (dynamic)
-    Tensor mD_crd = make_identity_tensor(make_shape(M,N));                                                     // (M,N)
-    Tensor cD_mn = local_tile(mD_crd, take<0,2>(CtaTileMNK{}), make_coord(m_coord, n_coord));          // (CTA_M,CTA_N)
-    Tensor tRS_cD_mn = thread_r2s.partition_S(flat_divide(cD_mn, EpilogueTile{}));     // (R2S,R2S_M,R2S_N,EPI_M,EPI_N)
-    // Relative coordinate tensors (static)
-    Tensor cD = make_counting_tensor(cD_mn.layout());                                                  // (CTA_M,CTA_N)
-    Tensor tRS_cD = make_counting_tensor(tRS_cD_mn.layout());                          // (R2S,R2S_M,R2S_N,EPI_M,EPI_N)
-    // Subtract the global "bottom right" corner from the local "top left" corner to get the max relative coordinate
-    auto residue_cD = make_coord(M,N) - cD_mn(_0{});                                                           // (m,n)
-    auto residue_tRS_cD = make_coord(M,N) - tRS_cD_mn(_0{});                                                   // (m,n)
-
-    CUTE_STATIC_ASSERT(epi_tile_m % mma_tile_m == 0, "MMA_TILE_M must divide EPI_TILE_M");
-
-    CUTE_STATIC_ASSERT(mma_tile_n % epi_tile_n == 0, "EPI_TILE_N must divide MMA_TILE_N");
-    // Get TiledCopy for partition reference when consumer store.
-    TiledCopy tiled_copy_partition_ref = make_tiled_copy_S(Copy_Atom<CopyOpR2S,SmemElementD>{}, tiled_copy_C_atom);
-    // Get the fusion callbacks for the consumer store warps
-    constexpr bool RefSrc = true; // Register tensors reference R2S copy src layout
-    auto cst_args = cutlass::epilogue::fusion::detail::ConsumerStoreArgs{
-                      problem_shape_mnkl,
-                      CtaTileMNK{},
-                      tile_coord_mnkl,
-                      tiled_mma,
-                      EpilogueTile{},
-                      tiled_copy_partition_ref,
-                      cD,
-                      residue_cD,
-                      tRS_cD,
-                      residue_tRS_cD,
-                      tRS_rC,
-                      thread_idx
-                    };
-    auto cst_callbacks = fusion_callbacks.get_consumer_store_callbacks<RefSrc>(cst_args);
-    bool is_producer_load_needed = fusion_callbacks.is_producer_load_needed();
-    bool is_C_load_needed = is_source_supported && fusion_callbacks.is_C_load_needed();
-
-    // Thread synchronizer for previously issued waits or fences
-    // to ensure visibility of smem reads/writes to threads or TMA unit
-    auto synchronize = [&] () { cutlass::arch::NamedBarrier::sync(size(TiledMma{}), cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); };
-
-    // Predication for TMA store (one warp issues TMA store)
-    bool issue_tma_store = (thread_idx / NumThreadsPerWarp) == 0;
-
-    // In the reuse smem configuration we have StagesC smem buffers and at most StagesD committed TMA stores in flight.
-    // The TMA store pipeline producer acquire returns when at most StagesD-1 committed stores are in-flight, so we can
-    // only guarantee store completion after StagesD iterations, then we can begin issuing releases on the smem buffer locks.
-    // store_pipe_producer_state tracks the acquire and load_pipe_consumer_state tracks the release, in circular buffer fashion.
-    LoadPipelineState load_wait_state = load_pipe_consumer_state;
-    if constexpr (ReuseSmemC) {
-      load_wait_state = store_pipe_producer_state;
-      load_wait_state.phase_ ^= 1;
-    }
-
-    // We can delay issue of TMA store by one iteration to achieve better interleaving of non-TMA instructions
-    // Sync requirements of smem reuse may preclude this optimization
-    // Delayed stores cause delayed stage releases which causes deadlock when StagesC == StagesD
-    int epi_m_prev = 0, epi_n_prev = 0;
-    static_assert(not (DelayTmaStore and ReuseSmemC and StagesC <= StagesD), "This TMA epilogue configuration will deadlock");
-
-    // The TMA store sequence for one subtile iteration
-    auto tma_store_fn = [&] (int epi_m, int epi_n) {
-      // Write the tile from smem to gmem with TMA
-      cutlass::arch::fence_view_async_shared(); // ensure smem writes are visible to TMA
-      synchronize(); // ensure all threads have issued their async fence
-      if constexpr (is_destination_supported) {
-        if (issue_tma_store) {
-          copy(params.tma_store_d.with(store_tensormap), bSG_sD(_,_,_,store_pipe_producer_state.index()), bSG_gD(_,_,_,epi_m,epi_n));
-        }
-      }
-
-      // Post async fence, pre TMA commit callback entry point
-      cst_callbacks.tma_store(epi_m, epi_n, store_pipe_producer_state.count(), issue_tma_store);
-
-      // Commit the TMA stores for this stage
-      if (issue_tma_store) {
-        store_pipeline.producer_commit(store_pipe_producer_state);
-      }
-      ++store_pipe_producer_state;
-      ++issued_stores;
-
-      // Wait for the next smem buffer to be available
-      if (issue_tma_store) {
-        store_pipeline.producer_acquire(store_pipe_producer_state);
-      }
-      synchronize();
-
-      if constexpr (ReuseSmemC) {
-        // producer_acquire returns when at most StagesD-1 committed stores are pending
-        bool store_finished = issued_stores > StorePipeline::UnacquiredStages;
-        // Let dma warp know earliest smem buffer is consumed and empty after StagesD producer commits
-        if (store_finished) {
-          if (is_producer_load_needed) {
-            load_pipeline.consumer_release(load_pipe_consumer_state);
-          }
-          ++load_pipe_consumer_state;
-        }
-      }
-    };
-
-    //
-    // BEGIN EPILOGUE
-    //
-
-    // Pre-loop fusion callback entry point
-    cst_callbacks.begin();
-
-    // For each output tile
-    CUTLASS_PRAGMA_UNROLL
-    for (int epi_n = 0; epi_n < size<3>(gD_epi); ++epi_n) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int epi_m = 0; epi_m < size<2>(gD_epi); ++epi_m) {
-        bool is_first_iteration = epi_m == 0 && epi_n == 0;
-        bool is_last_iteration = epi_m == size<2>(gD_epi)-1 && epi_n == size<3>(gD_epi)-1;
-
-        if (subtile_idx != -1 && (epi_n * static_cast<int>(size<2>(gD_epi)) + epi_m) != subtile_idx) {
-          continue;
-        }
-
-        cst_callbacks.begin_loop(epi_m, epi_n);
-
-        if (is_producer_load_needed) {
-          // Wait for the producer load to fill smem
-          load_pipeline.consumer_wait(load_wait_state);
-
-          if (is_C_load_needed) {
-            // Copy source tile from smem to register
-            copy(tiled_s2r, tSR_sC(_,_,_,load_wait_state.index()), tSR_rC);
-          }
-        }
-
-        // First loop fusion callback entry point
-        cst_callbacks.previsit(epi_m, epi_n, load_wait_state.count(), is_producer_load_needed);
-
-        if (is_producer_load_needed) {
-          if constexpr (not ReuseSmemC) {
-            // Let producer load warp know smem buffers are consumed and empty
-            cutlass::arch::fence_view_async_shared();
-            load_pipeline.consumer_release(load_pipe_consumer_state);
-            ++load_pipe_consumer_state;
-          }
-          ++load_wait_state;
-        }
-
-        int mma_m = epi_m;
-        int mma_n = (epi_n * size<1>(EpilogueTile{})) / mma_tile_n;
-        Tensor tRS_rAcc_frg_mn = tRS_rAcc_frg(_,mma_m,mma_n);
-
-        // Vectorized fragment loop with visitor callback entry point
-        int epi_n_in_mma = epi_n % (mma_tile_n / epi_tile_n);
-        int r2s_v = epi_n_in_mma * size(tRS_rD_frg);
-        CUTLASS_PRAGMA_UNROLL
-        for (int epi_v = 0; epi_v < size(tRS_rD_frg); ++epi_v) {
-          tRS_rD_frg(epi_v) = cst_callbacks.visit(tRS_rAcc_frg_mn(r2s_v + epi_v), epi_v, epi_m, epi_n);
-        }
-        // The latest we can delay the TMA store is right before the smem store of the next iteration
-        // since the current TMA store needs to be committed before we can acquire the next smem buffer
-        if constexpr (DelayTmaStore) {
-          // Issue TMA stores for the previous subtile
-          if (not is_first_iteration and subtile_idx == -1) {
-            tma_store_fn(epi_m_prev, epi_n_prev);
-          }
-          epi_m_prev = epi_m;
-          epi_n_prev = epi_n;
-        }
-
-        // Smem reduction callback entry point using current store buffer for workspace
-        cst_callbacks.reduce(sD_epi(_,_,store_pipe_producer_state.index()),
-                              synchronize, epi_m, epi_n, is_last_iteration, tRS_rD_frg);
-
-        // Copy tile from register to regiser if needed
-        if constexpr (IsUseR2R) {
-          // retile source and destination for tiled_r2r
-          Tensor tRR_rD_src = thread_r2r.retile_S(tRS_rD);                             // (R2R,R2R_M,R2R_N,EPI_M,EPI_N)
-          Tensor tRR_rD_dst = thread_r2r.retile_D(tRS_rD);                             // (R2R,R2R_M,R2R_N,EPI_M,EPI_N)
-
-          // Output needs register shuffling before copying to shared memory.
-          copy(tiled_r2r, tRR_rD_src, tRR_rD_dst);
-        }
-
-        // Copy tile from register to smem
-        if constexpr (is_destination_supported) {
-          copy(tiled_r2s, tRS_rD, tRS_sD(_,_,_,store_pipe_producer_state.index()));
-        }
-
-        // Post reduction, pre TMA store callback entry point
-        constexpr bool issue_smem_store = true; // No smem store predication
-        cst_callbacks.postreduce(epi_m, epi_n, store_pipe_producer_state.count(), issue_smem_store);
-
-        if constexpr (not DelayTmaStore) {
-          // Issue TMA stores for this subtile
-          tma_store_fn(epi_m, epi_n);
-        }
-
-        cst_callbacks.end_loop(epi_m, epi_n);
-
-      } // for epi_m
-    } // for epi_n
-
-
-    if constexpr (DelayTmaStore) {
-      // Issue TMA stores for the last subtile
-      tma_store_fn(epi_m_prev, epi_n_prev);
-    }
-
-    // Post-loop fusion callback entry point
-    cst_callbacks.end();
-
-    return cute::make_tuple(load_pipe_consumer_state, store_pipe_producer_state);
-  }
-
-  CUTLASS_DEVICE auto
-  store_tail(
-      LoadPipeline load_pipeline,
-      LoadPipelineState load_pipe_consumer_state,
-      StorePipeline store_pipeline,
-      StorePipelineState store_pipe_producer_state) {
-    // wait for all TMA stores to complete
-    store_pipeline.producer_tail(store_pipe_producer_state);
-    // reset store counter
-    issued_stores = 0;
-
-    if constexpr (ReuseSmemC) {
-      if (fusion_callbacks.is_producer_load_needed()) {
-        // Issue releases on up to StagesD-1 previously issued TMA stores
-        constexpr int release_stages = cute::min(StorePipeline::UnacquiredStages, get_load_pipe_increment(CtaTileMNK{}));
-        CUTLASS_PRAGMA_UNROLL
-        for (int stage = 0; stage < release_stages; ++stage) {
-          load_pipeline.consumer_release(load_pipe_consumer_state);
-          ++load_pipe_consumer_state;
-        }
-      }
-    }
-
-    return cute::make_tuple(load_pipe_consumer_state, store_pipe_producer_state);
-  }
-
-  CUTLASS_DEVICE auto
-  store_init(
-      Params const& params,
-      TensorMapStorage& shared_tensormaps,
-      int32_t sm_count,
-      int32_t sm_idx,
-      int32_t warp_group_idx) {
-    int warp_idx_in_warp_group = canonical_warp_idx_sync() % NumWarpsPerWarpGroup;
-    // Since only one warp issues TMA store, we only need that one warp to initialize tensormaps
-    if (warp_idx_in_warp_group == 0) {
-      // Initialize tma
-      constexpr bool IsLoad = false;
-      auto store_tensormaps = tensormaps_init<IsLoad>(params, shared_tensormaps, sm_count, sm_idx, warp_group_idx);
-      return store_tensormaps;
-    }
-    TmaDescriptor* null_tma_desc = nullptr;
-    return cute::make_tuple(null_tma_desc);
-  }
-
-  //
-  // Methods to perform different parts of TMA/Tensormap modifications
-  //
-
-  template <bool IsLoad>
-  CUTLASS_DEVICE auto
-  tensormaps_init(
-      Params const& params,
-      TensorMapStorage& shared_tensormaps,
-      int32_t sm_count,
-      int32_t sm_idx,
-      int32_t warp_group_idx) {
-
-    constexpr uint32_t NumInputTensors = NumEpilogueWarpGroups + (cute::is_void_v<ElementC> ? 0 : 1);
-    Layout desc_layout = make_layout(make_shape(sm_count, Int<NumInputTensors>{}));
-
-    Tensor gmem_tensormap = make_tensor(params.tensormaps, desc_layout);                      // (SMs, NumInputTensors)
-
-    if constexpr (IsLoad) {
-      if (not cute::is_void_v<ElementC>) {
-        constexpr int C_tensormap_index = NumEpilogueWarpGroups;
-        Tensor pC_tensormap = make_tensor(params.tma_load_c.get_tma_descriptor(), Int<1>{}, Int<1>{});
-        Tensor sC_tensormap = make_tensor(make_smem_ptr(&shared_tensormaps.smem_tensormap_C), Int<1>{}, Int<1>{});
-
-        if (cute::elect_one_sync()) {
-          // Bringing tensormaps from params to smem for modification later
-          copy(recast<uint128_t>(pC_tensormap), recast<uint128_t>(sC_tensormap));
-        }
-        __syncwarp();
-        return cute::make_tuple(&gmem_tensormap(sm_idx, C_tensormap_index));
-
-      }
-      TmaDescriptor* null_tma_desc = nullptr;
-      return cute::make_tuple(null_tma_desc);
-    }
-    else {
-      Tensor pD_tensormap = make_tensor(params.tma_store_d.get_tma_descriptor(), Int<1>{}, Int<1>{});
-      Tensor sD_tensormap = make_tensor(make_smem_ptr(&shared_tensormaps.smem_tensormap_D[warp_group_idx]), Int<1>{}, Int<1>{});
-
-      if (cute::elect_one_sync()) {
-        // Bringing tensormaps from params to smem for modification later
-        copy(recast<uint128_t>(pD_tensormap), recast<uint128_t>(sD_tensormap));
-      }
-      __syncwarp();
-      return cute::make_tuple(&gmem_tensormap(sm_idx, warp_group_idx));
-    }
-  }
-
-  // Replace address for the global tensor (to be done by single thread)
-  template <bool IsLoad>
-  CUTLASS_DEVICE
-  void
-  tensormaps_replace_global_address(
-      TensorMapStorage& shared_tensormaps,
-      Params const& params,
-      int32_t next_batch,
-      int32_t warp_group_idx) {
-    // Replacing global_address for the next batch
-    if constexpr (IsLoad) {
-      if constexpr (is_source_supported) {
-        cute::tma_descriptor_replace_addr_in_shared_mem(shared_tensormaps.smem_tensormap_C,
-                                                        params.ptr_C[next_batch]);
-      }
-    }
-    else if constexpr (is_destination_supported) {
-      cute::tma_descriptor_replace_addr_in_shared_mem(shared_tensormaps.smem_tensormap_D[warp_group_idx],
-                                                      params.ptr_D[next_batch]);
-    }
-  }
-
-  // Replace dim and strides for the global tensor - used only for Grouped GEMM (to be done by single thread)
-  template <bool IsLoad, class ProblemShape_MNKL>
-  CUTLASS_DEVICE
-  void
-  tensormaps_replace_global_tensor_properties(
-      TensorMapStorage& shared_tensormaps,
-      Params const& params,
-      int32_t next_group,
-      ProblemShape_MNKL problem_shape_mnkl,
-      int32_t warp_group_idx) {
-    const uint32_t M = get<0>(problem_shape_mnkl);
-    const uint32_t N = get<1>(problem_shape_mnkl);
-    // Replace all dims for consistency
-    constexpr int MaxTensorRank = 5;
-    cute::array<uint32_t, MaxTensorRank> prob_shape  = {1,1,1,1,1};
-    cute::array<uint64_t, MaxTensorRank> prob_stride = {0,0,0,0,0};
-
-    if constexpr (IsLoad) {
-      if constexpr (is_source_supported) {
-        ElementC const* ptr_C = nullptr;
-        Tensor tensor_c = make_tensor(ptr_C, make_layout(make_shape(M,N,Int<1>{}), params.dC[next_group]));
-
-        cute::detail::fill_tma_gmem_shape_stride(params.tma_load_c, tensor_c, 
-                                                 prob_shape, prob_stride);
-        // Convert strides to byte strides
-        for (uint64_t& stride : prob_stride) {
-          stride = (stride * sizeof_bits_v<ElementC>) / 8;
-        }
-        cute::tma_descriptor_replace_dims_strides_in_shared_mem(shared_tensormaps.smem_tensormap_C,
-                                                                prob_shape,
-                                                                prob_stride);
-      }
-    }
-    else if constexpr (is_destination_supported) {
-      ElementD const* ptr_D = nullptr;
-      Tensor tensor_d = make_tensor(ptr_D, make_layout(make_shape(M,N,Int<1>{}), params.dD[next_group]));
-
-      cute::detail::fill_tma_gmem_shape_stride(params.tma_store_d, tensor_d, 
-                                               prob_shape, prob_stride);
-      // Convert strides to byte strides
-      for (uint64_t& stride : prob_stride) {
-        stride = (stride * sizeof_bits_v<ElementD>) / 8;
-      }
-
-      cute::tma_descriptor_replace_dims_strides_in_shared_mem(shared_tensormaps.smem_tensormap_D[warp_group_idx],
-                                                              prob_shape,
-                                                              prob_stride);
-    }
-  }
-
-  template <bool IsLoad, class ProblemShape_MNKL>
-  CUTLASS_DEVICE
-  void
-  tensormaps_perform_update(
-      TensorMapStorage& shared_tensormaps,
-      Params const& params,
-      cute::TmaDescriptor const* tensormap,
-      ProblemShape_MNKL problem_shape_mnkl,
-      int32_t next_batch,
-      int32_t warp_group_idx) {
-
-    if (cute::elect_one_sync()) {
-      // Replacing global_address for the next batch
-      tensormaps_replace_global_address<IsLoad>(shared_tensormaps, params, next_batch, warp_group_idx);
-
-      if constexpr (IsGroupedGemmKernel) {
-        // Replacing global dims and strides for the next batch
-        tensormaps_replace_global_tensor_properties<IsLoad>(
-            shared_tensormaps, params, next_batch, problem_shape_mnkl, warp_group_idx);
-      }
-
-    }
-  }
-
-  template <bool IsLoad>
-  CUTLASS_DEVICE
-  void
-  tensormaps_cp_fence_release(
-      TensorMapStorage& shared_tensormaps,
-      cute::TmaDescriptor const* tensormap,
-      const int32_t warp_group_idx = 0) {
-
-    // Entire warp must do this (ie its aligned)
-    if constexpr (IsLoad) {
-      if constexpr (is_source_supported) {
-        tma_descriptor_cp_fence_release(tensormap, shared_tensormaps.smem_tensormap_C);
-      }
-    }
-    else if constexpr (is_destination_supported) {
-      tma_descriptor_cp_fence_release(tensormap, shared_tensormaps.smem_tensormap_D[warp_group_idx]);
-    }
-  }
-
-  template <bool IsLoad>
-  CUTLASS_DEVICE
-  void
-  tensormaps_fence_acquire(cute::TmaDescriptor const* tensormap) {
-    if constexpr (IsLoad) {
-      if constexpr (not cute::is_void_v<ElementC>) {
-        cute::tma_descriptor_fence_acquire(tensormap);
-      }
-    } 
-    else {
-      cute::tma_descriptor_fence_acquire(tensormap);
-    }
-  }
-
-private:
-  Params const& params;
-  FusionCallbacks fusion_callbacks;
-  int issued_stores = 0;
-};
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace collective
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp
deleted file mode 100755
index b96c4aea0..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp
+++ /dev/null
@@ -1,904 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Functor performing elementwise operations used by epilogues.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/arch/barrier.h"
-#include "cutlass/epilogue/dispatch_policy.hpp"
-#include "cutlass/epilogue/collective/detail.hpp"
-#include "cutlass/epilogue/thread/scale_type.h"
-#include "cutlass/epilogue/fusion/callbacks.hpp"
-#include "cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp"
-#include "cutlass/detail/collective.hpp"
-#include "cutlass/detail/layout.hpp"
-#include "cutlass/trace.h"
-
-#include "cute/tensor.hpp"
-#include "cutlass/cuda_host_adapter.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace collective {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  int StagesC_,
-  int StagesD_,
-  int FragmentSize_,
-  bool ReuseSmemC_,
-  bool DelayTmaStore_,
-  class CtaTileMNK_,   //     (CTA_M,CTA_N,CTA_K)
-  class EpilogueTile_, // (EPI_TILE_M,EPI_TILE_N)
-  class ElementC_,
-  class StrideC_,
-  class ElementD_,
-  class StrideD_,
-  class FusionCallbacks_,
-  class CopyOpG2S_,
-  class SmemLayoutAtomC_,
-  class CopyOpS2R_,
-  class CopyOpS2G_,
-  class SmemLayoutAtomD_,
-  class CopyOpR2S_,
-  class CopyAtomC_,
-  class CopyOpR2R_
->
-class CollectiveEpilogue<
-    Sm90TmaWarpSpecialized<StagesC_,StagesD_,FragmentSize_,ReuseSmemC_,DelayTmaStore_>,
-    CtaTileMNK_,
-    EpilogueTile_,
-    ElementC_,
-    StrideC_,
-    ElementD_,
-    StrideD_,
-    FusionCallbacks_,
-    CopyOpG2S_,
-    SmemLayoutAtomC_,
-    CopyOpS2R_,
-    CopyOpS2G_,
-    SmemLayoutAtomD_,
-    CopyOpR2S_,
-    CopyAtomC_,
-    CopyOpR2R_,
-> {
-public:
-  //
-  // Type Aliases
-  //
-  using DispatchPolicy = Sm90TmaWarpSpecialized<StagesC_,StagesD_,FragmentSize_,ReuseSmemC_,DelayTmaStore_>;
-  using CtaTileMNK = CtaTileMNK_;
-  using EpilogueTile = EpilogueTile_;
-  using FusionCallbacks = FusionCallbacks_;
-  using ElementC = ElementC_;
-  using StrideC = StrideC_;
-  using ElementD = ElementD_;
-  using StrideD = StrideD_;
-  using CopyOpG2S = CopyOpG2S_;
-  using SmemLayoutAtomC = SmemLayoutAtomC_;
-  using CopyOpS2R = CopyOpS2R_;
-  using CopyOpS2G = CopyOpS2G_;
-  using SmemLayoutAtomD = SmemLayoutAtomD_;
-  using CopyOpR2S = CopyOpR2S_;
-  using CopyAtomC = CopyAtomC_;
-  using CopyOpR2R = CopyOpR2R_;
-
-  using ThreadEpilogueOp = typename epilogue::fusion::FusionCallbacksTraits<FusionCallbacks>::Operation;
-  using GmemTiledCopyC = CopyOpG2S;
-  using GmemTiledCopyD = CopyOpS2G;
-
-  static_assert(!is_layout<EpilogueTile>::value && is_tuple<EpilogueTile>::value, "EpilogueTile must be a cute::Tile or cute::Shape");
-  static_assert(cute::rank(CtaTileMNK{}) == 3, "CtaTileMNK must be rank-3: [CTA_M, CTA_N, CTA_K]");
-  static_assert(cute::rank(EpilogueTile{}) == 2, "EpilogueTile must be rank-2: [EPI_TILE_M, EPI_TILE_N]");
-  static_assert(size<0>(CtaTileMNK{}) % size<0>(shape(EpilogueTile{})) == 0, "EPI_TILE_M must divide CTA_M");
-  static_assert(size<1>(CtaTileMNK{}) % size<1>(shape(EpilogueTile{})) == 0, "EPI_TILE_N must divide CTA_N");
-  static_assert(cute::rank(StrideC{}) == 3, "StrideC must be rank-3: [M, N, L]");
-  static_assert(cute::rank(StrideD{}) == 3, "StrideD must be rank-3: [M, N, L]");
-
-private:
-  constexpr static bool is_source_supported = not cute::is_void_v<ElementC>;
-  constexpr static bool is_destination_supported = not cute::is_void_v<ElementD>;
-  using NonVoidElementD = cute::conditional_t<not is_destination_supported,fusion::get_element_aux_t<FusionCallbacks>, ElementD>;
-  static_assert(not cute::is_void_v<NonVoidElementD>, "SmemElementD is void");
-  using NonVoidElementC = cute::conditional_t<not is_source_supported,NonVoidElementD,ElementC>; // prevents void ref breakages
-
-  using SmemElementC = typename cutlass::detail::get_unpacked_element_type<NonVoidElementC>::type;
-  using SmemElementD = typename cutlass::detail::get_unpacked_element_type<NonVoidElementD>::type;
-
-  constexpr static int StagesC = StagesC_;
-  constexpr static int StagesD = StagesD_;
-  constexpr static bool ReuseSmemC = ReuseSmemC_ and is_destination_supported;
-  constexpr static bool DelayTmaStore = DelayTmaStore_;
-
-  constexpr static bool is_m_major_C = detail::is_m_major<StrideC>();
-  constexpr static bool is_m_major_D = detail::is_m_major<StrideD>();
-
-  constexpr static bool is_im2col_C = cute::is_same_v<CopyOpG2S, SM90_TMA_LOAD_IM2COL>;
-  constexpr static bool is_im2col_D = cute::is_same_v<CopyOpS2G, SM90_TMA_STORE_IM2COL>;
-
-  // Check if register transformation is needed before copying register to shared memory.
-  constexpr static bool IsUseR2R = !cute::is_void_v<CopyOpR2R>;
-
-  using SmemLayoutC = decltype(tile_to_shape(
-      SmemLayoutAtomC{},
-      make_shape(size<0>(EpilogueTile{}), size<1>(EpilogueTile{}), Int<StagesC>{}),
-      cute::conditional_t<is_m_major_C, Step<_2,_1,_3>, Step<_1,_2,_3>>{} ));
-  using SmemLayoutD = decltype(tile_to_shape(
-      SmemLayoutAtomD{},
-      make_shape(size<0>(EpilogueTile{}), size<1>(EpilogueTile{}), Int<ReuseSmemC ? StagesC : StagesD>{}),
-      cute::conditional_t<is_m_major_D, Step<_2,_1,_3>, Step<_1,_2,_3>>{} ));
-
-  constexpr static bool support_smem_reuse = is_source_supported && is_destination_supported && StagesD <= StagesC
-                                            && cosize(take<0,2>(SmemLayoutC{})) == cosize(take<0,2>(SmemLayoutD{}));
-  static_assert(not (ReuseSmemC && not support_smem_reuse), "Smem reuse requirements not met");
-
-  constexpr static size_t SmemAlignmentD = cutlass::detail::alignment_for_swizzle(SmemLayoutD{});
-  constexpr static size_t SmemAlignmentC = cutlass::detail::alignment_for_swizzle(SmemLayoutC{});
-  constexpr static size_t MaxSmemAlignment = cute::max(SmemAlignmentC, SmemAlignmentD);
-
-  using SmemArrayTypeC = cute::ArrayEngine<SmemElementC, cosize_v<SmemLayoutC>>;
-  using SmemArrayTypeD = cute::ArrayEngine<SmemElementD, cosize_v<SmemLayoutD>>;
-
-  using EmptyType = cute::tuple<>;
-  using SmemCStorage = cute::conditional_t<is_source_supported and (not ReuseSmemC),
-                         SmemArrayTypeC,
-                         EmptyType>;
-  using SmemDStorage = cute::conditional_t<is_destination_supported,
-                         SmemArrayTypeD,
-                         EmptyType>;
-
-  struct CollectiveStorageWithC {
-    alignas(SmemAlignmentC) ArrayEngine<SmemElementC, cosize_v<SmemLayoutC>> smem_C;
-    alignas(SmemAlignmentD) ArrayEngine<SmemElementD, cosize_v<SmemLayoutD>> smem_D;
-  };
-
-  union CollectiveStorageWithoutC {
-    cute::array<SmemElementC, 0> smem_C;
-    alignas(SmemAlignmentD) ArrayEngine<SmemElementD, cosize_v<SmemLayoutD>> smem_D;
-  };
-
-  union CollectiveStorageReuseC {
-    alignas(MaxSmemAlignment) ArrayEngine<SmemElementC, cosize_v<SmemLayoutC>> smem_C;
-    alignas(MaxSmemAlignment) ArrayEngine<SmemElementD, cosize_v<SmemLayoutD>> smem_D;
-  };
-
-public:
-  // TMA pipeline for loading C
-  using LoadPipeline = cutlass::PipelineTransactionAsync<StagesC>;
-  using LoadPipelineState = cutlass::PipelineState<StagesC>;
-  constexpr static uint32_t TmaTransactionBytes =
-    (size(take<0,2>(SmemLayoutC{})) * static_cast<uint32_t>(sizeof_bits<SmemElementC>::value)) / 8;
-  constexpr static bool RequiresTransactionBytes = true;
-
-  // TMA pipeline for storing D
-  using StorePipeline = cute::conditional_t<ReuseSmemC,
-                          cutlass::PipelineTmaStore<StagesC, StagesD-1>,
-                          cutlass::PipelineTmaStore<StagesD>>;
-  using StorePipelineState = cutlass::PipelineState<ReuseSmemC ? StagesC : StagesD>;
-
-  struct SharedStorage {
-    struct TensorStorage {
-      using CollectiveStorage = cute::conditional_t<not is_source_supported, CollectiveStorageWithoutC,
-                                  cute::conditional_t<ReuseSmemC, CollectiveStorageReuseC, CollectiveStorageWithC>>;
-      CollectiveStorage collective;
-
-      using FusionStorage = typename FusionCallbacks::SharedStorage;
-      FusionStorage thread;
-    } tensors;
-
-    using PipelineStorage = typename LoadPipeline::SharedStorage;
-    PipelineStorage pipeline;
-  };
-  using TensorStorage = typename SharedStorage::TensorStorage;
-  using PipelineStorage = typename SharedStorage::PipelineStorage;
-
-  // Host side epilogue arguments
-  struct Arguments {
-    typename FusionCallbacks::Arguments thread{};
-    ElementC const* ptr_C;
-    StrideC dC;
-    ElementD const* ptr_D;
-    StrideD dD;
-  };
-
-  // Device side epilogue params
-  struct Params {
-    using TMA_C = decltype(make_tma_copy(
-        CopyOpG2S{},
-        make_tensor(make_gmem_ptr(static_cast<NonVoidElementC const*>(nullptr)),
-            repeat_like(StrideC{}, int32_t(0)), StrideC{}),
-        take<0,2>(SmemLayoutC{}),
-        EpilogueTile{},
-        _1{}));
-    using TMA_D = decltype(make_tma_copy(
-        CopyOpS2G{},
-        make_tensor(make_gmem_ptr(static_cast<NonVoidElementD const*>(nullptr)),
-            repeat_like(StrideD{}, int32_t(0)), StrideD{}),
-        take<0,2>(SmemLayoutD{}),
-        EpilogueTile{},
-        _1{}));
-
-    typename FusionCallbacks::Params thread{};
-    TMA_C tma_load_c;
-    TMA_D tma_store_d;
-    uint32_t tma_transaction_bytes = TmaTransactionBytes;
-  };
-
-  //
-  // Methods
-  //
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(
-      ProblemShape const& problem_shape,
-      Arguments const& args,
-      [[maybe_unused]] void* workspace) {
-    // Optionally append 1s until problem shape is rank-4 in case its is only rank-3 (MNK)
-    auto problem_shape_MNKL = append<4>(problem_shape, 1);
-    auto [M, N, K, L] = problem_shape_MNKL;
-
-    uint32_t transaction_bytes = TmaTransactionBytes;
-    typename Params::TMA_C tma_load_c = {};
-    if constexpr (is_source_supported) {
-      Tensor tensor_c = make_tensor(make_gmem_ptr(args.ptr_C), make_layout(make_shape(M,N,L), args.dC));
-      tma_load_c = make_tma_copy_C_sm90(
-          CopyOpG2S{},
-          tensor_c,
-          take<0,2>(SmemLayoutC{}),
-          EpilogueTile{});
-    }
-
-    typename Params::TMA_D tma_store_d;
-    if constexpr (is_destination_supported) {
-      Tensor tensor_d = make_tensor(make_gmem_ptr(args.ptr_D), make_layout(make_shape(M,N,L), args.dD));
-      tma_store_d = make_tma_copy_C_sm90(
-          CopyOpS2G{},
-          tensor_d,
-          take<0,2>(SmemLayoutD{}),
-          EpilogueTile{});
-    }
-
-    return {
-      FusionCallbacks::to_underlying_arguments(problem_shape, args.thread, workspace),
-      tma_load_c,
-      tma_store_d,
-      transaction_bytes
-    };
-  }
-
-  template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
-    return FusionCallbacks::get_workspace_size(problem_shape, args.thread);
-  }
-
-  template <class ProblemShape>
-  static cutlass::Status
-  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, 
-    CudaHostAdapter* cuda_adapter = nullptr) {
-    return FusionCallbacks::initialize_workspace(problem_shape, args.thread, workspace, stream, cuda_adapter);
-  }
-
-  template <class ProblemShape>
-  static bool
-  can_implement(
-      ProblemShape const& problem_shape,
-      [[maybe_unused]] Arguments const& args) {
-    auto problem_shape_MNKL = append<4>(problem_shape, 1);
-    auto [M,N,K,L] = problem_shape_MNKL;
-    auto shape = cute::make_shape(M,N,L);
-
-    bool implementable = true;
-    if constexpr (is_destination_supported) {
-      constexpr int tma_alignment_bits_D = cutlass::detail::get_output_alignment_bits<ElementD>();
-      constexpr int min_tma_aligned_elements_D = tma_alignment_bits_D / cutlass::sizeof_bits<ElementD>::value;
-      if constexpr (cute::is_same_v<CopyOpS2G, SM90_TMA_STORE_IM2COL>) { // ignore L stride for implicit gemm
-        implementable = cutlass::detail::check_alignment<min_tma_aligned_elements_D>(take<0,2>(shape), take<0,2>(StrideD{}));
-      }
-      else {
-        implementable = cutlass::detail::check_alignment<min_tma_aligned_elements_D>(shape, StrideD{});
-      }
-    }
-
-    if constexpr (not cute::is_void_v<ElementC>) {
-      constexpr int tma_alignment_bits_C = cutlass::detail::get_input_alignment_bits<ElementC>();
-      constexpr int min_tma_aligned_elements_C = tma_alignment_bits_C / cutlass::sizeof_bits<ElementC>::value;
-      if constexpr (cute::is_same_v<CopyOpG2S, SM90_TMA_LOAD_IM2COL>) { // ignore L stride for implicit gemm
-        implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_C>(take<0,2>(shape), take<0,2>(StrideC{}));
-      }
-      else {
-        implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_C>(shape, StrideC{});
-      }
-    }
-
-    if (!implementable) {
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
-    }
-
-    bool fusion_implementable = FusionCallbacks::can_implement(problem_shape, args.thread);
-
-    if (!fusion_implementable) {
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Problem Size doesn't meet the minimum requirements for FusionCallbacks.\n");
-    }
-
-    bool beta_implementable = true;
-
-    if constexpr (cute::is_void_v<ElementC>) {
-      if constexpr (detail::has_beta<Arguments>::value) {
-        beta_implementable = args.thread.beta == 0.0;
-      }
-      if constexpr (detail::has_beta_ptr<Arguments>::value) {
-        beta_implementable = beta_implementable && args.thread.beta_ptr == nullptr;
-      }
-    }
-
-    if (!beta_implementable) {
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Beta/beta pointer was set, but epilogue is sourceless (void-C).\n");
-    }
-
-    return implementable && fusion_implementable && beta_implementable;
-  }
-
-  template<class TileShapeMNK>
-  CUTLASS_HOST_DEVICE
-  static constexpr int
-  get_load_pipe_increment(TileShapeMNK tile_shape_MNK) {
-    // Compute number of epilogue subtiles
-    return size<1>(zipped_divide(make_layout(take<0,2>(tile_shape_MNK)), EpilogueTile{}));
-  }
-
-  template<class TileShapeMNK>
-  CUTLASS_HOST_DEVICE
-  static constexpr int
-  get_store_pipe_increment(TileShapeMNK tile_shape_MNK) {
-    return get_load_pipe_increment(tile_shape_MNK);
-  }
-
-  /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance
-  CUTLASS_DEVICE
-  static void
-  prefetch_tma_descriptors(Params const& epilogue_params) {
-    if constexpr (is_source_supported) {
-      cute::prefetch_tma_descriptor(epilogue_params.tma_load_c.get_tma_descriptor());
-    }
-    if constexpr (is_destination_supported) {
-      cute::prefetch_tma_descriptor(epilogue_params.tma_store_d.get_tma_descriptor());
-    }
-  }
-
-  CUTLASS_HOST_DEVICE
-  CollectiveEpilogue(Params const& params_, TensorStorage& shared_tensors)
-      : params(params_), fusion_callbacks(params_.thread, shared_tensors.thread) {}
-
-  CUTLASS_DEVICE
-  bool
-  is_producer_load_needed() const {
-    return fusion_callbacks.is_producer_load_needed();
-  }
-
-  template<
-    class ProblemShapeMNKL,
-    class TileShapeMNK,
-    class TileCoordMNKL,
-    class TiledMma
-  >
-  CUTLASS_DEVICE auto
-  load(
-      LoadPipeline load_pipeline,
-      LoadPipelineState load_pipe_producer_state,
-      ProblemShapeMNKL problem_shape_mnkl,
-      TileShapeMNK tile_shape_MNK,
-      TileCoordMNKL tile_coord_mnkl,
-      TiledMma tiled_mma,
-      int thread_idx,
-      TensorStorage& shared_tensors,
-      int subtile_idx=-1) {
-    using namespace cute;
-
-    // Indexing variables
-    auto [M, N, K, L] = problem_shape_mnkl;
-    auto [m_coord, n_coord, k_coord, l_coord] = tile_coord_mnkl;
-
-    // The tma tensor C under im2col mode only has two modes (M, N) which
-    // should be local tiled with only (m_coord, n_coord).
-    auto coord_shape = conditional_return<is_im2col_C>(
-      make_coord(m_coord, n_coord),
-      make_coord(m_coord, n_coord, l_coord));
-
-    // Represent the full source tensor, slice to get the tile this CTA is currently responsible for
-    Tensor mC_mn = params.tma_load_c.get_tma_tensor(make_shape(M,N,L));                                //       (M,N,L)
-    Tensor mC = coalesce(mC_mn, take<0,2>(CtaTileMNK{}));
-    Tensor gC = local_tile(mC, take<0,2>(CtaTileMNK{}), coord_shape);                                  // (CTA_M,CTA_N)
-
-    // Apply epilogue subtile, get matching smem tensor
-    auto ptr_sC = shared_tensors.collective.smem_C.begin();
-    Tensor gC_epi = flat_divide(gC, EpilogueTile{});                             // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N)
-    Tensor sC_epi = make_tensor(make_smem_ptr(ptr_sC), SmemLayoutC{});           //      (EPI_TILE_M,EPI_TILE_N,PIPE_C)
-
-    // Prepare the thread(b)lock's (G)mem to (S)mem TMA tiled copy (bGS_)
-    ThrCopy thrblk_g2s = params.tma_load_c.get_slice(Int<0>{});
-    Tensor bGS_gC = thrblk_g2s.partition_S(gC_epi);                                    // (G2S,G2S_M,G2S_N,EPI_M,EPI_N)
-    Tensor bGS_sC = thrblk_g2s.partition_D(sC_epi);                                    // (G2S,G2S_M,G2S_N,PIPE_C)
-
-    // Get the fusion callbacks for the producer load warp
-    auto pld_args = cutlass::epilogue::fusion::detail::ProducerLoadArgs(
-                      problem_shape_mnkl,
-                      CtaTileMNK{},
-                      tile_coord_mnkl,
-                      tiled_mma,
-                      EpilogueTile{},
-                      thread_idx
-                    );
-    auto pld_callbacks = fusion_callbacks.get_producer_load_callbacks(pld_args);
-    bool is_C_load_needed = is_source_supported && fusion_callbacks.is_C_load_needed();
-
-    // Predication for TMA load (one thread issues TMA load)
-    bool issue_tma_load = cute::elect_one_sync();
-
-    // Pre-loop fusion callback entry point
-    pld_callbacks.begin();
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int epi_n = 0; epi_n < size<3>(gC_epi); ++epi_n) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int epi_m = 0; epi_m < size<2>(gC_epi); ++epi_m) {
-        if (subtile_idx != -1 && (epi_n * static_cast<int>(size<2>(gC_epi)) + epi_m) != subtile_idx) {
-          continue;
-        }
-        // Acquire the lock for this stage
-        constexpr uint16_t mcast_mask = 0;
-        uint64_t* tma_barrier = load_pipeline.producer_get_barrier(load_pipe_producer_state);
-        load_pipeline.producer_acquire(load_pipe_producer_state);
-
-        // Loop fusion callback entry point
-        pld_callbacks.step(tma_barrier, epi_m, epi_n, load_pipe_producer_state.count(), issue_tma_load);
-
-        // Execute the TMA load for C if needed
-        if (issue_tma_load && is_C_load_needed) {
-          copy(params.tma_load_c.with(*tma_barrier, mcast_mask),
-              bGS_gC(_,_,_,epi_m,epi_n), bGS_sC(_,_,_,load_pipe_producer_state.index()));
-          load_pipeline.producer_expect_transaction(load_pipe_producer_state);
-        }
-
-        // Commit TMA loads for this stage and release the lock
-        load_pipeline.producer_commit(load_pipe_producer_state);
-        ++load_pipe_producer_state;
-      }
-    }
-
-    // Post-loop fusion callback entry point
-    pld_callbacks.end();
-
-    return load_pipe_producer_state;
-  }
-
-  CUTLASS_DEVICE auto
-  load_tail(
-      LoadPipeline load_pipeline,
-      LoadPipelineState load_pipe_producer_state) {
-    bool issue_tma_load = cute::elect_one_sync();
-    if (issue_tma_load) {
-      load_pipeline.producer_tail(load_pipe_producer_state);
-    }
-
-    return load_pipe_producer_state;
-  }
-
-  template<
-    class ProblemShapeMNKL,
-    class TileShapeMNK,
-    class TileCoordMNKL,
-    class AccEngine, class AccLayout,
-    class TiledMma
-  >
-  CUTLASS_DEVICE auto
-  store(
-      LoadPipeline load_pipeline,
-      LoadPipelineState load_pipe_consumer_state,
-      StorePipeline store_pipeline,
-      StorePipelineState store_pipe_producer_state,
-      ProblemShapeMNKL problem_shape_mnkl,
-      TileShapeMNK tile_shape_MNK,
-      TileCoordMNKL tile_coord_mnkl,
-      cute::Tensor<AccEngine,AccLayout> accumulators,
-      TiledMma tiled_mma,
-      int thread_idx,
-      TensorStorage& shared_tensors,
-      int subtile_idx=-1) {
-    using namespace cute;
-    using ElementAccumulator = typename AccEngine::value_type;
-    using ElementCompute_ = typename epilogue::fusion::FusionCallbacksTraits<FusionCallbacks>::ElementCompute;
-    using ElementCompute = cute::conditional_t<cute::is_void_v<ElementCompute_>,ElementAccumulator,ElementCompute_>;
-
-    static_assert(is_rmem<AccEngine>::value, "Accumulator must be RF resident.");
-    static_assert(rank(AccLayout{}) == 3, "Accumulator must be MMA-partitioned: (MMA,MMA_M,MMA_N)");
-    static_assert(rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4");
-    static_assert(is_static<TileShapeMNK>::value, "TileShapeMNK must be static");
-    static_assert(rank(TileShapeMNK{}) == 3, "TileShapeMNK must be rank 3");
-    static_assert(rank(TileCoordMNKL{}) == 4, "TileCoordMNKL must be rank 4");
-
-    // Indexing variables
-    auto [M, N, K, L] = problem_shape_mnkl;
-    auto [m_coord, n_coord, k_coord, l_coord] = tile_coord_mnkl;
-
-    // The tma tensor D under im2col mode only has two modes (M, N) which
-    // should be local tiled with only (m_coord, n_coord).
-    auto coord_shape = conditional_return<is_im2col_D>( 
-        make_coord(m_coord, n_coord),
-        make_coord(m_coord, n_coord, l_coord));
-
-    // Represent the full output tensor, slice to get the tile this CTA is responsible for
-    Tensor mD_mn = params.tma_store_d.get_tma_tensor(make_shape(M,N,L));                               //       (M,N,L)
-    Tensor mD = coalesce(mD_mn, take<0,2>(CtaTileMNK{}));
-    Tensor gD = local_tile(mD, take<0,2>(CtaTileMNK{}), coord_shape);                                  // (CTA_M,CTA_N)
-
-    // Apply epilogue subtiling
-    Tensor gD_epi = flat_divide(gD, EpilogueTile{});                             // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N)
-
-    // Construct the corresponding pipelined smem tensors
-    auto ptr_sC = shared_tensors.collective.smem_C.begin();
-    auto ptr_sD = shared_tensors.collective.smem_D.begin();
-    Tensor sC_epi = cute::as_position_independent_swizzle_tensor(
-                      make_tensor(make_smem_ptr(ptr_sC), SmemLayoutC{}));             // (EPI_TILE_M,EPI_TILE_N,PIPE_C)
-    Tensor sD_epi = cute::as_position_independent_swizzle_tensor(
-                      make_tensor(make_smem_ptr(ptr_sD), SmemLayoutD{}));             // (EPI_TILE_M,EPI_TILE_N,PIPE_D)
-
-    TiledCopy tiled_copy_C_atom = make_tiled_copy_C_atom(CopyAtomC{}, tiled_mma);
-
-    // (t)hread-partition for (r)egister to (r)egister copy (tRR_)
-    TiledCopy tiled_r2r = [&]() {
-      if constexpr (IsUseR2R) {
-        return make_tiled_copy_S(Copy_Atom<CopyOpR2R, ElementCompute>{}, tiled_copy_C_atom);
-      }
-      else {
-        return make_tiled_copy_S(Copy_Atom<AutoVectorizingCopyWithAssumedAlignment<128>,
-          ElementCompute>{}, tiled_copy_C_atom);
-      }
-    }();
-    ThrCopy thread_r2r = tiled_r2r.get_slice(thread_idx);
-
-    // (t)hread-partition for (r)egister to (s)mem copy (tRS_)
-    TiledCopy tiled_r2s = [&]() {
-      if constexpr (IsUseR2R) {
-        return make_tiled_copy_D(Copy_Atom<CopyOpR2S,SmemElementD>{}, tiled_r2r);
-      }
-      else {
-        return make_tiled_copy_S(Copy_Atom<CopyOpR2S,SmemElementD>{}, tiled_copy_C_atom);
-      }
-    }();
-    ThrCopy thread_r2s = tiled_r2s.get_slice(thread_idx);
-    Tensor tRS_rAcc = thread_r2s.retile_S(accumulators);                                   // ((R2S,R2S_V),MMA_M,MMA_N)
-    Tensor tRS_sD   = thread_r2s.partition_D(sD_epi);                                       // (R2S,R2S_M,R2S_N,PIPE_D)
-
-    auto mma_tile_m = size<0>(TileShapeMNK{}) / size<1>(tRS_rAcc);
-    auto mma_tile_n = size<1>(TileShapeMNK{}) / size<2>(tRS_rAcc);
-    auto epi_tile_m = size<0>(EpilogueTile{});
-    auto epi_tile_n = size<1>(EpilogueTile{});
-
-    // Allocate D registers
-    Layout tRS_rD_layout = make_layout(take<0,3>(shape(thread_r2s.partition_S(sD_epi))));
-    Tensor tRS_rD = make_tensor<SmemElementD>(tRS_rD_layout);                                      // (R2S,R2S_M,R2S_N)
-
-    // Vectorized fragment view
-    constexpr int FragmentSize = DispatchPolicy::FragmentSize;
-    Tensor tRS_rAcc_frg = recast<Array<ElementAccumulator, FragmentSize>>(tRS_rAcc);
-    Tensor tRS_rD_frg   = recast<Array<SmemElementD      , FragmentSize>>(tRS_rD);
-    CUTE_STATIC_ASSERT(size<0>(tRS_rAcc) % FragmentSize == 0, "Fragment size does not vectorize properly");
-
-    // (t)hread-partition for (s)mem to (r)egister copy (tSR_)
-    TiledCopy tiled_s2r = make_tiled_copy_S(Copy_Atom<CopyOpS2R, SmemElementC>{}, tiled_copy_C_atom);
-    ThrCopy thread_s2r = tiled_s2r.get_slice(thread_idx);
-    Tensor tSR_sC        = thread_s2r.partition_S(sC_epi);                                  // (S2R,S2R_M,S2R_N,PIPE_C)
-    Layout tSR_rC_layout = thread_s2r.retile_D(tRS_rD).layout();                            // (S2R,S2R_M,S2R_N)
-
-    // Allocate C registers
-    // If C smem load is a non-vectorized dst(i) = src(i) then we can allocate C registers directly in the compute type
-    // to eliminate some redundant pack+unpack instruction sequences for sub-word types
-    constexpr bool IsDirectS2R = cute::is_same_v<CopyOpS2R, AutoVectorizingCopyWithAssumedAlignment<128>>
-                                && decltype(max_common_vector(tSR_rC_layout, tSR_sC.layout()))::value <= 1;
-    using RegisterElementC = cute::conditional_t<IsDirectS2R, ElementCompute, SmemElementC>;
-    Tensor tRS_rC = make_tensor<RegisterElementC>(tRS_rD_layout);                                  // (R2S,R2S_M,R2S_N)
-    Tensor tSR_rC = thread_s2r.retile_D(tRS_rC);                                                   // (S2R,S2R_M,S2R_N)
-
-    // thread(b)lock-partition for (s)mem to (g)mem copy (bSG_)
-    ThrCopy thrblk_s2g = params.tma_store_d.get_slice(Int<0>{});
-    Tensor bSG_sD = thrblk_s2g.partition_S(sD_epi);                                    // (S2G,S2G_M,S2G_N,PIPE_D)
-    Tensor bSG_gD = thrblk_s2g.partition_D(gD_epi);                                    // (S2G,S2G_M,S2G_N,EPI_M,EPI_N)
-
-    // OOB predication for tile quantization "residue"
-    // Absolute coordinate tensors (dynamic)
-    Tensor mD_crd = make_identity_tensor(make_shape(M,N));                                                     // (M,N)
-    Tensor cD_mn = local_tile(mD_crd, take<0,2>(CtaTileMNK{}), make_coord(m_coord, n_coord));          // (CTA_M,CTA_N)
-    Tensor tRS_cD_mn = thread_r2s.partition_S(flat_divide(cD_mn, EpilogueTile{}));     // (R2S,R2S_M,R2S_N,EPI_M,EPI_N)
-    // Relative coordinate tensors (static)
-    Tensor cD = make_counting_tensor(cD_mn.layout());                                                  // (CTA_M,CTA_N)
-    Tensor tRS_cD = make_counting_tensor(tRS_cD_mn.layout());                          // (R2S,R2S_M,R2S_N,EPI_M,EPI_N)
-    // Subtract the global "bottom right" corner from the local "top left" corner to get the max relative coordinate
-    auto residue_cD = make_coord(M,N) - cD_mn(_0{});                                                           // (m,n)
-    auto residue_tRS_cD = make_coord(M,N) - tRS_cD_mn(_0{});                                                   // (m,n)
-
-    CUTE_STATIC_ASSERT(epi_tile_m % mma_tile_m == 0, "MMA_TILE_M must divide EPI_TILE_M");
-
-    CUTE_STATIC_ASSERT(mma_tile_n % epi_tile_n == 0, "EPI_TILE_N must divide MMA_TILE_N");
-    // Get TiledCopy for partition reference when consumer store.
-    TiledCopy tiled_copy_partition_ref = make_tiled_copy_S(Copy_Atom<CopyOpR2S,SmemElementD>{}, tiled_copy_C_atom);
-    // Get the fusion callbacks for the consumer store warps
-    constexpr bool RefSrc = true; // Register tensors reference tiled copy src layout
-    auto cst_args = cutlass::epilogue::fusion::detail::ConsumerStoreArgs(
-                      problem_shape_mnkl,
-                      CtaTileMNK{},
-                      tile_coord_mnkl,
-                      tiled_mma,
-                      EpilogueTile{},
-                      tiled_copy_partition_ref,
-                      cD,
-                      residue_cD,
-                      tRS_cD,
-                      residue_tRS_cD,
-                      tRS_rC,
-                      thread_idx
-                    );
-    auto cst_callbacks = fusion_callbacks.template get_consumer_store_callbacks<RefSrc>(cst_args);
-    bool is_producer_load_needed = fusion_callbacks.is_producer_load_needed();
-    bool is_C_load_needed = is_source_supported && fusion_callbacks.is_C_load_needed();
-
-    using FragmentVisit = decltype(cst_callbacks.visit(tRS_rAcc_frg(0), 0, 0, 0));
-    constexpr bool IsDirectR2S = cute::is_same_v<FragmentVisit, Array<SmemElementD, FragmentSize>>;
-    using RegisterElementD = cute::conditional_t<!IsDirectR2S, ElementCompute, SmemElementD>;
-    Tensor tRS_rCompute = make_tensor<RegisterElementD>(tRS_rD_layout);                            // (R2S,R2S_M,R2S_N)
-    Tensor tRS_rCompute_frg = recast<Array<RegisterElementD, FragmentSize>>(tRS_rCompute);
-
-    // Thread synchronizer for previously issued waits or fences
-    // to ensure visibility of smem reads/writes to threads or TMA unit
-    auto synchronize = [&] () { cutlass::arch::NamedBarrier::sync(size(TiledMma{}), cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); };
-
-    // Predication for TMA store (one warp issues TMA store)
-    bool issue_tma_store = (thread_idx / NumThreadsPerWarp) == 0;
-
-    // In the reuse smem configuration we have StagesC smem buffers and at most StagesD committed TMA stores in flight.
-    // The TMA store pipeline producer acquire returns when at most StagesD-1 committed stores are in-flight, so we can
-    // only guarantee store completion after StagesD iterations, then we can begin issuing releases on the smem buffer locks.
-    // store_pipe_producer_state tracks the acquire and load_pipe_consumer_state tracks the release, in circular buffer fashion.
-    LoadPipelineState load_wait_state = load_pipe_consumer_state;
-    if constexpr (ReuseSmemC) {
-      load_wait_state = store_pipe_producer_state;
-      load_wait_state.phase_ ^= 1;
-    }
-
-    // We can delay issue of TMA store by one iteration to achieve better interleaving of non-TMA instructions
-    // Sync requirements of smem reuse may preclude this optimization
-    // Delayed stores cause delayed stage releases which causes deadlock when StagesC == StagesD
-    [[maybe_unused]] int epi_m_prev = 0;
-    [[maybe_unused]] int epi_n_prev = 0;
-    static_assert(not (DelayTmaStore and ReuseSmemC and StagesC <= StagesD), "This TMA epilogue configuration will deadlock");
-
-    // The TMA store sequence for one subtile iteration
-    auto tma_store_fn = [&] (int epi_m, int epi_n) {
-      // Write the tile from smem to gmem with TMA
-      cutlass::arch::fence_view_async_shared(); // ensure smem writes are visible to TMA
-      synchronize(); // ensure all threads have issued their async fence
-      if constexpr (is_destination_supported) {
-        if (issue_tma_store) {
-          copy(params.tma_store_d, bSG_sD(_,_,_,store_pipe_producer_state.index()), bSG_gD(_,_,_,epi_m,epi_n));
-        }
-      }
-
-      // Post async fence, pre TMA commit callback entry point
-      cst_callbacks.tma_store(epi_m, epi_n, store_pipe_producer_state.count(), issue_tma_store);
-
-      // Commit the TMA stores for this stage
-      if (issue_tma_store) {
-        store_pipeline.producer_commit(store_pipe_producer_state);
-      }
-      ++store_pipe_producer_state;
-      ++issued_stores;
-
-      // Wait for the next smem buffer to be available
-      if (issue_tma_store) {
-        store_pipeline.producer_acquire(store_pipe_producer_state);
-      }
-      synchronize();
-
-      if constexpr (ReuseSmemC) {
-        // producer_acquire returns when at most StagesD-1 committed stores are pending
-        bool store_finished = issued_stores > StorePipeline::UnacquiredStages;
-        // Let dma warp know earliest smem buffer is consumed and empty after StagesD producer commits
-        if (store_finished) {
-          if (is_producer_load_needed) {
-            load_pipeline.consumer_release(load_pipe_consumer_state);
-          }
-          ++load_pipe_consumer_state;
-        }
-      }
-    };
-
-    //
-    // BEGIN EPILOGUE
-    //
-
-    // Pre-loop fusion callback entry point
-    cst_callbacks.begin();
-
-    // For each output tile
-    CUTLASS_PRAGMA_UNROLL
-    for (int epi_n = 0; epi_n < size<3>(gD_epi); ++epi_n) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int epi_m = 0; epi_m < size<2>(gD_epi); ++epi_m) {
-        [[maybe_unused]] bool is_first_iteration = epi_m == 0 && epi_n == 0;
-        bool is_last_iteration = epi_m == size<2>(gD_epi)-1 && epi_n == size<3>(gD_epi)-1;
-
-        if (subtile_idx != -1 && (epi_n * static_cast<int>(size<2>(gD_epi)) + epi_m) != subtile_idx) {
-          continue;
-        }
-
-        cst_callbacks.begin_loop(epi_m, epi_n);
-
-        if (is_producer_load_needed) {
-          // Wait for the producer load to fill smem
-          load_pipeline.consumer_wait(load_wait_state);
-
-          if (is_C_load_needed) {
-            // Copy source tile from smem to register
-            copy(tiled_s2r, tSR_sC(_,_,_,load_wait_state.index()), tSR_rC);
-          }
-        }
-
-        // First loop fusion callback entry point
-        cst_callbacks.previsit(epi_m, epi_n, load_wait_state.count(), is_producer_load_needed);
-
-        if (is_producer_load_needed) {
-          if constexpr (not ReuseSmemC) {
-            // Let producer load warp know smem buffers are consumed and empty
-            cutlass::arch::fence_view_async_shared();
-            load_pipeline.consumer_release(load_pipe_consumer_state);
-            ++load_pipe_consumer_state;
-          }
-          ++load_wait_state;
-        }
-
-        int mma_m = epi_m;
-        int mma_n = (epi_n * size<1>(EpilogueTile{})) / mma_tile_n;
-        Tensor tRS_rAcc_frg_mn = tRS_rAcc_frg(_,mma_m,mma_n);
-
-        // Vectorized fragment loop with visitor callback entry point
-        int epi_n_in_mma = epi_n % (mma_tile_n / epi_tile_n);
-        int r2s_v = epi_n_in_mma * size(tRS_rCompute_frg);
-        CUTLASS_PRAGMA_UNROLL
-        for (int epi_v = 0; epi_v < size(tRS_rCompute_frg); ++epi_v) {
-          tRS_rCompute_frg(epi_v) = cst_callbacks.visit(tRS_rAcc_frg_mn(r2s_v + epi_v), epi_v, epi_m, epi_n);
-        }
-        // The latest we can delay the TMA store is right before the smem store of the next iteration
-        // since the current TMA store needs to be committed before we can acquire the next smem buffer
-        if constexpr (DelayTmaStore) {
-          // Issue TMA stores for the previous subtile
-          if (not is_first_iteration and subtile_idx == -1) {
-            tma_store_fn(epi_m_prev, epi_n_prev);
-          }
-          epi_m_prev = epi_m;
-          epi_n_prev = epi_n;
-        }
-
-        // Smem reduction callback entry point using current store buffer for workspace
-        cst_callbacks.reduce(sD_epi(_,_,store_pipe_producer_state.index()),
-                              synchronize, epi_m, epi_n, is_last_iteration, tRS_rCompute_frg);
-
-        // Copy tile from register to regiser if needed
-        if constexpr (IsUseR2R) {
-          // retile source and destination for tiled_r2r
-          Tensor tRR_rD_src = thread_r2r.retile_S(tRS_rCompute);                             // (R2R,R2R_M,R2R_N,EPI_M,EPI_N)
-          Tensor tRR_rD_dst = thread_r2r.retile_D(tRS_rCompute);                             // (R2R,R2R_M,R2R_N,EPI_M,EPI_N)
-
-          // Output register transformation before copying to shared memory.
-          copy(tiled_r2r, tRR_rD_src, tRR_rD_dst);
-        }
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int i = 0; i < size(tRS_rD_frg); ++i) {
-          tRS_rD_frg(i) = cutlass::NumericArrayConverter<SmemElementD, RegisterElementD, FragmentSize>{}(tRS_rCompute_frg(i));
-        }
-
-        // Copy tile from register to smem
-        if constexpr (is_destination_supported) {
-          copy(tiled_r2s, tRS_rD, tRS_sD(_,_,_,store_pipe_producer_state.index()));
-        }
-
-        // Post reduction, pre TMA store callback entry point
-        constexpr bool issue_smem_store = true; // No smem store predication
-        cst_callbacks.postreduce(epi_m, epi_n, store_pipe_producer_state.count(), issue_smem_store);
-
-        if constexpr (not DelayTmaStore) {
-          // Issue TMA stores for this subtile
-          tma_store_fn(epi_m, epi_n);
-        }
-
-        cst_callbacks.end_loop(epi_m, epi_n);
-
-      } // for epi_m
-    } // for epi_n
-
-    if constexpr (DelayTmaStore) {
-      // Issue TMA stores for the last subtile
-      tma_store_fn(epi_m_prev, epi_n_prev);
-    }
-
-    // Post-loop fusion callback entry point
-    cst_callbacks.end();
-
-    return cute::make_tuple(load_pipe_consumer_state, store_pipe_producer_state);
-  }
-
-  CUTLASS_DEVICE auto
-  store_tail(
-      LoadPipeline load_pipeline,
-      LoadPipelineState load_pipe_consumer_state,
-      StorePipeline store_pipeline,
-      StorePipelineState store_pipe_producer_state) {
-    // wait for all TMA stores to complete
-    store_pipeline.producer_tail(store_pipe_producer_state);
-    // reset store counter
-    issued_stores = 0;
-
-    if constexpr (ReuseSmemC) {
-      if (fusion_callbacks.is_producer_load_needed()) {
-        // Issue releases on up to StagesD-1 previously issued TMA stores
-        constexpr int release_stages = cute::min(StorePipeline::UnacquiredStages, get_load_pipe_increment(CtaTileMNK{}));
-        CUTLASS_PRAGMA_UNROLL
-        for (int stage = 0; stage < release_stages; ++stage) {
-          load_pipeline.consumer_release(load_pipe_consumer_state);
-          ++load_pipe_consumer_state;
-        }
-      }
-    }
-
-    return cute::make_tuple(load_pipe_consumer_state, store_pipe_producer_state);
-  }
-
-private:
-  Params const& params;
-  FusionCallbacks fusion_callbacks;
-  int issued_stores = 0;
-};
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace collective
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized_bias_elementwise.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized_bias_elementwise.hpp
deleted file mode 100755
index 974904008..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized_bias_elementwise.hpp
+++ /dev/null
@@ -1,164 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Functor performing pipelined epilogues with bias add and elementwise activation functions.
-         This collective is now DEPRECATED, will be removed in the next release. Use EVT instead.
-*/
-
-#pragma once
-
-#include "sm90_epilogue_tma_warpspecialized.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace collective {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  int StagesC_,
-  int StagesD_,
-  int FragmentSize_,
-  class BlockTileShape_,    //     (BLK_M,BLK_N,BLK_K)
-  class EpilogueTileShape_, // (EPI_TILE_M,EPI_TILE_N)
-  class ElementC_,
-  class StrideC_,
-  class ElementD_,
-  class StrideD_,
-  class FusionCallbacks_,
-  class CopyOpG2S_,
-  class SmemLayoutAtomC_,
-  class CopyOpS2R_,
-  class CopyOpS2G_,
-  class SmemLayoutAtomD_,
-  class CopyOpR2S_,
-  class CopyAtomC_,
-  class CopyOpR2R_
->
-class Sm90EpilogueTmaWarpSpecializedBiasElementwise
-  : public CollectiveEpilogue<
-      Sm90TmaWarpSpecialized<StagesC_, StagesD_, FragmentSize_, false, false>,
-      BlockTileShape_,
-      EpilogueTileShape_,
-      ElementC_,
-      StrideC_,
-      ElementD_,
-      StrideD_,
-      FusionCallbacks_,
-      CopyOpG2S_,
-      SmemLayoutAtomC_,
-      CopyOpS2R_,
-      CopyOpS2G_,
-      SmemLayoutAtomD_,
-      CopyOpR2S_,
-      CopyAtomC_,
-      CopyOpR2R_
-> {
-private:
-  using Impl =
-    CollectiveEpilogue<
-      Sm90TmaWarpSpecialized<StagesC_, StagesD_, FragmentSize_, false, false>,
-      BlockTileShape_,
-      EpilogueTileShape_,
-      ElementC_,
-      StrideC_,
-      ElementD_,
-      StrideD_,
-      FusionCallbacks_,
-      CopyOpG2S_,
-      SmemLayoutAtomC_,
-      CopyOpS2R_,
-      CopyOpS2G_,
-      SmemLayoutAtomD_,
-      CopyOpR2S_,
-      CopyAtomC_,
-      CopyOpR2R_
-    >;
-public:
-  using DispatchPolicy = Sm90TmaWarpSpecializedBiasElementwise<StagesC_, StagesD_, FragmentSize_>;
-  using ElementCompute = typename Impl::ThreadEpilogueOp::ElementCompute;
-  using ElementBias = typename Impl::ThreadEpilogueOp::ElementBias;
-  using ElementT = typename Impl::ThreadEpilogueOp::ElementAux;
-
-  // Constructor inheritance
-  using Impl::Impl;
-
-  // Host side epilogue arguments
-  struct [[deprecated("use Sm90TmaWarpSpecialized Arguments instead")]]
-  Arguments {
-    struct ThreadArgs {
-      ElementCompute alpha{1};
-      ElementCompute beta{0};
-      ElementCompute const *alpha_ptr{nullptr};
-      ElementCompute const *beta_ptr{nullptr};
-    } thread;
-    ElementC_ const* ptr_C{nullptr};
-    StrideC_ dC{};
-    ElementD_* ptr_D{nullptr};
-    StrideD_ dD{};
-    ElementBias const* ptr_Bias{nullptr};
-    ElementT* ptr_T{nullptr};
-
-    CUTLASS_HOST_DEVICE
-    operator typename Impl::Arguments() const {
-      typename Impl::Arguments arguments;
-      arguments.thread.alpha = thread.alpha;
-      arguments.thread.beta = thread.beta;
-      arguments.thread.alpha_ptr = thread.alpha_ptr;
-      arguments.thread.beta_ptr = thread.beta_ptr;
-      if constexpr (not cute::is_void_v<ElementBias>) {
-        arguments.thread.bias_ptr = ptr_Bias;
-      }
-      if constexpr (not cute::is_void_v<ElementT>) {
-        arguments.thread.aux_ptr = ptr_T;
-        arguments.thread.dAux = dD;
-      }
-      arguments.ptr_C = ptr_C;
-      arguments.dC = dC;
-      arguments.ptr_D = ptr_D;
-      arguments.dD = dD;
-
-      return arguments;
-    }
-  };
-
-};
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace collective
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/dispatch_policy.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/dispatch_policy.hpp
deleted file mode 100755
index f829a2ff5..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/dispatch_policy.hpp
+++ /dev/null
@@ -1,195 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/epilogue/thread/scale_type.h"
-
-//////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::epilogue {
-
-//////////////////////////////////////////////////////////////////////////////
-
-//////////////////////////////////////////////////////////////////////////////
-//
-// Builder Epilogue Schedules
-//
-//////////////////////////////////////////////////////////////////////////////
-
-struct PtrArrayDefault {};
-struct EpilogueSimtVectorized {};
-struct EpiloguePtrArraySimtVectorized {};
-struct NoSmemWarpSpecialized {};
-struct PtrArrayNoSmemWarpSpecialized {};
-struct PtrArrayPlanarComplexNoSmemWarpSpecialized {};
-struct TmaWarpSpecialized {};
-struct TmaWarpSpecializedCooperative {};
-struct PtrArrayTmaWarpSpecializedCooperative {
-  static constexpr int NumEpilogueWarpGroups = 2;
-};
-
-// Standard warp specialized epilogue
-struct PtrArrayTmaWarpSpecialized {
-  static constexpr int NumEpilogueWarpGroups = 1;
-};
-
-// Pingpong kernel epilogue
-struct PtrArrayTmaWarpSpecializedPingpong {
-  static constexpr int NumEpilogueWarpGroups = 2;
-};
-
-// DEPRECATED schedules, will be removed in next release
-struct TmaWarpSpecializedElementwiseBase : public TmaWarpSpecialized {};
-struct TmaWarpSpecializedCooperativeElementwiseBase : public TmaWarpSpecializedCooperative {};
-template <
-  template <class T> class ActivationFunctor_,
-  thread::ScaleType::Kind Scale_ = thread::ScaleType::Default,
-  FloatRoundStyle Round_ = FloatRoundStyle::round_to_nearest
->
-struct [[deprecated("Use TmaWarpSpecialized with fusion::LinCombEltAct instead")]]
-TmaWarpSpecializedElementwise : public TmaWarpSpecializedElementwiseBase {
-  template <class T>
-  using ActivationFunctor = ActivationFunctor_<T>;
-  static constexpr thread::ScaleType::Kind Scale = Scale_;
-  static constexpr FloatRoundStyle Round = Round_;
-};
-
-template <
-  template <class T> class ActivationFunctor_,
-  thread::ScaleType::Kind Scale_ = thread::ScaleType::Default,
-  FloatRoundStyle Round_ = FloatRoundStyle::round_to_nearest
->
-struct [[deprecated("Use TmaWarpSpecializedCooperative with fusion::LinCombEltAct instead")]]
-TmaWarpSpecializedCooperativeElementwise : public TmaWarpSpecializedCooperativeElementwiseBase {
-  template <class T>
-  using ActivationFunctor = ActivationFunctor_<T>;
-  static constexpr thread::ScaleType::Kind Scale = Scale_;
-  static constexpr FloatRoundStyle Round = Round_;
-};
-
-struct TmaWarpSpecializedBiasElementwiseBase : public TmaWarpSpecialized{};
-struct TmaWarpSpecializedCooperativeBiasElementwiseBase : public TmaWarpSpecializedCooperative {};
-
-template <
-  template <class T> class ActivationFunctor_,
-  class ElementT_,
-  template <class T> class BiasOp_,
-  bool StoreT_,
-  class ElementBias_
->
-struct [[deprecated("Use TmaWarpSpecialized with fusion::LinCombPerRowBiasEltActAux instead")]]
-TmaWarpSpecializedBiasElementwise : public TmaWarpSpecializedBiasElementwiseBase {
-  template <class T>
-  using ActivationFunctor = ActivationFunctor_<T>;
-  using ElementT = ElementT_;
-
-  template <class T>
-  using BiasOp = BiasOp_<T>;
-
-  static constexpr bool StoreT = StoreT_;
-  using ElementBias = ElementBias_;
-};
-
-template <
-  template <class T> class ActivationFunctor_,
-  class ElementT_,
-  template <class T> class BiasOp_,
-  bool StoreT_,
-  class ElementBias_
->
-struct [[deprecated("Use TmaWarpSpecializedCooperative with fusion::LinCombPerRowBiasEltActAux instead")]]
-TmaWarpSpecializedCooperativeBiasElementwise : public TmaWarpSpecializedCooperativeBiasElementwiseBase {
-  template <class T>
-  using ActivationFunctor = ActivationFunctor_<T>;
-
-  using ElementT = ElementT_;
-
-  template <class T>
-  using BiasOp = BiasOp_<T>;
-
-  static constexpr bool StoreT = StoreT_;
-  using ElementBias = ElementBias_;
-};
-
-//////////////////////////////////////////////////////////////////////////////
-//
-// Collective Dispatch Policies
-//
-//////////////////////////////////////////////////////////////////////////////
-
-template<
-  int StagesC_,
-  int StagesD_,
-  int FragmentSize_,
-  bool ReuseSmemC_,
-  bool DelayTmaStore_
->
-struct Sm90TmaWarpSpecialized {
-  constexpr static int StagesC = StagesC_;
-  constexpr static int StagesD = StagesD_;
-  constexpr static int FragmentSize = FragmentSize_;
-  constexpr static bool ReuseSmemC = ReuseSmemC_;
-  constexpr static bool DelayTmaStore = DelayTmaStore_;
-};
-
-template<
-  int StagesC_,
-  int StagesD_,
-  int FragmentSize_,
-  bool ReuseSmemC_,
-  bool DelayTmaStore_,
-  int NumEpilogueWarpGroups_
->
-struct Sm90PtrArrayTmaWarpSpecialized {
-  constexpr static int StagesC = StagesC_;
-  constexpr static int StagesD = StagesD_;
-  constexpr static int FragmentSize = FragmentSize_;
-  constexpr static bool ReuseSmemC = ReuseSmemC_;
-  constexpr static bool DelayTmaStore = DelayTmaStore_;
-  constexpr static int NumEpilogueWarpGroups = NumEpilogueWarpGroups_;
-};
-
-// DEPRECATED policies, will be removed in next release
-template<
-  int StagesC_,
-  int StagesD_,
-  int FragmentSize_ = 2
->
-struct Sm90TmaWarpSpecializedBiasElementwise {
-  constexpr static int StagesC = StagesC_;
-  constexpr static int StagesD = StagesD_;
-  constexpr static int FragmentSize = FragmentSize_;
-};
-
-//////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::epilogue
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/callbacks.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/callbacks.hpp
deleted file mode 100755
index 9ee37234c..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/callbacks.hpp
+++ /dev/null
@@ -1,89 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#pragma once
-
-#include "cutlass/detail/dependent_false.hpp"
-#include "cutlass/epilogue/fusion/operations.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::epilogue::fusion {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Dispatch interface for epilogue fusion callbacks
-// For visitor fusions, this is just a convenience wrapper to provide metadata and non-nested args.
-// It is also valid to just pass visitor callbacks directly to the collective, e.g. fusion::Sm90LinearCombination,
-// provided the collective supports a visitor callbacks interface. This is useful for implementing custom fusions.
-template <
-  class DispatchPolicy,  // specialize on collective's dispatch policy since callbacks API will depend on collective's algorithm
-  class Operation,       // the fusion operation being performed, e.g. fusion::LinearCombination
-  class CtaTile_MNK,     // computed tile per CTA
-  class EpilogueTile_MN, // epilogue subtile size
-  class... Args          // callbacks implementation dependent args (e.g. copy atoms, smem layouts)
->
-struct FusionCallbacks {
-  static_assert(cutlass::detail::dependent_false<DispatchPolicy, Operation>, "Could not find a callbacks specialization.");
-};
-
-// Metadata helper to handle custom EVTs or other non-FusionCallbacks types
-template <class T>
-struct FusionCallbacksTraits {
-  using DispatchPolicy = void;
-  using Operation = T;
-  using CtaTile_MNK = void;
-  using EpilogueTile_MN = void;
-  using ElementCompute = void;
-};
-
-template <
-  class DispatchPolicy_,
-  class Operation_,
-  class CtaTile_MNK_,
-  class EpilogueTile_MN_,
-  class... Args
->
-struct FusionCallbacksTraits<
-  FusionCallbacks<DispatchPolicy_, Operation_, CtaTile_MNK_, EpilogueTile_MN_, Args...>
-> {
-  using DispatchPolicy = DispatchPolicy_;
-  using Operation = Operation_;
-  using CtaTile_MNK = CtaTile_MNK_;
-  using EpilogueTile_MN = EpilogueTile_MN_;
-  using ElementCompute = typename Operation::ElementCompute;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::epilogue::fusion
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/operations.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/operations.hpp
deleted file mode 100755
index 3aed32710..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/operations.hpp
+++ /dev/null
@@ -1,351 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#pragma once
-
-#include <cutlass/numeric_conversion.h>
-#include <cutlass/layout/matrix.h>
-#include <cute/numeric/numeric_types.hpp>
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::epilogue::fusion {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Fusion Operations
-// Template args must not be implementation dependent
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-struct FusionOperation {
-  // metadata types/queries that can be overrided
-  using ElementOutput = void;
-  using ElementCompute = void;
-
-  using ElementSource = void;
-  static constexpr bool IsSourceSupported = false;
-
-  using ElementScalar = void;
-  static constexpr int AlignmentScalar = 0;
-  static constexpr bool IsScaleFactorSupported = false;
-  static constexpr bool IsPerRowScaleSupported = false;
-  using ElementBias = void;
-  static constexpr int AlignmentBias = 0;
-  static constexpr bool IsPerRowBiasSupported = false;
-  static constexpr bool IsDePerRowBiasSupported = false;
-
-  using ActivationFn = void;
-  static constexpr bool IsEltActSupported = false;
-  static constexpr bool IsDeEltActSupported = false;
-
-  using ElementAux = void;
-  using GmemLayoutTagAux = void;
-  static constexpr int AlignmentAux = 0;
-  static constexpr bool IsAuxOutSupported = false;
-  static constexpr bool IsAuxInSupported = false;
-
-  using ElementAmax = void;
-  static constexpr bool IsAbsMaxSupported = false;
-
-};
-
-// D = alpha * acc
-template<
-  class ElementOutput_,
-  class ElementCompute_,
-  class ElementScalar_ = ElementCompute_,
-  FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest
->
-struct ScaledAcc : FusionOperation {
-  using ElementOutput = ElementOutput_;
-  using ElementCompute = ElementCompute_;
-  using ElementScalar = ElementScalar_;
-  static constexpr int AlignmentScalar = 1;
-  static constexpr auto RoundStyle = RoundStyle_;
-};
-
-// D = alpha * acc + beta * C
-template<
-  class ElementOutput_,
-  class ElementCompute_,
-  class ElementSource_ = ElementOutput_,
-  class ElementScalar_ = ElementCompute_,
-  FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest
->
-struct LinearCombination
-    : ScaledAcc<ElementOutput_, ElementCompute_, ElementScalar_, RoundStyle_> {
-  using ElementSource = ElementSource_;
-  static constexpr bool IsSourceSupported = true;
-};
-
-// D = activation(alpha * acc + beta * C)
-template<
-  template <class> class ActivationFn_,
-  class ElementOutput_,
-  class ElementCompute_,
-  class ElementSource_ = ElementOutput_,
-  class ElementScalar_ = ElementCompute_,
-  FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest
->
-struct LinCombEltAct
-    : LinearCombination<ElementOutput_, ElementCompute_, ElementSource_, ElementScalar_, RoundStyle_> {
-  using ActivationFn = ActivationFn_<ElementCompute_>;
-  static constexpr bool IsEltActSupported = true;
-};
-
-// D = softmax(top_k(alpha * acc + beta * C))
-template<
-  int TopK,
-  class ElementOutput_,
-  class ElementCompute_,
-  class ElementSource_ = ElementOutput_,
-  class ElementScalar_ = ElementCompute_,
-  FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest
->
-struct LinCombTopKSoftmaxCol
-    : LinearCombination<ElementOutput_, ElementCompute_, ElementSource_, ElementScalar_, RoundStyle_> {
-};
-
-
-// D = alpha * acc + beta * C + per-row bias
-template<
-  class ElementOutput_,
-  class ElementCompute_,
-  class ElementBias_ = ElementOutput_,
-  class ElementSource_ = ElementOutput_,
-  class ElementScalar_ = ElementCompute_,
-  int AlignmentBias_ = 128 / cute::sizeof_bits_v<ElementBias_>,
-  FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest
->
-struct LinCombPerRowBias
-    : LinearCombination<ElementOutput_, ElementCompute_, ElementSource_, ElementScalar_, RoundStyle_> {
-  using ElementBias = ElementBias_;
-  static constexpr int AlignmentBias = AlignmentBias_;
-  static constexpr bool IsPerRowBiasSupported = true;
-};
-
-// D = alpha * acc + beta * C + per-column bias
-template<
-  class ElementOutput_,
-  class ElementCompute_,
-  class ElementBias_ = ElementOutput_,
-  class ElementSource_ = ElementOutput_,
-  class ElementScalar_ = ElementCompute_,
-  int AlignmentBias_ = 128 / cute::sizeof_bits_v<ElementBias_>,
-  FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest
->
-struct LinCombPerColBias
-    : LinearCombination<ElementOutput_, ElementCompute_, ElementSource_, ElementScalar_, RoundStyle_> {
-  using ElementBias = ElementBias_;
-  static constexpr int AlignmentBias = AlignmentBias_;
-  static constexpr bool IsPerColBiasSupported = true;
-};
-
-// D = activation(alpha * acc + beta * C + per-row bias)
-template<
-  template <class> class ActivationFn_,
-  class ElementOutput_,
-  class ElementCompute_,
-  class ElementBias_ = ElementOutput_,
-  class ElementSource_ = ElementOutput_,
-  class ElementScalar_ = ElementCompute_,
-  int AlignmentBias_ = 128 / cute::sizeof_bits_v<ElementBias_>,
-  FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest
->
-struct LinCombPerRowBiasEltAct
-    : LinCombPerRowBias<ElementOutput_, ElementCompute_,
-        ElementBias_, ElementSource_, ElementScalar_, AlignmentBias_, RoundStyle_> {
-  using ActivationFn = ActivationFn_<ElementCompute_>;
-  static constexpr bool IsEltActSupported = true;
-};
-
-// D = activation(alpha * acc + beta * C + per-row bias)
-// aux = alpha * acc + beta * C + per-row bias
-template<
-  class GmemLayoutTagAux_,
-  template <class> class ActivationFn_,
-  class ElementOutput_,
-  class ElementCompute_,
-  class ElementAux_ = ElementOutput_,
-  class ElementBias_ = ElementOutput_,
-  class ElementSource_ = ElementOutput_,
-  class ElementScalar_ = ElementCompute_,
-  int AlignmentAux_ = 128 / cute::sizeof_bits_v<ElementAux_>,
-  int AlignmentBias_ = 128 / cute::sizeof_bits_v<ElementBias_>,
-  FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest
->
-struct LinCombPerRowBiasEltActAux
-    : LinCombPerRowBiasEltAct<ActivationFn_, ElementOutput_, ElementCompute_,
-        ElementBias_, ElementSource_, ElementScalar_, AlignmentBias_, RoundStyle_> {
-  using ElementAux = ElementAux_;
-  using GmemLayoutTagAux = GmemLayoutTagAux_;
-  static constexpr int AlignmentAux = AlignmentAux_;
-  static constexpr bool IsAuxOutSupported = true;
-};
-
-// D = activation(per-row alpha * acc + per-row beta * C + per-row bias)
-template<
-  template <class> class ActivationFn_,
-  class ElementOutput_,
-  class ElementCompute_,
-  class ElementBias_ = ElementOutput_,
-  class ElementSource_ = ElementOutput_,
-  class ElementScalar_ = ElementCompute_, // per-row alpha/beta
-  int AlignmentBias_ = 128 / cute::sizeof_bits_v<ElementBias_>,
-  int AlignmentScalar_ = 128 / cute::sizeof_bits_v<ElementScalar_>,
-  FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest
->
-struct PerRowLinCombPerRowBiasEltAct
-    : LinCombPerRowBiasEltAct<ActivationFn_, ElementOutput_, ElementCompute_,
-        ElementBias_, ElementSource_, ElementScalar_, AlignmentBias_, RoundStyle_> {
-  static constexpr int AlignmentScalar = AlignmentScalar_;
-  static constexpr bool IsPerRowScaleSupported = true;
-};
-
-// Z = scale_a * scale_b * alpha * acc + beta * scale_c * C + per-row bias
-// if D is fp8 
-//   D = scale_d * activation(Z)
-// else
-//   D = activation(Z)
-template<
-  template <class> class ActivationFn_,
-  class ElementOutput_,
-  class ElementCompute_,
-  class ElementBias_ = ElementOutput_,
-  class ElementSource_ = ElementOutput_,
-  class ElementScalar_ = ElementCompute_,
-  int AlignmentBias_ = 128 / cute::sizeof_bits_v<ElementBias_>,
-  FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest
->
-struct ScaledLinCombPerRowBiasEltAct
-    : LinCombPerRowBiasEltAct<ActivationFn_, ElementOutput_, ElementCompute_,
-        ElementBias_, ElementSource_, ElementScalar_, AlignmentBias_, RoundStyle_> {
-  static constexpr bool IsScaleFactorSupported = true;
-};
-
-// Z = scale_a * scale_b * alpha * acc + scale_c * beta * C + per-row bias
-// if D is fp8 
-//   amax_d = max(abs(elements in activation(Z)))
-//   D = scale_d * activation(Z)
-// else
-//   D = activation(Z)
-// if Aux is fp8 
-//   amax_aux = max(abs(elements in Z))
-//   Aux = scale_aux * Z
-// else
-//   Aux = Z
-template<
-  class GmemLayoutTagAux_,
-  template <class> class ActivationFn_,
-  class ElementOutput_,
-  class ElementCompute_,
-  class ElementAux_ = ElementOutput_,
-  class ElementAmax_ = ElementCompute_,
-  class ElementBias_ = ElementOutput_,
-  class ElementSource_ = ElementOutput_,
-  class ElementScalar_ = ElementCompute_,
-  int AlignmentAux_ = 128 / cute::sizeof_bits_v<ElementAux_>,
-  int AlignmentBias_ = 128 / cute::sizeof_bits_v<ElementBias_>,
-  FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest
->
-struct ScaledLinCombPerRowBiasEltActAmaxAux
-    : ScaledLinCombPerRowBiasEltAct<ActivationFn_, ElementOutput_, ElementCompute_,
-        ElementBias_, ElementSource_, ElementScalar_, AlignmentBias_, RoundStyle_> {
-  using ElementAmax = ElementAmax_;
-  static constexpr bool IsAbsMaxSupported = true;
-
-  using ElementAux = ElementAux_;
-  using GmemLayoutTagAux = GmemLayoutTagAux_;
-  static constexpr int AlignmentAux = AlignmentAux_;
-  static constexpr bool IsAuxOutSupported = true;
-};
-
-// Z = Aux
-// dY = alpha * acc + beta * C
-// D = d_activation(dY, Z)
-template<
-  class GmemLayoutTagAux_,
-  template <class> class ActivationFn_,
-  class ElementOutput_,
-  class ElementCompute_,
-  class ElementAux_ = ElementOutput_,
-  class ElementSource_ = ElementOutput_,
-  class ElementScalar_ = ElementCompute_,
-  int AlignmentAux_ = 128 / cute::sizeof_bits_v<ElementAux_>,
-  FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest
->
-struct LinCombDeEltAct
-    : LinearCombination<ElementOutput_, ElementCompute_, ElementSource_, ElementScalar_, RoundStyle_> {
-  using ActivationFn = ActivationFn_<ElementCompute_>;
-  static constexpr bool IsDeEltActSupported = true;
-
-  using ElementAux = ElementAux_;
-  using GmemLayoutTagAux = GmemLayoutTagAux_;
-  static constexpr int AlignmentAux = AlignmentAux_;
-  static constexpr bool IsAuxInSupported = true;
-};
-
-// Z = Aux
-// dY = alpha * acc + beta * C
-// D = d_activation(dY, Z)
-// dBias = sum of columns of D
-template<
-  class GmemLayoutTagAux_,
-  template <class> class ActivationFn_,
-  class ElementOutput_,
-  class ElementCompute_,
-  class ElementAux_ = ElementOutput_,
-  class ElementBias_ = ElementCompute_,
-  class ElementSource_ = ElementOutput_,
-  class ElementScalar_ = ElementCompute_,
-  int AlignmentAux_ = 128 / cute::sizeof_bits_v<ElementAux_>,
-  int AlignmentBias_ = 128 / cute::sizeof_bits_v<ElementBias_>,
-  FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest
->
-struct LinCombDeEltActDePerRowBias
-    : LinCombDeEltAct<GmemLayoutTagAux_, ActivationFn_, ElementOutput_, ElementCompute_,
-        ElementAux_, ElementSource_, ElementScalar_, AlignmentAux_, RoundStyle_> {
-  using ElementBias = ElementBias_;
-  static constexpr int AlignmentBias = AlignmentBias_;
-  static constexpr bool IsDePerRowBiasSupported = true;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::epilogue::fusion
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp
deleted file mode 100755
index e028846a4..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp
+++ /dev/null
@@ -1,1787 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-  \brief Fusion callbacks specializations for the sm90 TMA warp-specialized (ws) epilogue
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cute/tensor.hpp"
-
-#include "cutlass/epilogue/dispatch_policy.hpp"
-#include "cutlass/epilogue/fusion/callbacks.hpp"
-#include "cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp"
-#include "cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp"
-#include "cutlass/epilogue/fusion/sm90_visitor_store_tma_warpspecialized.hpp"
-#include "cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp"
-
-#include "cutlass/epilogue/fusion/sm90_visitor_topk_softmax.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::epilogue::fusion {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <class NodeOp, class... ChildOps>
-using Sm90EVT = Sm90TreeVisitor<NodeOp, ChildOps...>;
-
-// D = alpha * acc
-template <
-  int StagesC,
-  int StagesD,
-  int FragmentSize,
-  bool ReuseSmemC,
-  bool DelayTmaStore,
-  class ElementOutput,
-  class ElementCompute,
-  class ElementScalar,
-  FloatRoundStyle RoundStyle,
-  class CtaTileShapeMNK,
-  class EpilogueTile
->
-struct FusionCallbacks<
-    epilogue::Sm90TmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmemC, DelayTmaStore>,
-    fusion::ScaledAcc<ElementOutput, ElementCompute, ElementScalar, RoundStyle>,
-    CtaTileShapeMNK,
-    EpilogueTile
-> : Sm90EVT<Sm90Compute<multiplies, ElementOutput, ElementCompute, RoundStyle>,
-      Sm90ScalarBroadcast<ElementScalar, Stride<_0,_0,int64_t>>, 
-      Sm90AccFetch
-    > {
-  using Impl = 
-    Sm90EVT<Sm90Compute<multiplies, ElementOutput, ElementCompute, RoundStyle>,
-      Sm90ScalarBroadcast<ElementScalar, Stride<_0,_0,int64_t>>,
-      Sm90AccFetch
-    >;
-  using Operation = fusion::ScaledAcc<ElementOutput, ElementCompute, ElementScalar, RoundStyle>;
-
-  struct Arguments {
-    // Give a name and flat ordering to the fusion callback args
-    ElementScalar alpha = ElementScalar(1);
-    ElementScalar beta = ElementScalar(0);
-    ElementScalar const* alpha_ptr = nullptr;
-    ElementScalar const* beta_ptr = nullptr;
-
-    using StrideAlpha = Stride<_0,_0,int64_t>;
-    StrideAlpha dAlpha = {_0{}, _0{}, 0};
-
-    // Conversion to the args expected by the visitor implementation
-    // to_underlying_arguments will implicitly call this
-    operator typename Impl::Arguments() const {
-      return
-        {    // binary op : alpha * acc
-          {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha
-          {},                     // leaf args : acc
-          {} // binary args : multiplies
-        };   // end binary op
-    }
-  };
-
-  // Ctor inheritance
-  using Impl::Impl;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// D = alpha * acc + beta * C
-template<
-  class ElementOutput,
-  class ElementCompute,
-  class ElementSource = ElementOutput,
-  class ElementScalar = ElementCompute,
-  FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest
->
-using Sm90LinearCombination =
-  Sm90EVT<Sm90Compute<homogeneous_multiply_add, ElementOutput, ElementCompute, RoundStyle>, // beta * C + (alpha * acc)
-    Sm90ScalarBroadcast<ElementScalar, Stride<_0,_0,int64_t>>, // beta
-    Sm90SrcFetch<ElementSource>, // C
-    Sm90EVT<Sm90Compute<multiplies, ElementCompute, ElementCompute, RoundStyle>, // alpha * acc
-      Sm90ScalarBroadcast<ElementScalar, Stride<_0,_0,int64_t>>, // alpha
-      Sm90AccFetch // acc
-    >
-  >;
-
-template <
-  int StagesC,
-  int StagesD,
-  int FragmentSize,
-  bool ReuseSmemC,
-  bool DelayTmaStore,
-  class ElementOutput,
-  class ElementCompute,
-  class ElementSource,
-  class ElementScalar,
-  FloatRoundStyle RoundStyle,
-  class CtaTileShapeMNK,
-  class EpilogueTile
->
-struct FusionCallbacks<
-    epilogue::Sm90TmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmemC, DelayTmaStore>,
-    fusion::LinearCombination<ElementOutput, ElementCompute, ElementSource, ElementScalar, RoundStyle>,
-    CtaTileShapeMNK,
-    EpilogueTile
-> : Sm90LinearCombination<typename cutlass::detail::get_unpacked_element_type<ElementOutput>::type, ElementCompute, ElementSource, ElementScalar, RoundStyle> {
-
-  using Impl = Sm90LinearCombination<typename cutlass::detail::get_unpacked_element_type<ElementOutput>::type, ElementCompute, ElementSource, ElementScalar, RoundStyle>;
-  using Operation = fusion::LinearCombination<ElementOutput, ElementCompute, ElementSource, ElementScalar, RoundStyle>;
-
-  struct Arguments {
-    ElementScalar alpha = ElementScalar(1);
-    ElementScalar beta = ElementScalar(0);
-    ElementScalar const* alpha_ptr = nullptr;
-    ElementScalar const* beta_ptr = nullptr;
-
-    using StrideAlpha = Stride<_0,_0,int64_t>;
-    using StrideBeta  = Stride<_0,_0,int64_t>;
-    StrideAlpha dAlpha = {_0{}, _0{}, 0};
-    StrideBeta  dBeta  = {_0{}, _0{}, 0};
-
-    operator typename Impl::Arguments() const {
-      return
-        {    // ternary op : beta * C + (alpha * acc)
-          {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta
-          {},                   // leaf args : C
-          {                     // binary op : alpha * acc
-            {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha
-            {},                     // leaf args : acc
-            {}                  // binary args : multiplies
-          },                    // end binary op
-          {} // ternary args : multiply_add
-        };   // end ternary op
-    }
-  };
-
-  // Ctor inheritance
-  using Impl::Impl;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// D = alpha * acc + beta * C, where beta and alpha can be vectors for each batch
-template<
-  class ElementOutput,
-  class ElementCompute,
-  class ElementSource = ElementOutput,
-  class ElementScalar = ElementCompute,
-  FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest
->
-using Sm90LinearCombinationPtrArray =
-  Sm90EVT<Sm90Compute<homogeneous_multiply_add, ElementOutput, ElementCompute, RoundStyle>, // beta * C + (alpha * acc)
-    Sm90ScalarBroadcastPtrArray<ElementScalar, Stride<_0,_0,int64_t>>, // beta
-    Sm90SrcFetch<ElementSource>, // C
-    Sm90EVT<Sm90Compute<multiplies, ElementCompute, ElementCompute, RoundStyle>, // alpha * acc
-      Sm90ScalarBroadcastPtrArray<ElementScalar, Stride<_0,_0,int64_t>>, // alpha
-      Sm90AccFetch // acc
-    >
-  >;
-
-template <
-  int StagesC,
-  int StagesD,
-  int FragmentSize,
-  bool ReuseSmemC,
-  bool DelayTmaStore,
-  int NumEpilogueWarpGroups,
-  class ElementOutput,
-  class ElementCompute,
-  class ElementSource,
-  class ElementScalar,
-  FloatRoundStyle RoundStyle,
-  class CtaTileShapeMNK,
-  class EpilogueTile
->
-struct FusionCallbacks<
-    epilogue::Sm90PtrArrayTmaWarpSpecialized<StagesC, 
-                                             StagesD, 
-                                             FragmentSize, 
-                                             ReuseSmemC, 
-                                             DelayTmaStore, 
-                                             NumEpilogueWarpGroups
-                                            >,
-    fusion::LinearCombination<ElementOutput, ElementCompute, ElementSource, ElementScalar, RoundStyle>,
-    CtaTileShapeMNK,
-    EpilogueTile
-> : Sm90LinearCombinationPtrArray<typename cutlass::detail::get_unpacked_element_type<ElementOutput>::type, ElementCompute, ElementSource, ElementScalar, RoundStyle> {
-
-  using Impl = Sm90LinearCombinationPtrArray<typename cutlass::detail::get_unpacked_element_type<ElementOutput>::type, ElementCompute, ElementSource, ElementScalar, RoundStyle>;
-  using Operation = fusion::LinearCombination<ElementOutput, ElementCompute, ElementSource, ElementScalar, RoundStyle>;
-
-  struct Arguments {
-    ElementScalar alpha = ElementScalar(1);
-    ElementScalar beta = ElementScalar(0);
-    ElementScalar const* alpha_ptr = nullptr;
-    ElementScalar const* beta_ptr = nullptr;
-    ElementScalar const* const* alpha_ptr_array = nullptr;
-    ElementScalar const* const* beta_ptr_array = nullptr;
-
-    using StrideAlpha = Stride<_0,_0,int64_t>;
-    using StrideBeta  = Stride<_0,_0,int64_t>;
-    StrideAlpha dAlpha = {_0{}, _0{}, 0};
-    StrideBeta  dBeta  = {_0{}, _0{}, 0};
-
-    operator typename Impl::Arguments() const {
-      return
-        {    // ternary op : beta * C + (alpha * acc)
-          {{beta}, {beta_ptr}, {beta_ptr_array}, {dBeta}}, // leaf args : beta
-          {},                   // leaf args : C
-          {                     // binary op : alpha * acc
-            {{alpha}, {alpha_ptr}, {alpha_ptr_array}, {dAlpha}}, // leaf args : alpha
-            {},                     // leaf args : acc
-            {}                  // binary args : multiplies
-          },                    // end binary op
-          {} // ternary args : multiply_add
-        };   // end ternary op
-    }
-  };
-
-  // Ctor inheritance
-  using Impl::Impl;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// D = activation(alpha * acc + beta * C)
-template<
-  template <class> class ActivationFn,
-  class ElementOutput,
-  class ElementCompute,
-  class ElementSource = ElementOutput,
-  class ElementScalar = ElementCompute,
-  FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest
->
-using Sm90LinCombEltAct =
-  Sm90EVT<Sm90Compute<ActivationFn, ElementOutput, ElementCompute, RoundStyle>, // activation(beta * C + (alpha * acc))
-    Sm90LinearCombination<ElementCompute, ElementCompute, ElementSource, ElementScalar, RoundStyle> // beta * C + (alpha * acc)
-  >;
-
-template <
-  int StagesC,
-  int StagesD,
-  int FragmentSize,
-  bool ReuseSmemC,
-  bool DelayTmaStore,
-  template <class> class ActivationFn,
-  class ElementOutput,
-  class ElementCompute,
-  class ElementSource,
-  class ElementScalar,
-  FloatRoundStyle RoundStyle,
-  class CtaTileShapeMNK,
-  class EpilogueTile
->
-struct FusionCallbacks<
-    epilogue::Sm90TmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmemC, DelayTmaStore>,
-    fusion::LinCombEltAct<ActivationFn, ElementOutput, ElementCompute, ElementSource, ElementScalar, RoundStyle>,
-    CtaTileShapeMNK,
-    EpilogueTile
-> : Sm90LinCombEltAct<ActivationFn, ElementOutput, ElementCompute, ElementSource, ElementScalar, RoundStyle> {
-
-  using Impl = Sm90LinCombEltAct<ActivationFn, typename cutlass::detail::get_unpacked_element_type<ElementOutput>::type, ElementCompute, ElementSource, ElementScalar, RoundStyle>;
-  using Operation = fusion::LinCombEltAct<ActivationFn, ElementOutput, ElementCompute, ElementSource, ElementScalar, RoundStyle>;
-
-  struct Arguments {
-    ElementScalar alpha = ElementScalar(1);
-    ElementScalar beta = ElementScalar(0);
-    ElementScalar const* alpha_ptr = nullptr;
-    ElementScalar const* beta_ptr = nullptr;
-
-    using StrideAlpha = Stride<_0,_0,int64_t>;
-    using StrideBeta  = Stride<_0,_0,int64_t>;
-    StrideAlpha dAlpha = {_0{}, _0{}, 0};
-    StrideBeta  dBeta  = {_0{}, _0{}, 0};
-
-    using ActivationArguments = typename Sm90Compute<ActivationFn, ElementOutput, ElementCompute, RoundStyle>::Arguments;
-    ActivationArguments activation = ActivationArguments();
-
-    operator typename Impl::Arguments() const {
-      return
-        {    // unary op: activation(beta * C + (alpha * acc))
-          {    // ternary op : beta * C + (alpha * acc)
-            {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta
-            {},                   // leaf args : C
-            {                     // binary op : alpha * acc
-              {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha
-              {},                     // leaf args : acc
-              {}                  // binary args : multiplies
-            },                    // end binary op
-            {} // ternary args : multiply_add
-          },   // end ternary op
-          activation // unary args: activation
-        };   // end unary op
-    }
-  };
-
-  // Ctor inheritance
-  using Impl::Impl;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// D = activation(alpha * acc + beta * C), where beta and alpha can be vectors for each batch
-template<
-  template <class> class ActivationFn,
-  class ElementOutput,
-  class ElementCompute,
-  class ElementSource = ElementOutput,
-  class ElementScalar = ElementCompute,
-  FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest
->
-using Sm90LinCombEltActPtrArray =
-  Sm90EVT<Sm90Compute<ActivationFn, ElementOutput, ElementCompute, RoundStyle>, // activation(beta * C + (alpha * acc))
-    Sm90LinearCombinationPtrArray<ElementCompute, ElementCompute, ElementSource, ElementScalar, RoundStyle> // beta * C + (alpha * acc)
-  >;
-
-template <
-  int StagesC,
-  int StagesD,
-  int FragmentSize,
-  bool ReuseSmemC,
-  bool DelayTmaStore,
-  int NumEpilogueWarpGroups,
-  template <class> class ActivationFn,
-  class ElementOutput,
-  class ElementCompute,
-  class ElementSource,
-  class ElementScalar,
-  FloatRoundStyle RoundStyle,
-  class CtaTileShapeMNK,
-  class EpilogueTile
->
-struct FusionCallbacks<
-    epilogue::Sm90PtrArrayTmaWarpSpecialized<StagesC, 
-                                             StagesD, 
-                                             FragmentSize, 
-                                             ReuseSmemC, 
-                                             DelayTmaStore, 
-                                             NumEpilogueWarpGroups
-                                            >,
-    fusion::LinCombEltAct<ActivationFn, ElementOutput, ElementCompute, ElementSource, ElementScalar, RoundStyle>,
-    CtaTileShapeMNK,
-    EpilogueTile
-> : Sm90LinCombEltActPtrArray<ActivationFn, ElementOutput, ElementCompute, ElementSource, ElementScalar, RoundStyle> {
-
-  using Impl = Sm90LinCombEltActPtrArray<ActivationFn, typename cutlass::detail::get_unpacked_element_type<ElementOutput>::type, ElementCompute, ElementSource, ElementScalar, RoundStyle>;
-  using Operation = fusion::LinCombEltAct<ActivationFn, ElementOutput, ElementCompute, ElementSource, ElementScalar, RoundStyle>;
-
-  struct Arguments {
-    ElementScalar alpha = ElementScalar(1);
-    ElementScalar beta = ElementScalar(0);
-    ElementScalar const* alpha_ptr = nullptr;
-    ElementScalar const* beta_ptr = nullptr;
-    ElementScalar const* const* alpha_ptr_array = nullptr;
-    ElementScalar const* const* beta_ptr_array = nullptr;
-
-    using StrideAlpha = Stride<_0,_0,int64_t>;
-    using StrideBeta  = Stride<_0,_0,int64_t>;
-    StrideAlpha dAlpha = {_0{}, _0{}, 0};
-    StrideBeta  dBeta  = {_0{}, _0{}, 0};
-
-    using ActivationArguments = typename Sm90Compute<ActivationFn, ElementOutput, ElementCompute, RoundStyle>::Arguments;
-    ActivationArguments activation = ActivationArguments();
-
-    operator typename Impl::Arguments() const {
-      return
-        {    // unary op: activation(beta * C + (alpha * acc))
-          {    // ternary op : beta * C + (alpha * acc)
-            {{beta}, {beta_ptr}, {beta_ptr_array}, {dBeta}}, // leaf args : beta
-            {},                   // leaf args : C
-            {                     // binary op : alpha * acc
-              {{alpha}, {alpha_ptr}, {alpha_ptr_array}, {dAlpha}}, // leaf args : alpha
-              {},                     // leaf args : acc
-              {}                  // binary args : multiplies
-            },                    // end binary op
-            {} // ternary args : multiply_add
-          },   // end ternary op
-          activation // unary args: activation
-        };   // end unary op
-    }
-  };
-
-  // Ctor inheritance
-  using Impl::Impl;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// D = alpha * acc + beta * C + per-row bias
-template<
-  class CtaTileShapeMNK,
-  class ElementOutput,
-  class ElementCompute,
-  class ElementBias = ElementOutput,
-  class ElementSource = ElementOutput,
-  class ElementScalar = ElementCompute,
-  int AlignmentBias = 128 / sizeof_bits_v<ElementBias>,
-  FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest
->
-using Sm90LinCombPerRowBias =
-  Sm90EVT<Sm90Compute<homogeneous_multiply_add, ElementOutput, ElementCompute, RoundStyle>, // beta * C + (alpha * acc + bias)
-    Sm90ScalarBroadcast<ElementScalar, Stride<_0,_0,int64_t>>, // beta
-    Sm90SrcFetch<ElementSource>, // C
-    Sm90EVT<Sm90Compute<homogeneous_multiply_add, ElementCompute, ElementCompute, RoundStyle>, // alpha * acc + bias
-      Sm90ScalarBroadcast<ElementScalar, Stride<_0,_0,int64_t>>, // alpha
-      Sm90AccFetch, // acc
-      Sm90ColBroadcast<0, CtaTileShapeMNK, ElementBias, ElementCompute, Stride<_1,_0,int64_t>, AlignmentBias> // bias
-    >
-  >;
-
-template <
-  int StagesC,
-  int StagesD,
-  int FragmentSize,
-  bool ReuseSmemC,
-  bool DelayTmaStore,
-  class ElementOutput,
-  class ElementCompute,
-  class ElementBias,
-  class ElementSource,
-  class ElementScalar,
-  int AlignmentBias,
-  FloatRoundStyle RoundStyle,
-  class CtaTileShapeMNK,
-  class EpilogueTile
->
-struct FusionCallbacks<
-    epilogue::Sm90TmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmemC, DelayTmaStore>,
-    fusion::LinCombPerRowBias<ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle>,
-    CtaTileShapeMNK,
-    EpilogueTile
-> : Sm90LinCombPerRowBias<
-      CtaTileShapeMNK, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle> {
-  using Impl = Sm90LinCombPerRowBias<
-    CtaTileShapeMNK, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle>;
-  using Operation = fusion::LinCombPerRowBias<
-    ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle>;
-
-  struct Arguments {
-    ElementScalar alpha = ElementScalar(1);
-    ElementScalar beta = ElementScalar(0);
-    ElementScalar const* alpha_ptr = nullptr;
-    ElementScalar const* beta_ptr = nullptr;
-
-    using StrideAlpha = Stride<_0,_0,int64_t>;
-    using StrideBeta  = Stride<_0,_0,int64_t>;
-    StrideAlpha dAlpha = {_0{}, _0{}, 0};
-    StrideBeta  dBeta  = {_0{}, _0{}, 0};
-
-    using StrideBias = Stride<_1,_0,int64_t>;
-    ElementBias const* bias_ptr = nullptr;
-    StrideBias dBias = {};
-
-    operator typename Impl::Arguments() const {
-      return
-        {     // ternary op : beta * C + (alpha * acc + bias)
-          {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta
-          {},                   // leaf args : C
-          {                     // ternary op : alpha * acc + bias
-            {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha
-            {},                     // leaf args : acc
-            {bias_ptr, ElementBias(0), dBias}, // leaf args : bias
-            {}                  // ternary args : multiply_add
-          },                    // end ternary op
-          {} // ternary args : multiply_add
-        };   // end ternary op
-    }
-  };
-
-  // Ctor inheritance
-  using Impl::Impl;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// D = alpha * acc + beta * C + per-column bias
-template<
-  int StagesC,
-  class CtaTileShapeMNK,
-  class EpilogueTile,
-  class ElementOutput,
-  class ElementCompute,
-  class ElementBias = ElementOutput,
-  class ElementSource = ElementOutput,
-  class ElementScalar = ElementCompute,
-  int AlignmentBias = 128 / sizeof_bits_v<ElementBias>,
-  FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest
->
-using Sm90LinCombPerColBias =
-  Sm90EVT<Sm90Compute<homogeneous_multiply_add, ElementOutput, ElementCompute, RoundStyle>, // beta * C + (alpha * acc + bias)
-    Sm90ScalarBroadcast<ElementScalar, Stride<_0,_0,int64_t>>, // beta
-    Sm90SrcFetch<ElementSource>, // C
-    Sm90EVT<Sm90Compute<homogeneous_multiply_add, ElementCompute, ElementCompute, RoundStyle>, // alpha * acc + bias
-      Sm90ScalarBroadcast<ElementScalar, Stride<_0,_0,int64_t>>, // alpha
-      Sm90AccFetch, // acc
-      Sm90RowBroadcast<0, CtaTileShapeMNK, ElementBias, ElementCompute, Stride<_0,_1,int64_t>, AlignmentBias> // bias
-    >
-  >;
-
-template <
-  int StagesC,
-  int StagesD,
-  int FragmentSize,
-  bool ReuseSmemC,
-  bool DelayTmaStore,
-  class ElementOutput,
-  class ElementCompute,
-  class ElementBias,
-  class ElementSource,
-  class ElementScalar,
-  int AlignmentBias,
-  FloatRoundStyle RoundStyle,
-  class CtaTileShapeMNK,
-  class EpilogueTile
->
-struct FusionCallbacks<
-    epilogue::Sm90TmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmemC, DelayTmaStore>,
-    fusion::LinCombPerColBias<ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle>,
-    CtaTileShapeMNK,
-    EpilogueTile
-> : Sm90LinCombPerColBias<
-      StagesC, CtaTileShapeMNK, EpilogueTile, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle> {
-  using Impl = Sm90LinCombPerColBias<
-    StagesC, CtaTileShapeMNK, EpilogueTile, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle>;
-  using Operation = fusion::LinCombPerColBias<
-    ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle>;
-
-  struct Arguments {
-    ElementScalar alpha = ElementScalar(1);
-    ElementScalar beta = ElementScalar(0);
-    ElementScalar const* alpha_ptr = nullptr;
-    ElementScalar const* beta_ptr = nullptr;
-
-    using StrideAlpha = Stride<_0,_0,int64_t>;
-    using StrideBeta  = Stride<_0,_0,int64_t>;
-    StrideAlpha dAlpha = {_0{}, _0{}, 0};
-    StrideBeta  dBeta  = {_0{}, _0{}, 0};
-
-    using StrideBias = Stride<_0,_1,int64_t>;
-    ElementBias const* bias_ptr = nullptr;
-    StrideBias dBias = {};
-
-    operator typename Impl::Arguments() const {
-      return
-        {     // ternary op : beta * C + (alpha * acc + bias)
-          {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta
-          {},                   // leaf args : C
-          {                     // ternary op : alpha * acc + bias
-            {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha
-            {},                     // leaf args : acc
-            {bias_ptr, ElementBias(0), dBias}, // leaf args : bias
-            {}                  // ternary args : multiply_add
-          },                    // end ternary op
-          {} // ternary args : multiply_add
-        };   // end ternary op
-    }
-  };
-
-  // Ctor inheritance
-  using Impl::Impl;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// D = activation(alpha * acc + beta * C + per-row bias)
-template<
-  class CtaTileShapeMNK,
-  template <class> class ActivationFn,
-  class ElementOutput,
-  class ElementCompute,
-  class ElementBias = ElementOutput,
-  class ElementSource = ElementOutput,
-  class ElementScalar = ElementCompute,
-  int AlignmentBias = 128 / sizeof_bits_v<ElementBias>,
-  FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest
->
-using Sm90LinCombPerRowBiasEltAct =
-  Sm90EVT<Sm90Compute<ActivationFn, ElementOutput, ElementCompute, RoundStyle>,
-    Sm90LinCombPerRowBias<CtaTileShapeMNK, ElementCompute, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle>
-  >;
-
-template <
-  int StagesC,
-  int StagesD,
-  int FragmentSize,
-  bool ReuseSmemC,
-  bool DelayTmaStore,
-  template <class> class ActivationFn,
-  class ElementOutput,
-  class ElementCompute,
-  class ElementBias,
-  class ElementSource,
-  class ElementScalar,
-  int AlignmentBias,
-  FloatRoundStyle RoundStyle,
-  class CtaTileShapeMNK,
-  class EpilogueTile
->
-struct FusionCallbacks<
-    epilogue::Sm90TmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmemC, DelayTmaStore>,
-    fusion::LinCombPerRowBiasEltAct<
-      ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle
-    >,
-    CtaTileShapeMNK,
-    EpilogueTile
-> : Sm90LinCombPerRowBiasEltAct<
-      CtaTileShapeMNK, ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle
-    > {
-
-  using Impl =
-    Sm90LinCombPerRowBiasEltAct<
-      CtaTileShapeMNK, ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle
-    >;
-  using Operation =
-    fusion::LinCombPerRowBiasEltAct<
-      ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle
-    >;
-
-  struct Arguments {
-    ElementScalar alpha = ElementScalar(1);
-    ElementScalar beta = ElementScalar(0);
-    ElementScalar const* alpha_ptr = nullptr;
-    ElementScalar const* beta_ptr = nullptr;
-
-    using StrideAlpha = Stride<_0,_0,int64_t>;
-    using StrideBeta  = Stride<_0,_0,int64_t>;
-    StrideAlpha dAlpha = {_0{}, _0{}, 0};
-    StrideBeta  dBeta  = {_0{}, _0{}, 0};
-
-    using StrideBias = Stride<_1,_0,int64_t>;
-    ElementBias const* bias_ptr = nullptr;
-    StrideBias dBias = {};
-
-    using ActivationArguments = typename Sm90Compute<ActivationFn, ElementOutput, ElementCompute, RoundStyle>::Arguments;
-    ActivationArguments activation = ActivationArguments();
-
-    operator typename Impl::Arguments() const {
-      return
-        {    // unary op : activation(beta * C + (alpha * acc + bias))
-          {    // ternary op : beta * C + (alpha * acc + bias)
-            {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta
-            {},                   // leaf args : C
-            {                     // ternary op : alpha * acc + bias
-              {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha
-              {},                     // leaf args : acc
-              {bias_ptr, ElementBias(0), dBias}, // leaf args : bias
-              {}                  // ternary args : multiply_add
-            },                    // end ternary op
-            {} // ternary args : multiply_add
-          },   // end ternary op
-          activation // unary args : activation
-        };   // end unary op
-    }
-  };
-
-  // Ctor inheritance
-  using Impl::Impl;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// D = activation(alpha * acc + beta * C + per-row bias)
-// Aux = alpha * acc + beta * C + per-row bias)
-template<
-  class CtaTileShapeMNK,
-  class EpilogueTile,
-  int Stages,
-  class StrideAux,
-  class SmemLayoutAtom,
-  class CopyOpR2S,
-  template <class> class ActivationFn,
-  class ElementOutput,
-  class ElementCompute,
-  class ElementAux = ElementOutput,
-  class ElementBias = ElementOutput,
-  class ElementSource = ElementOutput,
-  class ElementScalar = ElementCompute,
-  int AlignmentAux = 128 / sizeof_bits_v<ElementAux>,
-  int AlignmentBias = 128 / sizeof_bits_v<ElementBias>,
-  FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest
->
-using Sm90LinCombPerRowBiasEltActAux =
-  Sm90EVT<Sm90Compute<ActivationFn, ElementOutput, ElementCompute, RoundStyle>,
-    Sm90EVT<Sm90AuxStore<Stages, EpilogueTile, ElementAux, RoundStyle, StrideAux, SmemLayoutAtom, CopyOpR2S, AlignmentAux>,
-      Sm90LinCombPerRowBias<CtaTileShapeMNK, ElementCompute, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle>
-    >
-  >;
-
-template <
-  int StagesC,
-  int StagesD,
-  int FragmentSize,
-  bool ReuseSmemC,
-  bool DelayTmaStore,
-  class GmemLayoutTagAux,
-  template <class> class ActivationFn,
-  class ElementOutput,
-  class ElementCompute,
-  class ElementAux,
-  class ElementBias,
-  class ElementSource,
-  class ElementScalar,
-  int AlignmentAux,
-  int AlignmentBias,
-  FloatRoundStyle RoundStyle,
-  class CtaTileShapeMNK,
-  class EpilogueTile,
-  class SmemLayoutAtom,
-  class CopyOpR2S
->
-struct FusionCallbacks<
-    epilogue::Sm90TmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmemC, DelayTmaStore>,
-    fusion::LinCombPerRowBiasEltActAux<
-      GmemLayoutTagAux, ActivationFn, ElementOutput, ElementCompute,
-      ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle
-    >,
-    CtaTileShapeMNK,
-    EpilogueTile,
-    SmemLayoutAtom,
-    CopyOpR2S
-> : Sm90LinCombPerRowBiasEltActAux<
-      CtaTileShapeMNK, EpilogueTile, StagesD, cutlass::gemm::TagToStrideC_t<GmemLayoutTagAux>, SmemLayoutAtom, CopyOpR2S, ActivationFn,
-      ElementOutput, ElementCompute, ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle
-    > {
-
-  using Impl =
-    Sm90LinCombPerRowBiasEltActAux<
-      CtaTileShapeMNK, EpilogueTile, StagesD, cutlass::gemm::TagToStrideC_t<GmemLayoutTagAux>, SmemLayoutAtom, CopyOpR2S, ActivationFn,
-      ElementOutput, ElementCompute, ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle
-    >;
-  using Operation =
-    fusion::LinCombPerRowBiasEltActAux<
-      GmemLayoutTagAux, ActivationFn,
-      ElementOutput, ElementCompute, ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle
-    >;
-
-  struct Arguments {
-    ElementScalar alpha = ElementScalar(1);
-    ElementScalar beta = ElementScalar(0);
-    ElementScalar const* alpha_ptr = nullptr;
-    ElementScalar const* beta_ptr = nullptr;
-
-    using StrideAlpha = Stride<_0,_0,int64_t>;
-    using StrideBeta  = Stride<_0,_0,int64_t>;
-    StrideAlpha dAlpha = {_0{}, _0{}, 0};
-    StrideBeta  dBeta  = {_0{}, _0{}, 0};
-
-    using StrideBias = Stride<_1,_0,int64_t>;
-    ElementBias const* bias_ptr = nullptr;
-    StrideBias dBias = {};
-
-    using ActivationArguments = typename Sm90Compute<ActivationFn, ElementOutput, ElementCompute, RoundStyle>::Arguments;
-    ActivationArguments activation = ActivationArguments();
-
-    using StrideAux = cutlass::gemm::TagToStrideC_t<GmemLayoutTagAux>;
-    ElementAux* aux_ptr = nullptr;
-    StrideAux dAux = {};
-
-    operator typename Impl::Arguments() const {
-      return
-        {    // unary op : activation(store(beta * C + (alpha * acc + bias)))
-          {                 // unary op : store(beta * C + (alpha * acc + bias))
-            {                  // ternary op : beta * C + (alpha * acc + bias)
-              {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta
-              {},                   // leaf args : C
-              {                     // ternary op : alpha * acc + bias
-                {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha
-                {},                     // leaf args : acc
-                {bias_ptr, ElementBias(0), dBias}, // leaf args : bias
-                {}                  // ternary args : multiply_add
-              },                    // end ternary op
-              {}               // ternary args : multiply_add
-            },                 // end ternary op
-            {aux_ptr, dAux} // unary args : store
-          },                // end unary op
-          activation // unary args : activation
-        };   // end unary op
-    }
-  };
-
-  // Ctor inheritance
-  using Impl::Impl;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// D = per-row alpha * acc + per-row beta * C + per-row bias
-template<
-  class CtaTileShapeMNK,
-  class ElementOutput,
-  class ElementCompute,
-  class ElementBias = ElementOutput,
-  class ElementSource = ElementOutput,
-  class ElementScalar = ElementCompute,
-  int AlignmentBias = 128 / sizeof_bits_v<ElementBias>,
-  int AlignmentScalar = 128 / sizeof_bits_v<ElementScalar>,
-  FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest
->
-using Sm90PerRowLinCombPerRowBias =
-  Sm90EVT<Sm90Compute<homogeneous_multiply_add, ElementOutput, ElementCompute, RoundStyle>, // beta * C + (alpha * acc + bias)
-    Sm90ColBroadcast<0, CtaTileShapeMNK, ElementScalar, ElementCompute, Stride<bool,_0,int64_t>, AlignmentScalar>, // beta, dynamic scalar/vector broadcast
-    Sm90SrcFetch<ElementSource>, // C
-    Sm90EVT<Sm90Compute<homogeneous_multiply_add, ElementCompute, ElementCompute, RoundStyle>, // alpha * acc + bias
-      Sm90ColBroadcast<0, CtaTileShapeMNK, ElementScalar, ElementCompute, Stride<bool,_0,int64_t>, AlignmentScalar>, // alpha, dynamic scalar/vector broadcast
-      Sm90AccFetch, // acc
-      Sm90ColBroadcast<0, CtaTileShapeMNK, ElementBias, ElementCompute, Stride<_1,_0,int64_t>, AlignmentBias> // bias
-    >
-  >;
-
-// D = activation(per-row alpha * acc + per-row beta * C + per-row bias)
-template<
-  class CtaTileShapeMNK,
-  template <class> class ActivationFn,
-  class ElementOutput,
-  class ElementCompute,
-  class ElementBias = ElementOutput,
-  class ElementSource = ElementOutput,
-  class ElementScalar = ElementCompute,
-  int AlignmentBias = 128 / sizeof_bits_v<ElementBias>,
-  int AlignmentScalar = 128 / sizeof_bits_v<ElementScalar>,
-  FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest
->
-using Sm90PerRowLinCombPerRowBiasEltAct =
-  Sm90EVT<Sm90Compute<ActivationFn, ElementOutput, ElementCompute, RoundStyle>,
-    Sm90PerRowLinCombPerRowBias<CtaTileShapeMNK, ElementCompute, ElementCompute,
-                                ElementBias, ElementSource, ElementScalar, AlignmentBias, AlignmentScalar, RoundStyle>
-  >;
-
-template <
-  int StagesC,
-  int StagesD,
-  int FragmentSize,
-  bool ReuseSmemC,
-  bool DelayTmaStore,
-  template <class> class ActivationFn,
-  class ElementOutput,
-  class ElementCompute,
-  class ElementBias,
-  class ElementSource,
-  class ElementScalar,
-  int AlignmentBias,
-  int AlignmentScalar,
-  FloatRoundStyle RoundStyle,
-  class CtaTileShapeMNK,
-  class EpilogueTile
->
-struct FusionCallbacks<
-    epilogue::Sm90TmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmemC, DelayTmaStore>,
-    fusion::PerRowLinCombPerRowBiasEltAct<
-      ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, AlignmentScalar, RoundStyle
-    >,
-    CtaTileShapeMNK,
-    EpilogueTile
-> : Sm90PerRowLinCombPerRowBiasEltAct<
-      CtaTileShapeMNK, ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, AlignmentScalar, RoundStyle
-    > {
-
-  using Impl =
-    Sm90PerRowLinCombPerRowBiasEltAct<
-      CtaTileShapeMNK, ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, AlignmentScalar, RoundStyle
-    >;
-  using Operation =
-    fusion::PerRowLinCombPerRowBiasEltAct<
-      ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, AlignmentScalar, RoundStyle
-    >;
-
-  struct Arguments {
-    using StrideAlpha = Stride<bool,_0,int64_t>;
-    using StrideBeta  = Stride<bool,_0,int64_t>;
-    ElementScalar alpha = ElementScalar(1);
-    ElementScalar beta = ElementScalar(0);
-    ElementScalar const* alpha_ptr = nullptr;
-    ElementScalar const* beta_ptr = nullptr;
-    StrideAlpha dAlpha = {bool(1), _0{}, 0};
-    StrideBeta  dBeta  = {bool(1), _0{}, 0};
-
-    using StrideBias = Stride<_1,_0,int64_t>;
-    ElementBias const* bias_ptr = nullptr;
-    StrideBias dBias = {};
-
-    using ActivationArguments = typename Sm90Compute<ActivationFn, ElementOutput, ElementCompute, RoundStyle>::Arguments;
-    ActivationArguments activation = ActivationArguments();
-
-    operator typename Impl::Arguments() const {
-      return
-        {    // unary op : activation(beta * C + (alpha * acc + bias))
-          {    // ternary op : beta * C + (alpha * acc + bias)
-            {beta_ptr, beta, dBeta}, // leaf args : beta
-            {},                      // leaf args : C
-            {                        // ternary op : alpha * acc + bias
-              {alpha_ptr, alpha, dAlpha}, // leaf args : alpha
-              {},                         // leaf args : acc
-              {bias_ptr, ElementBias(0), dBias}, // leaf args : bias
-              {}                     // ternary args : multiply_add
-            },                       // end ternary op
-            {} // ternary args : multiply_add
-          },   // end ternary op
-          activation // unary args : activation
-        };   // end unary op
-    }
-  };
-
-  // Ctor inheritance
-  using Impl::Impl;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace detail {
-
-template <typename T>
-constexpr bool is_fp8_v = cute::is_same_v<T,float_e4m3_t> || cute::is_same_v<T,float_e5m2_t>;
-
-// We only apply the scaling factor if output is fp8
-template <typename ElementOutput>
-struct ScaleOutOp { template <typename T> using Op = cutlass::first<T>; };
-template <>
-struct ScaleOutOp<float_e4m3_t> { template <typename T> using Op = cutlass::multiplies<T>; };
-template <>
-struct ScaleOutOp<float_e5m2_t> { template <typename T> using Op = cutlass::multiplies<T>; };
-
-template <typename T>
-using amax = cutlass::maximum_absolute_value_reduction<T, true>; // propogate nans
-
-}; // end namespace detail
-
-// D = scale_a * scale_b * alpha * acc + scale_c * beta * C + per-row bias
-template<
-  class CtaTileShapeMNK,
-  class ElementOutput,
-  class ElementCompute,
-  class ElementBias = ElementOutput,
-  class ElementSource = ElementOutput,
-  class ElementScalar = ElementCompute,
-  int AlignmentBias = 128 / sizeof_bits_v<ElementBias>,
-  FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest
->
-using Sm90ScaledLinCombPerRowBias =
-  Sm90EVT<Sm90Compute<homogeneous_multiply_add, ElementOutput, ElementCompute, RoundStyle>, // beta * C + (alpha * acc + bias)
-    Sm90ScalarBroadcast<ElementScalar, Stride<_0,_0,int64_t>, 2>, // scale_c * beta
-    Sm90SrcFetch<ElementSource>, // C
-    Sm90EVT<Sm90Compute<homogeneous_multiply_add, ElementCompute, ElementCompute, RoundStyle>, // alpha * acc + bias
-      Sm90ScalarBroadcast<ElementScalar, Stride<_0,_0,int64_t>, 3>, // scale_a * scale_b * alpha
-      Sm90AccFetch, // acc
-      Sm90ColBroadcast<0, CtaTileShapeMNK, ElementBias, ElementCompute, Stride<_1,_0,int64_t>, AlignmentBias> // bias
-    >
-  >;
-
-// Z = scale_a * scale_b * alpha * acc + beta * scale_c * C + per-row bias
-// if D is fp8 
-//   D = scale_d * activation(Z)
-// else
-//   D = activation(Z)
-template<
-  class CtaTileShapeMNK,
-  template <class> class ActivationFn,
-  class ElementOutput,
-  class ElementCompute,
-  class ElementBias = ElementOutput,
-  class ElementSource = ElementOutput,
-  class ElementScalar = ElementCompute,
-  int AlignmentBias = 128 / sizeof_bits_v<ElementBias>,
-  FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest
->
-using Sm90ScaledLinCombPerRowBiasEltAct =
-  Sm90EVT<Sm90Compute<detail::ScaleOutOp<ElementOutput>::template Op, ElementOutput, ElementCompute, RoundStyle>, // activation(Z) * scale_d
-    Sm90EVT<Sm90Compute<ActivationFn, ElementCompute, ElementCompute, RoundStyle>, // activation(Z)
-      // Z = scale_a * scale_b * alpha * acc + beta * scale_c * C + per-row bias
-      Sm90ScaledLinCombPerRowBias<CtaTileShapeMNK, ElementCompute, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle>
-    >,
-    Sm90ScalarBroadcast<ElementScalar> // scale_d
-  >;
-
-template <
-  int StagesC,
-  int StagesD,
-  int FragmentSize,
-  bool ReuseSmemC,
-  bool DelayTmaStore,
-  template <class> class ActivationFn,
-  class ElementOutput,
-  class ElementCompute,
-  class ElementBias,
-  class ElementSource,
-  class ElementScalar,
-  int AlignmentBias,
-  FloatRoundStyle RoundStyle,
-  class CtaTileShapeMNK,
-  class EpilogueTile
->
-struct FusionCallbacks<
-    epilogue::Sm90TmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmemC, DelayTmaStore>,
-    fusion::ScaledLinCombPerRowBiasEltAct<
-      ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle
-    >,
-    CtaTileShapeMNK,
-    EpilogueTile
-> : Sm90ScaledLinCombPerRowBiasEltAct<
-      CtaTileShapeMNK, ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle
-    > {
-
-  using Impl =
-    Sm90ScaledLinCombPerRowBiasEltAct<
-      CtaTileShapeMNK, ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle
-    >;
-  using Operation =
-    fusion::ScaledLinCombPerRowBiasEltAct<
-      ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle
-    >;
-
-  struct Arguments {
-    ElementScalar alpha = ElementScalar(1);
-    ElementScalar beta = ElementScalar(0);
-    ElementScalar const* alpha_ptr = nullptr;
-    ElementScalar const* beta_ptr = nullptr;
-
-    ElementScalar scale_a = ElementScalar(1);
-    ElementScalar scale_b = ElementScalar(1);
-    ElementScalar scale_c = ElementScalar(1);
-    ElementScalar scale_d = ElementScalar(1);
-    ElementScalar const* scale_a_ptr = nullptr;
-    ElementScalar const* scale_b_ptr = nullptr;
-    ElementScalar const* scale_c_ptr = nullptr;
-    ElementScalar const* scale_d_ptr = nullptr;
-
-    using StrideAlpha = Stride<_0,_0,int64_t>;
-    using StrideBeta  = Stride<_0,_0,int64_t>;
-    StrideAlpha dAlpha = {_0{}, _0{}, 0};
-    StrideBeta  dBeta  = {_0{}, _0{}, 0};
-
-    using StrideBias = Stride<_1,_0,int64_t>;
-    ElementBias const* bias_ptr = nullptr;
-    StrideBias dBias = {};
-
-    using ActivationArguments = typename Sm90Compute<ActivationFn, ElementOutput, ElementCompute, RoundStyle>::Arguments;
-    ActivationArguments activation = ActivationArguments();
-
-    operator typename Impl::Arguments() const {
-      return
-        {    // binary op : activation((scale_c * beta) * C + ((scale_a * scale_b * alpha) * acc + bias)) * scale_d
-          {    // unary op : activation((scale_c * beta) * C + ((scale_a * scale_b * alpha) * acc + bias))
-            {    // ternary op : (scale_c * beta) * C + ((scale_a * scale_b * alpha) * acc + bias)
-              {{beta, scale_c},
-               {beta_ptr, scale_c_ptr},
-               {dBeta, {_0{}, _0{}, 0}}
-               },  // leaf args : (scale_c * beta)
-              {},  // leaf args : C
-              {    // ternary op : (scale_a * scale_b * alpha) * acc + bias
-                {{alpha, scale_a, scale_b}, 
-                 {alpha_ptr, scale_a_ptr, scale_b_ptr},
-                 {dAlpha, {_0{}, _0{}, 0}, {_0{}, _0{}, 0}}
-                 },                   // leaf args : (scale_a * scale_b * alpha)
-                {},                   // leaf args : acc
-                {bias_ptr, ElementBias(0), dBias}, // leaf args : bias
-                {} // ternary args : multiply_add
-              },   // end ternary op
-              {} // ternary args : multiply_add
-            },   // end ternary op
-            activation // unary args : activation
-          },   // end unary op
-          {{scale_d},
-           {scale_d_ptr}
-           },   // leaf args : scale_d
-          {} // binary args : multiplies or first
-        };   // end binary op
-    }
-  };
-
-  // Ctor inheritance
-  using Impl::Impl;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Z = scale_a * scale_b * alpha * acc + scale_c * beta * C + per-row bias
-// if D is fp8 
-//   amax_d = max(abs(elements in activation(Z)))
-//   D = scale_d * activation(Z)
-// else
-//   D = activation(Z)
-// if Aux is fp8 
-//   amax_aux = max(abs(elements in Z))
-//   Aux = scale_aux * Z
-// else
-//   Aux = Z
-
-// fp8 aux specialization
-template<
-  class CtaTileShapeMNK,
-  class EpilogueTile,
-  int StagesD,
-  class StrideAux,
-  class SmemLayoutAtom,
-  class CopyOpR2S,
-  template <class> class ActivationFn,
-  class ElementOutput,
-  class ElementCompute,
-  class ElementAux = ElementOutput,
-  class ElementAmax = ElementCompute,
-  class ElementBias = ElementOutput,
-  class ElementSource = ElementOutput,
-  class ElementScalar = ElementCompute,
-  int AlignmentAux = 128 / sizeof_bits_v<ElementAux>,
-  int AlignmentBias = 128 / sizeof_bits_v<ElementBias>,
-  FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest
->
-using Sm90ScaledLinCombPerRowBiasEltActAmaxAuxFp8 =
-  Sm90SplitTreeVisitor<
-    // Z = scale_a * scale_b * alpha * acc + scale_c * beta * C + per-row bias
-    Sm90ScaledLinCombPerRowBias<CtaTileShapeMNK, ElementCompute, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle>,
-    // D = activation(Z) * scale_d, amax_d = max(abs(elements in D))
-    Sm90EVT<Sm90Compute<detail::ScaleOutOp<ElementOutput>::template Op, ElementOutput, ElementCompute, RoundStyle>, // activation(Z) * scale_d
-      Sm90EVT<Sm90ScalarReduction<detail::amax, atomic_maximum, ElementAmax, ElementCompute, RoundStyle>, // amax_d
-        Sm90EVT<Sm90Compute<ActivationFn, ElementCompute, ElementCompute, RoundStyle>, // activation(Z)
-          Sm90SplitTreeFetch // Z
-        >
-      >,
-      Sm90ScalarBroadcast<ElementScalar> // scale_d
-    >,
-    // Aux = Z * scale_aux, amax_aux = max(abs(elements in Aux))
-    Sm90EVT<Sm90AuxStore<StagesD, EpilogueTile, ElementAux, RoundStyle, StrideAux, SmemLayoutAtom, CopyOpR2S, AlignmentAux>, // store(Aux)
-      Sm90EVT<Sm90Compute<cutlass::multiplies, ElementCompute, ElementCompute, RoundStyle>, // Z * scale_aux
-        Sm90EVT<Sm90ScalarReduction<detail::amax, atomic_maximum, ElementAmax, ElementCompute, RoundStyle>, // amax_aux
-          Sm90SplitTreeFetch // Z
-        >,
-        Sm90ScalarBroadcast<ElementScalar> // scale_aux
-      >
-    >
-  >;
-
-// non-fp8 aux specialization
-// lets us use some EVT specializations such as relu + uint1b_t aux
-template<
-  class CtaTileShapeMNK,
-  class EpilogueTile,
-  int StagesD,
-  class StrideAux,
-  class SmemLayoutAtom,
-  class CopyOpR2S,
-  template <class> class ActivationFn,
-  class ElementOutput,
-  class ElementCompute,
-  class ElementAux = ElementOutput,
-  class ElementAmax = ElementCompute,
-  class ElementBias = ElementOutput,
-  class ElementSource = ElementOutput,
-  class ElementScalar = ElementCompute,
-  int AlignmentAux = 128 / sizeof_bits_v<ElementAux>,
-  int AlignmentBias = 128 / sizeof_bits_v<ElementBias>,
-  FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest
->
-using Sm90ScaledLinCombPerRowBiasEltActAmaxAuxNotFp8 =
-  // D = activation(Z) * scale_d, amax_d = max(abs(elements in D))
-  Sm90EVT<Sm90Compute<detail::ScaleOutOp<ElementOutput>::template Op, ElementOutput, ElementCompute, RoundStyle>, // activation(Z) * scale_d
-    Sm90EVT<Sm90ScalarReduction<detail::amax, atomic_maximum, ElementAmax, ElementCompute, RoundStyle>, // amax_d
-      Sm90EVT<Sm90Compute<ActivationFn, ElementCompute, ElementCompute, RoundStyle>, // activation(Z)
-        Sm90EVT<Sm90AuxStore<StagesD, EpilogueTile, ElementAux, RoundStyle, StrideAux, SmemLayoutAtom, CopyOpR2S, AlignmentAux>, // Aux = Z
-          // Z = scale_a * scale_b * alpha * acc + scale_c * beta * C + per-row bias
-          Sm90ScaledLinCombPerRowBias<CtaTileShapeMNK, ElementCompute, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle>
-        >
-      >
-    >,
-    Sm90ScalarBroadcast<ElementScalar> // scale_d
-  >;
-
-// dispatcher
-template<
-  class CtaTileShapeMNK,
-  class EpilogueTile,
-  int StagesD,
-  class StrideAux,
-  class SmemLayoutAtom,
-  class CopyOpR2S,
-  template <class> class ActivationFn,
-  class ElementOutput,
-  class ElementCompute,
-  class ElementAux = ElementOutput,
-  class ElementAmax = ElementCompute,
-  class ElementBias = ElementOutput,
-  class ElementSource = ElementOutput,
-  class ElementScalar = ElementCompute,
-  int AlignmentAux = 128 / sizeof_bits_v<ElementAux>,
-  int AlignmentBias = 128 / sizeof_bits_v<ElementBias>,
-  FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest
->
-using Sm90ScaledLinCombPerRowBiasEltActAmaxAux = conditional_t<detail::is_fp8_v<ElementAux>,
-  Sm90ScaledLinCombPerRowBiasEltActAmaxAuxFp8<
-    CtaTileShapeMNK, EpilogueTile, StagesD, StrideAux, SmemLayoutAtom, CopyOpR2S, ActivationFn,
-    ElementOutput, ElementCompute, ElementAux, ElementAmax, ElementBias, ElementSource, ElementScalar,AlignmentAux, AlignmentBias, RoundStyle
-  >,
-  Sm90ScaledLinCombPerRowBiasEltActAmaxAuxNotFp8<
-    CtaTileShapeMNK, EpilogueTile, StagesD, StrideAux, SmemLayoutAtom, CopyOpR2S, ActivationFn,
-    ElementOutput, ElementCompute, ElementAux, ElementAmax, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle
-  >
->;
-
-
-template <
-  int StagesC,
-  int StagesD,
-  int FragmentSize,
-  bool ReuseSmemC,
-  bool DelayTmaStore,
-  class GmemLayoutTagAux,
-  template <class> class ActivationFn,
-  class ElementOutput,
-  class ElementCompute,
-  class ElementAux,
-  class ElementAmax,
-  class ElementBias,
-  class ElementSource,
-  class ElementScalar,
-  int AlignmentAux,
-  int AlignmentBias,
-  FloatRoundStyle RoundStyle,
-  class CtaTileShapeMNK,
-  class EpilogueTile,
-  class SmemLayoutAtom,
-  class CopyOpR2S
->
-struct FusionCallbacks<
-    epilogue::Sm90TmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmemC, DelayTmaStore>,
-    fusion::ScaledLinCombPerRowBiasEltActAmaxAux<
-      GmemLayoutTagAux, ActivationFn, ElementOutput, ElementCompute,
-      ElementAux, ElementAmax, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle
-    >,
-    CtaTileShapeMNK,
-    EpilogueTile,
-    SmemLayoutAtom,
-    CopyOpR2S
-> : Sm90ScaledLinCombPerRowBiasEltActAmaxAux<
-      CtaTileShapeMNK, EpilogueTile, StagesD, cutlass::gemm::TagToStrideC_t<GmemLayoutTagAux>,
-      SmemLayoutAtom, CopyOpR2S, ActivationFn,
-      ElementOutput, ElementCompute, ElementAux, ElementAmax, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle
-    > {
-
-  using Impl =
-    Sm90ScaledLinCombPerRowBiasEltActAmaxAux<
-      CtaTileShapeMNK, EpilogueTile, StagesD, cutlass::gemm::TagToStrideC_t<GmemLayoutTagAux>,
-      SmemLayoutAtom, CopyOpR2S, ActivationFn,
-      ElementOutput, ElementCompute, ElementAux, ElementAmax, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle
-    >;
-  using Operation =
-    fusion::ScaledLinCombPerRowBiasEltActAmaxAux<
-      GmemLayoutTagAux, ActivationFn, ElementOutput, ElementCompute,
-      ElementAux, ElementAmax, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle
-    >;
-
-  struct Arguments {
-    ElementScalar alpha = ElementScalar(1);
-    ElementScalar beta = ElementScalar(0);
-    ElementScalar const* alpha_ptr = nullptr;
-    ElementScalar const* beta_ptr = nullptr;
-
-    ElementScalar scale_a = ElementScalar(1);
-    ElementScalar scale_b = ElementScalar(1);
-    ElementScalar scale_c = ElementScalar(1);
-    ElementScalar scale_d = ElementScalar(1);
-    ElementScalar const* scale_a_ptr = nullptr;
-    ElementScalar const* scale_b_ptr = nullptr;
-    ElementScalar const* scale_c_ptr = nullptr;
-    ElementScalar const* scale_d_ptr = nullptr;
-
-    ElementScalar scale_aux = ElementScalar(1);
-    ElementScalar const* scale_aux_ptr = nullptr;
-
-    using StrideAlpha = Stride<_0,_0,int64_t>;
-    using StrideBeta  = Stride<_0,_0,int64_t>;
-    StrideAlpha dAlpha = {_0{}, _0{}, 0};
-    StrideBeta  dBeta  = {_0{}, _0{}, 0};
-
-    using StrideBias = Stride<_1,_0,int64_t>;
-    ElementBias const* bias_ptr = nullptr;
-    StrideBias dBias = {};
-
-    using ActivationArguments = typename Sm90Compute<ActivationFn, ElementOutput, ElementCompute, RoundStyle>::Arguments;
-    ActivationArguments activation = ActivationArguments();
-
-    ElementAmax* amax_D_ptr = nullptr;
-    ElementAmax* amax_aux_ptr = nullptr;
-
-    using StrideAux = cutlass::gemm::TagToStrideC_t<GmemLayoutTagAux>;
-    ElementAux* aux_ptr = nullptr;
-    StrideAux dAux = {};
-
-    operator typename Impl::Arguments() const {
-      // Only compute amax_d if D is fp8
-      ElementAmax* amax_D_ptr_ = nullptr;
-      if constexpr (detail::is_fp8_v<ElementOutput>) {
-        amax_D_ptr_ = amax_D_ptr;
-      }
-
-      // Aux is fp8 -> DAG arguments
-      if constexpr (detail::is_fp8_v<ElementAux>) {
-        typename Impl::Arguments args;
-        // always use structured binding to unpack DAG args since it may or may not be a tuple
-        auto& [Z_args, aux_args, D_args] = args;
-
-        Z_args =
-          {    // ternary op : (scale_c * beta) * C + ((scale_a * scale_b * alpha) * acc + bias)
-            {{beta, scale_c},
-             {beta_ptr, scale_c_ptr},
-             {dBeta, {_0{}, _0{}, 0}}
-             },  // leaf args : (scale_c * beta)
-            {},  // leaf args : C
-            {    // ternary op : (scale_a * scale_b * alpha) * acc + bias
-              {{alpha, scale_a, scale_b}, 
-               {alpha_ptr, scale_a_ptr, scale_b_ptr},
-               {dAlpha ,{_0{}, _0{}, 0}, {_0{}, _0{}, 0}}
-               },                   // leaf args : (scale_a * scale_b * alpha)
-              {},                   // leaf args : acc
-              {bias_ptr, ElementBias(0), dBias}, // leaf args : bias
-              {} // ternary args : multiply_add
-            },   // end ternary op
-            {} // ternary args : multiply_add
-          };   // end ternary op
-
-        D_args =
-          {    // binary op : activation(Z) * scale_d or activation(Z)
-            {    // unary op : reduce(activation(Z))
-              {             // unary op : activation(Z)
-                {},             // leaf args : Z
-                activation      // unary args : activation
-              },                // end unary op
-              {amax_D_ptr_} // unary args : reduce
-            },              // end unary op
-            {{scale_d},
-             {scale_d_ptr}
-             },  // leaf args : scale_d
-            {} // binary args : multiplies or first
-          };   // end binary op
-
-        aux_args =
-          {    // unary op : store(Aux)
-            {    // binary op : Z * scale_d or Z
-              {    // unary op : reduce(Z)
-                {},            // leaf args : Z
-                {amax_aux_ptr} // unary args : reduce
-              },   // end unary op
-              {{scale_aux},
-               {scale_aux_ptr}
-               },  // leaf args : scale_d
-              {} // binary args : multiplies
-            },   // end binary op
-            {aux_ptr, dAux} // unary args : store
-          };   // end unary op
-
-        return args;
-      }
-
-      // Aux is not fp8 -> Tree arguments
-      else {
-        return
-          {  // binary op : activation(Z) * scale_d or activation(Z)
-            {  // unary op : reduce(activation(Z))
-              {  // unary op : activation(Z)
-                {  // unary op : store(Z)
-                  {  // ternary op : (scale_c * beta) * C + ((scale_a * scale_b * alpha) * acc + bias)
-                    {{beta, scale_c},
-                     {beta_ptr, scale_c_ptr},
-                     {dBeta, {_0{}, _0{}, 0}}
-                    },                // leaf args : (scale_c * beta)
-                    {},               // leaf args : C
-                    {                 // ternary op : (scale_a * scale_b * alpha) * acc + bias
-                      {{alpha, scale_a, scale_b}, 
-                       {alpha_ptr, scale_a_ptr, scale_b_ptr},
-                       {dAlpha, {_0{}, _0{}, 0}}
-                      },                // leaf args : (scale_a * scale_b * alpha)
-                      {},               // leaf args : acc
-                      {bias_ptr, ElementBias(0), dBias
-                      },                // leaf args : bias
-                      {}              // ternary args : multiply_add
-                    },                // end ternary op
-                    {}              // ternary args : multiply_add
-                  },                // end ternary op
-                  {aux_ptr, dAux} // unary args : store
-                },                // end unary op
-                activation     // unary args : activation
-              },               // end unary op
-              {amax_D_ptr_} // unary args : reduce
-            },              // end unary op
-            {{scale_d},{scale_d_ptr}}, // leaf args : scale_d
-            {} // binary args : multiplies or first
-          };   // end binary op
-      }
-    }
-  };
-
-  // Ctor inheritance
-  using Impl::Impl;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template<
-  class CtaTileShapeMNK,
-  class EpilogueTile,
-  int Stages,
-  class StrideAux,
-  class SmemLayoutAtom,
-  class CopyOpS2R,
-  template <class> class ActivationFn,
-  class ElementOutput,
-  class ElementCompute,
-  class ElementAux = ElementOutput,
-  class ElementSource = ElementOutput,
-  class ElementScalar = ElementCompute,
-  int AlignmentAux = 128 / sizeof_bits_v<ElementAux>,
-  FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest
->
-using Sm90LinCombDeEltAct =
-  Sm90EVT<Sm90Compute<ActivationFn, ElementOutput, ElementCompute, RoundStyle>, // activation(beta * C + (alpha * acc), aux)
-    Sm90LinearCombination<ElementCompute, ElementCompute, ElementSource, ElementScalar, RoundStyle>, // beta * C + (alpha * acc)
-    Sm90AuxLoad<Stages, EpilogueTile, ElementAux, StrideAux, SmemLayoutAtom, CopyOpS2R, AlignmentAux> // aux
-  >;
-
-template <
-  int StagesC,
-  int StagesD,
-  int FragmentSize,
-  bool ReuseSmemC,
-  bool DelayTmaStore,
-  class GmemLayoutTagAux,
-  template <class> class ActivationFn,
-  class ElementOutput,
-  class ElementCompute,
-  class ElementAux,
-  class ElementSource,
-  class ElementScalar,
-  int AlignmentAux,
-  FloatRoundStyle RoundStyle,
-  class CtaTileShapeMNK,
-  class EpilogueTile,
-  class SmemLayoutAtom,
-  class CopyOpS2R
->
-struct FusionCallbacks<
-    epilogue::Sm90TmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmemC, DelayTmaStore>,
-    fusion::LinCombDeEltAct<
-      GmemLayoutTagAux, ActivationFn, ElementOutput, ElementCompute,
-      ElementAux, ElementSource, ElementScalar, AlignmentAux, RoundStyle
-    >,
-    CtaTileShapeMNK,
-    EpilogueTile,
-    SmemLayoutAtom,
-    CopyOpS2R
-> : Sm90LinCombDeEltAct<
-      CtaTileShapeMNK, EpilogueTile, StagesC, cutlass::gemm::TagToStrideC_t<GmemLayoutTagAux>, SmemLayoutAtom, CopyOpS2R, ActivationFn,
-      ElementOutput, ElementCompute, ElementAux, ElementSource, ElementScalar, AlignmentAux, RoundStyle
-    > {
-
-  using Impl =
-    Sm90LinCombDeEltAct<
-      CtaTileShapeMNK, EpilogueTile, StagesC, cutlass::gemm::TagToStrideC_t<GmemLayoutTagAux>, SmemLayoutAtom, CopyOpS2R, ActivationFn,
-      ElementOutput, ElementCompute, ElementAux, ElementSource, ElementScalar, AlignmentAux, RoundStyle
-    >;
-  using Operation =
-    fusion::LinCombDeEltAct<
-      GmemLayoutTagAux, ActivationFn, ElementOutput, ElementCompute,
-      ElementAux, ElementSource, ElementScalar, AlignmentAux, RoundStyle
-    >;
-
-  struct Arguments {
-    ElementScalar alpha = ElementScalar(1);
-    ElementScalar beta = ElementScalar(0);
-    ElementScalar const* alpha_ptr = nullptr;
-    ElementScalar const* beta_ptr = nullptr;
-
-    using StrideAlpha = Stride<_0,_0,int64_t>;
-    using StrideBeta  = Stride<_0,_0,int64_t>;
-    StrideAlpha dAlpha = {_0{}, _0{}, 0};
-    StrideBeta  dBeta  = {_0{}, _0{}, 0};
-
-    using ActivationArguments = typename Sm90Compute<ActivationFn, ElementOutput, ElementCompute, RoundStyle>::Arguments;
-    ActivationArguments activation = ActivationArguments();
-
-    using StrideAux = cutlass::gemm::TagToStrideC_t<GmemLayoutTagAux>;
-    ElementAux const* aux_ptr = nullptr;
-    StrideAux dAux = {};
-
-    operator typename Impl::Arguments() const {
-      return
-        {    // binary op : activation(beta * C + (alpha * acc), aux)
-          {                  // ternary op : beta * C + (alpha * acc)
-            {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta
-            {},                   // leaf args : C
-            {                     // binary op : alpha * acc
-              {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha
-              {},                     // leaf args : acc
-              {}                  // binary args : multiplies
-            },                    // end binary op
-            {}               // ternary args : multiply_add
-          },                 // end ternary op
-          {aux_ptr, ElementAux(0), dAux}, // leaf args : aux
-          activation // binary args : activation
-        };   // end binary op
-    }
-  };
-
-  // Ctor inheritance
-  using Impl::Impl;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template<
-  class CtaTileShapeMNK,
-  class EpilogueTile,
-  int Stages,
-  class StrideAux,
-  class SmemLayoutAtom,
-  class CopyOpS2R,
-  template <class> class ActivationFn,
-  class ElementOutput,
-  class ElementCompute,
-  class ElementAux = ElementOutput,
-  class ElementBias = ElementOutput,
-  class ElementSource = ElementOutput,
-  class ElementScalar = ElementCompute,
-  int AlignmentAux = 128 / sizeof_bits_v<ElementAux>,
-  int AlignmentBias = 128 / sizeof_bits_v<ElementBias>,
-  FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest
->
-using Sm90LinCombDeEltActDePerRowBias =
-  Sm90EVT<Sm90Compute<cutlass::epilogue::thread::Identity, ElementOutput, ElementCompute, RoundStyle>, // Identity for final conversion
-    Sm90EVT<Sm90ColReduction<plus, plus, plus, 0, CtaTileShapeMNK,
-                             ElementBias, ElementCompute, RoundStyle, Stride<_1,_0,int64_t>, AlignmentBias>,
-      Sm90LinCombDeEltAct<CtaTileShapeMNK, EpilogueTile, Stages, StrideAux, SmemLayoutAtom, CopyOpS2R, ActivationFn,
-                          ElementCompute, ElementCompute, ElementAux, ElementSource, ElementScalar, AlignmentAux, RoundStyle>
-    >
-  >;
-
-template <
-  int StagesC,
-  int StagesD,
-  int FragmentSize,
-  bool ReuseSmemC,
-  bool DelayTmaStore,
-  class GmemLayoutTagAux,
-  template <class> class ActivationFn,
-  class ElementOutput,
-  class ElementCompute,
-  class ElementAux,
-  class ElementBias,
-  class ElementSource,
-  class ElementScalar,
-  int AlignmentAux,
-  int AlignmentBias,
-  FloatRoundStyle RoundStyle,
-  class CtaTileShapeMNK,
-  class EpilogueTile,
-  class SmemLayoutAtom,
-  class CopyOpS2R
->
-struct FusionCallbacks<
-    epilogue::Sm90TmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmemC, DelayTmaStore>,
-    fusion::LinCombDeEltActDePerRowBias<
-      GmemLayoutTagAux, ActivationFn, ElementOutput, ElementCompute,
-      ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle
-    >,
-    CtaTileShapeMNK,
-    EpilogueTile,
-    SmemLayoutAtom,
-    CopyOpS2R
-> : Sm90LinCombDeEltActDePerRowBias<
-      CtaTileShapeMNK, EpilogueTile, StagesC, cutlass::gemm::TagToStrideC_t<GmemLayoutTagAux>, SmemLayoutAtom, CopyOpS2R, ActivationFn,
-      ElementOutput, ElementCompute, ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle
-    > {
-
-  using Impl =
-    Sm90LinCombDeEltActDePerRowBias<
-      CtaTileShapeMNK, EpilogueTile, StagesC, cutlass::gemm::TagToStrideC_t<GmemLayoutTagAux>, SmemLayoutAtom, CopyOpS2R, ActivationFn,
-      ElementOutput, ElementCompute, ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle
-    >;
-  using Operation =
-    fusion::LinCombDeEltActDePerRowBias<
-      GmemLayoutTagAux, ActivationFn, ElementOutput, ElementCompute,
-      ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle
-    >;
-
-  struct Arguments {
-    ElementScalar alpha = ElementScalar(1);
-    ElementScalar beta = ElementScalar(0);
-    ElementScalar const* alpha_ptr = nullptr;
-    ElementScalar const* beta_ptr = nullptr;
-
-    using StrideAlpha = Stride<_0,_0,int64_t>;
-    using StrideBeta  = Stride<_0,_0,int64_t>;
-    StrideAlpha dAlpha = {_0{}, _0{}, 0};
-    StrideBeta  dBeta  = {_0{}, _0{}, 0};
-
-    using ActivationArguments = typename Sm90Compute<ActivationFn, ElementOutput, ElementCompute, RoundStyle>::Arguments;
-    ActivationArguments activation = ActivationArguments();
-
-    using StrideAux = cutlass::gemm::TagToStrideC_t<GmemLayoutTagAux>;
-    ElementAux const* aux_ptr = nullptr;
-    StrideAux dAux = {};
-
-    using StrideBias = Stride<_1,_0,int64_t>;
-    ElementBias* dbias_ptr = nullptr;
-    StrideBias dDbias = {};
-
-    operator typename Impl::Arguments() const {
-      return
-      {   // unary op : identity/convert
-        {    // unary op : reduce(activation(beta * C + (alpha * acc), aux))
-          {    // binary op : activation(beta * C + (alpha * acc), aux)
-            {                  // ternary op : beta * C + (alpha * acc)
-              {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta
-              {},                   // leaf args : C
-              {                     // binary op : alpha * acc
-                {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha
-                {},                     // leaf args : acc
-                {}                  // binary args : multiplies
-              },                    // end binary op
-              {}               // ternary args : multiply_add
-            },                 // end ternary op
-            {aux_ptr, ElementAux(0), dAux}, // leaf args : aux
-            activation // binary args : activation
-          },   // end binary op
-          {dbias_ptr, ElementCompute(0), dDbias} // unary args : reduce
-        },   // end unary op
-        {} // unary args : identity/convert
-      };   // end unary op
-    }
-  };
-
-  // Ctor inheritance
-  using Impl::Impl;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// D = softmax(top_k(alpha * acc + beta * C))
-template<
-  int TopK,
-  int FragmentSize,
-  class CtaTileShapeMNK,
-  class EpilogueTile,
-  class ElementOutput,
-  class ElementCompute,
-  class ElementSource = ElementOutput,
-  class ElementScalar = ElementCompute,
-  FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest
->
-using Sm90LinCombTopKSoftmaxCol =
-  Sm90EVT<Sm90TopKSoftmaxColReduction<TopK, FragmentSize, CtaTileShapeMNK, EpilogueTile, ElementOutput, ElementCompute, RoundStyle>, // softmax(top_k(beta * C + (alpha * acc)))
-    Sm90LinearCombination<ElementCompute, ElementCompute, ElementSource, ElementScalar, RoundStyle> // beta * C + (alpha * acc)
-  >;
-
-template <
-  int TopK,
-  int StagesC,
-  int StagesD,
-  int FragmentSize,
-  bool ReuseSmemC,
-  bool DelayTmaStore,
-  class ElementOutput,
-  class ElementCompute,
-  class ElementSource,
-  class ElementScalar,
-  FloatRoundStyle RoundStyle,
-  class CtaTileShapeMNK,
-  class EpilogueTile
->
-struct FusionCallbacks<
-    epilogue::Sm90TmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmemC, DelayTmaStore>,
-    fusion::LinCombTopKSoftmaxCol<TopK, ElementOutput, ElementCompute, ElementSource, ElementScalar, RoundStyle>,
-    CtaTileShapeMNK,
-    EpilogueTile
-> : Sm90LinCombTopKSoftmaxCol<TopK, FragmentSize, CtaTileShapeMNK, EpilogueTile, ElementOutput, ElementCompute, ElementSource, ElementScalar, RoundStyle> {
-
-  using Impl = Sm90LinCombTopKSoftmaxCol<TopK, FragmentSize, CtaTileShapeMNK, EpilogueTile, typename cutlass::detail::get_unpacked_element_type<ElementOutput>::type, ElementCompute, ElementSource, ElementScalar, RoundStyle>;
-  using Operation = fusion::LinCombTopKSoftmaxCol<TopK, ElementOutput, ElementCompute, ElementSource, ElementScalar, RoundStyle>;
-
-  struct Arguments {
-    ElementScalar alpha = ElementScalar(1);
-    ElementScalar beta = ElementScalar(0);
-    ElementScalar const* alpha_ptr = nullptr;
-    ElementScalar const* beta_ptr = nullptr;
-
-    operator typename Impl::Arguments() const {
-      return
-        {    // unary op: activation(beta * C + (alpha * acc))
-          {    // ternary op : beta * C + (alpha * acc)
-            {{beta}, {beta_ptr}}, // leaf args : beta
-            {},                   // leaf args : C
-            {                     // binary op : alpha * acc
-              {{alpha}, {alpha_ptr}}, // leaf args : alpha
-              {},                     // leaf args : acc
-              {}                  // binary args : multiplies
-            },                    // end binary op
-            {} // ternary args : multiply_add
-          },   // end ternary op
-          {} // unary args: activation
-        };   // end unary op
-    }
-  };
-
-  // Ctor inheritance
-  using Impl::Impl;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace detail {
-template <class FusionOpOrCallbacks, class = cute::void_t<>>
-struct get_element_aux {
-  using type = void;
-};
-
-template <class FusionOpOrCallbacks>
-struct get_element_aux<FusionOpOrCallbacks, cute::void_t<typename FusionOpOrCallbacks::ElementAux>> {
-  using type = typename FusionOpOrCallbacks::ElementAux;
-};
-
-template <class NodeOp, class... ChildOps>
-struct get_element_aux<Sm90TreeVisitor<NodeOp, ChildOps...>, cute::void_t<>> {
-  using type = typename get_element_aux<NodeOp>::type;
-};
-
-template <class... Ts>
-struct get_element_aux<FusionCallbacks<Ts...>, cute::void_t<typename FusionCallbacks<Ts...>::Operation>> {
- private:
-  using Operation = typename FusionCallbacks<Ts...>::Operation;
- public:
-  using type = typename get_element_aux<Operation>::type;
-};
-} // namespace cutlass:epilogue::fusion::detail
-
-template <class Callbacks>
-using get_element_aux_t = typename detail::get_element_aux<Callbacks>::type;
-
-} // namespace cutlass::epilogue::fusion
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp
deleted file mode 100755
index 131d0ba5b..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp
+++ /dev/null
@@ -1,839 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-  \brief Visitor tree compute operations for the sm90 TMA warp-specialized (ws) epilogue
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/epilogue/thread/activation.h"
-
-#include "cute/tensor.hpp"
-
-#include "cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp"
-#include "cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp"
-#include "cutlass/epilogue/fusion/sm90_visitor_store_tma_warpspecialized.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::epilogue::fusion {
-
-using namespace cute;
-using namespace detail;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// N-nary Elementwise Compute Operation
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// The template argument provided for ComputeFn must be able to accept
-// exactly one template parameter.  In Standard C++, it's OK for
-// ComputeFn to have other template parameters, as long as those have
-// defaults.  For example, the following struct Foo would work.
-//
-// template<class A, class B = A>
-// struct Foo {
-//   CUTLASS_HOST_DEVICE auto operator() (A a, B b);
-// };
-//
-// However, some compilers, such as Clang, require that the argument
-// take _exactly_ one template parameter.  This is nonstandard C++
-// behavior.  One work-around for this case is to create a subclass
-// with exactly one template parameter, and then use that subclass as
-// the template argument.
-//
-// template<class A>
-// struct FooHomogeneous : public Foo<A, A> {};
-//
-template<
-  template <class> class ComputeFn,
-  class ElementOutput,
-  class ElementCompute,
-  FloatRoundStyle RoundStyle,
-  class = void
->
-struct Sm90Compute {
-private:
-  using EmptyArguments = typename Sm90VisitorImpl<>::Arguments;
-
-  template <class Fn, class = void>
-  struct ComputeArguments {
-    using type = EmptyArguments;
-  };
-
-  // partial specialization for compute fns that define an Arguments member, e.g. activation hyperparameters
-  template <class Fn>
-  struct ComputeArguments<Fn, platform::void_t<typename Fn::Arguments>> {
-    using type = typename Fn::Arguments;
-  };
-
-public:
-  struct SharedStorage { };
-
-  using Arguments = typename ComputeArguments<ComputeFn<ElementCompute>>::type;
-
-  using Params = Arguments;
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const&, Arguments const& args, void*) {
-    return args;
-  }
-
-  template <class ProblemShape>
-  static bool
-  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
-    return true;
-  }
-
-  template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const&, Arguments const&) {
-    return 0;
-  }
-
-  template <class ProblemShape>
-  static cutlass::Status
-  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-    return cutlass::Status::kSuccess;
-  }
-
-  CUTLASS_DEVICE bool
-  is_producer_load_needed() const {
-    return false;
-  }
-
-  CUTLASS_DEVICE bool
-  is_C_load_needed() const {
-    return false;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Sm90Compute()
-      : params() {}
-
-  CUTLASS_HOST_DEVICE
-  Sm90Compute(Params const& params, SharedStorage const& shared_storage)
-      : params(params) {}
-
-  Params const params;
-
-  template <class... Args>
-  CUTLASS_DEVICE auto
-  get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
-    return EmptyProducerLoadCallbacks{};
-  }
-
-  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
-    CUTLASS_DEVICE
-    ConsumerStoreCallbacks(Params const& params)
-      : params(params) {}
-
-    Params const& params;
-
-    template <typename ElementAccumulator, typename... ElementInputs, int FragmentSize>
-    CUTLASS_DEVICE Array<ElementOutput, FragmentSize>
-    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n,
-          Array<ElementInputs, FragmentSize> const&... frg_inputs) {
-      return transform_apply(cute::make_tuple(frg_inputs...),
-        [&] (auto&& frg_input) {
-          using ElementInput = typename cute::remove_cvref_t<decltype(frg_input)>::Element;
-          using ConvertInput = NumericArrayConverter<ElementCompute, ElementInput, FragmentSize, RoundStyle>;
-          ConvertInput convert_input{};
-
-          return convert_input(frg_input);
-        },
-        [&] (auto&&... cvt_frg_inputs) {
-          using ComputeOutput = ComputeFn<Array<ElementCompute, FragmentSize>>;
-          ComputeOutput compute_output{};
-
-          if constexpr (cute::is_same_v<Arguments, EmptyArguments>) {
-            using ElementComputeOutput =
-                typename cute::remove_cvref_t<decltype(compute_output(cvt_frg_inputs...))>::Element;
-            using ConvertOutput = NumericArrayConverter<ElementOutput, ElementComputeOutput, FragmentSize, RoundStyle>;
-            ConvertOutput convert_output{};
-            return convert_output(compute_output(cvt_frg_inputs...));
-          }
-          else {
-            using ElementComputeOutput =
-                typename cute::remove_cvref_t<decltype(compute_output(cvt_frg_inputs..., params))>::Element;
-            using ConvertOutput = NumericArrayConverter<ElementOutput, ElementComputeOutput, FragmentSize, RoundStyle>;
-            ConvertOutput convert_output{};
-            return convert_output(compute_output(cvt_frg_inputs..., params));
-          }
-        }
-      );
-    }
-
-  };
-
-  template <
-    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
-    class... Args
-  >
-  CUTLASS_DEVICE auto
-  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
-    return ConsumerStoreCallbacks(params);
-  }
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Performance Optimized Specializations
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// beta * C + Z
-template <
-  class ElementOutput,
-  class ElementCompute,
-  FloatRoundStyle RoundStyle,
-  class InputScaleOp,  // beta
-  class ElementSource, // C
-  class InputAddOp     // Z
->
-struct Sm90TreeVisitor<
-  Sm90Compute<homogeneous_multiply_add, ElementOutput, ElementCompute, RoundStyle,
-              cute::void_t<decltype(declval<InputScaleOp>().is_zero())>>,
-  InputScaleOp,
-  Sm90SrcFetch<ElementSource>,
-  InputAddOp
-> : Sm90VisitorImpl<
-      InputScaleOp,
-      Sm90SrcFetch<ElementSource>,
-      InputAddOp,
-      Sm90Compute<homogeneous_multiply_add, ElementOutput, ElementCompute, RoundStyle>
-    >
-{
-  using Impl =
-    Sm90VisitorImpl<
-      InputScaleOp,
-      Sm90SrcFetch<ElementSource>,
-      InputAddOp,
-      Sm90Compute<homogeneous_multiply_add, ElementOutput, ElementCompute, RoundStyle>
-    >;
-  using Params = typename Impl::Params;
-  using SharedStorage = typename Impl::SharedStorage;
-
-  CUTLASS_HOST_DEVICE
-  Sm90TreeVisitor() {}
-
-  CUTLASS_HOST_DEVICE
-  Sm90TreeVisitor(
-      Params const& params,
-      SharedStorage const& shared_storage)
-    : Impl(params, shared_storage) {}
-
-  CUTLASS_DEVICE bool
-  is_producer_load_needed() const {
-    auto const& scale_op = get<0>(Impl::ops);
-    auto const& added_op = get<2>(Impl::ops);
-    if constexpr (detail::IsScalarBroadcast<InputScaleOp>::value && not is_void_v<ElementSource>) {
-      return (get<2>(scale_op.params_ptr->dScalar[0]) != 0 && scale_op.params_ptr->scalar_ptrs[0] != nullptr) || 
-              is_C_load_needed() || 
-              added_op.is_producer_load_needed();
-    }
-    else {
-      return is_C_load_needed() || added_op.is_producer_load_needed();
-    }
-  }
-
-  CUTLASS_DEVICE bool
-  is_C_load_needed() const {
-    auto const& scale_op = get<0>(Impl::ops);
-    auto const& src_op = get<1>(Impl::ops);
-    auto const& added_op = get<2>(Impl::ops);
-    return (not scale_op.is_zero() && src_op.is_C_load_needed()) || added_op.is_C_load_needed();
-  }
-
-  template <class CallbacksImpl>
-  struct ConsumerStoreCallbacks : CallbacksImpl {
-    CUTLASS_DEVICE
-    ConsumerStoreCallbacks(bool is_C_load_needed, CallbacksImpl&& impl)
-      : is_C_load_needed(is_C_load_needed), CallbacksImpl(cute::forward<CallbacksImpl>(impl)) { }
-
-    bool is_C_load_needed;
-
-    template <typename ElementAccumulator, int FragmentSize>
-    CUTLASS_DEVICE Array<ElementOutput, FragmentSize>
-    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
-      Array frg_added = get<2>(CallbacksImpl::callbacks_tuple).visit(frg_acc, epi_v, epi_m, epi_n);
-
-      using ElementZ = typename decltype(frg_added)::Element;
-      using ConvertZ = NumericArrayConverter<ElementCompute, ElementZ, FragmentSize, RoundStyle>;
-      using ConvertI = NumericArrayConverter<ElementOutput, ElementCompute, FragmentSize, RoundStyle>;
-      ConvertZ convert_Z{};
-      ConvertI convert_I{};
-
-      Array frg_I = convert_Z(frg_added);
-
-      if constexpr (!is_void_v<ElementSource>) {
-        Array frg_scalar = get<0>(CallbacksImpl::callbacks_tuple).visit(frg_acc, epi_v, epi_m, epi_n);
-        Array frg_source = get<1>(CallbacksImpl::callbacks_tuple).visit(frg_acc, epi_v, epi_m, epi_n);
-
-        using ElementX = typename decltype(frg_scalar)::Element;
-        using ElementY = typename decltype(frg_source)::Element;
-        using ConvertX = NumericArrayConverter<ElementCompute, ElementX, FragmentSize, RoundStyle>;
-        using ConvertY = NumericArrayConverter<ElementCompute, ElementY, FragmentSize, RoundStyle>;
-        using ComputeI = multiply_add<Array<ElementCompute, FragmentSize>>;
-        ConvertX convert_X{};
-        ConvertY convert_Y{};
-        ComputeI compute_I{};
-
-        frg_I = compute_I(convert_X(frg_scalar), convert_Y(frg_source), frg_I);
-      }
-
-      return convert_I(frg_I);
-    }
-  };
-
-  template <
-    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
-    class... Args
-  >
-  CUTLASS_DEVICE auto
-  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
-    auto callbacks_tuple = Impl::template get_consumer_store_callbacks<ReferenceSrc>(args);
-    bool is_C_load_needed = this->is_C_load_needed();
-    if (not is_C_load_needed) {
-      cute::clear(args.tCrC);
-    }
-    return ConsumerStoreCallbacks<decltype(callbacks_tuple)>(
-        is_C_load_needed, std::move(callbacks_tuple));
-  }
-};
-
-// ReLU with aux bit tensor dReLU/dZ
-// Aux(i) = Z(i) >= 0 ? 1 : 0
-namespace detail {
-// Placeholder node so we can retain standard EVT structure
-template <class StrideMNL>
-struct Sm90ReLUAuxStore : Sm90VisitorImpl<> {
-  struct SharedStorage {};
-
-  struct Arguments {
-    cutlass::uint1b_t* ptr_aux = nullptr;
-    StrideMNL dAux = {};
-  };
-
-  using Params = Arguments;
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    return args;
-  }
-
-  template <class ProblemShape>
-  static bool
-  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
-    return true;
-  }
-
-  template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
-    return 0;
-  }
-
-  template <class ProblemShape>
-  static cutlass::Status
-  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-    return cutlass::Status::kSuccess;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Sm90ReLUAuxStore() { }
-
-  CUTLASS_HOST_DEVICE
-  Sm90ReLUAuxStore(Params const& params, SharedStorage const& shared_storage) { }
-};
-} // namespace detail
-
-// Specialization on the generic compute+aux EVT
-template <
-  // Compute node
-  template <class> class Activation,
-  class ElementOutput,
-  class ElementCompute,
-  FloatRoundStyle RoundStyle,
-  // Aux node
-  int Stages,
-  class EpilogueTile,
-  class StrideMNL,
-  class SmemLayoutAtom,
-  class CopyOpR2S,
-  int Alignment,
-  bool EnableNullptr,
-  // Input node
-  class InputOp
->
-struct Sm90TreeVisitor<
-  Sm90Compute<Activation, ElementOutput, ElementCompute, RoundStyle,
-              cute::enable_if_t<cute::is_same_v<Activation<ElementCompute>, cutlass::epilogue::thread::ReLu<ElementCompute>> ||
-                                cute::is_same_v<Activation<ElementCompute>, cutlass::epilogue::thread::Clamp<ElementCompute>>  >>,
-  Sm90TreeVisitor<
-    Sm90AuxStore<
-      Stages,
-      EpilogueTile,
-      cutlass::uint1b_t,
-      RoundStyle,
-      StrideMNL,
-      SmemLayoutAtom,
-      CopyOpR2S,
-      Alignment,
-      EnableNullptr
-    >,
-    InputOp
-  >
-> : Sm90VisitorImpl<
-      Sm90VisitorImpl<
-        InputOp,
-        detail::Sm90ReLUAuxStore<StrideMNL>
-      >,
-      Sm90Compute<Activation, ElementOutput, ElementCompute, RoundStyle>
-    >
-{
-  using Impl =
-    Sm90VisitorImpl<
-      Sm90VisitorImpl<
-        InputOp,
-        detail::Sm90ReLUAuxStore<StrideMNL>
-      >,
-      Sm90Compute<Activation, ElementOutput, ElementCompute, RoundStyle>
-    >;
-  using Params = typename Impl::Params;
-  using SharedStorage = typename Impl::SharedStorage;
-
-  CUTLASS_HOST_DEVICE
-  Sm90TreeVisitor() {}
-
-  CUTLASS_HOST_DEVICE
-  Sm90TreeVisitor(Params const& params_, SharedStorage const& shared_storage)
-    : params(params_), Impl(params_, shared_storage) {}
-
-  Params const& params;
-
-  template <class RTensor, class GTensor, class CTensor, class ThrResidue, class CallbacksImpl>
-  struct ConsumerStoreCallbacks : CallbacksImpl {
-    CUTLASS_DEVICE
-    ConsumerStoreCallbacks(
-        RTensor&& tC_rAux,
-        GTensor&& tC_gAux,
-        CTensor tC_cAux,
-        ThrResidue residue_tC_cAux,
-        Params const& params,
-        CallbacksImpl&& impl)
-      : tC_rAux(cute::forward<RTensor>(tC_rAux)),
-        tC_gAux(cute::forward<GTensor>(tC_gAux)),
-        tC_cAux(tC_cAux),
-        residue_tC_cAux(residue_tC_cAux),
-        params(params),
-        CallbacksImpl(cute::forward<CallbacksImpl>(impl)) {}
-
-    RTensor tC_rAux;                                                                   // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
-    GTensor tC_gAux;                                                                   // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
-    CTensor tC_cAux;                                                                   // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
-    ThrResidue residue_tC_cAux;
-    Params const& params;
-
-    template <typename ElementAccumulator, int FragmentSize>
-    CUTLASS_DEVICE Array<ElementOutput, FragmentSize>
-    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
-      // Unpack callbacks + params
-      auto& [callbacks_input_aux, callbacks_compute] = CallbacksImpl::callbacks_tuple;
-      auto& [callbacks_input, callbacks_aux] = callbacks_input_aux.callbacks_tuple;
-      auto const& [params_input_aux, params_compute] = params;
-      auto const& [params_input, params_aux] = params_input_aux;
-
-      // Visit the input node
-      Array frg_input = callbacks_input.visit(frg_acc, epi_v, epi_m, epi_n);
-
-      // Compute activation + aux
-      using ElementInput = typename decltype(frg_input)::Element;
-      using ConvertInput = NumericArrayConverter<ElementCompute, ElementInput, FragmentSize, RoundStyle>;
-      using ConvertAux = PackPredicates<FragmentSize>;
-      using ComputeOutput = Activation<ElementCompute>;
-      using ConvertOutput = NumericArrayConverter<ElementOutput, ElementCompute, FragmentSize, RoundStyle>;
-      ConvertInput convert_input{};
-      ComputeOutput relu{};
-      ConvertAux convert_aux{};
-      ConvertOutput convert_output{};
-
-      Array frg_compute = convert_input(frg_input);
-      bool frg_aux[FragmentSize];
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < FragmentSize; ++i) {
-        ElementCompute pre_relu = frg_compute[i];
-        if constexpr (cute::is_same_v<Activation<ElementCompute>, cutlass::epilogue::thread::Clamp<ElementCompute>>) {
-          frg_compute[i] = relu(frg_compute[i], params_compute);
-        }
-        else {
-          frg_compute[i] = relu(frg_compute[i]);
-        }
-        if constexpr (cute::is_same_v<ElementCompute, float>) {
-          uint32_t aux;
-          asm volatile("set.equ.u32.f32 %0, %1, %2;\n" : "=r"(aux) : "f"(frg_compute[i]), "f"(pre_relu)); // NaN outputs 1 in Aux
-          frg_aux[i] = static_cast<bool>(aux);
-        } else if constexpr (cute::is_same_v<ElementCompute, cutlass::half_t>) {
-          uint32_t aux;
-          cutlass::half_t compute = frg_compute[i];
-          asm volatile("set.equ.u32.f16 %0, %1, %2;\n" : "=r"(aux) : "h"(compute.raw()), "h"(pre_relu.raw())); // NaN outputs 1 in Aux
-          frg_aux[i] = static_cast<bool>(aux);
-        } else {
-          frg_aux[i] = frg_compute[i] == pre_relu;
-        }
-      }
-
-      static_assert(FragmentSize % 8 == 0, "Predicate vector must be byte-aligned");
-      Tensor tC_rAux_frg = recast<typename ConvertAux::result_type>(coalesce(tC_rAux(_,_,_,epi_m,epi_n)));   // (EPI_V)
-      tC_rAux_frg(epi_v) = convert_aux(frg_aux);
-
-      return convert_output(frg_compute);
-    }
-
-    CUTLASS_DEVICE void
-    end() {
-      // Unpack callbacks + params
-      auto& [callbacks_input_aux, callbacks_compute] = CallbacksImpl::callbacks_tuple;
-      auto& [callbacks_input, callbacks_aux] = callbacks_input_aux.callbacks_tuple;
-      auto const& [params_input_aux, params_compute] = params;
-      auto const& [params_input, params_aux] = params_input_aux;
-
-      // Visit the input node
-      callbacks_input.end();
-
-      // Nullptr is no-op
-      if constexpr (EnableNullptr) {
-        if (params_aux.ptr_aux == nullptr) {
-          return;
-        }
-      }
-
-      // Compute vectorization
-      constexpr auto MCL = decltype(max_common_layout(tC_rAux, tC_gAux)){};
-      constexpr int V = cute::min(Alignment, size(MCL));
-      // Copy vectorizes into byte-aligned stores
-      if constexpr (V > 1 && V % 8 == 0) {
-        using VecType = uint_bit_t<V>;
-        Tensor tC_rAux_vec = recast<VecType>(tC_rAux);
-        Tensor tC_gAux_vec = recast<VecType>(tC_gAux);
-        Tensor tC_cAux_vec = tensor<1>(zipped_divide(tC_cAux, MCL.compose(Int<V>{})));
-        auto predicate_fn = [&] (auto&&... coords) { return elem_less(tC_cAux_vec(coords...), residue_tC_cAux); };
-        copy_if(predicate_fn, tC_rAux_vec, tC_gAux_vec);
-      }
-      // sub-byte vectorization, must serialize threads
-      else {
-        // Assumes no inter-warp sharing of bytes (most copy layouts should satisfy this)
-        int lane_idx = canonical_lane_idx();
-        auto predicate_fn = [&] (auto&&... coords) { return elem_less(tC_cAux(coords...), residue_tC_cAux); };
-        CUTLASS_PRAGMA_NO_UNROLL
-        for (int i = 0; i < NumThreadsPerWarp; ++i) {
-          if (lane_idx == i) {
-            copy_if(predicate_fn, tC_rAux, tC_gAux);
-          }
-          __syncwarp();
-        }
-      }
-    }
-  };
-
-  template <
-    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
-    class... Args
-  >
-  CUTLASS_DEVICE auto
-  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
-    // Unpack params
-    auto const& [params_input_aux, params_compute] = params;
-    auto const& [params_input, params_aux] = params_input_aux;
-
-    auto [M, N, K, L] = args.problem_shape_mnkl;
-    auto [m, n, k, l] = args.tile_coord_mnkl;
-    gmem_ptr ptr_aux = make_gmem_ptr(subbyte_iterator<cutlass::uint1b_t>(params_aux.ptr_aux));
-    Tensor mAux = make_tensor(ptr_aux, make_layout(make_shape(M,N,L), params_aux.dAux));                     // (M,N,L)
-    Tensor gAux = local_tile(mAux, take<0,2>(args.tile_shape_mnk), make_coord(m,n,l));                 // (CTA_M,CTA_N)
-
-    Tensor tC_gAux = sm90_partition_for_epilogue<ReferenceSrc>(                        // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
-                      gAux, args.epi_tile, args.tiled_copy, args.thread_idx);
-    Tensor tC_rAux = make_tensor<cutlass::uint1b_t>(shape(tC_gAux));                   // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
-
-    auto callbacks_impl = Impl::template get_consumer_store_callbacks<ReferenceSrc>(args);
-    return ConsumerStoreCallbacks<decltype(tC_rAux), decltype(tC_gAux), decltype(args.tCcD), decltype(args.residue_tCcD), decltype(callbacks_impl)>(
-        cute::move(tC_rAux), cute::move(tC_gAux), args.tCcD, args.residue_tCcD, params, cute::move(callbacks_impl));
-  }
-};
-
-// Aux load for uint1b_t
-template <
-  int Stages,
-  class EpilogueTile,
-  class StrideMNL,
-  class SmemLayoutAtom,
-  class CopyOpS2R,
-  int Alignment,
-  bool EnableNullptr
->
-struct Sm90AuxLoad<
-  Stages,
-  EpilogueTile,
-  cutlass::uint1b_t,
-  StrideMNL,
-  SmemLayoutAtom,
-  CopyOpS2R,
-  Alignment,
-  EnableNullptr
-> {
-  static_assert(Alignment % 128 == 0, "sub-16B alignment not supported yet");
-
-  struct SharedStorage {};
-
-  struct Arguments {
-    cutlass::uint1b_t const* ptr_aux = nullptr;
-    cutlass::uint1b_t null_default = cutlass::uint1b_t(0);
-    StrideMNL dAux = {};
-  };
-
-  using Params = Arguments;
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    return args;
-  }
-
-  template <class ProblemShape>
-  static bool
-  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
-    return true;
-  }
-
-  template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
-    return 0;
-  }
-
-  template <class ProblemShape>
-  static cutlass::Status
-  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-    return cutlass::Status::kSuccess;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Sm90AuxLoad() { }
-
-  CUTLASS_HOST_DEVICE
-  Sm90AuxLoad(Params const& params, SharedStorage const&)
-      : params(params) { }
-
-  Params const params;
-
-  CUTLASS_DEVICE bool
-  is_producer_load_needed() const {
-    return false;
-  }
-
-  CUTLASS_DEVICE bool
-  is_C_load_needed() const {
-    return false;
-  }
-
-  template <class... Args>
-  CUTLASS_DEVICE auto
-  get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
-    return EmptyProducerLoadCallbacks{};
-  }
-
-  template <class RTensor, class GTensor, class CTensor, class ThrResidue>
-  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
-    CUTLASS_DEVICE
-    ConsumerStoreCallbacks(RTensor&& tC_rAux_, GTensor&& tC_gAux_, CTensor tC_cAux_, ThrResidue residue_tC_cAux_, Params const& params_)
-      : tC_rAux(cute::forward<RTensor>(tC_rAux_)),
-        tC_gAux(cute::forward<GTensor>(tC_gAux_)),
-        tC_cAux(tC_cAux_),
-        residue_tC_cAux(residue_tC_cAux_),
-        params(params_) {}
-
-    RTensor tC_rAux;                                                                   // (CPY,CPY_M,CPY_N,{EPI_M,EPI_N})
-    GTensor tC_gAux;                                                                   // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
-    CTensor tC_cAux;                                                                   // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
-    ThrResidue residue_tC_cAux;
-    Params const& params;
-
-    CUTLASS_DEVICE void
-    begin() {
-      if constexpr (decltype(cute::rank(tC_rAux))::value == 5) {
-        if constexpr (EnableNullptr) {
-          if (params.ptr_aux == nullptr) {
-            return;
-          }
-        }
-
-        constexpr auto MCL = decltype(max_common_layout(tC_rAux, tC_gAux)){};
-        constexpr int V = cute::min(Alignment, size(MCL));
-        if constexpr (V > 1) {
-          using VecType = uint_bit_t<V>;
-          Tensor tC_gAux_vec = recast<VecType>(tC_gAux);
-          Tensor tC_rAux_vec = recast<VecType>(tC_rAux);
-          Tensor tC_cAux_vec = tensor<1>(zipped_divide(tC_cAux, MCL.compose(Int<V>{})));
-          auto predicate_fn = [&] (auto&&... coords) { return elem_less(tC_cAux_vec(coords...), residue_tC_cAux); };
-          copy_if(predicate_fn, tC_gAux_vec, tC_rAux_vec);
-        }
-        else {
-          auto predicate_fn = [&] (auto&&... coords) { return elem_less(tC_cAux(coords...), residue_tC_cAux); };
-          copy_if(predicate_fn, tC_gAux, tC_rAux);
-        }
-      }
-    }
-
-    CUTLASS_DEVICE void
-    begin_loop(int epi_m, int epi_n) {
-      if constexpr (decltype(cute::rank(tC_rAux))::value == 3) {
-        if constexpr (EnableNullptr) {
-          if (params.ptr_aux == nullptr) {
-            return;
-          }
-        }
-
-        auto predicate_fn = [&] (auto&&... coords) { return elem_less(tC_cAux(_,_,_,epi_m,epi_n)(coords...), residue_tC_cAux); };
-        copy_if(predicate_fn, tC_gAux(_,_,_,epi_m,epi_n), tC_rAux);
-      }
-    }
-
-    template <typename ElementAccumulator, int FragmentSize>
-    CUTLASS_DEVICE auto
-    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
-      using ElementRegister = typename remove_cvref_t<RTensor>::value_type;
-      if constexpr (decltype(cute::rank(tC_rAux))::value == 3) {
-        return recast<Array<ElementRegister, FragmentSize>>(coalesce(tC_rAux))(epi_v);
-      }
-      else {
-        return recast<Array<ElementRegister, FragmentSize>>(coalesce(tC_rAux(_,_,_,epi_m,epi_n)))(epi_v);
-      }
-    }
-  };
-
-  template <
-    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
-    class... Args
-  >
-  CUTLASS_DEVICE auto
-  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
-
-    auto [M, N, K, L] = args.problem_shape_mnkl;
-    auto [m, n, k, l] = args.tile_coord_mnkl;
-    gmem_ptr ptr_aux = make_gmem_ptr(subbyte_iterator<cutlass::uint1b_t const>(params.ptr_aux));
-    Tensor mAux = make_tensor(ptr_aux, make_layout(make_shape(M,N,L), params.dAux));                         // (M,N,L)
-    Tensor gAux = local_tile(mAux, take<0,2>(args.tile_shape_mnk), make_coord(m,n,l));                 // (CTA_M,CTA_N)
-
-    Tensor tC_gAux = sm90_partition_for_epilogue<ReferenceSrc>(                        // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
-                      gAux, args.epi_tile, args.tiled_copy, args.thread_idx);
-
-    // If byte-unaligned vectorization, store in registers as uint32_t to reduce redundant pack+unpack instruction sequences
-    constexpr int V = decltype(max_common_vector(tC_gAux.layout(), make_layout(tC_gAux.shape())))::value;
-    Tensor tC_rAux = [&] () {
-      if constexpr (V % 8 != 0) {
-        return make_tensor<uint32_t>(take<0,3>(shape(tC_gAux)));                       // (CPY,CPY_M,CPY_N)
-      } else {
-        return make_tensor<cutlass::uint1b_t>(shape(tC_gAux));                         // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
-      }
-    }();
-
-    if constexpr (EnableNullptr) {
-      if (params.ptr_aux == nullptr) {
-        fill(tC_rAux, params.null_default);
-      }
-    }
-
-    return ConsumerStoreCallbacks<decltype(tC_rAux), decltype(tC_gAux), decltype(args.tCcD), decltype(args.residue_tCcD)>(
-        cute::move(tC_rAux), cute::move(tC_gAux), args.tCcD, args.residue_tCcD, params);
-  }
-};
-
-// dReLU specialization
-template<
-  class ElementOutput,
-  class ElementCompute,
-  FloatRoundStyle RoundStyle
->
-struct Sm90Compute<
-  cutlass::epilogue::thread::dReLU,
-  ElementOutput,
-  ElementCompute,
-  RoundStyle
-> : Sm90VisitorImpl<> {
-
-  using Sm90VisitorImpl<>::Sm90VisitorImpl;
-
-  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
-    template <typename ElementAccumulator, typename ElementInput, typename ElementAux, int FragmentSize>
-    CUTLASS_DEVICE Array<ElementOutput, FragmentSize>
-    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n,
-          Array<ElementInput      , FragmentSize> const& frg_input,
-          Array<ElementAux        , FragmentSize> const& frg_aux) {
-      using ConvertInput = NumericArrayConverter<ElementCompute, ElementInput, FragmentSize, RoundStyle>;
-      using ComputeOutput = cutlass::epilogue::thread::dReLU<Array<ElementCompute, FragmentSize>>;
-      using ConvertOutput = NumericArrayConverter<ElementOutput, ElementCompute, FragmentSize, RoundStyle>;
-      ConvertInput convert_input{};
-      ComputeOutput compute_output{};
-      ConvertOutput convert_output{};
-
-      return convert_output(compute_output(convert_input(frg_input), frg_aux)); // don't convert frg_aux for dReLU
-    }
-  };
-
-  template <
-    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
-    class... Args
-  >
-  CUTLASS_DEVICE auto
-  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
-    return ConsumerStoreCallbacks();
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::epilogue::fusion
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp
deleted file mode 100755
index a22bed4e0..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp
+++ /dev/null
@@ -1,1415 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-  \brief Visitor tree load operations for the sm90 TMA warp-specialized (ws) epilogue
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/arch/barrier.h"
-#include "cutlass/epilogue/collective/detail.hpp"
-
-#include "cute/tensor.hpp"
-#include "sm90_visitor_tma_warpspecialized.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::epilogue::fusion {
-
-using namespace cute;
-using namespace detail;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Elementwise Fetch Operations
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// returns accumulator
-struct Sm90AccFetch : Sm90VisitorImpl<> {
-
-  using Sm90VisitorImpl<>::Sm90VisitorImpl;
-
-  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
-    template <typename ElementAccumulator, int FragmentSize>
-    CUTLASS_DEVICE Array<ElementAccumulator, FragmentSize>
-    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
-      return frg_acc;
-    }
-  };
-
-  template <
-    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
-    class... Args
-  >
-  CUTLASS_DEVICE auto
-  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
-    return ConsumerStoreCallbacks{};
-  }
-};
-
-// Split tree visitor fetches intermediate results from temporary accumulators
-using Sm90SplitTreeFetch = Sm90AccFetch;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// returns C
-template <class Element>
-struct Sm90SrcFetch : Sm90VisitorImpl<> {
-
-  CUTLASS_DEVICE bool
-  is_producer_load_needed() const {
-    return is_C_load_needed();
-  }
-
-  CUTLASS_DEVICE bool
-  is_C_load_needed() const {
-    return not is_void_v<Element>;
-  }
-
-  CUTLASS_DEVICE bool
-  is_zero() const {
-    return is_void_v<Element>;
-  }
-
-  using Sm90VisitorImpl<>::Sm90VisitorImpl;
-
-  template<class SrcTensor>
-  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
-    CUTLASS_DEVICE
-    ConsumerStoreCallbacks(SrcTensor const& tCrC)
-      : tCrC(tCrC) {}
-
-    SrcTensor const& tCrC;                                                                         // (CPY,CPY_M,CPY_N)
-
-    template <typename ElementAccumulator, int FragmentSize>
-    CUTLASS_DEVICE Array<typename SrcTensor::value_type, FragmentSize>
-    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
-      return recast<Array<typename SrcTensor::value_type, FragmentSize>>(tCrC)(epi_v);
-    }
-
-  };
-
-  template <
-    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
-    class... Args
-  >
-  CUTLASS_DEVICE auto
-  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
-    // register type may differ from logical type so we can't assert matching types here
-    return ConsumerStoreCallbacks(args.tCrC);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Elementwise Load Operations
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  int Stages,
-  class EpilogueTile,
-  class Element,
-  class StrideMNL,
-  class SmemLayoutAtom,
-  class CopyOpS2R,
-  int Alignment = 128 / sizeof_bits_v<Element>,
-  bool EnableNullptr = true // Fallback scalar broadcast for nullptr params
->
-struct Sm90AuxLoad {
-  static_assert(Alignment * sizeof_bits_v<Element> % 128 == 0, "sub-16B alignment not supported yet");
-
-  constexpr static bool is_m_major = epilogue::collective::detail::is_m_major<StrideMNL>();
-  // Find the max contiguous layout usable by TMA (if EpilogueTile is a non-compact tiler)
-  using SmemShapeTma = decltype(make_shape(
-      max_common_vector(make_layout(get<0>(EpilogueTile{})),make_layout(get<0>(EpilogueTile{}))),
-      max_common_vector(make_layout(get<1>(EpilogueTile{})),make_layout(get<1>(EpilogueTile{})))));
-  using SmemLayoutTma = decltype(tile_to_shape(
-      SmemLayoutAtom{}, SmemShapeTma{},
-      cute::conditional_t<is_m_major, Step<_2,_1>, Step<_1,_2>>{} ));
-  using SmemLayout = decltype(tile_to_shape(
-      SmemLayoutTma{},
-      make_shape(size<0>(shape(EpilogueTile{})), size<1>(shape(EpilogueTile{})), Int<Stages>{}),
-      cute::conditional_t<is_m_major, Step<_2,_1,_3>, Step<_1,_2,_3>>{} ));
-  using CopyOpG2S =
-      SM90_TMA_LOAD
-    ;
-
-  struct SharedStorage {
-    alignas(cutlass::detail::alignment_for_swizzle(SmemLayout{}))
-    array_aligned<Element, size(SmemLayout{})> smem_aux;
-  };
-
-  struct Arguments {
-    Element const* ptr_aux = nullptr;
-    Element null_default = Element(0);
-    StrideMNL dAux = {};
-  };
-
-  struct Params {
-    using TMA_Aux = decltype(make_tma_copy(
-        CopyOpG2S{},
-        make_tensor(make_gmem_ptr(static_cast<Element const*>(nullptr)), repeat_like(StrideMNL{}, int32_t(0)), append<3>(StrideMNL{}, _0{})),
-        take<0,2>(SmemLayoutTma{})));
-    TMA_Aux tma_load_aux;
-    Element null_default = Element(0);
-    bool use_default = false;
-  };
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    // Optionally append 1s until problem shape is rank-4 in case its is only rank-3 (MNK)
-    auto problem_shape_mnkl = append<4>(problem_shape, 1);
-    auto [M, N, K, L] = problem_shape_mnkl;
-    auto M_AUX =
-        size(M)
-      ;
-    Tensor tensor_aux = make_tensor(make_gmem_ptr(args.ptr_aux), make_layout(make_shape(M_AUX,N,L), append<3>(args.dAux, _0{})));
-    typename Params::TMA_Aux tma_load_aux = make_tma_copy(CopyOpG2S{}, tensor_aux, take<0,2>(SmemLayoutTma{}));
-
-    bool use_default = false;
-    if constexpr (EnableNullptr) {
-      use_default = args.ptr_aux == nullptr;
-    }
-
-    return Params{tma_load_aux, args.null_default, use_default};
-  }
-
-  template <class ProblemShape>
-  static bool
-  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
-    return true;
-  }
-
-  template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
-    return 0;
-  }
-
-  template <class ProblemShape>
-  static cutlass::Status
-  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-    return cutlass::Status::kSuccess;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Sm90AuxLoad() { }
-
-  CUTLASS_HOST_DEVICE
-  Sm90AuxLoad(Params const& params, SharedStorage const& shared_storage)
-      : params_ptr(&params),
-        smem_aux(const_cast<Element*>(shared_storage.smem_aux.data())) { }
-
-  Params const* params_ptr;
-  Element* smem_aux;
-
-  CUTLASS_DEVICE bool
-  is_producer_load_needed() const {
-    return true;
-  }
-
-  CUTLASS_DEVICE bool
-  is_C_load_needed() const {
-    return false;
-  }
-
-  CUTLASS_DEVICE bool
-  is_zero() const {
-    return (params_ptr->use_default && params_ptr->null_default == Element(0));
-  }
-
-  template <class GTensor, class STensor>
-  struct ProducerLoadCallbacks : EmptyProducerLoadCallbacks {
-    CUTLASS_DEVICE
-    ProducerLoadCallbacks(GTensor&& bGS_gAux, STensor&& bGS_sAux, Params const* params_ptr)
-      : bGS_gAux(cute::forward<GTensor>(bGS_gAux)),
-        bGS_sAux(cute::forward<STensor>(bGS_sAux)),
-        params_ptr(params_ptr) {}
-
-    GTensor bGS_gAux;                                                                  // (TMA,TMA_M,TMA_N,EPI_M,EPI_N)
-    STensor bGS_sAux;                                                                  // (TMA,TMA_M,TMA_N,PIPE)
-    Params const* params_ptr;
-
-    CUTLASS_DEVICE void
-    step(uint64_t* full_mbarrier_ptr, int epi_m, int epi_n, int load_iteration, bool issue_tma_load) {
-      if constexpr (EnableNullptr) {
-        if (params_ptr->use_default) {
-          return;
-        }
-      }
-
-      if (issue_tma_load) {
-        // Increment the expected transaction bytes of the current stage's mbarrier by the subtile's byte-size
-        constexpr uint32_t copy_bytes = size(take<0,2>(SmemLayout{})) * sizeof_bits_v<Element> / 8;
-        cutlass::arch::ClusterTransactionBarrier::expect_transaction(full_mbarrier_ptr, copy_bytes);
-        // Issue the TMA load
-        constexpr uint16_t mcast_mask = 0;
-        int load_pipe_index = load_iteration % Stages;
-        copy(params_ptr->tma_load_aux.with(*full_mbarrier_ptr, mcast_mask),
-          bGS_gAux(_,_,_,epi_m,epi_n), bGS_sAux(_,_,_,load_pipe_index));
-      }
-    }
-  };
-
-  template <class... Args>
-  CUTLASS_DEVICE auto
-  get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
-
-    auto [M, N, K, L] = args.problem_shape_mnkl;
-    auto [m, n, k, l] = args.tile_coord_mnkl;
-    auto coord_shape =
-        make_coord(m, n, l)
-      ;
-    Tensor mAux_mn = params_ptr->tma_load_aux.get_tma_tensor(make_shape(M,N,L));                             // (M,N,L)
-    Tensor mAux = coalesce(mAux_mn, take<0,2>(args.tile_shape_mnk));
-    Tensor gAux = local_tile(mAux, take<0,2>(args.tile_shape_mnk), coord_shape);                       // (CTA_M,CTA_N)
-
-    Tensor gAux_epi = flat_divide(gAux, args.epi_tile);                          // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N)
-    Tensor sAux_epi = make_tensor(make_smem_ptr(smem_aux), SmemLayout{});        // (EPI_TILE_M,EPI_TILE_N,PIPE)
-
-    ThrCopy thrblk_g2s = params_ptr->tma_load_aux.get_slice(_0{});
-    Tensor bGS_gAux = thrblk_g2s.partition_S(gAux_epi);                                // (TMA,TMA_M,TMA_N,EPI_M,EPI_N)
-    Tensor bGS_sAux = thrblk_g2s.partition_D(sAux_epi);                                // (TMA,TMA_M,TMA_N,PIPE)
-
-    return ProducerLoadCallbacks<decltype(bGS_gAux), decltype(bGS_sAux)>(
-      cute::move(bGS_gAux), cute::move(bGS_sAux), params_ptr);
-  }
-
-  template <class RTensor, class TiledS2R, class STensorS2R>
-  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
-    CUTLASS_DEVICE
-    ConsumerStoreCallbacks(RTensor&& tC_rAux, TiledS2R tiled_s2r, STensorS2R&& tSR_sAux, Params const* params_ptr)
-      : tC_rAux(cute::forward<RTensor>(tC_rAux)),
-        tiled_s2r(tiled_s2r),
-        tSR_sAux(cute::forward<STensorS2R>(tSR_sAux)),
-        params_ptr(params_ptr) { }
-
-    TiledS2R tiled_s2r;
-    RTensor tC_rAux;                                                                          // (CPY,CPY_M,CPY_N)
-    STensorS2R tSR_sAux;                                                                      // (S2R,S2R_M,S2R_N,PIPE)
-    Params const* params_ptr;
-
-    CUTLASS_DEVICE void
-    previsit(int epi_m, int epi_n, int load_iteration, bool is_producer_load_needed) {
-      if constexpr (EnableNullptr) {
-        if (params_ptr->use_default) {
-          fill(tC_rAux, params_ptr->null_default);
-          return;
-        }
-      }
-
-      using RLayoutS2R = decltype(cute::layout(TiledS2R{}.get_slice(0).retile_S(RTensor{})));
-      Tensor tSR_rAux = make_tensor(tC_rAux.data(), RLayoutS2R{});                                 // (S2R,S2R_M,S2R_N)
-
-      int load_pipe_index = load_iteration % Stages;
-      copy(tiled_s2r, tSR_sAux(_,_,_,load_pipe_index), tSR_rAux);
-    }
-
-    template <typename ElementAccumulator, int FragmentSize>
-    CUTLASS_DEVICE Array<Element, FragmentSize>
-    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
-      Tensor tC_rAux_frg = recast<Array<Element, FragmentSize>>(coalesce(tC_rAux));                          // (EPI_V)
-
-      return tC_rAux_frg(epi_v);
-    }
-  };
-
-  template <
-    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
-    class... Args
-  >
-  CUTLASS_DEVICE auto
-  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
-
-    auto [M, N, K, L] = args.problem_shape_mnkl;
-
-    Tensor mAux_mn = params_ptr->tma_load_aux.get_tma_tensor(make_shape(M,N,L));                             // (M,N,L)
-    Tensor mAux = coalesce(mAux_mn, take<0,2>(args.tile_shape_mnk));
-    Tensor tC_gAux = sm90_partition_for_epilogue<ReferenceSrc                          // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
-      >(mAux, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx);
-    Tensor tC_rAux = make_tensor<Element>(take<0,3>(shape(tC_gAux)));                  // (CPY,CPY_M,CPY_N)
-
-    auto tiled_s2r = conditional_return<ReferenceSrc>(
-      make_tiled_copy_S(Copy_Atom<CopyOpS2R,Element>{}, args.tiled_copy),
-      make_tiled_copy_D(Copy_Atom<CopyOpS2R,Element>{}, args.tiled_copy)
-    );
-    Tensor sAux_epi = cute::as_position_independent_swizzle_tensor(
-                        make_tensor(make_smem_ptr(smem_aux), SmemLayout{}));            // (EPI_TILE_M,EPI_TILE_N,PIPE)
-    auto tSR_sAux = tiled_s2r.get_slice(args.thread_idx).partition_S(sAux_epi);               // (S2R,S2R_M,S2R_N,PIPE)
-
-    return ConsumerStoreCallbacks<decltype(tC_rAux), decltype(tiled_s2r), decltype(tSR_sAux)>(
-        cute::move(tC_rAux), tiled_s2r, cute::move(tSR_sAux), params_ptr);
-  }
-};
-
-template <
-  class Element,
-  class EpilogueTile,   // Unused
-  class LayoutOrStrideMNL,
-  class SmemLayoutAtom, // Unused
-  class CopyOpS2R,      // Unused
-  int Alignment,
-  bool EnableNullptr
->
-struct Sm90AuxLoad<
-  0, EpilogueTile, Element, LayoutOrStrideMNL, 
-  SmemLayoutAtom, CopyOpS2R, Alignment, EnableNullptr
-> {
-  using ElementAux = Element;
-  using StrideMNL = cutlass::gemm::TagToStrideC_t<LayoutOrStrideMNL>;
-
-  struct SharedStorage { };
-
-  struct Arguments {
-    Element const* ptr_aux = nullptr;
-    Element null_default = Element(0);
-    StrideMNL dAux = {};
-  };
-
-  using Params = Arguments;
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    return args;
-  }
-
-  template <class ProblemShape>
-  static bool
-  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
-    return true;
-  }
-
-  template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
-    return 0;
-  }
-
-  template <class ProblemShape>
-  static cutlass::Status
-  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-    return cutlass::Status::kSuccess;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Sm90AuxLoad() { }
-
-  CUTLASS_HOST_DEVICE
-  Sm90AuxLoad(Params const& params, SharedStorage const& shared_storage)
-    : params_ptr(&params) { }
-  
-  Params const* params_ptr;
-
-  CUTLASS_DEVICE bool
-  is_producer_load_needed() const {
-    return false;
-  }
-
-  CUTLASS_DEVICE bool
-  is_C_load_needed() const {
-    return false;
-  }
-
-  template <class... Args>
-  CUTLASS_DEVICE auto
-  get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
-    return EmptyProducerLoadCallbacks{};
-  }
-
-  template<
-    class GTensorG2R,
-    class RTensor,
-    class CTensorG2R,
-    class ProblemShapeMNL
-  >
-  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
-    CUTLASS_DEVICE
-    ConsumerStoreCallbacks(GTensorG2R&& tC_gAux,
-        RTensor&& tC_rAux,
-        CTensorG2R&& tC_cAux,
-        ProblemShapeMNL problem_shape_mnl,
-        Params const* params_ptr)
-      : tC_gAux(cute::forward<GTensorG2R>(tC_gAux)),
-        tC_rAux(cute::forward<RTensor>(tC_rAux)),
-        tC_cAux(cute::forward<CTensorG2R>(tC_cAux)),
-        problem_shape_mnl(problem_shape_mnl),
-        params_ptr(params_ptr) {}
-    
-    GTensorG2R tC_gAux;
-    RTensor tC_rAux;
-    CTensorG2R tC_cAux;
-    ProblemShapeMNL problem_shape_mnl;
-    Params const* params_ptr;
-
-    CUTLASS_DEVICE void
-    begin_loop(int epi_m, int epi_n) {
-      if constexpr (EnableNullptr) {
-        if (params_ptr->ptr_aux == nullptr) {
-          fill(tC_rAux, params_ptr->null_default);
-          return;
-        }
-      }
-      constexpr auto MCL = decltype(max_common_layout(tC_gAux(_,_,_,_0{},_0{}), tC_rAux)){};
-      constexpr int V = cute::min(Alignment, size(MCL));
-
-      Tensor tC_cAux_mn = tC_cAux(_,_,_,epi_m,epi_n);
-      Tensor tC_cAux_vec = tensor<1>(zipped_divide(coalesce(tC_cAux_mn), MCL.compose(Int<V>{})));
-      
-      Tensor tC_gAux_vec = recast<Array<Element, V>>(coalesce(tC_gAux(_,_,_,epi_m,epi_n)));
-      Tensor tC_rAux_vec = recast<Array<Element, V>>(coalesce(tC_rAux));
-
-      auto pred_fn = [&] (auto const&... coords) {
-        return elem_less(tC_cAux_vec(coords...), problem_shape_mnl);
-      };
-
-      copy_if(pred_fn, tC_gAux_vec, tC_rAux_vec);
-    }
-
-    template <typename ElementAccumulator, int FragmentSize>
-    CUTLASS_DEVICE Array<Element, FragmentSize>
-    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
-      return recast<Array<Element, FragmentSize>>(tC_rAux)(epi_v);
-    }
-  };
-
-  template <
-    bool ReferenceSrc,
-    class... Args
-  >
-  CUTLASS_DEVICE auto
-  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
-    auto [M, N, K, L] = args.problem_shape_mnkl;
-    auto [m, n, k, l] = args.tile_coord_mnkl;
-
-    auto problem_shape_mnl = make_shape(M,N,L);
-
-    // Gmem Tensor
-    Tensor mAux = make_tensor(
-      make_gmem_ptr(params_ptr->ptr_aux), make_shape(M,N,L), params_ptr->dAux
-    );
-    Tensor tC_gAux = sm90_partition_for_epilogue<ReferenceSrc>(
-      mAux, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx);
-
-    // Register Tensor
-    Tensor tC_rAux = make_tensor<Element>(take<0,3>(shape(tC_gAux)));
-
-    // Predication support
-    Tensor coordAux = make_identity_tensor(shape(mAux));
-    Tensor tC_cAux = sm90_partition_for_epilogue<ReferenceSrc>(
-      coordAux, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx);
-
-    return ConsumerStoreCallbacks<decltype(tC_gAux), decltype(tC_rAux), decltype(tC_cAux), decltype(problem_shape_mnl)>(
-      cute::move(tC_gAux),
-      cute::move(tC_rAux),
-      cute::move(tC_cAux),
-      problem_shape_mnl,
-      params_ptr
-    );
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Broadcast Load Operations
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Scalar broadcast
-// Supports reduction over multiple broadcasts to support fusions such as fp8 scaling factors
-template<
-  class Element,
-  class StrideMNL_ = Stride<_0,_0,_0>,
-  int BroadcastCount = 1,
-  template <class> class ReductionFn = multiplies
->
-struct Sm90ScalarBroadcast {
-  using StrideMNL = StrideMNL_;
-  static_assert(is_static_v<decltype(take<0,2>(StrideMNL{}))>); // batch stride can be dynamic or static
-  static_assert(take<0,2>(StrideMNL{}) == Stride<_0,_0>{});
-
-  struct SharedStorage { };
-
-  struct Arguments {
-    Element scalars[BroadcastCount] = {};
-    Element const* scalar_ptrs[BroadcastCount] = {};
-    StrideMNL dScalar[BroadcastCount] = {};
-  };
-
-  using Params = Arguments;
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    return args;
-  }
-
-  template <class ProblemShape>
-  static bool
-  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
-    return true;
-  }
-  
-  template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
-    return 0;
-  }
-
-  template <class ProblemShape>
-  static cutlass::Status
-  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
-    CudaHostAdapter *cuda_adapter = nullptr) {
-    return cutlass::Status::kSuccess;
-  }
-
-  CUTLASS_DEVICE bool
-  is_producer_load_needed() const {
-    return false;
-  }
-
-  CUTLASS_DEVICE bool
-  is_C_load_needed() const {
-    return false;
-  }
-
-  // This must be called after update_scalar is called
-  CUTLASS_DEVICE bool
-  is_zero() const {
-    if (get<2>(params_ptr->dScalar[0]) == 0) { 
-      // Only 1 batch
-      return scalar == Element(0);
-    }
-    else { 
-      // multiple batch
-      if (valid_scalar == false) {
-        // for stridedBatch kernel, if ptr has a valid address, we need to enable the epi_load warps.
-        return params_ptr->scalar_ptrs[0] == nullptr;
-      }
-      else {
-        // Check whether each batch is ZERO or not.
-        return scalar == Element(0);
-      }
-    }
-  }
-
-  CUTLASS_HOST_DEVICE
-  Sm90ScalarBroadcast() { }
-
-  CUTLASS_HOST_DEVICE
-  Sm90ScalarBroadcast(Params const& params, SharedStorage const& shared_storage)
-      : params_ptr(&params) {
-    // Get the scalar for non-batched broadcast
-    if (size<2>(params_ptr->dScalar[0]) == 0) {
-      update_scalar();
-    }
-  }
-
-  Element scalar;
-  bool valid_scalar = false;
-  Params const* params_ptr;
-
-  template <class... Args>
-  CUTLASS_DEVICE auto
-  get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
-    // Get the scalar for batched broadcast
-    if (size<2>(params_ptr->dScalar[0]) != 0) {
-      auto [m_coord, n_coord, k_coord, l_coord] = args.tile_coord_mnkl;
-      update_scalar(l_coord);
-    }
-
-    return EmptyProducerLoadCallbacks{};
-  }
-
-  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
-    CUTLASS_DEVICE
-    ConsumerStoreCallbacks(Element scalar)
-      : scalar(scalar) {}
-
-    Element scalar;
-
-    template <typename ElementAccumulator, int FragmentSize>
-    CUTLASS_DEVICE Array<Element, FragmentSize>
-    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
-      Array<Element, FragmentSize> frg_scalar;
-      frg_scalar.fill(scalar);
-
-      return frg_scalar;
-    }
-
-  };
-
-  template <
-    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
-    class... Args
-  >
-  CUTLASS_DEVICE auto
-  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
-
-    // Get the scalar for batched broadcast
-    if (get<2>(params_ptr->dScalar[0]) != 0) {
-      auto [m_coord, n_coord, k_coord, l_coord] = args.tile_coord_mnkl;
-      update_scalar(l_coord);
-    }
-
-    return ConsumerStoreCallbacks(scalar);
-  }
-
-private:
-  CUTLASS_DEVICE void
-  update_scalar(int l_coord = 0) {
-    valid_scalar = true;
-    int l_offset = l_coord * size<2>(params_ptr->dScalar[0]);
-
-    if (params_ptr->scalar_ptrs[0] != nullptr) {
-      scalar = params_ptr->scalar_ptrs[0][l_offset];
-    } 
-    else {
-      // batch stride is ignored for nullptr fallback
-      scalar = params_ptr->scalars[0];
-    }
-
-    // Do reduction over multiple broadcasts if necessary
-    ReductionFn<Element> reduction_fn;
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 1; i < BroadcastCount; ++i) {
-      if (params_ptr->scalar_ptrs[i] != nullptr) {
-        int rest_l_offset = l_coord * size<2>(params_ptr->dScalar[i]);
-        scalar = reduction_fn(scalar, params_ptr->scalar_ptrs[i][rest_l_offset]);
-      } 
-      else {
-        // batch stride is ignored for nullptr fallback
-        scalar = reduction_fn(scalar, params_ptr->scalars[i]);
-      }
-    }
-  }
-
-  template<class... Xs>
-  CUTLASS_DEVICE void
-  update_scalar(cute::tuple<Xs...>) {
-    // Only support multiple L-modes with fully-broadcast scalar
-    scalar = params_ptr->scalars[0];
-    valid_scalar = true;
-  }
-};
-
-// Scalar broadcast
-// Supports reduction over multiple broadcasts to support fusions such as fp8 scaling factors
-template<
-  class Element,
-  class StrideMNL = Stride<_0,_0,_0>,
-  int BroadcastCount = 1,
-  template <class> class ReductionFn = multiplies
->
-struct Sm90ScalarBroadcastPtrArray {
-  static_assert(is_static_v<decltype(take<0,2>(StrideMNL{}))>); // batch stride can be dynamic or static
-  static_assert(take<0,2>(StrideMNL{}) == Stride<_0,_0>{});
-
-  struct SharedStorage { };
-
-  struct Arguments {
-    Element scalars[BroadcastCount] = {};
-    Element const* scalar_ptrs[BroadcastCount] = {};
-    Element const* const* scalar_ptr_arrays[BroadcastCount] = {};
-    StrideMNL dScalar[BroadcastCount] = {};
-  };
-
-  using Params = Arguments;
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    return args;
-  }
-
-  template <class ProblemShape>
-  static bool
-  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
-    return true;
-  }
-  
-  template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
-    return 0;
-  }
-
-  template <class ProblemShape>
-  static cutlass::Status
-  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
-    CudaHostAdapter *cuda_adapter = nullptr) {
-    return cutlass::Status::kSuccess;
-  }
-
-  CUTLASS_DEVICE bool
-  is_producer_load_needed() const {
-    // producer load is needed if Element is not void and we have multiple scalars
-    return !cute::is_void_v<Element> and size<2>(params_ptr->dScalar[0]) != 0;
-  }
-
-  CUTLASS_DEVICE bool
-  is_C_load_needed() const {
-    return false;
-  }
-
-  // This must be called after update_scalar is called
-  CUTLASS_DEVICE bool
-  is_zero() const {
-    return scalar == Element(0);
-  }
-
-  CUTLASS_HOST_DEVICE
-  Sm90ScalarBroadcastPtrArray() { }
-
-  CUTLASS_HOST_DEVICE
-  Sm90ScalarBroadcastPtrArray(Params const& params, SharedStorage const& shared_storage)
-      : params_ptr(&params) {
-    // Get the scalar for non-batched broadcast
-    if (size<2>(params_ptr->dScalar[0]) == 0) {
-      update_scalar();
-    }
-  }
-
-  Element scalar;
-  Params const* params_ptr;
-
-  template <class... Args>
-  CUTLASS_DEVICE auto
-  get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
-    // Get the scalar for batched broadcast
-    if (get<2>(params_ptr->dScalar[0]) != 0) {
-      auto [m_coord, n_coord, k_coord, l_coord] = args.tile_coord_mnkl;
-      update_scalar(l_coord);
-    }
-
-    return EmptyProducerLoadCallbacks{};
-  }
-
-  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
-    CUTLASS_DEVICE
-    ConsumerStoreCallbacks(Element scalar)
-      : scalar(scalar) {}
-
-    Element scalar;
-
-    template <typename ElementAccumulator, int FragmentSize>
-    CUTLASS_DEVICE Array<Element, FragmentSize>
-    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
-      Array<Element, FragmentSize> frg_scalar;
-      frg_scalar.fill(scalar);
-
-      return frg_scalar;
-    }
-
-  };
-
-  template <
-    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
-    class... Args
-  >
-  CUTLASS_DEVICE auto
-  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
-
-    // Get the scalar for batched broadcast
-    if (get<2>(params_ptr->dScalar[0]) != 0) {
-      auto [m_coord, n_coord, k_coord, l_coord] = args.tile_coord_mnkl;
-      update_scalar(l_coord);
-    }
-
-    return ConsumerStoreCallbacks(scalar);
-  }
-
-private:
-  CUTLASS_DEVICE void
-  update_scalar(int l_coord = 0) {
-    int l_offset = l_coord * size<2>(params_ptr->dScalar[0]);
-
-    if (params_ptr->scalar_ptr_arrays[0] != nullptr) {
-      scalar = *(params_ptr->scalar_ptr_arrays[0][l_offset]);
-    }
-    else if (params_ptr->scalar_ptrs[0] != nullptr) {
-      scalar = params_ptr->scalar_ptrs[0][l_offset];
-    }
-    else {
-      // batch stride is ignored for nullptr fallback
-      scalar = params_ptr->scalars[0];
-    }
-
-    // Do reduction over multiple broadcasts if necessary
-    ReductionFn<Element> reduction_fn;
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 1; i < BroadcastCount; ++i) {
-
-      if (params_ptr->scalar_ptr_arrays[i] != nullptr) {
-        int rest_l_offset = l_coord * size<2>(params_ptr->dScalar[i]);
-        scalar = reduction_fn(scalar, *(params_ptr->scalar_ptr_arrays[i][rest_l_offset]));
-      }
-      if (params_ptr->scalar_ptrs[i] != nullptr) {
-        int rest_l_offset = l_coord * size<2>(params_ptr->dScalar[i]);
-        scalar = reduction_fn(scalar, params_ptr->scalar_ptrs[i][rest_l_offset]);
-      } 
-      else {
-        // batch stride is ignored for nullptr fallback
-        scalar = reduction_fn(scalar, params_ptr->scalars[i]);
-      }
-    }
-  }
-};
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace detail {
-
-template <int StagesC, class CtaTileShapeMNK, class EpilogueTile>
-[[deprecated("row broadcast only uses 0 stages")]] constexpr int
-compute_row_broadcast_stages() {
-  return ceil_div(StagesC, size<1>(zipped_divide(make_layout(take<0,2>(CtaTileShapeMNK{})), EpilogueTile{}))) + 1;
-}
-
-}
-
-// Row vector broadcast
-template<
-  int Stages,
-  class CtaTileShapeMNK,
-  class ElementInput,
-  class ElementCompute = ElementInput,
-  class StrideMNL_ = Stride<_0,_1,_0>,
-  int Alignment = 128 / sizeof_bits_v<ElementInput>,
-  bool EnableNullptr = true // Fallback scalar broadcast for nullptr params
->
-struct Sm90RowBroadcast {
-  using StrideMNL = StrideMNL_;
-  static_assert(Stages == 0, "Row broadcast doesn't support smem pipelining");
-
-  static constexpr bool IsDynamicBroadcast = is_same_v<remove_cvref_t<decltype(get<1>(StrideMNL{}))>, bool>; // row vector or scalar broadcast
-  static_assert(is_static_v<decltype(take<0,2>(StrideMNL{}))> || IsDynamicBroadcast); // batch stride can be dynamic or static
-  static_assert(take<0,2>(StrideMNL{}) == Stride<_0,_1>{} || IsDynamicBroadcast);
-
-  struct SharedStorage { 
-    array_aligned<ElementInput, size<1>(CtaTileShapeMNK{})> smem;
-  };
-
-  struct Arguments {
-    ElementInput const* ptr_row = nullptr;
-    ElementInput null_default = ElementInput(0);
-    StrideMNL dRow = {};
-  };
-
-  struct Params {
-    ElementInput const* ptr_row = nullptr;
-    ElementCompute null_default = ElementCompute(0);
-    StrideMNL dRow = {};
-  };
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    return {args.ptr_row, ElementCompute(args.null_default), args.dRow};
-  }
-
-  template <class ProblemShape>
-  static bool
-  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
-    return true;
-  }
-
-  template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
-    return 0;
-  }
-
-  template <class ProblemShape>
-  static cutlass::Status
-  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-    return cutlass::Status::kSuccess;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Sm90RowBroadcast() { }
-
-  CUTLASS_HOST_DEVICE
-  Sm90RowBroadcast(Params const& params, SharedStorage const& shared_storage)
-      : params(params), is_zero_(false),
-        smem(const_cast<ElementInput*>(shared_storage.smem.data())) {
-    auto const& [stride_M, stride_N, stride_L] = params.dRow;
-    // Nullptr default
-    if (EnableNullptr && params.ptr_row == nullptr) {
-      is_zero_ = params.null_default == ElementCompute(0);
-    }
-    // Dynamic non-batched scalar broadcast
-    else if (IsDynamicBroadcast && stride_N == bool(0) && stride_L == repeat_like(stride_L, 0)) {
-      is_zero_ = params.ptr_row[0] == ElementInput(0);
-    }
-  }
-
-  Params params;
-  bool is_zero_ = false;
-  ElementInput *smem = nullptr;
-
-  CUTLASS_DEVICE bool
-  is_producer_load_needed() const {
-    return false;
-  }
-
-  CUTLASS_DEVICE bool
-  is_C_load_needed() const {
-    return false;
-  }
-
-  CUTLASS_DEVICE bool
-  is_zero() const {
-    return is_zero_;
-  }
-
-  template <class... Args>
-  CUTLASS_DEVICE auto
-  get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
-    return EmptyProducerLoadCallbacks{};
-  }
-
-  template <class GS_GTensor, class GS_STensor, class GS_CTensor, class Tiled_G2S, class SR_STensor, class SR_RTensor, class Residue, class ThrNum>
-  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
-    CUTLASS_DEVICE
-    ConsumerStoreCallbacks(
-        GS_GTensor tGS_gRow_, GS_STensor tGS_sRow_, 
-        GS_CTensor tGS_cRow_, Tiled_G2S tiled_g2s_, 
-        SR_STensor tSR_sRow_, SR_RTensor tSR_rRow_,
-        Residue residue_cRow_, ThrNum thr_num_, Params const& params_)
-      : tGS_gRow(tGS_gRow_)
-      , tGS_sRow(tGS_sRow_)
-      , tGS_cRow(tGS_cRow_)
-      , tiled_G2S(tiled_g2s_)
-      , tSR_sRow(tSR_sRow_)
-      , tSR_rRow(tSR_rRow_)
-      , residue_cRow(residue_cRow_)
-      , params(params_)
-      , is_nullptr(EnableNullptr && params_.ptr_row == nullptr) {
-      if (is_nullptr) {
-        fill(tSR_rRow, params.null_default);
-      }
-    }
-
-    GS_GTensor tGS_gRow;                                                         // (CPY,CPY_M,CPY_N)
-    GS_STensor tGS_sRow;                                                         // (CPY,CPY_M,CPY_N)
-    GS_CTensor tGS_cRow;                                                         // (CPY,CPY_M,CPY_N)
-    Tiled_G2S tiled_G2S;
-
-    SR_STensor tSR_sRow;                                                         // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
-    SR_RTensor tSR_rRow;                                                         // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) 
-  
-    Residue residue_cRow;                                                        // (m, n)
-    ThrNum thr_num;
-    Params const& params;
-    bool is_nullptr;
-
-    CUTLASS_DEVICE void
-    begin() {
-      if (is_nullptr) {
-        return;
-      }
-
-      auto synchronize = [&] () { cutlass::arch::NamedBarrier::sync(thr_num, cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); };
-      Tensor tGS_gRow_flt = filter_zeros(tGS_gRow);
-      Tensor tGS_sRow_flt = filter_zeros(tGS_sRow);
-      Tensor tGS_cRow_flt = filter_zeros(tGS_cRow, tGS_gRow.stride());
-
-      for (int i = 0; i < size(tGS_gRow_flt); ++i) {
-        if (get<1>(tGS_cRow_flt(i)) >= size<1>(CtaTileShapeMNK{})) {
-          continue; // OOB of SMEM, 
-        }
-        if (elem_less(tGS_cRow_flt(i), residue_cRow)) {
-          tGS_sRow_flt(i) = tGS_gRow_flt(i);
-        }
-        else {
-          tGS_sRow_flt(i) = ElementInput(0); // Set to Zero when OOB so LDS can be issued without any preds.
-        }
-      }
-      synchronize();
-    }
-
-    CUTLASS_DEVICE void
-    begin_loop(int epi_m, int epi_n) {
-      if (epi_m == 0 and not is_nullptr) { // Assumes M-major subtile loop
-        Tensor tSR_sRow_flt = filter_zeros(tSR_sRow(_,_,_,epi_m,epi_n));
-        Tensor tSR_rRow_flt = make_tensor_like<ElementInput>(tSR_sRow_flt);
-        copy_aligned(tSR_sRow_flt, tSR_rRow_flt);
-
-        constexpr int FrgSize = size(tSR_rRow_flt);
-        using FrgInput = Array<ElementInput, FrgSize>;
-        using FrgCompute = Array<ElementCompute, FrgSize>;
-        using ConvertInput = NumericArrayConverter<ElementCompute, ElementInput, FrgSize>;
-
-        Tensor tSR_rRow_input_frg = recast<FrgInput>(coalesce(tSR_rRow_flt));
-        Tensor tSR_rRow_compute_frg = recast<FrgCompute>(filter(tSR_rRow));
-        ConvertInput convert_input{};
-
-        tSR_rRow_compute_frg(_0{}) = convert_input(tSR_rRow_input_frg(_0{}));
-      }
-    }
-
-    template <typename ElementAccumulator, int FragmentSize>
-    CUTLASS_DEVICE Array<ElementCompute, FragmentSize>
-    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
-      Array<ElementCompute, FragmentSize> frg_row;
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < FragmentSize; ++i) {
-        frg_row[i] = tSR_rRow(epi_v * FragmentSize + i);
-      }
-
-      return frg_row;
-    }
-  };
-
-  template <
-    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
-    class... Args
-  >
-  CUTLASS_DEVICE auto
-  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
-    auto [M, N, K, L] = args.problem_shape_mnkl;
-    auto [m, n, k, l] = args.tile_coord_mnkl;
-    using ThreadCount = decltype(size(args.tiled_copy));
-
-    auto layout_N = [&] () {
-      auto shape_N = get<1>(args.problem_shape_mnkl);
-      if constexpr (IsDynamicBroadcast) {
-        auto stride_N = repeat_like(shape_N, int(0));
-        if (get<1>(params.dRow) == bool(1)) {
-          stride_N = transform_leaf(compact_major<LayoutLeft>(shape_N),
-            [] (auto const& stride) { return static_cast<int>(stride); }
-          );
-        }
-        return make_layout(shape_N, stride_N);
-      }
-      else {
-        return make_layout(shape_N);
-      }
-    }();
-
-    auto layout_M = make_layout(M, repeat_like(M, _0{}));
-    auto layout_L = make_layout(L, get<2>(params.dRow));
-    Tensor mRow = make_tensor(make_gmem_ptr(params.ptr_row), make_layout(layout_M,layout_N,layout_L));
-    Tensor gRow = local_tile(mRow(_,_,l), take<0,2>(args.tile_shape_mnk), make_coord(m, n));          // (CTA_M, CTA_N)
-    Tensor sRow = make_tensor(make_smem_ptr(smem), 
-        make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{})), make_shape(_0{}, _1{}));  // (CTA_M, CTA_N)
-    //// G2S: Gmem to Smem
-    auto tiled_g2s = make_tiled_copy(Copy_Atom<DefaultCopy, ElementInput>{},
-                                     Layout< Shape<_1, ThreadCount>, 
-                                            Stride<_0,          _1>>{}, 
-                                     Layout<_1>{});   
-    auto thr_g2s = tiled_g2s.get_slice(args.thread_idx);
-    Tensor tGS_gRow = thr_g2s.partition_S(gRow);
-    Tensor tGS_sRow = thr_g2s.partition_D(sRow);
-
-    //// G2S: Coord 
-    Tensor tGS_cRow = thr_g2s.partition_S(args.cD);
-
-    //// S2R: Smem to Reg
-    Tensor tSR_sRow = sm90_partition_for_epilogue<ReferenceSrc>(sRow, args.epi_tile, args.tiled_copy, args.thread_idx);
-    Tensor tSR_rRow = make_tensor_like<ElementCompute>(take<0,3>(tSR_sRow));                        // (CPY,CPY_M,CPY_N)
-
-    return ConsumerStoreCallbacks(
-      tGS_gRow, 
-      tGS_sRow, 
-      tGS_cRow, tiled_g2s, 
-      tSR_sRow, 
-      tSR_rRow, 
-      args.residue_cD,
-      ThreadCount{}, 
-      params);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Column vector broadcast
-template<
-  int Stages,
-  class CtaTileShapeMNK,
-  class ElementInput,
-  class ElementCompute = ElementInput,
-  class StrideMNL_ = Stride<_1,_0,_0>,
-  int Alignment = 128 / sizeof_bits_v<ElementInput>,
-  bool EnableNullptr = true // Fallback scalar broadcast for nullptr params
->
-struct Sm90ColBroadcast {
-  using StrideMNL = StrideMNL_;
-  static_assert(Stages == 0, "Column broadcast doesn't support smem pipelining");
-
-  static constexpr bool IsDynamicBroadcast = is_same_v<remove_cvref_t<decltype(get<0>(StrideMNL{}))>, bool>; // Column vector or scalar broadcast
-  static_assert(is_static_v<decltype(take<0,2>(StrideMNL{}))> || IsDynamicBroadcast); // batch stride can be dynamic or static
-  static_assert(take<0,2>(StrideMNL{}) == Stride<_1,_0>{} || IsDynamicBroadcast);
-
-  // Accumulator distributes col elements evenly amongst threads so we can just directly load from gmem
-  struct SharedStorage { };
-
-  struct Arguments {
-    ElementInput const* ptr_col = nullptr;
-    ElementInput null_default = ElementInput(0);
-    StrideMNL dCol = {};
-  };
-
-  struct Params {
-    ElementInput const* ptr_col = nullptr;
-    ElementCompute null_default = ElementCompute(0);
-    StrideMNL dCol = {};
-  };
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    return {args.ptr_col, ElementCompute(args.null_default), args.dCol};
-  }
-
-  template <class ProblemShape>
-  static bool
-  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
-    return true;
-  }
-
-  template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
-    return 0;
-  }
-
-  template <class ProblemShape>
-  static cutlass::Status
-  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-    return cutlass::Status::kSuccess;
-  }
-
-  CUTLASS_DEVICE bool
-  is_producer_load_needed() const {
-    return false;
-  }
-
-  CUTLASS_DEVICE bool
-  is_C_load_needed() const {
-    return false;
-  }
-
-  CUTLASS_DEVICE bool
-  is_zero() const {
-    return is_zero_;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Sm90ColBroadcast() { }
-
-  CUTLASS_HOST_DEVICE
-  Sm90ColBroadcast(Params const& params, SharedStorage const& shared_storage)
-      : params(params), is_zero_(false) {
-    auto const& [stride_M, stride_N, stride_L] = params.dCol;
-    // Nullptr default
-    if (EnableNullptr && params.ptr_col == nullptr) {
-      is_zero_ = params.null_default == ElementCompute(0);
-    }
-    // Dynamic non-batched scalar broadcast
-    else if (IsDynamicBroadcast && stride_M == bool(0) && stride_L == repeat_like(stride_L, 0)) {
-      is_zero_ = params.ptr_col[0] == ElementInput(0);
-    }
-  }
-
-  Params params;
-  bool is_zero_;
-
-  template <class... Args>
-  CUTLASS_DEVICE auto
-  get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
-    return EmptyProducerLoadCallbacks{};
-  }
-
-  template<class GTensor, class RTensor, class CTensor, class ThrResidue>
-  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
-    CUTLASS_DEVICE
-    ConsumerStoreCallbacks(GTensor tCgCol_, RTensor tCrCol_, CTensor tCcCol_, ThrResidue residue_tCcCol_, Params const& params_)
-      : tCgCol(tCgCol_),
-        tCrCol(tCrCol_),
-        tCcCol(tCcCol_),
-        residue_tCcCol(residue_tCcCol_),
-        params(params_) {
-      if (EnableNullptr && params.ptr_col == nullptr) {
-        fill(tCrCol, params.null_default);
-      }
-    }
-
-    GTensor tCgCol;                                                                    // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
-    RTensor tCrCol;                                                                    // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
-    CTensor tCcCol;                                                                    // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
-    ThrResidue residue_tCcCol;
-    Params const& params;
-
-    CUTLASS_DEVICE void
-    begin() {
-      if (EnableNullptr && params.ptr_col == nullptr) {
-        return;
-      }
-
-      // Filter so we don't issue redundant copies over stride-0 modes
-      // (only works if 0-strides are in same location, which is by construction)
-      Tensor tCgCol_flt = filter_zeros(tCgCol);
-      Tensor tCrCol_flt = make_tensor_like<ElementInput>(filter_zeros(tCrCol));
-      Tensor tCcCol_flt = filter_zeros(tCcCol, tCgCol.stride());
-
-      constexpr auto MCL = decltype(max_common_layout(tCgCol_flt, tCrCol_flt)){};
-      constexpr int V = cute::min(Alignment, size(MCL));
-      if constexpr (V > 1) {
-        using VecType = uint_bit_t<V * sizeof_bits_v<ElementInput>>;
-        Tensor tCgCol_vec = recast<VecType>(coalesce(tCgCol_flt));
-        Tensor tCrCol_vec = recast<VecType>(coalesce(tCrCol_flt));
-        Tensor tCcCol_vec = tensor<1>(zipped_divide(tCcCol_flt, MCL.compose(Int<V>{})));
-        auto pred_fn = [&] (auto const&... coords) { return elem_less(tCcCol_vec(coords...), residue_tCcCol); };
-        copy_if(pred_fn, tCgCol_vec, tCrCol_vec);
-      }
-      else {
-        auto pred_fn = [&] (auto const&... coords) { return elem_less(tCcCol_flt(coords...), residue_tCcCol); };
-        copy_if(pred_fn, tCgCol_flt, tCrCol_flt);
-      }
-
-      constexpr int FrgSize = size(tCrCol_flt);
-      using FrgInput = Array<ElementInput, FrgSize>;
-      using FrgCompute = Array<ElementCompute, FrgSize>;
-      using ConvertInput = NumericArrayConverter<ElementCompute, ElementInput, FrgSize>;
-
-      Tensor tCrCol_input_frg = recast<FrgInput>(coalesce(tCrCol_flt));
-      Tensor tCrCol_compute_frg = recast<FrgCompute>(filter(tCrCol));
-      ConvertInput convert_input{};
-
-      tCrCol_compute_frg(_0{}) = convert_input(tCrCol_input_frg(_0{}));
-    }
-
-    template <typename ElementAccumulator, int FragmentSize>
-    CUTLASS_DEVICE Array<ElementCompute, FragmentSize>
-    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
-      Array<ElementCompute, FragmentSize> frg_col;
-      Tensor tCrCol_mn = tCrCol(_,_,_,epi_m,epi_n);
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < FragmentSize; ++i) {
-        frg_col[i] = tCrCol_mn(epi_v * FragmentSize + i);
-      }
-
-      return frg_col;
-    }
-
-  };
-
-  template <
-    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
-    class... Args
-  >
-  CUTLASS_DEVICE auto
-  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
-
-    auto [M, N, K, L] = args.problem_shape_mnkl;
-    auto layout_M = [&] () {
-      auto shape_M = get<0>(args.problem_shape_mnkl);
-      if constexpr (IsDynamicBroadcast) {
-        auto stride_M = repeat_like(shape_M, int(0));
-        if (get<0>(params.dCol) == bool(1)) {
-          stride_M = transform_leaf(compact_major<LayoutLeft>(shape_M),
-            [] (auto const& stride) { return static_cast<int>(stride); }
-          );
-        }
-        return make_layout(shape_M, stride_M);
-      }
-      else {
-        return make_layout(shape_M);
-      }
-    }();
-
-    auto layout_N = make_layout(N, repeat_like(N, _0{}));
-    auto layout_L = make_layout(L, get<2>(params.dCol));
-    Tensor mCol = make_tensor(make_gmem_ptr(params.ptr_col), make_layout(layout_M,layout_N,layout_L));
-    Tensor tCgCol = sm90_partition_for_epilogue<ReferenceSrc>(                         // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
-      mCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx);
-
-    Tensor mCol_static = make_tensor(make_gmem_ptr(params.ptr_col), make_layout(make_layout(M),layout_N,layout_L));
-    Tensor tCgCol_static = sm90_partition_for_epilogue<ReferenceSrc>(                  // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
-      mCol_static, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx);
-    Tensor tCrCol = make_tensor_like<ElementCompute>(tCgCol_static);                   // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
-
-    return ConsumerStoreCallbacks(tCgCol, tCrCol, args.tCcD, args.residue_tCcD, params);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Batch matrix broadcast
-// Only need to redefine this if we can multicast across cluster L
-template <
-  int Stages,
-  class EpilogueTile,
-  class Element,
-  class StrideMNL,
-  class SmemLayoutAtom,
-  class CopyOpS2R,
-  int Alignment = 128 / sizeof_bits_v<Element>,
-  bool EnableNullptr = true // Fallback scalar broadcast for nullptr params
->
-using Sm90MatrixBroadcast
-  = Sm90AuxLoad<Stages, EpilogueTile, Element, StrideMNL, SmemLayoutAtom, CopyOpS2R, EnableNullptr>;
-
-namespace detail {
-
-template <typename Operation, typename = void>
-struct IsScalarBroadcast {
-  static constexpr bool value = false;
-};
-
-template <typename Operation>
-struct IsScalarBroadcast<Operation, cute::enable_if_t<is_same_v<decltype(take<0,2>(typename Operation::StrideMNL{})), Stride<_0,_0>>>> {
-  static constexpr bool value = true;
-};
-
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::epilogue::fusion
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_store_tma_warpspecialized.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_store_tma_warpspecialized.hpp
deleted file mode 100755
index f9ebe7393..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_store_tma_warpspecialized.hpp
+++ /dev/null
@@ -1,1736 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-  \brief Visitor tree store operations for the sm90 TMA warp-specialized (ws) epilogue
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/workspace.h"
-
-#include "cute/tensor.hpp"
-#include "sm90_visitor_tma_warpspecialized.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::epilogue::fusion {
-
-using namespace cute;
-using namespace detail;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Elementwise Store Operations
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  int Stages,
-  class EpilogueTile,
-  class Element,
-  FloatRoundStyle RoundStyle,
-  class StrideMNL,
-  class SmemLayoutAtom,
-  class CopyOpR2S,
-  int Alignment = 128 / sizeof_bits_v<Element>,
-  bool EnableNullptr = true // Noop on nullptr params
->
-struct Sm90AuxStore {
-  using ElementAux = Element;
-  static_assert(Alignment * sizeof_bits_v<Element> % 128 == 0, "sub-16B alignment not supported yet");
-
-  constexpr static bool is_m_major = epilogue::collective::detail::is_m_major<StrideMNL>();
-  // Find the max contiguous layout usable by TMA (if EpilogueTile is a non-compact tiler)
-  using SmemShapeTma = decltype(make_shape(
-      max_common_vector(make_layout(get<0>(EpilogueTile{})),make_layout(get<0>(EpilogueTile{}))),
-      max_common_vector(make_layout(get<1>(EpilogueTile{})),make_layout(get<1>(EpilogueTile{})))));
-  using SmemLayoutTma = decltype(tile_to_shape(
-      SmemLayoutAtom{}, SmemShapeTma{},
-      cute::conditional_t<is_m_major, Step<_2,_1>, Step<_1,_2>>{} ));
-  using SmemLayout = decltype(tile_to_shape(
-      SmemLayoutTma{},
-      make_shape(size<0>(shape(EpilogueTile{})), size<1>(shape(EpilogueTile{})), Int<Stages>{}),
-      cute::conditional_t<is_m_major, Step<_2,_1,_3>, Step<_1,_2,_3>>{} ));
-
-  struct SharedStorage {
-    alignas(cutlass::detail::alignment_for_swizzle(SmemLayout{}))
-    array_aligned<Element, size(SmemLayout{})> smem_aux;
-  };
-
-  struct Arguments {
-    Element* ptr_aux = nullptr;
-    StrideMNL dAux = {};
-  };
-
-  struct Params {
-    using TMA_Aux = decltype(make_tma_copy(
-        SM90_TMA_STORE{},
-        make_tensor(static_cast<Element*>(nullptr), repeat_like(StrideMNL{}, int32_t(0)), StrideMNL{}),
-        SmemLayoutTma{}));
-    TMA_Aux tma_store_aux;
-    bool is_nullptr = false;
-  };
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    // Optionally append 1s until problem shape is rank-4 in case its is only rank-3 (MNK)
-    auto problem_shape_mnkl = append<4>(problem_shape, 1);
-    auto [M, N, K, L] = problem_shape_mnkl;
-
-    bool is_nullptr = false;
-    if constexpr (EnableNullptr) {
-      is_nullptr = args.ptr_aux == nullptr;
-    }
-
-    typename Params::TMA_Aux tma_store_aux;
-    if (not is_nullptr) {
-      Tensor tensor_aux = make_tensor(args.ptr_aux, make_layout(make_shape(M,N,L), args.dAux));
-      tma_store_aux = make_tma_copy(SM90_TMA_STORE{}, tensor_aux, SmemLayoutTma{});
-    }
-
-    return {tma_store_aux, is_nullptr};
-  }
-
-  template <class ProblemShape>
-  static bool
-  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
-    return true;
-  }
-
-  template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
-    return 0;
-  }
-
-  template <class ProblemShape>
-  static cutlass::Status
-  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-    return cutlass::Status::kSuccess;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Sm90AuxStore() { }
-
-  CUTLASS_HOST_DEVICE
-  Sm90AuxStore(Params const& params, SharedStorage const& shared_storage)
-      : params_ptr(&params),
-        smem_aux(const_cast<Element*>(shared_storage.smem_aux.data())) { }
-
-  Params const* params_ptr;
-  Element* smem_aux;
-
-  CUTLASS_DEVICE bool
-  is_producer_load_needed() const {
-    return false;
-  }
-
-  CUTLASS_DEVICE bool
-  is_C_load_needed() const {
-    return false;
-  }
-
-  template <class... Args>
-  CUTLASS_DEVICE auto
-  get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
-    return EmptyProducerLoadCallbacks{};
-  }
-
-  template <
-    class RTensor,
-    class TiledR2S,
-    class STensorR2S,
-    class STensorS2G,
-    class GTensorS2G
-  >
-  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
-    CUTLASS_DEVICE
-    ConsumerStoreCallbacks(
-          RTensor&& tC_rAux,
-          TiledR2S tiled_r2s,
-          STensorR2S&& tRS_sAux,
-          STensorS2G&& bSG_sAux,
-          GTensorS2G&& bSG_gAux,
-          Params const* params_ptr)
-      : tiled_r2s(tiled_r2s),
-        tC_rAux(cute::forward<RTensor>(tC_rAux)),
-        tRS_sAux(cute::forward<STensorR2S>(tRS_sAux)),
-        bSG_sAux(cute::forward<STensorS2G>(bSG_sAux)),
-        bSG_gAux(cute::forward<GTensorS2G>(bSG_gAux)),
-        params_ptr(params_ptr) {}
-
-    TiledR2S tiled_r2s;
-    RTensor tC_rAux;                                                                   // (CPY,CPY_M,CPY_N)
-    STensorR2S tRS_sAux;                                                               // (R2S,R2S_M,R2S_N,PIPE)
-    STensorS2G bSG_sAux;                                                               // (S2G,S2G_M,S2G_N,PIPE)
-    GTensorS2G bSG_gAux;                                                               // (S2G,S2G_M,S2G_N,EPI_M,EPI_N)
-    Params const* params_ptr;
-
-    template <typename ElementAccumulator, typename ElementInput, int FragmentSize>
-    CUTLASS_DEVICE auto
-    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n,
-          Array<ElementInput, FragmentSize> const& frg_input) {
-      using ConvertInput = NumericArrayConverter<Element, ElementInput, FragmentSize, RoundStyle>;
-      ConvertInput convert_input{};
-
-      Tensor tC_rAux_frg = recast<Array<Element, FragmentSize>>(coalesce(tC_rAux));                          // (EPI_V)
-      tC_rAux_frg(epi_v) = convert_input(frg_input);
-
-      return frg_input;
-    }
-
-    CUTLASS_DEVICE void
-    postreduce(int epi_m, int epi_n, int store_iteration, bool issue_smem_store) {
-      if constexpr (EnableNullptr) {
-        if (params_ptr->is_nullptr) {
-          return;
-        }
-      }
-
-      using RLayoutR2S = decltype(cute::layout(TiledR2S{}.get_slice(0).retile_S(RTensor{})));
-      Tensor tRS_rAux = make_tensor(tC_rAux.data(), RLayoutR2S{});                                 // (R2S,R2S_M,R2S_N)
-
-      if (issue_smem_store) {
-        int store_pipe_index = store_iteration % Stages;
-        copy(tiled_r2s, tRS_rAux, tRS_sAux(_,_,_,store_pipe_index));
-      }
-    }
-
-    CUTLASS_DEVICE void
-    tma_store(int epi_m, int epi_n, int store_iteration, bool issue_tma_store) {
-      if constexpr (EnableNullptr) {
-        if (params_ptr->is_nullptr) {
-          return;
-        }
-      }
-
-      if (issue_tma_store) {
-        // Issue the TMA store
-        int store_pipe_index = store_iteration % Stages;
-        copy(params_ptr->tma_store_aux, bSG_sAux(_,_,_,store_pipe_index), bSG_gAux(_,_,_,epi_m,epi_n));
-      }
-    }
-  };
-
-  template <
-    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
-    class... Args
-  >
-  CUTLASS_DEVICE auto
-  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
-
-    auto [M, N, K, L] = args.problem_shape_mnkl;
-    auto [m, n, k, l] = args.tile_coord_mnkl;
-    Tensor mAux = params_ptr->tma_store_aux.get_tma_tensor(make_shape(M,N,L));                               // (M,N,L)
-    Tensor gAux = local_tile(mAux, take<0,2>(args.tile_shape_mnk), make_coord(m,n,l));                 // (CTA_M,CTA_N)
-
-    Tensor tC_gAux = sm90_partition_for_epilogue<ReferenceSrc>(                        // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
-                      gAux, args.epi_tile, args.tiled_copy, args.thread_idx);
-    Tensor tC_rAux = make_tensor<Element>(take<0,3>(shape(tC_gAux)));                  // (CPY,CPY_M,CPY_N)
-
-    Tensor sAux_epi = cute::as_position_independent_swizzle_tensor(
-                        make_tensor(make_smem_ptr(smem_aux), SmemLayout{}));     // (EPI_TILE_M,EPI_TILE_N,PIPE)
-    Tensor gAux_epi = flat_divide(gAux, args.epi_tile);                          // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N)
-
-    auto tiled_r2s = conditional_return<ReferenceSrc>(
-      make_tiled_copy_S(Copy_Atom<CopyOpR2S,Element>{}, args.tiled_copy),
-      make_tiled_copy_D(Copy_Atom<CopyOpR2S,Element>{}, args.tiled_copy)
-    );
-    auto tRS_sAux = tiled_r2s.get_slice(args.thread_idx).partition_D(sAux_epi);               // (R2S,R2S_M,R2S_N,PIPE)
-
-    ThrCopy thrblk_s2g = params_ptr->tma_store_aux.get_slice(_0{});
-    Tensor bSG_sAux = thrblk_s2g.partition_S(sAux_epi);                                // (TMA,TMA_M,TMA_N,PIPE)
-    Tensor bSG_gAux = thrblk_s2g.partition_D(gAux_epi);                                // (TMA,TMA_M,TMA_N,EPI_M,EPI_N)
-
-    return ConsumerStoreCallbacks<decltype(tC_rAux), decltype(tiled_r2s), decltype(tRS_sAux), decltype(bSG_sAux), decltype(bSG_gAux)>(
-            cute::move(tC_rAux),
-            tiled_r2s,
-            cute::move(tRS_sAux),
-            cute::move(bSG_sAux),
-            cute::move(bSG_gAux),
-            params_ptr);
-  }
-};
-
-template <
-  class Element,
-  class EpilogueTile,   // Unused
-  FloatRoundStyle RoundStyle,
-  class LayoutOrStrideMNL,
-  class SmemLayoutAtom, // Unused
-  class CopyOpR2S,      // Unused
-  int Alignment, 
-  bool EnableNullptr
->
-struct Sm90AuxStore<
-  0, EpilogueTile, Element, RoundStyle, LayoutOrStrideMNL, 
-  SmemLayoutAtom, CopyOpR2S, Alignment, EnableNullptr
-> {
-  using ElementAux = Element;
-  using StrideMNL = cutlass::gemm::TagToStrideC_t<LayoutOrStrideMNL>;
-
-  struct SharedStorage { };
-
-  struct Arguments {
-    Element* ptr_aux = nullptr;
-    StrideMNL dAux = {};
-  };
-
-  using Params = Arguments;
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    return args;
-  }
-
-  template <class ProblemShape>
-  static bool
-  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
-    return true;
-  }
-
-  template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
-    return 0;
-  }
-
-  template <class ProblemShape>
-  static cutlass::Status
-  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-    return cutlass::Status::kSuccess;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Sm90AuxStore() { }
-
-  CUTLASS_HOST_DEVICE
-  Sm90AuxStore(Params const& params, SharedStorage const& shared_storage)
-    : params_ptr(&params) { }
-  
-  Params const* params_ptr;
-
-  CUTLASS_DEVICE bool
-  is_producer_load_needed() const {
-    return false;
-  }
-
-  CUTLASS_DEVICE bool
-  is_C_load_needed() const {
-    return false;
-  }
-
-  template <class... Args>
-  CUTLASS_DEVICE auto
-  get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
-    return EmptyProducerLoadCallbacks{};
-  }
-
-  template<
-    class GTensorR2G,
-    class RTensor,
-    class CTensorR2G,
-    class ProblemShapeMNL
-  >
-  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
-    CUTLASS_DEVICE
-    ConsumerStoreCallbacks(
-        GTensorR2G&& tC_gAux,
-        RTensor&& tC_rAux,
-        CTensorR2G&& tC_cAux,
-        ProblemShapeMNL problem_shape_mnl,
-        Params const* params_ptr)
-      : tC_gAux(cute::forward<GTensorR2G>(tC_gAux)),
-        tC_rAux(cute::forward<RTensor>(tC_rAux)),
-        tC_cAux(cute::forward<CTensorR2G>(tC_cAux)),
-        problem_shape_mnl(problem_shape_mnl),
-        params_ptr(params_ptr) {}
-    
-    GTensorR2G tC_gAux;
-    RTensor tC_rAux;
-    CTensorR2G tC_cAux;
-    ProblemShapeMNL problem_shape_mnl;
-    Params const* params_ptr;
-
-    template <typename ElementAccumulator, typename ElementInput, int FragmentSize>
-    CUTLASS_DEVICE auto
-    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n,
-          Array<ElementInput, FragmentSize> const& frg_input) {
-      using ConvertInput = NumericArrayConverter<Element, ElementInput, FragmentSize, RoundStyle>;
-      ConvertInput convert_input{};
-
-      Tensor tC_rAux_frg = recast<Array<Element, FragmentSize>>(coalesce(tC_rAux));
-      tC_rAux_frg(epi_v) = convert_input(frg_input);
-
-      return frg_input;
-    }
-
-    CUTLASS_DEVICE void
-    end_loop(int epi_m, int epi_n) {
-      if constexpr (EnableNullptr) {
-        if (params_ptr->ptr_aux == nullptr) {
-          return;
-        }
-      }
-
-      constexpr auto MCL = decltype(max_common_layout(tC_gAux(_,_,_,_0{},_0{}), tC_rAux)){};
-      constexpr int V = cute::min(Alignment, size(MCL));
-
-      Tensor tC_cAux_mn = tC_cAux(_,_,_,epi_m,epi_n);
-      Tensor tC_cAux_vec = tensor<1>(zipped_divide(coalesce(tC_cAux_mn), MCL.compose(Int<V>{})));
-      
-      Tensor tC_gAux_vec = recast<Array<Element, V>>(coalesce(tC_gAux(_,_,_,epi_m,epi_n)));
-      Tensor tC_rAux_vec = recast<Array<Element, V>>(coalesce(tC_rAux));
-
-      auto pred_fn = [&] (auto const&... coords) {
-        return elem_less(tC_cAux_vec(coords...), problem_shape_mnl);
-      };
-
-      copy_if(pred_fn, tC_rAux_vec, tC_gAux_vec);
-    }
-  };
-
-  template <
-    bool ReferenceSrc,
-    class... Args
-  >
-  CUTLASS_DEVICE auto
-  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
-
-    auto [M, N, K, L] = args.problem_shape_mnkl;
-    auto [m, n, k, l] = args.tile_coord_mnkl;
-
-    auto problem_shape_mnl = make_shape(M,N,L);
-
-    // Gmem Tensor
-    Tensor mAux = make_tensor(
-      make_gmem_ptr(params_ptr->ptr_aux), make_shape(M,N,L), params_ptr->dAux
-    );
-    Tensor tC_gAux = sm90_partition_for_epilogue<ReferenceSrc>(
-                      mAux, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx);
-
-    // Register Tensor
-    Tensor tC_rAux = make_tensor<Element>(take<0,3>(shape(tC_gAux)));
-
-    // Predication support
-    Tensor coordAux = make_identity_tensor(shape(mAux));
-    Tensor tC_cAux = sm90_partition_for_epilogue<ReferenceSrc>(
-                      coordAux, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx);   
-
-    return ConsumerStoreCallbacks<decltype(tC_gAux), decltype(tC_rAux), decltype(tC_cAux), decltype(problem_shape_mnl)>(
-      cute::move(tC_gAux),
-      cute::move(tC_rAux),
-      cute::move(tC_cAux),
-      problem_shape_mnl,
-      params_ptr
-    );
-
-  }
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Reduction Store Operations
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Scalar reduction
-template <
-  template <class> class RegReduceFn,
-  template <class> class GmemReduceFn,
-  class ElementOutput,
-  class ElementCompute,
-  FloatRoundStyle RoundStyle,
-  class StrideMNL = Stride<_0,_0,_0>,
-  bool EnableNullptr = true // Noop on nullptr params
->
-struct Sm90ScalarReduction {
-private:
-  static_assert(is_static_v<decltype(take<0,2>(StrideMNL{}))>); // batch stride can be dynamic or static
-  static_assert(take<0,2>(StrideMNL{}) == Stride<_0,_0>{});
-  static constexpr bool IsAtomic = is_atomic<GmemReduceFn<ElementCompute>>::value;
-  static_assert(IsAtomic, "non-atomic scalar reduction not supported yet");
-
-public:
-  struct SharedStorage { };
-
-  struct Arguments {
-    ElementOutput* ptr_scalar = nullptr;
-    ElementCompute reduction_identity = ElementCompute(0);
-    StrideMNL dScalar = {};
-  };
-
-  using Params = Arguments;
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    return args;
-  }
-
-  template <class ProblemShape>
-  static bool
-  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
-    return true;
-  }
-
-  template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
-    return 0;
-  }
-
-  template <class ProblemShape>
-  static cutlass::Status
-  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-  #if !defined(CUTLASS_SKIP_REDUCTION_INIT)
-    if constexpr (IsAtomic) {
-      auto problem_shape_mnkl = append<4>(problem_shape, 1);
-      auto [M, N, K, L] = problem_shape_mnkl;
-      Layout mScalar_layout = make_layout(make_shape(M,N,L), args.dScalar);
-      if (args.ptr_scalar != nullptr) {
-        return fill_workspace(args.ptr_scalar, ElementOutput(args.reduction_identity), cosize(mScalar_layout), stream, cuda_adapter);
-      }
-    }
-  #endif
-
-    return cutlass::Status::kSuccess;
-  }
-
-  CUTLASS_DEVICE bool
-  is_producer_load_needed() const {
-    return false;
-  }
-
-  CUTLASS_DEVICE bool
-  is_C_load_needed() const {
-    return false;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Sm90ScalarReduction() { }
-
-  CUTLASS_HOST_DEVICE
-  Sm90ScalarReduction(Params const& params, SharedStorage const& shared_storage)
-      : params(params) { }
-
-  Params const params;
-
-  template <class... Args>
-  CUTLASS_DEVICE auto
-  get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
-    return EmptyProducerLoadCallbacks{};
-  }
-
-  template<class CTensor, class ThrResidue>
-  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
-    CUTLASS_DEVICE
-    ConsumerStoreCallbacks(
-        int l_coord,
-        CTensor tCcScalar,
-        ThrResidue residue_tCcScalar,
-        Params const& params)
-      : scalar(params.reduction_identity),
-        l_coord(l_coord),
-        tCcScalar(tCcScalar),
-        residue_tCcScalar(residue_tCcScalar),
-        params(params) {}
-
-    ElementCompute scalar;
-    int l_coord;
-    CTensor tCcScalar;                                                                 // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
-    ThrResidue residue_tCcScalar;
-    Params params;
-
-    template <typename ElementAccumulator, typename ElementInput, int FragmentSize>
-    CUTLASS_DEVICE auto
-    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n,
-          Array<ElementInput, FragmentSize> const& frg_input) {
-      if constexpr (EnableNullptr) {
-        if (params.ptr_scalar == nullptr) {
-          return frg_input;
-        }
-      }
-
-      using ConvertInput = NumericArrayConverter<ElementCompute, ElementInput, FragmentSize, RoundStyle>;
-      using ReduceInput = RegReduceFn<ElementCompute>;
-      ConvertInput convert_input{};
-      ReduceInput reduce_input{};
-
-      Array frg_I = convert_input(frg_input);
-      Tensor tCcScalar_mn = tCcScalar(_,_,_,epi_m,epi_n);
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < FragmentSize; ++i) {
-        if (elem_less(tCcScalar_mn(epi_v * FragmentSize + i), residue_tCcScalar)) {
-          scalar = reduce_input(scalar, frg_I[i]);
-        }
-      }
-
-      return frg_input;
-    }
-
-    CUTLASS_DEVICE void
-    end() {
-      if constexpr (EnableNullptr) {
-        if (params.ptr_scalar == nullptr) {
-          return;
-        }
-      }
-
-      using ConvertI = NumericConverter<ElementOutput, ElementCompute, RoundStyle>;
-      using ReduceInput = GmemReduceFn<ElementOutput>;
-
-      ConvertI convert_I{};
-      ReduceInput reduce_input{};
-
-      ElementOutput* ptr_scalar = params.ptr_scalar + l_coord * get<2>(params.dScalar);
-      reduce_input(ptr_scalar, convert_I(scalar));
-    }
-
-  };
-
-  template <
-    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
-    class... Args
-  >
-  CUTLASS_DEVICE auto
-  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
-    return ConsumerStoreCallbacks<decltype(args.tCcD), decltype(args.residue_tCcD)>(
-      get<3>(args.tile_coord_mnkl), args.tCcD, args.residue_tCcD, params);
-  }
-
-};
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Row vector reduction
-template <
-  template <class> class RegReduceFn,
-  template <class> class ShuffleReduceFn,
-  template <class> class GmemReduceFn,
-  int Stages,
-  class CtaTileShapeMNK,
-  class ElementOutput,
-  class ElementCompute,
-  FloatRoundStyle RoundStyle,
-  class StrideMNL = Stride<_0,_1,_0>,
-  int Alignment = 128 / sizeof_bits_v<ElementOutput>,
-  bool EnableNullptr = true, // Noop on nullptr params
-  // If this is false, ptr_row is assumed to point to a compact n-major (ceil_div(M,CTA_M), round_nearest(N,CTA_N), L)
-  // tensor of ElementCompute. It is the user's responsibility to reduce this to a (N, L) tensor of ElementOutput
-  bool FinalReduction = true,
-  // False means skip OOB predication if OOB inputs are known to be the reduction identity
-  bool VisitCheckOOB = true,
-  // Indicate the parameter order when calling RegReduceFn
-  // Seq length equals the number of RegReduceFn parameters
-  // No.0 represents tCrRow; No.1 and subsequent numbers sequentially represent frg_inputs in `visit`
-  class RegReduceSeq = cute::seq<0, 1>
->
-struct Sm90RowReduction {
-private:
-  static_assert(Stages == 0, "Smem usage not supported yet");
-  static_assert(Alignment * sizeof_bits_v<ElementOutput> % 128 == 0, "sub-16B alignment not supported yet");
-  static_assert(is_static_v<decltype(take<0,2>(StrideMNL{}))>); // batch stride can be dynamic or static
-  static_assert(take<0,2>(StrideMNL{}) == Stride<_0,_1>{});
-  static constexpr bool IsAtomic = is_atomic<GmemReduceFn<ElementCompute>>::value;
-  static_assert(not (IsAtomic && not FinalReduction), "atomic reduction must be final");
-
-public:
-  struct SharedStorage { };
-
-  struct Arguments {
-    void* ptr_row = nullptr; // ElementOutput* if FinalReduction, else ElementCompute*
-    ElementCompute reduction_identity = 0;
-    StrideMNL dRow = {};
-  };
-
-  struct Params {
-    void* ptr_row = nullptr;
-    ElementCompute reduction_identity = 0;
-    StrideMNL dRow = {};
-    ElementCompute* reduction_buffer = nullptr;
-    int* tile_counters = nullptr;
-  };
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    ElementCompute* reduction_buffer;
-    int* tile_counters = nullptr;
-    if constexpr (IsAtomic) {
-      reduction_buffer = nullptr;
-    }
-    else if constexpr (FinalReduction) {
-      auto problem_shape_mnkl = append<4>(problem_shape, 1);
-      auto [M, N, K, L] = problem_shape_mnkl;
-
-      auto [tile_M, tile_N, tile_K] = CtaTileShapeMNK{};
-      size_t tile_counters_offset = product(ceil_div(make_shape(size<>(M), size<>(N), L), make_shape(tile_M, tile_N))) * tile_N * sizeof(ElementCompute);
-      tile_counters_offset = round_nearest(tile_counters_offset, MinWorkspaceAlignment);
-
-      reduction_buffer = reinterpret_cast<ElementCompute*>(workspace);
-      tile_counters = reinterpret_cast<int*>(reinterpret_cast<uint8_t*>(workspace) + tile_counters_offset);
-    }
-    else {
-      reduction_buffer = reinterpret_cast<ElementCompute*>(args.ptr_row);
-    }
-
-    return {
-      args.ptr_row,
-      args.reduction_identity,
-      args.dRow,
-      reduction_buffer,
-      tile_counters
-    };
-  }
-
-  template <class ProblemShape>
-  static bool
-  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
-    return true;
-  }
-
-  template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
-    if constexpr (IsAtomic || not FinalReduction) {
-      return 0;
-    }
-
-    size_t workspace_size = 0;
-    auto problem_shape_mnkl = append<4>(problem_shape, 1);
-    auto [M, N, K, L] = problem_shape_mnkl;
-    auto [tile_M, tile_N, tile_K] = CtaTileShapeMNK{};
-    // Increment by size of reduction buffer
-    workspace_size += product(ceil_div(make_shape(size<>(M),size<>(N),L), make_shape(tile_M, tile_N))) * tile_N * sizeof(ElementCompute);
-    // Align and increment by size of tile counters
-    workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment);
-    workspace_size += cute::ceil_div(size<>(N), tile_N) * sizeof(int);
-    return workspace_size;
-  }
-
-  template <class ProblemShape>
-  static cutlass::Status
-  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-#if !defined(CUTLASS_SKIP_REDUCTION_INIT)
-    auto problem_shape_mnkl = append<4>(problem_shape, 1);
-    auto [M, N, K, L] = problem_shape_mnkl;
-    if constexpr (IsAtomic) {
-      Layout mRow_layout = make_layout(make_shape(size<>(M),size<>(N),size<>(L)), args.dRow);
-      if (args.ptr_row != nullptr) {
-        return fill_workspace(args.ptr_row, ElementOutput(args.reduction_identity), cosize(mRow_layout), stream, cuda_adapter);
-      }
-      return Status::kSuccess;
-    }
-    else
-#endif 
-    if constexpr (FinalReduction) {
-      auto [tile_M, tile_N, tile_K] = CtaTileShapeMNK{};
-      size_t tile_counters_offset = product(ceil_div(make_shape(size<>(M),size<>(N),L), make_shape(tile_M, tile_N))) * tile_N * sizeof(ElementCompute);
-      tile_counters_offset = round_nearest(tile_counters_offset, MinWorkspaceAlignment);
-
-      int* tile_counters = reinterpret_cast<int*>(reinterpret_cast<uint8_t*>(workspace) + tile_counters_offset);
-      size_t tile_counters_size = cute::ceil_div(size<>(N), tile_N) * sizeof(int);
-      return zero_workspace(tile_counters, tile_counters_size, stream, cuda_adapter);
-    }
-    else {
-      return Status::kSuccess;
-    }
-  }
-
-  CUTLASS_DEVICE bool
-  is_producer_load_needed() const {
-    return false;
-  }
-
-  CUTLASS_DEVICE bool
-  is_C_load_needed() const {
-    return false;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Sm90RowReduction() { }
-
-  CUTLASS_HOST_DEVICE
-  Sm90RowReduction(Params const& params, SharedStorage const& shared_storage)
-      : params(params) { }
-
-  Params params;
-
-  template <class... Args>
-  CUTLASS_DEVICE auto
-  get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
-    return EmptyProducerLoadCallbacks{};
-  }
-
-  template<class ArgsTuple>
-  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
-    CUTLASS_DEVICE
-    ConsumerStoreCallbacks(ArgsTuple&& args_tuple, Params const& params)
-      : args_tuple(cute::forward<ArgsTuple>(args_tuple)),
-        params(params) {}
-
-    ArgsTuple args_tuple;
-    Params const& params;
-    bool do_final_reduction = false;
-
-    template <typename ElementAccumulator, typename... ElementInputs, int FragmentSize>
-    CUTLASS_DEVICE auto
-    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n,
-          Array<ElementInputs, FragmentSize> const&... frg_inputs) {
-      if constexpr (EnableNullptr) {
-        if (params.ptr_row == nullptr) {
-          return cute::get<0>(cute::make_tuple(frg_inputs...));
-        }
-      }
-
-      auto& [ref_src, tCrRow, tCcRow, gRow_l, cRow, gBuf_ml, sBuf_layout,
-        lane_layout_MN, lane_mn, warp_layout_MN, warp_mn,
-        tile_coord_mnkl, residue_cRow, residue_tCcRow, epi_tile, tiled_copy, thread_idx] = args_tuple;
-      Tensor tCrRow_mn = tCrRow(_,_,_,epi_m,epi_n);
-      Tensor tCcRow_mn = tCcRow(_,_,_,epi_m,epi_n);
-
-      if constexpr (VisitCheckOOB) {
-        using ReduceInput = RegReduceFn<ElementCompute>;
-        ReduceInput reduce_input{};
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int i = 0; i < FragmentSize; ++i) {
-          if (elem_less(tCcRow_mn(epi_v * FragmentSize + i), residue_tCcRow)) {
-            ElementCompute& tCrRow_vmn = tCrRow_mn(epi_v * FragmentSize + i);
-            tCrRow_vmn = transform_apply(cute::make_tuple(frg_inputs...),
-                [&] (auto&& frg_input) {
-                  return ElementCompute(frg_input[i]);
-                },
-                [&] (auto&&... cvt_frg_inputs) {
-                  auto frg_compute_tuple = cute::make_tuple(tCrRow_vmn, cvt_frg_inputs...);
-                  return cute::detail::apply(frg_compute_tuple, reduce_input, RegReduceSeq{});
-                });
-          }
-        }
-      }
-      else {
-        constexpr int RegFragSize = cute::max(1, static_cast<int>(sizeof(uint32_t) / sizeof(ElementCompute)));
-        using ReduceInput = RegReduceFn<Array<ElementCompute, RegFragSize>>;
-        ReduceInput reduce_input{};
-        Tensor tCrRow_mn_frg = recast<Array<ElementCompute, RegFragSize>>(tCrRow_mn);
-
-        constexpr int RegFragArraySize = FragmentSize / RegFragSize;
-        CUTLASS_PRAGMA_UNROLL
-        for (int i = 0; i < RegFragArraySize; ++i) {
-          Array<ElementCompute, RegFragSize>& tCrRow_vmn_frg = tCrRow_mn_frg(epi_v * RegFragArraySize + i);
-          tCrRow_vmn_frg = transform_apply(cute::make_tuple(frg_inputs...),
-              [&] (auto&& frg_input) {
-                using ElementInput = typename cute::remove_cvref_t<decltype(frg_input)>::Element;
-                using ConvertInput = NumericArrayConverter<ElementCompute, ElementInput, RegFragSize, RoundStyle>;
-                using RegFragArr = Array<Array<ElementCompute, RegFragSize>, RegFragArraySize>;
-                ConvertInput convert_input{};
-                return convert_input(reinterpret_cast<RegFragArr&>(frg_input)[i]);
-              },
-              [&] (auto&&... cvt_frg_inputs) {
-                auto frg_compute_tuple = cute::make_tuple(tCrRow_vmn_frg, cvt_frg_inputs...);
-                return cute::detail::apply(frg_compute_tuple, reduce_input, RegReduceSeq{});
-              });
-        }
-      }
-      return cute::get<0>(cute::make_tuple(frg_inputs...));
-    }
-
-    template <class STensor, class SyncFn, class VTensor>
-    CUTLASS_DEVICE void
-    reduce(STensor&& smem_buffer, SyncFn const& sync_fn, int epi_m, int epi_n, bool is_last_iteration, VTensor visit_results) {
-      if (not is_last_iteration) {
-        return;
-      }
-
-      auto& [ref_src, tCrRow, tCcRow, gRow_l, cRow, gBuf_ml, sBuf_layout,
-        lane_layout_MN, lane_mn, warp_layout_MN, warp_mn,
-        tile_coord_mnkl, residue_cRow, residue_tCcRow, epi_tile, tiled_copy, thread_idx] = args_tuple;
-      auto [m, n, k, l] = tile_coord_mnkl;
-      constexpr bool ReferenceSrc = decltype(ref_src)::value;
-      if constexpr (EnableNullptr) {
-        if (params.ptr_row == nullptr) {
-          return;
-        }
-      }
-
-      // fully OOB CTA in partially OOB cluster
-      if (not elem_less(cRow(_0{},_0{}), residue_cRow)) {
-        return;
-      }
-
-      int lane_m = get<0>(lane_mn);
-      [[maybe_unused]] bool is_reduced_lane = lane_m == 0;
-
-      //
-      // 1. Warp shuffle reduction
-      //
-      using FragmentShuffle = Array<ElementCompute, sizeof(uint64_t) / sizeof(ElementCompute)>;
-      Tensor tCrRow_frg = recast<FragmentShuffle>(filter(tCrRow));
-      using ReduceShuffle = ShuffleReduceFn<FragmentShuffle>;
-      ReduceShuffle reduce_shuffle{};
-
-      auto FrgSizePerLaneM = size(tCrRow_frg) / size<0>(lane_layout_MN);
-      constexpr bool SwapShuffle = FrgSizePerLaneM > 0;
-
-      //
-      // Swap Shuffle
-      //
-      // The normal way to reduction among threads:
-      // use shuffle to let *** the first half of threads *** have *** whole data *** from the second half of threads.
-      // After each step of reduction, a half of threads won't work in the following steps.
-      // That is, as the reduction progresses, the efficiency of shuffle & reduction instructions gradually change from 1/2, 1/4 to 1/32 (the worst case).
-      //
-      // To overcome this shortcoming, for a NxN matrix to be reduced among N threads as a 1XN vectors,
-      // we use swap & shuffle aiming to let *** each half of threads *** have *** a half of data *** from the other half of threads.
-      // After reduction, each half of threads should deal with a (N/2)x(N/2) sub-matrix independently in the following step.
-      // We can recursively do this until the problem size is 1.
-      //
-      if constexpr (SwapShuffle) { // for a NxN matrix to be reduced among N threads as a 1XN vectors
-        Tensor tCrRow_frg_ = logical_divide(tCrRow_frg, FrgSizePerLaneM);                       // (FrgSizePerLaneM, M)
-        CUTLASS_PRAGMA_UNROLL
-        for (int m = size<1>(tCrRow_frg_) / 2; m > 0; m /= 2) {
-          CUTLASS_PRAGMA_UNROLL
-          for (int r = 0; r < m; ++r) {
-            auto frg_A = tCrRow_frg_(_,r);
-            auto frg_B = tCrRow_frg_(_,r + m);
-            CUTLASS_PRAGMA_UNROLL
-            for (int v = 0; v < size(frg_A); ++v) {
-              // Step1: swap
-              if (not (lane_m & m)) { // the first half of threads swap fragments from the first half of data to the second
-                swap(frg_A(v), frg_B(v));
-              }
-
-              // Step2: shuffle
-              uint64_t frg_shfl = reinterpret_cast<uint64_t&>(frg_A(v));
-              // each half of threads get a half of data from the other half of threads
-              frg_shfl = __shfl_xor_sync(0xFFFFFFFF, frg_shfl, lane_layout_MN(m, _0{}));
-
-              // Step3: reduction
-              frg_A(v) = reduce_shuffle(frg_B(v), reinterpret_cast<FragmentShuffle&>(frg_shfl));
-            }
-          }
-        }
-      }
-      else {
-        CUTLASS_PRAGMA_UNROLL
-        for (int reduction_rows = size<0>(lane_layout_MN) / 2; reduction_rows > 0; reduction_rows /= 2) {
-          CUTLASS_PRAGMA_UNROLL
-          for (int frg_idx = 0; frg_idx < size(tCrRow_frg); ++frg_idx) {
-            uint64_t frg_shfl = reinterpret_cast<uint64_t&>(tCrRow_frg(frg_idx));
-            frg_shfl = __shfl_down_sync(0xFFFFFFFF, frg_shfl, lane_layout_MN(reduction_rows, _0{}));
-            tCrRow_frg(frg_idx) = reduce_shuffle(tCrRow_frg(frg_idx), reinterpret_cast<FragmentShuffle&>(frg_shfl));
-          }
-        }
-      }
-
-      //
-      // 2. Atomic reduction
-      //
-      if constexpr (IsAtomic) {
-        // Filter so we don't issue redunant copies over stride-0 modes
-        Tensor tCrRow_flt = filter_zeros(tCrRow);
-        Tensor tCcRow_flt = make_tensor(tCcRow.data(), make_layout(tCrRow_flt.shape(), tCcRow.stride()));
-        auto FltFrgSizePerLaneM = size(tCrRow_flt) / size<0>(lane_layout_MN);
-
-        Tensor tCgRow = sm90_partition_for_epilogue<ReferenceSrc>(gRow_l(_,_,l), epi_tile, tiled_copy, thread_idx);
-        Tensor tCgRow_flt = filter_zeros(tCgRow);
-        // NOTE: atomic reduction is performed in the output type
-        using ConvertOutput = NumericConverter<ElementOutput, ElementCompute, RoundStyle>;
-        using ReduceOutput = GmemReduceFn<ElementOutput>;
-        ConvertOutput convert_output{};
-        ReduceOutput reduce_output{};
-
-        if constexpr (SwapShuffle) {
-          CUTLASS_PRAGMA_UNROLL
-          for (int i = 0; i < FltFrgSizePerLaneM; ++i) {
-            int idx = lane_m * FltFrgSizePerLaneM + i;
-            // Only care about OOB for N mode
-            if (get<1>(tCcRow_flt(idx)) < get<1>(residue_tCcRow)) {
-              reduce_output(&tCgRow_flt(idx), convert_output(tCrRow_flt(i)));
-            }
-          }
-        }
-        else {
-          if (is_reduced_lane) {
-            CUTLASS_PRAGMA_UNROLL
-            for (int i = 0; i < size(tCrRow_flt); ++i) {
-              if (elem_less(tCcRow_flt(i), residue_tCcRow)) {
-                reduce_output(&tCgRow_flt(i), convert_output(tCrRow_flt(i)));
-              }
-            }
-          }
-        }
-        sync_fn();
-      }
-
-      //
-      // 2. One warp in M, skip threadblock smem reduction
-      //
-      else if constexpr (decltype(size<0>(warp_layout_MN))::value <= 1) {
-        // Dump warp reduction to gmem workspace
-        using ElementGmem = cute::conditional_t<FinalReduction, ElementCompute volatile, ElementCompute>;
-        Tensor tCgBuf = sm90_partition_for_epilogue<ReferenceSrc>(gBuf_ml(_,_,m,l), epi_tile, tiled_copy, thread_idx);
-
-        if constexpr (SwapShuffle) {
-          Tensor tCrRow_flt = filter(tCrRow);
-          Tensor tCgBuf_flt = recast<ElementGmem>(filter(tCgBuf));
-          auto FltFrgSizePerLaneM = size(tCrRow_flt) / size<0>(lane_layout_MN);
-          Tensor tCgBuf_flt_ = logical_divide(tCgBuf_flt, FltFrgSizePerLaneM);               // (FltFrgSizePerLaneM, M)
-          Tensor tCrRow_flt_ = logical_divide(tCrRow_flt, FltFrgSizePerLaneM);               // (FltFrgSizePerLaneM, M)
-          copy_aligned(tCrRow_flt_(_,_0{}), tCgBuf_flt_(_,lane_m));
-        }
-        else {
-          if (is_reduced_lane) {
-            // Filter so we don't issue redundant copies over stride-0 modes
-            // (only works if 0-strides are in same location, which is by construction)
-            copy_aligned(filter(tCrRow), recast<ElementGmem>(filter(tCgBuf)));
-          }
-        }
-        sync_fn();
-      }
-
-      //
-      // 2. Multiple warps in M, do threadblock smem reduction
-      //
-      else {
-        Tensor sBuf = make_tensor(make_smem_ptr<ElementCompute>(raw_pointer_cast(smem_buffer.data())), sBuf_layout);
-        static_assert(decltype(cosize(sBuf.layout()))::value * sizeof(ElementCompute) <=
-                      decltype(cosize(smem_buffer.layout()))::value * sizeof(typename remove_cvref_t<STensor>::value_type),
-                      "smem reduction buffer not large enough, use a larger epilogue tile");
-        sync_fn();
-
-        // Dump warp reduction to smem workspace
-        Tensor tCsBuf = sm90_partition_for_epilogue<ReferenceSrc>(sBuf(_,_,get<0>(warp_mn)), epi_tile, tiled_copy, thread_idx);
-
-        if constexpr (SwapShuffle) {
-          Tensor tCrRow_flt = filter(tCrRow);
-          Tensor tCsBuf_flt = filter(tCsBuf);
-          auto FltFrgSizePerLaneM = size(tCrRow_flt) / size<0>(lane_layout_MN);
-          Tensor tCsBuf_flt_ = logical_divide(tCsBuf_flt, FltFrgSizePerLaneM);               // (FltFrgSizePerLaneM, M)
-          Tensor tCrRow_flt_ = logical_divide(tCrRow_flt, FltFrgSizePerLaneM);               // (FltFrgSizePerLaneM, M)
-          copy_aligned(tCrRow_flt_(_,_0{}), tCsBuf_flt_(_,lane_m));
-        }
-        else {
-          if (is_reduced_lane) {
-            // Filter so we don't issue redunant copies over stride-0 modes
-            // (only works if 0-strides are in same location, which is by construction)
-            copy_aligned(filter(tCrRow), filter(tCsBuf));
-          }
-        }
-        sync_fn();
-
-        constexpr int SmemFragSize = cute::max(size_t{1}, sizeof(uint32_t) / sizeof(ElementCompute));
-        using FragmentSmem = Array<ElementCompute, SmemFragSize>;
-        using VectorSmem = uint_bit_t<sizeof_bits_v<FragmentSmem>>;
-        using ReduceSmem = GmemReduceFn<FragmentSmem>;
-        ReduceSmem reduce_smem{};
-
-        Tensor sBuf_frg = recast<FragmentSmem>(filter_zeros(sBuf));
-        Tensor sBuf_vec = recast<VectorSmem>(filter_zeros(sBuf));
-        constexpr int FragsPerRow = decltype(size<1>(sBuf_frg))::value;
-
-        constexpr int RowNum = decltype(size<0>(warp_layout_MN))::value;
-        using FragmentSmemArray = Array<FragmentSmem, RowNum>;
-
-        // Do the threadblock smem reduction
-        using VectorGmem = cute::conditional_t<FinalReduction, VectorSmem volatile, VectorSmem>;
-        Tensor gBuf_vec = recast<VectorGmem>(filter(gBuf_ml(_,_,m,l)));
-        CUTLASS_PRAGMA_UNROLL
-        for (int frg_idx = thread_idx; frg_idx < FragsPerRow; frg_idx += size(tiled_copy)) {
-          FragmentSmemArray frg_smem;
-
-          CUTLASS_PRAGMA_UNROLL
-          for (int reduction_rows = 0; reduction_rows < RowNum; ++reduction_rows) {
-            int FragsCurrRows = reduction_rows * FragsPerRow;
-            frg_smem[reduction_rows] = sBuf_frg(FragsCurrRows + frg_idx);
-          }
-
-          CUTLASS_PRAGMA_UNROLL
-          for (int reduction_rows = RowNum / 2; reduction_rows > 0; reduction_rows /= 2) {
-            CUTLASS_PRAGMA_UNROLL
-            for (int row_idx = 0; row_idx < reduction_rows; ++row_idx) {
-              frg_smem[row_idx] = reduce_smem(frg_smem[row_idx], frg_smem[row_idx + reduction_rows]);
-            }
-          }
-          gBuf_vec(frg_idx) = reinterpret_cast<VectorSmem&>(frg_smem[0]);
-        }
-        sync_fn();
-      }
-
-      //
-      // 3. Increment atomic counters to signal final gmem reduction
-      //
-      if constexpr (not IsAtomic && FinalReduction) {
-        // Ensure gmem writes are visible to other threads before incrementing counter
-        __threadfence();
-        sync_fn();
-        // Collective thread 0 increments atomic tile counter and copies value to smem
-        int* prev_tile_count = reinterpret_cast<int*>(raw_pointer_cast(smem_buffer.data()));
-        if (thread_idx == 0) {
-          *prev_tile_count = atomicAdd(&params.tile_counters[n], 1);
-        }
-        sync_fn();
-        // Broadcast tile count to other threads in CTA and determine final reduction status
-        do_final_reduction = *prev_tile_count == size<2>(gBuf_ml) * size<3>(gBuf_ml) - 1;
-        sync_fn();
-      }
-    }
-
-    CUTLASS_DEVICE void
-    end() {
-      //
-      // 4. Do final gmem reduction if necessary
-      //
-      if constexpr (not IsAtomic && FinalReduction) {
-        if (not do_final_reduction) {
-          return;
-        }
-
-        auto& [ref_src, tCrRow, tCcRow, gRow_l, cRow, gBuf_ml, sBuf_layout,
-          lane_layout_MN, lane_mn, warp_layout_MN, warp_mn,
-          tile_coord_mnkl, residue_cRow, residue_tCcRow, epi_tile, tiled_copy, thread_idx] = args_tuple;
-
-        using ReduceOutput = GmemReduceFn<ElementCompute>;
-        using ConvertOutput = NumericConverter<ElementOutput, ElementCompute, RoundStyle>;
-        ReduceOutput reduce_output{};
-        ConvertOutput convert_output{};
-
-        // Reduction over batches
-        if (size<2>(stride(gRow_l)) == 0) {
-          CUTLASS_PRAGMA_NO_UNROLL
-          for (int n = thread_idx; n < size<1>(gBuf_ml); n += size(tiled_copy)) {
-            Tensor tRgBuf_ml = gBuf_ml(_0{},n,_,_);
-            ElementCompute output = tRgBuf_ml(_0{});
-            CUTLASS_PRAGMA_NO_UNROLL
-            for (int ml = 1; ml < size(tRgBuf_ml); ++ml) {
-              output = reduce_output(output, tRgBuf_ml(ml));
-            }
-            if (elem_less(cRow(_0{},n), residue_cRow)) {
-              gRow_l(_0{},n,_0{}) = convert_output(output);
-            }
-          }
-        }
-        // No reduction over batches
-        else {
-          CUTLASS_PRAGMA_NO_UNROLL
-          for (int n = thread_idx; n < size<1>(gBuf_ml); n += size(tiled_copy)) {
-            bool do_store = elem_less(cRow(_0{},n), residue_cRow);
-            CUTLASS_PRAGMA_NO_UNROLL
-            for (int l = 0; l < size<3>(gBuf_ml); ++l) {
-              Tensor tRgBuf_m = gBuf_ml(_0{},n,_,l);
-              ElementCompute output = tRgBuf_m(_0{});
-              CUTLASS_PRAGMA_NO_UNROLL
-              for (int m = 1; m < size(tRgBuf_m); ++m) {
-                output = reduce_output(output, tRgBuf_m(m));
-              }
-              if (do_store) {
-                gRow_l(_0{},n,l) = convert_output(output);
-              }
-            }
-          }
-        }
-
-      }
-    }
-  };
-
-  template <
-    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
-    class... Args
-  >
-  CUTLASS_DEVICE auto
-  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
-    Layout ref_layout_MN = [&] () {
-      if constexpr (ReferenceSrc) { return get<0>(args.tiled_copy.get_layoutS_MN()); }
-      else                        { return get<0>(args.tiled_copy.get_layoutD_MN()); }
-    }();                                                                                         // tile_mn -> tv_idx
-
-    // Get the MN layout + coord of lanes to determine shuffle reduction iterations
-    using _W = Int<decltype(args.tiled_copy)::TiledNumThr::value / NumThreadsPerWarp>;
-    Layout tv2lane = Layout<Shape<Int<NumThreadsPerWarp>,_W,_1>,Stride<_1,_0,_0>>{};            //   tv_idx -> lane_idx
-    Layout ref2lane = composition(tv2lane, ref_layout_MN);                                      //  tile_mn -> lane_idx
-    Layout lane_layout_MN = make_layout(filter(get<0>(ref2lane)), filter(get<1>(ref2lane)));    //  lane_mn -> lane_idx
-    Layout inv_lane_layout_MN = right_inverse(lane_layout_MN);                                  // lane_idx -> lane_mn
-    int lane_idx = canonical_lane_idx();
-    auto lane_mn = idx2crd(inv_lane_layout_MN(lane_idx), shape(lane_layout_MN));
-
-    // Get the MN layout + coord of warps to determine smem reduction iterations
-    Layout tv2warp = Layout<Shape<Int<NumThreadsPerWarp>,_W,_1>,Stride<_0,_1,_0>>{};            //   tv_idx -> warp_idx
-    Layout ref2warp = composition(tv2warp, ref_layout_MN);                                      //  tile_mn -> warp_idx
-    Layout warp_layout_MN = make_layout(filter(get<0>(ref2warp)), filter(get<1>(ref2warp)));    //  warp_mn -> warp_idx
-    Layout inv_warp_layout_MN = right_inverse(warp_layout_MN);                                  // warp_idx -> warp_mn
-
-    int warp_idx = args.thread_idx / NumThreadsPerWarp;
-    auto warp_mn = idx2crd(inv_warp_layout_MN(warp_idx), shape(warp_layout_MN));
-
-    // Partition output gmem and register tensors
-    auto [tile_M, tile_N, tile_K] = args.tile_shape_mnk;
-    auto [M, N, K, L] = args.problem_shape_mnkl;
-    auto [m, n, k, l] = args.tile_coord_mnkl;
-
-    Tensor mRow = make_tensor(make_gmem_ptr<ElementOutput>(params.ptr_row), make_shape(M,N,L), params.dRow); // (M,N,L)
-    Tensor gRow_l = local_tile(mRow, take<0,2>(args.tile_shape_mnk), make_coord(m,n,_));             // (CTA_M,CTA_N,L)
-    Tensor tCgRow = sm90_partition_for_epilogue<ReferenceSrc>(                         // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
-      gRow_l(_,_,l), args.epi_tile, args.tiled_copy, args.thread_idx);
-    Tensor tCrRow = make_tensor_like<ElementCompute>(tCgRow);                          // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
-
-    fill(tCrRow, params.reduction_identity);
-
-    // Partition gmem+smem reduction buffer tensors
-    Layout gBuf_layout = make_layout(take<0,2>(args.tile_shape_mnk), make_stride(_0{}, _1{}));
-    auto block_shape = ceil_div(make_shape(M,N,L), shape(gBuf_layout)); // (M_CNT, N_CNT, L_CNT)
-
-    // Let the M_CNT (the num of partial reduction results) become the outer mode
-    Layout block_layout = make_layout(block_shape, make_stride(get<1>(block_shape), _1{}, get<0>(block_shape) * get<1>(block_shape)));
-    Layout mBuf_layout = blocked_product(gBuf_layout, block_layout);
-    Tensor mBuf = make_tensor(make_gmem_ptr(params.reduction_buffer), mBuf_layout);                // (ceil_M,ceil_N,L)
-    Tensor gBuf_ml = local_tile(mBuf, take<0,2>(args.tile_shape_mnk), make_coord(_,n,_));     // (CTA_M,CTA_N,REST_M,L)
-    Layout sBuf_layout = blocked_product(gBuf_layout,                                          // (CTA_M,CTA_N,WARPS_M)
-      make_layout(make_shape(_1{},_1{},size<0>(warp_layout_MN))));
-
-    auto args_tuple = make_tuple(
-        bool_constant<ReferenceSrc>{}, cute::move(tCrRow), args.tCcD, gRow_l, args.cD, gBuf_ml, sBuf_layout,
-        lane_layout_MN, lane_mn, warp_layout_MN, warp_mn,
-        args.tile_coord_mnkl, args.residue_cD, args.residue_tCcD, args.epi_tile, args.tiled_copy, args.thread_idx);
-    return ConsumerStoreCallbacks<decltype(args_tuple)>(cute::move(args_tuple), params);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Col vector reduction
-template <
-  template <class> class RegReduceFn,
-  template <class> class ShuffleReduceFn,
-  template <class> class GmemReduceFn,
-  int Stages,
-  class CtaTileShapeMNK,
-  class ElementOutput,
-  class ElementCompute,
-  FloatRoundStyle RoundStyle,
-  class StrideMNL = Stride<_1,_0,_0>,
-  int Alignment = 128 / sizeof_bits_v<ElementOutput>,
-  bool EnableNullptr = true, // Noop on nullptr params
-  // If this is false, ptr_col is assumed to point to a compact m-major (round_nearest(M,CTA_M), ceil_div(N,CTA_N), L)
-  // tensor of ElementCompute. It is the user's responsibility to reduce this to a (M, L) tensor of ElementOutput
-  bool FinalReduction = true,
-  // False means skip OOB predication if OOB inputs are known to be the reduction identity
-  bool VisitCheckOOB = true
->
-struct Sm90ColReduction {
-private:
-  static_assert(Stages == 0, "Smem usage not supported yet");
-  static_assert(Alignment * sizeof_bits_v<ElementOutput> % 128 == 0, "sub-16B alignment not supported yet");
-  static_assert(is_static_v<decltype(take<0,2>(StrideMNL{}))>); // batch stride can be dynamic or static
-  static_assert(take<0,2>(StrideMNL{}) == Stride<_1,_0>{});
-  static constexpr bool IsAtomic = is_atomic<GmemReduceFn<ElementCompute>>::value;
-  static_assert(not (IsAtomic && not FinalReduction), "atomic reduction must be final");
-
-public:
-  struct SharedStorage { };
-
-  struct Arguments {
-    void* ptr_col = nullptr; // ElementOutput* if FinalReduction, else ElementCompute*
-    ElementCompute reduction_identity = 0;
-    StrideMNL dCol = {};
-  };
-
-  struct Params {
-    void* ptr_col = nullptr;
-    ElementCompute reduction_identity = 0;
-    StrideMNL dCol = {};
-    ElementCompute* reduction_buffer = nullptr;
-    int* tile_counters = nullptr;
-  };
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    ElementCompute* reduction_buffer;
-    int* tile_counters = nullptr;
-    if constexpr (IsAtomic) {
-      reduction_buffer = nullptr;
-    }
-    else if constexpr (FinalReduction) {
-      auto problem_shape_mnkl = append<4>(problem_shape, 1);
-      auto [M, N, K, L] = problem_shape_mnkl;
-
-      auto [tile_M, tile_N, tile_K] = CtaTileShapeMNK{};
-      size_t tile_counters_offset = product(ceil_div(make_shape(M,N,L), make_shape(tile_M, tile_N))) * tile_M * sizeof(ElementCompute);
-      tile_counters_offset = round_nearest(tile_counters_offset, MinWorkspaceAlignment);
-
-      reduction_buffer = reinterpret_cast<ElementCompute*>(workspace);
-      tile_counters = reinterpret_cast<int*>(reinterpret_cast<uint8_t*>(workspace) + tile_counters_offset);
-    }
-    else {
-      reduction_buffer = reinterpret_cast<ElementCompute*>(args.ptr_col);
-    }
-
-    return {
-      args.ptr_col,
-      args.reduction_identity,
-      args.dCol,
-      reduction_buffer,
-      tile_counters
-    };
-  }
-
-  template <class ProblemShape>
-  static bool
-  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
-    return true;
-  }
-
-  template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
-    if constexpr (IsAtomic || not FinalReduction) {
-      return 0;
-    }
-
-    size_t workspace_size = 0;
-    auto problem_shape_mnkl = append<4>(problem_shape, 1);
-    auto [M, N, K, L] = problem_shape_mnkl;
-    auto [tile_M, tile_N, tile_K] = CtaTileShapeMNK{};
-
-    // Increment by size of reduction buffer
-    workspace_size += product(ceil_div(make_shape(M,N,L), make_shape(tile_M, tile_N))) * tile_M * sizeof(ElementCompute);
-    // Align and increment by size of tile counters
-    workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment);
-    workspace_size += cute::ceil_div(M, tile_M) * sizeof(int);
-
-    return workspace_size;
-  }
-
-  template <class ProblemShape>
-  static cutlass::Status
-  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-#if !defined(CUTLASS_SKIP_REDUCTION_INIT)
-    auto problem_shape_mnkl = append<4>(problem_shape, 1);
-    auto [M, N, K, L] = problem_shape_mnkl;
-    if constexpr (IsAtomic) {
-      Layout mCol_layout = make_layout(make_shape(size<>(M),size<>(N),size<>(L)), args.dCol);
-      if (args.ptr_col != nullptr) {
-        return fill_workspace(args.ptr_col, ElementOutput(args.reduction_identity), cosize(mCol_layout), stream, cuda_adapter);
-      }
-      return Status::kSuccess;
-    }
-    else
-#endif 
-    if constexpr (FinalReduction) {
-      auto [tile_M, tile_N, tile_K] = CtaTileShapeMNK{};
-      size_t tile_counters_offset = product(ceil_div(make_shape(M,N,L), make_shape(tile_M, tile_N))) * tile_M * sizeof(ElementCompute);
-      tile_counters_offset = round_nearest(tile_counters_offset, MinWorkspaceAlignment);
-
-      int* tile_counters = reinterpret_cast<int*>(reinterpret_cast<uint8_t*>(workspace) + tile_counters_offset);
-      size_t tile_counters_size = cute::ceil_div(M, tile_M) * sizeof(int);
-      return zero_workspace(tile_counters, tile_counters_size, stream, cuda_adapter);
-    }
-    else {
-      return Status::kSuccess;
-    }
-  }
-
-  CUTLASS_DEVICE bool
-  is_producer_load_needed() const {
-    return false;
-  }
-
-  CUTLASS_DEVICE bool
-  is_C_load_needed() const {
-    return false;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Sm90ColReduction() { }
-
-  CUTLASS_HOST_DEVICE
-  Sm90ColReduction(Params const& params, SharedStorage const& shared_storage)
-      : params(params) { }
-
-  Params params;
-
-  template <class... Args>
-  CUTLASS_DEVICE auto
-  get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
-    return EmptyProducerLoadCallbacks{};
-  }
-
-  template<class ArgsTuple>
-  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
-    CUTLASS_DEVICE
-    ConsumerStoreCallbacks(ArgsTuple&& args_tuple, Params const& params)
-      : args_tuple(cute::forward<ArgsTuple>(args_tuple)),
-        params(params) {}
-
-    ArgsTuple args_tuple;
-    Params const& params;
-    bool do_final_reduction = false;
-
-    template <typename ElementAccumulator, typename ElementInput, int FragmentSize>
-    CUTLASS_DEVICE auto
-    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n,
-          Array<ElementInput, FragmentSize> const& frg_input) {
-      if constexpr (EnableNullptr) {
-        if (params.ptr_col == nullptr) {
-          return frg_input;
-        }
-      }
-
-      auto& [ref_src, tCrCol, tCcCol, gCol_l, cCol, gBuf_nl, sBuf_layout,
-              lane_layout_MN, lane_mn, warp_layout_MN, warp_mn,
-              tile_coord_mnkl, residue_cCol, residue_tCcCol, epi_tile, tiled_copy, thread_idx] = args_tuple;
-      Tensor tCrCol_mn = tCrCol(_,_,_,epi_m,epi_n);
-      Tensor tCcCol_mn = tCcCol(_,_,_,epi_m,epi_n);
-
-      using ConvertInput = NumericArrayConverter<ElementCompute, ElementInput, FragmentSize, RoundStyle>;
-      using ReduceInput = RegReduceFn<ElementCompute>;
-      ConvertInput convert_input{};
-      ReduceInput reduce_input{};
-
-      Array frg_I = convert_input(frg_input);
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < FragmentSize; ++i) {
-        if (!VisitCheckOOB || elem_less(tCcCol_mn(epi_v * FragmentSize + i), residue_tCcCol)) {
-          ElementCompute& tCrCol_vmn = tCrCol_mn(epi_v * FragmentSize + i);
-          tCrCol_vmn = reduce_input(tCrCol_vmn, frg_I[i]);
-        }
-      }
-
-      return frg_input;
-    }
-
-    template <class STensor, class SyncFn, class VTensor>
-    CUTLASS_DEVICE void
-    reduce(STensor&& smem_buffer, SyncFn const& sync_fn, int epi_m, int epi_n, bool is_last_iteration, VTensor visit_results) {
-      if (not is_last_iteration) {
-        return;
-      }
-
-      auto& [ref_src, tCrCol, tCcCol, gCol_l, cCol, gBuf_nl, sBuf_layout,
-              lane_layout_MN, lane_mn, warp_layout_MN, warp_mn,
-              tile_coord_mnkl, residue_cCol, residue_tCcCol, epi_tile, tiled_copy, thread_idx] = args_tuple;
-      auto [m, n, k, l] = tile_coord_mnkl;
-      constexpr bool ReferenceSrc = decltype(ref_src)::value;
-
-      // Runtime nullptr is noop
-      if constexpr (EnableNullptr) {
-        if (params.ptr_col == nullptr) {
-          return;
-        }
-      }
-
-      // fully OOB CTA in partially OOB cluster
-      if (not elem_less(cCol(_0{},_0{}), residue_cCol)) {
-        return;
-      }
-
-      //
-      // 1. Warp shuffle reduction
-      //
-      using FragmentShuffle = Array<ElementCompute, sizeof(uint64_t) / sizeof(ElementCompute)>;
-      using ReduceShuffle = ShuffleReduceFn<FragmentShuffle>;
-      ReduceShuffle reduce_shuffle{};
-      Tensor tCrCol_frg = recast<FragmentShuffle>(filter(tCrCol));
-      CUTLASS_PRAGMA_UNROLL
-      for (int reduction_cols = size<1>(lane_layout_MN) / 2; reduction_cols > 0; reduction_cols /= 2) {
-        CUTLASS_PRAGMA_UNROLL
-        for (int frg_idx = 0; frg_idx < size(tCrCol_frg); ++frg_idx) {
-          uint64_t frg_shfl = reinterpret_cast<uint64_t&>(tCrCol_frg(frg_idx));
-          frg_shfl = __shfl_down_sync(0xFFFFFFFF, frg_shfl, lane_layout_MN(_0{},reduction_cols));
-          tCrCol_frg(frg_idx) = reduce_shuffle(tCrCol_frg(frg_idx), reinterpret_cast<FragmentShuffle&>(frg_shfl));
-        }
-      }
-      bool is_reduced_lane = get<1>(lane_mn) == 0;
-
-      //
-      // 2. Atomic reduction
-      //
-      if constexpr (IsAtomic) {
-        // Filter so we don't issue redunant copies over stride-0 modes
-        Tensor tCrCol_flt = filter_zeros(tCrCol);
-        Tensor tCcCol_flt = make_tensor(tCcCol.data(), make_layout(tCrCol_flt.shape(), tCcCol.stride()));
-
-        Tensor tCgCol = sm90_partition_for_epilogue<ReferenceSrc>(gCol_l(_,_,l), epi_tile, tiled_copy, thread_idx);
-        Tensor tCgCol_flt = filter_zeros(tCgCol);
-
-        // NOTE: atomic reduction is performed in the output type
-        using ConvertOutput = NumericConverter<ElementOutput, ElementCompute, RoundStyle>;
-        using ReduceOutput = GmemReduceFn<ElementOutput>;
-        ConvertOutput convert_output{};
-        ReduceOutput reduce_output{};
-
-        if (is_reduced_lane) {
-          CUTLASS_PRAGMA_UNROLL
-          for (int i = 0; i < size(tCrCol_flt); ++i) {
-            if (elem_less(tCcCol_flt(i), residue_tCcCol)) {
-              reduce_output(&tCgCol_flt(i), convert_output(tCrCol_flt(i)));
-            }
-          }
-        }
-        sync_fn();
-      }
-
-      //
-      // 2. One warp in N, skip threadblock smem reduction
-      //
-      else if constexpr (decltype(size<1>(warp_layout_MN))::value <= 1) {
-        // Dump warp reduction to gmem workspace
-        using ElementGmem = cute::conditional_t<FinalReduction, ElementCompute volatile, ElementCompute>;
-        Tensor tCgBuf = sm90_partition_for_epilogue<ReferenceSrc>(gBuf_nl(_,_,n,l), epi_tile, tiled_copy, thread_idx);
-        if (is_reduced_lane) {
-          // Filter so we don't issue redundant copies over stride-0 modes
-          // (only works if 0-strides are in same location, which is by construction)
-          copy_aligned(filter(tCrCol), recast<ElementGmem>(filter(tCgBuf)));
-        }
-        sync_fn();
-      }
-
-      //
-      // 2. Multiple warps in N, do threadblock smem reduction
-      //
-      else {
-        Tensor sBuf = make_tensor(make_smem_ptr<ElementCompute>(raw_pointer_cast(smem_buffer.data())), sBuf_layout);
-        static_assert(decltype(cosize(sBuf.layout()))::value * sizeof(ElementCompute) <=
-                      decltype(cosize(smem_buffer.layout()))::value * sizeof(typename remove_cvref_t<STensor>::value_type),
-                      "smem reduction buffer not large enough, use a larger epilogue tile");
-        sync_fn();
-
-        // Dump warp reduction to smem workspace
-        Tensor tCsBuf = sm90_partition_for_epilogue<ReferenceSrc>(sBuf(_,_,get<1>(warp_mn)), epi_tile, tiled_copy, thread_idx);
-        if (is_reduced_lane) {
-          // Filter so we don't issue redunant copies over stride-0 modes
-          // (only works if 0-strides are in same location, which is by construction)
-          copy_aligned(filter(tCrCol), filter(tCsBuf));
-        }
-        sync_fn();
-
-        constexpr int SmemFragSize = cute::max(size_t{1}, sizeof(uint32_t) / sizeof(ElementCompute));
-        using FragmentSmem = Array<ElementCompute, SmemFragSize>;
-        using VectorSmem = uint_bit_t<sizeof_bits_v<FragmentSmem>>;
-        using ReduceSmem = GmemReduceFn<FragmentSmem>;
-        ReduceSmem reduce_smem{};
-
-        Tensor sBuf_frg = recast<FragmentSmem>(filter_zeros(sBuf));
-        Tensor sBuf_vec = recast<VectorSmem>(filter_zeros(sBuf));
-        constexpr int FragsPerCol = decltype(size<0>(sBuf_frg))::value;
-
-        // Do the threadblock smem reduction
-        CUTLASS_PRAGMA_UNROLL
-        for (int reduction_cols = size<1>(warp_layout_MN) / 2; reduction_cols > 1; reduction_cols /= 2) {
-          int FragsPerReduction = reduction_cols * FragsPerCol;
-          CUTLASS_PRAGMA_NO_UNROLL
-          for (int frg_idx = thread_idx; frg_idx < FragsPerReduction; frg_idx += size(tiled_copy)) {
-            FragmentSmem frg_smem = reduce_smem(sBuf_frg(frg_idx), sBuf_frg(frg_idx + FragsPerReduction));
-            sBuf_vec(frg_idx) = reinterpret_cast<VectorSmem&>(frg_smem);
-          }
-          sync_fn();
-        }
-
-        // Do final smem reduction and dump to gmem workspace
-        using VectorGmem = cute::conditional_t<FinalReduction, VectorSmem volatile, VectorSmem>;
-        Tensor gBuf_vec = recast<VectorGmem>(filter(gBuf_nl(_,_,n,l)));
-        CUTLASS_PRAGMA_NO_UNROLL
-        for (int frg_idx = thread_idx; frg_idx < FragsPerCol; frg_idx += size(tiled_copy)) {
-          FragmentSmem frg_smem = reduce_smem(sBuf_frg(frg_idx), sBuf_frg(frg_idx + FragsPerCol));
-          gBuf_vec(frg_idx) = reinterpret_cast<VectorSmem&>(frg_smem);
-        }
-        sync_fn();
-      }
-
-      //
-      // 3. Increment atomic counters to signal final gmem reduction
-      //
-      if constexpr (not IsAtomic && FinalReduction) {
-        // Ensure gmem writes are visible to other threads before incrementing counter
-        __threadfence();
-        sync_fn();
-        // Collective thread 0 increments atomic tile counter and copies value to smem
-        int* prev_tile_count = reinterpret_cast<int*>(raw_pointer_cast(smem_buffer.data()));
-        if (thread_idx == 0) {
-          *prev_tile_count = atomicAdd(&params.tile_counters[m], 1);
-        }
-        sync_fn();
-        // Broadcast tile count to other threads in CTA and determine final reduction status
-        do_final_reduction = *prev_tile_count == size<2>(gBuf_nl) * size<3>(gBuf_nl) - 1;
-        sync_fn();
-      }
-    }
-
-    CUTLASS_DEVICE void
-    end() {
-      //
-      // 4. Do final gmem reduction if necessary
-      //
-      if constexpr (not IsAtomic && FinalReduction) {
-        if (not do_final_reduction) {
-          return;
-        }
-
-        auto& [ref_src, tCrCol, tCcCol, gCol_l, cCol, gBuf_nl, sBuf_layout,
-                lane_layout_MN, lane_mn, warp_layout_MN, warp_mn,
-                tile_coord_mnkl, residue_cCol, residue_tCcCol, epi_tile, tiled_copy, thread_idx] = args_tuple;
-
-        using ReduceOutput = GmemReduceFn<ElementCompute>;
-        using ConvertOutput = NumericConverter<ElementOutput, ElementCompute, RoundStyle>;
-        ReduceOutput reduce_output{};
-        ConvertOutput convert_output{};
-
-        // Reduction over batches
-        if (size<2>(stride(gCol_l)) == 0) {
-          CUTLASS_PRAGMA_NO_UNROLL
-          for (int m = thread_idx; m < size<0>(gBuf_nl); m += size(tiled_copy)) {
-            Tensor tRgBuf_nl = gBuf_nl(m,_0{},_,_);
-            ElementCompute output = tRgBuf_nl(_0{});
-            CUTLASS_PRAGMA_NO_UNROLL
-            for (int nl = 1; nl < size(tRgBuf_nl); ++nl) {
-              output = reduce_output(output, tRgBuf_nl(nl));
-            }
-            if (elem_less(cCol(m,_0{}), residue_cCol)) {
-              gCol_l(m,_0{},_0{}) = convert_output(output);
-            }
-          }
-        }
-        // No reduction over batches
-        else {
-          CUTLASS_PRAGMA_NO_UNROLL
-          for (int m = thread_idx; m < size<0>(gBuf_nl); m += size(tiled_copy)) {
-            bool do_store = elem_less(cCol(m,_0{}), residue_cCol);
-            CUTLASS_PRAGMA_NO_UNROLL
-            for (int l = 0; l < size<3>(gBuf_nl); ++l) {
-              Tensor tRgBuf_n = gBuf_nl(m,_0{},_,l);
-              ElementCompute output = tRgBuf_n(_0{});
-              CUTLASS_PRAGMA_NO_UNROLL
-              for (int n = 1; n < size(tRgBuf_n); ++n) {
-                output = reduce_output(output, tRgBuf_n(n));
-              }
-              if (do_store) {
-                gCol_l(m,_0{},l) = convert_output(output);
-              }
-            }
-          }
-        }
-
-      }
-    }
-
-  };
-
-  template <
-    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
-    class... Args
-  >
-  CUTLASS_DEVICE auto
-  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
-    Layout ref_layout_MN = [&] () {
-      if constexpr (ReferenceSrc) { return get<0>(args.tiled_copy.get_layoutS_MN()); }
-      else                        { return get<0>(args.tiled_copy.get_layoutD_MN()); }
-    }();                                                                                         // tile_mn -> tv_idx
-
-    // Get the MN layout + coord of lanes to determine shuffle reduction iterations
-    using _W = Int<decltype(args.tiled_copy)::TiledNumThr::value / NumThreadsPerWarp>;
-    Layout tv2lane = Layout<Shape<Int<NumThreadsPerWarp>,_W,_1>,Stride<_1,_0,_0>>{};            //   tv_idx -> lane_idx
-    Layout ref2lane = composition(tv2lane, ref_layout_MN);                                      //  tile_mn -> lane_idx
-    Layout lane_layout_MN = make_layout(filter(get<0>(ref2lane)), filter(get<1>(ref2lane)));    //  lane_mn -> lane_idx
-    Layout inv_lane_layout_MN = right_inverse(lane_layout_MN);                                  // lane_idx -> lane_mn
-    int lane_idx = canonical_lane_idx();
-    auto lane_mn = idx2crd(inv_lane_layout_MN(lane_idx), shape(lane_layout_MN));
-
-    // Get the MN layout + coord of warps to determine smem reduction iterations
-    Layout tv2warp = Layout<Shape<Int<NumThreadsPerWarp>,_W,_1>,Stride<_0,_1,_0>>{};            //   tv_idx -> warp_idx
-    Layout ref2warp = composition(tv2warp, ref_layout_MN);                                      //  tile_mn -> warp_idx
-    Layout warp_layout_MN = make_layout(filter(get<0>(ref2warp)), filter(get<1>(ref2warp)));    //  warp_mn -> warp_idx
-    Layout inv_warp_layout_MN = right_inverse(warp_layout_MN);                                  // warp_idx -> warp_mn
-    int warp_idx = args.thread_idx / NumThreadsPerWarp;
-    auto warp_mn = idx2crd(inv_warp_layout_MN(warp_idx), shape(warp_layout_MN));
-
-    // Partition output gmem and register tensors
-    auto [tile_M, tile_N, tile_K] = args.tile_shape_mnk;
-    auto [M, N, K, L] = args.problem_shape_mnkl;
-    auto [m, n, k, l] = args.tile_coord_mnkl;
-
-    Tensor mCol = make_tensor(make_gmem_ptr<ElementOutput>(params.ptr_col), make_shape(M,N,L), params.dCol); // (M,N,L)
-    Tensor gCol_l = local_tile(mCol, take<0,2>(args.tile_shape_mnk), make_coord(m,n,_));             // (CTA_M,CTA_N,L)
-    Tensor tCgCol = sm90_partition_for_epilogue<ReferenceSrc>(                         // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
-                      gCol_l(_,_,l), args.epi_tile, args.tiled_copy, args.thread_idx);
-    Tensor tCrCol = make_tensor_like<ElementCompute>(tCgCol);                          // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
-    fill(tCrCol, params.reduction_identity);
-
-    // Partition gmem+smem reduction buffer tensors
-    Layout gBuf_layout = make_layout(take<0,2>(args.tile_shape_mnk), make_stride(_1{}, _0{}));
-    Layout mBuf_layout = blocked_product(gBuf_layout, make_layout(ceil_div(make_shape(M,N,L), shape(gBuf_layout))));
-    Tensor mBuf = make_tensor(make_gmem_ptr(params.reduction_buffer), mBuf_layout);                // (ceil_M,ceil_N,L)
-    Tensor gBuf_nl = local_tile(mBuf, take<0,2>(args.tile_shape_mnk), make_coord(m,_,_));     // (CTA_M,CTA_N,REST_N,L)
-    Layout sBuf_layout = blocked_product(gBuf_layout,make_layout(make_shape(_1{},_1{},size<1>(warp_layout_MN)))); // (CTA_M,CTA_N,WARPS_N)
-
-    auto args_tuple = make_tuple(
-        bool_constant<ReferenceSrc>{}, cute::move(tCrCol), args.tCcD, gCol_l, args.cD, gBuf_nl, sBuf_layout,
-        lane_layout_MN, lane_mn, warp_layout_MN, warp_mn,
-        args.tile_coord_mnkl, args.residue_cD, args.residue_tCcD, args.epi_tile, args.tiled_copy, args.thread_idx);
-    return ConsumerStoreCallbacks<decltype(args_tuple)>(std::move(args_tuple), params);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Batch matrix reduction
-template <
-  int Stages,
-  class EpilogueTile,
-  class Element,
-  class StrideMNL,
-  class CopyOpR2S,
-  class SmemLayoutAtom,
-  int Alignment = 128 / sizeof_bits_v<Element>,
-  bool EnableNullptr = true // Noop on nullptr params
->
-struct Sm90MatrixReduction;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::epilogue::fusion
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp
deleted file mode 100755
index 4f7d99fa3..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp
+++ /dev/null
@@ -1,1139 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-  \brief Visitor tree operation base implementation to enable composable fusions
-         for the sm90 TMA warp-specialized (ws) epilogue
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/workspace.h"
-
-#include "cute/tensor.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::epilogue::fusion {
-
-using namespace cute;
-using cute::tuple;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace detail {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Partitioning Helpers
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
-  class CtaTileMN,
-  class EpilogueTile,
-  class TiledCopy
->
-CUTLASS_HOST_DEVICE
-constexpr auto
-sm90_partition_for_epilogue(
-    CtaTileMN cT,          // (CTA_M,CTA_N,...)
-    EpilogueTile epi_tile, // (EPI_TILE_M,EPI_TILE_N)
-    TiledCopy tiled_copy,
-    int thread_idx) {
-  ThrCopy thread_copy = tiled_copy.get_thread_slice(thread_idx);
-  Tensor cT_epi = flat_divide(cT, epi_tile);                                 // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N,...)
-  if constexpr (ReferenceSrc) {
-    return thread_copy.partition_S(cT_epi);                                        // (CPY,CPY_M,CPY_N,EPI_M,EPI_N,...)
-  }
-  else {
-    return thread_copy.partition_D(cT_epi);                                        // (CPY,CPY_M,CPY_N,EPI_M,EPI_N,...)
-  }
-}
-
-template <
-  bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
-  class Engine, class LayoutMNL,
-  class TileShapeMNK,
-  class TileCoordMNKL,
-  class EpilogueTile,
-  class TiledCopy
->
-CUTLASS_HOST_DEVICE
-constexpr auto
-sm90_partition_for_epilogue(
-    Tensor<Engine, LayoutMNL> mT,  // (M,N,L)
-    TileShapeMNK tile_shape_mnk,   // (CTA_M,CTA_N,CTA_K)
-    TileCoordMNKL tile_coord_mnkl, // (m,n,k,l)
-    EpilogueTile epi_tile,         // (EPI_TILE_M,EPI_TILE_N)
-    TiledCopy tiled_copy,
-    int thread_idx) {
-  auto [m, n, k, l] = tile_coord_mnkl;
-  auto coord_shape =
-      make_coord(m, n, l)
-    ;
-  Tensor cT = local_tile(mT, take<0,2>(tile_shape_mnk), coord_shape);                                  // (CTA_M,CTA_N)
-  Tensor tCcT =
-    sm90_partition_for_epilogue<ReferenceSrc>(cT, epi_tile, tiled_copy, thread_idx);   // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
-
-  return tCcT;
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Visitor Implementation
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template<
-  class ProblemShapeMNKL,
-  class TileShapeMNK,
-  class TileCoordMNKL,
-  class TiledMma,
-  class EpilogueTile
->
-struct ProducerLoadArgs {
-  ProblemShapeMNKL problem_shape_mnkl;
-  TileShapeMNK tile_shape_mnk;
-  TileCoordMNKL tile_coord_mnkl;
-  TiledMma tiled_mma;
-  EpilogueTile epi_tile;
-  int thread_idx;
-
-  CUTLASS_DEVICE
-  ProducerLoadArgs(
-      ProblemShapeMNKL problem_shape_mnkl,
-      TileShapeMNK tile_shape_mnk,
-      TileCoordMNKL tile_coord_mnkl,
-      TiledMma tiled_mma,
-      EpilogueTile epi_tile,
-      int thread_idx)
-  : problem_shape_mnkl(problem_shape_mnkl),
-    tile_shape_mnk(tile_shape_mnk),
-    tile_coord_mnkl(tile_coord_mnkl),
-    tiled_mma(tiled_mma),
-    epi_tile(epi_tile),
-    thread_idx(thread_idx) {}
-};
-
-template<
-  class ProblemShapeMNKL,
-  class TileShapeMNK,
-  class TileCoordMNKL,
-  class TiledMma,
-  class EpilogueTile,
-  class TiledCopy,
-  class CoordTensor,
-  class Residue,
-  class ThrCoordTensor,
-  class ThrResidue,
-  class ThrSrcTensor
->
-struct ConsumerStoreArgs {
-  ProblemShapeMNKL problem_shape_mnkl;
-  TileShapeMNK tile_shape_mnk;
-  TileCoordMNKL tile_coord_mnkl;
-  TiledMma tiled_mma;
-  EpilogueTile epi_tile;
-  TiledCopy tiled_copy;
-  CoordTensor cD;
-  Residue residue_cD;
-  ThrCoordTensor tCcD;
-  ThrResidue residue_tCcD;
-  ThrSrcTensor & tCrC;
-  int thread_idx;
-
-  CUTLASS_DEVICE
-  ConsumerStoreArgs(
-      ProblemShapeMNKL problem_shape_mnkl,
-      TileShapeMNK tile_shape_mnk,
-      TileCoordMNKL tile_coord_mnkl,
-      TiledMma tiled_mma,
-      EpilogueTile epi_tile,
-      TiledCopy tiled_copy,
-      CoordTensor cD,
-      Residue residue_cD,
-      ThrCoordTensor tCcD,
-      ThrResidue residue_tCcD,
-      ThrSrcTensor & tCrC,
-      int thread_idx)
-  : problem_shape_mnkl(problem_shape_mnkl),
-    tile_shape_mnk(tile_shape_mnk),
-    tile_coord_mnkl(tile_coord_mnkl),
-    tiled_mma(tiled_mma),
-    epi_tile(epi_tile),
-    tiled_copy(tiled_copy),
-    cD(cD),
-    residue_cD(residue_cD),
-    tCcD(tCcD),
-    residue_tCcD(residue_tCcD),
-    tCrC(tCrC),
-    thread_idx(thread_idx) {}
-};
-
-template <class... Ops>
-struct Sm90VisitorImplBase {
-  // Shared memory allocation
-  using SharedStorage = tuple<typename Ops::SharedStorage...>;
-  // Host side fusion arguments
-  using Arguments = tuple<typename Ops::Arguments...>;
-  // Device side fusion params (Kernel-entry API)
-  using Params = tuple<typename Ops::Params...>;
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    uint8_t* op_workspace = reinterpret_cast<uint8_t*>(workspace);
-    return transform_apply(tuple<Ops...>{}, args,
-      [&] (auto&& op, auto const& op_args) {
-        using Op = cute::remove_cvref_t<decltype(op)>;
-        auto ret = Op::to_underlying_arguments(problem_shape, op_args, op_workspace);
-        if (op_workspace != nullptr) {
-          size_t op_workspace_size = Op::get_workspace_size(problem_shape, op_args);
-          op_workspace += round_nearest(op_workspace_size, MinWorkspaceAlignment);
-        }
-        return ret;
-      },
-      [] (auto&&... op_params) { return cute::make_tuple(op_params...); }
-    );
-  }
-
-  template <class ProblemShape>
-  static bool
-  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
-    return transform_apply(tuple<Ops...>{}, args,
-      [&] (auto&& op, auto const& op_args) {
-        using Op = cute::remove_cvref_t<decltype(op)>;
-        return Op::can_implement(problem_shape, op_args);
-      },
-      [&] (auto&&... implementable) {
-        return (true && ... && implementable);
-      }
-    );
-  }
-
-  template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
-    return transform_apply(tuple<Ops...>{}, args,
-      [&] (auto&& op, auto const& op_args) {
-        using Op = cute::remove_cvref_t<decltype(op)>;
-        size_t op_workspace_size = Op::get_workspace_size(problem_shape, op_args);
-        return round_nearest(op_workspace_size, MinWorkspaceAlignment);
-      },
-      [&] (auto&&... op_workspace_size) {
-        return (0 + ... + op_workspace_size);
-      }
-    );
-  }
-
-  template <class ProblemShape>
-  static cutlass::Status
-  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-    Status status = Status::kSuccess;
-    uint8_t* op_workspace = reinterpret_cast<uint8_t*>(workspace);
-    return transform_apply(tuple<Ops...>{}, args,
-      // Initialize each operation's workspace, stopping at the first error
-      [&] (auto&& op, auto const& op_args) {
-        if (status != Status::kSuccess) {
-          return status;
-        }
-
-        using Op = cute::remove_cvref_t<decltype(op)>;
-        status = Op::initialize_workspace(problem_shape, op_args, op_workspace, stream, cuda_adapter);
-        if (op_workspace != nullptr) {
-          size_t op_workspace_size = Op::get_workspace_size(problem_shape, op_args);
-          op_workspace += round_nearest(op_workspace_size, MinWorkspaceAlignment);
-        }
-        return status;
-      },
-      // Return the final status
-      [&] (auto const&...ops) { return status; }
-    );
-  }
-
-  CUTLASS_HOST_DEVICE
-  Sm90VisitorImplBase() {}
-
-  CUTLASS_HOST_DEVICE
-  Sm90VisitorImplBase(Params const& params, SharedStorage const& shared_storage)
-    : ops(transform_apply(tuple<Ops...>{}, params, shared_storage,
-        [] (auto&& op, auto const& op_params, auto&& op_storage) {
-          using Op = cute::remove_cvref_t<decltype(op)>;
-          return Op(op_params, op_storage);
-        },
-        [] (auto&&... ops) { return cute::make_tuple(ops...); }
-      )) {}
-
-  // Ops can store kernel persistent variables (e.g. descriptors, scalars, wave counters)
-  tuple<Ops...> ops;
-};
-
-
-template <class... Ops>
-struct Sm90VisitorImpl : Sm90VisitorImplBase<Ops...> {
-
-  using Impl = Sm90VisitorImplBase<Ops...>;
-  using Params = typename Impl::Params;
-  using SharedStorage = typename Impl::SharedStorage;
-
-  CUTLASS_HOST_DEVICE
-  Sm90VisitorImpl() {}
-
-  CUTLASS_HOST_DEVICE
-  Sm90VisitorImpl(Params const& params, SharedStorage const& shared_storage)
-    : Impl(params, shared_storage) {}
-
-  using Impl::ops;
-
-  //
-  // Queries for kernel runtime
-  //
-
-  // Is a specialized warp for producer TMA loads needed
-  // e.g. Aux tensor loads, broadcasts using TMA bulk copy
-  // This condition cannot change between work tiles because it is used
-  // to determine whether the load warp should exit early or not
-  // e.g. for batched beta this must always be true regardless of current batch idx
-  CUTLASS_DEVICE bool
-  is_producer_load_needed() const {
-    return cute::apply(ops,
-      [] (auto const&... op) {
-        return (false || ... || op.is_producer_load_needed());
-      }
-    );
-  }
-
-  // Is a producer TMA load specifically for C needed
-  // If this is true then is_producer_load_needed must also be true
-  // This condition can change between work tiles because it is only used
-  // to determine whether the TMA and smem loads for C of a given tile should happen
-  // e.g. for batched beta this can be false depending on current batch idx
-  CUTLASS_DEVICE bool
-  is_C_load_needed() const {
-    return cute::apply(ops,
-      [] (auto const&... op) {
-        return (false || ... || op.is_C_load_needed());
-      }
-    );
-  }
-
-  //
-  // Producer load callbacks, called by the epilogue load warp.
-  // Operations usually only define this if TMA load is needed. Most operations will reuse this empy implementation
-  // Load callbacks are responsible for issuing corresponding mbarrier expect-tx ops for any TMA loads issued, but
-  // are not responsible for issuing the producer_commit barrier arrival, which is issued by the collective instead
-  // If this is non-empty, is_producer_load_needed must be true.
-  //
-  template <class CallbacksTuple>
-  struct ProducerLoadCallbacks {
-    // Callbacks can store non-persistent variables (e.g. tensors) or copies of persistent variables
-    CallbacksTuple callbacks_tuple;
-
-    // Before entry of the subtile load loop
-    CUTLASS_DEVICE void
-    begin() {
-      for_each(callbacks_tuple,
-        [&] (auto& callbacks) {
-          callbacks.begin();
-        }
-      );
-    }
-
-    // Entry of the subtile load loop. Aux loads usually performed here
-    // Upon entry the producer acquire of the current subtile lock has completed.
-    // Upon exit all TMA loads for this subtile must have been issued, with corresponding expect-tx operations
-    CUTLASS_DEVICE void
-    step(uint64_t* full_mbarrier_ptr, int epi_m, int epi_n, int load_iteration, bool issue_tma_load) {
-      for_each(callbacks_tuple,
-        [&] (auto& callbacks) {
-          callbacks.step(full_mbarrier_ptr, epi_m, epi_n, load_iteration, issue_tma_load);
-        }
-      );
-    }
-
-    // Exit of the subtile load loop.
-    CUTLASS_DEVICE void
-    end() {
-      for_each(callbacks_tuple,
-        [] (auto& callbacks) {
-          callbacks.end();
-        }
-      );
-    }
-  };
-
-  // Producer load callbacks factory
-  // All operations must redefine this, but most can just dispatch to the base impl
-  template <class... Args>
-  CUTLASS_DEVICE auto
-  get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
-    return transform_apply(ops,
-      [&] (auto& op) {
-        return op.get_producer_load_callbacks(args);
-      },
-      [] (auto&&... callbacks) {
-        auto callbacks_tuple = cute::make_tuple(callbacks...);
-        return ProducerLoadCallbacks<decltype(callbacks_tuple)>{callbacks_tuple};
-      }
-    );
-  }
-
-  //
-  // Consumer store callbacks, called by the epilogue store warps.
-  // All operations must redefine this, with optional inheritance from this empty implementation.
-  //
-  template <class CallbacksTuple>
-  struct ConsumerStoreCallbacks {
-    // Callbacks can store non-persistent variables (e.g. tensors) or copies of persistent variables
-    CallbacksTuple callbacks_tuple;
-
-    // Before entry of subtile store loop. Gmem broadcasts usually performed here.
-    CUTLASS_DEVICE void
-    begin() {
-      for_each(callbacks_tuple,
-        [] (auto& callbacks) {
-          callbacks.begin();
-        }
-      );
-    }
-
-    // Start of subtile store iteration
-    CUTLASS_DEVICE void
-    begin_loop(int epi_m, int epi_n) {
-      for_each(callbacks_tuple,
-        [&] (auto& callbacks) {
-          callbacks.begin_loop(epi_m, epi_n);
-        }
-      );
-    }
-
-    // Before visit callback. Smem broadcasts usually performed here.
-    // Upon entry, all producer loads for this subtile are completed and visible.
-    CUTLASS_DEVICE void
-    previsit(int epi_m, int epi_n, int load_iteration, bool is_producer_load_needed) {
-      for_each(callbacks_tuple,
-        [&] (auto& callbacks) {
-          callbacks.previsit(epi_m, epi_n, load_iteration, is_producer_load_needed);
-        }
-      );
-    }
-
-    // Perform the fused elementwise computation
-    template <typename ElementAccumulator, typename... ElementInputs, int FragmentSize>
-    CUTLASS_DEVICE auto // returns an Array
-    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n,
-          Array<ElementInputs, FragmentSize> const&... frg_inputs) // depends on the N-naryness of the op
-      = delete; // Must be implemented for each operation
-
-    // After visit call. Smem reductions usually performed here
-    // reduction_buffer is an arbitrary smem tensor that can be used for workspace
-    // It is each nodes reponsibility to assert that this buffer is sufficiently sized
-    // and to ensure that this buffer is no longer needed upon callback exit
-    // i.e. results are synchronized and no longer in the reduction buffer
-    //
-    // visit_results is a rmem tensor that contains the results of visit() for an entire
-    // on the current epilogue subtile
-    template <class STensor, class SyncFn, class VTensor>
-    CUTLASS_DEVICE void
-    reduce(STensor&& reduction_buffer, SyncFn const& sync_fn, int epi_m, int epi_n, bool is_last_iteration, VTensor visit_results) {
-      for_each(callbacks_tuple,
-        [&] (auto& callbacks) {
-          callbacks.reduce(reduction_buffer, sync_fn, epi_m, epi_n, is_last_iteration, visit_results);
-        }
-      );
-    }
-
-    // After reduce call, before smem async fence. Smem stores usually performed here.
-    // Upon exit, all smem stores for TMA must have been issued
-    CUTLASS_DEVICE void
-    postreduce(int epi_m, int epi_n, int store_iteration, bool issue_smem_store) {
-      for_each(callbacks_tuple,
-        [&] (auto& callbacks) {
-          callbacks.postreduce(epi_m, epi_n, store_iteration, issue_smem_store);
-        }
-      );
-    }
-
-    // After smem async fence, before TMA store commit. Aux stores usually performed here
-    // Upon exit, all TMA stores for this subtile must have been issued
-    // Because of the TMA store delay optimization, this entry point must ONLY be used for TMA stores
-    // other gmem stores can be placed in the reduce or postreduce entry points
-    CUTLASS_DEVICE void
-    tma_store(int epi_m, int epi_n, int store_iteration, bool issue_tma_store) {
-      for_each(callbacks_tuple,
-        [&] (auto& callbacks) {
-          callbacks.tma_store(epi_m, epi_n, store_iteration, issue_tma_store);
-        }
-      );
-    }
-
-    // End of subtile store iteration
-    CUTLASS_DEVICE void
-    end_loop(int epi_m, int epi_n) {
-      for_each(callbacks_tuple,
-        [&] (auto& callbacks) {
-          callbacks.end_loop(epi_m, epi_n);
-        }
-      );
-    }
-
-    // Exit of subtile store loop. Gmem reductions usually performed here.
-    CUTLASS_DEVICE void
-    end() {
-      for_each(callbacks_tuple,
-        [&] (auto& callbacks) {
-          callbacks.end();
-        }
-      );
-    }
-  };
-
-  // Consumer store callbacks factory
-  // All operations must redefine this
-  template <
-    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
-    class... Args
-  >
-  CUTLASS_DEVICE auto
-  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
-    return transform_apply(ops,
-      [&] (auto& op) {
-        return op.template get_consumer_store_callbacks<ReferenceSrc>(args);
-      },
-      [] (auto&&... callbacks) {
-        auto callbacks_tuple = cute::make_tuple(callbacks...);
-        return ConsumerStoreCallbacks<decltype(callbacks_tuple)>{callbacks_tuple};
-      }
-    );
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Convenience aliases
-using EmptyProducerLoadCallbacks = Sm90VisitorImpl<>::ProducerLoadCallbacks<cute::tuple<>>;
-using EmptyConsumerStoreCallbacks = Sm90VisitorImpl<>::ConsumerStoreCallbacks<cute::tuple<>>;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace detail
-
-using namespace detail;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Tree visitor
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <class NodeOp, class... ChildOps>
-struct Sm90TreeVisitor : Sm90VisitorImpl<ChildOps..., NodeOp> {
-
-  using Impl = Sm90VisitorImpl<ChildOps..., NodeOp>;
-  using Params = typename Impl::Params;
-  using SharedStorage = typename Impl::SharedStorage;
-
-  CUTLASS_HOST_DEVICE
-  Sm90TreeVisitor() {}
-
-  CUTLASS_HOST_DEVICE
-  Sm90TreeVisitor(
-      Params const& params,
-      SharedStorage const& shared_storage)
-    : Impl(params, shared_storage) {}
-
-  template<class CallbacksImpl>
-  struct ConsumerStoreCallbacks : CallbacksImpl {
-    CUTLASS_DEVICE
-    ConsumerStoreCallbacks(CallbacksImpl&& impl)
-      : CallbacksImpl(cute::forward<CallbacksImpl>(impl)) {}
-
-    using CallbacksImpl::callbacks_tuple;
-
-    template <typename ElementAccumulator, int FragmentSize>
-    CUTLASS_DEVICE auto
-    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
-      constexpr int Rm1 = sizeof...(ChildOps);
-      return cute::detail::tapply(callbacks_tuple,
-        [&] (auto& child_callbacks) {
-          return child_callbacks.visit(frg_acc, epi_v, epi_m, epi_n); // child ops must be nullary (e.g. loads, trees)
-        },
-        [&] (auto&&... frg_inputs) {
-          return get<Rm1>(callbacks_tuple).visit(frg_acc, epi_v, epi_m, epi_n, frg_inputs...);
-        },
-        make_seq<Rm1>{} // restrict the transform to R-1 child ops, apply is for node op
-      );
-    }
-  };
-
-  template <
-    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
-    class... Args
-  >
-  CUTLASS_DEVICE auto
-  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
-    auto callbacks_tuple = Sm90VisitorImpl<ChildOps..., NodeOp>::
-      template get_consumer_store_callbacks<ReferenceSrc>(args);
-    return ConsumerStoreCallbacks<decltype(callbacks_tuple)>(std::move(callbacks_tuple));
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// DAG visitors
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Most DAG fusions can be represented as a set of output trees with a common input tree
-// The common input is first evaluated, then the result is passed as the acc fragment to the output trees
-template <class InputTree, class OutputTree, class... AuxOutTrees>
-struct Sm90SplitTreeVisitor : Sm90VisitorImpl<InputTree, AuxOutTrees..., OutputTree> {
-
-  using Sm90VisitorImpl<InputTree, AuxOutTrees..., OutputTree>::Sm90VisitorImpl;
-
-  template<class CallbacksImpl>
-  struct ConsumerStoreCallbacks : CallbacksImpl {
-    CUTLASS_DEVICE
-    ConsumerStoreCallbacks(CallbacksImpl&& impl)
-      : CallbacksImpl(cute::forward<CallbacksImpl>(impl)) {}
-
-    using CallbacksImpl::callbacks_tuple;
-
-    template <typename ElementAccumulator, int FragmentSize>
-    CUTLASS_DEVICE auto
-    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
-      Array frg_input = get<0>(callbacks_tuple).visit(frg_acc, epi_v, epi_m, epi_n);
-
-      constexpr int Rm2 = sizeof...(AuxOutTrees);
-      cute::for_each(make_seq<Rm2>{}, // restrict the sequence to aux out trees
-        [&] (auto I) {
-          get<I+1>(callbacks_tuple).visit(frg_input, epi_v, epi_m, epi_n);
-        }
-      );
-
-      return get<Rm2+1>(callbacks_tuple).visit(frg_input, epi_v, epi_m, epi_n);
-    }
-  };
-
-  template <
-    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
-    class... Args
-  >
-  CUTLASS_DEVICE auto
-  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
-    auto callbacks_tuple = Sm90VisitorImpl<InputTree, AuxOutTrees..., OutputTree>::
-      template get_consumer_store_callbacks<ReferenceSrc>(args);
-    return ConsumerStoreCallbacks<decltype(callbacks_tuple)>(std::move(callbacks_tuple));
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template<
-  // deducing the output type for all the nodes is tricky so we just convert them all to a common type
-  // if multiple compute types are needed then split into multiple subgraphs grouped by type
-  class ElementCompute,
-  class EdgeTuple, // tuple of int_sequence, each sequence is the children indices (indexed by topological order) for each node
-  class... Ops     // in topological order, last op is the output. EdgeTuple must match this order
->
-struct Sm90TopologicalVisitor : Sm90VisitorImpl<Ops...> {
-  static_assert(is_static_v<EdgeTuple>);
-  static_assert(cute::rank(EdgeTuple{}) == sizeof...(Ops));
-  static_assert(sizeof...(Ops) > 1);
-
-  using Sm90VisitorImpl<Ops...>::Sm90VisitorImpl;
-
-  template<class CallbacksImpl>
-  struct ConsumerStoreCallbacks : CallbacksImpl {
-    CUTLASS_DEVICE
-    ConsumerStoreCallbacks(CallbacksImpl&& impl)
-      : CallbacksImpl(cute::forward<CallbacksImpl>(impl)) {}
-
-    using CallbacksImpl::callbacks_tuple;
-
-    template <typename ElementAccumulator, int FragmentSize>
-    CUTLASS_DEVICE auto
-    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
-      constexpr int Rm1 = sizeof...(Ops) - 1;
-      auto frg_compute_tuple = cute::repeat<Rm1>(Array<ElementCompute, FragmentSize>{});
-
-      return cute::detail::tapply(EdgeTuple{}, callbacks_tuple, frg_compute_tuple,
-        // Visit the first R-1 ops in topological order
-        [&] (auto&& edge_seq, auto& callbacks, auto& frg_compute) {
-          frg_compute = cute::detail::apply(frg_compute_tuple,
-            // Compute the current op with children inputs
-            [&] (auto const&... frg_inputs) {
-              auto frg_output = callbacks.visit(frg_acc, epi_v, epi_m, epi_n, frg_inputs...);
-              using ElementOutput = typename decltype(frg_output)::Element;
-              using ConvertOutput = NumericArrayConverter<ElementCompute, ElementOutput, FragmentSize>;
-              ConvertOutput convert_output{};
-
-              return convert_output(frg_output);
-            },
-            // Get inputs in the sequence given by the children indices of the current op
-            edge_seq
-          );
-          return frg_compute; // unused
-        },
-        // Visit the last op
-        [&] (auto const&...ops) {
-          return cute::detail::apply(frg_compute_tuple,
-            // Compute the last op with children inputs
-            [&] (auto const&... frg_inputs) {
-              return get<Rm1>(callbacks_tuple).visit(frg_acc, epi_v, epi_m, epi_n, frg_inputs...);
-            },
-            // Get inputs in the sequence given by the children indices of the last op
-            get<Rm1>(EdgeTuple{})
-          );
-        },
-        // Transform to visit R-1 ops, apply to visit last op
-        make_seq<Rm1>{}
-      );
-    }
-  };
-
-  template <
-    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
-    class... Args
-  >
-  CUTLASS_DEVICE auto
-  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
-    auto callbacks_tuple = Sm90VisitorImpl<Ops...>::
-      template get_consumer_store_callbacks<ReferenceSrc>(args);
-    return ConsumerStoreCallbacks<decltype(callbacks_tuple)>(std::move(callbacks_tuple));
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Base specializations so we can have standard layout params and simple aggregate initializers
-namespace detail {
-
-template <class Op0>
-struct Sm90VisitorImplBase<Op0> {
-
-  // Retain tuple for SharedStorage because empty structs have 1B alignment
-  // tuples use multiple inheritance, avoids this problem
-  using SharedStorage = tuple<
-    typename Op0::SharedStorage
-  >;
-
-  struct Arguments {
-    typename Op0::Arguments op_0;
-  };
-
-  struct Params {
-    typename Op0::Params op_0;
-  };
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    return Params{
-      Op0::to_underlying_arguments(problem_shape, args.op_0, workspace)
-    };
-  }
-
-  template <class ProblemShape>
-  static bool
-  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
-    return Op0::can_implement(problem_shape, args.op_0);
-  }
-
-  template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
-    size_t workspace_size = 0;
-    workspace_size += Op0::get_workspace_size(problem_shape, args.op_0);
-    workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment);
-
-    return workspace_size;
-  }
-
-  template <class ProblemShape>
-  static cutlass::Status
-  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-    Status status = Status::kSuccess;
-    uint8_t* workspace_ptr = reinterpret_cast<uint8_t*>(workspace);
-    size_t workspace_offset = 0;
-
-    status = Op0::initialize_workspace(problem_shape, args.op_0, workspace_ptr + workspace_offset, stream, cuda_adapter);
-    workspace_offset += Op0::get_workspace_size(problem_shape, args.op_0);
-    workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment);
-    if (status != Status::kSuccess) {
-      return status;
-    }
-
-    return status;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Sm90VisitorImplBase() {}
-
-  CUTLASS_HOST_DEVICE
-  Sm90VisitorImplBase(Params const& params, SharedStorage const& shared_storage)
-    : ops({
-        Op0(params.op_0, get<0>(shared_storage))
-      }) {}
-
-  tuple<Op0> ops;
-};
-
-template <class Op0, class Op1>
-struct Sm90VisitorImplBase<Op0, Op1> {
-
-  using SharedStorage = tuple<
-    typename Op0::SharedStorage,
-    typename Op1::SharedStorage
-  >;
-
-  struct Arguments {
-    typename Op0::Arguments op_0;
-    typename Op1::Arguments op_1;
-  };
-
-  struct Params {
-    typename Op0::Params op_0;
-    typename Op1::Params op_1;
-  };
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    size_t op_0_workspace_size = Op0::get_workspace_size(problem_shape, args.op_0);
-    uint8_t* op_0_workspace = reinterpret_cast<uint8_t*>(workspace);
-    uint8_t* op_1_workspace = op_0_workspace + op_0_workspace_size;
-    return Params{
-      Op0::to_underlying_arguments(problem_shape, args.op_0, op_0_workspace),
-      Op1::to_underlying_arguments(problem_shape, args.op_1, op_1_workspace)
-    };
-  }
-
-  template <class ProblemShape>
-  static bool
-  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
-    return Op0::can_implement(problem_shape, args.op_0) && 
-           Op1::can_implement(problem_shape, args.op_1);
-  }
-
-  template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
-    size_t workspace_size = 0;
-    workspace_size += Op0::get_workspace_size(problem_shape, args.op_0);
-    workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment);
-
-    workspace_size += Op1::get_workspace_size(problem_shape, args.op_1);
-    workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment);
-
-    return workspace_size;
-  }
-
-  template <class ProblemShape>
-  static cutlass::Status
-  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-    Status status = Status::kSuccess;
-    uint8_t* workspace_ptr = reinterpret_cast<uint8_t*>(workspace);
-    size_t workspace_offset = 0;
-
-    status = Op0::initialize_workspace(problem_shape, args.op_0, workspace_ptr + workspace_offset, stream, cuda_adapter);
-    workspace_offset += Op0::get_workspace_size(problem_shape, args.op_0);
-    workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment);
-    if (status != Status::kSuccess) {
-      return status;
-    }
-
-    status = Op1::initialize_workspace(problem_shape, args.op_1, workspace_ptr + workspace_offset, stream, cuda_adapter);
-    workspace_offset += Op1::get_workspace_size(problem_shape, args.op_1);
-    workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment);
-    if (status != Status::kSuccess) {
-      return status;
-    }
-
-    return status;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Sm90VisitorImplBase() {}
-
-  CUTLASS_HOST_DEVICE
-  Sm90VisitorImplBase(Params const& params, SharedStorage const& shared_storage)
-    : ops({
-        Op0(params.op_0, get<0>(shared_storage)),
-        Op1(params.op_1, get<1>(shared_storage))
-      }) {}
-
-  tuple<Op0, Op1> ops;
-};
-
-template <class Op0, class Op1, class Op2>
-struct Sm90VisitorImplBase<Op0, Op1, Op2> {
-
-  using SharedStorage = tuple<
-    typename Op0::SharedStorage,
-    typename Op1::SharedStorage,
-    typename Op2::SharedStorage
-  >;
-
-  struct Arguments {
-    typename Op0::Arguments op_0;
-    typename Op1::Arguments op_1;
-    typename Op2::Arguments op_2;
-  };
-
-  struct Params {
-    typename Op0::Params op_0;
-    typename Op1::Params op_1;
-    typename Op2::Params op_2;
-  };
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    size_t op_0_workspace_size = Op0::get_workspace_size(problem_shape, args.op_0);
-    size_t op_1_workspace_size = Op1::get_workspace_size(problem_shape, args.op_1);
-    uint8_t* op_0_workspace = reinterpret_cast<uint8_t*>(workspace);
-    uint8_t* op_1_workspace = op_0_workspace + op_0_workspace_size;
-    uint8_t* op_2_workspace = op_1_workspace + op_1_workspace_size;
-    return Params{
-      Op0::to_underlying_arguments(problem_shape, args.op_0, op_0_workspace),
-      Op1::to_underlying_arguments(problem_shape, args.op_1, op_1_workspace),
-      Op2::to_underlying_arguments(problem_shape, args.op_2, op_2_workspace)
-    };
-  }
-
-  template <class ProblemShape>
-  static bool
-  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
-    return Op0::can_implement(problem_shape, args.op_0) && 
-           Op1::can_implement(problem_shape, args.op_1) &&
-           Op2::can_implement(problem_shape, args.op_2);          
-  }
-
-  template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
-    size_t workspace_size = 0;
-    workspace_size += Op0::get_workspace_size(problem_shape, args.op_0);
-    workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment);
-
-    workspace_size += Op1::get_workspace_size(problem_shape, args.op_1);
-    workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment);
-
-    workspace_size += Op2::get_workspace_size(problem_shape, args.op_2);
-    workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment);
-
-    return workspace_size;
-  }
-
-  template <class ProblemShape>
-  static cutlass::Status
-  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-    Status status = Status::kSuccess;
-    uint8_t* workspace_ptr = reinterpret_cast<uint8_t*>(workspace);
-    size_t workspace_offset = 0;
-
-    status = Op0::initialize_workspace(problem_shape, args.op_0, workspace_ptr + workspace_offset, stream, cuda_adapter);
-    workspace_offset += Op0::get_workspace_size(problem_shape, args.op_0);
-    workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment);
-    if (status != Status::kSuccess) {
-      return status;
-    }
-
-    status = Op1::initialize_workspace(problem_shape, args.op_1, workspace_ptr + workspace_offset, stream, cuda_adapter);
-    workspace_offset += Op1::get_workspace_size(problem_shape, args.op_1);
-    workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment);
-    if (status != Status::kSuccess) {
-      return status;
-    }
-
-    status = Op2::initialize_workspace(problem_shape, args.op_2, workspace_ptr + workspace_offset, stream, cuda_adapter);
-    workspace_offset += Op2::get_workspace_size(problem_shape, args.op_2);
-    workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment);
-    if (status != Status::kSuccess) {
-      return status;
-    }
-
-    return status;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Sm90VisitorImplBase() {}
-
-  CUTLASS_HOST_DEVICE
-  Sm90VisitorImplBase(Params const& params, SharedStorage const& shared_storage)
-    : ops({
-        Op0(params.op_0, get<0>(shared_storage)),
-        Op1(params.op_1, get<1>(shared_storage)),
-        Op2(params.op_2, get<2>(shared_storage))
-      }) {}
-
-  tuple<Op0, Op1, Op2> ops;
-};
-
-template <class Op0, class Op1, class Op2, class Op3>
-struct Sm90VisitorImplBase<Op0, Op1, Op2, Op3> {
-
-  using SharedStorage = tuple<
-    typename Op0::SharedStorage,
-    typename Op1::SharedStorage,
-    typename Op2::SharedStorage,
-    typename Op3::SharedStorage
-  >;
-
-  struct Arguments {
-    typename Op0::Arguments op_0;
-    typename Op1::Arguments op_1;
-    typename Op2::Arguments op_2;
-    typename Op3::Arguments op_3;
-  };
-
-  struct Params {
-    typename Op0::Params op_0;
-    typename Op1::Params op_1;
-    typename Op2::Params op_2;
-    typename Op3::Params op_3;
-  };
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    size_t op_0_workspace_size = Op0::get_workspace_size(problem_shape, args.op_0);
-    size_t op_1_workspace_size = Op1::get_workspace_size(problem_shape, args.op_1);
-    size_t op_2_workspace_size = Op2::get_workspace_size(problem_shape, args.op_2);
-    uint8_t* op_0_workspace = reinterpret_cast<uint8_t*>(workspace);
-    uint8_t* op_1_workspace = op_0_workspace + op_0_workspace_size;
-    uint8_t* op_2_workspace = op_1_workspace + op_1_workspace_size;
-    uint8_t* op_3_workspace = op_2_workspace + op_2_workspace_size;
-    return Params{
-      Op0::to_underlying_arguments(problem_shape, args.op_0, op_0_workspace),
-      Op1::to_underlying_arguments(problem_shape, args.op_1, op_1_workspace),
-      Op2::to_underlying_arguments(problem_shape, args.op_2, op_2_workspace),
-      Op3::to_underlying_arguments(problem_shape, args.op_3, op_3_workspace)
-    };
-  }
-  
-  template <class ProblemShape>
-  static bool
-  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
-    return Op0::can_implement(problem_shape, args.op_0) && 
-           Op1::can_implement(problem_shape, args.op_1) &&
-           Op2::can_implement(problem_shape, args.op_2) &&
-           Op3::can_implement(problem_shape, args.op_3); 
-  }
-
-  template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
-    size_t workspace_size = 0;
-    workspace_size += Op0::get_workspace_size(problem_shape, args.op_0);
-    workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment);
-
-    workspace_size += Op1::get_workspace_size(problem_shape, args.op_1);
-    workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment);
-
-    workspace_size += Op2::get_workspace_size(problem_shape, args.op_2);
-    workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment);
-
-    workspace_size += Op3::get_workspace_size(problem_shape, args.op_3);
-    workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment);
-
-    return workspace_size;
-  }
-
-  template <class ProblemShape>
-  static cutlass::Status
-  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-    Status status = Status::kSuccess;
-    uint8_t* workspace_ptr = reinterpret_cast<uint8_t*>(workspace);
-    size_t workspace_offset = 0;
-
-    status = Op0::initialize_workspace(problem_shape, args.op_0, workspace_ptr + workspace_offset, stream, cuda_adapter);
-    workspace_offset += Op0::get_workspace_size(problem_shape, args.op_0);
-    workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment);
-    if (status != Status::kSuccess) {
-      return status;
-    }
-
-    status = Op1::initialize_workspace(problem_shape, args.op_1, workspace_ptr + workspace_offset, stream, cuda_adapter);
-    workspace_offset += Op1::get_workspace_size(problem_shape, args.op_1);
-    workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment);
-    if (status != Status::kSuccess) {
-      return status;
-    }
-
-    status = Op2::initialize_workspace(problem_shape, args.op_2, workspace_ptr + workspace_offset, stream, cuda_adapter);
-    workspace_offset += Op2::get_workspace_size(problem_shape, args.op_2);
-    workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment);
-    if (status != Status::kSuccess) {
-      return status;
-    }
-
-    status = Op3::initialize_workspace(problem_shape, args.op_3, workspace_ptr + workspace_offset, stream, cuda_adapter);
-    workspace_offset += Op3::get_workspace_size(problem_shape, args.op_3);
-    workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment);
-    if (status != Status::kSuccess) {
-      return status;
-    }
-
-    return status;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Sm90VisitorImplBase() {}
-
-  CUTLASS_HOST_DEVICE
-  Sm90VisitorImplBase(Params const& params, SharedStorage const& shared_storage)
-    : ops({
-        Op0(params.op_0, get<0>(shared_storage)),
-        Op1(params.op_1, get<1>(shared_storage)),
-        Op2(params.op_2, get<2>(shared_storage)),
-        Op3(params.op_3, get<3>(shared_storage))
-      }) {}
-
-  tuple<Op0, Op1, Op2, Op3> ops;
-};
-
-} // namespace detail
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::epilogue::fusion
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_topk_softmax.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_topk_softmax.hpp
deleted file mode 100755
index 53c0dce8b..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_topk_softmax.hpp
+++ /dev/null
@@ -1,759 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-  \brief Visitor tree Top-K + Softmax fusion operation for sm90 TMA warp-specialized epilogue
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/workspace.h"
-
-#include "cute/tensor.hpp"
-#include "sm90_visitor_tma_warpspecialized.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::epilogue::fusion {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Top-K + Softmax reduction across columns
-// Performs a reduction of top-K values across N, and finally performs a softmax on them,
-// and sets values not in the top-K to 0.
-//
-//   Assumptions:
-//     1. CTA_N >= N (single tile across N, the mode which is reduced)
-//     2. EPI_N >= N (single epilogue tile across N, because we can reduce and revisit one
-//        epilogue tile at a time.)
-//     3. Top-K value is either 2 or 4.
-//
-
-namespace detail {
-
-// Implementations for add to sorted list and merging sorted lists,
-// with fast paths for lists of size 2 and 4 (Top-2 and Top-4).
-// Generic implementations may result in greater register use and branching,
-// and should be avoided.
-// Fast paths for Top-2 and Top-4 are written in inline PTX directly.
-
-CUTLASS_DEVICE
-Array<float, 2> top_2_reduce_scalar(Array<float, 2> a, float scalar) {
-  Array<float, 2> out;
-  asm volatile(
-      "{\n"
-      "  .reg .f32 mx;\n"
-      "  .reg .pred p;\n"
-      "  max.f32 mx, %3, %4;\n"
-      "  setp.gtu.f32 p, %2, %4;\n"
-      "  selp.f32 %1, mx, %2, p;\n"
-      "  selp.f32 %0, %2, %4, p;\n"
-      "}\n" : "=f"(out[0]), "=f"(out[1]) : "f"(a[0]), "f"(a[1]), "f"(scalar));
-  return out;
-}
-
-CUTLASS_DEVICE
-Array<float, 2> top_2_reduce(Array<float, 2> a, Array<float, 2> b) {
-  Array<float, 2> out;
-  asm volatile(
-      "{\n"
-      "  .reg .v2 .f32 mx;\n"
-      "  .reg .pred p;\n"
-      "  max.f32 mx.x, %3, %4;\n"           // max(a1, b0)
-      "  max.f32 mx.y, %2, %5;\n"           // max(a0, b1)
-      "  setp.gtu.f32 p, %2, %4;\n"         // a0 > b0
-      "  selp.f32 %1, mx.x, mx.y, p;\n"     // a0 > b0 ? max(a1, b0) : max(a0, b1)
-      "  selp.f32 %0, %2, %4, p;\n"         // a0 > b0 ? a0 : b0
-      "}\n" : "=f"(out[0]), "=f"(out[1]) : 
-      "f"(a[0]), "f"(a[1]), "f"(b[0]), "f"(b[1]));
-  return out;
-}
-
-CUTLASS_DEVICE
-Array<float, 4> top_4_reduce_scalar(Array<float, 4> a, float scalar) {
-  Array<float, 4> out;
-  asm volatile(
-      "{\n"
-      "  .reg .f32 mx;\n"                   // max(a3, b)
-      "  .reg .pred p0;\n"                  // a0 > b
-      "  .reg .pred p1;\n"                  // a1 > b
-      "  .reg .pred p2;\n"                  // a2 > b
-      "  max.f32 mx, %7, %8;\n"             // max(a3, b)
-      "  setp.gtu.f32 p0, %4, %8;\n"        // a0 > b
-      "  setp.gtu.f32 p1, %5, %8;\n"        // a1 > b
-      "  setp.gtu.f32 p2, %6, %8;\n"        // a2 > b
-      "  selp.f32 %3, mx, %6, p2;\n"        // a2 > b ? max(a3, b) : a2
-      "  selp.f32 %2, %6, %8, p2;\n"        // a1 = a2 > b ? a2 : b
-      "  selp.f32 %2, %2, %5, p1;\n"        // a1 > b ? max(a2, b) : a1 == a1 > b ? a1 : old_a1
-      "  selp.f32 %1, %5, %8, p1;\n"        // a0 = a1 > b ? a1 : b
-      "  selp.f32 %1, %1, %4, p0;\n"        // a0 > b ? max(a1, b) : a0 == a0 > b ? a0 : old_a0
-      "  selp.f32 %0, %4, %8, p0;\n"        // a0 = a0 > b ? a0 : b
-      "}\n" : 
-      "=f"(out[0]), "=f"(out[1]), "=f"(out[2]), "=f"(out[3]) : 
-      "f"(a[0]), "f"(a[1]), "f"(a[2]), "f"(a[3]), "f"(scalar));
-  return out;
-}
-
-CUTLASS_DEVICE
-Array<float, 4> top_4_reduce(Array<float, 4> a, Array<float, 4> b) {
-  Array<float, 4> out;
-  asm volatile(
-      "{\n"
-      "  .reg .f32 mxa0b1;\n"                          // max(a0, b1)
-      "  .reg .f32 mxa1b0;\n"                          // max(a1, b0)
-
-      "  .reg .f32 mxa2b0;\n"                          // max(a2, b0)
-      "  .reg .f32 mxa1b1;\n"                          // max(a1, b1)
-      "  .reg .f32 mxa0b2;\n"                          // max(a1, b1)
-
-      "  .reg .f32 mxa1b2;\n"                          // max(a1, b2)
-      "  .reg .f32 mxa2b1;\n"                          // max(a2, b1)
-      "  max.f32 mxa1b2, %5, %10;\n"
-      "  max.f32 mxa2b1, %6, %9;\n"
-
-      "  .reg .f32 mxa3b0;\n"                          // max(a1, b2)
-      "  .reg .f32 mxa0b3;\n"                          // max(a2, b1)
-      "  max.f32 mxa3b0, %7, %8;\n"
-      "  max.f32 mxa0b3, %4, %11;\n"
-
-      "  .reg .pred pa0b0;\n"                          // a0 > b0
-      "  .reg .pred pa1b0;\n"                          // a1 > b0
-      "  .reg .pred pa2b0;\n"                          // a2 > b0
-      "  .reg .pred pa0b1;\n"                          // a0 > b1
-      "  .reg .pred pa1b1;\n"                          // a1 > b1
-      "  .reg .pred pa0b2;\n"                          // a0 > b2
-      "  .reg .pred pb2a0;\n"                          // b1 > a0
-      "  .reg .pred pb1a0;\n"                          // b1 > a0
-
-      "  setp.gtu.f32 pa0b0, %4, %8;\n"                // a0 > b0
-      "  setp.gtu.f32 pa1b0, %5, %8;\n"                // a1 > b0
-      "  setp.gtu.f32 pa2b0, %6, %8;\n"                // a2 > b0
-      "  setp.gtu.f32 pa0b1, %4, %9;\n"                // a0 > b1
-      "  setp.gtu.f32 pa1b1, %5, %9;\n"                // a1 > b1
-      "  setp.gtu.f32 pa0b2, %4, %10;\n"               // a0 > b2
-
-      "  not.pred pb2a0, pa0b2;\n"
-      "  not.pred pb1a0, pa0b1;\n"
-
-      "  selp.f32 mxa1b0, %5, %8, pa1b0;\n"            // max(a1, b0)
-      "  selp.f32 mxa0b1, %4, %9, pa0b1;\n"            // max(a0, b1)
-
-      "  selp.f32 mxa1b1, %5, %9, pa1b1;\n"            // max(a1, b1)
-      "  selp.f32 mxa2b0, %6, %8, pa2b0;\n"            // max(a2, b0)
-      "  selp.f32 mxa0b2, %4, %10, pa0b2;\n"           // max(a0, b2)
-
-      // a0
-      "  selp.f32 %0, %4, %8, pa0b0;\n"                // a0 = a0 > b0 ? a0 : b0
-
-      // a1
-      "  selp.f32 %1, mxa1b0, mxa0b1, pa0b0;\n"        // a1 = a0 > b0 ? max(a1, b0) : max(a0, b1)
-
-      // a2
-      "  mov.f32 %2, mxa1b1;\n"                        // a2 = max(a1, b1) ** most likely case
-      "  selp.f32 %2, mxa2b0, %2, pa1b0;\n"            // a0 > a1 > b0
-      "  selp.f32 %2, mxa0b2, %2, pb1a0;\n"            // b0 > b1 > a0
-
-      // a3
-      "  mov.f32 %3, mxa1b2;\n"                        // a3 = max(a1, b2) ** one of the most likely cases
-      "  selp.f32 %3, mxa2b1, %3, pa1b1;\n"            // a3 = a1 > b1 ? max(a2, b1) ** second most likely case
-      "  selp.f32 %3, mxa3b0, %3, pa2b0;\n"            // a0 > a1 > a2 > b0
-      "  selp.f32 %3, mxa0b3, %3, pb2a0;\n"            // b0 > b1 > b2 > a0
-      "}\n" : 
-      "=f"(out[0]), "=f"(out[1]), "=f"(out[2]), "=f"(out[3]) : 
-      "f"(a[0]), "f"(a[1]), "f"(a[2]), "f"(a[3]),
-      "f"(b[0]), "f"(b[1]), "f"(b[2]), "f"(b[3]));
-  return out;
-}
-
-// Assumption: array elements are sorted in descending order
-// (a[0] is the largest element in a[].)
-template <typename Element, int N>
-CUTLASS_DEVICE
-void add_element_to_desc_sorted_array(cutlass::Array<Element, N>& a, Element b) {
-  if constexpr (N == 2 && is_same_v<Element, float>) {
-    a = top_2_reduce_scalar(a, b);
-  }
-  else if constexpr (N == 4 && is_same_v<Element, float>) {
-    a = top_4_reduce_scalar(a, b);
-  }
-  else {
-    // slower generic path with branching, slower, and can cause register spill
-    CUTLASS_PRAGMA_UNROLL
-    for (int k = 0; k < N; ++k) {
-      if (a[k] <= b) {
-        // Shift down
-        CUTLASS_PRAGMA_UNROLL
-        for (int l = N - 1; l > k; --l) {
-          a[l] = a[l-1];
-        }
-        a[k] = b;
-      }
-    }
-  }
-}
-
-// Assumption: array elements are sorted in descending order
-// (a[0] and b[0] are the largest elements in a[] and b[].)
-template <typename Element, int N>
-CUTLASS_DEVICE
-void merge_desc_sorted_arrays(cutlass::Array<Element, N>& a, const cutlass::Array<Element, N>& b) {
-  if constexpr (N == 2 && is_same_v<Element, float>) {
-    a = top_2_reduce(a, b);
-  }
-  else if constexpr (N == 4 && is_same_v<Element, float>) {
-    a = top_4_reduce(a, b);
-  }
-  else {
-    // slower generic path with branching, slower, and can cause register spill
-    int j = 0;
-    CUTLASS_PRAGMA_UNROLL
-    for (int k = 0; k < N; ++k) {
-      if (a[k] <= b[j]) {
-        // Shift down
-        CUTLASS_PRAGMA_UNROLL
-        for (int l = N - 1; l > k; --l) {
-          a[l] = a[l-1];
-        }
-        a[k] = b[j];
-        ++j;
-      }
-    }
-  }
-}
-
-// Assumption: array elements are sorted in descending order
-// (a[0] is the largest element in a[].)
-template <typename Element, int N>
-CUTLASS_DEVICE
-Element topk_logsumexp(cutlass::Array<Element, N> a) {
-  // Do one less `exp`, because we know what its result will be.
-  // Assume x is a set of `x_i`s, and `x_m` is the maximum of that set.
-  // logsumexp(x) = log(sum(x_i)) = m + log(sum(x_i - m)) = m + log(1 + sum_{i != m}(x_i - x_m))
-  // Compute m + log(1 + sum_{i != m}(x_i - x_m))
-  Element sum = Element(1.0);
-  CUTLASS_PRAGMA_UNROLL
-  for (int i = 1; i < N; ++i) {
-    sum += fast_exp(a[i] - a[0]);
-  }
-  return a[0] + fast_log(sum);
-}
-
-CUTLASS_DEVICE
-float fast_masked_softmax(float value, float minimum, float logsumexp) {
-  float new_value;
-  asm volatile(
-      "{\n"
-      "  .reg .pred p0;\n"
-      // value >= minimum
-      "  setp.geu.f32 p0, %1, %2;\n"
-
-      "  .reg .f32 x_lse;\n"
-      "  .reg .f32 %%f<11>;\n"
-      "  .reg .b32 %%r<3>;\n"
-
-      // x_lse = value - minimum
-      "  sub.rn.f32  x_lse, %1, %3;\n"
-
-      // exp(x_lse)
-      // The following is derived from a ptx dump of expf.
-      // exp requires a base conversion from exp2.
-      "  fma.rn.f32 %%f1, x_lse, 0f3BBB989D, 0f3F000000;\n"
-      "  cvt.sat.f32.f32 %%f2, %%f1;\n"
-      "  fma.rm.f32 %%f3, %%f2, 0f437C0000, 0f4B400001;\n"
-      "  add.f32 %%f4, %%f3, 0fCB40007F;\n"
-      "  neg.f32 %%f5, %%f4;\n"
-      "  fma.rn.f32 %%f6, x_lse, 0f3FB8AA3B, %%f5;\n"
-      "  fma.rn.f32 %%f7, x_lse, 0f32A57060, %%f6;\n"
-      "  mov.b32 %%r1, %%f3;\n"
-      "  shl.b32 %%r2, %%r1, 23;\n"
-      "  mov.b32 %%f8, %%r2;\n"
-      "  ex2.approx.ftz.f32 %%f9, %%f7;\n"
-      "  mul.f32 %%f10, %%f9, %%f8;\n"
-
-      // Mask or softmax
-      "  selp.f32 %0, %%f10, 0f00000000, p0;\n"
-      "}\n" : "=f"(new_value) : "f"(value), "f"(minimum), "f"(logsumexp));
-  return new_value;
-}
-
-template <typename Element>
-CUTLASS_DEVICE
-Element masked_softmax(Element value, Element minimum, Element logsumexp) {
-  if constexpr (is_same_v<Element, float>) {
-    // Inline PTX implementation
-    // Significantly reduces register requirements
-    return fast_masked_softmax(value, minimum, logsumexp);
-  }
-  else {
-    return value < minimum ? Element(0.0) : fast_exp(value - logsumexp);
-  }
-}
-
-} // namespace detail
-
-template <
-  int TopK,
-  int FragmentSize,
-  class CtaTileShapeMNK,
-  class EpilogueTile,
-  class ElementOutput,
-  class ElementCompute,
-  FloatRoundStyle RoundStyle,
-  int Alignment = 128 / sizeof_bits_v<ElementOutput>,
-  bool UseButterflyReduce = true
->
-struct Sm90TopKSoftmaxColReduction {
-private:
-  static_assert(is_same_v<ElementCompute, float>, "Fused Top-K + Softmax reduction requires FP32 accumulation.");
-  static_assert(TopK == 2 || TopK == 4, "Fused Top-K + Softmax reduction only supports K=2 and K=4.");
-  static_assert(Alignment * sizeof_bits_v<ElementOutput> % 128 == 0, "sub-16B alignment not supported yet");
-
-  // Reduction tensors
-  //   We have two tensors for this EVT node: a reduction tensor and a tensor holding
-  //   final reduction values (tCrSoftmax). The reason for this is that Top-K and Softmax
-  //   require different reductions, but those luckily overlap. Top-K obviously needs at least
-  //   two values (K >= 2), and softmax needs one value: logsumexp. Logsumexp is simply the log
-  //   of sum of exponents over the set, and is equivalent to m + sum(exp(x_i - m)), where m is the
-  //   maximum of all x_i elements. Since safe softmax for any element x_i is computed as
-  //   softmax(x_i) = exp(x_i - m) / sum_j(exp(x_j - max))
-  //   we can track logsumexp instead of tracking two variables (sum of exps and the max).
-  //   In addition, subtracting logsumexp from any element and taking its exp is equivalent to
-  //   computing its softmax.
-  //   
-  //   The overlap between softmax and top-K is that we don't need to reduce logsumexp along the
-  //   way at all, because any element not in the top-K is going to be masked out and set to 0.
-  //   Therefore, we only reduce the top-K elements, and when done, compute their logsumexp and
-  //   keep it, and the smallest element in the top-K for masking out non-top-K elements.
-  //
-  //   This means that our final reduction result will always be 2 elements, regardless of the value
-  //   of K: minimum of top-K, and logsumexp.
-  //
-  //   For each reduction tensor, we define a new struct for readability.
-
-  struct ReductionResult {
-    ElementCompute min_;
-    ElementCompute logsumexp_;
-
-    CUTLASS_DEVICE
-    ReductionResult() { }
-
-    CUTLASS_DEVICE
-    ReductionResult(ElementCompute min, ElementCompute logsumexp): 
-      logsumexp_(logsumexp), min_(min) { }
-
-    // Warp shuffle broadcast
-    CUTLASS_DEVICE
-    void shuffle_up_sync(uint32_t delta, int lane_id) {
-      static_assert(sizeof(ReductionResult) == sizeof(uint64_t));
-      uint64_t r = reinterpret_cast<uint64_t&>(*this);
-      r = __shfl_up_sync(0xFFFFFFFF, r, delta);
-      *this = (lane_id - static_cast<int>(delta) >= 0) ? reinterpret_cast<ReductionResult&>(r) : *this;
-    }
-  };
-
-  struct TopKResult {
-    Array<ElementCompute, TopK> top_k_;
-
-    CUTLASS_DEVICE
-    TopKResult() {
-      top_k_.fill(-cutlass::platform::numeric_limits<ElementCompute>::infinity());
-    }
-
-    // This is where we do the "final" reduction, where we compute
-    // the logsumexp for softmax, keep the smallest value in top-K,
-    // and discard the rest.
-    CUTLASS_DEVICE
-    ReductionResult reduce_final() const {
-      return ReductionResult(top_k_[TopK - 1], topk_logsumexp(top_k_));
-    }
-
-    // Butterfly reduction
-    CUTLASS_DEVICE
-    void shuffle_xor_sync(int laneMask) {
-      if constexpr (TopK == 2) {
-        static_assert(sizeof(TopKResult) == sizeof(uint64_t));
-        uint64_t top_k = reinterpret_cast<uint64_t&>(*this);
-        top_k = __shfl_xor_sync(0xFFFFFFFF, top_k, laneMask);
-        auto synced_v = reinterpret_cast<TopKResult&>(top_k);
-        detail::merge_desc_sorted_arrays(top_k_, synced_v.top_k_);
-      }
-      else if constexpr (TopK == 4) {
-        static_assert(sizeof(TopKResult) == 2 * sizeof(uint64_t));
-        uint64_t* top_k_ptr = reinterpret_cast<uint64_t*>(this);
-        uint64_t top_k_arr[2];
-        top_k_arr[0] = top_k_ptr[0];
-        top_k_arr[1] = top_k_ptr[1];
-        top_k_arr[0] = __shfl_xor_sync(0xFFFFFFFF, top_k_arr[0], laneMask);
-        top_k_arr[1] = __shfl_xor_sync(0xFFFFFFFF, top_k_arr[1], laneMask);
-        auto synced_v = reinterpret_cast<TopKResult&>(top_k_arr);
-        detail::merge_desc_sorted_arrays(top_k_, synced_v.top_k_);
-      }
-      else {
-        TopKResult synced_v;
-        CUTLASS_PRAGMA_UNROLL
-        for (int i = 0; i < TopK; ++i) {
-          synced_v.top_k_[i] = __shfl_xor_sync(0xFFFFFFFF, top_k_[i], laneMask);
-        }
-        detail::merge_desc_sorted_arrays(top_k_, synced_v.top_k_);
-      }
-    }
-
-    // Warp shuffle reduction
-    CUTLASS_DEVICE
-    void shuffle_down_sync(uint32_t delta) {
-      if constexpr (TopK == 2) {
-        static_assert(sizeof(TopKResult) == sizeof(uint64_t));
-        uint64_t top_k = reinterpret_cast<uint64_t&>(*this);
-        top_k = __shfl_down_sync(0xFFFFFFFF, top_k, delta);
-        auto synced_v = reinterpret_cast<TopKResult&>(top_k);
-        detail::merge_desc_sorted_arrays(top_k_, synced_v.top_k_);
-      }
-      else if constexpr (TopK == 4) {
-        static_assert(sizeof(TopKResult) == 2 * sizeof(uint64_t));
-        uint64_t* top_k_ptr = reinterpret_cast<uint64_t*>(this);
-        uint64_t top_k_arr[2];
-        top_k_arr[0] = top_k_ptr[0];
-        top_k_arr[1] = top_k_ptr[1];
-        top_k_arr[0] = __shfl_down_sync(0xFFFFFFFF, top_k_arr[0], delta);
-        top_k_arr[1] = __shfl_down_sync(0xFFFFFFFF, top_k_arr[1], delta);
-        auto synced_v = reinterpret_cast<TopKResult&>(top_k_arr);
-        detail::merge_desc_sorted_arrays(top_k_, synced_v.top_k_);
-      }
-      else {
-        TopKResult synced_v;
-        CUTLASS_PRAGMA_UNROLL
-        for (int i = 0; i < TopK; ++i) {
-          synced_v.top_k_[i] = __shfl_down_sync(0xFFFFFFFF, top_k_[i], delta);
-        }
-        detail::merge_desc_sorted_arrays(top_k_, synced_v.top_k_);
-      }
-    }
-  };
-
-public:
-  struct SharedStorage { };
-
-  struct Arguments { };
-
-  struct Params { };
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    return {};
-  }
-
-  template <class ProblemShape>
-  static bool
-  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
-    auto [M, N, K, L] = problem_shape;
-    auto [tile_M, tile_N, tile_K] = CtaTileShapeMNK{};
-    // Cross CTA reduction is not possible because there is no guarantee that all CTAs run
-    // concurrently.
-    // Cross epilogue tile reduction is possible, but re-visiting and applying reduction
-    // to accumulators is only possible for the current epilogue tile.
-    auto [epi_M, epi_N] = EpilogueTile{};
-    return N <= tile_N && N <= epi_N && N >= TopK;
-  }
-
-  template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
-    return 0;
-  }
-
-  template <class ProblemShape>
-  static cutlass::Status
-  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-    return Status::kSuccess;
-  }
-
-  CUTLASS_DEVICE bool
-  is_producer_load_needed() const {
-    return false;
-  }
-
-  CUTLASS_DEVICE bool
-  is_C_load_needed() const {
-    return false;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Sm90TopKSoftmaxColReduction() { }
-
-  CUTLASS_HOST_DEVICE
-  Sm90TopKSoftmaxColReduction(Params const& params, SharedStorage const& shared_storage)
-      : params(params) { }
-
-  Params params;
-
-  template <class... Args>
-  CUTLASS_DEVICE auto
-  get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
-    return EmptyProducerLoadCallbacks{};
-  }
-
-  template<class ArgsTuple>
-  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
-    CUTLASS_DEVICE
-    ConsumerStoreCallbacks(ArgsTuple&& args_tuple, Params const& params)
-      : args_tuple(cute::forward<ArgsTuple>(args_tuple)),
-        params(params) {}
-
-    ArgsTuple args_tuple;
-    Params const& params;
-
-    template <typename ElementAccumulator, typename ElementInput>
-    CUTLASS_DEVICE auto
-    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n,
-          Array<ElementInput, FragmentSize> const& frg_input) {
-
-      auto& [tCrTopK, tCrSoftmax, tCcCol, cCol, 
-              lane_layout_MN, lane_mn,
-              residue_cCol, residue_tCcCol] = args_tuple;
-      Tensor tCcCol_mn = tCcCol(_,_,_,epi_m,epi_n);
-
-      using ConvertInput = NumericArrayConverter<ElementCompute, ElementInput, FragmentSize, RoundStyle>;
-      ConvertInput convert_input{};
-
-      Array frg_I = convert_input(frg_input);
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < FragmentSize; ++i) {
-        auto thread_crd = tCcCol_mn(epi_v * FragmentSize + i);
-        if (elem_less(thread_crd, residue_tCcCol)) {
-          TopKResult& tCrCol_vmn = tCrTopK(epi_v * FragmentSize + i);
-          detail::add_element_to_desc_sorted_array(tCrCol_vmn.top_k_, frg_I[i]);
-        }
-      }
-
-      return frg_input;
-    }
-
-    template <class STensor, class SyncFn, class VTensor>
-    CUTLASS_DEVICE void
-    reduce(STensor&& smem_buffer, SyncFn const& sync_fn, int epi_m, int epi_n, bool is_last_iteration, VTensor visit_results) {
-
-      auto& [tCrTopK, tCrSoftmax, tCcCol, cCol, 
-              lane_layout_MN, lane_mn,
-              residue_cCol, residue_tCcCol] = args_tuple;
-
-      // fully OOB CTA in partially OOB cluster
-      if (not elem_less(cCol(_0{},_0{}), residue_cCol)) {
-        return;
-      }
-      Tensor tCcCol_mn = tCcCol(_,_,_,epi_m,epi_n);
-
-      // `tCrTopK` and `tCrSoftmax` have 0-strides along modes that correspond to N,
-      // in order to reduce along modes in the `R2S` sublayout that correspond to N.
-      // This means we should modify and warp-reduce them according to their co-domain instead of
-      // their domain. Therefore we keep a filtered view of both and use them as necessary.
-      auto tCrTopK_f = filter(tCrTopK);
-      auto tCrSoftmax_f = filter(tCrSoftmax);
-
-      // The pattern here is: reduce Top-K first, then compute logsumexp, keep it and the
-      // last element of Top-K, use the latter to mask the visited results, and the former
-      // to apply softmax.
-      //
-      // This gives us two options: reduce the Top-K with warp shuffles, have the reduced
-      // lanes compute logsumexp and pair it with the last Top-K element, and broadcast
-      // the result back using warp shuffles.
-      //
-      // Alternatively, we can do a butterfly reduction over Top-K, and have all lanes
-      // compute their own logsumexp and skip the broadcast.
-      if constexpr (UseButterflyReduce) {
-        //
-        // 1. Butterfly reduction
-        //
-        CUTLASS_PRAGMA_UNROLL
-        for (int j = 1; j < size<1>(lane_layout_MN); j *= 2) {
-          CUTLASS_PRAGMA_UNROLL
-          for (int i = 0; i < size(tCrTopK_f); ++i) {
-            tCrTopK_f(i).shuffle_xor_sync(j);
-          }
-        }
-
-        //
-        // 2. Strip down reduced value and compute sum of exps
-        //
-        CUTLASS_PRAGMA_UNROLL
-        for (int i = 0; i < size(tCrSoftmax_f); ++i) {
-          tCrSoftmax_f(i) = tCrTopK_f(i).reduce_final();
-        }
-      }
-      else {
-        //
-        // 1. Warp shuffle reduction
-        //
-        CUTLASS_PRAGMA_UNROLL
-        for (int reduction_cols = size<1>(lane_layout_MN) / 2; reduction_cols > 0; reduction_cols /= 2) {
-          CUTLASS_PRAGMA_UNROLL
-          for (int i = 0; i < size(tCrTopK_f); ++i) {
-            tCrTopK_f(i).shuffle_down_sync(lane_layout_MN(_0{},reduction_cols));
-          }
-        }
-
-        //
-        // 2. Strip down reduced value and compute sum of exps
-        //
-        bool is_reduced_lane = get<1>(lane_mn) == 0;
-        if (is_reduced_lane) {
-          CUTLASS_PRAGMA_UNROLL
-          for (int i = 0; i < size(tCrSoftmax_f); ++i) {
-            tCrSoftmax_f(i) = tCrTopK_f(i).reduce_final();
-          }
-        }
-
-        //
-        // 3. Broadcast reduced values to all participants
-        //
-        CUTLASS_PRAGMA_UNROLL
-        for (int broadcast_cols = 1; broadcast_cols <= size<1>(lane_layout_MN) / 2; broadcast_cols *= 2) {
-          CUTLASS_PRAGMA_UNROLL
-          for (int i = 0; i < size(tCrSoftmax_f); ++i) {
-            tCrSoftmax_f(i).shuffle_up_sync(lane_layout_MN(_0{},broadcast_cols), get<1>(lane_mn));
-          }
-        }
-      }
-
-      //
-      // 4. Re-visit and apply top-K and softmax
-      //
-      CUTLASS_PRAGMA_UNROLL
-      for (int epi_v = 0; epi_v < size(visit_results); ++epi_v) {
-        auto& visit_frag = visit_results(epi_v);
-        CUTLASS_PRAGMA_UNROLL
-        for (int i = 0; i < FragmentSize; ++i) {
-          visit_frag[i] = detail::masked_softmax(
-            visit_frag[i],
-            tCrSoftmax(epi_v * FragmentSize + i).min_,
-            tCrSoftmax(epi_v * FragmentSize + i).logsumexp_
-          );
-        }
-      }
-
-    }
-
-    CUTLASS_DEVICE void
-    end_loop(int epi_m, int epi_n) {
-      auto& [tCrTopK, tCrSoftmax, tCcCol, cCol, 
-              lane_layout_MN, lane_mn,
-              residue_cCol, residue_tCcCol] = args_tuple;
-
-      // Reset reduced top-K values for next tile
-      // This must be done because we only assume a single epilogue tile across N,
-      // but not M.
-      fill(tCrTopK, TopKResult());
-    }
-
-    CUTLASS_DEVICE void
-    end() { }
-
-  };
-
-  template <
-    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
-    class... Args
-  >
-  CUTLASS_DEVICE auto
-  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
-    Layout ref_layout_MN = [&] () {
-      if constexpr (ReferenceSrc) { return get<0>(args.tiled_copy.get_layoutS_MN()); }
-      else                        { return get<0>(args.tiled_copy.get_layoutD_MN()); }
-    }();                                                                                         // tile_mn -> tv_idx
-
-    // Get the MN layout + coord of lanes to determine shuffle reduction iterations
-    using _W = Int<decltype(args.tiled_copy)::TiledNumThr::value / NumThreadsPerWarp>;
-    Layout tv2lane = Layout<Shape<Int<NumThreadsPerWarp>,_W,_1>,Stride<_1,_0,_0>>{};            //   tv_idx -> lane_idx
-    Layout ref2lane = composition(tv2lane, ref_layout_MN);                                      //  tile_mn -> lane_idx
-    Layout lane_layout_MN = make_layout(filter(get<0>(ref2lane)), filter(get<1>(ref2lane)));    //  lane_mn -> lane_idx
-    Layout inv_lane_layout_MN = right_inverse(lane_layout_MN);                                  // lane_idx -> lane_mn
-    int lane_idx = canonical_lane_idx();
-    auto lane_mn = idx2crd(inv_lane_layout_MN(lane_idx), shape(lane_layout_MN));
-
-    // Get the MN layout + coord of warps to determine smem reduction iterations
-    Layout tv2warp = Layout<Shape<Int<NumThreadsPerWarp>,_W,_1>,Stride<_0,_1,_0>>{};            //   tv_idx -> warp_idx
-    Layout ref2warp = composition(tv2warp, ref_layout_MN);                                      //  tile_mn -> warp_idx
-    Layout warp_layout_MN = make_layout(filter(get<0>(ref2warp)), filter(get<1>(ref2warp)));    //  warp_mn -> warp_idx
-
-    // Make sure there's only one warp across N so we can use warp shuffle intrinsics for reduction.
-    static_assert(decltype(size<1>(warp_layout_MN))::value <= 1);
-
-    // Reduction layout
-    //   We're assuming all elements in a row (over which we're performing the reduction) are
-    //   visited in the same corresponding epilogue tile, and this is what allows us to apply the
-    //   top-K + softmax operation within `reduce()`, by re-visiting the accumulated results.
-    //
-    //   This presents a challenge, because the layout of the accumulated results is typically in
-    //   in the register to shared memory shape, or: (R2S,R2S_M,R2S_N).
-    //   This means that we still need to reduce this tensor along N.
-    //
-    //   The solution is simple: we need to flatten the layout, identify modes that correspond to
-    //   N and set their strides to 0, in order to map fragment indices corresponding to the same
-    //   row back to the same element in the tensor.
-    //
-    //   This requires some extra layout manipulation, which is as follows.
-
-    // Create new accumulator layout with column broadcast
-    auto [M, N, K] = args.tile_shape_mnk;
-    auto thr_mma = args.tiled_mma.get_thread_slice(args.thread_idx);
-    auto gColReduce = make_tensor<ElementCompute>(
-        make_layout(make_shape(M, N), make_stride(_1{}, 0_c)));                                                // (M,N)
-    auto tCrColReduce = make_tensor_like<ElementCompute>(                                       // (FrgV, MMA_M, MMA_N)
-        thr_mma.partition_C(gColReduce).layout());
-
-    // Tile the new accumulator tensor according to R2S
-    ThrCopy thread_r2s = args.tiled_copy.get_slice(args.thread_idx);
-    Tensor tRS_rSoftmax = thread_r2s.retile_S(tCrColReduce);                               // ((R2S,R2S_V),MMA_M,MMA_N)
-    auto tCrC_layout = args.tCrC.layout();                                                         // (R2S,R2S_M,R2S_N)
-
-    // Compose the new accumulator R2S layout with the expected tCrC layout to get final 
-    // reduction tensor layout.
-    auto tCrSoftmax_layout = take<0, 3>(tRS_rSoftmax.layout()).compose(tCrC_layout); // (R2S,R2S_V) o (R2S,R2S_M,R2S_N)
-
-    Tensor tCrTopK = make_tensor<TopKResult>(tCrSoftmax_layout);                                   // (R2S,R2S_M,R2S_N)
-    Tensor tCrSoftmax = make_tensor<ReductionResult>(tCrSoftmax_layout);                           // (R2S,R2S_M,R2S_N)
-    fill(tCrTopK, TopKResult());
-
-    auto args_tuple = make_tuple(
-        cute::move(tCrTopK), cute::move(tCrSoftmax), args.tCcD, args.cD,
-        lane_layout_MN, lane_mn,
-        args.residue_cD, args.residue_tCcD);
-    return ConsumerStoreCallbacks<decltype(args_tuple)>(std::move(args_tuple), params);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::epilogue::fusion
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/activation.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/activation.h
deleted file mode 100755
index 9f1cd7743..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/activation.h
+++ /dev/null
@@ -1,758 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief This extends the contents of cutlass/functional.h with frequently used activation functions.
-
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/constants.h"
-#include "cutlass/complex.h"
-#include "cutlass/array.h"
-#include "cutlass/half.h"
-#include "cutlass/functional.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace thread {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Identity operator
-template <typename T>
-struct Identity {
-  static const bool kIsHeavy = false;
-
-  CUTLASS_HOST_DEVICE
-  T operator()(T value) const {
-    return value;
-  }
-};
-
-template <typename T, int N>
-struct Identity<Array<T, N> > {
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> value) const {
-    return value;
-  }
-};
-
-/// Scale operator
-template <typename T>
-struct Scale {
-  struct Arguments {
-    using scale_type = T;
-    T scale = T(1);
-  };
-
-  CUTLASS_HOST_DEVICE
-  T operator()(T value, T scale) const {
-    multiplies<T> mul;
-    return mul(scale, value);
-  }
-
-  CUTLASS_HOST_DEVICE
-  T operator()(T value, Arguments args = Arguments()) const {
-    return this->operator()(value, args.scale);
-  }
-};
-
-template <typename T, int N>
-struct Scale<Array<T, N>> {
-  using Arguments = typename Scale<T>::Arguments;
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> values, T scale) const {
-    multiplies<Array<T, N>> mul;
-    return mul(scale, values);
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> values, Arguments args = Arguments()) const {
-    return this->operator()(values, args.scale);
-  }
-};
-
-/// Specialization to compose other activations with a defined unary operator
-/// e.g. Scale<Identity<T>>
-template <template <class> class Activation, typename T>
-struct Scale<Activation<T>> {
-  using Arguments = typename Scale<T>::Arguments;
-
-  CUTLASS_HOST_DEVICE
-  T operator()(T value, typename Arguments::scale_type scale) const {
-    multiplies<T> mul;
-    Activation<T> act;
-    return mul(scale, act(value));
-  }
-
-  CUTLASS_HOST_DEVICE
-  T operator()(T value, Arguments args = Arguments()) const {
-    return this->operator()(value, args.scale);
-  }
-};
-
-/// ReLu operator - propagates NaNs
-/// Always put threshold in the right hand side of max to propagate NaN.
-template <typename T>
-struct ReLu {
-  static const bool kIsHeavy = false;
-
-  CUTLASS_HOST_DEVICE
-  T operator()(T threshold, T value) const {
-    maximum<T> mx;
-
-    return mx(value, threshold);
-  }
-
-  CUTLASS_HOST_DEVICE
-  T operator()(T value) const {
-    maximum<T> mx;
-
-    return mx(value, T(0));
-  }
-};
-
-template <typename T>
-using ReLU = ReLu<T>;
-
-template <typename T, int N>
-struct ReLu<Array<T, N>> {
-  static const bool kIsHeavy = false;
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(T const & threshold, Array<T, N> const &frag) const {
-    maximum<Array<T, N>> mx;
-
-    return mx(frag, threshold);
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const &frag) const {
-    maximum<Array<T, N>> mx;
-    return mx(frag, T(0));
-  }
-};
-
-// Generic clamp
-template <typename T>
-struct Clamp {
-  struct Arguments {
-    T lower_bound = CUTLASS_STL_NAMESPACE::numeric_limits<T>::lowest();
-    T upper_bound = CUTLASS_STL_NAMESPACE::numeric_limits<T>::max();
-  };
-
-  CUTLASS_HOST_DEVICE
-  T operator()(T const& value, T const& lower_bound, T const& upper_bound) const {
-    constexpr bool PropagateNaN = true;
-    maximum<T, PropagateNaN> mx;
-    minimum<T, PropagateNaN> mn;
-
-    return mn(mx(value, lower_bound), upper_bound);
-  }
-
-  CUTLASS_HOST_DEVICE
-  T operator()(T const& value, Arguments const& args = Arguments()) const {
-    return this->operator()(value, args.lower_bound, args.upper_bound);
-  }
-};
-
-template <typename T, int N>
-struct Clamp<Array<T,N>> {
-  using Arguments = typename Clamp<T>::Arguments;
-
-  CUTLASS_HOST_DEVICE
-  Array<T,N> operator()(Array<T,N> const& values, T const& lower_bound, T const& upper_bound) const {
-    constexpr bool PropagateNaN = true;
-    maximum<Array<T,N>, PropagateNaN> mx;
-    minimum<Array<T,N>, PropagateNaN> mn;
-
-    return mn(mx(values, lower_bound), upper_bound);
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<T,N> operator()(Array<T,N> const& values, Arguments const& args = Arguments()) const {
-    return this->operator()(values, args.lower_bound, args.upper_bound);
-  }
-};
-
-// Leaky Relu operator
-template <typename T>
-struct LeakyReLU {
-
-  static const bool kIsHeavy = false;
-
-  struct Arguments {
-    T leaky_alpha = T(0);
-  };
-
-  CUTLASS_HOST_DEVICE
-  T operator()(T const& value, T const& leaky_alpha) const {
-    T res = value > T(0) ? value : value * leaky_alpha;
-    return res;
-  }
-
-  CUTLASS_HOST_DEVICE
-  T operator()(T const& value, Arguments const& args = Arguments()) const {
-    return this->operator()(value, args.leaky_alpha);
-  }
-};
-
-template <typename T, int N>
-struct LeakyReLU<Array<T, N> > {
-
-  static const bool kIsHeavy = false;
-
-  using Arguments = typename LeakyReLU<T>::Arguments;
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const& values, T const& leaky_alpha) const {
-    Array<T, N> y;
-    LeakyReLU<T> leaky_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < int(values.size()); ++i) {
-      y[i] = leaky_op(values[i], leaky_alpha);
-    }
-
-    return y;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const& values, Arguments const& args = Arguments()) const {
-    return this->operator()(values, args.leaky_alpha);
-  }
-};
-
-// Tanh operator
-template <typename T>
-struct Tanh {
-  static const bool kIsHeavy = true;
-
-  CUTLASS_HOST_DEVICE
-  T operator()(T const &value) const {
-    return fast_tanh(value);
-  }
-};
-
-template <typename T, int N>
-struct Tanh<Array<T, N> > {
-  static const bool kIsHeavy = true;
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const &value) const {
-    Array<T, N> y;
-    Tanh<T> tanh_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      y[i] = tanh_op(value[i]);
-    }
-
-    return y;
-  }
-};
-
-template <int N>
-struct Tanh<Array<half_t, N>> {
-  using T = half_t;
-  static const bool kIsHeavy = true;
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const& z) const {
-    fast_tanh_op<Array<T, N>> tanh;
-    return tanh(z);
-  }
-};
-
-// Sigmoid operator
-template <typename T>
-struct Sigmoid {
-  static const bool kIsHeavy = true;
-
-  CUTLASS_HOST_DEVICE
-  T operator()(T const &value) const {
-    return T(1) / (T(1) + fast_exp(-value));
-  }
-};
-
-template <typename T, int N>
-struct Sigmoid<Array<T, N> > {
-  static const bool kIsHeavy = true;
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const &value) const {
-    Array<T, N> y;
-    Sigmoid<T> sigmoid_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      y[i] = sigmoid_op(value[i]);
-    }
-
-    return y;
-  }
-};
-
-template <int N>
-struct Sigmoid<Array<half_t, N>> {
-  using T = half_t;
-  static const bool kIsHeavy = true;
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const& z) const {
-    plus<Array<T, N>> add;
-
-#if defined(CUTLASS_USE_TANH_FOR_SIGMOID)
-    multiplies<Array<T, N>> mul;
-    fast_tanh_op<Array<T, N>> tanh;
-    return mul(add(tanh(mul(z, cutlass::constants::half<T>())), cutlass::constants::one<T>()),
-               cutlass::constants::half<T>());
-#else
-    divides<Array<T, N>> div;
-    negate<Array<T, N>> neg;
-    fast_exp_op<Array<T, N>> fast_exp;
-    return div(cutlass::constants::one<T>(),
-               add(cutlass::constants::one<T>(),
-                   fast_exp(neg(z))));
-#endif
-  }
-};
-
-// SiLu (swish) operator introduced by Elfwing et al. in the following paper
-// "Sigmoid-Weighted Linear Units for Neural Network Function Approximation in Reinforcement Learning" (2017)
-// https://arxiv.org/pdf/1702.03118.pdf
-// It is used in EfficientNet and YOLOv5, for example.
-// Reference: https://pytorch.org/docs/stable/generated/torch.nn.SiLU.html
-template <typename T>
-struct SiLu {
-  static const bool kIsHeavy = true;
-
-  CUTLASS_HOST_DEVICE
-  T operator()(T const &value) const {
-    Sigmoid<T> sigmoid;
-    return value * sigmoid(value);
-  }
-};
-
-template <typename T, int N>
-struct SiLu<Array<T, N>> {
-  static const bool kIsHeavy = true;
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const &value) const {
-    Sigmoid<Array<T, N>> sigmoid_op;
-    multiplies<Array<T, N>>     mul;
-    return mul(value, sigmoid_op(value));
-  }
-};
-
-template <typename T>
-using ScaledSiLu = Scale<SiLu<T>>;
-
-// Hardswish operator introduced by Howard et al. in the following paper
-// "Searching for MobileNetV3" (2019)
-// https://arxiv.org/pdf/1905.02244.pdf
-// It is used in models based on MobilenetNetV3.
-// Reference: https://pytorch.org/docs/stable/generated/torch.nn.Hardswish.html
-template <typename T>
-struct HardSwish {
-  static const bool kIsHeavy = false;
-
-  CUTLASS_HOST_DEVICE
-  T operator()(T const &x) const {
-    minimum<T> mn;
-    maximum<T> mx;
-    T relu6 = mn(mx(x + T(3), T(0)), T(6));
-    return x * relu6 / T(6);
-  }
-};
-
-template <>
-struct HardSwish<float> {
-  using T = float;
-  static const bool kIsHeavy = false;
-
-  CUTLASS_HOST_DEVICE
-  T operator()(T const &x) const {
-    minimum<T> mn;
-    maximum<T> mx;
-    T relu6 = mn(mx(x + T(3), T(0)), T(6));
-    return x * relu6 * 0.16666667f;
-  }
-};
-
-template <typename T, int N>
-struct HardSwish<Array<T, N> > {
-  static const bool kIsHeavy = false;
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const &value) const {
-    Array<T, N> y;
-    HardSwish<T> hardswish_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      y[i] = hardswish_op(value[i]);
-    }
-
-    return y;
-  }
-};
-
-template <int N>
-struct HardSwish<Array<half_t, N> > {
-  using T = half_t;
-  static const bool kIsHeavy = false;
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const &value) const {
-    minimum<Array<T, N> > mn;
-    maximum<Array<T, N> > mx;
-    multiplies<Array<T, N> > mul;
-    plus<Array<T, N> > add;
-
-    return mul(mul(mn(mx(add(value, T(3)), T(0)), T(6)), value), T(0.16666667f));
-  }
-};
-
-//
-// GELU function definitions implemented as described by
-//   Hendrycks, D., and Gimpel, K. in
-//   "Gaussian Error Linear Units (GELUs)." (2020)
-//   https://arxiv.org/pdf/1606.08415.pdf
-//
-// Floating-point constants are Taylor coefficients described in the paper.
-//
-
-// GELU operator
-template <typename T>
-struct GELU {
-  static const bool kIsHeavy = true;
-
-  CUTLASS_HOST_DEVICE
-  T operator()(T const &value) const {
-    return T(cutlass::constants::half<T>() * value *
-      (cutlass::constants::one<T>() + (T)erff((float)(value * cutlass::constants::half_root_two<T>()))));
-  }
-};
-
-template <>
-struct GELU<float> {
-  static const bool kIsHeavy = true;
-
-  CUTLASS_HOST_DEVICE
-  float operator()(float const &value) const {
-    return cutlass::constants::half<float>() * value *
-      (cutlass::constants::one<float>() + erff(value * cutlass::constants::half_root_two<float>() ));
-  }
-};
-
-template <>
-struct GELU<double> {
-  static const bool kIsHeavy = true;
-
-  CUTLASS_HOST_DEVICE
-  double operator()(double const &value) const {
-    return cutlass::constants::half<double>() * value *
-      (cutlass::constants::one<double>() + erf( value * cutlass::constants::half_root_two<double>() ));
-  }
-};
-
-template <typename T, int N>
-struct GELU<Array<T, N> > {
-  static const bool kIsHeavy = true;
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const &value) const {
-    Array<T, N> y;
-    GELU<T> gelu_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      y[i] = gelu_op(value[i]);
-    }
-
-    return y;
-  }
-};
-
-template <typename T>
-using ScaledGELU = Scale<GELU<T>>;
-
-// GELU operator implemented using the Taylor series approximation
-template <typename T>
-struct GELU_taylor {
-  static const bool kIsHeavy = true;
-
-  CUTLASS_HOST_DEVICE
-  T operator()(T const &z) const {
-
-    T k0 = T(0.7978845608028654);
-    T k1 = T(0.044715);
-
-    return T(cutlass::constants::half<T>() * z *
-      (cutlass::constants::one<T>() + fast_tanh(k0 * z * (cutlass::constants::one<T>() + k1 * z * z))));
-  }
-};
-
-template <int N>
-struct GELU_taylor<Array<half_t, N> > {
-  static const bool kIsHeavy = true;
-
-  CUTLASS_HOST_DEVICE
-  Array<half_t, N> operator()(Array<half_t, N> const &z) const {
-
-    using T = half_t;
-    Array<half_t, N> y;
-
-    half_t k0 = half_t(0.7978845608028654);
-    half_t k1 = half_t(0.044715);
-
-    multiply_add<Array<half_t, N>> fma;
-    multiplies<Array<half_t, N>>     mul;
-    plus<Array<half_t, N>>         add;
-
-    fast_tanh_op<Array<half_t, N>> tanh;
-
-    Array<half_t, N> u = mul(mul(k0, z), fma(mul(k1, z), z, cutlass::constants::one<T>()));
-
-    y = mul(mul(z, cutlass::constants::half<T>()), add(cutlass::constants::one<T>(), tanh(u)));
-
-    return y;
-  }
-};
-
-template <typename T, int N>
-struct GELU_taylor<Array<T, N> > {
-  static const bool kIsHeavy = true;
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const &value) const {
-    Array<T, N> y;
-    GELU_taylor<T> gelu_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      y[i] = gelu_op(value[i]);
-    }
-
-    return y;
-  }
-};
-
-template <typename T>
-using ScaledGELU_taylor = Scale<GELU_taylor<T>>;
-
-/// Computes backwards pass for GELU operator assuming d_t is the layer gradient and
-/// z is computed from the forward pass.
-template <typename T>
-struct dGELU {
-  static const bool kIsHeavy = true;
-
-  CUTLASS_HOST_DEVICE
-  T operator()(T const &d_t, T const &z) const {
-
-    T k0 = T(0.7978845608028654);
-    T k1 = T(0.044715);
-    T k2 = T(0.1070322243);
-
-    T tanh_out = fast_tanh(k0 * z * (1 + k1 * z * z));
-
-    T ff = constants::half<T>() * z * ((1 - tanh_out * tanh_out) * (k0 + k2 * z * z)) +
-      constants::half<T>() * (1 + tanh_out);
-
-    return ff * d_t;
-  }
-};
-
-template <typename T, int N>
-struct dGELU<Array<T, N> > {
-  static const bool kIsHeavy = true;
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const &d_t, Array<T, N> const &z) const {
-    Array<T, N> y;
-    dGELU<T> gelu_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      y[i] = gelu_op(d_t[i], z[i]);
-    }
-
-    return y;
-  }
-};
-
-template <typename T>
-struct dReLU {
-  CUTLASS_HOST_DEVICE
-  T operator()(T d_t, bool d_relu) const {
-    return d_relu ? d_t : T(0);
-  }
-
-  template <typename U>
-  CUTLASS_HOST_DEVICE
-  T operator()(T d_t, U d_relu) const {
-    return operator()(d_t, static_cast<bool>(d_relu));
-  }
-};
-
-template <typename T, int N>
-struct dReLU<Array<T, N>> {
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const& d_t, bool const (&d_relu)[N]) const {
-    Array<T, N> y;
-    dReLU<T> relu_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      y[i] = relu_op(d_t[i], d_relu[i]);
-    }
-
-    return y;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const& d_t, Array<uint1b_t, N> const& d_relu) const {
-    UnpackPredicates<N> unpack_op;
-
-    bool preds[N];
-    unpack_op(preds, d_relu);
-
-    return operator()(d_t, preds);
-  }
-
-  template <typename U>
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const& d_t, Array<U, N> const& d_relu) const {
-    Array<T, N> y;
-    dReLU<T> relu_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      y[i] = relu_op(d_t[i], d_relu[i]);
-    }
-
-    return y;
-  }
-};
-
-/// Computes backwards pass for ReLU operator assuming d_t is the layer gradient and
-/// z is computed from the forward pass.
-template <typename T>
-struct dReLU_Z {
-  CUTLASS_HOST_DEVICE
-  T operator()(T d_t, T z) const {
-    return z < 0 ? T(0) : d_t;
-  }
-};
-
-template <typename T, int N>
-struct dReLU_Z<Array<T, N>> {
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const& d_t, Array<T, N> const& z) const {
-    Array<T, N> y;
-    dReLU_Z<T> relu_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      y[i] = relu_op(d_t[i], z[i]);
-    }
-
-    return y;
-  }
-};
-
-// ElementwiseFilter operator
-// Filters by a specific value and maps it to 0.0
-// Used in GEMM + comm
-template <typename T>
-struct ElementwiseFilter {
-
-  static const bool kIsHeavy = false;
-
-  struct Arguments {
-    T value_to_filter = T(-0.0);
-    T filtered_value = T(0.0);
-  };
-
-  CUTLASS_HOST_DEVICE
-  T operator()(T const& value, T const& value_to_filter, T const& filtered_value) const {
-    T res = value == value_to_filter ? filtered_value : value;
-    return res;
-  }
-
-  CUTLASS_HOST_DEVICE
-  T operator()(T const& value, Arguments const& args = Arguments()) const {
-    return this->operator()(value, args.value_to_filter, args.filtered_value);
-  }
-};
-
-template <typename T, int N>
-struct ElementwiseFilter<Array<T, N> > {
-
-  static const bool kIsHeavy = false;
-
-  using Arguments = typename ElementwiseFilter<T>::Arguments;
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const& values, T const& value_to_filter, T const& filtered_value) const {
-    Array<T, N> y;
-    ElementwiseFilter<T> filter_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < int(values.size()); ++i) {
-      y[i] = filter_op(values[i], value_to_filter, filtered_value);
-    }
-
-    return y;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const& values, Arguments const& args = Arguments()) const {
-    return this->operator()(values, args.value_to_filter, args.filtered_value);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace thread
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/conversion_op.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/conversion_op.h
deleted file mode 100755
index 86200b413..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/conversion_op.h
+++ /dev/null
@@ -1,132 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Functor performing conversion operations used by epilogues.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/functional.h"
-#include "cutlass/numeric_conversion.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace thread {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Converts the result without other operations
-///
-template <
-  typename ElementOutput_,                             ///< Data type used to load and store tensors
-  int Count,                                           ///< Number of elements computed per operation
-  typename ElementAccumulator_ = ElementOutput_,       ///< Accumulator data type
-  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
->
-class Convert {
-public:
-
-  using ElementOutput = ElementOutput_;
-  using ElementAccumulator = ElementAccumulator_;
-  using ElementCompute = ElementAccumulator_;
-
-  static int const kCount = Count;
-
-  using FragmentOutput = Array<ElementOutput, kCount>;
-  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
-  using ComputeFragment = FragmentAccumulator;
-
-  static FloatRoundStyle const kRound = Round;
-
-  static bool const kIsHeavy = false;
-
-  /// Host-constructable parameters structure
-  struct Params {
-
-    //
-    // Methods
-    //
-
-    CUTLASS_HOST_DEVICE
-    Params() {}
-  };
-
-public:
-
-  /// Constructs the function object, possibly loading from pointers in host memory
-  CUTLASS_HOST_DEVICE
-  Convert(Params const &params = Params()) {
-
-  }
-
-  /// Functionally required for serial reduction in the epilogue
-  CUTLASS_HOST_DEVICE
-  void set_k_partition(int k_partition, int k_partition_count) {
-
-  }
-
-  /// Returns true if source is needed based on state of runtime arguments
-  CUTLASS_HOST_DEVICE
-  constexpr bool is_source_needed() const {
-    return false;
-  }
-
-  /// Constexpr function to enable the compiler to optimize away the source loading if it is
-  /// never needed.
-  CUTLASS_HOST_DEVICE
-  constexpr bool is_source_ever_needed() const {
-    return false;
-  }
-
-  /// Computes linear scaling: D = alpha * accumulator + beta * source
-  CUTLASS_HOST_DEVICE
-  FragmentOutput operator()(
-    FragmentAccumulator const &accumulator, 
-    FragmentOutput const &source = FragmentOutput(),
-    ElementCompute uniform = ElementCompute(0)) const {
-
-    // Convert to destination numeric type
-    NumericArrayConverter<ElementOutput, ElementAccumulator, kCount, Round> destination_converter;
-
-    return destination_converter(accumulator);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace thread
-} // namespace epilogue
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/detail.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/detail.hpp
deleted file mode 100755
index 775630027..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/detail.hpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Utilities for thread-level epilogues
-*/
-
-#pragma once
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace thread {
-
-namespace detail {
-
-/// Class used to identify cases in which no operation is performed
-template <typename T_>
-struct NoOp {};
-
-} // namespace detail
-} // namespace thread
-} // namespace epilogue
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination.h
deleted file mode 100755
index f74a36af4..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination.h
+++ /dev/null
@@ -1,523 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Functor performing linear combination operations used by epilogues.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/functional.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/epilogue/thread/scale_type.h"
-#include "cutlass/epilogue/thread/linear_combination_params.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace thread {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Applies a linear combination operator to an array of elements.
-///
-/// D = alpha * accumulator + beta * source
-///
-template <
-  typename ElementOutput_,                             ///< Data type used to load and store tensors
-  int Count,                                           ///< Number of elements computed per operation.
-                                                       ///< Usually it is 128/sizeof_bits<ElementOutput_>,
-                                                       ///< but we use 64 or 32 sometimes when there are not enough data to store
-  typename ElementAccumulator_ = ElementOutput_,       ///< Accumulator data type
-  typename ElementCompute_ = ElementOutput_,           ///< Data type used to compute linear combination
-  ScaleType::Kind Scale = ScaleType::Default,          ///< Control Alpha and Beta scaling
-  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest,
-  typename ElementSource_ = ElementOutput_
->
-class LinearCombination {
-public:
-
-  using ElementOutput = ElementOutput_;
-  using ElementSource = ElementSource_;
-  using ElementAccumulator = ElementAccumulator_;
-  using ElementCompute = ElementCompute_;
-  using ElementScalar = ElementCompute;
-  using ElementC = ElementSource_;
-  using ElementD = ElementOutput_;
-
-  static int const kCount = Count;
-  static const ScaleType::Kind kScale = Scale;
-  using FragmentOutput = Array<ElementOutput, kCount>;
-  using FragmentSource = Array<ElementSource, kCount>;
-  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
-  using FragmentCompute = Array<ElementCompute, kCount>;
-
-  static FloatRoundStyle const kRound = Round;
-
-  /// Host-constructable parameters structure
-  struct Params 
-  {
-    ElementCompute alpha;                         ///< scales accumulators
-    ElementCompute beta;                          ///< scales source tensor
-    ElementCompute const *alpha_ptr;              ///< pointer to accumulator scalar - if not null, loads it from memory
-    ElementCompute const *beta_ptr;               ///< pointer to source scalar - if not null, loads it from memory
-    ElementCompute const* const* alpha_ptr_array; ///< array of pointers to accumulator scalar per group/batch
-    ElementCompute const* const* beta_ptr_array;  ///< array of pointers to source scalar per group/batch
-
-    CUTLASS_HOST_DEVICE
-    Params():
-      alpha(ElementCompute(1)),
-      beta(ElementCompute(0)),
-      alpha_ptr(nullptr),
-      beta_ptr(nullptr),
-      alpha_ptr_array(nullptr),
-      beta_ptr_array(nullptr) { }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute alpha,
-      ElementCompute beta
-    ):
-      alpha(alpha), beta(beta),
-      alpha_ptr(nullptr), beta_ptr(nullptr),
-      alpha_ptr_array(nullptr), beta_ptr_array(nullptr) { }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute alpha
-    ):
-      alpha(alpha), beta(0),
-      alpha_ptr(nullptr), beta_ptr(nullptr),
-      alpha_ptr_array(nullptr), beta_ptr_array(nullptr) { }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute const *alpha_ptr,
-      ElementCompute const *beta_ptr
-    ):
-      alpha(0), beta(0),
-      alpha_ptr(alpha_ptr), beta_ptr(beta_ptr),
-      alpha_ptr_array(nullptr), beta_ptr_array(nullptr) { }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute const *alpha_ptr
-    ):
-      alpha(0), beta(0),
-      alpha_ptr(alpha_ptr), beta_ptr(nullptr),
-      alpha_ptr_array(nullptr), beta_ptr_array(nullptr) { }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute const* const* alpha_ptr_array,
-      ElementCompute const* const* beta_ptr_array
-    ):
-      alpha(0), beta(0),
-      alpha_ptr(nullptr), beta_ptr(nullptr),
-      alpha_ptr_array(alpha_ptr_array), beta_ptr_array(beta_ptr_array) { }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute const* const* alpha_ptr_array
-    ):
-      alpha(0), beta(0),
-      alpha_ptr(nullptr), beta_ptr(nullptr),
-      alpha_ptr_array(alpha_ptr_array), beta_ptr_array(nullptr) { }
-  };
-
-private:
-
-  //
-  // Data members
-  //
-
-  ElementCompute alpha_;
-  ElementCompute beta_;
-
-public:
-
-  /// Constructs the function object, possibly loading from pointers in host memory
-  CUTLASS_HOST_DEVICE
-  LinearCombination(Params const &params, int group_idx = 0) {
-    if (params.alpha_ptr_array != nullptr && params.alpha_ptr_array[group_idx] != nullptr) {
-      alpha_ = *(params.alpha_ptr_array[group_idx]);
-    }
-    else if (params.alpha_ptr != nullptr) {
-      alpha_ = *params.alpha_ptr;
-    }
-    else {
-      alpha_ = params.alpha;
-    }
-    if (params.beta_ptr_array != nullptr && params.beta_ptr_array[group_idx] != nullptr) {
-      beta_ = *(params.beta_ptr_array[group_idx]);
-    }
-    else if (params.beta_ptr != nullptr) {
-      beta_ = *params.beta_ptr;
-    }
-    else {
-      beta_ = params.beta;
-    }
-  }
-
-  /// Returns true if source is needed
-  CUTLASS_HOST_DEVICE
-  bool is_source_needed() const {
-    if (Scale == ScaleType::NoBetaScaling) return true;
-
-    if (Scale == ScaleType::OnlyAlphaScaling) return false;
-
-    if (Scale == ScaleType::Nothing) return false;
-
-    return beta_ != ElementCompute(0);
-  }
-
-  /// Functionally required for serial reduction in the epilogue
-  CUTLASS_HOST_DEVICE
-  void set_k_partition(int k_partition, int k_partition_count) {
-    if (k_partition) {
-      beta_ = ElementCompute(1);
-    }
-  }
-
-  /// Computes linear scaling with source: D = alpha * accumulator + beta * source
-  CUTLASS_HOST_DEVICE
-  FragmentOutput operator()(
-      FragmentAccumulator const &accumulator,
-      FragmentSource const &source) const {
-
-    // Convert source to internal compute numeric type
-    NumericArrayConverter<ElementCompute, ElementSource, kCount, Round> source_converter;
-    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
-
-    // Convert to destination numeric type
-    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
-
-    FragmentCompute converted_source = source_converter(source);
-    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
-
-    if (Scale == ScaleType::Nothing)
-      return destination_converter(converted_accumulator);
-
-    // Perform binary operations
-    FragmentCompute intermediate;
-
-    multiplies<FragmentCompute> mul_add_source;
-    multiply_add<FragmentCompute> mul_add_accumulator;
-
-    if (Scale == ScaleType::NoBetaScaling)
-      intermediate = converted_source;
-    else
-      intermediate = mul_add_source(beta_, converted_source);                             // X =  beta * C + uniform
-
-    intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
-
-    return destination_converter(intermediate);
-  }
-
-  /// Computes linear scaling: D = alpha * accumulator
-  CUTLASS_HOST_DEVICE
-  FragmentOutput operator()(
-      FragmentAccumulator const &accumulator) const {
-
-    // Convert source to interal compute numeric type
-    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
-
-    // Convert to destination numeric type
-    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
-
-    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
-
-    if (Scale == ScaleType::Nothing)
-      return destination_converter(converted_accumulator);
-
-    // Perform binary operations
-    FragmentCompute intermediate;
-    multiplies<FragmentCompute> mul_accumulator;
-
-    intermediate = mul_accumulator(alpha_, converted_accumulator);    // D = alpha * Accum
-
-    return destination_converter(intermediate);
-  }
-
-  //
-  // Specializations for scalar (for use with cute::collective::DefaultEpilogue)
-  //
-  CUTLASS_HOST_DEVICE
-  ElementD operator()(ElementAccumulator const accumulator, ElementC const source) const {
-    // Convert everything to Compute type, do compute, and then store to output type
-    NumericConverter<ElementCompute, ElementAccumulator, Round> accumulator_converter;
-    [[maybe_unused]] NumericConverter<ElementCompute, ElementC, Round> source_converter;
-    NumericConverter<ElementD, ElementCompute, Round> destination_converter;
-
-    // Convert to destination numeric type
-
-    ElementCompute converted_accumulator = accumulator_converter(accumulator);
-    if constexpr (Scale == ScaleType::Nothing) {
-      return destination_converter(converted_accumulator);
-    }
-
-    // Perform binary operations
-    ElementCompute intermediate;
-    multiplies<ElementCompute> multiply;
-    multiply_add<ElementCompute> madd;
-
-    if constexpr (Scale == ScaleType::NoBetaScaling) {
-      intermediate = source_converter(source);
-    }
-    else {
-      intermediate = multiply(beta_, source);                            // X =  beta * C + uniform
-    }
-
-    intermediate = madd(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
-    return destination_converter(intermediate);
-  }
-
-  CUTLASS_HOST_DEVICE
-  ElementD operator()(ElementAccumulator const accumulator) const {
-    // Convert everything to Compute type, do compute, and then store to output type
-    NumericConverter<ElementCompute, ElementAccumulator, Round> accumulator_converter;
-    NumericConverter<ElementD, ElementCompute, Round> destination_converter;
-    ElementCompute converted_accumulator = accumulator_converter(accumulator);
-
-    // Convert to destination numeric type
-    if constexpr (Scale == ScaleType::Nothing) {
-      return destination_converter(converted_accumulator);
-    }
-
-    // Perform binary operations
-    ElementCompute intermediate;
-    multiplies<ElementCompute> multiply;
-
-    intermediate = multiply(alpha_, accumulator);    // D = alpha * Accum
-    return destination_converter(intermediate);
-  }
-};
-
-/// Applies a linear combination operator to an array of elements.
-///
-/// D = vector_alpha * accumulator + (optional) vector_beta/scalar_beta * source
-///
-template <
-  typename ElementOutput_,            ///< Data type used to load and store tensors
-  int Count,                          ///< Number of elements computed per operation.
-  typename ElementAccumulator_,       ///< Accumulator data type
-  typename ElementCompute_,           ///< Data type used to compute linear combination
-  FloatRoundStyle Round,
-  typename ElementSource_
->
-class LinearCombination<ElementOutput_,
-                        Count,
-                        ElementAccumulator_,
-                        ElementCompute_,
-                        ScaleType::PerChannelScaling,
-                        Round,
-                        ElementSource_> {
-public:
-        
-  using ElementOutput = ElementOutput_;
-  using ElementSource = ElementSource_;
-  using ElementAccumulator = ElementAccumulator_;
-  using ElementCompute = ElementCompute_;
-  using ElementC = ElementSource_;
-  using ElementD = ElementOutput_;
-
-  static int const kCount = Count;
-  static const ScaleType::Kind kScale = ScaleType::PerChannelScaling;
-  static constexpr bool IsPerChannelScalingSupported = true;
-
-  using FragmentOutput = Array<ElementOutput, kCount>;
-  using FragmentSource = Array<ElementSource, kCount>;
-  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
-  using FragmentCompute = Array<ElementCompute, kCount>;
-
-  static FloatRoundStyle const kRound = Round;
-
-  /// Host-constructable parameters structure
-  struct Params
-  {
-    ElementCompute const *alpha_ptr;       ///< pointer to accumulator vector
-    ElementCompute const *beta_ptr;        ///< pointer to source vector
-    ElementCompute beta;                   ///< scales source tensor
-
-    CUTLASS_HOST_DEVICE
-    Params():
-      alpha_ptr(nullptr),
-      beta_ptr(nullptr),
-      beta(ElementCompute(0)) { }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute const *alpha_ptr,
-      ElementCompute const *beta_ptr
-    ):
-      alpha_ptr(alpha_ptr), beta_ptr(beta_ptr), beta(ElementCompute(0)) { }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute const *alpha_ptr
-    ):
-      alpha_ptr(alpha_ptr), beta_ptr(nullptr), beta(ElementCompute(0)) { }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute const *alpha_ptr,
-      ElementCompute beta
-    ):
-      alpha_ptr(alpha_ptr), beta_ptr(nullptr), beta(beta) { }
-
-  };
-
-private:
-
-  //
-  // Data members
-  //
-
-  ElementCompute const* beta_ptr_ = nullptr;
-  ElementCompute beta_ = 0;
-
-public:
-
-  /// Constructs the function object
-  CUTLASS_HOST_DEVICE
-  LinearCombination(Params const& params) {
-    if (params.beta_ptr) {
-      beta_ptr_ = params.beta_ptr;
-    }
-    else {
-      beta_ = params.beta;
-    }
-  }
-
-  /// Returns true if source is needed
-  CUTLASS_HOST_DEVICE
-  bool is_source_needed() const {
-    return beta_ptr_ != nullptr || beta_ != ElementCompute(0);
-  }
-
-  CUTLASS_HOST_DEVICE
-  bool is_beta_vector() const {
-    return beta_ptr_ != nullptr;
-  }
-
-  /// Computes linear scaling with source: D = vector_alpha * accumulator + vector_beta * source
-  CUTLASS_HOST_DEVICE
-  FragmentOutput operator()(
-      FragmentAccumulator const& accumulator,
-      FragmentSource const& source,
-      FragmentCompute const& valpha,
-      FragmentCompute const& vbeta) const {
-    // Convert source to internal compute numeric type
-    NumericArrayConverter<ElementCompute, ElementSource, kCount, Round> source_converter;
-    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
-
-    // Convert to destination numeric type
-    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
-
-    FragmentCompute converted_source = source_converter(source);
-    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
-
-    // Perform binary operations
-    FragmentCompute intermediate;
-
-    multiplies<FragmentCompute> mul_add_source;
-    multiply_add<FragmentCompute> mul_add_accumulator;
-
-    intermediate = mul_add_source(vbeta, converted_source);                             // X = vector_beta * C + uniform
-
-    intermediate = mul_add_accumulator(valpha, converted_accumulator, intermediate);    // D = vector_alpha * Accum + X
-
-    return destination_converter(intermediate);
-  }
-
-  /// Computes linear scaling with source: D = vector_alpha * accumulator + scalar_beta(from host) * source 
-  CUTLASS_HOST_DEVICE
-  FragmentOutput operator()(
-      FragmentAccumulator const& accumulator,
-      FragmentSource const& source,
-      FragmentCompute const& valpha) const {
-    // Convert source to internal compute numeric type
-    NumericArrayConverter<ElementCompute, ElementSource, kCount, Round> source_converter;
-    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
-
-    // Convert to destination numeric type
-    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
-
-    FragmentCompute converted_source = source_converter(source);
-    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
-
-    // Perform binary operations
-    FragmentCompute intermediate;
-
-    multiplies<FragmentCompute> mul_add_source;
-    multiply_add<FragmentCompute> mul_add_accumulator;
-
-
-    intermediate = mul_add_source(beta_, converted_source);                           // X =  scalar_beta * C + uniform
-
-    intermediate = mul_add_accumulator(valpha, converted_accumulator, intermediate);    // D = vector_alpha * Accum + X
-
-    return destination_converter(intermediate);
-  }
-
-  /// Computes linear scaling: D = vector_alpha * accumulator
-  CUTLASS_HOST_DEVICE
-  FragmentOutput operator()(
-      FragmentAccumulator const& accumulator,
-      FragmentCompute const& valpha) const {
-    // Convert source to interal compute numeric type
-    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
-
-    // Convert to destination numeric type
-    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
-
-    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
-
-    // Perform binary operations
-    FragmentCompute intermediate;
-    multiplies<FragmentCompute> mul_accumulator;
-
-    intermediate = mul_accumulator(valpha, converted_accumulator);    // D = vector_alpha * Accum
-
-    return destination_converter(intermediate);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace thread
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_bias_elementwise.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_bias_elementwise.h
deleted file mode 100755
index c5ffdaa03..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_bias_elementwise.h
+++ /dev/null
@@ -1,524 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-  \brief Functor performing linear combination operations used by epilogues.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/functional.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/platform/platform.h"
-
-#include "cutlass/epilogue/thread/activation.h"
-#include "cutlass/epilogue/thread/scale_type.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace thread {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// If kIsHeavy is a member, use it.  Otherwise, assume that it's false.
-namespace { // (anonymous)
-template<class Op, class Enable = void>
-struct kIsHeavy_member_or_false {
-  static constexpr bool value = false;
-};
-template<class Op>
-struct kIsHeavy_member_or_false<Op, typename cutlass::platform::enable_if<Op::kIsHeavy>::type> {
-  static constexpr bool value = Op::kIsHeavy;
-};
-
-} // namespace (anonymous)
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace detail {
-
-struct EmptyArguments {};
-
-template<class T, class = void>
-struct ElementwiseOpDispatcher {
-  using Arguments = EmptyArguments;
-
-  T op;
-
-  CUTLASS_HOST_DEVICE
-  ElementwiseOpDispatcher(Arguments) {}
-
-  template <typename ValueType>
-  CUTLASS_HOST_DEVICE
-  ValueType operator()(ValueType value) {
-    return op(value);
-  }
-};
-
-template<class T>
-struct ElementwiseOpDispatcher<T, std::void_t<typename T::Arguments>> {
-  using Arguments = typename T::Arguments;
-
-  Arguments args;
-  T op;
-
-  CUTLASS_HOST_DEVICE
-  ElementwiseOpDispatcher(Arguments args_):args(args_) {}
-
-  template <typename ValueType>
-  CUTLASS_HOST_DEVICE
-  ValueType operator()(ValueType value) {
-    return op(value, args);
-  }
-};
-
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// This base class is meant to define the concept required of the
-/// EpilogueWithBroadcast::OutputOp
-template <
-  typename ElementC_,
-  typename ElementAccumulator_,
-  typename ElementCompute_,
-  typename ElementZ_,
-  typename ElementT_,
-  int ElementsPerAccess,
-  typename ElementwiseOp_ = Identity<ElementCompute_>,
-  typename BinaryOp_ = plus<ElementCompute_>,
-  bool StoreT_ = true,
-  typename ElementVector_ = ElementC_
->
-class LinearCombinationBiasElementwise {
-public:
-
-  using ElementOutput = ElementC_;
-  using ElementD = ElementOutput;
-  using ElementC = ElementC_;
-  using ElementAccumulator = ElementAccumulator_;
-  using ElementCompute = ElementCompute_;
-  using ElementScalar = ElementCompute;
-  using ElementZ = ElementZ_;
-  using ElementT = ElementT_;
-  using ElementVector = ElementVector_;
-  static int const kElementsPerAccess = ElementsPerAccess;
-  static int const kCount = kElementsPerAccess;
-
-  /// Follow cutlass3x EVT aliases
-  static bool const IsEltActSupported = true;
-
-  using ElementwiseOp = ElementwiseOp_;
-  using BinaryOp = BinaryOp_;
-
-  using ElementwiseOpDispatcher = detail::ElementwiseOpDispatcher<ElementwiseOp>;
-  using ElementwiseArguments = typename ElementwiseOpDispatcher::Arguments;
-
-  // Indicates that this epilogue applies only one binary operation
-  static bool const kIsSingleSource = true;
-
-
-  using FragmentAccumulator = Array<ElementAccumulator, kElementsPerAccess>;
-  using FragmentCompute = Array<ElementCompute, kElementsPerAccess>;
-  using FragmentC = Array<ElementC, kElementsPerAccess>;
-  using FragmentZ = Array<ElementZ, kElementsPerAccess>;
-  using FragmentT = Array<ElementT, kElementsPerAccess>;
-
-  // Definitions needed for collective epilogue
-  using FragmentSource = FragmentC;
-  using FragmentOutput = FragmentZ;
-  using ElementBias = ElementVector;
-  using FragmentBias = Array<ElementBias, kElementsPerAccess>;
-  using ActivationFn = ElementwiseOp;
-  static const ScaleType::Kind kScale = ScaleType::Default;
-
-  static bool const kIsHeavy = kIsHeavy_member_or_false<ElementwiseOp>::value;
-
-  /// If true, the 'Z' tensor is stored
-  static bool const kStoreZ = true;
-
-  /// If true, the 'T' tensor is stored
-  static bool const kStoreT = StoreT_;
-
-  /// Host-constructable parameters structure
-  struct Params {
-
-    ElementCompute alpha;                  ///< scales accumulators
-    ElementCompute beta;                   ///< scales source tensor
-    ElementCompute const *alpha_ptr;       ///< pointer to accumulator scalar - if not null, loads it from memory
-    ElementCompute const *beta_ptr;        ///< pointer to source scalar - if not null, loads it from memory
-    ElementwiseArguments  elementwise;     ///< Arguments for elementwise operation
-
-    //
-    // Methods
-    //
-
-    CUTLASS_HOST_DEVICE
-    Params(): 
-      alpha(ElementCompute(1)), 
-      beta(ElementCompute(0)), 
-      alpha_ptr(nullptr), 
-      beta_ptr(nullptr) { }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute alpha,
-      ElementCompute beta,
-      ElementwiseArguments  elementwise_ = ElementwiseArguments{}
-    ): alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr), elementwise(elementwise_) {
-
-    }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute alpha
-    ): alpha(alpha), beta(0), alpha_ptr(nullptr), beta_ptr(nullptr) {
-
-    }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute const *alpha_ptr,
-      ElementCompute const *beta_ptr,
-      ElementwiseArguments  elementwise_ = ElementwiseArguments{}
-    ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr), elementwise(elementwise_) {
-
-    }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute const *alpha_ptr
-    ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(nullptr) {
-
-    }
-  };
-
-private:
-
-  //
-  // Data members
-  //
-
-  ElementCompute alpha_;
-  ElementCompute beta_;
-  ElementwiseArguments const &elementwise_;
-  bool skip_elementwise_;
-
-public:
-
-  //
-  // Methods
-  //
-
-  /// Constructor from Params
-  CUTLASS_HOST_DEVICE
-  LinearCombinationBiasElementwise(Params const &params): elementwise_(params.elementwise) {
-
-    alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
-    beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
-    skip_elementwise_ = false;
-  }
-
-  /// Returns true if source is needed
-  CUTLASS_HOST_DEVICE
-  bool is_source_needed() const {
-    return beta_ != ElementCompute(0);
-  }
-
-  /// Functionally required for serial reduction in the epilogue
-  CUTLASS_HOST_DEVICE
-  void set_k_partition(int k_partition, int k_partition_count) {
-    if (k_partition) {
-      beta_ = ElementCompute(1);
-    }
-
-    if (k_partition != k_partition_count - 1) {
-      skip_elementwise_ = true;
-    }
-  }
-
-  /// Applies the operation when elementwise_op require arguments and is_source_needed() is true
-  template <typename ElementwiseArgs>
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentZ &frag_Z,
-    FragmentT &frag_T,
-    FragmentAccumulator const &AB,
-    FragmentC const &frag_C,
-    FragmentCompute const &V,
-    ElementwiseArgs const &elementwise_args) const {
-
-    ElementwiseOp elementwise_op;
-    BinaryOp binary_op;
-
-    FragmentCompute tmp_Accum = NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess>()(AB);
-    FragmentCompute tmp_C = NumericArrayConverter<ElementCompute, ElementC, kElementsPerAccess>()(frag_C);
-    FragmentCompute result_Z;
-    FragmentCompute result_T;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kElementsPerAccess; ++i) {
-      ElementCompute z = binary_op(alpha_ * tmp_Accum[i] + beta_ * tmp_C[i], V[i]);
-      result_T[i] = z;
-      result_Z[i] = skip_elementwise_ ? z : elementwise_op(z, elementwise_args);
-    }
-
-    NumericArrayConverter<ElementZ, ElementCompute, kElementsPerAccess> convert_z;
-    frag_Z = convert_z(result_Z);
-
-    if constexpr (kStoreT) {
-      NumericArrayConverter<ElementT, ElementCompute, kElementsPerAccess> convert_t;
-      frag_T = convert_t(result_T);
-    }
-  }
-
-  /// Applies the operation when elementwise_op require arguments and is_source_needed() is false
-  template <typename ElementwiseArgs>
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentZ &frag_Z,
-    FragmentT &frag_T,
-    FragmentAccumulator const &AB,
-    FragmentCompute const &V,
-    ElementwiseArgs const &elementwise_args) const {
-
-    ElementwiseOp elementwise_op;
-    BinaryOp binary_op;
-
-    FragmentCompute tmp_Accum = NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess>()(AB);
-    FragmentCompute result_Z;
-    FragmentCompute result_T;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kElementsPerAccess; ++i) {
-      ElementCompute z = binary_op(alpha_ * tmp_Accum[i], V[i]);
-      result_T[i] = z;
-      result_Z[i] = skip_elementwise_ ? z : elementwise_op(z, elementwise_args);
-    }
-
-    NumericArrayConverter<ElementZ, ElementCompute, kElementsPerAccess> convert_z;
-    frag_Z = convert_z(result_Z);
-
-    if constexpr (kStoreT) {
-      NumericArrayConverter<ElementT, ElementCompute, kElementsPerAccess> convert_t;
-      frag_T = convert_t(result_T);
-    }
-  }
-
-  /// Applies the operation when is_source_needed() is true
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentZ &frag_Z,
-    FragmentT &frag_T,
-    FragmentAccumulator const &AB,
-    FragmentC const &frag_C,
-    FragmentCompute const &V) const {
-
-    ElementwiseOpDispatcher elementwise_op(elementwise_);
-    BinaryOp binary_op;
-
-    FragmentCompute tmp_Accum = NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess>()(AB);
-    FragmentCompute tmp_C = NumericArrayConverter<ElementCompute, ElementC, kElementsPerAccess>()(frag_C);
-    FragmentCompute result_Z;
-    FragmentCompute result_T;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kElementsPerAccess; ++i) {
-      ElementCompute z = binary_op(alpha_ * tmp_Accum[i] + beta_ * tmp_C[i], V[i]);
-      result_T[i] = z;
-      result_Z[i] = skip_elementwise_ ? z : elementwise_op(z);
-    }
-
-    NumericArrayConverter<ElementZ, ElementCompute, kElementsPerAccess> convert_z;
-    frag_Z = convert_z(result_Z);
-
-    if constexpr (kStoreT) {
-      NumericArrayConverter<ElementT, ElementCompute, kElementsPerAccess> convert_t;
-      frag_T = convert_t(result_T);
-    }
-  }
-
-  /// Applies the operation when is_source_needed() is false
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentZ &frag_Z,
-    FragmentT &frag_T,
-    FragmentAccumulator const &AB,
-    FragmentCompute const &V) const {
-
-    ElementwiseOpDispatcher elementwise_op(elementwise_);
-    BinaryOp binary_op;
-
-    FragmentCompute tmp_Accum = NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess>()(AB);
-    FragmentCompute result_Z;
-    FragmentCompute result_T;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kElementsPerAccess; ++i) {
-      ElementCompute z = binary_op(alpha_ * tmp_Accum[i], V[i]);
-      result_T[i] = z;
-      result_Z[i] = skip_elementwise_ ? z : elementwise_op(z);
-    }
-
-    NumericArrayConverter<ElementZ, ElementCompute, kElementsPerAccess> convert_z;
-    frag_Z = convert_z(result_Z);
-
-    if constexpr (kStoreT) {
-      NumericArrayConverter<ElementT, ElementCompute, kElementsPerAccess> convert_t;
-      frag_T = convert_t(result_T);
-    }
-  }
-
-  /// Applies the operation when elementwise_op require arguments and is_source_needed() is true
-  template <typename ElementwiseArgs>
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    ElementZ &Z,
-    ElementT &T,
-    ElementAccumulator const &AB,
-    ElementC const &C,
-    ElementCompute const &V,
-    ElementwiseArgs const &elementwise_args) const {
-
-    ElementwiseOp elementwise_op;
-    BinaryOp binary_op;
-
-    ElementCompute tmp_Accum = NumericConverter<ElementCompute, ElementAccumulator>()(AB);
-    ElementCompute tmp_C = NumericConverter<ElementCompute, ElementC>()(C);
-
-    ElementCompute z = binary_op(alpha_ * tmp_Accum + beta_ * tmp_C, V);
-    ElementCompute result_Z = skip_elementwise_ ? z : elementwise_op(z, elementwise_args);
-
-    NumericConverter<ElementZ, ElementCompute> convert_z;
-    Z = convert_z(result_Z);
-
-    if constexpr (kStoreT) {
-      ElementCompute result_T = z;
-      NumericConverter<ElementT, ElementCompute> convert_t;
-      T = convert_t(result_T);
-    }
-  }
-
-  /// Applies the operation when elementwise_op require arguments and is_source_needed() is false
-  template <typename ElementwiseArgs>
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    ElementZ &Z,
-    ElementT &T,
-    ElementAccumulator const &AB,
-    ElementCompute const &V,
-    ElementwiseArgs const &elementwise_args) const {
-
-    ElementwiseOp elementwise_op;
-    BinaryOp binary_op;
-
-    ElementCompute tmp_Accum = NumericConverter<ElementCompute, ElementAccumulator>()(AB);
-
-    ElementCompute z = binary_op(alpha_ * tmp_Accum, V);
-    ElementCompute result_Z = skip_elementwise_ ? z : elementwise_op(z, elementwise_args);
-
-    NumericConverter<ElementZ, ElementCompute> convert_z;
-    Z = convert_z(result_Z);
-
-    if constexpr (kStoreT) {
-      ElementCompute result_T = z;
-      NumericConverter<ElementT, ElementCompute> convert_t;
-      T = convert_t(result_T);
-    }
-  }
-
-  /// Applies the operation when is_source_needed() is true
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    ElementZ &Z,
-    ElementT &T,
-    ElementAccumulator const &AB,
-    ElementC const &C,
-    ElementCompute const &V) const {
-
-    ElementwiseOpDispatcher elementwise_op(elementwise_);
-    BinaryOp binary_op;
-
-    ElementCompute tmp_Accum = NumericConverter<ElementCompute, ElementAccumulator>()(AB);
-    ElementCompute tmp_C = NumericConverter<ElementCompute, ElementC>()(C);
-
-    ElementCompute z = binary_op(alpha_ * tmp_Accum + beta_ * tmp_C, V);
-    ElementCompute result_Z = skip_elementwise_ ? z : elementwise_op(z);
-
-    NumericConverter<ElementZ, ElementCompute> convert_z;
-    Z = convert_z(result_Z);
-
-    if constexpr (kStoreT) {
-      ElementCompute result_T = z;
-      NumericConverter<ElementT, ElementCompute> convert_t;
-      T = convert_t(result_T);
-    }
-  }
-
-  /// Applies the operation when is_source_needed() is false
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    ElementZ &Z,
-    ElementT &T,
-    ElementAccumulator const &AB,
-    ElementCompute const &V) const {
-
-    ElementwiseOpDispatcher elementwise_op(elementwise_);
-    BinaryOp binary_op;
-
-    ElementCompute tmp_Accum = NumericConverter<ElementCompute, ElementAccumulator>()(AB);
-
-    ElementCompute z = binary_op(alpha_ * tmp_Accum, V);
-    ElementCompute result_Z = skip_elementwise_ ? z : elementwise_op(z);
-
-    NumericConverter<ElementZ, ElementCompute> convert_z;
-    Z = convert_z(result_Z);
-
-    if constexpr (kStoreT) {
-      ElementCompute result_T = z;
-      NumericConverter<ElementT, ElementCompute> convert_t;
-      T = convert_t(result_T);
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace thread
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_bias_relu.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_bias_relu.h
deleted file mode 100755
index ead1123ca..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_bias_relu.h
+++ /dev/null
@@ -1,610 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Functor performing linear combination operations used by epilogues.
-*/
-
-#pragma once
-
-#include <cuda_fp16.h>
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/functional.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/epilogue/thread/activation.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace thread {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace detail {
-
-template <typename Element, int ElementsPerAccess>
-struct ArrayMaximum {
-
-  CUTLASS_HOST_DEVICE
-  Array<Element, ElementsPerAccess> operator()(
-    Array<Element, ElementsPerAccess>  const &lhs,
-    Array<Element, ElementsPerAccess>  const &rhs) const {
-
-    Array<Element, ElementsPerAccess> result;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < ElementsPerAccess; ++i) {
-      result[i] = platform::max(lhs[i].get(), rhs[i]);
-    }
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<Element, ElementsPerAccess> operator()(
-    Array<Element, ElementsPerAccess>  const &lhs,
-    Element                                   rhs) const {
-
-    Array<Element, ElementsPerAccess> result;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < ElementsPerAccess; ++i) {
-      result[i] = platform::max(lhs[i].get(), rhs);
-    }
-
-    return result;
-  }
-};
-
-
-/// Partial specialization: Element=float
-template <int ElementsPerAccess>
-struct ArrayMaximum<float, ElementsPerAccess> {
-
-  CUTLASS_HOST_DEVICE
-  Array<float, ElementsPerAccess> operator()(
-    Array<float, ElementsPerAccess>  const &lhs,
-    Array<float, ElementsPerAccess>  const &rhs) const {
-
-    Array<float, ElementsPerAccess> result;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < ElementsPerAccess; ++i) {
-      result[i] = fmax(lhs[i], rhs[i]);
-    }
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Array<float, ElementsPerAccess> operator()(
-    Array<float, ElementsPerAccess>  const &lhs,
-    float rhs) const {
-
-    Array<float, ElementsPerAccess> result;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < ElementsPerAccess; ++i) {
-      result[i] = fmax(lhs[i], rhs);
-    }
-
-    return result;
-  }
-};
-
-/// Partial specialization: Element=half
-template <int ElementsPerAccess>
-struct ArrayMaximum<half_t, ElementsPerAccess> {
-
-  CUTLASS_DEVICE
-  Array<half_t, ElementsPerAccess> operator()(
-    Array<half_t, ElementsPerAccess>  const &lhs,
-    Array<half_t, ElementsPerAccess>  const &rhs) const {
-
-    Array<half_t, ElementsPerAccess> result;
-
-    #if __CUDA_ARCH__ >= 800
-    int const kVectorCount = ElementsPerAccess / 2;
-
-
-    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(lhs.raw_data());
-    __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(rhs.raw_data());
-    __half2       *res_ptr = reinterpret_cast<__half2 *>(result.raw_data());
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kVectorCount; ++i) {
-      res_ptr[i] = __hmax2(lhs_ptr[i], rhs_ptr[i]);
-    }
-
-    static_assert(!(ElementsPerAccess % 2), "Output array must be divisible by vector length.");
-
-    #else
-    __half const *lhs_ptr = reinterpret_cast<__half const *>(lhs.raw_data());
-    __half const *rhs_ptr = reinterpret_cast<__half const *>(rhs.raw_data());
-    __half       *res_ptr = reinterpret_cast<__half       *>(result.raw_data());
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < ElementsPerAccess; ++i) {
-      res_ptr[i] = ((lhs_ptr[i] < rhs_ptr[i]) ? rhs_ptr[i] : lhs_ptr[i]);
-    }
-
-    #endif
-
-    return result;
-  }
-
-  CUTLASS_DEVICE
-  Array<half_t, ElementsPerAccess> operator()(
-    Array<half_t, ElementsPerAccess>  const &lhs,
-    half_t const &rhs) const {
-
-    Array<half_t, ElementsPerAccess> result;
-
-    #if __CUDA_ARCH__ >= 800
-    int const kVectorCount = ElementsPerAccess / 2;
-
-
-    __half rhs_raw = reinterpret_cast<__half const &>(rhs);
-    __half2 rhs_pair = __half2half2(rhs_raw);
-
-    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(lhs.raw_data());
-    __half2       *res_ptr = reinterpret_cast<__half2 *>(result.raw_data());
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kVectorCount; ++i) {
-      res_ptr[i] = __hmax2(lhs_ptr[i], rhs_pair);
-    }
-
-    static_assert(!(ElementsPerAccess % 2), "Output array must be divisible by vector length.");
-
-    #else
-
-    __half const *lhs_ptr = reinterpret_cast<__half const *>(lhs.raw_data());
-    __half const  rhs_raw = reinterpret_cast<__half const &>(rhs);
-    __half       *res_ptr = reinterpret_cast<__half       *>(result.raw_data());
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < ElementsPerAccess; ++i) {
-      res_ptr[i] = ((lhs_ptr[i] < rhs_raw) ? rhs_raw : lhs_ptr[i]);
-    }
-
-    #endif
-
-    return result;
-  }
-};
-
-/// Partial specialization: Element=bfloat16_t
-template <int ElementsPerAccess>
-struct ArrayMaximum<bfloat16_t, ElementsPerAccess> {
-
-  using NvType   = __nv_bfloat16;
-  using NvTypeV2 = __nv_bfloat162;
-
-  CUTLASS_DEVICE
-  Array<bfloat16_t, ElementsPerAccess> operator()(
-    Array<bfloat16_t, ElementsPerAccess>  const &lhs,
-    Array<bfloat16_t, ElementsPerAccess>  const &rhs) const {
-
-    Array<bfloat16_t, ElementsPerAccess> result;
-
-    #if __CUDA_ARCH__ >= 800
-    int const kVectorCount = ElementsPerAccess / 2;
-
-
-    NvTypeV2 const *lhs_ptr = reinterpret_cast<NvTypeV2 const *>(lhs.raw_data());
-    NvTypeV2 const *rhs_ptr = reinterpret_cast<NvTypeV2 const *>(rhs.raw_data());
-    NvTypeV2       *res_ptr = reinterpret_cast<NvTypeV2 *>(result.raw_data());
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kVectorCount; ++i) {
-      res_ptr[i] = __hmax2(lhs_ptr[i], rhs_ptr[i]);
-    }
-
-    #else
-    NvType const *lhs_ptr = reinterpret_cast<NvType const *>(lhs.raw_data());
-    NvType const *rhs_ptr = reinterpret_cast<NvType const *>(rhs.raw_data());
-    NvType       *res_ptr = reinterpret_cast<NvType       *>(result.raw_data());
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < ElementsPerAccess; ++i) {
-      res_ptr[i] = ((lhs_ptr[i] < rhs_ptr[i]) ? rhs_ptr[i] : lhs_ptr[i]);
-    }
-
-    #endif
-
-    return result;
-  }
-
-  CUTLASS_DEVICE
-  Array<bfloat16_t, ElementsPerAccess> operator()(
-    Array<bfloat16_t, ElementsPerAccess>  const &lhs,
-    bfloat16_t                                   rhs) const {
-
-    Array<bfloat16_t, ElementsPerAccess> result;
-
-    #if __CUDA_ARCH__ >= 800
-    int const kVectorCount = ElementsPerAccess / 2;
-
-
-    NvType rhs_raw = reinterpret_cast<NvType const &>(rhs);
-    NvTypeV2 rhs_pair = __bfloat162bfloat162(rhs_raw);
-
-    NvTypeV2 const *lhs_ptr = reinterpret_cast<NvTypeV2 const *>(lhs.raw_data());
-    NvTypeV2       *res_ptr = reinterpret_cast<NvTypeV2 *>(result.raw_data());
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kVectorCount; ++i) {
-      res_ptr[i] = __hmax2(lhs_ptr[i], rhs_pair);
-    }
-
-    static_assert(!(ElementsPerAccess % 2), "Output array must be divisible by vector length.");
-
-    #else
-
-    NvType const *lhs_ptr = reinterpret_cast<NvType const *>(lhs.raw_data());
-    NvType const  rhs_raw = reinterpret_cast<NvType const &>(rhs);
-    NvType       *res_ptr = reinterpret_cast<NvType       *>(result.raw_data());
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < ElementsPerAccess; ++i) {
-      res_ptr[i] = ((lhs_ptr[i] < rhs_raw) ? rhs_raw : lhs_ptr[i]);
-    }
-
-    #endif
-
-    return result;
-  }
-};
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename Element, int ElementsPerAccess>
-struct ReluConditional {
-
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    bool conditional[],
-    Array<Element, ElementsPerAccess> const &fragment, 
-    Element threshold) const {
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < ElementsPerAccess; ++i) {
-      conditional[i] = !(fragment[i] < threshold);
-    }
-  }
-};
-
-template <int ElementsPerAccess>
-struct ReluConditional<half_t, ElementsPerAccess> {
-
-  CUTLASS_DEVICE
-  void operator()(
-    bool conditional[],
-    Array<half_t, ElementsPerAccess> const &fragment, 
-    half_t threshold) const {
-
-    __half y = reinterpret_cast<__half const &>(threshold);
-    __half const *x = reinterpret_cast<__half const *>(fragment.raw_data());
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < ElementsPerAccess; ++i) {
-      conditional[i] = !__hlt(x[i], y);
-    }
-  }
-};
-
-template <int ElementsPerAccess>
-struct ReluConditional<bfloat16_t, ElementsPerAccess> {
-
-  CUTLASS_DEVICE
-  void operator()(
-    bool conditional[],
-    Array<bfloat16_t, ElementsPerAccess> const &fragment,
-    bfloat16_t threshold) const {
-
-    __nv_bfloat16 y = reinterpret_cast<__nv_bfloat16 const &>(threshold);
-    __nv_bfloat16 const *x = reinterpret_cast<__nv_bfloat16 const *>(fragment.raw_data());
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < ElementsPerAccess; ++i) {
-      conditional[i] = !__hlt(x[i], y);
-    }
-  }
-};
-
-} // namespace detail
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// This is a partial specialization for fused Bias and ReLU. It supports the option of packing
-/// ReLU conditionals in a bit vector that may be used by backwards passes as an optimization.
-///
-/// This class can only be used with cutlass::epilogue::threadblock::EpilogueWithBroadcast<>.
-///
-/// This base class is meant to define the concept required of the
-/// EpilogueWithBroadcast::OutputOp
-template <
-  typename ElementC_,
-  typename ElementAccumulator_,
-  typename ElementCompute_,
-  typename ElementZ_,
-  int ElementsPerAccess,
-  bool StoreT_ = true,
-  typename ElementVector_ = ElementC_
->
-class LinearCombinationBiasRelu {
-public:
-
-  using ElementOutput = ElementC_;
-  using ElementC = ElementC_;
-  using ElementAccumulator = ElementAccumulator_;
-  using ElementCompute = ElementCompute_;
-  using ElementZ = ElementZ_;
-  using ElementVector = ElementVector_;
-
-  using ElementT = uint1b_t;
-
-  static int const kElementsPerAccess = ElementsPerAccess;
-  static int const kCount = kElementsPerAccess;
-
-  using ElementwiseOp = ReLu<ElementCompute>;
-  using BinaryOp = plus<ElementCompute>;
-
-  // Indicates that this epilogue applies only one binary operation
-  static bool const kIsSingleSource = true;
-
-  using FragmentAccumulator = Array<ElementAccumulator, kElementsPerAccess>;
-  using FragmentCompute = Array<ElementCompute, kElementsPerAccess>;
-  using FragmentC = Array<ElementOutput, kElementsPerAccess>;
-  using FragmentZ = Array<ElementZ, kElementsPerAccess>;
-  using FragmentT = Array<ElementT, kElementsPerAccess>;
-
-  /// If true, the 'Z' tensor is stored
-  static bool const kStoreZ = true;
-
-  /// If true, the 'T' tensor is stored
-  static bool const kStoreT = StoreT_;
-
-  /// Host-constructable parameters structure
-  struct Params {
-
-    ElementCompute alpha;                  ///< scales accumulators
-    ElementCompute beta;                   ///< scales source tensor
-    ElementCompute const *alpha_ptr;       ///< pointer to accumulator scalar - if not null, loads it from memory
-    ElementCompute const *beta_ptr;        ///< pointer to source scalar - if not null, loads it from memory
-    ElementZ threshold;                    ///< ReLu threshold
-
-    //
-    // Methods
-    //
-    //
-    // Methods
-    //
-
-    CUTLASS_HOST_DEVICE
-    Params(): 
-      alpha(ElementCompute(1)), 
-      beta(ElementCompute()), 
-      alpha_ptr(nullptr), 
-      beta_ptr(nullptr),
-      threshold(ElementCompute()) { }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute alpha,
-      ElementCompute beta,
-      ElementCompute threshold_ = ElementCompute()
-    ): 
-      alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) {
-
-      NumericConverter<ElementZ, ElementCompute> convert_threshold;
-
-      threshold = convert_threshold(threshold_);
-    }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute alpha
-    ): alpha(alpha), beta(0), alpha_ptr(nullptr), beta_ptr(nullptr), threshold(ElementZ()) {
-
-    }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute const *alpha_ptr,
-      ElementCompute const *beta_ptr,
-      ElementCompute threshold_ = ElementCompute()
-    ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {
-
-      NumericConverter<ElementZ, ElementCompute> convert_threshold;
-
-      threshold = convert_threshold(threshold_);
-    }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute const *alpha_ptr
-    ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(nullptr), threshold(ElementZ()) {
-    }
-
-  };
-
-private:
-
-  //
-  // Data members
-  //
-
-  ElementCompute alpha_;
-  ElementCompute beta_;
-  ElementZ threshold_;
-
-public:
-
-  //
-  // Methods
-  //
-
-  /// Constructor from Params
-  CUTLASS_HOST_DEVICE
-  LinearCombinationBiasRelu(Params const &params) {
-
-    alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
-    beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
-    threshold_ = params.threshold;
-  }
-
-  /// Returns true if source is needed
-  CUTLASS_HOST_DEVICE
-  bool is_source_needed() const {
-    return beta_ != ElementCompute(0);
-  }
-
-  /// Functionally required for serial reduction in the epilogue
-  CUTLASS_HOST_DEVICE
-  void set_k_partition(int k_partition, int k_partition_count) {
-    if (k_partition) {
-      beta_ = ElementCompute(1);
-    }
-
-    if (k_partition != k_partition_count - 1) {
-      // set to NaN to make ReLU no-op for all except last k partitions
-      int64_t allones = -1;
-      threshold_ = reinterpret_cast<ElementZ const &>(allones);
-    }
-  }
-
-  /// Applies the operation when is_source_needed() is true
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentZ &frag_Z, 
-    FragmentT &frag_T, 
-    FragmentAccumulator const &AB,
-    FragmentC const &frag_C,
-    FragmentCompute const &V) const {
-
-    BinaryOp binary_op;
-
-    FragmentCompute tmp_Accum = NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess>()(AB);
-    FragmentCompute tmp_C = NumericArrayConverter<ElementCompute, ElementC, kElementsPerAccess>()(frag_C);
-    FragmentCompute result_Z;
-
-    bool conditions[kElementsPerAccess];
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kElementsPerAccess; ++i) {
-
-      ElementCompute z = alpha_ * tmp_Accum[i];
-      z += beta_ * tmp_C[i];
-
-      z = binary_op(z, V[i]);
-      result_Z[i] = z;
-    }
-
-    NumericArrayConverter<ElementZ, ElementCompute, kElementsPerAccess> convert_z;
-    frag_Z = convert_z(result_Z);
-
-    //
-    // Compute condition
-    //
-
-    detail::ReluConditional<ElementZ, kElementsPerAccess> relu_conditional;
-    relu_conditional(conditions, frag_Z, threshold_);
-
-    detail::ArrayMaximum<ElementZ, kElementsPerAccess> maximum_op;
-    frag_Z = maximum_op(frag_Z, threshold_);
-
-    if (kStoreT) {
-      PackPredicates<kElementsPerAccess> pack_predicates;
-      frag_T = pack_predicates(conditions); 
-    }
-  }
-
-  /// Applies the operation when is_source_needed() is false
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentZ &frag_Z, 
-    FragmentT &frag_T, 
-    FragmentAccumulator const &AB,
-    FragmentCompute const &V) const {
-
-    BinaryOp binary_op;
-
-    FragmentCompute tmp_Accum = NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess>()(AB);
-    FragmentCompute result_Z;
-
-    bool conditions[kElementsPerAccess];
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kElementsPerAccess; ++i) {
-      ElementCompute z = binary_op(alpha_ * tmp_Accum[i], V[i]);
-      result_Z[i] = z;
-    }
-
-    NumericArrayConverter<ElementZ, ElementCompute, kElementsPerAccess> convert_z;
-    frag_Z = convert_z(result_Z);
-
-    //
-    // Compute condition
-    //
-
-    detail::ReluConditional<ElementZ, kElementsPerAccess> relu_conditional;
-    relu_conditional(conditions, frag_Z, threshold_);
-
-    detail::ArrayMaximum<ElementZ, kElementsPerAccess> maximum_op;
-    frag_Z = maximum_op(frag_Z, threshold_);
-
-    // 
-    // Compute conditions
-    //
-
-    //
-    // Store
-    //
-    if (kStoreT) {
-      PackPredicates<kElementsPerAccess> pack_predicates;
-      frag_T = pack_predicates(conditions);
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace thread
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_clamp.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_clamp.h
deleted file mode 100755
index aad9b5238..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_clamp.h
+++ /dev/null
@@ -1,685 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Functor performing linear scaling operations used by epilogues. Values are clamped before
-         converting to the output element type.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/functional.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/epilogue/thread/scale_type.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace thread {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace detail {
-
-/// Single source of truth for whether to unroll for `LinearCombinationClamp()`
-constexpr bool LinearCombinationClampIsHeavy() {
-  return false;
-}
-
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Applies a linear combination operator to an array of elements then clamps the output before
-/// converting to the output element type.
-///
-/// D = alpha * accumulator + beta * source + uniform
-///
-template <
-  typename ElementOutput_,                             ///< Data type used to load and store tensors
-  int Count,                                           ///< Number of elements computed per operation
-                                                       ///< Usually it is 128/sizeof_bits<ElementOutput_>,
-                                                       ///< but we use 64 or 32 sometimes when there are not enough data to store
-  typename ElementAccumulator_ = ElementOutput_,       ///< Accumulator data type
-  typename ElementCompute_ = ElementOutput_,           ///< Data type used to compute linear combination
-  ScaleType::Kind Scale = ScaleType::Default,          ///< Control Alpha and Beta scaling
-  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
->
-class LinearCombinationClamp {
-public:
-
-  using ElementOutput = ElementOutput_;
-  using ElementAccumulator = ElementAccumulator_;
-  using ElementCompute = ElementCompute_;
-
-  static int const kCount = Count;
-
-  using FragmentOutput = Array<ElementOutput, kCount>;
-  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
-  using ComputeFragment = Array<ElementCompute, kCount>;
-  using FragmentSource = Array<ElementOutput, kCount>;
-
-  static FloatRoundStyle const kRound = Round;
-
-  static bool const kIsHeavy = detail::LinearCombinationClampIsHeavy();
-
-  /// Host-constructable parameters structure
-  struct Params {
-
-    ElementCompute alpha;                  ///< scales accumulators
-    ElementCompute beta;                   ///< scales source tensor
-    ElementCompute const *alpha_ptr;       ///< pointer to accumulator scalar - if not null, loads it from memory
-    ElementCompute const *beta_ptr;        ///< pointer to source scalar - if not null, loads it from memory
-
-    //
-    // Methods
-    //
-
-    CUTLASS_HOST_DEVICE
-    Params(): 
-      alpha(ElementCompute(1)), 
-      beta(ElementCompute(0)), 
-      alpha_ptr(nullptr), 
-      beta_ptr(nullptr) { }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute alpha,
-      ElementCompute beta
-    ): alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) {
-
-    }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute alpha
-    ): alpha(alpha), beta(0), alpha_ptr(nullptr), beta_ptr(nullptr) {
-
-    }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute const *alpha_ptr,
-      ElementCompute const *beta_ptr
-    ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {
-
-    }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute const *alpha_ptr
-    ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(nullptr) {
-
-    }
-  };
-
-private:
-
-  //
-  // Data members
-  //
-
-  ElementCompute alpha_;
-  ElementCompute beta_;
-
-public:
-
-  /// Constructs the function object, possibly loading from pointers in host memory
-  CUTLASS_HOST_DEVICE
-  LinearCombinationClamp(Params const &params) {
-
-    alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
-    beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
-  }
-
-  /// Returns true if source is needed
-  CUTLASS_HOST_DEVICE
-  bool is_source_needed() const {
-    if (Scale == ScaleType::NoBetaScaling) return true;
-
-    if (Scale == ScaleType::OnlyAlphaScaling) return false;
-
-    if (Scale == ScaleType::Nothing) return false;
-
-    return beta_ != ElementCompute(0);
-  }
-
-  /// Functionally required for serial reduction in the epilogue
-  CUTLASS_HOST_DEVICE
-  void set_k_partition(int k_partition, int k_partition_count) {
-    if (k_partition) {
-      beta_ = ElementCompute(1);
-    }
-  }
-
-  /// Computes linear scaling: D = alpha * accumulator + beta * source
-  CUTLASS_HOST_DEVICE
-  FragmentOutput operator()(
-    FragmentAccumulator const &accumulator, 
-    FragmentOutput const &source,
-    ElementCompute uniform = ElementCompute(0)) const {
-
-    // Convert source to interal compute numeric type
-    NumericArrayConverter<ElementCompute, ElementOutput, kCount, Round> source_converter;
-    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
-
-    ComputeFragment converted_source = source_converter(source);
-    ComputeFragment converted_accumulator = accumulator_converter(accumulator);
-
-    // Perform binary operations
-
-    ComputeFragment intermediate;
-
-    multiplies<ComputeFragment> mul_add_source;
-    multiply_add<ComputeFragment> mul_add_accumulator;
-    
-    minimum<ComputeFragment> min_accumulator;
-    maximum<ComputeFragment> max_accumulator;
-
-    if (Scale == ScaleType::NoBetaScaling) {
-      intermediate = converted_source;
-      intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
-    } else if (Scale == ScaleType::Nothing) {
-      intermediate = converted_accumulator;
-    } else {
-      intermediate = mul_add_source(beta_, converted_source);                             // X =  beta * C + uniform
-      intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
-    }
-
-    /// Clamping constant value
-    ElementCompute const kClampMax =
-        ElementCompute(cutlass::platform::numeric_limits<ElementOutput>::max());
-
-    ElementCompute const kClampMin =
-        ElementCompute(cutlass::platform::numeric_limits<ElementOutput>::lowest());
-
-    intermediate = max_accumulator(intermediate, kClampMin);
-    intermediate = min_accumulator(intermediate, kClampMax);
-
-    // Convert to destination numeric type
-    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
-
-    return destination_converter(intermediate);
-  }
-
-  /// Computes linear scaling: D = alpha * accumulator 
-  CUTLASS_HOST_DEVICE
-  FragmentOutput operator()(
-    FragmentAccumulator const &accumulator) const {
-
-    // Convert source to interal compute numeric type
-    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
-
-    ComputeFragment converted_accumulator = accumulator_converter(accumulator);
-
-    // Perform binary operations
-
-    ComputeFragment intermediate;
-
-    multiplies<ComputeFragment> mul_accumulator;
-    
-    minimum<ComputeFragment> min_accumulator;
-    maximum<ComputeFragment> max_accumulator;
-
-    if (Scale == ScaleType::Nothing) {
-      intermediate = converted_accumulator;
-    } else {
-      intermediate = mul_accumulator(alpha_, converted_accumulator);    // D = alpha * Accum
-    }
-
-    /// Clamping constant value
-    ElementCompute const kClampMax =
-        ElementCompute(cutlass::platform::numeric_limits<ElementOutput>::max());
-
-    ElementCompute const kClampMin =
-        ElementCompute(cutlass::platform::numeric_limits<ElementOutput>::lowest());
-
-    intermediate = max_accumulator(intermediate, kClampMin);
-    intermediate = min_accumulator(intermediate, kClampMax);
-
-    // Convert to destination numeric type
-    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
-
-    return destination_converter(intermediate);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Conditional guards to enable partial specialization for packed integers
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 720) && ((__CUDACC_VER_MAJOR__ > 10) || ((__CUDACC_VER_MAJOR__ >= 10) && (__CUDACC_VER_MINOR__ >= 2)))
-
-/// Applies a linear combination operator to an array of elements then clamps the output before
-/// converting to the output element type.
-///
-/// D = alpha * accumulator + beta * source + uniform
-///
-template <
-  typename ElementOutput_,                             ///< Data type used to load and store tensors
-  int Count,                                           ///< Number of elements computed per operation
-  ScaleType::Kind Scale,                               ///< Control Alpha and Beta scaling
-  FloatRoundStyle Round
->
-class LinearCombinationClamp<ElementOutput_, Count, int, float, Scale, Round> {
-public:
-
-  using ElementOutput = ElementOutput_;
-  using ElementAccumulator = int;
-  using ElementCompute = float;
-
-  static_assert(
-      cutlass::platform::numeric_limits<ElementOutput>::is_integer,
-      "This elementwise op expects the output to be int.");
-
-  static int const kCount = Count;
-
-  using FragmentOutput = Array<ElementOutput, kCount>;
-  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
-  using ComputeFragment = Array<ElementCompute, kCount>;
-
-  static FloatRoundStyle const kRound = Round;
-
-  static bool const kIsHeavy = detail::LinearCombinationClampIsHeavy();
-
-  /// Host-constructable parameters structure
-  struct Params {
-
-    ElementCompute alpha;                  ///< scales accumulators
-    ElementCompute beta;                   ///< scales source tensor
-    ElementCompute const *alpha_ptr;       ///< pointer to accumulator scalar - if not null, loads it from memory
-    ElementCompute const *beta_ptr;        ///< pointer to source scalar - if not null, loads it from memory
-
-    //
-    // Methods
-    //
-
-    CUTLASS_HOST_DEVICE
-    Params(): 
-      alpha(ElementCompute(1)), 
-      beta(ElementCompute(0)), 
-      alpha_ptr(nullptr), 
-      beta_ptr(nullptr) { }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute alpha,
-      ElementCompute beta
-    ): alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) {
-
-    }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute alpha
-    ): alpha(alpha), beta(0), alpha_ptr(nullptr), beta_ptr(nullptr) {
-
-    }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute const *alpha_ptr,
-      ElementCompute const *beta_ptr
-    ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {
-
-    }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute const *alpha_ptr
-    ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(nullptr) {
-
-    }
-  };
-
-private:
-
-  //
-  // Data members
-  //
-
-  ElementCompute alpha_;
-  ElementCompute beta_;
-
-public:
-
-  /// Constructs the function object, possibly loading from pointers in host memory
-  CUTLASS_HOST_DEVICE
-  LinearCombinationClamp(Params const &params) {
-
-    alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
-    beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
-  }
-
-  /// Returns true if source is needed
-  CUTLASS_HOST_DEVICE
-  bool is_source_needed() const {
-    if (Scale == ScaleType::NoBetaScaling) return true;
-
-    if (Scale == ScaleType::OnlyAlphaScaling) return false;
-
-    if (Scale == ScaleType::Nothing) return false;
-
-    return beta_ != ElementCompute(0);
-  }
-
-  /// Functionally required for serial reduction in the epilogue
-  CUTLASS_HOST_DEVICE
-  void set_k_partition(int k_partition, int k_partition_count) {
-    if (k_partition) {
-      beta_ = ElementCompute(1);
-    }
-  }
-  
-  /// Computes linear scaling: D = alpha * accumulator + beta * source
-  CUTLASS_HOST_DEVICE
-  FragmentOutput operator()(
-    FragmentAccumulator const &accumulator, 
-    FragmentOutput const &source,
-    ElementCompute uniform = ElementCompute(0)) const {
-
-    // Convert source to interal compute numeric type
-    NumericArrayConverter<ElementCompute, ElementOutput, kCount, Round> source_converter;
-    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
-
-    ComputeFragment converted_source = source_converter(source);
-    ComputeFragment converted_accumulator = accumulator_converter(accumulator);
-
-    // Compute linear scaling in floating point
-    ComputeFragment intermediate;
-
-    multiplies<ComputeFragment> mul_add_source;
-    multiply_add<ComputeFragment> mul_add_accumulator;
-    
-    // Float min-max
-    if (Scale == ScaleType::NoBetaScaling) {
-      intermediate = converted_source;
-      intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
-    } else if (Scale == ScaleType::Nothing) {
-      intermediate = converted_accumulator;
-    } else {
-      intermediate = mul_add_source(beta_, converted_source);                             // X =  beta * C + uniform
-      intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
-    }
-
-    //
-    // Convert float => ElementOutput_ with clamping
-    //
-    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
-
-    return destination_converter(intermediate);
-  }
-
-  /// Computes linear scaling: D = alpha * accumulator
-  CUTLASS_HOST_DEVICE
-  FragmentOutput operator()(FragmentAccumulator const &accumulator) const {
-
-    // Convert source to interal compute numeric type
-    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
-
-    ComputeFragment converted_accumulator = accumulator_converter(accumulator);
-
-    // Compute linear scaling in floating point
-    ComputeFragment intermediate;
-
-    multiplies<ComputeFragment> mul_add_accumulator;
-    
-    // Float min-max
-    if (Scale == ScaleType::Nothing) {
-      intermediate = converted_accumulator;
-    } else {
-      intermediate = mul_add_accumulator(alpha_, converted_accumulator);    // D = alpha * Accum
-    }
-
-    //
-    // Convert float => ElementOutput_ with clamping
-    //
-    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
-
-    return destination_converter(intermediate);
-  }
-};
-
-#endif // Conditional guards to enable partial specialization for packed integers
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Applies a linear combination operator to an array of elements then clamps
-/// the output before converting to the output element type.
-///
-/// D = alpha * accumulator + beta * source + uniform
-///
-/// Note: The below method only when problem_size_K <= 256 for signed int8 gemm
-/// or problem_size_K <= 128 for unsigned int8 gemm. The default approach is
-/// above.
-/// TODO: Add logic to fallback to the default approach
-template <
-    /// Data type used to load and store< tensors
-    typename ElementOutput_,
-    /// Number of elements computed per operation
-    int Count,
-    ///< Control Alpha and Beta scaling
-    ScaleType::Kind Scale = ScaleType::Default,
-    /// Rounding mode
-    FloatRoundStyle Round = FloatRoundStyle::round_to_nearest>
-class FastLinearCombinationClamp {
- public:
-  using ElementOutput = ElementOutput_;
-  using ElementAccumulator = int;
-  using ElementCompute = float;
-
-  static_assert(
-      cutlass::platform::numeric_limits<ElementOutput>::is_integer,
-      "This elementwise op expects the output to be int.");
-
-  static int const kCount = Count;
-
-  using FragmentOutput = Array<ElementOutput, kCount>;
-  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
-  using ComputeFragment = Array<ElementCompute, kCount>;
-
-  static FloatRoundStyle const kRound = Round;
-
-  static bool const kIsHeavy = false;
-
-  /// Host-constructable parameters structure
-  struct Params {
-    /// scales accumulators
-    ElementCompute alpha;
-    /// scales source tensor
-    ElementCompute beta;
-    /// pointer to accumulator scalar - if not null, loads it from memory
-    ElementCompute const *alpha_ptr;
-    /// pointer to source scalar - if not null, loads it from memory
-    ElementCompute const *beta_ptr;
-
-    //
-    // Methods
-    //
-
-    CUTLASS_HOST_DEVICE
-    Params()
-        : alpha(ElementCompute(1)),
-          beta(ElementCompute(0)),
-          alpha_ptr(nullptr),
-          beta_ptr(nullptr) {}
-
-    CUTLASS_HOST_DEVICE
-    Params(ElementCompute alpha, ElementCompute beta)
-        : alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) {}
-
-    CUTLASS_HOST_DEVICE
-    Params(ElementCompute alpha)
-        : alpha(alpha), beta(0), alpha_ptr(nullptr), beta_ptr(nullptr) {}
-
-    CUTLASS_HOST_DEVICE
-    Params(ElementCompute const *alpha_ptr, ElementCompute const *beta_ptr)
-        : alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {}
-
-    CUTLASS_HOST_DEVICE
-    Params(ElementCompute const *alpha_ptr)
-        : alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(nullptr) {}
-  };
-
- private:
-  //
-  // Data members
-  //
-
-  ElementCompute alpha_;
-  ElementCompute beta_;
-
- public:
-  /// Constructs the function object, possibly loading from pointers in host
-  /// memory
-  CUTLASS_HOST_DEVICE
-  FastLinearCombinationClamp(Params const &params) {
-    alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
-    beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
-  }
-
-  /// Returns true if source is needed
-  CUTLASS_HOST_DEVICE
-  bool is_source_needed() const {
-    if (Scale == ScaleType::NoBetaScaling) return true;
-
-    if (Scale == ScaleType::OnlyAlphaScaling) return false;
-
-    if (Scale == ScaleType::Nothing) return false;
-
-    return beta_ != ElementCompute(0);
-  }
-
-  /// Functionally required for serial reduction in the epilogue
-  CUTLASS_HOST_DEVICE
-  void set_k_partition(int k_partition, int k_partition_count) {
-    if (k_partition) {
-      beta_ = ElementCompute(1);
-    }
-  }
-  
-  /// Computes linear scaling: D = alpha * accumulator + beta * source
-  CUTLASS_HOST_DEVICE
-  FragmentOutput operator()(FragmentAccumulator const &accumulator,
-                            FragmentOutput const &source,
-                            ElementCompute uniform = ElementCompute(0)) const {
-    // Convert source to interal compute numeric type
-    FastNumericArrayConverter<ElementCompute, ElementOutput, kCount, Round>
-        source_converter;
-    FastNumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round>
-        accumulator_converter;
-
-    ComputeFragment converted_source = source_converter(source);
-    ComputeFragment converted_accumulator = accumulator_converter(accumulator);
-
-    // Compute linear scaling in floating point
-    ComputeFragment intermediate;
-
-    multiplies<ComputeFragment> mul_add_source;
-    multiply_add<ComputeFragment> mul_add_accumulator;
-
-    minimum<ComputeFragment> min_accumulator;
-    maximum<ComputeFragment> max_accumulator;
-
-    // Float min-max
-    if (Scale == ScaleType::NoBetaScaling) {
-      intermediate = converted_source;
-      intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
-    } else if (Scale == ScaleType::Nothing) {
-      intermediate = converted_accumulator;
-    } else {
-      intermediate =
-          mul_add_source(beta_, converted_source);  // X =  beta * C + uniform
-      intermediate = mul_add_accumulator(alpha_, converted_accumulator,
-                                         intermediate);  // D = alpha * Accum + X
-    }
-
-    /// Clamping constant value
-    ElementCompute const kClamp =
-        ElementCompute(1 << (sizeof_bits<ElementOutput>::value - 1));
-
-    intermediate = max_accumulator(intermediate, -kClamp);
-    intermediate = min_accumulator(intermediate, kClamp - ElementCompute(1));
-
-    // Convert to destination numeric type
-    FastNumericArrayConverter<ElementOutput, ElementCompute, kCount, Round>
-        destination_converter;
-
-    return destination_converter(intermediate);
-  }
-
-  /// Computes linear scaling: D = alpha * accumulator + beta * source
-  CUTLASS_HOST_DEVICE
-  FragmentOutput operator()(FragmentAccumulator const &accumulator) const {
-
-    // Convert source to interal compute numeric type
-    FastNumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round>
-        accumulator_converter;
-
-    ComputeFragment converted_accumulator = accumulator_converter(accumulator);
-
-    // Compute linear scaling in floating point
-    ComputeFragment intermediate;
-
-    multiplies<ComputeFragment> mul_accumulator;
-
-    minimum<ComputeFragment> min_accumulator;
-    maximum<ComputeFragment> max_accumulator;
-
-    // Float min-max
-    if (Scale == ScaleType::Nothing) {
-      intermediate = converted_accumulator;
-    } else {
-      intermediate = mul_accumulator(alpha_, converted_accumulator);
-    }
-
-    /// Clamping constant value
-    ElementCompute const kClamp =
-        ElementCompute(1 << (sizeof_bits<ElementOutput>::value - 1));
-
-    intermediate = max_accumulator(intermediate, -kClamp);
-    intermediate = min_accumulator(intermediate, kClamp - ElementCompute(1));
-
-    // Convert to destination numeric type
-    FastNumericArrayConverter<ElementOutput, ElementCompute, kCount, Round>
-        destination_converter;
-
-    return destination_converter(intermediate);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace thread
-} // namespace epilogue
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_dgelu.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_dgelu.h
deleted file mode 100755
index 74eb8213e..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_dgelu.h
+++ /dev/null
@@ -1,250 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  
-  \brief Functor performing linear combination followed by dGelu operation
-*/
-
-#pragma once
-
-#include "cutlass/half.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/constants.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/functional.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/epilogue/thread/activation.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace thread {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Applies a linear combination operator to an array of elements.
-///
-/// D = alpha * accumulator + beta * source + uniform
-///
-template <
-  typename ElementCompute_,                            ///< Data type returned by this functor
-  typename ElementAccumulator_,                        ///< Data type of accumulators
-  typename ElementSource_,                             ///< Data type of source tensor
-  typename ElementTensor_,                             ///< Data type of additional tensor
-  int Count,                                           ///< Number of elements computed per operation
-                                                       ///< Usually it is 128/sizeof_bits<ElementOutput_>,
-                                                       ///< but we use 64 or 32 sometimes when there are not enough data to store
-  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
->
-class LinearCombinationDGelu {
-public:
-
-  using ElementOutput = ElementSource_;
-  using ElementCompute = ElementCompute_;
-  using ElementAccumulator = ElementAccumulator_;
-  using ElementSource = ElementSource_;
-  using ElementTensor = ElementTensor_;
-
-  static bool const kIsHeavy = true;
-
-  static int const kCount = Count;
-
-  using FragmentCompute = Array<ElementCompute, kCount>;
-  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
-  using FragmentSource = Array<ElementSource, kCount>;
-  using FragmentTensor = Array<ElementTensor, kCount>;
-
-  static FloatRoundStyle const kRound = Round;
-
-  /// Host-constructable parameters structure
-  struct Params {
-
-    ElementCompute alpha;                  ///< scales accumulators
-    ElementCompute beta;                   ///< scales source tensor
-    ElementCompute const *alpha_ptr;       ///< pointer to accumulator scalar - if not null, loads it from memory
-    ElementCompute const *beta_ptr;        ///< pointer to source scalar - if not null, loads it from memory
-    ElementCompute threshold;              ///< minimum value that is output
-    //
-    // Methods
-    //
-
-    CUTLASS_HOST_DEVICE
-    Params(): 
-      alpha(ElementCompute(1)), 
-      beta(ElementCompute(0)),
-      threshold(ElementCompute(0)), 
-      alpha_ptr(nullptr), 
-      beta_ptr(nullptr) { }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute alpha,
-      ElementCompute beta,
-      ElementCompute threshold = ElementCompute(0)
-    ): alpha(alpha), beta(beta), threshold(threshold), alpha_ptr(nullptr), beta_ptr(nullptr) {
-
-    }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute const *alpha_ptr,
-      ElementCompute const *beta_ptr,
-      ElementCompute threshold = ElementCompute(0)
-    ): alpha(0), beta(0), threshold(threshold), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {
-
-    }
-  };
-
-private:
-
-  //
-  // Data members
-  //
-
-  ElementCompute alpha_;
-  ElementCompute beta_;
-  ElementCompute threshold_;
-  bool participates_in_reduction_;
-
-public:
-
-  /// Constructs the function object, possibly loading from pointers in host memory
-  CUTLASS_HOST_DEVICE
-  LinearCombinationDGelu(Params const &params) {
-
-    alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
-    beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
-    threshold_ = params.threshold;
-    participates_in_reduction_ = true;
-  }
-
-  /// Returns true if source is needed
-  CUTLASS_HOST_DEVICE
-  bool is_source_needed() const {
-    return beta_ != ElementCompute(0);
-  }
-
-  /// Returns true if the threadblock computes the reduction
-  CUTLASS_HOST_DEVICE
-  bool participates_in_reduction() const {
-    return participates_in_reduction_;
-  }
-
-  /// Functionally required for serial reduction in the epilogue
-  CUTLASS_HOST_DEVICE
-  void set_k_partition(int k_partition, int k_partition_count) {
-    if (k_partition) {
-      beta_ = ElementCompute(1);
-    }
-
-    if (k_partition != k_partition_count - 1) {
-      // set to NaN to make ReLU no-op for all except last k partitions
-      int64_t allones = -1;
-      threshold_ = reinterpret_cast<ElementCompute const &>(allones);
-      // Avoid computing the reduction if this isn't the final Split-K slice
-      participates_in_reduction_ = false;
-    }
-  }
-  
-  /// Computes linear scaling: D = alpha * accumulator + beta * source
-  CUTLASS_HOST_DEVICE
-  FragmentCompute operator()(
-    FragmentAccumulator const &accumulator, 
-    FragmentSource const &source,
-    FragmentTensor const &tensor) const {
-
-    // Convert source to interal compute numeric type
-    NumericArrayConverter<ElementCompute, ElementSource, kCount, Round> source_converter;
-    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
-
-    FragmentCompute converted_source = source_converter(source);
-    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
-
-    // Perform binary operations
-    FragmentCompute intermediate;
-
-    multiplies<FragmentCompute> mul_add_source;
-    multiply_add<FragmentCompute> mul_add_accumulator;
-
-    intermediate = mul_add_source(beta_, converted_source);                             // X =  beta * C + uniform
-    intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
-
-    dGELU<ElementCompute>  gelu_op;
-
-    // dGelu
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kCount; ++i) {
-      intermediate[i] = gelu_op(intermediate[i], ElementCompute(tensor[i]));
-    }
-
-    return intermediate;
-  }
-
-  /// Computes linear scaling: D = alpha * accumulator
-  CUTLASS_HOST_DEVICE
-  FragmentCompute operator()(
-    FragmentAccumulator const &accumulator,
-    FragmentTensor const &tensor) const {
-
-    // Convert source to interal compute numeric type
-    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
-
-    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
-
-    // Perform binary operations
-    FragmentCompute intermediate;
-
-    multiplies<FragmentCompute> mul_accumulator;
-
-    intermediate = mul_accumulator(alpha_, converted_accumulator);    // D = alpha * Accum
-
-    dGELU<ElementCompute>  gelu_op;
-
-    // dGelu with conversion
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kCount; ++i) {
-      intermediate[i] = gelu_op(intermediate[i], ElementCompute(tensor[i]));
-    }
-
-    return intermediate;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace thread
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_drelu.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_drelu.h
deleted file mode 100755
index aed173056..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_drelu.h
+++ /dev/null
@@ -1,452 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file  
-  \brief Functor performing linear combination with a maximum operation used by epilogues.
-*/
-
-#pragma once
-
-#include "cutlass/half.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/functional.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/epilogue/thread/activation.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace thread {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Applies a linear combination operator to an array of elements.
-///
-/// D = alpha * accumulator + beta * source + uniform
-///
-template <
-  typename ElementCompute_,                            ///< Data type returned by this functor
-  typename ElementAccumulator_,                        ///< Data type of accumulators
-  typename ElementSource_,                             ///< Data type of source tensor
-  typename ElementTensor_,                             ///< Data type of additional tensor
-  int Count,                                           ///< Number of elements computed per operation
-                                                       ///< Usually it is 128/sizeof_bits<ElementOutput_>,
-                                                       ///< but we use 64 or 32 sometimes when there are not enough data to store
-  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
->
-class LinearCombinationDRelu {
-public:
-
-  using ElementOutput = ElementSource_;
-  using ElementCompute = ElementCompute_;
-  using ElementAccumulator = ElementAccumulator_;
-  using ElementSource = ElementSource_;
-  using ElementTensor = ElementTensor_;
-
-  static int const kCount = Count;
-
-  using FragmentCompute = Array<ElementCompute, kCount>;
-  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
-  using FragmentSource = Array<ElementSource, kCount>;
-  using FragmentTensor = Array<ElementTensor, kCount>;
-
-  static FloatRoundStyle const kRound = Round;
-
-  /// Host-constructable parameters structure
-  struct Params {
-
-    ElementCompute alpha;                  ///< scales accumulators
-    ElementCompute beta;                   ///< scales source tensor
-    ElementCompute const *alpha_ptr;       ///< pointer to accumulator scalar - if not null, loads it from memory
-    ElementCompute const *beta_ptr;        ///< pointer to source scalar - if not null, loads it from memory
-    ElementCompute threshold;              ///< minimum value that is output 
-    //
-    // Methods
-    //
-
-    CUTLASS_HOST_DEVICE
-    Params(): 
-      alpha(ElementCompute(1)), 
-      beta(ElementCompute(0)),
-      threshold(ElementCompute(0)), 
-      alpha_ptr(nullptr), 
-      beta_ptr(nullptr) { }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute alpha,
-      ElementCompute beta,
-      ElementCompute threshold = ElementCompute(0)
-    ): alpha(alpha), beta(beta), threshold(threshold), alpha_ptr(nullptr), beta_ptr(nullptr) {
-
-    }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute const *alpha_ptr,
-      ElementCompute const *beta_ptr,
-      ElementCompute threshold = ElementCompute(0)
-    ): alpha(0), beta(0), threshold(threshold), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {
-
-    }
-  };
-
-private:
-
-  //
-  // Data members
-  //
-
-  ElementCompute alpha_;
-  ElementCompute beta_;
-  ElementTensor threshold_;
-  bool participates_in_reduction_;
-
-public:
-
-  /// Constructs the function object, possibly loading from pointers in host memory
-  CUTLASS_HOST_DEVICE
-  LinearCombinationDRelu(Params const &params) {
-
-    alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
-    beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
-    threshold_ = ElementTensor(params.threshold);
-    participates_in_reduction_  = true;
-  }
-
-  /// Returns true if source is needed
-  CUTLASS_HOST_DEVICE
-  bool is_source_needed() const {
-    return beta_ != ElementCompute(0);
-  }
-
-  /// Returns true if the threadblock computes the reduction
-  CUTLASS_HOST_DEVICE
-  bool participates_in_reduction() const {
-    return participates_in_reduction_;
-  }
-
-  /// Functionally required for serial reduction in the epilogue
-  CUTLASS_DEVICE
-  void set_k_partition(int k_partition, int k_partition_count) {
-    if (k_partition) {
-      beta_ = ElementCompute(1);
-    }
-
-    if (k_partition != k_partition_count - 1) {
-      // set to NaN to make ReLU no-op for all except last k partitions
-      int64_t allones = -1;
-      threshold_ = reinterpret_cast<ElementTensor const &>(allones);
-      participates_in_reduction_ = false;
-    }
-  }
-  
-  /// Computes linear scaling: D = alpha * accumulator + beta * source
-  CUTLASS_HOST_DEVICE
-  FragmentCompute operator()(
-    FragmentAccumulator const &accumulator, 
-    FragmentSource const &source,
-    FragmentTensor const &tensor) const {
-
-    // Convert source to interal compute numeric type
-    NumericArrayConverter<ElementCompute, ElementSource, kCount, Round> source_converter;
-    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
-
-    FragmentCompute converted_source = source_converter(source);
-    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
-
-    // Perform binary operations
-    FragmentCompute intermediate;
-
-    multiplies<FragmentCompute> mul_add_source;
-    multiply_add<FragmentCompute> mul_add_accumulator;
-
-    intermediate = mul_add_source(beta_, converted_source);                             // X =  beta * C
-    intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
-
-    // dReLU = (cond ? dy : 0)
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kCount; ++i) {
-      ElementTensor cond = tensor[i];
-      if (cond <= threshold_) {
-        intermediate[i] = ElementCompute();
-      }
-    }
-
-    return intermediate;
-  }
-
-  /// Computes linear scaling: D = alpha * accumulator
-  CUTLASS_HOST_DEVICE
-  FragmentCompute operator()(
-    FragmentAccumulator const &accumulator,
-    FragmentTensor const &tensor) const {
-
-    // Convert source to interal compute numeric type
-    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
-
-    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
-
-    // Perform binary operations
-    FragmentCompute intermediate;
-
-    multiplies<FragmentCompute> mul_accumulator;
-
-    intermediate = mul_accumulator(alpha_, converted_accumulator);    // D = alpha * Accum
-
-    // dReLU = (cond ? dy : 0)
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kCount; ++i) {
-      ElementTensor cond = tensor[i];
-      if (cond <= threshold_) {
-        intermediate[i] = ElementCompute();
-      }
-    }
-
-    return intermediate;
-  }
-};
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Applies a linear combination operator to an array of elements.
-///
-/// D = alpha * accumulator + beta * source + uniform
-///
-template <
-  typename ElementCompute_,                            ///< Data type returned by this functor
-  typename ElementAccumulator_,                        ///< Data type of accumulators
-  typename ElementSource_,                             ///< Data type of source tensor
-  int Count,                                           ///< Number of elements computed per operation
-  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
->
-class LinearCombinationDReluConditionalBits {
-public:
-
-  using ElementOutput = ElementSource_;
-  using ElementCompute = ElementCompute_;
-  using ElementAccumulator = ElementAccumulator_;
-  using ElementSource = ElementSource_;
-  using ElementTensor = uint1b_t;
-
-  static bool const kIsHeavy = false;
-
-  static int const kCount = Count;
-
-  using FragmentCompute = Array<ElementCompute, kCount>;
-  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
-  using FragmentSource = Array<ElementSource, kCount>;
-  using FragmentTensor = Array<ElementTensor, kCount>;
-
-  static FloatRoundStyle const kRound = Round;
-
-  /// Host-constructable parameters structure
-  struct Params {
-
-    ElementCompute alpha;                  ///< scales accumulators
-    ElementCompute beta;                   ///< scales source tensor
-    ElementCompute const *alpha_ptr;       ///< pointer to accumulator scalar - if not null, loads it from memory
-    ElementCompute const *beta_ptr;        ///< pointer to source scalar - if not null, loads it from memory
-    //
-    // Methods
-    //
-
-    CUTLASS_HOST_DEVICE
-    Params(): 
-      alpha(ElementCompute(1)), 
-      beta(ElementCompute(0)),
-      alpha_ptr(nullptr), 
-      beta_ptr(nullptr) { }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute alpha,
-      ElementCompute beta
-    ): alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) {
-
-    }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute const *alpha_ptr,
-      ElementCompute const *beta_ptr
-    ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {
-
-    }
-  };
-
-private:
-
-  //
-  // Data members
-  //
-
-  ElementCompute alpha_;
-  ElementCompute beta_;
-  FragmentTensor predicate_mask_;
-  bool participates_in_reduction_;
-
-public:
-
-  /// Constructs the function object, possibly loading from pointers in host memory
-  CUTLASS_HOST_DEVICE
-  LinearCombinationDReluConditionalBits(Params const &params) {
-
-    alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
-    beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
-    participates_in_reduction_ = true;
-    predicate_mask_.clear();
-  }
-
-  /// Returns true if source is needed
-  CUTLASS_HOST_DEVICE
-  bool is_source_needed() const {
-    return beta_ != ElementCompute(0);
-  }
-
-  /// Returns true if the threadblock computes the reduction
-  CUTLASS_HOST_DEVICE
-  bool participates_in_reduction() const {
-    return participates_in_reduction_;
-  }
-
-  /// Functionally required for serial reduction in the epilogue
-  CUTLASS_HOST_DEVICE
-  void set_k_partition(int k_partition, int k_partition_count) {
-    predicate_mask_.clear();
-
-    if (k_partition) {
-      beta_ = ElementCompute(1);
-    }
-
-    if (k_partition != k_partition_count - 1) {
-      // Avoid computing the reduction if this isn't the final Split-K slice
-      participates_in_reduction_ = false;
-      
-      bit_not<FragmentTensor> not_op;
-      predicate_mask_ = not_op(predicate_mask_);
-    }
-  }
-  
-  /// Computes linear scaling: D = alpha * accumulator + beta * source
-  CUTLASS_DEVICE
-  FragmentCompute operator()(
-    FragmentAccumulator const &accumulator, 
-    FragmentSource const &source,
-    FragmentTensor const &tensor) const {
-
-    // Convert source to interal compute numeric type
-    NumericArrayConverter<ElementCompute, ElementSource, kCount, Round> source_converter;
-    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
-
-    FragmentCompute converted_source = source_converter(source);
-    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
-
-    // Perform binary operations
-    FragmentCompute intermediate;
-
-    multiplies<FragmentCompute> mul_add_source;
-    multiply_add<FragmentCompute> mul_add_accumulator;
-
-    intermediate = mul_add_source(beta_, converted_source);                             // X =  beta * C + uniform
-    intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
-
-    bit_or<FragmentTensor> or_op;
-
-    FragmentTensor predicates = or_op(tensor, predicate_mask_);
-
-    // Obtain from packed bits
-    bool conditions[kCount];
-    UnpackPredicates<kCount> unpack_predicates;
-
-    unpack_predicates(conditions, predicates);
-
-    // dReLU = (cond ? dy : 0)
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kCount; ++i) {
-      if (!conditions[i]) {
-        intermediate[i] = ElementCompute();
-      }
-    }
-
-    return intermediate;
-  }
-
-  /// Computes linear scaling: D = alpha * accumulator
-  CUTLASS_HOST_DEVICE
-  FragmentCompute operator()(
-    FragmentAccumulator const &accumulator,
-    FragmentTensor const &tensor) const {
-
-    // Convert source to interal compute numeric type
-    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
-
-    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
-
-    // Perform binary operations
-    FragmentCompute intermediate;
-
-    multiplies<FragmentCompute> mul_accumulator;
-
-    intermediate = mul_accumulator(alpha_, converted_accumulator);    // D = alpha * Accum
-
-    bit_or<FragmentTensor> or_op;
-
-    FragmentTensor predicates = or_op(tensor, predicate_mask_);
-
-    // Obtain from packed bits
-    bool conditions[kCount];
-    UnpackPredicates<kCount> unpack_predicates;
-
-    unpack_predicates(conditions, predicates);
-
-    // dReLU = (cond ? dy : 0)
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kCount; ++i) {
-      if (!conditions[i]) {
-        intermediate[i] = ElementCompute();
-      }
-    }
-
-    return intermediate;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace thread
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_gelu.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_gelu.h
deleted file mode 100755
index 818b21aa8..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_gelu.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Functor performing linear combination with GELU operations used by epilogues.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/epilogue/thread/activation.h"
-#include "cutlass/epilogue/thread/linear_combination_generic.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace thread {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Applies a linear combination operator followed by the GELU activation to an array of elements.
-///
-/// D = gelu(alpha * accumulator + beta * source + uniform)
-///
-template <
-  typename ElementOutput_,                             ///< Data type used to load and store tensors
-  int Count,                                           ///< Number of elements computed per operation
-                                                       ///< Usually it is 128/sizeof_bits<ElementOutput_>,
-                                                       ///< but we use 64 or 32 sometimes when there are not enough data to store
-  typename ElementAccumulator_ = ElementOutput_,       ///< Accumulator data type
-  typename ElementCompute_ = ElementOutput_,           ///< Data type used to compute linear combination
-  ScaleType::Kind Scale = ScaleType::Default,          ///< Control Alpha and Beta scaling
-  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
->
-using LinearCombinationGELU = LinearCombinationGeneric<GELU, ElementOutput_, Count, ElementAccumulator_,
-                                                       ElementCompute_, Scale, Round, true>;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace thread
-} // namespace epilogue
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_generic.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_generic.h
deleted file mode 100755
index a6bd9d672..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_generic.h
+++ /dev/null
@@ -1,265 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Functor performing linear combination operations used by epilogues.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/functional.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/epilogue/thread/scale_type.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace thread {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <class Activation, class = void>
-struct GenericActivationTraits {
-  static constexpr bool IsArgumentsNeeded = false;
-  struct Arguments {};
-};
-
-template <class Activation>
-struct GenericActivationTraits<Activation, decltype(typename Activation::Arguments(), void())> {
-  static constexpr bool IsArgumentsNeeded = true;
-  using Arguments = typename Activation::Arguments;
-};
-
-template <typename T>
-struct LinearCombinationGenericParams {
-  T alpha;                  ///< scales accumulators
-  T beta;                   ///< scales source tensor
-  T const *alpha_ptr;       ///< pointer to accumulator scalar - if not null, loads it from memory
-  T const *beta_ptr;        ///< pointer to source scalar - if not null, loads it from memory
-
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  LinearCombinationGenericParams():
-    alpha(T(1)),
-    beta(T(0)),
-    alpha_ptr(nullptr),
-    beta_ptr(nullptr) { }
-
-  CUTLASS_HOST_DEVICE
-  LinearCombinationGenericParams(
-    T alpha,
-    T beta = T(0)
-  ): alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) { }
-
-  CUTLASS_HOST_DEVICE
-  LinearCombinationGenericParams(
-    T const *alpha_ptr,
-    T const *beta_ptr = nullptr
-  ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) { }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Applies a linear combination operator followed by an activation function to an array of elements.
-///
-/// D = activation(alpha * accumulator + beta * source + uniform)
-///
-template <
-  template<typename T> class ActivationFunctor,
-  typename ElementOutput_,                             ///< Data type used to load and store tensors
-  int Count,                                           ///< Number of elements computed per operation
-                                                       ///< Usually it is 128/sizeof_bits<ElementOutput_>,
-                                                       ///< but we use 64 or 32 sometimes when there are not enough data to store
-  typename ElementAccumulator_ = ElementOutput_,       ///< Accumulator data type
-  typename ElementCompute_ = ElementOutput_,           ///< Data type used to compute linear combination
-  ScaleType::Kind Scale = ScaleType::Default,          ///< Control Alpha and Beta scaling
-  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest,
-  bool IsHeavy = false
->
-class LinearCombinationGeneric {
-public:
-
-  using ElementOutput = ElementOutput_;
-  using ElementAccumulator = ElementAccumulator_;
-  using ElementCompute = ElementCompute_;
-
-  static bool const kIsHeavy = IsHeavy;
-  static int const kCount = Count;
-  static const ScaleType::Kind kScale = Scale;
-
-  using FragmentOutput = Array<ElementOutput, kCount>;
-  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
-  using FragmentSource = Array<ElementOutput, kCount>;
-  using FragmentCompute = Array<ElementCompute, kCount>;
-
-  static FloatRoundStyle const kRound = Round;
-
-  /// Host-constructable parameters structure
-  struct Params
-    : LinearCombinationGenericParams<ElementCompute>,
-      GenericActivationTraits<ActivationFunctor<ElementCompute>>::Arguments {
-    using LinearCombinationGenericParams<ElementCompute>::LinearCombinationGenericParams;
-  };
-
-private:
-
-  //
-  // Data members
-  //
-
-  Params params_;
-  bool skip_elementwise_;
-
-public:
-
-  /// Constructs the function object, possibly loading from pointers in host memory
-  CUTLASS_HOST_DEVICE
-  LinearCombinationGeneric(Params const &params) {
-    params_ = params;
-    params_.alpha = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
-    params_.beta = (params.beta_ptr ? *params.beta_ptr : params.beta);
-    skip_elementwise_ = false;
-  }
-
-  /// Returns true if source is needed
-  CUTLASS_HOST_DEVICE
-  bool is_source_needed() const {
-    if (Scale == ScaleType::NoBetaScaling) return true;
-
-    if (Scale == ScaleType::OnlyAlphaScaling) return false;
-
-    if (Scale == ScaleType::Nothing) return false;
-
-    return params_.beta != ElementCompute(0);
-  }
-
-  /// Functionally required for serial reduction in the epilogue
-  CUTLASS_HOST_DEVICE
-  void set_k_partition(int k_partition, int k_partition_count) {
-    if (k_partition) {
-      params_.beta = ElementCompute(1);
-    }
-
-    if (k_partition != k_partition_count - 1) {
-      skip_elementwise_ = true;
-    }
-  }
-
-  /// Computes linear scaling: D = alpha * accumulator + beta * source
-  CUTLASS_HOST_DEVICE
-  FragmentOutput operator()(
-    FragmentAccumulator const &accumulator,
-    FragmentOutput const &source) const {
-
-    // Convert source to interal compute numeric type
-    NumericArrayConverter<ElementCompute, ElementOutput, kCount, Round> source_converter;
-    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
-
-    FragmentCompute converted_source = source_converter(source);
-    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
-
-    // Perform binary operations
-
-    FragmentCompute intermediate;
-
-    multiplies<FragmentCompute> mul_add_source;
-    multiply_add<FragmentCompute> mul_add_accumulator;
-    ActivationFunctor<FragmentCompute> activation;
-
-    if (Scale == ScaleType::NoBetaScaling) {
-      intermediate = converted_source;
-      intermediate = mul_add_accumulator(params_.alpha, converted_accumulator, intermediate);    // D = alpha * Accum + X
-    }  else if (Scale == ScaleType::Nothing) {
-      intermediate = converted_accumulator;
-    } else {
-      intermediate = mul_add_source(params_.beta, converted_source);                             // X =  beta * C + uniform
-      intermediate = mul_add_accumulator(params_.alpha, converted_accumulator, intermediate);    // D = alpha * Accum + X
-    }
-
-    if constexpr (GenericActivationTraits<ActivationFunctor<ElementCompute>>::IsArgumentsNeeded) {
-      intermediate = skip_elementwise_ ? intermediate : activation(intermediate, params_);
-    } else {
-      intermediate = skip_elementwise_ ? intermediate : activation(intermediate);
-    }
-
-    // Convert to destination numeric type
-    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
-
-    return destination_converter(intermediate);
-  }
-
-  /// Computes linear scaling: D = alpha * accumulator
-  CUTLASS_HOST_DEVICE
-  FragmentOutput operator()(
-    FragmentAccumulator const &accumulator) const {
-
-    // Convert source to interal compute numeric type
-    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
-
-    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
-
-    // Perform binary operations
-
-    FragmentCompute intermediate;
-
-    multiplies<FragmentCompute> mul_add_accumulator;
-    ActivationFunctor<FragmentCompute> activation;
-
-    if (Scale == ScaleType::Nothing) {
-      intermediate = converted_accumulator;
-    } else {
-      intermediate = mul_add_accumulator(params_.alpha, converted_accumulator);    // D = alpha * Accum
-    }
-
-    if constexpr (GenericActivationTraits<ActivationFunctor<FragmentCompute>>::IsArgumentsNeeded) {
-      intermediate = skip_elementwise_ ? intermediate : activation(intermediate, params_);
-    } else {
-      intermediate = skip_elementwise_ ? intermediate : activation(intermediate);
-    }
-
-    // Convert to destination numeric type
-    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
-
-    return destination_converter(intermediate);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace thread
-} // namespace epilogue
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_generic_with_scaling.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_generic_with_scaling.h
deleted file mode 100755
index e1dde1a6a..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_generic_with_scaling.h
+++ /dev/null
@@ -1,325 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-  \brief Functor performing linear combination operations with a generic element-wise activation
-  function. Scaling factors are applied to operands A, B, and C. The pre-activation auxiliary
-  output is also returned.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/functional.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/epilogue/thread/scale_type.h"
-#include "cutlass/epilogue/thread/linear_combination_generic.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace thread {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Applies a linear combination operator to an array of elements.
-///
-/// Aux = ((alpha * scale_a * scale_b) * accumulator) + ((beta * scale_c) * source) + bias
-///   D = activation(Aux)
-///
-template <
-  template<typename T> class ActivationFunctor,
-  typename ElementOutput_,                             ///< Data type used to load and store tensors
-  typename ElementAuxOutput_,                          ///< Data type used to store auxiliary output
-  int Count,                                           ///< Number of elements computed per operation
-                                                       ///< Usually it is 128/sizeof_bits<ElementOutput_>,
-                                                       ///< but we use 64 or 32 sometimes when there are not enough data to store
-  typename ElementAccumulator_ = ElementOutput_,       ///< Accumulator data type
-  typename ElementCompute_ = ElementOutput_,           ///< Data type used to compute linear combination
-  ScaleType::Kind Scale = ScaleType::Default,          ///< Control Alpha and Beta scaling
-  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest,
-  bool IsHeavy = false
->
-class LinearCombinationGenericWithScalingAndAbsMax {
-public:
-
-  using ElementOutput = ElementOutput_;
-  using ElementAuxOutput = ElementAuxOutput_;
-  using ElementAccumulator = ElementAccumulator_;
-  using ElementCompute = ElementCompute_;
-  using ElementScalingFactor = ElementAccumulator_;
-
-  /// Data type used for absolute maximum value
-  using ElementAbsmax = float;
-
-  static bool const kIsScalingAndAmaxAuxOutputNeeded = (platform::is_same<ElementAuxOutput, cutlass::float_e4m3_t>::value ||
-                                                        platform::is_same<ElementAuxOutput, cutlass::float_e5m2_t>::value);
-  static bool const kIsScalingAndAmaxOutputNeeded    = (platform::is_same<ElementOutput, cutlass::float_e4m3_t>::value ||
-                                                        platform::is_same<ElementOutput, cutlass::float_e5m2_t>::value);
-
-  static bool const kIsHeavy = IsHeavy;
-  static int const kCount = Count;
-  static const ScaleType::Kind kScale = Scale;
-
-  using FragmentOutput = Array<ElementOutput, kCount>;
-  using FragmentAuxOutput = Array<ElementAuxOutput, kCount>;
-  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
-  using FragmentCompute = Array<ElementCompute, kCount>;
-
-  static FloatRoundStyle const kRound = Round;
-
-  /// Host-constructable parameters structure
-  struct Params {
-    struct ActivationParams
-      : LinearCombinationGenericParams<ElementCompute>,
-        GenericActivationTraits<ActivationFunctor<ElementCompute>>::Arguments {
-      using LinearCombinationGenericParams<ElementCompute>::LinearCombinationGenericParams;
-    };
-
-    ActivationParams activation;
-    ElementScalingFactor const* scale_a_ptr = nullptr;   ///< pointer to a scalar - if not null, loads it from memory
-    ElementScalingFactor const* scale_b_ptr = nullptr;   ///< pointer to b scalar - if not null, loads it from memory
-    ElementScalingFactor const* scale_c_ptr = nullptr;   ///< pointer to c scalar - if not null, loads it from memory
-    ElementScalingFactor const* scale_d_ptr = nullptr;   ///< pointer to d scalar - if not null, loads it from memory
-    ElementScalingFactor const* scale_aux_ptr = nullptr; ///< pointer to aux scalar - if not null, loads it from memory
-
-    ElementAbsmax * abs_max_aux_ptr = nullptr;      ///< pointer to location to store amax of Aux
-    ElementAbsmax * abs_max_D_ptr   = nullptr;      ///< pointer to location to store amax of D
-
-    CUTLASS_HOST_DEVICE
-    Params() :
-      scale_a_ptr(nullptr),
-      scale_b_ptr(nullptr),
-      scale_c_ptr(nullptr),
-      scale_d_ptr(nullptr),
-      scale_aux_ptr(nullptr),
-      abs_max_aux_ptr(nullptr),
-      abs_max_D_ptr(nullptr) {}
-
-    CUTLASS_HOST_DEVICE
-    Params(ActivationParams activation_params,
-           ElementScalingFactor const* scale_a_ptr,
-           ElementScalingFactor const* scale_b_ptr,
-           ElementScalingFactor const* scale_c_ptr,
-           ElementScalingFactor const* scale_d_ptr,
-           ElementScalingFactor const* scale_aux_ptr,
-           ElementAbsmax * abs_max_aux_ptr,
-           ElementAbsmax * abs_max_D_ptr) :
-           activation(activation_params),
-           scale_a_ptr(scale_a_ptr),
-           scale_b_ptr(scale_b_ptr),
-           scale_c_ptr(scale_c_ptr),
-           scale_d_ptr(scale_d_ptr),
-           scale_aux_ptr(scale_aux_ptr),
-           abs_max_aux_ptr(abs_max_aux_ptr),
-           abs_max_D_ptr(abs_max_D_ptr) {}
-  };
-
-private:
-
-  //
-  // Data members
-  //
-
-  Params params_;
-  bool skip_elementwise_;
-
-  // Scaling factors for output and auxiliary output
-  ElementCompute scale_d_;
-  ElementCompute scale_aux_;
-
-public:
-
-  /// Constructs the function object, possibly loading from pointers in host memory
-  CUTLASS_HOST_DEVICE
-  LinearCombinationGenericWithScalingAndAbsMax(Params const &params) :
-    params_(params),
-    skip_elementwise_(false),
-    scale_d_(ElementCompute(params.scale_d_ptr ? *(params.scale_d_ptr) : ElementScalingFactor(1))),
-    scale_aux_(ElementCompute(params.scale_aux_ptr ? *(params.scale_aux_ptr) : ElementScalingFactor(1)))
-  {
-    params_.activation.alpha = (params.activation.alpha_ptr ? *params.activation.alpha_ptr : params.activation.alpha);
-    params_.activation.beta = (params.activation.beta_ptr ? *params.activation.beta_ptr : params.activation.beta);
-    auto scale_a =
-        ElementCompute(params.scale_a_ptr ? *(params.scale_a_ptr) : ElementScalingFactor(1));
-    auto scale_b =
-        ElementCompute(params.scale_b_ptr ? *(params.scale_b_ptr) : ElementScalingFactor(1));
-    auto scale_c =
-        ElementCompute(params.scale_c_ptr ? *(params.scale_c_ptr) : ElementScalingFactor(1));
-
-    multiplies<ElementCompute> multiply;
-    params_.activation.alpha = multiply(params.activation.alpha, multiply(scale_a, scale_b));
-    params_.activation.beta = multiply(params.activation.beta, scale_c);
-  }
-
-  /// Returns true if source is needed
-  CUTLASS_HOST_DEVICE
-  bool is_source_needed() const {
-    if (Scale == ScaleType::NoBetaScaling) return true;
-
-    if (Scale == ScaleType::OnlyAlphaScaling) return false;
-
-    if (Scale == ScaleType::Nothing) return false;
-
-    return params_.activation.beta != ElementCompute(0);
-  }
-
-  /// Functionally required for serial reduction in the epilogue
-  CUTLASS_HOST_DEVICE
-  void set_k_partition(int k_partition, int k_partition_count) {
-    if (k_partition) {
-      params_.activation.beta = ElementCompute(1);
-    }
-
-    // Only the final partition should perform the activation function
-    // and scale the output and auxiliary output values.
-    if (k_partition != k_partition_count - 1) {
-      skip_elementwise_ = true;
-      scale_d_ = ElementCompute(1.);
-      scale_aux_ = ElementCompute(1.);
-    }
-  }
-
-  /// Computes linear scaling:
-  ///    Aux = (alpha * scale_a * scale_b * accumulator) + (beta * scale_c * source) + bias
-  ///      D = activation(Aux)
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentCompute& output,
-    FragmentCompute& aux_output,
-    FragmentAccumulator const &accumulator,
-    FragmentCompute const& bias,
-    FragmentOutput const &source) {
-
-    // Convert source to interal compute numeric type
-    NumericArrayConverter<ElementCompute, ElementOutput, kCount, Round> source_converter;
-    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
-
-    FragmentCompute converted_source = source_converter(source);
-    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
-
-    // Perform binary operations
-
-    FragmentCompute intermediate;
-
-    multiplies<FragmentCompute> multiply;
-    plus<FragmentCompute> add;
-    multiply_add<FragmentCompute> mul_add_accumulator;
-    ActivationFunctor<FragmentCompute> activation;
-
-    if (Scale == ScaleType::NoBetaScaling) {
-      intermediate = converted_source;
-      intermediate = mul_add_accumulator(params_.activation.alpha, converted_accumulator, intermediate);
-    }  else if (Scale == ScaleType::Nothing) {
-      intermediate = converted_accumulator;
-    } else {
-      intermediate = multiply(params_.activation.beta, converted_source);
-      intermediate = mul_add_accumulator(params_.activation.alpha, converted_accumulator, intermediate);
-    }
-
-    intermediate = add(intermediate, bias);
-
-    aux_output = intermediate;
-    if constexpr (GenericActivationTraits<ActivationFunctor<ElementCompute>>::IsArgumentsNeeded) {
-      output = skip_elementwise_ ? intermediate : activation(intermediate, params_.activation);
-    } else {
-      output = skip_elementwise_ ? intermediate : activation(intermediate);
-    }
-  }
-
-  /// Computes linear scaling:
-  ///    Aux = (alpha * scale_a * scale_b * accumulator) + bias
-  ///      D = activation(Aux)
-  CUTLASS_DEVICE
-  void operator()(
-    FragmentCompute& output,
-    FragmentCompute& aux_output,
-    FragmentAccumulator const &accumulator,
-    FragmentCompute const& bias) {
-
-    // Convert source to interal compute numeric type
-    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
-
-    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
-
-    // Perform binary operations
-
-    FragmentCompute intermediate;
-
-    multiplies<FragmentCompute> multiply;
-    plus<FragmentCompute> add;
-    ActivationFunctor<FragmentCompute> activation;
-
-    if (Scale == ScaleType::Nothing) {
-      intermediate = converted_accumulator;
-    } else {
-      intermediate = multiply(params_.activation.alpha, converted_accumulator);
-    }
-
-    intermediate = add(intermediate, bias);
-
-    aux_output = intermediate;
-    if constexpr (GenericActivationTraits<ActivationFunctor<FragmentCompute>>::IsArgumentsNeeded) {
-      output = skip_elementwise_ ? intermediate : activation(intermediate, params_.activation);
-    } else {
-      output = skip_elementwise_ ? intermediate : activation(intermediate);
-    }
-  }
-
-  CUTLASS_HOST_DEVICE
-  ElementAbsmax* get_ptr_output_abs_max() const {
-    return params_.abs_max_D_ptr;
-  }
-
-  CUTLASS_HOST_DEVICE
-  ElementAbsmax* get_ptr_aux_output_abs_max() const {
-    return params_.abs_max_aux_ptr;
-  }
-
-  CUTLASS_HOST_DEVICE
-  ElementCompute get_scale_d() const {
-    return scale_d_;
-  }
-
-  CUTLASS_HOST_DEVICE
-  ElementCompute get_scale_aux() const {
-    return scale_aux_;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace thread
-} // namespace epilogue
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_hardswish.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_hardswish.h
deleted file mode 100755
index ef51a318b..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_hardswish.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*************************************************************************************************** 
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Functor performing linear combination with HardSwish operations used by epilogues.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/epilogue/thread/activation.h"
-#include "cutlass/epilogue/thread/linear_combination_generic.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace thread {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Applies a linear combination operator followed by the HardSwish activation to an array of elements.
-///
-/// D = hardswish(alpha * accumulator + beta * source + uniform)
-///
-template <
-  typename ElementOutput_,                             ///< Data type used to load and store tensors
-  int Count,                                           ///< Number of elements computed per operation
-                                                       ///< Usually it is 128/sizeof_bits<ElementOutput_>,
-                                                       ///< but we use 64 or 32 sometimes when there are not enough data to store
-  typename ElementAccumulator_ = ElementOutput_,       ///< Accumulator data type
-  typename ElementCompute_ = ElementOutput_,           ///< Data type used to compute linear combination
-  ScaleType::Kind Scale = ScaleType::Default,          ///< Control Alpha and Beta scaling
-  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
->
-using LinearCombinationHardSwish = LinearCombinationGeneric<HardSwish, ElementOutput_, Count, ElementAccumulator_,
-                                                            ElementCompute_, Scale, Round>;
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace thread
-} // namespace epilogue
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_leaky_relu.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_leaky_relu.h
deleted file mode 100755
index 5989f09ed..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_leaky_relu.h
+++ /dev/null
@@ -1,231 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/functional.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/epilogue/thread/activation.h"
-#include "cutlass/epilogue/thread/scale_type.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace thread {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Applies a linear combination operator to an array of elements.
-///
-/// D = alpha * accumulator + beta * source + uniform
-///
-template <
-  typename ElementOutput_,                             ///< Data type used to load and store tensors
-  int Count,                                           ///< Number of elements computed per operation
-  typename ElementAccumulator_ = ElementOutput_,       ///< Accumulator data type
-  typename ElementCompute_ = ElementOutput_,           ///< Data type used to compute linear combination
-  ScaleType::Kind Scale = ScaleType::Default,          ///< Control Alpha and Beta scaling
-  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
->
-class LinearCombinationLeakyRelu {
-public:
-
-  using ElementOutput = ElementOutput_;
-  using ElementAccumulator = ElementAccumulator_;
-  using ElementCompute = ElementCompute_;
-
-  static int const kCount = Count;
-  static const ScaleType::Kind kScale = Scale;
-
-  using FragmentOutput = Array<ElementOutput, kCount>;
-  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
-  using ComputeFragment = Array<ElementCompute, kCount>;
-  using FragmentSource = Array<ElementOutput, kCount>;
-
-  static FloatRoundStyle const kRound = Round;
-
-  /// Host-constructable parameters structure
-  struct Params {
-
-    ElementCompute alpha;                  ///< scales accumulators
-    ElementCompute beta_bias;              ///< scales bias tensor
-    ElementCompute leaky_alpha;            ///< leaky_alpha
-    //
-    // Methods
-    //
-
-    CUTLASS_HOST_DEVICE
-    Params(): 
-      alpha(ElementCompute(1)), 
-      beta_bias(ElementCompute(0)),
-      leaky_alpha(ElementCompute(1)) 
-       { }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute alpha,
-      ElementCompute beta_bias,
-      ElementCompute leaky_alpha = ElementCompute(1)
-    ): alpha(alpha), beta_bias(beta_bias), leaky_alpha(leaky_alpha) {
-
-    }
-
-  };
-
-private:
-
-  //
-  // Data members
-  //
-
-  ElementCompute alpha_;
-  ElementCompute beta_bias_;
-  ElementCompute leaky_alpha_recip_;
-
-public:
-
-  /// Constructs the function object, possibly loading from pointers in host memory
-  CUTLASS_HOST_DEVICE
-  LinearCombinationLeakyRelu(Params const &params) {
-    alpha_ = (params.alpha);
-    beta_bias_ = (params.beta_bias);
-    leaky_alpha_recip_ = (ElementCompute(params.leaky_alpha));    
-  }
-
-  /// Returns true if source is needed
-  CUTLASS_HOST_DEVICE
-  bool is_source_needed() const {
-    if (Scale == ScaleType::NoBetaScaling) return true;
-
-    if (Scale == ScaleType::OnlyAlphaScaling) return false;
-
-    if (Scale == ScaleType::Nothing) return false;
-
-    return beta_bias_ != ElementCompute(0);
-  }
-
-  /// Functionally required for serial reduction in the epilogue
-  CUTLASS_HOST_DEVICE
-  void set_k_partition(int k_partition) {
-    if (k_partition) {
-      beta_bias_ = ElementCompute(1);
-    }
-  }
-  CUTLASS_HOST_DEVICE
-  void set_k_partition(int k_partition, int k_partition_count) {
-    if (k_partition) {
-      beta_bias_ = ElementCompute(1);
-    }
-  }
-  
-  /// Computes linear scaling: D = alpha * accumulator + beta * source
-  CUTLASS_HOST_DEVICE
-  FragmentOutput operator()(
-    FragmentAccumulator const &accumulator, 
-    FragmentOutput const &source) const {
-
-    // Convert source to interal compute numeric type
-    NumericArrayConverter<ElementCompute, ElementOutput, kCount, Round> source_converter;
-    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
-
-    ComputeFragment converted_source = source_converter(source);
-    ComputeFragment converted_accumulator = accumulator_converter(accumulator);
-
-    // Perform binary operations
-    ComputeFragment intermediate;
-
-    multiplies<ComputeFragment> mul_add_source;
-    multiply_add<ComputeFragment> mul_add_accumulator;
-
-    LeakyReLU<ComputeFragment> leakyrelu;
-
-    if (Scale == ScaleType::NoBetaScaling) {
-      intermediate = converted_source;
-      intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
-    }  else if (Scale == ScaleType::Nothing) {
-      intermediate = converted_accumulator;
-    } else {
-      intermediate = mul_add_source(beta_bias_, converted_source);                        // X =  beta * C + uniform
-      intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
-    }
-    // Compute threshold optionally
-    intermediate = leakyrelu(intermediate, leaky_alpha_recip_);
-
-    // Convert to destination numeric type
-    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
-
-    return destination_converter(intermediate);
-  }
-
-  /// Computes linear scaling: D = alpha * accumulator
-  CUTLASS_HOST_DEVICE
-  FragmentOutput operator()(
-    FragmentAccumulator const &accumulator) const {
-
-    // Convert source to interal compute numeric type
-    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
-    
-    ComputeFragment converted_accumulator = accumulator_converter(accumulator);
-    
-    // Perform binary operations
-    ComputeFragment intermediate;
-
-    multiplies<ComputeFragment> mul_accumulator;
-    LeakyReLU<ComputeFragment> leakyrelu;
-    //printf("in doing with bias");
-    if (Scale == ScaleType::Nothing) {
-      intermediate = converted_accumulator;
-    } else {
-      intermediate = mul_accumulator(alpha_, converted_accumulator);    // D = alpha * Accum
-    }
-    
-    // Compute threshold optionally
-    intermediate = leakyrelu(intermediate, leaky_alpha_recip_);
-    
-    
-    // Convert to destination numeric type
-    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
-
-    return destination_converter(intermediate);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace thread
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_params.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_params.h
deleted file mode 100755
index 271055676..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_params.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief 
-*/
-
-#pragma once
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace thread {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-struct LinearCombinationParams {
-  uint64_t alpha_data[2];
-  uint64_t beta_data[2];
-
-  CUTLASS_HOST_DEVICE
-  LinearCombinationParams()
-  : alpha_data {0lu, 0lu}, beta_data {0lu, 0lu} 
-  { }
-
-  template <typename ElementCompute>
-  CUTLASS_HOST_DEVICE 
-  LinearCombinationParams(ElementCompute alpha, ElementCompute beta) 
-  : alpha_data {0lu, 0lu}, beta_data {0lu, 0lu} 
-  {
-#if defined(__CUDA_ARCH__)
-    reinterpret_cast<ElementCompute&>(alpha_data) = alpha;
-    reinterpret_cast<ElementCompute&>(beta_data) = beta;
-#else
-    memcpy( alpha_data, &alpha, sizeof(ElementCompute) ); 
-    memcpy( beta_data, &beta, sizeof(ElementCompute) ); 
-#endif
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace thread
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_planar_complex.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_planar_complex.h
deleted file mode 100755
index ff32f13b0..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_planar_complex.h
+++ /dev/null
@@ -1,236 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Functor performing linear combination operations on planar-complex arrays
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/complex.h"
-#include "cutlass/array_planar_complex.h"
-#include "cutlass/functional.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/epilogue/thread/scale_type.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace thread {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Applies a linear combination operator to arrays of planar-complex elements.
-///
-/// D = alpha * accumulator + beta * source + uniform
-///
-/// Note, as with most CUTLASS components for planar complex, the template arguments describe
-/// the underlying real data type.
-template <
-  typename ElementOutput_,                             ///< Data type used to load and store tensors
-  int Count,                                           ///< Number of elements computed per operation
-                                                       ///< Usually it is 128/sizeof_bits<ElementOutput_>,
-                                                       ///< but we use 64 or 32 sometimes when there are not enough data to store
-  typename ElementAccumulator_ = ElementOutput_,       ///< Accumulator data type
-  typename ElementCompute_ = ElementOutput_,           ///< Data type used to compute linear combination
-  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest,
-  ScaleType::Kind Scale = ScaleType::Default           ///< Control Alpha and Beta scaling
->
-class LinearCombinationPlanarComplex {
-public:
-
-  using ElementOutput = ElementOutput_;
-  using ElementAccumulator = ElementAccumulator_;
-  using ElementCompute = ElementCompute_;
-  using ElementScalar = complex<ElementCompute>;
-
-  static int const kCount = Count;
-  static const ScaleType::Kind kScale = Scale;
-
-  using FragmentOutput = ArrayPlanarComplex<ElementOutput, kCount>;
-  using FragmentAccumulator = ArrayPlanarComplex<ElementAccumulator, kCount>;
-  using ComputeFragment = ArrayPlanarComplex<ElementCompute, kCount>;
-
-  static FloatRoundStyle const kRound = Round;
-
-  /// Host-constructable parameters structure
-  struct Params {
-
-    ElementScalar alpha{ElementCompute(1)};         ///< scales accumulators
-    ElementScalar beta{ElementCompute(0)};          ///< scales source tensor
-    ElementScalar const* alpha_ptr{nullptr};        ///< pointer to accumulator scalar - if not null, loads it from memory
-    ElementScalar const* beta_ptr{nullptr};         ///< pointer to source scalar - if not null, loads it from memory
-
-    //
-    // Methods
-    //
-
-    Params() = default;
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementScalar alpha,
-      ElementScalar beta
-    ): alpha(alpha), beta(beta)
-    {}
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementScalar const *alpha_ptr,
-      ElementScalar const *beta_ptr
-    ): alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) 
-    {}
-  };
-
-private:
-
-  //
-  // Data members
-  //
-
-  ElementScalar alpha_;
-  ElementScalar beta_;
-
-public:
-
-  /// Constructs the function object, possibly loading from pointers in host memory
-  CUTLASS_HOST_DEVICE
-  LinearCombinationPlanarComplex(Params const &params) {
-    alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
-    beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
-  }
-
-  /// Returns true if source is needed
-  CUTLASS_HOST_DEVICE
-  bool is_source_needed() const {
-    if (Scale == ScaleType::OnlyAlphaScaling) return false;
-
-    return beta_.real() != ElementCompute(0) || beta_.imag() != ElementCompute(0);
-  }
-
-  /// Functionally required for serial reduction in the epilogue
-  CUTLASS_HOST_DEVICE
-  void set_k_partition(int k_partition, int k_partition_count) {
-    if (k_partition) {
-      beta_ = ElementCompute(1);
-    }
-  }
-
-  /// Computes linear scaling: D = alpha * accumulator + beta * source
-  CUTLASS_HOST_DEVICE
-  FragmentOutput operator()(
-    FragmentAccumulator const &accumulator, 
-    FragmentOutput const &source) const {
-
-    // Convert source to interal compute numeric type
-    NumericArrayConverter<ElementCompute, ElementOutput, kCount, Round> source_converter;
-    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
-
-    ComputeFragment converted_source{
-      source_converter(source.real), 
-      source_converter(source.imag)};
-
-    ComputeFragment converted_accumulator{
-      accumulator_converter(accumulator.real), 
-      accumulator_converter(accumulator.imag)};
-
-    multiplies<Array<ElementCompute, kCount> > mul_op;
-    multiply_add<Array<ElementCompute, kCount> > mul_add_op;
-
-    // Perform binary operations
-  
-    // complex multiply: I = beta * C
-    ComputeFragment intermediate {
-      mul_op(beta_.real(), converted_source.real),
-      mul_op(beta_.real(), converted_source.imag)
-    };
-
-    intermediate.real = mul_add_op(-beta_.imag(), converted_source.imag, intermediate.real);
-    intermediate.imag = mul_add_op( beta_.imag(), converted_source.real, intermediate.imag);
-
-    // complex multiply-add: I = alpha * AB + I
-    intermediate.real = mul_add_op(alpha_.real(), converted_accumulator.real, intermediate.real);
-    intermediate.imag = mul_add_op(alpha_.real(), converted_accumulator.imag, intermediate.imag);
-
-    intermediate.real = mul_add_op(-alpha_.imag(), converted_accumulator.imag, intermediate.real);
-    intermediate.imag = mul_add_op( alpha_.imag(), converted_accumulator.real, intermediate.imag);
-
-    // Convert to destination numeric type
-    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
-
-    return FragmentOutput{
-      destination_converter(intermediate.real), 
-      destination_converter(intermediate.imag)};
-  }
-
-  /// Computes linear scaling: D = alpha * accumulator + beta * source
-  CUTLASS_HOST_DEVICE
-  FragmentOutput operator()(
-    FragmentAccumulator const &accumulator) const {
-
-    // Convert source to interal compute numeric type
-    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
-
-    ComputeFragment converted_accumulator{
-      accumulator_converter(accumulator.real), 
-      accumulator_converter(accumulator.imag)};
-
-    // Perform binary operations
-    multiplies<Array<ElementCompute, kCount> > mul_op;
-    multiply_add<Array<ElementCompute, kCount> > mul_add_op;
-
-    // complex multiply-add: I = alpha * AB + I
-    ComputeFragment intermediate {
-      mul_op(alpha_.real(), converted_accumulator.real),
-      mul_op(alpha_.real(), converted_accumulator.imag)
-    };
-
-    intermediate.real = mul_add_op(-alpha_.imag(), converted_accumulator.imag, intermediate.real);
-    intermediate.imag = mul_add_op( alpha_.imag(), converted_accumulator.real, intermediate.imag);
-
-    // Convert to destination numeric type
-    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
-
-    return FragmentOutput{
-      destination_converter(intermediate.real), 
-      destination_converter(intermediate.imag)};
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace thread
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_relu.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_relu.h
deleted file mode 100755
index bbdc49862..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_relu.h
+++ /dev/null
@@ -1,572 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Functor performing linear combination with a maximum operation used by epilogues.
-*/
-
-#pragma once
-
-#include "cutlass/half.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/functional.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/epilogue/thread/activation.h"
-#include "cutlass/epilogue/thread/scale_type.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace thread {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace detail {
-
-/// Single source of truth for whether to unroll for `LinearCombinationClamp()`
-constexpr bool LinearCombinationReluIsHeavy() {
-  return false;
-}
-
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Applies a linear combination operator to an array of elements.
-///
-/// D = alpha * accumulator + beta * source + uniform
-///
-template <
-  typename ElementOutput_,                             ///< Data type used to load and store tensors
-  int Count,                                           ///< Number of elements computed per operation
-                                                       ///< Usually it is 128/sizeof_bits<ElementOutput_>,
-                                                       ///< but we use 64 or 32 sometimes when there are not enough data to store
-  typename ElementAccumulator_ = ElementOutput_,       ///< Accumulator data type
-  typename ElementCompute_ = ElementOutput_,           ///< Data type used to compute linear combination
-  ScaleType::Kind Scale = ScaleType::Default,          ///< Control Alpha and Beta scaling
-  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
->
-class LinearCombinationRelu {
-public:
-
-  using ElementOutput = ElementOutput_;
-  using ElementAccumulator = ElementAccumulator_;
-  using ElementCompute = ElementCompute_;
-
-  static int const kCount = Count;
-  static const ScaleType::Kind kScale = Scale;
-
-  using FragmentOutput = Array<ElementOutput, kCount>;
-  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
-  using FragmentCompute = Array<ElementCompute, kCount>;
-  using FragmentScaleBias = Array<ElementCompute, kCount>;
-  using FragmentSource = Array<ElementOutput, kCount>;
-
-  static FloatRoundStyle const kRound = Round;
-
-  static bool const kIsHeavy = detail::LinearCombinationReluIsHeavy();
-
-  /// Host-constructable parameters structure
-  struct Params {
-
-    ElementCompute alpha;                  ///< scales accumulators
-    ElementCompute beta;                   ///< scales source tensor
-    ElementCompute threshold;              ///< minimum value that is output 
-    ElementCompute const *alpha_ptr;       ///< pointer to accumulator scalar - if not null, loads it from memory
-    ElementCompute const *beta_ptr;        ///< pointer to source scalar - if not null, loads it from memory
-    //
-    // Methods
-    //
-
-    CUTLASS_HOST_DEVICE
-    Params(): 
-      alpha(ElementCompute(1)), 
-      beta(ElementCompute(0)),
-      threshold(ElementCompute(0)), 
-      alpha_ptr(nullptr), 
-      beta_ptr(nullptr) { }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute alpha,
-      ElementCompute beta = ElementCompute(0),
-      ElementCompute threshold = ElementCompute(0)
-    ): alpha(alpha), beta(beta), threshold(threshold), alpha_ptr(nullptr), beta_ptr(nullptr) {
-
-    }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute const *alpha_ptr,
-      ElementCompute const *beta_ptr = nullptr,
-      ElementCompute threshold = ElementCompute(0)
-    ): alpha(0), beta(0), threshold(threshold), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {
-
-    }
-  };
-
-private:
-
-  //
-  // Data members
-  //
-
-  ElementCompute alpha_;
-  ElementCompute beta_;
-  ElementCompute threshold_;
-
-public:
-
-  /// Constructs the function object, possibly loading from pointers in host memory
-  CUTLASS_HOST_DEVICE
-  LinearCombinationRelu(Params const &params) {
-
-    alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
-    beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
-    threshold_ = params.threshold;
-  }
-
-  /// Returns true if source is needed
-  CUTLASS_HOST_DEVICE
-  bool is_source_needed() const {
-    if (Scale == ScaleType::NoBetaScaling) return true;
-
-    if (Scale == ScaleType::OnlyAlphaScaling) return false;
-
-    if (Scale == ScaleType::OnlyAlphaPerChannelScaling) return false;
-
-    if (Scale == ScaleType::Nothing) return false;
-
-    return beta_ != ElementCompute(0);
-  }
-
-  /// Functionally required for serial reduction in the epilogue
-  CUTLASS_HOST_DEVICE
-  void set_k_partition(int k_partition, int k_partition_count) {
-    if (k_partition) {
-      beta_ = ElementCompute(1);
-    }
-
-    if (k_partition != k_partition_count - 1) {
-      // set to NaN to make ReLU no-op for all except last k partitions
-      int64_t allones = -1;
-      threshold_ = reinterpret_cast<ElementCompute const &>(allones);
-    }
-  }
-  
-  /// Computes linear scaling: D = alpha * accumulator + beta * source
-  CUTLASS_HOST_DEVICE
-  FragmentOutput operator()(
-    FragmentAccumulator const &accumulator, 
-    FragmentOutput const &source) const {
-
-    // Convert source to interal compute numeric type
-    NumericArrayConverter<ElementCompute, ElementOutput, kCount, Round> source_converter;
-    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
-
-    FragmentCompute converted_source = source_converter(source);
-    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
-
-    // Perform binary operations
-    FragmentCompute intermediate;
-
-    multiplies<FragmentCompute> mul_add_source;
-    multiply_add<FragmentCompute> mul_add_accumulator;
-    ReLu<FragmentCompute> relu;
-
-    if (Scale == ScaleType::NoBetaScaling) {
-      intermediate = converted_source;
-      intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
-    } else if (Scale == ScaleType::Nothing) {
-      intermediate = converted_accumulator;
-    } else {
-      intermediate = mul_add_source(beta_, converted_source);                             // X =  beta * C + uniform
-      intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
-    }
-
-    // Compute threshold optionally
-    intermediate = relu(threshold_, intermediate);
-
-    // Convert to destination numeric type
-    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
-
-    return destination_converter(intermediate);
-  }
-
-  /// Computes linear scaling: D = alpha * accumulator
-  CUTLASS_HOST_DEVICE
-  FragmentOutput operator()(
-    FragmentAccumulator const &accumulator) const {
-
-    // Convert source to interal compute numeric type
-    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
-
-    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
-
-    // Perform binary operations
-    FragmentCompute intermediate;
-
-    multiplies<FragmentCompute> mul_accumulator;
-    ReLu<FragmentCompute> relu;
-
-    if (Scale == ScaleType::Nothing) {
-      intermediate = converted_accumulator;
-    } else {
-      intermediate = mul_accumulator(alpha_, converted_accumulator);    // D = alpha * Accum
-    }
-
-    // Compute threshold optionally
-    intermediate = relu(threshold_, intermediate);
-
-    // Convert to destination numeric type
-    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
-
-    return destination_converter(intermediate);
-  }
-
-  /// Computes per-channel linear scaling and bias : D = scale * accumulator + bias
-  /// Scale and Bias are from input Fragment
-  CUTLASS_HOST_DEVICE
-  FragmentOutput operator()(
-    FragmentAccumulator const &accumulator,
-    FragmentScaleBias const &scale,
-    FragmentScaleBias const &bias) const {
-    
-    // Convert source to interal compute numeric type
-    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
-
-    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
-
-    // Perform per-channel scale and bias
-    FragmentCompute intermediate;
-
-    multiply_add<FragmentCompute> mul_add_accumulator;
-
-    if(Scale == ScaleType::OnlyAlphaPerChannelScaling)
-      intermediate = mul_add_accumulator(scale, converted_accumulator, bias);    // D = scale * Accum + bias
-    else
-      intermediate = mul_add_accumulator(alpha_, converted_accumulator, bias);   // D = alpha * Accum + bias
-
-    ReLu<FragmentCompute> relu;
-
-    // Compute threshold optionally
-    intermediate = relu(threshold_, intermediate);
-
-    // Convert to destination numeric type
-    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
-
-    return destination_converter(intermediate);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Conditional guards to enable partial specialization for packed integers
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 720) && ((__CUDACC_VER_MAJOR__ > 10) || ((__CUDACC_VER_MAJOR__ >= 10) && (__CUDACC_VER_MINOR__ >= 2)))
-
-/// Applies a linear combination operator to an array of elements.
-///
-/// D = alpha * accumulator + beta * source + uniform
-///
-/// Special handling for int types
-
-template <
-  typename ElementOutput_,                             ///< Data type used to load and store tensors
-  int Count,                                           ///< Number of elements computed per operation
-  ScaleType::Kind Scale,                               ///< Control Alpha and Beta scaling
-  FloatRoundStyle Round
->
-class LinearCombinationRelu <ElementOutput_, Count, int, float, Scale, Round> {
-public:
-
-  using ElementOutput = ElementOutput_;
-  using ElementAccumulator = int;
-  using ElementCompute = float;
-
-  static bool const kIsHeavy = detail::LinearCombinationReluIsHeavy();
-
-  static int const kCount = Count;
-  static const ScaleType::Kind kScale = Scale;
-
-  using FragmentOutput = Array<ElementOutput, kCount>;
-  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
-  using FragmentCompute = Array<ElementCompute, kCount>;
-  using FragmentScaleBias = Array<ElementCompute, kCount>;
-  using FragmentSource = Array<ElementOutput, kCount>;
-
-  static FloatRoundStyle const kRound = Round;
-
-  /// Host-constructable parameters structure
-  struct Params {
-
-    ElementCompute alpha;                  ///< scales accumulators
-    ElementCompute beta;                   ///< scales source tensor
-    ElementCompute threshold;              ///< minimum value that is output 
-    ElementCompute const *alpha_ptr;       ///< pointer to accumulator scalar - if not null, loads it from memory
-    ElementCompute const *beta_ptr;        ///< pointer to source scalar - if not null, loads it from memory
-    //
-    // Methods
-    //
-
-    CUTLASS_HOST_DEVICE
-    Params(): 
-      alpha(ElementCompute(1)), 
-      beta(ElementCompute(0)),
-      threshold(ElementCompute(0)), 
-      alpha_ptr(nullptr), 
-      beta_ptr(nullptr) { }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute alpha,
-      ElementCompute beta = ElementCompute(0),
-      ElementCompute threshold = ElementCompute(0)
-    ): alpha(alpha), beta(beta), threshold(threshold), alpha_ptr(nullptr), beta_ptr(nullptr) {
-
-    }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute const *alpha_ptr,
-      ElementCompute const *beta_ptr = nullptr,
-      ElementCompute threshold = ElementCompute(0)
-    ): alpha(0), beta(0), threshold(threshold), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {
-
-    }
-  };
-
-private:
-
-  //
-  // Data members
-  //
-
-  ElementCompute alpha_;
-  ElementCompute beta_;
-  ElementCompute threshold_;
-
-public:
-
-  /// Constructs the function object, possibly loading from pointers in host memory
-  CUTLASS_HOST_DEVICE
-  LinearCombinationRelu(Params const &params) {
-
-    alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
-    beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
-    threshold_ = params.threshold;
-  }
-
-  /// Returns true if source is needed
-  CUTLASS_HOST_DEVICE
-  bool is_source_needed() const {
-    if (Scale == ScaleType::NoBetaScaling) return true;
-
-    if (Scale == ScaleType::OnlyAlphaScaling) return false;
-
-    if (Scale == ScaleType::OnlyAlphaPerChannelScaling) return false;
-
-    if (Scale == ScaleType::Nothing) return false;
-
-    return beta_ != ElementCompute(0);
-  }
-
-  /// Functionally required for serial reduction in the epilogue
-  CUTLASS_HOST_DEVICE
-  void set_k_partition(int k_partition, int k_partition_count) {
-    if (k_partition) {
-      beta_ = ElementCompute(1);
-    }
-
-    if (k_partition != k_partition_count - 1) {
-      // set to NaN to make ReLU no-op for all except last k partitions
-      int64_t allones = -1;
-      threshold_ = reinterpret_cast<ElementCompute const &>(allones);
-    }
-  }
-  
-  /// Computes linear scaling: D = alpha * accumulator + beta * source
-  CUTLASS_HOST_DEVICE
-  FragmentOutput operator()(
-    FragmentAccumulator const &accumulator, 
-    FragmentOutput const &source) const {
-
-    // Convert source to interal compute numeric type
-    NumericArrayConverter<ElementCompute, ElementOutput, kCount, Round> source_converter;
-    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
-
-    FragmentCompute converted_source = source_converter(source);
-    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
-
-    // Perform binary operations
-    FragmentCompute intermediate;
-
-    multiplies<FragmentCompute> mul_add_source;
-    multiply_add<FragmentCompute> mul_add_accumulator;
-    ReLu<FragmentCompute> relu;
-
-    if (Scale == ScaleType::NoBetaScaling) {
-      intermediate = converted_source;
-      intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
-    }  else if (Scale == ScaleType::Nothing) {
-      intermediate = converted_accumulator;
-    } else {
-      intermediate = mul_add_source(beta_, converted_source);                             // X =  beta * C + uniform
-      intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
-    }
-
-    // Compute threshold optionally
-    intermediate = relu(threshold_, intermediate);
-
-    if (cutlass::platform::numeric_limits<ElementOutput>::is_integer) {
-      // Convert floats back to INT
-      FragmentAccumulator scaled_accumulator;
-
-      NumericArrayConverter<int, ElementCompute, kCount, Round> compute_converter;
-
-      scaled_accumulator = compute_converter(intermediate);
-
-      // Convert to destination numeric type
-      NumericArrayConverter<ElementOutput, int, kCount, Round>
-          destination_converter;
-
-      return destination_converter(scaled_accumulator);
-    } else {
-      NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round>
-          destination_converter;
-      return destination_converter(intermediate);
-    }
-  }
-
-  /// Computes linear scaling: D = alpha * accumulator
-  CUTLASS_HOST_DEVICE
-  FragmentOutput operator()(
-    FragmentAccumulator const &accumulator) const {
-
-    // Convert source to interal compute numeric type
-    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
-
-    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
-
-    // Perform binary operations
-    FragmentCompute intermediate;
-
-    multiplies<FragmentCompute> mul_accumulator;
-    ReLu<FragmentCompute> relu;
-
-    if (Scale == ScaleType::Nothing) {
-      intermediate = converted_accumulator;
-    } else {
-      intermediate = mul_accumulator(alpha_, converted_accumulator);    // D = alpha * Accum
-    }
-
-    // Compute threshold optionally
-    intermediate = relu(threshold_, intermediate);
-
-    if (cutlass::platform::numeric_limits<ElementOutput>::is_integer) {
-      // Convert floats back to INT
-      FragmentAccumulator scaled_accumulator;
-
-      NumericArrayConverter<int, ElementCompute, kCount, Round> compute_converter;
-
-      scaled_accumulator = compute_converter(intermediate);
-
-      // Convert to destination numeric type
-      NumericArrayConverter<ElementOutput, int, kCount, Round>
-          destination_converter;
-
-      return destination_converter(scaled_accumulator);
-    } else {
-      NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round>
-          destination_converter;
-      return destination_converter(intermediate);
-    }
-  }
-
-  /// Computes per-channel linear scaling and bias : D = scale * accumulator + bias
-  /// Scale and Bias are from input Fragment
-  CUTLASS_HOST_DEVICE
-  FragmentOutput operator()(
-    FragmentAccumulator const &accumulator,
-    FragmentScaleBias const &scale,
-    FragmentScaleBias const &bias) const {
-    
-    // Convert source to interal compute numeric type
-    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
-
-    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
-
-    // Perform per-channel scale and bias
-    FragmentCompute intermediate;
-
-    multiply_add<FragmentCompute> mul_add_accumulator;
-
-    if(Scale == ScaleType::OnlyAlphaPerChannelScaling)
-      intermediate = mul_add_accumulator(scale, converted_accumulator, bias);    // D = scale * Accum + bias
-    else
-      intermediate = mul_add_accumulator(alpha_, converted_accumulator, bias);   // D = alpha * Accum + bias
-
-    ReLu<FragmentCompute> relu;
-
-    // Compute threshold optionally
-    intermediate = relu(threshold_, intermediate);
-
-    if (cutlass::platform::numeric_limits<ElementOutput>::is_integer) {
-      // Convert floats back to INT
-      FragmentAccumulator scaled_accumulator;
-
-      NumericArrayConverter<int, ElementCompute, kCount, Round> compute_converter;
-
-      scaled_accumulator = compute_converter(intermediate);
-
-      // Convert to destination numeric type
-      NumericArrayConverter<ElementOutput, int, kCount, Round>
-          destination_converter;
-
-      return destination_converter(scaled_accumulator);
-    } else {
-      NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round>
-          destination_converter;
-      return destination_converter(intermediate);
-    }
-  }
-};
-
-#endif // Conditional guards to enable partial specialization for packed integers
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace thread
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_relu0.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_relu0.h
deleted file mode 100755
index 76ad59244..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_relu0.h
+++ /dev/null
@@ -1,543 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Functor performing linear combination with a relu operation used by epilogues.
-  This one only supports relu0 and tries to folding relu into other instructions.  Thus,
-  serial splitk is not supported by this one.  For example, relu can be folded into 
-  hfma2/hmul2 for sm80+
-*/
-
-#pragma once
-
-#include "cutlass/half.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/functional.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/epilogue/thread/activation.h"
-#include "cutlass/epilogue/thread/scale_type.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace thread {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace detail {
-
-/// Single source of truth for whether to unroll for `LinearCombinationClamp()`
-constexpr bool LinearCombinationRelu0IsHeavy() {
-  return false;
-}
-
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Applies a linear combination operator to an array of elements.
-///
-/// D = alpha * accumulator + beta * source + uniform
-///
-template <
-  typename ElementOutput_,                             ///< Data type used to load and store tensors
-  int Count,                                           ///< Number of elements computed per operation
-                                                       ///< Usually it is 128/sizeof_bits<ElementOutput_>,
-                                                       ///< but we use 64 or 32 sometimes when there are not enough data to store
-  typename ElementAccumulator_ = ElementOutput_,       ///< Accumulator data type
-  typename ElementCompute_ = ElementOutput_,           ///< Data type used to compute linear combination
-  ScaleType::Kind Scale = ScaleType::Default,          ///< Control Alpha and Beta scaling
-  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
->
-class LinearCombinationRelu0 {
-public:
-
-  using ElementOutput = ElementOutput_;
-  using ElementAccumulator = ElementAccumulator_;
-  using ElementCompute = ElementCompute_;
-
-  static int const kCount = Count;
-  static const ScaleType::Kind kScale = Scale;
-
-  using FragmentOutput = Array<ElementOutput, kCount>;
-  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
-  using FragmentCompute = Array<ElementCompute, kCount>;
-  using FragmentScaleBias = Array<ElementCompute, kCount>;
-  using FragmentSource = Array<ElementOutput, kCount>;
-
-  static FloatRoundStyle const kRound = Round;
-
-  static bool const kIsHeavy = detail::LinearCombinationRelu0IsHeavy();
-
-  /// Host-constructable parameters structure
-  struct Params {
-
-    ElementCompute alpha;                  ///< scales accumulators
-    ElementCompute beta;                   ///< scales source tensor
-    ElementCompute const *alpha_ptr;       ///< pointer to accumulator scalar - if not null, loads it from memory
-    ElementCompute const *beta_ptr;        ///< pointer to source scalar - if not null, loads it from memory
-    //
-    // Methods
-    //
-
-    CUTLASS_HOST_DEVICE
-    Params(): 
-      alpha(ElementCompute(1)), 
-      beta(ElementCompute(0)),
-      alpha_ptr(nullptr), 
-      beta_ptr(nullptr) { }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute alpha,
-      ElementCompute beta = ElementCompute(0)
-    ): alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) {
-
-    }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute const *alpha_ptr,
-      ElementCompute const *beta_ptr = nullptr
-    ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {
-
-    }
-  };
-
-private:
-
-  //
-  // Data members
-  //
-
-  ElementCompute alpha_;
-  ElementCompute beta_;
-
-public:
-
-  /// Constructs the function object, possibly loading from pointers in host memory
-  CUTLASS_HOST_DEVICE
-  LinearCombinationRelu0(Params const &params) {
-
-    alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
-    beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
-  }
-
-  /// Returns true if source is needed
-  CUTLASS_HOST_DEVICE
-  bool is_source_needed() const {
-    if (Scale == ScaleType::NoBetaScaling) return true;
-
-    if (Scale == ScaleType::OnlyAlphaScaling) return false;
-
-    if (Scale == ScaleType::Nothing) return false;
-
-    return beta_ != ElementCompute(0);
-  }
-
-  /// This is used for serial reduction which is not supported by Relu0
-  CUTLASS_HOST_DEVICE
-  void set_k_partition(int k_partition, int k_partition_count) {
-    assert(k_partition == 0);
-  }
-  
-  /// Computes linear scaling: D = alpha * accumulator + beta * source
-  CUTLASS_HOST_DEVICE
-  FragmentOutput operator()(
-    FragmentAccumulator const &accumulator, 
-    FragmentOutput const &source) const {
-
-    // Convert source to interal compute numeric type
-    NumericArrayConverter<ElementCompute, ElementOutput, kCount, Round> source_converter;
-    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
-
-    FragmentCompute converted_source = source_converter(source);
-    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
-
-    // Perform binary operations
-    FragmentCompute intermediate;
-
-    multiplies<FragmentCompute> mul_add_source;
-    multiply_add_relu0<FragmentCompute> mul_add_relu0_accumulator;
-    ReLu<FragmentCompute> relu;
-
-    if (Scale == ScaleType::NoBetaScaling) {
-      intermediate = converted_source;
-      intermediate = mul_add_relu0_accumulator(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
-    } else if (Scale == ScaleType::Nothing) {
-      intermediate = converted_accumulator;
-
-      // Compute threshold optionally
-      intermediate = relu(intermediate);
-    } else {
-      intermediate = mul_add_source(beta_, converted_source);                             // X =  beta * C + uniform
-      intermediate = mul_add_relu0_accumulator(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
-    }
-
-    // Convert to destination numeric type
-    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
-
-    return destination_converter(intermediate);
-  }
-
-  /// Computes linear scaling: D = alpha * accumulator
-  CUTLASS_HOST_DEVICE
-  FragmentOutput operator()(
-    FragmentAccumulator const &accumulator) const {
-
-    // Convert source to interal compute numeric type
-    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
-
-    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
-
-    // Perform binary operations
-    FragmentCompute intermediate;
-
-    multiplies<FragmentCompute> mul_accumulator;
-    ReLu<FragmentCompute> relu;
-
-    if (Scale == ScaleType::Nothing) {
-      intermediate = converted_accumulator;
-    } else {
-      intermediate = mul_accumulator(alpha_, converted_accumulator);    // D = alpha * Accum
-    }
-
-    // Compute threshold optionally
-    intermediate = relu(intermediate);
-
-    // Convert to destination numeric type
-    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
-
-    return destination_converter(intermediate);
-  }
-
-  /// Computes per-channel linear scaling and bias : D = scale * accumulator + bias
-  /// Scale and Bias are from input Fragment
-  CUTLASS_HOST_DEVICE
-  FragmentOutput operator()(
-    FragmentAccumulator const &accumulator,
-    FragmentScaleBias const &scale,
-    FragmentScaleBias const &bias) const {
-    
-    // Convert source to interal compute numeric type
-    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
-
-    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
-
-    // Perform per-channel scale and bias
-    FragmentCompute intermediate;
-
-    multiply_add<FragmentCompute> mul_add_accumulator;
-
-    if(Scale == ScaleType::OnlyAlphaPerChannelScaling)
-      intermediate = mul_add_accumulator(scale, converted_accumulator, bias);    // D = scale * Accum + bias
-    else
-      intermediate = mul_add_accumulator(alpha_, converted_accumulator, bias);   // D = alpha * Accum + bias
-
-    ReLu<FragmentCompute> relu;
-
-    // Compute threshold optionally
-    intermediate = relu(intermediate);
-
-    // Convert to destination numeric type
-    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
-
-    return destination_converter(intermediate);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Conditional guards to enable partial specialization for packed integers
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 720) && ((__CUDACC_VER_MAJOR__ > 10) || ((__CUDACC_VER_MAJOR__ >= 10) && (__CUDACC_VER_MINOR__ >= 2)))
-
-/// Applies a linear combination operator to an array of elements.
-///
-/// D = alpha * accumulator + beta * source + uniform
-///
-/// Special handling for int types
-
-template <
-  typename ElementOutput_,                             ///< Data type used to load and store tensors
-  int Count,                                           ///< Number of elements computed per operation
-  ScaleType::Kind Scale,                               ///< Control Alpha and Beta scaling
-  FloatRoundStyle Round
->
-class LinearCombinationRelu0 <ElementOutput_, Count, int, float, Scale, Round> {
-public:
-
-  using ElementOutput = ElementOutput_;
-  using ElementAccumulator = int;
-  using ElementCompute = float;
-
-  static bool const kIsHeavy = detail::LinearCombinationRelu0IsHeavy();
-
-  static int const kCount = Count;
-  static const ScaleType::Kind kScale = Scale;
-
-  using FragmentOutput = Array<ElementOutput, kCount>;
-  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
-  using FragmentCompute = Array<ElementCompute, kCount>;
-  using FragmentScaleBias = Array<ElementCompute, kCount>;
-  using FragmentSource = Array<ElementOutput, kCount>;
-
-  static FloatRoundStyle const kRound = Round;
-
-  /// Host-constructable parameters structure
-  struct Params {
-
-    ElementCompute alpha;                  ///< scales accumulators
-    ElementCompute beta;                   ///< scales source tensor
-    ElementCompute const *alpha_ptr;       ///< pointer to accumulator scalar - if not null, loads it from memory
-    ElementCompute const *beta_ptr;        ///< pointer to source scalar - if not null, loads it from memory
-    //
-    // Methods
-    //
-
-    CUTLASS_HOST_DEVICE
-    Params(): 
-      alpha(ElementCompute(1)), 
-      beta(ElementCompute(0)),
-      alpha_ptr(nullptr), 
-      beta_ptr(nullptr) { }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute alpha,
-      ElementCompute beta = ElementCompute(0)
-    ): alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) {
-
-    }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute const *alpha_ptr,
-      ElementCompute const *beta_ptr = nullptr
-    ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {
-
-    }
-  };
-
-private:
-
-  //
-  // Data members
-  //
-
-  ElementCompute alpha_;
-  ElementCompute beta_;
-
-public:
-
-  /// Constructs the function object, possibly loading from pointers in host memory
-  CUTLASS_HOST_DEVICE
-  LinearCombinationRelu0(Params const &params) {
-
-    alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
-    beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
-  }
-
-  /// Returns true if source is needed
-  CUTLASS_HOST_DEVICE
-  bool is_source_needed() const {
-    if (Scale == ScaleType::NoBetaScaling) return true;
-
-    if (Scale == ScaleType::OnlyAlphaScaling) return false;
-
-    if (Scale == ScaleType::Nothing) return false;
-
-    return beta_ != ElementCompute(0);
-  }
-
-  /// This is used for serial reduction which is not supported by Relu0
-  CUTLASS_HOST_DEVICE
-  void set_k_partition(int k_partition, int k_partition_count) {
-    assert(k_partition == 0);
-  }
-  
-  /// Computes linear scaling: D = alpha * accumulator + beta * source
-  CUTLASS_HOST_DEVICE
-  FragmentOutput operator()(
-    FragmentAccumulator const &accumulator, 
-    FragmentOutput const &source) const {
-
-    // Convert source to interal compute numeric type
-    NumericArrayConverter<ElementCompute, ElementOutput, kCount, Round> source_converter;
-    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
-
-    FragmentCompute converted_source = source_converter(source);
-    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
-
-    // Perform binary operations
-    FragmentCompute intermediate;
-
-    multiplies<FragmentCompute> mul_add_source;
-    multiply_add<FragmentCompute> mul_add_accumulator;
-    ReLu<FragmentCompute> relu;
-
-    if (Scale == ScaleType::NoBetaScaling) {
-      intermediate = converted_source;
-      intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
-    }  else if (Scale == ScaleType::Nothing) {
-      intermediate = converted_accumulator;
-    } else {
-      intermediate = mul_add_source(beta_, converted_source);                             // X =  beta * C + uniform
-      intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
-    }
-
-    // Compute threshold optionally
-    intermediate = relu(intermediate);
-
-    if (cutlass::platform::numeric_limits<ElementOutput>::is_integer) {
-      // Convert floats back to INT
-      FragmentAccumulator scaled_accumulator;
-
-      NumericArrayConverter<int, ElementCompute, kCount, Round> compute_converter;
-
-      scaled_accumulator = compute_converter(intermediate);
-
-      // Convert to destination numeric type
-      NumericArrayConverter<ElementOutput, int, kCount, Round>
-          destination_converter;
-
-      return destination_converter(scaled_accumulator);
-    } else {
-      NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round>
-          destination_converter;
-      return destination_converter(intermediate);
-    }
-  }
-
-  /// Computes linear scaling: D = alpha * accumulator
-  CUTLASS_HOST_DEVICE
-  FragmentOutput operator()(
-    FragmentAccumulator const &accumulator) const {
-
-    // Convert source to interal compute numeric type
-    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
-
-    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
-
-    // Perform binary operations
-    FragmentCompute intermediate;
-
-    multiplies<FragmentCompute> mul_accumulator;
-    ReLu<FragmentCompute> relu;
-
-    if (Scale == ScaleType::Nothing) {
-      intermediate = converted_accumulator;
-    } else {
-      intermediate = mul_accumulator(alpha_, converted_accumulator);    // D = alpha * Accum
-    }
-
-    // Compute threshold optionally
-    intermediate = relu(intermediate);
-
-    if (cutlass::platform::numeric_limits<ElementOutput>::is_integer) {
-      // Convert floats back to INT
-      FragmentAccumulator scaled_accumulator;
-
-      NumericArrayConverter<int, ElementCompute, kCount, Round> compute_converter;
-
-      scaled_accumulator = compute_converter(intermediate);
-
-      // Convert to destination numeric type
-      NumericArrayConverter<ElementOutput, int, kCount, Round>
-          destination_converter;
-
-      return destination_converter(scaled_accumulator);
-    } else {
-      NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round>
-          destination_converter;
-      return destination_converter(intermediate);
-    }
-  }
-
-  /// Computes per-channel linear scaling and bias : D = scale * accumulator + bias
-  /// Scale and Bias are from input Fragment
-  CUTLASS_HOST_DEVICE
-  FragmentOutput operator()(
-    FragmentAccumulator const &accumulator,
-    FragmentScaleBias const &scale,
-    FragmentScaleBias const &bias) const {
-    
-    // Convert source to interal compute numeric type
-    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
-
-    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
-
-    // Perform per-channel scale and bias
-    FragmentCompute intermediate;
-
-    multiply_add<FragmentCompute> mul_add_accumulator;
-
-    if(Scale == ScaleType::OnlyAlphaPerChannelScaling)
-      intermediate = mul_add_accumulator(scale, converted_accumulator, bias);    // D = scale * Accum + bias
-    else
-      intermediate = mul_add_accumulator(alpha_, converted_accumulator, bias);   // D = alpha * Accum + bias
-
-    ReLu<FragmentCompute> relu;
-
-    // Compute threshold optionally
-    intermediate = relu(intermediate);
-
-    if (cutlass::platform::numeric_limits<ElementOutput>::is_integer) {
-      // Convert floats back to INT
-      FragmentAccumulator scaled_accumulator;
-
-      NumericArrayConverter<int, ElementCompute, kCount, Round> compute_converter;
-
-      scaled_accumulator = compute_converter(intermediate);
-
-      // Convert to destination numeric type
-      NumericArrayConverter<ElementOutput, int, kCount, Round>
-          destination_converter;
-
-      return destination_converter(scaled_accumulator);
-    } else {
-      NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round>
-          destination_converter;
-      return destination_converter(intermediate);
-    }
-  }
-};
-
-#endif // Conditional guards to enable partial specialization for packed integers
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace thread
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_residual_block.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_residual_block.h
deleted file mode 100755
index ec4083de6..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_residual_block.h
+++ /dev/null
@@ -1,301 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-  \brief Epilogue functor specialized for residual blocks in deep neural networks.
-*/
-
-#pragma once
-
-#include "cutlass/array.h"
-#include "cutlass/functional.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/epilogue/thread/detail.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace thread {
-
-/// Models a residual block of the form: UnaryOp(BinaryOp(BinaryOp(ActivationOp(TensorOp(X) + bias), residual1), residual2))
-template <typename ElementOutput_, typename ElementAccumulator_,
-          typename ElementCompute_, typename ElementC_, int ElementsPerAccess,
-          template <typename T> class ActivationOp_,
-          template <typename T> class BinaryOp1_,
-          template <typename T> class UnaryOp_,
-          template <typename T> class BinaryOp2_ = detail::NoOp,
-          bool StoreT_ = false,
-          typename ElementVector_ = ElementC_>
-class LinearCombinationResidualBlock {
-public:
-  static bool const kIsSingleSource = false;
-
-  using ElementOutput = ElementC_;
-  using ElementC = ElementC_;
-  using ElementAccumulator = ElementAccumulator_;
-  using ElementCompute = ElementCompute_;
-  using ElementVector = ElementVector_;
-  static int const kElementsPerAccess = ElementsPerAccess;
-  static int const kCount = kElementsPerAccess;
-
-  using UnaryOp = UnaryOp_<Array<ElementCompute, kCount>>;
-  using BinaryOp1 = BinaryOp1_<Array<ElementCompute, kCount>>;
-  using BinaryOp2 = BinaryOp2_<Array<ElementCompute, kCount>>;
-  using ActivationOp = ActivationOp_<Array<ElementCompute, kCount>>;
-
-  using FragmentAccumulator = Array<ElementAccumulator, kElementsPerAccess>;
-  using FragmentCompute = Array<ElementCompute, kElementsPerAccess>;
-  using FragmentC = Array<ElementC, kElementsPerAccess>;
-  using FragmentOutput = Array<ElementOutput, kElementsPerAccess>;
-
-  using ElementZ = ElementOutput_;
-  using ElementT = ElementZ;
-  using FragmentZ = Array<ElementZ, kElementsPerAccess>;
-  using FragmentT = Array<ElementT, kElementsPerAccess>;
-
-  static bool const kIsHeavy = true;
-  static bool const kStoreZ = true;
-  static bool const kStoreT = StoreT_;
-
-  /// Host-constructable parameters structure
-  struct Params {
-
-    ElementCompute alpha;                  ///< scales accumulators
-    ElementCompute beta;                   ///< scales residual input
-    ElementCompute const *alpha_ptr{nullptr};       ///< pointer to accumulator scalar - if not null, loads it from memory
-    ElementCompute const *beta_ptr{nullptr};        ///< pointer to residual scalar - if not null, loads it from memory
-
-    CUTLASS_HOST_DEVICE
-    Params() : alpha(ElementCompute(1)), beta(ElementCompute(1)) {}
-
-    CUTLASS_HOST_DEVICE
-    Params(ElementCompute alpha, ElementCompute beta)
-        : alpha(alpha), beta(beta) {}
-
-    CUTLASS_HOST_DEVICE
-    Params(ElementCompute const *alpha_ptr, ElementCompute const *beta_ptr)
-        : alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {}
-  };
-
-private:
-
-  ElementCompute alpha_;
-  ElementCompute beta_;
-  bool skip_elementwise_;
-
-public:
-
-  /// Constructor from Params
-  CUTLASS_HOST_DEVICE
-  LinearCombinationResidualBlock(Params const &params) {
-    alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
-    beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
-    skip_elementwise_ = false;
-  }
-
-  /// The "source" tensor corresponds to the residual input
-  CUTLASS_HOST_DEVICE
-  bool is_source_needed() const { return true; }
-
-  /// Functionally required for serial reduction in the epilogue
-  /// IMPORTANT: Split-k is supported only when ActivationOp is Identity.
-  CUTLASS_HOST_DEVICE
-  void set_k_partition(int k_partition, int k_partition_count) {
-    if (k_partition) {
-      beta_ = ElementCompute(1);
-    }
-
-    if (k_partition != k_partition_count - 1) {
-      skip_elementwise_ = true;
-    }
-  }
-
-  /// Applies the operation UnaryOp(BinaryOp(BinaryOp(ActivationOp(AB + bias), residual1), residual2))
-  CUTLASS_HOST_DEVICE
-  void operator()(FragmentOutput &frag_Z, FragmentOutput &, FragmentAccumulator const &AB,
-                  FragmentC const &residual1, FragmentC const &residual2,
-                  FragmentCompute const &bias) const {
-    UnaryOp unary_op;
-    BinaryOp1 binary_op1;
-    BinaryOp2 binary_op2;
-    ActivationOp activation;
-
-    FragmentCompute tmp_Accum =
-        NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess>()(AB);
-    FragmentCompute tmp_residual1 =
-        NumericArrayConverter<ElementCompute, ElementC, kElementsPerAccess>()(residual1);
-    FragmentCompute tmp_residual2 =
-        NumericArrayConverter<ElementCompute, ElementC, kElementsPerAccess>()(residual2);
-
-    FragmentCompute z =
-        binary_op2(binary_op1(activation(alpha_ * tmp_Accum + bias), beta_ * tmp_residual1), beta_ * tmp_residual2);
-    FragmentCompute result_Z = skip_elementwise_ ? z : unary_op(z);
-
-    NumericArrayConverter<ElementOutput, ElementCompute, kElementsPerAccess> convert_z;
-    frag_Z = convert_z(result_Z);
-  }
-
-  /// Should never be called
-  CUTLASS_HOST_DEVICE
-  void operator()(FragmentOutput &, FragmentOutput &, FragmentAccumulator const &,
-                  FragmentCompute const &) const {}
-};
-
-/// Models a residual block of the form: UnaryOp(BinaryOp(ActivationOp(TensorOp(X) + bias), residual))
-template <typename ElementOutput_, typename ElementAccumulator_,
-          typename ElementCompute_, typename ElementC_, int ElementsPerAccess,
-          template <typename T> class ActivationOp_,
-          template <typename T> class BinaryOp1_,
-          template <typename T> class UnaryOp_,
-          bool StoreT_,
-          typename ElementVector_>
-class LinearCombinationResidualBlock<ElementOutput_, ElementAccumulator_,
-          ElementCompute_, ElementC_, ElementsPerAccess,
-          ActivationOp_, BinaryOp1_, UnaryOp_,
-          detail::NoOp, StoreT_, ElementVector_> {
-public:
-  static bool const kIsSingleSource = true;
-
-  using ElementOutput = ElementC_;
-  using ElementC = ElementC_;
-  using ElementAccumulator = ElementAccumulator_;
-  using ElementCompute = ElementCompute_;
-  using ElementVector = ElementVector_;
-  static int const kElementsPerAccess = ElementsPerAccess;
-  static int const kCount = kElementsPerAccess;
-
-  using UnaryOp = UnaryOp_<Array<ElementCompute, kCount>>;
-  using BinaryOp = BinaryOp1_<Array<ElementCompute, kCount>>;
-  using ActivationOp = ActivationOp_<Array<ElementCompute, kCount>>;
-
-  using FragmentAccumulator = Array<ElementAccumulator, kElementsPerAccess>;
-  using FragmentCompute = Array<ElementCompute, kElementsPerAccess>;
-  using FragmentC = Array<ElementC, kElementsPerAccess>;
-  using FragmentOutput = Array<ElementOutput, kElementsPerAccess>;
-
-  using ElementZ = ElementOutput_;
-  using ElementT = ElementZ;
-  using FragmentZ = Array<ElementZ, kElementsPerAccess>;
-  using FragmentT = Array<ElementT, kElementsPerAccess>;
-
-  static bool const kIsHeavy = true;
-  static bool const kStoreZ = true;
-  static bool const kStoreT = StoreT_;
-
-  /// Host-constructable parameters structure
-  struct Params {
-
-    ElementCompute alpha;                  ///< scales accumulators
-    ElementCompute beta;                   ///< scales residual input
-    ElementCompute const *alpha_ptr{nullptr};       ///< pointer to accumulator scalar - if not null, loads it from memory
-    ElementCompute const *beta_ptr{nullptr};        ///< pointer to residual scalar - if not null, loads it from memory
-
-    CUTLASS_HOST_DEVICE
-    Params() : alpha(ElementCompute(1)), beta(ElementCompute(1)) {}
-
-    CUTLASS_HOST_DEVICE
-    Params(ElementCompute alpha, ElementCompute beta)
-        : alpha(alpha), beta(beta) {}
-
-    CUTLASS_HOST_DEVICE
-    Params(ElementCompute const *alpha_ptr, ElementCompute const *beta_ptr)
-        : alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {}
-  };
-
-private:
-
-  ElementCompute alpha_;
-  ElementCompute beta_;
-  bool skip_elementwise_;
-
-public:
-
-  /// Constructor from Params
-  CUTLASS_HOST_DEVICE
-  LinearCombinationResidualBlock(Params const &params) {
-    alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
-    beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
-    skip_elementwise_ = false;
-  }
-
-  /// The "source" tensor corresponds to the residual input
-  CUTLASS_HOST_DEVICE
-  bool is_source_needed() const { return true; }
-
-  /// Functionally required for serial reduction in the epilogue
-  /// IMPORTANT: Split-k is supported only when ActivationOp is Identity.
-  CUTLASS_HOST_DEVICE
-  void set_k_partition(int k_partition, int k_partition_count) {
-    if (k_partition) {
-      beta_ = ElementCompute(1);
-    }
-
-    if (k_partition != k_partition_count - 1) {
-      skip_elementwise_ = true;
-    }
-  }
-
-  /// Applies the operation UnaryOp(BinaryOp(ActivationOp(AB + bias), residual))
-  CUTLASS_HOST_DEVICE
-  void operator()(FragmentOutput &frag_Z, FragmentOutput &, FragmentAccumulator const &AB,
-                  FragmentC const &residual,
-                  FragmentCompute const &bias) const {
-    UnaryOp unary_op;
-    BinaryOp binary_op;
-    ActivationOp activation;
-
-    FragmentCompute tmp_Accum =
-        NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess>()(AB);
-    FragmentCompute tmp_residual =
-        NumericArrayConverter<ElementCompute, ElementC, kElementsPerAccess>()(residual);
-
-    FragmentCompute z =
-        binary_op(activation(alpha_ * tmp_Accum + bias), beta_ * tmp_residual);
-    FragmentCompute result_Z = skip_elementwise_ ? z : unary_op(z);
-
-    NumericArrayConverter<ElementOutput, ElementCompute, kElementsPerAccess> convert_z;
-    frag_Z = convert_z(result_Z);
-  }
-
-  /// Should never be called
-  CUTLASS_HOST_DEVICE
-  void operator()(FragmentOutput &, FragmentOutput &, FragmentAccumulator const &,
-                  FragmentCompute const &) const {}
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace thread
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_sigmoid.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_sigmoid.h
deleted file mode 100755
index 35251177f..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_sigmoid.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Functor performing linear combination with Sigmoid operations used by epilogues.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/epilogue/thread/activation.h"
-#include "cutlass/epilogue/thread/linear_combination_generic.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace thread {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Applies a linear combination operator followed by the Sigmoid activation, to an array of elements.
-///
-/// D = sigmoid(alpha * accumulator + beta * source + uniform)
-///
-template <
-  typename ElementOutput_,                             ///< Data type used to load and store tensors
-  int Count,                                           ///< Number of elements computed per operation
-                                                       ///< Usually it is 128/sizeof_bits<ElementOutput_>,
-                                                       ///< but we use 64 or 32 sometimes when there are not enough data to store
-  typename ElementAccumulator_ = ElementOutput_,       ///< Accumulator data type
-  typename ElementCompute_ = ElementOutput_,           ///< Data type used to compute linear combination
-  ScaleType::Kind Scale = ScaleType::Default,          ///< Control Alpha and Beta scaling
-  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
->
-using LinearCombinationSigmoid = LinearCombinationGeneric<Sigmoid, ElementOutput_, Count, ElementAccumulator_,
-                                                          ElementCompute_, Scale, Round, true>;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace thread
-} // namespace epilogue
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_silu.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_silu.h
deleted file mode 100755
index fa346b068..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_silu.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Functor performing linear combination with SiLU operations used by epilogues.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/epilogue/thread/activation.h"
-#include "cutlass/epilogue/thread/linear_combination_generic.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace thread {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Applies a linear combination operator folllowed by the SiLU activation to an array of elements.
-///
-/// D = silu(alpha * accumulator + beta * source + uniform)
-///
-template <
-  typename ElementOutput_,                             ///< Data type used to load and store tensors
-  int Count,                                           ///< Number of elements computed per operation
-                                                       ///< Usually it is 128/sizeof_bits<ElementOutput_>,
-                                                       ///< but we use 64 or 32 sometimes when there are not enough data to store
-  typename ElementAccumulator_ = ElementOutput_,       ///< Accumulator data type
-  typename ElementCompute_ = ElementOutput_,           ///< Data type used to compute linear combination
-  ScaleType::Kind Scale = ScaleType::Default,          ///< Control Alpha and Beta scaling
-  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
->
-using LinearCombinationSilu = LinearCombinationGeneric<SiLu, ElementOutput_, Count, ElementAccumulator_,
-                                                       ElementCompute_, Scale, Round, true>;
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace thread
-} // namespace epilogue
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_tensor_broadcast.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_tensor_broadcast.hpp
deleted file mode 100755
index c3ceea0ab..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_tensor_broadcast.hpp
+++ /dev/null
@@ -1,253 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-  \brief Functor performing linear combination operation, bias addition, and tensor-tensor
-  elementwise operations
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/functional.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/epilogue/thread/activation.h"
-#include "cutlass/epilogue/thread/detail.hpp"
-#include "cutlass/epilogue/thread/scale_type.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace thread {
-
-namespace detail {
-
-/// Returns whether a source operand is needed for a combination of binary operation and scale
-/// type. Simple specialized checks are made for cases in which 0 is an identity element of
-/// the binary operation.
-template <class BinaryOp, class ElementCompute, ScaleType::Kind Scale>
-CUTLASS_HOST_DEVICE
-bool is_binary_op_source_needed(ElementCompute scale) {
-  if constexpr (cute::is_same_v<BinaryOp, NoOp<ElementCompute>>) {
-    return false;
-  }
-  else if constexpr (cute::is_same_v<BinaryOp, plus<ElementCompute>> || cute::is_same_v<BinaryOp, minus<ElementCompute>>) {
-    // Cases for binary operators for which 0 is an identity element
-    if constexpr (Scale == ScaleType::NoBetaScaling) return true;
-
-    if constexpr (Scale == ScaleType::OnlyAlphaScaling) return false;
-
-    if constexpr (Scale == ScaleType::Nothing) return false;
-
-    return scale != ElementCompute(0);
-  }
-
-  return true;
-}
-
-} // namespace detail
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/** Compute a tensor-tensor broadcast epilogue.
- *
- * @param ElementOutput_ Data type used to load and store tensors
- * @param ElementAccumulator_ Accumulator data type
- * @param ElementCompute_ Data type used to compute linear combination
- * @param ElementBias_ Data type of Bias elements
- * @param ActivationFunctor_ Fused Activation
- * @param BinaryOp0_ Binary operation to perform on O0 and C0. detail::NoOp means no operation
- * @param BinaryOp1_ Binary operation to perform on O1 and C1. detail::NoOp means no operation
- * @param UnaryOp_ Unary operation to perform on final result
- * @param Scale Controls the type of Alpha and Beta scaling to perform
- * @param Round How values should be rounded in conversions
- * @param ElementSource_ Data type used for source operands
- *
- *  Computes the following:
- *      O0 = alpha * accumulator + bias
- *      O1 = BinaryOp0(O0, beta * C0)
- *      O2 = BinaryOp1(O1, beta * C1)
- *      D  = UnaryOp(O2)
- */
-template <
-  class ElementOutput_,
-  class ElementAccumulator_ = ElementOutput_,
-  class ElementCompute_ = ElementOutput_,
-  class ElementBias_ = ElementCompute_,
-  template <class T> class ActivationFunctor_ = Identity,
-  template <class T> class BinaryOp0_ = plus,
-  template <class T> class BinaryOp1_ = detail::NoOp,
-  template <class T> class UnaryOp_ = Identity,
-  ScaleType::Kind Scale = ScaleType::Default,
-  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest,
-  class ElementSource_ = ElementOutput_
->
-class LinearCombinationTensorBroadcast {
-public:
-
-  using ElementOutput = ElementOutput_;
-  using ElementAccumulator = ElementAccumulator_;
-  using ElementCompute = ElementCompute_;
-  using ElementScalar = ElementCompute;
-  using ElementBias = ElementBias_;
-  using ElementC = ElementSource_;
-  using ElementD = ElementOutput_;
-  using ElementScalingFactor = ElementAccumulator_;
-
-  using UnaryOp = UnaryOp_<ElementCompute>;
-  using BinaryOp0 = BinaryOp0_<ElementCompute>;
-  using BinaryOp1 = BinaryOp1_<ElementCompute>;
-  using ActivationFunctor = ActivationFunctor_<ElementCompute>;
-
-  static constexpr int kCount = 1;
-  static constexpr ScaleType::Kind kScale = Scale;
-
-  using FragmentOutput = Array<ElementOutput, kCount>;
-  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
-  using ComputeFragment = Array<ElementCompute, kCount>;
-  using FragmentBias = Array<ElementBias, kCount>;
-
-  static constexpr FloatRoundStyle kRound = Round;
-  using NoOpType = detail::NoOp<ElementCompute>;
-  static constexpr bool IsBinaryOp0Enabled = !cute::is_same_v<BinaryOp0, NoOpType>;
-  static constexpr bool IsBinaryOp1Enabled = !cute::is_same_v<BinaryOp1, NoOpType>;
-  static constexpr bool IsUnaryOpEnabled = !cute::is_same_v<UnaryOp, NoOpType> && !cute::is_same_v<UnaryOp, Identity<ElementCompute>>;
-
-  /// Host-constructable parameters structure
-  struct Params {
-
-    ElementCompute alpha{};                          ///< scales accumulators
-    ElementCompute beta{};                           ///< scales source tensor
-    ElementCompute const* alpha_ptr = nullptr;       ///< pointer to accumulator scalar - if not null, loads it from memory
-    ElementCompute const* beta_ptr = nullptr;        ///< pointer to source scalar - if not null, loads it from memory
-
-    //
-    // Methods
-    //
-    Params() = default;
-
-    CUTLASS_HOST_DEVICE
-    Params(ElementCompute const* alpha_ptr, ElementCompute const* beta_ptr)
-        : alpha_ptr(alpha_ptr),
-          beta_ptr(beta_ptr) {}
-
-    CUTLASS_HOST_DEVICE
-    Params(ElementCompute const* alpha_ptr)
-        : alpha_ptr(alpha_ptr) {}
-
-    CUTLASS_HOST_DEVICE
-    Params(ElementCompute alpha,
-           ElementCompute beta)
-        : alpha(alpha),
-          beta(beta) {}
-  };
-
-private:
-  //
-  // Data members
-  //
-
-  ElementCompute alpha_;
-  ElementCompute beta_;
-
-public:
-
-  /// Constructs the function object, possibly loading from pointers in host memory
-  CUTLASS_HOST_DEVICE
-  LinearCombinationTensorBroadcast(Params const& params)
-      : alpha_(params.alpha_ptr ? *params.alpha_ptr : params.alpha),
-        beta_(params.beta_ptr ? *params.beta_ptr : params.beta) {}
-
-  /// Returns true if source 0 is needed
-  CUTLASS_HOST_DEVICE
-  bool is_source0_needed() const {
-    return detail::is_binary_op_source_needed<BinaryOp0, ElementCompute, Scale>(beta_);
-  }
-
-  /// Returns true if source 1 is needed
-  CUTLASS_HOST_DEVICE
-  bool is_source1_needed() const {
-    return detail::is_binary_op_source_needed<BinaryOp1, ElementCompute, Scale>(beta_);
-  }
-
-  //
-  // Specialization for scalar
-  //
-  CUTLASS_HOST_DEVICE
-  ElementD operator()(ElementAccumulator const accumulator, ElementC const source0, ElementC source1, ElementBias const bias) {
-    // Convert everything to Compute type, do compute, and then store to output type
-    NumericConverter<ElementCompute, ElementAccumulator, Round> accumulator_converter;
-    NumericConverter<ElementCompute, ElementBias, Round> bias_converter;
-    NumericConverter<ElementCompute, ElementC, Round> source_converter;
-    NumericConverter<ElementD, ElementCompute, Round> destination_converter;
-
-    ActivationFunctor act;
-    multiplies<ElementCompute> mul;
-    multiply_add<ElementCompute> madd;
-
-    ElementCompute intermediate = accumulator_converter(accumulator);
-    intermediate = madd(alpha_, intermediate, bias_converter(bias));
-    intermediate = act(intermediate);
-
-    // Apply BinaryOp0, if needed
-    if constexpr (IsBinaryOp0Enabled) {
-      BinaryOp0 bin0;
-      ElementCompute converted_source = source_converter(source0);
-      intermediate = bin0(intermediate, mul(beta_, converted_source));
-    }
-
-    // Apply BinaryOp1, if needed
-    if constexpr (IsBinaryOp1Enabled) {
-      BinaryOp1 bin1;
-      ElementCompute converted_source = source_converter(source1);
-      intermediate = bin1(intermediate, mul(beta_, converted_source));
-    }
-
-    // Apply UnaryOp, if needed
-    if constexpr (IsUnaryOpEnabled) {
-      UnaryOp unary;
-      intermediate = unary(intermediate);
-    }
-
-    return destination_converter(intermediate);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace thread
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_with_elementwise.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_with_elementwise.h
deleted file mode 100755
index 8a2ce5a2a..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_with_elementwise.h
+++ /dev/null
@@ -1,234 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  
-  \brief Functor performing linear combination with elementwise
-*/
-
-#pragma once
-
-#include "cutlass/half.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/constants.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/functional.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/epilogue/thread/activation.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace thread {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Applies a linear combination operator to an array of elements.
-///
-/// D = alpha * accumulator + beta * source + uniform
-///
-template <
-  typename ElementCompute_,                            ///< Data type returned by this functor
-  typename ElementAccumulator_,                        ///< Data type of accumulators
-  typename ElementSource_,                             ///< Data type of source tensor
-  typename ElementTensor_,                             ///< Data type of additional tensor
-  int Count,                                           ///< Number of elements computed per operation
-                                                       ///< Usually it is 128/sizeof_bits<ElementOutput_>,
-                                                       ///< but we use 64 or 32 sometimes when there are not enough data to store
-  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
->
-class LinearCombinationWithElementwise {
-public:
-
-  using ElementOutput = ElementSource_;
-  using ElementCompute = ElementCompute_;
-  using ElementAccumulator = ElementAccumulator_;
-  using ElementSource = ElementSource_;
-  using ElementTensor = ElementTensor_;
-
-  static bool const kIsHeavy = true;
-
-  static int const kCount = Count;
-
-  using FragmentCompute = Array<ElementCompute, kCount>;
-  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
-  using FragmentSource = Array<ElementSource, kCount>;
-  using FragmentTensor = Array<ElementTensor, kCount>;
-
-  static FloatRoundStyle const kRound = Round;
-
-  /// Host-constructable parameters structure
-  struct Params {
-
-    ElementCompute alpha;                  ///< scales accumulators
-    ElementCompute beta;                   ///< scales source tensor
-    ElementCompute threshold;              ///< minimum value that is output 
-    ElementCompute const *alpha_ptr;       ///< pointer to accumulator scalar - if not null, loads it from memory
-    ElementCompute const *beta_ptr;        ///< pointer to source scalar - if not null, loads it from memory
-    //
-    // Methods
-    //
-
-    CUTLASS_HOST_DEVICE
-    Params(): 
-      alpha(ElementCompute(1)), 
-      beta(ElementCompute(0)),
-      threshold(ElementCompute(0)), 
-      alpha_ptr(nullptr), 
-      beta_ptr(nullptr) { }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute alpha,
-      ElementCompute beta,
-      ElementCompute threshold = ElementCompute(0)
-    ): alpha(alpha), beta(beta), threshold(threshold), alpha_ptr(nullptr), beta_ptr(nullptr) {
-
-    }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementCompute const *alpha_ptr,
-      ElementCompute const *beta_ptr,
-      ElementCompute threshold = ElementCompute(0)
-    ): alpha(0), beta(0), threshold(threshold), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {
-
-    }
-  };
-
-private:
-
-  //
-  // Data members
-  //
-
-  ElementCompute alpha_;
-  ElementCompute beta_;
-  ElementCompute threshold_;
-  bool participates_in_reduction_;
-
-public:
-
-  /// Constructs the function object, possibly loading from pointers in host memory
-  CUTLASS_HOST_DEVICE
-  LinearCombinationWithElementwise(Params const &params) {
-
-    alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
-    beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
-    threshold_ = params.threshold;
-    participates_in_reduction_ = true;
-  }
-
-  /// Returns true if source is needed
-  CUTLASS_HOST_DEVICE
-  bool is_source_needed() const {
-    return beta_ != ElementCompute(0);
-  }
-
-  /// Returns true if the threadblock computes the reduction
-  CUTLASS_HOST_DEVICE
-  bool participates_in_reduction() const {
-    return participates_in_reduction_;
-  }
-
-  /// Functionally required for serial reduction in the epilogue
-  CUTLASS_HOST_DEVICE
-  void set_k_partition(int k_partition, int k_partition_count) {
-    if (k_partition) {
-      beta_ = ElementCompute(1);
-    }
-
-    if (k_partition != k_partition_count - 1) {
-      // set to NaN to make ReLU no-op for all except last k partitions
-      int64_t allones = -1;
-      threshold_ = reinterpret_cast<ElementCompute const &>(allones);
-      // Avoid computing the reduction if this isn't the final Split-K slice
-      participates_in_reduction_ = false;
-    }
-  }
-  
-  /// Computes linear scaling: D = alpha * accumulator + beta * source
-  CUTLASS_HOST_DEVICE
-  FragmentCompute operator()(
-    FragmentAccumulator const &accumulator, 
-    FragmentSource const &source,
-    FragmentTensor const &tensor) const {
-
-    // Convert source to interal compute numeric type
-    NumericArrayConverter<ElementCompute, ElementSource, kCount, Round> source_converter;
-    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
-
-    FragmentCompute converted_source = source_converter(source);
-    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
-
-    // Perform binary operations
-    FragmentCompute intermediate;
-
-    multiplies<FragmentCompute> mul_add_source;
-    multiply_add<FragmentCompute> mul_add_accumulator;
-
-    intermediate = mul_add_source(beta_, converted_source);                             // X =  beta * C + uniform
-    intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
-
-    return intermediate;
-  }
-
-  /// Computes linear scaling: D = alpha * accumulator
-  CUTLASS_HOST_DEVICE
-  FragmentCompute operator()(
-    FragmentAccumulator const &accumulator,
-    FragmentTensor const &tensor) const {
-
-    // Convert source to interal compute numeric type
-    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
-
-    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
-
-    // Perform binary operations
-    FragmentCompute intermediate;
-
-    multiplies<FragmentCompute> mul_accumulator;
-
-    intermediate = mul_accumulator(alpha_, converted_accumulator);    // D = alpha * Accum
-
-    return intermediate;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace thread
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/reduction_op.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/reduction_op.h
deleted file mode 100755
index b24d4f953..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/reduction_op.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Functor performing reduction operations used by epilogues.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/functional.h"
-#include "cutlass/numeric_conversion.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace thread {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Applies a reduction sum to an array of elements.
-///
-///
-template <
-  typename Element_,                             ///< Data type used to load and store tensors
-  int Count                                      ///< Number of elements computed per operation
->
-class ReductionOpPlus {
-public:
-
-  using Element = Element_;
-  static int const kCount = Count;
-
-  using Fragment = Array<Element, kCount>;
-  using Operator = plus<Fragment>;
-
-  /// Host-constructable parameters structure
-  struct Params { };
-
-private:
-
-  /// reduction operator
-  Operator operator_;
-
-public:
-
-  /// Constructs the function object, possibly loading from pointers in host memory
-  CUTLASS_HOST_DEVICE
-  ReductionOpPlus(Params const &params) {
-
-  }
-
-  /// Computes Compute => 
-  CUTLASS_HOST_DEVICE
-  Fragment operator()(
-    Fragment const &lhs,
-    Fragment const &rhs) const {
-
-    return operator_(lhs, rhs);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace thread
-} // namespace epilogue
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/scale_type.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/scale_type.h
deleted file mode 100755
index d1a466213..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/thread/scale_type.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Enum defines the behaviors of the epilogue.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace thread {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Specifies internal data type for computation
-/// Note :
-///  1. Scalar means alpha/beta is a single value from host(constant param) or device memory.
-///  2. Vector means alpha/beta is a vector always from device memory.
-struct ScaleType {
-  enum Kind {
-    Default,                           // D = scalar_alpha x Acc + scalar_beta x C
-    NoBetaScaling,                     // D = scalar_alpha x Acc + C
-    OnlyAlphaScaling,                  // D = scalar_alpha x Acc
-    PerChannelScaling,                 // D = vector_alpha x Acc + vector_beta x C
-    OnlyAlphaPerChannelScaling,        // D = vector_alpha x Acc
-    Nothing                            // D = Acc
-  };
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace thread
-} // namespace epilogue
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h
deleted file mode 100755
index 30af039bc..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h
+++ /dev/null
@@ -1,255 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Epilogue for threadblock scoped complex GEMMs using Tensor Ops.
-
-  The epilogue rearranges the result of a matrix product through shared memory to match canonical
-  tensor layouts in global memory. Epilogues support conversion and reduction operations.
-
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-
-#include "cutlass/gemm/gemm.h"
-
-#include "cutlass/epilogue/thread/linear_combination.h"
-#include "cutlass/epilogue/thread/linear_combination_relu.h"
-#include "cutlass/epilogue/thread/linear_combination_gelu.h"
-#include "cutlass/epilogue/thread/linear_combination_sigmoid.h"
-#include "cutlass/epilogue/thread/linear_combination_planar_complex.h"
-
-#include "cutlass/epilogue/thread/conversion_op.h"
-#include "cutlass/epilogue/thread/reduction_op.h"
-
-#include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h"
-
-#include "cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h"
-#include "cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h"
-#include "cutlass/epilogue/warp/tile_iterator_tensor_op.h"
-#include "cutlass/epilogue/threadblock/default_thread_map_tensor_op.h"
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
-#include "cutlass/epilogue/threadblock/shared_load_iterator.h"
-
-#include "cutlass/epilogue/threadblock/epilogue.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Specialization and defines sensible defaults for epilogues for complex*complex case
-//  4 real-valued mma operations (Complex)
-//  A = (ar + j ai), B (br +j bi), D = AB
-//  D = dr + j di = (ar*br - ai*bi) + j (ar*bi + ai*br) 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-template <
-  /// Epilogue Shape
-  typename Shape_,
-  /// Warp-level mma operator
-  typename WarpMmaTensorOp_,
-  /// Number of k partitions
-  int PartitionsK,
-  /// Epilogue output operator
-  typename OutputOp_,
-  /// Elements accessed by inner-most loop of AccumulatorFragmentIterator::load()
-  int ElementsPerAccess,
-  /// Multiply-add operator 
-  /// Selects between (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex) 
-  typename Operator_ = arch::OpMultiplyAddComplex
-> 
-struct DefaultEpilogueComplexTensorOp {
-
-  using Shape = Shape_;
-  using WarpMmaTensorOp = WarpMmaTensorOp_;
-  static int const kPartitionsK = PartitionsK;
-  using OutputOp = OutputOp_;
-  static int const kElementsPerAccess = ElementsPerAccess;
-  using Operator = Operator_;
-
-  using ElementOutput = typename OutputOp::ElementOutput;
-  using LayoutC = typename WarpMmaTensorOp::LayoutC;
-  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
-
-  //
-  // Thread map
-  //
-
-  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
-    Shape,
-    typename WarpMmaTensorOp::Shape,
-    kPartitionsK,
-    ElementOutput,
-    kElementsPerAccess
-  >::Type;
-
-  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
-    OutputTileThreadMap,
-    ElementOutput
-  >;
-
-  using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorComplexTensorOp<
-    typename WarpMmaTensorOp::Shape,
-    typename WarpMmaTensorOp::Policy::Operator::Shape,
-    typename WarpMmaTensorOp::Policy::Operator::ElementC,
-    typename WarpMmaTensorOp::Policy::Operator::FragmentC,
-    LayoutC
-  >;
-
-  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp<
-    typename WarpMmaTensorOp::Shape,
-    typename WarpMmaTensorOp::Policy::Operator::Shape,
-    ElementAccumulator,
-    LayoutC
-  >;
-
-  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
-    typename OutputTileThreadMap::CompactedThreadMap,
-    ElementAccumulator
-  >;
-
-  /// Hard-coded padding elements added 
-  using Padding = cutlass::MatrixShape<0, 0>;
-
-  //
-  // Define the epilogue
-  //
-  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
-    Shape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    OutputTileIterator,
-    AccumulatorFragmentIterator,
-    WarpTileIterator,
-    SharedLoadIterator,
-    OutputOp,
-    Padding
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Partial specialization and defines sensible defaults for epilogues for complex*complex case
-//  3 real-valued mma operations (Gaussian Complex)
-//  A  = (ar + j ai), B = (br +j bi), D = AB
-//  P1 = (ar + ai) * br, P2 = - ar * (br - bi), P3 = ai * (br + bi) 
-//  D  = dr + j di = (P1 - P3) + j (P1 + P2)
-/////////////////////////////////////////////////////////////////////////////////////////////////
-template <
-  typename Shape_,
-  typename WarpMmaTensorOp_,
-  int PartitionsK,
-  typename OutputOp_,
-  int ElementsPerAccess
->
-struct DefaultEpilogueComplexTensorOp <Shape_, WarpMmaTensorOp_, PartitionsK, 
-                                      OutputOp_, ElementsPerAccess, 
-                                      arch::OpMultiplyAddGaussianComplex
-> {
-
-  using Shape = Shape_;
-  using WarpMmaTensorOp = WarpMmaTensorOp_;
-  static int const kPartitionsK = PartitionsK;
-  using OutputOp = OutputOp_;
-  static int const kElementsPerAccess = ElementsPerAccess;
-  using Operator = arch::OpMultiplyAddGaussianComplex;
-
-  using ElementOutput = typename OutputOp::ElementOutput;
-  using LayoutC = typename WarpMmaTensorOp::LayoutC;
-  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
-
-  //
-  // Thread map
-  //
-
-  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
-    Shape,
-    typename WarpMmaTensorOp::Shape,
-    kPartitionsK,
-    ElementOutput,
-    kElementsPerAccess
-  >::Type;
-
-  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
-    OutputTileThreadMap,
-    ElementOutput
-  >;
-
-  using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorGaussianComplexTensorOp<
-    typename WarpMmaTensorOp::Shape,
-    typename WarpMmaTensorOp::Policy::Operator::Shape,
-    typename WarpMmaTensorOp::Policy::Operator::ElementC,
-    typename WarpMmaTensorOp::Policy::Operator::FragmentC,
-    LayoutC
-  >;
-
-  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp<
-    typename WarpMmaTensorOp::Shape,
-    typename WarpMmaTensorOp::Policy::Operator::Shape,
-    ElementAccumulator,
-    LayoutC
-  >;
-
-  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
-    typename OutputTileThreadMap::CompactedThreadMap,
-    ElementAccumulator
-  >;
-
-  /// Hard-coded padding elements added 
-  using Padding = cutlass::MatrixShape<0, 0>;
-
-  //
-  // Define the epilogue
-  //
-  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
-    Shape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    OutputTileIterator,
-    AccumulatorFragmentIterator,
-    WarpTileIterator,
-    SharedLoadIterator,
-    OutputOp,
-    Padding
-  >;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op_blas3.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op_blas3.h
deleted file mode 100755
index e86e4f92b..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op_blas3.h
+++ /dev/null
@@ -1,264 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Epilogue for threadblock scoped complex GEMMs using Tensor Ops.
-
-  The epilogue rearranges the result of a matrix product through shared memory to match canonical
-  tensor layouts in global memory. Epilogues support conversion and reduction operations.
-
-  
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-
-#include "cutlass/gemm/gemm.h"
-
-#include "cutlass/epilogue/thread/linear_combination.h"
-#include "cutlass/epilogue/thread/linear_combination_relu.h"
-#include "cutlass/epilogue/thread/linear_combination_gelu.h"
-#include "cutlass/epilogue/thread/linear_combination_sigmoid.h"
-#include "cutlass/epilogue/thread/linear_combination_planar_complex.h"
-
-#include "cutlass/epilogue/thread/conversion_op.h"
-#include "cutlass/epilogue/thread/reduction_op.h"
-
-#include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h"
-
-#include "cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h"
-#include "cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h"
-#include "cutlass/epilogue/warp/tile_iterator_tensor_op.h"
-#include "cutlass/epilogue/threadblock/default_thread_map_tensor_op.h"
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator_blas3.h"
-#include "cutlass/epilogue/threadblock/shared_load_iterator.h"
-
-#include "cutlass/epilogue/threadblock/epilogue.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Specialization and defines sensible defaults for epilogues for complex*complex case
-//  4 real-valued mma operations (Complex)
-//  A = (ar + j ai), B (br +j bi), D = AB
-//  D = dr + j di = (ar*br - ai*bi) + j (ar*bi + ai*br) 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-template <
-  /// Epilogue Shape
-  typename Shape_,
-  /// Warp-level mma operator
-  typename WarpMmaTensorOp_,
-  /// Number of k partitions
-  int PartitionsK,
-  /// Epilogue output operator
-  typename OutputOp_,
-  /// Elements accessed by inner-most loop of AccumulatorFragmentIterator::load()
-  int ElementsPerAccess,
-  /// Multiply-add operator 
-  /// Selects between (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex) 
-  typename Operator_ = arch::OpMultiplyAddComplex,
-  /// Is for a symmetric kernel
-  BlasMode BlasMode_ = BlasMode::kGemm
-> 
-struct DefaultEpilogueComplexTensorOpBlas3 {
-
-  using Shape = Shape_;
-  using WarpMmaTensorOp = WarpMmaTensorOp_;
-  static int const kPartitionsK = PartitionsK;
-  using OutputOp = OutputOp_;
-  static int const kElementsPerAccess = ElementsPerAccess;
-  using Operator = Operator_;
-  static BlasMode const kBlasMode = BlasMode_;
-
-  using ElementOutput = typename OutputOp::ElementOutput;
-  using LayoutC = typename WarpMmaTensorOp::LayoutC;
-  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
-
-  //
-  // Thread map
-  //
-
-  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
-    Shape,
-    typename WarpMmaTensorOp::Shape,
-    kPartitionsK,
-    ElementOutput,
-    kElementsPerAccess
-  >::Type;
-
-  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorBlas3<
-    OutputTileThreadMap,
-    ElementOutput
-    , kBlasMode
-  >;
-
-  using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorComplexTensorOp<
-    typename WarpMmaTensorOp::Shape,
-    typename WarpMmaTensorOp::Policy::Operator::Shape,
-    typename WarpMmaTensorOp::Policy::Operator::ElementC,
-    typename WarpMmaTensorOp::Policy::Operator::FragmentC,
-    LayoutC
-  >;
-
-  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp<
-    typename WarpMmaTensorOp::Shape,
-    typename WarpMmaTensorOp::Policy::Operator::Shape,
-    ElementAccumulator,
-    LayoutC
-  >;
-
-  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
-    typename OutputTileThreadMap::CompactedThreadMap,
-    ElementAccumulator
-  >;
-
-  /// Hard-coded padding elements added 
-  using Padding = cutlass::MatrixShape<0, 0>;
-
-  //
-  // Define the epilogue
-  //
-  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
-    Shape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    OutputTileIterator,
-    AccumulatorFragmentIterator,
-    WarpTileIterator,
-    SharedLoadIterator,
-    OutputOp,
-    Padding
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Partial specialization and defines sensible defaults for epilogues for complex*complex case
-//  3 real-valued mma operations (Gaussian Complex)
-//  A  = (ar + j ai), B = (br +j bi), D = AB
-//  P1 = (ar + ai) * br, P2 = - ar * (br - bi), P3 = ai * (br + bi) 
-//  D  = dr + j di = (P1 - P3) + j (P1 + P2)
-/////////////////////////////////////////////////////////////////////////////////////////////////
-template <
-  typename Shape_,
-  typename WarpMmaTensorOp_,
-  int PartitionsK,
-  typename OutputOp_,
-  int ElementsPerAccess, 
-  BlasMode BlasMode_
->
-struct DefaultEpilogueComplexTensorOpBlas3 <Shape_, WarpMmaTensorOp_, PartitionsK, 
-                                      OutputOp_, ElementsPerAccess, 
-                                      arch::OpMultiplyAddGaussianComplex
-                                      , BlasMode_
-> {
-
-  using Shape = Shape_;
-  using WarpMmaTensorOp = WarpMmaTensorOp_;
-  static int const kPartitionsK = PartitionsK;
-  using OutputOp = OutputOp_;
-  static int const kElementsPerAccess = ElementsPerAccess;
-  using Operator = arch::OpMultiplyAddGaussianComplex;
-  static BlasMode const kBlasMode = BlasMode_;
-
-  using ElementOutput = typename OutputOp::ElementOutput;
-  using LayoutC = typename WarpMmaTensorOp::LayoutC;
-  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
-
-  //
-  // Thread map
-  //
-
-  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
-    Shape,
-    typename WarpMmaTensorOp::Shape,
-    kPartitionsK,
-    ElementOutput,
-    kElementsPerAccess
-  >::Type;
-
-  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorBlas3<
-    OutputTileThreadMap,
-    ElementOutput,
-    kBlasMode
-  >;
-
-  using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorGaussianComplexTensorOp<
-    typename WarpMmaTensorOp::Shape,
-    typename WarpMmaTensorOp::Policy::Operator::Shape,
-    typename WarpMmaTensorOp::Policy::Operator::ElementC,
-    typename WarpMmaTensorOp::Policy::Operator::FragmentC,
-    LayoutC
-  >;
-
-  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp<
-    typename WarpMmaTensorOp::Shape,
-    typename WarpMmaTensorOp::Policy::Operator::Shape,
-    ElementAccumulator,
-    LayoutC
-  >;
-
-  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
-    typename OutputTileThreadMap::CompactedThreadMap,
-    ElementAccumulator
-  >;
-
-  /// Hard-coded padding elements added 
-  using Padding = cutlass::MatrixShape<0, 0>;
-
-  //
-  // Define the epilogue
-  //
-  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
-    Shape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    OutputTileIterator,
-    AccumulatorFragmentIterator,
-    WarpTileIterator,
-    SharedLoadIterator,
-    OutputOp,
-    Padding
-  >;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_direct_store.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_direct_store.h
deleted file mode 100755
index 8770f6196..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_direct_store.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Direct store epilogue
-*/
-
-#pragma once
-
-////////////////////////////////////////////////////////////////////////////////
-
-#include "cutlass/epilogue/threadblock/epilogue_direct_store.h"
-#include "cutlass/epilogue/threadblock/direct_store_epilogue_iterator.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Given a properly constructed epilogue, returns a direct store epilogue
-template <typename EpilogueTensorOp>
-struct DefaultEpilogueDirectStore {
-
-  using OutputTileIterator = DirectStoreEpilogueIterator<typename EpilogueTensorOp::OutputTileIterator::Element>;
-
-  using Epilogue = EpilogueDirectStore<
-    typename EpilogueTensorOp::Shape,
-    typename EpilogueTensorOp::WarpMmaOperator,
-    EpilogueTensorOp::kPartitionsK,
-    OutputTileIterator,
-    typename EpilogueTensorOp::AccumulatorFragmentIterator,
-    typename EpilogueTensorOp::WarpTileIterator,
-    typename EpilogueTensorOp::SharedLoadIterator,
-    typename EpilogueTensorOp::OutputOp
-  >;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_planar_complex.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_planar_complex.h
deleted file mode 100755
index e38e0ff6d..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_planar_complex.h
+++ /dev/null
@@ -1,241 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Constructs a default epilogue for planar complex outputs.
-
-  This template reuses components for real-valued epilogues and applies them to planar complex
-  output matrices.
-
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/array_planar_complex.h"
-
-#include "cutlass/arch/arch.h"
-
-#include "cutlass/epilogue/thread/linear_combination_planar_complex.h"
-#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
-#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
-#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
-
-#include "cutlass/epilogue/threadblock/epilogue_planar_complex.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines sensible defaults for epilogues.
-template <
-  typename ThreadblockShape_,
-  typename WarpMma_,
-  typename OpcodeClass_,
-  typename ArchTag_,
-  int PartitionsK,
-  typename OutputOp_,
-  int ElementsPerAccess
->
-struct DefaultEpiloguePlanarComplex;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines sensible defaults for epilogues.
-template <
-  typename ThreadblockShape_,
-  typename WarpMmaOperator_,
-  int PartitionsK,
-  typename OutputOp_,
-  int ElementsPerAccess
->
-struct DefaultEpiloguePlanarComplex<
-  ThreadblockShape_, 
-  WarpMmaOperator_, 
-  arch::OpClassTensorOp, 
-  arch::Sm70,
-  PartitionsK, 
-  OutputOp_, 
-  ElementsPerAccess> {
-
-  using RealEpilogue = DefaultEpilogueVoltaTensorOp<
-    ThreadblockShape_,
-    WarpMmaOperator_,
-    PartitionsK,
-    OutputOp_,
-    ElementsPerAccess
-  >;
-
-  using Epilogue = EpiloguePlanarComplex<
-    ThreadblockShape_,
-    WarpMmaOperator_,
-    PartitionsK,
-    typename RealEpilogue::OutputTileIterator,
-    typename RealEpilogue::AccumulatorFragmentIterator,
-    typename RealEpilogue::WarpTileIterator,
-    typename RealEpilogue::SharedLoadIterator,
-    OutputOp_,
-    typename RealEpilogue::Padding
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines sensible defaults for epilogues.
-template <
-  typename ThreadblockShape_,
-  typename WarpMmaOperator_,
-  int PartitionsK,
-  typename OutputOp_,
-  int ElementsPerAccess
->
-struct DefaultEpiloguePlanarComplex<
-  ThreadblockShape_, 
-  WarpMmaOperator_, 
-  arch::OpClassTensorOp, 
-  arch::Sm75,
-  PartitionsK, 
-  OutputOp_, 
-  ElementsPerAccess> {
-
-  using RealEpilogue = DefaultEpilogueTensorOp<
-    ThreadblockShape_,
-    WarpMmaOperator_,
-    PartitionsK,
-    OutputOp_,
-    ElementsPerAccess
-  >;
-
-  using Epilogue = EpiloguePlanarComplex<
-    ThreadblockShape_,
-    WarpMmaOperator_,
-    PartitionsK,
-    typename RealEpilogue::OutputTileIterator,
-    typename RealEpilogue::AccumulatorFragmentIterator,
-    typename RealEpilogue::WarpTileIterator,
-    typename RealEpilogue::SharedLoadIterator,
-    OutputOp_,
-    typename RealEpilogue::Padding
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines sensible defaults for epilogues.
-template <
-  typename ThreadblockShape_,
-  typename WarpMmaOperator_,
-  int PartitionsK,
-  typename OutputOp_,
-  int ElementsPerAccess
->
-struct DefaultEpiloguePlanarComplex<
-  ThreadblockShape_, 
-  WarpMmaOperator_, 
-  arch::OpClassTensorOp, 
-  arch::Sm80,
-  PartitionsK, 
-  OutputOp_, 
-  ElementsPerAccess> {
-
-  using RealEpilogue = DefaultEpilogueTensorOp<
-    ThreadblockShape_,
-    WarpMmaOperator_,
-    PartitionsK,
-    OutputOp_,
-    ElementsPerAccess
-  >;
-
-  using Epilogue = EpiloguePlanarComplex<
-    ThreadblockShape_,
-    WarpMmaOperator_,
-    PartitionsK,
-    typename RealEpilogue::OutputTileIterator,
-    typename RealEpilogue::AccumulatorFragmentIterator,
-    typename RealEpilogue::WarpTileIterator,
-    typename RealEpilogue::SharedLoadIterator,
-    OutputOp_,
-    typename RealEpilogue::Padding
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines sensible defaults for epilogues.
-template <
-  typename ThreadblockShape_,
-  typename WarpMmaOperator_,
-  typename ArchTag_,
-  int PartitionsK,
-  typename OutputOp_,
-  int ElementsPerAccess
->
-struct DefaultEpiloguePlanarComplex<
-  ThreadblockShape_, 
-  WarpMmaOperator_, 
-  arch::OpClassSimt, 
-  ArchTag_,
-  PartitionsK, 
-  OutputOp_, 
-  ElementsPerAccess> {
-
-  using RealEpilogue = DefaultEpilogueSimt<
-    ThreadblockShape_,
-    WarpMmaOperator_,
-    OutputOp_,
-    ElementsPerAccess
-  >;
-
-  using Epilogue = EpiloguePlanarComplex<
-    ThreadblockShape_,
-    WarpMmaOperator_,
-    PartitionsK,
-    typename RealEpilogue::OutputTileIterator,
-    typename RealEpilogue::AccumulatorFragmentIterator,
-    typename RealEpilogue::WarpTileIterator,
-    typename RealEpilogue::SharedLoadIterator,
-    OutputOp_,
-    typename RealEpilogue::Padding
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_simt.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_simt.h
deleted file mode 100755
index f3119fa40..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_simt.h
+++ /dev/null
@@ -1,443 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Epilogue for threadblock scoped GEMMs using SIMT.
-
-  The epilogue rearranges the result of a matrix product through shared memory to match canonical
-  tensor layouts in global memory. Epilogues support conversion and reduction operations.
-
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-
-#include "cutlass/arch/mma.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/warp/mma.h"
-
-#include "cutlass/epilogue/thread/linear_combination.h"
-#include "cutlass/epilogue/thread/linear_combination_clamp.h"
-#include "cutlass/epilogue/thread/linear_combination_relu.h"
-#include "cutlass/epilogue/thread/linear_combination_gelu.h"
-#include "cutlass/epilogue/thread/linear_combination_sigmoid.h"
-#include "cutlass/epilogue/thread/linear_combination_planar_complex.h"
-#include "cutlass/epilogue/thread/conversion_op.h"
-#include "cutlass/epilogue/thread/reduction_op.h"
-
-#include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h"
-
-#include "cutlass/epilogue/warp/fragment_iterator_simt.h"
-#include "cutlass/epilogue/warp/tile_iterator_simt.h"
-#include "cutlass/epilogue/threadblock/default_thread_map_simt.h"
-#include "cutlass/transform/pitch_linear_thread_map.h"
-
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator_conv.h"
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator_strided_dgrad.h"
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator_affine.h"
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator_direct_conv.h" 
-#include "cutlass/epilogue/threadblock/shared_load_iterator.h"
-#include "cutlass/epilogue/threadblock/shared_load_iterator_pitch_linear.h"
-#include "cutlass/epilogue/threadblock/epilogue.h"
-#include "cutlass/epilogue/threadblock/epilogue_depthwise.h"
-
-#include "cutlass/layout/permute.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines sensible defaults for epilogues for SimtOps.
-template <
-  typename Shape_,
-  typename WarpMmaSimt_,
-  typename OutputOp_,
-  int ElementsPerAccess,
-  bool ScatterD = false,
-  typename PermuteDLayout = layout::NoPermute,
-  conv::StrideSupport StrideSupport = conv::StrideSupport::kUnity,
-  int Rank = 4
->
-struct DefaultEpilogueSimt {
-
-  using Shape = Shape_;
-  using WarpMmaSimt = WarpMmaSimt_;
-  using OutputOp = OutputOp_;
-  static int const kElementsPerAccess = ElementsPerAccess;
-  static const int kPartitionsK = Shape::kK / WarpMmaSimt::Shape::kK;
-
-  using ElementOutput = typename OutputOp::ElementOutput;
-  using LayoutC = typename WarpMmaSimt::LayoutC;
-  using ElementAccumulator = typename WarpMmaSimt::ElementC;
-  static conv::StrideSupport const kStrideSupport = StrideSupport;
-  static int const kRank = Rank;
-
-  //
-  // Thread map
-  //
-
-  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapSimt<
-    Shape,
-    typename WarpMmaSimt::Shape,
-    typename WarpMmaSimt::Policy,
-    kPartitionsK,
-    ElementOutput,
-    kElementsPerAccess
-  >::Type;
-
-  static bool const UseCUDAStore = platform::is_same<ElementOutput, double>::value;
-
-  using PackedOutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
-    OutputTileThreadMap,
-    ElementOutput,
-    ScatterD,
-    PermuteDLayout,
-    UseCUDAStore
-  >;
-
-  using StridedOutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorConv<
-    OutputTileThreadMap,
-    ElementOutput,
-    ScatterD,
-    PermuteDLayout,
-    UseCUDAStore,
-    kRank
-  >;
-
-  using OutputTileIterator = typename platform::conditional<StrideSupport == cutlass::conv::StrideSupport::kUnity,
-                                                            PackedOutputTileIterator,
-                                                            StridedOutputTileIterator>::type;
-
-  using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorSimt<
-    typename WarpMmaSimt::Shape,
-    typename WarpMmaSimt::ThreadMma,
-    layout::RowMajor,
-    typename WarpMmaSimt::Policy
-  >;
-
-  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorSimt<
-    typename WarpMmaSimt::Shape,
-    typename WarpMmaSimt::ThreadMma,
-    ElementAccumulator,
-    layout::RowMajor,
-    typename WarpMmaSimt::Policy
-  >;
-
-  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
-    typename OutputTileThreadMap::CompactedThreadMap,
-    ElementAccumulator
-  >;
-
-  /// Hard-coded padding elements added 
-  using Padding = typename WarpTileIterator::Padding;
-
-  //
-  // Define the epilogue
-  //
-  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
-    Shape,
-    WarpMmaSimt,
-    kPartitionsK,
-    OutputTileIterator,
-    AccumulatorFragmentIterator,
-    WarpTileIterator,
-    SharedLoadIterator,
-    OutputOp,
-    Padding
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines sensible defaults for epilogues for SimtOps.
-template <
-  typename Shape_,
-  typename WarpMmaSimt_,
-  typename OutputOp_,
-  int ElementsPerAccess
->
-struct DefaultEpilogueSimtStridedDgrad {
-
-  using Shape = Shape_;
-  using WarpMmaSimt = WarpMmaSimt_;
-  using OutputOp = OutputOp_;
-  static int const kElementsPerAccess = ElementsPerAccess;
-  static const int kPartitionsK = Shape::kK / WarpMmaSimt::Shape::kK;
-
-  using ElementOutput = typename OutputOp::ElementOutput;
-  using LayoutC = typename WarpMmaSimt::LayoutC;
-  using ElementAccumulator = typename WarpMmaSimt::ElementC;
-
-  //
-  // Thread map
-  //
-
-  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapSimt<
-    Shape,
-    typename WarpMmaSimt::Shape,
-    typename WarpMmaSimt::Policy,
-    kPartitionsK,
-    ElementOutput,
-    kElementsPerAccess
-  >::Type;
-
-  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorStridedDgrad<
-    OutputTileThreadMap,
-    ElementOutput
-  >;
-
-  using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorSimt<
-    typename WarpMmaSimt::Shape,
-    typename WarpMmaSimt::ThreadMma,
-    layout::RowMajor,
-    typename WarpMmaSimt::Policy
-  >;
-
-  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorSimt<
-    typename WarpMmaSimt::Shape,
-    typename WarpMmaSimt::ThreadMma,
-    ElementAccumulator,
-    layout::RowMajor,
-    typename WarpMmaSimt::Policy
-  >;
-
-  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
-    typename OutputTileThreadMap::CompactedThreadMap,
-    ElementAccumulator
-  >;
-
-  /// Hard-coded padding elements added 
-  using Padding = typename WarpTileIterator::Padding;
-
-  //
-  // Define the epilogue
-  //
-  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
-    Shape,
-    WarpMmaSimt,
-    kPartitionsK,
-    OutputTileIterator,
-    AccumulatorFragmentIterator,
-    WarpTileIterator,
-    SharedLoadIterator,
-    OutputOp,
-    Padding
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines sensible defaults for epilogues for SimtOps.
-template <
-  int Rank,
-  typename Shape_,
-  typename WarpMmaSimt_,
-  typename OutputOp_,
-  int ElementsPerAccess
->
-struct DefaultEpilogueSimtAffineRankN {
-
-  using Shape = Shape_;
-  using WarpMmaSimt = WarpMmaSimt_;
-  using OutputOp = OutputOp_;
-  static int const kElementsPerAccess = ElementsPerAccess;
-  static const int kPartitionsK = Shape::kK / WarpMmaSimt::Shape::kK;
-
-  using ElementOutput = typename OutputOp::ElementOutput;
-  using LayoutC = typename WarpMmaSimt::LayoutC;
-  using ElementAccumulator = typename WarpMmaSimt::ElementC;
-
-  //
-  // Thread map
-  //
-
-  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapSimt<
-    Shape,
-    typename WarpMmaSimt::Shape,
-    typename WarpMmaSimt::Policy,
-    kPartitionsK,
-    ElementOutput,
-    kElementsPerAccess
-  >::Type;
-
-  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorAffineRankN<
-    OutputTileThreadMap,
-    ElementOutput,
-    Rank
-  >;
-
-  using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorSimt<
-    typename WarpMmaSimt::Shape,
-    typename WarpMmaSimt::ThreadMma,
-    layout::RowMajor,
-    typename WarpMmaSimt::Policy
-  >;
-
-  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorSimt<
-    typename WarpMmaSimt::Shape,
-    typename WarpMmaSimt::ThreadMma,
-    ElementAccumulator,
-    layout::RowMajor,
-    typename WarpMmaSimt::Policy
-  >;
-
-  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
-    typename OutputTileThreadMap::CompactedThreadMap,
-    ElementAccumulator
-  >;
-
-  /// Hard-coded padding elements added 
-  using Padding = typename WarpTileIterator::Padding;
-
-  //
-  // Define the epilogue
-  //
-  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
-    Shape,
-    WarpMmaSimt,
-    kPartitionsK,
-    OutputTileIterator,
-    AccumulatorFragmentIterator,
-    WarpTileIterator,
-    SharedLoadIterator,
-    OutputOp,
-    Padding
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines sensible defaults for epilogues for SimtOps.
-template <typename Shape_,        // ThreadBlock Shape
-          typename WarpMmaSimt_,  // mma_depthwise_simt
-          typename OutputOp_,
-          int ElementsPerAccess_,
-          typename ThreadOutputShape_ = cutlass::conv::TensorNHWCShape<1, 1, 1, 1>,
-          typename ThreadBlockOutputShape_ = cutlass::conv::TensorNHWCShape<1, 1, 1, 1> >
-struct DefaultDirectConvEpilogueSimt {
-  using Shape = Shape_;
-  using WarpMmaSimt = WarpMmaSimt_;
-  using WarpShape = typename WarpMmaSimt::Shape;
-  using OutputOp = OutputOp_;
-  using ThreadOutputShape = ThreadOutputShape_;
-  using ThreadBlockOutputShape = ThreadBlockOutputShape_;
-  static int const kElementsPerAccess = ElementsPerAccess_;
-
-
-  using ElementOutput = typename OutputOp::ElementOutput;
-  using LayoutC = typename WarpMmaSimt::LayoutC;
-  using ElementAccumulator = typename WarpMmaSimt::ElementC;
-
-  /// Number of threads total
-  using WarpCount = gemm::GemmShape<
-    Shape::kM / WarpShape::kM,
-    Shape::kN / WarpShape::kN
-  >;
-
-  static int const kWarpSize = cutlass::gemm::warp::WarpSize<arch::OpClassSimt>::value;
-
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  //
-  // Thread map
-  //
-  
-  using OutputTileThreadMap = cutlass::transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<ThreadBlockOutputShape::kC, ThreadBlockOutputShape::kNHW>,
-    kThreads,
-    kElementsPerAccess
-  >;
-
-
-  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorDirectConv<
-    OutputTileThreadMap,
-    ElementOutput,
-    ThreadOutputShape,
-    ThreadBlockOutputShape 
-  >;
-
-  using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorSimt<
-    typename WarpMmaSimt::Shape,
-    typename WarpMmaSimt::ThreadMma,
-    layout::RowMajor,
-    typename WarpMmaSimt::Policy
-  >;
-  
-  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorSimtDirect2dConv<
-    typename WarpMmaSimt::Shape,
-    ThreadOutputShape,
-    ThreadBlockOutputShape,
-    typename WarpMmaSimt::ThreadMma,
-    ElementAccumulator,
-    layout::RowMajor,
-    typename WarpMmaSimt::Policy
-  >;
-
-  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIteratorPitchLinear<
-    OutputTileThreadMap,
-    ElementAccumulator
-  >;
-
-  /// Hard-coded padding elements added 
-  using Padding = typename WarpTileIterator::Padding;
-  //
-  // Define the epilogue
-  //
-  using Epilogue = cutlass::epilogue::threadblock::EpilogueDepthwise<
-    Shape,
-    ThreadOutputShape,
-    ThreadBlockOutputShape,
-    WarpMmaSimt,
-    OutputTileIterator,
-    AccumulatorFragmentIterator,
-    WarpTileIterator,
-    SharedLoadIterator,
-    OutputOp,
-    Padding
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h
deleted file mode 100755
index 1d62f4fc3..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h
+++ /dev/null
@@ -1,904 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
-
-  The epilogue rearranges the result of a matrix product through shared memory to match canonical
-  tensor layouts in global memory. Epilogues support conversion and reduction operations.
-
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-
-#include "cutlass/platform/platform.h"
-
-#include "cutlass/gemm/gemm.h"
-
-#include "cutlass/epilogue/thread/linear_combination.h"
-#include "cutlass/epilogue/thread/linear_combination_clamp.h"
-#include "cutlass/epilogue/thread/linear_combination_relu.h"
-#include "cutlass/epilogue/thread/linear_combination_relu0.h"
-#include "cutlass/epilogue/thread/linear_combination_gelu.h"
-#include "cutlass/epilogue/thread/linear_combination_sigmoid.h"
-#include "cutlass/epilogue/thread/linear_combination_hardswish.h"
-#include "cutlass/epilogue/thread/linear_combination_planar_complex.h"
-
-#include "cutlass/epilogue/thread/conversion_op.h"
-#include "cutlass/epilogue/thread/reduction_op.h"
-
-#include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h"
-
-#include "cutlass/epilogue/warp/fragment_iterator_tensor_op.h"
-#include "cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h"
-#include "cutlass/epilogue/warp/tile_iterator_tensor_op.h"
-#include "cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h"
-#include "cutlass/epilogue/threadblock/default_thread_map_tensor_op.h"
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator_conv.h"
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator_strided_dgrad.h"
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator_affine.h"
-#include "cutlass/epilogue/threadblock/shared_load_iterator.h"
-#include "cutlass/epilogue/threadblock/shared_load_iterator_mixed.h"
-
-#include "cutlass/epilogue/threadblock/epilogue.h"
-#include "cutlass/epilogue/threadblock/interleaved_epilogue.h"
-
-#include "cutlass/layout/permute.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace detail {
-
-template <
-  typename ElementOutput,
-  typename ElementAccumulator,
-  int ElementsPerAccess,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename ThreadMap
->
-struct DefaultIteratorsTensorOp {
-  
-  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp<
-    WarpShape,
-    InstructionShape,
-    ElementAccumulator,
-    layout::RowMajor
-  >;
-
-  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
-    ThreadMap,
-    ElementAccumulator
-  >;
-
-  static int const kFragmentsPerIteration = 1;
-};
-
-/// Partial specialization for float <= float x 4
-template <
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename ThreadMap
->
-struct DefaultIteratorsTensorOp<float, float, 4, ThreadblockShape, WarpShape, InstructionShape, ThreadMap> {
-  
-  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp<
-    WarpShape,
-    InstructionShape,
-    float,
-    layout::RowMajor
-  >;
-
-  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
-    ThreadMap,
-    float
-  >;
-
-  static int const kFragmentsPerIteration = 2;
-};
-
-/// Partial specialization for int32_t <= int32_t
-template <
-  int ElementsPerAccess,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename ThreadMap
->
-struct DefaultIteratorsTensorOp<int32_t, int32_t, ElementsPerAccess, ThreadblockShape, WarpShape, InstructionShape, ThreadMap> {
-  
-  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp<
-    WarpShape,
-    InstructionShape,
-    int32_t,
-    layout::RowMajor
-  >;
-
-  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
-    ThreadMap,
-    int32_t
-  >;
-
-  static int const kFragmentsPerIteration = 1;
-};
-
-/// Partial specialization for float <= int32_t
-template <
-  int ElementsPerAccess,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename ThreadMap
->
-struct DefaultIteratorsTensorOp<float, int32_t, ElementsPerAccess, ThreadblockShape, WarpShape, InstructionShape, ThreadMap> {
-
-  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp<
-    WarpShape,
-    InstructionShape,
-    int32_t,
-    layout::RowMajor
-  >;
-
-  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
-    ThreadMap,
-    int32_t
-  >;
-
-  static int const kFragmentsPerIteration = 1;
-};
-
-/// Partial specialization for half <= float x 8 epilogues avoids shared memory bank conflicts.
-template <
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename ThreadMap
->
-struct DefaultIteratorsTensorOp<
-  half_t, 
-  float, 
-  8, 
-  ThreadblockShape, 
-  WarpShape, 
-  InstructionShape, 
-  ThreadMap> {
-  
-  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOpMixed<
-    WarpShape,
-    InstructionShape,
-    float,
-    32,
-    16,
-    8,
-    8
-  >;
-
-  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIteratorMixed<
-    ThreadMap,
-    float,
-    32,
-    16,
-    8,
-    8
-  >;
-
-  static int const kFragmentsPerIteration = 2;
-};
-
-/// Partial specialization for half <= int32_t x 8 epilogues avoids shared memory bank conflicts.
-template <
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename ThreadMap
->
-struct DefaultIteratorsTensorOp<
-  bfloat16_t,
-  int32_t,
-  8,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  ThreadMap> {
-
-  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOpMixed<
-    WarpShape,
-    InstructionShape,
-    int32_t,
-    32,
-    16,
-    8,
-    8
-  >;
-
-  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIteratorMixed<
-    ThreadMap,
-    int32_t,
-    32,
-    16,
-    8,
-    8
-  >;
-
-  static int const kFragmentsPerIteration = 2;
-};
-
-/// Partial specialization for half <= int32_t x 8 epilogues avoids shared memory bank conflicts.
-template <
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename ThreadMap
->
-struct DefaultIteratorsTensorOp<
-  half_t, 
-  int32_t, 
-  8, 
-  ThreadblockShape, 
-  WarpShape, 
-  InstructionShape, 
-  ThreadMap> {
-  
-  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOpMixed<
-    WarpShape,
-    InstructionShape,
-    int32_t,
-    32,
-    16,
-    8,
-    8
-  >;
-
-  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIteratorMixed<
-    ThreadMap,
-    int32_t,
-    32,
-    16,
-    8,
-    8
-  >;
-
-  static int const kFragmentsPerIteration = 2;
-};
-
-/// Partial specialization for int8/int4b_t <= int32 x 16/8 epilogues avoids shared memory bank conflicts.
-/// Threadblock::kN = 256 still has bank conflicts.
-template <
-  typename ElementOutput,
-  int ElementsPerAccess,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename ThreadMap
->
-struct DefaultIteratorsTensorOp<
-  ElementOutput, 
-  int32_t, 
-  ElementsPerAccess,
-  ThreadblockShape, 
-  WarpShape, 
-  InstructionShape, 
-  ThreadMap> {
-
-  static_assert(platform::is_same<ElementOutput, cutlass::int4b_t>::value ||
-                platform::is_same<ElementOutput, cutlass::uint4b_t>::value ||
-                platform::is_same<ElementOutput, int8_t>::value ||
-                platform::is_same<ElementOutput, uint8_t>::value,
-                "ElementOutput needs to be 4 or 8 bit (unsigned) int.");
-
-   static_assert((ElementsPerAccess == 16 || ElementsPerAccess == 8 || ElementsPerAccess == 4),
-                "ElementsPerAccess needs to be 16 or 8.");
-  
-  using WarpTileIteratorMixed = cutlass::epilogue::warp::TileIteratorTensorOpMixed<
-    WarpShape,
-    InstructionShape,
-    int32_t,
-    32,
-    cutlass::sizeof_bits<ElementOutput>::value,
-    ElementsPerAccess,
-    8
-  >;
-
-  using WarpTileIteratorNotMixed =  cutlass::epilogue::warp::TileIteratorTensorOp<
-    WarpShape,
-    InstructionShape,
-    int32_t,
-    layout::RowMajor
-  >;
-
-  using WarpTileIterator = typename platform::conditional<
-                             (ThreadblockShape::kN == 256) || (ThreadblockShape::kN == 128 && ElementsPerAccess == 8) || (ElementsPerAccess == 4),
-                             WarpTileIteratorNotMixed,
-                             WarpTileIteratorMixed>::type;
-
-  using SharedLoadIteratorMixed = cutlass::epilogue::threadblock::SharedLoadIteratorMixed<
-    ThreadMap,
-    int32_t,
-    32,
-    cutlass::sizeof_bits<ElementOutput>::value,
-    ElementsPerAccess,
-    8
-  >;
-
-  using SharedLoadIteratorNotMixed = cutlass::epilogue::threadblock::SharedLoadIterator<
-    ThreadMap,
-    int32_t
-  >;
-
-  using SharedLoadIterator = typename platform::conditional<
-                             (ThreadblockShape::kN == 256) || (ThreadblockShape::kN == 128 && ElementsPerAccess == 8) || (ElementsPerAccess == 4),
-                             SharedLoadIteratorNotMixed,
-                             SharedLoadIteratorMixed>::type;
-
-  static int const kFragmentsPerIteration = 1;
-};
-
-/// Partial specialization for float_e4m3_t <= float x 16/8 epilogues avoids shared memory bank conflicts.
-/// Threadblock::kN = 256 still has bank conflicts.
-template <
-  int ElementsPerAccess,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename ThreadMap
->
-struct DefaultIteratorsTensorOp<
-  cutlass::float_e4m3_t,
-  float, 
-  ElementsPerAccess,
-  ThreadblockShape, 
-  WarpShape, 
-  InstructionShape, 
-  ThreadMap> {
-
-  using ElementOutput = cutlass::float_e4m3_t;
-
-  static_assert((ElementsPerAccess == 16 || ElementsPerAccess == 8 || ElementsPerAccess == 4),
-              "ElementsPerAccess needs to be 16 or 8.");
-  
-  using WarpTileIteratorMixed = cutlass::epilogue::warp::TileIteratorTensorOpMixed<
-    WarpShape,
-    InstructionShape,
-    float,
-    32,
-    cutlass::sizeof_bits<ElementOutput>::value,
-    ElementsPerAccess,
-    8
-  >;
-
-  using WarpTileIteratorNotMixed =  cutlass::epilogue::warp::TileIteratorTensorOp<
-    WarpShape,
-    InstructionShape,
-    float,
-    layout::RowMajor
-  >;
-
-  using WarpTileIterator = typename platform::conditional<
-                             (ThreadblockShape::kN == 256) || (ThreadblockShape::kN == 128 && ElementsPerAccess == 8) || (ElementsPerAccess == 4),
-                             WarpTileIteratorNotMixed,
-                             WarpTileIteratorMixed>::type;
-
-  using SharedLoadIteratorMixed = cutlass::epilogue::threadblock::SharedLoadIteratorMixed<
-    ThreadMap,
-    float,
-    32,
-    cutlass::sizeof_bits<ElementOutput>::value,
-    ElementsPerAccess,
-    8
-  >;
-
-  using SharedLoadIteratorNotMixed = cutlass::epilogue::threadblock::SharedLoadIterator<
-    ThreadMap,
-    float
-  >;
-
-  using SharedLoadIterator = typename platform::conditional<
-                             (ThreadblockShape::kN == 256) || (ThreadblockShape::kN == 128 && ElementsPerAccess == 8) || (ElementsPerAccess == 4),
-                             SharedLoadIteratorNotMixed,
-                             SharedLoadIteratorMixed>::type;
-
-  static int const kFragmentsPerIteration = 1;
-};
-
-/// Partial specialization for float_e5m2_t <= float x 16/8 epilogues avoids shared memory bank conflicts.
-/// Threadblock::kN = 256 still has bank conflicts.
-template <
-  int ElementsPerAccess,
-  typename ThreadblockShape,
-  typename WarpShape,
-  typename InstructionShape,
-  typename ThreadMap
->
-struct DefaultIteratorsTensorOp<
-  cutlass::float_e5m2_t,
-  float, 
-  ElementsPerAccess,
-  ThreadblockShape, 
-  WarpShape, 
-  InstructionShape, 
-  ThreadMap> {
-
-  using ElementOutput = cutlass::float_e5m2_t;
-
-  static_assert((ElementsPerAccess == 16 || ElementsPerAccess == 8 || ElementsPerAccess == 4),
-              "ElementsPerAccess needs to be 16 or 8.");
-  
-  using WarpTileIteratorMixed = cutlass::epilogue::warp::TileIteratorTensorOpMixed<
-    WarpShape,
-    InstructionShape,
-    float,
-    32,
-    cutlass::sizeof_bits<ElementOutput>::value,
-    ElementsPerAccess,
-    8
-  >;
-
-  using WarpTileIteratorNotMixed =  cutlass::epilogue::warp::TileIteratorTensorOp<
-    WarpShape,
-    InstructionShape,
-    float,
-    layout::RowMajor
-  >;
-
-  using WarpTileIterator = typename platform::conditional<
-                             (ThreadblockShape::kN == 256) || (ThreadblockShape::kN == 128 && ElementsPerAccess == 8) || (ElementsPerAccess == 4),
-                             WarpTileIteratorNotMixed,
-                             WarpTileIteratorMixed>::type;
-
-  using SharedLoadIteratorMixed = cutlass::epilogue::threadblock::SharedLoadIteratorMixed<
-    ThreadMap,
-    float,
-    32,
-    cutlass::sizeof_bits<ElementOutput>::value,
-    ElementsPerAccess,
-    8
-  >;
-
-  using SharedLoadIteratorNotMixed = cutlass::epilogue::threadblock::SharedLoadIterator<
-    ThreadMap,
-    float
-  >;
-
-  using SharedLoadIterator = typename platform::conditional<
-                             (ThreadblockShape::kN == 256) || (ThreadblockShape::kN == 128 && ElementsPerAccess == 8) || (ElementsPerAccess == 4),
-                             SharedLoadIteratorNotMixed,
-                             SharedLoadIteratorMixed>::type;
-
-  static int const kFragmentsPerIteration = 1;
-};
-
-} // namespace detail
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Defines sensible defaults for epilogues for TensorOps.
-template <
-  typename Shape_,
-  typename WarpMmaTensorOp_,
-  int PartitionsK,
-  typename OutputOp_,
-  int ElementsPerAccess,
-  bool ScatterD = false,
-  typename PermuteDLayout = layout::NoPermute,
-  conv::StrideSupport StrideSupport = conv::StrideSupport::kUnity,
-  int Rank = 4
->
-struct DefaultEpilogueTensorOp {
-
-  using Shape = Shape_;
-  using WarpMmaTensorOp = WarpMmaTensorOp_;
-  static int const kPartitionsK = PartitionsK;
-  using OutputOp = OutputOp_;
-  static int const kElementsPerAccess = ElementsPerAccess;
-
-  using ElementOutput = typename OutputOp::ElementOutput;
-  using LayoutC = typename WarpMmaTensorOp::LayoutC;
-  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
-  static conv::StrideSupport const kStrideSupport = StrideSupport;
-  static int const kRank = Rank;
-
-  //
-  // Thread map
-  //
-
-  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
-    Shape,
-    typename WarpMmaTensorOp::Shape,
-    kPartitionsK,
-    ElementOutput,
-    kElementsPerAccess
-  >::Type;
-
-  static bool const UseCUDAStore = platform::is_same<ElementOutput, double>::value;
-
-  using PackedOutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
-    OutputTileThreadMap,
-    ElementOutput,
-    ScatterD,
-    PermuteDLayout,
-    UseCUDAStore
-  >;
-
-  using StridedOutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorConv<
-    OutputTileThreadMap,
-    ElementOutput,
-    ScatterD,
-    PermuteDLayout,
-    UseCUDAStore,
-    kRank
-  >;
-
-  using OutputTileIterator = typename platform::conditional<StrideSupport == cutlass::conv::StrideSupport::kUnity,
-                                                            PackedOutputTileIterator,
-                                                            StridedOutputTileIterator>::type;
-
-  using AccumulatorFragmentIterator = typename platform::conditional<is_complex<ElementOutput>::value,
-                                    cutlass::epilogue::warp::FragmentIteratorComplexTensorOp<
-                                        typename WarpMmaTensorOp::Shape,
-                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
-                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
-                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
-                                        LayoutC>,
-                                    cutlass::epilogue::warp::FragmentIteratorTensorOp<
-                                        typename WarpMmaTensorOp::Shape,
-                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
-                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
-                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
-                                        LayoutC> >::type;
-
-  /// Support several implementations depending on structure of epilogue
-  using DefaultIterators = detail::DefaultIteratorsTensorOp<
-    ElementOutput,
-    ElementAccumulator,
-    kElementsPerAccess,
-    Shape,
-    typename WarpMmaTensorOp::Shape,
-    typename WarpMmaTensorOp::Policy::Operator::Shape,
-    typename OutputTileThreadMap::CompactedThreadMap
-  >;
-
-  using WarpTileIterator = typename DefaultIterators::WarpTileIterator;
-  using SharedLoadIterator = typename DefaultIterators::SharedLoadIterator;
-
-  /// Hard-coded padding elements added 
-  using Padding = cutlass::MatrixShape<0, 64 / sizeof_bits<ElementAccumulator>::value * 4>;
-
-  static int const kFragmentsPerIteration = (kPartitionsK == 1 ? DefaultIterators::kFragmentsPerIteration : 1);
-
-  //
-  // Define the epilogue
-  //
-  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
-    Shape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    OutputTileIterator,
-    AccumulatorFragmentIterator,
-    WarpTileIterator,
-    SharedLoadIterator,
-    OutputOp,
-    Padding,
-    kFragmentsPerIteration
-  >;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Defines sensible defaults for epilogues for TensorOps.
-template <
-  typename Shape_,
-  typename WarpMmaTensorOp_,
-  int PartitionsK,
-  typename OutputOp_,
-  int ElementsPerAccess
->
-struct DefaultEpilogueTensorOpStridedDgrad {
-
-  using Shape = Shape_;
-  using WarpMmaTensorOp = WarpMmaTensorOp_;
-  static int const kPartitionsK = PartitionsK;
-  using OutputOp = OutputOp_;
-  static int const kElementsPerAccess = ElementsPerAccess;
-
-  using ElementOutput = typename OutputOp::ElementOutput;
-  using LayoutC = typename WarpMmaTensorOp::LayoutC;
-  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
-
-  //
-  // Thread map
-  //
-
-  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
-    Shape,
-    typename WarpMmaTensorOp::Shape,
-    kPartitionsK,
-    ElementOutput,
-    kElementsPerAccess
-  >::Type;
-
-  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorStridedDgrad<
-    OutputTileThreadMap,
-    ElementOutput
-  >;
-
-  using AccumulatorFragmentIterator = typename platform::conditional<is_complex<ElementOutput>::value,
-                                    cutlass::epilogue::warp::FragmentIteratorComplexTensorOp<
-                                        typename WarpMmaTensorOp::Shape,
-                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
-                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
-                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
-                                        LayoutC>,
-                                    cutlass::epilogue::warp::FragmentIteratorTensorOp<
-                                        typename WarpMmaTensorOp::Shape,
-                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
-                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
-                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
-                                        LayoutC> >::type;
-
-  /// Support several implementations depending on structure of epilogue
-  using DefaultIterators = detail::DefaultIteratorsTensorOp<
-    ElementOutput,
-    ElementAccumulator,
-    kElementsPerAccess,
-    Shape,
-    typename WarpMmaTensorOp::Shape,
-    typename WarpMmaTensorOp::Policy::Operator::Shape,
-    typename OutputTileThreadMap::CompactedThreadMap
-  >;
-
-  using WarpTileIterator = typename DefaultIterators::WarpTileIterator;
-  using SharedLoadIterator = typename DefaultIterators::SharedLoadIterator;
-
-  /// Hard-coded padding elements added 
-  using Padding = cutlass::MatrixShape<0, 64 / sizeof_bits<ElementAccumulator>::value * 4>;
-
-  static int const kFragmentsPerIteration = (kPartitionsK == 1 ? DefaultIterators::kFragmentsPerIteration : 1);
-
-  //
-  // Define the epilogue
-  //
-  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
-    Shape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    OutputTileIterator,
-    AccumulatorFragmentIterator,
-    WarpTileIterator,
-    SharedLoadIterator,
-    OutputOp,
-    Padding,
-    kFragmentsPerIteration
-  >;
-};
-
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Defines sensible defaults for epilogues for TensorOps.
-template <
-  int Rank,
-  typename Shape_,
-  typename WarpMmaTensorOp_,
-  int PartitionsK,
-  typename OutputOp_,
-  int ElementsPerAccess
->
-struct DefaultEpilogueTensorOpAffineRankN {
-
-  using Shape = Shape_;
-  using WarpMmaTensorOp = WarpMmaTensorOp_;
-  static int const kPartitionsK = PartitionsK;
-  using OutputOp = OutputOp_;
-  static int const kElementsPerAccess = ElementsPerAccess;
-
-  using ElementOutput = typename OutputOp::ElementOutput;
-  using LayoutC = typename WarpMmaTensorOp::LayoutC;
-  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
-
-  //
-  // Thread map
-  //
-
-  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
-    Shape,
-    typename WarpMmaTensorOp::Shape,
-    kPartitionsK,
-    ElementOutput,
-    kElementsPerAccess
-  >::Type;
-
-  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorAffineRankN<
-    OutputTileThreadMap,
-    ElementOutput,
-    Rank
-  >;
-
-  // Map to the row major iterator since the iterator selection for affineN is the same.
-  using AccumulatorFragmentIterator = typename platform::conditional<is_complex<ElementOutput>::value,
-                                    cutlass::epilogue::warp::FragmentIteratorComplexTensorOp<
-                                        typename WarpMmaTensorOp::Shape,
-                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
-                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
-                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
-                                        layout::RowMajor>,
-                                    cutlass::epilogue::warp::FragmentIteratorTensorOp<
-                                        typename WarpMmaTensorOp::Shape,
-                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
-                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
-                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
-                                        layout::RowMajor> >::type;
-
-  /// Support several implementations depending on structure of epilogue
-  using DefaultIterators = detail::DefaultIteratorsTensorOp<
-    ElementOutput,
-    ElementAccumulator,
-    kElementsPerAccess,
-    Shape,
-    typename WarpMmaTensorOp::Shape,
-    typename WarpMmaTensorOp::Policy::Operator::Shape,
-    typename OutputTileThreadMap::CompactedThreadMap
-  >;
-
-  using WarpTileIterator = typename DefaultIterators::WarpTileIterator;
-  using SharedLoadIterator = typename DefaultIterators::SharedLoadIterator;
-
-  /// Hard-coded padding elements added 
-  using Padding = cutlass::MatrixShape<0, 64 / sizeof_bits<ElementAccumulator>::value * 4>;
-
-  static int const kFragmentsPerIteration = (kPartitionsK == 1 ? DefaultIterators::kFragmentsPerIteration : 1);
-
-  //
-  // Define the epilogue
-  //
-  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
-    Shape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    OutputTileIterator,
-    AccumulatorFragmentIterator,
-    WarpTileIterator,
-    SharedLoadIterator,
-    OutputOp,
-    Padding,
-    kFragmentsPerIteration
-  >;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-/// Defines sensible defaults for epilogues for TensorOps which uses
-/// intereleaved output layout. For this case, shared memory is not needed.
-template <typename Shape_, typename WarpMmaTensorOp_, int PartitionsK,
-          typename OutputOp_, int ElementsPerAccess, int InterleavedK,
-          bool isSplitK = false>
-struct DefaultInterleavedEpilogueTensorOp {
-  using Shape = Shape_;
-  using WarpMmaTensorOp = WarpMmaTensorOp_;
-  static int const kPartitionsK = PartitionsK;
-  using OutputOp = OutputOp_;
-  static int const kElementsPerAccess = ElementsPerAccess;
-
-  using ElementOutput = typename OutputOp::ElementOutput;
-  using LayoutC = typename WarpMmaTensorOp::LayoutC;
-  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
-
-  //
-  // Thread map
-  //
-  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::
-      DefaultInterleavedThreadMapTensorOp<
-          Shape, typename WarpMmaTensorOp::Shape, kPartitionsK, ElementOutput,
-          kElementsPerAccess, InterleavedK>::Type;
-
-  using OutputTileIterator =
-      cutlass::epilogue::threadblock::InterleavedPredicatedTileIterator<
-          OutputTileThreadMap, ElementOutput, InterleavedK>;
-
-  using AccumulatorFragmentIterator =
-      cutlass::epilogue::warp::FragmentIteratorTensorOp<
-          typename WarpMmaTensorOp::Shape,
-          typename WarpMmaTensorOp::Policy::Operator::Shape,
-          typename WarpMmaTensorOp::Policy::Operator::ElementC,
-          typename WarpMmaTensorOp::Policy::Operator::FragmentC,
-          LayoutC>;
-
-  //
-  // Define the epilogue
-  //
-  using Epilogue = cutlass::epilogue::threadblock::InterleavedEpilogue<
-      Shape, WarpMmaTensorOp, kPartitionsK, OutputTileIterator,
-      AccumulatorFragmentIterator, OutputOp, InterleavedK>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Defines sensible defaults for epilogues for TensorOps which uses
-/// intereleaved output layout. For this case, shared memory is not needed.
-template <typename Shape_, typename WarpMmaTensorOp_, int PartitionsK,
-          typename OutputOp_, int ElementsPerAccess, int InterleavedK,
-          bool isSplitK = false>
-struct DefaultInterleavedConvEpilogue {
-  using Shape = Shape_;
-  using WarpMmaTensorOp = WarpMmaTensorOp_;
-  static int const kPartitionsK = PartitionsK;
-  using OutputOp = OutputOp_;
-  static int const kElementsPerAccess = ElementsPerAccess;
-
-  using ElementOutput = typename OutputOp::ElementOutput;
-  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
-
-  //
-  // Thread map
-  //
-  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::
-      DefaultInterleavedConvThreadMapTensorOp<
-          Shape, typename WarpMmaTensorOp::Shape, kPartitionsK, ElementOutput,
-          kElementsPerAccess, InterleavedK>::Type;
-
-  using OutputTileIterator =
-      cutlass::epilogue::threadblock::InterleavedConvPredicatedTileIterator<
-          OutputTileThreadMap, ElementOutput, InterleavedK>;
-
-  using AccumulatorFragmentIterator =
-      cutlass::epilogue::warp::FragmentIteratorTensorOp<
-          typename WarpMmaTensorOp::Shape,
-          typename WarpMmaTensorOp::Policy::Operator::Shape,
-          typename WarpMmaTensorOp::Policy::Operator::ElementC,
-          typename WarpMmaTensorOp::Policy::Operator::FragmentC,
-          // can reuse the gemm version here to do element selection
-          layout::ColumnMajorInterleaved<InterleavedK>>;
-
-  //
-  // Define the epilogue
-  //
-  using Epilogue = cutlass::epilogue::threadblock::InterleavedEpilogue<
-      Shape, WarpMmaTensorOp, kPartitionsK, OutputTileIterator,
-      AccumulatorFragmentIterator, OutputOp, InterleavedK>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op_blas3.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op_blas3.h
deleted file mode 100755
index e1ae5a24c..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op_blas3.h
+++ /dev/null
@@ -1,175 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
-
-  The epilogue rearranges the result of a matrix product through shared memory to match canonical
-  tensor layouts in global memory. Epilogues support conversion and reduction operations.
-
-  
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-
-#include "cutlass/gemm/gemm.h"
-
-#include "cutlass/epilogue/thread/linear_combination.h"
-#include "cutlass/epilogue/thread/linear_combination_clamp.h"
-#include "cutlass/epilogue/thread/linear_combination_relu.h"
-#include "cutlass/epilogue/thread/linear_combination_gelu.h"
-#include "cutlass/epilogue/thread/linear_combination_sigmoid.h"
-#include "cutlass/epilogue/thread/linear_combination_planar_complex.h"
-
-#include "cutlass/epilogue/thread/conversion_op.h"
-#include "cutlass/epilogue/thread/reduction_op.h"
-
-#include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h"
-
-#include "cutlass/epilogue/warp/fragment_iterator_tensor_op.h"
-#include "cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h"
-#include "cutlass/epilogue/warp/tile_iterator_tensor_op.h"
-#include "cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h"
-#include "cutlass/epilogue/threadblock/default_thread_map_tensor_op.h"
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator_blas3.h"
-#include "cutlass/epilogue/threadblock/shared_load_iterator.h"
-#include "cutlass/epilogue/threadblock/shared_load_iterator_mixed.h"
-
-#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
-#include "cutlass/epilogue/threadblock/epilogue.h"
-#include "cutlass/epilogue/threadblock/interleaved_epilogue.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Defines sensible defaults for epilogues for TensorOps.
-template <
-  typename Shape_,
-  typename WarpMmaTensorOp_,
-  int PartitionsK,
-  typename OutputOp_,
-  int ElementsPerAccess,
-  /// Is for a symmetric kernel
-  BlasMode BlasMode_ = BlasMode::kGemm
->
-struct DefaultEpilogueTensorOpBlas3 {
-
-  using Shape = Shape_;
-  using WarpMmaTensorOp = WarpMmaTensorOp_;
-  static int const kPartitionsK = PartitionsK;
-  using OutputOp = OutputOp_;
-  static int const kElementsPerAccess = ElementsPerAccess;
-  static BlasMode const kBlasMode = BlasMode_;
-
-  using ElementOutput = typename OutputOp::ElementOutput;
-  using LayoutC = typename WarpMmaTensorOp::LayoutC;
-  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
-
-  //
-  // Thread map
-  //
-
-  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
-    Shape,
-    typename WarpMmaTensorOp::Shape,
-    kPartitionsK,
-    ElementOutput,
-    kElementsPerAccess
-  >::Type;
-
-  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorBlas3<
-    OutputTileThreadMap,
-    ElementOutput,
-    kBlasMode
-  >;
-
-  using AccumulatorFragmentIterator = typename platform::conditional<is_complex<ElementOutput>::value,
-                                    cutlass::epilogue::warp::FragmentIteratorComplexTensorOp<
-                                        typename WarpMmaTensorOp::Shape,
-                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
-                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
-                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
-                                        LayoutC>,
-                                    cutlass::epilogue::warp::FragmentIteratorTensorOp<
-                                        typename WarpMmaTensorOp::Shape,
-                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
-                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
-                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
-                                        LayoutC> >::type;
-
-  /// Support several implementations depending on structure of epilogue
-  using DefaultIterators = detail::DefaultIteratorsTensorOp<
-    ElementOutput,
-    ElementAccumulator,
-    kElementsPerAccess,
-    Shape,
-    typename WarpMmaTensorOp::Shape,
-    typename WarpMmaTensorOp::Policy::Operator::Shape,
-    typename OutputTileThreadMap::CompactedThreadMap
-  >;
-
-  using WarpTileIterator = typename DefaultIterators::WarpTileIterator;
-  using SharedLoadIterator = typename DefaultIterators::SharedLoadIterator;
-
-  /// Hard-coded padding elements added 
-  using Padding = cutlass::MatrixShape<0, 64 / sizeof_bits<ElementAccumulator>::value * 4>;
-
-  //
-  // Define the epilogue
-  //
-  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
-    Shape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    OutputTileIterator,
-    AccumulatorFragmentIterator,
-    WarpTileIterator,
-    SharedLoadIterator,
-    OutputOp,
-    Padding
-  >;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h
deleted file mode 100755
index f73edfdec..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h
+++ /dev/null
@@ -1,337 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops on Volta.
-
-  The epilogue rearranges the result of a matrix product through shared memory to match canonical
-  tensor layouts in global memory. Epilogues support conversion and reduction operations.
-
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-
-#include "cutlass/gemm/gemm.h"
-
-#include "cutlass/epilogue/thread/linear_combination.h"
-#include "cutlass/epilogue/thread/linear_combination_clamp.h"
-#include "cutlass/epilogue/thread/linear_combination_relu.h"
-#include "cutlass/epilogue/thread/linear_combination_gelu.h"
-#include "cutlass/epilogue/thread/linear_combination_sigmoid.h"
-#include "cutlass/epilogue/thread/linear_combination_planar_complex.h"
-
-#include "cutlass/epilogue/thread/conversion_op.h"
-#include "cutlass/epilogue/thread/reduction_op.h"
-
-#include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h"
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator_strided_dgrad.h"
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator_affine.h"
-#include "cutlass/epilogue/threadblock/shared_load_iterator.h"
-
-#include "cutlass/epilogue/warp/fragment_iterator_volta_tensor_op.h"
-#include "cutlass/epilogue/warp/tile_iterator_volta_tensor_op.h"
-#include "cutlass/epilogue/threadblock/default_thread_map_volta_tensor_op.h"
-
-#include "cutlass/epilogue/threadblock/epilogue.h"
-
-#include "cutlass/layout/permute.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines sensible defaults for epilogues for TensorOps.
-template <
-  typename Shape_,
-  typename WarpMmaTensorOp_,
-  int PartitionsK,
-  typename OutputOp_,
-  int ElementsPerAccess,
-  bool ScatterD = false,
-  typename PermuteDLayout = layout::NoPermute
->
-struct DefaultEpilogueVoltaTensorOp {
-
-  using Shape = Shape_;
-  using WarpMmaTensorOp = WarpMmaTensorOp_;
-  static int const kPartitionsK = PartitionsK;
-  using OutputOp = OutputOp_;
-  static int const kElementsPerAccess = ElementsPerAccess;
-
-  using ElementOutput = typename OutputOp::ElementOutput;
-  using LayoutC = typename WarpMmaTensorOp::LayoutC;
-  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
-
-  //
-  // Thread map
-  //
-
-  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapVoltaTensorOp<
-    Shape,
-    typename WarpMmaTensorOp::Shape,
-    kPartitionsK,
-    ElementOutput,
-    kElementsPerAccess,
-    ElementAccumulator
-  >::Type;
-
-  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
-    OutputTileThreadMap,
-    ElementOutput,
-    ScatterD,
-    PermuteDLayout
-  >;
-
-  using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorVoltaTensorOp<
-    typename WarpMmaTensorOp::Shape,
-    gemm::GemmShape<32, 32, 4>,
-    ElementAccumulator,
-    LayoutC
-  >;
-
-  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorVoltaTensorOp<
-    typename WarpMmaTensorOp::Shape,
-    gemm::GemmShape<32, 32, 4>,
-    ElementAccumulator,
-    LayoutC
-  >;
-
-  static int const kSharedMemAlignment = sizeof_bits<ElementAccumulator>::value * WarpTileIterator::kElementsPerAccess / 8;
-
-  static_assert(kSharedMemAlignment == 8, "Shared memory alignment must be 8B");
-
-  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
-    typename OutputTileThreadMap::CompactedThreadMap,
-    ElementAccumulator,
-    kSharedMemAlignment
-  >;
-
-  /// Hard-coded padding elements added 
-  using Padding = typename WarpTileIterator::Padding;
-
-  //
-  // Define the epilogue
-  //
-  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
-    Shape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    OutputTileIterator,
-    AccumulatorFragmentIterator,
-    WarpTileIterator,
-    SharedLoadIterator,
-    OutputOp,
-    Padding
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines sensible defaults for epilogues for TensorOps.
-template <
-  typename Shape_,
-  typename WarpMmaTensorOp_,
-  int PartitionsK,
-  typename OutputOp_,
-  int ElementsPerAccess
->
-struct DefaultEpilogueVoltaTensorOpStridedDgrad {
-
-  using Shape = Shape_;
-  using WarpMmaTensorOp = WarpMmaTensorOp_;
-  static int const kPartitionsK = PartitionsK;
-  using OutputOp = OutputOp_;
-  static int const kElementsPerAccess = ElementsPerAccess;
-
-  using ElementOutput = typename OutputOp::ElementOutput;
-  using LayoutC = typename WarpMmaTensorOp::LayoutC;
-  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
-
-  //
-  // Thread map
-  //
-
-  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapVoltaTensorOp<
-    Shape,
-    typename WarpMmaTensorOp::Shape,
-    kPartitionsK,
-    ElementOutput,
-    kElementsPerAccess,
-    ElementAccumulator
-  >::Type;
-
-  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorStridedDgrad<
-    OutputTileThreadMap,
-    ElementOutput
-  >;
-
-  using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorVoltaTensorOp<
-    typename WarpMmaTensorOp::Shape,
-    gemm::GemmShape<32, 32, 4>,
-    ElementAccumulator,
-    LayoutC
-  >;
-
-  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorVoltaTensorOp<
-    typename WarpMmaTensorOp::Shape,
-    gemm::GemmShape<32, 32, 4>,
-    ElementAccumulator,
-    LayoutC
-  >;
-
-  static int const kSharedMemAlignment = sizeof_bits<ElementAccumulator>::value * WarpTileIterator::kElementsPerAccess / 8;
-
-  static_assert(kSharedMemAlignment == 8, "Shared memory alignment must be 8B");
-
-  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
-    typename OutputTileThreadMap::CompactedThreadMap,
-    ElementAccumulator,
-    kSharedMemAlignment
-  >;
-
-  /// Hard-coded padding elements added 
-  using Padding = typename WarpTileIterator::Padding;
-
-  //
-  // Define the epilogue
-  //
-  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
-    Shape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    OutputTileIterator,
-    AccumulatorFragmentIterator,
-    WarpTileIterator,
-    SharedLoadIterator,
-    OutputOp,
-    Padding
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines sensible defaults for epilogues for TensorOps.
-template <
-  int Rank,
-  typename Shape_,
-  typename WarpMmaTensorOp_,
-  int PartitionsK,
-  typename OutputOp_,
-  int ElementsPerAccess
->
-struct DefaultEpilogueVoltaTensorOpAffineRankN {
-
-  using Shape = Shape_;
-  using WarpMmaTensorOp = WarpMmaTensorOp_;
-  static int const kPartitionsK = PartitionsK;
-  using OutputOp = OutputOp_;
-  static int const kElementsPerAccess = ElementsPerAccess;
-
-  using ElementOutput = typename OutputOp::ElementOutput;
-  using LayoutC = typename WarpMmaTensorOp::LayoutC;
-  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
-
-  //
-  // Thread map
-  //
-
-  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapVoltaTensorOp<
-    Shape,
-    typename WarpMmaTensorOp::Shape,
-    kPartitionsK,
-    ElementOutput,
-    kElementsPerAccess,
-    ElementAccumulator
-  >::Type;
-
-  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorAffineRankN<
-    OutputTileThreadMap,
-    ElementOutput,
-    Rank
-  >;
-
-  using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorVoltaTensorOp<
-    typename WarpMmaTensorOp::Shape,
-    gemm::GemmShape<32, 32, 4>,
-    ElementAccumulator,
-    LayoutC
-  >;
-
-  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorVoltaTensorOp<
-    typename WarpMmaTensorOp::Shape,
-    gemm::GemmShape<32, 32, 4>,
-    ElementAccumulator,
-    LayoutC
-  >;
-
-  static int const kSharedMemAlignment = sizeof_bits<ElementAccumulator>::value * WarpTileIterator::kElementsPerAccess / 8;
-
-  static_assert(kSharedMemAlignment == 8, "Shared memory alignment must be 8B");
-
-  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
-    typename OutputTileThreadMap::CompactedThreadMap,
-    ElementAccumulator,
-    kSharedMemAlignment
-  >;
-
-  /// Hard-coded padding elements added 
-  using Padding = typename WarpTileIterator::Padding;
-
-  //
-  // Define the epilogue
-  //
-  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
-    Shape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    OutputTileIterator,
-    AccumulatorFragmentIterator,
-    WarpTileIterator,
-    SharedLoadIterator,
-    OutputOp,
-    Padding
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_with_absmax.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_with_absmax.h
deleted file mode 100755
index b0e89a4ed..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_with_absmax.h
+++ /dev/null
@@ -1,126 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-  \brief Default configuration for epilogue computing absolute maximum of output and auxiliary outputs.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-
-#include "cutlass/gemm/gemm.h"
-
-#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
-#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
-#include "cutlass/epilogue/threadblock/epilogue.h"
-#include "cutlass/epilogue/threadblock/epilogue_with_absmax.h"
-
-#include "cutlass/layout/permute.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Defines sensible defaults for absolute-maximum-computing  epilogues with TensorOps
-template <
-  typename Shape,
-  typename WarpMmaTensorOp,
-  int PartitionsK,
-  typename ElementOutput,
-  typename ElementAuxOutput,
-  typename ElementVector,
-  typename OutputOp,
-  int ElementsPerAccess,
-  bool ScatterD = false,
-  typename PermuteDLayout = layout::NoPermute
->
-struct DefaultEpilogueWithAbsMax {
-
-  /// Use defaults related to the existing epilogue
-  using Base = DefaultEpilogueTensorOp<
-    Shape,
-    WarpMmaTensorOp,
-    PartitionsK,
-    OutputOp,
-    ElementsPerAccess
-  >;
-
-  //
-  // Stores the output
-  //
-  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
-    typename Base::OutputTileThreadMap,
-    ElementOutput,
-    ScatterD,
-    PermuteDLayout
-  >;
-
-  //
-  // Stores the auxiliary output
-  //
-  using AuxOutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
-    typename Base::OutputTileThreadMap,
-    ElementAuxOutput,
-    ScatterD,
-    PermuteDLayout
-  >;
-
-  /// Define the epilogue
-  using Epilogue = EpilogueWithAbsMax<
-    Shape,
-    WarpMmaTensorOp,
-    PartitionsK,
-    OutputTileIterator,
-    AuxOutputTileIterator,
-    ElementVector,
-    typename Base::AccumulatorFragmentIterator,
-    typename Base::WarpTileIterator,
-    typename Base::SharedLoadIterator,
-    OutputOp,
-    typename Base::Padding,
-    Base::kFragmentsPerIteration
-  >;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h
deleted file mode 100755
index 16e045e1e..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h
+++ /dev/null
@@ -1,376 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
-
-  The epilogue rearranges the result of a matrix product through shared memory to match canonical
-  tensor layouts in global memory. Epilogues support conversion and reduction operations.
-
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-
-#include "cutlass/gemm/gemm.h"
-
-#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
-#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
-#include "cutlass/epilogue/threadblock/epilogue.h"
-#include "cutlass/epilogue/threadblock/epilogue_with_broadcast.h"
-#include "cutlass/epilogue/threadblock/epilogue_streamk_with_broadcast.h"
-
-#include "cutlass/layout/permute.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-////////////////////////////////////////////////////////////////////////////////
-
-/// Defines sensible defaults for epilogues for SimtOps.
-template <
-  typename Shape,
-  typename WarpMmaSimt,
-  typename ElementOutput,
-  typename ElementTensor,
-  typename ElementVector,
-  typename OutputOp,
-  int ElementsPerAccess,
-  bool ScatterD = false,
-  typename PermuteDLayout = layout::NoPermute,
-  conv::StrideSupport StrideSupport = conv::StrideSupport::kUnity,
-  int Rank = 4
->
-struct DefaultEpilogueWithBroadcastSimt {
-
-  static conv::StrideSupport const kStrideSupport = StrideSupport;
-  static int const kRank = Rank;
-
-  static bool const UseCUDAStore = platform::is_same<ElementOutput, double>::value;
-
-  /// Use defaults related to the existing epilogue
-  using Base = DefaultEpilogueSimt<
-    Shape,
-    WarpMmaSimt,
-    OutputOp,
-    ElementsPerAccess
-  >;
-
-  using PackedOutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
-    typename Base::OutputTileThreadMap,
-    ElementOutput,
-    ScatterD,
-    PermuteDLayout,
-    UseCUDAStore
-  >;
-
-  using StridedOutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorConv<
-    typename Base::OutputTileThreadMap,
-    ElementOutput,
-    ScatterD,
-    PermuteDLayout,
-    UseCUDAStore,
-    kRank
-  >;
-
-  //
-  // Stores the result z = (y = GEMM(A, B, C), broadcast)
-  //
-  using OutputTileIterator = typename platform::conditional<StrideSupport == cutlass::conv::StrideSupport::kUnity,
-                                                            PackedOutputTileIterator,
-                                                            StridedOutputTileIterator>::type;
-
-  //
-  // Additional tensor tile iterator - stores t = Elementwise(z)
-  //
-  using TensorTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
-    typename Base::OutputTileThreadMap,
-    ElementTensor
-  >;
-  /// Define the epilogue
-  using Epilogue = EpilogueWithBroadcast<
-    Shape,
-    WarpMmaSimt,
-    Base::kPartitionsK,
-    OutputTileIterator,
-    TensorTileIterator,
-    ElementVector,
-    typename Base::AccumulatorFragmentIterator,
-    typename Base::WarpTileIterator,
-    typename Base::SharedLoadIterator,
-    OutputOp,
-    typename Base::Padding
-  >;
-};
-////////////////////////////////////////////////////////////////////////////////
-
-/// Defines sensible defaults for strided dgrad epilogues for SimtOps.
-template <
-  typename Shape,
-  typename WarpMmaSimt,
-  typename ElementOutput,
-  typename ElementTensor,
-  typename ElementVector,
-  typename OutputOp,
-  int ElementsPerAccess,
-  bool ScatterD = false,
-  typename PermuteDLayout = layout::NoPermute
->
-struct DefaultEpilogueWithBroadcastSimtStridedDgrad {
-
-  /// Use defaults related to the existing epilogue
-  using Base = DefaultEpilogueSimtStridedDgrad<
-    Shape,
-    WarpMmaSimt,
-    OutputOp,
-    ElementsPerAccess
-  >;
-
-  //
-  // Stores the result z = (y = GEMM(A, B, C), broadcast)
-  //
-  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorStridedDgrad<
-    typename Base::OutputTileThreadMap,
-    ElementOutput
-  >;
-
-  //
-  // Additional tensor tile iterator - stores t = Elementwise(z)
-  //
-  using TensorTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorStridedDgrad<
-    typename Base::OutputTileThreadMap,
-    ElementTensor
-  >;
-
-  /// Define the epilogue
-  using Epilogue = EpilogueWithBroadcast<
-    Shape,
-    WarpMmaSimt,
-    Base::kPartitionsK,
-    OutputTileIterator,
-    TensorTileIterator,
-    ElementVector,
-    typename Base::AccumulatorFragmentIterator,
-    typename Base::WarpTileIterator,
-    typename Base::SharedLoadIterator,
-    OutputOp,
-    typename Base::Padding
-  >;
-};
-////////////////////////////////////////////////////////////////////////////////
-
-/// Defines sensible defaults for epilogues for TensorOps.
-template <
-  typename Shape,
-  typename WarpMmaTensorOp,
-  int PartitionsK,
-  typename ElementOutput,
-  typename ElementTensor,
-  typename ElementVector,
-  typename OutputOp,
-  int ElementsPerAccess,
-  bool ScatterD = false,
-  typename PermuteDLayout = layout::NoPermute
->
-struct DefaultEpilogueWithBroadcastTensorOp {
-
-  /// Use defaults related to the existing epilogue
-  using Base = DefaultEpilogueTensorOp<
-    Shape,
-    WarpMmaTensorOp,
-    PartitionsK,
-    OutputOp,
-    ElementsPerAccess
-  >;
-
-  //
-  // Stores the result z = (y = GEMM(A, B, C), broadcast)
-  //
-  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
-    typename Base::OutputTileThreadMap,
-    ElementOutput,
-    ScatterD,
-    PermuteDLayout
-  >;
-
-  //
-  // Additional tensor tile iterator - stores t = Elementwise(z)
-  //
-  using TensorTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
-    typename Base::OutputTileThreadMap,
-    ElementTensor
-  >;
-
-  /// Define the epilogue
-  using Epilogue = EpilogueWithBroadcast<
-    Shape,
-    WarpMmaTensorOp,
-    PartitionsK,
-    OutputTileIterator,
-    TensorTileIterator,
-    ElementVector,
-    typename Base::AccumulatorFragmentIterator,
-    typename Base::WarpTileIterator,
-    typename Base::SharedLoadIterator,
-    OutputOp,
-    typename Base::Padding,
-    Base::kFragmentsPerIteration
-  >;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Defines sensible defaults for streamk epilogues for TensorOps.
-template <
-  typename Shape,
-  typename WarpMmaTensorOp,
-  int PartitionsK,
-  typename ElementOutput,
-  typename ElementTensor,
-  typename ElementVector,
-  typename OutputOp,
-  int ElementsPerAccess,
-  bool ScatterD = false,
-  typename PermuteDLayout = layout::NoPermute
->
-struct DefaultStreamkEpilogueWithBroadcastTensorOp {
-
-  /// Use defaults related to the existing epilogue
-  using Base = DefaultEpilogueTensorOp<
-    Shape,
-    WarpMmaTensorOp,
-    PartitionsK,
-    OutputOp,
-    ElementsPerAccess
-  >;
-
-  //
-  // Stores the result z = (y = GEMM(A, B, C), broadcast)
-  //
-  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
-    typename Base::OutputTileThreadMap,
-    ElementOutput,
-    ScatterD,
-    PermuteDLayout
-  >;
-
-  //
-  // Additional tensor tile iterator - stores t = Elementwise(z)
-  //
-  using TensorTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
-    typename Base::OutputTileThreadMap,
-    ElementTensor
-  >;
-
-  /// Define the epilogue
-  using Epilogue = EpilogueStreamkWithBroadcast<
-    Shape,
-    WarpMmaTensorOp,
-    PartitionsK,
-    OutputTileIterator,
-    TensorTileIterator,
-    ElementVector,
-    typename Base::AccumulatorFragmentIterator,
-    typename Base::WarpTileIterator,
-    typename Base::SharedLoadIterator,
-    OutputOp,
-    typename Base::Padding,
-    Base::kFragmentsPerIteration
-  >;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Defines sensible defaults for epilogues for VoltaTensorOps.
-template <
-  typename Shape,
-  typename WarpMmaTensorOp,
-  int PartitionsK,
-  typename ElementOutput,
-  typename ElementTensor,
-  typename ElementVector,
-  typename OutputOp,
-  int ElementsPerAccess
->
-struct DefaultEpilogueWithBroadcastVoltaTensorOp {
-
-  /// Use defaults related to the existing epilogue
-  using Base = DefaultEpilogueVoltaTensorOp<
-    Shape,
-    WarpMmaTensorOp,
-    PartitionsK,
-    OutputOp,
-    ElementsPerAccess
-  >;
-
-  //
-  // Stores the result z = (y = GEMM(A, B, C), broadcast)
-  //
-  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
-    typename Base::OutputTileThreadMap,
-    ElementOutput
-  >;
-
-  //
-  // Additional tensor tile iterator - stores t = Elementwise(z)
-  //
-  using TensorTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
-    typename Base::OutputTileThreadMap,
-    ElementTensor
-  >;
-
-  /// Define the epilogue
-  using Epilogue = EpilogueWithBroadcast<
-    Shape,
-    WarpMmaTensorOp,
-    PartitionsK,
-    OutputTileIterator,
-    TensorTileIterator,
-    ElementVector,
-    typename Base::AccumulatorFragmentIterator,
-    typename Base::WarpTileIterator,
-    typename Base::SharedLoadIterator,
-    OutputOp,
-    typename Base::Padding
-  >;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_with_reduction.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_with_reduction.h
deleted file mode 100755
index 34ecfb741..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_with_reduction.h
+++ /dev/null
@@ -1,177 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-
-  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
-
-  The epilogue rearranges the result of a matrix product through shared memory to match canonical
-  tensor layouts in global memory. Epilogues support conversion and reduction operations.
-
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-
-#include "cutlass/gemm/gemm.h"
-
-#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
-#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
-#include "cutlass/epilogue/threadblock/epilogue.h"
-#include "cutlass/epilogue/threadblock/epilogue_with_reduction.h"
-
-#include "cutlass/layout/permute.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Defines sensible defaults for epilogues for TensorOps.
-template <
-  typename Shape,
-  typename WarpMmaTensorOp,
-  int PartitionsK,
-  typename ElementOutput,
-  typename OutputOp,
-  typename ReductionOp,
-  int ElementsPerAccess,
-  bool ScatterD = false,
-  typename PermuteDLayout = layout::NoPermute
->
-struct DefaultEpilogueWithReductionTensorOp {
-
-  /// Use defaults related to the existing epilogue
-  using Base = DefaultEpilogueTensorOp<
-    Shape,
-    WarpMmaTensorOp,
-    PartitionsK,
-    OutputOp,
-    ElementsPerAccess
-  >;
-
-  /// Additional tensor tile iterator
-  using TensorTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
-    typename Base::OutputTileThreadMap,
-    typename OutputOp::ElementTensor
-  >;
-
-  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
-    typename Base::OutputTileThreadMap,
-    ElementOutput,
-    ScatterD,
-    PermuteDLayout
-  >;
-
-  /// Define the epilogue
-  using Epilogue = EpilogueWithReduction<
-    Shape,
-    WarpMmaTensorOp,
-    PartitionsK,
-    OutputTileIterator,
-    TensorTileIterator,
-    typename WarpMmaTensorOp::ElementC,
-    typename Base::AccumulatorFragmentIterator,
-    typename Base::WarpTileIterator,
-    typename Base::SharedLoadIterator,
-    typename Base::OutputOp,
-    ReductionOp,
-    typename Base::Padding
-  >;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Defines sensible defaults for epilogues for TensorOps.
-template <
-  typename Shape,
-  typename WarpMmaTensorOp,
-  int PartitionsK,
-  typename ElementOutput,
-  typename OutputOp,
-  typename ReductionOp,
-  int ElementsPerAccess,
-  bool ScatterD = false,
-  typename PermuteDLayout = layout::NoPermute
->
-struct DefaultEpilogueWithReductionVoltaTensorOp {
-
-  /// Use defaults related to the existing epilogue
-  using Base = DefaultEpilogueVoltaTensorOp<
-    Shape,
-    WarpMmaTensorOp,
-    PartitionsK,
-    OutputOp,
-    ElementsPerAccess
-  >;
-
-  /// Additional tensor tile iterator
-  using TensorTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
-    typename Base::OutputTileThreadMap,
-    typename OutputOp::ElementTensor
-  >;
-
-  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
-    typename Base::OutputTileThreadMap,
-    ElementOutput,
-    ScatterD,
-    PermuteDLayout
-  >;
-
-  /// Define the epilogue
-  using Epilogue = EpilogueWithReduction<
-    Shape,
-    WarpMmaTensorOp,
-    PartitionsK,
-    OutputTileIterator,
-    TensorTileIterator,
-    typename WarpMmaTensorOp::ElementC,
-    typename Base::AccumulatorFragmentIterator,
-    typename Base::WarpTileIterator,
-    typename Base::SharedLoadIterator,
-    typename Base::OutputOp,
-    ReductionOp,
-    typename Base::Padding
-  >;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h
deleted file mode 100755
index 3b1c5dc19..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h
+++ /dev/null
@@ -1,165 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Epilogue for threadblock scoped GEMMs using WMMA.
-
-  The epilogue rearranges the result of a matrix product through shared memory to match canonical
-  tensor layouts in global memory. Epilogues support conversion and reduction operations.
-
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-
-#include "cutlass/gemm/gemm.h"
-
-#include "cutlass/epilogue/thread/linear_combination.h"
-#include "cutlass/epilogue/thread/linear_combination_clamp.h"
-#include "cutlass/epilogue/thread/linear_combination_relu.h"
-#include "cutlass/epilogue/thread/linear_combination_gelu.h"
-#include "cutlass/epilogue/thread/linear_combination_sigmoid.h"
-#include "cutlass/epilogue/thread/linear_combination_planar_complex.h"
-
-#include "cutlass/epilogue/thread/conversion_op.h"
-#include "cutlass/epilogue/thread/reduction_op.h"
-
-#include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h"
-
-#include "cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h"
-#include "cutlass/epilogue/warp/tile_iterator_wmma_tensor_op.h"
-#include "cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h"
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
-#include "cutlass/epilogue/threadblock/shared_load_iterator.h"
-
-#include "cutlass/epilogue/threadblock/epilogue.h"
-
-#include "cutlass/layout/permute.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Defines sensible defaults for epilogues for WMMA TensorOps.
-template <
-  typename Shape_,
-  typename WarpMmaTensorOp_,
-  int PartitionsK,
-  typename OutputOp_,
-  int ElementsPerAccess,
-  bool ScatterD = false,
-  typename PermuteDLayout = layout::NoPermute
->
-struct DefaultEpilogueWmmaTensorOp {
-
-  using Shape = Shape_;
-  using WarpMmaTensorOp = WarpMmaTensorOp_;
-  static int const kPartitionsK = PartitionsK;
-  using OutputOp = OutputOp_;
-  static int const kElementsPerAccess = ElementsPerAccess;
-
-  using ElementOutput = typename OutputOp::ElementOutput;
-  using LayoutC = typename WarpMmaTensorOp::LayoutC;
-  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
-
-  //
-  // Thread map
-  //
-
-  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapWmmaTensorOp<
-    Shape,
-    typename WarpMmaTensorOp::Shape,
-    typename WarpMmaTensorOp::Policy::Operator::Shape,
-    kPartitionsK,
-    ElementOutput,
-    kElementsPerAccess
-  >::Type;
-
-  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
-    OutputTileThreadMap,
-    ElementOutput,
-    ScatterD,
-    PermuteDLayout
-  >;
-
-  using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorWmmaTensorOp<
-    typename WarpMmaTensorOp::Shape,
-    typename WarpMmaTensorOp::Policy::Operator::Shape,
-    typename WarpMmaTensorOp::Policy::Operator::ElementC,
-    typename WarpMmaTensorOp::Policy::Operator::FragmentC,
-    LayoutC
-  >;
-
-  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorWmmaTensorOp<
-    typename WarpMmaTensorOp::Shape,
-    typename WarpMmaTensorOp::Policy::Operator::Shape,
-    typename WarpMmaTensorOp::Policy::Operator::FragmentC,
-    LayoutC
-  >;
-
-  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
-    typename OutputTileThreadMap::CompactedThreadMap,
-    ElementAccumulator
-  >;
-
-  /// Hard-coded padding elements added 
-  using Padding = typename WarpTileIterator::Padding;
-
-  //
-  // Define the epilogue
-  //
-  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
-    Shape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    OutputTileIterator,
-    AccumulatorFragmentIterator,
-    WarpTileIterator,
-    SharedLoadIterator,
-    OutputOp,
-    Padding
-  >;
-};
-
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_simt.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_simt.h
deleted file mode 100755
index 2092caf4d..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_simt.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief 
-
-*/
-
-#pragma once
-
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
-#include "cutlass/gemm/gemm.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines the optimal thread map for SIMT accumulator layouts
-template <
-  typename ThreadblockShape_,
-  typename WarpShape_,
-  typename MmaSimtPolicy_,
-  int PartitionsK,
-  typename Element_,
-  int ElementsPerAccess
->
-struct DefaultThreadMapSimt {
-
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using MmaSimtPolicy = MmaSimtPolicy_;
-  static int const kPartitionsK = PartitionsK;
-  using Element = Element_;
-  static int const kElementsPerAccess = ElementsPerAccess;
-
-  //
-  // Definitions
-  //
-
-  struct Detail {
-
-    static int const kWarpSize = 32;
-
-    static_assert(
-      !(ThreadblockShape::kM % WarpShape::kM) &&
-      !(ThreadblockShape::kN % WarpShape::kN), "Divisibility");
-
-    /// Number of warps
-    using WarpCount = gemm::GemmShape<
-      ThreadblockShape::kM / WarpShape::kM,
-      ThreadblockShape::kN / WarpShape::kN,
-      kPartitionsK
-    >;
-
-    /// Computes number of thread-level matrix multiplies are needed to span a warp
-    static int const kGroupCount =
-      WarpShape::kM / (MmaSimtPolicy::WarpShape::kRow * MmaSimtPolicy::LaneMmaShape::kM);
-
-    /// Number of participating threads
-    static int const kThreads = WarpCount::kCount * kWarpSize;
-
-    /// Number of iterations
-    static int const kIterations = MmaSimtPolicy::LaneMmaShape::kM * kGroupCount;
-  };
-
-  //
-  // ThreadMap
-  //
-  
-  /// ThreadMap to be used by epilogue::PredicatedTileIterator satisfying concept OutputTileThreadMap
-  using Type = OutputTileOptimalThreadMap<
-    OutputTileShape<                          // Shape
-      ThreadblockShape::kN, 
-      1, 
-      MmaSimtPolicy::WarpShape::kRow, 
-      Detail::WarpCount::kM, 
-      1>,
-    OutputTileShape<                          // Count
-      1, 
-      MmaSimtPolicy::LaneMmaShape::kM, 
-      Detail::kGroupCount, 
-      1, 
-      Detail::kIterations>,
-    Detail::kThreads,
-    kElementsPerAccess,
-    sizeof_bits<Element>::value
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h
deleted file mode 100755
index e39ca9d53..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h
+++ /dev/null
@@ -1,208 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief 
-
-*/
-
-#pragma once
-
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/layout/pitch_linear.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Defines the optimal thread map for TensorOp accumulator layouts
-template <
-  typename ThreadblockShape_,
-  typename WarpShape_,
-  int PartitionsK,
-  typename Element_,
-  int ElementsPerAccess
->
-struct DefaultThreadMapTensorOp {
-
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  static int const kPartitionsK = PartitionsK;
-  using Element = Element_;
-  static int const kElementsPerAccess = ElementsPerAccess;
-
-  //
-  // Definitions
-  //
-
-  struct Detail {
-
-    /// Tensor Operations fundamentally perform operations on 8 rows
-    static int const kTensorOpRows = 8;
-    static int const kWarpSize = 32;
-
-    static_assert(
-      !(ThreadblockShape::kM % WarpShape::kM) &&
-      !(ThreadblockShape::kN % WarpShape::kN), "Divisibility");
-
-    /// Number of warps
-    using WarpCount = gemm::GemmShape<
-      ThreadblockShape::kM / WarpShape::kM,
-      ThreadblockShape::kN / WarpShape::kN,
-      kPartitionsK
-    >;
-
-    /// Number of participating threads
-    static int const kThreads = WarpCount::kCount * kWarpSize;
-  };
-
-  //
-  // ThreadMap
-  //
-  
-  /// ThreadMap to be used by epilogue::PredicatedTileIterator satisfying concept OutputTileThreadMap
-  using Type = OutputTileOptimalThreadMap <
-    OutputTileShape<ThreadblockShape::kN, Detail::kTensorOpRows, Detail::WarpCount::kM, 1, 1>,
-    OutputTileShape<1, WarpShape::kM / Detail::kTensorOpRows, 1, 1, WarpShape::kM / Detail::kTensorOpRows>,
-    Detail::kThreads,
-    kElementsPerAccess,
-    sizeof_bits<Element>::value
-  >;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Defines the optimal thread map for TensorOp accumulator layouts
-template <typename ThreadblockShape_, typename WarpShape_, int PartitionsK,
-          typename Element_, int ElementsPerAccess, int InterleavedK>
-struct DefaultInterleavedThreadMapTensorOp {
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  static int const kPartitionsK = PartitionsK;
-  using Element = Element_;
-  static int const kElementsPerAccess = ElementsPerAccess;
-  static int const kInterleavedK = InterleavedK;
-
-  //
-  // Definitions
-  //
-
-  struct Detail {
-    /// Tensor Operations fundamentally perform operations on 8 rows
-    static int const kTensorOpRows = 8;
-    static int const kWarpSize = 32;
-
-    static_assert(!(ThreadblockShape::kM % WarpShape::kM) &&
-                      !(ThreadblockShape::kN % WarpShape::kN),
-                  "Divisibility");
-
-    /// Number of warps
-    using WarpCount =
-        gemm::GemmShape<ThreadblockShape::kM / WarpShape::kM,
-                        ThreadblockShape::kN / WarpShape::kN, kPartitionsK>;
-
-    /// Number of participating threads
-    static int const kThreads = WarpCount::kCount * kWarpSize;
-  };
-
-  //
-  // ThreadMap
-  //
-
-  /// ThreadMap to be used by epilogue::PredicatedTileIterator satisfying concept
-  /// InterleavedOutputTileThreadMap
-  using Type = InterleavedOutputTileThreadMap<
-      layout::PitchLinearShape<Detail::WarpCount::kM, Detail::WarpCount::kN>,
-      layout::PitchLinearShape<WarpShape::kM / Detail::kTensorOpRows,
-                               WarpShape::kN / InterleavedK>,
-      Detail::kThreads, kElementsPerAccess, sizeof_bits<Element>::value>;
-};
-
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Defines the optimal thread map for TensorOp accumulator layouts
-template <typename ThreadblockShape_, typename WarpShape_, int PartitionsK,
-          typename Element_, int ElementsPerAccess, int InterleavedK>
-struct DefaultInterleavedConvThreadMapTensorOp {
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  static int const kPartitionsK = PartitionsK;
-  using Element = Element_;
-  static int const kElementsPerAccess = ElementsPerAccess;
-  static int const kInterleavedK = InterleavedK;
-
-  //
-  // Definitions
-  //
-
-  struct Detail {
-    /// Tensor Operations fundamentally perform operations on 8 rows
-    static int const kTensorOpRows = 8;
-    static int const kWarpSize = 32;
-
-    static_assert(!(ThreadblockShape::kM % WarpShape::kM) &&
-                      !(ThreadblockShape::kN % WarpShape::kN),
-                  "Divisibility");
-
-    /// Number of warps
-    using WarpCount =
-        gemm::GemmShape<ThreadblockShape::kM / WarpShape::kM,
-                        ThreadblockShape::kN / WarpShape::kN, kPartitionsK>;
-
-    /// Number of participating threads
-    static int const kThreads = WarpCount::kCount * kWarpSize;
-  };
-
-  //
-  // ThreadMap
-  //
-
-  /// ThreadMap to be used by epilogue::MaskedTileIterator satisfying concept
-  /// InterleavedOutputTileThreadMap
-  using Type = InterleavedConvOutputTileThreadMap<
-      MatrixShape<Detail::WarpCount::kM, Detail::WarpCount::kN>,
-      MatrixShape<WarpShape::kM / Detail::kTensorOpRows,
-                  WarpShape::kN / InterleavedK>,
-      Detail::kThreads, kElementsPerAccess, sizeof_bits<Element>::value>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_volta_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_volta_tensor_op.h
deleted file mode 100755
index 1eac4a183..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_volta_tensor_op.h
+++ /dev/null
@@ -1,228 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief 
-
-*/
-
-#pragma once
-
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
-#include "cutlass/gemm/gemm.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines the optimal thread map for TensorOp accumulator layouts
-template <
-  typename ThreadblockShape,
-  typename WarpShape,
-  int PartitionsK,
-  typename ElementOutput,
-  int ElementsPerAccess,
-  typename ElementAccumulator
->
-struct DefaultThreadMapVoltaTensorOp;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines the optimal thread map for TensorOp accumulator layouts
-template <
-  typename ThreadblockShape_,
-  typename WarpShape_,
-  int PartitionsK,
-  typename ElementOutput_,
-  int ElementsPerAccess
->
-struct DefaultThreadMapVoltaTensorOp<
-  ThreadblockShape_, 
-  WarpShape_, 
-  PartitionsK, 
-  ElementOutput_, 
-  ElementsPerAccess, 
-  half_t> {
-
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  static int const kPartitionsK = PartitionsK;
-  using ElementOutput = ElementOutput_;
-  static int const kElementsPerAccess = ElementsPerAccess;
-  using ElementAccumulator = half_t;
-
-  //
-  // Definitions
-  //
-
-  struct Detail {
-
-    static int const kTensorOpRows = 16;
-    static int const kWarpSize = 32;
-    static int const kInterleavedTilesM = WarpShape::kM / 32;
-
-    static_assert(
-      !(ThreadblockShape::kM % WarpShape::kM) &&
-      !(ThreadblockShape::kN % WarpShape::kN), "Divisibility");
-
-    /// Number of warps
-    using WarpCount = gemm::GemmShape<
-      ThreadblockShape::kM / WarpShape::kM,
-      ThreadblockShape::kN / WarpShape::kN,
-      kPartitionsK
-    >;
-
-    /// Number of participating threads
-    static int const kThreads = WarpCount::kCount * kWarpSize;
-
-    using Shape = cutlass::epilogue::threadblock::OutputTileShape<
-      ThreadblockShape::kN,   // column
-      4,                      // row
-      4,                      // group
-      WarpCount::kM,          // cluster
-      1                       // tile
-    >;
-    
-    /// Number of iterations per subspace
-    using Count = cutlass::epilogue::threadblock::OutputTileShape<
-      1,                                // column
-      2,                                // row
-      kInterleavedTilesM,               // group
-      1,                                // cluster
-      WarpShape::kM / kTensorOpRows     // iterations
-    >;
-  };
-
-  //
-  // ThreadMap
-  //
-  
-  /// ThreadMap to be used by epilogue::PredicatedTileIterator satisfying concept OutputTileThreadMap
-  using Type = OutputTileOptimalThreadMap <
-    typename Detail::Shape,
-    typename Detail::Count,
-    Detail::kThreads,
-    kElementsPerAccess,
-    sizeof_bits<ElementOutput>::value
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines the optimal thread map for TensorOp accumulator layouts
-template <
-  typename ThreadblockShape_,
-  typename WarpShape_,
-  int PartitionsK,
-  typename ElementOutput_,
-  int ElementsPerAccess
->
-struct DefaultThreadMapVoltaTensorOp<
-  ThreadblockShape_,
-  WarpShape_,
-  PartitionsK,
-  ElementOutput_,
-  ElementsPerAccess,
-  float> {
-
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  static int const kPartitionsK = PartitionsK;
-  using ElementOutput = ElementOutput_;
-  static int const kElementsPerAccess = ElementsPerAccess;
-  using ElementAccumulator = float;
-
-  //
-  // Definitions
-  //
-
-  struct Detail {
-
-    static int const kTensorOpRows = 16;
-    static int const kWarpSize = 32;
-    static int const kInterleavedTilesM = WarpShape::kM / 32;
-
-    static_assert(
-      !(ThreadblockShape::kM % WarpShape::kM) &&
-      !(ThreadblockShape::kN % WarpShape::kN), "Divisibility");
-
-    /// Number of warps
-    using WarpCount = gemm::GemmShape<
-      ThreadblockShape::kM / WarpShape::kM,
-      ThreadblockShape::kN / WarpShape::kN,
-      kPartitionsK
-    >;
-
-    /// Number of participating threads
-    static int const kThreads = WarpCount::kCount * kWarpSize;
-
-    using Shape = cutlass::epilogue::threadblock::OutputTileShape<
-      ThreadblockShape::kN,   // column
-      4,                      // row
-      4,                      // group
-      WarpCount::kM,          // cluster
-      1                       // tile
-    >;
-    
-    /// Number of iterations per subspace
-    using Count = cutlass::epilogue::threadblock::OutputTileShape<
-      1,                                // column
-      2,                                // row
-      kInterleavedTilesM,               // group
-      1,                                // cluster
-      WarpShape::kM / kTensorOpRows     // iterations
-    >;
-  };
-
-  //
-  // ThreadMap
-  //
-  
-  /// ThreadMap to be used by epilogue::PredicatedTileIterator satisfying concept OutputTileThreadMap
-  using Type = OutputTileOptimalThreadMap <
-    typename Detail::Shape,
-    typename Detail::Count,
-    Detail::kThreads,
-    kElementsPerAccess,
-    sizeof_bits<ElementOutput>::value
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h
deleted file mode 100755
index 0dccf6525..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief 
-
-*/
-
-#pragma once
-
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/layout/pitch_linear.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Defines the optimal thread map for Wmma TensorOp accumulator layouts
-template <
-  typename ThreadblockShape_,
-  typename WarpShape_,
-  typename InstructionShape_,
-  int PartitionsK,
-  typename Element_,
-  int ElementsPerAccess
->
-struct DefaultThreadMapWmmaTensorOp {
-
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  static int const kPartitionsK = PartitionsK;
-  using Element = Element_;
-  static int const kElementsPerAccess = ElementsPerAccess;
-
-  //
-  // Definitions
-  //
-
-  struct Detail {
-
-    /// Wmma Tensor Operations fundamentally perform operations on InstructionShape::kM rows
-    static int const kTensorOpRows = InstructionShape::kM;
-    static int const kWarpSize = 32;
-
-    static_assert(
-      !(ThreadblockShape::kM % WarpShape::kM) &&
-      !(ThreadblockShape::kN % WarpShape::kN), "Divisibility");
-
-    /// Number of warps
-    using WarpCount = gemm::GemmShape<
-      ThreadblockShape::kM / WarpShape::kM,
-      ThreadblockShape::kN / WarpShape::kN,
-      kPartitionsK
-    >;
-
-    /// Number of participating threads
-    static int const kThreads = WarpCount::kCount * kWarpSize;
-  };
-
-  //
-  // ThreadMap
-  //
-  
-  /// ThreadMap to be used by epilogue::PredicatedTileIterator satisfying concept OutputTileThreadMap
-  using Type = OutputTileOptimalThreadMap <
-    OutputTileShape<ThreadblockShape::kN, Detail::kTensorOpRows, Detail::WarpCount::kM, 1, 1>,
-    OutputTileShape<1, WarpShape::kM / Detail::kTensorOpRows, 1, 1, WarpShape::kM / Detail::kTensorOpRows>,
-    Detail::kThreads,
-    kElementsPerAccess,
-    sizeof_bits<Element>::value
-  >;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/direct_store_epilogue_iterator.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/direct_store_epilogue_iterator.h
deleted file mode 100755
index 11f89b658..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/direct_store_epilogue_iterator.h
+++ /dev/null
@@ -1,142 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
-
-  The epilogue rearranges the result of a matrix product through shared memory to match canonical
-  tensor layouts in global memory. Epilogues support conversion and reduction operations.
-
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/transform/pitch_linear_thread_map.h"
-#include "cutlass/epilogue/threadblock/output_tile_thread_map.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/arch/memory.h"
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator_params.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace epilogue {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <typename Element_>
-class DirectStoreEpilogueIterator {
-public:
-
-  using Element = Element_;
-
-  using Layout = layout::RowMajor;
-  using TensorRef = TensorRef<Element, Layout>;
-  using ConstTensorRef = typename TensorRef::ConstTensorRef;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  using TensorCoord = MatrixCoord;
-
-  static int const kElementsPerAccess = 1;
-
-  /// Uses a non-template class
-  struct Params : PredicatedTileIteratorParams {
-    using Base = PredicatedTileIteratorParams;
-
-    CUTLASS_HOST_DEVICE
-    Params() { }
-
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout) {
-      stride = layout.stride(0) * sizeof(Element);
-    }
-
-    CUTLASS_HOST_DEVICE
-    Params(Base const &base) : 
-      Base(base) { }
-  };
-
-public:
-
-  //
-  // Data members
-  //
-
-  Element *pointer;     // pointer to the output matrix
-
-  LongIndex stride;     // stride in elements between rows
-
-  TensorCoord extent;   // extent of output matrix
-
-  int thread_idx;       // thread index
-
-  TensorCoord threadblock_offset;
-
-public:
-
-  /// Constructor
-  CUTLASS_DEVICE
-  DirectStoreEpilogueIterator(
-    PredicatedTileIteratorParams const & params,
-    Element *pointer_,
-    TensorCoord extent_,
-    int thread_idx_,
-    TensorCoord threadblock_offset_ = TensorCoord(),
-    int const * indices = nullptr
-  ): 
-    pointer(pointer_),
-    stride(params.stride / sizeof(Element)),
-    extent(extent_),
-    thread_idx(thread_idx_),
-    threadblock_offset(threadblock_offset_)
-  {
-
-  }
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue.h
deleted file mode 100755
index 48b66a144..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue.h
+++ /dev/null
@@ -1,543 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
-
-  The epilogue rearranges the result of a matrix product through shared memory to match canonical
-  tensor layouts in global memory. Epilogues support conversion and reduction operations.
-
-  The shared memory resource is time-sliced across warps.
-*/
-
-#pragma once
-
-#if defined(__CUDACC_RTC__)
-#include <cuda/std/cassert>
-#else
-#include <assert.h>
-#endif
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/layout/vector.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/tensor_coord.h"
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/functional.h"
-
-#include "cutlass/gemm/gemm.h"
-
-#include "cutlass/transform/pitch_linear_thread_map.h"
-#include "cutlass/transform/threadblock/regular_tile_iterator.h"
-
-#include "cutlass/epilogue/threadblock/epilogue_base.h"
-#include "cutlass/epilogue/threadblock/epilogue_base_streamk.h"
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Epilogue operator
-template <
-  typename Shape_,                          ///< Shape of threadblock tile (concept: GemmShape)
-  typename WarpMmaOperator_,                ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp)
-  int PartitionsK,                          ///< Number of partitions of the K dimension
-  typename OutputTileIterator_,             ///< Tile iterator reading and writing output tensors
-  typename AccumulatorFragmentIterator_,    ///< Fragment iterator selecting accumulators
-  typename WarpTileIterator_,               ///< Warp-scoped tile iterator writing accumulators to SMEM
-  typename SharedLoadIterator_,             ///< Threadblock-scoped tile iterator loading from SMEM
-  typename OutputOp_,                       ///< Output operator
-  typename Padding_,                        ///< Padding added to SMEM allocation to avoid bank conflicts (concept: MatrixShape)
-  int FragmentsPerPartition = 1,            ///< Used to coarsten the epilogue granularity
-  int IterationsUnroll =                    ///< Used to reduce binary size when epilogue op is large
-    (!IsEpilogueFunctorHeavy<OutputOp_>::value)
->
-class Epilogue :
-  public EpilogueBase<
-    Shape_,
-    typename WarpMmaOperator_::Shape,
-    PartitionsK,
-    AccumulatorFragmentIterator_,
-    WarpTileIterator_,
-    Padding_,
-    FragmentsPerPartition>,
-  public EpilogueBaseStreamK<
-    Shape_,
-    PartitionsK,
-    WarpMmaOperator_,
-    AccumulatorFragmentIterator_>
-{
-
-public:
-
-  using Base = EpilogueBase<
-    Shape_,
-    typename WarpMmaOperator_::Shape,
-    PartitionsK,
-    AccumulatorFragmentIterator_,
-    WarpTileIterator_,
-    Padding_,
-    FragmentsPerPartition>;
-
-  using BaseStreamK = EpilogueBaseStreamK<
-    Shape_,
-    PartitionsK,
-    WarpMmaOperator_,
-    AccumulatorFragmentIterator_>;
-
-  using Shape = Shape_;
-  using WarpMmaOperator = WarpMmaOperator_;
-  static int const kPartitionsK = PartitionsK;
-  using OutputTileIterator = OutputTileIterator_;
-  using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
-  using WarpTileIterator = WarpTileIterator_;
-  using SharedLoadIterator = SharedLoadIterator_;
-  using OutputOp = OutputOp_;
-  using Padding = Padding_;
-  using Layout = layout::RowMajor;
-  using LongIndex = typename Layout::LongIndex;
-
-  /// Number of warps per block
-  using WarpCount = typename Base::WarpCount;
-
-  /// Number of threads per block
-  static int const kBlockThreads = 32 * WarpCount::kCount;
-
-  /// Per-thread accumulator tile type
-  using AccumulatorTile = typename Base::AccumulatorTile;
-
-  /// Numerical accumulation element type
-  using ElementAccumulator = typename WarpMmaOperator::ElementC;
-
-  /// Fragment type used by the accumulator tile's fragment iterator
-  using AccumulatorFragment = typename AccumulatorFragmentIterator::Fragment;
-
-  /// Output element
-  using ElementOutput = typename OutputTileIterator::Element;
-
-  /// Output access size
-  static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
-
-  /// Tensor reference to destination tensor
-  using TensorRef = typename OutputTileIterator::TensorRef;
-
-  /// Tensor reference to sync tensor
-  using SyncTensorRef = typename cutlass::TensorRef<int, cutlass::layout::PackedVectorLayout>;
-
-  /// Const tensor reference to source tensor
-  using ConstTensorRef = typename OutputTileIterator::ConstTensorRef;
-
-  /// Vector type used by the global output iterator
-  using OutputAccessType = Array<
-    typename OutputTileIterator::Element, OutputTileIterator::kElementsPerAccess>;
-
-  /// Vector type used by the shared output iterator
-  using AccumulatorAccessType = Array<typename WarpTileIterator::Element, OutputTileIterator::kElementsPerAccess>;
-
-  static int constexpr kSmemTiles = Base::kFragmentsPerIteration > 1 ? Base::kFragmentsPerIteration : kPartitionsK;
-
-  static int constexpr kSmemPointerOffset = Base::SharedStorage::StorageShape::kCount / kSmemTiles;
-
-
-public:
-
-  static_assert(SharedLoadIterator::Fragment::kElements == OutputTileIterator::Fragment::kElements,
-    "Mismatch between shared load iterator and output tile iterator.");
-
-  static_assert(OutputTileIterator::kElementsPerAccess, "OutputTileIterator::kElementsPerAccess must not be zero.");
-
-  static_assert(!(OutputTileIterator::Fragment::kElements % OutputTileIterator::kElementsPerAccess), 
-    "Divisibility");
-
-  static_assert(kPartitionsK == 1 || Base::kFragmentsPerIteration == 1, "One of these must be exactly 1.");
-
-
-public:
-
-  /// Aspect for when epilogue source is not needed
-  struct SourceAspectNotNeeded
-  {
-    /// Constructor
-    CUTLASS_DEVICE
-    SourceAspectNotNeeded()
-    {}
-
-    // No-op
-    CUTLASS_DEVICE
-    void load() { }
-
-    /// Invoke the output functor over each vector of output
-    CUTLASS_DEVICE
-    void apply_output_operator(
-      typename OutputTileIterator::Fragment &output_fragment,
-      OutputOp const &output_op,
-      typename SharedLoadIterator::Fragment const &aligned_accum_fragment)
-    {
-      OutputAccessType *output_frag_ptr =
-        reinterpret_cast<OutputAccessType *>(&output_fragment);
-
-      AccumulatorAccessType const *compute_frag_ptr =
-        reinterpret_cast<AccumulatorAccessType const *>(&aligned_accum_fragment);
-
-      int const kOutputOpIterations =
-        OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kOutputOpIterations; ++i)
-      {
-        // Call the output operator
-        output_frag_ptr[i] = output_op(compute_frag_ptr[i]);
-      }
-    }
-  };
-
-
-  /// Aspect for when epilogue source is needed
-  struct SourceAspectNeeded
-  {
-    OutputTileIterator source_iterator;
-
-    typename OutputTileIterator::Fragment source_fragment;
-
-    /// Invoke the output functor over each vector of output
-    CUTLASS_DEVICE
-    static void apply_output_operator(
-      typename OutputTileIterator::Fragment &output_fragment,
-      OutputOp const &output_op,
-      typename SharedLoadIterator::Fragment const &aligned_accum_fragment,
-      typename OutputTileIterator::Fragment const &source_fragment)
-    {
-      OutputAccessType *output_frag_ptr =
-        reinterpret_cast<OutputAccessType *>(&output_fragment);
-
-      AccumulatorAccessType const *compute_frag_ptr =
-        reinterpret_cast<AccumulatorAccessType const *>(&aligned_accum_fragment);
-
-      OutputAccessType const *source_frag_ptr =
-        reinterpret_cast<OutputAccessType const *>(&source_fragment);
-
-      int const kOutputOpIterations =
-        OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kOutputOpIterations; ++i)
-      {
-        // Call the output operator
-        output_frag_ptr[i] = output_op(compute_frag_ptr[i], source_frag_ptr[i]);
-      }
-    }
-
-    /// Constructor
-    CUTLASS_DEVICE
-    SourceAspectNeeded(OutputTileIterator source_iterator) :
-      source_iterator(source_iterator)
-    {
-      source_fragment.clear();
-    }
-
-    // Load addend source fragment from global memory
-    CUTLASS_DEVICE
-    void load() {
-      source_iterator.load(source_fragment);
-      ++source_iterator;
-    }
-
-    /// Invoke the output functor over each vector of output
-    CUTLASS_DEVICE
-    void apply_output_operator(
-      typename OutputTileIterator::Fragment &output_fragment,
-      OutputOp const &output_op,
-      typename SharedLoadIterator::Fragment const &aligned_accum_fragment)
-    {
-      apply_output_operator(output_fragment, output_op, aligned_accum_fragment, source_fragment);
-    }
-  };
-
-
-private:
-
-  /// Loads fragment from shared memory aligned with output tensor
-  SharedLoadIterator shared_load_iterator_;
-
-  /// Thread index in the threadblock
-  int thread_idx;
-
-  /// Warp index in the threadblock
-  int warp_idx;
-
-public:
-
-  /// Constructor
-  CUTLASS_DEVICE
-  Epilogue(
-      typename Base::SharedStorage &shared_storage,   ///< Shared storage object
-      int thread_idx,                                 ///< ID of a thread within the threadblock
-      int warp_idx,                                   ///< ID of warp within threadblock
-      int lane_idx)                                   ///< Id of thread within warp
-  :
-      Base(shared_storage, thread_idx, warp_idx, lane_idx),
-      BaseStreamK(thread_idx),
-      shared_load_iterator_(shared_storage.reference(), thread_idx),
-      thread_idx(thread_idx),
-      warp_idx(warp_idx)
-  {}
-
-
-  /// Aggregates the accumulator sets shared by peer blocks in the global workspace,
-  /// performing epilogue computations, writing to output
-  CUTLASS_DEVICE
-  void reduce(
-      int peer_idx_begin,
-      int peer_idx_end,
-      int reduce_fragment_idx,
-      void *element_workspace,
-      OutputOp const &output_op,                      ///< Output operator
-      OutputTileIterator destination_iterator,        ///< Tile iterator for destination
-      OutputTileIterator source_iterator)             ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles)
-  {
-    // Reduce peer accumulator fragments into one fragment
-    AccumulatorFragment accum_fragment;
-    BaseStreamK::reduce(accum_fragment, peer_idx_begin, peer_idx_end, reduce_fragment_idx, element_workspace);
-
-    // Store fragment to shared memory
-    this->warp_tile_iterator_.store(accum_fragment);
-
-    __syncthreads();
-
-    // Initialize/load source-fragment data
-    typename OutputTileIterator::Fragment source_fragment;
-    source_fragment.clear();
-
-    if (output_op.is_source_needed())
-    {
-      source_iterator += reduce_fragment_idx;
-      source_iterator.load(source_fragment);
-    }
-
-    // Load fragment from shared memory
-    typename SharedLoadIterator::Fragment aligned_accum_fragment;
-    shared_load_iterator_.load(aligned_accum_fragment);
-
-    // Add fragments shared by other k partitions
-    if (kPartitionsK > 1)
-    {
-      plus <typename SharedLoadIterator::Fragment> add_fragments;
-
-      CUTLASS_PRAGMA_UNROLL
-      for ( int i = 1; i < kPartitionsK; ++i) {
-        typename SharedLoadIterator::Fragment aligned_addend_fragment;
-        shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
-        shared_load_iterator_.load(aligned_addend_fragment);
-        aligned_accum_fragment = add_fragments(aligned_accum_fragment, aligned_addend_fragment);
-      }
-    }
-
-    // Compute the output result
-    typename OutputTileIterator::Fragment output_fragment;
-
-    // Apply the output operator
-    SourceAspectNeeded::apply_output_operator(
-        output_fragment,
-        output_op,
-        aligned_accum_fragment,
-        source_fragment);
-
-    // Store the final result
-    destination_iterator += reduce_fragment_idx;
-    destination_iterator.store(output_fragment);
-  }
-
-
-  /// Perform the epilogue computations and stream the result to global memory.
-  CUTLASS_DEVICE
-  void operator()(
-    OutputOp const &output_op,                      ///< Output operator
-    OutputTileIterator destination_iterator,        ///< Tile iterator for destination
-    AccumulatorTile const &accumulators)            ///< Complete warp-level accumulator tile
-  {
-    operator()(output_op, destination_iterator, accumulators, SourceAspectNotNeeded());
-  }
-
-
-  /// Perform the epilogue computations and stream the result to global memory.  Implements
-  /// two alternative codepaths, depending on whether the output op requires addend data to be loaded.
-  CUTLASS_DEVICE
-  void operator()(
-    OutputOp const &output_op,                      ///< Output operator
-    OutputTileIterator destination_iterator,        ///< Tile iterator for destination
-    AccumulatorTile const &accumulators,            ///< Complete warp-level accumulator tile
-    OutputTileIterator source_iterator )            ///< Tile iterator for addend source
-  {
-    if (output_op.is_source_needed())
-    {
-      operator()(output_op, destination_iterator, accumulators, SourceAspectNeeded(source_iterator));
-    }
-    else
-    {
-      operator()(output_op, destination_iterator, accumulators, SourceAspectNotNeeded());
-    }
-  }
-
-
-  /// Perform the epilogue computations and stream the result to global memory.  Implements a
-  /// single codepath, regardless of whether the output op requires addend data to be loaded
-  CUTLASS_DEVICE
-  void unified(
-    OutputOp const &output_op,                      ///< Output operator
-    OutputTileIterator destination_iterator,        ///< Tile iterator for destination
-    AccumulatorTile const &accumulators,            ///< Complete warp-level accumulator tile
-    OutputTileIterator source_iterator )            ///< Tile iterator for addend source
-  {
-    if (!output_op.is_source_needed())
-    {
-      source_iterator.clear_mask();
-      __syncthreads();  // Dummy (CUDA 11.0)
-    }
-
-    operator()(output_op, destination_iterator, accumulators, SourceAspectNeeded(source_iterator));
-  }
-
-  template<class Seq>
-  struct acc2smem;
-
-  template <size_t... Seq>
-  struct acc2smem<cutlass::index_sequence<Seq...>> {
-    template<int Advance>
-    CUTLASS_DEVICE
-    static void helper(AccumulatorFragmentIterator accum_fragment_iterator,
-                      WarpTileIterator &warp_tile_iterator) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < Advance; i++) {
-        ++accum_fragment_iterator;
-      }
-
-      typename AccumulatorFragmentIterator::Fragment accum_fragment;
-
-      accum_fragment_iterator.load(accum_fragment);
-      ++accum_fragment_iterator;
-      warp_tile_iterator.store(accum_fragment);
-    }
-
-    CUTLASS_DEVICE
-    static void push(size_t pos,
-                    AccumulatorFragmentIterator const &iterator_begin,
-                    WarpTileIterator &warp_tile_iterator) {
-      int dummy[] = {(pos == Seq) && (helper<Seq>(iterator_begin, warp_tile_iterator), 0)...};
-    }
-  };
-
-
-  /// Streams the result to global memory
-  template <typename SourceAspect>
-  CUTLASS_DEVICE
-  void operator()(
-    OutputOp const &output_op,                      ///< Output operator
-    OutputTileIterator destination_iterator,        ///< Tile iterator for destination
-    AccumulatorTile const &accumulators,            ///< Complete warp-level accumulator tile
-    SourceAspect source)
-  {
-    // Iterator over warp-level accumulator fragment
-    AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
-
-    //
-    // Iterate over accumulator tile
-    //
-
-    #pragma unroll(IterationsUnroll ? OutputTileIterator::kIterations : 1)
-    for (int iter = 0; iter < OutputTileIterator::kIterations; ++iter)
-    {
-      //
-      // Load the source
-      //
-
-        source.load();
-      //
-      // Convert and store fragment
-      //
-
-      __syncthreads();
-
-      acc2smem<cutlass::make_index_sequence<OutputTileIterator::kIterations>>::push(
-        iter, accum_fragment_iterator, this->warp_tile_iterator_);
-
-      __syncthreads();
-
-      //
-      // Load fragments from shared memory
-      //
-
-      typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];
-      shared_load_iterator_.load(aligned_accum_fragment[0]);
-
-      if (kPartitionsK > 1) {
-        plus <typename SharedLoadIterator::Fragment> add_fragments;
-
-        CUTLASS_PRAGMA_UNROLL
-        for ( int i = 1; i < kPartitionsK; ++i) {
-          shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
-          shared_load_iterator_.load(aligned_accum_fragment[i]);
-          aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
-        }
-
-        shared_load_iterator_.add_pointer_offset((1 - kPartitionsK) * kSmemPointerOffset);
-      }
-
-      //
-      // Compute the output result
-      //
-
-      typename OutputTileIterator::Fragment output_fragment;
-      source.apply_output_operator(output_fragment, output_op, aligned_accum_fragment[0]);
-
-      //
-      // Store the final result
-      //
-
-      destination_iterator.store(output_fragment);
-      ++destination_iterator;
-    }
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_base.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_base.h
deleted file mode 100755
index 6853f5f04..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_base.h
+++ /dev/null
@@ -1,240 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
-
-  The epilogue rearranges the result of a matrix product through shared memory to match canonical
-  tensor layouts in global memory. Epilogues support conversion and reduction operations.
-
-*/
-
-#pragma once
-
-#if !defined(__CUDACC_RTC__)
-#include <type_traits>
-#include <utility>
-#endif
-
-#if defined(__CUDACC_RTC__)
-#include <cuda/std/cassert>
-#else
-#include <assert.h>
-#endif
-
-#include "cutlass/cutlass.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/layout/vector.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/tensor_coord.h"
-#include "cutlass/aligned_buffer.h"
-
-#include "cutlass/gemm/gemm.h"
-
-#include "cutlass/transform/pitch_linear_thread_map.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-//
-// This is used for metaprogramming epilogue functors. If they define 
-// `static bool const kIsHeavy = true;`, then the epilogue functor itself is
-// not inlined. This results in smaller code and is advantageous if the epilogue
-// functor consists of many instructions.
-//
-// If the epilogue functor does not define `kIsHeavy` or if it is `false`, then
-// the behavior from CUTLASS 2.5 and before is retained. The epilogue is fully
-// unrolled and inlined.
-//
-
-template<class> 
-struct TypeSink {  typedef void type; };
-
-template<class T> using TypeSinkT = typename TypeSink<T>::type;
-
-template<class T, class=void> struct IsEpilogueFunctorHeavy {
-  static bool const value = false;
-};
-
-template<class T> struct IsEpilogueFunctorHeavy<T, TypeSinkT< decltype( T::kIsHeavy ) > > {
-  static bool const value = T::kIsHeavy;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Base class for epilogues defining warp-level 
-template <
-  typename Shape_,                          ///< Shape of threadblock tile (concept: GemmShape)
-  typename WarpShape_,                      ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp)
-  int PartitionsK,                          ///< Number of partitions of the K dimension
-  typename AccumulatorFragmentIterator_,    ///< Fragment iterator selecting accumulators
-  typename WarpTileIterator_,               ///< Warp-scoped tile iterator writing accumulators to SMEM
-  typename Padding_,                        ///< Padding added to SMEM allocation to avoid bank conflicts (concept: MatrixShape)
-  int FragmentsPerIteration = 1
->
-class EpilogueBase {
-public:
-
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  static int const kPartitionsK = PartitionsK;
-  using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
-  using WarpTileIterator = WarpTileIterator_;
-  using Padding = Padding_;
-
-  /// Output layout is always row-major
-  using Layout = layout::RowMajor;
-
-  /// The complete warp-level accumulator tile
-  using AccumulatorTile = typename AccumulatorFragmentIterator::AccumulatorTile;
-
-  /// Accumulator element
-  using ElementAccumulator = typename AccumulatorTile::Element;
-
-  /// Number of warps
-  using WarpCount = gemm::GemmShape<
-    Shape::kM / WarpShape::kM,
-    Shape::kN / WarpShape::kN,
-    kPartitionsK
-  >;
-
-  /// Use this to control the granularity of one epilogue 'iteration'
-  static int const kFragmentsPerIteration = FragmentsPerIteration;
-
-public:
-
-  /// Shared storage allocation needed by the epilogue
-  struct SharedStorage {
-    
-    //
-    // Type definitions
-    //
-
-    /// Element type of shared memory
-    using Element = typename WarpTileIterator::Element;
-
-    /// Tensor reference to shared memory allocation
-    using TensorRef = typename WarpTileIterator::TensorRef;
-
-    /// Layout of shared memory allocation
-    using Layout = typename WarpTileIterator::Layout;
-    
-    /// Logical shape of the shared memory tile written to by all warps.
-    using Shape = MatrixShape<
-      WarpCount::kM * WarpTileIterator::Shape::kRow * WarpCount::kK,
-      WarpCount::kN * WarpTileIterator::Shape::kColumn
-    >;
-
-    /// Shape of the shared memory allocation for the epilogue    
-    using StorageShape = MatrixShape<
-      (Shape::kRow + Padding::kRow) * kFragmentsPerIteration, 
-      Shape::kColumn + Padding::kColumn
-    >;
-
-    //
-    // Data members
-    //
-
-    AlignedBuffer<Element, StorageShape::kCount> storage;
-
-    //
-    // Methods
-    //
-
-    /// Returns a pointer to the shared memory buffer
-    CUTLASS_DEVICE
-    Element *data() {
-      return storage.data();
-    }
-
-    /// Returns a tensor reference to the shared memory buffer
-    CUTLASS_DEVICE
-    TensorRef reference() {
-      return TensorRef(
-        storage.data(), 
-        Layout::packed({StorageShape::kRow, StorageShape::kColumn}));
-    }
-  };
-
-protected:
-
-  //
-  // Data members
-  //
-
-  SharedStorage &shared_storage_;
-
-  /// Stores a warp's fragment of accumulators to SMEM
-  WarpTileIterator warp_tile_iterator_;
-
-public:
-
-  /// Constructor
-  CUTLASS_DEVICE
-  EpilogueBase(
-    SharedStorage &shared_storage,    ///< Shared storage object    
-    int thread_idx,                   ///< ID of a thread within the threadblock
-    int warp_idx,                     ///< ID of warp within threadblock
-    int lane_idx                      ///< Id of thread within warp
-  ):
-    shared_storage_(shared_storage),
-    warp_tile_iterator_(shared_storage.reference(), lane_idx) {
-
-    // Compute warp location within threadblock tile by mapping the warp_id to three coordinates:
-    //
-    //   _m: the warp's position within the threadblock along the M dimension
-    //   _n: the warp's position within the threadblock along the N dimension
-    //   _k: the warp's position within the threadblock along the K dimension
-
-    int warp_k = warp_idx / (WarpCount::kM * WarpCount::kN);
-    int warp_mn = warp_idx % (WarpCount::kM * WarpCount::kN);
-    int warp_m = warp_mn % WarpCount::kM;
-    int warp_n = warp_mn / WarpCount::kM;
-
-    MatrixCoord warp_offset{warp_k * WarpCount::kM + warp_m, warp_n};
-
-    warp_tile_iterator_.add_tile_offset(warp_offset);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_base_streamk.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_base_streamk.h
deleted file mode 100755
index 294e9a514..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_base_streamk.h
+++ /dev/null
@@ -1,197 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Basic subset of epilogue functionality for supporting StreamK decompositions
-*/
-
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/functional.h"
-#include "cutlass/block_striped.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-
-/// StreamK epilogue functionality for cross-block accumulator fragment reduction
-template <
-  typename Shape,                          ///< Shape of threadblock tile (concept: GemmShape)
-  int PartitionsK,
-  typename WarpMmaOperator,                ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp)
-  typename AccumulatorFragmentIterator>    ///< Iterator for enumerating fragments within the per-thread tile of raw accumulators
-class EpilogueBaseStreamK
-{
-
-protected:
-
-  /// The per-thread tile of raw accumulators
-  using AccumulatorTile = typename AccumulatorFragmentIterator::AccumulatorTile;
-
-  /// Number of warps
-  using WarpCount = gemm::GemmShape<
-                        Shape::kM / WarpMmaOperator::Shape::kM,
-                        Shape::kN / WarpMmaOperator::Shape::kN,
-                        PartitionsK>;
-
-  /// Number of threads per block
-  static int const kBlockThreads = 32 * WarpCount::kCount;
-
-  /// Numerical accumulation element type
-  using ElementAccumulator = typename WarpMmaOperator::ElementC;
-
-  /// Fragment type used by the accumulator tile's fragment iterator
-  using AccumulatorFragment = typename AccumulatorFragmentIterator::Fragment;
-
-public:
-
-  /// Number of AccumulatorTile fragments per thread
-  static int const kAccumulatorFragments = AccumulatorFragmentIterator::Policy::kIterations;
-
-protected:
-
-  /// Number of AccumulatorTile fragments per block output tile
-  static int const kOutputTileFragments = kBlockThreads * kAccumulatorFragments;
-
-  /// Block-striped transfer utility for sharing AccumulatorFragment
-  using BlockStripedT = BlockStriped<kBlockThreads, AccumulatorFragment>;
-
-  /// AccumulatorFragment stride in the shared workspace between different peer blocks (each thread block can share accumulators for up to two block output tiles)
-  static const int kPeerFragmentStride = kOutputTileFragments * 2;
-
-public:
-
-  /// Workspace bytes per thread block
-  static size_t const kWorkspaceBytesPerBlock =sizeof(AccumulatorFragment) * kPeerFragmentStride;
-
-public:
-
-  /// Thread index in the threadblock
-  int thread_idx;
-
-public:
-
-  /// Constructor
-  CUTLASS_DEVICE
-  EpilogueBaseStreamK(
-      int thread_idx)                                       ///< ID of a thread within the threadblock
-  :
-      thread_idx(thread_idx)
-  {}
-
-
-  /// Aggregates the accumulator sets shared by peer blocks in the global workspace
-  CUTLASS_DEVICE
-  void reduce(
-      AccumulatorFragment &accum_fragment,                  ///< [out] sum of all shared accumulator fragments for these peer partials
-      int peer_idx_begin,
-      int peer_idx_end,
-      int reduce_fragment_idx,
-      void *workspace_ptr)
-  {
-    plus<AccumulatorFragment> add_fragments;
-
-    AccumulatorFragment *fragment_workspace = reinterpret_cast<AccumulatorFragment *>(workspace_ptr);
-
-    int fragment_offset = (peer_idx_begin * kPeerFragmentStride) + (reduce_fragment_idx * kBlockThreads);
-
-    // Load first peer fragment
-    BlockStripedT::load(accum_fragment, fragment_workspace + fragment_offset, this->thread_idx);
-
-    fragment_offset += kPeerFragmentStride;         // Move to next peer
-    fragment_offset += kOutputTileFragments;        // Move to the set of fragments for this peer's "non-started" output tile
-
-    // Reduce fragments from additional peers
-    #pragma unroll 2
-    for (; fragment_offset < peer_idx_end * kPeerFragmentStride; fragment_offset += kPeerFragmentStride)
-    {
-      // Load peer fragment
-      AccumulatorFragment addend_fragment;
-      BlockStripedT::load(addend_fragment, fragment_workspace + fragment_offset, this->thread_idx);
-
-      // Add peer fragment
-      accum_fragment = add_fragments(accum_fragment, addend_fragment);
-    }
-  }
-
-
-  /// Shares the accumulator set with peers in the global workspace
-  CUTLASS_DEVICE
-  void share(
-      int peer_idx,
-      void *workspace_ptr,
-      AccumulatorTile const &accumulators,
-      bool started_tile)                      ///< Whether this thread block computed the first work volume for the current output tile
-  {
-    AccumulatorFragment *fragment_workspace = reinterpret_cast<AccumulatorFragment *>(workspace_ptr);
-
-    int fragment_offset = peer_idx * kPeerFragmentStride;
-
-    if (!started_tile) {
-      // Move to the set of fragments for the "non-started" output tile
-      fragment_offset += kOutputTileFragments;
-    }
-
-    AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
-
-    // Convert raw accumulator tile to fragments and store
-    CUTLASS_PRAGMA_UNROLL
-    for (int iter = 0; iter < kAccumulatorFragments; ++iter)
-    {
-      // Acquire reordered accumulator fragment
-      AccumulatorFragment accum_fragment;
-      accum_fragment_iterator.load(accum_fragment);
-      ++accum_fragment_iterator;
-
-      // Store accumulator fragment
-      BlockStripedT::store(fragment_workspace + fragment_offset, accum_fragment, this->thread_idx);
-
-      fragment_offset += kBlockThreads;
-    }
-  }
-
-};
-
-
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_depthwise.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_depthwise.h
deleted file mode 100755
index 83cbc8ab3..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_depthwise.h
+++ /dev/null
@@ -1,335 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Epilogue for Depthwise convoltuion
-
-  The epilogue rearranges the result of a matrix product through shared memory to match canonical
-  tensor layouts in global memory. Epilogues support conversion and reduction operations.
-
-*/
-
-#pragma once
-
-#include "cutlass/array.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/epilogue/thread/conversion_op.h"
-#include "cutlass/epilogue/thread/linear_combination.h"
-#include "cutlass/epilogue/thread/reduction_op.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/numeric_types.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Epilogue operator
-template <typename Shape_,                   ///< Shape of threadblock tile (concept: GemmShape)
-          typename ThreadOutputShape_,       /// Size of the matrix to load (concept: TensorNHWC)
-          typename ThreadBlockOutputShape_,  /// Size of the matrix to load (concept: TensorNHWC)
-          typename WarpMmaOperator_,         ///< Warp-level MMA operator (concept:
-                                             ///< gemm::warp::MmaTensorOp)
-          typename OutputTileIterator_,      ///< Tile iterator reading and writing output tensors
-          typename AccumulatorFragmentIterator_,  ///< Fragment iterator selecting accumulators
-          typename WarpTileIterator_,    ///< Warp-scoped tile iterator writing accumulators to SMEM
-          typename SharedLoadIterator_,  ///< Threadblock-scoped tile iterator loading from SMEM
-          typename OutputOp_,            ///< Output operator
-          typename Padding_  ///< Padding added to SMEM allocation to avoid bank conflicts (concept:
-                             ///< MatrixShape)
-          >
-class EpilogueDepthwise {
- public:
-  using Shape = Shape_;
-  using WarpShape = typename WarpMmaOperator_::Shape;
-  using ThreadOutputShape = ThreadOutputShape_;
-  using ThreadBlockOutputShape = ThreadBlockOutputShape_;
-  using WarpMmaOperator = WarpMmaOperator_;
-  using OutputTileIterator = OutputTileIterator_;
-  using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
-  using WarpTileIterator = WarpTileIterator_;
-  using SharedLoadIterator = SharedLoadIterator_;
-  using OutputOp = OutputOp_;
-  using Padding = Padding_;
-
-  using Layout = layout::RowMajor;
-  using LongIndex = typename Layout::LongIndex;
-
-  /// The complete warp-level accumulator tile
-  using AccumulatorTile = typename AccumulatorFragmentIterator::AccumulatorTile;
-
-  /// Accumulator element
-  using ElementAccumulator = typename WarpTileIterator::Element;
-
-  /// Output element
-  using ElementOutput = typename OutputTileIterator::Element;
-
-  /// Output access size
-  static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
-
-  /// Tensor reference to destination tensor
-  using TensorRef = typename OutputTileIterator::TensorRef;
-
-  /// Tensor reference to sync tensor
-  using SyncTensorRef = typename cutlass::TensorRef<int, cutlass::layout::PackedVectorLayout>;
-
-  /// Const tensor reference to source tensor
-  using ConstTensorRef = typename OutputTileIterator::ConstTensorRef;
-
-  /// Array type used to output
-  using OutputAccessType =
-      Array<typename OutputTileIterator::Element, OutputTileIterator::kElementsPerAccess>;
-
-  /// Array type used by output functor
-  using AccumulatorAccessType =
-      Array<typename WarpTileIterator::Element, OutputTileIterator::kElementsPerAccess>;
-
-  /// Number of warps
-  using WarpCount =
-      gemm::GemmShape<Shape::kM / WarpShape::kM, Shape::kN / WarpShape::kN>;
-
- public:
-  static_assert(SharedLoadIterator::Fragment::kElements ==
-  OutputTileIterator::Fragment::kElements,
-    "Mismatch between shared load iterator and output tile iterator.");
-
-  static_assert(OutputTileIterator::kElementsPerAccess,
-                "OutputTileIterator::kElementsPerAccess must not be zero.");
-
-  static_assert(!(OutputTileIterator::Fragment::kElements % OutputTileIterator::kElementsPerAccess),
-                "Divisibility");
-
-  /// Shared storage allocation needed by the epilogue
-  struct SharedStorage {
-    //
-    // Type definitions
-    //
-
-    /// Element type of shared memory
-    using Element = typename WarpTileIterator::Element;
-
-    /// Tensor reference to shared memory allocation
-    using TensorRef = typename WarpTileIterator::TensorRef;
-
-    /// Layout of shared memory allocation
-    using Layout = typename WarpTileIterator::Layout;
-
-    /// Logical shape of the shared memory tile written to by all warps.
-    using Shape = MatrixShape<ThreadBlockOutputShape::kNHW, ThreadBlockOutputShape::kC>;
-
-    /// Shape of the shared memory allocation for the epilogue
-    using StorageShape = MatrixShape<Shape::kRow, Shape::kColumn>;
-
-    //
-    // Data members
-    //
-
-    AlignedBuffer<Element, StorageShape::kCount> storage;
-
-    //
-    // Methods
-    //
-
-    /// Returns a pointer to the shared memory buffer
-    CUTLASS_DEVICE
-    Element *data() { return storage.data(); }
-
-    /// Returns a tensor reference to the shared memory buffer
-    CUTLASS_DEVICE
-    TensorRef reference() {
-      return TensorRef(storage.data(), Layout::packed({StorageShape::kRow, StorageShape::kColumn}));
-    }
-  };
-
- private:
-  /// Loads fragment from shared memory aligned with output tensor
-  SharedLoadIterator shared_load_iterator_;
-
-  /// Stores a warp's fragment of accumulators to SMEM
-  WarpTileIterator warp_tile_iterator_;
-
-  LongIndex warp_offset;
-  int thread_idx;
-  int warp_idx;
-  int lane_idx;
-  int warp_m, warp_n;  // warp coordinates within a cta
-  int tid_m, tid_n;    // thread coordinates within a warp
-
- public:
-  /// Constructor
-  CUTLASS_DEVICE
-  EpilogueDepthwise(SharedStorage &shared_storage,  ///< Shared storage object
-                    int thread_idx_,                ///< ID of a thread within the threadblock
-                    int warp_idx_,                  ///< ID of warp within threadblock
-                    int lane_idx_                   ///< Id of thread within warp
-                    )
-      : thread_idx(thread_idx_),
-        warp_idx(warp_idx_),
-        lane_idx(lane_idx_),
-        shared_load_iterator_(shared_storage.reference(), thread_idx_),
-        warp_tile_iterator_(shared_storage.reference(), thread_idx_, lane_idx_) {}
-
-  /// Streams the result to global memory
-  CUTLASS_DEVICE
-  void operator()(OutputOp const &output_op,                ///< Output operator
-                  OutputTileIterator destination_iterator,  ///< Tile iterator for destination
-                  AccumulatorTile const &accumulators,  ///< Complete warp-level accumulator tile
-                  OutputTileIterator source_iterator,   ///< Threadblock tile coordinate in GEMM (in
-                                                        ///< units of threadblock tiles)
-                  const int smem_base_offset) {         ///< SMEM base offset for epilogue operation
-    // initiate the smem base offset for different output tile.
-    warp_tile_iterator_.set_smem_base_address(smem_base_offset);
-
-    shared_load_iterator_.set_smem_base_address(smem_base_offset);
-
-    if (!output_op.is_source_needed()) {
-      compute_source_not_needed_(output_op, destination_iterator, accumulators);
-    } else {
-      compute_source_needed_(output_op, destination_iterator, accumulators, source_iterator);
-    }
-  }
-
- private:
-  /// Streams the result to global memory
-  CUTLASS_DEVICE
-  void compute_source_needed_(
-      OutputOp const &output_op,                ///< Output operator
-      OutputTileIterator destination_iterator,  ///< Tile iterator for destination
-      AccumulatorTile const &accumulators,      ///< Complete warp-level accumulator tile
-      OutputTileIterator source_iterator) {     ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles)
-
-    typename OutputTileIterator::Fragment source_fragment;
-
-    source_fragment.clear();
-
-    source_iterator.load(source_fragment);
-
-    // store to smem
-    warp_tile_iterator_.store(accumulators);
-
-    __syncthreads();
-
-    typename SharedLoadIterator::Fragment aligned_accum_fragment;
-
-    // load from smem
-    shared_load_iterator_.load(aligned_accum_fragment);
-
-    typename OutputTileIterator::Fragment output_fragment;
-
-    apply_output_operator_(output_fragment, output_op, aligned_accum_fragment, source_fragment);
-
-    // Store to GMEM
-    destination_iterator.store(output_fragment);
-  }
-
-  /// Streams the result to global memory
-  CUTLASS_DEVICE
-  void compute_source_not_needed_(
-      OutputOp const &output_op,                ///< Output operator
-      OutputTileIterator destination_iterator,  ///< Tile iterator for destination
-      AccumulatorTile const &accumulators) {    ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles)
-
-    // store to smem
-    warp_tile_iterator_.store(accumulators);
-
-    __syncthreads();
-
-    typename SharedLoadIterator::Fragment aligned_accum_fragment;
-
-    // load from smem
-    shared_load_iterator_.load(aligned_accum_fragment);
-
-    typename OutputTileIterator::Fragment output_fragment;
-
-    apply_output_operator_source_not_needed_(output_fragment, output_op, aligned_accum_fragment);
-
-    // Store to GMEM
-    destination_iterator.store(output_fragment);
-  }
-
-  /// Helper to invoke the output functor over each vector of output
-  CUTLASS_DEVICE
-  void apply_output_operator_(
-    typename OutputTileIterator::Fragment &output_fragment,
-    OutputOp const &output_op,                    ///< Output operator
-    typename SharedLoadIterator::Fragment const &aligned_accum_fragment,
-    typename OutputTileIterator::Fragment const &source_fragment) {
-      
-    OutputAccessType *output_frag_ptr = 
-      reinterpret_cast<OutputAccessType *>(&output_fragment);
-
-    AccumulatorAccessType const *compute_frag_ptr = 
-      reinterpret_cast<AccumulatorAccessType const *>(&aligned_accum_fragment);
-
-    OutputAccessType const *source_frag_ptr = 
-      reinterpret_cast<OutputAccessType const *>(&source_fragment);
-
-    int const kOutputOpIterations = 
-      OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kOutputOpIterations; ++i) {
-      // Call the output operator
-      output_frag_ptr[i] = output_op(compute_frag_ptr[i], source_frag_ptr[i]);
-    }
-  }
-
-  /// Helper to invoke the output functor over each vector of output
-  CUTLASS_DEVICE
-  void apply_output_operator_source_not_needed_(
-      typename OutputTileIterator::Fragment &output_fragment,
-      OutputOp const &output_op,  ///< Output operator
-      typename SharedLoadIterator::Fragment const &aligned_accum_fragment) {
-    OutputAccessType *output_frag_ptr = reinterpret_cast<OutputAccessType *>(&output_fragment);
-
-    AccumulatorAccessType const *compute_frag_ptr =
-        reinterpret_cast<AccumulatorAccessType const *>(&aligned_accum_fragment);
-
-    int const kOutputOpIterations =
-        OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kOutputOpIterations; ++i) {
-      // Call the output operator
-      output_frag_ptr[i] = output_op(compute_frag_ptr[i]);
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace epilogue
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_direct_store.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_direct_store.h
deleted file mode 100755
index 02de00dd6..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_direct_store.h
+++ /dev/null
@@ -1,347 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Epilogue for threadblock scoped GEMMs and convolution using Tensor Ops.
-
-  The epilogue rearranges the result of a matrix product through shared memory to match canonical
-  tensor layouts in global memory. Epilogues support conversion and reduction operations.
-
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-
-#include "cutlass/gemm/gemm.h"
-
-#include "cutlass/epilogue/thread/linear_combination.h"
-#include "cutlass/epilogue/thread/conversion_op.h"
-#include "cutlass/epilogue/thread/reduction_op.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Epilogue operator
-template <
-  typename Shape_,                          ///< Shape of threadblock tile (concept: GemmShape)
-  typename WarpMmaOperator_,                ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp)
-  int PartitionsK,                          ///< Number of partitions of the K dimension
-  typename OutputTileIterator_,             ///< Tile iterator reading and writing output tensors
-  typename AccumulatorFragmentIterator_,    ///< Fragment iterator selecting accumulators
-  typename WarpTileIterator_,               ///< Warp-scoped tile iterator writing accumulators to SMEM
-  typename SharedLoadIterator_,             ///< Threadblock-scoped tile iterator loading from SMEM
-  typename OutputOp_                        ///< Output operator
->
-class EpilogueDirectStore {
-public:
-
-  using Shape = Shape_;
-  using WarpMmaOperator = WarpMmaOperator_;
-  using WarpShape = typename WarpMmaOperator_::Shape;
-  static int const kPartitionsK = PartitionsK;
-  using OutputTileIterator = OutputTileIterator_;
-  using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
-  using WarpTileIterator = WarpTileIterator_;
-  using OutputOp = OutputOp_;
-  using Padding = MatrixShape<0, 0>;
-
-  using Layout = layout::RowMajor;
-  using LongIndex = typename Layout::LongIndex;
-
-  /// The complete warp-level accumulator tile
-  using AccumulatorTile = typename AccumulatorFragmentIterator::AccumulatorTile;
-
-  /// Accumulator element
-  using ElementAccumulator = typename WarpTileIterator::Element;
-
-  /// Output element
-  using ElementOutput = typename OutputTileIterator::Element;
-
-  /// Output access size
-  static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
-
-  /// Tensor reference to destination tensor
-  using TensorRef = typename OutputTileIterator::TensorRef;
-
-  /// Tensor reference to sync tensor
-  using SyncTensorRef = typename cutlass::TensorRef<int, cutlass::layout::PackedVectorLayout>;
-
-  /// Const tensor reference to source tensor
-  using ConstTensorRef = typename OutputTileIterator::ConstTensorRef;
-
-  /// Array type used to output
-  using OutputAccessType = Array<
-    typename OutputTileIterator::Element, OutputTileIterator::kElementsPerAccess>;
-
-  /// Array type used by output functor
-  using AccumulatorAccessType = Array<typename WarpTileIterator::Element, OutputTileIterator::kElementsPerAccess>; 
-  
-  /// Number of warps
-  using WarpCount = gemm::GemmShape<
-    Shape::kM / WarpShape::kM,
-    Shape::kN / WarpShape::kN,
-    kPartitionsK
-  >;
-
-  /// Use this to control the granularity of one epilogue 'iteration'
-  static int const kFragmentsPerIteration = 1;
-
-  static int constexpr kSmemTiles = 1;
-  static int constexpr kSmemPointerOffset = 0;
-
-  /// Shared storage allocation needed by the epilogue
-  struct SharedStorage { } ;
-
-private:
-
-  // Assume accumulator tile is multipile interleaved 32x32 tile.
-  static int const kElementsPerPartial = 4;
-  using EleShapePerPatial = typename platform::conditional<
-                              platform::is_same<ElementAccumulator, float>::value,
-                              MatrixShape<2, 2>,
-                              MatrixShape<1, 4> >::type;
-  static int const kElementsPerMma = 8;
-  static int const kAccumulatorPatials = 2;
-  using QuadShapePerPatialMma = MatrixShape<4, 4>;
-
-  static_assert(OutputOp::kCount >= 2, 
-    "The direct store epilogue for Tensor Ops requires the output functor have kCount >= 2.");
-
-private:
-
-  LongIndex warp_offset;
-  int thread_idx;
-  int warp_idx;
-  int lane_idx;
-  int warp_m, warp_n; // warp coordinates within a cta
-  int tid_m, tid_n;   // thread coordinates within a warp
-
-public:
-
-  /// Constructor
-  CUTLASS_DEVICE
-  EpilogueDirectStore(
-    SharedStorage &shared_storage,    ///< Shared storage object    
-    int thread_idx_,                   ///< ID of a thread within the threadblock
-    int warp_idx_,                     ///< ID of warp within threadblock
-    int lane_idx_                     ///< Id of thread within warp
-  ):
-    thread_idx(thread_idx_), 
-    warp_idx(warp_idx_), 
-    lane_idx(lane_idx_) 
-  {
-    
-    // warp offsetting calculations
-    warp_offset = warp_idx * WarpShape::kM * WarpShape::kN;
-    int warp_id_mn = warp_idx % (WarpCount::kM * WarpShape::kN);
-    warp_m = warp_id_mn % WarpCount::kM;
-    warp_n = warp_id_mn / WarpCount::kM;
-    MatrixCoord warp_offset_coord(warp_m*WarpShape::kM, warp_n*WarpShape::kN);
-    
-    // thread offsetting calculations
-    int quad = (lane_idx >> 2);
-    int lane_in_quad = (lane_idx & 3);
-
-    // this seems to be te correct layout
-    tid_m = quad;
-    tid_n = 2 * lane_in_quad;
-  }
-
-  /// Streams the result to global memory
-  CUTLASS_DEVICE
-  void operator()(
-    OutputOp const &output_op,                    ///< Output operator
-    OutputTileIterator destination_iterator,      ///< Tile iterator for destination
-    AccumulatorTile const &accumulators,          ///< Complete warp-level accumulator tile
-    OutputTileIterator source_iterator) {         ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles)
-
-    if (!output_op.is_source_needed()) {
-      compute_source_not_needed_(output_op, destination_iterator, accumulators);
-    }
-    else {
-      compute_source_needed_(output_op, destination_iterator, accumulators, source_iterator);
-    }
-  }
-
-private:
-
-  /// Streams the result to global memory
-  CUTLASS_DEVICE
-  void compute_source_needed_(
-    OutputOp const &output_op,                    ///< Output operator
-    OutputTileIterator destination_iterator,      ///< Tile iterator for destination
-    AccumulatorTile const &accumulators,          ///< Complete warp-level accumulator tile
-    OutputTileIterator source_iterator) {         ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles)
-
-    const int kAccumBlockN = 2;
-    const int kThreadsM = 8;
-    const int kThreadsN = 4;
-    const int kBlockM = WarpShape::kM / kThreadsM;
-
-    /// Array type used to output
-    using OutputAccessType = AlignedArray<ElementOutput, kAccumBlockN>;
-
-    /// Array type passed to the output operator - unused elements are optimized away
-    using OutputFragmentType = Array<ElementOutput, OutputOp::kCount>;
-
-    /// Array type used by output functor
-    using AccumulatorAccessType = Array<ElementAccumulator, kAccumBlockN>;
-
-    /// Array type used by output functor
-    using AccumulatorFragmentType = Array<ElementAccumulator, OutputOp::kCount>;
-
-    AccumulatorAccessType const *accumulator_pair = reinterpret_cast<AccumulatorAccessType const *>(&accumulators);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int accum_m_idx = 0; accum_m_idx < WarpShape::kM / kThreadsM; accum_m_idx++) {
-
-      int accum_m = kThreadsM * accum_m_idx;
-      int mL = destination_iterator.threadblock_offset.row() + WarpShape::kM * warp_m + tid_m + accum_m;
-      int nL_base = destination_iterator.threadblock_offset.column() + WarpShape::kN * warp_n + tid_n;
-
-      ElementOutput *output_ptr = destination_iterator.pointer + mL * destination_iterator.stride;
-      ElementOutput *source_ptr = source_iterator.pointer + mL * source_iterator.stride;
-
-      int const kIterationsN = WarpShape::kN / kThreadsN / kAccumBlockN;
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int accum_n_idx = 0; accum_n_idx < kIterationsN; accum_n_idx++) {
-
-        int accum_idx = accum_m_idx + kBlockM * accum_n_idx;
-        int accum_n = kThreadsM * accum_n_idx;
-        
-        // mL and nL are logical coordinate in 2D mapping of epilogue's 4D output 
-        int nL = nL_base + accum_n;
-          
-        bool guard = (mL < destination_iterator.extent.row()) && (nL < destination_iterator.extent.column());
-
-        AccumulatorFragmentType accum_fragment;
-        reinterpret_cast<AccumulatorAccessType &>(accum_fragment) = accumulator_pair[accum_idx];
-
-        OutputFragmentType output_fragment;
-
-        if(guard) {
-          reinterpret_cast<OutputAccessType &>(output_fragment) = 
-            *reinterpret_cast<OutputAccessType const *>(source_ptr + nL);
-        }
-
-        // Perform output operator
-        output_fragment = output_op(accum_fragment, output_fragment);
-
-        if(guard) {
-          // Store
-          *reinterpret_cast<OutputAccessType *>(output_ptr + nL) = reinterpret_cast<OutputAccessType const &>(output_fragment);
-        }
-      }
-    }
-  }
-
-  /// Streams the result to global memory
-  CUTLASS_DEVICE
-  void compute_source_not_needed_(
-    OutputOp const &output_op,                    ///< Output operator
-    OutputTileIterator destination_iterator,      ///< Tile iterator for destination
-    AccumulatorTile const &accumulators) {         ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles)
-
-    const int kAccumBlockN = 2;
-    const int kThreadsM = 8;
-    const int kThreadsN = 4;
-    const int kBlockM = WarpShape::kM / kThreadsM;
-
-    /// Array type used to output
-    using OutputAccessType = AlignedArray<ElementOutput, kAccumBlockN>;
-
-    /// Array type passed to the output operator - unused elements are optimized away
-    using OutputFragmentType = Array<ElementOutput, OutputOp::kCount>;
-
-    /// Array type used by output functor
-    using AccumulatorAccessType = Array<ElementAccumulator, kAccumBlockN>;
-
-    /// Array type used by output functor
-    using AccumulatorFragmentType = Array<ElementAccumulator, OutputOp::kCount>;
-
-    AccumulatorAccessType const *accumulator_pair = reinterpret_cast<AccumulatorAccessType const *>(&accumulators);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int accum_m_idx = 0; accum_m_idx < WarpShape::kM / kThreadsM; accum_m_idx++) {
-
-      int accum_m = kThreadsM * accum_m_idx;
-      int mL = destination_iterator.threadblock_offset.row() + WarpShape::kM * warp_m + tid_m + accum_m;
-      int nL_base = destination_iterator.threadblock_offset.column() + WarpShape::kN * warp_n + tid_n;
-
-      ElementOutput *output_ptr = destination_iterator.pointer + mL * destination_iterator.stride;
-
-      int const kIterationsN = WarpShape::kN / kThreadsN / kAccumBlockN;
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int accum_n_idx = 0; accum_n_idx < kIterationsN; accum_n_idx++) {
-
-        int accum_idx = accum_m_idx + kBlockM * accum_n_idx;
-        int accum_n = kThreadsM * accum_n_idx;
-        
-        // mL and nL are logical coordinate in 2D mapping of epilogue's 4D output 
-        int nL = nL_base + accum_n;
-          
-        bool guard = (mL < destination_iterator.extent.row()) && (nL < destination_iterator.extent.column());
-                   
-        AccumulatorFragmentType accum_fragment;
-        reinterpret_cast<AccumulatorAccessType &>(accum_fragment) = accumulator_pair[accum_idx];
-
-        OutputFragmentType output_fragment;
-
-        // Perform output operator
-        output_fragment = output_op(accum_fragment);
-
-        if(guard) { 
-
-          // Store
-          *reinterpret_cast<OutputAccessType *>(output_ptr + nL) = 
-            reinterpret_cast<OutputAccessType const &>(output_fragment);      
-        }
-      }
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_gemm_k_reduction.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_gemm_k_reduction.h
deleted file mode 100755
index 43b14c356..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_gemm_k_reduction.h
+++ /dev/null
@@ -1,212 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
-
-  The epilogue rearranges the result of a matrix product through shared memory to match canonical
-  tensor layouts in global memory. Epilogues support conversion and reduction operations.
-
-*/
-
-#pragma once
-
-#if defined(__CUDACC_RTC__)
-#include <cuda/std/cassert>
-#else
-#include <assert.h>
-#endif
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/layout/vector.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/tensor_coord.h"
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/functional.h"
-
-#include "cutlass/gemm/gemm.h"
-
-#include "cutlass/transform/pitch_linear_thread_map.h"
-#include "cutlass/transform/threadblock/regular_tile_iterator.h"
-
-#include "cutlass/epilogue/threadblock/epilogue_base.h"
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
-#include "cutlass/numeric_types.h"
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Epilogue operator
-template <
-  typename ElementAccumulator_,
-  typename ElementOutput_,
-  typename ThreadBlockShape_,                          ///< Shape of threadblock tile (concept: GemmShape)
-  typename WarpMmaOperator_,                ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp)
-  bool ReduceKForA_
->
-class EpilogueGemmKReduction {
-
-public:
-
-  using ThreadBlockShape = ThreadBlockShape_;
-  using WarpMmaOperator = WarpMmaOperator_;
-  using WarpShape = typename WarpMmaOperator::Shape;
-  using Layout = layout::RowMajor;
-  using LongIndex = typename Layout::LongIndex;
-
-  /// Accumulator element
-  using ElementAccumulator = ElementAccumulator_;
-
-  /// Output element
-  using ElementOutput = ElementOutput_;
-
-  /// Output access size
-  static int const kElementsPerAccess = 1;
-
-  static bool const kReduceKForA = ReduceKForA_;
-
-  static int const kThreadBlockSize = kReduceKForA ? ThreadBlockShape::kM : ThreadBlockShape::kN;
-
-  static int const kWarpSize = kReduceKForA ? WarpShape::kM : WarpShape::kN;
-
-  static int const kIterations = kWarpSize / 8;
-
-  using FragmentAccumulator = Array<ElementAccumulator, kIterations>;
-
-private:
-
-  int thread_offset_;
-  ElementOutput* pointer_;
-  int col_;
-public:
-
-  /// Constructor
-  CUTLASS_DEVICE
-  EpilogueGemmKReduction(
-    int thread_idx,                   ///< ID of a thread within the threadblock
-    int warp_idx,                     ///< ID of warp within threadblock
-    int lane_idx,                     ///< Id of thread within warp
-    int threadblock_offset,
-    ElementOutput* pointer 
-  )
-  {
-     col_ = lane_idx % 4;
-     thread_offset_ = threadblock_offset * kThreadBlockSize
-                    + warp_idx * kWarpSize 
-                    + lane_idx / 4 + col_ * 8;
-
-     pointer_ = pointer + LongIndex(thread_offset_);
-  }
-
-  /// Streams the result to global memory
-  CUTLASS_DEVICE
-  void operator()(
-    int size,
-    FragmentAccumulator &gemm_k_with_reduction_accumulation,
-    bool LoadForSerialSplitK
-  ) {
-      bool guard[kIterations / 4];
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kIterations / 4; ++i) {
-        guard[i] = ((thread_offset_ + i * 32) < size);
-      }
-
-      Array<ElementOutput, kIterations / 4> source;
-      source.clear();
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kIterations / 4; ++i) {
-        ElementOutput *source_ptr = reinterpret_cast<ElementOutput *>(&source);
-        cutlass::arch::global_load<ElementOutput, sizeof(ElementOutput)>(
-                                                  source_ptr[i],
-                                                  (void *)(pointer_ + i * 32),
-                                                  guard[i] && LoadForSerialSplitK);
-
-      }
-
-      FragmentAccumulator sum = gemm_k_with_reduction_accumulation;
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kIterations; ++i) {
-        sum[i] += __shfl_xor_sync(0xffffffff, sum[i], 1);
-        sum[i] += __shfl_xor_sync(0xffffffff, sum[i], 2);
-      }
-
-      Array<ElementAccumulator, kIterations / 4> intermediate;
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kIterations / 4; ++i) {
-        if (col_ == 0) {
-          intermediate[i] = sum[0 + i * 4];
-        }
-  
-        if (col_ == 1) {
-          intermediate[i] = sum[1 + i * 4];
-        }
-  
-        if (col_ == 2) {
-          intermediate[i] = sum[2 + i * 4];
-        }
-  
-        if (col_ == 3) {
-          intermediate[i] = sum[3 + i * 4];
-        }
-      }
-
-      NumericArrayConverter<ElementAccumulator, ElementOutput, kIterations / 4> source_converter;
-      Array<ElementAccumulator, kIterations / 4> converted_source = source_converter(source);
-
-      plus<Array<ElementAccumulator, kIterations / 4>> plus_source;
-      intermediate = plus_source(intermediate, converted_source);
-
-      NumericArrayConverter<ElementOutput, ElementAccumulator, kIterations / 4> converter;
-      Array<ElementOutput, kIterations / 4> result = converter(intermediate);
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kIterations / 4; ++i) {
-        cutlass::arch::global_store<ElementOutput, sizeof(ElementOutput)>(result[i], 
-                                                (void *)(pointer_ + i * 32), guard[i]);
-      }
-    }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_planar_complex.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_planar_complex.h
deleted file mode 100755
index b294244cd..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_planar_complex.h
+++ /dev/null
@@ -1,401 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
-
-  The epilogue rearranges the result of a matrix product through shared memory to match canonical
-  tensor layouts in global memory. Epilogues support conversion and reduction operations.
-
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/array_planar_complex.h"
-#include "cutlass/layout/vector.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/tensor_coord.h"
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/functional.h"
-
-#include "cutlass/gemm/gemm.h"
-
-#include "cutlass/transform/pitch_linear_thread_map.h"
-#include "cutlass/transform/threadblock/regular_tile_iterator.h"
-
-#include "cutlass/epilogue/threadblock/epilogue_base.h"
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Epilogue operator for planar-complex output representations.
-///
-/// Note, as with most CUTLASS components for planar complex, the template arguments describe
-/// the underlying real data type.
-template <
-  typename Shape_,                          ///< Shape of threadblock tile (concept: GemmShape)
-  typename WarpMmaOperator_,                ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp)
-  int PartitionsK,                          ///< Number of partitions of the K dimension
-  typename OutputTileIterator_,             ///< Tile iterator reading and writing output tensors
-  typename AccumulatorFragmentIterator_,    ///< Fragment iterator selecting accumulators
-  typename WarpTileIterator_,               ///< Warp-scoped tile iterator writing accumulators to SMEM
-  typename SharedLoadIterator_,             ///< Threadblock-scoped tile iterator loading from SMEM
-  typename OutputOp_,                       ///< Output operator
-  typename Padding_                         ///< Padding added to SMEM allocation to avoid bank conflicts (concept: MatrixShape)
->
-class EpiloguePlanarComplex {
-public:
-  
-  using Shape = Shape_;
-  using WarpMmaOperator = WarpMmaOperator_;
-  static int const kPartitionsK = PartitionsK;
-  using OutputTileIterator = OutputTileIterator_;
-  using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
-  using WarpTileIterator = WarpTileIterator_;
-  using SharedLoadIterator = SharedLoadIterator_;
-  using OutputOp = OutputOp_;
-  using Padding = Padding_;
-
-  /// Output layout is always row-major
-  using Layout = layout::RowMajor;
-  using LongIndex = typename Layout::LongIndex;
-
-  /// The complete warp-level accumulator tile
-  using AccumulatorTile = ArrayPlanarComplex<
-    typename WarpMmaOperator::FragmentC::Element, 
-    WarpMmaOperator::FragmentC::kElements
-  >;
-
-  /// Accumulator element
-  using ElementAccumulator = typename WarpTileIterator::Element;
-
-  /// Output element
-  using ElementOutput = typename OutputTileIterator::Element;
-
-  /// Output access size
-  static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
-
-  /// Tensor reference to destination tensor
-  using TensorRef = typename OutputTileIterator::TensorRef;
-
-  /// Tensor reference to sync tensor
-  using SyncTensorRef = typename cutlass::TensorRef<int, cutlass::layout::PackedVectorLayout>;
-
-  /// Const tensor reference to source tensor
-  using ConstTensorRef = typename OutputTileIterator::ConstTensorRef;
-
-  /// Array type used to output
-  using OutputAccessType = Array<
-    typename OutputTileIterator::Element, OutputTileIterator::kElementsPerAccess>;
-
-  /// Array type used by output functor
-  using AccumulatorAccessType = Array<typename WarpTileIterator::Element, OutputTileIterator::kElementsPerAccess>; 
-  
-  /// Shape of each warp-level operation
-  using WarpShape = typename WarpMmaOperator::Shape;
-
-  /// Number of warps
-  using WarpCount = gemm::GemmShape<
-    Shape::kM / WarpShape::kM,
-    Shape::kN / WarpShape::kN,
-    kPartitionsK
-  >;
-
-  /// Shared memory allocation
-  struct SharedStorage {
-
-    //
-    // Type definitions
-    //
-
-    /// Element type of shared memory
-    using Element = typename WarpTileIterator::Element;
-
-    /// Tensor reference to shared memory allocation
-    using TensorRef = typename WarpTileIterator::TensorRef;
-
-    /// Layout of shared memory allocation
-    using Layout = typename WarpTileIterator::Layout;
-    
-    /// Logical shape of the shared memory tile written to by all warps.
-    using Shape = MatrixShape<
-      WarpCount::kM * WarpTileIterator::Shape::kRow * WarpCount::kK,
-      WarpCount::kN * WarpTileIterator::Shape::kColumn
-    >;
-
-    /// Shape of the shared memory allocation for the epilogue    
-    using StorageShape = MatrixShape<
-      Shape::kRow + Padding::kRow, 
-      Shape::kColumn + Padding::kColumn
-    >;
-
-    static int const kImaginaryStride = StorageShape::kCount;
-
-    //
-    // Data members
-    //
-
-    AlignedBuffer<Element, kImaginaryStride * 2> storage;
-
-    //
-    // Methods
-    //
-
-    /// Returns a pointer to the shared memory buffer
-    CUTLASS_DEVICE
-    Element *data() {
-      return storage.data();
-    }
-
-    /// Returns a tensor reference to the shared memory buffer
-    CUTLASS_DEVICE
-    TensorRef reference() {
-      return TensorRef(
-        storage.data(), 
-        Layout::packed({StorageShape::kRow, StorageShape::kColumn}));
-    }
-  };
-
-private:
-
-  //
-  // Data members
-  //
-
-  SharedStorage &shared_storage_;
-
-  /// Loads fragment from shared memory aligned with output tensor
-  SharedLoadIterator shared_load_iterator_;
-
-  /// Stores a warp's fragment of accumulators to SMEM
-  WarpTileIterator warp_tile_iterator_;
-
-public:
-
-  /// Constructor
-  CUTLASS_DEVICE
-  EpiloguePlanarComplex(
-    SharedStorage &shared_storage,    ///< Shared storage object    
-    int thread_idx,                   ///< ID of a thread within the threadblock
-    int warp_idx,                     ///< ID of warp within threadblock
-    int lane_idx                      ///< Id of thread within warp
-  ):
-    shared_storage_(shared_storage),
-    shared_load_iterator_(shared_storage.reference(), thread_idx),
-    warp_tile_iterator_(shared_storage.reference(), lane_idx) {
-
-    // Compute warp location within threadblock tile by mapping the warp_id to three coordinates:
-    //
-    //   _m: the warp's position within the threadblock along the M dimension
-    //   _n: the warp's position within the threadblock along the N dimension
-    //   _k: the warp's position within the threadblock along the K dimension
-
-    int warp_k = warp_idx / (WarpCount::kM * WarpCount::kN);
-    int warp_mn = warp_idx % (WarpCount::kM * WarpCount::kN);
-    int warp_m = warp_mn % WarpCount::kM;
-    int warp_n = warp_mn / WarpCount::kM;
-
-    MatrixCoord warp_offset{warp_k * WarpCount::kM + warp_m, warp_n};
-
-    warp_tile_iterator_.add_tile_offset(warp_offset);
-  }
-
-  /// Streams the result to global memory
-  CUTLASS_DEVICE
-  void operator()(
-    OutputOp const &output_op,                        ///< Output operator
-    OutputTileIterator destination_iterator_real,     ///< Tile iterator for destination
-    OutputTileIterator destination_iterator_imag,     ///< Tile iterator for destination
-    AccumulatorTile const &accumulators,              ///< Complete warp-level accumulator tile
-    OutputTileIterator source_iterator_real,          ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles)
-    OutputTileIterator source_iterator_imag) {        ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles)
-
-    typename OutputTileIterator::Fragment source_fragment_real;
-    typename OutputTileIterator::Fragment source_fragment_imag;
-
-    if (!output_op.is_source_needed()) {
-      source_iterator_real.clear_mask();
-      source_iterator_imag.clear_mask();
-    }
-
-    source_fragment_real.clear();
-    source_fragment_imag.clear();
-
-    //
-    // Iterator over warp-level accumulator fragment
-    //
-
-    AccumulatorFragmentIterator accum_fragment_iterator_real(accumulators.real);
-    AccumulatorFragmentIterator accum_fragment_iterator_imag(accumulators.imag);
-
-    //
-    // Iterate over accumulator tile
-    // 
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int iter = 0; iter < OutputTileIterator::kIterations; ++iter) {
-
-      //
-      // Load the source
-      //
-
-      source_iterator_real.load(source_fragment_real);
-      source_iterator_imag.load(source_fragment_imag);
-
-      ++source_iterator_real;
-      ++source_iterator_imag;
-
-      //
-      // Convert and store fragment
-      //
-      
-      __syncthreads();
-
-      typename AccumulatorFragmentIterator::Fragment accum_fragment_real;
-      typename AccumulatorFragmentIterator::Fragment accum_fragment_imag;
-
-      accum_fragment_iterator_real.load(accum_fragment_real);
-      accum_fragment_iterator_imag.load(accum_fragment_imag);
-      
-      ++accum_fragment_iterator_real;
-      ++accum_fragment_iterator_imag;
-
-      this->warp_tile_iterator_.store(accum_fragment_real);
-      this->warp_tile_iterator_.store_with_pointer_offset(accum_fragment_imag, SharedStorage::kImaginaryStride);
-
-      __syncthreads();
-
-      //
-      // Load fragments from shared memory
-      //
-
-      typename SharedLoadIterator::Fragment aligned_accum_fragment_real[kPartitionsK];
-      typename SharedLoadIterator::Fragment aligned_accum_fragment_imag[kPartitionsK];
-
-      shared_load_iterator_.load(aligned_accum_fragment_real[0]);
-      shared_load_iterator_.load_with_pointer_offset(aligned_accum_fragment_imag[0], SharedStorage::kImaginaryStride);
-
-      // If the number of k-slices is > 1 - perform a reduction amongst the k-slices
-      static_assert(kPartitionsK  == 1, "Sliced-K not supported for planar complex at this time");
-    
-      //
-      // Compute the output result
-      //
-     
-      typename OutputTileIterator::Fragment output_fragment_real;
-      typename OutputTileIterator::Fragment output_fragment_imag;
-
-      apply_output_operator_(
-        output_fragment_real, 
-        output_fragment_imag, 
-        output_op, 
-        aligned_accum_fragment_real[0],
-        aligned_accum_fragment_imag[0], 
-        source_fragment_real,
-        source_fragment_imag);
-
-      //
-      // Store the final result
-      //
-
-      destination_iterator_real.store(output_fragment_real);
-      destination_iterator_imag.store(output_fragment_imag);
-
-      ++destination_iterator_real;
-      ++destination_iterator_imag;
-    }
-  }
-
-private:
-
-  /// Helper to invoke the output functor over each vector of output
-  CUTLASS_DEVICE
-  void apply_output_operator_(
-    typename OutputTileIterator::Fragment &output_fragment_real,
-    typename OutputTileIterator::Fragment &output_fragment_imag,
-    OutputOp const &output_op,                    ///< Output operator
-    typename SharedLoadIterator::Fragment const &aligned_accum_fragment_real,
-    typename SharedLoadIterator::Fragment const &aligned_accum_fragment_imag,
-    typename OutputTileIterator::Fragment const &source_fragment_real,
-    typename OutputTileIterator::Fragment const &source_fragment_imag) {
-
-    OutputAccessType *output_frag_real_ptr = 
-      reinterpret_cast<OutputAccessType *>(&output_fragment_real);
-
-    OutputAccessType *output_frag_imag_ptr = 
-      reinterpret_cast<OutputAccessType *>(&output_fragment_imag);
-
-    AccumulatorAccessType const *compute_frag_real_ptr = 
-      reinterpret_cast<AccumulatorAccessType const *>(&aligned_accum_fragment_real);
-
-    AccumulatorAccessType const *compute_frag_imag_ptr = 
-      reinterpret_cast<AccumulatorAccessType const *>(&aligned_accum_fragment_imag);
-
-    OutputAccessType const *source_frag_real_ptr = 
-      reinterpret_cast<OutputAccessType const *>(&source_fragment_real);
-
-    OutputAccessType const *source_frag_imag_ptr = 
-      reinterpret_cast<OutputAccessType const *>(&source_fragment_imag);
-
-    int const kOutputOpIterations = 
-      OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kOutputOpIterations; ++i) {
-
-      // Call the output operator
-      auto result_fragment = output_op(
-        make_ArrayPlanarComplex(compute_frag_real_ptr[i], compute_frag_imag_ptr[i]), 
-        make_ArrayPlanarComplex(source_frag_real_ptr[i], source_frag_imag_ptr[i])
-      );
-
-      output_frag_real_ptr[i] = result_fragment.real;
-      output_frag_imag_ptr[i] = result_fragment.imag;
-    }
-  }
-
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_smem_accumulator.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_smem_accumulator.h
deleted file mode 100755
index 2be1fa55a..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_smem_accumulator.h
+++ /dev/null
@@ -1,230 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Epilogue for threadblock scoped GEMM/CONV to store accumulator in shared memory after
-    applying scale, bias loaded from global memory and element-wise operations.
-
-    This Epilogue is typically used in fused GEMM/CONV to stage the intermediate accumulator.
-
-*/
-
-#pragma once
-
-#if defined(__CUDACC_RTC__)
-#include <cuda/std/cassert>
-#else
-#include <assert.h>
-#endif
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/layout/vector.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/tensor_coord.h"
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/functional.h"
-
-#include "cutlass/epilogue/warp/fragment_iterator_tensor_op.h"
-#include "cutlass/epilogue/warp/tile_iterator_tensor_op.h"
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Epilogue operator
-template <
-  typename SmemTileIterator_,               ///< Shared memory Tile iterator to output to shared memory
-  typename AccumulatorFragmentIterator_,    ///< Fragment iterator selecting accumulators
-  typename ScaleBiasIterator_,              ///< Iterator to load scale and bias from global memory
-  typename OutputOp_                        ///< Output operator
->
-class EpilogueSmemAccumulator {
-
-public:
-
-  using SmemTileIterator = SmemTileIterator_;
-
-  using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
-
-  using ScaleBiasIterator = ScaleBiasIterator_;
-
-  using OutputOp = OutputOp_;
-
-  /// Fragment of accumulator tile
-  using FragmentAccumulator = typename AccumulatorFragmentIterator::Fragment;
-
-  /// The complete warp-level accumulator tile
-  using AccumulatorTile = typename AccumulatorFragmentIterator::AccumulatorTile;
-
-  /// Fragment of Scale and Bias loaded from global memory
-  using FragmentScaleBias = typename ScaleBiasIterator::Fragment;
-
-  static const bool PerChannelScale = (OutputOp::kScale ==
-      epilogue::thread::ScaleType::OnlyAlphaPerChannelScaling);
-
-  /// Constructor
-  CUTLASS_DEVICE
-  EpilogueSmemAccumulator() {}
-
-  /// Streams the result to shared memory
-  CUTLASS_DEVICE
-  void operator()(
-    OutputOp const &output_op,                    ///< Output operator
-    SmemTileIterator smem_iterator,               ///< Tile iterator for destination in shared memory
-    AccumulatorTile const &accumulator,          ///< Complete warp-level accumulator tile
-    ScaleBiasIterator scale_iterator,             ///< iterator for scale vector in global memory
-    ScaleBiasIterator bias_iterator) {            ///< iterator for bias vector in global memory
- 
-  
-    // Fragment to load scale bias from global memory
-    FragmentScaleBias tb_frag_scale;
-    FragmentScaleBias tb_frag_bias;
-      
-    /// Fragment Iterator to load slice of accumulator tile
-    AccumulatorFragmentIterator frag_iterator_accum(accumulator);
-    FragmentAccumulator tb_frag_accum;
-  
-    /// Epilogue output fragment
-    typename SmemTileIterator::Fragment tb_frag_smem;
-  
-    /// Load scale and bias from global memory
-  
-    if(PerChannelScale)
-        scale_iterator.load(tb_frag_scale);
-  
-    bias_iterator.load(tb_frag_bias);
-  
-    /// Iterate over the accumulator tile and store to shared memory
-    CUTLASS_PRAGMA_UNROLL
-    for (int rid = 0; rid < AccumulatorFragmentIterator::TileIterations::kRow; ++rid) {
-    
-      CUTLASS_PRAGMA_UNROLL
-      for (int cid = 0; cid < AccumulatorFragmentIterator::TileIterations::kColumn; ++cid) {
-  
-        using AccumulatorAccessType = typename OutputOp::FragmentAccumulator;
-        using ScaleBiasAccessType = typename OutputOp::FragmentScaleBias;
-        using FragmentSmemAccessType = typename OutputOp::FragmentOutput;
-  
-  
-        ScaleBiasAccessType const * scale_frag_ptr =
-          reinterpret_cast<ScaleBiasAccessType const *>(&tb_frag_scale);
-        ScaleBiasAccessType const * bias_frag_ptr =
-          reinterpret_cast<ScaleBiasAccessType const *>(&tb_frag_bias);
-   
-        FragmentSmemAccessType * smem_frag_ptr =  
-          reinterpret_cast<FragmentSmemAccessType *>(&tb_frag_smem);
-  
-        CUTLASS_PRAGMA_UNROLL
-        for (int idx = 0; idx < AccumulatorFragmentIterator::kIterationsPerTile; ++idx) {
-          frag_iterator_accum.load(tb_frag_accum);
-          ++frag_iterator_accum;
-  
-          AccumulatorAccessType const * accumulator_frag_ptr = 
-            reinterpret_cast<AccumulatorAccessType const *>(&tb_frag_accum);
-          const int kOutputIterations = FragmentAccumulator::kElements / OutputOp::kCount;
-  
-          CUTLASS_PRAGMA_UNROLL
-          for (int it = 0; it < kOutputIterations; it++) {
-            smem_frag_ptr[idx * kOutputIterations + it] = output_op(accumulator_frag_ptr[it],
-                scale_frag_ptr[cid * kOutputIterations + it], bias_frag_ptr[cid * kOutputIterations + it]);
-          }
-        }
-  
-        smem_iterator.store(tb_frag_smem);
-        ++smem_iterator;
-  
-      }
-    }
-  }
-
-  /// Streams the result to shared memory
-  CUTLASS_DEVICE
-  void operator()(
-    OutputOp const &output_op,                    ///< Output operator
-    SmemTileIterator smem_iterator,               ///< Tile iterator for destination in shared memory
-    AccumulatorTile const &accumulator) {          ///< Complete warp-level accumulator tile
- 
-    /// Fragment Iterator to load slice of accumulator tile
-    AccumulatorFragmentIterator frag_iterator_accum(accumulator);
-    FragmentAccumulator tb_frag_accum;
-  
-    /// Epilogue output fragment
-    typename SmemTileIterator::Fragment tb_frag_smem;
-  
-    /// Iterate over the accumulator tile and store to shared memory
-    CUTLASS_PRAGMA_UNROLL
-    for (int rid = 0; rid < AccumulatorFragmentIterator::TileIterations::kRow; ++rid) {
-    
-      CUTLASS_PRAGMA_UNROLL
-      for (int cid = 0; cid < AccumulatorFragmentIterator::TileIterations::kColumn; ++cid) {
-  
-        using AccumulatorAccessType = typename OutputOp::FragmentAccumulator;
-        using FragmentSmemAccessType = typename OutputOp::FragmentOutput;
-  
-        FragmentSmemAccessType * smem_frag_ptr =  
-          reinterpret_cast<FragmentSmemAccessType *>(&tb_frag_smem);
-  
-        CUTLASS_PRAGMA_UNROLL
-        for (int idx = 0; idx < AccumulatorFragmentIterator::kIterationsPerTile; ++idx) {
-          frag_iterator_accum.load(tb_frag_accum);
-          ++frag_iterator_accum;
-  
-          AccumulatorAccessType const * accumulator_frag_ptr = 
-            reinterpret_cast<AccumulatorAccessType const *>(&tb_frag_accum);
-          const int kOutputIterations = FragmentAccumulator::kElements / OutputOp::kCount;
-  
-          CUTLASS_PRAGMA_UNROLL
-          for (int it = 0; it < kOutputIterations; it++) {
-            smem_frag_ptr[idx * kOutputIterations + it] = output_op(accumulator_frag_ptr[it]);
-          }
-        }
-  
-        smem_iterator.store(tb_frag_smem);
-        ++smem_iterator;
-  
-      }
-    }
-  }
-
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
- 
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_streamk_with_broadcast.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_streamk_with_broadcast.h
deleted file mode 100755
index 9efbee477..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_streamk_with_broadcast.h
+++ /dev/null
@@ -1,443 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-
-  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
-
-  The epilogue rearranges the result of a matrix product through shared memory to match canonical
-  tensor layouts in global memory. Epilogues support conversion and reduction operations.
-
-*/
-
-#pragma once
-
-#if defined(__CUDACC_RTC__)
-#include <cuda/std/cassert>
-#include <cuda/std/utility>
-#else
-#include <assert.h>
-#include <utility>
-#endif
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/tensor_coord.h"
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/functional.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/layout/vector.h"
-#include "cutlass/layout/tensor.h"
-
-#include "cutlass/gemm/gemm.h"
-
-#include "cutlass/transform/pitch_linear_thread_map.h"
-#include "cutlass/transform/threadblock/regular_tile_iterator.h"
-
-#include "cutlass/epilogue/threadblock/epilogue_base.h"
-#include "cutlass/epilogue/threadblock/epilogue_base_streamk.h"
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
-
-#include "cutlass/numeric_types.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// This base class is meant to define the concept required of the
-/// EpilogueStreamkWithBroadcast::OutputOp
-template <
-  typename ElementC_,
-  typename ElementAccumulator_,
-  typename ElementCompute_,
-  typename ElementZ_,
-  typename ElementT_,
-  int ElementsPerAccess,
-  bool StoreZ = true,
-  bool StoreT = true
->
-struct EpilogueStreamkWithBroadcastOpBase : EpilogueWithBroadcastOpBase<
-                                            ElementC_,
-                                            ElementAccumulator_,
-                                            ElementCompute_,
-                                            ElementZ_,
-                                            ElementT_,
-                                            ElementsPerAccess,
-                                            StoreZ,
-                                            StoreT
-                                            > 
-{
-
-  /// Parameters structure - required
-  struct Params { };
-
-  //
-  // Methods
-  //
-
-  /// Constructor from Params
-  EpilogueStreamkWithBroadcastOpBase(Params const &params_) { }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Epilogue operator with bias vector broadcast over columns.
-///
-/// Computes the following:
-///
-///
-///  Z, T = OutputOp(AB, C, Broadcast)
-///
-///  if (ElementwiseOp::kStoreZ) {
-///    store(converted_u);
-///  }  
-///
-///  if (ElementwiseOp::kStoreT) {
-///    store(v);
-///  }  
-///
-template <
-  typename Shape_,                          ///< Shape of threadblock tile (concept: GemmShape)
-  typename WarpMmaOperator_,                ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp)
-  int PartitionsK,                          ///< Number of partitions of the K dimension
-  typename OutputTileIterator_,             ///< Tile iterator reading and writing output tensors (z)
-  typename TensorTileIterator_,             ///< Additional tile iterator for tensor-valued operands (t)
-  typename ElementVector_,                  ///< Pointer to broadcast vector
-  typename AccumulatorFragmentIterator_,    ///< Fragment iterator selecting accumulators
-  typename WarpTileIterator_,               ///< Warp-scoped tile iterator writing accumulators to SMEM
-  typename SharedLoadIterator_,             ///< Threadblock-scoped tile iterator loading from SMEM
-  typename OutputOp_,                       ///< Output operator - concept is EpilogueWithBroadcastOp
-  typename Padding_,                        ///< Padding added to SMEM allocation to avoid bank conflicts (concept: MatrixShape)
-  int FragmentsPerPartition = 1,            ///< Used to coarsten the epilogue granularity
-  int IterationsUnroll =                    ///< Used to reduce binary size when epilogue op is large
-    (!IsEpilogueFunctorHeavy<OutputOp_>::value),
-  bool IsSingleSource = OutputOp_::kIsSingleSource
->
-class EpilogueStreamkWithBroadcast;
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// EpilogueStreamkWithBroadcast: Two sources
-
-template <
-  typename Shape_,
-  typename WarpMmaOperator_,
-  int PartitionsK,
-  typename OutputTileIterator_,
-  typename TensorTileIterator_,
-  typename ElementVector_,
-  typename AccumulatorFragmentIterator_,
-  typename WarpTileIterator_,
-  typename SharedLoadIterator_,
-  typename OutputOp_,
-  typename Padding_,
-  int FragmentsPerPartition,
-  int IterationsUnroll
->
-class EpilogueStreamkWithBroadcast<
-  Shape_,
-  WarpMmaOperator_,
-  PartitionsK,
-  OutputTileIterator_,
-  TensorTileIterator_,
-  ElementVector_,
-  AccumulatorFragmentIterator_,
-  WarpTileIterator_,
-  SharedLoadIterator_,
-  OutputOp_,
-  Padding_,
-  FragmentsPerPartition,
-  IterationsUnroll,
-  false
-> : 
-  public EpilogueWithBroadcast<
-    Shape_,
-    WarpMmaOperator_,
-    PartitionsK,
-    OutputTileIterator_,
-    TensorTileIterator_,
-    ElementVector_,
-    AccumulatorFragmentIterator_,
-    WarpTileIterator_,
-    SharedLoadIterator_,
-    OutputOp_,
-    Padding_,
-    FragmentsPerPartition,
-    IterationsUnroll,
-    false>,
-  public EpilogueBaseStreamK<
-    Shape_,
-    PartitionsK,
-    WarpMmaOperator_,
-    AccumulatorFragmentIterator_>
-{
-
-public:
-
-  using Base = EpilogueWithBroadcast<
-    Shape_,
-    WarpMmaOperator_,
-    PartitionsK,
-    OutputTileIterator_,
-    TensorTileIterator_,
-    ElementVector_,
-    AccumulatorFragmentIterator_,
-    WarpTileIterator_,
-    SharedLoadIterator_,
-    OutputOp_,
-    Padding_,
-    FragmentsPerPartition,
-    IterationsUnroll,
-    false>;
-
-  using BaseStreamK = EpilogueBaseStreamK<
-    Shape_,
-    PartitionsK,
-    WarpMmaOperator_,
-    AccumulatorFragmentIterator_>;
-
-  using Shape = Shape_;
-  static int const kPartitionsK = PartitionsK;
-  using OutputTileIterator = OutputTileIterator_;
-  using TensorTileIterator = TensorTileIterator_;
-  using ElementVector = ElementVector_;
-  using SharedLoadIterator = SharedLoadIterator_;
-  using OutputOp = OutputOp_;
-
-  /// Fragment type used by the accumulator tile's fragment iterator
-  using AccumulatorFragment = typename Base::AccumulatorFragmentIterator::Fragment;
-
-  /// Shared storage structure (shadows base) with additional SMEM buffer for reduction
-  using SharedStorage = typename Base::SharedStorage;
-
-public:
-
-  /// Constructor
-  CUTLASS_DEVICE
-  EpilogueStreamkWithBroadcast(
-    SharedStorage &shared_storage,                    ///< Shared storage object    
-    int thread_idx,                                   ///< ID of a thread within the threadblock
-    int warp_idx,                                     ///< ID of warp within threadblock
-    int lane_idx                                      ///< Id of thread within warp
-  ):
-    Base(shared_storage, thread_idx, warp_idx, lane_idx),
-    BaseStreamK(thread_idx)
-  { }
-
-
-  /// Aggregates the accumulator sets shared by peer blocks in the global workspace,
-  /// performing epilogue computations, writing to output
-  CUTLASS_DEVICE
-  void reduce(
-      int peer_idx_begin,
-      int peer_idx_end,
-      int reduce_fragment_idx,
-      void *element_workspace,
-      OutputOp const &output_op,                      ///< Output operator
-      ElementVector const * broadcast_ptr,            ///< Broadcast vector
-      OutputTileIterator destination_iterator,        ///< Tile iterator for destination
-      OutputTileIterator source_iterator1,            ///< Tile iterator for first  source accumulator matrix
-      OutputTileIterator source_iterator2,            ///< Tile iterator for second source accumulator matrix
-      TensorTileIterator tensor_iterator,             ///< Threadblock tile iterator for additional tensor operand
-      MatrixCoord const &problem_size =               ///< Problem size needed to guard against out-of-bounds accesses
-          MatrixCoord(Shape::kM, Shape::kN),
-      MatrixCoord const &threadblock_offset =         ///< Threadblock's initial offset within the problem size space
-          MatrixCoord()) 
-  {
-    // Reduce peer accumulator fragments into one fragment
-    AccumulatorFragment accum_fragment;
-    BaseStreamK::reduce(accum_fragment, peer_idx_begin, peer_idx_end, reduce_fragment_idx, element_workspace);
-
-    // Store fragment to shared memory
-    this->warp_tile_iterator_.store(accum_fragment);
-
-    __syncthreads();
-
-    Base::reduce(reduce_fragment_idx, output_op, broadcast_ptr, destination_iterator, source_iterator1, source_iterator2, tensor_iterator, problem_size, threadblock_offset);
-    
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// EpilogueStreamkWithBroadcast: Single source
-
-template <
-  typename Shape_,
-  typename WarpMmaOperator_,
-  int PartitionsK,
-  typename OutputTileIterator_,
-  typename TensorTileIterator_,
-  typename ElementVector_,
-  typename AccumulatorFragmentIterator_,
-  typename WarpTileIterator_,
-  typename SharedLoadIterator_,
-  typename OutputOp_,
-  typename Padding_,
-  int FragmentsPerPartition,
-  int IterationsUnroll
->
-class EpilogueStreamkWithBroadcast<
-  Shape_,
-  WarpMmaOperator_,
-  PartitionsK,
-  OutputTileIterator_,
-  TensorTileIterator_,
-  ElementVector_,
-  AccumulatorFragmentIterator_,
-  WarpTileIterator_,
-  SharedLoadIterator_,
-  OutputOp_,
-  Padding_,
-  FragmentsPerPartition,
-  IterationsUnroll,
-  true
-> : 
-  public EpilogueWithBroadcast<
-    Shape_,
-    WarpMmaOperator_,
-    PartitionsK,
-    OutputTileIterator_,
-    TensorTileIterator_,
-    ElementVector_,
-    AccumulatorFragmentIterator_,
-    WarpTileIterator_,
-    SharedLoadIterator_,
-    OutputOp_,
-    Padding_,
-    FragmentsPerPartition,
-    IterationsUnroll,
-    true>,
-  public EpilogueBaseStreamK<
-    Shape_,
-    PartitionsK,
-    WarpMmaOperator_,
-    AccumulatorFragmentIterator_>
-{
-
-public:
-
-  using Base = EpilogueWithBroadcast<
-    Shape_,
-    WarpMmaOperator_,
-    PartitionsK,
-    OutputTileIterator_,
-    TensorTileIterator_,
-    ElementVector_,
-    AccumulatorFragmentIterator_,
-    WarpTileIterator_,
-    SharedLoadIterator_,
-    OutputOp_,
-    Padding_,
-    FragmentsPerPartition,
-    IterationsUnroll,
-    true>;
-
-  using BaseStreamK = EpilogueBaseStreamK<
-    Shape_,
-    PartitionsK,
-    WarpMmaOperator_,
-    AccumulatorFragmentIterator_>;
-
-  using Shape = Shape_;
-  static int const kPartitionsK = PartitionsK;
-  using OutputTileIterator = OutputTileIterator_;
-  using TensorTileIterator = TensorTileIterator_;
-  using ElementVector = ElementVector_;
-  using SharedLoadIterator = SharedLoadIterator_;
-  using OutputOp = OutputOp_;
-
-  /// Fragment type used by the accumulator tile's fragment iterator
-  using AccumulatorFragment = typename Base::AccumulatorFragmentIterator::Fragment;
-
-  /// Shared storage structure (shadows base) with additional SMEM buffer for reduction
-  using SharedStorage = typename Base::SharedStorage;
-
-public:
-
-  /// Constructor
-  CUTLASS_DEVICE
-  EpilogueStreamkWithBroadcast(
-    SharedStorage &shared_storage,                    ///< Shared storage object    
-    int thread_idx,                                   ///< ID of a thread within the threadblock
-    int warp_idx,                                     ///< ID of warp within threadblock
-    int lane_idx                                      ///< Id of thread within warp
-  ):
-    Base(shared_storage, thread_idx, warp_idx, lane_idx),
-    BaseStreamK(thread_idx)
-  { }
-
-
-  /// Aggregates the accumulator sets shared by peer blocks in the global workspace,
-  /// performing epilogue computations, writing to output
-  CUTLASS_DEVICE
-  void reduce(
-      int peer_idx_begin,
-      int peer_idx_end,
-      int reduce_fragment_idx,
-      void *element_workspace,
-      OutputOp const &output_op,                      ///< Output operator
-      ElementVector const * broadcast_ptr,            ///< Broadcast vector
-      OutputTileIterator destination_iterator,        ///< Tile iterator for destination
-      OutputTileIterator source_iterator,             ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles)
-      TensorTileIterator tensor_iterator,             ///< Threadblock tile iterator for additional tensor operand
-      MatrixCoord const &problem_size =               ///< Problem size needed to guard against out-of-bounds accesses
-          MatrixCoord(Shape::kM, Shape::kN),
-      MatrixCoord const &threadblock_offset =         ///< Threadblock's initial offset within the problem size space
-          MatrixCoord()) 
-  {
-    // Reduce peer accumulator fragments into one fragment
-    AccumulatorFragment accum_fragment;
-    BaseStreamK::reduce(accum_fragment, peer_idx_begin, peer_idx_end, reduce_fragment_idx, element_workspace);
-
-    // Store fragment to shared memory
-    this->warp_tile_iterator_.store(accum_fragment);
-
-    __syncthreads();
-
-    Base::reduce(reduce_fragment_idx, output_op, broadcast_ptr, destination_iterator, source_iterator, tensor_iterator, problem_size, threadblock_offset);
-    
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_visitor_with_softmax.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_visitor_with_softmax.h
deleted file mode 100755
index 8202284b6..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_visitor_with_softmax.h
+++ /dev/null
@@ -1,513 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Epilogue visitor for threadblock scoped GEMMs that process softmax computations in epilogue.
-
-  The epilogue finds max values in each row of the row-major output matrix and stores them.
-  The max values are also used for a further round of threadblock scoped reduction operation, where
-  the partial reduction results are stored in a pre-allocated array and used for further full reduction.
-
-*/
-
-#pragma once
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-#include "cutlass/cutlass.h"
-#include "cutlass/arch/memory.h"
-#include "cutlass/arch/memory_sm75.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/fast_math.h"
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-
-template <
-  typename ThreadblockShape_,
-  int ThreadCount,
-  typename OutputTileIterator_,
-  typename ElementAccumulator_,
-  typename ElementNorm_,
-  typename ElementSum_,
-  typename ElementSoftmaxCompute_,
-  typename ElementwiseFunctor_,
-  bool UseMasking_ = false
->
-class EpilogueVisitorSoftmax {
-public:
-
-  using ThreadblockShape   = ThreadblockShape_;
-  static int const kThreadCount = ThreadCount;
-
-  using OutputTileIterator = OutputTileIterator_;
-  using ElementwiseFunctor = ElementwiseFunctor_;
-
-  static int const kIterations = OutputTileIterator::kIterations;
-  static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
-
-  using ElementOutput = typename OutputTileIterator::Element;
-  using LayoutOutput = cutlass::layout::RowMajor;
-  using ElementAccumulator = ElementAccumulator_;
-
-  using ElementNorm = ElementNorm_;
-  using ElementSum = ElementSum_;
-  using ElementSoftmaxCompute = ElementSoftmaxCompute_;
-
-  using AccumulatorFragment = Array<ElementAccumulator, kElementsPerAccess>;
-  using SoftmaxFragment = Array<ElementSoftmaxCompute, kElementsPerAccess>;
-  using OutputVector = Array<ElementOutput, kElementsPerAccess>;
-  using TensorRefD = TensorRef<ElementOutput, LayoutOutput>;
-
-  static int const kThreadsPerRow = OutputTileIterator::ThreadMap::Detail::kAccessWidth;
-  static bool const kHasMultiStepsInRow = (OutputTileIterator::ThreadMap::Iterations::kColumn > 1);
-  static bool const kUseMasking = UseMasking_;
-
-  /// Argument structure
-  struct Arguments {
-
-    typename ElementwiseFunctor::Params   elementwise;
-    int64_t                               batch_stride_C;
-    int64_t                               batch_stride_D;
-    int64_t                               batch_stride_Max;
-    int64_t                               batch_stride_Sum;
-
-    //
-    // Methods
-    //
-    Arguments():
-      batch_stride_C(0),
-      batch_stride_D(0),
-      batch_stride_Max(0),
-      batch_stride_Sum(0)
-    {
-
-    }
-
-    Arguments(
-      typename ElementwiseFunctor::Params   elementwise_
-    ):
-      elementwise(elementwise_),
-      batch_stride_C(0),
-      batch_stride_D(0),
-      batch_stride_Max(0),
-      batch_stride_Sum(0)
-    {
-
-    }
-
-    Arguments(
-      typename ElementwiseFunctor::Params   elementwise_,
-      int64_t                               batch_stride_C_,
-      int64_t                               batch_stride_D_,
-      int64_t                               batch_stride_Max_,
-      int64_t                               batch_stride_Sum_
-    ):
-      elementwise(elementwise_),
-      batch_stride_C(batch_stride_C_),
-      batch_stride_D(batch_stride_D_),
-      batch_stride_Max(batch_stride_Max_),
-      batch_stride_Sum(batch_stride_Sum_)
-    {
-
-    }
-
-  };
-
-  struct Params {
-
-    typename ElementwiseFunctor::Params   elementwise;
-    int64_t                               batch_stride_C;
-    int64_t                               batch_stride_D;
-    int64_t                               batch_stride_Max;
-    int64_t                               batch_stride_Sum;
-    //
-    // Methods
-    //
-    CUTLASS_HOST_DEVICE
-    Params()
-    {
-
-    }
-
-    CUTLASS_HOST_DEVICE
-    Params(Arguments const &args):
-      elementwise(args.elementwise),
-      batch_stride_C(args.batch_stride_C),
-      batch_stride_D(args.batch_stride_D),
-      batch_stride_Max(args.batch_stride_Max),
-      batch_stride_Sum(args.batch_stride_Sum)
-    {
-
-    }
-  };
-
-  /// Shared storage
-  struct SharedStorage {
-
-  };
-
-private:
-
-  Params const &                        params_;
-  SharedStorage &                       shared_storage_;
-  MatrixCoord                           extent_;
-  MatrixCoord                           extent_real_;
-  ElementwiseFunctor                    elementwise_;
-
-  OutputTileIterator                    iterator_C_;
-  OutputTileIterator                    iterator_D_;
-  typename OutputTileIterator::Fragment fragment_C_;
-  typename OutputTileIterator::Fragment fragment_D_;
-
-  ElementAccumulator                    alpha_;
-  ElementAccumulator                    beta_;
-
-  ElementNorm                           *ptr_Max_;
-  ElementSum                            *ptr_Sum_;
-
-  int                                   column_offset_;
-
-  ElementSoftmaxCompute                 accum_max_;
-  ElementSoftmaxCompute                 accum_sum_;
-
-  MatrixCoord                           thread_offset_;
-
-  float                                 infinity_;
-
-public:
-
-  CUTLASS_DEVICE
-  EpilogueVisitorSoftmax(
-    Params const &params,
-    SharedStorage &shared_storage,
-    cutlass::MatrixCoord const &problem_size,
-    int thread_idx,
-    int warp_idx,
-    int lane_idx,
-    typename OutputTileIterator::Params params_C,
-    typename OutputTileIterator::Params params_D,
-    typename OutputTileIterator::Element *ptr_C,
-    typename OutputTileIterator::Element *ptr_D,
-    ElementNorm *ptr_Max = nullptr,
-    ElementSum *ptr_Sum = nullptr,
-    cutlass::MatrixCoord const &threadblock_offset = cutlass::MatrixCoord(0, 0),
-    int column_offset = 0,
-    cutlass::MatrixCoord const &problem_size_real = cutlass::MatrixCoord(0, 0),
-    float infinity = 10000.0f
-  ):
-    params_(params),
-    shared_storage_(shared_storage),
-    extent_(problem_size),
-    elementwise_(params.elementwise),
-    iterator_C_(params_C, ptr_C, problem_size, thread_idx, threadblock_offset),
-    iterator_D_(params_D, ptr_D, problem_size, thread_idx, threadblock_offset),
-    ptr_Max_(ptr_Max),
-    ptr_Sum_(ptr_Sum),
-    column_offset_(column_offset),
-    extent_real_(problem_size_real),
-    infinity_(infinity)
-  {
-    alpha_ = (params.elementwise.alpha_ptr ? *params.elementwise.alpha_ptr : params.elementwise.alpha);
-    beta_ =  (params.elementwise.beta_ptr ? *params.elementwise.beta_ptr : params.elementwise.beta);
-
-    if (beta_ == ElementAccumulator()) {
-      iterator_C_.clear_mask();
-    }
-  }
-
-  /// Helper to indicate split-K behavior
-  CUTLASS_DEVICE
-  void set_k_partition(
-    int split_k_index,                                            ///< Index of this threadblock within split-K partitioned scheme
-    int split_k_slices) {                                         ///< Total number of split-K slices
-
-  }
-
-  /// Called to set the batch index
-  CUTLASS_DEVICE
-  void set_batch_index(int batch_idx) {
-    iterator_C_.add_pointer_offset(batch_idx * params_.batch_stride_C);
-    iterator_D_.add_pointer_offset(batch_idx * params_.batch_stride_D);
-  }
-
-  /// Called at the start of the epilogue just before iterating over accumulator slices
-  CUTLASS_DEVICE
-  void begin_epilogue() {
-
-  }
-
-  /// Called at the start of one step before starting accumulator exchange
-  CUTLASS_DEVICE
-  void begin_step(int step_idx) {
-    fragment_D_.clear();
-    fragment_C_.clear();
-
-    if (elementwise_.kScale != cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling) {
-      iterator_C_.load(fragment_C_);
-      ++iterator_C_;
-    }
-    
-  }
-
-  /// Called at the start of a row
-  CUTLASS_DEVICE
-  void begin_row(int row_idx) {
-    // Clear accumulators for max and sum when starting a whole row
-    clear_accum_();
-
-  }
-
-  /// Called after accumulators have been exchanged for each accumulator vector
-  CUTLASS_DEVICE
-  void visit(
-    int iter_idx,
-    int row_idx,
-    int column_idx,
-    int frag_idx,
-    AccumulatorFragment const &accum) {
-
-    using Mul = cutlass::multiplies<SoftmaxFragment>;
-    using Minus = cutlass::minus<SoftmaxFragment>;
-    using Exp   = cutlass::fast_exp_op<SoftmaxFragment>;
-
-    Minus     minus;
-    Exp       exponential;
-
-    SoftmaxFragment result;
-
-    NumericArrayConverter<ElementSoftmaxCompute, ElementOutput, kElementsPerAccess> source_converter;
-    OutputVector &source_vector = reinterpret_cast<OutputVector *>(&fragment_C_)[frag_idx];
-
-    if (elementwise_.kScale == cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling) {
-      result = source_converter(elementwise_(accum));
-    }else{
-      result = source_converter(elementwise_(accum, source_vector));
-    }
-
-    thread_offset_ =
-      iterator_D_.thread_start() +
-      OutputTileIterator::ThreadMap::iteration_offset(frag_idx);
-
-    bool column_guard = (thread_offset_.column() < extent_.column());
-
-    if (kUseMasking) {
-      int elements_in_boundary = extent_real_.column() - thread_offset_.column();
-      elements_in_boundary = (elements_in_boundary > kElementsPerAccess) ? kElementsPerAccess : elements_in_boundary;
-      elementwise_padding_(result, elements_in_boundary);
-    }
-
-    ElementSoftmaxCompute accum_max_prev = accum_max_;
-
-    // Compute the maximum within one row
-    if (!column_idx) {
-      // This is the first fragment in a new row
-      if (column_guard) {
-        accum_max_ = maximum_accumulator_(result);
-      }
-    }
-    else {
-      // This is an additional fragment in the same row
-      if (column_guard) {
-        accum_max_ = maximum_accumulator_(result, accum_max_);
-      }
-    }
-
-    // proactively compute max in warps
-    accum_max_ = warp_reduce_max_(accum_max_);
-
-    ElementSoftmaxCompute updater = fast_exp(accum_max_prev - accum_max_);
-
-    SoftmaxFragment intermediate = exponential(minus(result, accum_max_));
-
-    if (kHasMultiStepsInRow) {
-      if (!column_idx) {
-        accum_sum_ = (column_guard) ? \
-          sum_accumulator_(intermediate) : ElementSoftmaxCompute(0);
-      } else {
-        // Algorithm in $3.1, https://arxiv.org/pdf/2205.14135v1.pdf
-        // S* = S* x updater + sum_row(P'), where updater = exp(M* - M_row)
-        accum_sum_ = (column_guard) ? \
-          sum_accumulator_(intermediate, accum_sum_ * updater) : accum_sum_ * updater;
-      }
-    } else {
-      accum_sum_ = (column_guard) ? sum_accumulator_(intermediate, accum_sum_) : ElementSoftmaxCompute(0);
-    }
-
-    // Convert to the output
-    NumericArrayConverter<ElementOutput, ElementSoftmaxCompute, kElementsPerAccess> output_converter;
-    OutputVector &output = reinterpret_cast<OutputVector *>(&fragment_D_)[frag_idx];
-    output = output_converter(result);
-  }
-
-  /// Called at the end of a row
-  CUTLASS_DEVICE
-  void end_row(int row_idx) {
-
-    using ConvertSumOutput = cutlass::NumericConverter<ElementSum, ElementSoftmaxCompute>;
-    using ConvertNormOutput = cutlass::NumericConverter<ElementNorm, ElementSoftmaxCompute>;
-
-    ConvertSumOutput   convert_sum_output;
-    ConvertNormOutput  convert_norm_output;
-
-    // Compute accumulate sum only in the last step
-    accum_sum_ = warp_reduce_sum_(accum_sum_);
-
-    bool is_first_thread_in_tile = ((threadIdx.x % kThreadsPerRow) == 0);
-    bool row_guard = thread_offset_.row() < extent_.row();
-    bool is_write_thread = row_guard && is_first_thread_in_tile;
-
-    int block_batch = blockIdx.z;
-
-    ElementNorm *curr_ptr_max = ptr_Max_ + thread_offset_.row() + column_offset_ + block_batch * params_.batch_stride_Max;
-    ElementSum *curr_ptr_sum = ptr_Sum_ + thread_offset_.row() + column_offset_ + block_batch * params_.batch_stride_Sum;
-
-    arch::global_store<ElementNorm, sizeof(ElementNorm)>(
-              convert_norm_output(accum_max_),
-              (void *)curr_ptr_max,
-              is_write_thread);
-
-    arch::global_store<ElementSum, sizeof(ElementSum)>(
-              convert_sum_output(accum_sum_),
-              (void *)curr_ptr_sum,
-              is_write_thread);
-
-    // Clear accumulators for max and sum when finishing a whole row
-    clear_accum_();
-
-  }
-
-  /// Called after all accumulator elements have been visited
-  CUTLASS_DEVICE
-  void end_step(int step_idx) {
-
-    iterator_D_.store(fragment_D_);
-    ++iterator_D_;
-  }
-
-  /// Called after all steps have been completed
-  CUTLASS_DEVICE
-  void end_epilogue() {
-
-  }
-
-private:
-
-  CUTLASS_DEVICE
-  void elementwise_padding_(SoftmaxFragment &result, int elements_in_boundary) {
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < SoftmaxFragment::kElements; ++i) {
-      result[i] = (i < elements_in_boundary) ? result[i] : ElementSoftmaxCompute(-infinity_);
-    }
-  }
-
-  CUTLASS_DEVICE
-  ElementSoftmaxCompute warp_reduce_sum_(ElementSoftmaxCompute sum_) {
-    int half_thread_in_row = (kThreadsPerRow >> 1);
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = half_thread_in_row; i > 0; i >>= 1) {
-      ElementSoftmaxCompute tmp = __shfl_xor_sync(0xFFFFFFFF, sum_, i);
-      sum_ += tmp;
-    }
-    return sum_;
-  }
-
-  CUTLASS_DEVICE
-  ElementSoftmaxCompute warp_reduce_max_(ElementSoftmaxCompute max_) {
-    int half_thread_in_row = (kThreadsPerRow >> 1);
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = half_thread_in_row; i > 0; i >>= 1) {
-      ElementSoftmaxCompute tmp = __shfl_xor_sync(0xFFFFFFFF, max_, i);
-      max_ = fast_max(max_, tmp);
-    }
-    return max_;
-  }
-
-  CUTLASS_DEVICE
-  void clear_accum_() {
-
-    uint32_t float_max_bits = 0xff7fffff;   // -FLT_MAX
-    float min_float = reinterpret_cast<float const &>(float_max_bits);
-    accum_max_ = ElementSoftmaxCompute(min_float);
-    accum_sum_ = ElementSoftmaxCompute(0);
-  }
-
-  CUTLASS_DEVICE
-  ElementSoftmaxCompute sum_accumulator_(SoftmaxFragment const &accum) {
-    ElementSoftmaxCompute sum_ = ElementSoftmaxCompute(0);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < SoftmaxFragment::kElements; ++i) {
-      sum_ += ElementSoftmaxCompute(accum[i]);
-    }
-
-    return sum_;
-  }
-
-  CUTLASS_DEVICE
-  ElementSoftmaxCompute sum_accumulator_(SoftmaxFragment const &accum, ElementSoftmaxCompute sum_) {
-    // ElementSoftmaxCompute sum_ = ElementSoftmaxCompute(0);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < SoftmaxFragment::kElements; ++i) {
-      sum_ += ElementSoftmaxCompute(accum[i]);
-    }
-
-    return sum_;
-  }
-
-  CUTLASS_DEVICE
-  ElementSoftmaxCompute maximum_accumulator_(SoftmaxFragment const &accum) {
-    ElementSoftmaxCompute max_ = accum[0];
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 1; i < SoftmaxFragment::kElements; ++i) {
-      max_ = fast_max(max_, ElementSoftmaxCompute(accum[i]));
-    }
-
-    return max_;
-  }
-
-  CUTLASS_DEVICE
-  ElementSoftmaxCompute maximum_accumulator_(SoftmaxFragment const &accum, ElementSoftmaxCompute max_) {
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < SoftmaxFragment::kElements; ++i) {
-      max_ = fast_max(max_, ElementSoftmaxCompute(accum[i]));
-    }
-
-    return max_;
-  }
-};
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_absmax.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_absmax.h
deleted file mode 100755
index 9bae7a742..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_absmax.h
+++ /dev/null
@@ -1,923 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-
-  \brief Threadblock-level epilogue computing:
-    Aux = ((alpha * scale_a * scale_b) * accumulator) + ((beta * scale_c) * source) + bias
-    D = activation(Aux)
-
-    if Aux is fp8 type:
-        abs_max_output = max( abs(aux) | (for every aux in Aux))
-        Aux = scale_aux * Aux
-    endif
-
-    if D is fp8 type:
-        abs_max_output = max( abs(d) | (for every d in D))
-        D = scale_d * D
-    endif
-
-    Parameter Aux is optionally stored to global memory
-*/
-
-#pragma once
-
-#if defined(__CUDACC_RTC__)
-#include <cuda/std/cassert>
-#include <cuda/std/utility>
-#else
-#include <assert.h>
-#include <utility>
-#endif
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/tensor_coord.h"
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/functional.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/layout/vector.h"
-#include "cutlass/layout/tensor.h"
-
-#include "cutlass/gemm/gemm.h"
-
-#include "cutlass/transform/pitch_linear_thread_map.h"
-#include "cutlass/transform/threadblock/regular_tile_iterator.h"
-
-#include "cutlass/epilogue/threadblock/epilogue_base.h"
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
-
-#include "cutlass/numeric_types.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace detail {
-
-/// Helper class for keeping track of absolute maximums and performing scaling
-template <
-  typename Iterator,        // Iterator type used for storing the data for which absolute maximum and scaling
-                            // will be computed. This type is used for predicating absolute maximum calculations.
-  typename Fragment,        // Type of input to be computed on
-  bool ScalingAndAmaxNeeded // Whether to perform absolute maximum and scaling operations
->
-struct ScalingAndAmaxHelper;
-
-/// Partial specialization that does not perform scaling or calculate an absolute maximum
-template <typename Iterator, typename Fragment>
-struct ScalingAndAmaxHelper<Iterator, Fragment, false> {
-  using Element = typename Fragment::Element;
-
-  CUTLASS_HOST_DEVICE
-  ScalingAndAmaxHelper(Element scale) { }
-
-  CUTLASS_DEVICE
-  Fragment operator()(const Iterator& iterator, const Fragment& inp) {
-    return inp;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Element get_abs_max() const {
-    return Element(0.);
-  }
-
-  CUTLASS_HOST_DEVICE
-  void set_scaling_factor(Element scale_) { }
-};
-
-/// Partial specialization that keeps track of an absolute maximum value of inputs seen
-/// and scales inputs
-template <typename Iterator, typename Fragment>
-struct ScalingAndAmaxHelper<Iterator, Fragment, true> {
-  using Element = typename Fragment::Element;
-  using AccessType = typename Iterator::AccessType;
-  using ThreadMap = typename Iterator::ThreadMap;
-
-  Element abs_max;
-  Element scale;
-
-  // Operators
-  maximum_with_nan_propogation<Element> max_op;
-  absolute_value_op<Element> abs_op;
-  multiplies<Fragment> multiply;
-
-  CUTLASS_HOST_DEVICE
-  ScalingAndAmaxHelper(Element scale_) : abs_max(0.), scale(scale_) { }
-
-  // Compute the absolute maximum value between `abs_max` and the entries
-  // of `frag` for predicated-on entries of `iterator`. Return a scaled
-  // version of `inp`.
-  CUTLASS_DEVICE
-  Fragment operator()(const Iterator& iterator, const Fragment& frag) {
-    using PredicateGroup = Array<Element, Iterator::ThreadMap::kElementsPerAccess>;
-    PredicateGroup const *frag_ptr = reinterpret_cast<PredicateGroup const *>(&frag);
-
-    typename Iterator::Mask mask;
-    iterator.get_mask(mask);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
-          int frag_row_idx =
-            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
-
-          int row_offset = row * ThreadMap::Delta::kRow
-            + group * ThreadMap::Delta::kGroup
-            + cluster * ThreadMap::Delta::kCluster;
-
-          bool row_guard = ((row_offset + iterator.thread_start_row()) < iterator.extent_row());
-
-          CUTLASS_PRAGMA_UNROLL
-          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
-            bool guard = row_guard && mask.predicates[column];
-
-            if (guard) {
-              int access_idx = frag_row_idx * ThreadMap::Iterations::kColumn + column;
-              CUTLASS_PRAGMA_UNROLL
-              for (int i = 0; i < PredicateGroup::kElements; ++i) {
-                abs_max = max_op(abs_max, abs_op(frag_ptr[access_idx][i]));
-              }
-            }
-          }
-        }
-      }
-    }
-
-    // Perform scaling
-    return multiply(scale, frag);
-  }
-
-  CUTLASS_HOST_DEVICE
-  Element get_abs_max() const {
-    return abs_max;
-  }
-
-  CUTLASS_HOST_DEVICE
-  void set_scaling_factor(Element scale_) {
-    scale = scale_;
-  }
-};
-
-} // namespace detail
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Shape_,                          ///< Shape of threadblock tile (concept: GemmShape)
-  typename WarpMmaOperator_,                ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp)
-  int PartitionsK,                          ///< Number of partitions of the K dimension
-  typename OutputTileIterator_,             ///< Tile iterator reading and writing output tensors
-  typename AuxOutputTileIterator_,          ///< Tile iterator writing auxiliary output tensors
-  typename ElementVector_,                  ///< Data type of bias vector
-  typename AccumulatorFragmentIterator_,    ///< Fragment iterator selecting accumulators
-  typename WarpTileIterator_,               ///< Warp-scoped tile iterator writing accumulators to SMEM
-  typename SharedLoadIterator_,             ///< Threadblock-scoped tile iterator loading from SMEM
-  typename OutputOp_,                       ///< Output operator
-  typename Padding_,                        ///< Padding added to SMEM allocation to avoid bank conflicts (concept: MatrixShape)
-  int FragmentsPerPartition = 1,            ///< Used to coarsen the epilogue granularity
-  int IterationsUnroll =                    ///< Used to reduce binary size when epilogue op is large
-    (!IsEpilogueFunctorHeavy<OutputOp_>::value)
->
-class EpilogueWithAbsMax :
-  public EpilogueBase<
-    Shape_,
-    typename WarpMmaOperator_::Shape,
-    PartitionsK,
-    AccumulatorFragmentIterator_,
-    WarpTileIterator_,
-    Padding_,
-    FragmentsPerPartition> {
-
-public:
-
-  using Base = EpilogueBase<
-    Shape_,
-    typename WarpMmaOperator_::Shape,
-    PartitionsK,
-    AccumulatorFragmentIterator_,
-    WarpTileIterator_,
-    Padding_,
-    FragmentsPerPartition>;
-
-  static bool const kIsSingleSource = true;
-  using Shape = Shape_;
-  using WarpMmaOperator = WarpMmaOperator_;
-  static int const kPartitionsK = PartitionsK;
-  using OutputTileIterator = OutputTileIterator_;
-  using AuxOutputTileIterator = AuxOutputTileIterator_;
-  using ElementVector = ElementVector_;
-  using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
-  using WarpTileIterator = WarpTileIterator_;
-  using SharedLoadIterator = SharedLoadIterator_;
-  using OutputOp = OutputOp_;
-  using Padding = Padding_;
-
-  using Layout = layout::RowMajor;
-  using LongIndex = typename Layout::LongIndex;
-
-  /// The complete warp-level accumulator tile
-  using AccumulatorTile = typename Base::AccumulatorTile;
-
-  /// Accumulator element
-  using ElementAccumulator = typename WarpTileIterator::Element;
-
-  /// Data type used for absolute maximum value
-  using ElementAbsmax = typename OutputOp::ElementAbsmax;
-
-  /// Compute data type produced by the output op
-  using ElementCompute = typename OutputOp::ElementCompute;
-
-  /// Compute fragment
-  using FragmentCompute = Array<ElementCompute, OutputTileIterator::Fragment::kElements>;
-
-  /// Helpers for (optionally) computing absolute maximums and scaling output and auxiliary output
-  using OutputScaler = detail::ScalingAndAmaxHelper<OutputTileIterator,
-                                                    FragmentCompute,
-                                                    OutputOp::kIsScalingAndAmaxOutputNeeded>;
-
-  using AuxOutputScaler = detail::ScalingAndAmaxHelper<AuxOutputTileIterator,
-                                                       FragmentCompute,
-                                                       OutputOp::kIsScalingAndAmaxAuxOutputNeeded>;
-
-  /// Thread map used by output tile iterators
-  using ThreadMap = typename OutputTileIterator::ThreadMap;
-
-  /// Fragment object used to store the broadcast values
-  using BroadcastFragment = Array<
-    ElementCompute,
-    ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess>;
-
-  /// Output element
-  using ElementOutput = typename OutputTileIterator::Element;
-
-  /// Data type of auxiliary output
-  using ElementAuxOutput = typename AuxOutputTileIterator::Element;
-
-  /// Output access size
-  static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
-
-  /// Tensor reference to destination tensor
-  using TensorRef = typename OutputTileIterator::TensorRef;
-
-  /// Tensor reference to sync tensor
-  using SyncTensorRef = typename cutlass::TensorRef<int, cutlass::layout::PackedVectorLayout>;
-
-  /// Const tensor reference to source tensor
-  using ConstTensorRef = typename OutputTileIterator::ConstTensorRef;
-
-  /// Array type used to output
-  using OutputAccessType = Array<
-    typename OutputTileIterator::Element, OutputTileIterator::kElementsPerAccess>;
-
-  /// Array type used by output functor
-  using AccumulatorAccessType = Array<typename WarpTileIterator::Element, OutputTileIterator::kElementsPerAccess>;
-
-  /// Array type used by output functor
-  using ComputeAccessType = Array<ElementCompute, OutputTileIterator::kElementsPerAccess>;
-
-  /// Auxiliary output access type
-  using AuxAccessType = Array<ElementAuxOutput, OutputTileIterator::kElementsPerAccess>;
-
-  /// Number of warps
-  using WarpCount = typename Base::WarpCount;
-
-  /// Shared memory allocation from epilogue base class
-  using BaseSharedStorage = typename Base::SharedStorage;
-
-  static int constexpr kSmemTiles = Base::kFragmentsPerIteration > 1 ? Base::kFragmentsPerIteration : kPartitionsK;
-  static int constexpr kSmemPointerOffset = Base::SharedStorage::StorageShape::kCount / kSmemTiles;
-
-  /// Used for the broadcast
-  struct BroadcastDetail {
-
-    /// Number of threads per warp
-    static int const kWarpSize = 32;
-
-    static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
-
-    /// Number of distinct scalar column indices handled by each thread
-    static int const kColumnsPerThread = ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess;
-
-    /// Number of distinct scalar row indices handled by each thread
-    static int const kRowsPerThread = ThreadMap::Iterations::kCount / ThreadMap::Iterations::kColumn;
-
-    /// Number of threads per threadblock
-    static int const kThreadCount = kWarpSize * WarpCount::kCount;
-
-    /// Number of distinct threads per row of output tile
-    static int const kThreadsPerRow = (Shape::kN / kColumnsPerThread);
-
-    /// Number of distinct threads which must be reduced during the final reduction phase within the threadblock.
-    static int const kThreadRows = kThreadCount / kThreadsPerRow;
-
-    /// I'm not sure what I meant here.
-    static int const kThreadAccessesPerRow = const_max(1, (Shape::kN + kThreadCount - 1) / kThreadCount);
-
-    /// Shape of the shared memory allocation for the epilogue
-    using StorageShape = MatrixShape<
-      kThreadRows,
-      Shape::kN
-    >;
-
-    /// Debug printing
-    CUTLASS_DEVICE
-    static void print() {
-#if 0
-      printf("BroadcastDetail {\n");
-      printf(
-        "  kColumnsPerThread: %d\nkRowsPerThread: %d\n,kThreadCount: %d\nkThreadsPerRow: %d\n"
-        "kThreadRows: %d\nThreadAccessesPerRow: %d\nStorageShape: %d x %d (count: %d)\n",
-        kColumnsPerThread,
-        kRowsPerThread,
-        kThreadCount,
-        kThreadsPerRow,
-        kThreadRows,
-        kThreadAccessesPerRow,
-        StorageShape::kRow,
-        StorageShape::kColumn,
-        StorageShape::kCount
-      );
-      printf("};\n");
-#endif
-    }
-  };
-
-  /// Shared storage structure (shadows base) with additional SMEM buffer for reduction
-  struct SharedStorage {
-    union {
-      BaseSharedStorage base;
-    };
-
-    CUTLASS_HOST_DEVICE
-    SharedStorage() { }
-  };
-
-public:
-
-
-  static_assert(SharedLoadIterator::Fragment::kElements == OutputTileIterator::Fragment::kElements,
-    "Mismatch between shared load iterator and output tile iterator.");
-
-  static_assert(OutputTileIterator::kElementsPerAccess, "OutputTileIterator::kElementsPerAccess must not be zero.");
-
-  static_assert(!(OutputTileIterator::Fragment::kElements % OutputTileIterator::kElementsPerAccess),
-    "Divisibility");
-
-private:
-
-  /// Loads fragment from shared memory aligned with output tensor
-  SharedLoadIterator shared_load_iterator_;
-
-  /// Thread index within the threadblock
-  int thread_idx_;
-
-public:
-
-  /// Constructor
-  CUTLASS_DEVICE
-  EpilogueWithAbsMax(
-    SharedStorage &shared_storage,                    ///< Shared storage object
-    int thread_idx,                                   ///< ID of a thread within the threadblock
-    int warp_idx,                                     ///< ID of warp within threadblock
-    int lane_idx                                      ///< Id of thread within warp
-  ):
-    Base(shared_storage.base, thread_idx, warp_idx, lane_idx),
-    shared_load_iterator_(shared_storage.base.reference(), thread_idx),
-    thread_idx_(thread_idx)
-  {
-
-  }
-
-  /// Streams the result to global memory
-  CUTLASS_DEVICE
-  void operator()(
-    OutputOp &output_op,                              ///< Output operator
-    ElementVector const * broadcast_ptr,              ///< Broadcast vector
-    OutputTileIterator destination_iterator,          ///< Tile iterator for destination
-    AccumulatorTile const &accumulators,              ///< Complete warp-level accumulator tile
-    OutputTileIterator source_iterator,               ///< Tile iterator for source accumulator matrix
-    AuxOutputTileIterator aux_iterator,               ///< Tile iterator for destination auxiliary output
-    MatrixCoord const &problem_size =                 ///< Problem size needed to guard against out-of-bounds accesses
-        MatrixCoord(Shape::kM, Shape::kN),
-    MatrixCoord const &threadblock_offset =           ///< Threadblock's initial offset within the problem size space
-        MatrixCoord()) {
-
-    BroadcastFragment broadcast_fragment;
-
-    load_broadcast_fragment_(broadcast_fragment, broadcast_ptr, problem_size, threadblock_offset);
-
-    OutputScaler output_scaler(output_op.get_scale_d());
-
-    AuxOutputScaler aux_scaler(output_op.get_scale_aux());
-
-    if (!output_op.is_source_needed()) {
-      compute_source_not_needed_(
-        output_op,
-        broadcast_fragment,
-        destination_iterator,
-        accumulators,
-        aux_iterator,
-        output_scaler,
-        aux_scaler);
-    }
-    else {
-      compute_source_needed_(
-        output_op,
-        broadcast_fragment,
-        destination_iterator,
-        accumulators,
-        source_iterator,
-        aux_iterator,
-        output_scaler,
-        aux_scaler);
-    }
-
-    // Store the absolute maximum values of the output and auxiliar tensors, if needed.
-    if (output_op.get_ptr_output_abs_max() != nullptr) {
-      ElementAbsmax local_abs_max =
-          NumericConverter<ElementAbsmax, ElementCompute, OutputOp::kRound>{}(output_scaler.get_abs_max());
-      atomic_maximum<ElementAbsmax>{}(
-        output_op.get_ptr_output_abs_max(), local_abs_max);
-    }
-
-    if (output_op.get_ptr_aux_output_abs_max() != nullptr) {
-      ElementAbsmax local_abs_max =
-          NumericConverter<ElementAbsmax, ElementCompute, OutputOp::kRound>{}(aux_scaler.get_abs_max());
-      atomic_maximum<ElementAbsmax>{}(
-        output_op.get_ptr_aux_output_abs_max(), local_abs_max);
-    }
-  }
-
-private:
-
-  CUTLASS_DEVICE
-  void load_broadcast_fragment_(
-    BroadcastFragment & broadcast_fragment,      ///< Fragment containing the accumulated partial reduction over columns
-    ElementVector const * broadcast_ptr,         ///< Broadcast vector
-    MatrixCoord const &problem_size,             ///< Problem size needed to guard against out-of-bounds accesses
-    MatrixCoord const &threadblock_offset        ///< Threadblock's initial offset within the problem size space
-    ) {
-
-    broadcast_fragment.clear();
-
-    // If no pointer is supplied, set with all zeros and avoid memory accesses
-    if (!broadcast_ptr) {
-      return;
-    }
-
-    int thread_initial_column = ThreadMap::initial_offset(thread_idx_).column();
-
-    int thread_column_idx = threadblock_offset.column() + thread_initial_column;
-    broadcast_ptr += thread_initial_column;
-
-    NumericArrayConverter<ElementCompute, ElementVector, BroadcastDetail::kElementsPerAccess> converter;
-    using AccessType = AlignedArray<ElementVector, BroadcastDetail::kElementsPerAccess>;
-    using ComputeFragmentType = Array<ElementCompute, BroadcastDetail::kElementsPerAccess>;
-
-    ComputeFragmentType *frag_ptr = reinterpret_cast<ComputeFragmentType *>(&broadcast_fragment);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < ThreadMap::Iterations::kColumn; ++j) {
-
-      AccessType loaded;
-
-      loaded.clear();
-
-      if (thread_column_idx < problem_size.column()) {
-        loaded = *reinterpret_cast<AccessType const *>(broadcast_ptr);
-      }
-
-      ComputeFragmentType cvt = converter(loaded);
-      frag_ptr[j] = cvt;
-
-      thread_column_idx += ThreadMap::Delta::kColumn;
-      broadcast_ptr += ThreadMap::Delta::kColumn;
-    }
-  }
-
-  template <class Seq>
-  struct acc2smem_source_not_needed;
-
-  template <size_t... Seq>
-  struct acc2smem_source_not_needed<cutlass::index_sequence<Seq...>> {
-    template <int Advance>
-    CUTLASS_DEVICE static void helper(AccumulatorFragmentIterator accum_fragment_iterator,
-                                      WarpTileIterator &warp_tile_iterator) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < Advance; i++) {
-        ++accum_fragment_iterator;
-      }
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int p = 0; p < Base::kFragmentsPerIteration; ++p) {
-        typename AccumulatorFragmentIterator::Fragment accum_fragment;
-
-        accum_fragment_iterator.load(accum_fragment);
-        ++accum_fragment_iterator;
-
-        warp_tile_iterator.store(accum_fragment);
-        if (p < Base::kFragmentsPerIteration - 1) {
-          warp_tile_iterator.add_pointer_offset(kSmemPointerOffset);
-        }
-      }
-
-      if (Base::kFragmentsPerIteration > 1) {
-        warp_tile_iterator.add_pointer_offset(kSmemPointerOffset *
-                                              (1 - Base::kFragmentsPerIteration));
-      }
-    }
-
-    CUTLASS_DEVICE
-    static void push(size_t pos,
-                     AccumulatorFragmentIterator const &iterator_begin,
-                     WarpTileIterator &warp_tile_iterator) {
-      int dummy[] = {
-          (pos == (Seq * Base::kFragmentsPerIteration)) &&
-          (helper<Seq * Base::kFragmentsPerIteration>(iterator_begin, warp_tile_iterator), 0)...};
-
-      CUTLASS_UNUSED(dummy[0]);
-    }
-  };
-
-  /// Streams the result to global memory
-  CUTLASS_DEVICE
-  void compute_source_not_needed_(
-    OutputOp &output_op,                              ///< Output operator
-    BroadcastFragment const &broadcast_fragment,      ///< Fragment containing the accumulated partial reduction over columns
-    OutputTileIterator destination_iterator,          ///< Tile iterator for destination
-    AccumulatorTile const &accumulators,              ///< Complete warp-level accumulator tile
-    AuxOutputTileIterator aux_iterator,               ///< Tile iterator for destination auxiliary output
-    OutputScaler& output_scaler,                      ///< Helper for (optionally) computing the absolute maximum and scaling output
-    AuxOutputScaler& aux_scaler                       ///< Helper for (optionally) computing the absolute maximum and scaling the auxiliary output
-    ) {
-
-    //
-    // Iterator over warp-level accumulator fragment
-    //
-
-    AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
-
-    //
-    // Iterate over accumulator tile
-    //
-
-    // CUTLASS_PRAGMA_UNROLL
-    #pragma unroll(IterationsUnroll ? OutputTileIterator::kIterations / Base::kFragmentsPerIteration : 1)
-    for (int iter = 0; iter < OutputTileIterator::kIterations; iter += Base::kFragmentsPerIteration) {
-
-      //
-      // Convert and store fragment
-      //
-
-
-      __syncthreads();
-
-      acc2smem_source_not_needed<
-          cutlass::make_index_sequence<OutputTileIterator::kIterations /
-                                   Base::kFragmentsPerIteration>>::push(iter,
-                                                                        accum_fragment_iterator,
-                                                                        this->warp_tile_iterator_);
-
-      __syncthreads();
-
-      //
-      // Load fragments from shared memory
-      //
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int p = 0; p < Base::kFragmentsPerIteration; ++p) {
-
-
-        typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];
-
-        shared_load_iterator_.load(aligned_accum_fragment[0]);
-
-        if (p < Base::kFragmentsPerIteration - 1) {
-          shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
-        }
-        else if (kPartitionsK > 1) {
-
-          plus <typename SharedLoadIterator::Fragment> add_fragments;
-
-          CUTLASS_PRAGMA_UNROLL
-          for ( int i = 1; i < kPartitionsK; ++i) {
-            shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
-            shared_load_iterator_.load(aligned_accum_fragment[i]);
-            aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
-          }
-
-          shared_load_iterator_.add_pointer_offset((1 - kPartitionsK) * kSmemPointerOffset);
-        }
-
-        //
-        // Apply output operation
-        //
-
-        FragmentCompute frag_Z_compute;
-        FragmentCompute frag_Aux_compute;
-
-        apply_output_operator_source_not_needed_(
-          frag_Z_compute,
-          frag_Aux_compute,
-          output_op,
-          aligned_accum_fragment[0],
-          broadcast_fragment);
-
-        //
-        // Conditionally store fragments
-        //
-
-        // (Optionally) compute the absolute maximum of frag_Z and scale frag_Z
-        frag_Z_compute = output_scaler(destination_iterator, frag_Z_compute);
-        NumericArrayConverter<typename OutputTileIterator::Fragment::Element, ElementCompute,
-                              OutputTileIterator::Fragment::kElements> cvt_to_dst;
-        typename OutputTileIterator::Fragment frag_Z = cvt_to_dst(frag_Z_compute);
-
-        // Always store the output
-        destination_iterator.store(frag_Z);
-        ++destination_iterator;
-
-        // Only store the auxiliary output if scaling and absolute-maximum calculation were needed
-        if (OutputOp::kIsScalingAndAmaxAuxOutputNeeded) {
-          frag_Aux_compute = aux_scaler(aux_iterator, frag_Aux_compute);
-
-          NumericArrayConverter<typename AuxOutputTileIterator::Fragment::Element, ElementCompute,
-                                AuxOutputTileIterator::Fragment::kElements> cvt_to_aux;
-          typename AuxOutputTileIterator::Fragment frag_Aux = cvt_to_aux(frag_Aux_compute);
-          aux_iterator.store(frag_Aux);
-          ++aux_iterator;
-        }
-      }
-
-      if (Base::kFragmentsPerIteration > 1) {
-        shared_load_iterator_.add_pointer_offset(kSmemPointerOffset * (1 - Base::kFragmentsPerIteration));
-      }
-    }
-  }
-
-
-  template<class Seq>
-  struct acc2smem_source_needed;
-
-  template <size_t... Seq>
-  struct acc2smem_source_needed<cutlass::index_sequence<Seq...>> {
-    template<int Advance>
-    CUTLASS_DEVICE
-    static void helper(AccumulatorFragmentIterator accum_fragment_iterator,
-                       WarpTileIterator &warp_tile_iterator) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < Advance; i++) {
-        ++accum_fragment_iterator;
-      }
-
-      typename AccumulatorFragmentIterator::Fragment accum_fragment;
-      accum_fragment_iterator.load(accum_fragment);
-      warp_tile_iterator.store(accum_fragment);
-    }
-
-    CUTLASS_DEVICE
-    static void push(size_t pos,
-                     AccumulatorFragmentIterator const &iterator_begin,
-                     WarpTileIterator &warp_tile_iterator) {
-      int dummy[] = {(pos == Seq) && (helper<Seq>(iterator_begin, warp_tile_iterator), 0)...};
-    }
-  };
-
-
-  /// Streams the result to global memory
-  CUTLASS_DEVICE
-  void compute_source_needed_(
-    OutputOp &output_op,                          ///< Output operator
-    BroadcastFragment const &broadcast_fragment,  ///< Fragment containing the accumulated partial reduction over columns
-    OutputTileIterator destination_iterator,      ///< Tile iterator for destination
-    AccumulatorTile const &accumulators,          ///< Complete warp-level accumulator tile
-    OutputTileIterator source_iterator,           ///< Tile iterator for source accumulator matrix
-    AuxOutputTileIterator aux_iterator,               ///< Tile iterator for destination auxiliary output
-    OutputScaler& output_scaler,                      ///< Helper for (optionally) computing the absolute maximum and scaling output
-    AuxOutputScaler& aux_scaler                       ///< Helper for (optionally) computing the absolute maximum and scaling the auxiliary output
-    ) {
-
-    typename OutputTileIterator::Fragment source_fragment;
-    source_fragment.clear();
-
-    //
-    // Iterator over warp-level accumulator fragment
-    //
-
-    AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
-
-    //
-    // Iterate over accumulator tile
-    //
-
-    #pragma unroll(IterationsUnroll ? OutputTileIterator::kIterations : 1)
-    for (int iter = 0; iter < OutputTileIterator::kIterations; ++iter) {
-
-      //
-      // Load the source
-      //
-
-      source_iterator.load(source_fragment);
-      ++source_iterator;
-
-      //
-      // Convert and store fragment
-      //
-
-      __syncthreads();
-
-      acc2smem_source_needed<cutlass::make_index_sequence<OutputTileIterator::kIterations>>::push(
-          iter, accum_fragment_iterator, this->warp_tile_iterator_);
-
-      __syncthreads();
-
-      //
-      // Load fragments from shared memory
-      //
-
-      typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];
-
-      shared_load_iterator_.load(aligned_accum_fragment[0]);
-
-      // If the number of k-slices is > 1 - perform a reduction amongst the k-slices
-      if (kPartitionsK > 1)
-      {
-        plus <typename SharedLoadIterator::Fragment> add_fragments;
-        const int tile_row_offset = Base::SharedStorage::StorageShape::kRow / PartitionsK;
-
-        CUTLASS_PRAGMA_UNROLL
-        for ( int i = 1; i < kPartitionsK; ++i) {
-          shared_load_iterator_.add_tile_offset({tile_row_offset , 0});
-          shared_load_iterator_.load(aligned_accum_fragment[i]);
-          aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
-        }
-
-        shared_load_iterator_.add_tile_offset({-1 * (kPartitionsK-1) * tile_row_offset, 0});
-      }
-
-      //
-      // Apply output operation
-      //
-
-      FragmentCompute frag_Z_compute;
-      FragmentCompute frag_Aux_compute;
-
-      apply_output_operator_(
-        frag_Z_compute,
-        frag_Aux_compute,
-        output_op,
-        aligned_accum_fragment[0],
-        source_fragment,
-        broadcast_fragment);
-
-      //
-      // Conditionally store fragments
-      //
-
-      // (Optionally) compute the absolute maximum of frag_Z and scale frag_Z
-      frag_Z_compute = output_scaler(destination_iterator, frag_Z_compute);
-      NumericArrayConverter<typename OutputTileIterator::Fragment::Element, ElementCompute,
-                            OutputTileIterator::Fragment::kElements> cvt_to_dst;
-      typename OutputTileIterator::Fragment frag_Z = cvt_to_dst(frag_Z_compute);
-
-      // Always store the output
-      destination_iterator.store(frag_Z);
-      ++destination_iterator;
-
-      // Only store the auxiliary output if scaling and absolute-maximum calculation were needed
-      if (OutputOp::kIsScalingAndAmaxAuxOutputNeeded) {
-        frag_Aux_compute = aux_scaler(aux_iterator, frag_Aux_compute);
-
-        NumericArrayConverter<typename AuxOutputTileIterator::Fragment::Element, ElementCompute,
-                              AuxOutputTileIterator::Fragment::kElements> cvt_to_aux;
-        typename AuxOutputTileIterator::Fragment frag_Aux = cvt_to_aux(frag_Aux_compute);
-        aux_iterator.store(frag_Aux);
-        ++aux_iterator;
-      }
-    }
-  }
-
-  /// Helper to invoke the output functor over each vector of output
-  CUTLASS_DEVICE
-  void apply_output_operator_(
-    FragmentCompute &frag_Z,
-    FragmentCompute &frag_Aux,
-    OutputOp &output_op,
-    typename SharedLoadIterator::Fragment const &frag_AB,
-    typename OutputTileIterator::Fragment const &frag_C,
-    BroadcastFragment const &frag_Broadcast) {
-
-    using AccessTypeZ = Array<ElementCompute, kElementsPerAccess>;
-    using AccessTypeAux = Array<ElementCompute, kElementsPerAccess>;
-    using AccessTypeBroadcast = Array<ElementCompute, kElementsPerAccess>;
-
-    AccessTypeZ *frag_Z_ptr = reinterpret_cast<AccessTypeZ *>(&frag_Z);
-    AccessTypeAux *frag_Aux_ptr = reinterpret_cast<AccessTypeAux *>(&frag_Aux);
-
-    AccumulatorAccessType const *frag_AB_ptr =
-      reinterpret_cast<AccumulatorAccessType const *>(&frag_AB);
-
-    OutputAccessType const *frag_C_ptr =
-      reinterpret_cast<OutputAccessType const *>(&frag_C);
-
-    AccessTypeBroadcast const *frag_Broadcast_ptr =
-      reinterpret_cast<AccessTypeBroadcast const *>(&frag_Broadcast);
-
-    int const kOutputOpIterations =
-      OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kOutputOpIterations; ++i) {
-        output_op(
-          frag_Z_ptr[i],
-          frag_Aux_ptr[i],
-          frag_AB_ptr[i],
-          frag_Broadcast_ptr[i % ThreadMap::Iterations::kColumn],
-          frag_C_ptr[i]);
-    }
-  }
-
-  /// Helper to invoke the output functor over each vector of output
-  CUTLASS_DEVICE
-  void apply_output_operator_source_not_needed_(
-    FragmentCompute &frag_Z,
-    FragmentCompute &frag_Aux,
-    OutputOp &output_op,
-    typename SharedLoadIterator::Fragment const &frag_AB,
-    BroadcastFragment const &frag_Broadcast) {
-
-    using AccessTypeZ = Array<ElementCompute, kElementsPerAccess>;
-    using AccessTypeAux = Array<ElementCompute, kElementsPerAccess>;
-    using AccessTypeBroadcast = Array<ElementCompute, kElementsPerAccess>;
-
-    AccessTypeZ *frag_Z_ptr = reinterpret_cast<AccessTypeZ *>(&frag_Z);
-    AccessTypeAux *frag_Aux_ptr = reinterpret_cast<AccessTypeAux *>(&frag_Aux);
-
-    AccumulatorAccessType const *frag_AB_ptr =
-      reinterpret_cast<AccumulatorAccessType const *>(&frag_AB);
-
-    AccessTypeBroadcast const *frag_Broadcast_ptr =
-      reinterpret_cast<AccessTypeBroadcast const *>(&frag_Broadcast);
-
-    int const kOutputOpIterations =
-      OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kOutputOpIterations; ++i) {
-
-      output_op(
-        frag_Z_ptr[i],
-        frag_Aux_ptr[i],
-        frag_AB_ptr[i],
-        frag_Broadcast_ptr[i % ThreadMap::Iterations::kColumn]);
-    }
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_broadcast.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_broadcast.h
deleted file mode 100755
index 7e6d2a698..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_broadcast.h
+++ /dev/null
@@ -1,1718 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-
-  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
-
-  The epilogue rearranges the result of a matrix product through shared memory to match canonical
-  tensor layouts in global memory. Epilogues support conversion and reduction operations.
-
-*/
-
-#pragma once
-
-#if defined(__CUDACC_RTC__)
-#include <cuda/std/cassert>
-#include <cuda/std/utility>
-#else
-#include <assert.h>
-#include <utility>
-#endif
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/tensor_coord.h"
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/functional.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/layout/vector.h"
-#include "cutlass/layout/tensor.h"
-
-#include "cutlass/gemm/gemm.h"
-
-#include "cutlass/transform/pitch_linear_thread_map.h"
-#include "cutlass/transform/threadblock/regular_tile_iterator.h"
-
-#include "cutlass/epilogue/threadblock/epilogue_base.h"
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
-
-#include "cutlass/numeric_types.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// This base class is meant to define the concept required of the
-/// EpilogueWithBroadcast::OutputOp
-template <
-  typename ElementC_,
-  typename ElementAccumulator_,
-  typename ElementCompute_,
-  typename ElementZ_,
-  typename ElementT_,
-  int ElementsPerAccess,
-  bool StoreZ = true,
-  bool StoreT = true
->
-struct EpilogueWithBroadcastOpBase {
-  
-  using ElementOutput = ElementC_;
-  using ElementAccumulator = ElementAccumulator_;
-  using ElementCompute = ElementCompute_;
-  using ElementZ = ElementZ_;
-  using ElementT = ElementT_;
-  static int const kElementsPerAccess = ElementsPerAccess;
-
-  using FragmentAccumulator = Array<ElementAccumulator, kElementsPerAccess>;
-  using FragmentCompute = Array<ElementCompute, kElementsPerAccess>;
-  using FragmentC = Array<ElementOutput, kElementsPerAccess>;
-  using FragmentZ = Array<ElementZ, kElementsPerAccess>;
-  using FragmentT = Array<ElementT, kElementsPerAccess>;
-
-  /// If true, the 'Z' tensor is stored
-  static bool const kStoreZ = StoreZ;
-
-  /// If true, the 'T' tensor is stored
-  static bool const kStoreT = StoreT;
-
-  /// Parameters structure - required
-  struct Params { };
-
-  //
-  // Methods
-  //
-
-  /// Constructor from Params
-  EpilogueWithBroadcastOpBase(Params const &params_) { }
-
-  /// Determine if the source is needed. May return false if 
-  bool is_source_needed() const {
-    return true;
-  }
-
-  CUTLASS_HOST_DEVICE
-  void set_k_partition(int k_partition, int k_partition_count) { }
-
-  /// Applies the operation when is_source_needed() is true
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentZ &frag_Z, 
-    FragmentT &frag_T, 
-    FragmentAccumulator const &AB,
-    FragmentC const &frag_C1,
-    FragmentC const &frag_C2,
-    FragmentCompute const &V) const {
-
-  }
-
-  /// Applies the operation when is_source_needed() is false
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentZ &frag_Z, 
-    FragmentT &frag_T, 
-    FragmentAccumulator const &AB,
-    FragmentCompute const &V) const {
-
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Epilogue operator with bias vector broadcast over columns.
-///
-/// Computes the following:
-///
-///
-///  Z, T = OutputOp(AB, C, Broadcast)
-///
-///  if (ElementwiseOp::kStoreZ) {
-///    store(converted_u);
-///  }  
-///
-///  if (ElementwiseOp::kStoreT) {
-///    store(v);
-///  }  
-///
-template <
-  typename Shape_,                          ///< Shape of threadblock tile (concept: GemmShape)
-  typename WarpMmaOperator_,                ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp)
-  int PartitionsK,                          ///< Number of partitions of the K dimension
-  typename OutputTileIterator_,             ///< Tile iterator reading and writing output tensors (z)
-  typename TensorTileIterator_,             ///< Additional tile iterator for tensor-valued operands (t)
-  typename ElementVector_,                  ///< Pointer to broadcast vector
-  typename AccumulatorFragmentIterator_,    ///< Fragment iterator selecting accumulators
-  typename WarpTileIterator_,               ///< Warp-scoped tile iterator writing accumulators to SMEM
-  typename SharedLoadIterator_,             ///< Threadblock-scoped tile iterator loading from SMEM
-  typename OutputOp_,                       ///< Output operator - concept is EpilogueWithBroadcastOp
-  typename Padding_,                        ///< Padding added to SMEM allocation to avoid bank conflicts (concept: MatrixShape)
-  int FragmentsPerPartition = 1,            ///< Used to coarsten the epilogue granularity
-  int IterationsUnroll =                    ///< Used to reduce binary size when epilogue op is large
-    (!IsEpilogueFunctorHeavy<OutputOp_>::value),
-  bool IsSingleSource = OutputOp_::kIsSingleSource
->
-class EpilogueWithBroadcast;
-
-template <
-  typename Shape_,
-  typename WarpMmaOperator_,
-  int PartitionsK,
-  typename OutputTileIterator_,
-  typename TensorTileIterator_,
-  typename ElementVector_,
-  typename AccumulatorFragmentIterator_,
-  typename WarpTileIterator_,
-  typename SharedLoadIterator_,
-  typename OutputOp_,
-  typename Padding_,
-  int FragmentsPerPartition,
-  int IterationsUnroll
->
-class EpilogueWithBroadcast<
-  Shape_,
-  WarpMmaOperator_,
-  PartitionsK,
-  OutputTileIterator_,
-  TensorTileIterator_,
-  ElementVector_,
-  AccumulatorFragmentIterator_,
-  WarpTileIterator_,
-  SharedLoadIterator_,
-  OutputOp_,
-  Padding_,
-  FragmentsPerPartition,
-  IterationsUnroll,
-  false
-> : 
-  public EpilogueBase<
-    Shape_, 
-    typename WarpMmaOperator_::Shape, 
-    PartitionsK, 
-    AccumulatorFragmentIterator_, 
-    WarpTileIterator_, 
-    Padding_,
-    FragmentsPerPartition> {
-
-public:
-
-  using Base = EpilogueBase<
-    Shape_, 
-    typename WarpMmaOperator_::Shape, 
-    PartitionsK, 
-    AccumulatorFragmentIterator_, 
-    WarpTileIterator_, 
-    Padding_,
-    FragmentsPerPartition>;
-
-  static bool const kIsSingleSource = false;
-  using Shape = Shape_;
-  using WarpMmaOperator = WarpMmaOperator_;
-  static int const kPartitionsK = PartitionsK;
-  using OutputTileIterator = OutputTileIterator_;
-  using TensorTileIterator = TensorTileIterator_;
-  using ElementVector = ElementVector_;
-  using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
-  using WarpTileIterator = WarpTileIterator_;
-  using SharedLoadIterator = SharedLoadIterator_;
-  using OutputOp = OutputOp_;
-  using Padding = Padding_;
-
-  using Layout = layout::RowMajor;
-  using LongIndex = typename Layout::LongIndex;
-
-  /// The complete warp-level accumulator tile
-  using AccumulatorTile = typename Base::AccumulatorTile;
-
-  /// Accumulator element
-  using ElementAccumulator = typename WarpTileIterator::Element;
-
-  /// Compute data type produced by the output op
-  using ElementCompute = typename OutputOp::ElementCompute;
-
-  /// Compute fragment
-  using FragmentCompute = Array<ElementCompute, OutputTileIterator::Fragment::kElements>;
-
-  /// Thread map used by output tile iterators
-  using ThreadMap = typename OutputTileIterator::ThreadMap;
-
-  /// Fragment object used to store the broadcast values
-  using BroadcastFragment = Array<
-    ElementCompute, 
-    ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess>;
-
-  /// Output element
-  using ElementOutput = typename OutputTileIterator::Element;
-
-  /// Data type of additional tensor
-  using ElementTensor = typename TensorTileIterator::Element;
-
-  /// Output access size
-  static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
-
-  /// Tensor reference to destination tensor
-  using TensorRef = typename OutputTileIterator::TensorRef;
-
-  /// Tensor reference to sync tensor
-  using SyncTensorRef = typename cutlass::TensorRef<int, cutlass::layout::PackedVectorLayout>;
-
-  /// Const tensor reference to source tensor
-  using ConstTensorRef = typename OutputTileIterator::ConstTensorRef;
-
-  /// Array type used to output
-  using OutputAccessType = Array<
-    typename OutputTileIterator::Element, OutputTileIterator::kElementsPerAccess>;
-
-  /// Array type used by output functor
-  using AccumulatorAccessType = Array<typename WarpTileIterator::Element, OutputTileIterator::kElementsPerAccess>; 
-
-  /// Array type used by output functor
-  using ComputeAccessType = Array<ElementCompute, OutputTileIterator::kElementsPerAccess>;
-
-  /// Tensor access type
-  using TensorAccessType = Array<ElementTensor, OutputTileIterator::kElementsPerAccess>;
-  
-  /// Number of warps
-  using WarpCount = typename Base::WarpCount;
-
-  /// Shared memory allocation from epilogue base class
-  using BaseSharedStorage = typename Base::SharedStorage;
-
-  static int constexpr kSmemTiles = Base::kFragmentsPerIteration > 1 ? Base::kFragmentsPerIteration : kPartitionsK;
-  static int constexpr kSmemPointerOffset = Base::SharedStorage::StorageShape::kCount / kSmemTiles;
-
-  /// Used for the broadcast
-  struct BroadcastDetail {
-
-    /// Number of threads per warp
-    static int const kWarpSize = 32;
-
-    static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
-
-    /// Number of distinct scalar column indices handled by each thread
-    static int const kColumnsPerThread = ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess;
-
-    /// Number of distinct scalar row indices handled by each thread
-    static int const kRowsPerThread = ThreadMap::Iterations::kCount / ThreadMap::Iterations::kColumn;
-
-    /// Number of threads per threadblock
-    static int const kThreadCount = kWarpSize * WarpCount::kCount;
-
-    /// Number of distinct threads per row of output tile
-    static int const kThreadsPerRow = (Shape::kN / kColumnsPerThread);
-
-    /// Number of distinct threads which must be reduced during the final reduction phase within the threadblock.
-    static int const kThreadRows = kThreadCount / kThreadsPerRow;
-
-    /// I'm not sure what I meant here.
-    static int const kThreadAccessesPerRow = const_max(1, (Shape::kN + kThreadCount - 1) / kThreadCount);
-
-    /// Shape of the shared memory allocation for the epilogue    
-    using StorageShape = MatrixShape<
-      kThreadRows,
-      Shape::kN
-    >;
-
-    /// Debug printing
-    CUTLASS_DEVICE
-    static void print() {
-#if 0
-      printf("BroadcastDetail {\n");
-      printf(
-        "  kColumnsPerThread: %d\nkRowsPerThread: %d\n,kThreadCount: %d\nkThreadsPerRow: %d\n"
-        "kThreadRows: %d\nThreadAccessesPerRow: %d\nStorageShape: %d x %d (count: %d)\n",
-        kColumnsPerThread,
-        kRowsPerThread,
-        kThreadCount,
-        kThreadsPerRow,
-        kThreadRows,
-        kThreadAccessesPerRow,
-        StorageShape::kRow,
-        StorageShape::kColumn,
-        StorageShape::kCount
-      );
-      printf("};\n");
-#endif
-    }
-  };
-
-  /// Shared storage structure (shadows base) with additional SMEM buffer for reduction
-  struct SharedStorage {
-    union {
-      BaseSharedStorage base;
-    };
-
-    CUTLASS_HOST_DEVICE
-    SharedStorage() { }
-  };
-
-public:
-
-
-  static_assert(SharedLoadIterator::Fragment::kElements == OutputTileIterator::Fragment::kElements,
-    "Mismatch between shared load iterator and output tile iterator.");
-
-  static_assert(OutputTileIterator::kElementsPerAccess, "OutputTileIterator::kElementsPerAccess must not be zero.");
-
-  static_assert(!(OutputTileIterator::Fragment::kElements % OutputTileIterator::kElementsPerAccess), 
-    "Divisibility");
-
-private:
-
-  /// Loads fragment from shared memory aligned with output tensor
-  SharedLoadIterator shared_load_iterator_;
-
-  /// Thread index within the threadblock
-  int thread_idx_;
-
-public:
-
-  /// Constructor
-  CUTLASS_DEVICE
-  EpilogueWithBroadcast(
-    SharedStorage &shared_storage,                    ///< Shared storage object    
-    int thread_idx,                                   ///< ID of a thread within the threadblock
-    int warp_idx,                                     ///< ID of warp within threadblock
-    int lane_idx                                      ///< Id of thread within warp
-  ):
-    Base(shared_storage.base, thread_idx, warp_idx, lane_idx),
-    shared_load_iterator_(shared_storage.base.reference(), thread_idx),
-    thread_idx_(thread_idx)
-  {
-
-  }
-
-  /// Streams the result to global memory
-  CUTLASS_DEVICE
-  void operator()(
-    OutputOp const &output_op,                        ///< Output operator
-    ElementVector const * broadcast_ptr,              ///< Broadcast vector
-    OutputTileIterator destination_iterator,          ///< Tile iterator for destination
-    AccumulatorTile const &accumulators,              ///< Complete warp-level accumulator tile
-    OutputTileIterator source_iterator1,              ///< Tile iterator for first source accumulator matrix
-    OutputTileIterator source_iterator2,              ///< Tile iterator for second source accumulator matrix
-    TensorTileIterator tensor_iterator,               ///< Threadblock tile iterator for additional tensor operand
-    MatrixCoord const &problem_size =                 ///< Problem size needed to guard against out-of-bounds accesses
-        MatrixCoord(Shape::kM, Shape::kN),
-    MatrixCoord const &threadblock_offset =           ///< Threadblock's initial offset within the problem size space
-        MatrixCoord()) {
-    
-    BroadcastFragment broadcast_fragment;
-
-    load_broadcast_fragment_(broadcast_fragment, broadcast_ptr, problem_size, threadblock_offset);
-
-    if (!output_op.is_source_needed()) {
-      compute_source_not_needed_(
-        output_op, 
-        broadcast_fragment, 
-        destination_iterator, 
-        accumulators,
-        tensor_iterator);
-    }
-    else {
-      compute_source_needed_(
-        output_op, 
-        broadcast_fragment, 
-        destination_iterator, 
-        accumulators, 
-        source_iterator1,
-        source_iterator2,
-        tensor_iterator);
-    }
-  }
-
-private:
-
-  CUTLASS_DEVICE
-  void load_broadcast_fragment_(
-    BroadcastFragment & broadcast_fragment,      ///< Fragment containing the accumulated partial reduction over columns
-    ElementVector const * broadcast_ptr,         ///< Broadcast vector
-    MatrixCoord const &problem_size,             ///< Problem size needed to guard against out-of-bounds accesses
-    MatrixCoord const &threadblock_offset        ///< Threadblock's initial offset within the problem size space
-    ) {
-
-    broadcast_fragment.clear();
-    
-    // If no pointer is supplied, set with all zeros and avoid memory accesses
-    if (!broadcast_ptr) {
-      return;
-    }
-
-    int thread_initial_column = ThreadMap::initial_offset(thread_idx_).column();
-
-    int thread_column_idx = threadblock_offset.column() + thread_initial_column;
-    broadcast_ptr += thread_initial_column;
-
-    NumericArrayConverter<ElementCompute, ElementVector, BroadcastDetail::kElementsPerAccess> converter;
-    using AccessType = AlignedArray<ElementVector, BroadcastDetail::kElementsPerAccess>;
-    using ComputeFragmentType = Array<ElementCompute, BroadcastDetail::kElementsPerAccess>;
-
-    ComputeFragmentType *frag_ptr = reinterpret_cast<ComputeFragmentType *>(&broadcast_fragment);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < ThreadMap::Iterations::kColumn; ++j) {
-
-      AccessType loaded;
-
-      loaded.clear();
-
-      if (thread_column_idx < problem_size.column()) {
-        loaded = *reinterpret_cast<AccessType const *>(broadcast_ptr);
-      }
-
-      ComputeFragmentType cvt = converter(loaded);
-      frag_ptr[j] = cvt;
-
-      thread_column_idx += ThreadMap::Delta::kColumn;
-      broadcast_ptr += ThreadMap::Delta::kColumn;
-    }
-  }
-
-  template <class Seq>
-  struct acc2smem_source_not_needed;
-
-  template <size_t... Seq>
-  struct acc2smem_source_not_needed<cutlass::index_sequence<Seq...>> {
-    template <int Advance>
-    CUTLASS_DEVICE static void helper(AccumulatorFragmentIterator accum_fragment_iterator,
-                                      WarpTileIterator &warp_tile_iterator) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < Advance; i++) {
-        ++accum_fragment_iterator;
-      }
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int p = 0; p < Base::kFragmentsPerIteration; ++p) {
-        typename AccumulatorFragmentIterator::Fragment accum_fragment;
-
-        accum_fragment_iterator.load(accum_fragment);
-        ++accum_fragment_iterator;
-
-        warp_tile_iterator.store(accum_fragment);
-        if (p < Base::kFragmentsPerIteration - 1) {
-          warp_tile_iterator.add_pointer_offset(kSmemPointerOffset);
-        }
-      }
-
-      if (Base::kFragmentsPerIteration > 1) {
-        warp_tile_iterator.add_pointer_offset(kSmemPointerOffset *
-                                              (1 - Base::kFragmentsPerIteration));
-      }
-    }
-
-    CUTLASS_DEVICE
-    static void push(size_t pos,
-                     AccumulatorFragmentIterator const &iterator_begin,
-                     WarpTileIterator &warp_tile_iterator) {
-      int dummy[] = {
-          (pos == (Seq * Base::kFragmentsPerIteration)) &&
-          (helper<Seq * Base::kFragmentsPerIteration>(iterator_begin, warp_tile_iterator), 0)...};
-
-      CUTLASS_UNUSED(dummy[0]);
-    }
-  };
-
-  /// Streams the result to global memory
-  CUTLASS_DEVICE
-  void compute_source_not_needed_(
-    OutputOp const &output_op,                        ///< Output operator
-    BroadcastFragment const &broadcast_fragment,      ///< Fragment containing the accumulated partial reduction over columns
-    OutputTileIterator destination_iterator,          ///< Tile iterator for destination
-    AccumulatorTile const &accumulators,              ///< Complete warp-level accumulator tile 
-    TensorTileIterator tensor_iterator                ///< Threadblock tile iterator for additioanl tensor operand
-    ) { 
-
-    //
-    // Iterator over warp-level accumulator fragment
-    //
-
-    AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
-
-    //
-    // Iterate over accumulator tile
-    // 
-
-    // CUTLASS_PRAGMA_UNROLL
-    #pragma unroll(IterationsUnroll ? OutputTileIterator::kIterations / Base::kFragmentsPerIteration : 1)
-    for (int iter = 0; iter < OutputTileIterator::kIterations; iter += Base::kFragmentsPerIteration) {
-
-      //
-      // Convert and store fragment
-      //
-      
-
-      __syncthreads();
-
-      acc2smem_source_not_needed<
-          cutlass::make_index_sequence<OutputTileIterator::kIterations /
-                                   Base::kFragmentsPerIteration>>::push(iter,
-                                                                        accum_fragment_iterator,
-                                                                        this->warp_tile_iterator_);
-
-      __syncthreads();
-
-      //
-      // Load fragments from shared memory
-      //
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int p = 0; p < Base::kFragmentsPerIteration; ++p) {
-
-
-        typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];
-
-        shared_load_iterator_.load(aligned_accum_fragment[0]);
-
-        if (p < Base::kFragmentsPerIteration - 1) {
-          shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
-        }
-        else if (kPartitionsK > 1) {
-
-          plus <typename SharedLoadIterator::Fragment> add_fragments;
-
-          CUTLASS_PRAGMA_UNROLL
-          for ( int i = 1; i < kPartitionsK; ++i) {
-            shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
-            shared_load_iterator_.load(aligned_accum_fragment[i]);
-            aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
-          }
-
-          shared_load_iterator_.add_pointer_offset((1 - kPartitionsK) * kSmemPointerOffset);
-        }
-
-        //
-        // Apply output operation
-        //
-
-        typename OutputTileIterator::Fragment frag_Z;
-        typename TensorTileIterator::Fragment frag_T;
-
-        apply_output_operator_source_not_needed_(
-          frag_Z,
-          frag_T,
-          output_op,
-          aligned_accum_fragment[0],
-          broadcast_fragment);
-
-        //
-        // Conditionally store fragments
-        //
-
-        if (OutputOp::kStoreZ) {
-          destination_iterator.store(frag_Z);
-          ++destination_iterator;
-        }
-
-        if (OutputOp::kStoreT) {
-          tensor_iterator.store(frag_T);
-          ++tensor_iterator;
-        }
-      }
-
-      if (Base::kFragmentsPerIteration > 1) {
-        shared_load_iterator_.add_pointer_offset(kSmemPointerOffset * (1 - Base::kFragmentsPerIteration));
-      }
-    }
-  }
-
-
-  template<class Seq>
-  struct acc2smem_source_needed;
-
-  template <size_t... Seq>
-  struct acc2smem_source_needed<cutlass::index_sequence<Seq...>> {
-    template<int Advance>
-    CUTLASS_DEVICE
-    static void helper(AccumulatorFragmentIterator accum_fragment_iterator,
-                       WarpTileIterator &warp_tile_iterator) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < Advance; i++) {
-        ++accum_fragment_iterator;
-      }
-
-      typename AccumulatorFragmentIterator::Fragment accum_fragment;
-      accum_fragment_iterator.load(accum_fragment);
-      warp_tile_iterator.store(accum_fragment);
-    }
-
-    CUTLASS_DEVICE
-    static void push(size_t pos,
-                     AccumulatorFragmentIterator const &iterator_begin,
-                     WarpTileIterator &warp_tile_iterator) {
-      int dummy[] = {(pos == Seq) && (helper<Seq>(iterator_begin, warp_tile_iterator), 0)...};
-    }
-  };
-
-  
-  /// Streams the result to global memory
-  CUTLASS_DEVICE
-  void compute_source_needed_(
-    OutputOp const &output_op,                    ///< Output operator
-    BroadcastFragment const &broadcast_fragment,  ///< Fragment containing the accumulated partial reduction over columns
-    OutputTileIterator destination_iterator,      ///< Tile iterator for destination
-    AccumulatorTile const &accumulators,          ///< Complete warp-level accumulator tile
-    OutputTileIterator source_iterator1,          ///< Tile iterator for first source accumulator matrix
-    OutputTileIterator source_iterator2,          ///< Tile iterator for second source accumulator matrix
-    TensorTileIterator tensor_iterator            ///< Threadblock tile iterator for additioanl tensor operand
-    ) { 
-    
-    typename OutputTileIterator::Fragment source_fragment1;
-    source_fragment1.clear();
-    typename OutputTileIterator::Fragment source_fragment2;
-    source_fragment2.clear();
-
-    //
-    // Iterator over warp-level accumulator fragment
-    //
-
-    AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
-
-    //
-    // Iterate over accumulator tile
-    // 
-
-    #pragma unroll(IterationsUnroll ? OutputTileIterator::kIterations : 1)
-    for (int iter = 0; iter < OutputTileIterator::kIterations; ++iter) {
-
-      //
-      // Load the source
-      //
-
-      source_iterator1.load(source_fragment1);
-      ++source_iterator1;
-
-      source_iterator2.load(source_fragment2);
-      ++source_iterator2;
-
-      //
-      // Convert and store fragment
-      //
-      
-      __syncthreads();
-
-      acc2smem_source_needed<cutlass::make_index_sequence<OutputTileIterator::kIterations>>::push(
-          iter, accum_fragment_iterator, this->warp_tile_iterator_);
-
-      __syncthreads();
-
-      //
-      // Load fragments from shared memory
-      //
-
-      typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];
-
-      shared_load_iterator_.load(aligned_accum_fragment[0]);
-
-      // If the number of k-slices is > 1 - perform a reduction amongst the k-slices
-      if (kPartitionsK > 1)
-      {
-        plus <typename SharedLoadIterator::Fragment> add_fragments;
-        const int tile_row_offset = Base::SharedStorage::StorageShape::kRow / PartitionsK;
-
-        CUTLASS_PRAGMA_UNROLL
-        for ( int i = 1; i < kPartitionsK; ++i) {
-          shared_load_iterator_.add_tile_offset({tile_row_offset , 0});
-          shared_load_iterator_.load(aligned_accum_fragment[i]);
-          aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
-        }
-
-        shared_load_iterator_.add_tile_offset({-1 * (kPartitionsK-1) * tile_row_offset, 0});
-      }
-
-      //
-      // Apply output operation
-      //
-
-      typename OutputTileIterator::Fragment frag_Z;
-      typename TensorTileIterator::Fragment frag_T;
-
-      apply_output_operator_(
-        frag_Z,
-        frag_T,
-        output_op,
-        aligned_accum_fragment[0],
-        source_fragment1,
-        source_fragment2,
-        broadcast_fragment);
-
-      //
-      // Conditionally store fragments
-      //
-
-      if (OutputOp::kStoreZ) {
-        destination_iterator.store(frag_Z);
-        ++destination_iterator;
-      }
-
-      if (OutputOp::kStoreT) {
-        tensor_iterator.store(frag_T);
-        ++tensor_iterator;
-      }
-    }
-  }
-
-  /// Helper to invoke the output functor over each vector of output
-  CUTLASS_DEVICE
-  void apply_output_operator_(
-    typename OutputTileIterator::Fragment &frag_Z,
-    typename TensorTileIterator::Fragment &frag_T,
-    OutputOp const &output_op,
-    typename SharedLoadIterator::Fragment const &frag_AB,
-    typename OutputTileIterator::Fragment const &frag_C1,
-    typename OutputTileIterator::Fragment const &frag_C2,
-    BroadcastFragment const &frag_Broadcast) {
-
-    using AccessTypeZ = Array<typename OutputTileIterator::Element, kElementsPerAccess>;
-    using AccessTypeT = Array<typename TensorTileIterator::Element, kElementsPerAccess>;
-    using AccessTypeBroadcast = Array<ElementCompute, kElementsPerAccess>;
-
-    AccessTypeZ *frag_Z_ptr = reinterpret_cast<AccessTypeZ *>(&frag_Z);
-    AccessTypeT *frag_T_ptr = reinterpret_cast<AccessTypeT *>(&frag_T);
-    
-    AccumulatorAccessType const *frag_AB_ptr = 
-      reinterpret_cast<AccumulatorAccessType const *>(&frag_AB);
-
-    OutputAccessType const *frag_C1_ptr =
-      reinterpret_cast<OutputAccessType const *>(&frag_C1);
-
-    OutputAccessType const *frag_C2_ptr =
-      reinterpret_cast<OutputAccessType const *>(&frag_C2);
-
-    AccessTypeBroadcast const *frag_Broadcast_ptr =
-      reinterpret_cast<AccessTypeBroadcast const *>(&frag_Broadcast);
-
-    int const kOutputOpIterations = 
-      OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kOutputOpIterations; ++i) {
-        output_op(
-          frag_Z_ptr[i],
-          frag_T_ptr[i],
-          frag_AB_ptr[i],
-          frag_C1_ptr[i],
-          frag_C2_ptr[i],
-          frag_Broadcast_ptr[i % ThreadMap::Iterations::kColumn]);
-    }
-  }
-
-  /// Helper to invoke the output functor over each vector of output
-  CUTLASS_DEVICE
-  void apply_output_operator_source_not_needed_(
-    typename OutputTileIterator::Fragment &frag_Z,
-    typename TensorTileIterator::Fragment &frag_T,
-    OutputOp const &output_op,
-    typename SharedLoadIterator::Fragment const &frag_AB,
-    BroadcastFragment const &frag_Broadcast) {
-
-    using AccessTypeZ = Array<typename OutputTileIterator::Element, kElementsPerAccess>;
-    using AccessTypeT = Array<typename TensorTileIterator::Element, kElementsPerAccess>;
-    using AccessTypeBroadcast = Array<ElementCompute, kElementsPerAccess>;
-
-    AccessTypeZ *frag_Z_ptr = reinterpret_cast<AccessTypeZ *>(&frag_Z);
-    AccessTypeT *frag_T_ptr = reinterpret_cast<AccessTypeT *>(&frag_T);
-    
-    AccumulatorAccessType const *frag_AB_ptr = 
-      reinterpret_cast<AccumulatorAccessType const *>(&frag_AB);
-
-    AccessTypeBroadcast const *frag_Broadcast_ptr =
-      reinterpret_cast<AccessTypeBroadcast const *>(&frag_Broadcast);
-
-    int const kOutputOpIterations = 
-      OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kOutputOpIterations; ++i) {
-
-      output_op(
-        frag_Z_ptr[i], 
-        frag_T_ptr[i], 
-        frag_AB_ptr[i], 
-        frag_Broadcast_ptr[i % ThreadMap::Iterations::kColumn]);
-    }
-  }
-
-  public:
-    /// Stream-K reduce helper
-    CUTLASS_DEVICE
-    void reduce(
-        int reduce_fragment_idx,                        ///< Reduce fragment index
-        OutputOp const &output_op,                      ///< Output operator
-        ElementVector const * broadcast_ptr,            ///< Broadcast vector
-        OutputTileIterator destination_iterator,        ///< Tile iterator for destination
-        OutputTileIterator source_iterator1,            ///< Tile iterator for first  source accumulator matrix
-        OutputTileIterator source_iterator2,            ///< Tile iterator for second source accumulator matrix
-        TensorTileIterator tensor_iterator,             ///< Threadblock tile iterator for additional tensor operand
-        MatrixCoord const &problem_size =               ///< Problem size needed to guard against out-of-bounds accesses
-            MatrixCoord(Shape::kM, Shape::kN),
-        MatrixCoord const &threadblock_offset =         ///< Threadblock's initial offset within the problem size space
-            MatrixCoord()) 
-    {
-
-      BroadcastFragment broadcast_fragment;
-      load_broadcast_fragment_(broadcast_fragment, broadcast_ptr, problem_size, threadblock_offset);
-
-      // Initialize/load source-fragment data
-      typename OutputTileIterator::Fragment source_fragment1;
-      source_fragment1.clear();
-      typename OutputTileIterator::Fragment source_fragment2;
-      source_fragment2.clear();
-
-      if (output_op.is_source_needed())
-      {
-        source_iterator1 += reduce_fragment_idx;
-        source_iterator1.load(source_fragment1);
-
-        source_iterator2 += reduce_fragment_idx;
-        source_iterator2.load(source_fragment2);
-      }
-
-      // Load fragment from shared memory
-      typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];
-      shared_load_iterator_.load(aligned_accum_fragment[0]);
-
-      // Add fragments shared by other k partitions
-      if (kPartitionsK > 1)
-      {
-        plus <typename SharedLoadIterator::Fragment> add_fragments;
-
-        CUTLASS_PRAGMA_UNROLL
-        for ( int i = 1; i < kPartitionsK; ++i) {
-          shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
-          shared_load_iterator_.load(aligned_accum_fragment[i]);
-          aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
-        }
-      }
-
-      //
-      // Apply output operation
-      //
-
-      typename OutputTileIterator::Fragment frag_Z;
-      typename TensorTileIterator::Fragment frag_T;
-
-      if (!output_op.is_source_needed()) {
-        apply_output_operator_source_not_needed_(
-          frag_Z,
-          frag_T,
-          output_op,
-          aligned_accum_fragment[0],
-          broadcast_fragment);
-      } else {
-        apply_output_operator_(
-          frag_Z,
-          frag_T,
-          output_op,
-          aligned_accum_fragment[0],
-          source_fragment1,
-          source_fragment2,
-          broadcast_fragment);
-      }
-
-      //
-      // Conditionally store fragments
-      //
-
-      if (OutputOp::kStoreZ) {
-        destination_iterator += reduce_fragment_idx;
-        destination_iterator.store(frag_Z);
-      }
-
-      if (OutputOp::kStoreT) {
-        tensor_iterator += reduce_fragment_idx;
-        tensor_iterator.store(frag_T);
-      }
-    }
-};
-
-
-template <
-  typename Shape_,
-  typename WarpMmaOperator_,
-  int PartitionsK,
-  typename OutputTileIterator_,
-  typename TensorTileIterator_,
-  typename ElementVector_,
-  typename AccumulatorFragmentIterator_,
-  typename WarpTileIterator_,
-  typename SharedLoadIterator_,
-  typename OutputOp_,
-  typename Padding_,
-  int FragmentsPerPartition,
-  int IterationsUnroll
->
-class EpilogueWithBroadcast<
-  Shape_,
-  WarpMmaOperator_,
-  PartitionsK,
-  OutputTileIterator_,
-  TensorTileIterator_,
-  ElementVector_,
-  AccumulatorFragmentIterator_,
-  WarpTileIterator_,
-  SharedLoadIterator_,
-  OutputOp_,
-  Padding_,
-  FragmentsPerPartition,
-  IterationsUnroll,
-  true
-> : 
-  public EpilogueBase<
-    Shape_, 
-    typename WarpMmaOperator_::Shape, 
-    PartitionsK, 
-    AccumulatorFragmentIterator_, 
-    WarpTileIterator_, 
-    Padding_,
-    FragmentsPerPartition> {
-
-public:
-
-  using Base = EpilogueBase<
-    Shape_, 
-    typename WarpMmaOperator_::Shape, 
-    PartitionsK, 
-    AccumulatorFragmentIterator_, 
-    WarpTileIterator_, 
-    Padding_,
-    FragmentsPerPartition>;
-
-  static bool const kIsSingleSource = true;
-  using Shape = Shape_;
-  using WarpMmaOperator = WarpMmaOperator_;
-  static int const kPartitionsK = PartitionsK;
-  using OutputTileIterator = OutputTileIterator_;
-  using TensorTileIterator = TensorTileIterator_;
-  using ElementVector = ElementVector_;
-  using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
-  using WarpTileIterator = WarpTileIterator_;
-  using SharedLoadIterator = SharedLoadIterator_;
-  using OutputOp = OutputOp_;
-  using Padding = Padding_;
-
-  using Layout = layout::RowMajor;
-  using LongIndex = typename Layout::LongIndex;
-
-  /// The complete warp-level accumulator tile
-  using AccumulatorTile = typename Base::AccumulatorTile;
-
-  /// Accumulator element
-  using ElementAccumulator = typename WarpTileIterator::Element;
-
-  /// Compute data type produced by the output op
-  using ElementCompute = typename OutputOp::ElementCompute;
-
-  /// Compute fragment
-  using FragmentCompute = Array<ElementCompute, OutputTileIterator::Fragment::kElements>;
-
-  /// Thread map used by output tile iterators
-  using ThreadMap = typename OutputTileIterator::ThreadMap;
-
-  /// Fragment object used to store the broadcast values
-  using BroadcastFragment = Array<
-    ElementCompute, 
-    ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess>;
-
-  /// Output element
-  using ElementOutput = typename OutputTileIterator::Element;
-
-  /// Data type of additional tensor
-  using ElementTensor = typename TensorTileIterator::Element;
-
-  /// Output access size
-  static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
-
-  /// Tensor reference to destination tensor
-  using TensorRef = typename OutputTileIterator::TensorRef;
-
-  /// Tensor reference to sync tensor
-  using SyncTensorRef = typename cutlass::TensorRef<int, cutlass::layout::PackedVectorLayout>;
-
-  /// Const tensor reference to source tensor
-  using ConstTensorRef = typename OutputTileIterator::ConstTensorRef;
-
-  /// Array type used to output
-  using OutputAccessType = Array<
-    typename OutputTileIterator::Element, OutputTileIterator::kElementsPerAccess>;
-
-  /// Array type used by output functor
-  using AccumulatorAccessType = Array<typename WarpTileIterator::Element, OutputTileIterator::kElementsPerAccess>; 
-
-  /// Array type used by output functor
-  using ComputeAccessType = Array<ElementCompute, OutputTileIterator::kElementsPerAccess>;
-
-  /// Tensor access type
-  using TensorAccessType = Array<ElementTensor, OutputTileIterator::kElementsPerAccess>;
-  
-  /// Number of warps
-  using WarpCount = typename Base::WarpCount;
-
-  /// Shared memory allocation from epilogue base class
-  using BaseSharedStorage = typename Base::SharedStorage;
-
-  static int constexpr kSmemTiles = Base::kFragmentsPerIteration > 1 ? Base::kFragmentsPerIteration : kPartitionsK;
-  static int constexpr kSmemPointerOffset = Base::SharedStorage::StorageShape::kCount / kSmemTiles;
-
-  /// Used for the broadcast
-  struct BroadcastDetail {
-
-    /// Number of threads per warp
-    static int const kWarpSize = 32;
-
-    static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
-
-    /// Number of distinct scalar column indices handled by each thread
-    static int const kColumnsPerThread = ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess;
-
-    /// Number of distinct scalar row indices handled by each thread
-    static int const kRowsPerThread = ThreadMap::Iterations::kCount / ThreadMap::Iterations::kColumn;
-
-    /// Number of threads per threadblock
-    static int const kThreadCount = kWarpSize * WarpCount::kCount;
-
-    /// Number of distinct threads per row of output tile
-    static int const kThreadsPerRow = (Shape::kN / kColumnsPerThread);
-
-    /// Number of distinct threads which must be reduced during the final reduction phase within the threadblock.
-    static int const kThreadRows = kThreadCount / kThreadsPerRow;
-
-    /// I'm not sure what I meant here.
-    static int const kThreadAccessesPerRow = const_max(1, (Shape::kN + kThreadCount - 1) / kThreadCount);
-
-    /// Shape of the shared memory allocation for the epilogue    
-    using StorageShape = MatrixShape<
-      kThreadRows,
-      Shape::kN
-    >;
-
-    /// Debug printing
-    CUTLASS_DEVICE
-    static void print() {
-#if 0
-      printf("BroadcastDetail {\n");
-      printf(
-        "  kColumnsPerThread: %d\nkRowsPerThread: %d\n,kThreadCount: %d\nkThreadsPerRow: %d\n"
-        "kThreadRows: %d\nThreadAccessesPerRow: %d\nStorageShape: %d x %d (count: %d)\n",
-        kColumnsPerThread,
-        kRowsPerThread,
-        kThreadCount,
-        kThreadsPerRow,
-        kThreadRows,
-        kThreadAccessesPerRow,
-        StorageShape::kRow,
-        StorageShape::kColumn,
-        StorageShape::kCount
-      );
-      printf("};\n");
-#endif
-    }
-  };
-
-  /// Shared storage structure (shadows base) with additional SMEM buffer for reduction
-  struct SharedStorage {
-    union {
-      BaseSharedStorage base;
-    };
-
-    CUTLASS_HOST_DEVICE
-    SharedStorage() { }
-  };
-
-public:
-
-
-  static_assert(SharedLoadIterator::Fragment::kElements == OutputTileIterator::Fragment::kElements,
-    "Mismatch between shared load iterator and output tile iterator.");
-
-  static_assert(OutputTileIterator::kElementsPerAccess, "OutputTileIterator::kElementsPerAccess must not be zero.");
-
-  static_assert(!(OutputTileIterator::Fragment::kElements % OutputTileIterator::kElementsPerAccess), 
-    "Divisibility");
-
-private:
-
-  /// Loads fragment from shared memory aligned with output tensor
-  SharedLoadIterator shared_load_iterator_;
-
-  /// Thread index within the threadblock
-  int thread_idx_;
-
-public:
-
-  /// Constructor
-  CUTLASS_DEVICE
-  EpilogueWithBroadcast(
-    SharedStorage &shared_storage,                    ///< Shared storage object    
-    int thread_idx,                                   ///< ID of a thread within the threadblock
-    int warp_idx,                                     ///< ID of warp within threadblock
-    int lane_idx                                      ///< Id of thread within warp
-  ):
-    Base(shared_storage.base, thread_idx, warp_idx, lane_idx),
-    shared_load_iterator_(shared_storage.base.reference(), thread_idx),
-    thread_idx_(thread_idx)
-  {
-
-  }
-
-  /// Streams the result to global memory
-  CUTLASS_DEVICE
-  void operator()(
-    OutputOp const &output_op,                        ///< Output operator
-    ElementVector const * broadcast_ptr,              ///< Broadcast vector
-    OutputTileIterator destination_iterator,          ///< Tile iterator for destination
-    AccumulatorTile const &accumulators,              ///< Complete warp-level accumulator tile
-    OutputTileIterator source_iterator,               ///< Tile iterator for source accumulator matrix
-    TensorTileIterator tensor_iterator,               ///< Threadblock tile iterator for additional tensor operand
-    MatrixCoord const &problem_size =                 ///< Problem size needed to guard against out-of-bounds accesses
-        MatrixCoord(Shape::kM, Shape::kN),
-    MatrixCoord const &threadblock_offset =           ///< Threadblock's initial offset within the problem size space
-        MatrixCoord()) {
-    
-    BroadcastFragment broadcast_fragment;
-
-    load_broadcast_fragment_(broadcast_fragment, broadcast_ptr, problem_size, threadblock_offset);
-
-    if (!output_op.is_source_needed()) {
-      compute_source_not_needed_(
-        output_op, 
-        broadcast_fragment, 
-        destination_iterator, 
-        accumulators,
-        tensor_iterator);
-    }
-    else {
-      compute_source_needed_(
-        output_op, 
-        broadcast_fragment, 
-        destination_iterator, 
-        accumulators, 
-        source_iterator,
-        tensor_iterator);
-    }
-  }
-
-private:
-
-  CUTLASS_DEVICE
-  void load_broadcast_fragment_(
-    BroadcastFragment & broadcast_fragment,      ///< Fragment containing the accumulated partial reduction over columns
-    ElementVector const * broadcast_ptr,         ///< Broadcast vector
-    MatrixCoord const &problem_size,             ///< Problem size needed to guard against out-of-bounds accesses
-    MatrixCoord const &threadblock_offset        ///< Threadblock's initial offset within the problem size space
-    ) {
-
-    broadcast_fragment.clear();
-    
-    // If no pointer is supplied, set with all zeros and avoid memory accesses
-    if (!broadcast_ptr) {
-      return;
-    }
-
-    int thread_initial_column = ThreadMap::initial_offset(thread_idx_).column();
-
-    int thread_column_idx = threadblock_offset.column() + thread_initial_column;
-    broadcast_ptr += thread_initial_column;
-
-    NumericArrayConverter<ElementCompute, ElementVector, BroadcastDetail::kElementsPerAccess> converter;
-    using AccessType = AlignedArray<ElementVector, BroadcastDetail::kElementsPerAccess>;
-    using ComputeFragmentType = Array<ElementCompute, BroadcastDetail::kElementsPerAccess>;
-
-    ComputeFragmentType *frag_ptr = reinterpret_cast<ComputeFragmentType *>(&broadcast_fragment);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < ThreadMap::Iterations::kColumn; ++j) {
-
-      AccessType loaded;
-
-      loaded.clear();
-
-      if (thread_column_idx < problem_size.column()) {
-        loaded = *reinterpret_cast<AccessType const *>(broadcast_ptr);
-      }
-
-      ComputeFragmentType cvt = converter(loaded);
-      frag_ptr[j] = cvt;
-
-      thread_column_idx += ThreadMap::Delta::kColumn;
-      broadcast_ptr += ThreadMap::Delta::kColumn;
-    }
-  }
-
-  template <class Seq>
-  struct acc2smem_source_not_needed;
-
-  template <size_t... Seq>
-  struct acc2smem_source_not_needed<cutlass::index_sequence<Seq...>> {
-    template <int Advance>
-    CUTLASS_DEVICE static void helper(AccumulatorFragmentIterator accum_fragment_iterator,
-                                      WarpTileIterator &warp_tile_iterator) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < Advance; i++) {
-        ++accum_fragment_iterator;
-      }
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int p = 0; p < Base::kFragmentsPerIteration; ++p) {
-        typename AccumulatorFragmentIterator::Fragment accum_fragment;
-
-        accum_fragment_iterator.load(accum_fragment);
-        ++accum_fragment_iterator;
-
-        warp_tile_iterator.store(accum_fragment);
-        if (p < Base::kFragmentsPerIteration - 1) {
-          warp_tile_iterator.add_pointer_offset(kSmemPointerOffset);
-        }
-      }
-
-      if (Base::kFragmentsPerIteration > 1) {
-        warp_tile_iterator.add_pointer_offset(kSmemPointerOffset *
-                                              (1 - Base::kFragmentsPerIteration));
-      }
-    }
-
-    CUTLASS_DEVICE
-    static void push(size_t pos,
-                     AccumulatorFragmentIterator const &iterator_begin,
-                     WarpTileIterator &warp_tile_iterator) {
-      int dummy[] = {
-          (pos == (Seq * Base::kFragmentsPerIteration)) &&
-          (helper<Seq * Base::kFragmentsPerIteration>(iterator_begin, warp_tile_iterator), 0)...};
-
-      CUTLASS_UNUSED(dummy[0]);
-    }
-  };
-
-  /// Streams the result to global memory
-  CUTLASS_DEVICE
-  void compute_source_not_needed_(
-    OutputOp const &output_op,                        ///< Output operator
-    BroadcastFragment const &broadcast_fragment,      ///< Fragment containing the accumulated partial reduction over columns
-    OutputTileIterator destination_iterator,          ///< Tile iterator for destination
-    AccumulatorTile const &accumulators,              ///< Complete warp-level accumulator tile 
-    TensorTileIterator tensor_iterator                ///< Threadblock tile iterator for additioanl tensor operand
-    ) { 
-
-    //
-    // Iterator over warp-level accumulator fragment
-    //
-
-    AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
-
-    //
-    // Iterate over accumulator tile
-    // 
-
-    // CUTLASS_PRAGMA_UNROLL
-    #pragma unroll(IterationsUnroll ? OutputTileIterator::kIterations / Base::kFragmentsPerIteration : 1)
-    for (int iter = 0; iter < OutputTileIterator::kIterations; iter += Base::kFragmentsPerIteration) {
-
-      //
-      // Convert and store fragment
-      //
-      
-
-      __syncthreads();
-
-      acc2smem_source_not_needed<
-          cutlass::make_index_sequence<OutputTileIterator::kIterations /
-                                   Base::kFragmentsPerIteration>>::push(iter,
-                                                                        accum_fragment_iterator,
-                                                                        this->warp_tile_iterator_);
-
-      __syncthreads();
-
-      //
-      // Load fragments from shared memory
-      //
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int p = 0; p < Base::kFragmentsPerIteration; ++p) {
-
-
-        typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];
-
-        shared_load_iterator_.load(aligned_accum_fragment[0]);
-
-        if (p < Base::kFragmentsPerIteration - 1) {
-          shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
-        }
-        else if (kPartitionsK > 1) {
-
-          plus <typename SharedLoadIterator::Fragment> add_fragments;
-
-          CUTLASS_PRAGMA_UNROLL
-          for ( int i = 1; i < kPartitionsK; ++i) {
-            shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
-            shared_load_iterator_.load(aligned_accum_fragment[i]);
-            aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
-          }
-
-          shared_load_iterator_.add_pointer_offset((1 - kPartitionsK) * kSmemPointerOffset);
-        }
-
-        //
-        // Apply output operation
-        //
-
-        typename OutputTileIterator::Fragment frag_Z;
-        typename TensorTileIterator::Fragment frag_T;
-
-        apply_output_operator_source_not_needed_(
-          frag_Z,
-          frag_T,
-          output_op,
-          aligned_accum_fragment[0],
-          broadcast_fragment);
-
-        //
-        // Conditionally store fragments
-        //
-
-        if (OutputOp::kStoreZ) {
-          destination_iterator.store(frag_Z);
-          ++destination_iterator;
-        }
-
-        if (OutputOp::kStoreT) {
-          tensor_iterator.store(frag_T);
-          ++tensor_iterator;
-        }
-      }
-
-      if (Base::kFragmentsPerIteration > 1) {
-        shared_load_iterator_.add_pointer_offset(kSmemPointerOffset * (1 - Base::kFragmentsPerIteration));
-      }
-    }
-  }
-
-
-  template<class Seq>
-  struct acc2smem_source_needed;
-
-  template <size_t... Seq>
-  struct acc2smem_source_needed<cutlass::index_sequence<Seq...>> {
-    template<int Advance>
-    CUTLASS_DEVICE
-    static void helper(AccumulatorFragmentIterator accum_fragment_iterator,
-                       WarpTileIterator &warp_tile_iterator) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < Advance; i++) {
-        ++accum_fragment_iterator;
-      }
-
-      typename AccumulatorFragmentIterator::Fragment accum_fragment;
-      accum_fragment_iterator.load(accum_fragment);
-      warp_tile_iterator.store(accum_fragment);
-    }
-
-    CUTLASS_DEVICE
-    static void push(size_t pos,
-                     AccumulatorFragmentIterator const &iterator_begin,
-                     WarpTileIterator &warp_tile_iterator) {
-      int dummy[] = {(pos == Seq) && (helper<Seq>(iterator_begin, warp_tile_iterator), 0)...};
-    }
-  };
-
-  
-  /// Streams the result to global memory
-  CUTLASS_DEVICE
-  void compute_source_needed_(
-    OutputOp const &output_op,                    ///< Output operator
-    BroadcastFragment const &broadcast_fragment,  ///< Fragment containing the accumulated partial reduction over columns
-    OutputTileIterator destination_iterator,      ///< Tile iterator for destination
-    AccumulatorTile const &accumulators,          ///< Complete warp-level accumulator tile
-    OutputTileIterator source_iterator,           ///< Tile iterator for source accumulator matrix
-    TensorTileIterator tensor_iterator            ///< Threadblock tile iterator for additioanl tensor operand
-    ) { 
-    
-    typename OutputTileIterator::Fragment source_fragment;
-    source_fragment.clear();
-
-    //
-    // Iterator over warp-level accumulator fragment
-    //
-
-    AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
-
-    //
-    // Iterate over accumulator tile
-    // 
-
-    #pragma unroll(IterationsUnroll ? OutputTileIterator::kIterations : 1)
-    for (int iter = 0; iter < OutputTileIterator::kIterations; ++iter) {
-
-      //
-      // Load the source
-      //
-
-      source_iterator.load(source_fragment);
-      ++source_iterator;
-
-      //
-      // Convert and store fragment
-      //
-      
-      __syncthreads();
-
-      acc2smem_source_needed<cutlass::make_index_sequence<OutputTileIterator::kIterations>>::push(
-          iter, accum_fragment_iterator, this->warp_tile_iterator_);
-
-      __syncthreads();
-
-      //
-      // Load fragments from shared memory
-      //
-
-      typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];
-
-      shared_load_iterator_.load(aligned_accum_fragment[0]);
-
-      // If the number of k-slices is > 1 - perform a reduction amongst the k-slices
-      if (kPartitionsK > 1)
-      {
-        plus <typename SharedLoadIterator::Fragment> add_fragments;
-        const int tile_row_offset = Base::SharedStorage::StorageShape::kRow / PartitionsK;
-
-        CUTLASS_PRAGMA_UNROLL
-        for ( int i = 1; i < kPartitionsK; ++i) {
-          shared_load_iterator_.add_tile_offset({tile_row_offset , 0});
-          shared_load_iterator_.load(aligned_accum_fragment[i]);
-          aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
-        }
-
-        shared_load_iterator_.add_tile_offset({-1 * (kPartitionsK-1) * tile_row_offset, 0});
-      }
-
-      //
-      // Apply output operation
-      //
-
-      typename OutputTileIterator::Fragment frag_Z;
-      typename TensorTileIterator::Fragment frag_T;
-
-      apply_output_operator_(
-        frag_Z,
-        frag_T,
-        output_op,
-        aligned_accum_fragment[0],
-        source_fragment,
-        broadcast_fragment);
-
-      //
-      // Conditionally store fragments
-      //
-
-      if (OutputOp::kStoreZ) {
-        destination_iterator.store(frag_Z);
-        ++destination_iterator;
-      }
-
-      if (OutputOp::kStoreT) {
-        tensor_iterator.store(frag_T);
-        ++tensor_iterator;
-      }
-    }
-  }
-
-  /// Helper to invoke the output functor over each vector of output
-  CUTLASS_DEVICE
-  void apply_output_operator_(
-    typename OutputTileIterator::Fragment &frag_Z,
-    typename TensorTileIterator::Fragment &frag_T,
-    OutputOp const &output_op,
-    typename SharedLoadIterator::Fragment const &frag_AB,
-    typename OutputTileIterator::Fragment const &frag_C,
-    BroadcastFragment const &frag_Broadcast) {
-
-    using AccessTypeZ = Array<typename OutputTileIterator::Element, kElementsPerAccess>;
-    using AccessTypeT = Array<typename TensorTileIterator::Element, kElementsPerAccess>;
-    using AccessTypeBroadcast = Array<ElementCompute, kElementsPerAccess>;
-
-    AccessTypeZ *frag_Z_ptr = reinterpret_cast<AccessTypeZ *>(&frag_Z);
-    AccessTypeT *frag_T_ptr = reinterpret_cast<AccessTypeT *>(&frag_T);
-    
-    AccumulatorAccessType const *frag_AB_ptr = 
-      reinterpret_cast<AccumulatorAccessType const *>(&frag_AB);
-
-    OutputAccessType const *frag_C_ptr =
-      reinterpret_cast<OutputAccessType const *>(&frag_C);
-
-    AccessTypeBroadcast const *frag_Broadcast_ptr =
-      reinterpret_cast<AccessTypeBroadcast const *>(&frag_Broadcast);
-
-    int const kOutputOpIterations = 
-      OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kOutputOpIterations; ++i) {
-        output_op(
-          frag_Z_ptr[i],
-          frag_T_ptr[i],
-          frag_AB_ptr[i],
-          frag_C_ptr[i],
-          frag_Broadcast_ptr[i % ThreadMap::Iterations::kColumn]);
-    }
-  }
-
-  /// Helper to invoke the output functor over each vector of output
-  CUTLASS_DEVICE
-  void apply_output_operator_source_not_needed_(
-    typename OutputTileIterator::Fragment &frag_Z,
-    typename TensorTileIterator::Fragment &frag_T,
-    OutputOp const &output_op,
-    typename SharedLoadIterator::Fragment const &frag_AB,
-    BroadcastFragment const &frag_Broadcast) {
-
-    using AccessTypeZ = Array<typename OutputTileIterator::Element, kElementsPerAccess>;
-    using AccessTypeT = Array<typename TensorTileIterator::Element, kElementsPerAccess>;
-    using AccessTypeBroadcast = Array<ElementCompute, kElementsPerAccess>;
-
-    AccessTypeZ *frag_Z_ptr = reinterpret_cast<AccessTypeZ *>(&frag_Z);
-    AccessTypeT *frag_T_ptr = reinterpret_cast<AccessTypeT *>(&frag_T);
-    
-    AccumulatorAccessType const *frag_AB_ptr = 
-      reinterpret_cast<AccumulatorAccessType const *>(&frag_AB);
-
-    AccessTypeBroadcast const *frag_Broadcast_ptr =
-      reinterpret_cast<AccessTypeBroadcast const *>(&frag_Broadcast);
-
-    int const kOutputOpIterations = 
-      OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kOutputOpIterations; ++i) {
-
-      output_op(
-        frag_Z_ptr[i], 
-        frag_T_ptr[i], 
-        frag_AB_ptr[i], 
-        frag_Broadcast_ptr[i % ThreadMap::Iterations::kColumn]);
-    }
-  }
-
-
-  public:
-    /// Stream-K reduce helper
-    CUTLASS_DEVICE
-    void reduce(
-        int reduce_fragment_idx,                        ///< Reduce fragment index
-        OutputOp const &output_op,                      ///< Output operator
-        ElementVector const * broadcast_ptr,            ///< Broadcast vector
-        OutputTileIterator destination_iterator,        ///< Tile iterator for destination
-        OutputTileIterator source_iterator,             ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles)
-        TensorTileIterator tensor_iterator,             ///< Threadblock tile iterator for additional tensor operand
-        MatrixCoord const &problem_size =               ///< Problem size needed to guard against out-of-bounds accesses
-            MatrixCoord(Shape::kM, Shape::kN),
-        MatrixCoord const &threadblock_offset =         ///< Threadblock's initial offset within the problem size space
-            MatrixCoord()) 
-    {
-
-      BroadcastFragment broadcast_fragment;
-      load_broadcast_fragment_(broadcast_fragment, broadcast_ptr, problem_size, threadblock_offset);
-
-      // Initialize/load source-fragment data
-      typename OutputTileIterator::Fragment source_fragment;
-      source_fragment.clear();
-
-      if (output_op.is_source_needed())
-      {
-        source_iterator += reduce_fragment_idx;
-        source_iterator.load(source_fragment);
-      }
-
-      // Load fragment from shared memory
-      typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];
-      shared_load_iterator_.load(aligned_accum_fragment[0]);
-
-      // Add fragments shared by other k partitions
-      if (kPartitionsK > 1)
-      {
-        plus <typename SharedLoadIterator::Fragment> add_fragments;
-
-        CUTLASS_PRAGMA_UNROLL
-        for ( int i = 1; i < kPartitionsK; ++i) {
-          shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
-          shared_load_iterator_.load(aligned_accum_fragment[i]);
-          aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
-        }
-      }
-
-      //
-      // Apply output operation
-      //
-
-      typename OutputTileIterator::Fragment frag_Z;
-      typename TensorTileIterator::Fragment frag_T;
-
-      if (!output_op.is_source_needed()) {
-        apply_output_operator_source_not_needed_(
-          frag_Z,
-          frag_T,
-          output_op,
-          aligned_accum_fragment[0],
-          broadcast_fragment);
-      } else {
-        apply_output_operator_(
-          frag_Z,
-          frag_T,
-          output_op,
-          aligned_accum_fragment[0],
-          source_fragment,
-          broadcast_fragment);
-      }
-
-      //
-      // Conditionally store fragments
-      //
-
-      if (OutputOp::kStoreZ) {
-        destination_iterator.store(frag_Z);
-        ++destination_iterator;
-      }
-
-      if (OutputOp::kStoreT) {
-        tensor_iterator.store(frag_T);
-        ++tensor_iterator;
-      }
-    }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_reduction.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_reduction.h
deleted file mode 100755
index 1d4c7016b..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_reduction.h
+++ /dev/null
@@ -1,823 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-
-  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
-
-  The epilogue rearranges the result of a matrix product through shared memory to match canonical
-  tensor layouts in global memory. Epilogues support conversion and reduction operations.
-
-*/
-
-#pragma once
-
-#if defined(__CUDACC_RTC__)
-#include <cuda/std/cassert>
-#else
-#include <assert.h>
-#endif
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/tensor_coord.h"
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/functional.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/layout/vector.h"
-#include "cutlass/layout/tensor.h"
-
-#include "cutlass/gemm/gemm.h"
-
-#include "cutlass/transform/pitch_linear_thread_map.h"
-#include "cutlass/transform/threadblock/regular_tile_iterator.h"
-
-#include "cutlass/epilogue/threadblock/epilogue_base.h"
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Epilogue operator with reduction over each column 
-template <
-  typename Shape_,                          ///< Shape of threadblock tile (concept: GemmShape)
-  typename WarpMmaOperator_,                ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp)
-  int PartitionsK,                          ///< Number of partitions of the K dimension
-  typename OutputTileIterator_,             ///< Tile iterator reading and writing output tensors
-  typename TensorTileIterator_,             ///< Additional tile iterator for tensor-valued operands
-  typename ElementVector_,                  ///< Pointer to reduction vector
-  typename AccumulatorFragmentIterator_,    ///< Fragment iterator selecting accumulators
-  typename WarpTileIterator_,               ///< Warp-scoped tile iterator writing accumulators to SMEM
-  typename SharedLoadIterator_,             ///< Threadblock-scoped tile iterator loading from SMEM
-  typename OutputOp_,                       ///< Output operator
-  typename ReductionOp_,                    ///< Reduction operator
-  typename Padding_,                        ///< Padding added to SMEM allocation to avoid bank conflicts (concept: MatrixShape)
-  int IterationsUnroll =                    ///< Used to reduce binary size when epilogue op is large
-    (!IsEpilogueFunctorHeavy<OutputOp_>::value)
->
-class EpilogueWithReduction : 
-  public EpilogueBase<
-    Shape_, 
-    typename WarpMmaOperator_::Shape, 
-    PartitionsK, 
-    AccumulatorFragmentIterator_, 
-    WarpTileIterator_, 
-    Padding_> {
-
-public:
-
-  using Base = EpilogueBase<
-    Shape_, 
-    typename WarpMmaOperator_::Shape, 
-    PartitionsK, 
-    AccumulatorFragmentIterator_, 
-    WarpTileIterator_, 
-    Padding_>;
-
-  using Shape = Shape_;
-  using WarpMmaOperator = WarpMmaOperator_;
-  static int const kPartitionsK = PartitionsK;
-  using OutputTileIterator = OutputTileIterator_;
-  using TensorTileIterator = TensorTileIterator_;
-  using ElementVector = ElementVector_;
-  using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
-  using WarpTileIterator = WarpTileIterator_;
-  using SharedLoadIterator = SharedLoadIterator_;
-  using OutputOp = OutputOp_;
-  using ReductionOp = ReductionOp_;
-  using Padding = Padding_;
-
-  using Layout = layout::RowMajor;
-  using LongIndex = typename Layout::LongIndex;
-
-  static bool const kIsSingleSource = true;
-
-  /// The complete warp-level accumulator tile
-  using AccumulatorTile = typename Base::AccumulatorTile;
-
-  /// Accumulator element
-  using ElementAccumulator = typename WarpTileIterator::Element;
-
-  /// Compute data type produced by the output op
-  using ElementCompute = typename OutputOp::ElementCompute;
-
-  /// Compute fragment
-  using FragmentCompute = Array<ElementCompute, OutputTileIterator::Fragment::kElements>;
-
-  /// Thread map used by output tile iterators
-  using ThreadMap = typename OutputTileIterator::ThreadMap;
-
-  /// Fragment object used in reduction
-  using ReductionFragment = Array<
-    ElementAccumulator, 
-    ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess>;
-
-  /// Output element
-  using ElementOutput = typename OutputTileIterator::Element;
-
-  /// Data type of additional tensor
-  using ElementTensor = typename TensorTileIterator::Element;
-
-  /// Output access size
-  static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
-
-  /// Tensor reference to destination tensor
-  using TensorRef = typename OutputTileIterator::TensorRef;
-
-  /// Tensor reference to sync tensor
-  using SyncTensorRef = typename cutlass::TensorRef<int, cutlass::layout::PackedVectorLayout>;
-
-  /// Const tensor reference to source tensor
-  using ConstTensorRef = typename OutputTileIterator::ConstTensorRef;
-
-  /// Array type used to output
-  using OutputAccessType = Array<
-    typename OutputTileIterator::Element, OutputTileIterator::kElementsPerAccess>;
-
-  /// Array type used by output functor
-  using AccumulatorAccessType = Array<typename WarpTileIterator::Element, OutputTileIterator::kElementsPerAccess>; 
-
-  /// Array type used by output functor
-  using ComputeAccessType = Array<ElementCompute, OutputTileIterator::kElementsPerAccess>;
-
-  /// Tensor access type
-  using TensorAccessType = Array<ElementTensor, OutputTileIterator::kElementsPerAccess>;
-  
-  /// Number of warps
-  using WarpCount = typename Base::WarpCount;
-
-  /// Shared memory allocation from epilogue base class
-  using BaseSharedStorage = typename Base::SharedStorage;
-
-  /// Used for the reduction
-  struct ReductionDetail {
-
-    /// If true, accumulator coordinates are computed and out-of-bounds checks are enabled when
-    /// performing the reduction.
-    static bool const kOobCheck = false;
-
-    /// Number of threads per warp
-    static int const kWarpSize = 32;
-
-    /// Number of distinct scalar column indices handled by each thread
-    static int const kColumnsPerThread = ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess;
-
-    /// Number of distinct scalar row indices handled by each thread
-    static int const kRowsPerThread = ThreadMap::Iterations::kCount / ThreadMap::Iterations::kColumn;
-
-    /// Number of threads per threadblock
-    static int const kThreadCount = kWarpSize * WarpCount::kCount;
-
-    /// Number of distinct threads per row of output tile
-    static int const kThreadsPerRow = (Shape::kN / kColumnsPerThread);
-
-    /// Number of distinct threads which must be reduced during the final reduction phase within the threadblock.
-    static int const kThreadRows = kThreadCount / kThreadsPerRow;
-
-    /// I'm not sure what I meant here.
-    static int const kThreadAccessesPerRow = const_max(1, (Shape::kN + kThreadCount - 1) / kThreadCount);
-
-    /// Shape of the shared memory allocation for the epilogue    
-    using StorageShape = MatrixShape<
-      kThreadRows,
-      Shape::kN
-    >;
-
-    /// Debug printing
-    CUTLASS_DEVICE
-    static void print() {
-#if 0
-      printf("ReductionDetail {\n");
-      printf(
-        "  kElementsPerAccess:%d\nkColumnsPerThread: %d\nkRowsPerThread: %d\n,kThreadCount: %d\nkThreadsPerRow: %d\n"
-        "kThreadRows: %d\nThreadAccessesPerRow: %d\nStorageShape: %d x %d (count: %d)\n",
-        kElementsPerAccess,
-        kColumnsPerThread,
-        kRowsPerThread,
-        kThreadCount,
-        kThreadsPerRow,
-        kThreadRows,
-        kThreadAccessesPerRow,
-        StorageShape::kRow,
-        StorageShape::kColumn,
-        StorageShape::kCount
-      );
-      printf("};\n");
-#endif
-    }
-  };
-
-  /// Shared storage structure (shadows base) with additional SMEM buffer for reduction
-  struct SharedStorage {
-    union {
-      BaseSharedStorage base;
-      AlignedArray<ElementAccumulator, ReductionDetail::StorageShape::kCount, 16> reduction;    ///< Shared storage for reduction
-    };
-
-    CUTLASS_HOST_DEVICE
-    SharedStorage() { }
-  };
-
-public:
-
-
-  static_assert(SharedLoadIterator::Fragment::kElements == OutputTileIterator::Fragment::kElements,
-    "Mismatch between shared load iterator and output tile iterator.");
-
-  static_assert(OutputTileIterator::kElementsPerAccess, "OutputTileIterator::kElementsPerAccess must not be zero.");
-
-  static_assert(!(OutputTileIterator::Fragment::kElements % OutputTileIterator::kElementsPerAccess), 
-    "Divisibility");
-
-private:
-
-  /// Loads fragment from shared memory aligned with output tensor
-  SharedLoadIterator shared_load_iterator_;
-
-  /// Shared memory pointer fo rreduction
-  ElementAccumulator *reduction_ptr_;
-
-  /// Thread index within the threadblock
-  int thread_idx_;
-
-public:
-
-  /// Constructor
-  CUTLASS_DEVICE
-  EpilogueWithReduction(
-    SharedStorage &shared_storage,                    ///< Shared storage object    
-    int thread_idx,                                   ///< ID of a thread within the threadblock
-    int warp_idx,                                     ///< ID of warp within threadblock
-    int lane_idx                                      ///< Id of thread within warp
-  ):
-    Base(shared_storage.base, thread_idx, warp_idx, lane_idx),
-    shared_load_iterator_(shared_storage.base.reference(), thread_idx),
-    reduction_ptr_(shared_storage.reduction.data()),
-    thread_idx_(thread_idx)
-  {
-
-  }
-
-  /// Streams the result to global memory
-  CUTLASS_DEVICE
-  void operator()(
-    OutputOp const &output_op,                        ///< Output operator
-    ElementVector * reduction_output_ptr,             ///< Reduction output vector
-    OutputTileIterator destination_iterator,          ///< Tile iterator for destination
-    AccumulatorTile const &accumulators,              ///< Complete warp-level accumulator tile
-    OutputTileIterator source_iterator,               ///< Tile iterator for source accumulator matrix
-    TensorTileIterator tensor_iterator,               ///< Threadblock tile iterator for additional tensor operand
-    MatrixCoord const &problem_size =                 ///< Problem size needed to guard against out-of-bounds accesses
-        MatrixCoord(Shape::kM, Shape::kN),
-    MatrixCoord const &threadblock_offset =           ///< Threadblock's initial offset within the problem size space
-        MatrixCoord()) {
-    
-    ReductionFragment reduction_fragment;
-    reduction_fragment.clear();
-
-    if (!output_op.is_source_needed()) {
-      compute_source_not_needed_(
-        output_op, 
-        reduction_fragment, 
-        destination_iterator, 
-        accumulators,
-        tensor_iterator,
-        problem_size,
-        threadblock_offset);
-    }
-    else {
-      compute_source_needed_(
-        output_op, 
-        reduction_fragment, 
-        destination_iterator, 
-        accumulators, 
-        source_iterator,
-        tensor_iterator,
-        problem_size,
-        threadblock_offset);
-    }
-
-    if (output_op.participates_in_reduction()) {
-      reduction_(problem_size, threadblock_offset, reduction_output_ptr, reduction_fragment);
-    }
-  }
-
-private:
-
-  /// Perform the reduction
-  CUTLASS_DEVICE
-  void reduction_(
-    MatrixCoord const &problem_size,                  ///< Problem size needed to guard against out-of-bounds accesses
-    MatrixCoord const &threadblock_offset,            ///< Problem size needed to guard against out-of-bounds accesses
-    ElementVector * reduction_output_ptr,          ///< Reduction output vector
-    ReductionFragment const & reduction_fragment) {
-
-    //
-    // Store the partially reduced value to SMEM
-    //
-
-    // Guard against uses of the existing SMEM tile
-    __syncthreads();
-    
-    using AccessType = AlignedArray<ElementAccumulator, ThreadMap::kElementsPerAccess>;
-
-    //
-    // Determine a compacted thread arrangement to store to SMEM.
-    //
-    int const kThreadsPerRow = Shape::kN / (ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess);
-
-    MatrixCoord thread_offset(
-      thread_idx_ / kThreadsPerRow, 
-      (thread_idx_ % kThreadsPerRow) * ThreadMap::kElementsPerAccess);
-   
-    //
-    // Each thread store its fragment to a SMEM
-    //
-
-    AccessType *aligned_reduction_ptr = reinterpret_cast<AccessType *>(
-      &reduction_ptr_[thread_offset.row() * Shape::kN + thread_offset.column()]);
-
-    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&reduction_fragment);
-    
-    CUTLASS_PRAGMA_UNROLL
-    for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
-      int col_idx = column * ThreadMap::Delta::kColumn / ThreadMap::kElementsPerAccess;
-
-      aligned_reduction_ptr[col_idx] = frag_ptr[column];
-    }
-
-    __syncthreads();
-
-    //
-    // Now, threads are assigned several columns of the output. They fetch over all rows from
-    // the compacted SMEM tile and perform a reduction.
-    //
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < ReductionDetail::kThreadAccessesPerRow; ++j) {
-      int column_idx = thread_idx_ + j * ReductionDetail::kThreadCount;
-
-      ReductionOp reduction_op;
-      ElementAccumulator reduction_element = ElementAccumulator();
-
-      int output_column_idx = threadblock_offset.column() + column_idx;
-
-      if (column_idx < Shape::kN && output_column_idx < problem_size.column()) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int row = 0; row < ReductionDetail::kThreadRows; ++row) {
-          if (row) {
-            auto frag = reduction_ptr_[row * Shape::kN + column_idx];
-
-            reduction_element = reduction_op(reduction_element, frag);
-          }
-          else {
-
-            reduction_element = reduction_ptr_[column_idx];
-          }
-        }
-
-        // Store
-        reduction_output_ptr[column_idx] = ElementVector(reduction_element);
-      }
-    }
-  }
-
-  template<class Seq>
-  struct acc2smem;
-
-  template <size_t... Seq>
-  struct acc2smem<cutlass::index_sequence<Seq...>> {
-    template<int Advance>
-    CUTLASS_DEVICE
-    static void helper(AccumulatorFragmentIterator accum_fragment_iterator,
-                       WarpTileIterator &warp_tile_iterator) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < Advance; i++) {
-        ++accum_fragment_iterator;
-      }
-
-      typename AccumulatorFragmentIterator::Fragment accum_fragment;
-      accum_fragment_iterator.load(accum_fragment);
-      warp_tile_iterator.store(accum_fragment);
-    }
-
-    CUTLASS_DEVICE
-    static void push(size_t pos,
-                     AccumulatorFragmentIterator const &iterator_begin,
-                     WarpTileIterator &warp_tile_iterator) {
-      int dummy[] = {(pos == Seq) && (helper<Seq>(iterator_begin, warp_tile_iterator), 0)...};
-    }
-  };
-
-  /// Streams the result to global memory
-  CUTLASS_DEVICE
-  void compute_source_not_needed_(
-    OutputOp const &output_op,                        ///< Output operator
-    ReductionFragment &reduction_fragment,            ///< Fragment containing the accumulated partial reduction over columns
-    OutputTileIterator destination_iterator,          ///< Tile iterator for destination
-    AccumulatorTile const &accumulators,              ///< Complete warp-level accumulator tile 
-    TensorTileIterator tensor_iterator,               ///< Threadblock tile iterator for additioanl tensor operand
-    MatrixCoord const &problem_size,                  ///< Problem size needed to guard against out-of-bounds accesses
-    MatrixCoord const &threadblock_offset             ///< Threadblock's initial offset within the problem size space
-    ) { 
-
-    //
-    // Iterator over warp-level accumulator fragment
-    //
-
-    typename TensorTileIterator::Fragment tensor_fragment;
-    tensor_fragment.clear();
-
-    AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
-
-    //
-    // Iterate over accumulator tile
-    // 
-
-    #pragma unroll(IterationsUnroll ? OutputTileIterator::kIterations : 1)
-    for (int iter = 0; iter < OutputTileIterator::kIterations; ++iter) {
-
-      //
-      // Convert and store fragment
-      //
-
-      tensor_iterator.load(tensor_fragment);
-      ++tensor_iterator;
-      
-      __syncthreads();
-
-      acc2smem<cutlass::make_index_sequence<OutputTileIterator::kIterations>>::push(
-          iter, accum_fragment_iterator, this->warp_tile_iterator_);
-
-      __syncthreads();
-
-      //
-      // Load fragments from shared memory
-      //
-
-      typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];
-
-      shared_load_iterator_.load(aligned_accum_fragment[0]);
-
-      //
-      // If the number of k-slices is > 1 - perform a reduction amongst the k-slices
-      //
-      if (kPartitionsK > 1)
-      {
-        plus <typename SharedLoadIterator::Fragment> add_fragments;
-        const int tile_row_offset = Base::SharedStorage::StorageShape::kRow / PartitionsK;
-
-        CUTLASS_PRAGMA_UNROLL
-        for ( int i = 1; i < kPartitionsK; ++i) {
-          shared_load_iterator_.add_tile_offset({tile_row_offset , 0});
-          shared_load_iterator_.load(aligned_accum_fragment[i]);
-          aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
-        }
-
-        shared_load_iterator_.add_tile_offset({-1 * (kPartitionsK-1) * tile_row_offset, 0});
-      }
-
-      //
-      // Compute the output result
-      //
-     
-      FragmentCompute compute_fragment;
-
-      apply_output_operator_source_not_needed_(
-        reduction_fragment,
-        compute_fragment, 
-        output_op, 
-        aligned_accum_fragment[0],
-        tensor_fragment,
-        destination_iterator);
-
-      //
-      // Store the final result
-      //
-      
-      NumericArrayConverter<ElementOutput, ElementCompute, FragmentCompute::kElements> converter;
-
-      typename OutputTileIterator::Fragment output_fragment = converter(compute_fragment);
-
-      destination_iterator.store(output_fragment);
-      ++destination_iterator;
-    }
-  }
-
-  
-  /// Streams the result to global memory
-  CUTLASS_DEVICE
-  void compute_source_needed_(
-    OutputOp const &output_op,                    ///< Output operator
-    ReductionFragment &reduction_fragment,        ///< Fragment containing the accumulated partial reduction over columns
-    OutputTileIterator destination_iterator,      ///< Tile iterator for destination
-    AccumulatorTile const &accumulators,          ///< Complete warp-level accumulator tile
-    OutputTileIterator source_iterator,           ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles)
-    TensorTileIterator tensor_iterator,            ///< Threadblock tile iterator for additioanl tensor operand
-    MatrixCoord const &problem_size,                  ///< Problem size needed to guard against out-of-bounds accesses
-    MatrixCoord const &threadblock_offset             ///< Threadblock's initial offset within the problem size space
-    ) { 
-    
-    typename OutputTileIterator::Fragment source_fragment;
-    source_fragment.clear();
-
-    typename TensorTileIterator::Fragment tensor_fragment;
-    tensor_fragment.clear();
-
-    //
-    // Iterator over warp-level accumulator fragment
-    //
-
-    AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
-
-    //
-    // Iterate over accumulator tile
-    // 
-
-    #pragma unroll(IterationsUnroll ? OutputTileIterator::kIterations : 1)
-    for (int iter = 0; iter < OutputTileIterator::kIterations; ++iter) {
-
-      //
-      // Load the source
-      //
-
-      source_fragment.clear();
-      source_iterator.load(source_fragment);
-      ++source_iterator;
-
-      tensor_iterator.load(tensor_fragment);
-      ++tensor_iterator;
-
-      //
-      // Convert and store fragment
-      //
-      
-      __syncthreads();
-
-      acc2smem<cutlass::make_index_sequence<OutputTileIterator::kIterations>>::push(
-          iter, accum_fragment_iterator, this->warp_tile_iterator_);
-
-      __syncthreads();
-
-      //
-      // Load fragments from shared memory
-      //
-
-      typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];
-
-      shared_load_iterator_.load(aligned_accum_fragment[0]);
-
-      // If the number of k-slices is > 1 - perform a reduction amongst the k-slices
-      if (kPartitionsK > 1)
-      {
-        plus <typename SharedLoadIterator::Fragment> add_fragments;
-        const int tile_row_offset = Base::SharedStorage::StorageShape::kRow / PartitionsK;
-
-        CUTLASS_PRAGMA_UNROLL
-        for ( int i = 1; i < kPartitionsK; ++i) {
-          shared_load_iterator_.add_tile_offset({tile_row_offset , 0});
-          shared_load_iterator_.load(aligned_accum_fragment[i]);
-          aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
-        }
-
-        shared_load_iterator_.add_tile_offset({-1 * (kPartitionsK-1) * tile_row_offset, 0});
-      }
-
-      //
-      // Compute the output result
-      //
-     
-      FragmentCompute compute_fragment;
-
-      apply_output_operator_(
-        reduction_fragment, 
-        compute_fragment, 
-        output_op, 
-        aligned_accum_fragment[0], 
-        source_fragment,
-        tensor_fragment,
-        destination_iterator);
-
-      //
-      // Convert and store the final result
-      //
-
-      NumericArrayConverter<ElementOutput, ElementCompute, FragmentCompute::kElements> converter;
-
-      typename OutputTileIterator::Fragment output_fragment = converter(compute_fragment);
-
-      destination_iterator.store(output_fragment);      
-      ++destination_iterator;
-    }
-  }
-
-  /// Helper to invoke the output functor over each vector of output
-  CUTLASS_DEVICE
-  void apply_output_operator_(
-    ReductionFragment &reduction_fragment,
-    FragmentCompute &compute_fragment,
-    OutputOp const &output_op,                    ///< Output operator
-    typename SharedLoadIterator::Fragment const &aligned_accum_fragment,
-    typename OutputTileIterator::Fragment const &source_fragment,
-    typename TensorTileIterator::Fragment const &tensor_fragment,
-    OutputTileIterator const & destination_iterator) {
-      
-    ComputeAccessType *compute_frag_ptr = 
-      reinterpret_cast<ComputeAccessType *>(&compute_fragment);
-
-    AccumulatorAccessType const *accum_frag_ptr = 
-      reinterpret_cast<AccumulatorAccessType const *>(&aligned_accum_fragment);
-
-    OutputAccessType const *source_frag_ptr = 
-      reinterpret_cast<OutputAccessType const *>(&source_fragment);
-
-    TensorAccessType const *tensor_frag_ptr =
-      reinterpret_cast<TensorAccessType const *>(&tensor_fragment);
-
-    int const kOutputOpIterations = 
-      OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kOutputOpIterations; ++i) {
-
-      // Call the output operator
-      compute_frag_ptr[i] = output_op(accum_frag_ptr[i], source_frag_ptr[i], tensor_frag_ptr[i]);
-    }
-
-    //
-    // Partial reduction over each column
-    //
-
-    ReductionOp reduction_op;
-
-    typename OutputTileIterator::Mask mask;
-    destination_iterator.get_mask(mask);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int column = 0; column < ReductionDetail::kColumnsPerThread; ++column) {
-
-      int column_vector_idx = column / ThreadMap::kElementsPerAccess;
-      bool column_guard = mask.predicates[column_vector_idx];
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int row = 0; row < ReductionDetail::kRowsPerThread; ++row) {
-
-        bool fetch;
-        if (ReductionDetail::kOobCheck) {
-          int row_idx = (row % ThreadMap::Iterations::kRow);
-          int residual = (row / ThreadMap::Iterations::kRow);
-
-          int group_idx = (residual % ThreadMap::Iterations::kGroup);
-          residual = (residual / ThreadMap::Iterations::kGroup);
-
-          int cluster_idx = (residual % ThreadMap::Iterations::kCluster);
-
-          int row_offset = row_idx * ThreadMap::Delta::kRow 
-            + group_idx * ThreadMap::Delta::kGroup 
-            + cluster_idx * ThreadMap::Delta::kCluster;
-
-          int output_row = destination_iterator.thread_start_row() + row_offset;
-
-          fetch = (output_row < destination_iterator.extent_row() && column_guard);
-        }
-        else {
-          fetch = true;
-        }
-
-        ElementCompute value = ElementCompute();
-        if (fetch) {
-          value = compute_fragment[row * ReductionDetail::kColumnsPerThread + column];
-        }
-
-        reduction_fragment[column] = reduction_op(
-          reduction_fragment[column], 
-          value);
-      }
-    }
-  }
-
-  /// Helper to invoke the output functor over each vector of output
-  CUTLASS_DEVICE
-  void apply_output_operator_source_not_needed_(
-    ReductionFragment &reduction_fragment,
-    FragmentCompute &compute_fragment,
-    OutputOp const &output_op,                    ///< Output operator
-    typename SharedLoadIterator::Fragment const &aligned_accum_fragment,
-    typename TensorTileIterator::Fragment const &tensor_fragment,
-    OutputTileIterator const & destination_iterator
-  ) {
-    
-    ComputeAccessType *compute_frag_ptr = 
-      reinterpret_cast<ComputeAccessType *>(&compute_fragment);
-
-    AccumulatorAccessType const *accum_frag_ptr = 
-      reinterpret_cast<AccumulatorAccessType const *>(&aligned_accum_fragment);
-
-    TensorAccessType const *tensor_frag_ptr =
-      reinterpret_cast<TensorAccessType const *>(&tensor_fragment);
-
-    int const kOutputOpIterations = 
-      OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kOutputOpIterations; ++i) {
-
-      // Call the output operator
-      compute_frag_ptr[i] = output_op(accum_frag_ptr[i], tensor_frag_ptr[i]);
-    }
-
-    //
-    // Partial reduction over each column
-    //
-
-    ReductionOp reduction_op;
-
-    typename OutputTileIterator::Mask mask;
-    destination_iterator.get_mask(mask);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int column = 0; column < ReductionDetail::kColumnsPerThread; ++column) {
-
-      int column_vector_idx = column / ThreadMap::kElementsPerAccess;
-      bool column_guard = mask.predicates[column_vector_idx];
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int row = 0; row < ReductionDetail::kRowsPerThread; ++row) {
-
-        bool fetch;
-        if (ReductionDetail::kOobCheck) {
-          int row_idx = (row % ThreadMap::Iterations::kRow);
-          int residual = (row / ThreadMap::Iterations::kRow);
-
-          int group_idx = (residual % ThreadMap::Iterations::kGroup);
-          residual = (residual / ThreadMap::Iterations::kGroup);
-
-          int cluster_idx = (residual % ThreadMap::Iterations::kCluster);
-
-          int row_offset = row_idx * ThreadMap::Delta::kRow 
-            + group_idx * ThreadMap::Delta::kGroup 
-            + cluster_idx * ThreadMap::Delta::kCluster;
-
-          int output_row = destination_iterator.thread_start_row() + row_offset;
-
-          fetch = (output_row < destination_iterator.extent_row() && column_guard);
-        }
-        else {
-          fetch = true;
-        }
-
-        ElementCompute value = ElementCompute();
-        if (fetch) {
-          value = compute_fragment[row * ReductionDetail::kColumnsPerThread + column];
-        }
-
-        reduction_fragment[column] = reduction_op(
-          reduction_fragment[column], 
-          value);
-      }
-    }
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_visitor.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_visitor.h
deleted file mode 100755
index 6ab9cf069..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_visitor.h
+++ /dev/null
@@ -1,409 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Generic epilogue for implementing certain kinds of fused epilogue behavior.
-*/
-
-#pragma once
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-#include "cutlass/cutlass.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/semaphore.h"
-#include "cutlass/epilogue/threadblock/epilogue_base.h"
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-class EpilogueFusedVisitorConcept {
-public:
-
-  static int const kIterations = 1;
-  static int const kElementsPerAccess = 4;
-  using ElementOutput = float;
-  using ElementAccumulator = float;
-  using AccumulatorFragment = Array<ElementAccumulator, kElementsPerAccess>;
-
-  /// Arguments structure
-  struct Arguments {  };
-
-  /// Params structure
-  struct Params {
-
-    Params() { }
-    Params(Arguments const &args) { }
-  };
-
-  /// Shared storage
-  struct SharedStorage { };
-
-public:
-
-  CUTLASS_DEVICE
-  EpilogueFusedVisitorConcept(
-    Params const &params,                                         ///< Parameters routed to the epilogue
-    SharedStorage &shared_storage,                                ///< Shared storage needed by the functors here
-    MatrixCoord const &problem_size,                              ///< Problem size of the output
-    int thread_idx,                                               ///< Thread index within the threadblock
-    int warp_idx,                                                 ///< Warp index within the threadblock
-    int lane_idx,                                                 ///< Lane index within the warp
-    MatrixCoord const &threadblock_offset = MatrixCoord(0, 0)) {  ///< Coordinate
-
-  }
-
-  /// Helper to indicate split-K behavior
-  CUTLASS_DEVICE
-  void set_k_partition(
-    int split_k_index,                                            ///< Index of this threadblock within split-K partitioned scheme
-    int split_k_slices) {                                         ///< Total number of split-K slices
-
-  }
-
-  /// Called to set the batch index
-  CUTLASS_DEVICE
-  void set_batch_index(int batch_idx) {
-
-  }
-
-  /// Called at the start of the epilogue just before iterating over accumulator slices
-  CUTLASS_DEVICE
-  void begin_epilogue() {
-
-  }
-
-  /// Called at the start of one step before starting accumulator exchange
-  CUTLASS_DEVICE
-  void begin_step(int step_idx) {
-
-  }
-
-  /// Called at the start of a row
-  CUTLASS_DEVICE
-  void begin_row(int row_idx) {
-
-  }
-
-  /// Called after accumulators have been exchanged for each accumulator vector
-  CUTLASS_DEVICE
-  void visit(
-    int iter_idx,
-    int row_idx,
-    int column_idx,
-    int frag_idx,
-    AccumulatorFragment const &accum) {
-
-  }
-
-  /// Called at the end of a row
-  CUTLASS_DEVICE
-  void end_row(int row_idx) {
-
-  }
-
-  /// Called after all accumulator elements have been visited
-  CUTLASS_DEVICE
-  void end_step(int step_idx) {
-
-  }
-
-  /// Called after all steps have been completed
-  CUTLASS_DEVICE
-  void end_epilogue() {
-
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Epilogue operator
-template <
-  typename Visitor_,                        ///< Functor containing fused operations (satisfies EpilogueFusedVisitorConcept)
-  typename Shape_,                          ///< Shape of threadblock tile (concept: GemmShape)
-  typename WarpMmaOperator_,                ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp)
-  int PartitionsK,                          ///< Number of partitions of the K dimension
-  typename AccumulatorFragmentIterator_,    ///< Fragment iterator selecting accumulators
-  typename WarpTileIterator_,               ///< Warp-scoped tile iterator writing accumulators to SMEM
-  typename SharedLoadIterator_,             ///< Threadblock-scoped tile iterator loading from SMEM
-  typename Padding_,                        ///< Padding added to SMEM allocation to avoid bank conflicts (concept: MatrixShape)
-  int FragmentsPerPartition = 1,            ///< Used to coarsten the epilogue granularity
-  int IterationsUnroll =                    ///< Used to reduce binary size when epilogue op is large
-    (true || !IsEpilogueFunctorHeavy<Visitor_>::value)
->
-class EpilogueWithVisitor :
-  public EpilogueBase<
-    Shape_,
-    typename WarpMmaOperator_::Shape,
-    PartitionsK,
-    AccumulatorFragmentIterator_,
-    WarpTileIterator_,
-    Padding_,
-    FragmentsPerPartition> {
-
-public:
-
-  using Visitor = Visitor_;
-
-  using Base = EpilogueBase<
-    Shape_,
-    typename WarpMmaOperator_::Shape,
-    PartitionsK,
-    AccumulatorFragmentIterator_,
-    WarpTileIterator_,
-    Padding_,
-    FragmentsPerPartition>;
-
-  using Shape = Shape_;
-  using WarpMmaOperator = WarpMmaOperator_;
-  static int const kPartitionsK = PartitionsK;
-
-  using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
-  using WarpTileIterator = WarpTileIterator_;
-  using SharedLoadIterator = SharedLoadIterator_;
-  using Padding = Padding_;
-
-  using Layout = layout::RowMajor;
-  using LongIndex = typename Layout::LongIndex;
-
-  /// The complete warp-level accumulator tile
-  using AccumulatorTile = typename Base::AccumulatorTile;
-
-  /// Accumulator element
-  using ElementAccumulator = typename WarpTileIterator::Element;
-
-  /// Output access size
-  static int const kElementsPerAccess = Visitor::kElementsPerAccess;
-
-  /// Tensor reference to sync tensor
-  using SyncTensorRef = typename cutlass::TensorRef<int, cutlass::layout::PackedVectorLayout>;
-
-  /// Array type used by output functor
-  using AccumulatorAccessType = Array<
-    typename WarpTileIterator::Element, kElementsPerAccess>;
-
-  /// Number of warps
-  using WarpCount = typename Base::WarpCount;
-
-  static int constexpr kSmemTiles = Base::kFragmentsPerIteration > 1 ? Base::kFragmentsPerIteration : kPartitionsK;
-  static int constexpr kSmemPointerOffset = Base::SharedStorage::StorageShape::kCount / kSmemTiles;
-
-  using SharedStorage = typename Base::SharedStorage;
-
-private:
-
-  /// Loads fragment from shared memory aligned with output tensor
-  SharedLoadIterator shared_load_iterator_;
-
-public:
-
-  /// Constructor
-  CUTLASS_DEVICE
-  EpilogueWithVisitor(
-    SharedStorage &shared_storage,    ///< Shared storage object
-    int thread_idx,                   ///< ID of a thread within the threadblock
-    int warp_idx,                     ///< ID of warp within threadblock
-    int lane_idx                      ///< Id of thread within warp
-  ):
-    Base(shared_storage, thread_idx, warp_idx, lane_idx),
-    shared_load_iterator_(shared_storage.reference(), thread_idx)
-  {
-
-  }
-
-  /// Streams the result to global memory
-  CUTLASS_DEVICE
-  void operator()(
-    Visitor & visitor,
-    AccumulatorTile const &accumulators) {         ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles)
-
-    visitor.begin_epilogue();
-
-    //
-    // Iterator over warp-level accumulator fragment
-    //
-
-    AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
-
-    //
-    // Iterate over accumulator tile
-    //
-
-    #pragma unroll(IterationsUnroll ? Visitor::kIterations : 1)
-    for (int iter_idx = 0; iter_idx < Visitor::kIterations; ++iter_idx) {
-
-      //
-      // Load the source
-      //
-
-      visitor.begin_step(iter_idx);
-
-      //
-      // Convert and store fragment
-      //
-
-      __syncthreads();
-
-      acc2smem_source_needed<cutlass::make_index_sequence<Visitor::kIterations>>::push(
-          iter_idx, accum_fragment_iterator, this->warp_tile_iterator_);
-
-      __syncthreads();
-
-      //
-      // Load fragments from shared memory
-      //
-
-      typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];
-
-      shared_load_iterator_.load(aligned_accum_fragment[0]);
-
-      // If the number of k-slices is > 1 - perform a reduction amongst the k-slices
-      if (kPartitionsK > 1) {
-
-        plus <typename SharedLoadIterator::Fragment> add_fragments;
-
-        CUTLASS_PRAGMA_UNROLL
-        for ( int i = 1; i < kPartitionsK; ++i) {
-          shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
-          shared_load_iterator_.load(aligned_accum_fragment[i]);
-          aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
-        }
-
-        shared_load_iterator_.add_pointer_offset((1 - kPartitionsK) * kSmemPointerOffset);
-      }
-
-      //
-      // Iterate over output fragments
-      //
-
-      AccumulatorAccessType const *accum_frag_ptr =
-        reinterpret_cast<AccumulatorAccessType const *>(&aligned_accum_fragment[0]);
-
-      int const kAccumulatorFragmentCount = AccumulatorTile::kElements / (Visitor::kIterations * AccumulatorAccessType::kElements);
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int idx = 0; idx < kAccumulatorFragmentCount; ++idx) {
-
-        int row_idx = idx / SharedLoadIterator::ThreadMap::Iterations::kColumn;
-        int col_idx = idx % SharedLoadIterator::ThreadMap::Iterations::kColumn;
-
-        // Start a new row of the output fragment
-        if (!col_idx) {
-          visitor.begin_row(row_idx);
-        }
-
-        visitor.visit(
-          iter_idx,
-          row_idx,
-          col_idx,
-          idx,
-          accum_frag_ptr[idx]
-        );
-
-        // End the row of the output fragment
-        if (col_idx + 1 == SharedLoadIterator::ThreadMap::Iterations::kColumn) {
-          visitor.end_row(row_idx);
-        }
-      }
-
-      //
-      // Conclude the step
-      //
-
-      visitor.end_step(iter_idx);
-    }
-
-    visitor.end_epilogue();
-  }
-
-private:
-
-
-  template<class Seq>
-  struct acc2smem_source_needed;
-
-  template <size_t... Seq>
-  struct acc2smem_source_needed<cutlass::index_sequence<Seq...>> {
-    template<int Advance>
-    CUTLASS_DEVICE
-    static void helper(AccumulatorFragmentIterator accum_fragment_iterator,
-                       WarpTileIterator &warp_tile_iterator) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < Advance; i++) {
-        ++accum_fragment_iterator;
-      }
-
-      typename AccumulatorFragmentIterator::Fragment accum_fragment;
-      accum_fragment_iterator.load(accum_fragment);
-      warp_tile_iterator.store(accum_fragment);
-    }
-
-    CUTLASS_DEVICE
-    static void push(size_t pos,
-                     AccumulatorFragmentIterator const &iterator_begin,
-                     WarpTileIterator &warp_tile_iterator) {
-      int dummy[] = {(pos == Seq) && (helper<Seq>(iterator_begin, warp_tile_iterator), 0)...};
-    }
-  };
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Helper to create an EpilogueWithVisitor from an existing epilogue
-template <typename Visitor_, typename Existing_, bool IterationsUnroll = true>
-struct EpilogueWithVisitorFromExistingEpilogue  {
-
-  using Epilogue = EpilogueWithVisitor<
-    Visitor_,
-    typename Existing_::Shape,
-    typename Existing_::WarpMmaOperator,
-    Existing_::kPartitionsK,
-    typename Existing_::AccumulatorFragmentIterator,
-    typename Existing_::WarpTileIterator,
-    typename Existing_::SharedLoadIterator,
-    typename Existing_::Padding,
-    Existing_::kFragmentsPerIteration,
-    IterationsUnroll
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_visitor_callbacks.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_visitor_callbacks.h
deleted file mode 100755
index 259f0729c..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_visitor_callbacks.h
+++ /dev/null
@@ -1,504 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
- /*! \file
-  \brief Functor performing elementwise operations used by epilogues.
-*/
-
-#pragma once
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-#include "cutlass/epilogue/threadblock/epilogue_base.h"
-
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-namespace detail {
-
-struct EVT2xBase { };
-
-template <class T>
-static constexpr bool is_2x_evt_v = platform::is_base_of<EVT2xBase, T>::value;
-
-} // namespace detail
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Epilogue operator
-template <
-  typename DefaultEpilogue,                 ///< Default Epilogue Descriptor
-  typename FusionCallbacks_,                ///< The called fusion callbacks
-  int Stages = 2,                           ///< Software pipeline stages for epilogue
-  int IterationsUnroll = true               ///< Used to reduce binary size when epilogue op is large
->
-class EpilogueWithVisitorCallbacks :
-  public EpilogueBase<
-    typename DefaultEpilogue::Shape,
-    typename DefaultEpilogue::WarpMmaOperator::Shape,
-    DefaultEpilogue::kPartitionsK,
-    typename DefaultEpilogue::AccumulatorFragmentIterator,
-    typename DefaultEpilogue::WarpTileIterator,
-    typename DefaultEpilogue::Padding,
-    DefaultEpilogue::kFragmentsPerIteration>,
-  public EpilogueBaseStreamK<
-    typename DefaultEpilogue::Shape,
-    DefaultEpilogue::kPartitionsK,
-    typename DefaultEpilogue::WarpMmaOperator,
-    typename DefaultEpilogue::AccumulatorFragmentIterator>,
-  public detail::EVT2xBase
-   {
-
-public:
-
-  static_assert(Stages <= 2, "Sm80 EVT only support upto 2 Stages.");
-
-  // Whether the epilogue is pipelined
-  static bool constexpr Pipelined = Stages > 1;
-
-  using FusionCallbacks = FusionCallbacks_;
-
-  using OutputTileIterator = typename DefaultEpilogue::OutputTileIterator;
-  // Number of epilogue iterations. 
-  // Each iteration processes a 8xThreadblockTile::kN output tile
-  static const int kIterations = OutputTileIterator::kIterations;
-
-  using Base = EpilogueBase<
-    typename DefaultEpilogue::Shape,
-    typename DefaultEpilogue::WarpMmaOperator::Shape,
-    DefaultEpilogue::kPartitionsK,
-    typename DefaultEpilogue::AccumulatorFragmentIterator,
-    typename DefaultEpilogue::WarpTileIterator,
-    typename DefaultEpilogue::Padding,
-    DefaultEpilogue::kFragmentsPerIteration>;
-  
-  using BaseStreamK = EpilogueBaseStreamK<
-    typename DefaultEpilogue::Shape,
-    DefaultEpilogue::kPartitionsK,
-    typename DefaultEpilogue::WarpMmaOperator,
-    typename DefaultEpilogue::AccumulatorFragmentIterator>;
-
-  static int const kPartitionsK = DefaultEpilogue::kPartitionsK;
-
-  using AccumulatorFragmentIterator = typename DefaultEpilogue::AccumulatorFragmentIterator;
-  using WarpTileIterator = typename DefaultEpilogue::WarpTileIterator;
-  using SharedLoadIterator = typename DefaultEpilogue::SharedLoadIterator;
-
-  /// The complete warp-level accumulator tile
-  using AccumulatorTile = typename Base::AccumulatorTile;
-
-  /// Accumulator element
-  using ElementAccumulator = typename WarpTileIterator::Element;
-
-  struct OutputOp{
-    using ElementAccumulator = ElementAccumulator;
-    using Params = typename FusionCallbacks::Arguments;
-  };
-
-  /// Fragment type used by the accumulator tile's fragment iterator
-  using AccumulatorFragment = typename AccumulatorFragmentIterator::Fragment;
-
-  // Output access size
-  static int const kElementsPerAccess = DefaultEpilogue::kElementsPerAccess;
-
-  /// Array type used by output functor
-  using AccumulatorAccessType = Array<
-    typename WarpTileIterator::Element, kElementsPerAccess>;
-
-  static int constexpr kSmemTiles = Base::kFragmentsPerIteration > 1 ? Base::kFragmentsPerIteration : kPartitionsK;
-  static int constexpr kSmemPointerOffset = Base::SharedStorage::StorageShape::kCount / kSmemTiles;
-
-  using Params = typename FusionCallbacks::Params;
-
-  static size_t constexpr kSmemStageOffset = sizeof(Base::SharedStorage) / sizeof(ElementAccumulator);
-  static int constexpr kAccumulatorFragmentCount = AccumulatorTile::kElements / (kIterations * AccumulatorAccessType::kElements) / kPartitionsK;
-
-  struct SharedStorage {
-    typename Base::SharedStorage acc_smem[Stages];
-    typename FusionCallbacks::SharedStorage callback_smem;
-  };
-
-private:
-
-  /// Loads fragment from shared memory aligned with output tensor
-  SharedLoadIterator shared_load_iterator_;
-  FusionCallbacks fusion_callbacks;
-
-public:
-
-  /// Constructor
-  CUTLASS_DEVICE
-  EpilogueWithVisitorCallbacks(
-    const Params &params_callbacks,   ///< Epilogue Visitor params
-    SharedStorage &shared_storage,    ///< Shared storage object
-    int thread_idx,                   ///< ID of a thread within the threadblock
-    int warp_idx,                     ///< ID of warp within threadblock
-    int lane_idx                      ///< Id of thread within warp
-  ):
-    Base(shared_storage.acc_smem[0], thread_idx, warp_idx, lane_idx),
-    BaseStreamK(thread_idx),
-    shared_load_iterator_(shared_storage.acc_smem[0].reference(), thread_idx),
-    fusion_callbacks(params_callbacks, shared_storage.callback_smem)
-  { }
-
-  /// Aggregates the accumulator sets shared by peer blocks in the global workspace,
-  /// performing epilogue computations, writing to output
-  template <class ProblemShape>
-  CUTLASS_DEVICE
-  void reduce(
-      int peer_idx_begin,
-      int peer_idx_end,
-      int reduce_fragment_idx,
-      void *element_workspace,
-      cutlass::gemm::GemmCoord threadblock_tile_offset,
-      ProblemShape problem_shape,
-      int thread_idx) 
-  {
-    auto callbacks = fusion_callbacks.get_callbacks(
-      threadblock_tile_offset,
-      thread_idx,
-      problem_shape
-    );
-
-    callbacks.begin_epilogue();
-    // Reduce peer accumulator fragments into one fragment
-    AccumulatorFragment accum_fragment;
-    BaseStreamK::reduce(accum_fragment, peer_idx_begin, peer_idx_end, reduce_fragment_idx, element_workspace);
-
-    // Store fragment to shared memory
-    this->warp_tile_iterator_.store(accum_fragment);
-
-    __syncthreads();
-
-    callbacks.begin_step(reduce_fragment_idx);
-
-    // Load fragment from shared memory
-    typename SharedLoadIterator::Fragment aligned_accum_fragment;
-    shared_load_iterator_.load(aligned_accum_fragment);
-
-    // Add fragments shared by other k partitions
-    if (kPartitionsK > 1)
-    {
-      plus <typename SharedLoadIterator::Fragment> add_fragments;
-
-      CUTLASS_PRAGMA_UNROLL
-      for ( int i = 1; i < kPartitionsK; ++i) {
-        typename SharedLoadIterator::Fragment aligned_addend_fragment;
-        shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
-        shared_load_iterator_.load(aligned_addend_fragment);
-        aligned_accum_fragment = add_fragments(aligned_accum_fragment, aligned_addend_fragment);
-      }
-    }
-
-    //
-    // Iterate over output fragment
-    //
-
-    AccumulatorAccessType const *accum_frag_ptr =
-      reinterpret_cast<AccumulatorAccessType const*>(&aligned_accum_fragment);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int idx = 0; idx < kAccumulatorFragmentCount; ++idx) {
-      int row_idx = idx / SharedLoadIterator::ThreadMap::Iterations::kColumn;
-      int col_idx = idx % SharedLoadIterator::ThreadMap::Iterations::kColumn;
-
-      // Start a new row of the output fragment
-      if (!col_idx) {
-        callbacks.begin_row(row_idx);
-      }
-
-      callbacks.visit(
-        reduce_fragment_idx,
-        row_idx,
-        col_idx,
-        idx,
-        accum_frag_ptr[idx]
-      );
-
-      // End the row of the output fragment
-      if (col_idx + 1 == SharedLoadIterator::ThreadMap::Iterations::kColumn) {
-        callbacks.end_row(row_idx);
-      }
-    }
-
-    callbacks.end_step(reduce_fragment_idx);
-    callbacks.end_epilogue();
-  }
-
-  /// Streams the result to global memory
-  template <class ProblemShape>
-  CUTLASS_DEVICE
-  void operator()(
-    AccumulatorTile const &accumulators,
-    cutlass::gemm::GemmCoord threadblock_tile_offset,
-    ProblemShape problem_shape,
-    int thread_idx
-    ) {         ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles)
-
-    auto callbacks = fusion_callbacks.get_callbacks(
-      threadblock_tile_offset,
-      thread_idx,
-      problem_shape
-    );
-
-    callbacks.begin_epilogue();
-
-    //
-    // Iterator over warp-level accumulator fragment
-    //
-
-    AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
-
-    //
-    // Iterate over accumulator tile
-    //
-
-    if constexpr(Pipelined){
-      __syncthreads();
-
-      //
-      // Pipeline Prologue
-      //
-      size_t warp_iterator_offset = kSmemStageOffset;
-      size_t smem_iterator_offset = kSmemStageOffset;
-      callbacks.begin_step(0);
-    
-      acc2smem_source_needed<cutlass::make_index_sequence<kIterations>>::push(
-            0, accum_fragment_iterator, this->warp_tile_iterator_);
-      
-      this->warp_tile_iterator_.add_pointer_offset(warp_iterator_offset);
-      warp_iterator_offset = -warp_iterator_offset;
-
-      //
-      // Pipeline Loop
-      //
-
-      #pragma unroll(IterationsUnroll ? kIterations : 1)
-      for (int iter_idx = 1; iter_idx < kIterations + 1; ++iter_idx) {
-
-        __syncthreads();
-
-        // Skip the load for epilogue
-        if (iter_idx < kIterations) {
-          callbacks.begin_step(iter_idx);
-
-          acc2smem_source_needed<cutlass::make_index_sequence<kIterations>>::push(
-              iter_idx, accum_fragment_iterator, this->warp_tile_iterator_);
-
-          this->warp_tile_iterator_.add_pointer_offset(warp_iterator_offset);
-          warp_iterator_offset = -warp_iterator_offset;
-        }
-        
-        typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];
-
-        shared_load_iterator_.load(aligned_accum_fragment[0]);
-        // If the number of k-slices is > 1 - perform a reduction amongst the k-slices
-        if (kPartitionsK > 1) {
-
-          plus <typename SharedLoadIterator::Fragment> add_fragments;
-
-          CUTLASS_PRAGMA_UNROLL
-          for ( int i = 1; i < kPartitionsK; ++i) {
-            shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
-            shared_load_iterator_.load(aligned_accum_fragment[i]);
-            aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
-          }
-
-          shared_load_iterator_.add_pointer_offset((1 - kPartitionsK) * kSmemPointerOffset);
-        }
-        shared_load_iterator_.add_pointer_offset(smem_iterator_offset);
-        smem_iterator_offset = -smem_iterator_offset;
-        
-        //
-        // Iterate over output fragments
-        //
-
-        AccumulatorAccessType const *accum_frag_ptr =
-          reinterpret_cast<AccumulatorAccessType const *>(&aligned_accum_fragment);
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int idx = 0; idx < kAccumulatorFragmentCount; ++idx) {
-
-          int row_idx = idx / SharedLoadIterator::ThreadMap::Iterations::kColumn;
-          int col_idx = idx % SharedLoadIterator::ThreadMap::Iterations::kColumn;
-
-          // Start a new row of the output fragment
-          if (!col_idx) {
-            callbacks.begin_row(row_idx);
-          }
-
-          callbacks.visit(
-            iter_idx-1,
-            row_idx,
-            col_idx,
-            idx,
-            accum_frag_ptr[idx]
-          );
-
-          // End the row of the output fragment
-          if (col_idx + 1 == SharedLoadIterator::ThreadMap::Iterations::kColumn) {
-            callbacks.end_row(row_idx);
-          }
-        }
-
-        //
-        // Conclude the step
-        //
-
-        callbacks.end_step(iter_idx-1);
-      }
-    } else {
-
-      #pragma unroll(IterationsUnroll ? kIterations : 1)
-      for (int iter_idx = 0; iter_idx < kIterations; ++iter_idx) {
-
-        //
-        // Load the source
-        //
-
-        callbacks.begin_step(iter_idx);
-
-        //
-        // Convert and store fragment
-        //
-
-        __syncthreads();
-
-        acc2smem_source_needed<cutlass::make_index_sequence<kIterations>>::push(
-            iter_idx, accum_fragment_iterator, this->warp_tile_iterator_);
-
-        __syncthreads();
-
-        //
-        // Load fragments from shared memory
-        //
-
-        typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];
-
-        shared_load_iterator_.load(aligned_accum_fragment[0]);
-        // If the number of k-slices is > 1 - perform a reduction amongst the k-slices
-        if (kPartitionsK > 1) {
-
-          plus <typename SharedLoadIterator::Fragment> add_fragments;
-
-          CUTLASS_PRAGMA_UNROLL
-          for ( int i = 1; i < kPartitionsK; ++i) {
-            shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
-            shared_load_iterator_.load(aligned_accum_fragment[i]);
-            aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
-          }
-
-          shared_load_iterator_.add_pointer_offset((1 - kPartitionsK) * kSmemPointerOffset);
-        }
-
-        //
-        // Iterate over output fragments
-        //
-
-        AccumulatorAccessType const *accum_frag_ptr =
-          reinterpret_cast<AccumulatorAccessType const *>(&aligned_accum_fragment[0]);
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int idx = 0; idx < kAccumulatorFragmentCount; ++idx) {
-
-          int row_idx = idx / SharedLoadIterator::ThreadMap::Iterations::kColumn;
-          int col_idx = idx % SharedLoadIterator::ThreadMap::Iterations::kColumn;
-
-          // Start a new row of the output fragment
-          if (!col_idx) {
-            callbacks.begin_row(row_idx);
-          }
-
-          callbacks.visit(
-            iter_idx,
-            row_idx,
-            col_idx,
-            idx,
-            accum_frag_ptr[idx]
-          );
-
-          // End the row of the output fragment
-          if (col_idx + 1 == SharedLoadIterator::ThreadMap::Iterations::kColumn) {
-            callbacks.end_row(row_idx);
-          }
-        }
-
-        //
-        // Conclude the step
-        //
-
-        callbacks.end_step(iter_idx);
-      }
-    }
-
-    callbacks.end_epilogue();
-  }
-
-private:
-
-
-  template<class Seq>
-  struct acc2smem_source_needed;
-
-  template <size_t... Seq>
-  struct acc2smem_source_needed<cutlass::index_sequence<Seq...>> {
-    template<int Advance>
-    CUTLASS_DEVICE
-    static void helper(AccumulatorFragmentIterator accum_fragment_iterator,
-                       WarpTileIterator &warp_tile_iterator) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < Advance; i++) {
-        ++accum_fragment_iterator;
-      }
-
-      typename AccumulatorFragmentIterator::Fragment accum_fragment;
-      accum_fragment_iterator.load(accum_fragment);
-      warp_tile_iterator.store(accum_fragment);
-    }
-
-    CUTLASS_DEVICE
-    static void push(size_t pos,
-                     AccumulatorFragmentIterator const &iterator_begin,
-                     WarpTileIterator &warp_tile_iterator) {
-      int dummy[] = {(pos == Seq) && (helper<Seq>(iterator_begin, warp_tile_iterator), 0)...};
-    }
-  };
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_workspace.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_workspace.h
deleted file mode 100755
index d41a0fa43..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_workspace.h
+++ /dev/null
@@ -1,197 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Epilogue for threadblock scoped GEMMs.
-
-  This does not attempt to target any particular output layout. Instead, each threadblock
-  streams out its accumulator elements using 128b store operations. This assumes all threadblocks
-  have unique output tiles.
-
-  The target data layout is:
-  - threadblock indices mapped to linear offsets as (m, n, k), where m is fastest-changing
-  - threadblock output space partitioned into warps; each warp's region is contiguous
-  - per-thread accumulators partitioned into 128b accesses
-  - output memory striped across the threads of a warp
-
-  This enables very fast streaming of data, completely limited by the memory system. No predication
-  or data exchange is performed, and each threadblock is assumed to have a full region of memory
-  to write to.
-
-  This epilogue establishes an upper bound for epilogue performance and is suitable for
-  reductions across the GEMM K dimension which require a separate workspace.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Shape_,      ///< shape of accumulator tile (concept: MatrixShape)
-  int WarpCount,        ///< number of warps
-  typename FragmentC_   ///< warp-level GEMM operator (concept: gemm::warp::Mma)
->
-class EpilogueWorkspace {
-public:
-
-  using Shape = Shape_;
-  using FragmentC = FragmentC_;
-  using ElementC = typename FragmentC::value_type;
-
-  static int const kWarpCount = WarpCount;
-
-  /// Optimize for 128b accesses
-  static int const kAccessSizeInBits = 128;
-
-  /// Warp size from the perspective of memory operations
-  static int const kWarpSize = 32;
-
-  /// Vector length of accesses
-  static int const kElementsPerAccess = 
-    kAccessSizeInBits / sizeof_bits<ElementC>::value;
-
-  /// Number of stores per thread
-  static int const kIterations = FragmentC::kElements / kElementsPerAccess;
-
-  static_assert(
-    !(FragmentC::kElements % kElementsPerAccess), 
-    "The number of accumulators must be divisible by the access size.");
-
-  /// Total number of vectorized accesses in warp (in units of vector)
-  static int const kWarpAccesses = kIterations * kWarpSize;
-
-  /// Total number of vectorized accesses in threadblock tile (in units of vector)
-  static int const kThreadblockAccesses = kWarpAccesses * kWarpCount;
-
-  /// Parameters structure
-  struct Params {
-
-    /// Pointer to C matrix
-    ElementC *ptr_C;
-
-    /// Stride between tiles along the GEMM N dimension (in units of vectors)
-    int stride_n;
-
-    /// Stride between tiles along the GEMM K dimension (in units of vectors)
-    int stride_k;
-
-    //
-    // Methods
-    //
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      ElementC *ptr_C,   ///< Pointer to C matrix
-      int stride_n_,      ///< Stride between tiles along the GEMM N dimension (in units of ElementC)
-      int stride_k_       ///< Stride between tiles along the GEMM K dimension (in units of ElementC)
-    ):
-      ptr_C(ptr_C), stride_n(stride_n_ / kElementsPerAccess), stride_k(stride_k_ / kElementsPerAccess) {
-
-    }
-  };
-
-  /// Shared storage allocation needed by the epilogue
-  struct SharedStorage {
-    // Intentionally empty
-  };
-
-private:
-
-  struct alignas((kAccessSizeInBits / 8)) AccessType {
-    Array<ElementC, kElementsPerAccess> storage;
-  };
-
-  /// Constant reference to parameters object
-  AccessType *pointer_;
-
-  /// Stride between tiles along the n dimension (in vectors)
-  int stride_n_;
-
-  /// Stride between tiles along the k dimension (in vectors)
-  int stride_k_;
-
-public:
-
-  /// Constructor
-  CUTLASS_DEVICE
-  EpilogueWorkspace(
-    Params const &params,     ///< Host-constructable params object
-    SharedStorage &,          ///< Shared storage object
-    int warp_idx,             ///< ID of warp within threadblock
-    int lane_idx              ///< Id of thread within warp
-
-  ):
-    pointer_(reinterpret_cast<AccessType *>(params.ptr_C)),
-    stride_n_(params.stride_n), 
-    stride_k_(params.stride_k) {
-
-    // Add per-thread offset
-    pointer_ += lane_idx + warp_idx * kWarpAccesses;
-  }
-
-  /// Streams the result to global memory
-  CUTLASS_DEVICE
-  void operator()(
-    cutlass::gemm::GemmCoord problem_size,       ///< Problem size of GEMM (units of ElementC)
-    cutlass::gemm::GemmCoord tb_tile_coord,      ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles)
-    FragmentC const &accum) {     ///< Accumulator tile
-    
-    // Compute offset for entire threadblock (note, per-thread offset has been folded in already)
-    AccessType *pointer = pointer_ + 
-      tb_tile_coord.m() * kThreadblockAccesses + 
-      tb_tile_coord.n() * stride_n_ +
-      tb_tile_coord.k() * stride_k_;
-
-    // Cast to vectorized view of accumulator fragments
-    AccessType const * src_pointer = reinterpret_cast<AccessType const *>(&accum);
-
-    // Write out accumulators at full speed
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kIterations; ++i) {
-      pointer[i * kWarpSize] = src_pointer[i];
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitor_2x.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitor_2x.hpp
deleted file mode 100755
index 8b1cd4fd3..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitor_2x.hpp
+++ /dev/null
@@ -1,433 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-  \brief Visitor tree operation base implementation to enable composable fusions
-         for the CUTLASS 2x epilogue
-*/
-
-#pragma once
-
-#include "cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::epilogue::threadblock {
-
-using namespace cute;
-using cute::tuple;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace detail {
-
-template <class... Ops>
-struct VisitorImpl2x: fusion::detail::Sm90VisitorImplBase<Ops...> {
-  using fusion::detail::Sm90VisitorImplBase<Ops...>::Sm90VisitorImplBase;
-  using fusion::detail::Sm90VisitorImplBase<Ops...>::ops;
-
-  template <class CallbacksTuple>
-  struct Callbacks {
-    // Callbacks can store non-persistent variables (e.g. tensors) or copies of persistent variables
-    CallbacksTuple callbacks_tuple;
-
-    /// Called at the start of the epilogue just before iterating over accumulator slices
-    CUTLASS_DEVICE void
-    begin_epilogue() {
-      for_each(callbacks_tuple,
-        [] (auto& callbacks) {
-          callbacks.begin_epilogue();
-        }
-      );
-    }
-
-    /// Called at the start of one step before starting accumulator exchange
-    CUTLASS_DEVICE void
-    begin_step(int step_idx) {
-      for_each(callbacks_tuple,
-        [&] (auto& callbacks) {
-          callbacks.begin_step(step_idx);
-        }
-      );
-    }
-
-    /// Called at the start of a row
-    CUTLASS_DEVICE void
-    begin_row(int row_idx) {
-      for_each(callbacks_tuple,
-        [&] (auto& callbacks) {
-          callbacks.begin_row(row_idx);
-        }
-      );
-    }
-
-    /// Called after accumulators have been exchanged for each accumulator vector
-    template <typename ElementAccumulator, typename... ElementInputs, int FragmentSize>
-    CUTLASS_DEVICE auto // returns an Array
-    visit(int iter_idx, int row_idx, int column_idx, int frg_idx,
-          Array<ElementAccumulator, FragmentSize> const& frg_acc,
-          Array<ElementInputs, FragmentSize> const&... frg_inputs) // depends on the N-naryness of the op
-      = delete; // Must be implemented for each operation
-
-    /// Called at the start of a row
-    CUTLASS_DEVICE void
-    end_row(int row_idx) {
-      for_each(callbacks_tuple,
-        [&] (auto& callbacks) {
-          callbacks.end_row(row_idx);
-        }
-      );
-    }
-
-    /// Called after all accumulator elements have been visited
-    CUTLASS_DEVICE void
-    end_step(int step_idx) {
-      for_each(callbacks_tuple,
-        [&] (auto& callbacks) {
-          callbacks.end_step(step_idx);
-        }
-      );
-    }
-
-    /// Called after all steps have been completed
-    CUTLASS_DEVICE void
-    end_epilogue() {
-      for_each(callbacks_tuple,
-        [] (auto& callbacks) {
-          callbacks.end_epilogue();
-        }
-      );
-    }
-  };
-
-  // Callbacks factory
-  // All operations must redefine this
-  template <class ProblemShape>
-  CUTLASS_DEVICE auto
-  get_callbacks(
-    gemm::GemmCoord threadblock_tile_offset,
-    int thread_idx,
-    ProblemShape problem_shape
-  ) {
-    return transform_apply(ops,
-      [&] (auto& op) {
-        return op.get_callbacks(
-          threadblock_tile_offset,
-          thread_idx,
-          problem_shape);
-      },
-      [] (auto&&... callbacks) {
-        auto callbacks_tuple = cute::make_tuple(callbacks...);
-        return Callbacks<decltype(callbacks_tuple)>{callbacks_tuple};
-      }
-    );
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Convenience aliases
-using EmptyCallbacks = VisitorImpl2x<>::Callbacks<cute::tuple<>>;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace detail
-
-using namespace detail;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Tree visitor
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <class NodeOp, class... ChildOps>
-struct TreeVisitor2x : VisitorImpl2x<ChildOps..., NodeOp> {
-
-  using VisitorImpl2x<ChildOps..., NodeOp>::VisitorImpl2x;
-
-  template<class CallbacksImpl>
-  struct Callbacks : CallbacksImpl {
-    CUTLASS_DEVICE
-    Callbacks(CallbacksImpl&& impl)
-      : CallbacksImpl(cute::forward<CallbacksImpl>(impl)) {}
-
-    using CallbacksImpl::callbacks_tuple;
-
-    template <typename ElementAccumulator, int FragmentSize>
-    CUTLASS_DEVICE auto
-    visit(int iter_idx, int row_idx, int column_idx, int frg_idx,
-          Array<ElementAccumulator, FragmentSize> const& frg_acc) {
-      constexpr int Rm1 = sizeof...(ChildOps);
-      return cute::detail::tapply(callbacks_tuple,
-        [&] (auto& child_callbacks) {
-          return child_callbacks.visit(iter_idx, row_idx, column_idx, frg_idx, frg_acc);
-        },
-        [&] (auto&&... frg_inputs) {
-          return get<Rm1>(callbacks_tuple).visit(iter_idx, row_idx, column_idx, frg_idx, frg_acc, frg_inputs...);
-        },
-        make_seq<Rm1>{}
-      );
-    }
-  };
-
-  // Callbacks factory
-  template <class ProblemShape>
-  CUTLASS_DEVICE auto
-  get_callbacks(
-    gemm::GemmCoord threadblock_tile_offset,
-    int thread_idx,
-    ProblemShape problem_shape
-  ) {
-    return Callbacks<
-    decltype(VisitorImpl2x<ChildOps..., NodeOp>::
-      get_callbacks(
-        threadblock_tile_offset,
-        thread_idx,
-        problem_shape
-      ))>(
-      VisitorImpl2x<ChildOps..., NodeOp>::
-      get_callbacks(
-        threadblock_tile_offset,
-        thread_idx,
-        problem_shape
-      )
-    );
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-template<
-  class ElementCompute,
-  class EdgeTuple,
-  class... Ops
->
-struct TopologicalVisitor2x : VisitorImpl2x<Ops...> {
-  static_assert(is_static_v<EdgeTuple>);
-  static_assert(cute::rank(EdgeTuple{}) == sizeof...(Ops));
-  static_assert(sizeof...(Ops) > 1);
-
-  using VisitorImpl2x<Ops...>::VisitorImpl2x;
-
-  template<class CallbacksImpl>
-  struct Callbacks : CallbacksImpl {
-    CUTLASS_DEVICE
-    Callbacks(CallbacksImpl&& impl)
-      : CallbacksImpl(cute::forward<CallbacksImpl>(impl)) {}
-
-    using CallbacksImpl::callbacks_tuple;
-
-    template <typename ElementAccumulator, int FragmentSize>
-    CUTLASS_DEVICE auto
-    visit(int iter_idx, int row_idx, int column_idx, int frg_idx,
-          Array<ElementAccumulator, FragmentSize> const& frg_acc) {
-      constexpr int Rm1 = sizeof...(Ops) - 1;
-      auto frg_compute_tuple = cute::repeat<Rm1>(Array<ElementCompute, FragmentSize>{});
-
-      return cute::detail::tapply(EdgeTuple{}, callbacks_tuple, frg_compute_tuple,
-        // Visit the first R-1 ops in topological order
-        [&] (auto&& edge_seq, auto& callbacks, auto& frg_compute) {
-          frg_compute = cute::detail::apply(frg_compute_tuple,
-          // Compute the current op with children inputs
-          [&] (auto const&... frg_inputs) {
-            auto frg_output = callbacks.visit(iter_idx, row_idx, column_idx, frg_idx, frg_acc, frg_inputs...);
-            using ElementOutput = typename decltype(frg_output)::Element;
-            using ConvertOutput = NumericArrayConverter<ElementCompute, ElementOutput, FragmentSize>;
-            ConvertOutput convert_output{};
-
-            return convert_output(frg_output);
-          },
-          // Get inputs in the sequence given by the children indices of the current op
-          edge_seq
-        );
-        return frg_compute;
-      },
-      // Visit the last op
-      [&] (auto const&...ops) {
-        return cute::detail::apply(frg_compute_tuple,
-          // Compute the last op with children inputs
-          [&] (auto const&... frg_inputs) {
-            return get<Rm1>(callbacks_tuple).visit(iter_idx, row_idx, column_idx, frg_idx, frg_acc, frg_inputs...);
-          },
-          // Get inputs in the sequence given by the children indices of the last op
-          get<Rm1>(EdgeTuple{})
-        );
-      },
-      // Transform to visit R-1 ops, apply to visit last op
-      make_seq<Rm1>{}
-      );
-    }
-  };
-
-  // Callbacks factory
-  template <class ProblemShape>
-  CUTLASS_DEVICE auto
-  get_callbacks(
-    gemm::GemmCoord threadblock_tile_offset,
-    int thread_idx,
-    ProblemShape problem_shape
-  ) {
-    return Callbacks<decltype(
-      VisitorImpl2x<Ops...>::
-      get_callbacks(
-        threadblock_tile_offset,
-        thread_idx,
-        problem_shape
-      ))>(
-      VisitorImpl2x<Ops...>::
-      get_callbacks(
-        threadblock_tile_offset,
-        thread_idx,
-        problem_shape
-      )
-    );
-  }
-};
-
-
-template <class NodeOp, class... ChildOps>
-using Sm80EVT = TreeVisitor2x<NodeOp, ChildOps...>;
-
-template<
-  class ElementCompute,
-  class EdgeTuple,
-  class... Ops
->
-using Sm80TopologicalVisitor = TopologicalVisitor2x<ElementCompute, EdgeTuple, Ops...>;
-
-
-using X = Underscore;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// OutputTileThreadLayout translate the CUTLASS 2.X OutputTileOptimalThreadMap into cute layout
-// used by CUTLASS 3.X Epilogue
-template <
-  typename ThreadblockShape_,
-  typename WarpShape_,
-  typename Element_,
-  int ElementsPerAccess,
-  int Stages_=1
->
-struct OutputTileThreadLayout: DefaultThreadMapTensorOp<
-  ThreadblockShape_,
-  WarpShape_,
-  ThreadblockShape_::kK/WarpShape_::kK,
-  Element_,
-  ElementsPerAccess>::Type {
-
-  using Base = typename DefaultThreadMapTensorOp<
-    ThreadblockShape_,
-    WarpShape_,
-    ThreadblockShape_::kK/WarpShape_::kK,
-    Element_,
-    ElementsPerAccess>::Type;
-  using Base::Base;
-
-  // Software pipeline stages in epilogue
-  static_assert(Stages_ <= 2, "Sm80 EVT only support upto 2 Stages.");
-  static const int Stages = Stages_;
-
-  using ThreadShape = cute::Shape<
-    cute::Int<Base::Detail::kAccessWidth>,                 // lane col idx
-    cute::Int<Base::Detail::kAccessRows>,                  // lane row idx
-    cute::Int<Base::Detail::kWarpsRemainingForRows>,       // warp row idx
-    cute::Int<Base::Shape::kGroup>,                        // group idx
-    cute::Int<Base::Shape::kCluster>                       // cluster idx
-  >;
-
-  using Shape = typename Base::Shape;
-  using Count = typename Base::Count;
-
-  using ThreadMapShape = cute::Shape<
-    // Column
-    Int<Base::kElementsPerAccess>,                // vector
-    Int<Base::Detail::kAccessWidth>,              // lane_col_coord
-    Int<Base::Iterations::kColumn>,               // iteration::column
-    // Row
-    Int<Base::Detail::kAccessRows>,               // lane_row_coord
-    Int<Base::Iterations::kRow>,                  // iterations in row
-    Int<Base::Detail::kWarpsRemainingForRows>,    // warp_row_coord
-    Int<Count::kRow>,                             // iteration::row
-    Int<Count::kGroup>,                           // iteration::group
-    Int<Shape::kGroup>,                           // group_coord
-    Int<Count::kCluster>,                         // iteration::cluster
-    Int<Shape::kCluster>                          // cluster_coord
-  >;
-
-  // The shape of CTA Tile
-  using CtaShapeMNL = cute::Shape<
-    Int<
-      Shape::kRow * Count::kRow *
-      Shape::kGroup * Count::kGroup *
-      Shape::kCluster * Count::kCluster
-    >,
-    Int<Shape::kColumn * Count::kColumn>,
-    _1
-  >;
-
-  static const int kElementsPerAccess = ElementsPerAccess;
-
-  //
-  // Methods
-  //
-
-  CUTLASS_DEVICE
-  static auto tid2coord(int thread_idx) {
-    return cute::idx2crd(thread_idx, ThreadShape{});
-  }
-
-  template <class TensorInput>
-  CUTLASS_DEVICE
-  static auto partition(TensorInput &&xT, int thread_idx, gemm::GemmCoord threadblock_tile_offset) {
-
-    // (BLK_M,BLK_N)
-    Tensor bCxT = local_tile(
-      xT, CtaShapeMNL{}, make_coord(_,_,_), Step<_1,_1, X>{}
-    )(_,_,threadblock_tile_offset.m(),threadblock_tile_offset.n(),threadblock_tile_offset.k());
-
-    auto [lane_col_coord, lane_row_coord, warp_row_coord, group_coord, cluster_coord] = tid2coord(thread_idx);
-
-    // transform to column-major
-    Tensor bCxT_nm = make_tensor(
-      std::forward<decltype(bCxT)>(bCxT).data(), make_layout(get<1>(bCxT.layout()), get<0>(bCxT.layout()))
-    ).compose(make_layout(ThreadMapShape{}));
-    // VECTOR, FRAGMENT_COLUMN, FRAGMENT_ROW, ITERATION_ROW, ITERATION_GROUP, ITERATION_CLUSTER
-    return bCxT_nm(_,lane_col_coord,_,lane_row_coord,_,warp_row_coord,_,_,group_coord,_,cluster_coord);
-  }
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::epilogue::threadblock
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitor_compute.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitor_compute.hpp
deleted file mode 100755
index 69a0feab2..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitor_compute.hpp
+++ /dev/null
@@ -1,109 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-  \brief Visitor tree compute operations for the CUTLASS 2x epilogue
-*/
-
-#pragma once
-
-#include "cutlass/epilogue/threadblock/fusion/visitor_2x.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::epilogue::threadblock {
-
-using namespace cute;
-using namespace detail;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// N-nary Elementwise Compute Operation
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template<
-  template <class> class ComputeFn,
-  class ElementOutput,
-  class ElementCompute,
-  FloatRoundStyle RoundStyle,
-  class = void
->
-struct VisitorCompute : VisitorImpl2x<> {
-
-  using VisitorImpl2x<>::VisitorImpl2x;
-
-  struct Callbacks : EmptyCallbacks {
-    template <typename ElementAccumulator, typename... ElementInputs, int FragmentSize>
-    CUTLASS_DEVICE Array<ElementOutput, FragmentSize>
-    visit(int iter_idx, int row_idx, int column_idx, int frg_idx, 
-          Array<ElementAccumulator, FragmentSize> const& frg_acc,
-          Array<ElementInputs, FragmentSize> const&... frg_inputs) {
-      return transform_apply(cute::make_tuple(frg_inputs...),
-        [&] (auto&& frg_input) {
-          using ElementInput = typename cute::remove_cvref_t<decltype(frg_input)>::Element;
-          using ConvertInput = NumericArrayConverter<ElementCompute, ElementInput, FragmentSize, RoundStyle>;
-          ConvertInput convert_input{};
-
-          return convert_input(frg_input);
-        },
-        [&] (auto&&... cvt_frg_inputs) {
-          using ComputeOutput = ComputeFn<Array<ElementCompute, FragmentSize>>;
-          using ConvertOutput = NumericArrayConverter<ElementOutput, ElementCompute, FragmentSize, RoundStyle>;
-          ComputeOutput compute_output{};
-          ConvertOutput convert_output{};
-
-          return convert_output(compute_output(cvt_frg_inputs...));
-        }
-      );
-    }
-
-  };
-
-  template <class ProblemShape>
-  CUTLASS_DEVICE auto
-  get_callbacks(
-    gemm::GemmCoord threadblock_tile_offset,
-    int thread_idx,
-    ProblemShape problem_shape
-  ) {
-    return Callbacks();
-  }
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::epilogue::threadblock
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitor_load.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitor_load.hpp
deleted file mode 100755
index 7a332f11f..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitor_load.hpp
+++ /dev/null
@@ -1,583 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-  \brief Visitor tree load operations for the CUTLASS 2x epilogue
-*/
-
-#pragma once
-
-#include "cutlass/epilogue/threadblock/fusion/visitor_2x.hpp"
-#include "cute/tensor.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::epilogue::threadblock {
-
-using namespace cute;
-using namespace detail;
-
-using X = Underscore;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Elementwise Fetch Operations
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// returns accumulator
-struct VisitorAccFetch : VisitorImpl2x<> {
-
-  using VisitorImpl2x<>::VisitorImpl2x;
-
-  struct Callbacks : EmptyCallbacks {
-    template <class ElementAccumulator, int FragmentSize>
-    CUTLASS_DEVICE Array<ElementAccumulator, FragmentSize>
-    visit(int iter_idx, int row_idx, int column_idx, int frg_idx, Array<ElementAccumulator, FragmentSize> const& frg_acc) {
-      return frg_acc;
-    }
-  };
-
-  template <class ProblemShape>
-  CUTLASS_DEVICE auto
-  get_callbacks(
-    gemm::GemmCoord threadblock_tile_offset,
-    int thread_idx,
-    ProblemShape problem_shape
-  ) {
-    return Callbacks{};
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Broadcast Load Operations
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Scalar broadcast
-template<
-  class Element,
-  class StrideMNL = Stride<_0,_0,_0>,
-  int BroadcastCount = 1,
-  template <class> class ReductionFn = multiplies
->
-struct VisitorScalarBroadcast {
-  static_assert(
-    (cute::is_same_v<StrideMNL, Stride<_0,_0,_0>>) || // scalar broadcast, e.g. alpha
-    (cute::is_same_v<StrideMNL, Stride<_0,_0,_1>>) ||
-    (cute::is_same_v<StrideMNL, Stride<_0,_0,int>>));  // batched scalar broadcast, e.g. per-batch alpha
-
-  struct SharedStorage { };
-
-  struct Arguments {
-    Element scalars[BroadcastCount] = {};
-    Element const* scalar_ptrs[BroadcastCount] = {};
-    StrideMNL dScalar = {};
-  };
-
-  using Params = Arguments;
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    return args;
-  }
-
-  template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
-    return 0;
-  }
-
-  CUTLASS_HOST_DEVICE
-  VisitorScalarBroadcast() { }
-
-  CUTLASS_HOST_DEVICE
-  VisitorScalarBroadcast(Params const& params, SharedStorage const& shared_storage)
-      : params_ptr(&params) {
-    // Get the scalar for non-batched broadcast
-    if constexpr (cute::is_same_v<StrideMNL, Stride<_0,_0,_0>>) {
-      update_scalar();
-    }
-  }
-
-  Element scalar;
-  Params const* params_ptr;
-
-  struct Callbacks: EmptyCallbacks {
-    CUTLASS_DEVICE
-    Callbacks(Element scalar)
-      : scalar(scalar) {}
-
-    Element scalar;
-
-    template <class ElementAccumulator, int FragmentSize>
-    CUTLASS_DEVICE auto // returns an Array
-    visit(int iter_idx, int row_idx, int column_idx, int frg_idx,
-          Array<ElementAccumulator, FragmentSize> const& frg_acc) {
-      Array<Element, FragmentSize> frg_scalar;
-      frg_scalar.fill(scalar);
-
-      return frg_scalar;
-    }
-  };
-
-  template <class ProblemShape>
-  CUTLASS_DEVICE auto
-  get_callbacks(
-    gemm::GemmCoord threadblock_tile_offset,
-    int thread_idx,
-    ProblemShape problem_shape
-  ) {
-    // Get the scalar for batched broadcast
-    if constexpr (
-      cute::is_same_v<StrideMNL, Stride<_0,_0,_1>> ||
-      cute::is_same_v<StrideMNL, Stride<_0,_0,int>>) {
-      update_scalar(threadblock_tile_offset.k());
-    }
-    return Callbacks(scalar);
-  }
-
-private:
-  CUTLASS_DEVICE void
-  update_scalar(int l_coord = 0) {
-    int l_offset = l_coord * size<2>(params_ptr->dScalar);
-
-    if (params_ptr->scalar_ptrs[0] != nullptr) {
-      scalar = params_ptr->scalar_ptrs[0][l_offset];
-    } else {
-      // batch stride is ignored for nullptr fallback
-      scalar = params_ptr->scalars[0];
-    }
-
-    // Do reduction over multiple broadcasts if necessary
-    ReductionFn<Element> reduction_fn;
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 1; i < BroadcastCount; ++i) {
-      if (params_ptr->scalar_ptrs[i] != nullptr) {
-        scalar = reduction_fn(scalar, params_ptr->scalar_ptrs[i][l_offset]);
-      } else {
-        // batch stride is ignored for nullptr fallback
-        scalar = reduction_fn(scalar, params_ptr->scalars[i]);
-      }
-    }
-  }
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Elementwise Load Operations
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template<
-  class ThreadMap,
-  class Element,
-  class StrideMNL
->
-struct VisitorAuxLoad{
-
-  struct Arguments {
-    Element* ptr_aux = nullptr;
-    Element null_default = Element(0);
-    StrideMNL dAux = {};
-  };
-
-  using Params = Arguments;
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    return args;
-  }
-
-  template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
-    return 0;
-  }
-
-  // Software pipeline stages
-  static const int Stages = ThreadMap::Stages;
-
-  struct SharedStorage {};
-
-  // Global load type
-  static int constexpr vec_bits = ThreadMap::kElementsPerAccess * sizeof_bits<Element>::value;
-  using VecType = uint_bit_t<cute::min(128, vec_bits)>;
-  static int constexpr VecLength = sizeof(VecType) / sizeof(Element);
-
-  CUTLASS_HOST_DEVICE
-  VisitorAuxLoad() { }
-
-  CUTLASS_HOST_DEVICE
-  VisitorAuxLoad(Params const& params, SharedStorage const& shared_storage)
-    : params_ptr(&params) { }
-
-  Params const* params_ptr;
-
-  template <class GTensor, class RTensor, class CTensor, class ProblemShape>
-  struct Callbacks : EmptyCallbacks {
-    CUTLASS_DEVICE
-    Callbacks(
-      GTensor&& tC_gAux,
-      RTensor&& tC_rAux,
-      CTensor&& tC_cAux,
-      ProblemShape problem_shape,
-      Params const* params_ptr
-    ):
-      tC_gAux(cute::forward<GTensor>(tC_gAux)),
-      tC_rAux(cute::forward<RTensor>(tC_rAux)),
-      tC_cAux(cute::forward<CTensor>(tC_cAux)),
-      problem_shape(problem_shape),
-      params_ptr(params_ptr) { }
-
-    GTensor tC_gAux;
-    RTensor tC_rAux;
-    CTensor tC_cAux;
-    Params const* params_ptr;
-    ProblemShape problem_shape;
-
-    CUTLASS_DEVICE void
-    begin_step(int step_idx) {
-      clear(tC_rAux(_,_,_,step_idx%Stages));
-      auto src_v = filter(tC_gAux(_,_,_,step_idx));
-      auto coord_v = filter(tC_cAux(_,_,_,step_idx));
-      auto dst_v = filter(tC_rAux(_,_,_,step_idx%Stages));
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < size(src_v); ++i) {
-        bool guard = elem_less(coord_v(i), problem_shape);
-        cutlass::arch::global_load<VecType, sizeof(VecType)>(dst_v(i), (void const*)&src_v(i), guard);
-      }
-    }
-
-    template <class ElementAccumulator, int FragmentSize>
-    CUTLASS_DEVICE auto // returns an Array
-    visit(int iter_idx, int row_idx, int column_idx, int frg_idx,
-          Array<ElementAccumulator, FragmentSize> const& frg_acc) {
-      Tensor tC_rAux_frg = recast<Array<Element, FragmentSize>>(coalesce(tC_rAux(_,_,_,iter_idx%Stages)));
-      return tC_rAux_frg(frg_idx);
-    }
-  };
-
-  template <class ProblemShape>
-  CUTLASS_DEVICE auto
-  get_callbacks(
-    gemm::GemmCoord threadblock_tile_offset,
-    int thread_idx,
-    ProblemShape problem_shape
-  ) {
-    Tensor mAux = make_tensor(
-      make_gmem_ptr(params_ptr->ptr_aux),
-      problem_shape,
-      params_ptr->dAux);   // (M,N,L)
-    // VECTOR, FRAGMENT_COLUMN, FRAGMENT_ROW, ITERATION_ROW, ITERATION_GROUP, ITERATION_CLUSTER
-    Tensor tC_gAux = recast<VecType>(
-      group_modes<3,6>(ThreadMap::partition(mAux, thread_idx, threadblock_tile_offset)));
-    // VECTOR, FRAGMENT_COLUMN, FRAGMENT_ROW, Stages
-    Tensor tC_rAux = make_tensor<VecType>(
-      make_layout(flatten(make_shape(take<0,3>(tC_gAux.shape()), Int<Stages>{}))));
-
-    // Generate the pred tensor
-    Tensor cAux = make_identity_tensor(mAux.shape());
-    Tensor tC_cAux = outer_partition(
-      group_modes<3,6>(ThreadMap::partition(cAux, thread_idx, threadblock_tile_offset)),
-      Shape<Int<VecLength>>{},
-      (_0{})
-    );
-
-    return Callbacks<
-      decltype(tC_gAux), decltype(tC_rAux),
-      decltype(tC_cAux), ProblemShape>(
-      cute::move(tC_gAux),
-      cute::move(tC_rAux),
-      cute::move(tC_cAux),
-      problem_shape,
-      params_ptr
-    );
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Row vector broadcast
-template<
-  class ThreadMap,
-  class Element,
-  class StrideMNL
->
-struct VisitorRowBroadcast {
-
-  struct Arguments {
-    Element const* ptr_row = nullptr;
-    Element null_default = Element(0);
-    StrideMNL dRow = {};
-  };
-
-  using Params = Arguments;
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    return args;
-  }
-
-  template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
-    return 0;
-  }
-
-  struct SharedStorage {};
-
-  // Global load type
-  static int constexpr vec_bits = ThreadMap::kElementsPerAccess * sizeof_bits<Element>::value;
-  using VecType = uint_bit_t<cute::min(128, vec_bits)>;
-  static int constexpr VecLength = sizeof(VecType) / sizeof(Element);
-
-  CUTLASS_HOST_DEVICE
-  VisitorRowBroadcast() { }
-
-  CUTLASS_HOST_DEVICE
-  VisitorRowBroadcast(Params const& params, SharedStorage const& shared_storage)
-    : params_ptr(&params) { }
-
-  Params const* params_ptr;
-
-  template <class GTensor, class RTensor, class CTensor, class ProblemShape>
-  struct Callbacks : EmptyCallbacks {
-    CUTLASS_DEVICE
-    Callbacks(
-      GTensor&& tC_gRow,
-      RTensor&& tC_rRow,
-      CTensor&& tC_cRow,
-      ProblemShape problem_shape,
-      Params const* params_ptr
-    ):
-      tC_gRow(cute::forward<GTensor>(tC_gRow)),
-      tC_rRow(cute::forward<RTensor>(tC_rRow)),
-      tC_cRow(cute::forward<CTensor>(tC_cRow)),
-      n(get<1>(problem_shape)),
-      params_ptr(params_ptr) { }
-
-    GTensor tC_gRow;
-    RTensor tC_rRow;
-    CTensor tC_cRow;
-    Params const* params_ptr;
-    int n;
-
-    CUTLASS_DEVICE void
-    begin_epilogue() {
-      clear(tC_rRow);
-      auto src_v = filter(tC_gRow);
-      auto coord_v = filter(tC_cRow);
-      auto dst_v = filter(tC_rRow);
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < size(src_v); ++i) {
-        bool guard = get<1>(coord_v(i)) < n;
-        cutlass::arch::global_load<VecType, sizeof(VecType)>(dst_v(i), (void const*)&src_v(i), guard);
-      }
-    }
-
-    template <class ElementAccumulator, int FragmentSize>
-    CUTLASS_DEVICE auto // returns an Array
-    visit(int iter_idx, int row_idx, int column_idx, int frg_idx,
-          Array<ElementAccumulator, FragmentSize> const& frg_acc) {
-      Tensor rRow_frg = recast<Array<Element, FragmentSize>>(coalesce(tC_rRow));
-      return rRow_frg(column_idx);
-    }
-  };
-
-  template <class ProblemShape>
-  CUTLASS_DEVICE auto
-  get_callbacks(
-    gemm::GemmCoord threadblock_tile_offset,
-    int thread_idx,
-    ProblemShape problem_shape
-  ) {
-    Tensor mRow = make_tensor(
-      make_gmem_ptr(params_ptr->ptr_row),
-      problem_shape,
-      params_ptr->dRow);
-
-    // VECTOR, FRAGMENT_COLUMN
-    Tensor tC_gRow = recast<VecType>(
-      ThreadMap::partition(mRow, thread_idx, threadblock_tile_offset)
-    )(_,_,_0{},_0{},_0{},_0{});
-    Tensor tC_rRow = make_tensor_like(tC_gRow);
-
-    // Generate the pred tensor
-    Tensor cRow = make_identity_tensor(mRow.shape());
-    Tensor tC_cRow = outer_partition(
-      ThreadMap::partition(cRow, thread_idx, threadblock_tile_offset)(_,_,_0{},_0{},_0{},_0{}),
-      Shape<Int<VecLength>>{},
-      (_0{})
-    );
-
-    return Callbacks<
-      decltype(tC_gRow), decltype(tC_rRow),
-      decltype(tC_cRow), ProblemShape>(
-      cute::move(tC_gRow),
-      cute::move(tC_rRow),
-      cute::move(tC_cRow),
-      problem_shape,
-      params_ptr
-    );
-  }
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Column vector broadcast
-template<
-  class ThreadMap,
-  class Element,
-  class StrideMNL = Stride<_1,_0,_0>
->
-struct VisitorColBroadcast {
-
-  struct Arguments {
-    Element const* ptr_col = nullptr;
-    Element null_default = Element(0);
-    StrideMNL dCol = {};
-  };
-
-  using Params = Arguments;
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    return args;
-  }
-
-  template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
-    return 0;
-  }
-
-  struct SharedStorage { };
-
-  CUTLASS_HOST_DEVICE
-  VisitorColBroadcast() { }
-
-  CUTLASS_HOST_DEVICE
-  VisitorColBroadcast(Params const& params, SharedStorage const& shared_storage)
-    : params_ptr(&params) { }
-
-  Params const* params_ptr;
-
-  template <class GTensor, class RTensor, class CTensor, class ProblemShape>
-  struct Callbacks : EmptyCallbacks {
-    CUTLASS_DEVICE
-    Callbacks(
-      GTensor&& tC_gCol,
-      RTensor&& tC_rCol,
-      CTensor&& tC_cCol,
-      ProblemShape problem_shape,
-      Params const* params_ptr
-    ):
-      tC_gCol(cute::forward<GTensor>(tC_gCol)),
-      tC_rCol(cute::forward<RTensor>(tC_rCol)),
-      tC_cCol(cute::forward<CTensor>(tC_cCol)),
-      m(get<0>(problem_shape)),
-      params_ptr(params_ptr) { }
-
-    GTensor tC_gCol;
-    RTensor tC_rCol;
-    CTensor tC_cCol;
-    Params const* params_ptr;
-    int m;
-
-    CUTLASS_DEVICE void
-    begin_epilogue() {
-      clear(tC_rCol);
-      Tensor pred = make_tensor<bool>(shape(tC_gCol));
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < size(pred); ++i) {
-        pred(i) = get<0>(tC_cCol(i)) < m;
-      }
-      copy_if(pred, tC_gCol, tC_rCol);
-    }
-
-    template <class ElementAccumulator, int FragmentSize>
-    CUTLASS_DEVICE auto // returns an Array
-    visit(int iter_idx, int row_idx, int column_idx, int frg_idx,
-          Array<ElementAccumulator, FragmentSize> const& frg_acc) {
-      Array<Element, FragmentSize> frg_col;
-      frg_col.fill(tC_rCol(row_idx,iter_idx));
-      return frg_col;
-    }
-  };
-
-  template <class ProblemShape>
-  CUTLASS_DEVICE auto
-  get_callbacks(
-    gemm::GemmCoord threadblock_tile_offset,
-    int thread_idx,
-    ProblemShape problem_shape
-  ) {
-    Tensor mCol = make_tensor(
-      make_gmem_ptr(params_ptr->ptr_col),
-      problem_shape,
-      params_ptr->dCol);
-
-    // VECTOR, FRAGMENT_COLUMN, FRAGMENT_ROW, ITERATION_ROW, ITERATION_GROUP, ITERATION_CLUSTER
-    Tensor tC_gCol = group_modes<1,4>(
-      ThreadMap::partition(mCol, thread_idx, threadblock_tile_offset)(_0{},_0{},_,_,_,_));
-    Tensor tC_rCol = make_tensor_like(tC_gCol);
-
-    // Generate the pred tensor
-    Tensor cCol = make_identity_tensor(mCol.shape());
-    Tensor tC_cCol = group_modes<1,4>(
-      ThreadMap::partition(cCol, thread_idx, threadblock_tile_offset)(_0{},_0{},_,_,_,_));
-
-    return Callbacks<
-      decltype(tC_gCol), decltype(tC_rCol),
-      decltype(tC_cCol), ProblemShape>(
-      cute::move(tC_gCol),
-      cute::move(tC_rCol),
-      cute::move(tC_cCol),
-      problem_shape,
-      params_ptr
-    );
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::epilogue::threadblock
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitor_store.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitor_store.hpp
deleted file mode 100755
index 1c24e22d5..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitor_store.hpp
+++ /dev/null
@@ -1,805 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-  \brief Visitor tree store operations for the CUTLASS 2x epilogue
-*/
-
-#pragma once
-
-#include "cutlass/epilogue/threadblock/fusion/visitor_2x.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::epilogue::threadblock {
-
-using namespace cute;
-using namespace detail;
-using X = Underscore;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Elementwise Store Operations
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template<
-  class ThreadMap,
-  class Element,
-  FloatRoundStyle RoundStyle,
-  class StrideMNL
->
-struct VisitorAuxStore{
-
-  struct Arguments {
-    Element* ptr_aux = nullptr;
-    StrideMNL dAux = {};
-  };
-
-  using Params = Arguments;
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    return args;
-  }
-
-  template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
-    return 0;
-  }
-
-  struct SharedStorage {};
-
-  static int constexpr vec_bits = ThreadMap::kElementsPerAccess * sizeof_bits<Element>::value;
-  using VecType = uint_bit_t<cute::min(128, vec_bits)>;
-  static int constexpr VecLength = sizeof(VecType) / sizeof(Element);
-
-  CUTLASS_HOST_DEVICE
-  VisitorAuxStore() { }
-
-  CUTLASS_HOST_DEVICE
-  VisitorAuxStore(Params const& params, SharedStorage const& shared_storage)
-    : params_ptr(&params) { }
-
-  Params const* params_ptr;
-
-  template <class GTensor, class RTensor, class CTensor, class ProblemShape>
-  struct Callbacks : EmptyCallbacks {
-    CUTLASS_DEVICE
-    Callbacks(
-      GTensor&& tC_gAux,
-      RTensor&& tC_rAux,
-      CTensor&& tC_cAux,
-      ProblemShape problem_shape,
-      Params const* params_ptr
-    ):
-      tC_gAux(cute::forward<GTensor>(tC_gAux)),
-      tC_rAux(cute::forward<RTensor>(tC_rAux)),
-      tC_cAux(cute::forward<CTensor>(tC_cAux)),
-      problem_shape(problem_shape),
-      params_ptr(params_ptr) { }
-
-    GTensor tC_gAux;
-    RTensor tC_rAux;
-    CTensor tC_cAux;
-    Params const* params_ptr;
-    ProblemShape problem_shape;
-
-    CUTLASS_DEVICE void
-    begin_step(int step_idx) {
-      clear(tC_rAux);
-    }
-
-    template <class ElementAccumulator, class ElementInput, int FragmentSize>
-    CUTLASS_DEVICE auto // returns an Array
-    visit(int iter_idx, int row_idx, int column_idx, int frg_idx,
-          Array<ElementAccumulator, FragmentSize> const& frg_acc,
-          Array<ElementInput, FragmentSize> const& frg_input) {
-      using ConvertInput = NumericArrayConverter<Element, ElementInput, FragmentSize, RoundStyle>;
-      ConvertInput convert_input{};
-
-      Tensor tC_rAux_frg = recast<Array<Element, FragmentSize>>(coalesce(tC_rAux));
-      tC_rAux_frg(frg_idx) = convert_input(frg_input);
-
-      return frg_input;
-    }
-
-    CUTLASS_DEVICE void
-    end_step(int step_idx) {
-      auto src_v = filter(tC_rAux);
-      auto coord_v = filter(tC_cAux(_,_,_,step_idx));
-      auto dst_v = filter(tC_gAux(_,_,_,step_idx));
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < size(src_v); ++i) {
-        bool guard = elem_less(coord_v(i), problem_shape);
-        cutlass::arch::global_store<VecType, sizeof(VecType)>(src_v(i), (void*)&dst_v(i), guard);
-      }
-    }
-
-  };
-
-  template <class ProblemShape>
-  CUTLASS_DEVICE auto
-  get_callbacks(
-    gemm::GemmCoord threadblock_tile_offset,
-    int thread_idx,
-    ProblemShape problem_shape
-  ) {
-    Tensor mAux = make_tensor(
-      make_gmem_ptr(params_ptr->ptr_aux),
-      problem_shape,
-      params_ptr->dAux);   // (M,N,L)
-    // VECTOR, FRAGMENT_COLUMN, FRAGMENT_ROW, ITERATION_ROW, ITERATION_GROUP, ITERATION_CLUSTER
-    Tensor tC_gAux = recast<VecType>(group_modes<3,6>(ThreadMap::partition(mAux, thread_idx, threadblock_tile_offset)));
-    Tensor tC_rAux = make_tensor_like(take<0,3>(tC_gAux));
-
-    // Generate the pred tensor
-    Tensor cAux = make_identity_tensor(mAux.shape());
-    Tensor tC_cAux = outer_partition(
-      group_modes<3,6>(ThreadMap::partition(cAux, thread_idx, threadblock_tile_offset)),
-      Shape<Int<VecLength>>{},
-      (_0{})
-    );
-
-    return Callbacks<
-      decltype(tC_gAux), decltype(tC_rAux),
-      decltype(tC_cAux), ProblemShape>(
-      cute::move(tC_gAux),
-      cute::move(tC_rAux),
-      cute::move(tC_cAux),
-      problem_shape,
-      params_ptr
-    );
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Reduction Store Operations
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Helper functions
-template <
-  template <class> class ReduceFn,
-  int kThreads, class T>
-CUTLASS_DEVICE
-void intra_warp_row_reduce(T& value) {
-  using ReduceInput = ReduceFn<T>;
-  ReduceInput reduce_input{};
-  constexpr int kHalfThreads = kThreads >> 1;
-  CUTLASS_PRAGMA_UNROLL
-  for (int i = kHalfThreads; i > 0; i >>= 1) {
-    value = reduce_input(value, __shfl_xor_sync(0xFFFFFFFF, value, i));
-  }
-}
-
-template <
-  template <class> class ReduceFn,
-  FloatRoundStyle RoundStyle,
-  class ElementCompute,
-  class ElementFragment, int FragmentSize>
-CUTLASS_DEVICE
-void fragment_reduce(ElementCompute& value, Array<ElementFragment, FragmentSize> const& frg) {
-  using ReduceInput = ReduceFn<ElementCompute>;
-  ReduceInput reduce_input{};
-  using ConvertInput = NumericConverter<ElementCompute, ElementFragment, RoundStyle>;
-  ConvertInput convert_input{};
-
-  CUTLASS_PRAGMA_UNROLL
-  for (int i = 0; i < FragmentSize; ++i) {
-    value = reduce_input(value, convert_input(frg[i]));
-  }
-}
-
-template<
-  template <class> class AtomicReduceFn,
-  FloatRoundStyle RoundStyle,
-  class ElementCompute,
-  class ElementOutput>
-CUTLASS_DEVICE
-void atomic_reduce(ElementOutput* ptr, ElementCompute const& value) {
-  using ReduceOutput = AtomicReduceFn<ElementOutput>;
-  using ConvertOutput = NumericConverter<ElementOutput, ElementCompute, RoundStyle>;
-  ReduceOutput reduce_output{};
-  ConvertOutput convert_output{};
-
-  reduce_output(ptr, convert_output(value));
-}
-
-// Col vector reduction
-template <
-  template <class> class RegReduceFn,
-  template <class> class AtomicReduceFn,
-  class ThreadMap,
-  class ElementOutput,
-  class ElementCompute,
-  FloatRoundStyle RoundStyle,
-  class StrideMNL = Stride<_1,_0,_0>
->
-struct VisitorColReduction {
-
-  struct Arguments {
-    ElementOutput* ptr_col = nullptr;
-    ElementCompute reduction_identity = 0;
-    StrideMNL dCol = {};
-  };
-
-  using Params = Arguments;
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    return args;
-  }
-
-  template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
-    return 0;
-  }
-
-  struct SharedStorage { };
-
-  CUTLASS_HOST_DEVICE
-  VisitorColReduction() { }
-
-  CUTLASS_HOST_DEVICE
-  VisitorColReduction(Params const& params, SharedStorage const& shared_storage)
-    : params_ptr(&params) { }
-
-  Params const* params_ptr;
-
-  template <class GTensor, class CTensor, class ProblemShape>
-  struct Callbacks : EmptyCallbacks {
-    CUTLASS_DEVICE
-    Callbacks(
-      GTensor&& tC_gCol,
-      CTensor&& tC_cCol,
-      ProblemShape problem_shape,
-      Params const* params_ptr,
-      int thread_idx
-    ):
-      tC_gCol(cute::forward<GTensor>(tC_gCol)),
-      tC_cCol(cute::forward<CTensor>(tC_cCol)),
-      m(get<0>(problem_shape)),
-      n(get<1>(problem_shape)),
-      params_ptr(params_ptr) {
-        // The partial reduction results of each warp are further
-        // reduced to the first thread in each row.
-        // Only the first thread in each row is the writing thread
-        is_writing_thread = thread_idx % ThreadMap::Detail::kAccessWidth == 0;
-      }
-
-    GTensor tC_gCol;
-    CTensor tC_cCol;
-    Params const* params_ptr;
-    int m;
-    int n;
-    int curr_iter_idx;
-    bool is_writing_thread;
-
-    ElementCompute reduction_accum;
-
-    CUTLASS_DEVICE void
-    begin_row(int row_idx) {
-      reduction_accum = ElementCompute(params_ptr->reduction_identity);
-    }
-
-    template <class ElementAccumulator, class ElementInput, int FragmentSize>
-    CUTLASS_DEVICE auto // returns an Array
-    visit(int iter_idx, int row_idx, int column_idx, int frg_idx,
-          Array<ElementAccumulator, FragmentSize> const& frg_acc,
-          Array<ElementInput, FragmentSize> const& frg_input) {
-
-      curr_iter_idx = iter_idx;
-
-      int coord_n = get<1>(tC_cCol(column_idx, row_idx, iter_idx));
-      if (coord_n < n) {
-        fragment_reduce<RegReduceFn, RoundStyle>(reduction_accum, frg_input);
-      }
-
-      // Intra-warp reduction
-      if (column_idx + 1 == ThreadMap::Iterations::kColumn) {
-        intra_warp_row_reduce<RegReduceFn, ThreadMap::Detail::kAccessWidth>(reduction_accum);
-      }
-
-      return frg_input;
-    }
-
-    CUTLASS_DEVICE auto
-    end_row(int row_idx) {
-      bool guard = get<0>(tC_cCol(_0{}, row_idx,curr_iter_idx)) < m;
-
-      if (guard && is_writing_thread) {
-        atomic_reduce<AtomicReduceFn, RoundStyle>(&tC_gCol(row_idx,curr_iter_idx), reduction_accum);
-      }
-    }
-  };
-
-  template <class ProblemShape>
-  CUTLASS_DEVICE auto
-  get_callbacks(
-    gemm::GemmCoord threadblock_tile_offset,
-    int thread_idx,
-    ProblemShape problem_shape
-  ) {
-
-    Tensor mCol = make_tensor(
-      make_gmem_ptr(params_ptr->ptr_col),
-      problem_shape,
-      params_ptr->dCol);
-    // FRAGMENT_ROW, (ITERATION_ROW, ITERATION_GROUP, ITERATION_CLUSTER)
-    Tensor tC_gCol = group_modes<1,4>(
-      ThreadMap::partition(mCol, thread_idx, threadblock_tile_offset)(_0{},_0{},_,_,_,_));
-
-    // Generate the pred tensor
-    Tensor cCol = make_identity_tensor(mCol.shape());
-    // FRAGMENT_COL, FRAGMENT_ROW, (ITERATION_ROW, ITERATION_GROUP, ITERATION_CLUSTER)
-    Tensor tC_cCol = group_modes<2,5>(
-      ThreadMap::partition(cCol, thread_idx, threadblock_tile_offset)(_0{},_,_,_,_,_));
-
-    return Callbacks<
-      decltype(tC_gCol), decltype(tC_cCol),
-      ProblemShape>(
-      cute::move(tC_gCol),
-      cute::move(tC_cCol),
-      problem_shape,
-      params_ptr,
-      thread_idx
-    );
-  }
-};
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Row vector reduction
-template <
-  template <class> class RegReduceFn,
-  template <class> class AtomicReduceFn,
-  class ThreadMap,
-  class ElementOutput,
-  class ElementCompute,
-  FloatRoundStyle RoundStyle,
-  class StrideMNL = Stride<_0,_1,_0>
->
-struct VisitorRowReduction {
-
-  struct Arguments {
-    ElementOutput* ptr_row = nullptr;
-    ElementCompute reduction_identity = 0;
-    StrideMNL dRow = {};
-  };
-
-  using Params = Arguments;
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    return args;
-  }
-
-  template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
-    return 0;
-  }
-
-  using SharedStorageShape = decltype(select<0,1,2,3,5,8,10>(typename ThreadMap::ThreadMapShape{}));
-
-  struct SharedStorage {
-    AlignedArray<ElementCompute, size(SharedStorageShape{}), 16> reduction;
-  };
-
-  static int constexpr vec_bits = ThreadMap::kElementsPerAccess * sizeof_bits<ElementOutput>::value;
-  using VecType = uint_bit_t<cute::min(128, vec_bits)>;
-
-  CUTLASS_HOST_DEVICE
-  VisitorRowReduction() { }
-
-  CUTLASS_HOST_DEVICE
-  VisitorRowReduction(Params const& params, SharedStorage const& shared_storage)
-    : params_ptr(&params),
-      smem_reduce(const_cast<ElementCompute*>(shared_storage.reduction.data())) { }
-
-  Params const* params_ptr;
-  ElementCompute* smem_reduce;
-
-  template <
-    class RTensorR2S, class STensorR2S, class CTensorR2S,
-    class STensorS2R, class RTensorS2R, class CTensorS2R,
-    class GTensor, class CTensor, class ProblemShape>
-  struct Callbacks : EmptyCallbacks {
-    CUTLASS_DEVICE
-    Callbacks(
-      // R->S
-      RTensorR2S&& tRS_rSrc,
-      STensorR2S&& tRS_sRows,
-      CTensorR2S&& tRS_cSrc,
-      // S->R
-      STensorS2R&& tSR_sRows,
-      RTensorS2R&& tSR_rRows,
-      CTensorS2R&& tSR_cRows,
-      // R->G
-      GTensor&& tC_gRow,
-      CTensor&& tC_cRow,
-      ProblemShape problem_shape,
-      Params const* params_ptr
-    ):
-      // R->S
-      tRS_rSrc(cute::forward<RTensorR2S>(tRS_rSrc)),
-      tRS_sRows(cute::forward<STensorR2S>(tRS_sRows)),
-      tRS_cSrc(cute::forward<CTensorR2S>(tRS_cSrc)),
-      // S->R
-      tSR_sRows(cute::forward<STensorS2R>(tSR_sRows)),
-      tSR_rRows(cute::forward<RTensorS2R>(tSR_rRows)),
-      tSR_cRows(cute::forward<CTensorS2R>(tSR_cRows)),
-      // R->G
-      tC_gRow(cute::forward<GTensor>(tC_gRow)),
-      tC_cRow(cute::forward<CTensor>(tC_cRow)),
-      m(get<0>(problem_shape)),
-      n(get<1>(problem_shape)),
-      params_ptr(params_ptr) { }
-
-    // R->S
-    RTensorR2S tRS_rSrc;
-    STensorR2S tRS_sRows;
-    CTensorR2S tRS_cSrc;
-    // S->R
-    STensorS2R tSR_sRows;
-    RTensorS2R tSR_rRows;
-    CTensorS2R tSR_cRows;
-    // R->G
-    GTensor tC_gRow;
-    CTensor tC_cRow;
-
-    Params const* params_ptr;
-    int n;
-    int m;
-
-    CUTLASS_DEVICE void
-    begin_epilogue() {
-      fill(tRS_rSrc, params_ptr->reduction_identity);
-    }
-
-    template <class ElementAccumulator, class ElementInput, int FragmentSize>
-    CUTLASS_DEVICE auto // returns an Array
-    visit(int iter_idx, int row_idx, int column_idx, int frg_idx,
-          Array<ElementAccumulator, FragmentSize> const& frg_acc,
-          Array<ElementInput, FragmentSize> const& frg_input) {
-
-      using ConvertInput = NumericArrayConverter<ElementCompute, ElementInput, FragmentSize, RoundStyle>;
-      ConvertInput convert_input{};
-      Tensor tRS_rRow_frg = recast<Array<ElementCompute, FragmentSize>>(coalesce(tRS_rSrc));
-
-      int coord_m = get<0>(tRS_cSrc(column_idx,row_idx,iter_idx));
-      if (coord_m < m)
-        reduction(tRS_rRow_frg[column_idx], convert_input(frg_input));
-
-      return frg_input;
-    }
-
-    CUTLASS_DEVICE void
-    end_epilogue() {
-      //
-      // Store the partially reduced value to SMEM
-      //
-
-      // Guard against uses of the existing SMEM tile
-      __syncthreads();
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < size(tRS_rSrc); ++i) {
-        copy_vec<VecType>(filter(tRS_rSrc), filter(tRS_sRows));
-      }
-
-      __syncthreads();
-
-      //
-      // Now, threads are assigned several columns of the output. They fetch over all rows from
-      // the compacted SMEM tile and perform a reduction.
-      //
-
-      fill(tSR_rRows, params_ptr->reduction_identity);
-
-      using ReduceInputReg = RegReduceFn<ElementCompute>;
-      ReduceInputReg reduce_input_reg{};
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < size(tSR_rRows); ++j) {
-        if (get<0>(tSR_cRows(j)) < get<1>(typename ThreadMap::CtaShapeMNL{}) && get<1>(tC_cRow(j)) < n) {
-          CUTLASS_PRAGMA_UNROLL
-          for (int i = 0; i < size(tSR_sRows) / size(tSR_rRows); ++i) {
-            tSR_rRows(j) = reduce_input_reg(tSR_rRows(j), tSR_sRows(i + j * size(tSR_sRows) / size(tSR_rRows)));
-          }
-          atomic_reduce<AtomicReduceFn, RoundStyle>(&tC_gRow(j), tSR_rRows(j));
-        }
-
-      }
-    }
-
-  private:
-
-    template <int FragmentSize>
-    CUTLASS_DEVICE ElementCompute
-    reduction(Array<ElementCompute, FragmentSize>& reduce_buffer, Array<ElementCompute, FragmentSize> const& result) {
-      using ReduceInput = RegReduceFn<ElementCompute>;
-      ReduceInput reduce_input{};
-        CUTLASS_PRAGMA_UNROLL
-        for (int i = 0; i < FragmentSize; ++i) {
-            reduce_buffer[i] = reduce_input(reduce_buffer[i], result[i]);
-        }
-    }
-  };
-
-  template <class ProblemShape>
-  CUTLASS_DEVICE auto
-  get_callbacks(
-    gemm::GemmCoord threadblock_tile_offset,
-    int thread_idx,
-    ProblemShape problem_shape
-  ) {
-    Tensor mRow = make_tensor(
-      make_gmem_ptr(params_ptr->ptr_row),
-      problem_shape,
-      params_ptr->dRow);
-
-    //
-    // Step 1: reduce fragment input (Src) into tRS_rSrc
-    //
-
-    // VECTOR,FRAGMENT_COL
-    Tensor tRS_rSrc = make_tensor<ElementCompute>(select<0,2>(typename ThreadMap::ThreadMapShape{}));
-
-    Tensor cSrc = make_identity_tensor(mRow.shape());
-    // FRAGMENT_COLUMN, FRAGMENT_ROW, (ITERATION_ROW, ITERATION_GROUP, ITERATION_CLUSTER)
-    Tensor tRS_cSrc = group_modes<2,5>(ThreadMap::partition(cSrc, thread_idx, threadblock_tile_offset)(_0{},_,_,_,_,_));
-
-    //
-    // Step 2: copy the partial results in tRS_rSrc to sRows in shared memory
-    //
-
-    // VECTOR,ACCESS_WIDTH,FRAGMENT_COL,ACCESS_ROWS,WARPS_PER_ROW,GROUPS,CLUSTERS
-    Tensor sRows = make_tensor(
-      make_smem_ptr(smem_reduce), SharedStorageShape{}
-    );
-
-    auto [lane_col_coord, lane_row_coord, warp_row_coord, group_coord, cluster_coord] = ThreadMap::tid2coord(thread_idx);
-    Tensor tRS_sRows = sRows(_,lane_col_coord,_,lane_row_coord,warp_row_coord,group_coord,cluster_coord);
-
-    //
-    // Step 3: copy the partial results in sRows to tSR_sRow for reduction
-    //
-
-    // VECTOR*ACCESS_WIDTH*FRAGMENT_COL,ACCESS_ROWS*WARPS_PER_ROW*GROUPS*CLUSTERS
-    Tensor sRows_nm = coalesce(group_modes<1,5>(group_modes<0,3>(sRows)), Shape<_1,_1>{});
-    // SMEM_ROW/THREADS,ACCESS_ROWS*WARPS_PER_ROW*GROUPS*CLUSTERS
-    Tensor tSR_sRows = outer_partition(sRows_nm, Shape<Int<ThreadMap::kThreads>,_1>{}, thread_idx);
-    // SMEM_ROW/THREADS
-    Tensor tSR_rRows = make_tensor_like(tSR_sRows(_,_0{}));
-    // Coord
-    Tensor cRows_nm = make_identity_tensor(sRows_nm.shape());
-    Tensor tSR_cRows = outer_partition(cRows_nm, Shape<Int<ThreadMap::kThreads>,_1>{}, thread_idx)(_,_0{});
-
-    //
-    // Step 4: atomically reduce the results to global memory
-    //
-
-    Tensor tC_gRow = outer_partition(
-      // Cta tile
-      local_tile(
-        mRow, typename ThreadMap::CtaShapeMNL{}, make_coord(_,_,_),Step<_1,_1, X>{}
-      )(_,_,threadblock_tile_offset.m(),threadblock_tile_offset.n(),threadblock_tile_offset.k()),
-      // Partition to threads
-      Shape<_1,Int<ThreadMap::kThreads>>{}, thread_idx
-    )(_0{},_);
-
-    Tensor cRow = make_identity_tensor(mRow.shape());
-    Tensor tC_cRow = outer_partition(
-      // Cta tile
-      local_tile(
-        cRow, typename ThreadMap::CtaShapeMNL{}, make_coord(_,_,_), Step<_1,_1, X>{}
-      )(_,_,threadblock_tile_offset.m(),threadblock_tile_offset.n(),threadblock_tile_offset.k()),
-      // Partition to threads
-      Shape<_1,Int<ThreadMap::kThreads>>{}, thread_idx
-    )(_0{},_);
-
-    return Callbacks<
-      decltype(tRS_rSrc), decltype(tRS_sRows),
-      decltype(tRS_cSrc), decltype(tSR_sRows),
-      decltype(tSR_rRows), decltype(tSR_cRows),
-      decltype(tC_gRow), decltype(tC_cRow),
-      ProblemShape>(
-      // R->S
-      cute::move(tRS_rSrc),
-      cute::move(tRS_sRows),
-      cute::move(tRS_cSrc),
-      // S->R
-      cute::move(tSR_sRows),
-      cute::move(tSR_rRows),
-      cute::move(tSR_cRows),
-      // R->G
-      cute::move(tC_gRow),
-      cute::move(tC_cRow),
-      problem_shape,
-      params_ptr
-    );
-  }
-};
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Scalar reduction
-template <
-  template <class> class RegReduceFn,
-  template <class> class AtomicReduceFn,
-  class ThreadMap,
-  class ElementOutput,
-  class ElementCompute,
-  FloatRoundStyle RoundStyle,
-  class StrideMNL = Stride<_0,_0,_0>
->
-struct VisitorScalarReduction {
-  static_assert(
-    (cute::is_same_v<StrideMNL, Stride<_0,_0, _0>>) || // scalar reduction, e.g. tensor max element
-    (cute::is_same_v<StrideMNL, Stride<_0,_0, _1>>) || // batched scalar reduction, e.g. per-batch max element
-    (cute::is_same_v<StrideMNL, Stride<_0,_0,int>>));
-
-  struct Arguments {
-    ElementOutput* ptr_scalar = nullptr;
-    ElementCompute reduction_identity = 0;
-    StrideMNL dScalar = {};
-  };
-
-  using Params = Arguments;
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    return args;
-  }
-
-  template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
-    return 0;
-  }
-
-  struct SharedStorage { };
-
-  CUTLASS_HOST_DEVICE
-  VisitorScalarReduction(){ };
-
-  CUTLASS_HOST_DEVICE
-  VisitorScalarReduction(Params const& params, SharedStorage const& shared_storage)
-    : params_ptr(&params) { }
-
-  Params const* params_ptr;
-
-  template <class CTensor, class GTensor, class ProblemShape>
-  struct Callbacks : EmptyCallbacks {
-    CUTLASS_DEVICE
-    Callbacks(
-      CTensor&& tC_cSrc,
-      GTensor&& tC_gScalar,
-      ProblemShape problem_shape,
-      Params const* params_ptr,
-      int thread_idx
-    ):
-      tC_cSrc(cute::forward<CTensor>(tC_cSrc)),
-      tC_gScalar(cute::forward<GTensor>(tC_gScalar)),
-      problem_shape(problem_shape),
-      params_ptr(params_ptr) {
-        // The partial reduction results of each warp are further
-        // reduced to this first thread.
-        // Only the first thread of each warp is the writing thread
-        is_writing_thread = thread_idx % ThreadMap::kWarpSize == 0;
-      }
-
-      GTensor tC_gScalar;
-      CTensor tC_cSrc;
-      Params const* params_ptr;
-      ProblemShape problem_shape;
-      bool is_writing_thread;
-
-      ElementCompute reduction_accum;
-
-      CUTLASS_DEVICE void
-      begin_epilogue() {
-        reduction_accum = ElementCompute(params_ptr->reduction_identity);
-      }
-
-      template <class ElementAccumulator, class ElementInput, int FragmentSize>
-      CUTLASS_DEVICE auto
-      visit(int iter_idx, int row_idx, int column_idx, int frg_idx,
-            Array<ElementAccumulator, FragmentSize> const& frg_acc,
-            Array<ElementInput, FragmentSize> const& frg_input) {
-
-        auto coord = tC_cSrc(column_idx, row_idx, iter_idx);
-        if (elem_less(coord, problem_shape)) {
-          fragment_reduce<RegReduceFn, RoundStyle>(reduction_accum, frg_input);
-        }
-
-        return frg_input;
-      }
-
-      CUTLASS_DEVICE auto
-      end_epilogue() {
-        // Intra-warp reduction
-        intra_warp_row_reduce<RegReduceFn, ThreadMap::kWarpSize>(reduction_accum);
-
-        // Atomically reduce to global memory
-        atomic_reduce<AtomicReduceFn, RoundStyle>(&tC_gScalar(_0{},_0{}), reduction_accum);
-      }
-  };
-
-  template <class ProblemShape>
-  CUTLASS_DEVICE auto
-  get_callbacks(
-    gemm::GemmCoord threadblock_tile_offset,
-    int thread_idx,
-    ProblemShape problem_shape
-  ) {
-    Tensor cSrc = make_identity_tensor(problem_shape);
-    // FRAGMENT_COL, FRAGMENT_ROW, (ITERATION_ROW, ITERATION_GROUP, ITERATION_CLUSTER)
-    Tensor tC_cSrc = group_modes<2,5>(
-      ThreadMap::partition(cSrc, thread_idx, threadblock_tile_offset)(_0{},_,_,_,_,_)
-    );
-
-    Tensor mScalar = make_tensor(
-      make_gmem_ptr(params_ptr->ptr_scalar),
-      problem_shape,
-      params_ptr->dScalar
-    );
-
-    Tensor tC_gScalar = mScalar(_,_,threadblock_tile_offset.k());
-
-    return Callbacks<
-      decltype(tC_cSrc), decltype(tC_gScalar),
-      ProblemShape>(
-      cute::move(tC_cSrc),
-      cute::move(tC_gScalar),
-      problem_shape,
-      params_ptr,
-      thread_idx
-    );
-  }
-};
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::epilogue::threadblock
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitors.hpp b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitors.hpp
deleted file mode 100755
index 96fbc01d7..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitors.hpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-  \brief Higher-level header file includes all the CUTLASS 2x visitors
-*/
-#include "cutlass/epilogue/threadblock/fusion/visitor_2x.hpp"
-#include "cutlass/epilogue/threadblock/fusion/visitor_load.hpp"
-#include "cutlass/epilogue/threadblock/fusion/visitor_store.hpp"
-#include "cutlass/epilogue/threadblock/fusion/visitor_compute.hpp"
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/interleaved_epilogue.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/interleaved_epilogue.h
deleted file mode 100755
index 305f5d783..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/interleaved_epilogue.h
+++ /dev/null
@@ -1,407 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
-
-  The epilogue rearranges the result of a matrix product through shared memory to match canonical
-  tensor layouts in global memory. Epilogues support conversion and reduction operations.
-
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/layout/vector.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/tensor_coord.h"
-#include "cutlass/aligned_buffer.h"
-
-#include "cutlass/gemm/gemm.h"
-
-#include "cutlass/transform/pitch_linear_thread_map.h"
-#include "cutlass/transform/threadblock/regular_tile_iterator.h"
-
-#include "cutlass/epilogue/threadblock/epilogue_base_streamk.h"
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Epilogue operator without splitk
-template <
-    /// Shape of threadblock tile (concept: GemmShape)
-    typename Shape_,
-    /// Warp-level MMA operator (concept: gemm::warp::MmaTensorOp)
-    typename WarpMmaOperator_,
-    /// Number of partitions of the K dimension
-    int PartitionsK,
-    /// Tile iterator reading and writing output tensors
-    typename OutputTileIterator_,
-    /// Fragment iterator selecting accumulators
-    typename AccumulatorFragmentIterator_,
-    /// Output operator
-    typename OutputOp_,
-    /// Number of interleaved k
-    int InterleavedK>
-class InterleavedEpilogue :
-  public EpilogueBaseStreamK<
-    Shape_,
-    PartitionsK,
-    WarpMmaOperator_,
-    AccumulatorFragmentIterator_>
-{
-public:
-
-  using BaseStreamK = EpilogueBaseStreamK<
-    Shape_,
-    PartitionsK,
-    WarpMmaOperator_,
-    AccumulatorFragmentIterator_>;
-
-  using Shape = Shape_;
-  using WarpMmaOperator = WarpMmaOperator_;
-  static int const kPartitionsK = PartitionsK;
-  using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
-  using OutputTileIterator = OutputTileIterator_;
-  using OutputOp = OutputOp_;
-
-  /// The complete warp-level accumulator tile
-  using AccumulatorTile = typename AccumulatorFragmentIterator::AccumulatorTile;
-
-  /// Fragment type used by the accumulator tile's fragment iterator
-  using AccumulatorFragment = typename AccumulatorFragmentIterator::Fragment;
-
-  /// Accumulator element
-  using ElementAccumulator = typename AccumulatorTile::Element;
-
-  /// Output element
-  using ElementOutput = typename OutputTileIterator::Element;
-
-  /// Output access size
-  static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
-
-  /// Tensor reference to destination tensor
-  using TensorRef = typename OutputTileIterator::TensorRef;
-
-  /// Tensor reference to sync tensor
-  using SyncTensorRef =
-      typename cutlass::TensorRef<int, cutlass::layout::PackedVectorLayout>;
-
-  /// Const tensor reference to source tensor
-  using ConstTensorRef = typename OutputTileIterator::ConstTensorRef;
-
-  /// Array type used to output
-  using OutputAccessType = Array<typename OutputTileIterator::Element,
-                                 OutputTileIterator::kElementsPerAccess>;
-
-  /// Array type used by output functor
-  using AccumulatorAccessType =
-      Array<ElementAccumulator, OutputTileIterator::kElementsPerAccess>;
-
-  /// Number of warps
-  using WarpCount =
-      gemm::GemmShape<Shape::kM / WarpMmaOperator::Shape::kM,
-                      Shape::kN / WarpMmaOperator::Shape::kN, kPartitionsK>;
-
-public:
-
-  static_assert(OutputTileIterator::kElementsPerAccess,
-                "This must not be zero.");
-
-  static_assert(!(OutputTileIterator::Fragment::kElements %
-                  OutputTileIterator::kElementsPerAccess),
-                "Divisibility");
-
-public:
-
-  /// Aspect for when epilogue source is not needed
-  struct SourceAspectNotNeeded
-  {
-    /// Constructor
-    CUTLASS_DEVICE
-    SourceAspectNotNeeded()
-    {}
-
-    /// Invoke the output functor over each vector of output
-    CUTLASS_DEVICE
-    void apply_output_operator(
-      typename OutputTileIterator::Fragment &output_fragment,
-      OutputOp const &output_op,
-      typename AccumulatorFragmentIterator::Fragment const &aligned_accum_fragment)
-    {
-      OutputAccessType *output_frag_ptr =
-        reinterpret_cast<OutputAccessType *>(&output_fragment);
-
-      AccumulatorAccessType const *compute_frag_ptr =
-        reinterpret_cast<AccumulatorAccessType const *>(&aligned_accum_fragment);
-
-      int const kOutputOpIterations =
-        OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kOutputOpIterations; ++i)
-      {
-        // Call the output operator
-        output_frag_ptr[i] = output_op(compute_frag_ptr[i]);
-      }
-    }
-  };
-
-
-  /// Aspect for when epilogue source is needed
-  struct SourceAspectNeeded
-  {
-    OutputTileIterator source_iterator;
-
-    typename OutputTileIterator::Fragment source_fragment;
-
-    /// Invoke the output functor over each vector of output
-    CUTLASS_DEVICE
-    static void apply_output_operator(
-      typename OutputTileIterator::Fragment &output_fragment,
-      OutputOp const &output_op,
-      typename AccumulatorFragmentIterator::Fragment const &aligned_accum_fragment,
-      typename OutputTileIterator::Fragment const &source_fragment)
-    {
-      OutputAccessType *output_frag_ptr =
-        reinterpret_cast<OutputAccessType *>(&output_fragment);
-
-      AccumulatorAccessType const *compute_frag_ptr =
-        reinterpret_cast<AccumulatorAccessType const *>(&aligned_accum_fragment);
-
-      OutputAccessType const *source_frag_ptr =
-        reinterpret_cast<OutputAccessType const *>(&source_fragment);
-
-      int const kOutputOpIterations =
-        OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kOutputOpIterations; ++i)
-      {
-        // Call the output operator
-        output_frag_ptr[i] = output_op(compute_frag_ptr[i], source_frag_ptr[i]);
-      }
-    }
-
-    /// Constructor
-    CUTLASS_DEVICE
-    SourceAspectNeeded(OutputTileIterator source_iterator) :
-      source_iterator(source_iterator)
-    {
-      source_fragment.clear();
-    }
-
-    /// Invoke the output functor over each vector of output
-    CUTLASS_DEVICE
-    void apply_output_operator(
-      typename OutputTileIterator::Fragment &output_fragment,
-      OutputOp const &output_op,
-      typename AccumulatorFragmentIterator::Fragment const &aligned_accum_fragment)
-    {
-      // Load addend source fragment from global memory
-      source_iterator.load(source_fragment);
-      ++source_iterator;
-
-      apply_output_operator(output_fragment, output_op, aligned_accum_fragment, source_fragment);
-    }
-  };
-
-
-  /// Shared storage allocation needed by the epilogue
-  struct SharedStorage {};
-
-
-public:
-
-  /// Constructor
-  CUTLASS_DEVICE
-  InterleavedEpilogue(
-      SharedStorage &shared_storage,  ///< Shared storage object
-      int thread_idx,                 ///< ID of a thread within the threadblock
-      int warp_idx,                   ///< ID of warp within threadblock
-      int lane_idx)                   ///< Id of thread within warp
-  :
-      BaseStreamK(thread_idx)
-  {}
-
-
-  /// Aggregates the accumulator sets shared by peer blocks in the global workspace,
-  /// performing epilogue computations, writing to output
-  CUTLASS_DEVICE
-  void reduce(
-      int peer_idx_begin,
-      int peer_idx_end,
-      int reduce_fragment_idx,
-      void *element_workspace,
-      OutputOp const &output_op,                      ///< Output operator
-      OutputTileIterator destination_iterator,        ///< Tile iterator for destination
-      OutputTileIterator source_iterator)             ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles)
-  {
-    // Redcuce peer accumulator fragments into one fragment
-    AccumulatorFragment accum_fragment;
-    BaseStreamK::reduce(accum_fragment, peer_idx_begin, peer_idx_end, reduce_fragment_idx, element_workspace);
-
-    // Source-fragment data (zero-initialized for scenarios where the
-    // output operator allows us to skip loading it from global input)
-    typename OutputTileIterator::Fragment source_fragment;
-    source_fragment.clear();
-
-    if (output_op.is_source_needed())
-    {
-      source_iterator += reduce_fragment_idx;
-      source_iterator.load(source_fragment);
-    }
-
-    // Compute the output result
-    typename OutputTileIterator::Fragment output_fragment;
-
-    // Apply the output operator
-    SourceAspectNeeded::apply_output_operator(output_fragment, output_op, accum_fragment, source_fragment);
-
-    // Store the final result
-    destination_iterator += reduce_fragment_idx;
-    destination_iterator.store(output_fragment);
-  }
-
-
-  /// Perform the epilogue computations and stream the result to global memory.
-  CUTLASS_DEVICE
-  void operator()(
-    OutputOp const &output_op,                      ///< Output operator
-    OutputTileIterator destination_iterator,        ///< Tile iterator for destination
-    AccumulatorTile const &accumulators)            ///< Complete warp-level accumulator tile
-  {
-    operator()(output_op, destination_iterator, accumulators, SourceAspectNotNeeded());
-  }
-
-
-  /// Perform the epilogue computations and stream the result to global memory.  Implements
-  /// two alternative codepaths, depending on whether the output op requires addend data to be loaded.
-  CUTLASS_DEVICE
-  void operator()(
-    OutputOp const &output_op,                      ///< Output operator
-    OutputTileIterator destination_iterator,        ///< Tile iterator for destination
-    AccumulatorTile const &accumulators,            ///< Complete warp-level accumulator tile
-    OutputTileIterator source_iterator )            ///< Tile iterator for addend source
-  {
-    if (output_op.is_source_needed())
-    {
-      operator()(output_op, destination_iterator, accumulators, SourceAspectNeeded(source_iterator));
-    }
-    else
-    {
-      operator()(output_op, destination_iterator, accumulators, SourceAspectNotNeeded());
-    }
-  }
-
-
-  /// Perform the epilogue computations and stream the result to global memory.  Implements a
-  /// single codepath, regardless of whether the output op requires addend data to be loaded
-  CUTLASS_DEVICE
-  void unified(
-    OutputOp const &output_op,                      ///< Output operator
-    OutputTileIterator destination_iterator,        ///< Tile iterator for destination
-    AccumulatorTile const &accumulators,            ///< Complete warp-level accumulator tile
-    OutputTileIterator source_iterator )            ///< Tile iterator for addend source
-  {
-    if (!output_op.is_source_needed())
-    {
-      source_iterator.clear_mask();
-      __syncthreads();  // Dummy (CUDA 11.0)
-    }
-
-    operator()(output_op, destination_iterator, accumulators, SourceAspectNeeded(source_iterator));
-  }
-
-
-  /// Streams the result to global memory
-  template <typename SourceAspect>
-  CUTLASS_DEVICE
-  void operator()(
-    OutputOp const &output_op,                      ///< Output operator
-    OutputTileIterator destination_iterator,        ///< Tile iterator for destination
-    AccumulatorTile const &accumulators,            ///< Complete warp-level accumulator tile
-    SourceAspect source)
-  {
-    //
-    // Iterator over warp-level accumulator fragment
-    //
-
-    AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
-
-    //
-    // Iterate over accumulator tile
-    //
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int iter = 0; iter < OutputTileIterator::kIterations; ++iter) {
-
-      //
-      // Convert fragment
-      //
-
-      typename AccumulatorFragmentIterator::Fragment accum_fragment;
-
-      accum_fragment_iterator.load(accum_fragment);
-      ++accum_fragment_iterator;
-
-      //
-      // Compute the output result
-      //
-
-      typename OutputTileIterator::Fragment output_fragment;
-      source.apply_output_operator(output_fragment, output_op, accum_fragment);
-
-      //
-      // Store the final result
-      //
-
-      destination_iterator.set_iteration_index(iter);
-      destination_iterator.store(output_fragment);
-      ++destination_iterator;
-    }
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/output_iterator_parameter.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/output_iterator_parameter.h
deleted file mode 100755
index 730088273..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/output_iterator_parameter.h
+++ /dev/null
@@ -1,223 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-#include "cutlass/conv/conv3d_problem_size.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/tensor_ref.h"
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-
-template<
-  typename TensorLayout_,                             ///! The original output tensor layout
-  typename OutputIteratorLayout_,                     ///! Layout used by epilogue output iterator
-  typename TensorRef_,                                ///! Input tensor to epilogue output iterator
-  conv::Operator ConvOperator,                        ///! Convolutional operator (Fprop, Dgrad, Wgrad)
-  typename ConvProblemSize_                          ///! Convolutional operator on 2D or 3D problem
->
-struct ConvOutputIteratorParameter {
-
-  using TensorLayout = TensorLayout_;
-  using OutputIteratorLayout = OutputIteratorLayout_;
-  using OutputTensorCoord = typename OutputIteratorLayout::TensorCoord;
-  using TensorRef = TensorRef_;
-  static conv::Operator const kConvolutionalOperator = ConvOperator;
-  using ConvProblemSize = ConvProblemSize_;
-
-  /// Wgrad stride idx for implicit gemm algorithm 
-  // Conv2d row-major matrix (KxRSC) 
-  // Conv3d row-major matrix (KxTRSC)
-  static int const kWgradStrideIdx = 
-    platform::is_same<TensorLayout, layout::TensorNHWC>::value ? 2 : 3;
-
-  /// This chooses the appropriate stride element of the C tensor.
-  static int const kTensorStrideIdx = 
-    (kConvolutionalOperator == conv::Operator::kWgrad ? kWgradStrideIdx : 0);
-
-  CUTLASS_HOST_DEVICE
-  static OutputIteratorLayout layout(const TensorRef & ref) {
-    return ref.stride(kTensorStrideIdx);
-  }
-
-  CUTLASS_HOST_DEVICE
-  static OutputTensorCoord extent(ConvProblemSize problem_size) {
-    return conv::implicit_gemm_problem_size(kConvolutionalOperator, problem_size).mn();
-  }
-};
-
-template<
-  typename TensorRef_,                                ///! Input tensor to epilogue output iterator
-  typename ConvProblemSize_                          ///! Convolutional operator on 2D or 3D problem
->
-struct ConvOutputIteratorParameter<layout::TensorNHWC, layout::TensorNHWC, TensorRef_, conv::Operator::kFprop, ConvProblemSize_> {
-
-  using TensorLayout = layout::TensorNHWC;
-  using OutputIteratorLayout = layout::TensorNHWC;
-  using MappedLayout = layout::RowMajor;
-  using OutputTensorCoord = typename OutputIteratorLayout::TensorCoord;
-  using MappedTensorCoord = typename MappedLayout::TensorCoord;
-  using TensorRef = TensorRef_;
-  static conv::Operator const kConvolutionalOperator = conv::Operator::kFprop;
-  using ConvProblemSize = ConvProblemSize_;
-
-  CUTLASS_HOST_DEVICE
-  static OutputIteratorLayout layout(const TensorRef & ref) {
-    return ref.stride();
-  }
-
-  CUTLASS_HOST_DEVICE
-  static MappedTensorCoord extent(ConvProblemSize problem_size) {
-    return conv::implicit_gemm_problem_size(kConvolutionalOperator, problem_size).mn();
-  }
-};
-
-template<
-  typename TensorRef_,                                ///! Input tensor to epilogue output iterator
-  typename ConvProblemSize_                          ///! Convolutional operator on 2D or 3D problem
->
-struct ConvOutputIteratorParameter<layout::TensorNHWC, layout::TensorNHWC, TensorRef_, conv::Operator::kDeconv, ConvProblemSize_> {
-
-  using TensorLayout = layout::TensorNHWC;
-  using OutputIteratorLayout = layout::TensorNHWC;
-  using MappedLayout = layout::RowMajor;
-  using OutputTensorCoord = typename OutputIteratorLayout::TensorCoord;
-  using MappedTensorCoord = typename MappedLayout::TensorCoord;
-  using TensorRef = TensorRef_;
-  static conv::Operator const kConvolutionalOperator = conv::Operator::kDeconv;
-  using ConvProblemSize = ConvProblemSize_;
-
-  CUTLASS_HOST_DEVICE
-  static OutputIteratorLayout layout(const TensorRef & ref) {
-    return ref.stride();
-  }
-
-  CUTLASS_HOST_DEVICE
-  static MappedTensorCoord extent(ConvProblemSize problem_size) {
-    return conv::implicit_gemm_problem_size(kConvolutionalOperator, problem_size).mn();
-  }
-};
-
-template<
-  typename TensorRef_,                                ///! Input tensor to epilogue output iterator
-  typename ConvProblemSize_                          ///! Convolutional operator on 2D or 3D problem
->
-struct ConvOutputIteratorParameter<layout::TensorNDHWC, layout::TensorNDHWC, TensorRef_, conv::Operator::kFprop, ConvProblemSize_> {
-
-  using TensorLayout = layout::TensorNDHWC;
-  using OutputIteratorLayout = layout::TensorNDHWC;
-  using MappedLayout = layout::RowMajor;
-  using OutputTensorCoord = typename OutputIteratorLayout::TensorCoord;
-  using MappedTensorCoord = typename MappedLayout::TensorCoord;
-  using TensorRef = TensorRef_;
-  static conv::Operator const kConvolutionalOperator = conv::Operator::kFprop;
-  using ConvProblemSize = ConvProblemSize_;
-
-  CUTLASS_HOST_DEVICE
-  static OutputIteratorLayout layout(const TensorRef & ref) {
-    return ref.stride();
-  }
-
-  CUTLASS_HOST_DEVICE
-  static MappedTensorCoord extent(ConvProblemSize problem_size) {
-    return conv::implicit_gemm_problem_size(kConvolutionalOperator, problem_size).mn();
-  }
-};
-
-template<
-  typename TensorRef_,                                ///! Input tensor to epilogue output iterator
-  typename ConvProblemSize_                          ///! Convolutional operator on 2D or 3D problem
->
-struct ConvOutputIteratorParameter<layout::TensorNDHWC, layout::TensorNDHWC, TensorRef_, conv::Operator::kDeconv, ConvProblemSize_> {
-
-  using TensorLayout = layout::TensorNDHWC;
-  using OutputIteratorLayout = layout::TensorNDHWC;
-  using MappedLayout = layout::RowMajor;
-  using OutputTensorCoord = typename OutputIteratorLayout::TensorCoord;
-  using MappedTensorCoord = typename MappedLayout::TensorCoord;
-  using TensorRef = TensorRef_;
-  static conv::Operator const kConvolutionalOperator = conv::Operator::kDeconv;
-  using ConvProblemSize = ConvProblemSize_;
-
-  CUTLASS_HOST_DEVICE
-  static OutputIteratorLayout layout(const TensorRef & ref) {
-    return ref.stride();
-  }
-
-  CUTLASS_HOST_DEVICE
-  static MappedTensorCoord extent(ConvProblemSize problem_size) {
-    return conv::implicit_gemm_problem_size(kConvolutionalOperator, problem_size).mn();
-  }
-};
-
-template <
-  int InterleavedK,
-  typename TensorRef_,
-  conv::Operator ConvOperator,
-  typename ConvProblemSize_
->
-struct ConvOutputIteratorParameter<
-  layout::TensorNCxHWx<InterleavedK>, 
-  layout::TensorNCxHWx<InterleavedK>,
-  TensorRef_,
-  ConvOperator,
-  ConvProblemSize_>
-{ 
-
-  using TensorLayout = typename layout::TensorNCxHWx<InterleavedK>;
-  using OutputIteratorLayout = typename layout::TensorNCxHWx<InterleavedK>;
-  using OutputTensorCoord = typename OutputIteratorLayout::TensorCoord;
-  using TensorRef = TensorRef_;
-  static conv::Operator const kConvolutionalOperator = ConvOperator;
-  using ConvProblemSize = ConvProblemSize_;
-
-  CUTLASS_HOST_DEVICE
-  static OutputIteratorLayout layout(const TensorRef & ref) {
-    return ref.stride();
-  }
-
-  CUTLASS_HOST_DEVICE
-  static OutputTensorCoord extent(ConvProblemSize problem_size) {
-    return problem_size.output_extent();
-  }
-
-};
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/output_tile_thread_map.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/output_tile_thread_map.h
deleted file mode 100755
index 617b8e39f..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/output_tile_thread_map.h
+++ /dev/null
@@ -1,628 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Metaprogram for determining the mapping of output elements to threads for epilogue tiles.
-
-  
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/fast_math.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tuple defining point in output tile
-template <
-  int Column,
-  int Row,
-  int Group,
-  int Cluster,
-  int Tile
->
-struct OutputTileShape {
-  static int const kColumn = Column;
-  static int const kRow = Row;
-  static int const kGroup = Group;
-  static int const kCluster = Cluster;
-  static int const kTile = Tile;
-
-  static int const kCount = kColumn * kRow * kGroup * kCluster * kTile;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <typename Iterations, typename Delta>
-struct OutputTileThreadMapHelpers {
-
-  /// Determines the iteration index of a vector access according to the thread map
-  CUTLASS_HOST_DEVICE
-  static void iteration_index(
-    int &column_idx,
-    int &row_idx,
-    int &group_idx,
-    int &cluster_idx,
-    int &tile_idx,
-    int iter_idx) {
-
-    column_idx = iter_idx % Iterations::kColumn;
-    int residual   = iter_idx / Iterations::kColumn;
-
-    row_idx    = residual % Iterations::kRow;
-    residual       = residual / Iterations::kRow;
-
-    group_idx  = residual % Iterations::kGroup;
-    residual       = residual / Iterations::kGroup;
-
-    cluster_idx = residual % Iterations::kCluster;
-    tile_idx    = residual / Iterations::kCluster;
-  }
-
-  /// Computes the offset of a given vector access
-  CUTLASS_HOST_DEVICE
-  static MatrixCoord iteration_offset(int iter_idx) {
-
-    int column_idx;
-    int row_idx;
-    int group_idx;
-    int cluster_idx;
-    int tile_idx;
-
-    iteration_index(column_idx, row_idx, group_idx, cluster_idx, tile_idx, iter_idx);
-
-    return
-      MatrixCoord(
-        row_idx     * Delta::kRow     +
-        group_idx   * Delta::kGroup   +
-        cluster_idx * Delta::kCluster +
-        tile_idx    * Delta::kTile,
-
-        column_idx  * Delta::kColumn);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-template <
-  typename ThreadMap_,
-  typename Shape_,
-  typename Iterations_,
-  typename Delta_,
-  typename Count_
->
-struct OutputTileThreadMap : public OutputTileThreadMapHelpers<Iterations_, Delta_> {
-
-  /// Conventional thread map (concept: ThreadMap)
-  using ThreadMap = ThreadMap_;
-
-  /// Number of threads participating in the operation
-  static int const kThreads = ThreadMap::kThreads;
-
-  /// Number of scalar elements per access
-  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
-
-  /// Shape of the tile
-  using Shape = Shape_;
-
-  /// Iterations performed by each thread
-  using Iterations = Iterations_;
-
-  /// Delta between accesses
-  using Delta = Delta_;
-
-  /// Number of iterator iterations 
-  using Count = Count_;
-
-  /// Initial offset function
-  CUTLASS_HOST_DEVICE
-  static MatrixCoord initial_offset(int thread_idx) {
-
-    using Index = typename layout::PitchLinearCoord::Index;
-    
-    layout::PitchLinearCoord coord = ThreadMap::initial_offset(thread_idx);
-
-    Index cluster = coord.strided() / (Shape::kGroup * Shape::kRow);
-    Index cluster_residual = coord.strided() % (Shape::kGroup * Shape::kRow);
-
-    Index group = cluster_residual / (Shape::kRow);
-    Index row = cluster_residual % (Shape::kRow);
-
-    return MatrixCoord{
-      row + group * Shape::kRow * Count::kRow 
-        + cluster * Shape::kGroup * Count::kGroup * Shape::kRow * Count::kRow,
-      coord.contiguous()
-    };
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace detail {
-
-/// RowArrangement determines how one or more warps cover a region of consecutive rows.
-template <
-  typename Shape,
-  int WarpsRemaining,
-  int ElementsPerAccess,
-  int ElementSize,
-  bool Is2dTile
->
-struct RowArrangement;
-
-/// RowArrangement in which each warp's access is a 1D tiled arrangement.
-template <
-  typename Shape,
-  int WarpsRemaining,
-  int ElementsPerAccess,
-  int ElementSize
->
-struct RowArrangement<Shape, WarpsRemaining, ElementsPerAccess, ElementSize, false> {
-  static int const kWarpSize = 32;
-  static int const kElementsPerAccess = ElementsPerAccess;
-  static int const kElementSize = ElementSize;
-
-  static int const kIterationsRow = 1;
-  static int const kDeltaRow = 1;
-  static int const kIterationsColumn = Shape::kColumn / kElementsPerAccess / kWarpSize;
-  static int const kDeltaColumn = kWarpSize * kElementsPerAccess;
-
-  static int const kAccessWidth = kWarpSize;
-  static int const kAccessRows = 1;
-  static int const kWarpPartitionsRow = 1;
-  static int const kWarpPartitionsColumn = WarpsRemaining;
-};
-
-/// RowArrangement in which each warp's access is a 2D tiled arrangement.
-template <
-  typename Shape,
-  int WarpsRemaining,
-  int ElementsPerAccess,
-  int ElementSize
->
-struct RowArrangement<Shape, WarpsRemaining, ElementsPerAccess, ElementSize, true> {
-
-  static int const kMemoryAccessSize = 256; // Preferred access size
-  static int const kWarpSize = 32;
-
-  static int const kElementsPerAccess = ElementsPerAccess;
-  static int const kElementSize = ElementSize;
-
-  struct Detail {
-    static int const kShapeRow = Shape::kRow / WarpsRemaining;
-    static int const kShapeWidth = Shape::kColumn / kElementsPerAccess;
-
-    static int const kTargetMemoryAccessWidth = 
-      kMemoryAccessSize / (kElementsPerAccess * kElementSize / 8);
-
-    static int const kTargetAccessRows = kWarpSize / kTargetMemoryAccessWidth;
-  };
-
-  static int const kAccessWidth = 
-    (Detail::kTargetAccessRows > Detail::kShapeRow ?
-      kWarpSize / Detail::kShapeRow
-      : const_min(
-          Detail::kShapeWidth,
-        const_min(kWarpSize, kMemoryAccessSize / (kElementsPerAccess * kElementSize / 8))
-        ));
-
-  static int const kAccessRows =
-    (Detail::kTargetAccessRows > Detail::kShapeRow ?
-      Detail::kShapeRow
-      : const_min(Shape::kRow, kWarpSize / kAccessWidth));
-
-  static int const kIterationsRow = Detail::kShapeRow / kAccessRows;
-  static int const kDeltaRow = kAccessRows;
-
-  static int const kIterationsColumn = Detail::kShapeWidth / kAccessWidth;
-  static int const kDeltaColumn = kAccessWidth * kElementsPerAccess;
-
-  static_assert( kAccessWidth * kElementsPerAccess <= Shape::kColumn, "Accessing too many elements per access");
-  static_assert( kIterationsColumn > 0, "Iteration Count Column must be > 0" );
-  static_assert( kIterationsRow > 0, "Iteration Count Row must be > 0" );
-
-  static int const kWarpPartitionsRow = 1;
-  static int const kWarpPartitionsColumn = 1;
-};
-
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Template metaprogram for partitioning a 4D space across warps to achieve several performance
-/// objectives:
-///
-///   - coalesced memory accesses in units of 128 Byte lines
-///   - minimal address arithmetic
-///   - minimal predicate calculations
-///
-template <
-  typename Shape_,
-  typename Count_,
-  int Threads,
-  int ElementsPerAccess,
-  int ElementSize
->
-struct OutputTileOptimalThreadMap {
-
-  using Shape = Shape_;
-  using Count = Count_;
-
-  static int const kWarpSize = 32;
-  static int const kThreads = Threads;
-  static int const kWarpCount = kThreads / kWarpSize;
-
-  static int const kElementsPerAccess = ElementsPerAccess;
-  static int const kElementSize = ElementSize;
-
-  //
-  // Metaprogram computation
-  //
-
-  struct Detail {
-
-    // Clusters
-    static int const kIterationsCluster = 
-      ((Shape::kCluster > kWarpCount) ?
-        Shape::kCluster / kWarpCount
-        : 1);
-
-    static int const kDeltaCluster =
-      ((Shape::kCluster > kWarpCount) ?
-        Shape::kRow * Count::kRow * Shape::kGroup * Count::kGroup * Shape::kCluster / kIterationsCluster
-        : 1);
-
-    static int const kCompactedDeltaCluster =
-      ((Shape::kCluster > kWarpCount) ?
-        Shape::kRow * Shape::kGroup * Shape::kCluster / kIterationsCluster
-        : 1);
-
-    static int const kWarpPartitionsCluster =
-      ((Shape::kCluster > kWarpCount) ?
-        kWarpCount
-        : kWarpCount / Shape::kCluster);
-
-    static int const kWarpsRemainingForGroups =
-      ((Shape::kCluster > kWarpCount) ? 1 : kWarpCount / Shape::kCluster);
-
-    // Groups
-    static int const kIterationsGroup =
-      ((Shape::kGroup > kWarpsRemainingForGroups) ?
-        Shape::kGroup / kWarpsRemainingForGroups
-        : 1);
-
-    static int const kDeltaGroup =
-      ((Shape::kGroup > kWarpsRemainingForGroups) ?
-        Shape::kRow * Count::kRow * Shape::kGroup / kIterationsGroup
-        : 1);
-
-    static int const kCompactedDeltaGroup =
-      ((Shape::kGroup > kWarpsRemainingForGroups) ?
-        Shape::kRow * Shape::kGroup / kIterationsGroup
-        : 1);
-
-    static int const kWarpPartitionsGroup =
-      ((Shape::kGroup > kWarpsRemainingForGroups) ?
-        1
-        : kWarpsRemainingForGroups / Shape::kGroup);
-
-    static int const kWarpsRemainingForRows =
-      ((Shape::kGroup > kWarpsRemainingForGroups) ?
-        1
-        : kWarpsRemainingForGroups / Shape::kGroup);
-    
-    // Rows
-    using RowArrangement = detail::RowArrangement<
-      Shape,
-      kWarpsRemainingForRows,
-      kElementsPerAccess,
-      kElementSize,
-      (Shape::kRow > kWarpsRemainingForRows)
-    >;
-
-    // Warp partitions
-    using WarpPartitions = OutputTileShape<
-      RowArrangement::kWarpPartitionsColumn,
-      RowArrangement::kWarpPartitionsRow,
-      kWarpPartitionsGroup,
-      kWarpPartitionsCluster,
-      1>;
-
-    static int const kAccessWidth = RowArrangement::kAccessWidth;
-    static int const kAccessRows = RowArrangement::kAccessRows;
-  };
-
-  //
-  // Output
-  //
-
-  using Iterations = OutputTileShape<
-    Detail::RowArrangement::kIterationsColumn, 
-    Detail::RowArrangement::kIterationsRow, 
-    Detail::kIterationsGroup, 
-    Detail::kIterationsCluster, 
-    1>;
-
-  using Delta = OutputTileShape<
-    Detail::RowArrangement::kDeltaColumn,
-    Detail::RowArrangement::kDeltaRow,
-    Detail::kDeltaGroup,
-    Detail::kDeltaCluster,
-    1>;
-
-  /// Initial offset function
-  CUTLASS_DEVICE
-  static MatrixCoord initial_offset(int thread_idx) {
-
-//    int warp_idx = __shfl_sync(0xffffffff, thread_idx / kWarpSize, 0);
-    int warp_idx = thread_idx / kWarpSize;
-    int lane_idx = thread_idx % kWarpSize;
-
-    // Compute warp location
-    int cluster_idx = warp_idx / Detail::WarpPartitions::kCluster;
-    int residual_cluster = warp_idx % Detail::WarpPartitions::kCluster;
-
-    int group_idx = residual_cluster / Detail::WarpPartitions::kGroup;
-    int residual_group = residual_cluster % Detail::WarpPartitions::kGroup;
-
-    int row_idx = residual_group / Detail::WarpPartitions::kRow;
-    int col_idx = residual_group % Detail::WarpPartitions::kRow;
-
-    // Compute per-lane offset
-    int lane_row_offset = lane_idx / Detail::kAccessWidth;
-    int lane_col_offset = lane_idx % Detail::kAccessWidth;
-
-    // Compute coordinate in output space
-    int cluster_offset = cluster_idx * Shape::kRow * Count::kRow * Shape::kGroup * Count::kGroup;
-    int group_offset = group_idx * Shape::kRow * Count::kRow;
-    int row_offset = row_idx * Iterations::kRow * Detail::kAccessRows;
-    int column_offset = col_idx * Iterations::kColumn * Detail::kAccessWidth * kElementsPerAccess;
-
-    return MatrixCoord(
-      cluster_offset + group_offset + row_offset + lane_row_offset,
-      column_offset + lane_col_offset * kElementsPerAccess
-    );
-  }
-
-  /// Computes the offset of a given vector access
-  CUTLASS_HOST_DEVICE
-  static MatrixCoord iteration_offset(int iter_idx) {
-    return OutputTileThreadMapHelpers<Iterations, Delta>::iteration_offset(iter_idx);
-  }
-
-  /// Compacted thread map in which the 4D region is contiguous
-  struct CompactedThreadMap {
-
-
-    using Shape = Shape_;
-
-    using TileShape = MatrixShape<
-      Shape::kTile * Shape::kCluster * Shape::kGroup * Shape::kRow,
-      Shape::kColumn
-    >;
-
-    using Iterations = OutputTileShape<
-      Detail::RowArrangement::kIterationsColumn,
-      Detail::RowArrangement::kIterationsRow,
-      Detail::kIterationsGroup,
-      Detail::kIterationsCluster,
-      1>;
-
-    using Delta = OutputTileShape<
-      Detail::RowArrangement::kDeltaColumn,
-      Detail::RowArrangement::kDeltaRow,
-      Detail::kCompactedDeltaGroup,
-      Detail::kCompactedDeltaCluster,
-      1>;
-
-    /// Number of elements within each vector access
-    static int const kElementsPerAccess = ElementsPerAccess;
-
-    /// Number  of threads
-    static int const kThreads = Threads;
-
-    /// Function to compute each thread's initial offset
-    CUTLASS_DEVICE
-    static MatrixCoord initial_offset(int thread_idx) {
-
-//      int warp_idx = __shfl_sync(0xffffffff, thread_idx / kWarpSize, 0);
-      int warp_idx = thread_idx / kWarpSize;
-      int lane_idx = thread_idx % kWarpSize;
-
-      // Compute warp location
-      int cluster_idx = warp_idx / Detail::WarpPartitions::kCluster;
-      int residual_cluster = warp_idx % Detail::WarpPartitions::kCluster;
-
-      int group_idx = residual_cluster / Detail::WarpPartitions::kGroup;
-      int residual_group = residual_cluster % Detail::WarpPartitions::kGroup;
-
-      int row_idx = residual_group / Detail::WarpPartitions::kRow;
-      int col_idx = residual_group % Detail::WarpPartitions::kRow;
-
-      // Compute per-lane offset
-      int lane_row_offset = lane_idx / Detail::kAccessWidth;
-      int lane_col_offset = lane_idx % Detail::kAccessWidth;
-
-      // Compute coordinate in output space
-      int cluster_offset = cluster_idx * Shape::kRow * Shape::kGroup;
-      int group_offset = group_idx * Shape::kRow;
-      int row_offset = row_idx * Iterations::kRow * Detail::kAccessRows;
-      int column_offset = col_idx * Iterations::kColumn * Detail::kAccessWidth * kElementsPerAccess;
-
-      MatrixCoord coord(
-        cluster_offset + group_offset + row_offset + lane_row_offset,
-        column_offset + lane_col_offset * kElementsPerAccess
-      );
-
-      return coord;
-    }
-  };
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Template metaprogram for partitioning a 3D interleaved layout across warps
-/// to achieve several performance objectives:
-///
-///   - coalesced memory accesses in units of 64 Byte lines
-///   - minimal address arithmetic
-///   - minimal predicate calculations
-///
-template <typename WarpCount_, typename Iterations_, int Threads,
-          int ElementsPerAccess, int ElementSize>
-struct InterleavedOutputTileThreadMap {
-  using WarpCount = WarpCount_;
-
-  static int const kWarpSize = 32;
-  static int const kThreads = Threads;
-  static int const kWarpCount = kThreads / kWarpSize;
-
-  static int const kElementsPerAccess = ElementsPerAccess;
-  static int const kElementSize = ElementSize;
-
-  //
-  // Metaprogram computation
-  //
-
-  struct Detail {};
-
-  //
-  // Output
-  //
-
-  using Iterations = Iterations_;
-
-  using Delta = layout::PitchLinearShape<kWarpSize * kElementsPerAccess, 1>;
-
-  /// Initial offset function
-  CUTLASS_HOST_DEVICE
-  static layout::PitchLinearCoord initial_offset(int thread_idx) {
-    int warp_idx = thread_idx / kWarpSize;
-    int lane_idx = thread_idx % kWarpSize;
-
-    // Compute warp location
-    layout::PitchLinearCoord warp_footprint{
-        Delta::kContiguous * Iterations::kContiguous,
-        Delta::kStrided * Iterations::kStrided};
-
-    layout::PitchLinearCoord warp_offset{warp_idx % WarpCount::kContiguous,
-                                         warp_idx / WarpCount::kContiguous};
-
-    // Compute per-lane offset
-    layout::PitchLinearCoord thread_offset_in_warp{
-        lane_idx * kElementsPerAccess, 0};
-
-    layout::PitchLinearCoord thread_offset_in_threadblock_tile =
-        warp_footprint * warp_offset + thread_offset_in_warp;
-
-    return thread_offset_in_threadblock_tile;
-  }
-};
-
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Template metaprogram for partitioning a 4D interleaved layout across warps
-/// to achieve several performance objectives:
-///
-///   - coalesced memory accesses in units of 64 Byte lines
-///   - minimal address arithmetic
-///   - minimal predicate calculations
-///
-template <typename WarpCount_, typename Iterations_, int Threads,
-          int ElementsPerAccess, int ElementSize>
-struct InterleavedConvOutputTileThreadMap {
-  using WarpCount = WarpCount_;
-
-  static int const kWarpSize = 32;
-  static int const kThreads = Threads;
-  static int const kWarpCount = kThreads / kWarpSize;
-
-  static int const kElementsPerAccess = ElementsPerAccess;
-  static int const kElementSize = ElementSize;
-
-  //
-  // Metaprogram computation
-  //
-
-  struct Detail {};
-
-  //
-  // Output
-  //
-
-  using Iterations = Iterations_;
-
-  using Delta = MatrixShape<kWarpSize / 4, 4 * kElementsPerAccess>;
-
-  /// Initial offset function
-  CUTLASS_HOST_DEVICE
-  static MatrixCoord initial_offset(int thread_idx) {
-    int warp_idx = thread_idx / kWarpSize;
-    int lane_idx = thread_idx % kWarpSize;
-
-    // Compute warp location
-    MatrixCoord warp_footprint{
-        Delta::kRow * Iterations::kRow,
-        Delta::kColumn * Iterations::kColumn,
-    };
-
-    MatrixCoord warp_offset{warp_idx % WarpCount::kRow,
-                            warp_idx / WarpCount::kRow};
-
-    // Compute per-lane offset
-    MatrixCoord thread_offset_in_warp{lane_idx / 4,
-                                      (lane_idx % 4) * kElementsPerAccess};
-
-    MatrixCoord thread_offset_in_threadblock_tile =
-        warp_footprint * warp_offset + thread_offset_in_warp;
-
-    return thread_offset_in_threadblock_tile;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h
deleted file mode 100755
index 9943ea256..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h
+++ /dev/null
@@ -1,1387 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
-
-  The epilogue rearranges the result of a matrix product through shared memory to match canonical
-  tensor layouts in global memory. Epilogues support conversion and reduction operations.
-
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/permute.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/transform/pitch_linear_thread_map.h"
-#include "cutlass/epilogue/threadblock/output_tile_thread_map.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/arch/memory.h"
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator_params.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-#include "cutlass/conv/conv3d_problem_size.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace epilogue {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile iterator used to load and store output tile from global memory in epilogue.
-///
-/// Satisfies: ReadableTileIterator | PredicatedTileIterator | ForwardTileIterator
-///
-template <
-  typename ThreadMap_,       ///< Thread map (conept: OutputTileThreadMap)
-  typename Element_,         ///< Element data type
-  bool ScatterD = false,     ///< Scatter D operand or not
-  typename PermuteDLayout = layout::NoPermute, ///< Permute D operand or not
-  bool UseCUDAStore = false
->
-class PredicatedTileIterator {
-public:
-  using ThreadMap = ThreadMap_;
-  using Shape = typename ThreadMap::Shape;
-
-  using Element = Element_;
-
-  using Layout = layout::RowMajor;
-  using TensorRef = TensorRef<Element, Layout>;
-  using ConstTensorRef = typename TensorRef::ConstTensorRef;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  using TensorCoord = MatrixCoord;
-
-  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
-  static int const kThreads = ThreadMap::kThreads;
-  static int const kIterations = ThreadMap::Count::kTile;
-
-  static bool constexpr PermuteD = !layout::is_trivial_permute<PermuteDLayout>;
-
-  static_assert( ThreadMap::Iterations::kRow > 0,"ThreadMap::Iterations::kRow must be > 0");
-  static_assert( ThreadMap::Iterations::kGroup > 0,"ThreadMap::Iterations::kGroup must be > 0");
-  static_assert( ThreadMap::Iterations::kCluster > 0,"ThreadMap::Iterations::kCluster must be > 0");
-  static_assert( ThreadMap::Iterations::kColumn > 0,"ThreadMap::Iterations::kColumn must be > 0");
-
-  /// Fragment object
-  using Fragment = Array<
-    Element,
-    ThreadMap::Iterations::kColumn *
-    ThreadMap::Iterations::kRow *
-    ThreadMap::Iterations::kGroup *
-    ThreadMap::Iterations::kCluster * ThreadMap::kElementsPerAccess>;
-
-  /// Memory access size
-  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
-
-  //
-  // Parameters struct
-  //
-
-  /// Uses a non-template class
-  struct Params : PredicatedTileIteratorParams {
-    using Base = PredicatedTileIteratorParams;
-
-    CUTLASS_HOST_DEVICE
-    Params() { }
-
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout):
-      PredicatedTileIteratorParams(
-        layout.stride(0) * int(sizeof(AccessType)) / kElementsPerAccess,
-        make_OutputTileThreadMapDesc<ThreadMap>()
-      ) 
-    { }
-
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout,
-           // Not needed.  Added to be compatible with strided conv epilogue.
-           cutlass::Tensor4DCoord const &tensor_extent):
-      Params(layout)
-    { }
-
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout,
-           // Not needed.  Added to be compatible with strided conv epilogue.
-           cutlass::Tensor5DCoord const &tensor_extent):
-      Params(layout)
-    { }
-
-    CUTLASS_HOST_DEVICE
-    Params(Base const &base) : 
-      Base(base) { }
-  };
-
-  /// Mask object
-  struct Mask {
-
-    static int const kCount = ThreadMap::Iterations::kColumn;
-
-    /// Predicate state
-    bool predicates[kCount];
-
-    //
-    // Mask
-    //
-    CUTLASS_HOST_DEVICE
-    Mask() {
-      enable();
-    }
-
-    ///< Efficiently disables all accesses guarded by mask
-    CUTLASS_HOST_DEVICE void clear() {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kCount; ++i) {
-        predicates[i] = false;
-      }
-    }
-
-    ///< CUTLASS_HOST_DEVICE enables all accesses guarded by mask
-    CUTLASS_DEVICE void enable() {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kCount; ++i) {
-        predicates[i] = true;
-      }
-    }
-  };
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Parameters structure containing reference and precomputed state.
-  PredicatedTileIteratorParams params_;
-
-  /// Byte-level pointer. This pointer is usually for both load() and store(), unless PermuteD is performed. When having PermuteD, byte_pointer_ is only for load().
-  uint8_t *byte_pointer_;
-
-  /// Byte-level pointer for store(). Due to PermuteD Op, store_byte_pointer_ may be with different address computation compared to byte_pointer_.
-  uint8_t *store_byte_pointer_;
-
-  /// Array of boolean values to contain steady-state predicates
-  Mask mask_;
-
-  /// Extent of the matrix tile in rows
-  Index extent_row_;
-
-  /// Extent of the matrix tile in rows
-  Index extent_column_;
-
-  /// A thread's starting row position (assuming steady-state predicates have been computed)
-  Index thread_start_row_;
-
-  /// A thread's starting column
-  Index thread_start_column_;
-
-  /// Internal state counter
-  int state_[3];
-
-  /// Scatter indices
-  int const *indices_;
-
-  /// PermuteDLayout
-  PermuteDLayout permute_layout_;
-
-  //
-  // Static asserts about internal strides
-  //
-
-  static_assert(sizeof(extent_row_) == 4, "Expected 32b extents");
-  static_assert(sizeof(thread_start_row_) == 4, "Expected 32b extents");
-  static_assert(sizeof(PredicatedTileIteratorParams::stride) == 8, "Expected 64b strides");
-
-private:
-
-  //
-  // Methods
-  //
-
-public:
-
-  //
-  // Methods
-  //
-
-  /// Constructor
-  CUTLASS_DEVICE
-  PredicatedTileIterator(
-    PredicatedTileIteratorParams const & params,
-    Element *pointer,
-    TensorCoord extent,
-    int thread_idx,
-    TensorCoord threadblock_offset = TensorCoord(),
-    int const *indices = nullptr
-  ): 
-    params_(params), indices_(indices),
-    permute_layout_(PitchLinearCoord(extent.column(), extent.row()), params_.stride * kElementsPerAccess / sizeof(AccessType))
-  {
-
-    TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx) + threadblock_offset;
-
-    extent_row_ = extent.row();
-    extent_column_ = extent.column();
-
-    thread_start_row_ = thread_offset.row();
-    thread_start_column_ = thread_offset.column();
-
-    // Initialize predicates
-    CUTLASS_PRAGMA_UNROLL
-    for (int c = 0; c < ThreadMap::Iterations::kColumn; ++c) {
-
-      mask_.predicates[c] = ((thread_offset.column()
-        + ThreadMap::Delta::kColumn * c) < extent.column());
-    }
-
-    // Null pointer performs no accesses
-    if (!pointer) {
-      mask_.clear();
-    }
-
-    if (ScatterD && !indices) {
-      mask_.clear();
-    }
-
-    // Initialize byte_pointer_
-    byte_pointer_ = reinterpret_cast<uint8_t *>(pointer) +
-      LongIndex(thread_offset.row()) * LongIndex(params_.stride) +
-      LongIndex(thread_offset.column()) * sizeof(AccessType) / kElementsPerAccess;
-
-    if (ScatterD) {
-      byte_pointer_ = reinterpret_cast<uint8_t *>(pointer) +
-        LongIndex(thread_offset.column()) * sizeof(AccessType) / kElementsPerAccess;
-    }
-
-    // store_byte_pointer_ is set to be the same with byte_pointer_ unless PermuteD is used.
-    store_byte_pointer_ = PermuteD ? reinterpret_cast<uint8_t *>(pointer) : byte_pointer_;
-
-    // Initialize internal state counter
-    state_[0] = state_[1] = state_[2] = 0;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    store_byte_pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
-    byte_pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_byte_offset(Fragment &frag, int64_t byte_offset) const {
-
-    uint8_t *byte_pointer = byte_pointer_;
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
-
-          int frag_row_idx =
-            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
-
-          int row_offset = row * ThreadMap::Delta::kRow 
-            + group * ThreadMap::Delta::kGroup 
-            + cluster * ThreadMap::Delta::kCluster;
-
-          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
-
-          AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset);
-
-          if (ScatterD && row_guard) {
-            assert(indices_);
-
-            memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset +
-              LongIndex(indices_[row_offset + thread_start_row_]) * LongIndex(params_.stride));
-          }
-
-          CUTLASS_PRAGMA_UNROLL
-          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
-
-            bool guard = row_guard && mask_.predicates[column];
-
-            cutlass::arch::global_load<
-              AccessType,
-              sizeof(AccessType)
-            >(
-                frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn +
-                         column],
-                (void *)&memory_pointer[column * ThreadMap::Delta::kColumn /
-                                        kElementsPerAccess],
-                guard);
-          }
-
-          if (row + 1 < ThreadMap::Iterations::kRow) {
-            if (!ScatterD) {
-              byte_pointer += params_.increment_row;
-            }
-          }
-        }
-
-        if (group + 1 < ThreadMap::Iterations::kGroup) {
-          byte_pointer += params_.increment_group;
-        }
-      }
-
-      if (cluster + 1 < ThreadMap::Iterations::kCluster) {
-        byte_pointer += params_.increment_cluster;
-      }
-    }
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) const {
-
-    load_with_byte_offset(frag, 0);
-  }
-
-  /// Stores a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_byte_offset(Fragment const &frag, int64_t byte_offset) const {
-    uint8_t *byte_pointer = store_byte_pointer_;
-    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
-
-          int frag_row_idx =
-            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
-
-          int row_offset = row * ThreadMap::Delta::kRow
-            + group * ThreadMap::Delta::kGroup
-            + cluster * ThreadMap::Delta::kCluster;
-
-          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
-
-          AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset);
-
-          if (ScatterD && row_guard) {
-            assert(indices_);
-
-            memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset +
-              LongIndex(indices_[row_offset + thread_start_row_]) * LongIndex(params_.stride));
-          }
-
-          CUTLASS_PRAGMA_UNROLL
-          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
-
-            bool guard = row_guard && mask_.predicates[column];
-            
-            if (PermuteD) {
-
-              int col_offset = column * ThreadMap::Delta::kColumn;
-
-              int col = col_offset + thread_start_column_;
-              int row = row_offset + thread_start_row_;
-
-              // Locate memory_pointer
-              memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset
-                 + permute_layout_(PitchLinearCoord(col, row)) * sizeof(AccessType) / kElementsPerAccess);
-            }
-
-            if (UseCUDAStore) {
-              if (guard) {
-                memory_pointer[0] =
-                    frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column];
-              }
-            } else {
-              cutlass::arch::global_store<AccessType, sizeof(AccessType)>(
-                  frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column],
-                  (void *)&memory_pointer[0],
-                  guard);
-            }
-
-            if (!PermuteD) {
-              memory_pointer += (ThreadMap::Delta::kColumn / kElementsPerAccess);
-            }
-          }
-
-          if (row + 1 < ThreadMap::Iterations::kRow) {
-            if (!ScatterD && !PermuteD) {
-              byte_pointer += params_.increment_row;
-            }
-          }
-        }
-
-        if (group + 1 < ThreadMap::Iterations::kGroup) {
-          if (!ScatterD && !PermuteD) {
-            byte_pointer += params_.increment_group;
-          }
-        }
-      }
-
-      if (cluster + 1 < ThreadMap::Iterations::kCluster) {
-        if (!ScatterD && !PermuteD) {
-          byte_pointer += params_.increment_cluster;
-        }
-      }
-    }
-  }
-
-  /// Stores a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) const {
-
-    store_with_byte_offset(frag, 0);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void downsample_load_with_byte_offset(Fragment &frag, int64_t byte_offset, int convolution_P, int convolution_Q, int add_P, int add_Q, int problem_N) const {
-
-    uint8_t *byte_pointer = byte_pointer_;
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
-
-          int frag_row_idx = 
-            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
-
-          int row_offset = row * ThreadMap::Delta::kRow 
-            + group * ThreadMap::Delta::kGroup 
-            + cluster * ThreadMap::Delta::kCluster;
-
-          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
-
-          int output_row = row_offset + thread_start_row_;
-          int output_N = output_row / (convolution_P * convolution_Q);
-          int output_PQ = output_row % (convolution_P * convolution_Q);
-          int output_P = output_PQ / convolution_Q;
-          int output_Q = output_PQ % convolution_Q;
-
-          int input_row = output_N * 2 * convolution_P * 2 * convolution_Q +
-            (2 * output_P + add_P) * 2 * convolution_Q + 2 * output_Q + add_Q;
-
-          int64_t byte_offset = (input_row-output_row)*problem_N*sizeof(float);
-
-          AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset);
-
-          CUTLASS_PRAGMA_UNROLL
-          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
-
-            bool guard = row_guard && mask_.predicates[column];
-
-            cutlass::arch::global_load<
-              AccessType, 
-              sizeof(AccessType)
-            >(
-                frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn +
-                         column],
-                (void *)&memory_pointer[column * ThreadMap::Delta::kColumn /
-                                        kElementsPerAccess],
-                guard);
-          }
-
-          if (row + 1 < ThreadMap::Iterations::kRow) {
-            byte_pointer += params_.increment_row;
-          }
-        }
-
-        if (group + 1 < ThreadMap::Iterations::kGroup) {
-          byte_pointer += params_.increment_group;
-        }
-      }
-
-      if (cluster + 1 < ThreadMap::Iterations::kCluster) {
-        byte_pointer += params_.increment_cluster;
-      }
-    }
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void upsample_load_with_byte_offset(Fragment &frag, int64_t byte_offset, int convolution_P, int convolution_Q, int add_P, int add_Q, int problem_N) const {
-
-    uint8_t *byte_pointer = byte_pointer_;
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
-
-          int frag_row_idx = 
-            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
-
-          int row_offset = row * ThreadMap::Delta::kRow 
-            + group * ThreadMap::Delta::kGroup 
-            + cluster * ThreadMap::Delta::kCluster;
-
-          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
-
-          int output_row = row_offset + thread_start_row_;
-          int output_N = output_row / (convolution_P * convolution_Q);
-          int output_PQ = output_row % (convolution_P * convolution_Q);
-          int output_P = output_PQ / convolution_Q;
-          int output_Q = output_PQ % convolution_Q;
-          int row_add_P = add_P;
-          int row_add_Q = add_Q;
-	  if (output_P > convolution_P - 2) row_add_P = 0;
-	  if (output_Q > convolution_Q - 2) row_add_Q = 0;
-
-          int input_row = output_N * (convolution_P/2) * (convolution_Q/2) +
-            ((output_P + row_add_P)/2) * (convolution_Q/2) + (output_Q + row_add_Q)/2;
-
-          int64_t byte_offset = (input_row-output_row)*problem_N*sizeof(float);
-
-          AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset);
-
-          CUTLASS_PRAGMA_UNROLL
-          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
-
-            bool guard = row_guard && mask_.predicates[column];
-
-            cutlass::arch::global_load<
-              AccessType, 
-              sizeof(AccessType)
-            >(
-                frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn +
-                         column],
-                (void *)&memory_pointer[column * ThreadMap::Delta::kColumn /
-                                        kElementsPerAccess],
-                guard);
-          }
-
-          if (row + 1 < ThreadMap::Iterations::kRow) {
-            byte_pointer += params_.increment_row;
-          }
-        }
-
-        if (group + 1 < ThreadMap::Iterations::kGroup) {
-          byte_pointer += params_.increment_group;
-        }
-      }
-
-      if (cluster + 1 < ThreadMap::Iterations::kCluster) {
-        byte_pointer += params_.increment_cluster;
-      }
-    }
-  }
-
-  CUTLASS_DEVICE
-  MatrixCoord thread_start() const {
-    return MatrixCoord(thread_start_row_, thread_start_column_);
-  }
-
-  /// Need to get the thread start row from the tile iterator
-  CUTLASS_DEVICE
-  int32_t thread_start_row() const {
-    return thread_start_row_;
-  }
-
-  /// Need to get the thread start row from the tile iterator
-  CUTLASS_DEVICE
-  int32_t thread_start_column() const {
-    return thread_start_column_;
-  }
-
-  /// Extent of the matrix in rows
-  CUTLASS_DEVICE
-  Index extent_row() const {
-    return extent_row_;
-  }
-
-  /// Extent of the matrix in columns
-  CUTLASS_DEVICE
-  Index extent_column() const {
-    return extent_column_;
-  }
-
-  /// Advances to the next position to load or store
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator &operator++() {
-
-    ++state_[0];
-
-    if (!ScatterD) {
-      byte_pointer_ += params_.advance_row;
-    }
-
-    if (!ScatterD && !PermuteD) {
-      store_byte_pointer_ += params_.advance_row;
-    }
-
-    thread_start_row_ += ThreadMap::Shape::kRow;
-
-    if (state_[0] == ThreadMap::Count::kRow) {
-
-      state_[0] = 0;
-      ++state_[1];
-
-      if (!ScatterD) {
-        byte_pointer_ += params_.advance_group;
-      }
-
-      if (!ScatterD && !PermuteD) {
-        store_byte_pointer_ += params_.advance_group;
-      }
-
-      thread_start_row_ += (ThreadMap::Shape::kGroup - 1) *
-        ThreadMap::Shape::kRow * ThreadMap::Count::kRow;
-
-      if (state_[1] == ThreadMap::Count::kGroup) {
-
-        state_[1] = 0;
-        ++state_[2];
-
-        if (!ScatterD) {
-          byte_pointer_ += params_.advance_cluster;
-        }
-
-        if (!ScatterD && !PermuteD) {
-          store_byte_pointer_ += params_.advance_cluster;
-        }
-
-        thread_start_row_ += ThreadMap::Count::kGroup *
-          ThreadMap::Shape::kGroup * ThreadMap::Count::kRow * ThreadMap::Shape::kRow;
-
-        if (state_[2] == ThreadMap::Count::kCluster) {
-          state_[2] = 0;
-
-          if (!ScatterD) {
-            byte_pointer_ += params_.advance_tile;
-          }
-
-          if (!ScatterD && !PermuteD) {
-            store_byte_pointer_ += params_.advance_tile;
-          }
-
-          thread_start_row_ += ThreadMap::Shape::kGroup * ThreadMap::Shape::kRow
-            * ThreadMap::Shape::kCluster * ThreadMap::Shape::kTile;
-        }
-      }
-    }
-
-    return *this;
-  }
-
-  /// Advances a number of positions to load or store
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator &operator+=(int increment)
-  {
-    // Row
-    state_[0] += increment;
-    int increment_row = state_[0] / ThreadMap::Count::kRow;
-    state_[0] = state_[0] % ThreadMap::Count::kRow;
-
-    byte_pointer_ += (params_.advance_row * increment);
-    store_byte_pointer_ += (params_.advance_row * increment);
-    thread_start_row_ += (ThreadMap::Shape::kRow * increment);
-
-    // Group
-    state_[1] += increment_row;
-    int increment_group = state_[1] / ThreadMap::Count::kGroup;
-    state_[1] = state_[1] % ThreadMap::Count::kGroup;
-
-    byte_pointer_ += (params_.advance_group * increment_row);
-    store_byte_pointer_ += (params_.advance_group * increment_row);
-    thread_start_row_ +=
-        (ThreadMap::Shape::kGroup - 1) *
-        ThreadMap::Shape::kRow *
-        ThreadMap::Count::kRow *
-        increment_row;
-
-
-    // Cluster
-    state_[2] += increment_group;
-    int increment_cluster = state_[2] / ThreadMap::Count::kCluster;
-    state_[2] = state_[2] % ThreadMap::Count::kCluster;
-
-    byte_pointer_ += (params_.advance_cluster * increment_group);
-    store_byte_pointer_ += (params_.advance_cluster * increment_group);
-    thread_start_row_ +=
-        ThreadMap::Count::kGroup *
-        ThreadMap::Shape::kGroup *
-        ThreadMap::Count::kRow *
-        ThreadMap::Shape::kRow *
-        increment_group;
-
-    // Tile
-    byte_pointer_ += (params_.advance_tile * increment_cluster);
-    store_byte_pointer_ += (params_.advance_tile * increment_cluster);
-    thread_start_row_ +=
-        ThreadMap::Shape::kGroup *
-        ThreadMap::Shape::kRow *
-        ThreadMap::Shape::kCluster *
-        ThreadMap::Shape::kTile *
-        increment_cluster;
-
-    return *this;
-  }
-
-  ///< Efficiently disables all accesses guarded by mask
-  CUTLASS_DEVICE void clear_mask() {
-    mask_.clear();
-  }
-
-  ///< Efficiently enables all accesses guarded by mask
-  CUTLASS_DEVICE void enable_mask() {
-    mask_.enable();
-  }
-
-  ///< Sets the mask
-  CUTLASS_DEVICE void get_mask(Mask &mask) const {
-    mask = mask_;
-  }
-
-  ///< Sets the mask
-  CUTLASS_DEVICE void set_mask(Mask const &mask) {
-    mask_ = mask;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile iterator used to load output tile from global memory in epilogue.
-///
-/// Satisfies: ReadableTileIterator | InterleavedPredicatedTileIterator | ForwardTileIterator
-///
-template <
-  typename ThreadMap_,       ///< Thread map (conept: OutputTileThreadMap)
-  typename Element_,         ///< Element data type
-  int InterleavedN           ///< Number of Interleaved N 
->
-class InterleavedPredicatedTileIterator {
-public:
-  using ThreadMap = ThreadMap_;
-
-  using Element = Element_;
-
-  using Layout = layout::ColumnMajorInterleaved<InterleavedN>;
-  using TensorRef = TensorRef<Element, Layout>;
-  using ConstTensorRef = typename TensorRef::ConstTensorRef;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  using TensorCoord = layout::PitchLinearCoord;
-
-  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
-  static int const kThreads = ThreadMap::kThreads;
-  static int const kIterations = ThreadMap::Iterations::kCount;
-
-  /// Fragment object
-  using Fragment = Array<Element, ThreadMap::kElementsPerAccess>;
-
-  /// Memory access size
-  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
-
-  /// Uses a non-template class
-  struct Params : InterleavedPredicatedTileIteratorParams {
-    using Base = InterleavedPredicatedTileIteratorParams;
-
-    CUTLASS_HOST_DEVICE
-    Params() { }
-
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout): 
-      Base(
-        layout.stride(0) * int(sizeof(AccessType)) / kElementsPerAccess,
-        make_InterleavedPredicatedTileIteratorDesc<Element, ThreadMap>()
-      ) { }
-
-    CUTLASS_HOST_DEVICE
-    Params(Base const &base) : 
-      Base(base) { }
-  };
-
-  /// Mask object
-  struct Mask {
-    static int const kCount = (ThreadMap::Iterations::kContiguous < 8)
-                                  ? 8
-                                  : ThreadMap::Iterations::kContiguous;
-
-    /// Predicate state
-    bool predicates[kCount];
-
-    //
-    // Mask
-    //
-    CUTLASS_HOST_DEVICE
-    Mask() {
-      enable();
-    }
-
-    ///< Efficiently disables all accesses guarded by mask
-    CUTLASS_HOST_DEVICE void clear() {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kCount; ++i) {
-        predicates[i] = false;
-      }
-    }
-
-    ///< CUTLASS_HOST_DEVICE enables all accesses guarded by mask
-    CUTLASS_DEVICE void enable() {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kCount; ++i) {
-        predicates[i] = true;
-      }
-    }
-  };
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Parameters structure containing reference and precomputed state.
-  Params params_;
-
-  /// Byte-level pointer
-  uint8_t *byte_pointer_;
-
-  /// Array of boolean values to contain steady-state predicates
-  Mask mask_;
-
-  /// Extent of the matrix tile in columns
-  Index extent_col_;
-
-  /// A thread's starting column position (assuming steady-state predicates have
-  /// been computed)
-  Index thread_start_col_;
-
-  /// Internal iteration counter
-  int iteration_contiguous_;
-
-  int iteration_strided_;
-
-private:
-
-  //
-  // Methods
-  //
-
-public:
-
-  //
-  // Methods
-  //
-
-  /// Constructor
-  CUTLASS_DEVICE
-  InterleavedPredicatedTileIterator(
-    Params const & params,
-    Element *pointer,
-    TensorCoord extent,
-    int thread_idx,
-    TensorCoord threadblock_offset,
-    int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
-  ):
-    params_(params) {
-    TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx) +
-                                TensorCoord(threadblock_offset.contiguous() * InterleavedN,
-                                 threadblock_offset.strided() / InterleavedN);
-
-    extent_col_ = extent.strided() / InterleavedN;
-    thread_start_col_ = thread_offset.strided();
-
-    // Initialize predicates
-    CUTLASS_PRAGMA_UNROLL
-    for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-      mask_.predicates[c] =
-          ((thread_offset.contiguous() + ThreadMap::Delta::kContiguous * c) <
-           (extent.contiguous() * InterleavedN));
-    }
-
-    // Initialize pointer
-    byte_pointer_ = reinterpret_cast<uint8_t *>(pointer) + 
-      LongIndex(thread_offset.strided()) * LongIndex(params_.stride) + 
-      LongIndex(thread_offset.contiguous()) * sizeof(AccessType) / kElementsPerAccess;
-
-    // Initialize internal state counter
-    iteration_contiguous_ = iteration_strided_ = 0;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    byte_pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) {
-
-    uint8_t *byte_pointer = byte_pointer_;
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-    AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer);
-
-    int col_offset = iteration_strided_ * ThreadMap::Delta::kStrided;
-
-    bool col_guard = ((thread_start_col_ + col_offset) < extent_col_);
-
-    bool guard = col_guard && mask_.predicates[iteration_contiguous_];
-
-    cutlass::arch::global_load<
-      AccessType, 
-      sizeof(AccessType)
-    >(
-        *frag_ptr,
-        (void *)memory_pointer,
-        guard);
-  }
-
-  /// Stores a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) {
-    uint8_t *byte_pointer = byte_pointer_;
-    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
-    AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer);
-
-    int col_offset = iteration_strided_ * ThreadMap::Delta::kStrided;
-
-    bool col_guard = ((thread_start_col_ + col_offset) < extent_col_);
-
-    bool guard = col_guard && mask_.predicates[iteration_contiguous_];
-
-    cutlass::arch::global_store<AccessType, sizeof(AccessType)>(
-        *frag_ptr, (void *)memory_pointer, guard);
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int iteration) {
-    iteration_contiguous_ = iteration % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = iteration / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Advances to the next position to load or store
-  CUTLASS_HOST_DEVICE
-  InterleavedPredicatedTileIterator &operator++() {
-
-    ++iteration_contiguous_;
-    byte_pointer_ += params_.advance_row;
-
-    if (iteration_contiguous_ == ThreadMap::Iterations::kContiguous) {
-
-      iteration_contiguous_ = 0;
-      ++iteration_strided_;
-      byte_pointer_ += params_.advance_column;
-
-      if (iteration_strided_ == ThreadMap::Iterations::kStrided) {
-        iteration_strided_ = 0;
-      }
-    }
-
-    return *this;
-  }
-
-  /// Advances a number of positions to load or store
-  CUTLASS_HOST_DEVICE
-  InterleavedPredicatedTileIterator &operator+=(int increment)
-  {
-    // Contiguous
-    iteration_contiguous_ += increment;
-    int increment_strided = iteration_contiguous_ / ThreadMap::Iterations::kContiguous;
-    iteration_contiguous_ = iteration_contiguous_ % ThreadMap::Iterations::kContiguous;
-    byte_pointer_ += (params_.advance_row * increment);
-
-    // Strided
-    iteration_strided_ += increment_strided;
-    byte_pointer_ += (params_.advance_column * increment_strided);
-
-    return *this;
-  }
-
-  ///< Efficiently disables all accesses guarded by mask
-  CUTLASS_DEVICE void clear_mask() {
-    mask_.clear();
-  }
-
-  ///< Efficiently enables all accesses guarded by mask
-  CUTLASS_DEVICE void enable_mask() {
-    mask_.enable();
-  }
-
-  ///< Sets the mask
-  CUTLASS_DEVICE void get_mask(Mask &mask) {
-    mask = mask_;
-  }
-
-  ///< Sets the mask
-  CUTLASS_DEVICE void set_mask(Mask const &mask) {
-    mask_ = mask;
-  }
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-/// Tile iterator used to load output tile from global memory in epilogue.
-///
-/// Satisfies: ReadableTileIterator | InterleavedMaskedTileIterator | ForwardTileIterator
-///
-template <
-  typename ThreadMap_,       ///< Thread map (conept: OutputTileThreadMap)
-  typename Element_,         ///< Element data type
-  int InterleavedN           ///< Number of Interleaved N
->
-class InterleavedConvPredicatedTileIterator {
-public:
-  using ThreadMap = ThreadMap_;
-
-  using Element = Element_;
-
-  using Layout = layout::TensorNCxHWx<InterleavedN>;
-  using TensorRef = TensorRef<Element, Layout>;
-  using ConstTensorRef = typename TensorRef::ConstTensorRef;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  using TensorCoord = Tensor4DCoord;
-
-  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
-  static int const kThreads = ThreadMap::kThreads;
-  static int const kIterations = ThreadMap::Iterations::kCount;
-
-  /// Fragment object
-  using Fragment = Array<Element, ThreadMap::kElementsPerAccess>;
-
-  /// Memory access size
-  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
-
-  //
-  // Parameters struct
-  //
-
-  struct Params {
-
-    //
-    // Data members
-    //
-
-    LongIndex stride_col;           ///< stride in bytes between columns
-    LongIndex stride_row;           ///< stride in bytes between rows
-
-    //
-    // Methods
-    //
-
-    CUTLASS_HOST_DEVICE
-    Status initialize(typename Layout::Stride stride_) {
-      stride_col = stride_[1];
-      stride_row = stride_[2];
-
-      return Status::kSuccess;
-    }
-
-    CUTLASS_HOST_DEVICE
-    Params() {
-      initialize(cutlass::make_Coord(0, 0, 0));
-    }
-
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout) {
-
-      initialize(layout.stride());
-    }
-
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout,
-           // Not needed.  Added to be compatible with strided conv epilogue.
-           cutlass::Tensor4DCoord const &tensor_extent):
-      Params(layout)
-    { }
-
-  };
-
-  /// Mask object
-  struct Mask {
-    static int const kCount =
-        (ThreadMap::Iterations::kRow < 8) ? 8 : ThreadMap::Iterations::kRow;
-
-    /// Predicate state
-    bool predicates[kCount];
-
-    //
-    // Mask
-    //
-    CUTLASS_HOST_DEVICE
-    Mask() {
-      enable();
-    }
-
-    ///< Efficiently disables all accesses guarded by mask
-    CUTLASS_HOST_DEVICE void clear() {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kCount; ++i) {
-        predicates[i] = false;
-      }
-    }
-
-    ///< CUTLASS_HOST_DEVICE enables all accesses guarded by mask
-    CUTLASS_DEVICE void enable() {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kCount; ++i) {
-        predicates[i] = true;
-      }
-    }
-  };
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Parameters structure containing reference and precomputed state.
-  Params params_;
-
-  /// Byte-level pointer
-  uint8_t *byte_pointer_;
-
-  /// Array of boolean values to contain steady-state predicates
-  Mask mask_;
-
-  /// Extent of the matrix tile in columns
-  Index extent_col_;
-
-  /// Extent of the matrix tile in rows
-  Index extent_row_;
-
-  /// Extent of the matrix tile in pq 
-  Index extent_pq_;
-
-  /// A thread's starting row position (assuming steady-state predicates have
-  /// been computed)
-  Index thread_start_row_;
-
-  /// A thread's starting column position (assuming steady-state predicates have
-  /// been computed)
-  Index thread_start_col_;
-
-  /// Internal iteration counter
-  LongIndex iteration_row_;
-  LongIndex iteration_col_;
-
-  uint32_t pq_mul_;
-
-  uint32_t pq_shr_;
-
-private:
-
-  //
-  // Methods
-  //
-
-public:
-
-  //
-  // Methods
-  //
-
-  /// Constructor
-  CUTLASS_DEVICE
-  InterleavedConvPredicatedTileIterator(
-    Params const & params,
-    Element *pointer,
-    TensorCoord extent,
-    int thread_idx,
-    MatrixCoord threadblock_offset
-  ):
-    params_(params) {
-    MatrixCoord thread_offset = ThreadMap::initial_offset(thread_idx) + threadblock_offset;
-                                
-    extent_col_ = extent.c();
-    extent_pq_ = extent.h() * extent.w();
-    extent_row_ = extent.n() * extent_pq_;
-
-    find_divisor(pq_mul_, pq_shr_, extent_pq_);
-
-    thread_start_row_ = thread_offset.row();
-    thread_start_col_ = thread_offset.column();
-
-    // Initialize predicates
-    CUTLASS_PRAGMA_UNROLL
-    for (int r = 0; r < ThreadMap::Iterations::kRow; ++r) {
-      mask_.predicates[r] =
-          ((thread_offset.row() + ThreadMap::Delta::kRow * r) < extent_row_);
-    }
-
-    // Initialize pointer
-    byte_pointer_ = reinterpret_cast<uint8_t *>(pointer) +
-                    ((thread_start_col_ / InterleavedN) * params_.stride_col +
-                     (thread_start_col_ % InterleavedN)) *
-                        sizeof_bits<Element>::value / 8;
-
-    // Initialize internal state counter
-    iteration_row_ = iteration_col_ = 0;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    byte_pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) {
-
-    int col_offset = iteration_col_ * ThreadMap::Delta::kColumn;
-    bool col_guard = ((thread_start_col_ + col_offset) < extent_col_);
-    bool guard = col_guard && mask_.predicates[iteration_row_];
-
-    int n, pq_rem;
-
-    fast_divmod(n, pq_rem,
-                thread_start_row_ + iteration_row_ * ThreadMap::Delta::kRow,
-                extent_pq_, pq_mul_, pq_shr_);
-
-    uint8_t *byte_pointer =
-        byte_pointer_ + (n * params_.stride_row + pq_rem * InterleavedN) *
-                            sizeof_bits<Element>::value / 8;
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-    AccessType const *memory_pointer =
-        reinterpret_cast<AccessType const *>(byte_pointer);
-
-    cutlass::arch::global_load<
-      AccessType, 
-      sizeof(AccessType)
-    >(
-        *frag_ptr,
-        (void *)memory_pointer,
-        guard);
-  }
-
-  /// Stores a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) {
-
-    int col_offset = iteration_col_ * ThreadMap::Delta::kColumn;
-    bool col_guard = ((thread_start_col_ + col_offset) < extent_col_);
-    bool guard = col_guard && mask_.predicates[iteration_row_];
-
-    int n, pq_rem;
-
-    fast_divmod(n, pq_rem,
-                thread_start_row_ + iteration_row_ * ThreadMap::Delta::kRow,
-                extent_pq_, pq_mul_, pq_shr_);
-
-    uint8_t *byte_pointer =
-        byte_pointer_ + (n * params_.stride_row + pq_rem * InterleavedN) *
-                            sizeof_bits<Element>::value / 8;
-    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
-    AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer);
-
-    cutlass::arch::global_store<AccessType, sizeof(AccessType)>(
-        *frag_ptr, (void *)memory_pointer, guard);
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int iteration) {
-    iteration_row_ = iteration % ThreadMap::Iterations::kRow;
-    iteration_col_ = iteration / ThreadMap::Iterations::kRow;
-  }
-
-  /// Advances to the next position to load or store
-  CUTLASS_HOST_DEVICE
-  InterleavedConvPredicatedTileIterator &operator++() {
-
-    ++iteration_row_;
-
-    if (iteration_row_ == ThreadMap::Iterations::kRow) {
-
-      iteration_row_ = 0;
-      ++iteration_col_;
-      byte_pointer_ += params_.stride_col;
-
-      if (iteration_col_ == ThreadMap::Iterations::kColumn) {
-        iteration_col_ = 0;
-      }
-    }
-
-    return *this;
-  }
-
-  ///< Efficiently disables all accesses guarded by mask
-  CUTLASS_DEVICE void clear_mask() {
-    mask_.clear();
-  }
-
-  ///< Efficiently enables all accesses guarded by mask
-  CUTLASS_DEVICE void enable_mask() {
-    mask_.enable();
-  }
-
-  ///< Sets the mask
-  CUTLASS_DEVICE void get_mask(Mask &mask) {
-    mask = mask_;
-  }
-
-  ///< Sets the mask
-  CUTLASS_DEVICE void set_mask(Mask const &mask) {
-    mask_ = mask;
-  }
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_affine.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_affine.h
deleted file mode 100755
index 2b86ac0ea..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_affine.h
+++ /dev/null
@@ -1,615 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  
-  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
-
-  The epilogue rearranges the result of a matrix product through shared memory to match canonical
-  tensor layouts in global memory. Epilogues support conversion and reduction operations.
-
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/transform/pitch_linear_thread_map.h"
-#include "cutlass/epilogue/threadblock/output_tile_thread_map.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/arch/memory.h"
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator_params.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace epilogue {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile iterator used to load and store output tile from global memory in epilogue.
-///
-/// Satisfies: ReadableTileIterator | PredicatedTileIterator | ForwardTileIterator
-///
-/// It provides a fast path for the case Rank = 2 which does not need div/rem to 
-/// calculate modes.
-
-template <
-  typename ThreadMap_,       ///< Thread map (conept: OutputTileThreadMap)
-  typename Element_,         ///< Element data type
-  int Rank
->
-class PredicatedTileIteratorAffineRankN {
-public:
-  using ThreadMap = ThreadMap_;
-  using Shape = typename ThreadMap::Shape;
-
-  using Element = Element_;
-
-  using Layout = layout::AffineRankN<Rank>;
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using ConstTensorRef = typename TensorRef::ConstTensorRef;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
-  static int const kThreads = ThreadMap::kThreads;
-  static int const kIterations = ThreadMap::Count::kTile;
-
-  static_assert( ThreadMap::Iterations::kRow > 0,"ThreadMap::Iterations::kRow must be > 0");
-  static_assert( ThreadMap::Iterations::kGroup > 0,"ThreadMap::Iterations::kGroup must be > 0");
-  static_assert( ThreadMap::Iterations::kCluster > 0,"ThreadMap::Iterations::kCluster must be > 0");
-  static_assert( ThreadMap::Iterations::kColumn > 0,"ThreadMap::Iterations::kColumn must be > 0");
-  static_assert( !(Layout::kRank % 2), 
-    "Layout rank must be even. This assumes the first half of the modes correspond to the 'row' "
-    "and the second half of the modes correspond to the 'column'");
-
-  static bool const kBigEndian = false;
-
-  /// Fragment object
-  using Fragment = Array<
-    Element, 
-    ThreadMap::Iterations::kColumn * 
-    ThreadMap::Iterations::kRow * 
-    ThreadMap::Iterations::kGroup * 
-    ThreadMap::Iterations::kCluster * ThreadMap::kElementsPerAccess>;
-
-  /// Memory access size
-  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
-
-  //
-  // Parameters struct
-  //
-
-  /// Parameters structure
-  struct Params {
-
-    //
-    // Data members
-    //
-
-    Layout layout;
-
-    /// Stride in units of bytes along M modes
-    Coord<Layout::kRank/2, typename Layout::LongIndex> stride_m;
-
-    /// Stride in units of bytes along N modes
-    Coord<Layout::kRank/2, typename Layout::LongIndex> stride_n;
-
-    /// Fast divmod objects divided by tensor extents
-    FastDivmod divmod_m[(Layout::kRank == 2) ? 1 : (Layout::kRank/2 - 1)];
-
-    /// Fast divmod objects divided by tensor extents
-    FastDivmod divmod_n[(Layout::kRank == 2) ? 1 : (Layout::kRank/2 - 1)];
-
-    int64_t rank2_inc_col;
-    int64_t rank2_inc_row;
-
-    //
-    // Methods
-    //
-    CUTLASS_HOST_DEVICE
-    Params() { }
-
-    CUTLASS_HOST_DEVICE
-    Params(TensorCoord const &extent, Layout const &layout_): layout(layout_) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < Layout::kRank / 2; ++i) {
-        stride_m[i] = OffsetBytes<Element>(layout_.stride()[i]);
-        stride_n[i] = OffsetBytes<Element>(layout_.stride()[i + Layout::kRank / 2]);
-      }
-
-      if (kBigEndian) {
-        // "Big Endian" scheme
-        CUTLASS_PRAGMA_UNROLL
-        for (int i = 0; i < Layout::kRank / 2 - 1; ++i) {
-          divmod_m[i] = FastDivmod(extent[i + 1]);
-          divmod_n[i] = FastDivmod(extent[i + Layout::kRank / 2 + 1]);
-        }
-      }
-      else {
-        // "Little Endian" scheme
-        CUTLASS_PRAGMA_UNROLL
-        for (int i = 0; i < Layout::kRank / 2 - 1; ++i) {
-          divmod_m[i] = FastDivmod(extent[i]);
-          divmod_n[i] = FastDivmod(extent[i + Layout::kRank / 2]);
-        }
-      }
-
-      #if 0
-      //
-      // Debug print statements to verify extents and strides are passed correctly.
-      //
-      printf("PredicatedTileIteratorAffine::Params() entered\n");
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < Layout::kRank; ++i) {
-        printf("  extent[%d]: %d\n", i, extent[i]);
-      }
-      for (int i = 0; i < Layout::kRank; ++i) {
-        printf("  stride[%d]: %ld\n", i, layout_.stride()[i]);
-      }
-      printf("PredicatedTileIteratorAffine::Params() returning\n");
-      #endif
-    }
-
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout_): layout(layout_) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < Layout::kRank / 2; ++i) {
-        stride_m[i] = OffsetBytes<Element>(layout_.stride()[i]);
-        stride_n[i] = OffsetBytes<Element>(layout_.stride()[i + Layout::kRank / 2]);
-      }
-
-      rank2_inc_col = ThreadMap::Delta::kColumn * stride_n[0];
-      rank2_inc_row = ThreadMap::Delta::kRow * stride_m[0];
-    }
-  };
-
-  /// Mask object
-  struct Mask {
-
-    static int const kCount = ThreadMap::Iterations::kColumn;
-
-    /// Predicate state
-    bool predicates[kCount];
-
-    //
-    // Mask
-    //
-    CUTLASS_HOST_DEVICE
-    Mask() {
-      enable();
-    }
-
-    ///< Efficiently disables all accesses guarded by mask
-    CUTLASS_HOST_DEVICE void clear() {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kCount; ++i) {
-        predicates[i] = false;
-      }
-    }
-
-    ///< CUTLASS_HOST_DEVICE enables all accesses guarded by mask
-    CUTLASS_DEVICE void enable() {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kCount; ++i) {
-        predicates[i] = true;
-      }
-    }
-  };
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Parameters structure containing reference and precomputed state.
-  Params params_;
-
-  /// Byte-level pointer
-  uint8_t *byte_pointer_;
-
-  /// Array of boolean values to contain steady-state predicates
-  Mask mask_;
-
-  /// Extent of the matrix tile in rows
-  Index extent_row_;
-
-  /// Extent of the matrix tile in columns
-  Index extent_col_;
-
-  /// A thread's starting row position (assuming steady-state predicates have been computed)
-  Index thread_start_row_;
-
-  /// A thread's starting column position (assuming steady-state predicates have been computed)
-  Index thread_start_column_;
-
-  /// Internal state counter
-  int state_[3];
-
-  /// Offsets in columns, cached for performance
-  int64_t offset_modes_n_[ThreadMap::Iterations::kColumn];
- 
-  //
-  // Static asserts about internal strides
-  //
-
-  static_assert(sizeof(extent_row_) == 4, "Expected 32b extents");
-  static_assert(sizeof(thread_start_row_) == 4, "Expected 32b extents");
-
-private:
-
-  //
-  // Methods
-  //
-
-public:
-
-  //
-  // Methods
-  //
-
-  /// Constructor
-  CUTLASS_DEVICE
-  PredicatedTileIteratorAffineRankN(
-    Params const & params,
-    Element *pointer,
-    MatrixCoord extent,
-    int thread_idx,
-    MatrixCoord threadblock_offset = MatrixCoord(),
-    int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
-  ): 
-    params_(params)
-  {
-
-    MatrixCoord thread_offset = ThreadMap::initial_offset(thread_idx) + threadblock_offset;
-
-    extent_row_ = extent.row();
-    extent_col_ = extent.column();
-
-    thread_start_row_ = thread_offset.row();
-    thread_start_column_ = thread_offset.column();
-
-    if (Layout::kRank > 2) {
-      // Initialize predicates
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kColumn; ++c) {
-
-        // 
-        // Compute coordinate and decompose into N modes
-        //
-
-        int coord_n = thread_start_column_ + c * ThreadMap::Delta::kColumn;
-
-        mask_.predicates[c] = coord_n < extent.column();
-        
-        Coord<Layout::kRank / 2, Index> modes_n;
-
-        int64_t offset_modes_n = 0;
-
-        if (kBigEndian) {
-          modes_n = CoordinateDecomposition<Layout::kRank / 2>(coord_n, params_.divmod_n);
-
-          offset_modes_n = dot(modes_n, params_.stride_n);
-        }
-        else {
-          modes_n = CoordinateDecompositionLittleEndian<Layout::kRank / 2>(coord_n, params_.divmod_n);
-
-          offset_modes_n = dot(modes_n, params_.stride_n);
-        }
-
-        offset_modes_n_[c] = offset_modes_n;
-
-      }
-
-      if (!pointer) {
-        mask_.clear();
-      }
-    }
-
-    // Initialize pointer
-    byte_pointer_ = reinterpret_cast<uint8_t *>(pointer);
-
-    // Initialize internal state counter
-    state_[0] = state_[1] = state_[2] = 0;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    byte_pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_byte_offset(Fragment &frag, int64_t byte_offset) {
-    uint8_t const *byte_pointer = byte_pointer_;
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
-
-        int row_begin = thread_start_row_ + group * ThreadMap::Delta::kGroup + cluster * ThreadMap::Delta::kCluster;
-        int64_t offset_modes_m = row_begin * params_.stride_m[0];
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
-
-          int frag_row_idx = 
-            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
-
-          // 
-          // Compute coordinate and decompose into M modes
-          //
-
-          int coord_m = row * ThreadMap::Delta::kRow + row_begin;
-
-          Coord<Layout::kRank / 2, Index> modes_m;
-
-          if (Layout::kRank > 2) {
-            if (kBigEndian) {
-              modes_m = CoordinateDecomposition<Layout::kRank / 2>(coord_m, params_.divmod_m);
-            } else {
-              modes_m = CoordinateDecompositionLittleEndian<Layout::kRank / 2>(coord_m, params_.divmod_m);
-            }
-
-            offset_modes_m = dot(modes_m, params_.stride_m);
-          }
-
-          //
-          // Compute the offset due to modes M
-          //
-
-          bool row_guard = (coord_m < extent_row_);
-          int64_t offset_modes_n = thread_start_column_ * params_.stride_n[0];
-
-          CUTLASS_PRAGMA_UNROLL
-          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
-
-            // 
-            // Compute coordinate and decompose into N modes
-            //
-            
-            if (Layout::kRank > 2) {
-              offset_modes_n = offset_modes_n_[column];
-            }
-
-            //
-            // Compute the pointer and access
-            //
-            bool guard;
-
-            if (Layout::kRank > 2) {
-              guard = row_guard && mask_.predicates[column];
-            } else {
-              guard = (coord_m < extent_row_) && 
-              ((thread_start_column_ + ThreadMap::Delta::kColumn * column) < extent_col_);
-            }
-
-            cutlass::arch::global_load<
-              AccessType, 
-              sizeof(AccessType)
-            >(
-              frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column],
-              (void *)(byte_pointer + offset_modes_m + offset_modes_n + byte_offset),
-              guard
-            );
-
-            if (Layout::kRank == 2) {
-              offset_modes_n += params_.rank2_inc_col;
-            }
-          }
-
-          if (Layout::kRank == 2) {
-            offset_modes_m += params_.rank2_inc_row;
-          }
-        }
-      }
-    }
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) {
-
-    load_with_byte_offset(frag, 0);
-  }
-
-  /// Stores a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_byte_offset(Fragment const &frag, int64_t byte_offset) {
-    uint8_t *byte_pointer = byte_pointer_;
-    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
-
-        int row_begin = thread_start_row_ + group * ThreadMap::Delta::kGroup + cluster * ThreadMap::Delta::kCluster;
-        int64_t offset_modes_m = row_begin * params_.stride_m[0];
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
-
-          int frag_row_idx = 
-            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
-
-          // 
-          // Compute coordinate and decompose into M modes
-          //
-
-          int coord_m = row * ThreadMap::Delta::kRow + row_begin;
-
-          Coord<Layout::kRank / 2, Index> modes_m;
-
-          if (Layout::kRank > 2) {
-            if (kBigEndian) {
-              modes_m = CoordinateDecomposition<Layout::kRank / 2>(coord_m, params_.divmod_m);
-            } else {
-              modes_m = CoordinateDecompositionLittleEndian<Layout::kRank / 2>(coord_m, params_.divmod_m);
-            }
-
-            offset_modes_m = dot(modes_m, params_.stride_m);
-          }
-
-          //
-          // Compute the offset due to modes M
-          //
-
-          bool row_guard = (coord_m < extent_row_);
-          int64_t offset_modes_n = thread_start_column_ * params_.stride_n[0];
-
-          CUTLASS_PRAGMA_UNROLL
-          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
-
-            // 
-            // Compute coordinate and decompose into N modes
-            //
-            
-            if (Layout::kRank > 2) {
-              offset_modes_n = offset_modes_n_[column];
-            } 
-
-            //
-            // Compute the pointer and access
-            //
-            bool guard;
-            if (Layout::kRank > 2) {            
-              guard = row_guard && mask_.predicates[column];
-            } else {
-              guard = (coord_m < extent_row_) && ((thread_start_column_ + ThreadMap::Delta::kColumn * column) < extent_col_);
-            }
-
-            cutlass::arch::global_store<AccessType, sizeof(AccessType)>(
-                frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column],
-                (void *)(byte_pointer + offset_modes_m + offset_modes_n + byte_offset),
-                guard);
-
-            if (Layout::kRank == 2) {
-              offset_modes_n += params_.rank2_inc_col;
-            }
-          }
-
-          if (Layout::kRank == 2) {
-            offset_modes_m += params_.rank2_inc_row;
-          }
-        }
-      }
-    }
-  }
-
-  /// Stores a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) {
-
-    store_with_byte_offset(frag, 0);
-  }
-
-  /// Advances to the next position to load or store
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorAffineRankN &operator++() {
-
-    ++state_[0];
-    thread_start_row_ += ThreadMap::Shape::kRow;
-    
-    if (state_[0] == ThreadMap::Count::kRow) {
-
-      state_[0] = 0;
-      ++state_[1];
-
-      thread_start_row_ += (ThreadMap::Shape::kGroup - 1) * 
-        ThreadMap::Shape::kRow * ThreadMap::Count::kRow;
-
-      if (state_[1] == ThreadMap::Count::kGroup) {
-
-        state_[1] = 0;
-        ++state_[2];
-
-        thread_start_row_ += ThreadMap::Count::kGroup * 
-          ThreadMap::Shape::kGroup * ThreadMap::Count::kRow * ThreadMap::Shape::kRow;
-
-        if (state_[2] == ThreadMap::Count::kCluster) {
-          state_[2] = 0;
-        }
-      }
-    }
-
-    return *this;
-  }
-
-  ///< Efficiently disables all accesses guarded by mask
-  CUTLASS_DEVICE void clear_mask() {
-    mask_.clear();
-  }
-
-  ///< Efficiently enables all accesses guarded by mask
-  CUTLASS_DEVICE void enable_mask() {
-    mask_.enable();
-  }
-
-  ///< Sets the mask
-  CUTLASS_DEVICE void get_mask(Mask &mask) {
-    mask = mask_;
-  }
-
-  ///< Sets the mask
-  CUTLASS_DEVICE void set_mask(Mask const &mask) {
-    mask_ = mask;
-  }
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_affine_layout_params.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_affine_layout_params.h
deleted file mode 100755
index 7f7f17b5a..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_affine_layout_params.h
+++ /dev/null
@@ -1,156 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief 
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/fast_math.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  int Rank
->
-struct PredicatedTileIteratorAffineLayoutRankNParams {
-  using Layout = layout::AffineRankN<Rank>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  static bool const kBigEndian = false;
-  
-  //
-  // Data members
-  //
-
-  Layout layout;
-
-  /// Stride in units of bytes along M modes
-  Coord<Layout::kRank/2, typename Layout::LongIndex> stride_m;
-
-  /// Stride in units of bytes along N modes
-  Coord<Layout::kRank/2, typename Layout::LongIndex> stride_n;
-
-  /// Fast divmod objects divided by tensor extents
-  FastDivmod divmod_m[(Layout::kRank == 2) ? 1 : (Layout::kRank/2 - 1)];
-
-  /// Fast divmod objects divided by tensor extents
-  FastDivmod divmod_n[(Layout::kRank == 2) ? 1 : (Layout::kRank/2 - 1)];
-
-  int64_t rank2_inc_col;
-  int64_t rank2_inc_row;
-
-  //
-  // Methods
-  //
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorAffineLayoutRankNParams() { }
-
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorAffineLayoutRankNParams(TensorCoord const &extent, 
-                                                Layout const &layout_,
-                                                int64_t element_sizeof_bits)
-  : layout(layout_) 
-  {
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < Layout::kRank / 2; ++i) {
-      stride_m[i] = OffsetBytes(layout_.stride()[i], element_sizeof_bits);
-      stride_n[i] = OffsetBytes(layout_.stride()[i + Layout::kRank / 2], element_sizeof_bits);
-    }
-
-    if (kBigEndian) {
-      // "Big Endian" scheme
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < Layout::kRank / 2 - 1; ++i) {
-        divmod_m[i] = FastDivmod(extent[i + 1]);
-        divmod_n[i] = FastDivmod(extent[i + Layout::kRank / 2 + 1]);
-      }
-    }
-    else {
-      // "Little Endian" scheme
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < Layout::kRank / 2 - 1; ++i) {
-        divmod_m[i] = FastDivmod(extent[i]);
-        divmod_n[i] = FastDivmod(extent[i + Layout::kRank / 2]);
-      }
-    }
-
-    #if 0
-    //
-    // Debug print statements to verify extents and strides are passed correctly.
-    //
-    printf("PredicatedTileIteratorAffine::Params() entered\n");
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < Layout::kRank; ++i) {
-      printf("  extent[%d]: %d\n", i, extent[i]);
-    }
-    for (int i = 0; i < Layout::kRank; ++i) {
-      printf("  stride[%d]: %ld\n", i, layout_.stride()[i]);
-    }
-    printf("PredicatedTileIteratorAffine::Params() returning\n");
-    #endif
-  }
-
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorAffineLayoutRankNParams(Layout const &layout_,
-                                                int32_t threadmap_delta_kColumn,
-                                                int32_t threadmap_delta_kRow,
-                                                int64_t element_sizeof_bits)
-  : layout(layout_) 
-  {
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < Layout::kRank / 2; ++i) {
-      stride_m[i] = OffsetBytes(layout_.stride()[i], element_sizeof_bits);
-      stride_n[i] = OffsetBytes(layout_.stride()[i + Layout::kRank / 2], element_sizeof_bits);
-    }
-
-    rank2_inc_col = threadmap_delta_kColumn * stride_n[0];
-    rank2_inc_row = threadmap_delta_kRow * stride_m[0];
-  }
-};
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_blas3.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_blas3.h
deleted file mode 100755
index c2583674c..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_blas3.h
+++ /dev/null
@@ -1,633 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
-
-  The epilogue rearranges the result of a matrix product through shared memory to match canonical
-  tensor layouts in global memory. Epilogues support conversion and reduction operations.
-
-  
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/transform/pitch_linear_thread_map.h"
-#include "cutlass/epilogue/threadblock/output_tile_thread_map.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/arch/memory.h"
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator_params.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace epilogue {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile iterator used to load and store output tile from global memory in epilogue.
-///
-/// Satisfies: ReadableTileIterator | PredicatedTileIterator | ForwardTileIterator
-///
-template <
-  typename ThreadMap_,                     ///< Thread map (conept: OutputTileThreadMap)
-  typename Element_,                        ///< Element data type
-  BlasMode BlasMode_ = BlasMode::kGemm   ///< Tile Iterator for a Symmetric or Hermitian Kernel
->
-class PredicatedTileIteratorBlas3 {
-public:
-  using ThreadMap = ThreadMap_;
-  using Shape = typename ThreadMap::Shape;
-
-  using Element = Element_;
-
-  using Layout = layout::RowMajor;
-  using TensorRef = TensorRef<Element, Layout>;
-  using ConstTensorRef = typename TensorRef::ConstTensorRef;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  using TensorCoord = MatrixCoord;
-
-  static BlasMode const kBlasMode = BlasMode_;
-
-  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
-  static int const kThreads = ThreadMap::kThreads;
-  static int const kIterations = ThreadMap::Count::kTile;
-
-  static_assert( ThreadMap::Iterations::kRow > 0,"ThreadMap::Iterations::kRow must be > 0");
-  static_assert( ThreadMap::Iterations::kGroup > 0,"ThreadMap::Iterations::kGroup must be > 0");
-  static_assert( ThreadMap::Iterations::kCluster > 0,"ThreadMap::Iterations::kCluster must be > 0");
-  static_assert( ThreadMap::Iterations::kColumn > 0,"ThreadMap::Iterations::kColumn must be > 0");
-
-  /// Fragment object
-  using Fragment = Array<
-    Element, 
-    ThreadMap::Iterations::kColumn * 
-    ThreadMap::Iterations::kRow * 
-    ThreadMap::Iterations::kGroup * 
-    ThreadMap::Iterations::kCluster * ThreadMap::kElementsPerAccess>;
-
-  /// Memory access size
-  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
-  static_assert( AccessType::kElements == 1, "BLAS3 Epilogue must use AccessType::kElements as 1");
-
-  //
-  // Parameters struct
-  //
-
-  /// Uses a non-template class
-  struct Params : PredicatedTileIteratorParams {
-
-    CUTLASS_HOST_DEVICE
-    Params() { }
-
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout): 
-      PredicatedTileIteratorParams(
-        layout.stride(0) * int(sizeof(AccessType)) / kElementsPerAccess,
-        make_OutputTileThreadMapDesc<ThreadMap>()
-      ) 
-    {
-        
-    }
-  };
-
-  /// Mask object
-  struct Mask {
-
-    static int const kCount = ThreadMap::Iterations::kColumn;
-
-    /// Predicate state
-    bool predicates[kCount];
-
-    //
-    // Mask
-    //
-    CUTLASS_HOST_DEVICE
-    Mask() {
-      enable();
-    }
-
-    ///< Efficiently disables all accesses guarded by mask
-    CUTLASS_HOST_DEVICE void clear() {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kCount; ++i) {
-        predicates[i] = false;
-      }
-    }
-
-    ///< CUTLASS_HOST_DEVICE enables all accesses guarded by mask
-    CUTLASS_DEVICE void enable() {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kCount; ++i) {
-        predicates[i] = true;
-      }
-    }
-  };
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Parameters structure containing reference and precomputed state.
-  PredicatedTileIteratorParams params_;
-
-  /// Byte-level pointer
-  uint8_t *byte_pointer_;
-
-  /// Fill Mode for a tile on diagonal of a symmetric kernel
-  cutlass::FillMode fill_mode;
-
-  /// Array of boolean values to contain steady-state predicates
-  Mask mask_;
-
-  /// Extent of the matrix tile in rows
-  Index extent_row_;
-
-  /// A thread's starting row position (assuming steady-state predicates have been computed)
-  Index thread_start_row_;
-
-  /// Internal state counter
-  int state_[3];
-
-  /// Starting address of the matrix  
-  size_t matrix_start_addr; 
- 
-  static_assert((kBlasMode == BlasMode::kSymmetric || kBlasMode == BlasMode::kHermitian), 
-    "Unsupported blas3 mode.");
-
-private:
-
-  //
-  // Methods
-  //
-
-public:
-
-  //
-  // Methods
-  //
-
-  /// Constructor
-  CUTLASS_DEVICE
-  PredicatedTileIteratorBlas3(
-    PredicatedTileIteratorParams const & params,
-    Element *pointer,
-    TensorCoord extent,
-    int thread_idx,
-    TensorCoord threadblock_offset
-    , cutlass::FillMode fill_mode
-  ): 
-    params_(params), fill_mode(fill_mode)
-  {
-
-    TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx) + threadblock_offset;
-
-    extent_row_ = extent.row();
-    thread_start_row_ = thread_offset.row();
-
-    // Initialize predicates
-    CUTLASS_PRAGMA_UNROLL
-    for (int c = 0; c < ThreadMap::Iterations::kColumn; ++c) {
-
-      mask_.predicates[c] = ((thread_offset.column() 
-        + ThreadMap::Delta::kColumn * c) < extent.column());
-    }
-
-    // Check Symmetric kernel modes (Lower and Upper - for diagonal CTAs, None for rest CTAs)
-    if ((kBlasMode == BlasMode::kSymmetric || kBlasMode == BlasMode::kHermitian) && 
-        fill_mode == cutlass::FillMode::kInvalid) {
-      arch::device_breakpoint();
-    }
-
-    // Starting address of the matrix
-    matrix_start_addr =  reinterpret_cast<size_t>(pointer); 
-
-    // Initialize pointer
-    byte_pointer_ = reinterpret_cast<uint8_t *>(pointer) + 
-      LongIndex(thread_offset.row()) * LongIndex(params_.stride) + 
-      LongIndex(thread_offset.column()) * sizeof(AccessType) / kElementsPerAccess;
-
-    // Initialize internal state counter
-    state_[0] = state_[1] = state_[2] = 0;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    byte_pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_byte_offset(Fragment &frag, int64_t byte_offset) {
-
-    uint8_t *byte_pointer = byte_pointer_;
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
-
-          int frag_row_idx = 
-            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
-
-          int row_offset = row * ThreadMap::Delta::kRow 
-            + group * ThreadMap::Delta::kGroup 
-            + cluster * ThreadMap::Delta::kCluster;
-
-          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
-
-          AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset);
-
-          CUTLASS_PRAGMA_UNROLL
-          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
-
-            bool guard = row_guard && mask_.predicates[column];
-
-            cutlass::arch::global_load<
-              AccessType, 
-              sizeof(AccessType)
-            >(
-                frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn +
-                         column],
-                (void *)&memory_pointer[column * ThreadMap::Delta::kColumn /
-                                        kElementsPerAccess],
-                guard);
-          }
-
-          if (row + 1 < ThreadMap::Iterations::kRow) {
-            byte_pointer += params_.increment_row;
-          }
-        }
-
-        if (group + 1 < ThreadMap::Iterations::kGroup) {
-          byte_pointer += params_.increment_group;
-        }
-      }
-
-      if (cluster + 1 < ThreadMap::Iterations::kCluster) {
-        byte_pointer += params_.increment_cluster;
-      }
-    }
-  }
-
-  /// Loads a fragment on the diagonal of a symmetric kernel to memory 
-  CUTLASS_DEVICE
-  void load_symmetric_with_byte_offset(Fragment &frag, int64_t byte_offset) {
-
-    uint8_t *byte_pointer = byte_pointer_;
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    bool isLowerMode = (fill_mode == cutlass::FillMode::kLower) ? true : false;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
-
-          int frag_row_idx = 
-            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
-
-          int row_offset = row * ThreadMap::Delta::kRow 
-            + group * ThreadMap::Delta::kGroup 
-            + cluster * ThreadMap::Delta::kCluster;
-
-          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
-
-          AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset);
-
-          // Offset of row from beginning of the matrix per thread
-          size_t row_start_offset = (size_t)memory_pointer - matrix_start_addr;
-
-          // Absolute row index
-          int row_index = int(row_start_offset/params_.stride);
-
-          CUTLASS_PRAGMA_UNROLL
-          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
-
-            bool guard = row_guard && mask_.predicates[column];
-
-            // Offset of column from beginning of row per thread     
-            size_t col_start_offset = row_start_offset + 
-                        (column * ThreadMap::Delta::kColumn / kElementsPerAccess) * sizeof(AccessType);
-
-            // Absolute column index
-            size_t col_index = (col_start_offset%params_.stride)/sizeof(AccessType);
-            guard = guard && ( (isLowerMode && row_index >= col_index) ||
-                               (!isLowerMode && row_index <= col_index) );
-
-            cutlass::arch::global_load<
-              AccessType, 
-              sizeof(AccessType)
-            >(
-                frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn +
-                         column],
-                (void *)&memory_pointer[column * ThreadMap::Delta::kColumn /
-                                        kElementsPerAccess],
-                guard);
-
-            // The imaginary parts of the diagonal elements of a complex element are assumed and set to zero
-            if (guard && kBlasMode == BlasMode::kHermitian && cutlass::is_complex<Element>::value) {
-              Element *scalar_ptr = reinterpret_cast<Element *>(frag_ptr);
-
-              if (row_index == col_index) {
-                scalar_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column] = 
-                  real(scalar_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column]);
-              }
-            }
-          }
-
-          if (row + 1 < ThreadMap::Iterations::kRow) {
-            byte_pointer += params_.increment_row;
-          }
-        }
-
-        if (group + 1 < ThreadMap::Iterations::kGroup) {
-          byte_pointer += params_.increment_group;
-        }
-      }
-
-      if (cluster + 1 < ThreadMap::Iterations::kCluster) {
-        byte_pointer += params_.increment_cluster;
-      }
-    }
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) {
-    
-    if (fill_mode == cutlass::FillMode::kNone) {
-      load_with_byte_offset(frag, 0);
-    }
-    else {
-      load_symmetric_with_byte_offset(frag, 0);
-    }
-  }
-
-  /// Stores a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_byte_offset(Fragment const &frag, int64_t byte_offset) {
-    uint8_t *byte_pointer = byte_pointer_;
-    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
-
-          int frag_row_idx = 
-            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
-
-          int row_offset = row * ThreadMap::Delta::kRow 
-            + group * ThreadMap::Delta::kGroup 
-            + cluster * ThreadMap::Delta::kCluster;
-
-          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
-
-          AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset);
-
-          CUTLASS_PRAGMA_UNROLL
-          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
-
-            bool guard = row_guard && mask_.predicates[column];
-
-            cutlass::arch::global_store<AccessType, sizeof(AccessType)>(
-                frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column],
-                (void *)&memory_pointer[column * ThreadMap::Delta::kColumn / kElementsPerAccess],
-                guard);
-          }
-
-          if (row + 1 < ThreadMap::Iterations::kRow) {
-            byte_pointer += params_.increment_row;
-          }
-        }
-
-        if (group + 1 < ThreadMap::Iterations::kGroup) {
-          byte_pointer += params_.increment_group;
-        }
-      }
-
-      if (cluster + 1 < ThreadMap::Iterations::kCluster) {
-        byte_pointer += params_.increment_cluster;
-      }
-    }
-  }
-
-  /// Stores a fragment on the diagonal of a symmetric kernel to memory 
-  CUTLASS_DEVICE
-  void store_symmetric_with_byte_offset(Fragment const &frag, int64_t byte_offset) {
-    uint8_t *byte_pointer = byte_pointer_;
-    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
-
-    bool isLowerMode = (fill_mode == cutlass::FillMode::kLower) ? true : false;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
-
-          int frag_row_idx = 
-            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
-
-          int row_offset = row * ThreadMap::Delta::kRow 
-            + group * ThreadMap::Delta::kGroup 
-            + cluster * ThreadMap::Delta::kCluster;
-
-          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
-
-          AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset);
-
-          // Offset of row from beginning of the matrix per thread
-          size_t row_start_offset = (size_t)memory_pointer - matrix_start_addr;
-
-          // Absolute row index
-          int row_index = int(row_start_offset/params_.stride);
-
-          CUTLASS_PRAGMA_UNROLL
-          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
-
-            bool guard = row_guard && mask_.predicates[column];
-
-            // Offset of column from beginning of row per thread     
-            size_t col_start_offset = row_start_offset + 
-                        (column * ThreadMap::Delta::kColumn / kElementsPerAccess) * sizeof(AccessType);
-
-            // Absolute column index
-            size_t col_index = (col_start_offset%params_.stride)/sizeof(AccessType);
-
-            guard = guard && ( (isLowerMode && row_index >= col_index) ||
-                               (!isLowerMode && row_index <= col_index) );
-
-            // The imaginary parts of the diagonal elements of a complex element are assumed and set to zero
-            if (guard && kBlasMode == BlasMode::kHermitian && cutlass::is_complex<Element>::value) {
-
-              AccessType *frag_ptr_modify = const_cast<AccessType *>(frag_ptr);
-              Element *scalar_ptr = reinterpret_cast<Element *>(frag_ptr_modify);
-
-              if (row_index == col_index) {
-                scalar_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column] = 
-                  real(scalar_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column]);
-              }
-            }
-
-            cutlass::arch::global_store<AccessType, sizeof(AccessType)>(
-                frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn +
-                         column],
-                (void *)&memory_pointer[column * ThreadMap::Delta::kColumn /
-                                        kElementsPerAccess],
-                guard);
-          }
-
-          if (row + 1 < ThreadMap::Iterations::kRow) {
-            byte_pointer += params_.increment_row;
-          }
-        }
-
-        if (group + 1 < ThreadMap::Iterations::kGroup) {
-          byte_pointer += params_.increment_group;
-        }
-      }
-
-      if (cluster + 1 < ThreadMap::Iterations::kCluster) {
-        byte_pointer += params_.increment_cluster;
-      }
-    }
-  }
-
-  /// Stores a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) {
-    
-    if (fill_mode == cutlass::FillMode::kNone) {
-      store_with_byte_offset(frag, 0);
-    }
-    else {
-      store_symmetric_with_byte_offset(frag, 0); 
-    }
-
-  }
-
-  /// Advances to the next position to load or store
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorBlas3 &operator++() {
-
-    ++state_[0];
-    byte_pointer_ += params_.advance_row;
-    thread_start_row_ += ThreadMap::Shape::kRow;
-    
-    if (state_[0] == ThreadMap::Count::kRow) {
-
-      state_[0] = 0;
-      ++state_[1];
-      byte_pointer_ += params_.advance_group;
-
-      thread_start_row_ += (ThreadMap::Shape::kGroup - 1) * 
-        ThreadMap::Shape::kRow * ThreadMap::Count::kRow;
-
-      if (state_[1] == ThreadMap::Count::kGroup) {
-
-        state_[1] = 0;
-        ++state_[2];
-        byte_pointer_ += params_.advance_cluster;
-
-        thread_start_row_ += ThreadMap::Count::kGroup * 
-          ThreadMap::Shape::kGroup * ThreadMap::Count::kRow * ThreadMap::Shape::kRow;
-
-        if (state_[2] == ThreadMap::Count::kCluster) {
-          state_[2] = 0;
-          byte_pointer_ += params_.advance_tile;
-        }
-      }
-    }
-
-    return *this;
-  }
-
-  ///< Efficiently disables all accesses guarded by mask
-  CUTLASS_DEVICE void clear_mask() {
-    mask_.clear();
-  }
-
-  ///< Efficiently enables all accesses guarded by mask
-  CUTLASS_DEVICE void enable_mask() {
-    mask_.enable();
-  }
-
-  ///< Sets the mask
-  CUTLASS_DEVICE void get_mask(Mask &mask) {
-    mask = mask_;
-  }
-
-  ///< Sets the mask
-  CUTLASS_DEVICE void set_mask(Mask const &mask) {
-    mask_ = mask;
-  }
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_conv.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_conv.h
deleted file mode 100755
index a59437c09..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_conv.h
+++ /dev/null
@@ -1,562 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
-
-  The epilogue rearranges the result of a matrix product through shared memory to match canonical
-  tensor layouts in global memory. Epilogues support conversion and reduction operations.
-
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/permute.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/transform/pitch_linear_thread_map.h"
-#include "cutlass/epilogue/threadblock/output_tile_thread_map.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/arch/memory.h"
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator_params.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-#include "cutlass/conv/conv3d_problem_size.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace epilogue {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile iterator used to load and store output tile from global memory in epilogue.
-///
-/// Satisfies: ReadableTileIterator | PredicatedTileIteratorConv | ForwardTileIterator
-///
-template <
-  typename ThreadMap_,       ///< Thread map (conept: OutputTileThreadMap)
-  typename Element_,         ///< Element data type
-  bool ScatterD = false,     ///< Scatter D operand or not
-  typename PermuteDLayout = layout::NoPermute, ///< Permute D operand or not
-  bool UseCUDAStore = false,
-  int Rank = 4
->
-class PredicatedTileIteratorConv {
-public:
-  using ThreadMap = ThreadMap_;
-  using Shape = typename ThreadMap::Shape;
-
-  using Element = Element_;
-
-  static int const kRank = Rank;
-  using Layout = typename platform::conditional<kRank == 4,
-                                       layout::TensorNHWC,
-                                       layout::TensorNDHWC>::type;
-
-  using Stride = typename Layout::Stride;
-  static int const kStrideRank = Layout::kStrideRank;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using ConstTensorRef = typename TensorRef::ConstTensorRef;
-
-  using MappedLayout = layout::RowMajor;
-  using Index = typename MappedLayout::Index;
-  using LongIndex = typename MappedLayout::LongIndex;
-  using TensorCoord = typename MappedLayout::TensorCoord;
-
-  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
-  static int const kThreads = ThreadMap::kThreads;
-  static int const kIterations = ThreadMap::Count::kTile;
-
-  static bool constexpr PermuteD = !layout::is_trivial_permute<PermuteDLayout>;
-
-  static_assert( ThreadMap::Iterations::kRow > 0,"ThreadMap::Iterations::kRow must be > 0");
-  static_assert( ThreadMap::Iterations::kGroup > 0,"ThreadMap::Iterations::kGroup must be > 0");
-  static_assert( ThreadMap::Iterations::kCluster > 0,"ThreadMap::Iterations::kCluster must be > 0");
-  static_assert( ThreadMap::Iterations::kColumn > 0,"ThreadMap::Iterations::kColumn must be > 0");
-
-  /// Fragment object
-  using Fragment = Array<
-    Element,
-    ThreadMap::Iterations::kColumn *
-    ThreadMap::Iterations::kRow *
-    ThreadMap::Iterations::kGroup *
-    ThreadMap::Iterations::kCluster * ThreadMap::kElementsPerAccess>;
-
-  /// Memory access size
-  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
-
-  //
-  // Parameters struct
-  //
-
-  /// Uses a non-template class
-  struct Params : PredicatedTileIteratorParams {
-    using Base = PredicatedTileIteratorParams;
-
-    /// Fast divmod objects divided by tensor extents
-    FastDivmod divmod[kStrideRank - 1];
-    Stride tensor_stride;
-
-    CUTLASS_HOST_DEVICE
-    Params() { }
-
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout, cutlass::Tensor4DCoord const &tensor_extent):
-      PredicatedTileIteratorParams(
-        layout.stride()[0] * int(sizeof(AccessType)) / kElementsPerAccess,
-        make_OutputTileThreadMapDesc<ThreadMap>()
-      ) {
-      divmod[0] = FastDivmod(tensor_extent[2] /* Q for Fprop & W for Deconv*/);
-      divmod[1] = FastDivmod(tensor_extent[1] /* P for Fprop & H for Deconv*/);
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kStrideRank; ++i) {
-        tensor_stride[i] = layout.stride()[i];
-      }
-    }
-
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout, cutlass::Tensor5DCoord const &tensor_extent):
-      PredicatedTileIteratorParams(
-        layout.stride()[0] * int(sizeof(AccessType)) / kElementsPerAccess,
-        make_OutputTileThreadMapDesc<ThreadMap>()
-      ) {
-      divmod[0] = FastDivmod(tensor_extent[3] /* Q for Fprop & W for Deconv*/);
-      divmod[1] = FastDivmod(tensor_extent[2] /* P for Fprop & H for Deconv*/);
-      divmod[2] = FastDivmod(tensor_extent[1] /* Z for Fprop & D for Deconv*/);
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kStrideRank; ++i) {
-        tensor_stride[i] = layout.stride()[i];
-      }
-    }
-
-    CUTLASS_HOST_DEVICE
-    Params(Base const &base) :
-      Base(base) { }
-  };
-
-  /// Mask object
-  struct Mask {
-
-    static int const kCount = ThreadMap::Iterations::kColumn;
-
-    /// Predicate state
-    bool predicates[kCount];
-
-    //
-    // Mask
-    //
-    CUTLASS_HOST_DEVICE
-    Mask() {
-      enable();
-    }
-
-    ///< Efficiently disables all accesses guarded by mask
-    CUTLASS_HOST_DEVICE void clear() {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kCount; ++i) {
-        predicates[i] = false;
-      }
-    }
-
-    ///< CUTLASS_HOST_DEVICE enables all accesses guarded by mask
-    CUTLASS_DEVICE void enable() {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kCount; ++i) {
-        predicates[i] = true;
-      }
-    }
-  };
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Parameters structure containing reference and precomputed state.
-  Params params_;
-
-  /// Byte-level pointer. This pointer is usually for both load() and store(), unless PermuteD is performed. When having PermuteD, byte_pointer_ is only for load().
-  uint8_t *byte_pointer_;
-
-  /// Array of boolean values to contain steady-state predicates
-  Mask mask_;
-
-  /// Extent of the matrix tile in rows
-  Index extent_row_;
-
-  /// Extent of the matrix tile in rows
-  Index extent_column_;
-
-  /// A thread's starting row position (assuming steady-state predicates have been computed)
-  Index thread_start_row_;
-
-  /// A thread's starting column
-  Index thread_start_column_;
-
-  /// Internal state counter
-  int state_[3];
-
-  //
-  // Static asserts about internal strides
-  //
-
-  static_assert(sizeof(extent_row_) == 4, "Expected 32b extents");
-  static_assert(sizeof(thread_start_row_) == 4, "Expected 32b extents");
-  static_assert(sizeof(PredicatedTileIteratorParams::stride) == 8, "Expected 64b strides");
-
-private:
-
-  //
-  // Methods
-  //
-
-public:
-
-  //
-  // Methods
-  //
-
-  /// Constructor
-  CUTLASS_DEVICE
-  PredicatedTileIteratorConv(
-    Params const & params,
-    Element *pointer,
-    TensorCoord extent,
-    int thread_idx,
-    TensorCoord threadblock_offset = TensorCoord()
-  ):
-    params_(params)
-  {
-
-    TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx) + threadblock_offset;
-
-    extent_row_ = extent.row();
-    extent_column_ = extent.column();
-
-    thread_start_row_ = thread_offset.row();
-    thread_start_column_ = thread_offset.column();
-
-    // Initialize predicates
-    CUTLASS_PRAGMA_UNROLL
-    for (int c = 0; c < ThreadMap::Iterations::kColumn; ++c) {
-
-      mask_.predicates[c] = ((thread_offset.column()
-        + ThreadMap::Delta::kColumn * c) < extent.column());
-    }
-
-    // Null pointer performs no accesses
-    if (!pointer) {
-      mask_.clear();
-    }
-
-    // Initialize byte_pointer_
-    byte_pointer_ = reinterpret_cast<uint8_t *>(pointer) +
-      LongIndex(thread_offset.column()) * sizeof(AccessType) / kElementsPerAccess;
-
-    // Initialize internal state counter
-    state_[0] = state_[1] = state_[2] = 0;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    byte_pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_byte_offset(Fragment &frag, int64_t byte_offset) const {
-
-    uint8_t *byte_pointer = byte_pointer_;
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
-
-          int frag_row_idx =
-            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
-
-          int row_offset = row * ThreadMap::Delta::kRow
-            + group * ThreadMap::Delta::kGroup
-            + cluster * ThreadMap::Delta::kCluster;
-
-          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
-
-          AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset);
-
-          Stride tensor_coord = CoordinateDecompositionLittleEndian<kStrideRank>(row_offset + thread_start_row_, params_.divmod);
-
-          LongIndex tensor_offset = dot(tensor_coord, params_.tensor_stride);
-
-          CUTLASS_PRAGMA_UNROLL
-          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
-
-            bool guard = row_guard && mask_.predicates[column];
-
-            cutlass::arch::global_load<
-              AccessType,
-              sizeof(AccessType)
-            >(
-                frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn +
-                         column],
-                (void *)&memory_pointer[column * ThreadMap::Delta::kColumn /
-                                        kElementsPerAccess + tensor_offset / kElementsPerAccess],
-                guard);
-          }
-        }
-      }
-    }
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) const {
-
-    load_with_byte_offset(frag, 0);
-  }
-
-  /// Stores a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_byte_offset(Fragment const &frag, int64_t byte_offset) const {
-    uint8_t *byte_pointer = byte_pointer_;
-    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
-
-          int frag_row_idx =
-            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
-
-          int row_offset = row * ThreadMap::Delta::kRow
-            + group * ThreadMap::Delta::kGroup
-            + cluster * ThreadMap::Delta::kCluster;
-
-          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
-
-          Stride tensor_coord = CoordinateDecompositionLittleEndian<kStrideRank>((row_offset + thread_start_row_), params_.divmod);
-
-          LongIndex tensor_offset = dot(tensor_coord, params_.tensor_stride);
-
-          AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset);
-
-          CUTLASS_PRAGMA_UNROLL
-          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
-
-            bool guard = row_guard && mask_.predicates[column];
-
-            if (UseCUDAStore) {
-              if (guard) {
-                memory_pointer[tensor_offset / kElementsPerAccess] =
-                    frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column];
-              }
-            } else {
-              cutlass::arch::global_store<AccessType, sizeof(AccessType)>(
-                  frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column],
-                  (void *)&memory_pointer[tensor_offset / kElementsPerAccess],
-                  guard);
-            }
-
-            memory_pointer += (ThreadMap::Delta::kColumn / kElementsPerAccess);
-          }
-        }
-      }
-    }
-  }
-
-  /// Stores a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) const {
-
-    store_with_byte_offset(frag, 0);
-  }
-
-  CUTLASS_DEVICE
-  MatrixCoord thread_start() const {
-    return MatrixCoord(thread_start_row_, thread_start_column_);
-  }
-
-  /// Need to get the thread start row from the tile iterator
-  CUTLASS_DEVICE
-  int32_t thread_start_row() const {
-    return thread_start_row_;
-  }
-
-  /// Need to get the thread start row from the tile iterator
-  CUTLASS_DEVICE
-  int32_t thread_start_column() const {
-    return thread_start_column_;
-  }
-
-  /// Extent of the matrix in rows
-  CUTLASS_DEVICE
-  Index extent_row() const {
-    return extent_row_;
-  }
-
-  /// Extent of the matrix in columns
-  CUTLASS_DEVICE
-  Index extent_column() const {
-    return extent_column_;
-  }
-
-  /// Advances to the next position to load or store
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorConv &operator++() {
-
-    ++state_[0];
-
-    thread_start_row_ += ThreadMap::Shape::kRow;
-
-    if (state_[0] == ThreadMap::Count::kRow) {
-
-      state_[0] = 0;
-      ++state_[1];
-
-      thread_start_row_ += (ThreadMap::Shape::kGroup - 1) *
-        ThreadMap::Shape::kRow * ThreadMap::Count::kRow;
-
-      if (state_[1] == ThreadMap::Count::kGroup) {
-
-        state_[1] = 0;
-        ++state_[2];
-
-        thread_start_row_ += ThreadMap::Count::kGroup *
-          ThreadMap::Shape::kGroup * ThreadMap::Count::kRow * ThreadMap::Shape::kRow;
-
-        if (state_[2] == ThreadMap::Count::kCluster) {
-          state_[2] = 0;
-
-          thread_start_row_ += ThreadMap::Shape::kGroup * ThreadMap::Shape::kRow
-            * ThreadMap::Shape::kCluster * ThreadMap::Shape::kTile;
-        }
-      }
-    }
-
-    return *this;
-  }
-
-  /// Advances a number of positions to load or store
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorConv &operator+=(int increment)
-  {
-    // Row
-    state_[0] += increment;
-    int increment_row = state_[0] / ThreadMap::Count::kRow;
-    state_[0] = state_[0] % ThreadMap::Count::kRow;
-
-    thread_start_row_ += (ThreadMap::Shape::kRow * increment);
-
-    // Group
-    state_[1] += increment_row;
-    int increment_group = state_[1] / ThreadMap::Count::kGroup;
-    state_[1] = state_[1] % ThreadMap::Count::kGroup;
-
-    thread_start_row_ +=
-        (ThreadMap::Shape::kGroup - 1) *
-        ThreadMap::Shape::kRow *
-        ThreadMap::Count::kRow *
-        increment_row;
-
-    // Cluster
-    state_[2] += increment_group;
-    int increment_cluster = state_[2] / ThreadMap::Count::kCluster;
-    state_[2] = state_[2] % ThreadMap::Count::kCluster;
-
-    thread_start_row_ +=
-        ThreadMap::Count::kGroup *
-        ThreadMap::Shape::kGroup *
-        ThreadMap::Count::kRow *
-        ThreadMap::Shape::kRow *
-        increment_group;
-
-    // Tile
-    thread_start_row_ +=
-        ThreadMap::Shape::kGroup *
-        ThreadMap::Shape::kRow *
-        ThreadMap::Shape::kCluster *
-        ThreadMap::Shape::kTile *
-        increment_cluster;
-
-    return *this;
-  }
-
-  ///< Efficiently disables all accesses guarded by mask
-  CUTLASS_DEVICE void clear_mask() {
-    mask_.clear();
-  }
-
-  ///< Efficiently enables all accesses guarded by mask
-  CUTLASS_DEVICE void enable_mask() {
-    mask_.enable();
-  }
-
-  ///< Sets the mask
-  CUTLASS_DEVICE void get_mask(Mask &mask) const {
-    mask = mask_;
-  }
-
-  ///< Sets the mask
-  CUTLASS_DEVICE void set_mask(Mask const &mask) {
-    mask_ = mask;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_direct_conv.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_direct_conv.h
deleted file mode 100755
index 8d7bf7edb..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_direct_conv.h
+++ /dev/null
@@ -1,445 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
-
-  The epilogue rearranges the result of a matrix product through shared memory to match canonical
-  tensor layouts in global memory. Epilogues support conversion and reduction operations.
-
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/permute.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/transform/pitch_linear_thread_map.h"
-#include "cutlass/epilogue/threadblock/output_tile_thread_map.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/arch/memory.h"
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator_params.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace epilogue {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile iterator used to load and store output tile from global memory in epilogue.
-///
-/// Satisfies: ReadableTileIterator | PredicatedTileIterator | ForwardTileIterator
-///
-template <
-  typename ThreadMap_,       ///< Thread map (conept: PitchLinearThreadMap)
-  typename Element_,         ///< Element data type
-  typename ThreadOutputShape_ = cutlass::conv::TensorNHWCShape<1, 1, 1, 1>,
-  typename ThreadBlockOutputShape_ = cutlass::conv::TensorNHWCShape<1, 1, 1, 1>
->
-class PredicatedTileIteratorDirectConv {
-public:
-  using ThreadMap = ThreadMap_;
-  using Shape = typename ThreadMap::Shape;
-  using ThreadOutputShape = ThreadOutputShape_;
-  using ThreadBlockOutputShape = ThreadBlockOutputShape_;
-
-  using Element = Element_;
-
-  using Layout = layout::RowMajor;
-  using TensorRef = TensorRef<Element, Layout>;
-  using ConstTensorRef = typename TensorRef::ConstTensorRef;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  using TensorCoord = MatrixCoord;
-
-  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
-  static int const kThreads = ThreadMap::kThreads;
-
-  using ConvProblemSize = typename cutlass::conv::Conv2dProblemSize;
-
-  /// Fragment object
-  using Fragment = Array<Element, ThreadMap::Iterations::kCount * kElementsPerAccess>;
-
-  /// Memory access size
-  using AccessType = AlignedArray<Element, kElementsPerAccess>;
-
-  static int const kLoadsPerAccess = AccessType::kElements / AccessType::kElements;
-
-  using ThreadTileCount = MatrixShape<
-    ThreadBlockOutputShape::kH / ThreadOutputShape::kH,
-    ThreadBlockOutputShape::kW / ThreadOutputShape::kW
-  >;
-
-  //
-  // Parameters struct
-  //
-
-  /// Uses a non-template class
-  struct Params : PredicatedTileIteratorDirect2dConvParams {
-    using Base = PredicatedTileIteratorDirect2dConvParams;
-
-    CUTLASS_HOST_DEVICE
-    Params() { }
-
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout, cutlass::conv::Conv2dProblemSize const &problem_size): 
-      PredicatedTileIteratorDirect2dConvParams(
-        layout.stride(0) * int(sizeof(AccessType)) / kElementsPerAccess,
-        problem_size,
-        {ThreadBlockOutputShape::kH, ThreadBlockOutputShape::kW}
-      ) 
-    { }
-
-    CUTLASS_HOST_DEVICE
-    Params(Base const &base) : 
-      Base(base) { }
-  };
-
-  /// Mask object
-  struct Mask {
-
-    static int const kCount = ThreadMap::Iterations::kContiguous;
-
-    /// Predicate state
-    bool predicates[kCount];
-
-    //
-    // Mask
-    //
-    CUTLASS_HOST_DEVICE
-    Mask() {
-      enable();
-    }
-
-    ///< Efficiently disables all accesses guarded by mask
-    CUTLASS_HOST_DEVICE void clear() {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kCount; ++i) {
-        predicates[i] = false;
-      }
-    }
-
-    ///< CUTLASS_HOST_DEVICE enables all accesses guarded by mask
-    CUTLASS_DEVICE void enable() {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kCount; ++i) {
-        predicates[i] = true;
-      }
-    }
-  };
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Parameters structure containing reference and precomputed state.
-  PredicatedTileIteratorDirect2dConvParams params_;
-
-  /// Byte-level pointer
-  uint8_t *byte_pointer_;
-
-  ///     
-  Element *pointer_;
-
-
-  /// Array of boolean values to contain steady-state predicates
-  Mask mask_;
-
-  /// Extent of the matrix tile in rows
-  Index extent_row_;
-
-  /// Extent of the matrix tile in rows
-  Index extent_column_;
-
-  /// A thread's starting row position (assuming steady-state predicates have been computed)
-  Index thread_start_row_;
-
-  /// A thread's starting column
-  Index thread_start_column_;
-
-  /// Initial thread output location
-  int thread_start_n_, thread_start_p_, thread_start_q_;
-
-  /// Current threadblock tile index
-  int tile_index_;
-
-  //
-  // Static asserts about internal strides
-  //
-
-  static_assert(sizeof(extent_row_) == 4, "Expected 32b extents");
-  static_assert(sizeof(thread_start_row_) == 4, "Expected 32b extents");
-  static_assert(sizeof(PredicatedTileIteratorDirect2dConvParams::stride) == 8, "Expected 64b strides");
-
-private:
-
-  //
-  // Methods
-  //
-
-
-
-public:
-
-  //
-  // Methods
-  //
-
-  /// Constructor
-  CUTLASS_DEVICE
-  PredicatedTileIteratorDirectConv(
-    PredicatedTileIteratorDirect2dConvParams const & params,
-    Element *pointer,
-    TensorCoord extent,
-    int thread_idx,
-    TensorCoord threadblock_offset = TensorCoord()
-  ): 
-    params_(params), pointer_(pointer)
-  {
-
-    TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx);
-
-    extent_row_ = extent.row();
-    extent_column_ = extent.column();
-
-    // stride dim (PQ)
-    thread_start_row_ = thread_offset.column();
-    // contiguous dim (Channels)
-    thread_start_column_ = threadblock_offset.column() + thread_offset.row();
-
-    tile_index_ = threadblock_offset.row();
-
-    set_tile_index(0);
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void set_tile_index(const int index) { 
-   
-    int residual;
-    params_.pq_divmod(thread_start_n_, residual, tile_index_ + index);
-    params_.q_divmod(thread_start_p_, thread_start_q_, residual);
-
-    // Compute the base output coord of ThreadBlock
-    thread_start_p_ *= ThreadBlockOutputShape::kH;
-    thread_start_q_ *= ThreadBlockOutputShape::kW;
-
-    // Initialize predicates
-    CUTLASS_PRAGMA_UNROLL
-    for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-      mask_.predicates[c] = ((thread_start_column_ 
-        + c * ThreadMap::Delta::kContiguous) < extent_column_);
-    }
-
-    // Null pointer performs no accesses
-    if (!pointer_) {
-      mask_.clear();
-    }
-
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    byte_pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_byte_offset(Fragment &frag, int64_t byte_offset) const {
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-        int frag_base_idx = s * ThreadMap::Iterations::kContiguous + c;
-
-        int current_row = thread_start_row_ + s * ThreadMap::Delta::kStrided;
-        int p = current_row / ThreadBlockOutputShape::kW;
-        int q = current_row % ThreadBlockOutputShape::kW;
-
-        int current_p = thread_start_p_ + p;
-        int current_q = thread_start_q_ + q;
-
-        bool row_guard = (current_p) < params_.P && (current_q) < params_.Q &&
-                         (thread_start_n_ < params_.N) && current_row < ThreadMap::Shape::kStrided;
-
-        int output_row_offset =
-            thread_start_n_ * params_.stride_n + current_p * params_.stride_p + current_q;
-
-        uint8_t *byte_pointer =
-            reinterpret_cast<uint8_t *>(pointer_) +
-            LongIndex(output_row_offset) * LongIndex(params_.stride) +
-            LongIndex(thread_start_column_ + c * ThreadMap::Delta::kContiguous) *
-                sizeof(AccessType) / kElementsPerAccess;
-
-        AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-        AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset);
-
-        bool guard = row_guard && mask_.predicates[c];
-
-        cutlass::arch::global_load<AccessType, sizeof(AccessType)>(
-            frag_ptr[frag_base_idx], (void *)&memory_pointer[0], guard);
-      }
-    }
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) const {
-    load_with_byte_offset(frag, 0);
-  }
-
-  /// Stores a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_byte_offset(Fragment const &frag, int64_t byte_offset) const {
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-        int frag_base_idx = s * ThreadMap::Iterations::kContiguous + c;
-
-        int current_row = thread_start_row_ + s * ThreadMap::Delta::kStrided;
-        int p = current_row / ThreadBlockOutputShape::kW;
-        int q = current_row % ThreadBlockOutputShape::kW;
-
-        int current_p = thread_start_p_ + p;
-        int current_q = thread_start_q_ + q;
-
-        bool row_guard = (current_p) < params_.P && (current_q) < params_.Q &&
-                         (thread_start_n_ < params_.N) && current_row < ThreadMap::Shape::kStrided;
-
-        int output_row_offset =
-            thread_start_n_ * params_.stride_n + current_p * params_.stride_p + current_q;
-
-        uint8_t *byte_pointer =
-            reinterpret_cast<uint8_t *>(pointer_) +
-            LongIndex(output_row_offset) * LongIndex(params_.stride) +
-            LongIndex(thread_start_column_ + c * ThreadMap::Delta::kContiguous) *
-                sizeof(AccessType) / kElementsPerAccess;
-
-        AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
-
-        AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset);
-
-        bool guard = row_guard && mask_.predicates[c];
-
-        cutlass::arch::global_store<AccessType, sizeof(AccessType)>(
-            frag_ptr[frag_base_idx], (void *)&memory_pointer[0], guard);
-      }
-    }
-  }
-
-  /// Stores a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) const {
-
-    store_with_byte_offset(frag, 0);
-  }
-
-  CUTLASS_DEVICE
-  MatrixCoord thread_start() const {
-    return MatrixCoord(thread_start_row_, thread_start_column_);
-  }
-
-  /// Need to get the thread start row from the tile iterator
-  CUTLASS_DEVICE
-  int32_t thread_start_row() const {
-    return thread_start_row_;
-  }
-
-  /// Need to get the thread start row from the tile iterator
-  CUTLASS_DEVICE
-  int32_t thread_start_column() const {
-    return thread_start_column_;
-  }
-
-  /// Extent of the matrix in rows
-  CUTLASS_DEVICE
-  Index extent_row() const {
-    return extent_row_;
-  }
-
-  /// Extent of the matrix in columns
-  CUTLASS_DEVICE
-  Index extent_column() const {
-    return extent_column_;
-  }
-
-  /// Advances to the next position to load or store
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorDirectConv &operator++() {
-    // do nothing
-
-    return *this;
-  }
-
-  ///< Efficiently disables all accesses guarded by mask
-  CUTLASS_DEVICE void clear_mask() {
-    mask_.clear();
-  }
-
-  ///< Efficiently enables all accesses guarded by mask
-  CUTLASS_DEVICE void enable_mask() {
-    mask_.enable();
-  }
-
-  ///< Sets the mask
-  CUTLASS_DEVICE void get_mask(Mask &mask) const {
-    mask = mask_;
-  }
-
-  ///< Sets the mask
-  CUTLASS_DEVICE void set_mask(Mask const &mask) {
-    mask_ = mask;
-  }
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_params.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_params.h
deleted file mode 100755
index 5e9aa22bd..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_params.h
+++ /dev/null
@@ -1,483 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief 
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/matrix.h"
-
-#include "cutlass/conv/conv2d_problem_size.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-struct OutputTileShapeDesc {
-
-  int column;
-  int row;
-  int group;
-  int cluster;
-  int tile;
-
-  //
-  // Methods
-  //
-
-  /// Default ctor
-  CUTLASS_HOST_DEVICE
-  OutputTileShapeDesc(): column(0), row(0), group(0), cluster(0), tile(0) { }
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  OutputTileShapeDesc(
-    int column_,
-    int row_,
-    int group_,
-    int cluster_,
-    int tile_
-  ):
-    column(column_),
-    row(row_),
-    group(group_),
-    cluster(cluster_),
-    tile(tile_) { }
-
-  /// Total number of points in the 5D space
-  CUTLASS_HOST_DEVICE
-  int count() const {
-    return column * row * group * cluster * tile;
-  }
-
-  #if 0
-  CUTLASS_HOST_DEVICE
-  void print() const {
-    printf("{%d, %d, %d, %d, %d}", column, row, group, cluster, tile);
-  }
-  #endif
-};
-
-/// Helper template to construct an OutputTileShapeDesc from a OutputTileShape template.
-template <typename Shape>
-CUTLASS_HOST_DEVICE
-OutputTileShapeDesc make_OutputTileShapeDesc() {
-  return OutputTileShapeDesc(
-    Shape::kColumn,
-    Shape::kRow,
-    Shape::kGroup,
-    Shape::kCluster,
-    Shape::kTile
-  );
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Thread map description
-struct OutputTileThreadMapDesc {
-
-  int threads;
-  int elements_per_access;
-  OutputTileShapeDesc shape;
-  OutputTileShapeDesc iterations;
-  OutputTileShapeDesc delta;
-  OutputTileShapeDesc count;
-
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  OutputTileThreadMapDesc() { }
-
-  CUTLASS_HOST_DEVICE
-  OutputTileThreadMapDesc(
-    int threads_,
-    int elements_per_access_,
-    OutputTileShapeDesc shape_,
-    OutputTileShapeDesc iterations_,
-    OutputTileShapeDesc delta_,
-    OutputTileShapeDesc count_
-  ):
-    threads(threads_), 
-    elements_per_access(elements_per_access_),
-    shape(shape_),
-    iterations(iterations_),
-    delta(delta_),
-    count(count_) 
-  {
-    
-  }
-};
-
-/// Helper template to construct an OutputTileShapeDesc from a OutputTileThreadMap template.
-template <typename ThreadMap>
-CUTLASS_HOST_DEVICE
-OutputTileThreadMapDesc make_OutputTileThreadMapDesc() {
-  return OutputTileThreadMapDesc(
-    ThreadMap::kThreads,
-    ThreadMap::kElementsPerAccess,
-    make_OutputTileShapeDesc<typename ThreadMap::Shape>(),
-    make_OutputTileShapeDesc<typename ThreadMap::Iterations>(),
-    make_OutputTileShapeDesc<typename ThreadMap::Delta>(),
-    make_OutputTileShapeDesc<typename ThreadMap::Count>()
-  );
-}
-///////////////////////////////////////////////////////////////////////////////
-
-//
-// Parameters struct for PredicatedTileIterator
-//
-
-struct PredicatedTileIteratorParams {
-
-  using Index = int32_t;
-  using LongIndex = int64_t;
-
-  //
-  // Data members
-  //
-
-  LongIndex stride;               ///< stride in bytes between rows
-
-  LongIndex increment_row;        ///< increment quantity (in bytes) to advance when moving between rows
-  LongIndex increment_group;      ///< increment quantity (in bytes) to advance when moving to the next group
-  LongIndex increment_cluster;    ///< increment quantity (in bytes) to advance when moving to the next cluster
-
-  LongIndex advance_row;          ///< amount to add to move to the next 'row' position
-  LongIndex advance_group;        ///< amount to add to move to the next 'group' position
-  LongIndex advance_cluster;      ///< amount to add to move to the next 'cluster' position
-  LongIndex advance_tile;         ///< amount to add to move to the next 'tile'
-
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  Status initialize(LongIndex stride_, OutputTileThreadMapDesc thread_map) {
-    
-    stride = stride_;
-
-    increment_row = stride * thread_map.delta.row;
-
-    increment_group = stride * thread_map.delta.group
-      - stride * thread_map.delta.row * (thread_map.iterations.row - 1);
-
-    increment_cluster = stride * thread_map.delta.cluster
-      - stride * thread_map.delta.group * (thread_map.iterations.group - 1)
-      - stride * thread_map.delta.row * (thread_map.iterations.row - 1);
-
-    advance_row = stride * thread_map.shape.row;
-
-    advance_group = 
-      stride * 
-      (thread_map.shape.group - 1) * thread_map.shape.row * thread_map.count.row;
-    
-    advance_cluster = 
-      stride * 
-      thread_map.count.group * 
-      thread_map.shape.group * 
-      thread_map.count.row * 
-      thread_map.shape.row;
-    
-    advance_tile =
-      stride * 
-      thread_map.shape.group * 
-      thread_map.shape.row * 
-      thread_map.shape.cluster * 
-      thread_map.shape.tile;
-
-    return Status::kSuccess;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Status initialize(Index stride_, OutputTileThreadMapDesc thread_map) {
-    return initialize(LongIndex(stride_), thread_map); 
-  }
-
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorParams() {
-    initialize(LongIndex(0), OutputTileThreadMapDesc());
-  }
-
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorParams(Index stride, OutputTileThreadMapDesc thread_map) {
-    initialize(stride, thread_map);
-  }
-
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorParams(LongIndex stride, OutputTileThreadMapDesc thread_map) {
-    initialize(stride, thread_map);
-  }
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-//
-// Parameters struct for PredicatedTileIteratorDirect2dConv
-//
-
-struct PredicatedTileIteratorDirect2dConvParams{
-  using Index = int32_t;
-  using LongIndex = int64_t;
-
-  //
-  // Data members
-  //
-  FastDivmod pq_divmod;
-  FastDivmod q_divmod;
-
-  LongIndex stride;
-  LongIndex stride_n;
-  LongIndex stride_p;
-
-  int N;
-  int P;
-  int Q;
-
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  Status initialize(LongIndex stride_,
-                    cutlass::conv::Conv2dProblemSize const &problem_size,
-                    MatrixCoord threadblock_output_shape) {
-    stride = stride_; // The stride per row of output tensor (bytes)
-    stride_n = problem_size.P * problem_size.Q;
-    stride_p = problem_size.Q ;
-
-    N = problem_size.N;
-    P = problem_size.P;
-    Q = problem_size.Q;
-
-    // Fastdivmod for output O, P, Q
-    if(threadblock_output_shape.row() != 0 && threadblock_output_shape.column() !=0 ){
-      // MSVC emits a "potential divide by 0" warning as error
-      // if the code just divides without a check and substitution.
-
-      CUTLASS_ASSERT(threadblock_output_shape.row() != 0);
-      const auto row_denom = threadblock_output_shape.row() != 0 ?
-        threadblock_output_shape.row() : cutlass::MatrixCoord::Index(1);
-      int tiles_p =
-          (problem_size.P + (threadblock_output_shape.row() - 1)) / row_denom;
-
-      CUTLASS_ASSERT(threadblock_output_shape.column() != 0);
-      const auto col_denom = threadblock_output_shape.column() != 0 ?
-        threadblock_output_shape.column() : cutlass::MatrixCoord::Index(1);
-      int tiles_q = (problem_size.Q + (threadblock_output_shape.column() - 1)) /
-                    col_denom;
-
-      pq_divmod = FastDivmod(tiles_p * tiles_q);
-      q_divmod = FastDivmod(tiles_q);
-    }
-
-    return Status::kSuccess;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Status initialize(
-      Index stride_,
-      cutlass::conv::Conv2dProblemSize const &problem_size = cutlass::conv::Conv2dProblemSize(),
-      MatrixCoord threadblock_output_shape = MatrixCoord()) {
-    return initialize(LongIndex(stride_), problem_size, threadblock_output_shape);
-  }
-
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorDirect2dConvParams() { initialize(LongIndex(0)); }
-
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorDirect2dConvParams(Index stride,
-                               cutlass::conv::Conv2dProblemSize const &problem_size,
-                               MatrixCoord threadblock_output_shape) {
-    initialize(stride, problem_size, threadblock_output_shape);
-  }
-
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorDirect2dConvParams(LongIndex stride,
-                               cutlass::conv::Conv2dProblemSize const &problem_size,
-                               MatrixCoord threadblock_output_shape) {
-    initialize(stride, problem_size, threadblock_output_shape);
-  }
-};
-
-///////////////////////////////////////////////////////////////////////////////
-//  InterleavedPredicatedTileIterator
-///////////////////////////////////////////////////////////////////////////////
-
-
-/// Predicated tile access iterator descriptor object containing template dependent state
-struct InterleavedPredicatedTileIteratorDesc {
-
-  int element_size_bits;
-  int elements_per_access;
-  int threadmap_warp_size;
-  layout::PitchLinearCoord threadmap_iterations;
-  layout::PitchLinearCoord threadmap_delta;
-
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  InterleavedPredicatedTileIteratorDesc() { }
-
-  CUTLASS_HOST_DEVICE
-  InterleavedPredicatedTileIteratorDesc(
-    int element_size_bits_,
-    int elements_per_access_,
-    int threadmap_warp_size_,
-    layout::PitchLinearCoord threadmap_iterations_,
-    layout::PitchLinearCoord threadmap_delta_
-  ):
-    element_size_bits(element_size_bits_),
-    elements_per_access(elements_per_access_),
-    threadmap_warp_size(threadmap_warp_size_),
-    threadmap_iterations(threadmap_iterations_),
-    threadmap_delta(threadmap_delta_) { }
-};
-
-//
-// Parameters struct InterleavedPredicatedTileIterator
-//
-
-struct InterleavedPredicatedTileIteratorParams {
-
-  using Index = int32_t;
-  using LongIndex = int64_t;
-
-  //
-  // Data members
-  //
-
-  LongIndex stride;               ///< stride in bytes between rows
-  LongIndex advance_row;          ///< amount to add to move to the next 'row' position
-  LongIndex advance_column;       ///< amount to add to move to the next 'column' position
-
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  Status initialize(LongIndex stride_, InterleavedPredicatedTileIteratorDesc desc) {
-    
-    stride = stride_;
-
-    advance_row = desc.threadmap_delta.contiguous() * desc.element_size_bits / 8;
-
-    advance_column = stride_ - desc.threadmap_iterations.contiguous() *
-                               desc.elements_per_access *
-                               desc.element_size_bits *
-                               desc.threadmap_warp_size / 8;
-
-    return Status::kSuccess;
-  }
-
-  CUTLASS_HOST_DEVICE
-  InterleavedPredicatedTileIteratorParams() {
-    initialize(LongIndex(0), InterleavedPredicatedTileIteratorDesc());
-  }
-
-  CUTLASS_HOST_DEVICE
-  InterleavedPredicatedTileIteratorParams(Index stride, InterleavedPredicatedTileIteratorDesc desc) {
-    initialize(stride, desc);
-  }
-
-  CUTLASS_HOST_DEVICE
-  InterleavedPredicatedTileIteratorParams(LongIndex stride, InterleavedPredicatedTileIteratorDesc desc) {
-    initialize(stride, desc);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Helper template to construct an OutputTileShapeDesc from a OutputTileThreadMap template.
-template <typename Element, typename ThreadMap>
-CUTLASS_HOST_DEVICE
-InterleavedPredicatedTileIteratorDesc make_InterleavedPredicatedTileIteratorDesc() {
-  return InterleavedPredicatedTileIteratorDesc(
-    sizeof_bits<Element>::value,
-    ThreadMap::kElementsPerAccess,
-    ThreadMap::kWarpSize,
-    {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided},
-    {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided}
-  );
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Helper template to construct an MakePredicatedTileIteratorDesc from a template 
-// dependent state
-template <typename Element, typename Layout,
-   typename ThreadMap>
-  struct MakePredicatedTileIteratorDesc;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileAccessIterator for layout::RowMajor output data.
-template <typename Element, typename ThreadMap>
-struct MakePredicatedTileIteratorDesc <
-    Element, layout::RowMajor, ThreadMap> {
-
-  CUTLASS_HOST_DEVICE
-  OutputTileThreadMapDesc operator()() {
-
-    return make_OutputTileThreadMapDesc<ThreadMap>();
-  }
-};
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileAccessIterator for layout::ColumnMajorInterleaved<InterleavedN> output data.
-template <typename Element, typename ThreadMap, int InterleavedN>
-struct MakePredicatedTileIteratorDesc <
-    Element, layout::ColumnMajorInterleaved<InterleavedN>, ThreadMap> {
-
-  CUTLASS_HOST_DEVICE
-  InterleavedPredicatedTileIteratorDesc operator()() {
-
-    return make_InterleavedPredicatedTileIteratorDesc<Element, ThreadMap>();
-  }
-};
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_predicates.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_predicates.h
deleted file mode 100755
index 2fbbc9a4f..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_predicates.h
+++ /dev/null
@@ -1,309 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief PredicatedTileIteratorPredicates.
-
-  PredicatedTileIteratorPredicates enables both upper and lower bounds for predicates.
-
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/transform/pitch_linear_thread_map.h"
-#include "cutlass/epilogue/threadblock/output_tile_thread_map.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/arch/memory.h"
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator_params.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace epilogue {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile iterator predicates used to bound computations in epilogue.
-///
-/// Satisfies: ReadableTileIterator | PredicatedTileIterator | ForwardTileIterator
-///
-template <
-  typename ThreadMap_,       ///< Thread map (conept: OutputTileThreadMap)
-  typename Element_          ///< Element data type
->
-class PredicatedTileIteratorPredicates {
-public:
-  using ThreadMap = ThreadMap_;
-  using Shape = typename ThreadMap::Shape;
-
-  using Element = Element_;
-
-  using Layout = layout::RowMajor;
-  using TensorRef = TensorRef<Element, Layout>;
-  using ConstTensorRef = typename TensorRef::ConstTensorRef;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  using TensorCoord = MatrixCoord;
-
-  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
-  static int const kThreads = ThreadMap::kThreads;
-  static int const kIterations = ThreadMap::Count::kTile;
-
-  static_assert( ThreadMap::Iterations::kRow > 0,"ThreadMap::Iterations::kRow must be > 0");
-  static_assert( ThreadMap::Iterations::kGroup > 0,"ThreadMap::Iterations::kGroup must be > 0");
-  static_assert( ThreadMap::Iterations::kCluster > 0,"ThreadMap::Iterations::kCluster must be > 0");
-  static_assert( ThreadMap::Iterations::kColumn > 0,"ThreadMap::Iterations::kColumn must be > 0");
-
-  /// Fragment object
-  using Fragment = Array<
-    Element, 
-    ThreadMap::Iterations::kColumn * 
-    ThreadMap::Iterations::kRow * 
-    ThreadMap::Iterations::kGroup * 
-    ThreadMap::Iterations::kCluster * ThreadMap::kElementsPerAccess>;
-
-  /// Memory access size
-  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
-
-  //
-  // Parameters struct
-  //
-
-  /// Uses a non-template class
-  struct Params : PredicatedTileIteratorParams {
-
-    CUTLASS_HOST_DEVICE
-    Params() { }
-
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout): 
-      PredicatedTileIteratorParams(
-        layout.stride(0) * int(sizeof(AccessType)) / kElementsPerAccess,
-        make_OutputTileThreadMapDesc<ThreadMap>()
-      ) 
-    {
-        
-    }
-  };
-
-  /// Mask object
-  struct Mask {
-
-    static int const kCount = ThreadMap::Iterations::kColumn;
-
-    /// Predicate state
-    bool predicates[kCount];
-
-    //
-    // Mask
-    //
-    CUTLASS_HOST_DEVICE
-    Mask() {
-      enable();
-    }
-
-    ///< Efficiently disables all accesses guarded by mask
-    CUTLASS_HOST_DEVICE void clear() {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kCount; ++i) {
-        predicates[i] = false;
-      }
-    }
-
-    ///< CUTLASS_HOST_DEVICE enables all accesses guarded by mask
-    CUTLASS_DEVICE void enable() {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kCount; ++i) {
-        predicates[i] = true;
-      }
-    }
-  };
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Parameters structure containing reference and precomputed state.
-  PredicatedTileIteratorParams params_;
-
-  /// Array of boolean values to contain steady-state predicates
-  Mask mask_;
-
-  /// Extent of the matrix tile in rows
-  Index lower_extent_row_;
-  Index upper_extent_row_;
-
-  /// A thread's starting row position (assuming steady-state predicates have been computed)
-  Index thread_start_row_;
-
-  /// Internal state counter
-  int state_[3];
- 
-  //
-  // Static asserts about internal strides
-  //
-
-  static_assert(sizeof(lower_extent_row_) == 4, "Expected 32b extents");
-  static_assert(sizeof(upper_extent_row_) == 4, "Expected 32b extents");
-  static_assert(sizeof(thread_start_row_) == 4, "Expected 32b extents");
-  static_assert(sizeof(PredicatedTileIteratorParams::stride) == 8, "Expected 64b strides");
-
-private:
-
-  //
-  // Methods
-  //
-
-public:
-
-  //
-  // Methods
-  //
-
-  /// Constructor
-  CUTLASS_DEVICE
-  PredicatedTileIteratorPredicates(
-    PredicatedTileIteratorParams const & params,
-    TensorCoord lower_extent,
-    TensorCoord upper_extent,
-    int thread_idx,
-    TensorCoord threadblock_offset = TensorCoord()
-  ): 
-    params_(params)
-  {
-
-    TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx) + threadblock_offset;
-
-    lower_extent_row_ = lower_extent.row();
-    upper_extent_row_ = upper_extent.row();
-    thread_start_row_ = thread_offset.row();
-
-    // Initialize predicates
-    CUTLASS_PRAGMA_UNROLL
-    for (int c = 0; c < ThreadMap::Iterations::kColumn; ++c) {
-
-      mask_.predicates[c] = ((thread_offset.column() 
-        + ThreadMap::Delta::kColumn * c) < upper_extent.column()) &&
-        ((thread_offset.column() + ThreadMap::Delta::kColumn * c) >= lower_extent.column());
-    }
-
-    // Initialize internal state counter
-    state_[0] = state_[1] = state_[2] = 0;
-  }
-
-  /// Advances to the next position to load or store
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorPredicates &operator++() {
-
-    ++state_[0];
-    thread_start_row_ += ThreadMap::Shape::kRow;
-
-    if (state_[0] == ThreadMap::Count::kRow) {
-
-      state_[0] = 0;
-      ++state_[1];
-
-      thread_start_row_ += (ThreadMap::Shape::kGroup - 1) *
-        ThreadMap::Shape::kRow * ThreadMap::Count::kRow;
-
-      if (state_[1] == ThreadMap::Count::kGroup) {
-
-        state_[1] = 0;
-        ++state_[2];
-
-        thread_start_row_ += ThreadMap::Count::kGroup *
-          ThreadMap::Shape::kGroup * ThreadMap::Count::kRow * ThreadMap::Shape::kRow;
-
-        if (state_[2] == ThreadMap::Count::kCluster) {
-          state_[2] = 0;
-        }
-      }
-    }
-
-    return *this;
-  }
-
-  ///< Efficiently disables all accesses guarded by mask
-  CUTLASS_DEVICE void clear_mask() {
-    mask_.clear();
-  }
-
-  ///< Efficiently enables all accesses guarded by mask
-  CUTLASS_DEVICE void enable_mask() {
-    mask_.enable();
-  }
-
-  ///< Gets the mask
-  CUTLASS_DEVICE void get_mask(Mask &mask) {
-    mask = mask_;
-  }
-
-  ///< Sets the mask
-  CUTLASS_DEVICE void set_mask(Mask const &mask) {
-    mask_ = mask;
-  }
-
-  ///< Gets lower_extent_row_
-  CUTLASS_DEVICE Index get_lower_extent_row() {
-    return lower_extent_row_;
-  }
-
-  ///< Gets upper_extent_row_
-  CUTLASS_DEVICE Index get_upper_extent_row() {
-    return upper_extent_row_;
-  }
-
-  ///< Gets thread_start_row_
-  CUTLASS_DEVICE Index get_thread_start_row() {
-    return thread_start_row_;
-  }
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_strided_dgrad.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_strided_dgrad.h
deleted file mode 100755
index 94b71b9b8..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_strided_dgrad.h
+++ /dev/null
@@ -1,479 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
-
-  The epilogue rearranges the result of a matrix product through shared memory to match canonical
-  tensor layouts in global memory. Epilogues support conversion and reduction operations.
-
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/transform/pitch_linear_thread_map.h"
-#include "cutlass/epilogue/threadblock/output_tile_thread_map.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/arch/memory.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator_params.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace epilogue {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile iterator used to load and store output tile from global memory in epilogue.
-///
-/// Satisfies: ReadableTileIterator | PredicatedTileIterator | ForwardTileIterator
-///
-template <
-  typename ThreadMap_,       ///< Thread map (conept: OutputTileThreadMap)
-  typename Element_          ///< Element data type
->
-class PredicatedTileIteratorStridedDgrad {
-public:
-  using ThreadMap = ThreadMap_;
-  using Shape = typename ThreadMap::Shape;
-
-  using Element = Element_;
-
-  using Layout = layout::RowMajor;
-  using TensorRef = TensorRef<Element, Layout>;
-  using ConstTensorRef = typename TensorRef::ConstTensorRef;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  using TensorCoord = MatrixCoord;
-
-  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
-  static int const kThreads = ThreadMap::kThreads;
-  static int const kIterations = ThreadMap::Count::kTile;
-
-  static_assert( ThreadMap::Iterations::kRow > 0,"ThreadMap::Iterations::kRow must be > 0");
-  static_assert( ThreadMap::Iterations::kGroup > 0,"ThreadMap::Iterations::kGroup must be > 0");
-  static_assert( ThreadMap::Iterations::kCluster > 0,"ThreadMap::Iterations::kCluster must be > 0");
-  static_assert( ThreadMap::Iterations::kColumn > 0,"ThreadMap::Iterations::kColumn must be > 0");
-
-  /// Fragment object
-  using Fragment = Array<
-    Element, 
-    ThreadMap::Iterations::kColumn * 
-    ThreadMap::Iterations::kRow * 
-    ThreadMap::Iterations::kGroup * 
-    ThreadMap::Iterations::kCluster * ThreadMap::kElementsPerAccess>;
-
-  /// Memory access size
-  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
-
-  //
-  // Parameters struct
-  //
-
-  /// Uses a non-template class
-  struct Params : PredicatedTileIteratorParams {
-
-    /// Convolution problem size
-    cutlass::conv::Conv2dProblemSize problem_size;
-    int tiled_rows_per_filter;
-
-    CUTLASS_HOST_DEVICE
-    Params() { }
-
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout, cutlass::conv::Conv2dProblemSize problem_size_, int threadblock_row): 
-      problem_size(problem_size_), 
-      PredicatedTileIteratorParams(
-        layout.stride(0) * int(sizeof(AccessType)) / kElementsPerAccess,
-        make_OutputTileThreadMapDesc<ThreadMap>()
-      ) 
-    {
-  
-      int tile_m_per_filter = strided_dgrad_tile_m_per_filter(problem_size, threadblock_row);
-
-      tiled_rows_per_filter = tile_m_per_filter * threadblock_row;
-    }
-  };
-
-  /// Mask object
-  struct Mask {
-
-    static int const kCount = ThreadMap::Iterations::kColumn;
-
-    /// Predicate state
-    bool predicates[kCount];
-
-    //
-    // Mask
-    //
-    CUTLASS_HOST_DEVICE
-    Mask() {
-      enable();
-    }
-
-    ///< Efficiently disables all accesses guarded by mask
-    CUTLASS_HOST_DEVICE void clear() {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kCount; ++i) {
-        predicates[i] = false;
-      }
-    }
-
-    ///< CUTLASS_HOST_DEVICE enables all accesses guarded by mask
-    CUTLASS_DEVICE void enable() {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kCount; ++i) {
-        predicates[i] = true;
-      }
-    }
-  };
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Parameters structure containing reference and precomputed state.
-  Params params_;
-
-  /// Byte-level pointer
-  uint8_t *byte_pointer_;
-
-  /// Array of boolean values to contain steady-state predicates
-  Mask mask_;
-
-  /// Extent of the matrix tile in rows
-  Index extent_row_;
-
-  /// Starting Dx h and w dimension for strided dgrad mapping
-  int start_h_, start_w_;
-
-  /// Effective Dy P and Q dimensions for strided dgrad mapping
-  int p_, q_;
-
-  /// A thread's starting row position (assuming steady-state predicates have been computed)
-  Index thread_start_row_;
-
-  /// A thread's starting column position (assuming steady-state predicates have been computed)
-  Index thread_start_column_;
-
-  /// Internal state counter
-  int state_[3];
- 
-  //
-  // Static asserts about internal strides
-  //
-
-  static_assert(sizeof(extent_row_) == 4, "Expected 32b extents");
-  static_assert(sizeof(thread_start_row_) == 4, "Expected 32b extents");
-  static_assert(sizeof(PredicatedTileIteratorParams::stride) == 8, "Expected 64b strides");
-
-private:
-
-  //
-  // Methods
-  //
-
-public:
-
-  //
-  // Methods
-  //
-
-  /// Constructor
-  CUTLASS_DEVICE
-  PredicatedTileIteratorStridedDgrad(
-    Params const & params,
-    Element *pointer,
-    TensorCoord extent,
-    int thread_idx,
-    FastDivmod const &stride_h_divmod, FastDivmod const &stride_w_divmod,
-    int start_r, int start_s,
-    TensorCoord threadblock_offset = TensorCoord()
-  ): 
-    params_(params)
-  {
-
-    TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx) + threadblock_offset;
-
-    int r = start_r;
-    int s = start_s;
-
-    if (params_.problem_size.mode == cutlass::conv::Mode::kConvolution) {
-      r = (params_.problem_size.R - 1 - r);
-      s = (params_.problem_size.S - 1 - s);
-    }
-
-    // compute starting coordinates in Dx start_h_ and start_w_
-    strided_dgrad_starting_coords(
-      params_.problem_size, 
-      stride_h_divmod, stride_w_divmod, 
-      r, s, 
-      start_h_, start_w_);
-
-    p_ = (params_.problem_size.H - start_h_ + params_.problem_size.stride_h - 1) / params_.problem_size.stride_h;
-    q_ = (params_.problem_size.W - start_w_ + params_.problem_size.stride_w - 1) / params_.problem_size.stride_w;
-
-    extent_row_ = extent.row();
-    thread_start_row_ = thread_offset.row();
-    thread_start_column_ = thread_offset.column();
-
-    // Initialize predicates
-    CUTLASS_PRAGMA_UNROLL
-    for (int c = 0; c < ThreadMap::Iterations::kColumn; ++c) {
-
-      mask_.predicates[c] = ((thread_offset.column() 
-        + ThreadMap::Delta::kColumn * c) < extent.column());
-    }
-
-    // Null pointer performs no accesses
-    if (!pointer) {
-      mask_.clear();
-    }
-
-    // Initialize pointer
-    byte_pointer_ = reinterpret_cast<uint8_t *>(pointer);
-
-    // Initialize internal state counter
-    state_[0] = state_[1] = state_[2] = 0;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    byte_pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_byte_offset(Fragment &frag, int64_t byte_offset) {
-
-    uint8_t *byte_pointer = byte_pointer_;
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
-
-          int frag_row_idx = 
-            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
-
-          int row_offset = row * ThreadMap::Delta::kRow 
-            + group * ThreadMap::Delta::kGroup 
-            + cluster * ThreadMap::Delta::kCluster;
-
-          // remapping rows to find the mapped_row_offset
-          int npq_offset = (row_offset + thread_start_row_) % params_.tiled_rows_per_filter;
-
-          // (STEP 4.a) [order NHW rows to be loaded and stored in output Dx NHWxC layout]
-          int n = npq_offset / (p_ * q_); 
-          int residual = npq_offset % (p_ * q_);
-          int p = residual / q_;
-          int q = residual % q_;
-        
-          int mapped_row_offset = n * (params_.problem_size.H * params_.problem_size.W) +
-                                  (start_h_ + p * params_.problem_size.stride_h) * params_.problem_size.W +
-                                  (start_w_ + q * params_.problem_size.stride_w);
-          bool row_guard = mapped_row_offset < extent_row_;
-
-          int64_t row_byte_offset = mapped_row_offset * params_.stride;
-
-          CUTLASS_PRAGMA_UNROLL
-          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
-
-            int64_t column_byte_offset = (thread_start_column_ + column * ThreadMap::Delta::kColumn) * (sizeof_bits<Element>::value / 8);
-
-            bool guard = row_guard && mask_.predicates[column];
-
-            cutlass::arch::global_load<
-              AccessType, 
-              sizeof(AccessType)
-            >(
-                frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn +
-                         column],
-                (void *)(byte_pointer + row_byte_offset + column_byte_offset + byte_offset),
-                guard);
-          }
-        }
-      }
-    }
-  }
-
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) {
-
-    load_with_byte_offset(frag, 0);
-  }
-
-  /// Stores a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_byte_offset(Fragment const &frag, int64_t byte_offset) {
-    uint8_t *byte_pointer = byte_pointer_;
-    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
-
-          int frag_row_idx = 
-            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
-
-          int row_offset = row * ThreadMap::Delta::kRow 
-            + group * ThreadMap::Delta::kGroup 
-            + cluster * ThreadMap::Delta::kCluster;
-
-          // remapping rows to find the mapped_row_offset
-          int npq_offset = (row_offset + thread_start_row_) % params_.tiled_rows_per_filter;
-
-          // (STEP 4.a) [order NHW rows to be loaded and stored in output Dx NHWxC layout]
-          int n = npq_offset / (p_ * q_); 
-          int residual = npq_offset % (p_ * q_);
-          int p = residual / q_;
-          int q = residual % q_;
-        
-          int mapped_row_offset = n * (params_.problem_size.H * params_.problem_size.W) +
-                                  (start_h_ + p * params_.problem_size.stride_h) * params_.problem_size.W +
-                                  (start_w_ + q * params_.problem_size.stride_w);
-          bool row_guard = mapped_row_offset < extent_row_;
-
-          int64_t row_byte_offset = mapped_row_offset * params_.stride;
-          
-          CUTLASS_PRAGMA_UNROLL
-          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
-
-            int64_t column_byte_offset = (thread_start_column_ + column * ThreadMap::Delta::kColumn) * (sizeof_bits<Element>::value / 8);
-
-            bool guard = row_guard && mask_.predicates[column];
-
-            cutlass::arch::global_store<AccessType, sizeof(AccessType) >(
-                frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column],
-                (void *)(byte_pointer + row_byte_offset + column_byte_offset + byte_offset),
-                guard);            
-          }
-        }
-      }
-    }
-  }
-
-
-  /// Stores a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) {
-
-    store_with_byte_offset(frag, 0);
-  }
-
-  /// Advances to the next position to load or store
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorStridedDgrad &operator++() {
-
-    ++state_[0];
-
-    thread_start_row_ += ThreadMap::Shape::kRow;
-    
-    if (state_[0] == ThreadMap::Count::kRow) {
-
-      state_[0] = 0;
-      ++state_[1];
-
-      thread_start_row_ += (ThreadMap::Shape::kGroup - 1) * 
-        ThreadMap::Shape::kRow * ThreadMap::Count::kRow;
-
-      if (state_[1] == ThreadMap::Count::kGroup) {
-
-        state_[1] = 0;
-        ++state_[2];
-
-        thread_start_row_ += ThreadMap::Count::kGroup * 
-          ThreadMap::Shape::kGroup * ThreadMap::Count::kRow * ThreadMap::Shape::kRow;
-
-        if (state_[2] == ThreadMap::Count::kCluster) {
-          state_[2] = 0;
-        }
-      }
-    }
-
-    return *this;
-  }
-
-  ///< Efficiently disables all accesses guarded by mask
-  CUTLASS_DEVICE void clear_mask() {
-    mask_.clear();
-  }
-
-  ///< Efficiently enables all accesses guarded by mask
-  CUTLASS_DEVICE void enable_mask() {
-    mask_.enable();
-  }
-
-  ///< Sets the mask
-  CUTLASS_DEVICE void get_mask(Mask &mask) {
-    mask = mask_;
-  }
-
-  ///< Sets the mask
-  CUTLASS_DEVICE void set_mask(Mask const &mask) {
-    mask_ = mask;
-  }
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/shared_load_iterator.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/shared_load_iterator.h
deleted file mode 100755
index ccdb4a9f8..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/shared_load_iterator.h
+++ /dev/null
@@ -1,223 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
-
-  The epilogue rearranges the result of a matrix product through shared memory to match canonical
-  tensor layouts in global memory. Epilogues support conversion and reduction operations.
-
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/tensor_ref.h"
-
-#include "cutlass/epilogue/threadblock/output_tile_thread_map.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Tile iterator used to load output tile from shared memory in epilogue.
-///
-/// Satisfies: ReadableTileIterator
-///
-template <
-  typename ThreadMap_,       ///< Thread map (conept: OutputTileThreadMap)
-  typename Element_,         ///< Element data type
-  int MaxAlignment = ThreadMap_::kElementsPerAccess * sizeof_bits<Element_>::value / 8
->
-class SharedLoadIterator {
-public:
-  using ThreadMap = ThreadMap_;
-  using Shape = typename ThreadMap::TileShape;
-
-  using Element = Element_;
-
-  using Layout = layout::RowMajor;
-  using TensorRef = TensorRef<Element, Layout>;
-  using ConstTensorRef = typename TensorRef::ConstTensorRef;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  using TensorCoord = MatrixCoord;
-
-  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
-
-  static int const kMinAlignment = ThreadMap_::kElementsPerAccess * sizeof_bits<Element_>::value / 8;
-
-  static int const kAlignment = (MaxAlignment < kMinAlignment ? MaxAlignment : kMinAlignment);
-
-  static int const kThreads = ThreadMap::kThreads;
-
-  /// Fragment object
-  using Fragment = Array<
-    Element, 
-    ThreadMap::Iterations::kColumn * 
-    ThreadMap::Iterations::kRow * 
-    ThreadMap::Iterations::kGroup * 
-    ThreadMap::Iterations::kCluster * 
-    ThreadMap::kElementsPerAccess>;
-
-  /// Memory access size
-  using AccessType = AlignedArray<
-    Element, 
-    ThreadMap::kElementsPerAccess, 
-    kAlignment>;
-
-  /// Vector type used for SMEM loads
-  using LoadType = AlignedArray<
-    Element,
-    const_min(128 / sizeof_bits<Element>::value, ThreadMap::kElementsPerAccess),
-    const_min(16, kAlignment)
-  >;
-
-  static int const kLoadsPerAccess = AccessType::kElements / LoadType::kElements;
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Byte-level pointer
-  uint8_t *byte_pointer_;
-
-  /// Stride along adjacent rows
-  int stride_;
-
-public:
-
-  //
-  // Methods
-  //
-
-  /// Constructor
-  CUTLASS_DEVICE
-  SharedLoadIterator(
-    TensorRef ref,
-    int thread_idx
-  ):
-    byte_pointer_(reinterpret_cast<uint8_t *>(ref.data())),
-    stride_((ref.stride(0) * sizeof_bits<Element>::value) / 8) {
-
-    TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx);
-
-    // Initialize pointer
-    byte_pointer_ +=
-      thread_offset.row() * stride_ + 
-      thread_offset.column() * sizeof(AccessType) / kElementsPerAccess;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    byte_pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
-  }
-
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &offset) {
-    byte_pointer_ += 
-      offset.row() * Shape::kRow * stride_ + 
-      offset.column() * Shape::kColumn * sizeof_bits<Element>::value / 8;
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
-
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
-
-          uint8_t const *byte_pointer = byte_pointer_ + 
-            row * ThreadMap::Delta::kRow * stride_ + 
-            group * ThreadMap::Delta::kGroup* stride_ + 
-            cluster * ThreadMap::Delta::kCluster * stride_ +
-            pointer_offset * sizeof_bits<Element>::value / 8;
-
-          int frag_row_idx = 
-            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
-
-          LoadType *frag_ptr = reinterpret_cast<LoadType *>(&frag);
-          LoadType const *memory_pointer = reinterpret_cast<LoadType const *>(byte_pointer);
-
-          CUTLASS_PRAGMA_UNROLL
-          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
-            
-            int frag_idx = frag_row_idx * ThreadMap::Iterations::kColumn + column;
-
-            CUTLASS_PRAGMA_UNROLL
-            for (int v = 0; v < kLoadsPerAccess; ++v) {
-              frag_ptr[frag_idx * kLoadsPerAccess + v] = 
-                memory_pointer[(column * ThreadMap::Delta::kColumn / kElementsPerAccess) * kLoadsPerAccess + v];
-            }
-          }
-        }
-      }
-    }
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void set_smem_base_address(Index address) {
-  }
-
-  /// Loads a fragment
-  CUTLASS_DEVICE
-  void load(Fragment &frag) const {
-
-    load_with_pointer_offset(frag, 0);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/shared_load_iterator_mixed.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/shared_load_iterator_mixed.h
deleted file mode 100755
index eef4d22bd..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/shared_load_iterator_mixed.h
+++ /dev/null
@@ -1,594 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops optimized for mixed-precision.
-
-  This assumes the shared memory tile is in a permuted layout which avoids bank conflicts on loading.
-
-  When the fragment is loaded into registers, it matches the row-major thread map assumed by
-  the predicated tile iterator writing to global memory.
-
-  The epilogue rearranges the result of a matrix product through shared memory to match canonical
-  tensor layouts in global memory. Epilogues support conversion and reduction operations.
-
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/tensor_ref.h"
-
-#include "cutlass/epilogue/threadblock/output_tile_thread_map.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Tile iterator used to load output tile from shared memory in epilogue.
-///
-/// Satisfies: ReadableTileIterator
-///
-template <
-  typename ThreadMap_,       ///< Thread map (conept: OutputTileThreadMap)
-  typename Element_,         ///< Accumulator data type
-  int ElementSizeBits_,      ///< Size of accumulator in bits
-  int OutputSizeBits_,       ///< Size of output element in bits
-  int ElementsPerAccess,     ///< Vector length of output vector
-  int ContiguousLanes,       ///< Number of lanes in the warp writing to contiguous elements
-                             ///  in the global memory tensor
-  bool EightBitsOutputOrLess = (OutputSizeBits_ <= 8)
->
-class SharedLoadIteratorMixed;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Tile iterator used to load output tile from shared memory in epilogue.
-///
-/// Satisfies: ReadableTileIterator
-///
-template <
-  typename ThreadMap_,       ///< Thread map (conept: OutputTileThreadMap)
-  typename Element_          ///< Accumulator data type
->
-class SharedLoadIteratorMixed<ThreadMap_, Element_, 32, 16, 8, 8, false> {
-public:
-  using ThreadMap = ThreadMap_;
-  using Shape = typename ThreadMap::Shape;
-
-  using Element = Element_;
-
-  using Layout = layout::RowMajor;
-  using TensorRef = TensorRef<Element, Layout>;
-  using ConstTensorRef = typename TensorRef::ConstTensorRef;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  using TensorCoord = MatrixCoord;
-
-  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
-
-  static int const kAlignment = ThreadMap::kElementsPerAccess * sizeof_bits<Element_>::value / 8;
-
-  static int const kThreads = ThreadMap::kThreads;
-
-  /// Fragment object
-  using Fragment = Array<
-    Element, 
-    ThreadMap::Iterations::kColumn * 
-    ThreadMap::Iterations::kRow * 
-    ThreadMap::Iterations::kGroup * 
-    ThreadMap::Iterations::kCluster * 
-    ThreadMap::kElementsPerAccess>;
-
-  /// Memory access size
-  using AccessType = AlignedArray<
-    Element, 
-    ThreadMap::kElementsPerAccess, 
-    kAlignment>;
-
-  /// Vector type used for SMEM loads
-  using LoadType = AlignedArray<
-    Element,
-    const_min(128 / sizeof_bits<Element>::value, ThreadMap::kElementsPerAccess),
-    const_min(16, kAlignment)
-  >;
-
-  static int const kLoadsPerAccess = AccessType::kElements / LoadType::kElements;
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Byte-level pointer
-  LoadType const *pointers_[kLoadsPerAccess];
-
-  /// Stride along adjacent rows in units of LoadType
-  int stride_;
-
-public:
-
-  //
-  // Methods
-  //
-
-  /// Constructor
-  CUTLASS_DEVICE
-  SharedLoadIteratorMixed(
-    TensorRef ref,
-    int thread_idx
-  ):
-    stride_((ref.stride(0) / LoadType::kElements)) {
-
-    TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx);
-
-    // Initialize pointers
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kLoadsPerAccess; ++i) {
-      pointers_[i] = reinterpret_cast<LoadType const *>(ref.data());
-
-      int col_idx = (thread_offset.column() / kElementsPerAccess) * kLoadsPerAccess;
-      int bank_offset = (col_idx * int(sizeof(LoadType)) / 128) % kLoadsPerAccess;
-
-      col_idx += (bank_offset + i) % kLoadsPerAccess;
-
-      pointers_[i] += thread_offset.row() * stride_ + col_idx;
-    }
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kLoadsPerAccess; ++i) {
-      pointers_[i] += pointer_offset / LoadType::kElements;
-    }
-  }
-
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &offset) {
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kLoadsPerAccess; ++i) {
-      pointers_[i] += 
-        offset.row() * Shape::kRow * stride_ + 
-        offset.column() * Shape::kColumn / LoadType::kElements;
-    }
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
-
-          int row_ptr_offset =
-            row * ThreadMap::Delta::kRow * stride_ + 
-            group * ThreadMap::Delta::kGroup* stride_ + 
-            cluster * ThreadMap::Delta::kCluster * stride_ +
-            pointer_offset / LoadType::kElements;
-
-          int frag_row_idx = (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
-
-          LoadType *frag_ptr = reinterpret_cast<LoadType *>(&frag);
-
-          CUTLASS_PRAGMA_UNROLL
-          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
-            
-            int frag_idx = frag_row_idx * ThreadMap::Iterations::kColumn + column;
-
-            CUTLASS_PRAGMA_UNROLL
-            for (int v = 0; v < kLoadsPerAccess; ++v) {
-           
-              int vector_idx = (column * ThreadMap::Delta::kColumn / kElementsPerAccess * kLoadsPerAccess); 
-
-              LoadType const *memory_pointer = pointers_[v] + row_ptr_offset;
-            
-              frag_ptr[frag_idx * kLoadsPerAccess + v] = memory_pointer[vector_idx];
-            }
-          }
-        }
-      }
-    }
-  }
-
-  /// Set base smem address
-  CUTLASS_DEVICE
-  void set_smem_base_address(Index address) {}
-
-  /// Loads a fragment
-  CUTLASS_DEVICE
-  void load(Fragment &frag) const {
-
-    load_with_pointer_offset(frag, 0);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for
-///   int32_t x 16 => int8_t/int4b_t x 16 and
-///   float x 16 => float_e4m3_t/float_e5m2_t x 16
-template <
-  typename ThreadMap_,      ///< Thread map (concept: OutputTileThreadMap)
-  typename Element_,
-  int OutputSizeBits_       ///< Size of output element in bits
->
-class SharedLoadIteratorMixed<ThreadMap_, Element_, 32, OutputSizeBits_, 16, 8, true> {
-public:
-  using ThreadMap = ThreadMap_;
-  using Shape = typename ThreadMap::Shape;
-
-  using Element = Element_;
-  static_assert(sizeof_bits<Element>::value == 32, "Element size in bits must be 32.");
-
-  using Layout = layout::RowMajor;
-  using TensorRef = TensorRef<Element, Layout>;
-  using ConstTensorRef = typename TensorRef::ConstTensorRef;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  using TensorCoord = MatrixCoord;
-
-  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
-
-  static int const kAlignment = 16;
-
-  static int const kThreads = ThreadMap::kThreads;
-
-  /// Fragment object
-  using Fragment = Array<
-    Element, 
-    ThreadMap::Iterations::kColumn * 
-    ThreadMap::Iterations::kRow * 
-    ThreadMap::Iterations::kGroup * 
-    ThreadMap::Iterations::kCluster * 
-    ThreadMap::kElementsPerAccess>;
-
-  /// Memory access size
-  using AccessType = AlignedArray<
-    Element, 
-    16, 
-    kAlignment>;
-
-  /// Vector type used for SMEM loads
-  using LoadType = AlignedArray<
-    Element,
-    4,
-    16
-  >;
-
-  static int const kLoadsPerAccess = 4;
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Byte-level pointer
-  LoadType const *pointers_[kLoadsPerAccess];
-
-  /// Stride along adjacent rows in units of LoadType
-  int stride_;
-
-public:
-
-  //
-  // Methods
-  //
-
-  /// Constructor
-  CUTLASS_DEVICE
-  SharedLoadIteratorMixed(
-    TensorRef ref,
-    int thread_idx
-  ):
-    stride_((ref.stride(0) / LoadType::kElements)) {
-
-    TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx);
-    
-    // Initialize pointers
-    LoadType const *base_ptr = reinterpret_cast<LoadType const *>(ref.data()) + thread_offset.row() * stride_;
-      
-    int lane_col_idx = thread_offset.column() / 16;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kLoadsPerAccess; ++i) {
-      int lane_offset = (lane_col_idx % 2) * 4 | ((lane_col_idx / 2) * 8) | ((lane_col_idx / 2) ^ i);
- 
-      pointers_[i] = base_ptr + lane_offset;
-    }
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kLoadsPerAccess; ++i) {
-      pointers_[i] += pointer_offset / LoadType::kElements;
-    }
-  }
-
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &offset) {
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kLoadsPerAccess; ++i) {
-      pointers_[i] += 
-        offset.row() * Shape::kRow * stride_ + 
-        offset.column() * Shape::kColumn / LoadType::kElements;
-    }
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
-
-          int row_ptr_offset =
-            row * ThreadMap::Delta::kRow * stride_ + 
-            group * ThreadMap::Delta::kGroup* stride_ + 
-            cluster * ThreadMap::Delta::kCluster * stride_ +
-            pointer_offset / LoadType::kElements;
-
-          int frag_row_idx = (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
-
-          LoadType *frag_ptr = reinterpret_cast<LoadType *>(&frag);
-
-          CUTLASS_PRAGMA_UNROLL
-          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
-            
-            int frag_idx = frag_row_idx * ThreadMap::Iterations::kColumn + column;
-
-            CUTLASS_PRAGMA_UNROLL
-            for (int v = 0; v < kLoadsPerAccess; ++v) {
-           
-              LoadType const *memory_pointer = pointers_[v];
-            
-              frag_ptr[frag_idx * kLoadsPerAccess + v] = memory_pointer[row_ptr_offset];
-            }
-          }
-        }
-      }
-    }
-  }
-
-  /// Set base smem address
-  CUTLASS_DEVICE
-  void set_smem_base_address(Index address) {}
-
-  /// Loads a fragment
-  CUTLASS_DEVICE
-  void load(Fragment &frag) {
-
-    load_with_pointer_offset(frag, 0);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for:
-///   int32_t x 8 => int8_t/int4b_t x 8 and
-///   float x 8 => float_e4m3_t/float_e5m2_t x 8
-template <
-  typename ThreadMap_,      ///< Thread map (concept: OutputTileThreadMap)
-  typename Element_,
-  int OutputSizeBits_
->
-class SharedLoadIteratorMixed<ThreadMap_, Element_, 32, OutputSizeBits_, 8, 8, true> {
-public:
-  using ThreadMap = ThreadMap_;
-  using Shape = typename ThreadMap::Shape;
-
-  using Element = Element_;
-  static_assert(sizeof_bits<Element>::value == 32, "Element size in bits must be 32.");
-
-  using Layout = layout::RowMajor;
-  using TensorRef = TensorRef<Element, Layout>;
-  using ConstTensorRef = typename TensorRef::ConstTensorRef;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  using TensorCoord = MatrixCoord;
-
-  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
-
-  static int const kAlignment = 8;
-
-  static int const kThreads = ThreadMap::kThreads;
-
-  /// Fragment object
-  using Fragment = Array<
-    Element, 
-    ThreadMap::Iterations::kColumn * 
-    ThreadMap::Iterations::kRow * 
-    ThreadMap::Iterations::kGroup * 
-    ThreadMap::Iterations::kCluster * 
-    ThreadMap::kElementsPerAccess>;
-
-  /// Memory access size
-  using AccessType = AlignedArray<
-    Element, 
-    8, 
-    kAlignment>;
-
-  /// Vector type used for SMEM loads
-  using LoadType = AlignedArray<
-    Element,
-    4,
-    16
-  >;
-
-  static int const kLoadsPerAccess = 2;
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Byte-level pointer
-  LoadType const *pointers_[kLoadsPerAccess];
-
-  /// Stride along adjacent rows in units of LoadType
-  int stride_;
-
-public:
-
-  //
-  // Methods
-  //
-
-  /// Constructor
-  CUTLASS_DEVICE
-  SharedLoadIteratorMixed(
-    TensorRef ref,
-    int thread_idx
-  ):
-    stride_((ref.stride(0) / LoadType::kElements)) {
-
-    TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx);
-    
-    // Initialize pointers
-    LoadType const *base_ptr = reinterpret_cast<LoadType const *>(ref.data()) + thread_offset.row() * stride_;
-      
-    int lane_col_idx = thread_offset.column() / 8;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kLoadsPerAccess; ++i) {
-      int lane_offset = (lane_col_idx % 8) * 2 | ((lane_col_idx / 4) ^ i);
-
-      pointers_[i] = base_ptr + lane_offset;
-    }
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kLoadsPerAccess; ++i) {
-      pointers_[i] += pointer_offset / LoadType::kElements;
-    }
-  }
-
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &offset) {
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kLoadsPerAccess; ++i) {
-      pointers_[i] += 
-        offset.row() * Shape::kRow * stride_ + 
-        offset.column() * Shape::kColumn / LoadType::kElements;
-    }
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
-
-          int row_ptr_offset =
-            row * ThreadMap::Delta::kRow * stride_ + 
-            group * ThreadMap::Delta::kGroup* stride_ + 
-            cluster * ThreadMap::Delta::kCluster * stride_ +
-            pointer_offset / LoadType::kElements;
-
-          int frag_row_idx = (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
-
-          LoadType *frag_ptr = reinterpret_cast<LoadType *>(&frag);
-
-          CUTLASS_PRAGMA_UNROLL
-          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
-            
-            int frag_idx = frag_row_idx * ThreadMap::Iterations::kColumn + column;
-
-            CUTLASS_PRAGMA_UNROLL
-            for (int v = 0; v < kLoadsPerAccess; ++v) {
-           
-              LoadType const *memory_pointer = pointers_[v];
-            
-              frag_ptr[frag_idx * kLoadsPerAccess + v] = memory_pointer[row_ptr_offset];
-            }
-          }
-        }
-      }
-    }
-  }
-
-  /// Set base smem address
-  CUTLASS_DEVICE
-  void set_smem_base_address(Index address) {}
-
-  /// Loads a fragment
-  CUTLASS_DEVICE
-  void load(Fragment &frag) {
-
-    load_with_pointer_offset(frag, 0);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/shared_load_iterator_pitch_linear.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/shared_load_iterator_pitch_linear.h
deleted file mode 100755
index 5af6997ed..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/shared_load_iterator_pitch_linear.h
+++ /dev/null
@@ -1,194 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
-
-  This assumes the shared memory tile is in a permuted layout which avoids bank conflicts on loading.
-  
-  When the fragment is loaded into registers, it matches the row-major thread map assumed by
-  the predicated tile iterator writing to global memory.
-
-  The epilogue rearranges the result of a matrix product through shared memory to match canonical
-  tensor layouts in global memory. Epilogues support conversion and reduction operations.
-*/
-
-#pragma once
-
-#include "cutlass/array.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/epilogue/threadblock/output_tile_thread_map.h"
-#include "cutlass/transform/pitch_linear_thread_map.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/tensor_ref.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Tile iterator used to load output tile from shared memory in epilogue.
-///
-/// Satisfies: ReadableTileIterator
-///
-template <typename ThreadMap_,  ///< Thread map (conept: PitchLinearThreadMap)
-          typename Element_,    ///< Element data type
-          int MaxAlignment = ThreadMap_::kElementsPerAccess *sizeof_bits<Element_>::value / 8>
-class SharedLoadIteratorPitchLinear {
- public:
-  using ThreadMap = ThreadMap_;
-  using Element = Element_;
-
-  using Layout = layout::RowMajor;
-  using TensorRef = TensorRef<Element, Layout>;
-  using ConstTensorRef = typename TensorRef::ConstTensorRef;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  using TensorCoord = MatrixCoord;
-
-  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
-
-  static int const kMinAlignment =
-      ThreadMap_::kElementsPerAccess * sizeof_bits<Element_>::value / 8;
-
-  static int const kAlignment = (MaxAlignment < kMinAlignment ? MaxAlignment : kMinAlignment);
-
-  static int const kThreads = ThreadMap::kThreads;
-
-  /// Fragment object
-  using Fragment = Array<Element, ThreadMap::Iterations::kCount * kElementsPerAccess>;
-
-  /// Memory access size
-  using AccessType = AlignedArray<Element, kElementsPerAccess, kAlignment>;
-
-  /// Vector type used for SMEM loads
-  using LoadType =
-      AlignedArray<Element,
-                   const_min(128 / sizeof_bits<Element>::value, ThreadMap::kElementsPerAccess),
-                   const_min(16, kAlignment)>;
-
-  static int const kLoadsPerAccess = AccessType::kElements / LoadType::kElements;
-
- private:
-  //
-  // Data members
-  //
-
-  /// Byte-level pointer
-  uint8_t *byte_pointer_;
-
-  /// Stride along adjacent rows
-  int stride_;
-
-  /// Base address offset
-  Index base_smem_address_;
-
- public:
-  //
-  // Methods
-  //
-
-  /// Constructor
-  CUTLASS_DEVICE
-  SharedLoadIteratorPitchLinear(TensorRef ref, int thread_idx)
-      : byte_pointer_(reinterpret_cast<uint8_t *>(ref.data())),
-        stride_((ref.stride(0) * sizeof_bits<Element>::value) / 8),
-        base_smem_address_(0) {
-    TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx);
-
-    // Initialize pointer
-    // thread_offset.row() is contiguous dim
-    // thread_offset.column() is stride dim
-    byte_pointer_ += thread_offset.row() * sizeof(AccessType) / kElementsPerAccess+
-                     thread_offset.column() * stride_ ;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    byte_pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
-  }
-
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &offset) {
-    byte_pointer_ +=
-        offset.row() * ThreadMap::StorageShape::kContiguous * sizeof(AccessType) / kElementsPerAccess +
-        offset.column() * ThreadMap::StorageShape::kStrided * stride_;
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-        uint8_t const *byte_pointer =
-            byte_pointer_ + s * ThreadMap::Delta::kStrided * stride_ +
-            c * ThreadMap::Delta::kContiguous * ThreadMap::kElementsPerAccess *
-                sizeof_bits<Element>::value / 8 +
-            pointer_offset * sizeof_bits<Element>::value / 8 + base_smem_address_;
-
-        int frag_base_idx = s * ThreadMap::Iterations::kContiguous + c;
-
-        LoadType *frag_ptr = reinterpret_cast<LoadType *>(&frag);
-
-        LoadType const *memory_pointer = reinterpret_cast<LoadType const *>(byte_pointer);
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < kLoadsPerAccess; ++v) {
-          frag_ptr[frag_base_idx * kLoadsPerAccess + v] = memory_pointer[v];
-        }
-      }
-    }
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void set_smem_base_address(Index address) { base_smem_address_ = address; }
-
-  /// Loads a fragment
-  CUTLASS_DEVICE
-  void load(Fragment &frag) const { load_with_pointer_offset(frag, 0); }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace epilogue
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h
deleted file mode 100755
index 84a096c65..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h
+++ /dev/null
@@ -1,187 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief This defines a "fragment" iterator for visiting the fragments of an accumulator tile
-      that participate in one warp-level store operation.
-
-      Typically, the accumulator tile is the largest single block of register-backed storage 
-      within the kernel. Storing it to memory is best accomplished by partitioning it into
-      smaller tiles and storing these sequentially.
-
-      Round trips through shared memory during the Epilogue phase require partitioning, as
-      shared memory capacity is typically insufficient for a threadblock's total accumulator
-      size.
-*/
-
-#pragma once
-
-#include "cutlass/array.h"
-#include "cutlass/layout/matrix.h"
-
-#include "cutlass/epilogue/warp/tensor_op_policy.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace warp {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// 
-template <
-  typename WarpShape,         ///< shape of warp-level GEMM (concept: MatrixShape)
-  typename OperatorShape,     ///< matrix multiply operation shape (concept: gemm::GemmShape)
-  typename OperatorElementC,  ///< matrix multiply operation data type (concept: data type)
-  typename OperatorFragmentC, ///< matrix multiply operation fragment (concept: Array)
-  typename Layout             ///< target shared memory layout
->
-class FragmentIteratorComplexTensorOp;
-
-////////////////////////////////////////////////////////////////////////////////
-
-
-/// Partial specialization for row-major shared memory
-template <
-  typename WarpShape_,         ///< shape of the warp-level GEMM tile
-  typename OperatorShape_,     ///< underlying real-valued matrix multiply operation shape (concept: gemm::GemmShape)
-  typename OperatorElementC_,  ///< underlying real-valued matrix multiply operation data type
-  typename OperatorFragmentC_  ///< underlying real-valued matrix multiply operation fragment (concept: Array)
->
-class FragmentIteratorComplexTensorOp<WarpShape_, OperatorShape_, OperatorElementC_, OperatorFragmentC_, layout::RowMajor> {
-public:
-
-  using WarpShape = WarpShape_;
-  using OperatorShape = OperatorShape_;
-  using OperatorElementC = OperatorElementC_;
-  using OperatorFragmentC = OperatorFragmentC_;
-  using Layout = layout::RowMajor;
-
-  using Policy = TensorOpPolicy<WarpShape, OperatorShape, Layout>;
-
-  /// This is the fragment size produced by one access of the iterator.
-  using Fragment = Array<
-    complex<OperatorElementC>, 
-    Policy::OperatorCount::kColumn * Policy::kElementsPerAccess>;
-
-  static int const kRealIndex = 0;
-
-  /// Offset into the accumulator fragment
-  static int const kImaginaryIndex = 
-    OperatorFragmentC::kElements * Policy::OperatorCount::kRow * Policy::OperatorCount::kColumn;
-
-  /// This is the complete warp-level accumulator tile.
-  using AccumulatorTile = Array<OperatorElementC, 2 * kImaginaryIndex>;
-
-  /// This is the complete warp-level accumulator tile.
-  using OutputAccumulatorTile = Array<complex<OperatorElementC>, kImaginaryIndex>;
-
-  /// Number of times this iterator can be incremented
-  static int const kIterations = Policy::kIterations;
-
-private:
-
-  /// Internal access type
-  using AccessType = Array<OperatorElementC, Policy::kElementsPerAccess>;
-
-  using FragmentAccessType = Array<complex<OperatorElementC>, Policy::kElementsPerAccess>;
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Accumulator tile
-  AccessType const *accumulators_;
-
-  /// Internal index
-  int index_;
-
-public:
-
-  /// Constructs an iterator
-  CUTLASS_HOST_DEVICE
-  FragmentIteratorComplexTensorOp(AccumulatorTile const &accum): 
-    accumulators_(reinterpret_cast<AccessType const *>(&accum)), 
-    index_(0) {
-
-  }
-
-  /// Increments
-  CUTLASS_HOST_DEVICE
-  FragmentIteratorComplexTensorOp &operator++() {
-    ++index_;
-    return *this;
-  }
-
-  /// Decrements
-  CUTLASS_HOST_DEVICE
-  FragmentIteratorComplexTensorOp &operator--() {
-    --index_;
-    return *this;
-  }
-
-  /// Loads a fragment from the referenced part of the accumulator tile
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag, int index_offset = 0) const {
-
-    int index = index_ + index_offset;
-
-    FragmentAccessType *frag_ptr = reinterpret_cast<FragmentAccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < Policy::OperatorCount::kColumn; ++n) {
-
-      int accumulator_access_offset = 
-        index + n * Policy::kAccumulatorColumnStride / Policy::kElementsPerAccess;
-
-      auto const & real_accum_array = accumulators_[accumulator_access_offset + kRealIndex];
-      auto const & imag_accum_array = accumulators_[accumulator_access_offset + kImaginaryIndex / Policy::kElementsPerAccess];
-
-      // Pack real and imaginary parts into a structure. This is likely to result in MOVs
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < Policy::kElementsPerAccess; ++i) {
-
-        frag_ptr[n][i].real() = real_accum_array[i];
-        frag_ptr[n][i].imag() = imag_accum_array[i]; 
-      }
-    }
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace epilogue
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h
deleted file mode 100755
index 13b00762e..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h
+++ /dev/null
@@ -1,194 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief This defines a "fragment" iterator for visiting the fragments of an accumulator tile
-      that participate in one warp-level store operation.
-
-      Typically, the accumulator tile is the largest single block of register-backed storage 
-      within the kernel. Storing it to memory is best accomplished by partitioning it into
-      smaller tiles and storing these sequentially.
-
-      Round trips through shared memory during the Epilogue phase require partitioning, as
-      shared memory capacity is typically insufficient for a threadblock's total accumulator
-      size.
-*/
-
-#pragma once
-
-#include "cutlass/array.h"
-#include "cutlass/layout/matrix.h"
-
-#include "cutlass/epilogue/warp/tensor_op_policy.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace warp {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// 
-template <
-  typename WarpShape,         ///< shape of warp-level GEMM (concept: MatrixShape)
-  typename OperatorShape,     ///< matrix multiply operation shape (concept: gemm::GemmShape)
-  typename OperatorElementC,  ///< matrix multiply operation data type (concept: data type)
-  typename OperatorFragmentC, ///< matrix multiply operation fragment (concept: Array)
-  typename Layout             ///< target shared memory layout
->
-class FragmentIteratorGaussianComplexTensorOp;
-
-////////////////////////////////////////////////////////////////////////////////
-
-
-/// Partial specialization for row-major shared memory
-template <
-  typename WarpShape_,         ///< shape of the warp-level GEMM tile
-  typename OperatorShape_,     ///< underlying real-valued matrix multiply operation shape (concept: gemm::GemmShape)
-  typename OperatorElementC_,  ///< underlying real-valued matrix multiply operation data type
-  typename OperatorFragmentC_  ///< underlying real-valued matrix multiply operation fragment (concept: Array)
->
-class FragmentIteratorGaussianComplexTensorOp<WarpShape_, OperatorShape_, OperatorElementC_, OperatorFragmentC_, layout::RowMajor> {
-public:
-
-  using WarpShape = WarpShape_;
-  using OperatorShape = OperatorShape_;
-  using OperatorElementC = OperatorElementC_;
-  using OperatorFragmentC = OperatorFragmentC_;
-  using Layout = layout::RowMajor;
-
-  using Policy = TensorOpPolicy<WarpShape, OperatorShape, Layout>;
-
-  /// This is the fragment size produced by one access of the iterator.
-  using Fragment = Array<
-    complex<OperatorElementC>, 
-    Policy::OperatorCount::kColumn * Policy::kElementsPerAccess>;
-
-  /// Size of one part of accumulator of 3-part accumulator in units of number of OperatorElementC
-  static int const kElementsAccumulatorPerPart = 
-    OperatorFragmentC::kElements * Policy::OperatorCount::kRow * Policy::OperatorCount::kColumn;
-
-  /// Offset into the accumulator fragment part 1
-  static int const kPart1Index = kElementsAccumulatorPerPart * 0;
-
-  /// Offset into the accumulator fragment part 2
-  static int const kPart2Index = kElementsAccumulatorPerPart * 1;
-
-  /// Offset into the accumulator fragment part 3
-  static int const kPart3Index = kElementsAccumulatorPerPart * 2;
-
-  /// This is the complete warp-level accumulator tile holding part1, part2, and part3
-  using AccumulatorTile = Array<OperatorElementC, kElementsAccumulatorPerPart * 3>;
-
-  /// This is the complete warp-level accumulator tile holding final output of complex<T> type 
-  using OutputAccumulatorTile = Array<complex<OperatorElementC>, kElementsAccumulatorPerPart>;
-
-  /// Number of times this iterator can be incremented
-  static int const kIterations = Policy::kIterations;
-
-private:
-
-  /// Internal access type
-  using AccessType = Array<OperatorElementC, Policy::kElementsPerAccess>;
-
-  using FragmentAccessType = Array<complex<OperatorElementC>, Policy::kElementsPerAccess>;
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Accumulator tile
-  AccessType const *accumulators_;
-
-  /// Internal index
-  int index_;
-
-public:
-
-  /// Constructs an iterator
-  CUTLASS_HOST_DEVICE
-  FragmentIteratorGaussianComplexTensorOp(AccumulatorTile const &accum): 
-    accumulators_(reinterpret_cast<AccessType const *>(&accum)), 
-    index_(0) {
-  }
-
-  /// Increments
-  CUTLASS_HOST_DEVICE
-  FragmentIteratorGaussianComplexTensorOp &operator++() {
-    ++index_;
-    return *this;
-  }
-
-  /// Decrements
-  CUTLASS_HOST_DEVICE
-  FragmentIteratorGaussianComplexTensorOp &operator--() {
-    --index_;
-    return *this;
-  }
-
-  /// Loads a fragment from the referenced part of the accumulator tile
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag, int index_offset = 0) const {
-
-    int index = index_ + index_offset;
-
-    FragmentAccessType *frag_ptr = reinterpret_cast<FragmentAccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < Policy::OperatorCount::kColumn; ++n) {
-
-      int accumulator_access_offset = 
-        index + n * Policy::kAccumulatorColumnStride / Policy::kElementsPerAccess;
-
-      auto const & part1_accum_array = accumulators_[accumulator_access_offset + kPart1Index];
-      auto const & part2_accum_array = accumulators_[accumulator_access_offset + kPart2Index / Policy::kElementsPerAccess];
-      auto const & part3_accum_array = accumulators_[accumulator_access_offset + kPart3Index / Policy::kElementsPerAccess];
-
-      // Pack parts 1, 2, and 3 into a structure. This is likely to result in MOVs
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < Policy::kElementsPerAccess; ++i) {
-
-        frag_ptr[n][i].real() = part1_accum_array[i] - part3_accum_array[i];
-        frag_ptr[n][i].imag() = part1_accum_array[i] + part2_accum_array[i]; 
-      }
-    }
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace epilogue
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_simt.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_simt.h
deleted file mode 100755
index 92d3bf582..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_simt.h
+++ /dev/null
@@ -1,164 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief This defines a "fragment" iterator for visiting the fragments of an accumulator tile
-      that participate in one warp-level store operation.
-
-      Typically, the accumulator tile is the largest single block of register-backed storage 
-      within the kernel. Storing it to memory is best accomplished by partitioning it into
-      smaller tiles and storing these sequentially.
-
-      Round trips through shared memory during the Epilogue phase require partitioning, as
-      shared memory capacity is typically insufficient for a threadblock's total accumulator
-      size.
-*/
-
-#pragma once
-
-#include "cutlass/array.h"
-#include "cutlass/layout/matrix.h"
-
-#include "cutlass/epilogue/warp/simt_policy.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace warp {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Fragment iterator for SIMT accumulator arrangements
-template <
-  typename WarpShape,             ///< shape of warp-level GEMM (concept: MatrixShape)
-  typename Operator,              ///< matrix multiply operation (concept: arch::Mma)
-  typename Layout,                ///< target shared memory layout
-  typename MmaSimtPolicy          ///< policy defining lane arrangement (concept: MmaSimtPolicy)
->
-class FragmentIteratorSimt;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for row-major shared memory
-template <
-  typename WarpShape_,     ///< shape of the warp-level GEMM tile
-  typename Operator_ ,     ///< matrix multiply operator (concept: arch::Mma)
-  typename MmaSimtPolicy_  ///< policy defining lane arrangement (concept: MmaSimtPolicy)
->
-class FragmentIteratorSimt<WarpShape_, Operator_, layout::RowMajor, MmaSimtPolicy_> {
-public:
-
-  using WarpShape = WarpShape_;
-  using Operator = Operator_;
-  using Layout = layout::RowMajor;
-
-  /// Policy for warp-level epilogue components
-  using Policy = SimtPolicy<WarpShape, Operator, Layout, MmaSimtPolicy_>;
-
-  /// This is the fragment size produced by one access of the iterator.
-  using Fragment = Array<
-    typename Operator::ElementC, 
-    Policy::kElementsPerIteration>;
-
-  /// This is the complete warp-level accumulator tile.
-  using AccumulatorTile = Array<
-    typename Operator::ElementC, 
-    Policy::kAccumulatorElementCount>;
-
-  using OutputAccumulatorTile = AccumulatorTile;
-
-  /// Number of times this iterator can be incremented
-  static int const kIterations = Policy::kIterations;
-
-private:
-
-  /// Internal access type
-  using AccessType = Array<typename Operator::ElementC, Policy::kElementsPerAccess>;
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Accumulator tile
-  AccessType const *accumulators_;
-
-  /// Internal index
-  int index_;
-
-public:
-
-  /// Constructs an iterator
-  CUTLASS_HOST_DEVICE
-  FragmentIteratorSimt(AccumulatorTile const &accum): 
-    accumulators_(reinterpret_cast<AccessType const *>(&accum)), 
-    index_(0) {
-
-  }
-
-  /// Increments
-  CUTLASS_HOST_DEVICE
-  FragmentIteratorSimt &operator++() {
-    ++index_;
-    return *this;
-  }
-
-  /// Decrements
-  CUTLASS_HOST_DEVICE
-  FragmentIteratorSimt &operator--() {
-    --index_;
-    return *this;
-  }
-
-  /// Loads a fragment from the referenced part of the accumulator tile
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag, int index_offset = 0) const {
-
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < Policy::kAccessesPerIteration; ++n) {
-
-      int accumulator_access_offset = index_ * Policy::kAccessesPerIteration + n;
-
-      frag_ptr[n] = accumulators_[accumulator_access_offset];
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_tensor_op.h
deleted file mode 100755
index a69f0fd25..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_tensor_op.h
+++ /dev/null
@@ -1,378 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief This defines a "fragment" iterator for visiting the fragments of an accumulator tile
-      that participate in one warp-level store operation.
-
-      Typically, the accumulator tile is the largest single block of register-backed storage 
-      within the kernel. Storing it to memory is best accomplished by partitioning it into
-      smaller tiles and storing these sequentially.
-
-      Round trips through shared memory during the Epilogue phase require partitioning, as
-      shared memory capacity is typically insufficient for a threadblock's total accumulator
-      size.
-*/
-
-#pragma once
-
-#include "cutlass/array.h"
-#include "cutlass/layout/matrix.h"
-
-#include "cutlass/epilogue/warp/tensor_op_policy.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace warp {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// 
-template <
-  typename WarpShape,         ///< shape of warp-level GEMM (concept: MatrixShape)
-  typename OperatorShape,     ///< matrix multiply operation shape (concept: gemm::GemmShape)
-  typename OperatorElementC,  ///< matrix multiply operation data type (concept: data type)
-  typename OperatorFragmentC, ///< matrix multiply operation fragment (concept: Array)
-  typename Layout             ///< target shared memory layout
->
-class FragmentIteratorTensorOp;
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for row-major shared memory
-template <
-  typename WarpShape_,         ///< shape of the warp-level GEMM tile
-  typename OperatorShape_,     ///< matrix multiply operation shape (concept: gemm::GemmShape)
-  typename OperatorElementC_,  ///< matrix multiply operation data type (concept: data type)
-  typename OperatorFragmentC_  ///< matrix multiply operation fragment (concept: Array)
->
-class FragmentIteratorTensorOp<WarpShape_, OperatorShape_, OperatorElementC_, OperatorFragmentC_, layout::RowMajor> {
-public:
-
-  using WarpShape = WarpShape_;
-  using OperatorShape = OperatorShape_;
-  using OperatorElementC = OperatorElementC_;
-  using OperatorFragmentC = OperatorFragmentC_;
-  using Layout = layout::RowMajor;
-
-  using Policy = TensorOpPolicy<WarpShape, OperatorShape, Layout>;
-
-  /// This is the fragment size produced by one access of the iterator.
-  using Fragment = Array<
-    OperatorElementC, 
-    Policy::OperatorCount::kColumn * Policy::kElementsPerAccess>;
-
-  /// This is the complete warp-level accumulator tile.
-  using AccumulatorTile = Array<
-    OperatorElementC, 
-    OperatorFragmentC::kElements * Policy::OperatorCount::kRow * Policy::OperatorCount::kColumn>;
-
-  using OutputAccumulatorTile = AccumulatorTile;
-
-  /// Number of times this iterator can be incremented
-  static int const kIterations = Policy::kIterations;
-  using TileIterations = typename Policy::TileIterations;
-  static int const kIterationsPerTile = kIterations / TileIterations::kCount;
-
-private:
-
-  /// Internal access type
-  using AccessType = Array<OperatorElementC, Policy::kElementsPerAccess>;
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Accumulator tile
-  AccessType const *accumulators_;
-
-  /// Internal index
-  int index_;
-
-public:
-
-  /// Constructs an iterator
-  CUTLASS_HOST_DEVICE
-  FragmentIteratorTensorOp(AccumulatorTile const &accum): 
-    accumulators_(reinterpret_cast<AccessType const *>(&accum)), 
-    index_(0) {
-  }
-
-  /// Increments
-  CUTLASS_HOST_DEVICE
-  FragmentIteratorTensorOp &operator++() {
-    ++index_;
-    return *this;
-  }
-
-  /// Decrements
-  CUTLASS_HOST_DEVICE
-  FragmentIteratorTensorOp &operator--() {
-    --index_;
-    return *this;
-  }
-
-  /// Loads a fragment from the referenced part of the accumulator tile
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag, int index_offset = 0) const {
-
-    int index = index_ + index_offset;
-
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < Policy::OperatorCount::kColumn; ++n) {
-
-      int accumulator_access_offset = 
-        index + n * Policy::kAccumulatorColumnStride / Policy::kElementsPerAccess;
-
-      frag_ptr[n] = accumulators_[accumulator_access_offset];
-    }
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for col-major shared memory
-/// Only works for 168x tensor core kernels
-template <
-  typename WarpShape_,         ///< shape of the warp-level GEMM tile
-  typename OperatorShape_,     ///< matrix multiply operation shape (concept: gemm::GemmShape)
-  typename OperatorElementC_,  ///< matrix multiply operation data type (concept: data type)
-  typename OperatorFragmentC_  ///< matrix multiply operation fragment (concept: Array)
->
-class FragmentIteratorTensorOp<WarpShape_, OperatorShape_, OperatorElementC_, OperatorFragmentC_, layout::ColumnMajor> {
-public:
-
-  using WarpShape = WarpShape_;
-  using OperatorShape = OperatorShape_;
-  using OperatorElementC = OperatorElementC_;
-  using OperatorFragmentC = OperatorFragmentC_;
-  using Layout = layout::ColumnMajor;
-
-  using Policy = TensorOpPolicy<WarpShape, OperatorShape, Layout>;
-
-  /// This is the fragment size produced by one access of the iterator.
-  using Fragment = Array<
-    OperatorElementC, 
-    4 * Policy::OperatorCount::kRow * Policy::kElementsPerAccess>;
-
-  /// This is the complete warp-level accumulator tile.
-  using AccumulatorTile = Array<
-    OperatorElementC, 
-    OperatorFragmentC::kElements * Policy::OperatorCount::kRow * Policy::OperatorCount::kColumn>;
-
-  using OutputAccumulatorTile = AccumulatorTile;
-
-  /// Number of times this iterator can be incremented
-  static int const kIterations = Policy::kIterations;
-  using TileIterations = typename Policy::TileIterations;
-  static int const kIterationsPerTile = kIterations / TileIterations::kCount;
-
-private:
-
-  /// Internal access type
-  using AccessType = Array<OperatorElementC, Policy::kElementsPerAccess>;
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Accumulator tile
-  AccessType const *accumulators_;
-
-  /// Internal index
-  int index_;
-
-public:
-
-  /// Constructs an iterator
-  CUTLASS_HOST_DEVICE
-  FragmentIteratorTensorOp(AccumulatorTile const &accum): 
-    accumulators_(reinterpret_cast<AccessType const *>(&accum)), 
-    index_(0) {
-  }
-
-  /// Increments
-  CUTLASS_HOST_DEVICE
-  FragmentIteratorTensorOp &operator++() {
-    ++index_;
-    return *this;
-  }
-
-  /// Decrements
-  CUTLASS_HOST_DEVICE
-  FragmentIteratorTensorOp &operator--() {
-    --index_;
-    return *this;
-  }
-
-  /// Loads a fragment from the referenced part of the accumulator tile
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag, int index_offset = 0) const {
-
-    int index = index_ + index_offset;
-
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < Policy::kAccumulatorRowStride; ++i) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int m = 0; m < (Policy::OperatorCount::kRow * 2); ++m) {
-
-        int accumulator_access_offset = 
-          index * Policy::kAccumulatorColumnStride + m * Policy::kAccumulatorRowStride / Policy::kElementsPerAccess + i;
-
-        frag_ptr[m + i * Policy::OperatorCount::kRow * 2] = accumulators_[accumulator_access_offset];
-      }
-    }
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Dedicated to interleaved layout
-template <
-    /// shape of the warp-level GEMM tile
-    typename WarpShape_,
-    /// matrix multiply operator shape (concept: gemm::GemmShape)
-    typename OperatorShape_,
-    /// matrix multiply operator data type (concept: data type)
-    typename OperatorElementC_,
-    /// matrix multiply operator fragment (concept: Array)
-    typename OperatorFragmentC_,
-    /// number of interleaved k
-    int InterleavedK>
-class FragmentIteratorTensorOp<WarpShape_, OperatorShape_, OperatorElementC_, OperatorFragmentC_,
-                               layout::ColumnMajorInterleaved<InterleavedK>> {
- public:
-  using WarpShape = WarpShape_;
-  using OperatorShape = OperatorShape_;
-  using OperatorElementC = OperatorElementC_;
-  using OperatorFragmentC = OperatorFragmentC_;
-  static int const kInterleavedK = InterleavedK;
-  using Layout = layout::ColumnMajorInterleaved<kInterleavedK>;
-
-  using Policy = TensorOpPolicy<WarpShape, OperatorShape, Layout>;
-
-  /// This is the fragment size produced by one access of the iterator.
-  using Fragment =
-      Array<OperatorElementC,
-            Policy::kElementsPerAccess * InterleavedK / OperatorShape::kN>;
-
-  /// This is the complete warp-level accumulator tile.
-  using AccumulatorTile =
-      Array<OperatorElementC, OperatorFragmentC::kElements *
-                                  Policy::OperatorCount::kRow *
-                                  Policy::OperatorCount::kColumn>;
-
-  /// Number of times this iterator can be incremented
-  static int const kIterations = Policy::kIterations;
-  using TileIterations = typename Policy::TileIterations;
-  static int const kIterationsPerTile = kIterations / TileIterations::kCount;
-
- private:
-  /// Internal access type
-  using AccessType =
-      Array<OperatorElementC, Policy::kElementsPerAccess>;
-
- private:
-  //
-  // Data members
-  //
-
-  /// Accumulator tile
-  AccessType const *accumulators_;
-
-  /// Internal index
-  int index_;
-
- public:
-  /// Constructs an iterator
-  CUTLASS_HOST_DEVICE
-  FragmentIteratorTensorOp(AccumulatorTile const &accum)
-      : accumulators_(reinterpret_cast<AccessType const *>(&accum)),
-        index_(0) {}
-
-  /// Increments
-  CUTLASS_HOST_DEVICE
-  FragmentIteratorTensorOp &operator++() {
-    ++index_;
-    return *this;
-  }
-
-  /// Decrements
-  CUTLASS_HOST_DEVICE
-  FragmentIteratorTensorOp &operator--() {
-    --index_;
-    return *this;
-  }
-
-  /// Loads a fragment from the referenced part of the accumulator tile
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag, int index_offset = 0) const {
-    int index = index_ + index_offset;
-
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < (InterleavedK / OperatorShape::kN); ++n) {
-      int index_m = index % (Policy::OperatorCount::kRow *
-                             Policy::kIterationsPerInstruction);
-      int index_n = index / (Policy::OperatorCount::kRow *
-                             Policy::kIterationsPerInstruction);
-      int accumulator_access_offset =
-          (index_m / Policy::kIterationsPerInstruction) *
-              (Policy::OperatorCount::kColumn *
-               Policy::kIterationsPerInstruction) +
-          (index_m % Policy::kIterationsPerInstruction) +
-          index_n * (InterleavedK / OperatorShape::kN) *
-              Policy::kIterationsPerInstruction +
-          n * Policy::kIterationsPerInstruction;
-
-      frag_ptr[n] = accumulators_[accumulator_access_offset];
-    }
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace epilogue
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_volta_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_volta_tensor_op.h
deleted file mode 100755
index 4979a3803..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_volta_tensor_op.h
+++ /dev/null
@@ -1,269 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief This defines a "fragment" iterator for visiting the fragments of an accumulator tile
-      that participate in one warp-level store operation.
-
-      Typically, the accumulator tile is the largest single block of register-backed storage 
-      within the kernel. Storing it to memory is best accomplished by partitioning it into
-      smaller tiles and storing these sequentially.
-
-      Round trips through shared memory during the Epilogue phase require partitioning, as
-      shared memory capacity is typically insufficient for a threadblock's total accumulator
-      size.
-*/
-
-#pragma once
-
-#include "cutlass/array.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/gemm/gemm.h"
-
-#include "cutlass/epilogue/warp/volta_tensor_op_policy.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace warp {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// 
-template <
-  typename WarpShape,             ///< shape of warp-level GEMM (concept: MatrixShape)
-  typename InterleavedTileShape,  ///< shape of indivisible instruction-level arrangement (concept: GemmShape)
-  typename ElementC,              ///< Accumulator layout
-  typename Layout                 ///< target shared memory layout
->
-class FragmentIteratorVoltaTensorOp;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for row-major shared memory
-template <
-  typename WarpShape_         ///< shape of warp-level GEMM (concept: MatrixShape)
->
-class FragmentIteratorVoltaTensorOp<WarpShape_, gemm::GemmShape<32, 32, 4>, half_t, layout::RowMajor> {
-public:
-
-  using WarpShape = WarpShape_;
-  using InterleavedTileShape = gemm::GemmShape<32, 32, 4>;
-  using ElementC = half_t;
-  using Layout = layout::RowMajor;
-
-  /// Policy operator
-  using Policy = VoltaTensorOpPolicy<WarpShape, InterleavedTileShape, ElementC, Layout>;
-
-  /// Array type for aligned memory accesses
-  using AccessType = typename Policy::AccessType;
-  
-  /// This is the fragment size produced by one access of the iterator.
-  using Fragment = typename Policy::Fragment;
-
-  /// This is the complete warp-level accumulator tile.
-  using AccumulatorTile = typename Policy::AccumulatorTile;
-
-  using OutputAccumulatorTile = AccumulatorTile;
-
-  /// Number of times this iterator can be incremented
-  static int const kIterations = Policy::kIterations;
-
-private:
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Accumulator tile
-  AccessType const *accumulators_;
-
-  /// Internal index
-  int index_;
-
-public:
-
-  /// Constructs an iterator
-  CUTLASS_HOST_DEVICE
-  FragmentIteratorVoltaTensorOp(AccumulatorTile const &accum): 
-    accumulators_(reinterpret_cast<AccessType const *>(&accum)), 
-    index_(0) {
-
-  }
-
-  /// Increments
-  CUTLASS_HOST_DEVICE
-  FragmentIteratorVoltaTensorOp &operator++() {
-    ++index_;
-    return *this;
-  }
-
-  /// Decrements
-  CUTLASS_HOST_DEVICE
-  FragmentIteratorVoltaTensorOp &operator--() {
-    --index_;
-    return *this;
-  }
-
-  /// Loads a fragment from the referenced part of the accumulator tile
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag, int index_offset = 0) const {
-
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    static int const kAccessesPerMma = Policy::kElementsPerMma / Policy::kElementsPerAccess;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int tile_n = 0; tile_n < Policy::TileIterations::kColumn; ++tile_n) {
-      
-      int tile_access_idx = 
-        (tile_n * Policy::TileIterations::kRow + (index_ & 2) / 2) * Policy::MmaIterations::kCount * kAccessesPerMma;
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn * kAccessesPerMma; ++mma_n) {
-
-        int mma_access_idx = ((mma_n & 1) * 2 + (index_ & 1)) * kAccessesPerMma + (mma_n & 2) / 2;
-
-        frag_ptr[tile_n * Policy::MmaIterations::kColumn * kAccessesPerMma +
-          mma_n] = accumulators_[tile_access_idx + mma_access_idx];
-      }
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for row-major shared memory
-template <
-  typename WarpShape_         ///< shape of warp-level GEMM (concept: MatrixShape)
->
-class FragmentIteratorVoltaTensorOp<WarpShape_, gemm::GemmShape<32, 32, 4>, float, layout::RowMajor> {
-public:
-
-  using WarpShape = WarpShape_;
-  using InterleavedTileShape = gemm::GemmShape<32, 32, 4>;
-  using ElementC = float;
-  using Layout = layout::RowMajor;
-
-  /// Policy operator
-  using Policy = VoltaTensorOpPolicy<WarpShape, InterleavedTileShape, ElementC, Layout>;
-
-  /// Array type for aligned memory accesses
-  using AccessType = typename Policy::AccessType;
-  
-  /// This is the fragment size produced by one access of the iterator.
-  using Fragment = typename Policy::Fragment;
-
-  /// This is the complete warp-level accumulator tile.
-  using AccumulatorTile = typename Policy::AccumulatorTile;
-
-  /// Number of times this iterator can be incremented
-  static int const kIterations = Policy::kIterations;
-
-private:
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Accumulator tile
-  AccessType const *accumulators_;
-
-  /// Internal index
-  int index_;
-
-public:
-
-  /// Constructs an iterator
-  CUTLASS_HOST_DEVICE
-  FragmentIteratorVoltaTensorOp(AccumulatorTile const &accum): 
-    accumulators_(reinterpret_cast<AccessType const *>(&accum)), 
-    index_(0) {
-  }
-
-  /// Increments
-  CUTLASS_HOST_DEVICE
-  FragmentIteratorVoltaTensorOp &operator++() {
-    ++index_;
-    return *this;
-  }
-
-  /// Decrements
-  CUTLASS_HOST_DEVICE
-  FragmentIteratorVoltaTensorOp &operator--() {
-    --index_;
-    return *this;
-  }
-
-  /// Loads a fragment from the referenced part of the accumulator tile
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag, int index_offset = 0) const {
-
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    int const kRegsPerMmaRow = 2;
-      
-    CUTLASS_PRAGMA_UNROLL
-    for (int reg_row = 0; reg_row < Policy::kRowsPerMmaTile; ++reg_row) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int tile_n = 0; tile_n < Policy::TileIterations::kColumn; ++tile_n) {
-    
-        CUTLASS_PRAGMA_UNROLL
-        for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn * 2; ++mma_n) {
-
-          int mma_idx = (index_ & 1) + (index_ & 2) * Policy::MmaIterations::kCount / 2 +
-            (tile_n * Policy::TileIterations::kRow) * Policy::MmaIterations::kCount + (mma_n & 1) * 2;
-
-          int reg_offset = reg_row * kRegsPerMmaRow + (mma_n & 2) * 2;
-          int reg_idx = mma_idx * Policy::kElementsPerMma + reg_offset;
-
-          *frag_ptr = accumulators_[reg_idx / Policy::kElementsPerAccess];
-          ++frag_ptr;
-        }
-      }
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-} // namespace warp
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h
deleted file mode 100755
index 955409f32..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h
+++ /dev/null
@@ -1,164 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief This defines a "fragment" iterator for visiting the fragments of an accumulator tile
-      that participate in one warp-level store operation.
-
-      Typically, the accumulator tile is the largest single block of register-backed storage 
-      within the kernel. Storing it to memory is best accomplished by partitioning it into
-      smaller tiles and storing these sequentially.
-
-      Round trips through shared memory during the Epilogue phase require partitioning, as
-      shared memory capacity is typically insufficient for a threadblock's total accumulator
-      size.
-*/
-
-#pragma once
-
-#if !(defined(__clang__) && defined(__CUDA__))
-
-#include "cutlass/wmma_array.h"
-#include "cutlass/layout/matrix.h"
-
-#include "cutlass/epilogue/warp/wmma_tensor_op_policy.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace warp {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// 
-template <
-  typename WarpShape,         ///< shape of warp-level GEMM (concept: MatrixShape)
-  typename OperatorShape,     ///< matrix multiply operation shape (concept: gemm::GemmShape)
-  typename OperatorElementC,  ///< matrix multiply operation data type (concept: data type)
-  typename OperatorFragmentC, ///< matrix multiply operation fragment (concept: nvcuda::cuda::fragment)
-  typename Layout             ///< target shared memory layout
->
-class FragmentIteratorWmmaTensorOp;
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for row-major shared memory
-template <
-  typename WarpShape_,         ///< shape of the warp-level GEMM tile
-  typename OperatorShape_,     ///< matrix multiply operation shape (concept: gemm::GemmShape)
-  typename OperatorElementC_,  ///< matrix multiply operation data type (concept: data type)
-  typename OperatorFragmentC_  ///< matrix multiply operation fragment (concept: nvcuda::cuda::fragment)
->
-class FragmentIteratorWmmaTensorOp<WarpShape_, OperatorShape_, OperatorElementC_, OperatorFragmentC_, layout::RowMajor> {
-public:
-
-  using WarpShape = WarpShape_;
-  using OperatorShape = OperatorShape_;
-  using OperatorElementC = OperatorElementC_;
-  using OperatorFragmentC = OperatorFragmentC_;
-  using Layout = layout::RowMajor;
-
-  using Policy = WmmaTensorOpPolicy<WarpShape, OperatorShape, Layout>;
-
-  /// This is the fragment size produced by one access of the iterator.
-  using Fragment = WmmaFragmentArray<OperatorFragmentC, Policy::OperatorCount::kColumn>;
-
-  /// This is the complete warp-level accumulator tile.
-  using AccumulatorTile = WmmaFragmentArray<OperatorFragmentC, Policy::OperatorCount::kCount>;
-
-  using OutputAccumulatorTile = AccumulatorTile;
-
-private:
-
-  /// Internal access type
-  using AccessType = WmmaFragmentArray<OperatorFragmentC, Policy::kWmmaFragmentsPerAccess>;
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Accumulator tile
-  AccessType const *accumulators_;
-
-  /// Internal index
-  int index_;
-
-public:
-
-  /// Constructs an iterator
-  CUTLASS_HOST_DEVICE
-  FragmentIteratorWmmaTensorOp(AccumulatorTile const &accum): 
-    accumulators_(reinterpret_cast<AccessType const *>(&accum)), 
-    index_(0) { 
-  }
-
-  /// Increments
-  CUTLASS_HOST_DEVICE
-  FragmentIteratorWmmaTensorOp &operator++() {
-    ++index_;
-    return *this;
-  }
-
-  /// Decrements
-  CUTLASS_HOST_DEVICE
-  FragmentIteratorWmmaTensorOp &operator--() {
-    --index_;
-    return *this;
-  }
-
-  /// Loads a fragment from the referenced part of the accumulator tile
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag, int index_offset = 0) const {
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for(int n=0; n < Policy::OperatorCount::kColumn; n++) {
-      
-      int accumulator_access_offset = index_ * Policy::OperatorCount::kColumn + n;
-
-      frag_ptr[n] = accumulators_[accumulator_access_offset];
-    }
-  }
-};
-
-
-} // namespace warp
-} // namespace epilogue
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
-
-#else
-#error (defined(__clang__) && defined(__CUDA__))
-#endif // !defined(__clang__)
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/simt_policy.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/simt_policy.h
deleted file mode 100755
index b30bf19d6..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/simt_policy.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines basic structures needed for implementing the warp-scoped phase of the epilogue.
-          These quantities assume a 'column-major' arrangement of SimtOp instructions, of which
-          a row-oriented slice is visible per iteration.
-*/
-
-#pragma once
-
-#include "cutlass/matrix_shape.h"
-#include "cutlass/layout/matrix.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace warp {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename WarpShape,            ///< shape of warp-level GEMM (concept: GemmShape)
-  typename Operator,             ///< matrix multiply operation (concept: arch::Mma)
-  typename Layout,               ///< destination layout in shared memory
-  typename MmaSimtPolicy         ///< policy defining lane arrangement (concept: MmaSimtPolicy)
->
-struct SimtPolicy;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for row-major
-template <
-  typename WarpShape_,           ///< shape of warp-level GEMM (concept: MatrixShape)
-  typename Operator_,            ///< matrix multiply operation (concept: arch::Mma)
-  typename MmaSimtPolicy_        ///< policy defining lane arrangement (concept: MmaSimtPolicy)
->
-struct SimtPolicy<WarpShape_, Operator_, layout::RowMajor, MmaSimtPolicy_> {
-
-  using WarpShape = WarpShape_;
-  using Operator = Operator_;
-  using MmaSimtPolicy = MmaSimtPolicy_;
-
-  static_assert(!(WarpShape::kM % MmaSimtPolicy::WarpShape::kRow), "Divisibility");
-  static_assert(!(WarpShape::kN % MmaSimtPolicy::WarpShape::kColumn), "Divisibility");
-
-  /// Number of iterations
-  static int const kIterations = WarpShape::kM / MmaSimtPolicy::WarpShape::kRow;
-
-  /// Number of accumulators written per iteration
-  static int const kElementsPerIteration = 
-    (WarpShape::kN / MmaSimtPolicy::WarpShape::kColumn);
-
-  /// Total number of accumulators
-  static int const kAccumulatorElementCount = kElementsPerIteration * kIterations;
-
-  /// Number of consecutive elements
-  static int const kElementsPerAccess = MmaSimtPolicy::LaneMmaShape::kN;
-
-  /// Number of rows per epilogue iteration
-  static int const kRowsPerIteration = MmaSimtPolicy::WarpShape::kRow;
-
-  /// Number of accesses made in one iteration
-  static int const kAccessesPerIteration = kElementsPerIteration / kElementsPerAccess;
-
-  /// Number of elements in between accumulator chunks of (LaneMmaShape::kM x LaneMmaShape::kN)
-  using Delta = MatrixShape<
-    MmaSimtPolicy::WarpShape::kRow * MmaSimtPolicy::LaneMmaShape::kM,
-    MmaSimtPolicy::WarpShape::kColumn * MmaSimtPolicy::LaneMmaShape::kN
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tensor_op_policy.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tensor_op_policy.h
deleted file mode 100755
index b3f3a4f59..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tensor_op_policy.h
+++ /dev/null
@@ -1,189 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines basic structures needed for implementing the warp-scoped phase of the epilogue.
-          These quantities assume a 'column-major' arrangement of TensorOp instructions, of which
-          a row-oriented slice is visible per iteration.
-*/
-
-#pragma once
-
-#include "cutlass/matrix_shape.h"
-#include "cutlass/layout/matrix.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace warp {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Policy details related to the epilogue
-template <
-  typename WarpShape,     ///< shape of warp-level GEMM (concept: MatrixShape)
-  typename OperatorShape, ///< matrix multiply operation shape (concept: gemm:GemmShape)
-  typename Layout         ///< target shared memory layout
->
-struct TensorOpPolicy; 
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for row-major
-template <
-  typename WarpShape,           ///< shape of warp-level GEMM (concept: MatrixShape)
-  typename OperatorShape        ///< matrix multiply operation shape (concept: gemm::GemmShape)
->
-struct TensorOpPolicy<WarpShape, OperatorShape, layout::RowMajor> {
-
-  /// Number of operations
-  using OperatorCount = MatrixShape<
-    (WarpShape::kM + OperatorShape::kM - 1) / OperatorShape::kM,
-    (WarpShape::kN + OperatorShape::kN - 1) / OperatorShape::kN
-  >;
-
-  //
-  // Hard-coded constants regarding Tensor Operations
-  //
-
-  static int const kElementsPerAccess = 2;
-  static int const kRowsPerIteration = 8;
-  static bool const kDivisible = 
-    !(WarpShape::kM % OperatorShape::kM) && !(WarpShape::kN % OperatorShape::kN);
-
-  //
-  // Derived quantities
-  //
-
-  // Number of 'externally visible' iterations per actual instruction
-  static int const kIterationsPerInstruction = OperatorShape::kM / kRowsPerIteration;
-
-  // Number of externally visible iterations
-  static int const kIterations = OperatorCount::kRow * kIterationsPerInstruction;
-
-  using TileIterations = MatrixShape<kIterations, 1>;
-
-  static int const kAccumulatorRowStride = kElementsPerAccess;
-  static int const kAccumulatorColumnStride = kElementsPerAccess * OperatorCount::kRow * kIterationsPerInstruction;
-
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for row-major
-template <
-  typename WarpShape,           ///< shape of warp-level GEMM (concept: MatrixShape)
-  typename OperatorShape        ///< matrix multiply operation shape (concept: gemm::GemmShape)
->
-struct TensorOpPolicy<WarpShape, OperatorShape, layout::ColumnMajor> {
-
-  /// Number of operations
-  using OperatorCount = MatrixShape<
-    (WarpShape::kM + OperatorShape::kM - 1) / OperatorShape::kM,
-    (WarpShape::kN + OperatorShape::kN - 1) / OperatorShape::kN
-  >;
-
-  //
-  // Hard-coded constants regarding Tensor Operations
-  //
-
-  static int const kElementsPerAccess = 1;
-  static int const kColumnsPerIteration = 8;
-  static bool const kDivisible = 
-    !(WarpShape::kM % OperatorShape::kM) && !(WarpShape::kN % OperatorShape::kN);
-
-  //
-  // Derived quantities
-  //
-
-  // Number of 'externally visible' iterations per actual instruction
-  static int const kIterationsPerInstruction = OperatorShape::kN / kColumnsPerIteration;
-
-  // Number of externally visible iterations
-  static int const kIterations = OperatorCount::kColumn * kIterationsPerInstruction;
-
-  using TileIterations = MatrixShape<kIterations, 1>;
-
-  // Hard code for 16x8
-  static int const kAccumulatorRowStride = 2;
-  static int const kAccumulatorColumnStride = 4 * OperatorCount::kRow;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for column-major-interleaved
-template <
-    typename WarpShape,  ///< shape of warp-level GEMM (concept: MatrixShape)
-    typename OperatorShape,   ///< matrix multiply operation (concept: arch::Mma)
-    int InterleavedK     ///< number of interleaved k
-    >
-struct TensorOpPolicy<WarpShape, OperatorShape,
-                      layout::ColumnMajorInterleaved<InterleavedK> > {
-  /// Number of operations
-  using OperatorCount = MatrixShape<WarpShape::kM / OperatorShape::kM,
-                                    WarpShape::kN / OperatorShape::kN>;
-
-  //
-  // Hard-coded constants regarding Tensor Operations
-  //
-
-  static int const kElementsPerAccess = 2;
-  static int const kRowsPerIteration = 8;
-
-  //
-  // Derived quantities
-  //
-
-  // Number of 'externally visible' iterations per actual instruction
-  static int const kIterationsPerInstruction =
-      OperatorShape::kM / kRowsPerIteration;
-
-  // Number of externally visible iterations
-  static int const kIterations = WarpShape::kN / InterleavedK *
-                                 OperatorCount::kRow *
-                                 kIterationsPerInstruction;
-
-  static int const kElementsPerIteration = InterleavedK / OperatorShape::kN * kElementsPerAccess;
-
-  static int const kAccessPerIteration = kElementsPerIteration / kElementsPerAccess;
-
-  // Number of externally visible iterations
-  //static int const kTileIterations = OperatorCount::kRow * kIterationsPerInstruction;
-  using TileIterations = MatrixShape<1, WarpShape::kN / InterleavedK>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace epilogue
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_simt.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_simt.h
deleted file mode 100755
index 0f470ff76..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_simt.h
+++ /dev/null
@@ -1,785 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief 
-*/
-
-#pragma once
-
-#include "cutlass/array.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/pitch_linear.h"
-
-#include "cutlass/epilogue/warp/simt_policy.h"
-
-#define CUTLASS_SIMT_EPILOGUE_USE_SCALAR_STORES 1
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace warp {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Template for reading and writing tiles of accumulators to shared memory
-template <
-  typename WarpShape,     ///< shape of warp-level GEMM (concept: MatrixShape)
-  typename Operator,      ///< matrix multiply operation (concept: arch::Mma)
-  typename Element,       ///< data type of element to be written
-  typename Layout,        ///< target shared memory layout
-  typename MmaSimtPolicy          ///< policy defining lane arrangement (concept: MmaSimtPolicy)
->
-class TileIteratorSimt;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Template for reading and writing tiles of accumulators to shared memory
-template <
-  typename WarpShape_,     ///< shape of warp-level GEMM (concept: GemmShape)
-  typename Operator_,      ///< matrix multiply operation (concept: arch::Mma)
-  typename Element_,       ///< data type of element to be written
-  typename MmaSimtPolicy_         ///< policy defining lane arrangement (concept: MmaSimtPolicy)
->
-class TileIteratorSimt<WarpShape_, Operator_, Element_, layout::RowMajor, MmaSimtPolicy_> {
-public:
-
-  using WarpShape = WarpShape_;
-  using Operator = Operator_;
-  using Element = Element_;
-  using Layout = layout::RowMajor;
-
-  using TensorRef = TensorRef<Element, Layout>;         ///< Tensor Reference object
-  using TensorCoord = MatrixCoord;                      ///< Logical coordinate in referenced tensor
-  using Index = typename TensorRef::Index;
-  using LongIndex = typename TensorRef::LongIndex;
-
-  using Policy = SimtPolicy<WarpShape, Operator, Layout, MmaSimtPolicy_>;
-
-  /// Shape of the tile in memory
-  using Shape = MatrixShape<
-    Policy::kRowsPerIteration,
-    WarpShape::kN
-  >;
-
-  /// This is the fragment size produced by one access of the iterator.
-  using Fragment = Array<
-    typename Operator::ElementC, 
-    Policy::kElementsPerIteration>;
-
-  /// This is the complete warp-level accumulator tile.
-  using AccumulatorTile = Array<
-    typename Operator::ElementC, 
-    Policy::kAccumulatorElementCount>;
-
-  /// Number of times this iterator can be incremented
-  static int const kIterations = Policy::kIterations;
-
-  /// Padding quantity
-  using Padding = MatrixShape<
-    0,
-    4 * Policy::kElementsPerAccess
-#if CUTLASS_SIMT_EPILOGUE_USE_SCALAR_STORES
-    + 1
-#endif
-  >;
-
-private:
-
-#if CUTLASS_SIMT_EPILOGUE_USE_SCALAR_STORES
-  /// Storage type for accessing memory
-  using AccessType = AlignedArray<
-    Element, 
-    1
-  >;
-
-#else
-  /// Storage type for accessing memory
-  using AccessType = AlignedArray<
-    Element, 
-    Policy::kElementsPerAccess
-  >;
-#endif
-
-  //
-  // Data members
-  //
-
-  /// Internal pointer to memory
-  AccessType *pointer_;
-
-  /// Internal layout object
-  Layout layout_;
-
-public:
-
-  /// Default constructor
-  CUTLASS_HOST_DEVICE
-  TileIteratorSimt(): pointer_(nullptr) { }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  TileIteratorSimt(
-    TensorRef const &ref,
-    unsigned lane_id
-  ):
-    pointer_(reinterpret_cast<AccessType *>(ref.data())),
-    layout_(ref.stride()[0] / AccessType::kElements) { 
-
-    auto lane_layout = Policy::MmaSimtPolicy::get_lane_layout();
-    MatrixCoord lane_offset = lane_layout.inverse(lane_id);
-
-    pointer_ += layout_({
-      lane_offset.row(),
-      lane_offset.column() * Policy::kElementsPerAccess / int(AccessType::kElements)
-    });
-  }
-
-  /// Adds a pointer offset
-  CUTLASS_HOST_DEVICE
-  TileIteratorSimt & add_pointer_offset(Index pointer_offset) {
-    pointer_ += pointer_offset / AccessType::kElements;
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_HOST_DEVICE
-  TileIteratorSimt & add_tile_offset(TensorCoord const &tile_offset) {
-
-    pointer_ += layout_({
-      tile_offset.row() * Shape::kRow, 
-      (tile_offset.column() * Shape::kColumn / int(AccessType::kElements))
-    });
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_HOST_DEVICE
-  TileIteratorSimt & operator+=(TensorCoord const &tile_offset) {
-
-    add_tile_offset(tile_offset);
-    
-    return *this;
-  }
-
-  /// Store
-  CUTLASS_HOST_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-#if CUTLASS_SIMT_EPILOGUE_USE_SCALAR_STORES
-      // de-vectorized stores
-      using ScalarAccessType = AlignedArray<Element, 1>;
-      ScalarAccessType const *scalarFragPtr = reinterpret_cast<ScalarAccessType const *>(&frag);
-      ScalarAccessType *scalarPointer = reinterpret_cast<ScalarAccessType *>(pointer_) + pointer_offset;
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < Policy::kAccessesPerIteration; ++n) {
-        CUTLASS_PRAGMA_UNROLL
-        for (int s = 0; s < Policy::kElementsPerAccess; s++) {
-          scalarPointer[n * Policy::MmaSimtPolicy::WarpShape::kColumn * Policy::kElementsPerAccess + s] = scalarFragPtr[n * Policy::kElementsPerAccess + s];
-        }
-      }
-#else
-    // original vector stores
-    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < Policy::kAccessesPerIteration; ++n) {
-      pointer_[n * Policy::MmaSimtPolicy::WarpShape::kColumn + pointer_offset / int(AccessType::kElements)] = frag_ptr[n];
-    }
-#endif
-  }
-
-  /// Store
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-
-  /// Load
-  CUTLASS_HOST_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
-
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < Policy::kAccessesPerIteration; ++n) {
-      frag_ptr[n] = pointer_[n * Policy::MmaSimtPolicy::WarpShape::kColumn + pointer_offset / int(AccessType::kElements)];
-    }
-  }
-
-  /// Load
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-    load_with_pointer_offset(frag, 0);
-  }
-
-  /// Set smem base address
-  CUTLASS_HOST_DEVICE
-  void set_smem_base_address(Index address) {
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Template for reading and writing tiles of accumulators to shared memory
-template <typename WarpShape_,     ///< shape of warp-level GEMM (concept: GemmShape)
-          typename Operator_,      ///< matrix multiply operation (concept: arch::Mma)
-          typename Element_,       ///< data type of element to be written
-          typename Layout_,         ///< target shared memory layout
-          typename MmaSimtPolicy_  ///< policy defining lane arrangement (concept: MmaSimtPolicy)
-          >
-class TileIteratorSimtDirectConv {
- public:
-
-  using WarpShape = WarpShape_;
-  using Operator = Operator_;
-  using Element = Element_;
-  using Layout = layout::RowMajor;
-
-  using TensorRef = TensorRef<Element, Layout>;  ///< Tensor Reference object
-  using TensorCoord = MatrixCoord;               ///< Logical coordinate in referenced tensor
-  using Index = typename TensorRef::Index;
-  using LongIndex = typename TensorRef::LongIndex;
-
-  using Policy = SimtPolicy<WarpShape, Operator, Layout, MmaSimtPolicy_>;
-
-  /// Shape of the tile in memory
-  using Shape = MatrixShape<Policy::kRowsPerIteration, WarpShape::kN>;
-
-  /// This is the fragment size produced by one access of the iterator.
-  using Fragment = Array<typename Operator::ElementC, Policy::kElementsPerIteration>;
-
-  /// This is the complete warp-level accumulator tile.
-  using AccumulatorTile = Array<typename Operator::ElementC, Policy::kAccumulatorElementCount>;
-
-  /// Number of times this iterator can be incremented
-  static int const kIterations = Policy::kIterations;
-
-  /// Padding quantity
-  using Padding = MatrixShape<0,
-                              0
-                              >;
-
-private:
-  /// Storage type for accessing memory
-  using AccessType = AlignedArray<
-    Element, 
-    Policy::kElementsPerAccess
-  >;
-
-  //
-  // Data members
-  //
-
-  /// Internal pointer to memory
-  AccessType *pointer_;
-
-  /// Internal layout object
-  Layout layout_;
-
-  /// Base smem offset;
-  Index base_smem_address_;
-
- public:
-  /// Default constructor
-  CUTLASS_HOST_DEVICE
-  TileIteratorSimtDirectConv() : pointer_(nullptr) {}
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  TileIteratorSimtDirectConv(
-    TensorRef const &ref,
-    unsigned lane_id
-  ):
-    pointer_(reinterpret_cast<AccessType *>(ref.data())),
-    layout_(ref.stride()[0] / AccessType::kElements) {
-
-    auto lane_layout = Policy::MmaSimtPolicy::get_lane_layout();
-    MatrixCoord lane_offset = lane_layout.inverse(lane_id);
-
-    pointer_ += layout_({
-      lane_offset.row(),
-      lane_offset.column() * Policy::kElementsPerAccess / int(AccessType::kElements)
-    });
-  }
-
-  /// Adds a pointer offset
-  CUTLASS_HOST_DEVICE
-  TileIteratorSimtDirectConv & add_pointer_offset(Index pointer_offset) {
-    pointer_ += pointer_offset / AccessType::kElements;
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_HOST_DEVICE
-  TileIteratorSimtDirectConv & add_tile_offset(TensorCoord const &tile_offset) {
-
-    pointer_ += layout_({
-      tile_offset.row() * Shape::kRow, 
-      (tile_offset.column() * Shape::kColumn / int(AccessType::kElements))
-    });
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_HOST_DEVICE
-  TileIteratorSimtDirectConv & operator+=(TensorCoord const &tile_offset) {
-
-    add_tile_offset(tile_offset);
-    
-    return *this;
-  }
-
-  /// Store
-  CUTLASS_HOST_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-
-    // original vector stores
-    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
-    AccessType * load_pointer_ = reinterpret_cast<AccessType *>(reinterpret_cast<uint8_t *>(pointer_) + base_smem_address_);
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < Policy::kAccessesPerIteration; ++n) {
-      load_pointer_[n * Policy::MmaSimtPolicy::WarpShape::kColumn + pointer_offset / int(AccessType::kElements)] = frag_ptr[n];
-    }
-  }
-
-  /// Store
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-
-  /// Load
-  CUTLASS_HOST_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
-
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < Policy::kAccessesPerIteration; ++n) {
-      frag_ptr[n] = pointer_[n * Policy::MmaSimtPolicy::WarpShape::kColumn + pointer_offset / int(AccessType::kElements)];
-    }
-  }
-
-  /// Load
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-    load_with_pointer_offset(frag, 0);
-  }
-
-  /// Set smem base address
-  CUTLASS_HOST_DEVICE
-  void set_smem_base_address(Index address){
-    base_smem_address_ = address;
-  }
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Template for reading and writing tiles of accumulators to shared memory
-template <typename WarpShape_,               ///< shape of warp-level GEMM (concept: GemmShape)
-          typename ThreadOutputShape_,       /// Size of the matrix to load (concept: TensorNHWC)
-          typename ThreadBlockOutputShape_,  /// Size of the matrix to load (concept: TensorNHWC)
-          typename Operator_,                ///< matrix multi ply operation (concept: arch::Mma)
-          typename Element_,                 ///< data type of element to be written
-          typename Layout_,                  ///< target shared memory layout
-          typename MmaSimtPolicy_            ///< policy defining lane arrangement (concept: MmaSimtPolicy)
-          >
-class TileIteratorSimtDirect2dConv {
- public:
-  using WarpShape = WarpShape_;
-  using ThreadOutputShape = ThreadOutputShape_;
-  using ThreadBlockOutputShape = ThreadBlockOutputShape_;
-  using Operator = Operator_;
-  using Element = Element_;
-  using Layout = layout::RowMajor;
-  using MmaSimtPolicy = MmaSimtPolicy_;
-
-  using TensorRef = TensorRef<Element, Layout>;  ///< Tensor Reference object
-  using TensorCoord = MatrixCoord;               ///< Logical coordinate in referenced tensor
-  using Index = typename TensorRef::Index;
-  using LongIndex = typename TensorRef::LongIndex;
-
-  // Thread-level shape of a fragment
-  using ThreadShape = MatrixShape<ThreadOutputShape::kNHW, ThreadOutputShape::kC>;
-
-  static_assert(!(ThreadShape::kColumn % MmaSimtPolicy::LaneMmaShape::kN),
-                "Thread-level GEMM must be divisible by Policy::LaneMmaShape.");
-
-  using ThreadTileCount = MatrixShape<ThreadBlockOutputShape::kH / ThreadOutputShape::kH,
-                                      ThreadBlockOutputShape::kW / ThreadOutputShape::kW>;
-
-  using Iterations =
-      MatrixShape<ThreadShape::kRow, ThreadShape::kColumn / MmaSimtPolicy::LaneMmaShape::kN>;
-
-  /// This is the complete warp-level accumulator tile.
-  using AccumulatorTile = typename Operator::FragmentC;
-
-  /// This is the fragment size produced by one access of the iterator.
-  using Fragment = AccumulatorTile;
-
-  /// Padding quantity
-  using Padding = MatrixShape<0, 0>;
-
- private:
-  // Storage type for accessing memory
-  using AccessType = AlignedArray<Element, MmaSimtPolicy::LaneMmaShape::kN>;
-  //
-  // Data members
-  //
-
-  /// Internal pointer to memory
-  AccessType *pointer_;
-
-  /// Internal layout object
-  Layout layout_;
-
-  /// Base smem offset;
-  Index base_smem_address_;
-
- public:
-  /// Default constructor
-  CUTLASS_HOST_DEVICE
-  TileIteratorSimtDirect2dConv() : pointer_(nullptr) {}
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  TileIteratorSimtDirect2dConv(TensorRef const &ref, unsigned thread_id, unsigned lane_id)
-      : pointer_(reinterpret_cast<AccessType *>(ref.data())),
-        layout_(ref.stride()[0] / AccessType::kElements) {
-  
-    auto lane_layout = MmaSimtPolicy::get_lane_layout();
-
-    MatrixCoord lane_offset = lane_layout.inverse(lane_id);
-
-    // Get base HW offset of current threads
-    const int threadgroup = thread_id / (ThreadBlockOutputShape::kC / ThreadOutputShape::kC);
-    const int base_p = (threadgroup / (ThreadTileCount::kColumn)) * ThreadOutputShape::kH;
-    const int base_q = (threadgroup % (ThreadTileCount::kColumn)) * ThreadOutputShape::kW;
-
-    const int row_offset = base_p * ThreadBlockOutputShape::kW + base_q;
-
-    pointer_ += layout_(
-        {row_offset,
-         lane_offset.column() * MmaSimtPolicy::LaneMmaShape::kN / int(AccessType::kElements)});
-  }
-
-  /// Adds a pointer offset
-  CUTLASS_HOST_DEVICE
-  TileIteratorSimtDirect2dConv &add_pointer_offset(Index pointer_offset) {
-    pointer_ += pointer_offset / AccessType::kElements;
-    return *this;
-  }
-
-  /// Store
-  CUTLASS_HOST_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-    AccessType *storer_pointer_ =
-        reinterpret_cast<AccessType *>(reinterpret_cast<uint8_t *>(pointer_) + base_smem_address_);
-    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int h = 0; h < ThreadOutputShape::kH; ++h) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int w = 0; w < ThreadOutputShape::kW; ++w) {
-        CUTLASS_PRAGMA_UNROLL
-        for (int col = 0; col < Iterations::kColumn; ++col) {
-          int offset = (w + h * ThreadBlockOutputShape::kW) *
-                           (ThreadBlockOutputShape::kC / AccessType::kElements) +
-                       col;
-          storer_pointer_[offset + pointer_offset / int(AccessType::kElements)] =
-              frag_ptr[w + h * ThreadOutputShape::kW + col];
-        }
-      }
-    }
-  }
-
-  /// Store
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
-
-  /// Set smem base address
-  CUTLASS_HOST_DEVICE
-  void set_smem_base_address(Index address) { base_smem_address_ = address; }
-};
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Template for reading and writing tiles of accumulators to shared memory
-template <
-  typename WarpShape_,        ///< shape of warp-level GEMM (concept: GemmShape)
-  typename Operator_,         ///< matrix multiply operation (concept: arch::Mma)
-  typename Element_,          ///< data type of element to be written
-  typename Layout_,            ///< target shared memory layout
-  typename MmaSimtPolicy_     ///< policy defining lane arrangement (concept: MmaSimtPolicy)
->
-class TileIteratorSimtCanonical {
-public:
-
-  using WarpShape = WarpShape_;
-  using Operator = Operator_;
-  using Element = Element_;
-  using Layout = Layout_;
-
-  using TensorRef = TensorRef<Element, Layout>;         ///< Tensor Reference object
-  using TensorCoord = MatrixCoord;                      ///< Logical coordinate in referenced tensor
-  using Index = typename TensorRef::Index;
-  using LongIndex = typename TensorRef::LongIndex;
-
-  using Policy = SimtPolicy<WarpShape, Operator, Layout, MmaSimtPolicy_>;
-
-  /// Shape of the tile in memory
-  using Shape = MatrixShape<
-    Policy::kRowsPerIteration,
-    WarpShape::kN
-  >;
-
-  /// This is the fragment size produced by one access of the iterator.
-  using Fragment = Array<
-    typename Operator::ElementC, 
-    Policy::kElementsPerIteration>;
-
-  /// This is the complete warp-level accumulator tile.
-  using AccumulatorTile = Array<
-    typename Operator::ElementC, 
-    Policy::kAccumulatorElementCount>;
-
-  /// Number of times this iterator can be incremented
-  static int const kIterations = Policy::kIterations;
-
-  /// Padding quantity
-  using Padding = MatrixShape<
-    0,
-    4 * Policy::kElementsPerAccess + 1
-  >;
-
-private:
-
-  /// Storage type for accessing memory
-  using AccessType = AlignedArray<
-    Element, 
-    1
-  >;
-
-  //
-  // Data members
-  //
-
-  /// Internal pointer to memory
-  AccessType *pointer_;
-
-  /// Internal layout object
-  Layout layout_;
-
-  /// Guard to indicate whether the shape is divisible
-  bool divisible_;
-
-  /// Extent of the output tensor
-  MatrixCoord extent_;
-
-  /// Thread offset
-  MatrixCoord thread_offset_;
-
-public:
-
-  /// Default constructor
-  CUTLASS_HOST_DEVICE
-  TileIteratorSimtCanonical(): pointer_(nullptr) { }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  TileIteratorSimtCanonical(
-    TensorRef const &ref,
-    unsigned lane_id
-  ):
-    pointer_(reinterpret_cast<AccessType *>(ref.data())),
-    layout_(ref.stride()[0] / AccessType::kElements),
-    divisible_(true),
-    extent_(WarpShape::kM, WarpShape::kN) { 
-
-    auto lane_layout = Policy::MmaSimtPolicy::get_lane_layout();
-    MatrixCoord lane_offset = lane_layout.inverse(lane_id);
-
-    thread_offset_ = {
-      lane_offset.row() * Shape::kRow, 
-      lane_offset.column() * Policy::kElementsPerAccess
-    };
-
-    pointer_ += layout_({
-      lane_offset.row() * Shape::kRow,
-      lane_offset.column() * Policy::kElementsPerAccess / int(AccessType::kElements)
-    });
-  }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  TileIteratorSimtCanonical(
-    TensorRef const &ref,
-    TensorCoord const &extent,
-    unsigned lane_id
-  ):
-    pointer_(reinterpret_cast<AccessType *>(ref.data())),
-    layout_(ref.stride()[0] / AccessType::kElements),
-    divisible_(false),
-    extent_(extent) { 
-
-    auto lane_layout = Policy::MmaSimtPolicy::get_lane_layout();
-    MatrixCoord lane_offset = lane_layout.inverse(lane_id);
-
-    thread_offset_ = {
-      lane_offset.row() * Shape::kRow, 
-      lane_offset.column() * Policy::kElementsPerAccess
-    };
-
-    pointer_ += layout_({
-      lane_offset.row() * Shape::kRow,
-      lane_offset.column() * Policy::kElementsPerAccess / int(AccessType::kElements)
-    });
-  }
-
-  /// Adds a pointer offset
-  CUTLASS_HOST_DEVICE
-  TileIteratorSimtCanonical & add_pointer_offset(Index pointer_offset) {
-    pointer_ += pointer_offset / AccessType::kElements;
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_HOST_DEVICE
-  TileIteratorSimtCanonical & add_tile_offset(TensorCoord const &tile_offset) {
-
-    MatrixCoord coord_offset(
-      tile_offset.row(), 
-      tile_offset.column() * Shape::kColumn
-    );
-
-    thread_offset_ += coord_offset;
-
-    pointer_ += layout_({
-      coord_offset.row(), 
-      coord_offset.column()
-    });
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_HOST_DEVICE
-  TileIteratorSimtCanonical & operator+=(TensorCoord const &tile_offset) {
-
-    add_tile_offset(tile_offset);
-    
-    return *this;
-  }
-
-  /// Store
-  CUTLASS_HOST_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-
-    // de-vectorized stores
-    using ScalarAccessType = AlignedArray<Element, 1>;
-    ScalarAccessType const *scalarFragPtr = reinterpret_cast<ScalarAccessType const *>(&frag);
-    ScalarAccessType *scalarPointer = reinterpret_cast<ScalarAccessType *>(pointer_) + pointer_offset;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < Policy::kAccessesPerIteration; ++n) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int s = 0; s < Policy::kElementsPerAccess; s++) {
-        
-        int ptr_idx = n * Policy::MmaSimtPolicy::WarpShape::kColumn * Policy::kElementsPerAccess + s;
-        int frag_idx = n * Policy::kElementsPerAccess + s;
-        
-        int col = thread_offset_.column() + ptr_idx;
-
-        if (divisible_ || (thread_offset_.row() < extent_.row() && col < extent_.column())) {
-          scalarPointer[ptr_idx] = scalarFragPtr[frag_idx];
-        }
-      }
-    }
-  }
-
-  /// Store
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-
-  /// Load
-  CUTLASS_HOST_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
-
-      // de-vectorized loads
-      using ScalarAccessType = AlignedArray<Element, 1>;
-      ScalarAccessType *scalarFragPtr = reinterpret_cast<ScalarAccessType *>(&frag);
-      ScalarAccessType const *scalarPointer = reinterpret_cast<ScalarAccessType const*>(pointer_) + pointer_offset;
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < Policy::kAccessesPerIteration; ++n) {
-        CUTLASS_PRAGMA_UNROLL
-        for (int s = 0; s < Policy::kElementsPerAccess; s++) {
-          
-          int ptr_idx = n * Policy::MmaSimtPolicy::WarpShape::kColumn * Policy::kElementsPerAccess + s;
-          int frag_idx = n * Policy::kElementsPerAccess + s;
-          
-          int col = thread_offset_.column() + ptr_idx;
-
-          if (divisible_ || (thread_offset_.row() < extent_.row() && col < extent_.column())) {
-            scalarFragPtr[frag_idx] = scalarPointer[ptr_idx];
-          }
-        }
-      }
-  }
-
-  /// Load
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-    load_with_pointer_offset(frag, 0);
-  }
-
-  CUTLASS_HOST_DEVICE
-  TileIteratorSimtCanonical & operator++() {
-    return add_tile_offset({1, 0});
-  }
-
-  /// Set smem base address
-  CUTLASS_HOST_DEVICE
-  void set_smem_base_address(Index address) {
-  }
-};
-
-
-} // namespace warp
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_tensor_op.h
deleted file mode 100755
index 0bef03106..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_tensor_op.h
+++ /dev/null
@@ -1,671 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief 
-*/
-
-#pragma once
-
-#include "cutlass/array.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/pitch_linear.h"
-
-#include "cutlass/epilogue/warp/tensor_op_policy.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace warp {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Template for reading and writing tiles of accumulators to shared memory
-template <
-  typename WarpShape,     ///< shape of warp-level GEMM (concept: MatrixShape)
-  typename OperatorShape, ///< matrix multiply operation shape (concept: gemm::GemmShape)
-  typename Element,       ///< data type of element to be written
-  typename Layout         ///< target shared memory layout
->
-class TileIteratorTensorOp;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Template for reading and writing tiles of accumulators to shared memory
-template <
-  typename WarpShape_,     ///< shape of warp-level GEMM (concept: GemmShape)
-  typename OperatorShape_, ///< matrix multiply operation shape (concept: gemm::GemmShape)
-  typename Element_        ///< data type of element to be written
->
-class TileIteratorTensorOp<WarpShape_, OperatorShape_, Element_, layout::RowMajor> {
-public:
-
-  using WarpShape = WarpShape_;
-  using OperatorShape = OperatorShape_;
-  using Element = Element_;
-  using Layout = layout::RowMajor;
-
-  using TensorLayout = Layout;
-  using TensorRef = TensorRef<Element, Layout>;         ///< Tensor Reference object
-  using TensorCoord = MatrixCoord;                      ///< Logical coordinate in referenced tensor
-  using Index = typename TensorRef::Index;
-  using LongIndex = typename TensorRef::LongIndex;
-
-  using Policy = TensorOpPolicy<WarpShape, OperatorShape, Layout>;
-
-  /// Shape of the tile in memory
-  using Shape = MatrixShape<
-    Policy::kRowsPerIteration,
-    WarpShape::kN
-  >;
-
-  /// This is the fragment size produced by one access of the iterator.
-  using Fragment = Array<
-    Element, 
-    Policy::OperatorCount::kColumn * Policy::kElementsPerAccess>;
-
-  /// This is the complete warp-level accumulator tile.
-  //using AccumulatorTile = typename Operator::FragmentC;
-
-  /// Number of times this iterator can be incremented
-  static int const kIterations = Policy::kIterations;
-
-  /// Number of times this iterator can be incremented
-  using TileIterations = typename Policy::TileIterations;
-
-  // Internal constants
-  struct Detail {
-    static int const kLanesInQuad = 4;
-  };
-
-  /// Padding quantity
-  using Padding = MatrixShape<
-    0,
-    Detail::kLanesInQuad * Policy::kElementsPerAccess>;
-
-private:
-
-  /// Storage type for accessing memory
-  using AccessType = AlignedArray<Element, Policy::kElementsPerAccess>;
-
-  //
-  // Data members
-  //
-
-  /// Internal pointer to memory
-  AccessType *pointer_;
-
-  /// Internal layout object
-  Layout layout_;
-
-  /// Thread offset
-  MatrixCoord thread_offset_;
-
-public:
-
-  /// Default constructor
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOp(): pointer_(nullptr) { }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOp(
-    TensorRef const &ref,
-    unsigned lane_id
-  ):
-    pointer_(reinterpret_cast<AccessType *>(ref.data())),
-    layout_(ref.stride()[0] / Policy::kElementsPerAccess) {
-
-    int quad_id = (lane_id / Detail::kLanesInQuad); 
-    int lane_in_quad = (lane_id % Detail::kLanesInQuad);
-
-    thread_offset_ = {
-      quad_id, lane_in_quad * Policy::kElementsPerAccess
-    };
-
-    pointer_ += layout_({thread_offset_.row(), thread_offset_.column() / Policy::kElementsPerAccess});
-  }
-
-  /// Adds a pointer offset
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOp & add_pointer_offset(Index pointer_offset) {
-    pointer_ += pointer_offset / Policy::kElementsPerAccess;
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOp & add_tile_offset(TensorCoord const &tile_offset) {
-
-    MatrixCoord coord_offset(
-      tile_offset.row() * Shape::kRow, 
-      tile_offset.column() * Shape::kColumn
-    );
-
-    thread_offset_ += coord_offset;
-
-    pointer_ += layout_({
-      coord_offset.row(),
-      coord_offset.column() / Policy::kElementsPerAccess
-    });
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOp & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  /// Store
-  CUTLASS_HOST_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-
-    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < Policy::OperatorCount::kColumn; ++n) {
-      pointer_[n * Detail::kLanesInQuad + pointer_offset / Policy::kElementsPerAccess] = frag_ptr[n];
-    }
-  }
-
-  /// Store
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-
-  /// Load
-  CUTLASS_HOST_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
-
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < Policy::OperatorCount::kColumn; ++n) {
-      frag_ptr[n] = pointer_[n * Detail::kLanesInQuad + pointer_offset / Policy::kElementsPerAccess];
-    }
-  }
-
-  /// Load
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-    load_with_pointer_offset(frag, 0);
-  }
-
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOp & operator++() {
-    return add_tile_offset({1, 0});
-  }
-  
-  /// Set smem base address
-  CUTLASS_HOST_DEVICE
-  void set_smem_base_address(Index address) {
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Template for reading and writing tiles of accumulators to shared memory
-template <
-  typename WarpShape_,     ///< shape of warp-level GEMM (concept: GemmShape)
-  typename OperatorShape_, ///< matrix multiply operation shape (concept: gemm::GemmShape)
-  typename Element_,       ///< data type of element to be written
-  int InterleavedK         ///< number of interleaved k
->
-class TileIteratorTensorOp<WarpShape_, OperatorShape_, Element_, 
-                            layout::ColumnMajorInterleaved<InterleavedK> > {
-public:
-
-  using WarpShape = WarpShape_;
-  using OperatorShape = OperatorShape_;
-  using Element = Element_;
-  using Layout = layout::ColumnMajorInterleaved<InterleavedK>;
-  using TensorLayout = Layout;                ///< shared memory tensor ref layout
-
-  using TensorRef = TensorRef<Element, TensorLayout>;         ///< Tensor Reference object
-  using TensorCoord = MatrixCoord;                      ///< Logical coordinate in referenced tensor
-  using Index = typename TensorRef::Index;
-  using LongIndex = typename TensorRef::LongIndex;
-
-  using Policy = TensorOpPolicy<WarpShape, OperatorShape, Layout>;
-
-  /// Shape of the tile in memory
-  using Shape = MatrixShape<
-//    Policy::kRowsPerIteration,
-    WarpShape::kM,
-    InterleavedK
-  >;
-
-  /// This is the fragment size produced by one tile
-  using Fragment = Array<
-    Element, 
-    Policy::OperatorCount::kRow * Policy::kIterationsPerInstruction 
-        * Policy::kElementsPerIteration>;
-
-  /// This is the fragment size produced by one iteration
-//  using Fragment = Array<
-//    Element, Policy::kElementsPerIteration >;
-
-  /// This is the complete warp-level accumulator tile.
-  //using AccumulatorTile = typename Operator::FragmentC;
-
-  /// Number of times this iterator can be incremented
-  using TileIterations = typename Policy::TileIterations;
-
-  // Internal constants
-  struct Detail {
-    static int const kLanesInQuad = 4;
-  };
-
-  /// Padding quantity
-  using Padding = MatrixShape<
-    0,
-    Detail::kLanesInQuad * Policy::kElementsPerIteration>;
-
-private:
-
-  /// Storage type for accessing memory
-  using AccessType = AlignedArray<Element, Policy::kElementsPerAccess>;
-
-  //
-  // Data members
-  //
-
-  /// Internal pointer to memory
-  AccessType *pointer_;
-
-  /// Internal layout object
-  TensorLayout layout_;
-
-  /// Thread offset
-  MatrixCoord thread_offset_;
-
-public:
-
-  /// Default constructor
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOp(): pointer_(nullptr) { }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOp(
-    TensorRef const &ref,
-    unsigned lane_id
-  ):
-    pointer_(reinterpret_cast<AccessType *>(ref.data())),
-    layout_(ref.stride()[0]) {
-
-    int quad_id = (lane_id / Detail::kLanesInQuad); 
-    int lane_in_quad = (lane_id % Detail::kLanesInQuad);
-
-    thread_offset_ = {
-      quad_id, lane_in_quad * Policy::kElementsPerIteration
-    };
-
-    pointer_ += (layout_({thread_offset_.row(), thread_offset_.column()}) / Policy::kElementsPerAccess);
-  }
-
-  /// Adds a pointer offset
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOp & add_pointer_offset(Index pointer_offset) {
-    pointer_ += pointer_offset / Policy::kElementsPerAccess;
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOp & add_tile_offset(TensorCoord const &tile_offset) {
-
-    MatrixCoord coord_offset(
-      tile_offset.row() * Shape::kRow, 
-      tile_offset.column() * Shape::kColumn
-    );
-
-    thread_offset_ += coord_offset;
-
-    pointer_ += (layout_({
-      coord_offset.row(),
-      coord_offset.column()
-    }) / Policy::kElementsPerAccess);
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOp & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  /// Store
-  CUTLASS_HOST_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-      
-    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < Policy::OperatorCount::kRow * Policy::kIterationsPerInstruction; n++ ) {
-
-      AccessType *ptr = pointer_ + layout_({n * Policy::kRowsPerIteration, 0}) / Policy::kElementsPerAccess;
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int a = 0; a < Policy::kAccessPerIteration; ++a) {
-        ptr[a + pointer_offset / Policy::kElementsPerAccess] = frag_ptr[n * Policy::kAccessPerIteration + a];
-
-//        printf("store thread %d, address %p, bank %ld\n", threadIdx.x, pointer_+a+n*Detail::kLanesInQuad, 
-//            ((long long)(pointer_+a+n*Detail::kLanesInQuad)>>2)&0x1f);
-      }
-    }
-  }
-
-  /// Store
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-
-  /// Load
-  CUTLASS_HOST_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
-
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < Policy::OperatorCount::kRow * Policy::kIterationsPerInstruction; n++ ) {
-
-      AccessType *ptr = pointer_ + layout_({n * Policy::kRowsPerIteration, 0}) / Policy::kElementsPerAccess;
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int a = 0; a < Policy::kAccessPerIteration; ++a) {
-        frag_ptr[n * Policy::kAccessPerIteration + a] = ptr[a + pointer_offset / Policy::kElementsPerAccess];
-      }
-    }
-  }
-
-  /// Load
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-    load_with_pointer_offset(frag, 0);
-  }
-
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOp & operator++() {
-    return add_tile_offset({0, 1});
-  }
-
-  /// Set smem base address
-  CUTLASS_HOST_DEVICE
-  void set_smem_base_address(Index address) {
-  }
-};
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Template for reading and writing tiles of accumulators to shared memory
-template <
-  typename WarpShape_,     ///< shape of warp-level GEMM (concept: GemmShape)
-  typename OperatorShape_, ///< matrix multiply operation shape (concept: gemm::GemmShape)
-  typename Element_,       ///< data type of element to be written
-  typename Layout_
->
-class TileIteratorTensorOpCanonical {
-public:
-
-  using WarpShape = WarpShape_;
-  using OperatorShape = OperatorShape_;
-  using Element = Element_;
-  using Layout = Layout_;
-
-  using TensorRef = TensorRef<Element, Layout>;         ///< Tensor Reference object
-  using TensorCoord = MatrixCoord;                      ///< Logical coordinate in referenced tensor
-  using Index = typename TensorRef::Index;
-  using LongIndex = typename TensorRef::LongIndex;
-
-  using Policy = TensorOpPolicy<WarpShape, OperatorShape, Layout>;
-
-  static int const kAccessSize = 1;
-  static int const kAccessCount = Policy::kElementsPerAccess / kAccessSize;
-
-  /// Shape of the tile in memory
-  using Shape = MatrixShape<
-    Policy::kRowsPerIteration,
-    WarpShape::kN
-  >;
-
-  /// This is the fragment size produced by one access of the iterator.
-  using Fragment = Array<
-    Element, 
-    Policy::OperatorCount::kColumn * Policy::kElementsPerAccess>;
-
-  /// This is the complete warp-level accumulator tile.
-  //using AccumulatorTile = typename Operator::FragmentC;
-
-  /// Number of times this iterator can be incremented
-  static int const kIterations = Policy::kIterations;
-
-  // Internal constants
-  struct Detail {
-    static int const kLanesInQuad = 4;
-  };
-
-  /// Padding quantity
-  using Padding = MatrixShape<
-    0,
-    Detail::kLanesInQuad * Policy::kElementsPerAccess>;
-
-private:
-
-  /// Storage type for accessing memory
-  using AccessType = AlignedArray<Element, kAccessSize>;
-
-  //
-  // Data members
-  //
-
-  /// Internal pointer to memory
-  AccessType *pointer_;
-
-  /// Internal layout object
-  Layout layout_;
-
-  /// Guard to indicate whether the shape is divisible
-  bool divisible_;
-
-  /// Extent of the output tensor
-  MatrixCoord extent_;
-
-  /// Thread offset
-  MatrixCoord thread_offset_;
-
-public:
-
-  /// Default constructor
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOpCanonical(): pointer_(nullptr) { }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOpCanonical(
-    TensorRef const &ref,
-    unsigned lane_id
-  ):
-    pointer_(reinterpret_cast<AccessType *>(ref.data())),
-    layout_(ref.stride()[0]),
-    divisible_(true),
-    extent_(WarpShape::kM, WarpShape::kN) {
-
-    int quad_id = (lane_id / Detail::kLanesInQuad); 
-    int lane_in_quad = (lane_id % Detail::kLanesInQuad);
-
-    thread_offset_ = {
-      quad_id, lane_in_quad * Policy::kElementsPerAccess
-    };
-
-    pointer_ += layout_({thread_offset_.row(), thread_offset_.column()});
-  }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOpCanonical(
-    TensorRef const &ref,
-    TensorCoord const &extent,
-    unsigned lane_id
-  ):
-    pointer_(reinterpret_cast<AccessType *>(ref.data())),
-    layout_(ref.stride()[0]),
-    divisible_(false),
-    extent_(extent) {
-
-    int quad_id = (lane_id / Detail::kLanesInQuad); 
-    int lane_in_quad = (lane_id % Detail::kLanesInQuad);
-
-    thread_offset_ = {
-      quad_id, lane_in_quad * Policy::kElementsPerAccess
-    };
-
-    pointer_ += layout_({thread_offset_.row(), thread_offset_.column()});
-  }
-
-  /// Adds a pointer offset
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOpCanonical & add_pointer_offset(Index pointer_offset) {
-    pointer_ += pointer_offset;
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOpCanonical & add_tile_offset(TensorCoord const &tile_offset) {
-
-    MatrixCoord coord_offset(
-      tile_offset.row() * Shape::kRow, 
-      tile_offset.column() * Shape::kColumn
-    );
-
-    thread_offset_ += coord_offset;
-
-    pointer_ += layout_({
-      coord_offset.row(),
-      coord_offset.column()
-    });
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOpCanonical & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  /// Store
-  CUTLASS_HOST_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-
-    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < Policy::OperatorCount::kColumn; ++n) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int a = 0; a < kAccessCount; ++a) {
-
-        int ptr_idx = n * Detail::kLanesInQuad * kAccessCount + pointer_offset + a;
-        int frag_idx = n * kAccessCount + a;
-
-        int col = thread_offset_.column() + n * Detail::kLanesInQuad * Policy::kElementsPerAccess + a;
-
-        if (divisible_ || (thread_offset_.row() < extent_.row() && col < extent_.column())) {
-          pointer_[ptr_idx] = frag_ptr[frag_idx];
-        }
-      }
-    }
-  }
-
-  /// Store
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-
-  /// Load
-  CUTLASS_HOST_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
-
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-    
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < Policy::OperatorCount::kColumn; ++n) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int a = 0; a < kAccessCount; ++a) {
-
-        int ptr_idx = n * Detail::kLanesInQuad * kAccessCount + pointer_offset + a;
-        int frag_idx = n * kAccessCount + a;
-        
-        int col = thread_offset_.column() + n * Detail::kLanesInQuad * Policy::kElementsPerAccess + a;
-
-        if (divisible_ || (thread_offset_.row() < extent_.row() && col < extent_.column())) {
-          frag_ptr[frag_idx] = pointer_[ptr_idx];
-        }
-      }
-    }
-  }
-
-  /// Load
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-    load_with_pointer_offset(frag, 0);
-  }
-
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOpCanonical & operator++() {
-    return add_tile_offset({1, 0});
-  }
-  
-  /// Set smem base address
-  CUTLASS_HOST_DEVICE
-  void set_smem_base_address(Index address) {
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h
deleted file mode 100755
index c512dd873..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h
+++ /dev/null
@@ -1,1081 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief 
-*/
-
-#pragma once
-
-#include "cutlass/array.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/pitch_linear.h"
-
-#include "cutlass/arch/memory_sm75.h"
-#include "cutlass/epilogue/warp/tensor_op_policy.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// This is an optimization available on CUDA 11.2 and beyond that eliminates branches in the epilogue.
-#define CUTLASS_EPILOGUE_WARP_TILE_ITERATOR_TENSOR_OP_MIXED_OPTIMIZATION_ENABLED ((__CUDACC_VER_MAJOR__ * 10 + __CUDACC_VER_MINOR__) >= 112)
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace warp {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Template for reading and writing tiles of accumulators to shared memory. This is optimized
-/// for mixed-precision epilogues in which the accumulators are 32b in width, but the output
-/// data type is smaller. 
-template <
-  typename WarpShape_,            ///< shape of warp-level GEMM (concept: GemmShape)
-  typename OperatorShape_,        ///< matrix multiply operation shape (concept: gemm::GemmShape)
-  typename Element_,              ///< data type of accumulator element
-  int ElementSizeBits,            ///< Size of accumulator element in bits
-  int OutputSizeBits,             ///< Size of output element in bits
-  int OutputElementCount,         ///< number of elements in output vector
-  int ContiguousLanes,            ///< Number of consecutive lanes writing to contiguous memory
-  bool EightBitsOutputOrLess = (OutputSizeBits <= 8)
->
-class TileIteratorTensorOpMixed {
-public:
-
-  using WarpShape = WarpShape_;
-  using OperatorShape = OperatorShape_;
-  using Element = Element_;
-  using Layout = layout::RowMajor;
-  static int const kOutputElementCount = OutputElementCount;
-
-  using TensorRef = TensorRef<Element, Layout>;         ///< Tensor Reference object
-  using TensorCoord = MatrixCoord;                      ///< Logical coordinate in referenced tensor
-  using Index = typename TensorRef::Index;
-  using LongIndex = typename TensorRef::LongIndex;
-
-  using Policy = TensorOpPolicy<WarpShape, OperatorShape, Layout>;
-
-  /// Shape of the tile in memory
-  using Shape = MatrixShape<
-    Policy::kRowsPerIteration,
-    WarpShape::kN
-  >;
-
-  /// This is the fragment size produced by one access of the iterator.
-  using Fragment = Array<
-    Element, 
-    Policy::OperatorCount::kColumn * Policy::kElementsPerAccess>;
-
-  /// This is the complete warp-level accumulator tile.
-  //using AccumulatorTile = typename Operator::FragmentC;
-
-  /// Number of times this iterator can be incremented
-  static int const kIterations = Policy::kIterations;
-
-  // Internal constants
-  struct Detail {
-    static int const kLanesInQuad = 4;
-
-    /// Number of pointers needed to write accumulators
-    static int const kPointerCount = 
-      (OutputElementCount * sizeof_bits<Element>::value) / (const_min(128, OutputElementCount * sizeof_bits<Element>::value));
-
-    // Currently support max 4 ptr
-    static constexpr int kMaxPointerCount{4};
-
-    static_assert(kPointerCount <= kMaxPointerCount, "Can only accommodate four pointers at present.");
-    static_assert(sizeof(Element) == 4, "This can only be used with 32b accumulator data types (f32, s32).");
-  };
-
-  /// Padding quantity
-  using Padding = MatrixShape<
-    0,
-    Detail::kLanesInQuad * Policy::kElementsPerAccess>;
-
-private:
-
-  /// Storage type for accessing memory
-  using AccessType = AlignedArray<Element, Policy::kElementsPerAccess>;
-
-  //
-  // Data members
-  //
-
-  /// Internal pointer to memory
-  AccessType *pointers_[Detail::kPointerCount] = {nullptr};
-
-  /// Stride in units of AccessType
-  int stride_{0};
-
-  /// Logical column in which warp tile is aligned
-  int warp_column_{0};
-
-public:
-
-  /// Default constructor
-  TileIteratorTensorOpMixed() = default;
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOpMixed(
-    TensorRef const &ref,
-    unsigned lane_id
-  ):
-    stride_(ref.stride()[0] / Policy::kElementsPerAccess),
-    warp_column_(0) { 
-
-    int quad_id = (lane_id / Detail::kLanesInQuad); 
-    int lane_in_quad = (lane_id % Detail::kLanesInQuad);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int64_t i = 0; i < Detail::kPointerCount; ++i) {
-      AccessType *ptr = reinterpret_cast<AccessType *>(ref.data()) + quad_id * stride_;
-      int column_idx = (lane_in_quad % 2) + (((lane_in_quad / 2) + i) % Detail::kPointerCount) * 2;
-
-      ptr += column_idx;
-
-      pointers_[i % Detail::kPointerCount] = ptr;
-    }
-  }
-
-  /// Adds a pointer offset
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOpMixed & add_pointer_offset(Index pointer_offset) {
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int64_t i = 0; i < Detail::kPointerCount; ++i) {
-      pointers_[i] += pointer_offset / Policy::kElementsPerAccess;
-    }
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOpMixed & add_tile_offset(TensorCoord const &tile_offset) {
-    
-    CUTLASS_PRAGMA_UNROLL
-    for (int64_t i = 0; i < Detail::kPointerCount; ++i) {
-      pointers_[i] += tile_offset.row() * Shape::kRow * stride_ + 
-        tile_offset.column() * Shape::kColumn / Policy::kElementsPerAccess;
-    }
-
-    warp_column_ += tile_offset.column() * Shape::kColumn;
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOpMixed & operator+=(TensorCoord const &tile_offset) {
-    return add_tile_offset(tile_offset);
-  }
-
-  /// Store
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-
-    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
-
-    AccessType *ptr = pointers_[0];
-
-#if CUTLASS_EPILOGUE_WARP_TILE_ITERATOR_TENSOR_OP_MIXED_OPTIMIZATION_ENABLED
-
-    // When the optimization is enabled, small tiles require separate logic.
-    bool kN32_optimization = (WarpShape::kN * Detail::kLanesInQuad * Policy::kElementsPerAccess * sizeof_bits<Element>::value) % 1024 == 0;
-    if (kN32_optimization) {
-      int ptr_idx = ((warp_column_ * sizeof_bits<Element>::value) / 1024) % Detail::kPointerCount;
-      if (ptr_idx == 0) {
-        ptr = pointers_[0];
-      } else if (ptr_idx == 1) {
-        ptr = pointers_[1];
-      } else if (ptr_idx == 2) {
-        ptr = pointers_[2];
-      } else if (ptr_idx == 3) {
-        ptr = pointers_[3];
-      }
-    }
-
-#endif
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int64_t n = 0; n < Policy::OperatorCount::kColumn; ++n) {
-      
-#if CUTLASS_EPILOGUE_WARP_TILE_ITERATOR_TENSOR_OP_MIXED_OPTIMIZATION_ENABLED
-
-      //
-      // When the optimization is enabled, this expression suffices to obtain the SMEM pointer.
-      //
-      if (WarpShape::kN == 64) {
-        ptr = pointers_[n / 4];
-      }
-      else if (!kN32_optimization)
-#endif
-      {
-        // This is the reference implementation
-        int column_idx = warp_column_ + n * Detail::kLanesInQuad * Policy::kElementsPerAccess;
-        int ptr_idx = ((column_idx * sizeof_bits<Element>::value) / 1024) % Detail::kPointerCount;
-  
-        if (ptr_idx == 0) {
-          ptr = pointers_[0 % Detail::kPointerCount];
-        }
-        else if (ptr_idx == 1) {
-          ptr = pointers_[1 % Detail::kPointerCount];
-        }
-        else if (ptr_idx == 2) {
-          ptr = pointers_[2 % Detail::kPointerCount];
-        }
-        else if (ptr_idx == 3) {
-          ptr = pointers_[3 % Detail::kPointerCount];
-        }
-      }
-
-      int offset = n * Detail::kLanesInQuad + pointer_offset / Policy::kElementsPerAccess;
-      ptr[offset] = frag_ptr[n];
-    }
-  }
-
-  /// Store
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-
-  /// Load
-  CUTLASS_HOST_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
-
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int64_t n = 0; n < Policy::OperatorCount::kColumn; ++n) {
-
-      int column_idx = warp_column_ + n * Detail::kLanesInQuad * Policy::kElementsPerAccess;
-      int ptr_idx = ((column_idx * sizeof_bits<Element>::value) / 1024) % Detail::kPointerCount;
-
-      AccessType const *smem_ptr = pointers_[ptr_idx];
-      frag_ptr[n] = smem_ptr[n * Detail::kLanesInQuad + pointer_offset / Policy::kElementsPerAccess];
-    }
-  }
-
-  /// Load
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-    load_with_pointer_offset(frag, 0);
-  }
-  
-  /// Set smem base address
-  CUTLASS_HOST_DEVICE
-  void set_smem_base_address(Index address) {
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for int32_t x 16 => int8_t/int4b_t x 16
-template <
-  typename WarpShape_,            ///< shape of warp-level GEMM (concept: GemmShape)
-  typename OperatorShape_,        ///< matrix multiply operation shape (concept: gemm::GemmShape),
-  int OutputSizeBits              ///< Size of output element in bits
->
-class TileIteratorTensorOpMixed<WarpShape_, OperatorShape_, int32_t, 32, OutputSizeBits, 16, 8, true> {
-public:
-
-  using WarpShape = WarpShape_;
-  using OperatorShape = OperatorShape_;
-  using Element = int32_t;
-  using Layout = layout::RowMajor;
-  static int const kOutputElementCount = 16;
-
-  using TensorRef = TensorRef<Element, Layout>;         ///< Tensor Reference object
-  using TensorCoord = MatrixCoord;                      ///< Logical coordinate in referenced tensor
-  using Index = typename TensorRef::Index;
-  using LongIndex = typename TensorRef::LongIndex;
-
-  using Policy = TensorOpPolicy<WarpShape, OperatorShape, Layout>;
-
-  /// Shape of the tile in memory
-  using Shape = MatrixShape<
-    Policy::kRowsPerIteration,
-    WarpShape::kN
-  >;
-
-  /// This is the fragment size produced by one access of the iterator.
-  using Fragment = Array<
-    Element, 
-    Policy::OperatorCount::kColumn * Policy::kElementsPerAccess>;
-
-  /// This is the complete warp-level accumulator tile.
-  //using AccumulatorTile = typename Operator::FragmentC;
-
-  /// Number of times this iterator can be incremented
-  static int const kIterations = Policy::kIterations;
-
-  // Internal constants
-  struct Detail {
-    static int const kLanesInQuad = 4;
-
-    /// Number of pointers needed to write accumulators
-    static int const kPointerCount = 2;
-
-    /// Offsets added 
-    static int const kOffsetCount = 4;
-
-    static_assert(sizeof(Element) == 4, "This can only be used with 32b accumulator data types (f32, s32).");
-  };
-
-  /// Padding quantity
-  using Padding = MatrixShape<0, Detail::kLanesInQuad * 2>;
-
-private:
-
-  /// Storage type for accessing memory
-  using AccessType = AlignedArray<Element, 2>;
-
-  //
-  // Data members
-  //
-
-  /// Internal pointer to memory
-  AccessType *pointers_[Detail::kPointerCount] = {nullptr};
-
-  /// Stride in units of AccessType
-  int stride_{0};
-
-  /// Uniform offset in bytes added to warp tile iterator
-  int uniform_offset_[Detail::kOffsetCount] = {0};
-
-public:
-
-  /// Default constructor
-  TileIteratorTensorOpMixed() = default;
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOpMixed(
-    TensorRef const &ref,
-    unsigned lane_id
-  ):
-    stride_(ref.stride()[0] / AccessType::kElements) { 
-
-    int quad_id = (lane_id / Detail::kLanesInQuad); 
-    int lane_in_quad = (lane_id % Detail::kLanesInQuad);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < Detail::kPointerCount; ++i) {
-      AccessType *ptr = reinterpret_cast<AccessType *>(ref.data()) + quad_id * stride_;
-      int column_idx = lane_in_quad ^ (i * 2);
-
-      ptr += column_idx;
-    
-      if (i == 0) {
-        pointers_[0] = ptr;
-      }
-      else if (i == 1) {
-        pointers_[1] = ptr;
-      }
-    }
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < Detail::kOffsetCount; ++i) {
-      uniform_offset_[i] = (i ^ 0) * 4 * sizeof(AccessType);
-    }
-  }
-
-  /// Adds a pointer offset
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOpMixed & add_pointer_offset(Index pointer_offset) {
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int64_t i = 0; i < Detail::kPointerCount; ++i) {
-      pointers_[i] += pointer_offset / AccessType::kElements;
-    }
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOpMixed & add_tile_offset(TensorCoord const &tile_offset) {
-    
-    int ptr_offset = tile_offset.row() * Shape::kRow * stride_ + 
-      tile_offset.column() * Shape::kColumn / AccessType::kElements;
-
-    pointers_[0] += ptr_offset;
-    pointers_[1] += ptr_offset;
-    
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < Detail::kOffsetCount; ++i) {
-      uniform_offset_[i] = (i ^ tile_offset.column()) * 4 * sizeof(AccessType);
-    }
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOpMixed & operator+=(TensorCoord const &tile_offset) {
-    return add_tile_offset(tile_offset);
-  }
-
-  /// Store
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-
-    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < Policy::OperatorCount::kColumn; ++n) {
-
-      int ptr_idx = (n / 4);
-      int offset_idx = (n % 4);
-
-      AccessType *ptr;
-      if (ptr_idx == 0) {
-        ptr = pointers_[0];
-      }
-      else if (ptr_idx == 1) {
-        ptr = pointers_[1];
-      }
-
-      int offset = (n / 4) * 16 + pointer_offset / AccessType::kElements;
-
-#if 0
-      //
-      // Using inline PTX to avoid generic memory
-      //
-      AccessType *smem_ptr = pointers_[ptr_idx];
-      smem_ptr[offset] = frag_ptr[n];
-#else
-      uint32_t smem_addr = arch::cutlass_get_smem_pointer(ptr);
-      uint32_t const *data = reinterpret_cast<uint32_t const *>(frag_ptr + n);
-      uint32_t offset_in_bytes = offset * sizeof(AccessType) + uniform_offset_[offset_idx];
-
-      asm volatile(
-        "{ .reg .u32 smem_ptr; add.u32 smem_ptr, %0, %1; st.shared.v2.u32 [smem_ptr], {%2, %3}; }\n"
-        : : "r"(smem_addr), "r"(offset_in_bytes), "r"(data[0]), "r"(data[1])
-      );
-#endif
-    }
-  }
-
-  /// Store
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-
-  /// Set smem base address
-  CUTLASS_HOST_DEVICE
-  void set_smem_base_address(Index address) {
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for int32_t x 8 => int8_t/int4b_t x 8
-template <
-  typename WarpShape_,            ///< shape of warp-level GEMM (concept: GemmShape)
-  typename OperatorShape_,        ///< matrix multiply operation shape (concept: gemm::GemmShape)
-  int OutputSizeBits              ///< Size of output element in bits
->
-class TileIteratorTensorOpMixed<WarpShape_, OperatorShape_, int32_t, 32, OutputSizeBits, 8, 8, true> {
-public:
-
-  using WarpShape = WarpShape_;
-  using OperatorShape = OperatorShape_;
-  using Element = int32_t;
-  using Layout = layout::RowMajor;
-  static int const kOutputElementCount = 8;
-
-  using TensorRef = TensorRef<Element, Layout>;         ///< Tensor Reference object
-  using TensorCoord = MatrixCoord;                      ///< Logical coordinate in referenced tensor
-  using Index = typename TensorRef::Index;
-  using LongIndex = typename TensorRef::LongIndex;
-
-  using Policy = TensorOpPolicy<WarpShape, OperatorShape, Layout>;
-
-  /// Shape of the tile in memory
-  using Shape = MatrixShape<
-    Policy::kRowsPerIteration,
-    WarpShape::kN
-  >;
-
-  /// This is the fragment size produced by one access of the iterator.
-  using Fragment = Array<
-    Element, 
-    Policy::OperatorCount::kColumn * Policy::kElementsPerAccess>;
-
-  /// This is the complete warp-level accumulator tile.
-  //using AccumulatorTile = typename Operator::FragmentC;
-
-  /// Number of times this iterator can be incremented
-  static int const kIterations = Policy::kIterations;
-
-  // Internal constants
-  struct Detail {
-    static int const kLanesInQuad = 4;
-
-    /// Number of pointers needed to write accumulators
-    static int const kPointerCount = 2;
-
-    static_assert(sizeof(Element) == 4, "This can only be used with 32b accumulator data types (f32, s32).");
-  };
-
-  /// Padding quantity
-  using Padding = MatrixShape<0, Detail::kLanesInQuad * 2>;
-
-private:
-
-  /// Storage type for accessing memory
-  using AccessType = AlignedArray<Element, 2>;
-
-  //
-  // Data members
-  //
-
-  /// Internal pointer to memory
-  AccessType *pointers_[Detail::kPointerCount] = {nullptr};
-
-  /// Stride in units of AccessType
-  int stride_{0};
-
-public:
-
-  /// Default constructor
-  TileIteratorTensorOpMixed() = default;
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOpMixed(
-    TensorRef const &ref,
-    unsigned lane_id
-  ):
-    stride_(ref.stride()[0] / AccessType::kElements) { 
-
-    int quad_id = (lane_id / Detail::kLanesInQuad); 
-    int lane_in_quad = (lane_id % Detail::kLanesInQuad);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < Detail::kPointerCount; ++i) {
-      AccessType *ptr = reinterpret_cast<AccessType *>(ref.data()) + quad_id * stride_;
-      int column_idx = lane_in_quad ^ (i * 2);
-
-      ptr += column_idx;
-    
-      if (i == 0) {
-        pointers_[0] = ptr;
-      }
-      else if (i == 1) {
-        pointers_[1] = ptr;
-      }
-    }
-  }
-
-  /// Adds a pointer offset
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOpMixed & add_pointer_offset(Index pointer_offset) {
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int64_t i = 0; i < Detail::kPointerCount; ++i) {
-      pointers_[i] += pointer_offset / AccessType::kElements;
-    }
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOpMixed & add_tile_offset(TensorCoord const &tile_offset) {
-    
-    int ptr_offset = tile_offset.row() * Shape::kRow * stride_ + 
-      tile_offset.column() * Shape::kColumn / AccessType::kElements;
-
-    pointers_[0] += ptr_offset;
-    pointers_[1] += ptr_offset;
-   
-    if (tile_offset.column() % 2) {
-      auto tmp = pointers_[0];
-      pointers_[0] = pointers_[1];
-      pointers_[1] = tmp;
-    }
- 
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOpMixed & operator+=(TensorCoord const &tile_offset) {
-    return add_tile_offset(tile_offset);
-  }
-
-  /// Store
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-
-    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < Policy::OperatorCount::kColumn; ++n) {
-
-      int ptr_idx = (n / 4);
-
-      AccessType *ptr;
-      if (ptr_idx == 0) {
-        ptr = pointers_[0];
-      }
-      else if (ptr_idx == 1) {
-        ptr = pointers_[1];
-      }
-
-      int offset = (n / 4) * 16 + pointer_offset / AccessType::kElements + (n % 4) * 4;
-
-#if 0
-      //
-      // Using inline PTX to avoid generic memory
-      //
-      AccessType *smem_ptr = pointers_[ptr_idx];
-      smem_ptr[offset] = frag_ptr[n];
-#else
-      uint32_t smem_addr = arch::cutlass_get_smem_pointer(ptr);
-      uint32_t const *data = reinterpret_cast<uint32_t const *>(frag_ptr + n);
-      uint32_t offset_in_bytes = offset * sizeof(AccessType);
-
-      asm volatile(
-        "{ .reg .u32 smem_ptr; add.u32 smem_ptr, %0, %1; st.shared.v2.u32 [smem_ptr], {%2, %3}; }\n"
-        : : "r"(smem_addr), "r"(offset_in_bytes), "r"(data[0]), "r"(data[1])
-      );
-#endif
-    }
-  }
-
-  /// Store
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-
-  /// Set smem base address
-  CUTLASS_HOST_DEVICE
-  void set_smem_base_address(Index address) {
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for float x 16 => float_e4m3_t/float_e5m2_t x 16
-template <
-  typename WarpShape_,            ///< shape of warp-level GEMM (concept: GemmShape)
-  typename OperatorShape_         ///< matrix multiply operation shape (concept: gemm::GemmShape),
->
-class TileIteratorTensorOpMixed<WarpShape_, OperatorShape_, float, 32, 8, 16, 8> {
-public:
-
-  using WarpShape = WarpShape_;
-  using OperatorShape = OperatorShape_;
-  using Element = float;
-  using Layout = layout::RowMajor;
-  static int const kOutputElementCount = 16;
-
-  using TensorRef = TensorRef<Element, Layout>;         ///< Tensor Reference object
-  using TensorCoord = MatrixCoord;                      ///< Logical coordinate in referenced tensor
-  using Index = typename TensorRef::Index;
-  using LongIndex = typename TensorRef::LongIndex;
-
-  using Policy = TensorOpPolicy<WarpShape, OperatorShape, Layout>;
-
-  /// Shape of the tile in memory
-  using Shape = MatrixShape<
-    Policy::kRowsPerIteration,
-    WarpShape::kN
-  >;
-
-  /// This is the fragment size produced by one access of the iterator.
-  using Fragment = Array<
-    Element,
-    Policy::OperatorCount::kColumn * Policy::kElementsPerAccess>;
-
-  /// This is the complete warp-level accumulator tile.
-  //using AccumulatorTile = typename Operator::FragmentC;
-
-  /// Number of times this iterator can be incremented
-  static int const kIterations = Policy::kIterations;
-
-  // Internal constants
-  struct Detail {
-    static int const kLanesInQuad = 4;
-
-    /// Number of pointers needed to write accumulators
-    static int const kPointerCount = 2;
-
-    /// Offsets added
-    static int const kOffsetCount = 4;
-
-    static_assert(sizeof(Element) == 4, "This can only be used with 32b accumulator data types (f32, s32).");
-  };
-
-  /// Padding quantity
-  using Padding = MatrixShape<0, Detail::kLanesInQuad * 2>;
-
-private:
-
-  /// Storage type for accessing memory
-  using AccessType = AlignedArray<Element, 2>;
-
-  //
-  // Data members
-  //
-
-  /// Internal pointer to memory
-  AccessType *pointers_[Detail::kPointerCount] = {nullptr};
-
-  /// Stride in units of AccessType
-  int stride_{0};
-
-  /// Uniform offset in bytes added to warp tile iterator
-  int uniform_offset_[Detail::kOffsetCount] = {0};
-
-public:
-
-  /// Default constructor
-  TileIteratorTensorOpMixed() = default;
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOpMixed(
-    TensorRef const &ref,
-    unsigned lane_id
-  ):
-    stride_(ref.stride()[0] / AccessType::kElements) {
-
-    int quad_id = (lane_id / Detail::kLanesInQuad);
-    int lane_in_quad = (lane_id % Detail::kLanesInQuad);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < Detail::kPointerCount; ++i) {
-      AccessType *ptr = reinterpret_cast<AccessType *>(ref.data()) + quad_id * stride_;
-      int column_idx = lane_in_quad ^ (i * 2);
-
-      ptr += column_idx;
-
-      if (i == 0) {
-        pointers_[0] = ptr;
-      }
-      else if (i == 1) {
-        pointers_[1] = ptr;
-      }
-    }
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < Detail::kOffsetCount; ++i) {
-      uniform_offset_[i] = (i ^ 0) * 4 * sizeof(AccessType);
-    }
-  }
-
-  /// Adds a pointer offset
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOpMixed & add_pointer_offset(Index pointer_offset) {
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int64_t i = 0; i < Detail::kPointerCount; ++i) {
-      pointers_[i] += pointer_offset / AccessType::kElements;
-    }
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOpMixed & add_tile_offset(TensorCoord const &tile_offset) {
-
-    int ptr_offset = tile_offset.row() * Shape::kRow * stride_ +
-      tile_offset.column() * Shape::kColumn / AccessType::kElements;
-
-    pointers_[0] += ptr_offset;
-    pointers_[1] += ptr_offset;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < Detail::kOffsetCount; ++i) {
-      uniform_offset_[i] = (i ^ tile_offset.column()) * 4 * sizeof(AccessType);
-    }
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOpMixed & operator+=(TensorCoord const &tile_offset) {
-    return add_tile_offset(tile_offset);
-  }
-
-  /// Store
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-
-    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < Policy::OperatorCount::kColumn; ++n) {
-
-      int ptr_idx = (n / 4);
-      int offset_idx = (n % 4);
-
-      AccessType *ptr;
-      if (ptr_idx == 0) {
-        ptr = pointers_[0];
-      }
-      else if (ptr_idx == 1) {
-        ptr = pointers_[1];
-      }
-
-      int offset = (n / 4) * 16 + pointer_offset / AccessType::kElements;
-
-#if 0
-      //
-      // Using inline PTX to avoid generic memory
-      //
-      AccessType *smem_ptr = pointers_[ptr_idx];
-      smem_ptr[offset] = frag_ptr[n];
-#else
-      uint32_t smem_addr = arch::cutlass_get_smem_pointer(ptr);
-      uint32_t const *data = reinterpret_cast<uint32_t const *>(frag_ptr + n);
-      uint32_t offset_in_bytes = offset * sizeof(AccessType) + uniform_offset_[offset_idx];
-
-      asm volatile(
-        "{ .reg .u32 smem_ptr; add.u32 smem_ptr, %0, %1; st.shared.v2.u32 [smem_ptr], {%2, %3}; }\n"
-        : : "r"(smem_addr), "r"(offset_in_bytes), "r"(data[0]), "r"(data[1])
-      );
-#endif
-    }
-  }
-
-  /// Store
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for float x 8 => float_e4m3_t/float_e5m2_t x 8
-template <
-  typename WarpShape_,            ///< shape of warp-level GEMM (concept: GemmShape)
-  typename OperatorShape_         ///< matrix multiply operation shape (concept: gemm::GemmShape)
->
-class TileIteratorTensorOpMixed<WarpShape_, OperatorShape_, float, 32, 8, 8, 8> {
-public:
-
-  using WarpShape = WarpShape_;
-  using OperatorShape = OperatorShape_;
-  using Element = float;
-  using Layout = layout::RowMajor;
-  static int const kOutputElementCount = 8;
-
-  using TensorRef = TensorRef<Element, Layout>;         ///< Tensor Reference object
-  using TensorCoord = MatrixCoord;                      ///< Logical coordinate in referenced tensor
-  using Index = typename TensorRef::Index;
-  using LongIndex = typename TensorRef::LongIndex;
-
-  using Policy = TensorOpPolicy<WarpShape, OperatorShape, Layout>;
-
-  /// Shape of the tile in memory
-  using Shape = MatrixShape<
-    Policy::kRowsPerIteration,
-    WarpShape::kN
-  >;
-
-  /// This is the fragment size produced by one access of the iterator.
-  using Fragment = Array<
-    Element,
-    Policy::OperatorCount::kColumn * Policy::kElementsPerAccess>;
-
-  /// This is the complete warp-level accumulator tile.
-  //using AccumulatorTile = typename Operator::FragmentC;
-
-  /// Number of times this iterator can be incremented
-  static int const kIterations = Policy::kIterations;
-
-  // Internal constants
-  struct Detail {
-    static int const kLanesInQuad = 4;
-
-    /// Number of pointers needed to write accumulators
-    static int const kPointerCount = 2;
-
-    static_assert(sizeof(Element) == 4, "This can only be used with 32b accumulator data types (f32, s32).");
-  };
-
-  /// Padding quantity
-  using Padding = MatrixShape<0, Detail::kLanesInQuad * 2>;
-
-private:
-
-  /// Storage type for accessing memory
-  using AccessType = AlignedArray<Element, 2>;
-
-  //
-  // Data members
-  //
-
-  /// Internal pointer to memory
-  AccessType *pointers_[Detail::kPointerCount] = {nullptr};
-
-  /// Stride in units of AccessType
-  int stride_{0};
-
-public:
-
-  /// Default constructor
-  TileIteratorTensorOpMixed() = default;
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOpMixed(
-    TensorRef const &ref,
-    unsigned lane_id
-  ):
-    stride_(ref.stride()[0] / AccessType::kElements) {
-
-    int quad_id = (lane_id / Detail::kLanesInQuad);
-    int lane_in_quad = (lane_id % Detail::kLanesInQuad);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < Detail::kPointerCount; ++i) {
-      AccessType *ptr = reinterpret_cast<AccessType *>(ref.data()) + quad_id * stride_;
-      int column_idx = lane_in_quad ^ (i * 2);
-
-      ptr += column_idx;
-
-      if (i == 0) {
-        pointers_[0] = ptr;
-      }
-      else if (i == 1) {
-        pointers_[1] = ptr;
-      }
-    }
-  }
-
-  /// Adds a pointer offset
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOpMixed & add_pointer_offset(Index pointer_offset) {
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int64_t i = 0; i < Detail::kPointerCount; ++i) {
-      pointers_[i] += pointer_offset / AccessType::kElements;
-    }
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOpMixed & add_tile_offset(TensorCoord const &tile_offset) {
-
-    int ptr_offset = tile_offset.row() * Shape::kRow * stride_ +
-      tile_offset.column() * Shape::kColumn / AccessType::kElements;
-
-    pointers_[0] += ptr_offset;
-    pointers_[1] += ptr_offset;
-
-    if (tile_offset.column() % 2) {
-      auto tmp = pointers_[0];
-      pointers_[0] = pointers_[1];
-      pointers_[1] = tmp;
-    }
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_HOST_DEVICE
-  TileIteratorTensorOpMixed & operator+=(TensorCoord const &tile_offset) {
-    return add_tile_offset(tile_offset);
-  }
-
-  /// Store
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-
-    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < Policy::OperatorCount::kColumn; ++n) {
-
-      int ptr_idx = (n / 4);
-
-      AccessType *ptr;
-      if (ptr_idx == 0) {
-        ptr = pointers_[0];
-      }
-      else if (ptr_idx == 1) {
-        ptr = pointers_[1];
-      }
-
-      int offset = (n / 4) * 16 + pointer_offset / AccessType::kElements + (n % 4) * 4;
-
-#if 0
-      //
-      // Using inline PTX to avoid generic memory
-      //
-      AccessType *smem_ptr = pointers_[ptr_idx];
-      smem_ptr[offset] = frag_ptr[n];
-#else
-      uint32_t smem_addr = arch::cutlass_get_smem_pointer(ptr);
-      uint32_t const *data = reinterpret_cast<uint32_t const *>(frag_ptr + n);
-      uint32_t offset_in_bytes = offset * sizeof(AccessType);
-
-      asm volatile(
-        "{ .reg .u32 smem_ptr; add.u32 smem_ptr, %0, %1; st.shared.v2.u32 [smem_ptr], {%2, %3}; }\n"
-        : : "r"(smem_addr), "r"(offset_in_bytes), "r"(data[0]), "r"(data[1])
-      );
-#endif
-    }
-  }
-
-  /// Store
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-#undef CUTLASS_EPILOGUE_WARP_TILE_ITERATOR_TENSOR_OP_MIXED_OPTIMIZATION_ENABLED
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_volta_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_volta_tensor_op.h
deleted file mode 100755
index 8ce4750c3..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_volta_tensor_op.h
+++ /dev/null
@@ -1,440 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief 
-*/
-
-#pragma once
-
-#include "cutlass/array.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/pitch_linear.h"
-
-#include "cutlass/epilogue/warp/tensor_op_policy.h"
-#include "cutlass/epilogue/warp/volta_tensor_op_policy.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace warp {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Template for reading and writing tiles of accumulators to shared memory
-template <
-  typename WarpShape,             ///< shape of warp-level GEMM (concept: MatrixShape)
-  typename InterleavedTileShape,  ///< shape of indivisible instruction-level arrangement (concept: GemmShape)
-  typename ElementC,              ///< Accumulator layout
-  typename Layout                 ///< target shared memory layout
->
-struct TileIteratorVoltaTensorOp; 
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Template for reading and writing tiles of accumulators to shared memory
-template <
-  typename WarpShape_         ///< shape of warp-level GEMM (concept: MatrixShape)
->
-struct TileIteratorVoltaTensorOp<WarpShape_, gemm::GemmShape<32, 32, 4>, half_t, layout::RowMajor> {
-public:
-
-  using WarpShape = WarpShape_;
-  using InterleavedTileShape = gemm::GemmShape<32, 32, 4>;
-  using Element = half_t;
-  using Layout = layout::RowMajor;
-
-  using TensorRef = TensorRef<Element, Layout>;         ///< Tensor Reference object
-  using TensorCoord = MatrixCoord;                      ///< Logical coordinate in referenced tensor
-  using Index = typename TensorRef::Index;
-  using LongIndex = typename TensorRef::LongIndex;
-
-  using Policy = VoltaTensorOpPolicy<WarpShape, InterleavedTileShape, Element, Layout>;
-
-  /// Shape of the tile in memory
-  using Shape = MatrixShape<
-    Policy::kRowsPerIteration,
-    WarpShape::kN
-  >;
-
-  /// Array type for aligned memory accesses
-  using AccessType = typename Policy::AccessType;
-  
-  /// This is the fragment size produced by one access of the iterator.
-  using Fragment = typename Policy::Fragment;
-
-  /// This is the complete warp-level accumulator tile.
-  using AccumulatorTile = typename Policy::AccumulatorTile;
-
-  /// Number of times this iterator can be incremented
-  static int const kIterations = Policy::kIterations;
-
-  /// Number of elements per access
-  static int const kElementsPerAccess = Policy::kElementsPerAccess;
-
-  // Internal constants
-  struct Detail {
-    static int const kLanesInQuad = 4;
-    static int const kRowsPerQuad = 4;
-    static int const kColumnsPerQuad = 8;
-    static int const kAccessesPerQuad = kColumnsPerQuad / Policy::kElementsPerAccess;
-    static int const kAccessQuadDelta = 16;
-  };
-
-  /// Padding quantity
-  using Padding = MatrixShape<
-    0,
-    Policy::kElementsPerAccess>;
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Internal pointer to memory
-  AccessType *pointer_;
-
-  /// Internal layout object
-  Layout layout_;
-
-public:
-
-  /// Default constructor
-  CUTLASS_HOST_DEVICE
-  TileIteratorVoltaTensorOp(): pointer_(nullptr) { }
-
-  /// Constructor from TensorRef
-  CUTLASS_DEVICE
-  TileIteratorVoltaTensorOp(
-    TensorRef const &ref,
-    unsigned lane_id
-  ):
-    pointer_(reinterpret_cast<AccessType *>(ref.data())),
-    layout_(ref.stride()[0] / Policy::kElementsPerAccess) { 
-
-    int quad_id = lane_id / Detail::kLanesInQuad;
-    int lane_in_quad = (lane_id % Detail::kLanesInQuad);
-
-    int quad_row_idx = ((quad_id & 4) >> 1) + (quad_id & 1);
-    int quad_col_idx = ((quad_id & 2) >> 1);
-
-    int row = quad_row_idx * Detail::kRowsPerQuad + lane_in_quad;
-    int column = quad_col_idx * Detail::kColumnsPerQuad;
-
-    pointer_ += layout_({row, column / kElementsPerAccess});
-  }
-
-  /// Adds a pointer offset
-  CUTLASS_HOST_DEVICE
-  TileIteratorVoltaTensorOp & add_pointer_offset(Index pointer_offset) {
-    pointer_ += pointer_offset / Policy::kElementsPerAccess;
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_HOST_DEVICE
-  TileIteratorVoltaTensorOp & add_tile_offset(TensorCoord const &tile_offset) {
-
-    pointer_ += layout_({
-      tile_offset.row() * Shape::kRow, 
-      tile_offset.column() * Shape::kColumn / Policy::kElementsPerAccess});
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_HOST_DEVICE
-  TileIteratorVoltaTensorOp & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  /// Store
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-
-    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int tile_idx = 0; tile_idx < Policy::TileIterations::kColumn; ++tile_idx) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int access_idx = 0; access_idx < Policy::kAccessesPerInterleavedTile; ++access_idx) {
-
-        int access_quad = access_idx / 2;
-        int access = access_idx % 2;
-
-        int ptr_offset = tile_idx * InterleavedTileShape::kN / Policy::kElementsPerAccess +
-          access_quad * Detail::kAccessQuadDelta / Policy::kElementsPerAccess + 
-          access + pointer_offset / Policy::kElementsPerAccess;
-
-        int frag_idx = tile_idx * Policy::kAccessesPerInterleavedTile + access_idx;
-
-        AccessType access_vector = frag_ptr[frag_idx];
-
-        pointer_[ptr_offset] = access_vector;
-      }
-    }
-  }
-
-  /// Store
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-
-  /// Load
-  CUTLASS_HOST_DEVICE
-  void load_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int tile_idx = 0; tile_idx < Policy::TileIterations::kColumn; ++tile_idx) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int access_idx = 0; access_idx < Policy::kAccessesPerInterleavedTile; ++access_idx) {
-
-        int access_quad = access_idx / 2;
-        int access = access_idx % 2;
-
-        int ptr_offset = tile_idx * Detail::kTileDelta + access_quad * Detail::kAccessQuadDelta + 
-          access + pointer_offset / Policy::kElementsPerAccess;
-
-        int frag_idx = tile_idx * Policy::kAccessesPerInterleavedTile + access_idx;
-
-        frag_ptr[frag_idx] = pointer_[ptr_offset];
-      }
-    }
-  }
-
-  /// Load
-  CUTLASS_HOST_DEVICE
-  void load(Fragment const &frag) {
-    load_with_pointer_offset(frag, 0);
-  }
-  
-  /// Set smem base address
-  CUTLASS_HOST_DEVICE
-  void set_smem_base_address(Index address) {
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Template for reading and writing tiles of accumulators to shared memory
-template <
-  typename WarpShape_         ///< shape of warp-level GEMM (concept: MatrixShape)
->
-struct TileIteratorVoltaTensorOp<WarpShape_, gemm::GemmShape<32, 32, 4>, float, layout::RowMajor> {
-public:
-
-  using WarpShape = WarpShape_;
-  using InterleavedTileShape = gemm::GemmShape<32, 32, 4>;
-  using Element = float;
-  using Layout = layout::RowMajor;
-
-  using TensorRef = TensorRef<Element, Layout>;         ///< Tensor Reference object
-  using TensorCoord = MatrixCoord;                      ///< Logical coordinate in referenced tensor
-  using Index = typename TensorRef::Index;
-  using LongIndex = typename TensorRef::LongIndex;
-
-  using Policy = VoltaTensorOpPolicy<WarpShape, InterleavedTileShape, Element, Layout>;
-
-  /// Shape of the tile in memory
-  using Shape = MatrixShape<
-    Policy::kRowsPerIteration,
-    WarpShape::kN
-  >;
-
-  /// Array type for aligned memory accesses
-  using AccessType = typename Policy::AccessType;
-  
-  /// This is the fragment size produced by one access of the iterator.
-  using Fragment = typename Policy::Fragment;
-
-  /// This is the complete warp-level accumulator tile.
-  using AccumulatorTile = typename Policy::AccumulatorTile;
-
-  /// Number of times this iterator can be incremented
-  static int const kIterations = Policy::kIterations;
-
-  /// Number of elements per access
-  static int const kElementsPerAccess = Policy::kElementsPerAccess;
-
-  // Internal constants
-  struct Detail {
-    static int const kLanesInQuad = 4;
-    static int const kRowsPerQuad = 4;
-    static int const kColumnsPerQuad = 8;
-    static int const kAccessesPerQuad = kColumnsPerQuad / Policy::kElementsPerAccess;
-    static int const kAccessQuadDelta = 16;
-  };
-
-  /// Padding quantity
-  using Padding = MatrixShape<
-    0,
-    Policy::kElementsPerAccess>;
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Internal pointer to memory
-  AccessType *pointer_;
-
-  /// Internal layout object
-  Layout layout_;
-
-public:
-
-  /// Default constructor
-  CUTLASS_HOST_DEVICE
-  TileIteratorVoltaTensorOp(): pointer_(nullptr) { }
-
-  /// Constructor from TensorRef
-  CUTLASS_DEVICE
-  TileIteratorVoltaTensorOp(
-    TensorRef const &ref,
-    unsigned lane_id
-  ):
-    pointer_(reinterpret_cast<AccessType *>(ref.data())),
-    layout_(ref.stride()[0] / Policy::kElementsPerAccess) { 
-
-    int quad_id = lane_id / Detail::kLanesInQuad;
-    int lane_in_quad = (lane_id % Detail::kLanesInQuad);
-
-    int const kQuadRowDelta = 4;
-    int const kQuadColumnDelta = 2 * Policy::MmaIterations::kColumn;
-
-    int quad_row_offset = ((quad_id & 4) / 2 + (quad_id & 1)) * kQuadRowDelta;
-    int quad_column_offset = (quad_id & 2) / 2 * kQuadColumnDelta;
-
-    int thread_row_offset = (lane_in_quad & 1);
-    int thread_column_offset = (lane_in_quad & 2) / 2;
-
-    int row = quad_row_offset + thread_row_offset;
-    int column = quad_column_offset + thread_column_offset;
-
-    pointer_ += layout_({row, column});
-  }
-
-  /// Adds a pointer offset
-  CUTLASS_HOST_DEVICE
-  TileIteratorVoltaTensorOp & add_pointer_offset(Index pointer_offset) {
-    pointer_ += pointer_offset / Policy::kElementsPerAccess;
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_HOST_DEVICE
-  TileIteratorVoltaTensorOp & add_tile_offset(TensorCoord const &tile_offset) {
-
-    pointer_ += layout_({
-      tile_offset.row() * Shape::kRow, 
-      tile_offset.column() * Shape::kColumn / Policy::kElementsPerAccess});
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_HOST_DEVICE
-  TileIteratorVoltaTensorOp & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  /// Store
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-
-    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
-
-    int const kAccessesPerRow = Policy::TileIterations::kColumn * Policy::MmaIterations::kColumn * 2;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int row_idx = 0; row_idx < Policy::kRowsPerMmaTile; ++row_idx) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int access_idx = 0; access_idx < kAccessesPerRow; ++access_idx) {
-
-        int frag_idx = row_idx * kAccessesPerRow + access_idx;
-
-        int ptr_column_offset = (access_idx & 1) * 2 + 
-          (access_idx & 2) * Policy::MmaIterations::kColumn * 2 + 
-          (access_idx & 4) * Policy::MmaIterations::kColumn * 2;
-
-        int ptr_row_offset = row_idx * 2;
-
-        int ptr_offset = layout_({ptr_row_offset, ptr_column_offset}) + pointer_offset / Policy::kElementsPerAccess;
-
-        pointer_[ptr_offset] = frag_ptr[frag_idx];
-      }
-    }
-  }
-
-  /// Store
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-
-  /// Load
-  CUTLASS_HOST_DEVICE
-  void load_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    assert(0);
-  }
-
-  /// Load
-  CUTLASS_HOST_DEVICE
-  void load(Fragment const &frag) {
-    load_with_pointer_offset(frag, 0);
-  }
-  
-  /// Set smem base address
-  CUTLASS_HOST_DEVICE
-  void set_smem_base_address(Index address) {
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_wmma_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_wmma_tensor_op.h
deleted file mode 100755
index 951833d4e..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_wmma_tensor_op.h
+++ /dev/null
@@ -1,227 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief 
-*/
-
-#pragma once
-
-#if !(defined(__clang__) && defined(__CUDA__))
-
-#include "cutlass/cutlass.h"
-#include "cutlass/wmma_array.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/tensor_ref.h"
-
-#include "cutlass/epilogue/warp/wmma_tensor_op_policy.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace warp {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Template for reading and writing tiles of accumulators to shared memory
-template <
-  typename WarpShape,           ///< shape of warp-level GEMM (concept: MatrixShape)
-  typename OperatorShape,       ///< matrix multiply operation shape (concept: gemm::GemmShape)
-  typename OperatorFragment,    ///< wmma fragment to be written (concept: nvcuda::wmma::fragment)
-  typename Layout               ///< target shared memory layout
->
-class TileIteratorWmmaTensorOp;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Template for reading and writing tiles of accumulators to shared memory
-template <
-  typename WarpShape_,          ///< shape of warp-level GEMM (concept: GemmShape)
-  typename OperatorShape_,      ///< matrix multiply operation shape (concept: gemm::GemmShape)
-  typename OperatorFragment_    ///< wmma fragment to be written (concept: nvcuda::wmma::fragment)
->
-class TileIteratorWmmaTensorOp<WarpShape_, OperatorShape_, OperatorFragment_, layout::RowMajor> {
-public:
-
-  using WarpShape = WarpShape_;
-  using OperatorShape = OperatorShape_;
-  using OperatorFragment = OperatorFragment_;
-  using Layout = layout::RowMajor;
-
-  //
-  // Derived types
-  //
-  using WmmaDataType = typename OperatorFragment::element_type;
-  using Element = typename cutlass::arch::WmmaToCutlassDataType<WmmaDataType>::Type; ///< Data Type of element stored in nvcuda::wmma::frament         
-  using TensorRef = TensorRef<Element, Layout>;                                      ///< Tensor Reference object
-  using TensorCoord = MatrixCoord;                                                   ///< Logical coordinate in referenced tensor
-  using Index = typename TensorRef::Index;
-  using LongIndex = typename TensorRef::LongIndex;
-
-  using Policy = WmmaTensorOpPolicy<WarpShape, OperatorShape, Layout>;
-
-  /// Shape of the tile in memory
-  using Shape = MatrixShape<
-    Policy::kRowsPerIteration,
-    WarpShape::kN
-  >;
-
-  /// This is the fragment size produced by one access of the iterator.
-  using Fragment = WmmaFragmentArray<OperatorFragment, Policy::OperatorCount::kColumn * Policy::kWmmaFragmentsPerAccess>;
-
-
-  /// This is the complete warp-level accumulator tile.
-  //using AccumulatorTile = typename Operator::FragmentC;
-
-
-  /// Padding quantity 
-  // (Epilogue shared memory padding for WMMA Gemm kernel is set to run optimaly on Turing)
-  using Padding = MatrixShape<
-    0,
-    4 * Policy::kElementsPerAccess
-  >;
-
-private:
-
-  /// Storage type for accessing memory
-  //using AccessType = AlignedArray<Element, Policy::kElementsPerAccess>;
-
-  //
-  // Data members
-  //
-
-  /// Internal pointer to shared memory
-  TensorRef ref_;
-
-
-public:
-
-  /// Default constructor
-  CUTLASS_HOST_DEVICE
-  TileIteratorWmmaTensorOp(): ref_(nullptr) { 
-
-  }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  TileIteratorWmmaTensorOp(
-    TensorRef const &ref,
-    unsigned lane_id
-  ): ref_(ref) {
-  }
-
-  /// Adds a pointer offset
-  CUTLASS_HOST_DEVICE
-  TileIteratorWmmaTensorOp & add_pointer_offset(Index pointer_offset) {
-    ref_.add_pointer_offset(pointer_offset);
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_HOST_DEVICE
-  TileIteratorWmmaTensorOp & add_tile_offset(TensorCoord const &tile_offset) {
-    ref_.add_coord_offset({tile_offset.row() * OperatorShape::kM, tile_offset.column() * WarpShape::kN});
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_HOST_DEVICE
-  TileIteratorWmmaTensorOp & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  /// Store
-  CUTLASS_HOST_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-
-    for(int n=0; n < Policy::OperatorCount::kColumn; n++) {
-      
-      WmmaDataType* ptr = reinterpret_cast<WmmaDataType*> (ref_.data() + ref_.offset({0, n * OperatorShape::kN}) + pointer_offset);
-
-      nvcuda::wmma::store_matrix_sync(
-        ptr, 
-        frag[n], 
-        ref_.stride()[0], 
-        nvcuda::wmma::layout_t::mem_row_major
-      ); 
-    
-    }
-  }
-
-  /// Store
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-
-  /// Load
-  CUTLASS_HOST_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
- 
-    for(int n=0; n < Policy::OperatorCount::kColumn; n++) {
-
-      WmmaDataType* ptr = reinterpret_cast<WmmaDataType*> (ref_.data() + ref_.offset({0, n * OperatorShape::kN}) + pointer_offset);
-
-      nvcuda::wmma::load_matrix_sync(         
-        frag[n], 
-        ptr,
-        ref_.stride()[0], 
-        nvcuda::wmma::layout_t::mem_row_major
-      ); 
-    
-    }
-  }
-
-  /// Load
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-    load_with_pointer_offset(frag, 0);
-  }
-
-  
-  /// Set smem base address
-  CUTLASS_HOST_DEVICE
-  void set_smem_base_address(Index address) {
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-#endif // !defined(__clang__)
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/volta_tensor_op_policy.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/volta_tensor_op_policy.h
deleted file mode 100755
index f6df868e3..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/volta_tensor_op_policy.h
+++ /dev/null
@@ -1,195 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines basic structures needed for implementing the warp-scoped phase of the epilogue.
-          These quantities assume a 'column-major' arrangement of TensorOp instructions, of which
-          a row-oriented slice is visible per iteration.
-*/
-
-#pragma once
-
-#include "cutlass/matrix_shape.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/gemm/gemm.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace warp {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Policy details related to the epilogue
-template <
-  typename WarpShape,             ///< shape of warp-level GEMM (concept: MatrixShape)
-  typename InterleavedTileShape,  ///< shape of indivisible instruction-level arrangement (concept: GemmShape)
-  typename ElementC,              ///< Accumulator layout
-  typename Layout                 ///< target shared memory layout
->
-struct VoltaTensorOpPolicy; 
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for row-major
-template <
-  typename WarpShape_          ///< shape of warp-level GEMM (concept: GemmShape)
->
-struct VoltaTensorOpPolicy<WarpShape_, gemm::GemmShape<32, 32, 4>, half_t, layout::RowMajor> {
-
-  using WarpShape = WarpShape_;
-  using InterleavedTileShape = gemm::GemmShape<32, 32, 4>;
-  using ElementC = half_t;
-  using Layout = layout::RowMajor;
-
-  /// Shape of one warp-levelinstruction
-  using InstructionShape = gemm::GemmShape<16, 16, 4>;
-
-  /// Number of mma operations performed for one 32x32x4 interleaved tile
-  using MmaIterations = MatrixShape<
-    InterleavedTileShape::kM / InstructionShape::kM,
-    InterleavedTileShape::kN / InstructionShape::kN
-  >;
-
-  /// Number of 32x32x4 interleaved tiles performed to cover the warp-level GEMM shape
-  using TileIterations = MatrixShape<
-    WarpShape::kM / InterleavedTileShape::kM,
-    WarpShape::kN / InterleavedTileShape::kN
-  >;
-
-  /// Number of accumulator elements owned by each thread per Mma
-  static int const kElementsPerMma = 8;
-  static int const kRowsPerIteration = 16;
-
-  //
-  // Hard-coded constants regarding Tensor Operations
-  //
-
-  /// Number of accumulator elements stored per memory instruction to shared memory
-  static int const kElementsPerAccess = 4;
-  
-  /// Number of accesses performed per interleaved tile
-  static int const kAccessesPerInterleavedTile = 4;
-
-  /// Total number of iterations needed to cover the entire tile
-  static int const kIterations = TileIterations::kRow * 2;
-
-  //
-  // Derived types
-  //
-
-  /// Array type for aligned memory accesses
-  using AccessType = AlignedArray<ElementC, kElementsPerAccess>;
-
-  /// This is the fragment size produced by one access of the iterator.
-  using Fragment = Array<
-    ElementC, 
-    kElementsPerAccess * kAccessesPerInterleavedTile * TileIterations::kColumn>;
-
-  /// This is the complete warp-level accumulator tile.
-  using AccumulatorTile = Array<
-    ElementC, 
-    TileIterations::kCount * MmaIterations::kCount * kElementsPerMma>;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for row-major
-template <
-  typename WarpShape_          ///< shape of warp-level GEMM (concept: MatrixShape)
->
-struct VoltaTensorOpPolicy<WarpShape_, gemm::GemmShape<32, 32, 4>, float, layout::RowMajor> {
-
-  using WarpShape = WarpShape_;
-  using InterleavedTileShape = gemm::GemmShape<32, 32, 4>;
-  using ElementC = float;
-  using Layout = layout::RowMajor;
-
-  /// Shape of one warp-levelinstruction
-  using InstructionShape = gemm::GemmShape<16, 16, 4>;
-
-  /// Number of mma operations performed for one 32x32x4 interleaved tile
-  using MmaIterations = MatrixShape<
-    InterleavedTileShape::kM / InstructionShape::kM,
-    InterleavedTileShape::kN / InstructionShape::kN
-  >;
-
-  /// Number of 32x32x4 interleaved tiles performed to cover the warp-level GEMM shape
-  using TileIterations = MatrixShape<
-    WarpShape::kM / InterleavedTileShape::kM,
-    WarpShape::kN / InterleavedTileShape::kN
-  >;
-
-  /// Number of accumulator elements owned by each thread per Mma
-  static int const kElementsPerMma = 8;
-  static int const kRowsPerIteration = 16;
-
-  //
-  // Hard-coded constants regarding Tensor Operations
-  //
-
-  /// Number of accumulator elements stored per memory instruction to shared memory
-  static int const kElementsPerAccess = 2;
-  
-  /// Number of accesses performed per interleaved tile
-  static int const kAccessesPerInterleavedTile = 8;
-
-  /// Number of rows per interleaved tile
-  static int const kRowsPerMmaTile = 2;
-
-  /// Total number of iterations needed to cover the entire tile
-  static int const kIterations = TileIterations::kRow * MmaIterations::kRow;
-
-  //
-  // Derived types
-  //
-  
-  /// Array type for aligned memory accesses
-  using AccessType = AlignedArray<ElementC, kElementsPerAccess>;
-
-  /// This is the fragment size produced by one access of the iterator.
-  using Fragment = Array<
-    ElementC, 
-    kElementsPerAccess * kAccessesPerInterleavedTile * TileIterations::kColumn>;
-
-  /// This is the complete warp-level accumulator tile.
-  using AccumulatorTile = Array<
-    ElementC, 
-    TileIterations::kCount * MmaIterations::kCount * kElementsPerMma>;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace epilogue
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/wmma_tensor_op_policy.h b/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/wmma_tensor_op_policy.h
deleted file mode 100755
index a09c1f792..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/epilogue/warp/wmma_tensor_op_policy.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines basic structures needed for implementing the warp-scoped phase of the epilogue.
-          These quantities assume a 'column-major' arrangement of TensorOp instructions, of which
-          a row-oriented slice is visible per iteration.
-*/
-
-#pragma once
-
-#include "cutlass/arch/wmma.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/layout/matrix.h"
-
-#if defined(CUTLASS_ARCH_WMMA_ENABLED)
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace epilogue {
-namespace warp {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Policy details related to the epilogue
-template <
-  typename WarpShape,     ///< shape of warp-level GEMM (concept: MatrixShape)
-  typename OperatorShape, ///< matrix multiply operation shape (concept: gemm:GemmShape)
-  typename Layout         ///< target shared memory layout
->
-struct WmmaTensorOpPolicy; 
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for row-major
-template <
-  typename WarpShape,           ///< shape of warp-level GEMM (concept: MatrixShape)
-  typename OperatorShape        ///< matrix multiply operation shape (concept: gemm::GemmShape)
->
-struct WmmaTensorOpPolicy<WarpShape, OperatorShape, layout::RowMajor> {
-
-  /// Number of operations
-  using OperatorCount = MatrixShape<
-    WarpShape::kM / OperatorShape::kM,
-    WarpShape::kN / OperatorShape::kN
-  >;
-
-  //
-  // Hard-coded constants regarding Tensor Operations
-  //
-  static int const kElementsPerAccess = 2;
-  static int const kRowsPerIteration = OperatorShape::kM;
-  static int const kWmmaFragmentsPerAccess = 1;
-
-  //
-  // Derived quantities
-  //
-
-  // Number of externally visible iterations
-  static int const kIterations = OperatorCount::kRow;
-
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace epilogue
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
-
-#endif
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/fast_math.h b/lightllm-kernel/cutlass/include/cutlass/fast_math.h
deleted file mode 100755
index fa3873c5e..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/fast_math.h
+++ /dev/null
@@ -1,1067 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#pragma once
-
-#if defined(__CUDACC_RTC__)
-#include <cuda/std/cstdint>
-#else
-#include <cstdint>
-#include <cmath>
-#include <type_traits>
-#endif
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/uint128.h"
-#include "cutlass/coord.h"
-#include "cutlass/half.h"
-
-/**
- * \file
- * \brief Math utilities
- */
-
-namespace cutlass {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename T>
-CUTLASS_HOST_DEVICE void swap(T &lhs, T &rhs) {
-  T tmp = lhs;
-  lhs = rhs;
-  rhs = tmp;
-}
-
-/******************************************************************************
- * Static math utilities
- ******************************************************************************/
-
-/// Mixed precision dot product
-template <typename Index, typename LongIndex, int N>
-CUTLASS_HOST_DEVICE LongIndex dot(
-  Coord<N, Index> const &coord,
-  Coord<N, LongIndex> const &stride,
-  LongIndex acc = LongIndex()) {
-
-  CUTLASS_PRAGMA_UNROLL
-  for (int n = 0; n < N; ++n) {
-    acc += LongIndex(coord[n]) * stride[n];
-  }
-  return acc;
-}
-
-/**
- * Statically determine if N is a power-of-two
- */
-template <int N>
-struct is_pow2 {
-  static bool const value = ((N & (N - 1)) == 0);
-};
-
-/**
- * Statically determine log2(N), rounded down
- */
-template <int N, int CurrentVal = N, int Count = 0>
-struct log2_down {
-  /// Static logarithm value
-  enum { value = log2_down<N, (CurrentVal >> 1), Count + 1>::value };
-};
-
-// Base case
-template <int N, int Count>
-struct log2_down<N, 1, Count> {
-  enum { value = Count };
-};
-
-/**
- * Statically determine log2(N), rounded up
- */
-template <int N, int CurrentVal = N, int Count = 0>
-struct log2_up {
-  /// Static logarithm value
-  enum { value = log2_up<N, (CurrentVal >> 1), Count + 1>::value };
-};
-
-// Base case
-template <int N, int Count>
-struct log2_up<N, 1, Count> {
-  enum { value = ((1 << Count) < N) ? Count + 1 : Count };
-};
-
-/**
- * Statically estimate sqrt(N) to the nearest power-of-two
- */
-template <int N>
-struct sqrt_est {
-  enum { value = 1 << (log2_up<N>::value / 2) };
-};
-
-/**
- * For performing a constant-division with a compile-time assertion that the
- * Divisor evenly-divides the Dividend.
- */
-template <int Dividend, int Divisor>
-struct divide_assert {
-  enum { value = Dividend / Divisor };
-
-  static_assert((Dividend % Divisor == 0), "Not an even multiple");
-};
-
-/******************************************************************************
- * Rounding
- ******************************************************************************/
-
-/**
- * Round dividend up to the nearest multiple of divisor
- */
-template <typename dividend_t, typename divisor_t>
-CUTLASS_HOST_DEVICE
-CUTLASS_CONSTEXPR_IF_CXX17
-dividend_t round_nearest(dividend_t dividend, divisor_t divisor) {
-  return ((dividend + divisor - 1) / divisor) * divisor;
-}
-
-template <typename value_t>
-CUTLASS_HOST_DEVICE
-CUTLASS_CONSTEXPR_IF_CXX17
-value_t abs_for_integer(value_t a) {
-  return ((a > 0) ? a : -a);
-}
-/**
- * Greatest common divisor
- */
-template <typename value_t>
-CUTLASS_HOST_DEVICE
-CUTLASS_CONSTEXPR_IF_CXX17
-value_t gcd(value_t a, value_t b) {
-  for (;;) {
-    if (a == 0) return cutlass::abs_for_integer(b);
-    b %= a;
-    if (b == 0) return cutlass::abs_for_integer(a);
-    a %= b;
-  }
-}
-
-/**
- * Least common multiple
- */
-template <typename value_t>
-CUTLASS_HOST_DEVICE
-CUTLASS_CONSTEXPR_IF_CXX17
-value_t lcm(value_t a, value_t b) {
-  value_t temp = cutlass::gcd(a, b);
-  return (temp != 0) ? value_t(cutlass::abs_for_integer(a) / temp * cutlass::abs_for_integer(b)) : value_t{};
-}
-
-/**
- * Greatest common divisor
- */
-template <typename value_t>
-CUTLASS_HOST_DEVICE
-CUTLASS_CONSTEXPR_IF_CXX17
-value_t gcd_cxx11(value_t a, value_t b) {
-  return (a == 0 || b == 0) ? cutlass::abs_for_integer(a | b) : cutlass::gcd_cxx11(b, a % b);
-}
-
-/**
- * Least common multiple
- */
-template <typename value_t>
-CUTLASS_HOST_DEVICE
-CUTLASS_CONSTEXPR_IF_CXX17
-value_t lcm_cxx11(value_t a, value_t b) {
-  return cutlass::gcd_cxx11(a, b) ? (cutlass::abs_for_integer(a) / cutlass::gcd_cxx11(a, b) *
-                                    cutlass::abs_for_integer(b))
-                                  : value_t{};
-}
-
-/// Returns the smallest value in the half-open range [a, a+b) that is a multiple of b
-CUTLASS_HOST_DEVICE
-CUTLASS_CONSTEXPR_IF_CXX17
-int round_up(int a, int b) {
-  return ((a + b - 1) / b) * b;
-}
-
-/// Returns the ceiling of (a / b)
-CUTLASS_HOST_DEVICE
-CUTLASS_CONSTEXPR_IF_CXX17
-int ceil_div(int a, int b) {
-  return (a + b - 1) / b;
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/**
- * log2 computation, what's the
- * difference between the below codes and
- * log2_up/down codes?
- */
-template <typename value_t>
-CUTLASS_HOST_DEVICE
-CUTLASS_CONSTEXPR_IF_CXX17
-value_t clz(value_t x) {
-  for (int i = 31; i >= 0; --i) {
-    if ((1 << i) & x)
-      return value_t(31 - i);
-  }
-  return value_t(32);
-}
-
-template <typename value_t>
-CUTLASS_HOST_DEVICE
-CUTLASS_CONSTEXPR_IF_CXX17
-value_t find_log2(value_t x) {
-  int a = int(31 - clz(x));
-  a += (x & (x - 1)) != 0;  // Round up, add 1 if not a power of 2.
-  return a;
-}
-
-
-/**
- * Find divisor, using find_log2
- */
-CUTLASS_HOST_DEVICE
-CUTLASS_CONSTEXPR_IF_CXX17
-void find_divisor(unsigned int& mul, unsigned int& shr, unsigned int denom) {
-  if (denom == 1) {
-    mul = 0;
-    shr = 0;
-  } else {
-    unsigned int p = 31 + find_log2(denom);
-    unsigned m = unsigned(((1ull << p) + unsigned(denom) - 1) / unsigned(denom));
-
-    mul = m;
-    shr = p - 32;
-  }
-}
-
-/**
- * Find quotient and remainder using device-side intrinsics
- */
-CUTLASS_HOST_DEVICE
-CUTLASS_CONSTEXPR_IF_CXX17
-void fast_divmod(int& quo, int& rem, int src, int div, unsigned int mul, unsigned int shr) {
-
-  #if defined(__CUDA_ARCH__)
-  // Use IMUL.HI if div != 1, else simply copy the source.
-  quo = (div != 1) ? __umulhi(src, mul) >> shr : src;
-  #else
-  quo = int((div != 1) ? int(((int64_t)src * mul) >> 32) >> shr : src);
-  #endif
-
-  // The remainder.
-  rem = src - (quo * div);
-}
-
-// For long int input
-CUTLASS_HOST_DEVICE
-CUTLASS_CONSTEXPR_IF_CXX17
-void fast_divmod(int& quo, int64_t& rem, int64_t src, int div, unsigned int mul, unsigned int shr) {
-
-  #if defined(__CUDA_ARCH__)
-  // Use IMUL.HI if div != 1, else simply copy the source.
-  quo = (div != 1) ? __umulhi(src, mul) >> shr : src;
-  #else
-  quo = int((div != 1) ? ((src * mul) >> 32) >> shr : src);
-  #endif
-  // The remainder.
-  rem = src - (quo * div);
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Object to encapsulate the fast division+modulus operation.
-///
-/// This object precomputes two values used to accelerate the computation and is best used
-/// when the divisor is a grid-invariant. In this case, it may be computed in host code and
-/// marshalled along other kernel arguments using the 'Params' pattern.
-///
-/// Example:
-///
-///
-///   int quotient, remainder, dividend, divisor;
-///
-///   FastDivmod divmod(divisor);
-///
-///   divmod(quotient, remainder, dividend);
-///
-///   // quotient = (dividend / divisor)
-///   // remainder = (dividend % divisor)
-///
-struct FastDivmod {
-  using value_div_type = int;
-  using value_mod_type = int64_t;
-  int32_t divisor = 1;
-  uint32_t multiplier = 0u;
-  uint32_t shift_right = 0u;
-
-  // Find quotient and remainder using device-side intrinsics
-  CUTLASS_HOST_DEVICE
-  void fast_divmod(int& quotient, int& remainder, int dividend) const {
-
-#if defined(__CUDA_ARCH__)
-    // Use IMUL.HI if divisor != 1, else simply copy the source.
-    quotient = (divisor != 1) ? __umulhi(dividend, multiplier) >> shift_right : dividend;
-#else
-    quotient = int((divisor != 1) ? int(((int64_t)dividend * multiplier) >> 32) >> shift_right : dividend);
-#endif
-
-    // The remainder.
-    remainder = dividend - (quotient * divisor);
-  }
-
-  /// For long int input
-  CUTLASS_HOST_DEVICE
-  void fast_divmod(int& quotient, int64_t& remainder, int64_t dividend) const {
-
-#if defined(__CUDA_ARCH__)
-    // Use IMUL.HI if divisor != 1, else simply copy the source.
-    quotient = (divisor != 1) ? __umulhi(dividend, multiplier) >> shift_right : dividend;
-#else
-    quotient = int((divisor != 1) ? ((dividend * multiplier) >> 32) >> shift_right : dividend);
-#endif
-    // The remainder.
-    remainder = dividend - (quotient * divisor);
-  }
-
-
-  /// Construct the FastDivmod object, in host code ideally.
-  ///
-  /// This precomputes some values based on the divisor and is computationally expensive.
-
-  constexpr FastDivmod() = default;
-
-  CUTLASS_HOST_DEVICE
-  FastDivmod(int divisor_): divisor(divisor_) {
-    assert(divisor_ >= 0);
-    if (divisor != 1) {
-      unsigned int p = 31 + find_log2(divisor);
-      unsigned m = unsigned(((1ull << p) + unsigned(divisor) - 1) / unsigned(divisor));
-
-      multiplier = m;
-      shift_right = p - 32;
-    }
-  }
-
-  /// Computes integer division and modulus using precomputed values. This is computationally
-  /// inexpensive.
-  CUTLASS_HOST_DEVICE
-  void operator()(int &quotient, int &remainder, int dividend) const {
-    fast_divmod(quotient, remainder, dividend);
-  }
-
-  /// Computes integer division using precomputed values. This is computationally
-  /// inexpensive.
-  CUTLASS_HOST_DEVICE
-  int div(int dividend) const {
-    int quotient, remainder;
-    fast_divmod(quotient, remainder, dividend);
-    return quotient;
-  }
-
-  /// Alias for `div` to match the interface of FastDivmodU64
-  CUTLASS_HOST_DEVICE
-  int divide(int dividend) const {
-    return div(dividend);
-  }
-
-  /// Computes integer division and modulus using precomputed values. This is computationally
-  /// inexpensive.
-  ///
-  /// Simply returns the quotient
-  CUTLASS_HOST_DEVICE
-  int divmod(int &remainder, int dividend) const {
-    int quotient;
-    fast_divmod(quotient, remainder, dividend);
-    return quotient;
-  }
-
-  /// Computes integer division and modulus using precomputed values. This is computationally
-  /// inexpensive.
-  CUTLASS_HOST_DEVICE
-  void operator()(int &quotient, int64_t &remainder, int64_t dividend) const {
-    fast_divmod(quotient, remainder, dividend);
-  }
-
-  /// Computes integer division and modulus using precomputed values. This is computationally
-  /// inexpensive.
-  CUTLASS_HOST_DEVICE
-  int divmod(int64_t &remainder, int64_t dividend) const {
-    int quotient;
-    fast_divmod(quotient, remainder, dividend);
-    return quotient;
-  }
-
-  /// Returns the divisor when cast to integer
-  CUTLASS_HOST_DEVICE
-  operator int() const { return divisor; }
-
-};
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Object to encapsulate the fast division+modulus operation for 64b integer division.
-///
-/// This object precomputes two values used to accelerate the computation and is best used
-/// when the divisor is a grid-invariant. In this case, it may be computed in host code and
-/// marshalled along other kernel arguments using the 'Params' pattern.
-///
-/// Example:
-///
-///
-///   uint64_t quotient, remainder, dividend, divisor;
-///
-///   FastDivmodU64 divmod(divisor);
-///
-///   divmod(quotient, remainder, dividend);
-///
-///   // quotient = (dividend / divisor)
-///   // remainder = (dividend % divisor)
-///
-struct FastDivmodU64 {
-
-  uint64_t divisor;
-  uint64_t multiplier;
-  unsigned int shift_right;
-  unsigned int round_up;
-
-  //
-  // Static methods
-  //
-
-  /// Computes b, where 2^b is the greatest power of two that is less than or equal to x
-  CUTLASS_HOST_DEVICE
-  static uint32_t integer_log2(uint64_t x) {
-    uint32_t n = 0;
-    while (x >>= 1) {
-      ++n;
-    }
-    return n;
-  }
-
-  /// Default ctor
-  CUTLASS_HOST_DEVICE
-  FastDivmodU64(): divisor(0), multiplier(0), shift_right(0), round_up(0) { }
-
-  /// Construct the FastDivmod object, in host code ideally.
-  ///
-  /// This precomputes some values based on the divisor and is computationally expensive.
-  CUTLASS_HOST_DEVICE
-  FastDivmodU64(uint64_t divisor_): divisor(divisor_), multiplier(1), shift_right(0), round_up(0) {
-
-    if (divisor) {
-      shift_right = integer_log2(divisor);
-
-      if ((divisor & (divisor - 1)) == 0) {
-        multiplier = 0;
-      }
-      else {
-        uint64_t power_of_two = (uint64_t(1) << shift_right);
-        uint64_t multiplier_lo = uint128_t(0, power_of_two) / divisor;
-        multiplier = uint128_t(power_of_two, power_of_two) / divisor;
-        round_up = (multiplier_lo == multiplier ? 1 : 0);
-      }
-    }
-  }
-
-  /// Returns the quotient of floor(dividend / divisor)
-  CUTLASS_HOST_DEVICE
-  uint64_t divide(uint64_t dividend) const {
-    uint64_t quotient = 0;
-
-    #ifdef __CUDA_ARCH__
-      uint64_t x = dividend;
-      if (multiplier) {
-        x = __umul64hi(dividend + round_up, multiplier);
-      }
-      quotient = (x >> shift_right);
-    #else
-      quotient = dividend / divisor;
-    #endif
-
-    return quotient;
-  }
-
-  /// Computes the remainder given a computed quotient and dividend
-  CUTLASS_HOST_DEVICE
-  uint64_t modulus(uint64_t quotient, uint64_t dividend) const {
-    return dividend - quotient * divisor;
-  }
-
-  /// Returns the quotient of floor(dividend / divisor) and computes the remainder
-  CUTLASS_HOST_DEVICE
-  uint64_t divmod(uint64_t &remainder, uint64_t dividend) const {
-    uint64_t quotient = divide(dividend);
-    remainder = modulus(quotient, dividend);
-    return quotient;
-  }
-
-  /// Computes integer division and modulus using precomputed values. This is computationally
-  /// inexpensive.
-  CUTLASS_HOST_DEVICE
-  void operator()(uint64_t &quotient, uint64_t &remainder, uint64_t dividend) const {
-    quotient = divmod(remainder, dividend);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Object to encapsulate the fast division+modulus operation for 64b integer division
-/// in which the divisor is a power of two.
-struct FastDivmodU64Pow2 {
-
-  uint64_t divisor;
-  unsigned int shift_right;
-
-  /// Default ctor
-  CUTLASS_HOST_DEVICE
-  FastDivmodU64Pow2(): divisor(0), shift_right(0) { }
-
-  /// Construct the FastDivmod object, in host code ideally.
-  ///
-  /// This precomputes some values based on the divisor and is computationally expensive.
-  CUTLASS_HOST_DEVICE
-  FastDivmodU64Pow2(uint64_t divisor_): divisor(divisor_), shift_right(FastDivmodU64::integer_log2(divisor_)) { }
-
-  /// Returns the quotient of floor(dividend / divisor)
-  CUTLASS_HOST_DEVICE
-  uint64_t divide(uint64_t dividend) const {
-    return dividend >> shift_right;
-  }
-
-  /// Computes the remainder given a computed quotient and dividend
-  CUTLASS_HOST_DEVICE
-  uint64_t modulus(uint64_t dividend) const {
-    // See https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#division-modulo-operations
-    return dividend & (divisor - 1);
-  }
-
-  /// Returns the quotient of floor(dividend / divisor) and computes the remainder
-  CUTLASS_HOST_DEVICE
-  uint64_t divmod(uint64_t &remainder, uint64_t dividend) const {
-    uint64_t quotient = divide(dividend);
-    remainder = modulus(dividend);
-    return quotient;
-  }
-
-  /// Computes integer division and modulus using precomputed values. This is computationally
-  /// inexpensive.
-  CUTLASS_HOST_DEVICE
-  void operator()(uint64_t &quotient, uint64_t &remainder, uint64_t dividend) const {
-    quotient = divmod(remainder, dividend);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Computes the coordinate decomposition from a linear index (64-bit linear index => coord<int32_t>)
-///
-/// This decomposition is accelerated by the FastDivmodU64 object. It is assumed that
-/// a coordinate of <Rank> indices can be decomposed by <Rank - 1> div/mod operations.
-/// Note, is assumed that element divmod[0] divides by extent[1].
-///
-/// For example, assume 4-D coordinate (n, p, q, c) is mapped to a linear index `npqc`. This
-/// can be decomposed via three divide and modulus operations:
-///
-///      c = npqc % C;         |  divmod[2] = FastDivmodU64(C)
-///    npq = npqc / C;         |   coord[3] = c
-///
-///      q =  npq % Q;         |  divmod[1] = FastDivmodU64(Q)
-///     np =  npq / Q;         |   coord[2] = q
-///
-///      p =   np % P;         |  divmod[0] = FastDivmodU64(P)
-///      n =   np / P;         |   coord[1] = p
-///
-///                            |   coord[0] = n
-///
-template <int Rank>
-CUTLASS_HOST_DEVICE Coord<Rank> CoordinateDecomposition(
-  uint64_t linear_idx,                    ///< Linear index to decompose
-  FastDivmodU64 const *divmod) {          ///< Pointer to array of Rank-1 FastDivmodU64 objects
-
-  static_assert(Rank > 0, "CoordinateDecomposition requires Rank=1 or greater.");
-
-  Coord<Rank> coord;
-
-  CUTLASS_PRAGMA_UNROLL
-  for (int i = Rank; i > 1; --i) {
-    uint64_t remainder;
-    linear_idx = divmod[i - 2].divmod(remainder, linear_idx);
-    coord[i - 1] = int(remainder);
-  }
-
-  coord[0] = int(linear_idx);
-
-  return coord;
-}
-
-/// Computes the coordinate decomposition from a linear index (32-bit linear index => coord<int32_t>)
-template <int Rank>
-CUTLASS_HOST_DEVICE Coord<Rank> CoordinateDecomposition(
-  int linear_idx,                    ///< Linear index to decompose
-  FastDivmod const *divmod) {          ///< Pointer to array of Rank-1 FastDivmodU64 objects
-
-  static_assert(Rank > 0, "CoordinateDecomposition requires Rank=1 or greater.");
-
-  Coord<Rank> coord;
-
-  CUTLASS_PRAGMA_UNROLL
-  for (int i = Rank; i > 1; --i) {
-    int remainder;
-    linear_idx = divmod[i - 2].divmod(remainder, linear_idx);
-    coord[i - 1] = int(remainder);
-  }
-
-  coord[0] = int(linear_idx);
-
-  return coord;
-}
-
-template <int Rank>
-CUTLASS_HOST_DEVICE Coord<Rank> CoordinateDecompositionLittleEndian(
-  uint64_t linear_idx,                    ///< Linear index to decompose
-  FastDivmodU64 const *divmod) {          ///< Pointer to array of Rank-1 FastDivmodU64 objects
-
-  static_assert(Rank > 0, "CoordinateDecomposition requires Rank=1 or greater.");
-
-  Coord<Rank> coord;
-
-  CUTLASS_PRAGMA_UNROLL
-  for (int i = 0; i < Rank - 1; ++i) {
-    uint64_t remainder;
-    linear_idx = divmod[i].divmod(remainder, linear_idx);
-    coord[i] = int(remainder);
-  }
-
-  coord[Rank - 1] = int(linear_idx);
-
-  return coord;
-}
-
-/// Computes the coordinate decomposition from a linear index (32-bit linear index => coord<int32_t>)
-template <int Rank>
-CUTLASS_HOST_DEVICE Coord<Rank> CoordinateDecompositionLittleEndian(
-  int linear_idx,                    ///< Linear index to decompose
-  FastDivmod const *divmod) {          ///< Pointer to array of Rank-1 FastDivmodU64 objects
-
-  static_assert(Rank > 0, "CoordinateDecomposition requires Rank=1 or greater.");
-
-  Coord<Rank> coord;
-
-  CUTLASS_PRAGMA_UNROLL
-  for (int i = 0; i < Rank - 1; ++i) {
-    int remainder;
-    linear_idx = divmod[i].divmod(remainder, linear_idx);
-    coord[i] = int(remainder);
-  }
-
-  coord[Rank - 1] = int(linear_idx);
-
-  return coord;
-}
-
-/// Safely computes the offset of a linear index in bytes for all types
-template <typename Element>
-CUTLASS_HOST_DEVICE int64_t OffsetBytes(int64_t index) {
-
-  static_assert(
-    (sizeof_bits<Element>::value >= 8 && !(sizeof_bits<Element>::value % 8)) ||
-    (sizeof_bits<Element>::value <  8 && !(8 % sizeof_bits<Element>::value)),
-    "Size of numeric type in bits must either be divisible by 8 bits, or 8 bits must be divisible by the size.");
-
-  if (sizeof_bits<Element>::value >= 8) {
-    return index * (sizeof_bits<Element>::value / 8);
-  }
-  else {
-    int const kElementsPerByte = ((8 / sizeof_bits<Element>::value) + ((sizeof_bits<Element>::value >= 8) ? 1 : 0));
-    return index / kElementsPerByte;
-  }
-}
-
-CUTLASS_HOST_DEVICE int64_t OffsetBytes(int64_t index, int64_t element_sizeof_bits) {
-  if (element_sizeof_bits >= 8) {
-    return index * (element_sizeof_bits / 8);
-  }
-  else {
-    int64_t const kElementsPerByte = ((8 / element_sizeof_bits) + ((element_sizeof_bits >= 8) ? 1 : 0));
-    return index / kElementsPerByte;
-  }
-}
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Min/Max
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <int A, int B>
-struct Min {
-  static int const kValue = (A < B) ? A : B;
-};
-
-template <int A, int B>
-struct Max {
-  static int const kValue = (A > B) ? A : B;
-};
-
-CUTLASS_HOST_DEVICE
-CUTLASS_CONSTEXPR_IF_CXX17 int const_min(int a, int b) {
-    return (b < a ? b : a);
-}
-
-CUTLASS_HOST_DEVICE
-CUTLASS_CONSTEXPR_IF_CXX17 int const_max(int a, int b) {
-    return (b > a ? b : a);
-}
-
-template <typename T>
-CUTLASS_HOST_DEVICE
-T fast_min(T a, T b) {
-  return (b < a ? b : a);
-}
-
-template <>
-CUTLASS_HOST_DEVICE
-float fast_min(float a, float b) {
-  return fminf(a, b);
-}
-
-template <typename T>
-CUTLASS_HOST_DEVICE
-T fast_max(T a, T b) {
-  return (a < b ? b : a);
-}
-
-template <>
-CUTLASS_HOST_DEVICE
-float fast_max(float a, float b) {
-  return fmaxf(a, b);
-}
-
-CUTLASS_HOST_DEVICE
-float fast_cos(float theta) {
-  #if defined(__CUDA_ARCH__)
-  return ::cosf(theta);
-  #else
-  return std::cos(theta);
-  #endif
-}
-
-CUTLASS_HOST_DEVICE
-double fast_cos(double theta) {
-  #if defined(__CUDA_ARCH__)
-  return ::cos(theta);
-  #else
-  return std::cos(theta);
-  #endif
-}
-
-CUTLASS_HOST_DEVICE
-float fast_sin(float theta) {
-  #if defined(__CUDA_ARCH__)
-  return ::sinf(theta);
-  #else
-  return std::sin(theta);
-  #endif
-}
-
-CUTLASS_HOST_DEVICE
-double fast_sin(double theta) {
-  #if defined(__CUDA_ARCH__)
-  return ::sin(theta);
-  #else
-  return std::sin(theta);
-  #endif
-}
-
-CUTLASS_HOST_DEVICE
-float fast_acos(float theta) {
-  #if defined(__CUDA_ARCH__)
-  return ::acosf(theta);
-  #else
-  return std::acos(theta);
-  #endif
-}
-
-CUTLASS_HOST_DEVICE
-double fast_acos(double theta) {
-  #if defined(__CUDA_ARCH__)
-  return ::acos(theta);
-  #else
-  return std::acos(theta);
-  #endif
-}
-
-CUTLASS_HOST_DEVICE
-float fast_asin(float theta) {
-  #if defined(__CUDA_ARCH__)
-  return ::asinf(theta);
-  #else
-  return std::asin(theta);
-  #endif
-}
-
-CUTLASS_HOST_DEVICE
-double fast_asin(double theta) {
-  #if defined(__CUDA_ARCH__)
-  return ::asin(theta);
-  #else
-  return std::asin(theta);
-  #endif
-}
-
-CUTLASS_HOST_DEVICE
-float fast_sqrt(float theta) {
-  #if defined(__CUDA_ARCH__)
-  return ::sqrtf(theta);
-  #else
-  return std::sqrt(theta);
-  #endif
-}
-
-CUTLASS_HOST_DEVICE
-double fast_sqrt(double theta) {
-  #if defined(__CUDA_ARCH__)
-  return ::sqrt(theta);
-  #else
-  return std::sqrt(theta);
-  #endif
-}
-
-CUTLASS_HOST_DEVICE
-float fast_exp(float x) {
-  #if defined(__CUDA_ARCH__)
-  return ::expf(x);
-  #else
-  return std::exp(x);
-  #endif
-}
-
-CUTLASS_HOST_DEVICE
-double fast_exp(double x) {
-  #if defined(__CUDA_ARCH__)
-  return ::exp(x);
-  #else
-  return std::exp(x);
-  #endif
-}
-
-CUTLASS_HOST_DEVICE
-half_t fast_exp(half_t x) {
-  #if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 10) && (__CUDA_ARCH__ >= 750)
-      return (half_t)(::hexp(x.to_half()));
-  #else
-      return (half_t)(fast_exp(float(x)));
-  #endif
-}
-
-CUTLASS_HOST_DEVICE
-float fast_log(float x) {
-  #if defined(__CUDA_ARCH__)
-  return ::logf(x);
-  #else
-  return std::log(x);
-  #endif
-}
-
-CUTLASS_HOST_DEVICE
-double fast_log(double x) {
-  #if defined(__CUDA_ARCH__)
-  return ::log(x);
-  #else
-  return std::log(x);
-  #endif
-}
-
-CUTLASS_HOST_DEVICE
-float fast_tanh(float x) {
-  #if defined(__CUDA_ARCH__)
-    #if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDA_ARCH__ >= 750)
-      float y;
-      asm volatile ( "tanh.approx.f32 %0, %1; " : "=f"(y) : "f"(x));
-      return y;
-    #else
-      return ::tanhf(x);
-    #endif
-  #else
-  return std::tanh(x);
-  #endif
-}
-
-CUTLASS_HOST_DEVICE
-double fast_tanh(double x) {
-  #if defined(__CUDA_ARCH__)
-  return ::tanh(x);
-  #else
-  return std::tanh(x);
-  #endif
-}
-
-CUTLASS_HOST_DEVICE
-half_t fast_tanh(half_t x) {
-  #if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 11) && (__CUDA_ARCH__ >= 750)
-
-  asm volatile ( "tanh.approx.f16 %0, %1;" : "=h"(x.raw()) : "h"(x.raw()));
-  return x;
-
-  #else
-  return half_t(fast_tanh(float(x)));
-  #endif
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename T>
-struct fast_exp_op {
-  CUTLASS_HOST_DEVICE
-  T operator()(T const &rhs) const {
-    return fast_exp(rhs);
-  }
-};
-
-#if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 10) && (__CUDA_ARCH__ >= 750)
-template <int N>
-struct fast_exp_op<Array<half_t, N>> {
-  CUTLASS_DEVICE
-  Array<half_t, N> operator()(Array<half_t, N> const &rhs) const {
-
-    Array<half_t, N> result;
-
-    // use x2 specialization
-    __half2 const *in  = reinterpret_cast<__half2 const *>(&rhs);
-    __half2 *out = reinterpret_cast<__half2 *>(&result);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 2; ++i) {
-      out[i] = ::h2exp(in[i]);
-    }
-
-    // residual
-    if (N % 2) {
-      half_t last = rhs[N - 1];
-      result[N - 1] = half_t(::hexp(last.to_half()));
-    }
-
-    return result;
-  }
-};
-#endif // #if defined(__CUDA_ARCH__)
-
-template <typename T, int N>
-struct fast_exp_op<Array<T, N>> {
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const &rhs) const {
-
-    fast_exp_op<T> fast_op;
-    Array<T, N> y;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      y[i] = fast_op(rhs[i]);
-    }
-
-    return y;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename T>
-struct fast_tanh_op {
-  CUTLASS_HOST_DEVICE
-  T operator()(T const &rhs) const {
-    return fast_tanh(rhs);
-  }
-};
-
-#if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 11) && (__CUDA_ARCH__ >= 750)
-template <int N>
-struct fast_tanh_op<Array<half_t, N>> {
-  CUTLASS_DEVICE
-  Array<half_t, N> operator()(Array<half_t, N> const &rhs) const {
-
-    Array<half_t, N> result;
-
-    // use x2 specialization
-    uint32_t const *in  = reinterpret_cast<uint32_t const *>(&rhs);
-    uint32_t *out = reinterpret_cast<uint32_t *>(&result);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 2; ++i) {
-      asm volatile ("tanh.approx.f16x2 %0, %1;" : "=r"(out[i]) : "r"(in[i]));
-    }
-
-    // residual
-    if (N % 2) {
-      uint16_t const *in = reinterpret_cast<uint16_t const *>(&rhs);
-      uint16_t *out = reinterpret_cast<uint16_t *>(&result);
-      asm volatile ("tanh.approx.f16 %0, %1;" : "=h"(out[N - 1]) : "h"(in[N - 1]));
-    }
-
-    return result;
-  }
-};
-#endif // #if defined(__CUDA_ARCH__)
-
-template <typename T, int N>
-struct fast_tanh_op<Array<T, N>> {
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const &rhs) const {
-
-    fast_tanh_op<T> fast_op;
-    Array<T, N> y;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      y[i] = fast_op(rhs[i]);
-    }
-
-    return y;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Absolute value function
-template <typename T>
-CUTLASS_HOST_DEVICE
-T absolute_value(T x) {
-  if (x < T()) {
-    return -x;
-  }
-  return x;
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/float8.h b/lightllm-kernel/cutlass/include/cutlass/float8.h
deleted file mode 100755
index 38ea4008c..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/float8.h
+++ /dev/null
@@ -1,1284 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*!
-    \file
-    \brief Defines a class for using IEEE half-precision floating-point types in host or
-      device code.
-*/
-
-#pragma once
-
-// FP8 types are available starting CUDA 11.8+
-#if (__CUDACC_VER_MAJOR__ >= 12) || ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 8))
-#define CUDA_FP8_ENABLED 1
-#endif
-
-#if defined(__CUDA_ARCH__)
-#  if (__CUDA_ARCH__ >= 900)
-#    if (__CUDACC_VER_MAJOR__ >= 12) || ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 8))
-#      define CUDA_PTX_FP8_CVT_ENABLED 1
-#    endif // (__CUDACC_VER_MAJOR__ >= 12) || ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 8))
-#  elif (__CUDA_ARCH__ == 890)
-#    if (__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 1))
-#      define CUDA_PTX_FP8_CVT_ENABLED 1
-#    endif // (__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 1))
-#  endif // (__CUDA_ARCH__ >= 900)
-#endif // defined(__CUDA_ARCH__)
-
-#ifdef __GNUC__
-// Ignore checks on reinterpret-casts that are being used for bitcasts.
-#pragma GCC diagnostic ignored "-Wstrict-aliasing"
-#endif
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-#if defined(__CUDACC_RTC__)
-
-#include "cutlass/floating_point_nvrtc.h"
-
-#else
-//
-// Standard Library headers belong here to avoid conflicts with NVRTC.
-//
-#include <cmath>
-#include <limits>
-#include <cstdint>
-#include <cstring>
-#endif
-
-#ifdef CUDA_FP8_ENABLED
-#include <cuda_fp8.h>
-#endif
-#include <cuda_fp16.h>
-
-#include "cutlass/cutlass.h"
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-//
-//  FP8 Has 2 encodings possible : E4M3 and E5M2
-//
-//  E4M3 : 7  |  6 5 4 3  |  2 1 0
-//  E5M2 : 7  |  6 5 4 3 2  |  1 0
-//
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-enum class FloatEncoding {
-    E4M3,
-    E5M2
-};
-
-template<FloatEncoding T>
-struct alignas(1) float8_base {
-
-    static constexpr bool IS_E4M3 = (T == FloatEncoding::E4M3);
-    static constexpr bool IS_E5M2 = (T == FloatEncoding::E5M2);
-
-    // Number of Bits representing mantissa and exponents
-    static constexpr int FP32_NUM_BITS = 32;
-    static constexpr int FP32_NUM_EXPONENT_BITS = 8;
-    static constexpr int FP32_NUM_MANTISSA_BITS = 23;
-    static constexpr uint32_t FP32_NAN = 0x7fffffff;
-    static constexpr uint32_t FP32_INFINITY_MASK = 0x7f800000;
-    static constexpr int FP32_MAX_EXPONENT  =  127;
-    static constexpr int FP32_MIN_EXPONENT  = -126;
-    static constexpr int FP32_EXPONENT_BIAS =  127;
-
-    static constexpr int FP16_NUM_BITS = 16;
-    static constexpr int FP16_NUM_EXPONENT_BITS = 5;
-    static constexpr int FP16_NUM_MANTISSA_BITS = 10;
-    static constexpr uint16_t FP16_NAN = 0x7fff;
-    static constexpr uint16_t FP16_INFINITY_MASK = 0x7c00;
-    static constexpr int FP16_MAX_EXPONENT  = 15;
-    static constexpr int FP16_MIN_EXPONENT  = -14;
-    static constexpr int FP16_EXPONENT_BIAS = 15;
-
-    static constexpr int FP8_NUM_BITS = 8;
-    static constexpr int FP8_NUM_EXPONENT_BITS = IS_E4M3 ? 4 : 5;
-    static constexpr int FP8_NUM_MANTISSA_BITS = IS_E4M3 ? 3 : 2;
-    static constexpr uint8_t  FP8_NAN = 0x7f; // Also F8_INF
-    static constexpr uint8_t  FP8_INFINITY_MASK = IS_E4M3 ? 0x78 : 0x7c;
-    static constexpr int FP8_MAX_EXPONENT  = IS_E4M3 ?  7 :  15;
-    static constexpr int FP8_MIN_EXPONENT  = IS_E4M3 ? -6 : -14;
-    static constexpr int FP8_EXPONENT_BIAS = IS_E4M3 ?  7 :  15;
-
-    static constexpr uint8_t  FP8_EXPONENT_MASK = (1 << FP8_NUM_EXPONENT_BITS) - 1;
-    static constexpr uint8_t  FP8_MANTISSA_MASK = (1 << FP8_NUM_MANTISSA_BITS) - 1;
-
-    static constexpr uint8_t FP8_MAX_FLT = (IS_E4M3 ? 0x7e : 0x7b);
-
-    // 256 in float
-    static constexpr uint32_t FP8_SAT_VAL_FP32 = 0x43800000;
-
-    //
-    // Data members
-    //
-
-    /// Data container
-    uint8_t storage;
-
-    /// Ctors.
-    CUTLASS_HOST_DEVICE
-    float8_base() : storage(0) { }
-
-    /// Is finite implementation
-    CUTLASS_HOST_DEVICE
-    static bool isfinite(float flt) {
-        uint32_t s;
-
-        #if defined(__CUDA_ARCH__)
-        s = reinterpret_cast<uint32_t const &>(flt);
-        #else
-        std::memcpy(&s, &flt, sizeof(s));
-        #endif
-
-        return (s & 0x7f800000) < 0x7f800000;
-    }
-
-    /// Is NaN implementation
-    CUTLASS_HOST_DEVICE
-    static bool isnan(float flt) {
-        uint32_t s;
-
-        #if defined(__CUDA_ARCH__)
-        s = reinterpret_cast<uint32_t const &>(flt);
-        #else
-        std::memcpy(&s, &flt, sizeof(s));
-        #endif
-
-        return (s & 0x7fffffff) > 0x7f800000;
-    }
-
-    /// Is infinite implementation
-    CUTLASS_HOST_DEVICE
-    static bool isinf(float flt) {
-        uint32_t s;
-
-        #if defined(__CUDA_ARCH__)
-        s = reinterpret_cast<uint32_t const &>(flt);
-        #else
-        std::memcpy(&s, &flt, sizeof(s));
-        #endif
-
-        // Sign = 0 for +inf, 1 for -inf
-        // Exponent = all ones
-        // Mantissa = all zeros
-        return (s == 0x7f800000) || (s == 0xff800000);
-    }
-
-    /// FP32 -> FP8 conversion - rounds to nearest even
-    CUTLASS_HOST_DEVICE
-    static uint8_t convert_float_to_fp8(float const& flt) {
-
-        // software implementation rounds toward nearest even
-        uint32_t s;
-
-        #if defined(__CUDA_ARCH__)
-        s = reinterpret_cast<uint32_t const &>(flt);
-        #else
-        std::memcpy(&s, &flt, sizeof(s));
-        #endif
-
-        // Extract the bits in the FP32 type
-        uint8_t sign = uint8_t((s >> 24 & 0x80));
-        int32_t exp = int32_t((s >> FP32_NUM_MANTISSA_BITS) & 0xff) - FP32_EXPONENT_BIAS;
-        int mantissa = s & 0x7fffff;
-        uint8_t u = 0;
-
-        uint8_t const kF8_NaN = 0x7f;
-
-        // NaN => NaN
-        if (isnan(flt)) {
-            return kF8_NaN;
-        }
-
-        // Inf => MAX_FLT (satfinite)
-        if (isinf(flt)) {
-            return sign | FP8_MAX_FLT;
-        }
-
-        // Special handling
-        if (exp == -128) {
-            // int8 range is from -128 to 127
-            // So 255(inf) - 127(bias) = 128 - will show up as -128
-
-            // satfinite
-            return (sign | FP8_MAX_FLT);
-        }
-
-        int sticky_bit = 0;
-
-        bool skip_sign = false;
-        bool may_be_nan = false;
-
-        if ( (exp >= FP8_MIN_EXPONENT) && (exp <= FP8_MAX_EXPONENT) ) {
-            // normal fp32 to normal fp8
-            exp = exp + FP8_EXPONENT_BIAS;
-            u = uint8_t((uint32_t(exp) & FP8_EXPONENT_MASK) << FP8_NUM_MANTISSA_BITS);
-            u = uint8_t(u | (mantissa >> (FP32_NUM_MANTISSA_BITS - FP8_NUM_MANTISSA_BITS)));
-        } else if(exp < FP8_MIN_EXPONENT) {
-            // normal single-precision to subnormal float8-precision representation
-            int rshift = (FP8_MIN_EXPONENT - exp);
-            if (rshift < FP32_NUM_BITS) {
-                mantissa |= (1 << FP32_NUM_MANTISSA_BITS);
-
-                sticky_bit = ((mantissa & ((1 << rshift) - 1)) != 0);
-
-                mantissa = (mantissa >> rshift);
-                u = (uint8_t(mantissa >> (FP32_NUM_MANTISSA_BITS- FP8_NUM_MANTISSA_BITS)) & FP8_MANTISSA_MASK);
-            } else {
-                mantissa = 0;
-                u = 0;
-            }
-        // Exponent > FP8_MAX_EXPONENT - this is a special case done to match HW
-        // 0x4380_0000 to 0x43e0_0000 - maps from 256 to 448, and does not saturate / inf.
-        } else {
-            if( exp == (FP8_MAX_EXPONENT + 1) ) {
-                uint8_t mantissa_tmp = uint8_t(mantissa >> (FP32_NUM_MANTISSA_BITS - FP8_NUM_MANTISSA_BITS));
-                if( mantissa_tmp < FP8_MANTISSA_MASK) {
-                    exp = exp + FP8_EXPONENT_BIAS;
-                    u = uint8_t(uint32_t(exp) << FP8_NUM_MANTISSA_BITS) | mantissa_tmp;
-                    may_be_nan =  (mantissa_tmp == (FP8_MANTISSA_MASK-1));
-                } else {
-                    // satfinite
-                    return (sign | FP8_MAX_FLT);
-                }
-            } else{
-                // satfinite
-                return (sign | FP8_MAX_FLT);
-            }
-        }
-
-        // round to nearest even
-        int NUM_BITS_SHIFT = FP32_NUM_MANTISSA_BITS - (FP8_NUM_MANTISSA_BITS + 1);
-        int round_bit = ((mantissa >> NUM_BITS_SHIFT) & 1);
-        sticky_bit |= ((mantissa & ((1 << NUM_BITS_SHIFT) - 1)) != 0);
-
-        if ((round_bit && sticky_bit) || (round_bit && (u & 1))) {
-            u = uint8_t(u + 1);
-            if( may_be_nan ) {
-                skip_sign = true;
-            }
-        }
-
-        if (u > FP8_MAX_FLT) {
-            // satfinite
-            u = (sign | FP8_MAX_FLT);
-        }
-
-        if( ! skip_sign ) {
-            u |= sign;
-        }
-
-        return u;
-    }
-
-
-    /// Converts a fp8 value stored as a uint8_t to a float
-    CUTLASS_HOST_DEVICE
-    static float convert_fp8_to_float(uint8_t const& x) {
-
-        uint32_t constexpr kF32_NaN = 0x7fffffff;
-
-        uint8_t const &f8 = x;
-        uint32_t sign = (f8 >> (FP8_NUM_BITS - 1)) & 1;
-        uint32_t exp = (f8 >> FP8_NUM_MANTISSA_BITS) & FP8_EXPONENT_MASK;
-        uint32_t mantissa = f8 & FP8_MANTISSA_MASK;
-        unsigned f = (sign << (FP32_NUM_BITS-1));
-
-        if (IS_E4M3 && exp == 15 && mantissa == 0x7) {
-            f = kF32_NaN;
-        }
-        else if (exp > 0 && (IS_E4M3 || exp < (FP8_MAX_EXPONENT + FP8_EXPONENT_BIAS + 1))) {
-            // normal
-            exp += (FP32_EXPONENT_BIAS - FP8_EXPONENT_BIAS);
-            f = f |
-                (exp << FP32_NUM_MANTISSA_BITS) |
-                (mantissa << (FP32_NUM_MANTISSA_BITS-FP8_NUM_MANTISSA_BITS));
-        } else if (exp == 0) {
-            if (mantissa) {
-                // subnormal
-                exp += (FP32_EXPONENT_BIAS - FP8_EXPONENT_BIAS) + 1;
-                while ((mantissa & (1 << FP8_NUM_MANTISSA_BITS)) == 0) {
-                    mantissa <<= 1;
-                    exp--;
-                }
-                mantissa &= FP8_MANTISSA_MASK;
-                f = f |
-                    (exp << FP32_NUM_MANTISSA_BITS) |
-                    (mantissa << (FP32_NUM_MANTISSA_BITS-FP8_NUM_MANTISSA_BITS));
-            } else {
-                // sign-preserving zero
-            }
-        } else {
-            if(mantissa == 0){
-                // Sign-preserving infinity
-                f = (f | 0x7f800000);
-            } else {
-                // Canonical NaN
-                f = kF32_NaN;
-            }
-        }
-
-        #if defined(__CUDA_ARCH__)
-        return reinterpret_cast<float const&>(f);
-        #else
-        float flt;
-        std::memcpy(&flt, &f, sizeof(flt));
-        return flt;
-        #endif
-    }
-};
-
-
-// Forward declaration of float_e5m2_t to define float_e4m3_t <=> float_e5m2_t
-// conversions in class float_e4m3_t
-struct float_e5m2_t;
-
-
-///////////////////////////////////////////////////////////////
-///
-/// floating-point 8 type : E4M3
-///
-///////////////////////////////////////////////////////////////
-struct alignas(1) float_e4m3_t : float8_base<FloatEncoding::E4M3> {
-
-    using Base = float8_base<FloatEncoding::E4M3>;
-
-    static constexpr int MAX_EXPONENT = Base::FP8_MAX_EXPONENT;
-
-    //
-    // Static conversion operators
-    //
-
-    /// Constructs from an uint8_t
-    CUTLASS_HOST_DEVICE
-    static float_e4m3_t bitcast(uint8_t x) {
-        float_e4m3_t f;
-        f.storage = x;
-        return f;
-    }
-
-    /// FP32 -> FP8 conversion - rounds to nearest even
-    CUTLASS_HOST_DEVICE
-    static float_e4m3_t from_float(float const& flt) {
-    #if defined(CUDA_PTX_FP8_CVT_ENABLED)
-        uint16_t tmp;
-        float y = float();
-        asm volatile("cvt.rn.satfinite.e4m3x2.f32 %0, %1, %2;" : "=h"(tmp) : "f"(y), "f"(flt));
-
-        return *reinterpret_cast<float_e4m3_t *>(&tmp);
-    #else
-        return bitcast(Base::convert_float_to_fp8(flt));
-    #endif
-    }
-
-    /// FP16 -> E5M2 conversion - rounds to nearest even
-    CUTLASS_HOST_DEVICE
-    static float_e4m3_t from_half(half const& flt) {
-    #if defined(CUDA_PTX_FP8_CVT_ENABLED)
-        uint16_t tmp = 0;
-        uint32_t bits = reinterpret_cast<uint16_t const &>(flt);
-        asm volatile("cvt.rn.satfinite.e4m3x2.f16x2 %0, %1;" : "=h"(tmp) : "r"(bits));
-
-        return *reinterpret_cast<float_e4m3_t *>(&tmp);
-    #else
-        return bitcast(Base::convert_float_to_fp8(__half2float(flt)));
-    #endif
-    }
-
-    // E4M3 -> half
-    CUTLASS_HOST_DEVICE
-    static half to_half(float_e4m3_t const& x) {
-    #if defined(CUDA_PTX_FP8_CVT_ENABLED)
-        uint16_t bits = x.storage;
-        uint32_t packed;
-        asm volatile("cvt.rn.f16x2.e4m3x2 %0, %1;\n" : "=r"(packed) : "h"(bits));
-
-        return reinterpret_cast<half2 const &>(packed).x;
-    #else
-        return __float2half(Base::convert_fp8_to_float(x.storage));
-    #endif
-    }
-
-    // E4M3 -> Float
-    CUTLASS_HOST_DEVICE
-    static float to_float(float_e4m3_t const& x) {
-    #if defined(CUDA_PTX_FP8_CVT_ENABLED)
-        uint16_t bits = x.storage;
-        uint32_t packed;
-        asm volatile("cvt.rn.f16x2.e4m3x2 %0, %1;\n" : "=r"(packed) : "h"(bits));
-
-        return __half2float(reinterpret_cast<half2 const &>(packed).x);
-    #else
-        return Base::convert_fp8_to_float(x.storage);
-    #endif
-    }
-
-    //
-    // Methods
-    //
-
-    /// Constructor inheritance
-    using Base::Base;
-
-    /// Default constructor
-    float_e4m3_t() = default;
-
-#ifdef CUDA_FP8_ENABLED
-    /// Conversion from CUDA's FP8 type
-    CUTLASS_HOST_DEVICE
-    explicit float_e4m3_t(__nv_fp8_e4m3 x) {
-        storage = x.__x;
-    }
-#endif
-
-    /// Floating point conversion
-    CUTLASS_HOST_DEVICE
-    explicit float_e4m3_t(float x) {
-        storage = from_float(x).storage;
-    }
-
-    CUTLASS_HOST_DEVICE
-    explicit float_e4m3_t(half x) {
-        storage = from_half(x).storage;
-    }
-
-    /// Floating point conversion
-    CUTLASS_HOST_DEVICE
-    explicit float_e4m3_t(double x): float_e4m3_t(float(x)) {
-    }
-
-    /// Integer conversion
-    CUTLASS_HOST_DEVICE
-    explicit float_e4m3_t(int x): float_e4m3_t(float(x)) {
-    }
-
-    CUTLASS_HOST_DEVICE
-    explicit float_e4m3_t(unsigned x): float_e4m3_t(float(x)) {
-    }
-
-    /// E5M2 conversion. Defined after float_e5m2_t is defined.
-    CUTLASS_HOST_DEVICE
-    explicit float_e4m3_t(float_e5m2_t x);
-
-#ifdef CUDA_FP8_ENABLED
-    /// Assignment from CUDA's FP8 type
-    CUTLASS_HOST_DEVICE
-    float_e4m3_t & operator=(__nv_fp8_e4m3 x) {
-        storage = x.__x;
-        return *this;
-    }
-#endif
-
-    /// Converts to float
-    CUTLASS_HOST_DEVICE
-    operator float() const {
-        return to_float(*this);
-    }
-
-    /// Converts to half
-    CUTLASS_HOST_DEVICE
-    operator half() const {
-        return to_half(*this);
-    }
-
-    /// Converts to float
-    CUTLASS_HOST_DEVICE
-    explicit operator double() const {
-        return double(to_float(*this));
-    }
-
-    /// Converts to int
-    CUTLASS_HOST_DEVICE
-    explicit operator int() const {
-    #if defined(__CUDA_ARCH__)
-        return __half2int_rn(to_half(*this));
-    #else
-        return int(to_float(*this));
-    #endif
-    }
-
-    /// Casts to bool
-    CUTLASS_HOST_DEVICE
-    explicit operator bool() const {
-    #if defined(__CUDA_ARCH__)
-        return bool(__half2int_rn(to_half(*this)));
-    #else
-        return bool(int(to_float(*this)));
-    #endif
-    }
-
-    /// Accesses raw internal state
-    CUTLASS_HOST_DEVICE
-    uint8_t& raw() {
-        return storage;
-    }
-
-    /// Accesses raw internal state
-    CUTLASS_HOST_DEVICE
-    uint8_t raw() const {
-        return storage;
-    }
-
-    /// Returns the sign bit
-    CUTLASS_HOST_DEVICE
-    bool signbit() const {
-        return ((storage & (1 << (Base::FP8_NUM_BITS - 1))) != 0);
-    }
-
-    /// Returns the biased exponent
-    CUTLASS_HOST_DEVICE
-    int exponent_biased() const {
-        return int((storage >> FP8_NUM_MANTISSA_BITS) & Base::FP8_EXPONENT_MASK);
-    }
-
-    /// Returns the unbiased exponent
-    CUTLASS_HOST_DEVICE
-    int exponent() const {
-        return exponent_biased() - 15;
-    }
-
-    /// Returns the mantissa
-    CUTLASS_HOST_DEVICE
-    int mantissa() const {
-        return int(storage & Base::FP8_MANTISSA_MASK);
-    }
-
-    CUTLASS_HOST_DEVICE
-    friend bool isnan(float_e4m3_t const& x) {
-      return x.storage == uint8_t(0x7f);
-    }
-
-};
-///////////////////////////////////////////////////////////////
-///
-/// floating-point 8 type : E5M2
-///
-///////////////////////////////////////////////////////////////
-struct alignas(1) float_e5m2_t : float8_base<FloatEncoding::E5M2> {
-
-    using Base = float8_base<FloatEncoding::E5M2>;
-
-    static constexpr int MAX_EXPONENT = Base::FP8_MAX_EXPONENT;
-
-    //
-    // Static conversion operators
-    //
-
-    /// Constructs from an uint8_t
-    CUTLASS_HOST_DEVICE
-    static float_e5m2_t bitcast(uint8_t x) {
-        float_e5m2_t f;
-        f.storage = x;
-        return f;
-    }
-
-    /// FP32 -> FP8 conversion - rounds to nearest even
-    CUTLASS_HOST_DEVICE
-    static float_e5m2_t from_float(float const& flt) {
-    #if defined(CUDA_PTX_FP8_CVT_ENABLED)
-        uint16_t tmp;
-        float y = float();
-        asm volatile("cvt.rn.satfinite.e5m2x2.f32 %0, %1, %2;" : "=h"(tmp) : "f"(y), "f"(flt));
-
-        return *reinterpret_cast<float_e5m2_t *>(&tmp);
-    #else
-        return bitcast(Base::convert_float_to_fp8(flt));
-    #endif
-    }
-
-    /// FP16 -> E5M2 conversion - rounds to nearest even
-    CUTLASS_HOST_DEVICE
-    static float_e5m2_t from_half(half const& flt) {
-    #if defined(CUDA_PTX_FP8_CVT_ENABLED)
-        uint16_t tmp = 0;
-        uint32_t bits = reinterpret_cast<uint16_t const &>(flt);
-        asm volatile("cvt.rn.satfinite.e5m2x2.f16x2 %0, %1;" : "=h"(tmp) : "r"(bits));
-
-        return *reinterpret_cast<float_e5m2_t *>(&tmp);
-    #else
-        return bitcast(Base::convert_float_to_fp8(__half2float(flt)));
-    #endif
-    }
-
-    // E5M2 -> half
-    CUTLASS_HOST_DEVICE
-    static half to_half(float_e5m2_t const& x) {
-    #if defined(CUDA_PTX_FP8_CVT_ENABLED)
-        uint16_t bits = x.storage;
-        uint32_t packed;
-        asm volatile("cvt.rn.f16x2.e5m2x2 %0, %1;\n" : "=r"(packed) : "h"(bits));
-
-        return reinterpret_cast<half2 const &>(packed).x;
-    #else
-        return __float2half(Base::convert_fp8_to_float(x.storage));
-    #endif
-    }
-
-    // E5M2 -> Float
-    CUTLASS_HOST_DEVICE
-    static float to_float(float_e5m2_t const& x) {
-    #if defined(CUDA_PTX_FP8_CVT_ENABLED)
-        uint16_t bits = x.storage;
-        uint32_t packed;
-        asm volatile("cvt.rn.f16x2.e5m2x2 %0, %1;\n" : "=r"(packed) : "h"(bits));
-
-        return __half2float(reinterpret_cast<half2 const &>(packed).x);
-    #else
-        return Base::convert_fp8_to_float(x.storage);
-    #endif
-    }
-
-    //
-    // Methods
-    //
-
-    /// Constructor inheritance
-    using Base::Base;
-
-    /// Default constructor
-    float_e5m2_t() = default;
-
-#ifdef CUDA_FP8_ENABLED
-    /// Conversion from CUDA's FP8 type
-    CUTLASS_HOST_DEVICE
-    explicit float_e5m2_t(__nv_fp8_e5m2 x) {
-        storage = x.__x;
-    }
-#endif
-
-    /// Floating point conversion
-    CUTLASS_HOST_DEVICE
-    explicit float_e5m2_t(float x) {
-        storage = from_float(x).storage;
-    }
-
-    CUTLASS_HOST_DEVICE
-    explicit float_e5m2_t(half x) {
-      storage = from_half(x).storage;
-    }
-
-    /// Floating point conversion
-    CUTLASS_HOST_DEVICE
-    explicit float_e5m2_t(double x): float_e5m2_t(float(x)) {
-    }
-
-    /// Integer conversion
-    CUTLASS_HOST_DEVICE
-    explicit float_e5m2_t(int x): float_e5m2_t(float(x)) {
-    }
-
-    CUTLASS_HOST_DEVICE
-    explicit float_e5m2_t(unsigned x): float_e5m2_t(float(x)) {
-    }
-
-    /// E4M3 conversion
-    CUTLASS_HOST_DEVICE
-    explicit float_e5m2_t(float_e4m3_t x);
-
-#ifdef CUDA_FP8_ENABLED
-    /// Assignment from CUDA's FP8 type
-    CUTLASS_HOST_DEVICE
-    float_e5m2_t & operator=(__nv_fp8_e5m2 x) {
-        storage = x.__x;
-        return *this;
-    }
-#endif
-
-    /// Converts to float
-    CUTLASS_HOST_DEVICE
-    operator float() const {
-        return to_float(*this);
-    }
-
-    /// Converts to half
-    CUTLASS_HOST_DEVICE
-    operator half() const {
-      return to_half(*this);
-    }
-
-    /// Converts to float
-    CUTLASS_HOST_DEVICE
-    explicit operator double() const {
-        return double(to_float(*this));
-    }
-
-    /// Converts to int
-    CUTLASS_HOST_DEVICE
-    explicit operator int() const {
-    #if defined(__CUDA_ARCH__)
-        return __half2int_rn(to_half(*this));
-    #else
-        return int(to_float(*this));
-    #endif
-    }
-
-    /// Casts to bool
-    CUTLASS_HOST_DEVICE
-    explicit operator bool() const {
-    #if defined(__CUDA_ARCH__)
-        return bool(__half2int_rn(to_half(*this)));
-    #else
-        return bool(int(to_float(*this)));
-    #endif
-    }
-
-    /// Accesses raw internal state
-    CUTLASS_HOST_DEVICE
-    uint8_t& raw() {
-        return storage;
-    }
-
-    /// Accesses raw internal state
-    CUTLASS_HOST_DEVICE
-    uint8_t raw() const {
-        return storage;
-    }
-
-    /// Returns the sign bit
-    CUTLASS_HOST_DEVICE
-    bool signbit() const {
-        return ((storage & (1 << (Base::FP8_NUM_BITS - 1))) != 0);
-    }
-
-    /// Returns the biased exponent
-    CUTLASS_HOST_DEVICE
-    int exponent_biased() const {
-        return int((storage >> FP8_NUM_MANTISSA_BITS) & Base::FP8_EXPONENT_MASK);
-    }
-
-    /// Returns the unbiased exponent
-    CUTLASS_HOST_DEVICE
-    int exponent() const {
-        return exponent_biased() - 15;
-    }
-
-    /// Returns the mantissa
-    CUTLASS_HOST_DEVICE
-    int mantissa() const {
-        return int(storage & Base::FP8_MANTISSA_MASK);
-    }
-    
-    CUTLASS_HOST_DEVICE
-    friend bool isnan(float_e5m2_t const& x) {
-      return x.storage == uint8_t(0x7f);
-    }
-
-};
-///////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Arithmetic operators
-//
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-CUTLASS_HOST_DEVICE
-bool operator==(float_e4m3_t const& lhs, float_e4m3_t const& rhs) {
-    return float(lhs) == float(rhs);
-}
-
-CUTLASS_HOST_DEVICE
-bool operator!=(float_e4m3_t const& lhs, float_e4m3_t const& rhs) {
-    return float(lhs) != float(rhs);
-}
-
-CUTLASS_HOST_DEVICE
-bool operator<(float_e4m3_t const& lhs, float_e4m3_t const& rhs) {
-    return float(lhs) < float(rhs);
-}
-
-CUTLASS_HOST_DEVICE
-bool operator<=(float_e4m3_t const& lhs, float_e4m3_t const& rhs) {
-    return float(lhs) <= float(rhs);
-}
-
-CUTLASS_HOST_DEVICE
-bool operator>(float_e4m3_t const& lhs, float_e4m3_t const& rhs) {
-    return float(lhs) > float(rhs);
-}
-
-CUTLASS_HOST_DEVICE
-bool operator>=(float_e4m3_t const& lhs, float_e4m3_t const& rhs) {
-    return float(lhs) >= float(rhs);
-}
-
-CUTLASS_HOST_DEVICE
-float_e4m3_t operator+(float_e4m3_t const& lhs, float_e4m3_t const& rhs) {
-    return float_e4m3_t(float(lhs) + float(rhs));
-}
-
-CUTLASS_HOST_DEVICE
-float_e4m3_t operator-(float_e4m3_t const& lhs) {
-    return float_e4m3_t(-float(lhs));
-}
-
-CUTLASS_HOST_DEVICE
-float_e4m3_t operator-(float_e4m3_t const& lhs, float_e4m3_t const& rhs) {
-    return float_e4m3_t(float(lhs) - float(rhs));
-}
-
-CUTLASS_HOST_DEVICE
-float_e4m3_t operator*(float_e4m3_t const& lhs, float_e4m3_t const& rhs) {
-    return float_e4m3_t(float(lhs) * float(rhs));
-}
-
-CUTLASS_HOST_DEVICE
-float_e4m3_t operator/(float_e4m3_t const& lhs, float_e4m3_t const& rhs) {
-    return float_e4m3_t(float(lhs) / float(rhs));
-}
-
-CUTLASS_HOST_DEVICE
-float_e4m3_t& operator+=(float_e4m3_t & lhs, float_e4m3_t const& rhs) {
-    lhs = float_e4m3_t(float(lhs) + float(rhs));
-    return lhs;
-}
-
-CUTLASS_HOST_DEVICE
-float_e4m3_t& operator-=(float_e4m3_t & lhs, float_e4m3_t const& rhs) {
-    lhs = float_e4m3_t(float(lhs) - float(rhs));
-    return lhs;
-}
-
-CUTLASS_HOST_DEVICE
-float_e4m3_t& operator*=(float_e4m3_t & lhs, float_e4m3_t const& rhs) {
-    lhs = float_e4m3_t(float(lhs) * float(rhs));
-    return lhs;
-}
-
-CUTLASS_HOST_DEVICE
-float_e4m3_t& operator/=(float_e4m3_t & lhs, float_e4m3_t const& rhs) {
-    lhs = float_e4m3_t(float(lhs) / float(rhs));
-    return lhs;
-}
-
-CUTLASS_HOST_DEVICE
-float_e4m3_t& operator++(float_e4m3_t & lhs) {
-    float tmp(lhs);
-    ++tmp;
-    lhs = float_e4m3_t(tmp);
-    return lhs;
-}
-
-CUTLASS_HOST_DEVICE
-float_e4m3_t& operator--(float_e4m3_t & lhs) {
-    float tmp(lhs);
-    --tmp;
-    lhs = float_e4m3_t(tmp);
-    return lhs;
-}
-
-CUTLASS_HOST_DEVICE
-float_e4m3_t operator++(float_e4m3_t & lhs, int) {
-    float_e4m3_t ret(lhs);
-    float tmp(lhs);
-    tmp++;
-    lhs = float_e4m3_t(tmp);
-    return ret;
-}
-
-CUTLASS_HOST_DEVICE
-float_e4m3_t operator--(float_e4m3_t & lhs, int) {
-    float_e4m3_t ret(lhs);
-    float tmp(lhs);
-    tmp--;
-    lhs = float_e4m3_t(tmp);
-    return ret;
-}
-
-CUTLASS_HOST_DEVICE
-bool operator==(float_e5m2_t const& lhs, float_e5m2_t const& rhs) {
-    return float(lhs) == float(rhs);
-}
-
-CUTLASS_HOST_DEVICE
-bool operator!=(float_e5m2_t const& lhs, float_e5m2_t const& rhs) {
-    return float(lhs) != float(rhs);
-}
-
-CUTLASS_HOST_DEVICE
-bool operator<(float_e5m2_t const& lhs, float_e5m2_t const& rhs) {
-    return float(lhs) < float(rhs);
-}
-
-CUTLASS_HOST_DEVICE
-bool operator<=(float_e5m2_t const& lhs, float_e5m2_t const& rhs) {
-    return float(lhs) <= float(rhs);
-}
-
-CUTLASS_HOST_DEVICE
-bool operator>(float_e5m2_t const& lhs, float_e5m2_t const& rhs) {
-    return float(lhs) > float(rhs);
-}
-
-CUTLASS_HOST_DEVICE
-bool operator>=(float_e5m2_t const& lhs, float_e5m2_t const& rhs) {
-    return float(lhs) >= float(rhs);
-}
-
-CUTLASS_HOST_DEVICE
-float_e5m2_t operator+(float_e5m2_t const& lhs, float_e5m2_t const& rhs) {
-    return float_e5m2_t(float(lhs) + float(rhs));
-}
-
-CUTLASS_HOST_DEVICE
-float_e5m2_t operator-(float_e5m2_t const& lhs) {
-    return float_e5m2_t(-float(lhs));
-}
-
-CUTLASS_HOST_DEVICE
-float_e5m2_t operator-(float_e5m2_t const& lhs, float_e5m2_t const& rhs) {
-    return float_e5m2_t(float(lhs) - float(rhs));
-}
-
-CUTLASS_HOST_DEVICE
-float_e5m2_t operator*(float_e5m2_t const& lhs, float_e5m2_t const& rhs) {
-    return float_e5m2_t(float(lhs) * float(rhs));
-}
-
-CUTLASS_HOST_DEVICE
-float_e5m2_t operator/(float_e5m2_t const& lhs, float_e5m2_t const& rhs) {
-    return float_e5m2_t(float(lhs) / float(rhs));
-}
-
-CUTLASS_HOST_DEVICE
-float_e5m2_t& operator+=(float_e5m2_t & lhs, float_e5m2_t const& rhs) {
-    lhs = float_e5m2_t(float(lhs) + float(rhs));
-    return lhs;
-}
-
-CUTLASS_HOST_DEVICE
-float_e5m2_t& operator-=(float_e5m2_t & lhs, float_e5m2_t const& rhs) {
-    lhs = float_e5m2_t(float(lhs) - float(rhs));
-    return lhs;
-}
-
-CUTLASS_HOST_DEVICE
-float_e5m2_t& operator*=(float_e5m2_t & lhs, float_e5m2_t const& rhs) {
-    lhs = float_e5m2_t(float(lhs) * float(rhs));
-    return lhs;
-}
-
-CUTLASS_HOST_DEVICE
-float_e5m2_t& operator/=(float_e5m2_t & lhs, float_e5m2_t const& rhs) {
-    lhs = float_e5m2_t(float(lhs) / float(rhs));
-    return lhs;
-}
-
-CUTLASS_HOST_DEVICE
-float_e5m2_t& operator++(float_e5m2_t & lhs) {
-    float tmp(lhs);
-    ++tmp;
-    lhs = float_e5m2_t(tmp);
-    return lhs;
-}
-
-CUTLASS_HOST_DEVICE
-float_e5m2_t& operator--(float_e5m2_t & lhs) {
-    float tmp(lhs);
-    --tmp;
-    lhs = float_e5m2_t(tmp);
-    return lhs;
-}
-
-CUTLASS_HOST_DEVICE
-float_e5m2_t operator++(float_e5m2_t & lhs, int) {
-    float_e5m2_t ret(lhs);
-    float tmp(lhs);
-    tmp++;
-    lhs = float_e5m2_t(tmp);
-    return ret;
-}
-
-CUTLASS_HOST_DEVICE
-float_e5m2_t operator--(float_e5m2_t & lhs, int) {
-    float_e5m2_t ret(lhs);
-    float tmp(lhs);
-    tmp--;
-    lhs = float_e5m2_t(tmp);
-    return ret;
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// float_e4m3_t <=> float_e5m2_t conversions
-//
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// float_e4m3_t <= float_e5m2_t
-CUTLASS_HOST_DEVICE
-float_e4m3_t::float_e4m3_t(float_e5m2_t x) {
-    storage = from_float(float_e5m2_t::to_float(x)).storage;
-}
-
-/// float_e5m2_t <= float_e4m3_t
-CUTLASS_HOST_DEVICE
-float_e5m2_t::float_e5m2_t(float_e4m3_t x) {
-    storage = from_float(float_e4m3_t::to_float(x)).storage;
-}
-
-///////////////////////////////////////////////////////////////
-///
-/// Umbrella floating-point 8-bit data type : type_erased_dynamic_float8_t
-/// This umbrella datatype can be enabled when a user provides a specific
-/// datatype in runtime argument list.
-///
-/// Currently supported runtime datatypes compatible with type_erased_dynamic_float8_t:
-///   QMMAFormat::E5M2
-///   QMMAFormat::E4M3
-///
-///////////////////////////////////////////////////////////////
-
-union type_erased_dynamic_float8_t {
-  uint8_t data;
-  cutlass::float_e5m2_t e5m2;
-  cutlass::float_e4m3_t e4m3;
-  CUTLASS_HOST_DEVICE
-  explicit operator cutlass::float_e5m2_t() const {
-    return e5m2;
-  }
-
-  CUTLASS_HOST_DEVICE
-  explicit operator cutlass::float_e4m3_t() const {
-    return e4m3;
-  }
-
-};
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Standard Library operations and definitions
-//
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-#if !defined(__CUDACC_RTC__)
-namespace std {
-
-/// Numeric limits common to all float8 types
-template <typename T>
-struct float8_base_numeric_limits {
-private:
-  using F8Type = T;
-public:
-  static bool const is_specialized = true;
-  static bool const is_signed = true;
-  static bool const is_integer = false;
-  static bool const is_exact = false;
-  static bool const has_quiet_NaN = true;
-  static bool const has_signaling_NaN = false;
-  static std::float_denorm_style const has_denorm = std::denorm_present;
-  static bool const has_denorm_loss = true;
-  static std::float_round_style const round_style = std::round_to_nearest;
-  static bool const is_iec559 = false;
-  static bool const is_bounded = true;
-  static bool const is_modulo = false;
-  static int const digits = F8Type::FP8_NUM_MANTISSA_BITS;
-
-  /// Least positive value
-  CUTLASS_HOST_DEVICE
-  static F8Type min() { return F8Type::bitcast(0x01); }
-
-  /// Maximum finite value
-  CUTLASS_HOST_DEVICE
-  static F8Type max() { return F8Type::bitcast(F8Type::FP8_MAX_FLT); }
-
-  /// Returns maximum rounding error
-  CUTLASS_HOST_DEVICE
-  static F8Type round_error() { return F8Type(0.5f); }
-
-  /// Returns positive infinity value
-  CUTLASS_HOST_DEVICE
-  static F8Type infinity() { return F8Type::bitcast(F8Type::FP8_INFINITY_MASK); }
-
-  /// Returns quiet NaN value
-  CUTLASS_HOST_DEVICE
-  static F8Type quiet_NaN() { return F8Type::bitcast(F8Type::FP8_NAN); }
-
-  /// Returns signaling NaN value
-  CUTLASS_HOST_DEVICE
-  static F8Type signaling_NaN() { return F8Type::bitcast(F8Type::FP8_NAN); }
-
-  /// Returns smallest positive subnormal value
-  CUTLASS_HOST_DEVICE
-  static F8Type denorm_min() { return F8Type::bitcast(0x01); }
-};
-
-/// Numeric limits for float_e4m3_t
-template <>
-struct numeric_limits<cutlass::float_e4m3_t> :
-    public float8_base_numeric_limits<cutlass::float_e4m3_t> {
-  static bool const has_infinity = false;
-
-  /// Minimum finite value
-  static cutlass::float_e4m3_t lowest() { return cutlass::float_e4m3_t::bitcast(0xfe); }
-
-  /// Machine epsilon, that is, the difference between 1.0 and the next representable value
-  static cutlass::float_e4m3_t epsilon() { return cutlass::float_e4m3_t::bitcast(0x20); }
-};
-
-/// Numeric limits for float_e5m2_t
-template <>
-struct numeric_limits<cutlass::float_e5m2_t>  :
-    public float8_base_numeric_limits<cutlass::float_e5m2_t> {
-  static bool const has_infinity = true;
-
-  /// Minimum finite value
-  static cutlass::float_e5m2_t lowest() { return cutlass::float_e5m2_t::bitcast(0xfb); }
-
-  /// Machine epsilon, that is, the difference between 1.0 and the next representable value
-  static cutlass::float_e5m2_t epsilon() { return cutlass::float_e5m2_t::bitcast(0x34); }
-};
-
-}  // namespace std
-#endif
-
-namespace cutlass {
-namespace platform {
-
-/// Numeric limits common to all float8 types
-template <typename T>
-struct float8_base_numeric_limits {
-private:
-  using F8Type = T;
-public:
-  static bool const is_specialized = true;
-  static bool const is_signed = true;
-  static bool const is_integer = false;
-  static bool const is_exact = false;
-  static bool const has_quiet_NaN = true;
-  static bool const has_signaling_NaN = false;
-#if !defined(__CUDACC_RTC__)
-  static std::float_denorm_style const has_denorm = std::denorm_present;
-#endif
-  static bool const has_denorm_loss = true;
-#if !defined(__CUDACC_RTC__)
-  static std::float_round_style const round_style = std::round_to_nearest;
-#endif
-  static bool const is_iec559 = false;
-  static bool const is_bounded = true;
-  static bool const is_modulo = false;
-  static int const digits = F8Type::FP8_NUM_MANTISSA_BITS;
-
-  /// Least positive value
-  CUTLASS_HOST_DEVICE
-  static F8Type min() { return F8Type::bitcast(0x01); }
-
-  /// Maximum finite value
-  CUTLASS_HOST_DEVICE
-  static F8Type max() { return F8Type::bitcast(F8Type::FP8_MAX_FLT); }
-
-  /// Returns maximum rounding error
-  CUTLASS_HOST_DEVICE
-  static F8Type round_error() { return F8Type(0.5f); }
-
-  /// Returns positive infinity value
-  CUTLASS_HOST_DEVICE
-  static F8Type infinity() { return F8Type::bitcast(F8Type::FP8_INFINITY_MASK); }
-
-  /// Returns quiet NaN value
-  CUTLASS_HOST_DEVICE
-  static F8Type quiet_NaN() { return F8Type::bitcast(F8Type::FP8_NAN); }
-
-  /// Returns signaling NaN value
-  CUTLASS_HOST_DEVICE
-  static F8Type signaling_NaN() { return F8Type::bitcast(F8Type::FP8_NAN); }
-
-  /// Returns smallest positive subnormal value
-  CUTLASS_HOST_DEVICE
-  static F8Type denorm_min() { return F8Type::bitcast(0x01); }
-};
-
-/// Forward Declaration
-template <class T>
-struct numeric_limits;
-
-/// Numeric limits for float_e4m3_t
-template <>
-struct numeric_limits<cutlass::float_e4m3_t> :
-    public float8_base_numeric_limits<cutlass::float_e4m3_t> {
-  static bool const has_infinity = false;
-
-  /// Minimum finite value
-  static cutlass::float_e4m3_t lowest() { return cutlass::float_e4m3_t::bitcast(0xfe); }
-
-  /// Machine epsilon, that is, the difference between 1.0 and the next representable value
-  static cutlass::float_e4m3_t epsilon() { return cutlass::float_e4m3_t::bitcast(0x20); }
-};
-
-/// Numeric limits for float_e5m2_t
-template <>
-struct numeric_limits<cutlass::float_e5m2_t>  :
-    public float8_base_numeric_limits<cutlass::float_e5m2_t> {
-  static bool const has_infinity = true;
-
-  /// Minimum finite value
-  static cutlass::float_e5m2_t lowest() { return cutlass::float_e5m2_t::bitcast(0xfb); }
-
-  /// Machine epsilon, that is, the difference between 1.0 and the next representable value
-  static cutlass::float_e5m2_t epsilon() { return cutlass::float_e5m2_t::bitcast(0x34); }
-};
-
-}  // namespace platform
-
-}  // namespace cutlass
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-//
-// User-defined literals
-//
-
-CUTLASS_HOST_DEVICE
-cutlass::float_e4m3_t operator "" _fe4m3(long double x) {
-  return cutlass::float_e4m3_t(float(x));
-}
-
-CUTLASS_HOST_DEVICE
-cutlass::float_e4m3_t operator "" _fe4m3(unsigned long long int x) {
-  return cutlass::float_e4m3_t(int(x));
-}
-
-CUTLASS_HOST_DEVICE
-cutlass::float_e5m2_t operator "" _fe5m2(long double x) {
-  return cutlass::float_e5m2_t(float(x));
-}
-
-CUTLASS_HOST_DEVICE
-cutlass::float_e5m2_t operator "" _fe5m2(unsigned long long int x) {
-  return cutlass::float_e5m2_t(int(x));
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/floating_point_nvrtc.h b/lightllm-kernel/cutlass/include/cutlass/floating_point_nvrtc.h
deleted file mode 100755
index fdbd80fcd..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/floating_point_nvrtc.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*!
-    \file
-    \brief Defines categories for floating point numbers for use in NVRTC-compiled code
-*/
-
-#pragma once
-
-namespace cutlass {
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-// All floating-point numbers can be put in one of these categories.
-enum  {
-    FP_NAN =
-# define FP_NAN 0
-      FP_NAN,
-    FP_INFINITE =
-# define FP_INFINITE 1
-      FP_INFINITE,
-    FP_ZERO =
-# define FP_ZERO 2
-      FP_ZERO,
-    FP_SUBNORMAL =
-# define FP_SUBNORMAL 3
-      FP_SUBNORMAL,
-    FP_NORMAL =
-# define FP_NORMAL 4
-      FP_NORMAL
-};
-
-CUTLASS_HOST_DEVICE
-int fpclassify(float const& f) {
-
-  uint32_t s;
-
-  #if defined(__CUDA_ARCH__)
-  s = reinterpret_cast<uint32_t const &>(f);
-  #else
-  std::memcpy(&s, &f, sizeof(s));
-  #endif
-
-  uint32_t exp      = s & 0x7f800000;
-  uint32_t mantissa = s & 0x007fffff;
-
-  if (exp == 0x7f800000) {
-    if (mantissa) {
-      return FP_NAN;
-    }
-    else {
-      return FP_INFINITE;
-    }
-  }
-  else if (!exp) {
-    if (mantissa) {
-      return FP_SUBNORMAL;
-    }
-    else {
-      return FP_ZERO;
-    }
-  }
-  return FP_NORMAL;
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/functional.h b/lightllm-kernel/cutlass/include/cutlass/functional.h
deleted file mode 100755
index 5b2bc3c67..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/functional.h
+++ /dev/null
@@ -1,930 +0,0 @@
-  /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Define basic numeric operators
-
-    This is inspired by the Standard Library's <functional> header.
-*/
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/platform/platform.h"
-#if defined(__CUDACC_RTC__)
-#include "cutlass/floating_point_nvrtc.h"
-#endif
-
-#include <cuda_runtime.h>
-
-#if defined(CUTLASS_ARCH_WMMA_ENABLED)
-#include <mma.h>
-#endif // defined(CUTLASS_ARCH_WMMA_ENABLED)
-
-#ifdef _MSC_VER
-// Provides support for alternate operators such as 'and', 'or', ...
-#include <iso646.h>
-#endif // _MSC_VER
-
-namespace cutlass {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename T>
-struct absolute_value_op {
-  CUTLASS_HOST_DEVICE
-  T operator()(T lhs) const {
-    return abs(lhs);
-  }
-};
-
-template <>
-struct absolute_value_op<float> {
-  CUTLASS_HOST_DEVICE
-  float operator()(float lhs) const { return fabs(lhs); }
-};
-
-template <typename T>
-struct plus {
-  CUTLASS_HOST_DEVICE
-  T operator()(T lhs, T const &rhs) const {
-    lhs += rhs;
-    return lhs;
-  }
-};
-
-template <typename T>
-struct minus {
-  CUTLASS_HOST_DEVICE
-  T operator()(T lhs, T const &rhs) const {
-    lhs -= rhs;
-    return lhs;
-  }
-};
-
-template <typename T>
-struct multiplies {
-  CUTLASS_HOST_DEVICE
-  T operator()(T lhs, T const &rhs) const {
-    lhs *= rhs;
-    return lhs;
-  }
-};
-
-template <typename T>
-struct scale {
-  T const scaling_factor_;
-
-  CUTLASS_HOST_DEVICE
-  scale(float scaling_factor) : scaling_factor_(scaling_factor) {
-  }
-
-  T operator()(T const &rhs) const {
-    T result = rhs * scaling_factor_;
-    return result;
-  }
-};
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-/// Partial specializations needed when __CUDA_NO_HALF2_OPERATORS__ is set
-template<>
-struct plus<__half2> {
-  CUTLASS_HOST_DEVICE
-  __half2 operator()(__half2 lhs, __half2 const &rhs) const {
-    return __hadd2(lhs, rhs);
-  }
-};
-
-template<>
-struct minus<__half2> {
-  CUTLASS_HOST_DEVICE
-  __half2 operator()(__half2 lhs, __half2 const &rhs) const {
-    return __hsub2(lhs, rhs);
-  }
-};
-
-template<>
-struct multiplies<__half2> {
-  CUTLASS_HOST_DEVICE
-  __half2 operator()(__half2 lhs, __half2 const &rhs) const {
-    return __hmul2(lhs, rhs);
-  }
-};
-
-/// Partial specializations needed when __CUDA_NO_HALF_OPERATORS__ is set
-template<>
-struct plus<__half> {
-  CUTLASS_HOST_DEVICE
-  __half operator()(__half lhs, __half const &rhs) const {
-    return __hadd(lhs, rhs);
-  }
-};
-
-template<>
-struct minus<__half> {
-  CUTLASS_HOST_DEVICE
-  __half operator()(__half lhs, __half const &rhs) const {
-    return __hsub(lhs, rhs);
-  }
-};
-
-template<>
-struct multiplies<__half> {
-  CUTLASS_HOST_DEVICE
-  __half operator()(__half lhs, __half const &rhs) const {
-    return __hmul(lhs, rhs);
-  }
-};
-#endif // defined(__CUDA_ARCH__)
-
-
-/// Squares with optional conversion
-template <typename T, typename Output = T>
-struct square {
-  CUTLASS_HOST_DEVICE
-  Output operator()(T lhs) const {
-    multiplies<Output> mul_op;
-
-    Output y = Output(lhs);
-    return mul_op(y, y);
-  }
-};
-
-/// Returns the magnitude squared of an element.
-template <typename T, typename Output = T>
-struct magnitude_squared {
-  CUTLASS_HOST_DEVICE
-  Output operator()(T lhs) const {
-    multiplies<Output> mul_op;
-
-    Output y = Output(lhs);
-    return mul_op(y, y);
-  }
-};
-
-/// Computes the square of a difference with optional conversion
-template <typename T, typename Output = T>
-struct square_difference {
-  CUTLASS_HOST_DEVICE
-  Output operator()(T lhs, T rhs) const {
-    multiplies<Output> mul_op;
-
-    Output y = Output(lhs) - Output(rhs);
-    return mul_op(y, y);
-  }
-};
-
-/// Computes the square of a difference with optional conversion
-template <typename T, typename Output = T>
-struct magnitude_squared_difference {
-  CUTLASS_HOST_DEVICE
-  Output operator()(T lhs, T rhs) const {
-    multiplies<Output> mul_op;
-
-    Output y = Output(lhs) - Output(rhs);
-    return mul_op(y, y);
-  }
-};
-
-// Computes the reciprocal square root
-template <typename T>
-struct inverse_square_root;
-
-template <>
-struct inverse_square_root<float> {
-  CUTLASS_HOST_DEVICE
-  float operator()(float const &lhs) const {
-#if defined(__CUDA_ARCH__)
-    return rsqrtf(lhs);
-#else
-    return 1.f / std::sqrt(lhs);
-#endif
-  }
-};
-
-template <>
-struct inverse_square_root<half_t> {
-  CUTLASS_HOST_DEVICE
-  half_t operator()(half_t const &lhs) const {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 520)
-    auto result = hrsqrt(reinterpret_cast<__half const &>(lhs));
-    return reinterpret_cast<half_t const &>(result);
-#else
-    return half_t(1.f / std::sqrt(half_t::convert(lhs)));
-#endif
-  }
-};
-
-/// Divides
-template <typename T>
-struct divides {
-  CUTLASS_HOST_DEVICE
-  T operator()(T lhs, T const &rhs) const {
-    lhs /= rhs;
-    return lhs;
-  }
-};
-
-/// reciprocal_approximate
-template <typename T>
-struct reciprocal_approximate {
-  CUTLASS_HOST_DEVICE
-  T operator()(T lhs) const {
-    return divides<T>{}(T(1), lhs);
-  }
-};
-
-template <>
-struct reciprocal_approximate <float> {
-  CUTLASS_HOST_DEVICE
-  float operator()(float lhs) const {
-    float ret;
-    #if defined(__CUDA_ARCH__)
-      asm volatile ("rcp.approx.f32 %0, %1;\n" : "=f"(ret) : "f"(lhs));
-    #else
-      ret = 1.0f / lhs;
-    #endif
-    return ret;
-  }
-};
-
-/// reciprocal_approximate with ftz
-template<typename T>
-struct reciprocal_approximate_ftz :  reciprocal_approximate<T>
-{};
-
-template <>
-struct reciprocal_approximate_ftz <float> {
-  CUTLASS_HOST_DEVICE
-  float operator()(float lhs) const {
-    float ret;
-    #if defined(__CUDA_ARCH__)
-      asm volatile ("rcp.approx.ftz.f32 %0, %1;\n" : "=f"(ret) : "f"(lhs));
-    #else
-      if (std::fpclassify(lhs) == FP_SUBNORMAL) {
-        lhs = 0.0f;
-      }
-      ret = 1.0f / lhs;
-      if (std::fpclassify(ret) == FP_SUBNORMAL) {
-        ret = 0.0f;
-      }
-    #endif
-    return ret;
-  }
-};
-
-/// Negate
-template <typename T>
-struct negate {
-  CUTLASS_HOST_DEVICE
-  T operator()(T lhs) const {
-    return -lhs;
-  }
-};
-
-/// Greater equal
-template <typename T>
-struct greater_equal {
-  CUTLASS_HOST_DEVICE
-  bool operator()(T const &lhs, T const &rhs) const {
-    return (lhs >= rhs);
-  }
-};
-
-/// Greater
-template <typename T>
-struct greater {
-  CUTLASS_HOST_DEVICE
-  bool operator()(T const &lhs, T const &rhs) const {
-    return (lhs > rhs);
-  }
-};
-
-/// Less equal
-template <typename T>
-struct less_equal {
-  CUTLASS_HOST_DEVICE
-  bool operator()(T const &lhs, T const &rhs) const {
-    return (lhs <= rhs);
-  }
-};
-
-/// Less
-template <typename T>
-struct less {
-  CUTLASS_HOST_DEVICE
-  bool operator()(T const &lhs, T const &rhs) const {
-    return (lhs < rhs);
-  }
-};
-
-template <typename T, bool PropagateNaN = false>
-struct maximum {
-  CUTLASS_HOST_DEVICE
-  T operator()(T const &lhs, T const &rhs) const {
-    if constexpr (PropagateNaN && cutlass::platform::is_floating_point<T>::value) {
-      using CUTLASS_CMATH_NAMESPACE :: isnan;
-
-      // Call isnan unqualified, so argument-dependent lookup (ADL)
-      // will find overloads such as cutlass::isnan(half_t).
-      // Calling ::isnan or std::isnan directly would force
-      // implicit conversions to float of custom number types
-      // in the cutlass namespace (e.g., cutlass::half_t).
-      return lhs > rhs || isnan(lhs) ? lhs : rhs;
-    }
-    else {
-      return (lhs < rhs ? rhs : lhs);
-    }
-  }
-};
-
-// This is a subclass and not an alias
-// in order to work around a known Clang issue,
-// where a template template parameter with one template parameter
-// does not match classes that take multiple template parameters
-// but have defaults for all but the first.
-template<typename T>
-struct maximum_with_default_nan_propagation : public maximum<T>
-{};
-
-template <>
-struct maximum<float, false> {
-  CUTLASS_HOST_DEVICE
-  float operator()(float const &lhs, float const &rhs) const {
-    return fmaxf(lhs, rhs);
-  }
-};
-
-template <>
-struct maximum<float, true> {
-  CUTLASS_HOST_DEVICE
-  float operator()(float lhs, float rhs) const {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-    float res;
-    asm volatile("max.NaN.f32 %0, %1, %2;\n" : "=f"(res) : "f"(lhs), "f"(rhs));
-    return res;
-#else
-    using CUTLASS_CMATH_NAMESPACE :: isnan;
-
-    return lhs > rhs || isnan(lhs) ? lhs : rhs;
-#endif
-  }
-};
-
-// This is a subclass and not an alias
-// in order to work around a known Clang issue,
-// where a template template parameter with one template parameter
-// does not match classes that take multiple template parameters
-// but have defaults for all but the first.
-template <typename T>
-struct maximum_with_nan_propagation : maximum<T, true>
-{};
-
-// This alias exists for backwards compatibility only.
-// Please use the correctly spelled class template above.
-template <typename T>
-using maximum_with_nan_propogation = maximum_with_nan_propagation<T>;
-
-template <typename T, bool PropagateNaN = false>
-struct minimum {
-  CUTLASS_HOST_DEVICE
-  T operator()(T const &lhs, T const &rhs) const {
-    if constexpr (PropagateNaN && cutlass::platform::is_floating_point<T>::value) {
-      using CUTLASS_CMATH_NAMESPACE :: isnan;
-
-      return lhs < rhs || isnan(lhs) ? lhs : rhs;
-    }
-    else {
-      return (rhs < lhs ? rhs : lhs);
-    }
-  }
-};
-
-template <>
-struct minimum<float, false> {
-  CUTLASS_HOST_DEVICE
-  float operator()(float const &lhs, float const &rhs) const {
-    return fminf(lhs, rhs);
-  }
-};
-
-template <>
-struct minimum<float, true> {
-  CUTLASS_HOST_DEVICE
-  float operator()(float lhs, float rhs) const {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-    float res;
-    asm volatile("min.NaN.f32 %0, %1, %2;\n" : "=f"(res) : "f"(lhs), "f"(rhs));
-    return res;
-#else
-    // No need for ADL; call std::isnan(float) on host and ::isnan(float) on device.
-    return lhs < rhs || (CUTLASS_CMATH_NAMESPACE :: isnan(lhs)) ? lhs : rhs;
-#endif
-  }
-};
-
-template <typename T>
-struct minimum_with_nan_propagation : minimum<T, true> 
-{};
-
-template <typename T, bool PropagateNaN = false>
-struct maximum_absolute_value {
-  CUTLASS_HOST_DEVICE
-  float operator()(T const &lhs, T const &rhs) const {
-    absolute_value_op<T> abs_op;
-    maximum<T, PropagateNaN> max_op;
-
-    return max_op(abs_op(lhs), abs_op(rhs));
-  }
-};
-
-// assumes the left operand is already an absolute value
-template <typename T, bool PropagateNaN = false>
-struct maximum_absolute_value_reduction {
-  CUTLASS_HOST_DEVICE
-  float operator()(T const &lhs, T const &rhs) const {
-    absolute_value_op<T> abs_op;
-    maximum<T, PropagateNaN> max_op;
-
-    return max_op(lhs, abs_op(rhs));
-  }
-};
-
-/// Fused multiply-add
-template <typename A, typename B = A, typename C = A>
-struct multiply_add {
-  CUTLASS_HOST_DEVICE
-  C operator()(A const &a, B const &b, C const &c) const {
-    return C(a) * C(b) + c;
-  }
-};
-
-template <typename T>
-struct square_and_plus {
-  CUTLASS_HOST_DEVICE
-  T operator()(T lhs, T const &rhs) const {
-    multiply_add<T> multiply_add_op;
-    return multiply_add_op(rhs, rhs, lhs);
-  }
-};
-
-// Fused multiply-add that takes exactly one template parameter.
-// This is useful for working around a known Clang issue,
-// where a template template parameter with one template parameter
-// does not match classes that take multiple template parameters
-// but have defaults for all but the first.
-template <typename A>
-struct homogeneous_multiply_add : public multiply_add<A, A, A>
-{};
-
-/// Fused multiply-add
-template <typename A, typename B = A, typename C = A>
-struct multiply_add_relu0 {
-  CUTLASS_HOST_DEVICE
-  C operator()(A const &a, B const &b, C const &c) const {
-    maximum<C> mx;
-    return mx(C(a) * C(b) + c, C(0));
-  }
-};
-
-/// Guarded-multiply-add
-template <typename A, typename B = A, typename C = A>
-struct guarded_multiply_add {
-  CUTLASS_HOST_DEVICE
-  C operator()(A const &a, B const &b, C const &c) const {
-    using CUTLASS_CMATH_NAMESPACE :: isnan;
-
-    if (isnan(a) || isnan(b)) {
-      return C(0);
-    }
-    return C(a) * C(b) + c;
-  }
-};
-
-/// Guarded-multiply-add
-template <>
-struct guarded_multiply_add<half_t, half_t, half_t> {
-  CUTLASS_HOST_DEVICE
-  half_t operator()(half_t const &a, half_t const &b, half_t const &c) const {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
-    half_t result;
-    asm ("fma.rn.oob.f16 %0, %1, %2, %3;\n"
-      : "=h"(*reinterpret_cast<uint16_t*>(&result))
-      : "h"(*reinterpret_cast<uint16_t const*>(&a)), "h"(*reinterpret_cast<uint16_t const*>(&b)), "h"(*reinterpret_cast<uint16_t const*>(&c)));
-    return result;
-#else
-    // Namespace-qualifying isnan as cutlass::isnan saves the compiler
-    // the trouble of argument-dependent lookup.  Calling std::isnan or
-    // ::isnan here would result in unwanted implicit conversion to float.
-    if (cutlass::isnan(a) || cutlass::isnan(b)) {
-      return half_t(0);
-    }
-    return a * b + c;
-#endif
-  }
-};
-
-/// Guarded-multiply-add-relu0
-template <typename A, typename B = A, typename C = A>
-struct guarded_multiply_add_relu0 {
-  CUTLASS_HOST_DEVICE
-  C operator()(A const &a, B const &b, C const &c) const {
-    using CUTLASS_CMATH_NAMESPACE :: isnan;
-
-    if (isnan(a) || isnan(b)) {
-      return C(0);
-    }
-    maximum<C> mx;
-    return mx(C(a) * C(b) + c, C(0));
-  }
-};
-
-template <>
-struct guarded_multiply_add_relu0<half_t, half_t, half_t> {
-  CUTLASS_HOST_DEVICE
-  half_t operator()(half_t const &a, half_t const &b, half_t const &c) const {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
-    half_t result;
-    asm ("fma.rn.oob.relu.f16 %0, %1, %2, %3;\n"
-      : "=h"(*reinterpret_cast<uint16_t*>(&result))
-      : "h"(*reinterpret_cast<uint16_t const*>(&a)), "h"(*reinterpret_cast<uint16_t const*>(&b)), "h"(*reinterpret_cast<uint16_t const*>(&c)));
-    return result;
-#else
-    if (cutlass::isnan(a) || cutlass::isnan(b)) {
-      return half_t(0);
-    }
-    maximum<half_t> mx;
-    return mx(a * b + c, half_t(0));
-#endif
-  }
-};
-
-/// Fused multiply-add
-template <typename T>
-struct and_add {
-  CUTLASS_HOST_DEVICE
-  T operator()(T const &a, T const &b, T const &c) const {
-    return ((a & b) + c);
-  }
-};
-
-
-/// Fused multiply-add
-template <typename T>
-struct xor_add {
-  CUTLASS_HOST_DEVICE
-  T operator()(T const &a, T const &b, T const &c) const {
-    return ((a ^ b) + c);
-  }
-};
-
-namespace detail {
-
-// Whether namespace-unqualified conj(t) for t of type T is
-// well-formed.  This says whether the compiler can find
-// namespace-unqualified conj(T) via argument-dependent lookup.
-// If so, then CUTLASS assumes that conj(t) returns
-// the complex conjugate of t.
-template <typename T, typename Enable = void>
-struct has_unqualified_conj : cutlass::platform::false_type
-{};
-
-template<typename T>
-struct has_unqualified_conj<
-    T,
-    decltype(conj(cutlass::platform::declval<T>()), void())
-  > : cutlass::platform::true_type
-{};
-
-template <typename T>
-constexpr bool has_unqualified_conj_v = has_unqualified_conj<T>::value;
-  
-} // namespace detail
-
-// forward declaration (needed for conjugate below)
-template<class T>
-CUTLASS_HOST_DEVICE T conj(T const& z);
-
-namespace detail {
-
-// Whether cutlass::conj(t) for t of type T is well-formed.
-// If so, then CUTLASS assumes that cutlass::conj(t)
-// returns the complex conjugate of t.
-template <typename T, typename Enable = void>
-struct has_cutlass_conj : cutlass::platform::false_type
-{};
-
-template<typename T>
-struct has_cutlass_conj<
-    T,
-    decltype(cutlass::conj(cutlass::platform::declval<T>()), void())
-  > : cutlass::platform::true_type
-{};
-
-template <typename T>
-constexpr bool has_cutlass_conj_v = has_cutlass_conj<T>::value;
-
-} // namespace detail
-  
-// Return the complex conjugate of the input.
-//
-// If the struct hasn't already been specialized for type T, then
-//
-// 1. for arithmetic types, return z;
-//
-// 2. for types where either (namespace-unqualified) conj(z) or
-//    cutlass::conj(z) is well formed, declare "using cutlass::conj;"
-//    and return conj(z); and
-//
-// 3. for everything else, return z.
-//
-// Regarding (1), the C++ Standard Library makes std::conj always
-// return std::complex, even for (noncomplex) arithmetic types.
-// cutlass::conj(T t) needs to return type T.  This follows the
-// convention of linear algebra software like the BLAS, where
-// "conjugate transpose" means the same thing as "transpose" for a
-// matrix of noncomplex numbers.
-//
-// Case (2) covers std::complex, cuda::std::complex, and non-Standard
-// (including user-defined) complex number types (for which "conj(z)"
-// is findable via argument-dependent lookup).  cutlass::conj has a
-// totally generic overload, but a more type-specific overload in any
-// namespace will take precedence.
-//
-// Case (3) covers non-Standard non-complex number types.
-//
-// Users should not generally need to specialize this struct for their
-// own custom complex or noncomplex types.  The idiomatic way to
-// identify a type T as "complex" is to make namespace-unqualified
-// calls to conj(T) findable via argument-dependent lookup.
-template <typename T>
-struct conjugate {
-  CUTLASS_HOST_DEVICE
-  T operator()(T const& z) const {
-    if constexpr (cutlass::platform::is_arithmetic_v<T>) {
-      return z;
-    }
-    else if constexpr (detail::has_unqualified_conj_v<T> || detail::has_cutlass_conj_v<T>) {
-      using cutlass::conj;
-      return conj(z);
-    }
-    else {
-      return z;
-    }
-  }
-};
-
-template <typename T>
-struct first {
-  CUTLASS_HOST_DEVICE
-  T operator()(T const & first, T const &...) const {
-    return first;
-  }
-  CUTLASS_HOST_DEVICE
-  T operator()(T const & first) const {
-    return first;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename T>
-struct logical_and {
-  CUTLASS_HOST_DEVICE
-  T operator()(T const &a, T const &b) const {
-    return ((static_cast<bool>(a) && static_cast<bool>(b)) ? T(1) : T());
-  }
-};
-
-template <typename T>
-struct logical_or {
-  CUTLASS_HOST_DEVICE
-  T operator()(T const &a, T const &b) const {
-    return ((static_cast<bool>(a) || static_cast<bool>(b)) ? T(1) : T());
-  }
-};
-
-template <typename T>
-struct logical_not {
-  CUTLASS_HOST_DEVICE
-  T operator()(T const &a) const {
-    return T(!(a));
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename T>
-struct bit_and {
-  CUTLASS_HOST_DEVICE
-  T operator()(T const &a, T const &b) const {
-    return a & b;
-  }
-};
-
-template <typename T>
-struct bit_or {
-  CUTLASS_HOST_DEVICE
-  T operator()(T const &a, T const &b) const {
-    return a | b;
-  }
-};
-
-template <typename T>
-struct bit_not {
-  CUTLASS_HOST_DEVICE
-  T operator()(T const &a) const {
-    return ~a;
-  }
-};
-
-template <typename T>
-struct bit_xor {
-  CUTLASS_HOST_DEVICE
-  T operator()(T const &a, T const &b) const {
-    return a ^ b;
-  }
-};
-
-//////////////////////////////////////////////////////////////////////////////////////////////////
-/// Atomic reductions
-
-template <typename T>
-struct atomic_add
-{
-  CUTLASS_DEVICE
-  void operator()(T *ptr, const T &data)
-  {
-#if defined(__CUDA_ARCH__)
-    atomicAdd(ptr, data);
-#else
-    CUTLASS_UNUSED(ptr);
-    CUTLASS_UNUSED(data);
-    CUTLASS_NOT_IMPLEMENTED();
-#endif
-  }
-};
-
-template<>
-struct atomic_add<double>
-{
-  CUTLASS_DEVICE
-  void operator()(double *ptr, const double &data)
-  {
-#if !defined(__CUDA_ARCH__)
-    CUTLASS_UNUSED(ptr);
-    CUTLASS_UNUSED(data);
-    CUTLASS_NOT_IMPLEMENTED();
-#elif (__CUDA_ARCH__ >= 600)
-    atomicAdd(ptr, data);
-#else
-    // Use CAS loop
-    unsigned long long int* ptr_int = reinterpret_cast<unsigned long long int*>(ptr);
-    unsigned long long int old_int = *ptr_int;
-    unsigned long long int assumed_int;
-
-    do {
-      double update = data + __longlong_as_double(old_int);
-      assumed_int = old_int;
-      old_int = atomicCAS(ptr_int, assumed_int, __double_as_longlong(update));
-    } while (assumed_int != old_int);
-#endif // (__CUDA_ARCH__ >= 600)
-  }
-};
-
-template<>
-struct atomic_add<half2>
-{
-  CUTLASS_DEVICE
-  void operator()(half2 *ptr, const half2 &data)
-  {
-#if !defined(__CUDA_ARCH__) || (defined(__CUDA_ARCH__)  && (__CUDA_ARCH__ < 600))
-      CUTLASS_UNUSED(ptr);
-      CUTLASS_UNUSED(data);
-      CUTLASS_NOT_IMPLEMENTED();
-#else
-    // Vector-2 atomic reduction requires .target sm_60 or higher
-    uint32_t word = reinterpret_cast<const uint32_t&>(data);
-    asm volatile ("red.gpu.global.add.noftz.f16x2 [%0], %1;\n" : : "l"(ptr), "r"(word));
-#endif // (__CUDA_ARCH__ >= 600)
-  }
-};
-
-template <typename T>
-using red [[deprecated("use atomic_add instead")]] = atomic_add<T>;
-
-template <typename T>
-struct atomic_maximum {
-  CUTLASS_DEVICE
-  T operator()(T *ptr, T value) const {
-#if defined(__CUDA_ARCH__)
-    return atomicMax(ptr, value);
-#else
-    CUTLASS_UNUSED(ptr);
-    CUTLASS_UNUSED(value);
-    CUTLASS_NOT_IMPLEMENTED();
-    return 0;
-#endif
-  }
-};
-
-template <>
-struct atomic_maximum<float> {
-  CUTLASS_DEVICE
-  float operator()(float *ptr, float value) const {
-#if defined(__CUDA_ARCH__)
-    // In device code, make sure that we do NOT try to use
-    // std::signbit, as that won't work if building with NVRTC.
-    // Instead, prefix "::" to call signbit from the global namespace,
-    // which CUDA guarantees to work in device code without including
-    // any headers.
-    //
-    return ! ::signbit(value) ?
-      __int_as_float(atomicMax((int*)ptr, __float_as_int(value))) :
-      __uint_as_float(atomicMin((unsigned int*)ptr, __float_as_uint(value)));
-#else
-    CUTLASS_UNUSED(ptr);
-    CUTLASS_UNUSED(value);
-    CUTLASS_NOT_IMPLEMENTED();
-    return 0;
-#endif
-  }
-};
-
-// is_atomic
-template <class Fn>
-struct is_atomic : platform::false_type {};
-template <class T>
-struct is_atomic<atomic_add<T>> : platform::true_type {};
-template <class T>
-struct is_atomic<atomic_maximum<T>> : platform::true_type {};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Partial specializations for nvcuda::wmma::fragment<Use, m, n, k, T, Layout>
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-#if defined(CUTLASS_ARCH_WMMA_ENABLED)
-
-template<typename Use, int m, int n, int k, typename T, typename Layout>
-struct plus<nvcuda::wmma::fragment<Use, m, n, k, T, Layout>>
-{
-  using Fragment = nvcuda::wmma::fragment<Use, m, n, k, T, Layout>;
-  using ElementType = typename Fragment::element_type;
-
-  CUTLASS_HOST_DEVICE
-  Fragment operator()(Fragment const &lhs, Fragment const &rhs) const
-  {
-    Fragment result;
-    plus<ElementType> scalar_op;
-
-    ElementType *result_elts = reinterpret_cast<ElementType*>(&result);
-    const ElementType *lhs_elts = reinterpret_cast<const ElementType*>(&lhs);
-    const ElementType *rhs_elts = reinterpret_cast<const ElementType*>(&rhs);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < Fragment::num_elements; i++) {
-      result_elts[i] = scalar_op(lhs_elts[i], rhs_elts[i]);
-    }
-
-    return result;
-  }
-};
-
-#endif // defined(CUTLASS_ARCH_WMMA_ENABLED)
-
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/builders/sm90_common.inl b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/builders/sm90_common.inl
deleted file mode 100755
index 8d95967f9..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/builders/sm90_common.inl
+++ /dev/null
@@ -1,419 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/arch/mma.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/dispatch_policy.hpp"
-#include "cutlass/detail/layout.hpp"
-#include "cutlass/detail/collective.hpp"
-#include "cutlass/detail/dependent_false.hpp"
-
-#include "cute/atom/mma_traits_sm90_gmma.hpp"
-#include "cute/atom/mma_traits_sm90_gmma_sparse.hpp"
-#include "cute/atom/copy_traits_sm90_tma.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm::collective {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace detail {
-
-//
-// Some named constants
-//
-constexpr int tma_alignment_bytes = 16;
-constexpr int cp_async_min_alignment_bytes = 4;
-constexpr int sm90_smem_capacity_bytes = 232448;
-
-// Maps 2.x A matrix layout tag to respective GMMA major mode enum
-template <class ElementA, class LayoutA>
-constexpr cute::GMMA::Major
-gmma_ss_tag_to_major_A() {
-  // MN major mode is only valid for non-TF32, non-int and non-fp8 MMAs
-  if constexpr (cutlass::gemm::detail::is_mn_major_A<LayoutA>() &&
-                not cute::is_same_v<ElementA, tfloat32_t> &&
-                sizeof(ElementA) != 1) {
-    return cute::GMMA::Major::MN;
-  }
-  else {
-    return cute::GMMA::Major::K;
-  }
-}
-
-// Maps 2.x B matrix layout tag to respective GMMA major mode enum
-template <class ElementB, class LayoutB>
-constexpr cute::GMMA::Major
-gmma_ss_tag_to_major_B() {
-  // MN major mode is only valid for non-TF32, non-int and non-fp8 MMAs
-  if constexpr (cutlass::gemm::detail::is_mn_major_B<LayoutB>() &&
-                not cute::is_same_v<ElementB, tfloat32_t> &&
-                sizeof(ElementB) != 1) {
-    return cute::GMMA::Major::MN;
-  }
-  else {
-    return cute::GMMA::Major::K;
-  }
-}
-
-template <class LayoutA>
-constexpr cute::GMMA::Major
-gmma_rs_tag_to_major_A() {
-  // MN major mode is only valid for non-TF32 and non-int MMAs
-  if constexpr (cutlass::gemm::detail::is_mn_major_A<LayoutA>()) {
-    return cute::GMMA::Major::MN;
-  }
-  else {
-    return cute::GMMA::Major::K;
-  }
-}
-
-template <class LayoutB>
-constexpr cute::GMMA::Major
-gmma_rs_tag_to_major_B() {
-  // MN major mode is only valid for non-TF32 and non-int MMAs
-  if constexpr (cutlass::gemm::detail::is_mn_major_B<LayoutB>()) {
-    return cute::GMMA::Major::MN;
-  }
-  else {
-    return cute::GMMA::Major::K;
-  }
-}
-// Maps a rank-1 cute::Shape<> representing the cluster shape on to the TMA atom that should be used with it
-template <class UnimodalClusterShape>
-constexpr auto
-sm90_cluster_shape_to_tma_atom(UnimodalClusterShape) {
-  static_assert(cute::rank(UnimodalClusterShape{}) == 1,
-    "Use this function to figure out TMA for each mode individually.");
-
-  if constexpr (cute::size(UnimodalClusterShape{}) == 1) {
-    return cute::SM90_TMA_LOAD{};
-  }
-  else {
-    return cute::SM90_TMA_LOAD_MULTICAST{};
-  }
-}
-
-// Generates the most efficient possible TiledCopy with simt copy atom(e.g. cp.async) given a set of parameters.
-template<class CopyAtom, int ThreadCount, int Alignment, class StrideType, class TileMN, class TileK>
-constexpr auto
-make_simt_gmem_tiled_copy() {
-  using namespace cute;
-
-  constexpr int TileSizeMN  = cute::size(TileMN{});
-  constexpr int TileSizeK   = cute::size(TileK{});
-
-  // Maximize the number of threads along the gmem major mode to promote coalesced reads
-  // While making sure our thread layout tiles the threadblock tile evenly
-
-  if constexpr (cutlass::gemm::detail::is_k_major<StrideType>()) {
-    // K major thread layout for K major gmem
-    constexpr int threads_major = (ThreadCount >= TileSizeK / Alignment) ? (TileSizeK  / Alignment) : ThreadCount;
-    constexpr int threads_minor = ThreadCount / threads_major;
-    static_assert(threads_major > 0);
-    static_assert(ThreadCount % threads_major == 0);
-    static_assert(threads_minor == 0 || (TileSizeMN % threads_minor == 0));
-    return make_tiled_copy(
-      CopyAtom{},
-      Layout<Shape <Int<threads_minor>,Int<threads_major>>,
-             Stride<Int<threads_major>,                _1>>{},
-      Layout<Shape<_1,Int<Alignment>>>{});
-  }
-  else if constexpr (cutlass::gemm::detail::is_mn_major<StrideType>()) {
-    // MN major thread layout for MN major gmem
-    constexpr int threads_major = (ThreadCount >= TileSizeMN / Alignment) ? (TileSizeMN  / Alignment) : ThreadCount;
-    constexpr int threads_minor = ThreadCount / threads_major;
-    static_assert(threads_major > 0);
-    static_assert(ThreadCount % threads_major == 0);
-    static_assert(threads_minor == 0 || (TileSizeK % threads_minor == 0));
-    return make_tiled_copy(
-      CopyAtom{},
-      Layout<Shape <Int<threads_major>,Int<threads_minor>>,
-             Stride<                _1,Int<threads_major>>>{},
-      Layout<Shape<Int<Alignment>,_1>>{});
-  } else {
-    static_assert(cute::is_void_v<CopyAtom>, "Unsupported gmem layout for automatic gmem tiled copy builder.");
-  }
-}
-
-// Helper for SS GMMA smem selection that considers a tensor TileShape:
-//   (BLK_MN, BLK_K)
-//   or hierarchically
-//   ((BLK_MN0,BLK_MN1,...),(BLK_K0,BLK_K1,...))
-//   and returns the optimal GMMA::Layout that fits BLK_MN0 and BLK_K0
-template <cute::GMMA::Major major, class ElementType, class BLK_MN, class BLK_K, const bool is_ws_transposed_B = false>
-constexpr auto
-rs_smem_selector() {
-  using namespace cute;
-
-  auto BLK_MN0 = size<0>(BLK_MN{});
-  auto BLK_K0  = size<0>(BLK_K{});
-
-  static_assert(BLK_MN0 % 8 == 0, "BLK_MN0 must be a multiple of 8.");
-  static_assert(BLK_K0 % 8 == 0,  "BLK_K0 must be a multiple of 8.");
-  if constexpr (major == GMMA::Major::MN) {
-    if constexpr (sizeof(ElementType) == 4){
-      if constexpr (is_ws_transposed_B) {
-        // only optimized transpositionB(SW32 and SW128 for tf32) can be used, but prefer SW32 due to free bank conflict
-        if constexpr (BLK_MN0 % size<0>(GMMA::Layout_MN_SW32_Atom<ElementType>{}) == 0) {
-          return GMMA::Layout_MN_SW32_Atom<ElementType>{};
-        }
-        else {
-          static_assert(BLK_MN0 % size<0>(GMMA::Layout_MN_SW32_Atom<ElementType>{}) == 0,
-                       "BLK_MN0 must be a multiple of size<0>(GMMA::Layout_MN_SW32_Atom<ElementType>{})");
-        }
-      }
-      else {
-        // Fall into SW32 due to free bank conflict
-        if constexpr (BLK_MN0 % size<0>(GMMA::Layout_MN_SW32_Atom<ElementType>{}) == 0) {
-          return GMMA::Layout_MN_SW32_Atom<ElementType>{};
-        }
-        else if constexpr (BLK_MN0 % size<0>(GMMA::Layout_MN_INTER_Atom<ElementType>{}) == 0) {
-          return GMMA::Layout_MN_INTER_Atom<ElementType>{};
-        }
-        else {
-          static_assert(BLK_MN0 % size<0>(GMMA::Layout_MN_INTER_Atom<ElementType>{}) == 0,
-                       "BLK_MN0 must be a multiple of size<0>(GMMA::Layout_MN_INTER_Atom<ElementType>{})");
-        }
-      }
-    }
-    // Used for int8, fp8, fp16 and bf16 I/O kernels
-    else if constexpr (sizeof(ElementType) == 1 || sizeof(ElementType) == 2) {
-      if constexpr (sizeof(ElementType) == 1 && is_ws_transposed_B) {
-        // Only optimized transpositionB (SW32 for int8 and fp8) can be used
-        if constexpr (BLK_MN0 % size<0>(GMMA::Layout_MN_SW128_Atom<ElementType>{}) == 0) {
-          return GMMA::Layout_MN_SW128_Atom<ElementType>{};
-        }
-        else {
-          static_assert(BLK_MN0 % size<0>(GMMA::Layout_MN_SW128_Atom<ElementType>{}) == 0,
-                       "BLK_MN0 must be a multiple of size<0>(GMMA::Layout_MN_128_Atom<ElementType>{})");
-        }
-      }
-      else {
-        if constexpr (BLK_MN0 % size<0>(GMMA::Layout_MN_SW128_Atom<ElementType>{}) == 0) {
-          return GMMA::Layout_MN_SW128_Atom<ElementType>{};
-        }
-        else if constexpr (BLK_MN0 % size<0>(GMMA::Layout_MN_SW64_Atom<ElementType>{}) == 0) {
-          return GMMA::Layout_MN_SW64_Atom<ElementType>{};
-        }
-        else if constexpr (BLK_MN0 % size<0>(GMMA::Layout_MN_SW32_Atom<ElementType>{}) == 0) {
-          return GMMA::Layout_MN_SW32_Atom<ElementType>{};
-        }
-        else if constexpr (BLK_MN0 % size<0>(GMMA::Layout_MN_INTER_Atom<ElementType>{}) == 0) {
-          return GMMA::Layout_MN_INTER_Atom<ElementType>{};
-        }
-        else {
-          static_assert(BLK_MN0 % size<0>(GMMA::Layout_MN_INTER_Atom<ElementType>{}) == 0,
-                       "BLK_MN0 must be a multiple of size<0>(GMMA::Layout_MN_INTER_Atom<ElementType>{})");
-        }
-      }
-    }
-    else {
-      static_assert(cutlass::detail::dependent_false<ElementType>, "Smem selector does not support this element type");
-    }
-  }
-  else if constexpr (major == GMMA::Major::K) {
-    if constexpr (BLK_K0 % size<1>(GMMA::Layout_K_SW128_Atom<ElementType>{}) == 0) {
-      return GMMA::Layout_K_SW128_Atom<ElementType>{};
-    }
-    else if constexpr (BLK_K0 % size<1>(GMMA::Layout_K_SW64_Atom<ElementType>{}) == 0) {
-      return GMMA::Layout_K_SW64_Atom<ElementType>{};
-    }
-    else if constexpr (BLK_K0 % size<1>(GMMA::Layout_K_SW32_Atom<ElementType>{}) == 0) {
-      return GMMA::Layout_K_SW32_Atom<ElementType>{};
-    }
-    else if constexpr (BLK_K0 % size<1>(GMMA::Layout_K_INTER_Atom<ElementType>{}) == 0) {
-      return GMMA::Layout_K_INTER_Atom<ElementType>{};
-    }
-    else {
-      static_assert(BLK_K0 % size<1>(GMMA::Layout_K_INTER_Atom<ElementType>{}) == 0,
-                    "BLK_K0 must be a multiple of size<1>(GMMA::Layout_K_INTER_Atom<ElementType>{})");
-    }
-  }
-}
-
-// Helper for SS GMMA smem selection that considers a tensor TileShape:
-//   (BLK_MN, BLK_K)
-//   or hierarchically
-//   ((BLK_MN0,BLK_MN1,...),(BLK_K0,BLK_K1,...))
-//   and returns the largest GMMA::Layout that fits BLK_MN0 and BLK_K0
-template <cute::GMMA::Major major, class ElementType, class BLK_MN, class BLK_K>
-CUTE_HOST_DEVICE constexpr
-auto
-ss_smem_selector()
-{
-  using namespace cute;
-
-  auto BLK_MN0 = size<0>(BLK_MN{});
-  auto BLK_K0  = size<0>(BLK_K{});
-
-  static_assert(BLK_MN0 % 8 == 0, "BLK_MN0 must be a multiple of 8.");
-  static_assert(BLK_K0 % 8 == 0,  "BLK_K0 must be a multiple of 8.");
-
-  if constexpr (major == GMMA::Major::MN) {
-    if constexpr (BLK_MN0 % size<0>(GMMA::Layout_MN_SW128_Atom<ElementType>{}) == 0) {
-      return GMMA::Layout_MN_SW128_Atom<ElementType>{};
-    }
-    else if constexpr (BLK_MN0 % size<0>(GMMA::Layout_MN_SW64_Atom<ElementType>{}) == 0) {
-      return GMMA::Layout_MN_SW64_Atom<ElementType>{};
-    }
-    else if constexpr (BLK_MN0 % size<0>(GMMA::Layout_MN_SW32_Atom<ElementType>{}) == 0) {
-      return GMMA::Layout_MN_SW32_Atom<ElementType>{};
-    }
-    else if constexpr (BLK_MN0 % size<0>(GMMA::Layout_MN_INTER_Atom<ElementType>{}) == 0) {
-      return GMMA::Layout_MN_INTER_Atom<ElementType>{};
-    }
-    else {
-      static_assert(BLK_MN0 % size<0>(GMMA::Layout_MN_INTER_Atom<ElementType>{}) == 0,
-                    "BLK_MN0 must be a multiple of size<0>(GMMA::Layout_MN_INTER_Atom<ElementType>{})");
-    }
-  }
-  else if constexpr (major == GMMA::Major::K) {
-    if constexpr (BLK_K0 % size<1>(GMMA::Layout_K_SW128_Atom<ElementType>{}) == 0) {
-      return GMMA::Layout_K_SW128_Atom<ElementType>{};
-    }
-    else if constexpr (BLK_K0 % size<1>(GMMA::Layout_K_SW64_Atom<ElementType>{}) == 0) {
-      return GMMA::Layout_K_SW64_Atom<ElementType>{};
-    }
-    else if constexpr (BLK_K0 % size<1>(GMMA::Layout_K_SW32_Atom<ElementType>{}) == 0) {
-      return GMMA::Layout_K_SW32_Atom<ElementType>{};
-    }
-    else if constexpr (BLK_K0 % size<1>(GMMA::Layout_K_INTER_Atom<ElementType>{}) == 0) {
-      return GMMA::Layout_K_INTER_Atom<ElementType>{};
-    }
-    else {
-      static_assert(BLK_K0 % size<1>(GMMA::Layout_K_INTER_Atom<ElementType>{}) == 0,
-                    "BLK_K0 must be a multiple of size<1>(GMMA::Layout_K_INTER_Atom<ElementType>{})");
-    }
-  }
-}
-
-// Helper for SS GMMA smem selection that considers a tensor TileShape:
-//   (BLK_MN, BLK_K)
-//   or hierarchically
-//   ((BLK_MN0,BLK_MN1,...),(BLK_K0,BLK_K1,...))
-//   and returns the largest GMMA::Layout that fits BLK_MN0 and BLK_K0
-template <cute::GMMA::Major major, class ElementType, class BLK_MN, class BLK_K, class Sparsity>
-CUTE_HOST_DEVICE constexpr
-auto
-ss_smem_selector_sparse()
-{
-  using namespace cute;
-
-  auto BLK_MN0 = size<0>(BLK_MN{});
-  auto BLK_K0  = size<0>(BLK_K{});
-
-  static_assert(BLK_MN0 % 8 == 0, "BLK_MN0 must be a multiple of 8.");
-  static_assert(BLK_K0 % 8 == 0,  "BLK_K0 must be a multiple of 8.");
-
-  if constexpr (major == GMMA::Major::MN) {
-    if constexpr (BLK_MN0 % size<0>(GMMA::Layout_MN_SW128_SpAtom<ElementType, Sparsity{}>{}) == 0) {
-      return GMMA::Layout_MN_SW128_SpAtom<ElementType, Sparsity{}>{};
-    }
-    else if constexpr (BLK_MN0 % size<0>(GMMA::Layout_MN_SW64_SpAtom<ElementType, Sparsity{}>{}) == 0) {
-      return GMMA::Layout_MN_SW64_SpAtom<ElementType, Sparsity{}>{};
-    }
-    else if constexpr (BLK_MN0 % size<0>(GMMA::Layout_MN_SW32_SpAtom<ElementType, Sparsity{}>{}) == 0) {
-      return GMMA::Layout_MN_SW32_SpAtom<ElementType, Sparsity{}>{};
-    }
-    else if constexpr (BLK_MN0 % size<0>(GMMA::Layout_MN_INTER_SpAtom<ElementType, Sparsity{}>{}) == 0) {
-      return GMMA::Layout_MN_INTER_SpAtom<ElementType, Sparsity{}>{};
-    }
-    else {
-      static_assert(BLK_MN0 % size<0>(GMMA::Layout_MN_INTER_SpAtom<ElementType, Sparsity{}>{}) == 0,
-                    "BLK_MN0 must be a multiple of size<0>(GMMA::Layout_MN_INTER_Atom<ElementType>{})");
-    }
-  }
-  else if constexpr (major == GMMA::Major::K) {
-    if constexpr (BLK_K0 % size<1>(GMMA::Layout_K_SW128_SpAtom<ElementType, Sparsity{}>{}) == 0) {
-      return GMMA::Layout_K_SW128_SpAtom<ElementType, Sparsity{}>{};
-    }
-    else if constexpr (BLK_K0 % size<1>(GMMA::Layout_K_SW64_SpAtom<ElementType, Sparsity{}>{}) == 0) {
-      return GMMA::Layout_K_SW64_SpAtom<ElementType, Sparsity{}>{};
-    }
-    else if constexpr (BLK_K0 % size<1>(GMMA::Layout_K_SW32_SpAtom<ElementType, Sparsity{}>{}) == 0) {
-      return GMMA::Layout_K_SW32_SpAtom<ElementType, Sparsity{}>{};
-    }
-    else if constexpr (BLK_K0 % size<1>(GMMA::Layout_K_INTER_SpAtom<ElementType, Sparsity{}>{}) == 0) {
-      return GMMA::Layout_K_INTER_SpAtom<ElementType, Sparsity{}>{};
-    }
-    else {
-      static_assert(BLK_K0 % size<1>(GMMA::Layout_K_INTER_SpAtom<ElementType, Sparsity{}>{}) == 0,
-                    "BLK_K0 must be a multiple of size<1>(GMMA::Layout_K_INTER_Atom<ElementType>{})");
-    }
-  }
-}
-
-template <class ElementA, class ElementB>
-constexpr bool
-is_input_size_two_bytes() {
-  return (sizeof(ElementA) == 2 && sizeof(ElementB) == 2);
-}
-
-template <class ElementA, class ElementB>
-constexpr bool
-is_input_fp8() {
-  return ((cute::is_same_v<ElementA, float_e4m3_t> || cute::is_same_v<ElementA, float_e5m2_t>) &&
-          (cute::is_same_v<ElementB, float_e4m3_t> || cute::is_same_v<ElementB, float_e5m2_t>));
-}
-
-// We need to handle the tuples in this function since it is used in SFINAE dispatch in the CollectiveBuilder.
-// At that point, it is not guaranteed that the tuples have been split out into the required parts.
-template <class MaybeTupleElementA, class LayoutA, class MaybeTupleElementB, class LayoutB>
-constexpr bool
-is_use_rmem_A() {
-
-  using ElementA = detail::deduce_mixed_width_dtype_t<0, MaybeTupleElementA>;
-  using ElementB = detail::deduce_mixed_width_dtype_t<0, MaybeTupleElementB>;
-
-  constexpr bool IsABDifferentWidth = cute::sizeof_bits_v<ElementA> != cute::sizeof_bits_v<ElementB>;
-  constexpr bool HasScales = cute::is_tuple<MaybeTupleElementA>::value ^ cute::is_tuple<MaybeTupleElementB>::value;
-  constexpr bool IsInputSizeTwoBytes = is_input_size_two_bytes<ElementA, ElementB>();
-  constexpr bool IsLayoutAkBk = cutlass::gemm::detail::is_k_major_A<LayoutA>() &&
-                                cutlass::gemm::detail::is_k_major_B<LayoutB>();
-  constexpr bool IsUseRmemA = (!IsInputSizeTwoBytes && !IsLayoutAkBk) || IsABDifferentWidth || HasScales;
-  return IsUseRmemA;
-}
-
-template <class ElementA, int AlignmentA, class ElementB, int AlignmentB, int RequiredAlignment>
-constexpr bool
-is_aligned() {
-  return ((sizeof(ElementA) * AlignmentA) % RequiredAlignment == 0) &&
-         ((sizeof(ElementB) * AlignmentB) % RequiredAlignment == 0);
-}
-
-} // namespace detail
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::gemm::collective
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/builders/sm90_gmma_builder.inl b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/builders/sm90_gmma_builder.inl
deleted file mode 100755
index 8657aad2b..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/builders/sm90_gmma_builder.inl
+++ /dev/null
@@ -1,1048 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/gemm/collective/builders/sm90_common.inl"
-#include "cutlass/gemm/dispatch_policy.hpp"
-#include "cutlass/pipeline/sm90_pipeline.hpp"
-#include "cutlass/gemm/collective/collective_mma_decl.hpp"
-#include "cutlass/gemm/collective/collective_builder_decl.hpp"
-
-// SM90 Collective Builders should be used only starting CUDA 12.0
-#if (__CUDACC_VER_MAJOR__ >= 12)
-#define CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
-#endif
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm::collective {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace detail {
-
-// Returns the maximum number of smem tiles that can be used with a given smem capacity, or overrides with manual count.
-template<int CapacityBytes, class ElementA, class ElementB, class TileShapeMNK, int stages>
-constexpr int
-compute_stage_count_or_override(StageCount<stages> stage_count) {
-  return stages;
-}
-
-// Returns the maximum number of smem tiles that can be used with a given smem capacity, or overrides with manual count.
-template<int CapacityBytes, class ElementA, class ElementB, class TileShapeMNK, int stages>
-constexpr int
-compute_stage_count_or_override(cute::Int<stages> stage_count) {
-  return stages;
-}
-
-// Returns the maximum number of smem tiles that can be used with a given smem capacity, or overrides with manual count.
-template<int CapacityBytes, class ElementA, class ElementB, class TileShapeMNK, int carveout_bytes>
-constexpr int
-compute_stage_count_or_override(StageCountAutoCarveout<carveout_bytes> stage_count) {
-  constexpr auto mainloop_pipeline_bytes = sizeof(typename cutlass::PipelineTmaAsync<1>::SharedStorage);
-  constexpr auto a_bits = cute::sizeof_bits_v<ElementA>;
-  constexpr auto b_bits = cute::sizeof_bits_v<ElementB>;
-  constexpr int stage_bytes =
-    cutlass::bits_to_bytes(a_bits * size<0>(TileShapeMNK{}) * size<2>(TileShapeMNK{})) +
-    cutlass::bits_to_bytes(b_bits * size<1>(TileShapeMNK{}) * size<2>(TileShapeMNK{})) +
-    static_cast<int>(mainloop_pipeline_bytes);
-
-  return (CapacityBytes - carveout_bytes) / stage_bytes;
-}
-
-// Returns the maximum number of smem tiles that can be used with a given smem capacity (with an optional scale matrix), or overrides with manual count.
-template<int CapacityBytes, class ElementA, class ElementB, class ElementScale, class ElementZero, class TileShapeMNK, int stages>
-constexpr int
-compute_stage_count_or_override_single_affine_transformed_input(StageCount<stages> stage_count) {
-  return stages;
-}
-
-template <class Element>
-constexpr int get_bits_for_possibly_void_element() {
-  if constexpr (cute::is_same_v<Element, void>) {
-    return 0;
-  }
-  else {
-    return sizeof_bits<Element>::value;
-  }
-}
-
-// Returns the maximum number of smem tiles that can be used with a given smem capacity (with an optional scale matrix), or overrides with manual count.
-template<int CapacityBytes, class ElementA, class ElementB, class ElementScale, class ElementZero, class TileShapeMNK, int carveout_bytes>
-constexpr int
-compute_stage_count_or_override_single_affine_transformed_input(StageCountAutoCarveout<carveout_bytes> stage_count) {
-
-  // 32 bytes to account for barriers etc.
-  constexpr auto mainloop_pipeline_bytes = sizeof(typename cutlass::PipelineTmaAsync<1>::SharedStorage);
-  constexpr int scale_zero_k_tile = 1;
-  constexpr auto a_bits = cute::sizeof_bits_v<ElementA>;
-  constexpr auto b_bits = cute::sizeof_bits_v<ElementB>;
-  constexpr auto s_bits = get_bits_for_possibly_void_element<ElementScale>();
-  constexpr auto z_bits = get_bits_for_possibly_void_element<ElementZero>();
-
-  constexpr auto scale_bytes = cutlass::bits_to_bytes(s_bits * size<0>(TileShapeMNK{}) * scale_zero_k_tile);
-  constexpr auto zero_bytes  = cutlass::bits_to_bytes(z_bits * size<0>(TileShapeMNK{}) * scale_zero_k_tile);
-  static_assert(scale_bytes % 128 == 0, "Scale bytes must be a multiple of 128");
-  static_assert(zero_bytes  % 128 == 0, "Zero bytes must be a multiple of 128");
-
-  // When scales are void, s_bits will be 0 so no smem will be allocated for scales.
-  constexpr int stage_bytes =
-    cutlass::bits_to_bytes(a_bits * size<0>(TileShapeMNK{}) * size<2>(TileShapeMNK{})) +
-    cutlass::bits_to_bytes(b_bits * size<1>(TileShapeMNK{}) * size<2>(TileShapeMNK{})) +
-    static_cast<int>(scale_bytes + zero_bytes + mainloop_pipeline_bytes);
-
-  return (CapacityBytes - carveout_bytes) / stage_bytes;
-}
-
-template <class ElementA, class LayoutA, class ElementB, class LayoutB>
-constexpr bool
-is_swapAB(){
-  constexpr bool IsInputSizeTwoBytes = is_input_size_two_bytes<ElementA, ElementB>();
-  constexpr bool IsLayoutAkBmn = cutlass::gemm::detail::is_k_major_A<LayoutA>() &&
-                                 cutlass::gemm::detail::is_mn_major_B<LayoutB>();
-  constexpr bool SwapAB = !IsInputSizeTwoBytes && IsLayoutAkBmn;
-  return SwapAB;
-}
-
-template <class ElementA, class LayoutA, class ElementB, class LayoutB, class KernelScheduleType>
-constexpr bool
-is_warpspecialized_transpose_B(){
-  constexpr bool IsInputSizeTwoBytes = is_input_size_two_bytes<ElementA, ElementB>();
-  constexpr bool IsLayoutAmnBmn = cutlass::gemm::detail::is_mn_major_A<LayoutA>() &&
-                                  cutlass::gemm::detail::is_mn_major_B<LayoutB>();
-  constexpr bool IsWarpSpecialized = cute::is_base_of_v<KernelTmaWarpSpecialized, KernelScheduleType>                ||
-                                     cute::is_base_of_v<KernelTmaWarpSpecializedPingpong, KernelScheduleType>        ||
-                                     cute::is_base_of_v<KernelTmaWarpSpecializedCooperative, KernelScheduleType>     ||
-                                     cute::is_base_of_v<KernelCpAsyncWarpSpecialized, KernelScheduleType>            ||
-                                     cute::is_base_of_v<KernelCpAsyncWarpSpecializedPingpong, KernelScheduleType>    ||
-                                     cute::is_base_of_v<KernelCpAsyncWarpSpecializedCooperative, KernelScheduleType>;
-  constexpr bool IsWarpSpecializedTransposeB = !IsInputSizeTwoBytes && IsLayoutAmnBmn && IsWarpSpecialized;
-  return IsWarpSpecializedTransposeB;
-}
-
-} // namespace detail
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA_TMA_WS_SS
-template <
-  class ElementA,
-  class GmemLayoutATag,
-  int AlignmentA,
-  class ElementB,
-  class GmemLayoutBTag,
-  int AlignmentB,
-  class ElementAccumulator,
-  class TileShape_MNK,
-  class ClusterShape_MNK,
-  class StageCountType,
-  class KernelScheduleType
->
-struct CollectiveBuilder<
-    arch::Sm90,
-    arch::OpClassTensorOp,
-    ElementA,
-    GmemLayoutATag,
-    AlignmentA,
-    ElementB,
-    GmemLayoutBTag,
-    AlignmentB,
-    ElementAccumulator,
-    TileShape_MNK,
-    ClusterShape_MNK,
-    StageCountType,
-    KernelScheduleType,
-    cute::enable_if_t<
-      (cute::is_any_of_v<KernelScheduleType,
-                         KernelTmaWarpSpecialized,
-                         KernelTmaWarpSpecializedCooperative,
-                         KernelTmaWarpSpecializedPingpong,
-                         KernelPtrArrayTmaWarpSpecializedCooperative,
-                         KernelPtrArrayTmaWarpSpecializedPingpong>) &&
-       not detail::is_use_rmem_A<ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag>()>
-> {
-  static_assert(is_static<TileShape_MNK>::value);
-  static_assert(is_static<ClusterShape_MNK>::value);
-#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
-  static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n");
-#endif
-  static_assert(detail::is_aligned<ElementA, AlignmentA, ElementB, AlignmentB, detail::tma_alignment_bytes>(),
-                "Should meet TMA alignment requirement\n");
-
-  static constexpr bool IsArrayOfPointersGemm = (cute::is_any_of_v<KernelScheduleType,
-                                                                   KernelPtrArrayTmaWarpSpecializedCooperative,
-                                                                   KernelPtrArrayTmaWarpSpecializedPingpong>);
-  static constexpr bool IsFP8Input = detail::is_input_fp8<ElementA, ElementB>();
-  static_assert(!IsFP8Input || (IsFP8Input && !IsArrayOfPointersGemm),
-                "KernelPtrArrayTmaWarpSpecialized[Cooperative|Pingpong] is only compatible with FP8 FastAccum version right now.");
-
-  // For fp32 types, map to tf32 MMA value type
-  using ElementAMma = cute::conditional_t<cute::is_same_v<ElementA, float>, tfloat32_t, ElementA>;
-  using ElementBMma = cute::conditional_t<cute::is_same_v<ElementB, float>, tfloat32_t, ElementB>;
-
-  static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_ss_tag_to_major_A<ElementAMma, GmemLayoutATag>();
-  static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_ss_tag_to_major_B<ElementBMma, GmemLayoutBTag>();
-
-  static constexpr bool IsCooperative = cute::is_any_of_v<KernelScheduleType,
-                                                          KernelTmaWarpSpecializedCooperative,
-                                                          KernelPtrArrayTmaWarpSpecializedCooperative>;
-  using AtomLayoutMNK = cute::conditional_t<IsCooperative,
-      Layout<Shape<_2,_1,_1>>, Layout<Shape<_1,_1,_1>>>;
-
-  using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector<
-      ElementAMma, ElementBMma, ElementAccumulator, TileShape_MNK, GmmaMajorA, GmmaMajorB>(), AtomLayoutMNK{}));
-
-  using GmemTiledCopyA = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{})));
-  using GmemTiledCopyB = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{})));
-
-  using SmemLayoutAtomA = decltype(detail::ss_smem_selector<
-      GmmaMajorA, ElementAMma, decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
-  using SmemLayoutAtomB = decltype(detail::ss_smem_selector<
-      GmmaMajorB, ElementBMma, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
-
-  static constexpr size_t TensorMapStorage = IsArrayOfPointersGemm ? sizeof(cute::TmaDescriptor) * 2 /* for A and B */ : 0;
-  static constexpr int KernelSmemCarveout = static_cast<int>(TensorMapStorage);
-
-  static constexpr int PipelineStages = detail::compute_stage_count_or_override<detail::sm90_smem_capacity_bytes - KernelSmemCarveout,
-      ElementAMma, ElementBMma, TileShape_MNK>(StageCountType{});
-  using DispatchPolicy = cute::conditional_t<IsArrayOfPointersGemm,
-      MainloopSm90ArrayTmaGmmaWarpSpecialized<PipelineStages, ClusterShape_MNK, KernelScheduleType>,
-      /* For FP8 use a separate mainloop compared to other datatypes */
-      cute::conditional_t<IsFP8Input,
-          MainloopSm90TmaGmmaWarpSpecializedFP8<PipelineStages, ClusterShape_MNK, KernelScheduleType>,
-          MainloopSm90TmaGmmaWarpSpecialized<PipelineStages, ClusterShape_MNK, KernelScheduleType>>>;
-
-  using SmemCopyAtomA = void;
-  using SmemCopyAtomB = void;
-
-  using CollectiveOp = CollectiveMma<
-      DispatchPolicy,
-      TileShape_MNK,
-      ElementA,
-      TagToStrideA_t<GmemLayoutATag>,
-      ElementB,
-      TagToStrideB_t<GmemLayoutBTag>,
-      TiledMma,
-      GmemTiledCopyA,
-      SmemLayoutAtomA,
-      SmemCopyAtomA,
-      cute::identity,
-      GmemTiledCopyB,
-      SmemLayoutAtomB,
-      SmemCopyAtomB,
-      cute::identity
-    >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA_TMA_WS_RS
-template <
-  class ElementA,
-  class GmemLayoutATag,
-  int AlignmentA,
-  class ElementB,
-  class GmemLayoutBTag,
-  int AlignmentB,
-  class ElementAccumulator,
-  class TileShape_MNK,
-  class ClusterShape_MNK,
-  class StageCountType,
-  class KernelScheduleType
->
-struct CollectiveBuilder<
-    arch::Sm90,
-    arch::OpClassTensorOp,
-    ElementA,
-    GmemLayoutATag,
-    AlignmentA,
-    ElementB,
-    GmemLayoutBTag,
-    AlignmentB,
-    ElementAccumulator,
-    TileShape_MNK,
-    ClusterShape_MNK,
-    StageCountType,
-    KernelScheduleType,
-    cute::enable_if_t<
-      (cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecialized> ||
-       cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedPingpong> ||
-       cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedCooperative>) &&
-      detail::is_use_rmem_A<ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag>()>
-> {
-  static_assert(is_static<TileShape_MNK>::value);
-  static_assert(is_static<ClusterShape_MNK>::value);
-  static_assert(detail::is_aligned<ElementA, AlignmentA, ElementB, AlignmentB, detail::tma_alignment_bytes>(),
-                "Should meet TMA alignment requirement\n");
-#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
-  static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n");
-#endif
-  static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_rs_tag_to_major_A<GmemLayoutATag>();
-  static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_rs_tag_to_major_B<GmemLayoutBTag>();
-  static constexpr bool SwapAB = detail::is_swapAB<ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag>();
-  static constexpr bool IsWarpSpecializedTransposeB = detail::is_warpspecialized_transpose_B<
-      ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag, KernelScheduleType>();
-
-  // For fp32 types, map to tf32 MMA value type
-  using ElementAMma = cute::conditional_t<cute::is_same_v<ElementA, float>, tfloat32_t, ElementA>;
-  using ElementBMma = cute::conditional_t<cute::is_same_v<ElementB, float>, tfloat32_t, ElementB>;
-
-  using AtomLayoutMNK = cute::conditional_t<cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedCooperative>,
-      Layout<Shape<_2,_1,_1>>, Layout<Shape<_1,_1,_1>>>;
-
-  using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::rs_op_selector<
-      ElementAMma, ElementBMma, ElementAccumulator, TileShape_MNK, GMMA::Major::K, GMMA::Major::K>(), AtomLayoutMNK{}));
-
-  using GmemTiledCopyA = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{})));
-  using GmemTiledCopyB = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{})));
-
-  using SmemLayoutAtomA = decltype(detail::rs_smem_selector<GmmaMajorA, ElementAMma,
-      decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{})), IsWarpSpecializedTransposeB>());
-  using SmemLayoutAtomB = decltype(detail::rs_smem_selector<GmmaMajorB, ElementBMma,
-      decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{})), IsWarpSpecializedTransposeB>());
-
-  static constexpr int PipelineStages = detail::compute_stage_count_or_override<detail::sm90_smem_capacity_bytes,
-      ElementAMma, ElementBMma, TileShape_MNK>(StageCountType{});
-
-  using DispatchPolicy = MainloopSm90TmaGmmaRmemAWarpSpecialized<
-      PipelineStages, ClusterShape_MNK, KernelScheduleType>;
-
-  using SmemCopyAtomA = cute::conditional_t<SwapAB, void, Copy_Atom<cute::AutoVectorizingCopy, ElementA>>;
-  using SmemCopyAtomB = cute::conditional_t<SwapAB, Copy_Atom<cute::AutoVectorizingCopy, ElementB>, void>;
-
-  using CollectiveOp = CollectiveMma<
-      DispatchPolicy,
-      TileShape_MNK,
-      ElementA,
-      TagToStrideA_t<GmemLayoutATag>,
-      ElementB,
-      TagToStrideB_t<GmemLayoutBTag>,
-      TiledMma,
-      GmemTiledCopyA,
-      SmemLayoutAtomA,
-      SmemCopyAtomA,
-      cute::identity,
-      GmemTiledCopyB,
-      SmemLayoutAtomB,
-      SmemCopyAtomB,
-      cute::identity
-    >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA_TMA_WS_RS Mixed Scaled GEMM
-template <
-  class ElementPairA_,
-  class GmemLayoutATag_,
-  int AlignmentA,
-  class ElementPairB_,
-  class GmemLayoutBTag_,
-  int AlignmentB,
-  class ElementAccumulator,
-  class TileShape_MNK,
-  class ClusterShape_MNK,
-  class StageCountType,
-  class KernelScheduleType
->
-struct CollectiveBuilder<
-    arch::Sm90,
-    arch::OpClassTensorOp,
-    ElementPairA_,
-    GmemLayoutATag_,
-    AlignmentA,
-    ElementPairB_,
-    GmemLayoutBTag_,
-    AlignmentB,
-    ElementAccumulator,
-    TileShape_MNK,
-    ClusterShape_MNK,
-    StageCountType,
-    KernelScheduleType,
-    cute::enable_if_t<
-      (cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedMixedInput> ||
-       cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedPingpongMixedInput> ||
-       cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedCooperativeMixedInput>)>
-> {
-
-private:
-  using ScaleA = detail::deduce_mixed_width_dtype_t<1, ElementPairA_>;
-  using ScaleB = detail::deduce_mixed_width_dtype_t<1, ElementPairB_>;
-  using ZeroA = detail::deduce_mixed_width_dtype_t<2, ElementPairA_>;
-  using ZeroB = detail::deduce_mixed_width_dtype_t<2, ElementPairB_>;
-  static constexpr bool NeitherIsTuple = !cute::is_tuple<ElementPairA_>::value && !cute::is_tuple<ElementPairB_>::value;
-
-public:
-  using ElementA = detail::deduce_mixed_width_dtype_t<0, ElementPairA_>;
-  using ElementB = detail::deduce_mixed_width_dtype_t<0, ElementPairB_>;
-  static_assert(cute::is_tuple<ElementPairA_>::value ^ cute::is_tuple<ElementPairB_>::value ||
-               (NeitherIsTuple && (sizeof_bits<ElementA>::value != sizeof_bits<ElementB>::value)),
-    "Either A OR B must be a tuple or the widths of A and B must be different.");
-
-  static constexpr bool IsANarrow = sizeof_bits<ElementA>::value < sizeof_bits<ElementB>::value;
-
-  template<class T>
-  static auto get_stride(T const& t) {
-    if constexpr (not cute::is_layout<T>::value) {
-      return t;
-    }
-    else {
-      return cute::stride(t);
-    }
-  }
-
-  using GmemLayoutATag = decltype(get_stride(GmemLayoutATag_{}));
-  using GmemLayoutBTag = decltype(get_stride(GmemLayoutBTag_{}));
-
-  using ElementPairA = cute::conditional_t<IsANarrow && NeitherIsTuple, cute::tuple<ElementA>, ElementPairA_>;
-  using ElementPairB = cute::conditional_t<!IsANarrow && NeitherIsTuple, cute::tuple<ElementB>, ElementPairB_>;
-
-  static constexpr bool IsATransformed = cute::is_tuple<ElementPairA>::value;
-  using ElementScale = cute::conditional_t<IsATransformed, ScaleA, ScaleB>;
-  using ElementZero = cute::conditional_t<IsATransformed, ZeroA, ZeroB>;
-
-  static_assert(is_static<TileShape_MNK>::value);
-  static_assert(is_static<ClusterShape_MNK>::value);
-  static_assert(detail::is_aligned<ElementA, AlignmentA, ElementB, AlignmentB, detail::tma_alignment_bytes>(),
-                "Should meet TMA alignment requirement\n");
-#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
-  static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n");
-#endif
-  static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_rs_tag_to_major_A<GmemLayoutATag>();
-  static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_rs_tag_to_major_B<GmemLayoutBTag>();
-  static constexpr bool IsWarpSpecializedTransposeB = detail::is_warpspecialized_transpose_B<
-      ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag, KernelScheduleType>();
-  static_assert(!IsWarpSpecializedTransposeB, "Mixed input GEMM does not support WS transpose B.");
-
-  // If A is scaled, then we don't need to swap. Otherwise, we must ensure B goes to RF and we must swap the operands.
-  static constexpr bool SwapAB = !IsATransformed;
-
-  // When we relax the above assertion, we must handle setting the tile mma GmmaMajorB correctly.
-  static constexpr cute::GMMA::Major TiledMmaGmmaMajorB = SwapAB ? GmmaMajorA : GmmaMajorB;
-
-  using ElementMma = cute::conditional_t<IsATransformed, ElementB, ElementA>;
-  using AtomLayoutMNK = cute::conditional_t<cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedCooperativeMixedInput>,
-      Layout<Shape<_2,_1,_1>>, Layout<Shape<_1,_1,_1>>>;
-
-  using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::rs_op_selector<
-      ElementMma, ElementMma, ElementAccumulator, TileShape_MNK, GMMA::Major::K, TiledMmaGmmaMajorB>(), AtomLayoutMNK{}));
-
-  using GmemTiledCopyA = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{})));
-  using GmemTiledCopyB = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{})));
-
-  using SmemLayoutAtomA = decltype(detail::rs_smem_selector<GmmaMajorA, ElementA,
-      decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{})), IsWarpSpecializedTransposeB>());
-  using SmemLayoutAtomB = decltype(detail::rs_smem_selector<GmmaMajorB, ElementB,
-      decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{})), IsWarpSpecializedTransposeB>());
-
-  using RealElementA = cute::conditional_t<SwapAB, ElementB, ElementA>;
-  using RealElementB = cute::conditional_t<SwapAB, ElementA, ElementB>;
-  static constexpr int PipelineStages = detail::compute_stage_count_or_override_single_affine_transformed_input<detail::sm90_smem_capacity_bytes,
-      RealElementA, RealElementB, ElementScale, ElementZero, TileShape_MNK>(StageCountType{});
-
-  using SmemCopyAtomA = cute::conditional_t<SwapAB, void, Copy_Atom<cute::AutoVectorizingCopy, ElementA>>;
-  using SmemCopyAtomB = cute::conditional_t<SwapAB, Copy_Atom<cute::AutoVectorizingCopy, ElementB>, void>;
-
-  using DispatchPolicy = MainloopSm90TmaGmmaRmemAWarpSpecializedMixedInput<PipelineStages, ClusterShape_MNK, KernelScheduleType>;
-
-  // We pack the scale data with the operand that will be optionally scaled and converted before MMA.
-  using StrideA = cute::conditional_t<cute::is_layout<GmemLayoutATag_>::value, GmemLayoutATag_, TagToStrideA_t<GmemLayoutATag>>;
-  using StrideB = cute::conditional_t<cute::is_layout<GmemLayoutBTag_>::value, GmemLayoutBTag_, TagToStrideB_t<GmemLayoutBTag>>;
-
-  using CollectiveOp = CollectiveMma<
-      DispatchPolicy,
-      TileShape_MNK,
-      ElementPairA,
-      StrideA,
-      ElementPairB,
-      StrideB,
-      TiledMma,
-      GmemTiledCopyA,
-      SmemLayoutAtomA,
-      SmemCopyAtomA,
-      cute::identity,
-      GmemTiledCopyB,
-      SmemLayoutAtomB,
-      SmemCopyAtomB,
-      cute::identity
-    >;
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA_TMA_WS_FP8_FAST_ACCUM_SS
-template <
-  class ElementA,
-  class GmemLayoutATag,
-  int AlignmentA,
-  class ElementB,
-  class GmemLayoutBTag,
-  int AlignmentB,
-  class ElementAccumulator,
-  class TileShape_MNK,
-  class ClusterShape_MNK,
-  class StageCountType,
-  class KernelScheduleType
->
-struct CollectiveBuilder<
-    arch::Sm90,
-    arch::OpClassTensorOp,
-    ElementA,
-    GmemLayoutATag,
-    AlignmentA,
-    ElementB,
-    GmemLayoutBTag,
-    AlignmentB,
-    ElementAccumulator,
-    TileShape_MNK,
-    ClusterShape_MNK,
-    StageCountType,
-    KernelScheduleType,
-    cute::enable_if_t<
-      cute::is_any_of_v<KernelScheduleType,
-                        KernelTmaWarpSpecializedFP8FastAccum,
-                        KernelTmaWarpSpecializedPingpongFP8FastAccum,
-                        KernelTmaWarpSpecializedCooperativeFP8FastAccum,
-                        KernelPtrArrayTmaWarpSpecializedCooperativeFP8FastAccum,
-                        KernelPtrArrayTmaWarpSpecializedPingpongFP8FastAccum>>
-> {
-  static_assert(is_static<TileShape_MNK>::value);
-  static_assert(is_static<ClusterShape_MNK>::value);
-  static_assert(detail::is_aligned<ElementA, AlignmentA, ElementB, AlignmentB, detail::tma_alignment_bytes>(),
-                "Not meet TMA alignment requirement yet\n");
-  static_assert(detail::is_input_fp8<ElementA, ElementB>(),
-                "Only FP8 datatypes are compatible with these kernel schedules\n");
-  // Dispatch TN fp8 kernels only to TMA warp specialized FP8 builder
-  static_assert(!detail::is_use_rmem_A<ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag>(),
-                 "Not supported for fp8 non-TN warp specialized kernels yet\n");
-#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
-  static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n");
-#endif
-
-  static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_ss_tag_to_major_A<ElementA, GmemLayoutATag>();
-  static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_ss_tag_to_major_B<ElementB, GmemLayoutBTag>();
-
-  static constexpr bool IsArrayOfPointersGemm = cute::is_any_of_v<KernelScheduleType,
-                                                                   KernelPtrArrayTmaWarpSpecializedCooperativeFP8FastAccum,
-                                                                   KernelPtrArrayTmaWarpSpecializedPingpongFP8FastAccum>;
-
-  static constexpr bool IsCooperative = cute::is_any_of_v<KernelScheduleType,
-                                                          KernelTmaWarpSpecializedCooperativeFP8FastAccum,
-                                                          KernelPtrArrayTmaWarpSpecializedCooperativeFP8FastAccum>;
-
-  using AtomLayoutMNK = cute::conditional_t<IsCooperative, Layout<Shape<_2,_1,_1>>, Layout<Shape<_1,_1,_1>>>;
-
-  using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector<
-      ElementA, ElementB, ElementAccumulator, TileShape_MNK, GmmaMajorA, GmmaMajorB>(), AtomLayoutMNK{}));
-
-  using GmemTiledCopyA = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{})));
-  using GmemTiledCopyB = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{})));
-
-  using SmemLayoutAtomA = decltype(detail::ss_smem_selector<
-      GmmaMajorA, ElementA, decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
-  using SmemLayoutAtomB = decltype(detail::ss_smem_selector<
-      GmmaMajorB, ElementB, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
-
-  static constexpr size_t TensorMapStorage = IsArrayOfPointersGemm ? sizeof(cute::TmaDescriptor) * 2 /* for A and B */ : 0;
-  static constexpr int KernelSmemCarveout = static_cast<int>(TensorMapStorage);
-  static constexpr int Sm90ReducedSmemCapacityBytes = detail::sm90_smem_capacity_bytes - KernelSmemCarveout;
-
-  static constexpr int PipelineStages = detail::compute_stage_count_or_override<Sm90ReducedSmemCapacityBytes,
-      ElementA, ElementB, TileShape_MNK>(StageCountType{});
-  using DispatchPolicy = cute::conditional_t<IsArrayOfPointersGemm,
-      MainloopSm90ArrayTmaGmmaWarpSpecialized<PipelineStages, ClusterShape_MNK, KernelScheduleType>,
-      MainloopSm90TmaGmmaWarpSpecialized<PipelineStages, ClusterShape_MNK, KernelScheduleType>>;
-
-  using SmemCopyAtomA = void;
-  using SmemCopyAtomB = void;
-
-  using CollectiveOp = CollectiveMma<
-      DispatchPolicy,
-      TileShape_MNK,
-      ElementA,
-      TagToStrideA_t<GmemLayoutATag>,
-      ElementB,
-      TagToStrideB_t<GmemLayoutBTag>,
-      TiledMma,
-      GmemTiledCopyA,
-      SmemLayoutAtomA,
-      SmemCopyAtomA,
-      cute::identity,
-      GmemTiledCopyB,
-      SmemLayoutAtomB,
-      SmemCopyAtomB,
-      cute::identity
-    >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA_TMA_SS
-template <
-  class ElementA,
-  class GmemLayoutATag,
-  int AlignmentA,
-  class ElementB,
-  class GmemLayoutBTag,
-  int AlignmentB,
-  class ElementAccumulator,
-  class TileShape_MNK,
-  class ClusterShape_MNK,
-  class StageCountType,
-  class KernelScheduleType
->
-struct CollectiveBuilder<
-    arch::Sm90,
-    arch::OpClassTensorOp,
-    ElementA,
-    GmemLayoutATag,
-    AlignmentA,
-    ElementB,
-    GmemLayoutBTag,
-    AlignmentB,
-    ElementAccumulator,
-    TileShape_MNK,
-    ClusterShape_MNK,
-    StageCountType,
-    KernelScheduleType,
-    cute::enable_if_t<cute::is_same_v<KernelScheduleType, KernelTma> &&
-                     not detail::is_use_rmem_A<ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag>()>
-> {
-  static_assert(is_static<TileShape_MNK>::value);
-  static_assert(is_static<ClusterShape_MNK>::value);
-  static_assert(detail::is_aligned<ElementA, AlignmentA, ElementB, AlignmentB, detail::tma_alignment_bytes>(),
-                "Should meet TMA alignment requirement\n");
-#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
-  static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n");
-#endif
-
-  // For fp32 types, map to tf32 MMA value type
-  using ElementAMma = cute::conditional_t<cute::is_same_v<ElementA, float>, tfloat32_t, ElementA>;
-  using ElementBMma = cute::conditional_t<cute::is_same_v<ElementB, float>, tfloat32_t, ElementB>;
-
-  static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_ss_tag_to_major_A<ElementAMma, GmemLayoutATag>();
-  static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_ss_tag_to_major_B<ElementBMma, GmemLayoutBTag>();
-
-  using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector<
-      ElementAMma, ElementBMma, ElementAccumulator, TileShape_MNK, GmmaMajorA, GmmaMajorB>()));
-
-  using GmemTiledCopyA = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{})));
-  using GmemTiledCopyB = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{})));
-
-  using SmemLayoutAtomA = decltype(detail::ss_smem_selector<
-      GmmaMajorA, ElementAMma, decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
-  using SmemLayoutAtomB = decltype(detail::ss_smem_selector<
-      GmmaMajorB, ElementBMma, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
-
-  static constexpr int PipelineStages = detail::compute_stage_count_or_override<detail::sm90_smem_capacity_bytes,
-      ElementAMma, ElementBMma, TileShape_MNK>(StageCountType{});
-  using DispatchPolicy = MainloopSm90TmaGmma<PipelineStages, ClusterShape_MNK>;
-
-  using SmemCopyAtomA = void;
-  using SmemCopyAtomB = void;
-
-  using CollectiveOp = CollectiveMma<
-      DispatchPolicy,
-      TileShape_MNK,
-      ElementA,
-      TagToStrideA_t<GmemLayoutATag>,
-      ElementB,
-      TagToStrideB_t<GmemLayoutBTag>,
-      TiledMma,
-      GmemTiledCopyA,
-      SmemLayoutAtomA,
-      SmemCopyAtomA,
-      cute::identity,
-      GmemTiledCopyB,
-      SmemLayoutAtomB,
-      SmemCopyAtomB,
-      cute::identity
-    >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA_CpAsync
-template <
-  class ElementA,
-  class GmemLayoutATag,
-  int AlignmentA,
-  class ElementB,
-  class GmemLayoutBTag,
-  int AlignmentB,
-  class ElementAccumulator,
-  class TileShape_MNK,
-  class ClusterShape_MNK,
-  class StageCountType,
-  class KernelScheduleType
->
-struct [[deprecated("Use one of KernelCpAsyncWarpSpecialized schedules instead")]]
-CollectiveBuilder<
-    arch::Sm90,
-    arch::OpClassTensorOp,
-    ElementA,
-    GmemLayoutATag,
-    AlignmentA,
-    ElementB,
-    GmemLayoutBTag,
-    AlignmentB,
-    ElementAccumulator,
-    TileShape_MNK,
-    ClusterShape_MNK,
-    StageCountType,
-    KernelScheduleType,
-    cute::enable_if_t<
-      cute::is_same_v<KernelScheduleType, KernelMultistage>>
-> {
-  // Map to warp-specialized kernels for better performance
-  using CollectiveOp = typename CollectiveBuilder<
-    arch::Sm90,
-    arch::OpClassTensorOp,
-    ElementA,
-    GmemLayoutATag,
-    AlignmentA,
-    ElementB,
-    GmemLayoutBTag,
-    AlignmentB,
-    ElementAccumulator,
-    TileShape_MNK,
-    ClusterShape_MNK,
-    StageCountType,
-    KernelCpAsyncWarpSpecialized
-  >::CollectiveOp;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA_CpAsync_WS_SS
-template <
-  class ElementA,
-  class GmemLayoutATag,
-  int   AlignmentA,
-  class ElementB,
-  class GmemLayoutBTag,
-  int   AlignmentB,
-  class ElementAccumulator,
-  class TileShape_MNK,
-  class ClusterShape_MNK,
-  class StageCountType,
-  class KernelScheduleType
->
-struct CollectiveBuilder<
-    arch::Sm90,
-    arch::OpClassTensorOp,
-    ElementA,
-    GmemLayoutATag,
-    AlignmentA,
-    ElementB,
-    GmemLayoutBTag,
-    AlignmentB,
-    ElementAccumulator,
-    TileShape_MNK,
-    ClusterShape_MNK,
-    StageCountType,
-    KernelScheduleType,
-    cute::enable_if_t<
-      (cute::is_same_v<KernelScheduleType, KernelCpAsyncWarpSpecialized> ||
-       cute::is_same_v<KernelScheduleType, KernelCpAsyncWarpSpecializedCooperative> ||
-       cute::is_same_v<KernelScheduleType, KernelCpAsyncWarpSpecializedPingpong>) &&
-      not detail::is_use_rmem_A<ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag>()
-    >
-> {
-  static_assert(is_static<TileShape_MNK>::value);
-  static_assert(is_static<ClusterShape_MNK>::value);
-#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
-  static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n");
-#endif
-
-  // For fp32 types, map to tf32 MMA value type
-  using ElementAMma = cute::conditional_t<cute::is_same_v<ElementA, float>, tfloat32_t, ElementA>;
-  using ElementBMma = cute::conditional_t<cute::is_same_v<ElementB, float>, tfloat32_t, ElementB>;
-
-  static_assert(detail::is_aligned<ElementA, AlignmentA, ElementB, AlignmentB, detail::cp_async_min_alignment_bytes>(),
-                "Minimum alignment required for cp.async is 4B.");
-
-  static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_ss_tag_to_major_A<ElementA, GmemLayoutATag>();
-  static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_ss_tag_to_major_B<ElementB, GmemLayoutBTag>();
-
-  using AtomLayoutMNK = cute::conditional_t<cute::is_same_v<KernelScheduleType, KernelCpAsyncWarpSpecializedCooperative>,
-      Layout<Shape<cute::Int<(size<0>(TileShape_MNK{}) < 128) ? 1 : 2>,_1,_1>>, Layout<Shape<_1,_1,_1>>>;
-
-  using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector<
-      ElementAMma, ElementBMma, ElementAccumulator, TileShape_MNK, GmmaMajorA, GmmaMajorB>(), AtomLayoutMNK{}));
-
-  static constexpr int NumLoadWarpGroups = cute::is_same_v<KernelScheduleType, KernelCpAsyncWarpSpecialized> ? 2 : 1;
-
-  using AlignmentTypeA = cute::uint_byte_t<static_cast<int>(sizeof(ElementA)) * AlignmentA>;
-  using GmemCopyAtomA = cute::Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<AlignmentTypeA>, ElementA>;
-  using GmemTiledCopyA = decltype(detail::make_simt_gmem_tiled_copy<
-      GmemCopyAtomA, NumThreadsPerWarpGroup * NumLoadWarpGroups, AlignmentA, TagToStrideA_t<GmemLayoutATag>,
-      decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
-
-  using AlignmentTypeB = cute::uint_byte_t<static_cast<int>(sizeof(ElementB)) * AlignmentB>;
-  using GmemCopyAtomB = cute::Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<AlignmentTypeB>, ElementB>;
-  using GmemTiledCopyB = decltype(detail::make_simt_gmem_tiled_copy<
-      GmemCopyAtomB, NumThreadsPerWarpGroup * NumLoadWarpGroups, AlignmentB, TagToStrideB_t<GmemLayoutBTag>,
-      decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
-
-  using SmemLayoutAtomA = decltype(detail::ss_smem_selector<
-      GmmaMajorA, ElementAMma, decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
-  using SmemLayoutAtomB = decltype(detail::ss_smem_selector<
-      GmmaMajorB, ElementBMma, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
-
-  static constexpr int PipelineStages = detail::compute_stage_count_or_override<
-      detail::sm90_smem_capacity_bytes, ElementAMma, ElementBMma, TileShape_MNK>(StageCountType{});
-
-  using DispatchPolicy = MainloopSm90CpAsyncGmmaWarpSpecialized<
-      PipelineStages, ClusterShape_MNK, KernelScheduleType>;
-
-  using CollectiveOp = CollectiveMma<
-      DispatchPolicy,
-      TileShape_MNK,
-      ElementA,
-      TagToStrideA_t<GmemLayoutATag>,
-      ElementB,
-      TagToStrideB_t<GmemLayoutBTag>,
-      TiledMma,
-      GmemTiledCopyA,
-      SmemLayoutAtomA,
-      void,
-      cute::identity,
-      GmemTiledCopyB,
-      SmemLayoutAtomB,
-      void,
-      cute::identity
-    >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA_CpAsync_WS_RS
-template <
-  class ElementA,
-  class GmemLayoutATag,
-  int   AlignmentA,
-  class ElementB,
-  class GmemLayoutBTag,
-  int   AlignmentB,
-  class ElementAccumulator,
-  class TileShape_MNK,
-  class ClusterShape_MNK,
-  class StageCountType,
-  class KernelScheduleType
->
-struct CollectiveBuilder<
-    arch::Sm90,
-    arch::OpClassTensorOp,
-    ElementA,
-    GmemLayoutATag,
-    AlignmentA,
-    ElementB,
-    GmemLayoutBTag,
-    AlignmentB,
-    ElementAccumulator,
-    TileShape_MNK,
-    ClusterShape_MNK,
-    StageCountType,
-    KernelScheduleType,
-    cute::enable_if_t<
-      (cute::is_same_v<KernelScheduleType, KernelCpAsyncWarpSpecialized> ||
-       cute::is_same_v<KernelScheduleType, KernelCpAsyncWarpSpecializedCooperative> ||
-       cute::is_same_v<KernelScheduleType, KernelCpAsyncWarpSpecializedPingpong>) &&
-      detail::is_use_rmem_A<ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag>()
-    >
-> {
-  static_assert(is_static<TileShape_MNK>::value);
-  static_assert(is_static<ClusterShape_MNK>::value);
-#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
-  static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n");
-#endif
-
-  // For fp32 types, map to tf32 MMA value type
-  using ElementAMma = cute::conditional_t<cute::is_same_v<ElementA, float>, tfloat32_t, ElementA>;
-  using ElementBMma = cute::conditional_t<cute::is_same_v<ElementB, float>, tfloat32_t, ElementB>;
-
-  static_assert(detail::is_aligned<ElementA, AlignmentA, ElementB, AlignmentB, detail::cp_async_min_alignment_bytes>(),
-                "Minimum alignment required for cp.async is 4B.");
-
-  static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_rs_tag_to_major_A<GmemLayoutATag>();
-  static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_rs_tag_to_major_B<GmemLayoutBTag>();
-  static constexpr bool SwapAB = detail::is_swapAB<ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag>();
-  static constexpr bool IsWarpSpecializedTransposeB = detail::is_warpspecialized_transpose_B<
-      ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag, KernelScheduleType>();
-
-  using AtomLayoutMNK = cute::conditional_t<cute::is_same_v<KernelScheduleType, KernelCpAsyncWarpSpecializedCooperative>,
-      Layout<Shape<cute::Int<(size<0>(TileShape_MNK{}) < 128) ? 1 : 2>,_1,_1>>, Layout<Shape<_1,_1,_1>>>;
-
-  using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::rs_op_selector<
-      ElementAMma, ElementBMma, ElementAccumulator, TileShape_MNK, GMMA::Major::K, GMMA::Major::K>(), AtomLayoutMNK{}));
-
-  static constexpr int NumLoadWarpGroups = 1;
-
-  using AlignmentTypeA = cute::uint_byte_t<static_cast<int>(sizeof(ElementA)) * AlignmentA>;
-  using GmemCopyAtomA = cute::Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<AlignmentTypeA>, ElementA>;
-  using GmemTiledCopyA = decltype(detail::make_simt_gmem_tiled_copy<
-      GmemCopyAtomA, NumThreadsPerWarpGroup * NumLoadWarpGroups, AlignmentA, TagToStrideA_t<GmemLayoutATag>,
-      decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
-
-  using AlignmentTypeB = cute::uint_byte_t<static_cast<int>(sizeof(ElementB)) * AlignmentB>;
-  using GmemCopyAtomB = cute::Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<AlignmentTypeB>, ElementB>;  
-  using GmemTiledCopyB = decltype(detail::make_simt_gmem_tiled_copy<
-      GmemCopyAtomB, NumThreadsPerWarpGroup * NumLoadWarpGroups, AlignmentB, TagToStrideB_t<GmemLayoutBTag>,
-      decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
-
-  using SmemLayoutAtomA = decltype(detail::rs_smem_selector<GmmaMajorA, ElementAMma,
-      decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{})), IsWarpSpecializedTransposeB>());
-  using SmemLayoutAtomB = decltype(detail::rs_smem_selector<GmmaMajorB, ElementBMma,
-      decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{})), IsWarpSpecializedTransposeB>());
-
-  static constexpr int PipelineStages = detail::compute_stage_count_or_override<
-      detail::sm90_smem_capacity_bytes, ElementAMma, ElementBMma, TileShape_MNK>(StageCountType{});
-
-  using DispatchPolicy = MainloopSm90CpAsyncGmmaRmemAWarpSpecialized<
-      PipelineStages, ClusterShape_MNK, KernelScheduleType>;
-
-  using SmemCopyAtomA = cute::conditional_t<SwapAB, void, Copy_Atom<cute::AutoVectorizingCopy, ElementA>>;
-  using SmemCopyAtomB = cute::conditional_t<SwapAB, Copy_Atom<cute::AutoVectorizingCopy, ElementB>, void>;
-
-  using CollectiveOp = CollectiveMma<
-      DispatchPolicy,
-      TileShape_MNK,
-      ElementA,
-      TagToStrideA_t<GmemLayoutATag>,
-      ElementB,
-      TagToStrideB_t<GmemLayoutBTag>,
-      TiledMma,
-      GmemTiledCopyA,
-      SmemLayoutAtomA,
-      SmemCopyAtomA,
-      cute::identity,
-      GmemTiledCopyB,
-      SmemLayoutAtomB,
-      SmemCopyAtomB,
-      cute::identity
-    >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA auto kernel schedule
-template <
-  class ElementA,
-  class GmemLayoutATag,
-  int AlignmentA,
-  class ElementB,
-  class GmemLayoutBTag,
-  int AlignmentB,
-  class ElementAccumulator,
-  class TileShape_MNK,
-  class ClusterShape_MNK,
-  class StageCountType,
-  class KernelScheduleType
->
-struct CollectiveBuilder<
-    arch::Sm90,
-    arch::OpClassTensorOp,
-    ElementA,
-    GmemLayoutATag,
-    AlignmentA,
-    ElementB,
-    GmemLayoutBTag,
-    AlignmentB,
-    ElementAccumulator,
-    TileShape_MNK,
-    ClusterShape_MNK,
-    StageCountType,
-    KernelScheduleType,
-    cute::enable_if_t<cute::is_same_v<KernelScheduleType, KernelScheduleAuto>>
-> {
-  static_assert(is_static<TileShape_MNK>::value);
-  static_assert(is_static<ClusterShape_MNK>::value);
-#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
-  static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n");
-#endif
-
-using ExtractedElementA = detail::deduce_mixed_width_dtype_t<0, ElementA>;
-using ExtractedElementB = detail::deduce_mixed_width_dtype_t<0, ElementB>;
-
-static constexpr bool IsTmaCompatible = detail::is_aligned<
-    ExtractedElementA, AlignmentA, ExtractedElementB, AlignmentB, detail::tma_alignment_bytes>();
-
-// Users opt into scales via the builder by passing a tuple of Elements for the input that will be scaled. We detect
-// scale support if ONLY one of the inputs have tuples to describe them.
-static constexpr bool OnlyOneIsTuple = cute::is_tuple<ElementA>::value ^ cute::is_tuple<ElementB>::value;
-static constexpr bool IsDifferentWidth = sizeof_bits<ExtractedElementA>::value != sizeof_bits<ExtractedElementB>::value;
-static constexpr bool IsMixedWidthInput = IsDifferentWidth || (IsDifferentWidth && OnlyOneIsTuple);
-
-#if ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 1)))
-  // Persistent schedules perform best for CUDA Toolkits with version >= 12.1
-  // KernelTmaWarpSpecializedCooperative requires TileShape_M to be at least 128
-  using KernelTmaWarpSpecializedScheduleSameInput = cute::conditional_t<size<0>(TileShape_MNK{}) == Int<64>{},
-      KernelTmaWarpSpecializedPingpong, KernelTmaWarpSpecializedCooperative>;
-
-  using KernelTmaWarpSpecializedScheduleMixedInput = cute::conditional_t<size<0>(TileShape_MNK{}) == Int<64>{},
-      KernelTmaWarpSpecializedPingpongMixedInput, KernelTmaWarpSpecializedCooperativeMixedInput>;
-
-  using KernelTmaWarpSpecializedSchedule = cute::conditional_t<IsMixedWidthInput, KernelTmaWarpSpecializedScheduleMixedInput, KernelTmaWarpSpecializedScheduleSameInput>;
-#else
-  using KernelTmaWarpSpecializedSchedule = cute::conditional_t<IsMixedWidthInput, KernelTmaWarpSpecializedMixedInput, KernelTmaWarpSpecialized>;
-#endif
-
-  // Non-persistent schedule is a safer choice for CpAsync kernels due to register pressure
-  using KernelCpAsyncWarpSpecializedSchedule = KernelCpAsyncWarpSpecialized;
-  using KernelSchedule = cute::conditional_t<IsTmaCompatible, KernelTmaWarpSpecializedSchedule, KernelCpAsyncWarpSpecializedSchedule>;
-  static_assert((cute::is_same_v<KernelSchedule, KernelTmaWarpSpecializedSchedule> && IsMixedWidthInput) || !IsMixedWidthInput, "Only TMA warp specialized kernels are supported for mixed width input.");
-  using CollectiveOp = typename CollectiveBuilder<
-      arch::Sm90,
-      arch::OpClassTensorOp,
-      ElementA,
-      GmemLayoutATag,
-      AlignmentA,
-      ElementB,
-      GmemLayoutBTag,
-      AlignmentB,
-      ElementAccumulator,
-      TileShape_MNK,
-      ClusterShape_MNK,
-      StageCountType,
-      KernelSchedule
-    >::CollectiveOp;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::gemm::collective
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/builders/sm90_sparse_config.inl b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/builders/sm90_sparse_config.inl
deleted file mode 100755
index f9aa7bab2..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/builders/sm90_sparse_config.inl
+++ /dev/null
@@ -1,268 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Sparse configs specific for SM90 structure sparse kernels
-*/
-
-
-#pragma once
-
-#include "cute/atom/mma_traits_sm90_gmma.hpp"  // cute::GMMA::Major
-#include "cute/layout.hpp"                     // cute::Layout, cute::Shape, cute::Stride
-#include "cute/numeric/integral_constant.hpp"  // cute::Int
-#include "cute/numeric/numeric_types.hpp"      // cute::sizeof_bits_v
-#include "cute/pointer_sparse.hpp"             // cute::is_sparse
-#include "cute/util/type_traits.hpp"           // cute::is_same_v, cute::conditional_t
-#include "cutlass/fast_math.h"                 // cutlass::round_up
-#include "cutlass/layout/matrix.h"             // cutlass::RowMajor, cutlass::ColumnMajor
-
-namespace cutlass {
-
-using namespace cute;
-
-template<
-  class ElementAMma_,
-  GMMA::Major GmmaMajorA,
-  class ElementEMma_,
-  class MinTileShapeK = Int<32>
->
-struct Sm90GemmSparseConfig {
-
-  static_assert(cute::is_sparse<ElementAMma_>::value, "ElementAMma MUST be sparse elem");
-  static_assert(cute::is_sparse<ElementEMma_>::value, "ElementEMma MUST be sparse elem");
-
-  // A
-  using ElementAMma         = ElementAMma_;
-  using ElementAMmaRaw      = typename ElementAMma::raw_type;
-  using ElementAMmaSparsity = Int<ElementAMma::sparsity>;
-
-  // Metadata (E)
-  using ElementEMma         = ElementEMma_;
-  using ElementEMmaRaw      = typename ElementEMma::raw_type;
-  using ElementEMmaSparsity = Int<ElementEMma::sparsity>;
-
-  // MMA type
-  static constexpr bool IsQmma = cute::is_same_v<ElementAMmaRaw, float_e4m3_t> && ElementAMmaSparsity{} == _2{} ||
-                                  cute::is_same_v<ElementAMmaRaw, float_e5m2_t> && ElementAMmaSparsity{} == _2{};
-  static constexpr bool IsImma = cute::is_same_v<ElementAMmaRaw, int8_t> && ElementAMmaSparsity{} == _2{} ||
-                                 cute::is_same_v<ElementAMmaRaw, uint8_t> && ElementAMmaSparsity{} == _2{};
-  static constexpr bool IsHmma = cute::is_same_v<ElementAMmaRaw, half_t> && ElementAMmaSparsity{} == _2{} ||
-                                 cute::is_same_v<ElementAMmaRaw, bfloat16_t> && ElementAMmaSparsity{} == _2{};
-  static constexpr bool IsTfmma = cute::is_same_v<ElementAMmaRaw, tfloat32_t> && ElementAMmaSparsity{} == _2{} || 
-                                  cute::is_same_v<ElementAMmaRaw, float> && ElementAMmaSparsity{} == _2{};
-  static_assert(int(IsQmma) + int(IsImma) + int(IsHmma) + int(IsTfmma) == 1, "Ambigious Input Type Config (failed to choose MMA type)");
-
-  // Number of ElementARaw stored in ElementAMmaRaw. For Hopper this is always 1.
-  using ElemsARawPerElementAMmaRaw = _1;
-
-  // ElementA Sparsity Ratio
-  using ElementASparsity = ElementAMmaSparsity;
-  static_assert(ElementASparsity{} == _2{}, "ElementASparsity must be 2 for Hopper Sparse Gemm");
-
-  // Logical/Physical ElementA per Chunk
-  using LogicalElemsAPerChunk = conditional_t<IsTfmma, _2, _4>;
-  using PhysicalElemsAPerChunk = Int<LogicalElemsAPerChunk{} / ElementASparsity{}>;
-
-  // Metadata Bits
-  using ElementEBitsPerChunk = _4;
-  using ElementEBitsPerElementAMma = cute::conditional_t<IsTfmma, _4, _2>;
-
-  // Metadata Layout. Unit in corresbonding logical elements.
-  // Basic metadata block is (16,64) for 8-bit, (16,32) for 16-bit, (16,16) for 32-bit data types.
-  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#sparse-wgmma-metadata-64n32-f16bf16
-  // Tensor E layout atom stacks 4 basic blocks along M mode to align with WGMMA instruction shape and
-  // stacks 1-4 blocks along K mode and reorders memory layout to allow for vectorized loads from smem.
-  using BlockK = Int<512 / sizeof_bits_v<ElementAMmaRaw>>;
-  static_assert(MinTileShapeK{} % BlockK{} == 0, "MinTileShapeK must be a multiple of BlockK");
-  using NumK = decltype(MinTileShapeK{} / BlockK{});
-
-  using TensorEAtom_32bit = decltype(make_ordered_layout(Shape<Shape<_8,_2,_4>, Shape<_8,_2,NumK>>{}, 
-                                                         Step <Step <_3,_1,_5>, Step <_0,_4,  _2>>{}));
-
-  using TensorEAtom_16bit = decltype(make_ordered_layout(Shape<Shape<_8,_2,_4>, Shape<_16,_2,NumK>>{},
-                                                         Step <Step <_3,_1,_5>, Step < _0,_4,  _2>>{}));
-
-  using TensorEAtom_8bit  = decltype(make_ordered_layout(Shape<_64,MinTileShapeK>{},
-                                                         Step < _1,           _0>{}));
-
-  using TensorEAtom = cute::conditional_t<(IsQmma || IsImma),  TensorEAtom_8bit, 
-                      cute::conditional_t<IsTfmma, TensorEAtom_32bit,
-                      TensorEAtom_16bit>>;
-
-  // Logical elems that construct the atomK for tensorE/A.  
-  using TensorEAtomK = Int<size<1>(TensorEAtom{})>;
-  using TensorEAtomM = Int<size<0>(TensorEAtom{})>;
-
-  // Tensor E alignment requirements
-  using TensorEAlignmentM = TensorEAtomM;
-  using TensorEAlignmentK = TensorEAtomK;
-
-  // Tensor A alignment requirements
-  // When A is MN major, TensorAAlignmentK needs to be multiplier of chunk size
-  // When A is K major, TensorAAlignmentK needs to be multiplier of TMA requirements times tensorA sparsity
-  //   this is b.c. TensorACompressed needs to satisfy TMA requirements
-  using TensorAAlignmentK = cute::conditional_t<GmmaMajorA == GMMA::Major::MN,
-                                                LogicalElemsAPerChunk,
-                                                Int<128 / cute::sizeof_bits_v<ElementAMma>>>;
-
-  // When A is MN Major, TensorAAlignmentM needs to be multiplier of TMA requirements
-  // When A is K Major, no requirements on TensorAAlignmentM.
-  using TensorAAlignmentM = cute::conditional_t<GmmaMajorA == GMMA::Major::MN,
-                                                Int<128 / cute::sizeof_bits_v<ElementAMmaRaw> * ElemsARawPerElementAMmaRaw{}>,
-                                                _1>;
-
-  // The following two functions are provided for user determine the static layouts type
-  CUTE_HOST_DEVICE
-  static constexpr auto
-  deduce_layoutA() {
-    using LayoutMMajor = Layout<Shape <int32_t,
-                                       Shape<ElementASparsity, int32_t>,
-                                       int32_t>,
-                                Stride<ElementASparsity,
-                                       Stride<_1, int64_t>,
-                                       int64_t>>;
-
-    using LayoutKMajor = Layout<Shape <int32_t,
-                                       Shape<ElementASparsity, int32_t>,
-                                       int32_t>,
-                                Stride<int64_t,
-                                       Stride<_1, ElementASparsity>,
-                                       int64_t>>;
-
-    if constexpr (GmmaMajorA == GMMA::Major::MN) {
-      return LayoutMMajor{};
-    }
-    else {
-      return LayoutKMajor{};
-    }
-  }
-
-  CUTE_HOST_DEVICE
-  static constexpr auto
-  deduce_layoutE() {
-    return make_layout(
-      make_shape(make_shape(shape<0>(TensorEAtom{}), int32_t(0)),
-                 make_shape(shape<1>(TensorEAtom{}), int32_t(0)),
-                 int32_t(0)),
-      make_stride(make_stride(stride<0>(TensorEAtom{}), cute::Int<cute::cosize(TensorEAtom{})>{}),
-                  make_stride(stride<1>(TensorEAtom{}), int64_t(0)),
-                  int64_t(0))
-    );
-  }
-
-  // This function is used to revert a CuTe layout to a Cutlass layout tag (RowMajor/ColumnMajor)
-  template <class ShapeA, class StrideA>
-  CUTE_HOST_DEVICE
-  static constexpr auto
-  deduce_layoutA_tag(Layout<ShapeA, StrideA> layout_a) {
-    /*
-      (m, (2, k/2), l) : (2, (1, m*2), m*k) M-major
-      (m, (2, k/2), l) : (k, (1, 2), m*k) K-major
-    */
-    // Check if the given layout_a is possibly a sparse tensorA layout.
-    static_assert(rank_v<ShapeA> == 3 && depth_v<ShapeA> == 2, "Rank and depth mismatch with the sparse tensorA's layout.");
-    static_assert(rank(get<1>(ShapeA{})) == 2 && rank(flatten(ShapeA{})) == 4,
-                  "Not likely to be a sparse tensorA's layout.");
-    static_assert(get<1,0>(StrideA{}) == 1 && get<1,0>(ShapeA{}) == ElementASparsity{},
-                  "Not likely to be a sparse tensorA's layout.");
-    static_assert(get<0>(StrideA{}) == ElementASparsity{} || get<1,1>(StrideA{}) == ElementASparsity{},
-                  "Not likely to be a sparse tensorA's layout.");
-
-    if constexpr (get<0>(StrideA{}) == ElementASparsity{}) {
-      return cutlass::layout::ColumnMajor{};
-    }
-    else {
-      return  cutlass::layout::RowMajor{};
-    }
-  }
-
-  // Fill tensor A layout from dynamic problem shape
-  template <class ProblemShape>
-  CUTE_HOST_DEVICE
-  static constexpr auto
-  fill_layoutA(ProblemShape problem_shape) {
-
-    const auto [M, N, K, L] = problem_shape;
-
-    // Round up to satisfy TensorA Alignment requirement
-    const auto M_AlignedAC = cutlass::round_up(M, TensorAAlignmentM{});
-    const auto K_AlignedAC = cutlass::round_up(K, TensorAAlignmentK{});
-
-    if constexpr (GmmaMajorA == GMMA::Major::MN) {
-      return make_layout(
-        make_shape(int32_t(M_AlignedAC),
-                   make_shape(ElementASparsity{}, int32_t(K_AlignedAC) / ElementASparsity{}),
-                   int32_t(L)),
-        make_stride(ElementASparsity{},
-                    make_stride(_1{}, int64_t(M_AlignedAC) * ElementASparsity{}),
-                    (L == 1) ? int64_t(0) : int64_t(M_AlignedAC * K_AlignedAC))
-      );
-    }
-    else {
-      return make_layout(
-        make_shape(int32_t(M_AlignedAC),
-                   make_shape(ElementASparsity{}, int32_t(K_AlignedAC / ElementASparsity{})),
-                   int32_t(L)),
-        make_stride(int64_t(K_AlignedAC),
-                    make_stride(_1{}, ElementASparsity{}),
-                    (L == 1) ? int64_t(0) : int64_t(M_AlignedAC * K_AlignedAC))
-      );
-    }
-  }
-
-  // Fill tensor E layout from dynamic problem shape
-  template <class ProblemShape>
-  CUTE_HOST_DEVICE
-  static constexpr auto
-  fill_layoutE(ProblemShape problem_shape) {
-    const auto [M, N, K, L] = problem_shape;
-
-    // Round up to satisfy TensorEAlignment requirement
-    const auto M_AlignedE = cutlass::round_up(M, TensorEAlignmentM{});
-    const auto K_AlignedE = cutlass::round_up(K, TensorEAlignmentK{});
-
-    // TensorEAtom first along m-dim, then along k-dim, then along batch
-    static_assert(TensorEAlignmentM{} == TensorEAtomM{}, "Shape below assumes TensorEAlignmentM == TensorEAtomM");
-    static_assert(TensorEAlignmentK{} == TensorEAtomK{}, "Shape below assumes TensorEAlignmentK == TensorEAtomK");
-
-    return make_layout(
-      make_shape(make_shape(shape<0>(TensorEAtom{}), int32_t(M_AlignedE / TensorEAtomM{})),
-                 make_shape(shape<1>(TensorEAtom{}), int32_t(K_AlignedE / TensorEAtomK{})),
-                 int32_t(L)),
-      make_stride(make_stride(stride<0>(TensorEAtom{}), cute::Int<cute::cosize(TensorEAtom{})>{}),
-                  make_stride(stride<1>(TensorEAtom{}), int64_t(M_AlignedE * TensorEAtomK{})),
-                  (L == 1) ? int64_t(0) : int64_t(M_AlignedE * K_AlignedE))
-    );
-  }
-};
-
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/builders/sm90_sparse_gmma_builder.inl b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/builders/sm90_sparse_gmma_builder.inl
deleted file mode 100755
index 9b608fe02..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/builders/sm90_sparse_gmma_builder.inl
+++ /dev/null
@@ -1,388 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/gemm/collective/builders/sm90_common.inl"
-#include "cutlass/gemm/collective/builders/sm90_sparse_config.inl"
-
-// SM90 Collective Builders should be used only starting CUDA 12.0
-#if (__CUDACC_VER_MAJOR__ >= 12)
-#define CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
-#endif
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm::collective {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace detail {
-
-// Returns the maximum number of smem tiles that can be used with a given smem capacity, or overrides with manual count. 
-template<int CapacityBytes, class ElementAMma, class ElementB, class ElementEMma, class TileShapeMNK, int stages>
-constexpr int
-compute_stage_count_or_override_sparse(StageCount<stages> stage_count) {
-  return stages;
-}
-
-// Returns the maximum number of smem tiles that can be used with a given smem capacity, or overrides with manual count. 
-template<int CapacityBytes, class ElementAMma, class ElementB, class ElementEMma, class TileShapeMNK, int stages>
-constexpr int
-compute_stage_count_or_override_sparse(cute::Int<stages> stage_count) {
-  return stages;
-}
-
-// Returns the maximum number of smem tiles that can be used with a given smem capacity, or overrides with manual count. 
-template<int CapacityBytes, class ElementAMma, class ElementB, class ElementEMma, class TileShapeMNK, int carveout_bytes>
-constexpr int
-compute_stage_count_or_override_sparse(StageCountAutoCarveout<carveout_bytes> stage_count) {
-  constexpr auto mainloop_pipeline_bytes = sizeof(typename cutlass::PipelineTmaAsync<1>::SharedStorage);
-  constexpr auto a_bits = cute::sizeof_bits_v<ElementAMma>;
-  constexpr auto b_bits = cute::sizeof_bits_v<ElementB>;
-  constexpr auto e_bits = cute::sizeof_bits_v<ElementEMma>;
-  constexpr int stage_bytes =
-    cutlass::bits_to_bytes(a_bits * size<0>(TileShapeMNK{}) * size<2>(TileShapeMNK{})) +
-    cutlass::bits_to_bytes(b_bits * size<1>(TileShapeMNK{}) * size<2>(TileShapeMNK{})) +
-    cutlass::bits_to_bytes(e_bits * size<0>(TileShapeMNK{}) * size<2>(TileShapeMNK{})) +
-    static_cast<int>(mainloop_pipeline_bytes);
-
-  return (CapacityBytes - carveout_bytes) / stage_bytes;
-}
-
-} // namespace detail
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA_TMA_WS_SS_SPARSE
-template <
-  class ElementA,
-  class GmemLayoutATag,
-  int AlignmentA,
-  class ElementB,
-  class GmemLayoutBTag,
-  int AlignmentB,
-  class ElementAccumulator,
-  class TileShape_MNK,
-  class ClusterShape_MNK,
-  class StageCountType,
-  class KernelScheduleType
->
-struct CollectiveBuilder<
-    arch::Sm90,
-    arch::OpClassSparseTensorOp,
-    ElementA,
-    GmemLayoutATag,
-    AlignmentA,
-    ElementB,
-    GmemLayoutBTag,
-    AlignmentB,
-    ElementAccumulator,
-    TileShape_MNK,
-    ClusterShape_MNK,
-    StageCountType,
-    KernelScheduleType,
-    cute::enable_if_t<
-      (cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecialized> ||
-       cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedPingpong> ||
-       cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedCooperative>) &&
-       not detail::is_use_rmem_A<ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag>()>
-> {
-  static_assert(is_static<TileShape_MNK>::value);
-  static_assert(is_static<ClusterShape_MNK>::value);
-#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
-  static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n");
-#endif
-  static_assert(detail::is_aligned<ElementA, AlignmentA, ElementB, AlignmentB, detail::tma_alignment_bytes>(),
-                "Should meet TMA alignment requirement\n");
-
-  static constexpr bool IsFP8Input = detail::is_input_fp8<ElementA, ElementB>();
-  static_assert(!IsFP8Input, "FP8 sparse collective currently only supports FastAccum schedules");
-
-  // For fp32 types, map to tf32 MMA value type
-  using ElementAMmaRaw = cute::conditional_t<cute::is_same_v<ElementA, float>, tfloat32_t, ElementA>;
-  using ElementBMma    = cute::conditional_t<cute::is_same_v<ElementB, float>, tfloat32_t, ElementB>;
-
-  static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_ss_tag_to_major_A<ElementAMmaRaw, GmemLayoutATag>();
-  static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_ss_tag_to_major_B<ElementBMma, GmemLayoutBTag>();
-
-  using AtomLayoutMNK = cute::conditional_t<
-      cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedCooperative>,
-      Layout<Shape<_2,_1,_1>>, Layout<Shape<_1,_1,_1>>>;
-
-  using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector_sparse<
-      ElementAMmaRaw, ElementBMma, ElementAccumulator, TileShape_MNK, GmmaMajorA, GmmaMajorB>(), AtomLayoutMNK{}));
-
-  using ElementAMma = typename TiledMma::ValTypeA;
-  using ElementAMmaSparsity = Int<ElementAMma::sparsity>;
-  using ElementEMma = typename TiledMma::ValTypeE;
-  using SparseConfig = cutlass::Sm90GemmSparseConfig<ElementAMma, GmmaMajorA, ElementEMma, 
-                                                     decltype(cute::min(size<2>(TileShape_MNK{}),_128{}))>;
-
-  using LayoutA = decltype(SparseConfig::deduce_layoutA());
-  using LayoutE = decltype(SparseConfig::deduce_layoutE());
-  using LayoutPairAE = decltype(cute::make_tuple(LayoutA{}, LayoutE{}));
-
-  using GmemTiledCopyA = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{})));
-  using GmemTiledCopyB = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{})));
-
-  using SmemLayoutAtomA = decltype(detail::ss_smem_selector_sparse<
-      GmmaMajorA, ElementAMmaRaw, decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{})), ElementAMmaSparsity>());
-  using SmemLayoutAtomB = decltype(detail::ss_smem_selector<
-      GmmaMajorB, ElementBMma, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
-
-  static constexpr int PipelineStages = detail::compute_stage_count_or_override_sparse<detail::sm90_smem_capacity_bytes,
-      ElementAMma, ElementBMma, ElementEMma, TileShape_MNK>(StageCountType{});
-  using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedSparse<PipelineStages, ClusterShape_MNK, KernelScheduleType>;
-
-  using SmemCopyAtomA = void; 
-  using SmemCopyAtomB = void; 
-
-  using CollectiveOp = CollectiveMma<
-      DispatchPolicy,
-      TileShape_MNK,
-      ElementA,
-      LayoutPairAE,
-      ElementB,
-      TagToStrideB_t<GmemLayoutBTag>,
-      TiledMma,
-      GmemTiledCopyA,
-      SmemLayoutAtomA,
-      SmemCopyAtomA,
-      cute::identity,
-      GmemTiledCopyB,
-      SmemLayoutAtomB,
-      SmemCopyAtomB,
-      cute::identity
-    >;
-};
-
-// GMMA_TMA_WS_SS_FP8_FAST_ACCUM_SPARSE
-template <
-  class ElementA,
-  class GmemLayoutATag,
-  int AlignmentA,
-  class ElementB,
-  class GmemLayoutBTag,
-  int AlignmentB,
-  class ElementAccumulator,
-  class TileShape_MNK,
-  class ClusterShape_MNK,
-  class StageCountType,
-  class KernelScheduleType
->
-struct CollectiveBuilder<
-    arch::Sm90,
-    arch::OpClassSparseTensorOp,
-    ElementA,
-    GmemLayoutATag,
-    AlignmentA,
-    ElementB,
-    GmemLayoutBTag,
-    AlignmentB,
-    ElementAccumulator,
-    TileShape_MNK,
-    ClusterShape_MNK,
-    StageCountType,
-    KernelScheduleType,
-    cute::enable_if_t<
-      (cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedFP8FastAccum> ||
-       cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedPingpongFP8FastAccum> ||
-       cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedCooperativeFP8FastAccum>)>
-> {
-  static_assert(is_static<TileShape_MNK>::value);
-  static_assert(is_static<ClusterShape_MNK>::value);
-  static_assert(detail::is_aligned<ElementA, AlignmentA, ElementB, AlignmentB, detail::tma_alignment_bytes>(),
-                "Should meet TMA alignment requirement\n");
-  static_assert(detail::is_input_fp8<ElementA, ElementB>(),
-                "Only FP8 datatypes are compatible with these kernel schedules\n");
-#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
-  static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n");
-#endif
-
-  static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_ss_tag_to_major_A<ElementA, GmemLayoutATag>();
-  static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_ss_tag_to_major_B<ElementB, GmemLayoutBTag>();
-
-  using AtomLayoutMNK = cute::conditional_t<
-      cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedCooperativeFP8FastAccum>,
-      Layout<Shape<_2,_1,_1>>, Layout<Shape<_1,_1,_1>>>;
-
-  using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector_sparse<
-      ElementA, ElementB, ElementAccumulator, TileShape_MNK, GmmaMajorA, GmmaMajorB>(), AtomLayoutMNK{}));
-
-  using ElementAMma = typename TiledMma::ValTypeA;
-  using ElementAMmaSparsity = Int<ElementAMma::sparsity>;
-  using ElementEMma = typename TiledMma::ValTypeE;
-  using SparseConfig = cutlass::Sm90GemmSparseConfig<ElementAMma, GmmaMajorA, ElementEMma, 
-                                                     decltype(cute::min(size<2>(TileShape_MNK{}),_128{}))>;
-
-  using LayoutA = decltype(SparseConfig::deduce_layoutA());
-  using LayoutE = decltype(SparseConfig::deduce_layoutE());
-  using LayoutPairAE = decltype(cute::make_tuple(LayoutA{}, LayoutE{}));
-
-  using GmemTiledCopyA = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{})));
-  using GmemTiledCopyB = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{})));
-
-  using SmemLayoutAtomA = decltype(detail::ss_smem_selector_sparse<
-      GmmaMajorA, ElementA, decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{})), ElementAMmaSparsity>());
-  using SmemLayoutAtomB = decltype(detail::ss_smem_selector<
-      GmmaMajorB, ElementB, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
-
-  static constexpr int PipelineStages = detail::compute_stage_count_or_override_sparse<detail::sm90_smem_capacity_bytes,
-      ElementAMma, ElementB, ElementEMma, TileShape_MNK>(StageCountType{});
-  using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedSparse<PipelineStages, ClusterShape_MNK, KernelScheduleType>;
-
-  using SmemCopyAtomA = void; 
-  using SmemCopyAtomB = void; 
-
-  using CollectiveOp = CollectiveMma<
-      DispatchPolicy,
-      TileShape_MNK,
-      ElementA,
-      LayoutPairAE,
-      ElementB,
-      TagToStrideB_t<GmemLayoutBTag>,
-      TiledMma,
-      GmemTiledCopyA,
-      SmemLayoutAtomA,
-      SmemCopyAtomA,
-      cute::identity,
-      GmemTiledCopyB,
-      SmemLayoutAtomB,
-      SmemCopyAtomB,
-      cute::identity
-    >;
-};
-
-// GMMA_TMA_WS_RS_SPARSE
-template <
-  class ElementA,
-  class GmemLayoutATag,
-  int AlignmentA,
-  class ElementB,
-  class GmemLayoutBTag,
-  int AlignmentB,
-  class ElementAccumulator,
-  class TileShape_MNK,
-  class ClusterShape_MNK,
-  class StageCountType,
-  class KernelScheduleType
->
-struct CollectiveBuilder<
-    arch::Sm90,
-    arch::OpClassSparseTensorOp,
-    ElementA,
-    GmemLayoutATag,
-    AlignmentA,
-    ElementB,
-    GmemLayoutBTag,
-    AlignmentB,
-    ElementAccumulator,
-    TileShape_MNK,
-    ClusterShape_MNK,
-    StageCountType,
-    KernelScheduleType,
-    cute::enable_if_t<
-      (cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecialized> ||
-       cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedPingpong> ||
-       cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedCooperative>) &&
-       detail::is_use_rmem_A<ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag>()>
-> {
-  static_assert(cutlass::detail::dependent_false<ElementA>, "Mainloop with sparse A sourced from RF is not implemented.");
-};
-
-// Sparse GMMA auto kernel schedule
-template <
-  class ElementA,
-  class GmemLayoutATag,
-  int AlignmentA,
-  class ElementB,
-  class GmemLayoutBTag,
-  int AlignmentB,
-  class ElementAccumulator,
-  class TileShape_MNK,
-  class ClusterShape_MNK,
-  class StageCountType,
-  class KernelScheduleType
->
-struct CollectiveBuilder<
-    arch::Sm90,
-    arch::OpClassSparseTensorOp,
-    ElementA,
-    GmemLayoutATag,
-    AlignmentA,
-    ElementB,
-    GmemLayoutBTag,
-    AlignmentB,
-    ElementAccumulator,
-    TileShape_MNK,
-    ClusterShape_MNK,
-    StageCountType,
-    KernelScheduleType,
-    cute::enable_if_t<cute::is_same_v<KernelScheduleType, KernelScheduleAuto>>
-> {
-  static_assert(is_static<TileShape_MNK>::value);
-  static_assert(is_static<ClusterShape_MNK>::value);
-#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
-  static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n");
-#endif
-
-  static constexpr bool IsFP8Input = detail::is_input_fp8<ElementA, ElementB>();
-
-  using KernelSchedule = cute::conditional_t<size<0>(TileShape_MNK{}) == Int<64>{},
-                                             cute::conditional_t<IsFP8Input,
-                                                                 KernelTmaWarpSpecializedPingpongFP8FastAccum,
-                                                                 KernelTmaWarpSpecializedPingpong>,
-                                             cute::conditional_t<IsFP8Input,
-                                                                 KernelTmaWarpSpecializedCooperativeFP8FastAccum,
-                                                                 KernelTmaWarpSpecializedCooperative>>;
-
-  using CollectiveOp = typename CollectiveBuilder<
-      arch::Sm90,
-      arch::OpClassSparseTensorOp,
-      ElementA,
-      GmemLayoutATag,
-      AlignmentA,
-      ElementB,
-      GmemLayoutBTag,
-      AlignmentB,
-      ElementAccumulator,
-      TileShape_MNK,
-      ClusterShape_MNK,
-      StageCountType,
-      KernelSchedule
-    >::CollectiveOp;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::gemm::collective
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/collective_builder.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/collective_builder.hpp
deleted file mode 100755
index ccd8d8b3c..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/collective_builder.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-#include "cutlass/gemm/collective/collective_mma_decl.hpp"
-#include "cutlass/gemm/collective/collective_mma.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-#include "cutlass/gemm/collective/collective_builder_decl.hpp"
-#include "cutlass/gemm/collective/builders/sm90_gmma_builder.inl"
-#include "cutlass/gemm/collective/builders/sm90_sparse_gmma_builder.inl"
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/collective_builder_decl.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/collective_builder_decl.hpp
deleted file mode 100755
index c0570d37a..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/collective_builder_decl.hpp
+++ /dev/null
@@ -1,88 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/numeric/integral_constant.hpp>
-#include <cutlass/detail/dependent_false.hpp>
-
-namespace cutlass::gemm::collective {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Used to specify stage counts or dispatch to automatic computation of stage count
-template<int num_stages>
-struct StageCount {
-  static constexpr int value = num_stages;
-
-  StageCount() = default;
-  explicit StageCount(cute::Int<num_stages>) {}
-};
-
-template<int carveout_bytes>
-struct StageCountAutoCarveout {
-  static constexpr int bytes = carveout_bytes;
-
-  StageCountAutoCarveout() = default;
-  explicit StageCountAutoCarveout(cute::Int<carveout_bytes>) {}
-};
-
-using StageCountAuto = StageCountAutoCarveout<0>;
-
-// Used to automatically let the builder pick the kernel schedule.
-// Can be overridden with kernel schedule tags in cutlass/gemm/dispatch_policy.hpp
-struct KernelScheduleAuto final {};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  class ArchTag,
-  class OpClass,
-  class ElementA,
-  class GmemLayoutA,
-  int AlignmentA,
-  class ElementB,
-  class GmemLayoutB,
-  int AlignmentB,
-  class ElementAccumulator,
-  class TileShape_MNK,
-  class ClusterShape_MNK,
-  class StageCountType,
-  class KernelScheduleType,
-  class Enable = void
->
-struct CollectiveBuilder {
-  static_assert(sizeof(ElementA) == 0, "Could not build a collective for given parameters.");
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::gemm::collective
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/collective_mma.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/collective_mma.hpp
deleted file mode 100755
index 103da9af7..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/collective_mma.hpp
+++ /dev/null
@@ -1,49 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/gemm/collective/collective_mma_decl.hpp"
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-#include "cutlass/gemm/collective/sm70_mma_twostage.hpp"
-#include "cutlass/gemm/collective/sm80_mma_multistage.hpp"
-#include "cutlass/gemm/collective/sm90_mma_multistage_gmma_ss_warpspecialized.hpp"
-#include "cutlass/gemm/collective/sm90_mma_multistage_gmma_rs_warpspecialized.hpp"
-#include "cutlass/gemm/collective/sm90_mma_tma_gmma_ss.hpp"
-#include "cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized.hpp"
-#include "cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized_mixed_input.hpp"
-#include "cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp"
-#include "cutlass/gemm/collective/sm90_sparse_mma_tma_gmma_ss_warpspecialized.hpp"
-#include "cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized.hpp"
-#include "cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8.hpp"
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/collective_mma_decl.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/collective_mma_decl.hpp
deleted file mode 100755
index feef54962..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/collective_mma_decl.hpp
+++ /dev/null
@@ -1,64 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include <cute/numeric/integral_constant.hpp>
-#include <cutlass/detail/dependent_false.hpp>
-
-namespace cutlass::gemm::collective {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  class DispatchPolicy,
-  class TileShape,
-  class ElementA,
-  class StrideA,
-  class ElementB,
-  class StrideB,
-  class TiledMma,
-  class GmemTiledCopyA,
-  class SmemLayoutAtomA,
-  class SmemCopyAtomA,
-  class TransformA,
-  class GmemTiledCopyB,
-  class SmemLayoutAtomB,
-  class SmemCopyAtomB,
-  class TransformB
->
-struct CollectiveMma {
-  static_assert(cutlass::detail::dependent_false<ElementA>, "Could not find a mainloop specialization.");
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::gemm::collective
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/fp8_accumulation.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/fp8_accumulation.hpp
deleted file mode 100755
index 374fffafc..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/fp8_accumulation.hpp
+++ /dev/null
@@ -1,121 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#pragma once
-
-#include "cute/algorithm/clear.hpp"
-#include "cute/tensor.hpp"
-
-//////////////////////////////////////////////////////////////////////////////
-///////////////////////////////////FP8 Accumulation///////////////////////////
-//////////////////////////////////////////////////////////////////////////////
-/// It would promote (add) the results from the tensor core accumulators to the
-/// main accumulators when the number of MMAs reaches the max number of MMA
-/// interval specified by user, after that the tensor core accumulators are
-/// zeroed.
-//////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm::collective {
-
-template <
-    class EngineAccum,
-    class LayoutAccum>
-struct GmmaFP8Accumulation {  
- using TensorAccum = cute::Tensor<EngineAccum, LayoutAccum>;
-
-  static_assert(is_static<LayoutAccum>::value, "Accumulator Layout should be static");
-  static_assert(is_rmem<TensorAccum>::value , "Accumulator tensor must be rmem resident.");
-
-private:
-  TensorAccum& accum_;
-  TensorAccum accum_temp_;
-
-  uint32_t accum_promotion_interval_;         // defines the max num of executed MMAs after which accum should be promoted.
-  uint32_t mma_count_per_mainloop_iteration_; // num of MMAs per k_tile of mainloop
-  uint32_t mma_count_;                        // current executed MMAs
-  uint32_t reset_accum_flag_;                 // accum needs to be zeroed or not. 
-
-  CUTLASS_DEVICE
-  void promote_core() {
-    warpgroup_wait<0>();
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < size(accum_); ++i) {
-      accum_(i) += accum_temp_(i);
-    }
-  }
-
-public:
-  CUTLASS_DEVICE
-  GmmaFP8Accumulation(
-      TensorAccum &accum,
-      uint32_t accum_promotion_interval,
-      uint32_t mma_count_per_mainloop_iteration)
-      : accum_(accum), 
-        accum_promotion_interval_(accum_promotion_interval),
-        mma_count_per_mainloop_iteration_(mma_count_per_mainloop_iteration),
-        mma_count_(0), 
-        reset_accum_flag_(0) 
-  {
-    accum_temp_ = cute::make_fragment_like(accum);
-  }
-
-  CUTLASS_DEVICE 
-  TensorAccum& operator()() {
-    return accum_temp_;
-  }
-
-  /// prepare the MMA accumulators when initialization or zeroing is required.
-  CUTLASS_DEVICE
-  bool prepare_if_needed() { 
-    return reset_accum_flag_;
-  }
-
-  /// promote (add) the results from the MMA accumulators to main accumulator if needed.
-  CUTLASS_DEVICE
-  void promote_if_needed() {
-    mma_count_ += mma_count_per_mainloop_iteration_;
-    reset_accum_flag_ = __shfl_sync(0xffffffff, mma_count_ == accum_promotion_interval_, 0);
-    if (reset_accum_flag_) {
-      promote_core();
-      mma_count_ = 0;
-    }
-  }
-
-  /// promote (add) the residue results from the MMA accumulators to main accumulator if needed.
-  CUTLASS_DEVICE
-  void promote_residue_if_needed() {
-    if (__shfl_sync(0xffffffff, mma_count_ > 0, 0)) {
-      promote_core();
-    }
-  }
-};
-
-} // namespace cutlass::gemm::collective
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm70_mma_twostage.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm70_mma_twostage.hpp
deleted file mode 100755
index 3d9e03edf..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm70_mma_twostage.hpp
+++ /dev/null
@@ -1,597 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/dispatch_policy.hpp"
-
-#include "cute/algorithm/functional.hpp"
-#include "cute/atom/mma_atom.hpp"
-#include "cute/algorithm/gemm.hpp"
-#include "cute/atom/mma_atom.hpp"
-#include "cute/tensor_predicate.hpp"
-#include "cutlass/gemm/collective/collective_mma_decl.hpp"
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm::collective {
-using namespace cute;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  class TileShape_,
-  class ElementA_,
-  class StrideA_,
-  class ElementB_,
-  class StrideB_,
-  class TiledMma_,
-  class GmemTiledCopyA_,
-  class SmemLayoutAtomA_,
-  class SmemCopyAtomA_,
-  class TransformA_,
-  class GmemTiledCopyB_,
-  class SmemLayoutAtomB_,
-  class SmemCopyAtomB_,
-  class TransformB_>
-struct CollectiveMma<
-    MainloopSm70TwoStageUnpredicated,
-    TileShape_,
-    ElementA_,
-    StrideA_,
-    ElementB_,
-    StrideB_,
-    TiledMma_,
-    GmemTiledCopyA_,
-    SmemLayoutAtomA_,
-    SmemCopyAtomA_,
-    TransformA_,
-    GmemTiledCopyB_,
-    SmemLayoutAtomB_,
-    SmemCopyAtomB_,
-    TransformB_>
-{
-  //
-  // Type Aliases
-  //
-  using DispatchPolicy = MainloopSm70TwoStageUnpredicated;
-  using TileShape = TileShape_;
-  using ElementA = ElementA_;
-  using StrideA = StrideA_;
-  using ElementB = ElementB_;
-  using StrideB = StrideB_;
-  using TiledMma = TiledMma_;
-  using ElementAccumulator = typename TiledMma::ValTypeC;
-  using GmemTiledCopyA = GmemTiledCopyA_;
-  using GmemTiledCopyB = GmemTiledCopyB_;
-  using SmemLayoutAtomA = SmemLayoutAtomA_;
-  using SmemLayoutAtomB = SmemLayoutAtomB_;
-  using SmemCopyAtomA = SmemCopyAtomA_;
-  using SmemCopyAtomB = SmemCopyAtomB_;
-  using TransformA = TransformA_;
-  using TransformB = TransformB_;
-  using ArchTag = typename DispatchPolicy::ArchTag;
-
-  static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
-  static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-
-  static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
-  static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-
-  using SmemLayoutA = decltype(tile_to_shape(
-      SmemLayoutAtomA{},
-      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}))));
-  using SmemLayoutB = decltype(tile_to_shape(
-      SmemLayoutAtomB{},
-      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}))));
-
-  struct SharedStorage
-  {
-    cute::array_aligned<ElementA, cute::cosize_v<SmemLayoutA>> smem_a;
-    cute::array_aligned<ElementB, cute::cosize_v<SmemLayoutB>> smem_b;
-  };
-
-  // Host side kernel arguments
-  struct Arguments {
-    ElementA const* ptr_A;
-    StrideA dA;
-    ElementB const* ptr_B;
-    StrideB dB;
-  };
-
-  // Device side kernel params
-  using Params = Arguments;
-
-  //
-  // Methods
-  //
-
-  CollectiveMma() = default;
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& _, Arguments const& args, void* workspace) {
-    (void) workspace;
-    return args;
-  }
-
-  /// Perform a threadblock-scoped matrix multiply-accumulate
-  template <
-    class FrgTensorD,
-    class TensorA,
-    class TensorB,
-    class FrgTensorC,
-    class KTileIterator,
-    class ResidueMNK
-  >
-  CUTLASS_DEVICE void
-  operator() (
-      FrgTensorD &accum,
-      TensorA gA,
-      TensorB gB,
-      FrgTensorC const &src_accum,
-      KTileIterator k_tile_iter, int k_tile_count,
-      ResidueMNK residue_mnk,
-      int thread_idx,
-      char *smem_buf)
-  {
-    using namespace cute;
-
-    (void)residue_mnk;
-
-    static_assert(is_rmem<FrgTensorD>::value, "D tensor must be rmem resident.");
-    static_assert(is_gmem<TensorA>::value, "A tensor must be gmem resident.");
-    static_assert(is_gmem<TensorB>::value, "B tensor must be gmem resident.");
-    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
-    static_assert(cute::rank(SmemLayoutA{}) == 2,
-      "MainloopTwoStage must not have a smem shape with a pipeline mode.");
-    static_assert(cute::rank(SmemLayoutB{}) == 2,
-      "MainloopTwoStage must not have a smem shape with a pipeline mode.");
-
-    // Construct shared memory tiles
-    SharedStorage& storage = *reinterpret_cast<SharedStorage*>(smem_buf);
-    Tensor sA = make_tensor(make_smem_ptr(storage.smem_a.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE)
-    Tensor sB = make_tensor(make_smem_ptr(storage.smem_b.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE)
-
-    // Partition the copying of A and B tiles across the threads
-    GmemTiledCopyA gmem_tiled_copy_a;
-    GmemTiledCopyB gmem_tiled_copy_b;
-    auto copy_a_thr = gmem_tiled_copy_a.get_slice(thread_idx);
-    auto copy_b_thr = gmem_tiled_copy_b.get_slice(thread_idx);
-
-    Tensor tAgA = copy_a_thr.partition_S(gA);                                  // (ACPY,ACPY_M,ACPY_K,k)
-    Tensor tAsA = copy_a_thr.partition_D(sA);                                  // (ACPY,ACPY_M,ACPY_K)
-    Tensor tBgB = copy_b_thr.partition_S(gB);                                  // (BCPY,BCPY_N,BCPY_K,k)
-    Tensor tBsB = copy_b_thr.partition_D(sB);                                  // (BCPY,BCPY_N,BCPY_K)
-
-    // Allocate the register tiles for double buffering -- same shape as partitioned data
-    Tensor tArA = make_fragment_like(tAsA);                                    // (ACPY,ACPY_M,ACPY_K)
-    Tensor tBrB = make_fragment_like(tBsB);                                    // (BCPY,BCPY_N,BCPY_K)
-
-    // Tile MMA compute thread partitions and allocate accumulators
-    TiledMma tiled_mma;
-    auto thr_mma = tiled_mma.get_thread_slice(thread_idx);
-    Tensor tCrA  = thr_mma.partition_fragment_A(sA);                           // (MMA,MMA_M,MMA_K)
-    Tensor tCrB  = thr_mma.partition_fragment_B(sB);                           // (MMA,MMA_M,MMA_K)
-
-    CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(accum));                     // MMA_M
-    CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(src_accum));                 // MMA_M
-    CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(accum));                     // MMA_N
-    CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(src_accum));                 // MMA_N
-    CUTE_STATIC_ASSERT_V(size<2>(tCrA) == size<2>(tCrB));                      // MMA_K
-
-    //
-    // Copy Atom retiling
-    //
-
-    auto thr_copy_A       = make_tiled_copy_A(SmemCopyAtomA{}, tiled_mma).get_thread_slice(thread_idx);
-    Tensor tCsA           = thr_copy_A.partition_S(sA);
-    Tensor tCrA_copy_view = thr_copy_A.retile_D(tCrA);
-    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(tCrA_copy_view));            // M
-
-    auto thr_copy_B       = make_tiled_copy_B(SmemCopyAtomB{}, tiled_mma).get_thread_slice(thread_idx);
-    Tensor tCsB           = thr_copy_B.partition_S(sB);
-    Tensor tCrB_copy_view = thr_copy_B.retile_D(tCrB);
-    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<1>(tCrB_copy_view));            // N
-
-    //
-    // Prologue
-    //
-
-    // Copy gmem to rmem for the first k_tile
-    copy(gmem_tiled_copy_a, tAgA(_,_,_,*k_tile_iter), tArA);
-    copy(gmem_tiled_copy_b, tBgB(_,_,_,*k_tile_iter), tBrB);
-    if (--k_tile_count > 0) ++k_tile_iter;
-    // Copy rmem to smem
-    copy(tArA, tAsA);
-    copy(tBrB, tBsB);
-    // Clear accumulators
-    __syncthreads();
-
-    // Load A, B smem->rmem for k=0
-    copy(tCsA(_,_,0), tCrA_copy_view(_,_,0));
-    copy(tCsB(_,_,0), tCrB_copy_view(_,_,0));
-    //
-    // Mainloop
-    //
-
-    // Size of the k-tiles's outer product mode (k)
-    auto K_BLOCK_MAX = size<2>(tCrA);
-
-    CUTLASS_PRAGMA_NO_UNROLL
-    while (k_tile_count > -1)
-    {
-      // Pipeline the outer products with a static for loop
-      for_each(make_int_sequence<K_BLOCK_MAX>{}, [&] (auto k_block)
-      {
-        if (k_block == K_BLOCK_MAX - 1)
-        {
-          __syncthreads();
-
-          // Copy rmem to smem
-          copy(tArA, tAsA);
-          copy(tBrB, tBsB);
-          __syncthreads();
-        }
-
-        // Load A, B smem->rmem for k+1
-        int k_block_next = (k_block + Int<1>{}) % K_BLOCK_MAX;     // static
-        copy(tCsA(_,_,k_block_next), tCrA_copy_view(_,_,k_block_next));
-        copy(tCsB(_,_,k_block_next), tCrB_copy_view(_,_,k_block_next));
-        if (k_block == 0)
-        {
-          // Copy gmem to rmem
-          copy(gmem_tiled_copy_a, tAgA(_,_,_,*k_tile_iter), tArA);
-          copy(gmem_tiled_copy_b, tBgB(_,_,_,*k_tile_iter), tBrB);
-          if (--k_tile_count > 0) ++k_tile_iter;
-        }
-
-        // transform before compute
-        cute::transform(tCrA(_,_,k_block), TransformA{});
-        cute::transform(tCrB(_,_,k_block), TransformB{});
-
-        // Thread-level register gemm for k
-        // disambiguate gemm (shared with the namespace name)
-        cute::gemm(tiled_mma, accum, tCrA(_,_,k_block), tCrB(_,_,k_block), src_accum);
-      });
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  class TileShape_,
-  class ElementA_,
-  class StrideA_,
-  class ElementB_,
-  class StrideB_,
-  class TiledMma_,
-  class GmemTiledCopyA_,
-  class SmemLayoutAtomA_,
-  class SmemCopyAtomA_,
-  class TransformA_,
-  class GmemTiledCopyB_,
-  class SmemLayoutAtomB_,
-  class SmemCopyAtomB_,
-  class TransformB_>
-struct CollectiveMma<
-    MainloopSm70TwoStage,
-    TileShape_,
-    ElementA_,
-    StrideA_,
-    ElementB_,
-    StrideB_,
-    TiledMma_,
-    GmemTiledCopyA_,
-    SmemLayoutAtomA_,
-    SmemCopyAtomA_,
-    TransformA_,
-    GmemTiledCopyB_,
-    SmemLayoutAtomB_,
-    SmemCopyAtomB_,
-    TransformB_>
-{
-  //
-  // Type Aliases
-  //
-  using DispatchPolicy = MainloopSm70TwoStage;
-  using TileShape = TileShape_;
-  using ElementA = ElementA_;
-  using StrideA = StrideA_;
-  using ElementB = ElementB_;
-  using StrideB = StrideB_;
-  using TiledMma = TiledMma_;
-  using ElementAccumulator = typename TiledMma::ValTypeC;
-  using GmemTiledCopyA = GmemTiledCopyA_;
-  using GmemTiledCopyB = GmemTiledCopyB_;
-  using SmemLayoutAtomA = SmemLayoutAtomA_;
-  using SmemLayoutAtomB = SmemLayoutAtomB_;
-  using SmemCopyAtomA = SmemCopyAtomA_;
-  using SmemCopyAtomB = SmemCopyAtomB_;
-  using TransformA = TransformA_;
-  using TransformB = TransformB_;
-  using ArchTag = typename DispatchPolicy::ArchTag;
-
-  static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
-  static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-
-  static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
-  static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-
-  using SmemLayoutA = decltype(tile_to_shape(
-      SmemLayoutAtomA{},
-      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}))));
-  using SmemLayoutB = decltype(tile_to_shape(
-      SmemLayoutAtomB{},
-      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}))));
-
-  struct SharedStorage
-  {
-    cute::array_aligned<ElementA, cute::cosize_v<SmemLayoutA>> smem_a;
-    cute::array_aligned<ElementB, cute::cosize_v<SmemLayoutB>> smem_b;
-  };
-
-  // Host side kernel arguments
-  struct Arguments {
-    ElementA const* ptr_A;
-    StrideA dA;
-    ElementB const* ptr_B;
-    StrideB dB;
-  };
-
-  // Device side kernel params
-  using Params = Arguments;
-
-  //
-  // Methods
-  //
-
-  CollectiveMma() = default;
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& _, Arguments const& args, void* workspace) {
-    (void) workspace;
-    return args;
-  }
-
-  /// Perform a threadblock-scoped matrix multiply-accumulate
-  template <
-    class FrgTensorD,
-    class TensorA,
-    class TensorB,
-    class FrgTensorC,
-    class KTileIterator,
-    class ResidueMNK
-  >
-  CUTLASS_DEVICE void
-  operator() (
-      FrgTensorD &accum,
-      TensorA gA,
-      TensorB gB,
-      FrgTensorC const &src_accum,
-      KTileIterator k_tile_iter, int k_tile_count,
-      ResidueMNK residue_mnk,
-      int thread_idx,
-      char *smem_buf)
-  {
-    using namespace cute;
-
-    static_assert(is_rmem<FrgTensorD>::value, "D tensor must be rmem resident.");
-    static_assert(is_gmem<TensorA>::value, "A tensor must be gmem resident.");
-    static_assert(is_gmem<TensorB>::value, "B tensor must be gmem resident.");
-    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
-    static_assert(cute::rank(SmemLayoutA{}) == 2,
-      "MainloopTwoStage must not have a smem shape with a pipeline mode.");
-    static_assert(cute::rank(SmemLayoutB{}) == 2,
-      "MainloopTwoStage must not have a smem shape with a pipeline mode.");
-
-    // Construct shared memory tiles
-    SharedStorage& storage = *reinterpret_cast<SharedStorage*>(smem_buf);
-    Tensor sA = make_tensor(make_smem_ptr(storage.smem_a.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE)
-    Tensor sB = make_tensor(make_smem_ptr(storage.smem_b.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE)
-
-    // Shift tensor so residue_k is at origin (Can't read any k_coord < residue_k)
-    // This aligns the tensor with BLK_K for all but the 0th k_tile
-    gA.data() = &gA(0, get<2>(residue_mnk), 0);
-    gB.data() = &gB(0, get<2>(residue_mnk), 0);
-
-    // Partition the copying of A and B tiles across the threads
-    GmemTiledCopyA gmem_tiled_copy_a;
-    GmemTiledCopyB gmem_tiled_copy_b;
-    auto gmem_thr_copy_a = gmem_tiled_copy_a.get_slice(thread_idx);
-    auto gmem_thr_copy_b = gmem_tiled_copy_b.get_slice(thread_idx);
-
-    Tensor tAgA = gmem_thr_copy_a.partition_S(gA);                             // (ACPY,ACPY_M,ACPY_K,k)
-    Tensor tAsA = gmem_thr_copy_a.partition_D(sA);                             // (ACPY,ACPY_M,ACPY_K,PIPE)
-    Tensor tBgB = gmem_thr_copy_b.partition_S(gB);                             // (BCPY,BCPY_N,BCPY_K,k)
-    Tensor tBsB = gmem_thr_copy_b.partition_D(sB);                             // (BCPY,BCPY_N,BCPY_K,PIPE)
-
-    // Allocate the register tiles for double buffering -- same shape as partitioned data
-    Tensor tArA = make_fragment_like(tAsA);                                    // (ACPY,ACPY_M,ACPY_K)
-    Tensor tBrB = make_fragment_like(tBsB);                                    // (BCPY,BCPY_N,BCPY_K)
-
-    //
-    // PREDICATES
-    //
-
-    // Allocate predicate tensors for m and n
-    Tensor tApA = make_tensor<bool>(make_shape(size<1>(tAsA), size<2>(tAsA)), Stride<_1,_0>{});
-    Tensor tBpB = make_tensor<bool>(make_shape(size<1>(tBsB), size<2>(tBsB)), Stride<_1,_0>{});
-
-    // Construct identity layout for sA and sB
-    Tensor cA = make_identity_tensor(make_shape(size<0>(sA), size<1>(sA)));    // (BLK_M,BLK_K) -> (blk_m,blk_k)
-    Tensor cB = make_identity_tensor(make_shape(size<0>(sB), size<1>(sB)));    // (BLK_N,BLK_K) -> (blk_n,blk_k)
-
-    // Repeat the partitioning with identity layouts
-    Tensor tAcA = gmem_thr_copy_a.partition_S(cA);                             // (ACPY,ACPY_M,ACPY_K) -> (blk_m,blk_k)
-    Tensor tBcB = gmem_thr_copy_b.partition_S(cB);                             // (BCPY,BCPY_N,BCPY_K) -> (blk_n,blk_k)
-
-    // Set predicates for m bounds
-    CUTLASS_PRAGMA_UNROLL
-    for (int m = 0; m < size<0>(tApA); ++m) {
-      tApA(m,0) = get<0>(tAcA(0,m,0)) < get<0>(residue_mnk);  // blk_m coord < residue_m
-    }
-    // Set predicates for n bounds
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < size<0>(tBpB); ++n) {
-      tBpB(n,0) = get<0>(tBcB(0,n,0)) < get<1>(residue_mnk);  // blk_n coord < residue_n
-    }
-
-    //
-    // PREFETCH
-    //
-
-    // Clear the rmem tiles to account for predicated off loads
-    clear(tArA);
-    clear(tBrB);
-
-    // Start async loads for 0th k-tile, where we take care of the k residue
-    {
-      Tensor tAgAk = tAgA(_,_,_,*k_tile_iter);
-      CUTLASS_PRAGMA_UNROLL
-      for (int k = 0; k < size<2>(tArA); ++k) {
-        if (get<1>(tAcA(0,0,k)) >= -get<2>(residue_mnk)) {      // blk_k coord < residue_k (gA shifted)
-          copy_if(gmem_tiled_copy_a, tApA(_,k), tAgAk(_,_,k), tArA(_,_,k));
-        }
-      }
-      Tensor tBgBk = tBgB(_,_,_,*k_tile_iter);
-      CUTLASS_PRAGMA_UNROLL
-      for (int k = 0; k < size<2>(tBrB); ++k) {
-        if (get<1>(tBcB(0,0,k)) >= -get<2>(residue_mnk)) {      // blk_k coord < residue_k (gB shifted)
-          copy_if(gmem_tiled_copy_b, tBpB(_,k), tBgBk(_,_,k), tBrB(_,_,k));
-        }
-      }
-      ++k_tile_iter;
-      --k_tile_count;
-    }
-
-    // Tile MMA compute thread partitions and allocate accumulators
-    TiledMma tiled_mma;
-    auto thr_mma = tiled_mma.get_thread_slice(thread_idx);
-    Tensor tCrA  = thr_mma.make_fragment_A(thr_mma.partition_A(sA));           // (MMA,MMA_M,MMA_K)
-    Tensor tCrB  = thr_mma.make_fragment_B(thr_mma.partition_B(sB));           // (MMA,MMA_M,MMA_K)
-
-    CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(accum));                     // MMA_M
-    CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(src_accum));                 // MMA_M
-    CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(accum));                     // MMA_N
-    CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(src_accum));                 // MMA_N
-    CUTE_STATIC_ASSERT_V(size<2>(tCrA) == size<2>(tCrB));                      // MMA_K
-
-    //
-    // Copy Atom retiling
-    //
-
-    auto thr_copy_A       = make_tiled_copy_A(SmemCopyAtomA{}, tiled_mma).get_thread_slice(thread_idx);
-    Tensor tCsA           = thr_copy_A.partition_S(sA);
-    Tensor tCrA_copy_view = thr_copy_A.retile_D(tCrA);
-    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(tCrA_copy_view));            // M
-
-    auto thr_copy_B       = make_tiled_copy_B(SmemCopyAtomB{}, tiled_mma).get_thread_slice(thread_idx);
-    Tensor tCsB           = thr_copy_B.partition_S(sB);
-    Tensor tCrB_copy_view = thr_copy_B.retile_D(tCrB);
-    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<1>(tCrB_copy_view));            // N
-
-    //
-    // Prologue
-    //
-
-    // Copy rmem to smem
-    copy(tArA, tAsA);
-    copy(tBrB, tBsB);
-    // Clear accumulators
-    __syncthreads();
-
-    // Load A, B smem->rmem for k=0
-    copy(tCsA(_,_,0), tCrA_copy_view(_,_,0));
-    copy(tCsB(_,_,0), tCrB_copy_view(_,_,0));
-    //
-    // Mainloop
-    //
-
-    // Size of the k-tiles's outer product mode (k)
-    auto K_BLOCK_MAX = size<2>(tCrA);
-
-    CUTLASS_PRAGMA_NO_UNROLL
-    while (k_tile_count > -1)
-    {
-      // Pipeline the outer products with a static for loop
-      for_each(make_int_sequence<K_BLOCK_MAX>{}, [&] (auto k_block)
-      {
-        if (k_block == K_BLOCK_MAX - 1)
-        {
-          __syncthreads();
-
-          // Copy rmem to smem
-          copy(tArA, tAsA);
-          copy(tBrB, tBsB);
-          __syncthreads();
-        }
-
-        // Load A, B smem->rmem for k+1
-        int k_block_next = (k_block + Int<1>{}) % K_BLOCK_MAX;    // static
-        copy(tCsA(_,_,k_block_next), tCrA_copy_view(_,_,k_block_next));
-        copy(tCsB(_,_,k_block_next), tCrB_copy_view(_,_,k_block_next));
-        if (k_block == 0)
-        {
-          if (k_tile_count <= 0) {
-            clear(tApA);
-            clear(tBpB);
-          }
-          copy_if(gmem_tiled_copy_a, tApA, tAgA(_,_,_,*k_tile_iter), tArA);
-          copy_if(gmem_tiled_copy_b, tBpB, tBgB(_,_,_,*k_tile_iter), tBrB);
-          ++k_tile_iter;
-          --k_tile_count;
-        }
-
-        // transform before compute
-        cute::transform(tCrA(_,_,k_block), TransformA{});
-        cute::transform(tCrB(_,_,k_block), TransformB{});
-
-        // Thread-level register gemm for k
-        // disambiguate gemm (shared with the namespace name)
-        cute::gemm(tiled_mma, accum, tCrA(_,_,k_block), tCrB(_,_,k_block), src_accum);
-      });
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::gemm::collective
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm80_mma_multistage.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm80_mma_multistage.hpp
deleted file mode 100755
index a129b56e3..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm80_mma_multistage.hpp
+++ /dev/null
@@ -1,707 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/dispatch_policy.hpp"
-
-#include "cute/algorithm/functional.hpp"
-#include "cute/atom/mma_atom.hpp"
-#include "cute/algorithm/gemm.hpp"
-#include "cute/tensor_predicate.hpp"
-#include "cute/numeric/arithmetic_tuple.hpp"
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm::collective {
-using namespace cute;
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  int Stages,
-  class TileShape_,
-  class ElementA_,
-  class StrideA_,
-  class ElementB_,
-  class StrideB_,
-  class TiledMma_,
-  class GmemTiledCopyA_,
-  class SmemLayoutAtomA_,
-  class SmemCopyAtomA_,
-  class TransformA_,
-  class GmemTiledCopyB_,
-  class SmemLayoutAtomB_,
-  class SmemCopyAtomB_,
-  class TransformB_>
-struct CollectiveMma<
-    MainloopSm80CpAsyncUnpredicated<Stages>,
-    TileShape_,
-    ElementA_,
-    StrideA_,
-    ElementB_,
-    StrideB_,
-    TiledMma_,
-    GmemTiledCopyA_,
-    SmemLayoutAtomA_,
-    SmemCopyAtomA_,
-    TransformA_,
-    GmemTiledCopyB_,
-    SmemLayoutAtomB_,
-    SmemCopyAtomB_,
-    TransformB_
-  >
-{
-  //
-  // Type Aliases
-  //
-  using DispatchPolicy = MainloopSm80CpAsyncUnpredicated<Stages>;
-  using TileShape = TileShape_;
-  using ElementA = ElementA_;
-  using StrideA = StrideA_;
-  using ElementB = ElementB_;
-  using StrideB = StrideB_;
-  using TiledMma = TiledMma_;
-  using ElementAccumulator = typename TiledMma::ValTypeC;
-  using GmemTiledCopyA = GmemTiledCopyA_;
-  using GmemTiledCopyB = GmemTiledCopyB_;
-  using SmemLayoutAtomA = SmemLayoutAtomA_;
-  using SmemLayoutAtomB = SmemLayoutAtomB_;
-  using SmemCopyAtomA = SmemCopyAtomA_;
-  using SmemCopyAtomB = SmemCopyAtomB_;
-  using TransformA = TransformA_;
-  using TransformB = TransformB_;
-  using ArchTag = typename DispatchPolicy::ArchTag;
-  // Follow the change in TestSmall: TileShape switch to CtaShape 
-  // For sm80 arch, CtaShape should euqal to TileShape
-  using CtaShape_MNK = TileShape;
-
-  static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
-  static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-
-  static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
-  static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-
-  using SmemLayoutA = decltype(tile_to_shape(
-      SmemLayoutAtomA{},
-      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{})));
-  using SmemLayoutB = decltype(tile_to_shape(
-      SmemLayoutAtomB{},
-      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{})));
-
-  static_assert(DispatchPolicy::Stages >= 2, "CpAsync mainloop must have at least 2 stages in the pipeline.");
-
-  struct SharedStorage
-  {
-    cute::array_aligned<ElementA, cute::cosize_v<SmemLayoutA>> smem_a;
-    cute::array_aligned<ElementB, cute::cosize_v<SmemLayoutB>> smem_b;
-  };
-
-  // Host side kernel arguments
-  struct Arguments {
-    ElementA const* ptr_A;
-    StrideA dA;
-    ElementB const* ptr_B;
-    StrideB dB;
-  };
-
-  // Device side kernel params
-  using Params = Arguments;
-
-  //
-  // Methods
-  //
-
-  CollectiveMma() = default;
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& _, Arguments const& args, void* workspace) {
-    (void) workspace;
-    return args;
-  }
-
-  /// Perform a collective-scoped matrix multiply-accumulate
-  template <
-    class FrgTensorD,
-    class TensorA,
-    class TensorB,
-    class FrgTensorC,
-    class KTileIterator,
-    class ResidueMNK
-  >
-  CUTLASS_DEVICE void
-  operator() (
-      FrgTensorD &accum,
-      TensorA gA,
-      TensorB gB,
-      FrgTensorC const &src_accum,
-      KTileIterator k_tile_iter, int k_tile_count,
-      ResidueMNK residue_mnk,
-      int thread_idx,
-      char *smem_buf)
-  {
-    using namespace cute;
-
-    static_assert(is_rmem<FrgTensorD>::value, "D tensor must be rmem resident.");
-    static_assert(is_gmem<TensorA>::value,    "A tensor must be gmem resident.");
-    static_assert(is_gmem<TensorB>::value,    "B tensor must be gmem resident.");
-    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
-    static_assert(cute::rank(SmemLayoutA{}) == 3,
-      "MainloopSm80CpAsync must have a pipeline mode in the smem layout.");
-    static_assert(cute::rank(SmemLayoutB{}) == 3,
-      "MainloopSm80CpAsync must have a pipeline mode in the smem layout.");
-
-    // Construct shared memory tiles
-    SharedStorage& storage = *reinterpret_cast<SharedStorage*>(smem_buf);
-    Tensor sA = make_tensor(make_smem_ptr(storage.smem_a.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE)
-    Tensor sB = make_tensor(make_smem_ptr(storage.smem_b.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE)
-
-    CUTE_STATIC_ASSERT_V(size<0>(gA) == size<0>(sA));                          // BLK_M
-    CUTE_STATIC_ASSERT_V(size<1>(gA) == size<1>(sA));                          // BLK_K
-    CUTE_STATIC_ASSERT_V(size<0>(gB) == size<0>(sB));                          // BLK_N
-    CUTE_STATIC_ASSERT_V(size<1>(gB) == size<1>(sB));                          // BLK_K
-    CUTE_STATIC_ASSERT_V(size<1>(sA) == size<1>(sB));                          // BLK_K
-    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));        // PIPE
-    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));        // PIPE
-
-    // Partition the copying of A and B tiles across the threads
-    GmemTiledCopyA gmem_tiled_copy_A;
-    GmemTiledCopyB gmem_tiled_copy_B;
-    auto gmem_thr_copy_A = gmem_tiled_copy_A.get_slice(thread_idx);
-    auto gmem_thr_copy_B = gmem_tiled_copy_B.get_slice(thread_idx);
-
-    Tensor tAgA = gmem_thr_copy_A.partition_S(gA);                             // (ACPY,ACPY_M,ACPY_K,k)
-    Tensor tAsA = gmem_thr_copy_A.partition_D(sA);                             // (ACPY,ACPY_M,ACPY_K,PIPE)
-    Tensor tBgB = gmem_thr_copy_B.partition_S(gB);                             // (BCPY,BCPY_N,BCPY_K,k)
-    Tensor tBsB = gmem_thr_copy_B.partition_D(sB);                             // (BCPY,BCPY_N,BCPY_K,PIPE)
-
-    //
-    // PREDICATES
-    //
-
-    (void) residue_mnk;
-    //assert(residue_mnk == make_tuple(0,0,0));
-
-    //
-    // PREFETCH
-    //
-
-    // Start async loads for all pipes but the last
-    CUTLASS_PRAGMA_UNROLL
-    for (int k_pipe = 0; k_pipe < DispatchPolicy::Stages-1; ++k_pipe) {
-      copy(gmem_tiled_copy_A, tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,k_pipe));
-      copy(gmem_tiled_copy_B, tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,k_pipe));
-      cp_async_fence();
-      --k_tile_count;
-      if (k_tile_count > 0) { ++k_tile_iter; }
-    }
-
-    //
-    // MMA Atom partitioning
-    //
-
-    // Tile MMA compute thread partitions and allocate accumulators
-    TiledMma tiled_mma;
-    auto thr_mma = tiled_mma.get_thread_slice(thread_idx);
-    Tensor tCrA = thr_mma.partition_fragment_A(sA(_,_,0));                     // (MMA,MMA_M,MMA_K)
-    Tensor tCrB = thr_mma.partition_fragment_B(sB(_,_,0));                     // (MMA,MMA_N,MMA_K)
-
-    CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(accum));                     // MMA_M
-    CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(src_accum));                 // MMA_M
-    CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(accum));                     // MMA_N
-    CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(src_accum));                 // MMA_N
-    CUTE_STATIC_ASSERT_V(size<2>(tCrA) == size<2>(tCrB));                      // MMA_K
-    CUTE_STATIC_ASSERT_V(size(gmem_tiled_copy_A) == size(tiled_mma));
-    CUTE_STATIC_ASSERT_V(size(gmem_tiled_copy_B) == size(tiled_mma));
-
-    //
-    // Copy Atom retiling
-    //
-
-    auto smem_tiled_copy_A = make_tiled_copy_A(SmemCopyAtomA{}, tiled_mma);
-    auto smem_thr_copy_A   = smem_tiled_copy_A.get_thread_slice(thread_idx);
-    Tensor tCsA            = smem_thr_copy_A.partition_S(sA);                  // (CPY,CPY_M,CPY_K,PIPE)
-    Tensor tCrA_copy_view  = smem_thr_copy_A.retile_D(tCrA);                   // (CPY,CPY_M,CPY_K)
-    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(tCrA_copy_view));            // CPY_M
-    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCrA_copy_view));            // CPY_K
-
-    auto smem_tiled_copy_B = make_tiled_copy_B(SmemCopyAtomB{}, tiled_mma);
-    auto smem_thr_copy_B   = smem_tiled_copy_B.get_thread_slice(thread_idx);
-    Tensor tCsB            = smem_thr_copy_B.partition_S(sB);                  // (CPY,CPY_N,CPY_K,PIPE)
-    Tensor tCrB_copy_view  = smem_thr_copy_B.retile_D(tCrB);                   // (CPY,CPY_N,CPY_K)
-    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<1>(tCrB_copy_view));            // CPY_N
-    CUTE_STATIC_ASSERT_V(size<2>(tCsB) == size<2>(tCrB_copy_view));            // CPY_K
-
-    //
-    // PIPELINED MAIN LOOP
-    //
-
-    // Current pipe index in smem to read from
-    int smem_pipe_read  = 0;
-    // Current pipe index in smem to write to
-    int smem_pipe_write = DispatchPolicy::Stages-1;
-
-    Tensor tCsA_p = tCsA(_,_,_,smem_pipe_read);
-    Tensor tCsB_p = tCsB(_,_,_,smem_pipe_read);
-
-    // Size of the register pipeline
-    auto K_BLOCK_MAX = size<2>(tCrA);
-
-    // PREFETCH register pipeline
-    if (K_BLOCK_MAX > 1) {
-      // Wait until our first prefetched tile is loaded in
-      cp_async_wait<DispatchPolicy::Stages-2>();
-      __syncthreads();
-
-      // Prefetch the first rmem from the first k-tile
-      copy(smem_tiled_copy_A, tCsA_p(_,_,Int<0>{}), tCrA_copy_view(_,_,Int<0>{}));
-      copy(smem_tiled_copy_B, tCsB_p(_,_,Int<0>{}), tCrB_copy_view(_,_,Int<0>{}));
-    }
-
-    CUTLASS_PRAGMA_NO_UNROLL
-    while (k_tile_count > -(DispatchPolicy::Stages-1))
-    {
-      // Pipeline the outer products with a static for loop.
-      //
-      // Note, the for_each() function is required here to ensure `k_block` is of type Int<x>.
-      for_each(make_int_sequence<K_BLOCK_MAX>{}, [&] (auto k_block)
-      {
-        if (k_block == K_BLOCK_MAX - 1)
-        {
-          // Slice the smem_pipe_read smem
-          tCsA_p = tCsA(_,_,_,smem_pipe_read);
-          tCsB_p = tCsB(_,_,_,smem_pipe_read);
-
-          // Commit the smem for smem_pipe_read
-          cp_async_wait<DispatchPolicy::Stages-2>();
-          __syncthreads();
-        }
-
-        // Load A, B shmem->regs for k_block+1
-        auto k_block_next = (k_block + Int<1>{}) % K_BLOCK_MAX;  // static
-        copy(smem_tiled_copy_A, tCsA_p(_,_,k_block_next), tCrA_copy_view(_,_,k_block_next));
-        copy(smem_tiled_copy_B, tCsB_p(_,_,k_block_next), tCrB_copy_view(_,_,k_block_next));
-        // Copy gmem to smem before computing gemm on each k-pipe
-        if (k_block == 0)
-        {
-          copy(gmem_tiled_copy_A, tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,smem_pipe_write));
-          copy(gmem_tiled_copy_B, tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,smem_pipe_write));
-          cp_async_fence();
-          
-          // Advance the tile
-          --k_tile_count;
-          if (k_tile_count > 0) { ++k_tile_iter; }
-
-          // Advance the pipe -- Doing it here accounts for K_BLOCK_MAX = 1 (no rmem pipe)
-          smem_pipe_write = smem_pipe_read;
-          ++smem_pipe_read;
-          smem_pipe_read = (smem_pipe_read == DispatchPolicy::Stages) ? 0 : smem_pipe_read;
-        }
-
-        // Transform before compute
-        cute::transform(tCrA(_,_,k_block), TransformA{});
-        cute::transform(tCrB(_,_,k_block), TransformB{});
-        // Thread-level register gemm for k_block
-        cute::gemm(tiled_mma, accum, tCrA(_,_,k_block), tCrB(_,_,k_block), src_accum);
-      });
-
-    }
-
-    cp_async_wait<0>();
-    __syncthreads();
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  int Stages,
-  class ClusterShape_,
-  class TileShape_,
-  class ElementA_,
-  class StrideA_,
-  class ElementB_,
-  class StrideB_,
-  class TiledMma_,
-  class GmemTiledCopyA_,
-  class SmemLayoutAtomA_,
-  class SmemCopyAtomA_,
-  class TransformA_,
-  class GmemTiledCopyB_,
-  class SmemLayoutAtomB_,
-  class SmemCopyAtomB_,
-  class TransformB_
->
-struct CollectiveMma<
-    MainloopSm80CpAsync<
-      Stages,
-      ClusterShape_>,
-    TileShape_,
-    ElementA_,
-    StrideA_,
-    ElementB_,
-    StrideB_,
-    TiledMma_,
-    GmemTiledCopyA_,
-    SmemLayoutAtomA_,
-    SmemCopyAtomA_,
-    TransformA_,
-    GmemTiledCopyB_,
-    SmemLayoutAtomB_,
-    SmemCopyAtomB_,
-    TransformB_
-   >
-{
-  //
-  // Type Aliases
-  //
-  using DispatchPolicy = MainloopSm80CpAsync<
-                          Stages,
-                          ClusterShape_>;
-  using TileShape = TileShape_;
-  // Follow the change in TestSmall: TileShape switch to CtaShape 
-  // In legacy arch, it should be same
-  using CtaShape_MNK = TileShape;
-  using ElementA = ElementA_;
-  using StrideA = StrideA_;
-  using ElementB = ElementB_;
-  using StrideB = StrideB_;
-  using TiledMma = TiledMma_;
-  using ElementAccumulator = typename TiledMma::ValTypeC;  using GmemTiledCopyA = GmemTiledCopyA_;
-  using GmemTiledCopyB = GmemTiledCopyB_;
-  using SmemLayoutAtomA = SmemLayoutAtomA_;
-  using SmemLayoutAtomB = SmemLayoutAtomB_;
-  using SmemCopyAtomA = SmemCopyAtomA_;
-  using SmemCopyAtomB = SmemCopyAtomB_;
-  using TransformA = TransformA_;
-  using TransformB = TransformB_;
-  using ArchTag = typename DispatchPolicy::ArchTag;
-  static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
-  static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-
-  static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
-  static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-
-  using SmemLayoutA = decltype(tile_to_shape(
-      SmemLayoutAtomA{},
-      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{})));
-  using SmemLayoutB = decltype(tile_to_shape(
-      SmemLayoutAtomB{},
-      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{})));
-
-  static_assert(DispatchPolicy::Stages >= 2, "CpAsync mainloop must have at least 2 stages in the pipeline.");
-
-  struct SharedStorage
-  {
-    cute::array_aligned<ElementA, cute::cosize_v<SmemLayoutA>> smem_a;
-    cute::array_aligned<ElementB, cute::cosize_v<SmemLayoutB>> smem_b;
-  };
-
-  // Host side kernel arguments
-  struct Arguments {
-    ElementA const* ptr_A;
-    StrideA dA;
-    ElementB const* ptr_B;
-    StrideB dB;
-  };
-
-  // Device side kernel params
-  using Params = Arguments;
-
-  //
-  // Methods
-  //
-
-  CollectiveMma() = default;
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& _, Arguments const& args, void* workspace) {
-    (void) workspace;
-    return args;
-  }
-
-  /// Perform a collective-scoped matrix multiply-accumulate
-  template <
-    class FrgTensorD,
-    class TensorA,
-    class TensorB,
-    class FrgTensorC,
-    class KTileIterator,
-    class ResidueMNK
-  >
-  CUTLASS_DEVICE void
-  operator() (
-      FrgTensorD &accum,
-      TensorA gA,                   // (BLK_M, BLK_K, K_TILES)
-      TensorB gB,                   // (BLK_N, BLK_K, K_TILES)
-      FrgTensorC const &src_accum,
-      KTileIterator k_tile_iter, int k_tile_count,
-      ResidueMNK residue_mnk,
-      int thread_idx,
-      char *smem_buf)
-  {
-    using namespace cute;
-
-    static_assert(is_rmem<FrgTensorD>::value, "D tensor must be rmem resident.");
-    static_assert(is_gmem<TensorA>::value,    "A tensor must be gmem resident.");
-    static_assert(is_gmem<TensorB>::value,    "B tensor must be gmem resident.");
-    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
-    static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3.");
-    static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3.");
-
-    // Construct shared memory tiles
-    SharedStorage& storage = *reinterpret_cast<SharedStorage*>(smem_buf);
-    Tensor sA = make_tensor(make_smem_ptr(storage.smem_a.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE)
-    Tensor sB = make_tensor(make_smem_ptr(storage.smem_b.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE)
-
-    CUTE_STATIC_ASSERT_V(size<0>(gA) == size<0>(sA));                          // BLK_M
-    CUTE_STATIC_ASSERT_V(size<1>(gA) == size<1>(sA));                          // BLK_K
-    CUTE_STATIC_ASSERT_V(size<0>(gB) == size<0>(sB));                          // BLK_N
-    CUTE_STATIC_ASSERT_V(size<1>(gB) == size<1>(sB));                          // BLK_K
-    CUTE_STATIC_ASSERT_V(size<1>(sA) == size<1>(sB));                          // BLK_K
-    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));        // PIPE
-    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));        // PIPE
-
-    // Shift tensor so residue_k is at origin (Can't read any k_coord < residue_k)
-    // This aligns the tensor with BLK_K for all but the 0th k_tile
-    gA = cute::domain_offset(make_coord(0, get<2>(residue_mnk), 0), gA);
-    gB = cute::domain_offset(make_coord(0, get<2>(residue_mnk), 0), gB);
-
-    // Partition the copying of A and B tiles across the threads
-    GmemTiledCopyA gmem_tiled_copy_A;
-    GmemTiledCopyB gmem_tiled_copy_B;
-    auto gmem_thr_copy_A = gmem_tiled_copy_A.get_slice(thread_idx);
-    auto gmem_thr_copy_B = gmem_tiled_copy_B.get_slice(thread_idx);
-
-    Tensor tAgA = gmem_thr_copy_A.partition_S(gA);                             // (ACPY,ACPY_M,ACPY_K,k)
-    Tensor tAsA = gmem_thr_copy_A.partition_D(sA);                             // (ACPY,ACPY_M,ACPY_K,PIPE)
-    Tensor tBgB = gmem_thr_copy_B.partition_S(gB);                             // (BCPY,BCPY_N,BCPY_K,k)
-    Tensor tBsB = gmem_thr_copy_B.partition_D(sB);                             // (BCPY,BCPY_N,BCPY_K,PIPE)
-
-    //
-    // PREDICATES
-    //
-
-    // Allocate predicate tensors for m and n
-    Tensor tApA = make_tensor<bool>(make_shape(size<1>(tAsA), size<2>(tAsA)), Stride<_1,_0>{});
-    Tensor tBpB = make_tensor<bool>(make_shape(size<1>(tBsB), size<2>(tBsB)), Stride<_1,_0>{});
-
-    // Construct identity layout for sA and sB
-    Tensor cA = make_identity_tensor(make_shape(size<0>(sA), size<1>(sA)));    // (BLK_M,BLK_K) -> (blk_m,blk_k)
-    Tensor cB = make_identity_tensor(make_shape(size<0>(sB), size<1>(sB)));    // (BLK_N,BLK_K) -> (blk_n,blk_k)
-
-    // Repeat the partitioning with identity layouts
-    Tensor tAcA = gmem_thr_copy_A.partition_S(cA);                             // (ACPY,ACPY_M,ACPY_K) -> (blk_m,blk_k)
-    Tensor tBcB = gmem_thr_copy_B.partition_S(cB);                             // (BCPY,BCPY_N,BCPY_K) -> (blk_n,blk_k)
-
-    // Set predicates for m bounds
-    CUTLASS_PRAGMA_UNROLL
-    for (int m = 0; m < size<0>(tApA); ++m) {
-      tApA(m,0) = get<0>(tAcA(0,m,0)) < get<0>(residue_mnk);  // blk_m coord < residue_m
-    }
-    // Set predicates for n bounds
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < size<0>(tBpB); ++n) {
-      tBpB(n,0) = get<0>(tBcB(0,n,0)) < get<1>(residue_mnk);  // blk_n coord < residue_n
-    }
-
-    //
-    // PREFETCH
-    //
-
-    // Clear the smem tiles to account for predicated off loads
-    clear(tAsA);
-    clear(tBsB);
-
-    // Start async loads for 0th k-tile, where we take care of the k residue
-    {
-      constexpr int k_pipe = 0;
-
-      Tensor tAgAk = tAgA(_,_,_,*k_tile_iter);
-      CUTLASS_PRAGMA_UNROLL
-      for (int k = 0; k < size<2>(tAsA); ++k) {
-        if (get<1>(tAcA(0,0,k)) >= -get<2>(residue_mnk)) {      // blk_k coord < residue_k (gA shifted)
-          copy_if(gmem_tiled_copy_A, tApA(_,k), tAgAk(_,_,k), tAsA(_,_,k,k_pipe));
-        }
-      }
-      Tensor tBgBk = tBgB(_,_,_,*k_tile_iter);
-      CUTLASS_PRAGMA_UNROLL
-      for (int k = 0; k < size<2>(tBsB); ++k) {
-        if (get<1>(tBcB(0,0,k)) >= -get<2>(residue_mnk)) {      // blk_k coord < residue_k (gB shifted)
-          copy_if(gmem_tiled_copy_B, tBpB(_,k), tBgBk(_,_,k), tBsB(_,_,k,k_pipe));
-        }
-      }
-      cp_async_fence();
-      ++k_tile_iter;
-      --k_tile_count;
-    }
-
-    // Start async loads for 1st k-tile onwards, no k-residue handling needed
-    CUTLASS_PRAGMA_UNROLL
-    for (int k_pipe = 1; k_pipe < DispatchPolicy::Stages-1; ++k_pipe) {
-      if (k_tile_count <= 0) {
-        clear(tApA);
-        clear(tBpB);
-      }
-      copy_if(gmem_tiled_copy_A, tApA, tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,k_pipe));  // CpAsync
-      copy_if(gmem_tiled_copy_B, tBpB, tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,k_pipe));  // CpAsync
-      cp_async_fence();
-      ++k_tile_iter;
-      --k_tile_count;
-    }
-
-    //
-    // MMA Atom partitioning
-    //
-
-    // Tile MMA compute thread partitions and allocate accumulators
-    TiledMma tiled_mma;
-    auto thr_mma = tiled_mma.get_thread_slice(thread_idx);
-    Tensor tCrA  = thr_mma.partition_fragment_A(sA(_,_,0));                    // (MMA,MMA_M,MMA_K)
-    Tensor tCrB  = thr_mma.partition_fragment_B(sB(_,_,0));                    // (MMA,MMA_N,MMA_K)
-
-    CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(accum));                     // MMA_M
-    CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(src_accum));                 // MMA_M
-    CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(accum));                     // MMA_N
-    CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(src_accum));                 // MMA_N
-    CUTE_STATIC_ASSERT_V(size<2>(tCrA) == size<2>(tCrB));                      // MMA_K
-
-    //
-    // Copy Atom retiling
-    //
-
-    auto smem_tiled_copy_A   = make_tiled_copy_A(SmemCopyAtomA{}, tiled_mma);
-    auto smem_thr_copy_A     = smem_tiled_copy_A.get_thread_slice(thread_idx);
-    Tensor tCsA           = smem_thr_copy_A.partition_S(sA);                   // (CPY,CPY_M,CPY_K,PIPE)
-    Tensor tCrA_copy_view = smem_thr_copy_A.retile_D(tCrA);                    // (CPY,CPY_M,CPY_K)
-    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(tCrA_copy_view));            // CPY_M
-    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCrA_copy_view));            // CPY_K
-
-    auto smem_tiled_copy_B = make_tiled_copy_B(SmemCopyAtomB{}, tiled_mma);
-    auto smem_thr_copy_B   = smem_tiled_copy_B.get_thread_slice(thread_idx);
-    Tensor tCsB              = smem_thr_copy_B.partition_S(sB);                // (CPY,CPY_N,CPY_K,PIPE)
-    Tensor tCrB_copy_view    = smem_thr_copy_B.retile_D(tCrB);                 // (CPY,CPY_N,CPY_K)
-    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<1>(tCrB_copy_view));            // CPY_N
-    CUTE_STATIC_ASSERT_V(size<2>(tCsB) == size<2>(tCrB_copy_view));            // CPY_K
-
-    //
-    // PIPELINED MAIN LOOP
-    //
-
-    // Current pipe index in smem to read from
-    int smem_pipe_read  = 0;
-    // Current pipe index in smem to write to
-    int smem_pipe_write = DispatchPolicy::Stages-1;
-
-    Tensor tCsA_p = tCsA(_,_,_,smem_pipe_read);
-    Tensor tCsB_p = tCsB(_,_,_,smem_pipe_read);
-
-    // Size of the register pipeline
-    auto K_BLOCK_MAX = size<2>(tCrA);
-
-    // PREFETCH register pipeline
-    if (K_BLOCK_MAX > 1) {
-      // Wait until our first prefetched tile is loaded in
-      cp_async_wait<DispatchPolicy::Stages-2>();
-      __syncthreads();
-
-      // Prefetch the first rmem from the first k-tile
-      copy(smem_tiled_copy_A, tCsA_p(_,_,Int<0>{}), tCrA_copy_view(_,_,Int<0>{}));
-      copy(smem_tiled_copy_B, tCsB_p(_,_,Int<0>{}), tCrB_copy_view(_,_,Int<0>{}));
-    }
-
-    CUTLASS_PRAGMA_NO_UNROLL
-    for ( ; k_tile_count > -(DispatchPolicy::Stages-1); --k_tile_count)
-    {
-      // Pipeline the outer products with a static for loop.
-      //
-      // Note, the for_each() function is required here to ensure `k_block` is of type Int<N>.
-      for_each(make_int_sequence<K_BLOCK_MAX>{}, [&] (auto k_block)
-      {
-        if (k_block == K_BLOCK_MAX - 1)
-        {
-          // Slice the smem_pipe_read smem
-          tCsA_p = tCsA(_,_,_,smem_pipe_read);
-          tCsB_p = tCsB(_,_,_,smem_pipe_read);
-
-          // Commit the smem for smem_pipe_read
-          cp_async_wait<DispatchPolicy::Stages-2>();
-          __syncthreads();
-        }
-
-        // Load A, B shmem->regs for k_block+1
-        auto k_block_next = (k_block + Int<1>{}) % K_BLOCK_MAX;  // static
-        copy(smem_tiled_copy_A, tCsA_p(_,_,k_block_next), tCrA_copy_view(_,_,k_block_next));
-        copy(smem_tiled_copy_B, tCsB_p(_,_,k_block_next), tCrB_copy_view(_,_,k_block_next));
-        // Copy gmem to smem before computing gemm on each k-pipe
-        if (k_block == 0)
-        {
-          // Set all predicates to false if we are going to overshoot bounds
-          if (k_tile_count <= 0) {
-            clear(tApA);
-            clear(tBpB);
-          }
-          copy_if(gmem_tiled_copy_A, tApA, tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,smem_pipe_write));
-          copy_if(gmem_tiled_copy_B, tBpB, tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,smem_pipe_write));
-          cp_async_fence();
-          ++k_tile_iter;
-
-          // Advance the pipe -- Doing it here accounts for K_BLOCK_MAX = 1 (no rmem pipe)
-          smem_pipe_write = smem_pipe_read;
-          ++smem_pipe_read;
-          smem_pipe_read = (smem_pipe_read == DispatchPolicy::Stages) ? 0 : smem_pipe_read;
-        }
-
-        // Transform before compute
-        cute::transform(tCrA(_,_,k_block), TransformA{});
-        cute::transform(tCrB(_,_,k_block), TransformB{});
-        // Thread-level register gemm for k_block
-        cute::gemm(tiled_mma, accum, tCrA(_,_,k_block), tCrB(_,_,k_block), src_accum);
-      });
-
-    }
-
-    cp_async_wait<0>();
-    __syncthreads();
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::gemm::collective
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized.hpp
deleted file mode 100755
index 628750fc3..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized.hpp
+++ /dev/null
@@ -1,759 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/dispatch_policy.hpp"
-#include "cutlass/numeric_types.h"
-#include "cutlass/pipeline/pipeline.hpp"
-#include "cutlass/trace.h"
-#include "cutlass/cuda_host_adapter.hpp"
-
-#include "cute/arch/cluster_sm90.hpp"
-#include "cute/arch/copy_sm90.hpp"
-#include "cute/algorithm/functional.hpp"
-#include "cute/atom/mma_atom.hpp"
-#include "cute/algorithm/gemm.hpp"
-#include "cute/tensor_predicate.hpp"
-#include "cute/numeric/arithmetic_tuple.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm::collective {
-using namespace cute;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// WarpSpecialized Mainloop
-template <
-  int Stages,
-  class ClusterShape,
-  class KernelSchedule,
-  class TileShape_,
-  class ElementA_,
-  class StrideA_,
-  class ElementB_,
-  class StrideB_,
-  class TiledMma_,
-  class GmemTiledCopyA_,
-  class SmemLayoutAtomA_,
-  class SmemCopyAtomA_,
-  class TransformA_,
-  class GmemTiledCopyB_,
-  class SmemLayoutAtomB_,
-  class SmemCopyAtomB_,
-  class TransformB_>
-struct CollectiveMma<
-    MainloopSm90ArrayTmaGmmaWarpSpecialized<Stages, ClusterShape, KernelSchedule>,
-    TileShape_,
-    ElementA_,
-    StrideA_,
-    ElementB_,
-    StrideB_,
-    TiledMma_,
-    GmemTiledCopyA_,
-    SmemLayoutAtomA_,
-    SmemCopyAtomA_,
-    TransformA_,
-    GmemTiledCopyB_,
-    SmemLayoutAtomB_,
-    SmemCopyAtomB_,
-    TransformB_>
-{
-  //
-  // Type Aliases
-  //
-  using DispatchPolicy = MainloopSm90ArrayTmaGmmaWarpSpecialized<Stages, ClusterShape, KernelSchedule>;
-  using TileShape = TileShape_;
-  using ElementA = ElementA_;
-  using StrideA = StrideA_;
-  using InternalStrideA = cute::remove_pointer_t<StrideA>;
-  using ElementB = ElementB_;
-  using StrideB = StrideB_;
-  using InternalStrideB = cute::remove_pointer_t<StrideB>;
-  using TiledMma = TiledMma_;
-  using ElementAccumulator = typename TiledMma::ValTypeC;
-  using GmemTiledCopyA = GmemTiledCopyA_;
-  using GmemTiledCopyB = GmemTiledCopyB_;
-  using SmemLayoutAtomA = SmemLayoutAtomA_;
-  using SmemLayoutAtomB = SmemLayoutAtomB_;
-  using SmemCopyAtomA = SmemCopyAtomA_;
-  using SmemCopyAtomB = SmemCopyAtomB_;
-  using TransformA = TransformA_;
-  using TransformB = TransformB_;
-  using ArchTag = typename DispatchPolicy::ArchTag;
-
-  using MainloopPipeline = cutlass::PipelineTmaAsync<DispatchPolicy::Stages>;
-  using PipelineState = cutlass::PipelineState<DispatchPolicy::Stages>;
-
-  using PipelineParams = typename MainloopPipeline::Params;
-  using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{}));
-  static_assert(rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
-  static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-
-  static_assert(rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
-  static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-
-  // Tile along modes in a way that maximizes the TMA box size.
-  using SmemLayoutA = decltype(tile_to_shape(
-      SmemLayoutAtomA{},
-      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
-      cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideA>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
-  using SmemLayoutB = decltype(tile_to_shape(
-      SmemLayoutAtomB{},
-      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
-      cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideB>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
-
-  static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 2 or more.");
-  static_assert(cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeA>::value &&
-                cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeB>::value,
-                "MMA atom must source both A and B operand from smem_desc for this mainloop.");
-  static_assert(cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>,
-      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
-  static_assert(cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>,
-      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
-
-  // TMA converts f32 input to tf32 when copying from GMEM to SMEM
-  // For all other types, cast to size equivalent uint type to avoid any rounding by TMA.
-  static constexpr bool ConvertF32toTF32A = cute::is_same_v<float, ElementA>;
-  static constexpr bool ConvertF32toTF32B = cute::is_same_v<float, ElementB>;
-  using InternalElementA = cute::conditional_t<ConvertF32toTF32A, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementA>>>;
-  using InternalElementB = cute::conditional_t<ConvertF32toTF32B, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementB>>>;
-
-  // Assumption: StrideA is congruent with Problem_MK
-  using TMA_A = decltype(make_tma_copy(
-      GmemTiledCopyA{},
-      make_tensor(static_cast<InternalElementA const*>(nullptr), repeat_like(InternalStrideA{}, int32_t(0)), InternalStrideA{}),
-      SmemLayoutA{}(_,_,cute::Int<0>{}),
-      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})),
-      size<1>(ClusterShape{})));  // mcast along N mode for this M load, if any
-  // Assumption: StrideB is congruent with Problem_NK
-  using TMA_B = decltype(make_tma_copy(
-      GmemTiledCopyB{},
-      make_tensor(static_cast<InternalElementB const*>(nullptr), repeat_like(InternalStrideB{}, int32_t(0)), InternalStrideB{}),
-      SmemLayoutB{}(_,_,cute::Int<0>{}),
-      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{})),
-      size<0>(ClusterShape{}))); // mcast along M mode for this N load, if any
-
-  struct SharedStorage {
-    struct TensorStorage : cute::aligned_struct<128, _0> {
-      cute::array_aligned<typename TiledMma::ValTypeA, cute::cosize_v<SmemLayoutA>> smem_A;
-      cute::array_aligned<typename TiledMma::ValTypeB, cute::cosize_v<SmemLayoutB>> smem_B;
-    } tensors;
-
-    struct TensorMapStorage : cute::aligned_struct<128, _0> {
-      cute::TmaDescriptor smem_tensormap_A;
-      cute::TmaDescriptor smem_tensormap_B;
-    } tensormaps;
-
-    using PipelineStorage = typename MainloopPipeline::SharedStorage;
-    PipelineStorage pipeline;
-  };
-  using TensorStorage = typename SharedStorage::TensorStorage;
-  using TensorMapStorage = typename SharedStorage::TensorMapStorage;
-  using PipelineStorage = typename SharedStorage::PipelineStorage;
-
-  static constexpr bool IsGroupedGemmKernel = !cute::is_same_v<InternalStrideA, StrideA>;
-
-  // Host side kernel arguments
-  struct Arguments {
-    ElementA const** ptr_A;
-    StrideA dA;
-    ElementB const** ptr_B;
-    StrideB dB;
-  };
-
-  // Device side kernel params
-  struct Params {
-    TMA_A tma_load_a;
-    TMA_B tma_load_b;
-    uint32_t tma_transaction_bytes = TmaTransactionBytes;
-    void* tensormaps;
-    InternalElementA const** ptr_A;
-    StrideA dA;
-    InternalElementB const** ptr_B;
-    StrideB dB;
-  };
-
-  //
-  // Methods
-  //
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(
-      ProblemShape problem_shapes,
-      Arguments const& args,
-      void* workspace) {
-    // These tensor shapes (only applicable for grouped gemm) and pointers are only used to create tensormap/tma desc.
-    // These will be replaced with correct values before the initial tma load.
-    auto init_shape = repeat_like(typename ProblemShape::UnderlyingProblemShape{}, int32_t(1));
-    auto init_M = get<0>(init_shape);
-    auto init_N = get<1>(init_shape);
-    auto init_K = get<2>(init_shape);
-    // Batches/Groups are managed by using appropriate pointers to input matrices
-    const uint32_t mock_L = 1;
-    InternalElementA const* ptr_A_first_batch = reinterpret_cast<InternalElementA const*>(args.ptr_A);
-    InternalElementB const* ptr_B_first_batch = reinterpret_cast<InternalElementB const*>(args.ptr_B);
-
-    InternalStrideA stride_a;
-    InternalStrideB stride_b;
-    if constexpr (IsGroupedGemmKernel) {
-      // Strides for Grouped Gemm will be replaced prior to the first access regardless.
-      stride_a = InternalStrideA{};
-      stride_b = InternalStrideB{};
-    }
-    else {
-      // Tensor shapes for Ptr-Array are initialized correctly only here.
-      auto problem_shape_MNK = problem_shapes.get_host_problem_shape(0);
-      init_M = get<0>(problem_shape_MNK);
-      init_N = get<1>(problem_shape_MNK);
-      init_K = get<2>(problem_shape_MNK);
-
-      stride_a = args.dA;
-      stride_b = args.dB;
-    }
-    Tensor tensor_a = make_tensor(ptr_A_first_batch, make_layout(make_shape(init_M,init_K,mock_L), stride_a));
-    Tensor tensor_b = make_tensor(ptr_B_first_batch, make_layout(make_shape(init_N,init_K,mock_L), stride_b));
-    TMA_A tma_load_a = make_tma_copy(
-        GmemTiledCopyA{},
-        tensor_a,
-        SmemLayoutA{}(_,_,cute::Int<0>{}),
-        make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})),
-        size<1>(ClusterShape{})); // mcast along N mode for this M load, if any
-    TMA_B tma_load_b = make_tma_copy(
-        GmemTiledCopyB{},
-        tensor_b,
-        SmemLayoutB{}(_,_,cute::Int<0>{}),
-        make_shape(shape<1>(TileShape{}), shape<2>(TileShape{})),
-        size<0>(ClusterShape{})); // mcast along M mode for this N load, if any
-
-    void* tensormaps = workspace;
-
-    return {
-      tma_load_a,
-      tma_load_b,
-      TmaTransactionBytes,
-      tensormaps,
-      reinterpret_cast<InternalElementA const**>(args.ptr_A),
-      args.dA,
-      reinterpret_cast<InternalElementB const**>(args.ptr_B),
-      args.dB
-    };
-  }
-
-  template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args, int sm_count) {
-    constexpr uint32_t NumInputTensors = 2;
-    constexpr size_t SizeOfCuTensorMap = sizeof(cute::TmaDescriptor);
-    // Allocate gmem space for input tensormaps per each SM, A tensormap copies followed by B tensormap copies
-    return (NumInputTensors * SizeOfCuTensorMap * sm_count);
-  }
-
-  template <class ProblemShape>
-  static cutlass::Status
-  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, CudaHostAdapter* cuda_adapter = nullptr) {
-    return cutlass::Status::kSuccess;
-  }
-
-  template<class ProblemShape>
-  static bool
-  can_implement(
-      ProblemShape problem_shapes,
-      Arguments const& args) {
-    constexpr int tma_alignment_bits = 128;
-    constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
-    constexpr int min_tma_aligned_elements_B = tma_alignment_bits / cutlass::sizeof_bits<ElementB>::value;
-
-    bool implementable = true;
-    if (problem_shapes.is_host_problem_shape_available()) {
-      // Check alignment for all problem sizes
-      for (int i = 0; i < problem_shapes.groups(); i++) {
-        auto problem_shape_MNKL = append<4>(problem_shapes.get_host_problem_shape(i), 1);
-        auto [M,N,K,L] = problem_shape_MNKL;
-        implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_A>(cute::make_shape(M,K,L), InternalStrideA{});
-        implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_B>(cute::make_shape(N,K,L), InternalStrideB{});
-      }
-    }
-
-    if (!implementable) {
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
-    }
-    return implementable;
-  }
-
-  static constexpr int K_PIPE_MAX = DispatchPolicy::Stages;
-  static constexpr int K_PIPE_MMAS = 1;
-  static constexpr uint32_t TmaTransactionBytes =
-        cutlass::bits_to_bytes(size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast<uint32_t>(sizeof_bits<ElementA>::value))+
-        cutlass::bits_to_bytes(size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast<uint32_t>(sizeof_bits<ElementB>::value));
-
-  // Set up the data needed by this collective for load and mma.
-  // Returns a tuple of tensors. The collective and the kernel layer have the contract that the
-  // returned tuple must contain at least two elements, with the first two elements being:
-  // gA_mkl - The tma tensor, A after a local tile so it has shape  (BLK_M,BLK_K,m,k,l)
-  // gB_nkl - The tma tensor, B after a local tile so it has shape  (BLK_N,BLK_K,n,k,l)
-  // The rest of the tensors can be specified as needed by this collective.
-  template <class ProblemShape_MNKL>
-  CUTLASS_DEVICE auto
-  load_init(ProblemShape_MNKL const& problem_shape_MNKL, Params const& mainloop_params) const {
-    using X = Underscore;
-    // Separate out problem shape for convenience
-    auto [M,N,K,L] = problem_shape_MNKL;
-    const int32_t mock_L = 1;
-
-    // TMA requires special handling of strides to deal with coord codomain mapping
-    // Represent the full tensors -- get these from TMA
-    Tensor mA_mkl = mainloop_params.tma_load_a.get_tma_tensor(make_shape(M,K,mock_L));                            // (m,k,l)
-    Tensor mB_nkl = mainloop_params.tma_load_b.get_tma_tensor(make_shape(N,K,mock_L));                            // (n,k,l)
-
-    // Make tiled views, defer the slice
-    Tensor gA_mkl = local_tile(mA_mkl, TileShape{}, make_coord(_,_,_), Step<_1, X,_1>{});  // (BLK_M,BLK_K,m,k,l)
-    Tensor gB_nkl = local_tile(mB_nkl, TileShape{}, make_coord(_,_,_), Step< X,_1,_1>{});  // (BLK_N,BLK_K,n,k,l)
-
-    return cute::make_tuple(gA_mkl, gB_nkl);
-  }
-
-  // Perform a collective-scoped matrix multiply-accumulate
-  // Producer Perspective
-  template <
-    class TensorA, class TensorB,
-    class TensorMapA, class TensorMapB,
-    class KTileIterator, class BlockCoord
-  >
-  CUTLASS_DEVICE void
-  load(
-      Params const& mainloop_params,
-      MainloopPipeline pipeline, 
-      PipelineState smem_pipe_write,
-      cute::tuple<TensorA, TensorB> const& load_inputs,
-      cute::tuple<TensorMapA, TensorMapB> const& input_tensormaps,
-      BlockCoord const& blk_coord,
-      KTileIterator k_tile_iter, int k_tile_count,
-      int thread_idx,
-      uint32_t block_rank_in_cluster,
-      TensorStorage& shared_tensors) {
-    int lane_predicate = cute::elect_one_sync();
-
-    if (lane_predicate) {
-      Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});        // (BLK_M,BLK_K,PIPE)
-      Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});        // (BLK_N,BLK_K,PIPE)
-
-      //
-      // Prepare the TMA loads for A and B
-      //
-
-      constexpr uint32_t cluster_shape_x = get<0>(DispatchPolicy::ClusterShape());
-      uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x};
-
-      Tensor gA_mkl = get<0>(load_inputs);
-      Tensor gB_nkl = get<1>(load_inputs);
-
-      auto block_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y);
-      auto block_tma_b = mainloop_params.tma_load_b.get_slice(cluster_local_block_id.x);
-
-      // Partition the inputs based on the current block coordinates.
-      auto [m_coord, n_coord, k_coord, l_coord] = blk_coord;
-      Tensor gA = gA_mkl(_,_,m_coord,_,l_coord);                                                     // (BLK_M,BLK_K,k)
-      Tensor gB = gB_nkl(_,_,n_coord,_,l_coord);                                                     // (BLK_N,BLK_K,k)
-
-      // Applies the mapping from block_tma_a
-      Tensor tAgA = block_tma_a.partition_S(gA);                                                 // (TMA,TMA_M,TMA_K,k)
-      Tensor tAsA = block_tma_a.partition_D(sA);                                              // (TMA,TMA_M,TMA_K,PIPE)
-
-      Tensor tBgB = block_tma_b.partition_S(gB);                                                 // (TMA,TMA_N,TMA_K,k)
-      Tensor tBsB = block_tma_b.partition_D(sB);                                              // (TMA,TMA_N,TMA_K,PIPE)
-
-      uint16_t mcast_mask_a = 0;
-      uint16_t mcast_mask_b = 0;
-
-      // Issue TmaLoads
-      // Maps the tile -> block, value
-      if constexpr (cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>) {
-        auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{}; // (m,n) -> block_id
-        for (int n = 0; n < size<1>(block_layout); ++n) {
-          mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{}));
-        }
-      }
-
-      if constexpr (cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>) {
-        auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{}; // (m,n) -> block_id
-        for (int m = 0; m < size<0>(block_layout); ++m) {
-          mcast_mask_b |= (uint16_t(1) << block_layout(m,cluster_local_block_id.y,Int<0>{}));
-        }
-      }
-
-      // Mainloop
-      CUTLASS_PRAGMA_NO_UNROLL
-      for ( ; k_tile_count > 0; --k_tile_count)
-      {
-        // LOCK smem_pipe_write for _writing_
-        pipeline.producer_acquire(smem_pipe_write);
-
-        //
-        // Copy gmem to smem for *k_tile_iter
-        //
-
-        using BarrierType = typename MainloopPipeline::ProducerBarrierType;
-        BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write);
-
-        int write_stage = smem_pipe_write.index();
-        copy(mainloop_params.tma_load_a.with(get<0>(input_tensormaps), *tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage));
-        copy(mainloop_params.tma_load_b.with(get<1>(input_tensormaps), *tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage));
-        ++k_tile_iter;
-
-        // Advance smem_pipe_write
-        ++smem_pipe_write;
-      }
-    }
-  }
-
-  // Perform a Producer Epilogue to prevent early exit of blocks in a Cluster
-  CUTLASS_DEVICE void
-  load_tail(MainloopPipeline pipeline, PipelineState smem_pipe_write) {
-    int lane_predicate = cute::elect_one_sync();
-
-    // Issue the epilogue waits
-    if (lane_predicate) {
-      // This helps avoid early exit of blocks in Cluster.
-      // Waits for all stages to either be released (all 
-      // Consumer UNLOCKs), or if the stage was never used
-      // then it would just be acquired since the phase was 
-      // still inverted from make_producer_start_state.
-      pipeline.producer_tail(smem_pipe_write);
-    }
-  }
-
-  /// Perform a collective-scoped matrix multiply-accumulate
-  /// Consumer Perspective
-  template <
-    class FrgTensorC
-  >
-  CUTLASS_DEVICE void
-  mma(MainloopPipeline pipeline,
-      PipelineState smem_pipe_read,
-      FrgTensorC& accum,
-      int k_tile_count,
-      int thread_idx,
-      TensorStorage& shared_tensors,
-      Params const& mainloop_params) {
-    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
-    static_assert(rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3.");
-    static_assert(rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3.");
-    static_assert(cute::is_void_v<SmemCopyAtomA>,
-      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
-    static_assert(cute::is_void_v<SmemCopyAtomB>,
-      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
-
-    Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});          // (BLK_M,BLK_K,PIPE)
-    Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});          // (BLK_N,BLK_K,PIPE)
-
-    //
-    // Define C accumulators and A/B partitioning
-    //
-
-    // Layout of warp group to thread mapping
-
-    static_assert(stride<0>(typename TiledMma::ALayout{}) == 0 and 
-                  stride<0>(typename TiledMma::BLayout{}) == 0 and
-                  size<0>(typename TiledMma::ALayout{}) == NumThreadsPerWarpGroup and
-                  size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup, 
-                  "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
-
-    constexpr int MmaWarpGroups = size(TiledMma{}) / NumThreadsPerWarpGroup;
-    Layout warp_group_thread_layout = make_layout(Int<MmaWarpGroups>{},
-                                                  Int<NumThreadsPerWarpGroup>{});
-
-    int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / NumThreadsPerWarpGroup, 0);
-
-    TiledMma tiled_mma;
-    auto thread_mma = tiled_mma.get_slice(warp_group_thread_layout(warp_group_idx));
-
-    Tensor tCsA = thread_mma.partition_A(sA);                                                 // (MMA,MMA_M,MMA_K,PIPE)
-    Tensor tCsB = thread_mma.partition_B(sB);                                                 // (MMA,MMA_N,MMA_K,PIPE)
-
-    // Allocate "fragments/descriptors"
-    Tensor tCrA = thread_mma.make_fragment_A(tCsA);                                           // (MMA,MMA_M,MMA_K,PIPE)
-    Tensor tCrB = thread_mma.make_fragment_B(tCsB);                                           // (MMA,MMA_N,MMA_K,PIPE)
-
-    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum));                                                         // M
-    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum));                                                         // N
-    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB));                                                          // K
-    CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB));                                                       // PIPE
-    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));                                         // PIPE
-    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));                                         // PIPE
-
-    //
-    // PIPELINED MAIN LOOP
-    //
-    static_assert((0 <= K_PIPE_MMAS) && (K_PIPE_MMAS <  K_PIPE_MAX),
-        "ERROR : Incorrect number of MMAs in flight");
-
-    // We release buffers to producer warps(dma load) with some mmas in flight
-    PipelineState smem_pipe_release = smem_pipe_read;
-
-    // Prologue GMMAs
-    int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
-    assert(k_tile_count >= 1);
-    tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
-    warpgroup_fence_operand(accum);
-    {
-      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
-      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
-      pipeline.consumer_wait(smem_pipe_read, barrier_token);
-
-      int read_stage = smem_pipe_read.index();
-      warpgroup_arrive();
-      // Unroll the K mode manually to set scale D to 1
-      CUTLASS_PRAGMA_UNROLL
-      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
-        // (V,M,K) x (V,N,K) => (V,M,N)
-        cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accum);
-        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
-      }
-
-      warpgroup_commit_batch();
-
-      ++smem_pipe_read;
-    }
-
-    warpgroup_fence_operand(accum);
-    CUTLASS_PRAGMA_UNROLL
-    for (int k_tile_prologue = prologue_mma_count - 1; k_tile_prologue > 0; --k_tile_prologue)
-    {
-      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
-      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
-      pipeline.consumer_wait(smem_pipe_read, barrier_token);
-
-      int read_stage = smem_pipe_read.index();
-      warpgroup_arrive();
-      cute::gemm(tiled_mma, tCrA(_,_,_,read_stage), tCrB(_,_,_,read_stage), accum); // (V,M,K) x (V,N,K) => (V,M,N)
-      warpgroup_commit_batch();
-
-      ++smem_pipe_read;
-    }
-
-    warpgroup_fence_operand(accum);
-    // Mainloop GMMAs
-    k_tile_count -= prologue_mma_count;
-
-    CUTLASS_PRAGMA_NO_UNROLL
-    for ( ; k_tile_count > 0; --k_tile_count)
-    {
-      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
-      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
-      pipeline.consumer_wait(smem_pipe_read, barrier_token);
-
-      //
-      // Compute on k_tile
-      //
-
-      int read_stage = smem_pipe_read.index();
-      warpgroup_fence_operand(accum);
-      warpgroup_arrive();
-      cute::gemm(tiled_mma, tCrA(_,_,_,read_stage), tCrB(_,_,_,read_stage), accum); // (V,M,K) x (V,N,K) => (V,M,N)
-      warpgroup_commit_batch();
-
-      /// Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure smem_pipe_write is consumed
-      warpgroup_wait<K_PIPE_MMAS>();
-      warpgroup_fence_operand(accum);
-
-      // UNLOCK smem_pipe_release, done _computing_ on it
-      pipeline.consumer_release(smem_pipe_release);
-
-      // Advance smem_pipe_read and smem_pipe_release
-      ++smem_pipe_read;
-      ++smem_pipe_release;
-    }
-
-    warpgroup_fence_operand(accum);
-  }
-
-  /// Perform a Consumer Epilogue to release all buffers
-  CUTLASS_DEVICE void
-  mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) {
-    // Prologue GMMAs
-    int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
-    k_tile_count -= prologue_mma_count;
-
-    smem_pipe_release.advance(k_tile_count);
-    
-    // Wait on all GMMAs to complete
-    warpgroup_wait<0>();
-
-    for (int count = 0; count < prologue_mma_count; ++count) {
-      pipeline.consumer_release(smem_pipe_release);                 // UNLOCK smem_pipe_release, done _computing_ on it
-      ++smem_pipe_release;
-    }
-  }
-
-  //
-  // Methods to perform different parts of TMA/Tensormap modifications
-  //
-
-  CUTLASS_DEVICE auto
-  tensormaps_init(
-      Params const& mainloop_params,
-      TensorMapStorage& shared_tensormaps,
-      int32_t sm_count,
-      int32_t sm_idx) {
-    cute::TmaDescriptor* gmem_tensormap = reinterpret_cast<cute::TmaDescriptor*>(mainloop_params.tensormaps);
-
-    cute::TmaDescriptor* tma_desc_a = &gmem_tensormap[sm_idx];
-    cute::TmaDescriptor* tma_desc_b = &gmem_tensormap[sm_idx + sm_count];
-
-    if (cute::elect_one_sync()) {
-      // Bringing tensormaps from params to smem for modification later
-      Tensor pA_tensormap = make_tensor(mainloop_params.tma_load_a.get_tma_descriptor(), Int<1>{}, Int<1>{});
-      Tensor sA_tensormap = make_tensor(make_smem_ptr(&shared_tensormaps.smem_tensormap_A), Int<1>{}, Int<1>{});
-      Tensor pB_tensormap = make_tensor(mainloop_params.tma_load_b.get_tma_descriptor(), Int<1>{}, Int<1>{});
-      Tensor sB_tensormap = make_tensor(make_smem_ptr(&shared_tensormaps.smem_tensormap_B), Int<1>{}, Int<1>{});
-
-      copy(recast<uint128_t>(pA_tensormap), recast<uint128_t>(sA_tensormap));
-      copy(recast<uint128_t>(pB_tensormap), recast<uint128_t>(sB_tensormap));
-    }
-    __syncwarp();
-
-    return cute::make_tuple(tma_desc_a, tma_desc_b);
-  }
-
-  // Replace address for the global tensor (to be done by single thread)
-  CUTLASS_DEVICE
-  void
-  tensormaps_replace_global_address(
-      TensorMapStorage& shared_tensormaps,
-      Params const& mainloop_params,
-      int32_t next_batch) {
-    // Replacing global_address for the next batch
-    cute::tma_descriptor_replace_addr_in_shared_mem(shared_tensormaps.smem_tensormap_A,
-                                                    mainloop_params.ptr_A[next_batch]);
-    cute::tma_descriptor_replace_addr_in_shared_mem(shared_tensormaps.smem_tensormap_B,
-                                                    mainloop_params.ptr_B[next_batch]);
-  }
-
-  // Replace dim and strides for the global tensor - used only for Grouped GEMM (to be done by single thread)
-  template <class ProblemShape_MNKL>
-  CUTLASS_DEVICE
-  void
-  tensormaps_replace_global_tensor_properties(
-      TensorMapStorage& shared_tensormaps,
-      Params const& mainloop_params,
-      int32_t next_group,
-      ProblemShape_MNKL problem_shape_mnkl) {
-    const uint32_t M = get<0>(problem_shape_mnkl);
-    const uint32_t N = get<1>(problem_shape_mnkl);
-    const uint32_t K = get<2>(problem_shape_mnkl);
-    // Replace all dims for consistency
-    constexpr int MaxTensorRank = 5;
-    cute::array<uint32_t, MaxTensorRank> prob_shape_A  = {1,1,1,1,1};
-    cute::array<uint64_t, MaxTensorRank> prob_stride_A = {0,0,0,0,0};
-    cute::array<uint32_t, MaxTensorRank> prob_shape_B  = {1,1,1,1,1};
-    cute::array<uint64_t, MaxTensorRank> prob_stride_B = {0,0,0,0,0};
-
-    InternalElementA const* ptr_A = nullptr;
-    Tensor tensor_a = make_tensor(ptr_A, make_shape(M,K,Int<1>{}), mainloop_params.dA[next_group]);
-
-    InternalElementB const* ptr_B = nullptr;
-    Tensor tensor_b = make_tensor(ptr_B, make_shape(N,K,Int<1>{}), mainloop_params.dB[next_group]);
-
-    cute::detail::fill_tma_gmem_shape_stride(mainloop_params.tma_load_a, tensor_a, 
-                                             prob_shape_A, prob_stride_A);
-    cute::detail::fill_tma_gmem_shape_stride(mainloop_params.tma_load_b, tensor_b, 
-                                             prob_shape_B, prob_stride_B);
-
-    // Convert strides to byte strides
-    for (uint64_t& stride : prob_stride_A) {
-      stride = (stride * sizeof_bits_v<InternalElementA>) / 8;
-    }
-    for (uint64_t& stride : prob_stride_B) {
-      stride = (stride * sizeof_bits_v<InternalElementB>) / 8;
-    }
-
-    cute::tma_descriptor_replace_dims_strides_in_shared_mem(shared_tensormaps.smem_tensormap_A,
-                                                            prob_shape_A,
-                                                            prob_stride_A);
-    cute::tma_descriptor_replace_dims_strides_in_shared_mem(shared_tensormaps.smem_tensormap_B,
-                                                            prob_shape_B,
-                                                            prob_stride_B);
-  }
-
-  template <class TensorMapA, class TensorMapB, class ProblemShape_MNKL>
-  CUTLASS_DEVICE
-  void
-  tensormaps_perform_update(
-      TensorMapStorage& shared_tensormaps,
-      Params const& mainloop_params,
-      cute::tuple<TensorMapA, TensorMapB> const& input_tensormaps,
-      ProblemShape_MNKL problem_shape_mnkl,
-      int32_t next_batch) {
-    if (cute::elect_one_sync()) {
-      // Replacing global_address for the next batch
-      tensormaps_replace_global_address(shared_tensormaps, mainloop_params, next_batch);
-
-      if constexpr (IsGroupedGemmKernel) {
-        // Replacing global dims and strides for the next batch
-        tensormaps_replace_global_tensor_properties(shared_tensormaps,
-          mainloop_params, next_batch, problem_shape_mnkl);
-      }
-    }
-  }
-
-  template <class TensorMapA, class TensorMapB>
-  CUTLASS_DEVICE
-  void
-  tensormaps_cp_fence_release (
-      TensorMapStorage& shared_tensormaps,
-      cute::tuple<TensorMapA, TensorMapB> const& input_tensormaps) {
-    // Entire warp must do this (i.e. it's aligned)
-    tma_descriptor_cp_fence_release(get<0>(input_tensormaps), shared_tensormaps.smem_tensormap_A);
-    tma_descriptor_cp_fence_release(get<1>(input_tensormaps), shared_tensormaps.smem_tensormap_B);
-  }
-
-  // The entire warp must call this function collectively (that is, the instructions are aligned)
-  template <class TensorMapA, class TensorMapB>
-  CUTLASS_DEVICE
-  void
-  tensormaps_fence_acquire(cute::tuple<TensorMapA, TensorMapB> const& input_tensormaps) {
-    cute::tma_descriptor_fence_acquire(get<0>(input_tensormaps));
-    cute::tma_descriptor_fence_acquire(get<1>(input_tensormaps));
-  }
-
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::gemm::collective
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_rs_warpspecialized.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_rs_warpspecialized.hpp
deleted file mode 100755
index 69b31fdab..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_rs_warpspecialized.hpp
+++ /dev/null
@@ -1,677 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/dispatch_policy.hpp"
-#include "cutlass/numeric_types.h"
-#include "cutlass/pipeline/pipeline.hpp"
-#include "cutlass/transform/collective/sm90_wgmma_transpose.hpp"
-#include "cutlass/trace.h"
-
-#include "cute/arch/cluster_sm90.hpp"
-#include "cute/arch/copy_sm90.hpp"
-#include "cute/algorithm/functional.hpp"
-#include "cute/atom/mma_atom.hpp"
-#include "cute/algorithm/gemm.hpp"
-#include "cute/tensor_predicate.hpp"
-#include "cute/numeric/arithmetic_tuple.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm::collective {
-using namespace cute;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// WarpSpecialized Mainloop
-template <
-  int Stages,
-  class ClusterShape_,
-  class TileShape_,
-  class KernelSchedule,
-  class ElementA_,
-  class StrideA_,
-  class ElementB_,
-  class StrideB_,
-  class TiledMma_,
-  class GmemTiledCopyA_,
-  class SmemLayoutAtomA_,
-  class SmemCopyAtomA_,
-  class TransformA_,
-  class GmemTiledCopyB_,
-  class SmemLayoutAtomB_,
-  class SmemCopyAtomB_,
-  class TransformB_>
-struct CollectiveMma<
-    MainloopSm90CpAsyncGmmaRmemAWarpSpecialized<Stages,ClusterShape_,KernelSchedule>,
-    TileShape_,
-    ElementA_,
-    StrideA_,
-    ElementB_,
-    StrideB_,
-    TiledMma_,
-    GmemTiledCopyA_,
-    SmemLayoutAtomA_,
-    SmemCopyAtomA_,
-    TransformA_,
-    GmemTiledCopyB_,
-    SmemLayoutAtomB_,
-    SmemCopyAtomB_,
-    TransformB_>
-{
-  //
-  // Type Aliases
-  //
-  using DispatchPolicy = MainloopSm90CpAsyncGmmaRmemAWarpSpecialized<Stages,ClusterShape_,KernelSchedule>;
-  using TileShape = TileShape_;
-  using ClusterShape = ClusterShape_;
-  using ElementA = ElementA_;
-  using StrideA = StrideA_;
-  using ElementB = ElementB_;
-  using StrideB = StrideB_;
-  using TiledMma = TiledMma_;
-  using ElementAccumulator = typename TiledMma::ValTypeC;
-  using GmemTiledCopyA = GmemTiledCopyA_;
-  using GmemTiledCopyB = GmemTiledCopyB_;
-  using SmemLayoutAtomA = SmemLayoutAtomA_;
-  using SmemLayoutAtomB = SmemLayoutAtomB_;
-  using SmemCopyAtomA = SmemCopyAtomA_;
-  using SmemCopyAtomB = SmemCopyAtomB_;
-
-  using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{}));
-  // Swap and transpose A/B for A k-major layout and B mn-major layout since WGMMA is k-major only (e.g. tf32, Fp32, Int8, Fp8 WGMMA)
-  static constexpr bool IsLayoutAkBmn =
-    cute::is_same_v<gemm::detail::StrideToLayoutTagA_t<StrideA>, layout::RowMajor> &&
-    cute::is_same_v<gemm::detail::StrideToLayoutTagB_t<StrideB>, layout::RowMajor>;
-
-  static constexpr bool IsInputSizeTwoBytes = sizeof(ElementA) == 2 && sizeof(ElementB) == 2;
-  static constexpr bool SwapAB =  !IsInputSizeTwoBytes && IsLayoutAkBmn;
-  using InternalGmemTiledCopyA = cute::conditional_t<!SwapAB, GmemTiledCopyA, GmemTiledCopyB>;
-  using InternalGmemTiledCopyB = cute::conditional_t<!SwapAB, GmemTiledCopyB, GmemTiledCopyA>;
-  using InternalSmemLayoutAtomA = cute::conditional_t<!SwapAB, SmemLayoutAtomA, SmemLayoutAtomB>;
-  using InternalSmemLayoutAtomB = cute::conditional_t<!SwapAB, SmemLayoutAtomB, SmemLayoutAtomA>;
-  using InternalSmemCopyAtomA   = cute::conditional_t<!SwapAB, SmemCopyAtomA, SmemCopyAtomB>;
-  using InternalSmemCopyAtomB   = cute::conditional_t<!SwapAB, SmemCopyAtomB, SmemCopyAtomA>;
-  // TMA converts f32 input to tf32 when copying from GMEM to SMEM
-  // For all other types, cast to size equivalent uint type to avoid any rounding by TMA.
-  static constexpr bool ConvertF32toTF32A = cute::is_same_v<float, ElementA>;
-  static constexpr bool ConvertF32toTF32B = cute::is_same_v<float, ElementB>;
-  using ConvertedElementA = cute::conditional_t<ConvertF32toTF32A, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementA>>>;
-  using ConvertedElementB = cute::conditional_t<ConvertF32toTF32B, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementB>>>;
-  using InternalElementA = cute::conditional_t<!SwapAB, ConvertedElementA, ConvertedElementB>;
-  using InternalElementB = cute::conditional_t<!SwapAB, ConvertedElementB, ConvertedElementA>;
-  using InternalStrideA  = cute::conditional_t<!SwapAB, StrideA, StrideB>;
-  using InternalStrideB  = cute::conditional_t<!SwapAB, StrideB, StrideA>;
-
-  using TransformA = TransformA_;
-  using TransformB = TransformB_;
-  using ArchTag = typename DispatchPolicy::ArchTag;
-
-  using MainloopPipeline = cutlass::PipelineAsync<DispatchPolicy::Stages>;
-  using PipelineState    = typename MainloopPipeline::PipelineState;
-  using PipelineParams   = typename MainloopPipeline::Params;
-
-  static_assert(cute::rank(InternalSmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
-  static_assert((size<0>(TileShape{}) % size<0>(InternalSmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-  static_assert((size<2>(TileShape{}) % size<1>(InternalSmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-
-  static_assert(cute::rank(InternalSmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
-  static_assert((size<1>(TileShape{}) % size<0>(InternalSmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-  static_assert((size<2>(TileShape{}) % size<1>(InternalSmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-
-  using SmemLayoutA = decltype(tile_to_shape(
-      InternalSmemLayoutAtomA{},
-      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{})));
-  using SmemLayoutB = decltype(tile_to_shape(
-      InternalSmemLayoutAtomB{},
-      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{})));
-
-  // If A mn-layout and B mn-layout, transposing B matrix since WGMMA is k-major only (e.g. tf32, fp32, fp8, int8).
-  static constexpr bool IsLayoutAmnBmn =
-    cute::is_same_v<gemm::detail::StrideToLayoutTagA_t<StrideA>, layout::ColumnMajor> &&
-    cute::is_same_v<gemm::detail::StrideToLayoutTagB_t<StrideB>, layout::RowMajor>;
-  static constexpr bool TransposeB = !IsInputSizeTwoBytes && IsLayoutAmnBmn;
-  using TransposeOperandB = decltype(cutlass::transform::collective::detail::make_transpose_operand_b(
-                                      0, 0, TiledMma{}, SmemLayoutB{}, InternalSmemLayoutAtomB{},
-                                      InternalElementB{}, cute::bool_constant<TransposeB>{})); 
-
-  static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 2 or more.");
-  static_assert(not cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeA>::value &&
-                    cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeB>::value,
-                "MMA atom must source A from rmem and B operand from smem_desc for this mainloop.");
-
-  using GmmaSmemLayoutAtomB = decltype(transform::collective::detail::gmma_smem_transpose_or_passthrough<
-      TransposeB, InternalSmemLayoutAtomB, InternalElementB>());
-
-  // SmemLayoutB for GMMA is different from SmemLayoutB for TMA if TransposeB
-  using GmmaSmemLayoutB = decltype(tile_to_shape(
-      GmmaSmemLayoutAtomB{},
-      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{})));
-
-  static_assert(!SwapAB || !TransposeB, "Cannot SwapAB and TransposeB at the same time.");
-  static_assert(TransposeB xor (cute::is_same_v<SmemLayoutB, GmmaSmemLayoutB>),
-    "Should be same layout if not TransposeB.");
-  static_assert(!TransposeB || (cutlass::bits_to_bytes(size<1>(SmemLayoutB{}) * sizeof_bits<InternalElementB>::value)) == 128,
-    "SmemLayoutB K must be 128bytes to be transposed.");
-  static_assert(!transform::collective::detail::use_universal_transposition<InternalSmemLayoutAtomB, InternalElementB>(),
-    "Warp specialized ARF kernels have not supported universal B transposition yet.");
-
-  struct SharedStorage
-  {
-    struct TensorStorage : cute::aligned_struct<256, _0> { 
-      cute::array_aligned<typename TiledMma::ValTypeA, cute::cosize_v<SmemLayoutA>, 256> smem_A;
-      cute::array_aligned<typename TiledMma::ValTypeB, cute::cosize_v<SmemLayoutB>, 256> smem_B;
-    } tensors;
-
-    using PipelineStorage = typename MainloopPipeline::SharedStorage;
-    PipelineStorage pipeline;
-  };
-  using TensorStorage = typename SharedStorage::TensorStorage;
-  using PipelineStorage = typename SharedStorage::PipelineStorage;
-
-  // Host side kernel arguments
-  struct Arguments {
-    ElementA const* ptr_A = nullptr;
-    StrideA dA{};
-    ElementB const* ptr_B = nullptr;
-    StrideB dB{};
-    uint32_t mma_promotion_interval = 4;
-  };
-
-  // Device side kernel params
-  struct Params {
-    InternalElementA const* ptr_A = nullptr;
-    InternalStrideA dA{};
-    InternalElementB const* ptr_B = nullptr;
-    InternalStrideB dB{};
-    uint32_t mma_promotion_interval = 4;
-  };
-
-  //
-  // Methods
-  //
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(
-    [[maybe_unused]] ProblemShape const& problem_shape,
-    Arguments const& args,
-    [[maybe_unused]] void* workspace) {
-    if constexpr (not SwapAB) {
-      return {
-        reinterpret_cast<InternalElementA const*>(args.ptr_A),
-        args.dA,
-        reinterpret_cast<InternalElementB const*>(args.ptr_B),
-        args.dB
-      };
-    }
-    else {
-      return {
-        reinterpret_cast<InternalElementA const*>(args.ptr_B),
-        args.dB,
-        reinterpret_cast<InternalElementB const*>(args.ptr_A),
-        args.dA
-      };
-    }
-  }
-
-  template<class ProblemShape>
-  static bool
-  can_implement(
-      ProblemShape const& problem_shape,
-      [[maybe_unused]] Arguments const& args) {
-    auto problem_shape_MNKL = append<4>(problem_shape, 1);
-    auto [M,N,K,L] = problem_shape_MNKL;
-
-    bool implementable = true;
-    implementable = implementable && cutlass::detail::check_alignment<GmemTiledCopyA::NumValSrc>(cute::make_shape(M,K,L), StrideA{});
-    implementable = implementable && cutlass::detail::check_alignment<GmemTiledCopyB::NumValSrc>(cute::make_shape(N,K,L), StrideB{});
-
-    if (!implementable) {
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
-    }
-    return implementable;
-  }
-
-  static constexpr int K_PIPE_MAX = DispatchPolicy::Stages;
-  static constexpr int K_PIPE_MMAS = 1;
-  
-  /// Perform a collective-scoped matrix multiply-accumulate
-  /// Producer Perspective
-  template <
-    class TensorA,
-    class TensorB,
-    class KTileIterator,
-    class ResidueMNK
-  >
-  CUTLASS_DEVICE void
-  load(
-      MainloopPipeline pipeline, 
-      PipelineState smem_pipe_write,
-      TensorA const& gA_in,
-      TensorB const& gB_in,
-      KTileIterator k_tile_iter, int k_tile_count,
-      ResidueMNK residue_mnk,
-      int thread_idx,
-      TensorStorage& shared_tensors)
-  {
-    using namespace cute;
-
-    static_assert(is_gmem<TensorA>::value, "A tensor must be gmem resident.");
-    static_assert(is_gmem<TensorB>::value, "B tensor must be gmem resident.");
-
-    Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});        // (BLK_M,BLK_K,PIPE)
-    Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});        // (BLK_N,BLK_K,PIPE)
-
-    // Shift tensor so residue_k is at origin (Can't read any k_coord < residue_k)
-    // This aligns the tensor with BLK_K for all but the 0th k_tile
-    Tensor gA = domain_offset(make_coord(0, get<2>(residue_mnk), 0), gA_in);
-    Tensor gB = domain_offset(make_coord(0, get<2>(residue_mnk), 0), gB_in);
-
-    // Partition the copying of A and B tiles across the threads
-    InternalGmemTiledCopyA gmem_tiled_copy_a;
-    InternalGmemTiledCopyB gmem_tiled_copy_b;
-    auto gmem_thr_copy_a = gmem_tiled_copy_a.get_slice(thread_idx);
-    auto gmem_thr_copy_b = gmem_tiled_copy_b.get_slice(thread_idx);
-
-    Tensor tAgA = gmem_thr_copy_a.partition_S(gA);                        // (ACPY,ACPY_M,ACPY_K,k)
-    Tensor tAsA = gmem_thr_copy_a.partition_D(sA);                        // (ACPY,ACPY_M,ACPY_K,PIPE)
-    Tensor tBgB = gmem_thr_copy_b.partition_S(gB);                        // (BCPY,BCPY_N,BCPY_K,k)
-    Tensor tBsB = gmem_thr_copy_b.partition_D(sB);                        // (BCPY,BCPY_N,BCPY_K,PIPE)
-
-    // Allocate predicate tensors for m and n
-    Tensor tApA = make_tensor<bool>(make_shape(size<1>(tAsA), size<2>(tAsA)), Stride<_1,_0>{});
-    Tensor tBpB = make_tensor<bool>(make_shape(size<1>(tBsB), size<2>(tBsB)), Stride<_1,_0>{});
-
-    // Construct identity layout for sA and sB
-    Tensor cA = make_identity_tensor(make_shape(size<0>(sA), size<1>(sA)));    // (BLK_M,BLK_K) -> (blk_m,blk_k)
-    Tensor cB = make_identity_tensor(make_shape(size<0>(sB), size<1>(sB)));    // (BLK_N,BLK_K) -> (blk_n,blk_k)
-
-    // Repeat the partitioning with identity layouts
-    Tensor tAcA = gmem_thr_copy_a.partition_S(cA);                             // (ACPY,ACPY_M,ACPY_K) -> (blk_m,blk_k)
-    Tensor tBcB = gmem_thr_copy_b.partition_S(cB);                             // (BCPY,BCPY_N,BCPY_K) -> (blk_n,blk_k)
-
-    // Set predicates for m bounds
-    CUTLASS_PRAGMA_UNROLL
-    for (int m = 0; m < size<0>(tApA); ++m) {
-      tApA(m,0) = get<0>(tAcA(0,m,0)) < get<0>(residue_mnk);  // blk_m coord < residue_m
-    }
-    // Set predicates for n bounds
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < size<0>(tBpB); ++n) {
-      tBpB(n,0) = get<0>(tBcB(0,n,0)) < get<1>(residue_mnk);  // blk_n coord < residue_n
-    }
-
-    // 0-th stage with predication on k to account for residue
-    {
-      // LOCK smem_pipe_write for _writing_
-      pipeline.producer_acquire(smem_pipe_write);
-      int write_stage = smem_pipe_write.index();
-
-      // Copy gmem to smem for *k_tile_iter, predicating for k residue
-      Tensor tAgAk = tAgA(_,_,_,*k_tile_iter);
-      CUTLASS_PRAGMA_UNROLL
-      for (int k = 0; k < size<2>(tAsA); ++k) {
-        if (get<1>(tAcA(0,0,k)) >= -get<2>(residue_mnk)) {      // blk_k coord < residue_k (gA shifted)
-          copy_if(gmem_tiled_copy_a, tApA(_,k), tAgAk(_,_,k), tAsA(_,_,k,write_stage));
-        }
-        else {
-          clear(tAsA(_,_,k,write_stage));
-        }
-      }
-      Tensor tBgBk = tBgB(_,_,_,*k_tile_iter);
-      CUTLASS_PRAGMA_UNROLL
-      for (int k = 0; k < size<2>(tBsB); ++k) {
-        if (get<1>(tBcB(0,0,k)) >= -get<2>(residue_mnk)) {      // blk_k coord < residue_k (gB shifted)
-          copy_if(gmem_tiled_copy_b, tBpB(_,k), tBgBk(_,_,k), tBsB(_,_,k,write_stage));
-        }
-        else {
-          clear(tBsB(_,_,k,write_stage));
-        }
-      }
-      
-      ++k_tile_iter;
-      --k_tile_count;
-
-      // UNLOCK smem_pipe_write
-      pipeline.producer_commit(smem_pipe_write, cutlass::arch::cpasync_barrier_arrive);
-
-      // Advance smem_pipe_write
-      ++smem_pipe_write;
-    }
-
-    // Mainloop
-    CUTLASS_PRAGMA_NO_UNROLL
-    for ( ; k_tile_count > 0; --k_tile_count) {
-      // LOCK smem_pipe_write for _writing_
-      pipeline.producer_acquire(smem_pipe_write);
-      int write_stage = smem_pipe_write.index();
-
-      // Copy gmem to smem for *k_tile_iter
-      copy_if(gmem_tiled_copy_a, tApA, tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage));
-      copy_if(gmem_tiled_copy_b, tBpB, tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage));
-      ++k_tile_iter;
-
-      // UNLOCK smem_pipe_write
-      pipeline.producer_commit(smem_pipe_write, cutlass::arch::cpasync_barrier_arrive);
-
-      // Advance smem_pipe_write
-      ++smem_pipe_write;
-    }
-  }
-
-  /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster
-  CUTLASS_DEVICE void
-  load_tail(
-      MainloopPipeline pipeline, 
-      PipelineState smem_pipe_write) {
-    // Issue the epilogue waits
-    /* This helps avoid early exit of blocks in Cluster
-     * Waits for all stages to either be released (all 
-     * Consumer UNLOCKs), or if the stage was never used
-     * then would just be acquired since the phase was 
-     * still inverted from make_producer_start_state
-     */
-    pipeline.producer_tail(smem_pipe_write);
-  }
-
-  /// Perform a collective-scoped matrix multiply-accumulate
-  /// Consumer Perspective
-  template <
-    class FrgTensorC
-  >
-  CUTLASS_DEVICE void
-  mma(MainloopPipeline pipeline,
-      PipelineState smem_pipe_read,
-      FrgTensorC& accum,
-      int k_tile_count,
-      int thread_idx,
-      TensorStorage& shared_tensors,
-      Params const& mainloop_params)
-  {
-    using namespace cute;
-    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
-    static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3.");
-    static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3.");
-    static_assert(cute::rank(InternalSmemLayoutAtomA{}) == 2, "InternalSmemLayoutAtomA must be rank 2.");
-    static_assert(cute::rank(InternalSmemLayoutAtomB{}) == 2, "InternalSmemLayoutAtomB must be rank 2.");
-    static_assert(!cute::is_void_v<InternalSmemCopyAtomA>,
-      "SM90 GMMA mainloops must specify a non-void copy atom for smem sourced instructions.");
-    static_assert(cute::is_void_v<InternalSmemCopyAtomB>,
-      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
-
-    // Obtain warp index
-    int warp_idx = canonical_warp_idx_sync();
-    [[maybe_unused]] int warp_group_thread_idx = thread_idx % 128;
-    
-    Tensor sA_ = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});         // (BLK_M,BLK_K,PIPE)
-    Tensor sA  = as_position_independent_swizzle_tensor(sA_);                                     // (BLK_M,BLK_K,PIPE)
-    Tensor sB_ = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});         // (BLK_N,BLK_K,PIPE)
-    Tensor sB  = as_position_independent_swizzle_tensor(sB_);                                     // (BLK_M,BLK_K,PIPE)
-
-    // If TransposeB, GMMA will read from transposed B layout SMEM
-    Tensor gmma_sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), GmmaSmemLayoutB{}); // (BLK_N,BLK_K,PIPE)
-
-    //
-    // Define C accumulators and A/B partitioning
-    //
-
-    // Layout of warp group to thread mapping
-
-    static_assert(stride<0>(typename TiledMma::BLayout{}) == 0 and
-                  size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup, 
-                  "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
-
-    constexpr int MmaWarpGroups = size(TiledMma{}) / NumThreadsPerWarpGroup;
-    Layout warp_group_thread_layout = make_layout(Int<MmaWarpGroups>{}, 
-                                                  Int<NumThreadsPerWarpGroup>{});
-
-    int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / NumThreadsPerWarpGroup, 0);
-
-    TiledMma tiled_mma;
-    auto mma_thread_slice = tiled_mma.get_thread_slice(thread_idx);
-    auto mma_warpgroup_slice = tiled_mma.get_slice(warp_group_thread_layout(warp_group_idx));
-
-    // Allocate fragments and descriptors
-    Tensor tCsA = mma_thread_slice.partition_A(sA);
-    Tensor tCrA = mma_thread_slice.partition_fragment_A(sA(_,_,Int<0>{}));                    // (MMA,MMA_M,MMA_K,PIPE)
-    Tensor tCsB = mma_warpgroup_slice.partition_B(gmma_sB);                                   // (MMA,MMA_N,MMA_K,PIPE)
-    Tensor tCrB = mma_warpgroup_slice.make_fragment_B(tCsB);                                  // (MMA,MMA_N,MMA_K,PIPE)
-
-    //
-    // Copy Atom A retiling
-    //
-
-
-    auto smem_tiled_copy_A = make_tiled_copy_A(InternalSmemCopyAtomA{}, tiled_mma);
-
-    auto smem_thr_copy_A   = smem_tiled_copy_A.get_thread_slice(thread_idx);
-
-    Tensor tCrA_copy_view  = smem_thr_copy_A.retile_D(tCrA);                                       // (CPY,CPY_M,CPY_K)
-
-    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(tCrA_copy_view));                                            // CPY_M
-    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCrA_copy_view));                                            // CPY_K
-    CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(accum));                                                     // MMA_M
-    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum));                                                         // N
-    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB));                                                          // K
-    CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB));                                                       // PIPE
-    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));                                         // PIPE
-    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));                                         // PIPE
-
-    //
-    // PIPELINED MAIN LOOP
-    //
-    static_assert((0 <= K_PIPE_MMAS) && (K_PIPE_MMAS <  K_PIPE_MAX),
-        "ERROR : Incorrect number of MMAs in flight");
-
-    // We release buffers to producer warps(dma load) with some mmas in flight
-    PipelineState smem_pipe_release = smem_pipe_read;
-
-    tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
-
-    TransposeOperandB transpose = cutlass::transform::collective::detail::make_transpose_operand_b(
-                                    warp_idx, warp_group_thread_idx, tiled_mma, SmemLayoutB{}, 
-                                    InternalSmemLayoutAtomB{}, InternalElementB{}, 
-                                    cute::bool_constant<TransposeB>{});
-
-    warpgroup_fence_operand(accum);
-    // first k tile
-    {
-      pipeline.consumer_wait(smem_pipe_read);
-
-      int read_stage = smem_pipe_read.index();
-
-      ++smem_pipe_read;
-
-      bool skip_wait = (pipeline.consumer_try_wait(smem_pipe_read) == BarrierStatus::WaitDone);
-
-      // copy smem->rmem for A operand
-      copy(smem_tiled_copy_A, tCsA(_,_,0,read_stage), tCrA_copy_view(_,_,0));
-      // transpose B operand in SMEM
-      transpose(sB, gmma_sB, read_stage, 0);
-
-      // Unroll the K mode manually to set scale D to 1
-      CUTLASS_PRAGMA_UNROLL
-      for (int k_block = 0; k_block < size<2>(tCrA) - 1; ++k_block) {
-        copy(smem_tiled_copy_A, tCsA(_,_,k_block + 1,read_stage), tCrA_copy_view(_,_,k_block + 1));
-        if (k_block == 0) {
-          transpose(sB, gmma_sB, read_stage, 1);
-          transpose.synchronize();
-        }
-        warpgroup_arrive();
-        // (V,M) x (V,N) => (V,M,N)
-        cute::gemm(tiled_mma, tCrA(_,_,k_block), tCrB(_,_,k_block,read_stage), accum);
-        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
-        warpgroup_commit_batch();
-      }
-
-      warpgroup_wait<2>();
-      
-      
-      if (k_tile_count - 1 > 0) {
-        if (!skip_wait) {
-          pipeline.consumer_wait(smem_pipe_read);
-        }
-        copy(smem_tiled_copy_A, tCsA(_,_,0,smem_pipe_read.index()), tCrA_copy_view(_,_,0));
-        transpose(sB, gmma_sB, smem_pipe_read.index(), 0);
-      }
-
-      warpgroup_arrive();
-      // (V,M) x (V,N) => (V,M,N)
-      cute::gemm(tiled_mma, tCrA(_,_,size<2>(tCrA) - 1), tCrB(_,_,size<2>(tCrA) - 1,read_stage), accum);
-      tiled_mma.accumulate_ = GMMA::ScaleOut::One;
-      warpgroup_commit_batch();
-      warpgroup_wait<2>();
-    }
-
-    warpgroup_fence_operand(accum);
-    // Mainloop GMMAs
-    --k_tile_count;
-    CUTLASS_PRAGMA_NO_UNROLL
-    for ( ; k_tile_count > 1; --k_tile_count) {
-
-      //
-      // Compute on k_tile
-      //
-
-      int read_stage = smem_pipe_read.index();
-
-      ++smem_pipe_read;
-      bool skip_wait = (pipeline.consumer_try_wait(smem_pipe_read) == BarrierStatus::WaitDone);
-
-      warpgroup_fence_operand(accum);
-      // Unroll the K mode manually to set scale D to 1
-      CUTLASS_PRAGMA_UNROLL
-      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
-        if (k_block == size<2>(tCrA) - 1) {
-          if (!skip_wait) {
-            pipeline.consumer_wait(smem_pipe_read);
-          }
-          copy(smem_tiled_copy_A, tCsA(_,_,0,smem_pipe_read.index()), tCrA_copy_view(_,_,0));
-          // transpose B operand in SMEM
-          transpose(sB, gmma_sB, smem_pipe_read.index(), 0);
-        } else {
-          copy(smem_tiled_copy_A, tCsA(_,_,k_block + 1,read_stage), tCrA_copy_view(_,_,k_block + 1));
-          // transpose B operand in SMEM
-          if (k_block < 2) {
-            transpose.synchronize(k_block);                                      // make transpose of k_block available
-          }
-          if (k_block == 0) {
-            transpose(sB, gmma_sB, read_stage, 1);
-          }
-        }
-        
-        warpgroup_arrive();
-        // (V,M) x (V,N) => (V,M,N)
-        cute::gemm(tiled_mma, tCrA(_,_,k_block), tCrB(_,_,k_block,read_stage), accum);
-        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
-        warpgroup_commit_batch();
-        warpgroup_wait<2>();
-        if (k_block == 1) {
-          // release prior barrier
-          pipeline.consumer_release(smem_pipe_release);             // UNLOCK smem_pipe_release, done _computing_ on it
-          ++smem_pipe_release;
-        }
-      }
-      warpgroup_fence_operand(accum);
-
-    }
-
-    warpgroup_fence_operand(accum);
-
-    if (k_tile_count > 0) {
-      //
-      // Compute on k_tile
-      //
-
-      int read_stage = smem_pipe_read.index();
-
-      warpgroup_fence_operand(accum);
-      // Unroll the K mode manually to set scale D to 1
-      CUTLASS_PRAGMA_UNROLL
-      for (int k_block = 0; k_block < size<2>(tCrA) - 1; ++k_block) {
-        copy(smem_tiled_copy_A, tCsA(_,_,k_block + 1,read_stage), tCrA_copy_view(_,_,k_block + 1));
-        if (k_block < 2) {
-          transpose.synchronize(k_block);                                           // make k_block transpose available
-        }
-        if (k_block == 0) {
-          transpose(sB, gmma_sB, read_stage, 1);
-        }
-        warpgroup_arrive();
-        // (V,M) x (V,N) => (V,M,N)
-        cute::gemm(tiled_mma, tCrA(_,_,k_block), tCrB(_,_,k_block,read_stage), accum);
-        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
-        warpgroup_commit_batch();
-        warpgroup_wait<2>();
-        if (k_block == 1) {
-          // release prior barrier
-          pipeline.consumer_release(smem_pipe_release);             // UNLOCK smem_pipe_release, done _computing_ on it
-          ++smem_pipe_release;
-        }
-      }
-      
-      warpgroup_arrive();
-      // (V,M) x (V,N) => (V,M,N)
-      cute::gemm(tiled_mma, tCrA(_,_,size<2>(tCrA) - 1), tCrB(_,_,size<2>(tCrA) - 1,read_stage), accum);
-      tiled_mma.accumulate_ = GMMA::ScaleOut::One;
-      warpgroup_commit_batch();
-      warpgroup_wait<2>();
-      warpgroup_fence_operand(accum);
-    }
-
-    warpgroup_fence_operand(accum);
-  }
-
-  /// Perform a Consumer Epilogue to release all buffers
-  CUTLASS_DEVICE void
-  mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) {
-    // Prologue GMMAs
-    int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
-    k_tile_count -= prologue_mma_count;
-
-    smem_pipe_release.advance(k_tile_count);
-    
-    // Wait on all GMMAs to complete
-    warpgroup_wait<0>();
-
-    for (int count = 0; count < prologue_mma_count; ++count) {
-      pipeline.consumer_release(smem_pipe_release);                 // UNLOCK smem_pipe_release, done _computing_ on it
-      ++smem_pipe_release;
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::gemm::collective
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_ss_warpspecialized.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_ss_warpspecialized.hpp
deleted file mode 100755
index e336bd475..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_ss_warpspecialized.hpp
+++ /dev/null
@@ -1,509 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cute/arch/cluster_sm90.hpp"
-#include "cute/arch/copy_sm90.hpp"
-#include "cutlass/gemm/dispatch_policy.hpp"
-
-#include "cute/algorithm/functional.hpp"
-#include "cute/atom/mma_atom.hpp"
-#include "cute/algorithm/gemm.hpp"
-#include "cute/tensor_predicate.hpp"
-#include "cute/numeric/arithmetic_tuple.hpp"
-#include "cutlass/pipeline/pipeline.hpp"
-#include "cutlass/trace.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm::collective {
-using namespace cute;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// WarpSpecialized Mainloop
-template <
-  int Stages,
-  class ClusterShape_,
-  class TileShape_,
-  class KernelSchedule,
-  class ElementA_,
-  class StrideA_,
-  class ElementB_,
-  class StrideB_,
-  class TiledMma_,
-  class GmemTiledCopyA_,
-  class SmemLayoutAtomA_,
-  class SmemCopyAtomA_,
-  class TransformA_,
-  class GmemTiledCopyB_,
-  class SmemLayoutAtomB_,
-  class SmemCopyAtomB_,
-  class TransformB_>
-struct CollectiveMma<
-    MainloopSm90CpAsyncGmmaWarpSpecialized<Stages,ClusterShape_,KernelSchedule>,
-    TileShape_,
-    ElementA_,
-    StrideA_,
-    ElementB_,
-    StrideB_,
-    TiledMma_,
-    GmemTiledCopyA_,
-    SmemLayoutAtomA_,
-    SmemCopyAtomA_,
-    TransformA_,
-    GmemTiledCopyB_,
-    SmemLayoutAtomB_,
-    SmemCopyAtomB_,
-    TransformB_>
-{
-  //
-  // Type Aliases
-  //
-  using DispatchPolicy = MainloopSm90CpAsyncGmmaWarpSpecialized<Stages,ClusterShape_,KernelSchedule>;
-  using TileShape = TileShape_;
-  using ClusterShape = ClusterShape_;
-  using ElementA = ElementA_;
-  using StrideA = StrideA_;
-  using ElementB = ElementB_;
-  using StrideB = StrideB_;
-  using TiledMma = TiledMma_;
-  using ElementAccumulator = typename TiledMma::ValTypeC;
-  using GmemTiledCopyA = GmemTiledCopyA_;
-  using GmemTiledCopyB = GmemTiledCopyB_;
-  using SmemLayoutAtomA = SmemLayoutAtomA_;
-  using SmemLayoutAtomB = SmemLayoutAtomB_;
-  using SmemCopyAtomA = SmemCopyAtomA_;
-  using SmemCopyAtomB = SmemCopyAtomB_;
-  using TransformA = TransformA_;
-  using TransformB = TransformB_;
-  using ArchTag = typename DispatchPolicy::ArchTag;
-
-  using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{}));
-  using MainloopPipeline = cutlass::PipelineAsync<DispatchPolicy::Stages>;
-  using PipelineState    = typename MainloopPipeline::PipelineState;
-  using PipelineParams   = typename MainloopPipeline::Params;
-
-  static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
-  static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-
-  static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
-  static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-
-  using SmemLayoutA = decltype(tile_to_shape(
-      SmemLayoutAtomA{},
-      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{})));
-  using SmemLayoutB = decltype(tile_to_shape(
-      SmemLayoutAtomB{},
-      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{})));
-
-  static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 2 or more.");
-  static_assert(cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeA>::value &&
-                cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeB>::value,
-                "MMA atom must source both A and B operand from smem_desc for this mainloop.");
-
-  struct SharedStorage
-  {
-    struct TensorStorage : cute::aligned_struct<128, _0> {
-      cute::array_aligned<typename TiledMma::ValTypeA, cute::cosize_v<SmemLayoutA>> smem_A;
-      cute::array_aligned<typename TiledMma::ValTypeB, cute::cosize_v<SmemLayoutB>> smem_B;
-    } tensors;
-
-    using PipelineStorage = typename MainloopPipeline::SharedStorage;
-    PipelineStorage pipeline;
-  };
-  using TensorStorage = typename SharedStorage::TensorStorage;
-  using PipelineStorage = typename SharedStorage::PipelineStorage;
-
-  // Host side kernel arguments
-  struct Arguments {
-    ElementA const* ptr_A = nullptr;
-    StrideA dA{};
-    ElementB const* ptr_B = nullptr;
-    StrideB dB{};
-    uint32_t mma_promotion_interval = 4;
-  };
-
-  // Device side kernel params
-  using Params = Arguments;
-
-  //
-  // Methods
-  //
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(
-    [[maybe_unused]] ProblemShape const& problem_shape,
-    Arguments const& args,
-    [[maybe_unused]] void* workspace) {
-    return args;
-  }
-
-  template<class ProblemShape>
-  static bool
-  can_implement(
-      ProblemShape const& problem_shape,
-      [[maybe_unused]] Arguments const& args) {
-    auto problem_shape_MNKL = append<4>(problem_shape, 1);
-    auto [M,N,K,L] = problem_shape_MNKL;
-
-    bool implementable = true;
-    implementable = implementable && cutlass::detail::check_alignment<GmemTiledCopyA::NumValSrc>(cute::make_shape(M,K,L), StrideA{});
-    implementable = implementable && cutlass::detail::check_alignment<GmemTiledCopyB::NumValSrc>(cute::make_shape(N,K,L), StrideB{});
-
-    if (!implementable) {
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
-    }
-    return implementable;
-  }
-
-  static constexpr int K_PIPE_MAX = DispatchPolicy::Stages;
-  static constexpr int K_PIPE_MMAS = 1;
-  
-  /// Perform a collective-scoped matrix multiply-accumulate
-  /// Producer Perspective
-  template <
-    class TensorA,
-    class TensorB,
-    class KTileIterator,
-    class ResidueMNK
-  >
-  CUTLASS_DEVICE void
-  load(
-      MainloopPipeline pipeline, 
-      PipelineState smem_pipe_write,
-      TensorA const& gA_in,
-      TensorB const& gB_in,
-      KTileIterator k_tile_iter, int k_tile_count,
-      ResidueMNK residue_mnk,
-      int thread_idx,
-      TensorStorage& shared_tensors)
-  {
-    using namespace cute;
-
-    static_assert(is_gmem<TensorA>::value, "A tensor must be gmem resident.");
-    static_assert(is_gmem<TensorB>::value, "B tensor must be gmem resident.");
-
-    Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});        // (BLK_M,BLK_K,PIPE)
-    Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});        // (BLK_N,BLK_K,PIPE)
-
-    // Shift tensor so residue_k is at origin (Can't read any k_coord < residue_k)
-    // This aligns the tensor with BLK_K for all but the 0th k_tile
-    Tensor gA = domain_offset(make_coord(0, get<2>(residue_mnk), 0), gA_in);
-    Tensor gB = domain_offset(make_coord(0, get<2>(residue_mnk), 0), gB_in);
-
-    // Partition the copying of A and B tiles across the threads
-    GmemTiledCopyA gmem_tiled_copy_a;
-    GmemTiledCopyB gmem_tiled_copy_b;
-    auto gmem_thr_copy_a = gmem_tiled_copy_a.get_slice(thread_idx);
-    auto gmem_thr_copy_b = gmem_tiled_copy_b.get_slice(thread_idx);
-
-    Tensor tAgA = gmem_thr_copy_a.partition_S(gA);                        // (ACPY,ACPY_M,ACPY_K,k)
-    Tensor tAsA = gmem_thr_copy_a.partition_D(sA);                        // (ACPY,ACPY_M,ACPY_K,PIPE)
-    Tensor tBgB = gmem_thr_copy_b.partition_S(gB);                        // (BCPY,BCPY_N,BCPY_K,k)
-    Tensor tBsB = gmem_thr_copy_b.partition_D(sB);                        // (BCPY,BCPY_N,BCPY_K,PIPE)
-
-    // Allocate predicate tensors for m and n
-    Tensor tApA = make_tensor<bool>(make_shape(size<1>(tAsA), size<2>(tAsA)), Stride<_1,_0>{});
-    Tensor tBpB = make_tensor<bool>(make_shape(size<1>(tBsB), size<2>(tBsB)), Stride<_1,_0>{});
-
-    // Construct identity layout for sA and sB
-    Tensor cA = make_identity_tensor(make_shape(size<0>(sA), size<1>(sA)));    // (BLK_M,BLK_K) -> (blk_m,blk_k)
-    Tensor cB = make_identity_tensor(make_shape(size<0>(sB), size<1>(sB)));    // (BLK_N,BLK_K) -> (blk_n,blk_k)
-
-    // Repeat the partitioning with identity layouts
-    Tensor tAcA = gmem_thr_copy_a.partition_S(cA);                             // (ACPY,ACPY_M,ACPY_K) -> (blk_m,blk_k)
-    Tensor tBcB = gmem_thr_copy_b.partition_S(cB);                             // (BCPY,BCPY_N,BCPY_K) -> (blk_n,blk_k)
-
-    // Set predicates for m bounds
-    CUTLASS_PRAGMA_UNROLL
-    for (int m = 0; m < size<0>(tApA); ++m) {
-      tApA(m,0) = get<0>(tAcA(0,m,0)) < get<0>(residue_mnk);  // blk_m coord < residue_m
-    }
-    // Set predicates for n bounds
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < size<0>(tBpB); ++n) {
-      tBpB(n,0) = get<0>(tBcB(0,n,0)) < get<1>(residue_mnk);  // blk_n coord < residue_n
-    }
-
-    // 0-th stage with predication on k to account for residue
-    {
-      // LOCK smem_pipe_write for _writing_
-      pipeline.producer_acquire(smem_pipe_write);
-      int write_stage = smem_pipe_write.index();
-
-      // Copy gmem to smem for *k_tile_iter, predicating for k residue
-      Tensor tAgAk = tAgA(_,_,_,*k_tile_iter);
-      CUTLASS_PRAGMA_UNROLL
-      for (int k = 0; k < size<2>(tAsA); ++k) {
-        if (get<1>(tAcA(0,0,k)) >= -get<2>(residue_mnk)) {      // blk_k coord < residue_k (gA shifted)
-          copy_if(gmem_tiled_copy_a, tApA(_,k), tAgAk(_,_,k), tAsA(_,_,k,write_stage));
-        }
-        else {
-          clear(tAsA(_,_,k,write_stage));
-        }
-      }
-      Tensor tBgBk = tBgB(_,_,_,*k_tile_iter);
-      CUTLASS_PRAGMA_UNROLL
-      for (int k = 0; k < size<2>(tBsB); ++k) {
-        if (get<1>(tBcB(0,0,k)) >= -get<2>(residue_mnk)) {      // blk_k coord < residue_k (gB shifted)
-          copy_if(gmem_tiled_copy_b, tBpB(_,k), tBgBk(_,_,k), tBsB(_,_,k,write_stage));
-        }
-        else {
-          clear(tBsB(_,_,k,write_stage));
-        }
-      }
-      ++k_tile_iter;
-      --k_tile_count;
-
-      // UNLOCK smem_pipe_write
-      pipeline.producer_commit(smem_pipe_write, cutlass::arch::cpasync_barrier_arrive);
-
-      // Advance smem_pipe_write
-      ++smem_pipe_write;
-    }
-
-    // Mainloop
-    CUTLASS_PRAGMA_NO_UNROLL
-    for ( ; k_tile_count > 0; --k_tile_count) {
-      // LOCK smem_pipe_write for _writing_
-      pipeline.producer_acquire(smem_pipe_write);
-      int write_stage = smem_pipe_write.index();
-
-      // Copy gmem to smem for *k_tile_iter
-      copy_if(gmem_tiled_copy_a, tApA, tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage));
-      copy_if(gmem_tiled_copy_b, tBpB, tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage));
-      ++k_tile_iter;
-
-      // UNLOCK smem_pipe_write
-      pipeline.producer_commit(smem_pipe_write, cutlass::arch::cpasync_barrier_arrive);
-
-      // Advance smem_pipe_write
-      ++smem_pipe_write;
-    }
-  }
-
-  /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster
-  CUTLASS_DEVICE void
-  load_tail(
-      MainloopPipeline pipeline, 
-      PipelineState smem_pipe_write) {
-    // Issue the epilogue waits
-    /* This helps avoid early exit of blocks in Cluster
-     * Waits for all stages to either be released (all 
-     * Consumer UNLOCKs), or if the stage was never used
-     * then would just be acquired since the phase was 
-     * still inverted from make_producer_start_state
-     */
-    pipeline.producer_tail(smem_pipe_write);
-  }
-
-  /// Perform a collective-scoped matrix multiply-accumulate
-  /// Consumer Perspective
-  template <
-    class FrgTensorC
-  >
-  CUTLASS_DEVICE void
-  mma(MainloopPipeline pipeline,
-      PipelineState smem_pipe_read,
-      FrgTensorC& accum,
-      int k_tile_count,
-      int thread_idx,
-      TensorStorage& shared_tensors,
-      Params const& mainloop_params)
-  {
-    using namespace cute;
-
-    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
-    static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3.");
-    static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3.");
-    static_assert(cute::is_void_v<SmemCopyAtomA>,
-      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
-    static_assert(cute::is_void_v<SmemCopyAtomB>,
-      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
-
-    Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});          // (BLK_M,BLK_K,PIPE)
-    Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});          // (BLK_N,BLK_K,PIPE)
-
-    //
-    // Define C accumulators and A/B partitioning
-    //
-
-    // Layout of warp group to thread mapping
-
-    static_assert(stride<0>(typename TiledMma::ALayout{}) == 0 and 
-                  stride<0>(typename TiledMma::BLayout{}) == 0 and
-                  size<0>(typename TiledMma::ALayout{}) == NumThreadsPerWarpGroup and
-                  size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup, 
-                  "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
-
-    constexpr int MmaWarpGroups = size(TiledMma{}) / NumThreadsPerWarpGroup;
-    Layout warp_group_thread_layout = make_layout(Int<MmaWarpGroups>{}, 
-                                                  Int<NumThreadsPerWarpGroup>{});
-
-    int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / NumThreadsPerWarpGroup, 0);
-
-    TiledMma tiled_mma;
-    auto thread_mma = tiled_mma.get_slice(warp_group_thread_layout(warp_group_idx));
-
-    Tensor tCsA = thread_mma.partition_A(sA);                                                 // (MMA,MMA_M,MMA_K,PIPE)
-    Tensor tCsB = thread_mma.partition_B(sB);                                                 // (MMA,MMA_N,MMA_K,PIPE)
-
-    // Allocate "fragments/descriptors"
-    Tensor tCrA = thread_mma.make_fragment_A(tCsA);                                           // (MMA,MMA_M,MMA_K,PIPE)
-    Tensor tCrB = thread_mma.make_fragment_B(tCsB);                                           // (MMA,MMA_N,MMA_K,PIPE)
-
-    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum));                                                         // M
-    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum));                                                         // N
-    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB));                                                          // K
-    CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB));                                                       // PIPE
-    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));                                         // PIPE
-    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));                                         // PIPE
-
-    //
-    // PIPELINED MAIN LOOP
-    //
-    static_assert((0 <= K_PIPE_MMAS) && (K_PIPE_MMAS <  K_PIPE_MAX),
-        "ERROR : Incorrect number of MMAs in flight");
-
-    // We release buffers to producer warps(dma load) with some mmas in flight
-    PipelineState smem_pipe_release = smem_pipe_read;
-
-    // Prologue GMMAs
-    int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
-    assert(k_tile_count >= 1);
-    tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
-    warpgroup_fence_operand(accum);
-    {
-      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
-      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
-      pipeline.consumer_wait(smem_pipe_read, barrier_token);
-
-      int read_stage = smem_pipe_read.index();
-
-      warpgroup_arrive();
-
-      // Unroll the K mode manually to set scale D to 1
-      CUTLASS_PRAGMA_UNROLL
-      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
-        // (V,M,K) x (V,N,K) => (V,M,N)
-        cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accum);
-        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
-      }
-
-      warpgroup_commit_batch();
-
-      ++smem_pipe_read;
-    }
-
-    warpgroup_fence_operand(accum);
-    CUTLASS_PRAGMA_UNROLL
-    for (int k_tile_prologue = prologue_mma_count - 1; k_tile_prologue > 0; --k_tile_prologue) {
-
-      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
-      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
-      pipeline.consumer_wait(smem_pipe_read, barrier_token);
-
-      int read_stage = smem_pipe_read.index();
-
-      warpgroup_arrive();
-
-      // (V,M,K) x (V,N,K) => (V,M,N)
-      cute::gemm(tiled_mma, tCrA(_,_,_,read_stage), tCrB(_,_,_,read_stage), accum);
-
-      warpgroup_commit_batch();
-
-      ++smem_pipe_read;
-    }
-
-    warpgroup_fence_operand(accum);
-
-    // Mainloop GMMAs
-    k_tile_count -= prologue_mma_count;
-
-    CUTLASS_PRAGMA_NO_UNROLL
-    for ( ; k_tile_count > 0; --k_tile_count) {
-
-      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
-      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
-      pipeline.consumer_wait(smem_pipe_read, barrier_token);
-
-      int read_stage = smem_pipe_read.index();
-      
-      warpgroup_fence_operand(accum);
-      warpgroup_arrive();
-      // (V,M,K) x (V,N,K) => (V,M,N)
-      cute::gemm(tiled_mma, tCrA(_,_,_,read_stage), tCrB(_,_,_,read_stage), accum);
-      warpgroup_commit_batch();
-
-      /// Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure smem_pipe_write is consumed
-      warpgroup_wait<K_PIPE_MMAS>();
-      warpgroup_fence_operand(accum);
-
-      // UNLOCK smem_pipe_release, done _computing_ on it
-      pipeline.consumer_release(smem_pipe_release);
-
-      // Advance smem_pipe_read and smem_pipe_release
-      ++smem_pipe_read;
-      ++smem_pipe_release;
-    }
-
-    warpgroup_fence_operand(accum);
-  }
-
-  /// Perform a Consumer Epilogue to release all buffers
-  CUTLASS_DEVICE void
-  mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) {
-    // Prologue GMMAs
-    int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
-    k_tile_count -= prologue_mma_count;
-
-    smem_pipe_release.advance(k_tile_count);
-    
-    // Wait on all GMMAs to complete
-    warpgroup_wait<0>();
-
-    for (int count = 0; count < prologue_mma_count; ++count) {
-      pipeline.consumer_release(smem_pipe_release);                 // UNLOCK smem_pipe_release, done _computing_ on it
-      ++smem_pipe_release;
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::gemm::collective
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized.hpp
deleted file mode 100755
index b30fed1c8..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized.hpp
+++ /dev/null
@@ -1,752 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/detail/dependent_false.hpp"
-#include "cutlass/gemm/dispatch_policy.hpp"
-#include "cutlass/detail/layout.hpp"
-#include "cutlass/numeric_types.h"
-#include "cutlass/pipeline/pipeline.hpp"
-#include "cutlass/transform/collective/sm90_wgmma_transpose.hpp"
-#include "cutlass/trace.h"
-
-#include "cute/arch/cluster_sm90.hpp"
-#include "cute/arch/copy_sm90.hpp"
-#include "cute/algorithm/functional.hpp"
-#include "cute/atom/mma_atom.hpp"
-#include "cute/algorithm/gemm.hpp"
-#include "cute/tensor_predicate.hpp"
-#include "cute/numeric/arithmetic_tuple.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm::collective {
-using namespace cute;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// WarpSpecialized Mainloop that source A operand from registers
-template <
-  int Stages,
-  class ClusterShape,
-  class KernelSchedule,
-  class TileShape_,
-  class ElementA_,
-  class StrideA_,
-  class ElementB_,
-  class StrideB_,
-  class TiledMma_,
-  class GmemTiledCopyA_,
-  class SmemLayoutAtomA_,
-  class SmemCopyAtomA_,
-  class TransformA_,
-  class GmemTiledCopyB_,
-  class SmemLayoutAtomB_,
-  class SmemCopyAtomB_,
-  class TransformB_>
-struct CollectiveMma<
-    MainloopSm90TmaGmmaRmemAWarpSpecialized<Stages, ClusterShape, KernelSchedule>,
-    TileShape_,
-    ElementA_,
-    StrideA_,
-    ElementB_,
-    StrideB_,
-    TiledMma_,
-    GmemTiledCopyA_,
-    SmemLayoutAtomA_,
-    SmemCopyAtomA_,
-    TransformA_,
-    GmemTiledCopyB_,
-    SmemLayoutAtomB_,
-    SmemCopyAtomB_,
-    TransformB_>
-{
-  //
-  // Type Aliases
-  //
-  using DispatchPolicy = MainloopSm90TmaGmmaRmemAWarpSpecialized<Stages, ClusterShape, KernelSchedule>;
-  using TileShape = TileShape_;
-  using ElementA = ElementA_;
-  using StrideA = StrideA_;
-  using ElementB = ElementB_;
-  using StrideB = StrideB_;
-  using TiledMma = TiledMma_;
-  using ElementAccumulator = typename TiledMma::ValTypeC;
-  using GmemTiledCopyA = GmemTiledCopyA_;
-  using GmemTiledCopyB = GmemTiledCopyB_;
-  using SmemLayoutAtomA = SmemLayoutAtomA_;
-  using SmemLayoutAtomB = SmemLayoutAtomB_;
-  using SmemCopyAtomA = SmemCopyAtomA_;
-  using SmemCopyAtomB = SmemCopyAtomB_;
-
-  using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{}));
-  // Swap and transpose A/B for A k-major layout and B mn-major layout since WGMMA is k-major only
-  // (e.g. tf32, Fp32, Int8, Fp8 WGMMA)
-  static constexpr bool IsLayoutAkBmn =
-    cute::is_same_v<gemm::detail::StrideToLayoutTagA_t<StrideA>, layout::RowMajor> &&
-    cute::is_same_v<gemm::detail::StrideToLayoutTagB_t<StrideB>, layout::RowMajor>;
-
-  static constexpr bool IsInputSizeTwoBytes = sizeof(ElementA) == 2 && sizeof(ElementB) == 2;
-  static constexpr bool SwapAB =  !IsInputSizeTwoBytes && IsLayoutAkBmn;
-  using InternalSmemLayoutAtomA = cute::conditional_t<!SwapAB, SmemLayoutAtomA, SmemLayoutAtomB>;
-  using InternalSmemLayoutAtomB = cute::conditional_t<!SwapAB, SmemLayoutAtomB, SmemLayoutAtomA>;
-  using InternalSmemCopyAtomA   = cute::conditional_t<!SwapAB, SmemCopyAtomA, SmemCopyAtomB>;
-  using InternalSmemCopyAtomB   = cute::conditional_t<!SwapAB, SmemCopyAtomB, SmemCopyAtomA>;
-  // TMA converts f32 input to tf32 when copying from GMEM to SMEM
-  // For all other types, cast to size equivalent uint type to avoid any rounding by TMA.
-  static constexpr bool ConvertF32toTF32A = cute::is_same_v<float, ElementA>;
-  static constexpr bool ConvertF32toTF32B = cute::is_same_v<float, ElementB>;
-  using ConvertedElementA = cute::conditional_t<ConvertF32toTF32A, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementA>>>;
-  using ConvertedElementB = cute::conditional_t<ConvertF32toTF32B, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementB>>>;
-  using InternalElementA = cute::conditional_t<!SwapAB, ConvertedElementA, ConvertedElementB>;
-  using InternalElementB = cute::conditional_t<!SwapAB, ConvertedElementB, ConvertedElementA>;
-  using InternalStrideA  = cute::conditional_t<!SwapAB, StrideA, StrideB>;
-  using InternalStrideB  = cute::conditional_t<!SwapAB, StrideB, StrideA>;
-
-  using TransformA = TransformA_;
-  using TransformB = TransformB_;
-  using ArchTag = typename DispatchPolicy::ArchTag;
-
-  using MainloopPipeline = cutlass::PipelineTmaAsync<DispatchPolicy::Stages>;
-  using PipelineState = cutlass::PipelineState<DispatchPolicy::Stages>;
-
-  using PipelineParams = typename MainloopPipeline::Params;
-
-  static_assert(cute::rank(InternalSmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
-  static_assert((size<0>(TileShape{}) % size<0>(InternalSmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-  static_assert((size<2>(TileShape{}) % size<1>(InternalSmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-
-  static_assert(cute::rank(InternalSmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
-  static_assert((size<1>(TileShape{}) % size<0>(InternalSmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-  static_assert((size<2>(TileShape{}) % size<1>(InternalSmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-
-  // Tile along modes in a way that maximizes the TMA box size.
-  using SmemLayoutA = decltype(tile_to_shape(
-      InternalSmemLayoutAtomA{},
-      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
-      cute::conditional_t< ::cutlass::gemm::detail::is_major<0,InternalStrideA>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
-  using SmemLayoutB = decltype(tile_to_shape(
-      InternalSmemLayoutAtomB{},
-      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
-      cute::conditional_t< ::cutlass::gemm::detail::is_major<0,InternalStrideB>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
-
-  // If A mn-layout and B mn-layout, transposing B matrix since WGMMA is k-major only (e.g. tf32, fp32, fp8, int8).
-  static constexpr bool IsLayoutAmnBmn =
-    cute::is_same_v<gemm::detail::StrideToLayoutTagA_t<StrideA>, layout::ColumnMajor> &&
-    cute::is_same_v<gemm::detail::StrideToLayoutTagB_t<StrideB>, layout::RowMajor>;
-  static constexpr bool TransposeB = !IsInputSizeTwoBytes && IsLayoutAmnBmn;
-  using TransposeOperandB = decltype(cutlass::transform::collective::detail::make_transpose_operand_b(
-                                      0, 0, TiledMma{}, SmemLayoutB{}, InternalSmemLayoutAtomB{},
-                                      InternalElementB{}, cute::bool_constant<TransposeB>{})); 
-
-  static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 2 or more.");
-  static_assert(not cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeA>::value &&
-                    cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeB>::value,
-                "MMA atom must source A from rmem and B operand from smem_desc for this mainloop.");
-  static_assert(cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>,
-      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
-  static_assert(cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>,
-      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
-
-  using GmmaSmemLayoutAtomB = decltype(transform::collective::detail::gmma_smem_transpose_or_passthrough<
-      TransposeB, InternalSmemLayoutAtomB, InternalElementB>());
-
-  // SmemLayoutB for GMMA is different from SmemLayoutB for TMA if TransposeB
-  using GmmaSmemLayoutB = decltype(tile_to_shape(
-      GmmaSmemLayoutAtomB{},
-      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
-      cute::conditional_t< ::cutlass::gemm::detail::is_major<0,InternalStrideB>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
-
-  static_assert(!SwapAB || !TransposeB, "Cannot SwapAB and TransposeB at the same time.");
-  static_assert(TransposeB xor (cute::is_same_v<SmemLayoutB, GmmaSmemLayoutB>),
-    "Should be same layout if not TransposeB.");
-  static_assert(!TransposeB || (cutlass::bits_to_bytes((size<1>(SmemLayoutB{}) * sizeof_bits<InternalElementB>::value))) == 128,
-    "SmemLayoutB K must be 128bytes to be transposed.");
-
-  static constexpr bool uses_universal_transposition() {
-    if constexpr (TransposeB) {
-      return transform::collective::detail::use_universal_transposition<InternalSmemLayoutAtomB, InternalElementB>();
-    }
-    else {
-      return false;
-    }
-  }
-
-  static_assert(!uses_universal_transposition(),
-    "Warp specialized ARF kernels have not supported universal B transposition yet.");
-  
-  static constexpr size_t SmemAlignmentA = cutlass::detail::alignment_for_swizzle(SmemLayoutA{}); 
-
-  static constexpr size_t SmemAlignmentB = cutlass::detail::alignment_for_swizzle(SmemLayoutB{});
-
-  static_assert(SmemAlignmentA >= 128 and SmemAlignmentB >= 128, "Require at least 128B alignment");
-
-  struct SharedStorage
-  {
-    struct TensorStorage : cute::aligned_struct<cute::max(SmemAlignmentA, SmemAlignmentB), _0> { 
-      cute::array_aligned<typename TiledMma::ValTypeA, cute::cosize_v<SmemLayoutA>, SmemAlignmentA> smem_A;
-      cute::array_aligned<typename TiledMma::ValTypeB, cute::cosize_v<SmemLayoutB>, SmemAlignmentB> smem_B;
-    } tensors;
-
-    using PipelineStorage = typename MainloopPipeline::SharedStorage;
-    PipelineStorage pipeline;
-  };
-  using TensorStorage = typename SharedStorage::TensorStorage;
-  using PipelineStorage = typename SharedStorage::PipelineStorage;
-
-  // Host side kernel arguments
-  struct Arguments {
-    ElementA const* ptr_A = nullptr;
-    StrideA dA{};
-    ElementB const* ptr_B = nullptr;
-    StrideB dB{};
-    uint32_t mma_promotion_interval = 4;
-  };
-
-  // Device side kernel params
-  struct Params {
-    // Assumption: StrideA is congruent with Problem_MK
-    using TMA_A = decltype(make_tma_copy_A_sm90(
-        GmemTiledCopyA{},
-        make_tensor(static_cast<InternalElementA const*>(nullptr), repeat_like(InternalStrideA{}, int32_t(0)), InternalStrideA{}),
-        SmemLayoutA{}(_,_,cute::Int<0>{}),
-        TileShape{},
-        ClusterShape{}));
-    // Assumption: StrideB is congruent with Problem_NK
-    using TMA_B = decltype(make_tma_copy_B_sm90(
-        GmemTiledCopyB{},
-        make_tensor(static_cast<InternalElementB const*>(nullptr), repeat_like(InternalStrideB{}, int32_t(0)), InternalStrideB{}),
-        SmemLayoutB{}(_,_,cute::Int<0>{}),
-        TileShape{},
-        ClusterShape{}));
-    TMA_A tma_load_a;
-    TMA_B tma_load_b;
-    uint32_t tma_transaction_bytes = TmaTransactionBytes;
-    uint32_t tma_transaction_bytes_mk = TmaTransactionBytesMK;
-    uint32_t tma_transaction_bytes_nk = TmaTransactionBytesNK;
-  };
-
-  //
-  // Methods
-  //
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    (void) workspace;
-
-    // Optionally append 1s until problem shape is rank-4 (MNKL), in case it is only rank-3 (MNK)
-    auto problem_shape_MNKL = append<4>(problem_shape, 1);
-    auto [M,N,K,L] = problem_shape_MNKL;
-
-    if constexpr (SwapAB) {
-      M = get<1>(problem_shape_MNKL);
-      N = get<0>(problem_shape_MNKL);
-    }
-
-    InternalElementA const* ptr_A;
-    InternalStrideA dA;
-    InternalElementB const* ptr_B;
-    InternalStrideB dB;
-
-    if constexpr (not SwapAB) {
-      ptr_A = reinterpret_cast<InternalElementA const*>(args.ptr_A);
-      ptr_B = reinterpret_cast<InternalElementB const*>(args.ptr_B);
-      dA = args.dA;
-      dB = args.dB;
-    }
-    else {
-      ptr_A = reinterpret_cast<InternalElementA const*>(args.ptr_B);
-      ptr_B = reinterpret_cast<InternalElementB const*>(args.ptr_A);
-      dA = args.dB;
-      dB = args.dA;
-    }
-
-    Tensor tensor_a = make_tensor(ptr_A, make_layout(make_shape(M,K,L), dA));
-    Tensor tensor_b = make_tensor(ptr_B, make_layout(make_shape(N,K,L), dB));
-    typename Params::TMA_A tma_load_a = make_tma_copy_A_sm90(
-        GmemTiledCopyA{},
-        tensor_a,
-        SmemLayoutA{}(_,_,cute::Int<0>{}),
-        TileShape{},
-        ClusterShape{});
-    typename Params::TMA_B tma_load_b = make_tma_copy_B_sm90(
-        GmemTiledCopyB{},
-        tensor_b,
-        SmemLayoutB{}(_,_,cute::Int<0>{}),
-        TileShape{},
-        ClusterShape{});
-    uint32_t transaction_bytes_mk = TmaTransactionBytesMK;
-    uint32_t transaction_bytes_nk = TmaTransactionBytesNK;
-    uint32_t transaction_bytes = transaction_bytes_mk + transaction_bytes_nk;
-
-    return {
-      tma_load_a,
-      tma_load_b,
-      transaction_bytes,
-      transaction_bytes_mk,
-      transaction_bytes_nk
-    };
-  }
-
-  template<class ProblemShape>
-  static bool
-  can_implement(
-      ProblemShape const& problem_shape,
-      [[maybe_unused]] Arguments const& args) {
-    constexpr int tma_alignment_bits = 128;
-    auto problem_shape_MNKL = append<4>(problem_shape, 1);
-    auto [M,N,K,L] = problem_shape_MNKL;
-    
-    bool implementable = true;
-    constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
-    implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_A>(cute::make_shape(M,K,L), StrideA{});
-    constexpr int min_tma_aligned_elements_B = tma_alignment_bits / cutlass::sizeof_bits<ElementB>::value;
-    implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_B>(cute::make_shape(N,K,L), StrideB{});
-
-    if (!implementable) {
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
-    }
-    return implementable;
-  }
-
-  static constexpr int K_PIPE_MAX = DispatchPolicy::Stages;
-  static constexpr uint32_t TmaTransactionBytesMK =
-        cutlass::bits_to_bytes(size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast<uint32_t>(sizeof_bits<InternalElementA>::value));
-  static constexpr uint32_t TmaTransactionBytesNK =
-        cutlass::bits_to_bytes(size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast<uint32_t>(sizeof_bits<InternalElementB>::value)) ;
-  static constexpr uint32_t TmaTransactionBytes = TmaTransactionBytesMK + TmaTransactionBytesNK;
-
-  /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance
-  CUTLASS_DEVICE
-  static void prefetch_tma_descriptors(Params const& mainloop_params) {
-    cute::prefetch_tma_descriptor(mainloop_params.tma_load_a.get_tma_descriptor());
-    cute::prefetch_tma_descriptor(mainloop_params.tma_load_b.get_tma_descriptor());
-  }
-
-  /// Set up the data needed by this collective for load and mma.
-  /// Returns a tuple of tensors. The collective and the kernel layer have the contract
-  /// Returned tuple must contain at least two elements, with the first two elements being:
-  /// gA_mkl - The tma tensor, A after a local tile so it has shape  (BLK_M,BLK_K,m,k,l)
-  /// gB_nkl - The tma tensor, B after a local tile so it has shape  (BLK_N,BLK_K,n,k,l)
-  /// The rest of the tensors can be specified as needed by this collective.
-  template <class ProblemShape_MNKL>
-  CUTLASS_DEVICE auto
-  load_init(ProblemShape_MNKL const& problem_shape_MNKL, Params const& mainloop_params) const {
-    using X = Underscore;
-    // Separate out problem shape for convenience
-    auto [M,N,K,L] = problem_shape_MNKL;
-
-    // TMA requires special handling of strides to deal with coord codomain mapping
-    // Represent the full tensors -- get these from TMA
-    Tensor mA_mkl = mainloop_params.tma_load_a.get_tma_tensor(make_shape(M,K,L));                            // (m,k,l)
-    Tensor mB_nkl = mainloop_params.tma_load_b.get_tma_tensor(make_shape(N,K,L));                            // (n,k,l)
-
-    // Make tiled views, defer the slice
-    Tensor gA_mkl = local_tile(mA_mkl, TileShape{}, make_coord(_,_,_), Step<_1, X,_1>{});        // (BLK_M,BLK_K,m,k,l)
-    Tensor gB_nkl = local_tile(mB_nkl, TileShape{}, make_coord(_,_,_), Step< X,_1,_1>{});        // (BLK_N,BLK_K,n,k,l)
-
-    return cute::make_tuple(gA_mkl, gB_nkl);
-  }
-
-  /// Perform a collective-scoped matrix multiply-accumulate
-  /// Producer Perspective
-  template <
-    class TensorA, class TensorB,
-    class KTileIterator, class BlockCoord
-  >
-  CUTLASS_DEVICE void
-  load(
-      Params const& mainloop_params,
-      MainloopPipeline pipeline,
-      PipelineState smem_pipe_write,
-      cute::tuple<TensorA, TensorB> const& load_inputs,
-      BlockCoord const& blk_coord,
-      KTileIterator k_tile_iter, int k_tile_count,
-      int thread_idx,
-      uint32_t block_rank_in_cluster,
-      TensorStorage& shared_tensors) {
-    int lane_predicate = cute::elect_one_sync();
-
-    if (lane_predicate) {
-      Tensor sA_ = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});       // (BLK_M,BLK_K,PIPE)
-      Tensor sB_ = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});       // (BLK_N,BLK_K,PIPE)
-      Tensor sA  = as_position_independent_swizzle_tensor(sA_);                                   // (BLK_M,BLK_K,PIPE)
-      Tensor sB  = as_position_independent_swizzle_tensor(sB_);                                   // (BLK_N,BLK_K,PIPE)
-
-      //
-      // Prepare the TMA loads for A and B
-      //
-      
-      constexpr uint32_t cluster_shape_x = get<0>(ClusterShape());
-      uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x};
-
-      Tensor gA_mkl = get<0>(load_inputs);
-      Tensor gB_nkl = get<1>(load_inputs);
-
-      auto block_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y);
-      auto block_tma_b = mainloop_params.tma_load_b.get_slice(cluster_local_block_id.x);
-
-      // Partition the inputs based on the current block coordinates.
-      auto [m_coord, n_coord, k_coord, l_coord] = blk_coord;
-      Tensor gA = gA_mkl(_,_,m_coord,_,l_coord);                                                     // (BLK_M,BLK_K,k)
-      Tensor gB = gB_nkl(_,_,n_coord,_,l_coord);                                                     // (BLK_N,BLK_K,k)
-
-      // Applies the mapping from block_tma_a
-      Tensor tAgA = block_tma_a.partition_S(gA);                                                 // (TMA,TMA_M,TMA_K,k)
-      Tensor tAsA = block_tma_a.partition_D(sA);                                              // (TMA,TMA_M,TMA_K,PIPE)
-
-      Tensor tBgB = block_tma_b.partition_S(gB);                                                 // (TMA,TMA_N,TMA_K,k)
-      Tensor tBsB = block_tma_b.partition_D(sB);                                              // (TMA,TMA_N,TMA_K,PIPE)
-
-      uint16_t mcast_mask_a = 0;
-      uint16_t mcast_mask_b = 0;
-
-      // Issue TmaLoads
-      // Maps the tile -> block, value
-      if constexpr (cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>) {
-        auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};                       // (m,n) -> block_id
-        for (int n = 0; n < size<1>(block_layout); ++n) {
-          mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{}));
-        }
-      }
-
-      if constexpr (cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>) {
-        auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};                       // (m,n) -> block_id
-        for (int m = 0; m < size<0>(block_layout); ++m) {
-          mcast_mask_b |= (uint16_t(1) << block_layout(m,cluster_local_block_id.y,Int<0>{}));
-        }
-      }
-
-      // Mainloop
-      CUTLASS_PRAGMA_NO_UNROLL
-      for ( ; k_tile_count > 0; --k_tile_count) {
-        // LOCK smem_pipe_write for _writing_
-        pipeline.producer_acquire(smem_pipe_write);
-
-        //
-        // Copy gmem to smem for *k_tile_iter
-        //
-
-        using BarrierType = typename MainloopPipeline::ProducerBarrierType;
-        BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write);
-
-        int write_stage = smem_pipe_write.index();
-        copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage));
-        copy(mainloop_params.tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage));
-        ++k_tile_iter;
-
-        // Advance smem_pipe_write
-        ++smem_pipe_write;
-      }
-    }
-  }
-
-  /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster
-  CUTLASS_DEVICE void
-  load_tail(MainloopPipeline pipeline, PipelineState smem_pipe_write) {
-    int lane_predicate = cute::elect_one_sync();
-
-    // Issue the epilogue waits
-    if (lane_predicate) {
-      /* This helps avoid early exit of blocks in Cluster
-       * Waits for all stages to either be released (all 
-       * Consumer UNLOCKs), or if the stage was never used
-       * then would just be acquired since the phase was 
-       * still inverted from make_producer_start_state
-       */
-      pipeline.producer_tail(smem_pipe_write);
-    }
-  }
-
-  /// Perform a collective-scoped matrix multiply-accumulate
-  /// Consumer Perspective
-  template <
-    class FrgTensorC
-  >
-  CUTLASS_DEVICE void
-  mma(MainloopPipeline pipeline,
-      PipelineState smem_pipe_read,
-      FrgTensorC& accum,
-      int k_tile_count,
-      int thread_idx,
-      TensorStorage& shared_tensors,
-      Params const& mainloop_params) {
-    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
-    static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3.");
-    static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3.");
-    static_assert(cute::rank(InternalSmemLayoutAtomA{}) == 2, "InternalSmemLayoutAtomA must be rank 2.");
-    static_assert(cute::rank(InternalSmemLayoutAtomB{}) == 2, "InternalSmemLayoutAtomB must be rank 2.");
-    static_assert(!cute::is_void_v<InternalSmemCopyAtomA>,
-      "SM90 GMMA mainloops must specify a non-void copy atom for smem sourced instructions.");
-    static_assert(cute::is_void_v<InternalSmemCopyAtomB>,
-      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
-
-    // Obtain warp index
-    int warp_idx = canonical_warp_idx_sync();
-    [[maybe_unused]] int warp_group_thread_idx = thread_idx % 128;
-    
-    Tensor sA_ = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});         // (BLK_M,BLK_K,PIPE)
-    Tensor sA = as_position_independent_swizzle_tensor(sA_);                                      // (BLK_M,BLK_K,PIPE)
-    
-    Tensor sB_ = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});         // (BLK_N,BLK_K,PIPE)
-    Tensor sB  = as_position_independent_swizzle_tensor(sB_);                                     // (BLK_M,BLK_K,PIPE)
-
-    // If TransposeB, GMMA will read from transposed B layout SMEM
-    Tensor gmma_sB_position_dependent = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), 
-                                          GmmaSmemLayoutB{});                                     // (BLK_N,BLK_K,PIPE)
-    Tensor gmma_sB = as_position_independent_swizzle_tensor(gmma_sB_position_dependent);          // (BLK_N,BLK_K,PIPE)
-
-    //
-    // Define C accumulators and A/B partitioning
-    //
-
-    // Layout of warp group to thread mapping
-
-    static_assert(stride<0>(typename TiledMma::BLayout{}) == 0 and
-                  size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup, 
-                  "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
-
-    constexpr int MmaWarpGroups = size(TiledMma{}) / NumThreadsPerWarpGroup;
-    Layout warp_group_thread_layout = make_layout(Int<MmaWarpGroups>{}, 
-                                                  Int<NumThreadsPerWarpGroup>{});
-
-    int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / NumThreadsPerWarpGroup, 0);
-
-    TiledMma tiled_mma;
-    auto mma_thread_slice = tiled_mma.get_thread_slice(thread_idx);
-    auto mma_warpgroup_slice = tiled_mma.get_slice(warp_group_thread_layout(warp_group_idx));
-
-    // Allocate fragments and descriptors
-    Tensor tCsA = mma_thread_slice.partition_A(sA);
-    Tensor tCrA = mma_thread_slice.partition_fragment_A(sA(_,_,Int<0>{}));                    // (MMA,MMA_M,MMA_K,PIPE)
-    Tensor tCsB = mma_warpgroup_slice.partition_B(gmma_sB_position_dependent);                // (MMA,MMA_N,MMA_K,PIPE)
-    Tensor tCrB = mma_warpgroup_slice.make_fragment_B(tCsB);                                  // (MMA,MMA_N,MMA_K,PIPE)
-
-    //
-    // Copy Atom A retiling
-    //
-
-
-    auto smem_tiled_copy_A = make_tiled_copy_A(InternalSmemCopyAtomA{}, tiled_mma);
-
-    auto smem_thr_copy_A   = smem_tiled_copy_A.get_thread_slice(thread_idx);
-
-    Tensor tCrA_copy_view  = smem_thr_copy_A.retile_D(tCrA);                                       // (CPY,CPY_M,CPY_K)
-    Tensor tCsA_copy_view  = smem_thr_copy_A.partition_S(sA);                                      // (CPY,CPY_M,CPY_K)
-
-    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(tCrA_copy_view));                                            // CPY_M
-    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCrA_copy_view));                                            // CPY_K
-    CUTE_STATIC_ASSERT_V(size<1>(tCsA_copy_view) == size<1>(tCrA_copy_view));                                  // CPY_M
-    CUTE_STATIC_ASSERT_V(size<2>(tCsA_copy_view) == size<2>(tCrA_copy_view));                                  // CPY_K
-    CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(accum));                                                     // MMA_M
-    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum));                                                         // N
-    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB));                                                          // K
-    CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB));                                                       // PIPE
-    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));                                         // PIPE
-    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));                                         // PIPE
-    CUTE_STATIC_ASSERT_V(size<2>(tCrA) > _2{}, "RS loops require more than 2 MMA k-iterations for correctness.");
-
-    //
-    // PIPELINED MAIN LOOP
-    //
-
-    // We release buffers to producer warps(dma load) with some mmas in flight
-    PipelineState smem_pipe_release = smem_pipe_read;
-
-    tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
-
-    TransposeOperandB transpose = cutlass::transform::collective::detail::make_transpose_operand_b(
-                                    warp_idx, warp_group_thread_idx, tiled_mma, SmemLayoutB{}, 
-                                    InternalSmemLayoutAtomB{}, InternalElementB{}, 
-                                    cute::bool_constant<TransposeB>{});
-
-    warpgroup_fence_operand(accum);
-    
-    ConsumerToken barrier_token = {BarrierStatus::WaitAgain};
-    // first k tile
-    {
-      barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
-      pipeline.consumer_wait(smem_pipe_read, barrier_token);
-
-      int read_stage = smem_pipe_read.index();
-
-      ++smem_pipe_read;
-      barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
-
-      // copy smem->rmem for A operand
-      copy(smem_tiled_copy_A, tCsA_copy_view(_,_,0,read_stage), tCrA_copy_view(_,_,0));
-      // transpose B operand in SMEM
-      transpose(sB, gmma_sB, read_stage, 0);
-      
-      // Unroll the K mode manually to set scale D to 1
-      CUTLASS_PRAGMA_UNROLL
-      for (int k_block = 0; k_block < size<2>(tCrA) - 1; ++k_block) {
-        copy(smem_tiled_copy_A, tCsA_copy_view(_,_,k_block + 1,read_stage), tCrA_copy_view(_,_,k_block + 1));
-        transpose.synchronize(k_block);
-        transpose(sB, gmma_sB, read_stage, k_block + 1);
-        warpgroup_arrive();
-        // (V,M) x (V,N) => (V,M,N)
-        cute::gemm(tiled_mma, tCrA(_,_,k_block), tCrB(_,_,k_block,read_stage), accum);
-        if(k_block == 0) {
-          tiled_mma.accumulate_ = GMMA::ScaleOut::One;
-        }
-        warpgroup_commit_batch();
-      }
-
-      warpgroup_wait<2>();
-      
-      warpgroup_arrive();
-      // (V,M) x (V,N) => (V,M,N)
-      cute::gemm(tiled_mma, tCrA(_,_,size<2>(tCrA) - 1), tCrB(_,_,size<2>(tCrA) - 1,read_stage), accum);
-      warpgroup_commit_batch();
-      --k_tile_count;
-      if(k_tile_count == 0) {
-        return;
-      }
-      pipeline.consumer_wait(smem_pipe_read, barrier_token);
-      copy(smem_tiled_copy_A, tCsA_copy_view(_,_,0,smem_pipe_read.index()), tCrA_copy_view(_,_,0));
-      transpose(sB, gmma_sB, smem_pipe_read.index(), 0);
-      warpgroup_wait<2>();
-    }
-
-    warpgroup_fence_operand(accum);
-    // Mainloop GMMAs
-    CUTLASS_PRAGMA_NO_UNROLL
-    for ( ; k_tile_count > 1; --k_tile_count) {
-
-      //
-      // Compute on k_tile
-      //
-
-      int read_stage = smem_pipe_read.index();
-      ++smem_pipe_read;
-
-      warpgroup_fence_operand(accum);
-      // Unroll the K mode manually to set scale D to 1
-      CUTLASS_PRAGMA_UNROLL
-      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
-        if (k_block == 0) {
-          barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
-        }
-        if (k_block == size<2>(tCrA) - 1) {
-          pipeline.consumer_wait(smem_pipe_read, barrier_token);
-          copy(smem_tiled_copy_A, tCsA_copy_view(_,_,0,smem_pipe_read.index()), tCrA_copy_view(_,_,0));
-          // transpose B operand in SMEM
-          transpose(sB, gmma_sB, smem_pipe_read.index(), 0);
-        } 
-        else {
-          copy(smem_tiled_copy_A, tCsA_copy_view(_,_,k_block + 1,read_stage), tCrA_copy_view(_,_,k_block + 1));
-          // transpose B operand in SMEM
-          transpose.synchronize(k_block);                                      // make transpose of k_block available
-          transpose(sB, gmma_sB, read_stage, k_block + 1);
-        }
-        
-        warpgroup_arrive();
-        // (V,M) x (V,N) => (V,M,N)
-        cute::gemm(tiled_mma, tCrA(_,_,k_block), tCrB(_,_,k_block,read_stage), accum);
-        warpgroup_commit_batch();
-        warpgroup_wait<2>();
-        if (k_block == 1) {
-          // release prior barrier
-          pipeline.consumer_release(smem_pipe_release);             // UNLOCK smem_pipe_release, done _computing_ on it
-          ++smem_pipe_release;
-        }
-      }
-      warpgroup_fence_operand(accum);
-
-    }
-
-    warpgroup_fence_operand(accum);
-
-    {
-      //
-      // Compute on k_tile
-      //
-
-      int read_stage = smem_pipe_read.index();
-
-      warpgroup_fence_operand(accum);
-      
-      // Unroll the K mode manually to set scale D to 1
-      CUTLASS_PRAGMA_UNROLL
-      for (int k_block = 0; k_block < size<2>(tCrA) - 1; ++k_block) {
-        copy(smem_tiled_copy_A, tCsA_copy_view(_,_,k_block + 1,read_stage), tCrA_copy_view(_,_,k_block + 1));
-        transpose.synchronize(k_block);                                           // make k_block transpose available
-        transpose(sB, gmma_sB, read_stage, k_block + 1);
-        warpgroup_arrive();
-        // (V,M) x (V,N) => (V,M,N)
-        cute::gemm(tiled_mma, tCrA(_,_,k_block), tCrB(_,_,k_block,read_stage), accum);
-        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
-        warpgroup_commit_batch();
-        warpgroup_wait<2>();
-        if (k_block == 1) {
-          // release prior barrier
-          pipeline.consumer_release(smem_pipe_release);             // UNLOCK smem_pipe_release, done _computing_ on it
-          ++smem_pipe_release;
-        }
-      }
-      
-      warpgroup_arrive();
-      // (V,M) x (V,N) => (V,M,N)
-      cute::gemm(tiled_mma, tCrA(_,_,size<2>(tCrA) - 1), tCrB(_,_,size<2>(tCrA) - 1,read_stage), accum);
-      warpgroup_commit_batch();
-    }
-
-    warpgroup_fence_operand(accum);
-  }
-  
-  /// Perform a Consumer Epilogue to release all buffers
-  CUTLASS_DEVICE void
-  mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) {
-    // Prologue GMMAs
-    int prologue_mma_count = 1;
-    k_tile_count -= prologue_mma_count;
-
-    smem_pipe_release.advance(k_tile_count);
-    
-    // Wait on all GMMAs to complete
-    warpgroup_wait<0>();
-
-    for (int count = 0; count < prologue_mma_count; ++count) {
-      pipeline.consumer_release(smem_pipe_release);                 // UNLOCK smem_pipe_release, done _computing_ on it
-      ++smem_pipe_release;
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::gemm::collective
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized_mixed_input.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized_mixed_input.hpp
deleted file mode 100755
index a3efc67e8..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized_mixed_input.hpp
+++ /dev/null
@@ -1,1560 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/detail/dependent_false.hpp"
-#include "cutlass/gemm/dispatch_policy.hpp"
-#include "cutlass/numeric_types.h"
-#include "cutlass/detail/layout.hpp"
-#include "cutlass/pipeline/pipeline.hpp"
-#include "cutlass/transform/collective/sm90_wgmma_transpose.hpp"
-#include "cutlass/pipeline/pipeline.hpp"
-#include "cutlass/trace.h"
-#include "cutlass/detail/collective.hpp"
-
-#include "cute/arch/cluster_sm90.hpp"
-#include "cute/arch/copy_sm90.hpp"
-#include "cute/algorithm/functional.hpp"
-#include "cute/atom/mma_atom.hpp"
-#include "cute/atom/copy_traits_sm90_tma.hpp"
-#include "cute/algorithm/gemm.hpp"
-#include "cute/tensor_predicate.hpp"
-#include "cute/numeric/arithmetic_tuple.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm::collective {
-using namespace cute;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// WarpSpecialized Mainloop that source A operand from registers
-template <
-  int Stages,
-  class ClusterShape,
-  class KernelSchedule,
-  class TileShape_,
-  class ElementAOptionalTuple,
-  class StrideA_,
-  class ElementBOptionalTuple,
-  class StrideB_,
-  class TiledMma_,
-  class GmemTiledCopyA_,
-  class SmemLayoutAtomA_,
-  class SmemCopyAtomA_,
-  class TransformA_,
-  class GmemTiledCopyB_,
-  class SmemLayoutAtomB_,
-  class SmemCopyAtomB_,
-  class TransformB_>
-struct CollectiveMma<
-    MainloopSm90TmaGmmaRmemAWarpSpecializedMixedInput<Stages, ClusterShape, KernelSchedule>,
-    TileShape_,
-    ElementAOptionalTuple,
-    StrideA_,
-    ElementBOptionalTuple,
-    StrideB_,
-    TiledMma_,
-    GmemTiledCopyA_,
-    SmemLayoutAtomA_,
-    SmemCopyAtomA_,
-    TransformA_,
-    GmemTiledCopyB_,
-    SmemLayoutAtomB_,
-    SmemCopyAtomB_,
-    TransformB_>
-{
-private:
-  template <class PointerType>
-  static constexpr auto
-  get_logical_ptr(PointerType const* ptr) {
-    if constexpr (cute::sizeof_bits_v<PointerType> < 8) {
-      return subbyte_iterator<PointerType const>(ptr);
-    }
-    else {  
-      return ptr;
-    }
-  }
-
-  enum class ConversionMode {
-    DirectConvert,
-    ConvertAndScale,
-    ConvertAndScaleWithZero
-  };
-
-  using ScaleA = detail::deduce_mixed_width_dtype_t<1, ElementAOptionalTuple>;
-  using ScaleB = detail::deduce_mixed_width_dtype_t<1, ElementBOptionalTuple>;
-  using ZeroA = detail::deduce_mixed_width_dtype_t<2, ElementAOptionalTuple>;
-  using ZeroB = detail::deduce_mixed_width_dtype_t<2, ElementBOptionalTuple>;
-
-public:
-  //
-  // Type Aliases
-  //
-  using DispatchPolicy = MainloopSm90TmaGmmaRmemAWarpSpecializedMixedInput<Stages, ClusterShape, KernelSchedule>;
-  using TileShape = TileShape_;
-
-  static_assert(cute::is_tuple<ElementAOptionalTuple>::value ^ cute::is_tuple<ElementBOptionalTuple>::value, 
-    "Either A OR B must be a tuple. It must take the from {ElementOperand, [ElementScale],"
-    "[ElementZero]}. Inputs in [] are optional.");
-
-  using ElementA = detail::deduce_mixed_width_dtype_t<0, ElementAOptionalTuple>;
-  using ElementB = detail::deduce_mixed_width_dtype_t<0, ElementBOptionalTuple>;
-  static constexpr bool IsATransformed = cute::is_tuple<ElementAOptionalTuple>::value;
-  using ElementScale = cute::conditional_t<IsATransformed, ScaleA, ScaleB>;
-  using ElementZero = cute::conditional_t<IsATransformed, ZeroA, ZeroB>;
-  // For cases where we can't have a void type, we can use this to allow the code to compile when the scale / zero is void.
-  using NonVoidElementScale = cute::conditional_t<cute::is_void_v<ElementScale>, float, ElementScale>;
-  using NonVoidElementZero = cute::conditional_t<cute::is_void_v<ElementZero>, float, ElementZero>;
-
-  using StrideA = StrideA_;
-  using StrideB = StrideB_;
-  // These are always MN major
-  using StrideScale = cute::Stride<cute::Int<1>, int64_t, int64_t>;
-  // For cases where we can't have a void scale, we can use this to allow the code to compile when the scale is void.
-  using NonVoidStrideScale = cute::conditional_t<
-      cute::is_void_v<StrideScale>, cute::Stride<_1, int64_t, int64_t>, StrideScale>;
-
-  static_assert((IsATransformed && cutlass::gemm::detail::is_k_major<StrideA>()) || 
-                (!IsATransformed && cutlass::gemm::detail::is_k_major<StrideB>()),
-                "The transformed type must be K-major.");
-
-  static_assert(( IsATransformed && (sizeof(ElementB) == 2)) ||
-                (!IsATransformed && (sizeof(ElementA) == 2)) ||
-                (cutlass::gemm::detail::is_k_major<StrideA>() && 
-                 cutlass::gemm::detail::is_k_major<StrideB>()), 
-                "The unscaled element must be 2 bytes OR both inputs must be K-major");
-
-  static_assert(cutlass::gemm::detail::is_mn_major<NonVoidStrideScale>(), 
-    "Scale must be MN major [Col Major if A is scaled, Row Major if B is scaled].");
-
-  using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{}));
-
-  using TiledMma = TiledMma_;
-  using ElementAccumulator = typename TiledMma::ValTypeC;
-
-  using GmemTiledCopyA = GmemTiledCopyA_;
-  using GmemTiledCopyB = GmemTiledCopyB_;
-  using GmemTiledCopyScale = cute::SM90_TMA_LOAD;
-
-  using SmemLayoutAtomA = SmemLayoutAtomA_;
-  using SmemLayoutAtomB = SmemLayoutAtomB_;
-  // Scale layout atom set after swapping.
-
-  using SmemCopyAtomA = SmemCopyAtomA_;
-  using SmemCopyAtomB = SmemCopyAtomB_;
-  using SmemCopyAtomScale = Copy_Atom<cute::AutoVectorizingCopy, NonVoidElementScale>;
-
-  // We must ensure the type to be scaled goes to RF
-  static constexpr bool SwapAB = !IsATransformed;
-  using InternalSmemLayoutAtomA = cute::conditional_t<!SwapAB, SmemLayoutAtomA, SmemLayoutAtomB>;
-  using InternalSmemLayoutAtomB = cute::conditional_t<!SwapAB, SmemLayoutAtomB, SmemLayoutAtomA>;
-  using InternalSmemCopyAtomA   = cute::conditional_t<!SwapAB, SmemCopyAtomA, SmemCopyAtomB>;
-  using InternalSmemCopyAtomB   = cute::conditional_t<!SwapAB, SmemCopyAtomB, SmemCopyAtomA>;
-  
-  // TMA converts f32 input to tf32 when copying from GMEM to SMEM
-  // For all other types, cast to size equivalent uint type to avoid any rounding by TMA.
-  static constexpr bool ConvertF32toTF32A = cute::is_same_v<float, ElementA>;
-  static constexpr bool ConvertF32toTF32B = cute::is_same_v<float, ElementB>;
-  using ConvertedElementA = cute::conditional_t<ConvertF32toTF32A, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementA>>>;
-  using ConvertedElementB = cute::conditional_t<ConvertF32toTF32B, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementB>>>;
-  using RealInternalElementA = cute::conditional_t<!SwapAB, ElementA, ElementB>;
-  using RealInternalElementB = cute::conditional_t<!SwapAB, ElementB, ElementA>;
-  using InternalElementA = cute::conditional_t<!SwapAB, ConvertedElementA, ConvertedElementB>;
-  using InternalElementB = cute::conditional_t<!SwapAB, ConvertedElementB, ConvertedElementA>;
-  using InternalStrideA  = cute::conditional_t<!SwapAB, StrideA, StrideB>;
-  using InternalStrideB  = cute::conditional_t<!SwapAB, StrideB, StrideA>;
-
-  using TransformA = TransformA_;
-  using TransformB = TransformB_;
-  using InternalTransformA  = cute::conditional_t<!SwapAB, TransformA, TransformB>;
-  using InternalTransformB  = cute::conditional_t<!SwapAB, TransformB, TransformA>;
-
-  static constexpr int IsSubbyteA = cute::sizeof_bits_v<InternalElementA> < 8;
-  using TmaElementA = cute::conditional_t<IsSubbyteA, uint8_t, InternalElementA>;
-  using TmaElementScale = uint_bit_t<sizeof_bits_v<NonVoidElementScale> >; // in case we have array. translating to uint to satisfy tma descriptor's specialization
-
-  using ArchTag = typename DispatchPolicy::ArchTag;
-
-  using MainloopPipeline = cutlass::PipelineTmaAsync<
-                             DispatchPolicy::Stages>;
-  using PipelineState = cutlass::PipelineState<DispatchPolicy::Stages>;
-
-  using PipelineParams = typename MainloopPipeline::Params;
-
-  using SmemLayoutAtomScale = Layout<Shape<decltype(cute::shape<0>(InternalSmemLayoutAtomA{})), cute::Int<1>>>;
-  using ScaleTileShape = decltype(make_shape(shape<0>(TileShape{}), shape<1>(SmemLayoutAtomScale{})));
-
-  static_assert(cute::rank(InternalSmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
-  static_assert((size<0>(TileShape{}) % size<0>(InternalSmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-  static_assert((size<2>(TileShape{}) % size<1>(InternalSmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-
-  static_assert(cute::rank(InternalSmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
-  static_assert((size<1>(TileShape{}) % size<0>(InternalSmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-  static_assert((size<2>(TileShape{}) % size<1>(InternalSmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-
-  static_assert(rank(SmemLayoutAtomScale{}) == 2, "SmemLayoutAtomScale must be rank 2");
-  static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomScale{})) == 0, "SmemLayoutAtomScale must equal the tile shape.");
-  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomScale{})) == 0, "SmemLayoutAtomScale must evenly divide tile k shape.");
-
-  // Tile along modes in a way that maximizes the TMA box size.
-
-  template<class LayoutAtom, class TileShape, class Stride>
-  static constexpr
-  CUTLASS_HOST_DEVICE
-  auto get_smem_layout(LayoutAtom layout_atom, TileShape const& tile_shape, Stride const& stride) {
-    if constexpr (not cute::is_layout<Stride>::value) {
-      return tile_to_shape(
-        layout_atom,
-        append(tile_shape, Int<DispatchPolicy::Stages>{}),
-        cute::conditional_t< ::cutlass::gemm::detail::is_major<0,Stride>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{});
-    }
-    else {
-      auto gmem_tile = composition(stride, tile_shape);
-      return make_layout_like(append(gmem_tile, make_layout(Int<DispatchPolicy::Stages>{}, 0)));
-    }
-  }
-
-  using SmemLayoutA = decltype(get_smem_layout(InternalSmemLayoutAtomA{}, select<0,2>(TileShape{}), InternalStrideA{}));
-  using SmemLayoutB = decltype(get_smem_layout(InternalSmemLayoutAtomB{}, select<1,2>(TileShape{}), InternalStrideB{}));
-    
-  // It is assumed that the scales and zero-points share the same smem layout
-  using SmemLayoutScale = decltype(tile_to_shape(
-    SmemLayoutAtomScale{}, 
-    make_shape(shape<0>(ScaleTileShape{}), shape<1>(ScaleTileShape{}), Int<Stages>{}),
-    cute::conditional_t< ::cutlass::gemm::detail::is_major<0,NonVoidStrideScale>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
-
-  static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 2 or more.");
-  static_assert(not cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeA>::value &&
-                    cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeB>::value,
-                "MMA atom must source A from rmem and B operand from smem_desc for this mainloop.");
-  static_assert(cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>,
-      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
-  static_assert(cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>,
-      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
-
-  // To relax them, we need to handle loading more than 1 row of scales for every main loop iteration.
-  // We must also handle updating the pipeline transaction bytes on the fly.
-  // NOTE: Deleting this assertion without required changes will cause the code to hang.
-  static_assert(size<1>(SmemLayoutAtomScale{}) == 1, "size<1>(SmemLayoutAtomScale) must be 1.");
-
-private:
-  static constexpr ConversionMode 
-  get_conversion_mode() {
-    if constexpr (cute::is_void_v<ElementScale>) {
-      return ConversionMode::DirectConvert;
-    } 
-    else if constexpr (cute::is_void_v<ElementZero>) {
-      return ConversionMode::ConvertAndScale;
-    }
-    else {
-      return ConversionMode::ConvertAndScaleWithZero;
-    }
-  }
-
-  static constexpr ConversionMode KernelConversionMode = get_conversion_mode();
-  static constexpr bool ModeHasScales = KernelConversionMode == ConversionMode::ConvertAndScale ||
-                                        KernelConversionMode == ConversionMode::ConvertAndScaleWithZero;
-  static constexpr bool UseScaleLookupTable = KernelConversionMode == ConversionMode::ConvertAndScale &&
-                                              cutlass::detail::is_Array_v<ElementScale>;
-
-  static constexpr auto
-  elements_per_smem_scale() {
-    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
-      return 0;
-    } 
-    else if constexpr (ModeHasScales) {
-      return cute::cosize_v<SmemLayoutScale>;
-    } 
-    else {
-      static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Type not handled in scale smem allocation.");
-    }
-  }
-
-  static constexpr auto
-  elements_per_smem_zero() {
-    if constexpr (KernelConversionMode == ConversionMode::DirectConvert ||
-                  KernelConversionMode == ConversionMode::ConvertAndScale ) {
-      return 0;
-    } 
-    else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
-      return cute::cosize_v<SmemLayoutScale>;
-    } 
-    else {
-      static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Type not handled in scale smem allocation.");
-    }
-  }
-
-  // These methods use some the public members of the class. For that reason, we define them after the public section.
-  static constexpr uint32_t
-  compute_tma_transaction_bytes_mk() {
-    return cutlass::bits_to_bytes(size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast<uint32_t>(cute::sizeof_bits_v<InternalElementA>));
-  }
-
-  static constexpr uint32_t
-  compute_tma_transaction_bytes_nk() {
-    return cutlass::bits_to_bytes(size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast<uint32_t>(cute::sizeof_bits_v<InternalElementB>));
-  }
-
-  static constexpr uint32_t
-  compute_tma_transaction_bytes_extra() {
-    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
-      return 0;
-    }
-    else if constexpr (ModeHasScales) {
-      constexpr uint32_t scale_tx_bytes = cutlass::bits_to_bytes(size<0>(SmemLayoutScale{}) * size<1>(SmemLayoutScale{}) * static_cast<uint32_t>(cute::sizeof_bits_v<ElementScale>));
-      static_assert(scale_tx_bytes % 128 == 0, "Each scale stage must be 128B aligned."); // required by TMA
-      if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
-        return scale_tx_bytes;
-      }
-      else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
-        // Scale and zero share smem layout
-        constexpr uint32_t zero_tx_bytes = cutlass::bits_to_bytes(size<0>(SmemLayoutScale{}) * size<1>(SmemLayoutScale{}) * static_cast<uint32_t>(cute::sizeof_bits_v<ElementZero>));
-        static_assert(zero_tx_bytes % 128 == 0, "Each zero stage must be 128B aligned."); // required by TMA
-        return scale_tx_bytes + zero_tx_bytes;
-      }
-      else {
-        static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Type not handled in tma transaction bytes computation.");
-      }
-    }
-    else {
-      static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Type not handled in tma transaction bytes computation.");
-    }
-  }
-
-public:
-  static constexpr size_t SmemAlignmentA = cutlass::detail::alignment_for_swizzle(SmemLayoutA{}); 
-
-  static constexpr size_t SmemAlignmentB = cutlass::detail::alignment_for_swizzle(SmemLayoutB{});
-
-  // Just pick the max alignment of A and B since it is required to be at least 128B
-  static constexpr size_t SmemAlignmentScale = cute::max(SmemAlignmentA, SmemAlignmentB);
-
-  static_assert(SmemAlignmentA >= 128 and SmemAlignmentB >= 128, "Require at least 128B alignment");
-
-  struct SharedStorage
-  {
-    static constexpr int scale_elements = elements_per_smem_scale();
-    static constexpr int zero_elements = elements_per_smem_zero();
-    struct TensorStorage : cute::aligned_struct<cute::max(SmemAlignmentA, SmemAlignmentB), _0> {
-      cute::ArrayEngine<RealInternalElementA, cute::cosize_v<SmemLayoutA>> smem_A;
-      cute::ArrayEngine<typename TiledMma::ValTypeB, cute::cosize_v<SmemLayoutB>> smem_B;
-      cute::ArrayEngine<NonVoidElementScale, scale_elements> smem_scale;
-      cute::ArrayEngine<NonVoidElementZero, zero_elements> smem_zero;
-    } tensors;
-
-    using PipelineStorage = typename MainloopPipeline::SharedStorage;
-    PipelineStorage pipeline;
-  };
-  using TensorStorage = typename SharedStorage::TensorStorage;
-  using PipelineStorage = typename SharedStorage::PipelineStorage;
-
-  // Host side kernel arguments
-  struct Arguments {
-    ElementA const* ptr_A = nullptr;
-    StrideA dA{};
-    ElementB const* ptr_B = nullptr;
-    StrideB dB{};
-    ElementScale const* ptr_S = nullptr;
-    NonVoidStrideScale dS{};
-    int group_size = 0;
-    ElementZero const* ptr_Z = nullptr;
-    uint32_t mma_promotion_interval = 4;
-  };
-
-  template<class Shape, class Stride>
-  static constexpr
-  CUTLASS_HOST_DEVICE
-  auto get_gmem_layout(Shape const& shape, Stride const& stride) {
-    if constexpr (not cute::is_layout<Stride>::value) {
-      return make_layout(shape, stride);
-    }
-    else {
-      return stride;
-    }
-  }
-
-  // Device side kernel params
-  struct Params {
-  private:
-    using Outer = CollectiveMma<DispatchPolicy, TileShape_, 
-                                ElementAOptionalTuple, StrideA_, 
-                                ElementBOptionalTuple, StrideB_,
-                                TiledMma_, 
-                                GmemTiledCopyA_, SmemLayoutAtomA_, SmemCopyAtomA_,
-                                TransformA_,
-                                GmemTiledCopyB_, SmemLayoutAtomB_, SmemCopyAtomB_,
-                                TransformB_>;
-
-  public:
-
-    // Assumption: StrideA is congruent with Problem_MK
-    using LayoutA = decltype(get_gmem_layout(repeat_like(InternalStrideA{}, int32_t(0)), InternalStrideA{}));
-    using LayoutB = decltype(get_gmem_layout(repeat_like(InternalStrideB{}, int32_t(0)), InternalStrideB{}));
-
-    using TMA_A = decltype(make_tma_copy_A_sm90<TmaElementA>(
-        GmemTiledCopyA{},
-        make_tensor(Outer::get_logical_ptr(static_cast<InternalElementA const*>(nullptr)), LayoutA{}),
-        SmemLayoutA{}(_,_,cute::Int<0>{}),
-        TileShape{},
-        ClusterShape{}));  // mcast along N mode for this M load, if any
-
-   using TMA_Scale = decltype(make_tma_copy<TmaElementScale>(
-        GmemTiledCopyScale{},
-        make_tensor(Outer::get_logical_ptr(static_cast<NonVoidElementScale const*>(nullptr)), repeat_like(NonVoidStrideScale{}, int32_t(0)), NonVoidStrideScale{}),
-        SmemLayoutScale{}(_,_,cute::Int<0>{}),
-        ScaleTileShape{},
-        _1{}));  // mcast along N mode for this M load, if any. Scale is ALWAYS loaded with A for RF kernel
-
-   using TMA_Zero = decltype(make_tma_copy(
-        GmemTiledCopyScale{},
-        make_tensor(Outer::get_logical_ptr(static_cast<NonVoidElementZero const*>(nullptr)), repeat_like(NonVoidStrideScale{}, int32_t(0)), NonVoidStrideScale{}),
-        SmemLayoutScale{}(_,_,cute::Int<0>{}),
-        ScaleTileShape{},
-        _1{}));  // mcast along N mode for this M load, if any. Scale is ALWAYS loaded with A for RF kernel
-
-    // Assumption: StrideB is congruent with Problem_NK
-    using TMA_B = decltype(make_tma_copy_B_sm90(
-        GmemTiledCopyB{},
-        make_tensor(Outer::get_logical_ptr(static_cast<InternalElementB const*>(nullptr)), LayoutB{}),
-        SmemLayoutB{}(_,_,cute::Int<0>{}),
-        TileShape{},
-        ClusterShape{})); // mcast along M mode for this N load, if any
-    TMA_A tma_load_a;
-    TMA_B tma_load_b;
-    TMA_Scale tma_load_scale;
-    TMA_Zero tma_load_zero;
-    int64_t scale_k;
-    int group_size;
-    uint32_t tma_transaction_bytes = TmaTransactionBytes;
-    int reload_factor = (group_size + size<2>(TileShape{}) - 1) / size<2>(TileShape{});
-    InternalStrideA dA;
-    InternalStrideB dB;
-  };
-
-  //
-  // Methods
-  //
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    (void) workspace;
-
-    // Optionally append 1s until problem shape is rank-4 (MNKL), in case it is only rank-3 (MNK)
-    auto problem_shape_MNKL = append<4>(problem_shape, 1);
-    auto [M,N,K,L] = problem_shape_MNKL;
-
-    if constexpr (SwapAB) {
-      M = get<1>(problem_shape_MNKL);
-      N = get<0>(problem_shape_MNKL);
-    }
-
-    InternalElementA const* ptr_A;
-    InternalStrideA dA;
-    InternalElementB const* ptr_B;
-    InternalStrideB dB;
-
-    if constexpr (not SwapAB) {
-      ptr_A = reinterpret_cast<InternalElementA const*>(args.ptr_A);
-      ptr_B = reinterpret_cast<InternalElementB const*>(args.ptr_B);
-      dA = args.dA;
-      dB = args.dB;
-    }
-    else {
-      ptr_A = reinterpret_cast<InternalElementA const*>(args.ptr_B);
-      ptr_B = reinterpret_cast<InternalElementB const*>(args.ptr_A);
-      dA = args.dB;
-      dB = args.dA;
-    }
-
-    Tensor tensor_a = make_tensor(get_logical_ptr(ptr_A), get_gmem_layout(make_shape(M,K,L), dA));
-    Tensor tensor_b = make_tensor(get_logical_ptr(ptr_B), get_gmem_layout(make_shape(N,K,L), dB));
-    typename Params::TMA_A tma_load_a = make_tma_copy_A_sm90<TmaElementA>(
-        GmemTiledCopyA{},
-        tensor_a,
-        SmemLayoutA{}(_,_,cute::Int<0>{}),
-        TileShape{},
-        ClusterShape{}); // mcast along N mode for this M load, if any
-
-    typename Params::TMA_B tma_load_b = make_tma_copy_B_sm90(
-        GmemTiledCopyB{},
-        tensor_b,
-        SmemLayoutB{}(_,_,cute::Int<0>{}),
-        TileShape{},
-        ClusterShape{}); // mcast along M mode for this N load, if any
-
-    typename Params::TMA_Scale tma_load_scale{};
-    typename Params::TMA_Zero tma_load_zero{};
-
-    uint32_t tma_transaction_bytes = TmaTransactionBytesMK + TmaTransactionBytesNK;
-    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
-      return { tma_load_a, tma_load_b, tma_load_scale, tma_load_zero, 0, 0, tma_transaction_bytes, 1, dA, dB };
-    } 
-    else if constexpr (ModeHasScales) {
-      auto scale_k = (K + args.group_size - 1) / args.group_size;
-      ElementScale const* ptr_S = args.ptr_S;
-      StrideScale dS = args.dS;
-      Tensor tensor_scale = make_tensor(get_logical_ptr(ptr_S), make_layout(make_shape(M,scale_k,L), dS));
-      tma_load_scale = make_tma_copy<TmaElementScale>(
-          GmemTiledCopyScale{},
-          tensor_scale,
-          SmemLayoutScale{}(_,_,cute::Int<0>{}),
-          ScaleTileShape{},
-          _1{}); // mcast along N mode for this M load, if any
-
-      if constexpr(KernelConversionMode == ConversionMode::ConvertAndScale) {
-        return { tma_load_a, tma_load_b, tma_load_scale, tma_load_zero, scale_k, args.group_size, tma_transaction_bytes + TmaTransactionBytesExtra, (args.group_size + size<2>(TileShape{}) - 1) / size<2>(TileShape{}), dA, dB };
-      }
-      else if constexpr(KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
-        Tensor tensor_zero = make_tensor(get_logical_ptr(args.ptr_Z), make_layout(make_shape(M,scale_k,L), dS));
-        tma_load_zero = make_tma_copy(
-            GmemTiledCopyScale{},
-            tensor_zero,
-            SmemLayoutScale{}(_,_,cute::Int<0>{}),
-            ScaleTileShape{},
-            _1{}); // mcast along N mode for this M load, if any
-        return { tma_load_a, tma_load_b, tma_load_scale, tma_load_zero, scale_k, args.group_size, tma_transaction_bytes + TmaTransactionBytesExtra, (args.group_size + size<2>(TileShape{}) - 1) / size<2>(TileShape{}), dA, dB };
-      } else {
-        static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in to_underlying_arguments.");
-      }
-    } 
-    else {
-      static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in to_underlying_arguments.");
-    }
-  }
-
-  template<class ProblemShape>
-  static bool
-  can_implement(
-      ProblemShape const& problem_shape,
-      [[maybe_unused]] Arguments const& args) {
-    constexpr int tma_alignment_bits = 128;
-    auto problem_shape_MNKL = append<4>(problem_shape, 1);
-    auto [M,N,K,L] = problem_shape_MNKL;
-
-    constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
-    bool check_aligned_A = cutlass::detail::check_alignment<min_tma_aligned_elements_A>(get_gmem_layout(cute::make_shape(M,K,L), args.dA));
-
-    constexpr int min_tma_aligned_elements_B = tma_alignment_bits / cutlass::sizeof_bits<ElementB>::value;
-    bool check_aligned_B = cutlass::detail::check_alignment<min_tma_aligned_elements_B>(get_gmem_layout(cute::make_shape(N,K,L), args.dB));
-
-    bool check_aligned_S = true;
-    bool check_aligned_Z = true;
-    bool check_mode_args = true;
-
-    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
-      check_mode_args = check_mode_args && (args.ptr_S == nullptr);
-      check_mode_args = check_mode_args && (args.ptr_Z == nullptr);
-    } 
-    else if constexpr (ModeHasScales) {
-      const int scale_mn = SwapAB ? N : M;
-      const int scale_k = (K + args.group_size - 1) / args.group_size;
-      constexpr int min_tma_aligned_elements_scale = tma_alignment_bits / cutlass::sizeof_bits<ElementScale>::value;
-      check_aligned_S = cutlass::detail::check_alignment<min_tma_aligned_elements_scale>(cute::make_shape(scale_mn,scale_k,L), args.dS);
-      check_mode_args = check_mode_args && (args.group_size == K || ((args.group_size % size<2>(TileShape{})) == 0));
-      check_mode_args = check_mode_args && args.group_size != 0;
-      check_mode_args = check_mode_args && (args.ptr_S != nullptr);
-
-      if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
-        check_mode_args = check_mode_args && (args.ptr_Z == nullptr);
-      }
-      else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
-        constexpr int min_tma_aligned_elements_zero = tma_alignment_bits / cutlass::sizeof_bits<ElementZero>::value;
-        check_aligned_Z = cutlass::detail::check_alignment<min_tma_aligned_elements_zero>(cute::make_shape(scale_mn,scale_k,L), args.dS);
-        check_mode_args = check_mode_args && (args.ptr_Z != nullptr);
-      } 
-      else {
-        static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in can_implement.");
-      }
-    }
-    else {
-      static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in can_implement.");
-    }
-
-    if (!check_mode_args) {
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Invalid arguments for the selected conversion mode.\n");
-    }
-    if (!check_aligned_A) {
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Tensor A meet the minimum alignment requirements for TMA.\n");
-    }
-    if (!check_aligned_B) {
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Tensor B meet the minimum alignment requirements for TMA.\n");
-    }
-    if (!check_aligned_S) {
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Tensor S (scale) meet the minimum alignment requirements for TMA.\n");
-    }
-    if (!check_aligned_Z) {
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Tensor Z (zeros) meet the minimum alignment requirements for TMA.\n");
-    }
-
-    return check_mode_args && check_aligned_A && check_aligned_B && check_aligned_S && check_aligned_Z;
-  }
-
-  static constexpr int K_PIPE_MAX = DispatchPolicy::Stages;
-  static constexpr uint32_t TmaTransactionBytesMK = compute_tma_transaction_bytes_mk();
-  static constexpr uint32_t TmaTransactionBytesNK = compute_tma_transaction_bytes_nk();
-  static constexpr uint32_t TmaTransactionBytesExtra = compute_tma_transaction_bytes_extra();
-  static constexpr uint32_t TmaTransactionBytes = TmaTransactionBytesMK + TmaTransactionBytesNK + TmaTransactionBytesExtra;
-
-  /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance
-  CUTLASS_DEVICE
-  static void prefetch_tma_descriptors(Params const& mainloop_params) {
-    cute::prefetch_tma_descriptor(mainloop_params.tma_load_a.get_tma_descriptor());
-    cute::prefetch_tma_descriptor(mainloop_params.tma_load_b.get_tma_descriptor());
-
-    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
-      // Nothing extra to do
-    } 
-    else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
-      cute::prefetch_tma_descriptor(mainloop_params.tma_load_scale.get_tma_descriptor());
-    }
-    else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
-      cute::prefetch_tma_descriptor(mainloop_params.tma_load_scale.get_tma_descriptor());
-      cute::prefetch_tma_descriptor(mainloop_params.tma_load_zero.get_tma_descriptor());
-    }  
-    else {
-      static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in TMA prefetch.");
-    }
-    
-  }
-
-  /// Set up the data needed by this collective for load and mma.
-  /// Returns a tuple of tensors. The collective and the kernel layer have the contract
-  /// Returned tuple must contain at least two elements, with the first two elements being:
-  /// gA_mkl - The tma tensor, A after a local tile so it has shape  (BLK_M,BLK_K,m,k,l)
-  /// gB_nkl - The tma tensor, B after a local tile so it has shape  (BLK_N,BLK_K,n,k,l)
-  /// The rest of the tensors can be specified as needed by this collective.
-  template <class ProblemShape_MNKL>
-  CUTLASS_DEVICE auto
-  load_init(ProblemShape_MNKL const& problem_shape_MNKL, Params const& mainloop_params) const {
-    using X = Underscore;
-    // Separate out problem shape for convenience
-    auto [M,N,K,L] = problem_shape_MNKL;
-
-    // TMA requires special handling of strides to deal with coord codomain mapping
-    // Represent the full tensors -- get these from TMA
-    Tensor mA_mkl = mainloop_params.tma_load_a.get_tma_tensor(shape(get_gmem_layout(make_shape(M,K,L), mainloop_params.dA))); // (m,k,l)
-    Tensor mB_nkl = mainloop_params.tma_load_b.get_tma_tensor(shape(get_gmem_layout(make_shape(N,K,L), mainloop_params.dB))); // (n,k,l)
-
-    // Make tiled views, defer the slice
-    Tensor gA_mkl = local_tile(mA_mkl, TileShape{}, make_coord(_,_,_), Step<_1, X,_1>{});        // (BLK_M,BLK_K,m,k,l)
-    Tensor gB_nkl = local_tile(mB_nkl, TileShape{}, make_coord(_,_,_), Step< X,_1,_1>{});        // (BLK_N,BLK_K,n,k,l)
-
-    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
-      return cute::make_tuple(gA_mkl, gB_nkl);
-    } 
-    else if constexpr (ModeHasScales) {
-      auto scale_k = mainloop_params.scale_k;
-      Tensor mS_mkl = mainloop_params.tma_load_scale.get_tma_tensor(make_shape(M,scale_k,L));          // (m,scale_k,l)
-      Tensor gS_mkl = local_tile(mS_mkl, ScaleTileShape{}, make_coord(_,_));         // (BLK_M,BLK_Scale_K,m,scale_k,l)
-      if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
-        return cute::make_tuple(gA_mkl, gB_nkl, gS_mkl);
-      }
-      else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
-        Tensor mZ_mkl = mainloop_params.tma_load_zero.get_tma_tensor(make_shape(M,scale_k,L));         // (m,scale_k,l)
-        Tensor gZ_mkl = local_tile(mZ_mkl, ScaleTileShape{}, make_coord(_,_));       // (BLK_M,BLK_Scale_K,m,scale_k,l)
-        return cute::make_tuple(gA_mkl, gB_nkl, gS_mkl, gZ_mkl);
-      }
-      else {
-        static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in load_init.");
-      }
-    } 
-    else {
-      static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in load_init.");
-    }
-  }  
-
-  /// Perform a collective-scoped matrix multiply-accumulate
-  /// Producer Perspective
-  /// This overload gets triggered when we have scales.
-  template <
-    class... Ts,
-    class KTileIterator, class BlockCoord
-  >
-  CUTLASS_DEVICE void
-  load(
-      Params const& mainloop_params,
-      MainloopPipeline pipeline, 
-      PipelineState smem_pipe_write,
-      cute::tuple<Ts...> const& load_inputs,
-      BlockCoord const& blk_coord,
-      KTileIterator k_tile_iter, int k_tile_count,
-      int thread_idx,
-      uint32_t block_rank_in_cluster,
-      TensorStorage& shared_tensors) {
-    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
-      static_assert(sizeof... (Ts) == 2, "Direct convert needs two inputs");
-    } 
-    else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
-      static_assert(sizeof... (Ts) == 3, "Scaled convert needs three inputs");
-    } 
-    else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
-      static_assert(sizeof... (Ts) == 4, "Scaled and zero convert needs four inputs");
-    } 
-    else {
-      static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in TMA load.");
-    }
-
-    Tensor sA_ = make_tensor(make_smem_ptr(shared_tensors.smem_A.begin()), SmemLayoutA{});      // (BLK_M,BLK_K,PIPE)
-    Tensor sB_ = make_tensor(make_smem_ptr(shared_tensors.smem_B.begin()), SmemLayoutB{});      // (BLK_N,BLK_K,PIPE)
-    Tensor sA  = as_position_independent_swizzle_tensor(sA_);                                   // (BLK_M,BLK_K,PIPE)
-    Tensor sB  = as_position_independent_swizzle_tensor(sB_);                                   // (BLK_N,BLK_K,PIPE)
-
-    //
-    // Prepare the TMA loads for A, B and Scales
-    //
-    
-    constexpr uint32_t cluster_shape_x = get<0>(ClusterShape());
-    uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x};
-
-    Tensor gA_mkl = get<0>(load_inputs);
-    Tensor gB_nkl = get<1>(load_inputs);
-
-    auto block_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y);
-    auto block_tma_b = mainloop_params.tma_load_b.get_slice(cluster_local_block_id.x);
-
-    // Partition the inputs based on the current block coordinates.
-    auto [m_coord, n_coord, k_coord, l_coord] = blk_coord;
-    Tensor gA = gA_mkl(_,_,m_coord,_,l_coord);                                                     // (BLK_M,BLK_K,k)
-    Tensor gB = gB_nkl(_,_,n_coord,_,l_coord);                                                     // (BLK_N,BLK_K,k)
-
-    // Applies the mapping from block_tma_a
-    Tensor tAgA = block_tma_a.partition_S(gA);                                                 // (TMA,TMA_M,TMA_K,k)
-    Tensor tAsA = block_tma_a.partition_D(sA);                                              // (TMA,TMA_M,TMA_K,PIPE)
-
-    Tensor tBgB = block_tma_b.partition_S(gB);                                                 // (TMA,TMA_N,TMA_K,k)
-    Tensor tBsB = block_tma_b.partition_D(sB);                                              // (TMA,TMA_N,TMA_K,PIPE)
-
-    uint16_t mcast_mask_a = 0;
-    uint16_t mcast_mask_b = 0;
-    uint16_t mcast_mask_s = 0;
-
-    // Issue TmaLoads
-    // Maps the tile -> block, value
-    if constexpr (cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>) {
-      auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};                       // (m,n) -> block_id
-      for (int n = 0; n < size<1>(block_layout); ++n) {
-        mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{}));
-      }
-    }
-
-    if constexpr (cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>) {
-      auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};                       // (m,n) -> block_id
-      for (int m = 0; m < size<0>(block_layout); ++m) {
-        mcast_mask_b |= (uint16_t(1) << block_layout(m,cluster_local_block_id.y,Int<0>{}));
-      }
-    }
-
-    auto extra_input_partitions = partition_extra_tma_inputs(mainloop_params, load_inputs, shared_tensors, cluster_local_block_id, m_coord, l_coord);
-
-    // Mainloop
-    CUTLASS_PRAGMA_NO_UNROLL
-    for ( ; k_tile_count > 0; --k_tile_count) {
-      // LOCK smem_pipe_write for _writing_
-      pipeline.producer_acquire(smem_pipe_write);
-
-      //
-      // Copy gmem to smem for *k_tile_iter
-      //
-
-      using BarrierType = typename MainloopPipeline::ProducerBarrierType;
-      BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write);
-
-      int write_stage = smem_pipe_write.index();
-      if (cute::elect_one_sync()) {
-        copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage));
-        copy(mainloop_params.tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage));
-      }
-
-      if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
-        // Nothing extra to do.
-      }
-      else if constexpr (ModeHasScales) {
-        auto tSgS = get<0>(extra_input_partitions);
-        auto tSsS = get<1>(extra_input_partitions);
-
-        // Temporary factor which will determine which k tile to reload from gmem. Needed so we don't modify tma transaction bytes
-        // on the fly.
-        // We must do a ceiling divide here to correctly handle with group_size == K. In that case, we don't require that K
-        // is a multiple of the threadblock tile K
-        int const scale_load_k = *k_tile_iter / mainloop_params.reload_factor; // This will always be 0 when group_size == K.
-        if (cute::elect_one_sync()) copy(mainloop_params.tma_load_scale.with(*tma_barrier, mcast_mask_s), tSgS(_,_,_,scale_load_k), tSsS(_,_,_,write_stage));
-
-        if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
-          // Nothing extra to do
-        } 
-        else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
-          auto tZgZ = get<2>(extra_input_partitions);
-          auto tZsZ = get<3>(extra_input_partitions);
-          if (cute::elect_one_sync()) copy(mainloop_params.tma_load_zero.with(*tma_barrier, mcast_mask_s), tZgZ(_,_,_,scale_load_k), tZsZ(_,_,_,write_stage));
-        }
-        else {
-          static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled for TMA copy op.");
-        } 
-      } 
-      else {
-        static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled for TMA copy op.");
-      }
-
-      ++k_tile_iter;
-
-      // Advance smem_pipe_write
-      ++smem_pipe_write;
-    }
-  }
-
-  /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster
-  CUTLASS_DEVICE void
-  load_tail(MainloopPipeline pipeline, PipelineState smem_pipe_write) {
-    // Issue the epilogue waits
-    if (cute::elect_one_sync()) {
-      /* This helps avoid early exit of blocks in Cluster
-       * Waits for all stages to either be released (all 
-       * Consumer UNLOCKs), or if the stage was never used
-       * then would just be acquired since the phase was 
-       * still inverted from make_producer_start_state
-       */
-      pipeline.producer_tail(smem_pipe_write);
-    }
-  }
-
-  /// Perform a collective-scoped matrix multiply-accumulate
-  /// Consumer Perspective
-  template <
-    class FrgTensorC
-  >
-  CUTLASS_DEVICE void
-  mma(MainloopPipeline pipeline,
-      PipelineState smem_pipe_read,
-      FrgTensorC& accum,
-      int k_tile_count,
-      int thread_idx,
-      TensorStorage& shared_tensors,
-      Params const& mainloop_params) {
-    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
-    static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3.");
-    static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3.");
-    static_assert(cute::rank(InternalSmemLayoutAtomA{}) == 2, "InternalSmemLayoutAtomA must be rank 2.");
-    static_assert(cute::rank(InternalSmemLayoutAtomB{}) == 2, "InternalSmemLayoutAtomB must be rank 2.");
-    static_assert(!cute::is_void_v<InternalSmemCopyAtomA>,
-      "SM90 GMMA mainloops must specify a non-void copy atom for RF sourced instructions.");
-    static_assert(cute::is_void_v<InternalSmemCopyAtomB>,
-      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
-
-    // Obtain warp index
-    int warp_idx = canonical_warp_idx_sync();
-    [[maybe_unused]] int warp_group_thread_idx = thread_idx % 128;
-    
-    Tensor sA_ = make_tensor(make_smem_ptr(shared_tensors.smem_A.begin()), SmemLayoutA{});        // (BLK_M,BLK_K,PIPE)
-    Tensor sA = as_position_independent_swizzle_tensor(sA_);                                      // (BLK_M,BLK_K,PIPE)
-    
-    Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.begin()), SmemLayoutB{});         // (BLK_N,BLK_K,PIPE)
-
-    //
-    // Define C accumulators and A/B partitioning
-    //
-
-    // Layout of warp group to thread mapping
-
-    static_assert(stride<0>(typename TiledMma::BLayout{}) == 0 and
-                  size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup, 
-                  "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
-
-    constexpr int MmaWarpGroups = size(TiledMma{}) / NumThreadsPerWarpGroup;
-    Layout warp_group_thread_layout = make_layout(Int<MmaWarpGroups>{}, 
-                                                  Int<NumThreadsPerWarpGroup>{});
-
-    int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / NumThreadsPerWarpGroup, 0);
-
-    TiledMma tiled_mma;
-    auto mma_thread_slice = tiled_mma.get_thread_slice(thread_idx);
-    Tensor tCsA = mma_thread_slice.partition_A(sA);
-    auto mma_warpgroup_slice = tiled_mma.get_slice(warp_group_thread_layout(warp_group_idx));
-
-    // Allocate fragments and descriptors
-    Tensor tCrA_mma = mma_thread_slice.partition_fragment_A(sA(_,_,Int<0>{}));                // (MMA,MMA_M,MMA_K,PIPE)
-    Tensor tCrA_load = make_fragment_like<RealInternalElementA>(tCrA_mma);
-    
-    Tensor tCsB = mma_warpgroup_slice.partition_B(sB);                                        // (MMA,MMA_N,MMA_K,PIPE)
-    Tensor tCrB = mma_warpgroup_slice.make_fragment_B(tCsB);                                  // (MMA,MMA_N,MMA_K,PIPE)
-
-    //
-    // Copy Atom A retiling
-    //
-    auto smem_tiled_copy_A = make_tiled_copy_A(InternalSmemCopyAtomA{}, tiled_mma);
-    auto smem_thr_copy_A   = smem_tiled_copy_A.get_thread_slice(warp_group_thread_idx);
-
-    Tensor tCrA_copy_view  = smem_thr_copy_A.retile_D(tCrA_load);                                  // (CPY,CPY_M,CPY_K)
-
-    // Partition of thread -> shared and thread -> RF
-    auto partitioned_extra_info = partition_extra_mma_info(mma_thread_slice, shared_tensors);
-    auto copy_partitions_extra_info = retile_extra_mma_info(tiled_mma, partitioned_extra_info, warp_group_thread_idx);
-
-    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(tCrA_copy_view));                                            // CPY_M
-    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCrA_copy_view));                                            // CPY_K
-    CUTE_STATIC_ASSERT_V(size<1>(tCrA_mma) == size<1>(accum));                                                 // MMA_M
-    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum));                                                         // N
-    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB));                                                          // K
-    CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB));                                                       // PIPE
-    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));                                         // PIPE
-    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));                                         // PIPE
-
-    //
-    // PIPELINED MAIN LOOP
-    //
-
-    // We release buffers to producer warps(dma load) with some mmas in flight
-    PipelineState smem_pipe_release = smem_pipe_read;
-
-    tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
-
-    warpgroup_fence_operand(accum);
-
-    constexpr int K_BLOCK_MAX = size<2>(tCrA_load);
-    
-    ConsumerToken barrier_token = {BarrierStatus::WaitAgain};
-    // first k tile
-    {
-      barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
-      pipeline.consumer_wait(smem_pipe_read, barrier_token);
-
-      int read_stage = smem_pipe_read.index();
-
-      ++smem_pipe_read;
-      barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
-
-      // copy smem->rmem for A operand
-      copy_A_and_extra_info(smem_tiled_copy_A, tCsA, tCrA_copy_view, 
-        partitioned_extra_info, copy_partitions_extra_info, 0, read_stage);
-      if (K_BLOCK_MAX > 1) { // prefetch next block
-        copy_A_and_extra_info(smem_tiled_copy_A, tCsA, tCrA_copy_view, 
-          partitioned_extra_info, copy_partitions_extra_info, 1, read_stage);
-      }
-      transform_A_kblock(tCrA_load, tCrA_mma, partitioned_extra_info, 0);
-      
-      // Unroll the K mode manually to set scale D to 1
-      CUTLASS_PRAGMA_UNROLL
-      for (int k_block = 0; k_block < K_BLOCK_MAX; ++k_block) {
-        warpgroup_arrive();
-        // (V,M) x (V,N) => (V,M,N)
-        cute::gemm(tiled_mma, tCrA_mma(_,_,k_block), tCrB(_,_,k_block,read_stage), accum);
-        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
-        warpgroup_commit_batch();
-
-        if (k_block < K_BLOCK_MAX - 2) { // prefetch next block
-          copy_A_and_extra_info(smem_tiled_copy_A, tCsA, tCrA_copy_view, 
-            partitioned_extra_info, copy_partitions_extra_info, k_block + 2, read_stage);
-        }
-        if (k_block < K_BLOCK_MAX - 1) {
-          transform_A_kblock(tCrA_load, tCrA_mma, partitioned_extra_info, k_block + 1);
-        }
-      }     
-
-      --k_tile_count;
-      if (k_tile_count > 0) {
-        // Wait for K_BLOCK_MAX - 1 to be in flight to ensure that it is safe to overwrite the A registers for the first mma.
-        pipeline.consumer_wait(smem_pipe_read, barrier_token);
-        copy_A_and_extra_info(smem_tiled_copy_A, tCsA, tCrA_copy_view, 
-          partitioned_extra_info, copy_partitions_extra_info, 0, smem_pipe_read.index());
-        if (K_BLOCK_MAX > 1) { // prefetch next block
-          copy_A_and_extra_info(smem_tiled_copy_A, tCsA, tCrA_copy_view, 
-            partitioned_extra_info, copy_partitions_extra_info, 1, smem_pipe_read.index());
-        }
-        warpgroup_wait<K_BLOCK_MAX - 1>(); 
-        transform_A_kblock(tCrA_load, tCrA_mma, partitioned_extra_info, 0);
-      }
-    }
-
-    if (k_tile_count == 0) {
-      return;
-    }
-
-    warpgroup_fence_operand(accum);
-    // Mainloop GMMAs
-    CUTLASS_PRAGMA_NO_UNROLL
-    for ( ; k_tile_count > 1; --k_tile_count) {
-
-      //
-      // Compute on k_tile
-      //
-
-      int read_stage = smem_pipe_read.index();
-      ++smem_pipe_read;
-
-      warpgroup_fence_operand(accum);
-      // Unroll the K mode manually to set scale D to 1
-      CUTLASS_PRAGMA_UNROLL
-      for (int k_block = 0; k_block < K_BLOCK_MAX; ++k_block) {
-        
-        warpgroup_arrive();
-        // (V,M) x (V,N) => (V,M,N)
-        cute::gemm(tiled_mma, tCrA_mma(_,_,k_block), tCrB(_,_,k_block,read_stage), accum);
-        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
-        warpgroup_commit_batch();
-
-        warpgroup_wait<K_BLOCK_MAX - 1>(); // We have K_BLOCK_MAX - 1 GMMA instructions pending for this stage, so we can release prior barrier
-        if (k_block == K_BLOCK_MAX - 1) {
-          pipeline.consumer_release(smem_pipe_release);             // UNLOCK smem_pipe_release, done _computing_ on it
-          ++smem_pipe_release;
-        }
-
-        if (k_block == 0) {
-          barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
-        }
-
-        if (k_block == K_BLOCK_MAX - 1) { 
-          pipeline.consumer_wait(smem_pipe_read, barrier_token);
-          copy_A_and_extra_info(smem_tiled_copy_A, tCsA, tCrA_copy_view, 
-            partitioned_extra_info, copy_partitions_extra_info, 0, smem_pipe_read.index());
-          if (K_BLOCK_MAX > 1) { // prefetch next block
-            copy_A_and_extra_info(smem_tiled_copy_A, tCsA, tCrA_copy_view, 
-              partitioned_extra_info, copy_partitions_extra_info, 1, smem_pipe_read.index());
-          }
-          transform_A_kblock(tCrA_load, tCrA_mma, partitioned_extra_info, 0);
-        } 
-        else {
-          if (k_block < K_BLOCK_MAX - 2) { // prefetch next block
-            copy_A_and_extra_info(smem_tiled_copy_A, tCsA, tCrA_copy_view, 
-              partitioned_extra_info, copy_partitions_extra_info, k_block + 2, read_stage);
-          }
-          transform_A_kblock(tCrA_load, tCrA_mma, partitioned_extra_info, k_block + 1);
-        }
-      }
-      warpgroup_fence_operand(accum);
-
-    }
-
-    warpgroup_fence_operand(accum);
-
-    {
-      //
-      // Compute on k_tile
-      //
-
-      int read_stage = smem_pipe_read.index();
-
-      warpgroup_fence_operand(accum);
-      
-      // Unroll the K mode manually to set scale D to 1
-      CUTLASS_PRAGMA_UNROLL
-      for (int k_block = 0; k_block < K_BLOCK_MAX; ++k_block) {
-
-        warpgroup_arrive();
-        // (V,M) x (V,N) => (V,M,N)
-        cute::gemm(tiled_mma, tCrA_mma(_,_,k_block), tCrB(_,_,k_block,read_stage), accum);
-        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
-        warpgroup_commit_batch();
-
-        warpgroup_wait<K_BLOCK_MAX - 1>();
-        if (k_block == K_BLOCK_MAX - 1) { // release prior barrier
-          pipeline.consumer_release(smem_pipe_release);             // UNLOCK smem_pipe_release, done _computing_ on it
-          ++smem_pipe_release;
-        }
-
-        if (k_block < K_BLOCK_MAX - 2) { // prefetch next block
-          copy_A_and_extra_info(smem_tiled_copy_A, tCsA, tCrA_copy_view, 
-            partitioned_extra_info, copy_partitions_extra_info, k_block + 2, read_stage);
-        }
-        if (k_block < K_BLOCK_MAX - 1) {
-          copy_A_and_extra_info(smem_tiled_copy_A, tCsA, tCrA_copy_view, 
-            partitioned_extra_info, copy_partitions_extra_info, k_block + 1, read_stage);
-          transform_A_kblock(tCrA_load, tCrA_mma, partitioned_extra_info, k_block + 1);
-        }
-      }
-    }
-
-    warpgroup_fence_operand(accum);
-  }
-  
-  /// Perform a Consumer Epilogue to release all buffers
-  CUTLASS_DEVICE void
-  mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) {
-    // Prologue GMMAs
-    int prologue_mma_count = 1;
-    k_tile_count -= prologue_mma_count;
-
-    smem_pipe_release.advance(k_tile_count);
-    
-    // Wait on all GMMAs to complete
-    warpgroup_wait<0>();
-
-    for (int count = 0; count < prologue_mma_count; ++count) {
-      pipeline.consumer_release(smem_pipe_release);                 // UNLOCK smem_pipe_release, done _computing_ on it
-      ++smem_pipe_release;
-    }
-  }
-
-private:
-  /// Utilities for any additional inputs inside of the TMA load
-  template <class... Ts>
-  CUTLASS_DEVICE
-  auto partition_extra_tma_inputs(
-    Params const& mainloop_params,
-    cute::tuple<Ts...> const& load_inputs,
-    TensorStorage& shared_tensors,
-    uint2 const& cluster_local_block_id,
-    int const m_coord, 
-    int const l_coord) {
-
-    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
-      return cute::make_tuple();
-    } 
-    else if constexpr (ModeHasScales) {
-      Tensor sS  = make_tensor(make_smem_ptr(shared_tensors.smem_scale.begin()), SmemLayoutScale{}); // (BLK_M,BLK_K,PIPE)
-      Tensor gS_mkl = get<2>(load_inputs);
-      auto block_tma_s = mainloop_params.tma_load_scale.get_slice(cluster_local_block_id.y);
-      Tensor gS = gS_mkl(_,_,m_coord,_,l_coord);                                                  // (BLK_M,BLK_K,k)
-
-      Tensor tSgS = block_tma_s.partition_S(gS);                                              // (TMA,TMA_M,TMA_K,k)
-      Tensor tSsS = block_tma_s.partition_D(sS);                                              // (TMA,TMA_M,TMA_K,PIPE)
-      if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
-        return cute::make_tuple(tSgS, tSsS);
-      } 
-      else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
-        Tensor sZ  = make_tensor(make_smem_ptr(shared_tensors.smem_zero.begin()), SmemLayoutScale{}); // (BLK_M,BLK_K,PIPE)
-        Tensor gZ_mkl = get<3>(load_inputs);
-        auto block_tma_z = mainloop_params.tma_load_zero.get_slice(cluster_local_block_id.y);
-        Tensor gZ = gZ_mkl(_,_,m_coord,_,l_coord);                                            // (BLK_M,BLK_K,k)
-
-        Tensor tZgZ = block_tma_z.partition_S(gZ);                                            // (TMA,TMA_M,TMA_K,k)
-        Tensor tZsZ = block_tma_z.partition_D(sZ);                                            // (TMA,TMA_M,TMA_K,PIPE)
-        return cute::make_tuple(tSgS, tSsS, tZgZ, tZsZ);          
-      }
-      else {
-        static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled for input partitioning.");      
-      }
-    }
-    else {
-      static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled for input partitioning.");      
-    }
-  }
-
-  /// Utilities for partitioning extra inputs for loading from smem in the mainloop.
-  template <class ThreadMma>
-  CUTLASS_DEVICE 
-  auto partition_extra_mma_info(
-    ThreadMma const& mma_thread_slice,
-    TensorStorage& shared_tensors) {
-
-    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
-      // nothing to do
-      return cute::make_tuple();
-    }
-    else if constexpr (UseScaleLookupTable) {
-      Tensor sS = make_tensor(make_smem_ptr(shared_tensors.smem_scale.begin()), SmemLayoutScale{});// (BLK_M,BLK_SCALE_K,PIPE)
-      Tensor tCsS = mma_thread_slice.partition_A(sS);
-      Tensor tCrS_neg = make_tensor<ElementScale>(mma_thread_slice.partition_fragment_A(sS(_,_,Int<0>{})).layout()); 
-      Tensor tCrS_pos = make_tensor<ElementScale>(mma_thread_slice.partition_fragment_A(sS(_,_,Int<0>{})).layout()); 
-
-      if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
-        return cute::make_tuple(tCsS, tCrS_neg, tCrS_pos);
-      }
-    }
-    else if constexpr (ModeHasScales) {
-      Tensor sS = make_tensor(make_smem_ptr(shared_tensors.smem_scale.begin()), SmemLayoutScale{});// (BLK_M,BLK_SCALE_K,PIPE)
-      Tensor tCsS = mma_thread_slice.partition_A(sS);
-      Tensor tCrS = make_tensor<ElementScale>(mma_thread_slice.partition_fragment_A(sS(_,_,Int<0>{})).layout()); 
-
-      if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
-        return cute::make_tuple(tCsS, tCrS);
-      }
-      else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
-        Tensor sZ = make_tensor(make_smem_ptr(shared_tensors.smem_zero.begin()), SmemLayoutScale{});// (BLK_M,BLK_SCALE_K,PIPE)
-        Tensor tCsZ = mma_thread_slice.partition_A(sZ);
-        Tensor tCrZ = make_tensor<ElementZero>(mma_thread_slice.partition_fragment_A(sZ(_,_,Int<0>{})).layout()); 
-        return cute::make_tuple(tCsS, tCrS, tCsZ, tCrZ);
-      }
-      else {
-        static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in A -> RF path.");
-      }
-    } 
-    else {
-      static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in A -> RF path.");
-    }
-  }
-
-  /// Returns the tiled copy and copy views for the extra inputs.
-  template <class TiledMma, class... Ts>
-  CUTLASS_DEVICE
-  auto retile_extra_mma_info(
-    TiledMma const& tiled_mma,
-    cute::tuple<Ts...>& partitioned_extra_info,
-    int const warp_group_thread_idx) {
-
-    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
-      // nothing to do
-      return cute::make_tuple();
-    }
-    else if constexpr (ModeHasScales) {
-      auto smem_tiled_copy_S = make_tiled_copy_A(SmemCopyAtomScale{}, tiled_mma);
-      auto smem_thr_copy_S   = smem_tiled_copy_S.get_thread_slice(warp_group_thread_idx);
-      Tensor tCrS_copy_view  = smem_thr_copy_S.retile_D(cute::get<1>(partitioned_extra_info));        // (CPY,CPY_M,CPY_K)
-      
-      if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
-        return cute::make_tuple(smem_tiled_copy_S, tCrS_copy_view);
-      } 
-      else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
-        Tensor tCrZ_copy_view  = smem_thr_copy_S.retile_D(cute::get<3>(partitioned_extra_info));      // (CPY,CPY_M,CPY_K)
-        return cute::make_tuple(smem_tiled_copy_S, tCrS_copy_view, tCrZ_copy_view);
-      } 
-      else {
-        static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in A -> RF path.");
-      }
-    } 
-    else {
-      static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in A -> RF path.");
-    }
-  }
-
-  /// Utilities to copy A and extra inputs from smem to RF
-  template <class SmemTiledCopyA,
-            class TensorASmemView,
-            class TensorACopyView,
-            class... Ts,
-            class... Us
-            >
-  CUTLASS_DEVICE
-  void copy_A_and_extra_info(
-    SmemTiledCopyA const& smem_tiled_copy_A,
-    TensorASmemView const& tCsA,
-    TensorACopyView& tCrA_copy_view,
-    cute::tuple<Ts...> const& partitioned_mma_extra_info,
-    cute::tuple<Us...> const& tiled_copy_and_views,
-    int k_block,
-    int read_stage) {
-
-    copy(smem_tiled_copy_A, tCsA(_,_,k_block,read_stage), tCrA_copy_view(_,_,k_block));
-
-    if (k_block == 0) {
-      // We are starting a new k-tile so copy the scale
-      if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
-        // nothing to do
-      } 
-      else if constexpr (ModeHasScales) {
-        auto smem_tiled_copy_S = cute::get<0>(tiled_copy_and_views);
-        auto tCrS_copy_view    = cute::get<1>(tiled_copy_and_views);
-        auto tCsS              = cute::get<0>(partitioned_mma_extra_info);
-        copy(smem_tiled_copy_S, tCsS(_,_,k_block,read_stage), tCrS_copy_view(_,_,k_block));
-        if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
-          // Nothing extra to do
-        } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
-          auto tCsZ              = cute::get<2>(partitioned_mma_extra_info);
-          auto tCrZ_copy_view    = cute::get<2>(tiled_copy_and_views);
-          copy(smem_tiled_copy_S, tCsZ(_,_,k_block,read_stage), tCrZ_copy_view(_,_,k_block));
-        } else {
-          static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in A -> RF path.");         
-        }
-      } 
-      else {
-        static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in A -> RF path.");
-      }
-    }
-  }
-  
-  // Helper functions to select packing for conversion
-  template <class SrcType,
-            class DstType,
-            int Cosize>
-  struct select_packing { // Naive packing policy
-    static constexpr auto value() {
-      return Int<cute::gcd(Cosize, 32 / cute::min(sizeof_bits_v<SrcType>, sizeof_bits_v<DstType>))>{};
-    }
-  };
-
-  CUTLASS_DEVICE
-  static uint32_t to_reg(Array<cutlass::int4b_t, 4> const& source) {
-    return static_cast<uint32_t>(
-      reinterpret_cast<uint16_t const&>(source));
-  }
-  CUTLASS_DEVICE
-  static uint32_t to_reg(Array<cutlass::int4b_t, 8> const& source) {
-    return reinterpret_cast<uint32_t const&>(source);
-  }
-  // The core converter uses a lookup table to converts i4 -> 8 bit value.
-  template <class TensorPos,
-            class TensorNeg,
-            int N>
-  CUTLASS_DEVICE
-  static Array<RealInternalElementB, N> lookup_table_convert(
-    cute::Int<N> _,
-    Array<cutlass::int4b_t, N> const& source,
-    TensorPos const& scale_neg, 
-    TensorNeg const& scale_pos, 
-    int scale_idx) {
-
-    static_assert(N == 4 || N == 8);
-    uint32_t res[N / 4];
-
-    // View the input as reg
-    uint32_t reg = to_reg(source);
-
-    // Determines if to get from the signed or unsigned candidates
-    static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa;
-    uint32_t sign; // ((reg & 0x88888888) | 0x64206420) >> 1 
-    asm volatile(
-      "{\n"
-      "  lop3.b32 %0, %1, %2, %3, %4;\n" \
-      "}\n"
-      : "=r"(sign)
-      : "r"(reg), "n"(0x88888888), "n"(0x64206420), "n"(immLut)
-    );
-    sign = sign >> 1;
-
-    // Ignore sign bit when indexing into LUT
-    uint32_t lut_idx = reg & 0x77777777;
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 4; ++i, lut_idx >>=16, sign >>=16) {
-      Array<uint32_t, 2> const& _scale_neg = reinterpret_cast<Array<uint32_t, 2> const&>(scale_neg[scale_idx + i * 4]);
-      Array<uint32_t, 2> const& _scale_pos = reinterpret_cast<Array<uint32_t, 2> const&>(scale_pos[scale_idx + i * 4]);
-      asm volatile(
-        "{\n"
-        "  .reg .b32 pos, neg                    ;\n" \
-        "  prmt .b32 neg, %3, %4, %1             ;\n" \
-        "  prmt .b32 pos, %5, %6, %1             ;\n" \
-        "  prmt .b32 %0, pos, neg, %2            ;\n" \
-        "}\n"
-        : "=r"(res[i])
-        : "r"(lut_idx), "r"(sign), "r"(_scale_neg[0]), "r"(_scale_neg[1]), "r"(_scale_pos[0]), "r"(_scale_pos[1])
-      );
-    }
-    return reinterpret_cast<Array<RealInternalElementB, N>&>(res);
-  }
-
-  template <class Layout>
-  CUTLASS_DEVICE
-  static void static_check_scale(Layout const& tensor) {
-    static_assert(shape<0>(Layout{}) >= 4 && stride<0>(Layout{}) == 0, "At least 4 adjacent weights in a thread must share the same scale.");
-  }
-  template <class Engine,
-            class Layout>
-  CUTLASS_DEVICE
-  static void static_check_scale(Tensor<Engine, Layout> const& tensor) {
-    static_check_scale(flatten(Layout{}));
-  }
-
-  /// Utilities to transform A.
-  template <class EngineIn,
-            class EngineOut, 
-            class LayoutIn,
-            class LayoutOut,
-            class... Ts>
-  CUTLASS_DEVICE
-  void transform_A_kblock(
-    Tensor<EngineIn, LayoutIn> const& tCrA_load, 
-    Tensor<EngineOut, LayoutOut>& tCrA_mma,
-    cute::tuple<Ts...> const& partitioned_extra_info,
-    int const k_block) {
-
-    static_assert(is_rmem<EngineIn>::value, "Input tensor for A conversion must come from registers");
-    static_assert(is_rmem<EngineOut>::value, "Output tensor for A conversion must come from registers");
-    static_assert(cosize_v<LayoutIn> == cosize_v<LayoutOut>);
-    static_assert(size_v<LayoutIn> == cosize_v<LayoutIn>);
-    static_assert(size_v<LayoutOut> == cosize_v<LayoutOut>);
-    using SrcType = typename EngineIn::value_type;
-    using DstType = typename EngineOut::value_type;
-
-    auto const& src = tCrA_load(_, _, k_block);
-    auto const& dst = tCrA_mma(_, _, k_block);
-    auto pSrc = raw_pointer_cast(src.data());
-    auto pDst = const_cast<DstType*>(raw_pointer_cast(dst.data()));
-    constexpr int num_elements = decltype(size(src))::value;
-
-    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
-      constexpr int pack = decltype(select_packing<SrcType, DstType, num_elements>::value())::value;
-      using Converter = cutlass::NumericArrayConverter<DstType, SrcType, pack, cutlass::FloatRoundStyle::round_to_nearest>;
-      using SrcArray = cutlass::Array<SrcType, pack>;
-      using DstArray = cutlass::Array<DstType, pack>;
-      constexpr int iters = num_elements / pack;
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < iters; ++i) {
-        SrcArray const* pSrcArr = reinterpret_cast<SrcArray const*>(pSrc) + i;
-        DstArray* pDstArr = reinterpret_cast<DstArray*>(pDst) + i;
-        *pDstArr = Converter::convert(*pSrcArr);
-      }
-    } 
-    else if constexpr (UseScaleLookupTable) {
-      static_assert(is_same_v<RealInternalElementA, cutlass::int4b_t>, "Lookup table only supports int4 being the quant type now.");
-      static_assert(sizeof_bits_v<ElementScale> == 64, "Lookup table only supports 8 8bit scale values now.");
-      static_assert(num_elements % 4 == 0 && num_elements >= 4, "Lookup table requires a vector size of 4x when converting.");
-      constexpr int pack = num_elements % 8 == 0? 8 : 4;
-      constexpr int iters = num_elements / pack;
-      using SrcArray = cutlass::Array<SrcType, pack>;
-      using DstArray = cutlass::Array<DstType, pack>;
-
-      auto const& tCrS_neg = cute::get<1>(partitioned_extra_info);
-      auto const& tCrS_pos = cute::get<2>(partitioned_extra_info);
-      auto const& scale_neg = tCrS_neg(_, _, k_block);
-      auto const& scale_pos = tCrS_pos(_, _, k_block);
-      CUTE_STATIC_ASSERT_V(size(src) == size(scale_neg));
-
-      static_check_scale(scale_neg);
-      static_check_scale(scale_pos);
-      if (k_block == 0) {
-        auto pNeg = raw_pointer_cast(tCrS_neg.data());
-        auto pPos = const_cast<ElementScale*>(raw_pointer_cast(tCrS_pos.data()));
-        CUTLASS_PRAGMA_UNROLL
-        for (int i = 0; i < cosize(tCrS_neg.layout()); ++i)
-        {
-          // pPos[i] = pNeg[i] & 0x7F7F7F7F7F7F7F00;
-          cutlass::Array<uint32_t, 2> const& _scale_neg = reinterpret_cast<cutlass::Array<uint32_t, 2> const&>(pNeg[i]);
-          cutlass::Array<uint32_t, 2> & _scale_pos = reinterpret_cast<cutlass::Array<uint32_t, 2> &>(pPos[i]);
-          asm volatile(
-              "{\n"
-              "  and  .b32 %0, %2, %4             ;\n" \
-              "  and  .b32 %1, %3, %5             ;\n" \
-              "}\n"
-              : "=r"(_scale_pos[0]), "=r"(_scale_pos[1])
-              : "r"(_scale_neg[0]), "r"(_scale_neg[1]), "n"(0x7F7F7F00), "n"(0x7F7F7F7F)
-              );
-        }
-      }
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < iters; i ++) {
-        SrcArray const* pSrcArr = reinterpret_cast<SrcArray const*>(raw_pointer_cast(src.data())) + i;
-        DstArray* pDstArr = reinterpret_cast<DstArray*>(raw_pointer_cast(dst.data())) + i;
-        
-        *pDstArr = lookup_table_convert(Int<pack>{}, *pSrcArr, scale_neg, scale_pos, i * pack);
-      }
-    }
-    else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
-      auto const& scales = cute::get<1>(partitioned_extra_info)(_, _, k_block);
-      CUTE_STATIC_ASSERT_V(size(src) == size(scales));
-
-      if constexpr (is_same_v<DstType, ElementScale>) {
-        constexpr int pack = decltype(select_packing<SrcType, DstType, num_elements>::value())::value;
-        using Converter = cutlass::NumericArrayConverter<DstType, SrcType, pack, cutlass::FloatRoundStyle::round_to_nearest>;
-        using SrcArray = cutlass::Array<SrcType, pack>;
-        using DstArray = cutlass::Array<DstType, pack>;
-        constexpr int iters = num_elements / pack;
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int i = 0; i < iters; ++i) {
-          SrcArray const* pSrcArr = reinterpret_cast<SrcArray const*>(pSrc) + i;
-          DstArray* pDstArr = reinterpret_cast<DstArray*>(pDst) + i;
-          *pDstArr = Converter::convert(*pSrcArr);
-          CUTLASS_PRAGMA_UNROLL
-          for (int j = 0; j < pack; ++j) {
-            (*pDstArr)[j] = (*pDstArr)[j] * scales[i*pack + j];
-          }
-        }
-      }
-      else {
-        constexpr int pack1 = decltype(select_packing<SrcType, ElementScale, num_elements>::value())::value;
-        constexpr int pack2 = decltype(select_packing<ElementScale, DstType, num_elements>::value())::value;
-        constexpr int pack = cute::gcd(pack1, pack2);
-        using Converter1 = cutlass::NumericArrayConverter<ElementScale, SrcType, pack, cutlass::FloatRoundStyle::round_to_nearest>;
-        using Converter2 = cutlass::NumericArrayConverter<DstType, ElementScale, pack, cutlass::FloatRoundStyle::round_to_nearest>;
-        using SrcArray = cutlass::Array<SrcType, pack>;
-        using DstArray = cutlass::Array<DstType, pack>;
-        using StageArray = cutlass::Array<ElementScale, pack>;
-        constexpr int iters = num_elements / pack;
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int i = 0; i < iters; ++i) {
-          SrcArray const* pSrcArr = reinterpret_cast<SrcArray const*>(pSrc) + i;
-          DstArray* pDstArr = reinterpret_cast<DstArray*>(pDst) + i;
-          StageArray stageArr;
-          stageArr = Converter1::convert(*pSrcArr);
-          CUTLASS_PRAGMA_UNROLL
-          for (int j = 0; j < pack; ++j) {
-            stageArr[j] = stageArr[j] *  scales[i*pack + j];
-          }
-          *pDstArr = Converter2::convert(stageArr);
-        }
-      }
-    } 
-    else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
-      static_assert(is_same_v<ElementScale, ElementZero>, "ElementScale and ElementZero must be the same.");
-      auto const& scales = cute::get<1>(partitioned_extra_info)(_, _, k_block);
-      auto const& zeros = cute::get<3>(partitioned_extra_info)(_, _, k_block);
-      CUTE_STATIC_ASSERT_V(size(src) == size(scales));
-      CUTE_STATIC_ASSERT_V(size(src) == size(zeros));
-      
-      if constexpr (is_same_v<DstType, ElementScale>) {
-        constexpr int pack = decltype(select_packing<SrcType, DstType, num_elements>::value())::value;
-        using Converter = cutlass::NumericArrayConverter<DstType, SrcType, pack, cutlass::FloatRoundStyle::round_to_nearest>;
-        using SrcArray = cutlass::Array<SrcType, pack>;
-        using DstArray = cutlass::Array<DstType, pack>;
-        constexpr int iters = num_elements / pack;
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int i = 0; i < iters; ++i) {
-          SrcArray const* pSrcArr = reinterpret_cast<SrcArray const*>(pSrc) + i;
-          DstArray* pDstArr = reinterpret_cast<DstArray*>(pDst) + i;
-          *pDstArr = Converter::convert(*pSrcArr);
-          CUTLASS_PRAGMA_UNROLL
-          for (int j = 0; j < pack; ++j) {
-            (*pDstArr)[j] = (*pDstArr)[j] * scales[i*pack + j] + zeros[i*pack + j];
-          }
-        }
-      }
-      else {
-        constexpr int pack1 = decltype(select_packing<SrcType, ElementScale, num_elements>::value())::value;
-        constexpr int pack2 = decltype(select_packing<ElementScale, DstType, num_elements>::value())::value;
-        constexpr int pack = cute::gcd(pack1, pack2);
-        using Converter1 = cutlass::NumericArrayConverter<ElementScale, SrcType, pack, cutlass::FloatRoundStyle::round_to_nearest>;
-        using Converter2 = cutlass::NumericArrayConverter<DstType, ElementScale, pack, cutlass::FloatRoundStyle::round_to_nearest>;
-        using SrcArray = cutlass::Array<SrcType, pack>;
-        using DstArray = cutlass::Array<DstType, pack>;
-        using StageArray = cutlass::Array<ElementScale, pack>;
-        constexpr int iters = num_elements / pack;
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int i = 0; i < iters; ++i) {
-          SrcArray const* pSrcArr = reinterpret_cast<SrcArray const*>(pSrc) + i;
-          DstArray* pDstArr = reinterpret_cast<DstArray*>(pDst) + i;
-          StageArray stageArr;
-          stageArr = Converter1::convert(*pSrcArr);
-          CUTLASS_PRAGMA_UNROLL
-          for (int j = 0; j < pack; ++j) {
-            stageArr[j] = stageArr[j] *  scales[i*pack + j] + zeros[i*pack + j];
-          }
-          *pDstArr = Converter2::convert(stageArr);
-        }
-      }
-      return;
-    }
-    else {
-      static_assert(cutlass::detail::dependent_false<KernelSchedule>, "No A data is loaded.");
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::gemm::collective
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss.hpp
deleted file mode 100755
index daaed6210..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss.hpp
+++ /dev/null
@@ -1,539 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/dispatch_policy.hpp"
-#include "cutlass/numeric_types.h"
-#include "cutlass/pipeline/pipeline.hpp"
-#include "cutlass/trace.h"
-
-#include "cute/arch/cluster_sm90.hpp"
-#include "cute/arch/copy_sm90.hpp"
-#include "cute/algorithm/functional.hpp"
-#include "cute/atom/mma_atom.hpp"
-#include "cute/algorithm/gemm.hpp"
-#include "cute/tensor_predicate.hpp"
-#include "cute/numeric/arithmetic_tuple.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm::collective {
-using namespace cute;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  int Stages,
-  class ClusterShape,
-  int PipelineAsyncMmaStages,
-  class TileShape_,
-  class ElementA_,
-  class StrideA_,
-  class ElementB_,
-  class StrideB_,
-  class TiledMma_,
-  class GmemTiledCopyA_,
-  class SmemLayoutAtomA_,
-  class SmemCopyAtomA_,
-  class TransformA_,
-  class GmemTiledCopyB_,
-  class SmemLayoutAtomB_,
-  class SmemCopyAtomB_,
-  class TransformB_>
-struct CollectiveMma<
-    MainloopSm90TmaGmma<Stages, ClusterShape, PipelineAsyncMmaStages>,
-    TileShape_,
-    ElementA_,
-    StrideA_,
-    ElementB_,
-    StrideB_,
-    TiledMma_,
-    GmemTiledCopyA_,
-    SmemLayoutAtomA_,
-    SmemCopyAtomA_,
-    TransformA_,
-    GmemTiledCopyB_,
-    SmemLayoutAtomB_,
-    SmemCopyAtomB_,
-    TransformB_>
-{
-  //
-  // Type Aliases
-  //
-  using DispatchPolicy = MainloopSm90TmaGmma<Stages, ClusterShape, PipelineAsyncMmaStages>;
-  using TileShape = TileShape_;
-  using ElementA = ElementA_;
-  using StrideA = StrideA_;
-  using ElementB = ElementB_;
-  using StrideB = StrideB_;
-  using TiledMma = TiledMma_;
-  using ElementAccumulator = typename TiledMma::ValTypeC;
-  using GmemTiledCopyA = GmemTiledCopyA_;
-  using GmemTiledCopyB = GmemTiledCopyB_;
-  using SmemLayoutAtomA = SmemLayoutAtomA_;
-  using SmemLayoutAtomB = SmemLayoutAtomB_;
-  using SmemCopyAtomA = SmemCopyAtomA_;
-  using SmemCopyAtomB = SmemCopyAtomB_;
-  using TransformA = TransformA_;
-  using TransformB = TransformB_;
-  using ArchTag = typename DispatchPolicy::ArchTag;
-
-  using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{}));
-  using MainloopPipeline = cutlass::PipelineTmaAsync<DispatchPolicy::Stages>;
-
-  using PipelineParams = typename MainloopPipeline::Params;
-  using PipelineState  = typename cutlass::PipelineState<DispatchPolicy::Stages>;
-
-  static constexpr int ThreadCount = CUTE_STATIC_V(size(TiledMma{}));
-
-  static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
-  static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-
-  static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
-  static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-
-  // Tile along modes in a way that maximizes the TMA box size.
-  using SmemLayoutA = decltype(tile_to_shape(
-      SmemLayoutAtomA{},
-      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
-      cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideA>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
-  using SmemLayoutB = decltype(tile_to_shape(
-      SmemLayoutAtomB{},
-      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
-      cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideB>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
-
-  static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 1 or more.");
-  static_assert(cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeA>::value &&
-                cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeB>::value,
-                "MMA atom must source both A and B operand from smem_desc for this mainloop.");
-  static_assert(cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>,
-      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
-  static_assert(cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>,
-      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
-
-  // TMA converts f32 input to tf32 when copying from GMEM to SMEM
-  // For all other types, cast to size equivalent uint type to avoid any rounding by TMA.
-  static constexpr bool ConvertF32toTF32A = cute::is_same_v<float, ElementA>;
-  static constexpr bool ConvertF32toTF32B = cute::is_same_v<float, ElementB>;
-  using InternalElementA = cute::conditional_t<ConvertF32toTF32A, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementA>>>;
-  using InternalElementB = cute::conditional_t<ConvertF32toTF32B, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementB>>>;
-
-  struct SharedStorage {
-    cute::array_aligned<typename TiledMma::ValTypeA, cute::cosize_v<SmemLayoutA>> smem_A;
-    cute::array_aligned<typename TiledMma::ValTypeB, cute::cosize_v<SmemLayoutB>> smem_B;
-
-    using PipelineStorage = typename MainloopPipeline::SharedStorage;
-    alignas(16) PipelineStorage pipeline_storage;
-  };
-
-  // Host side kernel arguments
-  struct Arguments {
-    ElementA const* ptr_A;
-    StrideA dA;
-    ElementB const* ptr_B;
-    StrideB dB;
-    uint32_t mma_promotion_interval = 4;
-  };
-
-  // Device side kernel params
-  struct Params {
-    // Assumption: StrideA is congruent with Problem_MK
-    using TMA_A = decltype(make_tma_copy(
-        GmemTiledCopyA{},
-        make_tensor(static_cast<InternalElementA const*>(nullptr), repeat_like(StrideA{}, int32_t(0)), StrideA{}),
-        SmemLayoutA{}(_,_,0),
-        make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})),
-        size<1>(ClusterShape{})));  // mcast along N mode for this M load, if any
-    // Assumption: StrideB is congruent with Problem_NK
-    using TMA_B = decltype(make_tma_copy(
-        GmemTiledCopyB{},
-        make_tensor(static_cast<InternalElementB const*>(nullptr), repeat_like(StrideB{}, int32_t(0)), StrideB{}),
-        SmemLayoutB{}(_,_,0),
-        make_shape(shape<1>(TileShape{}), shape<2>(TileShape{})),
-        size<0>(ClusterShape{}))); // mcast along M mode for this N load, if any
-    TMA_A tma_load_a;
-    TMA_B tma_load_b;
-  };
-
-  //
-  // Methods
-  //
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    (void) workspace;
-
-    // Optionally append 1s until problem shape is rank-4 (MNKL), in case it is only rank-3 (MNK)
-    auto problem_shape_MNKL = append<4>(problem_shape, 1);
-    auto [M,N,K,L] = problem_shape_MNKL;
-
-    auto ptr_A = reinterpret_cast<InternalElementA const*>(args.ptr_A);
-    auto ptr_B = reinterpret_cast<InternalElementB const*>(args.ptr_B);
-
-    Tensor tensor_a = make_tensor(ptr_A, make_layout(make_shape(M,K,L), args.dA));
-    Tensor tensor_b = make_tensor(ptr_B, make_layout(make_shape(N,K,L), args.dB));
-    typename Params::TMA_A tma_load_a = make_tma_copy(
-        GmemTiledCopyA{},
-        tensor_a,
-        SmemLayoutA{}(_,_,cute::Int<0>{}),
-        make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})),
-        size<1>(ClusterShape{})); // mcast along N mode for this M load, if any
-    typename Params::TMA_B tma_load_b = make_tma_copy(
-        GmemTiledCopyB{},
-        tensor_b,
-        SmemLayoutB{}(_,_,cute::Int<0>{}),
-        make_shape(shape<1>(TileShape{}), shape<2>(TileShape{})),
-        size<0>(ClusterShape{})); // mcast along M mode for this N load, if any
-    return {
-      tma_load_a,
-      tma_load_b
-    };
-  }
-
-  template<class ProblemShape>
-  static bool
-  can_implement(
-      ProblemShape const& problem_shape,
-      [[maybe_unused]] Arguments const& args) {
-    constexpr int tma_alignment_bits = 128;
-    auto problem_shape_MNKL = append<4>(problem_shape, 1);
-    auto [M,N,K,L] = problem_shape_MNKL;
-    
-    bool implementable = true;
-    constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
-    implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_A>(cute::make_shape(M,K,L), StrideA{});
-    constexpr int min_tma_aligned_elements_B = tma_alignment_bits / cutlass::sizeof_bits<ElementB>::value;
-    implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_B>(cute::make_shape(N,K,L), StrideB{});
-
-    if (!implementable) {
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
-    }
-    return implementable;
-  }
-
-  /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance
-  CUTLASS_DEVICE
-  static void prefetch_tma_descriptors(Params const& mainloop_params) {
-    cute::prefetch_tma_descriptor(mainloop_params.tma_load_a.get_tma_descriptor());
-    cute::prefetch_tma_descriptor(mainloop_params.tma_load_b.get_tma_descriptor());
-  }
-
-  /// Perform a collective-scoped matrix multiply-accumulate
-  /// Producer Perspective
-  template <
-    class TensorA, class TMA_LOAD_A,
-    class TensorB, class TMA_LOAD_B,
-    class FrgTensorC,
-    class KTileIterator
-  >
-  CUTLASS_DEVICE void
-  operator() (
-      TensorA const& gA, TMA_LOAD_A& tma_load_a,
-      TensorB const& gB, TMA_LOAD_B& tma_load_b,
-      FrgTensorC& accum,
-      KTileIterator k_tile_iter, int k_tile_count,
-      int thread_idx,
-      uint32_t block_rank_in_cluster,
-      char* shared_memory,
-      Params const& mainloop_params)
-  {
-    using namespace cute;
-
-    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
-    static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2.");
-    static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2.");
-    static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3.");
-    static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3.");
-    static_assert(cute::is_void_v<SmemCopyAtomA>,
-      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
-    static_assert(cute::is_void_v<SmemCopyAtomB>,
-      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
-
-    SharedStorage& storage = *reinterpret_cast<SharedStorage*>(shared_memory);
-    Tensor sA = make_tensor(make_smem_ptr(storage.smem_A.data()), SmemLayoutA{});                 // (BLK_M,BLK_K,PIPE)
-    Tensor sB = make_tensor(make_smem_ptr(storage.smem_B.data()), SmemLayoutB{});                 // (BLK_N,BLK_K,PIPE)
-
-    //
-    // Prepare the TMA loads for A and B
-    //
-
-    constexpr uint32_t cluster_shape_x = get<0>(ClusterShape());
-    uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x};
-
-    auto block_tma_a = tma_load_a.get_slice(cluster_local_block_id.y);
-    auto block_tma_b = tma_load_b.get_slice(cluster_local_block_id.x);
-
-    // Applies the mapping from block_tma_a
-    Tensor tAgA = block_tma_a.partition_S(gA);                                                // (TMA,TMA_M,TMA_K,k)
-    Tensor tAsA = block_tma_a.partition_D(sA);                                                // (TMA,TMA_M,TMA_K,PIPE)
-
-    Tensor tBgB = block_tma_b.partition_S(gB);                                                // (TMA,TMA_N,TMA_K,k)
-    Tensor tBsB = block_tma_b.partition_D(sB);                                                // (TMA,TMA_N,TMA_K,PIPE)
-
-    //
-    // Prepare TMA membars and PREFETCH
-    //
-
-    // Number of pipelined k-tiles in smem
-    constexpr int K_PIPE_MAX = DispatchPolicy::Stages;
-
-    // NOTE: Another parameter: Partition the pipeline between active MMAs and active TMAs
-    // Tunable via the dispatch policy to tollerate latencies evenly across the math and compute stages
-    // K_PIPE_MMAS: The max number of active MMA pipes at beginning of every loop
-    // K_PIPE_TMAS: The max number of active TMA pipes at beginning of every loop (geq 1)
-    constexpr int K_PIPE_MMAS = DispatchPolicy::PipelineAsyncMmaStages;
-    constexpr int K_PIPE_TMAS = K_PIPE_MAX - K_PIPE_MMAS;
-    static_assert(0 <= K_PIPE_MMAS && K_PIPE_MMAS <  K_PIPE_MAX);
-    static_assert(0 <  K_PIPE_TMAS && K_PIPE_TMAS <= K_PIPE_MAX);
-
-    static_assert(K_PIPE_MMAS < K_PIPE_MAX - 1);
-
-    // Set the bytes transferred in this TMA transaction (may involve multiple issues)
-    constexpr uint32_t TmaTransactionBytes = static_cast<uint32_t>(
-        cutlass::bits_to_bytes(size<0>(sA) * size<1>(sA) * sizeof_bits<InternalElementA>::value) +
-        cutlass::bits_to_bytes(size<0>(sB) * size<1>(sB) * sizeof_bits<InternalElementB>::value));
-
-    // Obtain warp index
-    int warp_idx = canonical_warp_idx_sync();
-    int warp_group_thread_idx = thread_idx % NumThreadsPerWarpGroup;
-
-    PipelineParams params;
-    params.transaction_bytes = TmaTransactionBytes;
-    params.role = MainloopPipeline::ThreadCategory::ProducerConsumer;
-    params.is_leader = warp_group_thread_idx == 0;
-    params.num_consumers = NumThreadsPerWarpGroup;
-
-    MainloopPipeline pipeline(storage.pipeline_storage, params, ClusterShape{});
-
-    // State variables used for iterating the circular buffer
-    // smem_pipe_read / release is used by the consumer of SMEM data - i.e MMA
-    // smem_pipe_write is used by the producer of SMEM data - i.e TMA
-    PipelineState smem_pipe_read;
-    PipelineState smem_pipe_release;
-    PipelineState smem_pipe_write = cutlass::make_producer_start_state<MainloopPipeline>();
-
-    // We need this to guarantee that the Pipeline init is visible
-    // To all producers and consumer blocks in the Cluster
-    if constexpr (size(ClusterShape{}) > 1) {
-      cute::cluster_arrive_relaxed();
-      cute::cluster_wait();
-    }
-    else {
-      __syncthreads();
-    }
-
-    // Set predicate for the lowest lane_id in the warp
-    int lane_predicate = cute::elect_one_sync();
-
-    uint16_t mcast_mask_a = 0;
-    uint16_t mcast_mask_b = 0;
-    // Keep a copy to know when to stop issuing loads
-    int k_tile_count_tma = k_tile_count;
-
-    // Issue TmaLoads (Prologue fetches)
-    if (warp_idx == 0 && lane_predicate == 1) {
-      // Maps the tile -> block, value
-      if constexpr (cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>) {
-        auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{}; // (m,n) -> block_id
-        for (int n = 0; n < size<1>(block_layout); ++n) {
-          mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{}));
-        }
-      }
-
-      if constexpr (cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>) {
-        auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{}; // (m,n) -> block_id
-        for (int m = 0; m < size<0>(block_layout); ++m) {
-          mcast_mask_b |= (uint16_t(1) << block_layout(m,cluster_local_block_id.y,Int<0>{}));
-        }
-      }
-
-      // Issue the prologue loads
-      int prologue_tma_count = min(K_PIPE_MAX, k_tile_count);
-      CUTLASS_PRAGMA_UNROLL
-      for (int stage = 0; stage < prologue_tma_count; ++stage) {
-        pipeline.producer_acquire(smem_pipe_write);
-        using BarrierType = typename MainloopPipeline::ProducerBarrierType;
-        BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write);
-
-        copy(tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,stage));
-        copy(tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,stage));
-        ++k_tile_iter;
-        ++smem_pipe_write;
-      }
-      k_tile_count_tma -= prologue_tma_count;
-    }
-
-    //
-    // Define C accumulators and A/B partitioning
-    //
-
-    // Layout of warp group to thread mapping
-
-    static_assert(stride<0>(typename TiledMma::ALayout{}) == 0 and 
-                  stride<0>(typename TiledMma::BLayout{}) == 0 and
-                  size<0>(typename TiledMma::ALayout{}) == NumThreadsPerWarpGroup and
-                  size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup, 
-                  "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
-
-    constexpr int MmaWarpGroups = size(TiledMma{}) / NumThreadsPerWarpGroup;
-    Layout warp_group_thread_layout = make_layout(Int<MmaWarpGroups>{}, 
-                                                  Int<NumThreadsPerWarpGroup>{});
-
-    int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / NumThreadsPerWarpGroup, 0);
-
-    TiledMma tiled_mma;
-    auto thread_mma = tiled_mma.get_slice(warp_group_thread_layout(warp_group_idx));
-
-    Tensor tCsA = thread_mma.partition_A(sA);                                  // (MMA,MMA_M,MMA_K,PIPE)
-    Tensor tCsB = thread_mma.partition_B(sB);                                  // (MMA,MMA_N,MMA_K,PIPE)
-
-    // Allocate "fragments/descriptors"
-    Tensor tCrA = thread_mma.make_fragment_A(tCsA);                            // (MMA,MMA_M,MMA_K,PIPE)
-    Tensor tCrB = thread_mma.make_fragment_B(tCsB);                            // (MMA,MMA_N,MMA_K,PIPE)
-
-    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum));                     // M
-    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum));                     // N
-    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB));                      // K
-    CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB));                      // PIPE
-    CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tAsA));                      // PIPE
-    CUTE_STATIC_ASSERT_V(size<3>(tCsB) == size<3>(tBsB));                      // PIPE
-    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));        // PIPE
-    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));        // PIPE
-
-    __syncthreads();
-
-    tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
-
-    warpgroup_fence_operand(accum);
-    // Prologue MMAs
-    assert(k_tile_count >= 1);
-    {
-      // WAIT on smem_pipe_read until it's data is available
-      pipeline.consumer_wait(smem_pipe_read);
-      warpgroup_arrive();
-      // Unroll the K mode manually to set scale D to 1
-      CUTLASS_PRAGMA_UNROLL
-      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
-        // (V,M,K) x (V,N,K) => (V,M,N)
-        cute::gemm(tiled_mma, tCrA(_,_,k_block,smem_pipe_read.index()), tCrB(_,_,k_block,smem_pipe_read.index()), accum);
-        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
-      }
-
-      warpgroup_commit_batch();
-      ++smem_pipe_read;
-      --k_tile_count;
-    }
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count) - 1; 
-        prologue_mma_count > 0; --prologue_mma_count)
-    {
-      // WAIT on smem_pipe_read until it's data is available
-      pipeline.consumer_wait(smem_pipe_read);
-      warpgroup_arrive();
-      // (V,M,K) x (V,N,K) => (V,M,N)
-      cute::gemm(tiled_mma, tCrA(_,_,_,smem_pipe_read.index()), tCrB(_,_,_,smem_pipe_read.index()), accum);
-      warpgroup_commit_batch();
-      ++smem_pipe_read;
-      --k_tile_count;
-    }
-    warpgroup_fence_operand(accum);
-
-    //
-    // PIPELINED MAIN LOOP
-    //
-
-    CUTLASS_PRAGMA_NO_UNROLL
-    for ( ; k_tile_count > 0; --k_tile_count)
-    {
-      // WAIT on smem_pipe_read until data is available
-      pipeline.consumer_wait(smem_pipe_read);
-
-      //
-      // Compute on k_tile
-      //
-
-      warpgroup_fence_operand(accum);
-      warpgroup_arrive();
-      // (V,M,K) x (V,N,K) => (V,M,N)
-      cute::gemm(tiled_mma, tCrA(_,_,_,smem_pipe_read.index()), tCrB(_,_,_,smem_pipe_read.index()), accum);
-      warpgroup_commit_batch();
-
-      /// Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure smem_pipe_write is consumed
-      warpgroup_wait<K_PIPE_MMAS>();
-      warpgroup_fence_operand(accum);
-
-      pipeline.consumer_release(smem_pipe_release);  // UNLOCK wr stage, done _computing_ on it
-
-      //
-      // Copy gmem to smem for *k_tile_iter
-      //
-
-      // Do Acquire & Load only if needed - helps with both performance and also corner case illegal barrier-ops
-      if (warp_idx == 0 && lane_predicate == 1 && (k_tile_count_tma > 0) ) {
-        pipeline.producer_acquire(smem_pipe_write);  // LOCK wr stage, for _writing_
-
-        using BarrierType = typename MainloopPipeline::ProducerBarrierType;
-        BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write);
-
-        copy(tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,smem_pipe_write.index()));
-        copy(tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,smem_pipe_write.index()));
-        ++smem_pipe_write;
-        ++k_tile_iter;
-        --k_tile_count_tma;
-      }
-
-      // Advance consumer pipeline
-      ++smem_pipe_read;
-      ++smem_pipe_release;
-    }
-
-    // Wait on all GMMAs
-    warpgroup_wait<0>();
-    warpgroup_fence_operand(accum);
-
-    // Workaround for ensuring Smem destruction doesn't happen accidentally
-    if constexpr (size(typename DispatchPolicy::ClusterShape{}) > 1) {
-      cute::cluster_arrive();
-      cute::cluster_wait();
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::gemm::collective
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp
deleted file mode 100755
index b370dc70b..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp
+++ /dev/null
@@ -1,582 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/dispatch_policy.hpp"
-#include "cutlass/numeric_types.h"
-#include "cutlass/pipeline/pipeline.hpp"
-#include "cutlass/trace.h"
-
-#include "cute/arch/cluster_sm90.hpp"
-#include "cute/arch/copy_sm90.hpp"
-#include "cute/algorithm/functional.hpp"
-#include "cute/atom/mma_atom.hpp"
-#include "cute/algorithm/gemm.hpp"
-#include "cute/tensor_predicate.hpp"
-#include "cute/numeric/arithmetic_tuple.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm::collective {
-using namespace cute;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// WarpSpecialized Mainloop
-template <
-  int Stages,
-  class ClusterShape,
-  class KernelSchedule,
-  class TileShape_,
-  class ElementA_,
-  class StrideA_,
-  class ElementB_,
-  class StrideB_,
-  class TiledMma_,
-  class GmemTiledCopyA_,
-  class SmemLayoutAtomA_,
-  class SmemCopyAtomA_,
-  class TransformA_,
-  class GmemTiledCopyB_,
-  class SmemLayoutAtomB_,
-  class SmemCopyAtomB_,
-  class TransformB_>
-struct CollectiveMma<
-    MainloopSm90TmaGmmaWarpSpecialized<Stages, ClusterShape, KernelSchedule>,
-    TileShape_,
-    ElementA_,
-    StrideA_,
-    ElementB_,
-    StrideB_,
-    TiledMma_,
-    GmemTiledCopyA_,
-    SmemLayoutAtomA_,
-    SmemCopyAtomA_,
-    TransformA_,
-    GmemTiledCopyB_,
-    SmemLayoutAtomB_,
-    SmemCopyAtomB_,
-    TransformB_>
-{
-  //
-  // Type Aliases
-  //
-  using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecialized<Stages, ClusterShape, KernelSchedule>;
-  using TileShape = TileShape_;
-  using ElementA = ElementA_;
-  using StrideA = StrideA_;
-  using ElementB = ElementB_;
-  using StrideB = StrideB_;
-  using TiledMma = TiledMma_;
-  using ElementAccumulator = typename TiledMma::ValTypeC;
-  using GmemTiledCopyA = GmemTiledCopyA_;
-  using GmemTiledCopyB = GmemTiledCopyB_;
-  using SmemLayoutAtomA = SmemLayoutAtomA_;
-  using SmemLayoutAtomB = SmemLayoutAtomB_;
-  using SmemCopyAtomA = SmemCopyAtomA_;
-  using SmemCopyAtomB = SmemCopyAtomB_;
-  using TransformA = TransformA_;
-  using TransformB = TransformB_;
-  using ArchTag = typename DispatchPolicy::ArchTag;
-
-  using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{}));
-  using MainloopPipeline = cutlass::PipelineTmaAsync<DispatchPolicy::Stages>;
-  using PipelineState = cutlass::PipelineState<DispatchPolicy::Stages>;
-
-  using PipelineParams = typename MainloopPipeline::Params;
-
-  static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
-  static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-
-  static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
-  static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-
-  // Tile along modes in a way that maximizes the TMA box size.
-  using SmemLayoutA = decltype(tile_to_shape(
-      SmemLayoutAtomA{},
-      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
-      cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideA>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
-  using SmemLayoutB = decltype(tile_to_shape(
-      SmemLayoutAtomB{},
-      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
-      cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideB>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
-
-  static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 2 or more.");
-  static_assert(cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeA>::value &&
-                cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeB>::value,
-                "MMA atom must source both A and B operand from smem_desc for this mainloop.");
-  static_assert(cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>,
-      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
-  static_assert(cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>,
-      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
-
-  // TMA converts f32 input to tf32 when copying from GMEM to SMEM
-  // For all other types, cast to size equivalent uint type to avoid any rounding by TMA.
-  static constexpr bool ConvertF32toTF32A = cute::is_same_v<float, ElementA>;
-  static constexpr bool ConvertF32toTF32B = cute::is_same_v<float, ElementB>;
-  using InternalElementA = cute::conditional_t<ConvertF32toTF32A, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementA>>>;
-  using InternalElementB = cute::conditional_t<ConvertF32toTF32B, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementB>>>;
-
-  struct SharedStorage
-  {
-    struct TensorStorage : cute::aligned_struct<128, _0> {
-      cute::array_aligned<typename TiledMma::ValTypeA, cute::cosize_v<SmemLayoutA>> smem_A;
-      cute::array_aligned<typename TiledMma::ValTypeB, cute::cosize_v<SmemLayoutB>> smem_B;
-    } tensors;
-
-    using PipelineStorage = typename MainloopPipeline::SharedStorage;
-    PipelineStorage pipeline;
-  };
-  using TensorStorage = typename SharedStorage::TensorStorage;
-  using PipelineStorage = typename SharedStorage::PipelineStorage;
-
-  // Host side kernel arguments
-  struct Arguments {
-    ElementA const* ptr_A;
-    StrideA dA;
-    ElementB const* ptr_B;
-    StrideB dB;
-    uint32_t mma_promotion_interval = 4;
-  };
-
-  // Device side kernel params
-  struct Params {
-    // Assumption: StrideA is congruent with Problem_MK
-    using TMA_A = decltype(make_tma_copy_A_sm90(
-        GmemTiledCopyA{},
-        make_tensor(static_cast<InternalElementA const*>(nullptr), repeat_like(StrideA{}, int32_t(0)), StrideA{}),
-        SmemLayoutA{}(_,_,cute::Int<0>{}),
-        TileShape{},
-        ClusterShape{}));
-    // Assumption: StrideB is congruent with Problem_NK
-    using TMA_B = decltype(make_tma_copy_B_sm90(
-        GmemTiledCopyB{},
-        make_tensor(static_cast<InternalElementB const*>(nullptr), repeat_like(StrideB{}, int32_t(0)), StrideB{}),
-        SmemLayoutB{}(_,_,cute::Int<0>{}),
-        TileShape{},
-        ClusterShape{}));
-    TMA_A tma_load_a;
-    TMA_B tma_load_b;
-    uint32_t tma_transaction_bytes = TmaTransactionBytes;
-    uint32_t tma_transaction_bytes_mk = TmaTransactionBytesMK;
-    uint32_t tma_transaction_bytes_nk = TmaTransactionBytesNK;
-  };
-
-  //
-  // Methods
-  //
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    (void) workspace;
-
-    // Optionally append 1s until problem shape is rank-4 (MNKL), in case it is only rank-3 (MNK)
-    auto problem_shape_MNKL = append<4>(problem_shape, 1);
-    auto [M,N,K,L] = problem_shape_MNKL;
-
-    auto ptr_A = reinterpret_cast<InternalElementA const*>(args.ptr_A);
-    auto ptr_B = reinterpret_cast<InternalElementB const*>(args.ptr_B);
-
-    Tensor tensor_a = make_tensor(ptr_A, make_layout(make_shape(M,K,L), args.dA));
-    Tensor tensor_b = make_tensor(ptr_B, make_layout(make_shape(N,K,L), args.dB));
-
-    typename Params::TMA_A tma_load_a = make_tma_copy_A_sm90(
-        GmemTiledCopyA{},
-        tensor_a,
-        SmemLayoutA{}(_,_,cute::Int<0>{}),
-        TileShape{},
-        ClusterShape{});
-    typename Params::TMA_B tma_load_b = make_tma_copy_B_sm90(
-        GmemTiledCopyB{},
-        tensor_b,
-        SmemLayoutB{}(_,_,cute::Int<0>{}),
-        TileShape{},
-        ClusterShape{});
-    uint32_t transaction_bytes_mk = TmaTransactionBytesMK;
-    uint32_t transaction_bytes_nk = TmaTransactionBytesNK;
-    uint32_t transaction_bytes = transaction_bytes_mk + transaction_bytes_nk;
-
-    return {
-      tma_load_a,
-      tma_load_b,
-      transaction_bytes,
-      transaction_bytes_mk,
-      transaction_bytes_nk
-    };
-  }
-
-  template<class ProblemShape>
-  static bool
-  can_implement(
-      ProblemShape const& problem_shape,
-      [[maybe_unused]] Arguments const& args) {
-    constexpr int tma_alignment_bits = 128;
-    auto problem_shape_MNKL = append<4>(problem_shape, 1);
-    auto [M,N,K,L] = problem_shape_MNKL;
-    
-    bool implementable = true;
-    constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
-    implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_A>(cute::make_shape(M,K,L), StrideA{});
-    constexpr int min_tma_aligned_elements_B = tma_alignment_bits / cutlass::sizeof_bits<ElementB>::value;
-    implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_B>(cute::make_shape(N,K,L), StrideB{});
-
-    if (!implementable) {
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
-    }
-    return implementable;
-  }
-
-  static constexpr int K_PIPE_MAX = DispatchPolicy::Stages;
-  static constexpr int K_PIPE_MMAS = 1;
-  static constexpr uint32_t TmaTransactionBytesMK =
-        cutlass::bits_to_bytes(size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast<uint32_t>(sizeof_bits<ElementA>::value));
-  static constexpr uint32_t TmaTransactionBytesNK =
-        cutlass::bits_to_bytes(size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast<uint32_t>(sizeof_bits<ElementB>::value));
-  static constexpr uint32_t TmaTransactionBytes = TmaTransactionBytesMK + TmaTransactionBytesNK;
-
-  /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance
-  CUTLASS_DEVICE
-  static void prefetch_tma_descriptors(Params const& mainloop_params) {
-    cute::prefetch_tma_descriptor(mainloop_params.tma_load_a.get_tma_descriptor());
-    cute::prefetch_tma_descriptor(mainloop_params.tma_load_b.get_tma_descriptor());
-  }
-
-  /// Set up the data needed by this collective for load and mma.
-  /// Returns a tuple of tensors. The collective and the kernel layer have the contract
-  /// Returned tuple must contain at least two elements, with the first two elements being:
-  /// gA_mkl - The tma tensor, A after a local tile so it has shape  (BLK_M,BLK_K,m,k,l)
-  /// gB_nkl - The tma tensor, B after a local tile so it has shape  (BLK_N,BLK_K,n,k,l)
-  /// The rest of the tensors can be specified as needed by this collective.
-  template <class ProblemShape_MNKL>
-  CUTLASS_DEVICE auto
-  load_init(ProblemShape_MNKL const& problem_shape_MNKL, Params const& mainloop_params) const {
-    using X = Underscore;
-    // Separate out problem shape for convenience
-    auto [M,N,K,L] = problem_shape_MNKL;
-
-    // TMA requires special handling of strides to deal with coord codomain mapping
-    // Represent the full tensors -- get these from TMA
-    Tensor mA_mkl = mainloop_params.tma_load_a.get_tma_tensor(make_shape(M,K,L));                            // (m,k,l)
-    Tensor mB_nkl = mainloop_params.tma_load_b.get_tma_tensor(make_shape(N,K,L));                            // (n,k,l)
-
-    // Make tiled views, defer the slice
-    Tensor gA_mkl = local_tile(mA_mkl, TileShape{}, make_coord(_,_,_), Step<_1, X,_1>{});        // (BLK_M,BLK_K,m,k,l)
-    Tensor gB_nkl = local_tile(mB_nkl, TileShape{}, make_coord(_,_,_), Step< X,_1,_1>{});        // (BLK_N,BLK_K,n,k,l)
-
-    return cute::make_tuple(gA_mkl, gB_nkl);
-  }
-
-  /// Perform a collective-scoped matrix multiply-accumulate
-  /// Producer Perspective
-  template <
-    class TensorA, class TensorB,
-    class KTileIterator, class BlockCoord
-  >
-  CUTLASS_DEVICE void
-  load(
-      Params const& mainloop_params,
-      MainloopPipeline pipeline,
-      PipelineState smem_pipe_write,
-      cute::tuple<TensorA, TensorB> const& load_inputs,
-      BlockCoord const& blk_coord,
-      KTileIterator k_tile_iter, int k_tile_count,
-      int thread_idx,
-      uint32_t block_rank_in_cluster,
-      TensorStorage& shared_tensors) {
-    int lane_predicate = cute::elect_one_sync();
-
-    if (lane_predicate) {
-      Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});        // (BLK_M,BLK_K,PIPE)
-      Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});        // (BLK_N,BLK_K,PIPE)
-
-      //
-      // Prepare the TMA loads for A and B
-      //
-
-      constexpr uint32_t cluster_shape_x = get<0>(typename DispatchPolicy::ClusterShape());
-      uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x};
-
-      Tensor gA_mkl = get<0>(load_inputs);
-      Tensor gB_nkl = get<1>(load_inputs);
-
-      auto block_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y);
-      auto block_tma_b = mainloop_params.tma_load_b.get_slice(cluster_local_block_id.x);
-
-      // Partition the inputs based on the current block coordinates.
-      auto [m_coord, n_coord, k_coord, l_coord] = blk_coord;
-      Tensor gA = gA_mkl(_,_,m_coord,_,l_coord);                                                     // (BLK_M,BLK_K,k)
-      Tensor gB = gB_nkl(_,_,n_coord,_,l_coord);                                                     // (BLK_N,BLK_K,k)
-
-      // Applies the mapping from block_tma_a
-      Tensor tAgA = block_tma_a.partition_S(gA);                                                 // (TMA,TMA_M,TMA_K,k)
-      Tensor tAsA = block_tma_a.partition_D(sA);                                              // (TMA,TMA_M,TMA_K,PIPE)
-
-      Tensor tBgB = block_tma_b.partition_S(gB);                                                 // (TMA,TMA_N,TMA_K,k)
-      Tensor tBsB = block_tma_b.partition_D(sB);                                              // (TMA,TMA_N,TMA_K,PIPE)
-
-      uint16_t mcast_mask_a = 0;
-      uint16_t mcast_mask_b = 0;
-
-      // Issue TmaLoads
-      // Maps the tile -> block, value
-      if constexpr (cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>) {
-        auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{}; // (m,n) -> block_id
-        for (int n = 0; n < size<1>(block_layout); ++n) {
-          mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{}));
-        }
-      }
-
-      if constexpr (cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>) {
-        auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{}; // (m,n) -> block_id
-        for (int m = 0; m < size<0>(block_layout); ++m) {
-          mcast_mask_b |= (uint16_t(1) << block_layout(m,cluster_local_block_id.y,Int<0>{}));
-        }
-      }
-
-      // Mainloop
-      CUTLASS_PRAGMA_NO_UNROLL
-      for ( ; k_tile_count > 0; --k_tile_count) {
-        // LOCK smem_pipe_write for _writing_
-        pipeline.producer_acquire(smem_pipe_write);
-
-        //
-        // Copy gmem to smem for *k_tile_iter
-        //
-
-        using BarrierType = typename MainloopPipeline::ProducerBarrierType;
-        BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write);
-
-        int write_stage = smem_pipe_write.index();
-        copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage));
-        copy(mainloop_params.tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage));
-        ++k_tile_iter;
-
-        // Advance smem_pipe_write
-        ++smem_pipe_write;
-      }
-    }
-  }
-
-  /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster
-  CUTLASS_DEVICE void
-  load_tail(MainloopPipeline pipeline, PipelineState smem_pipe_write) {
-    int lane_predicate = cute::elect_one_sync();
-
-    // Issue the epilogue waits
-    if (lane_predicate) {
-      /* This helps avoid early exit of blocks in Cluster
-       * Waits for all stages to either be released (all 
-       * Consumer UNLOCKs), or if the stage was never used
-       * then would just be acquired since the phase was 
-       * still inverted from make_producer_start_state
-       */
-      pipeline.producer_tail(smem_pipe_write);
-    }
-  }
-
-  /// Perform a collective-scoped matrix multiply-accumulate
-  /// Consumer Perspective
-  template <
-    class FrgTensorC
-  >
-  CUTLASS_DEVICE void
-  mma(MainloopPipeline pipeline,
-      PipelineState smem_pipe_read,
-      FrgTensorC& accum,
-      int k_tile_count,
-      int thread_idx,
-      TensorStorage& shared_tensors,
-      Params const& mainloop_params) {
-    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
-    static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3.");
-    static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3.");
-    static_assert(cute::is_void_v<SmemCopyAtomA>,
-      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
-    static_assert(cute::is_void_v<SmemCopyAtomB>,
-      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
-
-    Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});          // (BLK_M,BLK_K,PIPE)
-    Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});          // (BLK_N,BLK_K,PIPE)
-
-    //
-    // Define C accumulators and A/B partitioning
-    //
-
-    // Layout of warp group to thread mapping
-
-    static_assert(stride<0>(typename TiledMma::ALayout{}) == 0 and 
-                  stride<0>(typename TiledMma::BLayout{}) == 0 and
-                  size<0>(typename TiledMma::ALayout{}) == NumThreadsPerWarpGroup and
-                  size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup, 
-                  "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
-
-    constexpr int MmaWarpGroups = size(TiledMma{}) / NumThreadsPerWarpGroup;
-    Layout warp_group_thread_layout = make_layout(Int<MmaWarpGroups>{}, 
-                                                  Int<NumThreadsPerWarpGroup>{});
-
-    int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / NumThreadsPerWarpGroup, 0);
-
-    TiledMma tiled_mma;
-    auto thread_mma = tiled_mma.get_slice(warp_group_thread_layout(warp_group_idx));
-
-    Tensor tCsA = thread_mma.partition_A(sA);                                                 // (MMA,MMA_M,MMA_K,PIPE)
-    Tensor tCsB = thread_mma.partition_B(sB);                                                 // (MMA,MMA_N,MMA_K,PIPE)
-
-    // Allocate "fragments/descriptors"
-    Tensor tCrA = thread_mma.make_fragment_A(tCsA);                                           // (MMA,MMA_M,MMA_K,PIPE)
-    Tensor tCrB = thread_mma.make_fragment_B(tCsB);                                           // (MMA,MMA_N,MMA_K,PIPE)
-
-    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum));                                                         // M
-    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum));                                                         // N
-    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB));                                                          // K
-    CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB));                                                       // PIPE
-    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));                                         // PIPE
-    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));                                         // PIPE
-
-    //
-    // PIPELINED MAIN LOOP
-    //
-    static_assert((0 <= K_PIPE_MMAS) && (K_PIPE_MMAS <  K_PIPE_MAX),
-        "ERROR : Incorrect number of MMAs in flight");
-
-    // We release buffers to producer warps(dma load) with some mmas in flight
-    PipelineState smem_pipe_release = smem_pipe_read;
-
-    // Prologue GMMAs
-    int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
-    assert(k_tile_count >= 1);
-    tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
-    warpgroup_fence_operand(accum);
-    {
-      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
-      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
-      pipeline.consumer_wait(smem_pipe_read, barrier_token);
-
-      int read_stage = smem_pipe_read.index();
-      warpgroup_arrive();
-      tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
-      // Unroll the K mode manually to set scale D to 1
-      CUTLASS_PRAGMA_UNROLL
-      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
-        // (V,M,K) x (V,N,K) => (V,M,N)
-        cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accum);
-        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
-      }
-
-      warpgroup_commit_batch();
-
-      ++smem_pipe_read;
-    }
-
-    tiled_mma.accumulate_ = GMMA::ScaleOut::One;
-
-    warpgroup_fence_operand(accum);
-    CUTLASS_PRAGMA_UNROLL
-    for (int k_tile_prologue = prologue_mma_count - 1; k_tile_prologue > 0; --k_tile_prologue)
-    {
-      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
-      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
-      pipeline.consumer_wait(smem_pipe_read, barrier_token);
-
-      int read_stage = smem_pipe_read.index();
-      warpgroup_arrive();
-      // (V,M,K) x (V,N,K) => (V,M,N)
-      cute::gemm(tiled_mma, tCrA(_,_,_,read_stage), tCrB(_,_,_,read_stage), accum);
-      warpgroup_commit_batch();
-
-      ++smem_pipe_read;
-    }
-
-    warpgroup_fence_operand(accum);
-    // Mainloop GMMAs
-    k_tile_count -= prologue_mma_count;
-
-    CUTLASS_PRAGMA_NO_UNROLL
-    for ( ; k_tile_count > 0; --k_tile_count)
-    {
-      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
-      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
-      pipeline.consumer_wait(smem_pipe_read, barrier_token);
-
-      //
-      // Compute on k_tile
-      //
-
-      int read_stage = smem_pipe_read.index();
-      warpgroup_fence_operand(accum);
-      warpgroup_arrive();
-      // (V,M,K) x (V,N,K) => (V,M,N)
-      cute::gemm(tiled_mma, tCrA(_,_,_,read_stage), tCrB(_,_,_,read_stage), accum);
-      warpgroup_commit_batch();
-
-      /// Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure smem_pipe_write is consumed
-      warpgroup_wait<K_PIPE_MMAS>();
-      warpgroup_fence_operand(accum);
-
-      // UNLOCK smem_pipe_release, done _computing_ on it
-      pipeline.consumer_release(smem_pipe_release);
-
-      // Advance smem_pipe_read and smem_pipe_release
-      ++smem_pipe_read;
-      ++smem_pipe_release;
-    }
-
-    warpgroup_fence_operand(accum);
-  }
-
-  /// Perform a Consumer Epilogue to release all buffers
-  CUTLASS_DEVICE void
-  mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) {
-    // Prologue GMMAs
-    int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
-    k_tile_count -= prologue_mma_count;
-
-    smem_pipe_release.advance(k_tile_count);
-    
-    // Wait on all GMMAs to complete
-    warpgroup_wait<0>();
-
-    for (int count = 0; count < prologue_mma_count; ++count) {
-      pipeline.consumer_release(smem_pipe_release);                 // UNLOCK smem_pipe_release, done _computing_ on it
-      ++smem_pipe_release;
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::gemm::collective
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8.hpp
deleted file mode 100755
index da5274469..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8.hpp
+++ /dev/null
@@ -1,584 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/dispatch_policy.hpp"
-#include "cutlass/gemm/collective/fp8_accumulation.hpp"
-#include "cutlass/trace.h"
-#include "cutlass/numeric_types.h"
-
-#include "cute/arch/cluster_sm90.hpp"
-#include "cute/arch/copy_sm90.hpp"
-#include "cute/algorithm/functional.hpp"
-#include "cute/atom/mma_atom.hpp"
-#include "cute/algorithm/gemm.hpp"
-#include "cute/tensor_predicate.hpp"
-#include "cute/numeric/arithmetic_tuple.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm::collective {
-using namespace cute;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// WarpSpecialized Mainloop
-template <
-  int Stages,
-  class ClusterShape,
-  class KernelSchedule,
-  class TileShape_,
-  class ElementA_,
-  class StrideA_,
-  class ElementB_,
-  class StrideB_,
-  class TiledMma_,
-  class GmemTiledCopyA_,
-  class SmemLayoutAtomA_,
-  class SmemCopyAtomA_,
-  class TransformA_,
-  class GmemTiledCopyB_,
-  class SmemLayoutAtomB_,
-  class SmemCopyAtomB_,
-  class TransformB_>
-struct CollectiveMma<
-    MainloopSm90TmaGmmaWarpSpecializedFP8<Stages, ClusterShape, KernelSchedule>,
-    TileShape_,
-    ElementA_,
-    StrideA_,
-    ElementB_,
-    StrideB_,
-    TiledMma_,
-    GmemTiledCopyA_,
-    SmemLayoutAtomA_,
-    SmemCopyAtomA_,
-    TransformA_,
-    GmemTiledCopyB_,
-    SmemLayoutAtomB_,
-    SmemCopyAtomB_,
-    TransformB_>
-{
-  //
-  // Type Aliases
-  //
-  using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedFP8<Stages, ClusterShape, KernelSchedule>;
-  using TileShape = TileShape_;
-  using ElementA = ElementA_;
-  using StrideA = StrideA_;
-  using ElementB = ElementB_;
-  using StrideB = StrideB_;
-  using TiledMma = TiledMma_;
-  using ElementAccumulator = typename TiledMma::ValTypeC;
-  using GmemTiledCopyA = GmemTiledCopyA_;
-  using GmemTiledCopyB = GmemTiledCopyB_;
-  using SmemLayoutAtomA = SmemLayoutAtomA_;
-  using SmemLayoutAtomB = SmemLayoutAtomB_;
-  using SmemCopyAtomA = SmemCopyAtomA_;
-  using SmemCopyAtomB = SmemCopyAtomB_;
-  using TransformA = TransformA_;
-  using TransformB = TransformB_;
-  using ArchTag = typename DispatchPolicy::ArchTag;
-
-  using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{}));
-  using MainloopPipeline = cutlass::PipelineTmaAsync<DispatchPolicy::Stages>;
-  using PipelineState = cutlass::PipelineState<DispatchPolicy::Stages>;
-
-  using PipelineParams = typename MainloopPipeline::Params;
-
-  static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
-  static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-
-  static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
-  static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-
-  // Tile along modes in a way that maximizes the TMA box size.
-  using SmemLayoutA = decltype(tile_to_shape(
-      SmemLayoutAtomA{},
-      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
-      cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideA>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
-  using SmemLayoutB = decltype(tile_to_shape(
-      SmemLayoutAtomB{},
-      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
-      cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideB>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
-
-  static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 1 or more.");
-  static_assert(cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeA>::value &&
-                cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeB>::value,
-                "MMA atom must source both A and B operand from smem_desc for this mainloop.");
-  static_assert(cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>,
-      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
-  static_assert(cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>,
-      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
-
-  struct SharedStorage
-  {
-    struct TensorStorage : cute::aligned_struct<128, _0> {
-      cute::array_aligned<typename TiledMma::ValTypeA, cute::cosize_v<SmemLayoutA>> smem_A;
-      cute::array_aligned<typename TiledMma::ValTypeB, cute::cosize_v<SmemLayoutB>> smem_B;
-    } tensors;
-
-    using PipelineStorage = typename MainloopPipeline::SharedStorage;
-    PipelineStorage pipeline;
-  };
-  using TensorStorage = typename SharedStorage::TensorStorage;
-  using PipelineStorage = typename SharedStorage::PipelineStorage;
-
-  // Host side kernel arguments
-  struct Arguments {
-    ElementA const* ptr_A;
-    StrideA dA;
-    ElementB const* ptr_B;
-    StrideB dB;
-    uint32_t mma_promotion_interval = 4;
-  };
-
-  // Device side kernel params
-  struct Params {
-    // Assumption: StrideA is congruent with Problem_MK
-    using TMA_A = decltype(make_tma_copy_A_sm90(
-        GmemTiledCopyA{},
-        make_tensor(static_cast<ElementA const*>(nullptr), repeat_like(StrideA{}, int32_t(0)), StrideA{}),
-        SmemLayoutA{}(_,_,0),
-        TileShape{},
-        ClusterShape{}));
-    // Assumption: StrideB is congruent with Problem_NK
-    using TMA_B = decltype(make_tma_copy_B_sm90(
-        GmemTiledCopyB{},
-        make_tensor(static_cast<ElementB const*>(nullptr), repeat_like(StrideB{}, int32_t(0)), StrideB{}),
-        SmemLayoutB{}(_,_,0),
-        TileShape{},
-        ClusterShape{}));
-    TMA_A tma_load_a;
-    TMA_B tma_load_b;
-    uint32_t tma_transaction_bytes = TmaTransactionBytes;
-    uint32_t tma_transaction_bytes_mk = TmaTransactionBytesMK;
-    uint32_t tma_transaction_bytes_nk = TmaTransactionBytesNK;
-    uint32_t mma_promotion_interval = 4;
-  };
-
-  //
-  // Methods
-  //
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    (void) workspace;
-
-    // Optionally append 1s until problem shape is rank-4 (MNKL), in case it is only rank-3 (MNK)
-    auto problem_shape_MNKL = append<4>(problem_shape, 1);
-    auto [M,N,K,L] = problem_shape_MNKL;
-
-    auto ptr_A = reinterpret_cast<ElementA const*>(args.ptr_A);
-    auto ptr_B = reinterpret_cast<ElementB const*>(args.ptr_B);
-
-    Tensor tensor_a = make_tensor(ptr_A, make_layout(make_shape(M,K,L), args.dA));
-    Tensor tensor_b = make_tensor(ptr_B, make_layout(make_shape(N,K,L), args.dB));
-    typename Params::TMA_A tma_load_a = make_tma_copy_A_sm90(
-        GmemTiledCopyA{},
-        tensor_a,
-        SmemLayoutA{}(_,_,cute::Int<0>{}),
-        TileShape{},
-        ClusterShape{});
-    typename Params::TMA_B tma_load_b = make_tma_copy_B_sm90(
-        GmemTiledCopyB{},
-        tensor_b,
-        SmemLayoutB{}(_,_,cute::Int<0>{}),
-        TileShape{},
-        ClusterShape{});
-    uint32_t transaction_bytes_mk = TmaTransactionBytesMK;
-    uint32_t transaction_bytes_nk = TmaTransactionBytesNK;
-    uint32_t transaction_bytes = transaction_bytes_mk + transaction_bytes_nk;
-
-    return {
-      tma_load_a,
-      tma_load_b,
-      transaction_bytes,
-      transaction_bytes_mk,
-      transaction_bytes_nk,
-      args.mma_promotion_interval
-    };
-  }
-
-  template<class ProblemShape>
-  static bool
-  can_implement(
-      ProblemShape const& problem_shape,
-      [[maybe_unused]] Arguments const& args) {
-    constexpr int tma_alignment_bits = 128;
-    auto problem_shape_MNKL = append<4>(problem_shape, 1);
-    auto [M,N,K,L] = problem_shape_MNKL;
-    
-    bool implementable = true;
-    constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
-    implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_A>(cute::make_shape(M,K,L), StrideA{});
-    constexpr int min_tma_aligned_elements_B = tma_alignment_bits / cutlass::sizeof_bits<ElementB>::value;
-    implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_B>(cute::make_shape(N,K,L), StrideB{});
-    /* MMA promotion interval should be a multiple of the number of MMA instructions issued by each mainloop iteration. */
-    implementable = implementable && (args.mma_promotion_interval % (size<2>(TileShape{})() / TiledMma().template tile_size_mnk<2>()()) == 0);
-
-    if (!implementable) {
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
-    }
-    return implementable;
-  }
-
-  static constexpr int K_PIPE_MAX = DispatchPolicy::Stages;
-  static constexpr int K_PIPE_MMAS = 1;
-  static constexpr uint32_t TmaTransactionBytesMK =
-        cutlass::bits_to_bytes(size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast<uint32_t>(sizeof_bits<ElementA>::value));
-  static constexpr uint32_t TmaTransactionBytesNK =
-        cutlass::bits_to_bytes(size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast<uint32_t>(sizeof_bits<ElementB>::value));
-  static constexpr uint32_t TmaTransactionBytes = TmaTransactionBytesMK + TmaTransactionBytesNK;
-
-  /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance
-  CUTLASS_DEVICE
-  static void prefetch_tma_descriptors(Params const& mainloop_params)
-  {
-    cute::prefetch_tma_descriptor(mainloop_params.tma_load_a.get_tma_descriptor());
-    cute::prefetch_tma_descriptor(mainloop_params.tma_load_b.get_tma_descriptor());
-  }
-
-  /// Set up the data needed by this collective for load and mma.
-  /// Returns a tuple of tensors. The collective and the kernel layer have the contract
-  /// Returned tuple must contain at least two elements, with the first two elements being:
-  /// gA_mkl - The tma tensor, A after a local tile so it has shape  (BLK_M,BLK_K,m,k,l)
-  /// gB_nkl - The tma tensor, B after a local tile so it has shape  (BLK_N,BLK_K,n,k,l)
-  template <class ProblemShape_MNKL>
-  CUTLASS_DEVICE auto
-  load_init(ProblemShape_MNKL const& problem_shape_MNKL, Params const& mainloop_params) const {
-    using X = Underscore;
-    // Separate out problem shape for convenience
-    auto [M,N,K,L] = problem_shape_MNKL;
-
-    // TMA requires special handling of strides to deal with coord codomain mapping
-    // Represent the full tensors -- get these from TMA
-    Tensor mA_mkl = mainloop_params.tma_load_a.get_tma_tensor(make_shape(M,K,L));                            // (m,k,l)
-    Tensor mB_nkl = mainloop_params.tma_load_b.get_tma_tensor(make_shape(N,K,L));                            // (n,k,l)
-
-    // Make tiled views, defer the slice
-    Tensor gA_mkl = local_tile(mA_mkl, TileShape{}, make_coord(_,_,_), Step<_1, X,_1>{});        // (BLK_M,BLK_K,m,k,l)
-    Tensor gB_nkl = local_tile(mB_nkl, TileShape{}, make_coord(_,_,_), Step< X,_1,_1>{});        // (BLK_N,BLK_K,n,k,l)
-
-    return cute::make_tuple(gA_mkl, gB_nkl);
-  }
-
-  /// Perform a collective-scoped matrix multiply-accumulate
-  /// Producer Perspective
-  template <
-    class TensorA, class TensorB,
-    class KTileIterator, class BlockCoord
-  >
-  CUTLASS_DEVICE void
-  load(
-      Params const& mainloop_params,
-      MainloopPipeline pipeline,
-      PipelineState smem_pipe_write,
-      cute::tuple<TensorA, TensorB> const& load_inputs,
-      BlockCoord const& blk_coord,
-      KTileIterator k_tile_iter, int k_tile_count,
-      int thread_idx,
-      uint32_t block_rank_in_cluster,
-      TensorStorage& shared_tensors) {
-    int lane_predicate = cute::elect_one_sync();
-
-    if (lane_predicate) {
-      Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});        // (BLK_M,BLK_K,PIPE)
-      Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});        // (BLK_N,BLK_K,PIPE)
-
-      //
-      // Prepare the TMA loads for A and B
-      //
-
-      constexpr uint32_t cluster_shape_x = get<0>(ClusterShape());
-      uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x};
-
-      Tensor gA_mkl = get<0>(load_inputs);
-      Tensor gB_nkl = get<1>(load_inputs);
-
-      auto block_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y);
-      auto block_tma_b = mainloop_params.tma_load_b.get_slice(cluster_local_block_id.x);
-
-      // Partition the inputs based on the current block coordinates.
-      auto [m_coord, n_coord, k_coord, l_coord] = blk_coord;
-      Tensor gA = gA_mkl(_,_,m_coord,_,l_coord);                                                     // (BLK_M,BLK_K,k)
-      Tensor gB = gB_nkl(_,_,n_coord,_,l_coord);                                                     // (BLK_N,BLK_K,k)
-
-      // Applies the mapping from block_tma_a
-      Tensor tAgA = block_tma_a.partition_S(gA);                                                 // (TMA,TMA_M,TMA_K,k)
-      Tensor tAsA = block_tma_a.partition_D(sA);                                              // (TMA,TMA_M,TMA_K,PIPE)
-
-      Tensor tBgB = block_tma_b.partition_S(gB);                                                 // (TMA,TMA_N,TMA_K,k)
-      Tensor tBsB = block_tma_b.partition_D(sB);                                              // (TMA,TMA_N,TMA_K,PIPE)
-
-      uint16_t mcast_mask_a = 0;
-      uint16_t mcast_mask_b = 0;
-
-      // Issue TmaLoads
-      // Maps the tile -> block, value
-      if constexpr (cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>) {
-        auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};                       // (m,n) -> block_id
-        for (int n = 0; n < size<1>(block_layout); ++n) {
-          mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{}));
-        }
-      }
-
-      if constexpr (cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>) {
-        auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};                       // (m,n) -> block_id
-        for (int m = 0; m < size<0>(block_layout); ++m) {
-          mcast_mask_b |= (uint16_t(1) << block_layout(m,cluster_local_block_id.y,Int<0>{}));
-        }
-      }
-
-      // Mainloop
-      CUTLASS_PRAGMA_NO_UNROLL
-      for ( ; k_tile_count > 0; --k_tile_count) {
-        // LOCK smem_pipe_write for _writing_
-        pipeline.producer_acquire(smem_pipe_write);
-
-        //
-        // Copy gmem to smem for *k_tile_iter
-        //
-
-        using BarrierType = typename MainloopPipeline::ProducerBarrierType;
-        BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write);
-
-        int write_stage = smem_pipe_write.index();
-        copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage));
-        copy(mainloop_params.tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage));
-        ++k_tile_iter;
-
-        // Advance smem_pipe_write
-        ++smem_pipe_write;
-      }
-    }
-  }
-
-  /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster
-  CUTLASS_DEVICE void
-  load_tail(
-      MainloopPipeline pipeline,
-      PipelineState smem_pipe_write) {
-    int lane_predicate = cute::elect_one_sync();
-
-    // Issue the epilogue waits
-    if (lane_predicate) {
-      /* This helps avoid early exit of blocks in Cluster
-       * Waits for all stages to either be released (all
-       * Consumer UNLOCKs), or if the stage was never used
-       * then would just be acquired since the phase was
-       * still inverted from make_producer_start_state
-       */
-      pipeline.producer_tail(smem_pipe_write);
-    }
-  }
-
-  /// Perform a collective-scoped matrix multiply-accumulate
-  /// Consumer Perspective
-  template <
-    class FrgTensorC
-  >
-  CUTLASS_DEVICE void
-  mma(MainloopPipeline pipeline,
-      PipelineState smem_pipe_read,
-      FrgTensorC& accum,
-      int k_tile_count,
-      int thread_idx,
-      TensorStorage& shared_tensors,
-      Params const& mainloop_params) {
-
-    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
-    static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3.");
-    static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3.");
-    static_assert(cute::is_void_v<SmemCopyAtomA>,
-      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
-    static_assert(cute::is_void_v<SmemCopyAtomB>,
-      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
-
-    Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});          // (BLK_M,BLK_K,PIPE)
-    Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});          // (BLK_N,BLK_K,PIPE)
-
-    //
-    // Define C accumulators and A/B partitioning
-    //
-    
-    // Layout of warp group to thread mapping
-
-    static_assert(stride<0>(typename TiledMma::ALayout{}) == 0 and 
-                  stride<0>(typename TiledMma::BLayout{}) == 0 and
-                  size<0>(typename TiledMma::ALayout{}) == NumThreadsPerWarpGroup and
-                  size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup, 
-                  "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
-
-    constexpr int MmaWarpGroups = size(TiledMma{}) / NumThreadsPerWarpGroup;
-    Layout warp_group_thread_layout = make_layout(Int<MmaWarpGroups>{}, 
-                                                  Int<NumThreadsPerWarpGroup>{});
-
-    int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / NumThreadsPerWarpGroup, 0);
-
-    TiledMma tiled_mma;
-    auto thread_mma = tiled_mma.get_slice(warp_group_thread_layout(warp_group_idx));
-
-    Tensor tCsA = thread_mma.partition_A(sA);                                                 // (MMA,MMA_M,MMA_K,PIPE)
-    Tensor tCsB = thread_mma.partition_B(sB);                                                 // (MMA,MMA_N,MMA_K,PIPE)
-
-    // Allocate "fragments/descriptors"
-    Tensor tCrA = thread_mma.make_fragment_A(tCsA);                                           // (MMA,MMA_M,MMA_K,PIPE)
-    Tensor tCrB = thread_mma.make_fragment_B(tCsB);                                           // (MMA,MMA_N,MMA_K,PIPE)
-
-    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum));                                                         // M
-    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum));                                                         // N
-    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB));                                                          // K
-    CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB));                                                       // PIPE
-    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));                                         // PIPE
-    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));                                         // PIPE
-
-    //
-    // PIPELINED MAIN LOOP
-    //
-    static_assert((0 <= K_PIPE_MMAS) && (K_PIPE_MMAS <  K_PIPE_MAX),
-        "ERROR : Incorrect number of MMAs in flight");
-
-    // We release buffers to producer warps(dma load) with some mmas in flight
-    PipelineState smem_pipe_release = smem_pipe_read;
-
-    // Prologue GMMAs
-    int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
-
-    tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
-
-    GmmaFP8Accumulation accumulation(accum, mainloop_params.mma_promotion_interval, size<2>(tCrA));
-    warpgroup_fence_operand(accumulation());
-    CUTLASS_PRAGMA_UNROLL
-    for (int k_tile_prologue = prologue_mma_count; k_tile_prologue > 0; --k_tile_prologue)
-    {
-      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
-      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
-      pipeline.consumer_wait(smem_pipe_read, barrier_token);
-
-      if (accumulation.prepare_if_needed()) {
-        tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
-      }
-
-      int read_stage = smem_pipe_read.index();
-      warpgroup_arrive();
-      // Unroll the K mode manually to set scale D to 1
-      CUTLASS_PRAGMA_UNROLL
-      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
-        // (V,M,K) x (V,N,K) => (V,M,N)
-        cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulation());
-        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
-      }
-      warpgroup_commit_batch();
-
-      accumulation.promote_if_needed();
-
-      ++smem_pipe_read;
-    }
-
-    warpgroup_fence_operand(accumulation());
-    // Mainloop GMMAs
-    k_tile_count -= prologue_mma_count;
-
-    CUTLASS_PRAGMA_NO_UNROLL
-    for ( ; k_tile_count > 0; --k_tile_count)
-    {
-      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
-      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
-      pipeline.consumer_wait(smem_pipe_read, barrier_token);
-
-      //
-      // Compute on k_tile
-      //
-
-      int read_stage = smem_pipe_read.index();
-
-      if (accumulation.prepare_if_needed()) {
-        tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
-      }
-
-      warpgroup_fence_operand(accumulation());
-      warpgroup_arrive();
-      // Unroll the K mode manually to set scale D to 1
-      CUTLASS_PRAGMA_UNROLL
-      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
-        // (V,M,K) x (V,N,K) => (V,M,N)
-        cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulation());
-        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
-      }
-      warpgroup_commit_batch();
-
-      /// Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure smem_pipe_write is consumed
-      warpgroup_wait<K_PIPE_MMAS>();
-      warpgroup_fence_operand(accumulation());
-
-      accumulation.promote_if_needed();
-
-      pipeline.consumer_release(smem_pipe_release);                 // UNLOCK smem_pipe_release, done _computing_ on it
-
-      // Advance smem_pipe_read and smem_pipe_release
-      ++smem_pipe_read;
-      ++smem_pipe_release;
-    }
-
-    accumulation.promote_residue_if_needed();
-
-    warpgroup_fence_operand(accumulation());
-  }
-
-  /// Perform a Consumer Epilogue to release all buffers
-  CUTLASS_DEVICE void
-  mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) {
-    // Prologue GMMAs
-    int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
-    k_tile_count -= prologue_mma_count;
-
-    smem_pipe_release.advance(k_tile_count);
-
-    // Wait on all GMMAs to complete
-    warpgroup_wait<0>();
-
-    for (int count = 0; count < prologue_mma_count; ++count) {
-      pipeline.consumer_release(smem_pipe_release);                 // UNLOCK smem_pipe_release, done _computing_ on it
-      ++smem_pipe_release;
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::gemm::collective
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_sparse_mma_tma_gmma_ss_warpspecialized.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_sparse_mma_tma_gmma_ss_warpspecialized.hpp
deleted file mode 100755
index 01e83bdf5..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_sparse_mma_tma_gmma_ss_warpspecialized.hpp
+++ /dev/null
@@ -1,724 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/collective/builders/sm90_sparse_config.inl"
-#include "cutlass/gemm/dispatch_policy.hpp"
-#include "cutlass/numeric_types.h"
-#include "cutlass/pipeline/pipeline.hpp"
-#include "cutlass/trace.h"
-
-#include "cute/arch/cluster_sm90.hpp"
-#include "cute/arch/copy_sm90.hpp"
-#include "cute/algorithm/functional.hpp"
-#include "cute/atom/mma_atom.hpp"
-#include "cute/algorithm/gemm.hpp"
-#include "cute/tensor_predicate.hpp"
-#include "cute/numeric/arithmetic_tuple.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm::collective {
-using namespace cute;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// WarpSpecialized Mainloop
-template <
-  int Stages,
-  class ClusterShape,
-  class KernelSchedule,
-  class TileShape_,
-  class ElementA_,
-  class LayoutPairAE_,
-  class ElementB_,
-  class StrideB_,
-  class TiledMma_,
-  class GmemTiledCopyA_,
-  class SmemLayoutAtomA_,
-  class SmemCopyAtomA_,
-  class TransformA_,
-  class GmemTiledCopyB_,
-  class SmemLayoutAtomB_,
-  class SmemCopyAtomB_,
-  class TransformB_>
-struct CollectiveMma<
-    MainloopSm90TmaGmmaWarpSpecializedSparse<Stages, ClusterShape, KernelSchedule>,
-    TileShape_,
-    ElementA_,
-    LayoutPairAE_,
-    ElementB_,
-    StrideB_,
-    TiledMma_,
-    GmemTiledCopyA_,
-    SmemLayoutAtomA_,
-    SmemCopyAtomA_,
-    TransformA_,
-    GmemTiledCopyB_,
-    SmemLayoutAtomB_,
-    SmemCopyAtomB_,
-    TransformB_>
-{
-  //
-  // Type Aliases
-  //
-  using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedSparse<Stages, ClusterShape, KernelSchedule>;
-  using TileShape = TileShape_;
-  using TiledMma = TiledMma_;
-  using ElementA = ElementA_;
-  using ElementAMma = typename TiledMma::ValTypeA;
-  using ElementAMmaRaw = typename ElementAMma::raw_type;
-  using LayoutPairAE = LayoutPairAE_;
-  using LayoutA = remove_cvref_t<decltype(get<0>(LayoutPairAE{}))>;
-  using LayoutE = remove_cvref_t<decltype(get<1>(LayoutPairAE{}))>;
-  using StrideA = decltype(cute::stride(LayoutA{}));
-  using ElementB = ElementB_;
-  using ElementBMma = typename TiledMma::ValTypeB;
-  using StrideB = StrideB_;
-  using ElementEMma = typename TiledMma::ValTypeE;
-  using ElementE = typename ElementEMma::raw_type;
-  using ElementAccumulator = typename TiledMma::ValTypeC;
-  using GmemTiledCopyA = GmemTiledCopyA_;
-  using GmemTiledCopyB = GmemTiledCopyB_;
-  using SmemLayoutAtomA = SmemLayoutAtomA_;
-  using SmemLayoutAtomB = SmemLayoutAtomB_;
-  using SmemCopyAtomA = SmemCopyAtomA_;
-  using SmemCopyAtomB = SmemCopyAtomB_;
-  using TransformA = TransformA_;
-  using TransformB = TransformB_;
-  using ArchTag = typename DispatchPolicy::ArchTag;
-  using ArrayElementA = ElementA;
-  using ArrayElementB = ElementB;
-
-  static_assert(is_sparse<ElementAMma>::value, "ElementAMma is sparse");
-  static_assert(!is_sparse<ElementA>::value, "ElementA is not sparse");
-
-  static constexpr int ElementAMmaSparsity = ElementAMma::sparsity;
-  static constexpr int ElementEMmaSparsity = ElementEMma::sparsity;
-
-  // LayoutA is nested in the stride due to the sparsity.
-  static constexpr bool is_A_mn_major = cute::is_same_v<decltype(get<0>(LayoutA{}.stride())), Int<ElementAMmaSparsity>>;
-  static constexpr bool is_B_mn_major = cutlass::gemm::detail::is_major<0,StrideB>();
-
-  using SparseConfig = cutlass::Sm90GemmSparseConfig<ElementAMma,
-                                                     (is_A_mn_major ? GMMA::Major::MN : GMMA::Major::K),
-                                                     ElementEMma,
-                                                     decltype(cute::min(size<2>(TileShape{}),_128{}))>;
-
-  // The offline permutation for the metadata.
-  using SmemLayoutAtomE_ = typename SparseConfig::TensorEAtom;
-  using SmemLayoutAtomE  = ComposedLayout<Swizzle<0,4,3>,
-                                          smem_sparse_ptr_flag_bits<ElementEMmaSparsity, sizeof_bits_v<ElementE>>,
-                                          SmemLayoutAtomE_>;
-
-  // Metadata pathways
-  using SmemCopyAtomE = AutoVectorizingCopy;
-  using GmemCopyAtomE = GmemTiledCopyA;
-
-  using CtaShape_MNK = TileShape;
-  using MainloopPipeline = cutlass::PipelineTmaAsync<DispatchPolicy::Stages>;
-  using PipelineState = cutlass::PipelineState<DispatchPolicy::Stages>;
-
-  using PipelineParams = typename MainloopPipeline::Params;
-
-  static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M,K)");
-  static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-
-  static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (N,K)");
-  static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-
-  // Tile along modes in a way that maximizes the TMA box size.
-  using SmemLayoutA = decltype(tile_to_shape(
-      SmemLayoutAtomA{},
-      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
-      cute::conditional_t<is_A_mn_major, Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
-  using SmemLayoutE = decltype(tile_to_shape(
-      SmemLayoutAtomE{},
-      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{})));
-  using SmemLayoutB = decltype(tile_to_shape(
-      SmemLayoutAtomB{},
-      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
-      cute::conditional_t<is_B_mn_major, Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
-
-  static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 2 or more.");
-  static_assert(cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeA>::value &&
-                cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeB>::value,
-                "MMA atom must source both A and B operand from smem_desc for this mainloop.");
-  static_assert(cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>,
-      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
-  static_assert(cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>,
-      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
-
-  static_assert(cute::is_void_v<SmemCopyAtomA>,
-    "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
-  static_assert(cute::is_void_v<SmemCopyAtomB>,
-    "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
-
-  // TMA converts f32 input to tf32 when copying from GMEM to SMEM
-  // For all other types, cast to size equivalent uint type to avoid any rounding by TMA.
-  using TmaInternalElementA = cute::sparse_elem<ElementAMmaSparsity,
-                                                cute::conditional_t<cute::is_same_v<ElementA, float>,
-                                                                    cutlass::tfloat32_t,
-                                                                    uint_bit_t<sizeof_bits_v<ElementAMmaRaw>>>>;
-  using TmaInternalElementB = cute::conditional_t<cute::is_same_v<float, ElementB>, 
-                                                  tfloat32_t,
-                                                  uint_bit_t<sizeof_bits_v<ElementBMma>>>;
-
-  struct SharedStorage
-  {
-    struct TensorStorage {
-      alignas(128) cute::ArrayEngine<ElementAMma, cute::cosize_v<SmemLayoutA>> smem_A;
-      alignas(128) cute::ArrayEngine<ElementBMma, cute::cosize_v<SmemLayoutB>> smem_B;
-      alignas(128) cute::ArrayEngine<ElementEMma, cute::cosize_v<SmemLayoutE>> smem_E;
-    } tensors;
-
-    using PipelineStorage = typename MainloopPipeline::SharedStorage;
-    PipelineStorage pipeline;
-  };
-  using TensorStorage = typename SharedStorage::TensorStorage;
-  using PipelineStorage = typename SharedStorage::PipelineStorage;
-
-  static constexpr int K_PIPE_MAX = DispatchPolicy::Stages;
-  static constexpr int K_PIPE_MMAS = 0;
-
-  static constexpr uint32_t TmaTransactionBytes =
-        cutlass::bits_to_bytes(cosize(take<0,2>(SmemLayoutA{})) * cute::sizeof_bits_v<ElementAMma>) +
-        cutlass::bits_to_bytes(cosize(take<0,2>(SmemLayoutE{})) * cute::sizeof_bits_v<ElementEMma>) +
-        cutlass::bits_to_bytes(cosize(take<0,2>(SmemLayoutB{})) * cute::sizeof_bits_v<ElementBMma>);
-
-  // Host side kernel arguments
-  struct Arguments {
-    ElementA const* ptr_A{};
-    LayoutA layout_a{};
-    ElementB const* ptr_B{};
-    StrideB dB{};
-    ElementE const* ptr_E{};
-    LayoutE layout_e{};
-  };
-
-  // Device side kernel params
-  struct Params {
-
-    using TMA_A = decltype(make_tma_copy<typename TmaInternalElementA::raw_type>(
-        GmemTiledCopyA{},
-        make_tensor(recast_ptr<TmaInternalElementA>(nullptr), LayoutA{}),
-        SmemLayoutA{}(_,_,cute::Int<0>{}),
-        make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})),
-        size<1>(ClusterShape{})));  // mcast along N mode for this M load, if any
-
-    using TMA_E = decltype(make_tma_copy<uint64_t>( // use uint64_t to get the largest loading box.
-        GmemCopyAtomE{},
-        make_tensor(recast_ptr<sparse_elem<ElementEMmaSparsity, ElementE>>(nullptr), LayoutE{}),
-        SmemLayoutE{}(_,_,cute::Int<0>{}),
-        make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})),
-        size<1>(ClusterShape{})));  // mcast along N mode for this M load, if any
-
-    using TMA_B = decltype(make_tma_copy<TmaInternalElementB>(
-        GmemTiledCopyB{},
-        make_tensor(static_cast<TmaInternalElementB const*>(nullptr), repeat_like(StrideB{}, int32_t(0)), StrideB{}),
-        SmemLayoutB{}(_,_,cute::Int<0>{}),
-        make_shape(shape<1>(TileShape{}), shape<2>(TileShape{})),
-        size<0>(ClusterShape{}))); // mcast along M mode for this N load, if any
-
-    TMA_A tma_load_a;
-    TMA_E tma_load_e;
-    TMA_B tma_load_b;
-    LayoutA layout_a;
-    LayoutE layout_e;
-    uint32_t tma_transaction_bytes = TmaTransactionBytes;
-  };
-
-  //
-  // Methods
-  //
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    (void) workspace;
-
-    // Optionally append 1s until problem shape is rank-4 (MNKL), in case it is only rank-3 (MNK)
-    auto problem_shape_MNKL = append<4>(problem_shape, 1);
-    auto [M,N,K,L] = problem_shape_MNKL;
-
-    auto ptr_A = recast_ptr<TmaInternalElementA>(args.ptr_A);
-    auto ptr_B = recast_ptr<TmaInternalElementB>(args.ptr_B);
-    auto ptr_E = recast_ptr<sparse_elem<ElementEMmaSparsity, ElementE>>(args.ptr_E);
-
-    Tensor tensor_a = make_tensor(ptr_A, args.layout_a);
-    Tensor tensor_b = make_tensor(ptr_B, make_layout(make_shape(N,K,L), args.dB));
-    Tensor tensor_e = make_tensor(ptr_E, args.layout_e);
-
-    typename Params::TMA_A tma_load_a = make_tma_copy<typename TmaInternalElementA::raw_type>(
-        GmemTiledCopyA{},
-        tensor_a,
-        SmemLayoutA{}(_,_,cute::Int<0>{}),
-        make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})),
-        size<1>(ClusterShape{})); // mcast along N mode for this M load, if any
-
-    typename Params::TMA_E tma_load_e = make_tma_copy<uint64_t>( // use uint64_t to get the largest loading box.
-        GmemCopyAtomE{},
-        tensor_e,
-        SmemLayoutE{}(_,_,cute::Int<0>{}),
-        make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})),
-        size<1>(ClusterShape{})); // mcast along N mode for this M load, if any
-
-    typename Params::TMA_B tma_load_b = make_tma_copy<TmaInternalElementB>(
-        GmemTiledCopyB{},
-        tensor_b,
-        SmemLayoutB{}(_,_,cute::Int<0>{}),
-        make_shape(shape<1>(TileShape{}), shape<2>(TileShape{})),
-        size<0>(ClusterShape{})); // mcast along M mode for this N load, if any
-
-    return {
-      tma_load_a,
-      tma_load_e,
-      tma_load_b,
-      args.layout_a,
-      args.layout_e
-    };
-  }
-
-  template<class ProblemShape>
-  CUTLASS_HOST_DEVICE static bool
-  can_implement(
-      ProblemShape const& problem_shape,
-      [[maybe_unused]] Arguments const& args) {
-    constexpr int tma_alignment_bits = 128;
-    constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
-    constexpr int min_tma_aligned_elements_B = tma_alignment_bits / cutlass::sizeof_bits<ElementB>::value;
-    auto problem_shape_MNKL = append<4>(problem_shape, 1);
-    auto [M,N,K,L] = problem_shape_MNKL;
-    
-    bool size_check = true;
-    // Check Alignment A
-    if constexpr (is_A_mn_major) {
-      size_check = size_check && cutlass::detail::check_alignment<min_tma_aligned_elements_A>(cute::make_shape(M,K/2,L), cute::make_stride(_1{}, M, M*K/2));
-    }
-    else { // If A is K-major
-      size_check = size_check && cutlass::detail::check_alignment<min_tma_aligned_elements_A>(cute::make_shape(M,K/2,L), cute::make_stride(K/2, _1{}, M*K/2));
-    }
-    size_check = size_check && cutlass::detail::check_alignment<min_tma_aligned_elements_B>(cute::make_shape(N,K,L), StrideB{});
-
-    if (!size_check) {
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
-    }
-
-    // Check if layout_a and layout_e is filled correctly
-    auto layout_a_ref = SparseConfig::fill_layoutA(problem_shape_MNKL);
-    auto layout_e_ref = SparseConfig::fill_layoutE(problem_shape_MNKL);
-    bool layout_check = true;
-    layout_check = layout_check && (layout_a_ref == args.layout_a);
-    layout_check = layout_check && (layout_e_ref == args.layout_e);
-
-    if (!layout_check) {
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Layout_a/e mismatch.\n");
-    }
-
-    return size_check && layout_check;
-  }
-
-  /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance
-  CUTLASS_DEVICE
-  static void prefetch_tma_descriptors(Params const& mainloop_params) {
-    cute::prefetch_tma_descriptor(mainloop_params.tma_load_a.get_tma_descriptor());
-    cute::prefetch_tma_descriptor(mainloop_params.tma_load_e.get_tma_descriptor());
-    cute::prefetch_tma_descriptor(mainloop_params.tma_load_b.get_tma_descriptor());
-  }
-
-  /// Set up the data needed by this collective for load and mma.
-  /// Returns a tuple of tensors. The collective and the kernel layer have the contract
-  /// Returned tuple must contain at least two elements, with the first two elements being:
-  /// gA_mkl - The tma tensor, A after a local tile so it has shape  (BLK_M,BLK_K,m,k,l)
-  /// gB_nkl - The tma tensor, B after a local tile so it has shape  (BLK_N,BLK_K,n,k,l)
-  /// The rest of the tensors can be specified as needed by this collective.
-  template <class ProblemShape_MNKL>
-  CUTLASS_DEVICE auto
-  load_init(ProblemShape_MNKL const& problem_shape_MNKL, Params const& mainloop_params) const {
-    using X = Underscore;
-    // Separate out problem shape for convenience
-    auto [M,N,K,L] = problem_shape_MNKL;
-
-    // TMA requires special handling of strides to deal with coord codomain mapping
-    // Represent the full tensors -- get these from TMA
-    Tensor mA_mkl = mainloop_params.tma_load_a.get_tma_tensor(mainloop_params.layout_a.shape());                      // (m,k,l)
-    Tensor mE_mkl = mainloop_params.tma_load_e.get_tma_tensor(mainloop_params.layout_e.shape());                      // (m,k,l)
-    Tensor mB_nkl = mainloop_params.tma_load_b.get_tma_tensor(make_shape(N,K,L));                            // (n,k,l)
-
-    // Make tiled views, defer the slice
-    Tensor gA_mkl = local_tile(mA_mkl, TileShape{}, make_coord(_,_,_), Step<_1, X,_1>{});        // (BLK_M,BLK_K,m,k,l)
-    Tensor gE_mkl = local_tile(mE_mkl, TileShape{}, make_coord(_,_,_), Step<_1, X,_1>{});        // (BLK_M,BLK_K,m,k,l)
-    Tensor gB_nkl = local_tile(mB_nkl, TileShape{}, make_coord(_,_,_), Step< X,_1,_1>{});        // (BLK_N,BLK_K,n,k,l)
-
-    return cute::make_tuple(gA_mkl, gB_nkl, gE_mkl);
-  }
-
-  /// Perform a collective-scoped matrix multiply-accumulate
-  /// Producer Perspective
-  template <
-    class TensorA, class TensorB, class TensorE,
-    class KTileIterator, class BlockCoord
-  >
-  CUTLASS_DEVICE void
-  load(
-      Params const& mainloop_params,
-      MainloopPipeline pipeline, 
-      PipelineState smem_pipe_write,
-      cute::tuple<TensorA, TensorB, TensorE> const& load_inputs,
-      BlockCoord const& blk_coord,
-      KTileIterator k_tile_iter, int k_tile_count,
-      int thread_idx,
-      uint32_t block_rank_in_cluster,
-      TensorStorage& shared_tensors) {
-    int lane_predicate = cute::elect_one_sync();
-
-    if (lane_predicate) {
-      Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.begin()), SmemLayoutA{});        // (BLK_M,BLK_K,PIPE)
-      Tensor sE = make_tensor(make_smem_ptr(shared_tensors.smem_E.begin()), SmemLayoutE{});        // (BLK_M,BLK_K,PIPE)
-      Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.begin()), SmemLayoutB{});        // (BLK_N,BLK_K,PIPE)
-
-      auto [gA_mkl, gB_nkl, gE_mkl] = load_inputs;
-
-      // Define the CTA-in-cluster Layout and Coord
-      Layout cta_layout_mnk = make_layout(ClusterShape{});
-      auto cta_coord_mnk = cta_layout_mnk.get_flat_coord(block_rank_in_cluster);
-
-      // TMA Multicast Masks
-      uint16_t mcast_mask_a = create_tma_multicast_mask<1>(cta_layout_mnk, cta_coord_mnk);
-      uint16_t mcast_mask_e = create_tma_multicast_mask<1>(cta_layout_mnk, cta_coord_mnk);
-      uint16_t mcast_mask_b = create_tma_multicast_mask<0>(cta_layout_mnk, cta_coord_mnk);
-
-      auto block_tma_a = mainloop_params.tma_load_a.get_slice(get<1>(cta_coord_mnk));
-      auto block_tma_e = mainloop_params.tma_load_e.get_slice(get<1>(cta_coord_mnk));
-      auto block_tma_b = mainloop_params.tma_load_b.get_slice(get<0>(cta_coord_mnk));
-
-      // Partition the inputs based on the current block coordinates.
-      auto [m_coord, n_coord, k_coord, l_coord] = blk_coord;
-      Tensor gA = gA_mkl(_,_,m_coord,_,l_coord);                                                     // (BLK_M,BLK_K,k)
-      Tensor gE = gE_mkl(_,_,m_coord,_,l_coord);                                                     // (BLK_M,BLK_K,k)
-      Tensor gB = gB_nkl(_,_,n_coord,_,l_coord);                                                     // (BLK_N,BLK_K,k)
-
-      // Applies the mapping from block_tma_a
-      Tensor tAgA = block_tma_a.partition_S(gA);                                                 // (TMA,TMA_M,TMA_K,k)
-      Tensor tAsA = block_tma_a.partition_D(sA);                                              // (TMA,TMA_M,TMA_K,PIPE)
-
-      Tensor tEgE = block_tma_e.partition_S(gE);                                                 // (TMA,TMA_M,TMA_K,k)
-      Tensor tEsE = block_tma_e.partition_D(sE);                                              // (TMA,TMA_M,TMA_K,PIPE)
-
-      Tensor tBgB = block_tma_b.partition_S(gB);                                                 // (TMA,TMA_N,TMA_K,k)
-      Tensor tBsB = block_tma_b.partition_D(sB);                                              // (TMA,TMA_N,TMA_K,PIPE)
-
-      // Mainloop
-      CUTLASS_PRAGMA_NO_UNROLL
-      for ( ; k_tile_count > 0; --k_tile_count)
-      {
-        // LOCK smem_pipe_write for _writing_
-        pipeline.producer_acquire(smem_pipe_write);
-
-        //
-        // Copy gmem to smem for *k_tile_iter
-        //
-
-        using BarrierType = typename MainloopPipeline::ProducerBarrierType;
-        BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write);
-
-        int write_stage = smem_pipe_write.index();
-        copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage));
-        copy(mainloop_params.tma_load_e.with(*tma_barrier, mcast_mask_e), tEgE(_,_,_,*k_tile_iter), tEsE(_,_,_,write_stage));
-        copy(mainloop_params.tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage));
-        ++k_tile_iter;
-
-        // Advance smem_pipe_write
-        ++smem_pipe_write;
-      }
-    }
-  }
-
-  /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster
-  CUTLASS_DEVICE void
-  load_tail(MainloopPipeline pipeline, PipelineState smem_pipe_write) {
-    int lane_predicate = cute::elect_one_sync();
-
-    // Issue the epilogue waits
-    if (lane_predicate) {
-      /* This helps avoid early exit of blocks in Cluster
-       * Waits for all stages to either be released (all 
-       * Consumer UNLOCKs), or if the stage was never used
-       * then would just be acquired since the phase was 
-       * still inverted from make_producer_start_state
-       */
-      pipeline.producer_tail(smem_pipe_write);
-    }
-  }
-
-  /// Perform a collective-scoped matrix multiply-accumulate
-  /// Consumer Perspective
-  template <
-    class FrgTensorC
-  >
-  CUTLASS_DEVICE void
-  mma(MainloopPipeline pipeline,
-      PipelineState smem_pipe_read,
-      FrgTensorC& accum,
-      int k_tile_count,
-      int thread_idx,
-      TensorStorage& shared_tensors,
-      Params const& mainloop_params) {
-    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
-    static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3.");
-    static_assert(cute::rank(SmemLayoutE{}) == 3, "Smem layout must be rank 3.");
-    static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3.");
-
-    Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.begin()), SmemLayoutA{});          // (BLK_M,BLK_K,PIPE)
-    Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.begin()), SmemLayoutB{});          // (BLK_N,BLK_K,PIPE)
-
-    Tensor sE_ = make_tensor(make_smem_ptr(shared_tensors.smem_E.begin()), SmemLayoutE{});         // (BLK_M,BLK_K,PIPE)
-    Tensor sE = as_position_independent_swizzle_tensor(sE_);
-
-    //
-    // Define C accumulators and A/B partitioning
-    //
-
-    TiledMma tiled_mma;
-    auto thread_mma = tiled_mma.get_thread_slice(thread_idx);
-
-    Tensor tCsA = thread_mma.partition_A(sA);                                                 // (MMA,MMA_M,MMA_K,PIPE)
-    Tensor tCsB = thread_mma.partition_B(sB);                                                 // (MMA,MMA_N,MMA_K,PIPE)
-
-    // Allocate "fragments/descriptors"
-    Tensor tCrA = thread_mma.make_fragment_A(tCsA);                                           // (MMA,MMA_M,MMA_K,PIPE)
-    Tensor tCrB = thread_mma.make_fragment_B(tCsB);                                           // (MMA,MMA_N,MMA_K,PIPE)
-
-    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum));                                                         // M
-    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum));                                                         // N
-    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB));                                                          // K
-    CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB));                                                       // PIPE
-    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));                                         // PIPE
-    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));                                         // PIPE
-
-    auto copy_atom_E = Copy_Atom<SmemCopyAtomE, uint32_t>{};
-
-    Tensor tCsE = partition_E(thread_mma, sE(_,_,Int<0>{}));            // (MMA,MMA_M,MMA_K)
-    Tensor tCrE = make_fragment_like<ElementEMma>(tCsE);                // (MMA,MMA_M,MMA_K)
-
-    auto smem_tiled_copy_E = make_tiled_copy_E(copy_atom_E, tiled_mma);
-    auto smem_thr_copy_E   = smem_tiled_copy_E.get_thread_slice(thread_idx);
-
-    Tensor tEsE  = smem_thr_copy_E.partition_S(sE);                     // (ECPY,ECPY_M,ECPY_K)
-    Tensor tErE  = smem_thr_copy_E.retile_D(tCrE);                      // (ECPY,ECPY_M,ECPY_K)
-
-    //
-    // PIPELINED MAIN LOOP
-    //
-    static_assert((0 <= K_PIPE_MMAS) && (K_PIPE_MMAS <  K_PIPE_MAX),
-        "ERROR : Incorrect number of MMAs in flight");
-
-    // We release buffers to producer warps(dma load) with some mmas in flight
-    PipelineState smem_pipe_release = smem_pipe_read;
-
-    // Prologue GMMAs
-    int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
-
-    tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
-
-    warpgroup_fence_operand(accum);
-    CUTLASS_PRAGMA_UNROLL
-    for (int k_tile_prologue = prologue_mma_count; k_tile_prologue > 0; --k_tile_prologue)
-    {
-      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
-      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
-      pipeline.consumer_wait(smem_pipe_read, barrier_token);
-      int read_stage = smem_pipe_read.index();
-
-      // Load metadata smem->rmem for one stage
-      copy(smem_tiled_copy_E, tEsE(_,_,_,read_stage), tErE);
-
-      warpgroup_arrive();
-      // Unroll the K mode manually to set scale D to 1
-      CUTLASS_PRAGMA_UNROLL
-      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
-        cute::gemm(tiled_mma, make_zip_tensor(tCrA(_,_,k_block,read_stage), tErE(_,_,k_block)), tCrB(_,_,k_block,read_stage), accum);
-        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
-      }
-
-      warpgroup_commit_batch();
-
-      ++smem_pipe_read;
-    }
-
-    warpgroup_fence_operand(accum);
-    // Mainloop GMMAs
-    k_tile_count -= prologue_mma_count;
-
-    CUTLASS_PRAGMA_NO_UNROLL
-    for ( ; k_tile_count > 0; --k_tile_count)
-    {
-      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
-      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
-      pipeline.consumer_wait(smem_pipe_read, barrier_token);
-      int read_stage = smem_pipe_read.index();
-
-      // Load metadata smem->rmem for one stage
-      copy(smem_tiled_copy_E, tEsE(_,_,_,read_stage), tErE);
-
-      warpgroup_fence_operand(accum);
-      warpgroup_arrive();
-      // Unroll the K mode manually to set scale D to 1
-      CUTLASS_PRAGMA_UNROLL
-      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
-        cute::gemm(tiled_mma, make_zip_tensor(tCrA(_,_,k_block,read_stage), tErE(_,_,k_block)), tCrB(_,_,k_block,read_stage), accum);
-        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
-      }
-      warpgroup_commit_batch();
-
-      /// Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure smem_pipe_write is consumed
-      warpgroup_wait<K_PIPE_MMAS>();
-      warpgroup_fence_operand(accum);
-
-      // UNLOCK smem_pipe_release, done _computing_ on it
-      pipeline.consumer_release(smem_pipe_release);
-
-      // Advance smem_pipe_read and smem_pipe_release
-      ++smem_pipe_read;
-      ++smem_pipe_release;
-    }
-
-    warpgroup_fence_operand(accum);
-  }
-
-  /// Perform a Consumer Epilogue to release all buffers
-  CUTLASS_DEVICE void
-  mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) {
-    // Prologue GMMAs
-    int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
-    k_tile_count -= prologue_mma_count;
-
-    smem_pipe_release.advance(k_tile_count);
-    
-    // Wait on all GMMAs to complete
-    warpgroup_wait<0>();
-
-    for (int count = 0; count < prologue_mma_count; ++count) {
-      pipeline.consumer_release(smem_pipe_release);                 // UNLOCK smem_pipe_release, done _computing_ on it
-      ++smem_pipe_release;
-    }
-  }
-
-private:
-
-  template <class MMA_Atom,
-            class AtomLayoutMNK,
-            class PermutationMNK,
-            class ETensor>
-  CUTE_HOST_DEVICE static constexpr
-  auto
-  thrfrg_E(TiledMMA<MMA_Atom, AtomLayoutMNK, PermutationMNK> const& mma, ETensor&& etensor)
-  {
-    using TiledMma = TiledMMA<MMA_Atom, AtomLayoutMNK, PermutationMNK>;
-
-    CUTE_STATIC_ASSERT_V(rank(etensor) >= Int<2>{});
-
-    // Reorder the tensor for the TiledAtom
-    auto t_tile = make_tile(get<0>(PermutationMNK{}),
-                            get<2>(PermutationMNK{}));
-    auto t_tensor = logical_divide(etensor, t_tile);                 // (PermM,PermK)
-
-    // Tile the tensor for the Atom
-    auto e_tile = make_tile(make_layout(size<0>(typename TiledMma::AtomShape_MNK{})),
-                            make_layout(size<2>(typename TiledMma::AtomShape_MNK{})));
-    auto e_tensor = zipped_divide(t_tensor, e_tile);                 // ((AtomM,AtomK),(RestM,RestK))
-
-    // Transform the Atom mode from (M,K) to (Thr,Val)
-    using AtomLayoutE_TV = typename TiledMma::Atom::Traits::ELayout;
-    auto tv_tensor = e_tensor.compose(AtomLayoutE_TV{},_);           // ((ThrV,FrgV),(RestM,RestK))
-
-    // Tile the tensor for the Thread
-    auto thr_tile = make_tile(_,
-                              make_tile(make_layout(size<1>(mma.thr_layout_vmnk_)),
-                                        make_layout(size<3>(mma.thr_layout_vmnk_))));
-    auto thr_tensor = zipped_divide(tv_tensor, thr_tile);            // ((ThrV,(ThrM,ThrK)),(FrgV,(RestM,RestK)))
-
-    return thr_tensor;
-  }
-
-  template<class... MArgs>
-  CUTE_HOST_DEVICE static constexpr
-  auto
-  get_layoutE_TV(TiledMMA<MArgs...> const& mma)
-  {
-    // (M,K) -> (M,K)
-    auto ref_E = make_layout(make_shape(tile_size<0>(mma), tile_size<2>(mma)));
-    // (ethrid,val) -> (M,K)
-    auto layoutE_TV = thrfrg_E(mma, ref_E);
-
-    // (ThrV,(ThrM,ThrK)) -> (ThrV,(ThrM,ThrN,ThrK))
-    auto etile = make_tile(_,
-                            make_tile(make_layout(make_shape (size<1>(mma.thr_layout_vmnk_), size<2>(mma.thr_layout_vmnk_)),
-                                                  make_stride(               Int<1>{} ,                Int<0>{} )),
-                                      _));
-
-    // thr_idx -> (ThrV,ThrM,ThrN,ThrK)
-    auto thridx_2_thrid = right_inverse(mma.thr_layout_vmnk_);
-
-    // (thr_idx,val) -> (M,K)
-    return layoutE_TV.compose(etile, _).compose(thridx_2_thrid, _);
-  }
-
-  template <class... MArgs, class ETensor>
-  CUTE_HOST_DEVICE static constexpr
-  auto
-  partition_E(ThrMMA<MArgs...> const& thr_mma, ETensor&& etensor)
-  {
-    auto thr_tensor = make_tensor(static_cast<ETensor&&>(etensor).data(), thrfrg_E(thr_mma, etensor.layout()));
-
-    auto thr_vmk = make_coord(get<0>(thr_mma.thr_vmnk_), make_coord(get<1>(thr_mma.thr_vmnk_), get<3>(thr_mma.thr_vmnk_)));
-    return thr_tensor(thr_vmk, make_coord(_, repeat<rank<1,1>(thr_tensor)>(_)));
-  }
-
-  template <class... CArgs, class... MArgs>
-  CUTE_HOST_DEVICE static constexpr
-  auto
-  make_tiled_copy_E(Copy_Atom<CArgs...> const& copy_atom,
-                    TiledMMA<MArgs...>  const& mma)
-  {
-    return make_tiled_copy_impl(copy_atom, get_layoutE_TV(mma), make_shape(tile_size<0>(mma),tile_size<2>(mma)));
-  }
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::gemm::collective
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/base_grouped.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/base_grouped.h
deleted file mode 100755
index eec61981f..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/device/base_grouped.h
+++ /dev/null
@@ -1,478 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*!
-  \file
-  \brief Base device-level grouped kernel.
-*/
-
-#pragma once
-
-#include <limits>
-#include <numeric>
-#include <vector>
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/device_kernel.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-#include "cutlass/gemm/kernel/gemm_universal.h"
-
-#include "cutlass/gemm/kernel/default_gemm_universal.h"
-#include "cutlass/gemm/device/default_gemm_configuration.h"
-
-#include "cutlass/trace.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace device {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// GEMM Grouped
-template <typename BaseKernel_>
-class BaseGrouped {
-public:
-
-  using BaseKernel = BaseKernel_;
-
-  using ElementA = typename BaseKernel::ElementA;
-  using LayoutA = typename BaseKernel::LayoutA;
-  using TensorRefA = TensorRef<ElementA const, LayoutA>;
-  static ComplexTransform const kTransformA = BaseKernel::kTransformA;
-  static int const kAlignmentA = BaseKernel::kAlignmentA;
-
-  using ElementB = typename BaseKernel::ElementB;
-  using LayoutB = typename BaseKernel::LayoutB;
-  using TensorRefB = TensorRef<ElementB const, LayoutB>;
-  static ComplexTransform const kTransformB = BaseKernel::kTransformB;
-  static int const kAlignmentB = BaseKernel::kAlignmentB;
-
-  using ElementC = typename BaseKernel::ElementC;
-  using LayoutC = typename BaseKernel::LayoutC;
-  using TensorRefC = TensorRef<ElementC const, LayoutC>;
-  using TensorRefD = TensorRef<ElementC, LayoutC>;
-  static int const kAlignmentC = BaseKernel::kAlignmentC;
-
-  using ElementAccumulator = typename BaseKernel::Mma::Policy::Operator::ElementC;
-
-  using EpilogueOutputOp = typename BaseKernel::EpilogueOutputOp;
-  using ThreadblockSwizzle = typename BaseKernel::ThreadblockSwizzle;
-
-  using Operator = typename BaseKernel::Operator;
-  using WarpMmaOperator = typename BaseKernel::Mma::Policy::Operator;
-
-  using ArchMmaOperator = typename WarpMmaOperator::ArchMmaOperator;
-  using MathOperator = typename WarpMmaOperator::MathOperator;
-  using OperatorClass = typename WarpMmaOperator::OperatorClass;
-  using ArchTag = typename WarpMmaOperator::ArchTag;
-  using ThreadblockShape = typename BaseKernel::Mma::Shape;
-  using WarpShape = typename BaseKernel::WarpShape;
-  using InstructionShape = typename BaseKernel::InstructionShape;
-  static int const kStages = BaseKernel::Mma::kStages;
-
-  /// Argument structure
-  using Arguments = typename BaseKernel::Arguments;
-
-  using ProblemInfo = typename BaseKernel::ProblemVisitor::ProblemInfo;
-
-protected:
-
-  /// Kernel parameters object
-  typename BaseKernel::Params params_;
-
-private:
-
-  /// Get the number of tiles across all problems in a group
-  static int32_t group_tile_count(const cutlass::gemm::GemmCoord* problem_sizes_ptr, int problem_count) {
-    int32_t tiles = 0;
-    for (int32_t i = 0; i < problem_count; ++i) {
-      cutlass::gemm::GemmCoord problem = problem_sizes_ptr[i];
-      BaseKernel::ProblemVisitor::possibly_transpose_problem(problem);
-      tiles += problem_tile_count(problem);
-    }
-    return tiles;
-  }
-
-  /// Copy from `data` to `workspace`
-  Status copy_to_workspace(void* workspace, void* data, size_t bytes) {
-    cudaError_t cuda_error = cudaMemcpy(workspace, data, bytes, cudaMemcpyHostToDevice);
-    if (cuda_error != cudaSuccess) {
-      // Call cudaGetLastError() to clear the error bit
-      cuda_error = cudaGetLastError();
-      CUTLASS_TRACE_HOST(
-          "  cudaMemcpy() returned error "
-          << cudaGetErrorString(cuda_error));
-      return Status::kErrorInternal;
-    }
-
-    return Status::kSuccess;
-  }
-
-  /// Precomputes scheduling information for the grouped GEMM
-  Status precompute(Arguments const &args, int32_t tile_count, void* workspace) {
-    size_t workspace_bytes = get_workspace_size(args);
-    std::vector<uint8_t> host_workspace(workspace_bytes);
-    BaseKernel::ProblemVisitor::host_precompute(args.host_problem_sizes,
-                                                args.problem_count,
-                                                args.threadblock_count,
-                                                (void*)host_workspace.data());
-    return copy_to_workspace(workspace, host_workspace.data(), workspace_bytes);
-  }
-
-  /// Reorder `data` according to `indices`
-  template <typename T>
-  static void reorder_array(T* data, const std::vector<size_t>& indices) {
-    // For now, simply create a copy of the data and then copy over to the original.
-    std::vector<T> copy(indices.size());
-    for (size_t i = 0; i < indices.size(); ++i) {
-      copy.at(i) = data[indices[i]];
-    }
-
-    memcpy(data, copy.data(), indices.size() * sizeof(T));
-  }
-
-public:
-
-  /// Constructs the GEMM.
-  BaseGrouped() { }
-
-  /// Determines whether the GEMM can execute the given problem.
-  static Status can_implement(Arguments const &args) {
-
-    return BaseKernel::can_implement(args);
-  }
-
-  /// Get the number of tiles in a problem
-  static int32_t problem_tile_count(cutlass::gemm::GemmCoord const &problem) {
-    auto grid = BaseKernel::ProblemVisitor::grid_shape(problem);
-    return BaseKernel::ProblemVisitor::tile_count(grid);
-  }
-
-  /// Get the number of tiles across all problems in a group
-  static int32_t group_tile_count(Arguments const &args) {
-    if (args.host_problem_sizes == nullptr) {
-        CUTLASS_TRACE_HOST("Received nullptr for `args.host_problem_sizes");
-        return -1;
-    }
-
-    return group_tile_count(args.host_problem_sizes, args.problem_count);
-  }
-
-  /// Gets the workspace size
-  static size_t get_workspace_size(Arguments const &args) {
-    if (BaseKernel::ProblemVisitor::kRequiresPrecomputation) {
-      return BaseKernel::ProblemVisitor::get_workspace_size(args.host_problem_sizes,
-                                                            args.problem_count,
-                                                            args.threadblock_count);
-    } else {
-      return 0;
-    }
-  }
-
-  /// Computes the grid shape
-  static dim3 get_grid_shape(Arguments const &args) {
-
-    return dim3(args.threadblock_count, 1, 1);
-  }
-
-  /// Computes the maximum number of active blocks per multiprocessor
-  static int maximum_active_blocks(int smem_capacity = -1) {
-
-    CUTLASS_TRACE_HOST("BaseGrouped::maximum_active_blocks()");
-
-    int smem_size = int(sizeof(typename BaseKernel::SharedStorage));
-
-    CUTLASS_TRACE_HOST("  smem_size: " << smem_size << " bytes");
-
-    cudaError_t result;
-    if (smem_size > (48 << 10)) {
-      result = cudaFuncSetAttribute(Kernel<BaseKernel>,
-                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
-                                    smem_size);
-
-      if (result != cudaSuccess) {
-        // Call cudaGetLastError() to clear the error bit
-        result = cudaGetLastError();
-        CUTLASS_TRACE_HOST(
-          "  cudaFuncSetAttribute() returned error "
-          << cudaGetErrorString(result));
-        return -1;
-      }
-    }
-
-    int max_active_blocks = -1;
-    result = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-        &max_active_blocks,
-        Kernel<BaseKernel>,
-        BaseKernel::kThreadCount,
-        smem_size);
-
-    if (result != cudaSuccess) {
-      // Call cudaGetLastError() to clear the error bit
-      result = cudaGetLastError();
-      CUTLASS_TRACE_HOST(
-        "  cudaOccupancyMaxActiveBlocksPerMultiprocessor() returned error "
-        << cudaGetErrorString(result));
-      return -1;
-    }
-
-    CUTLASS_TRACE_HOST("  max_active_blocks: " << max_active_blocks);
-    return max_active_blocks;
-  }
-
-  /// Sorts each pointer passed in according to the indices that sort
-  /// `problem_sizes_ptr` in descending order of problem-K dimension.
-  static void sort_problems(int problem_count,
-                            cutlass::gemm::GemmCoord* problem_sizes_ptr,
-                            int64_t* lda_host_ptr,
-                            int64_t* ldb_host_ptr,
-                            int64_t* ldc_host_ptr,
-                            int64_t* ldd_host_ptr,
-                            int64_t* offset_A_ptr,
-                            int64_t* offset_B_ptr,
-                            int64_t* offset_C_ptr,
-                            int64_t* offset_D_ptr)
-  {
-    std::vector<size_t> indices(problem_count);
-    std::iota(indices.begin(), indices.end(), 0);
-    std::stable_sort(indices.begin(), indices.end(),
-      [&problem_sizes_ptr](size_t i, size_t j) {
-        return problem_sizes_ptr[i].k() > problem_sizes_ptr[j].k();
-      });
-
-    reorder_array(problem_sizes_ptr, indices);
-    reorder_array(lda_host_ptr, indices);
-    reorder_array(ldb_host_ptr, indices);
-    reorder_array(ldc_host_ptr, indices);
-    reorder_array(ldd_host_ptr, indices);
-    reorder_array(offset_A_ptr, indices);
-    reorder_array(offset_B_ptr, indices);
-    reorder_array(offset_C_ptr, indices);
-    reorder_array(offset_D_ptr, indices);
-  }
-
-  /// Computes the number of threadblocks to launch for the grouped kernel
-  static int sufficient(const cutlass::gemm::GemmCoord* problem_sizes_ptr=nullptr,
-                        int problem_count=0,
-                        int available_sm_count=-1) {
-    // Determine the number of blocks that would be launched to fill up a single
-    // wave on the GPU with each SM having maximum occupancy.
-    int device_idx;
-    cudaError_t result = cudaGetDevice(&device_idx);
-    if (result != cudaSuccess) {
-      // Call cudaGetLastError() to clear the error bit
-      result = cudaGetLastError();
-      CUTLASS_TRACE_HOST("  cudaGetDevice() returned error "
-          << cudaGetErrorString(result));
-      return 0;
-    }
-
-    int multiprocessor_count;
-    result = cudaDeviceGetAttribute(&multiprocessor_count,
-      cudaDevAttrMultiProcessorCount, device_idx);
-    if (result != cudaSuccess) {
-      CUTLASS_TRACE_HOST(
-        "  cudaDeviceGetAttribute() returned error "
-        << cudaGetErrorString(result));
-      return 0;
-    }
-
-    bool override_sm_count = (available_sm_count < 0 || available_sm_count > multiprocessor_count);
-    if (override_sm_count) {
-      available_sm_count = multiprocessor_count;
-    }
-
-    int max_active_blocks = maximum_active_blocks();
-    if (max_active_blocks <= 0) {
-      return 0;
-    }
-
-    int occupancy_based_block_count = available_sm_count * max_active_blocks;
-
-    if (problem_sizes_ptr == nullptr || problem_count == 0) {
-      return occupancy_based_block_count;
-    }
-
-    int total_tiles = group_tile_count(problem_sizes_ptr, problem_count);
-
-    // If the group contains a single problem, launching the exact number of
-    // threadblocks needed to cover the problem minimizes the work performed
-    // per threadblock in finding the next tile to compute. We return total_tiles
-    // unless the user has provided the SM count.
-    if (problem_count == 1 && override_sm_count) {
-      return total_tiles;
-    }
-
-    // Choose between the full wave of threadblocks and the tile count. If there
-    // are fewer tiles in the group than threadblocks in the full wave, only
-    // some threadblocks will be assigned tiles. Those threadblocks
-    // which are not assigned tiles still need to perform the work of iterating through
-    // problem sizes to determine that they have no work to do. This competes for cycles
-    // with those threadblocks that are assigned tiles to compute.
-    return std::min(total_tiles, occupancy_based_block_count);
-  }
-
-
-  /// Initializes GEMM state from arguments.
-  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
-
-    CUTLASS_TRACE_HOST("BaseGrouped::initialize() - workspace "
-      << workspace << ", stream: " << (stream ? "non-null" : "null"));
-
-    // Workspace
-    size_t workspace_bytes = get_workspace_size(args);
-
-    if (workspace_bytes && !workspace) {
-      return Status::kErrorWorkspaceNull;
-    }
-
-    if (BaseKernel::ProblemVisitor::kRequiresPrecomputation) {
-      int32_t tile_count = group_tile_count(args);
-      Status status = precompute(args, tile_count, workspace);
-      if (status != Status::kSuccess) {
-        return status;
-      }
-
-      params_ = typename BaseKernel::Params(args, workspace, tile_count);
-    } else {
-      params_ = typename BaseKernel::Params(args, workspace);
-    }
-
-    // Specify shared memory capacity for kernel.
-    int smem_size = int(sizeof(typename BaseKernel::SharedStorage));
-
-    if (smem_size >= (48 << 10)) {
-      cudaError_t result = cudaFuncSetAttribute(Kernel<BaseKernel>,
-                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
-                                    smem_size);
-
-      if (result != cudaSuccess) {
-        return Status::kErrorInternal;
-      }
-    }
-
-    return Status::kSuccess;
-  }
-
-  /// Lightweight update given a subset of arguments
-  Status update(Arguments const &args, void *workspace = nullptr) {
-
-    size_t workspace_bytes = get_workspace_size(args);
-
-    if (workspace_bytes && !workspace) {
-      return Status::kErrorWorkspaceNull;
-    }
-
-    if (BaseKernel::ProblemVisitor::kRequiresPrecomputation) {
-      int32_t tile_count = group_tile_count(args);
-      Status status = precompute(args, tile_count, workspace);
-      if (status != Status::kSuccess) {
-        return status;
-      }
-
-      params_.update(args, workspace, tile_count);
-    } else {
-      params_.update(args, workspace);
-    }
-
-    return Status::kSuccess;
-  }
-
-  /// Runs the kernel using initialized state.
-  Status run(cudaStream_t stream = nullptr) {
-
-    //
-    // Configure grid and block dimensions
-    //
-
-    if (!params_.problem_visitor.problem_count) {
-      return Status::kSuccess;
-    }
-
-    dim3 grid(params_.threadblock_count, 1, 1);
-    dim3 block(BaseKernel::kThreadCount, 1, 1);
-
-    int smem_size = int(sizeof(typename BaseKernel::SharedStorage));
-
-    //
-    // Launch kernel
-    //
-
-    // Launch
-    cutlass::arch::synclog_setup();
-    cutlass::Kernel<BaseKernel><<<grid, block, smem_size, stream>>>(params_);
-
-    //
-    // Query for errors
-    //
-    cudaError_t result = cudaGetLastError();
-
-    if (result != cudaSuccess) {
-      CUTLASS_TRACE_HOST("  grid launch failed with error " << cudaGetErrorString(result));
-      return Status::kErrorInternal;
-    }
-
-    return Status::kSuccess;
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(cudaStream_t stream = nullptr) {
-    return run(stream);
-  }
-
-  /// Initializes and runs the kernel.
-  Status operator()(
-    Arguments const &args,
-    void *workspace,
-    cudaStream_t stream = nullptr) {
-
-    Status status = initialize(args, workspace, stream);
-
-    if (status == Status::kSuccess) {
-      status = run(stream);
-    }
-
-    return status;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace device
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/default_gemm_configuration.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/default_gemm_configuration.h
deleted file mode 100755
index e7ed2da94..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/device/default_gemm_configuration.h
+++ /dev/null
@@ -1,955 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Definitions for GEMM structures
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/arch/mma.h"
-#include "cutlass/arch/wmma.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/epilogue/thread/linear_combination.h"
-#include "cutlass/epilogue/thread/linear_combination_clamp.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace device {
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename OperatorClass,
-  typename ArchTag,
-  typename ElementA, 
-  typename ElementB, 
-  typename ElementC,
-  typename ElementAccumulator
->
-struct DefaultGemmConfiguration;
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename ArchTag,
-  typename ElementA, 
-  typename ElementB, 
-  typename ElementC, 
-  typename ElementAccumulator>
-struct DefaultGemmConfiguration<
-  arch::OpClassSimt, 
-  ArchTag,
-  ElementA, 
-  ElementB, 
-  ElementC, 
-  ElementAccumulator> {
-  
-  static int const kAlignmentA = 1;
-  static int const kAlignmentB = 1;
-  using ThreadblockShape = GemmShape<128, 128, 8>;
-  using WarpShape = GemmShape<32, 64, 8>;
-  using InstructionShape = GemmShape<1, 1, 1>;
-  static int const kStages = 2;
-
-  using EpilogueOutputOp = epilogue::thread::LinearCombination<
-    ElementC,
-    1,
-    ElementAccumulator,
-    ElementAccumulator
-  >;
-
-  using Operator = arch::OpMultiplyAdd;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-template < 
-  typename ArchTag,
-  typename ElementC>
-struct DefaultGemmConfiguration<arch::OpClassSimt, ArchTag, int8_t, int8_t, ElementC, int32_t> {
-  
-  static int const kAlignmentA = 4;
-  static int const kAlignmentB = 4;
-  using ThreadblockShape = GemmShape<128, 128, 32>;
-  using WarpShape = GemmShape<32, 64, 32>;
-  using InstructionShape = GemmShape<1, 1, 4>;
-  static int const kStages = 2;
-
-  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
-    ElementC,
-    1,
-    int32_t,
-    float
-  >;
-
-  using Operator = arch::OpMultiplyAdd;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename ArchTag,
-  typename ElementA, 
-  typename ElementB, 
-  typename ElementC, 
-  typename ElementAccumulator>
-struct DefaultGemmConfiguration<
-  arch::OpClassWmmaTensorOp, 
-  ArchTag,
-  ElementA, 
-  ElementB, 
-  ElementC, 
-  ElementAccumulator> {
-  
-  static int const kAlignmentA = 128 / sizeof_bits<ElementA>::value;
-  static int const kAlignmentB = 128 / sizeof_bits<ElementB>::value;
-
-  static int const kStages = 2;
-  
-  using EpilogueOutputOp = epilogue::thread::LinearCombination<
-    ElementC,
-    128 / sizeof_bits<ElementC>::value,
-    ElementAccumulator,
-    ElementAccumulator
-  >;
-
-  using Operator = arch::OpMultiplyAdd;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename ElementA, 
-  typename ElementB, 
-  typename ElementC, 
-  typename ElementAccumulator>
-struct DefaultGemmConfiguration<
-  arch::OpClassTensorOp, 
-  arch::Sm70,
-  ElementA, 
-  ElementB, 
-  ElementC, 
-  ElementAccumulator> {
-  
-  static int const kAlignmentA = 128 / sizeof_bits<ElementA>::value;
-  static int const kAlignmentB = 128 / sizeof_bits<ElementB>::value;
-
-  using ThreadblockShape = GemmShape<128, 256, 32>;
-  using WarpShape = GemmShape<64, 64, 32>;
-  using InstructionShape = GemmShape<8, 8, 4>;
-  static int const kStages = 2;
-  
-  using EpilogueOutputOp = epilogue::thread::LinearCombination<
-    ElementC,
-    128 / sizeof_bits<ElementC>::value,
-    ElementAccumulator,
-    ElementAccumulator
-  >;
-
-  using Operator = arch::OpMultiplyAdd;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename ElementA, 
-  typename ElementB, 
-  typename ElementC, 
-  typename ElementAccumulator>
-struct DefaultGemmConfiguration<
-  arch::OpClassTensorOp, 
-  arch::Sm75,
-  ElementA, 
-  ElementB, 
-  ElementC, 
-  ElementAccumulator> {
-
-  static int const kAlignmentA = 128 / sizeof_bits<ElementA>::value;
-  static int const kAlignmentB = 128 / sizeof_bits<ElementA>::value;
-  using ThreadblockShape = GemmShape<128, 256, 32>;
-  using WarpShape = GemmShape<64, 64, 32>;
-  using InstructionShape = GemmShape<16, 8, 8>;
-  static int const kStages = 2;
-
-  using EpilogueOutputOp = epilogue::thread::LinearCombination<
-    ElementC,
-    128 / sizeof_bits<ElementC>::value,
-    ElementAccumulator,
-    ElementAccumulator
-  >;
-
-  using Operator = typename platform::conditional<
-      (platform::is_same<ElementA, int8_t>::value ||
-       platform::is_same<ElementA, int4b_t>::value ||
-       platform::is_same<ElementA, uint8_t>::value ||
-       platform::is_same<ElementA, uint4b_t>::value),
-      arch::OpMultiplyAddSaturate, arch::OpMultiplyAdd>::type;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-template < 
-  typename ElementC>
-struct DefaultGemmConfiguration<
-  arch::OpClassTensorOp, 
-  arch::Sm75, 
-  int8_t, 
-  int8_t, 
-  ElementC, 
-  int32_t> {
-  
-  static int const kAlignmentA = 128 / sizeof_bits<int8_t>::value;
-  static int const kAlignmentB = 128 / sizeof_bits<int8_t>::value;
-
-  using ThreadblockShape = GemmShape<128, 256, 64>;
-  using WarpShape = GemmShape<64, 64, 64>;
-  using InstructionShape = GemmShape<8, 8, 16>;
-  static int const kStages = 2;
-
-  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
-      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
-
-  using Operator = arch::OpMultiplyAddSaturate;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-template < 
-  typename ElementC>
-struct DefaultGemmConfiguration<
-  arch::OpClassTensorOp, 
-  arch::Sm75, 
-  int8_t, 
-  uint8_t, 
-  ElementC, 
-  int32_t> {
-  
-  static int const kAlignmentA = 128 / sizeof_bits<int8_t>::value;
-  static int const kAlignmentB = 128 / sizeof_bits<uint8_t>::value;
- 
-  using ThreadblockShape = GemmShape<128, 256, 64>;
-  using WarpShape = GemmShape<64, 64, 64>;
-  using InstructionShape = GemmShape<8, 8, 16>;
-  static int const kStages = 2;
-
-  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
-      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
-
-  using Operator = arch::OpMultiplyAddSaturate;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-template < 
-  typename ElementC>
-struct DefaultGemmConfiguration<
-  arch::OpClassTensorOp, 
-  arch::Sm75, 
-  uint8_t, 
-  int8_t, 
-  ElementC, 
-  int32_t> {
-  
-  static int const kAlignmentA = 128 / sizeof_bits<uint8_t>::value;
-  static int const kAlignmentB = 128 / sizeof_bits<int8_t>::value;
- 
-  using ThreadblockShape = GemmShape<128, 256, 64>;
-  using WarpShape = GemmShape<64, 64, 64>;
-  using InstructionShape = GemmShape<8, 8, 16>;
-  static int const kStages = 2;
-
-  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
-      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
-
-  using Operator = arch::OpMultiplyAddSaturate;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-template < 
-  typename ElementC>
-struct DefaultGemmConfiguration<
-  arch::OpClassTensorOp, 
-  arch::Sm75, 
-  uint8_t, 
-  uint8_t, 
-  ElementC, 
-  int32_t> {
-  
-  static int const kAlignmentA = 128 / sizeof_bits<uint8_t>::value;
-  static int const kAlignmentB = 128 / sizeof_bits<uint8_t>::value;
- 
-  using ThreadblockShape = GemmShape<128, 256, 64>;
-  using WarpShape = GemmShape<64, 64, 64>;
-  using InstructionShape = GemmShape<8, 8, 16>;
-  static int const kStages = 2;
-
-  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
-      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
-
-  using Operator = arch::OpMultiplyAddSaturate;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-template < 
-  typename ElementC>
-struct DefaultGemmConfiguration<
-  arch::OpClassTensorOp, 
-  arch::Sm75, 
-  int4b_t, 
-  int4b_t, 
-  ElementC, 
-  int32_t> {
-   
-  static int const kAlignmentA = 128 / sizeof_bits<int4b_t>::value;
-  static int const kAlignmentB = 128 / sizeof_bits<int4b_t>::value;
- 
-  using ThreadblockShape = GemmShape<128, 256, 128>;
-  using WarpShape = GemmShape<64, 64, 128>;
-  using InstructionShape = GemmShape<8, 8, 32>;
-  static int const kStages = 2;
-
-  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
-      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
-
-  using Operator = arch::OpMultiplyAddSaturate;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-template < 
-  typename ElementC>
-struct DefaultGemmConfiguration<
-  arch::OpClassTensorOp, 
-  arch::Sm75, 
-  int4b_t, 
-  uint4b_t, 
-  ElementC, 
-  int32_t> {
-    
-  static int const kAlignmentA = 128 / sizeof_bits<int4b_t>::value;
-  static int const kAlignmentB = 128 / sizeof_bits<uint4b_t>::value;
- 
-  using ThreadblockShape = GemmShape<128, 256, 128>;
-  using WarpShape = GemmShape<64, 64, 128>;
-  using InstructionShape = GemmShape<8, 8, 32>;
-  static int const kStages = 2;
-
-  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
-      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
-
-  using Operator = arch::OpMultiplyAddSaturate;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-template < 
-  typename ElementC>
-struct DefaultGemmConfiguration<
-  arch::OpClassTensorOp, 
-  arch::Sm75, 
-  uint4b_t, 
-  int4b_t, 
-  ElementC, 
-  int32_t> {
-  
-  static int const kAlignmentA = 128 / sizeof_bits<uint4b_t>::value;
-  static int const kAlignmentB = 128 / sizeof_bits<int4b_t>::value;
-
-  using ThreadblockShape = GemmShape<128, 256, 128>;
-  using WarpShape = GemmShape<64, 64, 128>;
-  using InstructionShape = GemmShape<8, 8, 32>;
-  static int const kStages = 2;
-
-  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
-      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
-
-  using Operator = arch::OpMultiplyAddSaturate;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-template < 
-  typename ElementC>
-struct DefaultGemmConfiguration<
-  arch::OpClassTensorOp, 
-  arch::Sm75, 
-  uint4b_t, 
-  uint4b_t, 
-  ElementC, 
-  int32_t> {
-   
-  static int const kAlignmentA = 128 / sizeof_bits<uint4b_t>::value;
-  static int const kAlignmentB = 128 / sizeof_bits<uint4b_t>::value;
- 
-  using ThreadblockShape = GemmShape<128, 256, 128>;
-  using WarpShape = GemmShape<64, 64, 128>;
-  using InstructionShape = GemmShape<8, 8, 32>;
-  static int const kStages = 2;
-
-  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
-      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
-
-  using Operator = arch::OpMultiplyAddSaturate;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-template < 
-  typename ElementC>
-struct DefaultGemmConfiguration<
-  arch::OpClassTensorOp, 
-  arch::Sm75, 
-  uint1b_t, 
-  uint1b_t, 
-  ElementC, 
-  int32_t> {
-    
-  static int const kAlignmentA = 128 / sizeof_bits<uint1b_t>::value;
-  static int const kAlignmentB = 128 / sizeof_bits<uint1b_t>::value;
- 
-  using ThreadblockShape = GemmShape<128, 256, 512>;
-  using WarpShape = GemmShape<64, 64, 512>;
-  using InstructionShape = GemmShape<8, 8, 128>;
-  static int const kStages = 2;
-
-  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
-      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
-
-  using Operator = arch::OpXorPopc;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <typename ElementA, typename ElementB, typename ElementC,
-          typename ElementAccumulator>
-struct DefaultGemmConfiguration<arch::OpClassTensorOp, arch::Sm80, ElementA,
-                                ElementB, ElementC, ElementAccumulator> {
-
-  static int const kAlignmentA = 128 / sizeof_bits<ElementA>::value;
-  static int const kAlignmentB = 128 / sizeof_bits<ElementA>::value;
-  
-  using ThreadblockShape = GemmShape<128, 256, 64>;
-  using WarpShape = GemmShape<64, 64, 64>;
-  using InstructionShape = GemmShape<16, 8, 16>;
-  static int const kStages = 3;
-
-  using EpilogueOutputOp = epilogue::thread::LinearCombination<
-      ElementC, 128 / sizeof_bits<ElementC>::value, ElementAccumulator,
-      ElementAccumulator>;
-
-  using Operator = typename platform::conditional<
-      (platform::is_same<ElementA, int8_t>::value ||
-       platform::is_same<ElementA, int4b_t>::value ||
-       platform::is_same<ElementA, uint8_t>::value ||
-       platform::is_same<ElementA, uint4b_t>::value),
-      arch::OpMultiplyAddSaturate, arch::OpMultiplyAdd>::type;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-template <typename ElementC,
-          typename ElementAccumulator>
-struct DefaultGemmConfiguration<arch::OpClassTensorOp, arch::Sm80, double,
-                                double, ElementC, ElementAccumulator> {
-
-  static int const kAlignmentA = 1;
-  static int const kAlignmentB = 1;
-  
-  using ThreadblockShape = GemmShape<128, 128, 16>;
-  using WarpShape = GemmShape<32, 64, 16>;
-  using InstructionShape = GemmShape<8, 8, 4>;
-  static int const kStages = 3;
-
-  using EpilogueOutputOp = epilogue::thread::LinearCombination<
-      ElementC, 1, ElementAccumulator,
-      ElementAccumulator>;
-
-  using Operator = arch::OpMultiplyAdd;
-};
-
-
-template <>
-struct DefaultGemmConfiguration<
-    arch::OpClassTensorOp, 
-    arch::Sm80, 
-    complex<double>,
-    complex<double>, 
-    complex<double>,
-    complex<double>
-  > {
-
-  static int const kAlignmentA = 1;
-  static int const kAlignmentB = 1;
-  
-  using ThreadblockShape = GemmShape<64, 64, 16>;
-  using WarpShape = GemmShape<32, 32, 16>;
-  using InstructionShape = GemmShape<8, 8, 4>;
-  static int const kStages = 3;
-
-  using EpilogueOutputOp = epilogue::thread::LinearCombination<
-      complex<double>, 1, complex<double>,
-      complex<double>>;
-
-  using Operator = arch::OpMultiplyAddComplex;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-template < 
-  typename ElementC>
-struct DefaultGemmConfiguration<
-  arch::OpClassTensorOp, 
-  arch::Sm80, 
-  int8_t, 
-  int8_t, 
-  ElementC, 
-  int32_t> {
-     
-  static int const kAlignmentA = 128 / sizeof_bits<int8_t>::value;
-  static int const kAlignmentB = 128 / sizeof_bits<int8_t>::value;
- 
-  using ThreadblockShape = GemmShape<128, 256, 64>;
-  using WarpShape = GemmShape<64, 64, 64>;
-  using InstructionShape = GemmShape<16, 8, 32>;
-  static int const kStages = 3;
-
-  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
-      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
-
-  using Operator = arch::OpMultiplyAddSaturate;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-template < 
-  typename ElementC>
-struct DefaultGemmConfiguration<
-  arch::OpClassTensorOp, 
-  arch::Sm80, 
-  int8_t, 
-  uint8_t, 
-  ElementC, 
-  int32_t> {
-      
-  static int const kAlignmentA = 128 / sizeof_bits<int8_t>::value;
-  static int const kAlignmentB = 128 / sizeof_bits<uint8_t>::value;
-  
-  using ThreadblockShape = GemmShape<128, 256, 64>;
-  using WarpShape = GemmShape<64, 64, 64>;
-  using InstructionShape = GemmShape<16, 8, 32>;
-  static int const kStages = 3;
-
-  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
-      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
-
-  using Operator = arch::OpMultiplyAddSaturate;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-template < 
-  typename ElementC>
-struct DefaultGemmConfiguration<
-  arch::OpClassTensorOp, 
-  arch::Sm80, 
-  uint8_t, 
-  int8_t, 
-  ElementC, 
-  int32_t> {
-      
-  static int const kAlignmentA = 128 / sizeof_bits<uint8_t>::value;
-  static int const kAlignmentB = 128 / sizeof_bits<int8_t>::value;
-  
-  using ThreadblockShape = GemmShape<128, 256, 64>;
-  using WarpShape = GemmShape<64, 64, 64>;
-  using InstructionShape = GemmShape<16, 8, 32>;
-  static int const kStages = 3;
-
-  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
-      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
-
-  using Operator = arch::OpMultiplyAddSaturate;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-template < 
-  typename ElementC>
-struct DefaultGemmConfiguration<
-  arch::OpClassTensorOp, 
-  arch::Sm80, 
-  uint8_t, 
-  uint8_t, 
-  ElementC, 
-  int32_t> {
-      
-  static int const kAlignmentA = 128 / sizeof_bits<uint8_t>::value;
-  static int const kAlignmentB = 128 / sizeof_bits<uint8_t>::value;
-  
-  using ThreadblockShape = GemmShape<128, 256, 64>;
-  using WarpShape = GemmShape<64, 64, 64>;
-  using InstructionShape = GemmShape<16, 8, 32>;
-  static int const kStages = 3;
-
-  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
-      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
-
-  using Operator = arch::OpMultiplyAddSaturate;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-template < 
-  typename ElementC>
-struct DefaultGemmConfiguration<
-  arch::OpClassTensorOp, 
-  arch::Sm80, 
-  int4b_t, 
-  int4b_t, 
-  ElementC, 
-  int32_t> {
-      
-  static int const kAlignmentA = 128 / sizeof_bits<int4b_t>::value;
-  static int const kAlignmentB = 128 / sizeof_bits<int4b_t>::value;
-  
-  using ThreadblockShape = GemmShape<128, 256, 128>;
-  using WarpShape = GemmShape<64, 64, 128>;
-  using InstructionShape = GemmShape<16, 8, 64>;
-  static int const kStages = 3;
-
-  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
-      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
-
-  using Operator = arch::OpMultiplyAddSaturate;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-template < 
-  typename ElementC>
-struct DefaultGemmConfiguration<
-  arch::OpClassTensorOp, 
-  arch::Sm80, 
-  int4b_t, 
-  uint4b_t, 
-  ElementC, 
-  int32_t> {
-       
-  static int const kAlignmentA = 128 / sizeof_bits<int4b_t>::value;
-  static int const kAlignmentB = 128 / sizeof_bits<uint4b_t>::value;
-  
-  using ThreadblockShape = GemmShape<128, 256, 128>;
-  using WarpShape = GemmShape<64, 64, 128>;
-  using InstructionShape = GemmShape<16, 8, 64>;
-  static int const kStages = 3;
-
-  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
-      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
-
-  using Operator = arch::OpMultiplyAddSaturate;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-template < 
-  typename ElementC>
-struct DefaultGemmConfiguration<
-  arch::OpClassTensorOp, 
-  arch::Sm80, 
-  uint4b_t, 
-  int4b_t, 
-  ElementC, 
-  int32_t> {
-       
-  static int const kAlignmentA = 128 / sizeof_bits<uint4b_t>::value;
-  static int const kAlignmentB = 128 / sizeof_bits<int4b_t>::value;
-  
-  using ThreadblockShape = GemmShape<128, 256, 128>;
-  using WarpShape = GemmShape<64, 64, 128>;
-  using InstructionShape = GemmShape<16, 8, 64>;
-  static int const kStages = 3;
-
-  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
-      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
-
-  using Operator = arch::OpMultiplyAddSaturate;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-template < 
-  typename ElementC>
-struct DefaultGemmConfiguration<
-  arch::OpClassTensorOp, 
-  arch::Sm80, 
-  uint4b_t, 
-  uint4b_t, 
-  ElementC, 
-  int32_t> {
-       
-  static int const kAlignmentA = 128 / sizeof_bits<uint4b_t>::value;
-  static int const kAlignmentB = 128 / sizeof_bits<uint4b_t>::value;
-  
-  using ThreadblockShape = GemmShape<128, 256, 128>;
-  using WarpShape = GemmShape<64, 64, 128>;
-  using InstructionShape = GemmShape<16, 8, 64>;
-  static int const kStages = 3;
-
-  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
-      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
-
-  using Operator = arch::OpMultiplyAddSaturate;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-template < 
-  typename ElementC>
-struct DefaultGemmConfiguration<
-  arch::OpClassTensorOp, 
-  arch::Sm80, 
-  uint1b_t, 
-  uint1b_t, 
-  ElementC, 
-  int32_t> {
-       
-  static int const kAlignmentA = 128 / sizeof_bits<uint1b_t>::value;
-  static int const kAlignmentB = 128 / sizeof_bits<uint1b_t>::value;
-  
-  using ThreadblockShape = GemmShape<128, 256, 512>;
-  using WarpShape = GemmShape<64, 64, 512>;
-  using InstructionShape = GemmShape<16, 8, 256>;
-  static int const kStages = 3;
-
-  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
-      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
-
-  using Operator = arch::OpMultiplyAdd;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename ElementC>
-struct DefaultGemmConfiguration<
-  arch::OpClassTensorOp,
-  arch::Sm80,
-  int4b_t,
-  int8_t,
-  ElementC,
-  int32_t> {
-
-  static int const kAlignmentA = 128 / sizeof_bits<int4b_t>::value;
-  static int const kAlignmentB = 128 / sizeof_bits<int8_t>::value;
-
-  using ThreadblockShape = GemmShape<128, 256, 64>;
-  using WarpShape = GemmShape<64, 64, 64>;
-  using InstructionShape = GemmShape<16, 8, 32>;
-  static int const kStages = 3;
-
-  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
-      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
-
-  using Operator = arch::OpMultiplyAddSaturate;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename ElementC>
-struct DefaultGemmConfiguration<
-  arch::OpClassTensorOp,
-  arch::Sm80,
-  int8_t,
-  int4b_t,
-  ElementC,
-  int32_t> {
-
-  static int const kAlignmentA = 128 / sizeof_bits<int8_t>::value;
-  static int const kAlignmentB = 128 / sizeof_bits<int4b_t>::value;
-
-  using ThreadblockShape = GemmShape<128, 256, 64>;
-  using WarpShape = GemmShape<64, 64, 64>;
-  using InstructionShape = GemmShape<16, 8, 32>;
-  static int const kStages = 3;
-
-  using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp<
-      ElementC, 128 / sizeof_bits<ElementC>::value, int32_t, float>;
-
-  using Operator = arch::OpMultiplyAddSaturate;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Base configuration for all {fe4m3, fe5m2} x {fe4m3, fe5m2} combinations on SM89
-template <
-  typename ElementA,
-  typename ElementB,
-  typename ElementC,
-  typename ElementAccumulator>
-struct DefaultGemmConfigurationSm89F8 {
-  static_assert((platform::is_same<ElementA, cutlass::float_e4m3_t>::value ||
-                 platform::is_same<ElementA, cutlass::float_e5m2_t>::value),
-                "ElementA must be of type float_e4m3_t or float_e5m2_t");
-  static_assert((platform::is_same<ElementB, cutlass::float_e4m3_t>::value ||
-                 platform::is_same<ElementB, cutlass::float_e5m2_t>::value),
-                "ElementB must be of type float_e4m3_t or float_e5m2_t");
-
-  static int const kAlignmentA = 128 / sizeof_bits<ElementA>::value;
-  static int const kAlignmentB = 128 / sizeof_bits<ElementB>::value;
-
-  using ThreadblockShape = GemmShape<128, 256, 64>;
-  using WarpShape = GemmShape<64, 64, 64>;
-  using InstructionShape = GemmShape<16, 8, 32>;
-  static int const kStages = 3;
-
-  using EpilogueOutputOp = epilogue::thread::LinearCombination<
-      ElementC, 128 / sizeof_bits<ElementC>::value, ElementAccumulator,
-      ElementAccumulator>;
-
-  using Operator = arch::OpMultiplyAdd;
-};
-
-/// Partial specialization for SM89 fe4m3 x fe4m3
-template <typename ElementC, typename ElementAccumulator>
-struct DefaultGemmConfiguration<
-  arch::OpClassTensorOp,
-  arch::Sm89,
-  cutlass::float_e4m3_t,
-  cutlass::float_e4m3_t,
-  ElementC,
-  ElementAccumulator> : DefaultGemmConfigurationSm89F8<
-                            cutlass::float_e4m3_t,
-                            cutlass::float_e4m3_t,
-                            ElementC,
-                            ElementAccumulator> {};
-
-/// Partial specialization for SM89 fe4m3 x fe5m2
-template <typename ElementC, typename ElementAccumulator>
-struct DefaultGemmConfiguration<
-  arch::OpClassTensorOp,
-  arch::Sm89,
-  cutlass::float_e4m3_t,
-  cutlass::float_e5m2_t,
-  ElementC,
-  ElementAccumulator> : DefaultGemmConfigurationSm89F8<
-                            cutlass::float_e4m3_t,
-                            cutlass::float_e5m2_t,
-                            ElementC,
-                            ElementAccumulator> {};
-
-/// Partial specialization for SM89 fe5m2 x fe4m3
-template <typename ElementC, typename ElementAccumulator>
-struct DefaultGemmConfiguration<
-  arch::OpClassTensorOp,
-  arch::Sm89,
-  cutlass::float_e5m2_t,
-  cutlass::float_e4m3_t,
-  ElementC,
-  ElementAccumulator> : DefaultGemmConfigurationSm89F8<
-                            cutlass::float_e5m2_t,
-                            cutlass::float_e4m3_t,
-                            ElementC,
-                            ElementAccumulator> {};
-
-/// Partial specialization for SM89 fe5m2 x fe5m2
-template <typename ElementC, typename ElementAccumulator>
-struct DefaultGemmConfiguration<
-  arch::OpClassTensorOp,
-  arch::Sm89,
-  cutlass::float_e5m2_t,
-  cutlass::float_e5m2_t,
-  ElementC,
-  ElementAccumulator> : DefaultGemmConfigurationSm89F8<
-                            cutlass::float_e5m2_t,
-                            cutlass::float_e5m2_t,
-                            ElementC,
-                            ElementAccumulator> {};
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <typename ElementC,
-          typename ElementAccumulator>
-struct DefaultGemmConfiguration<arch::OpClassTensorOp, arch::Sm90, double,
-                                double, ElementC, ElementAccumulator> {
-
-  static int const kAlignmentA = 1;
-  static int const kAlignmentB = 1;
-  
-  using ThreadblockShape = GemmShape<128, 256, 64>;
-  using WarpShape = GemmShape<64, 64, 64>;
-  using InstructionShape = GemmShape<16, 8, 4>;
-  static int const kStages = 3;
-
-  using EpilogueOutputOp = epilogue::thread::LinearCombination<
-      ElementC, 1, ElementAccumulator,
-      ElementAccumulator>;
-
-  using Operator = arch::OpMultiplyAdd;
-};
-
-template <>
-struct DefaultGemmConfiguration<
-    arch::OpClassTensorOp, 
-    arch::Sm90, 
-    complex<double>,
-    complex<double>, 
-    complex<double>,
-    complex<double>
-  > {
-
-  static int const kAlignmentA = 1;
-  static int const kAlignmentB = 1;
-  
-  using ThreadblockShape = GemmShape<64, 64, 16>;
-  using WarpShape = GemmShape<32, 32, 16>;
-  using InstructionShape = GemmShape<16, 8, 4>;
-  static int const kStages = 3;
-
-  using EpilogueOutputOp = epilogue::thread::LinearCombination<
-      complex<double>, 1, complex<double>,
-      complex<double>>;
-
-  using Operator = arch::OpMultiplyAddComplex;
-};
-
-} // namespace device
-} // namespace gemm
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/ell_gemm.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/ell_gemm.h
deleted file mode 100755
index 54ddab400..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/device/ell_gemm.h
+++ /dev/null
@@ -1,849 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a Block-Ell sparse gemm kernel.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/device_kernel.h"
-
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-#include "cutlass/gemm/kernel/ell_gemm.h"
-
-#include "cutlass/gemm/kernel/default_ell_gemm.h"
-#include "cutlass/gemm/device/default_gemm_configuration.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace device {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/*! Blocked-Ell sparse gemm device-level operator. This is an interface to efficient CUTLASS
-  Blocked-Ell kernels that may be invoked from host code.
-
-  The contributions of this class are:
-    
-    1. At compile time, it maps data types and high-level structural parameters onto 
-       specific CUTLASS components.
-
-    2. At runtime, it maps logical arguments to Blocked-Ell problems to kernel parameters.
-
-    3. At runtime, it launches kernels on the device.
-
-  Example of a CUTLASS EllGemm operator is as follows:
-
-    //
-    // Instantiate the CUTLASS EllGemm operator.
-    //
-
-    cutlass::gemm::device::EllGemm<
-      cutlass::half_t,
-      cutlass::layout::RowMajor,
-      cutlass::half_t,
-      cutlass::layout::ColumnMajor,
-      cutlass::half_t,
-      cutlass::layout::ColumnMajor,
-      float, 
-      cutlass::arch::OpClassTensorOp, 
-      cutlass::arch::Sm80,
-      cutlass::gemm::GemmShape<128, 128, 32>,
-      cutlass::gemm::GemmShape<64, 64, 32>, 
-      cutlass::gemm::GemmShape<16, 8, 16>,
-      cutlass::epilogue::thread::LinearCombination<
-          cutlass::half_t, 128 / cutlass::sizeof_bits<cutlass::half_t>::value,
-          float, float>,
-      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>, 
-      4, // Stages
-      128 / cutlass::sizeof_bits<cutlass::half_t>::value, // Alignment A
-      128 / cutlass::sizeof_bits<cutlass::half_t>::value  // Alignment B
-    > ellgemm_op;
-
-    //
-    // Launch the EllGemm operation on the device
-    //
-
-    Description of parameters and tensors used to represent the Blocked-Ellpack (ELL) format:
-      a_rows              - Rows in the sparse matrix.
-      a_cols              - Colums in the sparse matrix.
-      BlockedEllA         - Packed matrix (ellValue matrix) that stores non-zero values in 
-                            consecutive blocks, whose size is (a_rows * a_ell_num_columns)
-      ell_idx             - Blocked-ELL Column indices (ellColInd) matrix, whose size is
-                            (a_rows / a_ell_blocksize) * (a_ell_num_columns / a_ell_blocksize)
-      a_ell_blocksize     - Size of the ELL-Blocks.
-      a_ell_num_columns   - Number of columns in the Blocked-Ellpack format (ellValue columns)
-      B                   - Input dense matrix whose size is (a_cols * n)
-      C/D                 - Output dense matrix whose size is (a_rows * n)
-
-    cutlass::Status status = ellgemm_op({
-      {a_rows, n, a_cols},  // GemmCoord problem_size
-      {BlockedEllA, lda},   // TensorRef<cutlass::half_t, layout::RowMajor> ref_BlockedEllA
-      {B, ldb},             // TensorRef<cutlass::half_t, layout::ColumnMajor> ref_B,
-      {C, ldc},             // TensorRef<float, layout::ColumnMajor> ref_C,
-      {D, ldd},             // TensorRef<float, layout::ColumnMajor> ref_D,
-      ell_idx,              // Blocked-ELL Column indices or ellColInd matrix (const int*)
-      a_ell_num_columns,    // Columns in the Blocked-Ellpack (ellValue) matrix (int)
-      a_ell_blocksize,      // Size of the ELL-Blocks (int)
-      a_ell_base,           // Base index of ellColInd (int) - Zero or One
-      {alpha, beta}         // EpilogueOutputOp::Params epilogue_op_params
-    });
-
-  A simplified view of the template is listed below.
-
-    template <
-      /// Element type for A matrix operand
-      typename ElementA,
-      
-      /// Layout type for A matrix operand
-      typename LayoutA,
-      
-      /// Element type for B matrix operand
-      typename ElementB,
-      
-      /// Layout type for B matrix operand
-      typename LayoutB,
-      
-      /// Element type for C and D matrix operands
-      typename ElementC,
-      
-      /// Layout type for C and D matrix operands
-      typename LayoutC,
-      
-      /// Element type for internal accumulation
-      typename ElementAccumulator,
-
-      /// Operator class tag
-      typename OperatorClass,
-      
-      /// Tag indicating architecture to tune for.  This is the minimum SM that
-      /// supports the intended feature. The device kernel can be built
-      /// targeting any SM larger than this number.
-      typename ArchTag,
-      
-      /// Threadblock-level tile size (concept: GemmShape)
-      typename ThreadblockShape,
-      
-      /// Warp-level tile size (concept: GemmShape)
-      typename WarpShape,
-      
-      /// Warp-level tile size (concept: GemmShape)
-      typename InstructionShape,
-      
-      /// Epilogue output operator
-      typename EpilogueOutputOp,
-      
-      /// Threadblock-level swizzling operator
-      typename ThreadblockSwizzle,
-      
-      /// Number of stages used in the pipelined mainloop
-      int Stages
-
-      /// Access granularity of A matrix in units of elements
-      int AlignmentA,
-
-      /// Access granularity of B matrix in units of elements
-      int AlignmentB,
-
-      /// Supports split-K with serial reduction
-      bool SplitKSerial,
-
-      /// Operation performed by GEMM
-      typename Operator,
-
-      /// Sparse matrix is A or not
-      bool IsASparse
-    >
-    class EllGemm;
-*/
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_ = ElementC_,
-    /// Operator class tag
-    typename OperatorClass_ = arch::OpClassTensorOp,
-    /// Tag indicating architecture to tune for
-    typename ArchTag_ = arch::Sm80,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle_ =
-        typename threadblock::GemmIdentityThreadblockSwizzle<>,
-    /// Number of stages used in the pipelined mainloop
-    int Stages =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kStages,
-    /// Access granularity of A matrix in units of elements
-    int AlignmentA =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentA,
-    /// Access granularity of B matrix in units of elements
-    int AlignmentB =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentB,
-    /// If true, kernel supports split-K with serial reduction
-    bool SplitKSerial = false,
-    /// Operation performed by GEMM
-    typename Operator_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::Operator,
-    /// Sparse matrix is A or not
-    bool IsASparse = true
-    >
-class EllGemm {
- public:
-
-  using ElementA = ElementA_;
-  using LayoutA = LayoutA_;
-  using TensorRefA = TensorRef<ElementA const, LayoutA>;
-  using ElementB = ElementB_;
-  using LayoutB = LayoutB_;
-  using TensorRefB = TensorRef<ElementB const, LayoutB>;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using TensorRefC = TensorRef<ElementC const, LayoutC>;
-  using TensorRefD = TensorRef<ElementC, LayoutC>;
-  using ElementAccumulator = ElementAccumulator_;
-  using OperatorClass = OperatorClass_;
-  using ArchTag = ArchTag_;
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using EpilogueOutputOp = EpilogueOutputOp_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  using Operator = Operator_;
-  static int const kStages = Stages;
-  static int const kAlignmentA = AlignmentA;
-  static int const kAlignmentB = AlignmentB;
-  static int const kAlignmentC = EpilogueOutputOp::kCount;
-  static bool const kSplitKSerial = SplitKSerial;
-  static ComplexTransform const kTransformA = ComplexTransform::kNone;
-  static ComplexTransform const kTransformB = ComplexTransform::kNone;
-  static bool const kIsASparse = IsASparse;
-
-  /// Define the kernel
-  using GemmKernel = typename kernel::DefaultEllGemm<
-    ElementA,
-    LayoutA,
-    kAlignmentA,
-    ElementB,
-    LayoutB,
-    kAlignmentB,
-    ElementC,
-    LayoutC,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    kStages,
-    kSplitKSerial,
-    Operator,
-    kIsASparse
-  >::GemmKernel;
-
-  /// Argument structure
-  struct Arguments {
-
-    //
-    // Data members
-    //
-
-    GemmCoord problem_size;
-    TensorRef<ElementA const, LayoutA> ref_A;
-    TensorRef<ElementB const, LayoutB> ref_B;
-    TensorRef<ElementC const, LayoutC> ref_C;
-    TensorRef<ElementC, LayoutC> ref_D;
-    const int* ell_idx;
-    int ell_ncol;
-    int ell_blocksize;
-    int ell_base_idx;
-    typename EpilogueOutputOp::Params epilogue;
-    int split_k_slices;
-
-    //
-    // Methods
-    //
-
-    /// Default ctor
-    CUTLASS_HOST_DEVICE
-    Arguments(): problem_size(0, 0, 0), split_k_slices(1) {
-
-    }
-
-    /// Constructs an Arguments structure 
-    CUTLASS_HOST_DEVICE
-    Arguments(
-      GemmCoord problem_size_,
-      TensorRef<ElementA const, LayoutA> ref_A_,
-      TensorRef<ElementB const, LayoutB> ref_B_,
-      TensorRef<ElementC const, LayoutC> ref_C_,
-      TensorRef<ElementC, LayoutC> ref_D_,
-      const int* ell_idx_,
-      int ell_ncol_,
-      int ell_blocksize_,
-      int ell_base_idx_,
-      typename EpilogueOutputOp::Params epilogue_ = 
-        typename EpilogueOutputOp::Params(),
-      int split_k_slices = 1
-    ):
-      problem_size(problem_size_),
-      ref_A(ref_A_),
-      ref_B(ref_B_),
-      ref_C(ref_C_),
-      ref_D(ref_D_),
-      ell_idx(ell_idx_),
-      ell_ncol(ell_ncol_),
-      ell_blocksize(ell_blocksize_),
-      ell_base_idx(ell_base_idx_),
-      epilogue(epilogue_),
-      split_k_slices(split_k_slices) {
-
-    }
-  };
-
-private:
-
-  /// Kernel parameters object
-  typename GemmKernel::Params params_{};
-
-public:
-
-  /// Constructs the GEMM.
-  EllGemm() { }
-
-  /// Determines whether the GEMM can execute the given problem.
-  static Status can_implement(Arguments const &args) {
-
-    if (!kSplitKSerial && args.split_k_slices > 1) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    Status status = GemmKernel::can_implement(
-      args.problem_size,
-      args.ref_A.non_const_ref(),
-      args.ref_B.non_const_ref(),
-      args.ref_C.non_const_ref(),
-      args.ref_D
-    );
-
-    if (status != Status::kSuccess) {
-      return status;
-    }
-
-    return Status::kSuccess;
-  }
-
-  /// Gets the workspace size
-  static size_t get_workspace_size(Arguments const &args) {
-    
-    size_t bytes = 0;
-
-    // Determine grid shape
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord tiled_shape = threadblock_swizzle.get_tiled_shape(
-                                              args.problem_size, 
-                                              {args.ell_blocksize,
-                                              ThreadblockShape::kN, ThreadblockShape::kK},
-                                              args.split_k_slices);
-      
-    tiled_shape.m() *= (args.ell_blocksize + ThreadblockShape::kM - 1 ) / ThreadblockShape::kM;
-    
-    if (kSplitKSerial && args.split_k_slices > 1) {
-
-      bytes += sizeof(int) * size_t(tiled_shape.m()) * size_t(tiled_shape.n());
-    }
-
-    return bytes;
-  }
-
-  Status set(Arguments const &args, cutlass::gemm::GemmCoord const &grid_shape, void *workspace){
-    // Initialize the Params structure
-    params_ = typename GemmKernel::Params{
-      args.problem_size,
-      grid_shape,
-      args.ref_A.non_const_ref(),
-      args.ref_B.non_const_ref(),
-      args.ref_C.non_const_ref(),
-      args.ref_D,
-      args.ell_idx,
-      args.ell_ncol,
-      args.ell_blocksize,
-      args.ell_base_idx,
-      args.epilogue,
-      static_cast<int *>(workspace)
-    };
-    return Status::kSuccess;
-  }
-
-  /// Initializes GEMM state from arguments.
-  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
-
-    // Determine grid shape
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord grid_shape = threadblock_swizzle.get_tiled_shape(
-      args.problem_size, 
-      {args.ell_blocksize, ThreadblockShape::kN, ThreadblockShape::kK},
-      args.split_k_slices);
-
-    grid_shape.m() *= (args.ell_blocksize + ThreadblockShape::kM - 1 ) / ThreadblockShape::kM;
-
-    if (kSplitKSerial) {
-      if (args.split_k_slices > 1) {
-        if (!workspace) {
-          return Status::kErrorWorkspaceNull;
-        }
-
-        size_t bytes = get_workspace_size(args);
-      
-        cudaError_t result = cudaMemsetAsync(workspace, 0, bytes, stream);
-
-        if (result != cudaSuccess) {
-          return Status::kErrorInternal;
-        }
-      }
-    }
-    else {
-
-      if (args.split_k_slices > 1) {
-        return Status::kErrorInvalidProblem;
-      }
-    }
-
-    return set(args, grid_shape, workspace);
-  }
-
-  /// Lightweight update given a subset of arguments
-  Status update(Arguments const &args, void *workspace = nullptr) {
-    
-    if (kSplitKSerial && args.split_k_slices > 1) {  
-      if (!workspace) {
-        return Status::kErrorWorkspaceNull;
-      }
-    }
-
-    params_.ref_A.reset(args.ref_A.non_const_ref().data());
-    params_.ref_B.reset(args.ref_B.non_const_ref().data());
-    params_.ref_C.reset(args.ref_C.non_const_ref().data());
-    params_.ref_D.reset(args.ref_D.data());
-    params_.output_op = args.epilogue;
-    params_.semaphore = static_cast<int *>(workspace);
-
-    return Status::kSuccess;
-  }
-
-  /// Runs the kernel using initialized state.
-  Status run(cudaStream_t stream = nullptr) {
-
-    ThreadblockSwizzle threadblock_swizzle;
-
-    dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
-    dim3 block(GemmKernel::kThreadCount, 1, 1);
-
-    cudaError_t result;
-
-    int smem_size = int(sizeof(typename GemmKernel::SharedStorage));
-
-    if (smem_size >= (48 << 10)) {
-      result = cudaFuncSetAttribute(Kernel<GemmKernel>,
-                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
-                                    smem_size);
-
-      if (result != cudaSuccess) {
-        return Status::kErrorInternal;
-      }
-    }
-
-    cutlass::arch::synclog_setup();
-    cutlass::Kernel<GemmKernel><<<grid, block, smem_size, stream>>>(params_);
-
-    result = cudaGetLastError();
-
-    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(cudaStream_t stream = nullptr) {
-    return run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(
-    Arguments const &args, 
-    void *workspace = nullptr, 
-    cudaStream_t stream = nullptr) {
-    
-    Status status = initialize(args, workspace);
-    
-    if (status == Status::kSuccess) {
-      status = run(stream);
-    }
-
-    return status;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for column-major output exchanges problem size and operand.
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_,
-    /// Operator class tag
-    typename OperatorClass_,
-    /// Tag indicating architecture to tune for
-    typename ArchTag_,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_,
-    /// Epilogue output operator
-    typename EpilogueOutputOp_,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle_,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Access granularity of A matrix in units of elements
-    int AlignmentA,
-    /// Access granularity of B matrix in units of elements
-    int AlignmentB,
-    /// If true, kernel supports split-K as a serial reduction
-    bool SplitKSerial,
-    /// Operation performed by GEMM
-    typename Operator_,
-    /// Sparse matrix is A or not
-    bool IsASparse>
-class EllGemm<ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_,
-           layout::ColumnMajor,  // partially specialized on LayoutC
-           ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_,
-           WarpShape_, InstructionShape_, EpilogueOutputOp_,
-           ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB,
-           SplitKSerial, Operator_, IsASparse> {
- public:
-
-  using ElementA = ElementA_;
-  using LayoutA = LayoutA_;
-  using TensorRefA = TensorRef<ElementA const, LayoutA>;
-  using ElementB = ElementB_;
-  using LayoutB = LayoutB_;
-  using TensorRefB = TensorRef<ElementB const, LayoutB>;
-  using ElementC = ElementC_;
-  using LayoutC = layout::ColumnMajor;
-  using TensorRefC = TensorRef<ElementC const, LayoutC>;
-  using TensorRefD = TensorRef<ElementC, LayoutC>;
-  using ElementAccumulator = ElementAccumulator_;
-  using OperatorClass = OperatorClass_;
-  using ArchTag = ArchTag_;
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using EpilogueOutputOp = EpilogueOutputOp_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  using Operator = Operator_;
-  static int const kStages = Stages;
-  static int const kAlignmentA = AlignmentA;
-  static int const kAlignmentB = AlignmentB;
-  static ComplexTransform const kTransformA = ComplexTransform::kNone;
-  static ComplexTransform const kTransformB = ComplexTransform::kNone;
-  static bool const kSplitKSerial = SplitKSerial;
-  static bool const kIsASparse = false;
-
-  using UnderlyingOperator = EllGemm< 
-    ElementB,
-    typename layout::LayoutTranspose<LayoutB>::type,
-    ElementA,
-    typename layout::LayoutTranspose<LayoutA>::type,
-    ElementC,
-    layout::RowMajor,    
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    kAlignmentB,
-    kAlignmentA,
-    SplitKSerial,
-    Operator,
-    kIsASparse
-  >;
-
-  using UnderlyingArguments = typename UnderlyingOperator::Arguments;
-  using GemmKernel = typename UnderlyingOperator::GemmKernel;
-  static int const kAlignmentC = UnderlyingOperator::kAlignmentC;
-
-  /// Argument structure
-  struct Arguments {
-
-    //
-    // Data members
-    //
-
-    GemmCoord problem_size;
-    TensorRef<ElementA const, LayoutA> ref_A;
-    TensorRef<ElementB const, LayoutB> ref_B;
-    TensorRef<ElementC const, LayoutC> ref_C;
-    TensorRef<ElementC, LayoutC> ref_D;
-    const int* ell_idx;
-    int ell_ncol;
-    int ell_blocksize;
-    int ell_base_idx;
-    typename EpilogueOutputOp::Params epilogue;
-    int split_k_slices;
-
-    //
-    // Methods
-    //
-
-    /// Default ctor
-    CUTLASS_HOST_DEVICE
-    Arguments() { }
-
-    /// Constructs an Arguments structure 
-    CUTLASS_HOST_DEVICE
-    Arguments(
-      GemmCoord problem_size_,
-      TensorRef<ElementA const, LayoutA> ref_A_,
-      TensorRef<ElementB const, LayoutB> ref_B_,
-      TensorRef<ElementC const, LayoutC> ref_C_,
-      TensorRef<ElementC, LayoutC> ref_D_,
-      const int* ell_idx_,
-      int ell_ncol_,
-      int ell_blocksize_,
-      int ell_base_idx_,
-      typename EpilogueOutputOp::Params epilogue_ = 
-        typename EpilogueOutputOp::Params(),
-      int split_k_slices = 1
-    ):
-      problem_size(problem_size_),
-      ref_A(ref_A_),
-      ref_B(ref_B_),
-      ref_C(ref_C_),
-      ref_D(ref_D_),
-      ell_idx(ell_idx_),
-      ell_ncol(ell_ncol_),
-      ell_blocksize(ell_blocksize_),
-      ell_base_idx(ell_base_idx_),
-      epilogue(epilogue_),
-      split_k_slices(split_k_slices) { }
-  };
-
-private:
-
-  UnderlyingOperator underlying_operator_;
-
-public:
-
-  /// Constructs the GEMM.
-  EllGemm() { }
-
-  /// Helper to construct a transposed equivalent for the underying GEMM operator
-  static UnderlyingArguments to_underlying_arguments(Arguments const &args) {
-    return UnderlyingArguments(
-      {args.problem_size.n(), args.problem_size.m(), args.problem_size.k()},
-      {args.ref_B.data(), args.ref_B.stride(0)},
-      {args.ref_A.data(), args.ref_A.stride(0)},
-      {args.ref_C.data(), args.ref_C.stride(0)},
-      {args.ref_D.data(), args.ref_D.stride(0)},
-      args.ell_idx,
-      args.ell_ncol,
-      args.ell_blocksize,
-      args.ell_base_idx,
-      args.epilogue,
-      args.split_k_slices
-    );
-  }
-
-  /// Determines whether the GEMM can execute the given problem.
-  static Status can_implement(Arguments const &args) {
-
-    return UnderlyingOperator::can_implement(to_underlying_arguments(args));
-  }
-
-  /// Gets the workspace size
-  static size_t get_workspace_size(Arguments const &args) {
-    
-    size_t bytes = 0;
-
-    // Determine grid shape
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord tiled_shape = threadblock_swizzle.get_tiled_shape(
-      args.problem_size, 
-      {ThreadblockShape::kM, args.ell_blocksize, ThreadblockShape::kK},
-      args.split_k_slices);
-    
-    tiled_shape.n() *= (args.ell_blocksize + ThreadblockShape::kN - 1 ) / ThreadblockShape::kN;
-
-    if (kSplitKSerial && args.split_k_slices > 1) {
-
-      bytes += sizeof(int) * size_t(tiled_shape.m()) * size_t(tiled_shape.n());
-    }
-
-    return bytes;
-  }
-
-  Status set(Arguments const &args, cutlass::gemm::GemmCoord const &grid_shape, void *workspace){
-    // Initialize the Params structure
-    return underlying_operator_.set(to_underlying_arguments(args), grid_shape, workspace);
-  }
-
-  /// Initializes GEMM state from arguments.
-  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
-
-    // Determine grid shape
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord grid_shape = threadblock_swizzle.get_tiled_shape(
-      {args.problem_size.n(), args.problem_size.m(), args.problem_size.k()}, 
-      {ThreadblockShape::kM, args.ell_blocksize, ThreadblockShape::kK},
-      args.split_k_slices);
-    
-    grid_shape.n() *= (args.ell_blocksize + ThreadblockShape::kN - 1 ) / ThreadblockShape::kN;
-
-    if (kSplitKSerial) {
-      if (args.split_k_slices > 1) {
-        if (!workspace) {
-          return Status::kErrorWorkspaceNull;
-        }
-
-        size_t bytes = get_workspace_size(args);
-      
-        cudaError_t result = cudaMemsetAsync(workspace, 0, bytes, stream);
-
-        if (result != cudaSuccess) {
-          return Status::kErrorInternal;
-        }
-      }
-    }
-    else {
-
-      if (args.split_k_slices > 1) {
-        return Status::kErrorInvalidProblem;
-      }
-    }
-
-    // Initialize the Params structure
-    set(args, grid_shape, workspace);
-
-    return Status::kSuccess;
-  }
-
-  /// Lightweight update given a subset of arguments
-  Status update(Arguments const &args, void *workspace = nullptr) {
-
-    return underlying_operator_.update(to_underlying_arguments(args), workspace);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status run(cudaStream_t stream = nullptr) {
-
-    return underlying_operator_.run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(cudaStream_t stream = nullptr) {
-    return run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(
-    Arguments const &args, 
-    void *workspace = nullptr, 
-    cudaStream_t stream = nullptr) {
-    
-    Status status = initialize(args, workspace, stream);
-    
-    if (status == Status::kSuccess) {
-      status = run(stream);
-    }
-
-    return status;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace device
-} // namespace gemm
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm.h
deleted file mode 100755
index c6f488b14..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm.h
+++ /dev/null
@@ -1,772 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/device_kernel.h"
-
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-#include "cutlass/gemm/kernel/gemm.h"
-
-#include "cutlass/gemm/kernel/default_gemm.h"
-#include "cutlass/gemm/device/default_gemm_configuration.h"
-
-#include "cutlass/layout/permute.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace device {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/*! Gemm device-level operator. This is an interface to efficient CUTLASS GEMM kernels that may
-  be invoked from host code.
-
-  The contributions of this class are:
-    
-    1. At compile time, it maps data types and high-level structural parameters onto 
-       specific CUTLASS components.
-
-    2. At runtime, it maps logical arguments to GEMM problems to kernel parameters.
-
-    3. At runtime, it launches kernels on the device.
-
-  The intent is to provide a convenient mechanism for interacting with most plausible GEMM
-  configurations for each supported architecture. Consequently, not all parameters are exposed
-  to the top-level interface. Rather, sensible defaults at each level of the CUTLASS hierarchy
-  are selected to tradeoff simplicity of the interface with flexibility. We expect 
-  most configurations to be specified at this level. Applications with more exotic requirements 
-  may construct their kernels of interest using CUTLASS components at the threadblock, warp, 
-  and thread levels of abstraction.
-
-  CUTLASS exposes computations using the functor design pattern in which objects compose some
-  internal state with an overloaded function call operator. This enables decoupling of
-  initialization from execution, possibly reducing overhead during steady state phases of
-  application execution.
-
-  CUTLASS device-level operators expose an Arguments structure encompassing each logical
-  input to the computation. This is distinct from the kernel-level Params structure pattern
-  which contains application-specific precomputed state needed by the device code.
-
-  Example of a CUTLASS GEMM operator implementing the functionality of cuBLAS's SGEMM NN
-  is as follows:
-
-    //
-    // Instantiate the CUTLASS GEMM operator.
-    //
-
-    cutlass::gemm::device::Gemm<
-      float,
-      cutlass::layout::ColumnMajor,
-      float,
-      cutlass::layout::ColumnMajor,
-      float,
-      cutlass::layout::ColumnMajor
-    > gemm_op;
-
-    //
-    // Launch the GEMM operation on the device
-    //
-
-    cutlass::Status status = gemm_op({
-      {m, n, k},                          // GemmCoord problem_size,
-      {A, lda},                           // TensorRef<float, layout::ColumnMajor> ref_A,
-      {B, ldb},                           // TensorRef<float, layout::ColumnMajor> ref_B,
-      {C, ldc},                           // TensorRef<float, layout::ColumnMajor> ref_C,
-      {D, ldd},                           // TensorRef<float, layout::ColumnMajor> ref_D,
-      {alpha, beta}                       // EpilogueOutputOp::Params epilogue_op_params
-    });
-
-
-  A simplified view of the template is listed below.
-
-    template <
-      /// Element type for A matrix operand
-      typename ElementA,
-      
-      /// Layout type for A matrix operand
-      typename LayoutA,
-      
-      /// Element type for B matrix operand
-      typename ElementB,
-      
-      /// Layout type for B matrix operand
-      typename LayoutB,
-      
-      /// Element type for C and D matrix operands
-      typename ElementC,
-      
-      /// Layout type for C and D matrix operands
-      typename LayoutC,
-      
-      /// Element type for internal accumulation
-      typename ElementAccumulator,
-
-      /// Operator class tag
-      typename OperatorClass,
-      
-      /// Tag indicating architecture to tune for.  This is the minimum SM that
-      /// supports the intended feature. The device kernel can be built
-      /// targeting any SM larger than this number.
-      typename ArchTag,
-      
-      /// Threadblock-level tile size (concept: GemmShape)
-      typename ThreadblockShape,
-      
-      /// Warp-level tile size (concept: GemmShape)
-      typename WarpShape,
-      
-      /// Warp-level tile size (concept: GemmShape)
-      typename InstructionShape,
-      
-      /// Epilogue output operator
-      typename EpilogueOutputOp,
-      
-      /// Threadblock-level swizzling operator
-      typename ThreadblockSwizzle,
-      
-      /// Number of stages used in the pipelined mainloop
-      int Stages
-    >
-    class Gemm;
-*/
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_ = ElementC_,
-    /// Operator class tag
-    typename OperatorClass_ = arch::OpClassSimt,
-    /// Tag indicating architecture to tune for
-    typename ArchTag_ = arch::Sm70,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle_ =
-        typename threadblock::GemmIdentityThreadblockSwizzle<>,
-    /// Number of stages used in the pipelined mainloop
-    int Stages =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kStages,
-    /// Access granularity of A matrix in units of elements
-    int AlignmentA =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentA,
-    /// Access granularity of B matrix in units of elements
-    int AlignmentB =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentB,
-    /// If true, kernel supports split-K with serial reduction
-    bool SplitKSerial = false,
-    /// Operation performed by GEMM
-    typename Operator_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::Operator,
-    /// Gather operand A by using an index array
-    bool GatherA = false,
-    /// Gather operand B by using an index array
-    bool GatherB = false,
-    /// Scatter result D by using an index array
-    bool ScatterD = false,
-    /// Permute result D
-    typename PermuteDLayout = layout::NoPermute>
-class Gemm {
- public:
-
-  using ElementA = ElementA_;
-  using LayoutA = LayoutA_;
-  using TensorRefA = TensorRef<ElementA const, LayoutA>;
-  using ElementB = ElementB_;
-  using LayoutB = LayoutB_;
-  using TensorRefB = TensorRef<ElementB const, LayoutB>;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using TensorRefC = TensorRef<ElementC const, LayoutC>;
-  using TensorRefD = TensorRef<ElementC, LayoutC>;
-  using ElementAccumulator = ElementAccumulator_;
-  using OperatorClass = OperatorClass_;
-  using ArchTag = ArchTag_;
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using EpilogueOutputOp = EpilogueOutputOp_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  using Operator = Operator_;
-  static int const kStages = Stages;
-  static int const kAlignmentA = AlignmentA;
-  static int const kAlignmentB = AlignmentB;
-  static int const kAlignmentC = EpilogueOutputOp::kCount;
-  static bool const kSplitKSerial = SplitKSerial;
-  static ComplexTransform const kTransformA = ComplexTransform::kNone;
-  static ComplexTransform const kTransformB = ComplexTransform::kNone;
-
-  /// Define the kernel
-  using GemmKernel = typename kernel::DefaultGemm<
-    ElementA,
-    LayoutA,
-    kAlignmentA,
-    ElementB,
-    LayoutB,
-    kAlignmentB,
-    ElementC,
-    LayoutC,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    kStages,
-    kSplitKSerial,
-    Operator,
-    SharedMemoryClearOption::kNone,
-    GatherA,
-    GatherB,
-    ScatterD,
-    PermuteDLayout
-  >::GemmKernel;
-
-  /// Argument structure
-  struct Arguments {
-
-    //
-    // Data members
-    //
-
-    GemmCoord problem_size;
-    TensorRef<ElementA const, LayoutA> ref_A;
-    TensorRef<ElementB const, LayoutB> ref_B;
-    TensorRef<ElementC const, LayoutC> ref_C;
-    TensorRef<ElementC, LayoutC> ref_D;
-    typename EpilogueOutputOp::Params epilogue;
-    int split_k_slices;
-    // For gather+scatter operations
-    int const *gather_A_indices;
-    int const *gather_B_indices;
-    int const *scatter_D_indices;
-
-    //
-    // Methods
-    //
-
-    /// Default ctor
-    CUTLASS_HOST_DEVICE
-    Arguments(): problem_size(0, 0, 0), split_k_slices(1) {
-
-    }
-
-    /// Constructs an Arguments structure 
-    CUTLASS_HOST_DEVICE
-    Arguments(
-      GemmCoord problem_size_,
-      TensorRef<ElementA const, LayoutA> ref_A_,
-      TensorRef<ElementB const, LayoutB> ref_B_,
-      TensorRef<ElementC const, LayoutC> ref_C_,
-      TensorRef<ElementC, LayoutC> ref_D_,
-      typename EpilogueOutputOp::Params epilogue_ = 
-        typename EpilogueOutputOp::Params(),
-      int split_k_slices = 1,
-      int const *gather_A_indices_ = nullptr,
-      int const *gather_B_indices_ = nullptr,
-      int const *scatter_D_indices_ = nullptr
-    ):
-      problem_size(problem_size_),
-      ref_A(ref_A_),
-      ref_B(ref_B_),
-      ref_C(ref_C_),
-      ref_D(ref_D_),
-      epilogue(epilogue_),
-      split_k_slices(split_k_slices),
-      gather_A_indices(gather_A_indices_),
-      gather_B_indices(gather_B_indices_),
-      scatter_D_indices(scatter_D_indices_) {
-
-    }
-  };
-
-private:
-
-  /// Kernel parameters object
-  typename GemmKernel::Params params_;
-
-public:
-
-  /// Constructs the GEMM.
-  Gemm() { }
-
-  /// Determines whether the GEMM can execute the given problem.
-  static Status can_implement(Arguments const &args) {
-
-    if (!kSplitKSerial && args.split_k_slices > 1) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    Status status = GemmKernel::can_implement(
-      args.problem_size,
-      args.ref_A.non_const_ref(),
-      args.ref_B.non_const_ref(),
-      args.ref_C.non_const_ref(),
-      args.ref_D
-    );
-
-    if (status != Status::kSuccess) {
-      return status;
-    }
-
-    return Status::kSuccess;
-  }
-
-  /// Gets the workspace size
-  static size_t get_workspace_size(Arguments const &args) {
-    
-    size_t bytes = 0;
-
-    // Determine grid shape
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord tiled_shape = threadblock_swizzle.get_tiled_shape(
-      args.problem_size, 
-      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-      args.split_k_slices);
-    
-    if (kSplitKSerial && args.split_k_slices > 1) {
-
-      bytes += sizeof(int) * size_t(tiled_shape.m()) * size_t(tiled_shape.n());
-    }
-
-    return bytes;
-  }
-
-  /// Initializes GEMM state from arguments.
-  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
-
-    // Determine grid shape
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord grid_shape = threadblock_swizzle.get_tiled_shape(
-      args.problem_size, 
-      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-      args.split_k_slices);
-
-    if (kSplitKSerial) {
-      if (args.split_k_slices > 1) {
-        if (!workspace) {
-          return Status::kErrorWorkspaceNull;
-        }
-
-        size_t bytes = get_workspace_size(args);
-      
-        cudaError_t result = cudaMemsetAsync(workspace, 0, bytes, stream);
-
-        if (result != cudaSuccess) {
-          return Status::kErrorInternal;
-        }
-      }
-    }
-    else {
-
-      if (args.split_k_slices > 1) {
-        return Status::kErrorInvalidProblem;
-      }
-    }
-
-    // Initialize the Params structure
-    params_ = typename GemmKernel::Params{
-      args.problem_size,
-      grid_shape,
-      args.ref_A.non_const_ref(),
-      args.ref_B.non_const_ref(),
-      args.ref_C.non_const_ref(),
-      args.ref_D,
-      args.epilogue,
-      static_cast<int *>(workspace),
-      args.gather_A_indices,
-      args.gather_B_indices,
-      args.scatter_D_indices
-    };
-
-    return Status::kSuccess;
-  }
-
-  /// Lightweight update given a subset of arguments
-  Status update(Arguments const &args, void *workspace = nullptr) {
-    
-    if (kSplitKSerial && args.split_k_slices > 1) {  
-      if (!workspace) {
-        return Status::kErrorWorkspaceNull;
-      }
-    }
-
-    params_.ref_A.reset(args.ref_A.non_const_ref().data());
-    params_.ref_B.reset(args.ref_B.non_const_ref().data());
-    params_.ref_C.reset(args.ref_C.non_const_ref().data());
-    params_.ref_D.reset(args.ref_D.data());
-    params_.output_op = args.epilogue;
-    params_.semaphore = static_cast<int *>(workspace);
-
-    return Status::kSuccess;
-  }
-
-  /// Runs the kernel using initialized state.
-  Status run(cudaStream_t stream = nullptr) {
-
-    ThreadblockSwizzle threadblock_swizzle;
-
-    dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
-    dim3 block(GemmKernel::kThreadCount, 1, 1);
-
-    cudaError_t result;
-
-    int smem_size = int(sizeof(typename GemmKernel::SharedStorage));
-
-    if (smem_size >= (48 << 10)) {
-      result = cudaFuncSetAttribute(Kernel<GemmKernel>,
-                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
-                                    smem_size);
-
-      if (result != cudaSuccess) {
-        return Status::kErrorInternal;
-      }
-    }
-
-    cutlass::arch::synclog_setup();
-    cutlass::Kernel<GemmKernel><<<grid, block, smem_size, stream>>>(params_);
-
-    result = cudaGetLastError();
-
-    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(cudaStream_t stream = nullptr) {
-    return run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(
-    Arguments const &args, 
-    void *workspace = nullptr, 
-    cudaStream_t stream = nullptr) {
-    
-    Status status = initialize(args, workspace, stream);
-    
-    if (status == Status::kSuccess) {
-      status = run(stream);
-    }
-
-    return status;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for column-major output exchanges problem size and operand.
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_,
-    /// Operator class tag
-    typename OperatorClass_,
-    /// Tag indicating architecture to tune for
-    typename ArchTag_,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_,
-    /// Epilogue output operator
-    typename EpilogueOutputOp_,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle_,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Access granularity of A matrix in units of elements
-    int AlignmentA,
-    /// Access granularity of B matrix in units of elements
-    int AlignmentB,
-    /// If true, kernel supports split-K as a serial reduction
-    bool SplitKSerial,
-    /// Operation performed by GEMM
-    typename Operator_,
-    /// Gather operand A by using an index array
-    bool GatherA,
-    /// Gather operand B by using an index array
-    bool GatherB,
-    /// Scatter result D by using an index array
-    bool ScatterD,
-    /// Permute result D
-    typename PermuteDLayout
->
-class Gemm<ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_,
-           layout::ColumnMajor,  // partially specialized on LayoutC
-           ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_,
-           WarpShape_, InstructionShape_, EpilogueOutputOp_,
-           ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial,
-           Operator_, GatherA, GatherB, ScatterD, PermuteDLayout> {
- public:
-
-  using ElementA = ElementA_;
-  using LayoutA = LayoutA_;
-  using TensorRefA = TensorRef<ElementA const, LayoutA>;
-  using ElementB = ElementB_;
-  using LayoutB = LayoutB_;
-  using TensorRefB = TensorRef<ElementB const, LayoutB>;
-  using ElementC = ElementC_;
-  using LayoutC = layout::ColumnMajor;
-  using TensorRefC = TensorRef<ElementC const, LayoutC>;
-  using TensorRefD = TensorRef<ElementC, LayoutC>;
-  using ElementAccumulator = ElementAccumulator_;
-  using OperatorClass = OperatorClass_;
-  using ArchTag = ArchTag_;
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using EpilogueOutputOp = EpilogueOutputOp_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  using Operator = Operator_;
-  static int const kStages = Stages;
-  static int const kAlignmentA = AlignmentA;
-  static int const kAlignmentB = AlignmentB;
-  static ComplexTransform const kTransformA = ComplexTransform::kNone;
-  static ComplexTransform const kTransformB = ComplexTransform::kNone;
-  static bool const kSplitKSerial = SplitKSerial;
-
-  using UnderlyingOperator = Gemm< 
-    ElementB,
-    typename layout::LayoutTranspose<LayoutB>::type,
-    ElementA,
-    typename layout::LayoutTranspose<LayoutA>::type,
-    ElementC,
-    layout::RowMajor,    
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    kAlignmentB,
-    kAlignmentA,
-    SplitKSerial,
-    Operator,
-    GatherB,
-    GatherA,
-    ScatterD,
-    PermuteDLayout
-  >;
-
-  using UnderlyingArguments = typename UnderlyingOperator::Arguments;
-  using GemmKernel = typename UnderlyingOperator::GemmKernel;
-  static int const kAlignmentC = UnderlyingOperator::kAlignmentC;
-
-  /// Argument structure
-  struct Arguments {
-
-    //
-    // Data members
-    //
-
-    GemmCoord problem_size;
-    TensorRef<ElementA const, LayoutA> ref_A;
-    TensorRef<ElementB const, LayoutB> ref_B;
-    TensorRef<ElementC const, LayoutC> ref_C;
-    TensorRef<ElementC, LayoutC> ref_D;
-    typename EpilogueOutputOp::Params epilogue;
-    int split_k_slices;
-    // For gather+scatter operations
-    int *gather_A_indices;
-    int *gather_B_indices;
-    int *scatter_D_indices;
-
-    //
-    // Methods
-    //
-
-    /// Default ctor
-    CUTLASS_HOST_DEVICE
-    Arguments() { }
-
-    /// Constructs an Arguments structure 
-    CUTLASS_HOST_DEVICE
-    Arguments(
-      GemmCoord problem_size_,
-      TensorRef<ElementA const, LayoutA> ref_A_,
-      TensorRef<ElementB const, LayoutB> ref_B_,
-      TensorRef<ElementC const, LayoutC> ref_C_,
-      TensorRef<ElementC, LayoutC> ref_D_,
-      typename EpilogueOutputOp::Params epilogue_ = 
-        typename EpilogueOutputOp::Params(),
-      int split_k_slices = 1,
-      int *gather_A_indices_ = nullptr,
-      int *gather_B_indices_ = nullptr,
-      int *scatter_D_indices_ = nullptr
-    ):
-      problem_size(problem_size_),
-      ref_A(ref_A_),
-      ref_B(ref_B_),
-      ref_C(ref_C_),
-      ref_D(ref_D_),
-      epilogue(epilogue_),
-      split_k_slices(split_k_slices),
-      gather_A_indices(gather_A_indices_),
-      gather_B_indices(gather_B_indices_),
-      scatter_D_indices(scatter_D_indices_) { }
-  };
-
-private:
-
-  UnderlyingOperator underlying_operator_;
-
-public:
-
-  /// Constructs the GEMM.
-  Gemm() { }
-
-  /// Helper to construct a transposed equivalent for the underying GEMM operator
-  static UnderlyingArguments to_underlying_arguments(Arguments const &args) {
-    return UnderlyingArguments(
-      {args.problem_size.n(), args.problem_size.m(), args.problem_size.k()},
-      {args.ref_B.data(), args.ref_B.stride(0)},
-      {args.ref_A.data(), args.ref_A.stride(0)},
-      {args.ref_C.data(), args.ref_C.stride(0)},
-      {args.ref_D.data(), args.ref_D.stride(0)},
-      args.epilogue,
-      args.split_k_slices,
-      args.gather_B_indices,
-      args.gather_A_indices,
-      args.scatter_D_indices
-    );
-  }
-
-  /// Determines whether the GEMM can execute the given problem.
-  static Status can_implement(Arguments const &args) {
-
-    return UnderlyingOperator::can_implement(to_underlying_arguments(args));
-  }
-
-  /// Gets the workspace size
-  static size_t get_workspace_size(Arguments const &args) {
-    
-    return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args));
-  }
-
-  /// Initializes GEMM state from arguments.
-  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
-
-    return underlying_operator_.initialize(to_underlying_arguments(args), workspace);
-  }
-
-  /// Lightweight update given a subset of arguments
-  Status update(Arguments const &args, void *workspace = nullptr) {
-
-    return underlying_operator_.update(to_underlying_arguments(args), workspace);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status run(cudaStream_t stream = nullptr) {
-
-    return underlying_operator_.run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(cudaStream_t stream = nullptr) {
-    return run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(
-    Arguments const &args, 
-    void *workspace = nullptr, 
-    cudaStream_t stream = nullptr) {
-    
-    Status status = initialize(args, workspace, stream);
-    
-    if (status == Status::kSuccess) {
-      status = run(stream);
-    }
-
-    return status;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace device
-} // namespace gemm
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_array.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_array.h
deleted file mode 100755
index 1ae2db467..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_array.h
+++ /dev/null
@@ -1,738 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/device_kernel.h"
-
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-#include "cutlass/gemm/kernel/gemm_array.h"
-
-#include "cutlass/gemm/kernel/default_gemm.h"
-#include "cutlass/gemm/device/default_gemm_configuration.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace device {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/*! Gemm device-level operator. This is an interface to efficient CUTLASS GEMM kernels that may
-  be invoked from host code.
-
-  The contributions of this class are:
-    
-    1. At compile time, it maps data types and high-level structural parameters onto 
-       specific CUTLASS components.
-
-    2. At runtime, it maps logical arguments to GEMM problems to kernel parameters.
-
-    3. At runtime, it launches kernels on the device.
-
-  The intent is to provide a convenient mechanism for interacting with most plausible GEMM
-  configurations for each supported architecture. Consequently, not all parameters are exposed
-  to the top-level interface. Rather, sensible defaults at each level of the CUTLASS hierarchy
-  are selected to tradeoff simplicity of the interface with flexibility. We expect 
-  most configurations to be specified at this level. Applications with more exotic requirements 
-  may construct their kernels of interest using CUTLASS components at the threadblock, warp, 
-  and thread levels of abstraction.
-
-  CUTLASS exposes computations using the functor design pattern in which objects compose some
-  internal state with an overloaded function call operator. This enables decoupling of
-  initialization from execution, possibly reducing overhead during steady state phases of
-  application execution.
-
-  CUTLASS device-level operators expose an Arguments structure encompassing each logical
-  input to the computation. This is distinct from the kernel-level Params structure pattern
-  which contains application-specific precomputed state needed by the device code.
-
-  Example of a CUTLASS GEMM operator implementing the functionality of cuBLAS's SGEMM NN
-  is as follows:
-
-    //
-    // Instantiate the CUTLASS GEMM operator.
-    //
-
-    cutlass::gemm::device::Gemm<
-      float,
-      cutlass::layout::ColumnMajor,
-      float,
-      cutlass::layout::ColumnMajor,
-      float,
-      cutlass::layout::ColumnMajor
-    > gemm_op;
-
-    //
-    // Launch the GEMM operation on the device
-    //
-
-    cutlass::Status status = gemm_op({
-      {m, n, k},                          // GemmCoord problem_size,
-      {A, lda},                           // TensorRef<float, layout::ColumnMajor> ref_A,
-      {B, ldb},                           // TensorRef<float, layout::ColumnMajor> ref_B,
-      {C, ldc},                           // TensorRef<float, layout::ColumnMajor> ref_C,
-      {D, ldd},                           // TensorRef<float, layout::ColumnMajor> ref_D,
-      {alpha, beta}                       // EpilogueOutputOp::Params epilogue_op_params
-    });
-
-
-  A simplified view of the template is listed below.
-
-    template <
-      /// Element type for A matrix operand
-      typename ElementA,
-      
-      /// Layout type for A matrix operand
-      typename LayoutA,
-      
-      /// Element type for B matrix operand
-      typename ElementB,
-      
-      /// Layout type for B matrix operand
-      typename LayoutB,
-      
-      /// Element type for C and D matrix operands
-      typename ElementC,
-      
-      /// Layout type for C and D matrix operands
-      typename LayoutC,
-      
-      /// Element type for internal accumulation
-      typename ElementAccumulator,
-
-      /// Operator class tag
-      typename OperatorClass,
-      
-      /// Tag indicating architecture to tune for.  This is the minimum SM that
-      /// supports the intended feature. The device kernel can be built
-      /// targeting any SM larger than this number.
-      typename ArchTag,
-      
-      /// Threadblock-level tile size (concept: GemmShape)
-      typename ThreadblockShape,
-      
-      /// Warp-level tile size (concept: GemmShape)
-      typename WarpShape,
-      
-      /// Warp-level tile size (concept: GemmShape)
-      typename InstructionShape,
-      
-      /// Epilogue output operator
-      typename EpilogueOutputOp,
-      
-      /// Threadblock-level swizzling operator
-      typename ThreadblockSwizzle,
-      
-      /// Number of stages used in the pipelined mainloop
-      int Stages
-    >
-    class Gemm;
-*/
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_ = ElementC_,
-    /// Operator class tag
-    typename OperatorClass_ = arch::OpClassSimt,
-    /// Tag indicating architecture to tune for
-    typename ArchTag_ = arch::Sm70,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle_ = threadblock::GemmBatchedIdentityThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kStages,
-    /// Access granularity of A matrix in units of elements
-    int AlignmentA =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentA,
-    /// Access granularity of B matrix in units of elements
-    int AlignmentB =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentB,
-    /// Operation performed by GEMM
-    typename Operator_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::Operator
->
-class GemmArray {
- public:
-
-  using ElementA = ElementA_;
-  using LayoutA = LayoutA_;
-  using TensorRefA = TensorRef<ElementA const, LayoutA>;
-  using ElementB = ElementB_;
-  using LayoutB = LayoutB_;
-  using TensorRefB = TensorRef<ElementB const, LayoutB>;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using TensorRefC = TensorRef<ElementC const, LayoutC>;
-  using TensorRefD = TensorRef<ElementC, LayoutC>;
-  using ElementAccumulator = ElementAccumulator_;
-  using OperatorClass = OperatorClass_;
-  using ArchTag = ArchTag_;
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using EpilogueOutputOp = EpilogueOutputOp_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  static int const kStages = Stages;
-  static int const kAlignmentA = AlignmentA;
-  static int const kAlignmentB = AlignmentB;
-  static int const kAlignmentC = EpilogueOutputOp::kCount;
-  using Operator = Operator_;
-
-  /// Define the kernel
-  using DefaultGemmKernel = typename kernel::DefaultGemm<
-    ElementA,
-    LayoutA,
-    kAlignmentA,
-    ElementB,
-    LayoutB,
-    kAlignmentB,
-    ElementC,
-    LayoutC,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    kStages,
-    false,
-    Operator
-  >::GemmKernel;
-
-  using GemmKernel = kernel::GemmArray<typename DefaultGemmKernel::Mma, typename DefaultGemmKernel::Epilogue, ThreadblockSwizzle>;
-
-  /// Argument structure
-  struct Arguments {
-
-    //
-    // Data members
-    //
-
-    GemmCoord problem_size;
-
-    ElementA const * const *ptr_A;
-    LayoutA layout_A;
-
-    ElementB const * const *ptr_B;
-    LayoutB layout_B;
-
-    ElementC const * const *ptr_C;
-    LayoutC layout_C;
-
-    ElementC * const * ptr_D;
-    LayoutC layout_D;
-    
-    typename EpilogueOutputOp::Params epilogue;
-    int batch_count;
-
-    //
-    // Methods
-    //
-
-    /// Default ctor
-    CUTLASS_HOST_DEVICE
-    Arguments() { }
-
-    /// Constructs an Arguments structure 
-    CUTLASS_HOST_DEVICE
-    Arguments(
-      GemmCoord problem_size_,
-      ElementA const * const *ptr_A_,
-      LayoutA layout_A_,
-      ElementB const * const *ptr_B_,
-      LayoutB layout_B_,
-      ElementC const * const *ptr_C_,
-      LayoutC layout_C_,
-      ElementC * const * ptr_D_,
-      LayoutC layout_D_,
-      typename EpilogueOutputOp::Params epilogue_,
-      int batch_count_
-    ):
-      problem_size(problem_size_),
-      ptr_A(ptr_A_),
-      layout_A(layout_A_),
-      ptr_B(ptr_B_),
-      layout_B(layout_B_),
-      ptr_C(ptr_C_),
-      layout_C(layout_C_),
-      ptr_D(ptr_D_),
-      layout_D(layout_D_),
-      epilogue(epilogue_),
-      batch_count(batch_count_) { }
-  };
-
-private:
-
-  /// Kernel parameters object
-  typename GemmKernel::Params params_;
-
-public:
-
-  /// Constructs the GEMM.
-  GemmArray() { }
-
-  /// Determines whether the GEMM can execute the given problem.
-  static Status can_implement(Arguments const &args) {
-
-    if (args.layout_A.stride(0) % kAlignmentA) {
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (args.layout_B.stride(0) % kAlignmentB) {
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (args.layout_C.stride(0) % kAlignmentC) {
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (args.layout_D.stride(0) % kAlignmentC) {
-      return Status::kErrorMisalignedOperand;
-    }
-
-    return Status::kSuccess;
-  }
-
-  /// Gets the workspace size
-  static size_t get_workspace_size(Arguments const &args) {
-    return 0;
-  }
-
-  /// Initializes GEMM state from arguments.
-  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
-
-    // Determine grid shape
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord grid_shape = threadblock_swizzle.get_tiled_shape(
-      args.problem_size,
-      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-      args.batch_count);
-
-    // Initialize the Params structure
-    params_ = typename GemmKernel::Params{
-      args.problem_size,
-      grid_shape,
-      args.ptr_A,
-      args.layout_A,
-      args.ptr_B,
-      args.layout_B,
-      args.ptr_C,
-      args.layout_C,
-      args.ptr_D,
-      args.layout_D,
-      args.epilogue,
-      args.batch_count
-    };
-
-    return Status::kSuccess;
-  }
-
-  /// Lightweight update given a subset of arguments
-  Status update(Arguments const &args, void *workspace = nullptr) {
-    // Determine grid shape
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord grid_shape = threadblock_swizzle.get_tiled_shape(
-      args.problem_size,
-      args.batch_count,
-      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK});
-
-    params_ = typename GemmKernel::Params{
-      args.problem_size,
-      grid_shape,
-      args.ptr_A,
-      args.layout_A,
-      args.ptr_B,
-      args.layout_B,
-      args.ptr_C,
-      args.layout_C,
-      args.ptr_D,
-      args.layout_D,
-      args.epilogue,
-      args.batch_count
-    };
-
-    return Status::kSuccess;
-  }
-
-  /// Runs the kernel using initialized state.
-  Status run(cudaStream_t stream = nullptr) {
-
-    ThreadblockSwizzle threadblock_swizzle;
-
-    dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
-    dim3 block(GemmKernel::kThreadCount, 1, 1);
-
-    cudaError_t result;
-
-    int smem_size = int(sizeof(typename GemmKernel::SharedStorage));
-    if (smem_size >= (48 << 10)) {
-      result = cudaFuncSetAttribute(Kernel<GemmKernel>,
-                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
-                                    smem_size);
-
-      if (result != cudaSuccess) {
-        return Status::kErrorInternal;
-      }
-    }
-
-    cutlass::arch::synclog_setup();
-    cutlass::Kernel<GemmKernel><<<grid, block, smem_size, stream>>>(params_);
-
-    result = cudaGetLastError();
-
-    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(cudaStream_t stream = nullptr) {
-    return run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(
-    Arguments const &args, 
-    void *workspace = nullptr, 
-    cudaStream_t stream = nullptr) {
-    
-    Status status = initialize(args, workspace);
-    
-    if (status == Status::kSuccess) {
-      status = run(stream);
-    }
-
-    return status;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for column-major output exchanges problem size and operand.
-template <
-  /// Element type for A matrix operand
-  typename ElementA_,
-  /// Layout type for A matrix operand
-  typename LayoutA_,
-  /// Element type for B matrix operand
-  typename ElementB_,
-  /// Layout type for B matrix operand
-  typename LayoutB_,
-  /// Element type for C and D matrix operands
-  typename ElementC_,
-  /// Element type for internal accumulation
-  typename ElementAccumulator_,
-  /// Operator class tag
-  typename OperatorClass_,
-  /// Tag indicating architecture to tune for
-  typename ArchTag_,
-  /// Threadblock-level tile size (concept: GemmShape)
-  typename ThreadblockShape_,
-  /// Warp-level tile size (concept: GemmShape)
-  typename WarpShape_,
-  /// Warp-level tile size (concept: GemmShape)
-  typename InstructionShape_,
-  /// Epilogue output operator
-  typename EpilogueOutputOp_,
-  /// Threadblock-level swizzling operator
-  typename ThreadblockSwizzle_,
-  /// Number of stages used in the pipelined mainloop
-  int Stages,
-  /// Access granularity of A matrix in units of elements
-  int AlignmentA,
-  /// Access granularity of B matrix in units of elements
-  int AlignmentB,
-  typename Operator_
->
-class GemmArray<
-  ElementA_,
-  LayoutA_,
-  ElementB_,
-  LayoutB_,
-  ElementC_,
-  layout::ColumnMajor,
-  ElementAccumulator_,
-  OperatorClass_,
-  ArchTag_,
-  ThreadblockShape_,
-  WarpShape_,
-  InstructionShape_,
-  EpilogueOutputOp_,
-  ThreadblockSwizzle_,
-  Stages,
-  AlignmentA,
-  AlignmentB,
-  Operator_
-> {
-public:
-
-  using ElementA = ElementA_;
-  using LayoutA = LayoutA_;
-  using TensorRefA = TensorRef<ElementA const, LayoutA>;
-  using ElementB = ElementB_;
-  using LayoutB = LayoutB_;
-  using TensorRefB = TensorRef<ElementB const, LayoutB>;
-  using ElementC = ElementC_;
-  using LayoutC = layout::ColumnMajor;
-  using TensorRefC = TensorRef<ElementC const, LayoutC>;
-  using TensorRefD = TensorRef<ElementC, LayoutC>;
-  using ElementAccumulator = ElementAccumulator_;
-  using OperatorClass = OperatorClass_;
-  using ArchTag = ArchTag_;
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using EpilogueOutputOp = EpilogueOutputOp_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  static int const kStages = Stages;
-
-  static int const kAlignmentA = AlignmentA;
-  static int const kAlignmentB = AlignmentB;
-  static int const kAlignmentC = EpilogueOutputOp::kCount;
-  static bool const kSplitKSerial = false;
-
-  //
-  using UnderlyingOperator = GemmArray< 
-    ElementB,
-    typename layout::LayoutTranspose<LayoutB>::type,
-    ElementA,
-    typename layout::LayoutTranspose<LayoutA>::type,
-    ElementC,
-    layout::RowMajor,    
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    kAlignmentB,
-    kAlignmentA
-  >;
-
-  using UnderlyingArguments = typename UnderlyingOperator::Arguments;
-  using GemmKernel = typename UnderlyingOperator::GemmKernel;
-
-  /// Argument structure
-  struct Arguments {
-
-    //
-    // Data members
-    //
-
-    GemmCoord problem_size;
-
-    ElementA const * const *ptr_A;
-    LayoutA layout_A;
-
-    ElementB const * const *ptr_B;
-    LayoutB layout_B;
-
-    ElementC const * const *ptr_C;
-    LayoutC layout_C;
-
-    ElementC * const * ptr_D;
-    LayoutC layout_D;
-    
-    typename EpilogueOutputOp::Params epilogue;
-    int batch_count;
-
-    //
-    // Methods
-    //
-
-    /// Default ctor
-    CUTLASS_HOST_DEVICE
-    Arguments() { }
-
-    /// Constructs an Arguments structure 
-    CUTLASS_HOST_DEVICE
-    Arguments(
-      GemmCoord problem_size_,
-      ElementA const * const *ptr_A_,
-      LayoutA layout_A_,
-      ElementB const * const *ptr_B_,
-      LayoutB layout_B_,
-      ElementC const * const *ptr_C_,
-      LayoutC layout_C_,
-      ElementC * const * ptr_D_,
-      LayoutC layout_D_,
-      typename EpilogueOutputOp::Params epilogue_,
-      int batch_count_
-    ):
-      problem_size(problem_size_),
-      ptr_A(ptr_A_),
-      layout_A(layout_A_),
-      ptr_B(ptr_B_),
-      layout_B(layout_B_),
-      ptr_C(ptr_C_),
-      layout_C(layout_C_),
-      ptr_D(ptr_D_),
-      layout_D(layout_D_),
-      epilogue(epilogue_),
-      batch_count(batch_count_) { }
-  };
-
-private:
-
-  UnderlyingOperator underlying_operator_;
-
-public:
-
-  /// Constructs the GEMM.
-  GemmArray() { }
-
-  /// Helper to construct a transposed equivalent for the underying GEMM operator
-  static UnderlyingArguments to_underlying_arguments(Arguments const &args) {
-
-    GemmCoord problem_size{
-      args.problem_size.n(), 
-      args.problem_size.m(), 
-      args.problem_size.k()
-    };
-
-    return UnderlyingArguments(
-      problem_size,
-      args.ptr_B,
-      args.layout_B.stride(),
-      args.ptr_A,
-      args.layout_A.stride(),
-      args.ptr_C,
-      args.layout_C.stride(),
-      args.ptr_D,
-      args.layout_D.stride(),
-      args.epilogue,
-      args.batch_count
-    );
-  }
-
-  /// Determines whether the GEMM can execute the given problem.
-  static Status can_implement(Arguments const &args) {
-
-    return UnderlyingOperator::can_implement(to_underlying_arguments(args));
-  }
-
-  /// Gets the workspace size
-  static size_t get_workspace_size(Arguments const &args) {
-    
-    return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args));
-  }
-
-  /// Initializes GEMM state from arguments.
-  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
-
-    return underlying_operator_.initialize(to_underlying_arguments(args), workspace);
-  }
-
-  /// Lightweight update given a subset of arguments
-  Status update(Arguments const &args, void *workspace = nullptr) {
-
-    return underlying_operator_.update(to_underlying_arguments(args), workspace);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status run(cudaStream_t stream = nullptr) {
-
-    return underlying_operator_.run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(cudaStream_t stream = nullptr) {
-    return run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(
-    Arguments const &args, 
-    void *workspace = nullptr, 
-    cudaStream_t stream = nullptr) {
-    
-    Status status = initialize(args, workspace, stream);
-    
-    if (status == Status::kSuccess) {
-      status = run(stream);
-    }
-
-    return status;
-  }
-
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace device
-} // namespace gemm
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_batched.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_batched.h
deleted file mode 100755
index 5981457c7..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_batched.h
+++ /dev/null
@@ -1,704 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a pipelined batch GEMM kernel.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/device_kernel.h"
-
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-#include "cutlass/gemm/kernel/gemm_batched.h"
-
-#include "cutlass/gemm/kernel/default_gemm.h"
-#include "cutlass/gemm/device/default_gemm_configuration.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace device {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/*! Gemm device-level operator. This is an interface to efficient CUTLASS GEMM kernels that may
-  be invoked from host code.
-
-  The contributions of this class are:
-    
-    1. At compile time, it maps data types and high-level structural parameters onto 
-       specific CUTLASS components.
-
-    2. At runtime, it maps logical arguments to GEMM problems to kernel parameters.
-
-    3. At runtime, it launches kernels on the device.
-
-  The intent is to provide a convenient mechanism for interacting with most plausible GEMM
-  configurations for each supported architecture. Consequently, not all parameters are exposed
-  to the top-level interface. Rather, sensible defaults at each level of the CUTLASS hierarchy
-  are selected to tradeoff simplicity of the interface with flexibility. We expect 
-  most configurations to be specified at this level. Applications with more exotic requirements 
-  may construct their kernels of interest using CUTLASS components at the threadblock, warp, 
-  and thread levels of abstraction.
-
-  CUTLASS exposes computations using the functor design pattern in which objects compose some
-  internal state with an overloaded function call operator. This enables decoupling of
-  initialization from execution, possibly reducing overhead during steady state phases of
-  application execution.
-
-  CUTLASS device-level operators expose an Arguments structure encompassing each logical
-  input to the computation. This is distinct from the kernel-level Params structure pattern
-  which contains application-specific precomputed state needed by the device code.
-
-  Example of a CUTLASS GEMM operator implementing the functionality of cuBLAS's SGEMM NN
-  is as follows:
-
-    //
-    // Instantiate the CUTLASS GEMM operator.
-    //
-
-    cutlass::gemm::device::Gemm<
-      float,
-      cutlass::layout::ColumnMajor,
-      float,
-      cutlass::layout::ColumnMajor,
-      float,
-      cutlass::layout::ColumnMajor
-    > gemm_op;
-
-    //
-    // Launch the GEMM operation on the device
-    //
-
-    cutlass::Status status = gemm_op({
-      {m, n, k},                          // GemmCoord problem_size,
-      {A, lda},                           // TensorRef<float, layout::ColumnMajor> ref_A,
-      {B, ldb},                           // TensorRef<float, layout::ColumnMajor> ref_B,
-      {C, ldc},                           // TensorRef<float, layout::ColumnMajor> ref_C,
-      {D, ldd},                           // TensorRef<float, layout::ColumnMajor> ref_D,
-      {alpha, beta}                       // EpilogueOutputOp::Params epilogue_op_params
-    });
-
-
-  A simplified view of the template is listed below.
-
-    template <
-      /// Element type for A matrix operand
-      typename ElementA,
-      
-      /// Layout type for A matrix operand
-      typename LayoutA,
-      
-      /// Element type for B matrix operand
-      typename ElementB,
-      
-      /// Layout type for B matrix operand
-      typename LayoutB,
-      
-      /// Element type for C and D matrix operands
-      typename ElementC,
-      
-      /// Layout type for C and D matrix operands
-      typename LayoutC,
-      
-      /// Element type for internal accumulation
-      typename ElementAccumulator,
-
-      /// Operator class tag
-      typename OperatorClass,
-      
-      /// Tag indicating architecture to tune for.  This is the minimum SM that
-      /// supports the intended feature. The device kernel can be built
-      /// targeting any SM larger than this number.
-      typename ArchTag,
-      
-      /// Threadblock-level tile size (concept: GemmShape)
-      typename ThreadblockShape,
-      
-      /// Warp-level tile size (concept: GemmShape)
-      typename WarpShape,
-      
-      /// Warp-level tile size (concept: GemmShape)
-      typename InstructionShape,
-      
-      /// Epilogue output operator
-      typename EpilogueOutputOp,
-      
-      /// Threadblock-level swizzling operator
-      typename ThreadblockSwizzle,
-      
-      /// Number of stages used in the pipelined mainloop
-      int Stages
-    >
-    class Gemm;
-*/
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_ = ElementC_,
-    /// Operator class tag
-    typename OperatorClass_ = arch::OpClassSimt,
-    /// Tag indicating architecture to tune for
-    typename ArchTag_ = arch::Sm70,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle_ = threadblock::GemmBatchedIdentityThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kStages,
-    /// Access granularity of A matrix in units of elements
-    int AlignmentA =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentA,
-    /// Access granularity of B matrix in units of elements
-    int AlignmentB =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentB,
-    /// Operation performed by GEMM
-    typename Operator_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::Operator
->
-class GemmBatched {
- public:
-
-  using ElementA = ElementA_;
-  using LayoutA = LayoutA_;
-  using TensorRefA = TensorRef<ElementA const, LayoutA>;
-  using ElementB = ElementB_;
-  using LayoutB = LayoutB_;
-  using TensorRefB = TensorRef<ElementB const, LayoutB>;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using TensorRefC = TensorRef<ElementC const, LayoutC>;
-  using TensorRefD = TensorRef<ElementC, LayoutC>;
-  using ElementAccumulator = ElementAccumulator_;
-  using OperatorClass = OperatorClass_;
-  using ArchTag = ArchTag_;
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using EpilogueOutputOp = EpilogueOutputOp_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  static int const kStages = Stages;
-  static int const kAlignmentA = AlignmentA;
-  static int const kAlignmentB = AlignmentB;
-  static int const kAlignmentC = EpilogueOutputOp::kCount;
-  using Operator = Operator_;
-
-  /// Define the kernel
-  using DefaultGemmKernel = typename kernel::DefaultGemm<
-    ElementA,
-    LayoutA,
-    kAlignmentA,
-    ElementB,
-    LayoutB,
-    kAlignmentB,
-    ElementC,
-    LayoutC,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    kStages,
-    false,
-    Operator
-  >::GemmKernel;
-
-  using GemmKernel = kernel::GemmBatched<typename DefaultGemmKernel::Mma, typename DefaultGemmKernel::Epilogue, ThreadblockSwizzle>;
-
-  /// Argument structure
-  struct Arguments {
-
-    //
-    // Data members
-    //
-
-    GemmCoord problem_size;
-    TensorRef<ElementA const, LayoutA> ref_A;
-    int64_t stride_A;
-    TensorRef<ElementB const, LayoutB> ref_B;
-    int64_t stride_B;
-    TensorRef<ElementC const, LayoutC> ref_C;
-    int64_t stride_C;
-    TensorRef<ElementC, LayoutC> ref_D;
-    int64_t stride_D;
-    typename EpilogueOutputOp::Params epilogue;
-    int batch_count;
-
-    //
-    // Methods
-    //
-
-    /// Default ctor
-    CUTLASS_HOST_DEVICE
-    Arguments() { }
-
-    /// Constructs an Arguments structure 
-    CUTLASS_HOST_DEVICE
-    Arguments(
-      GemmCoord problem_size_,
-      TensorRef<ElementA const, LayoutA> ref_A_,
-      int64_t stride_A_,
-      TensorRef<ElementB const, LayoutB> ref_B_,
-      int64_t stride_B_,
-      TensorRef<ElementC const, LayoutC> ref_C_,
-      int64_t stride_C_,
-      TensorRef<ElementC, LayoutC> ref_D_,
-      int64_t stride_D_,
-      typename EpilogueOutputOp::Params epilogue_,
-      int batch_count_
-    ):
-      problem_size(problem_size_),
-      ref_A(ref_A_),
-      stride_A(stride_A_),
-      ref_B(ref_B_),
-      stride_B(stride_B_),
-      ref_C(ref_C_),
-      stride_C(stride_C_),
-      ref_D(ref_D_),
-      stride_D(stride_D_),
-      epilogue(epilogue_),
-      batch_count(batch_count_) { }
-  };
-
-private:
-
-  /// Kernel parameters object
-  typename GemmKernel::Params params_;
-
-public:
-
-  /// Constructs the GEMM.
-  GemmBatched() { }
-
-  /// Determines whether the GEMM can execute the given problem.
-  static Status can_implement(Arguments const &args) {
-
-    if (!TensorRef_aligned(args.ref_A, kAlignmentA) || (args.stride_A % kAlignmentA)) {
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (!TensorRef_aligned(args.ref_B, kAlignmentB) || (args.stride_B % kAlignmentB)) {
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (!TensorRef_aligned(args.ref_C, kAlignmentC) || (args.stride_C % kAlignmentC)) {
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (!TensorRef_aligned(args.ref_D, kAlignmentC) || (args.stride_D % kAlignmentC)) {
-      return Status::kErrorMisalignedOperand;
-    }
-
-    return Status::kSuccess;
-  }
-
-  /// Gets the workspace size
-  static size_t get_workspace_size(Arguments const &args) {
-    return 0;
-  }
-
-  /// Initializes GEMM state from arguments.
-  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
-
-    // Determine grid shape
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord grid_shape = threadblock_swizzle.get_tiled_shape(
-      args.problem_size,
-      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-      args.batch_count);
-
-    // Initialize the Params structure
-    params_ = typename GemmKernel::Params{
-      args.problem_size,
-      grid_shape,
-      args.ref_A.non_const_ref(),
-      args.stride_A,
-      args.ref_B.non_const_ref(),
-      args.stride_B,
-      args.ref_C.non_const_ref(),
-      args.stride_C,
-      args.ref_D,
-      args.stride_D,
-      args.epilogue,
-      args.batch_count
-    };
-
-    return Status::kSuccess;
-  }
-
-  /// Lightweight update given a subset of arguments
-  Status update(Arguments const &args, void *workspace = nullptr) {
-
-    params_.ref_A.reset(args.ref_A.non_const_ref().data());
-    params_.ref_B.reset(args.ref_B.non_const_ref().data());
-    params_.ref_C.reset(args.ref_C.non_const_ref().data());
-    params_.ref_D.reset(args.ref_D.data()); 
-
-    return Status::kSuccess;
-  }
-
-  /// Runs the kernel using initialized state.
-  Status run(cudaStream_t stream = nullptr) {
-
-    ThreadblockSwizzle threadblock_swizzle;
-
-    dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
-    dim3 block(GemmKernel::kThreadCount, 1, 1);
-
-    cudaError_t result;
-
-    int smem_size = int(sizeof(typename GemmKernel::SharedStorage));
-    if (smem_size >= (48 << 10)) {
-      result = cudaFuncSetAttribute(Kernel<GemmKernel>,
-                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
-                                    smem_size);
-
-      if (result != cudaSuccess) {
-        return Status::kErrorInternal;
-      }
-    }
-
-    cutlass::arch::synclog_setup();
-    cutlass::Kernel<GemmKernel><<<grid, block, smem_size, stream>>>(params_);
-
-    result = cudaGetLastError();
-
-    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(cudaStream_t stream = nullptr) {
-    return run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(
-    Arguments const &args, 
-    void *workspace = nullptr, 
-    cudaStream_t stream = nullptr) {
-    
-    Status status = initialize(args, workspace);
-    
-    if (status == Status::kSuccess) {
-      status = run(stream);
-    }
-
-    return status;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for column-major output exchanges problem size and operand.
-template <
-  /// Element type for A matrix operand
-  typename ElementA_,
-  /// Layout type for A matrix operand
-  typename LayoutA_,
-  /// Element type for B matrix operand
-  typename ElementB_,
-  /// Layout type for B matrix operand
-  typename LayoutB_,
-  /// Element type for C and D matrix operands
-  typename ElementC_,
-  /// Element type for internal accumulation
-  typename ElementAccumulator_,
-  /// Operator class tag
-  typename OperatorClass_,
-  /// Tag indicating architecture to tune for
-  typename ArchTag_,
-  /// Threadblock-level tile size (concept: GemmShape)
-  typename ThreadblockShape_,
-  /// Warp-level tile size (concept: GemmShape)
-  typename WarpShape_,
-  /// Warp-level tile size (concept: GemmShape)
-  typename InstructionShape_,
-  /// Epilogue output operator
-  typename EpilogueOutputOp_,
-  /// Threadblock-level swizzling operator
-  typename ThreadblockSwizzle_,
-  /// Number of stages used in the pipelined mainloop
-  int Stages,
-  /// Access granularity of A matrix in units of elements
-  int AlignmentA,
-  /// Access granularity of B matrix in units of elements
-  int AlignmentB,
-  typename Operator_
->
-class GemmBatched<
-  ElementA_,
-  LayoutA_,
-  ElementB_,
-  LayoutB_,
-  ElementC_,
-  layout::ColumnMajor,
-  ElementAccumulator_,
-  OperatorClass_,
-  ArchTag_,
-  ThreadblockShape_,
-  WarpShape_,
-  InstructionShape_,
-  EpilogueOutputOp_,
-  ThreadblockSwizzle_,
-  Stages,
-  AlignmentA,
-  AlignmentB,
-  Operator_
-> {
-public:
-
-  using ElementA = ElementA_;
-  using LayoutA = LayoutA_;
-  using TensorRefA = TensorRef<ElementA const, LayoutA>;
-  using ElementB = ElementB_;
-  using LayoutB = LayoutB_;
-  using TensorRefB = TensorRef<ElementB const, LayoutB>;
-  using ElementC = ElementC_;
-  using LayoutC = layout::ColumnMajor;
-  using TensorRefC = TensorRef<ElementC const, LayoutC>;
-  using TensorRefD = TensorRef<ElementC, LayoutC>;
-  using ElementAccumulator = ElementAccumulator_;
-  using OperatorClass = OperatorClass_;
-  using ArchTag = ArchTag_;
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using EpilogueOutputOp = EpilogueOutputOp_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  static int const kStages = Stages;
-
-  static int const kAlignmentA = AlignmentA;
-  static int const kAlignmentB = AlignmentB;
-  static int const kAlignmentC = EpilogueOutputOp::kCount;
-  static bool const kSplitKSerial = false;
-
-  //
-  using UnderlyingOperator = GemmBatched< 
-    ElementB,
-    typename layout::LayoutTranspose<LayoutB>::type,
-    ElementA,
-    typename layout::LayoutTranspose<LayoutA>::type,
-    ElementC,
-    layout::RowMajor,    
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    kAlignmentB,
-    kAlignmentA
-  >;
-
-  using UnderlyingArguments = typename UnderlyingOperator::Arguments;
-  using GemmKernel = typename UnderlyingOperator::GemmKernel;
-
-  /// Argument structure
-  struct Arguments {
-
-    //
-    // Data members
-    //
-
-    GemmCoord problem_size;
-    TensorRef<ElementA const, LayoutA> ref_A;
-    int64_t stride_A;
-    TensorRef<ElementB const, LayoutB> ref_B;
-    int64_t stride_B;
-    TensorRef<ElementC const, LayoutC> ref_C;
-    int64_t stride_C;
-    TensorRef<ElementC, LayoutC> ref_D;
-    int64_t stride_D;
-    typename EpilogueOutputOp::Params epilogue;
-    int batch_count;
-
-    //
-    // Methods
-    //
-
-    /// Default ctor
-    CUTLASS_HOST_DEVICE
-    Arguments() { }
-
-    /// Constructs an Arguments structure 
-    CUTLASS_HOST_DEVICE
-    Arguments(
-      GemmCoord problem_size_,
-      TensorRef<ElementA const, LayoutA> ref_A_,
-      int64_t stride_A_,
-      TensorRef<ElementB const, LayoutB> ref_B_,
-      int64_t stride_B_,
-      TensorRef<ElementC const, LayoutC> ref_C_,
-      int64_t stride_C_,
-      TensorRef<ElementC, LayoutC> ref_D_,
-      int64_t stride_D_,
-      typename EpilogueOutputOp::Params epilogue_,
-      int batch_count_
-    ):
-      problem_size(problem_size_),
-      ref_A(ref_A_),
-      stride_A(stride_A_),
-      ref_B(ref_B_),
-      stride_B(stride_B_),
-      ref_C(ref_C_),
-      stride_C(stride_C_),
-      ref_D(ref_D_),
-      stride_D(stride_D_),
-      epilogue(epilogue_),
-      batch_count(batch_count_) { }
-  };
-
-private:
-
-  UnderlyingOperator underlying_operator_;
-
-public:
-
-  /// Constructs the GEMM.
-  GemmBatched() { }
-
-  /// Helper to construct a transposed equivalent for the underying GEMM operator
-  static UnderlyingArguments to_underlying_arguments(Arguments const &args) {
-    return UnderlyingArguments(
-      {args.problem_size.n(), args.problem_size.m(), args.problem_size.k()},
-      {args.ref_B.data(), args.ref_B.stride(0)},
-      args.stride_B,
-      {args.ref_A.data(), args.ref_A.stride(0)},
-      args.stride_A,
-      {args.ref_C.data(), args.ref_C.stride(0)},
-      args.stride_C,
-      {args.ref_D.data(), args.ref_D.stride(0)},
-      args.stride_D,
-      args.epilogue,
-      args.batch_count
-    );
-  }
-
-  /// Determines whether the GEMM can execute the given problem.
-  static Status can_implement(Arguments const &args) {
-
-    return UnderlyingOperator::can_implement(to_underlying_arguments(args));
-  }
-
-  /// Gets the workspace size
-  static size_t get_workspace_size(Arguments const &args) {
-    
-    return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args));
-  }
-
-  /// Initializes GEMM state from arguments.
-  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
-
-    return underlying_operator_.initialize(to_underlying_arguments(args), workspace);
-  }
-
-  /// Lightweight update given a subset of arguments
-  Status update(Arguments const &args, void *workspace = nullptr) {
-
-    return underlying_operator_.update(to_underlying_arguments(args), workspace);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status run(cudaStream_t stream = nullptr) {
-
-    return underlying_operator_.run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(cudaStream_t stream = nullptr) {
-    return run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(
-    Arguments const &args, 
-    void *workspace = nullptr, 
-    cudaStream_t stream = nullptr) {
-    
-    Status status = initialize(args, workspace, stream);
-    
-    if (status == Status::kSuccess) {
-      status = run(stream);
-    }
-
-    return status;
-  }
-
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace device
-} // namespace gemm
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_complex.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_complex.h
deleted file mode 100755
index e36c69cef..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_complex.h
+++ /dev/null
@@ -1,718 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/device_kernel.h"
-
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-#include "cutlass/gemm/kernel/gemm.h"
-
-#include "cutlass/gemm/kernel/default_gemm_complex.h"
-#include "cutlass/gemm/device/default_gemm_configuration.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace device {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/*! Gemm device-level operator. This is an interface to efficient CUTLASS GEMM
-  kernels that may be invoked from host code.
-
-  The contributions of this class are:
-
-    1. At compile time, it maps data types and high-level structural parameters
-  onto specific CUTLASS components.
-
-    2. At runtime, it maps logical arguments to GEMM problems to kernel
-  parameters.
-
-    3. At runtime, it launches kernels on the device.
-
-  The intent is to provide a convenient mechanism for interacting with most
-  plausible GEMM configurations for each supported architecture. Consequently,
-  not all parameters are exposed to the top-level interface. Rather, sensible
-  defaults at each level of the CUTLASS hierarchy are selected to tradeoff
-  simplicity of the interface with flexibility. We expect most configurations to
-  be specified at this level. Applications with more exotic requirements may
-  construct their kernels of interest using CUTLASS components at the
-  threadblock, warp, and thread levels of abstraction.
-
-  CUTLASS exposes computations using the functor design pattern in which objects
-  compose some internal state with an overloaded function call operator. This
-  enables decoupling of initialization from execution, possibly reducing
-  overhead during steady state phases of application execution.
-
-  CUTLASS device-level operators expose an Arguments structure encompassing each
-  logical input to the computation. This is distinct from the kernel-level
-  Params structure pattern which contains application-specific precomputed state
-  needed by the device code.
-
-  Example of a CUTLASS GEMM operator implementing the functionality of cuBLAS's
-  SGEMM NN is as follows:
-
-    //
-    // Instantiate the CUTLASS GEMM operator.
-    //
-
-    cutlass::gemm::device::Gemm<
-      float,
-      cutlass::layout::ColumnMajor,
-      float,
-      cutlass::layout::ColumnMajor,
-      float,
-      cutlass::layout::ColumnMajor
-    > gemm_op;
-
-    //
-    // Launch the GEMM operation on the device
-    //
-
-    cutlass::Status status = gemm_op({
-      {m, n, k},                          // GemmCoord problem_size,
-      {A, lda},                           // TensorRef<float, layout::ColumnMajor> ref_A,
-      {B, ldb},                           // TensorRef<float, layout::ColumnMajor> ref_B,
-      {C, ldc},                           // TensorRef<float, layout::ColumnMajor> ref_C,
-      {D, ldd},                           // TensorRef<float, layout::ColumnMajor> ref_D,
-      {alpha, beta}                       // EpilogueOutputOp::Params epilogue_op_params
-    });
-
-
-  A simplified view of the template is listed below.
-
-    template <
-      /// Element type for A matrix operand
-      typename ElementA,
-
-      /// Layout type for A matrix operand
-      typename LayoutA,
-
-      /// Element type for B matrix operand
-      typename ElementB,
-
-      /// Layout type for B matrix operand
-      typename LayoutB,
-
-      /// Element type for C and D matrix operands
-      typename ElementC,
-
-      /// Layout type for C and D matrix operands
-      typename LayoutC,
-
-      /// Element type for internal accumulation
-      typename ElementAccumulator,
-
-      /// Operator class tag
-      typename OperatorClass,
-
-      /// Tag indicating architecture to tune for.  This is the minimum SM that
-      /// supports the intended feature. The device kernel can be built
-      /// targeting any SM larger than this number.
-      typename ArchTag,
-
-      /// Threadblock-level tile size (concept: GemmShape)
-      typename ThreadblockShape,
-
-      /// Warp-level tile size (concept: GemmShape)
-      typename WarpShape,
-
-      /// Warp-level tile size (concept: GemmShape)
-      typename InstructionShape,
-
-      /// Epilogue output operator
-      typename EpilogueOutputOp,
-
-      /// Threadblock-level swizzling operator
-      typename ThreadblockSwizzle,
-
-      /// Number of stages used in the pipelined mainloop
-      int Stages
-    >
-    class Gemm;
-*/
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_ = ElementC_,
-    /// Operator class tag
-    typename OperatorClass_ = arch::OpClassSimt,
-    /// Tag indicating architecture to tune for.
-    typename ArchTag_ = arch::Sm70,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle_ =
-        threadblock::GemmIdentityThreadblockSwizzle<>,
-    /// Number of stages used in the pipelined mainloop
-    int Stages =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kStages,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA = ComplexTransform::kNone,
-    /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB = ComplexTransform::kNone,
-    /// Multiply-add operator
-    // (selects complex or gaussian complex)
-    typename Operator_ = arch::OpMultiplyAddComplex,
-    /// If true, kernel supports split-K with serial reduction
-    bool SplitKSerial = false>
-class GemmComplex {
- public:
-
-  using ElementA = ElementA_;
-  using LayoutA = LayoutA_;
-  using TensorRefA = TensorRef<ElementA const, LayoutA>;
-  using ElementB = ElementB_;
-  using LayoutB = LayoutB_;
-  using TensorRefB = TensorRef<ElementB const, LayoutB>;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using TensorRefC = TensorRef<ElementC const, LayoutC>;
-  using TensorRefD = TensorRef<ElementC, LayoutC>;
-  using ElementAccumulator = ElementAccumulator_;
-  using OperatorClass = OperatorClass_;
-  using ArchTag = ArchTag_;
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using EpilogueOutputOp = EpilogueOutputOp_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  static int const kStages = Stages;
-  static ComplexTransform const kTransformA = TransformA;
-  static ComplexTransform const kTransformB = TransformB;
-  using Operator = Operator_;
-  static bool const kSplitKSerial = SplitKSerial;
-  static int const kAlignmentA = 1;
-  static int const kAlignmentB = 1;
-  static int const kAlignmentC = EpilogueOutputOp::kCount;
-
-  /// Define the kernel
-  using GemmKernel = typename kernel::DefaultGemmComplex<
-    ElementA,
-    LayoutA,
-    ElementB,
-    LayoutB,
-    ElementC,
-    LayoutC,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    kStages,
-    kTransformA,
-    kTransformB,
-    Operator,
-    kSplitKSerial
-  >::GemmKernel;
-
-  /// Argument structure
-  struct Arguments {
-
-    //
-    // Data members
-    //
-
-    GemmCoord problem_size;
-    TensorRef<ElementA const, LayoutA> ref_A;
-    TensorRef<ElementB const, LayoutB> ref_B;
-    TensorRef<ElementC const, LayoutC> ref_C;
-    TensorRef<ElementC, LayoutC> ref_D;
-    typename EpilogueOutputOp::Params epilogue;
-    int split_k_slices;
-
-    //
-    // Methods
-    //
-
-    /// Default ctor
-    CUTLASS_HOST_DEVICE
-    Arguments(): problem_size(0, 0, 0), split_k_slices(1) {
-
-    }
-
-    /// Constructs an Arguments structure 
-    CUTLASS_HOST_DEVICE
-    Arguments(
-      GemmCoord problem_size_,
-      TensorRef<ElementA const, LayoutA> ref_A_,
-      TensorRef<ElementB const, LayoutB> ref_B_,
-      TensorRef<ElementC const, LayoutC> ref_C_,
-      TensorRef<ElementC, LayoutC> ref_D_,
-      typename EpilogueOutputOp::Params epilogue_ = 
-        typename EpilogueOutputOp::Params(),
-      int split_k_slices = 1
-    ):
-      problem_size(problem_size_),
-      ref_A(ref_A_),
-      ref_B(ref_B_),
-      ref_C(ref_C_),
-      ref_D(ref_D_),
-      epilogue(epilogue_),
-      split_k_slices(split_k_slices) {
-
-    }
-  };
-
-private:
-
-  /// Kernel parameters object
-  typename GemmKernel::Params params_;
-
-public:
-
-  /// Constructs the GEMM.
-  GemmComplex() { }
-
-  /// Determines whether the GEMM can execute the given problem.
-  static Status can_implement(Arguments const &args) {
-
-    if (!kSplitKSerial && args.split_k_slices > 1) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    return Status::kSuccess;
-  }
-
-  /// Gets the workspace size
-  static size_t get_workspace_size(Arguments const &args) {
-
-    if (kSplitKSerial && args.split_k_slices > 1) {
-
-      // Determine grid shape
-      ThreadblockSwizzle threadblock_swizzle;
-
-      cutlass::gemm::GemmCoord tiled_shape = threadblock_swizzle.get_tiled_shape(
-        args.problem_size, 
-        {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-        args.split_k_slices);
-
-      return sizeof(int) * size_t(tiled_shape.m()) * size_t(tiled_shape.n());
-    }
-
-    return 0;
-  }
-
-  /// Initializes GEMM state from arguments.
-  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
-
-    // Determine grid shape
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord grid_shape = threadblock_swizzle.get_tiled_shape(
-      args.problem_size, 
-      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-      args.split_k_slices);
-
-    if (kSplitKSerial) {
-      if (args.split_k_slices > 1) {
-        if (!workspace) {
-          return Status::kErrorWorkspaceNull;
-        }
-
-        size_t bytes = get_workspace_size(args);
-      
-        cudaError_t result = cudaMemsetAsync(workspace, 0, bytes, stream);
-
-        if (result != cudaSuccess) {
-          return Status::kErrorInternal;
-        }
-      }
-    }
-    else {
-
-      if (args.split_k_slices > 1) {
-        return Status::kErrorInvalidProblem;
-      }
-    }
-
-    // Initialize the Params structure
-    params_ = typename GemmKernel::Params{
-      args.problem_size,
-      grid_shape,
-      args.ref_A.non_const_ref(),
-      args.ref_B.non_const_ref(),
-      args.ref_C.non_const_ref(),
-      args.ref_D,
-      args.epilogue,
-      static_cast<int *>(workspace)
-    };
-
-    return Status::kSuccess;
-  }
-
-  /// Lightweight update given a subset of arguments
-  Status update(Arguments const &args, void *workspace = nullptr) {
-    
-    if (kSplitKSerial && args.split_k_slices > 1) {  
-      if (!workspace) {
-        return Status::kErrorWorkspaceNull;
-      }
-    }
-
-    params_.ref_A.reset(args.ref_A.non_const_ref().data());
-    params_.ref_B.reset(args.ref_B.non_const_ref().data());
-    params_.ref_C.reset(args.ref_C.non_const_ref().data());
-    params_.ref_D.reset(args.ref_D.data());
-    params_.semaphore = static_cast<int *>(workspace);
-
-    return Status::kSuccess;
-  }
-
-  /// Runs the kernel using initialized state.
-  Status run(cudaStream_t stream = nullptr) {
-
-    ThreadblockSwizzle threadblock_swizzle;
-
-    dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
-    dim3 block(GemmKernel::kThreadCount, 1, 1);
-
-    cudaError_t result;
-
-    int smem_size = int(sizeof(typename GemmKernel::SharedStorage));
-    if (smem_size >= (48 << 10)) {
-      result = cudaFuncSetAttribute(Kernel<GemmKernel>,
-                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
-                                    smem_size);
-
-      if (result != cudaSuccess) {
-        return Status::kErrorInternal;
-      }
-    }
-
-    cutlass::arch::synclog_setup();
-    cutlass::Kernel<GemmKernel><<<grid, block, smem_size, stream>>>(params_);
-
-    result = cudaGetLastError();
-
-    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(cudaStream_t stream = nullptr) {
-    return run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(
-    Arguments const &args, 
-    void *workspace = nullptr, 
-    cudaStream_t stream = nullptr) {
-    
-    Status status = initialize(args, workspace);
-    
-    if (status == Status::kSuccess) {
-      status = run(stream);
-    }
-
-    return status;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for column-major output exchanges problem size and operand.
-template <
-  /// Element type for A matrix operand
-  typename ElementA_,
-  /// Layout type for A matrix operand
-  typename LayoutA_,
-  /// Element type for B matrix operand
-  typename ElementB_,
-  /// Layout type for B matrix operand
-  typename LayoutB_,
-  /// Element type for C and D matrix operands
-  typename ElementC_,
-  /// Element type for internal accumulation
-  typename ElementAccumulator_,
-  /// Operator class tag
-  typename OperatorClass_,
-  /// Tag indicating architecture to tune for
-  typename ArchTag_,
-  /// Threadblock-level tile size (concept: GemmShape)
-  typename ThreadblockShape_,
-  /// Warp-level tile size (concept: GemmShape)
-  typename WarpShape_,
-  /// Warp-level tile size (concept: GemmShape)
-  typename InstructionShape_,
-  /// Epilogue output operator
-  typename EpilogueOutputOp_,
-  /// Threadblock-level swizzling operator
-  typename ThreadblockSwizzle_,
-  /// Number of stages used in the pipelined mainloop
-  int Stages,
-  /// Complex elementwise transformation on A operand
-  ComplexTransform TransformA,
-  /// Complex elementwise transformation on B operand
-  ComplexTransform TransformB,
-  /// Multiply-add operator 
-  // (selects complex or gaussian complex)
-  typename Operator_,
-  /// If true, kernel supports split-K as a serial reduction
-  bool SplitKSerial
->
-class GemmComplex<
-  ElementA_,
-  LayoutA_,
-  ElementB_,
-  LayoutB_,
-  ElementC_,
-  layout::ColumnMajor,    // partially specialized on LayoutC
-  ElementAccumulator_,
-  OperatorClass_,
-  ArchTag_,
-  ThreadblockShape_,
-  WarpShape_,
-  InstructionShape_,
-  EpilogueOutputOp_,
-  ThreadblockSwizzle_,
-  Stages,
-  TransformA,
-  TransformB,
-  Operator_,
-  SplitKSerial
-> {
-public:
-
-  using ElementA = ElementA_;
-  using LayoutA = LayoutA_;
-  using TensorRefA = TensorRef<ElementA const, LayoutA>;
-  using ElementB = ElementB_;
-  using LayoutB = LayoutB_;
-  using TensorRefB = TensorRef<ElementB const, LayoutB>;
-  using ElementC = ElementC_;
-  using LayoutC = layout::ColumnMajor;
-  using TensorRefC = TensorRef<ElementC const, LayoutC>;
-  using TensorRefD = TensorRef<ElementC, LayoutC>;
-  using ElementAccumulator = ElementAccumulator_;
-  using OperatorClass = OperatorClass_;
-  using ArchTag = ArchTag_;
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using EpilogueOutputOp = EpilogueOutputOp_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  static int const kStages = Stages;
-  using Operator = Operator_;
-  static bool const kSplitKSerial = SplitKSerial;
-
-  using UnderlyingOperator = GemmComplex< 
-    ElementB,
-    typename layout::LayoutTranspose<LayoutB>::type,
-    ElementA,
-    typename layout::LayoutTranspose<LayoutA>::type,
-    ElementC,
-    layout::RowMajor,    
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    TransformB,
-    TransformA,
-    Operator,
-    SplitKSerial
-  >;
-  
-  static int const kAlignmentA = UnderlyingOperator::kAlignmentB;
-  static int const kAlignmentB = UnderlyingOperator::kAlignmentA;
-  static int const kAlignmentC = UnderlyingOperator::kAlignmentC;
-  static ComplexTransform const kTransformA = UnderlyingOperator::kTransformB;
-  static ComplexTransform const kTransformB = UnderlyingOperator::kTransformA;
-
-  using UnderlyingArguments = typename UnderlyingOperator::Arguments;
-  using GemmKernel = typename UnderlyingOperator::GemmKernel;
-
-  /// Argument structure
-  struct Arguments {
-
-    //
-    // Data members
-    //
-
-    GemmCoord problem_size;
-    TensorRef<ElementA const, LayoutA> ref_A;
-    TensorRef<ElementB const, LayoutB> ref_B;
-    TensorRef<ElementC const, LayoutC> ref_C;
-    TensorRef<ElementC, LayoutC> ref_D;
-    typename EpilogueOutputOp::Params epilogue;
-    int split_k_slices;
-
-    //
-    // Methods
-    //
-
-    /// Default ctor
-    CUTLASS_HOST_DEVICE
-    Arguments() { }
-
-    /// Constructs an Arguments structure 
-    CUTLASS_HOST_DEVICE
-    Arguments(
-      GemmCoord problem_size_,
-      TensorRef<ElementA const, LayoutA> ref_A_,
-      TensorRef<ElementB const, LayoutB> ref_B_,
-      TensorRef<ElementC const, LayoutC> ref_C_,
-      TensorRef<ElementC, LayoutC> ref_D_,
-      typename EpilogueOutputOp::Params epilogue_ = 
-        typename EpilogueOutputOp::Params(),
-      int split_k_slices = 1
-    ):
-      problem_size(problem_size_),
-      ref_A(ref_A_),
-      ref_B(ref_B_),
-      ref_C(ref_C_),
-      ref_D(ref_D_),
-      epilogue(epilogue_),
-      split_k_slices(split_k_slices) { }
-  };
-
-private:
-
-  UnderlyingOperator underlying_operator_;
-
-public:
-
-  /// Constructs the GEMM.
-  GemmComplex() { }
-
-  /// Helper to construct a transposed equivalent for the underying GEMM operator
-  static UnderlyingArguments to_underlying_arguments(Arguments const &args) {
-    return UnderlyingArguments(
-      {args.problem_size.n(), args.problem_size.m(), args.problem_size.k()},
-      {args.ref_B.data(), args.ref_B.stride(0)},
-      {args.ref_A.data(), args.ref_A.stride(0)},
-      {args.ref_C.data(), args.ref_C.stride(0)},
-      {args.ref_D.data(), args.ref_D.stride(0)},
-      args.epilogue,
-      args.split_k_slices
-    );
-  }
-
-  /// Determines whether the GEMM can execute the given problem.
-  static Status can_implement(Arguments const &args) {
-
-    return UnderlyingOperator::can_implement(to_underlying_arguments(args));
-  }
-
-  /// Gets the workspace size
-  static size_t get_workspace_size(Arguments const &args) {
-    
-    return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args));
-  }
-
-  /// Initializes GEMM state from arguments.
-  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
-
-    return underlying_operator_.initialize(to_underlying_arguments(args), workspace);
-  }
-
-  /// Lightweight update given a subset of arguments
-  Status update(Arguments const &args, void *workspace = nullptr) {
-
-    return underlying_operator_.update(to_underlying_arguments(args), workspace);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status run(cudaStream_t stream = nullptr) {
-
-    return underlying_operator_.run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(cudaStream_t stream = nullptr) {
-    return run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(
-    Arguments const &args, 
-    void *workspace = nullptr, 
-    cudaStream_t stream = nullptr) {
-    
-    Status status = initialize(args, workspace, stream);
-    
-    if (status == Status::kSuccess) {
-      status = run(stream);
-    }
-
-    return status;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace device
-} // namespace gemm
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_grouped.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_grouped.h
deleted file mode 100755
index 877375e94..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_grouped.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*!
-  \file
-  \brief Device-level grouped GEMM.
-*/
-
-#pragma once
-
-#include "cutlass/gemm/device/base_grouped.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace device {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// GEMM Grouped
-template <typename GemmKernel_>
-class GemmGrouped : public BaseGrouped<GemmKernel_> {
-public:
-  using GemmKernel = GemmKernel_;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace device
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_layernorm_mainloop_fusion.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_layernorm_mainloop_fusion.h
deleted file mode 100755
index 3de3cecbf..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_layernorm_mainloop_fusion.h
+++ /dev/null
@@ -1,385 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Device-level GEMM with layernorm elementwise operations fused in mainloop
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/device_kernel.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-#include "cutlass/gemm/kernel/gemm_universal.h"
-
-#include "cutlass/gemm/kernel/default_gemm_layernorm_mainloop_fusion.h"
-#include "cutlass/gemm/device/default_gemm_configuration.h"
-#include "cutlass/gemm/device/gemm_universal_base.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace device {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/*! 
-  The universal GEMM accommodates serial reductions, parallel reductions, batched strided, and 
-  batched array variants.
-*/
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Element type for Scale/Bias vectors
-    typename ElementScaleBias_,
-    /// Layout type for Scale/Bias vectors
-    typename LayoutScaleBias_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_ = ElementC_,
-    /// Operator class tag
-    typename OperatorClass_ = arch::OpClassSimt,
-    /// Tag indicating architecture to tune for.  This is the minimum SM that
-    /// supports the intended feature. The device kernel can be built
-    /// targeting any SM larger than this number.
-    typename ArchTag_ = arch::Sm70,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle<>,
-    /// Number of stages used in the pipelined mainloop
-    int Stages =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kStages,
-    /// Access granularity of A matrix in units of elements
-    int AlignmentA =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentA,
-    /// Access granularity of B matrix in units of elements
-    int AlignmentB =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentB,
-    /// Operation performed by GEMM
-    typename Operator_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::Operator
->
-class GemmLayernormMainloopFusion : 
-  public GemmUniversalBase<
-    typename kernel::DefaultGemmLayernormMainloopFusion<
-      ElementA_,
-      LayoutA_,
-      AlignmentA,
-      ElementB_,
-      LayoutB_,
-      AlignmentB,
-      ElementScaleBias_,
-      LayoutScaleBias_,
-      ElementC_,
-      LayoutC_,
-      ElementAccumulator_,
-      OperatorClass_,
-      ArchTag_,
-      ThreadblockShape_,
-      WarpShape_,
-      InstructionShape_,
-      EpilogueOutputOp_,
-      ThreadblockSwizzle_,
-      Stages,
-      Operator_,
-      SharedMemoryClearOption::kNone
-    >::GemmKernel
-  > {
-
- public:
-
-  using ElementAccumulator = ElementAccumulator_;
-  using OperatorClass = OperatorClass_;
-  using ArchTag = ArchTag_;
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using EpilogueOutputOp = EpilogueOutputOp_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  using Operator = Operator_;
-  static int const kStages = Stages;
-  static int const kAlignmentA = AlignmentA;
-  static int const kAlignmentB = AlignmentB;
-  static int const kAlignmentC = EpilogueOutputOp::kCount;
-
-  using Base = GemmUniversalBase<
-    typename kernel::DefaultGemmLayernormMainloopFusion<
-      ElementA_,
-      LayoutA_,
-      AlignmentA,
-      ElementB_,
-      LayoutB_,
-      AlignmentB,
-      ElementScaleBias_,
-      LayoutScaleBias_,
-      ElementC_,
-      LayoutC_,
-      ElementAccumulator_,
-      OperatorClass_,
-      ArchTag_,
-      ThreadblockShape_,
-      WarpShape_,
-      InstructionShape_,
-      EpilogueOutputOp_,
-      ThreadblockSwizzle_,
-      Stages,
-      Operator_,
-      SharedMemoryClearOption::kNone
-    >::GemmKernel
-  >;
-
-  using Arguments = typename Base::Arguments;
-  using GemmKernel = typename Base::GemmKernel;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for column-major output exchanges problem size and operand.
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Element type for Scale/Bias vectors
-    typename ElementScaleBias_,
-    /// Layout type for Scale/Bias vectors
-    typename LayoutScaleBias_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_,
-    /// Operator class tag
-    typename OperatorClass_,
-    /// Tag indicating architecture to tune for.  This is the minimum SM that
-    /// supports the intended feature. The device kernel can be built
-    /// targeting any SM larger than this number.
-    typename ArchTag_,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_,
-    /// Epilogue output operator
-    typename EpilogueOutputOp_,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle_,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Access granularity of A matrix in units of elements
-    int AlignmentA,
-    /// Access granularity of B matrix in units of elements
-    int AlignmentB,
-    /// Operation performed by GEMM
-    typename Operator_
->
-class GemmLayernormMainloopFusion<ElementA_, LayoutA_, ElementB_, LayoutB_, 
-           ElementScaleBias_, LayoutScaleBias_,
-           ElementC_,
-           layout::ColumnMajor,  // partially specialized on LayoutC
-           ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_,
-           WarpShape_, InstructionShape_, EpilogueOutputOp_,
-           ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB,
-           Operator_> {
- public:
-
-  using ElementA = ElementA_;
-  using LayoutA = LayoutA_;
-  using TensorRefA = TensorRef<ElementA const, LayoutA>;
-  using ElementB = ElementB_;
-  using LayoutB = LayoutB_;
-  using TensorRefB = TensorRef<ElementB const, LayoutB>;
-  using ElementScaleBias = ElementScaleBias_;
-  using LayoutScaleBias = LayoutScaleBias_;
-  using ElementC = ElementC_;
-  using LayoutC = layout::ColumnMajor;
-  using TensorRefC = TensorRef<ElementC const, LayoutC>;
-  using TensorRefD = TensorRef<ElementC, LayoutC>;
-  using ElementAccumulator = ElementAccumulator_;
-  using OperatorClass = OperatorClass_;
-  using ArchTag = ArchTag_;
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using EpilogueOutputOp = EpilogueOutputOp_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  using Operator = Operator_;
-  static int const kStages = Stages;
-  static int const kAlignmentA = AlignmentA;
-  static int const kAlignmentB = AlignmentB;
-
-  using UnderlyingOperator = typename GemmLayernormMainloopFusion< 
-    ElementB,
-    typename layout::LayoutTranspose<LayoutB>::type,
-    ElementA,
-    typename layout::LayoutTranspose<LayoutA>::type,
-    ElementScaleBias,
-    LayoutScaleBias, 
-    ElementC,
-    layout::RowMajor,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    kAlignmentB,
-    kAlignmentA,
-    Operator
-  >::Base;
-
-  using GemmKernel = typename UnderlyingOperator::GemmKernel;
-  static int const kAlignmentC = EpilogueOutputOp::kCount;
-
-  /// Argument structure
-  using Arguments = typename UnderlyingOperator::Arguments;
-
-private:
-
-  UnderlyingOperator underlying_operator_;
-
-public:
-
-  /// Constructs the GEMM.
-  GemmLayernormMainloopFusion() { }
-
-  /// Helper to construct a transposed equivalent for the underlying GEMM operator
-  static Arguments to_underlying_arguments(Arguments const &args) {
-    return args.transposed_problem();
-  }
-
-  /// Determines whether the GEMM can execute the given problem.
-  static Status can_implement(Arguments const &args) {
-
-    return UnderlyingOperator::can_implement(to_underlying_arguments(args));
-  }
-
-  /// Gets the workspace size
-  static size_t get_workspace_size(Arguments const &args) {
-    
-    return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args));
-  }
-
-  /// Computes the grid shape
-  static dim3 get_grid_shape(Arguments const &args) { 
-    return UnderlyingOperator::get_grid_shape(to_underlying_arguments(args));
-  }
-
-  /// Computes the maximum number of active blocks per multiprocessor
-  static int maximum_active_blocks(int smem_capacity = -1) {
-    return UnderlyingOperator::maximum_active_blocks(smem_capacity);
-  }
-
-  /// Initializes GEMM state from arguments.
-  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
-
-    return underlying_operator_.initialize(to_underlying_arguments(args), workspace, stream);
-  }
-
-  /// Lightweight update given a subset of arguments
-  Status update(Arguments const &args, void *workspace = nullptr) {
-
-    return underlying_operator_.update(to_underlying_arguments(args), workspace);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status run(cudaStream_t stream = nullptr) {
-
-    return underlying_operator_.run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(cudaStream_t stream = nullptr) {
-    return run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(
-    Arguments const &args, 
-    void *workspace = nullptr, 
-    cudaStream_t stream = nullptr) {
-    
-    Status status = initialize(args, workspace, stream);
-    
-    if (status == Status::kSuccess) {
-      status = run(stream);
-    }
-
-    return status;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace device
-} // namespace gemm
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse.h
deleted file mode 100755
index ac453c63b..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse.h
+++ /dev/null
@@ -1,515 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/device_kernel.h"
-
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-#include "cutlass/gemm/kernel/sparse_gemm.h"
-
-#include "cutlass/gemm/kernel/default_gemm_sparse.h"
-#include "cutlass/gemm/device/default_gemm_configuration.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace device {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/*! Gemm device-level operator. This is an interface to efficient CUTLASS GEMM kernels that may
-  be invoked from host code.
-
-  The contributions of this class are:
-    
-    1. At compile time, it maps data types and high-level structural parameters onto 
-       specific CUTLASS components.
-
-    2. At runtime, it maps logical arguments to GEMM problems to kernel parameters.
-
-    3. At runtime, it launches kernels on the device.
-
-  The intent is to provide a convenient mechanism for interacting with most plausible GEMM
-  configurations for each supported architecture. Consequently, not all parameters are exposed
-  to the top-level interface. Rather, sensible defaults at each level of the CUTLASS hierarchy
-  are selected to tradeoff simplicity of the interface with flexibility. We expect 
-  most configurations to be specified at this level. Applications with more exotic requirements 
-  may construct their kernels of interest using CUTLASS components at the threadblock, warp, 
-  and thread levels of abstraction.
-
-  CUTLASS exposes computations using the functor design pattern in which objects compose some
-  internal state with an overloaded function call operator. This enables decoupling of
-  initialization from execution, possibly reducing overhead during steady state phases of
-  application execution.
-
-  CUTLASS device-level operators expose an Arguments structure encompassing each logical
-  input to the computation. This is distinct from the kernel-level Params structure pattern
-  which contains application-specific precomputed state needed by the device code.
-
-  Example of a CUTLASS GEMM operator implementing the functionality of cuBLAS's SGEMM NN
-  is as follows:
-
-    //
-    // Instantiate the CUTLASS GEMM operator.
-    //
-
-    cutlass::gemm::device::Gemm<
-      float,
-      cutlass::layout::ColumnMajor,
-      float,
-      cutlass::layout::ColumnMajor,
-      float,
-      cutlass::layout::ColumnMajor
-    > gemm_op;
-
-    //
-    // Launch the GEMM operation on the device
-    //
-
-    cutlass::Status status = gemm_op({
-      {m, n, k},                          // GemmCoord problem_size,
-      {A, lda},                           // TensorRef<float, layout::ColumnMajor> ref_A,
-      {B, ldb},                           // TensorRef<float, layout::ColumnMajor> ref_B,
-      {C, ldc},                           // TensorRef<float, layout::ColumnMajor> ref_C,
-      {D, ldd},                           // TensorRef<float, layout::ColumnMajor> ref_D,
-      {alpha, beta}                       // EpilogueOutputOp::Params epilogue_op_params
-    });
-
-
-  A simplified view of the template is listed below.
-
-    template <
-      /// Element type for A matrix operand
-      typename ElementA,
-      
-      /// Layout type for A matrix operand
-      typename LayoutA,
-      
-      /// Element type for B matrix operand
-      typename ElementB,
-      
-      /// Layout type for B matrix operand
-      typename LayoutB,
-      
-      /// Element type for C and D matrix operands
-      typename ElementC,
-      
-      /// Layout type for C and D matrix operands
-      typename LayoutC,
-      
-      /// Element type for internal accumulation
-      typename ElementAccumulator,
-
-      /// Operator class tag
-      typename OperatorClass,
-      
-      /// Tag indicating architecture to tune for.  This is the minimum SM that
-      /// supports the intended feature. The device kernel can be built
-      /// targeting any SM larger than this number.
-      typename ArchTag,
-      
-      /// Threadblock-level tile size (concept: GemmShape)
-      typename ThreadblockShape,
-      
-      /// Warp-level tile size (concept: GemmShape)
-      typename WarpShape,
-      
-      /// Warp-level tile size (concept: GemmShape)
-      typename InstructionShape,
-      
-      /// Epilogue output operator
-      typename EpilogueOutputOp,
-      
-      /// Threadblock-level swizzling operator
-      typename ThreadblockSwizzle,
-      
-      /// Number of stages used in the pipelined mainloop
-      int Stages
-    >
-    class Gemm;
-*/
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_ = ElementC_,
-    /// Operator class tag
-    typename OperatorClass_ = arch::OpClassSimt,
-    /// Tag indicating architecture to tune for
-    typename ArchTag_ = arch::Sm70,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle_ =
-        typename threadblock::GemmIdentityThreadblockSwizzle<>,
-    /// Number of stages used in the pipelined mainloop
-    int Stages =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kStages,
-    /// Access granularity of A matrix in units of elements
-    int AlignmentA =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentA,
-    /// Access granularity of B matrix in units of elements
-    int AlignmentB =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentB,
-    /// If true, kernel supports split-K with serial reduction
-    bool SplitKSerial = false,
-    /// Operation performed by GEMM
-    typename Operator_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::Operator>
-class SparseGemm {
- public:
-
-  using ElementA = ElementA_;
-  using LayoutA = LayoutA_;
-  using TensorRefA = TensorRef<ElementA const, LayoutA>;
-  using ElementB = ElementB_;
-  using LayoutB = LayoutB_;
-  using TensorRefB = TensorRef<ElementB const, LayoutB>;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using TensorRefC = TensorRef<ElementC const, LayoutC>;
-  using TensorRefD = TensorRef<ElementC, LayoutC>;
-  using ElementAccumulator = ElementAccumulator_;
-  using OperatorClass = OperatorClass_;
-  using ArchTag = ArchTag_;
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using EpilogueOutputOp = EpilogueOutputOp_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  using Operator = Operator_;
-  using MathOperator = Operator;
-  static int const kStages = Stages;
-  static int const kAlignmentA = AlignmentA;
-  static int const kAlignmentB = AlignmentB;
-  static int const kAlignmentC = EpilogueOutputOp::kCount;
-  static bool const kSplitKSerial = SplitKSerial;
-  static ComplexTransform const kTransformA = ComplexTransform::kNone;
-  static ComplexTransform const kTransformB = ComplexTransform::kNone;
-
-  /// Define the kernel
-  using GemmKernel = typename kernel::DefaultSparseGemm<
-    ElementA,
-    LayoutA,
-    kAlignmentA,
-    ElementB,
-    LayoutB,
-    kAlignmentB,
-    ElementC,
-    LayoutC,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    kStages,
-    kSplitKSerial,
-    Operator
-  >::GemmKernel;
-
-  using ElementE = typename GemmKernel::ElementE;
-
-  using LayoutE = typename GemmKernel::LayoutE;
-
-  static int const kAlignmentE = 128 / sizeof_bits<ElementE>::value;
-
-  static int const kSparse = GemmKernel::kSparse;
-  static int const kMetaSizeInBits = GemmKernel::kMetaSizeInBits;
-  static int const kElementsPerElementE = GemmKernel::kElementsPerElementE;
-
-  /// Argument structure
-  struct Arguments {
-
-    //
-    // Data members
-    //
-
-    GemmCoord problem_size;
-    TensorRef<ElementA const, LayoutA> ref_A;
-    TensorRef<ElementB const, LayoutB> ref_B;
-    TensorRef<ElementC const, LayoutC> ref_C;
-    TensorRef<ElementC, LayoutC> ref_D;
-    TensorRef<ElementE const, LayoutE> ref_E;
-    typename EpilogueOutputOp::Params epilogue;
-    int split_k_slices;
-
-    //
-    // Methods
-    //
-
-    /// Default ctor
-    CUTLASS_HOST_DEVICE
-    Arguments(): problem_size(0, 0, 0), split_k_slices(1) {
-
-    }
-
-    /// Constructs an Arguments structure 
-    CUTLASS_HOST_DEVICE
-    Arguments(
-      GemmCoord problem_size_,
-      TensorRef<ElementA const, LayoutA> ref_A_,
-      TensorRef<ElementB const, LayoutB> ref_B_,
-      TensorRef<ElementC const, LayoutC> ref_C_,
-      TensorRef<ElementC, LayoutC> ref_D_,
-      TensorRef<ElementE, LayoutE> ref_E_,
-      typename EpilogueOutputOp::Params epilogue_ = 
-        typename EpilogueOutputOp::Params(),
-      int split_k_slices = 1
-    ):
-      problem_size(problem_size_),
-      ref_A(ref_A_),
-      ref_B(ref_B_),
-      ref_C(ref_C_),
-      ref_D(ref_D_),
-      ref_E(ref_E_),
-      epilogue(epilogue_),
-      split_k_slices(split_k_slices) {
-
-    }
-  };
-
-private:
-
-  /// Kernel parameters object
-  typename GemmKernel::Params params_;
-
-public:
-
-  /// Constructs the GEMM.
-  SparseGemm() { }
-
-  /// Determines whether the GEMM can execute the given problem.
-  static Status can_implement(Arguments const &args) {
-
-    if (!kSplitKSerial && args.split_k_slices > 1) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    Status status = GemmKernel::can_implement(
-      args.problem_size,
-      args.ref_A.non_const_ref(),
-      args.ref_B.non_const_ref(),
-      args.ref_C.non_const_ref(),
-      args.ref_D,
-      args.ref_E.non_const_ref()
-    );
-
-    if (status != Status::kSuccess) {
-      return status;
-    }
-
-    return Status::kSuccess;
-  }
-
-  /// Gets the workspace size
-  static size_t get_workspace_size(Arguments const &args) {
-    
-    size_t bytes = 0;
-
-    // Determine grid shape
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord tiled_shape = threadblock_swizzle.get_tiled_shape(
-      args.problem_size, 
-      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-      args.split_k_slices);
-    
-    if (kSplitKSerial && args.split_k_slices > 1) {
-
-      bytes += sizeof(int) * size_t(tiled_shape.m()) * size_t(tiled_shape.n());
-    }
-
-    return bytes;
-  }
-
-  /// Initializes GEMM state from arguments.
-  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
-
-    // Determine grid shape
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord grid_shape = threadblock_swizzle.get_tiled_shape(
-      args.problem_size, 
-      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-      args.split_k_slices);
-
-    if (kSplitKSerial) {
-      if (args.split_k_slices > 1) {
-        if (!workspace) {
-          return Status::kErrorWorkspaceNull;
-        }
-
-        size_t bytes = get_workspace_size(args);
-      
-        cudaError_t result = cudaMemsetAsync(workspace, 0, bytes, stream);
-
-        if (result != cudaSuccess) {
-          return Status::kErrorInternal;
-        }
-      }
-    }
-    else {
-
-      if (args.split_k_slices > 1) {
-        return Status::kErrorInvalidProblem;
-      }
-    }
-
-    // Initialize the Params structure
-    params_ = typename GemmKernel::Params{
-      args.problem_size,
-      grid_shape,
-      args.ref_A.non_const_ref(),
-      args.ref_B.non_const_ref(),
-      args.ref_C.non_const_ref(),
-      args.ref_D,
-      args.ref_E.non_const_ref(),
-      args.epilogue,
-      static_cast<int *>(workspace)
-    };
-    
-    int smem_size = int(sizeof(typename GemmKernel::SharedStorage));
-    if (smem_size >= (48 << 10)) {
-      cudaError_t result = cudaFuncSetAttribute(Kernel<GemmKernel>,
-                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
-                                    smem_size);
-
-      if (result != cudaSuccess) {
-        return Status::kErrorInternal;
-      }
-    }
-
-    return Status::kSuccess;
-  }
-
-  /// Lightweight update given a subset of arguments
-  Status update(Arguments const &args, void *workspace = nullptr) {
-    
-    if (kSplitKSerial && args.split_k_slices > 1) {  
-      if (!workspace) {
-        return Status::kErrorWorkspaceNull;
-      }
-    }
-
-    params_.ref_A.reset(args.ref_A.non_const_ref().data());
-    params_.ref_B.reset(args.ref_B.non_const_ref().data());
-    params_.ref_C.reset(args.ref_C.non_const_ref().data());
-    params_.ref_D.reset(args.ref_D.data());
-    params_.ref_E.reset(args.ref_E.non_const_ref().data());
-    params_.output_op = args.epilogue;
-    params_.semaphore = static_cast<int *>(workspace);
-
-    return Status::kSuccess;
-  }
-
-  /// Runs the kernel using initialized state.
-  Status run(cudaStream_t stream = nullptr) {
-
-    ThreadblockSwizzle threadblock_swizzle;
-
-    dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
-    dim3 block(GemmKernel::kThreadCount, 1, 1);
-
-    int smem_size = int(sizeof(typename GemmKernel::SharedStorage));
-
-    cutlass::arch::synclog_setup();
-    cutlass::Kernel<GemmKernel><<<grid, block, smem_size, stream>>>(params_);
-
-    cudaError_t result = cudaGetLastError();
-
-    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(cudaStream_t stream = nullptr) {
-    return run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(
-    Arguments const &args, 
-    void *workspace = nullptr, 
-    cudaStream_t stream = nullptr) {
-    
-    Status status = initialize(args, workspace, stream);
-    
-    if (status == Status::kSuccess) {
-      status = run(stream);
-    }
-
-    return status;
-  }
-};
-
-} // namespace device
-} // namespace gemm
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse_universal.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse_universal.h
deleted file mode 100755
index b7d8cecfa..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse_universal.h
+++ /dev/null
@@ -1,211 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief
-*/
-
-#pragma once
-
-#include "cutlass/arch/mma.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/device_kernel.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-#include "cutlass/gemm/kernel/gemm_sparse_universal.h"
-
-#include "cutlass/gemm/kernel/default_gemm_sparse_universal.h"
-#include "cutlass/gemm/device/default_gemm_configuration.h"
-#include "cutlass/gemm/device/gemm_universal_base.h"
-
-#include "cutlass/layout/permute.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace device {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/*! 
-  GemmSparseUniversal is a stateful, reusable Sparse GEMM handle.  Once initialized for a given GEMM computation
-  (problem geometry and data references), it can be reused across different GEMM problems having the
-  geometry.  (Once initialized, details regarding problem geometry and references to workspace memory
-  cannot be updated.)
-
-  The universal GEMM accommodates serial reductions, parallel reductions, batched strided, and 
-  batched array variants.
-*/
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_ = ElementC_,
-    /// Operator class tag
-    typename OperatorClass_ = arch::OpClassTensorOp,
-    /// Tag indicating architecture to tune for.  This is the minimum SM that
-    /// supports the intended feature. The device kernel can be built
-    /// targeting any SM larger than this number.
-    typename ArchTag_ = arch::Sm80,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle<>,
-    /// Number of stages used in the pipelined mainloop
-    int Stages =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kStages,
-    /// Access granularity of A matrix in units of elements
-    int AlignmentA =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentA,
-    /// Access granularity of B matrix in units of elements
-    int AlignmentB =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentB,
-    /// Operation performed by GEMM
-    typename Operator_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::Operator>
-class GemmSparseUniversal : 
-  public GemmUniversalBase<
-    typename kernel::DefaultGemmSparseUniversal<
-      ElementA_,
-      LayoutA_,
-      AlignmentA,
-      ElementB_,
-      LayoutB_,
-      AlignmentB,
-      ElementC_,
-      LayoutC_,
-      ElementAccumulator_,
-      OperatorClass_,
-      ArchTag_,
-      ThreadblockShape_,
-      WarpShape_,
-      InstructionShape_,
-      EpilogueOutputOp_,
-      ThreadblockSwizzle_,
-      Stages,
-      Operator_
-    >::GemmKernel
-  > {
-
- public:
-
-  static_assert((platform::is_same<LayoutC_, layout::RowMajor>::value),
-             "Epilogue of Ampere sparse GEMM must be row major for now.");
-
-  using ElementAccumulator = ElementAccumulator_;
-  using OperatorClass = OperatorClass_;
-  using ArchTag = ArchTag_;
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using EpilogueOutputOp = EpilogueOutputOp_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  using Operator = Operator_;
-  static int const kStages = Stages;
-  static int const kAlignmentA = AlignmentA;
-  static int const kAlignmentB = AlignmentB;
-  static int const kAlignmentC = EpilogueOutputOp::kCount;
-
-  using Base = GemmUniversalBase<
-    typename kernel::DefaultGemmSparseUniversal<
-      ElementA_,
-      LayoutA_,
-      AlignmentA,
-      ElementB_,
-      LayoutB_,
-      AlignmentB,
-      ElementC_,
-      LayoutC_,
-      ElementAccumulator_,
-      OperatorClass_,
-      ArchTag_,
-      ThreadblockShape_,
-      WarpShape_,
-      InstructionShape_,
-      EpilogueOutputOp_,
-      ThreadblockSwizzle_,
-      Stages,
-      Operator_
-    >::GemmKernel
-  >;
-
-  using Arguments = typename Base::Arguments;
-  using GemmKernel = typename Base::GemmKernel;
-
-  using ElementE = typename GemmKernel::ElementE;
-
-  using LayoutE = typename GemmKernel::LayoutE;
-
-  static int const kAlignmentE = 128 / sizeof_bits<ElementE>::value;
-
-  static int const kSparse = GemmKernel::kSparse;
-  static int const kMetaSizeInBits = GemmKernel::kMetaSizeInBits;
-  static int const kElementsPerElementE = GemmKernel::kElementsPerElementE;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace device
-} // namespace gemm
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse_universal_with_absmax.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse_universal_with_absmax.h
deleted file mode 100755
index a313ddc90..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse_universal_with_absmax.h
+++ /dev/null
@@ -1,202 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief
-*/
-
-#pragma once
-
-#include "cutlass/arch/mma.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/device_kernel.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-#include "cutlass/gemm/kernel/gemm_sparse_universal.h"
-
-#include "cutlass/gemm/kernel/default_gemm_sparse_universal_with_absmax.h"
-#include "cutlass/gemm/device/default_gemm_configuration.h"
-#include "cutlass/gemm/device/gemm_universal_base.h"
-
-#include "cutlass/layout/permute.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace device {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_ = ElementC_,
-    /// Operator class tag
-    typename OperatorClass_ = arch::OpClassTensorOp,
-    /// Tag indicating architecture to tune for.  This is the minimum SM that
-    /// supports the intended feature. The device kernel can be built
-    /// targeting any SM larger than this number.
-    typename ArchTag_ = arch::Sm80,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle<>,
-    /// Number of stages used in the pipelined mainloop
-    int Stages =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kStages,
-    /// Access granularity of A matrix in units of elements
-    int AlignmentA =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentA,
-    /// Access granularity of B matrix in units of elements
-    int AlignmentB =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentB,
-    /// Operation performed by GEMM
-    typename Operator_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::Operator>
-class GemmSparseUniversalWithAbsmax :
-  public GemmUniversalBase<
-    typename kernel::DefaultGemmSparseUniversalWithAbsmax<
-      ElementA_,
-      LayoutA_,
-      AlignmentA,
-      ElementB_,
-      LayoutB_,
-      AlignmentB,
-      ElementC_,
-      LayoutC_,
-      ElementAccumulator_,
-      OperatorClass_,
-      ArchTag_,
-      ThreadblockShape_,
-      WarpShape_,
-      InstructionShape_,
-      EpilogueOutputOp_,
-      ThreadblockSwizzle_,
-      Stages,
-      Operator_
-    >::GemmKernel
-  > {
-
- public:
-
-  static_assert((platform::is_same<LayoutC_, layout::RowMajor>::value),
-             "Epilogue of Ada sparse GEMM must be row major for now.");
-
-  using ElementAccumulator = ElementAccumulator_;
-  using OperatorClass = OperatorClass_;
-  using ArchTag = ArchTag_;
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using EpilogueOutputOp = EpilogueOutputOp_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  using Operator = Operator_;
-  static int const kStages = Stages;
-  static int const kAlignmentA = AlignmentA;
-  static int const kAlignmentB = AlignmentB;
-  static int const kAlignmentC = EpilogueOutputOp::kCount;
-
-  using Base = GemmUniversalBase<
-    typename kernel::DefaultGemmSparseUniversalWithAbsmax<
-      ElementA_,
-      LayoutA_,
-      AlignmentA,
-      ElementB_,
-      LayoutB_,
-      AlignmentB,
-      ElementC_,
-      LayoutC_,
-      ElementAccumulator_,
-      OperatorClass_,
-      ArchTag_,
-      ThreadblockShape_,
-      WarpShape_,
-      InstructionShape_,
-      EpilogueOutputOp_,
-      ThreadblockSwizzle_,
-      Stages,
-      Operator_
-    >::GemmKernel
-  >;
-
-  using Arguments = typename Base::Arguments;
-  using GemmKernel = typename Base::GemmKernel;
-
-  using ElementE = typename GemmKernel::ElementE;
-
-  using LayoutE = typename GemmKernel::LayoutE;
-
-  static int const kAlignmentE = 128 / sizeof_bits<ElementE>::value;
-
-  static int const kSparse = GemmKernel::kSparse;
-  static int const kMetaSizeInBits = GemmKernel::kMetaSizeInBits;
-  static int const kElementsPerElementE = GemmKernel::kElementsPerElementE;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace device
-} // namespace gemm
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse_with_absmax.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse_with_absmax.h
deleted file mode 100755
index e599217a1..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse_with_absmax.h
+++ /dev/null
@@ -1,360 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a sparse GEMM kernel that computes the absolute maximum of the output tensor
-    and applies additional scaling factors to operands.
-*/
-
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/device_kernel.h"
-
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-#include "cutlass/gemm/kernel/sparse_gemm.h"
-
-#include "cutlass/gemm/kernel/default_gemm_sparse_with_absmax.h"
-#include "cutlass/gemm/device/default_gemm_configuration.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace device {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_ = ElementC_,
-    /// Operator class tag
-    typename OperatorClass_ = arch::OpClassSimt,
-    /// Tag indicating architecture to tune for
-    typename ArchTag_ = arch::Sm70,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle_ =
-        typename threadblock::GemmIdentityThreadblockSwizzle<>,
-    /// Number of stages used in the pipelined mainloop
-    int Stages =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kStages,
-    /// Access granularity of A matrix in units of elements
-    int AlignmentA =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentA,
-    /// Access granularity of B matrix in units of elements
-    int AlignmentB =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentB,
-    /// If true, kernel supports split-K with serial reduction
-    bool SplitKSerial = false,
-    /// Operation performed by GEMM
-    typename Operator_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::Operator>
-class SparseGemmWithAbsmax {
- public:
-
-  using ElementA = ElementA_;
-  using LayoutA = LayoutA_;
-  using TensorRefA = TensorRef<ElementA const, LayoutA>;
-  using ElementB = ElementB_;
-  using LayoutB = LayoutB_;
-  using TensorRefB = TensorRef<ElementB const, LayoutB>;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using TensorRefC = TensorRef<ElementC const, LayoutC>;
-  using TensorRefD = TensorRef<ElementC, LayoutC>;
-  using ElementAccumulator = ElementAccumulator_;
-  using OperatorClass = OperatorClass_;
-  using ArchTag = ArchTag_;
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using EpilogueOutputOp = EpilogueOutputOp_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  using Operator = Operator_;
-  using MathOperator = Operator;
-  static int const kStages = Stages;
-  static int const kAlignmentA = AlignmentA;
-  static int const kAlignmentB = AlignmentB;
-  static int const kAlignmentC = EpilogueOutputOp::kCount;
-  static bool const kSplitKSerial = SplitKSerial;
-  static ComplexTransform const kTransformA = ComplexTransform::kNone;
-  static ComplexTransform const kTransformB = ComplexTransform::kNone;
-
-  /// Define the kernel
-  using GemmKernel = typename kernel::DefaultSparseGemmWithAbsmax<
-    ElementA,
-    LayoutA,
-    kAlignmentA,
-    ElementB,
-    LayoutB,
-    kAlignmentB,
-    ElementC,
-    LayoutC,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    kStages,
-    kSplitKSerial,
-    Operator
-  >::GemmKernel;
-
-  using ElementE = typename GemmKernel::ElementE;
-
-  using LayoutE = typename GemmKernel::LayoutE;
-
-  static int const kAlignmentE = 128 / sizeof_bits<ElementE>::value;
-
-  static int const kSparse = GemmKernel::kSparse;
-  static int const kMetaSizeInBits = GemmKernel::kMetaSizeInBits;
-  static int const kElementsPerElementE = GemmKernel::kElementsPerElementE;
-
-  using Arguments = typename GemmKernel::Arguments;
-
-private:
-
-  /// Kernel parameters object
-  typename GemmKernel::Params params_;
-
-public:
-
-  /// Constructs the GEMM.
-  SparseGemmWithAbsmax() { }
-
-  /// Determines whether the GEMM can execute the given problem.
-  static Status can_implement(Arguments const &args) {
-
-    if (!kSplitKSerial && args.split_k_slices > 1) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    Status status = GemmKernel::can_implement(
-      args.problem_size,
-      args.ref_A.non_const_ref(),
-      args.ref_B.non_const_ref(),
-      args.ref_C.non_const_ref(),
-      args.ref_D,
-      args.ref_E.non_const_ref()
-    );
-
-    if (status != Status::kSuccess) {
-      return status;
-    }
-
-    return Status::kSuccess;
-  }
-
-  /// Gets the workspace size
-  static size_t get_workspace_size(Arguments const &args) {
-    
-    size_t bytes = 0;
-
-    // Determine grid shape
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord tiled_shape = threadblock_swizzle.get_tiled_shape(
-      args.problem_size, 
-      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-      args.split_k_slices);
-    
-    if (kSplitKSerial && args.split_k_slices > 1) {
-
-      bytes += sizeof(int) * size_t(tiled_shape.m()) * size_t(tiled_shape.n());
-    }
-
-    return bytes;
-  }
-
-  /// Initializes GEMM state from arguments.
-  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
-
-    // Determine grid shape
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord grid_shape = threadblock_swizzle.get_tiled_shape(
-      args.problem_size, 
-      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-      args.split_k_slices);
-
-    if (kSplitKSerial) {
-      if (args.split_k_slices > 1) {
-        if (!workspace) {
-          return Status::kErrorWorkspaceNull;
-        }
-
-        size_t bytes = get_workspace_size(args);
-      
-        cudaError_t result = cudaMemsetAsync(workspace, 0, bytes, stream);
-
-        if (result != cudaSuccess) {
-          return Status::kErrorInternal;
-        }
-      }
-    }
-    else {
-
-      if (args.split_k_slices > 1) {
-        return Status::kErrorInvalidProblem;
-      }
-    }
-
-    // Initialize the Params structure
-    params_ = typename GemmKernel::Params{
-      args.problem_size,
-      grid_shape,
-      args.ref_A.non_const_ref(),
-      args.ref_B.non_const_ref(),
-      args.ref_C.non_const_ref(),
-      args.ref_D,
-      args.ref_E.non_const_ref(),
-      args.ref_Aux,
-      args.ptr_Vector,
-      args.ldr,
-      args.epilogue,
-      static_cast<int *>(workspace)
-    };
-    
-    int smem_size = int(sizeof(typename GemmKernel::SharedStorage));
-    if (smem_size >= (48 << 10)) {
-      cudaError_t result = cudaFuncSetAttribute(Kernel<GemmKernel>,
-                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
-                                    smem_size);
-
-      if (result != cudaSuccess) {
-        return Status::kErrorInternal;
-      }
-    }
-
-    return Status::kSuccess;
-  }
-
-  /// Lightweight update given a subset of arguments
-  Status update(Arguments const &args, void *workspace = nullptr) {
-    
-    if (kSplitKSerial && args.split_k_slices > 1) {  
-      if (!workspace) {
-        return Status::kErrorWorkspaceNull;
-      }
-    }
-
-    params_.ref_A.reset(args.ref_A.non_const_ref().data());
-    params_.ref_B.reset(args.ref_B.non_const_ref().data());
-    params_.ref_C.reset(args.ref_C.non_const_ref().data());
-    params_.ref_D.reset(args.ref_D.data());
-    params_.ref_E.reset(args.ref_E.non_const_ref().data());
-    params_.output_op = args.epilogue;
-    params_.semaphore = static_cast<int *>(workspace);
-
-    return Status::kSuccess;
-  }
-
-  /// Runs the kernel using initialized state.
-  Status run(cudaStream_t stream = nullptr) {
-
-    ThreadblockSwizzle threadblock_swizzle;
-
-    dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
-    dim3 block(GemmKernel::kThreadCount, 1, 1);
-
-    int smem_size = int(sizeof(typename GemmKernel::SharedStorage));
-
-    cutlass::arch::synclog_setup();
-    cutlass::Kernel<GemmKernel><<<grid, block, smem_size, stream>>>(params_);
-
-    cudaError_t result = cudaGetLastError();
-
-    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(cudaStream_t stream = nullptr) {
-    return run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(
-    Arguments const &args, 
-    void *workspace = nullptr, 
-    cudaStream_t stream = nullptr) {
-    
-    Status status = initialize(args, workspace, stream);
-    
-    if (status == Status::kSuccess) {
-      status = run(stream);
-    }
-
-    return status;
-  }
-};
-
-} // namespace device
-} // namespace gemm
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse_with_visitor.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse_with_visitor.h
deleted file mode 100755
index 73edfa35d..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse_with_visitor.h
+++ /dev/null
@@ -1,342 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/device_kernel.h"
-
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-#include "cutlass/gemm/kernel/sparse_gemm.h"
-
-#include "cutlass/gemm/kernel/default_gemm_sparse_with_visitor.h"
-#include "cutlass/gemm/device/default_gemm_configuration.h"
-
-#include "cutlass/epilogue/threadblock/fusion/visitor_2x.hpp"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace device {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/*! Sparse GEMM with visitor
- */
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_ = ElementC_,
-    /// Operator class tag
-    typename OperatorClass_ = arch::OpClassSimt,
-    /// Tag indicating architecture to tune for
-    typename ArchTag_ = arch::Sm80,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::InstructionShape,
-    /// Epilogue output operator
-    typename FusionCallbacks_ =
-        typename cutlass::epilogue::threadblock::detail::EmptyCallbacks,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle_ =
-        typename threadblock::GemmIdentityThreadblockSwizzle<>,
-    /// Number of stages used in the pipelined mainloop
-    int Stages =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kStages,
-    /// Access granularity of A matrix in units of elements
-    int AlignmentA =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentA,
-    /// Access granularity of B matrix in units of elements
-    int AlignmentB =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentB,
-    /// Operation performed by GEMM
-    typename Operator_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::Operator,
-    /// Number of stages used in the pipelined epilogue
-    int EpilogueStages = 1>
-class SparseGemmWithVisitor {
- public:
-
-  using ElementA = ElementA_;
-  using LayoutA = LayoutA_;
-  using TensorRefA = TensorRef<ElementA const, LayoutA>;
-  using ElementB = ElementB_;
-  using LayoutB = LayoutB_;
-  using TensorRefB = TensorRef<ElementB const, LayoutB>;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using ElementAccumulator = ElementAccumulator_;
-  using OperatorClass = OperatorClass_;
-  using ArchTag = ArchTag_;
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using FusionCallbacks = FusionCallbacks_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  using Operator = Operator_;
-  using MathOperator = Operator;
-  static int const kStages = Stages;
-  static int const kAlignmentA = AlignmentA;
-  static int const kAlignmentB = AlignmentB;
-
-  /// Define the kernel
-  using GemmKernel = typename kernel::DefaultSparseGemmWithVisitor<
-    ElementA,
-    LayoutA,
-    kAlignmentA,
-    ElementB,
-    LayoutB,
-    kAlignmentB,
-    ElementC,
-    LayoutC,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    FusionCallbacks,
-    ThreadblockSwizzle,
-    kStages,
-    Operator,
-    EpilogueStages
-  >::GemmKernel;
-
-  using ElementE = typename GemmKernel::ElementE;
-
-  using LayoutE = typename GemmKernel::LayoutE;
-
-  static int const kAlignmentE = 128 / sizeof_bits<ElementE>::value;
-
-  static int const kSparse = GemmKernel::kSparse;
-  static int const kMetaSizeInBits = GemmKernel::kMetaSizeInBits;
-  static int const kElementsPerElementE = GemmKernel::kElementsPerElementE;
-
-  /// Argument structure
-  struct Arguments {
-
-    //
-    // Data members
-    //
-
-    GemmCoord problem_size;
-    TensorRef<ElementA const, LayoutA> ref_A;
-    TensorRef<ElementB const, LayoutB> ref_B;
-    TensorRef<ElementE const, LayoutE> ref_E;
-    typename FusionCallbacks::Arguments epilogue;
-
-    //
-    // Methods
-    //
-
-    /// Default ctor
-    CUTLASS_HOST_DEVICE
-    Arguments(): problem_size(0, 0, 0) {
-
-    }
-
-    /// Constructs an Arguments structure 
-    CUTLASS_HOST_DEVICE
-    Arguments(
-      GemmCoord problem_size_,
-      TensorRef<ElementA const, LayoutA> ref_A_,
-      TensorRef<ElementB const, LayoutB> ref_B_,
-      TensorRef<ElementE, LayoutE> ref_E_,
-      typename FusionCallbacks::Arguments epilogue_ = 
-        typename FusionCallbacks::Arguments()
-    ):
-      problem_size(problem_size_),
-      ref_A(ref_A_),
-      ref_B(ref_B_),
-      ref_E(ref_E_),
-      epilogue(epilogue_) {
-
-    }
-  };
-
-private:
-
-  /// Kernel parameters object
-  typename GemmKernel::Params params_;
-
-public:
-
-  /// Constructs the GEMM.
-  SparseGemmWithVisitor() { }
-
-  /// Determines whether the GEMM can execute the given problem.
-  static Status can_implement(Arguments const &args) {
-
-    Status status = GemmKernel::can_implement(
-      args.problem_size,
-      args.ref_A.non_const_ref(),
-      args.ref_B.non_const_ref(),
-      cutlass::TensorRef<ElementC, LayoutC>(), // It only matters that it's empty.
-      cutlass::TensorRef<ElementC, LayoutC>(), // Same as above.
-      args.ref_E.non_const_ref()
-    );
-
-    if (status != Status::kSuccess) {
-      return status;
-    }
-
-    return Status::kSuccess;
-  }
-
-  /// Gets the workspace size
-  static size_t get_workspace_size(Arguments const &args) {
-
-    size_t bytes = 0;
-
-    return bytes;
-  }
-
-  /// Initializes GEMM state from arguments.
-  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
-
-    constexpr int SplitKSlices = 1;
-
-    // Determine grid shape
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord grid_shape = threadblock_swizzle.get_tiled_shape(
-      args.problem_size, 
-      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-      SplitKSlices);
-
-    // Initialize the Params structure
-    params_ = typename GemmKernel::Params{
-      args.problem_size,
-      grid_shape,
-      args.ref_A.non_const_ref(),
-      args.ref_B.non_const_ref(),
-      args.ref_E.non_const_ref(),
-      args.epilogue
-    };
-
-    int smem_size = int(sizeof(typename GemmKernel::SharedStorage));
-    if (smem_size >= (48 << 10)) {
-      cudaError_t result = cudaFuncSetAttribute(Kernel<GemmKernel>,
-                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
-                                    smem_size);
-
-      if (result != cudaSuccess) {
-        return Status::kErrorInternal;
-      }
-    }
-
-    return Status::kSuccess;
-  }
-
-  /// Lightweight update given a subset of arguments
-  Status update(Arguments const &args, void *workspace = nullptr) {
-
-    params_.ref_A.reset(args.ref_A.non_const_ref().data());
-    params_.ref_B.reset(args.ref_B.non_const_ref().data());
-    params_.ref_E.reset(args.ref_E.non_const_ref().data());
-    params_.output_op = args.epilogue;
-
-    return Status::kSuccess;
-  }
-
-  /// Runs the kernel using initialized state.
-  Status run(cudaStream_t stream = nullptr) {
-
-    ThreadblockSwizzle threadblock_swizzle;
-
-    dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
-    dim3 block(GemmKernel::kThreadCount, 1, 1);
-
-    int smem_size = int(sizeof(typename GemmKernel::SharedStorage));
-
-    cutlass::Kernel<GemmKernel><<<grid, block, smem_size, stream>>>(params_);
-
-    cudaError_t result = cudaGetLastError();
-
-    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(cudaStream_t stream = nullptr) {
-    return run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(
-    Arguments const &args, 
-    void *workspace = nullptr, 
-    cudaStream_t stream = nullptr) {
-
-    Status status = initialize(args, workspace, stream);
-
-    if (status == Status::kSuccess) {
-      status = run(stream);
-    }
-
-    return status;
-  }
-};
-
-} // namespace device
-} // namespace gemm
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_splitk_parallel.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_splitk_parallel.h
deleted file mode 100755
index f78c5a216..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_splitk_parallel.h
+++ /dev/null
@@ -1,636 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for GEMM performing a reduction over K partitions in parallel.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/device_kernel.h"
-
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-#include "cutlass/gemm/kernel/gemm.h"
-
-#include "cutlass/gemm/kernel/default_gemm_splitk_parallel.h"
-#include "cutlass/gemm/device/default_gemm_configuration.h"
-
-#include "cutlass/epilogue/thread/conversion_op.h"
-#include "cutlass/reduction/kernel/reduce_split_k.h"
-#include "cutlass/reduction/thread/reduction_operators.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace device {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/*! 
-  Gemm device-level operator performing parallel reduction over the K partition.
-
-*/
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_ = ElementC_,
-    /// Operator class tag
-    typename OperatorClass_ = arch::OpClassSimt,
-    /// Tag indicating architecture to tune for.  This is the minimum SM that
-      /// supports the intended feature. The device kernel can be built
-      /// targeting any SM larger than this number.
-    typename ArchTag_ = arch::Sm70,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::EpilogueOutputOp,
-    /// Epilogue output operator
-    typename ConvertScaledOp_ = cutlass::epilogue::thread::Convert<
-        ElementAccumulator_,
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementAccumulator_,
-                                 ElementAccumulator_>::EpilogueOutputOp::kCount,
-        ElementAccumulator_>,
-    /// Reduction operator
-    typename ReductionOp_ = cutlass::reduction::thread::ReduceAdd<
-        ElementAccumulator_, typename EpilogueOutputOp_::ElementAccumulator,
-        EpilogueOutputOp_::kCount>,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle_ =
-        threadblock::GemmSplitKHorizontalThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kStages,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentA,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentB,
-    /// Operation performed by GEMM
-    typename Operator_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::Operator>
-class GemmSplitKParallel {
- public:
-
-  using ElementA = ElementA_;
-  using LayoutA = LayoutA_;
-  using ElementB = ElementB_;
-  using LayoutB = LayoutB_;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using ElementAccumulator = ElementAccumulator_;
-  using OperatorClass = OperatorClass_;
-  using ArchTag = ArchTag_;
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ConvertScaledOp = ConvertScaledOp_;
-  using EpilogueOutputOp = EpilogueOutputOp_;
-  using ReductionOp = ReductionOp_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  using Operator = Operator_;
-  static int const kStages = Stages;
-
-  /// GEMM kernel 
-  using GemmKernel = typename kernel::DefaultGemmSplitKParallel<
-    ElementA,
-    LayoutA,
-    kAlignmentA,
-    ElementB,
-    LayoutB,
-    kAlignmentB,
-    ElementAccumulator,
-    LayoutC,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    ConvertScaledOp,
-    ThreadblockSwizzle,
-    kStages,
-    Operator
-  >::GemmKernel;
-
-  /// Reduction kernel
-  using ReductionKernel = cutlass::reduction::kernel::ReduceSplitK<
-    cutlass::MatrixShape<4, 32 * EpilogueOutputOp::kCount>,
-    EpilogueOutputOp,
-    ReductionOp
-  >;
-
-  //
-  //
-  //
-
-  /// Argument structure
-  struct Arguments {
-
-    //
-    // Data members
-    //
-
-    GemmCoord problem_size;
-    TensorRef<ElementA const, LayoutA> ref_A;
-    TensorRef<ElementB const, LayoutB> ref_B;
-    TensorRef<ElementC const, LayoutC> ref_C;
-    TensorRef<ElementC, LayoutC> ref_D;
-    typename EpilogueOutputOp::Params epilogue;
-    int split_k_slices;
-    typename ConvertScaledOp::Params convert;
-    typename ReductionOp::Params reduction;
-
-    //
-    // Methods
-    //
-
-    /// Default ctor
-    CUTLASS_HOST_DEVICE
-    Arguments() { }
-
-    /// Constructs an Arguments structure 
-    CUTLASS_HOST_DEVICE
-    Arguments(
-      GemmCoord problem_size_,
-      TensorRef<ElementA const, LayoutA> ref_A_,
-      TensorRef<ElementB const, LayoutB> ref_B_,
-      TensorRef<ElementC const, LayoutC> ref_C_,
-      TensorRef<ElementC, LayoutC> ref_D_,
-      typename EpilogueOutputOp::Params epilogue_ = 
-        typename EpilogueOutputOp::Params(),
-      int split_k_slices = 1,
-      typename ConvertScaledOp::Params convert_ = 
-        typename ConvertScaledOp::Params(),
-      typename ReductionOp::Params reduction_ =
-        typename ReductionOp::Params()
-    ):
-      problem_size(problem_size_),
-      ref_A(ref_A_),
-      ref_B(ref_B_),
-      ref_C(ref_C_),
-      ref_D(ref_D_),
-      epilogue(epilogue_),
-      split_k_slices(split_k_slices),
-      convert(convert_),
-      reduction(reduction_) { }
-  };
-
-private:
-
-  /// Kernel parameters object
-  typename GemmKernel::Params gemm_params_;
-
-  /// Reduction kernel parameters object
-  typename ReductionKernel::Params reduction_params_;
-
-public:
-
-  /// Constructs the GEMM.
-  GemmSplitKParallel() { }
-
-  /// Determines whether the GEMM can execute the given problem.
-  static Status can_implement(Arguments const &args) {
-    return Status::kSuccess;
-  }
-
-  /// Gets the workspace size
-  static size_t get_workspace_size(Arguments const &args) {
-    
-    // Determine grid shape
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord grid_shape = threadblock_swizzle.get_tiled_shape(
-      args.problem_size, 
-      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-      args.split_k_slices);
-
-    return sizeof(ElementAccumulator_) * size_t(args.problem_size.m()) * size_t(args.problem_size.n()) * grid_shape.k();
-  }
-
-  /// Initializes GEMM state from arguments.
-  Status initialize(Arguments const &args, void *workspace) {
-
-    // Determine grid shape
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord grid_shape = threadblock_swizzle.get_tiled_shape(
-      args.problem_size, 
-      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-      args.split_k_slices);
-
-    // Define a reference to the workspace - this is an aligned region in device memory.
-    if (!workspace) {
-      return Status::kErrorWorkspaceNull;
-    }
-    
-    TensorRef<ElementAccumulator_, layout::RowMajor> ref_workspace(
-      static_cast<ElementAccumulator_ *>(workspace), 
-      args.problem_size.n());
-
-    int64_t partition_stride = int64_t(args.problem_size.m()) * int64_t(args.problem_size.n());
-
-    // Initialize the Params structure
-    gemm_params_ = typename GemmKernel::Params{
-      args.problem_size,
-      grid_shape,
-      args.ref_A.non_const_ref(),
-      args.ref_B.non_const_ref(),
-      ref_workspace,
-      args.convert,
-      partition_stride
-    };
-
-    reduction_params_ = typename ReductionKernel::Params(
-      args.problem_size.mn(),
-      grid_shape.k(),
-      partition_stride,
-      ref_workspace,
-      args.ref_D,
-      args.ref_C.non_const_ref(),
-      args.epilogue
-    );
-
-    return Status::kSuccess;
-  }
-
-  /// Lightweight update given a subset of arguments
-  Status update(Arguments const &args, void *workspace = nullptr) {
-
-    if (!workspace) {
-      return Status::kErrorWorkspaceNull;
-    }
-
-    gemm_params_.ref_A.reset(args.ref_A.data());
-    gemm_params_.ref_B.reset(args.ref_B.data());
-    gemm_params_.ref_D.reset(workspace);     
-
-    reduction_params_.ref_D.reset(args.ref_D.data());
-    reduction_params_.ref_C.reset(args.ref_C.data());
-
-    return Status::kSuccess;
-  }
-
-  /// Runs the kernel using initialized state.
-  Status run(cudaStream_t stream = nullptr) {
-
-    //
-    // Launch GEMM kernel
-    //
-
-    ThreadblockSwizzle threadblock_swizzle;
-
-    dim3 grid = threadblock_swizzle.get_grid_shape(gemm_params_.grid_tiled_shape);
-    dim3 block(GemmKernel::kThreadCount, 1, 1);
-
-    cudaError_t result;
-
-    int smem_size = int(sizeof(typename GemmKernel::SharedStorage));
-    if (smem_size >= (48 << 10)) {
-
-      result = cudaFuncSetAttribute(
-        Kernel<GemmKernel>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize,
-        smem_size);
-
-      if (result != cudaSuccess) {
-        return Status::kErrorInternal;
-      }
-    }
-
-    cutlass::arch::synclog_setup();
-    Kernel<GemmKernel><<<grid, block, smem_size, stream>>>(gemm_params_);
-
-    result = cudaGetLastError();
-
-    if (result != cudaSuccess) {
-      return Status::kErrorInternal;
-    }
-
-    //
-    // Launch reduction kernel
-    //
-
-    block = ReductionKernel::block_shape();
-    grid = ReductionKernel::grid_shape(gemm_params_.problem_size.mn());
-
-    Kernel<ReductionKernel><<< grid, block, 0, stream >>>(reduction_params_);
-
-    result = cudaGetLastError();
-
-    if (result != cudaSuccess) {
-      return Status::kErrorInternal;
-    }
-
-    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(cudaStream_t stream = nullptr) {
-    return run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(
-    Arguments const &args, 
-    void *workspace = nullptr, 
-    cudaStream_t stream = nullptr) {
-    
-    Status status = initialize(args, workspace);
-    
-    if (status == Status::kSuccess) {
-      status = run(stream);
-    }
-
-    return status;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for column-major output
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_,
-    /// Operator class tag
-    typename OperatorClass_,
-    /// Tag indicating architecture to tune for.  This is the minimum SM that
-      /// supports the intended feature. The device kernel can be built
-      /// targeting any SM larger than this number.
-    typename ArchTag_,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_,
-    /// Epilogue output operator
-    typename EpilogueOutputOp_,
-    /// Epilogue output operator
-    typename ConvertScaledOp_,
-    /// Reduction operator
-    typename ReductionOp_,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle_,
-    /// Number of stages used in the pipelined mainloop
-    int Stages, int kAlignmentA, int kAlignmentB,
-    /// Operation performed by GEMM
-    typename Operator_>
-class GemmSplitKParallel<ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_,
-                         layout::ColumnMajor, ElementAccumulator_,
-                         OperatorClass_, ArchTag_, ThreadblockShape_,
-                         WarpShape_, InstructionShape_, EpilogueOutputOp_,
-                         ConvertScaledOp_, ReductionOp_, ThreadblockSwizzle_,
-                         Stages, kAlignmentA, kAlignmentB, Operator_> {
- public:
-
-  using ElementA = ElementA_;
-  using LayoutA = LayoutA_;
-  using ElementB = ElementB_;
-  using LayoutB = LayoutB_;
-  using ElementC = ElementC_;
-  using LayoutC = layout::ColumnMajor;
-  using ElementAccumulator = ElementAccumulator_;
-  using OperatorClass = OperatorClass_;
-  using ArchTag = ArchTag_;
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ConvertScaledOp = ConvertScaledOp_;
-  using EpilogueOutputOp = EpilogueOutputOp_;
-  using ReductionOp = ReductionOp_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  using Operator = Operator_;
-  static int const kStages = Stages;
-
-  using UnderlyingOperator = GemmSplitKParallel< 
-    ElementB,
-    typename layout::LayoutTranspose<LayoutB>::type,
-    ElementA,
-    typename layout::LayoutTranspose<LayoutA>::type,
-    ElementC,
-    layout::RowMajor,    
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ConvertScaledOp,
-    ReductionOp,
-    ThreadblockSwizzle,
-    Stages,
-    kAlignmentA,
-    kAlignmentB,
-    Operator
-  >;
-
-  using UnderlyingArguments = typename UnderlyingOperator::Arguments;
-  using GemmKernel = typename UnderlyingOperator::GemmKernel;
-  using ReductionKernel = typename UnderlyingOperator::ReductionKernel;
-
-  /// Argument structure
-  struct Arguments {
-
-    //
-    // Data members
-    //
-
-    GemmCoord problem_size;
-    TensorRef<ElementA const, LayoutA> ref_A;
-    TensorRef<ElementB const, LayoutB> ref_B;
-    TensorRef<ElementC const, LayoutC> ref_C;
-    TensorRef<ElementC, LayoutC> ref_D;
-    typename EpilogueOutputOp::Params epilogue;
-    int split_k_slices;
-    typename ConvertScaledOp::Params convert;
-    typename ReductionOp::Params reduction;
-
-    //
-    // Methods
-    //
-
-    /// Default ctor
-    CUTLASS_HOST_DEVICE
-    Arguments() { }
-
-    /// Constructs an Arguments structure 
-    CUTLASS_HOST_DEVICE
-    Arguments(
-      GemmCoord problem_size_,
-      TensorRef<ElementA const, LayoutA> ref_A_,
-      TensorRef<ElementB const, LayoutB> ref_B_,
-      TensorRef<ElementC const, LayoutC> ref_C_,
-      TensorRef<ElementC, LayoutC> ref_D_,
-      typename EpilogueOutputOp::Params epilogue_ = 
-        typename EpilogueOutputOp::Params(),
-      int split_k_slices = 1,
-      typename ConvertScaledOp::Params convert_ = 
-        typename ConvertScaledOp::Params(),
-      typename ReductionOp::Params reduction_ =
-        typename ReductionOp::Params()
-    ):
-      problem_size(problem_size_),
-      ref_A(ref_A_),
-      ref_B(ref_B_),
-      ref_C(ref_C_),
-      ref_D(ref_D_),
-      epilogue(epilogue_),
-      split_k_slices(split_k_slices),
-      convert(convert_),
-      reduction(reduction_) { }
-  };
-
-private:
-
-  /// Kernel parameters object
-  UnderlyingOperator underlying_operator_;
-
-public:
-
-  /// Constructs the GEMM.
-  GemmSplitKParallel() { }
-
-  /// Helper to construct a transposed equivalent for the underying GEMM operator
-  static UnderlyingArguments to_underlying_arguments(Arguments const &args) {
-    return UnderlyingArguments(
-      {args.problem_size.n(), args.problem_size.m(), args.problem_size.k()},
-      {args.ref_B.data(), args.ref_B.stride(0)},
-      {args.ref_A.data(), args.ref_A.stride(0)},
-      {args.ref_C.data(), args.ref_C.stride(0)},
-      {args.ref_D.data(), args.ref_D.stride(0)},
-      args.epilogue,
-      args.split_k_slices,
-      args.convert,
-      args.reduction
-    );
-  }
-
-  /// Determines whether the GEMM can execute the given problem.
-  static Status can_implement(Arguments const &args) {
-
-    return UnderlyingOperator::can_implement(to_underlying_arguments(args));
-  }
-
-  /// Gets the workspace size
-  static size_t get_workspace_size(Arguments const &args) {
-    
-    return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args));
-  }
-
-  /// Initializes GEMM state from arguments.
-  Status initialize(Arguments const &args, void *workspace) {
-
-    return underlying_operator_.initialize(to_underlying_arguments(args), workspace);
-  }
-
-  /// Lightweight update given a subset of arguments
-  Status update(Arguments const &args, void *workspace = nullptr) {
-
-    return underlying_operator_.update(to_underlying_arguments(args), workspace);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status run(cudaStream_t stream = nullptr) {
-
-    return underlying_operator_.run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(cudaStream_t stream = nullptr) {
-    return run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(
-    Arguments const &args, 
-    void *workspace = nullptr, 
-    cudaStream_t stream = nullptr) {
-    
-    Status status = initialize(args, workspace, stream);
-    
-    if (status == Status::kSuccess) {
-      status = run(stream);
-    }
-
-    return status;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace device
-} // namespace gemm
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal.h
deleted file mode 100755
index 55413b77a..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal.h
+++ /dev/null
@@ -1,442 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief
-*/
-
-#pragma once
-
-#include "cutlass/arch/mma.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/device_kernel.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-#include "cutlass/gemm/kernel/gemm_universal.h"
-
-#include "cutlass/gemm/kernel/default_gemm_universal.h"
-#include "cutlass/gemm/device/default_gemm_configuration.h"
-#include "cutlass/gemm/device/gemm_universal_base.h"
-
-#include "cutlass/layout/permute.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace device {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/*! 
-  GemmUniversal is a stateful, reusable GEMM handle.  Once initialized for a given GEMM computation
-  (problem geometry and data references), it can be reused across different GEMM problems having the
-  geometry.  (Once initialized, details regarding problem geometry and references to workspace memory
-  cannot be updated.)
-
-  The universal GEMM accommodates serial reductions, parallel reductions, batched strided, and 
-  batched array variants.
-*/
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_ = ElementC_,
-    /// Operator class tag
-    typename OperatorClass_ = arch::OpClassSimt,
-    /// Tag indicating architecture to tune for.  This is the minimum SM that
-    /// supports the intended feature. The device kernel can be built
-    /// targeting any SM larger than this number.
-    typename ArchTag_ = arch::Sm70,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle<>,
-    /// Number of stages used in the pipelined mainloop
-    int Stages =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kStages,
-    /// Access granularity of A matrix in units of elements
-    int AlignmentA =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentA,
-    /// Access granularity of B matrix in units of elements
-    int AlignmentB =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentB,
-    /// Operation performed by GEMM
-    typename Operator_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::Operator,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA = ComplexTransform::kNone,
-    /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB = ComplexTransform::kNone,
-    /// Gather operand A by using an index array
-    bool GatherA = false,
-    /// Gather operand B by using an index array
-    bool GatherB = false,
-    /// Scatter result D by using an index array
-    bool ScatterD = false,
-    /// Permute result D
-    typename PermuteDLayout_ = layout::NoPermute,
-    /// Permute operand A
-    typename PermuteALayout_ = layout::NoPermute,
-    /// Permute operand B
-    typename PermuteBLayout_ = layout::NoPermute
->
-class GemmUniversal : 
-  public GemmUniversalBase<
-    typename kernel::DefaultGemmUniversal<
-      ElementA_,
-      LayoutA_,
-      TransformA,
-      AlignmentA,
-      ElementB_,
-      LayoutB_,
-      TransformB,
-      AlignmentB,
-      ElementC_,
-      LayoutC_,
-      ElementAccumulator_,
-      OperatorClass_,
-      ArchTag_,
-      ThreadblockShape_,
-      WarpShape_,
-      InstructionShape_,
-      EpilogueOutputOp_,
-      ThreadblockSwizzle_,
-      Stages,
-      Operator_,
-      SharedMemoryClearOption::kNone,
-      GatherA,
-      GatherB,
-      ScatterD,
-      PermuteDLayout_,
-      PermuteALayout_,
-      PermuteBLayout_
-    >::GemmKernel
-  > {
-
- public:
-
-  using ElementAccumulator = ElementAccumulator_;
-  using OperatorClass = OperatorClass_;
-  using ArchTag = ArchTag_;
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using EpilogueOutputOp = EpilogueOutputOp_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  using Operator = Operator_;
-  using PermuteDLayout = PermuteDLayout_;
-  using PermuteALayout = PermuteALayout_;
-  using PermuteBLayout = PermuteBLayout_;
-  static int const kStages = Stages;
-  static int const kAlignmentA = AlignmentA;
-  static int const kAlignmentB = AlignmentB;
-  static int const kAlignmentC = EpilogueOutputOp::kCount;
-  static ComplexTransform const kTransformA = TransformA;
-  static ComplexTransform const kTransformB = TransformB;
-
-  using Base = GemmUniversalBase<
-    typename kernel::DefaultGemmUniversal<
-      ElementA_,
-      LayoutA_,
-      TransformA,
-      AlignmentA,
-      ElementB_,
-      LayoutB_,
-      TransformB,
-      AlignmentB,
-      ElementC_,
-      LayoutC_,
-      ElementAccumulator_,
-      OperatorClass_,
-      ArchTag_,
-      ThreadblockShape_,
-      WarpShape_,
-      InstructionShape_,
-      EpilogueOutputOp_,
-      ThreadblockSwizzle_,
-      Stages,
-      Operator_,
-      SharedMemoryClearOption::kNone,
-      GatherA,
-      GatherB,
-      ScatterD,
-      PermuteDLayout_,
-      PermuteALayout_,
-      PermuteBLayout_
-    >::GemmKernel
-  >;
-
-  using Arguments = typename Base::Arguments;
-  using GemmKernel = typename Base::GemmKernel;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for column-major output exchanges problem size and operand.
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_,
-    /// Operator class tag
-    typename OperatorClass_,
-    /// Tag indicating architecture to tune for.  This is the minimum SM that
-    /// supports the intended feature. The device kernel can be built
-    /// targeting any SM larger than this number.
-    typename ArchTag_,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_,
-    /// Epilogue output operator
-    typename EpilogueOutputOp_,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle_,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Access granularity of A matrix in units of elements
-    int AlignmentA,
-    /// Access granularity of B matrix in units of elements
-    int AlignmentB,
-    /// Operation performed by GEMM
-    typename Operator_,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA,
-    /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB,
-    /// Gather operand A by using an index array
-    bool GatherA,
-    /// Gather operand B by using an index array
-    bool GatherB,
-    /// Scatter result D by using an index array
-    bool ScatterD,
-    /// Permute result D
-    typename PermuteDLayout_,
-    /// Permute operand A
-    typename PermuteALayout_,
-    /// Permute operand B
-    typename PermuteBLayout_
->
-class GemmUniversal<ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_,
-           layout::ColumnMajor,  // partially specialized on LayoutC
-           ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_,
-           WarpShape_, InstructionShape_, EpilogueOutputOp_,
-           ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB,
-           Operator_, TransformA, TransformB, GatherA, GatherB, ScatterD,
-           PermuteDLayout_, PermuteALayout_, PermuteBLayout_> {
- public:
-
-  using ElementA = ElementA_;
-  using LayoutA = LayoutA_;
-  using TensorRefA = TensorRef<ElementA const, LayoutA>;
-  using ElementB = ElementB_;
-  using LayoutB = LayoutB_;
-  using TensorRefB = TensorRef<ElementB const, LayoutB>;
-  using ElementC = ElementC_;
-  using LayoutC = layout::ColumnMajor;
-  using TensorRefC = TensorRef<ElementC const, LayoutC>;
-  using TensorRefD = TensorRef<ElementC, LayoutC>;
-  using ElementAccumulator = ElementAccumulator_;
-  using OperatorClass = OperatorClass_;
-  using ArchTag = ArchTag_;
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using EpilogueOutputOp = EpilogueOutputOp_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  using Operator = Operator_;
-  using PermuteDLayout = PermuteDLayout_;
-  using PermuteALayout = PermuteALayout_;
-  using PermuteBLayout = PermuteBLayout_;
-  static int const kStages = Stages;
-  static int const kAlignmentA = AlignmentA;
-  static int const kAlignmentB = AlignmentB;
-  static ComplexTransform const kTransformA = TransformA;
-  static ComplexTransform const kTransformB = TransformB;
-
-  using UnderlyingOperator = typename GemmUniversal< 
-    ElementB,
-    typename layout::LayoutTranspose<LayoutB>::type,
-    ElementA,
-    typename layout::LayoutTranspose<LayoutA>::type,
-    ElementC,
-    layout::RowMajor,    
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    kAlignmentB,
-    kAlignmentA,
-    Operator,
-    kTransformB,
-    kTransformA,
-    GatherB,
-    GatherA,
-    ScatterD,
-    PermuteDLayout,
-    PermuteBLayout,
-    PermuteALayout
-  >::Base;
-
-  using GemmKernel = typename UnderlyingOperator::GemmKernel;
-  static int const kAlignmentC = EpilogueOutputOp::kCount;
-
-  /// Argument structure
-  using Arguments = typename UnderlyingOperator::Arguments;
-
-private:
-
-  UnderlyingOperator underlying_operator_;
-
-public:
-
-  /// Constructs the GEMM.
-  GemmUniversal() { }
-
-  /// Helper to construct a transposed equivalent for the underying GEMM operator
-  static Arguments to_underlying_arguments(Arguments const &args) {
-    return args.transposed_problem();
-  }
-
-  /// Determines whether the GEMM can execute the given problem.
-  static Status can_implement(Arguments const &args) {
-
-    return UnderlyingOperator::can_implement(to_underlying_arguments(args));
-  }
-
-  /// Gets the workspace size
-  static size_t get_workspace_size(Arguments const &args) {
-    
-    return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args));
-  }
-
-  /// Computes the grid shape
-  static dim3 get_grid_shape(Arguments const &args) { 
-    return UnderlyingOperator::get_grid_shape(to_underlying_arguments(args));
-  }
-
-  /// Computes the maximum number of active blocks per multiprocessor
-  static int maximum_active_blocks(int smem_capacity = -1) {
-    return UnderlyingOperator::maximum_active_blocks(smem_capacity);
-  }
-
-  /// Initializes GEMM state from arguments.
-  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
-
-    return underlying_operator_.initialize(to_underlying_arguments(args), workspace, stream);
-  }
-
-  /// Lightweight update given a subset of arguments
-  Status update(Arguments const &args, void *workspace = nullptr) {
-
-    return underlying_operator_.update(to_underlying_arguments(args), workspace);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status run(cudaStream_t stream = nullptr) {
-
-    return underlying_operator_.run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(cudaStream_t stream = nullptr) {
-    return run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(
-    Arguments const &args, 
-    void *workspace = nullptr, 
-    cudaStream_t stream = nullptr) {
-    
-    Status status = initialize(args, workspace, stream);
-    
-    if (status == Status::kSuccess) {
-      status = run(stream);
-    }
-
-    return status;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace device
-} // namespace gemm
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_adapter.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_adapter.h
deleted file mode 100755
index 73564d3c6..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_adapter.h
+++ /dev/null
@@ -1,693 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*!
-  \file
-  \brief The universal GEMM accommodates serial reductions, parallel reductions, batched strided, and
-    batched array variants.
-*/
-
-#pragma once
-
-// common
-#include "cutlass/cutlass.h"
-#include "cutlass/device_kernel.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/detail/layout.hpp"
-#include "cutlass/detail/mma.hpp"
-#include "cutlass/cuda_host_adapter.hpp"
-
-#include "cutlass/kernel_launch.h"
-#if !defined(__CUDACC_RTC__)
-#include "cutlass/cluster_launch.hpp"
-#include "cutlass/trace.h"
-#endif // !defined(__CUDACC_RTC__)
-
-// 2.x
-#include "cutlass/gemm/device/gemm_universal_base.h"
-#include "cutlass/gemm/kernel/gemm_transpose_operands.h"
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-#include "cutlass/epilogue/threadblock/epilogue_with_visitor_callbacks.h"
-
-// 3.x
-#include "cutlass/gemm/kernel/gemm_universal.hpp"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm::device {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/*!
-  GemmUniversalAdapter is a stateful, reusable GEMM handle built around a kernel
-  of type cutlass::gemm::kernel::Gemm or cutlass::gemm::kernel::GemmUniversal.
-
-  It manages the lifetime of the underlying `kernel::Params` struct, and exposes APIs
-  to create it from the host facing arguments. For power users, new static methods
-  are exposed in 3.x APIs that bypass the stateful methods or args->params lowering.
-
-  It supports kernel types that implement both the 2.x and 3.0 APIs,
-  however, this is done by specializing the implementation of GemmUniversalAdapter
-  on the two kernel API types, and thus, GemmUniversalAdapter's behaviour might
-  differ between the two specializations.
-*/
-template <class GemmKernel_, class Enable = void>
-class GemmUniversalAdapter;
-
-////////////////////////////////////////////////////////////////////////////////
-////////////////////////////// CUTLASS 3.x API /////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////
-
-template <class GemmKernel_>
-class GemmUniversalAdapter<
-  GemmKernel_,
-  cute::enable_if_t<gemm::detail::IsCutlass3GemmKernel<GemmKernel_>::value>>
-{
-public:
-  using GemmKernel = GemmKernel_;
-  using TileShape = typename GemmKernel::TileShape;
-  using ElementA = typename GemmKernel::ElementA;
-  using ElementB = typename GemmKernel::ElementB;
-  using ElementC = typename GemmKernel::ElementC;
-  using ElementD = typename GemmKernel::ElementD;
-  using ElementAccumulator = typename GemmKernel::ElementAccumulator;
-  using DispatchPolicy = typename GemmKernel::DispatchPolicy;
-  using CollectiveMainloop = typename GemmKernel::CollectiveMainloop;
-  using CollectiveEpilogue = typename GemmKernel::CollectiveEpilogue;
-
-  // Map back to 2.x type as best as possible
-  using LayoutA = gemm::detail::StrideToLayoutTagA_t<typename GemmKernel::StrideA>;
-  using LayoutB = gemm::detail::StrideToLayoutTagB_t<typename GemmKernel::StrideB>;
-  using LayoutC = gemm::detail::StrideToLayoutTagC_t<typename GemmKernel::StrideC>;
-  using LayoutD = gemm::detail::StrideToLayoutTagC_t<typename GemmKernel::StrideD>;
-
-  static bool const kEnableCudaHostAdapter = CUTLASS_ENABLE_CUDA_HOST_ADAPTER;
-
-  static ComplexTransform const kTransformA = cute::is_same_v<typename GemmKernel::CollectiveMainloop::TransformA, cute::conjugate> ?
-                                              ComplexTransform::kConjugate : ComplexTransform::kNone;
-  static ComplexTransform const kTransformB = cute::is_same_v<typename GemmKernel::CollectiveMainloop::TransformB, cute::conjugate> ?
-                                              ComplexTransform::kConjugate : ComplexTransform::kNone;
-
-  // Legacy: Assume MultiplyAdd only since we do not use this tag type in 3.0
-  using MathOperator = cutlass::arch::OpMultiplyAdd;
-
-  using OperatorClass = cutlass::detail::get_operator_class_t<typename CollectiveMainloop::TiledMma>;
-
-  using ArchTag = typename GemmKernel::ArchTag;
-
-  // NOTE: Assume identity swizzle for now
-  using ThreadblockSwizzle = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>;
-
-  // Assume TiledMma's ShapeMNK is the same as 2.x's ThreadblockShape
-  using ThreadblockShape = cutlass::gemm::GemmShape<
-      cute::size<0>(TileShape{}),
-      cute::size<1>(TileShape{}),
-      cute::size<2>(TileShape{})>;
-
-  using ClusterShape = cutlass::gemm::GemmShape<
-      cute::size<0>(typename GemmKernel::DispatchPolicy::ClusterShape{}),
-      cute::size<1>(typename GemmKernel::DispatchPolicy::ClusterShape{}),
-      cute::size<2>(typename GemmKernel::DispatchPolicy::ClusterShape{})>;
-
-  // Instruction shape is easy too, since we get that directly from our TiledMma's atom shape
-  using InstructionShape = cutlass::gemm::GemmShape<
-      cute::size<0>(typename CollectiveMainloop::TiledMma::AtomShape_MNK{}),
-      cute::size<1>(typename CollectiveMainloop::TiledMma::AtomShape_MNK{}),
-      cute::size<2>(typename CollectiveMainloop::TiledMma::AtomShape_MNK{})>;
-
-  // Legacy: provide a correct warp count, but no reliable warp shape
-  static int const kThreadCount = GemmKernel::MaxThreadsPerBlock;
-
-  // Warp shape is not a primary API type in 3.x
-  // But we can best approximate it by inspecting the TiledMma
-  // For this, we make the assumption that we always have 4 warps along M, and rest along N, none along K
-  // We also always round up the warp count to 4 if the tiled mma is smaller than 128 threads
-  static constexpr int WarpsInMma = cute::max(4, CUTE_STATIC_V(cute::size(typename GemmKernel::TiledMma{})) / 32);
-  static constexpr int WarpsInMmaM = 4;
-  static constexpr int WarpsInMmaN = cute::ceil_div(WarpsInMma, WarpsInMmaM);
-  using WarpCount = cutlass::gemm::GemmShape<WarpsInMmaM, WarpsInMmaN, 1>;
-  using WarpShape = cutlass::gemm::GemmShape<
-      CUTE_STATIC_V(cute::tile_size<0>(typename CollectiveMainloop::TiledMma{})) / WarpsInMmaM,
-      CUTE_STATIC_V(cute::tile_size<1>(typename CollectiveMainloop::TiledMma{})) / WarpsInMmaN,
-      CUTE_STATIC_V(cute::tile_size<2>(typename CollectiveMainloop::TiledMma{}))>;
-
-  static int constexpr kStages = CollectiveMainloop::DispatchPolicy::Stages;
-
-  // Inspect TiledCopy for A and B to compute the alignment size
-  static int constexpr kAlignmentA = cutlass::detail::get_alignment_count_from_gmem_tiled_copy<
-      typename CollectiveMainloop::GmemTiledCopyA, ElementA, typename CollectiveMainloop::TiledMma::ValTypeA>();
-  static int constexpr kAlignmentB = cutlass::detail::get_alignment_count_from_gmem_tiled_copy<
-      typename CollectiveMainloop::GmemTiledCopyB, ElementB, typename CollectiveMainloop::TiledMma::ValTypeB>();
-  static int constexpr kAlignmentC = cutlass::detail::get_alignment_count_from_gmem_tiled_copy<
-      typename CollectiveEpilogue::GmemTiledCopyC, ElementC>();
-  static int constexpr kAlignmentD = cutlass::detail::get_alignment_count_from_gmem_tiled_copy<
-      typename CollectiveEpilogue::GmemTiledCopyD, ElementD>();
-
-  using EpilogueOutputOp = typename CollectiveEpilogue::ThreadEpilogueOp;
-
-  // Split-K preserves splits that are 128b aligned
-  static int constexpr kSplitKAlignment = cute::max(
-      128 / sizeof_bits<ElementA>::value, 128 / sizeof_bits<ElementB>::value);
-
-  /// Argument structure: User API
-  using Arguments = typename GemmKernel::Arguments;
-  /// Argument structure: Kernel API
-  using Params = typename GemmKernel::Params;
-
-private:
-
-  /// Kernel API parameters object
-  Params params_;
-
-public:
-
-  /// Access the Params structure
-  Params const& params() const {
-    return params_;
-  }
-
-  /// Determines whether the GEMM can execute the given problem.
-  static Status
-  can_implement(Arguments const& args) {
-    if (GemmKernel::can_implement(args)) {
-      return Status::kSuccess;
-    }
-    else {
-      return Status::kInvalid;
-    }
-  }
-
-  /// Gets the workspace size
-  static size_t
-  get_workspace_size(Arguments const& args) {
-    size_t workspace_bytes = 0;
-    if (args.mode == GemmUniversalMode::kGemmSplitKParallel) {
-      workspace_bytes += sizeof(int) * size_t(cute::size<0>(TileShape{})) * size_t(cute::size<1>(TileShape{}));
-    }
-
-    workspace_bytes += GemmKernel::get_workspace_size(args);
-
-    CUTLASS_TRACE_HOST("  workspace_bytes: " << workspace_bytes);
-
-    return workspace_bytes;
-  }
-
-  /// Computes the grid shape
-  static dim3
-  get_grid_shape(Arguments const& args, void* workspace = nullptr) {
-    auto tmp_params = GemmKernel::to_underlying_arguments(args, workspace);
-    return GemmKernel::get_grid_shape(tmp_params);
-  }
-
-  /// Computes the grid shape
-  static dim3
-  get_grid_shape(Params const& params) {
-    return GemmKernel::get_grid_shape(params);
-  }
-
-  /// Computes the maximum number of active blocks per multiprocessor
-  static int maximum_active_blocks(int /* smem_capacity */ = -1) {
-    CUTLASS_TRACE_HOST("GemmUniversal::maximum_active_blocks()");
-    int max_active_blocks = -1;
-    int smem_size = GemmKernel::SharedStorageSize;
-
-    // first, account for dynamic smem capacity if needed
-    cudaError_t result;
-    if (smem_size >= (48 << 10)) {
-      CUTLASS_TRACE_HOST("  Setting smem size to " << smem_size);
-      result = cudaFuncSetAttribute(
-          device_kernel<GemmKernel>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize,
-          smem_size);
-      if (cudaSuccess != result) {
-        result = cudaGetLastError(); // to clear the error bit
-        CUTLASS_TRACE_HOST(
-          "  cudaFuncSetAttribute() returned error: "
-          << cudaGetErrorString(result));
-        return -1;
-      }
-    }
-
-    // query occupancy after setting smem size
-    result = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-        &max_active_blocks,
-        device_kernel<GemmKernel>,
-        GemmKernel::MaxThreadsPerBlock,
-        smem_size);
-
-    if (cudaSuccess != result) {
-      result = cudaGetLastError(); // to clear the error bit
-      CUTLASS_TRACE_HOST(
-        "  cudaOccupancyMaxActiveBlocksPerMultiprocessor() returned error: "
-        << cudaGetErrorString(result));
-      return -1;
-    }
-
-    CUTLASS_TRACE_HOST("  max_active_blocks: " << max_active_blocks);
-    return max_active_blocks;
-  }
-
-  /// Initializes GEMM state from arguments.
-  Status
-  initialize(
-    Arguments const& args,
-    void* workspace = nullptr,
-    cudaStream_t stream = nullptr,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-
-    CUTLASS_TRACE_HOST("GemmUniversal::initialize() - workspace "
-      << workspace << ", stream: " << (stream ? "non-null" : "null"));
-
-    // Initialize the workspace
-    Status status = GemmKernel::initialize_workspace(args, workspace, stream, cuda_adapter);
-    if (status != Status::kSuccess) {
-      return status;
-    }
-    // Initialize the Params structure
-    params_ = GemmKernel::to_underlying_arguments(args, workspace);
-    // Don't set the function attributes - require the CudaHostAdapter to set it.
-    if constexpr (kEnableCudaHostAdapter) {
-      CUTLASS_ASSERT(cuda_adapter);
-      return Status::kSuccess;
-    }
-    else {
-      //
-      // Account for dynamic smem capacity if needed
-      //
-      int smem_size = GemmKernel::SharedStorageSize;
-
-      CUTLASS_ASSERT(cuda_adapter == nullptr);
-
-      if (smem_size >= (48 << 10)) {
-        CUTLASS_TRACE_HOST("  Setting smem size to " << smem_size);
-        cudaError_t result = cudaFuncSetAttribute(
-            device_kernel<GemmKernel>,
-            cudaFuncAttributeMaxDynamicSharedMemorySize,
-            smem_size);
-        if (cudaSuccess != result) {
-          result = cudaGetLastError(); // to clear the error bit
-          CUTLASS_TRACE_HOST("  cudaFuncSetAttribute() returned error: " << cudaGetErrorString(result));
-          return Status::kErrorInternal;
-        }
-      }
-    }
-    return Status::kSuccess;
-  }
-
-  /// Update API is preserved in 3.0, but does not guarantee a lightweight update of params.
-  Status
-  update(Arguments const& args, void* workspace = nullptr) {
-    CUTLASS_TRACE_HOST("GemmUniversal()::update() - workspace: " << workspace);
-
-    size_t workspace_bytes = get_workspace_size(args);
-    if (workspace_bytes > 0 && nullptr == workspace) {
-      return Status::kErrorWorkspaceNull;
-    }
-
-    params_ = GemmKernel::to_underlying_arguments(args, workspace);
-    return Status::kSuccess;
-  }
-
-  /// Primary run() entry point API that is static allowing users to create and manage their own params.
-  /// Supplied params struct must be construct by calling GemmKernel::to_underling_arguments()
-  static Status
-  run(Params& params,
-      cudaStream_t stream = nullptr,
-      CudaHostAdapter *cuda_adapter = nullptr,
-      bool launch_with_pdl = false) {
-    CUTLASS_TRACE_HOST("GemmUniversal::run()");
-    dim3 const block = GemmKernel::get_block_shape();
-    dim3 const grid = get_grid_shape(params);
-
-    // configure smem size and carveout
-    int smem_size = GemmKernel::SharedStorageSize;
-
-    Status launch_result{ Status::kSuccess };
-    // Use extended launch API only for mainloops that use it
-    if constexpr (GemmKernel::ArchTag::kMinComputeCapability >= 90) {
-#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
-      CUTLASS_TRACE_HOST("GemmUniversal::run: Use extended launch API");
-#endif
-      [[maybe_unused]] constexpr bool is_static_1x1x1 =
-        cute::is_static_v<typename GemmKernel::DispatchPolicy::ClusterShape> and
-        cute::size(typename GemmKernel::DispatchPolicy::ClusterShape{}) == 1;
-      dim3 cluster(cute::size<0>(typename GemmKernel::DispatchPolicy::ClusterShape{}),
-                   cute::size<1>(typename GemmKernel::DispatchPolicy::ClusterShape{}),
-                   cute::size<2>(typename GemmKernel::DispatchPolicy::ClusterShape{}));
-      void* kernel_params[] = {&params};
-
-      if constexpr (kEnableCudaHostAdapter) {
-        //
-        // Use the cuda host adapter
-        //
-        CUTLASS_ASSERT(cuda_adapter);
-        if (cuda_adapter) {
-          if (launch_with_pdl) {
-            CUTLASS_TRACE_HOST(
-              "GemmUniversal::run() does not support launching with PDL and a custom cuda adapter.");
-            return Status::kErrorInternal;
-          }
-#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
-          CUTLASS_TRACE_HOST("GemmUniversal::run: Launching kernel with CUDA host adapter");
-#endif
-          launch_result = cuda_adapter->launch(grid,
-                                               cluster,
-                                               block,
-                                               smem_size,
-                                               stream,
-                                               kernel_params,
-                                               0);
-        }
-        else {
-          CUTLASS_TRACE_HOST("GemmUniversal::run: kEnableCudaHostAdapter is true, but CUDA host adapter is null");
-          return Status::kErrorInternal;
-        }
-      }
-      else {
-        CUTLASS_ASSERT(cuda_adapter == nullptr);
-        void const* kernel = (void const*) device_kernel<GemmKernel>;
-        if constexpr (GemmKernel::ArchTag::kMinComputeCapability == 90) {
-          if constexpr (is_static_1x1x1) {
-#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
-            CUTLASS_TRACE_HOST("GemmUniversal::run: Launching static 1x1x1 kernel");
-#endif
-            launch_result = cutlass::kernel_launch<GemmKernel>(
-              grid, block, smem_size, stream, params, launch_with_pdl);
-            if (launch_result != Status::kSuccess) {
-              CUTLASS_TRACE_HOST("GemmUniversal::run: cutlass::kernel_launch reports failure");
-            }
-#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
-            else {
-              CUTLASS_TRACE_HOST("GemmUniversal::run: cutlass::kernel_launch reports success");
-            }
-#endif
-          }
-          else {
-#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
-            CUTLASS_TRACE_HOST("GemmUniversal::run: Launching dynamic cluster kernel");
-#endif
-            launch_result = ClusterLauncher::launch(
-              grid, cluster, block, smem_size, stream, kernel, kernel_params, launch_with_pdl);
-          }
-        }
-      }
-    }
-    else {
-      launch_result = Status::kSuccess;
-      cutlass::arch::synclog_setup();
-
-      if constexpr (kEnableCudaHostAdapter) {
-        CUTLASS_ASSERT(cuda_adapter);
-        if (cuda_adapter) {
-          void* kernel_params[] = {&params};
-#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
-          CUTLASS_TRACE_HOST("GemmUniversal::run: Launching kernel with CUDA host adapter");
-#endif
-          launch_result = cuda_adapter->launch(
-            grid, block, smem_size, stream, kernel_params, 0
-          );
-
-        }
-        else {
-          CUTLASS_TRACE_HOST("GemmUniversal::run: CUDA host adapter is null");
-          return Status::kErrorInternal;
-        }
-      }
-      else {
-        CUTLASS_ASSERT(cuda_adapter == nullptr);
-#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
-        CUTLASS_TRACE_HOST("GemmUniversal::run: Launching kernel with cutlass::kernel_launch");
-#endif
-        launch_result = cutlass::kernel_launch<GemmKernel>(
-          grid, block, smem_size, stream, params, launch_with_pdl);
-        if (launch_result != Status::kSuccess) {
-          CUTLASS_TRACE_HOST("GemmUniversal::run: cutlass::kernel_launch reports failure");
-        }
-#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
-        else {
-          CUTLASS_TRACE_HOST("GemmUniversal::run: cutlass::kernel_launch reports success");
-        }
-#endif
-      }
-    }
-
-    cudaError_t result = cudaGetLastError();
-    if (cudaSuccess == result && Status::kSuccess == launch_result) {
-#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
-      CUTLASS_TRACE_HOST("GemmUniversal::run: cudaGetLastError reports success");
-#endif
-      return Status::kSuccess;
-    }
-    else {
-      CUTLASS_TRACE_HOST("  Kernel launch failed. Reason: " << result);
-      return Status::kErrorInternal;
-    }
-  }
-
-  //
-  // Non-static launch overloads that first create and set the internal params struct of this kernel handle.
-  //
-
-  /// Launches the kernel after first constructing Params internal state from supplied arguments.
-  Status
-  run(
-    Arguments const& args,
-    void* workspace = nullptr,
-    cudaStream_t stream = nullptr,
-    CudaHostAdapter *cuda_adapter = nullptr,
-    bool launch_with_pdl = false
-  ) {
-    Status status = initialize(args, workspace, stream, cuda_adapter);
-
-    if (Status::kSuccess == status) {
-      status = run(params_, stream, cuda_adapter, launch_with_pdl);
-    }
-    return status;
-  }
-
-  /// Launches the kernel after first constructing Params internal state from supplied arguments.
-  Status
-  operator()(
-    Arguments const& args,
-    void* workspace = nullptr,
-    cudaStream_t stream = nullptr,
-    CudaHostAdapter *cuda_adapter = nullptr,
-    bool launch_with_pdl = false) {
-    return run(args, workspace, stream, cuda_adapter, launch_with_pdl);
-  }
-
-  /// Overload that allows a user to re-launch the same kernel without updating internal params struct.
-  Status
-  run(
-    cudaStream_t stream = nullptr,
-    CudaHostAdapter *cuda_adapter = nullptr,
-    bool launch_with_pdl = false) {
-    return run(params_, stream, cuda_adapter, launch_with_pdl);
-  }
-
-  /// Overload that allows a user to re-launch the same kernel without updating internal params struct.
-  Status
-  operator()(cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr, bool launch_with_pdl = false) {
-    return run(params_, stream, cuda_adapter, launch_with_pdl);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-////////////////////////////// CUTLASS 2.x API /////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////
-
-template <class GemmKernel_>
-class GemmUniversalAdapter<
-  GemmKernel_,
-  cute::enable_if_t<not gemm::detail::IsCutlass3GemmKernel<GemmKernel_>::value>>
-{
-public:
-
-  using GemmKernel = GemmKernel_;
-
-  static bool const kInternalTranspose =
-    !cutlass::epilogue::threadblock::detail::is_2x_evt_v<typename GemmKernel::Epilogue> &&  // 2.x EVT does not require internal transpose
-    cute::is_same<typename GemmKernel::LayoutC, cutlass::layout::RowMajor>::value;
-
-  using ThreadblockShape = typename GemmKernel::Mma::Shape;
-  using WarpShape = typename GemmKernel::WarpShape;
-  using InstructionShape = typename GemmKernel::InstructionShape;
-
-  // warp-level, arch-level (instruction), math operator
-  using WarpMmaOperator = typename GemmKernel::Mma::Policy::Operator;
-  using ArchMmaOperator = typename WarpMmaOperator::ArchMmaOperator;
-  using MathOperator = typename WarpMmaOperator::MathOperator;
-
-  // Operator class and arch tag extract bottom-up
-  // set it for top-level gemm device-level template
-  using OperatorClass = typename WarpMmaOperator::OperatorClass;
-  using ArchTag = typename WarpMmaOperator::ArchTag;
-
-  // Type, layout, and complex transform deliberately exchanged with B
-  using MapArguments = kernel::detail::MapArguments<
-    typename GemmKernel::ElementA,
-    typename GemmKernel::LayoutA,
-    GemmKernel::kTransformA,
-    GemmKernel::kAlignmentA,
-    typename GemmKernel::ElementB,
-    typename GemmKernel::LayoutB,
-    GemmKernel::kTransformB,
-    GemmKernel::kAlignmentB,
-    typename GemmKernel::LayoutC,
-    kInternalTranspose
-  >;
-
-  using ElementA = typename MapArguments::ElementA;
-  using LayoutA = typename MapArguments::LayoutA;
-  static ComplexTransform const kTransformA = MapArguments::kTransformA;
-  static int const kAlignmentA = MapArguments::kAlignmentA;
-
-  using ElementB = typename MapArguments::ElementB;
-  using LayoutB = typename MapArguments::LayoutB;
-  static ComplexTransform const kTransformB = MapArguments::kTransformB;
-  static int const kAlignmentB = MapArguments::kAlignmentB;
-
-  using ElementC = typename GemmKernel::ElementC;
-  using LayoutC = typename MapArguments::LayoutC;
-  static int const kAlignmentC = GemmKernel::kAlignmentC;
-
-  // C and D same type for 2.x kernel
-  using ElementD = ElementC;
-  using LayoutD = LayoutC;
-
-  using TensorRefA = TensorRef<ElementA const, LayoutA>;
-  using TensorRefB = TensorRef<ElementB const, LayoutB>;
-  using TensorRefC = TensorRef<ElementC const, LayoutC>;
-  using TensorRefD = TensorRef<ElementD, LayoutD>;
-
-  static int const kStages = GemmKernel::Mma::kStages;
-
-  using EpilogueOutputOp = typename GemmKernel::EpilogueOutputOp;
-  using ElementAccumulator = typename EpilogueOutputOp::ElementAccumulator;
-  using ThreadblockSwizzle = typename GemmKernel::ThreadblockSwizzle;
-  using UnderlyingOperator = GemmUniversalBase<GemmKernel>;
-  using Arguments = typename UnderlyingOperator::Arguments;
-
-private:
-
-  UnderlyingOperator underlying_operator_;
-
-public:
-
-  /// Constructs the GEMM.
-  GemmUniversalAdapter() { }
-
-  /// Helper to construct a transposed equivalent for the underying GEMM operator
-  static Arguments to_underlying_arguments(Arguments const &args) {
-    if (kInternalTranspose) {
-      return args.transposed_problem();
-    }
-    else {
-      return args;
-    }
-  }
-
-  /// Determines whether the GEMM can execute the given problem.
-  static Status can_implement(Arguments const &args, CudaHostAdapter *cuda_adapter = nullptr) {
-
-    return UnderlyingOperator::can_implement(to_underlying_arguments(args), cuda_adapter);
-  }
-
-  /// Gets the workspace size
-  static size_t get_workspace_size(Arguments const &args, CudaHostAdapter *cuda_adapter = nullptr) {
-
-    return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args), cuda_adapter);
-  }
-
-  /// Computes the grid shape
-  static dim3 get_grid_shape(Arguments const &args) {
-    return UnderlyingOperator::get_grid_shape(to_underlying_arguments(args));
-  }
-
-  /// Computes the maximum number of active blocks per multiprocessor
-  static int maximum_active_blocks(int smem_capacity = -1) {
-    return UnderlyingOperator::maximum_active_blocks(smem_capacity);
-  }
-
-  /// Initializes GEMM state from arguments.
-  Status initialize(
-    Arguments const &args,
-    void *workspace = nullptr,
-    cudaStream_t stream = nullptr,
-    CudaHostAdapter *cuda_adapter = nullptr
-  ) {
-
-    return underlying_operator_.initialize(to_underlying_arguments(args), workspace, stream, cuda_adapter);
-  }
-
-  /// Lightweight update given a subset of arguments.
-  Status update(Arguments const &args) {
-
-    return underlying_operator_.update(to_underlying_arguments(args));
-  }
-
-  /// Runs the kernel using initialized state.
-  Status run(
-    cudaStream_t stream = nullptr,
-    CudaHostAdapter *cuda_adapter = nullptr) {
-
-    return underlying_operator_.run(stream, cuda_adapter);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(
-    cudaStream_t stream = nullptr,
-    CudaHostAdapter *cuda_adapter = nullptr) {
-
-    return run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(
-    Arguments const &args,
-    void *workspace = nullptr,
-    cudaStream_t stream = nullptr,
-    CudaHostAdapter *cuda_adapter = nullptr) {
-
-    Status status = initialize(args, workspace, stream, cuda_adapter);
-
-    if (status == Status::kSuccess) {
-      status = run(stream, cuda_adapter);
-    }
-
-    return status;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::gemm::device
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_base.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_base.h
deleted file mode 100755
index e23191eae..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_base.h
+++ /dev/null
@@ -1,522 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*!
-  \file
-  \brief The universal GEMM accommodates streamk, batched strided, and batched array variants.
-*/
-
-#pragma once
-
-#if defined(__CUDACC_RTC__)
-#include <cuda/std/limits>
-#else
-#include <limits>
-#endif
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/device_kernel.h"
-#include "cutlass/cuda_host_adapter.hpp"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/kernel/gemm_universal.h"
-
-#include "cutlass/gemm/kernel/default_gemm_universal.h"
-#include "cutlass/gemm/device/default_gemm_configuration.h"
-
-#include "cutlass/trace.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace device {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-template <typename GemmKernel_>
-class GemmUniversalBase {
-public:
-
-  using GemmKernel = GemmKernel_;
-
-  /// Boolean indicating whether the CudaHostAdapter is enabled
-  static bool const kEnableCudaHostAdapter = CUTLASS_ENABLE_CUDA_HOST_ADAPTER;
-
-  using ThreadblockShape = typename GemmKernel::Mma::Shape;
-
-  using ElementA = typename GemmKernel::ElementA;
-  using LayoutA = typename GemmKernel::LayoutA;
-  using TensorRefA = TensorRef<ElementA const, LayoutA>;
-  static ComplexTransform const kTransformA = GemmKernel::kTransformA;
-
-  using ElementB = typename GemmKernel::ElementB;
-  using LayoutB = typename GemmKernel::LayoutB;
-  using TensorRefB = TensorRef<ElementB const, LayoutB>;
-  static ComplexTransform const kTransformB = GemmKernel::kTransformB;
-
-  using ElementC = typename GemmKernel::ElementC;
-  using LayoutC = typename GemmKernel::LayoutC;
-  using TensorRefC = TensorRef<ElementC const, LayoutC>;
-  using TensorRefD = TensorRef<ElementC, LayoutC>;
-
-  /// Numerical accumulation element type
-  using ElementAccumulator = typename GemmKernel::Mma::ElementC;
-
-  using EpilogueOutputOp = typename GemmKernel::EpilogueOutputOp;
-  using ThreadblockSwizzle = typename GemmKernel::ThreadblockSwizzle;
-  using Operator = typename GemmKernel::Operator;
-
-  /// Argument structure
-  using Arguments = typename GemmKernel::Arguments;
-
-
-  /// Index of the GEMM Kernel within the CudaHostAdapter
-  static int32_t const kGemmKernelIndex = 0;
-
-  /// Kernel dynamic shared memory allocation requirement
-  /// Update the kernel function's shared memory configuration for the current device
-  static constexpr size_t kSharedStorageSize = sizeof(typename GemmKernel::SharedStorage);
-
-protected:
-
-  //
-  // Device properties (uniform across all instances of the current thread)
-  //
-
-  // Device ordinal
-  CUTLASS_THREAD_LOCAL static int device_ordinal_;
-
-  /// Device SM count
-  CUTLASS_THREAD_LOCAL static int device_sms_;
-
-  /// Kernel SM occupancy (in thread blocks)
-  CUTLASS_THREAD_LOCAL static int sm_occupancy_;
-
-protected:
-
-  /// Initialize static thread-local members for the thread's current device,
-  /// if necessary.
-  static Status init_device_props()
-  {
-    CUTLASS_TRACE_HOST("GemmUniversalBase::init_device_props()");
-
-    cudaError_t cudart_result;
-
-    // Get current device ordinal
-    int current_ordinal;
-    cudart_result = cudaGetDevice(&current_ordinal);
-    if (cudart_result != cudaSuccess) {
-      CUTLASS_TRACE_HOST("  cudaGetDevice() returned error " << cudaGetErrorString(cudart_result));
-      return Status::kErrorInternal;
-    }
-
-    // Done if matches the current static member
-    if (current_ordinal == device_ordinal_) {
-      // Already initialized
-      return Status::kSuccess;
-    }
-
-    // Update SM count member
-    cudart_result = cudaDeviceGetAttribute (&device_sms_, cudaDevAttrMultiProcessorCount, current_ordinal);
-    if (cudart_result != cudaSuccess) {
-      CUTLASS_TRACE_HOST("  cudaDeviceGetAttribute() returned error " << cudaGetErrorString(cudart_result));
-      return Status::kErrorInternal;
-    }
-
-    // If requires more than 48KB: configure for extended, dynamic shared memory
-    if constexpr (kSharedStorageSize >= (48 << 10))
-    {
-      cudart_result = cudaFuncSetAttribute(
-        Kernel2<GemmKernel>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize,
-        kSharedStorageSize);
-      if (cudart_result != cudaSuccess) {
-        CUTLASS_TRACE_HOST("  cudaFuncSetAttribute() returned error " << cudaGetErrorString(cudart_result));
-        return Status::kErrorInternal;
-      }
-    }
-
-    // Update SM occupancy member
-    cudart_result = cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
-      &sm_occupancy_,
-      Kernel2<GemmKernel>,
-      GemmKernel::kThreadCount,
-      kSharedStorageSize,
-      cudaOccupancyDisableCachingOverride);
-    if (cudart_result != cudaSuccess) {
-      CUTLASS_TRACE_HOST("  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags() returned error " << cudaGetErrorString(cudart_result));
-      return Status::kErrorInternal;
-    }
-
-    // Update device ordinal member on success
-    device_ordinal_ = current_ordinal;
-
-    CUTLASS_TRACE_HOST("  "
-      "device_ordinal: (" << device_ordinal_ << "), "
-      "device_sms: (" << device_sms_ << "), "
-      "sm_occupancy: (" << sm_occupancy_ << ") "
-      "smem_size: (" << kSharedStorageSize << ") "
-      "GemmKernel::kThreadCount: (" << GemmKernel::kThreadCount << ")");
-
-    return Status::kSuccess;
-  }
-
-
-protected:
-
-  //
-  // Instance data members
-  //
-
-  /// Kernel parameters
-  typename GemmKernel::Params params_;
-
-
-  /// Initialize params member
-  Status init_params(Arguments const &args, CudaHostAdapter *cuda_adapter = nullptr)
-  {
-    int32_t device_sms = 0;
-    int32_t sm_occupancy = 0;
-
-    if constexpr (kEnableCudaHostAdapter) {
-      CUTLASS_ASSERT(cuda_adapter);
-
-      //
-      // Occupancy query using CudaHostAdapter::query_occupancy().
-      //
-
-      if (cuda_adapter) {
-
-        Status status = cuda_adapter->query_occupancy(
-          &device_sms,
-          &sm_occupancy,
-          kGemmKernelIndex,
-          GemmKernel::kThreadCount,
-          kSharedStorageSize);
-
-        CUTLASS_ASSERT(status == Status::kSuccess);
-
-        if (status != Status::kSuccess) {
-          return status;
-        }
-      }
-      else {
-        return Status::kErrorInternal;
-      }
-    }
-    else {
-      CUTLASS_ASSERT(cuda_adapter == nullptr);
-
-      // Initialize static device properties, if necessary
-      Status result = init_device_props();
-
-      if (result != Status::kSuccess) {
-        return result;
-      }
-
-      //
-      // Use thread-local static members for occupancy query initialized by call to
-      // `init_device_props()`
-      //
-
-      device_sms   = device_sms_;
-      sm_occupancy = sm_occupancy_;
-    }
-
-    // Initialize params member
-    params_ = typename GemmKernel::Params(args, device_sms, sm_occupancy);
-    return Status::kSuccess;
-  }
-
-public:
-
-  //---------------------------------------------------------------------------------------------
-  // Stateless API
-  //---------------------------------------------------------------------------------------------
-
-  /// Determines whether the GEMM can execute the given problem.
-  static Status can_implement(Arguments const &args, CudaHostAdapter *cuda_adapter = nullptr)
-  {
-    CUTLASS_TRACE_HOST("GemmUniversalBase::can_implement()");
-
-    if (!kEnableCudaHostAdapter || cuda_adapter) {
-
-      dim3 grid = get_grid_shape(args, cuda_adapter);
-
-      if (!(grid.y <= std::numeric_limits<uint16_t>::max() &&
-            grid.z <= std::numeric_limits<uint16_t>::max()))
-      {
-        return Status::kErrorInvalidProblem;
-      }
-    }
-    else {
-      //
-      // With a null host adapter, a conservative grid shape is computed and required to conform to CUDA grid
-      // dimension limits.
-      //
-
-      int64_t logicalGridM = (int64_t(args.problem_size.m()) + ThreadblockShape::kM - 1) / ThreadblockShape::kM;
-      int64_t logicalGridN = (int64_t(args.problem_size.n()) + ThreadblockShape::kN - 1) / ThreadblockShape::kN;
-      int32_t logicalGridL = args.batch_count;
-
-      if ((int64_t(std::numeric_limits<uint32_t>::max()) < logicalGridM) ||
-          (int64_t(std::numeric_limits<uint16_t>::max()) < logicalGridN) ||
-          (int32_t(std::numeric_limits<uint16_t>::max()) < logicalGridL)) {
-
-        return Status::kErrorInvalidProblem;
-      }
-
-    }
-
-    return GemmKernel::can_implement(args);
-  }
-
-
-  /// Returns the workspace size (in bytes) needed for the problem
-  /// geometry expressed by these arguments
-  static size_t get_workspace_size(Arguments const &args, CudaHostAdapter *cuda_adapter = nullptr)
-  {
-    CUTLASS_TRACE_HOST("GemmUniversalBase::get_workspace_size()");
-
-    // Initialize parameters from args
-    GemmUniversalBase base;
-    if (base.init_params(args, cuda_adapter) != Status::kSuccess) {
-      return 0;
-    }
-
-    // Get size from parameters
-    size_t workspace_bytes = base.params_.get_workspace_size();
-
-    CUTLASS_TRACE_HOST("  workspace_bytes: " << workspace_bytes);
-    return workspace_bytes;
-  }
-
-
-  /// Returns the grid extents in thread blocks to launch
-  static dim3 get_grid_shape(Arguments const &args, CudaHostAdapter *cuda_adapter = nullptr)
-  {
-    CUTLASS_TRACE_HOST("GemmUniversalBase::get_grid_shape()");
-
-    // Initialize parameters from args
-    GemmUniversalBase base;
-    if (base.init_params(args, cuda_adapter) != Status::kSuccess) {
-      return dim3(0,0,0);
-    }
-
-    // Get dims from parameters
-    dim3 grid_dims = base.params_.get_grid_dims();
-
-    CUTLASS_TRACE_HOST(
-         "  tiled_shape: " << base.params_.get_tiled_shape()  << "\n"
-      << "  grid_dims: {" << grid_dims << "}");
-
-    return grid_dims;
-  }
-
-
-  /// Returns the maximum number of active thread blocks per multiprocessor
-  static int maximum_active_blocks(CudaHostAdapter *cuda_adapter = nullptr)
-  {
-    CUTLASS_TRACE_HOST("GemmUniversalBase::maximum_active_blocks()");
-
-    int32_t device_sms   = 0;
-    int32_t sm_occupancy = 0;
-
-
-    if constexpr (kEnableCudaHostAdapter) {
-      CUTLASS_ASSERT(cuda_adapter);
-
-      if (cuda_adapter) {
-
-        Status status = cuda_adapter->query_occupancy(
-          &device_sms,
-          &sm_occupancy,
-          kGemmKernelIndex,
-          GemmKernel::kThreadCount,
-          kSharedStorageSize);
-
-        CUTLASS_ASSERT(status == Status::kSuccess);
-
-        if (status != Status::kSuccess) {
-        return -1;
-        }
-      }
-      else {
-        return -1;
-      }
-    }
-    else {
-      CUTLASS_ASSERT(cuda_adapter == nullptr);
-      // Initialize static device properties, if necessary
-      if (init_device_props() != Status::kSuccess) {
-        return -1;
-      }
-
-      sm_occupancy = sm_occupancy_;
-    }
-
-    CUTLASS_TRACE_HOST("  max_active_blocks: " << sm_occupancy_);
-    return sm_occupancy;
-  }
-
-
-  //---------------------------------------------------------------------------------------------
-  // Stateful API
-  //---------------------------------------------------------------------------------------------
-
-  /// Initializes GEMM state from arguments and workspace memory
-  Status initialize(
-    Arguments const &args,
-    void *workspace = nullptr,
-    cudaStream_t stream = nullptr,
-    CudaHostAdapter *cuda_adapter = nullptr)
-  {
-    CUTLASS_TRACE_HOST("GemmUniversalBase::initialize() - workspace "
-      << workspace << ", stream: " << (stream ? "non-null" : "null"));
-
-    // Initialize parameters from args
-    Status result = init_params(args, cuda_adapter);
-    if (result != Status::kSuccess) {
-      return result;
-    }
-
-    // Assign and prepare workspace memory
-    if (args.mode == GemmUniversalMode::kGemm) {
-      return params_.init_workspace(workspace, stream);
-    }
-
-    return Status::kSuccess;
-  }
-
-
-  /// Lightweight update given a subset of arguments.
-  Status update(Arguments const &args)
-  {
-    CUTLASS_TRACE_HOST("GemmUniversalBase()::update()");
-    params_.update(args);
-    return Status::kSuccess;
-  }
-
-  /// Runs the kernel using initialized state.
-  Status run(cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr)
-  {
-    CUTLASS_TRACE_HOST("GemmUniversalBase::run()");
-
-    // Configure grid and block dimensions
-    dim3 block(GemmKernel::kThreadCount, 1, 1);
-    dim3 grid = params_.get_grid_dims();
-
-    // Launch kernel
-    CUTLASS_TRACE_HOST("  "
-      "grid: (" << grid << "), "
-      "block: (" << block << "), "
-      "SMEM: (" << kSharedStorageSize << ")");
-
-    cutlass::arch::synclog_setup();
-
-    if constexpr (kEnableCudaHostAdapter) {
-      CUTLASS_ASSERT(cuda_adapter);
-      if (cuda_adapter) {
-        void* kernel_params[] = {&params_};
-        return cuda_adapter->launch(grid, block, kSharedStorageSize, stream, kernel_params, 0);
-      }
-      else {
-        return Status::kErrorInternal;
-      }
-    }
-    else {
-      CUTLASS_ASSERT(cuda_adapter == nullptr);
-
-      Kernel2<GemmKernel><<<grid, block, kSharedStorageSize, stream>>>(params_);
-
-      // Query for errors
-      cudaError_t result = cudaGetLastError();
-      if (result != cudaSuccess) {
-        CUTLASS_TRACE_HOST("  grid launch failed with error " << cudaGetErrorString(result));
-        return Status::kErrorInternal;
-      }
-    }
-
-    return Status::kSuccess;
-  }
-
-
-  /// Runs the kernel using initialized state.
-  Status operator()(cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr)
-  {
-    return run(stream, cuda_adapter);
-  }
-
-
-  /// Runs the kernel using initialized state.
-  Status operator()(
-    Arguments const &args, 
-    void *workspace = nullptr, 
-    cudaStream_t stream = nullptr,
-    CudaHostAdapter *cuda_adapter = nullptr)
-  {
-    Status status = initialize(args, workspace, stream, cuda_adapter);
-
-    if (status == Status::kSuccess) {
-      status = run(stream, cuda_adapter);
-    }
-
-    return status;
-  }
-};
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Static initializers
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Device ordinal
-template <typename GemmKernel_>
-CUTLASS_THREAD_LOCAL int GemmUniversalBase<GemmKernel_>::device_ordinal_ = -1;
-
-/// Device SM count
-template <typename GemmKernel_>
-CUTLASS_THREAD_LOCAL int GemmUniversalBase<GemmKernel_>::device_sms_ = -1;
-
-/// Kernel SM occupancy (in thread blocks)
-template <typename GemmKernel_>
-CUTLASS_THREAD_LOCAL int GemmUniversalBase<GemmKernel_>::sm_occupancy_ = -1;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace device
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_streamk_with_broadcast.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_streamk_with_broadcast.h
deleted file mode 100755
index 7ef581ac9..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_streamk_with_broadcast.h
+++ /dev/null
@@ -1,386 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Template for a Stream-K GEMM kernel that can broadcast bias vector in the
-           epilogue.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/epilogue/thread/linear_combination_bias_elementwise.h"
-#include "cutlass/device_kernel.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-#include "cutlass/gemm/kernel/gemm_universal.h"
-
-#include "cutlass/gemm/kernel/default_gemm_universal.h"
-#include "cutlass/gemm/kernel/default_gemm_streamk_with_broadcast.h"
-#include "cutlass/gemm/device/default_gemm_configuration.h"
-#include "cutlass/gemm/device/gemm_universal_base.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace device {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/*!
-  The universal GEMM with a broadcast epilogue.
-  Supports
-*/
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_ = ElementC_,
-    /// Operator class tag
-    typename OperatorClass_ = arch::OpClassSimt,
-    /// Tag indicating architecture to tune for.  This is the minimum SM that
-    /// supports the intended feature. The device kernel can be built
-    /// targeting any SM larger than this number.
-    typename ArchTag_ = arch::Sm70,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::InstructionShape,
-    /// Epilogue output operator      - must satisfy concept of 'EpilogueWithBroadcastOp'
-    typename EpilogueOutputOp_ = cutlass::epilogue::thread::LinearCombinationBiasElementwise<
-        ElementC_, ElementAccumulator_, ElementAccumulator_,
-        ElementC_, ElementC_, 128 / cutlass::sizeof_bits<ElementC_>::value>,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle<>,
-    /// Number of stages used in the pipelined mainloop
-    int Stages =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kStages,
-    /// Access granularity of A matrix in units of elements
-    int AlignmentA =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentA,
-    /// Access granularity of B matrix in units of elements
-    int AlignmentB =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentB,
-    /// Operation performed by GEMM
-    typename Operator_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::Operator,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA = ComplexTransform::kNone,
-    /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB = ComplexTransform::kNone
->
-class GemmUniversalStreamkWithBroadcast :
-  public GemmUniversalBase<
-    typename kernel::DefaultGemmStreamkWithBroadcast<
-      ElementA_,
-      LayoutA_,
-      TransformA,
-      AlignmentA,
-      ElementB_,
-      LayoutB_,
-      TransformB,
-      AlignmentB,
-      ElementC_,
-      LayoutC_,
-      ElementAccumulator_,
-      OperatorClass_,
-      ArchTag_,
-      ThreadblockShape_,
-      WarpShape_,
-      InstructionShape_,
-      EpilogueOutputOp_,
-      ThreadblockSwizzle_,
-      Stages,
-      Operator_
-    >::GemmKernel
-  > {
-
- public:
-
-  using ElementAccumulator = ElementAccumulator_;
-  using OperatorClass = OperatorClass_;
-  using ArchTag = ArchTag_;
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using EpilogueOutputOp = EpilogueOutputOp_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  using Operator = Operator_;
-  static int const kStages = Stages;
-  static int const kAlignmentA = AlignmentA;
-  static int const kAlignmentB = AlignmentB;
-  static int const kAlignmentC = EpilogueOutputOp::kCount;
-  static ComplexTransform const kTransformA = TransformA;
-  static ComplexTransform const kTransformB = TransformB;
-
-  using Base = GemmUniversalBase<
-    typename kernel::DefaultGemmStreamkWithBroadcast<
-      ElementA_,
-      LayoutA_,
-      TransformA,
-      AlignmentA,
-      ElementB_,
-      LayoutB_,
-      TransformB,
-      AlignmentB,
-      ElementC_,
-      LayoutC_,
-      ElementAccumulator_,
-      OperatorClass_,
-      ArchTag_,
-      ThreadblockShape_,
-      WarpShape_,
-      InstructionShape_,
-      EpilogueOutputOp_,
-      ThreadblockSwizzle_,
-      Stages,
-      Operator_
-    >::GemmKernel
-  >;
-
-  using Arguments = typename Base::Arguments;
-  using GemmKernel = typename Base::GemmKernel;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for column-major output exchanges problem size and operand.
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_,
-    /// Operator class tag
-    typename OperatorClass_,
-    /// Tag indicating architecture to tune for.  This is the minimum SM that
-    /// supports the intended feature. The device kernel can be built
-    /// targeting any SM larger than this number.
-    typename ArchTag_,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_,
-    /// Epilogue output operator
-    typename EpilogueOutputOp_,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle_,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Access granularity of A matrix in units of elements
-    int AlignmentA,
-    /// Access granularity of B matrix in units of elements
-    int AlignmentB,
-    /// Operation performed by GEMM
-    typename Operator_,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA,
-    /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB>
-class GemmUniversalStreamkWithBroadcast<ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_,
-           layout::ColumnMajor,  // partially specialized on LayoutC
-           ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_,
-           WarpShape_, InstructionShape_, EpilogueOutputOp_,
-           ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB,
-           Operator_, TransformA, TransformB> {
- public:
-
-  using ElementA = ElementA_;
-  using LayoutA = LayoutA_;
-  using TensorRefA = TensorRef<ElementA const, LayoutA>;
-  using ElementB = ElementB_;
-  using LayoutB = LayoutB_;
-  using TensorRefB = TensorRef<ElementB const, LayoutB>;
-  using ElementC = ElementC_;
-  using LayoutC = layout::ColumnMajor;
-  using TensorRefC = TensorRef<ElementC const, LayoutC>;
-  using TensorRefD = TensorRef<ElementC, LayoutC>;
-  using ElementAccumulator = ElementAccumulator_;
-  using OperatorClass = OperatorClass_;
-  using ArchTag = ArchTag_;
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using EpilogueOutputOp = EpilogueOutputOp_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  using Operator = Operator_;
-  static int const kStages = Stages;
-  static int const kAlignmentA = AlignmentA;
-  static int const kAlignmentB = AlignmentB;
-  static ComplexTransform const kTransformA = TransformA;
-  static ComplexTransform const kTransformB = TransformB;
-
-  using UnderlyingOperator = typename GemmUniversalStreamkWithBroadcast<
-    ElementB,
-    typename layout::LayoutTranspose<LayoutB>::type,
-    ElementA,
-    typename layout::LayoutTranspose<LayoutA>::type,
-    ElementC,
-    layout::RowMajor,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    kAlignmentB,
-    kAlignmentA,
-    Operator,
-    kTransformB,
-    kTransformA
-  >::Base;
-
-  using GemmKernel = typename UnderlyingOperator::GemmKernel;
-  static int const kAlignmentC = EpilogueOutputOp::kCount;
-
-  /// Argument structure
-  using Arguments = typename UnderlyingOperator::Arguments;
-
-private:
-
-  UnderlyingOperator underlying_operator_;
-
-public:
-
-  /// Constructs the GEMM.
-  GemmUniversalStreamkWithBroadcast() { }
-
-  /// Helper to construct a transposed equivalent for the underying GEMM operator
-  static Arguments to_underlying_arguments(Arguments const &args) {
-    return args.transposed_problem();
-  }
-
-  /// Determines whether the GEMM can execute the given problem.
-  static Status can_implement(Arguments const &args) {
-
-    return UnderlyingOperator::can_implement(to_underlying_arguments(args));
-  }
-
-  /// Gets the workspace size
-  static size_t get_workspace_size(Arguments const &args) {
-
-    return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args));
-  }
-
-  /// Computes the grid shape
-  static dim3 get_grid_shape(Arguments const &args) {
-    return UnderlyingOperator::get_grid_shape(to_underlying_arguments(args));
-  }
-
-  /// Computes the maximum number of active blocks per multiprocessor
-  static int maximum_active_blocks(int smem_capacity = -1) {
-    return UnderlyingOperator::maximum_active_blocks(smem_capacity);
-  }
-
-  /// Initializes GEMM state from arguments.
-  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
-
-    return underlying_operator_.initialize(to_underlying_arguments(args), workspace, stream);
-  }
-
-  /// Lightweight update given a subset of arguments
-  Status update(Arguments const &args, void *workspace = nullptr) {
-
-    return underlying_operator_.update(to_underlying_arguments(args), workspace);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status run(cudaStream_t stream = nullptr) {
-
-    return underlying_operator_.run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(cudaStream_t stream = nullptr) {
-    return run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(
-    Arguments const &args,
-    void *workspace = nullptr,
-    cudaStream_t stream = nullptr) {
-
-    Status status = initialize(args, workspace, stream);
-
-    if (status == Status::kSuccess) {
-      status = run(stream);
-    }
-
-    return status;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace device
-} // namespace gemm
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_with_absmax.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_with_absmax.h
deleted file mode 100755
index 35f7b5416..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_with_absmax.h
+++ /dev/null
@@ -1,404 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Template for a GEMM kernel that computes the absolute maximum of the output tensor
-    and applies additional scaling factors to operands.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/epilogue/thread/linear_combination_bias_elementwise.h"
-#include "cutlass/device_kernel.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-#include "cutlass/gemm/kernel/gemm_universal.h"
-
-#include "cutlass/gemm/kernel/default_gemm_universal.h"
-#include "cutlass/gemm/kernel/default_gemm_with_absmax.h"
-#include "cutlass/gemm/device/default_gemm_configuration.h"
-#include "cutlass/gemm/device/gemm_universal_base.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace device {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Universal GEMM with absolute-maximum calculation and scaling
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_ = ElementC_,
-    /// Operator class tag
-    typename OperatorClass_ = arch::OpClassTensorOp,
-    /// Tag indicating architecture to tune for.  This is the minimum SM that
-    /// supports the intended feature. The device kernel can be built
-    /// targeting any SM larger than this number.
-    typename ArchTag_ = arch::Sm89,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp_ = cutlass::epilogue::thread::LinearCombinationBiasElementwise<
-        ElementC_, ElementAccumulator_, ElementAccumulator_,
-        ElementC_, ElementC_, 128 / cutlass::sizeof_bits<ElementC_>::value>,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle<>,
-    /// Number of stages used in the pipelined mainloop
-    int Stages =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kStages,
-    /// Access granularity of A matrix in units of elements
-    int AlignmentA =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentA,
-    /// Access granularity of B matrix in units of elements
-    int AlignmentB =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentB,
-    /// Operation performed by GEMM
-    typename Operator_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::Operator,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA = ComplexTransform::kNone,
-    /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB = ComplexTransform::kNone
->
-class GemmUniversalWithAbsMax;
-
-// Partial specialization for SM89
-template <
-    typename ElementA_,
-    typename LayoutA_,
-    typename ElementB_,
-    typename LayoutB_,
-    typename ElementC_,
-    typename LayoutC_,
-    typename ElementAccumulator_,
-    typename ThreadblockShape_,
-    typename WarpShape_,
-    typename InstructionShape_,
-    typename EpilogueOutputOp_,
-    typename ThreadblockSwizzle_,
-    int Stages,
-    int AlignmentA,
-    int AlignmentB,
-    typename Operator_,
-    ComplexTransform TransformA,
-    ComplexTransform TransformB
->
-class GemmUniversalWithAbsMax<
-    ElementA_,
-    LayoutA_,
-    ElementB_,
-    LayoutB_,
-    ElementC_,
-    LayoutC_,
-    ElementAccumulator_,
-    arch::OpClassTensorOp,
-    arch::Sm89,
-    ThreadblockShape_,
-    WarpShape_,
-    InstructionShape_,
-    EpilogueOutputOp_,
-    ThreadblockSwizzle_,
-    Stages,
-    AlignmentA,
-    AlignmentB,
-    Operator_,
-    TransformA,
-    TransformB
-> :
-  public GemmUniversalBase<
-    typename kernel::DefaultGemmWithAbsMax<
-      ElementA_,
-      LayoutA_,
-      TransformA,
-      AlignmentA,
-      ElementB_,
-      LayoutB_,
-      TransformB,
-      AlignmentB,
-      ElementC_,
-      LayoutC_,
-      ElementAccumulator_,
-      arch::OpClassTensorOp,
-      arch::Sm89,
-      ThreadblockShape_,
-      WarpShape_,
-      InstructionShape_,
-      EpilogueOutputOp_,
-      ThreadblockSwizzle_,
-      Stages,
-      Operator_
-    >::GemmKernel
-  > {
-
- public:
-
-  using ElementAccumulator = ElementAccumulator_;
-  using OperatorClass = arch::OpClassTensorOp;
-  using ArchTag = arch::Sm89;
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using EpilogueOutputOp = EpilogueOutputOp_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  using Operator = Operator_;
-  static int const kStages = Stages;
-  static int const kAlignmentA = AlignmentA;
-  static int const kAlignmentB = AlignmentB;
-  static int const kAlignmentC = EpilogueOutputOp::kCount;
-  static ComplexTransform const kTransformA = TransformA;
-  static ComplexTransform const kTransformB = TransformB;
-
-  using Base = GemmUniversalBase<
-    typename kernel::DefaultGemmWithAbsMax<
-      ElementA_,
-      LayoutA_,
-      TransformA,
-      AlignmentA,
-      ElementB_,
-      LayoutB_,
-      TransformB,
-      AlignmentB,
-      ElementC_,
-      LayoutC_,
-      ElementAccumulator_,
-      OperatorClass,
-      ArchTag,
-      ThreadblockShape_,
-      WarpShape_,
-      InstructionShape_,
-      EpilogueOutputOp_,
-      ThreadblockSwizzle_,
-      Stages,
-      Operator_
-    >::GemmKernel
-  >;
-
-  using Arguments = typename Base::Arguments;
-  using GemmKernel = typename Base::GemmKernel;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for SM89 column-major output exchanges problem size and operand.
-template <
-    typename ElementA_,
-    typename LayoutA_,
-    typename ElementB_,
-    typename LayoutB_,
-    typename ElementC_,
-    typename ElementAccumulator_,
-    typename ThreadblockShape_,
-    typename WarpShape_,
-    typename InstructionShape_,
-    typename EpilogueOutputOp_,
-    typename ThreadblockSwizzle_,
-    int Stages,
-    int AlignmentA,
-    int AlignmentB,
-    typename Operator_,
-    ComplexTransform TransformA,
-    ComplexTransform TransformB>
-class GemmUniversalWithAbsMax<ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_,
-           layout::ColumnMajor,  // partially specialized on LayoutC
-           ElementAccumulator_, arch::OpClassTensorOp, arch::Sm89, ThreadblockShape_,
-           WarpShape_, InstructionShape_, EpilogueOutputOp_,
-           ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB,
-           Operator_, TransformA, TransformB> {
- public:
-
-  using ElementA = ElementA_;
-  using LayoutA = LayoutA_;
-  using TensorRefA = TensorRef<ElementA const, LayoutA>;
-  using ElementB = ElementB_;
-  using LayoutB = LayoutB_;
-  using TensorRefB = TensorRef<ElementB const, LayoutB>;
-  using ElementC = ElementC_;
-  using LayoutC = layout::ColumnMajor;
-  using TensorRefC = TensorRef<ElementC const, LayoutC>;
-  using TensorRefD = TensorRef<ElementC, LayoutC>;
-  using ElementAccumulator = ElementAccumulator_;
-  using OperatorClass = arch::OpClassTensorOp;
-  using ArchTag = arch::Sm89;
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using EpilogueOutputOp = EpilogueOutputOp_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  using Operator = Operator_;
-  static int const kStages = Stages;
-  static int const kAlignmentA = AlignmentA;
-  static int const kAlignmentB = AlignmentB;
-  static ComplexTransform const kTransformA = TransformA;
-  static ComplexTransform const kTransformB = TransformB;
-
-  using UnderlyingOperator = typename GemmUniversalWithAbsMax<
-    ElementB,
-    typename layout::LayoutTranspose<LayoutB>::type,
-    ElementA,
-    typename layout::LayoutTranspose<LayoutA>::type,
-    ElementC,
-    layout::RowMajor,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    kAlignmentB,
-    kAlignmentA,
-    Operator,
-    kTransformB,
-    kTransformA
-  >::Base;
-
-  using GemmKernel = typename UnderlyingOperator::GemmKernel;
-  static int const kAlignmentC = EpilogueOutputOp::kCount;
-
-  /// Argument structure
-  using Arguments = typename UnderlyingOperator::Arguments;
-
-private:
-
-  UnderlyingOperator underlying_operator_;
-
-public:
-
-  /// Constructs the GEMM.
-  GemmUniversalWithAbsMax() { }
-
-  /// Helper to construct a transposed equivalent for the underying GEMM operator
-  static Arguments to_underlying_arguments(Arguments const &args) {
-    return args.transposed_problem();
-  }
-
-  /// Determines whether the GEMM can execute the given problem.
-  static Status can_implement(Arguments const &args) {
-
-    return UnderlyingOperator::can_implement(to_underlying_arguments(args));
-  }
-
-  /// Gets the workspace size
-  static size_t get_workspace_size(Arguments const &args) {
-
-    return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args));
-  }
-
-  /// Computes the grid shape
-  static dim3 get_grid_shape(Arguments const &args) {
-    return UnderlyingOperator::get_grid_shape(to_underlying_arguments(args));
-  }
-
-  /// Computes the maximum number of active blocks per multiprocessor
-  static int maximum_active_blocks(int smem_capacity = -1) {
-    return UnderlyingOperator::maximum_active_blocks(smem_capacity);
-  }
-
-  /// Initializes GEMM state from arguments.
-  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
-
-    return underlying_operator_.initialize(to_underlying_arguments(args), workspace, stream);
-  }
-
-  /// Lightweight update given a subset of arguments
-  Status update(Arguments const &args, void *workspace = nullptr) {
-
-    return underlying_operator_.update(to_underlying_arguments(args), workspace);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status run(cudaStream_t stream = nullptr) {
-
-    return underlying_operator_.run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(cudaStream_t stream = nullptr) {
-    return run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(
-    Arguments const &args,
-    void *workspace = nullptr,
-    cudaStream_t stream = nullptr) {
-
-    Status status = initialize(args, workspace, stream);
-
-    if (status == Status::kSuccess) {
-      status = run(stream);
-    }
-
-    return status;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace device
-} // namespace gemm
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_with_broadcast.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_with_broadcast.h
deleted file mode 100755
index 809a504a7..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_with_broadcast.h
+++ /dev/null
@@ -1,386 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Template for a GEMM kernel that can broadcast bias vector in the
-           epilogue.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/epilogue/thread/linear_combination_bias_elementwise.h"
-#include "cutlass/device_kernel.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-#include "cutlass/gemm/kernel/gemm_universal.h"
-
-#include "cutlass/gemm/kernel/default_gemm_universal.h"
-#include "cutlass/gemm/kernel/default_gemm_with_broadcast.h"
-#include "cutlass/gemm/device/default_gemm_configuration.h"
-#include "cutlass/gemm/device/gemm_universal_base.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace device {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/*!
-  The universal GEMM with a broadcast epilogue.
-  Supports
-*/
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_ = ElementC_,
-    /// Operator class tag
-    typename OperatorClass_ = arch::OpClassSimt,
-    /// Tag indicating architecture to tune for.  This is the minimum SM that
-    /// supports the intended feature. The device kernel can be built
-    /// targeting any SM larger than this number.
-    typename ArchTag_ = arch::Sm70,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::InstructionShape,
-    /// Epilogue output operator      - must satisfy concept of 'EpilogueWithBroadcastOp'
-    typename EpilogueOutputOp_ = cutlass::epilogue::thread::LinearCombinationBiasElementwise<
-        ElementC_, ElementAccumulator_, ElementAccumulator_,
-        ElementC_, ElementC_, 128 / cutlass::sizeof_bits<ElementC_>::value>,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle<>,
-    /// Number of stages used in the pipelined mainloop
-    int Stages =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kStages,
-    /// Access granularity of A matrix in units of elements
-    int AlignmentA =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentA,
-    /// Access granularity of B matrix in units of elements
-    int AlignmentB =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentB,
-    /// Operation performed by GEMM
-    typename Operator_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::Operator,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA = ComplexTransform::kNone,
-    /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB = ComplexTransform::kNone
->
-class GemmUniversalWithBroadcast :
-  public GemmUniversalBase<
-    typename kernel::DefaultGemmWithBroadcast<
-      ElementA_,
-      LayoutA_,
-      TransformA,
-      AlignmentA,
-      ElementB_,
-      LayoutB_,
-      TransformB,
-      AlignmentB,
-      ElementC_,
-      LayoutC_,
-      ElementAccumulator_,
-      OperatorClass_,
-      ArchTag_,
-      ThreadblockShape_,
-      WarpShape_,
-      InstructionShape_,
-      EpilogueOutputOp_,
-      ThreadblockSwizzle_,
-      Stages,
-      Operator_
-    >::GemmKernel
-  > {
-
- public:
-
-  using ElementAccumulator = ElementAccumulator_;
-  using OperatorClass = OperatorClass_;
-  using ArchTag = ArchTag_;
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using EpilogueOutputOp = EpilogueOutputOp_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  using Operator = Operator_;
-  static int const kStages = Stages;
-  static int const kAlignmentA = AlignmentA;
-  static int const kAlignmentB = AlignmentB;
-  static int const kAlignmentC = EpilogueOutputOp::kCount;
-  static ComplexTransform const kTransformA = TransformA;
-  static ComplexTransform const kTransformB = TransformB;
-
-  using Base = GemmUniversalBase<
-    typename kernel::DefaultGemmWithBroadcast<
-      ElementA_,
-      LayoutA_,
-      TransformA,
-      AlignmentA,
-      ElementB_,
-      LayoutB_,
-      TransformB,
-      AlignmentB,
-      ElementC_,
-      LayoutC_,
-      ElementAccumulator_,
-      OperatorClass_,
-      ArchTag_,
-      ThreadblockShape_,
-      WarpShape_,
-      InstructionShape_,
-      EpilogueOutputOp_,
-      ThreadblockSwizzle_,
-      Stages,
-      Operator_
-    >::GemmKernel
-  >;
-
-  using Arguments = typename Base::Arguments;
-  using GemmKernel = typename Base::GemmKernel;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for column-major output exchanges problem size and operand.
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_,
-    /// Operator class tag
-    typename OperatorClass_,
-    /// Tag indicating architecture to tune for.  This is the minimum SM that
-    /// supports the intended feature. The device kernel can be built
-    /// targeting any SM larger than this number.
-    typename ArchTag_,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_,
-    /// Epilogue output operator
-    typename EpilogueOutputOp_,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle_,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Access granularity of A matrix in units of elements
-    int AlignmentA,
-    /// Access granularity of B matrix in units of elements
-    int AlignmentB,
-    /// Operation performed by GEMM
-    typename Operator_,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA,
-    /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB>
-class GemmUniversalWithBroadcast<ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_,
-           layout::ColumnMajor,  // partially specialized on LayoutC
-           ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_,
-           WarpShape_, InstructionShape_, EpilogueOutputOp_,
-           ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB,
-           Operator_, TransformA, TransformB> {
- public:
-
-  using ElementA = ElementA_;
-  using LayoutA = LayoutA_;
-  using TensorRefA = TensorRef<ElementA const, LayoutA>;
-  using ElementB = ElementB_;
-  using LayoutB = LayoutB_;
-  using TensorRefB = TensorRef<ElementB const, LayoutB>;
-  using ElementC = ElementC_;
-  using LayoutC = layout::ColumnMajor;
-  using TensorRefC = TensorRef<ElementC const, LayoutC>;
-  using TensorRefD = TensorRef<ElementC, LayoutC>;
-  using ElementAccumulator = ElementAccumulator_;
-  using OperatorClass = OperatorClass_;
-  using ArchTag = ArchTag_;
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using EpilogueOutputOp = EpilogueOutputOp_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  using Operator = Operator_;
-  static int const kStages = Stages;
-  static int const kAlignmentA = AlignmentA;
-  static int const kAlignmentB = AlignmentB;
-  static ComplexTransform const kTransformA = TransformA;
-  static ComplexTransform const kTransformB = TransformB;
-
-  using UnderlyingOperator = typename GemmUniversalWithBroadcast<
-    ElementB,
-    typename layout::LayoutTranspose<LayoutB>::type,
-    ElementA,
-    typename layout::LayoutTranspose<LayoutA>::type,
-    ElementC,
-    layout::RowMajor,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    kAlignmentB,
-    kAlignmentA,
-    Operator,
-    kTransformB,
-    kTransformA
-  >::Base;
-
-  using GemmKernel = typename UnderlyingOperator::GemmKernel;
-  static int const kAlignmentC = EpilogueOutputOp::kCount;
-
-  /// Argument structure
-  using Arguments = typename UnderlyingOperator::Arguments;
-
-private:
-
-  UnderlyingOperator underlying_operator_;
-
-public:
-
-  /// Constructs the GEMM.
-  GemmUniversalWithBroadcast() { }
-
-  /// Helper to construct a transposed equivalent for the underying GEMM operator
-  static Arguments to_underlying_arguments(Arguments const &args) {
-    return args.transposed_problem();
-  }
-
-  /// Determines whether the GEMM can execute the given problem.
-  static Status can_implement(Arguments const &args) {
-
-    return UnderlyingOperator::can_implement(to_underlying_arguments(args));
-  }
-
-  /// Gets the workspace size
-  static size_t get_workspace_size(Arguments const &args) {
-
-    return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args));
-  }
-
-  /// Computes the grid shape
-  static dim3 get_grid_shape(Arguments const &args) {
-    return UnderlyingOperator::get_grid_shape(to_underlying_arguments(args));
-  }
-
-  /// Computes the maximum number of active blocks per multiprocessor
-  static int maximum_active_blocks(int smem_capacity = -1) {
-    return UnderlyingOperator::maximum_active_blocks(smem_capacity);
-  }
-
-  /// Initializes GEMM state from arguments.
-  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
-
-    return underlying_operator_.initialize(to_underlying_arguments(args), workspace, stream);
-  }
-
-  /// Lightweight update given a subset of arguments
-  Status update(Arguments const &args, void *workspace = nullptr) {
-
-    return underlying_operator_.update(to_underlying_arguments(args), workspace);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status run(cudaStream_t stream = nullptr) {
-
-    return underlying_operator_.run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(cudaStream_t stream = nullptr) {
-    return run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(
-    Arguments const &args,
-    void *workspace = nullptr,
-    cudaStream_t stream = nullptr) {
-
-    Status status = initialize(args, workspace, stream);
-
-    if (status == Status::kSuccess) {
-      status = run(stream);
-    }
-
-    return status;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace device
-} // namespace gemm
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_with_k_reduction.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_with_k_reduction.h
deleted file mode 100755
index b25ae6a36..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_with_k_reduction.h
+++ /dev/null
@@ -1,415 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a GEMM kernel that can reduce one of the input matrix
-    into a vector along the K dimension.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/device_kernel.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-#include "cutlass/gemm/kernel/gemm_with_k_reduction.h"
-
-#include "cutlass/gemm/kernel/default_gemm_with_k_reduction.h"
-#include "cutlass/gemm/device/default_gemm_configuration.h"
-#include "cutlass/gemm/device/gemm_universal_base.h"
-
-#include "cutlass/layout/permute.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace device {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/*! 
-  The universal GEMM accommodates serial reductions, parallel reductions, batched strided, and 
-  batched array variants.
-*/
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_ = ElementC_,
-    /// Operator class tag
-    typename OperatorClass_ = arch::OpClassSimt,
-    /// Reduce A or B operand along the K dimension
-    bool ReduceKForA_ = true,
-    /// Tag indicating architecture to tune for.  This is the minimum SM that
-    /// supports the intended feature. The device kernel can be built
-    /// targeting any SM larger than this number.
-    typename ArchTag_ = arch::Sm70,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle<>,
-    /// Number of stages used in the pipelined mainloop
-    int Stages =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kStages,
-    /// Access granularity of A matrix in units of elements
-    int AlignmentA =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentA,
-    /// Access granularity of B matrix in units of elements
-    int AlignmentB =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentB,
-    /// Operation performed by GEMM
-    typename Operator_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::Operator,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA = ComplexTransform::kNone,
-    /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB = ComplexTransform::kNone,
-    /// Gather operand A by using an index array
-    bool GatherA = false,
-    /// Gather operand B by using an index array
-    bool GatherB = false,
-    /// Scatter result D by using an index array
-    bool ScatterD = false,
-    /// Permute result D
-    typename PermuteDLayout = layout::NoPermute
->
-class GemmWithKReduction : 
-  public GemmUniversalBase<
-    typename kernel::DefaultGemmWithKReduction<
-      ElementA_,
-      LayoutA_,
-      TransformA,
-      AlignmentA,
-      ElementB_,
-      LayoutB_,
-      TransformB,
-      AlignmentB,
-      ElementC_,
-      LayoutC_,
-      ElementAccumulator_,
-      OperatorClass_,
-      ReduceKForA_,
-      ArchTag_,
-      ThreadblockShape_,
-      WarpShape_,
-      InstructionShape_,
-      EpilogueOutputOp_,
-      ThreadblockSwizzle_,
-      Stages,
-      Operator_,
-      SharedMemoryClearOption::kNone
-    >::GemmKernel
-  > {
-
- public:
-
-  using ElementAccumulator = ElementAccumulator_;
-  using OperatorClass = OperatorClass_;
-  using ArchTag = ArchTag_;
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using EpilogueOutputOp = EpilogueOutputOp_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  using Operator = Operator_;
-  static constexpr int kStages = Stages;
-  static constexpr int kAlignmentA = AlignmentA;
-  static constexpr int kAlignmentB = AlignmentB;
-  static constexpr int kAlignmentC = EpilogueOutputOp::kCount;
-  static constexpr ComplexTransform kTransformA = TransformA;
-  static constexpr ComplexTransform kTransformB = TransformB;
-
-  using Base = GemmUniversalBase<
-    typename kernel::DefaultGemmWithKReduction<
-      ElementA_,
-      LayoutA_,
-      TransformA,
-      AlignmentA,
-      ElementB_,
-      LayoutB_,
-      TransformB,
-      AlignmentB,
-      ElementC_,
-      LayoutC_,
-      ElementAccumulator_,
-      OperatorClass_,
-      ReduceKForA_,
-      ArchTag_,
-      ThreadblockShape_,
-      WarpShape_,
-      InstructionShape_,
-      EpilogueOutputOp_,
-      ThreadblockSwizzle_,
-      Stages,
-      Operator_,
-      SharedMemoryClearOption::kNone
-    >::GemmKernel
-  >;
-
-  using Arguments = typename Base::Arguments;
-  using GemmKernel = typename Base::GemmKernel;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for column-major output exchanges problem size and operand.
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_,
-    /// Operator class tag
-    typename OperatorClass_,
-    /// Reduce A or B operand along the K dimension
-    bool ReduceKForA_,
-    /// Tag indicating architecture to tune for.  This is the minimum SM that
-    /// supports the intended feature. The device kernel can be built
-    /// targeting any SM larger than this number.
-    typename ArchTag_,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_,
-    /// Epilogue output operator
-    typename EpilogueOutputOp_,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle_,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Access granularity of A matrix in units of elements
-    int AlignmentA,
-    /// Access granularity of B matrix in units of elements
-    int AlignmentB,
-    /// Operation performed by GEMM
-    typename Operator_,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA,
-    /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB,
-    /// Gather operand A by using an index array
-    bool GatherA,
-    /// Gather operand B by using an index array
-    bool GatherB,
-    /// Scatter result D by using an index array
-    bool ScatterD,
-    /// Permute result D
-    typename PermuteDLayout
->
-class GemmWithKReduction<ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_,
-           layout::ColumnMajor,  // partially specialized on LayoutC
-           ElementAccumulator_, OperatorClass_, ReduceKForA_, ArchTag_, ThreadblockShape_,
-           WarpShape_, InstructionShape_, EpilogueOutputOp_,
-           ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB,
-           Operator_, TransformA, TransformB, GatherA, GatherB, ScatterD, PermuteDLayout> {
- public:
-
-  using ElementA = ElementA_;
-  using LayoutA = LayoutA_;
-  using TensorRefA = TensorRef<ElementA const, LayoutA>;
-  using ElementB = ElementB_;
-  using LayoutB = LayoutB_;
-  using TensorRefB = TensorRef<ElementB const, LayoutB>;
-  using ElementC = ElementC_;
-  using LayoutC = layout::ColumnMajor;
-  using TensorRefC = TensorRef<ElementC const, LayoutC>;
-  using TensorRefD = TensorRef<ElementC, LayoutC>;
-  using ElementAccumulator = ElementAccumulator_;
-  using OperatorClass = OperatorClass_;
-  using ArchTag = ArchTag_;
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using EpilogueOutputOp = EpilogueOutputOp_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  using Operator = Operator_;
-  static int const kStages = Stages;
-  static int const kAlignmentA = AlignmentA;
-  static int const kAlignmentB = AlignmentB;
-  static ComplexTransform const kTransformA = TransformA;
-  static ComplexTransform const kTransformB = TransformB;
-
-  using UnderlyingOperator = typename GemmWithKReduction< 
-    ElementB,
-    typename layout::LayoutTranspose<LayoutB>::type,
-    ElementA,
-    typename layout::LayoutTranspose<LayoutA>::type,
-    ElementC,
-    layout::RowMajor,    
-    ElementAccumulator,
-    OperatorClass,
-    !ReduceKForA_,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    kAlignmentB,
-    kAlignmentA,
-    Operator,
-    kTransformB,
-    kTransformA,
-    GatherB,
-    GatherA,
-    ScatterD,
-    PermuteDLayout
-  >::Base;
-
-  using GemmKernel = typename UnderlyingOperator::GemmKernel;
-  static int const kAlignmentC = EpilogueOutputOp::kCount;
-
-  /// Argument structure
-  using Arguments = typename UnderlyingOperator::Arguments;
-
-private:
-
-  UnderlyingOperator underlying_operator_;
-
-public:
-
-  /// Constructs the GEMM.
-  GemmWithKReduction() = default;
-
-  /// Helper to construct a transposed equivalent for the underying GEMM operator
-  static Arguments to_underlying_arguments(Arguments const &args) {
-    return args.transposed_problem();
-  }
-
-  /// Determines whether the GEMM can execute the given problem.
-  static Status can_implement(Arguments const &args) {
-
-    return UnderlyingOperator::can_implement(to_underlying_arguments(args));
-  }
-
-  /// Gets the workspace size
-  static size_t get_workspace_size(Arguments const &args) {
-    
-    return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args));
-  }
-
-  /// Computes the grid shape
-  static dim3 get_grid_shape(Arguments const &args) { 
-    return UnderlyingOperator::get_grid_shape(to_underlying_arguments(args));
-  }
-
-  /// Computes the maximum number of active blocks per multiprocessor
-  static int maximum_active_blocks(int smem_capacity = -1) {
-    return UnderlyingOperator::maximum_active_blocks(smem_capacity);
-  }
-
-  /// Initializes GEMM state from arguments.
-  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
-
-    return underlying_operator_.initialize(to_underlying_arguments(args), workspace, stream);
-  }
-
-  /// Lightweight update given a subset of arguments
-  Status update(Arguments const &args, void *workspace = nullptr) {
-
-    return underlying_operator_.update(to_underlying_arguments(args), workspace);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status run(cudaStream_t stream = nullptr) {
-
-    return underlying_operator_.run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(cudaStream_t stream = nullptr) {
-    return run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(
-    Arguments const &args, 
-    void *workspace = nullptr, 
-    cudaStream_t stream = nullptr) {
-    
-    Status status = initialize(args, workspace, stream);
-    
-    if (status == Status::kSuccess) {
-      status = run(stream);
-    }
-
-    return status;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace device
-} // namespace gemm
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemv.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemv.h
deleted file mode 100755
index 5e181743e..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/device/gemv.h
+++ /dev/null
@@ -1,182 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/device_kernel.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-#include "cutlass/gemm/kernel/gemm_universal.h"
-
-#include "cutlass/gemm/kernel/default_gemm_universal.h"
-#include "cutlass/gemm/device/default_gemm_configuration.h"
-#include "cutlass/gemm/device/gemm_universal_base.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace device {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename GemvKernel_>
-class Gemv {
-public:
-
-  using GemvKernel = GemvKernel_;
-
-
-  using ElementA = typename GemvKernel::ElementA;
-  using LayoutA  = typename GemvKernel::LayoutA;
-  using ElementB = typename GemvKernel::ElementB;
-  using ElementC = typename GemvKernel::ElementC;
-
-  using ElementAccumulator = typename GemvKernel::ElementAccumulator;
-  using EpilogueOutputOp = typename GemvKernel::EpilogueOutputOp;
-
-  static ComplexTransform const kTransformA = GemvKernel::kTransformA;
-  static ComplexTransform const kTransformB = GemvKernel::kTransformB;
-
-  static int const kThreadCount = GemvKernel::kThreadCount;
-  static int const kThreadsPerRow = GemvKernel::kThreadsPerRow;
-
-  using Arguments = typename GemvKernel::Arguments;
-  using Params = typename GemvKernel::Params;
-
-private:
-
-  Params params_;
-
-public:
-
-  /// Constructs the Gemv.
-  Gemv() { }
-
-  /// Determines whether the Gemv can execute the given problem.
-  static Status can_implement(Arguments const &args) {
-
-    return GemvKernel::can_implement(args);
-  }
-
-  /// Gets the workspace size
-  static size_t get_workspace_size(Arguments const &args) {
-    
-    return 0;
-  }
-
-  /// Computes the grid shape
-  static dim3 get_grid_shape(Arguments const &args, dim3 const &block) { 
-    if(platform::is_same<LayoutA, layout::ColumnMajor>::value) {
-      return dim3((args.problem_size.row() + (block.x - 1)) / block.x, 1, args.batch_count % 65536);
-    }
-    else {
-      return dim3((args.problem_size.row() + (block.y - 1)) / block.y, 1, args.batch_count % 65536);
-    }
-  }
-
-  /// Computes the block shape
-  static dim3 get_block_shape() { 
-    if(platform::is_same<LayoutA, layout::ColumnMajor>::value) {
-      return dim3(kThreadCount, 1, 1);
-    }
-    else {
-      return dim3(kThreadsPerRow, kThreadCount / kThreadsPerRow, 1);
-    }
-  }
-
-  /// Initializes Gemv state from arguments.
-  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
-    params_ = Params(args);
-    return Status::kSuccess;
-  }
-
-  /// Lightweight update given a subset of arguments
-  Status update(Arguments const &args, void *workspace = nullptr) {
-    return params_.update(args);    
-  }
-
-  /// Runs the kernel using initialized state.
-  Status run(cudaStream_t stream = nullptr) {
-
-    dim3 block = get_block_shape();
-    dim3 grid = get_grid_shape(params_, block);
-
-    int smem_size = int(sizeof(typename GemvKernel::SharedStorage));
-    
-    // Launch
-    cutlass::arch::synclog_setup();
-    cutlass::Kernel<GemvKernel><<<grid, block, smem_size, stream>>>(params_);
-
-    //
-    // Query for errors
-    //
-    cudaError_t result = cudaGetLastError();
-
-    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(cudaStream_t stream = nullptr) {
-    return run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(
-    Arguments const &args, 
-    void *workspace = nullptr, 
-    cudaStream_t stream = nullptr) {
-    
-    Status status = initialize(args, workspace, stream);
-    
-    if (status == Status::kSuccess) {
-      status = run(stream);
-    }
-
-    return status;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace device
-} // namespace gemm
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/rank_2k.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/rank_2k.h
deleted file mode 100755
index 296f38cad..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/device/rank_2k.h
+++ /dev/null
@@ -1,548 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a pipelined Rank2K kernel. Does not compute batching or support split-K.
-
-  
-*/
-
-#pragma once
-
-#include "cutlass/blas3.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/device_kernel.h"
-
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-#include "cutlass/gemm/kernel/rank_2k_universal.h"
-
-#include "cutlass/gemm/kernel/default_rank_2k_universal.h"
-#include "cutlass/gemm/device/default_gemm_configuration.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace device {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Fill Mode for C (kLower or kUpper)
-    FillMode FillModeC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_ = ElementC_,
-    /// Operator class tag
-    typename OperatorClass_ = arch::OpClassTensorOp,
-    /// Tag indicating architecture to tune for
-    typename ArchTag_ = arch::Sm80,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementA_, ElementC_,
-        ElementAccumulator_>::ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementA_, ElementC_,
-        ElementAccumulator_>::WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementA_, ElementC_,
-        ElementAccumulator_>::InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementA_, ElementC_,
-        ElementAccumulator_>::EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle_ =
-        typename threadblock::GemmIdentityThreadblockSwizzle<>,
-    /// Number of stages used in the pipelined mainloop
-    int Stages =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementA_,
-                                 ElementC_, ElementAccumulator_>::kStages,
-    /// Access granularity of A matrix in units of elements
-    int AlignmentA =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementA_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentA,
-
-    /// Access granularity of B matrix in units of elements
-    int AlignmentB =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementB_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentB,
-    /// If true, kernel supports split-K with serial reduction
-    bool SplitKSerial = false,
-    /// Operation performed by SYRK
-    typename Operator_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::Operator,
-    /// Complex elementwise transformation 
-    ComplexTransform TransformA = ComplexTransform::kNone,
-    /// Complex elementwise transformation 
-    ComplexTransform TransformB = ComplexTransform::kNone,
-    /// Blas3 computation mode (symmetric/hermitian)
-    BlasMode BlasMode_ = BlasMode::kSymmetric>
-class Rank2K {
- public:
-
-  using ElementA = ElementA_;
-  using LayoutA = LayoutA_;
-  using ElementB = ElementB_;
-  using LayoutB = LayoutB_;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using ElementAccumulator = ElementAccumulator_;
-  using OperatorClass = OperatorClass_;
-  using ArchTag = ArchTag_;
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using EpilogueOutputOp = EpilogueOutputOp_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  using Operator = Operator_;
-  static FillMode const kFillModeC = FillModeC;
-  static int const kStages = Stages;
-  static int const kAlignmentA = AlignmentA;
-  static int const kAlignmentB = AlignmentB;
-  static int const kAlignmentC = EpilogueOutputOp::kCount;
-  static bool const kSplitKSerial = SplitKSerial;
-  static ComplexTransform const kTransformA = TransformA;
-  static ComplexTransform const kTransformB = TransformB;
-  static BlasMode const kBlasMode = BlasMode_;
-  static int const kUpdateRank = 2;
-
-  // static asserts for rank 2k update kernel
-  static_assert(platform::is_same<LayoutA, LayoutB>::value,
-    "Rank 2K update operator support same layouts for operandA and B");
-
-  /// Define the kernel
-  using Rank2Kkernel = typename kernel::DefaultRank2KUniversal<
-    ElementA,
-    LayoutA,
-    kTransformA,
-    kAlignmentA,
-    ElementB,
-    LayoutB,
-    kTransformB,
-    kAlignmentB,
-    ElementC,
-    LayoutC,
-    kFillModeC,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    kStages,
-    kSplitKSerial,
-    Operator,
-    kBlasMode
-  >::Rank2Kkernel;
-  
-  using Arguments = typename Rank2Kkernel::Arguments;
-
-private:
-
-  /// Kernel parameters object
-  typename Rank2Kkernel::Params params_;
-public:
-
-  /// Constructs the SYRK.
-  Rank2K() { }
-
-  /// Determines whether the SYRK can execute the given problem.
-  static Status can_implement(Arguments const &args) {
-
-    if (!kSplitKSerial && args.batch_count > 1) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    Status status = Rank2Kkernel::can_implement(args);
-   
-    if (FillModeC != FillMode::kLower && FillModeC != FillMode::kUpper) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    if (status != Status::kSuccess) {
-      return status;
-    }
-
-    return Status::kSuccess;
-  }
-
-  /// Gets the workspace size
-  static size_t get_workspace_size(Arguments const &args) {
-    
-    size_t bytes = 0;
-
-    // Determine grid shape
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord tiled_shape = threadblock_swizzle.get_tiled_shape(
-      args.problem_size, 
-      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-      args.batch_count);
-    
-    if (kSplitKSerial && args.batch_count > 1) {
-
-      bytes += sizeof(int) * size_t(tiled_shape.m()) * size_t(tiled_shape.n());
-    }
-
-    return bytes;
-  }
-
-  /// Initializes SYRK state from arguments.
-  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
-    
-    // Determine grid shape
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
-      args.problem_size, 
-      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-      args.batch_count);
-
-    if (kSplitKSerial) {
-      if (args.batch_count > 1) {
-        if (!workspace) {
-          return Status::kErrorWorkspaceNull;
-        }
-
-        size_t bytes = get_workspace_size(args);
-      
-        cudaError_t result = cudaMemsetAsync(workspace, 0, bytes, stream);
-
-        if (result != cudaSuccess) {
-          return Status::kErrorInternal;
-        }
-      }
-    }
-    else {
-
-      if (args.batch_count > 1) {
-        return Status::kErrorInvalidProblem;
-      }
-    }
-    
-    int gemm_k_size = args.problem_size.k();
-
-    // Initialize the Params structure
-    params_ = typename Rank2Kkernel::Params{
-      args,
-      grid_tiled_shape,
-      gemm_k_size,
-      static_cast<int *>(workspace)
-    };
-    
-    int smem_size = int(sizeof(typename Rank2Kkernel::SharedStorage));
-    
-    if (smem_size >= (48 << 10)) {
-      cudaError_t result = cudaFuncSetAttribute(Kernel<Rank2Kkernel>,
-                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
-                                    smem_size);
-
-      if (result != cudaSuccess) {
-        return Status::kErrorInternal;
-      }
-    }
-
-    return Status::kSuccess;
-  }
-
-  /// Lightweight update given a subset of arguments
-  Status update(Arguments const &args, void *workspace = nullptr) {
-    
-    if (kSplitKSerial && args.batch_count > 1) {  
-      if (!workspace) {
-        return Status::kErrorWorkspaceNull;
-      }
-    }
-
-    size_t workspace_bytes = get_workspace_size(args);
-
-    if (workspace_bytes && !workspace) {
-      return Status::kErrorWorkspaceNull;
-    }
-
-    params_.update(args, workspace);
-
-    return Status::kSuccess;
-  }
-
-  /// Runs the kernel using initialized state.
-  Status run(cudaStream_t stream = nullptr) {
-
-    ThreadblockSwizzle threadblock_swizzle;
-
-    dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
-    dim3 block(Rank2Kkernel::kThreadCount, 1, 1);
-
-    int smem_size = int(sizeof(typename Rank2Kkernel::SharedStorage));
-
-    cutlass::arch::synclog_setup();
-    cutlass::Kernel<Rank2Kkernel><<<grid, block, smem_size, stream>>>(params_);
-
-    cudaError_t result = cudaGetLastError();
-
-    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(cudaStream_t stream = nullptr) {
-    return run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(
-    Arguments const &args, 
-    void *workspace = nullptr, 
-    cudaStream_t stream = nullptr) {
-    
-    Status status = initialize(args, workspace);
-    
-    if (status == Status::kSuccess) {
-      status = run(stream);
-    }
-
-    return status;
-  }
-};
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for column-major output exchange operand.
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Fill Mode for C (kLower or kUpper)
-    FillMode FillModeC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_,
-    /// Operator class tag
-    typename OperatorClass_,
-    /// Tag indicating architecture to tune for.  This is the minimum SM that
-    /// supports the intended feature. The device kernel can be built
-    /// targeting any SM larger than this number.
-    typename ArchTag_,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_,
-    /// Epilogue output operator
-    typename EpilogueOutputOp_,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle_,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Access granularity of A matrix in units of elements
-    int AlignmentA,
-    /// Access granularity of B matrix in units of elements
-    int AlignmentB,
-    /// If true, kernel supports split-K with serial reduction
-    bool SplitKSerial,
-    /// Operation performed by Rank2K update kernel
-    typename Operator_,
-    /// Complex elementwise transformation 
-    ComplexTransform TransformA,
-    /// Complex elementwise transformation 
-    ComplexTransform TransformB,
-    /// Blas3 computation mode (symmetric/hermitian)
-    BlasMode BlasMode_
-    >
-class Rank2K<ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_,
-           layout::ColumnMajor,  // partially specialized on LayoutC
-           FillModeC, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_,
-           WarpShape_, InstructionShape_, EpilogueOutputOp_,
-           ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB,
-           SplitKSerial, Operator_, TransformA, TransformB, BlasMode_> {
- public:
-
-  using ElementA = ElementA_;
-  using LayoutA = LayoutA_;
-  using ElementB = ElementB_;
-  using LayoutB = LayoutB_;
-  using ElementC = ElementC_;
-  using LayoutC = layout::ColumnMajor;
-  using ElementAccumulator = ElementAccumulator_;
-  using OperatorClass = OperatorClass_;
-  using ArchTag = ArchTag_;
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using EpilogueOutputOp = EpilogueOutputOp_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  using Operator = Operator_;
-  static FillMode const kFillModeC = FillModeC;
-  static int const kStages = Stages;
-  static int const kAlignmentA = AlignmentA;
-  static int const kAlignmentB = AlignmentB;
-  static int const kAlignmentC = EpilogueOutputOp::kCount;
-  static bool const kSplitKSerial = SplitKSerial;
-  static BlasMode const kBlasMode = BlasMode_;
-  static ComplexTransform const kTransformA = TransformA;
-  static ComplexTransform const kTransformB = TransformB;
-  static int const kUpdateRank = 2;
-  
-  /// Define the kernel
-  using UnderlyingOperator = typename cutlass::gemm::device::Rank2K<
-    ElementB,
-    LayoutB,
-    ElementA,
-    LayoutA,
-    ElementC,
-    layout::RowMajor,
-    InvertFillMode<FillModeC>::mode,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    kStages,
-    kAlignmentB,
-    kAlignmentA,
-    kSplitKSerial,
-    Operator,
-    kTransformA,
-    kTransformB,
-    kBlasMode
-  >;
-  
-
-  /// Argument structure
-  using Arguments = typename UnderlyingOperator::Arguments;
-  using Rank2Kkernel = typename UnderlyingOperator::Rank2Kkernel;
-
-private:
-
-  UnderlyingOperator underlying_operator_;
-
-public:
-
-  /// Constructs the Rank2K.
-  Rank2K() { }
-
-  /// Helper to construct a transposed equivalent for the underying Rank2K operator
-  static Arguments to_underlying_arguments(Arguments const &args) {
-    return args.transposed_problem();
-  }
-
-  /// Determines whether the Rank2K can execute the given problem.
-  static Status can_implement(Arguments const &args) {
-
-    return UnderlyingOperator::can_implement(to_underlying_arguments(args));
-  }
-
-  /// Gets the workspace size
-  static size_t get_workspace_size(Arguments const &args) {
-    
-    return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args));
-  }
-
-  /// Computes the grid shape
-  static dim3 get_grid_shape(Arguments const &args) { 
-    return UnderlyingOperator::get_grid_shape(to_underlying_arguments(args));
-  }
-
-  /// Computes the maximum number of active blocks per multiprocessor
-  static int maximum_active_blocks(int smem_capacity = -1) {
-    return UnderlyingOperator::maximum_active_blocks(smem_capacity);
-  }
-
-  /// Initializes Rank2K state from arguments.
-  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
-
-    return underlying_operator_.initialize(to_underlying_arguments(args), workspace, stream);
-  }
-
-  /// Lightweight update given a subset of arguments
-  Status update(Arguments const &args, void *workspace = nullptr) {
-
-    return underlying_operator_.update(to_underlying_arguments(args), workspace);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status run(cudaStream_t stream = nullptr) {
-
-    return underlying_operator_.run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(cudaStream_t stream = nullptr) {
-    return run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(
-    Arguments const &args, 
-    void *workspace = nullptr, 
-    cudaStream_t stream = nullptr) {
-    
-    Status status = initialize(args, workspace, stream);
-    
-    if (status == Status::kSuccess) {
-      status = run(stream);
-    }
-
-    return status;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace device
-} // namespace Rank2K
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/rank_2k_grouped.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/rank_2k_grouped.h
deleted file mode 100755
index 6cbebc5d7..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/device/rank_2k_grouped.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*!
-  \file
-  \brief Device-level grouped Rank2K.
-*/
-
-#pragma once
-
-#include "cutlass/gemm/device/base_grouped.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace device {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Rank2K Grouped
-template <typename Rank2Kkernel_>
-class Rank2KGrouped : public BaseGrouped<Rank2Kkernel_> {
-public:
-  using Rank2Kkernel = Rank2Kkernel_;
-  static const cutlass::FillMode kFillModeC = Rank2Kkernel::kFillModeC;
-  static const cutlass::BlasMode kBlasMode = Rank2Kkernel::kBlasMode;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace device
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/rank_k.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/rank_k.h
deleted file mode 100755
index ae18a11b8..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/device/rank_k.h
+++ /dev/null
@@ -1,510 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a pipelined RankK kernel. Does not compute batching or support split-K.
-
-  
-*/
-
-#pragma once
-
-#include "cutlass/blas3.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/device_kernel.h"
-
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-#include "cutlass/gemm/kernel/rank_k_universal.h"
-
-#include "cutlass/gemm/kernel/default_rank_k_universal.h"
-#include "cutlass/gemm/device/default_gemm_configuration.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace device {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Fill Mode for C (kLower or kUpper)
-    FillMode FillModeC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_ = ElementC_,
-    /// Operator class tag
-    typename OperatorClass_ = arch::OpClassTensorOp,
-    /// Tag indicating architecture to tune for
-    typename ArchTag_ = arch::Sm80,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementA_, ElementC_,
-        ElementAccumulator_>::ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementA_, ElementC_,
-        ElementAccumulator_>::WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementA_, ElementC_,
-        ElementAccumulator_>::InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementA_, ElementC_,
-        ElementAccumulator_>::EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle_ =
-        typename threadblock::GemmIdentityThreadblockSwizzle<>,
-    /// Number of stages used in the pipelined mainloop
-    int Stages =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementA_,
-                                 ElementC_, ElementAccumulator_>::kStages,
-    /// Access granularity of A matrix in units of elements
-    int AlignmentA =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementA_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentA,
-    /// If true, kernel supports split-K with serial reduction
-    bool SplitKSerial = false,
-    /// Operation performed by SYRK
-    typename Operator_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementA_, ElementC_,
-        ElementAccumulator_>::Operator,
-    /// Complex elementwise transformation 
-    ComplexTransform TransformA = ComplexTransform::kNone,
-    /// Blas3 computation mode (symmetric/hermitian)
-    BlasMode BlasMode_ = BlasMode::kSymmetric>
-class RankK {
- public:
-
-  using ElementA = ElementA_;
-  using LayoutA = LayoutA_;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using ElementAccumulator = ElementAccumulator_;
-  using OperatorClass = OperatorClass_;
-  using ArchTag = ArchTag_;
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using EpilogueOutputOp = EpilogueOutputOp_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  using Operator = Operator_;
-  static FillMode const kFillModeC = FillModeC;
-  static int const kStages = Stages;
-  static int const kAlignmentA = AlignmentA;
-  static int const kAlignmentC = EpilogueOutputOp::kCount;
-  static bool const kSplitKSerial = SplitKSerial;
-  static ComplexTransform const kTransformA = TransformA;
-  static BlasMode const kBlasMode = BlasMode_;
-  static int const kUpdateRank = 1;
-
-  /// Define the kernel
-  using RankKkernel = typename kernel::DefaultRankKUniversal<
-    ElementA,
-    LayoutA,
-    kTransformA,
-    kAlignmentA,
-    ElementC,
-    LayoutC,
-    kFillModeC,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    kStages,
-    kSplitKSerial,
-    Operator,
-    kBlasMode
-  >::RankKkernel;
-  
-  using Arguments = typename RankKkernel::Arguments;
-
-private:
-
-  /// Kernel parameters object
-  typename RankKkernel::Params params_;
-public:
-
-  /// Constructs the SYRK.
-  RankK() { }
-
-  /// Determines whether the SYRK can execute the given problem.
-  static Status can_implement(Arguments const &args) {
-
-    if (!kSplitKSerial && args.batch_count > 1) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    Status status = RankKkernel::can_implement(args);
-   
-    if (FillModeC != FillMode::kLower && FillModeC != FillMode::kUpper) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    if (status != Status::kSuccess) {
-      return status;
-    }
-
-    return Status::kSuccess;
-  }
-
-  /// Gets the workspace size
-  static size_t get_workspace_size(Arguments const &args) {
-    
-    size_t bytes = 0;
-
-    // Determine grid shape
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord tiled_shape = threadblock_swizzle.get_tiled_shape(
-      args.problem_size, 
-      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-      args.batch_count);
-    
-    if (kSplitKSerial && args.batch_count > 1) {
-
-      bytes += sizeof(int) * size_t(tiled_shape.m()) * size_t(tiled_shape.n());
-    }
-
-    return bytes;
-  }
-
-  /// Initializes SYRK state from arguments.
-  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
-    
-    // Determine grid shape
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
-      args.problem_size, 
-      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-      args.batch_count);
-
-    if (kSplitKSerial) {
-      if (args.batch_count > 1) {
-        if (!workspace) {
-          return Status::kErrorWorkspaceNull;
-        }
-
-        size_t bytes = get_workspace_size(args);
-      
-        cudaError_t result = cudaMemsetAsync(workspace, 0, bytes, stream);
-
-        if (result != cudaSuccess) {
-          return Status::kErrorInternal;
-        }
-      }
-    }
-    else {
-
-      if (args.batch_count > 1) {
-        return Status::kErrorInvalidProblem;
-      }
-    }
-    
-    int gemm_k_size = args.problem_size.k();
-
-    // Initialize the Params structure
-    params_ = typename RankKkernel::Params{
-      args,
-      grid_tiled_shape,
-      gemm_k_size,
-      static_cast<int *>(workspace)
-    };
-    
-    int smem_size = int(sizeof(typename RankKkernel::SharedStorage));
-    
-    if (smem_size >= (48 << 10)) {
-      cudaError_t result = cudaFuncSetAttribute(Kernel<RankKkernel>,
-                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
-                                    smem_size);
-
-      if (result != cudaSuccess) {
-        return Status::kErrorInternal;
-      }
-    }
-
-    return Status::kSuccess;
-  }
-
-  /// Lightweight update given a subset of arguments
-  Status update(Arguments const &args, void *workspace = nullptr) {
-    
-    if (kSplitKSerial && args.batch_count > 1) {  
-      if (!workspace) {
-        return Status::kErrorWorkspaceNull;
-      }
-    }
-
-    size_t workspace_bytes = get_workspace_size(args);
-
-    if (workspace_bytes && !workspace) {
-      return Status::kErrorWorkspaceNull;
-    }
-
-    params_.update(args, workspace);
-
-    return Status::kSuccess;
-  }
-
-  /// Runs the kernel using initialized state.
-  Status run(cudaStream_t stream = nullptr) {
-
-    ThreadblockSwizzle threadblock_swizzle;
-
-    dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
-    dim3 block(RankKkernel::kThreadCount, 1, 1);
-
-    int smem_size = int(sizeof(typename RankKkernel::SharedStorage));
-
-    cutlass::arch::synclog_setup();
-    cutlass::Kernel<RankKkernel><<<grid, block, smem_size, stream>>>(params_);
-
-    cudaError_t result = cudaGetLastError();
-
-    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(cudaStream_t stream = nullptr) {
-    return run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(
-    Arguments const &args, 
-    void *workspace = nullptr, 
-    cudaStream_t stream = nullptr) {
-    
-    Status status = initialize(args, workspace);
-    
-    if (status == Status::kSuccess) {
-      status = run(stream);
-    }
-
-    return status;
-  }
-};
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for column-major output exchange operand.
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Fill Mode for C (kLower or kUpper)
-    FillMode FillModeC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_,
-    /// Operator class tag
-    typename OperatorClass_,
-    /// Tag indicating architecture to tune for.  This is the minimum SM that
-    /// supports the intended feature. The device kernel can be built
-    /// targeting any SM larger than this number.
-    typename ArchTag_,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_,
-    /// Epilogue output operator
-    typename EpilogueOutputOp_,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle_,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Access granularity of A matrix in units of elements
-    int AlignmentA,
-    /// If true, kernel supports split-K with serial reduction
-    bool SplitKSerial,
-    /// Operation performed by RankK update kernel
-    typename Operator_,
-    /// Complex elementwise transformation 
-    ComplexTransform TransformA,
-    /// Blas3 computation mode (symmetric/hermitian)
-    BlasMode BlasMode_
-    >
-class RankK<ElementA_, LayoutA_, ElementC_,
-           layout::ColumnMajor,  // partially specialized on LayoutC
-           FillModeC, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_,
-           WarpShape_, InstructionShape_, EpilogueOutputOp_,
-           ThreadblockSwizzle_, Stages, AlignmentA,
-           SplitKSerial, Operator_, TransformA, BlasMode_> {
- public:
-
-  using ElementA = ElementA_;
-  using LayoutA = LayoutA_;
-  using ElementC = ElementC_;
-  using LayoutC = layout::ColumnMajor;
-  using ElementAccumulator = ElementAccumulator_;
-  using OperatorClass = OperatorClass_;
-  using ArchTag = ArchTag_;
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using EpilogueOutputOp = EpilogueOutputOp_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  using Operator = Operator_;
-  static FillMode const kFillModeC = FillModeC;
-  static int const kStages = Stages;
-  static int const kAlignmentA = AlignmentA;
-  static int const kAlignmentC = EpilogueOutputOp::kCount;
-  static bool const kSplitKSerial = SplitKSerial;
-  static BlasMode const kBlasMode = BlasMode_;
-  static int const kUpdateRank = 1;
-
-  // Complex transform for input A matrices (function on input layout)
-  static ComplexTransform const kTransformA = TransformA;
-  
-  /// Define the kernel
-  using UnderlyingOperator = typename cutlass::gemm::device::RankK<
-    ElementA,
-    LayoutA,
-    ElementC,
-    layout::RowMajor,
-    InvertFillMode<FillModeC>::mode,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    kStages,
-    kAlignmentA,
-    kSplitKSerial,
-    Operator,
-    kTransformA,
-    kBlasMode
-  >;
-  
-
-  /// Argument structure
-  using Arguments = typename UnderlyingOperator::Arguments;
-  using RankKkernel = typename UnderlyingOperator::RankKkernel;
-
-private:
-
-  UnderlyingOperator underlying_operator_;
-
-public:
-
-  /// Constructs the RankK.
-  RankK() { }
-
-  /// Helper to construct a transposed equivalent for the underying RankK operator
-  static Arguments to_underlying_arguments(Arguments const &args) {
-    return args;
-  }
-
-  /// Determines whether the RankK can execute the given problem.
-  static Status can_implement(Arguments const &args) {
-
-    return UnderlyingOperator::can_implement(to_underlying_arguments(args));
-  }
-
-  /// Gets the workspace size
-  static size_t get_workspace_size(Arguments const &args) {
-    
-    return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args));
-  }
-
-  /// Computes the grid shape
-  static dim3 get_grid_shape(Arguments const &args) { 
-    return UnderlyingOperator::get_grid_shape(to_underlying_arguments(args));
-  }
-
-  /// Computes the maximum number of active blocks per multiprocessor
-  static int maximum_active_blocks(int smem_capacity = -1) {
-    return UnderlyingOperator::maximum_active_blocks(smem_capacity);
-  }
-
-  /// Initializes RankK state from arguments.
-  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
-
-    return underlying_operator_.initialize(to_underlying_arguments(args), workspace, stream);
-  }
-
-  /// Lightweight update given a subset of arguments
-  Status update(Arguments const &args, void *workspace = nullptr) {
-
-    return underlying_operator_.update(to_underlying_arguments(args), workspace);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status run(cudaStream_t stream = nullptr) {
-
-    return underlying_operator_.run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(cudaStream_t stream = nullptr) {
-    return run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(
-    Arguments const &args, 
-    void *workspace = nullptr, 
-    cudaStream_t stream = nullptr) {
-    
-    Status status = initialize(args, workspace, stream);
-    
-    if (status == Status::kSuccess) {
-      status = run(stream);
-    }
-
-    return status;
-  }
-};
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace device
-} // namespace RankK
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/symm.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/symm.h
deleted file mode 100755
index c36ef959b..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/device/symm.h
+++ /dev/null
@@ -1,603 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a pipelined SYMM and HEMM kernels. Does not compute batching or support split-K.
-
-  
-*/
-
-#pragma once
-
-#include "cutlass/blas3.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/device_kernel.h"
-
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-#include "cutlass/gemm/kernel/symm_universal.h"
-
-#include "cutlass/gemm/kernel/default_symm_universal.h"
-#include "cutlass/gemm/device/default_gemm_configuration.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace device {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Side Mode for A (kLeft or kRight)
-    SideMode SideModeA,
-    /// Fill Mode for A (kLower or kUpper)
-    FillMode FillModeA,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_ = ElementC_,
-    /// Operator class tag
-    typename OperatorClass_ = arch::OpClassTensorOp,
-    /// Tag indicating architecture to tune for
-    typename ArchTag_ = arch::Sm80,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp_ = epilogue::thread::LinearCombination<
-      ElementC_,
-      128 / sizeof_bits<ElementC_>::value,
-      ElementAccumulator_,
-      ElementAccumulator_,
-      epilogue::thread::ScaleType::OnlyAlphaScaling
-    >,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle<>,
-    /// Number of stages used in the pipelined mainloop
-    int Stages =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kStages,
-    /// Access granularity of A matrix in units of elements
-    int AlignmentA =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentA,
-    /// Access granularity of B matrix in units of elements
-    int AlignmentB =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentB,
-    /// If true, kernel supports split-K with serial reduction
-    bool SplitKSerial = false,
-    /// Operation performed by SYMM
-    typename Operator_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::Operator,
-    /// Blas3 computation mode (symmetric/hermitian)
-    BlasMode BlasMode_ = BlasMode::kSymmetric>
-class Symm {
- public:
-
-  using ElementA = ElementA_;
-  using LayoutA = LayoutA_;
-  using ElementAKernel = typename platform::conditional<(SideModeA == SideMode::kRight), ElementB_, ElementA_>::type;
-  using LayoutAKernel = typename platform::conditional<(SideModeA == SideMode::kRight), LayoutB_, LayoutA_>::type;
-  using ElementB = ElementB_;
-  using LayoutB = LayoutB_;
-  using ElementBKernel = typename platform::conditional<(SideModeA == SideMode::kRight), ElementA_, ElementB_>::type;
-  using LayoutBKernel = typename platform::conditional<(SideModeA == SideMode::kRight), LayoutA_, LayoutB_>::type;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using ElementAccumulator = ElementAccumulator_;
-  using OperatorClass = OperatorClass_;
-  using ArchTag = ArchTag_;
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using EpilogueOutputOp = EpilogueOutputOp_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  using Operator = Operator_;
-  static SideMode const kSideModeA = SideModeA;
-  static FillMode const kFillModeA = FillModeA;
-  static int const kStages = Stages;
-  static int const kAlignmentA = AlignmentA;
-  static int const kAlignmentAKernel = (SideModeA == SideMode::kRight) ? AlignmentB : AlignmentA;
-  static int const kAlignmentB = AlignmentB;
-  static int const kAlignmentBKernel = (SideModeA == SideMode::kRight) ? AlignmentA : AlignmentB;
-  static int const kAlignmentC = EpilogueOutputOp::kCount;
-  static bool const kSplitKSerial = SplitKSerial;
-  static BlasMode const kBlasMode = BlasMode_;
-
-  // static asserts for symm update kernel
-  static_assert(platform::is_same<LayoutA, LayoutB>::value,
-    "SYMM update operator support same layouts for operand A and B");
-
-  /// Define the kernel
-  using SymmKernel = typename kernel::DefaultSymmUniversal<
-    ElementAKernel,
-    LayoutAKernel,
-    kSideModeA,
-    kFillModeA,
-    kAlignmentAKernel,
-    ElementBKernel,
-    LayoutBKernel,
-    kAlignmentBKernel,
-    ElementC,
-    LayoutC,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    kStages,
-    kSplitKSerial,
-    Operator,
-    kBlasMode
-  >::SymmKernel;
-  
-  using Arguments = typename SymmKernel::Arguments;
-
-private:
-
-  /// Kernel parameters object
-  typename SymmKernel::Params params_;
-public:
-
-  /// Constructs the SYMM.
-  Symm() { }
-
-  /// Determines whether the SYMM can execute the given problem.
-  static Status can_implement(Arguments const &args) {
-
-    if (!kSplitKSerial && args.batch_count > 1) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    Status status = SymmKernel::can_implement(args);
-
-    if (SideModeA == SideMode::kInvalid) {
-      return Status::kErrorInvalidProblem;
-    }
-   
-    if (FillModeA != FillMode::kLower && FillModeA != FillMode::kUpper) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    if (status != Status::kSuccess) {
-      return status;
-    }
-
-    return Status::kSuccess;
-  }
-
-  /// Gets the workspace size
-  static size_t get_workspace_size(Arguments const &args) {
-    
-    size_t bytes = 0;
-
-    // Determine grid shape
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord tiled_shape = threadblock_swizzle.get_tiled_shape(
-      args.problem_size, 
-      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-      args.batch_count);
-    
-    if (kSplitKSerial && args.batch_count > 1) {
-
-      bytes += sizeof(int) * size_t(tiled_shape.m()) * size_t(tiled_shape.n());
-    }
-
-    return bytes;
-  }
-
-  /// Initializes SYMM state from arguments.
-  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
-    
-    // Determine grid shape
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
-      args.problem_size, 
-      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-      args.batch_count);
-
-    if (kSplitKSerial) {
-      if (args.batch_count > 1) {
-        if (!workspace) {
-          return Status::kErrorWorkspaceNull;
-        }
-
-        size_t bytes = get_workspace_size(args);
-      
-        cudaError_t result = cudaMemsetAsync(workspace, 0, bytes, stream);
-
-        if (result != cudaSuccess) {
-          return Status::kErrorInternal;
-        }
-      }
-    }
-    else {
-
-      if (args.batch_count > 1) {
-        return Status::kErrorInvalidProblem;
-      }
-    }
-    
-    int gemm_k_size = args.problem_size.k();
-
-   // Swapping argument for A and B, if A was on the right side (problem size doesn't need to change here).
-    if (kSideModeA == SideMode::kRight) {
-      // Initialize the Params structure
-      params_ = typename SymmKernel::Params{
-        args.swapped_matrices(),
-        grid_tiled_shape,
-        gemm_k_size,
-        static_cast<int *>(workspace)
-      };
-
-      return Status::kSuccess;
-    }
-
-    // Initialize the Params structure
-    params_ = typename SymmKernel::Params{
-      args,
-      grid_tiled_shape,
-      gemm_k_size,
-      static_cast<int *>(workspace)
-    };
-    
-    return Status::kSuccess;
-  }
-
-  /// Lightweight update given a subset of arguments
-  Status update(Arguments const &args, void *workspace = nullptr) {
-    
-    if (kSplitKSerial && args.batch_count > 1) {  
-      if (!workspace) {
-        return Status::kErrorWorkspaceNull;
-      }
-    }
-
-    size_t workspace_bytes = get_workspace_size(args);
-
-    if (workspace_bytes && !workspace) {
-      return Status::kErrorWorkspaceNull;
-    }
-
-    params_.update(args, workspace);
-
-    return Status::kSuccess;
-  }
-
-  /// Runs the kernel using initialized state.
-  Status run(cudaStream_t stream = nullptr) {
-
-    ThreadblockSwizzle threadblock_swizzle;
-
-    dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
-    dim3 block(SymmKernel::kThreadCount, 1, 1);
-
-    int smem_size = int(sizeof(typename SymmKernel::SharedStorage));
-
-    if (smem_size >= (48 << 10)) {
-      cudaError_t result = cudaFuncSetAttribute(Kernel<SymmKernel>,
-                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
-                                    smem_size);
-
-      if (result != cudaSuccess) {
-        return Status::kErrorInternal;
-      }
-    }
-
-    cutlass::arch::synclog_setup();
-    cutlass::Kernel<SymmKernel><<<grid, block, smem_size, stream>>>(params_);
-
-    cudaError_t result = cudaGetLastError();
-
-    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(cudaStream_t stream = nullptr) {
-    return run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(
-    Arguments const &args, 
-    void *workspace = nullptr, 
-    cudaStream_t stream = nullptr) {
-    
-    Status status = initialize(args, workspace);
-    
-    if (status == Status::kSuccess) {
-      status = run(stream);
-    }
-
-    return status;
-  }
-};
-////////////////////////////////////////////////////////////////////////////////
-
-/********************************************************************************************************
-  SYMM/HEMM has 4 combinations based on Layouts {RowMajor, ColumnMajor} x Side mode {LeftSide, RightSide}
-  In templates and arguments to cutlass kernel, `matrix A` is always symmetric/hermitian, and `matrix B` is rectangular. 
-  (adhering to the cuBLAS convention)
-
-  Although, cuBLAS SYMM/HEMM only supports ColumnMajor layouts for all matrices (A, B, C/D).
-
-  For the mainloop and symm kernel, `A` and `B` points to left-side and right-side matrices, respectively.
-  
-  Thus, for LeftSide mode `A` and `B` points to `matrix A` and `matrix B`, respectively. While for 
-  the RightSide mode `A` and `B` points to `matrix B` and `matrix A`, respectively. 
-  
-  Additionally, CUTLASS GEMM epilogue is always RowMajor, and ColumnMajor output is achieved by 
-  transposing the GEMM problem. Thus, ColumnMajor output layout for SYMM/HEMM requires:
-   - Transposing `matrix A` and `matrix B` layouts
-   - Swapping problem size m and n values
-   - Swapping LeftSide and RightSide mode
-  
-  RowMajor output:    D = matrix A x matrix B
-  ColumnMajor output: D = matrix A x matrix B -> Transpose (D) = Transpose(matrix B) x Transpose(matrix A)
-
-  {RowMajor, ColumnMajor} x Side Mode {LeftSide, RightSide} 4 cases:
-    1.  LeftSide mode and RowMajor output (default template)
-    2.  LeftSide mode and ColumnMajor output 
-    3.  RightSide mode and RowMajor output
-    4.  RightSide mode and ColumnMajor output
-  
-  Mapping ColumnMajor output layout cases 2 and 4 to RowMajor efficient epilogue implementation:
-  
-  Case 2 -> Case 3:
-      D_col = matrix A x matrix B (LeftSide mode) 
-   => Transpose(D_col) = Transpose(matrix B) x Transpose(matrix A) (RightSide mode)
-
-  swap pointers for `A` and `B` call GEMM mainloop with RowMajor efficient-epilogue
-
-  Case 4 -> Case 1:
-      D_col = matrix B x matrix A (RightSide mode) 
-   => Transpose(D_col) = Transpose(matrix A) x Transpose(matrix B) (LeftSide mode)
-
-   call GEMM mainloop for with RowMajor efficient-epilogue
-********************************************************************************************************/
-
-/// Partial specialization for column-major output exchanges problem size and operand.
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Side Mode for A (kLeft or kRight)
-    SideMode SideModeA,
-    /// Fill Mode for A (kLower or kUpper)
-    FillMode FillModeA,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_,
-    /// Operator class tag
-    typename OperatorClass_,
-    /// Tag indicating architecture to tune for.  This is the minimum SM that
-    /// supports the intended feature. The device kernel can be built
-    /// targeting any SM larger than this number.
-    typename ArchTag_,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_,
-    /// Epilogue output operator
-    typename EpilogueOutputOp_,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle_,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Access granularity of A matrix in units of elements
-    int AlignmentA,
-    /// Access granularity of B matrix in units of elements
-    int AlignmentB,
-    /// If true, kernel supports split-K with serial reduction
-    bool SplitKSerial,
-    /// Operation performed by Symm update kernel
-    typename Operator_,
-    /// Blas3 computation mode (symmetric/hermitian)
-    BlasMode BlasMode_
-    >
-class Symm<ElementA_, LayoutA_, SideModeA, FillModeA, ElementB_, LayoutB_, ElementC_,
-           layout::ColumnMajor,  // partially specialized on LayoutC
-           ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_,
-           WarpShape_, InstructionShape_, EpilogueOutputOp_,
-           ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB,
-           SplitKSerial, Operator_, BlasMode_> {
- public:
-
-  using ElementA = ElementA_;
-  using LayoutA = LayoutA_;
-  using ElementB = ElementB_;
-  using LayoutB = LayoutB_;
-  using ElementC = ElementC_;
-  using LayoutC = layout::ColumnMajor;
-  using ElementAccumulator = ElementAccumulator_;
-  using OperatorClass = OperatorClass_;
-  using ArchTag = ArchTag_;
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using EpilogueOutputOp = EpilogueOutputOp_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  using Operator = Operator_;
-  static SideMode const kSideModeA = SideModeA;
-  static FillMode const kFillModeA = FillModeA;
-  static int const kStages = Stages;
-  static int const kAlignmentA = AlignmentA;
-  static int const kAlignmentB = AlignmentB;
-  static int const kAlignmentC = EpilogueOutputOp::kCount;
-  static bool const kSplitKSerial = SplitKSerial;
-  static BlasMode const kBlasMode = BlasMode_;
-  
-  /// Define the kernel
-  using UnderlyingOperator = typename cutlass::gemm::device::Symm<
-    ElementA,
-    typename layout::LayoutTranspose<LayoutA>::type,
-    InvertSideMode<kSideModeA>::mode,
-    InvertFillMode<kFillModeA>::mode,
-    ElementB,
-    typename layout::LayoutTranspose<LayoutB>::type, 
-    ElementC,
-    layout::RowMajor,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    kStages,
-    kAlignmentA,
-    kAlignmentB,
-    kSplitKSerial,
-    Operator,
-    kBlasMode
-  >;
-  
-
-  /// Argument structure
-  using Arguments = typename UnderlyingOperator::Arguments;
-  using SymmKernel = typename UnderlyingOperator::SymmKernel;
-
-private:
-
-  UnderlyingOperator underlying_operator_;
-
-public:
-
-  /// Constructs the Symm.
-  Symm() { }
-
-  /// Helper to construct a transposed equivalent for the underying SYMM operator
-  static Arguments to_underlying_arguments(Arguments const &args) {
-    return args.transposed_problem_size();
-  }
-
-  /// Determines whether the Symm can execute the given problem.
-  static Status can_implement(Arguments const &args) {
-
-    return UnderlyingOperator::can_implement(to_underlying_arguments(args));
-  }
-
-  /// Gets the workspace size
-  static size_t get_workspace_size(Arguments const &args) {
-    
-    return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args));
-  }
-
-  /// Computes the grid shape
-  static dim3 get_grid_shape(Arguments const &args) { 
-    return UnderlyingOperator::get_grid_shape(to_underlying_arguments(args));
-  }
-
-  /// Computes the maximum number of active blocks per multiprocessor
-  static int maximum_active_blocks(int smem_capacity = -1) {
-    return UnderlyingOperator::maximum_active_blocks(smem_capacity);
-  }
-
-  /// Initializes Symm state from arguments.
-  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
-
-    return underlying_operator_.initialize(to_underlying_arguments(args), workspace, stream);
-  }
-
-  /// Lightweight update given a subset of arguments
-  Status update(Arguments const &args, void *workspace = nullptr) {
-
-    return underlying_operator_.update(to_underlying_arguments(args), workspace);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status run(cudaStream_t stream = nullptr) {
-
-    return underlying_operator_.run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(cudaStream_t stream = nullptr) {
-    return run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(
-    Arguments const &args, 
-    void *workspace = nullptr, 
-    cudaStream_t stream = nullptr) {
-    
-    Status status = initialize(args, workspace, stream);
-    
-    if (status == Status::kSuccess) {
-      status = run(stream);
-    }
-
-    return status;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace device
-} // namespace Symm
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/device/trmm.h b/lightllm-kernel/cutlass/include/cutlass/gemm/device/trmm.h
deleted file mode 100755
index 09b9152cb..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/device/trmm.h
+++ /dev/null
@@ -1,759 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a TRMM kernel. Does not compute batching or support split-K.
-
-  
-*/
-
-#pragma once
-
-#include "cutlass/blas3.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/device_kernel.h"
-
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-#include "cutlass/gemm/kernel/trmm_universal.h"
-
-#include "cutlass/gemm/kernel/default_trmm_universal.h"
-#include "cutlass/gemm/device/default_gemm_configuration.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace device {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/*! Trmm device-level operator. This is an interface to efficient CUTLASS TRMM kernels that may
-  be invoked from host code.
-
-  The contributions of this class are:
-    
-    1. At compile time, it maps data types and high-level structural parameters onto 
-       specific CUTLASS components.
-
-    2. At runtime, it maps logical arguments to TRMM problems to kernel parameters.
-
-    3. At runtime, it launches kernels on the device.
-
-  The intent is to provide a convenient mechanism for interacting with most plausible TRMM
-  configurations for each supported architecture. Consequently, not all parameters are exposed
-  to the top-level interface. Rather, sensible defaults at each level of the CUTLASS hierarchy
-  are selected to tradeoff simplicity of the interface with flexibility. We expect 
-  most configurations to be specified at this level. Applications with more exotic requirements 
-  may construct their kernels of interest using CUTLASS components at the threadblock, warp, 
-  and thread levels of abstraction.
-
-  CUTLASS exposes computations using the functor design pattern in which objects compose some
-  internal state with an overloaded function call operator. This enables decoupling of
-  initialization from execution, possibly reducing overhead during steady state phases of
-  application execution.
-
-  CUTLASS device-level operators expose an Arguments structure encompassing each logical
-  input to the computation. This is distinct from the kernel-level Params structure pattern
-  which contains application-specific precomputed state needed by the device code.
-
-  Example of a CUTLASS TRMM operator implementing the functionality of cuBLAS's STRMM NN
-  is as follows:
-
-    //
-    // Instantiate the CUTLASS TRMM operator.
-    //
-
-    cutlass::gemm::device::Trmm<
-      float,
-      cutlass::layout::ColumnMajor,
-      cutlass::SideMode::kLeft,
-      cutlass::FillMode::kLower,
-      cutlass::DiagType::kNonUnit,
-      float,
-      cutlass::layout::ColumnMajor,
-      float,
-      cutlass::layout::ColumnMajor,
-    > trmm_op;
-
-    //
-    // Launch the TRMM operation on the device
-    //
-
-    cutlass::Status status = trmm_op({
-      cutlass::gemm::GemmUniversalMode,   // Trmm Problem Mode
-      {m, n, m/n},                        // GemmCoord problem_size (k is based on left- or right-side mode)
-      batch_count,
-      {alpha},                            // EpilogueOutputOp::Params epilogue_op_params
-      void const * ptr_A,
-      void const * ptr_B,
-      void const * ptr_C,
-      int64_t batch_stride_A,
-      int64_t batch_stride_B,
-      int64_t batch_stride_C,
-      int lda,
-      int ldb,
-      int ldc
-    });
-
-  A simplified view of the template is listed below.
-
-    template <
-      /// Element type for A matrix operand
-      typename ElementA,
-      
-      /// Layout type for A matrix operand
-      typename LayoutA,
-      
-      /// Side Mode for A (kLeft or kRight)
-      SideMode SideModeA,
-
-      /// Fill Mode for A (kLower or kUpper)
-      FillMode FillModeA,
-
-      /// DiagType for A (kNonUnit or kUnit)
-      DiagType DiagTypeA,
-
-      /// Element type for B matrix operand
-      typename ElementB,
-      
-      /// Layout type for B matrix operand
-      typename LayoutB,
-      
-      /// Element type for C and D matrix operands
-      typename ElementC,
-      
-      /// Layout type for C and D matrix operands
-      typename LayoutC,
-      
-      /// Element type for internal accumulation
-      typename ElementAccumulator,
-
-      /// Operator class tag
-      typename OperatorClass,
-      
-      /// Tag indicating architecture to tune for.  This is the minimum SM that
-      /// supports the intended feature. The device kernel can be built
-      /// targeting any SM larger than this number.
-      typename ArchTag,
-      
-      /// Threadblock-level tile size (concept: GemmShape)
-      typename ThreadblockShape,
-      
-      /// Warp-level tile size (concept: GemmShape)
-      typename WarpShape,
-      
-      /// Warp-level tile size (concept: GemmShape)
-      typename InstructionShape,
-      
-      /// Epilogue output operator
-      typename EpilogueOutputOp,
-      
-      /// Threadblock-level swizzling operator
-      typename ThreadblockSwizzle,
-      
-      /// Number of stages used in the pipelined mainloop
-      int Stages,
-
-      /// Access granularity of A matrix in units of elements
-      int AlignmentA,
-
-      /// Access granularity of B matrix in units of elements
-      int AlignmentB,
-
-      /// If true, kernel supports split-K with serial reduction
-      bool SplitKSerial,
-
-      /// Operation performed by TRMM
-      typename Operator,
-
-      /// Complex elementwise transformation on A operand
-      ComplexTransform TransformA
-    >
-    class Trmm;
-*/
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Side Mode for A 
-    SideMode SideModeA,
-    /// Fill Mode for A
-    FillMode FillModeA,
-    /// DiagType for A
-    DiagType DiagTypeA,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_ = ElementC_,
-    /// Operator class tag
-    typename OperatorClass_ = arch::OpClassTensorOp,
-    /// Tag indicating architecture to tune for
-    typename ArchTag_ = arch::Sm80,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp_ = epilogue::thread::LinearCombination<
-      ElementC_,
-      128 / sizeof_bits<ElementC_>::value,
-      ElementAccumulator_,
-      ElementAccumulator_,
-      epilogue::thread::ScaleType::OnlyAlphaScaling
-    >,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle<>,
-    /// Number of stages used in the pipelined mainloop
-    int Stages =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kStages,
-    /// Access granularity of A matrix in units of elements
-    int AlignmentA =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentA,
-    /// Access granularity of B matrix in units of elements
-    int AlignmentB =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentB,
-    /// If true, kernel supports split-K with serial reduction
-    bool SplitKSerial = false,
-    /// Operation performed by TRMM
-    typename Operator_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::Operator,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA = ComplexTransform::kNone>
-class Trmm {
- public:
-  using ElementA = ElementA_;
-  using LayoutA = LayoutA_;
-  using TensorRefA = TensorRef<ElementA const, LayoutA>;
-  using ElementAKernel = typename platform::conditional<(SideModeA == SideMode::kRight), ElementB_, ElementA_>::type;
-  using LayoutAKernel = typename platform::conditional<(SideModeA == SideMode::kRight), LayoutB_, LayoutA_>::type;
-  using ElementB = ElementB_;
-  using LayoutB = LayoutB_;
-  using TensorRefB = TensorRef<ElementB const, LayoutB>;
-  using ElementBKernel = typename platform::conditional<(SideModeA == SideMode::kRight), ElementA_, ElementB_>::type;
-  using LayoutBKernel = typename platform::conditional<(SideModeA == SideMode::kRight), LayoutA_, LayoutB_>::type;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using TensorRefC = TensorRef<ElementC const, LayoutC>;
-  using TensorRefD = TensorRef<ElementC, LayoutC>;
-  using ElementAccumulator = ElementAccumulator_;
-  using OperatorClass = OperatorClass_;
-  using ArchTag = ArchTag_;
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using EpilogueOutputOp = EpilogueOutputOp_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  using Operator = Operator_;
-  static SideMode const kSideMode = SideModeA;
-  static FillMode const kFillMode = FillModeA;
-  static DiagType const kDiagType = DiagTypeA;
-  static int const kStages = Stages;
-  static int const kAlignmentA = AlignmentA;
-  static int const kAlignmentAKernel = (SideModeA == SideMode::kRight) ? AlignmentB : AlignmentA;
-  static int const kAlignmentB = AlignmentB;
-  static int const kAlignmentBKernel = (SideModeA == SideMode::kRight) ? AlignmentA : AlignmentB;
-  static int const kAlignmentC = EpilogueOutputOp::kCount;
-  static bool const kSplitKSerial = SplitKSerial;
-  // Complex Transform don't appply to B
-  static ComplexTransform const kTransformA = TransformA; 
-  static ComplexTransform const kTransformB = ComplexTransform::kNone; 
-  static ComplexTransform const kTransformAKernel = (SideModeA == SideMode::kRight) ? 
-                                              ComplexTransform::kNone : TransformA;
-  static ComplexTransform const kTransformBKernel = (SideModeA == SideMode::kRight) ? 
-                                              TransformA : ComplexTransform::kNone;
-
-  /// Define the kernel
-  using TrmmKernel = typename kernel::DefaultTrmmUniversal<
-    ElementAKernel,
-    LayoutAKernel,
-    kTransformAKernel,
-    kAlignmentAKernel,
-    ElementBKernel,
-    LayoutBKernel,
-    kTransformBKernel,
-    kAlignmentBKernel,
-    kSideMode,
-    kFillMode,
-    kDiagType,
-    ElementC,
-    LayoutC,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    kStages,
-    kSplitKSerial,
-    Operator
-  >::TrmmKernel;
-  
-  using Arguments = typename TrmmKernel::Arguments;
-
-private:
-
-  /// Kernel parameters object
-  typename TrmmKernel::Params params_;
-public:
-
-  /// Constructs the TRMM.
-  Trmm() { }
-
-  /// Determines whether the TRMM can execute the given problem.
-  static Status can_implement(Arguments const &args) {
-
-    if (!kSplitKSerial && args.batch_count > 1) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    Status status = TrmmKernel::can_implement(args);
-   
-    if (SideModeA == SideMode::kInvalid) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    if (FillModeA == FillMode::kInvalid) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    if (DiagTypeA == DiagType::kInvalid) {
-      return Status::kErrorInvalidProblem;
-    }
-
-    if (status != Status::kSuccess) {
-      return status;
-    }
-
-    return Status::kSuccess;
-  }
-
-  /// Gets the workspace size
-  static size_t get_workspace_size(Arguments const &args) {
-    
-    size_t bytes = 0;
-
-    // Determine grid shape
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord tiled_shape = threadblock_swizzle.get_tiled_shape(
-      args.problem_size, 
-      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-      args.batch_count);
-    
-    if (kSplitKSerial && args.batch_count > 1) {
-
-      bytes += sizeof(int) * size_t(tiled_shape.m()) * size_t(tiled_shape.n());
-    }
-
-    return bytes;
-  }
-
-  /// Initializes TRMM state from arguments.
-  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
- 
-    // Determine grid shape
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
-      args.problem_size, 
-      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-      args.batch_count);
-
-    if (kSplitKSerial) {
-      if (args.batch_count > 1) {
-        if (!workspace) {
-          return Status::kErrorWorkspaceNull;
-        }
-
-        size_t bytes = get_workspace_size(args);
-      
-        cudaError_t result = cudaMemsetAsync(workspace, 0, bytes, stream);
-
-        if (result != cudaSuccess) {
-          return Status::kErrorInternal;
-        }
-      }
-    }
-    else {
-
-      if (args.batch_count > 1) {
-        return Status::kErrorInvalidProblem;
-      }
-    }
-    
-    int gemm_k_size = args.problem_size.k();
-
-   // Swapping argument for A and B, if A was on the right side (problem size doesn't need to change here).
-    if (kSideMode == SideMode::kRight) {
-      // Initialize the Params structure
-      params_ = typename TrmmKernel::Params{
-        args.swapped_matrices(),
-        grid_tiled_shape,
-        gemm_k_size,
-        static_cast<int *>(workspace)
-      };
-
-      return Status::kSuccess;
-    }
-
-    // Initialize the Params structure
-    params_ = typename TrmmKernel::Params{
-      args,
-      grid_tiled_shape,
-      gemm_k_size,
-      static_cast<int *>(workspace)
-    };
-    
-    return Status::kSuccess;
-  }
-
-  /// Lightweight update given a subset of arguments
-  Status update(Arguments const &args, void *workspace = nullptr) {
-    
-    if (kSplitKSerial && args.batch_count > 1) {  
-      if (!workspace) {
-        return Status::kErrorWorkspaceNull;
-      }
-    }
-
-    size_t workspace_bytes = get_workspace_size(args);
-
-    if (workspace_bytes && !workspace) {
-      return Status::kErrorWorkspaceNull;
-    }
-
-    params_.update(args, workspace);
-
-    return Status::kSuccess;
-  }
-
-  /// Runs the kernel using initialized state.
-  Status run(cudaStream_t stream = nullptr) {
-
-    ThreadblockSwizzle threadblock_swizzle;
-
-    dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
-    dim3 block(TrmmKernel::kThreadCount, 1, 1);
-
-    int smem_size = int(sizeof(typename TrmmKernel::SharedStorage));
-    
-    if (smem_size >= (48 << 10)) {
-      cudaError_t result = cudaFuncSetAttribute(Kernel<TrmmKernel>,
-                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
-                                    smem_size);
-
-      if (result != cudaSuccess) {
-        return Status::kErrorInternal;
-      }
-    }
-
-    cutlass::arch::synclog_setup();
-    cutlass::Kernel<TrmmKernel><<<grid, block, smem_size, stream>>>(params_);
-
-    cudaError_t result = cudaGetLastError();
-
-    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(cudaStream_t stream = nullptr) {
-    return run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(
-    Arguments const &args, 
-    void *workspace = nullptr, 
-    cudaStream_t stream = nullptr) {
-    
-    Status status = initialize(args, workspace);
-    
-    if (status == Status::kSuccess) {
-      status = run(stream);
-    }
-
-    return status;
-  }
-};
-
-/********************************************************************************************************
-  TRMM has 4 combinations based on Layouts {RowMajor, ColumnMajor} x Side mode {LeftSide, RightSide}
-  In templates and arguments to cutlass kernel, `matrix A` is always triangular, and `matrix B` is rectangular. 
-  (adhering to the cuBLAS convention)
-
-For the mainloop and trmm kernel, `A` and `B` points to left-side and right-side matrices, respectively.
-  
-  Thus, for LeftSide mode `A` and `B` points to `matrix A` and `matrix B`, respectively. While for 
-  the RightSide mode `A` and `B` points to `matrix B` and `matrix A`, respectively. 
-  
-  Additionally, CUTLASS GEMM epilogue is always RowMajor, and ColumnMajor output is achieved by 
-  transposing the GEMM problem. Thus, ColumnMajor output layout for TRMM requires:
-   - Transposing `matrix A` and `matrix B` layouts
-   - Swapping problem size m and n values
-   - Swapping LeftSide and RightSide mode
-  
-  RowMajor output:    D = matrix A x matrix B
-  ColumnMajor output: D = matrix A x matrix B -> Transpose (D) = Transpose(matrix B) x Transpose(matrix A)
-
-  {RowMajor, ColumnMajor} x Side Mode {LeftSide, RightSide} 4 cases:
-    1.  LeftSide mode and RowMajor output (default template)
-    2.  LeftSide mode and ColumnMajor output 
-    3.  RightSide mode and RowMajor output
-    4.  RightSide mode and ColumnMajor output
-  
-  Mapping ColumnMajor output layout cases 2 and 4 to RowMajor efficient epilogue implementation:
-  
-  Case 2 -> Case 3:
-      D_col = matrix A x matrix B (LeftSide mode) 
-   => Transpose(D_col) = Transpose(matrix B) x Transpose(matrix A) (RightSide mode)
-
-  swap pointers for `A` and `B` call GEMM mainloop with RowMajor efficient-epilogue
-
-  Case 4 -> Case 1:
-      D_col = matrix B x matrix A (RightSide mode) 
-   => Transpose(D_col) = Transpose(matrix A) x Transpose(matrix B) (LeftSide mode)
-
-   call GEMM mainloop for with RowMajor efficient-epilogue
-********************************************************************************************************/
-
-/// Partial specialization for column-major output exchanges problem size and operand.
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Side Mode for A 
-    SideMode SideModeA,
-    /// Fill Mode for A
-    FillMode FillModeA,
-    /// DiagType for A
-    DiagType DiagTypeA,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_,
-    /// Operator class tag
-    typename OperatorClass_,
-    /// Tag indicating architecture to tune for
-    typename ArchTag_,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_,
-    /// Epilogue output operator
-    typename EpilogueOutputOp_,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle_,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Access granularity of A matrix in units of elements
-    int AlignmentA,
-    /// Access granularity of B matrix in units of elements
-    int AlignmentB,
-    /// If true, kernel supports split-K as a serial reduction
-    bool SplitKSerial,
-    /// Operation performed by TRMM
-    typename Operator_,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA>
-class Trmm<ElementA_, LayoutA_, SideModeA, FillModeA, DiagTypeA,
-           ElementB_, LayoutB_, ElementC_,
-           layout::ColumnMajor,  // partially specialized on LayoutC
-           ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_,
-           WarpShape_, InstructionShape_, EpilogueOutputOp_,
-           ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial,
-           Operator_, TransformA> {
- public:
-
-  using ElementA = ElementA_;
-  using LayoutA = LayoutA_; 
-  using TensorRefA = TensorRef<ElementA const, LayoutA>;
-  using ElementB = ElementB_;
-  using LayoutB = LayoutB_;
-  using TensorRefB = TensorRef<ElementB const, LayoutB>;
-  using ElementC = ElementC_;
-  using LayoutC = layout::ColumnMajor;
-  using TensorRefC = TensorRef<ElementC const, LayoutC>;
-  using TensorRefD = TensorRef<ElementC, LayoutC>;
-  using ElementAccumulator = ElementAccumulator_;
-  using OperatorClass = OperatorClass_;
-  using ArchTag = ArchTag_;
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using EpilogueOutputOp = EpilogueOutputOp_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  using Operator = Operator_;
-  static SideMode const kSideMode = SideModeA;
-  static FillMode const kFillMode = FillModeA;
-  static DiagType const kDiagType = DiagTypeA;
-  // Changing SideMode as we change the layout
-  static SideMode const kSideModeT = (SideModeA == SideMode::kLeft) ?
-                                      SideMode::kRight : SideMode::kLeft;
-  // Changing FillMode as we change the layout
-  static FillMode const kFillModeT = (FillModeA == FillMode::kLower) ? 
-                                      FillMode::kUpper : FillMode::kLower;
-  static int const kStages = Stages;
-  static int const kAlignmentA = AlignmentA;
-  static int const kAlignmentB = AlignmentB;
-  static ComplexTransform const kTransformA = TransformA;
-  // Complex Transform don't appply to B
-  static ComplexTransform const kTransformB = ComplexTransform::kNone; 
-  static bool const kSplitKSerial = SplitKSerial;
-
-  using UnderlyingOperator = Trmm<
-    ElementA,
-    typename layout::LayoutTranspose<LayoutA>::type,
-    kSideModeT,
-    kFillModeT,
-    kDiagType,
-    ElementB,
-    typename layout::LayoutTranspose<LayoutB>::type, 
-    ElementC,
-    layout::RowMajor,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    kStages,
-    kAlignmentA,
-    kAlignmentB,
-    kSplitKSerial,
-    Operator,
-    TransformA
-  >;
-
-  using Arguments = typename UnderlyingOperator::Arguments;
-  using TrmmKernel = typename UnderlyingOperator::TrmmKernel;
-  static int const kAlignmentC = UnderlyingOperator::kAlignmentC;
-
-private:
-
-  UnderlyingOperator underlying_operator_;
-
-public:
-
-  /// Constructs the TRMM.
-  Trmm() { }
-
-  /// Helper to construct a transposed equivalent for the underying TRMM operator which is identical
-  static Arguments to_underlying_arguments(Arguments const &args) {
-    return args.transposed_problem_size();
-  }
-
-  /// Determines whether the TRMM can execute the given problem.
-  static Status can_implement(Arguments const &args) {
-
-    return UnderlyingOperator::can_implement(to_underlying_arguments(args));
-  }
-
-  /// Gets the workspace size
-  static size_t get_workspace_size(Arguments const &args) {
-    
-    return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args));
-  }
-
-  /// Initializes TRMM state from arguments.
-  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
-
-    return underlying_operator_.initialize(to_underlying_arguments(args), workspace, stream);
-  }
-
-  /// Lightweight update given a subset of arguments
-  Status update(Arguments const &args, void *workspace = nullptr) {
-
-    return underlying_operator_.update(to_underlying_arguments(args), workspace);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status run(cudaStream_t stream = nullptr) {
-
-    return underlying_operator_.run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(cudaStream_t stream = nullptr) {
-    return run(stream);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(
-    Arguments const &args, 
-    void *workspace = nullptr, 
-    cudaStream_t stream = nullptr) {
-   
-    Status status = initialize(args, workspace, stream);
-    
-    if (status == Status::kSuccess) {
-      status = run(stream);
-    }
-
-    return status;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace device
-} // namespace gemm
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/dispatch_policy.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/dispatch_policy.hpp
deleted file mode 100755
index 904e6af3c..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/dispatch_policy.hpp
+++ /dev/null
@@ -1,324 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/arch/arch.h"
-#include "cutlass/gemm/gemm.h"
-
-#include "cute/layout.hpp"
-#include "cute/numeric/integral_constant.hpp" // cute::false_type
-//////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::detail {
-
-template <class T, template <int...> class U>
-struct is_kernel_tag_of : cute::false_type {};
-
-template <template <int...> class U, int... Args>
-struct is_kernel_tag_of<U<Args...>, U> : cute::true_type {};
-
-template <class T, template <int...> class U>
-constexpr bool is_kernel_tag_of_v = is_kernel_tag_of<T, U>::value;
-
-template <class T, template <int,bool> class U>
-struct is_asymmetric_dma_kernel_tag_of : cute::false_type {};
-
-template <template <int, bool> class U, int I0, bool B0>
-struct is_asymmetric_dma_kernel_tag_of<U<I0, B0>, U> : cute::true_type {};
-
-template <class T, template <int, bool> class U>
-constexpr bool is_asymmetric_dma_kernel_tag_of_v = \
-                              is_asymmetric_dma_kernel_tag_of<T, U>::value;
-
-}
-
-//////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm {
-using namespace cute;
-
-//////////////////////////////////////////////////////////////////////////////
-
-namespace detail {
-
-enum class KernelInputTransformType {
-    FastF32,
-    InterleavedComplexTF32
-};
-
-} // namespace detail
-
-//////////////////////////////////////////////////////////////////////////////
-
-namespace kernel::detail {
-
-// Has_SwapAB<T>::value will be true only if:
-//   class T has member SwapAB and T::SwapAB is true
-template <typename T, typename = void>
-struct Has_SwapAB { static constexpr bool value = false; };
-
-template <typename T>
-struct Has_SwapAB <T, CUTE_STL_NAMESPACE::void_t<decltype(T::SwapAB)>>
-{ static constexpr bool value = T::SwapAB; };
-
-template <typename T>
-static constexpr bool Has_SwapAB_v = Has_SwapAB<T>::value;
-
-} // namespace kernel::detail
-
-//////////////////////////////////////////////////////////////////////////////
-
-//
-// Kernel schedule policies (the base class tags, one for each kernel layer file)
-//
-struct KernelMultistage { };
-struct KernelCpAsyncWarpSpecialized { };
-struct KernelCpAsyncWarpSpecializedPingpong { };
-struct KernelCpAsyncWarpSpecializedCooperative { };
-struct KernelTma { };
-struct KernelTmaWarpSpecialized { };
-struct KernelTmaWarpSpecializedPingpong { };
-struct KernelTmaWarpSpecializedCooperative { 
-};
-
-struct KernelPtrArrayTmaWarpSpecializedCooperative { };
-struct KernelPtrArrayTmaWarpSpecializedPingpong { };
-
-//////////////////////////////////////////////////////////////////////////////
-
-//
-// Builder dispatch policies (not a part of the main CUTLASS layers, simply used to opt into
-// specific collective builder dispatches)
-//
-
-// FP8 related policies (including Fast Accumulation)
-struct KernelTmaWarpSpecializedFP8FastAccum : KernelTmaWarpSpecialized { };
-struct KernelTmaWarpSpecializedPingpongFP8FastAccum : KernelTmaWarpSpecializedPingpong { };
-struct KernelTmaWarpSpecializedCooperativeFP8FastAccum: KernelTmaWarpSpecializedCooperative { };
-struct KernelPtrArrayTmaWarpSpecializedCooperativeFP8FastAccum : KernelPtrArrayTmaWarpSpecializedCooperative { };
-struct KernelPtrArrayTmaWarpSpecializedPingpongFP8FastAccum : KernelPtrArrayTmaWarpSpecializedPingpong { };
-
-// Policies to opt into mixed type GEMMs
-struct KernelTmaWarpSpecializedMixedInput : KernelTmaWarpSpecialized { };
-struct KernelTmaWarpSpecializedPingpongMixedInput : KernelTmaWarpSpecializedPingpong { };
-struct KernelTmaWarpSpecializedCooperativeMixedInput: KernelTmaWarpSpecializedCooperative { };
-
-//////////////////////////////////////////////////////////////////////////////
-
-// Policies for dispatch of epilogue
-struct EpilogueDefault { };
-struct EpilogueTransposed { };
-
-//////////////////////////////////////////////////////////////////////////////
-
-//
-// Collective Mainloop Policies
-//
-
-// 2 stage pipeline through 1 stage in smem, 1 in rmem, WITHOUT predicated gmem loads
-struct MainloopSm70TwoStageUnpredicated {
-  constexpr static int Stages = 2;
-  using ArchTag = arch::Sm70;
-  using Schedule = KernelMultistage;
-  using ClusterShape = Shape<_1,_1,_1>;
-};
-
-// 2 stage pipeline through 1 stage in smem, 1 in rmem, with predicated gmem loads
-struct MainloopSm70TwoStage {
-  constexpr static int Stages = 2;
-  using ArchTag = arch::Sm70;
-  using Schedule = KernelMultistage;
-  using ClusterShape = Shape<_1,_1,_1>;
-};
-
-// n-buffer in smem (cp.async), pipelined with registers, WITHOUT predicated gmem loads
-template<int Stages_>
-struct MainloopSm80CpAsyncUnpredicated {
-  constexpr static int Stages = Stages_;
-  using ArchTag = arch::Sm80;
-  using Schedule = KernelMultistage;
-  using ClusterShape = Shape<_1,_1,_1>;
-};
-
-// n-buffer in smem (cp.async), pipelined with registers, with predicated gmem loads
-template<
-  int Stages_,
-  class ClusterShape_ = Shape<_1,_1,_1>
->
-struct MainloopSm80CpAsync {
-  constexpr static int Stages = Stages_;
-  using ArchTag = cute::conditional_t<(size(ClusterShape_{}) > 1), arch::Sm90, arch::Sm80>;
-  using Schedule = KernelMultistage;
-  using ClusterShape = ClusterShape_;
-};
-
-// n-buffer in smem (cp.async), pipelined with Hopper GMMA, with predicated gmem loads, warp specialized dynamic schedule
-template<
-  int Stages_,
-  class ClusterShape_ = Shape<_1,_1,_1>,
-  class KernelSchedule = KernelCpAsyncWarpSpecialized
->
-struct MainloopSm90CpAsyncGmmaWarpSpecialized {
-  constexpr static int Stages = Stages_;
-  using ClusterShape = ClusterShape_;
-  using ArchTag = arch::Sm90;
-  using Schedule = KernelSchedule;
-};
-
-// n-buffer in smem (cp.async), pipelined with Hopper GMMA, with predicated gmem loads, warp specialized dynamic schedule
-template<
-  int Stages_,
-  class ClusterShape_ = Shape<_1,_1,_1>,
-  class KernelSchedule = KernelCpAsyncWarpSpecialized
->
-struct MainloopSm90CpAsyncGmmaRmemAWarpSpecialized {
-  constexpr static int Stages = Stages_;
-  using ClusterShape = ClusterShape_;
-  using ArchTag = arch::Sm90;
-  using Schedule = KernelSchedule;
-};
-
-// n-buffer in smem (Hopper TMA), pipelined with Hopper GMMA and TMA, static schedule between TMA and GMMA
-template<
-  int Stages_,
-  class ClusterShape_ = Shape<_1,_1,_1>,
-  int PipelineAsyncMmaStages_ = 1
->
-struct MainloopSm90TmaGmma {
-  constexpr static int Stages = Stages_;
-  using ClusterShape = ClusterShape_;
-  constexpr static int PipelineAsyncMmaStages = PipelineAsyncMmaStages_;
-  using ArchTag = arch::Sm90;
-  using Schedule = KernelTma;
-};
-
-// n-buffer in smem (Hopper TMA), pipelined with Hopper GMMA and TMA, Warp specialized dynamic schedule
-template<
-  int Stages_,
-  class ClusterShape_ = Shape<_1,_1,_1>,
-  class KernelSchedule = KernelTmaWarpSpecializedCooperative
->
-struct MainloopSm90TmaGmmaWarpSpecialized {
-  constexpr static int Stages = Stages_;
-  using ClusterShape = ClusterShape_;
-  using ArchTag = arch::Sm90;
-  using Schedule = KernelSchedule;
-};
-
-// n-buffer in smem (Hopper TMA), pipelined with Hopper GMMA and TMA, Warp specialized dynamic schedule
-// With GMMA's A data from registers.
-template<
-  int Stages_,
-  class ClusterShape_ = Shape<_1,_1,_1>,
-  class KernelSchedule = KernelTmaWarpSpecialized
->
-struct MainloopSm90TmaGmmaRmemAWarpSpecialized {
-  constexpr static int Stages = Stages_;
-  using ClusterShape = ClusterShape_;
-  using ArchTag = arch::Sm90;
-  using Schedule = KernelSchedule;
-  static_assert(
-    cute::is_same_v<Schedule, KernelTmaWarpSpecialized> ||
-    cute::is_same_v<Schedule, KernelTmaWarpSpecializedPingpong> ||
-    cute::is_same_v<Schedule, KernelTmaWarpSpecializedCooperative>,
-    "KernelSchedule must be one of the warp specialized policies");
-};
-
-template<
-  int Stages_,
-  class ClusterShape_ = Shape<_1,_1,_1>,
-  class KernelSchedule = KernelTmaWarpSpecialized
->
-struct MainloopSm90TmaGmmaRmemAWarpSpecializedMixedInput {
-  constexpr static int Stages = Stages_;
-  using ClusterShape = ClusterShape_;
-  using ArchTag = arch::Sm90;
-  using Schedule = KernelSchedule;
-  static_assert(
-    cute::is_same_v<Schedule, KernelTmaWarpSpecialized> ||
-    cute::is_same_v<Schedule, KernelTmaWarpSpecializedMixedInput> ||
-    cute::is_same_v<Schedule, KernelTmaWarpSpecializedPingpong> ||
-    cute::is_same_v<Schedule, KernelTmaWarpSpecializedPingpongMixedInput> ||
-    cute::is_same_v<Schedule, KernelTmaWarpSpecializedCooperative> ||
-    cute::is_same_v<Schedule, KernelTmaWarpSpecializedCooperativeMixedInput>,
-    "KernelSchedule must be one of the warp specialized policies");
-};
-
-// n-buffer in smem (Hopper TMA), pipelined with Hopper GMMA and TMA, Warp specialized dynamic schedule
-// For FP8 kernels
-template<
-  int Stages_,
-  class ClusterShape_ = Shape<_1,_1,_1>,
-  class KernelSchedule = KernelTmaWarpSpecialized
->
-struct MainloopSm90TmaGmmaWarpSpecializedFP8
-  : MainloopSm90TmaGmmaWarpSpecialized<Stages_, ClusterShape_, KernelSchedule> {
-  static_assert(
-    cute::is_same_v<KernelSchedule, KernelTmaWarpSpecialized> ||
-    cute::is_same_v<KernelSchedule, KernelTmaWarpSpecializedPingpong> ||
-    cute::is_same_v<KernelSchedule, KernelTmaWarpSpecializedCooperative>,
-    "KernelSchedule must be one of the warp specialized policies");
-};
-
-// n-buffer in smem (Hopper TMA), pipelined with Hopper GMMA and TMA, Warp specialized dynamic schedule for Ptr-Array and Grouped Gemm
-template<
-  int Stages_,
-  class ClusterShape_ = Shape<_1,_1,_1>,
-  class KernelSchedule = KernelPtrArrayTmaWarpSpecializedCooperative
->
-struct MainloopSm90ArrayTmaGmmaWarpSpecialized {
-  constexpr static int Stages = Stages_;
-  using ClusterShape = ClusterShape_;
-  using ArchTag = arch::Sm90;
-  using Schedule = KernelSchedule;
-  static_assert(
-    cute::is_base_of_v<KernelPtrArrayTmaWarpSpecializedCooperative, KernelSchedule> ||
-    cute::is_base_of_v<KernelPtrArrayTmaWarpSpecializedPingpong, KernelSchedule>,
-    "KernelSchedule must be one of the Ptr-Array or Grouped Gemm TMA Warp Specialized Cooperative or Pingpong policies");
-};
-
-// n-buffer in smem (Hopper TMA), pipelined with Hopper sparse GMMA and TMA, Warp specialized dynamic schedule
-template<
-  int Stages_,
-  class ClusterShape_ = Shape<_1,_1,_1>,
-  class KernelSchedule = KernelTmaWarpSpecializedCooperative
->
-struct MainloopSm90TmaGmmaWarpSpecializedSparse {
-  constexpr static int Stages = Stages_;
-  using ClusterShape = ClusterShape_;
-  using ArchTag = arch::Sm90;
-  using Schedule = KernelSchedule;
-};
-
-//////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::gemm
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/gemm.h b/lightllm-kernel/cutlass/include/cutlass/gemm/gemm.h
deleted file mode 100755
index ac288e3e8..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/gemm.h
+++ /dev/null
@@ -1,133 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines common types used for all GEMM-like operators.
-*/
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/coord.h"
-#include "cutlass/gemm_coord.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/gemm/gemm_enumerated_types.h"
-#include "cute/layout.hpp"
-#include "cutlass/detail/layout.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-using cutlass::detail::TagToStrideA;
-using cutlass::detail::TagToStrideB;
-using cutlass::detail::TagToStrideC;
-using cutlass::detail::TagToStrideA_t;
-using cutlass::detail::TagToStrideB_t;
-using cutlass::detail::TagToStrideC_t;
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace detail {
-
-using cutlass::detail::StrideToLayoutTagA;
-using cutlass::detail::StrideToLayoutTagB;
-using cutlass::detail::StrideToLayoutTagC;
-using cutlass::detail::StrideToLayoutTagA_t;
-using cutlass::detail::StrideToLayoutTagB_t;
-using cutlass::detail::StrideToLayoutTagC_t;
-
-template<int ModeIndex, class Stride>
-constexpr bool
-is_major(Stride = {}) {
-  return ::cutlass::detail::is_major<ModeIndex>(Stride{});
-}
-
-template<class Stride>
-constexpr bool
-is_mn_major() {
-  return is_major<0,Stride>();
-}
-
-template<class Stride>
-constexpr
-bool
-is_k_major() {
-  return is_major<1,Stride>();
-}
-
-template<class LayoutA>
-constexpr bool
-is_mn_major_A() {
-  return is_mn_major<TagToStrideA_t<LayoutA>>();
-}
-
-template<class LayoutB>
-constexpr bool
-is_mn_major_B() {
-  return is_mn_major<TagToStrideB_t<LayoutB>>();
-}
-
-template<class LayoutA>
-constexpr bool
-is_k_major_A() {
-  return is_k_major<TagToStrideA_t<LayoutA>>();
-}
-
-template<class LayoutB>
-constexpr bool
-is_k_major_B() {
-  return is_k_major<TagToStrideB_t<LayoutB>>();
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-// The following two metafunctions are used to detect whether a `kernel::Gemm` or `kernel::GemmUniversal`
-// is implementing the CUTLASS 3.x API or not, by checking if the problem shape type is aliased within or not.
-template <class GemmKernel, class = void>
-struct IsCutlass3GemmKernel : cute::false_type { };
-
-template <typename GemmKernel>
-struct IsCutlass3GemmKernel<GemmKernel, cute::void_t<typename GemmKernel::ProblemShape>>
-    : cute::true_type { };
-
-///////////////////////////////////////////////////////////////////////////////
-
-} // namespace detail
-
-///////////////////////////////////////////////////////////////////////////////
-
-} // namespace gemm
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/gemm_enumerated_types.h b/lightllm-kernel/cutlass/include/cutlass/gemm/gemm_enumerated_types.h
deleted file mode 100755
index 66aae898d..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/gemm_enumerated_types.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines common types used for all GEMM-like operators.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/coord.h"
-#include "cutlass/gemm_coord.h"
-#include "cutlass/layout/matrix.h"
-
-namespace cutlass {
-namespace gemm {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// GEMM operand enumeration: D = A * B + C
-enum class Operand {
-  kA, /// A multiplicand
-  kB, /// B multiplicand
-  kC, /// Source accumulator
-  kD  /// Destination accumulator
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-enum class GemmUniversalMode {
-  kGemm,
-  kGemmSplitKParallel,
-  kBatched,
-  kArray,
-  kGrouped,
-  kInvalid
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Some options for clearing shared memory
-enum class SharedMemoryClearOption {
-  kNone,            ///< SMEM is in don't-care state
-  kZfill,           ///< Kernels fill out of bounds accesses with zeros
-  kClearLastStage   ///< Last SMEM stage is explicitly cleared. Mainloop uses 'kNone'
-};
-
-/////////////////////////////////////////////////////////////////////////
-
-} // namespace gemm
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/group_array_problem_shape.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/group_array_problem_shape.hpp
deleted file mode 100755
index 4a90a1d06..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/group_array_problem_shape.hpp
+++ /dev/null
@@ -1,123 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief This file contains definitions and utility functions for describing problem shapes 
-           for 3.x Ptr-Array GEMMs and Grouped GEMMs.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/tensor_coord.h"
-
-#include "cute/container/array.hpp"
-
-#if ! defined(__CUDACC_RTC__)
-#include <initializer_list>
-#endif
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm {
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <class ProblemShape_>
-struct GroupProblemShape {
-  using UnderlyingProblemShape = ProblemShape_;
-  int32_t num_groups = 1;
-  UnderlyingProblemShape* problem_shapes = nullptr;
-  UnderlyingProblemShape const* host_problem_shapes = nullptr;
-
-  CUTLASS_HOST_DEVICE
-  int32_t groups() const { return num_groups; }
-
-  CUTLASS_HOST_DEVICE
-  UnderlyingProblemShape const
-  get_problem_shape(int32_t group_idx) const {
-    return problem_shapes[group_idx];
-  }
-
-  CUTLASS_HOST_DEVICE
-  UnderlyingProblemShape const
-  get_host_problem_shape(int32_t group_idx) const {
-    return host_problem_shapes[group_idx];
-  }
-
-  CUTLASS_HOST_DEVICE
-  bool
-  is_host_problem_shape_available() {
-    return host_problem_shapes != nullptr;
-  }
-};
-
-template <class ProblemShape_>
-class ArrayProblemShape {
-public:
-  using UnderlyingProblemShape = ProblemShape_;
-
-  ArrayProblemShape() = default;
-  ArrayProblemShape(UnderlyingProblemShape ps) : problem_shape_(ps) {}
-
-  // Num of groups for Ptr-Array GEMM always remain one, just the number of batches (l) can vary
-  // This is just to maintain uniformity with GroupProblemShape
-  constexpr int32_t groups() const { return 1; }
-
-  UnderlyingProblemShape* problem_shapes() const {
-    return &problem_shape_;
-  }
-  UnderlyingProblemShape const* host_problem_shapes() const {
-    return &problem_shape_;
-  }
-
-  // This is just to maintain uniformity with GroupProblemShape
-  CUTLASS_HOST_DEVICE
-  UnderlyingProblemShape const
-  get_problem_shape(int32_t /* unused */ = 0) const {
-    return problem_shape_;
-  }
-
-  CUTLASS_HOST_DEVICE
-  UnderlyingProblemShape const
-  get_host_problem_shape(int32_t /* unused */ = 0) const {
-    return problem_shape_;
-  }
-
-  CUTLASS_HOST_DEVICE
-  bool
-  is_host_problem_shape_available() {
-    return true;
-  }
-private:
-  UnderlyingProblemShape problem_shape_{};
-};
-
-} // namespace cutlass::gemm 
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_ell_gemm.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_ell_gemm.h
deleted file mode 100755
index 49f9eef33..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_ell_gemm.h
+++ /dev/null
@@ -1,837 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Default kernel-level Blocked-Ell sparse gemm operators.
-      This operator combines threadblock-scoped ELL MMA
-      with the appropriate threadblock-scoped epilogue.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/layout/matrix.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/wmma.h"
-
-#include "cutlass/epilogue/threadblock/epilogue.h"
-#include "cutlass/epilogue/thread/linear_combination.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/kernel/gemm.h"
-#include "cutlass/gemm/kernel/gemm_pipelined.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
-#include "cutlass/gemm/threadblock/default_mma.h"
-#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-
-#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
-#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
-#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
-#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
-
-#if defined(CUTLASS_ARCH_WMMA_ENABLED)
-#include "cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h"
-#endif //CUTLASS_ARCH_WMMA_ENABLED
-
-#include "cutlass/gemm/kernel/ell_gemm.h"
-#include "cutlass/gemm/threadblock/default_ell_mma.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Sparse matrix is A or not
-    bool IsASparse>
-struct DefaultEllGemm;
-
-////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Ampere Architecture
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Sparse matrix is A or not
-    bool IsASparse
->
-struct DefaultEllGemm<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, ElementC,
-                   layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
-                   arch::Sm80, ThreadblockShape, WarpShape, InstructionShape,
-                   EpilogueOutputOp, ThreadblockSwizzle, Stages, SplitKSerial,
-                   Operator, IsASparse> {
-  /// Define the threadblock-scoped matrix multiply-accumulate
-  using Mma = typename cutlass::gemm::threadblock::DefaultEllMma<
-      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
-      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80,
-      ThreadblockShape, WarpShape, InstructionShape, Stages,
-      Operator>::ThreadblockMma;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  /// Define the epilogue
-  using Epilogue =
-      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
-          ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
-          EpilogueOutputOp::kCount>::Epilogue;
-
-  /// Define the kernel-level GEMM operator.
-  using GemmKernel = kernel::EllGemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial, IsASparse>;
-};
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Turing Architecture
-template <
-  /// Element type for A matrix operand
-  typename ElementA,
-  /// Layout type for A matrix operand
-  typename LayoutA,
-  /// Access granularity of A matrix in units of elements
-  int kAlignmentA,
-  /// Element type for B matrix operand
-  typename ElementB,
-  /// Layout type for B matrix operand
-  typename LayoutB,
-  /// Access granularity of B matrix in units of elements
-  int kAlignmentB,
-  /// Element type for C and D matrix operands
-  typename ElementC,
-  /// Element type for internal accumulation
-  typename ElementAccumulator,
-  /// Threadblock-level tile size (concept: GemmShape)
-  typename ThreadblockShape,
-  /// Warp-level tile size (concept: GemmShape)
-  typename WarpShape,
-  /// Warp-level tile size (concept: GemmShape)
-  typename InstructionShape,
-  /// Epilogue output operator
-  typename EpilogueOutputOp,
-  /// Threadblock-level swizzling operator
-  typename ThreadblockSwizzle,
-  /// If true, kernel is configured to support serial reduction in the epilogue
-  bool SplitKSerial,
-  /// Operation performed by GEMM
-  typename Operator,
-  /// Sparse matrix is A or not
-  bool IsASparse
->
-struct DefaultEllGemm<
-  ElementA, LayoutA, kAlignmentA,
-  ElementB, LayoutB, kAlignmentB,
-  ElementC, layout::RowMajor,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  arch::Sm75,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  SplitKSerial,
-  Operator,
-  IsASparse
-> {
-
-  /// Define the threadblock-scoped matrix multiply-accumulate
-  using Mma = typename cutlass::gemm::threadblock::DefaultEllMma<
-    ElementA,
-    LayoutA,
-    kAlignmentA,
-    ElementB,
-    LayoutB,
-    kAlignmentB,
-    ElementAccumulator,
-    layout::RowMajor,
-    arch::OpClassTensorOp,
-    arch::Sm75,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    2,
-    Operator
-  >::ThreadblockMma;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  /// Define the epilogue
-  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
-    ThreadblockShape,
-    typename Mma::Operator,
-    kPartitionsK,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  /// Define the kernel-level GEMM operator.
-  using GemmKernel = kernel::EllGemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial, IsASparse>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Ampere Integer Matrix Multiply Interleaved layout
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Number of Interleaved k
-    int InterleavedK,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Sparse matrix is A or not
-    bool IsASparse>
-struct DefaultEllGemm<
-    ElementA, layout::ColumnMajorInterleaved<InterleavedK>, kAlignmentA,
-    ElementB, layout::RowMajorInterleaved<InterleavedK>, kAlignmentB, ElementC,
-    layout::ColumnMajorInterleaved<InterleavedK>, int32_t,
-    arch::OpClassTensorOp, arch::Sm80, ThreadblockShape, WarpShape,
-    InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages,
-    SplitKSerial, Operator, IsASparse> {
-  using LayoutA = layout::ColumnMajorInterleaved<InterleavedK>;
-  using LayoutB = layout::RowMajorInterleaved<InterleavedK>;
-  using LayoutC = layout::ColumnMajorInterleaved<InterleavedK>;
-
-  using ElementAccumulator = int32_t;
-
-  /// Define the threadblock-scoped matrix multiply-accumulate
-  using Mma = typename cutlass::gemm::threadblock::DefaultEllMma<
-      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
-      ElementAccumulator, LayoutC, arch::OpClassTensorOp, arch::Sm80,
-      ThreadblockShape, WarpShape, InstructionShape, Stages, Operator,
-      true>::ThreadblockMma;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  /// Define the epilogue
-  using Epilogue = typename cutlass::epilogue::threadblock::
-      DefaultInterleavedEpilogueTensorOp<
-          ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
-          64 / sizeof_bits<ElementC>::value, InterleavedK>::Epilogue;
-
-  /// Define the kernel-level GEMM operator.
-  using GemmKernel = kernel::EllGemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial, IsASparse>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Turing Integer Matrix Multiply Interleaved layout
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of Interleaved k
-    int InterleavedK,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Sparse matrix is A or not
-    bool IsASparse>
-struct DefaultEllGemm<ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
-                   kAlignmentA, ElementB,
-                   layout::RowMajorInterleaved<InterleavedK>, kAlignmentB,
-                   ElementC, layout::ColumnMajorInterleaved<InterleavedK>,
-                   int32_t, arch::OpClassTensorOp, arch::Sm75, ThreadblockShape,
-                   WarpShape, InstructionShape, EpilogueOutputOp,
-                   ThreadblockSwizzle, 2, SplitKSerial, Operator, IsASparse> {
-  using LayoutA = layout::ColumnMajorInterleaved<InterleavedK>;
-  using LayoutB = layout::RowMajorInterleaved<InterleavedK>;
-  using LayoutC = layout::ColumnMajorInterleaved<InterleavedK>;
-
-  using ElementAccumulator = int32_t;
-
-  /// Define the threadblock-scoped matrix multiply-accumulate
-  using Mma = typename cutlass::gemm::threadblock::DefaultEllMma<
-      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, ElementAccumulator, LayoutC,
-      arch::OpClassTensorOp, arch::Sm75, ThreadblockShape, WarpShape,
-      InstructionShape, 2, Operator, true>::ThreadblockMma;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  /// Define the epilogue
-  using Epilogue = typename cutlass::epilogue::threadblock::
-      DefaultInterleavedEpilogueTensorOp<
-          ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
-          64 / sizeof_bits<ElementC>::value, InterleavedK>::Epilogue;
-
-  /// Define the kernel-level GEMM operator.
-  using GemmKernel = kernel::EllGemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial, IsASparse>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-
-/// Partial specialization for Volta architecture
-template <
-  /// Element type for A matrix operand
-  typename ElementA,
-  /// Layout type for A matrix operand
-  typename LayoutA,
-  /// Access granularity of A matrix in units of elements
-  int kAlignmentA,
-  /// Element type for B matrix operand
-  typename ElementB,
-  /// Layout type for B matrix operand
-  typename LayoutB,
-  /// Access granularity of B matrix in units of elements
-  int kAlignmentB,
-  /// Element type for C and D matrix operands
-  typename ElementC,
-  /// Element type for internal accumulation
-  typename ElementAccumulator,
-  /// Threadblock-level tile size (concept: GemmShape)
-  typename ThreadblockShape,
-  /// Warp-level tile size (concept: GemmShape)
-  typename WarpShape,
-  /// Epilogue output operator
-  typename EpilogueOutputOp,
-  /// Threadblock-level swizzling operator
-  typename ThreadblockSwizzle,
-  /// If true, kernel is configured to support serial reduction in the epilogue
-  bool SplitKSerial,
-  /// Operation performed by GEMM
-  typename Operator,
-  /// Sparse matrix is A or not
-  bool IsASparse
->
-struct DefaultEllGemm<
-  ElementA, LayoutA, kAlignmentA,
-  ElementB, LayoutB, kAlignmentB,
-  ElementC, layout::RowMajor,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  arch::Sm70,
-  ThreadblockShape,
-  WarpShape,
-  GemmShape<8, 8, 4>,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  SplitKSerial,
-  Operator,
-  IsASparse
-> {
-
-  /// Define the threadblock-scoped matrix multiply-accumulate
-  using Mma = typename cutlass::gemm::threadblock::DefaultEllMma<
-    ElementA,
-    LayoutA,
-    kAlignmentA,
-    ElementB,
-    LayoutB,
-    kAlignmentB,
-    ElementAccumulator,
-    layout::RowMajor,
-    arch::OpClassTensorOp,
-    arch::Sm70,
-    ThreadblockShape,
-    WarpShape,
-    GemmShape<8, 8, 4>,
-    2,
-    Operator
-  >::ThreadblockMma;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  /// Define the epilogue
-  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueVoltaTensorOp<
-    ThreadblockShape,
-    typename Mma::Operator,
-    kPartitionsK,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  /// Define the kernel-level GEMM operator.
-  using GemmKernel = kernel::EllGemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial, IsASparse>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for SIMT
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// If true, kernel is configured to support serial reduction in the epilogue
-    bool SplitKSerial,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Sparse matrix is A or not
-    bool IsASparse
-  >
-struct DefaultEllGemm<
-    ElementA,
-    LayoutA,
-    kAlignmentA,
-    ElementB,
-    LayoutB,
-    kAlignmentB,
-    ElementC,
-    layout::RowMajor,
-    ElementAccumulator,
-    arch::OpClassSimt,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    GemmShape<1, 1, 1>,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    2,
-    SplitKSerial,
-    Operator,
-    IsASparse> {
-  /// Define the threadblock-scoped matrix multiply-accumulate
-  using Mma = typename cutlass::gemm::threadblock::DefaultEllMma<
-      ElementA,
-      LayoutA,
-      kAlignmentA,
-      ElementB,
-      LayoutB,
-      kAlignmentB,
-      ElementAccumulator,
-      layout::RowMajor,
-      arch::OpClassSimt,
-      arch::Sm50,
-      ThreadblockShape,
-      WarpShape,
-      GemmShape<1, 1, 1>,
-      2,
-      Operator>::ThreadblockMma;
-
-  static int const kEpilogueElementsPerAccess = EpilogueOutputOp::kCount;
-  static_assert(kEpilogueElementsPerAccess == 1, "simt epilogue must operate on scalars");
-
-  /// Define the epilogue
-  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueSimt<
-      ThreadblockShape,
-      typename Mma::Operator,
-      EpilogueOutputOp,
-      kEpilogueElementsPerAccess
-      >::Epilogue;
-
-  /// Define the kernel-level GEMM operator.
-  using GemmKernel = kernel::EllGemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial, IsASparse>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Ampere
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages
-    int Stages,
-    /// If true, kernel is configured to support serial reduction in the epilogue
-    bool SplitKSerial,
-    /// Operation performed by GEMM
-    typename Operator, 
-    /// Sparse matrix is A or not
-    bool IsASparse
-    >
-struct DefaultEllGemm<ElementA,
-                   LayoutA,
-                   kAlignmentA,
-                   ElementB,
-                   LayoutB,
-                   kAlignmentB,
-                   ElementC,
-                   layout::RowMajor,
-                   ElementAccumulator,
-                   arch::OpClassSimt,
-                   arch::Sm80,
-                   ThreadblockShape,
-                   WarpShape,
-                   GemmShape<1, 1, 1>,
-                   EpilogueOutputOp,
-                   ThreadblockSwizzle,
-                   Stages,
-                   SplitKSerial,
-                   Operator,
-                   IsASparse> {
-
-  /// Define the threadblock-scoped matrix multiply-accumulate
-  using Mma = typename cutlass::gemm::threadblock::DefaultEllMma<
-      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
-      ElementAccumulator, layout::RowMajor, arch::OpClassSimt, arch::Sm80,
-      ThreadblockShape, WarpShape, GemmShape<1, 1, 1>, Stages,
-      Operator>::ThreadblockMma;
-
-  static int const kEpilogueElementsPerAccess = EpilogueOutputOp::kCount;
-  static_assert(kEpilogueElementsPerAccess == 1, "simt epilogue must operate on scalars");
-
-  /// Define the epilogue
-  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueSimt<
-      ThreadblockShape,
-      typename Mma::Operator,
-      EpilogueOutputOp,
-      kEpilogueElementsPerAccess
-      >::Epilogue;
-
-  /// Define the kernel-level GEMM operator.
-  using GemmKernel = kernel::EllGemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial,IsASparse>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-/// Partial specialization for SIMT DP4A
-
-template <
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentB,
-    /// Layout type for C matrix operand
-    typename LayoutC,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Sparse matrix is A or not
-    bool IsASparse
-    >
-struct DefaultEllGemm<int8_t, LayoutA, kAlignmentA, int8_t, LayoutB, kAlignmentB,
-                   ElementC, LayoutC, ElementAccumulator, arch::OpClassSimt,
-                   ArchTag, ThreadblockShape, WarpShape, GemmShape<1, 1, 4>,
-                   EpilogueOutputOp, ThreadblockSwizzle, 2, SplitKSerial,
-                   Operator, IsASparse> {
-  using InstructionShape = GemmShape<1, 1, 4>;
-  using ElementA = int8_t;
-  using ElementB = int8_t;
-
-  using OperatorClass =  arch::OpClassSimt;
-  /// Define the threadblock-scoped matrix multiply-accumulate
-  using Mma = typename cutlass::gemm::threadblock::DefaultEllMma<ElementA,
-      LayoutA,
-      kAlignmentA,
-      ElementB,
-      LayoutB,
-      kAlignmentB,
-      ElementAccumulator,
-      LayoutC,
-      arch::OpClassSimt,
-      arch::Sm50,
-      ThreadblockShape,
-      WarpShape,
-      InstructionShape,
-      2,
-      Operator
-      >::ThreadblockMma;
-
-  static int const kEpilogueElementsPerAccess = EpilogueOutputOp::kCount;
-  static_assert(kEpilogueElementsPerAccess == 1, "simt epilogue must operate on scalars");
-
-  /// Define the epilogue
-  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueSimt<
-      ThreadblockShape,
-      typename Mma::Operator,
-      EpilogueOutputOp,
-      kEpilogueElementsPerAccess
-      >::Epilogue;
-
-  /// Define the kernel-level GEMM operator.
-  using GemmKernel = kernel::EllGemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial, IsASparse>;
-};
-
-#if defined(CUTLASS_ARCH_WMMA_ENABLED)
-////////////////////////////////////////////////////////////////////////////////
-/// Partial specialization for Wmma Gemm Kernel
-template <
-    ///< Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Layout type for C and D matrix operands
-    typename LayoutC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Sparse matrix is A or not
-    bool IsASparse
-    > 
-struct DefaultEllGemm<
-  ElementA, LayoutA, kAlignmentA, 
-  ElementB, LayoutB, kAlignmentB, 
-  ElementC, LayoutC, 
-  ElementAccumulator, 
-  arch::OpClassWmmaTensorOp,
-  ArchTag, 
-  ThreadblockShape, WarpShape, InstructionShape,
-  EpilogueOutputOp, 
-  ThreadblockSwizzle, 
-  Stages, 
-  SplitKSerial,
-  Operator,
-  IsASparse> {
-  /// Define the threadblock-scoped matrix multiply-accumulate
-  using Mma = typename cutlass::gemm::threadblock::DefaultEllMma<
-      ElementA, LayoutA, kAlignmentA,
-      ElementB, LayoutB, kAlignmentB,
-      ElementAccumulator, LayoutC, 
-      arch::OpClassWmmaTensorOp, 
-      ArchTag,
-      ThreadblockShape, 
-      WarpShape, 
-      InstructionShape, 
-      Stages,
-      Operator>::ThreadblockMma;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  /// Define the epilogue 
-  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWmmaTensorOp<
-      ThreadblockShape,
-      typename Mma::Operator, 
-      kPartitionsK, 
-      EpilogueOutputOp,
-      EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  /// Define the kernel-level GEMM operator.
-  using GemmKernel = kernel::EllGemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial, IsASparse>;
-};
-////////////////////////////////////////////////////////////////////////////////
-#endif //CUTLASS_ARCH_WMMA_ENABLED
-
-////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace kernel
-}  // namespace gemm
-}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm.h
deleted file mode 100755
index 4678df4af..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm.h
+++ /dev/null
@@ -1,1189 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-      Default kernel-level GEMM definitions combine threadblock-scoped matrix multiply-add with
-      the appropriate threadblock-scoped epilogue.
-  
-      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
-      accommodated by exchanging A and B operands and assuming transposed layouts. Partial
-      specializations here choose 'device::GemmTransposed' to implement this functionality.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/layout/matrix.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/wmma.h"
-
-#include "cutlass/epilogue/threadblock/epilogue.h"
-#include "cutlass/epilogue/thread/linear_combination.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/kernel/gemm.h"
-#include "cutlass/gemm/kernel/gemm_pipelined.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
-#include "cutlass/gemm/threadblock/default_mma.h"
-#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-
-#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
-#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
-#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
-#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
-
-#include "cutlass/layout/permute.h"
-
-#if defined(CUTLASS_ARCH_WMMA_ENABLED)
-#include "cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h"
-#endif //CUTLASS_ARCH_WMMA_ENABLED
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Use zfill or predicate for out-of-bound cp.async
-    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone,
-    /// Gather operand A by using an index array
-    bool GatherA = false,
-    /// Gather operand B by using an index array
-    bool GatherB = false,
-    /// Scatter result D by using an index array
-    bool ScatterD = false,
-    /// Permute result D
-    typename PermuteDLayout = layout::NoPermute,
-    /// Permute operand A
-    typename PermuteALayout = layout::NoPermute,
-    /// Permute operand B
-    typename PermuteBLayout = layout::NoPermute,
-    ///
-    typename Enable = void
->
-struct DefaultGemm;
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Hopper Architecture
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Use zfill or predicate for out-of-bound cp.async
-    SharedMemoryClearOption SharedMemoryClear,
-    /// Gather operand A by using an index array
-    bool GatherA,
-    /// Gather operand B by using an index array
-    bool GatherB,
-    /// Scatter result D by using an index array
-    bool ScatterD,
-    /// Permute result D
-    typename PermuteDLayout,
-    /// Permute operand A
-    typename PermuteALayout,
-    /// Permute operand B
-    typename PermuteBLayout
->
-struct DefaultGemm<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, ElementC,
-                   layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
-                   arch::Sm90, ThreadblockShape, WarpShape, InstructionShape,
-                   EpilogueOutputOp, ThreadblockSwizzle, Stages, SplitKSerial,
-                   Operator, SharedMemoryClear, GatherA, GatherB, ScatterD,
-                   PermuteDLayout, PermuteALayout, PermuteBLayout> {
-  /// Define the threadblock-scoped matrix multiply-accumulate
-  using Mma = typename cutlass::gemm::threadblock::DefaultMma<
-      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
-      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm90,
-      ThreadblockShape, WarpShape, InstructionShape, Stages,
-      Operator, false, SharedMemoryClear, GatherA, GatherB, 
-      PermuteALayout, PermuteBLayout>::ThreadblockMma;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  /// Define the epilogue
-  using Epilogue =
-      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
-          ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
-          EpilogueOutputOp::kCount, ScatterD, PermuteDLayout>::Epilogue;
-
-  /// Define the kernel-level GEMM operator.
-  using GemmKernel = kernel::Gemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Ada Architecture
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Use zfill or predicate for out-of-bound cp.async
-    SharedMemoryClearOption SharedMemoryClear,
-    /// Gather operand A by using an index array
-    bool GatherA,
-    /// Gather operand B by using an index array
-    bool GatherB,
-    /// Scatter result D by using an index array
-    bool ScatterD,
-    /// Permute result D
-    typename PermuteDLayout,
-    /// Permute operand A
-    typename PermuteALayout,
-    /// Permute operand B
-    typename PermuteBLayout
->
-struct DefaultGemm<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, ElementC,
-                   layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
-                   arch::Sm89, ThreadblockShape, WarpShape, InstructionShape,
-                   EpilogueOutputOp, ThreadblockSwizzle, Stages, SplitKSerial,
-                   Operator, SharedMemoryClear, GatherA, GatherB, ScatterD, 
-                   PermuteDLayout, PermuteALayout, PermuteBLayout> {
-  /// Define the threadblock-scoped matrix multiply-accumulate
-  using Mma = typename cutlass::gemm::threadblock::DefaultMma<
-      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
-      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm89,
-      ThreadblockShape, WarpShape, InstructionShape, Stages,
-      Operator, false, SharedMemoryClear, GatherA, GatherB,
-      PermuteALayout, PermuteBLayout>::ThreadblockMma;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  /// Define the epilogue
-  using Epilogue =
-      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
-          ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
-          EpilogueOutputOp::kCount, ScatterD, PermuteDLayout>::Epilogue;
-
-  /// Define the kernel-level GEMM operator.
-  using GemmKernel = kernel::Gemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Ampere Architecture
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Layout type for C and D matrix operand
-    typename LayoutC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Use zfill or predicate for out-of-bound cp.async
-    SharedMemoryClearOption SharedMemoryClear,
-    /// Gather operand A by using an index array
-    bool GatherA,
-    /// Gather operand B by using an index array
-    bool GatherB,
-    /// Scatter result D by using an index array
-    bool ScatterD,
-    /// Permute result D
-    typename PermuteDLayout,
-    /// Permute operand A
-    typename PermuteALayout,
-    /// Permute operand B
-    typename PermuteBLayout
->
-struct DefaultGemm<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, ElementC,
-                   LayoutC, ElementAccumulator, arch::OpClassTensorOp,
-                   arch::Sm80, ThreadblockShape, WarpShape, InstructionShape,
-                   EpilogueOutputOp, ThreadblockSwizzle, Stages, SplitKSerial,
-                   Operator, SharedMemoryClear, GatherA, GatherB, ScatterD,
-                   PermuteDLayout, PermuteALayout, PermuteBLayout> {
-
-  static_assert((platform::is_same<LayoutC, layout::RowMajor>::value
-             || platform::is_same<LayoutC, layout::AffineRankN<2>>::value),
-             "Epilogue in the kernel level must be row major");
-
-  /// Define the threadblock-scoped matrix multiply-accumulate
-  using Mma = typename cutlass::gemm::threadblock::DefaultMma<
-      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
-      ElementAccumulator, LayoutC, arch::OpClassTensorOp, arch::Sm80,
-      ThreadblockShape, WarpShape, InstructionShape, Stages,
-      Operator, false, SharedMemoryClear, GatherA, GatherB,
-      PermuteALayout, PermuteBLayout>::ThreadblockMma;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  /// Define the epilogue
-  using RegularEpilogue =
-      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
-          ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
-          EpilogueOutputOp::kCount, ScatterD, PermuteDLayout>::Epilogue;
-
-  using Affine2Epilogue =
-      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOpAffineRankN<
-          2, ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
-          EpilogueOutputOp::kCount>::Epilogue;
-
-  using Epilogue = typename platform::conditional<platform::is_same<LayoutC, layout::RowMajor>::value,
-                                                  RegularEpilogue,
-                                                  Affine2Epilogue>::type;
-
-  /// Define the kernel-level GEMM operator.
-  using GemmKernel = kernel::Gemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Turing Architecture
-template <
-  /// Element type for A matrix operand
-  typename ElementA,
-  /// Layout type for A matrix operand
-  typename LayoutA,
-  /// Access granularity of A matrix in units of elements
-  int kAlignmentA,
-  /// Element type for B matrix operand
-  typename ElementB,
-  /// Layout type for B matrix operand
-  typename LayoutB,
-  /// Access granularity of B matrix in units of elements
-  int kAlignmentB,
-  /// Element type for C and D matrix operands
-  typename ElementC,
-  /// Element type for internal accumulation
-  typename ElementAccumulator,
-  /// Threadblock-level tile size (concept: GemmShape)
-  typename ThreadblockShape,
-  /// Warp-level tile size (concept: GemmShape)
-  typename WarpShape,
-  /// Warp-level tile size (concept: GemmShape)
-  typename InstructionShape,
-  /// Epilogue output operator
-  typename EpilogueOutputOp,
-  /// Threadblock-level swizzling operator
-  typename ThreadblockSwizzle,
-  /// If true, kernel is configured to support serial reduction in the epilogue
-  bool SplitKSerial,
-  /// Operation performed by GEMM
-  typename Operator,
-  /// Use zfill or predicate for out-of-bound cp.async
-  SharedMemoryClearOption SharedMemoryClear,
-  /// Gather operand A by using an index array
-  bool GatherA,
-  /// Gather operand B by using an index array
-  bool GatherB,
-  /// Scatter result D by using an index array
-  bool ScatterD,
-  /// Permute result D
-  typename PermuteDLayout,
-  /// Permute operand A
-  typename PermuteALayout,
-  /// Permute operand B
-  typename PermuteBLayout
->
-struct DefaultGemm<
-  ElementA, LayoutA, kAlignmentA,
-  ElementB, LayoutB, kAlignmentB,
-  ElementC, layout::RowMajor,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  arch::Sm75,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  SplitKSerial,
-  Operator,
-  SharedMemoryClear,
-  GatherA,
-  GatherB,
-  ScatterD,
-  PermuteDLayout,
-  PermuteALayout,
-  PermuteBLayout
-> {
-
-  /// Define the threadblock-scoped matrix multiply-accumulate
-  using Mma = typename cutlass::gemm::threadblock::DefaultMma<
-    ElementA,
-    LayoutA,
-    kAlignmentA,
-    ElementB,
-    LayoutB,
-    kAlignmentB,
-    ElementAccumulator,
-    layout::RowMajor,
-    arch::OpClassTensorOp,
-    arch::Sm75,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    2,
-    Operator,
-    false,
-    SharedMemoryClear,
-    GatherA,
-    GatherB,
-    PermuteALayout,
-    PermuteBLayout
-  >::ThreadblockMma;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  /// Define the epilogue
-  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
-    ThreadblockShape,
-    typename Mma::Operator,
-    kPartitionsK,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount,
-    ScatterD,
-    PermuteDLayout
-  >::Epilogue;
-
-  /// Define the kernel-level GEMM operator.
-  using GemmKernel = kernel::Gemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Ampere Integer Matrix Multiply Interleaved layout
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Number of Interleaved k
-    int InterleavedK,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Use zfill or predicate for out-of-bound cp.async
-    SharedMemoryClearOption SharedMemoryClear>
-struct DefaultGemm<
-    ElementA, layout::ColumnMajorInterleaved<InterleavedK>, kAlignmentA,
-    ElementB, layout::RowMajorInterleaved<InterleavedK>, kAlignmentB, ElementC,
-    layout::ColumnMajorInterleaved<InterleavedK>, int32_t,
-    arch::OpClassTensorOp, arch::Sm80, ThreadblockShape, WarpShape,
-    InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages,
-    SplitKSerial, Operator, SharedMemoryClear, false, false, false> {
-
-  using LayoutA = layout::ColumnMajorInterleaved<InterleavedK>;
-  using LayoutB = layout::RowMajorInterleaved<InterleavedK>;
-  using LayoutC = layout::ColumnMajorInterleaved<InterleavedK>;
-
-  using ElementAccumulator = int32_t;
-
-  /// Define the threadblock-scoped matrix multiply-accumulate
-  using Mma = typename cutlass::gemm::threadblock::DefaultMma<
-      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
-      ElementAccumulator, LayoutC, arch::OpClassTensorOp, arch::Sm80,
-      ThreadblockShape, WarpShape, InstructionShape, Stages, Operator,
-      true, SharedMemoryClear>::ThreadblockMma;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  /// Define the epilogue
-  using Epilogue = typename cutlass::epilogue::threadblock::
-      DefaultInterleavedEpilogueTensorOp<
-          ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
-          64 / sizeof_bits<ElementC>::value, InterleavedK>::Epilogue;
-
-  /// Define the kernel-level GEMM operator.
-  using GemmKernel = kernel::Gemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Turing Integer Matrix Multiply Interleaved layout
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of Interleaved k
-    int InterleavedK,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Use zfill or predicate for out-of-bound cp.async
-    SharedMemoryClearOption SharedMemoryClear>
-struct DefaultGemm<ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
-                   kAlignmentA, ElementB,
-                   layout::RowMajorInterleaved<InterleavedK>, kAlignmentB,
-                   ElementC, layout::ColumnMajorInterleaved<InterleavedK>,
-                   int32_t, arch::OpClassTensorOp, arch::Sm75, ThreadblockShape,
-                   WarpShape, InstructionShape, EpilogueOutputOp,
-                   ThreadblockSwizzle, 2, SplitKSerial, Operator, SharedMemoryClear,
-                   false, false, false> {
-
-  using LayoutA = layout::ColumnMajorInterleaved<InterleavedK>;
-  using LayoutB = layout::RowMajorInterleaved<InterleavedK>;
-  using LayoutC = layout::ColumnMajorInterleaved<InterleavedK>;
-
-  using ElementAccumulator = int32_t;
-
-  /// Define the threadblock-scoped matrix multiply-accumulate
-  using Mma = typename cutlass::gemm::threadblock::DefaultMma<
-      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, ElementAccumulator, LayoutC,
-      arch::OpClassTensorOp, arch::Sm75, ThreadblockShape, WarpShape,
-      InstructionShape, 2, Operator, true>::ThreadblockMma;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  /// Define the epilogue
-  using Epilogue = typename cutlass::epilogue::threadblock::
-      DefaultInterleavedEpilogueTensorOp<
-          ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
-          64 / sizeof_bits<ElementC>::value, InterleavedK>::Epilogue;
-
-  /// Define the kernel-level GEMM operator.
-  using GemmKernel = kernel::Gemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Volta architecture
-template <
-  /// Element type for A matrix operand
-  typename ElementA,
-  /// Layout type for A matrix operand
-  typename LayoutA,
-  /// Access granularity of A matrix in units of elements
-  int kAlignmentA,
-  /// Element type for B matrix operand
-  typename ElementB,
-  /// Layout type for B matrix operand
-  typename LayoutB,
-  /// Access granularity of B matrix in units of elements
-  int kAlignmentB,
-  /// Element type for C and D matrix operands
-  typename ElementC,
-  /// Element type for internal accumulation
-  typename ElementAccumulator,
-  /// Threadblock-level tile size (concept: GemmShape)
-  typename ThreadblockShape,
-  /// Warp-level tile size (concept: GemmShape)
-  typename WarpShape,
-  /// Epilogue output operator
-  typename EpilogueOutputOp,
-  /// Threadblock-level swizzling operator
-  typename ThreadblockSwizzle,
-  /// If true, kernel is configured to support serial reduction in the epilogue
-  bool SplitKSerial,
-  /// Operation performed by GEMM
-  typename Operator,
-  /// Use zfill or predicate for out-of-bound cp.async
-  SharedMemoryClearOption SharedMemoryClear,
-  /// Gather operand A by using an index array
-  bool GatherA,
-  /// Gather operand B by using an index array
-  bool GatherB,
-  /// Scatter result D by using an index array
-  bool ScatterD,
-  /// Permute result D
-  typename PermuteDLayout,
-  /// Permute operand A
-  typename PermuteALayout,
-  /// Permute operand B
-  typename PermuteBLayout
->
-struct DefaultGemm<
-  ElementA, LayoutA, kAlignmentA,
-  ElementB, LayoutB, kAlignmentB,
-  ElementC, layout::RowMajor,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  arch::Sm70,
-  ThreadblockShape,
-  WarpShape,
-  GemmShape<8, 8, 4>,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  2,
-  SplitKSerial,
-  Operator,
-  SharedMemoryClear,
-  GatherA,
-  GatherB,
-  ScatterD,
-  PermuteDLayout,
-  PermuteALayout,
-  PermuteBLayout
-> {
-
-  /// Define the threadblock-scoped matrix multiply-accumulate
-  using Mma = typename cutlass::gemm::threadblock::DefaultMma<
-    ElementA,
-    LayoutA,
-    kAlignmentA,
-    ElementB,
-    LayoutB,
-    kAlignmentB,
-    ElementAccumulator,
-    layout::RowMajor,
-    arch::OpClassTensorOp,
-    arch::Sm70,
-    ThreadblockShape,
-    WarpShape,
-    GemmShape<8, 8, 4>,
-    2,
-    Operator,
-    false,
-    SharedMemoryClear,
-    GatherA,
-    GatherB,
-    PermuteALayout,
-    PermuteBLayout
-  >::ThreadblockMma;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  /// Define the epilogue
-  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueVoltaTensorOp<
-    ThreadblockShape,
-    typename Mma::Operator,
-    kPartitionsK,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount,
-    ScatterD,
-    PermuteDLayout
-  >::Epilogue;
-
-  /// Define the kernel-level GEMM operator.
-  using GemmKernel = kernel::Gemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for SIMT
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Layout type for C and D matrix operand
-    typename LayoutC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// If true, kernel is configured to support serial reduction in the epilogue
-    bool SplitKSerial,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Use zfill or predicate for out-of-bound cp.async
-    SharedMemoryClearOption SharedMemoryClear,
-    /// Gather operand A by using an index array
-    bool GatherA,
-    /// Gather operand B by using an index array
-    bool GatherB,
-    /// Scatter result D by using an index array
-    bool ScatterD,
-    /// Permute result D
-    typename PermuteDLayout,
-    /// Permute operand A
-    typename PermuteALayout,
-    /// Permute operand B
-    typename PermuteBLayout
-  >
-struct DefaultGemm<
-    ElementA,
-    LayoutA,
-    kAlignmentA,
-    ElementB,
-    LayoutB,
-    kAlignmentB,
-    ElementC,
-    LayoutC,
-    ElementAccumulator,
-    arch::OpClassSimt,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    GemmShape<1, 1, 1>,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    2,
-    SplitKSerial,
-    Operator,
-    SharedMemoryClear,
-    GatherA,
-    GatherB,
-    ScatterD,
-    PermuteDLayout,
-    PermuteALayout,
-    PermuteBLayout,
-    typename platform::enable_if< ! platform::is_same<ArchTag, arch::Sm80>::value >::type > {
-
-  static_assert((platform::is_same<LayoutC, layout::RowMajor>::value
-             || platform::is_same<LayoutC, layout::AffineRankN<2>>::value),
-             "Epilogue in the kernel level must be row major");
-
-  /// Define the threadblock-scoped matrix multiply-accumulate
-  using Mma = typename cutlass::gemm::threadblock::DefaultMma<
-      ElementA,
-      LayoutA,
-      kAlignmentA,
-      ElementB,
-      LayoutB,
-      kAlignmentB,
-      ElementAccumulator,
-      LayoutC,
-      arch::OpClassSimt,
-      arch::Sm50,
-      ThreadblockShape,
-      WarpShape,
-      GemmShape<1, 1, 1>,
-      2,
-      Operator,
-      false,
-      SharedMemoryClear,
-      GatherA,
-      GatherB,
-      PermuteALayout,
-      PermuteBLayout>::ThreadblockMma;
-
-  static int const kEpilogueElementsPerAccess = EpilogueOutputOp::kCount;
-  static_assert(kEpilogueElementsPerAccess == 1, "simt epilogue must operate on scalars");
-
-  /// Define the epilogue
-  using RegularEpilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueSimt<
-      ThreadblockShape,
-      typename Mma::Operator,
-      EpilogueOutputOp,
-      kEpilogueElementsPerAccess,
-      ScatterD,
-      PermuteDLayout
-      >::Epilogue;
-
-  using Affine2Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueSimtAffineRankN<
-      2,
-      ThreadblockShape,
-      typename Mma::Operator,
-      EpilogueOutputOp,
-      kEpilogueElementsPerAccess
-      >::Epilogue;
-
-  using Epilogue = typename platform::conditional<platform::is_same<LayoutC, layout::RowMajor>::value,
-                                                  RegularEpilogue,
-                                                  Affine2Epilogue>::type;
-
-  /// Define the kernel-level GEMM operator.
-  using GemmKernel = kernel::Gemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Ampere
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Layout type for C and D matrix operand
-    typename LayoutC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages
-    int Stages,
-    /// If true, kernel is configured to support serial reduction in the epilogue
-    bool SplitKSerial,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Use zfill or predicate for out-of-bound cp.async
-    SharedMemoryClearOption SharedMemoryClear,
-    /// Gather operand A by using an index array
-    bool GatherA,
-    /// Gather operand B by using an index array
-    bool GatherB,
-    /// Scatter result D by using an index array
-    bool ScatterD,
-    /// Permute result D
-    typename PermuteDLayout,
-    /// Permute operand A
-    typename PermuteALayout,
-    /// Permute operand B
-    typename PermuteBLayout
->
-struct DefaultGemm<ElementA,
-                   LayoutA,
-                   kAlignmentA,
-                   ElementB,
-                   LayoutB,
-                   kAlignmentB,
-                   ElementC,
-                   LayoutC,
-                   ElementAccumulator,
-                   arch::OpClassSimt,
-                   arch::Sm80,
-                   ThreadblockShape,
-                   WarpShape,
-                   GemmShape<1, 1, 1>,
-                   EpilogueOutputOp,
-                   ThreadblockSwizzle,
-                   Stages,
-                   SplitKSerial,
-                   Operator,
-                   SharedMemoryClear,
-                   GatherA,
-                   GatherB,
-                   ScatterD,
-                   PermuteDLayout,
-                   PermuteALayout,
-                   PermuteBLayout> {
-
-  static_assert((platform::is_same<LayoutC, layout::RowMajor>::value
-             || platform::is_same<LayoutC, layout::AffineRankN<2>>::value),
-             "Epilogue in the kernel level must be row major");
-
-  /// Define the threadblock-scoped matrix multiply-accumulate
-  using Mma = typename cutlass::gemm::threadblock::DefaultMma<
-      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
-      ElementAccumulator, LayoutC, arch::OpClassSimt, arch::Sm80,
-      ThreadblockShape, WarpShape, GemmShape<1, 1, 1>, Stages,
-      Operator, false, SharedMemoryClear, GatherA, GatherB,
-      PermuteALayout, PermuteBLayout>::ThreadblockMma;
-
-  static int const kEpilogueElementsPerAccess = EpilogueOutputOp::kCount;
-  static_assert(kEpilogueElementsPerAccess == 1, "simt epilogue must operate on scalars");
-
-  /// Define the epilogue
-  using RegularEpilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueSimt<
-      ThreadblockShape,
-      typename Mma::Operator,
-      EpilogueOutputOp,
-      kEpilogueElementsPerAccess,
-      ScatterD,
-      PermuteDLayout
-      >::Epilogue;
-
-  using Affine2Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueSimtAffineRankN<
-      2,
-      ThreadblockShape,
-      typename Mma::Operator,
-      EpilogueOutputOp,
-      kEpilogueElementsPerAccess
-      >::Epilogue;
-
-  using Epilogue = typename platform::conditional<platform::is_same<LayoutC, layout::RowMajor>::value,
-                                                  RegularEpilogue,
-                                                  Affine2Epilogue>::type;
-
-  /// Define the kernel-level GEMM operator.
-  using GemmKernel = kernel::Gemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial>; 
-};
-
-////////////////////////////////////////////////////////////////////////////////
-/// Partial specialization for SIMT DP4A
-
-template <
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentB,
-    /// Layout type for C matrix operand
-    typename LayoutC,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Use zfill or predicate for out-of-bound cp.async
-    SharedMemoryClearOption SharedMemoryClear
->
-struct DefaultGemm<int8_t, LayoutA, kAlignmentA, int8_t, LayoutB, kAlignmentB,
-                   ElementC, LayoutC, ElementAccumulator, arch::OpClassSimt,
-                   ArchTag, ThreadblockShape, WarpShape, GemmShape<1, 1, 4>,
-                   EpilogueOutputOp, ThreadblockSwizzle, 2, SplitKSerial,
-                   Operator, SharedMemoryClear, false, false, false,
-                   layout::NoPermute, layout::NoPermute> {
-  using InstructionShape = GemmShape<1, 1, 4>;
-  using ElementA = int8_t;
-  using ElementB = int8_t;
-
-  using OperatorClass =  arch::OpClassSimt;
-  /// Define the threadblock-scoped matrix multiply-accumulate
-  using Mma = typename cutlass::gemm::threadblock::DefaultMma<
-      ElementA,
-      LayoutA,
-      kAlignmentA,
-      ElementB,
-      LayoutB,
-      kAlignmentB,
-      ElementAccumulator,
-      LayoutC,
-      arch::OpClassSimt,
-      arch::Sm50,
-      ThreadblockShape,
-      WarpShape,
-      InstructionShape,
-      2,
-      Operator
-      >::ThreadblockMma;
-
-  static int const kEpilogueElementsPerAccess = EpilogueOutputOp::kCount;
-  static_assert(kEpilogueElementsPerAccess == 1, "simt epilogue must operate on scalars");
-
-  /// Define the epilogue
-  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueSimt<
-      ThreadblockShape,
-      typename Mma::Operator,
-      EpilogueOutputOp,
-      kEpilogueElementsPerAccess
-      >::Epilogue;
-
-  /// Define the kernel-level GEMM operator.
-  using GemmKernel = kernel::Gemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
-};
-
-#if defined(CUTLASS_ARCH_WMMA_ENABLED)
-////////////////////////////////////////////////////////////////////////////////
-/// Partial specialization for Wmma Gemm Kernel
-template <
-    ///< Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Layout type for C and D matrix operands
-    typename LayoutC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Use zfill or predicate for out-of-bound cp.async
-    SharedMemoryClearOption SharedMemoryClear
-> 
-struct DefaultGemm<
-  ElementA, LayoutA, kAlignmentA, 
-  ElementB, LayoutB, kAlignmentB, 
-  ElementC, LayoutC, 
-  ElementAccumulator, 
-  arch::OpClassWmmaTensorOp,
-  ArchTag, 
-  ThreadblockShape, WarpShape, InstructionShape,
-  EpilogueOutputOp, 
-  ThreadblockSwizzle, 
-  Stages, 
-  SplitKSerial,
-  Operator,
-  SharedMemoryClear,
-  false,
-  false,
-  false,
-  layout::NoPermute,
-  layout::NoPermute
-> {
-  /// Define the threadblock-scoped matrix multiply-accumulate
-  using Mma = typename cutlass::gemm::threadblock::DefaultMma<
-      ElementA, LayoutA, kAlignmentA,
-      ElementB, LayoutB, kAlignmentB,
-      ElementAccumulator, LayoutC, 
-      arch::OpClassWmmaTensorOp, 
-      ArchTag,
-      ThreadblockShape, 
-      WarpShape, 
-      InstructionShape, 
-      Stages,
-      Operator>::ThreadblockMma;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  /// Define the epilogue 
-  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWmmaTensorOp<
-      ThreadblockShape,
-      typename Mma::Operator, 
-      kPartitionsK, 
-      EpilogueOutputOp,
-      EpilogueOutputOp::kCount
-  >::Epilogue;
-
-  /// Define the kernel-level GEMM operator.
-  using GemmKernel = kernel::Gemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
-};
-////////////////////////////////////////////////////////////////////////////////
-
-#endif //CUTLASS_ARCH_WMMA_ENABLED
-
-////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace kernel
-}  // namespace gemm
-}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_complex.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_complex.h
deleted file mode 100755
index 7ef46c6cf..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_complex.h
+++ /dev/null
@@ -1,404 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-      Default kernel-level GEMM definitions combine threadblock-scoped matrix multiply-add with
-      the appropriate threadblock-scoped epilogue.
-  
-      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
-      accommodated by exchanging A and B operands and assuming transposed layouts. Partial
-      specializations here choose 'device::GemmTransposed' to implement this functionality.
-
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/layout/matrix.h"
-#include "cutlass/numeric_types.h"
-
-#include "cutlass/epilogue/threadblock/epilogue.h"
-#include "cutlass/epilogue/thread/linear_combination.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/kernel/gemm.h"
-#include "cutlass/gemm/kernel/gemm_pipelined.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
-#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
-#include "cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h"
-#include "cutlass/gemm/threadblock/default_mma.h"
-#include "cutlass/gemm/threadblock/default_multistage_mma_complex.h"
-#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-#include "cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h"
-#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
-
-#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-  /// Element type for A matrix operand
-  typename ElementA_,
-  /// Layout type for A matrix operand
-  typename LayoutA_,
-  /// Element type for B matrix operand
-  typename ElementB_,
-  /// Layout type for B matrix operand
-  typename LayoutB_,
-  /// Element type for C and D matrix operands
-  typename ElementC_,
-  /// Layout type for C and D matrix operands
-  typename LayoutC_,
-  /// Element type for internal accumulation
-  typename ElementAccumulator,
-  /// Operator class tag
-  typename OperatorClass,
-  /// Tag indicating architecture to tune for
-  typename ArchTag,
-  /// Threadblock-level tile size (concept: GemmShape)
-  typename ThreadblockShape,
-  /// Warp-level tile size (concept: GemmShape)
-  typename WarpShape,
-  /// Warp-level tile size (concept: GemmShape)
-  typename InstructionShape,
-  /// Epilogue output operator
-  typename EpilogueOutputOp,
-  /// Threadblock-level swizzling operator
-  typename ThreadblockSwizzle,
-  /// Number of stages used in the pipelined mainloop
-  int Stages,
-  /// Complex elementwise transformation on A operand
-  ComplexTransform TransformA,
-  /// Complex elementwise transformation on B operand
-  ComplexTransform TransformB,
-  /// Multiply-add operator 
-  // (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
-  typename Operator,
-  /// If true, kernel is configured to support serial reduction in the epilogue
-  bool SplitKSerial
->
-struct DefaultGemmComplex;
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Hopper Architecture
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA,
-    /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB,
-    /// Multiply-add operator 
-    // (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
-    typename Operator,
-    /// If true, kernel is configured to support serial reduction in the epilogue
-    bool SplitKSerial
-  >
-struct DefaultGemmComplex<
-  ElementA, LayoutA, ElementB, LayoutB, ElementC,
-  layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
-  arch::Sm90, ThreadblockShape, WarpShape, InstructionShape,
-  EpilogueOutputOp, ThreadblockSwizzle, Stages, TransformA, TransformB, Operator, SplitKSerial> {
-
-  /// Define the threadblock-scoped matrix multiply-accumulate
-  using Mma = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex<
-      ElementA, LayoutA, ElementB, LayoutB, ElementAccumulator,
-      layout::RowMajor, arch::OpClassTensorOp, arch::Sm90, ThreadblockShape,
-      WarpShape, InstructionShape, Stages, TransformA, TransformB, Operator>::ThreadblockMma;
-
-  /// Define the epilogue
-  using Epilogue =
-      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOp<
-          ThreadblockShape, typename Mma::Operator, 1, EpilogueOutputOp,
-          EpilogueOutputOp::kCount, Operator>::Epilogue;
-
-  /// Define the kernel-level GEMM operator.
-  using GemmKernel = kernel::Gemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Ampere Architecture
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA,
-    /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB,
-    /// Multiply-add operator 
-    // (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
-    typename Operator,
-    /// If true, kernel is configured to support serial reduction in the epilogue
-    bool SplitKSerial
-  >
-struct DefaultGemmComplex<
-  ElementA, LayoutA, ElementB, LayoutB, ElementC,
-  layout::RowMajor, ElementAccumulator, arch::OpClassSimt,
-  arch::Sm50, ThreadblockShape, WarpShape, InstructionShape,
-  EpilogueOutputOp, ThreadblockSwizzle, Stages, TransformA, TransformB, Operator, SplitKSerial> {
-
-  /// Define the threadblock-scoped matrix multiply-accumulate
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-    ThreadblockShape,
-    WarpShape, 
-    InstructionShape, 
-    ElementA, LayoutA, 
-    ElementB, LayoutB, 
-    ElementAccumulator, layout::RowMajor, 
-    arch::OpClassSimt,
-    Stages,
-    Operator,
-    false,
-    cutlass::arch::CacheOperation::Global,
-    cutlass::arch::CacheOperation::Global,
-    TransformA, 
-    TransformB
-  >;
-
-  // Define iterators over tiles from the A operand
-  using IteratorA =
-      cutlass::transform::threadblock::PredicatedTileIterator<
-          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-          ElementA, LayoutA, 1, 
-          typename MmaCore::IteratorThreadMapA>;
-
-  // Define iterators over tiles from the B operand
-  using IteratorB =
-      cutlass::transform::threadblock::PredicatedTileIterator<
-          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-          ElementB, LayoutB, 0, 
-          typename MmaCore::IteratorThreadMapB>;
-
-  // Define the threadblock-scoped pipelined matrix multiply
-  using Mma = cutlass::gemm::threadblock::MmaPipelined<
-      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
-      IteratorB, typename MmaCore::SmemIteratorB, ElementAccumulator,
-      layout::RowMajor, typename MmaCore::MmaPolicy>;
-
-  /// Define the epilogue
-  using Epilogue =
-    typename cutlass::epilogue::threadblock::DefaultEpilogueSimt<
-        ThreadblockShape, 
-        typename Mma::Operator, 
-        EpilogueOutputOp,
-        EpilogueOutputOp::kCount
-      >::Epilogue;
-
-  /// Define the kernel-level GEMM operator.
-  using GemmKernel = kernel::Gemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Ampere Architecture
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA,
-    /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB,
-    /// Multiply-add operator 
-    // (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
-    typename Operator,
-    /// If true, kernel is configured to support serial reduction in the epilogue
-    bool SplitKSerial
-  >
-struct DefaultGemmComplex<
-  ElementA, LayoutA, ElementB, LayoutB, ElementC,
-  layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
-  arch::Sm80, ThreadblockShape, WarpShape, InstructionShape,
-  EpilogueOutputOp, ThreadblockSwizzle, Stages, TransformA, TransformB, Operator, SplitKSerial> {
-
-  /// Define the threadblock-scoped matrix multiply-accumulate
-  using Mma = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex<
-      ElementA, LayoutA, ElementB, LayoutB, ElementAccumulator,
-      layout::RowMajor, arch::OpClassTensorOp, arch::Sm80, ThreadblockShape,
-      WarpShape, InstructionShape, Stages, TransformA, TransformB, Operator>::ThreadblockMma;
-
-  /// Define the epilogue
-  using Epilogue =
-      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOp<
-          ThreadblockShape, typename Mma::Operator, 1, EpilogueOutputOp,
-          EpilogueOutputOp::kCount, Operator>::Epilogue;
-
-  /// Define the kernel-level GEMM operator.
-  using GemmKernel = kernel::Gemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Ampere Architecture
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA,
-    /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB,
-    /// Multiply-add operator 
-    // (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
-    typename Operator,
-    /// If true, kernel is configured to support serial reduction in the epilogue
-    bool SplitKSerial
-  >
-struct DefaultGemmComplex<
-  ElementA, LayoutA, ElementB, LayoutB, ElementC,
-  layout::RowMajor, ElementAccumulator, arch::OpClassSimt,
-  arch::Sm80, ThreadblockShape, WarpShape, InstructionShape,
-  EpilogueOutputOp, ThreadblockSwizzle, Stages, TransformA, TransformB, Operator, SplitKSerial> {
-
-  /// Define the threadblock-scoped matrix multiply-accumulate
-  using Mma = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex<
-      ElementA, LayoutA, ElementB, LayoutB, ElementAccumulator,
-      layout::RowMajor, arch::OpClassSimt, arch::Sm80, ThreadblockShape,
-      WarpShape, InstructionShape, Stages, TransformA, TransformB, Operator>::ThreadblockMma;
-
-  /// Define the epilogue
-  using Epilogue =
-    typename cutlass::epilogue::threadblock::DefaultEpilogueSimt<
-        ThreadblockShape, 
-        typename Mma::Operator, 
-        EpilogueOutputOp,
-        EpilogueOutputOp::kCount
-      >::Epilogue;
-
-  /// Define the kernel-level GEMM operator.
-  using GemmKernel = kernel::Gemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace kernel
-}  // namespace gemm
-}  // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_grouped.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_grouped.h
deleted file mode 100755
index f9163874c..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_grouped.h
+++ /dev/null
@@ -1,384 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-      Default kernel-level GEMM definitions combine threadblock-scoped matrix multiply-add with
-      the appropriate threadblock-scoped epilogue.
-  
-      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
-      accommodated by exchanging A and B operands and assuming transposed layouts. Partial
-      specializations here choose 'device::GemmTransposed' to implement this functionality.
-
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/complex.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/numeric_types.h"
-
-#include "cutlass/gemm/kernel/gemm_grouped.h"
-#include "cutlass/gemm/kernel/gemm_transpose_operands.h"
-#include "cutlass/gemm/kernel/default_gemm.h"
-#include "cutlass/gemm/kernel/default_gemm_complex.h"
-#include "cutlass/gemm/device/default_gemm_configuration.h"
-
-#include "cutlass/layout/permute.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Whether the schedule of problems to visit has been precomputed
-    GroupScheduleMode GroupScheduleMode_ = GroupScheduleMode::kDeviceOnly,
-    /// Operation performed by GEMM
-    typename Operator = typename device::DefaultGemmConfiguration<
-        OperatorClass, ArchTag, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator>::Operator,
-    /// Use zfill or predicate for out-of-bound cp.async
-    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone,
-    /// Permute result D
-    typename PermuteDLayout = layout::NoPermute,
-    ///
-    typename Enable = void
-    >
-struct DefaultGemmGrouped;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Real-valued GEMM kernels
-//
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Layout type for C and D matrix operands
-    typename LayoutC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Whether the schedule of problems to visit has been precomputed
-    GroupScheduleMode GroupScheduleMode_,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Use zfill or predicate for out-of-bound cp.async
-    SharedMemoryClearOption SharedMemoryClear,
-    /// Permute result D
-    typename PermuteDLayout
->
-struct DefaultGemmGrouped<
-  ElementA,
-  LayoutA,
-  ComplexTransform::kNone,   // transform A
-  kAlignmentA,
-  ElementB,
-  LayoutB,
-  ComplexTransform::kNone,   // transform B
-  kAlignmentB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  OperatorClass,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  GroupScheduleMode_,
-  Operator,
-  SharedMemoryClear,
-  PermuteDLayout,
-  typename platform::enable_if< ! cutlass::is_complex<ElementAccumulator>::value>::type
-> {
-
-  // If true, we must construct a 'transposed-and-exchanged' Mma operator.
-  static bool const kInternalTranspose = platform::is_same<LayoutC, layout::ColumnMajor>::value;
-
-  using MapArguments = kernel::detail::MapArguments<
-    ElementA,
-    LayoutA,
-    ComplexTransform::kNone,
-    kAlignmentA,
-    ElementB,
-    LayoutB,
-    ComplexTransform::kNone,
-    kAlignmentB,
-    LayoutC,
-    kInternalTranspose
-  >;
-
-  // Define the default GEMM kernel
-  using DefaultGemmKernel = typename kernel::DefaultGemm<
-    typename MapArguments::ElementA,
-    typename MapArguments::LayoutA,
-    MapArguments::kAlignmentA,
-    typename MapArguments::ElementB,
-    typename MapArguments::LayoutB,
-    MapArguments::kAlignmentB,
-    ElementC,
-    typename MapArguments::LayoutC,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    true,
-    Operator,
-    SharedMemoryClear,
-    false, /*GatherA*/
-    false, /*GatherB*/
-    false, /*ScatterD*/
-    PermuteDLayout
-  >::GemmKernel;
-
-    /// Define the kernel in terms of the default kernel
-  using GemmKernel = kernel::GemmGrouped<
-    typename DefaultGemmKernel::Mma,
-    typename DefaultGemmKernel::Epilogue,
-    ThreadblockSwizzle,
-    GroupScheduleMode_,
-    kInternalTranspose
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-//
-// Complex-valued GEMM kernels
-//
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Layout type for C and D matrix operands
-    typename LayoutC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Whether the schedule of problems to visit has been precomputed
-    GroupScheduleMode GroupScheduleMode_,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Use zfill or predicate for out-of-bound cp.async
-    SharedMemoryClearOption SharedMemoryClear
-  >
-struct DefaultGemmGrouped<
-  ElementA,
-  LayoutA,
-  TransformA,
-  kAlignmentA,
-  ElementB,
-  LayoutB,
-  TransformB,
-  kAlignmentB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  OperatorClass,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  GroupScheduleMode_,
-  Operator,
-  SharedMemoryClear,
-  layout::NoPermute, /*PermuteDLayout*/
-  typename platform::enable_if<cutlass::is_complex<ElementAccumulator>::value>::type
-> {
-
-  // If true, we must construct a 'transposed-and-exchanged' Mma operator.
-  static bool const kInternalTranspose = platform::is_same<LayoutC, layout::ColumnMajor>::value;
-
-  using MapArguments = kernel::detail::MapArguments<
-    ElementA,
-    LayoutA,
-    TransformA,
-    kAlignmentA,
-    ElementB,
-    LayoutB,
-    TransformB,
-    kAlignmentB,
-    LayoutC,
-    kInternalTranspose
-  >;
-
-  using DefaultGemmKernel = typename kernel::DefaultGemmComplex<
-    typename MapArguments::ElementA,
-    typename MapArguments::LayoutA,
-    typename MapArguments::ElementB,
-    typename MapArguments::LayoutB,
-    ElementC,
-    typename MapArguments::LayoutC,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    MapArguments::kTransformA,
-    MapArguments::kTransformB,
-    Operator,
-    false
-  >::GemmKernel;
-
-  /// Define the kernel in terms of the default kernel
-  using GemmKernel = kernel::GemmGrouped<
-    typename DefaultGemmKernel::Mma,
-    typename DefaultGemmKernel::Epilogue, 
-    ThreadblockSwizzle,
-    GroupScheduleMode_,
-    kInternalTranspose
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace kernel
-}  // namespace gemm
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_grouped_softmax_mainloop_fusion.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_grouped_softmax_mainloop_fusion.h
deleted file mode 100755
index a031c1a95..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_grouped_softmax_mainloop_fusion.h
+++ /dev/null
@@ -1,164 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-      Default kernel-level softmax-grouped-GEMM
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/complex.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/numeric_types.h"
-
-#include "cutlass/gemm/kernel/gemm_grouped_softmax_mainloop_fusion.h"
-#include "cutlass/gemm/kernel/gemm_transpose_operands.h"
-#include "cutlass/gemm/kernel/default_gemm.h"
-#include "cutlass/gemm/kernel/default_gemm_complex.h"
-#include "cutlass/gemm/device/default_gemm_configuration.h"
-#include "cutlass/gemm/threadblock/default_mma_softmax_mainloop_fusion.h"
-
-#include "cutlass/layout/permute.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for Scale/Bias vectors
-    typename ElementScaleBias_,
-    /// Layout type for Scale/Bias vectors
-    typename LayoutScaleBias_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Whether the schedule of problems to visit has been precomputed
-    GroupScheduleMode GroupScheduleMode_ = GroupScheduleMode::kDeviceOnly,
-    /// Operation performed by GEMM
-    typename Operator = typename device::DefaultGemmConfiguration<
-        OperatorClass, ArchTag, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator>::Operator,
-    /// Use zfill or predicate for out-of-bound cp.async
-    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone
-    >
-struct DefaultGemmGroupedSoftmaxMainloopFusion {
-  // If true, we must construct a 'transposed-and-exchanged' Mma operator.
-  static bool const kInternalTranspose = platform::is_same<LayoutC_, layout::ColumnMajor>::value;
-
-  using MapArguments = kernel::detail::MapArguments<
-    ElementA_,
-    LayoutA_,
-    ComplexTransform::kNone,
-    kAlignmentA,
-    ElementB_,
-    LayoutB_,
-    ComplexTransform::kNone,
-    kAlignmentB,
-    LayoutC_,
-    kInternalTranspose
-  >;
-
-private:
-  /// Define the threadblock-scoped matrix multiply-accumulate
-  using Mma = typename cutlass::gemm::threadblock::DefaultMmaSoftmaxMainloopFusion<
-      typename MapArguments::ElementA, typename MapArguments::LayoutA, MapArguments::kAlignmentA,
-      typename MapArguments::ElementB, typename MapArguments::LayoutB, MapArguments::kAlignmentB,
-      ElementScaleBias_, LayoutScaleBias_, ElementAccumulator, layout::RowMajor, OperatorClass, ArchTag,
-      ThreadblockShape, WarpShape, InstructionShape, Stages, kInternalTranspose,
-      Operator, false, SharedMemoryClear>::ThreadblockMma;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  /// Define the epilogue
-  using Epilogue =
-      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
-          ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
-          EpilogueOutputOp::kCount>::Epilogue;
-
-public:
-  using GemmKernel = kernel::GemmGroupedSoftmaxMainloopFusion<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle,
-    GroupScheduleMode_,
-    kInternalTranspose
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace kernel
-}  // namespace gemm
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_layernorm_mainloop_fusion.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_layernorm_mainloop_fusion.h
deleted file mode 100755
index 68d739e30..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_layernorm_mainloop_fusion.h
+++ /dev/null
@@ -1,137 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-      Default kernel-level GEMM definitions combine threadblock-scoped matrix multiply-add with
-      the appropriate threadblock-scoped epilogue.
-  
-      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
-      accommodated by exchanging A and B operands and assuming transposed layouts. Partial
-      specializations here choose 'device::GemmTransposed' to implement this functionality.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/layout/matrix.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/wmma.h"
-
-#include "cutlass/epilogue/threadblock/epilogue.h"
-#include "cutlass/epilogue/thread/linear_combination.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/kernel/gemm_layernorm_mainloop_fusion.h"
-#include "cutlass/gemm/threadblock/default_mma_layernorm_mainloop_fusion.h"
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-
-#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
-#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for Scale/Bias vectors
-    typename ElementScaleBias,
-    /// Layout type for Scale/Bias vectors
-    typename LayoutScaleBias,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Layout type for C and D matrix operands
-    typename LayoutC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Use zfill or predicate for out-of-bound cp.async
-    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone>
-struct DefaultGemmLayernormMainloopFusion {
-
-  /// Define the threadblock-scoped matrix multiply-accumulate
-  using Mma = typename cutlass::gemm::threadblock::DefaultMmaLayernormMainloopFusion<
-      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
-      ElementScaleBias, LayoutScaleBias, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80,
-      ThreadblockShape, WarpShape, InstructionShape, Stages,
-      Operator, false, SharedMemoryClear>::ThreadblockMma;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  /// Define the epilogue
-  using Epilogue =
-      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
-          ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
-          EpilogueOutputOp::kCount>::Epilogue;
-
-  /// Define the kernel-level GEMM operator.
-  using GemmKernel = kernel::GemmLayernormMainloopFusion<Mma, Epilogue, ThreadblockSwizzle>;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace kernel
-}  // namespace gemm
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_planar_complex_universal.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_planar_complex_universal.h
deleted file mode 100755
index df74a0749..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_planar_complex_universal.h
+++ /dev/null
@@ -1,352 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-      Default kernel-level GEMM definitions combine threadblock-scoped matrix multiply-add with
-      the appropriate threadblock-scoped epilogue.
-  
-      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
-      accommodated by exchanging A and B operands and assuming transposed layouts. Partial
-      specializations here choose 'device::GemmTransposed' to implement this functionality.
-
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/complex.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/numeric_types.h"
-
-#include "cutlass/gemm/kernel/gemm_planar_complex.h"
-#include "cutlass/gemm/kernel/gemm_planar_complex_array.h"
-#include "cutlass/gemm/kernel/default_gemm.h"
-#include "cutlass/gemm/kernel/default_gemm_complex.h"
-
-#include "cutlass/epilogue/threadblock/default_epilogue_planar_complex.h"
-#include "cutlass/gemm/threadblock/default_mma_planar_complex_pipelined.h"
-#include "cutlass/gemm/threadblock/default_mma_planar_complex_multistage.h" 
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Layout type for C and D matrix operands
-    typename LayoutC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Math operation performed by GEMM (e.g. arch::OpMultiplyAdd)
-    typename Operator,
-    /// Conditional enabling to switch between stages
-    typename Enable = void
-  >
-struct DefaultGemmPlanarComplexUniversal;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for pipelined mainloop
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Layout type for C and D matrix operands
-    typename LayoutC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Operation performed by GEMM
-    typename Operator
-  >
-struct DefaultGemmPlanarComplexUniversal<
-  ElementA,
-  LayoutA,
-  TransformA,
-  kAlignmentA,
-  ElementB,
-  LayoutB,
-  TransformB,
-  kAlignmentB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  OperatorClass,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  Operator,
-  typename platform::enable_if<(Stages <= 2)>::type 
-> {
-
-  /// Define planar complex valued variants instead
-  using Mma = typename gemm::threadblock::DefaultMmaPlanarComplexPipelined<
-    ElementA,
-    LayoutA,
-    kAlignmentA,
-    ElementB,
-    LayoutB,
-    kAlignmentB,
-    ElementAccumulator,
-    LayoutC,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    Stages,
-    TransformA,
-    TransformB,
-    Operator
-  >::ThreadblockMma;
-
-  /// Planar complex epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpiloguePlanarComplex<
-    ThreadblockShape,
-    typename Mma::Policy::Operator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape::kK / WarpShape::kK,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount  
-  >::Epilogue;
-
-  /// Define the kernel in terms of the default kernel
-  using GemmKernel = kernel::GemmPlanarComplex<
-    Mma,
-    Epilogue, 
-    ThreadblockSwizzle
-  >;
-
-  // Array variant
-  using GemmArrayKernel = kernel::GemmPlanarComplexArray<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle
-  >;
-};
-  
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for multiple pipeline stages.
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Layout type for C and D matrix operands
-    typename LayoutC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Operation performed by GEMM
-    typename Operator
-  >
-struct DefaultGemmPlanarComplexUniversal<
-  ElementA,
-  LayoutA,
-  TransformA,
-  kAlignmentA,
-  ElementB,
-  LayoutB,
-  TransformB,
-  kAlignmentB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  OperatorClass,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  Operator,
-  typename platform::enable_if<(Stages > 2)>::type 
-> {
-
-  /// Define planar complex valued variants instead
-  using Mma = typename gemm::threadblock::DefaultMmaPlanarComplexMultistage<
-    ElementA,
-    LayoutA,
-    kAlignmentA,
-    ElementB,
-    LayoutB,
-    kAlignmentB,
-    ElementAccumulator,
-    LayoutC,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    Stages,
-    TransformA,
-    TransformB,
-    Operator
-  >::ThreadblockMma;
-
-  /// Planar complex epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpiloguePlanarComplex<
-    ThreadblockShape,
-    typename Mma::Policy::Operator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape::kK / WarpShape::kK,
-    EpilogueOutputOp,
-    EpilogueOutputOp::kCount  
-  >::Epilogue;
-
-  /// Define the kernel in terms of the default kernel
-  using GemmKernel = kernel::GemmPlanarComplex<
-    Mma,
-    Epilogue, 
-    ThreadblockSwizzle
-  >;
-
-  // Array variant
-  using GemmArrayKernel = kernel::GemmPlanarComplexArray<
-    Mma,
-    Epilogue,
-    ThreadblockSwizzle
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace kernel
-}  // namespace gemm
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse.h
deleted file mode 100755
index f1841a377..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse.h
+++ /dev/null
@@ -1,252 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief 
-      Default kernel-level GEMM definitions combine threadblock-scoped matrix multiply-add with
-      the appropriate threadblock-scoped epilogue.
-  
-      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
-      accommodated by exchanging A and B operands and assuming transposed layouts. Partial
-      specializations here choose 'device::GemmTransposed' to implement this functionality.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/layout/matrix.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/wmma.h"
-
-#include "cutlass/epilogue/threadblock/epilogue.h"
-#include "cutlass/epilogue/thread/linear_combination.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/kernel/gemm.h"
-#include "cutlass/gemm/kernel/sparse_gemm.h"
-#include "cutlass/gemm/kernel/gemm_pipelined.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sparse_sm80.h"
-#include "cutlass/gemm/threadblock/default_sparse_mma.h"
-#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-
-#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
-#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
-#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
-#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
-
-#if defined(CUTLASS_ARCH_WMMA_ENABLED)
-#include "cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h"
-#endif //CUTLASS_ARCH_WMMA_ENABLED
-
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by GEMM
-    typename Operator>
-struct DefaultSparseGemm;
-
-////////////////////////////////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Ampere Architecture
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by GEMM
-    typename Operator>
-struct DefaultSparseGemm<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, ElementC,
-                   layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
-                   arch::Sm80, ThreadblockShape, WarpShape, InstructionShape,
-                   EpilogueOutputOp, ThreadblockSwizzle, Stages, SplitKSerial,
-                   Operator> {
-  /// Define the threadblock-scoped matrix multiply-accumulate
-  using Mma = typename cutlass::gemm::threadblock::DefaultSparseMma<
-      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
-      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80,
-      ThreadblockShape, WarpShape, InstructionShape, Stages,
-      Operator>::ThreadblockMma;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  /// Define the epilogue
-  using Epilogue =
-      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
-          ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
-          EpilogueOutputOp::kCount>::Epilogue;
-
-  /// Define the kernel-level GEMM operator.
-  using GemmKernel = kernel::SparseGemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Ada Architecture
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by GEMM
-    typename Operator>
-struct DefaultSparseGemm<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, ElementC,
-                   layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
-                   arch::Sm89, ThreadblockShape, WarpShape, InstructionShape,
-                   EpilogueOutputOp, ThreadblockSwizzle, Stages, SplitKSerial,
-                   Operator> {
-  /// Define the threadblock-scoped matrix multiply-accumulate
-  using Mma = typename cutlass::gemm::threadblock::DefaultSparseMma<
-      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
-      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm89,
-      ThreadblockShape, WarpShape, InstructionShape, Stages,
-      Operator>::ThreadblockMma;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  /// Define the epilogue
-  using Epilogue =
-      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
-          ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
-          EpilogueOutputOp::kCount>::Epilogue;
-
-  /// Define the kernel-level GEMM operator.
-  using GemmKernel = kernel::SparseGemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace kernel
-}  // namespace gemm
-}  // namespace cutlass
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse_universal.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse_universal.h
deleted file mode 100755
index 250a0e7b2..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse_universal.h
+++ /dev/null
@@ -1,141 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-      Default kernel-level Sparse GEMM definitions combine threadblock-scoped matrix multiply-add with
-      the appropriate threadblock-scoped epilogue.
-  
-      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
-      accommodated by exchanging A and B operands and assuming transposed layouts. Partial
-      specializations here choose 'device::GemmTransposed' to implement this functionality.
-
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/complex.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/numeric_types.h"
-
-#include "cutlass/gemm/kernel/gemm_sparse_universal.h"
-#include "cutlass/gemm/kernel/default_gemm_sparse.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-//
-// Real-valued GEMM kernels
-//
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Layout type for C and D matrix operands
-    typename LayoutC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Operation performed by GEMM
-    typename Operator
->
-struct DefaultGemmSparseUniversal {
-
-  using DefaultGemmKernel = typename kernel::DefaultSparseGemm<
-    ElementA,
-    LayoutA,
-    kAlignmentA,
-    ElementB,
-    LayoutB,
-    kAlignmentB,
-    ElementC,
-    LayoutC,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    true,
-    Operator
-  >::GemmKernel;
-
-  /// Select kernel by ThreadblockSwizzle's support for StreamkFeature
-  using GemmKernel = kernel::GemmSparseUniversal<
-      typename DefaultGemmKernel::Mma,
-      typename DefaultGemmKernel::Epilogue,
-      ThreadblockSwizzle>;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace kernel
-}  // namespace gemm
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse_universal_with_absmax.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse_universal_with_absmax.h
deleted file mode 100755
index 019390921..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse_universal_with_absmax.h
+++ /dev/null
@@ -1,144 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-      Default kernel-level Sparse GEMM definitions combine threadblock-scoped matrix multiply-add with
-      the appropriate threadblock-scoped epilogue.
-  
-      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
-      accommodated by exchanging A and B operands and assuming transposed layouts. Partial
-      specializations here choose 'device::GemmTransposed' to implement this functionality.
-
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/complex.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/numeric_types.h"
-
-#include "cutlass/epilogue/threadblock/default_epilogue_with_absmax.h"
-#include "cutlass/gemm/kernel/gemm_sparse_universal_with_absmax.h"
-#include "cutlass/gemm/kernel/default_gemm_sparse.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-//
-// Real-valued GEMM kernels
-//
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Layout type for C and D matrix operands
-    typename LayoutC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Operation performed by GEMM
-    typename Operator
->
-struct DefaultGemmSparseUniversalWithAbsmax {
-
-  using GemmBase = typename DefaultSparseGemm<
-    ElementA, LayoutA, kAlignmentA,
-    ElementB, LayoutB, kAlignmentB,
-    ElementC, LayoutC, ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    false, // SplitKSerial
-    Operator
-  >::GemmKernel;
-
-  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWithAbsMax<
-    typename GemmBase::Epilogue::Shape,
-    typename GemmBase::Epilogue::WarpMmaOperator,
-    GemmBase::Epilogue::kPartitionsK,
-    ElementC,
-    typename EpilogueOutputOp::ElementAuxOutput,
-    ElementC,
-    EpilogueOutputOp,
-    GemmBase::Epilogue::kElementsPerAccess
-  >::Epilogue;
-
-  using GemmKernel = kernel::GemmSparseUniversalWithAbsmax<
-      typename GemmBase::Mma, Epilogue, ThreadblockSwizzle>;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace kernel
-}  // namespace gemm
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse_with_absmax.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse_with_absmax.h
deleted file mode 100755
index 30d063233..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse_with_absmax.h
+++ /dev/null
@@ -1,157 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief
-    Default configuration for a sparse GEMM with fused absolute-maximum calculations and scaling
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/layout/matrix.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/wmma.h"
-
-#include "cutlass/epilogue/threadblock/epilogue.h"
-#include "cutlass/epilogue/thread/linear_combination.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/kernel/gemm.h"
-#include "cutlass/gemm/kernel/sparse_gemm_with_absmax.h"
-#include "cutlass/gemm/kernel/default_gemm_sparse.h"
-#include "cutlass/gemm/kernel/gemm_pipelined.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sparse_sm80.h"
-#include "cutlass/gemm/threadblock/default_sparse_mma.h"
-#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-
-#include "cutlass/epilogue/threadblock/default_epilogue_with_absmax.h"
-#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
-#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
-#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
-#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
-
-#if defined(CUTLASS_ARCH_WMMA_ENABLED)
-#include "cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h"
-#endif //CUTLASS_ARCH_WMMA_ENABLED
-
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by GEMM
-    typename Operator>
-struct DefaultSparseGemmWithAbsmax {
-
-  using GemmBase = typename DefaultSparseGemm<
-    ElementA_, LayoutA_, kAlignmentA,
-    ElementB_, LayoutB_, kAlignmentB,
-    ElementC_, LayoutC_, ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    SplitKSerial,
-    Operator
-  >::GemmKernel;
-
-  // Define epilogue
-  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWithAbsMax<
-    typename GemmBase::Epilogue::Shape,
-    typename GemmBase::Epilogue::WarpMmaOperator,
-    GemmBase::Epilogue::kPartitionsK,
-    ElementC_,
-    typename EpilogueOutputOp::ElementAuxOutput,
-    ElementC_,
-    EpilogueOutputOp,
-    GemmBase::Epilogue::kElementsPerAccess
-  >::Epilogue;
-
-  /// Define the kernel-level GEMM operator.
-  using GemmKernel = kernel::SparseGemmWithAbsmax<typename GemmBase::Mma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace kernel
-}  // namespace gemm
-}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse_with_visitor.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse_with_visitor.h
deleted file mode 100755
index 9d7f2c6f7..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse_with_visitor.h
+++ /dev/null
@@ -1,197 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Default sparse GEMM with visitor.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/layout/matrix.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/wmma.h"
-
-#include "cutlass/epilogue/threadblock/epilogue.h"
-#include "cutlass/epilogue/thread/linear_combination.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/kernel/gemm.h"
-#include "cutlass/gemm/kernel/default_gemm_sparse.h"
-#include "cutlass/gemm/kernel/sparse_gemm_with_visitor.h"
-#include "cutlass/gemm/kernel/gemm_pipelined.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sparse_sm80.h"
-#include "cutlass/gemm/threadblock/default_sparse_mma.h"
-#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-
-#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
-#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
-#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
-#include "cutlass/epilogue/threadblock/epilogue_with_visitor_callbacks.h"
-#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
-
-#if defined(CUTLASS_ARCH_WMMA_ENABLED)
-#include "cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h"
-#endif //CUTLASS_ARCH_WMMA_ENABLED
-
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Layout type for C and D matrix operands
-    typename LayoutC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename FusionCallbacks,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Number of stages used in the pipelined epilogue
-    int EpilogueStages = 1>
-struct DefaultSparseGemmWithVisitor;
-
-////////////////////////////////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Ampere Architecture
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Layout type for C and D matrix operands
-    typename LayoutC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename FusionCallbacks,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Number of stages used in the pipelined epilogue
-    int EpilogueStages>
-struct DefaultSparseGemmWithVisitor<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
-                   ElementC, LayoutC, ElementAccumulator, arch::OpClassTensorOp,
-                   arch::Sm80, ThreadblockShape, WarpShape, InstructionShape,
-                   FusionCallbacks, ThreadblockSwizzle, Stages, Operator,
-                   EpilogueStages> {
-  /// Define the threadblock-scoped matrix multiply-accumulate
-  using Mma = typename cutlass::gemm::threadblock::DefaultSparseMma<
-      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
-      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80,
-      ThreadblockShape, WarpShape, InstructionShape, Stages,
-      Operator>::ThreadblockMma;
-
-  static constexpr int kAlignmentC = 128 / sizeof_bits<ElementC>::value;
-  using ElementEpilogue = ElementAccumulator;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-  using EpilogueOutputOp =
-      typename epilogue::thread::LinearCombination<
-          ElementC, kAlignmentC,
-          ElementAccumulator, ElementEpilogue>;
-  using BaseEpilogue =
-      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
-          ThreadblockShape, typename Mma::Operator, kPartitionsK,
-          EpilogueOutputOp, EpilogueOutputOp::kCount>::Epilogue;
-
-  // Define epilogue
-  using Epilogue = cutlass::epilogue::threadblock::EpilogueWithVisitorCallbacks<
-      BaseEpilogue,
-      FusionCallbacks,
-      EpilogueStages>;
-
-  /// Define the kernel-level GEMM operator.
-  using GemmKernel = kernel::SparseGemmWithEpilogueVisitor<Mma, Epilogue, ThreadblockSwizzle>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace kernel
-}  // namespace gemm
-}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_splitk_parallel.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_splitk_parallel.h
deleted file mode 100755
index 061bb7494..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_splitk_parallel.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-      Default kernel-level GEMM definitions combine threadblock-scoped matrix multiply-add with
-      the appropriate threadblock-scoped epilogue.
-  
-      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
-      accommodated by exchanging A and B operands and assuming transposed layouts. Partial
-      specializations here choose 'device::GemmTransposed' to implement this functionality.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/kernel/default_gemm.h"
-#include "cutlass/gemm/kernel/gemm_splitk_parallel.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-  /// Element type for A matrix operand
-  typename ElementA_,
-  /// Layout type for A matrix operand
-  typename LayoutA_,
-  /// Access granularity of A matrix in units of elements
-  int kAlignmentA,
-  /// Element type for B matrix operand
-  typename ElementB_,
-  /// Layout type for B matrix operand
-  typename LayoutB_,
-  /// Access granularity of B matrix in units of elements
-  int kAlignmentB,
-  /// Element type for C and D matrix operands
-  typename ElementC_,
-  /// Layout type for C and D matrix operands
-  typename LayoutC_,
-  /// Element type for internal accumulation
-  typename ElementAccumulator,
-  /// Operator class tag
-  typename OperatorClass,
-  /// Tag indicating architecture to tune for
-  typename ArchTag,
-  /// Threadblock-level tile size (concept: GemmShape)
-  typename ThreadblockShape,
-  /// Warp-level tile size (concept: GemmShape)
-  typename WarpShape,
-  /// Warp-level tile size (concept: GemmShape)
-  typename InstructionShape,
-  /// Epilogue output operator
-  typename EpilogueOutputOp,
-  /// Threadblock-level swizzling operator
-  typename ThreadblockSwizzle,
-  /// Number of stages used in the pipelined mainloop
-  int Stages,
-  /// Operation performed by GEMM
-  typename Operator
->
-struct DefaultGemmSplitKParallel {
-
-  /// Define the threadblock-scoped matrix multiply-accumulate using the basic GEMM's
-  /// mainloop.
-  using Default = DefaultGemm<
-    ElementA_,
-    LayoutA_,
-    kAlignmentA,
-    ElementB_,
-    LayoutB_,
-    kAlignmentB,
-    ElementAccumulator,
-    LayoutC_,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    false,
-    Operator
-  >;
-
-  /// Define the matrix multiply operator
-  using Mma = typename Default::Mma;
-
-  /// Define the epilogue
-  using Epilogue = typename Default::Epilogue;
-
-  /// Define the kernel-level GEMM operator.
-  using GemmKernel = kernel::GemmSplitKParallel<Mma, Epilogue, ThreadblockSwizzle>;
-};
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace kernel
-}  // namespace gemm
-}  // namespace cutlass
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_streamk_with_broadcast.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_streamk_with_broadcast.h
deleted file mode 100755
index c19fdb5e2..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_streamk_with_broadcast.h
+++ /dev/null
@@ -1,146 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-  \brief 
-    Defines a Stream-K GEMM that can broadcast a bias vector in the epilogue.
-    Similar structure to DefaultGemmWithBroadcast, but uses its own epilogue 
-    (DefaultStreamkEpilogueWithBroadcastTensorOp) and its own GEMM kernel 
-    (GemmStreamkWithFusedEpilogue).
-
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/gemm/kernel/gemm_streamk_with_fused_epilogue.h"
-#include "cutlass/gemm/kernel/default_gemm_universal.h"
-
-#include "cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h"
-#include "cutlass/epilogue/threadblock/epilogue_with_broadcast.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  /// Element type for A matrix operand
-  typename ElementA_,
-  /// Layout type for A matrix operand
-  typename LayoutA_,
-  /// Complex elementwise transformation on A operand
-  ComplexTransform TransformA,
-  /// Access granularity of A matrix in units of elements
-  int kAlignmentA,
-  /// Element type for B matrix operand
-  typename ElementB_,
-  /// Layout type for B matrix operand
-  typename LayoutB_,
-  /// Complex elementwise transformation on B operand
-  ComplexTransform TransformB,
-  /// Access granularity of B matrix in units of elements
-  int kAlignmentB,
-  /// Element type for C and D matrix operands
-  typename ElementC_,
-  /// Layout type for C and D matrix operands
-  typename LayoutC_,
-  /// Element type for internal accumulation
-  typename ElementAccumulator,
-  /// Operator class tag
-  typename OperatorClass,
-  /// Tag indicating architecture to tune for
-  typename ArchTag,
-  /// Threadblock-level tile size (concept: GemmShape)
-  typename ThreadblockShape,
-  /// Warp-level tile size (concept: GemmShape)
-  typename WarpShape,
-  /// Warp-level tile size (concept: GemmShape)
-  typename InstructionShape,
-  /// Epilogue output operator      - must satisfy concept of 'EpilogueWithBroadcastOp' 
-  typename EpilogueOutputOp,
-  /// Threadblock-level swizzling operator
-  typename ThreadblockSwizzle,
-  /// Number of stages used in the pipelined mainloop
-  int Stages,
-  /// Operation performed by GEMM
-  typename Operator,
-  ///
-  typename Enable = void
->
-struct DefaultGemmStreamkWithBroadcast {
-
-  using GemmBase = typename DefaultGemmUniversal<
-    ElementA_, LayoutA_, TransformA, kAlignmentA,
-    ElementB_, LayoutB_, TransformB, kAlignmentB,
-    ElementC_, LayoutC_, ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    Operator
-  >::GemmKernel;
-
-  // Replace epilogue
-  using Epilogue = typename cutlass::epilogue::threadblock::DefaultStreamkEpilogueWithBroadcastTensorOp<
-    typename GemmBase::Epilogue::Shape,
-    typename GemmBase::Epilogue::WarpMmaOperator,
-    GemmBase::Epilogue::kPartitionsK,
-    ElementC_,
-    typename EpilogueOutputOp::ElementT,
-    typename EpilogueOutputOp::ElementVector,
-    EpilogueOutputOp,
-    GemmBase::Epilogue::kElementsPerAccess
-  >::Epilogue;
-
-  // Compose the GEMM kernel
-  using GemmKernel = GemmStreamkWithFusedEpilogue<
-    typename GemmBase::Mma,
-    Epilogue,
-    ThreadblockSwizzle
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace kernel
-}  // namespace gemm
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_universal.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_universal.h
deleted file mode 100755
index ed7951be5..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_universal.h
+++ /dev/null
@@ -1,396 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief
-      Default kernel-level GEMM definitions combine threadblock-scoped matrix multiply-add with
-      the appropriate threadblock-scoped epilogue.
-
-      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
-      accommodated by exchanging A and B operands and assuming transposed layouts. Partial
-      specializations here choose 'device::GemmTransposed' to implement this functionality.
-
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/complex.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/numeric_types.h"
-
-#include "cutlass/gemm/kernel/gemm_universal.h"
-#include "cutlass/gemm/kernel/gemm_universal_streamk.h"
-#include "cutlass/gemm/kernel/default_gemm.h"
-#include "cutlass/gemm/kernel/default_gemm_complex.h"
-
-#include "cutlass/layout/permute.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Instruction tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Use zfill or predicate for out-of-bound cp.async
-    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone,
-    /// Gather operand A by using an index array
-    bool GatherA = false,
-    /// Gather operand B by using an index array
-    bool GatherB = false,
-    /// Scatter result D by using an index array
-    bool ScatterD = false,
-    /// Permute result D
-    typename PermuteDLayout = layout::NoPermute,
-    /// Permute operand A
-    typename PermuteALayout_ = layout::NoPermute,
-    /// Permute operand B
-    typename PermuteBLayout_ = layout::NoPermute,
-    ///
-    typename Enable = void
-    >
-struct DefaultGemmUniversal;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Real-valued GEMM kernels
-//
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Layout type for C and D matrix operands
-    typename LayoutC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Use zfill or predicate for out-of-bound cp.async
-    SharedMemoryClearOption SharedMemoryClear,
-    /// Gather operand A by using an index array
-    bool GatherA,
-    /// Gather operand B by using an index array
-    bool GatherB,
-    /// Scatter result D by using an index array
-    bool ScatterD,
-    /// Permute result D
-    typename PermuteDLayout,
-    /// Permute operand A
-    typename PermuteALayout,
-    /// Permute operand B
-    typename PermuteBLayout
->
-struct DefaultGemmUniversal<
-  ElementA,
-  LayoutA,
-  ComplexTransform::kNone,   // transform A
-  kAlignmentA,
-  ElementB,
-  LayoutB,
-  ComplexTransform::kNone,   // transform B
-  kAlignmentB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  OperatorClass,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  Operator,
-  SharedMemoryClear,
-  GatherA,
-  GatherB,
-  ScatterD,
-  PermuteDLayout,
-  PermuteALayout,
-  PermuteBLayout,
-  typename platform::enable_if< ! cutlass::is_complex<ElementAccumulator>::value>::type
-> {
-
-  using DefaultGemmKernel = typename kernel::DefaultGemm<
-    ElementA,
-    LayoutA,
-    kAlignmentA,
-    ElementB,
-    LayoutB,
-    kAlignmentB,
-    ElementC,
-    LayoutC,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    true,
-    Operator,
-    SharedMemoryClear,
-    GatherA,
-    GatherB,
-    ScatterD,
-    PermuteDLayout,
-    PermuteALayout,
-    PermuteBLayout
-  >::GemmKernel;
-
-  /// Universal kernel without StreamkFeature member type
-  template <class SwizzleT, class Enable = void>
-  class SelectBase :
-    public kernel::GemmUniversal<
-      typename DefaultGemmKernel::Mma,
-      typename DefaultGemmKernel::Epilogue,
-      SwizzleT>
-  {};
-
-  /// Universal kernel with StreamkFeature member type
-  template <class SwizzleT>
-  class SelectBase<SwizzleT, typename SwizzleT::StreamkFeature> :
-    public kernel::GemmUniversalStreamk<
-      typename DefaultGemmKernel::Mma,
-      typename DefaultGemmKernel::Epilogue,
-      SwizzleT>
-  {};
-
-  /// Select kernel by ThreadblockSwizzle's support for StreamkFeature
-  using GemmKernel = SelectBase<ThreadblockSwizzle>;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-//
-// Complex-valued GEMM kernels
-//
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Layout type for C and D matrix operands
-    typename LayoutC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Use zfill or predicate for out-of-bound cp.async
-    SharedMemoryClearOption SharedMemoryClear
-  >
-struct DefaultGemmUniversal<
-  ElementA,
-  LayoutA,
-  TransformA,
-  kAlignmentA,
-  ElementB,
-  LayoutB,
-  TransformB,
-  kAlignmentB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  OperatorClass,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  Operator,
-  SharedMemoryClear,
-  false,
-  false,
-  false,
-  layout::NoPermute,
-  layout::NoPermute,
-  layout::NoPermute,
-  typename platform::enable_if<cutlass::is_complex<ElementAccumulator>::value>::type
-> {
-
-  using DefaultGemmKernel = typename kernel::DefaultGemmComplex<
-    ElementA,
-    LayoutA,
-    ElementB,
-    LayoutB,
-    ElementC,
-    LayoutC,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    TransformA,
-    TransformB,
-    Operator,
-    false
-  >::GemmKernel;
-
-  /// Universal kernel without StreamkFeature member type
-  template <class SwizzleT, class Enable = void>
-  class SelectBase :
-    public kernel::GemmUniversal<
-      typename DefaultGemmKernel::Mma,
-      typename DefaultGemmKernel::Epilogue,
-      SwizzleT>
-  {};
-
-  /// Universal kernel with StreamkFeature member type
-  template <class SwizzleT>
-  class SelectBase<SwizzleT, typename SwizzleT::StreamkFeature> :
-    public kernel::GemmUniversalStreamk<
-      typename DefaultGemmKernel::Mma,
-      typename DefaultGemmKernel::Epilogue,
-      SwizzleT>
-  {};
-
-  /// Select kernel by ThreadblockSwizzle's support for StreamkFeature
-  using GemmKernel = SelectBase<ThreadblockSwizzle>;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace kernel
-}  // namespace gemm
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_universal_with_visitor.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_universal_with_visitor.h
deleted file mode 100755
index a3c69f2dc..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_universal_with_visitor.h
+++ /dev/null
@@ -1,157 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-  \brief
-    Default configuration for a GEMM with fused epilogue visitor callbacks
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/kernel/default_gemm_universal.h"
-
-#include "cutlass/gemm/kernel/gemm_universal_with_visitor.h"
-#include "cutlass/gemm/kernel/gemm_universal_with_visitor_streamk.h"
-#include "cutlass/epilogue/threadblock/epilogue_with_visitor_callbacks.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  /// Element type for A matrix operand
-  typename ElementA_,
-  /// Layout type for A matrix operand
-  typename LayoutA_,
-  /// Complex elementwise transformation on A operand
-  ComplexTransform TransformA,
-  /// Access granularity of A matrix in units of elements
-  int kAlignmentA,
-  /// Element type for B matrix operand
-  typename ElementB_,
-  /// Layout type for B matrix operand
-  typename LayoutB_,
-  /// Complex elementwise transformation on B operand
-  ComplexTransform TransformB,
-  /// Access granularity of B matrix in units of elements
-  int kAlignmentB,
-  /// Element type for C and D matrix operands
-  typename ElementC_,
-  /// Layout type for C and D matrix operands
-  typename LayoutC_,
-  /// Access granularity of C matrix in unit of elements
-  int kAlignmentC,
-  /// Element type for internal accumulation
-  typename ElementAccumulator,
-  /// Element type for epilogue computation
-  typename ElementEpilogue,
-  /// Operator class tag
-  typename OperatorClass,
-  /// Tag indicating architecture to tune for
-  typename ArchTag,
-  /// Threadblock-level tile size (concept: GemmShape)
-  typename ThreadblockShape,
-  /// Warp-level tile size (concept: GemmShape)
-  typename WarpShape,
-  /// Warp-level tile size (concept: GemmShape)
-  typename InstructionShape,
-  /// Epilogue output operator
-  typename FusionCallbacks,
-  /// Threadblock-level swizzling operator
-  typename ThreadblockSwizzle,
-  /// Number of stages used in the pipelined mainloop
-  int Stages,
-  /// Operation performed by GEMM
-  typename Operator,
-  /// Number of stages used in the pipelined epilogue
-  int EpilogueStages = 1
->
-struct DefaultGemmWithVisitor {
-
-  using GemmBase = typename DefaultGemmUniversal<
-    ElementA_, LayoutA_, TransformA, kAlignmentA, 
-    ElementB_, LayoutB_, TransformB, kAlignmentB,
-    ElementC_, LayoutC_, ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    epilogue::thread::LinearCombination<
-        ElementC_, kAlignmentC, 
-        ElementAccumulator, ElementEpilogue 
-    >,
-    ThreadblockSwizzle,
-    Stages,
-    Operator
-  >::GemmKernel;
-
-  // Define epilogue
-  using Epilogue = cutlass::epilogue::threadblock::EpilogueWithVisitorCallbacks<
-      typename GemmBase::Epilogue,
-      FusionCallbacks,
-      EpilogueStages
-  >;
-
-  /// GemmWithVisitor without StreamkFeature member type
-  template <class SwizzleT, class Enable = void>
-  class SelectBase :
-    public GemmWithEpilogueVisitor<
-      typename GemmBase::Mma,
-      Epilogue,
-      SwizzleT>
-  {};
-
-  /// GemmWIthVisitor with StreamkFeature member type
-  template <class SwizzleT>
-  class SelectBase<SwizzleT, typename SwizzleT::StreamkFeature> :
-    public GemmWithEpilogueVisitorStreamk<
-      typename GemmBase::Mma,
-      Epilogue,
-      SwizzleT>
-  {};
-
-  /// Select kernel by ThreadblockSwizzle's support for StreamkFeature
-  using GemmKernel = SelectBase<ThreadblockSwizzle>;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace kernel
-}  // namespace gemm
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_with_absmax.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_with_absmax.h
deleted file mode 100755
index 3fd643e7e..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_with_absmax.h
+++ /dev/null
@@ -1,143 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-  \brief
-    Default configuration for a GEMM with fused absolute-maximum calculations and scaling
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/gemm/kernel/gemm_with_absmax.h"
-#include "cutlass/gemm/kernel/default_gemm_universal.h"
-
-#include "cutlass/epilogue/threadblock/default_epilogue_with_absmax.h"
-#include "cutlass/epilogue/threadblock/epilogue_with_absmax.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  /// Element type for A matrix operand
-  typename ElementA_,
-  /// Layout type for A matrix operand
-  typename LayoutA_,
-  /// Complex elementwise transformation on A operand
-  ComplexTransform TransformA,
-  /// Access granularity of A matrix in units of elements
-  int kAlignmentA,
-  /// Element type for B matrix operand
-  typename ElementB_,
-  /// Layout type for B matrix operand
-  typename LayoutB_,
-  /// Complex elementwise transformation on B operand
-  ComplexTransform TransformB,
-  /// Access granularity of B matrix in units of elements
-  int kAlignmentB,
-  /// Element type for C and D matrix operands
-  typename ElementC_,
-  /// Layout type for C and D matrix operands
-  typename LayoutC_,
-  /// Element type for internal accumulation
-  typename ElementAccumulator,
-  /// Operator class tag
-  typename OperatorClass,
-  /// Tag indicating architecture to tune for
-  typename ArchTag,
-  /// Threadblock-level tile size (concept: GemmShape)
-  typename ThreadblockShape,
-  /// Warp-level tile size (concept: GemmShape)
-  typename WarpShape,
-  /// Warp-level tile size (concept: GemmShape)
-  typename InstructionShape,
-  /// Epilogue output operator
-  typename EpilogueOutputOp,
-  /// Threadblock-level swizzling operator
-  typename ThreadblockSwizzle,
-  /// Number of stages used in the pipelined mainloop
-  int Stages,
-  /// Operation performed by GEMM
-  typename Operator,
-  ///
-  typename Enable = void
->
-struct DefaultGemmWithAbsMax {
-
-  using GemmBase = typename DefaultGemmUniversal<
-    ElementA_, LayoutA_, TransformA, kAlignmentA,
-    ElementB_, LayoutB_, TransformB, kAlignmentB,
-    ElementC_, LayoutC_, ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    Operator
-  >::GemmKernel;
-
-  // Define epilogue
-  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWithAbsMax<
-    typename GemmBase::Epilogue::Shape,
-    typename GemmBase::Epilogue::WarpMmaOperator,
-    GemmBase::Epilogue::kPartitionsK,
-    ElementC_,
-    typename EpilogueOutputOp::ElementAuxOutput,
-    ElementC_,
-    EpilogueOutputOp,
-    GemmBase::Epilogue::kElementsPerAccess
-  >::Epilogue;
-
-  // Compose the GEMM kernel
-  using GemmKernel = GemmWithAbsMax<
-    typename GemmBase::Mma,
-    Epilogue,
-    ThreadblockSwizzle
-  >;
-};
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace kernel
-}  // namespace gemm
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_with_broadcast.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_with_broadcast.h
deleted file mode 100755
index e95c25610..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_with_broadcast.h
+++ /dev/null
@@ -1,243 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-  \brief 
-    Defines a GEMM with Reduction based on an existing UniversalGemm kernel.
-
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/gemm/kernel/gemm_with_fused_epilogue.h"
-#include "cutlass/gemm/kernel/default_gemm_universal.h"
-
-#include "cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h"
-#include "cutlass/epilogue/threadblock/epilogue_with_broadcast.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  /// Element type for A matrix operand
-  typename ElementA_,
-  /// Layout type for A matrix operand
-  typename LayoutA_,
-  /// Complex elementwise transformation on A operand
-  ComplexTransform TransformA,
-  /// Access granularity of A matrix in units of elements
-  int kAlignmentA,
-  /// Element type for B matrix operand
-  typename ElementB_,
-  /// Layout type for B matrix operand
-  typename LayoutB_,
-  /// Complex elementwise transformation on B operand
-  ComplexTransform TransformB,
-  /// Access granularity of B matrix in units of elements
-  int kAlignmentB,
-  /// Element type for C and D matrix operands
-  typename ElementC_,
-  /// Layout type for C and D matrix operands
-  typename LayoutC_,
-  /// Element type for internal accumulation
-  typename ElementAccumulator,
-  /// Operator class tag
-  typename OperatorClass,
-  /// Tag indicating architecture to tune for
-  typename ArchTag,
-  /// Threadblock-level tile size (concept: GemmShape)
-  typename ThreadblockShape,
-  /// Warp-level tile size (concept: GemmShape)
-  typename WarpShape,
-  /// Warp-level tile size (concept: GemmShape)
-  typename InstructionShape,
-  /// Epilogue output operator      - must satisfy concept of 'EpilogueWithBroadcastOp' 
-  typename EpilogueOutputOp,
-  /// Threadblock-level swizzling operator
-  typename ThreadblockSwizzle,
-  /// Number of stages used in the pipelined mainloop
-  int Stages,
-  /// Operation performed by GEMM
-  typename Operator,
-  ///
-  typename Enable = void
->
-struct DefaultGemmWithBroadcast {
-
-  using GemmBase = typename DefaultGemmUniversal<
-    ElementA_, LayoutA_, TransformA, kAlignmentA,
-    ElementB_, LayoutB_, TransformB, kAlignmentB,
-    ElementC_, LayoutC_, ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    Operator
-  >::GemmKernel;
-
-  // Define epilogue
-  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWithBroadcastTensorOp<
-    typename GemmBase::Epilogue::Shape,
-    typename GemmBase::Epilogue::WarpMmaOperator,
-    GemmBase::Epilogue::kPartitionsK,
-    ElementC_,
-    typename EpilogueOutputOp::ElementT,
-    typename EpilogueOutputOp::ElementVector,
-    EpilogueOutputOp,
-    GemmBase::Epilogue::kElementsPerAccess
-  >::Epilogue;
-
-  // Compose the GEMM kernel
-  using GemmKernel = GemmWithFusedEpilogue<
-    typename GemmBase::Mma,
-    Epilogue,
-    ThreadblockSwizzle
-  >;
-};
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization: ArchTag = cutlass::arch::Sm70
-///
-///
-template <
-  /// Element type for A matrix operand
-  typename ElementA_,
-  /// Layout type for A matrix operand
-  typename LayoutA_,
-  /// Complex elementwise transformation on A operand
-  ComplexTransform TransformA,
-  /// Access granularity of A matrix in units of elements
-  int kAlignmentA,
-  /// Element type for B matrix operand
-  typename ElementB_,
-  /// Layout type for B matrix operand
-  typename LayoutB_,
-  /// Complex elementwise transformation on B operand
-  ComplexTransform TransformB,
-  /// Access granularity of B matrix in units of elements
-  int kAlignmentB,
-  /// Element type for C and D matrix operands
-  typename ElementC_,
-  /// Layout type for C and D matrix operands
-  typename LayoutC_,
-  /// Element type for internal accumulation
-  typename ElementAccumulator,
-  /// Operator class tag
-  typename OperatorClass,
-  /// Threadblock-level tile size (concept: GemmShape)
-  typename ThreadblockShape,
-  /// Warp-level tile size (concept: GemmShape)
-  typename WarpShape,
-  /// Warp-level tile size (concept: GemmShape)
-  typename InstructionShape,
-  /// Epilogue output operator      - must satisfy concept of 'EpilogueWithBroadcastOp' 
-  typename EpilogueOutputOp,
-  /// Threadblock-level swizzling operator
-  typename ThreadblockSwizzle,
-  /// Number of stages used in the pipelined mainloop
-  int Stages,
-  /// Operation performed by GEMM
-  typename Operator,
-  ///
-  typename Enable
->
-struct DefaultGemmWithBroadcast<
-  ElementA_, LayoutA_, TransformA, kAlignmentA, 
-  ElementB_, LayoutB_, TransformB, kAlignmentB,
-  ElementC_, LayoutC_,
-  ElementAccumulator,
-  OperatorClass,
-  cutlass::arch::Sm70,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  Operator,
-  Enable
-  > {
-
-  using GemmBase = typename DefaultGemmUniversal<
-    ElementA_, LayoutA_, TransformA, kAlignmentA,
-    ElementB_, LayoutB_, TransformB, kAlignmentB,
-    ElementC_, LayoutC_, ElementAccumulator,
-    OperatorClass,
-    cutlass::arch::Sm70,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    Operator
-  >::GemmKernel;
-
-  // Define epilogue
-  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWithBroadcastVoltaTensorOp<
-    typename GemmBase::Epilogue::Shape,
-    typename GemmBase::Epilogue::WarpMmaOperator,
-    GemmBase::Epilogue::kPartitionsK,
-    ElementC_,
-    typename EpilogueOutputOp::ElementT,
-    typename EpilogueOutputOp::ElementVector,
-    EpilogueOutputOp,
-    GemmBase::Epilogue::kElementsPerAccess
-  >::Epilogue;
-
-  // Compose the GEMM kernel
-  using GemmKernel = GemmWithFusedEpilogue<
-    typename GemmBase::Mma,
-    Epilogue,
-    ThreadblockSwizzle
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace kernel
-}  // namespace gemm
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_with_k_reduction.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_with_k_reduction.h
deleted file mode 100755
index ca4c2cba6..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_with_k_reduction.h
+++ /dev/null
@@ -1,150 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-      Default kernel-level GEMM definitions combine threadblock-scoped matrix multiply-add with
-      the appropriate threadblock-scoped epilogue.
-  
-      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
-      accommodated by exchanging A and B operands and assuming transposed layouts. Partial
-      specializations here choose 'device::GemmTransposed' to implement this functionality.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/layout/matrix.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/wmma.h"
-
-#include "cutlass/epilogue/threadblock/epilogue.h"
-#include "cutlass/epilogue/thread/linear_combination.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/kernel/gemm_with_k_reduction.h"
-#include "cutlass/gemm/threadblock/default_mma_with_reduction.h"
-#include "cutlass/gemm/threadblock/default_mma_core_with_reduction.h"
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-
-#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
-#include "cutlass/epilogue/threadblock/epilogue_gemm_k_reduction.h"
-#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Layout type for C and D matrix operands
-    typename LayoutC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Reduce A or B along the K dimension
-    bool ReduceKForA_,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Use zfill or predicate for out-of-bound cp.async
-    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone,
-    ///
-    typename Enable = void>
-struct DefaultGemmWithKReduction {
-
-  static const bool kReduceKForA = (platform::is_same<LayoutC, cutlass::layout::RowMajor>::value) ? ReduceKForA_ : !ReduceKForA_;
-
-  /// Define the threadblock-scoped matrix multiply-accumulate
-  using Mma = typename cutlass::gemm::threadblock::DefaultMmaWithReduction<
-      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
-      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, kReduceKForA, arch::Sm80,
-      ThreadblockShape, WarpShape, InstructionShape, Stages,
-      Operator, false, SharedMemoryClear>::ThreadblockMma;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  /// Define the epilogue
-  using Epilogue =
-      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
-          ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
-          EpilogueOutputOp::kCount>::Epilogue;
-
-  /// Define the epilogue of the reduction vector
-  using EpilogueGemmKReduction =
-      typename cutlass::epilogue::threadblock::EpilogueGemmKReduction<
-          ElementAccumulator, ElementC, ThreadblockShape, typename Mma::Operator, kReduceKForA>;
-
-  /// Define the kernel-level GEMM operator.
-  using GemmKernel = kernel::GemmWithKReduction<Mma, Epilogue, EpilogueGemmKReduction, ThreadblockSwizzle>;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace kernel
-}  // namespace gemm
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_with_reduction.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_with_reduction.h
deleted file mode 100755
index 1a578f09f..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_with_reduction.h
+++ /dev/null
@@ -1,246 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-  \brief 
-    Defines a GEMM with Reduction based on an existing UniversalGemm kernel.
-
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/gemm/kernel/gemm_with_fused_epilogue.h"
-#include "cutlass/gemm/kernel/default_gemm_universal.h"
-
-#include "cutlass/epilogue/threadblock/default_epilogue_with_reduction.h"
-#include "cutlass/epilogue/threadblock/epilogue_with_reduction.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  /// Element type for A matrix operand
-  typename ElementA_,
-  /// Layout type for A matrix operand
-  typename LayoutA_,
-  /// Complex elementwise transformation on A operand
-  ComplexTransform TransformA,
-  /// Access granularity of A matrix in units of elements
-  int kAlignmentA,
-  /// Element type for B matrix operand
-  typename ElementB_,
-  /// Layout type for B matrix operand
-  typename LayoutB_,
-  /// Complex elementwise transformation on B operand
-  ComplexTransform TransformB,
-  /// Access granularity of B matrix in units of elements
-  int kAlignmentB,
-  /// Element type for C and D matrix operands
-  typename ElementC_,
-  /// Layout type for C and D matrix operands
-  typename LayoutC_,
-  /// Element type for internal accumulation
-  typename ElementAccumulator,
-  /// Operator class tag
-  typename OperatorClass,
-  /// Tag indicating architecture to tune for
-  typename ArchTag,
-  /// Threadblock-level tile size (concept: GemmShape)
-  typename ThreadblockShape,
-  /// Warp-level tile size (concept: GemmShape)
-  typename WarpShape,
-  /// Warp-level tile size (concept: GemmShape)
-  typename InstructionShape,
-  /// Epilogue output operator
-  typename EpilogueOutputOp,
-  /// Epilogue reduction operator
-  typename EpilogueReductionOp,
-  /// Threadblock-level swizzling operator
-  typename ThreadblockSwizzle,
-  /// Number of stages used in the pipelined mainloop
-  int Stages,
-  /// Operation performed by GEMM
-  typename Operator,
-  ///
-  typename Enable = void
->
-struct DefaultGemmWithReduction {
-
-  using GemmBase = typename DefaultGemmUniversal<
-    ElementA_, LayoutA_, TransformA, kAlignmentA,
-    ElementB_, LayoutB_, TransformB, kAlignmentB,
-    ElementC_, LayoutC_, ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    Operator,
-    SharedMemoryClearOption::kClearLastStage
-  >::GemmKernel;
-
-  // Define epilogue
-  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWithReductionTensorOp<
-    typename GemmBase::Epilogue::Shape,
-    typename GemmBase::Epilogue::WarpMmaOperator,
-    GemmBase::Epilogue::kPartitionsK,
-    ElementC_,
-    EpilogueOutputOp,
-    EpilogueReductionOp,
-    GemmBase::Epilogue::kElementsPerAccess
-  >::Epilogue;
-
-  // Compose the GEMM kernel
-  using GemmKernel = GemmWithFusedEpilogue<
-    typename GemmBase::Mma,
-    Epilogue,
-    ThreadblockSwizzle
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization: ArchTag = cutlass::arch::Sm70
-///
-///
-template <
-  /// Element type for A matrix operand
-  typename ElementA_,
-  /// Layout type for A matrix operand
-  typename LayoutA_,
-  /// Complex elementwise transformation on A operand
-  ComplexTransform TransformA,
-  /// Access granularity of A matrix in units of elements
-  int kAlignmentA,
-  /// Element type for B matrix operand
-  typename ElementB_,
-  /// Layout type for B matrix operand
-  typename LayoutB_,
-  /// Complex elementwise transformation on B operand
-  ComplexTransform TransformB,
-  /// Access granularity of B matrix in units of elements
-  int kAlignmentB,
-  /// Element type for C and D matrix operands
-  typename ElementC_,
-  /// Layout type for C and D matrix operands
-  typename LayoutC_,
-  /// Element type for internal accumulation
-  typename ElementAccumulator,
-  /// Operator class tag
-  typename OperatorClass,
-  /// Threadblock-level tile size (concept: GemmShape)
-  typename ThreadblockShape,
-  /// Warp-level tile size (concept: GemmShape)
-  typename WarpShape,
-  /// Warp-level tile size (concept: GemmShape)
-  typename InstructionShape,
-  /// Epilogue output operator
-  typename EpilogueOutputOp,
-  /// Epilogue reduction operator
-  typename EpilogueReductionOp,
-  /// Threadblock-level swizzling operator
-  typename ThreadblockSwizzle,
-  /// Number of stages used in the pipelined mainloop
-  int Stages,
-  /// Operation performed by GEMM
-  typename Operator,
-  ///
-  typename Enable
->
-struct DefaultGemmWithReduction<
-  ElementA_, LayoutA_, TransformA, kAlignmentA, 
-  ElementB_, LayoutB_, TransformB, kAlignmentB,
-  ElementC_, LayoutC_,
-  ElementAccumulator,
-  OperatorClass,
-  cutlass::arch::Sm70,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  EpilogueReductionOp,
-  ThreadblockSwizzle,
-  Stages,
-  Operator,
-  Enable
-  >  {
-
-  using GemmBase = typename DefaultGemmUniversal<
-    ElementA_, LayoutA_, TransformA, kAlignmentA,
-    ElementB_, LayoutB_, TransformB, kAlignmentB,
-    ElementC_, LayoutC_, ElementAccumulator,
-    OperatorClass,
-    cutlass::arch::Sm70,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    Operator
-  >::GemmKernel;
-
-  // Define epilogue
-  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWithReductionVoltaTensorOp<
-    typename GemmBase::Epilogue::Shape,
-    typename GemmBase::Epilogue::WarpMmaOperator,
-    GemmBase::Epilogue::kPartitionsK,
-    ElementC_,
-    EpilogueOutputOp,
-    EpilogueReductionOp,
-    GemmBase::Epilogue::kElementsPerAccess
-  >::Epilogue;
-
-  // Compose the GEMM kernel
-  using GemmKernel = GemmWithFusedEpilogue<
-    typename GemmBase::Mma,
-    Epilogue,
-    ThreadblockSwizzle
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace kernel
-}  // namespace gemm
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemv.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemv.h
deleted file mode 100755
index db6306401..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemv.h
+++ /dev/null
@@ -1,132 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#pragma once
-
-#include "cutlass/gemm/threadblock/gemv.h"
-#include "cutlass/gemm/threadblock/default_gemv_core.h"
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Size of the ThreadBlock tile - concept: gemm::GemmShape<>
-    typename ThreadBlockShape_,
-    /// Size of the per-thread shape - concept: gemm::GemmShape<>
-    typename ThreadShape_,
-    /// Data type of A elements
-    typename ElementA_,
-    /// Layout of A matrix (concept: MatrixLayout)
-    typename LayoutA_,
-    /// Data type of B elements
-    typename ElementB_,
-    /// Layout of B matrix (concept: MatrixLayout)
-    typename LayoutB_,
-    /// Element type of C/D matrix
-    typename ElementCD_,
-    /// Layout of C/D matrix (concept: MatrixLayout)
-    typename LayoutCD_,
-    ///  Data type of the accumulator
-    typename ElementAccumulator_ = ElementCD_>
-struct DefaultGemv {
-
-  /// Shape of Threadblock-level matrix operation (concept: GemmShape)
-  using ThreadBlockShape = ThreadBlockShape_;
-
-  /// Shape of warp-level matrix operation (concept: GemmShape)
-  using ThreadShape = ThreadShape_;
-
-  /// Data type of multiplicand A
-  using ElementA = ElementA_;
-
-  /// Layout of multiplicand A
-  using LayoutA = LayoutA_;
-
-  /// Data type of multiplicand B
-  using ElementB = ElementB_;
-
-  /// Layout of multiplicand B
-  using LayoutB = LayoutB_;
-
-  /// Data type of accumulators
-  using ElementAccumulator = ElementAccumulator_;
-
-  /// Data type of accumulators (same as C/D)
-  using LayoutAccumulator = LayoutCD_;
-
-  /// Data type of input/output matrix C/D
-  using ElementCD = ElementCD_;
-
-  /// Layout of input/output matrix C/D
-  using LayoutCD = LayoutCD_;
-
-  // Define the core components
-  using Core = typename cutlass::gemm::threadblock::DefaultGemvCore<
-      ThreadBlockShape, ThreadShape, ElementA, LayoutA, ElementB, LayoutB,
-      ElementAccumulator, LayoutAccumulator>;
-
-  // Define the threadblock-scoped gemv
-  using ThreadBlockGemv = cutlass::gemm::threadblock::Gemv<Core>;
-
-  // Iterator for multiplicand A
-  using IteratorA = typename ThreadBlockGemv::IteratorA;
-
-  // Iterator for multiplicand B
-  using IteratorB = typename ThreadBlockGemv::IteratorB;
-
-  /// Policy for the iterator that reads/writes C/D
-  using IteratorPolicyCD = typename platform::conditional<
-        platform::is_same<LayoutCD, layout::RowMajor>::value,
-        cutlass::transform::PitchLinearTilePolicyStripminedThreadContiguous<
-          layout::PitchLinearShape<ThreadBlockShape::kN, ThreadBlockShape::kM>, Core::kThreadsPerN, ThreadShape::kN>,
-        cutlass::transform::PitchLinearTilePolicyStripminedThreadStrided<
-          layout::PitchLinearShape<ThreadBlockShape::kM, ThreadBlockShape::kN>, Core::kThreadsPerN, ThreadShape::kM>>::type;
-
-  /// Iterator that reads/writes C/D
-  using IteratorCD = cutlass::transform::threadblock::PredicatedTileIterator<
-   cutlass::MatrixShape<ThreadBlockShape::kM, ThreadBlockShape::kN>, ElementCD, LayoutCD, 0, IteratorPolicyCD>;
-
-  /// Fragment storage for C/D
-  using FragmentCD = typename IteratorCD::Fragment;
-
-  // Define the threadblock swizzle
-  using ThreadBlockSwizzle = cutlass::gemm::threadblock::GemvBatchedStridedThreadblockDefaultSwizzle;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace kernel
-}  // namespace gemm
-}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_2k.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_2k.h
deleted file mode 100755
index 63400ef40..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_2k.h
+++ /dev/null
@@ -1,285 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-      Default kernel-level Rank2K definitions combine threadblock-scoped matrix multiply-add with
-      the appropriate threadblock-scoped epilogue.
-
-  
-*/
-
-#pragma once
-
-#include "cutlass/blas3.h"
-
-#include "cutlass/layout/matrix.h"
-#include "cutlass/arch/wmma.h"
-
-#include "cutlass/epilogue/threadblock/epilogue.h"
-#include "cutlass/epilogue/thread/linear_combination.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/kernel/rank_2k_universal.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
-#include "cutlass/gemm/threadblock/default_mma.h"
-#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-
-#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op_blas3.h"
-#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
-#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
-#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
-
-#if defined(CUTLASS_ARCH_WMMA_ENABLED)
-#include "cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h"
-#endif //CUTLASS_ARCH_WMMA_ENABLED
-
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Fill Mode for C (kLower or kUpper)
-    FillMode FillModeC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Blas3 computation mode
-    BlasMode BlasMode_ = BlasMode::kSymmetric>
-struct DefaultRank2K;
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Hopper Architecture
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Fill Mode for C (kLower or kUpper)
-    FillMode FillModeC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by GEMM
-    typename Operator>
-struct DefaultRank2K<
-                    ElementA, LayoutA, kAlignmentA, 
-                    ElementB, LayoutB, kAlignmentB, 
-                    ElementC,layout::RowMajor, FillModeC, 
-                    ElementAccumulator, arch::OpClassTensorOp, arch::Sm90, 
-                    ThreadblockShape, WarpShape, InstructionShape,
-                    EpilogueOutputOp, ThreadblockSwizzle, Stages, SplitKSerial,
-                    Operator> {
-  /// Define the threadblock-scoped matrix multiply-accumulate (A x BT)
-  using Mma1 = typename cutlass::gemm::threadblock::DefaultMma<
-      ElementA, LayoutA, 
-      kAlignmentA, 
-      ElementB, typename layout::LayoutTranspose<LayoutB>::type, 
-      kAlignmentB,
-      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm90,
-      ThreadblockShape, WarpShape, InstructionShape, Stages,
-      Operator>::ThreadblockMma;
-  
-  /// Define the threadblock-scoped matrix multiply-accumulate (B x AT)
-  using Mma2 = typename cutlass::gemm::threadblock::DefaultMma<
-      ElementB, LayoutB, 
-      kAlignmentB, 
-      ElementA, typename layout::LayoutTranspose<LayoutA>::type, 
-      kAlignmentA,
-      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm90,
-      ThreadblockShape, WarpShape, InstructionShape, Stages,
-      Operator>::ThreadblockMma;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  /// Define the epilogue
-  using Epilogue =
-      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOpBlas3<
-          ThreadblockShape, typename Mma1::Operator, kPartitionsK, EpilogueOutputOp,
-          EpilogueOutputOp::kCount, BlasMode::kSymmetric>::Epilogue;
-
-  /// Define the kernel-level Rank2K operator.
-  using Rank2Kkernel = kernel::Rank2KUniversal<Mma1, Mma2, Epilogue, ThreadblockSwizzle, FillModeC, BlasMode::kSymmetric>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Ampere Architecture
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Fill Mode for C (kLower or kUpper)
-    FillMode FillModeC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by GEMM
-    typename Operator>
-struct DefaultRank2K<
-                    ElementA, LayoutA, kAlignmentA, 
-                    ElementB, LayoutB, kAlignmentB, 
-                    ElementC,layout::RowMajor, FillModeC, 
-                    ElementAccumulator, arch::OpClassTensorOp, arch::Sm80, 
-                    ThreadblockShape, WarpShape, InstructionShape,
-                    EpilogueOutputOp, ThreadblockSwizzle, Stages, SplitKSerial,
-                    Operator> {
-  /// Define the threadblock-scoped matrix multiply-accumulate (A x BT)
-  using Mma1 = typename cutlass::gemm::threadblock::DefaultMma<
-      ElementA, LayoutA, 
-      kAlignmentA, 
-      ElementB, typename layout::LayoutTranspose<LayoutB>::type, 
-      kAlignmentB,
-      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80,
-      ThreadblockShape, WarpShape, InstructionShape, Stages,
-      Operator>::ThreadblockMma;
-  
-  /// Define the threadblock-scoped matrix multiply-accumulate (B x AT)
-  using Mma2 = typename cutlass::gemm::threadblock::DefaultMma<
-      ElementB, LayoutB, 
-      kAlignmentB, 
-      ElementA, typename layout::LayoutTranspose<LayoutA>::type, 
-      kAlignmentA,
-      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80,
-      ThreadblockShape, WarpShape, InstructionShape, Stages,
-      Operator>::ThreadblockMma;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  /// Define the epilogue
-  using Epilogue =
-      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOpBlas3<
-          ThreadblockShape, typename Mma1::Operator, kPartitionsK, EpilogueOutputOp,
-          EpilogueOutputOp::kCount, BlasMode::kSymmetric>::Epilogue;
-
-  /// Define the kernel-level Rank2K operator.
-  using Rank2Kkernel = kernel::Rank2KUniversal<Mma1, Mma2, Epilogue, ThreadblockSwizzle, FillModeC, BlasMode::kSymmetric>;
-};
-////////////////////////////////////////////////////////////////////////////////
-
-
-}  // namespace kernel
-}  // namespace gemm
-}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_2k_complex.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_2k_complex.h
deleted file mode 100755
index 1a685286c..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_2k_complex.h
+++ /dev/null
@@ -1,498 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-      Default kernel-level Rank2K definitions combine threadblock-scoped matrix multiply-add with
-      the appropriate threadblock-scoped epilogue.
-
-  
-*/
-
-#pragma once
-
-#include "cutlass/blas3.h"
-
-#include "cutlass/layout/matrix.h"
-#include "cutlass/arch/wmma.h"
-
-#include "cutlass/epilogue/threadblock/epilogue.h"
-#include "cutlass/epilogue/thread/linear_combination.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/kernel/rank_2k_universal.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
-#include "cutlass/gemm/threadblock/default_mma.h"
-#include "cutlass/gemm/threadblock/default_multistage_mma_complex.h"
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-
-#include "cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op_blas3.h"
-#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
-
-#if defined(CUTLASS_ARCH_WMMA_ENABLED)
-#include "cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h"
-#endif //CUTLASS_ARCH_WMMA_ENABLED
-
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Fill Mode for C (kLower or kUpper)
-    FillMode FillModeC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA,
-    /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Blas3 computation mode
-    BlasMode BlasMode_ = BlasMode::kSymmetric>
-struct DefaultRank2KComplex;
-
-
-////////////////////////////////////////////////////////////////////////////////
-namespace detail {
-
-template <
-  /// Layout type for A matrix operand
-  typename LayoutA_,
-  /// Layout type for B matrix operand
-  typename LayoutB_,
-  /// Complex elementwise transformation 
-  ComplexTransform TransformA,
-  /// Complex elementwise transformation 
-  ComplexTransform TransformB,
-  /// Blas3 computation mode (symmetric/hermitian)
-  BlasMode BlasMode_
-  > struct Rank2KTransposedComplexTransform {
-  
-  static ComplexTransform const kTransformA = TransformA;
-  static ComplexTransform const kTransformB = TransformB;
-
-};
-  
-  // partial specializations for HER2K CUBLAS_OP_N layout (ColumMajor)
-template <>
-  struct Rank2KTransposedComplexTransform <
-  layout::ColumnMajor, layout::ColumnMajor, 
-  ComplexTransform::kNone, ComplexTransform::kNone,
-  BlasMode::kHermitian> {
-
-  static ComplexTransform const kTransformA = ComplexTransform::kConjugate;
-  static ComplexTransform const kTransformB = ComplexTransform::kNone;
-
-};
-
-  // partial specializations for HER2K CUBLAS_OP_C layout (RowMajor + Complex conjugate) 
-template <>
-  struct Rank2KTransposedComplexTransform <
-  layout::RowMajor, layout::RowMajor, 
-  ComplexTransform::kConjugate, ComplexTransform::kConjugate,
-  BlasMode::kHermitian> {
-
-  static ComplexTransform const kTransformA = ComplexTransform::kNone;
-  static ComplexTransform const kTransformB = ComplexTransform::kConjugate;
-
-};
-
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Hopper Architecture complex datatype (symmetric)
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Fill Mode for C (kLower or kUpper)
-    FillMode FillModeC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA,
-    /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial>
-struct DefaultRank2KComplex<
-  ElementA, LayoutA, ElementB, LayoutB, ElementC, 
-  layout::RowMajor, FillModeC, ElementAccumulator, arch::OpClassTensorOp,
-  arch::Sm90, ThreadblockShape, WarpShape, InstructionShape, 
-  EpilogueOutputOp, ThreadblockSwizzle, Stages, 
-  TransformA, TransformB, Operator, SplitKSerial, BlasMode::kSymmetric> {
-
-  static BlasMode const kBlasMode = BlasMode::kSymmetric;
-  
-  /// Define the threadblock-scoped matrix multiply-accumulate (A x B^T)
-  using Mma1 = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex<
-      ElementA, LayoutA, 
-      ElementB, typename layout::LayoutTranspose<LayoutB>::type, 
-      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm90, 
-      ThreadblockShape, WarpShape, InstructionShape, Stages, 
-      TransformA, TransformB, Operator>::ThreadblockMma;
-
-  /// Define the threadblock-scoped matrix multiply-accumulate (B x A^T)
-  using Mma2 = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex<
-      ElementB, LayoutB, 
-      ElementA, typename layout::LayoutTranspose<LayoutA>::type, 
-      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm90, 
-      ThreadblockShape, WarpShape, InstructionShape, Stages, 
-      TransformA, TransformB, Operator>::ThreadblockMma;
-
-  /// Define the epilogue
-  using Epilogue =
-      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOpBlas3<
-          ThreadblockShape, typename Mma1::Operator, 1, EpilogueOutputOp,
-          EpilogueOutputOp::kCount, Operator, kBlasMode>::Epilogue;
-
-  /// Define the kernel-level Rank2K operator.
-  using Rank2Kkernel = kernel::Rank2KUniversal<Mma1, Mma2, Epilogue, ThreadblockSwizzle, FillModeC, kBlasMode>;
-
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Hopper Architecture complex datatype (hermitian)
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Fill Mode for C (kLower or kUpper)
-    FillMode FillModeC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA,
-    /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial>
-struct DefaultRank2KComplex<
-  ElementA, LayoutA, ElementB, LayoutB, ElementC, 
-  layout::RowMajor, FillModeC, ElementAccumulator, arch::OpClassTensorOp,
-  arch::Sm90, ThreadblockShape, WarpShape, InstructionShape, 
-  EpilogueOutputOp, ThreadblockSwizzle, Stages, 
-  TransformA, TransformB, Operator, SplitKSerial, BlasMode::kHermitian> {
-
-  static BlasMode const kBlasMode = BlasMode::kHermitian;
-
-  // Complex transform for input A and B matrices (function on input layout)
-  static ComplexTransform const kTransformA = TransformA;
-  static ComplexTransform const kTransformB = TransformB;
-
-  using TransposedComplexTransform = detail::Rank2KTransposedComplexTransform<
-                                        LayoutA, LayoutB, 
-                                        TransformA, TransformB,
-                                        kBlasMode>;
-
-  // Complex transform on operandA and operandB (function of blas3 computation)
-  static ComplexTransform const kTransformOperandA = TransposedComplexTransform::kTransformA;
-  static ComplexTransform const kTransformOperandB = TransposedComplexTransform::kTransformB;
-
-  /// Define the threadblock-scoped matrix multiply-accumulate (A x B^H)
-  using Mma1 = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex<
-      ElementA, LayoutA, 
-      ElementB, typename layout::LayoutTranspose<LayoutB>::type, 
-      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm90, 
-      ThreadblockShape, WarpShape, InstructionShape, Stages, 
-      kTransformOperandA, kTransformOperandB, Operator>::ThreadblockMma;
-
-  /// Define the threadblock-scoped matrix multiply-accumulate (B x A^H)
-  using Mma2 = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex<
-      ElementB, LayoutB, 
-      ElementA, typename layout::LayoutTranspose<LayoutA>::type, 
-      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm90, 
-      ThreadblockShape, WarpShape, InstructionShape, Stages, 
-      kTransformOperandA, kTransformOperandB, Operator>::ThreadblockMma;
-
-  /// Define the epilogue
-  using Epilogue =
-      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOpBlas3<
-          ThreadblockShape, typename Mma1::Operator, 1, EpilogueOutputOp,
-          EpilogueOutputOp::kCount, Operator, kBlasMode>::Epilogue;
-
-  /// Define the kernel-level Rank2K operator.
-  using Rank2Kkernel = kernel::Rank2KUniversal<Mma1, Mma2, Epilogue, ThreadblockSwizzle, FillModeC, kBlasMode>;
-
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Ampere Architecture complex datatype (symmetric)
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Fill Mode for C (kLower or kUpper)
-    FillMode FillModeC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA,
-    /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial>
-struct DefaultRank2KComplex<
-  ElementA, LayoutA, ElementB, LayoutB, ElementC, 
-  layout::RowMajor, FillModeC, ElementAccumulator, arch::OpClassTensorOp,
-  arch::Sm80, ThreadblockShape, WarpShape, InstructionShape, 
-  EpilogueOutputOp, ThreadblockSwizzle, Stages, 
-  TransformA, TransformB, Operator, SplitKSerial, BlasMode::kSymmetric> {
-
-  static BlasMode const kBlasMode = BlasMode::kSymmetric;
-  
-  /// Define the threadblock-scoped matrix multiply-accumulate (A x B^T)
-  using Mma1 = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex<
-      ElementA, LayoutA, 
-      ElementB, typename layout::LayoutTranspose<LayoutB>::type, 
-      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80, 
-      ThreadblockShape, WarpShape, InstructionShape, Stages, 
-      TransformA, TransformB, Operator>::ThreadblockMma;
-
-  /// Define the threadblock-scoped matrix multiply-accumulate (B x A^T)
-  using Mma2 = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex<
-      ElementB, LayoutB, 
-      ElementA, typename layout::LayoutTranspose<LayoutA>::type, 
-      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80, 
-      ThreadblockShape, WarpShape, InstructionShape, Stages, 
-      TransformA, TransformB, Operator>::ThreadblockMma;
-
-  /// Define the epilogue
-  using Epilogue =
-      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOpBlas3<
-          ThreadblockShape, typename Mma1::Operator, 1, EpilogueOutputOp,
-          EpilogueOutputOp::kCount, Operator, kBlasMode>::Epilogue;
-
-  /// Define the kernel-level Rank2K operator.
-  using Rank2Kkernel = kernel::Rank2KUniversal<Mma1, Mma2, Epilogue, ThreadblockSwizzle, FillModeC, kBlasMode>;
-
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Ampere Architecture complex datatype (hermitian)
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Fill Mode for C (kLower or kUpper)
-    FillMode FillModeC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA,
-    /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial>
-struct DefaultRank2KComplex<
-  ElementA, LayoutA, ElementB, LayoutB, ElementC, 
-  layout::RowMajor, FillModeC, ElementAccumulator, arch::OpClassTensorOp,
-  arch::Sm80, ThreadblockShape, WarpShape, InstructionShape, 
-  EpilogueOutputOp, ThreadblockSwizzle, Stages, 
-  TransformA, TransformB, Operator, SplitKSerial, BlasMode::kHermitian> {
-
-  static BlasMode const kBlasMode = BlasMode::kHermitian;
-
-  // Complex transform for input A and B matrices (function on input layout)
-  static ComplexTransform const kTransformA = TransformA;
-  static ComplexTransform const kTransformB = TransformB;
-
-  using TransposedComplexTransform = detail::Rank2KTransposedComplexTransform<
-                                        LayoutA, LayoutB, 
-                                        TransformA, TransformB,
-                                        kBlasMode>;
-
-  // Complex transform on operandA and operandB (function of blas3 computation)
-  static ComplexTransform const kTransformOperandA = TransposedComplexTransform::kTransformA;
-  static ComplexTransform const kTransformOperandB = TransposedComplexTransform::kTransformB;
-
-  /// Define the threadblock-scoped matrix multiply-accumulate (A x B^H)
-  using Mma1 = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex<
-      ElementA, LayoutA, 
-      ElementB, typename layout::LayoutTranspose<LayoutB>::type, 
-      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80, 
-      ThreadblockShape, WarpShape, InstructionShape, Stages, 
-      kTransformOperandA, kTransformOperandB, Operator>::ThreadblockMma;
-
-  /// Define the threadblock-scoped matrix multiply-accumulate (B x A^H)
-  using Mma2 = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex<
-      ElementB, LayoutB, 
-      ElementA, typename layout::LayoutTranspose<LayoutA>::type, 
-      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80, 
-      ThreadblockShape, WarpShape, InstructionShape, Stages, 
-      kTransformOperandA, kTransformOperandB, Operator>::ThreadblockMma;
-
-  /// Define the epilogue
-  using Epilogue =
-      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOpBlas3<
-          ThreadblockShape, typename Mma1::Operator, 1, EpilogueOutputOp,
-          EpilogueOutputOp::kCount, Operator, kBlasMode>::Epilogue;
-
-  /// Define the kernel-level Rank2K operator.
-  using Rank2Kkernel = kernel::Rank2KUniversal<Mma1, Mma2, Epilogue, ThreadblockSwizzle, FillModeC, kBlasMode>;
-
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-
-}  // namespace kernel
-}  // namespace gemm
-}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_2k_grouped.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_2k_grouped.h
deleted file mode 100755
index 7c79dd61a..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_2k_grouped.h
+++ /dev/null
@@ -1,355 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief
-      Default kernel-level grouped Rank2K.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/complex.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/numeric_types.h"
-
-#include "cutlass/gemm/kernel/rank_2k_transpose_operands.h"
-#include "cutlass/gemm/kernel/default_rank_2k.h"
-#include "cutlass/gemm/kernel/default_rank_2k_complex.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Layout type for C and D matrix operands
-    typename LayoutC,
-    /// Fill Mode for C (kLower or kUpper)
-    FillMode FillModeC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Blas3 computation mode
-    BlasMode BlasMode_ = BlasMode::kSymmetric,
-    /// Whether the schedule of problems to visit has been precomputed
-    GroupScheduleMode GroupScheduleMode_ = GroupScheduleMode::kDeviceOnly,
-    ///
-    typename Enable = void
-    >
-struct DefaultRank2KGrouped;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Real-valued grouped Rank2K
-//
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Layout type for C and D matrix operands
-    typename LayoutC,
-    /// Fill Mode for C (kLower or kUpper)
-    FillMode FillModeC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Blas3 computation mode
-    BlasMode BlasMode_,
-    /// Whether the schedule of problems to visit has been precomputed
-    GroupScheduleMode GroupScheduleMode_
-    >
-struct DefaultRank2KGrouped<ElementA, LayoutA, TransformA, kAlignmentA,
-          ElementB, LayoutB, TransformB, kAlignmentB,
-          ElementC, LayoutC,
-          FillModeC, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape,
-          WarpShape, InstructionShape, EpilogueOutputOp,
-          ThreadblockSwizzle, Stages, Operator, BlasMode_, GroupScheduleMode_,
-          typename platform::enable_if< ! cutlass::is_complex<ElementAccumulator>::value>::type
-> {
-  // If true, we must construct a 'transposed-and-exchanged' Rank2K operator.
-  static bool const kInternalTranspose = platform::is_same<LayoutC, layout::ColumnMajor>::value;
-
-  using MapArguments = kernel::detail::Rank2KMapArguments<
-    ElementA,
-    LayoutA,
-    TransformA,
-    kAlignmentA,
-    ElementB,
-    LayoutB,
-    TransformB,
-    kAlignmentB,
-    LayoutC,
-    FillModeC,
-    kInternalTranspose
-  >;
-
-  // Define the default grouped Rank2K kernel
-  using DefaultRank2Kkernel = typename kernel::DefaultRank2K<
-    typename MapArguments::ElementA,
-    typename MapArguments::LayoutA,
-    MapArguments::kAlignmentA,
-    typename MapArguments::ElementB,
-    typename MapArguments::LayoutB,
-    MapArguments::kAlignmentB,
-    ElementC,
-    typename MapArguments::LayoutC,
-    MapArguments::kFillModeC,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    false,                  // SplitKSerial
-    Operator,
-    BlasMode_
-  >::Rank2Kkernel;
-
-  /// Define the kernel in terms of the default kernel
-  using Rank2Kkernel = kernel::Rank2KGrouped<
-    typename DefaultRank2Kkernel::Mma1,
-    typename DefaultRank2Kkernel::Mma2,
-    typename DefaultRank2Kkernel::Epilogue,
-    ThreadblockSwizzle,
-    TransformA,
-    TransformB,
-    DefaultRank2Kkernel::kFillModeC,
-    DefaultRank2Kkernel::kBlasMode,
-    GroupScheduleMode_,
-    kInternalTranspose
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Complex-valued grouped Rank2K
-//
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Layout type for C and D matrix operands
-    typename LayoutC,
-    /// Fill Mode for C (kLower or kUpper)
-    FillMode FillModeC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Blas3 computation mode
-    BlasMode BlasMode_,
-    /// Whether the schedule of problems to visit has been precomputed
-    GroupScheduleMode GroupScheduleMode_
-    >
-struct DefaultRank2KGrouped<ElementA, LayoutA, TransformA, kAlignmentA,
-          ElementB, LayoutB, TransformB, kAlignmentB,
-          ElementC, LayoutC,
-          FillModeC, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape,
-          WarpShape, InstructionShape, EpilogueOutputOp,
-          ThreadblockSwizzle, Stages, Operator, BlasMode_, GroupScheduleMode_,
-          typename platform::enable_if<cutlass::is_complex<ElementAccumulator>::value>::type
-> {
-  // If true, we must construct a 'transposed-and-exchanged' Rank2K operator.
-  static bool const kInternalTranspose = platform::is_same<LayoutC, layout::ColumnMajor>::value;
-
-  using MapArguments = kernel::detail::Rank2KMapArguments<
-    ElementA,
-    LayoutA,
-    TransformA,
-    kAlignmentA,
-    ElementB,
-    LayoutB,
-    TransformB,
-    kAlignmentB,
-    LayoutC,
-    FillModeC,
-    kInternalTranspose
-  >;
-
-  // Define the default grouped Rank2K kernel
-  using DefaultRank2Kkernel = typename kernel::DefaultRank2KComplex<
-    typename MapArguments::ElementA,
-    typename MapArguments::LayoutA,
-    typename MapArguments::ElementB,
-    typename MapArguments::LayoutB,
-    ElementC,
-    typename MapArguments::LayoutC,
-    MapArguments::kFillModeC,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    MapArguments::kTransformA,
-    MapArguments::kTransformB,
-    Operator,
-    false,                  // SplitKSerial
-    BlasMode_
-  >::Rank2Kkernel;
-
-  /// Define the kernel in terms of the default kernel
-  /// Pass through the user-provided TransformA and TransformB so as to
-  /// correctly set public-facing TransformA and TransformB in kernel::Rank2KGrouped.
-  /// This is needed because kernel::DefaultRank2KComplex may change TransformA and
-  /// TransformB that become template arguments to Mma1 and Mma2.
-  using Rank2Kkernel = kernel::Rank2KGrouped<
-    typename DefaultRank2Kkernel::Mma1,
-    typename DefaultRank2Kkernel::Mma2,
-    typename DefaultRank2Kkernel::Epilogue,
-    ThreadblockSwizzle,
-    TransformA,
-    TransformB,
-    DefaultRank2Kkernel::kFillModeC,
-    DefaultRank2Kkernel::kBlasMode,
-    GroupScheduleMode_,
-    kInternalTranspose
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace kernel
-}  // namespace gemm
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_2k_universal.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_2k_universal.h
deleted file mode 100755
index 41e9cc45c..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_2k_universal.h
+++ /dev/null
@@ -1,346 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-      Default kernel-level Rank 2k  definitions combine threadblock-scoped matrix multiply-add with
-      the appropriate threadblock-scoped epilogue.
-  
-      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
-      accommodated by exchanging A and B operands and assuming transposed layouts.
-
-  
-*/
-
-#pragma once
-
-#include "cutlass/blas3.h"
-
-#include "cutlass/complex.h"
-#include "cutlass/layout/matrix.h"
-
-#include "cutlass/gemm/kernel/rank_2k_universal.h"
-#include "cutlass/gemm/kernel/default_rank_2k.h"
-#include "cutlass/gemm/kernel/default_rank_2k_complex.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Fill Mode for C (kLower or kUpper)
-    FillMode FillModeC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by SYRK
-    typename Operator,
-    /// Blas3 computation mode (symmetric/hermitian)
-    BlasMode BlasMode_ = BlasMode::kSymmetric,
-    ///
-    typename Enable = void
-    >
-struct DefaultRank2KUniversal;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Real-valued Rank 2k update kernels
-//
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Layout type for C and D matrix operands
-    typename LayoutC,
-    /// Fill Mode for C (kLower or kUpper)
-    FillMode FillModeC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by Rank2k
-    typename Operator>
-struct DefaultRank2KUniversal<
-  ElementA,
-  LayoutA,
-  ComplexTransform::kNone,   // transform A
-  kAlignmentA,
-  ElementB,
-  LayoutB,
-  ComplexTransform::kNone,   // transform B
-  kAlignmentB,
-  ElementC,
-  LayoutC,
-  FillModeC,
-  ElementAccumulator,
-  OperatorClass,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  SplitKSerial,
-  Operator,
-  BlasMode::kSymmetric,
-  typename platform::enable_if< ! cutlass::is_complex<ElementAccumulator>::value>::type
-> {
-
-  using DefaultRank2Kkernel = typename kernel::DefaultRank2K<
-    ElementA,
-    LayoutA,
-    kAlignmentA,
-    ElementB,
-    LayoutB,
-    kAlignmentB,
-    ElementC,
-    LayoutC,
-    FillModeC,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    SplitKSerial,
-    Operator,
-    BlasMode::kSymmetric
-  >::Rank2Kkernel;
-
-    /// Define the kernel in terms of the default kernel
-  using Rank2Kkernel = kernel::Rank2KUniversal<
-    typename DefaultRank2Kkernel::Mma1,
-    typename DefaultRank2Kkernel::Mma2,
-    typename DefaultRank2Kkernel::Epilogue, 
-    ThreadblockSwizzle,
-    FillModeC,
-    BlasMode::kSymmetric
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-//
-// Complex-valued Rank 2K update kernels
-//
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Layout type for C and D matrix operands
-    typename LayoutC,
-    /// Fill Mode for C (kLower or kUpper)
-    FillMode FillModeC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by SYRK
-    typename Operator,
-    // BlasMode
-    BlasMode kBlasMode
-  >
-
-struct DefaultRank2KUniversal<
-  ElementA,
-  LayoutA,
-  TransformA,   
-  kAlignmentA,
-  ElementB,
-  LayoutB,
-  TransformB,  
-  kAlignmentB,
-  ElementC,
-  LayoutC,
-  FillModeC,
-  ElementAccumulator,
-  OperatorClass,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  SplitKSerial,
-  Operator,
-  kBlasMode,
-  typename platform::enable_if<cutlass::is_complex<ElementAccumulator>::value>::type
-> {
-
-  using DefaultRank2Kkernel = typename kernel::DefaultRank2KComplex<
-    ElementA,
-    LayoutA,
-    ElementB,
-    LayoutB,
-    ElementC,
-    LayoutC,
-    FillModeC,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    TransformA,
-    TransformB,
-    Operator,
-    SplitKSerial,
-    kBlasMode
-  >::Rank2Kkernel;
-
-    /// Define the kernel in terms of the default kernel
-  using Rank2Kkernel = kernel::Rank2KUniversal<
-    typename DefaultRank2Kkernel::Mma1,
-    typename DefaultRank2Kkernel::Mma2,
-    typename DefaultRank2Kkernel::Epilogue, 
-    ThreadblockSwizzle,
-    FillModeC,
-    kBlasMode
-  >;
-};
-
-}  // namespace kernel
-}  // namespace gemm
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_k.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_k.h
deleted file mode 100755
index 780b205a4..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_k.h
+++ /dev/null
@@ -1,247 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-      Default kernel-level RankK definitions combine threadblock-scoped matrix multiply-add with
-      the appropriate threadblock-scoped epilogue.
-
-  
-*/
-
-#pragma once
-
-#include "cutlass/blas3.h"
-
-#include "cutlass/layout/matrix.h"
-#include "cutlass/arch/wmma.h"
-
-#include "cutlass/epilogue/threadblock/epilogue.h"
-#include "cutlass/epilogue/thread/linear_combination.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/kernel/rank_k_universal.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
-#include "cutlass/gemm/threadblock/default_mma.h"
-#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-
-#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op_blas3.h"
-#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
-#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
-#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
-
-#if defined(CUTLASS_ARCH_WMMA_ENABLED)
-#include "cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h"
-#endif //CUTLASS_ARCH_WMMA_ENABLED
-
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Fill Mode for C (kLower or kUpper)
-    FillMode FillModeC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Blas3 computation mode
-    BlasMode BlasMode_ = BlasMode::kSymmetric>
-struct DefaultRankK;
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Hopper Architecture
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Fill Mode for C (kLower or kUpper)
-    FillMode FillModeC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by GEMM
-    typename Operator>
-struct DefaultRankK<
-                    ElementA, LayoutA, kAlignmentA, 
-                    ElementC,layout::RowMajor, FillModeC, 
-                    ElementAccumulator, arch::OpClassTensorOp, arch::Sm90, 
-                    ThreadblockShape, WarpShape, InstructionShape,
-                    EpilogueOutputOp, ThreadblockSwizzle, Stages, SplitKSerial,
-                    Operator> {
-  /// Define the threadblock-scoped matrix multiply-accumulate (A x AT)
-  using Mma = typename cutlass::gemm::threadblock::DefaultMma<
-      ElementA, LayoutA, 
-      kAlignmentA, 
-      ElementA, typename layout::LayoutTranspose<LayoutA>::type, 
-      kAlignmentA,
-      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm90,
-      ThreadblockShape, WarpShape, InstructionShape, Stages,
-      Operator>::ThreadblockMma;
-  
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  /// Define the epilogue
-  using Epilogue =
-      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOpBlas3<
-          ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
-          EpilogueOutputOp::kCount, BlasMode::kSymmetric>::Epilogue;
-
-  /// Define the kernel-level Rank2 operator.
-  using RankKkernel = kernel::RankKUniversal<Mma, Epilogue, ThreadblockSwizzle, FillModeC>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Ampere Architecture
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Fill Mode for C (kLower or kUpper)
-    FillMode FillModeC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by GEMM
-    typename Operator>
-struct DefaultRankK<
-                    ElementA, LayoutA, kAlignmentA, 
-                    ElementC,layout::RowMajor, FillModeC, 
-                    ElementAccumulator, arch::OpClassTensorOp, arch::Sm80, 
-                    ThreadblockShape, WarpShape, InstructionShape,
-                    EpilogueOutputOp, ThreadblockSwizzle, Stages, SplitKSerial,
-                    Operator> {
-  /// Define the threadblock-scoped matrix multiply-accumulate (A x AT)
-  using Mma = typename cutlass::gemm::threadblock::DefaultMma<
-      ElementA, LayoutA, 
-      kAlignmentA, 
-      ElementA, typename layout::LayoutTranspose<LayoutA>::type, 
-      kAlignmentA,
-      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80,
-      ThreadblockShape, WarpShape, InstructionShape, Stages,
-      Operator>::ThreadblockMma;
-  
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  /// Define the epilogue
-  using Epilogue =
-      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOpBlas3<
-          ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
-          EpilogueOutputOp::kCount, BlasMode::kSymmetric>::Epilogue;
-
-  /// Define the kernel-level Rank2 operator.
-  using RankKkernel = kernel::RankKUniversal<Mma, Epilogue, ThreadblockSwizzle, FillModeC>;
-};
-////////////////////////////////////////////////////////////////////////////////
-
-
-}  // namespace kernel
-}  // namespace gemm
-}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_k_complex.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_k_complex.h
deleted file mode 100755
index 56d2fcc99..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_k_complex.h
+++ /dev/null
@@ -1,429 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-      Default kernel-level RankK definitions combine threadblock-scoped matrix multiply-add with
-      the appropriate threadblock-scoped epilogue.
-
-  
-*/
-
-#pragma once
-
-#include "cutlass/blas3.h"
-
-#include "cutlass/layout/matrix.h"
-#include "cutlass/arch/wmma.h"
-
-#include "cutlass/epilogue/threadblock/epilogue.h"
-#include "cutlass/epilogue/thread/linear_combination.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/kernel/rank_k_universal.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
-#include "cutlass/gemm/threadblock/default_mma.h"
-#include "cutlass/gemm/threadblock/default_multistage_mma_complex.h"
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-
-#include "cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op_blas3.h"
-#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
-
-#if defined(CUTLASS_ARCH_WMMA_ENABLED)
-#include "cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h"
-#endif //CUTLASS_ARCH_WMMA_ENABLED
-
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Fill Mode for C (kLower or kUpper)
-    FillMode FillModeC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Blas3 computation mode
-    BlasMode BlasMode_ = BlasMode::kSymmetric>
-struct DefaultRankKComplex;
-
-
-////////////////////////////////////////////////////////////////////////////////
-namespace detail {
-
-template <
-  /// Layout type for A matrix operand
-  typename LayoutA_,
-  /// Complex elementwise transformation 
-  ComplexTransform TransformA,
-  /// Blas3 computation mode (symmetric/hermitian)
-  BlasMode BlasMode_
-  > struct RankKTransposedComplexTransform {
-  
-  static ComplexTransform const kTransformA = TransformA;
-  static ComplexTransform const kTransformB = TransformA;
-
-};
-  
-  // partial specializations for HERK CUBLAS_OP_N layout (ColumMajor)
-template <>
-  struct RankKTransposedComplexTransform <
-  layout::ColumnMajor, 
-  ComplexTransform::kNone,
-  BlasMode::kHermitian> {
-
-  static ComplexTransform const kTransformA = ComplexTransform::kConjugate;
-  static ComplexTransform const kTransformB = ComplexTransform::kNone;
-
-};
-
-  // partial specializations for HERK CUBLAS_OP_C layout (RowMajor + Complex conjugate) 
-template <>
-  struct RankKTransposedComplexTransform <
-  layout::RowMajor, 
-  ComplexTransform::kConjugate,
-  BlasMode::kHermitian> {
-
-  static ComplexTransform const kTransformA = ComplexTransform::kNone;
-  static ComplexTransform const kTransformB = ComplexTransform::kConjugate;
-
-};
-
-}
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Hopper Architecture complex datatype (symmetric)
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Fill Mode for C (kLower or kUpper)
-    FillMode FillModeC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial>
-struct DefaultRankKComplex<
-  ElementA, LayoutA, ElementC, 
-  layout::RowMajor, FillModeC, ElementAccumulator, arch::OpClassTensorOp,
-  arch::Sm90, ThreadblockShape, WarpShape, InstructionShape, 
-  EpilogueOutputOp, ThreadblockSwizzle, Stages, 
-  TransformA, Operator, SplitKSerial, BlasMode::kSymmetric> {
-
-  static BlasMode const kBlasMode = BlasMode::kSymmetric;
-  
-  /// Define the threadblock-scoped matrix multiply-accumulate (A x B^T)
-  using Mma = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex<
-      ElementA, LayoutA, 
-      ElementA, typename layout::LayoutTranspose<LayoutA>::type, 
-      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm90, 
-      ThreadblockShape, WarpShape, InstructionShape, Stages, 
-      TransformA, TransformA, Operator>::ThreadblockMma;
-
-  /// Define the epilogue
-  using Epilogue =
-      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOpBlas3<
-          ThreadblockShape, typename Mma::Operator, 1, EpilogueOutputOp,
-          EpilogueOutputOp::kCount, Operator, kBlasMode>::Epilogue;
-
-  /// Define the kernel-level RankK operator.
-  using RankKkernel = kernel::RankKUniversal<Mma, Epilogue, ThreadblockSwizzle, FillModeC>;
-
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Hopper Architecture complex datatype (hermitian)
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Fill Mode for C (kLower or kUpper)
-    FillMode FillModeC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial>
-struct DefaultRankKComplex<
-  ElementA, LayoutA, ElementC, 
-  layout::RowMajor, FillModeC, ElementAccumulator, arch::OpClassTensorOp,
-  arch::Sm90, ThreadblockShape, WarpShape, InstructionShape, 
-  EpilogueOutputOp, ThreadblockSwizzle, Stages, 
-  TransformA, Operator, SplitKSerial, BlasMode::kHermitian> {
-
-  static BlasMode const kBlasMode = BlasMode::kHermitian;
-
-  // Complex transform for input A and B matrices (function on input layout)
-  static ComplexTransform const kTransformA = TransformA;
-
-  using TransposedComplexTransform = detail::RankKTransposedComplexTransform<
-                                        LayoutA, 
-                                        TransformA,
-                                        kBlasMode>;
-
-  // Complex transform on operandA and operandB (function of blas3 computation)
-  static ComplexTransform const kTransformOperandA = TransposedComplexTransform::kTransformA;
-  static ComplexTransform const kTransformOperandB = TransposedComplexTransform::kTransformB;
-
-  /// Define the threadblock-scoped matrix multiply-accumulate (A x A^H)
-  using Mma = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex<
-      ElementA, LayoutA, 
-      ElementA, typename layout::LayoutTranspose<LayoutA>::type, 
-      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm90, 
-      ThreadblockShape, WarpShape, InstructionShape, Stages, 
-      kTransformOperandA, kTransformOperandB, Operator>::ThreadblockMma;
-
-  /// Define the epilogue
-  using Epilogue =
-      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOpBlas3<
-          ThreadblockShape, typename Mma::Operator, 1, EpilogueOutputOp,
-          EpilogueOutputOp::kCount, Operator, kBlasMode>::Epilogue;
-
-  /// Define the kernel-level RankK operator.
-  using RankKkernel = kernel::RankKUniversal<Mma, Epilogue, ThreadblockSwizzle, FillModeC>;
-
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Ampere Architecture complex datatype (symmetric)
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Fill Mode for C (kLower or kUpper)
-    FillMode FillModeC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial>
-struct DefaultRankKComplex<
-  ElementA, LayoutA, ElementC, 
-  layout::RowMajor, FillModeC, ElementAccumulator, arch::OpClassTensorOp,
-  arch::Sm80, ThreadblockShape, WarpShape, InstructionShape, 
-  EpilogueOutputOp, ThreadblockSwizzle, Stages, 
-  TransformA, Operator, SplitKSerial, BlasMode::kSymmetric> {
-
-  static BlasMode const kBlasMode = BlasMode::kSymmetric;
-  
-  /// Define the threadblock-scoped matrix multiply-accumulate (A x B^T)
-  using Mma = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex<
-      ElementA, LayoutA, 
-      ElementA, typename layout::LayoutTranspose<LayoutA>::type, 
-      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80, 
-      ThreadblockShape, WarpShape, InstructionShape, Stages, 
-      TransformA, TransformA, Operator>::ThreadblockMma;
-
-  /// Define the epilogue
-  using Epilogue =
-      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOpBlas3<
-          ThreadblockShape, typename Mma::Operator, 1, EpilogueOutputOp,
-          EpilogueOutputOp::kCount, Operator, kBlasMode>::Epilogue;
-
-  /// Define the kernel-level RankK operator.
-  using RankKkernel = kernel::RankKUniversal<Mma, Epilogue, ThreadblockSwizzle, FillModeC>;
-
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Ampere Architecture complex datatype (hermitian)
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Fill Mode for C (kLower or kUpper)
-    FillMode FillModeC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial>
-struct DefaultRankKComplex<
-  ElementA, LayoutA, ElementC, 
-  layout::RowMajor, FillModeC, ElementAccumulator, arch::OpClassTensorOp,
-  arch::Sm80, ThreadblockShape, WarpShape, InstructionShape, 
-  EpilogueOutputOp, ThreadblockSwizzle, Stages, 
-  TransformA, Operator, SplitKSerial, BlasMode::kHermitian> {
-
-  static BlasMode const kBlasMode = BlasMode::kHermitian;
-
-  // Complex transform for input A and B matrices (function on input layout)
-  static ComplexTransform const kTransformA = TransformA;
-
-  using TransposedComplexTransform = detail::RankKTransposedComplexTransform<
-                                        LayoutA, 
-                                        TransformA,
-                                        kBlasMode>;
-
-  // Complex transform on operandA and operandB (function of blas3 computation)
-  static ComplexTransform const kTransformOperandA = TransposedComplexTransform::kTransformA;
-  static ComplexTransform const kTransformOperandB = TransposedComplexTransform::kTransformB;
-
-  /// Define the threadblock-scoped matrix multiply-accumulate (A x A^H)
-  using Mma = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex<
-      ElementA, LayoutA, 
-      ElementA, typename layout::LayoutTranspose<LayoutA>::type, 
-      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80, 
-      ThreadblockShape, WarpShape, InstructionShape, Stages, 
-      kTransformOperandA, kTransformOperandB, Operator>::ThreadblockMma;
-
-  /// Define the epilogue
-  using Epilogue =
-      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOpBlas3<
-          ThreadblockShape, typename Mma::Operator, 1, EpilogueOutputOp,
-          EpilogueOutputOp::kCount, Operator, kBlasMode>::Epilogue;
-
-  /// Define the kernel-level RankK operator.
-  using RankKkernel = kernel::RankKUniversal<Mma, Epilogue, ThreadblockSwizzle, FillModeC>;
-
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-
-}  // namespace kernel
-}  // namespace gemm
-}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_k_universal.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_k_universal.h
deleted file mode 100755
index 309ea4642..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_k_universal.h
+++ /dev/null
@@ -1,305 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-      Default kernel-level Rank k  definitions combine threadblock-scoped matrix multiply-add with
-      the appropriate threadblock-scoped epilogue.
-  
-      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
-      accommodated by exchanging A and B operands and assuming transposed layouts.
-
-  
-*/
-
-#pragma once
-
-#include "cutlass/blas3.h"
-
-#include "cutlass/complex.h"
-#include "cutlass/layout/matrix.h"
-
-#include "cutlass/gemm/kernel/rank_k_universal.h"
-#include "cutlass/gemm/kernel/default_rank_k.h"
-#include "cutlass/gemm/kernel/default_rank_k_complex.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Fill Mode for C (kLower or kUpper)
-    FillMode FillModeC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by SYRK
-    typename Operator,
-    /// Blas3 computation mode (symmetric/hermitian)
-    BlasMode BlasMode_ = BlasMode::kSymmetric,
-    ///
-    typename Enable = void
-    >
-struct DefaultRankKUniversal;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Real-valued Rank k update kernels
-//
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Layout type for C and D matrix operands
-    typename LayoutC,
-    /// Fill Mode for C (kLower or kUpper)
-    FillMode FillModeC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by Rank2k
-    typename Operator>
-struct DefaultRankKUniversal<
-  ElementA,
-  LayoutA,
-  ComplexTransform::kNone,   // transform A
-  kAlignmentA,
-  ElementC,
-  LayoutC,
-  FillModeC,
-  ElementAccumulator,
-  OperatorClass,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  SplitKSerial,
-  Operator,
-  BlasMode::kSymmetric,
-  typename platform::enable_if< ! cutlass::is_complex<ElementAccumulator>::value>::type
-> {
-
-  using DefaultRankKkernel = typename kernel::DefaultRankK<
-    ElementA,
-    LayoutA,
-    kAlignmentA,
-    ElementC,
-    LayoutC,
-    FillModeC,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    SplitKSerial,
-    Operator,
-    BlasMode::kSymmetric
-  >::RankKkernel;
-
-    /// Define the kernel in terms of the default kernel
-  using RankKkernel = kernel::RankKUniversal<
-    typename DefaultRankKkernel::Mma,
-    typename DefaultRankKkernel::Epilogue, 
-    ThreadblockSwizzle,
-    FillModeC
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-//
-// Complex-valued Rank 2K update kernels
-//
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Layout type for C and D matrix operands
-    typename LayoutC,
-    /// Fill Mode for C (kLower or kUpper)
-    FillMode FillModeC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by SYRK
-    typename Operator,
-    // BlasMode
-    BlasMode kBlasMode
-  >
-
-struct DefaultRankKUniversal<
-  ElementA,
-  LayoutA,
-  TransformA,   
-  kAlignmentA,
-  ElementC,
-  LayoutC,
-  FillModeC,
-  ElementAccumulator,
-  OperatorClass,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  SplitKSerial,
-  Operator,
-  kBlasMode,
-  typename platform::enable_if<cutlass::is_complex<ElementAccumulator>::value>::type
-> {
-
-  using DefaultRankKkernel = typename kernel::DefaultRankKComplex<
-    ElementA,
-    LayoutA,
-    ElementC,
-    LayoutC,
-    FillModeC,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    TransformA,
-    Operator,
-    SplitKSerial,
-    kBlasMode
-  >::RankKkernel;
-
-    /// Define the kernel in terms of the default kernel
-  using RankKkernel = kernel::RankKUniversal<
-    typename DefaultRankKkernel::Mma,
-    typename DefaultRankKkernel::Epilogue, 
-    ThreadblockSwizzle,
-    FillModeC
-  >;
-};
-
-}  // namespace kernel
-}  // namespace gemm
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_symm.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_symm.h
deleted file mode 100755
index 8f0ff4255..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_symm.h
+++ /dev/null
@@ -1,321 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-      Default kernel-level SYMM/HEMM definitions combine threadblock-scoped matrix multiply-add with
-      the appropriate threadblock-scoped epilogue.
-
-  
-*/
-
-#pragma once
-
-#include "cutlass/blas3.h"
-
-#include "cutlass/layout/matrix.h"
-#include "cutlass/arch/wmma.h"
-
-#include "cutlass/epilogue/threadblock/epilogue.h"
-#include "cutlass/epilogue/thread/linear_combination.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/kernel/symm_universal.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
-#include "cutlass/gemm/threadblock/default_trmm.h"
-#include "cutlass/gemm/threadblock/default_mma.h"
-#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-
-#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
-#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
-#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
-#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
-
-#if defined(CUTLASS_ARCH_WMMA_ENABLED)
-#include "cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h"
-#endif //CUTLASS_ARCH_WMMA_ENABLED
-
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Side Mode for A (kLeft or kRight)
-    SideMode kSideModeA,
-    /// Fill Mode for A (kLower or kUpper)
-    FillMode kFillModeA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Blas3 computation mode
-    BlasMode BlasMode_ = BlasMode::kSymmetric>
-struct DefaultSymm;
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Hopper Architecture
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Side Mode for A (kLeft or kRight)
-    SideMode kSideModeA,
-    /// Fill Mode for A (kLower or kUpper)
-    FillMode kFillModeA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by GEMM
-    typename Operator>
-struct DefaultSymm<
-                    ElementA, LayoutA, kSideModeA, kFillModeA, kAlignmentA, 
-                    ElementB, LayoutB, kAlignmentB, 
-                    ElementC,layout::RowMajor, 
-                    ElementAccumulator, arch::OpClassTensorOp, arch::Sm90, 
-                    ThreadblockShape, WarpShape, InstructionShape,
-                    EpilogueOutputOp, ThreadblockSwizzle, Stages, SplitKSerial,
-                    Operator> {
-
-  /// Define the threadblock-scoped triagular matrix multiply-accumulate
-  /// TRMM - with diagonal: alpha * A * B or alpha * B * A
-	static const DiagType kDiagTypeMma1 = DiagType::kNonUnit;
-  using Mma1 = typename cutlass::gemm::threadblock::DefaultTrmm<
-      ElementA, LayoutA, kAlignmentA, 
-      ElementB, LayoutB, kAlignmentB,
-      kSideModeA, kFillModeA, kDiagTypeMma1, 
-      ElementAccumulator, layout::RowMajor, 
-      arch::OpClassTensorOp, arch::Sm90,
-      ThreadblockShape, WarpShape, InstructionShape,
-      Stages, Operator>::ThreadblockMma;
-
-  /// Define the threadblock-scoped triagular matrix multiply-accumulate 
-  /// TRMM - withOUT diagonal: alpha * AT * B or alpha * B * AT
-	static const DiagType kDiagTypeMma2 = DiagType::kZero;
-  using LayoutAMma2 = typename platform::conditional<
-                                (kSideModeA == SideMode::kLeft), 
-                                typename layout::LayoutTranspose<LayoutA>::type, 
-                                LayoutA
-                              >::type;
-  using LayoutBMma2 = typename platform::conditional<
-                                (kSideModeA == SideMode::kLeft), 
-                                LayoutB, 
-                                typename layout::LayoutTranspose<LayoutB>::type
-                              >::type; 
-	using Mma2 = typename cutlass::gemm::threadblock::DefaultTrmm<
-			ElementA, LayoutAMma2, kAlignmentA, 
-			ElementB, LayoutBMma2, kAlignmentB,
-			kSideModeA, InvertFillMode<kFillModeA>::mode, kDiagTypeMma2, 
-			ElementAccumulator, layout::RowMajor, 
-			arch::OpClassTensorOp, arch::Sm90,
-			ThreadblockShape, WarpShape, InstructionShape,
-			Stages, Operator>::ThreadblockMma;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  /// Define the epilogue
-  using Epilogue =
-      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
-          ThreadblockShape, typename Mma1::Operator, kPartitionsK, EpilogueOutputOp,
-          EpilogueOutputOp::kCount>::Epilogue;
-
-  /// Define the kernel-level SYMM/HEMM operator.
-  using SymmKernel = kernel::SymmUniversal<Mma1, Mma2, Epilogue, ThreadblockSwizzle, kSideModeA, kFillModeA>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Ampere Architecture
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Side Mode for A (kLeft or kRight)
-    SideMode kSideModeA,
-    /// Fill Mode for A (kLower or kUpper)
-    FillMode kFillModeA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by GEMM
-    typename Operator>
-struct DefaultSymm<
-                    ElementA, LayoutA, kSideModeA, kFillModeA, kAlignmentA, 
-                    ElementB, LayoutB, kAlignmentB, 
-                    ElementC,layout::RowMajor, 
-                    ElementAccumulator, arch::OpClassTensorOp, arch::Sm80, 
-                    ThreadblockShape, WarpShape, InstructionShape,
-                    EpilogueOutputOp, ThreadblockSwizzle, Stages, SplitKSerial,
-                    Operator> {
-
-  /// Define the threadblock-scoped triagular matrix multiply-accumulate
-  /// TRMM - with diagonal: alpha * A * B or alpha * B * A
-	static const DiagType kDiagTypeMma1 = DiagType::kNonUnit;
-  using Mma1 = typename cutlass::gemm::threadblock::DefaultTrmm<
-      ElementA, LayoutA, kAlignmentA, 
-      ElementB, LayoutB, kAlignmentB,
-      kSideModeA, kFillModeA, kDiagTypeMma1, 
-      ElementAccumulator, layout::RowMajor, 
-      arch::OpClassTensorOp, arch::Sm80,
-      ThreadblockShape, WarpShape, InstructionShape,
-      Stages, Operator>::ThreadblockMma;
-
-  /// Define the threadblock-scoped triagular matrix multiply-accumulate 
-  /// TRMM - withOUT diagonal: alpha * AT * B or alpha * B * AT
-	static const DiagType kDiagTypeMma2 = DiagType::kZero;
-  using LayoutAMma2 = typename platform::conditional<
-                                (kSideModeA == SideMode::kLeft), 
-                                typename layout::LayoutTranspose<LayoutA>::type, 
-                                LayoutA
-                              >::type;
-  using LayoutBMma2 = typename platform::conditional<
-                                (kSideModeA == SideMode::kLeft), 
-                                LayoutB, 
-                                typename layout::LayoutTranspose<LayoutB>::type
-                              >::type; 
-	using Mma2 = typename cutlass::gemm::threadblock::DefaultTrmm<
-			ElementA, LayoutAMma2, kAlignmentA, 
-			ElementB, LayoutBMma2, kAlignmentB,
-			kSideModeA, InvertFillMode<kFillModeA>::mode, kDiagTypeMma2, 
-			ElementAccumulator, layout::RowMajor, 
-			arch::OpClassTensorOp, arch::Sm80,
-			ThreadblockShape, WarpShape, InstructionShape,
-			Stages, Operator>::ThreadblockMma;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  /// Define the epilogue
-  using Epilogue =
-      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
-          ThreadblockShape, typename Mma1::Operator, kPartitionsK, EpilogueOutputOp,
-          EpilogueOutputOp::kCount>::Epilogue;
-
-  /// Define the kernel-level SYMM/HEMM operator.
-  using SymmKernel = kernel::SymmUniversal<Mma1, Mma2, Epilogue, ThreadblockSwizzle, kSideModeA, kFillModeA>;
-};
-////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace kernel
-}  // namespace gemm
-}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_symm_complex.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_symm_complex.h
deleted file mode 100755
index c2f803100..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_symm_complex.h
+++ /dev/null
@@ -1,508 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-      Default kernel-level SYMM/HEMM definitions combine threadblock-scoped matrix multiply-add with
-      the appropriate threadblock-scoped epilogue.
-
-  
-*/
-
-#pragma once
-
-#include "cutlass/blas3.h"
-
-#include "cutlass/layout/matrix.h"
-#include "cutlass/arch/wmma.h"
-
-#include "cutlass/epilogue/threadblock/epilogue.h"
-#include "cutlass/epilogue/thread/linear_combination.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/kernel/symm_universal.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
-#include "cutlass/gemm/threadblock/default_mma.h"
-#include "cutlass/gemm/threadblock/default_multistage_trmm_complex.h"
-#include "cutlass/gemm/threadblock/default_multistage_mma_complex.h"
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-
-#include "cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h"
-#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
-
-#if defined(CUTLASS_ARCH_WMMA_ENABLED)
-#include "cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h"
-#endif //CUTLASS_ARCH_WMMA_ENABLED
-
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Side Mode for A (kLeft or kRight)
-    SideMode kSideModeA,
-    /// Fill Mode for A (kLower or kUpper)
-    FillMode kFillModeA,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Blas3 computation mode
-    BlasMode BlasMode_ = BlasMode::kSymmetric>
-struct DefaultSymmComplex;
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Hopper Architecture complex datatype (symmetric)
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Side Mode for A (kLeft or kRight)
-    SideMode kSideModeA,
-    /// Fill Mode for A (kLower or kUpper)
-    FillMode kFillModeA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial>
-struct DefaultSymmComplex<
-  ElementA, LayoutA, kSideModeA, kFillModeA, ElementB, LayoutB, ElementC, 
-  layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
-  arch::Sm90, ThreadblockShape, WarpShape, InstructionShape, 
-  EpilogueOutputOp, ThreadblockSwizzle, Stages, 
-  Operator, SplitKSerial, BlasMode::kSymmetric> {
-
-  static BlasMode const kBlasMode = BlasMode::kSymmetric;
-  // Complex Transform don't appply to A or B for SYMM
-  static ComplexTransform const TransformA = ComplexTransform::kNone; 
-  static ComplexTransform const TransformB = ComplexTransform::kNone; 
-
-  /// Define the threadblock-scoped triagular matrix multiply-accumulate
-  /// TRMM - with diagonal: alpha * A * B or alpha * B * A
-	static const DiagType kDiagTypeMma1 = DiagType::kNonUnit;
-  using Mma1 = typename cutlass::gemm::threadblock::DefaultMultistageTrmmComplex<
-      ElementA, LayoutA, 
-      ElementB, LayoutB, 
-      kSideModeA, kFillModeA, kDiagTypeMma1, 
-      ElementAccumulator, layout::RowMajor, 
-      arch::OpClassTensorOp, arch::Sm90,
-      ThreadblockShape, WarpShape, InstructionShape,
-      Stages, TransformA, TransformB, Operator>::ThreadblockMma;
-
-  /// Define the threadblock-scoped triagular matrix multiply-accumulate
-  /// TRMM - withOUT diagonal: alpha * AT * B or alpha * B * AT
-	static const DiagType kDiagTypeMma2 = DiagType::kZero;
-  using LayoutAMma2 = typename platform::conditional<
-                                (kSideModeA == SideMode::kLeft), 
-                                typename layout::LayoutTranspose<LayoutA>::type, 
-                                LayoutA
-                              >::type;
-  using LayoutBMma2 = typename platform::conditional<
-                                (kSideModeA == SideMode::kLeft), 
-                                LayoutB, 
-                                typename layout::LayoutTranspose<LayoutB>::type
-                              >::type; 
-	using Mma2 = typename cutlass::gemm::threadblock::DefaultMultistageTrmmComplex<
-			ElementA, LayoutAMma2, 
-			ElementB, LayoutBMma2, 
-			kSideModeA, InvertFillMode<kFillModeA>::mode, kDiagTypeMma2, 
-			ElementAccumulator, layout::RowMajor, 
-			arch::OpClassTensorOp, arch::Sm90,
-			ThreadblockShape, WarpShape, InstructionShape,
-			Stages, TransformA, TransformB, Operator>::ThreadblockMma;
-
-  /// Define the epilogue
-  using Epilogue =
-      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOp<
-          ThreadblockShape, typename Mma1::Operator, 1, EpilogueOutputOp,
-          EpilogueOutputOp::kCount, Operator>::Epilogue;
-
-  /// Define the kernel-level Symm operator.
-  using SymmKernel = kernel::SymmUniversal<Mma1, Mma2, Epilogue, ThreadblockSwizzle, kSideModeA, kFillModeA>;
-
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Hopper Architecture complex datatype (hermitian)
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Side Mode for A (kLeft or kRight)
-    SideMode kSideModeA,
-    /// Fill Mode for A (kLower or kUpper)
-    FillMode kFillModeA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial>
-struct DefaultSymmComplex<
-  ElementA, LayoutA, kSideModeA, kFillModeA, ElementB, LayoutB, ElementC, 
-  layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
-  arch::Sm90, ThreadblockShape, WarpShape, InstructionShape, 
-  EpilogueOutputOp, ThreadblockSwizzle, Stages, 
-  Operator, SplitKSerial, BlasMode::kHermitian> {
-
-  static BlasMode const kBlasMode = BlasMode::kHermitian;
-
-
-  /// Define the threadblock-scoped triagular matrix multiply-accumulate
-  /// TRMM - with diagonal: alpha * A * B or alpha * B * A
-	static const DiagType kDiagTypeMma1 = DiagType::kNonUnit;
-  static ComplexTransform const TransformAMma1 = ComplexTransform::kNone; 
-  static ComplexTransform const TransformBMma1 = ComplexTransform::kNone; 
-  using Mma1 = typename cutlass::gemm::threadblock::DefaultMultistageTrmmComplex<
-      ElementA, LayoutA, 
-      ElementB, LayoutB, 
-      kSideModeA, kFillModeA, kDiagTypeMma1, 
-      ElementAccumulator, layout::RowMajor, 
-      arch::OpClassTensorOp, arch::Sm90,
-      ThreadblockShape, WarpShape, InstructionShape,
-      Stages, TransformAMma1, TransformBMma1, Operator, BlasMode::kHermitian>::ThreadblockMma;
-
-  /// Define the threadblock-scoped triagular matrix multiply-accumulate
-  /// TRMM - withOUT diagonal - with conjugate transpose: alpha * AT * B or alpha * B * AT
-	static const DiagType kDiagTypeMma2 = DiagType::kZero;
-  using LayoutAMma2 = typename platform::conditional<
-                                (kSideModeA == SideMode::kLeft), 
-                                typename layout::LayoutTranspose<LayoutA>::type, 
-                                LayoutA
-                              >::type;
-  using LayoutBMma2 = typename platform::conditional<
-                                (kSideModeA == SideMode::kLeft), 
-                                LayoutB, 
-                                typename layout::LayoutTranspose<LayoutB>::type
-                              >::type;
-  static ComplexTransform const TransformAMma2 = (kSideModeA == SideMode::kLeft) ? 
-                                              ComplexTransform::kConjugate : ComplexTransform::kNone;
-  static ComplexTransform const TransformBMma2 = (kSideModeA == SideMode::kLeft) ? 
-                                              ComplexTransform::kNone : ComplexTransform::kConjugate;
-
-	using Mma2 = typename cutlass::gemm::threadblock::DefaultMultistageTrmmComplex<
-			ElementA, LayoutAMma2, 
-			ElementB, LayoutBMma2, 
-			kSideModeA, InvertFillMode<kFillModeA>::mode, kDiagTypeMma2, 
-			ElementAccumulator, layout::RowMajor, 
-			arch::OpClassTensorOp, arch::Sm90,
-			ThreadblockShape, WarpShape, InstructionShape,
-			Stages, TransformAMma2, TransformBMma2, Operator>::ThreadblockMma;
-
-  /// Define the epilogue
-  using Epilogue =
-      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOp<
-          ThreadblockShape, typename Mma1::Operator, 1, EpilogueOutputOp,
-          EpilogueOutputOp::kCount, Operator>::Epilogue;
-
-  /// Define the kernel-level Symm operator.
-  using SymmKernel = kernel::SymmUniversal<Mma1, Mma2, Epilogue, ThreadblockSwizzle, kSideModeA, kFillModeA>;
-
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Ampere Architecture complex datatype (symmetric)
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Side Mode for A (kLeft or kRight)
-    SideMode kSideModeA,
-    /// Fill Mode for A (kLower or kUpper)
-    FillMode kFillModeA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial>
-struct DefaultSymmComplex<
-  ElementA, LayoutA, kSideModeA, kFillModeA, ElementB, LayoutB, ElementC, 
-  layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
-  arch::Sm80, ThreadblockShape, WarpShape, InstructionShape, 
-  EpilogueOutputOp, ThreadblockSwizzle, Stages, 
-  Operator, SplitKSerial, BlasMode::kSymmetric> {
-
-  static BlasMode const kBlasMode = BlasMode::kSymmetric;
-  // Complex Transform don't appply to A or B for SYMM
-  static ComplexTransform const TransformA = ComplexTransform::kNone; 
-  static ComplexTransform const TransformB = ComplexTransform::kNone; 
-
-  /// Define the threadblock-scoped triagular matrix multiply-accumulate
-  /// TRMM - with diagonal: alpha * A * B or alpha * B * A
-	static const DiagType kDiagTypeMma1 = DiagType::kNonUnit;
-  using Mma1 = typename cutlass::gemm::threadblock::DefaultMultistageTrmmComplex<
-      ElementA, LayoutA, 
-      ElementB, LayoutB, 
-      kSideModeA, kFillModeA, kDiagTypeMma1, 
-      ElementAccumulator, layout::RowMajor, 
-      arch::OpClassTensorOp, arch::Sm80,
-      ThreadblockShape, WarpShape, InstructionShape,
-      Stages, TransformA, TransformB, Operator>::ThreadblockMma;
-
-  /// Define the threadblock-scoped triagular matrix multiply-accumulate
-  /// TRMM - withOUT diagonal: alpha * AT * B or alpha * B * AT
-	static const DiagType kDiagTypeMma2 = DiagType::kZero;
-  using LayoutAMma2 = typename platform::conditional<
-                                (kSideModeA == SideMode::kLeft), 
-                                typename layout::LayoutTranspose<LayoutA>::type, 
-                                LayoutA
-                              >::type;
-  using LayoutBMma2 = typename platform::conditional<
-                                (kSideModeA == SideMode::kLeft), 
-                                LayoutB, 
-                                typename layout::LayoutTranspose<LayoutB>::type
-                              >::type; 
-	using Mma2 = typename cutlass::gemm::threadblock::DefaultMultistageTrmmComplex<
-			ElementA, LayoutAMma2, 
-			ElementB, LayoutBMma2, 
-			kSideModeA, InvertFillMode<kFillModeA>::mode, kDiagTypeMma2, 
-			ElementAccumulator, layout::RowMajor, 
-			arch::OpClassTensorOp, arch::Sm80,
-			ThreadblockShape, WarpShape, InstructionShape,
-			Stages, TransformA, TransformB, Operator>::ThreadblockMma;
-
-  /// Define the epilogue
-  using Epilogue =
-      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOp<
-          ThreadblockShape, typename Mma1::Operator, 1, EpilogueOutputOp,
-          EpilogueOutputOp::kCount, Operator>::Epilogue;
-
-  /// Define the kernel-level Symm operator.
-  using SymmKernel = kernel::SymmUniversal<Mma1, Mma2, Epilogue, ThreadblockSwizzle, kSideModeA, kFillModeA>;
-
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Ampere Architecture complex datatype (hermitian)
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Side Mode for A (kLeft or kRight)
-    SideMode kSideModeA,
-    /// Fill Mode for A (kLower or kUpper)
-    FillMode kFillModeA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial>
-struct DefaultSymmComplex<
-  ElementA, LayoutA, kSideModeA, kFillModeA, ElementB, LayoutB, ElementC, 
-  layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
-  arch::Sm80, ThreadblockShape, WarpShape, InstructionShape, 
-  EpilogueOutputOp, ThreadblockSwizzle, Stages, 
-  Operator, SplitKSerial, BlasMode::kHermitian> {
-
-  static BlasMode const kBlasMode = BlasMode::kHermitian;
-
-
-  /// Define the threadblock-scoped triagular matrix multiply-accumulate
-  /// TRMM - with diagonal: alpha * A * B or alpha * B * A
-	static const DiagType kDiagTypeMma1 = DiagType::kNonUnit;
-  static ComplexTransform const TransformAMma1 = ComplexTransform::kNone; 
-  static ComplexTransform const TransformBMma1 = ComplexTransform::kNone; 
-  using Mma1 = typename cutlass::gemm::threadblock::DefaultMultistageTrmmComplex<
-      ElementA, LayoutA, 
-      ElementB, LayoutB, 
-      kSideModeA, kFillModeA, kDiagTypeMma1, 
-      ElementAccumulator, layout::RowMajor, 
-      arch::OpClassTensorOp, arch::Sm80,
-      ThreadblockShape, WarpShape, InstructionShape,
-      Stages, TransformAMma1, TransformBMma1, Operator, BlasMode::kHermitian>::ThreadblockMma;
-
-  /// Define the threadblock-scoped triagular matrix multiply-accumulate
-  /// TRMM - withOUT diagonal - with conjugate transpose: alpha * AT * B or alpha * B * AT
-	static const DiagType kDiagTypeMma2 = DiagType::kZero;
-  using LayoutAMma2 = typename platform::conditional<
-                                (kSideModeA == SideMode::kLeft), 
-                                typename layout::LayoutTranspose<LayoutA>::type, 
-                                LayoutA
-                              >::type;
-  using LayoutBMma2 = typename platform::conditional<
-                                (kSideModeA == SideMode::kLeft), 
-                                LayoutB, 
-                                typename layout::LayoutTranspose<LayoutB>::type
-                              >::type;
-  static ComplexTransform const TransformAMma2 = (kSideModeA == SideMode::kLeft) ? 
-                                              ComplexTransform::kConjugate : ComplexTransform::kNone;
-  static ComplexTransform const TransformBMma2 = (kSideModeA == SideMode::kLeft) ? 
-                                              ComplexTransform::kNone : ComplexTransform::kConjugate;
-
-	using Mma2 = typename cutlass::gemm::threadblock::DefaultMultistageTrmmComplex<
-			ElementA, LayoutAMma2, 
-			ElementB, LayoutBMma2, 
-			kSideModeA, InvertFillMode<kFillModeA>::mode, kDiagTypeMma2, 
-			ElementAccumulator, layout::RowMajor, 
-			arch::OpClassTensorOp, arch::Sm80,
-			ThreadblockShape, WarpShape, InstructionShape,
-			Stages, TransformAMma2, TransformBMma2, Operator>::ThreadblockMma;
-
-  /// Define the epilogue
-  using Epilogue =
-      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOp<
-          ThreadblockShape, typename Mma1::Operator, 1, EpilogueOutputOp,
-          EpilogueOutputOp::kCount, Operator>::Epilogue;
-
-  /// Define the kernel-level Symm operator.
-  using SymmKernel = kernel::SymmUniversal<Mma1, Mma2, Epilogue, ThreadblockSwizzle, kSideModeA, kFillModeA>;
-
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace kernel
-}  // namespace gemm
-}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_symm_universal.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_symm_universal.h
deleted file mode 100755
index ac0da25d1..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_symm_universal.h
+++ /dev/null
@@ -1,342 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-      Default kernel-level SYMM/HEMM definitions combine threadblock-scoped matrix multiply-add with
-      the appropriate threadblock-scoped epilogue.
-  
-      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
-      accommodated by exchanging A and B operands and assuming transposed layouts.
-
-  
-*/
-
-#pragma once
-
-#include "cutlass/blas3.h"
-
-#include "cutlass/complex.h"
-#include "cutlass/layout/matrix.h"
-
-#include "cutlass/gemm/kernel/symm_universal.h"
-#include "cutlass/gemm/kernel/default_symm.h"
-#include "cutlass/gemm/kernel/default_symm_complex.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Side Mode for A (kLeft or kRight)
-    SideMode SideModeA,
-    /// Fill Mode for A (kLower or kUpper)
-    FillMode FillModeA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by SYRK
-    typename Operator,
-    /// Blas3 computation mode (symmetric/hermitian)
-    BlasMode BlasMode_ = BlasMode::kSymmetric,
-    ///
-    typename Enable = void
-    >
-struct DefaultSymmUniversal;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Real-valued SYMM/HEMM update kernels
-//
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Side Mode for A (kLeft or kRight)
-    SideMode SideModeA,
-    /// Fill Mode for A (kLower or kUpper)
-    FillMode FillModeA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Layout type for C and D matrix operands
-    typename LayoutC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by SYMM/HEMM
-    typename Operator>
-struct DefaultSymmUniversal<
-  ElementA,
-  LayoutA,
-  SideModeA,
-  FillModeA,
-  kAlignmentA,
-  ElementB,
-  LayoutB,
-  kAlignmentB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  OperatorClass,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  SplitKSerial,
-  Operator,
-  BlasMode::kSymmetric,
-  typename platform::enable_if< ! cutlass::is_complex<ElementAccumulator>::value>::type
-> {
-
-  using DefaultSymmkernel = typename kernel::DefaultSymm<
-    ElementA,
-    LayoutA,
-    SideModeA,
-    FillModeA,
-    kAlignmentA,
-    ElementB,
-    LayoutB,
-    kAlignmentB,
-    ElementC,
-    LayoutC,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    SplitKSerial,
-    Operator,
-    BlasMode::kSymmetric
-  >::SymmKernel;
-
-    /// Define the kernel in terms of the default kernel
-  using SymmKernel = kernel::SymmUniversal<
-    typename DefaultSymmkernel::Mma1,
-    typename DefaultSymmkernel::Mma2,
-    typename DefaultSymmkernel::Epilogue, 
-    ThreadblockSwizzle,
-    SideModeA,
-    FillModeA
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-//
-// Complex-valued SYMM/HEMM update kernels
-//
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Side Mode for A (kLeft or kRight)
-    SideMode SideModeA,
-    /// Fill Mode for A (kLower or kUpper)
-    FillMode FillModeA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Layout type for C and D matrix operands
-    typename LayoutC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by SYRK
-    typename Operator,
-    // BlasMode
-    BlasMode kBlasMode
-  >
-
-struct DefaultSymmUniversal<
-  ElementA,
-  LayoutA,
-  SideModeA,
-  FillModeA, 
-  kAlignmentA,
-  ElementB,
-  LayoutB,
-  kAlignmentB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  OperatorClass,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  SplitKSerial,
-  Operator,
-  kBlasMode,
-  typename platform::enable_if<cutlass::is_complex<ElementAccumulator>::value>::type
-> {
-
-  using DefaultSymmkernel = typename kernel::DefaultSymmComplex<
-    ElementA,
-    LayoutA,
-    SideModeA,
-    FillModeA,
-    ElementB,
-    LayoutB,
-    ElementC,
-    LayoutC,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    Operator,
-    SplitKSerial,
-    kBlasMode
-  >::SymmKernel;
-
-    /// Define the kernel in terms of the default kernel
-  using SymmKernel = kernel::SymmUniversal<
-    typename DefaultSymmkernel::Mma1,
-    typename DefaultSymmkernel::Mma2,
-    typename DefaultSymmkernel::Epilogue, 
-    ThreadblockSwizzle,
-    SideModeA,
-    FillModeA
-  >;
-};
-
-}  // namespace kernel
-}  // namespace gemm
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_trmm.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_trmm.h
deleted file mode 100755
index 3380eee37..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_trmm.h
+++ /dev/null
@@ -1,269 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-// 
-/*! \file
-    \brief 
-      Default kernel-level TRMM definitions combine threadblock-scoped matrix multiply-add with
-      the appropriate threadblock-scoped epilogue.
-*/
-
-#pragma once
-
-#include "cutlass/blas3.h"
-
-#include "cutlass/layout/matrix.h"
-#include "cutlass/arch/wmma.h"
-
-#include "cutlass/epilogue/threadblock/epilogue.h"
-#include "cutlass/epilogue/thread/linear_combination.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/kernel/trmm_universal.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
-#include "cutlass/gemm/threadblock/default_mma.h"
-#include "cutlass/gemm/threadblock/default_trmm.h"
-#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-
-#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
-#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
-#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
-#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
-
-#if defined(CUTLASS_ARCH_WMMA_ENABLED)
-#include "cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h"
-#endif //CUTLASS_ARCH_WMMA_ENABLED
-
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Side Mode for the kernel
-    SideMode SideMode_,
-    /// Fill Mode for the triangular matrix
-    FillMode FillMode_,
-    /// Diag Type for the triangular matrix
-    DiagType DiagType_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by GEMM
-    typename Operator>
-struct DefaultTrmm;
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Hopper Architecture
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentB,
-    /// Side Mode for the kernel
-    SideMode kSideMode,
-    /// Fill Mode for the triangular matrix
-    FillMode kFillMode,
-    /// Diag Type for the triangular matrix
-    DiagType kDiagType,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by GEMM
-    typename Operator>
-struct DefaultTrmm<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
-                   kSideMode, kFillMode, kDiagType, ElementC,
-                   layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
-                   arch::Sm90, ThreadblockShape, WarpShape, InstructionShape,
-                   EpilogueOutputOp, ThreadblockSwizzle, Stages, SplitKSerial,
-                   Operator> {
-                    
-  /// Define the threadblock-scoped triagular matrix multiply-accumulate
-  using Mma = typename cutlass::gemm::threadblock::DefaultTrmm<
-      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
-      kSideMode, kFillMode, kDiagType, 
-      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm90,
-      ThreadblockShape, WarpShape, InstructionShape, Stages,
-      Operator>::ThreadblockMma;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  /// Define the epilogue
-  using Epilogue =
-      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
-          ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
-          EpilogueOutputOp::kCount>::Epilogue;
-
-  /// Define the kernel-level TRMM operator.
-  using TrmmKernel = kernel::TrmmUniversal<Mma, Epilogue, ThreadblockSwizzle, kSideMode, kFillMode, kDiagType>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Ampere Architecture
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentB,
-    /// Side Mode for the kernel
-    SideMode kSideMode,
-    /// Fill Mode for the triangular matrix
-    FillMode kFillMode,
-    /// Diag Type for the triangular matrix
-    DiagType kDiagType,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by GEMM
-    typename Operator>
-struct DefaultTrmm<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
-                   kSideMode, kFillMode, kDiagType, ElementC,
-                   layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
-                   arch::Sm80, ThreadblockShape, WarpShape, InstructionShape,
-                   EpilogueOutputOp, ThreadblockSwizzle, Stages, SplitKSerial,
-                   Operator> {
-                    
-  /// Define the threadblock-scoped triagular matrix multiply-accumulate
-  using Mma = typename cutlass::gemm::threadblock::DefaultTrmm<
-      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
-      kSideMode, kFillMode, kDiagType, 
-      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80,
-      ThreadblockShape, WarpShape, InstructionShape, Stages,
-      Operator>::ThreadblockMma;
-
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-
-  /// Define the epilogue
-  using Epilogue =
-      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
-          ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
-          EpilogueOutputOp::kCount>::Epilogue;
-
-  /// Define the kernel-level TRMM operator.
-  using TrmmKernel = kernel::TrmmUniversal<Mma, Epilogue, ThreadblockSwizzle, kSideMode, kFillMode, kDiagType>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace kernel
-}  // namespace gemm
-}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_trmm_complex.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_trmm_complex.h
deleted file mode 100755
index c5cba8fb4..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_trmm_complex.h
+++ /dev/null
@@ -1,265 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-      Default kernel-level TRMM definitions combine threadblock-scoped matrix multiply-add with
-      the appropriate threadblock-scoped epilogue.
-  
-      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
-      accommodated by exchanging A and B operands and assuming transposed layouts.
-
-  
-*/
-
-#pragma once
-
-#include "cutlass/blas3.h"
-
-#include "cutlass/layout/matrix.h"
-
-#include "cutlass/epilogue/threadblock/epilogue.h"
-#include "cutlass/epilogue/thread/linear_combination.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/kernel/trmm_universal.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
-#include "cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h"
-#include "cutlass/gemm/threadblock/default_mma.h"
-#include "cutlass/gemm/threadblock/default_multistage_trmm_complex.h"
-#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-#include "cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h"
-#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
-
-#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-  /// Element type for A matrix operand
-  typename ElementA_,
-  /// Layout type for A matrix operand
-  typename LayoutA_,
-  /// Element type for B matrix operand
-  typename ElementB_,
-  /// Layout type for B matrix operand
-  typename LayoutB_,
-  /// Side Mode for the kernel
-  SideMode SideMode_,
-  /// Fill Mode for the triangular matrix
-  FillMode FillMode_,
-  /// Diag Type for the triangular matrix
-  DiagType DiagType_,
-  /// Element type for C and D matrix operands
-  typename ElementC_,
-  /// Layout type for C and D matrix operands
-  typename LayoutC_,
-  /// Element type for internal accumulation
-  typename ElementAccumulator,
-  /// Operator class tag
-  typename OperatorClass,
-  /// Tag indicating architecture to tune for
-  typename ArchTag,
-  /// Threadblock-level tile size (concept: GemmShape)
-  typename ThreadblockShape,
-  /// Warp-level tile size (concept: GemmShape)
-  typename WarpShape,
-  /// Warp-level tile size (concept: GemmShape)
-  typename InstructionShape,
-  /// Epilogue output operator
-  typename EpilogueOutputOp,
-  /// Threadblock-level swizzling operator
-  typename ThreadblockSwizzle,
-  /// Number of stages used in the pipelined mainloop
-  int Stages,
-  /// Complex elementwise transformation on A operand
-  ComplexTransform TransformA,
-  /// Complex elementwise transformation on B operand
-  ComplexTransform TransformB,
-  /// Multiply-add operator 
-  // (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
-  typename Operator,
-  /// If true, kernel is configured to support serial reduction in the epilogue
-  bool SplitKSerial
->
-struct DefaultTrmmComplex;
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Hopper Architecture
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Side Mode for the kernel
-    SideMode kSideMode,
-    /// Fill Mode for the triangular matrix
-    FillMode kFillMode,
-    /// Diag Type for the triangular matrix
-    DiagType kDiagType,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA,
-    /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB,
-    /// Multiply-add operator 
-    // (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
-    typename Operator,
-    /// If true, kernel is configured to support serial reduction in the epilogue
-    bool SplitKSerial
-  >
-struct DefaultTrmmComplex<
-  ElementA, LayoutA, ElementB, LayoutB, 
-  kSideMode, kFillMode, kDiagType,
-  ElementC, layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
-  arch::Sm90, ThreadblockShape, WarpShape, InstructionShape,
-  EpilogueOutputOp, ThreadblockSwizzle, Stages, TransformA, TransformB, Operator, SplitKSerial> {
-
-  /// Define the threadblock-scoped matrix multiply-accumulate
-  using Mma = typename cutlass::gemm::threadblock::DefaultMultistageTrmmComplex<
-      ElementA, LayoutA, ElementB, LayoutB, 
-      kSideMode, kFillMode, kDiagType,
-      ElementAccumulator,layout::RowMajor, arch::OpClassTensorOp, arch::Sm90, ThreadblockShape,
-      WarpShape, InstructionShape, Stages, TransformA, TransformB, Operator>::ThreadblockMma;
-
-  /// Define the epilogue
-  using Epilogue =
-      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOp<
-          ThreadblockShape, typename Mma::Operator, 1, EpilogueOutputOp,
-          EpilogueOutputOp::kCount, Operator>::Epilogue;
-
-  /// Define the kernel-level TRMM operator.
-  using TrmmKernel = kernel::TrmmUniversal<Mma, Epilogue, ThreadblockSwizzle, kSideMode, kFillMode, kDiagType>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Ampere Architecture
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Side Mode for the kernel
-    SideMode kSideMode,
-    /// Fill Mode for the triangular matrix
-    FillMode kFillMode,
-    /// Diag Type for the triangular matrix
-    DiagType kDiagType,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA,
-    /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB,
-    /// Multiply-add operator 
-    // (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
-    typename Operator,
-    /// If true, kernel is configured to support serial reduction in the epilogue
-    bool SplitKSerial
-  >
-struct DefaultTrmmComplex<
-  ElementA, LayoutA, ElementB, LayoutB, 
-  kSideMode, kFillMode, kDiagType,
-  ElementC, layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
-  arch::Sm80, ThreadblockShape, WarpShape, InstructionShape,
-  EpilogueOutputOp, ThreadblockSwizzle, Stages, TransformA, TransformB, Operator, SplitKSerial> {
-
-  /// Define the threadblock-scoped matrix multiply-accumulate
-  using Mma = typename cutlass::gemm::threadblock::DefaultMultistageTrmmComplex<
-      ElementA, LayoutA, ElementB, LayoutB, 
-      kSideMode, kFillMode, kDiagType,
-      ElementAccumulator,layout::RowMajor, arch::OpClassTensorOp, arch::Sm80, ThreadblockShape,
-      WarpShape, InstructionShape, Stages, TransformA, TransformB, Operator>::ThreadblockMma;
-
-  /// Define the epilogue
-  using Epilogue =
-      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOp<
-          ThreadblockShape, typename Mma::Operator, 1, EpilogueOutputOp,
-          EpilogueOutputOp::kCount, Operator>::Epilogue;
-
-  /// Define the kernel-level TRMM operator.
-  using TrmmKernel = kernel::TrmmUniversal<Mma, Epilogue, ThreadblockSwizzle, kSideMode, kFillMode, kDiagType>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace kernel
-}  // namespace gemm
-}  // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_trmm_universal.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_trmm_universal.h
deleted file mode 100755
index e06e15ca3..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_trmm_universal.h
+++ /dev/null
@@ -1,359 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-      Default kernel-level TRMM definitions combine threadblock-scoped matrix multiply-add with
-      the appropriate threadblock-scoped epilogue.
-  
-      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
-      accommodated by exchanging A and B operands and assuming transposed layouts.
-
-  
-*/
-
-#pragma once
-
-#include "cutlass/blas3.h"
-
-#include "cutlass/complex.h"
-#include "cutlass/layout/matrix.h"
-
-#include "cutlass/gemm/kernel/trmm_universal.h"
-#include "cutlass/gemm/kernel/default_trmm.h"
-#include "cutlass/gemm/kernel/default_trmm_complex.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Side Mode for the kernel
-    SideMode kSideMode,
-    /// Fill Mode for the triangular matrix
-    FillMode kFillMode,
-    /// Diag Type for the triangular matrix
-    DiagType kDiagType,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by TRMM
-    typename Operator,
-    ///
-    typename Enable = void
-    >
-struct DefaultTrmmUniversal;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Real-valued TRMM kernels
-//
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Side Mode for the kernel
-    SideMode kSideMode,
-    /// Fill Mode for the triangular matrix
-    FillMode kFillMode,
-    /// Diag Type for the triangular matrix
-    DiagType kDiagType,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Layout type for C and D matrix operands
-    typename LayoutC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by TRMM
-    typename Operator>
-struct DefaultTrmmUniversal<
-  ElementA,
-  LayoutA,
-  ComplexTransform::kNone,   // transform A
-  kAlignmentA,
-  ElementB,
-  LayoutB,
-  ComplexTransform::kNone,   // transform B
-  kAlignmentB,
-  kSideMode,
-  kFillMode,
-  kDiagType,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  OperatorClass,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  SplitKSerial,
-  Operator,
-  typename platform::enable_if< ! cutlass::is_complex<ElementAccumulator>::value>::type
-> {
-
-  using DefaultTrmmKernel = typename kernel::DefaultTrmm<
-    ElementA,
-    LayoutA,
-    kAlignmentA,
-    ElementB,
-    LayoutB,
-    kAlignmentB,
-    kSideMode,
-    kFillMode,
-    kDiagType,
-    ElementC,
-    LayoutC,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    SplitKSerial,
-    Operator
-  >::TrmmKernel;
-
-    /// Define the kernel in terms of the default kernel
-  using TrmmKernel = kernel::TrmmUniversal<
-    typename DefaultTrmmKernel::Mma,
-    typename DefaultTrmmKernel::Epilogue, 
-    ThreadblockSwizzle,
-    kSideMode,
-    kFillMode,
-    kDiagType
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-//
-// Complex-valued TRMM kernels
-//
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Complex elementwise transformation on A operand
-    ComplexTransform TransformA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Side Mode for the kernel
-    SideMode kSideMode,
-    /// Fill Mode for the triangular matrix
-    FillMode kFillMode,
-    /// Diag Type for the triangular matrix
-    DiagType kDiagType,
-    /// Element type for C and D matrix operands
-    typename ElementC,
-    /// Layout type for C and D matrix operands
-    typename LayoutC,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Epilogue output operator
-    typename EpilogueOutputOp,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// If true, kernel is configured to support serial reduction in the
-    /// epilogue
-    bool SplitKSerial,
-    /// Operation performed by TRMM
-    typename Operator
-  >
-struct DefaultTrmmUniversal<
-  ElementA,
-  LayoutA,
-  TransformA,
-  kAlignmentA,
-  ElementB,
-  LayoutB,
-  TransformB,
-  kAlignmentB,
-  kSideMode,
-  kFillMode,
-  kDiagType,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  OperatorClass,
-  ArchTag,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  SplitKSerial,
-  Operator,
-  typename platform::enable_if<cutlass::is_complex<ElementAccumulator>::value>::type
-> {
-
-  using DefaultTrmmKernel = typename kernel::DefaultTrmmComplex<
-    ElementA,
-    LayoutA,
-    ElementB,
-    LayoutB,
-    kSideMode,
-    kFillMode,
-    kDiagType,
-    ElementC,
-    LayoutC,
-    ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    TransformA,
-    TransformB,
-    Operator,
-    SplitKSerial
-  >::TrmmKernel;
-
-  /// Define the kernel in terms of the default kernel
-  using TrmmKernel = kernel::TrmmUniversal<
-    typename DefaultTrmmKernel::Mma,
-    typename DefaultTrmmKernel::Epilogue, 
-    ThreadblockSwizzle,
-    kSideMode,
-    kFillMode,
-    kDiagType
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace kernel
-}  // namespace gemm
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/ell_gemm.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/ell_gemm.h
deleted file mode 100755
index 7cd619802..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/ell_gemm.h
+++ /dev/null
@@ -1,824 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Template for a Block-Ell sparse gemm kernel.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/semaphore.h"
-#include "cutlass/arch/arch.h"
-
-#include "cutlass/transform/threadblock/ell_iterator.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate
-  typename Epilogue_,             ///! Epilogue
-  typename ThreadblockSwizzle_,   ///! Threadblock swizzling function
-  bool SplitKSerial,              ///! If true, code supporting split-K via serial reduction is enabled.
-  bool IsASparse                  ///! If true, A is sparse matrix
->
-struct EllGemm {
-
-  using Mma = Mma_;
-  using Epilogue = Epilogue_;
-  using OutputOp = typename Epilogue::OutputOp;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  static bool const kSplitKSerial = SplitKSerial;
-
-  /// Warp count (concept: GemmShape)
-  using WarpCount = typename Mma::WarpCount;
-  static int const kThreadCount = 32 * WarpCount::kCount;
-
-  /// Parameters structure
-  struct Params {
-    cutlass::gemm::GemmCoord problem_size{};
-    cutlass::gemm::GemmCoord grid_tiled_shape{};
-    int swizzle_log_tile{0};
-    typename Mma::IteratorA::Params params_A{};
-    typename Mma::IteratorA::TensorRef ref_A{};
-    typename Mma::IteratorB::Params params_B{};
-    typename Mma::IteratorB::TensorRef ref_B{};
-    typename Epilogue::OutputTileIterator::Params params_C{};
-    typename Epilogue::OutputTileIterator::TensorRef ref_C{};
-    typename Epilogue::OutputTileIterator::Params params_D{};
-    typename Epilogue::OutputTileIterator::TensorRef ref_D{};
-    typename OutputOp::Params output_op{};
-    int *semaphore = nullptr;
-    int gemm_k_iterations{0};
-    int gemm_k_size{0};
-    const int* ell_idx = nullptr;
-    int ell_ncol{0};
-    int ell_blocksize{0};
-    int ell_base_idx{0};
-
-    //
-    // Methods
-    //
-   Params() = default;
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      cutlass::gemm::GemmCoord const & problem_size,
-      cutlass::gemm::GemmCoord const & grid_tiled_shape,
-      typename Mma::IteratorA::TensorRef ref_A,
-      typename Mma::IteratorB::TensorRef ref_B,
-      typename Epilogue::OutputTileIterator::TensorRef ref_C,
-      typename Epilogue::OutputTileIterator::TensorRef ref_D,
-      const int* ell_idx,
-      int ell_ncol,
-      int ell_blocksize,
-      int ell_base_idx,
-      typename OutputOp::Params output_op = typename OutputOp::Params(),
-      int *workspace = nullptr
-    ):
-      problem_size(problem_size),
-      grid_tiled_shape(grid_tiled_shape),
-      swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)),
-      params_A(ref_A.layout()),
-      ref_A(ref_A),
-      params_B(ref_B.layout()),
-      ref_B(ref_B),
-      params_C(ref_C.layout()),
-      ref_C(ref_C),
-      params_D(ref_D.layout()),
-      ref_D(ref_D),
-      output_op(output_op),
-      ell_idx(ell_idx),
-      ell_ncol(ell_ncol),
-      ell_blocksize(ell_blocksize),
-      ell_base_idx(ell_base_idx)
-    {
-
-      int total_gemm_k_iterations = (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK;
-      int gemm_k_iterations = (total_gemm_k_iterations + grid_tiled_shape.k() - 1) / grid_tiled_shape.k();
-
-      gemm_k_size = gemm_k_iterations * Mma::Shape::kK;
-
-    semaphore = workspace;
-    }
-  };
-
-  /// Shared memory storage structure
-  struct SharedStorage {
-    union{
-      typename Mma::SharedStorage main_loop;
-      typename Epilogue::SharedStorage epilogue;
-    };
-    typename cutlass::transform::threadblock::ell::SharedStorage ell;
-  };
-
-  //
-  // Methods
-  //
-  EllGemm() = default;
-
-  /// Determines whether kernel satisfies alignment
-    static Status can_implement(
-      cutlass::gemm::GemmCoord const & problem_size,
-      typename Mma::IteratorA::TensorRef ref_A,
-      typename Mma::IteratorB::TensorRef ref_B,
-      typename Epilogue::OutputTileIterator::TensorRef ref_C,
-      typename Epilogue::OutputTileIterator::TensorRef ref_D) {
-
-    static int const kAlignmentA = (platform::is_same<typename Mma::IteratorA::Layout,
-                                                      layout::ColumnMajorInterleaved<32>>::value)
-                                   ? 32
-                                   : (platform::is_same<typename Mma::IteratorA::Layout,
-                                                        layout::ColumnMajorInterleaved<64>>::value)
-                                     ? 64
-                                     : Mma::IteratorA::AccessType::kElements;
-    static int const kAlignmentB =  (platform::is_same<typename Mma::IteratorB::Layout,
-                                                       layout::RowMajorInterleaved<32>>::value)
-                                   ? 32
-                                   : (platform::is_same<typename Mma::IteratorB::Layout,
-                                                        layout::RowMajorInterleaved<64>>::value)
-                                     ? 64
-                                     : Mma::IteratorB::AccessType::kElements;
-    static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
-
-    if (!TensorRef_aligned(ref_A, kAlignmentA)) {
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (!TensorRef_aligned(ref_B, kAlignmentB)) {
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (!TensorRef_aligned(ref_C, kAlignmentC)) {
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (!TensorRef_aligned(ref_D, kAlignmentC)) {
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if ((problem_size.m() % kAlignmentA) || (problem_size.k() % kAlignmentA) ||
-      (problem_size.n() % kAlignmentB) || (problem_size.k() % kAlignmentB) ||
-      (problem_size.m() % kAlignmentC) || (problem_size.n() % kAlignmentC)) {
-
-      return Status::kErrorMisalignedOperand;
-    }
-
-    return Status::kSuccess;
-  }
-
-  /// Executes one GEMM
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-
-    // Compute threadblock location
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord threadblock_tile_offset =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    // Early exit if CTA is out of range
-    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
-      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
-
-      return;
-    }
-
-    int tile_in_ell_block = (params.ell_blocksize + Mma::Shape::kM - 1 ) / Mma::Shape::kM;
-    int ell_block_offset_m = threadblock_tile_offset.m() / tile_in_ell_block;
-    int tile_offset_m = threadblock_tile_offset.m() % tile_in_ell_block;
-
-    // Compute position within threadblock
-    int thread_idx = threadIdx.x;
-
-    // Broadcast the warp_id computed by lane 0 to ensure dependent code
-    // is compiled as warp-uniform.
-    int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
-    int lane_idx = threadIdx.x % 32;
-
-    typename Mma::FragmentC accumulators;
-
-    accumulators.clear();
-
-    // skip computation if matrix is 0
-    if (params.ell_ncol > 0) {
-
-      // Compute initial location in logical coordinates
-      cutlass::MatrixCoord tb_offset_A{
-        ell_block_offset_m * params.ell_blocksize
-        + tile_offset_m * Mma::Shape::kM,
-        threadblock_tile_offset.k() * params.gemm_k_size
-      };
-
-      cutlass::MatrixCoord tb_offset_B{
-        threadblock_tile_offset.k() * params.gemm_k_size,
-        threadblock_tile_offset.n() * Mma::Shape::kN
-      };
-
-      int ell_idx_start =
-        (threadblock_tile_offset.m() / tile_in_ell_block) *
-        (params.ell_ncol / params.ell_blocksize);
-      const int* ell_idx_ptr = &(params.ell_idx[ell_idx_start]);
-
-      // Problem size is a function of threadblock index in the K dimension
-      int problem_size_k = min(
-        params.problem_size.k(),
-        (threadblock_tile_offset.k() + 1) * params.gemm_k_size);
-      problem_size_k = min(problem_size_k, params.ell_ncol);
-
-      // Compute threadblock-scoped matrix multiply-add
-      int gemm_k_iterations =
-        (problem_size_k - tb_offset_A.column() + Mma::Shape::kK - 1) / Mma::Shape::kK;
-
-      // Construct iterators to A and B operands
-      typename Mma::IteratorA iterator_A(
-        params.params_A,
-        params.ref_A.data(),
-        {params.problem_size.m(), problem_size_k},
-        thread_idx,
-        tb_offset_A);
-
-      typename Mma::IteratorB iterator_B(
-        params.params_B,
-        params.ref_B.data(),
-        {problem_size_k, params.problem_size.n()},
-        thread_idx,
-        tb_offset_B);
-
-      // Define coef for ELL index depending on LayoutB
-      int ell_stride = iterator_B.get_stride();
-
-      typename cutlass::transform::threadblock::ell::Iterator ell_iterator(
-        shared_storage.ell,
-        ell_idx_ptr,
-        params.ell_blocksize,
-        params.ell_base_idx,
-        Mma::Shape::kK,
-        problem_size_k,
-        ell_stride,
-        thread_idx
-      );
-
-      //
-      // Main loop
-      //
-
-      // Construct thread-scoped matrix multiply
-      Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
-
-      if (!kSplitKSerial || gemm_k_iterations > 0) {
-        // check if index computations can be skipped
-        static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
-        static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
-        static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
-        constexpr bool is_double = (sizeof(Mma::IteratorA::Element) == 8);
-        constexpr bool is_multiple_alignment =  
-          (kAlignmentA > 1) && (kAlignmentB > 1) && (kAlignmentC > 1);
-        const bool is_specialized_blocksize =
-          ((params.ell_blocksize) & (params.ell_blocksize-1)) == 0
-          && params.ell_blocksize >= Mma::Shape::kK;
-        // Compute threadblock-scoped matrix multiply-add
-        if ((is_double || is_multiple_alignment) && is_specialized_blocksize) {
-          mma.operator()<true, true>(
-              gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators, ell_iterator);
-        } 
-        else {
-          mma.operator()<true, false>(
-              gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators, ell_iterator);
-        }
-      }
-    } // if (params.ell_ncols > 0)
-
-    //
-    // Epilogue
-    //
-
-    OutputOp output_op(params.output_op);
-
-    //
-    // Masked tile iterators constructed from members
-    //
-
-    threadblock_tile_offset =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    ell_block_offset_m = threadblock_tile_offset.m() / tile_in_ell_block;
-    tile_offset_m = threadblock_tile_offset.m() % tile_in_ell_block;
-
-    //assume identity swizzle
-    MatrixCoord threadblock_offset(
-      ell_block_offset_m * params.ell_blocksize
-      + tile_offset_m * Mma::Shape::kM,
-      threadblock_tile_offset.n() * Mma::Shape::kN
-    );
-
-    //avoid out of bounds
-    MatrixCoord threadblock_extent(
-      min(params.problem_size.m(),
-         ell_block_offset_m * params.ell_blocksize
-         + min((tile_offset_m + 1) * Mma::Shape::kM, params.ell_blocksize)),
-      min(params.problem_size.n(),
-        (threadblock_tile_offset.n()+1) * Mma::Shape::kN)
-    );
-
-    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
-
-    // Construct the semaphore.
-    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
-
-    // If performing a reduction via split-K, fetch the initial synchronization
-    if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
-
-      // Fetch the synchronization lock initially but do not block.
-      semaphore.fetch();
-
-      // Indicate which position in a serial reduction the output operator is currently updating
-      output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
-    }
-
-    // Tile iterator loading from source tensor.
-    typename Epilogue::OutputTileIterator iterator_C(
-      params.params_C,
-      params.ref_C.data(),
-      threadblock_extent,
-      thread_idx,
-      threadblock_offset
-    );
-
-    // Tile iterator writing to destination tensor.
-    typename Epilogue::OutputTileIterator iterator_D(
-      params.params_D,
-      params.ref_D.data(),
-      threadblock_extent,
-      thread_idx,
-      threadblock_offset
-    );
-
-    Epilogue epilogue(
-      shared_storage.epilogue,
-      thread_idx,
-      warp_idx,
-      lane_idx);
-
-    // Wait on the semaphore - this latency may have been covered by iterator construction
-    if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
-
-      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
-      if (threadblock_tile_offset.k()) {
-        iterator_C = iterator_D;
-      }
-
-      semaphore.wait(threadblock_tile_offset.k());
-    }
-
-    // Execute the epilogue operator to update the destination tensor.
-    epilogue(output_op, iterator_D, accumulators, iterator_C);
-
-    //
-    // Release the semaphore
-    //
-
-    if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
-
-      int lock = 0;
-      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
-
-        // The final threadblock resets the semaphore for subsequent grids.
-        lock = 0;
-      }
-      else {
-        // Otherwise, the semaphore is incremented
-        lock = threadblock_tile_offset.k() + 1;
-      }
-
-      semaphore.release(lock);
-    }
-  }
-};
-
-// B is Sparse
-template <
-  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate
-  typename Epilogue_,             ///! Epilogue
-  typename ThreadblockSwizzle_,   ///! Threadblock swizzling function
-  bool SplitKSerial               ///! If true, code supporting split-K via serial reduction is enabled.
->
-struct EllGemm<Mma_, Epilogue_, ThreadblockSwizzle_, SplitKSerial, false> {
-
-  using Mma = Mma_;
-  using Epilogue = Epilogue_;
-  using OutputOp = typename Epilogue::OutputOp;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  static bool const kSplitKSerial = SplitKSerial;
-
-  /// Warp count (concept: GemmShape)
-  using WarpCount = typename Mma::WarpCount;
-  static int const kThreadCount = 32 * WarpCount::kCount;
-
-  /// Parameters structure
-  struct Params {
-    cutlass::gemm::GemmCoord problem_size{};
-    cutlass::gemm::GemmCoord grid_tiled_shape{};
-    int swizzle_log_tile{0};
-    typename Mma::IteratorA::Params params_A{};
-    typename Mma::IteratorA::TensorRef ref_A{};
-    typename Mma::IteratorB::Params params_B{};
-    typename Mma::IteratorB::TensorRef ref_B{};
-    typename Epilogue::OutputTileIterator::Params params_C{};
-    typename Epilogue::OutputTileIterator::TensorRef ref_C{};
-    typename Epilogue::OutputTileIterator::Params params_D{};
-    typename Epilogue::OutputTileIterator::TensorRef ref_D{};
-    typename OutputOp::Params output_op{};
-    int *semaphore = nullptr;
-    int gemm_k_iterations{0};
-    int gemm_k_size{0};
-    const int* ell_idx = nullptr;
-    int ell_ncol{0};
-    int ell_blocksize{0};
-    int ell_base_idx{0};
-
-    //
-    // Methods
-    //
-    Params() = default;
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      cutlass::gemm::GemmCoord const & problem_size,
-      cutlass::gemm::GemmCoord const & grid_tiled_shape,
-      typename Mma::IteratorA::TensorRef ref_A,
-      typename Mma::IteratorB::TensorRef ref_B,
-      typename Epilogue::OutputTileIterator::TensorRef ref_C,
-      typename Epilogue::OutputTileIterator::TensorRef ref_D,
-      const int* ell_idx,
-      int ell_ncol,
-      int ell_blocksize,
-      int ell_base_idx,
-      typename OutputOp::Params output_op = typename OutputOp::Params(),
-      int *workspace = nullptr
-    ):
-      problem_size(problem_size),
-      grid_tiled_shape(grid_tiled_shape),
-      swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)),
-      params_A(ref_A.layout()),
-      ref_A(ref_A),
-      params_B(ref_B.layout()),
-      ref_B(ref_B),
-      params_C(ref_C.layout()),
-      ref_C(ref_C),
-      params_D(ref_D.layout()),
-      ref_D(ref_D),
-      output_op(output_op),
-      ell_idx(ell_idx),
-      ell_ncol(ell_ncol),
-      ell_blocksize(ell_blocksize),
-      ell_base_idx(ell_base_idx)
-    {
-
-      int total_gemm_k_iterations = (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK;
-      int gemm_k_iterations = (total_gemm_k_iterations + grid_tiled_shape.k() - 1) / grid_tiled_shape.k();
-
-      gemm_k_size = gemm_k_iterations * Mma::Shape::kK;
-
-    semaphore = workspace;
-    }
-  };
-
-  /// Shared memory storage structure
-  struct SharedStorage {
-    union{
-      typename Mma::SharedStorage main_loop;
-      typename Epilogue::SharedStorage epilogue;
-    };
-    typename cutlass::transform::threadblock::ell::SharedStorage ell;
-  };
-
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  EllGemm() { }
-
-  /// Determines whether kernel satisfies alignment
-    static Status can_implement(
-      cutlass::gemm::GemmCoord const & problem_size,
-      typename Mma::IteratorA::TensorRef ref_A,
-      typename Mma::IteratorB::TensorRef ref_B,
-      typename Epilogue::OutputTileIterator::TensorRef ref_C,
-      typename Epilogue::OutputTileIterator::TensorRef ref_D) {
-
-    static int const kAlignmentA = (platform::is_same<typename Mma::IteratorA::Layout,
-                                                      layout::ColumnMajorInterleaved<32>>::value)
-                                   ? 32
-                                   : (platform::is_same<typename Mma::IteratorA::Layout,
-                                                        layout::ColumnMajorInterleaved<64>>::value)
-                                     ? 64
-                                     : Mma::IteratorA::AccessType::kElements;
-    static int const kAlignmentB =  (platform::is_same<typename Mma::IteratorB::Layout,
-                                                       layout::RowMajorInterleaved<32>>::value)
-                                   ? 32
-                                   : (platform::is_same<typename Mma::IteratorB::Layout,
-                                                        layout::RowMajorInterleaved<64>>::value)
-                                     ? 64
-                                     : Mma::IteratorB::AccessType::kElements;
-    static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
-
-    if (!TensorRef_aligned(ref_A, kAlignmentA)) {
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (!TensorRef_aligned(ref_B, kAlignmentB)) {
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (!TensorRef_aligned(ref_C, kAlignmentC)) {
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (!TensorRef_aligned(ref_D, kAlignmentC)) {
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if ((problem_size.m() % kAlignmentA) || (problem_size.k() % kAlignmentA) ||
-      (problem_size.n() % kAlignmentB) || (problem_size.k() % kAlignmentB) ||
-      (problem_size.m() % kAlignmentC) || (problem_size.n() % kAlignmentC)) {
-
-      return Status::kErrorMisalignedOperand;
-    }
-
-    return Status::kSuccess;
-  }
-
-  /// Executes one GEMM
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-
-    // Compute threadblock location
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord threadblock_tile_offset =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    // Early exit if CTA is out of range
-    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
-        params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
-
-      return;
-    }
-
-    int tile_in_ell_block = (params.ell_blocksize + Mma::Shape::kN - 1 ) / Mma::Shape::kN;
-    int ell_block_offset_n = threadblock_tile_offset.n() / tile_in_ell_block;
-    int tile_offset_n = threadblock_tile_offset.n() % tile_in_ell_block;
-
-    // Compute position within threadblock
-    int thread_idx = threadIdx.x;
-
-    // Broadcast the warp_id computed by lane 0 to ensure dependent code
-    // is compiled as warp-uniform.
-    int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
-    int lane_idx = threadIdx.x % 32;
-
-    typename Mma::FragmentC accumulators;
-
-    accumulators.clear();
-
-    // skip computation if matrix is 0
-    if (params.ell_ncol > 0) {
-
-      // Compute initial location in logical coordinates
-      cutlass::MatrixCoord tb_offset_A{
-        threadblock_tile_offset.m() * Mma::Shape::kM,
-        threadblock_tile_offset.k() * params.gemm_k_size,
-      };
-
-      cutlass::MatrixCoord tb_offset_B{
-        threadblock_tile_offset.k() * params.gemm_k_size,
-        ell_block_offset_n * params.ell_blocksize
-        + tile_offset_n * Mma::Shape::kN,
-      };
-
-      int ell_idx_start =
-        (threadblock_tile_offset.n() / tile_in_ell_block) *
-        (params.ell_ncol / params.ell_blocksize);
-      const int* ell_idx_ptr = &(params.ell_idx[ell_idx_start]);
-
-      // Problem size is a function of threadblock index in the K dimension
-      int problem_size_k = min(
-        params.problem_size.k(),
-        (threadblock_tile_offset.k() + 1) * params.gemm_k_size);
-      problem_size_k = min(problem_size_k, params.ell_ncol);
-
-      // Compute threadblock-scoped matrix multiply-add
-      int gemm_k_iterations =
-        (problem_size_k - tb_offset_A.column() + Mma::Shape::kK - 1) / Mma::Shape::kK;
-
-      // Construct iterators to A and B operands
-      typename Mma::IteratorA iterator_A(
-        params.params_A,
-        params.ref_A.data(),
-        {params.problem_size.m(), problem_size_k},
-        thread_idx,
-        tb_offset_A);
-
-      typename Mma::IteratorB iterator_B(
-        params.params_B,
-        params.ref_B.data(),
-        {problem_size_k, params.problem_size.n()},
-        thread_idx,
-        tb_offset_B);
-
-      // Define coef for ELL index depending on LayoutA
-      int ell_stride = iterator_A.get_stride();
-
-      typename cutlass::transform::threadblock::ell::Iterator ell_iterator(
-        shared_storage.ell,
-        ell_idx_ptr,
-        params.ell_blocksize,
-        params.ell_base_idx,
-        Mma::Shape::kK,
-        problem_size_k,
-        ell_stride,
-        thread_idx
-      );
-
-      //
-      // Main loop
-      //
-
-      // Construct thread-scoped matrix multiply
-      Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
-
-      if (!kSplitKSerial || gemm_k_iterations > 0) {
-        // check if index computations can be skipped
-        static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
-        static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
-        static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
-        constexpr bool is_double = (sizeof(Mma::IteratorA::Element) == 8);
-        constexpr bool is_multiple_alignment =
-          (kAlignmentA > 1) && (kAlignmentB > 1) && (kAlignmentC > 1);
-        const bool is_specialized_blocksize =
-          ((params.ell_blocksize) & (params.ell_blocksize-1)) == 0
-          && params.ell_blocksize >= Mma::Shape::kK;
-        // Compute threadblock-scoped matrix multiply-add
-        if ((is_double || is_multiple_alignment) && is_specialized_blocksize) {
-          mma.operator()<false, true>(
-              gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators, ell_iterator);
-        }
-        else {
-          mma.operator()<false, false>(
-              gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators, ell_iterator);
-        }
-      }
-    } // if (params.ell_ncols > 0)
-
-    //
-    // Epilogue
-    //
-
-    OutputOp output_op(params.output_op);
-
-    //
-    // Masked tile iterators constructed from members
-    //
-
-    threadblock_tile_offset =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    ell_block_offset_n = threadblock_tile_offset.n() / tile_in_ell_block;
-    tile_offset_n = threadblock_tile_offset.n() % tile_in_ell_block;
-
-    //assume identity swizzle
-    MatrixCoord threadblock_offset(
-      threadblock_tile_offset.m() * Mma::Shape::kM,
-      ell_block_offset_n * params.ell_blocksize
-      + tile_offset_n * Mma::Shape::kN
-    );
-
-    //avoid out of bounds
-    MatrixCoord threadblock_extent(
-      min(params.problem_size.m(),
-        (threadblock_tile_offset.m()+1) * Mma::Shape::kM),
-      min(params.problem_size.n(),
-         ell_block_offset_n * params.ell_blocksize
-         + min((tile_offset_n + 1) * Mma::Shape::kN, params.ell_blocksize))
-    );
-
-    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
-
-    // Construct the semaphore.
-    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
-
-    // If performing a reduction via split-K, fetch the initial synchronization
-    if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
-
-      // Fetch the synchronization lock initially but do not block.
-      semaphore.fetch();
-
-      // Indicate which position in a serial reduction the output operator is currently updating
-      output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
-    }
-
-    // Tile iterator loading from source tensor.
-    typename Epilogue::OutputTileIterator iterator_C(
-      params.params_C,
-      params.ref_C.data(),
-      threadblock_extent,
-      thread_idx,
-      threadblock_offset
-    );
-
-    // Tile iterator writing to destination tensor.
-    typename Epilogue::OutputTileIterator iterator_D(
-      params.params_D,
-      params.ref_D.data(),
-      threadblock_extent,
-      thread_idx,
-      threadblock_offset
-    );
-
-    Epilogue epilogue(
-      shared_storage.epilogue,
-      thread_idx,
-      warp_idx,
-      lane_idx);
-
-    // Wait on the semaphore - this latency may have been covered by iterator construction
-    if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
-
-      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
-      if (threadblock_tile_offset.k()) {
-        iterator_C = iterator_D;
-      }
-
-      semaphore.wait(threadblock_tile_offset.k());
-    }
-
-    // Execute the epilogue operator to update the destination tensor.
-    epilogue(output_op, iterator_D, accumulators, iterator_C);
-
-    //
-    // Release the semaphore
-    //
-
-    if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
-
-      int lock = 0;
-      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
-
-        // The final threadblock resets the semaphore for subsequent grids.
-        lock = 0;
-      }
-      else {
-        // Otherwise, the semaphore is incremented
-        lock = threadblock_tile_offset.k() + 1;
-      }
-
-      semaphore.release(lock);
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm.h
deleted file mode 100755
index 354f5ea8a..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm.h
+++ /dev/null
@@ -1,380 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/semaphore.h"
-#include "cutlass/arch/arch.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate 
-  typename Epilogue_,             ///! Epilogue
-  typename ThreadblockSwizzle_,   ///! Threadblock swizzling function
-  bool SplitKSerial               ///! If true, code supporting split-K via serial reduction is enabled.
->
-struct Gemm {
-
-  using Mma = Mma_;
-  using Epilogue = Epilogue_;
-  using OutputOp = typename Epilogue::OutputOp;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  static bool const kSplitKSerial = SplitKSerial;
-
-  /// Warp count (concept: GemmShape)
-  using WarpCount = typename Mma::WarpCount;
-  static int const kThreadCount = 32 * WarpCount::kCount;
-
-  /// Parameters structure
-  struct Params {
-    cutlass::gemm::GemmCoord problem_size;
-    cutlass::gemm::GemmCoord grid_tiled_shape;
-    int swizzle_log_tile;
-    typename Mma::IteratorA::Params params_A;
-    typename Mma::IteratorA::TensorRef ref_A;
-    typename Mma::IteratorB::Params params_B;
-    typename Mma::IteratorB::TensorRef ref_B;
-    typename Epilogue::OutputTileIterator::Params params_C;
-    typename Epilogue::OutputTileIterator::TensorRef ref_C;
-    typename Epilogue::OutputTileIterator::Params params_D;
-    typename Epilogue::OutputTileIterator::TensorRef ref_D;
-    typename OutputOp::Params output_op;
-    int *semaphore;
-    int gemm_k_size;
-    // For gather+scatter operations
-    int const *gather_A_indices;
-    int const *gather_B_indices;
-    int const *scatter_D_indices;
-
-    //
-    // Methods
-    //
-
-    CUTLASS_HOST_DEVICE
-    Params(): swizzle_log_tile(0), semaphore(0), gemm_k_size(0) { }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      cutlass::gemm::GemmCoord const & problem_size,
-      cutlass::gemm::GemmCoord const & grid_tiled_shape,
-      typename Mma::IteratorA::TensorRef ref_A,
-      typename Mma::IteratorB::TensorRef ref_B,
-      typename Epilogue::OutputTileIterator::TensorRef ref_C,
-      typename Epilogue::OutputTileIterator::TensorRef ref_D,
-      typename OutputOp::Params output_op = typename OutputOp::Params(),
-      int *workspace = nullptr,
-      int const *gather_A_indices = nullptr,
-      int const *gather_B_indices = nullptr,
-      int const *scatter_D_indices = nullptr
-    ):
-      problem_size(problem_size),
-      grid_tiled_shape(grid_tiled_shape),
-      swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)),
-      params_A(ref_A.layout()),
-      ref_A(ref_A),
-      params_B(ref_B.layout()),
-      ref_B(ref_B),
-      params_C(ref_C.layout()),
-      ref_C(ref_C),
-      params_D(ref_D.layout()),
-      ref_D(ref_D),
-      output_op(output_op),
-      gather_A_indices(gather_A_indices),
-      gather_B_indices(gather_B_indices),
-      scatter_D_indices(scatter_D_indices) {
-
-      int total_gemm_k_iterations = (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK;
-      int gemm_k_iterations = (total_gemm_k_iterations + grid_tiled_shape.k() - 1) / grid_tiled_shape.k();
-      
-      gemm_k_size = gemm_k_iterations * Mma::Shape::kK;
-
-    semaphore = workspace;
-    }
-  };
-
-  /// Shared memory storage structure
-  union SharedStorage {
-    typename Mma::SharedStorage main_loop;
-    typename Epilogue::SharedStorage epilogue;
-  };
-
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  Gemm() { } 
-
-  /// Determines whether kernel satisfies alignment
-  CUTLASS_HOST_DEVICE
-  static Status can_implement(
-    cutlass::gemm::GemmCoord const & problem_size,
-    typename Mma::IteratorA::TensorRef ref_A,
-    typename Mma::IteratorB::TensorRef ref_B,
-    typename Epilogue::OutputTileIterator::TensorRef ref_C,
-    typename Epilogue::OutputTileIterator::TensorRef ref_D) {
-
-    static int const kAlignmentA = (platform::is_same<typename Mma::IteratorA::Layout,
-                                                      layout::ColumnMajorInterleaved<32>>::value)
-                                   ? 32
-                                   : (platform::is_same<typename Mma::IteratorA::Layout,
-                                                        layout::ColumnMajorInterleaved<64>>::value)
-                                     ? 64
-                                     : Mma::IteratorA::AccessType::kElements;
-    static int const kAlignmentB =  (platform::is_same<typename Mma::IteratorB::Layout,
-                                                       layout::RowMajorInterleaved<32>>::value)
-                                   ? 32
-                                   : (platform::is_same<typename Mma::IteratorB::Layout,
-                                                        layout::RowMajorInterleaved<64>>::value)
-                                     ? 64
-                                     : Mma::IteratorB::AccessType::kElements;
-    static int const kAlignmentC = (platform::is_same<typename Epilogue::OutputTileIterator::Layout,
-                                                      layout::ColumnMajorInterleaved<32>>::value)
-                                   ? 32
-                                   : (platform::is_same<typename Epilogue::OutputTileIterator::Layout,
-                                                        layout::ColumnMajorInterleaved<64>>::value)
-                                     ? 64
-                                     : Epilogue::OutputTileIterator::kElementsPerAccess;
-
-    if (!TensorRef_aligned(ref_A, kAlignmentA)) {
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (!TensorRef_aligned(ref_B, kAlignmentB)) {
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (!TensorRef_aligned(ref_C, kAlignmentC)) {
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (!TensorRef_aligned(ref_D, kAlignmentC)) {
-      return Status::kErrorMisalignedOperand;
-    }
-
-    return Status::kSuccess;
-  }
-
-  /// Executes one GEMM
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-
-    // Compute threadblock location
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord threadblock_tile_offset =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    // Early exit if CTA is out of range
-    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
-      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
-
-      return;
-    }
-
-    // Compute initial location in logical coordinates
-    cutlass::MatrixCoord tb_offset_A{
-      threadblock_tile_offset.m() * Mma::Shape::kM,
-      threadblock_tile_offset.k() * params.gemm_k_size,
-    };
-
-    cutlass::MatrixCoord tb_offset_B{
-      threadblock_tile_offset.k() * params.gemm_k_size,
-      threadblock_tile_offset.n() * Mma::Shape::kN
-    };
-
-    // Problem size is a function of threadblock index in the K dimension
-    int problem_size_k = min(
-      params.problem_size.k(), 
-      (threadblock_tile_offset.k() + 1) * params.gemm_k_size);
-
-    // Compute threadblock-scoped matrix multiply-add
-    int gemm_k_iterations = (problem_size_k - tb_offset_A.column() + Mma::Shape::kK - 1) / Mma::Shape::kK;
-
-    // Compute position within threadblock
-    int thread_idx = threadIdx.x;
-
-    // Construct iterators to A and B operands
-    typename Mma::IteratorA iterator_A(
-      params.params_A,
-      params.ref_A.data(),
-      {params.problem_size.m(), problem_size_k},
-      thread_idx,
-      tb_offset_A,
-      params.gather_A_indices);
-
-    typename Mma::IteratorB iterator_B(
-      params.params_B,
-      params.ref_B.data(),
-      {problem_size_k, params.problem_size.n()},
-      thread_idx,
-      tb_offset_B,
-      params.gather_B_indices);
-
-    // Broadcast the warp_id computed by lane 0 to ensure dependent code
-    // is compiled as warp-uniform.
-    int warp_idx = canonical_warp_idx_sync();
-    int lane_idx = threadIdx.x % 32;
-
-    //
-    // Main loop
-    //
-
-    // Construct thread-scoped matrix multiply
-    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
-
-    typename Mma::FragmentC accumulators;
-
-    accumulators.clear();
-
-    if (!kSplitKSerial || gemm_k_iterations > 0) {
-      // Compute threadblock-scoped matrix multiply-add
-      mma(gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators);
-    }
-
-    //
-    // Epilogue
-    //
-
-    OutputOp output_op(params.output_op);
-
-    //
-    // Masked tile iterators constructed from members
-    //
-
-    threadblock_tile_offset =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    //assume identity swizzle
-    MatrixCoord threadblock_offset(
-      threadblock_tile_offset.m() * Mma::Shape::kM,
-      threadblock_tile_offset.n() * Mma::Shape::kN
-    );
-
-    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
-
-    // Construct the semaphore.
-    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
-
-    // If performing a reduction via split-K, fetch the initial synchronization
-    if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
-      
-      // Fetch the synchronization lock initially but do not block.
-      semaphore.fetch();
-
-      // Indicate which position in a serial reduction the output operator is currently updating
-      output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
-    }
-
-    // Tile iterator loading from source tensor.
-    typename Epilogue::OutputTileIterator iterator_C(
-      params.params_C,
-      params.ref_C.data(),
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset,
-      params.scatter_D_indices
-    );
-
-    // Tile iterator writing to destination tensor.
-    typename Epilogue::OutputTileIterator iterator_D(
-      params.params_D,
-      params.ref_D.data(),
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset,
-      params.scatter_D_indices
-    );
-
-    Epilogue epilogue(
-      shared_storage.epilogue, 
-      thread_idx, 
-      warp_idx, 
-      lane_idx);
-
-    // Wait on the semaphore - this latency may have been covered by iterator construction
-    if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
-        
-      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
-      if (threadblock_tile_offset.k()) {
-        iterator_C = iterator_D;
-      }
-
-      semaphore.wait(threadblock_tile_offset.k());
-
-    }
-
-    // Execute the epilogue operator to update the destination tensor.
-    epilogue(output_op, iterator_D, accumulators, iterator_C); 
-    
-    //
-    // Release the semaphore
-    //
-
-    if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
-      
-      int lock = 0;
-      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
-
-        // The final threadblock resets the semaphore for subsequent grids.
-        lock = 0;
-      }
-      else {
-        // Otherwise, the semaphore is incremented
-        lock = threadblock_tile_offset.k() + 1;
-      }
-
-      semaphore.release(lock);
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_array.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_array.h
deleted file mode 100755
index bafa5fa8b..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_array.h
+++ /dev/null
@@ -1,264 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_coord.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate 
-  typename Epilogue_,             ///! Epilogue
-  typename ThreadblockSwizzle_    ///! Threadblock swizzling function
->
-struct GemmArray {
-
-  using Mma = Mma_;
-  using Epilogue = Epilogue_;
-  using OutputOp = typename Epilogue::OutputOp;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-
-  /// Warp count (concept: GemmShape)
-  using WarpCount = typename Mma::WarpCount;
-  static int const kThreadCount = 32 * WarpCount::kCount;
-
-  /// Parameters structure
-  struct Params {
-    cutlass::gemm::GemmCoord problem_size;
-    cutlass::gemm::GemmCoord grid_tiled_shape;
-    int swizzle_log_tile;
-    typename Mma::IteratorA::Params params_A;
-    typename Mma::IteratorA::Element const * const * ptr_A;
-    typename Mma::IteratorB::Params params_B;
-    typename Mma::IteratorB::Element const * const * ptr_B;
-    typename Epilogue::OutputTileIterator::Params params_C;
-    typename Epilogue::OutputTileIterator::Element const * const * ptr_C;
-    typename Epilogue::OutputTileIterator::Params params_D;
-    typename Epilogue::OutputTileIterator::Element * const * ptr_D;
-    int64_t stride_D;
-    typename OutputOp::Params epilogue;
-    int batch_count;
-    int gemm_k_iterations;
-
-    //
-    // Methods
-    //
-
-    CUTLASS_HOST_DEVICE
-    Params() : 
-      swizzle_log_tile(0) { }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      cutlass::gemm::GemmCoord const & problem_size_,
-      cutlass::gemm::GemmCoord const & grid_tiled_shape_,
-      typename Mma::IteratorA::Element const * const * ptr_A_,
-      typename Mma::IteratorA::Layout layout_A,
-      typename Mma::IteratorB::Element const * const * ptr_B_,
-      typename Mma::IteratorB::Layout layout_B,
-      typename Epilogue::OutputTileIterator::Element const * const * ptr_C_,
-      typename Epilogue::OutputTileIterator::Layout layout_C,
-      typename Epilogue::OutputTileIterator::Element * const * ptr_D_,
-      typename Epilogue::OutputTileIterator::Layout layout_D,
-      typename OutputOp::Params epilogue_,
-      int batch_count_
-    ):
-      problem_size(problem_size_),
-      grid_tiled_shape(grid_tiled_shape_),
-      swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)),
-      params_A(layout_A),
-      ptr_A(ptr_A_),
-      params_B(layout_B),
-      ptr_B(ptr_B_),
-      params_C(layout_C),
-      ptr_C(ptr_C_),
-      params_D(layout_D),
-      ptr_D(ptr_D_),
-      epilogue(epilogue_),
-      batch_count(batch_count_),
-      gemm_k_iterations((problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK) {
-
-    }
-  };
-
-  /// Shared memory storage structure
-  union SharedStorage {
-    typename Mma::SharedStorage main_loop;
-    typename Epilogue::SharedStorage epilogue;
-  };
-
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  GemmArray() { } 
-
-  /// Executes one GEMM
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-
-    // Compute threadblock location
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord threadblock_tile_offset =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    // Early exit if CTA is out of range
-    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
-      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
-
-      return;
-    }
-
-
-    // Each CTA handles multiple batch indices to accommodate limited range of CUDA grid's Z dimension
-    for (int batch_idx = threadblock_swizzle.get_batch_idx(); 
-      batch_idx < params.batch_count; 
-      batch_idx += gridDim.z) {
-
-      // Compute initial location in logical coordinates
-      cutlass::MatrixCoord tb_offset_A{
-        threadblock_tile_offset.m() * Mma::Shape::kM,
-        0
-      };
-
-      cutlass::MatrixCoord tb_offset_B{
-        0,
-        threadblock_tile_offset.n() * Mma::Shape::kN
-      };
-
-      // Compute position within threadblock
-      int thread_idx = threadIdx.x;
-
-      // Construct iterators to A and B operands
-      typename Mma::IteratorA iterator_A(
-        params.params_A,
-        const_cast<typename Mma::IteratorA::Element *>(params.ptr_A[batch_idx]),
-        params.problem_size.mk(),
-        thread_idx,
-        tb_offset_A);
-
-      typename Mma::IteratorB iterator_B(
-        params.params_B,
-        const_cast<typename Mma::IteratorB::Element *>(params.ptr_B[batch_idx]),
-        params.problem_size.kn(),
-        thread_idx,
-        tb_offset_B);
-
-      //
-      // Main loop
-      //
-      
-      // Broadcast the warp_id computed by lane 0 to ensure dependent code
-      // is compiled as warp-uniform.
-      int warp_idx = canonical_warp_idx_sync();
-
-      int lane_idx = threadIdx.x % 32;
-      
-      Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
-
-      typename Mma::FragmentC accumulators;
-
-      accumulators.clear();
-
-
-      // Compute threadblock-scoped matrix multiply-add
-      mma(params.gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators);
-
-      //
-      // Epilogue
-      //
-
-      OutputOp output_op(params.epilogue);
-
-      //
-      // Masked tile iterators constructed from members
-      //
-
-      threadblock_tile_offset =
-          threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-      //assume identity swizzle
-      MatrixCoord threadblock_offset(
-        threadblock_tile_offset.m() * Mma::Shape::kM,
-        threadblock_tile_offset.n() * Mma::Shape::kN
-      );
-
-      // Tile iterator writing to output tile
-      typename Epilogue::OutputTileIterator iterator_C(
-        params.params_C,
-        const_cast<typename Epilogue::OutputTileIterator::Element *>(params.ptr_C[batch_idx]),
-        params.problem_size.mn(),
-        thread_idx,
-        threadblock_offset
-      );
-
-      // Tile iterator writing to output tile
-      typename Epilogue::OutputTileIterator iterator_D(
-        params.params_D,
-        params.ptr_D[batch_idx],
-        params.problem_size.mn(),
-        thread_idx,
-        threadblock_offset
-      );
-
-      Epilogue epilogue(
-        shared_storage.epilogue, 
-        thread_idx, 
-        warp_idx, 
-        lane_idx);
-
-      // run efficient epilogue
-      epilogue(output_op, iterator_D, accumulators, iterator_C);
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_batched.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_batched.h
deleted file mode 100755
index 0c11e997c..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_batched.h
+++ /dev/null
@@ -1,273 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_coord.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate 
-  typename Epilogue_,             ///! Epilogue
-  typename ThreadblockSwizzle_    ///! Threadblock swizzling function
->
-struct GemmBatched {
-
-  using Mma = Mma_;
-  using Epilogue = Epilogue_;
-  using OutputOp = typename Epilogue::OutputOp;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-
-  /// Warp count (concept: GemmShape)
-  using WarpCount = typename Mma::WarpCount;
-  static int const kThreadCount = 32 * WarpCount::kCount;
-
-  /// Parameters structure
-  struct Params {
-    cutlass::gemm::GemmCoord problem_size{};
-    cutlass::gemm::GemmCoord grid_tiled_shape{};
-    int swizzle_log_tile{0};
-    typename Mma::IteratorA::Params params_A{};
-    typename Mma::IteratorA::TensorRef ref_A{};
-    int64_t stride_A{0};
-    typename Mma::IteratorB::Params params_B{};
-    typename Mma::IteratorB::TensorRef ref_B{};
-    int64_t stride_B{0};
-    typename Epilogue::OutputTileIterator::Params params_C{};
-    typename Epilogue::OutputTileIterator::TensorRef ref_C{};
-    int64_t stride_C{0};
-    typename Epilogue::OutputTileIterator::Params params_D{};
-    typename Epilogue::OutputTileIterator::TensorRef ref_D{};
-    int64_t stride_D{0};
-    typename OutputOp::Params epilogue{};
-    int batch_count{1};
-    int gemm_k_iterations{0};
-
-    //
-    // Methods
-    //
-    Params() = default;
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      cutlass::gemm::GemmCoord const & problem_size_,
-      cutlass::gemm::GemmCoord const & grid_tiled_shape_,
-      typename Mma::IteratorA::TensorRef ref_A_,
-      int64_t stride_A_,
-      typename Mma::IteratorB::TensorRef ref_B_,
-      int64_t stride_B_,
-      typename Epilogue::OutputTileIterator::TensorRef ref_C_,
-      int64_t stride_C_,
-      typename Epilogue::OutputTileIterator::TensorRef ref_D_,
-      int64_t stride_D_,
-      typename OutputOp::Params epilogue_,
-      int batch_count_
-    ):
-      problem_size(problem_size_),
-      grid_tiled_shape(grid_tiled_shape_),
-      swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)),
-      params_A(ref_A_.layout()),
-      ref_A(ref_A_),
-      stride_A(stride_A_),
-      params_B(ref_B_.layout()),
-      ref_B(ref_B_),
-      stride_B(stride_B_),
-      params_C(ref_C_.layout()),
-      ref_C(ref_C_),
-      stride_C(stride_C_),
-      params_D(ref_D_.layout()),
-      ref_D(ref_D_),
-      stride_D(stride_D_),
-      epilogue(epilogue_),
-      batch_count(batch_count_),
-      gemm_k_iterations((problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK) {}
-  };
-
-  /// Shared memory storage structure
-  union SharedStorage {
-    typename Mma::SharedStorage main_loop;
-    typename Epilogue::SharedStorage epilogue;
-  };
-
-  //
-  // Methods
-  //
-  GemmBatched() = default;
-
-  /// Executes one GEMM
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-
-    // Compute threadblock location
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord threadblock_tile_offset =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    // Early exit if CTA is out of range
-    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
-      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
-
-      return;
-    }
-
-
-    // Each CTA handles multiple batch indices to accommodate limited range of CUDA grid's Z dimension
-    for (int batch_idx = threadblock_swizzle.get_batch_idx(); 
-      batch_idx < params.batch_count; 
-      batch_idx += gridDim.z) {
-
-      // Compute initial location in logical coordinates
-      cutlass::MatrixCoord tb_offset_A{
-        threadblock_tile_offset.m() * Mma::Shape::kM,
-        0
-      };
-
-      cutlass::MatrixCoord tb_offset_B{
-        0,
-        threadblock_tile_offset.n() * Mma::Shape::kN
-      };
-
-      // Compute position within threadblock
-      int thread_idx = threadIdx.x;
-
-      // Construct iterators to A and B operands
-      typename Mma::IteratorA iterator_A(
-        params.params_A,
-        params.ref_A.data(),
-        params.problem_size.mk(),
-        thread_idx,
-        tb_offset_A);
-
-      iterator_A.add_pointer_offset(params.stride_A * batch_idx);
-
-      typename Mma::IteratorB iterator_B(
-        params.params_B,
-        params.ref_B.data(),
-        params.problem_size.kn(),
-        thread_idx,
-        tb_offset_B);
-
-      iterator_B.add_pointer_offset(params.stride_B * batch_idx);
-
-
-      //
-      // Main loop
-      //
-
-      // Broadcast the warp_id computed by lane 0 to ensure dependent code
-      // is compiled as warp-uniform.
-      int warp_idx = canonical_warp_idx_sync();
-
-      int lane_idx = threadIdx.x % 32;
-      
-      Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
-
-      typename Mma::FragmentC accumulators;
-
-      accumulators.clear();
-
-
-      // Compute threadblock-scoped matrix multiply-add
-      mma(params.gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators);
-
-      //
-      // Epilogue
-      //
-
-      OutputOp output_op(params.epilogue);
-
-      //
-      // Masked tile iterators constructed from members
-      //
-
-      threadblock_tile_offset =
-          threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-      //assume identity swizzle
-      MatrixCoord threadblock_offset(
-        threadblock_tile_offset.m() * Mma::Shape::kM,
-        threadblock_tile_offset.n() * Mma::Shape::kN
-      );
-
-      // Tile iterator writing to output tile
-      typename Epilogue::OutputTileIterator iterator_C(
-        params.params_C,
-        params.ref_C.data(),
-        params.problem_size.mn(),
-        thread_idx,
-        threadblock_offset
-      );
-
-      iterator_C.add_pointer_offset(params.stride_C * batch_idx);
-
-      // Tile iterator writing to output tile
-      typename Epilogue::OutputTileIterator iterator_D(
-        params.params_D,
-        params.ref_D.data(),
-        params.problem_size.mn(),
-        thread_idx,
-        threadblock_offset
-      );
-
-      iterator_D.add_pointer_offset(params.stride_D * batch_idx);
-
-      Epilogue epilogue(
-        shared_storage.epilogue, 
-        thread_idx, 
-        warp_idx, 
-        lane_idx);
-
-      // run efficient epilogue
-      epilogue(output_op, iterator_D, accumulators, iterator_C);
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_grouped.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_grouped.h
deleted file mode 100755
index daa6cbd77..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_grouped.h
+++ /dev/null
@@ -1,457 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Problem visitor for grouped GEMMs
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/complex.h"
-#include "cutlass/semaphore.h"
-
-#include "cutlass/layout/matrix.h"
-#include "cutlass/trace.h"
-#include "cutlass/gemm/kernel/gemm_transpose_operands.h"
-#include "cutlass/gemm/kernel/gemm_grouped_problem_visitor.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Mma_,                           ///! Threadblock-scoped matrix multiply-accumulate
-  typename Epilogue_,                      ///! Epilogue
-  typename ThreadblockSwizzle_,            ///! Threadblock swizzling function
-  GroupScheduleMode GroupScheduleMode_,    ///! Type of scheduling to perform
-  bool Transposed = false
->
-struct GemmGrouped {
-public:
-
-  using Mma = Mma_;
-  using Epilogue = Epilogue_;
-  using EpilogueOutputOp = typename Epilogue::OutputOp;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  static GroupScheduleMode const kGroupScheduleMode = GroupScheduleMode_;
-  static bool const kTransposed = Transposed;
-
-  // Optional transpose
-  using MapArguments = kernel::detail::MapArguments<
-    typename Mma::IteratorA::Element,
-    typename Mma::IteratorA::Layout,
-    Mma::kTransformA,
-    Mma::IteratorA::AccessType::kElements,
-    typename Mma::IteratorB::Element,
-    typename Mma::IteratorB::Layout,
-    Mma::kTransformB,
-    Mma::IteratorB::AccessType::kElements,
-    typename Mma::LayoutC,
-    kTransposed
-  >;
-
-  // Public-facing type definitions related to operand element type, layout, and complex conjugate
-  // operation. Must interact with the 'kTransposed' notion.
-  using ElementA = typename MapArguments::ElementA;
-  using LayoutA = typename MapArguments::LayoutA;
-  using ElementB = typename MapArguments::ElementB;
-  using LayoutB = typename MapArguments::LayoutB;
-  using ElementC = typename Epilogue::OutputTileIterator::Element;
-  using LayoutC = typename MapArguments::LayoutC;
-
-  static ComplexTransform const kTransformA = MapArguments::kTransformA;
-  static ComplexTransform const kTransformB = MapArguments::kTransformB;
-
-  // Type definitions about the mainloop.
-  using Operator = typename Mma::Operator;
-  using OperatorClass = typename Mma::Operator::OperatorClass;
-  using ThreadblockShape = typename Mma::Shape;
-  using WarpShape = typename Mma::Operator::Shape;
-  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
-  using ArchTag = typename Mma::ArchTag;
-
-  static int const kStages = Mma::kStages;
-  static int const kAlignmentA = MapArguments::kAlignmentA;
-  static int const kAlignmentB = MapArguments::kAlignmentB;
-  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
-
-  /// Warp count (concept: GemmShape)
-  using WarpCount = typename Mma::WarpCount;
-  static int const kThreadCount = 32 * WarpCount::kCount;
-
-  using ProblemVisitor = GemmGroupedProblemVisitor<
-                            ThreadblockShape,
-                            kGroupScheduleMode,
-                            kThreadCount,
-                            kThreadCount,
-                            kTransposed>;
-
-  //
-  // Structures
-  //
-
-  /// Argument structure
-  struct Arguments {
-
-    //
-    // Data members
-    //
-
-    GemmCoord *problem_sizes{nullptr};
-    int problem_count{0};
-    int threadblock_count{0};
-
-    typename EpilogueOutputOp::Params output_op{};
-
-    ElementA ** ptr_A{nullptr};
-    ElementB ** ptr_B{nullptr};
-    ElementC ** ptr_C{nullptr};
-    ElementC ** ptr_D{nullptr};
-
-    typename LayoutA::Stride::LongIndex *lda{nullptr};
-    typename LayoutB::Stride::LongIndex *ldb{nullptr};
-    typename LayoutC::Stride::LongIndex *ldc{nullptr};
-    typename LayoutC::Stride::LongIndex *ldd{nullptr};
-
-    // Only used by device-level operator
-    GemmCoord *host_problem_sizes{nullptr};
-
-
-    //
-    // Methods
-    //
-
-    /// Default ctor
-    Arguments() = default;
-
-    /// Ctor
-    CUTLASS_HOST_DEVICE
-    Arguments(    
-      GemmCoord *problem_sizes,
-      int problem_count,
-      int threadblock_count,
-      typename EpilogueOutputOp::Params output_op,
-      ElementA ** ptr_A,
-      ElementB ** ptr_B,
-      ElementC ** ptr_C,
-      ElementC ** ptr_D,
-      typename LayoutA::Stride::LongIndex *lda,
-      typename LayoutB::Stride::LongIndex *ldb,
-      typename LayoutC::Stride::LongIndex *ldc,
-      typename LayoutC::Stride::LongIndex *ldd,
-      GemmCoord *host_problem_sizes=nullptr
-    ): 
-      problem_sizes(problem_sizes),
-      problem_count(problem_count),
-      threadblock_count(threadblock_count),
-      output_op(output_op),
-      ptr_A(ptr_A),
-      ptr_B(ptr_B),
-      ptr_C(ptr_C),
-      ptr_D(ptr_D),
-      lda(lda),
-      ldb(ldb),
-      ldc(ldc),
-      ldd(ldd),
-      host_problem_sizes(host_problem_sizes)
-    {
-
-    }
-  };
-
-  //
-  // Structure for precomputing values in host memory and passing to kernels
-  //
-
-  /// Parameters structure
-  struct Params {
-
-    typename ProblemVisitor::Params problem_visitor{};
-    int threadblock_count{0};
-
-    typename EpilogueOutputOp::Params output_op{};
-
-    ElementA ** ptr_A{nullptr};
-    ElementB ** ptr_B{nullptr};
-    ElementC ** ptr_C{nullptr};
-    ElementC ** ptr_D{nullptr};
-
-    typename LayoutA::Stride::LongIndex *lda{nullptr};
-    typename LayoutB::Stride::LongIndex *ldb{nullptr};
-    typename LayoutC::Stride::LongIndex *ldc{nullptr};
-    typename LayoutC::Stride::LongIndex *ldd{nullptr};
-
-    //
-    // Methods
-    //
-
-    Params() = default;
-
-    CUTLASS_HOST_DEVICE
-    Params(Arguments const &args,
-          void *workspace = nullptr,
-          int tile_count = 0):
-      problem_visitor(args.problem_sizes, args.problem_count, workspace, tile_count),
-      threadblock_count(args.threadblock_count),
-      output_op(args.output_op),
-      ptr_A(args.ptr_A),
-      ptr_B(args.ptr_B),
-      ptr_C(args.ptr_C),
-      ptr_D(args.ptr_D),
-      lda(args.lda),
-      ldb(args.ldb),
-      ldc(args.ldc),
-      ldd(args.ldd)
-    { 
-
-    }
-
-    CUTLASS_HOST_DEVICE
-    void update(
-      Arguments const &args,
-      void *workspace = nullptr,
-      int tile_count = 0) {
-
-      problem_visitor = typename ProblemVisitor::Params(args.problem_sizes, args.problem_count,
-                                                        workspace, tile_count);
-      threadblock_count = args.threadblock_count;
-      output_op = args.output_op;
-      ptr_A = args.ptr_A;
-      ptr_B = args.ptr_B;
-      ptr_C = args.ptr_C;
-      ptr_D = args.ptr_D;
-      lda = args.lda;
-      ldb = args.ldb;
-      ldc = args.ldc;
-      ldd = args.ldd;
-    }
-  };
-
-  /// Shared memory storage structure
-  struct SharedStorage {
-    union {
-      typename Mma::SharedStorage main_loop;
-      typename Epilogue::SharedStorage epilogue;
-    } kernel;
-
-    // ProblemVisitor shared storage can't be overlapped with others
-    typename ProblemVisitor::SharedStorage problem_visitor;
-  };
-
-public:
-
-  //
-  // Methods
-  //
-
-  CUTLASS_DEVICE
-  GemmGrouped() { } 
-
-  /// Determines whether kernel satisfies alignment
-  static Status can_implement(cutlass::gemm::GemmCoord const & problem_size) {
-    return Status::kSuccess;
-  }
-
-  static Status can_implement(Arguments const &args) {
-    return Status::kSuccess;
-  }
- 
-  /// Executes one GEMM
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-
-    //
-    // These types shadow the type-level definitions and support the ability to implement
-    // a 'transposed' GEMM that computes the transposed problems.
-    //
-    using ElementA = typename Mma::IteratorA::Element;
-    using LayoutA = typename Mma::IteratorA::Layout;
-    using ElementB = typename Mma::IteratorB::Element;
-    using LayoutB = typename Mma::IteratorB::Layout;
-    using ElementC = typename Epilogue::OutputTileIterator::Element;
-    using LayoutC = typename Epilogue::OutputTileIterator::Layout;
-
-    //
-    // Problem visitor.
-    //
-    ProblemVisitor problem_visitor(
-      params.problem_visitor,
-      shared_storage.problem_visitor,
-      blockIdx.x);
-
-    // Outer 'persistent' loop to iterate over tiles
-    while (problem_visitor.next_tile()) {
-
-      GemmCoord problem_size  = problem_visitor.problem_size();
-      int32_t problem_idx     = problem_visitor.problem_index();
-      int32_t threadblock_idx = int32_t(problem_visitor.threadblock_idx());
-
-      GemmCoord grid_shape = problem_visitor.grid_shape(problem_size);
-
-      cutlass::gemm::GemmCoord threadblock_offset(
-        int(threadblock_idx / grid_shape.n()) * Mma::Shape::kM,
-        int(threadblock_idx % grid_shape.n()) * Mma::Shape::kN,
-        0);
-
-      // Load element pointers. Exchange pointers and strides if working on the transpose
-      ElementA *ptr_A = reinterpret_cast<ElementA *>((kTransposed ? params.ptr_B[problem_idx] : params.ptr_A[problem_idx]));
-      typename LayoutA::LongIndex ldm_A = (kTransposed ? params.ldb[problem_idx] : params.lda[problem_idx]);
-
-      ElementB *ptr_B = reinterpret_cast<ElementB *>((kTransposed ? params.ptr_A[problem_idx] : params.ptr_B[problem_idx]));
-      typename LayoutB::LongIndex ldm_B = (kTransposed ? params.lda[problem_idx] : params.ldb[problem_idx]);
-
-      // Compute initial location in logical coordinates
-      cutlass::MatrixCoord tb_offset_A{
-        threadblock_offset.m(),
-        0,
-      };
-
-      cutlass::MatrixCoord tb_offset_B{
-        0,
-        threadblock_offset.n()
-      };
-
-      // Compute position within threadblock
-      int thread_idx = threadIdx.x;
-
-      // Construct iterators to A and B operands
-      typename Mma::IteratorA iterator_A(
-        LayoutA(ldm_A),
-        ptr_A,
-        {problem_size.m(), problem_size.k()},
-        thread_idx,
-        tb_offset_A);
-
-      typename Mma::IteratorB iterator_B(
-        LayoutB(ldm_B),
-        ptr_B,
-        {problem_size.k(), problem_size.n()},
-        thread_idx,
-        tb_offset_B);
-
-      typename Mma::FragmentC accumulators;
-
-      accumulators.clear();
-      
-      // Broadcast the warp_id computed by lane 0 to ensure dependent code
-      // is compiled as warp-uniform.
-      int warp_idx = canonical_warp_idx_sync();
-
-      int lane_idx = threadIdx.x % 32;
-
-      //
-      // Matrix multiply phase
-      //
-
-      // Construct thread-scoped matrix multiply
-      Mma mma(shared_storage.kernel.main_loop, thread_idx, warp_idx, lane_idx);
-
-      // Compute threadblock-scoped matrix multiply-add
-      int gemm_k_iterations = (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK;
-
-      // Wait for all threads to finish their epilogue phases from the previous tile.
-      __syncthreads();
-
-      // Compute threadblock-scoped matrix multiply-add
-      mma(
-        gemm_k_iterations, 
-        accumulators, 
-        iterator_A, 
-        iterator_B, 
-        accumulators);
-
-      //
-      // Epilogue
-      //
-
-      EpilogueOutputOp output_op(params.output_op);
-
-      ElementC *ptr_C = params.ptr_C[problem_idx];
-      ElementC *ptr_D = params.ptr_D[problem_idx];
-
-      LayoutC layout_C(params.ldc[problem_idx]);
-      LayoutC layout_D(params.ldd[problem_idx]);
-
-      typename Epilogue::OutputTileIterator::Params params_C(layout_C);
-      typename Epilogue::OutputTileIterator::Params params_D(layout_D);
-
-      // Tile iterator loading from source tensor.
-      typename Epilogue::OutputTileIterator iterator_C(
-        params_C,
-        ptr_C,
-        problem_size.mn(),
-        thread_idx,
-        threadblock_offset.mn()
-      );
-
-      // Tile iterator writing to destination tensor.
-      typename Epilogue::OutputTileIterator iterator_D(
-        params_D,
-        ptr_D,
-        problem_size.mn(),
-        thread_idx,
-        threadblock_offset.mn()
-      );
-
-      Epilogue epilogue(
-        shared_storage.kernel.epilogue, 
-        thread_idx, 
-        warp_idx, 
-        lane_idx);
-
-      // Execute the epilogue operator to update the destination tensor.
-      epilogue(
-        output_op, 
-        iterator_D, 
-        accumulators, 
-        iterator_C); 
-
-      // Next tile
-      problem_visitor.advance(gridDim.x);
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_grouped_problem_visitor.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_grouped_problem_visitor.h
deleted file mode 100755
index 304f23e73..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_grouped_problem_visitor.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Scheduler for grouped GEMM
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/gemm/kernel/grouped_problem_visitor.h"
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace detail {
-// Helper for correctly representing problem sizes in grouped kernels 
-template <
-  typename ThreadblockShape,
-  bool Transposed
->
-struct GemmGroupedProblemSizeHelper {
-
-  static bool const kTransposed = Transposed;
-
-  CUTLASS_HOST_DEVICE
-  static cutlass::gemm::GemmCoord grid_shape(const cutlass::gemm::GemmCoord& problem) {
-    return cutlass::gemm::GemmCoord(
-      ((problem.m() - 1 + ThreadblockShape::kM) / ThreadblockShape::kM),
-      ((problem.n() - 1 + ThreadblockShape::kN) / ThreadblockShape::kN),
-      1);
-  }
-
-  CUTLASS_HOST_DEVICE
-  static void possibly_transpose_problem(cutlass::gemm::GemmCoord& problem) {
-    if (kTransposed) {
-      swap(problem.m(), problem.n());
-    }
-  }
-
-  CUTLASS_HOST_DEVICE
-  static int32_t tile_count(const cutlass::gemm::GemmCoord& grid) {
-    return grid.m() * grid.n();
-  }
-};
-
-} // namespace detail
-
-/// Visitor class to abstract away the algorithm for iterating over tiles
-template <typename ThreadblockShape,
-          GroupScheduleMode GroupScheduleMode_,
-          int PrefetchTileCount,
-          int ThreadCount,
-          bool Transposed = false>
-struct GemmGroupedProblemVisitor : public GroupedProblemVisitor<
-                                            detail::GemmGroupedProblemSizeHelper<ThreadblockShape, Transposed>,
-                                            ThreadblockShape,
-                                            GroupScheduleMode_,
-                                            PrefetchTileCount,
-                                            ThreadCount> {
-
-  static bool const kTransposed = Transposed;
-
-  using ProblemSizeHelper = detail::GemmGroupedProblemSizeHelper<ThreadblockShape, Transposed>;
-  using Base = GroupedProblemVisitor<ProblemSizeHelper, ThreadblockShape, GroupScheduleMode_, PrefetchTileCount, ThreadCount>;
-  using Params = typename Base::Params;
-  using SharedStorage = typename Base::SharedStorage;
-
-  //
-  // Methods
-  //
-  CUTLASS_DEVICE
-  GemmGroupedProblemVisitor(
-    Params const &params_,
-    SharedStorage &shared_storage_, 
-    int32_t block_idx
-  ): Base (params_, shared_storage_, block_idx)
-  {}
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_grouped_softmax_mainloop_fusion.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_grouped_softmax_mainloop_fusion.h
deleted file mode 100755
index 3d889469f..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_grouped_softmax_mainloop_fusion.h
+++ /dev/null
@@ -1,481 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Problem visitor for grouped GEMMs with a softmax fused beforehand
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/complex.h"
-#include "cutlass/semaphore.h"
-
-#include "cutlass/layout/matrix.h"
-#include "cutlass/trace.h"
-#include "cutlass/gemm/kernel/gemm_transpose_operands.h"
-#include "cutlass/gemm/kernel/gemm_grouped_problem_visitor.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Mma_,                           ///! Threadblock-scoped matrix multiply-accumulate
-  typename Epilogue_,                      ///! Epilogue
-  typename ThreadblockSwizzle_,            ///! Threadblock swizzling function
-  GroupScheduleMode GroupScheduleMode_,    ///! Type of scheduling to perform
-  bool Transposed = false
->
-struct GemmGroupedSoftmaxMainloopFusion {
-public:
-
-  using Mma = Mma_;
-  using Epilogue = Epilogue_;
-  using EpilogueOutputOp = typename Epilogue::OutputOp;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  static GroupScheduleMode const kGroupScheduleMode = GroupScheduleMode_;
-  static bool const kTransposed = Transposed;
-
-  // Optional transpose
-  using MapArguments = kernel::detail::MapArguments<
-    typename Mma::IteratorA::Element,
-    typename Mma::IteratorA::Layout,
-    Mma::kTransformA,
-    Mma::IteratorA::AccessType::kElements,
-    typename Mma::IteratorB::Element,
-    typename Mma::IteratorB::Layout,
-    Mma::kTransformB,
-    Mma::IteratorB::AccessType::kElements,
-    typename Mma::LayoutC,
-    kTransposed
-  >;
-
-  // Public-facing type definitions related to operand element type, layout, and complex conjugate
-  // operation. Must interact with the 'kTransposed' notion.
-  using ElementA = typename MapArguments::ElementA;
-  using LayoutA = typename MapArguments::LayoutA;
-  using ElementB = typename MapArguments::ElementB;
-  using LayoutB = typename MapArguments::LayoutB;
-  using ElementC = typename Epilogue::OutputTileIterator::Element;
-  using LayoutC = typename MapArguments::LayoutC;
-
-  using ElementScaleBias = typename Mma::IteratorNormSum::Element;
-
-  static ComplexTransform const kTransformA = MapArguments::kTransformA;
-  static ComplexTransform const kTransformB = MapArguments::kTransformB;
-
-  // Type definitions about the mainloop.
-  using Operator = typename Mma::Operator;
-  using OperatorClass = typename Mma::Operator::OperatorClass;
-  using ThreadblockShape = typename Mma::Shape;
-  using WarpShape = typename Mma::Operator::Shape;
-  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
-  using ArchTag = typename Mma::ArchTag;
-
-  static int const kStages = Mma::kStages;
-  static int const kAlignmentA = MapArguments::kAlignmentA;
-  static int const kAlignmentB = MapArguments::kAlignmentB;
-  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
-
-  /// Warp count (concept: GemmShape)
-  using WarpCount = typename Mma::WarpCount;
-  static int const kThreadCount = 32 * WarpCount::kCount;
-
-  using ProblemVisitor = GemmGroupedProblemVisitor<
-                            ThreadblockShape,
-                            kGroupScheduleMode,
-                            kThreadCount,
-                            kThreadCount,
-                            kTransposed>;
-
-  //
-  // Structures
-  //
-
-  /// Argument structure
-  struct Arguments {
-
-    //
-    // Data members
-    //
-
-    GemmCoord *problem_sizes{nullptr};
-    int problem_count{0};
-    int threadblock_count{0};
-
-    typename EpilogueOutputOp::Params output_op{};
-
-    ElementA ** ptr_A{nullptr};
-    ElementB ** ptr_B{nullptr};
-    ElementC ** ptr_C{nullptr};
-    ElementC ** ptr_D{nullptr};
-    void ** ptr_norm{nullptr};
-    void ** ptr_sum{nullptr};
-
-    typename LayoutA::Stride::LongIndex *lda{nullptr};
-    typename LayoutB::Stride::LongIndex *ldb{nullptr};
-    typename LayoutC::Stride::LongIndex *ldc{nullptr};
-    typename LayoutC::Stride::LongIndex *ldd{nullptr};
-
-    // Only used by device-level operator
-    GemmCoord *host_problem_sizes{nullptr};
-
-    //
-    // Methods
-    //
-
-    /// Default ctor
-    Arguments() = default;
-
-    /// Ctor
-    CUTLASS_HOST_DEVICE
-    Arguments(
-      GemmCoord *problem_sizes,
-      int problem_count,
-      int threadblock_count,
-      typename EpilogueOutputOp::Params output_op,
-      ElementA ** ptr_A,
-      ElementB ** ptr_B,
-      ElementC ** ptr_C,
-      ElementC ** ptr_D,
-      void ** ptr_norm,
-      void ** ptr_sum,
-      typename LayoutA::Stride::LongIndex *lda,
-      typename LayoutB::Stride::LongIndex *ldb,
-      typename LayoutC::Stride::LongIndex *ldc,
-      typename LayoutC::Stride::LongIndex *ldd,
-      GemmCoord *host_problem_sizes=nullptr
-    ):
-      problem_sizes(problem_sizes),
-      problem_count(problem_count),
-      threadblock_count(threadblock_count),
-      output_op(output_op),
-      ptr_A(ptr_A),
-      ptr_B(ptr_B),
-      ptr_C(ptr_C),
-      ptr_D(ptr_D),
-      ptr_norm(ptr_norm),
-      ptr_sum(ptr_sum),
-      lda(lda),
-      ldb(ldb),
-      ldc(ldc),
-      ldd(ldd),
-      host_problem_sizes(host_problem_sizes)
-    {
-
-    }
-  };
-
-  //
-  // Structure for precomputing values in host memory and passing to kernels
-  //
-
-  /// Parameters structure
-  struct Params {
-
-    typename ProblemVisitor::Params problem_visitor{};
-    int threadblock_count{0};
-
-    typename EpilogueOutputOp::Params output_op{};
-
-    ElementA ** ptr_A{nullptr};
-    ElementB ** ptr_B{nullptr};
-    ElementC ** ptr_C{nullptr};
-    ElementC ** ptr_D{nullptr};
-
-    void ** ptr_norm{nullptr};
-    void ** ptr_sum{nullptr};
-
-    typename LayoutA::Stride::LongIndex *lda{nullptr};
-    typename LayoutB::Stride::LongIndex *ldb{nullptr};
-    typename LayoutC::Stride::LongIndex *ldc{nullptr};
-    typename LayoutC::Stride::LongIndex *ldd{nullptr};
-
-    //
-    // Methods
-    //
-
-    Params() = default;
-
-    CUTLASS_HOST_DEVICE
-    Params(Arguments const &args,
-          void *workspace = nullptr,
-          int tile_count = 0):
-      problem_visitor(args.problem_sizes, args.problem_count, workspace, tile_count),
-      threadblock_count(args.threadblock_count),
-      output_op(args.output_op),
-      ptr_A(args.ptr_A),
-      ptr_B(args.ptr_B),
-      ptr_C(args.ptr_C),
-      ptr_D(args.ptr_D),
-      ptr_norm(args.ptr_norm),
-      ptr_sum(args.ptr_sum),
-      lda(args.lda),
-      ldb(args.ldb),
-      ldc(args.ldc),
-      ldd(args.ldd)
-    {
-
-    }
-
-    CUTLASS_HOST_DEVICE
-    void update(
-      Arguments const &args,
-      void *workspace = nullptr,
-      int tile_count = 0) {
-
-      problem_visitor = typename ProblemVisitor::Params(args.problem_sizes, args.problem_count,
-                                                        workspace, tile_count);
-      threadblock_count = args.threadblock_count;
-      output_op = args.output_op;
-      ptr_A = args.ptr_A;
-      ptr_B = args.ptr_B;
-      ptr_C = args.ptr_C;
-      ptr_D = args.ptr_D;
-      ptr_norm = args.ptr_norm;
-      ptr_sum = args.ptr_sum;
-      lda = args.lda;
-      ldb = args.ldb;
-      ldc = args.ldc;
-      ldd = args.ldd;
-    }
-  };
-
-  /// Shared memory storage structure
-  struct SharedStorage {
-    union {
-      typename Mma::SharedStorage main_loop;
-      typename Epilogue::SharedStorage epilogue;
-    } kernel;
-
-    // ProblemVisitor shared storage can't be overlapped with others
-    typename ProblemVisitor::SharedStorage problem_visitor;
-  };
-
-public:
-
-  //
-  // Methods
-  //
-
-  CUTLASS_DEVICE
-  GemmGroupedSoftmaxMainloopFusion() { }
-
-  /// Determines whether kernel satisfies alignment
-  static Status can_implement(cutlass::gemm::GemmCoord const & problem_size) {
-    return Status::kSuccess;
-  }
-
-  static Status can_implement(Arguments const &args) {
-    return Status::kSuccess;
-  }
-
-  /// Executes one GEMM
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-
-    //
-    // These types shadow the type-level definitions and support the ability to implement
-    // a 'transposed' GEMM that computes the transposed problems.
-    //
-    using ElementA = typename Mma::IteratorA::Element;
-    using LayoutA = typename Mma::IteratorA::Layout;
-    using ElementB = typename Mma::IteratorB::Element;
-    using LayoutB = typename Mma::IteratorB::Layout;
-    using ElementC = typename Epilogue::OutputTileIterator::Element;
-    using LayoutC = typename Epilogue::OutputTileIterator::Layout;
-
-    //
-    // Problem visitor.
-    //
-    ProblemVisitor problem_visitor(
-      params.problem_visitor,
-      shared_storage.problem_visitor,
-      blockIdx.x);
-
-    // Outer 'persistent' loop to iterate over tiles
-    while (problem_visitor.next_tile()) {
-
-      GemmCoord problem_size  = problem_visitor.problem_size();
-      int32_t problem_idx     = problem_visitor.problem_index();
-      int32_t threadblock_idx = int32_t(problem_visitor.threadblock_idx());
-
-      GemmCoord grid_shape = problem_visitor.grid_shape(problem_size);
-
-      cutlass::gemm::GemmCoord threadblock_offset(
-        int(threadblock_idx / grid_shape.n()) * Mma::Shape::kM,
-        int(threadblock_idx % grid_shape.n()) * Mma::Shape::kN,
-        0);
-
-      // Load element pointers. Exchange pointers and strides if working on the transpose
-      ElementA *ptr_A = reinterpret_cast<ElementA *>((kTransposed ? params.ptr_B[problem_idx] : params.ptr_A[problem_idx]));
-      typename LayoutA::LongIndex ldm_A = (kTransposed ? params.ldb[problem_idx] : params.lda[problem_idx]);
-
-      ElementB *ptr_B = reinterpret_cast<ElementB *>((kTransposed ? params.ptr_A[problem_idx] : params.ptr_B[problem_idx]));
-      typename LayoutB::LongIndex ldm_B = (kTransposed ? params.lda[problem_idx] : params.ldb[problem_idx]);
-
-      // Compute initial location in logical coordinates
-      cutlass::MatrixCoord tb_offset_A{
-        threadblock_offset.m(),
-        0,
-      };
-
-      cutlass::MatrixCoord tb_offset_B{
-        0,
-        threadblock_offset.n()
-      };
-
-      // Compute position within threadblock
-      int thread_idx = threadIdx.x;
-
-      // Construct iterators to A and B operands
-      typename Mma::IteratorA iterator_A(
-        LayoutA(ldm_A),
-        ptr_A,
-        {problem_size.m(), problem_size.k()},
-        thread_idx,
-        tb_offset_A);
-
-      typename Mma::IteratorB iterator_B(
-        LayoutB(ldm_B),
-        ptr_B,
-        {problem_size.k(), problem_size.n()},
-        thread_idx,
-        tb_offset_B);
-
-      // Construct iterator to the softmax norm/sum vector
-      typename Mma::IteratorNormSum iterator_norm_sum(
-        problem_size.m(),
-        static_cast<ElementScaleBias const *>(params.ptr_norm[problem_idx]),
-        static_cast<ElementScaleBias const *>(params.ptr_sum[problem_idx]),
-        thread_idx,
-        MatrixCoord(0, threadblock_offset.m())
-      );
-
-      typename Mma::FragmentC accumulators;
-
-      accumulators.clear();
-
-      // Broadcast the warp_id computed by lane 0 to ensure dependent code
-      // is compiled as warp-uniform.
-      int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
-
-      int lane_idx = threadIdx.x % 32;
-
-      //
-      // Matrix multiply phase
-      //
-
-      // Construct thread-scoped matrix multiply
-      Mma mma(shared_storage.kernel.main_loop, thread_idx, warp_idx, lane_idx);
-
-      // Compute threadblock-scoped matrix multiply-add
-      int gemm_k_iterations = (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK;
-
-      // Wait for all threads to finish their epilogue phases from the previous tile.
-      __syncthreads();
-
-      // Compute threadblock-scoped matrix multiply-add
-      mma(
-        gemm_k_iterations,
-        accumulators,
-        iterator_A,
-        iterator_B,
-        iterator_norm_sum,
-        accumulators);
-
-      //
-      // Epilogue
-      //
-
-      EpilogueOutputOp output_op(params.output_op);
-
-      ElementC *ptr_C = params.ptr_C[problem_idx];
-      ElementC *ptr_D = params.ptr_D[problem_idx];
-
-      LayoutC layout_C(params.ldc[problem_idx]);
-      LayoutC layout_D(params.ldd[problem_idx]);
-
-      typename Epilogue::OutputTileIterator::Params params_C(layout_C);
-      typename Epilogue::OutputTileIterator::Params params_D(layout_D);
-
-      // Tile iterator loading from source tensor.
-      typename Epilogue::OutputTileIterator iterator_C(
-        params_C,
-        ptr_C,
-        problem_size.mn(),
-        thread_idx,
-        threadblock_offset.mn()
-      );
-
-      // Tile iterator writing to destination tensor.
-      typename Epilogue::OutputTileIterator iterator_D(
-        params_D,
-        ptr_D,
-        problem_size.mn(),
-        thread_idx,
-        threadblock_offset.mn()
-      );
-
-      Epilogue epilogue(
-        shared_storage.kernel.epilogue,
-        thread_idx,
-        warp_idx,
-        lane_idx);
-
-      // Execute the epilogue operator to update the destination tensor.
-      epilogue(
-        output_op,
-        iterator_D,
-        accumulators,
-        iterator_C);
-
-      // Next tile
-      problem_visitor.advance(gridDim.x);
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_layernorm_mainloop_fusion.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_layernorm_mainloop_fusion.h
deleted file mode 100755
index f324d7b30..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_layernorm_mainloop_fusion.h
+++ /dev/null
@@ -1,782 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Template for a multistage GEMM kernel with layernorm operations fused in mainloop.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/complex.h"
-#include "cutlass/semaphore.h"
-#include "cutlass/gemm/kernel/params_universal_base.h"
-
-#include "cutlass/layout/matrix.h"
-
-#include "cutlass/trace.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate 
-  typename Epilogue_,             ///! Epilogue
-  typename ThreadblockSwizzle_    ///! Threadblock swizzling function
->
-struct GemmLayernormMainloopFusion {
-public:
-
-  using Mma = Mma_;
-  using Epilogue = Epilogue_;
-  using EpilogueOutputOp = typename Epilogue::OutputOp;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-
-  using ElementA = typename Mma::IteratorA::Element;
-  using LayoutA = typename Mma::IteratorA::Layout;
-  using ElementB = typename Mma::IteratorB::Element;
-  using LayoutB = typename Mma::IteratorB::Layout;
-  using ElementC = typename Epilogue::OutputTileIterator::Element;
-  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
-
-  using ElementScaleBias = typename Mma::IteratorVarMean::Element;
-  using LayoutScaleBias = typename Mma::IteratorVarMean::Layout;
-
-  static ComplexTransform const kTransformA = Mma::kTransformA;
-  static ComplexTransform const kTransformB = Mma::kTransformB;
-  using Operator = typename Mma::Operator;
-
-  using OperatorClass = typename Mma::Operator::OperatorClass;
-  using ThreadblockShape = typename Mma::Shape;
-  using WarpShape = typename Mma::Operator::Shape;
-  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
-  using ArchTag = typename Mma::ArchTag;
-
-  static int const kStages = Mma::kStages;
-  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
-  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
-  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
-
-  /// Warp count (concept: GemmShape)
-  using WarpCount = typename Mma::WarpCount;
-  static int const kThreadCount = 32 * WarpCount::kCount;
-
-  /// Split-K preserves splits that are 128b aligned
-  static int const kSplitKAlignment = const_max(128 / sizeof_bits<ElementA>::value, 128 / sizeof_bits<ElementB>::value);
-
-  //
-  // Structures
-  //
-
-  /// Argument structure
-  struct Arguments : UniversalArgumentsBase
-  {
-    //
-    // Data members
-    //
-
-    typename EpilogueOutputOp::Params epilogue{};
-
-    void const * ptr_A{nullptr};
-    void const * ptr_B{nullptr};
-    void const * ptr_var{nullptr};
-    void const * ptr_mean{nullptr};
-    void const * ptr_gamma{nullptr};
-    void const * ptr_beta{nullptr};
-    void const * ptr_C{nullptr};
-    void * ptr_D{nullptr};
-
-    int64_t batch_stride_A{0};
-    int64_t batch_stride_B{0};
-    int64_t batch_stride_var{0};
-    int64_t batch_stride_mean{0};
-    int64_t batch_stride_gamma{0};
-    int64_t batch_stride_beta{0};
-    int64_t batch_stride_C{0};
-
-    typename LayoutA::Stride stride_a{};
-    typename LayoutB::Stride stride_b{};
-    typename LayoutScaleBias::Stride stride_var{};
-    typename LayoutScaleBias::Stride stride_mean{};
-    typename LayoutScaleBias::Stride stride_gamma{};
-    typename LayoutScaleBias::Stride stride_beta{};
-    typename LayoutC::Stride stride_c{};
-    typename LayoutC::Stride stride_d{};
-
-    typename LayoutA::Stride::LongIndex lda{};
-    typename LayoutB::Stride::LongIndex ldb{};
-    typename LayoutScaleBias::Stride::LongIndex ld_var{};
-    typename LayoutScaleBias::Stride::LongIndex ld_mean{};
-    typename LayoutScaleBias::Stride::LongIndex ld_gamma{};
-    typename LayoutScaleBias::Stride::LongIndex ld_beta{};
-    typename LayoutC::Stride::LongIndex ldc{};
-    typename LayoutC::Stride::LongIndex ldd{};
-
-    int const * ptr_gather_A_indices{nullptr};
-    int const * ptr_gather_B_indices{nullptr};
-    int const * ptr_scatter_D_indices{nullptr};
-
-    //
-    // Methods
-    //
-    
-    Arguments() = default;
-
-    /// constructs an arguments structure
-    Arguments(
-      GemmUniversalMode mode,
-      GemmCoord problem_size,
-      int batch_count,
-      typename EpilogueOutputOp::Params epilogue,
-      void const * ptr_A,
-      void const * ptr_B,
-      void const * ptr_var,
-      void const * ptr_mean,
-      void const * ptr_gamma,
-      void const * ptr_beta,
-      void const * ptr_C,
-      void * ptr_D,
-      int64_t batch_stride_A,
-      int64_t batch_stride_B,
-      int64_t batch_stride_var,
-      int64_t batch_stride_mean,
-      int64_t batch_stride_gamma,
-      int64_t batch_stride_beta,
-      int64_t batch_stride_C,
-      int64_t batch_stride_D,
-      typename LayoutA::Stride stride_a,
-      typename LayoutB::Stride stride_b,
-      typename LayoutScaleBias::Stride stride_var,
-      typename LayoutScaleBias::Stride stride_mean,
-      typename LayoutScaleBias::Stride stride_gamma,
-      typename LayoutScaleBias::Stride stride_beta,
-      typename LayoutC::Stride stride_c,
-      typename LayoutC::Stride stride_d,
-      int const *ptr_gather_A_indices = nullptr,
-      int const *ptr_gather_B_indices = nullptr,
-      int const *ptr_scatter_D_indices = nullptr)
-    :
-      UniversalArgumentsBase(mode, problem_size, batch_count, batch_stride_D),
-      epilogue(epilogue), 
-      ptr_A(ptr_A), ptr_B(ptr_B), ptr_C(ptr_C), ptr_D(ptr_D),
-      ptr_var(ptr_var), ptr_mean(ptr_mean), 
-      ptr_gamma(ptr_gamma), ptr_beta(ptr_beta), 
-      batch_stride_A(batch_stride_A), batch_stride_B(batch_stride_B), batch_stride_C(batch_stride_C),
-      batch_stride_var(batch_stride_var), batch_stride_mean(batch_stride_mean),
-      batch_stride_gamma(batch_stride_gamma), batch_stride_beta(batch_stride_beta),
-      lda(0), ldb(0), ldc(0), ldd(0),
-      ld_var(0), ld_mean(0),
-      ld_gamma(0), ld_beta(0),
-      stride_a(stride_a), stride_b(stride_b), stride_c(stride_c), stride_d(stride_d),
-      stride_var(stride_var), stride_mean(stride_mean),
-      stride_gamma(stride_gamma), stride_beta(stride_beta),
-      ptr_gather_A_indices(ptr_gather_A_indices), ptr_gather_B_indices(ptr_gather_B_indices),
-      ptr_scatter_D_indices(ptr_scatter_D_indices)
-    {
-      CUTLASS_TRACE_HOST("GemmUniversal::Arguments::Arguments() - problem_size: " << problem_size);
-    }
-
-    /// constructs an arguments structure
-    Arguments(
-      GemmUniversalMode mode,
-      GemmCoord problem_size,
-      int batch_count,
-      typename EpilogueOutputOp::Params epilogue,
-      void const * ptr_A,
-      void const * ptr_B,
-      void const * ptr_var,
-      void const * ptr_mean,
-      void const * ptr_gamma,
-      void const * ptr_beta,
-      void const * ptr_C,
-      void * ptr_D,
-      int64_t batch_stride_A,
-      int64_t batch_stride_B,
-      int64_t batch_stride_var,
-      int64_t batch_stride_mean,
-      int64_t batch_stride_gamma,
-      int64_t batch_stride_beta,
-      int64_t batch_stride_C,
-      int64_t batch_stride_D,
-      typename LayoutA::Stride::LongIndex lda,
-      typename LayoutB::Stride::LongIndex ldb,
-      typename LayoutScaleBias::Stride::LongIndex ld_var,
-      typename LayoutScaleBias::Stride::LongIndex ld_mean,
-      typename LayoutScaleBias::Stride::LongIndex ld_gamma,
-      typename LayoutScaleBias::Stride::LongIndex ld_beta,
-      typename LayoutC::Stride::LongIndex ldc,
-      typename LayoutC::Stride::LongIndex ldd,
-      int const *ptr_gather_A_indices = nullptr,
-      int const *ptr_gather_B_indices = nullptr,
-      int const *ptr_scatter_D_indices = nullptr)
-    :
-      UniversalArgumentsBase(mode, problem_size, batch_count, batch_stride_D),
-      epilogue(epilogue), 
-      ptr_A(ptr_A), ptr_B(ptr_B), ptr_C(ptr_C), ptr_D(ptr_D),
-      ptr_var(ptr_var), ptr_mean(ptr_mean), 
-      ptr_gamma(ptr_gamma), ptr_beta(ptr_beta), 
-      batch_stride_A(batch_stride_A), batch_stride_B(batch_stride_B), batch_stride_C(batch_stride_C),
-      batch_stride_var(batch_stride_var), batch_stride_mean(batch_stride_mean),
-      batch_stride_gamma(batch_stride_gamma), batch_stride_beta(batch_stride_beta),
-      lda(lda), ldb(ldb), ldc(ldc), ldd(ldd),
-      ld_var(ld_var), ld_mean(ld_mean),
-      ld_gamma(ld_gamma), ld_beta(ld_beta),
-      ptr_gather_A_indices(ptr_gather_A_indices), ptr_gather_B_indices(ptr_gather_B_indices),
-      ptr_scatter_D_indices(ptr_scatter_D_indices)
-    {
-      stride_a = make_Coord(lda);
-      stride_b = make_Coord(ldb);
-      stride_c = make_Coord(ldc);
-      stride_d = make_Coord(ldd);
-      stride_var = make_Coord(ld_var);
-      stride_mean = make_Coord(ld_mean);
-      stride_gamma = make_Coord(ld_gamma);
-      stride_beta = make_Coord(ld_beta);
-      CUTLASS_TRACE_HOST("GemmUniversal::Arguments::Arguments() - problem_size: " << problem_size);
-    }
-
-    /// Returns arguments for the transposed problem
-    Arguments transposed_problem() const {
-      Arguments args(*this);
-      
-      std::swap(args.problem_size.m(), args.problem_size.n());
-      std::swap(args.ptr_A, args.ptr_B);
-      std::swap(args.lda, args.ldb);
-      std::swap(args.stride_a, args.stride_b);
-      std::swap(args.batch_stride_A, args.batch_stride_B);
-      std::swap(args.ptr_gather_A_indices, args.ptr_gather_B_indices);
-
-      return args;
-    }
-  };
-
-
-  //
-  // Structure for precomputing values in host memory and passing to kernels
-  //
-
-  /// Parameters structure
-  struct Params : UniversalParamsBase<
-    ThreadblockSwizzle,
-    ThreadblockShape,
-    ElementA,
-    ElementB,
-    ElementC,
-    LayoutA,
-    LayoutB>
-  {
-    using ParamsBase = UniversalParamsBase<
-      ThreadblockSwizzle,
-      ThreadblockShape,
-      ElementA,
-      ElementB,
-      ElementC,
-      LayoutA,
-      LayoutB>;
-
-    //
-    // Data members
-    //
-
-    typename Mma::IteratorA::Params params_A;
-    typename Mma::IteratorB::Params params_B;
-    typename Epilogue::OutputTileIterator::Params params_C;
-    typename Epilogue::OutputTileIterator::Params params_D;
-    
-    typename EpilogueOutputOp::Params output_op;
-
-    void * ptr_A;
-    void * ptr_B;
-    void * ptr_var;
-    void * ptr_mean;
-    void * ptr_gamma;
-    void * ptr_beta;
-    void * ptr_C;
-    void * ptr_D;
-
-    int64_t batch_stride_A;
-    int64_t batch_stride_B;
-    int64_t batch_stride_var;
-    int64_t batch_stride_mean;
-    int64_t batch_stride_gamma;
-    int64_t batch_stride_beta;
-    int64_t batch_stride_C;
-
-    int * ptr_gather_A_indices;
-    int * ptr_gather_B_indices;
-    int * ptr_scatter_D_indices;
-
-    //
-    // Host dispatch API
-    //
-
-    /// Default constructor
-    Params() = default;
-
-    /// Constructor
-    Params(
-      Arguments const &args,  /// GEMM application arguments
-      int device_sms,         /// Number of SMs on the device
-      int sm_occupancy)       /// Kernel SM occupancy (in thread blocks)
-    :
-      ParamsBase(args, device_sms, sm_occupancy),
-      params_A(args.lda ? make_Coord_with_padding<LayoutA::kStrideRank>(args.lda) : args.stride_a),
-      params_B(args.ldb ? make_Coord_with_padding<LayoutB::kStrideRank>(args.ldb) : args.stride_b),
-      params_C(args.ldc ? make_Coord_with_padding<LayoutC::kStrideRank>(args.ldc) : args.stride_c),
-      params_D(args.ldd ? make_Coord_with_padding<LayoutC::kStrideRank>(args.ldd) : args.stride_d),
-      output_op(args.epilogue),
-      ptr_A(const_cast<void *>(args.ptr_A)),
-      ptr_B(const_cast<void *>(args.ptr_B)),
-      ptr_var(const_cast<void *>(args.ptr_var)),
-      ptr_mean(const_cast<void *>(args.ptr_mean)),
-      ptr_gamma(const_cast<void *>(args.ptr_gamma)),
-      ptr_beta(const_cast<void *>(args.ptr_beta)),
-      ptr_C(const_cast<void *>(args.ptr_C)),
-      ptr_D(args.ptr_D),
-      batch_stride_A(args.batch_stride_A),
-      batch_stride_B(args.batch_stride_B),
-      batch_stride_var(args.batch_stride_var),
-      batch_stride_mean(args.batch_stride_mean),
-      batch_stride_gamma(args.batch_stride_gamma),
-      batch_stride_beta(args.batch_stride_beta),
-      batch_stride_C(args.batch_stride_C),
-      ptr_gather_A_indices(const_cast<int *>(args.ptr_gather_A_indices)),
-      ptr_gather_B_indices(const_cast<int *>(args.ptr_gather_B_indices)),
-      ptr_scatter_D_indices(const_cast<int *>(args.ptr_scatter_D_indices))
-    {}
-
-    /// Lightweight update given a subset of arguments.
-    void update(Arguments const &args)
-    {
-      ptr_A = const_cast<void *>(args.ptr_A);
-      ptr_B = const_cast<void *>(args.ptr_B);
-      ptr_var = const_cast<void *>(args.ptr_var);
-      ptr_mean = const_cast<void *>(args.ptr_mean);
-      ptr_gamma = const_cast<void *>(args.ptr_gamma);
-      ptr_beta = const_cast<void *>(args.ptr_beta);
-      ptr_C = const_cast<void *>(args.ptr_C);
-      ptr_D = args.ptr_D;
-
-      batch_stride_A = args.batch_stride_A;
-      batch_stride_B = args.batch_stride_B;
-      batch_stride_C = args.batch_stride_C;
-      batch_stride_var = args.batch_stride_var;
-      batch_stride_mean = args.batch_stride_mean;
-      batch_stride_gamma = args.batch_stride_gamma;
-      batch_stride_beta = args.batch_stride_beta;
-      this->batch_stride_D = args.batch_stride_D;
-
-      ptr_gather_A_indices = const_cast<int *>(args.ptr_gather_A_indices);
-      ptr_gather_B_indices = const_cast<int *>(args.ptr_gather_B_indices);
-      ptr_scatter_D_indices = const_cast<int *>(args.ptr_scatter_D_indices);
-
-      output_op = args.epilogue;
-      
-      CUTLASS_TRACE_HOST("GemmUniversal::Params::update()");
-    }
-  };
-
-
-  /// Shared memory storage structure
-  union SharedStorage {
-    typename Mma::SharedStorage main_loop;
-    typename Epilogue::SharedStorage epilogue;
-  };
-
-public:
-
-  //
-  // Host dispatch API
-  //
-
-  /// Determines whether kernel satisfies alignment
-  static Status can_implement(
-    cutlass::gemm::GemmCoord const & problem_size) {
-
-    CUTLASS_TRACE_HOST("GemmUniversal::can_implement()");
-
-    static int const kAlignmentA = (platform::is_same<LayoutA,
-                                                      layout::ColumnMajorInterleaved<32>>::value)
-                                   ? 32
-                                   : (platform::is_same<LayoutA,
-                                                        layout::ColumnMajorInterleaved<64>>::value)
-                                     ? 64
-                                     : Mma::IteratorA::AccessType::kElements;
-    static int const kAlignmentB = (platform::is_same<LayoutB,
-                                                      layout::RowMajorInterleaved<32>>::value)
-                                   ? 32
-                                   : (platform::is_same<LayoutB,
-                                                        layout::RowMajorInterleaved<64>>::value)
-                                     ? 64
-                                     : Mma::IteratorB::AccessType::kElements;
-    static int const kAlignmentC = (platform::is_same<LayoutC,
-                                                      layout::ColumnMajorInterleaved<32>>::value)
-                                   ? 32
-                                   : (platform::is_same<LayoutC,
-                                                        layout::ColumnMajorInterleaved<64>>::value)
-                                     ? 64
-                                     : Epilogue::OutputTileIterator::kElementsPerAccess;
-
-    bool isAMisaligned = false;
-    bool isBMisaligned = false;
-    bool isCMisaligned = false;
-
-    if (platform::is_same<LayoutA, layout::RowMajor>::value) {
-      isAMisaligned = problem_size.k() % kAlignmentA;
-    } else if (platform::is_same<LayoutA, layout::ColumnMajor>::value) {
-      isAMisaligned = problem_size.m() % kAlignmentA;
-    } else if (platform::is_same<LayoutA, layout::ColumnMajorInterleaved<32>>::value
-            || platform::is_same<LayoutA, layout::ColumnMajorInterleaved<64>>::value) {
-      isAMisaligned = problem_size.k() % kAlignmentA;
-    }
-
-    if (platform::is_same<LayoutB, layout::RowMajor>::value) {
-      isBMisaligned = problem_size.n() % kAlignmentB;
-    } else if (platform::is_same<LayoutB, layout::ColumnMajor>::value) {
-      isBMisaligned = problem_size.k() % kAlignmentB;
-    } else if (platform::is_same<LayoutB, layout::RowMajorInterleaved<32>>::value
-            || platform::is_same<LayoutB, layout::RowMajorInterleaved<64>>::value) {
-      isBMisaligned = problem_size.k() % kAlignmentB;
-    }
-
-    if (platform::is_same<LayoutC, layout::RowMajor>::value) {
-      isCMisaligned = problem_size.n() % kAlignmentC;
-    } else if (platform::is_same<LayoutC, layout::ColumnMajor>::value) {
-      isCMisaligned = problem_size.m() % kAlignmentC;
-    } else if (platform::is_same<LayoutC, layout::ColumnMajorInterleaved<32>>::value
-            || platform::is_same<LayoutC, layout::ColumnMajorInterleaved<64>>::value) {
-      isCMisaligned = problem_size.n() % kAlignmentC;
-    }
-
-    if (isAMisaligned) {
-      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for A operand");
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (isBMisaligned) {
-      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for B operand");
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (isCMisaligned) {
-      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for C operand");
-      return Status::kErrorMisalignedOperand;
-    }
-
-    CUTLASS_TRACE_HOST("  returning kSuccess");
-
-    return Status::kSuccess;
-  }
-
-  static Status can_implement(Arguments const &args) {
-    return can_implement(args.problem_size);
-  }
-
-public:
-
-  //
-  // Device-only API
-  //
-
-  // Factory invocation
-  CUTLASS_DEVICE
-  static void invoke(
-    Params const &params,
-    SharedStorage &shared_storage)
-  {
-    GemmLayernormMainloopFusion op;
-    op(params, shared_storage);
-  }
- 
-
-  /// Executes one GEMM
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-
-    // Compute threadblock location
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord threadblock_tile_offset =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    // Early exit if CTA is out of range
-    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
-      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
-
-      return;
-    }
-
-    int offset_k = 0;
-    int problem_size_k = params.problem_size.k();
-
-    ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A); 
-    ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);
-
-    //
-    // Fetch pointers based on mode.
-    //
-    if (params.mode == GemmUniversalMode::kGemm || 
-      params.mode == GemmUniversalMode::kGemmSplitKParallel) {
-
-      if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {
-
-        problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size; 
-      }
-
-      offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
-    }
-    else if (params.mode == GemmUniversalMode::kBatched) {
-      ptr_A += threadblock_tile_offset.k() * params.batch_stride_A;
-      ptr_B += threadblock_tile_offset.k() * params.batch_stride_B;
-    }
-    else if (params.mode == GemmUniversalMode::kArray) {
-      ptr_A = static_cast<ElementA * const *>(params.ptr_A)[threadblock_tile_offset.k()];
-      ptr_B = static_cast<ElementB * const *>(params.ptr_B)[threadblock_tile_offset.k()];
-    }
-
-    __syncthreads();
-
-    // Compute initial location in logical coordinates
-    cutlass::MatrixCoord tb_offset_A{
-      threadblock_tile_offset.m() * Mma::Shape::kM,
-      offset_k,
-    };
-
-    cutlass::MatrixCoord tb_offset_B{
-      offset_k,
-      threadblock_tile_offset.n() * Mma::Shape::kN
-    };
-
-    // Compute position within threadblock
-    int thread_idx = threadIdx.x;
-
-    // Construct iterators to A and B operands
-    typename Mma::IteratorA iterator_A(
-      params.params_A,
-      ptr_A,
-      {params.problem_size.m(), problem_size_k},
-      thread_idx,
-      tb_offset_A,
-      params.ptr_gather_A_indices);
-
-    typename Mma::IteratorB iterator_B(
-      params.params_B,
-      ptr_B,
-      {problem_size_k, params.problem_size.n()},
-      thread_idx,
-      tb_offset_B,
-      params.ptr_gather_B_indices);
-
-    // Construct iterators to A var/mean vector
-    typename Mma::IteratorVarMean iterator_var_mean(
-      params.problem_size.m(),
-      static_cast<ElementScaleBias const *>(params.ptr_var),
-      static_cast<ElementScaleBias const *>(params.ptr_mean),
-      thread_idx,
-      MatrixCoord(0, (threadblock_tile_offset.m() * Mma::Shape::kM))
-    );
-
-    // Construct iterators to A scale/bias vector
-    typename Mma::IteratorGammaBeta iterator_gamma_beta(
-      problem_size_k,
-      static_cast<ElementScaleBias const *>(params.ptr_gamma),
-      static_cast<ElementScaleBias const *>(params.ptr_beta),
-      thread_idx,
-      MatrixCoord(
-        0, (threadblock_tile_offset.k() * Mma::Shape::kK)
-      )
-    );
-
-    // Broadcast the warp_id computed by lane 0 to ensure dependent code
-    // is compiled as warp-uniform.
-    int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
-
-    int lane_idx = threadIdx.x % 32;
-
-    //
-    // Main loop
-    //
-
-    // Construct thread-scoped matrix multiply
-    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
-
-    typename Mma::FragmentC accumulators;
-
-    accumulators.clear();
-
-    // Compute threadblock-scoped matrix multiply-add
-    int gemm_k_iterations = (problem_size_k - offset_k + Mma::Shape::kK - 1) / Mma::Shape::kK;
-
-    // Compute threadblock-scoped matrix multiply-add
-    mma(
-      gemm_k_iterations, 
-      accumulators, 
-      iterator_A, 
-      iterator_B,
-      iterator_var_mean,
-      iterator_gamma_beta, 
-      accumulators);
-
-    //
-    // Epilogue
-    //
-
-    EpilogueOutputOp output_op(params.output_op);
-
-    //
-    // Masked tile iterators constructed from members
-    //
-
-    threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    //assume identity swizzle
-    MatrixCoord threadblock_offset(
-      threadblock_tile_offset.m() * Mma::Shape::kM,
-      threadblock_tile_offset.n() * Mma::Shape::kN
-    );
-
-    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
-
-    ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C); 
-    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
-
-    //
-    // Fetch pointers based on mode.
-    //
-    
-    // Construct the semaphore.
-    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
-
-    if (params.mode == GemmUniversalMode::kGemm) {
-
-      // If performing a reduction via split-K, fetch the initial synchronization
-      if (params.grid_tiled_shape.k() > 1) {
-        
-        // Fetch the synchronization lock initially but do not block.
-        semaphore.fetch();
-
-        // Indicate which position in a serial reduction the output operator is currently updating
-        output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
-      }
-    }
-    else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) {
-      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
-    }
-    else if (params.mode == GemmUniversalMode::kBatched) {
-      ptr_C += threadblock_tile_offset.k() * params.batch_stride_C;
-      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
-    }
-    else if (params.mode == GemmUniversalMode::kArray) {
-      ptr_C = static_cast<ElementC * const *>(params.ptr_C)[threadblock_tile_offset.k()];
-      ptr_D = static_cast<ElementC * const *>(params.ptr_D)[threadblock_tile_offset.k()];
-    }
-
-    // Tile iterator loading from source tensor.
-    typename Epilogue::OutputTileIterator iterator_C(
-      params.params_C,
-      ptr_C,
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset,
-      params.ptr_scatter_D_indices
-    );
-
-    // Tile iterator writing to destination tensor.
-    typename Epilogue::OutputTileIterator iterator_D(
-      params.params_D,
-      ptr_D,
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset,
-      params.ptr_scatter_D_indices
-    );
-
-    Epilogue epilogue(
-      shared_storage.epilogue, 
-      thread_idx, 
-      warp_idx, 
-      lane_idx);
-
-    // Wait on the semaphore - this latency may have been covered by iterator construction
-    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
-        
-      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
-      if (threadblock_tile_offset.k()) {
-        iterator_C = iterator_D;
-      }
-
-      semaphore.wait(threadblock_tile_offset.k());
-    }
-
-    // Execute the epilogue operator to update the destination tensor.
-    epilogue(
-      output_op, 
-      iterator_D, 
-      accumulators, 
-      iterator_C); 
-    
-    //
-    // Release the semaphore
-    //
-
-    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) { 
-
-      int lock = 0;
-      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
-
-        // The final threadblock resets the semaphore for subsequent grids.
-        lock = 0;
-      }
-      else {
-        // Otherwise, the semaphore is incremented
-        lock = threadblock_tile_offset.k() + 1;
-      }
-      
-      semaphore.release(lock);
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_params.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_params.h
deleted file mode 100755
index 5a7f29d8f..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_params.h
+++ /dev/null
@@ -1,189 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/complex.h"
-#include "cutlass/semaphore.h"
-#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator_params.h"
-#include "cutlass/transform/threadblock/predicated_tile_access_iterator_params.h"
-
-#include "cutlass/trace.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-struct GemmParams {
-
-  //
-  // Type definitions
-  //
-  using Index = int32_t;
-  using LongIndex = int64_t;
-
-  using MmaIteratorParams = typename cutlass::transform::threadblock::PredicatedTileAccessIteratorParams;  
-  using EpilogueIteratorParams = typename cutlass::epilogue::threadblock::PredicatedTileIteratorParams;
-
-  //
-  // Data members
-  //
-
-  cutlass::gemm::GemmCoord problem_size{};
-  cutlass::gemm::GemmCoord grid_tiled_shape{};
-  int swizzle_log_tile{};
-
-  GemmUniversalMode mode{GemmUniversalMode::kGemm};
-  int batch_count{1};
-  int gemm_k_size{0};
-
-  void * ptr_A{nullptr};
-  void * ptr_B{nullptr};
-  void * ptr_C{nullptr};
-  void * ptr_D{nullptr};
-
-  LongIndex lda{0};
-  LongIndex ldb{0};
-  LongIndex ldc{0};
-  LongIndex ldd{0};
-
-  LongIndex batch_stride_A{0};
-  LongIndex batch_stride_B{0};
-  LongIndex batch_stride_C{0};
-  LongIndex batch_stride_D{0};
-
-  int *semaphore{nullptr};
-
-  //
-  // Methods
-  //
-
-  GemmParams() = default;
-
-  CUTLASS_HOST_DEVICE
-  GemmParams(
-    cutlass::gemm::GemmCoord problem_size_,
-    cutlass::gemm::GemmCoord grid_tiled_shape_,
-    int swizzle_log_tile_,
-    GemmUniversalMode mode_,
-    int batch_count_,
-    int gemm_k_size_,
-    void const * ptr_A_,
-    void const * ptr_B_,
-    void const * ptr_C_,
-    void * ptr_D_,
-    LongIndex lda_,
-    LongIndex ldb_, 
-    LongIndex ldc_, 
-    LongIndex ldd_,
-    int64_t batch_stride_A_,
-    int64_t batch_stride_B_,
-    int64_t batch_stride_C_,
-    int64_t batch_stride_D_,
-    MmaIteratorParams const & params_itr_a_,
-    MmaIteratorParams const & params_itr_b_,
-    EpilogueIteratorParams const & params_itr_c_,
-    EpilogueIteratorParams const & params_itr_d_,
-    void *workspace_ = nullptr) :
-      problem_size(problem_size_),
-      grid_tiled_shape(grid_tiled_shape_),
-      swizzle_log_tile(swizzle_log_tile_),
-      mode(mode_),
-      batch_count(batch_count_),
-      gemm_k_size(gemm_k_size_),
-      ptr_A(const_cast<void *>(ptr_A_)),
-      ptr_B(const_cast<void *>(ptr_B_)),
-      ptr_C(const_cast<void *>(ptr_C_)),
-      ptr_D(ptr_D_),
-      lda(lda_),
-      ldb(ldb_),
-      ldc(ldc_),
-      ldd(ldd_),
-      batch_stride_A(batch_stride_A_),
-      batch_stride_B(batch_stride_B_),
-      batch_stride_C(batch_stride_C_),
-      batch_stride_D(batch_stride_D_),
-      params_itr_a(params_itr_a_),
-      params_itr_b(params_itr_b_),      
-      params_itr_c(params_itr_c_),
-      params_itr_d(params_itr_d_),
-      semaphore(static_cast<int *>(workspace_)
-    ) { }
-
-
-  CUTLASS_HOST_DEVICE
-  void update(
-    void const * ptr_A_,
-    void const * ptr_B_,
-    void const * ptr_C_,
-    void * ptr_D_,
-    int64_t batch_stride_A_,
-    int64_t batch_stride_B_,
-    int64_t batch_stride_C_,
-    int64_t batch_stride_D_,
-    void *workspace_ = nullptr) {
-
-    ptr_A = const_cast<void *>(ptr_A_);
-    ptr_B = const_cast<void *>(ptr_B_);
-    ptr_C = const_cast<void *>(ptr_C_);
-    ptr_D = ptr_D_;
-
-    batch_stride_A = batch_stride_A_;
-    batch_stride_B = batch_stride_B_;
-    batch_stride_C = batch_stride_C_;
-    batch_stride_D = batch_stride_D_;
-
-
-    semaphore = static_cast<int *>(workspace_);
-    CUTLASS_TRACE_HOST("GemmParams::update()");
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_pipelined.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_pipelined.h
deleted file mode 100755
index 019f93c8f..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_pipelined.h
+++ /dev/null
@@ -1,158 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/array.h"
-
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-
-#include "cutlass/gemm/gemm.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename Mma, typename Epilogue, typename ThreadblockSwizzle>
-CUTLASS_GLOBAL void GemmPipelined(
-  cutlass::gemm::GemmCoord problem_size,
-  cutlass::gemm::GemmCoord grid_tiled_shape,
-  typename Mma::IteratorA::Params params_A,
-  typename Mma::IteratorA::TensorRef ref_A,
-  typename Mma::IteratorB::Params params_B,
-  typename Mma::IteratorB::TensorRef ref_B,
-  typename Epilogue::Params params_epilogue
-  ) {
-
-  // Shared storage needed by threadblock-scoped matrix multiply-accumulate
-  __shared__ union {
-    typename Mma::SharedStorage main_loop;
-    typename Epilogue::SharedStorage epilogue;
-  } shared_storage;
-
-  // Compute threadblock location
-  ThreadblockSwizzle threadblock_swizzle;
-
-  int swizzle_log_tile = ThreadblockSwizzle().get_log_tile(grid_tiled_shape);
-
-  cutlass::gemm::GemmCoord tb_tile_offset = threadblock_swizzle.get_tile_offset(swizzle_log_tile);
-
-  if (grid_tiled_shape.m() <= tb_tile_offset.m() ||
-    grid_tiled_shape.n() <= tb_tile_offset.n()) {
-
-    return;
-  }
-
-  // Compute initial location in logical coordinates
-  cutlass::MatrixCoord tb_offset_A{
-    tb_tile_offset.m() * Mma::Shape::kM,
-    tb_tile_offset.k()
-  };
-
-  cutlass::MatrixCoord tb_offset_B{
-    tb_tile_offset.k(),
-    tb_tile_offset.n() * Mma::Shape::kN
-  };
-
-  // Compute position within threadblock
-  int tb_thread_id = threadIdx.x;
-
-  // Construct iterators to A and B operands
-  typename Mma::IteratorA iterator_A(
-    params_A,
-    ref_A.data(),
-    {problem_size.m(), problem_size.k()},
-    tb_thread_id,
-    tb_offset_A);
-
-  typename Mma::IteratorB iterator_B(
-    params_B,
-    ref_B.data(),
-    {problem_size.k(), problem_size.n()},
-    tb_thread_id,
-    tb_offset_B);
-
-  int warp_id = canonical_warp_idx_sync();
-  int lane_id = threadIdx.x % 32;
-
-  //
-  // Main loop
-  //
-
-  // Construct thread-scoped matrix multiply
-  Mma mma(shared_storage.main_loop, tb_thread_id, warp_id, lane_id);
-
-  typename Mma::FragmentC accumulators;
-
-  accumulators.clear();
-
-  // Compute threadblock-scoped matrix multiply-add
-  mma(problem_size, accumulators, iterator_A, iterator_B, accumulators);
-
-  //
-  // Epilogue
-  //
-
-  Epilogue epilogue(
-    params_epilogue, 
-    shared_storage.epilogue, 
-    tb_thread_id, 
-    warp_id, 
-    lane_id);
-
-  tb_tile_offset = threadblock_swizzle.get_tile_offset(swizzle_log_tile);
-
-  //assume identity swizzle
-  MatrixCoord threadblock_offset(
-    tb_tile_offset.m() * Mma::Shape::kM,
-    tb_tile_offset.n() * Mma::Shape::kN
-  );
-
-  // run efficient epilogue
-  epilogue({problem_size.m(), problem_size.n()}, accumulators, threadblock_offset);
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_planar_complex.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_planar_complex.h
deleted file mode 100755
index 09228ca01..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_planar_complex.h
+++ /dev/null
@@ -1,715 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/complex.h"
-#include "cutlass/semaphore.h"
-#include "cutlass/gemm/kernel/params_universal_base.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate 
-  typename Epilogue_,             ///! Epilogue
-  typename ThreadblockSwizzle_    ///! Threadblock swizzling function
->
-struct GemmPlanarComplex {
-public:
-
-  using Mma = Mma_;
-  using Epilogue = Epilogue_;
-  using EpilogueOutputOp = typename Epilogue::OutputOp;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-
-  using ElementA = typename Mma::IteratorA::Element;
-  using LayoutA = typename Mma::IteratorA::Layout;
-  using ElementB = typename Mma::IteratorB::Element;
-  using LayoutB = typename Mma::IteratorB::Layout;
-  using ElementC = typename Epilogue::OutputTileIterator::Element;
-  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
-  using Operator = typename Mma::Operator;
-  using ArchTag = typename Mma::ArchTag;
-
-  static ComplexTransform const kTransformA = Mma::kTransformA;
-  static ComplexTransform const kTransformB = Mma::kTransformB;
-
-  /// Warp count (concept: GemmShape)
-  using WarpCount = typename Mma::WarpCount;
-  static int const kThreadCount = 32 * WarpCount::kCount;
-
-  /// Split-K preserves splits that are 128b aligned
-  static int const kSplitKAlignment = const_max(
-    128 / sizeof_bits<ElementA>::value, 
-    128 / sizeof_bits<ElementB>::value);
-
-  //
-  // Additional types needed for reflection
-  //
-
-  using ElementAccumulator = typename Mma::Policy::Operator::ElementC;
-  using OperatorClass = typename Mma::Operator::OperatorClass;
-  using ThreadblockShape = typename Mma::Shape;
-  using WarpShape = typename Mma::Operator::Shape;
-  using InstructionShape = typename Mma::Policy::Operator::Shape;
-
-  static int const kStages = Mma::kStages;
-    
-  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
-  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
-  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
-
-  //
-  // Arguments structure
-  //
-
-  /// Argument structure
-  struct Arguments : UniversalArgumentsBase
-  {
-    //
-    // Data members
-    //
-
-    typename EpilogueOutputOp::Params epilogue{};
-
-    void const * ptr_A_real{nullptr};
-    void const * ptr_A_imag{nullptr};
-    void const * ptr_B_real{nullptr};
-    void const * ptr_B_imag{nullptr};
-    void const * ptr_C_real{nullptr};
-    void const * ptr_C_imag{nullptr};
-    void * ptr_D_real{nullptr};
-    void * ptr_D_imag{nullptr};
-
-    typename LayoutA::Stride::Index lda_real{};
-    typename LayoutA::Stride::Index lda_imag{};
-    typename LayoutB::Stride::Index ldb_real{};
-    typename LayoutB::Stride::Index ldb_imag{};
-    typename LayoutC::Stride::Index ldc_real{};
-    typename LayoutC::Stride::Index ldc_imag{};
-    typename LayoutC::Stride::Index ldd_real{};
-    typename LayoutC::Stride::Index ldd_imag{};
-    
-    int64_t batch_stride_A{0};
-    int64_t batch_stride_A_imag{0};
-    int64_t batch_stride_B{0};
-    int64_t batch_stride_B_imag{0};
-    int64_t batch_stride_C{0};
-    int64_t batch_stride_C_imag{0};
-    int64_t batch_stride_D_imag{0};
-
-    //
-    // Methods
-    //
-
-    Arguments() = default;
-
-    /// constructs an arguments structure
-    Arguments(
-      GemmUniversalMode mode,
-      GemmCoord problem_size,
-      int batch_count,
-      typename EpilogueOutputOp::Params epilogue,
-      void const * ptr_A_real,
-      void const * ptr_A_imag,
-      void const * ptr_B_real,
-      void const * ptr_B_imag,
-      void const * ptr_C_real,
-      void const * ptr_C_imag,
-      void * ptr_D_real,
-      void * ptr_D_imag,
-      typename LayoutA::Stride::Index lda_real,
-      typename LayoutA::Stride::Index lda_imag,
-      typename LayoutB::Stride::Index ldb_real,
-      typename LayoutB::Stride::Index ldb_imag,
-      typename LayoutC::Stride::Index ldc_real,
-      typename LayoutC::Stride::Index ldc_imag,
-      typename LayoutC::Stride::Index ldd_real,
-      typename LayoutC::Stride::Index ldd_imag,
-      int64_t batch_stride_A = 0,
-      int64_t batch_stride_A_imag = 0,
-      int64_t batch_stride_B = 0,
-      int64_t batch_stride_B_imag = 0,
-      int64_t batch_stride_C = 0,
-      int64_t batch_stride_C_imag = 0,
-      int64_t batch_stride_D = 0,
-      int64_t batch_stride_D_imag = 0)
-    :
-      UniversalArgumentsBase(mode, problem_size, batch_count, batch_stride_D),
-      epilogue(epilogue), 
-      ptr_A_real(ptr_A_real), 
-      ptr_A_imag(ptr_A_imag), 
-      ptr_B_real(ptr_B_real),
-      ptr_B_imag(ptr_B_imag),
-      ptr_C_real(ptr_C_real),
-      ptr_C_imag(ptr_C_imag),
-      ptr_D_real(ptr_D_real), 
-      ptr_D_imag(ptr_D_imag), 
-      lda_real(lda_real),
-      lda_imag(lda_imag),
-      ldb_real(ldb_real),
-      ldb_imag(ldb_imag),
-      ldc_real(ldc_real),
-      ldc_imag(ldc_imag),
-      ldd_real(ldd_real),
-      ldd_imag(ldd_imag),
-      batch_stride_A(batch_stride_A),
-      batch_stride_A_imag(batch_stride_A_imag),
-      batch_stride_B(batch_stride_B),
-      batch_stride_B_imag(batch_stride_B_imag),
-      batch_stride_C(batch_stride_C),
-      batch_stride_C_imag(batch_stride_C_imag),
-      batch_stride_D_imag(batch_stride_D_imag)
-    {}
-
-    /// Returns arguments for the transposed problem
-    Arguments transposed_problem() const {
-      Arguments args(*this);
-      
-      std::swap(args.problem_size.m(), args.problem_size.n());
-      std::swap(args.ptr_A_real, args.ptr_B_real);
-      std::swap(args.ptr_A_imag, args.ptr_B_imag);
-      std::swap(args.lda_real, args.ldb_real);
-      std::swap(args.lda_imag, args.ldb_imag);
-      std::swap(args.batch_stride_A, args.batch_stride_B);
-      std::swap(args.batch_stride_A_imag, args.batch_stride_B_imag);
-
-      return args;
-    }
-  };
-
-
-  //
-  // Structure for precomputing values in host memory and passing to kernels
-  //
-
-  /// Parameters structure
-  struct Params : UniversalParamsBase<
-    ThreadblockSwizzle,
-    ThreadblockShape,
-    ElementA,
-    ElementB,
-    ElementC,
-    LayoutA,
-    LayoutB>
-  {
-    using ParamsBase = UniversalParamsBase<
-      ThreadblockSwizzle,
-      ThreadblockShape,
-      ElementA,
-      ElementB,
-      ElementC,
-      LayoutA,
-      LayoutB>;
-
-    //
-    // Data members
-    //
-
-    typename Mma::IteratorA::Params params_A_real{};
-    typename Mma::IteratorA::Params params_A_imag{};
-    typename Mma::IteratorB::Params params_B_real{};
-    typename Mma::IteratorB::Params params_B_imag{};
-    typename Epilogue::OutputTileIterator::Params params_C_real{};
-    typename Epilogue::OutputTileIterator::Params params_C_imag{};
-    typename Epilogue::OutputTileIterator::Params params_D_real{};
-    typename Epilogue::OutputTileIterator::Params params_D_imag{};
-    
-    typename EpilogueOutputOp::Params output_op{};
-
-    void * ptr_A_real{nullptr};
-    void * ptr_A_imag{nullptr};
-    void * ptr_B_real{nullptr};
-    void * ptr_B_imag{nullptr};
-    void * ptr_C_real{nullptr};
-    void * ptr_C_imag{nullptr};
-    void * ptr_D_real{nullptr};
-    void * ptr_D_imag{nullptr};
-
-    int64_t batch_stride_A{0};
-    int64_t batch_stride_B{0};
-    int64_t batch_stride_C{0};
-
-    int64_t batch_stride_A_imag{0};
-    int64_t batch_stride_B_imag{0};
-    int64_t batch_stride_C_imag{0};
-    int64_t batch_stride_D_imag{0};
-
-    //
-    // Host dispatch API
-    //
-
-    /// Default constructor
-    Params() = default;
-
-    /// Constructor
-    Params(
-      Arguments const &args,  /// GEMM application arguments
-      int device_sms,         /// Number of SMs on the device
-      int sm_occupancy)       /// Kernel SM occupancy (in thread blocks)
-    :
-      ParamsBase(args, device_sms, sm_occupancy),
-      params_A_real(args.lda_real),
-      params_A_imag(args.lda_imag),
-      params_B_real(args.ldb_real),
-      params_B_imag(args.ldb_imag),
-      params_C_real(args.ldc_real),
-      params_C_imag(args.ldc_imag),
-      params_D_real(args.ldd_real),
-      params_D_imag(args.ldd_imag),
-      output_op(args.epilogue),
-      ptr_A_real(const_cast<void *>(args.ptr_A_real)),
-      ptr_A_imag(const_cast<void *>(args.ptr_A_imag)),
-      ptr_B_real(const_cast<void *>(args.ptr_B_real)),
-      ptr_B_imag(const_cast<void *>(args.ptr_B_imag)),
-      ptr_C_real(const_cast<void *>(args.ptr_C_real)),
-      ptr_C_imag(const_cast<void *>(args.ptr_C_imag)),
-      ptr_D_real(args.ptr_D_real),
-      ptr_D_imag(args.ptr_D_imag),
-      batch_stride_A(args.batch_stride_A),
-      batch_stride_B(args.batch_stride_B),
-      batch_stride_C(args.batch_stride_C),
-      batch_stride_A_imag(args.batch_stride_A_imag),
-      batch_stride_B_imag(args.batch_stride_B_imag),
-      batch_stride_C_imag(args.batch_stride_C_imag),
-      batch_stride_D_imag(args.batch_stride_D_imag)
-    {}
-
-    /// Returns the workspace size (in bytes) needed for this problem geometry
-    size_t get_workspace_size() const
-    {
-      size_t workspace_bytes = ParamsBase::get_workspace_size();
-      if (this->mode == GemmUniversalMode::kGemmSplitKParallel)
-      {
-        // Double the size returned by the base class because we need to
-        // accumulate two ElementC components
-        workspace_bytes *= 2;
-      }
-
-      return workspace_bytes;
-    }
-
-    /// Lightweight update given a subset of arguments.
-    void update(Arguments const &args)
-    {
-      ptr_A_real = const_cast<void *>(args.ptr_A_real);
-      ptr_A_imag = const_cast<void *>(args.ptr_A_imag);
-
-      ptr_B_real = const_cast<void *>(args.ptr_B_real);
-      ptr_B_imag = const_cast<void *>(args.ptr_B_imag);
-
-      ptr_C_real = const_cast<void *>(args.ptr_C_real);
-      ptr_C_imag = const_cast<void *>(args.ptr_C_imag);
-
-      ptr_D_real = const_cast<void *>(args.ptr_D_real);
-      ptr_D_imag = const_cast<void *>(args.ptr_D_imag);
-
-      batch_stride_A = args.batch_stride_A;
-      batch_stride_B = args.batch_stride_B;
-      batch_stride_C = args.batch_stride_C;
-      this->batch_stride_D = args.batch_stride_D;
-      batch_stride_A_imag = args.batch_stride_A_imag;
-      batch_stride_B_imag = args.batch_stride_B_imag;
-      batch_stride_C_imag = args.batch_stride_C_imag;
-      batch_stride_D_imag = args.batch_stride_D_imag;
-
-      output_op = args.epilogue;
-    }
-  };
-
-
-  /// Shared memory storage structure
-  union SharedStorage {
-    typename Mma::SharedStorage main_loop;
-    typename Epilogue::SharedStorage epilogue;
-  };
-
-public:
-
-  //
-  // Host dispatch API
-  //
-
-  /// Determines whether kernel satisfies alignment
-  static Status can_implement(Arguments const &args)
-  {
-    static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
-    static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
-    static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
-
-    bool isAMisaligned = false;
-    bool isBMisaligned = false;
-    bool isCMisaligned = false;
-
-    if (platform::is_same<LayoutA, layout::RowMajor>::value) {
-      isAMisaligned = args.problem_size.k() % kAlignmentA;
-    } else if (platform::is_same<LayoutA, layout::ColumnMajor>::value) {
-      isAMisaligned = args.problem_size.m() % kAlignmentA;
-    }
-
-    if (platform::is_same<LayoutB, layout::RowMajor>::value) {
-      isBMisaligned = args.problem_size.n() % kAlignmentB;
-    } else if (platform::is_same<LayoutB, layout::ColumnMajor>::value) {
-      isBMisaligned = args.problem_size.k() % kAlignmentB;
-    }
-
-    if (platform::is_same<LayoutC, layout::RowMajor>::value) {
-      isCMisaligned = args.problem_size.n() % kAlignmentC;
-    } else if (platform::is_same<LayoutC, layout::ColumnMajor>::value) {
-      isCMisaligned = args.problem_size.m() % kAlignmentC;
-    }
-
-    if (isAMisaligned || isBMisaligned || isCMisaligned) {
-      return Status::kErrorMisalignedOperand;
-    }
-
-    return Status::kSuccess;
-  }
-
-public:
-
-  //
-  // Device-only API
-  //
-
-  // Factory invocation
-  CUTLASS_DEVICE
-  static void invoke(
-    Params const &params,
-    SharedStorage &shared_storage)
-  {
-    GemmPlanarComplex op;
-    op(params, shared_storage);
-  }
-
-
-  /// Executes one GEMM
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-
-    // Compute threadblock location
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord threadblock_tile_offset =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    // Early exit if CTA is out of range
-    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
-      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
-
-      return;
-    }
-
-    int offset_k = 0;
-    int problem_size_k = params.problem_size.k();
-
-    ElementA *ptr_A_real = static_cast<ElementA *>(params.ptr_A_real);
-    ElementA *ptr_A_imag = static_cast<ElementA *>(params.ptr_A_imag);
-
-    ElementB *ptr_B_real = static_cast<ElementB *>(params.ptr_B_real);
-    ElementB *ptr_B_imag = static_cast<ElementB *>(params.ptr_B_imag);
-
-    //
-    // Fetch pointers based on mode.
-    //
-    if (params.mode == GemmUniversalMode::kGemm || 
-      params.mode == GemmUniversalMode::kGemmSplitKParallel) {
-
-      if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {
-
-        problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size; 
-      }
-
-      offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
-    }
-    else if (params.mode == GemmUniversalMode::kBatched) {
-      ptr_A_real += int64_t(threadblock_tile_offset.k()) * params.batch_stride_A;
-      ptr_A_imag += int64_t(threadblock_tile_offset.k()) * params.batch_stride_A_imag;
-      ptr_B_real += int64_t(threadblock_tile_offset.k()) * params.batch_stride_B;
-      ptr_B_imag += int64_t(threadblock_tile_offset.k()) * params.batch_stride_B_imag;
-    }
-    else if (params.mode == GemmUniversalMode::kArray) {
-      ptr_A_real = static_cast<ElementA * const *>(params.ptr_A_real)[threadblock_tile_offset.k()];
-      ptr_A_imag = static_cast<ElementA * const *>(params.ptr_A_imag)[threadblock_tile_offset.k()];
-      ptr_B_real = static_cast<ElementB * const *>(params.ptr_B_real)[threadblock_tile_offset.k()];
-      ptr_B_imag = static_cast<ElementB * const *>(params.ptr_B_imag)[threadblock_tile_offset.k()];
-    }
-
-    __syncthreads();
-
-    // Compute initial location in logical coordinates
-    cutlass::MatrixCoord tb_offset_A{
-      threadblock_tile_offset.m() * Mma::Shape::kM,
-      offset_k,
-    };
-
-    cutlass::MatrixCoord tb_offset_B{
-      offset_k,
-      threadblock_tile_offset.n() * Mma::Shape::kN
-    };
-
-
-    // Compute position within threadblock
-    int thread_idx = threadIdx.x;
-
-    // Construct iterators to A and B operands
-    typename Mma::IteratorA iterator_A_real(
-      params.params_A_real,
-      ptr_A_real,
-      {params.problem_size.m(), problem_size_k},
-      thread_idx,
-      tb_offset_A);
-
-    typename Mma::IteratorA iterator_A_imag(
-      params.params_A_imag,
-      ptr_A_imag,
-      {params.problem_size.m(), problem_size_k},
-      thread_idx,
-      tb_offset_A);
-
-    typename Mma::IteratorB iterator_B_real(
-      params.params_B_real,
-      ptr_B_real,
-      {problem_size_k, params.problem_size.n()},
-      thread_idx,
-      tb_offset_B);
-
-    typename Mma::IteratorB iterator_B_imag(
-      params.params_B_imag,
-      ptr_B_imag,
-      {problem_size_k, params.problem_size.n()},
-      thread_idx,
-      tb_offset_B);
-
-    // Broadcast the warp_id computed by lane 0 to ensure dependent code
-    // is compiled as warp-uniform.
-    int warp_idx = canonical_warp_idx_sync();
-
-    int lane_idx = threadIdx.x % 32;
-
-    //
-    // Main loop
-    //
-
-    // Construct thread-scoped matrix multiply
-    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
-
-    typename Mma::FragmentC accumulators;
-
-    accumulators.clear();
-
-    // Compute threadblock-scoped matrix multiply-add
-    int gemm_k_iterations = (problem_size_k - offset_k + Mma::Shape::kK - 1) / Mma::Shape::kK;
-
-    // Compute threadblock-scoped matrix multiply-add
-    mma(
-      gemm_k_iterations, 
-      accumulators, 
-      iterator_A_real,
-      iterator_A_imag,
-      iterator_B_real, 
-      iterator_B_imag, 
-      accumulators);
-
-    //
-    // Epilogue
-    //
-
-    EpilogueOutputOp output_op(params.output_op);
-
-    //
-    // Masked tile iterators constructed from members
-    //
-
-    threadblock_tile_offset =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    //assume identity swizzle
-    MatrixCoord threadblock_offset(
-      threadblock_tile_offset.m() * Mma::Shape::kM,
-      threadblock_tile_offset.n() * Mma::Shape::kN
-    );
-
-    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
-
-    ElementC *ptr_C_real = static_cast<ElementC *>(params.ptr_C_real);
-    ElementC *ptr_C_imag = static_cast<ElementC *>(params.ptr_C_imag);
-    ElementC *ptr_D_real = static_cast<ElementC *>(params.ptr_D_real);
-    ElementC *ptr_D_imag = static_cast<ElementC *>(params.ptr_D_imag);
-
-    //
-    // Fetch pointers based on mode.
-    //
-    
-    // Construct the semaphore.
-    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
-
-    if (params.mode == GemmUniversalMode::kGemm) {
-
-      // If performing a reduction via split-K, fetch the initial synchronization
-      if (params.grid_tiled_shape.k() > 1) {
-        
-        // Fetch the synchronization lock initially but do not block.
-        semaphore.fetch();
-
-        // Indicate which position in a serial reduction the output operator is currently updating
-        output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
-      }
-    }
-    else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) {
-      ptr_D_real += threadblock_tile_offset.k() * params.batch_stride_D;
-      ptr_D_imag += threadblock_tile_offset.k() * params.batch_stride_D_imag;
-    }
-    else if (params.mode == GemmUniversalMode::kBatched) {
-      ptr_C_real += int64_t(threadblock_tile_offset.k()) * params.batch_stride_C;
-      ptr_C_imag += int64_t(threadblock_tile_offset.k()) * params.batch_stride_C_imag;
-      ptr_D_real += int64_t(threadblock_tile_offset.k()) * params.batch_stride_D;
-      ptr_D_imag += int64_t(threadblock_tile_offset.k()) * params.batch_stride_D_imag;
-    }
-    else if (params.mode == GemmUniversalMode::kArray) {
-      ptr_C_real = static_cast<ElementC * const *>(params.ptr_C_real)[threadblock_tile_offset.k()];
-      ptr_C_imag = static_cast<ElementC * const *>(params.ptr_C_imag)[threadblock_tile_offset.k()];
-      ptr_D_real = static_cast<ElementC * const *>(params.ptr_D_real)[threadblock_tile_offset.k()];
-      ptr_D_imag = static_cast<ElementC * const *>(params.ptr_D_imag)[threadblock_tile_offset.k()];
-    }
-
-    // Tile iterator loading from source tensor.
-    typename Epilogue::OutputTileIterator iterator_C_real(
-      params.params_C_real,
-      ptr_C_real,
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset
-    );
-
-    typename Epilogue::OutputTileIterator iterator_C_imag(
-      params.params_C_imag,
-      ptr_C_imag,
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset
-    );
-
-    // Tile iterator writing to destination tensor.
-    typename Epilogue::OutputTileIterator iterator_D_real(
-      params.params_D_real,
-      ptr_D_real,
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset
-    );
-
-    typename Epilogue::OutputTileIterator iterator_D_imag(
-      params.params_D_imag,
-      ptr_D_imag,
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset
-    );
-
-    //
-    // Construct epilogue
-    //
-
-    Epilogue epilogue(
-      shared_storage.epilogue, 
-      thread_idx, 
-      warp_idx, 
-      lane_idx);
-
-    // Wait on the semaphore - this latency may have been covered by iterator construction
-    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
-        
-      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
-      if (threadblock_tile_offset.k()) {
-        iterator_C_real = iterator_D_real;
-        iterator_C_imag = iterator_D_imag;
-      }
-
-      semaphore.wait(threadblock_tile_offset.k());
-
-      __threadfence();
-    }
-
-
-    // Execute the epilogue operator to update the destination tensor.
-    epilogue(
-      output_op, 
-      iterator_D_real, 
-      iterator_D_imag, 
-      accumulators, 
-      iterator_C_real,
-      iterator_C_imag); 
-    
-    //
-    // Release the semaphore
-    //
-
-    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) { 
-
-      int lock = 0;
-      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
-
-        // The final threadblock resets the semaphore for subsequent grids.
-        lock = 0;
-      }
-      else {
-        // Otherwise, the semaphore is incremented
-        lock = threadblock_tile_offset.k() + 1;
-      }
-      
-      semaphore.release(lock);
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_planar_complex_array.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_planar_complex_array.h
deleted file mode 100755
index 0c21fb8d8..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_planar_complex_array.h
+++ /dev/null
@@ -1,609 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/complex.h"
-#include "cutlass/semaphore.h"
-#include "cutlass/gemm/kernel/params_universal_base.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate 
-  typename Epilogue_,             ///! Epilogue
-  typename ThreadblockSwizzle_    ///! Threadblock swizzling function
->
-struct GemmPlanarComplexArray {
-public:
-
-  using Mma = Mma_;
-  using Epilogue = Epilogue_;
-  using EpilogueOutputOp = typename Epilogue::OutputOp;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-
-  using ElementA = typename Mma::IteratorA::Element;
-  using LayoutA = typename Mma::IteratorA::Layout;
-  using ElementB = typename Mma::IteratorB::Element;
-  using LayoutB = typename Mma::IteratorB::Layout;
-  using ElementC = typename Epilogue::OutputTileIterator::Element;
-  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
-  using Operator = typename Mma::Operator;
-  using ArchTag = typename Mma::ArchTag;
-
-  static ComplexTransform const kTransformA = Mma::kTransformA;
-  static ComplexTransform const kTransformB = Mma::kTransformB;
-
-  /// Warp count (concept: GemmShape)
-  using WarpCount = typename Mma::WarpCount;
-  static int const kThreadCount = 32 * WarpCount::kCount;
-
-  /// Split-K preserves splits that are 128b aligned
-  static int const kSplitKAlignment = const_max(
-    128 / sizeof_bits<ElementA>::value, 
-    128 / sizeof_bits<ElementB>::value);
-
-  //
-  // Additional types needed for reflection
-  //
-
-  using ElementAccumulator = typename Mma::Policy::Operator::ElementC;
-  using OperatorClass = typename Mma::Operator::OperatorClass;
-  using ThreadblockShape = typename Mma::Shape;
-  using WarpShape = typename Mma::Operator::Shape;
-  using InstructionShape = typename Mma::Policy::Operator::Shape;
-
-  static int const kStages = Mma::kStages;
-    
-  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
-  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
-  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
-
-  //
-  // Arguments structure
-  //
-
-  /// Argument structure
-  struct Arguments : UniversalArgumentsBase
-  {
-    //
-    // Data members
-    //
-
-    typename EpilogueOutputOp::Params epilogue{};
-
-    int const *ptr_M{nullptr};
-    int const *ptr_N{nullptr};
-    int const *ptr_K{nullptr};
-
-    void const * const * ptr_A_real{nullptr};
-    void const * const * ptr_A_imag{nullptr};
-
-    void const * const * ptr_B_real{nullptr};
-    void const * const * ptr_B_imag{nullptr};
-
-    void const * const * ptr_C_real{nullptr};
-    void const * const * ptr_C_imag{nullptr};
-
-    void * const * ptr_D_real{nullptr};
-    void * const * ptr_D_imag{nullptr};
-
-    typename LayoutA::Stride::Index lda_real{};
-    typename LayoutA::Stride::Index lda_imag{};
-    typename LayoutB::Stride::Index ldb_real{};
-    typename LayoutB::Stride::Index ldb_imag{};
-    typename LayoutC::Stride::Index ldc_real{};
-    typename LayoutC::Stride::Index ldc_imag{};
-    typename LayoutC::Stride::Index ldd_real{};
-    typename LayoutC::Stride::Index ldd_imag{};
-
-    //
-    // Methods
-    //
-
-    Arguments() = default;
-
-    /// constructs an arguments structure
-    Arguments(
-      GemmCoord problem_size,
-      int batch_count,
-      typename EpilogueOutputOp::Params epilogue,
-      int const *ptr_M,
-      int const *ptr_N,
-      int const *ptr_K,
-      void const * const * ptr_A_real,
-      void const * const * ptr_A_imag,
-      void const * const * ptr_B_real,
-      void const * const * ptr_B_imag,
-      void const * const * ptr_C_real,
-      void const * const * ptr_C_imag,
-      void * const * ptr_D_real,
-      void * const * ptr_D_imag,
-      typename LayoutA::Stride::Index lda_real,
-      typename LayoutA::Stride::Index lda_imag,
-      typename LayoutB::Stride::Index ldb_real,
-      typename LayoutB::Stride::Index ldb_imag,
-      typename LayoutC::Stride::Index ldc_real,
-      typename LayoutC::Stride::Index ldc_imag,
-      typename LayoutC::Stride::Index ldd_real,
-      typename LayoutC::Stride::Index ldd_imag)
-    :
-      UniversalArgumentsBase(mode, problem_size, batch_count, batch_stride_D),
-      epilogue(epilogue),
-      ptr_M(ptr_M),
-      ptr_N(ptr_N),
-      ptr_K(ptr_K),
-      ptr_A_real(ptr_A_real), 
-      ptr_A_imag(ptr_A_imag), 
-      ptr_B_real(ptr_B_real),
-      ptr_B_imag(ptr_B_imag),
-      ptr_C_real(ptr_C_real),
-      ptr_C_imag(ptr_C_imag),
-      ptr_D_real(ptr_D_real), 
-      ptr_D_imag(ptr_D_imag), 
-      lda_real(lda_real),
-      lda_imag(lda_imag),
-      ldb_real(ldb_real),
-      ldb_imag(ldb_imag),
-      ldc_real(ldc_real),
-      ldc_imag(ldc_imag),
-      ldd_real(ldd_real),
-      ldd_imag(ldd_imag)
-    {}
-
-    /// Returns arguments for the transposed problem
-    Arguments transposed_problem() const {
-      Arguments args(*this);
-      
-      std::swap(args.problem_size.m(), args.problem_size.n());
-      std::swap(args.ptr_M, args.ptr_N);
-      std::swap(args.ptr_A_real, args.ptr_B_real);
-      std::swap(args.ptr_A_imag, args.ptr_B_imag);
-      std::swap(args.lda_real, args.ldb_real);
-      std::swap(args.lda_imag, args.ldb_imag);
-
-      return args;
-    }
-  };
-
-
-  //
-  // Structure for precomputing values in host memory and passing to kernels
-  //
-
-  /// Parameters structure
-  struct Params : UniversalParamsBase<
-    ThreadblockSwizzle,
-    ThreadblockShape,
-    ElementA,
-    ElementB,
-    ElementC,
-    LayoutA,
-    LayoutB>
-  {
-    using ParamsBase = UniversalParamsBase<
-      ThreadblockSwizzle,
-      ThreadblockShape,
-      ElementA,
-      ElementB,
-      ElementC,
-      LayoutA,
-      LayoutB>;
-
-    //
-    // Data members
-    //
-
-    typename Mma::IteratorA::Params params_A_real{};
-    typename Mma::IteratorA::Params params_A_imag{};
-    typename Mma::IteratorB::Params params_B_real{};
-    typename Mma::IteratorB::Params params_B_imag{};
-    typename Epilogue::OutputTileIterator::Params params_C_real{};
-    typename Epilogue::OutputTileIterator::Params params_C_imag{};
-    typename Epilogue::OutputTileIterator::Params params_D_real{};
-    typename Epilogue::OutputTileIterator::Params params_D_imag{};
-
-    typename EpilogueOutputOp::Params output_op{};
-
-    int const *ptr_M{nullptr};
-    int const *ptr_N{nullptr};
-    int const *ptr_K{nullptr};
-
-    void const * const * ptr_A_real{nullptr};
-    void const * const * ptr_A_imag{nullptr};
-    void const * const * ptr_B_real{nullptr};
-    void const * const * ptr_B_imag{nullptr};
-    void const * const * ptr_C_real{nullptr};
-    void const * const * ptr_C_imag{nullptr};
-    void * const * ptr_D_real{nullptr};
-    void * const * ptr_D_imag{nullptr};
-
-    //
-    // Host dispatch API
-    //
-
-    /// Default constructor
-    Params() = default;
-
-    /// Constructor
-    Params(
-      Arguments const &args,  /// GEMM application arguments
-      int device_sms,         /// Number of SMs on the device
-      int sm_occupancy)       /// Kernel SM occupancy (in thread blocks)
-    :
-      ParamsBase(args, device_sms, sm_occupancy),
-      ptr_M(args.ptr_M),
-      ptr_N(args.ptr_N),
-      ptr_K(args.ptr_K),
-      params_A_real(args.lda_real),
-      params_A_imag(args.lda_imag),
-      params_B_real(args.ldb_real),
-      params_B_imag(args.ldb_imag),
-      params_C_real(args.ldc_real),
-      params_C_imag(args.ldc_imag),
-      params_D_real(args.ldd_real),
-      params_D_imag(args.ldd_imag),
-      output_op(args.epilogue),
-      ptr_A_real(args.ptr_A_real),
-      ptr_A_imag(args.ptr_A_imag),
-      ptr_B_real(args.ptr_B_real),
-      ptr_B_imag(args.ptr_B_imag),
-      ptr_C_real(args.ptr_C_real),
-      ptr_C_imag(args.ptr_C_imag),
-      ptr_D_real(args.ptr_D_real),
-      ptr_D_imag(args.ptr_D_imag)
-    {}
-
-    /// Lightweight update given a subset of arguments.
-    void update(Arguments const &args)
-    {
-      ptr_M = args.ptr_M;
-      ptr_N = args.ptr_N;
-      ptr_K = args.ptr_K;
-
-      ptr_A_real = args.ptr_A_real;
-      ptr_A_imag = args.ptr_A_imag;
-
-      ptr_B_real = args.ptr_B_real;
-      ptr_B_imag = args.ptr_B_imag;
-
-      ptr_C_real = args.ptr_C_real;
-      ptr_C_imag = args.ptr_C_imag;
-
-      ptr_D_real = args.ptr_D_real;
-      ptr_D_imag = args.ptr_D_imag;
-
-      output_op = args.epilogue;
-    }
-  };
-
-
-  /// Shared memory storage structure
-  union SharedStorage {
-    typename Mma::SharedStorage main_loop;
-    typename Epilogue::SharedStorage epilogue;
-  };
-
-public:
-
-  //
-  // Host dispatch API
-  //
-
-  /// Determines whether kernel satisfies alignment
-  static Status can_implement(Arguments const &args) {
-
-    static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
-    static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
-    static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
-
-    bool isAMisaligned = false;
-    bool isBMisaligned = false;
-    bool isCMisaligned = false;
-
-    if (platform::is_same<LayoutA, layout::RowMajor>::value) {
-      isAMisaligned = args.problem_size.k() % kAlignmentA;
-    } else if (platform::is_same<LayoutA, layout::ColumnMajor>::value) {
-      isAMisaligned = args.problem_size.m() % kAlignmentA;
-    }
-
-    if (platform::is_same<LayoutB, layout::RowMajor>::value) {
-      isBMisaligned = args.problem_size.n() % kAlignmentB;
-    } else if (platform::is_same<LayoutB, layout::ColumnMajor>::value) {
-      isBMisaligned = args.problem_size.k() % kAlignmentB;
-    }
-
-    if (platform::is_same<LayoutC, layout::RowMajor>::value) {
-      isCMisaligned = args.problem_size.n() % kAlignmentC;
-    } else if (platform::is_same<LayoutC, layout::ColumnMajor>::value) {
-      isCMisaligned = args.problem_size.m() % kAlignmentC;
-    }
-
-    if (isAMisaligned || isBMisaligned || isCMisaligned) {
-      return Status::kErrorMisalignedOperand;
-    }
-
-    return Status::kSuccess;
-  }
-
-
-public:
-
-  //
-  // Device-only API
-  //
-
-  // Factory invocation
-  CUTLASS_DEVICE
-  static void invoke(
-    Params const &params,
-    SharedStorage &shared_storage)
-  {
-    GemmPlanarComplexArray op;
-    op(params, shared_storage);
-  }
-
-
-  /// Executes one GEMM
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-
-    // Compute threadblock location
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord threadblock_tile_offset =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    // Early exit if CTA is out of range
-    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
-      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
-
-      return;
-    }
-
-    int batch_idx = threadblock_tile_offset.k();
-
-    int problem_size_m = params.problem_size.m();
-    int problem_size_n = params.problem_size.n();
-    int problem_size_k = params.problem_size.k();
-
-    ElementA *ptr_A_real = static_cast<ElementA *>(const_cast<void *>(params.ptr_A_real[batch_idx]));
-    ElementA *ptr_A_imag = static_cast<ElementA *>(const_cast<void *>(params.ptr_A_imag[batch_idx]));
-
-    ElementB *ptr_B_real = static_cast<ElementB *>(const_cast<void *>(params.ptr_B_real[batch_idx]));
-    ElementB *ptr_B_imag = static_cast<ElementB *>(const_cast<void *>(params.ptr_B_imag[batch_idx]));
-
-    //
-    // If pointers for problem sizes are specified, these are loaded from global memory
-    //
-
-    if (params.ptr_M) {
-      problem_size_m = params.ptr_M[batch_idx];
-    }
-
-    if (params.ptr_N) {
-      problem_size_n = params.ptr_N[batch_idx];
-    }
-
-    if (params.ptr_K) {
-      problem_size_k = params.ptr_K[batch_idx];
-    }
-
-    int const kBlockCountM = (problem_size_m + Mma::Shape::kM - 1) / Mma::Shape::kM;
-    int const kBlockCountN = (problem_size_n + Mma::Shape::kN - 1) / Mma::Shape::kN;
-        
-    int const kGemmKIterations = (problem_size_k + Mma::Shape::kK - 1) / Mma::Shape::kK;
-
-    //
-    // Each threadblock loops over the logical problem size which the kernel may have discovered
-    // after the grid is launched.
-    //
-
-    CUTLASS_PRAGMA_NO_UNROLL
-    for (int block_m = threadblock_tile_offset.m(); 
-      block_m < kBlockCountM; 
-      block_m += params.grid_tiled_shape.m()) {
-
-      CUTLASS_PRAGMA_NO_UNROLL
-      for (int block_n = threadblock_tile_offset.n(); 
-        block_n < kBlockCountN; 
-        block_n += params.grid_tiled_shape.n()) {
-
-        //
-        // Compute indices within threadblock and warp.
-        //
-        int thread_idx = threadIdx.x;
-
-        // Broadcast the warp_id computed by lane 0 to ensure dependent code
-        // is compiled as warp-uniform.
-        int warp_idx = canonical_warp_idx_sync();
-        int lane_idx = threadIdx.x % 32;
-    
-        //
-        // Proceed with regular GEMM logic.
-        //
-
-        // Compute initial location in logical coordinates
-        cutlass::MatrixCoord tb_offset_A{ block_m * Mma::Shape::kM, 0};
-        cutlass::MatrixCoord tb_offset_B{ 0, block_n * Mma::Shape::kN };
-
-        // Construct iterators to A and B operands
-        typename Mma::IteratorA iterator_A_real(
-          params.params_A_real,
-          ptr_A_real,
-          {problem_size_m, problem_size_k},
-          thread_idx,
-          tb_offset_A);
-
-        typename Mma::IteratorA iterator_A_imag(
-          params.params_A_imag,
-          ptr_A_imag,
-          {problem_size_m, problem_size_k},
-          thread_idx,
-          tb_offset_A);
-
-        typename Mma::IteratorB iterator_B_real(
-          params.params_B_real,
-          ptr_B_real,
-          {problem_size_k, problem_size_n},
-          thread_idx,
-          tb_offset_B);
-  
-        typename Mma::IteratorB iterator_B_imag(
-          params.params_B_imag,
-          ptr_B_imag,
-          {problem_size_k, problem_size_n},
-          thread_idx,
-          tb_offset_B);
-
-        //
-        // Main loop
-        //
-
-        // Construct thread-scoped matrix multiply
-        Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
-
-        typename Mma::FragmentC accumulators;
-
-        accumulators.clear();
-
-        // Compute threadblock-scoped matrix multiply-add
-        mma(
-          kGemmKIterations, 
-          accumulators, 
-          iterator_A_real,
-          iterator_A_imag,
-          iterator_B_real, 
-          iterator_B_imag, 
-          accumulators);
-
-        //
-        // Epilogue
-        //
-
-        EpilogueOutputOp output_op(params.output_op);
-
-        //
-        // Masked tile iterators constructed from members
-        //
-
-        //assume identity swizzle
-        MatrixCoord threadblock_offset(
-          block_m * Mma::Shape::kM,
-          block_n * Mma::Shape::kN
-        );
-
-        ElementC *ptr_C_real = static_cast<ElementC *>(const_cast<void *>(params.ptr_C_real[batch_idx]));
-        ElementC *ptr_C_imag = static_cast<ElementC *>(const_cast<void *>(params.ptr_C_imag[batch_idx]));
-        ElementC *ptr_D_real = static_cast<ElementC *>(params.ptr_D_real[batch_idx]);
-        ElementC *ptr_D_imag = static_cast<ElementC *>(params.ptr_D_imag[batch_idx]);
-
-        // Tile iterator loading from source tensor.
-        typename Epilogue::OutputTileIterator iterator_C_real(
-          params.params_C_real,
-          ptr_C_real,
-          {problem_size_m, problem_size_n},
-          thread_idx,
-          threadblock_offset
-        );
-
-        typename Epilogue::OutputTileIterator iterator_C_imag(
-          params.params_C_imag,
-          ptr_C_imag,
-          {problem_size_m, problem_size_n},
-          thread_idx,
-          threadblock_offset
-        );
-
-        // Tile iterator writing to destination tensor.
-        typename Epilogue::OutputTileIterator iterator_D_real(
-          params.params_D_real,
-          ptr_D_real,
-          {problem_size_m, problem_size_n},
-          thread_idx,
-          threadblock_offset
-        );
-
-        typename Epilogue::OutputTileIterator iterator_D_imag(
-          params.params_D_imag,
-          ptr_D_imag,
-          {problem_size_m, problem_size_n},
-          thread_idx,
-          threadblock_offset
-        );
-
-        //
-        // Construct epilogue
-        //
-
-        Epilogue epilogue(
-          shared_storage.epilogue, 
-          thread_idx, 
-          warp_idx, 
-          lane_idx);
-
-        // Execute the epilogue operator to update the destination tensor.
-        epilogue(
-          output_op, 
-          iterator_D_real, 
-          iterator_D_imag, 
-          accumulators, 
-          iterator_C_real,
-          iterator_C_imag); 
-
-
-      } // for block_n
-    } // for block_m
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_sparse_universal.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_sparse_universal.h
deleted file mode 100755
index c5420c72d..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_sparse_universal.h
+++ /dev/null
@@ -1,804 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/arch/arch.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/complex.h"
-#include "cutlass/semaphore.h"
-
-#include "cutlass/layout/matrix.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/kernel/params_universal_base.h"
-
-#include "cutlass/trace.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-namespace detail {
-
-template <
-  typename LayoutA,
-  typename LayoutB,
-  typename LayoutC,
-  typename LayoutE
->
-struct SparseUniversalArgumentsBase : UniversalArgumentsBase {
-  //
-  // Data members
-  //
-
-  void const * ptr_A;
-  void const * ptr_B;
-  void const * ptr_C;
-  void * ptr_D;
-  void const * ptr_E;
-
-  int64_t batch_stride_A;
-  int64_t batch_stride_B;
-  int64_t batch_stride_C;
-  int64_t batch_stride_E;
-
-  typename LayoutA::Stride::LongIndex lda;
-  typename LayoutB::Stride::LongIndex ldb;
-  typename LayoutC::Stride::LongIndex ldc;
-  typename LayoutC::Stride::LongIndex ldd;
-  typename LayoutE::Stride::LongIndex lde;
-
-  //
-  // Methods
-  //
-
-  SparseUniversalArgumentsBase():
-    ptr_A(nullptr), ptr_B(nullptr), ptr_C(nullptr), ptr_D(nullptr), ptr_E(nullptr)
-  {}
-
-  /// constructs an arguments structure
-  SparseUniversalArgumentsBase(
-    GemmUniversalMode mode,
-    GemmCoord problem_size,
-    int batch_count,
-    void const * ptr_A,
-    void const * ptr_B,
-    void const * ptr_C,
-    void * ptr_D,
-    void const * ptr_E,
-    int64_t batch_stride_A,
-    int64_t batch_stride_B,
-    int64_t batch_stride_C,
-    int64_t batch_stride_D,
-    int64_t batch_stride_E,
-    typename LayoutA::Stride::LongIndex lda,
-    typename LayoutB::Stride::LongIndex ldb,
-    typename LayoutC::Stride::LongIndex ldc,
-    typename LayoutC::Stride::LongIndex ldd,
-    typename LayoutC::Stride::LongIndex lde)
-  :
-    UniversalArgumentsBase(mode, problem_size, batch_count, batch_stride_D),
-    ptr_A(ptr_A), ptr_B(ptr_B), ptr_C(ptr_C), ptr_D(ptr_D), ptr_E(ptr_E),
-    batch_stride_A(batch_stride_A), batch_stride_B(batch_stride_B), batch_stride_C(batch_stride_C),
-    batch_stride_E(batch_stride_E),
-    lda(lda), ldb(ldb), ldc(ldc), ldd(ldd), lde(lde)
-  {
-    CUTLASS_TRACE_HOST("SparseUniversalArgumentsBase::Arguments() - problem_size: " << problem_size);
-  }
-};
-
-template <
-  typename Mma,
-  typename Epilogue,
-  typename Arguments,
-  typename ThreadblockSwizzle,
-  typename ThreadblockShape,
-  typename ElementA,
-  typename ElementB,
-  typename ElementC,
-  typename LayoutA,
-  typename LayoutB
->
-struct SparseUniversalParamsBase : UniversalParamsBase<
-  ThreadblockSwizzle,
-  ThreadblockShape,
-  ElementA,
-  ElementB,
-  ElementC,
-  LayoutA,
-  LayoutB> {
-  using ParamsBase = UniversalParamsBase<
-    ThreadblockSwizzle,
-    ThreadblockShape,
-    ElementA,
-    ElementB,
-    ElementC,
-    LayoutA,
-    LayoutB>;
-
-  //
-  // Data members
-  //
-
-  typename Mma::IteratorA::Params params_A;
-  typename Mma::IteratorB::Params params_B;
-  typename Epilogue::OutputTileIterator::Params params_C;
-  typename Epilogue::OutputTileIterator::Params params_D;
-  typename Mma::IteratorE::Params params_E;
-
-  void * ptr_A;
-  void * ptr_B;
-  void * ptr_C;
-  void * ptr_D;
-  void * ptr_E;
-
-  int64_t batch_stride_A;
-  int64_t batch_stride_B;
-  int64_t batch_stride_C;
-  int64_t batch_stride_E;
-
-  //
-  // Host dispatch API
-  //
-
-  /// Default constructor
-  SparseUniversalParamsBase() = default;
-
-  /// Constructor
-  SparseUniversalParamsBase(
-    Arguments const &args,  /// GEMM application arguments
-    int device_sms,         /// Number of SMs on the device
-    int sm_occupancy)       /// Kernel SM occupancy (in thread blocks)
-  :
-    ParamsBase(args, device_sms, sm_occupancy),
-    params_A(args.lda),
-    params_B(args.ldb),
-    params_C(args.ldc),
-    params_D(args.ldd),
-    params_E(args.lde),
-    ptr_A(const_cast<void *>(args.ptr_A)),
-    ptr_B(const_cast<void *>(args.ptr_B)),
-    ptr_C(const_cast<void *>(args.ptr_C)),
-    ptr_D(args.ptr_D),
-    ptr_E(const_cast<void *>(args.ptr_E)),
-    batch_stride_A(args.batch_stride_A),
-    batch_stride_B(args.batch_stride_B),
-    batch_stride_C(args.batch_stride_C),
-    batch_stride_E(args.batch_stride_E)
-  {}
-
-  /// Lightweight update given a subset of arguments.
-  void update(Arguments const &args)
-  {
-    CUTLASS_TRACE_HOST("SparseUniversalParamsBase::update()");
-
-    // Update input/output pointers
-    this->ptr_A = const_cast<void *>(args.ptr_A);
-    this->ptr_B = const_cast<void *>(args.ptr_B);
-    this->ptr_C = const_cast<void *>(args.ptr_C);
-    this->ptr_D = args.ptr_D;
-    this->ptr_E = const_cast<void *>(args.ptr_E);
-
-    this->batch_stride_A = args.batch_stride_A;
-    this->batch_stride_B = args.batch_stride_B;
-    this->batch_stride_C = args.batch_stride_C;
-    this->batch_stride_D = args.batch_stride_D;
-    this->batch_stride_E = args.batch_stride_E;
-  }
-};
-
-} // namespace detail
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate
-  typename Epilogue_,             ///! Epilogue
-  typename ThreadblockSwizzle_    ///! Threadblock swizzling function
->
-class GemmSparseUniversal {
-public:
-
-  using Mma = Mma_;
-  using Epilogue = Epilogue_;
-  using EpilogueOutputOp = typename Epilogue::OutputOp;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-
-  static int const kSparse = Mma::kSparse;
-  static int const kMetaSizeInBits = Mma::kMetaSizeInBits;
-  static int const kMaxID2 = Mma::kMaxID2;
-  static int const kElementsPerElementE = Mma::kElementsPerElementE;
-
-  using ElementE = typename Mma::ElementE;
-  using LayoutE = typename Mma::LayoutE;
-
-  using ElementA = typename Mma::IteratorA::Element;
-  using LayoutA = typename Mma::IteratorA::Layout;
-  using ElementB = typename Mma::IteratorB::Element;
-  using LayoutB = typename Mma::IteratorB::Layout;
-  using ElementC = typename Epilogue::OutputTileIterator::Element;
-  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
-
-  static ComplexTransform const kTransformA = Mma::kTransformA;
-  static ComplexTransform const kTransformB = Mma::kTransformB;
-  using Operator = typename Mma::Operator;
-
-  using OperatorClass = typename Mma::Operator::OperatorClass;
-  using ThreadblockShape = typename Mma::Shape;
-  using WarpShape = typename Mma::Operator::Shape;
-  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
-  using ArchTag = typename Mma::ArchTag;
-
-  static int const kStages = Mma::kStages;
-  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
-  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
-  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
-
-  /// Warp count (concept: GemmShape)
-  using WarpCount = typename Mma::WarpCount;
-  static int const kThreadCount = 32 * WarpCount::kCount;
-
-  /// Split-K preserves splits that are 128b aligned
-  static int const kSplitKAlignment = const_max(128 / sizeof_bits<ElementA>::value, 128 / sizeof_bits<ElementB>::value);
-
-  //
-  // Structures
-  //
-
-  /// Argument structure
-  struct Arguments : detail::SparseUniversalArgumentsBase<
-      LayoutA,
-      LayoutB,
-      LayoutC,
-      LayoutE
-    > {
-    using Base = detail::SparseUniversalArgumentsBase<
-      LayoutA,
-      LayoutB,
-      LayoutC,
-      LayoutE
-    >;
-
-    typename EpilogueOutputOp::Params epilogue;
-
-    Arguments() {}
-
-    /// constructs an arguments structure
-    Arguments(
-      GemmUniversalMode mode,
-      GemmCoord problem_size,
-      int batch_count,
-      typename EpilogueOutputOp::Params epilogue,
-      void const * ptr_A,
-      void const * ptr_B,
-      void const * ptr_C,
-      void * ptr_D,
-      void const * ptr_E,
-      int64_t batch_stride_A,
-      int64_t batch_stride_B,
-      int64_t batch_stride_C,
-      int64_t batch_stride_D,
-      int64_t batch_stride_E,
-      typename LayoutA::Stride::LongIndex lda,
-      typename LayoutB::Stride::LongIndex ldb,
-      typename LayoutC::Stride::LongIndex ldc,
-      typename LayoutC::Stride::LongIndex ldd,
-      typename LayoutC::Stride::LongIndex lde)
-    :
-      Base(
-        mode, problem_size, batch_count,
-        ptr_A, ptr_B, ptr_C, ptr_D, ptr_E,
-        batch_stride_A, batch_stride_B, batch_stride_C, batch_stride_D, batch_stride_E,
-        lda, ldb, ldc, ldd, lde
-      ),
-      epilogue(epilogue)
-    {
-      CUTLASS_TRACE_HOST("GemmUniversal::Arguments::Arguments() - problem_size: " << problem_size);
-    }
-  };
-
-
-  //
-  // Structure for precomputing values in host memory and passing to kernels
-  //
-
-  /// Parameters structure
-  struct Params : detail::SparseUniversalParamsBase<
-    Mma,
-    Epilogue,
-    Arguments,
-    ThreadblockSwizzle,
-    ThreadblockShape,
-    ElementA,
-    ElementB,
-    ElementC,
-    LayoutA,
-    LayoutB>
-  {
-    using ParamsBase = detail::SparseUniversalParamsBase<
-      Mma,
-      Epilogue,
-      Arguments,
-      ThreadblockSwizzle,
-      ThreadblockShape,
-      ElementA,
-      ElementB,
-      ElementC,
-      LayoutA,
-      LayoutB>;
-
-    typename EpilogueOutputOp::Params output_op;
-
-    //
-    // Host dispatch API
-    //
-
-    /// Default constructor
-    Params() = default;
-
-    /// Constructor
-    Params(
-      Arguments const &args,  /// GEMM application arguments
-      int device_sms,         /// Number of SMs on the device
-      int sm_occupancy)       /// Kernel SM occupancy (in thread blocks)
-    :
-      ParamsBase(args, device_sms, sm_occupancy),
-      output_op(args.epilogue)
-    {}
-
-    /// Lightweight update given a subset of arguments.
-    void update(Arguments const &args)
-    {
-      CUTLASS_TRACE_HOST("GemmUniversal::Params::update()");
-
-      // Update input/output pointers
-      this->ptr_A = const_cast<void *>(args.ptr_A);
-      this->ptr_B = const_cast<void *>(args.ptr_B);
-      this->ptr_C = const_cast<void *>(args.ptr_C);
-      this->ptr_D = args.ptr_D;
-      this->ptr_E = const_cast<void *>(args.ptr_E);
-
-      this->batch_stride_A = args.batch_stride_A;
-      this->batch_stride_B = args.batch_stride_B;
-      this->batch_stride_C = args.batch_stride_C;
-      this->batch_stride_D = args.batch_stride_D;
-      this->batch_stride_E = args.batch_stride_E;
-
-      output_op = args.epilogue;
-    }
-  };
-
-  /// Shared memory storage structure
-  union SharedStorage {
-    typename Mma::SharedStorage main_loop;
-    typename Epilogue::SharedStorage epilogue;
-  };
-
-
-public:
-
-  //
-  // Host dispatch API
-  //
-
-  /// Determines whether kernel satisfies alignment
-  static Status can_implement(
-    cutlass::gemm::GemmCoord const & problem_size,
-    GemmUniversalMode mode,
-    int split_k_count)
-  {
-    CUTLASS_TRACE_HOST("GemmUniversal::can_implement()");
-
-    static int const kAlignmentA = (cute::is_same<LayoutA,
-                                                      layout::ColumnMajorInterleaved<32>>::value)
-                                   ? 32
-                                   : (cute::is_same<LayoutA,
-                                                        layout::ColumnMajorInterleaved<64>>::value)
-                                     ? 64
-                                     : Mma::IteratorA::AccessType::kElements;
-    static int const kAlignmentB = (cute::is_same<LayoutB,
-                                                      layout::RowMajorInterleaved<32>>::value)
-                                   ? 32
-                                   : (cute::is_same<LayoutB,
-                                                        layout::RowMajorInterleaved<64>>::value)
-                                     ? 64
-                                     : Mma::IteratorB::AccessType::kElements;
-    static int const kAlignmentC = (cute::is_same<LayoutC,
-                                                      layout::ColumnMajorInterleaved<32>>::value)
-                                   ? 32
-                                   : (cute::is_same<LayoutC,
-                                                        layout::ColumnMajorInterleaved<64>>::value)
-                                     ? 64
-                                     : Epilogue::OutputTileIterator::kElementsPerAccess;
-
-    static int const kAlignmentE = Mma::IteratorE::AccessType::kElements;
-
-    bool isAMisaligned = false;
-    bool isBMisaligned = false;
-    bool isCMisaligned = false;
-    bool isEMisaligned = false;
-
-    if (cute::is_same<LayoutA, layout::RowMajor>::value) {
-      isAMisaligned = (problem_size.k() / kSparse) % kAlignmentA;
-    } else if (cute::is_same<LayoutA, layout::ColumnMajor>::value) {
-      isAMisaligned = problem_size.m() % kAlignmentA;
-    } else if (cute::is_same<LayoutA, layout::ColumnMajorInterleaved<32>>::value
-            || cute::is_same<LayoutA, layout::ColumnMajorInterleaved<64>>::value) {
-      isAMisaligned = (problem_size.k() / kSparse) % kAlignmentA;
-    }
-
-    if (cute::is_same<LayoutB, layout::RowMajor>::value) {
-      isBMisaligned = problem_size.n() % kAlignmentB;
-    } else if (cute::is_same<LayoutB, layout::ColumnMajor>::value) {
-      isBMisaligned = (problem_size.k() / kSparse) % kAlignmentB;
-    } else if (cute::is_same<LayoutB, layout::RowMajorInterleaved<32>>::value
-            || cute::is_same<LayoutB, layout::RowMajorInterleaved<64>>::value) {
-      isBMisaligned = (problem_size.k() / kSparse) % kAlignmentB;
-    }
-
-    if (cute::is_same<LayoutC, layout::RowMajor>::value) {
-      isCMisaligned = problem_size.n() % kAlignmentC;
-    } else if (cute::is_same<LayoutC, layout::ColumnMajor>::value) {
-      isCMisaligned = problem_size.m() % kAlignmentC;
-    } else if (cute::is_same<LayoutC, layout::ColumnMajorInterleaved<32>>::value
-            || cute::is_same<LayoutC, layout::ColumnMajorInterleaved<64>>::value) {
-      isCMisaligned = problem_size.n() % kAlignmentC;
-    }
-
-    isEMisaligned = (problem_size.m() % kAlignmentE)
-                  || ((problem_size.k() / kSparse) % kAlignmentE);
-
-    // The k dimension has to be the multiple of the Threadblock k because out
-    // of bound meta data would be initialized to 0 by acync.zfill but 0 is not
-    // a valid meta data.
-    if (problem_size.k() % Mma::Shape::kK) {
-      isEMisaligned = true;
-    }
-
-    if (mode == GemmUniversalMode::kGemm
-     || mode == GemmUniversalMode::kGemmSplitKParallel) {
-      if ((problem_size.k() / split_k_count) % Mma::Shape::kK) {
-        isEMisaligned = true;
-      }
-    }
-
-    // M dimension has to be multiple of 32 (sparse float) or 16 (sparse int) 
-    // because of the row reordering of operand E
-    static int const kAlignmentM = (sizeof(ElementE) == 2) ? 32 : 16;
-
-    if (problem_size.m() % kAlignmentM) {
-      isEMisaligned = true;
-    }
-
-    if (isAMisaligned) {
-      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for A operand");
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (isBMisaligned) {
-      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for B operand");
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (isCMisaligned) {
-      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for C operand");
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (isEMisaligned) {
-      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for E operand");
-      return Status::kErrorMisalignedOperand;
-    }
-
-    CUTLASS_TRACE_HOST("  returning kSuccess");
-
-    return Status::kSuccess;
-  }
-
-  static Status can_implement(Arguments const &args) {
-    return can_implement(args.problem_size, args.mode, args.batch_count);
-  }
-
-public:
-
-  //
-  // Device-only API
-  //
-
-  // Factory invocation
-  CUTLASS_DEVICE
-  static void invoke(
-    Params const &params,
-    SharedStorage &shared_storage)
-  {
-    GemmSparseUniversal op;
-    op(params, shared_storage);
-  }
-
-
-  /// Executes one GEMM
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-    ThreadblockSwizzle threadblock_swizzle;
-    run_with_swizzle(params, shared_storage, threadblock_swizzle);
-  }
-
-  /// Executes one GEMM with an externally-provided swizzling function
-  CUTLASS_DEVICE
-  void run_with_swizzle(Params const &params, SharedStorage &shared_storage, ThreadblockSwizzle& threadblock_swizzle) {
-
-    cutlass::gemm::GemmCoord threadblock_tile_offset =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    // Early exit if CTA is out of range
-    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
-      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
-
-      return;
-    }
-
-    int offset_k = 0;
-    int problem_size_k = params.problem_size.k();
-
-    ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A);
-    ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);
-    ElementE *ptr_E = static_cast<ElementE *>(params.ptr_E);
-
-    //
-    // Fetch pointers based on mode.
-    //
-    if (params.mode == GemmUniversalMode::kGemm ||
-      params.mode == GemmUniversalMode::kGemmSplitKParallel) {
-
-      if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {
-
-        problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size;
-      }
-
-      offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
-    }
-    else if (params.mode == GemmUniversalMode::kBatched) {
-      ptr_A += threadblock_tile_offset.k() * params.batch_stride_A / kSparse;
-      ptr_B += threadblock_tile_offset.k() * params.batch_stride_B;
-      ptr_E += threadblock_tile_offset.k() * params.batch_stride_E / kSparse;
-    }
-    else if (params.mode == GemmUniversalMode::kArray) {
-      ptr_A = static_cast<ElementA * const *>(params.ptr_A)[threadblock_tile_offset.k()];
-      ptr_B = static_cast<ElementB * const *>(params.ptr_B)[threadblock_tile_offset.k()];
-      ptr_E = static_cast<ElementE * const *>(params.ptr_E)[threadblock_tile_offset.k()];
-    }
-
-    __syncthreads();
-
-    // Compute initial location in logical coordinates
-    cutlass::MatrixCoord tb_offset_A{
-      threadblock_tile_offset.m() * Mma::Shape::kM,
-      offset_k / kSparse,
-    };
-
-    cutlass::MatrixCoord tb_offset_B{
-      offset_k,
-      threadblock_tile_offset.n() * Mma::Shape::kN
-    };
-
-    cutlass::MatrixCoord tb_offset_E{
-      threadblock_tile_offset.m() * Mma::Shape::kM,
-      offset_k / kSparse / kElementsPerElementE,
-    };
-
-    // Compute position within threadblock
-    int thread_idx = threadIdx.x;
-
-    // Construct iterators to A and B operands
-    typename Mma::IteratorA iterator_A(
-      params.params_A,
-      ptr_A,
-      {params.problem_size.m(), problem_size_k / kSparse},
-      thread_idx,
-      tb_offset_A);
-
-    typename Mma::IteratorB iterator_B(
-      params.params_B,
-      ptr_B,
-      {problem_size_k, params.problem_size.n()},
-      thread_idx,
-      tb_offset_B);
-
-    typename Mma::IteratorE iterator_E(
-      params.params_E,
-      ptr_E,
-      {params.problem_size.m(), problem_size_k / kSparse / kElementsPerElementE},
-      thread_idx,
-      tb_offset_E);
-
-    // Broadcast the warp_id computed by lane 0 to ensure dependent code
-    // is compiled as warp-uniform.
-    int warp_idx = canonical_warp_idx_sync();
-
-    int lane_idx = threadIdx.x % 32;
-
-    //
-    // Main loop
-    //
-
-    // Construct thread-scoped matrix multiply
-    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
-
-    typename Mma::FragmentC accumulators;
-
-    accumulators.clear();
-
-    // Compute threadblock-scoped matrix multiply-add
-    int gemm_k_iterations = (problem_size_k - offset_k + Mma::Shape::kK - 1) / Mma::Shape::kK;
-
-    // Compute threadblock-scoped matrix multiply-add
-    mma(
-      gemm_k_iterations,
-      accumulators,
-      iterator_A,
-      iterator_B,
-      iterator_E,
-      accumulators);
-
-    //
-    // Epilogue
-    //
-
-    EpilogueOutputOp output_op(params.output_op);
-
-    //
-    // Masked tile iterators constructed from members
-    //
-
-    threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    //assume identity swizzle
-    MatrixCoord threadblock_offset(
-      threadblock_tile_offset.m() * Mma::Shape::kM,
-      threadblock_tile_offset.n() * Mma::Shape::kN
-    );
-
-    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
-
-    ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C);
-    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
-
-    //
-    // Fetch pointers based on mode.
-    //
-
-    // Construct the semaphore.
-    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
-
-    if (params.mode == GemmUniversalMode::kGemm) {
-
-      // If performing a reduction via split-K, fetch the initial synchronization
-      if (params.grid_tiled_shape.k() > 1) {
-
-        // Fetch the synchronization lock initially but do not block.
-        semaphore.fetch();
-
-        // Indicate which position in a serial reduction the output operator is currently updating
-        output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
-      }
-    }
-    else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) {
-      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
-    }
-    else if (params.mode == GemmUniversalMode::kBatched) {
-      ptr_C += threadblock_tile_offset.k() * params.batch_stride_C;
-      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
-    }
-    else if (params.mode == GemmUniversalMode::kArray) {
-      ptr_C = static_cast<ElementC * const *>(params.ptr_C)[threadblock_tile_offset.k()];
-      ptr_D = static_cast<ElementC * const *>(params.ptr_D)[threadblock_tile_offset.k()];
-    }
-
-    // Tile iterator loading from source tensor.
-    typename Epilogue::OutputTileIterator iterator_C(
-      params.params_C,
-      ptr_C,
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset
-    );
-
-    // Tile iterator writing to destination tensor.
-    typename Epilogue::OutputTileIterator iterator_D(
-      params.params_D,
-      ptr_D,
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset
-    );
-
-    Epilogue epilogue(
-      shared_storage.epilogue,
-      thread_idx,
-      warp_idx,
-      lane_idx);
-
-    // Wait on the semaphore - this latency may have been covered by iterator construction
-    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
-
-      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
-      if (threadblock_tile_offset.k()) {
-        iterator_C = iterator_D;
-      }
-
-      semaphore.wait(threadblock_tile_offset.k());
-    }
-
-
-    // Execute the epilogue operator to update the destination tensor.
-    epilogue(
-      output_op,
-      iterator_D,
-      accumulators,
-      iterator_C);
-
-    //
-    // Release the semaphore
-    //
-
-    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
-
-      int lock = 0;
-      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
-
-        // The final threadblock resets the semaphore for subsequent grids.
-        lock = 0;
-      }
-      else {
-        // Otherwise, the semaphore is incremented
-        lock = threadblock_tile_offset.k() + 1;
-      }
-
-      semaphore.release(lock);
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_sparse_universal_with_absmax.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_sparse_universal_with_absmax.h
deleted file mode 100755
index 47b76a171..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_sparse_universal_with_absmax.h
+++ /dev/null
@@ -1,609 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/arch/arch.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/complex.h"
-#include "cutlass/semaphore.h"
-
-#include "cutlass/layout/matrix.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/kernel/params_universal_base.h"
-#include "cutlass/gemm/kernel/gemm_sparse_universal.h"
-
-#include "cutlass/trace.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate
-  typename Epilogue_,             ///! Epilogue
-  typename ThreadblockSwizzle_    ///! Threadblock swizzling function
->
-class GemmSparseUniversalWithAbsmax {
-public:
-  using Base = GemmSparseUniversal<Mma_, Epilogue_, ThreadblockSwizzle_>;
-
-  using Mma = Mma_;
-  using Epilogue = Epilogue_;
-  using EpilogueOutputOp = typename Epilogue::OutputOp;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-
-  static int const kSparse = Mma::kSparse;
-  static int const kMetaSizeInBits = Mma::kMetaSizeInBits;
-  static int const kMaxID2 = Mma::kMaxID2;
-  static int const kElementsPerElementE = Mma::kElementsPerElementE;
-
-  using ElementE = typename Mma::ElementE;
-  using LayoutE = typename Mma::LayoutE;
-
-  using ElementA = typename Mma::IteratorA::Element;
-  using LayoutA = typename Mma::IteratorA::Layout;
-  using ElementB = typename Mma::IteratorB::Element;
-  using LayoutB = typename Mma::IteratorB::Layout;
-  using ElementC = typename Epilogue::OutputTileIterator::Element;
-  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
-  using ElementAux = typename Epilogue::AuxOutputTileIterator::Element;
-  using LayoutAux = typename Epilogue::AuxOutputTileIterator::Layout;
-  using ElementVector = typename Epilogue::ElementVector;
-
-  static ComplexTransform const kTransformA = Mma::kTransformA;
-  static ComplexTransform const kTransformB = Mma::kTransformB;
-  using Operator = typename Mma::Operator;
-
-  using OperatorClass = typename Mma::Operator::OperatorClass;
-  using ThreadblockShape = typename Mma::Shape;
-  using WarpShape = typename Mma::Operator::Shape;
-  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
-  using ArchTag = typename Mma::ArchTag;
-
-  static int const kStages = Mma::kStages;
-  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
-  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
-  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
-
-  /// Warp count (concept: GemmShape)
-  using WarpCount = typename Mma::WarpCount;
-  static int const kThreadCount = 32 * WarpCount::kCount;
-
-  /// Split-K preserves splits that are 128b aligned
-  static int const kSplitKAlignment = const_max(128 / sizeof_bits<ElementA>::value, 128 / sizeof_bits<ElementB>::value);
-
-  //
-  // Structures
-  //
-
-  /// Argument structure
-  struct Arguments : detail::SparseUniversalArgumentsBase<
-      LayoutA,
-      LayoutB,
-      LayoutC,
-      LayoutE
-    > {
-    using Base = detail::SparseUniversalArgumentsBase<
-      LayoutA,
-      LayoutB,
-      LayoutC,
-      LayoutE
-    >;
-
-    void const* ptr_Aux;
-    void const* ptr_Vector;
-    int64_t batch_stride_Aux;
-    int64_t batch_stride_Vector;
-    typename LayoutAux::Stride::LongIndex ldaux;
-    int64_t ldvector;
-
-    typename EpilogueOutputOp::Params epilogue;
-
-    Arguments() {}
-
-    /// constructs an arguments structure
-    Arguments(
-      GemmUniversalMode mode,
-      GemmCoord problem_size,
-      int batch_count,
-      typename EpilogueOutputOp::Params epilogue,
-      void const * ptr_A,
-      void const * ptr_B,
-      void const * ptr_C,
-      void * ptr_D,
-      void const * ptr_E,
-      void const * ptr_Aux,
-      void const * ptr_Vector,
-      int64_t batch_stride_A,
-      int64_t batch_stride_B,
-      int64_t batch_stride_C,
-      int64_t batch_stride_D,
-      int64_t batch_stride_E,
-      int64_t batch_stride_Aux,
-      int64_t batch_stride_Vector,
-      typename LayoutA::Stride::LongIndex lda,
-      typename LayoutB::Stride::LongIndex ldb,
-      typename LayoutC::Stride::LongIndex ldc,
-      typename LayoutC::Stride::LongIndex ldd,
-      typename LayoutC::Stride::LongIndex lde,
-      typename LayoutAux::Stride::LongIndex ldaux,
-      int64_t ldvector
-      )
-    :
-      Base(
-        mode, problem_size, batch_count,
-        ptr_A, ptr_B, ptr_C, ptr_D, ptr_E,
-        batch_stride_A, batch_stride_B, batch_stride_C, batch_stride_D, batch_stride_E,
-        lda, ldb, ldc, ldd, lde
-      ),
-      ptr_Aux(ptr_Aux),
-      ptr_Vector(ptr_Vector),
-      batch_stride_Aux(batch_stride_Aux),
-      batch_stride_Vector(batch_stride_Vector),
-      ldaux(ldaux),
-      ldvector(ldvector),
-      epilogue(epilogue)
-    { }
-  };
-
-
-  //
-  // Structure for precomputing values in host memory and passing to kernels
-  //
-
-  /// Parameters structure
-  struct Params : detail::SparseUniversalParamsBase<
-    Mma,
-    Epilogue,
-    Arguments,
-    ThreadblockSwizzle,
-    ThreadblockShape,
-    ElementA,
-    ElementB,
-    ElementC,
-    LayoutA,
-    LayoutB>
-  {
-    using ParamsBase = detail::SparseUniversalParamsBase<
-      Mma,
-      Epilogue,
-      Arguments,
-      ThreadblockSwizzle,
-      ThreadblockShape,
-      ElementA,
-      ElementB,
-      ElementC,
-      LayoutA,
-      LayoutB>;
-
-    typename Epilogue::AuxOutputTileIterator::Params params_Aux;
-    int64_t ldvector;
-
-    void* ptr_Aux;
-    void* ptr_Vector;
-
-    int64_t batch_stride_Aux;
-    int64_t batch_stride_Vector;
-    typename EpilogueOutputOp::Params output_op;
-
-    //
-    // Host dispatch API
-    //
-
-    /// Default constructor
-    Params() = default;
-
-    /// Constructor
-    Params(
-      Arguments const &args,  /// GEMM application arguments
-      int device_sms,         /// Number of SMs on the device
-      int sm_occupancy)       /// Kernel SM occupancy (in thread blocks)
-    :
-      ParamsBase(args, device_sms, sm_occupancy),
-      params_Aux(args.ldaux),
-      ldvector(args.ldvector),
-      ptr_Aux(const_cast<void *>(args.ptr_Aux)),
-      ptr_Vector(const_cast<void *>(args.ptr_Vector)),
-      batch_stride_Aux(args.batch_stride_Aux),
-      batch_stride_Vector(args.batch_stride_Vector),
-      output_op(args.epilogue)
-    {}
-
-    /// Lightweight update given a subset of arguments.
-    void update(Arguments const &args)
-    {
-      CUTLASS_TRACE_HOST("GemmUniversal::Params::update()");
-
-      // Update input/output pointers
-      this->ptr_A = const_cast<void *>(args.ptr_A);
-      this->ptr_B = const_cast<void *>(args.ptr_B);
-      this->ptr_C = const_cast<void *>(args.ptr_C);
-      this->ptr_D = args.ptr_D;
-      this->ptr_E = const_cast<void *>(args.ptr_E);
-      ptr_Aux = const_cast<void *>(args.ptr_Aux);
-      ptr_Vector = const_cast<void *>(args.ptr_Vector);
-
-      this->batch_stride_A = args.batch_stride_A;
-      this->batch_stride_B = args.batch_stride_B;
-      this->batch_stride_C = args.batch_stride_C;
-      this->batch_stride_D = args.batch_stride_D;
-      this->batch_stride_E = args.batch_stride_E;
-      this->batch_stride_Aux = args.batch_stride_Aux;
-      batch_stride_Vector = args.batch_stride_Vector;
-
-      output_op = args.epilogue;
-    }
-  };
-
-  /// Shared memory storage structure
-  union SharedStorage {
-    typename Mma::SharedStorage main_loop;
-    typename Epilogue::SharedStorage epilogue;
-  };
-
-
-public:
-
-  //
-  // Host dispatch API
-  //
-
-  /// Determines whether kernel satisfies alignment
-  static Status can_implement(
-    cutlass::gemm::GemmCoord const & problem_size,
-    GemmUniversalMode mode,
-    int split_k_count) {
-    return Base::can_implement(problem_size, mode, split_k_count);
-  }
-
-  static Status can_implement(Arguments const &args) {
-    return can_implement(args.problem_size, args.mode, args.batch_count);
-  }
-
-public:
-
-  //
-  // Device-only API
-  //
-
-  // Factory invocation
-  CUTLASS_DEVICE
-  static void invoke(
-    Params const &params,
-    SharedStorage &shared_storage)
-  {
-    GemmSparseUniversalWithAbsmax op;
-    op(params, shared_storage);
-  }
-
-
-  /// Executes one GEMM
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-    ThreadblockSwizzle threadblock_swizzle;
-    run_with_swizzle(params, shared_storage, threadblock_swizzle);
-  }
-
-  /// Executes one GEMM with an externally-provided swizzling function
-  CUTLASS_DEVICE
-  void run_with_swizzle(Params const &params, SharedStorage &shared_storage, ThreadblockSwizzle& threadblock_swizzle) {
-
-    cutlass::gemm::GemmCoord threadblock_tile_offset =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    // Early exit if CTA is out of range
-    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
-      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
-
-      return;
-    }
-
-    int offset_k = 0;
-    int problem_size_k = params.problem_size.k();
-
-    ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A);
-    ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);
-    ElementE *ptr_E = static_cast<ElementE *>(params.ptr_E);
-
-    //
-    // Fetch pointers based on mode.
-    //
-    if (params.mode == GemmUniversalMode::kGemm ||
-      params.mode == GemmUniversalMode::kGemmSplitKParallel) {
-
-      if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {
-
-        problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size;
-      }
-
-      offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
-    }
-    else if (params.mode == GemmUniversalMode::kBatched) {
-      ptr_A += threadblock_tile_offset.k() * params.batch_stride_A / kSparse;
-      ptr_B += threadblock_tile_offset.k() * params.batch_stride_B;
-      ptr_E += threadblock_tile_offset.k() * params.batch_stride_E / kSparse;
-    }
-    else if (params.mode == GemmUniversalMode::kArray) {
-      ptr_A = static_cast<ElementA * const *>(params.ptr_A)[threadblock_tile_offset.k()];
-      ptr_B = static_cast<ElementB * const *>(params.ptr_B)[threadblock_tile_offset.k()];
-      ptr_E = static_cast<ElementE * const *>(params.ptr_E)[threadblock_tile_offset.k()];
-    }
-
-    __syncthreads();
-
-    // Compute initial location in logical coordinates
-    cutlass::MatrixCoord tb_offset_A{
-      threadblock_tile_offset.m() * Mma::Shape::kM,
-      offset_k / kSparse,
-    };
-
-    cutlass::MatrixCoord tb_offset_B{
-      offset_k,
-      threadblock_tile_offset.n() * Mma::Shape::kN
-    };
-
-    cutlass::MatrixCoord tb_offset_E{
-      threadblock_tile_offset.m() * Mma::Shape::kM,
-      offset_k / kSparse / kElementsPerElementE,
-    };
-
-    // Compute position within threadblock
-    int thread_idx = threadIdx.x;
-
-    // Construct iterators to A and B operands
-    typename Mma::IteratorA iterator_A(
-      params.params_A,
-      ptr_A,
-      {params.problem_size.m(), problem_size_k / kSparse},
-      thread_idx,
-      tb_offset_A);
-
-    typename Mma::IteratorB iterator_B(
-      params.params_B,
-      ptr_B,
-      {problem_size_k, params.problem_size.n()},
-      thread_idx,
-      tb_offset_B);
-
-    typename Mma::IteratorE iterator_E(
-      params.params_E,
-      ptr_E,
-      {params.problem_size.m(), problem_size_k / kSparse / kElementsPerElementE},
-      thread_idx,
-      tb_offset_E);
-
-    // Broadcast the warp_id computed by lane 0 to ensure dependent code
-    // is compiled as warp-uniform.
-    int warp_idx = canonical_warp_idx_sync();
-
-    int lane_idx = threadIdx.x % 32;
-
-    //
-    // Main loop
-    //
-
-    // Construct thread-scoped matrix multiply
-    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
-
-    typename Mma::FragmentC accumulators;
-
-    accumulators.clear();
-
-    // Compute threadblock-scoped matrix multiply-add
-    int gemm_k_iterations = (problem_size_k - offset_k + Mma::Shape::kK - 1) / Mma::Shape::kK;
-
-    // Compute threadblock-scoped matrix multiply-add
-    mma(
-      gemm_k_iterations,
-      accumulators,
-      iterator_A,
-      iterator_B,
-      iterator_E,
-      accumulators);
-
-    //
-    // Epilogue
-    //
-
-    EpilogueOutputOp output_op(params.output_op);
-
-    //
-    // Masked tile iterators constructed from members
-    //
-
-    threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    //assume identity swizzle
-    MatrixCoord threadblock_offset(
-      threadblock_tile_offset.m() * Mma::Shape::kM,
-      threadblock_tile_offset.n() * Mma::Shape::kN
-    );
-
-    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
-
-    ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C);
-    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
-    ElementAux * ptr_Aux = static_cast<ElementAux *>(params.ptr_Aux);
-    ElementVector * ptr_Vector = static_cast<ElementVector *>(params.ptr_Vector);
-
-    //
-    // Fetch pointers based on mode.
-    //
-
-    // Construct the semaphore.
-    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
-
-    if (params.mode == GemmUniversalMode::kGemm) {
-
-      // If performing a reduction via split-K, fetch the initial synchronization
-      if (params.grid_tiled_shape.k() > 1) {
-
-        // Fetch the synchronization lock initially but do not block.
-        semaphore.fetch();
-
-        // Indicate which position in a serial reduction the output operator is currently updating
-        output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
-      }
-    }
-    else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) {
-      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
-    }
-    else if (params.mode == GemmUniversalMode::kBatched) {
-      ptr_C += threadblock_tile_offset.k() * params.batch_stride_C;
-      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
-      if (ptr_Aux) {
-        ptr_Aux += threadblock_tile_offset.k() * params.batch_stride_Aux;
-      }
-      if (ptr_Vector) {
-        ptr_Vector += threadblock_tile_offset.k() * params.batch_stride_Vector;
-      }
-    }
-    else if (params.mode == GemmUniversalMode::kArray) {
-      ptr_C = static_cast<ElementC * const *>(params.ptr_C)[threadblock_tile_offset.k()];
-      ptr_D = static_cast<ElementC * const *>(params.ptr_D)[threadblock_tile_offset.k()];
-      if (ptr_Aux) {
-        ptr_Aux = static_cast<ElementAux * const *>(params.ptr_Aux)[threadblock_tile_offset.k()];
-      }
-      if (ptr_Vector) {
-        ptr_Vector = static_cast<ElementVector * const *>(params.ptr_Vector)[threadblock_tile_offset.k()];
-      }
-    }
-
-    // Move to appropriate location for this output tile
-    if (ptr_Vector) {
-      ptr_Vector += threadblock_offset.column() + threadblock_tile_offset.m() * params.ldvector;
-    }
-
-    // Tile iterator loading from source tensor.
-    typename Epilogue::OutputTileIterator iterator_C(
-      params.params_C,
-      ptr_C,
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset
-    );
-
-    // Tile iterator writing to destination tensor.
-    typename Epilogue::OutputTileIterator iterator_D(
-      params.params_D,
-      ptr_D,
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset
-    );
-
-    // Tile iterator writing to auxiliary destination tensor.
-    typename Epilogue::AuxOutputTileIterator iterator_Aux(
-      params.params_Aux,
-      // Only the final block writes the auxiliary tensor
-      ((params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) &&
-          (params.grid_tiled_shape.k() != threadblock_tile_offset.k() + 1))
-          ? nullptr
-          : ptr_Aux,
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset
-    );
-
-    Epilogue epilogue(
-      shared_storage.epilogue,
-      thread_idx,
-      warp_idx,
-      lane_idx);
-
-    // Wait on the semaphore - this latency may have been covered by iterator construction
-    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
-
-      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
-      if (threadblock_tile_offset.k()) {
-        iterator_C = iterator_D;
-      }
-
-      semaphore.wait(threadblock_tile_offset.k());
-    }
-
-
-    // Execute the epilogue operator to update the destination tensor.
-    epilogue(
-      output_op,
-      // Only the final block uses Vector
-      ((params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) &&
-       (params.grid_tiled_shape.k() != threadblock_tile_offset.k() + 1))
-          ? nullptr
-          : ptr_Vector,
-      iterator_D,
-      accumulators,
-      iterator_C,
-      iterator_Aux,
-      params.problem_size.mn(),
-      threadblock_offset);
-
-    //
-    // Release the semaphore
-    //
-
-    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
-
-      int lock = 0;
-      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
-
-        // The final threadblock resets the semaphore for subsequent grids.
-        lock = 0;
-      }
-      else {
-        // Otherwise, the semaphore is incremented
-        lock = threadblock_tile_offset.k() + 1;
-      }
-
-      semaphore.release(lock);
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_splitk_parallel.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_splitk_parallel.h
deleted file mode 100755
index 8ab98ff01..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_splitk_parallel.h
+++ /dev/null
@@ -1,253 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for GEMM performing a reduction over K partitions in parallel.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_coord.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate 
-  typename Epilogue_,             ///! Epilogue
-  typename ThreadblockSwizzle_    ///! Threadblock swizzling function
->
-struct GemmSplitKParallel {
-
-  using Mma = Mma_;
-  using Epilogue = Epilogue_;
-  using OutputOp = typename Epilogue::OutputOp;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-
-  /// Warp count (concept: GemmShape)
-  using WarpCount = typename Mma::WarpCount;
-  static int const kThreadCount = 32 * WarpCount::kCount;
-
-  static int const kAlignmentK = Mma::Operator::Shape::kK;
-
-  /// Parameters structure
-  struct Params {
-    cutlass::gemm::GemmCoord problem_size;
-    cutlass::gemm::GemmCoord grid_tiled_shape;
-    int swizzle_log_tile;
-    typename Mma::IteratorA::Params params_A;
-    typename Mma::IteratorA::TensorRef ref_A;
-    typename Mma::IteratorB::Params params_B;
-    typename Mma::IteratorB::TensorRef ref_B;
-    typename Epilogue::OutputTileIterator::Params params_D;
-    typename Epilogue::OutputTileIterator::TensorRef ref_D;
-    typename OutputOp::Params output_op;
-    int64_t splitk_slice_stride;
-    int gemm_k_size;
-
-    //
-    // Methods
-    //
-
-    CUTLASS_HOST_DEVICE
-    Params(): swizzle_log_tile(0) { }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      cutlass::gemm::GemmCoord const & problem_size,
-      cutlass::gemm::GemmCoord const & grid_tiled_shape,
-      typename Mma::IteratorA::TensorRef ref_A,
-      typename Mma::IteratorB::TensorRef ref_B,
-      typename Epilogue::OutputTileIterator::TensorRef ref_D,
-      typename OutputOp::Params output_op,
-      int64_t splitk_slice_stride
-    ):
-      problem_size(problem_size),
-      grid_tiled_shape(grid_tiled_shape),
-      swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)),
-      params_A(ref_A.layout()),
-      ref_A(ref_A),
-      params_B(ref_B.layout()),
-      ref_B(ref_B),
-      params_D(ref_D.layout()),
-      ref_D(ref_D),
-      output_op(output_op),
-      splitk_slice_stride(splitk_slice_stride) {
-
-      int full_gemm_k_iterations = problem_size.k() / Mma::Shape::kK;
-      int gemm_k_iterations = full_gemm_k_iterations / grid_tiled_shape.k();
-
-      gemm_k_size = gemm_k_iterations * Mma::Shape::kK;
-    }
-  };
-
-  /// Shared memory storage structure
-  union SharedStorage {
-    typename Mma::SharedStorage main_loop;
-    typename Epilogue::SharedStorage epilogue;
-  };
-
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  GemmSplitKParallel() { } 
-
-  /// Executes one GEMM
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-
-    // Compute threadblock location
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord threadblock_tile_offset =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    // Early exit if CTA is out of range
-    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
-      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
-
-      return;
-    }
-
-    // Compute initial location in logical coordinates
-    cutlass::MatrixCoord tb_offset_A{
-      threadblock_tile_offset.m() * Mma::Shape::kM,
-      threadblock_tile_offset.k() * params.gemm_k_size,
-    };
-
-    cutlass::MatrixCoord tb_offset_B{
-      threadblock_tile_offset.k() * params.gemm_k_size,
-      threadblock_tile_offset.n() * Mma::Shape::kN
-    };
-
-    // Problem size is a function of threadblock index in the K dimension
-    int problem_size_k;
-    if (threadblock_tile_offset.k() + 1 == params.grid_tiled_shape.k()) {
-      problem_size_k = params.problem_size.k();
-    }
-    else {
-      problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size;
-    }
-
-    // Compute threadblock-scoped matrix multiply-add
-    int gemm_k_iterations = (problem_size_k - tb_offset_A.column() + Mma::Shape::kK - 1) / Mma::Shape::kK;
-
-    // Compute position within threadblock
-    int thread_idx = threadIdx.x;
-
-    // Construct iterators to A and B operands
-    typename Mma::IteratorA iterator_A(
-      params.params_A,
-      params.ref_A.data(),
-      {params.problem_size.m(), problem_size_k},
-      thread_idx,
-      tb_offset_A);
-
-    typename Mma::IteratorB iterator_B(
-      params.params_B,
-      params.ref_B.data(),
-      {problem_size_k, params.problem_size.n()},
-      thread_idx,
-      tb_offset_B);
-
-    int warp_idx = threadIdx.x / 32;
-    int lane_idx = threadIdx.x % 32;
-
-
-    //
-    // Main loop
-    //
-
-    // Construct thread-scoped matrix multiply
-    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
-
-    typename Mma::FragmentC accumulators;
-
-    accumulators.clear();
-
-    mma(gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators);
-
-    //
-    // Epilogue
-    //
-
-    OutputOp output_op(params.output_op);
-
-    //
-    // Masked tile iterators constructed from members
-    //
-
-    threadblock_tile_offset =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    //assume identity swizzle
-    MatrixCoord threadblock_offset(
-      threadblock_tile_offset.m() * Mma::Shape::kM,
-      threadblock_tile_offset.n() * Mma::Shape::kN
-    );
-
-    // Tile iterator writing to output tile
-    typename Epilogue::OutputTileIterator iterator_D(
-      params.params_D,
-      params.ref_D.data(),
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset
-    );
-
-    iterator_D.add_pointer_offset(params.splitk_slice_stride * threadblock_tile_offset.k());
-
-    // Execute the epilogue
-    Epilogue epilogue(
-      shared_storage.epilogue, 
-      thread_idx, 
-      warp_idx, 
-      lane_idx);
-
-    // Run efficient epilogue
-    epilogue(output_op, iterator_D, accumulators, iterator_D);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_streamk_with_fused_epilogue.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_streamk_with_fused_epilogue.h
deleted file mode 100755
index 013fb7730..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_streamk_with_fused_epilogue.h
+++ /dev/null
@@ -1,2396 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Stream-K Gemm kernel compatible with fused epilogues
-    that broadcast a bias vector over the MMA output.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/layout/layout.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/complex.h"
-#include "cutlass/barrier.h"
-#include "cutlass/block_striped.h"
-#include "cutlass/semaphore.h"
-
-#include "cutlass/trace.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate
-  typename Epilogue_,             ///! Epilogue
-  typename ThreadblockSwizzle_,   ///! Threadblock swizzling function
-  bool IsSingleSource = Epilogue_::kIsSingleSource
->
-struct GemmStreamkWithFusedEpilogue;
-
-// GemmStreamkWithFusedEpilogue with two sources
-template <
-  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate
-  typename Epilogue_,             ///! Epilogue
-  typename ThreadblockSwizzle_    ///! Threadblock swizzling function
->
-struct GemmStreamkWithFusedEpilogue<Mma_, Epilogue_, ThreadblockSwizzle_, false> {
-  using Mma = Mma_;
-  using Epilogue = Epilogue_;
-  using EpilogueOutputOp = typename Epilogue::OutputOp;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-
-  using ElementA = typename Mma::IteratorA::Element;
-  using LayoutA = typename Mma::IteratorA::Layout;
-  using ElementB = typename Mma::IteratorB::Element;
-  using LayoutB = typename Mma::IteratorB::Layout;
-  using ElementC = typename Epilogue::OutputTileIterator::Element;
-  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
-
-  /// The per-thread tile of raw accumulators
-  using AccumulatorTile = typename Mma::FragmentC;
-
-  static ComplexTransform const kTransformA = Mma::kTransformA;
-  static ComplexTransform const kTransformB = Mma::kTransformB;
-  using Operator = typename Mma::Operator;
-
-  using OperatorClass = typename Mma::Operator::OperatorClass;
-  using ThreadblockShape = typename Mma::Shape;
-  using WarpShape = typename Mma::Operator::Shape;
-  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
-  using ArchTag = typename Mma::ArchTag;
-
-  static int const kStages = Mma::kStages;
-  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
-  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
-  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
-
-  /// Warp count (concept: GemmShape)
-  using WarpCount = typename Mma::WarpCount;
-  static int const kThreadCount = 32 * WarpCount::kCount;
-
-  /// Workspace bytes per thread block
-  static size_t const kWorkspaceBytesPerBlock =
-    __NV_STD_MAX(
-      kThreadCount * sizeof(AccumulatorTile),
-      Epilogue::kWorkspaceBytesPerBlock);
-
-  /// Block-striped reduction utility
-  using BlockStripedReduceT = BlockStripedReduce<kThreadCount, AccumulatorTile>;
-
-
-
-  //
-  // Structures
-  //
-
-  /// Argument structure
-  struct Arguments {
-
-    //
-    // Data members
-    //
-
-    GemmUniversalMode mode{GemmUniversalMode::kGemm};
-    GemmCoord problem_size{};
-    int batch_count{1};        // Either (mode == GemmUniversalMode::kBatched) the batch count, or (mode == GemmUniversalMode::kGemm) the tile-splitting factor
-
-    typename EpilogueOutputOp::Params epilogue{};
-
-    void const * ptr_A{nullptr};
-    void const * ptr_B{nullptr};
-    void const * ptr_C1{nullptr};
-    void const * ptr_C2{nullptr};
-    void * ptr_D{nullptr};
-
-    void * ptr_Vector;
-    void * ptr_Tensor;
-
-    int64_t batch_stride_A{0};
-    int64_t batch_stride_B{0};
-    int64_t batch_stride_C1{0};
-    int64_t batch_stride_C2{0};
-    int64_t batch_stride_D{0};
-    int64_t batch_stride_Vector{0};
-    int64_t batch_stride_Tensor{0};
-
-    typename LayoutA::Stride::Index lda{};
-    typename LayoutB::Stride::Index ldb{};
-    typename LayoutC::Stride::Index ldc1{};
-    typename LayoutC::Stride::Index ldc2{};
-    typename LayoutC::Stride::Index ldd{};
-    typename LayoutC::Stride::Index ldr{};
-    typename LayoutC::Stride::Index ldt{};
-
-    int avail_sms{-1};          /// The number of SMs that StreamK dispatch heuristics will attempt to load-balance across (-1 defaults to device width, 1 implies classic data-parallel scheduling)
-
-
-    //
-    // Methods
-    //
-
-    /// Default Constructor
-    Arguments() = default;
-
-    /// constructs an arguments structure
-    Arguments(
-      GemmUniversalMode mode,
-      GemmCoord problem_size,
-      int batch_split,                              /// Either (mode == GemmUniversalMode::kBatched) the batch count, or (mode == GemmUniversalMode::kGemm) the tile-splitting factor (1 defaults to StreamK, >1 emulates Split-K)
-      typename EpilogueOutputOp::Params epilogue,
-      void const * ptr_A,
-      void const * ptr_B,
-      void const * ptr_C1,
-      void const * ptr_C2,
-      void * ptr_D,
-      void * ptr_Vector,
-      void * ptr_Tensor,
-      int64_t batch_stride_A,
-      int64_t batch_stride_B,
-      int64_t batch_stride_C1,
-      int64_t batch_stride_C2,
-      int64_t batch_stride_D,
-      int64_t batch_stride_Vector,
-      int64_t batch_stride_Tensor,
-      typename LayoutA::Stride::Index lda,
-      typename LayoutB::Stride::Index ldb,
-      typename LayoutC::Stride::Index ldc1,
-      typename LayoutC::Stride::Index ldc2,
-      typename LayoutC::Stride::Index ldd,
-      typename LayoutC::Stride::Index ldr,
-      typename LayoutC::Stride::Index ldt,
-      int avail_sms = -1)                           /// The number of SMs that StreamK dispatch heuristics will attempt to load-balance across (-1 defaults to device width, 1 implies classic data-parallel scheduling)
-    :
-      mode(mode),
-      problem_size(problem_size),
-      batch_count(batch_split),
-      epilogue(epilogue),
-      ptr_A(ptr_A), ptr_B(ptr_B), ptr_C1(ptr_C1), ptr_C2(ptr_C2), ptr_D(ptr_D),
-      ptr_Vector(ptr_Vector),
-      ptr_Tensor(ptr_Tensor),
-      batch_stride_A(batch_stride_A),
-      batch_stride_B(batch_stride_B),
-      batch_stride_C1(batch_stride_C1),
-      batch_stride_C2(batch_stride_C2),
-      batch_stride_Vector(batch_stride_Vector),
-      batch_stride_Tensor(batch_stride_Tensor),
-      lda(lda), ldb(ldb), ldc1(ldc1), ldc2(ldc2), ldd(ldd), ldr(ldr), ldt(ldt), avail_sms(avail_sms)
-    {
-      CUTLASS_TRACE_HOST("GemmStreamkWithFusedEpilogue::Arguments::Arguments() - problem_size: " << problem_size);
-      CUTLASS_TRACE_HOST("  ptr_Vector: " << (void *)this->ptr_Vector);
-      CUTLASS_TRACE_HOST("  ptr_Tensor: " << (void *)this->ptr_Tensor);
-      CUTLASS_TRACE_HOST("  ldr: " << this->ldr);
-      CUTLASS_TRACE_HOST("  ldt: " << this->ldt);
-      CUTLASS_TRACE_HOST("  avail_sms: " << this->avail_sms);
-    }
-
-    /// Returns arguments for the transposed problem
-    Arguments transposed_problem() const {
-      Arguments args(*this);
-
-      std::swap(args.problem_size.m(), args.problem_size.n());
-      std::swap(args.ptr_A, args.ptr_B);
-      std::swap(args.lda, args.ldb);
-      std::swap(args.batch_stride_A, args.batch_stride_B);
-
-      return args;
-    }
-  };
-
-
-  /// Parameters structure
-  struct Params
-  {
-  public:
-
-    //
-    // Data members
-    //
-
-    void * ptr_A{nullptr};
-    void * ptr_B{nullptr};
-
-    typename Mma::IteratorA::Params params_A{};
-    typename Mma::IteratorB::Params params_B{};
-
-    int64_t batch_stride_A{0};
-    int64_t batch_stride_B{0};
-
-    GemmUniversalMode mode{GemmUniversalMode::kGemm};
-
-    ThreadblockSwizzle block_mapping{};
-
-    void *barrier_workspace{nullptr};
-    void *partials_workspace{nullptr};
-
-    typename EpilogueOutputOp::Params output_op{};
-
-    void * ptr_C1{nullptr};
-    void * ptr_C2{nullptr};
-    void * ptr_D{nullptr};
-    void * ptr_Tensor{nullptr};
-    void * ptr_Vector{nullptr};
-
-    typename Epilogue::OutputTileIterator::Params params_C1{};
-    typename Epilogue::OutputTileIterator::Params params_C2{};
-    typename Epilogue::OutputTileIterator::Params params_D{};
-    typename Epilogue::TensorTileIterator::Params params_Tensor{};
-
-    int64_t batch_stride_C1{0};
-    int64_t batch_stride_C2{0};
-    int64_t batch_stride_D{0};
-    int64_t batch_stride_Vector{0};
-    int64_t batch_stride_Tensor{0};
-
-    typename LayoutC::Stride::Index ldr{};
-
-  protected:
-
-    //
-    // Host-only dispatch-utilities
-    //
-
-    /// Pad the given allocation size up to the nearest cache line
-    static size_t cacheline_align_up(size_t size)
-    {
-      static const int CACHELINE_SIZE = 128;
-      return (size + CACHELINE_SIZE - 1) / CACHELINE_SIZE * CACHELINE_SIZE;
-    }
-
-    /// Get the workspace size needed for barrier
-    size_t get_barrier_workspace_size() const
-    {
-      // For atomic reduction, each SK-block needs a synchronization flag.  For parallel reduction,
-      // each reduction block needs its own synchronization flag.
-      int sk_blocks = block_mapping.sk_regions() * block_mapping.sk_blocks_per_region();
-      int num_flags = fast_max(sk_blocks, block_mapping.reduction_blocks);
-
-      return cacheline_align_up(sizeof(typename Barrier::T) * num_flags);
-    }
-
-    /// Get the workspace size needed for intermediate partial sums
-    size_t get_partials_workspace_size() const
-    {
-      int sk_blocks = block_mapping.sk_regions() * block_mapping.sk_blocks_per_region();
-      return cacheline_align_up(kWorkspaceBytesPerBlock * sk_blocks);
-    }
-
-
-  public:
-
-    //
-    // Host dispatch API
-    //
-
-    /// Default constructor
-    Params() = default;
-
-    /// Constructor
-    Params(
-      Arguments const &args,  /// GEMM application arguments
-      int device_sms,         /// Number of SMs on the device
-      int sm_occupancy)       /// Kernel SM occupancy (in thread blocks)
-    :
-      params_A(args.lda),
-      params_B(args.ldb),
-      params_C1(args.ldc1),
-      params_C2(args.ldc2),
-      params_D(args.ldd),
-      params_Tensor(args.ldt),
-      output_op(args.epilogue),
-      mode(args.mode),
-      ptr_A(const_cast<void *>(args.ptr_A)),
-      ptr_B(const_cast<void *>(args.ptr_B)),
-      ptr_C1(const_cast<void *>(args.ptr_C1)),
-      ptr_C2(const_cast<void *>(args.ptr_C2)),
-      ptr_D(args.ptr_D),
-      ptr_Vector(args.ptr_Vector),
-      ldr(args.ldr),
-      ptr_Tensor(args.ptr_Tensor),
-      batch_stride_A(args.batch_stride_A),
-      batch_stride_B(args.batch_stride_B),
-      batch_stride_C1(args.batch_stride_C1),
-      batch_stride_C2(args.batch_stride_C2),
-      batch_stride_D(args.batch_stride_D),
-      batch_stride_Vector(args.batch_stride_Vector),
-      batch_stride_Tensor(args.batch_stride_Tensor),
-      barrier_workspace(nullptr),
-      partials_workspace(nullptr)
-    {
-      CUTLASS_TRACE_HOST("GemmStreamkWithFusedEpilogue::Params::Params()");
-      CUTLASS_TRACE_HOST("  ptr_Vector: " << (void *)this->ptr_Vector);
-      CUTLASS_TRACE_HOST("  ptr_Tensor: " << (void *)this->ptr_Tensor);
-      CUTLASS_TRACE_HOST("  ldr: " << this->ldr);
-      CUTLASS_TRACE_HOST("  ldt: " << args.ldt);
-
-      // Number of SMs to make available for StreamK decomposition
-      int avail_sms = (args.avail_sms == -1) ?
-                        device_sms :
-                        fast_min(args.avail_sms, device_sms);
-      CUTLASS_TRACE_HOST("  avail_sms: " << avail_sms);
-
-      // Initialize the block mapping structure
-      block_mapping = ThreadblockSwizzle(
-        args.mode,
-        args.problem_size,
-        {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-        args.batch_count,
-        sm_occupancy,
-        device_sms,
-        avail_sms,
-        sizeof(ElementA),
-        sizeof(ElementB),
-        sizeof(ElementC),
-        Epilogue::kAccumulatorFragments);
-    }
-
-    /// Returns the workspace size (in bytes) needed for these parameters
-    size_t get_workspace_size() const
-    {
-      return
-        get_barrier_workspace_size() +
-        get_partials_workspace_size();
-    }
-
-    /// Assign and initialize the specified workspace buffer.  Assumes
-    /// the memory allocated to workspace is at least as large as get_workspace_size().
-    Status init_workspace(
-      void *workspace,
-      cudaStream_t stream = nullptr)
-    {
-      uint8_t *ptr = static_cast<uint8_t*>(workspace);
-
-      // Establish partials workspace
-      partials_workspace = nullptr;
-      size_t partials_workspace_bytes = get_partials_workspace_size();
-      if (partials_workspace_bytes > 0)
-      {
-        if (!workspace) {
-          return Status::kErrorWorkspaceNull;
-        }
-        partials_workspace = ptr;
-        ptr += partials_workspace_bytes;
-      }
-
-      // Establish barrier workspace
-      barrier_workspace = nullptr;
-      size_t barrier_workspace_bytes = get_barrier_workspace_size();
-      if (barrier_workspace_bytes > 0)
-      {
-        if (!workspace) {
-          return Status::kErrorWorkspaceNull;
-        }
-        barrier_workspace = ptr;
-        ptr += barrier_workspace_bytes;
-      }
-
-      // Zero-initialize barrier workspace
-      if (barrier_workspace)
-      {
-        size_t barrier_workspace_bytes = get_barrier_workspace_size();
-
-        CUTLASS_TRACE_HOST("  Initialize " << barrier_workspace_bytes << " barrier bytes");
-
-        cudaError_t result = cudaMemsetAsync(
-          barrier_workspace,
-          0,
-          barrier_workspace_bytes,
-          stream);
-
-        if (result != cudaSuccess) {
-          CUTLASS_TRACE_HOST("  cudaMemsetAsync() returned error " << cudaGetErrorString(result));
-          return Status::kErrorInternal;
-        }
-      }
-
-      return Status::kSuccess;
-    }
-
-
-    /// Returns the GEMM volume in thread block tiles
-    cutlass::gemm::GemmCoord get_tiled_shape() const
-    {
-      return block_mapping.tiled_shape();
-    }
-
-    /// Returns the total number of thread blocks to launch
-    int get_grid_blocks() const
-    {
-      dim3 grid_dims = get_grid_dims();
-      return grid_dims.x * grid_dims.y * grid_dims.z;
-    }
-
-    /// Returns the grid extents in thread blocks to launch
-    dim3 get_grid_dims() const
-    {
-      return block_mapping.get_grid_dims();
-    }
-
-    /// Lightweight update given a subset of arguments.  Problem geometry is assumed
-    /// to remain the same.
-    CUTLASS_HOST_DEVICE
-    void update(Arguments const &args)
-    {
-      ptr_A = const_cast<void *>(args.ptr_A);
-      ptr_B = const_cast<void *>(args.ptr_B);
-      ptr_C1 = const_cast<void *>(args.ptr_C1);
-      ptr_C2 = const_cast<void *>(args.ptr_C2);
-      ptr_D = args.ptr_D;
-
-      ptr_Vector = args.ptr_Vector;
-      ldr = args.ldr;
-      ptr_Tensor = args.ptr_Tensor;
-
-      batch_stride_A = args.batch_stride_A;
-      batch_stride_B = args.batch_stride_B;
-      batch_stride_C1 = args.batch_stride_C1;
-      batch_stride_C2 = args.batch_stride_C2;
-      batch_stride_D = args.batch_stride_D;
-      batch_stride_Vector = args.batch_stride_Vector;
-      batch_stride_Tensor = args.batch_stride_Tensor;
-
-      output_op = args.epilogue;
-
-      CUTLASS_TRACE_HOST("GemmStreamkWithFusedEpilogue::Params::update()");
-      CUTLASS_TRACE_HOST("  ptr_Vector: " << (void *)this->ptr_Vector);
-      CUTLASS_TRACE_HOST("  ptr_Tensor: " << (void *)this->ptr_Tensor);
-      CUTLASS_TRACE_HOST("  ldr: " << this->ldr);
-    }
-  };
-
-  /// Tile work descriptor
-  struct TileWorkDesc
-  {
-    /// The linear tile index
-    int tile_idx;
-
-    /// The location of this tile (in threadblock-tile coordinates) in the output matrix
-    cutlass::gemm::GemmCoord tiled_coord;
-
-    // The first global-scoped MAC-iteration this threadblock will perform for this tile
-    int iter_begin;
-
-    // The starting index in the k-domain for MAC-iterations this threadblock will perform for this tile
-    int k_begin;
-
-    // The ending index (one-past) in the k-domain for MAC-iterations this threadblock will perform for this tile
-    int k_end;
-
-    /// The number of remaining MAC-iterations this threadblock will perform for this tile
-    int k_iters_remaining;
-
-    // Whether this block will perform the first iteration of this tile
-    CUTLASS_DEVICE
-    bool tile_started()
-    {
-      return (k_begin == 0);
-    }
-
-    // Whether this block will perform the last iteration of this tile
-    CUTLASS_DEVICE
-    bool tile_finished(Params const &params)
-    {
-      return (k_end == params.block_mapping.problem_size.k());
-    }
-  };
-
-
-  /// Shared memory storage structure
-  union SharedStorage {
-    typename Mma::SharedStorage main_loop;
-    typename Epilogue::SharedStorage epilogue;
-  };
-
-
-protected:
-
-  //
-  // Data members
-  //
-
-  /// GEMM problem parameters
-  Params const &params;
-
-  /// Shared storage reference
-  SharedStorage &shared_storage;
-
-  /// ID within the threadblock
-  int thread_idx;
-
-  /// ID of warp
-  int warp_idx;
-
-  /// ID of each thread within a warp
-  int lane_idx;
-
-  /// Threadblock scoped epilogue
-  Epilogue epilogue;
-
-
-public:
-
-  //
-  // Host dispatch API
-  //
-
-  /// Determines whether kernel satisfies alignment
-  static Status can_implement(
-    cutlass::gemm::GemmCoord const & problem_size) {
-
-    CUTLASS_TRACE_HOST("GemmStreamkWithFusedEpilogue::can_implement()");
-
-    static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
-    static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
-    static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
-
-    bool isAMisaligned = false;
-    bool isBMisaligned = false;
-    bool isCMisaligned = false;
-
-    if (platform::is_same<LayoutA, layout::RowMajor>::value) {
-      isAMisaligned = problem_size.k() % kAlignmentA;
-    } else if (platform::is_same<LayoutA, layout::ColumnMajor>::value) {
-      isAMisaligned = problem_size.m() % kAlignmentA;
-    } else if (platform::is_same<LayoutA, layout::ColumnMajorInterleaved<32>>::value
-            || platform::is_same<LayoutA, layout::ColumnMajorInterleaved<64>>::value) {
-      isAMisaligned = problem_size.k() % kAlignmentA;
-    }
-
-    if (platform::is_same<LayoutB, layout::RowMajor>::value) {
-      isBMisaligned = problem_size.n() % kAlignmentB;
-    } else if (platform::is_same<LayoutB, layout::ColumnMajor>::value) {
-      isBMisaligned = problem_size.k() % kAlignmentB;
-    } else if (platform::is_same<LayoutB, layout::RowMajorInterleaved<32>>::value
-            || platform::is_same<LayoutB, layout::RowMajorInterleaved<64>>::value) {
-      isBMisaligned = problem_size.k() % kAlignmentB;
-    }
-
-    if (platform::is_same<LayoutC, layout::RowMajor>::value) {
-      isCMisaligned = problem_size.n() % kAlignmentC;
-    } else if (platform::is_same<LayoutC, layout::ColumnMajor>::value) {
-      isCMisaligned = problem_size.m() % kAlignmentC;
-    } else if (platform::is_same<LayoutC, layout::ColumnMajorInterleaved<32>>::value
-            || platform::is_same<LayoutC, layout::ColumnMajorInterleaved<64>>::value) {
-      isCMisaligned = problem_size.n() % kAlignmentC;
-    }
-
-    if (isAMisaligned) {
-      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for A operand");
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (isBMisaligned) {
-      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for B operand");
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (isCMisaligned) {
-      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for C operand");
-      return Status::kErrorMisalignedOperand;
-    }
-
-    CUTLASS_TRACE_HOST("  returning kSuccess");
-
-    return Status::kSuccess;
-  }
-
-  static Status can_implement(Arguments const &args) {
-    return can_implement(args.problem_size);
-  }
-
-protected:
-
-  //
-  // Device-only utility methods
-  //
-
-  /// Iterator for fetching tile fragments from A
-  CUTLASS_DEVICE
-  typename Mma::IteratorA init_iterator_A(
-    TileWorkDesc &tile_work,
-    GemmUniversalMode mode)
-  {
-    // The input A matrix
-    ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A);
-
-    // Update input pointers based on batched/array mode
-    if (mode == GemmUniversalMode::kBatched) {
-      ptr_A += tile_work.tiled_coord.k() * params.batch_stride_A;
-    }
-    if (mode == GemmUniversalMode::kArray) {
-      ptr_A = static_cast<ElementA * const *>(params.ptr_A)[tile_work.tiled_coord.k()];
-    }
-
-    int m_begin = tile_work.tiled_coord.m() * Mma::Shape::kM;
-    int m_end = params.block_mapping.problem_size.m();
-    return Mma::IteratorA(
-        params.params_A,
-        ptr_A,
-        { m_end, tile_work.k_end },
-        threadIdx.x,
-        { m_begin, tile_work.k_begin });
-
-  }
-
-
-  /// Iterator for fetching tile fragments from B
-  CUTLASS_DEVICE
-  typename Mma::IteratorB init_iterator_B(
-    TileWorkDesc &tile_work,
-    GemmUniversalMode mode)
-  {
-    // The input B matrix
-    ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);
-
-    // Update input pointers based on batched/array mode
-    if (mode == GemmUniversalMode::kBatched) {
-      ptr_B += tile_work.tiled_coord.k() * params.batch_stride_B;
-    }
-    if (mode == GemmUniversalMode::kArray) {
-      ptr_B = static_cast<ElementB * const *>(params.ptr_B)[tile_work.tiled_coord.k()];
-    }
-
-    int n_begin = tile_work.tiled_coord.n() * Mma::Shape::kN;
-    int n_end = params.block_mapping.problem_size.n();
-    return Mma::IteratorB(
-        params.params_B,
-        ptr_B,
-        { tile_work.k_end, n_end },
-        threadIdx.x,
-        { tile_work.k_begin, n_begin });
-  }
-
-
-  CUTLASS_DEVICE
-  void init_dp_tile_work(
-      TileWorkDesc &tile_work,
-      int tile_idx)
-  {
-    // The linear tile index
-    tile_work.tile_idx = tile_idx;
-
-    // The first global-scoped MAC-iteration this threadblock will perform for this tile
-    tile_work.iter_begin = tile_idx * params.block_mapping.iters_per_tile();
-
-    // The number of MAC-iterations this threadblock will perform for this tile
-    tile_work.k_iters_remaining = params.block_mapping.iters_per_tile();
-
-    // The starting index in the k-domain for MAC-iterations this threadblock will perform for this tile
-    tile_work.k_begin = 0;
-
-    // The ending index (one-past) in the k-domain for MAC-iterations this threadblock will perform for this tile
-    tile_work.k_end = params.block_mapping.problem_size.k();
-
-    // The location of this tile (in threadblock-tile coordinates) in the output matrix
-    tile_work.tiled_coord = params.block_mapping.get_tile_offset(tile_work.tile_idx);
-  }
-
-
-  CUTLASS_DEVICE
-  void init_sk_tile_work(
-      TileWorkDesc &tile_work,
-      int tile_idx,
-      int block_iter_begin,
-      int block_iter_end)
-  {
-    // The linear tile index
-    tile_work.tile_idx = tile_idx;
-
-    // The first global-scoped MAC-iteration for this tile
-    int tile_iter_begin = tile_idx * params.block_mapping.iters_per_tile();
-
-    // The first global-scoped MAC-iteration this threadblock will perform for this tile
-    tile_work.iter_begin = max(block_iter_begin, tile_iter_begin);
-
-    // The first tile-scoped MAC-iteration this threadblock will perform for this tile
-    int k_iter_begin = tile_work.iter_begin - tile_iter_begin;
-
-    // The last (one past) tile-scoped MAC-iteration this threadblock will perform for this tile
-    int k_iter_end = block_iter_end - tile_iter_begin;
-
-    // The number of MAC-iterations this threadblock will perform for this tile
-    tile_work.k_iters_remaining = k_iter_end - k_iter_begin;
-
-    // The starting index in the k-domain for MAC-iterations this threadblock will perform for this tile
-    tile_work.k_begin = k_iter_begin * Mma::Shape::kK;
-
-    // The ending index (one-past) in the k-domain for MAC-iterations this threadblock will perform for this tile
-    tile_work.k_end = min(
-        params.block_mapping.problem_size.k(),            // extent of k domain
-        (k_iter_end * Mma::Shape::kK));                   // extent of the threadblock's global iteration assignment
-
-    // The location of this tile (in threadblock-tile coordinates) in the output matrix
-    tile_work.tiled_coord = params.block_mapping.get_tile_offset(tile_work.tile_idx);
-  }
-
-
-  /// Share accumulators with peers
-  CUTLASS_DEVICE
-  void share_accumulators(
-    AccumulatorTile const &accumulator_tile,
-    int block_idx,
-    int first_block_idx)
-  {
-    AccumulatorTile *accum_tile_workspace = reinterpret_cast<AccumulatorTile *>(params.partials_workspace);
-
-    int accum_tile_offset = first_block_idx * kThreadCount;
-
-    if (block_idx == first_block_idx)
-    {
-      // First peer initializes the workspace partials
-      BlockStripedReduceT::store(accum_tile_workspace + accum_tile_offset, accumulator_tile, thread_idx);
-    }
-    else
-    {
-      // Subsequent peers atomically accumulate into the workspace partials
-      if (ThreadblockSwizzle::kReductionStrategy == ThreadblockSwizzle::kAtomic)
-      {
-        // Non-deterministic reduction order: wait for the first peer to have initialized the partials before we add to them
-        Barrier::wait_lt(params.barrier_workspace, thread_idx, first_block_idx, 1);
-      }
-      else
-      {
-        // Turnstile reduction order: wait until the previous peer has written
-        int wait_count = block_idx - first_block_idx;
-        Barrier::wait_eq(params.barrier_workspace, thread_idx, first_block_idx, wait_count);
-      }
-
-      // Perform reduction in workspace
-      BlockStripedReduceT::reduce(accum_tile_workspace + accum_tile_offset, accumulator_tile, thread_idx);
-    }
-
-    // Signal our arrival
-    Barrier::arrive_inc(params.barrier_workspace, thread_idx, first_block_idx);
-  }
-
-
-  /// Acquire accumulators from peers
-  CUTLASS_DEVICE
-  void acquire_accumulators(
-    AccumulatorTile &accumulator_tile,
-    int block_idx,
-    int first_block_idx)
-  {
-    AccumulatorTile *accum_tile_workspace = reinterpret_cast<AccumulatorTile *>(params.partials_workspace);
-
-    // Wait for arrival
-    int num_carry_in = block_idx - first_block_idx;
-    Barrier::wait_eq_reset(params.barrier_workspace, thread_idx, first_block_idx, num_carry_in);
-
-    // Load and add peer-partials accumulator tile to local accumulator tile
-    int accum_tile_offset = first_block_idx * kThreadCount;
-    BlockStripedReduceT::load_add(accumulator_tile, accum_tile_workspace + accum_tile_offset, thread_idx);
-  }
-
-
-  /// Perform epilogue computations and output
-  CUTLASS_DEVICE
-  void do_epilogue(
-    TileWorkDesc &tile_work,
-    AccumulatorTile &accumulator_tile)
-  {
-    ElementC *ptr_C1 = static_cast<ElementC *>(params.ptr_C1);
-    ElementC *ptr_C2 = static_cast<ElementC *>(params.ptr_C2);
-    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
-    typename Epilogue::ElementTensor *ptr_Tensor = static_cast<typename Epilogue::ElementTensor *>(params.ptr_Tensor);
-
-    // Define the reduction output pointer and move to the appropriate place
-    typename Epilogue::ElementVector *ptr_Vector =
-      static_cast<typename Epilogue::ElementVector *>(params.ptr_Vector);
-
-    // Update pointers for batched/array mode(s)
-    if (params.mode == GemmUniversalMode::kBatched) {
-      ptr_C1 += tile_work.tiled_coord.k() * params.batch_stride_C1;
-      if (ptr_C2) {
-        ptr_C2 += tile_work.tiled_coord.k() * params.batch_stride_C2;
-      }
-      ptr_D += tile_work.tiled_coord.k() * params.batch_stride_D;
-      if (ptr_Tensor) {
-        ptr_Tensor = ReferenceFactory<typename Epilogue::ElementTensor>::add_pointer_offset(
-          ptr_Tensor,
-          tile_work.tiled_coord.k() * params.batch_stride_Tensor);
-      }
-      if (ptr_Vector) {
-        ptr_Vector += tile_work.tiled_coord.k() * params.batch_stride_Vector;
-      }
-    }
-    if (params.mode == GemmUniversalMode::kArray) {
-      ptr_C1 = static_cast<ElementC * const *>(params.ptr_C1)[tile_work.tiled_coord.k()];
-      if (ptr_C2) {
-        ptr_C2 = static_cast<ElementC * const *>(params.ptr_C2)[tile_work.tiled_coord.k()];
-      }
-      ptr_D = static_cast<ElementC * const *>(params.ptr_D)[tile_work.tiled_coord.k()];
-      if (ptr_Tensor) {
-        ptr_Tensor = static_cast<typename Epilogue::ElementTensor * const *>(params.ptr_Tensor)[tile_work.tiled_coord.k()];
-      }
-      if (ptr_Vector) {
-        ptr_Vector = static_cast<typename Epilogue::ElementVector * const *>(params.ptr_Vector)[tile_work.tiled_coord.k()];
-      }
-    }
-
-    // Location of this tile in item-coords
-    MatrixCoord threadblock_item_begin(
-      tile_work.tiled_coord.m() * Mma::Shape::kM,
-      tile_work.tiled_coord.n() * Mma::Shape::kN
-    );
-
-    // Tile iterator loading from residual1.
-    typename Epilogue::OutputTileIterator iterator_C1(
-        params.params_C1,
-        ptr_C1,
-        params.block_mapping.problem_size.mn(),
-        thread_idx,
-        threadblock_item_begin);
-
-    // Tile iterator loading from residual2.
-    typename Epilogue::OutputTileIterator iterator_C2(
-        params.params_C2,
-        ptr_C2,
-        params.block_mapping.problem_size.mn(),
-        thread_idx,
-        threadblock_item_begin);
-
-    // Tile iterator writing to destination tensor.
-    typename Epilogue::OutputTileIterator iterator_D(
-        params.params_D,
-        ptr_D,
-        params.block_mapping.problem_size.mn(),
-        thread_idx,
-        threadblock_item_begin);
-
-    // Additional tensor to load from
-    typename Epilogue::TensorTileIterator tensor_iterator(
-        params.params_Tensor,
-        ptr_Tensor,
-        params.block_mapping.problem_size.mn(),
-        thread_idx,
-        threadblock_item_begin);
-
-    // Move to appropriate location for this output tile
-    if (ptr_Vector) {
-      ptr_Vector += threadblock_item_begin.column() + tile_work.tiled_coord.m() * params.ldr;
-    }
-
-    // Execute the epilogue operator to update the destination tensor.
-    epilogue(
-        EpilogueOutputOp(params.output_op),
-        ptr_Vector,
-        iterator_D,
-        accumulator_tile,
-        iterator_C1,
-        iterator_C2,
-        tensor_iterator,
-        params.block_mapping.problem_size.mn(),
-        threadblock_item_begin);
-  }
-
-
-  CUTLASS_DEVICE
-  void separate_reduction(int reduce_idx)
-  {
-    int peer_idx_begin, peer_idx_last, reduce_tile_idx, reduce_fragment_idx;
-
-    // Reduce by sk-tile (every tile contributed to by one or more blocks)
-    reduce_tile_idx = reduce_idx / Epilogue::kAccumulatorFragments;
-    reduce_fragment_idx = reduce_idx % Epilogue::kAccumulatorFragments;
-
-    int iter_tile_first = reduce_tile_idx * params.block_mapping.iters_per_tile();
-    int iter_tile_last = iter_tile_first + params.block_mapping.iters_per_tile() - 1;
-
-    peer_idx_begin = params.block_mapping.get_sk_block_idx(iter_tile_first);
-    peer_idx_last = params.block_mapping.get_sk_block_idx(iter_tile_last);
-
-    // Wait for peers to complete
-    int peer_idx_end = peer_idx_last + 1;
-    int num_peers = peer_idx_end - peer_idx_begin;
-    Barrier::wait_eq_reset(
-        params.barrier_workspace,
-        thread_idx,
-        (reduce_tile_idx * Epilogue::kAccumulatorFragments) + reduce_fragment_idx,
-        num_peers);
-
-    /// The location of this tile (in threadblock-tile coordinates) in the output matrix
-    GemmCoord tiled_coord = params.block_mapping.get_tile_offset(reduce_tile_idx);
-
-    // Location of this tile in item-coords
-    MatrixCoord threadblock_item_begin(
-      tiled_coord.m() * Mma::Shape::kM,
-      tiled_coord.n() * Mma::Shape::kN
-    );
-
-    ElementC *ptr_C1 = static_cast<ElementC *>(params.ptr_C1);
-    ElementC *ptr_C2 = static_cast<ElementC *>(params.ptr_C2);
-    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
-    typename Epilogue::ElementTensor *ptr_Tensor = static_cast<typename Epilogue::ElementTensor *>(params.ptr_Tensor);
-
-    // Define the reduction output pointer and move to the appropriate place
-    typename Epilogue::ElementVector *ptr_Vector =
-      static_cast<typename Epilogue::ElementVector *>(params.ptr_Vector);
-
-    // Tile iterator loading from residual1.
-    typename Epilogue::OutputTileIterator iterator_C1(
-        params.params_C1,
-        ptr_C1,
-        params.block_mapping.problem_size.mn(),
-        thread_idx,
-        threadblock_item_begin);
-
-    // Tile iterator loading from residual2.
-    typename Epilogue::OutputTileIterator iterator_C2(
-        params.params_C2,
-        ptr_C2,
-        params.block_mapping.problem_size.mn(),
-        thread_idx,
-        threadblock_item_begin);
-
-    // Tile iterator writing to destination tensor.
-    typename Epilogue::OutputTileIterator iterator_D(
-        params.params_D,
-        ptr_D,
-        params.block_mapping.problem_size.mn(),
-        thread_idx,
-        threadblock_item_begin);
-
-    // Additional tensor to load from
-    typename Epilogue::TensorTileIterator tensor_iterator(
-        params.params_Tensor,
-        ptr_Tensor,
-        params.block_mapping.problem_size.mn(),
-        thread_idx,
-        threadblock_item_begin);
-
-    // Move to appropriate location for this output tile
-    if (ptr_Vector) {
-      ptr_Vector += threadblock_item_begin.column() + tiled_coord.m() * params.ldr;
-    }
-
-    // Execute the epilogue operator to update the destination tensor.
-    epilogue.reduce(
-        peer_idx_begin,
-        peer_idx_end,
-        reduce_fragment_idx,
-        params.partials_workspace,
-        EpilogueOutputOp(params.output_op),
-        ptr_Vector,
-        iterator_D,
-        iterator_C1,
-        iterator_C2,
-        tensor_iterator,
-        params.block_mapping.problem_size.mn(),
-        threadblock_item_begin);
-  }
-
-
-  CUTLASS_DEVICE
-  void process_tile(
-    TileWorkDesc tile_work,
-    int block_idx,
-    int dp_start_block_idx,
-    int block_iter_begin)
-  {
-    // Initialize input iterators
-    typename Mma::IteratorA iterator_A = init_iterator_A(tile_work, params.mode);
-    typename Mma::IteratorB iterator_B = init_iterator_B(tile_work, params.mode);
-
-    // Initialize accumulators
-    AccumulatorTile accumulator_tile;
-    accumulator_tile.clear();
-
-    // Initialize MMA abstraction
-    Mma mma(
-      shared_storage.main_loop,
-      thread_idx,
-      warp_idx,
-      lane_idx);
-
-    // Perform this tile's range of multiply-accumulate (MAC) iterations
-    mma(tile_work.k_iters_remaining, accumulator_tile, iterator_A, iterator_B, accumulator_tile);
-
-    if ((ThreadblockSwizzle::kReductionStrategy == ThreadblockSwizzle::kAtomic) ||
-        (params.block_mapping.reduction_blocks == 0) ||
-        (block_idx >= dp_start_block_idx))
-    {
-      //
-      // Cooperative SK peer reduction or DP block
-      //
-
-      int first_block_idx = params.block_mapping.get_first_block_idx(tile_work.tile_idx, block_idx);
-
-      if (!tile_work.tile_finished(params)) {
-        // Non "finishing" SK blocks must share their partial accumulator sums through global scratch workspace
-        share_accumulators(accumulator_tile, block_idx, first_block_idx);
-      }
-      else
-      {
-        // DP blocks and "finishing" SK blocks must perform epilogue operations and write the output tile
-        if (!tile_work.tile_started())
-        {
-          // A "finishing" SK block must first aggregate its accumulator partial sums with those shared by peer threadblocks
-          acquire_accumulators(accumulator_tile, block_idx, first_block_idx);
-        }
-
-        do_epilogue(tile_work, accumulator_tile);
-      }
-    }
-    else
-    {
-      //
-      // Separate peer reduction
-      //
-
-      // Share accumulator partial sums with peer threadblock(s) through scratch workspace
-      epilogue.share(block_idx, params.partials_workspace, accumulator_tile, tile_work.tile_started());
-
-      // Signal arrival
-      Barrier::arrive_range_inc(
-        params.barrier_workspace,
-        thread_idx,
-        tile_work.tile_idx * Epilogue::kAccumulatorFragments,
-        Epilogue::kAccumulatorFragments);
-    }
-  }
-
-
-  /// Executes one GEMM
-  CUTLASS_DEVICE
-  void gemm()
-  {
-    // Initialize block's iteration range
-    int tile_idx = 0;
-    int block_iter_begin = 0;
-    int block_iters_remaining = 0;
-
-    int block_idx = params.block_mapping.get_block_idx();
-
-    int sk_padding_start_block_idx =  params.block_mapping.sk_regions() * params.block_mapping.sk_blocks_per_region();
-    int dp_start_block_idx = params.block_mapping.sk_waves * params.block_mapping.avail_sms;
-    int reduce_start_block_idx = dp_start_block_idx + params.block_mapping.dp_blocks;
-    int grid_padding_start_block_idx = reduce_start_block_idx + params.block_mapping.reduction_blocks;
-
-    // Initialize tile work descriptor
-    TileWorkDesc tile_work;
-
-    bool dp_block = (block_idx >= dp_start_block_idx) && (block_idx < reduce_start_block_idx);
-    bool sk_block = (block_idx < sk_padding_start_block_idx);
-    bool reduce_block = (block_idx >= reduce_start_block_idx) &&
-            (block_idx < grid_padding_start_block_idx) &&
-            (ThreadblockSwizzle::kReductionStrategy == ThreadblockSwizzle::kMixed);
-
-    if (dp_block)
-    {
-      // This is a DP block
-      int dp_block_idx = block_idx - dp_start_block_idx;
-      int first_dp_tile = (params.block_mapping.cohort_raster) ? 0 : params.block_mapping.sk_tiles;
-
-      // Blocks in first DP wave get configured number of tiles
-      tile_idx = first_dp_tile + dp_block_idx;
-      int tile_allottment = params.block_mapping.dp_first_wave_tiles;
-
-      // Blocks in subsequent DP waves get 1 tile
-      if (dp_block_idx >= params.block_mapping.avail_sms) {
-          tile_allottment = 1;
-          tile_idx += (params.block_mapping.dp_first_wave_tiles - 1) * params.block_mapping.avail_sms;
-      }
-
-      block_iters_remaining = params.block_mapping.iters_per_tile() * tile_allottment;
-
-      init_dp_tile_work(tile_work, tile_idx);
-
-      // DP blocks exit if out of bounds or overlap an SK tile (only possible during cohort rasterization, where dp_first_wave_tiles must be 1)
-      if ((tile_idx < params.block_mapping.sk_tiles) ||
-          (tile_work.tiled_coord.m() >= params.block_mapping.tiled_shape().m()) ||
-          (tile_work.tiled_coord.n() >= params.block_mapping.tiled_shape().n()))
-      {
-        return;
-      }
-    }
-    else if (sk_block)
-    {
-      // This is a SK block
-      int block_iter_end;
-      params.block_mapping.get_iter_extents(block_idx, block_iter_begin, block_iter_end);
-      block_iters_remaining = block_iter_end - block_iter_begin;
-
-      tile_idx = params.block_mapping.get_sk_tile_idx(block_iter_end - 1);
-      init_sk_tile_work(tile_work, tile_idx, block_iter_begin, block_iter_begin + block_iters_remaining);
-    }
-    else
-    {
-      if (reduce_block)
-      {
-        // This is a reduction threadblock
-        int reduce_block_idx = block_idx - reduce_start_block_idx;
-        separate_reduction(reduce_block_idx);
-      }
-
-      return;
-    }
-
-    // Iteration-processing loop body
-    CUTLASS_PRAGMA_NO_UNROLL
-    while (true)
-    {
-      // Perform this block's share of work for this tile
-      process_tile(
-        tile_work,
-        block_idx,
-        dp_start_block_idx,
-        block_iter_begin);
-
-      block_iters_remaining -= tile_work.k_iters_remaining;
-
-      if (block_iters_remaining == 0)
-      {
-        break;
-      }
-
-      // Continue to next tile
-      __syncthreads();
-
-      if (block_idx >= dp_start_block_idx)
-      {
-        // DP block consume their tiles at stride
-        tile_idx += params.block_mapping.avail_sms;
-        init_dp_tile_work(tile_work, tile_idx);
-      }
-      else
-      {
-        // SK blocks consume their tiles in backwards order
-        tile_idx--;
-        init_sk_tile_work(tile_work, tile_idx, block_iter_begin, block_iter_begin + block_iters_remaining);
-      }
-    }
-
-  }
-
-
-public:
-
-  //
-  // Device-only API
-  //
-
-  // Factory invocation
-  CUTLASS_DEVICE
-  static void invoke(
-    Params const &params,
-    SharedStorage &shared_storage)
-  {
-    GemmStreamkWithFusedEpilogue op(params, shared_storage);
-    op();
-  }
-
-
-  // Constructor
-  CUTLASS_DEVICE
-  GemmStreamkWithFusedEpilogue(
-      Params const &params,
-      SharedStorage &shared_storage)
-    :
-      params(params),
-      shared_storage(shared_storage),
-      thread_idx(threadIdx.x),
-      warp_idx(__shfl_sync(0xffffffff, threadIdx.x / 32, 0)),   // broadcast the warp_id computed by lane 0 to ensure dependent code
-      lane_idx(threadIdx.x % 32),
-      epilogue(
-        shared_storage.epilogue,
-        thread_idx,
-        warp_idx,
-        lane_idx)
-  {}
-
-  /// Executes one GEMM
-  CUTLASS_DEVICE
-  void operator()() {
-    // Generic SK code path
-    gemm();
-
-  }
-};
-
-
-// GemmStreamkWithFusedEpilogue with one source
-template <
-  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate
-  typename Epilogue_,             ///! Epilogue
-  typename ThreadblockSwizzle_    ///! Threadblock swizzling function
->
-struct GemmStreamkWithFusedEpilogue<Mma_, Epilogue_, ThreadblockSwizzle_, true> {
-  using Mma = Mma_;
-  using Epilogue = Epilogue_;
-  using EpilogueOutputOp = typename Epilogue::OutputOp;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-
-  using ElementA = typename Mma::IteratorA::Element;
-  using LayoutA = typename Mma::IteratorA::Layout;
-  using ElementB = typename Mma::IteratorB::Element;
-  using LayoutB = typename Mma::IteratorB::Layout;
-  using ElementC = typename Epilogue::OutputTileIterator::Element;
-  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
-
-  /// The per-thread tile of raw accumulators
-  using AccumulatorTile = typename Mma::FragmentC;
-
-  static ComplexTransform const kTransformA = Mma::kTransformA;
-  static ComplexTransform const kTransformB = Mma::kTransformB;
-  using Operator = typename Mma::Operator;
-
-  using OperatorClass = typename Mma::Operator::OperatorClass;
-  using ThreadblockShape = typename Mma::Shape;
-  using WarpShape = typename Mma::Operator::Shape;
-  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
-  using ArchTag = typename Mma::ArchTag;
-
-  static int const kStages = Mma::kStages;
-  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
-  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
-  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
-
-  /// Warp count (concept: GemmShape)
-  using WarpCount = typename Mma::WarpCount;
-  static int const kThreadCount = 32 * WarpCount::kCount;
-
-  /// Workspace bytes per thread block
-  static size_t const kWorkspaceBytesPerBlock =
-    __NV_STD_MAX(
-      kThreadCount * sizeof(AccumulatorTile),
-      Epilogue::kWorkspaceBytesPerBlock);
-
-  /// Block-striped reduction utility
-  using BlockStripedReduceT = BlockStripedReduce<kThreadCount, AccumulatorTile>;
-
-
-
-  //
-  // Structures
-  //
-
-  /// Argument structure
-  struct Arguments
-  {
-
-    //
-    // Data members
-    //
-
-    GemmUniversalMode mode{GemmUniversalMode::kGemm};
-    GemmCoord problem_size{};
-    int batch_count{1};        // Either (mode == GemmUniversalMode::kBatched) the batch count, or (mode == GemmUniversalMode::kGemm) the tile-splitting factor
-
-    typename EpilogueOutputOp::Params epilogue{};
-
-    void const * ptr_A{nullptr};
-    void const * ptr_B{nullptr};
-    void const * ptr_C{nullptr};
-    void * ptr_D{nullptr};
-
-    void * ptr_Vector{nullptr};
-    void * ptr_Tensor{nullptr};
-
-    int64_t batch_stride_A{0};
-    int64_t batch_stride_B{0};
-    int64_t batch_stride_C{0};
-    int64_t batch_stride_D{0};
-    int64_t batch_stride_Vector{0};
-    int64_t batch_stride_Tensor{0};
-
-    typename LayoutA::Stride::Index lda{};
-    typename LayoutB::Stride::Index ldb{};
-    typename LayoutC::Stride::Index ldc{};
-    typename LayoutC::Stride::Index ldd{};
-    typename LayoutC::Stride::Index ldr{};
-    typename LayoutC::Stride::Index ldt{};
-
-    int avail_sms{-1};          /// The number of SMs that StreamK dispatch heuristics will attempt to load-balance across (-1 defaults to device width, 1 implies classic data-parallel scheduling)
-
-
-    //
-    // Methods
-    //
-
-    /// Default Constructor
-    Arguments() = default;
-
-    /// constructs an arguments structure
-    Arguments(
-      GemmUniversalMode mode,
-      GemmCoord problem_size,
-      int batch_split,                              /// Either (mode == GemmUniversalMode::kBatched) the batch count, or (mode == GemmUniversalMode::kGemm) the tile-splitting factor (1 defaults to StreamK, >1 emulates Split-K)
-      typename EpilogueOutputOp::Params epilogue,
-      void const * ptr_A,
-      void const * ptr_B,
-      void const * ptr_C,
-      void * ptr_D,
-      void * ptr_Vector,
-      void * ptr_Tensor,
-      int64_t batch_stride_A,
-      int64_t batch_stride_B,
-      int64_t batch_stride_C,
-      int64_t batch_stride_D,
-      int64_t batch_stride_Vector,
-      int64_t batch_stride_Tensor,
-      typename LayoutA::Stride::Index lda,
-      typename LayoutB::Stride::Index ldb,
-      typename LayoutC::Stride::Index ldc,
-      typename LayoutC::Stride::Index ldd,
-      typename LayoutC::Stride::Index ldr,
-      typename LayoutC::Stride::Index ldt,
-      int avail_sms = -1)                           /// The number of SMs that StreamK dispatch heuristics will attempt to load-balance across (-1 defaults to device width, 1 implies classic data-parallel scheduling)
-    :
-      mode(mode),
-      problem_size(problem_size),
-      batch_count(batch_split),
-      epilogue(epilogue),
-      ptr_A(ptr_A), ptr_B(ptr_B), ptr_C(ptr_C), ptr_D(ptr_D),
-      ptr_Vector(ptr_Vector),
-      ptr_Tensor(ptr_Tensor),
-      batch_stride_A(batch_stride_A),
-      batch_stride_B(batch_stride_B),
-      batch_stride_C(batch_stride_C),
-      batch_stride_Vector(batch_stride_Vector),
-      batch_stride_Tensor(batch_stride_Tensor),
-      lda(lda), ldb(ldb), ldc(ldc), ldd(ldd), ldr(ldr), ldt(ldt), avail_sms(avail_sms)
-    {
-      CUTLASS_TRACE_HOST("GemmStreamkWithFusedEpilogue::Arguments::Arguments() - problem_size: " << problem_size);
-      CUTLASS_TRACE_HOST("  ptr_Vector: " << (void *)this->ptr_Vector);
-      CUTLASS_TRACE_HOST("  ptr_Tensor: " << (void *)this->ptr_Tensor);
-      CUTLASS_TRACE_HOST("  ldr: " << this->ldr);
-      CUTLASS_TRACE_HOST("  ldt: " << this->ldt);
-      CUTLASS_TRACE_HOST("  avail_sms: " << this->avail_sms);
-    }
-
-    /// Returns arguments for the transposed problem
-    Arguments transposed_problem() const {
-      Arguments args(*this);
-
-      std::swap(args.problem_size.m(), args.problem_size.n());
-      std::swap(args.ptr_A, args.ptr_B);
-      std::swap(args.lda, args.ldb);
-      std::swap(args.batch_stride_A, args.batch_stride_B);
-
-      return args;
-    }
-  };
-
-
-  /// Parameters structure
-  struct Params
-  {
-
-  public:
-
-    //
-    // Data members
-    //
-
-    void * ptr_A{nullptr};
-    void * ptr_B{nullptr};
-
-    typename Mma::IteratorA::Params params_A{};
-    typename Mma::IteratorB::Params params_B{};
-
-    int64_t batch_stride_A{0};
-    int64_t batch_stride_B{0};
-
-    GemmUniversalMode mode{GemmUniversalMode::kGemm};
-
-    ThreadblockSwizzle block_mapping{};
-
-    void *barrier_workspace{nullptr};
-    void *partials_workspace{nullptr};
-
-    typename EpilogueOutputOp::Params output_op{};
-
-    void * ptr_C{nullptr};
-    void * ptr_D{nullptr};
-    void * ptr_Tensor{nullptr};
-    void * ptr_Vector{nullptr};
-
-    typename Epilogue::OutputTileIterator::Params params_C{};
-    typename Epilogue::OutputTileIterator::Params params_D{};
-    typename Epilogue::TensorTileIterator::Params params_Tensor{};
-
-    int64_t batch_stride_C{0};
-    int64_t batch_stride_D{0};
-    int64_t batch_stride_Vector{0};
-    int64_t batch_stride_Tensor{0};
-
-    typename LayoutC::Stride::Index ldr{};
-
-  protected:
-
-    //
-    // Host-only dispatch-utilities
-    //
-
-    /// Pad the given allocation size up to the nearest cache line
-    static size_t cacheline_align_up(size_t size)
-    {
-      static const int CACHELINE_SIZE = 128;
-      return (size + CACHELINE_SIZE - 1) / CACHELINE_SIZE * CACHELINE_SIZE;
-    }
-
-    /// Get the workspace size needed for barrier
-    size_t get_barrier_workspace_size() const
-    {
-      // For atomic reduction, each SK-block needs a synchronization flag.  For parallel reduction,
-      // each reduction block needs its own synchronization flag.
-      int sk_blocks = block_mapping.sk_regions() * block_mapping.sk_blocks_per_region();
-      int num_flags = fast_max(sk_blocks, block_mapping.reduction_blocks);
-
-      return cacheline_align_up(sizeof(typename Barrier::T) * num_flags);
-    }
-
-    /// Get the workspace size needed for intermediate partial sums
-    size_t get_partials_workspace_size() const
-    {
-      int sk_blocks = block_mapping.sk_regions() * block_mapping.sk_blocks_per_region();
-      return cacheline_align_up(kWorkspaceBytesPerBlock * sk_blocks);
-    }
-
-
-  public:
-    //
-    // Host dispatch API
-    //
-
-    /// Default constructor
-    Params() = default;
-
-    /// Constructor
-    Params(
-      Arguments const &args,  /// GEMM application arguments
-      int device_sms,         /// Number of SMs on the device
-      int sm_occupancy)       /// Kernel SM occupancy (in thread blocks)
-    :
-      params_A(args.lda),
-      params_B(args.ldb),
-      params_C(args.ldc),
-      params_D(args.ldd),
-      params_Tensor(args.ldt),
-      output_op(args.epilogue),
-      mode(args.mode),
-      ptr_A(const_cast<void *>(args.ptr_A)),
-      ptr_B(const_cast<void *>(args.ptr_B)),
-      ptr_C(const_cast<void *>(args.ptr_C)),
-      ptr_D(args.ptr_D),
-      ptr_Vector(args.ptr_Vector),
-      ldr(args.ldr),
-      ptr_Tensor(args.ptr_Tensor),
-      batch_stride_A(args.batch_stride_A),
-      batch_stride_B(args.batch_stride_B),
-      batch_stride_C(args.batch_stride_C),
-      batch_stride_D(args.batch_stride_D),
-      batch_stride_Vector(args.batch_stride_Vector),
-      batch_stride_Tensor(args.batch_stride_Tensor),
-      barrier_workspace(nullptr),
-      partials_workspace(nullptr)
-    {
-      CUTLASS_TRACE_HOST("GemmStreamkWithFusedEpilogue::Params::Params()");
-      CUTLASS_TRACE_HOST("  ptr_Vector: " << (void *)this->ptr_Vector);
-      CUTLASS_TRACE_HOST("  ptr_Tensor: " << (void *)this->ptr_Tensor);
-      CUTLASS_TRACE_HOST("  ldr: " << this->ldr);
-      CUTLASS_TRACE_HOST("  ldt: " << args.ldt);
-
-      // Number of SMs to make available for StreamK decomposition
-      int avail_sms = (args.avail_sms == -1) ?
-                        device_sms :
-                        fast_min(args.avail_sms, device_sms);
-      CUTLASS_TRACE_HOST("  avail_sms: " << avail_sms);
-
-      // Initialize the block mapping structure
-      block_mapping = ThreadblockSwizzle(
-        args.mode,
-        args.problem_size,
-        {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-        args.batch_count,
-        sm_occupancy,
-        device_sms,
-        avail_sms,
-        sizeof(ElementA),
-        sizeof(ElementB),
-        sizeof(ElementC),
-        Epilogue::kAccumulatorFragments);
-    }
-
-    /// Returns the workspace size (in bytes) needed for these parameters
-    size_t get_workspace_size() const
-    {
-      return
-        get_barrier_workspace_size() +
-        get_partials_workspace_size();
-    }
-
-
-    /// Assign and initialize the specified workspace buffer.  Assumes
-    /// the memory allocated to workspace is at least as large as get_workspace_size().
-    Status init_workspace(
-      void *workspace,
-      cudaStream_t stream = nullptr)
-    {
-      uint8_t *ptr = static_cast<uint8_t*>(workspace);
-
-      // Establish partials workspace
-      partials_workspace = nullptr;
-      size_t partials_workspace_bytes = get_partials_workspace_size();
-      if (partials_workspace_bytes > 0)
-      {
-        if (!workspace) {
-          return Status::kErrorWorkspaceNull;
-        }
-        partials_workspace = ptr;
-        ptr += partials_workspace_bytes;
-      }
-
-      // Establish barrier workspace
-      barrier_workspace = nullptr;
-      size_t barrier_workspace_bytes = get_barrier_workspace_size();
-      if (barrier_workspace_bytes > 0)
-      {
-        if (!workspace) {
-          return Status::kErrorWorkspaceNull;
-        }
-        barrier_workspace = ptr;
-        ptr += barrier_workspace_bytes;
-      }
-
-      // Zero-initialize barrier workspace
-      if (barrier_workspace)
-      {
-        size_t barrier_workspace_bytes = get_barrier_workspace_size();
-
-        CUTLASS_TRACE_HOST("  Initialize " << barrier_workspace_bytes << " barrier bytes");
-
-        cudaError_t result = cudaMemsetAsync(
-          barrier_workspace,
-          0,
-          barrier_workspace_bytes,
-          stream);
-
-        if (result != cudaSuccess) {
-          CUTLASS_TRACE_HOST("  cudaMemsetAsync() returned error " << cudaGetErrorString(result));
-          return Status::kErrorInternal;
-        }
-      }
-
-      return Status::kSuccess;
-    }
-
-
-    /// Returns the GEMM volume in thread block tiles
-    cutlass::gemm::GemmCoord get_tiled_shape() const
-    {
-      return block_mapping.tiled_shape();
-    }
-
-
-    /// Returns the total number of thread blocks to launch
-    int get_grid_blocks() const
-    {
-      dim3 grid_dims = get_grid_dims();
-      return grid_dims.x * grid_dims.y * grid_dims.z;
-    }
-
-
-    /// Returns the grid extents in thread blocks to launch
-    dim3 get_grid_dims() const
-    {
-      return block_mapping.get_grid_dims();
-    }
-
-    /// Lightweight update given a subset of arguments.  Problem geometry is assumed
-    /// to remain the same.
-    CUTLASS_HOST_DEVICE
-    void update(Arguments const &args)
-    {
-      ptr_A = const_cast<void *>(args.ptr_A);
-      ptr_B = const_cast<void *>(args.ptr_B);
-      ptr_C = const_cast<void *>(args.ptr_C);
-      ptr_D = args.ptr_D;
-
-      ptr_Vector = args.ptr_Vector;
-      ldr = args.ldr;
-      ptr_Tensor = args.ptr_Tensor;
-
-      batch_stride_A = args.batch_stride_A;
-      batch_stride_B = args.batch_stride_B;
-      batch_stride_C = args.batch_stride_C;
-      batch_stride_D = args.batch_stride_D;
-      batch_stride_Vector = args.batch_stride_Vector;
-      batch_stride_Tensor = args.batch_stride_Tensor;
-
-      output_op = args.epilogue;
-
-      CUTLASS_TRACE_HOST("GemmStreamkWithFusedEpilogue::Params::update()");
-      CUTLASS_TRACE_HOST("  ptr_Vector: " << (void *)this->ptr_Vector);
-      CUTLASS_TRACE_HOST("  ptr_Tensor: " << (void *)this->ptr_Tensor);
-      CUTLASS_TRACE_HOST("  ldr: " << this->ldr);
-    }
-  };
-
-  /// Tile work descriptor
-  struct TileWorkDesc
-  {
-    /// The linear tile index
-    int tile_idx;
-
-    /// The location of this tile (in threadblock-tile coordinates) in the output matrix
-    cutlass::gemm::GemmCoord tiled_coord;
-
-    // The first global-scoped MAC-iteration this threadblock will perform for this tile
-    int iter_begin;
-
-    // The starting index in the k-domain for MAC-iterations this threadblock will perform for this tile
-    int k_begin;
-
-    // The ending index (one-past) in the k-domain for MAC-iterations this threadblock will perform for this tile
-    int k_end;
-
-    /// The number of remaining MAC-iterations this threadblock will perform for this tile
-    int k_iters_remaining;
-
-    // Whether this block will perform the first iteration of this tile
-    CUTLASS_DEVICE
-    bool tile_started()
-    {
-      return (k_begin == 0);
-    }
-
-    // Whether this block will perform the last iteration of this tile
-    CUTLASS_DEVICE
-    bool tile_finished(Params const &params)
-    {
-      return (k_end == params.block_mapping.problem_size.k());
-    }
-  };
-
-
-  /// Shared memory storage structure
-  union SharedStorage {
-    typename Mma::SharedStorage main_loop;
-    typename Epilogue::SharedStorage epilogue;
-  };
-
-
-protected:
-
-  //
-  // Data members
-  //
-
-  /// GEMM problem parameters
-  Params const &params;
-
-  /// Shared storage reference
-  SharedStorage &shared_storage;
-
-  /// ID within the threadblock
-  int thread_idx;
-
-  /// ID of warp
-  int warp_idx;
-
-  /// ID of each thread within a warp
-  int lane_idx;
-
-  /// Threadblock scoped epilogue
-  Epilogue epilogue;
-
-
-public:
-
-  //
-  // Host dispatch API
-  //
-
-  /// Determines whether kernel satisfies alignment
-  static Status can_implement(
-    cutlass::gemm::GemmCoord const & problem_size) {
-
-    CUTLASS_TRACE_HOST("GemmStreamkWithFusedEpilogue::can_implement()");
-
-    static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
-    static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
-    static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
-
-    bool isAMisaligned = false;
-    bool isBMisaligned = false;
-    bool isCMisaligned = false;
-
-    if (platform::is_same<LayoutA, layout::RowMajor>::value) {
-      isAMisaligned = problem_size.k() % kAlignmentA;
-    } else if (platform::is_same<LayoutA, layout::ColumnMajor>::value) {
-      isAMisaligned = problem_size.m() % kAlignmentA;
-    } else if (platform::is_same<LayoutA, layout::ColumnMajorInterleaved<32>>::value
-            || platform::is_same<LayoutA, layout::ColumnMajorInterleaved<64>>::value) {
-      isAMisaligned = problem_size.k() % kAlignmentA;
-    }
-
-    if (platform::is_same<LayoutB, layout::RowMajor>::value) {
-      isBMisaligned = problem_size.n() % kAlignmentB;
-    } else if (platform::is_same<LayoutB, layout::ColumnMajor>::value) {
-      isBMisaligned = problem_size.k() % kAlignmentB;
-    } else if (platform::is_same<LayoutB, layout::RowMajorInterleaved<32>>::value
-            || platform::is_same<LayoutB, layout::RowMajorInterleaved<64>>::value) {
-      isBMisaligned = problem_size.k() % kAlignmentB;
-    }
-
-    if (platform::is_same<LayoutC, layout::RowMajor>::value) {
-      isCMisaligned = problem_size.n() % kAlignmentC;
-    } else if (platform::is_same<LayoutC, layout::ColumnMajor>::value) {
-      isCMisaligned = problem_size.m() % kAlignmentC;
-    } else if (platform::is_same<LayoutC, layout::ColumnMajorInterleaved<32>>::value
-            || platform::is_same<LayoutC, layout::ColumnMajorInterleaved<64>>::value) {
-      isCMisaligned = problem_size.n() % kAlignmentC;
-    }
-
-    if (isAMisaligned) {
-      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for A operand");
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (isBMisaligned) {
-      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for B operand");
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (isCMisaligned) {
-      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for C operand");
-      return Status::kErrorMisalignedOperand;
-    }
-
-    CUTLASS_TRACE_HOST("  returning kSuccess");
-
-    return Status::kSuccess;
-  }
-
-  static Status can_implement(Arguments const &args) {
-    return can_implement(args.problem_size);
-  }
-
-protected:
-
-  //
-  // Device-only utility methods
-  //
-
-  /// Iterator for fetching tile fragments from A
-  CUTLASS_DEVICE
-  typename Mma::IteratorA init_iterator_A(
-    TileWorkDesc &tile_work,
-    GemmUniversalMode mode)
-  {
-    // The input A matrix
-    ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A);
-
-    // Update input pointers based on batched/array mode
-    if (mode == GemmUniversalMode::kBatched) {
-      ptr_A += tile_work.tiled_coord.k() * params.batch_stride_A;
-    }
-    if (mode == GemmUniversalMode::kArray) {
-      ptr_A = static_cast<ElementA * const *>(params.ptr_A)[tile_work.tiled_coord.k()];
-    }
-
-    int m_begin = tile_work.tiled_coord.m() * Mma::Shape::kM;
-    int m_end = params.block_mapping.problem_size.m();
-    return Mma::IteratorA(
-        params.params_A,
-        ptr_A,
-        { m_end, tile_work.k_end },
-        threadIdx.x,
-        { m_begin, tile_work.k_begin });
-
-  }
-
-
-  /// Iterator for fetching tile fragments from B
-  CUTLASS_DEVICE
-  typename Mma::IteratorB init_iterator_B(
-    TileWorkDesc &tile_work,
-    GemmUniversalMode mode)
-  {
-    // The input B matrix
-    ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);
-
-    // Update input pointers based on batched/array mode
-    if (mode == GemmUniversalMode::kBatched) {
-      ptr_B += tile_work.tiled_coord.k() * params.batch_stride_B;
-    }
-    if (mode == GemmUniversalMode::kArray) {
-      ptr_B = static_cast<ElementB * const *>(params.ptr_B)[tile_work.tiled_coord.k()];
-    }
-
-    int n_begin = tile_work.tiled_coord.n() * Mma::Shape::kN;
-    int n_end = params.block_mapping.problem_size.n();
-    return Mma::IteratorB(
-        params.params_B,
-        ptr_B,
-        { tile_work.k_end, n_end },
-        threadIdx.x,
-        { tile_work.k_begin, n_begin });
-  }
-
-
-  CUTLASS_DEVICE
-  void init_dp_tile_work(
-      TileWorkDesc &tile_work,
-      int tile_idx)
-  {
-    // The linear tile index
-    tile_work.tile_idx = tile_idx;
-
-    // The first global-scoped MAC-iteration this threadblock will perform for this tile
-    tile_work.iter_begin = tile_idx * params.block_mapping.iters_per_tile();
-
-    // The number of MAC-iterations this threadblock will perform for this tile
-    tile_work.k_iters_remaining = params.block_mapping.iters_per_tile();
-
-    // The starting index in the k-domain for MAC-iterations this threadblock will perform for this tile
-    tile_work.k_begin = 0;
-
-    // The ending index (one-past) in the k-domain for MAC-iterations this threadblock will perform for this tile
-    tile_work.k_end = params.block_mapping.problem_size.k();
-
-    // The location of this tile (in threadblock-tile coordinates) in the output matrix
-    tile_work.tiled_coord = params.block_mapping.get_tile_offset(tile_work.tile_idx);
-  }
-
-
-  CUTLASS_DEVICE
-  void init_sk_tile_work(
-      TileWorkDesc &tile_work,
-      int tile_idx,
-      int block_iter_begin,
-      int block_iter_end)
-  {
-    // The linear tile index
-    tile_work.tile_idx = tile_idx;
-
-    // The first global-scoped MAC-iteration for this tile
-    int tile_iter_begin = tile_idx * params.block_mapping.iters_per_tile();
-
-    // The first global-scoped MAC-iteration this threadblock will perform for this tile
-    tile_work.iter_begin = max(block_iter_begin, tile_iter_begin);
-
-    // The first tile-scoped MAC-iteration this threadblock will perform for this tile
-    int k_iter_begin = tile_work.iter_begin - tile_iter_begin;
-
-    // The last (one past) tile-scoped MAC-iteration this threadblock will perform for this tile
-    int k_iter_end = block_iter_end - tile_iter_begin;
-
-    // The number of MAC-iterations this threadblock will perform for this tile
-    tile_work.k_iters_remaining = k_iter_end - k_iter_begin;
-
-    // The starting index in the k-domain for MAC-iterations this threadblock will perform for this tile
-    tile_work.k_begin = k_iter_begin * Mma::Shape::kK;
-
-    // The ending index (one-past) in the k-domain for MAC-iterations this threadblock will perform for this tile
-    tile_work.k_end = min(
-        params.block_mapping.problem_size.k(),            // extent of k domain
-        (k_iter_end * Mma::Shape::kK));                   // extent of the threadblock's global iteration assignment
-
-    // The location of this tile (in threadblock-tile coordinates) in the output matrix
-    tile_work.tiled_coord = params.block_mapping.get_tile_offset(tile_work.tile_idx);
-  }
-
-
-  /// Share accumulators with peers
-  CUTLASS_DEVICE
-  void share_accumulators(
-    AccumulatorTile const &accumulator_tile,
-    int block_idx,
-    int first_block_idx)
-  {
-    AccumulatorTile *accum_tile_workspace = reinterpret_cast<AccumulatorTile *>(params.partials_workspace);
-
-    int accum_tile_offset = first_block_idx * kThreadCount;
-
-    if (block_idx == first_block_idx)
-    {
-      // First peer initializes the workspace partials
-      BlockStripedReduceT::store(accum_tile_workspace + accum_tile_offset, accumulator_tile, thread_idx);
-    }
-    else
-    {
-      // Subsequent peers atomically accumulate into the workspace partials
-      if (ThreadblockSwizzle::kReductionStrategy == ThreadblockSwizzle::kAtomic)
-      {
-        // Non-deterministic reduction order: wait for the first peer to have initialized the partials before we add to them
-        Barrier::wait_lt(params.barrier_workspace, thread_idx, first_block_idx, 1);
-      }
-      else
-      {
-        // Turnstile reduction order: wait until the previous peer has written
-        int wait_count = block_idx - first_block_idx;
-        Barrier::wait_eq(params.barrier_workspace, thread_idx, first_block_idx, wait_count);
-      }
-
-      // Perform reduction in workspace
-      BlockStripedReduceT::reduce(accum_tile_workspace + accum_tile_offset, accumulator_tile, thread_idx);
-    }
-
-    // Signal our arrival
-    Barrier::arrive_inc(params.barrier_workspace, thread_idx, first_block_idx);
-  }
-
-
-  /// Acquire accumulators from peers
-  CUTLASS_DEVICE
-  void acquire_accumulators(
-    AccumulatorTile &accumulator_tile,
-    int block_idx,
-    int first_block_idx)
-  {
-    AccumulatorTile *accum_tile_workspace = reinterpret_cast<AccumulatorTile *>(params.partials_workspace);
-
-    // Wait for arrival
-    int num_carry_in = block_idx - first_block_idx;
-    Barrier::wait_eq_reset(params.barrier_workspace, thread_idx, first_block_idx, num_carry_in);
-
-    // Load and add peer-partials accumulator tile to local accumulator tile
-    int accum_tile_offset = first_block_idx * kThreadCount;
-    BlockStripedReduceT::load_add(accumulator_tile, accum_tile_workspace + accum_tile_offset, thread_idx);
-  }
-
-
-  /// Perform epilogue computations and output
-  CUTLASS_DEVICE
-  void do_epilogue(
-    TileWorkDesc &tile_work,
-    AccumulatorTile &accumulator_tile)
-  {
-    ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C);
-    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
-    typename Epilogue::ElementTensor *ptr_Tensor = static_cast<typename Epilogue::ElementTensor *>(params.ptr_Tensor);
-
-    // Define the reduction output pointer and move to the appropriate place
-    typename Epilogue::ElementVector *ptr_Vector =
-      static_cast<typename Epilogue::ElementVector *>(params.ptr_Vector);
-
-    // Update pointers for batched/array mode(s)
-    if (params.mode == GemmUniversalMode::kBatched) {
-      ptr_C += tile_work.tiled_coord.k() * params.batch_stride_C;
-      ptr_D += tile_work.tiled_coord.k() * params.batch_stride_D;
-      if (ptr_Tensor) {
-        ptr_Tensor = ReferenceFactory<typename Epilogue::ElementTensor>::add_pointer_offset(
-          ptr_Tensor,
-          tile_work.tiled_coord.k() * params.batch_stride_Tensor);
-      }
-      if (ptr_Vector) {
-        ptr_Vector += tile_work.tiled_coord.k() * params.batch_stride_Vector;
-      }
-    }
-    if (params.mode == GemmUniversalMode::kArray) {
-      ptr_C = static_cast<ElementC * const *>(params.ptr_C)[tile_work.tiled_coord.k()];
-      ptr_D = static_cast<ElementC * const *>(params.ptr_D)[tile_work.tiled_coord.k()];
-      if (ptr_Tensor) {
-        ptr_Tensor = static_cast<typename Epilogue::ElementTensor * const *>(params.ptr_Tensor)[tile_work.tiled_coord.k()];
-      }
-      if (ptr_Vector) {
-        ptr_Vector = static_cast<typename Epilogue::ElementVector * const *>(params.ptr_Vector)[tile_work.tiled_coord.k()];
-      }
-    }
-
-    // Location of this tile in item-coords
-    MatrixCoord threadblock_item_begin(
-      tile_work.tiled_coord.m() * Mma::Shape::kM,
-      tile_work.tiled_coord.n() * Mma::Shape::kN
-    );
-
-    // Tile iterator loading from source tensor.
-    typename Epilogue::OutputTileIterator iterator_C(
-        params.params_C,
-        ptr_C,
-        params.block_mapping.problem_size.mn(),
-        thread_idx,
-        threadblock_item_begin);
-
-    // Tile iterator writing to destination tensor.
-    typename Epilogue::OutputTileIterator iterator_D(
-        params.params_D,
-        ptr_D,
-        params.block_mapping.problem_size.mn(),
-        thread_idx,
-        threadblock_item_begin);
-
-    // Additional tensor to load from
-    typename Epilogue::TensorTileIterator tensor_iterator(
-        params.params_Tensor,
-        ptr_Tensor,
-        params.block_mapping.problem_size.mn(),
-        thread_idx,
-        threadblock_item_begin);
-
-    // Move to appropriate location for this output tile
-    if (ptr_Vector) {
-      ptr_Vector += threadblock_item_begin.column() + tile_work.tiled_coord.m() * params.ldr;
-    }
-
-    // Execute the epilogue operator to update the destination tensor.
-    epilogue(
-        EpilogueOutputOp(params.output_op),
-        ptr_Vector,
-        iterator_D,
-        accumulator_tile,
-        iterator_C,
-        tensor_iterator,
-        params.block_mapping.problem_size.mn(),
-        threadblock_item_begin);
-  }
-
-
-  CUTLASS_DEVICE
-  void separate_reduction(int reduce_idx)
-  {
-    int peer_idx_begin, peer_idx_last, reduce_tile_idx, reduce_fragment_idx;
-
-    // Reduce by sk-tile (every tile contributed to by one or more blocks)
-    reduce_tile_idx = reduce_idx / Epilogue::kAccumulatorFragments;
-    reduce_fragment_idx = reduce_idx % Epilogue::kAccumulatorFragments;
-
-    int iter_tile_first = reduce_tile_idx * params.block_mapping.iters_per_tile();
-    int iter_tile_last = iter_tile_first + params.block_mapping.iters_per_tile() - 1;
-
-    peer_idx_begin = params.block_mapping.get_sk_block_idx(iter_tile_first);
-    peer_idx_last = params.block_mapping.get_sk_block_idx(iter_tile_last);
-
-    // Wait for peers to complete
-    int peer_idx_end = peer_idx_last + 1;
-    int num_peers = peer_idx_end - peer_idx_begin;
-    Barrier::wait_eq_reset(
-        params.barrier_workspace,
-        thread_idx,
-        (reduce_tile_idx * Epilogue::kAccumulatorFragments) + reduce_fragment_idx,
-        num_peers);
-
-    /// The location of this tile (in threadblock-tile coordinates) in the output matrix
-    GemmCoord tiled_coord = params.block_mapping.get_tile_offset(reduce_tile_idx);
-
-    // Location of this tile in item-coords
-    MatrixCoord threadblock_item_begin(
-      tiled_coord.m() * Mma::Shape::kM,
-      tiled_coord.n() * Mma::Shape::kN
-    );
-
-    ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C);
-    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
-    typename Epilogue::ElementTensor *ptr_Tensor = static_cast<typename Epilogue::ElementTensor *>(params.ptr_Tensor);
-
-    // Define the reduction output pointer and move to the appropriate place
-    typename Epilogue::ElementVector *ptr_Vector =
-      static_cast<typename Epilogue::ElementVector *>(params.ptr_Vector);
-
-    // Tile iterator loading from source tensor.
-    typename Epilogue::OutputTileIterator iterator_C(
-        params.params_C,
-        ptr_C,
-        params.block_mapping.problem_size.mn(),
-        thread_idx,
-        threadblock_item_begin);
-
-    // Tile iterator writing to destination tensor.
-    typename Epilogue::OutputTileIterator iterator_D(
-        params.params_D,
-        ptr_D,
-        params.block_mapping.problem_size.mn(),
-        thread_idx,
-        threadblock_item_begin);
-
-    // Additional tensor to load from
-    typename Epilogue::TensorTileIterator tensor_iterator(
-        params.params_Tensor,
-        ptr_Tensor,
-        params.block_mapping.problem_size.mn(),
-        thread_idx,
-        threadblock_item_begin);
-
-    // Move to appropriate location for this output tile
-    if (ptr_Vector) {
-      ptr_Vector += threadblock_item_begin.column() + tiled_coord.m() * params.ldr;
-    }
-
-    // Execute the epilogue operator to update the destination tensor.
-    epilogue.reduce(
-        peer_idx_begin,
-        peer_idx_end,
-        reduce_fragment_idx,
-        params.partials_workspace,
-        EpilogueOutputOp(params.output_op),
-        ptr_Vector,
-        iterator_D,
-        iterator_C,
-        tensor_iterator,
-        params.block_mapping.problem_size.mn(),
-        threadblock_item_begin);
-  }
-
-
-  CUTLASS_DEVICE
-  void process_tile(
-    TileWorkDesc tile_work,
-    int block_idx,
-    int dp_start_block_idx,
-    int block_iter_begin)
-  {
-    // Initialize input iterators
-    typename Mma::IteratorA iterator_A = init_iterator_A(tile_work, params.mode);
-    typename Mma::IteratorB iterator_B = init_iterator_B(tile_work, params.mode);
-
-    // Initialize accumulators
-    AccumulatorTile accumulator_tile;
-    accumulator_tile.clear();
-
-    // Initialize MMA abstraction
-    Mma mma(
-      shared_storage.main_loop,
-      thread_idx,
-      warp_idx,
-      lane_idx);
-
-    // Perform this tile's range of multiply-accumulate (MAC) iterations
-    mma(tile_work.k_iters_remaining, accumulator_tile, iterator_A, iterator_B, accumulator_tile);
-
-    if ((ThreadblockSwizzle::kReductionStrategy == ThreadblockSwizzle::kAtomic) ||
-        (params.block_mapping.reduction_blocks == 0) ||
-        (block_idx >= dp_start_block_idx))
-    {
-      //
-      // Cooperative SK peer reduction or DP block
-      //
-
-      int first_block_idx = params.block_mapping.get_first_block_idx(tile_work.tile_idx, block_idx);
-
-      if (!tile_work.tile_finished(params)) {
-        // Non "finishing" SK blocks must share their partial accumulator sums through global scratch workspace
-        share_accumulators(accumulator_tile, block_idx, first_block_idx);
-      }
-      else
-      {
-        // DP blocks and "finishing" SK blocks must perform epilogue operations and write the output tile
-        if (!tile_work.tile_started())
-        {
-          // A "finishing" SK block must first aggregate its accumulator partial sums with those shared by peer threadblocks
-          acquire_accumulators(accumulator_tile, block_idx, first_block_idx);
-        }
-
-        do_epilogue(tile_work, accumulator_tile);
-      }
-    }
-    else
-    {
-      //
-      // Separate peer reduction
-      //
-
-      // Share accumulator partial sums with peer threadblock(s) through scratch workspace
-      epilogue.share(block_idx, params.partials_workspace, accumulator_tile, tile_work.tile_started());
-
-      // Signal arrival
-      Barrier::arrive_range_inc(
-        params.barrier_workspace,
-        thread_idx,
-        tile_work.tile_idx * Epilogue::kAccumulatorFragments,
-        Epilogue::kAccumulatorFragments);
-    }
-  }
-
-
-  /// Executes one GEMM
-  CUTLASS_DEVICE
-  void gemm()
-  {
-    // Initialize block's iteration range
-    int tile_idx = 0;
-    int block_iter_begin = 0;
-    int block_iters_remaining = 0;
-
-    int block_idx = params.block_mapping.get_block_idx();
-
-    int sk_padding_start_block_idx =  params.block_mapping.sk_regions() * params.block_mapping.sk_blocks_per_region();
-    int dp_start_block_idx = params.block_mapping.sk_waves * params.block_mapping.avail_sms;
-    int reduce_start_block_idx = dp_start_block_idx + params.block_mapping.dp_blocks;
-    int grid_padding_start_block_idx = reduce_start_block_idx + params.block_mapping.reduction_blocks;
-
-    // Initialize tile work descriptor
-    TileWorkDesc tile_work;
-
-    bool dp_block = (block_idx >= dp_start_block_idx) && (block_idx < reduce_start_block_idx);
-    bool sk_block = (block_idx < sk_padding_start_block_idx);
-    bool reduce_block = (block_idx >= reduce_start_block_idx) &&
-            (block_idx < grid_padding_start_block_idx) &&
-            (ThreadblockSwizzle::kReductionStrategy == ThreadblockSwizzle::kMixed);
-
-    if (dp_block)
-    {
-      // This is a DP block
-      int dp_block_idx = block_idx - dp_start_block_idx;
-      int first_dp_tile = (params.block_mapping.cohort_raster) ? 0 : params.block_mapping.sk_tiles;
-
-      // Blocks in first DP wave get configured number of tiles
-      tile_idx = first_dp_tile + dp_block_idx;
-      int tile_allottment = params.block_mapping.dp_first_wave_tiles;
-
-      // Blocks in subsequent DP waves get 1 tile
-      if (dp_block_idx >= params.block_mapping.avail_sms) {
-          tile_allottment = 1;
-          tile_idx += (params.block_mapping.dp_first_wave_tiles - 1) * params.block_mapping.avail_sms;
-      }
-
-      block_iters_remaining = params.block_mapping.iters_per_tile() * tile_allottment;
-
-      init_dp_tile_work(tile_work, tile_idx);
-
-      // DP blocks exit if out of bounds or overlap an SK tile (only possible during cohort rasterization, where dp_first_wave_tiles must be 1)
-      if ((tile_idx < params.block_mapping.sk_tiles) ||
-          (tile_work.tiled_coord.m() >= params.block_mapping.tiled_shape().m()) ||
-          (tile_work.tiled_coord.n() >= params.block_mapping.tiled_shape().n()))
-      {
-        return;
-      }
-    }
-    else if (sk_block)
-    {
-      // This is a SK block
-      int block_iter_end;
-      params.block_mapping.get_iter_extents(block_idx, block_iter_begin, block_iter_end);
-      block_iters_remaining = block_iter_end - block_iter_begin;
-
-      tile_idx = params.block_mapping.get_sk_tile_idx(block_iter_end - 1);
-      init_sk_tile_work(tile_work, tile_idx, block_iter_begin, block_iter_begin + block_iters_remaining);
-    }
-    else
-    {
-      if (reduce_block)
-      {
-        // This is a reduction threadblock
-        int reduce_block_idx = block_idx - reduce_start_block_idx;
-        separate_reduction(reduce_block_idx);
-      }
-
-      return;
-    }
-
-    // Iteration-processing loop body
-    CUTLASS_PRAGMA_NO_UNROLL
-    while (true)
-    {
-      // Perform this block's share of work for this tile
-      process_tile(
-        tile_work,
-        block_idx,
-        dp_start_block_idx,
-        block_iter_begin);
-
-      block_iters_remaining -= tile_work.k_iters_remaining;
-
-      if (block_iters_remaining == 0)
-      {
-        break;
-      }
-
-      // Continue to next tile
-      __syncthreads();
-
-      if (block_idx >= dp_start_block_idx)
-      {
-        // DP block consume their tiles at stride
-        tile_idx += params.block_mapping.avail_sms;
-        init_dp_tile_work(tile_work, tile_idx);
-      }
-      else
-      {
-        // SK blocks consume their tiles in backwards order
-        tile_idx--;
-        init_sk_tile_work(tile_work, tile_idx, block_iter_begin, block_iter_begin + block_iters_remaining);
-      }
-    }
-
-  }
-
-
-public:
-
-  //
-  // Device-only API
-  //
-
-  // Factory invocation
-  CUTLASS_DEVICE
-  static void invoke(
-    Params const &params,
-    SharedStorage &shared_storage)
-  {
-    GemmStreamkWithFusedEpilogue op(params, shared_storage);
-    op();
-  }
-
-
-  // Constructor
-  CUTLASS_DEVICE
-  GemmStreamkWithFusedEpilogue(
-      Params const &params,
-      SharedStorage &shared_storage)
-    :
-      params(params),
-      shared_storage(shared_storage),
-      thread_idx(threadIdx.x),
-      warp_idx(__shfl_sync(0xffffffff, threadIdx.x / 32, 0)),   // broadcast the warp_id computed by lane 0 to ensure dependent code
-      lane_idx(threadIdx.x % 32),
-      epilogue(
-        shared_storage.epilogue,
-        thread_idx,
-        warp_idx,
-        lane_idx)
-  {}
-
-  /// Executes one GEMM
-  CUTLASS_DEVICE
-  void operator()() {
-    // Generic SK code path
-    gemm();
-
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_transpose_operands.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_transpose_operands.h
deleted file mode 100755
index 4a2258c41..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_transpose_operands.h
+++ /dev/null
@@ -1,124 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! 
-  \file
-  \brief The universal GEMM accommodates serial reductions, parallel reductions, batched strided, and 
-    batched array variants.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/gemm.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace detail {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename ElementA_, 
-  typename LayoutA_, 
-  ComplexTransform TransformA,
-  int AlignmentA,
-  typename ElementB_,
-  typename LayoutB_,
-  ComplexTransform TransformB,
-  int AlignmentB,
-  typename LayoutC_,
-  bool Transpose
->
-struct MapArguments {
-  using ElementA = ElementA_;
-  using LayoutA = LayoutA_;
-  static ComplexTransform const kTransformA = TransformA;
-  static int const kAlignmentA = AlignmentA; 
-  using ElementB = ElementB_;
-  using LayoutB = LayoutB_;
-  static ComplexTransform const kTransformB = TransformB;
-  static int const kAlignmentB = AlignmentB; 
-  using LayoutC = LayoutC_;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename ElementA_, 
-  typename LayoutA_, 
-  ComplexTransform TransformA,
-  int AlignmentA,
-  typename ElementB_,
-  typename LayoutB_,
-  ComplexTransform TransformB,
-  int AlignmentB,
-  typename LayoutC_
->
-struct MapArguments<
-  ElementA_,
-  LayoutA_,
-  TransformA,
-  AlignmentA, 
-  ElementB_,
-  LayoutB_,
-  TransformB,
-  AlignmentB,
-  LayoutC_,
-  true
-> {
-  using ElementA = ElementB_;
-  using LayoutA = typename layout::LayoutTranspose<LayoutB_>::type;
-  static ComplexTransform const kTransformA = TransformB;
-  static int const kAlignmentA = AlignmentB; 
-  using ElementB = ElementA_;
-  using LayoutB = typename layout::LayoutTranspose<LayoutA_>::type;
-  static ComplexTransform const kTransformB = TransformA;
-  static int const kAlignmentB = AlignmentA; 
-  using LayoutC = typename layout::LayoutTranspose<LayoutC_>::type;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}
-}
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal.h
deleted file mode 100755
index 08b30c74c..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal.h
+++ /dev/null
@@ -1,702 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/arch/arch.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/complex.h"
-#include "cutlass/semaphore.h"
-#include "cutlass/gemm/kernel/gemm_universal.hpp"
-
-#include "cutlass/layout/matrix.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/kernel/params_universal_base.h"
-#include "cutlass/trace.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate
-  typename Epilogue_,             ///! Epilogue
-  typename ThreadblockSwizzle_    ///! Threadblock swizzling function
->
-class GemmUniversal<
-  Mma_,
-  Epilogue_,
-  ThreadblockSwizzle_,
-  void,
-  // 3.x kernels use the first template argument to define the ProblemShape
-  // We use this invariant to SFINAE dispatch against either the 2.x API or the 3.x API
-  cute::enable_if_t<not (cute::is_tuple<Mma_>::value || IsCutlass3ArrayKernel<Mma_>::value)>
-> {
-public:
-
-  using Mma = Mma_;
-  using Epilogue = Epilogue_;
-  using EpilogueOutputOp = typename Epilogue::OutputOp;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-
-  using ElementA = typename Mma::IteratorA::Element;
-  using LayoutA = typename Mma::IteratorA::Layout;
-  using ElementB = typename Mma::IteratorB::Element;
-  using LayoutB = typename Mma::IteratorB::Layout;
-  using ElementC = typename Epilogue::OutputTileIterator::Element;
-  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
-
-  static ComplexTransform const kTransformA = Mma::kTransformA;
-  static ComplexTransform const kTransformB = Mma::kTransformB;
-  using Operator = typename Mma::Operator;
-
-  using OperatorClass = typename Mma::Operator::OperatorClass;
-  using ThreadblockShape = typename Mma::Shape;
-  using WarpShape = typename Mma::Operator::Shape;
-  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
-  using ArchTag = typename Mma::ArchTag;
-
-  static int const kStages = Mma::kStages;
-  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
-  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
-  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
-
-  /// Warp count (concept: GemmShape)
-  using WarpCount = typename Mma::WarpCount;
-  static int const kThreadCount = 32 * WarpCount::kCount;
-
-  /// Split-K preserves splits that are 128b aligned
-  static int const kSplitKAlignment = const_max(128 / sizeof_bits<ElementA>::value, 128 / sizeof_bits<ElementB>::value);
-
-  //
-  // Structures
-  //
-
-  /// Argument structure
-  struct Arguments : UniversalArgumentsBase
-  {
-    //
-    // Data members
-    //
-
-    typename EpilogueOutputOp::Params epilogue;
-
-    void const * ptr_A;
-    void const * ptr_B;
-    void const * ptr_C;
-    void * ptr_D;
-
-    int64_t batch_stride_A;
-    int64_t batch_stride_B;
-    int64_t batch_stride_C;
-
-    typename LayoutA::Stride stride_a;
-    typename LayoutB::Stride stride_b;
-    typename LayoutC::Stride stride_c;
-    typename LayoutC::Stride stride_d;
-
-    typename LayoutA::Stride::LongIndex lda;
-    typename LayoutB::Stride::LongIndex ldb;
-    typename LayoutC::Stride::LongIndex ldc;
-    typename LayoutC::Stride::LongIndex ldd;
-
-    int const * ptr_gather_A_indices;
-    int const * ptr_gather_B_indices;
-    int const * ptr_scatter_D_indices;
-
-    //
-    // Methods
-    //
-
-    Arguments():
-      ptr_A(nullptr), ptr_B(nullptr), ptr_C(nullptr), ptr_D(nullptr),
-      ptr_gather_A_indices(nullptr),
-      ptr_gather_B_indices(nullptr),
-      ptr_scatter_D_indices(nullptr)
-    {}
-
-    /// constructs an arguments structure
-    Arguments(
-      GemmUniversalMode mode,
-      GemmCoord problem_size,
-      int batch_count,
-      typename EpilogueOutputOp::Params epilogue,
-      void const * ptr_A,
-      void const * ptr_B,
-      void const * ptr_C,
-      void * ptr_D,
-      int64_t batch_stride_A,
-      int64_t batch_stride_B,
-      int64_t batch_stride_C,
-      int64_t batch_stride_D,
-      typename LayoutA::Stride stride_a,
-      typename LayoutB::Stride stride_b,
-      typename LayoutC::Stride stride_c,
-      typename LayoutC::Stride stride_d,
-      int const *ptr_gather_A_indices = nullptr,
-      int const *ptr_gather_B_indices = nullptr,
-      int const *ptr_scatter_D_indices = nullptr)
-    :
-      UniversalArgumentsBase(mode, problem_size, batch_count, batch_stride_D),
-      epilogue(epilogue),
-      ptr_A(ptr_A), ptr_B(ptr_B), ptr_C(ptr_C), ptr_D(ptr_D),
-      batch_stride_A(batch_stride_A), batch_stride_B(batch_stride_B), batch_stride_C(batch_stride_C),
-      stride_a(stride_a), stride_b(stride_b), stride_c(stride_c), stride_d(stride_d),
-      ptr_gather_A_indices(ptr_gather_A_indices), ptr_gather_B_indices(ptr_gather_B_indices),
-      ptr_scatter_D_indices(ptr_scatter_D_indices)
-    {
-      lda = 0;
-      ldb = 0;
-      ldc = 0;
-      ldd = 0;
-      CUTLASS_TRACE_HOST("GemmUniversal::Arguments::Arguments() - problem_size: " << problem_size);
-    }
-
-    /// constructs an arguments structure
-    Arguments(
-      GemmUniversalMode mode,
-      GemmCoord problem_size,
-      int batch_count,
-      typename EpilogueOutputOp::Params epilogue,
-      void const * ptr_A,
-      void const * ptr_B,
-      void const * ptr_C,
-      void * ptr_D,
-      int64_t batch_stride_A,
-      int64_t batch_stride_B,
-      int64_t batch_stride_C,
-      int64_t batch_stride_D,
-      typename LayoutA::Stride::LongIndex lda,
-      typename LayoutB::Stride::LongIndex ldb,
-      typename LayoutC::Stride::LongIndex ldc,
-      typename LayoutC::Stride::LongIndex ldd,
-      int const *ptr_gather_A_indices = nullptr,
-      int const *ptr_gather_B_indices = nullptr,
-      int const *ptr_scatter_D_indices = nullptr
-    ):
-      UniversalArgumentsBase(mode, problem_size, batch_count, batch_stride_D),
-      epilogue(epilogue),
-      ptr_A(ptr_A), ptr_B(ptr_B), ptr_C(ptr_C), ptr_D(ptr_D),
-      batch_stride_A(batch_stride_A), batch_stride_B(batch_stride_B), batch_stride_C(batch_stride_C),
-      lda(lda), ldb(ldb), ldc(ldc), ldd(ldd),
-      ptr_gather_A_indices(ptr_gather_A_indices), ptr_gather_B_indices(ptr_gather_B_indices),
-      ptr_scatter_D_indices(ptr_scatter_D_indices)
-    {
-      stride_a = make_Coord(lda);
-      stride_b = make_Coord(ldb);
-      stride_c = make_Coord(ldc);
-      stride_d = make_Coord(ldd);
-      CUTLASS_TRACE_HOST("GemmUniversal::Arguments::Arguments() - problem_size: " << problem_size);
-    }
-
-    /// Returns arguments for the transposed problem
-    Arguments transposed_problem() const
-    {
-      Arguments args(*this);
-
-      std::swap(args.problem_size.m(), args.problem_size.n());
-      std::swap(args.ptr_A, args.ptr_B);
-      std::swap(args.lda, args.ldb);
-      std::swap(args.stride_a, args.stride_b);
-      std::swap(args.batch_stride_A, args.batch_stride_B);
-      std::swap(args.ptr_gather_A_indices, args.ptr_gather_B_indices);
-
-      return args;
-    }
-  };
-
-
-  //
-  // Structure for precomputing values in host memory and passing to kernels
-  //
-
-  /// Parameters structure
-  struct Params : UniversalParamsBase<
-    ThreadblockSwizzle,
-    ThreadblockShape,
-    ElementA,
-    ElementB,
-    ElementC,
-    LayoutA,
-    LayoutB>
-  {
-    using ParamsBase = UniversalParamsBase<
-      ThreadblockSwizzle,
-      ThreadblockShape,
-      ElementA,
-      ElementB,
-      ElementC,
-      LayoutA,
-      LayoutB>;
-
-    //
-    // Data members
-    //
-
-    typename Mma::IteratorA::Params params_A;
-    typename Mma::IteratorB::Params params_B;
-    typename Epilogue::OutputTileIterator::Params params_C;
-    typename Epilogue::OutputTileIterator::Params params_D;
-
-    typename EpilogueOutputOp::Params output_op;
-
-    void * ptr_A;
-    void * ptr_B;
-    void * ptr_C;
-    void * ptr_D;
-
-    int64_t batch_stride_A;
-    int64_t batch_stride_B;
-    int64_t batch_stride_C;
-
-    int * ptr_gather_A_indices;
-    int * ptr_gather_B_indices;
-    int * ptr_scatter_D_indices;
-
-    //
-    // Host dispatch API
-    //
-
-    /// Default constructor
-    Params() = default;
-
-    /// Constructor
-    Params(
-      Arguments const &args,  /// GEMM application arguments
-      int device_sms,         /// Number of SMs on the device
-      int sm_occupancy)       /// Kernel SM occupancy (in thread blocks)
-    :
-      ParamsBase(args, device_sms, sm_occupancy),
-      params_A(args.lda ? make_Coord_with_padding<LayoutA::kStrideRank>(args.lda) : args.stride_a),
-      params_B(args.ldb ? make_Coord_with_padding<LayoutB::kStrideRank>(args.ldb) : args.stride_b),
-      params_C(args.ldc ? make_Coord_with_padding<LayoutC::kStrideRank>(args.ldc) : args.stride_c),
-      params_D(args.ldd ? make_Coord_with_padding<LayoutC::kStrideRank>(args.ldd) : args.stride_d),
-      output_op(args.epilogue),
-      ptr_A(const_cast<void *>(args.ptr_A)),
-      ptr_B(const_cast<void *>(args.ptr_B)),
-      ptr_C(const_cast<void *>(args.ptr_C)),
-      ptr_D(args.ptr_D),
-      batch_stride_A(args.batch_stride_A),
-      batch_stride_B(args.batch_stride_B),
-      batch_stride_C(args.batch_stride_C),
-      ptr_gather_A_indices(const_cast<int *>(args.ptr_gather_A_indices)),
-      ptr_gather_B_indices(const_cast<int *>(args.ptr_gather_B_indices)),
-      ptr_scatter_D_indices(const_cast<int *>(args.ptr_scatter_D_indices))
-    {}
-
-    /// Lightweight update given a subset of arguments.
-    void update(Arguments const &args)
-    {
-      CUTLASS_TRACE_HOST("GemmUniversal::Params::update()");
-
-      // Update input/output pointers
-      ptr_A = const_cast<void *>(args.ptr_A);
-      ptr_B = const_cast<void *>(args.ptr_B);
-      ptr_C = const_cast<void *>(args.ptr_C);
-      ptr_D = args.ptr_D;
-
-      batch_stride_A = args.batch_stride_A;
-      batch_stride_B = args.batch_stride_B;
-      batch_stride_C = args.batch_stride_C;
-      this->batch_stride_D = args.batch_stride_D;
-
-      ptr_gather_A_indices = const_cast<int *>(args.ptr_gather_A_indices);
-      ptr_gather_B_indices = const_cast<int *>(args.ptr_gather_B_indices);
-      ptr_scatter_D_indices = const_cast<int *>(args.ptr_scatter_D_indices);
-
-      output_op = args.epilogue;
-    }
-
-  };
-
-  /// Shared memory storage structure
-  union SharedStorage {
-    typename Mma::SharedStorage main_loop;
-    typename Epilogue::SharedStorage epilogue;
-  };
-
-
-public:
-
-  //
-  // Host dispatch API
-  //
-
-  /// Determines whether kernel satisfies alignment
-  static Status can_implement(
-    cutlass::gemm::GemmCoord const & problem_size)
-  {
-    CUTLASS_TRACE_HOST("GemmUniversal::can_implement()");
-
-    static int const kAlignmentA = (cute::is_same<LayoutA,
-                                                      layout::ColumnMajorInterleaved<32>>::value)
-                                   ? 32
-                                   : (cute::is_same<LayoutA,
-                                                        layout::ColumnMajorInterleaved<64>>::value)
-                                     ? 64
-                                     : Mma::IteratorA::AccessType::kElements;
-    static int const kAlignmentB = (cute::is_same<LayoutB,
-                                                      layout::RowMajorInterleaved<32>>::value)
-                                   ? 32
-                                   : (cute::is_same<LayoutB,
-                                                        layout::RowMajorInterleaved<64>>::value)
-                                     ? 64
-                                     : Mma::IteratorB::AccessType::kElements;
-    static int const kAlignmentC = (cute::is_same<LayoutC,
-                                                      layout::ColumnMajorInterleaved<32>>::value)
-                                   ? 32
-                                   : (cute::is_same<LayoutC,
-                                                        layout::ColumnMajorInterleaved<64>>::value)
-                                     ? 64
-                                     : Epilogue::OutputTileIterator::kElementsPerAccess;
-
-    bool isAMisaligned = false;
-    bool isBMisaligned = false;
-    bool isCMisaligned = false;
-
-    if (cute::is_same<LayoutA, layout::RowMajor>::value) {
-      isAMisaligned = problem_size.k() % kAlignmentA;
-    } else if (cute::is_same<LayoutA, layout::ColumnMajor>::value) {
-      isAMisaligned = problem_size.m() % kAlignmentA;
-    } else if (cute::is_same<LayoutA, layout::ColumnMajorInterleaved<32>>::value
-            || cute::is_same<LayoutA, layout::ColumnMajorInterleaved<64>>::value) {
-      isAMisaligned = problem_size.k() % kAlignmentA;
-    }
-
-    if (cute::is_same<LayoutB, layout::RowMajor>::value) {
-      isBMisaligned = problem_size.n() % kAlignmentB;
-    } else if (cute::is_same<LayoutB, layout::ColumnMajor>::value) {
-      isBMisaligned = problem_size.k() % kAlignmentB;
-    } else if (cute::is_same<LayoutB, layout::RowMajorInterleaved<32>>::value
-            || cute::is_same<LayoutB, layout::RowMajorInterleaved<64>>::value) {
-      isBMisaligned = problem_size.k() % kAlignmentB;
-    }
-
-    if (cute::is_same<LayoutC, layout::RowMajor>::value) {
-      isCMisaligned = problem_size.n() % kAlignmentC;
-    } else if (cute::is_same<LayoutC, layout::ColumnMajor>::value) {
-      isCMisaligned = problem_size.m() % kAlignmentC;
-    } else if (cute::is_same<LayoutC, layout::ColumnMajorInterleaved<32>>::value
-            || cute::is_same<LayoutC, layout::ColumnMajorInterleaved<64>>::value) {
-      isCMisaligned = problem_size.n() % kAlignmentC;
-    }
-
-    if (isAMisaligned) {
-      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for A operand");
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (isBMisaligned) {
-      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for B operand");
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (isCMisaligned) {
-      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for C operand");
-      return Status::kErrorMisalignedOperand;
-    }
-
-    CUTLASS_TRACE_HOST("  returning kSuccess");
-
-    return Status::kSuccess;
-  }
-
-  static Status can_implement(Arguments const &args) {
-    return can_implement(args.problem_size);
-  }
-
-
-public:
-
-  //
-  // Device-only API
-  //
-
-  // Factory invocation
-  CUTLASS_DEVICE
-  static void invoke(
-    Params const &params,
-    SharedStorage &shared_storage)
-  {
-    GemmUniversal op;
-    op(params, shared_storage);
-  }
-
-
-  /// Executes one GEMM
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-    ThreadblockSwizzle threadblock_swizzle;
-    run_with_swizzle(params, shared_storage, threadblock_swizzle);
-  }
-
-  /// Executes one GEMM with an externally-provided swizzling function
-  CUTLASS_DEVICE
-  void run_with_swizzle(Params const &params, SharedStorage &shared_storage, ThreadblockSwizzle& threadblock_swizzle) {
-
-    cutlass::gemm::GemmCoord threadblock_tile_offset =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    // Early exit if CTA is out of range
-    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
-      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
-
-      return;
-    }
-
-    int offset_k = 0;
-    int problem_size_k = params.problem_size.k();
-
-    ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A);
-    ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);
-
-    //
-    // Fetch pointers based on mode.
-    //
-    if (params.mode == GemmUniversalMode::kGemm ||
-      params.mode == GemmUniversalMode::kGemmSplitKParallel) {
-
-      if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {
-
-        problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size;
-      }
-
-      offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
-    }
-    else if (params.mode == GemmUniversalMode::kBatched) {
-      ptr_A += threadblock_tile_offset.k() * params.batch_stride_A;
-      ptr_B += threadblock_tile_offset.k() * params.batch_stride_B;
-    }
-    else if (params.mode == GemmUniversalMode::kArray) {
-      ptr_A = static_cast<ElementA * const *>(params.ptr_A)[threadblock_tile_offset.k()];
-      ptr_B = static_cast<ElementB * const *>(params.ptr_B)[threadblock_tile_offset.k()];
-    }
-
-    __syncthreads();
-
-    // Compute initial location in logical coordinates
-    cutlass::MatrixCoord tb_offset_A{
-      threadblock_tile_offset.m() * Mma::Shape::kM,
-      offset_k,
-    };
-
-    cutlass::MatrixCoord tb_offset_B{
-      offset_k,
-      threadblock_tile_offset.n() * Mma::Shape::kN
-    };
-
-    // Compute position within threadblock
-    int thread_idx = threadIdx.x;
-
-    // Construct iterators to A and B operands
-    typename Mma::IteratorA iterator_A(
-      params.params_A,
-      ptr_A,
-      {params.problem_size.m(), problem_size_k},
-      thread_idx,
-      tb_offset_A,
-      params.ptr_gather_A_indices);
-
-    typename Mma::IteratorB iterator_B(
-      params.params_B,
-      ptr_B,
-      {problem_size_k, params.problem_size.n()},
-      thread_idx,
-      tb_offset_B,
-      params.ptr_gather_B_indices);
-
-    // Broadcast the warp_id computed by lane 0 to ensure dependent code
-    // is compiled as warp-uniform.
-    int warp_idx = canonical_warp_idx_sync();
-
-    int lane_idx = threadIdx.x % 32;
-
-    //
-    // Main loop
-    //
-
-    // Construct thread-scoped matrix multiply
-    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
-
-    typename Mma::FragmentC accumulators;
-
-    accumulators.clear();
-
-    // Compute threadblock-scoped matrix multiply-add
-    int gemm_k_iterations = (problem_size_k - offset_k + Mma::Shape::kK - 1) / Mma::Shape::kK;
-
-    // Compute threadblock-scoped matrix multiply-add
-    mma(
-      gemm_k_iterations,
-      accumulators,
-      iterator_A,
-      iterator_B,
-      accumulators);
-
-    //
-    // Epilogue
-    //
-
-    EpilogueOutputOp output_op(params.output_op);
-
-    //
-    // Masked tile iterators constructed from members
-    //
-
-    threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    //assume identity swizzle
-    MatrixCoord threadblock_offset(
-      threadblock_tile_offset.m() * Mma::Shape::kM,
-      threadblock_tile_offset.n() * Mma::Shape::kN
-    );
-
-    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
-
-    ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C);
-    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
-
-    //
-    // Fetch pointers based on mode.
-    //
-
-    // Construct the semaphore.
-    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
-
-    if (params.mode == GemmUniversalMode::kGemm) {
-
-      // If performing a reduction via split-K, fetch the initial synchronization
-      if (params.grid_tiled_shape.k() > 1) {
-
-        // Fetch the synchronization lock initially but do not block.
-        semaphore.fetch();
-
-        // Indicate which position in a serial reduction the output operator is currently updating
-        output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
-      }
-    }
-    else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) {
-      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
-    }
-    else if (params.mode == GemmUniversalMode::kBatched) {
-      ptr_C += threadblock_tile_offset.k() * params.batch_stride_C;
-      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
-    }
-    else if (params.mode == GemmUniversalMode::kArray) {
-      ptr_C = static_cast<ElementC * const *>(params.ptr_C)[threadblock_tile_offset.k()];
-      ptr_D = static_cast<ElementC * const *>(params.ptr_D)[threadblock_tile_offset.k()];
-    }
-
-    // Tile iterator loading from source tensor.
-    typename Epilogue::OutputTileIterator iterator_C(
-      params.params_C,
-      ptr_C,
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset,
-      params.ptr_scatter_D_indices
-    );
-
-    // Tile iterator writing to destination tensor.
-    typename Epilogue::OutputTileIterator iterator_D(
-      params.params_D,
-      ptr_D,
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset,
-      params.ptr_scatter_D_indices
-    );
-
-    Epilogue epilogue(
-      shared_storage.epilogue,
-      thread_idx,
-      warp_idx,
-      lane_idx);
-
-    // Wait on the semaphore - this latency may have been covered by iterator construction
-    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
-
-      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
-      if (threadblock_tile_offset.k()) {
-        iterator_C = iterator_D;
-      }
-
-      semaphore.wait(threadblock_tile_offset.k());
-    }
-
-
-    // Execute the epilogue operator to update the destination tensor.
-    epilogue(
-      output_op,
-      iterator_D,
-      accumulators,
-      iterator_C);
-
-    //
-    // Release the semaphore
-    //
-
-    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
-
-      int lock = 0;
-      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
-
-        // The final threadblock resets the semaphore for subsequent grids.
-        lock = 0;
-      }
-      else {
-        // Otherwise, the semaphore is incremented
-        lock = threadblock_tile_offset.k() + 1;
-      }
-
-      semaphore.release(lock);
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal.hpp
deleted file mode 100755
index 6c7b89a24..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal.hpp
+++ /dev/null
@@ -1,66 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/gemm/kernel/gemm_universal_decl.h"
-#include "cutlass/gemm/kernel/tile_scheduler.hpp"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm::kernel {
-
-// In cases where ProblemShape is not a tuple, this is used to check if the
-// underlying problem shape type is aliased within or not.
-// Used for dispatching GemmUniversal to 2.x API or 3.x API
-template <class ProblemShape, class = void>
-struct IsCutlass3ArrayKernel : cute::false_type { };
-
-template <typename ProblemShape>
-struct IsCutlass3ArrayKernel<ProblemShape, cute::void_t<typename ProblemShape::UnderlyingProblemShape>>
-    : cute::true_type { };
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::gemm::kernel
-
-////////////////////////////////////////////////////////////////////////////////
-
-#include "cutlass/gemm/kernel/sm70_gemm.hpp"
-#include "cutlass/gemm/kernel/sm90_gemm_tma.hpp"
-#include "cutlass/gemm/kernel/sm90_gemm_warpspecialized.hpp"
-#include "cutlass/gemm/kernel/sm90_gemm_warpspecialized_pingpong.hpp"
-#include "cutlass/gemm/kernel/sm90_gemm_warpspecialized_cooperative.hpp"
-#include "cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp"
-#include "cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp"
-#include "cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp"
-#include "cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_pingpong.hpp"
-#include "cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_cooperative.hpp"
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal_decl.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal_decl.h
deleted file mode 100755
index 73426db5b..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal_decl.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-namespace cutlass::gemm::kernel {
-
-
-/*
- * Stateless universal device GEMM kernel type that treats GEMM as
- * a composition of a collective mainloop and a collective epilogue.
- *
- * Supports both the 2.x and 3.x APIs based on whether the first type is
- * a cute::tuple<> or not.
- * 2.x API implementation: cutlass/gemm/kernel/gemm_universal.h
- * 3.x API implementation: cutlass/gemm/kernel/gemm_*.hpp
- *
- * In the following declaration, the name preceding the 'Or' refers to
- * 3.x API type argument order, and the name succeeding the 'Or' refers to
- * 2.x API type argument order. Template arguments without two names
- * belong to the 3.x API only.
-**/
-template <
-  class ProblemShapeOrThreadblockMma_, // (m, n, k) or (m, n, k, l)
-  class CollectiveMainloopOrEpilogue_,
-  class CollectiveEpilogueOrThreadblockSwizzle_,
-  class TileScheduler_ = void,
-  class Enable = void
->
-class GemmUniversal;
-
-
-} // namespace cutlass::gemm::kernel
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal_streamk.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal_streamk.h
deleted file mode 100755
index 39a9bfb58..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal_streamk.h
+++ /dev/null
@@ -1,1168 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/complex.h"
-#include "cutlass/barrier.h"
-#include "cutlass/block_striped.h"
-
-#include "cutlass/trace.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate
-  typename Epilogue_,             ///! Epilogue
-  typename ThreadblockSwizzle_    ///! Threadblock mapping function
->
-struct GemmUniversalStreamk {
-public:
-
-
-  //
-  // Types and constants
-  //
-
-  using Mma = Mma_;
-  using Epilogue = Epilogue_;
-  using EpilogueOutputOp = typename Epilogue::OutputOp;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-
-  using ElementA = typename Mma::IteratorA::Element;
-  using LayoutA = typename Mma::IteratorA::Layout;
-  using ElementB = typename Mma::IteratorB::Element;
-  using LayoutB = typename Mma::IteratorB::Layout;
-  using ElementC = typename Epilogue::OutputTileIterator::Element;
-  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
-
-  /// The per-thread tile of raw accumulators
-  using AccumulatorTile = typename Mma::FragmentC;
-
-  static ComplexTransform const kTransformA = Mma::kTransformA;
-  static ComplexTransform const kTransformB = Mma::kTransformB;
-  using Operator = typename Mma::Operator;
-
-  using OperatorClass = typename Mma::Operator::OperatorClass;
-  using ThreadblockShape = typename Mma::Shape;
-  using WarpShape = typename Mma::Operator::Shape;
-  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
-  using ArchTag = typename Mma::ArchTag;
-
-  static int const kStages = Mma::kStages;
-  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
-  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
-  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
-
-  /// Warp count (concept: GemmShape)
-  using WarpCount = typename Mma::WarpCount;
-  static int const kThreadCount = 32 * WarpCount::kCount;
-
-  /// Workspace bytes per thread block
-  static size_t const kWorkspaceBytesPerBlock =
-    __NV_STD_MAX(
-      kThreadCount * sizeof(AccumulatorTile),
-      Epilogue::kWorkspaceBytesPerBlock);
-
-  /// Block-striped reduction utility
-  using BlockStripedReduceT = BlockStripedReduce<kThreadCount, AccumulatorTile>;
-
-
-
-  //
-  // Structures
-  //
-
-  /// Argument structure
-  struct Arguments {
-
-    //
-    // Data members
-    //
-
-    GemmUniversalMode mode = GemmUniversalMode::kGemm;
-    GemmCoord problem_size {};
-    int batch_count {1};        // Either (mode == GemmUniversalMode::kBatched) the batch count, or (mode == GemmUniversalMode::kGemm) the tile-splitting factor
-
-    typename EpilogueOutputOp::Params epilogue{};
-
-    void const * ptr_A = nullptr;
-    void const * ptr_B = nullptr;
-    void const * ptr_C = nullptr;
-    void * ptr_D = nullptr;
-
-    int64_t batch_stride_A{0};
-    int64_t batch_stride_B{0};
-    int64_t batch_stride_C{0};
-    int64_t batch_stride_D{0};
-
-    typename LayoutA::Stride stride_a{0};
-    typename LayoutB::Stride stride_b{0};
-    typename LayoutC::Stride stride_c{0};
-    typename LayoutC::Stride stride_d{0};
-
-    typename LayoutA::Stride::LongIndex lda{0};
-    typename LayoutB::Stride::LongIndex ldb{0};
-    typename LayoutC::Stride::LongIndex ldc{0};
-    typename LayoutC::Stride::LongIndex ldd{0};
-
-    int avail_sms{-1};          /// The number of SMs that StreamK dispatch heuristics will attempt to load-balance across (-1 defaults to device width, 1 implies classic data-parallel scheduling)
-
-
-    //
-    // Methods
-    //
-
-    /// Default Constructor
-    Arguments() = default;
-
-    /// Constructor
-    Arguments(
-      GemmUniversalMode mode,
-      GemmCoord problem_size,
-      int batch_split,                              /// Either (mode == GemmUniversalMode::kBatched) the batch count, or (mode == GemmUniversalMode::kGemm) the tile-splitting factor (1 defaults to StreamK, >1 emulates Split-K)
-      typename EpilogueOutputOp::Params epilogue,
-      void const * ptr_A,
-      void const * ptr_B,
-      void const * ptr_C,
-      void * ptr_D,
-      int64_t batch_stride_A,
-      int64_t batch_stride_B,
-      int64_t batch_stride_C,
-      int64_t batch_stride_D,
-      typename LayoutA::Stride stride_a,
-      typename LayoutB::Stride stride_b,
-      typename LayoutC::Stride stride_c,
-      typename LayoutC::Stride stride_d,
-      int avail_sms = -1                            /// The number of SMs that StreamK dispatch heuristics will attempt to load-balance across (-1 defaults to device width, 1 implies classic data-parallel scheduling)
-    ):
-      mode(mode),
-      problem_size(problem_size),
-      batch_count(batch_split),
-      epilogue(epilogue),
-      ptr_A(ptr_A), ptr_B(ptr_B), ptr_C(ptr_C), ptr_D(ptr_D),
-      batch_stride_A(batch_stride_A), batch_stride_B(batch_stride_B), batch_stride_C(batch_stride_C), batch_stride_D(batch_stride_D),
-      stride_a(stride_a), stride_b(stride_b), stride_c(stride_c), stride_d(stride_d), avail_sms(avail_sms)
-    {
-      CUTLASS_TRACE_HOST("GemmUniversalStreamk::Arguments::Arguments() - problem_size: " << problem_size);
-    }
-
-    /// Constructor
-    Arguments(
-      GemmUniversalMode mode,
-      GemmCoord problem_size,
-      int batch_split,                              /// Either (mode == GemmUniversalMode::kBatched) the batch count, or (mode == GemmUniversalMode::kGemm) the tile-splitting factor (1 defaults to StreamK, >1 emulates Split-K)
-      typename EpilogueOutputOp::Params epilogue,
-      void const * ptr_A,
-      void const * ptr_B,
-      void const * ptr_C,
-      void * ptr_D,
-      int64_t batch_stride_A,
-      int64_t batch_stride_B,
-      int64_t batch_stride_C,
-      int64_t batch_stride_D,
-      typename LayoutA::Stride::LongIndex lda,
-      typename LayoutB::Stride::LongIndex ldb,
-      typename LayoutC::Stride::LongIndex ldc,
-      typename LayoutC::Stride::LongIndex ldd,
-      int avail_sms = -1                            /// The number of SMs that StreamK dispatch heuristics will attempt to load-balance across (-1 defaults to device width, 1 implies classic data-parallel scheduling)
-    ):
-      mode(mode),
-      problem_size(problem_size),
-      batch_count(batch_split),
-      epilogue(epilogue),
-      ptr_A(ptr_A), ptr_B(ptr_B), ptr_C(ptr_C), ptr_D(ptr_D),
-      batch_stride_A(batch_stride_A), batch_stride_B(batch_stride_B), batch_stride_C(batch_stride_C), batch_stride_D(batch_stride_D),
-      lda(lda), ldb(ldb), ldc(ldc), ldd(ldd), avail_sms(avail_sms)
-    {
-      stride_a = make_Coord(lda);
-      stride_b = make_Coord(ldb);
-      stride_c = make_Coord(ldc);
-      stride_d = make_Coord(ldd);
-      CUTLASS_TRACE_HOST("GemmUniversalStreamk::Arguments::Arguments() - problem_size: " << problem_size);
-    }
-
-    /// Returns arguments for the transposed problem
-    Arguments transposed_problem() const
-    {
-      Arguments args(*this);
-
-      std::swap(args.problem_size.m(), args.problem_size.n());
-      std::swap(args.ptr_A, args.ptr_B);
-      std::swap(args.lda, args.ldb);
-      std::swap(args.stride_a, args.stride_b);
-      std::swap(args.batch_stride_A, args.batch_stride_B);
-
-      return args;
-    }
-  };
-
-
-  /// Parameters structure
-  struct Params
-  {
-  public:
-
-    //
-    // Data members
-    //
-
-    void * ptr_A = nullptr;
-    void * ptr_B = nullptr;
-
-    typename Mma::IteratorA::Params params_A{};
-    typename Mma::IteratorB::Params params_B{};
-
-    int64_t batch_stride_A{0};
-    int64_t batch_stride_B{0};
-
-    GemmUniversalMode mode = GemmUniversalMode::kGemm;
-
-    ThreadblockSwizzle block_mapping{};
-
-    void *barrier_workspace = nullptr;
-    void *partials_workspace = nullptr;
-
-    typename EpilogueOutputOp::Params output_op{};
-
-    void * ptr_D = nullptr;
-    void * ptr_C = nullptr;
-
-    typename Epilogue::OutputTileIterator::Params params_D{};
-    typename Epilogue::OutputTileIterator::Params params_C{};
-
-    int64_t batch_stride_D{0};
-    int64_t batch_stride_C{0};
-
-
-  protected:
-
-    //
-    // Host-only dispatch-utilities
-    //
-
-    /// Pad the given allocation size up to the nearest cache line
-    static size_t cacheline_align_up(size_t size)
-    {
-      static const int CACHELINE_SIZE = 128;
-      return (size + CACHELINE_SIZE - 1) / CACHELINE_SIZE * CACHELINE_SIZE;
-    }
-
-    /// Get the workspace size needed for barrier
-    size_t get_barrier_workspace_size() const
-    {
-      // For atomic reduction, each SK-block needs a synchronization flag.  For parallel reduction,
-      // each reduction block needs its own synchronization flag.
-      int sk_blocks = block_mapping.sk_regions() * block_mapping.sk_blocks_per_region();
-      int num_flags = fast_max(sk_blocks, block_mapping.reduction_blocks);
-
-      return cacheline_align_up(sizeof(typename Barrier::T) * num_flags);
-    }
-
-    /// Get the workspace size needed for intermediate partial sums
-    size_t get_partials_workspace_size() const
-    {
-      int sk_blocks = block_mapping.sk_regions() * block_mapping.sk_blocks_per_region();
-      return cacheline_align_up(kWorkspaceBytesPerBlock * sk_blocks);
-    }
-
-
-  public:
-
-    //
-    // Host dispatch API
-    //
-
-    /// Default constructor
-    Params() = default;
-
-    /// Constructor
-    Params(
-      Arguments const &args,  /// GEMM application arguments
-      int device_sms,         /// Number of SMs on the device
-      int sm_occupancy)       /// Kernel SM occupancy (in thread blocks)
-    :
-      params_A(args.lda ? make_Coord_with_padding<LayoutA::kStrideRank>(args.lda) : args.stride_a),
-      params_B(args.ldb ? make_Coord_with_padding<LayoutB::kStrideRank>(args.ldb) : args.stride_b),
-      params_C(args.ldc ? make_Coord_with_padding<LayoutC::kStrideRank>(args.ldc) : args.stride_c),
-      params_D(args.ldd ? make_Coord_with_padding<LayoutC::kStrideRank>(args.ldd) : args.stride_d),
-      output_op(args.epilogue),
-      mode(args.mode),
-      ptr_A(const_cast<void *>(args.ptr_A)),
-      ptr_B(const_cast<void *>(args.ptr_B)),
-      ptr_C(const_cast<void *>(args.ptr_C)),
-      ptr_D(args.ptr_D),
-      batch_stride_A(args.batch_stride_A),
-      batch_stride_B(args.batch_stride_B),
-      batch_stride_C(args.batch_stride_C),
-      batch_stride_D(args.batch_stride_D),
-      barrier_workspace(nullptr),
-      partials_workspace(nullptr)
-    {
-      // Number of SMs to make available for StreamK decomposition
-      int avail_sms = (args.avail_sms == -1) ?
-                        device_sms :
-                        fast_min(args.avail_sms, device_sms);
-
-      // Initialize the block mapping structure
-      block_mapping = ThreadblockSwizzle(
-        args.mode,
-        args.problem_size,
-        {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-        args.batch_count,
-        sm_occupancy,
-        device_sms,
-        avail_sms,
-        sizeof(ElementA),
-        sizeof(ElementB),
-        sizeof(ElementC),
-        Epilogue::kAccumulatorFragments);
-    }
-
-
-    /// Returns the workspace size (in bytes) needed for these parameters
-    size_t get_workspace_size() const
-    {
-      return
-        get_barrier_workspace_size() +
-        get_partials_workspace_size();
-    }
-
-
-    /// Assign and initialize the specified workspace buffer.  Assumes
-    /// the memory allocated to workspace is at least as large as get_workspace_size().
-    Status init_workspace(
-      void *workspace,
-      cudaStream_t stream = nullptr)
-    {
-      uint8_t *ptr = static_cast<uint8_t*>(workspace);
-
-      // Establish partials workspace
-      partials_workspace = nullptr;
-      size_t partials_workspace_bytes = get_partials_workspace_size();
-      if (partials_workspace_bytes > 0)
-      {
-        if (!workspace) {
-          return Status::kErrorWorkspaceNull;
-        }
-        partials_workspace = ptr;
-        ptr += partials_workspace_bytes;
-      }
-
-      // Establish barrier workspace
-      barrier_workspace = nullptr;
-      size_t barrier_workspace_bytes = get_barrier_workspace_size();
-      if (barrier_workspace_bytes > 0)
-      {
-        if (!workspace) {
-          return Status::kErrorWorkspaceNull;
-        }
-        barrier_workspace = ptr;
-        ptr += barrier_workspace_bytes;
-      }
-
-      // Zero-initialize barrier workspace
-      if (barrier_workspace)
-      {
-        size_t barrier_workspace_bytes = get_barrier_workspace_size();
-
-        CUTLASS_TRACE_HOST("  Initialize " << barrier_workspace_bytes << " barrier bytes");
-
-        cudaError_t result = cudaMemsetAsync(
-          barrier_workspace,
-          0,
-          barrier_workspace_bytes,
-          stream);
-
-        if (result != cudaSuccess) {
-          CUTLASS_TRACE_HOST("  cudaMemsetAsync() returned error " << cudaGetErrorString(result));
-          return Status::kErrorInternal;
-        }
-      }
-
-      return Status::kSuccess;
-    }
-
-
-    /// Returns the GEMM volume in thread block tiles
-    cutlass::gemm::GemmCoord get_tiled_shape() const
-    {
-      return block_mapping.tiled_shape();
-    }
-
-
-    /// Returns the total number of thread blocks to launch
-    int get_grid_blocks() const
-    {
-      dim3 grid_dims = get_grid_dims();
-      return grid_dims.x * grid_dims.y * grid_dims.z;
-    }
-
-
-    /// Returns the grid extents in thread blocks to launch
-    dim3 get_grid_dims() const
-    {
-      return block_mapping.get_grid_dims();
-    }
-
-
-    /// Lightweight update given a subset of arguments.
-    void update(Arguments const &args)
-    {
-      CUTLASS_TRACE_HOST("GemmUniversalStreamK::Params::update()");
-
-      // Update input/output pointers
-      ptr_A = const_cast<void *>(args.ptr_A);
-      ptr_B = const_cast<void *>(args.ptr_B);
-      ptr_C = const_cast<void *>(args.ptr_C);
-      ptr_D = args.ptr_D;
-
-      batch_stride_A = args.batch_stride_A;
-      batch_stride_B = args.batch_stride_B;
-      batch_stride_C = args.batch_stride_C;
-      batch_stride_D = args.batch_stride_D;
-
-      output_op = args.epilogue;
-    }
-
-  };
-
-  /// Tile work descriptor
-  struct TileWorkDesc
-  {
-    /// The linear tile index
-    int tile_idx;
-
-    /// The location of this tile (in threadblock-tile coordinates) in the output matrix
-    cutlass::gemm::GemmCoord tiled_coord;
-
-    // The first global-scoped MAC-iteration this threadblock will perform for this tile
-    int iter_begin;
-
-    // The starting index in the k-domain for MAC-iterations this threadblock will perform for this tile
-    int k_begin;
-
-    // The ending index (one-past) in the k-domain for MAC-iterations this threadblock will perform for this tile
-    int k_end;
-
-    /// The number of remaining MAC-iterations this threadblock will perform for this tile
-    int k_iters_remaining;
-
-    // Whether this block will perform the first iteration of this tile
-    CUTLASS_DEVICE
-    bool tile_started()
-    {
-      return (k_begin == 0);
-    }
-
-    // Whether this block will perform the last iteration of this tile
-    CUTLASS_DEVICE
-    bool tile_finished(Params const &params)
-    {
-      return (k_end == params.block_mapping.problem_size.k());
-    }
-  };
-
-
-  /// Shared memory storage structure
-  union SharedStorage
-  {
-    typename Mma::SharedStorage main_loop;
-    typename Epilogue::SharedStorage epilogue;
-  };
-
-
-protected:
-
-  //
-  // Data members
-  //
-
-  /// GEMM problem parameters
-  Params params;
-
-  /// Shared storage reference
-  SharedStorage &shared_storage;
-
-  /// ID within the threadblock
-  int thread_idx;
-
-  /// ID of warp
-  int warp_idx;
-
-  /// ID of each thread within a warp
-  int lane_idx;
-
-  /// Threadblock scoped epilogue
-  Epilogue epilogue;
-
-
-public:
-
-  //
-  // Host-only dispatch API
-  //
-
-  /// Determines whether the GEMM problem size satisfies this kernel's
-  /// alignment requirements
-  static Status can_implement(
-    cutlass::gemm::GemmCoord const & problem_size)
-  {
-    CUTLASS_TRACE_HOST("GemmUniversalStreamk::can_implement()");
-
-    static int const kAlignmentA = (platform::is_same<LayoutA,
-                                                      layout::ColumnMajorInterleaved<32>>::value)
-                                   ? 32
-                                   : (platform::is_same<LayoutA,
-                                                        layout::ColumnMajorInterleaved<64>>::value)
-                                     ? 64
-                                     : Mma::IteratorA::AccessType::kElements;
-    static int const kAlignmentB = (platform::is_same<LayoutB,
-                                                      layout::RowMajorInterleaved<32>>::value)
-                                   ? 32
-                                   : (platform::is_same<LayoutB,
-                                                        layout::RowMajorInterleaved<64>>::value)
-                                     ? 64
-                                     : Mma::IteratorB::AccessType::kElements;
-    static int const kAlignmentC = (platform::is_same<LayoutC,
-                                                      layout::ColumnMajorInterleaved<32>>::value)
-                                   ? 32
-                                   : (platform::is_same<LayoutC,
-                                                        layout::ColumnMajorInterleaved<64>>::value)
-                                     ? 64
-                                     : Epilogue::OutputTileIterator::kElementsPerAccess;
-
-    bool isAMisaligned = false;
-    bool isBMisaligned = false;
-    bool isCMisaligned = false;
-
-    if (platform::is_same<LayoutA, layout::RowMajor>::value) {
-      isAMisaligned = problem_size.k() % kAlignmentA;
-    } else if (platform::is_same<LayoutA, layout::ColumnMajor>::value) {
-      isAMisaligned = problem_size.m() % kAlignmentA;
-    } else if (platform::is_same<LayoutA, layout::ColumnMajorInterleaved<32>>::value
-            || platform::is_same<LayoutA, layout::ColumnMajorInterleaved<64>>::value) {
-      isAMisaligned = problem_size.k() % kAlignmentA;
-    }
-
-    if (platform::is_same<LayoutB, layout::RowMajor>::value) {
-      isBMisaligned = problem_size.n() % kAlignmentB;
-    } else if (platform::is_same<LayoutB, layout::ColumnMajor>::value) {
-      isBMisaligned = problem_size.k() % kAlignmentB;
-    } else if (platform::is_same<LayoutB, layout::RowMajorInterleaved<32>>::value
-            || platform::is_same<LayoutB, layout::RowMajorInterleaved<64>>::value) {
-      isBMisaligned = problem_size.k() % kAlignmentB;
-    }
-
-    if (platform::is_same<LayoutC, layout::RowMajor>::value) {
-      isCMisaligned = problem_size.n() % kAlignmentC;
-    } else if (platform::is_same<LayoutC, layout::ColumnMajor>::value) {
-      isCMisaligned = problem_size.m() % kAlignmentC;
-    } else if (platform::is_same<LayoutC, layout::ColumnMajorInterleaved<32>>::value
-            || platform::is_same<LayoutC, layout::ColumnMajorInterleaved<64>>::value) {
-      isCMisaligned = problem_size.n() % kAlignmentC;
-    }
-
-    if (isAMisaligned) {
-      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for A operand");
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (isBMisaligned) {
-      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for B operand");
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (isCMisaligned) {
-      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for C operand");
-      return Status::kErrorMisalignedOperand;
-    }
-
-    CUTLASS_TRACE_HOST("  returning kSuccess");
-
-    return Status::kSuccess;
-  }
-
-  /// Determines whether the GEMM problem satisfies this kernel's
-  /// alignment requirements
-  static Status can_implement(Arguments const &args) {
-    return can_implement(args.problem_size);
-  }
-
-protected:
-
-  //
-  // Device-only utility methods
-  //
-
-  /// Iterator for fetching tile fragments from A
-  CUTLASS_DEVICE
-  typename Mma::IteratorA init_iterator_A(
-    TileWorkDesc &tile_work,
-    GemmUniversalMode mode)
-  {
-    // The input A matrix
-    ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A);
-
-    // Update input pointers based on batched/array mode
-    if (mode == GemmUniversalMode::kBatched) {
-      ptr_A += tile_work.tiled_coord.k() * params.batch_stride_A;
-    }
-    if (mode == GemmUniversalMode::kArray) {
-      ptr_A = static_cast<ElementA * const *>(params.ptr_A)[tile_work.tiled_coord.k()];
-    }
-
-    int m_begin = tile_work.tiled_coord.m() * Mma::Shape::kM;
-    int m_end = params.block_mapping.problem_size.m();
-    return typename Mma::IteratorA(
-        params.params_A,
-        ptr_A,
-        { m_end, tile_work.k_end },
-        threadIdx.x,
-        { m_begin, tile_work.k_begin });
-
-  }
-
-
-  /// Iterator for fetching tile fragments from B
-  CUTLASS_DEVICE
-  typename Mma::IteratorB init_iterator_B(
-    TileWorkDesc &tile_work,
-    GemmUniversalMode mode)
-  {
-    // The input B matrix
-    ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);
-
-    // Update input pointers based on batched/array mode
-    if (mode == GemmUniversalMode::kBatched) {
-      ptr_B += tile_work.tiled_coord.k() * params.batch_stride_B;
-    }
-    if (mode == GemmUniversalMode::kArray) {
-      ptr_B = static_cast<ElementB * const *>(params.ptr_B)[tile_work.tiled_coord.k()];
-    }
-
-    int n_begin = tile_work.tiled_coord.n() * Mma::Shape::kN;
-    int n_end = params.block_mapping.problem_size.n();
-    return typename Mma::IteratorB(
-        params.params_B,
-        ptr_B,
-        { tile_work.k_end, n_end },
-        threadIdx.x,
-        { tile_work.k_begin, n_begin });
-  }
-
-
-  CUTLASS_DEVICE
-  void init_dp_tile_work(
-      TileWorkDesc &tile_work,
-      int tile_idx)
-  {
-    // The linear tile index
-    tile_work.tile_idx = tile_idx;
-
-    // The first global-scoped MAC-iteration this threadblock will perform for this tile
-    tile_work.iter_begin = tile_idx * params.block_mapping.iters_per_tile();
-
-    // The number of MAC-iterations this threadblock will perform for this tile
-    tile_work.k_iters_remaining = params.block_mapping.iters_per_tile();
-
-    // The starting index in the k-domain for MAC-iterations this threadblock will perform for this tile
-    tile_work.k_begin = 0;
-
-    // The ending index (one-past) in the k-domain for MAC-iterations this threadblock will perform for this tile
-    tile_work.k_end = params.block_mapping.problem_size.k();
-
-    // The location of this tile (in threadblock-tile coordinates) in the output matrix
-    tile_work.tiled_coord = params.block_mapping.get_tile_offset(tile_work.tile_idx);
-  }
-
-
-  CUTLASS_DEVICE
-  void init_sk_tile_work(
-      TileWorkDesc &tile_work,
-      int tile_idx,
-      int block_iter_begin,
-      int block_iter_end)
-  {
-    // The linear tile index
-    tile_work.tile_idx = tile_idx;
-
-    // The first global-scoped MAC-iteration for this tile
-    int tile_iter_begin = tile_idx * params.block_mapping.iters_per_tile();
-
-    // The first global-scoped MAC-iteration this threadblock will perform for this tile
-    tile_work.iter_begin = max(block_iter_begin, tile_iter_begin);
-
-    // The first tile-scoped MAC-iteration this threadblock will perform for this tile
-    int k_iter_begin = tile_work.iter_begin - tile_iter_begin;
-
-    // The last (one past) tile-scoped MAC-iteration this threadblock will perform for this tile
-    int k_iter_end = block_iter_end - tile_iter_begin;
-
-    // The number of MAC-iterations this threadblock will perform for this tile
-    tile_work.k_iters_remaining = k_iter_end - k_iter_begin;
-
-    // The starting index in the k-domain for MAC-iterations this threadblock will perform for this tile
-    tile_work.k_begin = k_iter_begin * Mma::Shape::kK;
-
-    // The ending index (one-past) in the k-domain for MAC-iterations this threadblock will perform for this tile
-    tile_work.k_end = min(
-        params.block_mapping.problem_size.k(),            // extent of k domain
-        (k_iter_end * Mma::Shape::kK));                   // extent of the threadblock's global iteration assignment
-
-    // The location of this tile (in threadblock-tile coordinates) in the output matrix
-    tile_work.tiled_coord = params.block_mapping.get_tile_offset(tile_work.tile_idx);
-  }
-
-
-  /// Share accumulators with peers
-  CUTLASS_DEVICE
-  void share_accumulators(
-    AccumulatorTile const &accumulator_tile,
-    int block_idx,
-    int first_block_idx)
-  {
-    AccumulatorTile *accum_tile_workspace = reinterpret_cast<AccumulatorTile *>(params.partials_workspace);
-
-    int accum_tile_offset = first_block_idx * kThreadCount;
-
-    if (block_idx == first_block_idx)
-    {
-      // First peer initializes the workspace partials
-      BlockStripedReduceT::store(accum_tile_workspace + accum_tile_offset, accumulator_tile, thread_idx);
-    }
-    else
-    {
-      // Subsequent peers atomically accumulate into the workspace partials
-      if (ThreadblockSwizzle::kReductionStrategy == ThreadblockSwizzle::kAtomic)
-      {
-        // Non-deterministic reduction order: wait for the first peer to have initialized the partials before we add to them
-        Barrier::wait_lt(params.barrier_workspace, thread_idx, first_block_idx, 1);
-      }
-      else
-      {
-        // Turnstile reduction order: wait until the previous peer has written
-        int wait_count = block_idx - first_block_idx;
-        Barrier::wait_eq(params.barrier_workspace, thread_idx, first_block_idx, wait_count);
-      }
-
-      // Perform reduction in workspace
-      BlockStripedReduceT::reduce(accum_tile_workspace + accum_tile_offset, accumulator_tile, thread_idx);
-    }
-
-    // Signal our arrival
-    Barrier::arrive_inc(params.barrier_workspace, thread_idx, first_block_idx);
-  }
-
-
-  /// Acquire accumulators from peers
-  CUTLASS_DEVICE
-  void acquire_accumulators(
-    AccumulatorTile &accumulator_tile,
-    int block_idx,
-    int first_block_idx)
-  {
-    AccumulatorTile *accum_tile_workspace = reinterpret_cast<AccumulatorTile *>(params.partials_workspace);
-
-    // Wait for arrival
-    int num_carry_in = block_idx - first_block_idx;
-    Barrier::wait_eq_reset(params.barrier_workspace, thread_idx, first_block_idx, num_carry_in);
-
-    // Load and add peer-partials accumulator tile to local accumulator tile
-    int accum_tile_offset = first_block_idx * kThreadCount;
-    BlockStripedReduceT::load_add(accumulator_tile, accum_tile_workspace + accum_tile_offset, thread_idx);
-  }
-
-
-  /// Perform epilogue computations and output
-  CUTLASS_DEVICE
-  void do_epilogue(
-    TileWorkDesc &tile_work,
-    AccumulatorTile &accumulator_tile)
-  {
-    ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C);
-    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
-
-    // Update pointers for batched/array mode(s)
-    if (params.mode == GemmUniversalMode::kBatched) {
-      ptr_C += tile_work.tiled_coord.k() * params.batch_stride_C;
-      ptr_D += tile_work.tiled_coord.k() * params.batch_stride_D;
-    }
-    if (params.mode == GemmUniversalMode::kArray) {
-      ptr_C = static_cast<ElementC * const *>(params.ptr_C)[tile_work.tiled_coord.k()];
-      ptr_D = static_cast<ElementC * const *>(params.ptr_D)[tile_work.tiled_coord.k()];
-    }
-
-    // Location of this tile in item-coords
-    MatrixCoord threadblock_item_begin(
-      tile_work.tiled_coord.m() * Mma::Shape::kM,
-      tile_work.tiled_coord.n() * Mma::Shape::kN
-    );
-
-    // Tile iterator loading from source tensor.
-    typename Epilogue::OutputTileIterator iterator_C(
-        params.params_C,
-        ptr_C,
-        params.block_mapping.problem_size.mn(),
-        thread_idx,
-        threadblock_item_begin);
-
-    // Tile iterator writing to destination tensor.
-    typename Epilogue::OutputTileIterator iterator_D(
-        params.params_D,
-        ptr_D,
-        params.block_mapping.problem_size.mn(),
-        thread_idx,
-        threadblock_item_begin);
-
-    // Execute the epilogue operator to update the destination tensor.
-    epilogue(
-        EpilogueOutputOp(params.output_op),
-        iterator_D,
-        accumulator_tile,
-        iterator_C);
-  }
-
-
-  CUTLASS_DEVICE
-  void separate_reduction(int reduce_idx)
-  {
-    int peer_idx_begin, peer_idx_last, reduce_tile_idx, reduce_fragment_idx;
-
-    // Reduce by sk-tile (every tile contributed to by one or more blocks)
-    reduce_tile_idx = reduce_idx / Epilogue::kAccumulatorFragments;
-    reduce_fragment_idx = reduce_idx % Epilogue::kAccumulatorFragments;
-
-    int iter_tile_first = reduce_tile_idx * params.block_mapping.iters_per_tile();
-    int iter_tile_last = iter_tile_first + params.block_mapping.iters_per_tile() - 1;
-
-    peer_idx_begin = params.block_mapping.get_sk_block_idx(iter_tile_first);
-    peer_idx_last = params.block_mapping.get_sk_block_idx(iter_tile_last);
-
-    // Wait for peers to complete
-    int peer_idx_end = peer_idx_last + 1;
-    int num_peers = peer_idx_end - peer_idx_begin;
-    Barrier::wait_eq_reset(
-        params.barrier_workspace,
-        thread_idx,
-        (reduce_tile_idx * Epilogue::kAccumulatorFragments) + reduce_fragment_idx,
-        num_peers);
-
-    /// The location of this tile (in threadblock-tile coordinates) in the output matrix
-    GemmCoord tiled_coord = params.block_mapping.get_tile_offset(reduce_tile_idx);
-
-    // Location of this tile in item-coords
-    MatrixCoord threadblock_item_begin(
-      tiled_coord.m() * Mma::Shape::kM,
-      tiled_coord.n() * Mma::Shape::kN
-    );
-
-    ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C);
-    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
-
-    // Tile iterator loading from source tensor.
-    typename Epilogue::OutputTileIterator iterator_C(
-        params.params_C,
-        ptr_C,
-        params.block_mapping.problem_size.mn(),
-        thread_idx,
-        threadblock_item_begin);
-
-    // Tile iterator writing to destination tensor.
-    typename Epilogue::OutputTileIterator iterator_D(
-        params.params_D,
-        ptr_D,
-        params.block_mapping.problem_size.mn(),
-        thread_idx,
-        threadblock_item_begin);
-
-    // Execute the epilogue operator to update the destination tensor.
-    epilogue.reduce(
-        peer_idx_begin,
-        peer_idx_end,
-        reduce_fragment_idx,
-        params.partials_workspace,
-        EpilogueOutputOp(params.output_op),
-        iterator_D,
-        iterator_C);
-  }
-
-
-  CUTLASS_DEVICE
-  void process_tile(
-    TileWorkDesc tile_work,
-    int block_idx,
-    int dp_start_block_idx,
-    int block_iter_begin)
-  {
-    // Initialize input iterators
-    typename Mma::IteratorA iterator_A = init_iterator_A(tile_work, params.mode);
-    typename Mma::IteratorB iterator_B = init_iterator_B(tile_work, params.mode);
-
-    // Initialize accumulators
-    AccumulatorTile accumulator_tile;
-    accumulator_tile.clear();
-
-    // Initialize MMA abstraction
-    Mma mma(
-      shared_storage.main_loop,
-      thread_idx,
-      warp_idx,
-      lane_idx);
-
-    // Perform this tile's range of multiply-accumulate (MAC) iterations
-    mma(tile_work.k_iters_remaining, accumulator_tile, iterator_A, iterator_B, accumulator_tile);
-
-    if ((ThreadblockSwizzle::kReductionStrategy == ThreadblockSwizzle::kAtomic) ||
-        (params.block_mapping.reduction_blocks == 0) ||
-        (block_idx >= dp_start_block_idx))
-    {
-      //
-      // Cooperative SK peer reduction or DP block
-      //
-
-      int first_block_idx = params.block_mapping.get_first_block_idx(tile_work.tile_idx, block_idx);
-
-      if (!tile_work.tile_finished(params)) {
-        // Non "finishing" SK blocks must share their partial accumulator sums through global scratch workspace
-        share_accumulators(accumulator_tile, block_idx, first_block_idx);
-      }
-      else
-      {
-        // DP blocks and "finishing" SK blocks must perform epilogue operations and write the output tile
-        if (!tile_work.tile_started())
-        {
-          // A "finishing" SK block must first aggregate its accumulator partial sums with those shared by peer threadblocks
-          acquire_accumulators(accumulator_tile, block_idx, first_block_idx);
-        }
-
-        do_epilogue(tile_work, accumulator_tile);
-      }
-    }
-    else
-    {
-      //
-      // Separate peer reduction
-      //
-
-      // Share accumulator partial sums with peer threadblock(s) through scratch workspace
-      epilogue.share(block_idx, params.partials_workspace, accumulator_tile, tile_work.tile_started());
-
-      // Signal arrival
-      Barrier::arrive_range_inc(
-        params.barrier_workspace,
-        thread_idx,
-        tile_work.tile_idx * Epilogue::kAccumulatorFragments,
-        Epilogue::kAccumulatorFragments);
-    }
-  }
-
-
-  /// Executes one GEMM
-  CUTLASS_DEVICE
-  void gemm()
-  {
-    // Initialize block's iteration range
-    int tile_idx = 0;
-    int block_iter_begin = 0;
-    int block_iters_remaining = 0;
-
-    int block_idx = params.block_mapping.get_block_idx();
-
-    int sk_padding_start_block_idx =  params.block_mapping.sk_regions() * params.block_mapping.sk_blocks_per_region();
-    int dp_start_block_idx = params.block_mapping.sk_waves * params.block_mapping.avail_sms;
-    int reduce_start_block_idx = dp_start_block_idx + params.block_mapping.dp_blocks;
-    int grid_padding_start_block_idx = reduce_start_block_idx + params.block_mapping.reduction_blocks;
-
-    // Initialize tile work descriptor
-    TileWorkDesc tile_work;
-
-    bool dp_block = (block_idx >= dp_start_block_idx) && (block_idx < reduce_start_block_idx);
-    bool sk_block = (block_idx < sk_padding_start_block_idx);
-    bool reduce_block = (block_idx >= reduce_start_block_idx) &&
-            (block_idx < grid_padding_start_block_idx) &&
-            (ThreadblockSwizzle::kReductionStrategy == ThreadblockSwizzle::kMixed);
-
-    if (dp_block)
-    {
-      // This is a DP block
-      int dp_block_idx = block_idx - dp_start_block_idx;
-      int first_dp_tile = (params.block_mapping.cohort_raster) ? 0 : params.block_mapping.sk_tiles;
-
-      // Blocks in first DP wave get configured number of tiles
-      tile_idx = first_dp_tile + dp_block_idx;
-      int tile_allottment = params.block_mapping.dp_first_wave_tiles;
-
-      // Blocks in subsequent DP waves get 1 tile
-      if (dp_block_idx >= params.block_mapping.avail_sms) {
-          tile_allottment = 1;
-          tile_idx += (params.block_mapping.dp_first_wave_tiles - 1) * params.block_mapping.avail_sms;
-      }
-
-      block_iters_remaining = params.block_mapping.iters_per_tile() * tile_allottment;
-
-      init_dp_tile_work(tile_work, tile_idx);
-
-      // DP blocks exit if out of bounds or overlap an SK tile (only possible during cohort rasterization, where dp_first_wave_tiles must be 1)
-      if ((tile_idx < params.block_mapping.sk_tiles) ||
-          (tile_work.tiled_coord.m() >= params.block_mapping.tiled_shape().m()) ||
-          (tile_work.tiled_coord.n() >= params.block_mapping.tiled_shape().n()))
-      {
-        return;
-      }
-    }
-    else if (sk_block)
-    {
-      // This is a SK block
-      int block_iter_end;
-      params.block_mapping.get_iter_extents(block_idx, block_iter_begin, block_iter_end);
-      block_iters_remaining = block_iter_end - block_iter_begin;
-
-      tile_idx = params.block_mapping.get_sk_tile_idx(block_iter_end - 1);
-      init_sk_tile_work(tile_work, tile_idx, block_iter_begin, block_iter_begin + block_iters_remaining);
-    }
-    else
-    {
-      if (reduce_block)
-      {
-        // This is a reduction threadblock
-        int reduce_block_idx = block_idx - reduce_start_block_idx;
-        separate_reduction(reduce_block_idx);
-      }
-
-      return;
-    }
-
-    // Iteration-processing loop body
-    CUTLASS_PRAGMA_NO_UNROLL
-    while (true)
-    {
-      // Perform this block's share of work for this tile
-      process_tile(
-        tile_work,
-        block_idx,
-        dp_start_block_idx,
-        block_iter_begin);
-
-      block_iters_remaining -= tile_work.k_iters_remaining;
-
-      if (block_iters_remaining == 0)
-      {
-        break;
-      }
-
-      // Continue to next tile
-      __syncthreads();
-
-      if (block_idx >= dp_start_block_idx)
-      {
-        // DP block consume their tiles at stride
-        tile_idx += params.block_mapping.avail_sms;
-        init_dp_tile_work(tile_work, tile_idx);
-      }
-      else
-      {
-        // SK blocks consume their tiles in backwards order
-        tile_idx--;
-        init_sk_tile_work(tile_work, tile_idx, block_iter_begin, block_iter_begin + block_iters_remaining);
-      }
-    }
-
-  }
-
-
-public:
-
-  //
-  // Device-only API
-  //
-
-  // Factory invocation
-  CUTLASS_DEVICE
-  static void invoke(
-    Params const &params,
-    SharedStorage &shared_storage)
-  {
-    GemmUniversalStreamk op(params, shared_storage);
-    op();
-  }
-
-
-  // Constructor
-  CUTLASS_DEVICE
-  GemmUniversalStreamk(
-      Params const &params,
-      SharedStorage &shared_storage)
-    :
-      params(params),
-      shared_storage(shared_storage),
-      thread_idx(threadIdx.x),
-      warp_idx(__shfl_sync(0xffffffff, threadIdx.x / 32, 0)),   // broadcast the warp_id computed by lane 0 to ensure dependent code
-      lane_idx(threadIdx.x % 32),
-      epilogue(
-        shared_storage.epilogue,
-        thread_idx,
-        warp_idx,
-        lane_idx)
-  {}
-
-
-  /// Executes one GEMM
-  CUTLASS_DEVICE
-  void operator()()
-  {
-    // Generic SK code path
-    gemm();
-
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal_with_visitor.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal_with_visitor.h
deleted file mode 100755
index 5ce123a1a..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal_with_visitor.h
+++ /dev/null
@@ -1,321 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Gemm kernel with an epilogue defined under the epilogue visitor concept
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/kernel/gemm_universal.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Gemm that compute the epilogue visitor functor
-template <
-  typename Mma,                  ///! Threadblock-scoped matrix multiply-accumulate
-  typename Epilogue,             ///! Epilogue
-  typename ThreadblockSwizzle_   ///! Threadblock swizzling function
->
-class GemmWithEpilogueVisitor: public GemmUniversal<Mma, Epilogue, ThreadblockSwizzle_> {
-public:
-
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-
-  using Base = GemmUniversal<Mma, Epilogue, ThreadblockSwizzle>;
-  using Base::Base;
-
-  using FusionCallbacks = typename Epilogue::FusionCallbacks;
-
-  using ElementA = typename Base::ElementA;
-  using LayoutA = typename Base::LayoutA;
-  using ElementB = typename Base::ElementB;
-  using LayoutB = typename Base::LayoutB;
-  using ElementC = typename Base::ElementC;
-  using LayoutC = typename Base::LayoutC;
-
-  using ThreadblockShape = typename Mma::Shape;
-
-  //
-  // Structures
-  //
-
-  using SharedStorage = typename Base::SharedStorage;
-  using Arguments = typename Base::Arguments;
-
-  //
-  // Structure for precomputing values in host memory and passing to kernels
-  //
-
-  /// Parameters structure
-  struct Params : UniversalParamsBase<
-    ThreadblockSwizzle,
-    ThreadblockShape,
-    ElementA,
-    ElementB,
-    ElementC,
-    LayoutA,
-    LayoutB>
-  {
-    using ParamsBase = UniversalParamsBase<
-      ThreadblockSwizzle,
-      ThreadblockShape,
-      ElementA,
-      ElementB,
-      ElementC,
-      LayoutA,
-      LayoutB>;
-
-    //
-    // Data members
-    //
-    cute::Shape<int32_t,int32_t,int32_t> problem_shape;
-
-    typename Mma::IteratorA::Params params_A;
-    typename Mma::IteratorB::Params params_B;
-    typename FusionCallbacks::Params output_op;
-
-    void * ptr_A;
-    void * ptr_B;
-
-    int64_t batch_stride_A;
-    int64_t batch_stride_B;
-
-    int * ptr_gather_A_indices;
-    int * ptr_gather_B_indices;
-
-    //
-    // Host dispatch API
-    //
-
-    /// Default constructor
-    Params() = default;
-
-    /// Constructor
-    Params(
-      Arguments const &args,  /// GEMM application arguments
-      int device_sms,         /// Number of SMs on the device
-      int sm_occupancy)       /// Kernel SM occupancy (in thread blocks)
-    :
-      ParamsBase(args, device_sms, sm_occupancy),
-      params_A(args.lda ? make_Coord_with_padding<LayoutA::kStrideRank>(args.lda) : args.stride_a),
-      params_B(args.ldb ? make_Coord_with_padding<LayoutB::kStrideRank>(args.ldb) : args.stride_b),
-      output_op(FusionCallbacks::to_underlying_arguments(args.problem_size, args.epilogue, nullptr /*workspace*/)),
-      problem_shape({args.problem_size.m(), args.problem_size.n(), args.batch_count}),
-      ptr_A(const_cast<void *>(args.ptr_A)),
-      ptr_B(const_cast<void *>(args.ptr_B)),
-      batch_stride_A(args.batch_stride_A),
-      batch_stride_B(args.batch_stride_B),
-      ptr_gather_A_indices(const_cast<int *>(args.ptr_gather_A_indices)),
-      ptr_gather_B_indices(const_cast<int *>(args.ptr_gather_B_indices))
-    {
-      // Raise error on unsupported modes
-      assert(args.mode != GemmUniversalMode::kGemmSplitKParallel && "Sm80 EVT does not support SplitKParallel.");
-      assert(!(args.mode == GemmUniversalMode::kGemm && this->grid_tiled_shape.k() > 1 )
-        && "Sm80 EVT does not support SplitKSerial.");
-      assert(args.mode != GemmUniversalMode::kArray && "Sm80 EVT does not support Array Gemm.");
-    }
-
-    /// Lightweight update given a subset of arguments.
-    void update(Arguments const &args)
-    {
-      CUTLASS_TRACE_HOST("GemmUniversalwithVisitor::Params::update()");
-
-      // Update input pointers
-      ptr_A = const_cast<void *>(args.ptr_A);
-      ptr_B = const_cast<void *>(args.ptr_B);
-
-      batch_stride_A = args.batch_stride_A;
-      batch_stride_B = args.batch_stride_B;
-      this->batch_stride_D = args.batch_stride_D;
-
-      ptr_gather_A_indices = const_cast<int *>(args.ptr_gather_A_indices);
-      ptr_gather_B_indices = const_cast<int *>(args.ptr_gather_B_indices);
-
-      output_op = FusionCallbacks::to_underlying_arguments(args.problem_size, args.epilogue, nullptr /*workspace*/);
-      problem_shape = make_shape(args.problem_size.m(), args.problem_size.n(), args.batch_count);
-    }
-  };
-
-public:
-
-  //
-  // Device-only API
-  //
-
-  // Factory invocation
-  CUTLASS_DEVICE
-  static void invoke(
-    Params const &params,
-    SharedStorage &shared_storage)
-  {
-    GemmWithEpilogueVisitor op;
-    op(params, shared_storage);
-  }
-
-
-  /// Executes one GEMM
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-    ThreadblockSwizzle threadblock_swizzle;
-    run_with_swizzle(params, shared_storage, threadblock_swizzle);
-  }
-
-  /// Executes one GEMM with an externally-provided swizzling function
-  CUTLASS_DEVICE
-  void run_with_swizzle(Params const &params, SharedStorage &shared_storage, ThreadblockSwizzle& threadblock_swizzle) {
-
-    cutlass::gemm::GemmCoord threadblock_tile_offset =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    // Early exit if CTA is out of range
-    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
-      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
-
-      return;
-    }
-
-    int offset_k = 0;
-    int problem_size_k = params.problem_size.k();
-
-    ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A); 
-    ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);
-
-    //
-    // Fetch pointers based on mode.
-    //
-    if (params.mode == GemmUniversalMode::kGemm) {
-
-      if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {
-
-        problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size; 
-      }
-
-      offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
-    }
-    else if (params.mode == GemmUniversalMode::kBatched) {
-      ptr_A += threadblock_tile_offset.k() * params.batch_stride_A;
-      ptr_B += threadblock_tile_offset.k() * params.batch_stride_B;
-    }
-
-    __syncthreads();
-
-    // Compute initial location in logical coordinates
-    cutlass::MatrixCoord tb_offset_A{
-      threadblock_tile_offset.m() * Mma::Shape::kM,
-      offset_k,
-    };
-
-    cutlass::MatrixCoord tb_offset_B{
-      offset_k,
-      threadblock_tile_offset.n() * Mma::Shape::kN
-    };
-
-    // Compute position within threadblock
-    int thread_idx = threadIdx.x;
-
-    // Construct iterators to A and B operands
-    typename Mma::IteratorA iterator_A(
-      params.params_A,
-      ptr_A,
-      {params.problem_size.m(), problem_size_k},
-      thread_idx,
-      tb_offset_A,
-      params.ptr_gather_A_indices);
-
-    typename Mma::IteratorB iterator_B(
-      params.params_B,
-      ptr_B,
-      {problem_size_k, params.problem_size.n()},
-      thread_idx,
-      tb_offset_B,
-      params.ptr_gather_B_indices);
-
-    // Broadcast the warp_id computed by lane 0 to ensure dependent code
-    // is compiled as warp-uniform.
-    int warp_idx = canonical_warp_idx_sync();
-
-    int lane_idx = threadIdx.x % 32;
-
-    //
-    // Main loop
-    //
-
-    // Construct thread-scoped matrix multiply
-    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
-
-    typename Mma::FragmentC accumulators;
-
-    accumulators.clear();
-
-    // Compute threadblock-scoped matrix multiply-add
-    int gemm_k_iterations = (problem_size_k - offset_k + Mma::Shape::kK - 1) / Mma::Shape::kK;
-
-    // Compute threadblock-scoped matrix multiply-add
-    mma(
-      gemm_k_iterations, 
-      accumulators, 
-      iterator_A, 
-      iterator_B, 
-      accumulators);
-
-    //
-    // Epilogue
-    //
-
-    threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    Epilogue epilogue(
-      params.output_op,
-      shared_storage.epilogue, 
-      thread_idx, 
-      warp_idx, 
-      lane_idx);
-
-    // Execute the epilogue operator to update the destination tensor.
-    epilogue(accumulators, threadblock_tile_offset, params.problem_shape, thread_idx); 
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal_with_visitor_streamk.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal_with_visitor_streamk.h
deleted file mode 100755
index cdb825993..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal_with_visitor_streamk.h
+++ /dev/null
@@ -1,895 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Gemm kernel with an epilogue defined under the epilogue visitor concept with streamk.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/complex.h"
-#include "cutlass/barrier.h"
-#include "cutlass/block_striped.h"
-
-#include "cutlass/trace.h"
-#include "cutlass/gemm/kernel/gemm_universal_streamk.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate
-  typename Epilogue_,             ///! Epilogue
-  typename ThreadblockSwizzle_    ///! Threadblock mapping function
->
-class GemmWithEpilogueVisitorStreamk {
-public:
-
-  using Base = GemmUniversalStreamk<Mma_, Epilogue_, ThreadblockSwizzle_>;
-
-  //
-  // Types and constants
-  //
-
-  using Mma = Mma_;
-  using Epilogue = Epilogue_;
-  using FusionCallbacks = typename Epilogue::FusionCallbacks;
-  using EpilogueOutputOp = typename Epilogue::OutputOp;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-
-  using ElementA = typename Mma::IteratorA::Element;
-  using LayoutA = typename Mma::IteratorA::Layout;
-  using ElementB = typename Mma::IteratorB::Element;
-  using LayoutB = typename Mma::IteratorB::Layout;
-  using ElementC = typename Epilogue::OutputTileIterator::Element;
-  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
-
-  /// The per-thread tile of raw accumulators
-  using AccumulatorTile = typename Mma::FragmentC;
-
-  static ComplexTransform const kTransformA = Mma::kTransformA;
-  static ComplexTransform const kTransformB = Mma::kTransformB;
-  using Operator = typename Mma::Operator;
-
-  using OperatorClass = typename Mma::Operator::OperatorClass;
-  using ThreadblockShape = typename Mma::Shape;
-  using WarpShape = typename Mma::Operator::Shape;
-  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
-  using ArchTag = typename Mma::ArchTag;
-
-  static int const kStages = Mma::kStages;
-  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
-  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
-  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
-
-  /// Warp count (concept: GemmShape)
-  using WarpCount = typename Mma::WarpCount;
-  static int const kThreadCount = 32 * WarpCount::kCount;
-
-  /// Workspace bytes per thread block
-  static size_t const kWorkspaceBytesPerBlock =
-    __NV_STD_MAX(
-      kThreadCount * sizeof(AccumulatorTile),
-      Epilogue::kWorkspaceBytesPerBlock);
-
-  /// Block-striped reduction utility
-  using BlockStripedReduceT = BlockStripedReduce<kThreadCount, AccumulatorTile>;
-
-
-
-  //
-  // Structures
-  //
-
-  using Arguments = typename Base::Arguments;
-
-
-  /// Parameters structure
-  struct Params
-  {
-  public:
-
-    //
-    // Data members
-    //
-    cute::Shape<int32_t,int32_t,int32_t> problem_shape{};
-
-    void * ptr_A{nullptr};
-    void * ptr_B{nullptr};
-
-    typename Mma::IteratorA::Params params_A{};
-    typename Mma::IteratorB::Params params_B{};
-
-    int64_t batch_stride_A{0};
-    int64_t batch_stride_B{0};
-
-    GemmUniversalMode mode{GemmUniversalMode::kGemm};
-
-    ThreadblockSwizzle block_mapping{};
-
-    void *barrier_workspace{nullptr};
-    void *partials_workspace{nullptr};
-
-    typename FusionCallbacks::Params output_op{};
-
-
-    void * ptr_D{nullptr};
-    void * ptr_C{nullptr};
-
-    typename Epilogue::OutputTileIterator::Params params_D{};
-    typename Epilogue::OutputTileIterator::Params params_C{};
-
-    int64_t batch_stride_D{0};
-    int64_t batch_stride_C{0};
-
-
-  protected:
-
-    //
-    // Host-only dispatch-utilities
-    //
-
-    /// Pad the given allocation size up to the nearest cache line
-    static size_t cacheline_align_up(size_t size)
-    {
-      static const int CACHELINE_SIZE = 128;
-      return (size + CACHELINE_SIZE - 1) / CACHELINE_SIZE * CACHELINE_SIZE;
-    }
-
-    /// Get the workspace size needed for barrier
-    size_t get_barrier_workspace_size() const
-    {
-      // For atomic reduction, each SK-block needs a synchronization flag.  For parallel reduction,
-      // each reduction block needs its own synchronization flag.
-      int sk_blocks = block_mapping.sk_regions() * block_mapping.sk_blocks_per_region();
-      int num_flags = fast_max(sk_blocks, block_mapping.reduction_blocks);
-
-      return cacheline_align_up(sizeof(typename Barrier::T) * num_flags);
-    }
-
-    /// Get the workspace size needed for intermediate partial sums
-    size_t get_partials_workspace_size() const
-    {
-      int sk_blocks = block_mapping.sk_regions() * block_mapping.sk_blocks_per_region();
-      return cacheline_align_up(kWorkspaceBytesPerBlock * sk_blocks);
-    }
-
-
-  public:
-
-    //
-    // Host dispatch API
-    //
-
-    /// Default constructor
-    Params() = default;
-
-
-    /// Constructor
-    Params(
-      Arguments const &args,  /// GEMM application arguments
-      int device_sms,         /// Number of SMs on the device
-      int sm_occupancy)       /// Kernel SM occupancy (in thread blocks)
-    :
-      problem_shape({args.problem_size.m(), args.problem_size.n(), args.batch_count}),
-      params_A(args.lda ? make_Coord_with_padding<LayoutA::kStrideRank>(args.lda) : args.stride_a),
-      params_B(args.ldb ? make_Coord_with_padding<LayoutB::kStrideRank>(args.ldb) : args.stride_b),
-      params_C(args.ldc ? make_Coord_with_padding<LayoutC::kStrideRank>(args.ldc) : args.stride_c),
-      params_D(args.ldd ? make_Coord_with_padding<LayoutC::kStrideRank>(args.ldd) : args.stride_d),
-      output_op(FusionCallbacks::to_underlying_arguments(args.problem_size, args.epilogue, nullptr /*workspace*/)),
-      mode(args.mode),
-      ptr_A(const_cast<void *>(args.ptr_A)),
-      ptr_B(const_cast<void *>(args.ptr_B)),
-      ptr_C(const_cast<void *>(args.ptr_C)),
-      ptr_D(args.ptr_D),
-      batch_stride_A(args.batch_stride_A),
-      batch_stride_B(args.batch_stride_B),
-      batch_stride_C(args.batch_stride_C),
-      batch_stride_D(args.batch_stride_D),
-      barrier_workspace(nullptr),
-      partials_workspace(nullptr)
-    {
-      // Number of SMs to make available for StreamK decomposition
-      int avail_sms = (args.avail_sms == -1) ?
-                        device_sms :
-                        fast_min(args.avail_sms, device_sms);
-
-      // Initialize the block mapping structure
-      block_mapping = ThreadblockSwizzle(
-        args.mode,
-        args.problem_size,
-        {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-        args.batch_count,
-        sm_occupancy,
-        device_sms,
-        avail_sms,
-        sizeof(ElementA),
-        sizeof(ElementB),
-        sizeof(ElementC),
-        Epilogue::kAccumulatorFragments);
-    }
-
-
-    /// Returns the workspace size (in bytes) needed for these parameters
-    size_t get_workspace_size() const
-    {
-      return
-        get_barrier_workspace_size() +
-        get_partials_workspace_size();
-    }
-
-
-    /// Assign and initialize the specified workspace buffer.  Assumes
-    /// the memory allocated to workspace is at least as large as get_workspace_size().
-    Status init_workspace(
-      void *workspace,
-      cudaStream_t stream = nullptr)
-    {
-      uint8_t *ptr = static_cast<uint8_t*>(workspace);
-
-      // Establish partials workspace
-      partials_workspace = nullptr;
-      size_t partials_workspace_bytes = get_partials_workspace_size();
-      if (partials_workspace_bytes > 0)
-      {
-        if (!workspace) {
-          return Status::kErrorWorkspaceNull;
-        }
-        partials_workspace = ptr;
-        ptr += partials_workspace_bytes;
-      }
-
-      // Establish barrier workspace
-      barrier_workspace = nullptr;
-      size_t barrier_workspace_bytes = get_barrier_workspace_size();
-      if (barrier_workspace_bytes > 0)
-      {
-        if (!workspace) {
-          return Status::kErrorWorkspaceNull;
-        }
-        barrier_workspace = ptr;
-        ptr += barrier_workspace_bytes;
-      }
-
-      // Zero-initialize barrier workspace
-      if (barrier_workspace)
-      {
-        size_t barrier_workspace_bytes = get_barrier_workspace_size();
-
-        CUTLASS_TRACE_HOST("  Initialize " << barrier_workspace_bytes << " barrier bytes");
-
-        cudaError_t result = cudaMemsetAsync(
-          barrier_workspace,
-          0,
-          barrier_workspace_bytes,
-          stream);
-
-        if (result != cudaSuccess) {
-          CUTLASS_TRACE_HOST("  cudaMemsetAsync() returned error " << cudaGetErrorString(result));
-          return Status::kErrorInternal;
-        }
-      }
-
-      return Status::kSuccess;
-    }
-
-
-    /// Returns the GEMM volume in thread block tiles
-    cutlass::gemm::GemmCoord get_tiled_shape() const
-    {
-      return block_mapping.tiled_shape();
-    }
-
-
-    /// Returns the total number of thread blocks to launch
-    int get_grid_blocks() const
-    {
-      dim3 grid_dims = get_grid_dims();
-      return grid_dims.x * grid_dims.y * grid_dims.z;
-    }
-
-
-    /// Returns the grid extents in thread blocks to launch
-    dim3 get_grid_dims() const
-    {
-      return block_mapping.get_grid_dims();
-    }
-
-
-    /// Lightweight update given a subset of arguments.
-    void update(Arguments const &args)
-    {
-      CUTLASS_TRACE_HOST("GemmUniversalStreamK::Params::update()");
-
-      // Update input/output pointers
-      ptr_A = const_cast<void *>(args.ptr_A);
-      ptr_B = const_cast<void *>(args.ptr_B);
-      ptr_C = const_cast<void *>(args.ptr_C);
-      ptr_D = args.ptr_D;
-
-      batch_stride_A = args.batch_stride_A;
-      batch_stride_B = args.batch_stride_B;
-      batch_stride_C = args.batch_stride_C;
-      batch_stride_D = args.batch_stride_D;
-
-      output_op = FusionCallbacks::to_underlying_arguments(args.problem_size, args.epilogue, nullptr /*workspace*/);
-      problem_shape = make_shape(args.problem_size.m(), args.problem_size.n(), args.batch_count);
-    }
-
-  };
-
-  struct TileWorkDesc: Base::TileWorkDesc {
-    int k_end;
-    CUTLASS_DEVICE
-    bool tile_finished(Params const &params)
-    {
-      return (k_end == params.block_mapping.problem_size.k());
-    }
-  };
-
-  // using TileWorkDesc = typename Base::TileWorkDesc;
-  using SharedStorage = typename Base::SharedStorage;
-
-protected:
-
-  //
-  // Data members
-  //
-
-  /// GEMM problem parameters
-  Params params;
-
-  /// Shared storage reference
-  SharedStorage &shared_storage;
-
-  /// ID within the threadblock
-  int thread_idx;
-
-  /// ID of warp
-  int warp_idx;
-
-  /// ID of each thread within a warp
-  int lane_idx;
-
-  /// Threadblock scoped epilogue
-  Epilogue epilogue;
-
-
-public:
-
-  //
-  // Host-only dispatch API
-  //
-
-  /// Determines whether the GEMM problem size satisfies this kernel's
-  /// alignment requirements
-  static Status can_implement(
-    cutlass::gemm::GemmCoord const & problem_size)
-  {
-    return Base::can_implement(problem_size);
-  }
-
-  /// Determines whether the GEMM problem satisfies this kernel's
-  /// alignment requirements
-  static Status can_implement(Arguments const &args) {
-    return can_implement(args.problem_size);
-  }
-
-protected:
-
-  //
-  // Device-only utility methods
-  //
-
-  /// Iterator for fetching tile fragments from A
-  CUTLASS_DEVICE
-  typename Mma::IteratorA init_iterator_A(
-    TileWorkDesc &tile_work,
-    GemmUniversalMode mode)
-  {
-    // The input A matrix
-    ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A);
-
-    // Update input pointers based on batched/array mode
-    if (mode == GemmUniversalMode::kBatched) {
-      ptr_A += tile_work.tiled_coord.k() * params.batch_stride_A;
-    }
-    if (mode == GemmUniversalMode::kArray) {
-      ptr_A = static_cast<ElementA * const *>(params.ptr_A)[tile_work.tiled_coord.k()];
-    }
-
-    int m_begin = tile_work.tiled_coord.m() * Mma::Shape::kM;
-    int m_end = params.block_mapping.problem_size.m();
-    return Mma::IteratorA(
-        params.params_A,
-        ptr_A,
-        { m_end, tile_work.k_end },
-        threadIdx.x,
-        { m_begin, tile_work.k_begin });
-
-  }
-
-
-  /// Iterator for fetching tile fragments from B
-  CUTLASS_DEVICE
-  typename Mma::IteratorB init_iterator_B(
-    TileWorkDesc &tile_work,
-    GemmUniversalMode mode)
-  {
-    // The input B matrix
-    ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);
-
-    // Update input pointers based on batched/array mode
-    if (mode == GemmUniversalMode::kBatched) {
-      ptr_B += tile_work.tiled_coord.k() * params.batch_stride_B;
-    }
-    if (mode == GemmUniversalMode::kArray) {
-      ptr_B = static_cast<ElementB * const *>(params.ptr_B)[tile_work.tiled_coord.k()];
-    }
-
-    int n_begin = tile_work.tiled_coord.n() * Mma::Shape::kN;
-    int n_end = params.block_mapping.problem_size.n();
-    return Mma::IteratorB(
-        params.params_B,
-        ptr_B,
-        { tile_work.k_end, n_end },
-        threadIdx.x,
-        { tile_work.k_begin, n_begin });
-  }
-
-
-  CUTLASS_DEVICE
-  void init_dp_tile_work(
-      TileWorkDesc &tile_work,
-      int tile_idx)
-  {
-    // The linear tile index
-    tile_work.tile_idx = tile_idx;
-
-    // The first global-scoped MAC-iteration this threadblock will perform for this tile
-    tile_work.iter_begin = tile_idx * params.block_mapping.iters_per_tile();
-
-    // The number of MAC-iterations this threadblock will perform for this tile
-    tile_work.k_iters_remaining = params.block_mapping.iters_per_tile();
-
-    // The starting index in the k-domain for MAC-iterations this threadblock will perform for this tile
-    tile_work.k_begin = 0;
-
-    // The ending index (one-past) in the k-domain for MAC-iterations this threadblock will perform for this tile
-    tile_work.k_end = params.block_mapping.problem_size.k();
-
-    // The location of this tile (in threadblock-tile coordinates) in the output matrix
-    tile_work.tiled_coord = params.block_mapping.get_tile_offset(tile_work.tile_idx);
-  }
-
-
-  CUTLASS_DEVICE
-  void init_sk_tile_work(
-      TileWorkDesc &tile_work,
-      int tile_idx,
-      int block_iter_begin,
-      int block_iter_end)
-  {
-    // The linear tile index
-    tile_work.tile_idx = tile_idx;
-
-    // The first global-scoped MAC-iteration for this tile
-    int tile_iter_begin = tile_idx * params.block_mapping.iters_per_tile();
-
-    // The first global-scoped MAC-iteration this threadblock will perform for this tile
-    tile_work.iter_begin = max(block_iter_begin, tile_iter_begin);
-
-    // The first tile-scoped MAC-iteration this threadblock will perform for this tile
-    int k_iter_begin = tile_work.iter_begin - tile_iter_begin;
-
-    // The last (one past) tile-scoped MAC-iteration this threadblock will perform for this tile
-    int k_iter_end = block_iter_end - tile_iter_begin;
-
-    // The number of MAC-iterations this threadblock will perform for this tile
-    tile_work.k_iters_remaining = k_iter_end - k_iter_begin;
-
-    // The starting index in the k-domain for MAC-iterations this threadblock will perform for this tile
-    tile_work.k_begin = k_iter_begin * Mma::Shape::kK;
-
-    // The ending index (one-past) in the k-domain for MAC-iterations this threadblock will perform for this tile
-    tile_work.k_end = min(
-        params.block_mapping.problem_size.k(),            // extent of k domain
-        (k_iter_end * Mma::Shape::kK));                   // extent of the threadblock's global iteration assignment
-
-    // The location of this tile (in threadblock-tile coordinates) in the output matrix
-    tile_work.tiled_coord = params.block_mapping.get_tile_offset(tile_work.tile_idx);
-  }
-
-
-  /// Share accumulators with peers
-  CUTLASS_DEVICE
-  void share_accumulators(
-    AccumulatorTile const &accumulator_tile,
-    int block_idx,
-    int first_block_idx)
-  {
-    AccumulatorTile *accum_tile_workspace = reinterpret_cast<AccumulatorTile *>(params.partials_workspace);
-
-    int accum_tile_offset = first_block_idx * kThreadCount;
-
-    if (block_idx == first_block_idx)
-    {
-      // First peer initializes the workspace partials
-      BlockStripedReduceT::store(accum_tile_workspace + accum_tile_offset, accumulator_tile, thread_idx);
-    }
-    else
-    {
-      // Subsequent peers atomically accumulate into the workspace partials
-      if (ThreadblockSwizzle::kReductionStrategy == ThreadblockSwizzle::kAtomic)
-      {
-        // Non-deterministic reduction order: wait for the first peer to have initialized the partials before we add to them
-        Barrier::wait_lt(params.barrier_workspace, thread_idx, first_block_idx, 1);
-      }
-      else
-      {
-        // Turnstile reduction order: wait until the previous peer has written
-        int wait_count = block_idx - first_block_idx;
-        Barrier::wait_eq(params.barrier_workspace, thread_idx, first_block_idx, wait_count);
-      }
-
-      // Perform reduction in workspace
-      BlockStripedReduceT::reduce(accum_tile_workspace + accum_tile_offset, accumulator_tile, thread_idx);
-    }
-
-    // Signal our arrival
-    Barrier::arrive_inc(params.barrier_workspace, thread_idx, first_block_idx);
-  }
-
-
-  /// Acquire accumulators from peers
-  CUTLASS_DEVICE
-  void acquire_accumulators(
-    AccumulatorTile &accumulator_tile,
-    int block_idx,
-    int first_block_idx)
-  {
-    AccumulatorTile *accum_tile_workspace = reinterpret_cast<AccumulatorTile *>(params.partials_workspace);
-
-    // Wait for arrival
-    int num_carry_in = block_idx - first_block_idx;
-    Barrier::wait_eq_reset(params.barrier_workspace, thread_idx, first_block_idx, num_carry_in);
-
-    // Load and add peer-partials accumulator tile to local accumulator tile
-    int accum_tile_offset = first_block_idx * kThreadCount;
-    BlockStripedReduceT::load_add(accumulator_tile, accum_tile_workspace + accum_tile_offset, thread_idx);
-  }
-
-
-  /// Perform epilogue computations and output
-  CUTLASS_DEVICE
-  void do_epilogue(
-    TileWorkDesc &tile_work,
-    AccumulatorTile &accumulator_tile)
-  {
-    cutlass::gemm::GemmCoord threadblock_tile_offset{
-      tile_work.tiled_coord.m(),
-      tile_work.tiled_coord.n(),
-      tile_work.tiled_coord.k()
-    };
-
-    // Execute the epilogue operator to update the destination tensor.
-    epilogue(
-      accumulator_tile,
-      threadblock_tile_offset,
-      params.problem_shape,
-      thread_idx);
-  }
-
-
-  CUTLASS_DEVICE
-  void separate_reduction(int reduce_idx)
-  {
-    int peer_idx_begin, peer_idx_last, reduce_tile_idx, reduce_fragment_idx;
-
-    // Reduce by sk-tile (every tile contributed to by one or more blocks)
-    reduce_tile_idx = reduce_idx / Epilogue::kAccumulatorFragments;
-    reduce_fragment_idx = reduce_idx % Epilogue::kAccumulatorFragments;
-
-    int iter_tile_first = reduce_tile_idx * params.block_mapping.iters_per_tile();
-    int iter_tile_last = iter_tile_first + params.block_mapping.iters_per_tile() - 1;
-
-    peer_idx_begin = params.block_mapping.get_sk_block_idx(iter_tile_first);
-    peer_idx_last = params.block_mapping.get_sk_block_idx(iter_tile_last);
-
-    // Wait for peers to complete
-    int peer_idx_end = peer_idx_last + 1;
-    int num_peers = peer_idx_end - peer_idx_begin;
-    Barrier::wait_eq_reset(
-        params.barrier_workspace,
-        thread_idx,
-        (reduce_tile_idx * Epilogue::kAccumulatorFragments) + reduce_fragment_idx,
-        num_peers);
-
-    /// The location of this tile (in threadblock-tile coordinates) in the output matrix
-    GemmCoord tiled_coord = params.block_mapping.get_tile_offset(reduce_tile_idx);
-
-    // Execute the epilogue operator to update the destination tensor.
-    epilogue.reduce(
-        peer_idx_begin,
-        peer_idx_end,
-        reduce_fragment_idx,
-        params.partials_workspace,
-        tiled_coord,
-        params.problem_shape,
-        thread_idx);
-  }
-
-
-  CUTLASS_DEVICE
-  void process_tile(
-    TileWorkDesc tile_work,
-    int block_idx,
-    int dp_start_block_idx,
-    int block_iter_begin)
-  {
-    // Initialize input iterators
-    typename Mma::IteratorA iterator_A = init_iterator_A(tile_work, params.mode);
-    typename Mma::IteratorB iterator_B = init_iterator_B(tile_work, params.mode);
-
-    // Initialize accumulators
-    AccumulatorTile accumulator_tile;
-    accumulator_tile.clear();
-
-    // Initialize MMA abstraction
-    Mma mma(
-      shared_storage.main_loop,
-      thread_idx,
-      warp_idx,
-      lane_idx);
-
-    // Perform this tile's range of multiply-accumulate (MAC) iterations
-    mma(tile_work.k_iters_remaining, accumulator_tile, iterator_A, iterator_B, accumulator_tile);
-
-    if ((ThreadblockSwizzle::kReductionStrategy == ThreadblockSwizzle::kAtomic) ||
-        (params.block_mapping.reduction_blocks == 0) ||
-        (block_idx >= dp_start_block_idx))
-    {
-      //
-      // Cooperative SK peer reduction or DP block
-      //
-
-      int first_block_idx = params.block_mapping.get_first_block_idx(tile_work.tile_idx, block_idx);
-
-      if (!tile_work.tile_finished(params)) {
-        // Non "finishing" SK blocks must share their partial accumulator sums through global scratch workspace
-        share_accumulators(accumulator_tile, block_idx, first_block_idx);
-      }
-      else
-      {
-        // DP blocks and "finishing" SK blocks must perform epilogue operations and write the output tile
-        if (!tile_work.tile_started())
-        {
-          // A "finishing" SK block must first aggregate its accumulator partial sums with those shared by peer threadblocks
-          acquire_accumulators(accumulator_tile, block_idx, first_block_idx);
-        }
-
-        do_epilogue(tile_work, accumulator_tile);
-      }
-    }
-    else
-    {
-      //
-      // Separate peer reduction
-      //
-
-      // Share accumulator partial sums with peer threadblock(s) through scratch workspace
-      epilogue.share(block_idx, params.partials_workspace, accumulator_tile, tile_work.tile_started());
-
-      // Signal arrival
-      Barrier::arrive_range_inc(
-        params.barrier_workspace,
-        thread_idx,
-        tile_work.tile_idx * Epilogue::kAccumulatorFragments,
-        Epilogue::kAccumulatorFragments);
-    }
-  }
-
-
-  /// Executes one GEMM
-  CUTLASS_DEVICE
-  void gemm()
-  {
-    // Initialize block's iteration range
-    int tile_idx = 0;
-    int block_iter_begin = 0;
-    int block_iters_remaining = 0;
-
-    int block_idx = params.block_mapping.get_block_idx();
-
-    int sk_padding_start_block_idx =  params.block_mapping.sk_regions() * params.block_mapping.sk_blocks_per_region();
-    int dp_start_block_idx = params.block_mapping.sk_waves * params.block_mapping.avail_sms;
-    int reduce_start_block_idx = dp_start_block_idx + params.block_mapping.dp_blocks;
-    int grid_padding_start_block_idx = reduce_start_block_idx + params.block_mapping.reduction_blocks;
-
-    // Initialize tile work descriptor
-    TileWorkDesc tile_work;
-
-    bool dp_block = (block_idx >= dp_start_block_idx) && (block_idx < reduce_start_block_idx);
-    bool sk_block = (block_idx < sk_padding_start_block_idx);
-    bool reduce_block = (block_idx >= reduce_start_block_idx) &&
-            (block_idx < grid_padding_start_block_idx) &&
-            (ThreadblockSwizzle::kReductionStrategy == ThreadblockSwizzle::kMixed);
-
-    if (dp_block)
-    {
-      // This is a DP block
-      int dp_block_idx = block_idx - dp_start_block_idx;
-      int first_dp_tile = (params.block_mapping.cohort_raster) ? 0 : params.block_mapping.sk_tiles;
-
-      // Blocks in first DP wave get configured number of tiles
-      tile_idx = first_dp_tile + dp_block_idx;
-      int tile_allottment = params.block_mapping.dp_first_wave_tiles;
-
-      // Blocks in subsequent DP waves get 1 tile
-      if (dp_block_idx >= params.block_mapping.avail_sms) {
-          tile_allottment = 1;
-          tile_idx += (params.block_mapping.dp_first_wave_tiles - 1) * params.block_mapping.avail_sms;
-      }
-
-      block_iters_remaining = params.block_mapping.iters_per_tile() * tile_allottment;
-
-      init_dp_tile_work(tile_work, tile_idx);
-
-      // DP blocks exit if out of bounds or overlap an SK tile (only possible during cohort rasterization, where dp_first_wave_tiles must be 1)
-      if ((tile_idx < params.block_mapping.sk_tiles) ||
-          (tile_work.tiled_coord.m() >= params.block_mapping.tiled_shape().m()) ||
-          (tile_work.tiled_coord.n() >= params.block_mapping.tiled_shape().n()))
-      {
-        return;
-      }
-    }
-    else if (sk_block)
-    {
-      // This is a SK block
-      int block_iter_end;
-      params.block_mapping.get_iter_extents(block_idx, block_iter_begin, block_iter_end);
-      block_iters_remaining = block_iter_end - block_iter_begin;
-
-      tile_idx = params.block_mapping.get_sk_tile_idx(block_iter_end - 1);
-      init_sk_tile_work(tile_work, tile_idx, block_iter_begin, block_iter_begin + block_iters_remaining);
-    }
-    else
-    {
-      if (reduce_block)
-      {
-        // This is a reduction threadblock
-        int reduce_block_idx = block_idx - reduce_start_block_idx;
-        separate_reduction(reduce_block_idx);
-      }
-
-      return;
-    }
-
-    // Iteration-processing loop body
-    CUTLASS_PRAGMA_NO_UNROLL
-    while (true)
-    {
-      // Perform this block's share of work for this tile
-      process_tile(
-        tile_work,
-        block_idx,
-        dp_start_block_idx,
-        block_iter_begin);
-
-      block_iters_remaining -= tile_work.k_iters_remaining;
-
-      if (block_iters_remaining == 0)
-      {
-        break;
-      }
-
-      // Continue to next tile
-      __syncthreads();
-
-      if (block_idx >= dp_start_block_idx)
-      {
-        // DP block consume their tiles at stride
-        tile_idx += params.block_mapping.avail_sms;
-        init_dp_tile_work(tile_work, tile_idx);
-      }
-      else
-      {
-        // SK blocks consume their tiles in backwards order
-        tile_idx--;
-        init_sk_tile_work(tile_work, tile_idx, block_iter_begin, block_iter_begin + block_iters_remaining);
-      }
-    }
-
-  }
-
-
-public:
-
-  //
-  // Device-only API
-  //
-
-  // Factory invocation
-  CUTLASS_DEVICE
-  static void invoke(
-    Params const &params,
-    SharedStorage &shared_storage)
-  {
-    GemmWithEpilogueVisitorStreamk op(params, shared_storage);
-    op();
-  }
-
-
-  CUTLASS_DEVICE
-  GemmWithEpilogueVisitorStreamk(
-      Params const &params,
-      SharedStorage &shared_storage)
-    :
-      params(params),
-      shared_storage(shared_storage),
-      thread_idx(threadIdx.x),
-      warp_idx(__shfl_sync(0xffffffff, threadIdx.x / 32, 0)),   // broadcast the warp_id computed by lane 0 to ensure dependent code
-      lane_idx(threadIdx.x % 32),
-      epilogue(
-        params.output_op,
-        shared_storage.epilogue,
-        thread_idx,
-        warp_idx,
-        lane_idx)
-  {}
-
-
-  /// Executes one GEMM
-  CUTLASS_DEVICE
-  void operator()()
-  {
-    // Generic SK code path
-    gemm();
-
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_with_absmax.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_with_absmax.h
deleted file mode 100755
index 470eaef53..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_with_absmax.h
+++ /dev/null
@@ -1,759 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Gemm kernel with an epilogue that computes the absolute maximum value of the output
-    and a pre-activation-function auxiliary output. The auxiliary output is also (optionally)
-    stored to global memory.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/layout/layout.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/complex.h"
-#include "cutlass/semaphore.h"
-#include "cutlass/gemm/kernel/params_universal_base.h"
-
-#include "cutlass/trace.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Gemm that computes the absolute maximum value of the output and a pre-activation-function
-// auxiliary output.
-template <
-  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate
-  typename Epilogue_,             ///! Epilogue
-  typename ThreadblockSwizzle_    ///! Threadblock swizzling function
->
-struct GemmWithAbsMax {
-public:
-
-  using Mma = Mma_;
-  using Epilogue = Epilogue_;
-  using EpilogueOutputOp = typename Epilogue::OutputOp;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-
-  using ElementA = typename Mma::IteratorA::Element;
-  using LayoutA = typename Mma::IteratorA::Layout;
-  using ElementB = typename Mma::IteratorB::Element;
-  using LayoutB = typename Mma::IteratorB::Layout;
-  using ElementC = typename Epilogue::OutputTileIterator::Element;
-  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
-
-  static ComplexTransform const kTransformA = Mma::kTransformA;
-  static ComplexTransform const kTransformB = Mma::kTransformB;
-  using Operator = typename Mma::Operator;
-
-  using OperatorClass = typename Mma::Operator::OperatorClass;
-  using ThreadblockShape = typename Mma::Shape;
-  using WarpShape = typename Mma::Operator::Shape;
-  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
-  using ArchTag = typename Mma::ArchTag;
-
-  static int const kStages = Mma::kStages;
-  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
-  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
-  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
-
-  /// Warp count (concept: GemmShape)
-  using WarpCount = typename Mma::WarpCount;
-  static int const kThreadCount = 32 * WarpCount::kCount;
-
-  /// Split-K preserves splits that are 128b aligned
-  static int const kSplitKAlignment = const_max(
-    128 / sizeof_bits<ElementA>::value,
-    128 / sizeof_bits<ElementB>::value
-  );
-
-  //
-  // Structures
-  //
-
-  /// Argument structure
-  struct Arguments : UniversalArgumentsBase
-  {
-    //
-    // Data members
-    //
-
-    typename EpilogueOutputOp::Params epilogue;
-
-    void const * ptr_A;
-    void const * ptr_B;
-    void const * ptr_C;
-    void * ptr_D;
-    void * ptr_Aux;
-
-    void * ptr_Vector;
-
-    int64_t batch_stride_A;
-    int64_t batch_stride_B;
-    int64_t batch_stride_C;
-    int64_t batch_stride_Vector;
-
-    typename LayoutA::Stride::Index lda;
-    typename LayoutB::Stride::Index ldb;
-    typename LayoutC::Stride::Index ldc;
-    typename LayoutC::Stride::Index ldd;
-    typename LayoutC::Stride::Index ldaux;
-    typename LayoutC::Stride::Index ldr;
-
-    //
-    // Methods
-    //
-
-    Arguments():
-      ptr_A(nullptr),
-      ptr_B(nullptr),
-      ptr_C(nullptr),
-      ptr_D(nullptr),
-      ptr_Aux(nullptr)
-    {}
-
-    /// Constructs an arguments structure with ldaux
-    Arguments(
-      GemmUniversalMode mode,
-      GemmCoord problem_size,
-      int batch_count,
-      typename EpilogueOutputOp::Params epilogue,
-      void const * ptr_A,
-      void const * ptr_B,
-      void const * ptr_C,
-      void * ptr_D,
-      void * ptr_Aux,
-      void * ptr_Vector,
-      int64_t batch_stride_A,
-      int64_t batch_stride_B,
-      int64_t batch_stride_C,
-      int64_t batch_stride_D,
-      int64_t batch_stride_Vector,
-      typename LayoutA::Stride::Index lda,
-      typename LayoutB::Stride::Index ldb,
-      typename LayoutC::Stride::Index ldc,
-      typename LayoutC::Stride::Index ldd,
-      typename LayoutC::Stride::Index ldr,
-      typename LayoutC::Stride::Index ldaux)
-    :
-      UniversalArgumentsBase(mode, problem_size, batch_count, batch_stride_D),
-      epilogue(epilogue),
-      ptr_A(ptr_A), ptr_B(ptr_B), ptr_C(ptr_C), ptr_D(ptr_D), ptr_Aux(ptr_Aux),
-      ptr_Vector(ptr_Vector),
-      batch_stride_A(batch_stride_A),
-      batch_stride_B(batch_stride_B),
-      batch_stride_C(batch_stride_C),
-      batch_stride_Vector(batch_stride_Vector),
-      lda(lda), ldb(ldb), ldc(ldc), ldd(ldd), ldaux(ldaux), ldr(ldr)
-    {
-    }
-
-    /// Constructs an Arguments structure without ldaux.
-    /// These parameters are overridden with D batch stride and ldd.
-    Arguments(
-      GemmUniversalMode mode,
-      GemmCoord problem_size,
-      int batch_count,
-      typename EpilogueOutputOp::Params epilogue,
-      void const * ptr_A,
-      void const * ptr_B,
-      void const * ptr_C,
-      void * ptr_D,
-      void * ptr_Aux,
-      void * ptr_Vector,
-      int64_t batch_stride_A,
-      int64_t batch_stride_B,
-      int64_t batch_stride_C,
-      int64_t batch_stride_D,
-      int64_t batch_stride_Vector,
-      typename LayoutA::Stride::Index lda,
-      typename LayoutB::Stride::Index ldb,
-      typename LayoutC::Stride::Index ldc,
-      typename LayoutC::Stride::Index ldd,
-      typename LayoutC::Stride::Index ldr)
-    : Arguments(mode, problem_size, batch_count, epilogue, ptr_A, ptr_B, ptr_C, ptr_D, ptr_Aux, ptr_Vector,
-               batch_stride_A, batch_stride_B, batch_stride_C, batch_stride_D, batch_stride_Vector,
-               lda, ldb, ldc, ldd, ldr, ldd)
-    {
-    }
-
-    /// Returns arguments for the transposed problem
-    Arguments transposed_problem() const {
-      Arguments args(*this);
-
-      std::swap(args.problem_size.m(), args.problem_size.n());
-      std::swap(args.ptr_A, args.ptr_B);
-      std::swap(args.lda, args.ldb);
-      std::swap(args.batch_stride_A, args.batch_stride_B);
-
-      return args;
-    }
-  };
-
-
-  //
-  // Structure for precomputing values in host memory and passing to kernels
-  //
-
-  /// Parameters structure
-  struct Params : UniversalParamsBase<
-    ThreadblockSwizzle,
-    ThreadblockShape,
-    ElementA,
-    ElementB,
-    ElementC,
-    LayoutA,
-    LayoutB>
-  {
-    using ParamsBase = UniversalParamsBase<
-      ThreadblockSwizzle,
-      ThreadblockShape,
-      ElementA,
-      ElementB,
-      ElementC,
-      LayoutA,
-      LayoutB>;
-
-    //
-    // Data members
-    //
-
-    typename Mma::IteratorA::Params params_A;
-    typename Mma::IteratorB::Params params_B;
-    typename Epilogue::OutputTileIterator::Params params_C;
-    typename Epilogue::OutputTileIterator::Params params_D;
-    typename Epilogue::AuxOutputTileIterator::Params params_Aux;
-
-    typename EpilogueOutputOp::Params output_op;
-
-    void * ptr_A;
-    void * ptr_B;
-    void * ptr_C;
-    void * ptr_D;
-    void * ptr_Aux;
-
-    void * ptr_Vector;
-    typename LayoutC::Stride::Index ldr;
-
-    int64_t batch_stride_A;
-    int64_t batch_stride_B;
-    int64_t batch_stride_C;
-    int64_t batch_stride_Vector;
-
-    //
-    // Host dispatch API
-    //
-
-    /// Default constructor
-    Params() = default;
-
-    /// Constructor
-    Params(
-      Arguments const &args,  /// GEMM application arguments
-      int device_sms,         /// Number of SMs on the device
-      int sm_occupancy)       /// Kernel SM occupancy (in thread blocks)
-    :
-      ParamsBase(args, device_sms, sm_occupancy),
-      params_A(args.lda),
-      params_B(args.ldb),
-      params_C(args.ldc),
-      params_D(args.ldd),
-      params_Aux(args.ldaux),
-      output_op(args.epilogue),
-      ptr_A(const_cast<void *>(args.ptr_A)),
-      ptr_B(const_cast<void *>(args.ptr_B)),
-      ptr_C(const_cast<void *>(args.ptr_C)),
-      ptr_D(args.ptr_D),
-      ptr_Aux(args.ptr_Aux),
-      ptr_Vector(args.ptr_Vector),
-      ldr(args.ldr),
-      batch_stride_A(args.batch_stride_A),
-      batch_stride_B(args.batch_stride_B),
-      batch_stride_C(args.batch_stride_C),
-      batch_stride_Vector(args.batch_stride_Vector)
-    {
-
-    }
-
-    /// Lightweight update given a subset of arguments.
-    CUTLASS_HOST_DEVICE
-    void update(Arguments const &args)
-    {
-      ptr_A = const_cast<void *>(args.ptr_A);
-      ptr_B = const_cast<void *>(args.ptr_B);
-      ptr_C = const_cast<void *>(args.ptr_C);
-      ptr_D = args.ptr_D;
-      ptr_Aux = args.ptr_Aux;
-
-      ptr_Vector = args.ptr_Vector;
-      ldr = args.ldr;
-
-      batch_stride_A = args.batch_stride_A;
-      batch_stride_B = args.batch_stride_B;
-      batch_stride_C = args.batch_stride_C;
-      this->batch_stride_D = args.batch_stride_D;
-      batch_stride_Vector = args.batch_stride_Vector;
-
-      output_op = args.epilogue;
-    }
-  };
-
-
-  /// Shared memory storage structure
-  union SharedStorage {
-    typename Mma::SharedStorage main_loop;
-    typename Epilogue::SharedStorage epilogue;
-  };
-
-public:
-
-  //
-  // Host dispatch API
-  //
-
-  /// Determines whether kernel satisfies alignment
-  static Status can_implement(
-    cutlass::gemm::GemmCoord const & problem_size) {
-
-    static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
-    static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
-    static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
-
-    bool isAMisaligned = false;
-    bool isBMisaligned = false;
-    bool isCMisaligned = false;
-
-    if (platform::is_same<LayoutA, layout::RowMajor>::value) {
-      isAMisaligned = problem_size.k() % kAlignmentA;
-    } else if (platform::is_same<LayoutA, layout::ColumnMajor>::value) {
-      isAMisaligned = problem_size.m() % kAlignmentA;
-    } else if (platform::is_same<LayoutA, layout::ColumnMajorInterleaved<32>>::value
-            || platform::is_same<LayoutA, layout::ColumnMajorInterleaved<64>>::value) {
-      isAMisaligned = problem_size.k() % kAlignmentA;
-    }
-
-    if (platform::is_same<LayoutB, layout::RowMajor>::value) {
-      isBMisaligned = problem_size.n() % kAlignmentB;
-    } else if (platform::is_same<LayoutB, layout::ColumnMajor>::value) {
-      isBMisaligned = problem_size.k() % kAlignmentB;
-    } else if (platform::is_same<LayoutB, layout::RowMajorInterleaved<32>>::value
-            || platform::is_same<LayoutB, layout::RowMajorInterleaved<64>>::value) {
-      isBMisaligned = problem_size.k() % kAlignmentB;
-    }
-
-    if (platform::is_same<LayoutC, layout::RowMajor>::value) {
-      isCMisaligned = problem_size.n() % kAlignmentC;
-    } else if (platform::is_same<LayoutC, layout::ColumnMajor>::value) {
-      isCMisaligned = problem_size.m() % kAlignmentC;
-    } else if (platform::is_same<LayoutC, layout::ColumnMajorInterleaved<32>>::value
-            || platform::is_same<LayoutC, layout::ColumnMajorInterleaved<64>>::value) {
-      isCMisaligned = problem_size.n() % kAlignmentC;
-    }
-
-    if (isAMisaligned) {
-      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for A operand");
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (isBMisaligned) {
-      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for B operand");
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (isCMisaligned) {
-      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for C operand");
-      return Status::kErrorMisalignedOperand;
-    }
-
-    CUTLASS_TRACE_HOST("  returning kSuccess");
-
-    return Status::kSuccess;
-  }
-
-  static Status can_implement(Arguments const &args) {
-    return can_implement(args.problem_size);
-  }
-
-public:
-
-  //
-  // Device-only API
-  //
-
-  // Factory invocation
-  CUTLASS_DEVICE
-  static void invoke(
-    Params const &params,
-    SharedStorage &shared_storage)
-  {
-    GemmWithAbsMax op;
-    op(params, shared_storage);
-  }
-
-  /// Executes one GEMM
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-
-    // Compute threadblock location
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    // Early exit if CTA is out of range
-    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
-      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
-
-      return;
-    }
-
-    int offset_k = 0;
-    int problem_size_k = params.problem_size.k();
-
-    ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A);
-    ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);
-
-    //
-    // Fetch pointers based on mode.
-    //
-    if (params.mode == GemmUniversalMode::kGemm ||
-      params.mode == GemmUniversalMode::kGemmSplitKParallel) {
-
-      if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {
-
-        problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size;
-      }
-
-      offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
-    }
-    else if (params.mode == GemmUniversalMode::kBatched) {
-      ptr_A += threadblock_tile_offset.k() * params.batch_stride_A;
-      ptr_B += threadblock_tile_offset.k() * params.batch_stride_B;
-    }
-    else if (params.mode == GemmUniversalMode::kArray) {
-      ptr_A = static_cast<ElementA * const *>(params.ptr_A)[threadblock_tile_offset.k()];
-      ptr_B = static_cast<ElementB * const *>(params.ptr_B)[threadblock_tile_offset.k()];
-    }
-
-    __syncthreads();
-
-    // Compute initial location in logical coordinates
-    cutlass::MatrixCoord tb_offset_A{
-      threadblock_tile_offset.m() * Mma::Shape::kM,
-      offset_k,
-    };
-
-    cutlass::MatrixCoord tb_offset_B{
-      offset_k,
-      threadblock_tile_offset.n() * Mma::Shape::kN
-    };
-
-    // Compute position within threadblock
-    int thread_idx = threadIdx.x;
-
-    // Construct iterators to A and B operands
-    typename Mma::IteratorA iterator_A(
-      params.params_A,
-      ptr_A,
-      {params.problem_size.m(), problem_size_k},
-      thread_idx,
-      tb_offset_A);
-
-    typename Mma::IteratorB iterator_B(
-      params.params_B,
-      ptr_B,
-      {problem_size_k, params.problem_size.n()},
-      thread_idx,
-      tb_offset_B);
-
-    // Broadcast the warp_id computed by lane 0 to ensure dependent code
-    // is compiled as warp-uniform.
-    int warp_idx = canonical_warp_idx_sync();
-
-    int lane_idx = threadIdx.x % 32;
-
-    //
-    // Main loop
-    //
-
-    // Construct thread-scoped matrix multiply
-    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
-
-    typename Mma::FragmentC accumulators;
-
-    accumulators.clear();
-
-    // Compute threadblock-scoped matrix multiply-add
-    int gemm_k_iterations = (problem_size_k - offset_k + Mma::Shape::kK - 1) / Mma::Shape::kK;
-
-    // Compute threadblock-scoped matrix multiply-add
-    mma(
-      gemm_k_iterations,
-      accumulators,
-      iterator_A,
-      iterator_B,
-      accumulators);
-
-    //
-    // Epilogue
-    //
-
-    EpilogueOutputOp output_op(params.output_op);
-
-    //
-    // Masked tile iterators constructed from members
-    //
-
-    threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    //assume identity swizzle
-    MatrixCoord threadblock_offset(
-      threadblock_tile_offset.m() * Mma::Shape::kM,
-      threadblock_tile_offset.n() * Mma::Shape::kN
-    );
-
-    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
-
-    ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C);
-    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
-    typename Epilogue::ElementAuxOutput *ptr_Aux = static_cast<typename Epilogue::ElementAuxOutput *>(params.ptr_Aux);
-    typename Epilogue::ElementVector *ptr_Vector = static_cast<typename Epilogue::ElementVector *>(params.ptr_Vector);
-
-    //
-    // Fetch pointers based on mode.
-    //
-
-    //
-    // Special path when split-K not enabled.
-    //
-
-    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() == 1) {
-
-      // Tile iterators loading from source tensors.
-      typename Epilogue::OutputTileIterator iterator_C(
-        params.params_C,
-        ptr_C,
-        params.problem_size.mn(),
-        thread_idx,
-        threadblock_offset
-      );
-
-      // Tile iterator writing to destination tensor.
-      typename Epilogue::OutputTileIterator iterator_D(
-        params.params_D,
-        ptr_D,
-        params.problem_size.mn(),
-        thread_idx,
-        threadblock_offset
-      );
-
-      // Tile iterator writing to auxiliary tensor.
-      typename Epilogue::AuxOutputTileIterator iterator_Aux(
-        params.params_Aux,
-        ptr_Aux,
-        params.problem_size.mn(),
-        thread_idx,
-        threadblock_offset
-      );
-
-      // Construct the epilogue
-      Epilogue epilogue(
-        shared_storage.epilogue,
-        thread_idx,
-        warp_idx,
-        lane_idx);
-
-      // Move to appropriate location for this output tile
-      if (ptr_Vector) {
-        ptr_Vector += threadblock_offset.column() + threadblock_tile_offset.m() * params.ldr;
-      }
-
-      // Execute the epilogue operator to update the destination tensor.
-      epilogue(output_op,
-               ptr_Vector,
-               iterator_D,
-               accumulators,
-               iterator_C,
-               iterator_Aux,
-               params.problem_size.mn(),
-               threadblock_offset);
-
-      return;
-    }
-
-    //
-    // Slower path when split-K or batching is needed
-    //
-
-    // Construct the semaphore.
-    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
-
-    if (params.mode == GemmUniversalMode::kGemm) {
-
-      // If performing a reduction via split-K, fetch the initial synchronization
-      if (params.grid_tiled_shape.k() > 1) {
-
-        // Fetch the synchronization lock initially but do not block.
-        semaphore.fetch();
-
-        // Indicate which position in a serial reduction the output operator is currently updating
-        output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
-      }
-    }
-    else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) {
-      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
-    }
-    else if (params.mode == GemmUniversalMode::kBatched) {
-      ptr_C += threadblock_tile_offset.k() * params.batch_stride_C;
-      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
-      if (ptr_Aux) {
-        ptr_Aux += threadblock_tile_offset.k() * params.batch_stride_D;
-      }
-      if (ptr_Vector) {
-        ptr_Vector += threadblock_tile_offset.k() * params.batch_stride_Vector;
-      }
-    }
-    else if (params.mode == GemmUniversalMode::kArray) {
-      ptr_C = static_cast<ElementC * const *>(params.ptr_C)[threadblock_tile_offset.k()];
-      ptr_D = static_cast<ElementC * const *>(params.ptr_D)[threadblock_tile_offset.k()];
-      if (ptr_Aux) {
-        ptr_Aux = static_cast<typename Epilogue::ElementAuxOutput * const *>(params.ptr_Aux)[threadblock_tile_offset.k()];
-      }
-      if (ptr_Vector) {
-        ptr_Vector = static_cast<typename Epilogue::ElementVector * const *>(params.ptr_Vector)[threadblock_tile_offset.k()];
-      }
-    }
-
-    // Tile iterators loading from source tensors.
-    typename Epilogue::OutputTileIterator iterator_C(
-      params.params_C,
-      ptr_C,
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset
-    );
-
-    // Tile iterator writing to destination tensor.
-    typename Epilogue::OutputTileIterator iterator_D(
-      params.params_D,
-      ptr_D,
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset
-    );
-
-    // Tile iterator writing to auxiliary destination tensor.
-    typename Epilogue::AuxOutputTileIterator iterator_Aux(
-      params.params_Aux,
-      // Only the final block writes the auxiliary tensor
-      ((params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) &&
-          (params.grid_tiled_shape.k() != threadblock_tile_offset.k() + 1))
-          ? nullptr
-          : ptr_Aux,
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset
-    );
-
-    // Construct the epilogue
-    Epilogue epilogue(
-      shared_storage.epilogue,
-      thread_idx,
-      warp_idx,
-      lane_idx);
-
-    // Wait on the semaphore - this latency may have been covered by iterator construction
-    if ((params.mode == GemmUniversalMode::kGemm) && params.grid_tiled_shape.k() > 1) {
-
-      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
-      if (threadblock_tile_offset.k()) {
-        iterator_C = iterator_D;
-      }
-
-      semaphore.wait(threadblock_tile_offset.k());
-
-    }
-
-    // Move to appropriate location for this output tile
-    if (ptr_Vector) {
-      ptr_Vector += threadblock_offset.column() + threadblock_tile_offset.m() * params.ldr;
-    }
-
-    // Execute the epilogue operator to update the destination tensor.
-    epilogue(output_op,
-             // Only the final block uses Vector
-             ((params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) &&
-              (params.grid_tiled_shape.k() != threadblock_tile_offset.k() + 1))
-                 ? nullptr
-                 : ptr_Vector,
-             iterator_D,
-             accumulators,
-             iterator_C,
-             iterator_Aux,
-             params.problem_size.mn(),
-             threadblock_offset);
-
-    //
-    // Release the semaphore
-    //
-
-    if ((params.mode == GemmUniversalMode::kGemm)  && params.grid_tiled_shape.k() > 1) {
-
-      int lock = 0;
-      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
-
-        // The final threadblock resets the semaphore for subsequent grids.
-        lock = 0;
-      }
-      else {
-        // Otherwise, the semaphore is incremented
-        lock = threadblock_tile_offset.k() + 1;
-      }
-
-      semaphore.release(lock);
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_with_fused_epilogue.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_with_fused_epilogue.h
deleted file mode 100755
index 363d109ce..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_with_fused_epilogue.h
+++ /dev/null
@@ -1,1512 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Gemm kernel with fused reduction operation.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/layout/layout.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/complex.h"
-#include "cutlass/semaphore.h"
-#include "cutlass/gemm/kernel/params_universal_base.h"
-#include "cutlass/subbyte_reference.h"
-#include "cutlass/trace.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate
-  typename Epilogue_,             ///! Epilogue
-  typename ThreadblockSwizzle_,   ///! Threadblock swizzling function
-  bool IsSingleSource = Epilogue_::kIsSingleSource
->
-struct GemmWithFusedEpilogue;
-
-// GemmWithFusedEpilogue with two sources
-template <
-  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate
-  typename Epilogue_,             ///! Epilogue
-  typename ThreadblockSwizzle_    ///! Threadblock swizzling function
->
-struct GemmWithFusedEpilogue<Mma_, Epilogue_, ThreadblockSwizzle_, false> {
-public:
-
-  using Mma = Mma_;
-  using Epilogue = Epilogue_;
-  using EpilogueOutputOp = typename Epilogue::OutputOp;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-
-  using ElementA = typename Mma::IteratorA::Element;
-  using LayoutA = typename Mma::IteratorA::Layout;
-  using ElementB = typename Mma::IteratorB::Element;
-  using LayoutB = typename Mma::IteratorB::Layout;
-  using ElementC = typename Epilogue::OutputTileIterator::Element;
-  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
-
-  static ComplexTransform const kTransformA = Mma::kTransformA;
-  static ComplexTransform const kTransformB = Mma::kTransformB;
-  using Operator = typename Mma::Operator;
-
-  using OperatorClass = typename Mma::Operator::OperatorClass;
-  using ThreadblockShape = typename Mma::Shape;
-  using WarpShape = typename Mma::Operator::Shape;
-  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
-  using ArchTag = typename Mma::ArchTag;
-
-  static int const kStages = Mma::kStages;
-  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
-  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
-  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
-
-  /// Warp count (concept: GemmShape)
-  using WarpCount = typename Mma::WarpCount;
-  static int const kThreadCount = 32 * WarpCount::kCount;
-
-  /// Split-K preserves splits that are 128b aligned
-  static int const kSplitKAlignment = const_max(
-    128 / sizeof_bits<ElementA>::value,
-    128 / sizeof_bits<ElementB>::value
-  );
-
-  //
-  // Structures
-  //
-
-  /// Argument structure
-  struct Arguments : UniversalArgumentsBase{
-
-    //
-    // Data members
-    //
-
-    typename EpilogueOutputOp::Params epilogue;
-
-    void const * ptr_A;
-    void const * ptr_B;
-    void const * ptr_C1;
-    void const * ptr_C2;
-    void * ptr_D;
-
-    void * ptr_Vector;
-    void * ptr_Tensor;
-
-    int64_t batch_stride_A;
-    int64_t batch_stride_B;
-    int64_t batch_stride_C1;
-    int64_t batch_stride_C2;
-    int64_t batch_stride_Vector;
-    int64_t batch_stride_Tensor;
-
-    typename LayoutA::Stride::Index lda;
-    typename LayoutB::Stride::Index ldb;
-    typename LayoutC::Stride::Index ldc1;
-    typename LayoutC::Stride::Index ldc2;
-    typename LayoutC::Stride::Index ldd;
-    typename LayoutC::Stride::Index ldr;
-    typename LayoutC::Stride::Index ldt;
-
-    //
-    // Methods
-    //
-
-    Arguments():
-      ptr_A(nullptr),
-      ptr_B(nullptr),
-      ptr_C1(nullptr),
-      ptr_C2(nullptr),
-      ptr_D(nullptr)
-    {}
-
-    /// constructs an arguments structure
-    Arguments(
-      GemmUniversalMode mode,
-      GemmCoord problem_size,
-      int batch_count,
-      typename EpilogueOutputOp::Params epilogue,
-      void const * ptr_A,
-      void const * ptr_B,
-      void const * ptr_C1,
-      void const * ptr_C2,
-      void * ptr_D,
-      void * ptr_Vector,
-      void * ptr_Tensor,
-      int64_t batch_stride_A,
-      int64_t batch_stride_B,
-      int64_t batch_stride_C1,
-      int64_t batch_stride_C2,
-      int64_t batch_stride_D,
-      int64_t batch_stride_Vector,
-      int64_t batch_stride_Tensor,
-      typename LayoutA::Stride::Index lda,
-      typename LayoutB::Stride::Index ldb,
-      typename LayoutC::Stride::Index ldc1,
-      typename LayoutC::Stride::Index ldc2,
-      typename LayoutC::Stride::Index ldd,
-      typename LayoutC::Stride::Index ldr,
-      typename LayoutC::Stride::Index ldt)
-    :
-      UniversalArgumentsBase(mode, problem_size, batch_count, batch_stride_D),
-      epilogue(epilogue),
-      ptr_A(ptr_A), ptr_B(ptr_B), ptr_C1(ptr_C1), ptr_C2(ptr_C2), ptr_D(ptr_D),
-      ptr_Vector(ptr_Vector),
-      ptr_Tensor(ptr_Tensor),
-      batch_stride_A(batch_stride_A),
-      batch_stride_B(batch_stride_B),
-      batch_stride_C1(batch_stride_C1),
-      batch_stride_C2(batch_stride_C2),
-      batch_stride_Vector(batch_stride_Vector),
-      batch_stride_Tensor(batch_stride_Tensor),
-      lda(lda), ldb(ldb), ldc1(ldc1), ldc2(ldc2), ldd(ldd), ldr(ldr), ldt(ldt)
-    {
-      CUTLASS_TRACE_HOST("GemmWithFusedEpilogue::Arguments::Arguments() - problem_size: " << problem_size);
-      CUTLASS_TRACE_HOST("  ptr_Vector: " << (void *)this->ptr_Vector);
-      CUTLASS_TRACE_HOST("  ptr_Tensor: " << (void *)this->ptr_Tensor);
-      CUTLASS_TRACE_HOST("  ldr: " << this->ldr);
-      CUTLASS_TRACE_HOST("  ldt: " << this->ldt);
-    }
-
-    /// Returns arguments for the transposed problem
-    Arguments transposed_problem() const {
-      Arguments args(*this);
-
-      std::swap(args.problem_size.m(), args.problem_size.n());
-      std::swap(args.ptr_A, args.ptr_B);
-      std::swap(args.lda, args.ldb);
-      std::swap(args.batch_stride_A, args.batch_stride_B);
-
-      return args;
-    }
-  };
-
-
-  //
-  // Structure for precomputing values in host memory and passing to kernels
-  //
-
-  /// Parameters structure
-  struct Params : UniversalParamsBase<
-    ThreadblockSwizzle,
-    ThreadblockShape,
-    ElementA,
-    ElementB,
-    ElementC,
-    LayoutA,
-    LayoutB>
-  {
-    using ParamsBase = UniversalParamsBase<
-      ThreadblockSwizzle,
-      ThreadblockShape,
-      ElementA,
-      ElementB,
-      ElementC,
-      LayoutA,
-      LayoutB>;
-
-    //
-    // Data members
-    //
-
-    typename Mma::IteratorA::Params params_A;
-    typename Mma::IteratorB::Params params_B;
-    typename Epilogue::OutputTileIterator::Params params_C1;
-    typename Epilogue::OutputTileIterator::Params params_C2;
-    typename Epilogue::OutputTileIterator::Params params_D;
-    typename Epilogue::TensorTileIterator::Params params_Tensor;
-    typename EpilogueOutputOp::Params output_op;
-
-    void * ptr_A;
-    void * ptr_B;
-    void * ptr_C1;
-    void * ptr_C2;
-    void * ptr_D;
-
-    void * ptr_Vector;
-    typename LayoutC::Stride::Index ldr;
-
-    void * ptr_Tensor;
-
-    int64_t batch_stride_A;
-    int64_t batch_stride_B;
-    int64_t batch_stride_C1;
-    int64_t batch_stride_C2;
-    int64_t batch_stride_Vector;
-    int64_t batch_stride_Tensor;
-
-    //
-    // Host dispatch API
-    //
-
-    /// Default constructor
-    Params() = default;
-
-    /// Constructor
-    Params(
-      Arguments const &args,  /// GEMM application arguments
-      int device_sms,         /// Number of SMs on the device
-      int sm_occupancy)       /// Kernel SM occupancy (in thread blocks)
-    :
-      ParamsBase(args, device_sms, sm_occupancy),
-      params_A(args.lda),
-      params_B(args.ldb),
-      params_C1(args.ldc1),
-      params_C2(args.ldc2),
-      params_D(args.ldd),
-      params_Tensor(args.ldt),
-      output_op(args.epilogue),
-      ptr_A(const_cast<void *>(args.ptr_A)),
-      ptr_B(const_cast<void *>(args.ptr_B)),
-      ptr_C1(const_cast<void *>(args.ptr_C1)),
-      ptr_C2(const_cast<void *>(args.ptr_C2)),
-      ptr_D(args.ptr_D),
-      ptr_Vector(args.ptr_Vector),
-      ldr(args.ldr),
-      ptr_Tensor(args.ptr_Tensor),
-      batch_stride_A(args.batch_stride_A),
-      batch_stride_B(args.batch_stride_B),
-      batch_stride_C1(args.batch_stride_C1),
-      batch_stride_C2(args.batch_stride_C2),
-      batch_stride_Vector(args.batch_stride_Vector),
-      batch_stride_Tensor(args.batch_stride_Tensor)
-    {
-      CUTLASS_TRACE_HOST("GemmWithFusedEpilogue::Params::Params()");
-      CUTLASS_TRACE_HOST("  ptr_Vector: " << (void *)this->ptr_Vector);
-      CUTLASS_TRACE_HOST("  ptr_Tensor: " << (void *)this->ptr_Tensor);
-      CUTLASS_TRACE_HOST("  ldr: " << this->ldr);
-      CUTLASS_TRACE_HOST("  ldt: " << args.ldt);
-    }
-
-    /// Lightweight update given a subset of arguments.
-    CUTLASS_HOST_DEVICE
-    void update(Arguments const &args)
-    {
-      ptr_A = const_cast<void *>(args.ptr_A);
-      ptr_B = const_cast<void *>(args.ptr_B);
-      ptr_C1 = const_cast<void *>(args.ptr_C1);
-      ptr_C2 = const_cast<void *>(args.ptr_C2);
-      ptr_D = args.ptr_D;
-
-      ptr_Vector = args.ptr_Vector;
-      ldr = args.ldr;
-      ptr_Tensor = args.ptr_Tensor;
-
-      batch_stride_A = args.batch_stride_A;
-      batch_stride_B = args.batch_stride_B;
-      batch_stride_C1 = args.batch_stride_C1;
-      batch_stride_C2 = args.batch_stride_C2;
-      batch_stride_Vector = args.batch_stride_Vector;
-      batch_stride_Tensor = args.batch_stride_Tensor;
-      this->batch_stride_D = args.batch_stride_D;
-
-      output_op = args.epilogue;
-
-      CUTLASS_TRACE_HOST("GemmWithFusedEpilogue::Params::update()");
-      CUTLASS_TRACE_HOST("  ptr_Vector: " << (void *)this->ptr_Vector);
-      CUTLASS_TRACE_HOST("  ptr_Tensor: " << (void *)this->ptr_Tensor);
-      CUTLASS_TRACE_HOST("  ldr: " << this->ldr);
-    }
-  };
-
-
-  /// Shared memory storage structure
-  union SharedStorage {
-    typename Mma::SharedStorage main_loop;
-    typename Epilogue::SharedStorage epilogue;
-  };
-
-public:
-
-  //
-  // Host dispatch API
-  //
-
-  /// Determines whether kernel satisfies alignment
-  static Status can_implement(
-    cutlass::gemm::GemmCoord const & problem_size) {
-
-    CUTLASS_TRACE_HOST("GemmWithFusedEpilogue::can_implement()");
-
-    static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
-    static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
-    static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
-
-    bool isAMisaligned = false;
-    bool isBMisaligned = false;
-    bool isCMisaligned = false;
-
-    if (platform::is_same<LayoutA, layout::RowMajor>::value) {
-      isAMisaligned = problem_size.k() % kAlignmentA;
-    } else if (platform::is_same<LayoutA, layout::ColumnMajor>::value) {
-      isAMisaligned = problem_size.m() % kAlignmentA;
-    } else if (platform::is_same<LayoutA, layout::ColumnMajorInterleaved<32>>::value
-            || platform::is_same<LayoutA, layout::ColumnMajorInterleaved<64>>::value) {
-      isAMisaligned = problem_size.k() % kAlignmentA;
-    }
-
-    if (platform::is_same<LayoutB, layout::RowMajor>::value) {
-      isBMisaligned = problem_size.n() % kAlignmentB;
-    } else if (platform::is_same<LayoutB, layout::ColumnMajor>::value) {
-      isBMisaligned = problem_size.k() % kAlignmentB;
-    } else if (platform::is_same<LayoutB, layout::RowMajorInterleaved<32>>::value
-            || platform::is_same<LayoutB, layout::RowMajorInterleaved<64>>::value) {
-      isBMisaligned = problem_size.k() % kAlignmentB;
-    }
-
-    if (platform::is_same<LayoutC, layout::RowMajor>::value) {
-      isCMisaligned = problem_size.n() % kAlignmentC;
-    } else if (platform::is_same<LayoutC, layout::ColumnMajor>::value) {
-      isCMisaligned = problem_size.m() % kAlignmentC;
-    } else if (platform::is_same<LayoutC, layout::ColumnMajorInterleaved<32>>::value
-            || platform::is_same<LayoutC, layout::ColumnMajorInterleaved<64>>::value) {
-      isCMisaligned = problem_size.n() % kAlignmentC;
-    }
-
-    if (isAMisaligned) {
-      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for A operand");
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (isBMisaligned) {
-      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for B operand");
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (isCMisaligned) {
-      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for C operand");
-      return Status::kErrorMisalignedOperand;
-    }
-
-    CUTLASS_TRACE_HOST("  returning kSuccess");
-
-    return Status::kSuccess;
-  }
-
-  static Status can_implement(Arguments const &args) {
-    return can_implement(args.problem_size);
-  }
-
-public:
-
-  //
-  // Device-only API
-  //
-
-  // Factory invocation
-  CUTLASS_DEVICE
-  static void invoke(
-    Params const &params,
-    SharedStorage &shared_storage)
-  {
-    GemmWithFusedEpilogue op;
-    op(params, shared_storage);
-  }
-
-  #define SPLIT_K_ENABLED 1
-
-  /// Executes one GEMM
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-
-    // Compute threadblock location
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    // Early exit if CTA is out of range
-    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
-      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
-
-      return;
-    }
-
-    int offset_k = 0;
-    int problem_size_k = params.problem_size.k();
-
-    ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A);
-    ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);
-
-
-    #if SPLIT_K_ENABLED
-    //
-    // Fetch pointers based on mode.
-    //
-    if (params.mode == GemmUniversalMode::kGemm ||
-      params.mode == GemmUniversalMode::kGemmSplitKParallel) {
-
-      if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {
-
-        problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size;
-      }
-
-      offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
-    }
-    else if (params.mode == GemmUniversalMode::kBatched) {
-      ptr_A += threadblock_tile_offset.k() * params.batch_stride_A;
-      ptr_B += threadblock_tile_offset.k() * params.batch_stride_B;
-    }
-    else if (params.mode == GemmUniversalMode::kArray) {
-      ptr_A = static_cast<ElementA * const *>(params.ptr_A)[threadblock_tile_offset.k()];
-      ptr_B = static_cast<ElementB * const *>(params.ptr_B)[threadblock_tile_offset.k()];
-    }
-    #endif
-
-    // Compute initial location in logical coordinates
-    cutlass::MatrixCoord tb_offset_A{
-      threadblock_tile_offset.m() * Mma::Shape::kM,
-      offset_k,
-    };
-
-    cutlass::MatrixCoord tb_offset_B{
-      offset_k,
-      threadblock_tile_offset.n() * Mma::Shape::kN
-    };
-
-    // Compute position within threadblock
-    int thread_idx = threadIdx.x;
-
-    // Construct iterators to A and B operands
-    typename Mma::IteratorA iterator_A(
-      params.params_A,
-      ptr_A,
-      {params.problem_size.m(), problem_size_k},
-      thread_idx,
-      tb_offset_A);
-
-    typename Mma::IteratorB iterator_B(
-      params.params_B,
-      ptr_B,
-      {problem_size_k, params.problem_size.n()},
-      thread_idx,
-      tb_offset_B);
-
-    // Broadcast the warp_id computed by lane 0 to ensure dependent code
-    // is compiled as warp-uniform.
-    int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
-
-    int lane_idx = threadIdx.x % 32;
-
-    //
-    // Main loop
-    //
-
-    // Construct thread-scoped matrix multiply
-    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
-
-    typename Mma::FragmentC accumulators;
-
-    accumulators.clear();
-
-    // Compute threadblock-scoped matrix multiply-add
-    int gemm_k_iterations = (problem_size_k - offset_k + Mma::Shape::kK - 1) / Mma::Shape::kK;
-
-    // Compute threadblock-scoped matrix multiply-add
-    mma(
-      gemm_k_iterations,
-      accumulators,
-      iterator_A,
-      iterator_B,
-      accumulators);
-
-    //
-    // Epilogue
-    //
-
-    EpilogueOutputOp output_op(params.output_op);
-
-    //
-    // Masked tile iterators constructed from members
-    //
-
-    threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    //assume identity swizzle
-    MatrixCoord threadblock_offset(
-      threadblock_tile_offset.m() * Mma::Shape::kM,
-      threadblock_tile_offset.n() * Mma::Shape::kN
-    );
-
-    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
-
-    ElementC *ptr_C1 = static_cast<ElementC *>(params.ptr_C1);
-    ElementC *ptr_C2 = static_cast<ElementC *>(params.ptr_C2);
-    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
-    typename Epilogue::ElementTensor *ptr_Tensor = static_cast<typename Epilogue::ElementTensor *>(params.ptr_Tensor);
-
-    // Define the reduction output pointer and move to the appropriate place
-    typename Epilogue::ElementVector *ptr_Vector =
-      static_cast<typename Epilogue::ElementVector *>(params.ptr_Vector);
-
-    //
-    // Fetch pointers based on mode.
-    //
-
-    //
-    // Special path when split-K not enabled.
-    //
-
-    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() == 1) {
-
-      // Tile iterators loading from source tensors.
-      typename Epilogue::OutputTileIterator iterator_C1(
-        params.params_C1,
-        ptr_C1,
-        params.problem_size.mn(),
-        thread_idx,
-        threadblock_offset
-      );
-
-      typename Epilogue::OutputTileIterator iterator_C2(
-        params.params_C2,
-        ptr_C2,
-        params.problem_size.mn(),
-        thread_idx,
-        threadblock_offset
-      );
-
-      // Tile iterator writing to destination tensor.
-      typename Epilogue::OutputTileIterator iterator_D(
-        params.params_D,
-        ptr_D,
-        params.problem_size.mn(),
-        thread_idx,
-        threadblock_offset
-      );
-
-      // Additional tensor to load from
-      typename Epilogue::TensorTileIterator tensor_iterator(
-          params.params_Tensor,
-          // Only the final block outputs Tensor
-          ptr_Tensor,
-          params.problem_size.mn(),
-          thread_idx,
-          threadblock_offset);
-
-      // Construct the epilogue
-      Epilogue epilogue(
-        shared_storage.epilogue,
-        thread_idx,
-        warp_idx,
-        lane_idx);
-
-      // Move to appropriate location for this output tile
-      if (ptr_Vector) {
-        ptr_Vector += threadblock_offset.column() + threadblock_tile_offset.m() * params.ldr;
-      }
-
-      // Execute the epilogue operator to update the destination tensor.
-      epilogue(output_op,
-               ptr_Vector,
-               iterator_D,
-               accumulators,
-               iterator_C1,
-               iterator_C2,
-               tensor_iterator,
-               params.problem_size.mn(),
-               threadblock_offset);
-
-      return;
-    }
-
-    //
-    // Slower path when split-K or batching is needed
-    //
-
-
-    #if SPLIT_K_ENABLED
-    // Construct the semaphore.
-    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
-
-    if (params.mode == GemmUniversalMode::kGemm) {
-
-      // If performing a reduction via split-K, fetch the initial synchronization
-      if (params.grid_tiled_shape.k() > 1) {
-
-        // Fetch the synchronization lock initially but do not block.
-        semaphore.fetch();
-
-        // Indicate which position in a serial reduction the output operator is currently updating
-        output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
-      }
-    }
-    else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) {
-      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
-    }
-    else if (params.mode == GemmUniversalMode::kBatched) {
-      ptr_C1 += threadblock_tile_offset.k() * params.batch_stride_C1;
-      if (ptr_C2) {
-        ptr_C2 += threadblock_tile_offset.k() * params.batch_stride_C2;
-      }
-      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
-      if (ptr_Tensor) {
-        ptr_Tensor = ReferenceFactory<typename Epilogue::ElementTensor>::add_pointer_offset(
-          ptr_Tensor,
-          threadblock_tile_offset.k() * params.batch_stride_Tensor);
-      }
-      if (ptr_Vector) {
-        ptr_Vector += threadblock_tile_offset.k() * params.batch_stride_Vector;
-      }
-    }
-    else if (params.mode == GemmUniversalMode::kArray) {
-      ptr_C1 = static_cast<ElementC * const *>(params.ptr_C1)[threadblock_tile_offset.k()];
-      if (ptr_C2) {
-        ptr_C2 = static_cast<ElementC * const *>(params.ptr_C2)[threadblock_tile_offset.k()];
-      }
-      ptr_D = static_cast<ElementC * const *>(params.ptr_D)[threadblock_tile_offset.k()];
-      if (ptr_Tensor) {
-        ptr_Tensor = static_cast<typename Epilogue::ElementTensor * const *>(params.ptr_Tensor)[threadblock_tile_offset.k()];
-      }
-      if (ptr_Vector) {
-        ptr_Vector = static_cast<typename Epilogue::ElementVector * const *>(params.ptr_Vector)[threadblock_tile_offset.k()];
-      }
-    }
-    #endif
-
-    // Tile iterators loading from source tensors.
-    typename Epilogue::OutputTileIterator iterator_C1(
-      params.params_C1,
-      ptr_C1,
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset
-    );
-
-    typename Epilogue::OutputTileIterator iterator_C2(
-      params.params_C2,
-      ptr_C2,
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset
-    );
-
-    // Tile iterator writing to destination tensor.
-    typename Epilogue::OutputTileIterator iterator_D(
-      params.params_D,
-      ptr_D,
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset
-    );
-
-    // Additional tensor to load from
-    typename Epilogue::TensorTileIterator tensor_iterator(
-        params.params_Tensor,
-        // Only the final block outputs Tensor
-        ((params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) &&
-         (params.grid_tiled_shape.k() != threadblock_tile_offset.k() + 1))
-            ? nullptr
-            : ptr_Tensor,
-        params.problem_size.mn(),
-        thread_idx,
-        threadblock_offset);
-
-    // Construct the epilogue
-    Epilogue epilogue(
-      shared_storage.epilogue,
-      thread_idx,
-      warp_idx,
-      lane_idx);
-
-    #if SPLIT_K_ENABLED
-    // Wait on the semaphore - this latency may have been covered by iterator construction
-    if ((params.mode == GemmUniversalMode::kGemm) && params.grid_tiled_shape.k() > 1) {
-
-      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
-      if (threadblock_tile_offset.k()) {
-        iterator_C1 = iterator_D;
-      }
-
-      semaphore.wait(threadblock_tile_offset.k());
-
-    }
-    #endif
-
-    // Move to appropriate location for this output tile
-    if (ptr_Vector) {
-      ptr_Vector += threadblock_offset.column() + threadblock_tile_offset.m() * params.ldr;
-    }
-
-    // Execute the epilogue operator to update the destination tensor.
-    epilogue(output_op,
-             // Only the final block uses Vector
-             ((params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) &&
-              (params.grid_tiled_shape.k() != threadblock_tile_offset.k() + 1))
-                 ? nullptr
-                 : ptr_Vector,
-             iterator_D,
-             accumulators,
-             iterator_C1,
-             iterator_C2,
-             tensor_iterator,
-             params.problem_size.mn(),
-             threadblock_offset);
-
-    //
-    // Release the semaphore
-    //
-
-    #if SPLIT_K_ENABLED
-    if ((params.mode == GemmUniversalMode::kGemm)  && params.grid_tiled_shape.k() > 1) {
-
-      int lock = 0;
-      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
-
-        // The final threadblock resets the semaphore for subsequent grids.
-        lock = 0;
-      }
-      else {
-        // Otherwise, the semaphore is incremented
-        lock = threadblock_tile_offset.k() + 1;
-      }
-
-      semaphore.release(lock);
-    }
-    #endif
-  }
-};
-
-// GemmWithFusedEpilogue with one source
-template <
-  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate
-  typename Epilogue_,             ///! Epilogue
-  typename ThreadblockSwizzle_    ///! Threadblock swizzling function
->
-struct GemmWithFusedEpilogue<Mma_, Epilogue_, ThreadblockSwizzle_, true> {
-public:
-
-  using Mma = Mma_;
-  using Epilogue = Epilogue_;
-  using EpilogueOutputOp = typename Epilogue::OutputOp;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-
-  using ElementA = typename Mma::IteratorA::Element;
-  using LayoutA = typename Mma::IteratorA::Layout;
-  using ElementB = typename Mma::IteratorB::Element;
-  using LayoutB = typename Mma::IteratorB::Layout;
-  using ElementC = typename Epilogue::OutputTileIterator::Element;
-  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
-
-  static ComplexTransform const kTransformA = Mma::kTransformA;
-  static ComplexTransform const kTransformB = Mma::kTransformB;
-  using Operator = typename Mma::Operator;
-
-  using OperatorClass = typename Mma::Operator::OperatorClass;
-  using ThreadblockShape = typename Mma::Shape;
-  using WarpShape = typename Mma::Operator::Shape;
-  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
-  using ArchTag = typename Mma::ArchTag;
-
-  static int const kStages = Mma::kStages;
-  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
-  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
-  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
-
-  /// Warp count (concept: GemmShape)
-  using WarpCount = typename Mma::WarpCount;
-  static int const kThreadCount = 32 * WarpCount::kCount;
-
-  /// Split-K preserves splits that are 128b aligned
-  static int const kSplitKAlignment = const_max(
-    128 / sizeof_bits<ElementA>::value,
-    128 / sizeof_bits<ElementB>::value
-  );
-
-  //
-  // Structures
-  //
-
-  /// Argument structure
-  struct Arguments : UniversalArgumentsBase
-  {
-    //
-    // Data members
-    //
-
-    typename EpilogueOutputOp::Params epilogue;
-
-    void const * ptr_A;
-    void const * ptr_B;
-    void const * ptr_C;
-    void * ptr_D;
-
-    void * ptr_Vector;
-    void * ptr_Tensor;
-
-    int64_t batch_stride_A;
-    int64_t batch_stride_B;
-    int64_t batch_stride_C;
-    int64_t batch_stride_Vector;
-    int64_t batch_stride_Tensor;
-
-    typename LayoutA::Stride::Index lda;
-    typename LayoutB::Stride::Index ldb;
-    typename LayoutC::Stride::Index ldc;
-    typename LayoutC::Stride::Index ldd;
-    typename LayoutC::Stride::Index ldr;
-    typename LayoutC::Stride::Index ldt;
-
-    //
-    // Methods
-    //
-
-    Arguments():
-      ptr_A(nullptr),
-      ptr_B(nullptr),
-      ptr_C(nullptr),
-      ptr_D(nullptr)
-    {}
-
-    /// constructs an arguments structure
-    Arguments(
-      GemmUniversalMode mode,
-      GemmCoord problem_size,
-      int batch_count,
-      typename EpilogueOutputOp::Params epilogue,
-      void const * ptr_A,
-      void const * ptr_B,
-      void const * ptr_C,
-      void * ptr_D,
-      void * ptr_Vector,
-      void * ptr_Tensor,
-      int64_t batch_stride_A,
-      int64_t batch_stride_B,
-      int64_t batch_stride_C,
-      int64_t batch_stride_D,
-      int64_t batch_stride_Vector,
-      int64_t batch_stride_Tensor,
-      typename LayoutA::Stride::Index lda,
-      typename LayoutB::Stride::Index ldb,
-      typename LayoutC::Stride::Index ldc,
-      typename LayoutC::Stride::Index ldd,
-      typename LayoutC::Stride::Index ldr,
-      typename LayoutC::Stride::Index ldt)
-    :
-      UniversalArgumentsBase(mode, problem_size, batch_count, batch_stride_D),
-      epilogue(epilogue),
-      ptr_A(ptr_A), ptr_B(ptr_B), ptr_C(ptr_C), ptr_D(ptr_D),
-      ptr_Vector(ptr_Vector),
-      ptr_Tensor(ptr_Tensor),
-      batch_stride_A(batch_stride_A),
-      batch_stride_B(batch_stride_B),
-      batch_stride_C(batch_stride_C),
-      batch_stride_Vector(batch_stride_Vector),
-      batch_stride_Tensor(batch_stride_Tensor),
-      lda(lda), ldb(ldb), ldc(ldc), ldd(ldd), ldr(ldr), ldt(ldt)
-    {
-      CUTLASS_TRACE_HOST("GemmWithFusedEpilogue::Arguments::Arguments() - problem_size: " << problem_size);
-      CUTLASS_TRACE_HOST("  ptr_Vector: " << (void *)this->ptr_Vector);
-      CUTLASS_TRACE_HOST("  ptr_Tensor: " << (void *)this->ptr_Tensor);
-      CUTLASS_TRACE_HOST("  ldr: " << this->ldr);
-      CUTLASS_TRACE_HOST("  ldt: " << this->ldt);
-    }
-
-    /// Returns arguments for the transposed problem
-    Arguments transposed_problem() const {
-      Arguments args(*this);
-
-      std::swap(args.problem_size.m(), args.problem_size.n());
-      std::swap(args.ptr_A, args.ptr_B);
-      std::swap(args.lda, args.ldb);
-      std::swap(args.batch_stride_A, args.batch_stride_B);
-
-      return args;
-    }
-  };
-
-
-  //
-  // Structure for precomputing values in host memory and passing to kernels
-  //
-
-  /// Parameters structure
-  struct Params : UniversalParamsBase<
-    ThreadblockSwizzle,
-    ThreadblockShape,
-    ElementA,
-    ElementB,
-    ElementC,
-    LayoutA,
-    LayoutB>
-  {
-    using ParamsBase = UniversalParamsBase<
-      ThreadblockSwizzle,
-      ThreadblockShape,
-      ElementA,
-      ElementB,
-      ElementC,
-      LayoutA,
-      LayoutB>;
-
-    //
-    // Data members
-    //
-
-    typename Mma::IteratorA::Params params_A;
-    typename Mma::IteratorB::Params params_B;
-    typename Epilogue::OutputTileIterator::Params params_C;
-    typename Epilogue::OutputTileIterator::Params params_D;
-    typename Epilogue::TensorTileIterator::Params params_Tensor;
-
-    typename EpilogueOutputOp::Params output_op;
-
-    void * ptr_A;
-    void * ptr_B;
-    void * ptr_C;
-    void * ptr_D;
-
-    void * ptr_Vector;
-    typename LayoutC::Stride::Index ldr;
-
-    void * ptr_Tensor;
-
-    int64_t batch_stride_A;
-    int64_t batch_stride_B;
-    int64_t batch_stride_C;
-    int64_t batch_stride_Vector;
-    int64_t batch_stride_Tensor;
-
-    //
-    // Host dispatch API
-    //
-
-    /// Default constructor
-    Params() = default;
-
-    /// Constructor
-    Params(
-      Arguments const &args,  /// GEMM application arguments
-      int device_sms,         /// Number of SMs on the device
-      int sm_occupancy)       /// Kernel SM occupancy (in thread blocks)
-    :
-      ParamsBase(args, device_sms, sm_occupancy),
-      params_A(args.lda),
-      params_B(args.ldb),
-      params_C(args.ldc),
-      params_D(args.ldd),
-      params_Tensor(args.ldt),
-      output_op(args.epilogue),
-      ptr_A(const_cast<void *>(args.ptr_A)),
-      ptr_B(const_cast<void *>(args.ptr_B)),
-      ptr_C(const_cast<void *>(args.ptr_C)),
-      ptr_D(args.ptr_D),
-      ptr_Vector(args.ptr_Vector),
-      ldr(args.ldr),
-      ptr_Tensor(args.ptr_Tensor),
-      batch_stride_A(args.batch_stride_A),
-      batch_stride_B(args.batch_stride_B),
-      batch_stride_C(args.batch_stride_C),
-      batch_stride_Vector(args.batch_stride_Vector),
-      batch_stride_Tensor(args.batch_stride_Tensor)
-    {
-      CUTLASS_TRACE_HOST("GemmWithFusedEpilogue::Params::Params()");
-      CUTLASS_TRACE_HOST("  ptr_Vector: " << (void *)this->ptr_Vector);
-      CUTLASS_TRACE_HOST("  ptr_Tensor: " << (void *)this->ptr_Tensor);
-      CUTLASS_TRACE_HOST("  ldr: " << this->ldr);
-      CUTLASS_TRACE_HOST("  ldt: " << args.ldt);
-    }
-
-    /// Lightweight update given a subset of arguments.
-    CUTLASS_HOST_DEVICE
-    void update(Arguments const &args)
-    {
-      ptr_A = const_cast<void *>(args.ptr_A);
-      ptr_B = const_cast<void *>(args.ptr_B);
-      ptr_C = const_cast<void *>(args.ptr_C);
-      ptr_D = args.ptr_D;
-
-      ptr_Vector = args.ptr_Vector;
-      ldr = args.ldr;
-      ptr_Tensor = args.ptr_Tensor;
-
-      batch_stride_A = args.batch_stride_A;
-      batch_stride_B = args.batch_stride_B;
-      batch_stride_C = args.batch_stride_C;
-      batch_stride_Vector = args.batch_stride_Vector;
-      batch_stride_Tensor = args.batch_stride_Tensor;
-      this->batch_stride_D = args.batch_stride_D;
-
-      output_op = args.epilogue;
-
-      CUTLASS_TRACE_HOST("GemmWithFusedEpilogue::Params::update()");
-      CUTLASS_TRACE_HOST("  ptr_Vector: " << (void *)this->ptr_Vector);
-      CUTLASS_TRACE_HOST("  ptr_Tensor: " << (void *)this->ptr_Tensor);
-      CUTLASS_TRACE_HOST("  ldr: " << this->ldr);
-    }
-  };
-
-
-  /// Shared memory storage structure
-  union SharedStorage {
-    typename Mma::SharedStorage main_loop;
-    typename Epilogue::SharedStorage epilogue;
-  };
-
-public:
-
-  //
-  // Host dispatch API
-  //
-
-  /// Determines whether kernel satisfies alignment
-  static Status can_implement(
-    cutlass::gemm::GemmCoord const & problem_size) {
-
-    CUTLASS_TRACE_HOST("GemmWithFusedEpilogue::can_implement()");
-
-    static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
-    static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
-    static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
-
-    bool isAMisaligned = false;
-    bool isBMisaligned = false;
-    bool isCMisaligned = false;
-
-    if (platform::is_same<LayoutA, layout::RowMajor>::value) {
-      isAMisaligned = problem_size.k() % kAlignmentA;
-    } else if (platform::is_same<LayoutA, layout::ColumnMajor>::value) {
-      isAMisaligned = problem_size.m() % kAlignmentA;
-    } else if (platform::is_same<LayoutA, layout::ColumnMajorInterleaved<32>>::value
-            || platform::is_same<LayoutA, layout::ColumnMajorInterleaved<64>>::value) {
-      isAMisaligned = problem_size.k() % kAlignmentA;
-    }
-
-    if (platform::is_same<LayoutB, layout::RowMajor>::value) {
-      isBMisaligned = problem_size.n() % kAlignmentB;
-    } else if (platform::is_same<LayoutB, layout::ColumnMajor>::value) {
-      isBMisaligned = problem_size.k() % kAlignmentB;
-    } else if (platform::is_same<LayoutB, layout::RowMajorInterleaved<32>>::value
-            || platform::is_same<LayoutB, layout::RowMajorInterleaved<64>>::value) {
-      isBMisaligned = problem_size.k() % kAlignmentB;
-    }
-
-    if (platform::is_same<LayoutC, layout::RowMajor>::value) {
-      isCMisaligned = problem_size.n() % kAlignmentC;
-    } else if (platform::is_same<LayoutC, layout::ColumnMajor>::value) {
-      isCMisaligned = problem_size.m() % kAlignmentC;
-    } else if (platform::is_same<LayoutC, layout::ColumnMajorInterleaved<32>>::value
-            || platform::is_same<LayoutC, layout::ColumnMajorInterleaved<64>>::value) {
-      isCMisaligned = problem_size.n() % kAlignmentC;
-    }
-
-    if (isAMisaligned) {
-      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for A operand");
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (isBMisaligned) {
-      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for B operand");
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (isCMisaligned) {
-      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for C operand");
-      return Status::kErrorMisalignedOperand;
-    }
-
-    CUTLASS_TRACE_HOST("  returning kSuccess");
-
-    return Status::kSuccess;
-  }
-
-  static Status can_implement(Arguments const &args) {
-    return can_implement(args.problem_size);
-  }
-
-public:
-
-  //
-  // Device-only API
-  //
-
-  // Factory invocation
-  CUTLASS_DEVICE
-  static void invoke(
-    Params const &params,
-    SharedStorage &shared_storage)
-  {
-    GemmWithFusedEpilogue op;
-    op(params, shared_storage);
-  }
-
-  #define SPLIT_K_ENABLED 1
-
-  /// Executes one GEMM
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-
-    // Compute threadblock location
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    // Early exit if CTA is out of range
-    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
-      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
-
-      return;
-    }
-
-    int offset_k = 0;
-    int problem_size_k = params.problem_size.k();
-
-    ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A);
-    ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);
-
-
-    #if SPLIT_K_ENABLED
-    //
-    // Fetch pointers based on mode.
-    //
-    if (params.mode == GemmUniversalMode::kGemm ||
-      params.mode == GemmUniversalMode::kGemmSplitKParallel) {
-
-      if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {
-
-        problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size;
-      }
-
-      offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
-    }
-    else if (params.mode == GemmUniversalMode::kBatched) {
-      ptr_A += threadblock_tile_offset.k() * params.batch_stride_A;
-      ptr_B += threadblock_tile_offset.k() * params.batch_stride_B;
-    }
-    else if (params.mode == GemmUniversalMode::kArray) {
-      ptr_A = static_cast<ElementA * const *>(params.ptr_A)[threadblock_tile_offset.k()];
-      ptr_B = static_cast<ElementB * const *>(params.ptr_B)[threadblock_tile_offset.k()];
-    }
-    #endif
-
-    // Compute initial location in logical coordinates
-    cutlass::MatrixCoord tb_offset_A{
-      threadblock_tile_offset.m() * Mma::Shape::kM,
-      offset_k,
-    };
-
-    cutlass::MatrixCoord tb_offset_B{
-      offset_k,
-      threadblock_tile_offset.n() * Mma::Shape::kN
-    };
-
-    // Compute position within threadblock
-    int thread_idx = threadIdx.x;
-
-    // Construct iterators to A and B operands
-    typename Mma::IteratorA iterator_A(
-      params.params_A,
-      ptr_A,
-      {params.problem_size.m(), problem_size_k},
-      thread_idx,
-      tb_offset_A);
-
-    typename Mma::IteratorB iterator_B(
-      params.params_B,
-      ptr_B,
-      {problem_size_k, params.problem_size.n()},
-      thread_idx,
-      tb_offset_B);
-
-    // Broadcast the warp_id computed by lane 0 to ensure dependent code
-    // is compiled as warp-uniform.
-    int warp_idx = canonical_warp_idx_sync();
-
-    int lane_idx = threadIdx.x % 32;
-
-    //
-    // Main loop
-    //
-
-    // Construct thread-scoped matrix multiply
-    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
-
-    typename Mma::FragmentC accumulators;
-
-    accumulators.clear();
-
-    // Compute threadblock-scoped matrix multiply-add
-    int gemm_k_iterations = (problem_size_k - offset_k + Mma::Shape::kK - 1) / Mma::Shape::kK;
-
-    // Compute threadblock-scoped matrix multiply-add
-    mma(
-      gemm_k_iterations,
-      accumulators,
-      iterator_A,
-      iterator_B,
-      accumulators);
-
-    //
-    // Epilogue
-    //
-
-    EpilogueOutputOp output_op(params.output_op);
-
-    //
-    // Masked tile iterators constructed from members
-    //
-
-    threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    //assume identity swizzle
-    MatrixCoord threadblock_offset(
-      threadblock_tile_offset.m() * Mma::Shape::kM,
-      threadblock_tile_offset.n() * Mma::Shape::kN
-    );
-
-    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
-
-    ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C);
-    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
-    typename Epilogue::ElementTensor *ptr_Tensor = static_cast<typename Epilogue::ElementTensor *>(params.ptr_Tensor);
-
-    // Define the reduction output pointer and move to the appropriate place
-    typename Epilogue::ElementVector *ptr_Vector =
-      static_cast<typename Epilogue::ElementVector *>(params.ptr_Vector);
-
-    //
-    // Fetch pointers based on mode.
-    //
-
-    //
-    // Special path when split-K not enabled.
-    //
-
-    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() == 1) {
-
-      // Tile iterators loading from source tensors.
-      typename Epilogue::OutputTileIterator iterator_C(
-        params.params_C,
-        ptr_C,
-        params.problem_size.mn(),
-        thread_idx,
-        threadblock_offset
-      );
-
-      // Tile iterator writing to destination tensor.
-      typename Epilogue::OutputTileIterator iterator_D(
-        params.params_D,
-        ptr_D,
-        params.problem_size.mn(),
-        thread_idx,
-        threadblock_offset
-      );
-
-      // Additional tensor to load from
-      typename Epilogue::TensorTileIterator tensor_iterator(
-          params.params_Tensor,
-          // Only the final block outputs Tensor
-          ptr_Tensor,
-          params.problem_size.mn(),
-          thread_idx,
-          threadblock_offset);
-
-      // Construct the epilogue
-      Epilogue epilogue(
-        shared_storage.epilogue,
-        thread_idx,
-        warp_idx,
-        lane_idx);
-
-      // Move to appropriate location for this output tile
-      if (ptr_Vector) {
-        ptr_Vector += threadblock_offset.column() + threadblock_tile_offset.m() * params.ldr;
-      }
-
-      // Execute the epilogue operator to update the destination tensor.
-      epilogue(output_op,
-               ptr_Vector,
-               iterator_D,
-               accumulators,
-               iterator_C,
-               tensor_iterator,
-               params.problem_size.mn(),
-               threadblock_offset);
-
-      return;
-    }
-
-    //
-    // Slower path when split-K or batching is needed
-    //
-
-
-    #if SPLIT_K_ENABLED
-    // Construct the semaphore.
-    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
-
-    if (params.mode == GemmUniversalMode::kGemm) {
-
-      // If performing a reduction via split-K, fetch the initial synchronization
-      if (params.grid_tiled_shape.k() > 1) {
-
-        // Fetch the synchronization lock initially but do not block.
-        semaphore.fetch();
-
-        // Indicate which position in a serial reduction the output operator is currently updating
-        output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
-      }
-    }
-    else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) {
-      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
-    }
-    else if (params.mode == GemmUniversalMode::kBatched) {
-      ptr_C += threadblock_tile_offset.k() * params.batch_stride_C;
-      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
-      if (ptr_Tensor) {
-        ptr_Tensor = ReferenceFactory<typename Epilogue::ElementTensor>::add_pointer_offset(
-          ptr_Tensor,
-          threadblock_tile_offset.k() * params.batch_stride_Tensor);
-      }
-      if (ptr_Vector) {
-        ptr_Vector += threadblock_tile_offset.k() * params.batch_stride_Vector;
-      }
-    }
-    else if (params.mode == GemmUniversalMode::kArray) {
-      ptr_C = static_cast<ElementC * const *>(params.ptr_C)[threadblock_tile_offset.k()];
-      ptr_D = static_cast<ElementC * const *>(params.ptr_D)[threadblock_tile_offset.k()];
-      if (ptr_Tensor) {
-        ptr_Tensor = static_cast<typename Epilogue::ElementTensor * const *>(params.ptr_Tensor)[threadblock_tile_offset.k()];
-      }
-      if (ptr_Vector) {
-        ptr_Vector = static_cast<typename Epilogue::ElementVector * const *>(params.ptr_Vector)[threadblock_tile_offset.k()];
-      }
-    }
-    #endif
-
-    // Tile iterators loading from source tensors.
-    typename Epilogue::OutputTileIterator iterator_C(
-      params.params_C,
-      ptr_C,
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset
-    );
-
-    // Tile iterator writing to destination tensor.
-    typename Epilogue::OutputTileIterator iterator_D(
-      params.params_D,
-      ptr_D,
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset
-    );
-
-    // Additional tensor to load from
-    typename Epilogue::TensorTileIterator tensor_iterator(
-        params.params_Tensor,
-        // Only the final block outputs Tensor
-        ((params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) &&
-         (params.grid_tiled_shape.k() != threadblock_tile_offset.k() + 1))
-            ? nullptr
-            : ptr_Tensor,
-        params.problem_size.mn(),
-        thread_idx,
-        threadblock_offset);
-
-    // Construct the epilogue
-    Epilogue epilogue(
-      shared_storage.epilogue,
-      thread_idx,
-      warp_idx,
-      lane_idx);
-
-    #if SPLIT_K_ENABLED
-    // Wait on the semaphore - this latency may have been covered by iterator construction
-    if ((params.mode == GemmUniversalMode::kGemm) && params.grid_tiled_shape.k() > 1) {
-
-      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
-      if (threadblock_tile_offset.k()) {
-        iterator_C = iterator_D;
-      }
-
-      semaphore.wait(threadblock_tile_offset.k());
-
-    }
-    #endif
-
-    // Move to appropriate location for this output tile
-    if (ptr_Vector) {
-      ptr_Vector += threadblock_offset.column() + threadblock_tile_offset.m() * params.ldr;
-    }
-
-    // Execute the epilogue operator to update the destination tensor.
-    epilogue(output_op,
-             // Only the final block uses Vector
-             ((params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) &&
-              (params.grid_tiled_shape.k() != threadblock_tile_offset.k() + 1))
-                 ? nullptr
-                 : ptr_Vector,
-             iterator_D,
-             accumulators,
-             iterator_C,
-             tensor_iterator,
-             params.problem_size.mn(),
-             threadblock_offset);
-
-    //
-    // Release the semaphore
-    //
-
-    #if SPLIT_K_ENABLED
-    if ((params.mode == GemmUniversalMode::kGemm)  && params.grid_tiled_shape.k() > 1) {
-
-      int lock = 0;
-      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
-
-        // The final threadblock resets the semaphore for subsequent grids.
-        lock = 0;
-      }
-      else {
-        // Otherwise, the semaphore is incremented
-        lock = threadblock_tile_offset.k() + 1;
-      }
-
-      semaphore.release(lock);
-    }
-    #endif
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_with_k_reduction.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_with_k_reduction.h
deleted file mode 100755
index 49c4b0a1a..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_with_k_reduction.h
+++ /dev/null
@@ -1,704 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/complex.h"
-#include "cutlass/semaphore.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/gemm/kernel/params_universal_base.h"
-
-#include "cutlass/trace.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate 
-  typename Epilogue_,             ///! Epilogue
-  typename EpilogueGemmKReduction_,             ///! Epilogue
-  typename ThreadblockSwizzle_    ///! Threadblock swizzling function
->
-struct GemmWithKReduction {
-public:
-
-  using Mma = Mma_;
-  using Epilogue = Epilogue_;
-  using EpilogueOutputOp = typename Epilogue::OutputOp;
-  using EpilogueGemmKReduction = EpilogueGemmKReduction_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-
-  using ElementA = typename Mma::IteratorA::Element;
-  using LayoutA = typename Mma::IteratorA::Layout;
-  using ElementB = typename Mma::IteratorB::Element;
-  using LayoutB = typename Mma::IteratorB::Layout;
-  using ElementC = typename Epilogue::OutputTileIterator::Element;
-  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
-  using LayoutGemmKReduction = cutlass::layout::PitchLinear;
-
-  static ComplexTransform const kTransformA = Mma::kTransformA;
-  static ComplexTransform const kTransformB = Mma::kTransformB;
-  using Operator = typename Mma::Operator;
-
-  using OperatorClass = typename Mma::Operator::OperatorClass;
-  using ThreadblockShape = typename Mma::Shape;
-  using WarpShape = typename Mma::Operator::Shape;
-  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
-  using ArchTag = typename Mma::ArchTag;
-
-  static int const kStages = Mma::kStages;
-  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
-  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
-  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
-
-  /// Warp count (concept: GemmShape)
-  using WarpCount = typename Mma::WarpCount;
-  static int const kThreadCount = 32 * WarpCount::kCount;
-
-  /// Split-K preserves splits that are 128b aligned
-  static int const kSplitKAlignment = const_max(128 / sizeof_bits<ElementA>::value, 128 / sizeof_bits<ElementB>::value);
-
-  static int const kReduceKForA = Mma::kReduceKForA;
-
-  //
-  // Structures
-  //
-
-  /// Argument structure
-  struct Arguments : UniversalArgumentsBase
-  {
-    //
-    // Data members
-    //
-
-    typename EpilogueOutputOp::Params epilogue;
-
-    void const * ptr_A;
-    void const * ptr_B;
-    void const * ptr_C;
-    void * ptr_D;
-    void * ptr_gemm_k_reduction;
-
-    int64_t batch_stride_A;
-    int64_t batch_stride_B;
-    int64_t batch_stride_C;
-    int64_t batch_stride_gemm_k_reduction;
-
-    typename LayoutA::Stride::Index lda;
-    typename LayoutB::Stride::Index ldb;
-    typename LayoutC::Stride::Index ldc;
-    typename LayoutC::Stride::Index ldd;
-    typename LayoutGemmKReduction::Stride::Index ld_gemm_k_reduction;
-
-    //
-    // Methods
-    //
-
-    Arguments() :
-      ptr_A(nullptr),
-      ptr_B(nullptr),
-      ptr_C(nullptr),
-      ptr_D(nullptr),
-      ptr_gemm_k_reduction(nullptr)
-    {}
-
-    /// constructs an arguments structure
-    Arguments(
-      GemmUniversalMode mode,
-      GemmCoord problem_size,
-      int batch_count,
-      typename EpilogueOutputOp::Params epilogue,
-      void const * ptr_A,
-      void const * ptr_B,
-      void const * ptr_C,
-      void * ptr_D,
-      void * ptr_gemm_k_reduction,
-      int64_t batch_stride_A,
-      int64_t batch_stride_B,
-      int64_t batch_stride_C,
-      int64_t batch_stride_D,
-      int64_t batch_stride_gemm_k_reduction,
-      typename LayoutA::Stride::Index lda,
-      typename LayoutB::Stride::Index ldb,
-      typename LayoutC::Stride::Index ldc,
-      typename LayoutC::Stride::Index ldd,
-      typename LayoutGemmKReduction::Stride::Index ld_gemm_k_reduction)
-    :
-      UniversalArgumentsBase(mode, problem_size, batch_count, batch_stride_D),
-      epilogue(epilogue),
-      ptr_A(ptr_A), ptr_B(ptr_B), ptr_C(ptr_C), ptr_D(ptr_D), ptr_gemm_k_reduction(ptr_gemm_k_reduction),
-      batch_stride_A(batch_stride_A), batch_stride_B(batch_stride_B), batch_stride_C(batch_stride_C), batch_stride_gemm_k_reduction(batch_stride_gemm_k_reduction),
-      lda(lda), ldb(ldb), ldc(ldc), ldd(ldd), ld_gemm_k_reduction(ld_gemm_k_reduction)
-    {
-      CUTLASS_TRACE_HOST("GemmUniversal::Arguments::Arguments() - problem_size: " << problem_size);
-    }
-
-    /// Returns arguments for the transposed problem
-    Arguments transposed_problem() const {
-      Arguments args(*this);
-
-      std::swap(args.problem_size.m(), args.problem_size.n());
-      std::swap(args.ptr_A, args.ptr_B);
-      std::swap(args.lda, args.ldb);
-      std::swap(args.batch_stride_A, args.batch_stride_B);
-
-      return args;
-    }
-  };
-
-
-  //
-  // Structure for precomputing values in host memory and passing to kernels
-  //
-
-  /// Parameters structure
-  struct Params : UniversalParamsBase<
-    ThreadblockSwizzle,
-    ThreadblockShape,
-    ElementA,
-    ElementB,
-    ElementC,
-    LayoutA,
-    LayoutB>
-  {
-    using ParamsBase = UniversalParamsBase<
-      ThreadblockSwizzle,
-      ThreadblockShape,
-      ElementA,
-      ElementB,
-      ElementC,
-      LayoutA,
-      LayoutB>;
-
-    //
-    // Data members
-    //
-    
-    typename Mma::IteratorA::Params params_A;
-    typename Mma::IteratorB::Params params_B;
-    typename Epilogue::OutputTileIterator::Params params_C;
-    typename Epilogue::OutputTileIterator::Params params_D;
-    
-    typename EpilogueOutputOp::Params output_op;
-
-    void * ptr_A;
-    void * ptr_B;
-    void * ptr_C;
-    void * ptr_D;
-    void * ptr_gemm_k_reduction;
-
-    int64_t batch_stride_A;
-    int64_t batch_stride_B;
-    int64_t batch_stride_C;
-    int64_t batch_stride_gemm_k_reduction;
-
-    //
-    // Host dispatch API
-    //
-
-    /// Default constructor
-    Params() = default;
-
-    /// Constructor
-    Params(
-      Arguments const &args,  /// GEMM application arguments
-      int device_sms,         /// Number of SMs on the device
-      int sm_occupancy)       /// Kernel SM occupancy (in thread blocks)
-    :
-      ParamsBase(args, device_sms, sm_occupancy),
-      params_A(args.lda),
-      params_B(args.ldb),
-      params_C(args.ldc),
-      params_D(args.ldd),
-      output_op(args.epilogue),
-      ptr_A(const_cast<void *>(args.ptr_A)),
-      ptr_B(const_cast<void *>(args.ptr_B)),
-      ptr_C(const_cast<void *>(args.ptr_C)),
-      batch_stride_A(args.batch_stride_A),
-      batch_stride_B(args.batch_stride_B),
-      batch_stride_C(args.batch_stride_C),
-      batch_stride_gemm_k_reduction(args.batch_stride_gemm_k_reduction),
-      ptr_D(args.ptr_D),
-      ptr_gemm_k_reduction(args.ptr_gemm_k_reduction)
-    {}
-
-    /// Assign and initialize the specified workspace buffer.  Assumes
-    /// the memory allocated to workspace is at least as large as get_workspace_size().
-    Status init_workspace(
-      void *workspace,
-      cudaStream_t stream = nullptr)
-    {
-      CUTLASS_TRACE_HOST("GemmUniversal::Params::Params() - problem_size: " << this->problem_size);
-
-      if (this->mode == GemmUniversalMode::kGemmSplitKParallel) {
-        ptr_D = workspace;
-        ptr_gemm_k_reduction = static_cast<uint8_t *>(workspace)
-                 + sizeof(ElementC) * size_t(this->batch_stride_D) * size_t(this->grid_tiled_shape.k());
-
-        return Status::kSuccess;
-      }
-
-      return ParamsBase::init_workspace(workspace, stream);
-    }
-
-    /// Returns the workspace size (in bytes) needed for this problem geometry
-    size_t get_workspace_size() const
-    {
-      size_t workspace_bytes = ParamsBase::get_workspace_size();
-
-      if (this->mode == GemmUniversalMode::kGemmSplitKParallel)
-      {
-        // Split-K parallel always requires a temporary workspace
-        workspace_bytes +=
-          sizeof(ElementC) *
-          size_t(batch_stride_gemm_k_reduction) *
-          size_t(this->grid_tiled_shape.k());
-      }
-
-      return workspace_bytes;
-    }
-
-    /// Lightweight update given a subset of arguments.
-    void update(Arguments const &args)
-    {
-      ptr_A = const_cast<void *>(args.ptr_A);
-      ptr_B = const_cast<void *>(args.ptr_B);
-      ptr_C = const_cast<void *>(args.ptr_C);
-      ptr_D = args.ptr_D;
-      ptr_gemm_k_reduction = args.ptr_gemm_k_reduction;
-
-      batch_stride_A = args.batch_stride_A;
-      batch_stride_B = args.batch_stride_B;
-      batch_stride_C = args.batch_stride_C;
-      batch_stride_gemm_k_reduction = args.batch_stride_gemm_k_reduction;
-      this->batch_stride_D = args.batch_stride_D;
-
-      output_op = args.epilogue;
-
-      CUTLASS_TRACE_HOST("GemmUniversal::Params::update()");
-    }
-  };
-
-  /// Shared memory storage structure
-  union SharedStorage {
-    typename Mma::SharedStorage main_loop;
-    typename Epilogue::SharedStorage epilogue;
-  };
-
-
-public:
-
-  //
-  // Host dispatch API
-  //
-
-  /// Determines whether kernel satisfies alignment
-  static Status can_implement(
-    cutlass::gemm::GemmCoord const & problem_size) {
-
-    CUTLASS_TRACE_HOST("GemmUniversal::can_implement()");
-
-    static int const kAlignmentA = (platform::is_same<typename Mma::IteratorA::Layout,
-                                                      layout::ColumnMajorInterleaved<32>>::value)
-                                   ? 32
-                                   : (platform::is_same<typename Mma::IteratorA::Layout,
-                                                        layout::ColumnMajorInterleaved<64>>::value)
-                                     ? 64
-                                     : Mma::IteratorA::AccessType::kElements;
-    static int const kAlignmentB = (platform::is_same<typename Mma::IteratorB::Layout,
-                                                       layout::RowMajorInterleaved<32>>::value)
-                                   ? 32
-                                   : (platform::is_same<typename Mma::IteratorB::Layout,
-                                                        layout::RowMajorInterleaved<64>>::value)
-                                     ? 64
-                                     : Mma::IteratorB::AccessType::kElements;
-    static int const kAlignmentC =  (platform::is_same<LayoutC,
-                                                      layout::ColumnMajorInterleaved<32>>::value)
-                                   ? 32
-                                   : (platform::is_same<LayoutC,
-                                                        layout::ColumnMajorInterleaved<64>>::value)
-                                     ? 64
-                                     : Epilogue::OutputTileIterator::kElementsPerAccess;
-
-    bool isAMisaligned = false;
-    bool isBMisaligned = false;
-    bool isCMisaligned = false;
-
-    if (platform::is_same<LayoutA, layout::RowMajor>::value) {
-      isAMisaligned = problem_size.k() % kAlignmentA;
-    } else if (platform::is_same<LayoutA, layout::ColumnMajor>::value) {
-      isAMisaligned = problem_size.m() % kAlignmentA;
-    } else if (platform::is_same<LayoutA, layout::ColumnMajorInterleaved<32>>::value
-            || platform::is_same<LayoutA, layout::ColumnMajorInterleaved<64>>::value) {
-      isAMisaligned = problem_size.k() % kAlignmentA;
-    }
-
-    if (platform::is_same<LayoutB, layout::RowMajor>::value) {
-      isBMisaligned = problem_size.n() % kAlignmentB;
-    } else if (platform::is_same<LayoutB, layout::ColumnMajor>::value) {
-      isBMisaligned = problem_size.k() % kAlignmentB;
-    } else if (platform::is_same<LayoutB, layout::RowMajorInterleaved<32>>::value
-            || platform::is_same<LayoutB, layout::RowMajorInterleaved<64>>::value) {
-      isBMisaligned = problem_size.k() % kAlignmentB;
-    }
-
-    if (platform::is_same<LayoutC, layout::RowMajor>::value) {
-      isCMisaligned = problem_size.n() % kAlignmentC;
-    } else if (platform::is_same<LayoutC, layout::ColumnMajor>::value) {
-      isCMisaligned = problem_size.m() % kAlignmentC;
-    } else if (platform::is_same<LayoutC, layout::ColumnMajorInterleaved<32>>::value
-            || platform::is_same<LayoutC, layout::ColumnMajorInterleaved<64>>::value) {
-      isCMisaligned = problem_size.n() % kAlignmentC;
-    }
-
-    if (isAMisaligned) {
-      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for operand A");
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (isBMisaligned) {
-      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for operand B");
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (isCMisaligned) {
-      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for operand C");
-      return Status::kErrorMisalignedOperand;
-    }
-
-    CUTLASS_TRACE_HOST("  returning kSuccess");
-
-    return Status::kSuccess;
-  }
-
-
-  static Status can_implement(Arguments const &args) {
-    return can_implement(args.problem_size);
-  }
-
-
-public:
-
-  //
-  // Device-only API
-  //
-
-  // Factory invocation
-  CUTLASS_DEVICE
-  static void invoke(
-    Params const &params,
-    SharedStorage &shared_storage)
-  {
-    GemmWithKReduction op;
-    op(params, shared_storage);
-  }
-
-
-  /// Executes one GEMM
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-
-    // Compute threadblock location
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord threadblock_tile_offset =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    // Early exit if CTA is out of range
-    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
-      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
-
-      return;
-    }
-
-    int offset_k = 0;
-    int problem_size_k = params.problem_size.k();
-
-    ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A); 
-    ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);
-
-    //
-    // Fetch pointers based on mode.
-    //
-    if (params.mode == GemmUniversalMode::kGemm || 
-      params.mode == GemmUniversalMode::kGemmSplitKParallel) {
-
-      if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {
-
-        problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size; 
-      }
-
-      offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
-    }
-    else if (params.mode == GemmUniversalMode::kBatched) {
-      ptr_A += threadblock_tile_offset.k() * params.batch_stride_A;
-      ptr_B += threadblock_tile_offset.k() * params.batch_stride_B;
-    }
-    else if (params.mode == GemmUniversalMode::kArray) {
-      ptr_A = static_cast<ElementA * const *>(params.ptr_A)[threadblock_tile_offset.k()];
-      ptr_B = static_cast<ElementB * const *>(params.ptr_B)[threadblock_tile_offset.k()];
-    }
-
-    __syncthreads();
-
-    // Compute initial location in logical coordinates
-    cutlass::MatrixCoord tb_offset_A{
-      threadblock_tile_offset.m() * Mma::Shape::kM,
-      offset_k,
-    };
-
-    cutlass::MatrixCoord tb_offset_B{
-      offset_k,
-      threadblock_tile_offset.n() * Mma::Shape::kN
-    };
-
-
-    // Compute position within threadblock
-    int thread_idx = threadIdx.x;
-
-    // Construct iterators to A and B operands
-    typename Mma::IteratorA iterator_A(
-      params.params_A,
-      ptr_A,
-      {params.problem_size.m(), problem_size_k},
-      thread_idx,
-      tb_offset_A);
-
-    typename Mma::IteratorB iterator_B(
-      params.params_B,
-      ptr_B,
-      {problem_size_k, params.problem_size.n()},
-      thread_idx,
-      tb_offset_B);
-
-    // Broadcast the warp_id computed by lane 0 to ensure dependent code
-    // is compiled as warp-uniform.
-    int warp_idx = canonical_warp_idx_sync();
-
-    int lane_idx = threadIdx.x % 32;
-
-    //
-    // Main loop
-    //
-
-    // Construct thread-scoped matrix multiply
-    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
-
-    typename Mma::FragmentC accumulators;
-
-    accumulators.clear();
-
-    typename Mma::FragmentReduction gemm_k_accumulators;
-
-    gemm_k_accumulators.clear();
-
-    // Compute threadblock-scoped matrix multiply-add
-    int gemm_k_iterations = (problem_size_k - offset_k + Mma::Shape::kK - 1) / Mma::Shape::kK;
-
-    // Compute threadblock-scoped matrix multiply-add
-    mma(
-      gemm_k_iterations, 
-      accumulators, 
-      iterator_A, 
-      iterator_B, 
-      accumulators,
-      gemm_k_accumulators);
-
-    //
-    // Epilogue
-    //
-
-    EpilogueOutputOp output_op(params.output_op);
-
-    //
-    // Masked tile iterators constructed from members
-    //
-
-    threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    //assume identity swizzle
-    MatrixCoord threadblock_offset(
-      threadblock_tile_offset.m() * Mma::Shape::kM,
-      threadblock_tile_offset.n() * Mma::Shape::kN
-    );
-
-    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
-
-    ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C); 
-    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
-    ElementC *ptr_gemm_k_reduction = static_cast<ElementC *>(params.ptr_gemm_k_reduction);
-
-    //
-    // Fetch pointers based on mode.
-    //
-    
-    // Construct the semaphore.
-    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
-
-    if (params.mode == GemmUniversalMode::kGemm) {
-
-      // If performing a reduction via split-K, fetch the initial synchronization
-      if (params.grid_tiled_shape.k() > 1) {
-        
-        // Fetch the synchronization lock initially but do not block.
-        semaphore.fetch();
-
-        // Indicate which position in a serial reduction the output operator is currently updating
-        output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
-      }
-    }
-    else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) {
-      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
-      ptr_gemm_k_reduction += threadblock_tile_offset.k() * params.batch_stride_gemm_k_reduction;
-    }
-    else if (params.mode == GemmUniversalMode::kBatched) {
-      ptr_C += threadblock_tile_offset.k() * params.batch_stride_C;
-      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
-    }
-    else if (params.mode == GemmUniversalMode::kArray) {
-      ptr_C = static_cast<ElementC * const *>(params.ptr_C)[threadblock_tile_offset.k()];
-      ptr_D = static_cast<ElementC * const *>(params.ptr_D)[threadblock_tile_offset.k()];
-    }
-
-    // Tile iterator loading from source tensor.
-    typename Epilogue::OutputTileIterator iterator_C(
-      params.params_C,
-      ptr_C,
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset
-    );
-
-    // Tile iterator writing to destination tensor.
-    typename Epilogue::OutputTileIterator iterator_D(
-      params.params_D,
-      ptr_D,
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset
-    );
-
-    Epilogue epilogue(
-      shared_storage.epilogue, 
-      thread_idx, 
-      warp_idx, 
-      lane_idx);
-
-    // Wait on the semaphore - this latency may have been covered by iterator construction
-    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
-        
-      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
-      if (threadblock_tile_offset.k()) {
-        iterator_C = iterator_D;
-      }
-
-      semaphore.wait(threadblock_tile_offset.k());
-
-    }
-
-    // Execute the epilogue operator to update the destination tensor.
-    epilogue(
-      output_op, 
-      iterator_D, 
-      accumulators, 
-      iterator_C); 
- 
-    if ((kReduceKForA && threadblock_tile_offset.n() == 0)
-     || (!kReduceKForA && threadblock_tile_offset.m() == 0)) {
-
-      int warp_idx_mn = warp_idx % (Mma::Base::WarpCount::kM * Mma::Base::WarpCount::kN);
-      int warp_idx_m = warp_idx_mn % Mma::Base::WarpCount::kM;
-      int warp_idx_n = warp_idx_mn / Mma::Base::WarpCount::kM;
- 
-     if ((kReduceKForA && warp_idx_n == 0)
-      || (!kReduceKForA && warp_idx_m == 0)) {
-
-        int reduction_warp_idx = kReduceKForA ? warp_idx_m : warp_idx_n;
-        int reduction_threadblock_offset = kReduceKForA ? threadblock_tile_offset.m() :
-                                                          threadblock_tile_offset.n();
-        int reduction_vector_size = kReduceKForA ? params.problem_size.m()
-                                                 : params.problem_size.n();
-        EpilogueGemmKReduction epilogue_gemm_k_reduction(thread_idx,
-                                                         reduction_warp_idx,
-                                                         lane_idx,
-                                                         reduction_threadblock_offset,
-                                                         ptr_gemm_k_reduction);
-        epilogue_gemm_k_reduction(
-          reduction_vector_size,
-          gemm_k_accumulators,
-          params.mode == GemmUniversalMode::kGemm
-            && (params.grid_tiled_shape.k() > 1)
-            && (threadblock_tile_offset.k() > 0));
-      }
-    }
-   
-    //
-    // Release the semaphore
-    //
-
-    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) { 
-
-      int lock = 0;
-      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
-
-        // The final threadblock resets the semaphore for subsequent grids.
-        lock = 0;
-      }
-      else {
-        // Otherwise, the semaphore is incremented
-        lock = threadblock_tile_offset.k() + 1;
-      }
-      
-      semaphore.release(lock);
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemv.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemv.h
deleted file mode 100755
index 9ec55e13c..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemv.h
+++ /dev/null
@@ -1,638 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/complex.h"
-#include "cutlass/tensor_ref.h"
-
-#include "cutlass/arch/memory.h"
-#include "cutlass/arch/cache_operation.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/layout/matrix.h"
-
-#include "cutlass/numeric_conversion.h"
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename ElementA_,
-  typename LayoutA_,
-  typename ElementB_,
-  typename ElementC_,
-  typename ElementAccumulator_,
-  typename EpilogueOutputOp_,
-  int kElementsPerAccess_ = 1,            ///< Number of elements involved in a global access.
-  int kThreadCount_ = 0,                  ///< Number of threads in the thread block.
-                                          ///  It will be calculated automatically if set to 0.
-  int kThreadsPerRow_ = 0                 ///< Number of threads in the k dimension.
-                                          ///  It will be calculated automatically if set to 0.
->
-struct Gemv;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Specializations
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GEMV for column-major A matrix
-template <
-  typename ElementA_,
-  typename ElementB_,
-  typename ElementC_,
-  typename ElementAccumulator_,
-  typename EpilogueOutputOp_,
-  int kElementsPerAccess_,
-  int kThreadCount_,
-  int kThreadsPerRow_
->
-struct Gemv <
-  ElementA_,
-  layout::ColumnMajor,
-  ElementB_,
-  ElementC_,
-  ElementAccumulator_,
-  EpilogueOutputOp_,
-  kElementsPerAccess_,
-  kThreadCount_,
-  kThreadsPerRow_
->{
-public:
-
-  using ElementA = ElementA_;
-  using LayoutA = layout::ColumnMajor;
-  using TensorRefA = TensorRef<ElementA, LayoutA>;
-
-  using ElementB = ElementB_;
-  using ElementC = ElementC_;
-
-  using ElementAccumulator = ElementAccumulator_;
-  using EpilogueOutputOp = EpilogueOutputOp_;
-
-  static ComplexTransform const kTransformA = ComplexTransform::kNone;
-  static ComplexTransform const kTransformB = ComplexTransform::kNone;
-
-  // thread block shape (kThreadCount, 1, 1)
-  static int const kThreadCount = (kThreadCount_ <= 0) ? 32 : kThreadCount_;
-  static int const kThreadsPerRow = (kThreadsPerRow_ <= 0) ? 1 : kThreadsPerRow_;
-
-  static int const kStages = 1;
-
-  static int const kAlignmentA = 1;
-  static int const kAlignmentB = 1;
-  static int const kAlignmentC = 1;
-
-  //
-  // Structures
-  //
-
-  /// Argument structure
-  struct Arguments {
-    MatrixCoord     problem_size;
-    int32_t         batch_count;
-    typename EpilogueOutputOp::Params output_op;
-
-    TensorRefA      ref_A;
-
-    ElementB const *ptr_B;
-    ElementC const *ptr_C;
-    ElementC       *ptr_D;
-
-    int64_t         inc_B;
-    int64_t         inc_C;
-    int64_t         inc_D;
-
-    int64_t         batch_stride_A;
-    int64_t         batch_stride_B;
-    int64_t         batch_stride_C;
-    int64_t         batch_stride_D;
-
-    //
-    // Methods
-    //
-
-    Arguments(): batch_count(0) { }
-
-    Arguments(
-      MatrixCoord problem_size,
-      int batch_count,
-      typename EpilogueOutputOp::Params output_op,
-      TensorRefA  ref_A,
-      void const *ptr_B,
-      void const *ptr_C,
-      void       *ptr_D,
-      int64_t     inc_B,
-      int64_t     inc_C,
-      int64_t     inc_D,
-      int64_t     batch_stride_A,
-      int64_t     batch_stride_B,
-      int64_t     batch_stride_C,
-      int64_t     batch_stride_D
-    ): 
-      problem_size(problem_size),
-      batch_count(batch_count),
-      output_op(output_op),
-      ref_A(ref_A),
-      ptr_B(static_cast<ElementB const *>(ptr_B)),
-      ptr_C(static_cast<ElementC const *>(ptr_C)),
-      ptr_D(static_cast<ElementC       *>(ptr_D)),
-      inc_B(inc_B),
-      inc_C(inc_C),
-      inc_D(inc_D),
-      batch_stride_A(batch_stride_A),
-      batch_stride_B(batch_stride_B),
-      batch_stride_C(batch_stride_C),
-      batch_stride_D(batch_stride_D)
-    { }
-
-    Arguments(
-      MatrixCoord problem_size,
-      int batch_count,
-      typename EpilogueOutputOp::Params output_op,
-      TensorRefA  ref_A,
-      void const *ptr_B,
-      void const *ptr_C,
-      void       *ptr_D,
-      int64_t     batch_stride_A,
-      int64_t     batch_stride_B,
-      int64_t     batch_stride_C,
-      int64_t     batch_stride_D
-    ): 
-      Arguments(
-        problem_size, 
-        batch_count, 
-        output_op, 
-        ref_A, 
-        ptr_B, 
-        ptr_C, 
-        ptr_D,
-        1, 
-        1, 
-        1, 
-        batch_stride_A,
-        batch_stride_B,
-        batch_stride_C,
-        batch_stride_D)
-    { }
-
-    Arguments(
-      MatrixCoord problem_size,
-      typename EpilogueOutputOp::Params output_op,
-      TensorRefA  ref_A,
-      void const *ptr_B,
-      void const *ptr_C,
-      void       *ptr_D,
-      int64_t     inc_B,
-      int64_t     inc_C,
-      int64_t     inc_D
-    ): 
-      Arguments(
-        problem_size, 
-        1, 
-        output_op, 
-        ref_A, 
-        ptr_B, 
-        ptr_C, 
-        ptr_D,
-        inc_B, 
-        inc_C, 
-        inc_D, 
-        1, 
-        1, 
-        1, 
-        1)
-    { }
-
-    Status update(Arguments const &args) {
-      output_op = args.output_op;
-      ref_A = ref_A;
-      ptr_B = args.ptr_B;
-      ptr_C = args.ptr_C;
-      ptr_D = args.ptr_D;
-
-      return Status::kSuccess;
-    }
-  };
-
-  using Params = Arguments;
-
-  /// Shared memory storage structure
-  union SharedStorage {
-
-  };
-
-public:
-
-  //
-  // Methods
-  //
-
-  CUTLASS_DEVICE
-  Gemv() { } 
-
-  /// Determines whether kernel satisfies alignment
-  static Status can_implement(cutlass::MatrixCoord const & problem_size) {
-    return Status::kSuccess;
-  }
-
-  static Status can_implement(Arguments const &args) {
-    return can_implement(args.problem_size);
-  }
- 
-  /// Executes one GEMV
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-
-    // Loop over batch indices
-    for (int batch_idx = blockIdx.z; batch_idx < params.batch_count; batch_idx += gridDim.z) {
-
-      int i = blockIdx.x * kThreadCount + threadIdx.x;
-
-      ElementA const *ptr_A = params.ref_A.data() + i;
-      ElementB const *ptr_B = params.ptr_B;
-
-      ptr_A += batch_idx * params.batch_stride_A;
-      ptr_B += batch_idx * params.batch_stride_B;
-
-      ElementAccumulator accum = ElementAccumulator();
-
-      // Compute inner product
-      CUTLASS_PRAGMA_NO_UNROLL
-      for (int k = 0; k < params.problem_size.column(); ++k) {
-
-        // Fetch from A
-        ElementA a = ElementA();
-        if (i < params.problem_size.row()) {
-          a = *ptr_A;
-        }
-        ptr_A += params.ref_A.stride(0);
-
-        // Fetch from B
-        ElementB b = *ptr_B;
-        ptr_B += params.inc_B;
-
-        // Math
-        accum += ElementAccumulator(a) * ElementAccumulator(b);
-      }
-
-      //
-      // Epilogue phase
-      //
-
-      ElementC const *ptr_C = params.ptr_C + i * params.inc_C + batch_idx * params.batch_stride_C;
-      ElementC       *ptr_D = params.ptr_D + i * params.inc_D + batch_idx * params.batch_stride_D;
-
-      EpilogueOutputOp output_op(params.output_op);
-
-      typename EpilogueOutputOp::FragmentAccumulator accum_fragment;
-      typename EpilogueOutputOp::FragmentOutput      source_fragment;
-      typename EpilogueOutputOp::FragmentOutput      output_fragment;
-      
-      accum_fragment[0] = accum;
-
-      if (i < params.problem_size.row()) {
-        if (output_op.is_source_needed()) {
-          source_fragment[0] = *ptr_C;
-          output_fragment = output_op(accum_fragment, source_fragment);
-        }
-        else {
-          output_fragment = output_op(accum_fragment);
-        }
-
-        *ptr_D = output_fragment[0];
-      }
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GEMV for row-major A matrix
-template <
-    typename ElementA_,
-    typename ElementB_,
-    typename ElementC_,
-    typename ElementAccumulator_,
-    typename EpilogueOutputOp_,
-    int kElementsPerAccess_,
-    int kThreadCount_,
-    int kThreadsPerRow_ 
->
-struct Gemv <
-    ElementA_,            
-    layout::RowMajor,
-    ElementB_,            
-    ElementC_,
-    ElementAccumulator_,
-    EpilogueOutputOp_,
-    kElementsPerAccess_,
-    kThreadCount_,
-    kThreadsPerRow_
->{
-public:
-
-  using ElementA = ElementA_;
-  using LayoutA = layout::RowMajor;
-  using TensorRefA = TensorRef<ElementA, LayoutA>;
-
-  using ElementB = ElementB_;
-  using ElementC = ElementC_;
-
-  using ElementAccumulator = ElementAccumulator_;
-  using EpilogueOutputOp = EpilogueOutputOp_;
-
-  static ComplexTransform const kTransformA = ComplexTransform::kNone;
-  static ComplexTransform const kTransformB = ComplexTransform::kNone;
-
-  static FloatRoundStyle const Round = cutlass::FloatRoundStyle::round_to_nearest;
-
-  // number of return elements in a global access
-  static int const kElementsPerAccess = kElementsPerAccess_;
-  
-  using FragmentA = Array<ElementA, kElementsPerAccess>;
-  using FragmentB = Array<ElementB, kElementsPerAccess>;
-  using FragmentCompute = Array<ElementAccumulator, kElementsPerAccess>;
-
-  // thread block shape (kThreadsPerRow, kThreadCount / kThreadsPerRow, 1)
-  static int const kThreadCount = (kThreadCount_ <= 0) ? 128 : kThreadCount_;
-  static int const kThreadsPerRow = (kThreadsPerRow_ <= 0) ?
-                                  std::min(static_cast<int>(kThreadCount / (kElementsPerAccess * sizeof(ElementA))), 16)
-                                  : kThreadsPerRow_;
-
-  //
-  // Structures
-  //
-
-  /// Argument structure
-  struct Arguments {
-    MatrixCoord     problem_size;
-    int32_t         batch_count;
-    typename EpilogueOutputOp::Params output_op;
-
-    TensorRefA      ref_A;
-
-    ElementB const *ptr_B;
-    ElementC const *ptr_C;
-    ElementC       *ptr_D;
-
-    int64_t         batch_stride_A;
-    int64_t         batch_stride_B;
-    int64_t         batch_stride_C;
-    int64_t         batch_stride_D;
-
-    //
-    // Methods
-    //
-
-    Arguments(): batch_count(0) { }
-
-    Arguments(
-      MatrixCoord problem_size,
-      int32_t     batch_count,
-      typename EpilogueOutputOp::Params output_op,
-      TensorRefA  ref_A,
-      void const *ptr_B,
-      void const *ptr_C,
-      void       *ptr_D,
-      int64_t     batch_stride_A,
-      int64_t     batch_stride_B,
-      int64_t     batch_stride_C,
-      int64_t     batch_stride_D
-    ):
-      problem_size(problem_size),
-      batch_count(batch_count),
-      output_op(output_op),
-      ref_A(ref_A),
-      ptr_B(static_cast<ElementB const *>(ptr_B)),
-      ptr_C(static_cast<ElementC const *>(ptr_C)),
-      ptr_D(static_cast<ElementC       *>(ptr_D)),
-      batch_stride_A(batch_stride_A),
-      batch_stride_B(batch_stride_B),
-      batch_stride_C(batch_stride_C),
-      batch_stride_D(batch_stride_D)
-    { }
-
-    Arguments(
-      MatrixCoord problem_size,
-      typename EpilogueOutputOp::Params output_op,
-      TensorRefA  ref_A,
-      void const *ptr_B,
-      void const *ptr_C,
-      void       *ptr_D
-    ):
-      Arguments(
-        problem_size,
-        1,
-        output_op,
-        ref_A,
-        ptr_B,
-        ptr_C,
-        ptr_D,
-        1,
-        1,
-        1,
-        1)
-    { }
-
-    Status update(Arguments const &args) {
-      problem_size = args.problem_size;
-      batch_count = args.batch_count;
-      output_op = args.output_op;
-      ref_A = ref_A;
-      ptr_B = args.ptr_B;
-      ptr_C = args.ptr_C;
-      ptr_D = args.ptr_D;
-      batch_stride_A = args.batch_stride_A;
-      batch_stride_B = args.batch_stride_B;
-      batch_stride_C = args.batch_stride_C;
-      batch_stride_D = args.batch_stride_D;
-
-      return Status::kSuccess;
-    }
-  };
-
-  using Params = Arguments;
-
-  /// Shared memory storage structure
-  union SharedStorage {
-
-  };
-
-public:
-
-  //
-  // Methods
-  //
-
-  CUTLASS_DEVICE
-  Gemv() {}
-
-  /// Determines whether kernel satisfies alignment
-  static Status can_implement(cutlass::MatrixCoord const &problem_size) {
-    if (problem_size.column() % kElementsPerAccess != 0) {
-      return Status::kErrorMisalignedOperand;
-    }
-    return Status::kSuccess;
-  }
-
-  static Status can_implement(Arguments const &args) {
-    return can_implement(args.problem_size);
-  }
-
-  /// Executes one GEMV
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-    
-    // Loop over batch indices
-    for (int batch_idx = blockIdx.z; batch_idx < params.batch_count; batch_idx += gridDim.z) {
-      int idx_col_k = threadIdx.x;
-      int idx_row_m = blockIdx.x * blockDim.y + threadIdx.y;
-
-      if (idx_row_m < params.problem_size.row()) {
-        // problem_size (row = m, column = k)
-        // matrix A (batch, m, k)
-        // vector B (batch, 1, k)
-        // vector C (batch, m, 1)
-        // vector D (batch, m, 1)
-
-        // move in the batch dimension
-        ElementA const *ptr_A = params.ref_A.data() + batch_idx * params.batch_stride_A;
-        ElementB const *ptr_B = params.ptr_B + batch_idx * params.batch_stride_B;
-
-        ElementC const *ptr_C = params.ptr_C + batch_idx * params.batch_stride_C;
-        ElementC *ptr_D = params.ptr_D + batch_idx * params.batch_stride_D;
-
-        // move in the k dimension
-        ptr_A += idx_col_k * kElementsPerAccess;
-        ptr_B += idx_col_k * kElementsPerAccess;
-
-        // move in the m dimension
-        ptr_A += idx_row_m * params.problem_size.column();
-        ptr_C += idx_row_m;
-        ptr_D += idx_row_m;
-
-        NumericArrayConverter<ElementAccumulator, ElementA, kElementsPerAccess, Round> srcA_converter;
-        NumericArrayConverter<ElementAccumulator, ElementB, kElementsPerAccess, Round> srcB_converter;
-
-        ElementAccumulator accum = 0.f;
-
-        FragmentB fragB;
-        FragmentA fragA;
-
-        int unroll_col_k = 0;
-
-        // rows of the rolling tile
-        int const tileA_k = kThreadsPerRow * kElementsPerAccess;
-
-        for (; unroll_col_k < params.problem_size.column() / tileA_k * tileA_k; unroll_col_k += tileA_k) {
-
-          // fetch from matrix A
-          arch::global_load<FragmentA,
-                            sizeof(FragmentA),
-                            arch::CacheOperation::LastUse>(fragA, (ptr_A + unroll_col_k), true);
-
-          // fetch from vector B
-          arch::global_load<FragmentB,
-                            sizeof(FragmentB),
-                            arch::CacheOperation::Always>(fragB, (ptr_B + unroll_col_k), true);
-
-          FragmentCompute fragB_Compute = srcB_converter(fragB);
-          FragmentCompute fragA_Compute = srcA_converter(fragA);
-
-          // Math
-          CUTLASS_PRAGMA_UNROLL
-          for (int e = 0; e < kElementsPerAccess; e++) {
-            accum += fragA_Compute.at(e) * fragB_Compute.at(e);
-          }
-        }
-
-        // calculate the rest of K elements
-        // each thread fetch 1 element each time
-        for (int k = unroll_col_k + idx_col_k; k < params.problem_size.column(); k += kThreadsPerRow) {
-          ElementB b = *(ptr_B - idx_col_k * kElementsPerAccess + k);
-          ElementA a = *(ptr_A - idx_col_k * kElementsPerAccess + k);
-
-          accum += ElementAccumulator(a) * ElementAccumulator(b);
-        }
-
-        EpilogueOutputOp output_op(params.output_op);
-        typename EpilogueOutputOp::FragmentOutput source_fragment;
-
-        // prefetch from source matrix C
-        if (output_op.is_source_needed()) {         
-          source_fragment[0] = *(ptr_C);
-        }
-
-        typename EpilogueOutputOp::FragmentAccumulator accum_fragment;
-        typename EpilogueOutputOp::FragmentOutput output_fragment;
-
-        for (int mask = (kThreadsPerRow >> 1); mask > 0; mask >>= 1) {
-          accum += __shfl_xor_sync(0xFFFFFFFF, accum, mask, 32);
-        }
-
-        if (idx_col_k == 0) {
-          accum_fragment[0] = accum;
-
-          if (output_op.is_source_needed()) {
-            output_fragment = output_op(accum_fragment, source_fragment);
-          }
-          else {
-            output_fragment = output_op(accum_fragment);
-          }
-
-          *ptr_D = output_fragment[0];
-        }
-      }
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemv_batched_strided.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemv_batched_strided.h
deleted file mode 100755
index 673f1995c..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemv_batched_strided.h
+++ /dev/null
@@ -1,244 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/array.h"
-
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-
-#include "cutlass/gemm/gemm.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-namespace detail
-{
-  template<typename ElementAlphaBeta, bool BetaIsZero>
-  struct GemvBatchedStridedEpilogueScaling
-  {
-    ElementAlphaBeta const & alpha;
-    ElementAlphaBeta const & beta;
-
-    CUTLASS_DEVICE
-    GemvBatchedStridedEpilogueScaling(ElementAlphaBeta& alpha_, ElementAlphaBeta& beta_) :
-      alpha(alpha_), beta(beta_)
-    { }
-
-    template<typename FragmentCD, typename FragmentAccumulator>
-    CUTLASS_DEVICE
-    void operator()(FragmentAccumulator& accumulators,
-                    FragmentCD const& fragment_C,
-                    FragmentCD& fragment_D) const
-    {
-      using AccType = typename FragmentAccumulator::value_type;
-      using CDType = typename FragmentCD::value_type;
-
-      static_assert(FragmentCD::kElements == FragmentAccumulator::kElements,
-                    "Mistmatch in fragment sizes.");
-
-      for (int i = 0; i < FragmentCD::kElements; ++i)
-      {
-        if (BetaIsZero)
-        {
-          fragment_D[i] = CDType(accumulators[i] * AccType(alpha));
-        }
-        else
-        {
-          fragment_D[i] = CDType(accumulators[i] * AccType(alpha)
-                                 + AccType(fragment_C[i]) * AccType(beta));
-        } 
-      } 
-    }
-  };
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename GemvKernel, typename ElementAlphaBeta, bool BetaIsZero=false>
-CUTLASS_DEVICE void GemvBatchedStridedDevice(
-  cutlass::gemm::BatchedGemmCoord problem_size,
-  ElementAlphaBeta alpha,
-  ElementAlphaBeta beta,
-  typename GemvKernel::IteratorA::TensorRef ref_A,
-  typename GemvKernel::IteratorA::TensorRef::LongIndex lda, 
-  typename GemvKernel::IteratorB::TensorRef ref_B,
-  typename GemvKernel::IteratorB::TensorRef::LongIndex ldb, 
-  typename GemvKernel::IteratorCD::TensorRef ref_C,
-  typename GemvKernel::IteratorCD::TensorRef::LongIndex ldc,
-  typename GemvKernel::IteratorCD::TensorRef ref_D,
-  typename GemvKernel::IteratorCD::TensorRef::LongIndex ldd)
-{
-  using ThreadBlockGemv = typename GemvKernel::ThreadBlockGemv;
-  using ThreadBlockSwizzle = typename GemvKernel::ThreadBlockSwizzle;
-  using EpilogueScale = detail::GemvBatchedStridedEpilogueScaling<ElementAlphaBeta, BetaIsZero>;
-
-  ThreadBlockSwizzle swizzler;
-
-  // Compute initial location in logical coordinates
-  BatchedGemmCoord tb_offset = swizzler.get_tile_offset();
-  int const batch_idx = swizzler.get_batch_idx();
-
-  // Offset to the batch
-  ref_A.add_pointer_offset(batch_idx*lda);
-  ref_B.add_pointer_offset(batch_idx*ldb);
-
-  // Construct iterators to A and B operands
-  typename GemvKernel::IteratorA::Params params_A(ref_A.layout());
-  typename GemvKernel::IteratorA iterator_A(
-      params_A,
-      ref_A.data(),
-      { 1, problem_size.k() },
-      0,
-      { 0, 0 });
-
-  typename GemvKernel::IteratorB::Params params_B(ref_B.layout());
-  typename GemvKernel::IteratorB iterator_B(
-      params_B,
-      ref_B.data(),
-      { problem_size.k(), problem_size.n() },
-      threadIdx.x,
-      { 0, tb_offset.n()*ThreadBlockGemv::Shape::kN });
-
-  //
-  // Main loop
-  //
-
-  // Construct thread-scoped matrix multiply
-  ThreadBlockGemv mma;
-
-  typename ThreadBlockGemv::FragmentC accumulators;
-  accumulators.clear();
-
-  // Compute threadblock-scoped gemv
-  mma(problem_size.mnk(), accumulators, iterator_A, iterator_B, accumulators);
-
-  //
-  // Epilogue
-  //
-  typename GemvKernel::FragmentCD fragment_CD;
-
-  // Load C (skip if beta is zero)
-  if (!BetaIsZero)
-  {
-    tb_offset = swizzler.get_tile_offset();
-    ref_C.add_pointer_offset(batch_idx*ldc);
-    typename GemvKernel::IteratorCD::Params params_C(ref_C.layout());
-    typename GemvKernel::IteratorCD iterator_C(
-        params_C,
-        ref_C.data(),
-        { 1, problem_size.n() },
-        threadIdx.x,
-        { 0, tb_offset.n()*ThreadBlockGemv::Shape::kN });
-    iterator_C.load(fragment_CD);
-  }
-
-  // Apply alpha/beta scaling
-  EpilogueScale epilogue_scale(alpha, beta);
-  epilogue_scale(accumulators, fragment_CD, fragment_CD);
-
-  // Store D
-  tb_offset = swizzler.get_tile_offset();
-  ref_D.add_pointer_offset(batch_idx*ldd);
-  typename GemvKernel::IteratorCD::Params params_D(ref_D.layout());
-  typename GemvKernel::IteratorCD iterator_D(
-      params_D,
-      ref_D.data(),
-      { 1, problem_size.n() },
-      threadIdx.x,
-      { 0, tb_offset.n()*ThreadBlockGemv::Shape::kN });
-  iterator_D.store(fragment_CD);
-}
-
-template <typename GemvKernel, typename ElementAlphaBeta, bool BetaIsZero>
-CUTLASS_GLOBAL void GemvBatchedStrided(
-  cutlass::gemm::BatchedGemmCoord problem_size,
-  ElementAlphaBeta alpha,
-  ElementAlphaBeta beta,
-  typename GemvKernel::IteratorA::TensorRef ref_A,
-  typename GemvKernel::IteratorA::TensorRef::LongIndex lda, 
-  typename GemvKernel::IteratorB::TensorRef ref_B,
-  typename GemvKernel::IteratorB::TensorRef::LongIndex ldb, 
-  typename GemvKernel::IteratorCD::TensorRef ref_C,
-  typename GemvKernel::IteratorCD::TensorRef::LongIndex ldc,
-  typename GemvKernel::IteratorCD::TensorRef ref_D,
-  typename GemvKernel::IteratorCD::TensorRef::LongIndex ldd)
-{
-  GemvBatchedStridedDevice<GemvKernel, ElementAlphaBeta, BetaIsZero>(
-    problem_size, alpha, beta, ref_A, lda, ref_B, ldb, ref_C, ldc, ref_D, ldd
-  );
-}
-
-template <typename GemvKernel, typename ElementAlphaBeta>
-CUTLASS_GLOBAL void GemvBatchedStrided(
-  cutlass::gemm::BatchedGemmCoord problem_size,
-  ElementAlphaBeta alpha,
-  typename GemvKernel::IteratorA::TensorRef ref_A,
-  typename GemvKernel::IteratorA::TensorRef::LongIndex lda, 
-  typename GemvKernel::IteratorB::TensorRef ref_B,
-  typename GemvKernel::IteratorB::TensorRef::LongIndex ldb, 
-  typename GemvKernel::IteratorCD::TensorRef ref_D,
-  typename GemvKernel::IteratorCD::TensorRef::LongIndex ldd)
-{
-  GemvBatchedStridedDevice<GemvKernel, ElementAlphaBeta, true>(
-    problem_size, alpha, ElementAlphaBeta(0), ref_A, lda, ref_B, ldb, ref_D, ldd, ref_D, ldd
-  );
-}
-
-template <typename GemvKernel>
-CUTLASS_GLOBAL void GemvBatchedStrided(
-  cutlass::gemm::BatchedGemmCoord problem_size,
-  typename GemvKernel::IteratorA::TensorRef ref_A,
-  typename GemvKernel::IteratorA::TensorRef::LongIndex lda, 
-  typename GemvKernel::IteratorB::TensorRef ref_B,
-  typename GemvKernel::IteratorB::TensorRef::LongIndex ldb, 
-  typename GemvKernel::IteratorCD::TensorRef ref_D,
-  typename GemvKernel::IteratorCD::TensorRef::LongIndex ldd)
-{
-  using ElementAlphaBeta = typename GemvKernel::IteratorCD::Element;
-  GemvBatchedStridedDevice<GemvKernel, ElementAlphaBeta, true>(
-    problem_size, ElementAlphaBeta(1), ElementAlphaBeta(0), ref_A, lda, ref_B, ldb, ref_D, ldd, ref_D, ldd
-  );
-}
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/grouped_problem_visitor.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/grouped_problem_visitor.h
deleted file mode 100755
index 31787372a..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/grouped_problem_visitor.h
+++ /dev/null
@@ -1,463 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Base scheduler for grouped problems
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_coord.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Enumerated type describing the type of scheduling to perform for the ProblemVisitor
-enum class GroupScheduleMode {
-  // Perform all scheduling on device
-  kDeviceOnly,
-  // Precompute on the host the full sequence of problems to access
-  kHostPrecompute
-};
-
-/// Visitor class to abstract away the algorithm for iterating over tiles
-template <typename ProblemSizeHelper,
-          typename ThreadblockShape_>
-struct BaseGroupedProblemVisitor {
-  using ThreadblockShape = ThreadblockShape_;
-
-  struct ProblemInfo {
-    static int32_t const kNoPrefetchEntry = -1;
-    int32_t problem_idx;
-    int32_t problem_start;
-
-    CUTLASS_DEVICE
-    ProblemInfo() : problem_idx(kNoPrefetchEntry), problem_start(kNoPrefetchEntry) {}
-
-    CUTLASS_DEVICE
-    ProblemInfo(int32_t problem_idx_, int32_t problem_start_) :
-      problem_idx(problem_idx_), problem_start(problem_start_) {}
-  };
-
-  struct Params {
-    cutlass::gemm::GemmCoord const *problem_sizes;
-    int32_t                         problem_count;
-    void const                     *workspace;
-    int32_t                         tile_count;
-
-    //
-    // Methods
-    //
-
-    /// Ctor
-    CUTLASS_HOST_DEVICE
-    Params(): problem_sizes(nullptr), problem_count(0), workspace(nullptr), tile_count(0) { }
-
-    /// Ctor
-    CUTLASS_HOST_DEVICE
-    Params(
-      cutlass::gemm::GemmCoord const *problem_sizes,
-      int32_t                         problem_count,
-      void const                     *workspace = nullptr,
-      int32_t                         tile_count = 0
-    ):
-      problem_sizes(problem_sizes),
-      problem_count(problem_count),
-      workspace(workspace),
-      tile_count(tile_count)
-    {}
-
-  };
-
-  Params params;
-  int32_t tile_idx;
-  int32_t problem_tile_start;
-  int32_t problem_idx;
-
-  //
-  // Methods
-  //
-  CUTLASS_DEVICE
-  BaseGroupedProblemVisitor(
-    Params const &params_,
-    int32_t block_idx
-  ):
-  params(params_),
-  tile_idx(block_idx),
-  problem_tile_start(0),
-  problem_idx(0)
-  {}
-
-  /// Get the grid shape
-  CUTLASS_HOST_DEVICE
-  static cutlass::gemm::GemmCoord grid_shape(const cutlass::gemm::GemmCoord& problem) {
-    return ProblemSizeHelper::grid_shape(problem);
-  }
-
-  /// Gets the global tile index
-  CUTLASS_HOST_DEVICE
-  int32_t tile_index() const {
-    return tile_idx;
-  }
-
-  /// Gets the index of the problem
-  CUTLASS_HOST_DEVICE
-  int32_t problem_index() const {
-    return problem_idx;
-  }
-
-  CUTLASS_HOST_DEVICE
-  int32_t threadblock_idx() const {
-    return tile_idx - problem_tile_start;
-  }
-
-  CUTLASS_DEVICE
-  void advance(int32_t grid_size) {
-    tile_idx += grid_size;
-  }
-
-  CUTLASS_HOST_DEVICE
-  static void possibly_transpose_problem(cutlass::gemm::GemmCoord& problem) {
-    ProblemSizeHelper::possibly_transpose_problem(problem);
-  }
-
-  /// Returns the problem size for the current problem
-  CUTLASS_HOST_DEVICE
-  cutlass::gemm::GemmCoord problem_size() const {
-    GemmCoord problem = params.problem_sizes[problem_idx];
-    ProblemSizeHelper::possibly_transpose_problem(problem);
-    return problem;
-  }
-
-  CUTLASS_HOST_DEVICE
-  static int32_t tile_count(const cutlass::gemm::GemmCoord& grid) {
-    return ProblemSizeHelper::tile_count(grid);
-  }
-
-  static int32_t group_tile_count(const cutlass::gemm::GemmCoord* host_problem_sizes_ptr, int32_t problem_count) {
-    int32_t total_tiles = 0;
-    for (int32_t i = 0; i < problem_count; ++i) {
-      auto problem = host_problem_sizes_ptr[i];
-      possibly_transpose_problem(problem);
-      auto grid = grid_shape(problem);
-      total_tiles += tile_count(grid);
-    }
-
-    return total_tiles;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename ProblemSizeHelper,
-  typename ThreadblockShape,
-  GroupScheduleMode GroupScheduleMode_,
-  int PrefetchTileCount,
-  int ThreadCount
->
-struct GroupedProblemVisitor;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// ProblemVisitor that performs all scheduling on device
-//
-template <typename ProblemSizeHelper,
-          typename ThreadblockShape,
-          int PrefetchTileCount,
-          int ThreadCount>
-struct GroupedProblemVisitor<ProblemSizeHelper,
-                             ThreadblockShape,
-                             GroupScheduleMode::kDeviceOnly,
-                             PrefetchTileCount,
-                             ThreadCount>: public BaseGroupedProblemVisitor<ProblemSizeHelper, ThreadblockShape> {
-  using Base = BaseGroupedProblemVisitor<ProblemSizeHelper, ThreadblockShape>;
-  using Params = typename Base::Params;
-  static int const kThreadCount = ThreadCount;
-  static bool const kRequiresPrecomputation = false;
-  static int const kThreadsPerWarp = 32;
-
-  struct SharedStorage {};
-
-  // Final tile of the problem loaded by this thread. Each thread will hold
-  // a separate value.
-  int32_t problem_ending_tile;
-
-  SharedStorage &shared_storage;
-
-  //
-  // Methods
-  //
-  CUTLASS_DEVICE
-  GroupedProblemVisitor(
-    Params const &params_,
-    SharedStorage &shared_storage_,
-    int32_t block_idx
-  ): Base(params_, block_idx),
-  problem_ending_tile(0),
-  shared_storage(shared_storage_)
-  {
-    this->problem_idx = -1 * kThreadsPerWarp;
-    this->problem_tile_start = 0;
-  }
-
-  CUTLASS_DEVICE
-  bool next_tile() {
-    // Check whether the tile to compute is within the range of the current problem.
-    int32_t problem_tile_end = __shfl_sync(0xffffffff, problem_ending_tile, this->problem_idx % kThreadsPerWarp);
-    if (this->tile_idx < problem_tile_end) {
-      return true;
-    }
-
-    // Check whether the tile to compute is within the current group of problems fetched by the warp.
-    // The last tile for this group is the final tile of the problem held by the final thread in the warp.
-    int32_t group_tile_end = __shfl_sync(0xffffffff, problem_ending_tile, kThreadsPerWarp-1);
-
-    // Keep the starting problem for this group in `problem_idx`. This is done to reduce
-    // register pressure. The starting problem for this group is simply the first problem
-    // in the group most recently fetched by the warp.
-    int32_t &group_problem_start = this->problem_idx;
-    group_problem_start = (this->problem_idx / kThreadsPerWarp) * kThreadsPerWarp;
-
-    // Keep the starting tile for this group in `problem_tile_start`. This is done to reduce
-    // register pressure.
-    int32_t &group_tile_start = this->problem_tile_start;
-
-    // Each thread in the warp processes a separate problem to advance until
-    // reaching a problem whose starting tile is less less than tile_idx.
-    while (group_tile_end <= this->tile_idx) {
-      group_problem_start += kThreadsPerWarp;
-      if (group_problem_start > this->params.problem_count) {
-        return false;
-      }
-
-      // Since `group_tile_start` is a reference to `this->problem_tile_start`, this
-      // also sets `this->problem_tile_start`. The fact that `this->problem_tile_start`
-      // is also set here is used later in `next_tile`.
-      group_tile_start = group_tile_end;
-
-      int lane_idx = threadIdx.x % kThreadsPerWarp;
-      int32_t lane_problem = group_problem_start + lane_idx;
-
-      // Compute the number of tiles in the problem assigned to each thread.
-      problem_ending_tile = 0;
-      if (lane_problem < this->params.problem_count) {
-        cutlass::gemm::GemmCoord problem = this->params.problem_sizes[lane_problem];
-        this->possibly_transpose_problem(problem);
-        cutlass::gemm::GemmCoord grid = this->grid_shape(problem);
-        problem_ending_tile = this->tile_count(grid);
-      }
-
-      // Compute a warp-wide inclusive prefix sum to compute the ending tile index of
-      // each thread's problem.
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 1; i < kThreadsPerWarp; i <<= 1) {
-        int32_t val = __shfl_up_sync(0xffffffff, problem_ending_tile, i);
-        if (lane_idx >= i) {
-          problem_ending_tile += val;
-        }
-      }
-
-      // The total tile count for this group is now in the final position of the prefix sum
-      int32_t tiles_in_group = __shfl_sync(0xffffffff, problem_ending_tile, kThreadsPerWarp-1);
-
-      problem_ending_tile += group_tile_start;
-      group_tile_end += tiles_in_group;
-    }
-
-    // The next problem to process is the first one that does not have ending tile position
-    // that is greater than or equal to tile index.
-    int32_t problem_idx_in_group =
-        __popc(__ballot_sync(0xffffffff, problem_ending_tile <= this->tile_idx));
-
-    this->problem_idx = group_problem_start + problem_idx_in_group;
-
-    // The starting tile for this problem is the ending tile of the previous problem. In cases
-    // where `problem_idx_in_group` is the first problem in the group, we do not need to reset
-    // `problem_tile_start`, because it is set to the previous group's ending tile in the while
-    // loop above.
-    if (problem_idx_in_group > 0) {
-      this->problem_tile_start = __shfl_sync(0xffffffff, problem_ending_tile, problem_idx_in_group - 1);
-    }
-
-    return true;
-  }
-
-  static size_t get_workspace_size(const cutlass::gemm::GemmCoord* host_problem_sizes_ptr,
-                                   int32_t problem_count,
-                                   int32_t block_count) {
-    return 0;
-  }
-
-  static void host_precompute(const cutlass::gemm::GemmCoord* host_problem_sizes_ptr,
-                              int32_t problem_count,
-                              int32_t block_count,
-                              void* host_workspace_ptr) {}
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Precomputes schedule on host and prefetches into shared memory
-//
-template <typename ProblemSizeHelper,
-          typename ThreadblockShape,
-          int PrefetchTileCount,
-          int ThreadCount>
-struct GroupedProblemVisitor<ProblemSizeHelper,
-                             ThreadblockShape,
-                             GroupScheduleMode::kHostPrecompute,
-                             PrefetchTileCount,
-                             ThreadCount> : public BaseGroupedProblemVisitor<ProblemSizeHelper, ThreadblockShape> {
-  static_assert(PrefetchTileCount > 0,
-                "GroupedProblemVisitor with GroupScheduleMode `kHostPrecompute` currently requires prefetching to shared memory");
-
-  using Base = BaseGroupedProblemVisitor<ProblemSizeHelper, ThreadblockShape>;
-  using Params = typename Base::Params;
-  using ProblemInfo = typename Base::ProblemInfo;
-  static bool const kRequiresPrecomputation = true;
-
-  static int const kPrefetchTileCount = PrefetchTileCount;
-  static int const kThreadCount = ThreadCount;
-
-  struct SharedStorage {
-    // Sequence of problem IDs and starting tiles to compute
-    cutlass::Array<ProblemInfo, kPrefetchTileCount> prefetched_problems;
-  };
-
-  int32_t tiles_computed;
-  int32_t iterations_per_block;
-  int32_t block_load_start;
-  SharedStorage &shared_storage;
-  ProblemInfo const *problem_info_ptr;
-
-  //
-  // Methods
-  //
-  CUTLASS_DEVICE
-  GroupedProblemVisitor(
-    Params const &params_,
-    SharedStorage &shared_storage_,
-    int32_t block_idx
-  ): Base(params_, block_idx),
-  tiles_computed(0),
-  shared_storage(shared_storage_),
-  problem_info_ptr(reinterpret_cast<ProblemInfo const*>(params_.workspace))
-  {
-    iterations_per_block = (params_.tile_count - 1 + gridDim.x) / gridDim.x;
-    block_load_start = iterations_per_block * block_idx;
-    // Start prefetching the first set of tiles to compute
-    prefetch_tiles();
-  }
-
-  CUTLASS_DEVICE
-  bool next_tile() {
-    if (this->tile_idx >= this->params.tile_count) {
-      return false;
-    }
-
-    int32_t prefetch_idx = (tiles_computed % kPrefetchTileCount);
-    if (prefetch_idx == 0) {
-      // Ensure all previous stores to shared memory have been completed
-      __syncthreads();
-    }
-
-    auto problem_info = shared_storage.prefetched_problems[prefetch_idx];
-    ++tiles_computed;
-
-    if ((tiles_computed % kPrefetchTileCount) == 0) {
-      // Begin prefetching next set of tiles. Synchronize first to ensure that
-      // we don't overwrite the current buffer while someone else is using it.
-      __syncthreads();
-      prefetch_tiles();
-    }
-
-    this->problem_idx = problem_info.problem_idx;
-    this->problem_tile_start = problem_info.problem_start;
-
-    return true;
-  }
-
-  static size_t get_workspace_size(const cutlass::gemm::GemmCoord* host_problem_sizes_ptr,
-                                   int32_t problem_count,
-                                   int32_t block_count) {
-    int32_t total_tiles = Base::group_tile_count(host_problem_sizes_ptr, problem_count);
-    int32_t entries_per_block = ((total_tiles - 1 + block_count) / block_count);
-    return sizeof(ProblemInfo) * entries_per_block * block_count;
-  }
-#if !defined(__CUDACC_RTC__)
-  static void host_precompute(const cutlass::gemm::GemmCoord* host_problem_sizes_ptr,
-                              int32_t problem_count,
-                              int32_t block_count,
-                              void* host_workspace_ptr) {
-    ProblemInfo* host_problem_info_ptr = reinterpret_cast<ProblemInfo*>(host_workspace_ptr);
-    int32_t total_tiles = Base::group_tile_count(host_problem_sizes_ptr, problem_count);
-    int32_t entries_per_block = (total_tiles - 1 + block_count) / block_count;
-
-    int tile = 0;
-    int start_tile = 0;
-    for (int p_idx = 0; p_idx < problem_count; ++p_idx) {
-      auto problem = host_problem_sizes_ptr[p_idx];
-      Base::possibly_transpose_problem(problem);
-      auto grid = Base::grid_shape(problem);
-      int tiles = Base::tile_count(grid);
-      ProblemInfo problem_info(p_idx, start_tile);
-      for (int i = 0; i < tiles; ++i, ++tile) {
-        host_problem_info_ptr[(entries_per_block * (tile % block_count)) + (tile / block_count)] = problem_info;
-      }
-      start_tile += tiles;
-    }
-  }
-#endif
-private:
-  CUTLASS_DEVICE
-  void prefetch_tiles() {
-    CUTLASS_PRAGMA_UNROLL
-    for (int32_t i = 0; i < kPrefetchTileCount; i += kThreadCount) {
-      int32_t offset = threadIdx.x + i;
-      if (offset < kPrefetchTileCount && (tiles_computed + offset < iterations_per_block)) {
-        shared_storage.prefetched_problems[offset] = problem_info_ptr[block_load_start + tiles_computed + offset];
-      }
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/params_sparse_base.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/params_sparse_base.h
deleted file mode 100755
index 6080e7994..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/params_sparse_base.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Base functionality for common types of sparse GEMM kernel parameters
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Parameters structure
-template <
-  typename ThreadblockSwizzle,
-  typename ParamsA,
-  typename TensorRefA,
-  typename ParamsB,
-  typename TensorRefB,
-  typename ParamsE,
-  typename TensorRefE>
-struct SparseParamsBase
-{
-  //
-  // Data members
-  //
-
-  cutlass::gemm::GemmCoord problem_size{};
-  cutlass::gemm::GemmCoord grid_tiled_shape{};
-  int swizzle_log_tile;
-  ParamsA params_A{};
-  TensorRefA ref_A{};
-  ParamsB params_B{};
-  TensorRefB ref_B{};
-  ParamsE params_E{};
-  TensorRefE ref_E{};
-  int gemm_k_iterations{0};
-  int gemm_k_size{0};
-
-  //
-  // Host dispatch API
-  //
-
-  /// Default constructor
-  SparseParamsBase() = default;
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  SparseParamsBase(
-    cutlass::gemm::GemmCoord const & problem_size,
-    cutlass::gemm::GemmCoord const & grid_tiled_shape,
-    TensorRefA ref_A,
-    TensorRefB ref_B,
-    TensorRefE ref_E,
-    int const mma_shape_k)
-  :
-    problem_size(problem_size),
-    grid_tiled_shape(grid_tiled_shape),
-    swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)),
-    params_A(ref_A.layout()),
-    ref_A(ref_A),
-    params_B(ref_B.layout()),
-    ref_B(ref_B),
-    params_E(ref_E.layout()),
-    ref_E(ref_E)
-  {
-    int total_gemm_k_iterations = (problem_size.k() + mma_shape_k - 1) / mma_shape_k;
-    int gemm_k_iterations = (total_gemm_k_iterations + grid_tiled_shape.k() - 1) / grid_tiled_shape.k();
-
-    gemm_k_size = gemm_k_iterations * mma_shape_k;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/params_universal_base.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/params_universal_base.h
deleted file mode 100755
index 86986f2e2..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/params_universal_base.h
+++ /dev/null
@@ -1,264 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Base functionality for common types of universal GEMM kernel parameters
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/trace.h"
-#include "cutlass/gemm/gemm.h"
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace util {
-
-template <class LayoutA, class LayoutB>
-CUTLASS_HOST_DEVICE
-static bool 
-is_continous_k_aligned(GemmCoord problem_size, size_t alignmentA, size_t alignmentB) {
-  return (platform::is_same<LayoutA, layout::RowMajor>::value && (problem_size.k() % alignmentA) == 0) ||
-         (platform::is_same<LayoutB, layout::ColumnMajor>::value && (problem_size.k() % alignmentB) == 0);
-}
-
-}  // namespace util
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Argument structure
-struct UniversalArgumentsBase
-{
-  //
-  // Data members
-  //
-
-  GemmUniversalMode mode = cutlass::gemm::GemmUniversalMode::kGemm;
-  GemmCoord problem_size{};
-  int batch_count{1};
-  int64_t batch_stride_D{0};
-
-  //
-  // Methods
-  //
-
-  UniversalArgumentsBase() = default;
-
-  /// constructs an arguments structure
-  UniversalArgumentsBase(
-    GemmUniversalMode mode,
-    GemmCoord problem_size,
-    int batch_count,
-    int64_t batch_stride_D)
-  :
-    mode(mode),
-    problem_size(problem_size),
-    batch_count(batch_count),
-    batch_stride_D(batch_stride_D)
-  {
-    CUTLASS_TRACE_HOST("GemmUniversal::Arguments::Arguments() - problem_size: " << problem_size);
-  }
-};
-
-
-/// Parameters structure
-template <
-  typename ThreadblockSwizzle,
-  typename ThreadblockShape,
-  typename ElementA,
-  typename ElementB,
-  typename ElementC,
-  typename LayoutA,
-  typename LayoutB>
-struct UniversalParamsBase
-{
-  //
-  // Data members
-  //
-
-  GemmCoord problem_size{};
-  GemmCoord grid_tiled_shape{};
-  int swizzle_log_tile{0};
-  GemmUniversalMode mode = cutlass::gemm::GemmUniversalMode::kGemm;
-  int batch_count {0};
-  int gemm_k_size {0};
-  int64_t batch_stride_D {0};
-  int *semaphore = nullptr;
-
-
-  //
-  // Host dispatch API
-  //
-
-  /// Default constructor
-  UniversalParamsBase() = default;
-
-  /// Constructor
-  UniversalParamsBase(
-    UniversalArgumentsBase const &args, /// GEMM application arguments
-    int device_sms,                     /// Number of SMs on the device
-    int sm_occupancy)                   /// Kernel SM occupancy (in thread blocks)
-  :
-    problem_size(args.problem_size),
-    mode(args.mode),
-    batch_count(args.batch_count),
-    batch_stride_D(args.batch_stride_D),
-    semaphore(nullptr)
-  {
-    init_grid_tiled_shape();
-  }
-
-  /// Returns the workspace size (in bytes) needed for this problem geometry
-  size_t get_workspace_size() const
-  {
-    size_t workspace_bytes = 0;
-    if (mode == GemmUniversalMode::kGemmSplitKParallel)
-    {
-      // Split-K parallel always requires a temporary workspace
-      workspace_bytes =
-        sizeof(ElementC) *
-        size_t(batch_stride_D) *
-        size_t(grid_tiled_shape.k());
-    }
-    else if (mode == GemmUniversalMode::kGemm && grid_tiled_shape.k() > 1)
-    {
-      // Serial split-K only requires a temporary workspace if the number of partitions along the
-      // GEMM K dimension is greater than one.
-      workspace_bytes = sizeof(int) * size_t(grid_tiled_shape.m()) * size_t(grid_tiled_shape.n());
-    }
-
-    return workspace_bytes;
-  }
-
-
-  /// Assign and initialize the specified workspace buffer.  Assumes
-  /// the memory allocated to workspace is at least as large as get_workspace_size().
-  Status init_workspace(
-    void *workspace,
-    cudaStream_t stream = nullptr)
-  {
-    semaphore = static_cast<int *>(workspace);
-    // Zero-initialize entire workspace
-    if (semaphore)
-    {
-      size_t workspace_bytes = get_workspace_size();
-
-      CUTLASS_TRACE_HOST("  Initialize " << workspace_bytes << " workspace bytes");
-
-      cudaError_t result = cudaMemsetAsync(
-        semaphore,
-        0,
-        workspace_bytes,
-        stream);
-
-      if (result != cudaSuccess) {
-        CUTLASS_TRACE_HOST("  cudaMemsetAsync() returned error " << cudaGetErrorString(result));
-        return Status::kErrorInternal;
-      }
-    }
-
-    return Status::kSuccess;
-  }
-
-
-  /// Returns the GEMM volume in thread block tiles
-  GemmCoord get_tiled_shape() const
-  {
-    return grid_tiled_shape;
-  }
-
-
-  /// Returns the total number of thread blocks to launch
-  int get_grid_blocks() const
-  {
-    dim3 grid_dims = get_grid_dims();
-    return grid_dims.x * grid_dims.y * grid_dims.z;
-  }
-
-
-  /// Returns the grid extents in thread blocks to launch
-  dim3 get_grid_dims() const
-  {
-    return ThreadblockSwizzle().get_grid_shape(grid_tiled_shape);
-  }
-
-private:
-  CUTLASS_HOST_DEVICE
-  void init_grid_tiled_shape() {
-    // Get GEMM volume in thread block tiles
-    grid_tiled_shape = ThreadblockSwizzle::get_tiled_shape(
-      problem_size,
-      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-      batch_count);
-
-    swizzle_log_tile = ThreadblockSwizzle::get_log_tile(grid_tiled_shape);
-
-    // Determine extent of K-dimension assigned to each block
-    gemm_k_size = problem_size.k();
-
-    if (mode == GemmUniversalMode::kGemm || mode == GemmUniversalMode::kGemmSplitKParallel)
-    {
-      static const uint32_t CACHELINE_BYTES = 128;
-      static const size_t element_bytes_a = sizeof(ElementA);
-      static const size_t element_bytes_b = sizeof(ElementB);
-      static const size_t cacheline_elements_a = CACHELINE_BYTES / element_bytes_a;
-      static const size_t cacheline_elements_b = CACHELINE_BYTES / element_bytes_b;
-
-      const bool cacheline_alignment_needed =
-          util::is_continous_k_aligned<LayoutA, LayoutB>(problem_size, cacheline_elements_a, cacheline_elements_b);
-
-      int const kAlignK = const_max(
-                                    const_max(128 / sizeof_bits<ElementA>::value, 128 / sizeof_bits<ElementB>::value),
-                                    cacheline_alignment_needed ? const_max(cacheline_elements_a, cacheline_elements_b) : 1);
-
-      gemm_k_size = round_up(ceil_div(problem_size.k(), batch_count), kAlignK);
-      if (gemm_k_size) {
-        grid_tiled_shape.k() = ceil_div(problem_size.k(), gemm_k_size);
-      }
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_2k_grouped.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_2k_grouped.h
deleted file mode 100755
index 6b36db21a..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_2k_grouped.h
+++ /dev/null
@@ -1,688 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Grouped Rank2K kernel.
-*/
-
-#pragma once
-
-#include "cutlass/blas3.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/complex.h"
-
-#include "cutlass/layout/matrix.h"
-#include "cutlass/trace.h"
-#include "cutlass/gemm/kernel/rank_2k_transpose_operands.h"
-#include "cutlass/gemm/kernel/rank_2k_grouped_problem_visitor.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Mma1_,                          ///! Threadblock-scoped matrix multiply-accumulate (A*B^T)
-  typename Mma2_,                          ///! Threadblock-scoped matrix multiply-accumulate (B*A^T)
-  typename Epilogue_,                      ///! Epilogue
-  typename ThreadblockSwizzle_,            ///! Threadblock swizzling function
-  ComplexTransform OriginalTransformA_,    ///! Public-facing transformation on A
-  ComplexTransform OriginalTransformB_,    ///! Public-facing transformation on B
-  FillMode FillModeC_,                     ///! Fill Mode for C (kLower or kUpper)
-  BlasMode BlasMode_,                      ///! Blas3 computation mode
-  GroupScheduleMode GroupScheduleMode_,    ///! Type of scheduling to perform
-  bool Transposed = false
->
-struct Rank2KGrouped {
-public:
-
-  using Mma1 = Mma1_;
-  using Mma2 = Mma2_;
-
-  static_assert(platform::is_same<typename Mma1::LayoutC, cutlass::layout::RowMajor>::value &&
-                platform::is_same<typename Mma2::LayoutC, cutlass::layout::RowMajor>::value,
-                "Kernel-level grouped Rank2K requires that LayoutC be row major.");
-
-  // Define generic Mma for usecases that use Kernel::Mma
-  using Mma = Mma1_;
-
-  using Epilogue = Epilogue_;
-  using EpilogueOutputOp = typename Epilogue::OutputOp;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  static GroupScheduleMode const kGroupScheduleMode = GroupScheduleMode_;
-  static bool const kTransposed = Transposed;
-
-  // Public-facing type definitions related to operand element type, layout, and complex conjugate
-  // operation. Must interact with the 'kTransposed' notion to reflect the original layout,
-  // fill mode, etc. passed in.
-  //
-  // Recall that a Rank2K operation performs (A x BT) + (B x AT)
-  // This is performed via:
-  //    Mma1 = (A x BT)
-  //    Mma2 = (B x AT)
-  //
-  // However, if C needs to be transposed, then this is changed to the following:
-  //    Mma1 = (B x AT)
-  //    Mma2 = (A x BT)
-  //
-  // The transformation above is achieved by swapping the Layouts/Elements/Transforms/etc.
-  // of A and B as they are passed into the instantiations of Mma1 and Mma2.
-  //
-  // Now, given access to only Mma1 and Mma2, as well as whether a transposition has occurred,
-  // we wish to retrieve the original Layouts/Elements/etc. for A and B that were passed into
-  // the device-level call.
-  //
-  // The logic to do this (which is made clearer by referencing the above instantiations) is as follows:
-  //   LayoutA = kTransposed ? Mma2::LayoutA : Mma1::LayoutA
-  //   LayoutB = kTransposed ? Mma1::LayoutA : Mma2::LayoutA
-  //
-  // We achieve this swapping by passing Mma1::*A and Mma2::*B to Rank2KMapArguments:
-  using MapArgumentsA = kernel::detail::Rank2KMapArguments<
-    typename Mma1::IteratorA::Element,
-    typename Mma1::IteratorA::Layout,
-    Mma1::kTransformA,
-    Mma1::IteratorA::AccessType::kElements,
-    typename Mma2::IteratorA::Element,
-    typename Mma2::IteratorA::Layout,
-    Mma2::kTransformA,
-    Mma2::IteratorA::AccessType::kElements,
-    typename Mma1::LayoutC,
-    FillModeC_,
-    kTransposed
-  >;
-
-  using ElementA = typename MapArgumentsA::ElementA;
-  using LayoutA = typename MapArgumentsA::LayoutA;
-  static int const kAlignmentA = MapArgumentsA::kAlignmentA;
-
-  using MapArgumentsB = kernel::detail::Rank2KMapArguments<
-    typename Mma2::IteratorA::Element,
-    typename Mma2::IteratorA::Layout,
-    Mma2::kTransformA,
-    Mma2::IteratorA::AccessType::kElements,
-    typename Mma1::IteratorA::Element,
-    typename Mma1::IteratorA::Layout,
-    Mma1::kTransformA,
-    Mma1::IteratorA::AccessType::kElements,
-    typename Mma2::LayoutC,
-    FillModeC_,
-    kTransposed
-  >;
-
-  using ElementB = typename MapArgumentsB::ElementA;
-  using LayoutB = typename MapArgumentsB::LayoutA;
-  static int const kAlignmentB = MapArgumentsB::kAlignmentA;
-
-  // Use the user-provided TransformA and TransformB, rather than those
-  // resulting from MapArguments, because Mma1 and Mma2 may have different
-  // complex transforms than those passed in by the user.
-  // (See kernel/rank_2k_complex.h for an example of this)
-  static cutlass::ComplexTransform const kTransformA = OriginalTransformA_;
-  static cutlass::ComplexTransform const kTransformB = OriginalTransformB_;
-
-  using ElementC = typename Epilogue::OutputTileIterator::Element;
-  using LayoutC = typename MapArgumentsA::LayoutC;
-  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
-  static FillMode const kFillModeC = MapArgumentsA::kFillModeC;
-
-  // Common type definitions for Mma1 and Mma2
-  using Operator = typename Mma1::Operator;
-  using OperatorClass = typename Mma1::Operator::OperatorClass;
-  using ThreadblockShape = typename Mma1::Shape;
-  using WarpShape = typename Mma1::Operator::Shape;
-  using InstructionShape = typename Mma1::Policy::Operator::InstructionShape;
-  using ArchTag = typename Mma1::ArchTag;
-
-  static int const kStages = Mma1::kStages;
-  static BlasMode const kBlasMode = BlasMode_;
-
-private:
-  static FillMode const kInternalFillModeC = FillModeC_;
-
-public:
-
-  /// Warp count (concept: GemmShape)
-  using WarpCount = typename Mma1::WarpCount;
-  static int const kThreadCount = 32 * WarpCount::kCount;
-
-  using ProblemVisitor = Rank2KGroupedProblemVisitor<
-                            ThreadblockShape,
-                            kGroupScheduleMode,
-                            kThreadCount,
-                            kThreadCount,
-                            kInternalFillModeC>;
-
-  //
-  // Structures
-  //
-
-  /// Argument structure
-  struct Arguments {
-
-    //
-    // Data members
-    //
-
-    GemmUniversalMode mode = GemmUniversalMode::kGemm;
-    GemmCoord *problem_sizes = nullptr;
-    int problem_count{0};
-    int threadblock_count{0};
-
-    typename EpilogueOutputOp::Params epilogue;
-
-    ElementA ** ptr_A = nullptr;
-    ElementB ** ptr_B = nullptr;
-    ElementC ** ptr_C = nullptr;
-    ElementC ** ptr_D = nullptr;
-
-    typename LayoutA::Stride::LongIndex *lda = nullptr;
-    typename LayoutB::Stride::LongIndex *ldb = nullptr;
-    typename LayoutC::Stride::LongIndex *ldc = nullptr;
-    typename LayoutC::Stride::LongIndex *ldd = nullptr;
-
-    // Only used by device-level operator
-    GemmCoord *host_problem_sizes = nullptr;
-
-    bool allow_early_exit = false;
-
-    //
-    // Methods
-    //
-
-    /// Default ctor
-    Arguments() = default;
-
-    /// Ctor
-    CUTLASS_HOST_DEVICE
-    Arguments(
-      GemmUniversalMode mode,
-      GemmCoord *problem_sizes,
-      int problem_count,
-      int threadblock_count,
-      typename EpilogueOutputOp::Params epilogue,
-      ElementA ** ptr_A,
-      ElementB ** ptr_B,
-      ElementC ** ptr_C,
-      ElementC ** ptr_D,
-      typename LayoutA::Stride::LongIndex *lda,
-      typename LayoutB::Stride::LongIndex *ldb,
-      typename LayoutC::Stride::LongIndex *ldc,
-      typename LayoutC::Stride::LongIndex *ldd,
-      GemmCoord *host_problem_sizes=nullptr,
-      bool allow_early_exit=false
-    ):
-      mode(mode),
-      problem_sizes(problem_sizes),
-      problem_count(problem_count),
-      threadblock_count(threadblock_count),
-      epilogue(epilogue),
-      ptr_A(ptr_A),
-      ptr_B(ptr_B),
-      ptr_C(ptr_C),
-      ptr_D(ptr_D),
-      lda(lda),
-      ldb(ldb),
-      ldc(ldc),
-      ldd(ldd),
-      host_problem_sizes(host_problem_sizes),
-      allow_early_exit(allow_early_exit)
-    {
-
-    }
-
-  };
-
-  //
-  // Structure for precomputing values in host memory and passing to kernels
-  //
-
-  /// Parameters structure
-  struct Params {
-
-    typename ProblemVisitor::Params problem_visitor{};
-    int threadblock_count = 0;
-
-    typename EpilogueOutputOp::Params output_op{};
-
-    GemmUniversalMode mode = cutlass::gemm::GemmUniversalMode::kGemm;
-    int batch_count = 0;
-
-    ElementA** ptr_A = nullptr;
-    ElementB** ptr_B = nullptr;
-    ElementC** ptr_C = nullptr;
-    ElementC** ptr_D = nullptr;
-
-    typename LayoutA::Stride::LongIndex* lda = nullptr;
-    typename LayoutB::Stride::LongIndex* ldb = nullptr;
-    typename LayoutC::Stride::LongIndex* ldc = nullptr;
-    typename LayoutC::Stride::LongIndex* ldd = nullptr;
-
-    bool allow_early_exit = false;
-
-    //
-    // Methods
-    //
-
-    Params() = default;
-
-    CUTLASS_HOST_DEVICE
-    Params(Arguments const &args, void *workspace = nullptr, int tile_count = 0):
-      problem_visitor(args.problem_sizes, args.problem_count, workspace, tile_count),
-      threadblock_count(args.threadblock_count),
-      output_op(args.epilogue),
-      ptr_A(args.ptr_A),
-      ptr_B(args.ptr_B),
-      ptr_C(args.ptr_C),
-      ptr_D(args.ptr_D),
-      lda(args.lda),
-      ldb(args.ldb),
-      ldc(args.ldc),
-      ldd(args.ldd),
-      allow_early_exit(args.allow_early_exit)
-    {
-
-    }
-
-    CUTLASS_HOST_DEVICE
-    void update(
-      Arguments const &args,
-      void *workspace = nullptr,
-      int tile_count = 0) {
-
-      problem_visitor = typename ProblemVisitor::Params(args.problem_sizes, args.problem_count, workspace, tile_count);
-      threadblock_count = args.threadblock_count;
-      output_op = args.output_op;
-      ptr_A = args.ptr_A;
-      ptr_B = args.ptr_B;
-      ptr_C = args.ptr_C;
-      ptr_D = args.ptr_D;
-    }
-  };
-
-  /// Shared memory storage structure
-  struct SharedStorage {
-    union {
-      typename Mma1::SharedStorage mma1_main_loop;
-      typename Mma2::SharedStorage mma2_main_loop;
-      typename Epilogue::SharedStorage epilogue;
-    } kernel;
-
-    // ProblemVisitor shared storage can't be overlapped with others
-    typename ProblemVisitor::SharedStorage problem_visitor;
-  };
-
-public:
-
-  //
-  // Methods
-  //
-
-  Rank2KGrouped() = default;
-
-  /// Determines whether kernel satisfies alignment
-  static Status can_implement(cutlass::gemm::GemmCoord const & problem_size) {
-    return Status::kSuccess;
-  }
-
-  static Status can_implement(Arguments const &args) {
-    return Status::kSuccess;
-  }
-
-  /// Executes one GEMM
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-
-    // Early exit following LAPACK's definition
-    if (params.allow_early_exit &&
-        (params.output_op.alpha == ElementC(0)) && (params.output_op.beta == ElementC(1))) {
-      return;
-    }
-
-    //
-    // Problem visitor.
-    //
-
-    ProblemVisitor problem_visitor(
-      params.problem_visitor,
-      shared_storage.problem_visitor,
-      blockIdx.x);
-
-    // Outer 'persistent' loop to iterate over tiles
-    while (problem_visitor.next_tile()) {
-
-      GemmCoord problem_size  = problem_visitor.problem_size();
-      int32_t problem_idx     = problem_visitor.problem_index();
-      int32_t threadblock_idx = int32_t(problem_visitor.threadblock_idx());
-
-      GemmCoord grid_shape = problem_visitor.grid_shape(problem_size);
-
-      cutlass::gemm::GemmCoord threadblock_tile_offset = problem_visitor.threadblock_offset(threadblock_idx);
-
-      //
-      // Perform checks to determine whether the results of this threadblock will be needed.
-      // An example of an unneeded threadblock is one that is assigned to compute in the upper
-      // portion of a Rank2K kernel filled with mode kLower.
-      //
-      // TODO: Consider pushing these checks into ProblemVisitor to avoid spuriously
-      // returning from `next_tile()`.
-      //
-
-      // Early exit if threadblock is out of range
-      if (grid_shape.m() <= threadblock_tile_offset.m() ||
-          grid_shape.n() <= threadblock_tile_offset.n()) {
-        // Next tile
-        problem_visitor.advance(gridDim.x);
-        continue;
-      }
-
-      // Skip this tile if Fill Mode is Lower and
-      // if the entire tile is above the main diagonal (bottom-left corner is at or above the diagonal)
-      if (kInternalFillModeC == cutlass::FillMode::kLower &&
-          (threadblock_tile_offset.m() + 1) * Mma1::Shape::kM <= threadblock_tile_offset.n() * Mma1::Shape::kN) {
-        // Next tile
-        problem_visitor.advance(gridDim.x);
-        continue;
-      }
-
-      // Skip this tile if Fill Mode is Upper and
-      // if the entire tile is below the main diagonal (top-right corner is at or below the diagonal)
-      if (kInternalFillModeC == cutlass::FillMode::kUpper &&
-          threadblock_tile_offset.m() * Mma1::Shape::kM >= (threadblock_tile_offset.n() + 1) * Mma1::Shape::kN) {
-        // Next tile
-        problem_visitor.advance(gridDim.x);
-        continue;
-      }
-
-      bool tile_on_diagonal = false;
-      // Mark tiles that are being crossed by the main diagonal
-      // (top-right and bottom-left corners are on either side of the diagonal)
-      if ((threadblock_tile_offset.m() + 1) * Mma1::Shape::kM > threadblock_tile_offset.n() * Mma1::Shape::kN
-          && threadblock_tile_offset.m() * Mma1::Shape::kM < (threadblock_tile_offset.n() + 1) * Mma1::Shape::kN) {
-        tile_on_diagonal = true;
-      }
-
-      int offset_k = 0;
-      int problem_size_k = problem_size.k();
-
-      //
-      // Fetch pointers based on mode.
-      //
-      if (params.mode == GemmUniversalMode::kGemm ||
-          params.mode == GemmUniversalMode::kGemmSplitKParallel) {
-
-        if (threadblock_tile_offset.k() + 1 < grid_shape.k()) {
-          problem_size_k = (threadblock_tile_offset.k() + 1) * problem_size.k();
-        }
-
-        offset_k = threadblock_tile_offset.k() * problem_size.k();
-      }
-
-      ElementA *ptr_A = reinterpret_cast<ElementA *>((kTransposed ? params.ptr_B[problem_idx] : params.ptr_A[problem_idx]));
-      typename LayoutA::Stride::LongIndex ldm_A = (kTransposed ? params.ldb[problem_idx] : params.lda[problem_idx]);
-
-      ElementB *ptr_B = reinterpret_cast<ElementB *>((kTransposed ? params.ptr_A[problem_idx] : params.ptr_B[problem_idx]));
-      typename LayoutB::Stride::LongIndex ldm_B = (kTransposed ? params.lda[problem_idx] : params.ldb[problem_idx]);
-
-      // Compute initial location in logical coordinates
-      cutlass::MatrixCoord tb_offset_MxK{
-        threadblock_tile_offset.m() * Mma1::Shape::kM,
-        offset_k,
-      };
-
-      cutlass::MatrixCoord tb_offset_KxN{
-        offset_k,
-        threadblock_tile_offset.n() * Mma1::Shape::kN
-      };
-
-      // Assume identity swizzle
-      MatrixCoord tb_offset(
-        threadblock_tile_offset.m() * Mma1::Shape::kM,
-        threadblock_tile_offset.n() * Mma1::Shape::kN
-      );
-
-      // Compute position within threadblock
-      int thread_idx = threadIdx.x;
-
-      // Construct iterators to A and B operands for Mma1
-      typename Mma1::IteratorA iterator_A(
-        Mma1::IteratorA::Params(ldm_A),
-        ptr_A,
-        {problem_size.m(), problem_size_k},
-        thread_idx,
-        tb_offset_MxK);
-
-      typename Mma1::IteratorB iterator_BT(
-        Mma1::IteratorB::Params(ldm_B),
-        ptr_B,
-        {problem_size_k, problem_size.n()},
-        thread_idx,
-        tb_offset_KxN);
-
-      // Construct iterators to A and B operands for Mma2
-      typename Mma2::IteratorA iterator_B(
-        Mma2::IteratorA::Params(ldm_B),
-        ptr_B,
-        {problem_size.m(), problem_size_k},
-        thread_idx,
-        tb_offset_MxK);
-
-      typename Mma2::IteratorB iterator_AT(
-        Mma2::IteratorB::Params(ldm_A),
-        ptr_A,
-        {problem_size_k, problem_size.n()},
-        thread_idx,
-        tb_offset_KxN);
-
-      // Broadcast the warp_id computed by lane 0 to ensure dependent code
-      // is compiled as warp-uniform.
-      int warp_idx = canonical_warp_idx_sync();
-
-      int lane_idx = threadIdx.x % 32;
-
-      //
-      // Main loop
-      //
-
-      // Construct thread-scoped matrix multiply for Mma1 (A x BT)
-      Mma1 mma1(shared_storage.kernel.mma1_main_loop, thread_idx, warp_idx, lane_idx);
-
-      // Construct thread-scoped matrix multiply for Mma2 (B x AT)
-      Mma2 mma2(shared_storage.kernel.mma2_main_loop, thread_idx, warp_idx, lane_idx);
-
-      typename Mma1::FragmentC accumulators;
-
-      accumulators.clear();
-
-      // Compute threadblock-scoped matrix multiply-add
-      int gemm_k_iterations = (problem_size_k - offset_k + Mma1::Shape::kK - 1) / Mma1::Shape::kK;
-
-      // Wait for all threads to finish their epilogue phases from the previous tile.
-      __syncthreads();
-
-      // Compute threadblock-scoped matrix multiply-add (A x BT)
-      mma1(
-        gemm_k_iterations,
-        accumulators,
-        iterator_A,
-        iterator_BT,
-        accumulators);
-
-      // HER2K kernel needs Alpha to be complex and is conj(Alpha) is applied to the second HERK.
-      if (kBlasMode == BlasMode::kHermitian) {
-
-        //
-        // Epilogue
-        //
-
-        EpilogueOutputOp output_op(params.output_op);
-
-        int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * grid_shape.m();
-
-        ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C[problem_idx]);
-        ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D[problem_idx]);
-
-        // If TB not on diagonal, FillMode doesn't apply.
-        FillMode kFillModeTB = tile_on_diagonal ? kInternalFillModeC : FillMode::kNone;
-
-        // Tile iterator loading from source tensor.
-        typename Epilogue::OutputTileIterator iterator_C(
-          Epilogue::OutputTileIterator::Params(params.ldc[problem_idx]),
-          ptr_C,
-          problem_size.mn(),
-          thread_idx,
-          tb_offset,
-          kFillModeTB
-        );
-
-        // Tile iterator writing to destination tensor.
-        typename Epilogue::OutputTileIterator iterator_D(
-          Epilogue::OutputTileIterator::Params(params.ldd[problem_idx]),
-          ptr_D,
-          problem_size.mn(),
-          thread_idx,
-          tb_offset,
-          kFillModeTB
-        );
-
-        Epilogue epilogue(
-          shared_storage.kernel.epilogue,
-          thread_idx,
-          warp_idx,
-          lane_idx);
-
-        // Execute the epilogue operator to update the destination tensor.
-        epilogue(
-          output_op,
-          iterator_D,
-          accumulators,
-          iterator_C);
-
-        __syncthreads();
-
-        accumulators.clear();
-      }
-
-      // Compute threadblock-scoped matrix multiply-add (B x AT)
-      mma2(
-        gemm_k_iterations,
-        accumulators,
-        iterator_B,
-        iterator_AT,
-        accumulators);
-
-      //
-      // Epilogue
-      //
-
-      EpilogueOutputOp output_op(params.output_op);
-
-      /* Needed for HER2K where the second HERK is multiplied by conj(alpha) */
-      typename EpilogueOutputOp::Params second_her2k_params(conj(params.output_op.alpha), 1);
-      EpilogueOutputOp output_op_her2k(second_her2k_params);
-
-      //
-      // Masked tile iterators constructed from members
-      //
-
-      int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * grid_shape.m();
-
-      ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C[problem_idx]);
-
-      // HER2K kernel needs Alpha to be complex and is conj(Alpha) is applied to the second HERK.
-      if (kBlasMode == BlasMode::kHermitian) {
-        ptr_C = static_cast<ElementC *>(params.ptr_D[problem_idx]);
-      }
-
-      ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D[problem_idx]);
-
-      // If TB not on diagonal, FillMode doesn't apply.
-      FillMode kFillModeTB = tile_on_diagonal ? kInternalFillModeC : FillMode::kNone;
-
-      // Tile iterator loading from source tensor.
-      typename Epilogue::OutputTileIterator iterator_C(
-        Epilogue::OutputTileIterator::Params(params.ldc[problem_idx]),
-        ptr_C,
-        problem_size.mn(),
-        thread_idx,
-        tb_offset,
-        kFillModeTB
-      );
-
-      // Tile iterator writing to destination tensor.
-      typename Epilogue::OutputTileIterator iterator_D(
-        Epilogue::OutputTileIterator::Params(params.ldd[problem_idx]),
-        ptr_D,
-        problem_size.mn(),
-        thread_idx,
-        tb_offset,
-        kFillModeTB
-      );
-
-      Epilogue epilogue(
-        shared_storage.kernel.epilogue,
-        thread_idx,
-        warp_idx,
-        lane_idx);
-
-      // Execute the epilogue operator to update the destination tensor.
-      if (kBlasMode == BlasMode::kSymmetric) {
-        epilogue(
-          output_op,
-          iterator_D,
-          accumulators,
-          iterator_C);
-      } else {
-        epilogue(
-          output_op_her2k,
-          iterator_D,
-          accumulators,
-          iterator_C);
-      }
-
-      // Next tile
-      problem_visitor.advance(gridDim.x);
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_2k_grouped_problem_visitor.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_2k_grouped_problem_visitor.h
deleted file mode 100755
index 2e31c7783..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_2k_grouped_problem_visitor.h
+++ /dev/null
@@ -1,376 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Problem visitor for grouped Rank2K operations.
-
-    This problem visitor is specialized for Rank2K operations, for which matrix C is upper/lower
-    triangular. Using a problem visitor designed for GEMMs for Rank2K problems is inefficient
-    because threadblocks will be frequently assigned to tiles that exit early (e.g., due to
-    being assigned to a tile in the upper-triangular portion of a lower-triangular problem).
-    This can lead to load imbalance among threadblocks, as the GEMM-based scheduler
-    assigns all threadblocks to nearly the same number of tiles, regardless of whether
-    those tiles exit early.
-
-    Consider an example of a group of four Rank2Ks with matrix C consisting of a grid of 2x2 tiles.
-    Consider a grid of 8 threadblocks. The default GEMM scheduler will assign threadblocks to
-    tiles in the following order:
-        Rank2K 0      Rank2K 1       Rank2K 2      Rank2K 3
-          0  1          4  5           0  1          4  5
-          2  3          6  7           2  3          6  7
-    Assuming that the problems are lower triangular, blocks 1 and 5 are continuously assigned
-    to inactive tiles.
-
-    This problem visitor aims to assign threadblocks to only those tiles which are in the
-    upper/lower triangular portion of a given problem. Using the example above, the resulting
-    assignment would be:
-        Rank2K 0      Rank2K 1       Rank2K 2      Rank2K 3
-          0  -          3  -           6  -          1  -
-          1  2          4  5           7  0          2  3
-
-    Achieving the schedule above requires a mapping from threadblock ID to tile coordinates (i, j).
-    We will illustrate this by mapping on a lower-triangular matrix with a 3x3 grid. We first
-    calculate row and column indices assuming one-indexed rows, tiles, and threadblock IDs, and
-    then subtract one to convert to zero-indexed.
-                      Col 1   Col 2   Col 3
-                     ----------------------
-              Row 1 |   1      -       -
-              Row 2 |   2      3       -
-              Row 3 |   4      5       6
-
-    We next outline this mapping, borrowing from: https://stackoverflow.com/a/40954159
-
-    Calculating row i given threadblock ID t
-    ----------------------------------------
-    For a given row i, all threadblock IDs t in that row satisfy the following:
-          t <= 1 + 2 + 3 + ... + (i-1) + i
-
-    The closed-form equation for the right-hand side is: i(i+1)/2.
-    Using this, we can solve for i given t:
-          t  <= i(i+1)/2
-          2t <= i^2 + i
-          2t <= i^2 + i + 0.25 - 0.25
-          2t + 0.25 <= i^2 + i + 0.25
-          2t + 0.25 <= (i + 0.5)^2
-          sqrt(2t + 0.25) - 0.5 <= i
-
-    To account for fractional values, we set:
-          i = ceil(sqrt(2t + 0.25) - 0.5)
-
-    To turn this into a zero-indexed row and work with zero-indexed t, we perform:
-          i = ceil(sqrt(2(t+1) + 0.25) - 0.5) - 1
-            = ceil(sqrt(2t + 2.25) - 0.5) - 1
-
-    Calculating column j given threadblock ID t and row i
-    -----------------------------------------------------
-    For a given row i, all threadblock IDs t in that row also satisfy the following:
-          t > 1 + 2 + 3 + ... + (i-2) + (i-1)
-      --> t > i(i-1)/2
-
-    Threadblock IDs within a given row are sequential, so the one-indexed column ID
-    for one-indexed threadblock ID t and row i is:
-          j = t - (i(i-1)/2)
-
-    The zero-indexed version becomes:
-          j = (t+1) - (i(i+1)/2) -1
-            = t - (i(i+1)/2)
-
-    Accounting for non-square grids
-    -------------------------------
-    Though the overall output problem size for Rank2K problems is guranteed to be square, the
-    grids used in computing may not be square due to using non-square threadblock shapes. For
-    example, a threadblock shape of 64x32 operating on a problem of output size 128x128 would
-    result in a grid of 2x4 tiles.
-
-    This case can be handled by noting that the output resembles a square grid of 2x2 "macro tiles"
-    each of which contains 2 "true tiles." We can thus first map a threadblock ID to its "macro tile"
-    using the equations above, and then map it to the "true tile" within its "macro tile." In the example
-    of a 2x4 grid, this mapping would look as follows:
-        "Macro grid"           "True grid"
-       {0, 1}    -            0   1   -   -
-       {2, 3}  {4, 5}         2   3   4   5
-
-    A zero-indexed threadblock ID t is mapped to its "macro tile ID" t_macro as:
-      t_macro = t // r
-    Where r is the ratio of the maximum dimension of the grid to the minimum dimension of the grid
-    (i.e., r = 4 / 2 = 2 in the previous example).
-
-    One uses t_macro and the calculations above to find the row and column in the square matrix to
-    obtain i_macro and j_macro (zero-indexed). The mapping from (i_macro, j_macro) --> (i, j)
-    is simply the following:
-        if (ThreadblockShape::M > ThreadblockShape::N):
-            r = ThreadblockShape::M / ThreadblockShape::N
-            i = i_macro
-            j = (j_macro * r) + (t % r)
-        elif (ThreadblockShape::M < ThreadblockShape::N):
-            r = ThreadblockShape::N / ThreadblockShape::M
-            i = (i_macro * r) + (t % r)
-            j = j_macro
-        else:
-            i = i_macro
-            j = j_macro
-
-    Handling cases with grid dimensions that aren't multiples of eachother
-    ----------------------------------------------------------------------
-    Even though threadblock shapes M and N are typically multiples of one another, the grid
-    for a given problem may not have dimensions of the same ratio as that of the threadblock.
-    For example, a problem of size 132x132 using a threadblock of shape 64x32 will result
-    in a grid of 3x5 tiles. In this case, there is not an integer number of "true tiles"
-    per "macro tile."
-
-    When this scenario arises, we simply pad the larger dimension of the grid such that
-    there are an integer number of "true tiles" per "macro tile." Thus, the 3x5 grid in
-    the example above will be treated as a 3x6 grid. Row and column positions for each
-    tile are calculated as above. Any threadblocks that map to tiles that are outside the
-    problem range or upper/lower triangular portion (e.g., (2, 5)) will exit early from
-    this problem and may proceed to the next problem in the group.
-
-    Handling upper-triangular matrices
-    ----------------------------------
-    The only modification needed for upper-triangular matrices is to swap i_macro and j_macro
-    in the calculations above.
-*/
-
-#pragma once
-
-#include "cutlass/blas3.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_coord.h"
-
-#include "cutlass/gemm/kernel/grouped_problem_visitor.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-namespace detail {
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Helpers for calculating offsets for Rank2K problem visitor. These helpers specifically pertain
-// to the conversion from "macro tiles" to "true tiles" in the description above.
-//
-template <
-  typename ThreadblockShape,
-  typename Enable = void
->
-struct Rank2KGroupedProblemVisitorOffsetHelper;
-
-// Partial specialization for the case where threadblock shape M > threadblock shape N
-template <
-  typename ThreadblockShape
->
-struct Rank2KGroupedProblemVisitorOffsetHelper<
-    ThreadblockShape,
-    typename platform::enable_if< (ThreadblockShape::kM > ThreadblockShape::kN) >::type
-> {
-  static_assert(ThreadblockShape::kM % ThreadblockShape::kN == 0,
-             "Rank2KGroupedProblemVisitor with threadblock shape M > threadblock shape N "
-             "requires that threadblock shape M be a multiple of threadblock shape N.");
-
-  static int32_t const kThreadblockSkewRatio = ThreadblockShape::kM / ThreadblockShape::kN;
-
-  CUTLASS_HOST_DEVICE
-  static int32_t min_dim(cutlass::gemm::GemmCoord grid) {
-    return grid.m();
-  }
-
-  CUTLASS_HOST_DEVICE
-  static int32_t macro_row_to_row(int32_t row, int32_t threadblock_id) {
-    return row;
-  }
-
-  CUTLASS_HOST_DEVICE
-  static int32_t macro_col_to_col(int32_t col, int32_t threadblock_id) {
-    return (col * kThreadblockSkewRatio) + (threadblock_id % kThreadblockSkewRatio);
-  }
-};
-
-// Partial specialization for the case where threadblock shape M < threadblock shape N
-template <
-  typename ThreadblockShape
->
-struct Rank2KGroupedProblemVisitorOffsetHelper<
-    ThreadblockShape,
-    typename platform::enable_if< (ThreadblockShape::kM < ThreadblockShape::kN) >::type
-> {
-
-  static_assert(ThreadblockShape::kN % ThreadblockShape::kM == 0,
-             "Rank2KGroupedProblemVisitor with threadblock shape M < threadblock shape N "
-             "requires that threadblock shape N be a multiple of threadblock shape M.");
-
-  static int32_t const kThreadblockSkewRatio = ThreadblockShape::kN / ThreadblockShape::kM;
-
-  CUTLASS_HOST_DEVICE
-  static int32_t min_dim(cutlass::gemm::GemmCoord grid) {
-    return grid.n();
-  }
-
-  CUTLASS_HOST_DEVICE
-  static int32_t macro_row_to_row(int32_t row, int32_t threadblock_id) {
-    return (row * kThreadblockSkewRatio) + (threadblock_id % kThreadblockSkewRatio);
-  }
-
-  CUTLASS_HOST_DEVICE
-  static int32_t macro_col_to_col(int32_t col, int32_t threadblock_id) {
-    return col;
-  }
-};
-
-// Partial specialization for the case where threadblock shape M == threadblock shape N
-// In this case, macro tiles are equivalent to true tiles, so the conversions are
-// identity functions.
-template <
-  typename ThreadblockShape
->
-struct Rank2KGroupedProblemVisitorOffsetHelper<
-    ThreadblockShape,
-    typename platform::enable_if< (ThreadblockShape::kM == ThreadblockShape::kN) >::type
-> {
-
-  static int32_t const kThreadblockSkewRatio = 1;
-
-  CUTLASS_HOST_DEVICE
-  static int32_t min_dim(cutlass::gemm::GemmCoord grid) {
-    return grid.m();
-  }
-
-  CUTLASS_HOST_DEVICE
-  static int32_t macro_row_to_row(int32_t row, int32_t threadblock_id) {
-    return row;
-  }
-
-  CUTLASS_HOST_DEVICE
-  static int32_t macro_col_to_col(int32_t col, int32_t threadblock_id) {
-    return col;
-  }
-};
-
-// Helper for correctly representing problem sizes in grouped kernels 
-template <typename ThreadblockShape>
-struct Rank2KGroupedProblemSizeHelper {
-  using OffsetHelper = Rank2KGroupedProblemVisitorOffsetHelper<ThreadblockShape>;
-
-  CUTLASS_HOST_DEVICE
-  static cutlass::gemm::GemmCoord grid_shape(const cutlass::gemm::GemmCoord& problem) {
-    return cutlass::gemm::GemmCoord(
-      ((problem.m() - 1 + ThreadblockShape::kM) / ThreadblockShape::kM),
-      ((problem.n() - 1 + ThreadblockShape::kN) / ThreadblockShape::kN),
-      1);
-  }
-
-  CUTLASS_HOST_DEVICE
-  static int32_t tile_count(const cutlass::gemm::GemmCoord& grid) {
-    // Return the number of tiles at or below the diagonal (or at and above
-    // for mode kUpper). We do this by first calculating this value assuming
-    // we have a square matrix of tiles of size `dim x dim` where `dim` is the
-    // minimum among {grid.m(), grid.n()}. We then multiply the resulting value
-    // by OffsetHelper::kThreadblockSkewRatio to account for cases in which there
-    // are more tiles in one dimension than the other.
-    int32_t dim = OffsetHelper::min_dim(grid);
-    int32_t tiles_on_diagonal = dim;
-    int32_t tiles_below_diagonal = ((dim * (dim - 1)) / 2);
-    return (tiles_on_diagonal + tiles_below_diagonal) * OffsetHelper::kThreadblockSkewRatio;
-  }
-
-  CUTLASS_HOST_DEVICE
-  static void possibly_transpose_problem(cutlass::gemm::GemmCoord& problem) {}
-};
-
-} // namespace detail
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Default problem visitor for fill modes kUpper and kLower.
-//
-template <typename ThreadblockShape,
-          GroupScheduleMode GroupScheduleMode_,
-          int PrefetchTileCount,
-          int ThreadCount,
-          cutlass::FillMode FillModeC>
-struct Rank2KGroupedProblemVisitor : public GroupedProblemVisitor<
-                                              detail::Rank2KGroupedProblemSizeHelper<ThreadblockShape>,
-                                              ThreadblockShape,
-                                              GroupScheduleMode_,
-                                              PrefetchTileCount,
-                                              ThreadCount> {
-
-  static cutlass::FillMode const kFillModeC = FillModeC;
-
-  static_assert(kFillModeC == cutlass::FillMode::kLower || kFillModeC == cutlass::FillMode::kUpper,
-              "Default Rank2KGroupedProblemVisitor requires fill mode of kLower or kUpper.");
-
-  using ProblemSizeHelper = detail::Rank2KGroupedProblemSizeHelper<ThreadblockShape>;
-  using Base = GroupedProblemVisitor<ProblemSizeHelper,
-                                     ThreadblockShape,
-                                     GroupScheduleMode_,
-                                     PrefetchTileCount,
-                                     ThreadCount>;
-  using OffsetHelper = typename ProblemSizeHelper::OffsetHelper;
-  using Params = typename Base::Params;
-  using SharedStorage = typename Base::SharedStorage;
-
-  //
-  // Methods
-  //
-  CUTLASS_DEVICE
-  Rank2KGroupedProblemVisitor(
-    Params const &params_,
-    SharedStorage &shared_storage_,
-    int32_t block_idx
-  ): Base(params_, shared_storage_, block_idx)
-  {}
-
-  CUTLASS_DEVICE
-  cutlass::gemm::GemmCoord threadblock_offset(int32_t threadblock_id) const {
-    int32_t macro_id = threadblock_id / OffsetHelper::kThreadblockSkewRatio;
-    int32_t macro_row = ceil(cutlass::fast_sqrt((2*macro_id) + 2.25) - 0.5) - 1;
-    int32_t macro_col = macro_id - (((macro_row+1) * macro_row)/2);
-
-    if (kFillModeC == cutlass::FillMode::kUpper) {
-      swap(macro_row, macro_col);
-    }
-
-    int32_t row = OffsetHelper::macro_row_to_row(macro_row, threadblock_id);
-    int32_t col = OffsetHelper::macro_col_to_col(macro_col, threadblock_id);
-
-    return cutlass::gemm::GemmCoord(row, col, 0);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_2k_transpose_operands.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_2k_transpose_operands.h
deleted file mode 100755
index 11b2a915a..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_2k_transpose_operands.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*!
-  \file
-  \brief Transpositions for Rank2K problems.
-*/
-
-#pragma once
-
-#include "cutlass/blas3.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/gemm.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace detail {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename ElementA_,
-  typename LayoutA_,
-  ComplexTransform TransformA,
-  int AlignmentA,
-  typename ElementB_,
-  typename LayoutB_,
-  ComplexTransform TransformB,
-  int AlignmentB,
-  typename LayoutC_,
-  FillMode FillModeC_,
-  bool Transpose
->
-struct Rank2KMapArguments {
-  using ElementA = ElementA_;
-  using LayoutA = LayoutA_;
-  static ComplexTransform const kTransformA = TransformA;
-  static int const kAlignmentA = AlignmentA;
-  using ElementB = ElementB_;
-  using LayoutB = LayoutB_;
-  static ComplexTransform const kTransformB = TransformB;
-  static int const kAlignmentB = AlignmentB;
-  using LayoutC = LayoutC_;
-  static FillMode const kFillModeC = FillModeC_;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename ElementA_,
-  typename LayoutA_,
-  ComplexTransform TransformA,
-  int AlignmentA,
-  typename ElementB_,
-  typename LayoutB_,
-  ComplexTransform TransformB,
-  int AlignmentB,
-  typename LayoutC_,
-  FillMode FillModeC_
->
-struct Rank2KMapArguments<
-  ElementA_,
-  LayoutA_,
-  TransformA,
-  AlignmentA,
-  ElementB_,
-  LayoutB_,
-  TransformB,
-  AlignmentB,
-  LayoutC_,
-  FillModeC_,
-  true
-> {
-  using ElementA = ElementB_;
-  using LayoutA = LayoutB_;
-  static ComplexTransform const kTransformA = TransformB;
-  static int const kAlignmentA = AlignmentB;
-  using ElementB = ElementA_;
-  using LayoutB = LayoutA_;
-  static ComplexTransform const kTransformB = TransformA;
-  static int const kAlignmentB = AlignmentA;
-  using LayoutC = typename layout::LayoutTranspose<LayoutC_>::type;
-  static FillMode const kFillModeC = InvertFillMode<FillModeC_>::mode;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}
-}
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_2k_universal.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_2k_universal.h
deleted file mode 100755
index bd7ffb0e3..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_2k_universal.h
+++ /dev/null
@@ -1,769 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-
-*/
-
-#pragma once
-
-#include "cutlass/blas3.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/complex.h"
-#include "cutlass/semaphore.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Mma1_,                 ///! Threadblock-scoped matrix multiply-accumulate (A*B^T)
-  typename Mma2_,                 ///! Threadblock-scoped matrix multiply-accumulate (B*A^T)
-  typename Epilogue_,             ///! Epilogue
-  typename ThreadblockSwizzle_,   ///! Threadblock swizzling function
-  FillMode FillModeC_,            ///! Fill Mode for C (kLower or kUpper)
-  BlasMode BlasMode_              ///! Blas3 computation mode
->
-struct Rank2KUniversal {
-public:
-
-  using Mma1 = Mma1_;
-  using Mma2 = Mma2_;
-  using Epilogue = Epilogue_;
-  using EpilogueOutputOp = typename Epilogue::OutputOp;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-
-  using ElementA = typename Mma1::IteratorA::Element;
-  using ElementB = typename Mma1::IteratorB::Element;
-
-  // Mma1 (A x B^T)
-  using LayoutA = typename Mma1::IteratorA::Layout;
-  using LayoutBT = typename Mma1::IteratorB::Layout;
-  static ComplexTransform const kMma1TransformA = Mma1::kTransformA;
-  static ComplexTransform const kMma1TransformB = Mma1::kTransformB;
-
-  // Mma2 (B x A^T)
-  using LayoutB = typename Mma2::IteratorA::Layout;
-  using LayoutAT = typename Mma2::IteratorB::Layout;
-  static ComplexTransform const kMma2TransformA = Mma2::kTransformA;
-  static ComplexTransform const kMma2TransformB = Mma2::kTransformB;
-
-  // Common type definitions for Mma1 and Mma2
-  using Operator = typename Mma1::Operator;
-  using OperatorClass = typename Mma1::Operator::OperatorClass;
-  using ThreadblockShape = typename Mma1::Shape;
-  using WarpShape = typename Mma1::Operator::Shape;
-  using InstructionShape = typename Mma1::Policy::Operator::InstructionShape;
-  using ArchTag = typename Mma1::ArchTag;
-
-  static int const kStages = Mma1::kStages;
-  static int const kAlignmentA = Mma1::IteratorA::AccessType::kElements;
-  static int const kAlignmentB = Mma1::IteratorB::AccessType::kElements;
-
-  // Output related typedefinitions
-  using ElementC = typename Epilogue::OutputTileIterator::Element;
-  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
-  static FillMode const kFillModeC = FillModeC_;
-  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
-  static BlasMode const kBlasMode = BlasMode_;
-
-
-  /// Warp count (concept: GemmShape)
-  using WarpCount = typename Mma1::WarpCount;
-  static int const kThreadCount = 32 * WarpCount::kCount;
-
-
-  //
-  // Structures
-  //
-
-  /// Argument structure
-  struct Arguments {
-
-    //
-    // Data members
-    //
-
-    GemmUniversalMode mode = cutlass::gemm::GemmUniversalMode::kGemm;
-    GemmCoord problem_size {};
-    int batch_count{1};
-
-    typename EpilogueOutputOp::Params epilogue{};
-
-    void const * ptr_A = nullptr;
-    void const * ptr_B = nullptr;
-    void const * ptr_C = nullptr;
-    void * ptr_D = nullptr;
-
-    int64_t batch_stride_A {0};
-    int64_t batch_stride_B {0};
-    int64_t batch_stride_C {0};
-    int64_t batch_stride_D {0};
-
-    typename LayoutA::Stride::Index lda{0};
-    typename LayoutB::Stride::Index ldb{0};
-    typename LayoutC::Stride::Index ldc{0};
-    typename LayoutC::Stride::Index ldd{0};
-
-    bool allow_early_exit{false};
-
-    //
-    // Methods
-    //
-    
-    Arguments() = default;
-
-    /// constructs an arguments structure
-    Arguments(
-      GemmUniversalMode mode,
-      GemmCoord problem_size,
-      int batch_count,
-      typename EpilogueOutputOp::Params epilogue,
-      void const * ptr_A,
-      void const * ptr_B,
-      void const * ptr_C,
-      void * ptr_D,
-      int64_t batch_stride_A,
-      int64_t batch_stride_B,
-      int64_t batch_stride_C,
-      int64_t batch_stride_D,
-      typename LayoutA::Stride::Index lda,
-      typename LayoutB::Stride::Index ldb,
-      typename LayoutC::Stride::Index ldc,
-      typename LayoutC::Stride::Index ldd,
-      bool allow_early_exit = false
-    ):
-      mode(mode), 
-      problem_size(problem_size), 
-      batch_count(batch_count),
-      epilogue(epilogue), 
-      ptr_A(ptr_A), ptr_B(ptr_B), ptr_C(ptr_C), ptr_D(ptr_D), 
-      batch_stride_A(batch_stride_A), batch_stride_B(0),
-      batch_stride_C(batch_stride_C), batch_stride_D(batch_stride_D), 
-      lda(lda), ldb(ldb), ldc(ldc), ldd(ldd),
-      allow_early_exit(allow_early_exit) {
-
-      }
-
-      /// Returns arguments for a the transposed problem
-      Arguments transposed_problem() const {
-        Arguments args(*this);
-        
-        std::swap(args.ptr_A, args.ptr_B);
-        std::swap(args.lda, args.ldb);
-        std::swap(args.batch_stride_A, args.batch_stride_B);
-
-        return args;
-      }
-
-  };
-
-  //
-  // Structure for precomputing values in host memory and passing to kernels
-  //
-
-  /// Parameters structure
-  struct Params {
-
-    cutlass::gemm::GemmCoord problem_size{};
-    cutlass::gemm::GemmCoord grid_tiled_shape{};
-    int swizzle_log_tile{0};
-    
-    // Mma1 Iterator A and B params
-    typename Mma1::IteratorA::Params params_A{};
-    typename Mma1::IteratorB::Params params_BT{};
-
-    // Mma2 Iterator A and B params 
-    typename Mma2::IteratorA::Params params_B{};
-    typename Mma2::IteratorB::Params params_AT{};
-
-    typename Epilogue::OutputTileIterator::Params params_C{};
-    typename Epilogue::OutputTileIterator::Params params_D{};
-    
-    typename EpilogueOutputOp::Params output_op{};
-
-    GemmUniversalMode mode = cutlass::gemm::GemmUniversalMode::kGemm;
-    int batch_count{0};
-    int gemm_k_size{0};
-
-    void * ptr_A = nullptr;
-    void * ptr_B = nullptr;
-    void * ptr_C = nullptr;
-    void * ptr_D = nullptr;
-
-    int64_t batch_stride_A{0};
-    int64_t batch_stride_B{0};
-    int64_t batch_stride_C{0};
-    int64_t batch_stride_D{0};
-
-    int *semaphore = nullptr;
-
-    bool allow_early_exit {false};
-
-    //
-    // Methods
-    //
-
-    Params() = default;
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      Arguments const &args,
-      cutlass::gemm::GemmCoord const & grid_tiled_shape,
-      int gemm_k_size,
-      void *workspace = nullptr
-    ):
-      problem_size(args.problem_size),
-      grid_tiled_shape(grid_tiled_shape),
-      swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)),
-      params_A(args.lda),
-      params_BT(args.ldb),
-      params_B(args.ldb),
-      params_AT(args.lda),
-      params_C(args.ldc),
-      params_D(args.ldd),
-      output_op(args.epilogue),
-      mode(args.mode),
-      batch_count(args.batch_count),
-      gemm_k_size(gemm_k_size),
-      ptr_A(const_cast<void *>(args.ptr_A)),
-      ptr_B(const_cast<void *>(args.ptr_B)),
-      ptr_C(const_cast<void *>(args.ptr_C)),
-      ptr_D(const_cast<void *>(args.ptr_D)),
-      batch_stride_A(args.batch_stride_A),
-      batch_stride_B(args.batch_stride_B),
-      batch_stride_C(args.batch_stride_C),
-      batch_stride_D(args.batch_stride_D),
-      semaphore(static_cast<int *>(workspace)),
-      allow_early_exit(args.allow_early_exit) {
-    }
-
-    CUTLASS_HOST_DEVICE
-    void update(
-      Arguments const &args,
-      void *workspace = nullptr) {
-
-      ptr_A = const_cast<void *>(args.ptr_A);
-      ptr_B = const_cast<void *>(args.ptr_B);
-      ptr_C = const_cast<void *>(args.ptr_C);
-      ptr_D = args.ptr_D;
-
-      output_op = args.epilogue;
-
-      semaphore = static_cast<int *>(workspace);
-    }
-
-  };
-
-  /// Shared memory storage structure
-  union SharedStorage {
-    typename Mma1::SharedStorage mma1_main_loop;
-    typename Mma2::SharedStorage mma2_main_loop;
-    typename Epilogue::SharedStorage epilogue;
-  };
-
-public:
-
-  //
-  // Methods
-  //
-
-  CUTLASS_DEVICE
-  Rank2KUniversal() { } 
-
-  /// Determines whether kernel satisfies alignment
-  static Status can_implement(
-    cutlass::gemm::GemmCoord const & problem_size) {
-
-    static int const kAlignmentA = Mma1::IteratorA::AccessType::kElements;
-    static int const kAlignmentB = Mma1::IteratorB::AccessType::kElements;
-    static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
-
-    if ((problem_size.m() % kAlignmentA) || (problem_size.k() % kAlignmentA) ||
-      (problem_size.n() % kAlignmentB) || (problem_size.k() % kAlignmentB) ||
-      (problem_size.m() % kAlignmentC) || (problem_size.n() % kAlignmentC)) {
-
-      return Status::kErrorMisalignedOperand;
-    }
-
-    return Status::kSuccess;
-  }
-
-  static Status can_implement(Arguments const &args) {
-    return can_implement(args.problem_size);
-  }
-
-  /// Executes one GEMM
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-
-    // Early exit following LAPACK's definition
-    if (params.allow_early_exit &&
-        (params.output_op.alpha == ElementC(0)) && (params.output_op.beta == ElementC(1))) {
-      return;
-    }
-
-    // Compute threadblock location
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord threadblock_tile_offset =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    // Early exit if CTA is out of range
-    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
-      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
-      return;
-    }
-   
-    // Early exit if Fill Mode is Lower and
-    // if the entire tile is above the main diagonal (bottom-left corner is at or above the diagonal)
-    if (kFillModeC == cutlass::FillMode::kLower &&
-        (threadblock_tile_offset.m() + 1) * Mma1::Shape::kM <= threadblock_tile_offset.n() * Mma1::Shape::kN) {
-      return;
-    }    
-    
-    // Early exit if Fill Mode is Upper and
-    // if the entire tile is below the main diagonal (top-right corner is at or below the diagonal)
-    if (kFillModeC == cutlass::FillMode::kUpper &&
-        threadblock_tile_offset.m() * Mma1::Shape::kM >= (threadblock_tile_offset.n() + 1) * Mma1::Shape::kN) {
-      return;
-    }    
-    
-    bool tile_on_diagonal = false;
-    // Mark tiles that are being crossed by the main diagonal
-    // (top-right and bottom-left corners are on either side of the diagonal)
-    if ((threadblock_tile_offset.m() + 1) * Mma1::Shape::kM > threadblock_tile_offset.n() * Mma1::Shape::kN
-        && threadblock_tile_offset.m() * Mma1::Shape::kM < (threadblock_tile_offset.n() + 1) * Mma1::Shape::kN) {
-      tile_on_diagonal = true;
-    }
-
-    int offset_k = 0;
-    int problem_size_k = params.problem_size.k();
-
-    ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A); 
-    ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);
-
-    //
-    // Fetch pointers based on mode.
-    //
-    if (params.mode == GemmUniversalMode::kGemm || 
-      params.mode == GemmUniversalMode::kGemmSplitKParallel) {
-
-      if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {
-
-        problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size; 
-      }
-
-      offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
-    }
-
-    __syncthreads();
-
-    // Compute initial location in logical coordinates
-    cutlass::MatrixCoord tb_offset_MxK{
-      threadblock_tile_offset.m() * Mma1::Shape::kM,
-      offset_k,
-    };
-
-    cutlass::MatrixCoord tb_offset_KxN{
-      offset_k,
-      threadblock_tile_offset.n() * Mma1::Shape::kN
-    };
-
-
-    // Compute position within threadblock
-    int thread_idx = threadIdx.x;
-
-    // Construct iterators to A and B operands for Mma1
-    typename Mma1::IteratorA iterator_A(
-      params.params_A,
-      ptr_A,
-      {params.problem_size.m(), problem_size_k},
-      thread_idx,
-      tb_offset_MxK);
-
-    typename Mma1::IteratorB iterator_BT(
-      params.params_BT,
-      ptr_B,
-      {problem_size_k, params.problem_size.n()},
-      thread_idx,
-      tb_offset_KxN);
-
-    // Construct iterators to A and B operands for Mma2
-    typename Mma2::IteratorA iterator_B(
-      params.params_B,
-      ptr_B,
-      {params.problem_size.m(), problem_size_k},
-      thread_idx,
-      tb_offset_MxK);
-
-    typename Mma2::IteratorB iterator_AT(
-      params.params_AT,
-      ptr_A,
-      {problem_size_k, params.problem_size.n()},
-      thread_idx,
-      tb_offset_KxN);
-
-    // Broadcast the warp_id computed by lane 0 to ensure dependent code
-    // is compiled as warp-uniform.
-    int warp_idx = canonical_warp_idx_sync();
-
-    int lane_idx = threadIdx.x % 32;
-
-    //
-    // Main loop
-    //
-
-    // Construct thread-scoped matrix multiply for Mma1 (A x BT)
-    Mma1 mma1(shared_storage.mma1_main_loop, thread_idx, warp_idx, lane_idx);
-
-    // Construct thread-scoped matrix multiply for Mma2 (B x AT)
-    Mma2 mma2(shared_storage.mma2_main_loop, thread_idx, warp_idx, lane_idx);
-
-    typename Mma1::FragmentC accumulators;
-
-    accumulators.clear();
-
-    // Compute threadblock-scoped matrix multiply-add
-    int gemm_k_iterations = (problem_size_k - offset_k + Mma1::Shape::kK - 1) / Mma1::Shape::kK;
-
-    // Compute threadblock-scoped matrix multiply-add (A x BT)
-    mma1(
-      gemm_k_iterations, 
-      accumulators, 
-      iterator_A, 
-      iterator_BT, 
-      accumulators);
-
-    // HER2K kernel needs Alpha to be complex and is conj(Alpha) is applied to the second HERK.
-    if (kBlasMode == BlasMode::kHermitian) {
-
-      //
-      // Epilogue
-      //
-
-      EpilogueOutputOp output_op(params.output_op);
-
-      //
-      // Masked tile iterators constructed from members
-      //
-
-      threadblock_tile_offset =
-          threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-      //assume identity swizzle
-      MatrixCoord threadblock_offset(
-        threadblock_tile_offset.m() * Mma1::Shape::kM,
-        threadblock_tile_offset.n() * Mma1::Shape::kN
-      );
-
-      int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
-
-      ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C); 
-      ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
-
-      //
-      // Fetch pointers based on mode.
-      //
-      
-      // Construct the semaphore.
-      Semaphore semaphore(params.semaphore + block_idx, thread_idx);
-
-      if (params.mode == GemmUniversalMode::kGemm) {
-
-        // If performing a reduction via split-K, fetch the initial synchronization
-        if (params.grid_tiled_shape.k() > 1) {
-          
-          // Fetch the synchronization lock initially but do not block.
-          semaphore.fetch();
-
-          // Indicate which position in a serial reduction the output operator is currently updating
-          output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
-        }
-      }
-      else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) {
-        ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
-      }
-      else if (params.mode == GemmUniversalMode::kBatched) {
-        ptr_C += threadblock_tile_offset.k() * params.batch_stride_C;
-        ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
-      }
-      else if (params.mode == GemmUniversalMode::kArray) {
-        ptr_C = static_cast<ElementC * const *>(params.ptr_C)[threadblock_tile_offset.k()];
-        ptr_D = static_cast<ElementC * const *>(params.ptr_D)[threadblock_tile_offset.k()];
-      }
-
-      
-      // If CTA not on diagonal, FillMode doesn't apply. 
-      FillMode kFillModeCTA = tile_on_diagonal ? kFillModeC : FillMode::kNone;
-
-      // Tile iterator loading from source tensor.
-      typename Epilogue::OutputTileIterator iterator_C(
-        params.params_C,
-        ptr_C,
-        params.problem_size.mn(),
-        thread_idx,
-        threadblock_offset,
-        kFillModeCTA
-      );
-
-      // Tile iterator writing to destination tensor.
-      typename Epilogue::OutputTileIterator iterator_D(
-        params.params_D,
-        ptr_D,
-        params.problem_size.mn(),
-        thread_idx,
-        threadblock_offset,
-        kFillModeCTA
-      );
-
-      Epilogue epilogue(
-        shared_storage.epilogue, 
-        thread_idx, 
-        warp_idx, 
-        lane_idx);
-
-      // Wait on the semaphore - this latency may have been covered by iterator construction
-      if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
-          
-        // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
-        if (threadblock_tile_offset.k()) {
-          iterator_C = iterator_D;
-        }
-
-        semaphore.wait(threadblock_tile_offset.k());
-
-        __threadfence();
-      }
-
-      // Execute the epilogue operator to update the destination tensor.
-      epilogue(
-        output_op, 
-        iterator_D, 
-        accumulators, 
-        iterator_C); 
-      
-      //
-      // Release the semaphore
-      //
-
-      if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) { 
-
-        int lock = 0;
-        if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
-
-          // The final threadblock resets the semaphore for subsequent grids.
-          lock = 0;
-        }
-        else {
-          // Otherwise, the semaphore is incremented
-          lock = threadblock_tile_offset.k() + 1;
-        }
-        
-        semaphore.release(lock);
-      }
-
-      __syncthreads();
-
-      accumulators.clear();
-    }
-
-    // Compute threadblock-scoped matrix multiply-add (B x AT)
-    mma2(
-      gemm_k_iterations, 
-      accumulators, 
-      iterator_B, 
-      iterator_AT, 
-      accumulators);
-
-    //
-    // Epilogue
-    //
-
-    EpilogueOutputOp output_op(params.output_op);
-
-    /* Needed for HER2K where the second HERK is multiplied by conj(alpha) */
-    typename EpilogueOutputOp::Params second_her2k_params(conj(params.output_op.alpha), 1);
-    EpilogueOutputOp output_op_her2k(second_her2k_params);
-
-    //
-    // Masked tile iterators constructed from members
-    //
-
-    threadblock_tile_offset =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    //assume identity swizzle
-    MatrixCoord threadblock_offset(
-      threadblock_tile_offset.m() * Mma1::Shape::kM,
-      threadblock_tile_offset.n() * Mma1::Shape::kN
-    );
-
-    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
-
-    ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C);
-
-    // HER2K kernel needs Alpha to be complex and is conj(Alpha) is applied to the second HERK.
-    if (kBlasMode == BlasMode::kHermitian) {
-      ptr_C = static_cast<ElementC *>(params.ptr_D);
-    }
-
-    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
-
-    //
-    // Fetch pointers based on mode.
-    //
-    
-    // Construct the semaphore.
-    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
-
-    if (params.mode == GemmUniversalMode::kGemm) {
-
-      // If performing a reduction via split-K, fetch the initial synchronization
-      if (params.grid_tiled_shape.k() > 1) {
-        
-        // Fetch the synchronization lock initially but do not block.
-        semaphore.fetch();
-
-        // Indicate which position in a serial reduction the output operator is currently updating
-        if (kBlasMode == BlasMode::kSymmetric) {
-          output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
-        } else {
-          output_op_her2k.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
-        }
-      }
-    }
-    else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) {
-      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
-    }
-    else if (params.mode == GemmUniversalMode::kBatched) {
-      ptr_C += threadblock_tile_offset.k() * params.batch_stride_C;
-      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
-    }
-    else if (params.mode == GemmUniversalMode::kArray) {
-      ptr_C = static_cast<ElementC * const *>(params.ptr_C)[threadblock_tile_offset.k()];
-      ptr_D = static_cast<ElementC * const *>(params.ptr_D)[threadblock_tile_offset.k()];
-    }
-
-    
-    // If CTA not on diagonal, FillMode doesn't apply. 
-    FillMode kFillModeCTA = tile_on_diagonal ? kFillModeC : FillMode::kNone;
-
-    // Tile iterator loading from source tensor.
-    typename Epilogue::OutputTileIterator iterator_C(
-      params.params_C,
-      ptr_C,
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset,
-      kFillModeCTA
-    );
-
-    // Tile iterator writing to destination tensor.
-    typename Epilogue::OutputTileIterator iterator_D(
-      params.params_D,
-      ptr_D,
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset,
-      kFillModeCTA
-    );
-
-    Epilogue epilogue(
-      shared_storage.epilogue, 
-      thread_idx, 
-      warp_idx, 
-      lane_idx);
-
-    // Wait on the semaphore - this latency may have been covered by iterator construction
-    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
-        
-      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
-      if (threadblock_tile_offset.k()) {
-        iterator_C = iterator_D;
-      }
-
-      semaphore.wait(threadblock_tile_offset.k());
-
-      __threadfence();
-    }
-
-    // Execute the epilogue operator to update the destination tensor.
-    if (kBlasMode == BlasMode::kSymmetric) {
-      epilogue(
-        output_op,
-        iterator_D,
-        accumulators,
-        iterator_C);
-    } else {
-      epilogue(
-        output_op_her2k,
-        iterator_D,
-        accumulators,
-        iterator_C);
-    }
-    
-    //
-    // Release the semaphore
-    //
-
-    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) { 
-
-      int lock = 0;
-      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
-
-        // The final threadblock resets the semaphore for subsequent grids.
-        lock = 0;
-      }
-      else {
-        // Otherwise, the semaphore is incremented
-        lock = threadblock_tile_offset.k() + 1;
-      }
-      
-      semaphore.release(lock);
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_k_universal.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_k_universal.h
deleted file mode 100755
index ad418286b..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_k_universal.h
+++ /dev/null
@@ -1,556 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-
-*/
-
-#pragma once
-
-#include "cutlass/blas3.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/complex.h"
-#include "cutlass/semaphore.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate 
-  typename Epilogue_,             ///! Epilogue
-  typename ThreadblockSwizzle_,   ///! Threadblock swizzling function
-  FillMode FillModeC_             ///! Fill Mode for C (kLower or kUpper)
->
-struct RankKUniversal {
-public:
-
-  using Mma = Mma_;
-  using Epilogue = Epilogue_;
-  using EpilogueOutputOp = typename Epilogue::OutputOp;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-
-  using ElementA = typename Mma::IteratorA::Element;
-  using LayoutA = typename Mma::IteratorA::Layout;
-  using ElementB = typename Mma::IteratorB::Element;
-  using LayoutB = typename Mma::IteratorB::Layout;
-  using ElementC = typename Epilogue::OutputTileIterator::Element;
-  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
-  static FillMode const kFillModeC = FillModeC_;
-
-  static ComplexTransform const kTransformA = Mma::kTransformA;
-  static ComplexTransform const kTransformB = Mma::kTransformB;
-  using Operator = typename Mma::Operator;
-
-  using OperatorClass = typename Mma::Operator::OperatorClass;
-  using ThreadblockShape = typename Mma::Shape;
-  using WarpShape = typename Mma::Operator::Shape;
-  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
-  using ArchTag = typename Mma::ArchTag;
-
-  static int const kStages = Mma::kStages;
-  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
-  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
-  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
-
-  /// Warp count (concept: GemmShape)
-  using WarpCount = typename Mma::WarpCount;
-  static int const kThreadCount = 32 * WarpCount::kCount;
-
-  /// Split-K preserves splits that are 128b aligned
-  static int const kSplitKAlignment = 128 / sizeof_bits<ElementA>::value;
-
-  //
-  // Structures
-  //
-
-  /// Argument structure
-  struct Arguments {
-
-    //
-    // Data members
-    //
-
-    GemmUniversalMode mode{GemmUniversalMode::kGemm};
-    GemmCoord problem_size{};
-    int batch_count{1};
-
-    typename EpilogueOutputOp::Params epilogue{};
-
-    void const * ptr_A{nullptr};
-    void const * ptr_C{nullptr};
-    void * ptr_D{nullptr};
-
-    int64_t batch_stride_A{0};
-    int64_t batch_stride_C{0};
-    int64_t batch_stride_D{0};
-
-    typename LayoutA::Stride::Index lda{};
-    typename LayoutB::Stride::Index ldb{};
-    typename LayoutC::Stride::Index ldc{};
-    typename LayoutC::Stride::Index ldd{};
-
-    bool allow_early_exit{false};
-
-    //
-    // Methods
-    //
-    
-    Arguments() = default;
-
-    /// constructs an arguments structure
-    Arguments(
-      GemmUniversalMode mode,
-      GemmCoord problem_size,
-      int batch_count,
-      typename EpilogueOutputOp::Params epilogue,
-      void const * ptr_A,
-      void const * ptr_C,
-      void * ptr_D,
-      int64_t batch_stride_A,
-      int64_t batch_stride_C,
-      int64_t batch_stride_D,
-      typename LayoutA::Stride::Index lda,
-      typename LayoutC::Stride::Index ldc,
-      typename LayoutC::Stride::Index ldd,
-      bool allow_early_exit = false
-    ):
-      mode(mode), 
-      problem_size(problem_size), 
-      batch_count(batch_count),
-      epilogue(epilogue), 
-      ptr_A(ptr_A), ptr_C(ptr_C), ptr_D(ptr_D), 
-      batch_stride_A(batch_stride_A), batch_stride_C(batch_stride_C), batch_stride_D(batch_stride_D), 
-      lda(lda), ldb(0),
-      ldc(ldc), ldd(ldd),
-      allow_early_exit(allow_early_exit) {
-
-      }
-
-  };
-
-  //
-  // Structure for precomputing values in host memory and passing to kernels
-  //
-
-  /// Parameters structure
-  struct Params {
-
-    cutlass::gemm::GemmCoord problem_size{};
-    cutlass::gemm::GemmCoord grid_tiled_shape{};
-    int swizzle_log_tile{0};
-   
-    typename Mma::IteratorA::Params params_A{};
-    typename Mma::IteratorB::Params params_B{};
-    typename Epilogue::OutputTileIterator::Params params_C{};
-    typename Epilogue::OutputTileIterator::Params params_D{};
-    typename EpilogueOutputOp::Params output_op{};
-
-    GemmUniversalMode mode = cutlass::gemm::GemmUniversalMode::kGemm;
-    int batch_count{0};
-    int gemm_k_size{0};
-
-    void * ptr_A{nullptr};
-    void * ptr_B{nullptr};
-    void * ptr_C{nullptr};
-    void * ptr_D{nullptr};
-
-    int64_t batch_stride_A{0};
-    int64_t batch_stride_B{0};
-    int64_t batch_stride_C{0};
-    int64_t batch_stride_D{0};
-
-    int *semaphore{nullptr};
-
-    bool allow_early_exit{false};
-
-    //
-    // Methods
-    //
-    Params() = default;
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      Arguments const &args,
-      cutlass::gemm::GemmCoord const & grid_tiled_shape,
-      int gemm_k_size,
-      void *workspace = nullptr
-    ):
-      problem_size(args.problem_size),
-      grid_tiled_shape(grid_tiled_shape),
-      swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)),
-      params_A(args.lda),
-      params_B(args.lda),
-      params_C(args.ldc),
-      params_D(args.ldd),
-      output_op(args.epilogue),
-      mode(args.mode),
-      batch_count(args.batch_count),
-      gemm_k_size(gemm_k_size),
-      ptr_A(const_cast<void *>(args.ptr_A)),
-      ptr_B(const_cast<void *>(args.ptr_A)),
-      ptr_C(const_cast<void *>(args.ptr_C)),
-      ptr_D(const_cast<void *>(args.ptr_D)),
-      batch_stride_A(args.batch_stride_A),
-      batch_stride_B(args.batch_stride_A),
-      batch_stride_C(args.batch_stride_C),
-      batch_stride_D(args.batch_stride_D),
-      semaphore(static_cast<int *>(workspace)),
-      allow_early_exit(args.allow_early_exit) {
-    }
-
-    CUTLASS_HOST_DEVICE
-    void update(
-      Arguments const &args,
-      void *workspace = nullptr) {
-
-      ptr_A = const_cast<void *>(args.ptr_A);
-      ptr_B = const_cast<void *>(args.ptr_A);
-      ptr_C = const_cast<void *>(args.ptr_C);
-      ptr_D = args.ptr_D;
-
-      output_op = args.epilogue;
-
-      semaphore = static_cast<int *>(workspace);
-    }
-
-  };
-
-  /// Shared memory storage structure
-  union SharedStorage {
-    typename Mma::SharedStorage main_loop;
-    typename Epilogue::SharedStorage epilogue;
-  };
-
-public:
-
-  //
-  // Methods
-  //
-
-  CUTLASS_DEVICE
-  RankKUniversal() { } 
-
-  /// Determines whether kernel satisfies alignment
-  static Status can_implement(
-    cutlass::gemm::GemmCoord const & problem_size) {
-
-    static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
-    static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
-    static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
-
-    if ((problem_size.m() % kAlignmentA) || (problem_size.k() % kAlignmentA) ||
-      (problem_size.n() % kAlignmentB) || (problem_size.k() % kAlignmentB) ||
-      (problem_size.m() % kAlignmentC) || (problem_size.n() % kAlignmentC)) {
-
-      return Status::kErrorMisalignedOperand;
-    }
-
-    return Status::kSuccess;
-  }
-
-  static Status can_implement(Arguments const &args) {
-    return can_implement(args.problem_size);
-  }
-
-  /// Executes one GEMM
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-
-    // Compute threadblock location
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord threadblock_tile_offset =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    // Early exit following LAPACK's definition
-    if (params.allow_early_exit &&
-        (params.output_op.alpha == ElementC(0)) && (params.output_op.beta == ElementC(1))) {
-      return;
-    }
-
-    // Early exit if CTA is out of range
-    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
-      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
-      return;
-    }
-   
-    // Early exit if Fill Mode is Lower and
-    // if the entire tile is above the main diagonal (bottom-left corner is at or above the diagonal)
-    if (kFillModeC == cutlass::FillMode::kLower &&
-        (threadblock_tile_offset.m() + 1) * Mma::Shape::kM <= threadblock_tile_offset.n() * Mma::Shape::kN) {
-      return;
-    }    
-    
-    // Early exit if Fill Mode is Upper and
-    // if the entire tile is below the main diagonal (top-right corner is at or below the diagonal)
-    if (kFillModeC == cutlass::FillMode::kUpper &&
-        threadblock_tile_offset.m() * Mma::Shape::kM >= (threadblock_tile_offset.n() + 1) * Mma::Shape::kN) {
-      return;
-    }    
-    
-    bool tile_on_diagonal = false;
-    // Mark tiles that are being crossed by the main diagonal
-    // (top-right and bottom-left corners are on either side of the diagonal)
-    if ((threadblock_tile_offset.m() + 1) * Mma::Shape::kM > threadblock_tile_offset.n() * Mma::Shape::kN
-        && threadblock_tile_offset.m() * Mma::Shape::kM < (threadblock_tile_offset.n() + 1) * Mma::Shape::kN) {
-      tile_on_diagonal = true;
-    }
-
-    int offset_k = 0;
-    int problem_size_k = params.problem_size.k();
-
-    ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A); 
-    ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);
-
-    //
-    // Fetch pointers based on mode.
-    //
-    if (params.mode == GemmUniversalMode::kGemm || 
-      params.mode == GemmUniversalMode::kGemmSplitKParallel) {
-
-      if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {
-
-        problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size; 
-      }
-
-      offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
-    }
-    else if (params.mode == GemmUniversalMode::kBatched) {
-      ptr_A += threadblock_tile_offset.k() * params.batch_stride_A;
-      ptr_B += threadblock_tile_offset.k() * params.batch_stride_B;
-    }
-    else if (params.mode == GemmUniversalMode::kArray) {
-      ptr_A = static_cast<ElementA * const *>(params.ptr_A)[threadblock_tile_offset.k()];
-      ptr_B = static_cast<ElementB * const *>(params.ptr_B)[threadblock_tile_offset.k()];
-    }
-
-    __syncthreads();
-
-    // Compute initial location in logical coordinates
-    cutlass::MatrixCoord tb_offset_A{
-      threadblock_tile_offset.m() * Mma::Shape::kM,
-      offset_k,
-    };
-
-    cutlass::MatrixCoord tb_offset_B{
-      offset_k,
-      threadblock_tile_offset.n() * Mma::Shape::kN
-    };
-
-
-    // Compute position within threadblock
-    int thread_idx = threadIdx.x;
-
-    // Construct iterators to A and B operands
-    typename Mma::IteratorA iterator_A(
-      params.params_A,
-      ptr_A,
-      {params.problem_size.m(), problem_size_k},
-      thread_idx,
-      tb_offset_A);
-
-    typename Mma::IteratorB iterator_B(
-      params.params_B,
-      ptr_B,
-      {problem_size_k, params.problem_size.n()},
-      thread_idx,
-      tb_offset_B);
-
-    // Broadcast the warp_id computed by lane 0 to ensure dependent code
-    // is compiled as warp-uniform.
-    int warp_idx = canonical_warp_idx_sync();
-
-    int lane_idx = threadIdx.x % 32;
-
-    //
-    // Main loop
-    //
-
-    // Construct thread-scoped matrix multiply
-    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
-
-    typename Mma::FragmentC accumulators;
-
-    accumulators.clear();
-
-    // Compute threadblock-scoped matrix multiply-add
-    int gemm_k_iterations = (problem_size_k - offset_k + Mma::Shape::kK - 1) / Mma::Shape::kK;
-
-    // Compute threadblock-scoped matrix multiply-add
-    mma(
-      gemm_k_iterations, 
-      accumulators, 
-      iterator_A, 
-      iterator_B, 
-      accumulators);
-
-    //
-    // Epilogue
-    //
-
-    EpilogueOutputOp output_op(params.output_op);
-
-    //
-    // Masked tile iterators constructed from members
-    //
-
-    threadblock_tile_offset =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    //assume identity swizzle
-    MatrixCoord threadblock_offset(
-      threadblock_tile_offset.m() * Mma::Shape::kM,
-      threadblock_tile_offset.n() * Mma::Shape::kN
-    );
-
-    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
-
-    ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C); 
-    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
-
-    //
-    // Fetch pointers based on mode.
-    //
-    
-    // Construct the semaphore.
-    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
-
-    if (params.mode == GemmUniversalMode::kGemm) {
-
-      // If performing a reduction via split-K, fetch the initial synchronization
-      if (params.grid_tiled_shape.k() > 1) {
-        
-        // Fetch the synchronization lock initially but do not block.
-        semaphore.fetch();
-
-        // Indicate which position in a serial reduction the output operator is currently updating
-        output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
-      }
-    }
-    else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) {
-      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
-    }
-    else if (params.mode == GemmUniversalMode::kBatched) {
-      ptr_C += threadblock_tile_offset.k() * params.batch_stride_C;
-      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
-    }
-    else if (params.mode == GemmUniversalMode::kArray) {
-      ptr_C = static_cast<ElementC * const *>(params.ptr_C)[threadblock_tile_offset.k()];
-      ptr_D = static_cast<ElementC * const *>(params.ptr_D)[threadblock_tile_offset.k()];
-    }
-
-    
-    // If CTA not on diagonal, FillMode doesn't apply. 
-    FillMode kFillModeCTA = tile_on_diagonal ? kFillModeC : FillMode::kNone;
-
-    // Tile iterator loading from source tensor.
-    typename Epilogue::OutputTileIterator iterator_C(
-      params.params_C,
-      ptr_C,
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset,
-      kFillModeCTA
-    );
-
-    // Tile iterator writing to destination tensor.
-    typename Epilogue::OutputTileIterator iterator_D(
-      params.params_D,
-      ptr_D,
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset,
-      kFillModeCTA
-    );
-
-    Epilogue epilogue(
-      shared_storage.epilogue, 
-      thread_idx, 
-      warp_idx, 
-      lane_idx);
-
-    // Wait on the semaphore - this latency may have been covered by iterator construction
-    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
-        
-      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
-      if (threadblock_tile_offset.k()) {
-        iterator_C = iterator_D;
-      }
-
-      semaphore.wait(threadblock_tile_offset.k());
-
-      __threadfence();
-    }
-
-    // Execute the epilogue operator to update the destination tensor.
-    epilogue(
-      output_op, 
-      iterator_D, 
-      accumulators, 
-      iterator_C); 
-    
-    //
-    // Release the semaphore
-    //
-
-    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) { 
-
-      int lock = 0;
-      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
-
-        // The final threadblock resets the semaphore for subsequent grids.
-        lock = 0;
-      }
-      else {
-        // Otherwise, the semaphore is incremented
-        lock = threadblock_tile_offset.k() + 1;
-      }
-      
-      semaphore.release(lock);
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm70_gemm.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm70_gemm.hpp
deleted file mode 100755
index b6ad7613d..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm70_gemm.hpp
+++ /dev/null
@@ -1,270 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/kernel_hardware_info.hpp"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/dispatch_policy.hpp"
-
-#include "cute/tensor.hpp"
-
-namespace cutlass::gemm::kernel {
-
-///////////////////////////////////////////////////////////////////////////////
-
-template <
-  class ProblemShape_,
-  class CollectiveMainloop_,
-  class CollectiveEpilogue_,
-  class TileScheduler_
->
-class GemmUniversal<
-  ProblemShape_,
-  CollectiveMainloop_,
-  CollectiveEpilogue_,
-  TileScheduler_,
-  cute::enable_if_t<cute::is_base_of_v<KernelMultistage, typename CollectiveMainloop_::DispatchPolicy::Schedule>>>
-{
-public:
-  //
-  // Type Aliases
-  //
-  using ProblemShape = ProblemShape_;
-  static_assert(rank(ProblemShape{}) == 3 or rank(ProblemShape{}) == 4,
-    "ProblemShape{} should be <M,N,K> or <M,N,K,L>");
-
-  // Mainloop derived types
-  using CollectiveMainloop = CollectiveMainloop_;
-  using TileShape = typename CollectiveMainloop::TileShape;
-  using TiledMma  = typename CollectiveMainloop::TiledMma;
-  using ArchTag   = typename CollectiveMainloop::ArchTag;
-  using ElementA  = typename CollectiveMainloop::ElementA;
-  using StrideA   = typename CollectiveMainloop::StrideA;
-  using ElementB  = typename CollectiveMainloop::ElementB;
-  using StrideB   = typename CollectiveMainloop::StrideB;
-  using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy;
-  using ElementAccumulator = typename CollectiveMainloop::ElementAccumulator;
-  using MainloopArguments = typename CollectiveMainloop::Arguments;
-  using MainloopParams = typename CollectiveMainloop::Params;
-
-  using TileSchedulerTag = TileScheduler_;
-  using TileScheduler = typename detail::TileSchedulerSelector<
-    TileScheduler_, ArchTag, TileShape,
-    cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>>::Scheduler;
-  using TileSchedulerArguments = typename TileScheduler::Arguments;
-  static constexpr bool IsGdcEnabled = false;
-
-  static constexpr bool is_valid_tile_scheduler =
-  cute::is_void_v<TileScheduler_> or cute::is_same_v<TileScheduler_, PersistentScheduler>;
-static_assert(is_valid_tile_scheduler, "SM70 kernel does not support specializing the tile scheduler.");
-
-  // Epilogue derived types
-  using CollectiveEpilogue = CollectiveEpilogue_;
-  using ElementC = typename CollectiveEpilogue::ElementC;
-  using StrideC  = typename CollectiveEpilogue::StrideC;
-  using ElementD = typename CollectiveEpilogue::ElementD;
-  using StrideD  = typename CollectiveEpilogue::StrideD;
-  using EpilogueArguments = typename CollectiveEpilogue::Arguments;
-  using EpilogueParams = typename CollectiveEpilogue::Params;
-  static_assert(cute::is_same_v<ElementAccumulator, typename CollectiveEpilogue::ElementAccumulator>,
-    "Mainloop and epilogue do not agree on accumulator value type.");
-
-  // MSVC requires the cast to fix a warning-as-error.
-  static constexpr int SharedStorageSize = static_cast<int>(cute::max(
-      sizeof(typename CollectiveMainloop::SharedStorage),
-      sizeof(typename CollectiveEpilogue::SharedStorage)));
-
-  static constexpr uint32_t MaxThreadsPerBlock = CUTE_STATIC_V(cute::size(TiledMma{}));
-  static constexpr uint32_t MinBlocksPerMultiprocessor = 1;
-
-  // Device side arguments
-  struct Arguments {
-    GemmUniversalMode mode{};
-    ProblemShape problem_shape{};
-    MainloopArguments mainloop{};
-    EpilogueArguments epilogue{};
-    KernelHardwareInfo hw_info{};
-    TileSchedulerArguments scheduler{};
-  };
-
-  // Kernel entry point API
-  struct Params {
-    GemmUniversalMode mode{};
-    ProblemShape problem_shape{};
-    MainloopParams mainloop{};
-    EpilogueParams epilogue{};
-  };
-
-  //
-  // Methods
-  //
-
-  // Convert to underlying arguments. In this case, a simple copy for the aliased type.
-  static
-  Params
-  to_underlying_arguments(Arguments const& args, void* workspace) {
-    (void) workspace;
-
-    KernelHardwareInfo hw_info{args.hw_info.device_id, args.hw_info.sm_count};
-    auto problem_shape_MNKL = append<4>(args.problem_shape, Int<1>{});
-
-    return {
-      args.mode,
-      args.problem_shape,
-      CollectiveMainloop::to_underlying_arguments(args.problem_shape, args.mainloop, workspace),
-      CollectiveEpilogue::to_underlying_arguments(args.problem_shape, args.epilogue, workspace)
-    };
-  }
-
-  static bool
-  can_implement(Arguments const& args) {
-    bool mode_implementable = args.mode == GemmUniversalMode::kGemm or
-          (args.mode == GemmUniversalMode::kBatched && rank(ProblemShape{}) == 4);
-    return mode_implementable && TileScheduler::can_implement(args.scheduler);
-  }
-
-  static size_t
-  get_workspace_size(Arguments const& args) {
-    size_t workspace_size = 0;
-    return workspace_size;
-  }
-
-  static
-  cutlass::Status
-  initialize_workspace(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr, 
-    CudaHostAdapter* cuda_adapter = nullptr) {
-    cutlass::Status status = Status::kSuccess;
-
-    return status;
-  }
-
-  static dim3
-  get_grid_shape(Params const& params) {
-    int batch_count = 1;
-    if constexpr (cute::rank(ProblemShape{}) == 4) {
-      batch_count = cute::size<3>(params.problem_shape);
-    }
-
-    return dim3(
-      cute::size(cute::ceil_div(cute::shape<0>(params.problem_shape), cute::shape<0>(TileShape{}))),
-      cute::size(cute::ceil_div(cute::shape<1>(params.problem_shape), cute::shape<1>(TileShape{}))),
-      batch_count
-    );
-  }
-
-  static dim3
-  get_block_shape() {
-    return dim3(MaxThreadsPerBlock, 1, 1);
-  }
-
-  CUTLASS_DEVICE
-  void
-  operator()(Params const& params, char* smem_buf) {
-    using namespace cute;
-    using X = Underscore;
-
-    // Preconditions
-    CUTE_STATIC_ASSERT(is_static<TileShape>::value);
-
-    // Separate out problem shape for convenience
-    // Optionally append 1s until problem shape is rank-4 in case its is only rank-3 (MNK)
-    auto problem_shape_MNKL = append<4>(params.problem_shape, Int<1>{});
-    auto [M,N,K,L] = problem_shape_MNKL;
-
-    // Preconditions
-    static_assert(cute::rank(StrideA{}) == 3, "StrideA must be rank-3: [M, K, L]. If batch mode is not needed, set L stride to Int<0>.");
-    static_assert(cute::rank(StrideB{}) == 3, "StrideB must be rank-3: [N, K, L]. If batch mode is not needed, set L stride to Int<0>.");
-    static_assert(cute::rank(StrideC{}) == 3, "StrideC must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
-    static_assert(cute::rank(StrideD{}) == 3, "StrideD must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
-
-    // Get the appropriate blocks for this thread block -- potential for thread block locality
-    int thread_idx = int(threadIdx.x);
-    auto blk_shape = TileShape{};                                                                // (BLK_M,BLK_N,BLK_K)
-    auto [m_coord, n_coord, l_coord] = static_cast<uint3>(blockIdx);
-    auto blk_coord_mnkl = make_coord(m_coord, n_coord, _, l_coord);                                        // (m,n,k,l)
-
-    // Represent the full tensors
-    Tensor mA_mkl = make_tensor(make_gmem_ptr(params.mainloop.ptr_A), make_shape(M,K,L), params.mainloop.dA); //(m,k,l)
-    Tensor mB_nkl = make_tensor(make_gmem_ptr(params.mainloop.ptr_B), make_shape(N,K,L), params.mainloop.dB); //(n,k,l)
-
-    // Get batch slice
-    Tensor mA_mk = mA_mkl(_,_,l_coord);                                                                        // (m,k)
-    Tensor mB_nk = mB_nkl(_,_,l_coord);                                                                        // (n,k)
-
-    // Slice to get the tiles this thread block is responsible for
-    Tensor gA = local_tile(mA_mk, blk_shape, take<0,3>(blk_coord_mnkl), Step<_1, X,_1>{});           // (BLK_M,BLK_K,k)
-    Tensor gB = local_tile(mB_nk, blk_shape, take<0,3>(blk_coord_mnkl), Step< X,_1,_1>{});           // (BLK_N,BLK_K,k)
-
-    // Compute tile residues for predication
-    auto m_max_coord = M - size<0>(gA) * get<0>(blk_coord_mnkl);                             // M - BLK_M * m_coord
-    auto n_max_coord = N - size<0>(gB) * get<1>(blk_coord_mnkl);                             // N - BLK_N * n_coord
-    auto k_residue   = K - size<1>(gA) * size<2>(gA);                                        // K - BLK_K * k_coord_max
-    auto residue_mnk = make_tuple(m_max_coord, n_max_coord, k_residue);
-
-    // Allocate the tiled_mma and the accumulators for the (M,N) blk_shape
-    TiledMma tiled_mma;
-    Tensor accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape)); // (MMA,MMA_M,MMA_N)
-    clear(accumulators);
-
-    auto k_tile_iter  = cute::make_coord_iterator(shape<2>(gA));
-    int  k_tile_count = size<2>(gA);
-
-    // Perform the collective scoped MMA
-    CollectiveMainloop collective_mma;
-    collective_mma(
-      accumulators,
-      gA,
-      gB,
-      accumulators,
-      k_tile_iter, k_tile_count,
-      residue_mnk,
-      thread_idx,
-      smem_buf
-    );
-    // Epilogue and write to gD
-    CollectiveEpilogue epilogue{params.epilogue};
-    epilogue(
-      problem_shape_MNKL,
-      blk_shape,
-      blk_coord_mnkl,
-      accumulators,
-      tiled_mma,
-      residue_mnk,
-      thread_idx,
-      smem_buf
-    );
-  }
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::gemm::kernel
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_cooperative.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_cooperative.hpp
deleted file mode 100755
index 823e919ed..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_cooperative.hpp
+++ /dev/null
@@ -1,881 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/workspace.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/kernel_hardware_info.hpp"
-#include "cute/arch/cluster_sm90.hpp"
-#include "cutlass/arch/reg_reconfig.h"
-#include "cutlass/arch/mma_sm90.h"
-#include "cutlass/epilogue/collective/detail.hpp"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/dispatch_policy.hpp"
-#include "cutlass/gemm/kernel/gemm_universal_decl.h"
-#include "cutlass/gemm/kernel/tile_scheduler.hpp"
-#include "cutlass/gemm/group_array_problem_shape.hpp"
-#include "cutlass/pipeline/pipeline.hpp"
-#include "cute/tensor.hpp"
-#include "cutlass/trace.h"
-#include "cutlass/gemm/kernel/sm90_tile_scheduler.hpp"
-#include "cutlass/gemm/kernel/sm90_tile_scheduler_group.hpp"
-
-///////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm::kernel {
-
-///////////////////////////////////////////////////////////////////////////////
-
-template <
-  class ProblemShape_,
-  class CollectiveMainloop_,
-  class CollectiveEpilogue_,
-  class TileScheduler_
->
-class GemmUniversal<
-  ProblemShape_,
-  CollectiveMainloop_,
-  CollectiveEpilogue_,
-  TileScheduler_,
-  cute::enable_if_t<cute::is_base_of_v<KernelPtrArrayTmaWarpSpecializedCooperative, typename CollectiveMainloop_::DispatchPolicy::Schedule>>
->
-{
-public:
-  //
-  // Type Aliases
-  //
-  using ProblemShape = ProblemShape_;
-  static_assert(rank(typename ProblemShape::UnderlyingProblemShape{}) == 3 or rank(typename ProblemShape::UnderlyingProblemShape{}) == 4,
-    "ProblemShape{} should be <M,N,K> or <M,N,K,L>");
-
-  static_assert(cute::is_base_of_v<KernelPtrArrayTmaWarpSpecializedCooperative, typename CollectiveMainloop_::DispatchPolicy::Schedule>);
-
-  static constexpr bool IsGdcEnabled = false;
-
-  // Mainloop derived types
-  using CollectiveMainloop = CollectiveMainloop_;
-  using TileShape = typename CollectiveMainloop::TileShape;
-  using TiledMma  = typename CollectiveMainloop::TiledMma;
-  using ArchTag   = typename CollectiveMainloop::ArchTag;
-  using ElementA  = typename CollectiveMainloop::ElementA;
-  using StrideA   = typename CollectiveMainloop::StrideA;
-  using InternalStrideA = typename CollectiveMainloop::InternalStrideA;
-  using ElementB  = typename CollectiveMainloop::ElementB;
-  using InternalStrideB = typename CollectiveMainloop::InternalStrideB;
-  using StrideB   = typename CollectiveMainloop::StrideB;
-  using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy;
-  using Schedule = typename DispatchPolicy::Schedule;
-  using ElementAccumulator = typename CollectiveMainloop::ElementAccumulator;
-  using ClusterShape = typename DispatchPolicy::ClusterShape;
-  using MainloopArguments = typename CollectiveMainloop::Arguments;
-  using MainloopParams = typename CollectiveMainloop::Params;
-
-  // Epilogue derived types
-  using CollectiveEpilogue = CollectiveEpilogue_;
-  using ElementC = typename CollectiveEpilogue::ElementC;
-  using StrideC  = typename CollectiveEpilogue::StrideC;
-  using InternalStrideC = typename CollectiveEpilogue::InternalStrideC;
-  using ElementD = typename CollectiveEpilogue::ElementD;
-  using StrideD  = typename CollectiveEpilogue::StrideD;
-  using InternalStrideD = typename CollectiveEpilogue::InternalStrideD;
-  using EpilogueArguments = typename CollectiveEpilogue::Arguments;
-  using EpilogueParams = typename CollectiveEpilogue::Params;
-
-  static_assert(ArchTag::kMinComputeCapability >= 90);
-  static_assert(cute::is_void_v<TileScheduler_>,
-    "Ptr-Array Cooperative and Grouped Gemm Cooperative kernel only supports the default scheduler.");
-
-  static constexpr bool IsGroupedGemmKernel = !cute::is_same_v<InternalStrideA, StrideA>;
-
-  using TileScheduler = cute::conditional_t<IsGroupedGemmKernel,
-    typename detail::TileSchedulerSelector<
-      GroupScheduler, ArchTag,
-      TileShape, ClusterShape,
-      ProblemShape>::Scheduler,
-    typename detail::TileSchedulerSelector<
-    void, ArchTag, TileShape, ClusterShape>::Scheduler>;
-  using TileSchedulerArguments = typename TileScheduler::Arguments;
-  using TileSchedulerParams = typename TileScheduler::Params;
-
-  static constexpr uint32_t NumLoadWarpGroups = 1;
-  static constexpr uint32_t NumMmaThreads = CUTE_STATIC_V(size(TiledMma{}));
-  static constexpr uint32_t NumMmaWarpGroups = NumMmaThreads / NumThreadsPerWarpGroup;
-  static constexpr uint32_t MaxThreadsPerBlock = NumMmaThreads + (NumLoadWarpGroups * NumThreadsPerWarpGroup);
-  static constexpr uint32_t MinBlocksPerMultiprocessor = 1;
-
-  /// Register requirement for Load and Math WGs
-  static constexpr uint32_t LoadRegisterRequirement = 40;
-  static constexpr uint32_t MmaRegisterRequirement = 232;
-
-  // 1 stage ordered sequence between mainloop and epilogue producer load threads
-  using LoadWarpOrderBarrier = cutlass::OrderedSequenceBarrier<1,2>;
-
-  // Kernel level shared memory storage
-  struct SharedStorage {
-    struct TensorStorage : cute::aligned_struct<128, _1> {
-      using MainloopTensorStorage = typename CollectiveMainloop::TensorStorage;
-      using EpilogueTensorStorage = typename CollectiveEpilogue::TensorStorage;
-
-      MainloopTensorStorage mainloop;
-      EpilogueTensorStorage epilogue;
-    } tensors;
-
-    struct PipelineStorage : cute::aligned_struct<16, _1> {
-      using MainloopPipelineStorage = typename CollectiveMainloop::PipelineStorage;
-      using EpiLoadPipelineStorage = typename CollectiveEpilogue::PipelineStorage;
-
-      alignas(16) MainloopPipelineStorage mainloop;
-      alignas(16) EpiLoadPipelineStorage epi_load;
-      alignas(16) typename LoadWarpOrderBarrier::SharedStorage load_order;
-    } pipelines;
-
-    struct TensorMapStorage : cute::aligned_struct<128, _1> {
-      using MainloopTensorMapStorage = typename CollectiveMainloop::TensorMapStorage;
-      using EpilogueTensorMapStorage = typename CollectiveEpilogue::TensorMapStorage;
-
-      alignas(128) MainloopTensorMapStorage mainloop;
-      alignas(128) EpilogueTensorMapStorage epilogue;
-    } tensormaps;
-  };
-
-  static constexpr int SharedStorageSize = sizeof(SharedStorage);
-
-  // Device side arguments
-  struct Arguments {
-    GemmUniversalMode mode{};
-    ProblemShape problem_shape{};
-    MainloopArguments mainloop{};
-    EpilogueArguments epilogue{};
-    KernelHardwareInfo hw_info{};
-    TileSchedulerArguments scheduler{};
-  };
-
-  // Kernel entry point API
-  struct Params {
-    GemmUniversalMode mode{};
-    ProblemShape problem_shape{};
-    MainloopParams mainloop{};
-    EpilogueParams epilogue{};
-    KernelHardwareInfo hw_info{};
-    TileSchedulerParams scheduler{};
-    void* workspace{nullptr};
-  };
-
-  //
-  // Methods
-  //
-
-  // Convert to underlying arguments. In this case, a simple copy for the aliased type.
-  static
-  Params
-  to_underlying_arguments(Arguments const& args, void* workspace) {
-    CUTLASS_TRACE_HOST("to_underlying_arguments():");
-
-    ProblemShape problem_shapes = args.problem_shape;
-
-    // Get SM count if needed, otherwise use user supplied SM count
-    int sm_count = args.hw_info.sm_count;
-    if (sm_count <= 0) {
-      CUTLASS_TRACE_HOST("  WARNING: Arguments do not include a valid SM count.\n"
-          "  For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count.");
-      sm_count = KernelHardwareInfo::query_device_multiprocessor_count(args.hw_info.device_id);
-    }
-
-    CUTLASS_TRACE_HOST("to_underlying_arguments(): Setting persistent grid SM count to " << sm_count);
-
-    KernelHardwareInfo hw_info{args.hw_info.device_id, sm_count};
-
-    // Calculate workspace pointers
-    uint8_t* workspace_ptr = reinterpret_cast<uint8_t*>(workspace);
-    size_t workspace_offset = 0;
-
-    void* scheduler_workspace = workspace_ptr;
-    workspace_offset += TileScheduler::template get_workspace_size<typename ProblemShape::UnderlyingProblemShape, ElementAccumulator>(
-      args.scheduler, typename ProblemShape::UnderlyingProblemShape{}, args.hw_info, NumMmaWarpGroups);
-    workspace_offset = round_nearest(workspace_offset,  MinWorkspaceAlignment);
-
-    void* epilogue_workspace = workspace_ptr + workspace_offset;
-    workspace_offset += CollectiveEpilogue::get_workspace_size(problem_shapes, args.epilogue, sm_count);
-    workspace_offset = round_nearest(workspace_offset,  MinWorkspaceAlignment);
-
-    void* mainloop_workspace = workspace_ptr + workspace_offset;
-    workspace_offset += CollectiveMainloop::get_workspace_size(problem_shapes, args.mainloop, sm_count);
-    workspace_offset = round_nearest(workspace_offset,  MinWorkspaceAlignment);
-
-    TileSchedulerParams scheduler;
-    if constexpr (IsGroupedGemmKernel) {
-      scheduler = TileScheduler::to_underlying_arguments(
-      problem_shapes, TileShape{}, ClusterShape{}, hw_info, args.scheduler, scheduler_workspace);
-    }
-    else {
-      scheduler = TileScheduler::to_underlying_arguments(
-      problem_shapes.get_host_problem_shape(), TileShape{}, ClusterShape{}, hw_info, args.scheduler, scheduler_workspace);
-    }
-
-    return {
-      args.mode,
-      problem_shapes,
-      CollectiveMainloop::to_underlying_arguments(problem_shapes, args.mainloop, mainloop_workspace),
-      CollectiveEpilogue::to_underlying_arguments(problem_shapes, args.epilogue, epilogue_workspace),
-      hw_info,
-      scheduler,
-      workspace
-    };
-  }
-
-  static bool
-  can_implement(Arguments const& args) {
-    bool implementable = true;
-    if constexpr (IsGroupedGemmKernel) {
-      // Group GEMM currently only supports rank-3 problem shapes
-      implementable &= (args.mode == GemmUniversalMode::kGrouped && rank(typename ProblemShape::UnderlyingProblemShape{}) == 3);
-    } else {
-      implementable &= (args.mode == GemmUniversalMode::kArray && rank(typename ProblemShape::UnderlyingProblemShape{}) == 4);
-    }
-    if (!implementable) {
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Arguments or Problem Shape don't meet the requirements for Ptr Array Gemm or Grouped Gemm.\n");
-      return implementable;
-    }
-    implementable &= CollectiveMainloop::can_implement(args.problem_shape, args.mainloop);
-    implementable &= CollectiveEpilogue::can_implement(args.problem_shape, args.epilogue);
-    implementable &= TileScheduler::can_implement(args.scheduler);
-    return implementable;
-  }
-
-  static size_t
-  get_workspace_size(Arguments const& args) {
-    size_t workspace_size = 0;
-    constexpr uint32_t NumEpilogueSubTiles = CollectiveEpilogue::get_store_pipe_increment(TileShape{});
-
-    workspace_size += TileScheduler::template get_workspace_size<typename ProblemShape::UnderlyingProblemShape, ElementAccumulator>(
-      args.scheduler, typename ProblemShape::UnderlyingProblemShape{}, args.hw_info, NumMmaWarpGroups, NumEpilogueSubTiles);
-    workspace_size = round_nearest(workspace_size,  MinWorkspaceAlignment);
-
-    // Get SM count if needed, otherwise use user supplied SM count
-    int sm_count = args.hw_info.sm_count;
-    if (sm_count <= 0) {
-      CUTLASS_TRACE_HOST("  WARNING: Arguments do not include a valid SM count.\n"
-          "  For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count.");
-      sm_count = KernelHardwareInfo::query_device_multiprocessor_count(args.hw_info.device_id);
-    }
-
-    workspace_size += CollectiveEpilogue::get_workspace_size(args.problem_shape, args.epilogue, sm_count);
-    workspace_size = round_nearest(workspace_size,  MinWorkspaceAlignment);
-
-    workspace_size += CollectiveMainloop::get_workspace_size(args.problem_shape, args.mainloop, sm_count);
-    workspace_size = round_nearest(workspace_size,  MinWorkspaceAlignment);
-
-    return workspace_size;
-  }
-
-  static cutlass::Status
-  initialize_workspace(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-    Status status = Status::kSuccess;
-    uint8_t* workspace_ptr = reinterpret_cast<uint8_t*>(workspace);
-    size_t workspace_offset = 0;
-    constexpr uint32_t NumEpilogueSubTiles = CollectiveEpilogue::get_store_pipe_increment(TileShape{});
-    static constexpr uint32_t NumAccumulatorMtxs = 1;
-
-    status = TileScheduler::template initialize_workspace<typename ProblemShape::UnderlyingProblemShape, ElementAccumulator>(
-      args.scheduler, workspace_ptr + workspace_offset, stream, typename ProblemShape::UnderlyingProblemShape{}, args.hw_info, NumMmaWarpGroups, NumEpilogueSubTiles, NumAccumulatorMtxs, cuda_adapter);
-    workspace_offset += TileScheduler::template get_workspace_size<typename ProblemShape::UnderlyingProblemShape, ElementAccumulator>(
-      args.scheduler, typename ProblemShape::UnderlyingProblemShape{}, args.hw_info, NumMmaWarpGroups, NumEpilogueSubTiles);
-    workspace_offset = round_nearest(workspace_offset,  MinWorkspaceAlignment);
-    if (status != Status::kSuccess) {
-      return status;
-    }
-
-    status = CollectiveEpilogue::initialize_workspace(args.problem_shape, args.epilogue, workspace_ptr + workspace_offset, stream, cuda_adapter);
-    workspace_offset += CollectiveEpilogue::get_workspace_size(args.problem_shape, args.epilogue, args.hw_info.sm_count);
-    workspace_offset = round_nearest(workspace_offset,  MinWorkspaceAlignment);
-
-    status = CollectiveMainloop::initialize_workspace(args.problem_shape, args.mainloop, workspace_ptr + workspace_offset, stream, cuda_adapter);
-    workspace_offset += CollectiveMainloop::get_workspace_size(args.problem_shape, args.mainloop, args.hw_info.sm_count);
-    workspace_offset = round_nearest(workspace_offset,  MinWorkspaceAlignment);
-
-    if (status != Status::kSuccess) {
-      return status;
-    }
-
-    return status;
-  }
-
-  // Computes the kernel launch grid shape based on runtime parameters
-  static dim3
-  get_grid_shape(Params const& params) {
-    // Given device SM count, set grid size s.t. we do not launch more thread blocks than we can run concurrently
-    TileSchedulerArguments args{};
-    if constexpr (!std::is_const_v<decltype(args.max_swizzle_size)>) {
-      args.max_swizzle_size = 1 << params.scheduler.log_swizzle_size_;
-    }
-    args.raster_order = params.scheduler.raster_order_ == TileScheduler::RasterOrder::AlongN ? TileScheduler::RasterOrderOptions::AlongN : TileScheduler::RasterOrderOptions::AlongM;
-    dim3 grid_shape;
-    if constexpr (IsGroupedGemmKernel) {
-      grid_shape = TileScheduler::get_grid_shape(params.scheduler, params.problem_shape, TileShape{}, ClusterShape{}, params.hw_info, args);
-    }
-    else {
-      grid_shape = TileScheduler::get_grid_shape(params.scheduler, params.problem_shape.get_host_problem_shape(), TileShape{}, ClusterShape{}, params.hw_info, args);
-    }
-    return grid_shape;
-  }
-
-  static dim3
-  get_block_shape() {
-    return dim3(MaxThreadsPerBlock, 1, 1);
-  }
-
-  CUTLASS_DEVICE
-  void
-  operator()(Params const& params, char* smem_buf) {
-    using namespace cute;
-    using X = Underscore;
-
-// Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a.
-#if ! defined(__CUDA_ARCH_FEAT_SM90_ALL)
-    printf("ERROR : Arch conditional MMA instruction used without targeting sm90a compute capability. Aborting.\n");
-#else
-
-    // Preconditions
-    static_assert(size(TiledMma{}) == 256, "Cooperative kernel must have TiledMMA operating using 256 threads.");
-    static_assert(size<0>(TileShape{}) >= 128,
-        "Cooperative kernel requires Tile Size to be greater than or equal to 128 along the M-dimension.");
-    static_assert(NumMmaWarpGroups == 2, "Cooperative kernels currently only support NumMmaWarpGroups == 2");
-
-    if constexpr (cutlass::epilogue::collective::detail::sm90_is_ptr_array_tma_dispatch_policy_v<typename CollectiveEpilogue::DispatchPolicy>) {
-      static_assert(NumMmaWarpGroups == CollectiveEpilogue::NumEpilogueWarpGroups,
-                    "Tiled MmA does not match expected warp groups performing the epilogue");
-    }
-
-    static_assert(cute::rank(InternalStrideA{}) == 3, "StrideA must be rank-3: [M, K, L]. If batch mode is not needed, set L stride to Int<0>.");
-    static_assert(cute::rank(InternalStrideB{}) == 3, "StrideB must be rank-3: [N, K, L]. If batch mode is not needed, set L stride to Int<0>.");
-    static_assert(cute::rank(InternalStrideC{}) == 3, "StrideC must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
-    static_assert(cute::rank(InternalStrideD{}) == 3, "StrideD must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
-
-    /* In the Cooperative kernel, Consumer0 and Consumer1 collaborate on the same tile */
-    enum class WarpGroupRole {
-      Producer = 0,
-      Consumer0 = 1,
-      Consumer1 = 2
-    };
-    enum class ProducerWarpRole {
-      Mainloop = 0,
-      Warp1 = 1,
-      Epilogue = 2,
-      Warp3 = 3
-    };
-
-    // Kernel level shared memory storage
-    SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(smem_buf);
-
-    int thread_idx = int(threadIdx.x);
-    int lane_idx = canonical_lane_idx();
-    int warp_idx = canonical_warp_idx_sync();
-    int warp_idx_in_warp_group = warp_idx % NumWarpsPerWarpGroup;
-    int warp_group_thread_idx = thread_idx % NumThreadsPerWarpGroup;
-    int mma_thread_idx = thread_idx % size(TiledMma{});
-    auto warp_group_idx = canonical_warp_group_idx();
-    auto warp_group_role = WarpGroupRole(warp_group_idx);
-    auto producer_warp_role = ProducerWarpRole(warp_idx_in_warp_group);
-    int lane_predicate = cute::elect_one_sync();
-    uint32_t block_rank_in_cluster = cute::block_rank_in_cluster();
-
-    // Note: Tma Descriptor Prefetch (from either const or param) is not applicable here
-
-    // Mainloop Load pipeline
-    using MainloopPipeline = typename CollectiveMainloop::MainloopPipeline;
-    typename MainloopPipeline::Params mainloop_pipeline_params;
-    if (warp_group_role == WarpGroupRole::Producer && producer_warp_role == ProducerWarpRole::Mainloop) {
-      mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Producer;
-    }
-    if (warp_group_role == WarpGroupRole::Consumer0 || warp_group_role == WarpGroupRole::Consumer1) {
-      mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Consumer;
-    }
-    mainloop_pipeline_params.is_leader = warp_group_thread_idx == 0;
-    mainloop_pipeline_params.num_consumers = size(TiledMma{});
-    mainloop_pipeline_params.transaction_bytes = params.mainloop.tma_transaction_bytes;
-    MainloopPipeline mainloop_pipeline(shared_storage.pipelines.mainloop, mainloop_pipeline_params, ClusterShape{});
-
-    // Epilogue Load pipeline
-    using EpiLoadPipeline = typename CollectiveEpilogue::LoadPipeline;
-    typename EpiLoadPipeline::Params epi_load_pipeline_params;
-    if (warp_group_role == WarpGroupRole::Producer && producer_warp_role == ProducerWarpRole::Epilogue) {
-      epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Producer;
-    }
-    if (warp_group_role == WarpGroupRole::Consumer0 || warp_group_role == WarpGroupRole::Consumer1) {
-      epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Consumer;
-    }
-    epi_load_pipeline_params.dst_blockid = cute::block_rank_in_cluster();
-    epi_load_pipeline_params.producer_arv_count = NumThreadsPerWarp;
-    epi_load_pipeline_params.consumer_arv_count = size(TiledMma{});
-    if constexpr (CollectiveEpilogue::RequiresTransactionBytes) {
-      epi_load_pipeline_params.transaction_bytes = params.epilogue.tma_transaction_bytes;
-    }
-    EpiLoadPipeline epi_load_pipeline(shared_storage.pipelines.epi_load, epi_load_pipeline_params);
-
-    // Epilogue Store pipeline
-    using EpiStorePipeline = typename CollectiveEpilogue::StorePipeline;
-    typename EpiStorePipeline::Params epi_store_pipeline_params;
-    epi_store_pipeline_params.always_wait = true;
-    EpiStorePipeline epi_store_pipeline(epi_store_pipeline_params);
-
-    typename LoadWarpOrderBarrier::Params params_load_order_barrier;
-    params_load_order_barrier.group_id = producer_warp_role == ProducerWarpRole::Mainloop ? 0 : 1;
-    params_load_order_barrier.group_size = NumThreadsPerWarp;
-    LoadWarpOrderBarrier load_order_barrier(shared_storage.pipelines.load_order, params_load_order_barrier);
-
-    // Initialize starting pipeline states for the collectives
-    // Epilogue store pipe is producer-only (consumer is TMA unit, waits via scoreboarding)
-    typename CollectiveMainloop::PipelineState mainloop_pipe_consumer_state;
-    typename CollectiveEpilogue::LoadPipelineState epi_load_pipe_consumer_state;
-
-    // For the DMA Load (producer) we start with an opposite phase
-    // i.e., we skip all waits since we know that the buffer is indeed empty
-    PipelineState mainloop_pipe_producer_state = cutlass::make_producer_start_state<MainloopPipeline>();
-    PipelineState epi_load_pipe_producer_state = cutlass::make_producer_start_state<EpiLoadPipeline>();
-    PipelineState epi_store_pipe_producer_state = cutlass::make_producer_start_state<EpiStorePipeline>();
-
-    auto cluster_wait_fn = [] () {
-      // We need this to guarantee that the Pipeline init is visible
-      // To all producers and consumer thread blocks in the Cluster
-      if constexpr (size(ClusterShape{}) > 1) {
-        cute::cluster_arrive_relaxed();
-        return [] () { cute::cluster_wait(); };
-      }
-      else {
-        __syncthreads();
-        return [] () {}; // do nothing
-      }
-    } ();
-
-    // Get the appropriate blocks for this thread block -- potential for thread block locality
-    TiledMma tiled_mma;
-    const auto blk_shape = TileShape{};                                                                // (BLK_M,BLK_N,BLK_K)
-    const auto c_tile_count = CollectiveEpilogue::get_load_pipe_increment(blk_shape);
-    const auto d_tile_count = CollectiveEpilogue::get_store_pipe_increment(blk_shape);
-
-    TileScheduler scheduler{params.scheduler};
-
-    // In a warp specialized kernel, collectives expose data movement and compute operations separately
-    CollectiveMainloop collective_mainloop;
-    CollectiveEpilogue collective_epilogue(params.epilogue, shared_storage.tensors.epilogue);
-
-    // Wait for all thread blocks in the Cluster
-    cluster_wait_fn();
-
-    auto work_tile_info = scheduler.initial_work_tile_info(ClusterShape{});
-    if (not work_tile_info.is_valid()) {
-      // When problem shapes are only on device, the grid launched may be larger than the total number of blocks across groups
-      return;
-    }
-
-    // Optionally append 1s until problem shape is rank-4 in case it is only rank-3 (MNK)
-    auto problem_shape_MNKL = append<4>(params.problem_shape.get_problem_shape(work_tile_info.L_idx), 1);
-
-    // Prepare and partition the input tensors. Expects a tuple of tensors where:
-    // get<0>(load_inputs) is the tma tensor A after local tiling so that it has shape (BLK_M,BLK_K,m,k,l)
-    // get<1>(load_inputs) is the tma tensor B after local tiling so that it has shape (BLK_N,BLK_K,n,k,l)
-    auto load_inputs = collective_mainloop.load_init(problem_shape_MNKL, params.mainloop);
-    static_assert(cute::tuple_size_v<decltype(load_inputs)> >= 2, "Output of load_init must have at least two elements (A, B)");
-
-    // Extract out partitioned A and B.
-    Tensor gA_mkl = get<0>(load_inputs);
-    Tensor gB_nkl = get<1>(load_inputs);
-
-    // Get pipeline stage increments from tensor shapes
-    auto k_tile_count = size<3>(gA_mkl);
-
-    if (warp_group_role == WarpGroupRole::Producer) {
-      cutlass::arch::warpgroup_reg_dealloc<LoadRegisterRequirement>();
-
-      // Mainloop Producer Warp
-      if (producer_warp_role == ProducerWarpRole::Mainloop) {
-        int32_t curr_batch = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl)); // Usually just returns work_tile_info.L_idx;
-        int32_t const mock_l_coord = 0;
-        int32_t const sm_idx = blockIdx.x + (blockIdx.y * gridDim.x);
-        int32_t const sm_count = params.hw_info.sm_count;
-
-        // Fetch a copy of tensormaps for the CTA
-        auto input_tensormaps = collective_mainloop.tensormaps_init(params.mainloop, shared_storage.tensormaps.mainloop, sm_count, sm_idx);
-
-        // Update tensormap for the initial batch for the CTA
-        if (work_tile_info.is_valid()) {
-          collective_mainloop.tensormaps_perform_update(
-            shared_storage.tensormaps.mainloop,
-            params.mainloop,
-            input_tensormaps,
-            problem_shape_MNKL,
-            curr_batch
-          );
-          // Ensure warp is converged before issuing tensormap fence release
-          __syncwarp();
-          // Entire warp must do this (i.e. it's aligned)
-          collective_mainloop.tensormaps_cp_fence_release(shared_storage.tensormaps.mainloop, input_tensormaps);
-        }
-
-        bool do_load_order_arrive = true;
-        bool did_batch_change = true;
-        while (work_tile_info.is_valid()) {
-          if (!TileScheduler::valid_warpgroup_in_work_tile(work_tile_info)) {
-            auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work(work_tile_info);
-            work_tile_info = next_work_tile_info;
-            continue;
-          }
-
-          // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape
-          auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl));
-          auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl));
-          auto blk_coord = make_coord(m_coord, n_coord, _, mock_l_coord);
-
-          // Get the number of K tiles to compute for this work as well as the starting K tile offset of the work.
-          auto work_k_tile_count = TileScheduler::get_work_k_tile_count(work_tile_info, problem_shape_MNKL, blk_shape);
-          auto work_k_tile_start = TileScheduler::get_work_k_tile_start(work_tile_info);
-          auto k_tile_iter = cute::make_coord_iterator(idx2crd(work_k_tile_start, shape<3>(gA_mkl)), shape<3>(gA_mkl));
-
-          if (did_batch_change) {
-            collective_mainloop.tensormaps_fence_acquire(input_tensormaps);
-          }
-
-          collective_mainloop.load(
-            params.mainloop,
-            mainloop_pipeline,
-            mainloop_pipe_producer_state,
-            load_inputs,
-            input_tensormaps,
-            blk_coord,
-            k_tile_iter, work_k_tile_count,
-            lane_idx,
-            block_rank_in_cluster,
-            shared_storage.tensors.mainloop
-          );
-          // Update starting pipeline state for the next tile
-          // Wait for the last TMA stage to complete loading, before issuing tensormap updates
-          mainloop_pipe_producer_state.advance(work_k_tile_count - 1);
-
-          // Signal for the epilogue load warp to begin
-          if (do_load_order_arrive) {
-            load_order_barrier.arrive();
-            do_load_order_arrive = false;
-          }
-
-          // Get next work tile
-          auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work(work_tile_info);
-          work_tile_info = next_work_tile_info;
-          auto next_batch = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl)); // Usually just returns work_tile_info.L_idx
-          did_batch_change = next_batch != curr_batch;
-          if (work_tile_info.is_valid() && did_batch_change) {
-            curr_batch = next_batch;
-            if constexpr (IsGroupedGemmKernel) {
-              problem_shape_MNKL = append<4>(params.problem_shape.get_problem_shape(curr_batch), 1);
-            }
-            // Purpose of this pipeline state is to make sure TMA loads have finished before doing descriptor updates
-            // Since this state is waiting for loads to finish, it must start in the inverted phase.
-            typename CollectiveMainloop::PipelineState mainloop_pipe_tma_consumer_state =
-              {mainloop_pipe_producer_state.index(), !mainloop_pipe_producer_state.phase(), mainloop_pipe_producer_state.count()};
-            mainloop_pipeline.consumer_wait(mainloop_pipe_tma_consumer_state);
-            collective_mainloop.tensormaps_perform_update(
-              shared_storage.tensormaps.mainloop,
-              params.mainloop,
-              input_tensormaps,
-              problem_shape_MNKL,
-              curr_batch
-            );
-            // Ensure warp is converged before issuing tensor replace
-            __syncwarp();
-            // Entire warp must do this (i.e. it's aligned)
-            collective_mainloop.tensormaps_cp_fence_release(shared_storage.tensormaps.mainloop, input_tensormaps);
-          }
-          // Advance the producer state for the last remaining stage that was being waited for above
-          mainloop_pipe_producer_state.advance(1);
-        } // Scheduler work fetch loop
-
-        // Make sure all Consumer Warp Groups have been waited upon
-        collective_mainloop.load_tail(mainloop_pipeline, mainloop_pipe_producer_state);
-      } // Mainloop Producer Warp End
-
-      // Epilogue Producer Warp
-      else if (producer_warp_role == ProducerWarpRole::Epilogue && collective_epilogue.is_producer_load_needed()) {
-        int32_t const sm_idx = blockIdx.x + (blockIdx.y * gridDim.x);
-        int32_t const sm_count = params.hw_info.sm_count;
-
-        auto epi_load_tensormap = get<0>(collective_epilogue.load_init(params.epilogue, shared_storage.tensormaps.epilogue, sm_count, sm_idx));
-
-        bool did_batch_change = true;
-        constexpr bool IsEpiLoad = true;
-
-        if (work_tile_info.is_valid()) {
-          collective_epilogue.tensormaps_perform_update<IsEpiLoad>(
-            shared_storage.tensormaps.epilogue,
-            params.epilogue,
-            epi_load_tensormap,
-            problem_shape_MNKL,
-            work_tile_info.L_idx,
-            0
-          );
-
-          // Converge before issuing tensormap fence release since fence is aligned
-          __syncwarp();
-          collective_epilogue.tensormaps_cp_fence_release<IsEpiLoad>(shared_storage.tensormaps.epilogue, epi_load_tensormap, 0);
-        }
-
-        load_order_barrier.wait();
-
-        while (work_tile_info.is_valid()) {
-          int32_t curr_batch = work_tile_info.L_idx;
-
-          // Get next work tile
-          auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work(work_tile_info);
-
-          if (TileScheduler::compute_epilogue(work_tile_info, params.scheduler)) {
-            if constexpr (IsGroupedGemmKernel) {
-              problem_shape_MNKL = append<4>(params.problem_shape.get_problem_shape(work_tile_info.L_idx), 1);
-            }
-
-            // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape
-            auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl));
-            auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl));
-            auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl));
-            auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
-
-            if (did_batch_change) {
-              collective_epilogue.tensormaps_fence_acquire<IsEpiLoad>(epi_load_tensormap);
-            }
-
-            bool wait = work_tile_info.is_valid() && curr_batch != next_work_tile_info.L_idx;
-
-            epi_load_pipe_producer_state = collective_epilogue.load(
-              epi_load_pipeline,
-              epi_load_pipe_producer_state,
-              problem_shape_MNKL,
-              blk_shape,
-              blk_coord,
-              tiled_mma,
-              lane_idx,
-              shared_storage.tensors.epilogue,
-              epi_load_tensormap,
-              work_tile_info.reduction_subtile_idx(),
-              wait
-            );
-          }
-
-          work_tile_info = next_work_tile_info;
-          did_batch_change = curr_batch != work_tile_info.L_idx;
-
-          if (work_tile_info.is_valid() && did_batch_change) {
-            if constexpr (IsGroupedGemmKernel) {
-              problem_shape_MNKL = append<4>(params.problem_shape.get_problem_shape(work_tile_info.L_idx), 1);
-            }
-
-            // tensormap update
-            {
-              collective_epilogue.tensormaps_perform_update<IsEpiLoad>(
-                shared_storage.tensormaps.epilogue,
-                params.epilogue,
-                epi_load_tensormap,
-                problem_shape_MNKL,
-                work_tile_info.L_idx,
-                0
-              );
-
-              // Converge before issuing tensormap fence release since fence is aligned
-              __syncwarp();
-              collective_epilogue.tensormaps_cp_fence_release<IsEpiLoad>(shared_storage.tensormaps.epilogue, epi_load_tensormap, 0);
-            }
-          }
-
-        } // Scheduler work fetch loop
-
-        // Make sure all Consumer Warp Groups have been waited upon
-        collective_epilogue.load_tail(epi_load_pipeline, epi_load_pipe_producer_state);
-      } // Epilogue Producer Warp End
-    } // Producer Warp Group End
-
-    else if (warp_group_role == WarpGroupRole::Consumer0 || warp_group_role == WarpGroupRole::Consumer1) {
-      cutlass::arch::warpgroup_reg_alloc<MmaRegisterRequirement>();
-
-      // Index of warp group within consumer warp groups
-      int consumer_warp_group_idx = warp_group_role == WarpGroupRole::Consumer0 ? 0 : 1;
-
-      int32_t const sm_idx = blockIdx.x + (blockIdx.y * gridDim.x);
-      int32_t const sm_count = params.hw_info.sm_count;
-      // Do we potentially issue tail arrives for TMA stores, if epilogue load is waiting for it
-      bool do_store_tail = false;
-      // Get a copy of tensormaps
-      auto epi_store_tensormap = get<0>(collective_epilogue.store_init(params.epilogue, shared_storage.tensormaps.epilogue, sm_count, sm_idx, consumer_warp_group_idx));
-
-      bool did_batch_change = true;
-      constexpr bool IsEpiLoad = false;
-
-      if (work_tile_info.is_valid()) {
-
-        if (warp_idx_in_warp_group == 0) {
-          collective_epilogue.tensormaps_perform_update<IsEpiLoad>(
-            shared_storage.tensormaps.epilogue,
-            params.epilogue,
-            epi_store_tensormap,
-            problem_shape_MNKL,
-            work_tile_info.L_idx,
-            consumer_warp_group_idx
-          );
-
-          // Converge before issuing tensormap fence release since fence is aligned
-          __syncwarp();
-          collective_epilogue.tensormaps_cp_fence_release<IsEpiLoad>(shared_storage.tensormaps.epilogue, 
-                                                                     epi_store_tensormap,
-                                                                     consumer_warp_group_idx);
-        }
-      }
-
-      while (work_tile_info.is_valid()) {
-        if constexpr (IsGroupedGemmKernel) {
-          problem_shape_MNKL = append<4>(params.problem_shape.get_problem_shape(work_tile_info.L_idx), 1);
-        }
-
-        int32_t curr_batch = work_tile_info.L_idx;
-
-        // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape
-        auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl));
-        auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl));
-        auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl));
-        auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
-        auto work_k_tile_count = TileScheduler::get_work_k_tile_count(work_tile_info, problem_shape_MNKL, blk_shape);
-
-        // Allocate the accumulators for the (M,N) blk_shape
-        //
-        // MSVC CTAD breaks if we say "Tensor" here, so we use "auto" instead.
-        auto accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape));               // (MMA,MMA_M,MMA_N)
-
-        static_assert(cute::is_any_of_v<TileScheduler,
-            detail::PersistentTileSchedulerSm90Group<ProblemShape>,
-            detail::PersistentTileSchedulerSm90>);
-        if (TileScheduler::valid_warpgroup_in_work_tile(work_tile_info)) {
-
-          collective_mainloop.mma(
-            mainloop_pipeline,
-            mainloop_pipe_consumer_state,
-            accumulators,
-            work_k_tile_count,
-            mma_thread_idx,
-            shared_storage.tensors.mainloop,
-            params.mainloop
-          );
-
-          // Make sure the math instructions are done and free buffers before entering the epilogue
-          collective_mainloop.mma_tail(
-            mainloop_pipeline,
-            mainloop_pipe_consumer_state,
-            work_k_tile_count
-          );
-
-          // Update starting mainloop pipeline state for the next tile
-          mainloop_pipe_consumer_state.advance(work_k_tile_count);
-        }
-
-        // Perform reduction across splits, if needed
-        TileScheduler::fixup(
-          params.scheduler, work_tile_info, accumulators, NumMmaWarpGroups, consumer_warp_group_idx);
-
-        if (did_batch_change) {
-          collective_epilogue.tensormaps_fence_acquire<IsEpiLoad>(epi_store_tensormap);
-        }
-
-        if (TileScheduler::compute_epilogue(work_tile_info, params.scheduler)) {
-
-          // Epilogue and write to gD
-          auto [epi_load_pipe_consumer_state_next, epi_store_pipe_producer_state_next] =
-          collective_epilogue.store(
-            epi_load_pipeline,
-            epi_load_pipe_consumer_state,
-            epi_store_pipeline,
-            epi_store_pipe_producer_state,
-            problem_shape_MNKL,
-            blk_shape,
-            blk_coord,
-            accumulators,
-            tiled_mma,
-            mma_thread_idx,
-            shared_storage.tensors.epilogue,
-            epi_store_tensormap,
-            work_tile_info.reduction_subtile_idx()
-          );
-
-          epi_load_pipe_consumer_state = epi_load_pipe_consumer_state_next;
-          epi_store_pipe_producer_state = epi_store_pipe_producer_state_next;
-          do_store_tail = true;
-        }
-
-        // Get next work tile
-        auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work(work_tile_info);
-        work_tile_info = next_work_tile_info;
-
-        did_batch_change = curr_batch != work_tile_info.L_idx;
-        if (work_tile_info.is_valid() && did_batch_change) {
-          if constexpr (IsGroupedGemmKernel) {
-            problem_shape_MNKL = append<4>(params.problem_shape.get_problem_shape(work_tile_info.L_idx), 1);
-          }
-          if (warp_idx_in_warp_group == 0) {
-            collective_epilogue.tensormaps_perform_update<IsEpiLoad>(
-              shared_storage.tensormaps.epilogue,
-              params.epilogue,
-              epi_store_tensormap,
-              problem_shape_MNKL,
-              work_tile_info.L_idx,
-              consumer_warp_group_idx
-            );
-
-            // Converge before issuing tensormap fence release since fence is aligned
-            __syncwarp();
-            collective_epilogue.tensormaps_cp_fence_release<IsEpiLoad>(shared_storage.tensormaps.epilogue,
-                                                                       epi_store_tensormap,
-                                                                       consumer_warp_group_idx);
-          }
-        }
-
-      } // Scheduler work fetch loop
-
-      // Cooperative only needs TMA to complete at the very end of the kernel
-      if (do_store_tail) {
-        collective_epilogue.store_tail(
-          epi_load_pipeline,
-          epi_load_pipe_consumer_state,
-          epi_store_pipeline,
-          epi_store_pipe_producer_state
-        );
-      }
-    } // Consumer Warp Groups End
-#endif
-  }
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::gemm::kernel
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_pingpong.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_pingpong.hpp
deleted file mode 100755
index 386337641..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_pingpong.hpp
+++ /dev/null
@@ -1,946 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/workspace.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/kernel_hardware_info.hpp"
-#include "cute/arch/cluster_sm90.hpp"
-#include "cutlass/arch/reg_reconfig.h"
-#include "cutlass/arch/mma_sm90.h"
-#include "cutlass/epilogue/collective/detail.hpp"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/dispatch_policy.hpp"
-#include "cutlass/gemm/kernel/gemm_universal_decl.h"
-#include "cutlass/gemm/kernel/tile_scheduler.hpp"
-#include "cutlass/gemm/group_array_problem_shape.hpp"
-#include "cutlass/pipeline/pipeline.hpp"
-#include "cute/tensor.hpp"
-#include "cutlass/trace.h"
-#include "cutlass/gemm/kernel/sm90_tile_scheduler.hpp"
-#include "cutlass/gemm/kernel/sm90_tile_scheduler_group.hpp"
-
-///////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm::kernel {
-
-///////////////////////////////////////////////////////////////////////////////
-
-template <
-  class ProblemShape_,
-  class CollectiveMainloop_,
-  class CollectiveEpilogue_,
-  class TileScheduler_
->
-class GemmUniversal<
-  ProblemShape_,
-  CollectiveMainloop_,
-  CollectiveEpilogue_,
-  TileScheduler_,
-  cute::enable_if_t<cute::is_base_of_v<KernelPtrArrayTmaWarpSpecializedPingpong, typename CollectiveMainloop_::DispatchPolicy::Schedule>>
->
-{
-public:
-  //
-  // Type Aliases
-  //
-  using ProblemShape = ProblemShape_;
-  static_assert(rank(typename ProblemShape::UnderlyingProblemShape{}) == 3 or rank(typename ProblemShape::UnderlyingProblemShape{}) == 4,
-    "ProblemShape{} should be <M,N,K> or <M,N,K,L>");
-
-  static_assert(cute::is_base_of_v<KernelPtrArrayTmaWarpSpecializedPingpong, typename CollectiveMainloop_::DispatchPolicy::Schedule>);
-
-  static constexpr bool IsGdcEnabled = false;
-
-  // Mainloop derived types
-  using CollectiveMainloop = CollectiveMainloop_;
-  using TileShape = typename CollectiveMainloop::TileShape;
-  using TiledMma  = typename CollectiveMainloop::TiledMma;
-  using ArchTag   = typename CollectiveMainloop::ArchTag;
-  using ElementA  = typename CollectiveMainloop::ElementA;
-  using StrideA   = typename CollectiveMainloop::StrideA;
-  using InternalStrideA = typename CollectiveMainloop::InternalStrideA;
-  using ElementB  = typename CollectiveMainloop::ElementB;
-  using InternalStrideB = typename CollectiveMainloop::InternalStrideB;
-  using StrideB   = typename CollectiveMainloop::StrideB;
-  using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy;
-  using Schedule = typename DispatchPolicy::Schedule;
-  using ElementAccumulator = typename CollectiveMainloop::ElementAccumulator;
-  using ClusterShape = typename DispatchPolicy::ClusterShape;
-  using MainloopArguments = typename CollectiveMainloop::Arguments;
-  using MainloopParams = typename CollectiveMainloop::Params;
-
-  // Epilogue derived types
-  using CollectiveEpilogue = CollectiveEpilogue_;
-  using ElementC = typename CollectiveEpilogue::ElementC;
-  using StrideC  = typename CollectiveEpilogue::StrideC;
-  using InternalStrideC = typename CollectiveEpilogue::InternalStrideC;
-  using ElementD = typename CollectiveEpilogue::ElementD;
-  using StrideD  = typename CollectiveEpilogue::StrideD;
-  using InternalStrideD = typename CollectiveEpilogue::InternalStrideD;
-  using EpilogueArguments = typename CollectiveEpilogue::Arguments;
-  using EpilogueParams = typename CollectiveEpilogue::Params;
-
-  static_assert(ArchTag::kMinComputeCapability >= 90);
-  static_assert(cute::is_void_v<TileScheduler_>,
-    "Ptr-Array Pingpong and Grouped Gemm Pingpong kernel only supports the default scheduler.");
-
-  static constexpr bool IsGroupedGemmKernel = !cute::is_same_v<InternalStrideA, StrideA>;
-
-  using TileScheduler = cute::conditional_t<IsGroupedGemmKernel,
-    typename detail::TileSchedulerSelector<
-      GroupScheduler, ArchTag,
-      TileShape, ClusterShape,
-      ProblemShape>::Scheduler,
-    typename detail::TileSchedulerSelector<
-    void, ArchTag, TileShape, ClusterShape>::Scheduler>;
-  using TileSchedulerArguments = typename TileScheduler::Arguments;
-  using TileSchedulerParams = typename TileScheduler::Params;
-
-  static constexpr uint32_t NumLoadWarpGroups = 1;
-  static constexpr uint32_t NumMmaWarpGroups = 2;
-  static constexpr uint32_t MaxThreadsPerBlock = CUTE_STATIC_V(size(TiledMma{})) + (NumMmaWarpGroups * NumThreadsPerWarpGroup);
-  static constexpr uint32_t MinBlocksPerMultiprocessor = 1;
-
-  /// Register requirement for Load and Math WGs
-  static constexpr uint32_t LoadRegisterRequirement = 40;
-  static constexpr uint32_t MmaRegisterRequirement = 232;
-
-  // 1 stage ordered sequence between mainloop and epilogue producer load threads
-  using LoadWarpOrderBarrier = cutlass::OrderedSequenceBarrier<1,2>;
-
-  // Order Sequence barrier with two stages: one for Mainloop and one for Epilogue
-  static constexpr uint32_t StagesPerMathWarpGroup = 2;
-  using MathWarpGroupOrderBarrier = cutlass::OrderedSequenceBarrier<StagesPerMathWarpGroup, NumMmaWarpGroups>;
-  using MathWarpGroupOrderBarrierSharedStorage = cutlass::PipelineDetail::OrderedSequenceBarrierSharedStorage<
-      MathWarpGroupOrderBarrier::SequenceDepth,
-      MathWarpGroupOrderBarrier::SequenceLength>;
-
-  // Kernel level shared memory storage
-  struct SharedStorage {
-    struct TensorStorage : cute::aligned_struct<128, _1> {
-      using MainloopTensorStorage = typename CollectiveMainloop::TensorStorage;
-      using EpilogueTensorStorage = typename CollectiveEpilogue::TensorStorage;
-
-      MainloopTensorStorage mainloop;
-      EpilogueTensorStorage epilogue;
-    } tensors;
-
-    struct PipelineStorage : cute::aligned_struct<16, _1> {
-      using MainloopPipelineStorage = typename CollectiveMainloop::PipelineStorage;
-      using EpiLoadPipelineStorage = typename CollectiveEpilogue::PipelineStorage;
-      using MathWarpGroupOrderBarrierStorage = MathWarpGroupOrderBarrierSharedStorage;
-
-      alignas(16) MainloopPipelineStorage mainloop;
-      alignas(16) EpiLoadPipelineStorage epi_load;
-      alignas(16) typename LoadWarpOrderBarrier::SharedStorage load_order;
-      alignas(16) MathWarpGroupOrderBarrierStorage math_wg_order;
-    } pipelines;
-
-    struct TensorMapStorage : cute::aligned_struct<128, _1> {
-      using MainloopTensorMapStorage = typename CollectiveMainloop::TensorMapStorage;
-      using EpilogueTensorMapStorage = typename CollectiveEpilogue::TensorMapStorage;
-
-      alignas(128) MainloopTensorMapStorage mainloop;
-      alignas(128) EpilogueTensorMapStorage epilogue;
-    } tensormaps;
-  };
-
-  static constexpr int SharedStorageSize = sizeof(SharedStorage);
-
-  // Device side arguments
-  struct Arguments {
-    GemmUniversalMode mode{};
-    ProblemShape problem_shape{};
-    MainloopArguments mainloop{};
-    EpilogueArguments epilogue{};
-    KernelHardwareInfo hw_info{};
-    TileSchedulerArguments scheduler{};
-  };
-
-  // Kernel entry point API
-  struct Params {
-    GemmUniversalMode mode{};
-    ProblemShape problem_shape{};
-    MainloopParams mainloop{};
-    EpilogueParams epilogue{};
-    KernelHardwareInfo hw_info{};
-    TileSchedulerParams scheduler{};
-    void* workspace{nullptr};
-  };
-
-  //
-  // Methods
-  //
-
-  // Convert to underlying arguments. In this case, a simple copy for the aliased type.
-  static
-  Params
-  to_underlying_arguments(Arguments const& args, void* workspace) {
-    CUTLASS_TRACE_HOST("to_underlying_arguments():");
-
-    ProblemShape problem_shapes = args.problem_shape;
-
-    // Get SM count if needed, otherwise use user supplied SM count
-    int sm_count = args.hw_info.sm_count;
-    if (sm_count <= 0) {
-      CUTLASS_TRACE_HOST("  WARNING: Arguments do not include a valid SM count.\n"
-          "  For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count.");
-      sm_count = KernelHardwareInfo::query_device_multiprocessor_count(args.hw_info.device_id);
-    }
-
-    CUTLASS_TRACE_HOST("to_underlying_arguments(): Setting persistent grid SM count to " << sm_count);
-
-    KernelHardwareInfo hw_info{args.hw_info.device_id, sm_count};
-
-    // Calculate workspace pointers
-    uint8_t* workspace_ptr = reinterpret_cast<uint8_t*>(workspace);
-    size_t workspace_offset = 0;
-
-    void* scheduler_workspace = workspace_ptr;
-    workspace_offset += TileScheduler::template get_workspace_size<typename ProblemShape::UnderlyingProblemShape, ElementAccumulator>(
-      args.scheduler, typename ProblemShape::UnderlyingProblemShape{}, args.hw_info, NumMmaWarpGroups);
-    workspace_offset = round_nearest(workspace_offset,  MinWorkspaceAlignment);
-
-    void* epilogue_workspace = workspace_ptr + workspace_offset;
-    workspace_offset += CollectiveEpilogue::get_workspace_size(problem_shapes, args.epilogue, sm_count);
-    workspace_offset = round_nearest(workspace_offset,  MinWorkspaceAlignment);
-
-    void* mainloop_workspace = workspace_ptr + workspace_offset;
-    workspace_offset += CollectiveMainloop::get_workspace_size(problem_shapes, args.mainloop, sm_count);
-    workspace_offset = round_nearest(workspace_offset,  MinWorkspaceAlignment);
-
-    // Precompute the sub tiles numbers in epilogue, pass into tile scheduler.  Therefore it will be used
-    // in separate reduction scheme for streamk case, NumEpilogueSubTiles default value is 1, which means
-    // subtile will not be used, therefore separate reduction will not be enabled.
-    constexpr uint32_t NumEpilogueSubTiles = CollectiveEpilogue::get_store_pipe_increment(TileShape{});
-    TileSchedulerParams scheduler;
-    if constexpr (IsGroupedGemmKernel) {
-      scheduler = TileScheduler::to_underlying_arguments(
-      problem_shapes, TileShape{}, ClusterShape{}, hw_info, args.scheduler, scheduler_workspace, NumEpilogueSubTiles);
-    }
-    else {
-      scheduler = TileScheduler::to_underlying_arguments(
-      problem_shapes.get_host_problem_shape(), TileShape{}, ClusterShape{}, hw_info, args.scheduler, scheduler_workspace, NumEpilogueSubTiles);
-    }
-
-    return {
-      args.mode,
-      problem_shapes,
-      CollectiveMainloop::to_underlying_arguments(problem_shapes, args.mainloop, mainloop_workspace),
-      CollectiveEpilogue::to_underlying_arguments(problem_shapes, args.epilogue, epilogue_workspace),
-      hw_info,
-      scheduler,
-      workspace
-    };
-  }
-
-  static bool
-  can_implement(Arguments const& args) {
-    bool implementable = true;
-    if constexpr (IsGroupedGemmKernel) {
-      // Group GEMM currently only supports rank-3 problem shapes
-      implementable &= (args.mode == GemmUniversalMode::kGrouped && rank(typename ProblemShape::UnderlyingProblemShape{}) == 3);
-    } else {
-      implementable &= (args.mode == GemmUniversalMode::kArray && rank(typename ProblemShape::UnderlyingProblemShape{}) == 4);
-    }
-    if (!implementable) {
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Arguments or Problem Shape don't meet the requirements for Ptr Array Gemm or Grouped Gemm.\n");
-      return implementable;
-    }
-    implementable &= CollectiveMainloop::can_implement(args.problem_shape, args.mainloop);
-    implementable &= CollectiveEpilogue::can_implement(args.problem_shape, args.epilogue);
-    implementable &= TileScheduler::can_implement(args.scheduler);
-    return implementable;
-  }
-
-  static size_t
-  get_workspace_size(Arguments const& args) {
-    size_t workspace_size = 0;
-    constexpr uint32_t NumEpilogueSubTiles = CollectiveEpilogue::get_store_pipe_increment(TileShape{});
-
-    workspace_size += TileScheduler::template get_workspace_size<typename ProblemShape::UnderlyingProblemShape, ElementAccumulator>(
-      args.scheduler, typename ProblemShape::UnderlyingProblemShape{}, args.hw_info, NumMmaWarpGroups, NumEpilogueSubTiles);
-    workspace_size = round_nearest(workspace_size,  MinWorkspaceAlignment);
-
-    // Get SM count if needed, otherwise use user supplied SM count
-    int sm_count = args.hw_info.sm_count;
-    if (sm_count <= 0) {
-      CUTLASS_TRACE_HOST("  WARNING: Arguments do not include a valid SM count.\n"
-          "  For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count.");
-      sm_count = KernelHardwareInfo::query_device_multiprocessor_count(args.hw_info.device_id);
-    }
-
-    workspace_size += CollectiveEpilogue::get_workspace_size(args.problem_shape, args.epilogue, sm_count);
-    workspace_size = round_nearest(workspace_size,  MinWorkspaceAlignment);
-
-    workspace_size += CollectiveMainloop::get_workspace_size(args.problem_shape, args.mainloop, sm_count);
-    workspace_size = round_nearest(workspace_size,  MinWorkspaceAlignment);
-
-    return workspace_size;
-  }
-
-  static cutlass::Status
-  initialize_workspace(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-    Status status = Status::kSuccess;
-    uint8_t* workspace_ptr = reinterpret_cast<uint8_t*>(workspace);
-    size_t workspace_offset = 0;
-    constexpr uint32_t NumEpilogueSubTiles = CollectiveEpilogue::get_store_pipe_increment(TileShape{});
-    static constexpr uint32_t NumAccumulatorMtxs = 1;
-
-    status = TileScheduler::template initialize_workspace<typename ProblemShape::UnderlyingProblemShape, ElementAccumulator>(
-      args.scheduler, workspace_ptr + workspace_offset, stream, typename ProblemShape::UnderlyingProblemShape{}, args.hw_info, NumMmaWarpGroups, NumEpilogueSubTiles, NumAccumulatorMtxs, cuda_adapter);
-    workspace_offset += TileScheduler::template get_workspace_size<typename ProblemShape::UnderlyingProblemShape, ElementAccumulator>(
-      args.scheduler, typename ProblemShape::UnderlyingProblemShape{}, args.hw_info, NumMmaWarpGroups, NumEpilogueSubTiles);
-    workspace_offset = round_nearest(workspace_offset,  MinWorkspaceAlignment);
-    if (status != Status::kSuccess) {
-      return status;
-    }
-
-    status = CollectiveEpilogue::initialize_workspace(args.problem_shape, args.epilogue, workspace_ptr + workspace_offset, stream, cuda_adapter);
-    workspace_offset += CollectiveEpilogue::get_workspace_size(args.problem_shape, args.epilogue, args.hw_info.sm_count);
-    workspace_offset = round_nearest(workspace_offset,  MinWorkspaceAlignment);
-
-    status = CollectiveMainloop::initialize_workspace(args.problem_shape, args.mainloop, workspace_ptr + workspace_offset, stream, cuda_adapter);
-    workspace_offset += CollectiveMainloop::get_workspace_size(args.problem_shape, args.mainloop, args.hw_info.sm_count);
-    workspace_offset = round_nearest(workspace_offset,  MinWorkspaceAlignment);
-
-    if (status != Status::kSuccess) {
-      return status;
-    }
-
-    return status;
-  }
-
-  // Computes the kernel launch grid shape based on runtime parameters
-  static dim3
-  get_grid_shape(Params const& params) {
-    // Given device SM count, set grid size s.t. we do not launch more thread blocks than we can run concurrently
-    TileSchedulerArguments args{};
-    if constexpr (!std::is_const_v<decltype(args.max_swizzle_size)>) {
-      args.max_swizzle_size = 1 << params.scheduler.log_swizzle_size_;
-    }
-    args.raster_order = params.scheduler.raster_order_ == TileScheduler::RasterOrder::AlongN ? TileScheduler::RasterOrderOptions::AlongN : TileScheduler::RasterOrderOptions::AlongM;
-    dim3 grid_shape;
-    if constexpr (IsGroupedGemmKernel) {
-      grid_shape = TileScheduler::get_grid_shape(params.scheduler, params.problem_shape, TileShape{}, ClusterShape{}, params.hw_info, args);
-    }
-    else {
-      grid_shape = TileScheduler::get_grid_shape(params.scheduler, params.problem_shape.get_host_problem_shape(), TileShape{}, ClusterShape{}, params.hw_info, args);
-    }
-    return grid_shape;
-  }
-
-  static dim3
-  get_block_shape() {
-    return dim3(MaxThreadsPerBlock, 1, 1);
-  }
-
-  CUTLASS_DEVICE
-  void
-  operator()(Params const& params, char* smem_buf) {
-    using namespace cute;
-    using X = Underscore;
-
-// Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a.
-#if ! defined(__CUDA_ARCH_FEAT_SM90_ALL)
-    printf("ERROR : Arch conditional MMA instruction used without targeting sm90a compute capability. Aborting.\n");
-#else
-
-    // Preconditions
-    static_assert(size(TiledMma{}) == 128, "Pingpong kernel must have TiledMMA operating using 128 threads.");
-    static_assert(NumMmaWarpGroups == 2, "Pingpong kernels currently only support NumMmaWarpGroups == 2");
-
-    if constexpr (cutlass::epilogue::collective::detail::sm90_is_ptr_array_tma_dispatch_policy_v<typename CollectiveEpilogue::DispatchPolicy>) {
-      static_assert(NumMmaWarpGroups == CollectiveEpilogue::NumEpilogueWarpGroups,
-                    "Tiled MmA does not match expected warp groups performing the epilogue");
-    }
-
-    static_assert(cute::rank(InternalStrideA{}) == 3, "StrideA must be rank-3: [M, K, L]. If batch mode is not needed, set L stride to Int<0>.");
-    static_assert(cute::rank(InternalStrideB{}) == 3, "StrideB must be rank-3: [N, K, L]. If batch mode is not needed, set L stride to Int<0>.");
-    static_assert(cute::rank(InternalStrideC{}) == 3, "StrideC must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
-    static_assert(cute::rank(InternalStrideD{}) == 3, "StrideD must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
-
-    enum class WarpGroupRole {
-      Producer = 0,
-      Consumer0 = 1,
-      Consumer1 = 2
-    };
-    enum class ProducerWarpRole {
-      Mainloop = 0,
-      Warp1 = 1,
-      Epilogue = 2,
-      Warp3 = 3
-    };
-
-    // Kernel level shared memory storage
-    SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(smem_buf);
-
-    int thread_idx = int(threadIdx.x);
-    int lane_idx = canonical_lane_idx();
-    int warp_idx = canonical_warp_idx_sync();
-    int warp_idx_in_warp_group = warp_idx % NumWarpsPerWarpGroup;
-    int warp_group_thread_idx = thread_idx % NumThreadsPerWarpGroup;
-    int mma_thread_idx = thread_idx % size(TiledMma{});
-    auto warp_group_idx = canonical_warp_group_idx();
-    auto warp_group_role = WarpGroupRole(warp_group_idx);
-    auto producer_warp_role = ProducerWarpRole(warp_idx_in_warp_group);
-    int lane_predicate = cute::elect_one_sync();
-    uint32_t block_rank_in_cluster = cute::block_rank_in_cluster();
-
-    // Note: Tma Descriptor Prefetch (from either const or param) is not applicable here
-
-    // Mainloop Load pipeline
-    using MainloopPipeline = typename CollectiveMainloop::MainloopPipeline;
-    typename MainloopPipeline::Params mainloop_pipeline_params;
-    if (warp_group_role == WarpGroupRole::Producer && producer_warp_role == ProducerWarpRole::Mainloop) {
-      mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Producer;
-    }
-    if (warp_group_role == WarpGroupRole::Consumer0 || warp_group_role == WarpGroupRole::Consumer1) {
-      mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Consumer;
-    }
-    mainloop_pipeline_params.is_leader = warp_group_thread_idx == 0;
-    mainloop_pipeline_params.num_consumers = NumThreadsPerWarpGroup;
-    mainloop_pipeline_params.transaction_bytes = params.mainloop.tma_transaction_bytes;
-    MainloopPipeline mainloop_pipeline(shared_storage.pipelines.mainloop, mainloop_pipeline_params, ClusterShape{});
-
-    // Epilogue Load pipeline
-    using EpiLoadPipeline = typename CollectiveEpilogue::LoadPipeline;
-    typename EpiLoadPipeline::Params epi_load_pipeline_params;
-    if (warp_group_role == WarpGroupRole::Producer && producer_warp_role == ProducerWarpRole::Epilogue) {
-      epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Producer;
-    }
-    if (warp_group_role == WarpGroupRole::Consumer0 || warp_group_role == WarpGroupRole::Consumer1) {
-      epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Consumer;
-    }
-    epi_load_pipeline_params.dst_blockid = cute::block_rank_in_cluster();
-    epi_load_pipeline_params.producer_arv_count = NumThreadsPerWarp;
-    epi_load_pipeline_params.consumer_arv_count = NumThreadsPerWarpGroup;
-    if constexpr (CollectiveEpilogue::RequiresTransactionBytes) {
-      epi_load_pipeline_params.transaction_bytes = params.epilogue.tma_transaction_bytes;
-    }
-    EpiLoadPipeline epi_load_pipeline(shared_storage.pipelines.epi_load, epi_load_pipeline_params);
-
-    // Epilogue Store pipeline
-    using EpiStorePipeline = typename CollectiveEpilogue::StorePipeline;
-    typename EpiStorePipeline::Params epi_store_pipeline_params;
-    epi_store_pipeline_params.always_wait = true;
-    EpiStorePipeline epi_store_pipeline(epi_store_pipeline_params);
-
-    typename LoadWarpOrderBarrier::Params params_load_order_barrier;
-    params_load_order_barrier.group_id = producer_warp_role == ProducerWarpRole::Mainloop ? 0 : 1;
-    params_load_order_barrier.group_size = NumThreadsPerWarp;
-    LoadWarpOrderBarrier load_order_barrier(shared_storage.pipelines.load_order, params_load_order_barrier);
-
-    typename MathWarpGroupOrderBarrier::Params params_math_wg_order_barrier;
-    // DMA Load WG will not participate in these Ordered Barrier syncs
-    params_math_wg_order_barrier.group_id = warp_group_idx - static_cast<int>(WarpGroupRole::Consumer0);
-    params_math_wg_order_barrier.group_size = NumThreadsPerWarpGroup; // Number of threads / participants in a group
-    MathWarpGroupOrderBarrier math_wg_order_barrier(shared_storage.pipelines.math_wg_order, params_math_wg_order_barrier);
-
-    // Initialize starting pipeline states for the collectives
-    // Epilogue store pipe is producer-only (consumer is TMA unit, waits via scoreboarding)
-    typename CollectiveMainloop::PipelineState mainloop_pipe_consumer_state;
-    typename CollectiveEpilogue::LoadPipelineState epi_load_pipe_consumer_state;
-
-    // For the DMA Load (producer) we start with an opposite phase
-    // i.e., we skip all waits since we know that the buffer is indeed empty
-    PipelineState mainloop_pipe_producer_state = cutlass::make_producer_start_state<MainloopPipeline>();
-    PipelineState epi_load_pipe_producer_state = cutlass::make_producer_start_state<EpiLoadPipeline>();
-    PipelineState epi_store_pipe_producer_state = cutlass::make_producer_start_state<EpiStorePipeline>();
-
-    auto cluster_wait_fn = [] () {
-      // We need this to guarantee that the Pipeline init is visible
-      // To all producers and consumer thread blocks in the Cluster
-      if constexpr (size(ClusterShape{}) > 1) {
-        cute::cluster_arrive_relaxed();
-        return [] () { cute::cluster_wait(); };
-      }
-      else {
-        __syncthreads();
-        return [] () {}; // do nothing
-      }
-    } ();
-
-    // Get the appropriate blocks for this thread block -- potential for thread block locality
-    TiledMma tiled_mma;
-    const auto blk_shape = TileShape{};                                                                // (BLK_M,BLK_N,BLK_K)
-    const auto c_tile_count = CollectiveEpilogue::get_load_pipe_increment(blk_shape);
-    const auto d_tile_count = CollectiveEpilogue::get_store_pipe_increment(blk_shape);
-
-    TileScheduler scheduler{params.scheduler};
-
-    // In a warp specialized kernel, collectives expose data movement and compute operations separately
-    CollectiveMainloop collective_mainloop;
-    CollectiveEpilogue collective_epilogue(params.epilogue, shared_storage.tensors.epilogue);
-
-    // Wait for all thread blocks in the Cluster
-    cluster_wait_fn();
-
-    auto work_tile_info = scheduler.initial_work_tile_info(ClusterShape{});
-    if (not work_tile_info.is_valid()) {
-      // When problem shapes are only on device, the grid launched may be larger than the total number of blocks across groups
-      return;
-    }
-
-    // Optionally append 1s until problem shape is rank-4 in case it is only rank-3 (MNK)
-    auto problem_shape_MNKL = append<4>(params.problem_shape.get_problem_shape(work_tile_info.L_idx), 1);
-
-    if (warp_group_role == WarpGroupRole::Consumer1) {
-      // Advance 2nd Math WG to the next work tile for the startup
-      const auto k_tile_count = TileScheduler::get_work_k_tile_count(work_tile_info, problem_shape_MNKL, blk_shape);
-
-      auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work(work_tile_info);
-      work_tile_info = next_work_tile_info;
-      if (!work_tile_info.is_valid()) {
-        return;
-      }
-
-      // Advance 2nd Math WG pipeline states to the end of 1st Math WG
-      mainloop_pipe_consumer_state.advance(k_tile_count);
-      epi_load_pipe_consumer_state.advance(c_tile_count);
-      epi_store_pipe_producer_state.advance(d_tile_count);
-
-      problem_shape_MNKL = append<4>(params.problem_shape.get_problem_shape(work_tile_info.L_idx), 1);
-    }
-
-    // Prepare and partition the input tensors. Expects a tuple of tensors where:
-    // get<0>(load_inputs) is the tma tensor A after local tiling so that it has shape (BLK_M,BLK_K,m,k,l)
-    // get<1>(load_inputs) is the tma tensor B after local tiling so that it has shape (BLK_N,BLK_K,n,k,l)
-    auto load_inputs = collective_mainloop.load_init(problem_shape_MNKL, params.mainloop);
-    static_assert(cute::tuple_size_v<decltype(load_inputs)> >= 2, "Output of load_init must have at least two elements (A, B)");
-
-    // Extract out partitioned A and B.
-    Tensor gA_mkl = get<0>(load_inputs);
-    Tensor gB_nkl = get<1>(load_inputs);
-
-    // Get pipeline stage increments from tensor shapes
-    auto k_tile_count = size<3>(gA_mkl);
-
-    if (warp_group_role == WarpGroupRole::Producer) {
-      cutlass::arch::warpgroup_reg_dealloc<LoadRegisterRequirement>();
-
-      // Mainloop Producer Warp
-      if (producer_warp_role == ProducerWarpRole::Mainloop) {
-        int32_t curr_batch = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl)); // Usually just returns work_tile_info.L_idx;
-        int32_t const mock_l_coord = 0;
-        int32_t const sm_idx = blockIdx.x + (blockIdx.y * gridDim.x);
-        int32_t const sm_count = params.hw_info.sm_count;
-
-        // Fetch a copy of tensormaps for the CTA
-        auto input_tensormaps = collective_mainloop.tensormaps_init(params.mainloop, shared_storage.tensormaps.mainloop, sm_count, sm_idx);
-
-        // Update tensormap for the initial batch for the CTA
-        if (work_tile_info.is_valid()) {
-          collective_mainloop.tensormaps_perform_update(
-            shared_storage.tensormaps.mainloop,
-            params.mainloop,
-            input_tensormaps,
-            problem_shape_MNKL,
-            curr_batch
-          );
-          // Ensure warp is converged before issuing tensormap fence release
-          __syncwarp();
-          // Entire warp must do this (i.e. it's aligned)
-          collective_mainloop.tensormaps_cp_fence_release(shared_storage.tensormaps.mainloop, input_tensormaps);
-        }
-
-        bool do_load_order_arrive = true;
-        bool did_batch_change = true;
-        while (work_tile_info.is_valid()) {
-          if (!TileScheduler::valid_warpgroup_in_work_tile(work_tile_info)) {
-            auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work(work_tile_info);
-            work_tile_info = next_work_tile_info;
-            continue;
-          }
-
-          // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape
-          auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl));
-          auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl));
-          auto blk_coord = make_coord(m_coord, n_coord, _, mock_l_coord);
-
-          // Get the number of K tiles to compute for this work as well as the starting K tile offset of the work.
-          auto work_k_tile_count = TileScheduler::get_work_k_tile_count(work_tile_info, problem_shape_MNKL, blk_shape);
-          auto work_k_tile_start = TileScheduler::get_work_k_tile_start(work_tile_info);
-          auto k_tile_iter = cute::make_coord_iterator(idx2crd(work_k_tile_start, shape<3>(gA_mkl)), shape<3>(gA_mkl));
-
-          if (did_batch_change) {
-            collective_mainloop.tensormaps_fence_acquire(input_tensormaps);
-          }
-
-          collective_mainloop.load(
-            params.mainloop,
-            mainloop_pipeline,
-            mainloop_pipe_producer_state,
-            load_inputs,
-            input_tensormaps,
-            blk_coord,
-            k_tile_iter, work_k_tile_count,
-            lane_idx,
-            block_rank_in_cluster,
-            shared_storage.tensors.mainloop
-          );
-          // Update starting pipeline state for the next tile
-          // Wait for the last TMA stage to complete loading, before issuing tensormap updates
-          mainloop_pipe_producer_state.advance(work_k_tile_count - 1);
-
-          // Signal for the epilogue load warp to begin
-          if (do_load_order_arrive) {
-            load_order_barrier.arrive();
-            do_load_order_arrive = false;
-          }
-
-          // Get next work tile
-          auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work(work_tile_info);
-          work_tile_info = next_work_tile_info;
-          auto next_batch = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl)); // Usually just returns work_tile_info.L_idx
-          did_batch_change = next_batch != curr_batch;
-          if (work_tile_info.is_valid() && did_batch_change) {
-            curr_batch = next_batch;
-            if constexpr (IsGroupedGemmKernel) {
-              problem_shape_MNKL = append<4>(params.problem_shape.get_problem_shape(curr_batch), 1);
-            }
-            // Purpose of this pipeline state is to make sure TMA loads have finished before doing descriptor updates
-            // Since this state is waiting for loads to finish, it must start in the inverted phase.
-            typename CollectiveMainloop::PipelineState mainloop_pipe_tma_consumer_state =
-              {mainloop_pipe_producer_state.index(), !mainloop_pipe_producer_state.phase(), mainloop_pipe_producer_state.count()};
-            mainloop_pipeline.consumer_wait(mainloop_pipe_tma_consumer_state);
-            collective_mainloop.tensormaps_perform_update(
-              shared_storage.tensormaps.mainloop,
-              params.mainloop,
-              input_tensormaps,
-              problem_shape_MNKL,
-              curr_batch
-            );
-            // Ensure warp is converged before issuing tensor replace
-            __syncwarp();
-            // Entire warp must do this (i.e. it's aligned)
-            collective_mainloop.tensormaps_cp_fence_release(shared_storage.tensormaps.mainloop, input_tensormaps);
-          }
-          // Advance the producer state for the last remaining stage that was being waited for above
-          mainloop_pipe_producer_state.advance(1);
-        } // Scheduler work fetch loop
-
-        // Make sure all Consumer Warp Groups have been waited upon
-        collective_mainloop.load_tail(mainloop_pipeline, mainloop_pipe_producer_state);
-      } // Mainloop Producer Warp End
-
-      // Epilogue Producer Warp
-      else if (producer_warp_role == ProducerWarpRole::Epilogue && collective_epilogue.is_producer_load_needed()) {
-        int32_t const sm_idx = blockIdx.x + (blockIdx.y * gridDim.x);
-        int32_t const sm_count = params.hw_info.sm_count;
-
-        auto epi_load_tensormap = get<0>(collective_epilogue.load_init(params.epilogue, shared_storage.tensormaps.epilogue, sm_count, sm_idx));
-
-        bool did_batch_change = true;
-        constexpr bool IsEpiLoad = true;
-
-        if (work_tile_info.is_valid()) {
-          collective_epilogue.tensormaps_perform_update<IsEpiLoad>(
-            shared_storage.tensormaps.epilogue,
-            params.epilogue,
-            epi_load_tensormap,
-            problem_shape_MNKL,
-            work_tile_info.L_idx,
-            0
-          );
-
-          // Converge before issuing tensormap fence release since fence is aligned
-          __syncwarp();
-          collective_epilogue.tensormaps_cp_fence_release<IsEpiLoad>(shared_storage.tensormaps.epilogue, epi_load_tensormap, 0);
-        }
-
-        load_order_barrier.wait();
-
-        while (work_tile_info.is_valid()) {
-          int32_t curr_batch = work_tile_info.L_idx;
-
-          // Get next work tile
-          auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work(work_tile_info);
-
-          if (TileScheduler::compute_epilogue(work_tile_info, params.scheduler)) {
-            if constexpr (IsGroupedGemmKernel) {
-              problem_shape_MNKL = append<4>(params.problem_shape.get_problem_shape(work_tile_info.L_idx), 1);
-            }
-
-            // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape
-            auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl));
-            auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl));
-            auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl));
-            auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
-
-            if (did_batch_change) {
-              collective_epilogue.tensormaps_fence_acquire<IsEpiLoad>(epi_load_tensormap);
-            }
-
-            bool wait = work_tile_info.is_valid() && curr_batch != next_work_tile_info.L_idx;
-
-            epi_load_pipe_producer_state = collective_epilogue.load(
-              epi_load_pipeline,
-              epi_load_pipe_producer_state,
-              problem_shape_MNKL,
-              blk_shape,
-              blk_coord,
-              tiled_mma,
-              lane_idx,
-              shared_storage.tensors.epilogue,
-              epi_load_tensormap,
-              work_tile_info.reduction_subtile_idx(),
-              wait
-            );
-          }
-
-          work_tile_info = next_work_tile_info;
-          did_batch_change = curr_batch != work_tile_info.L_idx;
-
-          if (work_tile_info.is_valid() && did_batch_change) {
-            if constexpr (IsGroupedGemmKernel) {
-              problem_shape_MNKL = append<4>(params.problem_shape.get_problem_shape(work_tile_info.L_idx), 1);
-            }
-
-            // tensormap update
-            {
-              collective_epilogue.tensormaps_perform_update<IsEpiLoad>(
-                shared_storage.tensormaps.epilogue,
-                params.epilogue,
-                epi_load_tensormap,
-                problem_shape_MNKL,
-                work_tile_info.L_idx,
-                0
-              );
-
-              // Converge before issuing tensormap fence release since fence is aligned
-              __syncwarp();
-              collective_epilogue.tensormaps_cp_fence_release<IsEpiLoad>(shared_storage.tensormaps.epilogue, epi_load_tensormap, 0);
-            }
-          }
-
-        } // Scheduler work fetch loop
-
-        // Make sure all Consumer Warp Groups have been waited upon
-        collective_epilogue.load_tail(epi_load_pipeline, epi_load_pipe_producer_state);
-      } // Epilogue Producer Warp End
-    } // Producer Warp Group End
-
-    else if (warp_group_role == WarpGroupRole::Consumer0 || warp_group_role == WarpGroupRole::Consumer1) {
-      cutlass::arch::warpgroup_reg_alloc<MmaRegisterRequirement>();
-
-      // Index of warp group within consumer warp groups
-      int consumer_warp_group_idx = warp_group_role == WarpGroupRole::Consumer0 ? 0 : 1;
-
-      int32_t const sm_idx = blockIdx.x + (blockIdx.y * gridDim.x);
-      int32_t const sm_count = params.hw_info.sm_count;
-      // Do we potentially issue tail arrives for TMA stores, if epilogue load is waiting for it
-      bool do_store_tail = false;
-      // Get a copy of tensormaps
-      auto epi_store_tensormap = get<0>(collective_epilogue.store_init(params.epilogue, shared_storage.tensormaps.epilogue, sm_count, sm_idx, consumer_warp_group_idx));
-
-      bool did_batch_change = true;
-      constexpr bool IsEpiLoad = false;
-
-      if (work_tile_info.is_valid()) {
-
-        if (warp_idx_in_warp_group == 0) {
-          collective_epilogue.tensormaps_perform_update<IsEpiLoad>(
-            shared_storage.tensormaps.epilogue,
-            params.epilogue,
-            epi_store_tensormap,
-            problem_shape_MNKL,
-            work_tile_info.L_idx,
-            consumer_warp_group_idx
-          );
-
-          // Converge before issuing tensormap fence release since fence is aligned
-          __syncwarp();
-          collective_epilogue.tensormaps_cp_fence_release<IsEpiLoad>(shared_storage.tensormaps.epilogue,
-                                                                     epi_store_tensormap,
-                                                                     consumer_warp_group_idx);
-        }
-      }
-
-      while (work_tile_info.is_valid()) {
-        if constexpr (IsGroupedGemmKernel) {
-          problem_shape_MNKL = append<4>(params.problem_shape.get_problem_shape(work_tile_info.L_idx), 1);
-        }
-
-        int32_t curr_batch = work_tile_info.L_idx;
-
-        // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape
-        auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl));
-        auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl));
-        auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl));
-        auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
-        auto work_k_tile_count = TileScheduler::get_work_k_tile_count(work_tile_info, problem_shape_MNKL, blk_shape);
-
-        // Allocate the accumulators for the (M,N) blk_shape
-        //
-        // MSVC CTAD breaks if we say "Tensor" here, so we use "auto" instead.
-        auto accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape));               // (MMA,MMA_M,MMA_N)
-
-        static_assert(cute::is_any_of_v<TileScheduler,
-            detail::PersistentTileSchedulerSm90Group<ProblemShape>,
-            detail::PersistentTileSchedulerSm90>);
-        if (TileScheduler::valid_warpgroup_in_work_tile(work_tile_info)) {
-
-          math_wg_order_barrier.wait();
-
-          collective_mainloop.mma(
-            mainloop_pipeline,
-            mainloop_pipe_consumer_state,
-            accumulators,
-            work_k_tile_count,
-            mma_thread_idx,
-            shared_storage.tensors.mainloop,
-            params.mainloop
-          );
-
-          math_wg_order_barrier.arrive();
-
-          // Make sure the math instructions are done and free buffers before entering the epilogue
-          collective_mainloop.mma_tail(
-            mainloop_pipeline,
-            mainloop_pipe_consumer_state,
-            work_k_tile_count
-          );
-
-           math_wg_order_barrier.wait();
-
-          // Update starting mainloop pipeline state for the next tile
-          mainloop_pipe_consumer_state.advance(work_k_tile_count);
-        }
-
-        // Perform reduction across splits, if needed
-        TileScheduler::fixup(
-          params.scheduler, work_tile_info, accumulators, NumMmaWarpGroups, consumer_warp_group_idx);
-
-        if (did_batch_change) {
-          collective_epilogue.tensormaps_fence_acquire<IsEpiLoad>(epi_store_tensormap);
-        }
-
-        if (TileScheduler::compute_epilogue(work_tile_info, params.scheduler)) {
-
-          // Epilogue and write to gD
-          auto [epi_load_pipe_consumer_state_next, epi_store_pipe_producer_state_next] =
-          collective_epilogue.store(
-            epi_load_pipeline,
-            epi_load_pipe_consumer_state,
-            epi_store_pipeline,
-            epi_store_pipe_producer_state,
-            problem_shape_MNKL,
-            blk_shape,
-            blk_coord,
-            accumulators,
-            tiled_mma,
-            mma_thread_idx,
-            shared_storage.tensors.epilogue,
-            epi_store_tensormap,
-            work_tile_info.reduction_subtile_idx()
-          );
-
-          epi_load_pipe_consumer_state = epi_load_pipe_consumer_state_next;
-          epi_store_pipe_producer_state = epi_store_pipe_producer_state_next;
-          do_store_tail = true;
-        }
-
-        // Get next work tile
-        auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work(work_tile_info);
-        work_tile_info = next_work_tile_info;
-
-        // Skip a tile for pingpong
-        if (work_tile_info.is_valid()) {
-          if constexpr (IsGroupedGemmKernel) {
-            problem_shape_MNKL = append<4>(params.problem_shape.get_problem_shape(work_tile_info.L_idx), 1);
-          }
-          work_k_tile_count = TileScheduler::get_work_k_tile_count(work_tile_info, problem_shape_MNKL, blk_shape);
-          mainloop_pipe_consumer_state.advance(work_k_tile_count);
-
-          // Go to next tile
-          auto [next_next_work_tile_info, next_increment_pipe] = scheduler.fetch_next_work(work_tile_info);
-
-          work_tile_info = next_next_work_tile_info;
-          increment_pipe = next_increment_pipe;
-        }
-
-        did_batch_change = curr_batch != work_tile_info.L_idx;
-        if (work_tile_info.is_valid() && did_batch_change) {
-          if constexpr (IsGroupedGemmKernel) {
-            problem_shape_MNKL = append<4>(params.problem_shape.get_problem_shape(work_tile_info.L_idx), 1);
-          }
-          if (warp_idx_in_warp_group == 0) {
-            collective_epilogue.tensormaps_perform_update<IsEpiLoad>(
-              shared_storage.tensormaps.epilogue,
-              params.epilogue,
-              epi_store_tensormap,
-              problem_shape_MNKL,
-              work_tile_info.L_idx,
-              consumer_warp_group_idx
-            );
-
-            // Converge before issuing tensormap fence release since fence is aligned
-            __syncwarp();
-            collective_epilogue.tensormaps_cp_fence_release<IsEpiLoad>(shared_storage.tensormaps.epilogue,
-                                                                       epi_store_tensormap,
-                                                                       consumer_warp_group_idx);
-          }
-        }
-
-        // TMA store pipeline wait is only visible to TMA-issuing warp, so for multiple-consumer kernels
-        // we need to wait for all TMA stores to complete before issuing consumer order barrier arrives
-        // to ensure next math consumer doesn't overwrite smem of in-flight TMA stores of current consumer.
-        auto [epi_load_pipe_consumer_state_next_, epi_store_pipe_producer_state_next_] =
-        collective_epilogue.store_tail(
-          epi_load_pipeline,
-          epi_load_pipe_consumer_state,
-          epi_store_pipeline,
-          epi_store_pipe_producer_state
-        );
-
-        // Update starting load/store pipeline states for the next tile
-        // state has already been incremented by 1 tile in collective calls, advance once again for ping pong
-        epi_load_pipe_consumer_state = epi_load_pipe_consumer_state_next_;
-        epi_store_pipe_producer_state = epi_store_pipe_producer_state_next_;
-        epi_load_pipe_consumer_state.advance(c_tile_count);
-        epi_store_pipe_producer_state.advance(d_tile_count);
-
-        // Cue for next Math WG's Epilogue to start
-        math_wg_order_barrier.arrive();
-
-      } // Scheduler work fetch loop
-    } // Consumer Warp Groups End
-#endif
-  }
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::gemm::kernel
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma.hpp
deleted file mode 100755
index c7245457e..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma.hpp
+++ /dev/null
@@ -1,306 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/kernel_hardware_info.hpp"
-#include "cute/arch/cluster_sm90.hpp"
-#include "cutlass/arch/mma_sm90.h"
-#include "cutlass/epilogue/collective/detail.hpp"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/dispatch_policy.hpp"
-#include "cutlass/gemm/kernel/gemm_universal_decl.h"
-#include "cutlass/gemm/kernel/sm90_tile_scheduler.hpp"
-#include "cutlass/gemm/kernel/tile_scheduler.hpp"
-#include "cutlass/trace.h"
-#include "cute/tensor.hpp"
-
-///////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm::kernel {
-
-///////////////////////////////////////////////////////////////////////////////
-
-template <
-  class ProblemShape_,
-  class CollectiveMainloop_,
-  class CollectiveEpilogue_,
-  class TileScheduler_
->
-class GemmUniversal<
-  ProblemShape_,
-  CollectiveMainloop_,
-  CollectiveEpilogue_,
-  TileScheduler_,
-  cute::enable_if_t<cute::is_base_of_v<KernelTma, typename CollectiveMainloop_::DispatchPolicy::Schedule>>>
-{
-public:
-  //
-  // Type Aliases
-  //
-  using ProblemShape = ProblemShape_;
-  static_assert(cute::rank(ProblemShape{}) == 3 or cute::rank(ProblemShape{}) == 4,
-    "ProblemShape{} should be <M,N,K> or <M,N,K,L>");
-  static constexpr bool IsGdcEnabled = false;
-
-  // Mainloop derived types
-  using CollectiveMainloop = CollectiveMainloop_;
-  using TileShape = typename CollectiveMainloop::TileShape;
-  using TiledMma  = typename CollectiveMainloop::TiledMma;
-  using ArchTag   = typename CollectiveMainloop::ArchTag;
-  using ElementA  = typename CollectiveMainloop::ElementA;
-  using StrideA   = typename CollectiveMainloop::StrideA;
-  using ElementB  = typename CollectiveMainloop::ElementB;
-  using StrideB   = typename CollectiveMainloop::StrideB;
-  using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy;
-  using ElementAccumulator = typename CollectiveMainloop::ElementAccumulator;
-  using ClusterShape = typename DispatchPolicy::ClusterShape;
-  using MainloopArguments = typename CollectiveMainloop::Arguments;
-  using MainloopParams = typename CollectiveMainloop::Params;
-  static_assert(ArchTag::kMinComputeCapability >= 90);
-
-  // Epilogue derived types
-  using CollectiveEpilogue = CollectiveEpilogue_;
-  using ElementC = typename CollectiveEpilogue::ElementC;
-  using StrideC  = typename CollectiveEpilogue::StrideC;
-  using ElementD = typename CollectiveEpilogue::ElementD;
-  using StrideD  = typename CollectiveEpilogue::StrideD;
-  using EpilogueArguments = typename CollectiveEpilogue::Arguments;
-  using EpilogueParams = typename CollectiveEpilogue::Params;
-  static_assert(cute::is_same_v<ElementAccumulator, typename CollectiveEpilogue::ElementAccumulator>,
-    "Mainloop and epilogue do not agree on accumulator value type.");
-
-  static_assert(cute::is_void_v<TileScheduler_> or cute::is_same_v<TileScheduler_, PersistentScheduler>,
-    "TMA kernel does not support specializing the tile scheduler.");
-  using TileSchedulerTag = TileScheduler_;
-  using TileScheduler = typename detail::TileSchedulerSelector<
-    TileScheduler_, ArchTag, TileShape, ClusterShape>::Scheduler;
-  using TileSchedulerArguments = typename TileScheduler::Arguments;
-
-  static constexpr int SharedStorageSize = static_cast<int>(cute::max(
-      sizeof(typename CollectiveMainloop::SharedStorage),
-      sizeof(typename CollectiveEpilogue::SharedStorage)));
-
-  static constexpr uint32_t MaxThreadsPerBlock = CollectiveMainloop::ThreadCount;
-
-  static constexpr uint32_t MinBlocksPerMultiprocessor = 1;
-
-  // Device side arguments
-  struct Arguments {
-    GemmUniversalMode mode{};
-    ProblemShape problem_shape{};
-    MainloopArguments mainloop{};
-    EpilogueArguments epilogue{};
-    KernelHardwareInfo hw_info{};
-    TileSchedulerArguments scheduler{};
-  };
-
-  // Kernel entry point API
-  struct Params {
-    GemmUniversalMode mode{};
-    ProblemShape problem_shape{};
-    MainloopParams mainloop{};
-    EpilogueParams epilogue{};
-  };
-
-  //
-  // Methods
-  //
-
-  // Convert to underlying arguments. In this case, a simple copy for the aliased type.
-  static
-  Params
-  to_underlying_arguments(Arguments const& args, void* workspace) {
-    (void) workspace;
-    auto problem_shape = args.problem_shape;
-    if constexpr (detail::Has_SwapAB_v<CollectiveMainloop>) {
-      // swap M/N
-      get<0>(problem_shape) = get<1>(args.problem_shape);
-      get<1>(problem_shape) = get<0>(args.problem_shape);
-    }
-    return {
-      args.mode,
-      problem_shape,
-      CollectiveMainloop::to_underlying_arguments(args.problem_shape, args.mainloop, workspace),
-      CollectiveEpilogue::to_underlying_arguments(args.problem_shape, args.epilogue, workspace)
-    };
-  }
-
-  static bool
-  can_implement(Arguments const& args) {
-    bool implementable = (args.mode == GemmUniversalMode::kGemm) or
-        (args.mode == GemmUniversalMode::kBatched && cute::rank(ProblemShape{}) == 4);
-    if (!implementable) {
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Arguments or Problem Shape don't meet the requirements.\n");
-      return implementable;
-    }
-    implementable &= CollectiveMainloop::can_implement(args.problem_shape, args.mainloop);
-    implementable &= CollectiveEpilogue::can_implement(args.problem_shape, args.epilogue);
-    implementable &= TileScheduler::can_implement(args.scheduler);
-
-    return implementable;
-  }
-
-  static size_t
-  get_workspace_size(Arguments const& args) {
-    return 0;
-  }
-
-  static cutlass::Status
-  initialize_workspace(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-    return Status::kSuccess;
-  }
-
-  // Computes the kernel launch grid shape based on runtime parameters
-  static dim3
-  get_grid_shape(Params const& params) {
-    auto cluster_shape = ClusterShape{};
-    auto tile_shape = TileShape{};
-    auto problem_shape_MNKL = append<4>(params.problem_shape, Int<1>{});
-    return TileScheduler::get_tiled_cta_shape_mnl(
-        problem_shape_MNKL, tile_shape, cluster_shape);
-  }
-
-  static dim3
-  get_block_shape() {
-    return dim3(MaxThreadsPerBlock, 1, 1);
-  }
-
-  CUTLASS_DEVICE
-  void
-  operator()(Params const& params, char* smem_buf) {
-    using namespace cute;
-    using X = Underscore;
-
-// Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a.
-#if ! defined(__CUDA_ARCH_FEAT_SM90_ALL)
-    printf("ERROR : Arch conditional MMA instruction used without targeting sm90a compute capability. Aborting.\n");
-#else
-
-    // Preconditions
-    static_assert(cute::rank(StrideA{}) == 3, "StrideA must be rank-3: [M, K, L]. If batch mode is not needed, set L stride to Int<0>.");
-    static_assert(cute::rank(StrideB{}) == 3, "StrideB must be rank-3: [N, K, L]. If batch mode is not needed, set L stride to Int<0>.");
-    static_assert(cute::rank(StrideC{}) == 3, "StrideC must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
-    static_assert(cute::rank(StrideD{}) == 3, "StrideD must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
-
-    int thread_idx = int(threadIdx.x);
-    int warp_idx   = canonical_warp_idx_sync();
-    int lane_predicate = cute::elect_one_sync();
-    uint32_t block_rank_in_cluster = cute::block_rank_in_cluster();
-
-    // Issue Tma Descriptor Prefetch from a single thread
-    if ((warp_idx == 0) && lane_predicate) {
-      CollectiveMainloop::prefetch_tma_descriptors(params.mainloop);
-    }
-
-    // Separate out problem shape for convenience
-    // Optionally append 1s until problem shape is rank-4 in case its is only rank-3 (MNK)
-    auto problem_shape_MNKL = append<4>(params.problem_shape, Int<1>{});
-    auto M = get<0>(problem_shape_MNKL);
-    auto N = get<1>(problem_shape_MNKL);
-    auto K = get<2>(problem_shape_MNKL);
-    auto L = get<3>(problem_shape_MNKL);
-
-    // TMA requires special handling of strides to deal with coord codomain mapping
-    // Represent the full tensors -- get these from TMA
-    Tensor mA_mkl = params.mainloop.tma_load_a.get_tma_tensor(make_shape(M,K,L));                            // (m,k,l)
-    Tensor mB_nkl = params.mainloop.tma_load_b.get_tma_tensor(make_shape(N,K,L));                            // (n,k,l)
-
-    // Get the appropriate blocks for this thread block -- potential for thread block locality
-    auto blk_shape = TileShape{};                                                                // (BLK_M,BLK_N,BLK_K)
-    auto blk_coord = make_coord(_,_,_);                                                   // (m,n,k) -- defer the slice
-
-    // Make tiled views
-    Tensor gA_mkl = local_tile(mA_mkl, blk_shape, blk_coord, Step<_1, X,_1>{});                  // (BLK_M,BLK_K,m,k,l)
-    Tensor gB_nkl = local_tile(mB_nkl, blk_shape, blk_coord, Step< X,_1,_1>{});                  // (BLK_N,BLK_K,n,k,l)
-
-    // Compute m_coord, n_coord, and l_coord with their post-tiled shapes
-    auto m_coord = idx2crd(int(blockIdx.x), shape<2>(gA_mkl));
-    auto n_coord = idx2crd(int(blockIdx.y), shape<2>(gB_nkl));
-    auto l_coord = idx2crd(int(blockIdx.z), shape<4>(gB_nkl));
-    auto output_tile_coord = make_coord(m_coord, n_coord, _, l_coord);
-
-    // Slice with m_coord and n_coord
-    Tensor gA = gA_mkl(_,_,m_coord,_,l_coord);                                                       // (BLK_M,BLK_K,k)
-    Tensor gB = gB_nkl(_,_,n_coord,_,l_coord);                                                       // (BLK_N,BLK_K,k)
-
-    // Allocate the tiled_mma and the accumulators for the (M,N) blk_shape
-    TiledMma tiled_mma;
-    Tensor accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape));                   // (MMA,MMA_M,MMA_N)
-
-    auto k_tile_iter  = cute::make_coord_iterator(shape<2>(gA));
-    auto k_tile_count = size<2>(gA);
-
-    // Perform the collective scoped MMA
-    CollectiveMainloop collective_mma;
-    collective_mma(
-      gA, params.mainloop.tma_load_a,
-      gB, params.mainloop.tma_load_b,
-      accumulators,
-      k_tile_iter, k_tile_count,
-      thread_idx,
-      block_rank_in_cluster,
-      smem_buf,
-      params.mainloop
-    );
-
-    constexpr int BLK_M_RANK = cute::rank<0>(blk_shape);
-    auto m_max_coord = unwrap(cute::transform(make_seq<BLK_M_RANK>{}, [&](auto i) {
-        return  get<i>(M) - get<0,i>(blk_shape) * get<i>(m_coord);
-      }));
-
-    constexpr int BLK_N_RANK = cute::rank<1>(blk_shape);
-    auto n_max_coord = unwrap(cute::transform(make_seq<BLK_N_RANK>{}, [&](auto i) {
-        return  get<i>(N) - get<1,i>(blk_shape) * get<i>(n_coord);
-      }));
-    auto residue_mnk = make_tuple(m_max_coord, n_max_coord, Int<0>{});
-
-    // Epilogue and write to gD
-    CollectiveEpilogue epilogue{params.epilogue};
-    epilogue(
-      problem_shape_MNKL,
-      blk_shape,
-      output_tile_coord,
-      accumulators,
-      tiled_mma,
-      residue_mnk,
-      thread_idx,
-      smem_buf
-    );
-#endif
-  }
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::gemm::kernel
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp
deleted file mode 100755
index b278f96e9..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp
+++ /dev/null
@@ -1,522 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/kernel_hardware_info.hpp"
-#include "cutlass/arch/reg_reconfig.h"
-#include "cutlass/arch/mma_sm90.h"
-#include "cutlass/epilogue/collective/detail.hpp"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/dispatch_policy.hpp"
-#include "cutlass/gemm/kernel/sm90_tile_scheduler.hpp"
-#include "cutlass/pipeline/pipeline.hpp"
-#include "cutlass/trace.h"
-
-#include "cutlass/conv/detail.hpp"
-
-#include "cute/tensor.hpp"
-#include "cute/arch/cluster_sm90.hpp"
-
-#include "cutlass/arch/grid_dependency_control.h"
-
-
-///////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm::kernel {
-
-///////////////////////////////////////////////////////////////////////////////
-
-template <
-  class ProblemShape_,
-  class CollectiveMainloop_,
-  class CollectiveEpilogue_,
-  class TileScheduler_
->
-class GemmUniversal<
-  ProblemShape_,
-  CollectiveMainloop_,
-  CollectiveEpilogue_,
-  TileScheduler_,
-  cute::enable_if_t<cute::is_base_of_v<cutlass::gemm::KernelTmaWarpSpecialized, typename CollectiveMainloop_::DispatchPolicy::Schedule>>
->
-{
-public:
-  //
-  // Type Aliases
-  //
-  using ProblemShape = ProblemShape_;
-
-  // Handles the static_assert placed inside the operator()
-  // This is also used to decide whether the load_init inside collective mainloop returns rank 4 tensors or rank 5 tensors
-  static constexpr bool IsConvProblemShape = not (cute::is_tuple_v<ProblemShape>|| IsCutlass3ArrayKernel<ProblemShape>::value);
-  static_assert( IsConvProblemShape || (cute::rank(ProblemShape{}) == 3 || cute::rank(ProblemShape{}) == 4), "ProblemShape{} should be <M,N,K> or <M,N,K,L> for Gemm");
-
-  static constexpr bool IsGdcEnabled = cutlass::arch::IsGdcGloballyEnabled;
-
-  // Mainloop derived types
-  using CollectiveMainloop = CollectiveMainloop_;
-  using TileShape = typename CollectiveMainloop::TileShape;
-  using TiledMma  = typename CollectiveMainloop::TiledMma;
-  using ArchTag   = typename CollectiveMainloop::ArchTag;
-  using ElementA  = typename CollectiveMainloop::ElementA;
-  using StrideA   = typename CollectiveMainloop::StrideA;
-  using ElementB  = typename CollectiveMainloop::ElementB;
-  using StrideB   = typename CollectiveMainloop::StrideB;
-  using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy;
-  using ElementAccumulator = typename CollectiveMainloop::ElementAccumulator;
-  using ClusterShape = typename DispatchPolicy::ClusterShape;
-  using MainloopArguments = typename CollectiveMainloop::Arguments;
-  using MainloopParams = typename CollectiveMainloop::Params;
-  static_assert(ArchTag::kMinComputeCapability >= 90);
-
-  // Epilogue derived types
-  using CollectiveEpilogue = CollectiveEpilogue_;
-  using ElementC = typename CollectiveEpilogue::ElementC;
-  using StrideC  = typename CollectiveEpilogue::StrideC;
-  using ElementD = typename CollectiveEpilogue::ElementD;
-  using StrideD  = typename CollectiveEpilogue::StrideD;
-  using EpilogueArguments = typename CollectiveEpilogue::Arguments;
-  using EpilogueParams = typename CollectiveEpilogue::Params;
-
-  static_assert(cute::is_void_v<TileScheduler_> or cute::is_same_v<TileScheduler_, PersistentScheduler>,
-    "TMA warp-specialized kernel does not support specializing the tile scheduler.");
-  using TileSchedulerTag = TileScheduler_;
-  using TileScheduler = typename detail::TileSchedulerSelector<
-    TileSchedulerTag, ArchTag, TileShape, ClusterShape>::Scheduler;
-
-  using TileSchedulerArguments = typename TileScheduler::Arguments;
-
-  // Kernel level shared memory storage
-  struct SharedStorage {
-    // Mainloop and epilogue don't use smem concurrently since kernel is non-persistent, so we can use a union
-    union TensorStorage {
-      using MainloopTensorStorage = typename CollectiveMainloop::TensorStorage;
-      using EpilogueTensorStorage = typename CollectiveEpilogue::TensorStorage;
-
-      MainloopTensorStorage mainloop;
-      EpilogueTensorStorage epilogue;
-    } tensors;
-
-    struct PipelineStorage : cute::aligned_struct<16, _1> {
-      using MainloopPipelineStorage = typename CollectiveMainloop::PipelineStorage;
-      using EpiLoadPipelineStorage = typename CollectiveEpilogue::PipelineStorage;
-
-      alignas(16) MainloopPipelineStorage mainloop;
-      alignas(16) EpiLoadPipelineStorage epi_load;
-    } pipelines;
-  };
-
-  static constexpr int SharedStorageSize = sizeof(SharedStorage);
-  static constexpr uint32_t NumLoadWarpGroups = 1;
-  static constexpr uint32_t NumMmaWarpGroups = 1;
-  static constexpr uint32_t MaxThreadsPerBlock = CUTE_STATIC_V(size(TiledMma{})) + (NumLoadWarpGroups * NumThreadsPerWarpGroup);
-  static constexpr uint32_t MinBlocksPerMultiprocessor = 1;
-
-  // Device side arguments
-  struct Arguments {
-    cutlass::gemm::GemmUniversalMode mode{}; //maintained here for backward compatibility
-    ProblemShape problem_shape{};
-    MainloopArguments mainloop{};
-    EpilogueArguments epilogue{};
-    KernelHardwareInfo hw_info{};
-    TileSchedulerArguments scheduler{};
-
-    // Default constructor
-    Arguments() = default;
-
-    // Constructor with specified mode 
-    // It is used for Gemm
-    Arguments(
-        cutlass::gemm::GemmUniversalMode mode_,
-        ProblemShape problem_shape_,
-        MainloopArguments mainloop_,
-        EpilogueArguments epilogue_,
-        KernelHardwareInfo hw_info_ = KernelHardwareInfo(),
-        TileSchedulerArguments scheduler_ = TileSchedulerArguments())
-    : mode(mode_)
-      , problem_shape(problem_shape_)
-      , mainloop(mainloop_)
-      , epilogue(epilogue_)
-      , hw_info(hw_info_)
-      , scheduler(scheduler_) {}
-
-    // Constructor with default value for 'mode'
-    // This allows us to set GemmUniversal mode as kGemm for Conv right away
-    // while keeping the testbeds unchanged
-    Arguments(
-        ProblemShape problem_shape_,
-        MainloopArguments mainloop_,
-        EpilogueArguments epilogue_,
-        KernelHardwareInfo hw_info_ = KernelHardwareInfo(),
-        TileSchedulerArguments scheduler_ = TileSchedulerArguments())
-    : mode(cutlass::gemm::GemmUniversalMode::kGemm) // Default mode
-      , problem_shape(problem_shape_)
-      , mainloop(mainloop_)
-      , epilogue(epilogue_)
-      , hw_info(hw_info_)
-      , scheduler(scheduler_) {}
-
-  };
-
-  // Kernel entry point API
-  struct Params {
-    using ProblemShapeMNKL = decltype(cutlass::conv::detail::get_problem_shape_MNKL_helper<CollectiveMainloop>(ProblemShape{}, cute::conditional_t<IsConvProblemShape, cute::true_type, cute::false_type>{}));
-    ProblemShapeMNKL problem_shape{};
-    MainloopParams mainloop{};
-    EpilogueParams epilogue{};
-  };
-
-  //
-  // Methods
-  //
-
-  // Convert to underlying arguments. In this case, a simple copy for the aliased type.
-  static Params
-  to_underlying_arguments(Arguments const& args, void* workspace) {
-
-    (void) workspace;
-    auto problem_shape_mnkl = cutlass::conv::detail::get_problem_shape_MNKL_helper<CollectiveMainloop>(args.problem_shape, cute::conditional_t<IsConvProblemShape, cute::true_type, cute::false_type>{});
-    auto transformed_problem_shape = cutlass::conv::detail::get_transformed_problem_shape_MNKL(args.problem_shape);
-
-    auto swapped_problem_shape = problem_shape_mnkl;
-    if constexpr (detail::Has_SwapAB_v<CollectiveMainloop>) {
-      // swap M/N
-      get<0>(swapped_problem_shape) = get<1>(problem_shape_mnkl);
-      get<1>(swapped_problem_shape) = get<0>(problem_shape_mnkl);
-    }
-    return {
-      swapped_problem_shape,
-      CollectiveMainloop::to_underlying_arguments(args.problem_shape, args.mainloop, workspace),
-      CollectiveEpilogue::to_underlying_arguments(transformed_problem_shape, args.epilogue, workspace)
-    };
-  }
-
-  static bool
-  can_implement(Arguments const& args) {
-    bool implementable = true;
-    auto transformed_problem_shape = cutlass::conv::detail::get_transformed_problem_shape_MNKL(args.problem_shape);
-
-    if (!implementable) {
-        CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Arguments or Problem Shape don't meet the requirements.\n");
-        return implementable;
-    }
-
-    implementable &= CollectiveMainloop::can_implement(args.problem_shape, args.mainloop);
-    implementable &= CollectiveEpilogue::can_implement(transformed_problem_shape, args.epilogue);
-    implementable &= TileScheduler::can_implement(args.scheduler);
-
-    return implementable;
-  }
-
-  static size_t
-  get_workspace_size(Arguments const& args) {
-    return 0;
-  }
-
-  static cutlass::Status
-  initialize_workspace(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-    return Status::kSuccess;
-  }
-
-  // Computes the kernel launch grid shape based on runtime parameters
-  static dim3
-  get_grid_shape(Params const& params) {
-    auto cluster_shape = ClusterShape{};
-    auto tile_shape = TileShape{};
-    auto problem_shape_MNKL = append<4>(params.problem_shape, Int<1>{});
-    return TileScheduler::get_tiled_cta_shape_mnl(
-        problem_shape_MNKL, tile_shape, cluster_shape);
-  }
-
-  static dim3
-  get_block_shape() {
-    return dim3(MaxThreadsPerBlock, 1, 1);
-  }
-
-  CUTLASS_DEVICE
-  void
-  operator()(Params const& params, char* smem_buf) {
-    using namespace cute;
-    using X = Underscore;
-
-#if defined(__CUDA_ARCH_FEAT_SM90_ALL)
-#  define ENABLE_SM90_KERNEL_LEVEL 1
-#endif
-// Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a.
-#if ! defined(ENABLE_SM90_KERNEL_LEVEL)
-    printf("ERROR : Arch conditional MMA instruction used without targeting sm90a compute capability. Aborting.\n");
-#else
-
-    enum class WarpGroupRole {
-      Producer = 0,
-      Consumer = 1,
-    };
-    enum class ProducerWarpRole {
-      MainloopEpilogue = 0,
-      Warp1 = 1,
-      Warp2 = 2,
-      Warp3 = 3
-    };
-
-    // Kernel level shared memory storage
-    SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(smem_buf);
-
-    int thread_idx = int(threadIdx.x);
-    int lane_idx = canonical_lane_idx();
-    int warp_idx = canonical_warp_idx_sync();
-    int warp_idx_in_warp_group = warp_idx % NumWarpsPerWarpGroup;
-    int warp_group_thread_idx = thread_idx % NumThreadsPerWarpGroup;
-    auto warp_group_role = WarpGroupRole(canonical_warp_group_idx());
-    auto producer_warp_role = ProducerWarpRole(warp_idx_in_warp_group);
-    int lane_predicate = cute::elect_one_sync();
-    uint32_t block_rank_in_cluster = cute::block_rank_in_cluster();
-
-
-    // Issue Tma Descriptor Prefetch from a single thread
-    if ((warp_idx == 0) && lane_predicate) {
-      CollectiveMainloop::prefetch_tma_descriptors(params.mainloop);
-      CollectiveEpilogue::prefetch_tma_descriptors(params.epilogue);
-    }
-
-    // Mainloop Load pipeline
-    using MainloopPipeline = typename CollectiveMainloop::MainloopPipeline;
-    typename MainloopPipeline::Params mainloop_pipeline_params;
-    if (warp_group_role == WarpGroupRole::Producer && producer_warp_role == ProducerWarpRole::MainloopEpilogue) {
-      mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Producer;
-    }
-    if (warp_group_role == WarpGroupRole::Consumer) {
-      mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Consumer;
-    }
-    mainloop_pipeline_params.is_leader = warp_group_thread_idx == 0;
-    mainloop_pipeline_params.num_consumers = NumThreadsPerWarpGroup;
-    mainloop_pipeline_params.transaction_bytes = params.mainloop.tma_transaction_bytes;
-    MainloopPipeline mainloop_pipeline(shared_storage.pipelines.mainloop, mainloop_pipeline_params, ClusterShape{});
-
-    // Epilogue Load pipeline
-    using EpiLoadPipeline = typename CollectiveEpilogue::LoadPipeline;
-    typename EpiLoadPipeline::Params epi_load_pipeline_params;
-    if (warp_group_role == WarpGroupRole::Producer && producer_warp_role == ProducerWarpRole::MainloopEpilogue) {
-      epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Producer;
-    }
-    if (warp_group_role == WarpGroupRole::Consumer) {
-      epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Consumer;
-    }
-    epi_load_pipeline_params.dst_blockid = cute::block_rank_in_cluster();
-    epi_load_pipeline_params.producer_arv_count = NumThreadsPerWarp;
-    epi_load_pipeline_params.consumer_arv_count = NumThreadsPerWarpGroup;
-    if constexpr (CollectiveEpilogue::RequiresTransactionBytes) {
-      epi_load_pipeline_params.transaction_bytes = params.epilogue.tma_transaction_bytes;
-    }
-    EpiLoadPipeline epi_load_pipeline(shared_storage.pipelines.epi_load, epi_load_pipeline_params);
-
-    // Epilogue Store pipeline
-    using EpiStorePipeline = typename CollectiveEpilogue::StorePipeline;
-    typename EpiStorePipeline::Params epi_store_pipeline_params;
-    epi_store_pipeline_params.always_wait = true;
-    EpiStorePipeline epi_store_pipeline(epi_store_pipeline_params);
-
-    // Initialize starting pipeline states for the collectives
-    // Epilogue store pipe is producer-only (consumer is TMA unit, waits via scoreboarding)
-    typename CollectiveMainloop::PipelineState mainloop_pipe_consumer_state;
-    typename CollectiveEpilogue::LoadPipelineState epi_load_pipe_consumer_state;
-
-    // For the DMA Load (producer) we start with an opposite phase
-    // i.e., we skip all waits since we know that the buffer is indeed empty
-    PipelineState mainloop_pipe_producer_state = cutlass::make_producer_start_state<MainloopPipeline>();
-    PipelineState epi_load_pipe_producer_state = cutlass::make_producer_start_state<EpiLoadPipeline>();
-    PipelineState epi_store_pipe_producer_state = cutlass::make_producer_start_state<EpiStorePipeline>();
-
-    auto cluster_wait_fn = [&] () {
-      // We need this to guarantee that the Pipeline init is visible
-      // To all producers and consumer thread blocks in the Cluster
-      if constexpr (size(ClusterShape{}) > 1) {
-        cute::cluster_arrive_relaxed();
-        return [] () { cute::cluster_wait(); };
-      }
-      else {
-        __syncthreads();
-        return [] () {}; // do nothing
-      }
-    } ();
-  
-    // Preconditions only valid for Gemm
-    static_assert(IsConvProblemShape || cute::rank(StrideA{}) == 3, "StrideA must be rank-3: [M, K, L]. If batch mode is not needed, set L stride to Int<0>.");
-    static_assert(IsConvProblemShape || cute::rank(StrideB{}) == 3, "StrideB must be rank-3: [N, K, L]. If batch mode is not needed, set L stride to Int<0>.");
-    static_assert(IsConvProblemShape || cute::rank(StrideC{}) == 3, "StrideC must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
-    static_assert(IsConvProblemShape || cute::rank(StrideD{}) == 3, "StrideD must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
-
-    // Get the appropriate blocks for this thread block -- potential for thread block locality
-    auto blk_shape = TileShape{}; // (BLK_M,BLK_N,BLK_K)
-    TiledMma tiled_mma;
-
-    // Optionally append 1s until problem shape is rank-4 in case it is only rank-3 (MNK)
-    // Using constexpr if (C++17 and later)
-    auto problem_shape_MNKL = append<4>(params.problem_shape, cute::Int<1>{});
-    
-    // In a warp specialized kernel, collectives expose data movement and compute operations separately
-    CollectiveMainloop collective_mainloop;
-    CollectiveEpilogue collective_epilogue(params.epilogue, shared_storage.tensors.epilogue);
-
-    // Prepare and partition the input tensors. 
-    // Expects a tuple of tensors for conv where:
-    // get<0>(load_inputs) is the tma tensor A after local tiling so that it has shape (BLK_M,BLK_K,m,k)
-    // get<1>(load_inputs) is the tma tensor B after local tiling so that it has shape (BLK_N,BLK_K,n,k)
-    auto load_inputs = collective_mainloop.load_init(problem_shape_MNKL, params.mainloop);
-    static_assert(cute::tuple_size_v<decltype(load_inputs)> >= 2, "Output of load_init must have at least two elements (A, B)");
-    
-    // Extract out partitioned A and B.
-    Tensor gA_mkl = get<0>(load_inputs);
-    Tensor gB_nkl = get<1>(load_inputs);
-
-    // Compute m_coord, n_coord, and l_coord with their post-tiled shapes
-    auto m_coord = idx2crd(int(blockIdx.x), shape<2>(gA_mkl));
-
-    auto n_coord = idx2crd(int(blockIdx.y), shape<2>(gB_nkl), compact_col_major(shape<2>(gB_nkl)));
-
-    // handles the difference between the rank of Tensor returned by load_input in case they do not have a batch mode
-    auto l_coord = [&] (auto const& gB_nkl_) {
-      // gB_nkl needs to be passed into the lambda because C++17
-      // does not permit lambda capture of structured bindings.
-      if constexpr (not IsConvProblemShape) {
-        // This needs to be inside an `if constexpr`,
-        // because shape<4>(gB_nkl) is not well-formed otherwise.
-        return idx2crd(int(blockIdx.z), shape<4>(gB_nkl_));
-      }
-      else {
-        return Int<0>{};
-      }
-    } (gB_nkl);
-
-    auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
-
-    // Get pipeline iterators and increments from tensor shapes
-    auto k_tile_iter  = cute::make_coord_iterator(shape<3>(gA_mkl));
-    auto k_tile_count = size<3>(gA_mkl);
-
-    // Wait for all thread blocks in the Cluster
-    cluster_wait_fn();
-
-    if (warp_group_role == WarpGroupRole::Producer) {
-      if (producer_warp_role == ProducerWarpRole::MainloopEpilogue) {
-        // Ensure that the prefetched kernel does not touch
-        // unflushed global memory prior to this instruction
-        cutlass::arch::wait_on_dependent_grids();
-        collective_mainloop.load(
-          params.mainloop,
-          mainloop_pipeline,
-          mainloop_pipe_producer_state,
-          load_inputs,
-          blk_coord,
-          k_tile_iter, k_tile_count,
-          lane_idx,
-          block_rank_in_cluster,
-          shared_storage.tensors.mainloop
-        );
-        // Update starting mainloop pipeline state for the pipeline drain
-        mainloop_pipe_producer_state.advance(k_tile_count);
-        // Make sure mainloop consumer has been waited upon before issuing epilogue load
-        collective_mainloop.load_tail(mainloop_pipeline, mainloop_pipe_producer_state);
-
-        if (collective_epilogue.is_producer_load_needed()) {
-          // Ensure warp is converged before issuing epilogue loads
-          __syncwarp();
-          epi_load_pipe_producer_state = collective_epilogue.load(
-            epi_load_pipeline,
-            epi_load_pipe_producer_state,
-            problem_shape_MNKL,
-            blk_shape,
-            blk_coord,
-            tiled_mma,
-            lane_idx,
-            shared_storage.tensors.epilogue
-          );
-          collective_epilogue.load_tail(epi_load_pipeline, epi_load_pipe_producer_state);
-        }
-      } 
-    }
-    else if (warp_group_role == WarpGroupRole::Consumer) {
-      Tensor accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape));                 // (MMA,MMA_M,MMA_N)
-
-      collective_mainloop.mma(
-        mainloop_pipeline,
-        mainloop_pipe_consumer_state,
-        accumulators,
-        k_tile_count,
-        warp_group_thread_idx,
-        shared_storage.tensors.mainloop,
-        params.mainloop
-      );
-
-      // Make sure the math instructions are done and free buffers before entering the epilogue
-      collective_mainloop.mma_tail(
-        mainloop_pipeline,
-        mainloop_pipe_consumer_state,
-        k_tile_count
-      );
-
-      // Hint on an early release of global memory resources.
-      // The timing of calling this function only influences performance,
-      // not functional correctness.
-      cutlass::arch::launch_dependent_grids();
-
-      // Epilogue and write to gD
-      auto [epi_load_pipe_consumer_state_next, epi_store_pipe_producer_state_next] =
-      collective_epilogue.store(
-        epi_load_pipeline,
-        epi_load_pipe_consumer_state,
-        epi_store_pipeline,
-        epi_store_pipe_producer_state,
-        problem_shape_MNKL,
-        blk_shape,
-        blk_coord,
-        accumulators,
-        tiled_mma,
-        warp_group_thread_idx,
-        shared_storage.tensors.epilogue
-      );
-
-      collective_epilogue.store_tail(
-        epi_load_pipeline,
-        epi_load_pipe_consumer_state_next,
-        epi_store_pipeline,
-        epi_store_pipe_producer_state_next
-      );
-    }
-#endif
-  }
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::gemm::kernel
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp
deleted file mode 100755
index 243a9e708..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp
+++ /dev/null
@@ -1,671 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/workspace.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/kernel_hardware_info.hpp"
-#include "cute/arch/cluster_sm90.hpp"
-#include "cutlass/arch/reg_reconfig.h"
-#include "cutlass/arch/mma_sm90.h"
-#include "cutlass/epilogue/collective/detail.hpp"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/dispatch_policy.hpp"
-#include "cutlass/gemm/kernel/tile_scheduler.hpp"
-#include "cutlass/pipeline/pipeline.hpp"
-#include "cute/tensor.hpp"
-#include "cutlass/trace.h"
-#include "cutlass/gemm/kernel/gemm_universal_decl.h"
-#include "cutlass/arch/grid_dependency_control.h"
-
-///////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm::kernel {
-
-///////////////////////////////////////////////////////////////////////////////
-
-template <
-  class ProblemShape_,
-  class CollectiveMainloop_,
-  class CollectiveEpilogue_,
-  class TileSchedulerTag_
->
-class GemmUniversal<
-  ProblemShape_,
-  CollectiveMainloop_,
-  CollectiveEpilogue_,
-  TileSchedulerTag_,
-  cute::enable_if_t<cute::is_base_of_v<KernelTmaWarpSpecializedCooperative, typename CollectiveMainloop_::DispatchPolicy::Schedule>>>
-{
-public:
-  //
-  // Type Aliases
-  //
-  using ProblemShape = ProblemShape_;
-  static_assert(cute::rank(ProblemShape{}) == 3 or cute::rank(ProblemShape{}) == 4,
-    "ProblemShape{} should be <M,N,K> or <M,N,K,L>");
-
-  // Mainloop derived types
-  using CollectiveMainloop = CollectiveMainloop_;
-  using TileShape = typename CollectiveMainloop::TileShape;
-  using TiledMma  = typename CollectiveMainloop::TiledMma;
-  using ArchTag   = typename CollectiveMainloop::ArchTag;
-  using ElementA  = typename CollectiveMainloop::ElementA;
-  using StrideA   = typename CollectiveMainloop::StrideA;
-  using ElementB  = typename CollectiveMainloop::ElementB;
-  using StrideB   = typename CollectiveMainloop::StrideB;
-  using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy;
-  using ElementAccumulator = typename CollectiveMainloop::ElementAccumulator;
-  using ClusterShape = typename DispatchPolicy::ClusterShape;
-  using MainloopArguments = typename CollectiveMainloop::Arguments;
-  using MainloopParams = typename CollectiveMainloop::Params;
-  // Epilogue derived types
-  using CollectiveEpilogue = CollectiveEpilogue_;
-  using ElementC = typename CollectiveEpilogue::ElementC;
-  using StrideC  = typename CollectiveEpilogue::StrideC;
-  using ElementD = typename CollectiveEpilogue::ElementD;
-  using StrideD  = typename CollectiveEpilogue::StrideD;
-  using EpilogueArguments = typename CollectiveEpilogue::Arguments;
-  using EpilogueParams = typename CollectiveEpilogue::Params;
-
-  static_assert(ArchTag::kMinComputeCapability >= 90);
-
-  using TileSchedulerTag = TileSchedulerTag_;
-
-  using TileScheduler = typename detail::TileSchedulerSelector<
-                                          TileSchedulerTag, 
-                                          ArchTag, 
-                                          TileShape,
-                                          ClusterShape
-                                          >::Scheduler;
-
-  using TileSchedulerArguments = typename TileScheduler::Arguments;
-  using TileSchedulerParams = typename TileScheduler::Params;
-  
-  // Warp specialization thread count per threadblock
-  static constexpr uint32_t NumMMAThreads          = size(TiledMma{});       // 8 warps
-  static constexpr uint32_t NumMainloopLoadThreads = NumThreadsPerWarp;      // 1 warp
-  static constexpr uint32_t NumEpilogueLoadThreads = NumThreadsPerWarp;      // 1 warp for C
-
-  static constexpr bool IsGdcEnabled = cutlass::arch::IsGdcGloballyEnabled;
-
-  static constexpr uint32_t NumLoadWarpGroups = 1;
-  static constexpr uint32_t NumMmaWarpGroups = NumMMAThreads / NumThreadsPerWarpGroup;
-  static constexpr uint32_t MaxThreadsPerBlock = NumMMAThreads + (NumLoadWarpGroups * NumThreadsPerWarpGroup);
-  static constexpr uint32_t MinBlocksPerMultiprocessor = 1;
-
-  /// Register requirement for Load and Math WGs
-  static constexpr uint32_t LoadRegisterRequirement = 40;
-  static constexpr uint32_t MmaRegisterRequirement = 232;
-
-  // 1 stage ordered sequence between mainloop and epilogue producer load threads
-  using LoadWarpOrderBarrier = cutlass::OrderedSequenceBarrier<1,2>;
-
-  // Kernel level shared memory storage
-  struct SharedStorage {
-    struct PipelineStorage : cute::aligned_struct<16, _1> {
-      using MainloopPipelineStorage = typename CollectiveMainloop::PipelineStorage;
-      using EpiLoadPipelineStorage = typename CollectiveEpilogue::PipelineStorage;
-
-      alignas(16) MainloopPipelineStorage mainloop;
-      alignas(16) EpiLoadPipelineStorage epi_load;
-      alignas(16) typename LoadWarpOrderBarrier::SharedStorage load_order;
-    } pipelines;
-
-    struct TensorStorage : cute::aligned_struct<128, _1> {
-      using MainloopTensorStorage = typename CollectiveMainloop::TensorStorage;
-      using EpilogueTensorStorage = typename CollectiveEpilogue::TensorStorage;
-
-      EpilogueTensorStorage epilogue;
-      MainloopTensorStorage mainloop;
-    } tensors;
-  };
-
-  static constexpr int SharedStorageSize = sizeof(SharedStorage);
-
-  // Device side arguments
-  struct Arguments {
-    GemmUniversalMode mode{};
-    ProblemShape problem_shape{};
-    MainloopArguments mainloop{};
-    EpilogueArguments epilogue{};
-    KernelHardwareInfo hw_info{};
-    TileSchedulerArguments scheduler{};
-  };
-
-  // Kernel entry point API
-  struct Params {
-    GemmUniversalMode mode{};
-    ProblemShape problem_shape{};
-    MainloopParams mainloop{};
-    EpilogueParams epilogue{};
-    KernelHardwareInfo hw_info{};
-    TileSchedulerParams scheduler{};
-    void* workspace{nullptr};
-  };
-
-  //
-  // Methods
-  //
-
-  // Convert to underlying arguments. In this case, a simple copy for the aliased type.
-  static
-  Params
-  to_underlying_arguments(Arguments const& args, void* workspace) {
-    CUTLASS_TRACE_HOST("to_underlying_arguments():");
-
-    auto problem_shape = args.problem_shape;
-    if constexpr (detail::Has_SwapAB_v<CollectiveMainloop>) {
-      // swap M/N
-      get<0>(problem_shape) = get<1>(args.problem_shape);
-      get<1>(problem_shape) = get<0>(args.problem_shape);
-    }
-    auto problem_shape_MNKL = append<4>(problem_shape, 1);
-
-    // Get SM count if needed, otherwise use user supplied SM count
-    int sm_count = args.hw_info.sm_count;
-    if (sm_count <= 0) {
-      CUTLASS_TRACE_HOST("  WARNING: Arguments do not include a valid SM count.\n"
-          "  For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count.");
-      sm_count = KernelHardwareInfo::query_device_multiprocessor_count(args.hw_info.device_id);
-    }
-
-    CUTLASS_TRACE_HOST("to_underlying_arguments(): Setting persistent grid SM count to " << sm_count);
-
-    KernelHardwareInfo hw_info{args.hw_info.device_id, sm_count};
-
-    // Calculate workspace pointers
-    uint8_t* workspace_ptr = reinterpret_cast<uint8_t*>(workspace);
-    size_t workspace_offset = 0;
-
-    void* scheduler_workspace = workspace_ptr;
-    workspace_offset += TileScheduler::template get_workspace_size<ProblemShape, ElementAccumulator>(
-      args.scheduler, args.problem_shape, args.hw_info, NumMmaWarpGroups);
-    workspace_offset = round_nearest(workspace_offset,  MinWorkspaceAlignment);
-
-    void* epilogue_workspace = workspace_ptr + workspace_offset;
-    workspace_offset += CollectiveEpilogue::get_workspace_size(args.problem_shape, args.epilogue);
-    workspace_offset = round_nearest(workspace_offset,  MinWorkspaceAlignment);
-
-    void* mainloop_workspace = nullptr;
-    // Precompute the sub tiles numbers in epilogue, pass into tile scheduler.  Therefore it will be used
-    // in separate reduction scheme for streamk case, NumEpilogueSubTiles default value is 1, which means
-    // subtile will not be used, therefore separate reduction will not be enabled.
-    constexpr uint32_t NumEpilogueSubTiles = CollectiveEpilogue::get_store_pipe_increment(TileShape{});
-    TileSchedulerParams scheduler = TileScheduler::to_underlying_arguments(
-      problem_shape_MNKL, TileShape{}, ClusterShape{}, hw_info, args.scheduler, scheduler_workspace, NumEpilogueSubTiles);
-
-    return {
-      args.mode,
-      problem_shape,
-      CollectiveMainloop::to_underlying_arguments(args.problem_shape, args.mainloop, mainloop_workspace),
-      CollectiveEpilogue::to_underlying_arguments(args.problem_shape, args.epilogue, epilogue_workspace),
-      hw_info,
-      scheduler,
-      workspace
-    };
-  }
-
-  static bool
-  can_implement(Arguments const& args) {
-    bool implementable = (args.mode == GemmUniversalMode::kGemm) or
-        (args.mode == GemmUniversalMode::kBatched && cute::rank(ProblemShape{}) == 4);
-    if (!implementable) {
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Arguments or Problem Shape don't meet the requirements.\n");
-      return implementable;
-    }
-    implementable &= CollectiveMainloop::can_implement(args.problem_shape, args.mainloop);
-    implementable &= CollectiveEpilogue::can_implement(args.problem_shape, args.epilogue);
-    implementable &= TileScheduler::can_implement(args.scheduler);
-    return implementable;
-  }
-
-  static size_t
-  get_workspace_size(Arguments const& args) {
-    size_t workspace_size = 0;
-    constexpr uint32_t NumEpilogueSubTiles = CollectiveEpilogue::get_store_pipe_increment(TileShape{});
-
-    workspace_size += TileScheduler::template get_workspace_size<ProblemShape, ElementAccumulator>(
-      args.scheduler, args.problem_shape, args.hw_info, NumMmaWarpGroups, NumEpilogueSubTiles);
-    workspace_size = round_nearest(workspace_size,  MinWorkspaceAlignment);
-
-    workspace_size += CollectiveEpilogue::get_workspace_size(args.problem_shape, args.epilogue);
-    workspace_size = round_nearest(workspace_size,  MinWorkspaceAlignment);
-
-    return workspace_size;
-  }
-
-  static cutlass::Status
-  initialize_workspace(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-    Status status = Status::kSuccess;
-    uint8_t* workspace_ptr = reinterpret_cast<uint8_t*>(workspace);
-    size_t workspace_offset = 0;
-    constexpr uint32_t NumEpilogueSubTiles = CollectiveEpilogue::get_store_pipe_increment(TileShape{});
-    static constexpr uint32_t NumAccumulatorMtxs = 1;
-
-    status = TileScheduler::template initialize_workspace<ProblemShape, ElementAccumulator>(
-      args.scheduler, workspace_ptr + workspace_offset, stream, args.problem_shape, args.hw_info, NumMmaWarpGroups, NumEpilogueSubTiles, NumAccumulatorMtxs, cuda_adapter);
-    workspace_offset += TileScheduler::template get_workspace_size<ProblemShape, ElementAccumulator>(
-      args.scheduler, args.problem_shape, args.hw_info, NumMmaWarpGroups, NumEpilogueSubTiles);
-    workspace_offset = round_nearest(workspace_offset,  MinWorkspaceAlignment);
-    if (status != Status::kSuccess) {
-      return status;
-    }
-
-    status = CollectiveEpilogue::initialize_workspace(args.problem_shape, args.epilogue, workspace_ptr + workspace_offset, stream, cuda_adapter);
-    workspace_offset += CollectiveEpilogue::get_workspace_size(args.problem_shape, args.epilogue);
-    workspace_offset = round_nearest(workspace_offset,  MinWorkspaceAlignment);
-    if (status != Status::kSuccess) {
-      return status;
-    }
-
-    return status;
-  }
-
-  // Computes the kernel launch grid shape based on runtime parameters
-  static dim3
-  get_grid_shape(Params const& params) {
-    // Given device SM count, set grid size s.t. we do not launch more thread blocks than we can run concurrently
-    TileSchedulerArguments args{};
-    if constexpr (!std::is_const_v<decltype(args.max_swizzle_size)>) {
-      args.max_swizzle_size = 1 << params.scheduler.log_swizzle_size_;
-    }
-    args.raster_order = params.scheduler.raster_order_ == TileScheduler::RasterOrder::AlongN ? TileScheduler::RasterOrderOptions::AlongN : TileScheduler::RasterOrderOptions::AlongM;
-    return TileScheduler::get_grid_shape(params.scheduler, params.problem_shape, TileShape{}, ClusterShape{}, params.hw_info, args);
-  }
-
-  static dim3
-  get_block_shape() {
-    return dim3(MaxThreadsPerBlock, 1, 1);
-  }
-
-  CUTLASS_DEVICE
-  void
-  operator()(Params const& params, char* smem_buf) {
-    using namespace cute;
-    using X = Underscore;
-#if defined(__CUDA_ARCH_FEAT_SM90_ALL)
-#  define ENABLE_SM90_KERNEL_LEVEL 1
-#endif
-// Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a.
-#if ! defined(ENABLE_SM90_KERNEL_LEVEL)
-    printf("ERROR : Arch conditional MMA instruction used without targeting appropriate compute capability. Aborting.\n");
-#else
-
-    // Preconditions
-    static_assert(NumMMAThreads == 256, "Cooperative kernel must have TiledMMA operating using 256 threads.");
-    static_assert(size<0>(TileShape{}) >= 128,
-        "Cooperative kernel requires Tile Size to be greater than or equal to 128 along the M-dimension.");
-
-    static_assert(cute::rank(StrideA{}) == 3, "StrideA must be rank-3: [M, K, L]. If batch mode is not needed, set L stride to Int<0>.");
-    static_assert(cute::rank(StrideB{}) == 3, "StrideB must be rank-3: [N, K, L]. If batch mode is not needed, set L stride to Int<0>.");
-    static_assert(cute::rank(StrideC{}) == 3, "StrideC must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
-    static_assert(cute::rank(StrideD{}) == 3, "StrideD must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
-
-    /* In the Cooperative kernel, Consumer0 and Consumer1 collaborate on the same tile */
-    enum class WarpGroupRole {
-      Producer = 0,
-      Consumer0 = 1,
-      Consumer1 = 2
-    };
-    enum class ProducerWarpRole {
-      Mainloop = 0,
-      Warp1 = 1,
-      Epilogue = 2,
-      Warp3 = 3
-    };
-
-
-
-    // Kernel level shared memory storage
-    SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(smem_buf);
-
-    int thread_idx = int(threadIdx.x);
-    int lane_idx = canonical_lane_idx();
-    int warp_idx = canonical_warp_idx_sync();
-    int warp_idx_in_warp_group = warp_idx % NumWarpsPerWarpGroup;
-    int warp_group_thread_idx = thread_idx % NumThreadsPerWarpGroup;
-    int mma_thread_idx = thread_idx % NumMMAThreads;
-    auto warp_group_role = WarpGroupRole(canonical_warp_group_idx());
-    auto producer_warp_role = ProducerWarpRole(warp_idx_in_warp_group);
-    int lane_predicate = cute::elect_one_sync();
-    uint32_t block_rank_in_cluster = cute::block_rank_in_cluster();
-
-    // Issue Tma Descriptor Prefetch from a single thread
-    if ((warp_idx == 0) && lane_predicate) {
-      CollectiveMainloop::prefetch_tma_descriptors(params.mainloop);
-      CollectiveEpilogue::prefetch_tma_descriptors(params.epilogue);
-    }
-
-    CollectiveEpilogue collective_epilogue(params.epilogue, shared_storage.tensors.epilogue);
-    bool is_epi_load_needed = collective_epilogue.is_producer_load_needed();
-    // Mainloop Load pipeline
-    using MainloopPipeline = typename CollectiveMainloop::MainloopPipeline;
-    typename MainloopPipeline::Params mainloop_pipeline_params;
-    if (warp_group_role == WarpGroupRole::Producer && producer_warp_role == ProducerWarpRole::Mainloop) {
-      mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Producer;
-    }
-    if (warp_group_role == WarpGroupRole::Consumer0 || warp_group_role == WarpGroupRole::Consumer1) {
-      mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Consumer;
-    }
-    mainloop_pipeline_params.is_leader = warp_group_thread_idx == 0;
-    mainloop_pipeline_params.num_consumers = NumMMAThreads;
-    mainloop_pipeline_params.transaction_bytes = params.mainloop.tma_transaction_bytes;
-    MainloopPipeline mainloop_pipeline(shared_storage.pipelines.mainloop, mainloop_pipeline_params, ClusterShape{});
-
-    // Epilogue Load pipeline
-    using EpiLoadPipeline = typename CollectiveEpilogue::LoadPipeline;
-    typename EpiLoadPipeline::Params epi_load_pipeline_params;
-    if (warp_group_role == WarpGroupRole::Producer && producer_warp_role == ProducerWarpRole::Epilogue) {
-      epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Producer;
-    } 
-    if (warp_group_role == WarpGroupRole::Consumer0 || warp_group_role == WarpGroupRole::Consumer1) {
-      epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Consumer;
-    }
-    epi_load_pipeline_params.dst_blockid = cute::block_rank_in_cluster();
-    epi_load_pipeline_params.producer_arv_count = NumEpilogueLoadThreads;
-    epi_load_pipeline_params.consumer_arv_count = NumMMAThreads;
-    if constexpr (CollectiveEpilogue::RequiresTransactionBytes) {
-      epi_load_pipeline_params.transaction_bytes = params.epilogue.tma_transaction_bytes;
-    }
-    EpiLoadPipeline epi_load_pipeline(shared_storage.pipelines.epi_load, epi_load_pipeline_params);
-
-    // Epilogue Store pipeline
-    using EpiStorePipeline = typename CollectiveEpilogue::StorePipeline;
-    typename EpiStorePipeline::Params epi_store_pipeline_params;
-    epi_store_pipeline_params.always_wait = true;
-    EpiStorePipeline epi_store_pipeline(epi_store_pipeline_params);
-
-    typename LoadWarpOrderBarrier::Params params_load_order_barrier;
-    params_load_order_barrier.group_id = producer_warp_role == ProducerWarpRole::Mainloop ? 0 : 1;
-    params_load_order_barrier.group_size = NumThreadsPerWarp;
-    LoadWarpOrderBarrier load_order_barrier(shared_storage.pipelines.load_order, params_load_order_barrier);
-
-    // Initialize starting pipeline states for the collectives
-    // Epilogue store pipe is producer-only (consumer is TMA unit, waits via scoreboarding)
-    typename CollectiveMainloop::PipelineState mainloop_pipe_consumer_state;
-    typename CollectiveEpilogue::LoadPipelineState epi_load_pipe_consumer_state;
-
-    // For the DMA Load (producer) we start with an opposite phase
-    // i.e., we skip all waits since we know that the buffer is indeed empty
-    PipelineState mainloop_pipe_producer_state = cutlass::make_producer_start_state<MainloopPipeline>();
-    PipelineState epi_load_pipe_producer_state = cutlass::make_producer_start_state<EpiLoadPipeline>();
-    PipelineState epi_store_pipe_producer_state = cutlass::make_producer_start_state<EpiStorePipeline>();
-
-
-    auto cluster_wait_fn = [] () {
-      // We need this to guarantee that the Pipeline init is visible
-      // To all producers and consumer thread blocks in the Cluster
-      if constexpr (size(ClusterShape{}) > 1) {
-        cute::cluster_arrive_relaxed();
-        return [] () { cute::cluster_wait(); };
-      }
-      else {
-        __syncthreads();
-        return [] () {}; // do nothing
-      }
-    } ();
-
-    // Optionally append 1s until problem shape is rank-4 in case it is only rank-3 (MNK)
-    auto problem_shape_MNKL = append<4>(params.problem_shape, Int<1>{});
-
-    // Get the appropriate blocks for this thread block -- potential for thread block locality
-    TiledMma tiled_mma;
-    auto blk_shape = TileShape{};                                                                // (BLK_M,BLK_N,BLK_K)
-
-    TileScheduler scheduler{params.scheduler};
-    auto work_tile_info = scheduler.initial_work_tile_info(ClusterShape{});
-    
-    // In a warp specialized kernel, collectives expose data movement and compute operations separately
-    CollectiveMainloop collective_mainloop;
-
-    // Prepare and partition the input tensors. Expects a tuple of tensors where:
-    // get<0>(load_inputs) is the tma tensor A after local tiling so that it has shape (BLK_M,BLK_K,m,k,l)
-    // get<1>(load_inputs) is the tma tensor B after local tiling so that it has shape (BLK_N,BLK_K,n,k,l)
-    auto load_inputs = collective_mainloop.load_init(problem_shape_MNKL, params.mainloop);
-    static_assert(cute::tuple_size_v<decltype(load_inputs)> >= 2, "Output of load_init must have at least two elements (A, B)");
-
-    // Extract out partitioned A and B.
-    Tensor gA_mkl = get<0>(load_inputs);
-    Tensor gB_nkl = get<1>(load_inputs);
-
-    // Wait for all thread blocks in the Cluster
-    cluster_wait_fn();
-
-    if (warp_group_role == WarpGroupRole::Producer) {
-      cutlass::arch::warpgroup_reg_dealloc<LoadRegisterRequirement>();
-
-      // Mainloop Producer Warp
-      if (producer_warp_role == ProducerWarpRole::Mainloop) {
-        // Ensure that the prefetched kernel does not touch
-        // unflushed global memory prior to this instruction
-        cutlass::arch::wait_on_dependent_grids();
-        bool do_load_order_arrive = true;
-        while (work_tile_info.is_valid()) {
-          if (!TileScheduler::valid_warpgroup_in_work_tile(work_tile_info)) {
-            auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work(work_tile_info);
-            work_tile_info = next_work_tile_info;   
-            continue;
-          }
-
-          // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape
-          auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl));
-          auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl));
-          auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl));
-          auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
-
-          // Get the number of K tiles to compute for this work as well as the starting K tile offset of the work.
-          auto work_k_tile_count = TileScheduler::get_work_k_tile_count(work_tile_info, problem_shape_MNKL, blk_shape);
-          auto work_k_tile_start = TileScheduler::get_work_k_tile_start(work_tile_info);
-
-          auto k_tile_iter = cute::make_coord_iterator(idx2crd(work_k_tile_start, shape<3>(gA_mkl)), shape<3>(gA_mkl));
-
-          collective_mainloop.load(
-            params.mainloop,
-            mainloop_pipeline,
-            mainloop_pipe_producer_state,
-            load_inputs,
-            blk_coord,
-            k_tile_iter, work_k_tile_count,
-            lane_idx,
-            block_rank_in_cluster,
-            shared_storage.tensors.mainloop
-          );
-          // Update starting pipeline state for the next tile
-          mainloop_pipe_producer_state.advance(work_k_tile_count);
-
-          // Signal for the epilogue load warp to begin
-          if (do_load_order_arrive) {
-            load_order_barrier.arrive();
-            do_load_order_arrive = false;
-          }
-          // Get next work tile
-          auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work(work_tile_info
-                                                                           );
-
-          work_tile_info = next_work_tile_info;
-        } // Scheduler work fetch loop
-
-        // Make sure all Consumer Warp Groups have been waited upon
-        collective_mainloop.load_tail(mainloop_pipeline, mainloop_pipe_producer_state);
-
-      } // Mainloop Producer Warp End
-
-      // Epilogue Producer Warp
-      else if (producer_warp_role == ProducerWarpRole::Epilogue && is_epi_load_needed) {
-
-        // Ensure that the prefetched kernel does not touch
-        // unflushed global memory prior to this instruction
-        cutlass::arch::wait_on_dependent_grids();
-
-        if (!TileScheduler::requires_separate_reduction(params.scheduler) && work_tile_info.is_valid()) {
-          load_order_barrier.wait();
-        }
-
-        CollectiveEpilogue collective_epilogue(params.epilogue, shared_storage.tensors.epilogue);
-
-        while (work_tile_info.is_valid()) {
-          if (TileScheduler::compute_epilogue(work_tile_info, params.scheduler)) {
-            // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape
-            auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl));
-            auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl));
-            auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl));
-            auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
-            
-            epi_load_pipe_producer_state =
-            collective_epilogue.load(
-              epi_load_pipeline,
-              epi_load_pipe_producer_state,
-              problem_shape_MNKL,
-              blk_shape,
-              blk_coord,
-              tiled_mma,
-              lane_idx,
-              shared_storage.tensors.epilogue,
-              work_tile_info.reduction_subtile_idx()
-            );
-          }
-
-          // Get next work tile
-          auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work(work_tile_info
-                                                                           );
-          work_tile_info = next_work_tile_info;
-        } // Scheduler work fetch loop
-
-        // Make sure all Consumer Warp Groups have been waited upon
-        collective_epilogue.load_tail(epi_load_pipeline, epi_load_pipe_producer_state);
-      } // Epilogue Producer Warp End
-    } // Producer Warp Group End
-
-    else if (warp_group_role == WarpGroupRole::Consumer0 || warp_group_role == WarpGroupRole::Consumer1) {
-      cutlass::arch::warpgroup_reg_alloc<MmaRegisterRequirement>();
-
-      CollectiveEpilogue collective_epilogue(params.epilogue, shared_storage.tensors.epilogue);
-
-      // Do we potentially issue tail arrives for TMA stores, if epilogue load is waiting for it
-      bool do_store_tail = false;
-      while (work_tile_info.is_valid()) {
-        // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape
-        auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl));
-        auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl));
-        auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl));
-        auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
-        auto work_k_tile_count = TileScheduler::get_work_k_tile_count(work_tile_info, problem_shape_MNKL, blk_shape);
-
-        // Allocate the accumulators for the (M,N) blk_shape
-        //
-        // MSVC CTAD breaks if we say "Tensor" here, so we use "auto" instead.
-        auto accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape));               // (MMA,MMA_M,MMA_N)
-        if (TileScheduler::valid_warpgroup_in_work_tile(work_tile_info)) {
-          collective_mainloop.mma(
-            mainloop_pipeline,
-            mainloop_pipe_consumer_state,
-            accumulators,
-            work_k_tile_count,
-            mma_thread_idx,
-            shared_storage.tensors.mainloop,
-            params.mainloop
-          );
-
-          // Make sure the math instructions are done and free buffers before entering the epilogue
-          collective_mainloop.mma_tail(
-            mainloop_pipeline,
-            mainloop_pipe_consumer_state,
-            work_k_tile_count
-          );
-
-          // Update starting mainloop pipeline state for the next tile
-          mainloop_pipe_consumer_state.advance(work_k_tile_count);
-        }
-        #ifdef CUTLASS_ENABLE_GDC_FOR_SM90
-        if (scheduler.is_last_tile(work_tile_info)) {
-          // Hint on an early release of global memory resources.
-          // The timing of calling this function only influences performance,
-          // not functional correctness.
-          cutlass::arch::launch_dependent_grids();
-
-        }
-        #endif
-
-        // Index of warp group within consumer warp groups
-        int consumer_warp_group_idx = canonical_warp_group_idx() - NumLoadWarpGroups;
-
-        // Perform reduction across splits, if needed
-        TileScheduler::fixup(
-          params.scheduler, work_tile_info, accumulators, NumMmaWarpGroups, consumer_warp_group_idx);
-
-        if (TileScheduler::compute_epilogue(work_tile_info, params.scheduler)) {
-          // Epilogue and write to gD
-          auto [epi_load_pipe_consumer_state_next, epi_store_pipe_producer_state_next] =
-          collective_epilogue.store(
-            epi_load_pipeline,
-            epi_load_pipe_consumer_state,
-            epi_store_pipeline,
-            epi_store_pipe_producer_state,
-            problem_shape_MNKL,
-            blk_shape,
-            blk_coord,
-            accumulators,
-            tiled_mma,
-            mma_thread_idx,
-            shared_storage.tensors.epilogue,
-            work_tile_info.reduction_subtile_idx()
-          );
-          epi_load_pipe_consumer_state = epi_load_pipe_consumer_state_next;
-          epi_store_pipe_producer_state = epi_store_pipe_producer_state_next;
-          do_store_tail = true;
-        }
-
-        // Get next work tile
-        auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work(work_tile_info
-                                                                          );
-        work_tile_info = next_work_tile_info;
-      } // Scheduler work fetch loop
-
-      if (do_store_tail) {
-        collective_epilogue.store_tail(
-          epi_load_pipeline,
-          epi_load_pipe_consumer_state,
-          epi_store_pipeline,
-          epi_store_pipe_producer_state
-        );
-      }
-    } // Consumer Warp Groups End
-#endif
-  }
-
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::gemm::kernel
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp
deleted file mode 100755
index cf4a552cb..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp
+++ /dev/null
@@ -1,664 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/workspace.h"
-#include "cutlass/kernel_hardware_info.hpp"
-#include "cutlass/fast_math.h"
-#include "cute/arch/cluster_sm90.hpp"
-#include "cutlass/arch/reg_reconfig.h"
-#include "cutlass/arch/mma_sm90.h"
-#include "cutlass/epilogue/collective/detail.hpp"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/dispatch_policy.hpp"
-#include "cutlass/gemm/kernel/sm90_tile_scheduler.hpp"
-#include "cutlass/gemm/kernel/tile_scheduler.hpp"
-#include "cutlass/gemm/kernel/gemm_universal_decl.h"
-#include "cutlass/pipeline/pipeline.hpp"
-#include "cutlass/trace.h"
-
-#include "cute/tensor.hpp"
-#include "cutlass/arch/grid_dependency_control.h"
-
-///////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm::kernel {
-
-///////////////////////////////////////////////////////////////////////////////
-
-template <
-  class ProblemShape_,
-  class CollectiveMainloop_,
-  class CollectiveEpilogue_,
-  class TileScheduler_
->
-class GemmUniversal<
-  ProblemShape_,
-  CollectiveMainloop_,
-  CollectiveEpilogue_,
-  TileScheduler_,
-  cute::enable_if_t<cute::is_base_of_v<KernelTmaWarpSpecializedPingpong, typename CollectiveMainloop_::DispatchPolicy::Schedule>>>
-{
-public:
-  //
-  // Type Aliases
-  //
-  using ProblemShape = ProblemShape_;
-  static_assert(cute::rank(ProblemShape{}) == 3 or cute::rank(ProblemShape{}) == 4,
-    "ProblemShape{} should be <M,N,K> or <M,N,K,L>");
-  static constexpr bool IsGdcEnabled = cutlass::arch::IsGdcGloballyEnabled;
-
-  // Mainloop derived types
-  using CollectiveMainloop = CollectiveMainloop_;
-  using TileShape = typename CollectiveMainloop::TileShape;
-  using TiledMma  = typename CollectiveMainloop::TiledMma;
-  using ArchTag   = typename CollectiveMainloop::ArchTag;
-  using ElementA  = typename CollectiveMainloop::ElementA;
-  using StrideA   = typename CollectiveMainloop::StrideA;
-  using ElementB  = typename CollectiveMainloop::ElementB;
-  using StrideB   = typename CollectiveMainloop::StrideB;
-  using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy;
-  using ElementAccumulator = typename CollectiveMainloop::ElementAccumulator;
-  using ClusterShape = typename DispatchPolicy::ClusterShape;
-  using MainloopArguments = typename CollectiveMainloop::Arguments;
-  using MainloopParams = typename CollectiveMainloop::Params;
-  static_assert(ArchTag::kMinComputeCapability >= 90);
-
-  // Epilogue derived types
-  using CollectiveEpilogue = CollectiveEpilogue_;
-  using ElementC = typename CollectiveEpilogue::ElementC;
-  using StrideC  = typename CollectiveEpilogue::StrideC;
-  using ElementD = typename CollectiveEpilogue::ElementD;
-  using StrideD  = typename CollectiveEpilogue::StrideD;
-  using EpilogueArguments = typename CollectiveEpilogue::Arguments;
-  using EpilogueParams = typename CollectiveEpilogue::Params;
-
-  static_assert(!cute::is_same_v<TileScheduler_, StreamKScheduler>, "Ping-pong kernel does not currently support stream-K scheduler.");
-  using TileSchedulerTag = TileScheduler_;
-  using TileScheduler = typename detail::TileSchedulerSelector<
-    TileScheduler_, ArchTag, TileShape, ClusterShape>::Scheduler;
-  using TileSchedulerArguments = typename TileScheduler::Arguments;
-  using TileSchedulerParams = typename TileScheduler::Params;
-
-  static constexpr uint32_t NumLoadWarpGroups = 1;
-  static constexpr uint32_t NumMmaWarpGroups = 2;
-  static constexpr uint32_t MaxThreadsPerBlock = CUTE_STATIC_V(size(TiledMma{})) + (NumMmaWarpGroups * NumThreadsPerWarpGroup);
-  static constexpr uint32_t MinBlocksPerMultiprocessor = 1;
-
-  /// Register requirement for Load and Math WGs
-  static constexpr uint32_t LoadRegisterRequirement = 40;
-  static constexpr uint32_t MmaRegisterRequirement = 232;
-
-  // 1 stage ordered sequence between mainloop and epilogue producer load threads
-  using LoadWarpOrderBarrier = cutlass::OrderedSequenceBarrier<1,2>;
-
-  // Order Sequence barrier with two stages: one for Mainloop and one for Epilogue
-  static constexpr uint32_t StagesPerMathWarpGroup = 2;
-  using MathWarpGroupOrderBarrier = cutlass::OrderedSequenceBarrier<
-    StagesPerMathWarpGroup, NumMmaWarpGroups>;
-  using MathWarpGroupOrderBarrierSharedStorage =
-    cutlass::PipelineDetail::OrderedSequenceBarrierSharedStorage<
-      MathWarpGroupOrderBarrier::SequenceDepth,
-      MathWarpGroupOrderBarrier::SequenceLength>;
-
-  // Kernel level shared memory storage
-  struct SharedStorage {
-    struct PipelineStorage : cute::aligned_struct<16, _1> {
-      using MainloopPipelineStorage = typename CollectiveMainloop::PipelineStorage;
-      using EpiLoadPipelineStorage = typename CollectiveEpilogue::PipelineStorage;
-      using MathWarpGroupOrderBarrierStorage = MathWarpGroupOrderBarrierSharedStorage;
-
-      alignas(16) MainloopPipelineStorage mainloop;
-      alignas(16) EpiLoadPipelineStorage epi_load;
-      alignas(16) MathWarpGroupOrderBarrierStorage math_wg_order;
-      alignas(16) typename LoadWarpOrderBarrier::SharedStorage load_order;
-    } pipelines;
-
-    struct TensorStorage : cute::aligned_struct<128, _1> {
-      using MainloopTensorStorage = typename CollectiveMainloop::TensorStorage;
-      using EpilogueTensorStorage = typename CollectiveEpilogue::TensorStorage;
-
-      EpilogueTensorStorage epilogue;
-      MainloopTensorStorage mainloop;
-    } tensors;
-  };
-
-  static constexpr int SharedStorageSize = sizeof(SharedStorage);
-
-  // Device side arguments
-  struct Arguments {
-    GemmUniversalMode mode{};
-    ProblemShape problem_shape{};
-    MainloopArguments mainloop{};
-    EpilogueArguments epilogue{};
-    KernelHardwareInfo hw_info{};
-    TileSchedulerArguments scheduler{};
-  };
-
-  // Kernel entry point API
-  struct Params {
-    GemmUniversalMode mode{};
-    ProblemShape problem_shape{};
-    MainloopParams mainloop{};
-    EpilogueParams epilogue{};
-    KernelHardwareInfo hw_info{};
-    TileSchedulerParams scheduler{};
-  };
-
-  //
-  // Methods
-  //
-
-  // Convert to underlying arguments. In this case, a simple copy for the aliased type.
-  static
-  Params
-  to_underlying_arguments(Arguments const& args, void* workspace) {
-    CUTLASS_TRACE_HOST("to_underlying_arguments():");
-
-    (void) workspace;
-    auto problem_shape = args.problem_shape;
-    if constexpr (detail::Has_SwapAB_v<CollectiveMainloop>) {
-      // swap M/N
-      get<0>(problem_shape) = get<1>(args.problem_shape);
-      get<1>(problem_shape) = get<0>(args.problem_shape);
-    }
-    auto problem_shape_MNKL = append<4>(problem_shape, 1);
-
-    // Get SM count if needed, otherwise use user supplied SM count
-    int sm_count = args.hw_info.sm_count;
-    if (sm_count <= 0) {
-      CUTLASS_TRACE_HOST("  WARNING: Arguments do not include a valid SM count.\n"
-          "  For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count.");
-      sm_count = KernelHardwareInfo::query_device_multiprocessor_count(args.hw_info.device_id);
-    }
-
-    CUTLASS_TRACE_HOST("to_underlying_arguments(): Setting persistent grid SM count to " << sm_count);
-    KernelHardwareInfo hw_info{args.hw_info.device_id, sm_count};
-
-    // Calculate workspace pointers
-    uint8_t* workspace_ptr = reinterpret_cast<uint8_t*>(workspace);
-    size_t workspace_offset = 0;
-
-    void* scheduler_workspace = workspace_ptr;
-    workspace_offset += TileScheduler::template get_workspace_size<ProblemShape, ElementAccumulator>(
-      args.scheduler, args.problem_shape, args.hw_info, NumMmaWarpGroups);
-    workspace_offset = round_nearest(workspace_offset,  MinWorkspaceAlignment);
-
-    void* epilogue_workspace = workspace_ptr + workspace_offset;
-    workspace_offset += CollectiveEpilogue::get_workspace_size(args.problem_shape, args.epilogue);
-    workspace_offset = round_nearest(workspace_offset,  MinWorkspaceAlignment);
-
-    void* mainloop_workspace = nullptr;
-
-    return {
-      args.mode,
-      problem_shape,
-      CollectiveMainloop::to_underlying_arguments(args.problem_shape, args.mainloop, mainloop_workspace),
-      CollectiveEpilogue::to_underlying_arguments(args.problem_shape, args.epilogue, epilogue_workspace),
-      hw_info,
-      TileScheduler::to_underlying_arguments(problem_shape_MNKL, TileShape{}, ClusterShape{}, hw_info, args.scheduler, scheduler_workspace)
-    };
-  }
-
-  static bool
-  can_implement(Arguments const& args) {
-    bool implementable = (args.mode == GemmUniversalMode::kGemm) or
-        (args.mode == GemmUniversalMode::kBatched && cute::rank(ProblemShape{}) == 4);
-    if (!implementable) {
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Arguments or Problem Shape don't meet the requirements.\n");
-      return implementable;
-    }
-    implementable &= CollectiveMainloop::can_implement(args.problem_shape, args.mainloop);
-    implementable &= CollectiveEpilogue::can_implement(args.problem_shape, args.epilogue);
-    implementable &= TileScheduler::can_implement(args.scheduler);
-
-    return implementable;
-  }
-
-  static size_t
-  get_workspace_size(Arguments const& args) {
-    size_t workspace_size = 0;
-    workspace_size += TileScheduler::template get_workspace_size<ProblemShape, ElementAccumulator>(
-      args.scheduler, args.problem_shape, args.hw_info, NumMmaWarpGroups);
-    workspace_size = round_nearest(workspace_size,  MinWorkspaceAlignment);
-
-    workspace_size += CollectiveEpilogue::get_workspace_size(args.problem_shape, args.epilogue);
-    workspace_size = round_nearest(workspace_size,  MinWorkspaceAlignment);
-
-    return workspace_size;
-  }
-
-  static cutlass::Status
-  initialize_workspace(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-    Status status = Status::kSuccess;
-    uint8_t* workspace_ptr = reinterpret_cast<uint8_t*>(workspace);
-    size_t workspace_offset = 0;
-    static constexpr uint32_t NumEpilogueSubTiles = 1;
-    static constexpr uint32_t NumAccumulatorMtxs = 1;
-
-    status = TileScheduler::template initialize_workspace<ProblemShape, ElementAccumulator>(
-      args.scheduler, workspace_ptr + workspace_offset, stream, args.problem_shape, args.hw_info, NumMmaWarpGroups, NumEpilogueSubTiles, NumAccumulatorMtxs, cuda_adapter);
-    workspace_offset += TileScheduler::template get_workspace_size<ProblemShape, ElementAccumulator>(
-      args.scheduler, args.problem_shape, args.hw_info, NumMmaWarpGroups);
-    workspace_offset = round_nearest(workspace_offset,  MinWorkspaceAlignment);
-    if (status != Status::kSuccess) {
-      return status;
-    }
-
-    status = CollectiveEpilogue::initialize_workspace(args.problem_shape, args.epilogue, workspace_ptr + workspace_offset, stream, cuda_adapter);
-    workspace_offset += CollectiveEpilogue::get_workspace_size(args.problem_shape, args.epilogue);
-    workspace_offset = round_nearest(workspace_offset,  MinWorkspaceAlignment);
-    if (status != Status::kSuccess) {
-      return status;
-    }
-
-    return status;
-  }
-
-  // Computes the kernel launch grid shape based on runtime parameters
-  static dim3
-  get_grid_shape(Params const& params) {
-    // Given device SM count, set grid size s.t. we do not launch more thread blocks than we can run concurrently
-    TileSchedulerArguments args{};
-    if constexpr (!std::is_const_v<decltype(args.max_swizzle_size)>) {
-      args.max_swizzle_size = 1 << params.scheduler.log_swizzle_size_;
-    }
-    args.raster_order = params.scheduler.raster_order_ == TileScheduler::RasterOrder::AlongN ? TileScheduler::RasterOrderOptions::AlongN : TileScheduler::RasterOrderOptions::AlongM;
-    return TileScheduler::get_grid_shape(params.scheduler, params.problem_shape, TileShape{}, ClusterShape{}, params.hw_info, args);
-  }
-
-  static dim3
-  get_block_shape() {
-    return dim3(MaxThreadsPerBlock, 1, 1);
-  }
-
-  CUTLASS_DEVICE
-  void
-  operator()(Params const& params, char* smem_buf) {
-    using namespace cute;
-    using X = Underscore;
-
-// Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a.
-#if ! defined(__CUDA_ARCH_FEAT_SM90_ALL)
-    printf("ERROR : Arch conditional MMA instruction used without targeting sm90a compute capability. Aborting.\n");
-#else
-
-    // Preconditions
-    static_assert(cute::rank(StrideA{}) == 3, "StrideA must be rank-3: [M, K, L]. If batch mode is not needed, set L stride to Int<0>.");
-    static_assert(cute::rank(StrideB{}) == 3, "StrideB must be rank-3: [N, K, L]. If batch mode is not needed, set L stride to Int<0>.");
-    static_assert(cute::rank(StrideC{}) == 3, "StrideC must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
-    static_assert(cute::rank(StrideD{}) == 3, "StrideD must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
-
-    enum class WarpGroupRole {
-      Producer = 0,
-      Consumer0 = 1,
-      Consumer1 = 2
-    };
-    enum class ProducerWarpRole {
-      Mainloop = 0,
-      Warp1 = 1,
-      Epilogue = 2,
-      Warp3 = 3
-    };
-
-    // Kernel level shared memory storage
-    SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(smem_buf);
-
-    int thread_idx = int(threadIdx.x);
-    int lane_idx = canonical_lane_idx();
-    int warp_idx = canonical_warp_idx_sync();
-    int warp_idx_in_warp_group = warp_idx % NumWarpsPerWarpGroup;
-    int warp_group_thread_idx = thread_idx % NumThreadsPerWarpGroup;
-    auto warp_group_role = WarpGroupRole(canonical_warp_group_idx());
-    auto producer_warp_role = ProducerWarpRole(warp_idx_in_warp_group);
-    int lane_predicate = cute::elect_one_sync();
-    uint32_t block_rank_in_cluster = cute::block_rank_in_cluster();
-
-    // Issue Tma Descriptor Prefetch from a single thread
-    if ((warp_idx == 0) && lane_predicate) {
-      CollectiveMainloop::prefetch_tma_descriptors(params.mainloop);
-      CollectiveEpilogue::prefetch_tma_descriptors(params.epilogue);
-    }
-
-    // Mainloop Load pipeline
-    using MainloopPipeline = typename CollectiveMainloop::MainloopPipeline;
-    typename MainloopPipeline::Params mainloop_pipeline_params;
-    if (warp_group_role == WarpGroupRole::Producer && producer_warp_role == ProducerWarpRole::Mainloop) {
-      mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Producer;
-    }
-    if (warp_group_role == WarpGroupRole::Consumer0 || warp_group_role == WarpGroupRole::Consumer1) {
-      mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Consumer;
-    }
-    mainloop_pipeline_params.is_leader = warp_group_thread_idx == 0;
-    mainloop_pipeline_params.num_consumers = NumThreadsPerWarpGroup;
-    mainloop_pipeline_params.transaction_bytes = params.mainloop.tma_transaction_bytes;
-    MainloopPipeline mainloop_pipeline(shared_storage.pipelines.mainloop, mainloop_pipeline_params, ClusterShape{});
-
-    // Epilogue Load pipeline
-    using EpiLoadPipeline = typename CollectiveEpilogue::LoadPipeline;
-    typename EpiLoadPipeline::Params epi_load_pipeline_params;
-    if (warp_group_role == WarpGroupRole::Producer && producer_warp_role == ProducerWarpRole::Epilogue) {
-      epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Producer;
-    }
-    if (warp_group_role == WarpGroupRole::Consumer0 || warp_group_role == WarpGroupRole::Consumer1) {
-      epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Consumer;
-    }
-    epi_load_pipeline_params.dst_blockid = cute::block_rank_in_cluster();
-    epi_load_pipeline_params.producer_arv_count = NumThreadsPerWarp;
-    epi_load_pipeline_params.consumer_arv_count = NumThreadsPerWarpGroup;
-    if constexpr (CollectiveEpilogue::RequiresTransactionBytes) {
-      epi_load_pipeline_params.transaction_bytes = params.epilogue.tma_transaction_bytes;
-    }
-    EpiLoadPipeline epi_load_pipeline(shared_storage.pipelines.epi_load, epi_load_pipeline_params);
-
-    // Epilogue Store pipeline
-    using EpiStorePipeline = typename CollectiveEpilogue::StorePipeline;
-    typename EpiStorePipeline::Params epi_store_pipeline_params;
-    epi_store_pipeline_params.always_wait = true;
-    EpiStorePipeline epi_store_pipeline(epi_store_pipeline_params);
-
-    typename LoadWarpOrderBarrier::Params params_load_order_barrier;
-    params_load_order_barrier.group_id = producer_warp_role == ProducerWarpRole::Mainloop ? 0 : 1;
-    params_load_order_barrier.group_size = NumThreadsPerWarp;
-    LoadWarpOrderBarrier load_order_barrier(shared_storage.pipelines.load_order, params_load_order_barrier);
-
-    typename MathWarpGroupOrderBarrier::Params params_math_wg_order_barrier;
-    // DMA Load WG will not participate in these Ordered Barrier syncs
-    params_math_wg_order_barrier.group_id = canonical_warp_group_idx() - static_cast<int>(WarpGroupRole::Consumer0);
-    params_math_wg_order_barrier.group_size = NumThreadsPerWarpGroup; // Number of threads / participants in a group
-    MathWarpGroupOrderBarrier math_wg_order_barrier(shared_storage.pipelines.math_wg_order, params_math_wg_order_barrier);
-
-    // Initialize starting pipeline states for the collectives
-    // Epilogue store pipe is producer-only (consumer is TMA unit, waits via scoreboarding)
-    typename CollectiveMainloop::PipelineState mainloop_pipe_consumer_state;
-    typename CollectiveEpilogue::LoadPipelineState epi_load_pipe_consumer_state;
-
-    // For the DMA Load (producer) we start with an opposite phase
-    // i.e., we skip all waits since we know that the buffer is indeed empty
-    PipelineState mainloop_pipe_producer_state = cutlass::make_producer_start_state<MainloopPipeline>();
-    PipelineState epi_load_pipe_producer_state = cutlass::make_producer_start_state<EpiLoadPipeline>();
-    PipelineState epi_store_pipe_producer_state = cutlass::make_producer_start_state<EpiStorePipeline>();
-
-    auto cluster_wait_fn = [&] () {
-      // We need this to guarantee that the Pipeline init is visible
-      // To all producers and consumer thread blocks in the Cluster
-      if constexpr (size(ClusterShape{}) > 1) {
-        cute::cluster_arrive_relaxed();
-        return [] () { cute::cluster_wait(); };
-      }
-      else {
-        __syncthreads();
-        return [] () {}; // do nothing
-      }
-    } ();
-
-    // Separate out problem shape for convenience
-    // Optionally append 1s until problem shape is rank-4 in case it is only rank-3 (MNK)
-    auto problem_shape_MNKL = append<4>(params.problem_shape, Int<1>{});
-
-    // Get the appropriate blocks for this thread block -- potential for thread block locality
-    TiledMma tiled_mma;
-    auto blk_shape = TileShape{};                                                                // (BLK_M,BLK_N,BLK_K)
-
-    // In a warp specialized kernel, collectives expose data movement and compute operations separately
-    CollectiveMainloop collective_mainloop;
-    CollectiveEpilogue collective_epilogue(params.epilogue, shared_storage.tensors.epilogue);
-
-    // Prepare and partition the input tensors. Expects a tuple of tensors where:
-    // get<0>(load_inputs) is the tma tensor A after local tiling so that it has shape (BLK_M,BLK_K,m,k,l)
-    // get<1>(load_inputs) is the tma tensor B after local tiling so that it has shape (BLK_N,BLK_K,n,k,l)
-    auto load_inputs = collective_mainloop.load_init(problem_shape_MNKL, params.mainloop);
-    static_assert(cute::tuple_size_v<decltype(load_inputs)> >= 2, "Output of load_init must have at least two elements (A, B)");
-
-    // Extract out partitioned A and B.
-    Tensor gA_mkl = get<0>(load_inputs);
-    Tensor gB_nkl = get<1>(load_inputs);
-
-    // Get pipeline stage increments from tensor shapes
-    auto k_tile_count = size<3>(gA_mkl);
-    auto c_tile_count = CollectiveEpilogue::get_load_pipe_increment(blk_shape);
-    auto d_tile_count = CollectiveEpilogue::get_store_pipe_increment(blk_shape);
-
-    TileScheduler scheduler{params.scheduler};
-
-    if (warp_group_role == WarpGroupRole::Consumer1) {
-      // Advance 2nd Math WG to the next work tile for the startup
-      scheduler.advance_to_next_work();
-      // Advance 2nd Math WG pipeline states to the end of 1st Math WG
-      mainloop_pipe_consumer_state.advance(k_tile_count);
-      epi_load_pipe_consumer_state.advance(c_tile_count);
-      epi_store_pipe_producer_state.advance(d_tile_count);
-    }
-    auto work_tile_info = scheduler.initial_work_tile_info(ClusterShape{});
-
-    // Wait for all thread blocks in the Cluster
-    cluster_wait_fn();
-
-    if (warp_group_role == WarpGroupRole::Producer) {
-      cutlass::arch::warpgroup_reg_dealloc<LoadRegisterRequirement>();
-
-      // Mainloop Producer Warp
-      if (producer_warp_role == ProducerWarpRole::Mainloop) {
-        // Ensure that the prefetched kernel does not touch
-        // unflushed global memory prior to this instruction
-        cutlass::arch::wait_on_dependent_grids();
-        bool do_load_order_arrive = true;
-        while (work_tile_info.is_valid()) {
-          // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape
-          auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl));
-          auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl));
-          auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl));
-          auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
-
-          auto k_tile_iter  = cute::make_coord_iterator(shape<3>(gA_mkl));
-
-          collective_mainloop.load(
-            params.mainloop,
-            mainloop_pipeline,
-            mainloop_pipe_producer_state,
-            load_inputs,
-            blk_coord,
-            k_tile_iter, k_tile_count,
-            lane_idx,
-            block_rank_in_cluster,
-            shared_storage.tensors.mainloop
-          );
-          // Update starting pipeline state for the next tile
-          mainloop_pipe_producer_state.advance(k_tile_count);
-
-          // Signal for the epilogue load warp to begin
-          if (do_load_order_arrive) {
-            load_order_barrier.arrive();
-            do_load_order_arrive = false;
-          }
-
-          // Get next work tile
-          scheduler.advance_to_next_work();
-          work_tile_info = scheduler.get_current_work();
-        } // Scheduler work fetch loop
-
-        // Make sure all Consumer Warp Groups have been waited upon
-        collective_mainloop.load_tail(mainloop_pipeline, mainloop_pipe_producer_state);
-
-      } // Mainloop Producer Warp End
-
-      // Epilogue Producer Warp
-      else if (producer_warp_role == ProducerWarpRole::Epilogue && collective_epilogue.is_producer_load_needed()) {
-
-        // Ensure that the prefetched kernel does not touch
-        // unflushed global memory prior to this instruction
-        cutlass::arch::wait_on_dependent_grids();
-
-        load_order_barrier.wait();
-        while (work_tile_info.is_valid()) {
-          // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape
-          auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl));
-          auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl));
-          auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl));
-          auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
-
-          epi_load_pipe_producer_state =
-          collective_epilogue.load(
-            epi_load_pipeline,
-            epi_load_pipe_producer_state,
-            problem_shape_MNKL,
-            blk_shape,
-            blk_coord,
-            tiled_mma,
-            lane_idx,
-            shared_storage.tensors.epilogue
-          );
-
-          // Get next work tile
-          scheduler.advance_to_next_work();
-          work_tile_info = scheduler.get_current_work();
-        } // Scheduler work fetch loop
-
-        // Make sure all Consumer Warp Groups have been waited upon
-        collective_epilogue.load_tail(epi_load_pipeline, epi_load_pipe_producer_state);
-      } // Epilogue Producer Warp End
-    } // Producer Warp Group End
-
-    else if (warp_group_role == WarpGroupRole::Consumer0 || warp_group_role == WarpGroupRole::Consumer1) {
-      cutlass::arch::warpgroup_reg_alloc<MmaRegisterRequirement>();
-
-      #ifdef CUTLASS_ENABLE_GDC_FOR_SM90
-      // It is possible to have work tiles start off invalid,
-      // so we have to check that first.
-      if (not work_tile_info.is_valid()) {
-        // Hint on an early release of global memory resources.
-        // The timing of calling this function only influences performance,
-        // not functional correctness.
-        cutlass::arch::launch_dependent_grids();
-
-        return;
-      }
-      #endif
-
-      while (work_tile_info.is_valid()) {
-        // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape
-        auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl));
-        auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl));
-        auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl));
-        auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
-
-        // Allocate the accumulators for the (M,N) blk_shape
-        Tensor accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape));               // (MMA,MMA_M,MMA_N)
-
-        // Order two Math WG's MMA one after the other, helps hide Epilogue
-        math_wg_order_barrier.wait();
-
-        collective_mainloop.mma(
-          mainloop_pipeline,
-          mainloop_pipe_consumer_state,
-          accumulators,
-          k_tile_count,
-          warp_group_thread_idx,
-          shared_storage.tensors.mainloop,
-          params.mainloop
-        );
-
-        // Cue for next Math WG's MMA to start
-        math_wg_order_barrier.arrive();
-
-        // Make sure the math instructions are done and free buffers before entering the epilogue
-        collective_mainloop.mma_tail(
-          mainloop_pipeline,
-          mainloop_pipe_consumer_state,
-          k_tile_count
-        );
-        // Update starting mainloop pipeline state for the next tile
-        mainloop_pipe_consumer_state.advance(k_tile_count * NumMmaWarpGroups);
-
-        #ifdef CUTLASS_ENABLE_GDC_FOR_SM90
-        if (scheduler.is_last_tile(work_tile_info, NumMmaWarpGroups)) {
-          // Hint on an early release of global memory resources.
-          // The timing of calling this function only influences performance,
-          // not functional correctness.
-          cutlass::arch::launch_dependent_grids();
-
-        }
-        #endif
-
-        // Order two Math WG's Epilogue one after the other
-        math_wg_order_barrier.wait();
-
-        // Epilogue and write to gD
-        auto [epi_load_pipe_consumer_state_next, epi_store_pipe_producer_state_next] =
-        collective_epilogue.store(
-          epi_load_pipeline,
-          epi_load_pipe_consumer_state,
-          epi_store_pipeline,
-          epi_store_pipe_producer_state,
-          problem_shape_MNKL,
-          blk_shape,
-          blk_coord,
-          accumulators,
-          tiled_mma,
-          warp_group_thread_idx,
-          shared_storage.tensors.epilogue
-        );
-
-        // TMA store pipeline wait is only visible to TMA-issuing warp, so for multiple-consumer kernels
-        // we need to wait for all TMA stores to complete before issuing consumer order barrier arrives
-        // to ensure next math consumer doesn't overwrite smem of in-flight TMA stores of current consumer.
-        auto [epi_load_pipe_consumer_state_next_, epi_store_pipe_producer_state_next_] =
-        collective_epilogue.store_tail(
-          epi_load_pipeline,
-          epi_load_pipe_consumer_state_next,
-          epi_store_pipeline,
-          epi_store_pipe_producer_state_next
-        );
-
-        // Update starting load/store pipeline states for the next tile
-        // state has already been incremented by 1 tile in collective calls, advance once again for ping pong
-        epi_load_pipe_consumer_state = epi_load_pipe_consumer_state_next_;
-        epi_store_pipe_producer_state = epi_store_pipe_producer_state_next_;
-        epi_load_pipe_consumer_state.advance(c_tile_count);
-        epi_store_pipe_producer_state.advance(d_tile_count);
-
-        // Cue for next Math WG's Epilogue to start
-        math_wg_order_barrier.arrive();
-
-        // Get next work tile
-        scheduler.advance_to_next_work(NumMmaWarpGroups);
-        work_tile_info = scheduler.get_current_work();
-      } // Scheduler work fetch loop
-    } // Consumer Warp Groups End
-#endif
-  }
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::gemm::kernel
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized.hpp
deleted file mode 100755
index c2a888ae3..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized.hpp
+++ /dev/null
@@ -1,417 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/kernel_hardware_info.hpp"
-#include "cute/arch/cluster_sm90.hpp"
-#include "cutlass/arch/reg_reconfig.h"
-#include "cutlass/arch/mma_sm90.h"
-#include "cutlass/epilogue/collective/detail.hpp"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/dispatch_policy.hpp"
-#include "cutlass/gemm/kernel/sm90_tile_scheduler.hpp"
-#include "cutlass/pipeline/pipeline.hpp"
-#include "cute/tensor.hpp"
-///////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm::kernel {
-
-///////////////////////////////////////////////////////////////////////////////
-
-template <
-  class ProblemShape_,
-  class CollectiveMainloop_,
-  class CollectiveEpilogue_,
-  class TileScheduler_
->
-class GemmUniversal<
-  ProblemShape_,
-  CollectiveMainloop_,
-  CollectiveEpilogue_,
-  TileScheduler_,
-  cute::enable_if_t<cute::is_base_of_v<KernelCpAsyncWarpSpecialized, typename CollectiveMainloop_::DispatchPolicy::Schedule>>>
-{
-public:
-  //
-  // Type Aliases
-  //
-  using ProblemShape = ProblemShape_;
-  static_assert(cute::rank(ProblemShape{}) == 3 or cute::rank(ProblemShape{}) == 4,
-    "ProblemShape{} should be <M,N,K> or <M,N,K,L>");
-  static constexpr bool IsGdcEnabled = false;
-
-  // Mainloop derived types
-  using CollectiveMainloop = CollectiveMainloop_;
-  using TileShape = typename CollectiveMainloop::TileShape;
-  using TiledMma  = typename CollectiveMainloop::TiledMma;
-  using ArchTag   = typename CollectiveMainloop::ArchTag;
-  using ElementA  = typename CollectiveMainloop::ElementA;
-  using StrideA   = typename CollectiveMainloop::StrideA;
-  using ElementB  = typename CollectiveMainloop::ElementB;
-  using StrideB   = typename CollectiveMainloop::StrideB;
-  using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy;
-  using ElementAccumulator = typename CollectiveMainloop::ElementAccumulator;
-  using ClusterShape = typename DispatchPolicy::ClusterShape;
-  using MainloopArguments = typename CollectiveMainloop::Arguments;
-  using MainloopParams = typename CollectiveMainloop::Params;
-  static_assert(ArchTag::kMinComputeCapability >= 90);
-
-  // Epilogue derived types
-  using CollectiveEpilogue = CollectiveEpilogue_;
-  using ElementC = typename CollectiveEpilogue::ElementC;
-  using StrideC  = typename CollectiveEpilogue::StrideC;
-  using ElementD = typename CollectiveEpilogue::ElementD;
-  using StrideD  = typename CollectiveEpilogue::StrideD;
-  using EpilogueArguments = typename CollectiveEpilogue::Arguments;
-  using EpilogueParams = typename CollectiveEpilogue::Params;
-
-  static_assert(cute::is_void_v<TileScheduler_> or cute::is_same_v<TileScheduler_, PersistentScheduler>,
-    "Non-persistent warp-specialized kernel does not support specializing the tile scheduler.");
-  using TileSchedulerTag = TileScheduler_;
-  using TileScheduler = typename detail::TileSchedulerSelector<
-    TileScheduler_, ArchTag, TileShape, ClusterShape>::Scheduler;
-  using TileSchedulerArguments = typename TileScheduler::Arguments;
-
-  // Kernel level shared memory storage
-  struct SharedStorage {
-    union TensorStorage {
-      using MainloopTensorStorage = typename CollectiveMainloop::TensorStorage;
-      using EpilogueTensorStorage = typename CollectiveEpilogue::TensorStorage;
-
-      MainloopTensorStorage mainloop;
-      EpilogueTensorStorage epilogue;
-    } tensors;
-
-    struct PipelineStorage : cute::aligned_struct<16, _1> {
-      using MainloopPipelineStorage = typename CollectiveMainloop::PipelineStorage;
-      using EpiLoadPipelineStorage = typename CollectiveEpilogue::PipelineStorage;
-
-      alignas(16) MainloopPipelineStorage mainloop;
-      alignas(16) EpiLoadPipelineStorage epi_load;
-    } pipelines;
-  };
-
-  static constexpr int SharedStorageSize = sizeof(SharedStorage);
-
-  using GmemTiledCopyA = typename CollectiveMainloop::GmemTiledCopyA;
-  using GmemTiledCopyB = typename CollectiveMainloop::GmemTiledCopyB;
-  static_assert(cute::size(GmemTiledCopyA{}) == cute::size(GmemTiledCopyB{}), "Number of threads in A/B tiled copies must be the same.");
-
-  static constexpr uint32_t NumLoadWarpGroups = cute::size(GmemTiledCopyA{}) / NumThreadsPerWarpGroup;
-  static constexpr uint32_t NumMmaWarpGroups = cute::size(TiledMma{}) / NumThreadsPerWarpGroup;
-  static constexpr uint32_t NumWarpGroups = NumLoadWarpGroups + NumMmaWarpGroups;
-  static_assert(NumWarpGroups == 2 || NumWarpGroups == 3, "Number of warp groups must be 2 or 3 for good performance.");
-
-  static constexpr uint32_t MaxThreadsPerBlock = NumWarpGroups * NumThreadsPerWarpGroup;
-  static constexpr uint32_t MinBlocksPerMultiprocessor = 1;
-
-  // Device side arguments
-  struct Arguments {
-    GemmUniversalMode mode{};
-    ProblemShape problem_shape{};
-    MainloopArguments mainloop{};
-    EpilogueArguments epilogue{};
-    KernelHardwareInfo hw_info{};
-    TileSchedulerArguments scheduler{};
-  };
-
-  // Kernel entry point API
-  struct Params {
-    GemmUniversalMode mode{};
-    ProblemShape problem_shape{};
-    MainloopParams mainloop{};
-    EpilogueParams epilogue{};
-  };
-
-  //
-  // Methods
-  //
-
-  // Convert to underlying arguments. In this case, a simple copy for the aliased type.
-  static
-  Params
-  to_underlying_arguments(Arguments const& args, void* workspace) {
-    (void) workspace;
-    auto problem_shape = args.problem_shape;
-    if constexpr (detail::Has_SwapAB_v<CollectiveMainloop>) {
-      // swap M/N
-      get<0>(problem_shape) = get<1>(args.problem_shape);
-      get<1>(problem_shape) = get<0>(args.problem_shape);
-    }
-    return {
-      args.mode,
-      problem_shape,
-      CollectiveMainloop::to_underlying_arguments(args.problem_shape, args.mainloop, workspace),
-      CollectiveEpilogue::to_underlying_arguments(args.problem_shape, args.epilogue, workspace)
-    };
-  }
-
-  static bool
-  can_implement(Arguments const& args) {
-    bool implementable = (args.mode == GemmUniversalMode::kGemm) or
-        (args.mode == GemmUniversalMode::kBatched && cute::rank(ProblemShape{}) == 4);
-    if (!implementable) {
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Arguments or Problem Shape don't meet the requirements.\n");
-      return implementable;
-    }
-    implementable &= CollectiveMainloop::can_implement(args.problem_shape, args.mainloop);
-    implementable &= CollectiveEpilogue::can_implement(args.problem_shape, args.epilogue);
-    implementable &= TileScheduler::can_implement(args.scheduler);
-
-    return implementable;
-  }
-
-  static
-  size_t
-  get_workspace_size(Arguments const& args) {
-    return 0;
-  }
-
-  static
-  cutlass::Status
-  initialize_workspace(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-    return Status::kSuccess;
-  }
-
-  // Computes the kernel launch grid shape based on runtime parameters
-  static dim3
-  get_grid_shape(Params const& params) {
-    auto cluster_shape = Shape<_1,_1,_1>{};
-    auto tile_shape = TileShape{};
-    auto problem_shape_MNKL = append<4>(params.problem_shape, Int<1>{});
-    return TileScheduler::get_tiled_cta_shape_mnl(
-        problem_shape_MNKL, tile_shape, cluster_shape);
-  }
-
-  static dim3
-  get_block_shape() {
-    return dim3(MaxThreadsPerBlock, 1, 1);
-  }
-
-  CUTLASS_DEVICE
-  void
-  operator()(Params const& params, char* smem_buf) {
-    using namespace cute;
-    using X = Underscore;
-
-// Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a.
-#if ! defined(__CUDA_ARCH_FEAT_SM90_ALL)
-    printf("ERROR : Arch conditional MMA instruction used without targeting sm90a compute capability. Aborting.\n");
-#else
-
-    enum class WarpGroupRole {
-      Producer = 0,
-      Consumer = 1,
-    };
-
-    // Kernel level shared memory storage
-    SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(smem_buf);
-
-    int thread_idx = int(threadIdx.x);
-    int warp_group_thread_idx = thread_idx % NumThreadsPerWarpGroup;
-    int warp_group_idx = canonical_warp_group_idx();
-    CUTLASS_ASSERT(warp_group_idx < NumWarpGroups);
-    WarpGroupRole warp_group_role = warp_group_idx < NumLoadWarpGroups ? WarpGroupRole::Producer : WarpGroupRole::Consumer;
-
-    // Mainloop Load pipeline
-    using MainloopPipeline = typename CollectiveMainloop::MainloopPipeline;
-    typename MainloopPipeline::Params mainloop_pipeline_params;
-    if (warp_group_role == WarpGroupRole::Producer) {
-      mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Producer;
-    }
-    if (warp_group_role == WarpGroupRole::Consumer) {
-      mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Consumer;
-    }
-    mainloop_pipeline_params.producer_arv_count = NumLoadWarpGroups * NumThreadsPerWarpGroup;
-    mainloop_pipeline_params.consumer_arv_count = NumMmaWarpGroups * NumThreadsPerWarpGroup;
-    MainloopPipeline mainloop_pipeline(shared_storage.pipelines.mainloop, mainloop_pipeline_params);
-
-    // Epilogue Load pipeline
-    using EpiLoadPipeline = typename CollectiveEpilogue::LoadPipeline;
-    typename EpiLoadPipeline::Params epi_load_pipeline_params;
-    if (warp_group_role == WarpGroupRole::Producer) {
-      epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Producer;
-    }
-    if (warp_group_role == WarpGroupRole::Consumer) {
-      epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Consumer;
-    }
-    epi_load_pipeline_params.producer_arv_count = NumLoadWarpGroups * NumThreadsPerWarpGroup;
-    epi_load_pipeline_params.consumer_arv_count = NumMmaWarpGroups * NumThreadsPerWarpGroup;
-    EpiLoadPipeline epi_load_pipeline(shared_storage.pipelines.epi_load, epi_load_pipeline_params);
-
-    // Epilogue Store pipeline
-    using EpiStorePipeline = typename CollectiveEpilogue::StorePipeline;
-    typename EpiStorePipeline::Params epi_store_pipeline_params;
-    epi_store_pipeline_params.always_wait = true;
-    EpiStorePipeline epi_store_pipeline(epi_store_pipeline_params);
-
-    // Initialize starting pipeline states for the collectives
-    // Epilogue store pipe is producer-only (consumer is TMA unit, waits via scoreboarding)
-    typename CollectiveMainloop::PipelineState mainloop_pipe_consumer_state;
-    typename CollectiveEpilogue::LoadPipelineState epi_load_pipe_consumer_state;
-
-    // For the DMA Load (producer) we start with an opposite phase
-    // i.e., we skip all waits since we know that the buffer is indeed empty
-    PipelineState mainloop_pipe_producer_state = cutlass::make_producer_start_state<MainloopPipeline>();
-    PipelineState epi_load_pipe_producer_state = cutlass::make_producer_start_state<EpiLoadPipeline>();
-    PipelineState epi_store_pipe_producer_state = cutlass::make_producer_start_state<EpiStorePipeline>();
-
-    // Preconditions
-    static_assert(cute::rank(StrideA{}) == 3, "StrideA must be rank-3: [M, K, L]. If batch mode is not needed, set L stride to Int<0>.");
-    static_assert(cute::rank(StrideB{}) == 3, "StrideB must be rank-3: [N, K, L]. If batch mode is not needed, set L stride to Int<0>.");
-    static_assert(cute::rank(StrideC{}) == 3, "StrideC must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
-    static_assert(cute::rank(StrideD{}) == 3, "StrideD must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
-
-    // Separate out problem shape for convenience
-    // Optionally append 1s until problem shape is rank-4 in case its is only rank-3 (MNK)
-    auto problem_shape_MNKL = append<4>(params.problem_shape, Int<1>{});
-    auto M = get<0>(problem_shape_MNKL);
-    auto N = get<1>(problem_shape_MNKL);
-    auto K = get<2>(problem_shape_MNKL);
-    auto L = get<3>(problem_shape_MNKL);
-
-    // Represent the full tensors
-    Tensor mA_mkl = make_tensor(make_gmem_ptr(params.mainloop.ptr_A), make_shape(M,K,L), params.mainloop.dA); //(m,k,l)
-    Tensor mB_nkl = make_tensor(make_gmem_ptr(params.mainloop.ptr_B), make_shape(N,K,L), params.mainloop.dB); //(n,k,l)
-
-    // Get the appropriate blocks for this thread block -- potential for thread block locality
-    auto blk_shape = TileShape{};                                                                // (BLK_M,BLK_N,BLK_K)
-    TiledMma tiled_mma;
-
-    // Make tiled views, defer the slice
-    Tensor gA_mkl = local_tile(mA_mkl, blk_shape, make_coord(_,_,_), Step<_1, X,_1>{});          // (BLK_M,BLK_K,m,k,l)
-    Tensor gB_nkl = local_tile(mB_nkl, blk_shape, make_coord(_,_,_), Step< X,_1,_1>{});          // (BLK_N,BLK_K,n,k,l)
-
-    // Compute m_coord, n_coord, and l_coord with their post-tiled shapes
-    auto m_coord = idx2crd(int(blockIdx.x), shape<2>(gA_mkl));
-    auto n_coord = idx2crd(int(blockIdx.y), shape<2>(gB_nkl));
-    auto l_coord = idx2crd(int(blockIdx.z), shape<4>(gB_nkl));
-    auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
-
-    // Slice with m_coord and n_coord
-    Tensor gA = gA_mkl(_,_,m_coord,_,l_coord);                                                       // (BLK_M,BLK_K,k)
-    Tensor gB = gB_nkl(_,_,n_coord,_,l_coord);                                                       // (BLK_N,BLK_K,k)
-
-    // Get pipeline iterators and increments from tensor shapes
-    auto k_tile_iter  = cute::make_coord_iterator(shape<2>(gA));
-    auto k_tile_count = size<2>(gA);
-    auto c_tile_count = CollectiveEpilogue::get_load_pipe_increment(blk_shape);
-    auto d_tile_count = CollectiveEpilogue::get_store_pipe_increment(blk_shape);
-
-    // Wait for all threads in the thread block
-    __syncthreads();
-
-    // In a warp specialized kernel, collectives expose data movement and compute operations separately
-    CollectiveMainloop collective_mainloop;
-    CollectiveEpilogue collective_epilogue{params.epilogue, shared_storage.tensors.epilogue};
-
-    if (warp_group_role == WarpGroupRole::Producer) {
-      // Compute tile residues for predication
-      auto m_max_coord = M - size<0>(gA) * get<0>(blk_coord);                             // M - BLK_M * m_coord
-      auto n_max_coord = N - size<0>(gB) * get<1>(blk_coord);                             // N - BLK_N * n_coord
-      auto k_residue   = K - size<1>(gA) * size<2>(gA);                                   // K - BLK_K * k_coord_max
-      auto residue_mnk = make_tuple(m_max_coord, n_max_coord, k_residue);
-
-      collective_mainloop.load(
-        mainloop_pipeline,
-        mainloop_pipe_producer_state,
-        gA,
-        gB,
-        k_tile_iter, k_tile_count,
-        residue_mnk,
-        thread_idx,
-        shared_storage.tensors.mainloop
-      );
-      // Update starting mainloop pipeline state for the pipeline drain
-      mainloop_pipe_producer_state.advance(k_tile_count);
-      // Make sure mainloop consumer has been waited upon before issuing epilogue load
-      collective_mainloop.load_tail(mainloop_pipeline, mainloop_pipe_producer_state);
-
-      if (collective_epilogue.is_producer_load_needed()) {
-        epi_load_pipe_producer_state =
-        collective_epilogue.load(
-          epi_load_pipeline,
-          epi_load_pipe_producer_state,
-          problem_shape_MNKL,
-          blk_shape,
-          blk_coord,
-          tiled_mma,
-          thread_idx,
-          shared_storage.tensors.epilogue
-        );
-        collective_epilogue.load_tail(epi_load_pipeline, epi_load_pipe_producer_state);
-      }
-    }
-    else if (warp_group_role == WarpGroupRole::Consumer) {
-      Tensor accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape));                 // (MMA,MMA_M,MMA_N)
-
-      collective_mainloop.mma(
-        mainloop_pipeline,
-        mainloop_pipe_consumer_state,
-        accumulators,
-        k_tile_count,
-        warp_group_thread_idx,
-        shared_storage.tensors.mainloop,
-        params.mainloop
-      );
-
-      // Make sure the math instructions are done and free buffers before entering the epilogue
-      collective_mainloop.mma_tail(
-        mainloop_pipeline,
-        mainloop_pipe_consumer_state,
-        k_tile_count
-      );
-
-      // Epilogue and write to gD
-      collective_epilogue.store(
-        epi_load_pipeline,
-        epi_load_pipe_consumer_state,
-        epi_store_pipeline,
-        epi_store_pipe_producer_state,
-        problem_shape_MNKL,
-        blk_shape,
-        blk_coord,
-        accumulators,
-        tiled_mma,
-        warp_group_thread_idx,
-        shared_storage.tensors.epilogue
-      );
-    }
-#endif
-  }
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::gemm::kernel
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized_cooperative.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized_cooperative.hpp
deleted file mode 100755
index 041745206..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized_cooperative.hpp
+++ /dev/null
@@ -1,504 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/kernel_hardware_info.hpp"
-#include "cute/arch/cluster_sm90.hpp"
-#include "cutlass/arch/reg_reconfig.h"
-#include "cutlass/arch/mma_sm90.h"
-#include "cutlass/epilogue/collective/detail.hpp"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/dispatch_policy.hpp"
-#include "cutlass/gemm/kernel/tile_scheduler.hpp"
-#include "cutlass/pipeline/pipeline.hpp"
-#include "cute/tensor.hpp"
-
-///////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm::kernel {
-
-///////////////////////////////////////////////////////////////////////////////
-
-template <
-  class ProblemShape_,
-  class CollectiveMainloop_,
-  class CollectiveEpilogue_,
-  class TileScheduler_
->
-class GemmUniversal<
-  ProblemShape_,
-  CollectiveMainloop_,
-  CollectiveEpilogue_,
-  TileScheduler_,
-  cute::enable_if_t<cute::is_base_of_v<KernelCpAsyncWarpSpecializedCooperative, typename CollectiveMainloop_::DispatchPolicy::Schedule>>>
-{
-public:
-  //
-  // Type Aliases
-  //
-  using ProblemShape = ProblemShape_;
-  static_assert(cute::rank(ProblemShape{}) == 3 or cute::rank(ProblemShape{}) == 4,
-    "ProblemShape{} should be <M,N,K> or <M,N,K,L>");
-  static constexpr bool IsGdcEnabled = false;
-  // Mainloop derived types
-  using CollectiveMainloop = CollectiveMainloop_;
-  using TileShape = typename CollectiveMainloop::TileShape;
-  using TiledMma  = typename CollectiveMainloop::TiledMma;
-  using ArchTag   = typename CollectiveMainloop::ArchTag;
-  using ElementA  = typename CollectiveMainloop::ElementA;
-  using StrideA   = typename CollectiveMainloop::StrideA;
-  using ElementB  = typename CollectiveMainloop::ElementB;
-  using StrideB   = typename CollectiveMainloop::StrideB;
-  using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy;
-  using ElementAccumulator = typename CollectiveMainloop::ElementAccumulator;
-  using ClusterShape = typename DispatchPolicy::ClusterShape;
-  using MainloopArguments = typename CollectiveMainloop::Arguments;
-  using MainloopParams = typename CollectiveMainloop::Params;
-  static_assert(ArchTag::kMinComputeCapability >= 90);
-
-  // Epilogue derived types
-  using CollectiveEpilogue = CollectiveEpilogue_;
-  using ElementC = typename CollectiveEpilogue::ElementC;
-  using StrideC  = typename CollectiveEpilogue::StrideC;
-  using ElementD = typename CollectiveEpilogue::ElementD;
-  using StrideD  = typename CollectiveEpilogue::StrideD;
-  using EpilogueArguments = typename CollectiveEpilogue::Arguments;
-  using EpilogueParams = typename CollectiveEpilogue::Params;
-
-  using TileSchedulerTag = TileScheduler_;
-  using TileScheduler = typename detail::TileSchedulerSelector<
-    TileScheduler_, ArchTag, TileShape, ClusterShape>::Scheduler;
-  using TileSchedulerArguments = typename TileScheduler::Arguments;
-  using TileSchedulerParams = typename TileScheduler::Params;
-
-  using GmemTiledCopyA = typename CollectiveMainloop::GmemTiledCopyA;
-  using GmemTiledCopyB = typename CollectiveMainloop::GmemTiledCopyB;
-  static_assert(cute::size(GmemTiledCopyA{}) == cute::size(GmemTiledCopyB{}), "Number of threads in A/B tiled copies must be the same");
-
-  static constexpr uint32_t NumLoadWarpGroups = cute::size(GmemTiledCopyA{}) / NumThreadsPerWarpGroup;
-  static constexpr uint32_t NumMmaWarpGroups = cute::size(TiledMma{}) / NumThreadsPerWarpGroup;
-  static constexpr uint32_t NumWarpGroups = NumLoadWarpGroups + NumMmaWarpGroups;
-  static_assert(NumWarpGroups == 2 || NumWarpGroups == 3, "Number of warp groups must be 2 or 3 for good performance.");
-
-  static constexpr uint32_t MaxThreadsPerBlock = NumWarpGroups * NumThreadsPerWarpGroup;
-  static constexpr uint32_t MinBlocksPerMultiprocessor = 1;
-
-  // Kernel level shared memory storage
-  struct SharedStorage {
-    struct TensorStorage : cute::aligned_struct<128, _1> {
-      using MainloopTensorStorage = typename CollectiveMainloop::TensorStorage;
-      using EpilogueTensorStorage = typename CollectiveEpilogue::TensorStorage;
-
-      MainloopTensorStorage mainloop;
-      EpilogueTensorStorage epilogue;
-    } tensors;
-
-    struct PipelineStorage : cute::aligned_struct<16, _1> {
-      using MainloopPipelineStorage = typename CollectiveMainloop::PipelineStorage;
-      using EpiLoadPipelineStorage = typename CollectiveEpilogue::PipelineStorage;
-
-      alignas(16) MainloopPipelineStorage mainloop;
-      alignas(16) EpiLoadPipelineStorage epi_load;
-    } pipelines;
-  };
-
-  static constexpr int SharedStorageSize = sizeof(SharedStorage);
-
-  // Device side arguments
-  struct Arguments {
-    GemmUniversalMode mode{};
-    ProblemShape problem_shape{};
-    MainloopArguments mainloop{};
-    EpilogueArguments epilogue{};
-    KernelHardwareInfo hw_info{};
-    TileSchedulerArguments scheduler{};
-  };
-
-  // Kernel entry point API
-  struct Params {
-    GemmUniversalMode mode{};
-    ProblemShape problem_shape{};
-    MainloopParams mainloop{};
-    EpilogueParams epilogue{};
-    KernelHardwareInfo hw_info{};
-    TileSchedulerParams scheduler{};
-  };
-
-  //
-  // Methods
-  //
-
-  // Convert to underlying arguments. In this case, a simple copy for the aliased type.
-  static
-  Params
-  to_underlying_arguments(Arguments const& args, void* workspace) {
-    CUTLASS_TRACE_HOST("to_underlying_arguments():");
-
-    auto problem_shape = args.problem_shape;
-    if constexpr (detail::Has_SwapAB_v<CollectiveMainloop>) {
-      // swap M/N
-      get<0>(problem_shape) = get<1>(args.problem_shape);
-      get<1>(problem_shape) = get<0>(args.problem_shape);
-    }
-    auto problem_shape_MNKL = append<4>(problem_shape, 1);
-
-    // Get SM count if needed, otherwise use user supplied SM count
-    int sm_count = args.hw_info.sm_count;
-    if (sm_count <= 0) {
-      CUTLASS_TRACE_HOST("  WARNING: Arguments do not include a valid SM count.\n"
-          "  For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count.");
-      sm_count = KernelHardwareInfo::query_device_multiprocessor_count(args.hw_info.device_id);
-    }
-
-    CUTLASS_TRACE_HOST("to_underlying_arguments(): Setting persistent grid SM count to " << sm_count);
-
-    KernelHardwareInfo hw_info{args.hw_info.device_id, sm_count};
-    TileSchedulerParams scheduler = TileScheduler::to_underlying_arguments(
-      problem_shape_MNKL, TileShape{}, ClusterShape{}, hw_info, args.scheduler, workspace);
-
-    return {
-      args.mode,
-      problem_shape,
-      CollectiveMainloop::to_underlying_arguments(args.problem_shape, args.mainloop, workspace),
-      CollectiveEpilogue::to_underlying_arguments(args.problem_shape, args.epilogue, workspace),
-      hw_info,
-      scheduler
-    };
-  }
-
-  static bool
-  can_implement(Arguments const& args) {
-    bool implementable = (args.mode == GemmUniversalMode::kGemm) or
-        (args.mode == GemmUniversalMode::kBatched && cute::rank(ProblemShape{}) == 4);
-    if (!implementable) {
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Arguments or Problem Shape don't meet the requirements.\n");
-      return implementable;
-    }
-    implementable &= CollectiveMainloop::can_implement(args.problem_shape, args.mainloop);
-    implementable &= CollectiveEpilogue::can_implement(args.problem_shape, args.epilogue);
-    implementable &= TileScheduler::can_implement(args.scheduler);
-
-    return implementable;
-  }
-
-  static
-  size_t
-  get_workspace_size(Arguments const& args) {
-    TileScheduler t;
-    return t.template get_workspace_size<ProblemShape, ElementAccumulator>(
-      args.scheduler, args.problem_shape, args.hw_info, NumMmaWarpGroups);
-  }
-
-  static
-  cutlass::Status
-  initialize_workspace(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-    TileScheduler t;
-    static constexpr uint32_t NumEpilogueSubTiles = 1;
-    static constexpr uint32_t NumAccumulatorMtxs = 1;
-    return t.template initialize_workspace<ProblemShape, ElementAccumulator>(
-      args.scheduler, workspace, stream, args.problem_shape, args.hw_info, NumMmaWarpGroups, NumEpilogueSubTiles, NumAccumulatorMtxs, cuda_adapter);
-  }
-
-  // Computes the kernel launch grid shape based on runtime parameters
-  static dim3
-  get_grid_shape(Params const& params) {
-    // Given device SM count, set grid size s.t. we do not launch more thread blocks than we can run concurrently
-    TileSchedulerArguments args{};
-    if constexpr (!std::is_const_v<decltype(args.max_swizzle_size)>) {
-      args.max_swizzle_size = 1 << params.scheduler.log_swizzle_size_;
-    }
-    return TileScheduler::get_grid_shape(params.scheduler, params.problem_shape, TileShape{}, ClusterShape{}, params.hw_info, args);
-  }
-
-  static dim3
-  get_block_shape() {
-    return dim3(MaxThreadsPerBlock, 1, 1);
-  }
-
-  CUTLASS_DEVICE
-  void
-  operator()(Params const& params, char* smem_buf) {
-    using namespace cute;
-    using X = Underscore;
-
-// Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a.
-#if ! defined(__CUDA_ARCH_FEAT_SM90_ALL)
-    printf("ERROR : Arch conditional MMA instruction used without targeting sm90a compute capability. Aborting.\n");
-#else
-
-    static_assert(cute::rank(StrideA{}) == 3, "StrideA must be rank-3: [M, K, L]. If batch mode is not needed, set L stride to Int<0>.");
-    static_assert(cute::rank(StrideB{}) == 3, "StrideB must be rank-3: [N, K, L]. If batch mode is not needed, set L stride to Int<0>.");
-    static_assert(cute::rank(StrideC{}) == 3, "StrideC must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
-    static_assert(cute::rank(StrideD{}) == 3, "StrideD must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
-
-    /* In the Cooperative kernel, one or multiple Consumers collaborate on the same tile */
-    enum class WarpGroupRole {
-      Producer = 0,
-      Consumer = 1,
-    };
-
-    // Kernel level shared memory storage
-    SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(smem_buf);
-
-    int thread_idx = int(threadIdx.x);
-    int mma_thread_idx = thread_idx % size(TiledMma{});
-    int warp_group_thread_idx = thread_idx % NumThreadsPerWarpGroup;
-    int warp_group_idx = canonical_warp_group_idx();
-    CUTLASS_ASSERT(warp_group_idx < NumWarpGroups);
-    WarpGroupRole warp_group_role = warp_group_idx < NumLoadWarpGroups ? WarpGroupRole::Producer : WarpGroupRole::Consumer;
-
-    // Mainloop Load pipeline
-    using MainloopPipeline = typename CollectiveMainloop::MainloopPipeline;
-    typename MainloopPipeline::Params mainloop_pipeline_params;
-    if (warp_group_role == WarpGroupRole::Producer) {
-      mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Producer;
-    }
-    if (warp_group_role == WarpGroupRole::Consumer) {
-      mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Consumer;
-    }
-    mainloop_pipeline_params.producer_arv_count = NumLoadWarpGroups * NumThreadsPerWarpGroup;
-    mainloop_pipeline_params.consumer_arv_count = NumMmaWarpGroups * NumThreadsPerWarpGroup;
-    MainloopPipeline mainloop_pipeline(shared_storage.pipelines.mainloop, mainloop_pipeline_params);
-
-    // Epilogue Load pipeline
-    using EpiLoadPipeline = typename CollectiveEpilogue::LoadPipeline;
-    typename EpiLoadPipeline::Params epi_load_pipeline_params;
-    if (warp_group_role == WarpGroupRole::Producer) {
-      epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Producer;
-    }
-    if (warp_group_role == WarpGroupRole::Consumer) {
-      epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Consumer;
-    }
-    epi_load_pipeline_params.producer_arv_count = NumLoadWarpGroups * NumThreadsPerWarpGroup;
-    epi_load_pipeline_params.consumer_arv_count = NumMmaWarpGroups * NumThreadsPerWarpGroup;
-    EpiLoadPipeline epi_load_pipeline(shared_storage.pipelines.epi_load, epi_load_pipeline_params);
-
-    // Epilogue Store pipeline
-    using EpiStorePipeline = typename CollectiveEpilogue::StorePipeline;
-    typename EpiStorePipeline::Params epi_store_pipeline_params;
-    epi_store_pipeline_params.always_wait = true;
-    EpiStorePipeline epi_store_pipeline(epi_store_pipeline_params);
-
-    // Initialize starting pipeline states for the collectives
-    // Epilogue store pipe is producer-only (consumer is TMA unit, waits via scoreboarding)
-    typename CollectiveMainloop::PipelineState mainloop_pipe_consumer_state;
-    typename CollectiveEpilogue::LoadPipelineState epi_load_pipe_consumer_state;
-
-    // For the DMA Load (producer) we start with an opposite phase
-    // i.e., we skip all waits since we know that the buffer is indeed empty
-    PipelineState mainloop_pipe_producer_state = cutlass::make_producer_start_state<MainloopPipeline>();
-    PipelineState epi_load_pipe_producer_state = cutlass::make_producer_start_state<EpiLoadPipeline>();
-    PipelineState epi_store_pipe_producer_state = cutlass::make_producer_start_state<EpiStorePipeline>();
-
-    // Separate out problem shape for convenience
-    // Optionally append 1s until problem shape is rank-4 in case its is only rank-3 (MNK)
-    auto problem_shape_MNKL = append<4>(params.problem_shape, Int<1>{});
-    auto M = get<0>(problem_shape_MNKL);
-    auto N = get<1>(problem_shape_MNKL);
-    auto K = get<2>(problem_shape_MNKL);
-    auto L = get<3>(problem_shape_MNKL);
-
-    // Represent the full tensors
-    Tensor mA_mkl = make_tensor(make_gmem_ptr(params.mainloop.ptr_A), make_shape(M,K,L), params.mainloop.dA); //(m,k,l)
-    Tensor mB_nkl = make_tensor(make_gmem_ptr(params.mainloop.ptr_B), make_shape(N,K,L), params.mainloop.dB); //(n,k,l)
-
-    // Get the appropriate blocks for this thread block -- potential for thread block locality
-    TiledMma tiled_mma;
-    auto blk_shape = TileShape{};                                                                // (BLK_M,BLK_N,BLK_K)
-
-    // Make tiled views, defer the slice
-    Tensor gA_mkl = local_tile(mA_mkl, blk_shape, make_coord(_,_,_), Step<_1, X,_1>{});          // (BLK_M,BLK_K,m,k,l)
-    Tensor gB_nkl = local_tile(mB_nkl, blk_shape, make_coord(_,_,_), Step< X,_1,_1>{});          // (BLK_N,BLK_K,n,k,l)
-
-    TileScheduler scheduler{params.scheduler};
-    auto work_tile_info = scheduler.initial_work_tile_info(ClusterShape{});
-
-    // In a warp specialized kernel, collectives expose data movement and compute operations separately
-    CollectiveMainloop collective_mainloop;
-    CollectiveEpilogue collective_epilogue{params.epilogue, shared_storage.tensors.epilogue};
-
-    // Wait for all threads in the thread block
-    __syncthreads();
-
-    if (warp_group_role == WarpGroupRole::Producer) {
-
-      while (work_tile_info.is_valid()) {
-        // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape
-        auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl));
-        auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl));
-        auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl));
-        auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
-
-        // Slice with our work tile coordinates to construct mainloop tensor views
-        Tensor gA = gA_mkl(_,_,m_coord,_,l_coord);                                                   // (BLK_M,BLK_K,k)
-        Tensor gB = gB_nkl(_,_,n_coord,_,l_coord);                                                   // (BLK_N,BLK_K,k)
-
-        // Get the number of K tiles to compute for this work as well as the starting K tile offset of the work.
-        auto work_k_tile_count = TileScheduler::get_work_k_tile_count(work_tile_info, problem_shape_MNKL, blk_shape);
-        auto work_k_tile_start = TileScheduler::get_work_k_tile_start(work_tile_info);
-        auto k_tile_iter = cute::make_coord_iterator(idx2crd(work_k_tile_start, shape<2>(gA)), shape<2>(gA));
-
-        // Compute tile residues for predication
-        auto m_max_coord = M - size<0>(gA) * get<0>(blk_coord);                             // M - BLK_M * m_coord
-        auto n_max_coord = N - size<0>(gB) * get<1>(blk_coord);                             // N - BLK_N * n_coord
-        auto k_residue   = K - size<1>(gA) * size<2>(gA);                                   // K - BLK_K * k_coord_max
-        auto residue_mnk = make_tuple(m_max_coord, n_max_coord, k_residue);
-
-        collective_mainloop.load(
-          mainloop_pipeline,
-          mainloop_pipe_producer_state,
-          gA,
-          gB,
-          k_tile_iter, work_k_tile_count,
-          residue_mnk,
-          thread_idx,
-          shared_storage.tensors.mainloop
-        );
-        // Update starting pipeline state for the next tile
-        mainloop_pipe_producer_state.advance(work_k_tile_count);
-
-        if (TileScheduler::compute_epilogue(work_tile_info, params.scheduler) &&
-           collective_epilogue.is_producer_load_needed()) {
-          epi_load_pipe_producer_state =
-          collective_epilogue.load(
-            epi_load_pipeline,
-            epi_load_pipe_producer_state,
-            problem_shape_MNKL,
-            blk_shape,
-            blk_coord,
-            tiled_mma,
-            warp_group_thread_idx,
-            shared_storage.tensors.epilogue
-          );
-      }
-
-        // Get next work tile
-        auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work(work_tile_info);
-        work_tile_info = next_work_tile_info;
-      } // Scheduler work fetch loop
-
-      // Make sure all Consumer Warp Groups have been waited upon
-      collective_mainloop.load_tail(mainloop_pipeline, mainloop_pipe_producer_state);
-      
-      if (collective_epilogue.is_producer_load_needed()) {
-        collective_epilogue.load_tail(epi_load_pipeline, epi_load_pipe_producer_state);
-      }
-    } // Producer Warp Group End
-
-    else if (warp_group_role == WarpGroupRole::Consumer) {
-
-      bool do_store_tail = false;
-      while (work_tile_info.is_valid()) {
-        // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape
-        auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl));
-        auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl));
-        auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl));
-        auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
-        auto work_k_tile_count = TileScheduler::get_work_k_tile_count(work_tile_info, problem_shape_MNKL, blk_shape);
-
-        // Allocate the the accumulators for the (M,N) blk_shape
-        //
-        // MSVC CTAD breaks if we say "Tensor" here, so we use "auto" instead.
-        auto accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape));               // (MMA,MMA_M,MMA_N)
-
-        collective_mainloop.mma(
-          mainloop_pipeline,
-          mainloop_pipe_consumer_state,
-          accumulators,
-          work_k_tile_count,
-          mma_thread_idx,
-          shared_storage.tensors.mainloop,
-          params.mainloop
-        );
-
-        // Make sure the math instructions are done and free buffers before entering the epilogue
-        collective_mainloop.mma_tail(
-          mainloop_pipeline,
-          mainloop_pipe_consumer_state,
-          work_k_tile_count
-        );
-
-        // Update starting mainloop pipeline state for the next tile
-        mainloop_pipe_consumer_state.advance(work_k_tile_count);
-
-        // Index of warp group within consumer warp groups
-        int consumer_warp_group_idx = canonical_warp_group_idx() - NumLoadWarpGroups;
-
-        // Perform reduction across splits, if needed
-        TileScheduler::fixup(
-          params.scheduler, work_tile_info, accumulators, NumMmaWarpGroups, consumer_warp_group_idx);
-
-        if (TileScheduler::compute_epilogue(work_tile_info, params.scheduler)) {
-          // Epilogue and write to gD
-          auto [epi_load_pipe_consumer_state_next, epi_store_pipe_producer_state_next] =
-          collective_epilogue.store(
-            epi_load_pipeline,
-            epi_load_pipe_consumer_state,
-            epi_store_pipeline,
-            epi_store_pipe_producer_state,
-            problem_shape_MNKL,
-            blk_shape,
-            blk_coord,
-            accumulators,
-            tiled_mma,
-            mma_thread_idx,
-            shared_storage.tensors.epilogue
-          );
-          epi_load_pipe_consumer_state = epi_load_pipe_consumer_state_next;
-          epi_store_pipe_producer_state = epi_store_pipe_producer_state_next;
-          do_store_tail = true;
-        }
-
-        // Get next work tile
-        auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work(work_tile_info);
-        work_tile_info = next_work_tile_info;
-      } // Scheduler work fetch loop
-
-      if (do_store_tail) {
-        collective_epilogue.store_tail(
-          epi_load_pipeline,
-          epi_load_pipe_consumer_state,
-          epi_store_pipeline,
-          epi_store_pipe_producer_state
-        );
-      }
-    } // Consumer Warp Groups End
-#endif
-  }
-
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::gemm::kernel
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized_pingpong.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized_pingpong.hpp
deleted file mode 100755
index 142fabd2f..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized_pingpong.hpp
+++ /dev/null
@@ -1,516 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/kernel_hardware_info.hpp"
-#include "cutlass/fast_math.h"
-#include "cute/arch/cluster_sm90.hpp"
-#include "cutlass/arch/reg_reconfig.h"
-#include "cutlass/arch/mma_sm90.h"
-#include "cutlass/epilogue/collective/detail.hpp"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/dispatch_policy.hpp"
-#include "cutlass/gemm/kernel/tile_scheduler.hpp"
-#include "cutlass/gemm/kernel/gemm_universal_decl.h"
-#include "cutlass/pipeline/pipeline.hpp"
-#include "cutlass/trace.h"
-
-#include "cute/tensor.hpp"
-///////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm::kernel {
-
-///////////////////////////////////////////////////////////////////////////////
-
-template <
-  class ProblemShape_,
-  class CollectiveMainloop_,
-  class CollectiveEpilogue_,
-  class TileScheduler_
->
-class GemmUniversal<
-  ProblemShape_,
-  CollectiveMainloop_,
-  CollectiveEpilogue_,
-  TileScheduler_,
-  cute::enable_if_t<cute::is_base_of_v<KernelCpAsyncWarpSpecializedPingpong, typename CollectiveMainloop_::DispatchPolicy::Schedule>>>
-{
-public:
-  //
-  // Type Aliases
-  //
-  using ProblemShape = ProblemShape_;
-  static_assert(cute::rank(ProblemShape{}) == 3 or cute::rank(ProblemShape{}) == 4,
-    "ProblemShape{} should be <M,N,K> or <M,N,K,L>");
-  static constexpr bool IsGdcEnabled = false;
-  // Mainloop derived types
-  using CollectiveMainloop = CollectiveMainloop_;
-  using TileShape = typename CollectiveMainloop::TileShape;
-  using TiledMma  = typename CollectiveMainloop::TiledMma;
-  using ArchTag   = typename CollectiveMainloop::ArchTag;
-  using ElementA  = typename CollectiveMainloop::ElementA;
-  using StrideA   = typename CollectiveMainloop::StrideA;
-  using ElementB  = typename CollectiveMainloop::ElementB;
-  using StrideB   = typename CollectiveMainloop::StrideB;
-  using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy;
-  using ElementAccumulator = typename CollectiveMainloop::ElementAccumulator;
-  using ClusterShape = typename DispatchPolicy::ClusterShape;
-  using MainloopArguments = typename CollectiveMainloop::Arguments;
-  using MainloopParams = typename CollectiveMainloop::Params;
-  static_assert(ArchTag::kMinComputeCapability >= 90);
-
-  // Epilogue derived types
-  using CollectiveEpilogue = CollectiveEpilogue_;
-  using ElementC = typename CollectiveEpilogue::ElementC;
-  using StrideC  = typename CollectiveEpilogue::StrideC;
-  using ElementD = typename CollectiveEpilogue::ElementD;
-  using StrideD  = typename CollectiveEpilogue::StrideD;
-  using EpilogueArguments = typename CollectiveEpilogue::Arguments;
-  using EpilogueParams = typename CollectiveEpilogue::Params;
-
-  static_assert(!cute::is_same_v<TileScheduler_, StreamKScheduler>, "Ping-pong kernel does not currently support stream-K scheduler.");
-  using TileSchedulerTag = TileScheduler_;
-  using TileScheduler = typename detail::TileSchedulerSelector<
-    TileScheduler_, ArchTag, TileShape, ClusterShape>::Scheduler;
-  using TileSchedulerArguments = typename TileScheduler::Arguments;
-  using TileSchedulerParams = typename TileScheduler::Params;
-
-  using GmemTiledCopyA = typename CollectiveMainloop::GmemTiledCopyA;
-  using GmemTiledCopyB = typename CollectiveMainloop::GmemTiledCopyB;
-  static_assert(cute::size(GmemTiledCopyA{}) == cute::size(GmemTiledCopyB{}), "Number of threads in A/B tiled copies must be the same");
-
-  static constexpr uint32_t NumLoadWarpGroups = cute::size(GmemTiledCopyA{}) / NumThreadsPerWarpGroup;
-  static constexpr uint32_t NumMmaWarpGroups = 2 * cute::size(TiledMma{}) / NumThreadsPerWarpGroup;
-  static constexpr uint32_t NumWarpGroups = NumLoadWarpGroups + NumMmaWarpGroups;
-  static_assert(NumWarpGroups == 2 || NumWarpGroups == 3, "Number of warp groups must be 2 or 3 for good performance.");
-  static_assert(NumMmaWarpGroups == 2, "Pingpong kernel requires 2 MMA warp groups.");
-
-  static constexpr uint32_t MaxThreadsPerBlock = NumWarpGroups * NumThreadsPerWarpGroup;
-  static constexpr uint32_t MinBlocksPerMultiprocessor = 1;
-
-  // Order Sequence barrier with two stages: one for Mainloop and one for Epilogue
-  static constexpr uint32_t StagesPerMathWarpGroup = 2;
-  using MathWarpGroupOrderBarrier = cutlass::OrderedSequenceBarrier<
-    StagesPerMathWarpGroup, NumMmaWarpGroups>;
-
-  // Kernel level shared memory storage
-  struct SharedStorage {
-    struct TensorStorage : cute::aligned_struct<128, _1> {
-      using MainloopTensorStorage = typename CollectiveMainloop::TensorStorage;
-      using EpilogueTensorStorage = typename CollectiveEpilogue::TensorStorage;
-
-      MainloopTensorStorage mainloop;
-      EpilogueTensorStorage epilogue;
-    } tensors;
-
-    struct PipelineStorage : cute::aligned_struct<16, _1> {
-      using MainloopPipelineStorage = typename CollectiveMainloop::PipelineStorage;
-      using EpiLoadPipelineStorage = typename CollectiveEpilogue::PipelineStorage;
-      using MathWarpGroupOrderBarrierStorage = typename MathWarpGroupOrderBarrier::SharedStorage;
-
-      alignas(16) MainloopPipelineStorage mainloop;
-      alignas(16) EpiLoadPipelineStorage epi_load;
-      alignas(16) MathWarpGroupOrderBarrierStorage math_wg_order;
-    } pipelines;
-  };
-
-  static constexpr int SharedStorageSize = sizeof(SharedStorage);
-
-  // Device side arguments
-  struct Arguments {
-    GemmUniversalMode mode{};
-    ProblemShape problem_shape{};
-    MainloopArguments mainloop{};
-    EpilogueArguments epilogue{};
-    KernelHardwareInfo hw_info{};
-    TileSchedulerArguments scheduler{};
-  };
-
-  // Kernel entry point API
-  struct Params {
-    GemmUniversalMode mode{};
-    ProblemShape problem_shape{};
-    MainloopParams mainloop{};
-    EpilogueParams epilogue{};
-    KernelHardwareInfo hw_info{};
-    TileSchedulerParams scheduler{};
-  };
-
-  //
-  // Methods
-  //
-
-  // Convert to underlying arguments. In this case, a simple copy for the aliased type.
-  static
-  Params
-  to_underlying_arguments(Arguments const& args, void* workspace) {
-    CUTLASS_TRACE_HOST("to_underlying_arguments():");
-
-    (void) workspace;
-    auto problem_shape = args.problem_shape;
-    if constexpr (detail::Has_SwapAB_v<CollectiveMainloop>) {
-      // swap M/N
-      get<0>(problem_shape) = get<1>(args.problem_shape);
-      get<1>(problem_shape) = get<0>(args.problem_shape);
-    }
-    auto problem_shape_MNKL = append<4>(problem_shape, 1);
-
-    // Get SM count if needed, otherwise use user supplied SM count
-    int sm_count = args.hw_info.sm_count;
-    if (sm_count <= 0) {
-      CUTLASS_TRACE_HOST("  WARNING: Arguments do not include a valid SM count.\n"
-          "  For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count.");
-      sm_count = KernelHardwareInfo::query_device_multiprocessor_count(args.hw_info.device_id);
-    }
-
-    CUTLASS_TRACE_HOST("to_underlying_arguments(): Setting persistent grid SM count to " << sm_count);
-
-    KernelHardwareInfo hw_info{args.hw_info.device_id, sm_count};
-    TileSchedulerParams scheduler = TileScheduler::to_underlying_arguments(
-      problem_shape_MNKL, TileShape{}, ClusterShape{}, hw_info, args.scheduler, workspace);
-
-    return {
-      args.mode,
-      problem_shape,
-      CollectiveMainloop::to_underlying_arguments(args.problem_shape, args.mainloop, workspace),
-      CollectiveEpilogue::to_underlying_arguments(args.problem_shape, args.epilogue, workspace),
-      hw_info,
-      scheduler
-    };
-  }
-
-  static bool
-  can_implement(Arguments const& args) {
-    bool implementable = (args.mode == GemmUniversalMode::kGemm) or
-        (args.mode == GemmUniversalMode::kBatched && cute::rank(ProblemShape{}) == 4);
-    if (!implementable) {
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Arguments or Problem Shape don't meet the requirements.\n");
-      return implementable;
-    }
-    implementable &= CollectiveMainloop::can_implement(args.problem_shape, args.mainloop);
-    implementable &= CollectiveEpilogue::can_implement(args.problem_shape, args.epilogue);
-    implementable &= TileScheduler::can_implement(args.scheduler);
-
-    return implementable;
-  }
-
-  static
-  size_t
-  get_workspace_size(Arguments const& args) {
-    return 0;
-  }
-
-  static
-  cutlass::Status
-  initialize_workspace(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-    return Status::kSuccess;
-  }
-
-  // Computes the kernel launch grid shape based on runtime parameters
-  static dim3
-  get_grid_shape(Params const& params) {
-    // Given device SM count, set grid size s.t. we do not launch more thread blocks than we can run concurrently
-    TileSchedulerArguments args{};
-    if constexpr (!std::is_const_v<decltype(args.max_swizzle_size)>) {
-      args.max_swizzle_size = 1 << params.scheduler.log_swizzle_size_;
-    }
-    return TileScheduler::get_grid_shape(params.scheduler, params.problem_shape, TileShape{}, ClusterShape{}, params.hw_info, args);
-  }
-
-  static dim3
-  get_block_shape() {
-    return dim3(MaxThreadsPerBlock, 1, 1);
-  }
-
-  CUTLASS_DEVICE
-  void
-  operator()(Params const& params, char* smem_buf) {
-    using namespace cute;
-    using X = Underscore;
-
-// Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a.
-#if ! defined(__CUDA_ARCH_FEAT_SM90_ALL)
-    printf("ERROR : Arch conditional MMA instruction used without targeting sm90a compute capability. Aborting.\n");
-#else
-
-    // Preconditions
-    static_assert(cute::rank(StrideA{}) == 3, "StrideA must be rank-3: [M, K, L]. If batch mode is not needed, set L stride to Int<0>.");
-    static_assert(cute::rank(StrideB{}) == 3, "StrideB must be rank-3: [N, K, L]. If batch mode is not needed, set L stride to Int<0>.");
-    static_assert(cute::rank(StrideC{}) == 3, "StrideC must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
-    static_assert(cute::rank(StrideD{}) == 3, "StrideD must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
-
-    enum class WarpGroupRole {
-      Producer = 0,
-      Consumer = 1,
-    };
-
-    // Kernel level shared memory storage
-    SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(smem_buf);
-
-    int thread_idx = int(threadIdx.x);
-    int warp_group_thread_idx = thread_idx % NumThreadsPerWarpGroup;
-    int warp_group_idx = canonical_warp_group_idx();
-    CUTLASS_ASSERT(warp_group_idx < NumWarpGroups);
-    WarpGroupRole warp_group_role = warp_group_idx < NumLoadWarpGroups ? WarpGroupRole::Producer : WarpGroupRole::Consumer;
-    int warp_group_consumer_idx = warp_group_idx - NumLoadWarpGroups;
-
-    // Mainloop Load pipeline
-    using MainloopPipeline = typename CollectiveMainloop::MainloopPipeline;
-    typename MainloopPipeline::Params mainloop_pipeline_params;
-    if (warp_group_role == WarpGroupRole::Producer) {
-      mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Producer;
-    }
-    if (warp_group_role == WarpGroupRole::Consumer) {
-      mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Consumer;
-    }
-    mainloop_pipeline_params.producer_arv_count = NumLoadWarpGroups * NumThreadsPerWarpGroup;
-    mainloop_pipeline_params.consumer_arv_count = NumThreadsPerWarpGroup; // only 1 WG consumes at a time
-    MainloopPipeline mainloop_pipeline(shared_storage.pipelines.mainloop, mainloop_pipeline_params);
-
-    // Epilogue Load pipeline
-    using EpiLoadPipeline = typename CollectiveEpilogue::LoadPipeline;
-    typename EpiLoadPipeline::Params epi_load_pipeline_params;
-    if (warp_group_role == WarpGroupRole::Producer) {
-      epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Producer;
-    }
-    if (warp_group_role == WarpGroupRole::Consumer) {
-      epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Consumer;
-    }
-    epi_load_pipeline_params.producer_arv_count = NumLoadWarpGroups * NumThreadsPerWarpGroup;
-    epi_load_pipeline_params.consumer_arv_count = NumThreadsPerWarpGroup; // only 1 WG consumes at a time
-    EpiLoadPipeline epi_load_pipeline(shared_storage.pipelines.epi_load, epi_load_pipeline_params);
-
-    // Epilogue Store pipeline
-    using EpiStorePipeline = typename CollectiveEpilogue::StorePipeline;
-    typename EpiStorePipeline::Params epi_store_pipeline_params;
-    epi_store_pipeline_params.always_wait = true;
-    EpiStorePipeline epi_store_pipeline(epi_store_pipeline_params);
-
-    typename MathWarpGroupOrderBarrier::Params params_math_wg_order_barrier;
-    // DMA Load WG will not participate in these Ordered Barrier syncs
-    params_math_wg_order_barrier.group_id = warp_group_consumer_idx;
-    params_math_wg_order_barrier.group_size = NumThreadsPerWarpGroup; // Number of threads / participants in a group
-    MathWarpGroupOrderBarrier math_wg_order_barrier(shared_storage.pipelines.math_wg_order, params_math_wg_order_barrier);
-
-    // Initialize starting pipeline states for the collectives
-    // Epilogue store pipe is producer-only (consumer is TMA unit, waits via scoreboarding)
-    typename CollectiveMainloop::PipelineState mainloop_pipe_consumer_state;
-    typename CollectiveEpilogue::LoadPipelineState epi_load_pipe_consumer_state;
-
-    // For the DMA Load (producer) we start with an opposite phase
-    // i.e., we skip all waits since we know that the buffer is indeed empty
-    PipelineState mainloop_pipe_producer_state = cutlass::make_producer_start_state<MainloopPipeline>();
-    PipelineState epi_load_pipe_producer_state = cutlass::make_producer_start_state<EpiLoadPipeline>();
-    PipelineState epi_store_pipe_producer_state = cutlass::make_producer_start_state<EpiStorePipeline>();
-
-    // Separate out problem shape for convenience
-    // Optionally append 1s until problem shape is rank-4 in case its is only rank-3 (MNK)
-    auto problem_shape_MNKL = append<4>(params.problem_shape, Int<1>{});
-    auto M = get<0>(problem_shape_MNKL);
-    auto N = get<1>(problem_shape_MNKL);
-    auto K = get<2>(problem_shape_MNKL);
-    auto L = get<3>(problem_shape_MNKL);
-
-    // Represent the full tensors
-    Tensor mA_mkl = make_tensor(make_gmem_ptr(params.mainloop.ptr_A), make_shape(M,K,L), params.mainloop.dA); //(m,k,l)
-    Tensor mB_nkl = make_tensor(make_gmem_ptr(params.mainloop.ptr_B), make_shape(N,K,L), params.mainloop.dB); //(n,k,l)
-
-    // Get the appropriate blocks for this thread block -- potential for thread block locality
-    TiledMma tiled_mma;
-    auto blk_shape = TileShape{};                                                                // (BLK_M,BLK_N,BLK_K)
-
-    // Make tiled views, defer the slice
-    Tensor gA_mkl = local_tile(mA_mkl, blk_shape, make_coord(_,_,_), Step<_1, X,_1>{});          // (BLK_M,BLK_K,m,k,l)
-    Tensor gB_nkl = local_tile(mB_nkl, blk_shape, make_coord(_,_,_), Step< X,_1,_1>{});          // (BLK_N,BLK_K,n,k,l)
-
-    // Get pipeline stage increments from tensor shapes
-    auto k_tile_count = size<3>(gA_mkl);
-    auto c_tile_count = CollectiveEpilogue::get_load_pipe_increment(blk_shape);
-    auto d_tile_count = CollectiveEpilogue::get_store_pipe_increment(blk_shape);
-
-    TileScheduler scheduler{params.scheduler};
-
-    if (warp_group_consumer_idx == 1) {
-      // Advance 2nd Math WG to the next work tile for the startup
-      scheduler.advance_to_next_work();
-      // Advance 2nd Math WG pipeline states to the end of 1st Math WG
-      mainloop_pipe_consumer_state.advance(k_tile_count);
-      epi_load_pipe_consumer_state.advance(c_tile_count);
-      epi_store_pipe_producer_state.advance(d_tile_count);
-    }
-    auto work_tile_info = scheduler.initial_work_tile_info(ClusterShape{});
-
-    // In a warp specialized kernel, collectives expose data movement and compute operations separately
-    CollectiveMainloop collective_mainloop;
-    CollectiveEpilogue collective_epilogue{params.epilogue, shared_storage.tensors.epilogue};
-
-    // Wait for all threads in the thread block
-    __syncthreads();
-
-    if (warp_group_role == WarpGroupRole::Producer) {
-
-      while (work_tile_info.is_valid()) {
-        // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape
-        auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl));
-        auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl));
-        auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl));
-        auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
-
-        // Slice with our work tile coordinates to construct mainloop tensor views
-        Tensor gA = gA_mkl(_,_,m_coord,_,l_coord);                                                   // (BLK_M,BLK_K,k)
-        Tensor gB = gB_nkl(_,_,n_coord,_,l_coord);                                                   // (BLK_N,BLK_K,k)
-
-        auto k_tile_iter  = cute::make_coord_iterator(shape<2>(gA));
-
-        // Compute tile residues for predication
-        auto m_max_coord = M - size<0>(gA) * get<0>(blk_coord);                             // M - BLK_M * m_coord
-        auto n_max_coord = N - size<0>(gB) * get<1>(blk_coord);                             // N - BLK_N * n_coord
-        auto k_residue   = K - size<1>(gA) * size<2>(gA);                                   // K - BLK_K * k_coord_max
-        auto residue_mnk = make_tuple(m_max_coord, n_max_coord, k_residue);
-
-        collective_mainloop.load(
-          mainloop_pipeline,
-          mainloop_pipe_producer_state,
-          gA,
-          gB,
-          k_tile_iter, k_tile_count,
-          residue_mnk,
-          thread_idx,
-          shared_storage.tensors.mainloop
-        );
-        // Update starting pipeline state for the next tile
-        mainloop_pipe_producer_state.advance(k_tile_count);
-
-        if (collective_epilogue.is_producer_load_needed()) {
-          collective_epilogue.load(
-            epi_load_pipeline,
-            epi_load_pipe_producer_state,
-            problem_shape_MNKL,
-            blk_shape,
-            blk_coord,
-            tiled_mma,
-            warp_group_thread_idx,
-            shared_storage.tensors.epilogue
-          );
-          // Update starting pipeline state for the next tile
-          epi_load_pipe_producer_state.advance(c_tile_count);
-        }
-
-        // Get next work tile
-        scheduler.advance_to_next_work();
-        work_tile_info = scheduler.get_current_work();
-      } // Scheduler work fetch loop
-
-      // Make sure all Consumer Warp Groups have been waited upon
-      collective_mainloop.load_tail(mainloop_pipeline, mainloop_pipe_producer_state);
-      if (collective_epilogue.is_producer_load_needed()) {
-        collective_epilogue.load_tail(epi_load_pipeline, epi_load_pipe_producer_state);
-      }
-    } // Producer Warp Group End
-
-    else if (warp_group_role == WarpGroupRole::Consumer) {
-
-      while (work_tile_info.is_valid()) {
-        // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape
-        auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl));
-        auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl));
-        auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl));
-        auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
-
-        // Allocate the the accumulators for the (M,N) blk_shape
-        Tensor accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape));               // (MMA,MMA_M,MMA_N)
-
-        // Order two Math WG's MMA one after the other, helps hide Epilogue
-        math_wg_order_barrier.wait();
-
-        collective_mainloop.mma(
-          mainloop_pipeline,
-          mainloop_pipe_consumer_state,
-          accumulators,
-          k_tile_count,
-          thread_idx,
-          shared_storage.tensors.mainloop,
-          params.mainloop
-        );
-
-        // Cue for next Math WG's MMA to start
-        math_wg_order_barrier.arrive();
-
-        // Make sure the math instructions are done and free buffers before entering the epilogue
-        collective_mainloop.mma_tail(
-          mainloop_pipeline,
-          mainloop_pipe_consumer_state,
-          k_tile_count
-        );
-        // Update starting mainloop pipeline state for the next tile
-        mainloop_pipe_consumer_state.advance(k_tile_count * NumMmaWarpGroups);
-
-        // Order two Math WG's Epilogue one after the other
-        math_wg_order_barrier.wait();
-
-        // Epilogue and write to gD
-        collective_epilogue.store(
-          epi_load_pipeline,
-          epi_load_pipe_consumer_state,
-          epi_store_pipeline,
-          epi_store_pipe_producer_state,
-          problem_shape_MNKL,
-          blk_shape,
-          blk_coord,
-          accumulators,
-          tiled_mma,
-          warp_group_thread_idx,
-          shared_storage.tensors.epilogue
-        );
-        // Update starting load/store pipeline states for the next tile
-        epi_load_pipe_consumer_state.advance(c_tile_count * NumMmaWarpGroups);
-        epi_store_pipe_producer_state.advance(d_tile_count * NumMmaWarpGroups);
-
-        // Wait for all TMA stores to complete
-        epi_store_pipeline.producer_tail(epi_store_pipe_producer_state);
-
-        // Cue for next Math WG's Epilogue to start
-        math_wg_order_barrier.arrive();
-
-        // Get next work tile
-        scheduler.advance_to_next_work(NumMmaWarpGroups);
-        work_tile_info = scheduler.get_current_work();
-      } // Scheduler work fetch loop
-    } // Consumer Warp Groups End
-#endif
-  }
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::gemm::kernel
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_tile_scheduler.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_tile_scheduler.hpp
deleted file mode 100755
index 5e61e7c99..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_tile_scheduler.hpp
+++ /dev/null
@@ -1,139 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-#include "cutlass/gemm/kernel/static_tile_scheduler.hpp"
-
-
-namespace cutlass::gemm::kernel::detail {
-
-///////////////////////////////////////////////////////////////////////////////
-
-// Persistent Thread Block (TB) scheduler
-class PersistentTileSchedulerSm90:
-public StaticPersistentTileScheduler<PersistentTileSchedulerSm90> {
-
-  using BaseScheduler = StaticPersistentTileScheduler<PersistentTileSchedulerSm90>;
-public:
-  using StaticPersistentTileScheduler::StaticPersistentTileScheduler;
-  using Params = PersistentTileSchedulerSm90Params;
-  using RasterOrder = typename Params::RasterOrder;
-  using RasterOrderOptions = typename Params::RasterOrderOptions;
-  using Arguments = BaseScheduler::Arguments;
-
-  static constexpr bool IsDynamicPersistent = false;
-
-  // get work_idx_m, work_idx_n from blk_per_grid_dim while applying swizzle
-  static CUTLASS_DEVICE
-  cute::tuple<int32_t, int32_t>
-  get_work_idx_m_and_n(
-      uint64_t blk_per_grid_dim,
-      FastDivmodU64Pow2 const& divmod_cluster_shape_major,
-      FastDivmodU64Pow2 const& divmod_cluster_shape_minor,
-      FastDivmodU64 const& divmod_cluster_blk_major,
-      int32_t log_swizzle_size,
-      RasterOrder raster_order) {
-    auto [cta_m_in_cluster, cta_n_in_cluster, _] = cute::block_id_in_cluster();
-    return get_work_idx_m_and_n(
-      blk_per_grid_dim,
-      divmod_cluster_shape_major,
-      divmod_cluster_shape_minor,
-      divmod_cluster_blk_major,
-      log_swizzle_size,
-      raster_order,
-      cta_m_in_cluster,
-      cta_n_in_cluster
-    );
-  }
-
-  static CUTLASS_DEVICE
-  cute::tuple<int32_t, int32_t>
-  get_work_idx_m_and_n(
-      uint64_t blk_per_grid_dim,
-      FastDivmodU64Pow2 const& divmod_cluster_shape_major,
-      FastDivmodU64Pow2 const& divmod_cluster_shape_minor,
-      FastDivmodU64 const& divmod_cluster_blk_major,
-      int32_t log_swizzle_size,
-      RasterOrder raster_order,
-      uint64_t cta_m_in_cluster,
-      uint64_t cta_n_in_cluster) {
-
-    uint64_t cluster_id, cluster_major_offset = 0, cluster_minor_offset = 0;
-    divmod_cluster_shape_major(cluster_id, cluster_major_offset, blk_per_grid_dim);
-
-    if (raster_order == RasterOrder::AlongN) {
-      cluster_minor_offset = cta_m_in_cluster;
-    }
-    else {
-      cluster_minor_offset = cta_n_in_cluster;
-    }
-
-    uint64_t cluster_idx_minor, cluster_idx_major;
-
-    uint64_t cluster_idx_minor_div_swizzle, extra, offset;
-
-    offset = cluster_id & ((1 << log_swizzle_size) - 1);
-    extra = cluster_id >> log_swizzle_size;
-
-    divmod_cluster_blk_major(cluster_idx_minor_div_swizzle, cluster_idx_major, extra);
-
-    cluster_idx_minor = cluster_idx_minor_div_swizzle * (1 << log_swizzle_size) + offset;
-
-    auto minor_work_idx = static_cast<int32_t>(cluster_idx_minor * divmod_cluster_shape_minor.divisor +
-                                               cluster_minor_offset);
-    auto major_work_idx = static_cast<int32_t>(cluster_idx_major * divmod_cluster_shape_major.divisor +
-                                               cluster_major_offset);
-
-    if (raster_order == RasterOrder::AlongN) {
-      return {minor_work_idx, major_work_idx};
-    }
-    else {
-      return {major_work_idx, minor_work_idx};
-    }
-
-  }
-
-  // The basic tile scheduler does not require any additional workspace
-  template <class ProblemShape, class ElementAccumulator>
-  static size_t
-  get_workspace_size(Arguments const&, ProblemShape, KernelHardwareInfo const&, uint32_t, const uint32_t = 1, uint32_t = 1) {
-    return 0;
-  }
-
-  template <class ProblemShape, class ElementAccumulator>
-  static cutlass::Status
-  initialize_workspace(Arguments const&, void*, cudaStream_t, ProblemShape, KernelHardwareInfo const&,
-    uint32_t, const uint32_t = 1, uint32_t = 1, CudaHostAdapter* cuda_adapter = nullptr) {
-    return Status::kSuccess;
-  }
-
-};
-
-}
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_tile_scheduler_group.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_tile_scheduler_group.hpp
deleted file mode 100755
index 888be276d..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_tile_scheduler_group.hpp
+++ /dev/null
@@ -1,510 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/fast_math.h"
-#include "cutlass/gemm_coord.hpp"
-#include "cutlass/kernel_hardware_info.hpp"
-#include "cutlass/gemm/kernel/tile_scheduler_params.h"
-#include "cute/layout.hpp"
-#include "cute/tensor.hpp"
-#include "cute/arch/cluster_sm90.hpp"
-
-namespace cutlass::gemm::kernel::detail {
-
-///////////////////////////////////////////////////////////////////////////////
-
-// Persistent Thread Block (TB) scheduler
-template <class GroupProblemShape>
-class PersistentTileSchedulerSm90Group {
-  //
-  // Data members
-  //
-
-private:
-  uint64_t current_work_linear_idx_ = 0;
-  uint64_t total_grid_size_ = 0;
-
-  // Tracking current group, its starting linear idx and total tiles
-  struct GroupInfo {
-    int group_idx = 0;
-    uint64_t start_linear_idx = 0;
-    uint64_t total_tiles = 0;
-  } current_group_info_;
-
-public:
-  struct WorkTileInfo {
-    int32_t M_idx = 0;
-    int32_t N_idx = 0;
-    int32_t L_idx = 0;
-    bool is_valid_tile = false;
-
-    CUTLASS_HOST_DEVICE
-    bool
-    is_valid() const {
-      return is_valid_tile;
-    }
-
-    CUTLASS_HOST_DEVICE
-    static WorkTileInfo
-    invalid_work_tile() {
-      return {-1, -1, -1, false};
-    }
-
-    CUTLASS_HOST_DEVICE
-    bool
-    is_final_split(uint32_t k_tiles_per_output_tile) const {
-      return true;
-    }
-
-    CUTLASS_HOST_DEVICE
-    int32_t
-    reduction_subtile_idx() const {
-      return -1;
-    }
-  };
-
-  using ProblemShape = typename GroupProblemShape::UnderlyingProblemShape;
-  using Params = PersistentTileSchedulerSm90GroupParams<ProblemShape>;
-  using RasterOrder = typename Params::RasterOrder;
-  using RasterOrderOptions = typename Params::RasterOrderOptions;
-  static constexpr bool IsDynamicPersistent = false;
-
-  struct Arguments {
-    int max_swizzle_size = 1;
-    // Not applying Heuristics for Grouped problems, since largest dimension can change per group
-    RasterOrderOptions raster_order = RasterOrderOptions::AlongM;
-  };
-
-  // Sink scheduler params as a member
-  Params scheduler_params;
-
-  //
-  // Methods
-  //
-
-  template <class TileShape, class ClusterShape>
-  static Params
-  to_underlying_arguments(
-    GroupProblemShape problem_shapes,
-    TileShape tile_shape,
-    ClusterShape cluster_shape,
-    KernelHardwareInfo const& hw_info,
-    Arguments const& arguments,
-    [[maybe_unused]] void* workspace=nullptr,
-    [[maybe_unused]] const uint32_t epilogue_subtile = 1,
-    [[maybe_unused]] uint32_t ktile_start_alignment_count = 1u
-    ) {
-
-    // We only need the tile and cluster shape during scheduler setup, so let FTAD do the magic
-    static_assert(cute::is_static<TileShape>::value);
-    static_assert(cute::is_static<ClusterShape>::value);
-
-    dim3 problem_blocks = get_tiled_cta_shape_mnl(
-      problem_shapes.groups(),
-      problem_shapes,
-      hw_info,
-      tile_shape, cluster_shape);
-
-    Params params;
-    params.initialize(
-      problem_blocks,
-      problem_shapes.groups(),
-      problem_shapes.problem_shapes,
-      problem_shapes.host_problem_shapes,
-      to_gemm_coord(tile_shape),
-      to_gemm_coord(cluster_shape),
-      hw_info,
-      arguments.max_swizzle_size, 
-      arguments.raster_order
-    );
-
-    return params;
-  }
-
-  // Given the inputs, computes the physical grid we should launch.
-  template<class TileShape, class ClusterShape>
-  CUTLASS_HOST_DEVICE static
-  dim3
-  get_grid_shape(
-    [[maybe_unused]] Params const& params,
-    GroupProblemShape problem_shapes,
-    TileShape tile_shape,
-    ClusterShape cluster_shape,
-    KernelHardwareInfo hw_info,
-    Arguments arguments,
-    bool truncate_by_problem_size=true) {
-
-    dim3 problem_blocks = get_tiled_cta_shape_mnl(
-      problem_shapes.groups(),
-      problem_shapes,
-      hw_info,
-      tile_shape, cluster_shape);
-
-    return Params::get_grid_shape(
-      problem_blocks,
-      to_gemm_coord(cluster_shape),
-      hw_info,
-      arguments.max_swizzle_size,
-      arguments.raster_order,
-      /* truncate_by_problem_size = */true
-    );
-  }
-
-  // Given the inputs, computes the total number of output blocks this problem will compute over
-  // Note that this is only the logical size of our grid, not the physical grid we will actually launch.
-  template<class BlockShape, class ClusterShape>
-  CUTLASS_HOST_DEVICE static
-  dim3
-  get_tiled_cta_shape_mnl(int groups, GroupProblemShape problem_shapes, KernelHardwareInfo hw_info, BlockShape cta_shape, ClusterShape cluster_shape) {
-    uint32_t total_ctas = 0;
-    uint32_t cta_in_N_dim = 1; // We linearize the blocks across all the problems here
-
-    // If host problem shapes are not provided.
-    if (!problem_shapes.is_host_problem_shape_available()) {
-      total_ctas = hw_info.sm_count;
-    }
-    // If host problem shapes are provided, make a better decision about possibility to launch smaller grid.
-    else {
-      for (int group = 0; group < groups; group++) {
-        auto ctas_along_m = cute::size(cute::ceil_div(cute::shape<0>(problem_shapes.get_host_problem_shape(group)), cute::shape<0>(cta_shape)));
-        auto ctas_along_n = cute::size(cute::ceil_div(cute::shape<1>(problem_shapes.get_host_problem_shape(group)), cute::shape<1>(cta_shape)));
-        auto problem_blocks_m = round_up(ctas_along_m, cute::get<0>(cluster_shape));
-        auto problem_blocks_n = round_up(ctas_along_n, cute::get<1>(cluster_shape));
-        total_ctas += problem_blocks_m * problem_blocks_n;
-      }
-    }
-
-    return Params::get_tiled_cta_shape_mnl(
-      to_gemm_coord(cluster_shape),
-      total_ctas, cta_in_N_dim
-    );
-  }
-
-  static bool
-  can_implement(Arguments const& args) {
-    return true;
-  }
-
-  PersistentTileSchedulerSm90Group() = default;
-
-  CUTLASS_DEVICE explicit PersistentTileSchedulerSm90Group(Params const& params_) : scheduler_params(params_) {
-    // MSVC requires protecting use of CUDA-specific nonstandard syntax,
-    // like blockIdx and gridDim, with __CUDA_ARCH__.
-#if defined(__CUDA_ARCH__)
-    if (scheduler_params.raster_order_ == RasterOrder::AlongN) {
-      current_work_linear_idx_ = uint64_t(blockIdx.x) + uint64_t(blockIdx.y) * uint64_t(gridDim.x);
-    }
-    else {
-      current_work_linear_idx_ = uint64_t(blockIdx.x) * uint64_t(gridDim.y) + uint64_t(blockIdx.y);
-    }
-
-    total_grid_size_ = uint64_t(gridDim.x) * uint64_t(gridDim.y) * uint64_t(gridDim.z);
-
-    uint64_t ctas_along_m, ctas_along_n;
-    if (is_tuple<decltype(cute::shape<0>(params_.problem_shapes_[0]))>::value ||
-        is_tuple<decltype(cute::shape<1>(params_.problem_shapes_[0]))>::value) {
-      ctas_along_m = cute::size(cute::ceil_div(cute::shape<0>(params_.problem_shapes_[0]), scheduler_params.cta_shape_.m()));
-      ctas_along_n = cute::size(cute::ceil_div(cute::shape<1>(params_.problem_shapes_[0]), scheduler_params.cta_shape_.n()));
-    }
-    else {
-      ctas_along_m = scheduler_params.divmod_cta_shape_m_.divide(cute::shape<0>(params_.problem_shapes_[0]) +  scheduler_params.divmod_cta_shape_m_.divisor - 1);
-      ctas_along_n = scheduler_params.divmod_cta_shape_n_.divide(cute::shape<1>(params_.problem_shapes_[0]) +  scheduler_params.divmod_cta_shape_n_.divisor - 1);
-    }
-    auto problem_blocks_m = round_up(ctas_along_m, (1 << params_.log_swizzle_size_) * params_.cluster_shape_.m());
-    auto problem_blocks_n = round_up(ctas_along_n, (1 << params_.log_swizzle_size_) * params_.cluster_shape_.n());
-    current_group_info_.total_tiles = problem_blocks_m * problem_blocks_n;
-#else
-    CUTLASS_ASSERT(false && "This line should never be reached");
-#endif
-  }
-
-  CUTLASS_DEVICE
-  WorkTileInfo
-  get_current_work() {
-    return get_current_work_for_linear_idx(current_work_linear_idx_);
-  }
-
-  CUTLASS_DEVICE
-  WorkTileInfo
-  get_current_work_for_linear_idx(uint64_t linear_idx) {
-    if (scheduler_params.pre_processed_problem_shapes && linear_idx >= scheduler_params.blocks_across_problem_) {
-      return WorkTileInfo::invalid_work_tile();
-    }
-
-    return get_work_idx_m_and_n(linear_idx,
-                                current_group_info_,
-                                scheduler_params.groups_,
-                                scheduler_params.problem_shapes_,
-                                scheduler_params.cta_shape_,
-                                scheduler_params.cluster_shape_,
-                                scheduler_params.divmod_cluster_shape_major_,
-                                scheduler_params.divmod_cluster_shape_minor_,
-                                scheduler_params.divmod_cta_shape_m_,
-                                scheduler_params.divmod_cta_shape_n_,
-                                scheduler_params.log_swizzle_size_, 
-                                scheduler_params.raster_order_);
-  }
-
-  CUTLASS_DEVICE
-  void
-  advance_to_next_work(uint32_t advance_count = 1) {
-    current_work_linear_idx_ += total_grid_size_ * uint64_t(advance_count);
-  }
-
-  // get work_idx_m, work_idx_n from linear_idx while applying swizzle
-  static CUTLASS_DEVICE
-  WorkTileInfo
-  get_work_idx_m_and_n(
-      uint64_t linear_idx,
-      struct GroupInfo& group_info,
-      int32_t total_problem_groups,
-      ProblemShape* problem_shapes,
-      GemmCoord cta_shape,
-      GemmCoord cluster_shape,
-      FastDivmodU64Pow2 const& divmod_cluster_shape_major,
-      FastDivmodU64Pow2 const& divmod_cluster_shape_minor,
-      FastDivmodU64 const& divmod_cta_shape_m,
-      FastDivmodU64 const& divmod_cta_shape_n,
-      int32_t log_swizzle_size, 
-      RasterOrder raster_order) {
-
-    bool valid_tile = true;
-    uint64_t ctas_along_m, ctas_along_n;
-    if (is_tuple<decltype(cute::shape<0>(problem_shapes[group_info.group_idx]))>::value ||
-        is_tuple<decltype(cute::shape<1>(problem_shapes[group_info.group_idx]))>::value) {
-      ctas_along_m = cute::size(cute::ceil_div(cute::shape<0>(problem_shapes[group_info.group_idx]), cta_shape.m()));
-      ctas_along_n = cute::size(cute::ceil_div(cute::shape<1>(problem_shapes[group_info.group_idx]), cta_shape.n()));
-    }
-    else {
-      ctas_along_m = divmod_cta_shape_m.divide(cute::shape<0>(problem_shapes[group_info.group_idx]) +  divmod_cta_shape_m.divisor - 1);
-      ctas_along_n = divmod_cta_shape_n.divide(cute::shape<1>(problem_shapes[group_info.group_idx]) +  divmod_cta_shape_n.divisor - 1);
-    }
-    auto problem_blocks_m = round_up(ctas_along_m, (1 << log_swizzle_size) * cluster_shape.m());
-    auto problem_blocks_n = round_up(ctas_along_n, (1 << log_swizzle_size) * cluster_shape.n());
-    group_info.total_tiles = problem_blocks_m * problem_blocks_n;
-
-    while (group_info.start_linear_idx + group_info.total_tiles <= linear_idx) {
-      group_info.group_idx++;
-
-      if (group_info.group_idx >= total_problem_groups)
-        return WorkTileInfo::invalid_work_tile();
-
-      group_info.start_linear_idx += group_info.total_tiles;
-      if (is_tuple<decltype(cute::shape<0>(problem_shapes[group_info.group_idx]))>::value ||
-          is_tuple<decltype(cute::shape<1>(problem_shapes[group_info.group_idx]))>::value) {
-        ctas_along_m = cute::size(cute::ceil_div(cute::shape<0>(problem_shapes[group_info.group_idx]), cta_shape.m()));
-        ctas_along_n = cute::size(cute::ceil_div(cute::shape<1>(problem_shapes[group_info.group_idx]), cta_shape.n()));
-      }
-      else {
-        ctas_along_m = divmod_cta_shape_m.divide(cute::shape<0>(problem_shapes[group_info.group_idx]) +  divmod_cta_shape_m.divisor - 1);
-        ctas_along_n = divmod_cta_shape_n.divide(cute::shape<1>(problem_shapes[group_info.group_idx]) +  divmod_cta_shape_n.divisor - 1);
-      }
-      problem_blocks_m = round_up(ctas_along_m, (1 << log_swizzle_size) * cluster_shape.m());
-      problem_blocks_n = round_up(ctas_along_n, (1 << log_swizzle_size) * cluster_shape.n());
-      group_info.total_tiles = problem_blocks_m * problem_blocks_n;
-    }
-
-    uint64_t cluster_id, cluster_major_offset = 0, cluster_minor_offset = 0;
-    uint64_t blk_per_grid_dim = divmod_cluster_shape_minor.divide(linear_idx - group_info.start_linear_idx);
-    divmod_cluster_shape_major(cluster_id, cluster_major_offset, blk_per_grid_dim);
-
-    auto [cta_m_in_cluster, cta_n_in_cluster, _] = cute::block_id_in_cluster();
-    if (raster_order == RasterOrder::AlongN) {
-      cluster_minor_offset = cta_m_in_cluster;
-    }
-    else {
-      cluster_minor_offset = cta_n_in_cluster;
-    }
-
-    uint64_t cluster_idx_minor, cluster_idx_major;
-    
-    uint64_t cluster_idx_minor_div_swizzle, extra, offset;
-
-    offset = cluster_id & ((1 << log_swizzle_size) - 1);
-    extra = cluster_id >> log_swizzle_size;
-
-    uint64_t curr_group_cluster_blk_major;
-    if (raster_order == RasterOrder::AlongN) {
-      curr_group_cluster_blk_major = divmod_cluster_shape_major.divide(problem_blocks_n);
-    }
-    else {
-      curr_group_cluster_blk_major = divmod_cluster_shape_major.divide(problem_blocks_m);
-    }
-    cluster_idx_minor_div_swizzle = extra / curr_group_cluster_blk_major;
-    cluster_idx_major = extra % curr_group_cluster_blk_major;
-
-    cluster_idx_minor = cluster_idx_minor_div_swizzle * (1 << log_swizzle_size) + offset;
-
-    auto minor_work_idx = static_cast<int32_t>(cluster_idx_minor * divmod_cluster_shape_minor.divisor + 
-                                               cluster_minor_offset);
-    auto major_work_idx = static_cast<int32_t>(cluster_idx_major * divmod_cluster_shape_major.divisor + 
-                                               cluster_major_offset);
-
-    if (raster_order == RasterOrder::AlongN) {
-      return {minor_work_idx, major_work_idx, group_info.group_idx, valid_tile};
-    }
-    else {
-      return {major_work_idx, minor_work_idx, group_info.group_idx, valid_tile}; 
-    }
-
-  }
-
-  // Returns whether the block assigned this work should compute the epilogue for the corresponding
-  // output tile. For the basic tile scheduler, this is always true.
-  CUTLASS_HOST_DEVICE
-  static bool
-  compute_epilogue(WorkTileInfo const&, Params const&) {
-    return true;
-  }
-
-  // Performs the reduction across splits for a given output tile. Since this scheduler does
-  // not split output tiles, no reduction is needed.
-  template <class FrgTensorC>
-  CUTLASS_DEVICE
-  static void
-  fixup(Params const&, WorkTileInfo const&, FrgTensorC&, uint32_t, uint32_t) {}
-
-  // Returns whether the current WorkTileInfo passed in should continue to be used. Since
-  // this scheduler only schedules work in units of single, full output tiles, the WorkTileInfo
-  // passed in should not be used after having been processed.
-  CUTLASS_DEVICE
-  static bool
-  continue_current_work(WorkTileInfo&) {
-    return false;
-  }
-
-  // The basic tile scheduler does not require any additional workspace
-  template <class ProblemShape, class ElementAccumulator>
-  static size_t
-  get_workspace_size(Arguments const&, ProblemShape, KernelHardwareInfo const&, uint32_t, const uint32_t = 1, uint32_t = 1) {
-    return 0;
-  }
-
-  template <class ProblemShape, class ElementAccumulator>
-  static cutlass::Status
-  initialize_workspace(Arguments const&, void*, cudaStream_t, ProblemShape, KernelHardwareInfo const&,
-    uint32_t, const uint32_t = 1, uint32_t = 1, CudaHostAdapter* cuda_adapter = nullptr) {
-    return Status::kSuccess;
-  }
-
-  template <class ProblemShape_MNKL, class TileShape>
-  CUTLASS_HOST_DEVICE
-  static int
-  get_work_k_tile_count(WorkTileInfo const& work_tile_info, ProblemShape_MNKL problem_shape, TileShape tile_shape) {
-    // All work units returned by this scheduler cover the entire K iteration
-    // space of the output tile assigned to the work unit.
-    return cute::size(cute::ceil_div(cute::get<2>(problem_shape), cute::get<2>(tile_shape)));
-  }
-
-  CUTLASS_HOST_DEVICE
-  static uint32_t
-  get_work_k_tile_start(WorkTileInfo const&) {
-    // All work units returned by this scheduler start from K tile 0
-    return 0u;
-  }
-
-  CUTLASS_DEVICE
-  static bool
-  need_separate_reduction(Params const& params) {
-    return false;
-  }
-
-  CUTLASS_DEVICE
-  bool
-  is_work_tile_for_reduction(WorkTileInfo const& work_tile_info, Params const& params) {
-    return false;
-  }
-
-  CUTLASS_DEVICE
-  uint32_t
-  epilgoue_subtile_idx(WorkTileInfo const& work_tile_info, Params const& params) const {
-    return 0;
-  }
-
-  template <class FrgTensorC>
-  CUTLASS_DEVICE
-  void
-  separate_reduction(
-    Params const& params,
-    WorkTileInfo const& work_tile_info,
-    FrgTensorC& accumulators,
-    uint32_t num_barriers,
-    uint32_t barrier_idx) {
-  }
-
-  // Shares the accumulator set with peers in the global workspace
-  template <class FrgTensorC>
-  CUTLASS_DEVICE
-  static void
-  share(
-    Params const& params,
-    WorkTileInfo const& work_tile_info,
-    FrgTensorC& accumulators,
-    uint32_t num_barriers,
-    uint32_t barrier_idx) {
-  }
-
-  CUTLASS_DEVICE
-  static bool
-  valid_warpgroup_in_work_tile(WorkTileInfo const& work_tile_info) {
-    return true;
-  }
-
-  CUTLASS_DEVICE
-  static bool
-  requires_separate_reduction(Params const& params) {
-    return false;
-  }
-
-  // Kernel helper function to get next work tile
-  CUTLASS_DEVICE
-  auto
-  fetch_next_work(WorkTileInfo work_tile_info) {
-    if (continue_current_work(work_tile_info)) {
-      return cute::make_tuple(work_tile_info, true);
-    }
-
-    advance_to_next_work();
-    return cute::make_tuple(get_current_work(), true);
-  }
-  
-  // Returns the initial work tile info that will be computed over
-  template <class ClusterShape>
-  CUTLASS_DEVICE
-  WorkTileInfo
-  initial_work_tile_info(ClusterShape) {
-    return get_current_work();
-  }
-
-};
-
-} // namespace cutlass::gemm::kernel::detail
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_tile_scheduler_stream_k.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_tile_scheduler_stream_k.hpp
deleted file mode 100755
index 80b374ad7..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_tile_scheduler_stream_k.hpp
+++ /dev/null
@@ -1,960 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#pragma once
-
-#include "cutlass/barrier.h"
-#include "cutlass/block_striped.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/gemm/kernel/sm90_tile_scheduler.hpp"
-#include "cutlass/kernel_hardware_info.hpp"
-#include "cute/layout.hpp"
-#include "cute/tensor.hpp"
-
-namespace cutlass::gemm::kernel::detail {
-
-// Persistent Thread Block (TB) scheduler leveraging stream-K decomposition
-template <
-  class TileShape,
-  class ClusterShape
->
-class PersistentTileSchedulerSm90StreamK {
-  //
-  // Data members
-  //
-
-private:
-  using UnderlyingScheduler = PersistentTileSchedulerSm90;
-
-private:
-  using UnderlyingArguments = typename UnderlyingScheduler::Arguments;
-  using UnderlyingParams = typename UnderlyingScheduler::Params;
-
-  uint64_t current_work_linear_idx_ = 0;
-
-public:
-
-  using RasterOrder = UnderlyingScheduler::RasterOrder;
-  using RasterOrderOptions = UnderlyingScheduler::RasterOrderOptions;
-  static constexpr bool IsDynamicPersistent = false;
-
-  // Use a dummy barrier manager to simply get the type used to store the barrier
-  using BarrierType = typename NamedBarrierManager<1>::T;
-
-  using Params = PersistentTileSchedulerSm90StreamKParams;
-  using ReductionMode = Params::ReductionMode;
-  using DecompositionMode = Params::DecompositionMode;
-
-  struct WorkTileInfo {
-    int32_t M_idx = 0;
-    int32_t N_idx = 0;
-    int32_t K_idx = 0;
-    int32_t L_idx = 0;
-
-    // Number of k tiles to compute for this unit of work. For stream-K, this
-    // can indicate the number of K tiles across multiple output tiles.
-    uint32_t k_tile_count = 0;
-
-    // Number of k tiles remaining for the work unit as a whole
-    uint32_t k_tile_remaining = 0;
-
-    // Whether this unit of work is the final split for the given tile
-    bool is_separate_reduction = false;
-
-    CUTLASS_HOST_DEVICE
-    bool
-    is_valid() const {
-      // A work tile that computes no K tiles is invalid unless it is a separate-reduction work tile
-      // (which only performs reduction and epilogue)
-      return k_tile_count > 0 || is_separate_reduction;
-    }
-
-    CUTLASS_HOST_DEVICE
-    bool
-    is_reduction_unit() const {
-      return is_separate_reduction;
-    }
-
-    CUTLASS_HOST_DEVICE
-    int32_t
-    reduction_subtile_idx() const {
-      // For separate reduction units, the K_idx of the work tile is unused.
-      // Therefore, we override it to contain the subtile of that the reduction
-      // unit operates on.
-      return is_reduction_unit() ? K_idx : -1;
-    }
-
-    CUTLASS_HOST_DEVICE
-    void
-    setup_separate_reduction(int32_t epilogue_subtile_idx) {
-      // Set the epilogue subtile in the K_idx, since this is otherwise unused
-      // by separate reduction units.
-      K_idx = epilogue_subtile_idx;
-
-      is_separate_reduction = true;
-      k_tile_count = 0;
-      // Clean up remaining k tiles
-      k_tile_remaining = 0;
-    }
-
-    CUTLASS_HOST_DEVICE
-    static WorkTileInfo
-    invalid_work_tile() {
-      return {-1, -1, -1, -1, 0};
-    }
-
-    CUTLASS_HOST_DEVICE
-    bool
-    is_final_split(uint32_t k_tiles_per_output_tile) const {
-      return (K_idx + k_tile_count) == k_tiles_per_output_tile;
-    }
-  };
-
-  struct Arguments {
-
-    Arguments() = default;
-    Arguments(Arguments const&) = default;
-    Arguments(Arguments&&) = default;
-
-    CUTLASS_HOST_DEVICE
-    Arguments&
-    operator=(Arguments const& args) {
-      splits = args.splits;
-      max_swizzle_size = args.max_swizzle_size;
-      raster_order = args.raster_order;
-      reduction_mode = args.reduction_mode;
-      decomposition_mode = args.decomposition_mode;
-      return *this;
-    }
-
-    CUTLASS_HOST_DEVICE
-    Arguments&
-    operator=(Arguments&& args) noexcept {
-      splits = args.splits;
-      max_swizzle_size = args.max_swizzle_size;
-      raster_order = args.raster_order;
-      reduction_mode = args.reduction_mode;
-      decomposition_mode = args.decomposition_mode;
-      return *this;
-    }
-
-    CUTLASS_HOST_DEVICE
-    Arguments(int splits_) : splits(splits_) {}
-
-    CUTLASS_HOST_DEVICE
-    Arguments(int splits_, int max_swizzle_size_, RasterOrderOptions raster_order_, DecompositionMode decomposition_mode_) :
-      splits(splits_),
-      max_swizzle_size(max_swizzle_size_),
-      raster_order(raster_order_),
-      decomposition_mode(decomposition_mode_) {}
-
-    // The splitting factor to be used in a split-K decomposition of the problem.
-    // If this is set to a value greater than 1, stream-K decomposition logic
-    // is bypassed in favor of a split-K decomposition.
-    int splits = 1;
-    int max_swizzle_size = 1;
-    RasterOrderOptions raster_order = RasterOrderOptions::Heuristic;
-    ReductionMode reduction_mode = ReductionMode::Deterministic;
-    DecompositionMode decomposition_mode = DecompositionMode::Heuristic;
-  };
-
-  // Sink scheduler params as a member
-  Params scheduler_params;
-
-  //
-  // Methods
-  //
-
-  template <class ProblemShape>
-  static Params
-  to_underlying_arguments(
-      ProblemShape problem_shape,
-      TileShape tile_shape,
-      ClusterShape cluster_shape,
-      KernelHardwareInfo const& hw_info,
-      Arguments const& args,
-      void* workspace,
-      const uint32_t epilogue_subtile = 1,
-      [[maybe_unused]] uint32_t ktile_start_alignment_count = 1u) {
-
-    static_assert(cute::is_static<TileShape>::value);
-    static_assert(cute::is_static<ClusterShape>::value);
-
-    auto problem_shape_mnkl = cute::append<4>(problem_shape, cute::Int<1>{});
-    dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape_mnkl, tile_shape, cluster_shape);
-    uint32_t k_tile_per_output_tile = cute::size(cute::ceil_div(cute::shape<2>(problem_shape_mnkl), cute::shape<2>(TileShape{})));
-
-    Params params;
-    params.initialize(
-      problem_blocks,
-      k_tile_per_output_tile,
-      to_gemm_coord(cluster_shape),
-      hw_info,
-      args.splits,
-      args.max_swizzle_size,
-      args.raster_order,
-      args.reduction_mode,
-      args.decomposition_mode,
-      workspace,
-      epilogue_subtile
-    );
-    return params;
-  }
-
-  static bool
-  can_implement(Arguments const& args) {
-    // Split count > 1 is only valid for heuristic and split-K decomposition modes
-    return (args.splits == 1 ||
-            args.decomposition_mode == DecompositionMode::Heuristic ||
-            args.decomposition_mode == DecompositionMode::SplitK);
-  }
-
-  CUTLASS_HOST_DEVICE
-  PersistentTileSchedulerSm90StreamK() { };
-
-  CUTLASS_HOST_DEVICE
-  PersistentTileSchedulerSm90StreamK(Params const& params_) : scheduler_params(params_) {
-    if (params_.raster_order_ == RasterOrder::AlongN) {
-      current_work_linear_idx_ = uint64_t(blockIdx.x) + uint64_t(blockIdx.y) * uint64_t(gridDim.x);
-    }
-    else {
-      current_work_linear_idx_ = uint64_t(blockIdx.x) * uint64_t(gridDim.y) + uint64_t(blockIdx.y);
-    }
-  }
-
-  CUTLASS_DEVICE
-  WorkTileInfo
-  get_current_work() const {
-    return get_current_work_for_linear_idx(current_work_linear_idx_, scheduler_params);
-  }
-
-  CUTLASS_DEVICE
-  static WorkTileInfo
-  get_current_work_for_linear_idx(uint64_t linear_idx, Params const& params) {
-    // The maximum number of work units is units_per_problem_ * splits_.
-    // The multiplication by splits_ is used for handling split-K, in which
-    // units_per_problem_ is equal to the total number of output tiles. To account
-    // for the fact that we have splits_ peers per output tile, we multiply this
-    // value by splits_. For stream-K, this multiplication ends up being a no-op
-    // because splits_ is set to 1 for stream-K.
-    if(linear_idx >= (params.units_per_problem_ * params.divmod_splits_.divisor + params.separate_reduction_units_)) {
-      // Invalid work. Return an empty result.
-      return WorkTileInfo::invalid_work_tile();
-    }
-
-    WorkTileInfo work_tile_info;
-    assign_work(params, linear_idx, work_tile_info);
-    return work_tile_info;
-  }
-
-  // Returns whether the current work_tile_info passed in should continue to be used. This
-  // occurs only in the stream-K decomposition with stream-K work units, which encompass
-  // work over multiple output tiles. If the current work_tile_info should continue to be
-  // used, it is updated to advance to the next output tile it should cover.
-  CUTLASS_DEVICE
-  bool
-  continue_current_work(WorkTileInfo& work_tile_info) const {
-    return continue_current_work_for_linear_idx(
-      current_work_linear_idx_, work_tile_info, scheduler_params);
-  }
-
-  CUTLASS_DEVICE
-  static bool
-  continue_current_work_for_linear_idx(
-    uint64_t linear_idx,
-    WorkTileInfo& work_tile_info,
-    Params const& params) {
-
-    work_tile_info.k_tile_remaining -= work_tile_info.k_tile_count;
-
-    if (work_tile_info.k_tile_remaining == 0) {
-      return false;
-    }
-    assign_work(params, linear_idx, work_tile_info);
-    return work_tile_info.is_valid();
-  }
-
-  CUTLASS_DEVICE
-  void
-  advance_to_next_work(uint32_t advance_count = 1) {
-    current_work_linear_idx_ += uint64_t(gridDim.x) * uint64_t(gridDim.y) * uint64_t(gridDim.z) * uint64_t(advance_count);
-  }
-
-  CUTLASS_DEVICE
-  bool is_last_tile(WorkTileInfo work_tile_info, uint32_t advance_count = 1) const {
-     // Never pass this by reference; it needs a copy,
-    // because continue_current_work will modify it.
-    if (continue_current_work(work_tile_info)) {
-      return false;
-    }
-    return not get_current_work_for_linear_idx(
-        current_work_linear_idx_ + (
-          uint64_t(gridDim.x) * uint64_t(gridDim.y) * uint64_t(gridDim.z) * uint64_t(advance_count)
-          ),
-        scheduler_params
-    ).is_valid();
-  }
-
-  // Given the inputs, computes the total number of output blocks this problem will compute over
-  // Note that this is only the logical size of our grid, not the physical grid we will actually launch.
-  template <class ProblemShape>
-  CUTLASS_HOST_DEVICE static
-  dim3
-  get_tiled_cta_shape_mnl(ProblemShape problem_shape_mnkl, TileShape cta_shape, ClusterShape cluster_shape) {
-    return UnderlyingScheduler::get_tiled_cta_shape_mnl(problem_shape_mnkl, cta_shape, cluster_shape);
-  }
-
-  // Given the cluster shape, computes the physical grid we should launch.
-  template <class ProblemShape>
-  CUTLASS_HOST_DEVICE static
-  dim3
-  get_grid_shape(
-    [[maybe_unused]] Params const& params,
-    ProblemShape problem_shape,
-    TileShape tile_shape,
-    ClusterShape cluster_shape,
-    KernelHardwareInfo hw_info,
-    Arguments arguments) {
-
-    auto problem_shape_mnkl = cute::append<4>(problem_shape, cute::Int<1>{});
-    dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape_mnkl, tile_shape, cluster_shape);
-
-    return Params::get_grid_shape(
-      problem_blocks,
-      to_gemm_coord(cluster_shape),
-      hw_info,
-      arguments.max_swizzle_size,
-      arguments.raster_order
-    );
-  }
-
-  // Returns whether fixup is needed for `work_tile_info`.
-  CUTLASS_HOST_DEVICE
-  static bool
-  requires_fixup(Params const& params, WorkTileInfo const& work_tile_info) {
-    // Fixup is not needed for invalid or data-parallel tiles
-    return work_tile_info.is_valid() && work_tile_info.k_tile_count != params.divmod_tiles_per_output_tile_.divisor;
-  }
-
-  CUTLASS_HOST_DEVICE
-  static bool
-  requires_separate_reduction(Params const& params) {
-    return params.requires_separate_reduction();
-  }
-
-  // When the work tile is not special for reduction, it's valid. Otherwise need to skip
-  // global loading that producer warpgroup do, also math computation that consumer warpgroup do.
-  CUTLASS_DEVICE
-  static bool
-  valid_warpgroup_in_work_tile(WorkTileInfo const& work_tile_info) {
-    return !work_tile_info.is_reduction_unit();
-  }
-
-  // Performs the reduction across splits for a given output tile.
-  template <class FrgTensorC>
-  CUTLASS_DEVICE
-  static void
-  fixup(
-    Params const& params,
-    WorkTileInfo const& work_tile_info,
-    FrgTensorC& accumulators,
-    uint32_t num_barriers,
-    uint32_t barrier_idx) {
-    static constexpr uint32_t Offset = static_cast<int>(cutlass::arch::ReservedNamedBarriers::StreamkBarrier0);
-    static constexpr uint32_t MaxNumNamedBarriers = 2;
-    using BarrierManager = NamedBarrierManager<NumThreadsPerWarpGroup, Offset, MaxNumNamedBarriers>;
-    return fixup_helper<FrgTensorC, BarrierManager>(
-      params, work_tile_info, accumulators, num_barriers, barrier_idx);
-  }
-
-  // Helper for performing the reduction across splits for a given output tile.
-  template <class FrgTensorC, class BarrierManager>
-  CUTLASS_DEVICE
-  static void
-  fixup_helper(
-    Params const& params,
-    WorkTileInfo const& work_tile_info,
-    FrgTensorC& accumulators,
-    uint32_t num_barriers,
-    uint32_t barrier_idx,
-    uint32_t num_accumulator_mtxs = 1) {
-
-    using ElementAccumulator = typename FrgTensorC::value_type;
-
-    if (!requires_fixup(params, work_tile_info)) {
-      return;
-    }
-    uint64_t tile_idx = output_tile_index(params, work_tile_info);
-
-    // Index of the lock on which to wait
-    uint64_t lock_idx = (tile_idx * num_barriers) + barrier_idx;
-
-    uint64_t reduction_tile_idx = tile_idx;
-    uint64_t num_peers = 0;
-    uint64_t reduction_peer_offset = 0;
-    if (params.requires_separate_reduction()) {
-      // If separate reduction is to be performed, each stream-K unit writes its partials
-      // to a separate portion of the workspace. There are as many of these portions as there
-      // are peers for a given output tile, so we multiply the tile index by the maximum peer count.
-      auto [first_peer_id, my_peer_id, last_peer_id] = tile_peer_range(params, tile_idx, static_cast<uint32_t>(work_tile_info.K_idx));
-      num_peers = last_peer_id - first_peer_id + 1;
-      reduction_tile_idx *= Params::max_peers_per_tile(params.sk_units_, params.sk_tiles_);
-      reduction_peer_offset = my_peer_id * cute::size<0>(TileShape{}) * cute::size<1>(TileShape{});
-    }
-
-    // Reductions use BlockStripedReduce with a width of BarrierManager::ThreadCount under the hood.
-    // Thus, the start of the reduction space is the same across all threads in a warp group.
-    uint64_t reduction_offset =
-      (static_cast<uint64_t>(cute::size<0>(TileShape{})) * static_cast<uint64_t>(cute::size<1>(TileShape{})) * reduction_tile_idx * num_accumulator_mtxs) +
-      reduction_peer_offset +
-      (static_cast<uint64_t>(size(accumulators)) * barrier_idx * BarrierManager::ThreadCount);
-
-    ElementAccumulator* group_reduction_workspace = reinterpret_cast<ElementAccumulator*>(params.reduction_workspace_) + reduction_offset;
-
-    using AccumulatorArrayT = Array<typename FrgTensorC::value_type, size(FrgTensorC{})>;
-    using BlockStripedReduceT = BlockStripedReduce<BarrierManager::ThreadCount, AccumulatorArrayT>;
-
-    AccumulatorArrayT* reduction_workspace_array = reinterpret_cast<AccumulatorArrayT*>(group_reduction_workspace);
-    AccumulatorArrayT* accumulator_array = reinterpret_cast<AccumulatorArrayT*>(accumulators.data());
-
-    uint32_t barrier_group_thread_idx = threadIdx.x % BarrierManager::ThreadCount;
-
-    // The number of tiles for which reduction is required is either:
-    //   (a) the total number of output tiles (in the case of split-K)
-    //   (b) the number of stream-K tiles (potentially multiplied by peer count if using separate reduction)
-    // To calculate the total number of output tiles in the split-K case, we
-    // note that, in the split-K case, the units_per_problem_ member of Params will be
-    // the total number of output tiles.
-    uint32_t reduction_tiles = 0;
-    if (params.divmod_splits_.divisor > 1) {
-      reduction_tiles = params.units_per_problem_;
-    }
-    else if (params.requires_separate_reduction()) {
-      reduction_tiles = params.sk_tiles_ * Params::max_peers_per_tile(params.sk_units_, params.sk_tiles_);
-    }
-    else {
-      reduction_tiles = params.sk_tiles_;
-    }
-
-    uint64_t reduction_workspace_size = Params::get_reduction_workspace_size(
-      reduction_tiles, to_gemm_coord(TileShape{}), sizeof_bits<ElementAccumulator>::value, num_accumulator_mtxs);
-    BarrierType* lock_workspace = reinterpret_cast<BarrierType*>(
-      reinterpret_cast<uint8_t*>(params.reduction_workspace_) + reduction_workspace_size);
-
-    if (work_tile_info.is_reduction_unit()) {
-      plus<AccumulatorArrayT> add_fragments;
-      uint64_t peer_offset = size(accumulators) * num_barriers * BarrierManager::ThreadCount;
-
-      // Wait until the peers collaborating on this output tile have all written
-      // their accumulators to workspace.
-      BarrierManager::wait_eq(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, num_peers);
-
-      // Load the first peer's data
-      BlockStripedReduceT::load(*accumulator_array, reduction_workspace_array, barrier_group_thread_idx);
-
-      for (uint64_t i = 1; i < num_peers; ++i) {
-        // Load peer fragment
-        AccumulatorArrayT addend_fragment;
-        auto peer_reduction_workspace = reinterpret_cast<AccumulatorArrayT*>(group_reduction_workspace + (i * peer_offset));
-
-        BlockStripedReduceT::load(addend_fragment, peer_reduction_workspace, barrier_group_thread_idx);
-
-        // Add peer fragment
-        *accumulator_array = add_fragments(*accumulator_array, addend_fragment);
-      }
-    }
-    else if (!compute_epilogue(work_tile_info, params)) {
-      if (params.requires_separate_reduction() || work_tile_info.K_idx == 0) {
-        // The first peer initializes the workspace partials in the non-separate-reduction case,
-        // and all peers write to their own location in workspace when using separate reduction
-        BlockStripedReduceT::store(reduction_workspace_array, *accumulator_array, barrier_group_thread_idx);
-      }
-      else {
-        // Wait until the preceding split added its accumulators
-        BarrierManager::wait_eq(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, work_tile_info.K_idx);
-
-        // Perform reduction in workspace
-        BlockStripedReduceT::reduce(reduction_workspace_array, *accumulator_array, barrier_group_thread_idx);
-      }
-
-      // If separate reduction is being performed, each participating stream-K unit increments the barrier
-      // by only 1. Otherwise, increment by the K tile count that this unit has processed.
-      uint32_t increment = params.requires_separate_reduction() ? 1 : work_tile_info.k_tile_count;
-
-      // Signal our arrival
-      BarrierManager::arrive_inc(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, increment);
-    }
-    else {
-      if (params.reduction_mode_ == ReductionMode::Deterministic) {
-        // Wait until the preceding split added its accumulators
-        BarrierManager::wait_eq(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, work_tile_info.K_idx);
-      }
-      else {
-        // Wait unitl the first split has stored its accumulators
-        BarrierManager::wait_lt(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, 1);
-      }
-
-      // The block computing the final split for the tile adds previously-reduced partials
-      // to its accumulators and computes the epilogue.
-      BlockStripedReduceT::load_add(*accumulator_array, reduction_workspace_array, barrier_group_thread_idx);
-    }
-  }
-
-  // Returns whether the block assigned this work should compute the epilogue for the corresponding
-  // output tile. For the case of stream-K, this should only occur if the work is marked as the final split.
-  CUTLASS_HOST_DEVICE
-  static bool
-  compute_epilogue(WorkTileInfo const& work_tile_info, Params const& params) {
-    // `is_final_split` will be set to `true` for the following scenarios, all of which must compute the epilogue:
-    //  1. The tile is computed in data-parallel mode
-    //  2. The tile is computed in split-/stream-K mode and this work unit represents the final split of the tile
-    //  3. The tile is computed in split-/stream-K mode and separate reduction is used, and this is a separate reduction unit
-    return work_tile_info.is_valid() &&
-            (work_tile_info.is_final_split(params.divmod_tiles_per_output_tile_.divisor) &&
-             !params.requires_separate_reduction()) || work_tile_info.is_separate_reduction;
-  }
-
-  // Returns the linearized index of the output tile corresponding to the tile with offset [L, M, K]
-  CUTLASS_DEVICE
-  static uint64_t
-  output_tile_index(Params const& params, WorkTileInfo const& work_tile_info) {
-    uint64_t linear_idx_in_batch = UnderlyingScheduler::get_linear_idx_from_m_and_n(
-      work_tile_info.M_idx, work_tile_info.N_idx,
-      params.divmod_cluster_shape_major_,
-      params.divmod_cluster_shape_minor_,
-      params.divmod_cluster_blk_major_,
-      params.log_swizzle_size_,
-      params.raster_order_
-    );
-
-    uint64_t tiles_mn = params.divmod_batch_.divisor;
-    return tiles_mn * work_tile_info.L_idx + linear_idx_in_batch;
-  }
-
-  template <class ProblemShape, class ElementAccumulator>
-  static size_t
-  get_workspace_size(
-    Arguments const& args,
-    ProblemShape problem_shape,
-    KernelHardwareInfo const& hw_info,
-    uint32_t mma_warp_groups,
-    const uint32_t epilogue_subtile = 1,
-    [[maybe_unused]] uint32_t num_accumulator_mtxs = 1) {
-
-    auto problem_shape_mnkl = cute::append<4>(problem_shape, 1);
-
-    ClusterShape cluster_shape;
-    TileShape tile_shape;
-
-    dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape_mnkl, tile_shape, cluster_shape);
-    uint32_t k_tile_per_output_tile = cute::size(cute::ceil_div(cute::shape<2>(problem_shape_mnkl), cute::shape<2>(TileShape{})));
-
-    return Params::get_workspace_size(
-      problem_blocks,
-      k_tile_per_output_tile,
-      to_gemm_coord(tile_shape),
-      to_gemm_coord(cluster_shape),
-      hw_info,
-      args.splits,
-      args.max_swizzle_size,
-      args.raster_order,
-      args.decomposition_mode,
-      mma_warp_groups,
-      sizeof_bits<BarrierType>::value,
-      sizeof_bits<ElementAccumulator>::value,
-      epilogue_subtile
-    );
-  }
-
-  template <class ProblemShape, class ElementAccumulator>
-  static cutlass::Status
-  initialize_workspace(
-    Arguments const& args,
-    void* workspace,
-    cudaStream_t stream,
-    ProblemShape const& problem_shape,
-    KernelHardwareInfo const& hw_info,
-    uint32_t mma_warp_groups,
-    const uint32_t epilogue_subtile = 1,
-    [[maybe_unused]] uint32_t num_accumulator_mtxs = 1,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-
-    auto problem_shape_mnkl = cute::append<4>(problem_shape, 1);
-
-    ClusterShape cluster_shape;
-    TileShape tile_shape;
-
-    dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape_mnkl, tile_shape, cluster_shape);
-    uint32_t k_tile_per_output_tile = cute::size(cute::ceil_div(cute::shape<2>(problem_shape_mnkl), cute::shape<2>(TileShape{})));
-
-    return Params::initialize_workspace(
-      workspace,
-      stream,
-      problem_blocks,
-      k_tile_per_output_tile,
-      to_gemm_coord(tile_shape),
-      to_gemm_coord(cluster_shape),
-      hw_info,
-      args.splits,
-      args.max_swizzle_size,
-      args.raster_order,
-      args.decomposition_mode,
-      mma_warp_groups,
-      sizeof_bits<BarrierType>::value,
-      sizeof_bits<ElementAccumulator>::value,
-      epilogue_subtile,
-      1,
-      cuda_adapter
-    );
-  }
-
-  template <class ProblemShape>
-  CUTLASS_HOST_DEVICE
-  static uint32_t
-  get_work_k_tile_count(WorkTileInfo const& work_tile_info, ProblemShape, TileShape) {
-    return work_tile_info.k_tile_count;
-  }
-
-  CUTLASS_HOST_DEVICE
-  static uint32_t
-  get_work_k_tile_start(WorkTileInfo const& work_tile_info) {
-    return work_tile_info.K_idx;
-  }
-
-  // Kernel helper function to get next work tile
-  CUTLASS_DEVICE
-  auto
-  fetch_next_work(WorkTileInfo work_tile_info) {
-    if (continue_current_work(work_tile_info)) {
-      return cute::make_tuple(work_tile_info, true);
-    }
-
-    advance_to_next_work();
-    return cute::make_tuple(get_current_work(), true);
-  }
-
-  // Returns the initial work tile info that will be computed over
-  CUTLASS_DEVICE
-  WorkTileInfo
-  initial_work_tile_info(ClusterShape) {
-    return get_current_work();
-  }
-
-private:
-  // Sets the current stream-K work to compute within work_tile_info. If new_unit is true, work_tile_info
-  // is populated as a new unit of work. Otherwise, state existing in work_tile_info (e.g., remaining
-  // iterations) is used to find the next tile in the current work unit.
-  CUTLASS_DEVICE
-  static void
-  assign_work(
-    Params const& params,
-    uint64_t linear_idx,
-    WorkTileInfo& work_tile_info) {
-
-    auto [cta_m_in_cluster_, cta_n_in_cluster_, _] = cute::block_id_in_cluster();
-    uint64_t cta_m_in_cluster = static_cast<uint64_t>(cta_m_in_cluster_);
-    uint64_t cta_n_in_cluster = static_cast<uint64_t>(cta_n_in_cluster_);
-    uint64_t output_tile_id = linear_idx;
-    if (linear_idx >= params.units_per_problem_ * params.divmod_splits_.divisor) {
-      // Separate-reduction work
-      auto cluster_size = params.get_cluster_size();
-      // Divide up the linearized separate reduction units into clusters
-      uint64_t cluster_linear_reduction_unit_idx = params.div_cluster_size((linear_idx - params.units_per_problem_));
-      uint64_t cluster_tile_idx, epi_subtile_idx;
-      params.divmod_epilogue_subtile_(cluster_tile_idx, epi_subtile_idx, cluster_linear_reduction_unit_idx);
-      // Bring the linearized tile ID back into the space of tiles, rather than clusters
-      output_tile_id = cluster_tile_idx * cluster_size;
-
-      work_tile_info.setup_separate_reduction(epi_subtile_idx);
-    }
-    else if (linear_idx >= params.sk_units_ && params.divmod_splits_.divisor == 1) {
-      // Data-parallel work
-      output_tile_id = linear_idx - params.sk_units_ + params.sk_tiles_;
-      work_tile_info.K_idx = 0;
-      work_tile_info.k_tile_count = params.divmod_tiles_per_output_tile_.divisor;
-      work_tile_info.k_tile_remaining = params.divmod_tiles_per_output_tile_.divisor;
-    }
-    else {
-      // In the CUTLASS 2.x implementation of stream K, stream-K work is assigned to each stream-K
-      // threadblock individually. For the most part, the set of K iterations corresponding to stream-K
-      // work was divided amongst stream-K threadblocks, and a threadblock determined which tile
-      // it would compute a (potentially-partial) output tile for based on the space of k iterations
-      // assigned to it. This often results in stream-K threadblocks processing tiles with different
-      // offsets in the K dimension from one another. This can reduce locality, but is lmitied to the
-      // (generally few) waves of threadblocks assigned to compute stream-K work.
-      //
-      // With the introduction of threadblock clusters, there is additional benefit to maintaining
-      // locality in the K dimension: shared portions of operands can be multicasted to threadblocks
-      // within a cluster. Thus, we would like to ensure that the assignment of stream-K work to
-      // threadblocks respects the ability to perform multicasting.
-      //
-      // To do so, we divide up the linearized stream-K units into clusters and share the same K
-      // offsets for work within clusters.
-
-      uint64_t cluster_linear_work_idx = params.div_cluster_size(linear_idx);
-
-      uint64_t group_idx;
-      params.divmod_sk_groups_(cluster_linear_work_idx, group_idx, cluster_linear_work_idx);
-
-      // Determine whether we are in a "big group" that will process an additional
-      // stream-K cluster tile.
-      uint64_t sk_cluster_tiles = params.div_cluster_size(params.sk_tiles_);
-      uint64_t sk_cluster_tiles_in_group = params.divmod_sk_groups_.divide(sk_cluster_tiles);
-      if (group_idx < params.big_groups_) {
-        ++sk_cluster_tiles_in_group;
-      }
-
-      // Determine whether we are in a "big unit" within the group, that will process
-      // an additional K chunk in the group.
-      uint64_t sk_tiles_in_group = sk_cluster_tiles_in_group * params.get_cluster_size();
-      uint64_t k_tiles_in_group = sk_tiles_in_group * params.divmod_tiles_per_output_tile_.divisor;
-      uint64_t k_tiles_per_unit_in_group = params.divmod_sk_units_per_group_.divide(k_tiles_in_group);
-      uint64_t big_units_in_group = params.div_cluster_size(
-        k_tiles_in_group - (k_tiles_per_unit_in_group * params.divmod_sk_units_per_group_.divisor));
-
-      uint64_t split;
-      params.divmod_clusters_mnl_(split, cluster_linear_work_idx, cluster_linear_work_idx);
-
-      bool is_split_k = params.divmod_splits_.divisor > 1;
-      uint64_t big_unit_cmp_lhs = is_split_k ? split : cluster_linear_work_idx;
-      uint64_t big_unit_cmp_rhs = is_split_k ? params.big_units_ : big_units_in_group;
-      uint64_t linear_idx_mult = is_split_k ? params.divmod_tiles_per_output_tile_.divisor : k_tiles_per_unit_in_group;
-      uint64_t k_tiles_per_split = is_split_k ? params.divmod_k_tiles_per_sk_unit_.divisor : k_tiles_per_unit_in_group;
-
-      // Determine the starting k iteration computed by this stream-K work unit
-      uint32_t unit_iter_start = (linear_idx_mult * cluster_linear_work_idx) +
-                                 (k_tiles_per_split * split);
-
-      // Adjust the starting position and number of k iterations for "big units," which
-      // compute one extra iteration. If there are any big units, they will be the first
-      // in the linearized ID space.
-      auto k_tiles_in_my_split = k_tiles_per_split;
-      if (big_unit_cmp_lhs < big_unit_cmp_rhs) {
-        // Since the "big units" are the first units in the linearized ID space, each
-        // of the units preceding this big unit computed one extra iteration. Thus,
-        // we must offset our start iteration by the number of units that precede
-        // the current unit in the linearized ID space.
-        unit_iter_start += big_unit_cmp_lhs;
-        ++k_tiles_in_my_split;
-      }
-      else {
-        // Increment by one for each of the big clusters (since all big units precede this unit)
-        unit_iter_start += big_unit_cmp_rhs;
-      }
-
-      if (!is_split_k) {
-        // Adjust the unit starting position and number of tiles to avoid
-        // computing splits of size less than min_iters_per_sk_unit_
-        int unused, start_tile_k_tile;
-        params.divmod_tiles_per_output_tile_(unused, start_tile_k_tile, unit_iter_start);
-        if (start_tile_k_tile < Params::min_iters_per_sk_unit_) {
-          // Starting K tile is in range [0, Params::min_iters_per_sk_unit_), which means that another
-          // stream-K unit will be computing a split with fewer than Params::min_iters_per_sk_unit_ K tiles.
-          // Adjust our work to take over these K tiles.
-          unit_iter_start -= start_tile_k_tile;
-          k_tiles_in_my_split += start_tile_k_tile;
-        }
-        else if (start_tile_k_tile > (params.divmod_tiles_per_output_tile_.divisor - Params::min_iters_per_sk_unit_)) {
-          // Starting K tile is within the final Params::min_iters_per_sk_unit_ K tiles of some output tile,
-          // which means that this unit will compute a split with fewer than Params::min_iters_per_sk_unit_ K tiles.
-          // Adjust our work to shed these K tiles to a neighboring stream-K unit that will compute more consecutive K tiles.
-          auto adjustment_tiles = (params.divmod_tiles_per_output_tile_.divisor - start_tile_k_tile);
-          unit_iter_start += adjustment_tiles;
-          k_tiles_in_my_split -= adjustment_tiles;
-        }
-        else if (params.ktile_start_alignment_count == 2 && start_tile_k_tile % 2 != 0) {
-          // ktile for each SM start from even number
-          // If start from odd number ktile within the output tile
-          //    now start at the ktile one before my initial ktile start (take one ktile from prev sm)
-          // if end on odd number ktile within the output tile
-          //    now end at ktile that one before my ktile end (give one ktile to next sm)
-          unit_iter_start -= 1;
-          k_tiles_in_my_split += 1;
-        }
-      }
-
-      if (work_tile_info.k_tile_count == 0) {
-        // This is a new unit
-
-        if (!is_split_k) {
-          //
-          // Adjust the unit ending position and number of tiles to avoid
-          // computing splits of size less than min_iters_per_sk_unit_
-          //
-
-          // Begin by assuming that no adjustment is needed
-          auto initial_unit_iter_end = unit_iter_start + k_tiles_in_my_split;
-
-          int unused, end_tile_k_tile;
-          params.divmod_tiles_per_output_tile_(unused, end_tile_k_tile, initial_unit_iter_end);
-
-          if (end_tile_k_tile < Params::min_iters_per_sk_unit_) {
-            // Ending K tile is within the first Params::min_iters_per_sk_unit_ K tiles of some output tile,
-            // which means that this unit will compute a split with fewer than Params::min_iters_per_sk_unit_ K tiles.
-            // Adjust our work to shed these K tiles to a neighboring stream-K unit that will compute more consecutive K tiles.
-            k_tiles_in_my_split -= end_tile_k_tile;
-          }
-          else if (end_tile_k_tile > (params.divmod_tiles_per_output_tile_.divisor - Params::min_iters_per_sk_unit_)) {
-            // Ending K tile is within the final Params::min_iters_per_sk_unit_ K tiles of some output tile,
-            // which means that some other unit will compute a split with fewer than Params::min_iters_per_sk_unit_ K tiles.
-            // Adjust our work to take on these K tiles.
-            k_tiles_in_my_split += (params.divmod_tiles_per_output_tile_.divisor - end_tile_k_tile);
-          }
-          else if (params.ktile_start_alignment_count == 2 && end_tile_k_tile % 2 != 0) {
-            // ktile for each SM start from even number
-            // If start from odd number ktile within the output tile
-            //    now start at the ktile one before my initial ktile start (take one ktile from prev sm)
-            // If end on odd number ktile within the output tile,
-            //    now end at ktile that one before my ktile end (give one ktile to next sm)
-            k_tiles_in_my_split -= 1;
-          }
-        }
-
-        work_tile_info.k_tile_remaining = k_tiles_in_my_split;
-      }
-
-      uint32_t unit_iter_end = unit_iter_start + work_tile_info.k_tile_remaining - 1;
-
-      // Find the output tile corresponding to the final k tile covered by this
-      // work unit. Stream-K work units will work backwards in terms of the tiles they
-      // are responsible computing. This is beneficial because the final (partial)
-      // tile computed by a stream-K block is typically the beginning of the output
-      // tile, while the beginning (partial) tile is typically the ending of another
-      // output tile. Since ending portions of an output tile must reduce across
-      // other work units computing portions of that output tile, it is preferable
-      // for them to be computed later, so as to reduce the likelihood of blocking
-      // on other work.
-
-      auto output_tile_id_in_group = params.divmod_tiles_per_output_tile_.divide(unit_iter_end);
-      uint32_t output_tile_iter_start = output_tile_id_in_group * params.divmod_tiles_per_output_tile_.divisor;
-      uint32_t output_tile_iter_end = output_tile_iter_start + params.divmod_tiles_per_output_tile_.divisor;
-
-      // Convert the output tile from the linearized space within each group to the
-      // overall linearized space.
-      output_tile_id = (output_tile_id_in_group * params.divmod_sk_groups_.divisor) + group_idx;
-
-      // Bring the linearized tile ID back into the space of tiles, rather than clusters
-      output_tile_id *= params.get_cluster_size();
-
-      // The final linearized tile ID is in units of the cluster dimension over which we rasterize.
-      if (params.raster_order_ == RasterOrder::AlongN) {
-        output_tile_id += cta_n_in_cluster * params.divmod_cluster_shape_minor_.divisor;
-      }
-      else {
-        output_tile_id += cta_m_in_cluster * params.divmod_cluster_shape_minor_.divisor;
-      }
-
-      // The unit's starting k iteration in the current tile is either the starting
-      // iteration for the tile as a whole, or the starting k iteration for the unit
-      // as a whole (if the latter is greater than the former).
-      uint32_t tile_iter_start = max(output_tile_iter_start, unit_iter_start);
-
-      // Similarly, the unit's ending k iteration (exclusive) is either the end of
-      // the current tile it is assigned, or the ending iteration of the unit as a whole
-      // (if the latter is less than the former).
-      uint32_t tile_iter_end = min(output_tile_iter_end, unit_iter_end + 1);
-
-      // Set the k offset to be the starting k tile for this output tile
-      work_tile_info.K_idx = static_cast<int32_t>(tile_iter_start - output_tile_iter_start);
-      work_tile_info.k_tile_count = tile_iter_end - tile_iter_start;
-    }
-
-    uint64_t work_idx_l, remainder;
-    params.divmod_batch_(work_idx_l, remainder, output_tile_id);
-
-    uint64_t cta_per_grid_dim = params.divmod_cluster_shape_minor_.divide(remainder);
-
-    auto [work_idx_m, work_idx_n] = UnderlyingScheduler::get_work_idx_m_and_n(
-                                          cta_per_grid_dim,
-                                          params.divmod_cluster_shape_major_,
-                                          params.divmod_cluster_shape_minor_,
-                                          params.divmod_cluster_blk_major_,
-                                          params.log_swizzle_size_,
-                                          params.raster_order_
-                                        );
-
-    // Set the M, N, and L block offsets
-    work_tile_info.M_idx = work_idx_m;
-    work_tile_info.N_idx = work_idx_n;
-    work_tile_info.L_idx = static_cast<int32_t>(work_idx_l);
-  }
-
-  // Returns the starting and ending peer ID of this tile
-  CUTLASS_HOST_DEVICE
-  static auto
-  tile_peer_range(Params const& params, uint32_t tile_idx, uint32_t cur_k_tile) {
-    uint32_t tile_idx_in_cluster_path = params.div_cluster_size(tile_idx);
-    uint32_t start_k_tile = params.divmod_tiles_per_output_tile_.divisor * tile_idx_in_cluster_path;
-    uint32_t end_k_tile = start_k_tile + params.divmod_tiles_per_output_tile_.divisor - 1;
-    uint32_t big_unit_k_tiles = params.big_units_ * (params.divmod_k_tiles_per_sk_unit_.divisor + 1);
-
-    auto adjust_unit = [&](uint32_t k_tile, uint32_t unit_idx, uint32_t k_tiles_per_unit) {
-      uint32_t unit_k_start = unit_idx * k_tiles_per_unit;
-      uint32_t unit_k_end = unit_k_start + k_tiles_per_unit;
-      if (k_tile - start_k_tile < Params::min_iters_per_sk_unit_ &&
-          unit_k_end - start_k_tile < Params::min_iters_per_sk_unit_) {
-        // k_tile is within the first min_iters_per_sk_unit_ K tiles of this output tile,
-        // and the stream-K unit computes fewer than min_iters_per_sk_unit_ K tiles for this
-        // output tile. This work will thus be subsumed by the next stream-K unit.
-        ++unit_idx;
-      }
-
-      if (end_k_tile + 1 - k_tile < Params::min_iters_per_sk_unit_ &&
-          end_k_tile + 1 - unit_k_start < Params::min_iters_per_sk_unit_) {
-        // k_tile is within the last min_iters_per_sk_unit_ K tiles of this output tile,
-        // and the stream-K unit computes fewer than min_iters_per_sk_unit_ K tiles for this
-        // output tile. This work will thus be subsumed by the previous stream-K unit.
-        --unit_idx;
-      }
-
-      return unit_idx;
-    };
-
-    // Lambda to find the ID of the stream-K unit that computes this K tile
-    auto find_unit = [&](uint32_t k_tile) {
-      if (k_tile < big_unit_k_tiles) {
-        // The tile is within the "big unit range"
-        uint32_t unit_idx = params.divmod_k_tiles_per_sk_big_unit_.divide(k_tile);
-        return static_cast<uint64_t>(adjust_unit(k_tile, unit_idx, params.divmod_k_tiles_per_sk_big_unit_.divisor));
-      }
-      else {
-        // The tile is after the "big unit range." Account for this by finding the "normal unit"
-        // that it belongs to, and then offsetting by the number of big units
-        uint32_t unit_idx = params.divmod_k_tiles_per_sk_unit_.divide(k_tile - big_unit_k_tiles) + params.big_units_;
-        return static_cast<uint64_t>(adjust_unit(k_tile, unit_idx, params.divmod_k_tiles_per_sk_unit_.divisor));
-      }
-    };
-
-    return cute::make_tuple(find_unit(start_k_tile), find_unit(cur_k_tile), find_unit(end_k_tile));
-  }
-};
-
-} // namespace cutlass::gemm::kernel::detail
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sparse_gemm.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sparse_gemm.h
deleted file mode 100755
index af274ee09..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sparse_gemm.h
+++ /dev/null
@@ -1,394 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/kernel/params_sparse_base.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/semaphore.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate 
-  typename Epilogue_,             ///! Epilogue
-  typename ThreadblockSwizzle_,   ///! Threadblock swizzling function
-  bool SplitKSerial               ///! If true, code supporting split-K via serial reduction is enabled.
->
-struct SparseGemm {
-
-  using Mma = Mma_;
-  using Epilogue = Epilogue_;
-  using OutputOp = typename Epilogue::OutputOp;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  static bool const kSplitKSerial = SplitKSerial;
-
-  static int const kSparse = Mma::kSparse;
-  static int const kMetaSizeInBits = Mma::kMetaSizeInBits;
-  static int const kMaxID2 = Mma::kMaxID2;
-  static int const kElementsPerElementE = Mma::kElementsPerElementE;
-
-  using ElementE = typename Mma::ElementE;
-  using LayoutE = typename Mma::LayoutE;
-
-  /// Warp count (concept: GemmShape)
-  using WarpCount = typename Mma::WarpCount;
-  static int const kThreadCount = 32 * WarpCount::kCount;
-
-  using ParamsA = typename Mma::IteratorA::Params;
-  using TensorRefA = typename Mma::IteratorA::TensorRef;
-  using ParamsB = typename Mma::IteratorB::Params;
-  using TensorRefB = typename Mma::IteratorB::TensorRef;
-  using ParamsE = typename Mma::IteratorE::Params;
-  using TensorRefE = typename Mma::IteratorE::TensorRef;
-
-  /// Parameters structure
-  struct Params : public SparseParamsBase<
-      ThreadblockSwizzle, ParamsA, TensorRefA, ParamsB, TensorRefB,
-      ParamsE, TensorRefE> {
-
-    using Base = SparseParamsBase<
-        ThreadblockSwizzle, ParamsA, TensorRefA, ParamsB, TensorRefB,
-        ParamsE, TensorRefE>;
-
-    //
-    // Data members
-    //
-
-    typename Epilogue::OutputTileIterator::Params params_C;
-    typename Epilogue::OutputTileIterator::TensorRef ref_C;
-    typename Epilogue::OutputTileIterator::Params params_D;
-    typename Epilogue::OutputTileIterator::TensorRef ref_D;
-    typename OutputOp::Params output_op;
-    int *semaphore;
-
-    //
-    // Methods
-    //
-
-    CUTLASS_HOST_DEVICE
-    Params() { }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      cutlass::gemm::GemmCoord const & problem_size,
-      cutlass::gemm::GemmCoord const & grid_tiled_shape,
-      TensorRefA ref_A,
-      TensorRefB ref_B,
-      typename Epilogue::OutputTileIterator::TensorRef ref_C,
-      typename Epilogue::OutputTileIterator::TensorRef ref_D,
-      TensorRefE ref_E,
-      typename OutputOp::Params output_op = typename OutputOp::Params(),
-      int *workspace = nullptr
-    ):
-      Base(problem_size, grid_tiled_shape, ref_A, ref_B, ref_E, Mma::Shape::kK),
-      params_C(ref_C.layout()),
-      ref_C(ref_C),
-      params_D(ref_D.layout()),
-      ref_D(ref_D),
-      output_op(output_op) {
-    semaphore = workspace;
-    }
-  };
-
-  /// Shared memory storage structure
-  union SharedStorage {
-    typename Mma::SharedStorage main_loop;
-    typename Epilogue::SharedStorage epilogue;
-  };
-
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  SparseGemm() { } 
-
-  /// Determines whether kernel satisfies alignment
-  static Status can_implement(
-      cutlass::gemm::GemmCoord const & problem_size,
-      typename Mma::IteratorA::TensorRef ref_A,
-      typename Mma::IteratorB::TensorRef ref_B,
-      typename Epilogue::OutputTileIterator::TensorRef ref_C,
-      typename Epilogue::OutputTileIterator::TensorRef ref_D,
-      typename Mma::IteratorE::TensorRef ref_E) {
-
-    static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
-    static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
-    static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
-    static int const kAlignmentE = Mma::IteratorE::AccessType::kElements;
-
-    if (!TensorRef_aligned(ref_A, kAlignmentA)) {
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (!TensorRef_aligned(ref_B, kAlignmentB)) {
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (!TensorRef_aligned(ref_C, kAlignmentC)) {
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (!TensorRef_aligned(ref_D, kAlignmentC)) {
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (!TensorRef_aligned(ref_E, kAlignmentE)) {
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if ((problem_size.m() % kAlignmentA) || ((problem_size.k() / kSparse) % kAlignmentA) ||
-      (problem_size.n() % kAlignmentB) || (problem_size.k() % kAlignmentB) ||
-      (problem_size.m() % kAlignmentC) || (problem_size.n() % kAlignmentC) ||
-      (problem_size.m() % kAlignmentE) || ((problem_size.k() / kSparse) % kAlignmentE)) {
-
-      return Status::kErrorMisalignedOperand;
-    }
-
-    // The k dimension has to be the multiple of the Threadblock k because out
-    // of bound meta data would be initialized to 0 by acync.zfill but 0 is not
-    // a valid meta data.
-    if (problem_size.k() % Mma::Shape::kK) {
-      return Status::kErrorMisalignedOperand;
-    }
-
-    // M dimension has to be multiple of 32 (sparse float) or 16 (sparse int) 
-    // because of the row reordering of operand E
-    static int const kAlignmentM = (sizeof(ElementE) == 2) ? 32 : 16;
-
-    if (problem_size.m() % kAlignmentM) {
-      return Status::kErrorMisalignedOperand;
-    }
-
-    return Status::kSuccess;
-  }
-
-  /// Executes one GEMM
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-
-    // Compute threadblock location
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord threadblock_tile_offset =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    // Early exit if CTA is out of range
-    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
-      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
-
-      return;
-    }
-
-    // Compute initial location in logical coordinates
-    cutlass::MatrixCoord tb_offset_A{
-      threadblock_tile_offset.m() * Mma::Shape::kM,
-      threadblock_tile_offset.k() * params.gemm_k_size / kSparse,
-    };
-
-    cutlass::MatrixCoord tb_offset_B{
-      threadblock_tile_offset.k() * params.gemm_k_size,
-      threadblock_tile_offset.n() * Mma::Shape::kN
-    };
-
-    cutlass::MatrixCoord tb_offset_E{
-      threadblock_tile_offset.m() * Mma::Shape::kM,
-      threadblock_tile_offset.k() * params.gemm_k_size / kSparse,
-    };
-
-    // Problem size is a function of threadblock index in the K dimension
-    int problem_size_k = min(
-      params.problem_size.k(), 
-      (threadblock_tile_offset.k() + 1) * params.gemm_k_size);
-
-    // Compute threadblock-scoped matrix multiply-add
-    int gemm_k_iterations = (problem_size_k - tb_offset_B.row() + Mma::Shape::kK - 1) / Mma::Shape::kK;
-
-    // Compute position within threadblock
-    int thread_idx = threadIdx.x;
-
-    // Construct iterators to A, B, and E operands
-    typename Mma::IteratorA iterator_A(
-      params.params_A,
-      params.ref_A.data(),
-      {params.problem_size.m(), problem_size_k / kSparse},
-      thread_idx,
-      tb_offset_A);
-
-    typename Mma::IteratorB iterator_B(
-      params.params_B,
-      params.ref_B.data(),
-      {problem_size_k, params.problem_size.n()},
-      thread_idx,
-      tb_offset_B);
-
-    typename Mma::IteratorE iterator_E(
-        params.params_E, params.ref_E.data(),
-        {params.problem_size.m(),
-         problem_size_k / kSparse / kElementsPerElementE},
-        thread_idx, tb_offset_E);
-
-    // Broadcast the warp_id computed by lane 0 to ensure dependent code
-    // is compiled as warp-uniform.
-    int warp_idx = canonical_warp_idx_sync();
-    int lane_idx = threadIdx.x % 32;
-
-    //
-    // Main loop
-    //
-
-    // Construct thread-scoped matrix multiply
-    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
-
-    typename Mma::FragmentC accumulators;
-
-    accumulators.clear();
-
-    if (!kSplitKSerial || gemm_k_iterations > 0) {
-      // Compute threadblock-scoped matrix multiply-add
-      mma(gemm_k_iterations, accumulators, iterator_A, iterator_B, iterator_E, accumulators);
-    }
-
-    //
-    // Epilogue
-    //
-
-    OutputOp output_op(params.output_op);
-
-    //
-    // Masked tile iterators constructed from members
-    //
-
-    threadblock_tile_offset =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    //assume identity swizzle
-    MatrixCoord threadblock_offset(
-      threadblock_tile_offset.m() * Mma::Shape::kM,
-      threadblock_tile_offset.n() * Mma::Shape::kN
-    );
-
-    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
-
-    // Construct the semaphore.
-    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
-
-    // If performing a reduction via split-K, fetch the initial synchronization
-    if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
-      
-      // Fetch the synchronization lock initially but do not block.
-      semaphore.fetch();
-
-      // Indicate which position in a serial reduction the output operator is currently updating
-      output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
-    }
-
-    // Tile iterator loading from source tensor.
-    typename Epilogue::OutputTileIterator iterator_C(
-      params.params_C,
-      params.ref_C.data(),
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset
-    );
-
-    // Tile iterator writing to destination tensor.
-    typename Epilogue::OutputTileIterator iterator_D(
-      params.params_D,
-      params.ref_D.data(),
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset
-    );
-
-    Epilogue epilogue(
-      shared_storage.epilogue, 
-      thread_idx, 
-      warp_idx, 
-      lane_idx);
-
-    // Wait on the semaphore - this latency may have been covered by iterator construction
-    if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
-        
-      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
-      if (threadblock_tile_offset.k()) {
-        iterator_C = iterator_D;
-      }
-
-      semaphore.wait(threadblock_tile_offset.k());
-
-      __threadfence();
-    }
-
-    // Execute the epilogue operator to update the destination tensor.
-    epilogue(output_op, iterator_D, accumulators, iterator_C); 
-    
-    //
-    // Release the semaphore
-    //
-
-    if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
-      
-      int lock = 0;
-      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
-
-        // The final threadblock resets the semaphore for subsequent grids.
-        lock = 0;
-      }
-      else {
-        // Otherwise, the semaphore is incremented
-        lock = threadblock_tile_offset.k() + 1;
-      }
-
-      __threadfence();
-      semaphore.release(lock);
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sparse_gemm_with_absmax.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sparse_gemm_with_absmax.h
deleted file mode 100755
index f464e29cc..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sparse_gemm_with_absmax.h
+++ /dev/null
@@ -1,509 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Sparse GEMM kernel with an epilogue that computes the absolute maximum value of the output
-    and a pre-activation-function auxiliary output. The auxiliary output is also (optionally)
-    stored to global memory.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/kernel/params_sparse_base.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/semaphore.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate 
-  typename Epilogue_,             ///! Epilogue
-  typename ThreadblockSwizzle_,   ///! Threadblock swizzling function
-  bool SplitKSerial               ///! If true, code supporting split-K via serial reduction is enabled.
->
-struct SparseGemmWithAbsmax {
-
-  using Mma = Mma_;
-  using Epilogue = Epilogue_;
-  using OutputOp = typename Epilogue::OutputOp;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  static bool const kSplitKSerial = SplitKSerial;
-
-  static int const kSparse = Mma::kSparse;
-  static int const kMetaSizeInBits = Mma::kMetaSizeInBits;
-  static int const kMaxID2 = Mma::kMaxID2;
-  static int const kElementsPerElementE = Mma::kElementsPerElementE;
-
-  using ElementE = typename Mma::ElementE;
-  using LayoutE = typename Mma::LayoutE;
-
-  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
-
-  /// Warp count (concept: GemmShape)
-  using WarpCount = typename Mma::WarpCount;
-  static int const kThreadCount = 32 * WarpCount::kCount;
-
-  using ParamsA = typename Mma::IteratorA::Params;
-  using TensorRefA = typename Mma::IteratorA::TensorRef;
-  using ParamsB = typename Mma::IteratorB::Params;
-  using TensorRefB = typename Mma::IteratorB::TensorRef;
-  using ParamsE = typename Mma::IteratorE::Params;
-  using TensorRefE = typename Mma::IteratorE::TensorRef;
-
-  using ParamsC = typename Epilogue::OutputTileIterator::Params;
-  using TensorRefC = typename Epilogue::OutputTileIterator::TensorRef;
-  using ParamsD = typename Epilogue::OutputTileIterator::Params;
-  using TensorRefD = typename Epilogue::OutputTileIterator::TensorRef;
-  using ParamsAux = typename Epilogue::AuxOutputTileIterator::Params;
-  using TensorRefAux = typename Epilogue::AuxOutputTileIterator::TensorRef;
-
-  /// Argument structure
-  struct Arguments {
-
-    //
-    // Data members
-    //
-
-    GemmCoord problem_size;
-    TensorRefA ref_A;
-    TensorRefB ref_B;
-    TensorRefC ref_C;
-    TensorRefD ref_D;
-    TensorRefE ref_E;
-    TensorRefAux ref_Aux;
-    void* ptr_Vector;
-    typename LayoutC::Stride::Index ldr;
-
-    typename Epilogue::OutputOp::Params epilogue;
-    int split_k_slices;
-
-    //
-    // Methods
-    //
-
-    /// Default ctor
-    CUTLASS_HOST_DEVICE
-    Arguments(): problem_size(0, 0, 0), split_k_slices(1) {
-
-    }
-
-    /// Constructs an Arguments structure 
-    CUTLASS_HOST_DEVICE
-    Arguments(
-      GemmCoord problem_size_,
-      TensorRefA ref_A_,
-      TensorRefB ref_B_,
-      TensorRefC ref_C_,
-      TensorRefD ref_D_,
-      TensorRefE ref_E_,
-      TensorRefAux ref_Aux_,
-      void* ptr_Vector_,
-      typename LayoutC::Stride::Index ldr_,
-      typename OutputOp::Params epilogue_ = 
-        typename OutputOp::Params(),
-      int split_k_slices = 1
-    ):
-      problem_size(problem_size_),
-      ref_A(ref_A_),
-      ref_B(ref_B_),
-      ref_C(ref_C_),
-      ref_D(ref_D_),
-      ref_E(ref_E_),
-      ref_Aux(ref_Aux_),
-      ptr_Vector(ptr_Vector_),
-      ldr(ldr_),
-      epilogue(epilogue_),
-      split_k_slices(split_k_slices) {
-
-    }
-  };
-
-  /// Parameters structure
-  struct Params : public SparseParamsBase<
-      ThreadblockSwizzle, ParamsA, TensorRefA, ParamsB, TensorRefB,
-      ParamsE, TensorRefE> {
-
-    using Base = SparseParamsBase<
-        ThreadblockSwizzle, ParamsA, TensorRefA, ParamsB, TensorRefB,
-        ParamsE, TensorRefE>;
-
-    //
-    // Data members
-    //
-
-    ParamsC params_C;
-    TensorRefC ref_C;
-    ParamsD params_D;
-    TensorRefD ref_D;
-    ParamsAux params_Aux;
-    TensorRefAux ref_Aux;
-
-    void* ptr_Vector;
-    typename LayoutC::Stride::Index ldr;
-
-    typename OutputOp::Params output_op;
-    int *semaphore;
-
-    //
-    // Methods
-    //
-
-    CUTLASS_HOST_DEVICE
-    Params() { }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      cutlass::gemm::GemmCoord const & problem_size,
-      cutlass::gemm::GemmCoord const & grid_tiled_shape,
-      TensorRefA ref_A,
-      TensorRefB ref_B,
-      TensorRefC ref_C,
-      TensorRefD ref_D,
-      TensorRefE ref_E,
-      TensorRefAux ref_Aux,
-      void* ptr_Vector,
-      typename LayoutC::Stride::Index ldr,
-      typename OutputOp::Params output_op = typename OutputOp::Params(),
-      int *workspace = nullptr
-    ):
-      Base(problem_size, grid_tiled_shape, ref_A, ref_B, ref_E, Mma::Shape::kK),
-      params_C(ref_C.layout()),
-      ref_C(ref_C),
-      params_D(ref_D.layout()),
-      ref_D(ref_D),
-      output_op(output_op),
-      ref_Aux(ref_Aux),
-      params_Aux(ref_Aux.layout()),
-      ptr_Vector(ptr_Vector),
-      ldr(ldr) {
-    semaphore = workspace;
-    }
-  };
-
-  /// Shared memory storage structure
-  union SharedStorage {
-    typename Mma::SharedStorage main_loop;
-    typename Epilogue::SharedStorage epilogue;
-  };
-
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  SparseGemmWithAbsmax() { } 
-
-  /// Determines whether kernel satisfies alignment
-  static Status can_implement(
-      cutlass::gemm::GemmCoord const & problem_size,
-      typename Mma::IteratorA::TensorRef ref_A,
-      typename Mma::IteratorB::TensorRef ref_B,
-      typename Epilogue::OutputTileIterator::TensorRef ref_C,
-      typename Epilogue::OutputTileIterator::TensorRef ref_D,
-      typename Mma::IteratorE::TensorRef ref_E) {
-
-    static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
-    static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
-    static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
-    static int const kAlignmentE = Mma::IteratorE::AccessType::kElements;
-
-    if (!TensorRef_aligned(ref_A, kAlignmentA)) {
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (!TensorRef_aligned(ref_B, kAlignmentB)) {
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (!TensorRef_aligned(ref_C, kAlignmentC)) {
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (!TensorRef_aligned(ref_D, kAlignmentC)) {
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if (!TensorRef_aligned(ref_E, kAlignmentE)) {
-      return Status::kErrorMisalignedOperand;
-    }
-
-    if ((problem_size.m() % kAlignmentA) || ((problem_size.k() / kSparse) % kAlignmentA) ||
-      (problem_size.n() % kAlignmentB) || (problem_size.k() % kAlignmentB) ||
-      (problem_size.m() % kAlignmentC) || (problem_size.n() % kAlignmentC) ||
-      (problem_size.m() % kAlignmentE) || ((problem_size.k() / kSparse) % kAlignmentE)) {
-
-      return Status::kErrorMisalignedOperand;
-    }
-
-    // The k dimension has to be the multiple of the Threadblock k because out
-    // of bound meta data would be initialized to 0 by acync.zfill but 0 is not
-    // a valid meta data.
-    if (problem_size.k() % Mma::Shape::kK) {
-      return Status::kErrorMisalignedOperand;
-    }
-
-    // M dimension has to be multiple of 32 (sparse float) or 16 (sparse int) 
-    // because of the row reordering of operand E
-    static int const kAlignmentM = (sizeof(ElementE) == 2) ? 32 : 16;
-
-    if (problem_size.m() % kAlignmentM) {
-      return Status::kErrorMisalignedOperand;
-    }
-
-    return Status::kSuccess;
-  }
-
-  /// Executes one GEMM
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-
-    // Compute threadblock location
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord threadblock_tile_offset =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    // Early exit if CTA is out of range
-    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
-      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
-
-      return;
-    }
-
-    // Compute initial location in logical coordinates
-    cutlass::MatrixCoord tb_offset_A{
-      threadblock_tile_offset.m() * Mma::Shape::kM,
-      threadblock_tile_offset.k() * params.gemm_k_size / kSparse,
-    };
-
-    cutlass::MatrixCoord tb_offset_B{
-      threadblock_tile_offset.k() * params.gemm_k_size,
-      threadblock_tile_offset.n() * Mma::Shape::kN
-    };
-
-    cutlass::MatrixCoord tb_offset_E{
-      threadblock_tile_offset.m() * Mma::Shape::kM,
-      threadblock_tile_offset.k() * params.gemm_k_size / kSparse,
-    };
-
-    // Problem size is a function of threadblock index in the K dimension
-    int problem_size_k = min(
-      params.problem_size.k(), 
-      (threadblock_tile_offset.k() + 1) * params.gemm_k_size);
-
-    // Compute threadblock-scoped matrix multiply-add
-    int gemm_k_iterations = (problem_size_k - tb_offset_B.row() + Mma::Shape::kK - 1) / Mma::Shape::kK;
-
-    // Compute position within threadblock
-    int thread_idx = threadIdx.x;
-
-    // Construct iterators to A, B, and E operands
-    typename Mma::IteratorA iterator_A(
-      params.params_A,
-      params.ref_A.data(),
-      {params.problem_size.m(), problem_size_k / kSparse},
-      thread_idx,
-      tb_offset_A);
-
-    typename Mma::IteratorB iterator_B(
-      params.params_B,
-      params.ref_B.data(),
-      {problem_size_k, params.problem_size.n()},
-      thread_idx,
-      tb_offset_B);
-
-    typename Mma::IteratorE iterator_E(
-        params.params_E, params.ref_E.data(),
-        {params.problem_size.m(),
-         problem_size_k / kSparse / kElementsPerElementE},
-        thread_idx, tb_offset_E);
-
-    // Broadcast the warp_id computed by lane 0 to ensure dependent code
-    // is compiled as warp-uniform.
-    int warp_idx = canonical_warp_idx_sync();
-    int lane_idx = threadIdx.x % 32;
-
-    //
-    // Main loop
-    //
-
-    // Construct thread-scoped matrix multiply
-    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
-
-    typename Mma::FragmentC accumulators;
-
-    accumulators.clear();
-
-    if (!kSplitKSerial || gemm_k_iterations > 0) {
-      // Compute threadblock-scoped matrix multiply-add
-      mma(gemm_k_iterations, accumulators, iterator_A, iterator_B, iterator_E, accumulators);
-    }
-
-    //
-    // Epilogue
-    //
-
-    OutputOp output_op(params.output_op);
-
-    //
-    // Masked tile iterators constructed from members
-    //
-
-    threadblock_tile_offset =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    //assume identity swizzle
-    MatrixCoord threadblock_offset(
-      threadblock_tile_offset.m() * Mma::Shape::kM,
-      threadblock_tile_offset.n() * Mma::Shape::kN
-    );
-
-    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
-
-    // Construct the semaphore.
-    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
-
-    // If performing a reduction via split-K, fetch the initial synchronization
-    if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
-      
-      // Fetch the synchronization lock initially but do not block.
-      semaphore.fetch();
-
-      // Indicate which position in a serial reduction the output operator is currently updating
-      output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
-    }
-
-    typename Epilogue::ElementVector *ptr_Vector = static_cast<typename Epilogue::ElementVector *>(params.ptr_Vector);
-    // Move to appropriate location for this output tile
-    if (ptr_Vector) {
-      ptr_Vector += threadblock_offset.column() + threadblock_tile_offset.m() * params.ldr;
-    }
-
-    // Tile iterator loading from source tensor.
-    typename Epilogue::OutputTileIterator iterator_C(
-      params.params_C,
-      params.ref_C.data(),
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset
-    );
-
-    // Tile iterator writing to destination tensor.
-    typename Epilogue::OutputTileIterator iterator_D(
-      params.params_D,
-      params.ref_D.data(),
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset
-    );
-
-    // Tile iterator writing to auxiliary destination tensor.
-    typename Epilogue::AuxOutputTileIterator iterator_Aux(
-      params.params_Aux,
-      // Only the final block writes the auxiliary tensor
-      ((kSplitKSerial && params.grid_tiled_shape.k() > 1) &&
-          (params.grid_tiled_shape.k() != threadblock_tile_offset.k() + 1))
-          ? nullptr
-          : params.ref_Aux.data(),
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset
-    );
-
-    Epilogue epilogue(
-      shared_storage.epilogue, 
-      thread_idx, 
-      warp_idx, 
-      lane_idx);
-
-    // Wait on the semaphore - this latency may have been covered by iterator construction
-    if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
-        
-      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
-      if (threadblock_tile_offset.k()) {
-        iterator_C = iterator_D;
-      }
-
-      semaphore.wait(threadblock_tile_offset.k());
-
-      __threadfence();
-    }
-
-    // Execute the epilogue operator to update the destination tensor.
-    epilogue(output_op,
-             // Only the final block uses Vector
-             ((kSplitKSerial && params.grid_tiled_shape.k() > 1) &&
-              (params.grid_tiled_shape.k() != threadblock_tile_offset.k() + 1))
-                 ? nullptr
-                 : ptr_Vector,
-             iterator_D,
-             accumulators,
-             iterator_C,
-             iterator_Aux,
-             params.problem_size.mn(),
-             threadblock_offset);
-    
-    //
-    // Release the semaphore
-    //
-
-    if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
-      
-      int lock = 0;
-      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
-
-        // The final threadblock resets the semaphore for subsequent grids.
-        lock = 0;
-      }
-      else {
-        // Otherwise, the semaphore is incremented
-        lock = threadblock_tile_offset.k() + 1;
-      }
-
-      __threadfence();
-      semaphore.release(lock);
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sparse_gemm_with_visitor.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sparse_gemm_with_visitor.h
deleted file mode 100755
index 364804086..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sparse_gemm_with_visitor.h
+++ /dev/null
@@ -1,238 +0,0 @@
-
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Sparse GEMM with visitor.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/gemm/kernel/sparse_gemm.h"
-#include "cutlass/gemm/kernel/params_sparse_base.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Sparse Gemm that compute the epilogue visitor functor
-template <
-  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate 
-  typename Epilogue_,             ///! Epilogue
-  typename ThreadblockSwizzle_    ///! Threadblock swizzling function
->
-struct SparseGemmWithEpilogueVisitor : public SparseGemm<Mma_, Epilogue_, ThreadblockSwizzle_, false>  {
-
-  using Base = SparseGemm<Mma_, Epilogue_, ThreadblockSwizzle_, false>;
-
-  using Mma = Mma_;
-  using Epilogue = Epilogue_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-
-  using FusionCallbacks = typename Epilogue::FusionCallbacks;
-
-  using ParamsA = typename Mma::IteratorA::Params;
-  using TensorRefA = typename Mma::IteratorA::TensorRef;
-  using ParamsB = typename Mma::IteratorB::Params;
-  using TensorRefB = typename Mma::IteratorB::TensorRef;
-  using ParamsE = typename Mma::IteratorE::Params;
-  using TensorRefE = typename Mma::IteratorE::TensorRef;
-
-  static int const kSparse = Base::kSparse;
-  static int const kElementsPerElementE = Base::kElementsPerElementE;
-  using SharedStorage = typename Base::SharedStorage;
-
-  /// Parameters structure
-  struct Params : public SparseParamsBase<
-      ThreadblockSwizzle, ParamsA, TensorRefA, ParamsB, TensorRefB,
-      ParamsE, TensorRefE> {
-
-    using Base = SparseParamsBase<
-        ThreadblockSwizzle, ParamsA, TensorRefA, ParamsB, TensorRefB,
-        ParamsE, TensorRefE>;
-
-    //
-    // Data members
-    //
-
-    typename FusionCallbacks::Params output_op;
-    cute::Shape<int32_t,int32_t,int32_t> problem_shape;
-
-    //
-    // Methods
-    //
-
-    CUTLASS_HOST_DEVICE
-    Params() { }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      cutlass::gemm::GemmCoord const & problem_size,
-      cutlass::gemm::GemmCoord const & grid_tiled_shape,
-      typename Mma::IteratorA::TensorRef ref_A,
-      typename Mma::IteratorB::TensorRef ref_B,
-      typename Mma::IteratorE::TensorRef ref_E,
-      typename FusionCallbacks::Arguments output_op = typename FusionCallbacks::Arguments()
-    ):
-      Base(problem_size, grid_tiled_shape, ref_A, ref_B, ref_E, Mma::Shape::kK),
-      output_op(FusionCallbacks::to_underlying_arguments(problem_size, output_op, nullptr /*workspace*/)),
-      problem_shape(problem_size.m(), problem_size.n(), 1) {
-    }
-  };
-
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  SparseGemmWithEpilogueVisitor() { }
-
-  /// Executes one GEMM
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-
-    // Compute threadblock location
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord threadblock_tile_offset =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    // Early exit if CTA is out of range
-    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
-      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
-
-      return;
-    }
-
-    // Compute initial location in logical coordinates
-    cutlass::MatrixCoord tb_offset_A{
-      threadblock_tile_offset.m() * Mma::Shape::kM,
-      threadblock_tile_offset.k() * params.gemm_k_size / kSparse,
-    };
-
-    cutlass::MatrixCoord tb_offset_B{
-      threadblock_tile_offset.k() * params.gemm_k_size,
-      threadblock_tile_offset.n() * Mma::Shape::kN
-    };
-
-    cutlass::MatrixCoord tb_offset_E{
-      threadblock_tile_offset.m() * Mma::Shape::kM,
-      threadblock_tile_offset.k() * params.gemm_k_size / kSparse,
-    };
-
-    // Problem size is a function of threadblock index in the K dimension
-    int problem_size_k = min(
-      params.problem_size.k(), 
-      (threadblock_tile_offset.k() + 1) * params.gemm_k_size);
-
-    // Compute threadblock-scoped matrix multiply-add
-    int gemm_k_iterations = (problem_size_k - tb_offset_B.row() + Mma::Shape::kK - 1) / Mma::Shape::kK;
-
-    // Compute position within threadblock
-    int thread_idx = threadIdx.x;
-
-    // Construct iterators to A, B, and E operands
-    typename Mma::IteratorA iterator_A(
-      params.params_A,
-      params.ref_A.data(),
-      {params.problem_size.m(), problem_size_k / kSparse},
-      thread_idx,
-      tb_offset_A);
-
-    typename Mma::IteratorB iterator_B(
-      params.params_B,
-      params.ref_B.data(),
-      {problem_size_k, params.problem_size.n()},
-      thread_idx,
-      tb_offset_B);
-
-    typename Mma::IteratorE iterator_E(
-        params.params_E, params.ref_E.data(),
-        {params.problem_size.m(),
-         problem_size_k / kSparse / kElementsPerElementE},
-        thread_idx, tb_offset_E);
-
-    // Broadcast the warp_id computed by lane 0 to ensure dependent code
-    // is compiled as warp-uniform.
-    int warp_idx = canonical_warp_idx_sync();
-    int lane_idx = threadIdx.x % 32;
-
-    //
-    // Main loop
-    //
-
-    // Construct thread-scoped matrix multiply
-    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
-
-    typename Mma::FragmentC accumulators;
-
-    accumulators.clear();
-
-    if (gemm_k_iterations > 0) {
-      // Compute threadblock-scoped matrix multiply-add
-      mma(gemm_k_iterations, accumulators, iterator_A, iterator_B, iterator_E, accumulators);
-    }
-
-    //
-    // Masked tile iterators constructed from members
-    //
-
-    threadblock_tile_offset =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
-
-    //
-    // Epilogue
-    //
-
-    Epilogue epilogue(
-      params.output_op,
-      shared_storage.epilogue, 
-      thread_idx, 
-      warp_idx, 
-      lane_idx);
-
-    // Execute the epilogue operator to update the destination tensor.
-    epilogue(accumulators, threadblock_tile_offset, params.problem_shape, thread_idx);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/static_tile_scheduler.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/static_tile_scheduler.hpp
deleted file mode 100755
index 67d346e3b..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/static_tile_scheduler.hpp
+++ /dev/null
@@ -1,502 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/fast_math.h"
-#include "cutlass/gemm_coord.hpp"
-#include "cutlass/kernel_hardware_info.hpp"
-#include "cutlass/gemm/kernel/tile_scheduler_params.h"
-#include "cute/layout.hpp"
-#include "cute/tensor.hpp"
-#include "cute/arch/cluster_sm90.hpp"
-#include "cutlass/pipeline/pipeline.hpp"
-namespace cutlass::gemm::kernel::detail {
-
-///////////////////////////////////////////////////////////////////////////////
-
-// Users are not supposed to use this class directly.
-// This is a CRTP base class for the actual tile schedulers.
-template<class Subclass>
-class StaticPersistentTileScheduler {
-
-private:
-  uint64_t current_work_linear_idx_;
-  uint64_t total_grid_size_;
-
-public:
-  struct WorkTileInfo {
-    int32_t M_idx = 0;
-    int32_t N_idx = 0;
-    int32_t L_idx = 0;
-    bool is_valid_tile = false;
-
-    CUTLASS_HOST_DEVICE
-    bool
-    is_valid() const {
-      return is_valid_tile;
-    }
-
-    CUTLASS_HOST_DEVICE
-    static WorkTileInfo
-    invalid_work_tile() {
-      return {-1, -1, -1, false};
-    }
-
-    CUTLASS_HOST_DEVICE
-    bool
-    is_final_split(uint32_t k_tiles_per_output_tile) const {
-      return true;
-    }
-
-    CUTLASS_HOST_DEVICE
-    int32_t
-    reduction_subtile_idx() const {
-      return -1;
-    }
-  };
-
-  using Params = PersistentTileSchedulerSm90Params;
-  using RasterOrder = typename Params::RasterOrder;
-  using RasterOrderOptions = typename Params::RasterOrderOptions;
-  static constexpr bool IsDynamicPersistent = false;
-
-public:
-  struct Arguments {
-    int max_swizzle_size = 1;
-    RasterOrderOptions raster_order = RasterOrderOptions::Heuristic;
-  };
-
-  template <class ProblemShapeMNKL, class TileShape, class ClusterShape>
-  static Params
-  to_underlying_arguments(
-      ProblemShapeMNKL problem_shape_mnkl,
-      TileShape tile_shape,
-      ClusterShape cluster_shape,
-      [[maybe_unused]] KernelHardwareInfo const& hw_info,
-      Arguments const& arguments,
-      [[maybe_unused]] void* workspace=nullptr,
-      [[maybe_unused]] const uint32_t epilogue_subtile = 1,
-      [[maybe_unused]] uint32_t ktile_start_alignment_count = 1u) {
-
-    // We only need the tile and cluster shape during scheduler setup, so let FTAD do the magic
-    static_assert(cute::is_static<TileShape>::value);
-    static_assert(cute::is_static<ClusterShape>::value);
-
-    dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape_mnkl, tile_shape, cluster_shape);
-
-    Params params;
-    params.initialize(
-      problem_blocks,
-      to_gemm_coord(cluster_shape),
-      hw_info,
-      arguments.max_swizzle_size,
-      arguments.raster_order
-    );
-
-    return params;
-  }
-
-  CUTLASS_HOST_DEVICE
-  static bool
-  can_implement(Arguments const& args) {
-    return args.max_swizzle_size >= 1;
-  }
-
-  CUTLASS_HOST_DEVICE
-  StaticPersistentTileScheduler() { }
-
-  CUTLASS_DEVICE explicit StaticPersistentTileScheduler(Params const& params_) : scheduler_params(params_) {
-    // MSVC requires protecting use of CUDA-specific nonstandard syntax,
-    // like blockIdx and gridDim, with __CUDA_ARCH__.
-#if defined(__CUDA_ARCH__)
-    if (params_.raster_order_ == RasterOrder::AlongN) {
-      current_work_linear_idx_ = uint64_t(blockIdx.x) + uint64_t(blockIdx.y) * uint64_t(gridDim.x);
-    }
-    else {
-      current_work_linear_idx_ = uint64_t(blockIdx.x) * uint64_t(gridDim.y) + uint64_t(blockIdx.y);
-    }
-
-    total_grid_size_ = uint64_t(gridDim.x) * uint64_t(gridDim.y) * uint64_t(gridDim.z);
-#else
-    CUTLASS_ASSERT(false && "This line should never be reached");
-#endif
-  }
-
-  // Returns the initial work tile info that will be computed over
-  template <class ClusterShape>
-  CUTLASS_DEVICE
-  WorkTileInfo
-  initial_work_tile_info(ClusterShape cluster_shape) {
-    return get_current_work();
-  }
-
-  CUTLASS_DEVICE
-  WorkTileInfo
-  get_current_work() const {
-    return get_current_work_for_linear_idx(current_work_linear_idx_);
-  }
-
-  CUTLASS_DEVICE
-  WorkTileInfo
-  get_current_work_for_linear_idx(uint64_t linear_idx) const {
-    if (linear_idx >= scheduler_params.blocks_per_problem_) {
-      return WorkTileInfo::invalid_work_tile();
-    }
-
-    // Map worker's linear index into the CTA tiled problem shape to the corresponding MNL indices
-    uint64_t work_idx_l, remainder;
-    scheduler_params.divmod_batch_(work_idx_l, remainder, linear_idx);
-
-    uint64_t blk_per_grid_dim = scheduler_params.divmod_cluster_shape_minor_.divide(remainder);
-
-    auto [work_idx_m, work_idx_n] = Subclass::get_work_idx_m_and_n(blk_per_grid_dim,
-                                                         scheduler_params.divmod_cluster_shape_major_,
-                                                         scheduler_params.divmod_cluster_shape_minor_,
-                                                         scheduler_params.divmod_cluster_blk_major_,
-                                                         scheduler_params.log_swizzle_size_,
-                                                         scheduler_params.raster_order_);
-
-    return {work_idx_m, work_idx_n, static_cast<int32_t>(work_idx_l), true};
-  }
-
-  CUTLASS_DEVICE
-  void
-  advance_to_next_work(uint32_t advance_count = 1) {
-    current_work_linear_idx_ += total_grid_size_ * uint64_t(advance_count);
-  }
-
-  CUTLASS_DEVICE
-  bool is_last_tile(WorkTileInfo& work_tile_info, uint32_t advance_count = 1) const {
-    if (continue_current_work(work_tile_info)) {
-      return false;
-    }
-    return not get_current_work_for_linear_idx(
-        current_work_linear_idx_ + (total_grid_size_ * uint64_t(advance_count))
-    ).is_valid();
-  }
-
-  // Computes the linear index within a batch given M and N tile offsets within the batch.
-  // This essentially inverts the mapping performed in get_work_idx_m_and_n
-  static CUTLASS_DEVICE
-  uint64_t
-  get_linear_idx_from_m_and_n(
-    int32_t tile_m,
-    int32_t tile_n,
-    FastDivmodU64Pow2 const& divmod_cluster_shape_major,
-    FastDivmodU64Pow2 const& divmod_cluster_shape_minor,
-    FastDivmodU64 const& divmod_cluster_blk_major,
-    int32_t log_swizzle_size,
-    RasterOrder raster_order) {
-
-    uint64_t minor_work_idx, major_work_idx, cluster_minor_offset;
-    if (raster_order == RasterOrder::AlongN) {
-      minor_work_idx = static_cast<uint64_t>(tile_m);
-      major_work_idx = static_cast<uint64_t>(tile_n);
-      uint64_t cluster_m = divmod_cluster_shape_minor.divide(tile_m) * divmod_cluster_shape_minor.divisor;
-      cluster_minor_offset = tile_m - cluster_m;
-    }
-    else {
-      major_work_idx = static_cast<uint64_t>(tile_m);
-      minor_work_idx = static_cast<uint64_t>(tile_n);
-      uint64_t cluster_n = divmod_cluster_shape_minor.divide(tile_n) * divmod_cluster_shape_minor.divisor;
-      cluster_minor_offset = tile_n - cluster_n;
-    }
-
-    uint64_t cluster_idx_minor, cluster_idx_major, cluster_major_offset;
-    cluster_idx_minor = divmod_cluster_shape_minor.divide(minor_work_idx - cluster_minor_offset);
-    divmod_cluster_shape_major(cluster_idx_major, cluster_major_offset, major_work_idx);
-
-    uint64_t cluster_idx_minor_div_swizzle = cluster_idx_minor >> log_swizzle_size;
-    uint64_t offset = cluster_idx_minor & ((1 << log_swizzle_size) - 1);
-
-    uint64_t extra = cluster_idx_minor_div_swizzle * divmod_cluster_blk_major.divisor + cluster_idx_major;
-
-    uint64_t cluster_id = (extra << log_swizzle_size) | offset;
-    return (cluster_id * divmod_cluster_shape_major.divisor + cluster_major_offset) * divmod_cluster_shape_minor.divisor + cluster_minor_offset;
-  }
-
-  // Given the inputs, computes the total number of output blocks over which this problem will compute. 
-  // Note that this is only the logical size of our grid, not the physical grid we will actually launch.
-  template<class ProblemShapeMNKL, class BlockShape, class ClusterShape>
-  CUTLASS_HOST_DEVICE static
-  dim3
-  get_tiled_cta_shape_mnl(ProblemShapeMNKL problem_shape_mnkl, BlockShape cta_shape, ClusterShape cluster_shape) {
-    auto cta_m = cute::size(cute::ceil_div(cute::shape<0>(problem_shape_mnkl), cute::shape<0>(cta_shape)));
-    auto cta_n = cute::size(cute::ceil_div(cute::shape<1>(problem_shape_mnkl), cute::shape<1>(cta_shape)));
-
-    return Params::get_tiled_cta_shape_mnl(
-      to_gemm_coord(problem_shape_mnkl),
-      to_gemm_coord(cluster_shape),
-      cta_m, cta_n
-    );
-  }
-
-  // Reloaded interface that receives WorkTileInfo to deduce next work.
-  // Kernel helper function to get next work tile
-  CUTLASS_DEVICE
-  auto
-  fetch_next_work(WorkTileInfo work_tile_info) {
-    if (continue_current_work(work_tile_info)) {
-      return cute::make_tuple(work_tile_info, true);
-    }
-
-    advance_to_next_work();
-    return cute::make_tuple(get_current_work(), true);
-  }
-  
-  // Given the inputs, computes the total number of output blocks over which this problem will compute.
-  // Note that this is only the logical size of our grid, not the physical grid we will actually launch.
-  template<class ProblemShapeMNKL, class TileShape, class AtomThrShape, class ClusterShape>
-  CUTLASS_HOST_DEVICE static
-  dim3
-  get_tiled_cta_shape_mnl(ProblemShapeMNKL problem_shape_mnkl,
-                          TileShape tile_shape_mnk,
-                          AtomThrShape atom_thr_shape_mnk,
-                          ClusterShape cluster_shape_mnk) {
-    auto [tiles_m, tiles_n, tiles_l] = product_each(ceil_div(select<0,1,3>(problem_shape_mnkl), take<0,2>(tile_shape_mnk)));
-    auto cta_m = round_nearest(tiles_m * size<0>(atom_thr_shape_mnk), size<0>(cluster_shape_mnk));
-    auto cta_n = round_nearest(tiles_n * size<1>(atom_thr_shape_mnk), size<1>(cluster_shape_mnk));
-
-    return Params::get_tiled_cta_shape_mnl(
-      to_gemm_coord(problem_shape_mnkl),
-      to_gemm_coord(cluster_shape_mnk),
-      cta_m, cta_n
-    );
-  }
-
-  CUTLASS_DEVICE
-  static auto
-  work_tile_to_cta_coord(WorkTileInfo work_tile_info) {
-    // Get every cta coord in three dimensions of the cluster
-    auto [cta_m_in_cluster, cta_n_in_cluster, cta_l_in_cluster] = cute::block_id_in_cluster();
-    return make_coord(
-      work_tile_info.M_idx + static_cast<int32_t>(cta_m_in_cluster),
-      work_tile_info.N_idx + static_cast<int32_t>(cta_n_in_cluster),
-      _,
-      work_tile_info.L_idx + static_cast<int32_t>(cta_l_in_cluster)
-    );
-  }
-
-  CUTLASS_DEVICE
-  static auto
-  work_tile_to_cta_coord(WorkTileInfo work_tile_info, dim3 block_id_in_cluster) {
-    // Get every cta coord in three dimensions of the cluster
-    auto [cta_m_in_cluster, cta_n_in_cluster, cta_l_in_cluster] = block_id_in_cluster;
-    return make_coord(
-      work_tile_info.M_idx + static_cast<int32_t>(cta_m_in_cluster),
-      work_tile_info.N_idx + static_cast<int32_t>(cta_n_in_cluster),
-      _,
-      work_tile_info.L_idx + static_cast<int32_t>(cta_l_in_cluster)
-    );
-  }
-
-  // Given the inputs, computes the physical grid we should launch.
-  template<class ProblemShapeMNKL, class BlockShape, class ClusterShape>
-  CUTLASS_HOST_DEVICE static
-  dim3
-  get_grid_shape(
-      [[maybe_unused]] Params const& params,
-      ProblemShapeMNKL problem_shape_mnk,
-      BlockShape cta_shape,
-      ClusterShape cluster_shape,
-      KernelHardwareInfo hw_info,
-      Arguments arguments = Arguments{},
-      bool truncate_by_problem_size=true) {
-
-    auto problem_shape_mnkl = cute::append<4>(problem_shape_mnk, cute::Int<1>{});
-    dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape_mnkl, cta_shape, cluster_shape);
-
-    return Params::get_grid_shape(
-      problem_blocks,
-      to_gemm_coord(cluster_shape),
-      hw_info,
-      arguments.max_swizzle_size,
-      arguments.raster_order,
-      /* truncate_by_problem_size = */true
-    );
-  }
-
-  // Given the inputs, computes the physical grid we should launch.
-  template<class ProblemShapeMNKL, class TileShape, class AtomThrShape, class ClusterShape>
-  static dim3
-  get_grid_shape(
-      Params const& params,
-      ProblemShapeMNKL problem_shape_mnkl,
-      TileShape tile_shape_mnk,
-      AtomThrShape atom_thr_shape_mnk,
-      ClusterShape cluster_shape_mnk,
-      KernelHardwareInfo hw_info) {
-
-    dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape_mnkl, tile_shape_mnk, atom_thr_shape_mnk, cluster_shape_mnk);
-    Arguments args{};
-    if constexpr (!std::is_const_v<decltype(args.max_swizzle_size)>) {
-      args.max_swizzle_size = 1 << params.log_swizzle_size_;
-    }
-    args.raster_order = params.raster_order_ == RasterOrder::AlongN ? RasterOrderOptions::AlongN : RasterOrderOptions::AlongM;
-
-    return Params::get_grid_shape(
-      problem_blocks,
-      to_gemm_coord(cluster_shape_mnk),
-      hw_info,
-      args.max_swizzle_size,
-      args.raster_order,
-      /* truncate_by_problem_size = */true
-    );
-  }
-
-  // Convert CTA-level work tile info to cluster-level tile coord
-  CUTLASS_DEVICE
-  auto
-  work_tile_to_cluster_coord_mnkl(WorkTileInfo work_tile_info) const {
-    // TileScheduler works at CTA-level, kernel works at cluster-level
-    int m_coord = idx2crd(work_tile_info.M_idx / scheduler_params.cluster_shape_m_,
-                          scheduler_params.problem_tiles_m_);
-    int n_coord = idx2crd(work_tile_info.N_idx / scheduler_params.cluster_shape_n_,
-                          scheduler_params.problem_tiles_n_);
-    int l_coord = idx2crd(work_tile_info.L_idx,
-                          scheduler_params.problem_tiles_l_);
-    return make_coord(m_coord, n_coord, _, l_coord);
-  }
-
-  // Returns whether the block assigned this work should compute the epilogue for the corresponding
-  // output tile. For the basic tile scheduler, this is always true.
-  CUTLASS_HOST_DEVICE
-  static bool
-  compute_epilogue(WorkTileInfo const&, Params const&) {
-    return true;
-  }
-
-  CUTLASS_HOST_DEVICE
-  static bool
-  compute_epilogue(WorkTileInfo const&) {
-    return true;
-  }
-
-  // Performs the reduction across splits for a given output tile. Since this scheduler does
-  // not split output tiles, no reduction is needed.
-  template <class FrgTensorC>
-  CUTLASS_DEVICE
-  static void
-  fixup(Params const&, WorkTileInfo const&, FrgTensorC&, uint32_t, uint32_t) {}
-
-  // Performs the reduction across splits for a given output tile. No fixup is required for
-  // work units returned by this scheduler.
-  template <class FrgTensorC>
-  CUTLASS_DEVICE
-  void
-  fixup(WorkTileInfo const&, FrgTensorC&, uint32_t, uint32_t) const { }
-
-  // Returns whether the current WorkTileInfo passed in should continue to be used. Since
-  // this scheduler only schedules work in units of single, full output tiles, the WorkTileInfo
-  // passed in should not be used after having been processed.
-  CUTLASS_DEVICE
-  static bool
-  continue_current_work(WorkTileInfo&) {
-    return false;
-  }
-
-  template <class ProblemShapeMNKL, class TileShape, class Shape>
-  CUTLASS_DEVICE
-  auto
-  get_k_tile_iterator(WorkTileInfo const& work_tile_info, ProblemShapeMNKL problem_shape_MNKL, TileShape tile_shape, Shape) {
-    auto k_tiles = cute::ceil_div(cute::get<2>(problem_shape_MNKL), cute::get<2>(tile_shape));
-    return cute::make_coord_iterator(k_tiles);
-  }
-
-  template <class ProblemShape, class TileShape>
-  CUTLASS_HOST_DEVICE
-  static int
-  get_work_k_tile_count(WorkTileInfo const& work_tile_info, ProblemShape problem_shape, TileShape tile_shape) {
-    // All work units returned by this scheduler cover the entire K iteration
-    // space of the output tile assigned to the work unit.
-    return cute::size(cute::ceil_div(cute::get<2>(problem_shape), cute::get<2>(tile_shape)));
-  }
-
-  CUTLASS_HOST_DEVICE
-  static uint32_t
-  get_work_k_tile_start(WorkTileInfo const&) {
-    // All work units returned by this scheduler start from K tile 0
-    return 0u;
-  }
-
-  CUTLASS_DEVICE
-  static bool
-  need_separate_reduction(Params const& params) {
-    return false;
-  }
-
-  CUTLASS_DEVICE
-  bool
-  is_work_tile_for_reduction(WorkTileInfo const& work_tile_info, Params const& params) {
-    return false;
-  }
-
-  template <class FrgTensorC>
-  CUTLASS_DEVICE
-  void
-  separate_reduction(
-    Params const& params,
-    WorkTileInfo const& work_tile_info,
-    FrgTensorC& accumulators,
-    uint32_t num_barriers,
-    uint32_t barrier_idx) {
-  }
-
-  // Shares the accumulator set with peers in the global workspace
-  template <class FrgTensorC>
-  CUTLASS_DEVICE
-  static void
-  share(
-    Params const& params,
-    WorkTileInfo const& work_tile_info,
-    FrgTensorC& accumulators,
-    uint32_t num_barriers,
-    uint32_t barrier_idx) {
-  }
-
-  CUTLASS_DEVICE
-  static bool
-  valid_warpgroup_in_work_tile(WorkTileInfo const& work_tile_info) {
-    return true;
-  }
-
-  CUTLASS_DEVICE
-  static bool
-  requires_separate_reduction(Params const& params) {
-    return false;
-  }
-
-public:
-  // Sink scheduler params as a member
-  Params scheduler_params;
-};
-
-} // namespace cutlass::gemm::kernel::detail
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/symm_universal.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/symm_universal.h
deleted file mode 100755
index b51cc6ede..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/symm_universal.h
+++ /dev/null
@@ -1,675 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-
-*/
-
-#pragma once
-
-#include "cutlass/blas3.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/complex.h"
-#include "cutlass/semaphore.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Mma1_,                 ///! Threadblock-scoped triangular matrix multiply-accumulate (A*B or B*A)
-  typename Mma2_,                 ///! Threadblock-scoped triangular matrix multiply-accumulate (AT*B or B*AT)
-  typename Epilogue_,             ///! Epilogue
-  typename ThreadblockSwizzle_,   ///! Threadblock swizzling function
-  SideMode SideMode_,             ///! Side Mode for the kernel (kLeft or kRight)
-  FillMode FillMode_              ///! Fill Mode for triangular matrix (kLower or kUpper)
->
-struct SymmUniversal {
-public:
-
-  using Mma1 = Mma1_;
-  using Mma2 = Mma2_;
-  using Epilogue = Epilogue_;
-  using EpilogueOutputOp = typename Epilogue::OutputOp;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-
-  using ElementA = typename Mma1::IteratorA::Element;
-  using ElementB = typename Mma1::IteratorB::Element;
-
-  // Mma1 (TRMM - with diagonal: C_tmp = alpha * A * B)
-  using LayoutA = typename Mma1::IteratorA::Layout;
-  using LayoutBT = typename Mma1::IteratorB::Layout;
-  static ComplexTransform const kMma1TransformA = Mma1::kTransformA;
-  static ComplexTransform const kMma1TransformB = Mma1::kTransformB;
-
-  // Mma2 (TRMM - withOUT diagonal: alpha * AT * B)
-  using LayoutB = typename Mma2::IteratorA::Layout;
-  using LayoutAT = typename Mma2::IteratorB::Layout;
-  static ComplexTransform const kMma2TransformA = Mma2::kTransformA;
-  static ComplexTransform const kMma2TransformB = Mma2::kTransformB;
-
-  // Common type definitions for Mma1 and Mma2
-  using Operator = typename Mma1::Operator;
-  using OperatorClass = typename Mma1::Operator::OperatorClass;
-  using ThreadblockShape = typename Mma1::Shape;
-  using WarpShape = typename Mma1::Operator::Shape;
-  using InstructionShape = typename Mma1::Policy::Operator::InstructionShape;
-  using ArchTag = typename Mma1::ArchTag;
-
-  static int const kStages = Mma1::kStages;
-  static int const kAlignmentA = Mma1::IteratorA::AccessType::kElements;
-  static int const kAlignmentB = Mma1::IteratorB::AccessType::kElements;
-
-  // Output related typedefinitions
-  using ElementC = typename Epilogue::OutputTileIterator::Element;
-  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
-  static SideMode const kSideModeA = SideMode_;
-  static FillMode const kFillModeA = FillMode_;
-  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
-
-
-  /// Warp count (concept: GemmShape)
-  using WarpCount = typename Mma1::WarpCount;
-  static int const kThreadCount = 32 * WarpCount::kCount;
-
-
-  //
-  // Structures
-  //
-
-  /// Argument structure
-  struct Arguments {
-
-    //
-    // Data members
-    //
-
-    GemmUniversalMode mode = GemmUniversalMode::kGemm;
-    GemmCoord problem_size{};
-    int batch_count{1};
-
-    typename EpilogueOutputOp::Params epilogue{};
-
-    void const * ptr_A{nullptr};
-    void const * ptr_B{nullptr};
-    void const * ptr_C{nullptr};
-    void * ptr_D{nullptr};
-
-    int64_t batch_stride_A{0};
-    int64_t batch_stride_B{0};
-    int64_t batch_stride_C{0};
-    int64_t batch_stride_D{0};
-
-    typename LayoutA::Stride::Index lda{0};
-    typename LayoutB::Stride::Index ldb{0};
-    typename LayoutC::Stride::Index ldc{0};
-    typename LayoutC::Stride::Index ldd{0};
-
-    //
-    // Methods
-    //
-    
-    Arguments() = default;
-
-    /// constructs an arguments structure
-    Arguments(
-      GemmUniversalMode mode,
-      GemmCoord problem_size,
-      int batch_count,
-      typename EpilogueOutputOp::Params epilogue,
-      void const * ptr_A,
-      void const * ptr_B,
-      void const * ptr_C,
-      void * ptr_D,
-      int64_t batch_stride_A,
-      int64_t batch_stride_B,
-      int64_t batch_stride_C,
-      int64_t batch_stride_D,
-      typename LayoutA::Stride::Index lda,
-      typename LayoutB::Stride::Index ldb,
-      typename LayoutC::Stride::Index ldc,
-      typename LayoutC::Stride::Index ldd
-    ):
-      mode(mode), 
-      problem_size(problem_size), 
-      batch_count(batch_count),
-      epilogue(epilogue), 
-      ptr_A(ptr_A), ptr_B(ptr_B), ptr_C(ptr_C), ptr_D(ptr_D), 
-      batch_stride_A(batch_stride_A), batch_stride_B(0),
-      batch_stride_C(batch_stride_C), batch_stride_D(batch_stride_D), 
-      lda(lda), ldb(ldb), ldc(ldc), ldd(ldd) {
-
-      }
-
-    /// Returns arguments for the transposed problem sizes
-    Arguments transposed_problem_size() const {
-      Arguments args(*this);
-
-      std::swap(args.problem_size.m(), args.problem_size.n());
-
-      return args;
-    }
-
-    /// Returns arguments for the transposed matrices
-    Arguments swapped_matrices() const {
-      Arguments args(*this);
-
-      std::swap(args.ptr_A, args.ptr_B);
-      std::swap(args.lda, args.ldb);
-      std::swap(args.batch_stride_A, args.batch_stride_B);
-
-      return args;
-    }
-  };
-
-  //
-  // Structure for precomputing values in host memory and passing to kernels
-  //
-
-  /// Parameters structure
-  struct Params {
-
-    cutlass::gemm::GemmCoord problem_size{};
-    cutlass::gemm::GemmCoord grid_tiled_shape{};
-    int swizzle_log_tile{0};
-    
-    // Mma1 Iterator A and B params
-    typename Mma1::IteratorA::Params params_A_mma1{};
-    typename Mma1::IteratorB::Params params_B_mma1{};
-
-    // Mma2 Iterator A and B params 
-    typename Mma2::IteratorA::Params params_A_mma2{};
-    typename Mma2::IteratorB::Params params_B_mma2{};
-
-    typename Epilogue::OutputTileIterator::Params params_C{};
-    typename Epilogue::OutputTileIterator::Params params_D{};
-    
-    typename EpilogueOutputOp::Params output_op{};
-
-    GemmUniversalMode mode = cutlass::gemm::GemmUniversalMode::kGemm;
-    int batch_count {0};
-    int gemm_k_size {0};
-
-    void * ptr_A{nullptr};
-    void * ptr_B{nullptr};
-    void * ptr_C{nullptr};
-    void * ptr_D{nullptr};
-
-    int64_t batch_stride_A {0};
-    int64_t batch_stride_B {0};
-    int64_t batch_stride_C {0};
-    int64_t batch_stride_D {0};
-
-    int *semaphore{nullptr};
-
-    //
-    // Methods
-    //
-    Params() = default;
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      Arguments const &args,
-      cutlass::gemm::GemmCoord const & grid_tiled_shape,
-      int gemm_k_size,
-      void *workspace = nullptr
-    ):
-      problem_size(args.problem_size),
-      grid_tiled_shape(grid_tiled_shape),
-      swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)),
-      params_A_mma1(args.lda),
-      params_B_mma1(args.ldb),
-      params_A_mma2(args.lda),
-      params_B_mma2(args.ldb),
-      params_C(args.ldc),
-      params_D(args.ldd),
-      output_op(args.epilogue),
-      mode(args.mode),
-      batch_count(args.batch_count),
-      gemm_k_size(gemm_k_size),
-      ptr_A(const_cast<void *>(args.ptr_A)),
-      ptr_B(const_cast<void *>(args.ptr_B)),
-      ptr_C(const_cast<void *>(args.ptr_C)),
-      ptr_D(const_cast<void *>(args.ptr_D)),
-      batch_stride_A(args.batch_stride_A),
-      batch_stride_B(args.batch_stride_B),
-      batch_stride_C(args.batch_stride_C),
-      batch_stride_D(args.batch_stride_D),
-      semaphore(static_cast<int *>(workspace)) {
-    }
-
-    CUTLASS_HOST_DEVICE
-    void update(
-      Arguments const &args,
-      void *workspace = nullptr) {
-
-      ptr_A = const_cast<void *>(args.ptr_A);
-      ptr_B = const_cast<void *>(args.ptr_B);
-      ptr_C = const_cast<void *>(args.ptr_C);
-      ptr_D = args.ptr_D;
-
-      output_op = args.epilogue;
-
-      semaphore = static_cast<int *>(workspace);
-    }
-
-  };
-
-  /// Shared memory storage structure
-  union SharedStorage {
-    typename Mma1::SharedStorage mma1_main_loop;
-    typename Mma2::SharedStorage mma2_main_loop;
-    typename Epilogue::SharedStorage epilogue;
-  };
-
-public:
-
-  //
-  // Methods
-  //
-
-  CUTLASS_DEVICE
-  SymmUniversal() { } 
-
-  /// Determines whether kernel satisfies alignment
-  static Status can_implement(
-    cutlass::gemm::GemmCoord const & problem_size) {
-
-    static int const kAlignmentA = Mma1::IteratorA::AccessType::kElements;
-    static int const kAlignmentB = Mma1::IteratorB::AccessType::kElements;
-    static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
-
-    if ((problem_size.m() % kAlignmentA) || (problem_size.k() % kAlignmentA) ||
-      (problem_size.n() % kAlignmentB) || (problem_size.k() % kAlignmentB) ||
-      (problem_size.m() % kAlignmentC) || (problem_size.n() % kAlignmentC)) {
-
-      return Status::kErrorMisalignedOperand;
-    }
-
-    return Status::kSuccess;
-  }
-
-  static Status can_implement(Arguments const &args) {
-    return can_implement(args.problem_size);
-  }
-
-  /// Executes two GEMM
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-
-    // Compute threadblock location
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord threadblock_tile_offset =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    // Early exit if CTA is out of range
-    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
-      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
-      return;
-    }
-   
-    int offset_k = 0;
-    int problem_size_k = params.problem_size.k();
-
-    ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A); 
-    ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);
-
-    //
-    // Fetch pointers based on mode.
-    //
-    if (params.mode == GemmUniversalMode::kGemm || 
-      params.mode == GemmUniversalMode::kGemmSplitKParallel) {
-
-      if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {
-
-        problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size; 
-      }
-
-      offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
-    }
-
-    __syncthreads();
-
-    // Compute initial location in logical coordinates
-    cutlass::MatrixCoord tb_offset_MxK_mma1{
-      threadblock_tile_offset.m() * Mma1::Shape::kM,
-      offset_k,
-    };
-
-    cutlass::MatrixCoord tb_offset_KxN_mma1{
-      offset_k,
-      threadblock_tile_offset.n() * Mma1::Shape::kN
-    };
-
-    cutlass::MatrixCoord tb_offset_MxK_mma2{
-      threadblock_tile_offset.m() * Mma1::Shape::kM,
-      offset_k,
-    };
-
-    cutlass::MatrixCoord tb_offset_KxN_mma2{
-      offset_k,
-      threadblock_tile_offset.n() * Mma1::Shape::kN
-    };
-
-    // Compute position within threadblock
-    int thread_idx = threadIdx.x;
-
-    // Broadcast the warp_id computed by lane 0 to ensure dependent code
-    // is compiled as warp-uniform.
-    int warp_idx = canonical_warp_idx_sync();
-
-    int lane_idx = threadIdx.x % 32;
-
-    //
-    // Main loop
-    //
-
-    // Construct thread-scoped matrix multiply for Mma1
-    Mma1 mma1(shared_storage.mma1_main_loop, thread_idx, warp_idx, lane_idx);
-
-    // Construct thread-scoped matrix multiply for Mma2
-    Mma2 mma2(shared_storage.mma2_main_loop, thread_idx, warp_idx, lane_idx);
-
-    typename Mma1::FragmentC accumulators;
-
-    accumulators.clear();
-
-    // Compute threadblock-scoped matrix multiply-add
-    int gemm_k_iterations = (problem_size_k - offset_k + Mma1::Shape::kK - 1) / Mma1::Shape::kK;
-    int gemm_k_iterations_mma1 = gemm_k_iterations;
-    int gemm_k_iterations_mma2 = gemm_k_iterations;
-
-
-    /******************************************************************************************************
-     * SYMM (Side Mode, Fill Mode) is made of two TRMMs:
-      First TRMM (Mma1: Side Mode, Fill Mode, Non-Unit Diag): (A * B) or (B * A)
-      Second TRMM (Mma2: Side Mode, Inverted Fill Mode, Unit Diag): (AT * B) or (B * AT)
-
-     * For the first TRMM (Mma1) of SYMM, the following method is used to calculate the k-iterations:
-      First two cases: (Left Side, Lower Fill) and (Right Side, Upper Fill) are transpose of each other
-        - (Left Side, Lower Fill): calculate bottom of the CTA tile,  then find the k-iterations 
-                                    needed to process all elements till that coordinate.
-        - (Right Side, Upper Fill): calculate right end of the CTA tile,  then find the k-iterations 
-                                    needed to process all elements till that coordinate.
-
-      Last two cases: (Left Side, Upper Fill) and (Right Side, Lower Fill) are transpose of each other
-        - (Left Side, Upper Fill): calculate the top of the CTA tile, then find k-iterations 
-                                   that can be skipped for all elements of this tile.
-        - (Right Side, Lower Fill): calculate the left start of the CTA tile, then find k-iterations 
-                                    that can be skipped for all elements of this tile.
-
-      * For the second TRMM (Mma2) of SYMM, the k-iterations and threadblock offsets are calculated 
-        the same way as the first TRMM (Mma1) of same side mode but with inverted fill mode. 
-        For example, if the first TRMM is left sided with lower fill, the second TRMM would be 
-        left sided with upper fill.
-    ********************************************************************************************************/
-
-    if (kSideModeA == SideMode::kLeft && kFillModeA == FillMode::kLower) {
-
-      int k_iterations_till_diagonal_mma1 = ((threadblock_tile_offset.m() + 1) * Mma1::Shape::kM + Mma1::Shape::kK - 1) / Mma1::Shape::kK;
-      if (k_iterations_till_diagonal_mma1 < gemm_k_iterations) {
-        gemm_k_iterations_mma1  = k_iterations_till_diagonal_mma1;
-      }
-      
-      int k_iterations_till_diagonal_mma2 = ((threadblock_tile_offset.m()) * Mma1::Shape::kM) / Mma1::Shape::kK;
-      if (k_iterations_till_diagonal_mma2 != 0) {
-        tb_offset_MxK_mma2 += cutlass::MatrixCoord({0, k_iterations_till_diagonal_mma2 * Mma1::Shape::kK});
-        tb_offset_KxN_mma2 += cutlass::MatrixCoord({k_iterations_till_diagonal_mma2 * Mma1::Shape::kK, 0});
-        gemm_k_iterations_mma2 -= k_iterations_till_diagonal_mma2;
-      }
-
-    } else if (kSideModeA == SideMode::kRight && kFillModeA == FillMode::kUpper) {
-
-      int k_iterations_till_diagonal_mma1 = ((threadblock_tile_offset.n() + 1) * Mma1::Shape::kN + Mma1::Shape::kK - 1) / Mma1::Shape::kK;
-      if (k_iterations_till_diagonal_mma1 < gemm_k_iterations) {
-        gemm_k_iterations_mma1  = k_iterations_till_diagonal_mma1;
-      }
-
-      int k_iterations_till_diagonal_mma2 = ((threadblock_tile_offset.n()) * Mma1::Shape::kN) / Mma1::Shape::kK;
-      if (k_iterations_till_diagonal_mma2 != 0) {
-        tb_offset_MxK_mma2 += cutlass::MatrixCoord({0, k_iterations_till_diagonal_mma2 * Mma1::Shape::kK});
-        tb_offset_KxN_mma2 += cutlass::MatrixCoord({k_iterations_till_diagonal_mma2 * Mma1::Shape::kK, 0});
-        gemm_k_iterations_mma2 -= k_iterations_till_diagonal_mma2;
-      }
-
-    } else if (kSideModeA == SideMode::kLeft && kFillModeA == FillMode::kUpper) {
-
-      int k_iterations_till_diagonal_mma1 = ((threadblock_tile_offset.m()) * Mma1::Shape::kM) / Mma1::Shape::kK;
-      if (k_iterations_till_diagonal_mma1 != 0) {
-        tb_offset_MxK_mma1 += cutlass::MatrixCoord({0, k_iterations_till_diagonal_mma1 * Mma1::Shape::kK});
-        tb_offset_KxN_mma1 += cutlass::MatrixCoord({k_iterations_till_diagonal_mma1 * Mma1::Shape::kK, 0});
-        gemm_k_iterations_mma1  -= k_iterations_till_diagonal_mma1;
-      }
-
-      int k_iterations_till_diagonal_mma2 = ((threadblock_tile_offset.m() + 1) * Mma1::Shape::kM + Mma1::Shape::kK - 1) / Mma1::Shape::kK;
-      if (k_iterations_till_diagonal_mma2 < gemm_k_iterations) {
-        gemm_k_iterations_mma2  = k_iterations_till_diagonal_mma2;
-      }      
-
-    } else if (kSideModeA == SideMode::kRight && kFillModeA == FillMode::kLower) {
-
-      int k_iterations_till_diagonal_mma1 = ((threadblock_tile_offset.n()) * Mma1::Shape::kN) / Mma1::Shape::kK;
-
-      if (k_iterations_till_diagonal_mma1 != 0) {
-        tb_offset_MxK_mma1 += cutlass::MatrixCoord({0, k_iterations_till_diagonal_mma1 * Mma1::Shape::kK});
-        tb_offset_KxN_mma1 += cutlass::MatrixCoord({k_iterations_till_diagonal_mma1 * Mma1::Shape::kK, 0});
-        gemm_k_iterations_mma1 -= k_iterations_till_diagonal_mma1;
-      }
-
-      int k_iterations_till_diagonal_mma2 = ((threadblock_tile_offset.n() + 1) * Mma1::Shape::kN + Mma1::Shape::kK - 1) / Mma1::Shape::kK;
-      if (k_iterations_till_diagonal_mma2 < gemm_k_iterations) {
-        gemm_k_iterations_mma2  = k_iterations_till_diagonal_mma2;
-      }
-
-    }
-
-    // Construct iterators to A and B operands for Mma1
-    typename Mma1::IteratorA iterator_A_mma1(
-      params.params_A_mma1,
-      ptr_A,
-      {params.problem_size.m(), problem_size_k},
-      thread_idx,
-      tb_offset_MxK_mma1);
-
-    typename Mma1::IteratorB iterator_B_mma1(
-      params.params_B_mma1,
-      ptr_B,
-      {problem_size_k, params.problem_size.n()},
-      thread_idx,
-      tb_offset_KxN_mma1);
-
-    // Construct iterators to A and B operands for Mma2
-    typename Mma2::IteratorA iterator_A_mma2(
-      params.params_A_mma2,
-      ptr_A,
-      {params.problem_size.m(), problem_size_k},
-      thread_idx,
-      tb_offset_MxK_mma2);
-
-    typename Mma2::IteratorB iterator_B_mma2(
-      params.params_B_mma2,
-      ptr_B,
-      {problem_size_k, params.problem_size.n()},
-      thread_idx,
-      tb_offset_KxN_mma2);
-
-    // Compute threadblock-scoped matrix multiply-add (A x B) or (B x A)
-    mma1(
-      gemm_k_iterations_mma1, 
-      accumulators, 
-      iterator_A_mma1, 
-      iterator_B_mma1, 
-      accumulators);
-
-    // Compute threadblock-scoped matrix multiply-add (AT x B) or (B x AT)
-    mma2(
-      gemm_k_iterations_mma2, 
-      accumulators, 
-      iterator_A_mma2, 
-      iterator_B_mma2, 
-      accumulators);
-
-    //
-    // Epilogue
-    //
-
-    EpilogueOutputOp output_op(params.output_op);
-
-    //
-    // Masked tile iterators constructed from members
-    //
-
-    threadblock_tile_offset =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    //assume identity swizzle
-    MatrixCoord threadblock_offset(
-      threadblock_tile_offset.m() * Mma1::Shape::kM,
-      threadblock_tile_offset.n() * Mma1::Shape::kN
-    );
-
-    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
-
-    ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C); 
-    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
-
-    //
-    // Fetch pointers based on mode.
-    //
-    
-    // Construct the semaphore.
-    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
-
-    if (params.mode == GemmUniversalMode::kGemm) {
-
-      // If performing a reduction via split-K, fetch the initial synchronization
-      if (params.grid_tiled_shape.k() > 1) {
-        
-        // Fetch the synchronization lock initially but do not block.
-        semaphore.fetch();
-
-        // Indicate which position in a serial reduction the output operator is currently updating
-        output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
-      }
-    }
-    else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) {
-      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
-    }
-    else if (params.mode == GemmUniversalMode::kBatched) {
-      ptr_C += threadblock_tile_offset.k() * params.batch_stride_C;
-      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
-    }
-    else if (params.mode == GemmUniversalMode::kArray) {
-      ptr_C = static_cast<ElementC * const *>(params.ptr_C)[threadblock_tile_offset.k()];
-      ptr_D = static_cast<ElementC * const *>(params.ptr_D)[threadblock_tile_offset.k()];
-    }
-
-    // Tile iterator loading from source tensor.
-    typename Epilogue::OutputTileIterator iterator_C(
-      params.params_C,
-      ptr_C,
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset
-    );
-
-    // Tile iterator writing to destination tensor.
-    typename Epilogue::OutputTileIterator iterator_D(
-      params.params_D,
-      ptr_D,
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset
-    );
-
-    Epilogue epilogue(
-      shared_storage.epilogue, 
-      thread_idx, 
-      warp_idx, 
-      lane_idx);
-
-    // Wait on the semaphore - this latency may have been covered by iterator construction
-    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
-        
-      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
-      if (threadblock_tile_offset.k()) {
-        iterator_C = iterator_D;
-      }
-
-      semaphore.wait(threadblock_tile_offset.k());
-
-      __threadfence();
-    }
-
-    // Execute the epilogue operator to update the destination tensor.
-    epilogue(
-      output_op, 
-      iterator_D, 
-      accumulators, 
-      iterator_C); 
-    
-    //
-    // Release the semaphore
-    //
-
-    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) { 
-
-      int lock = 0;
-      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
-
-        // The final threadblock resets the semaphore for subsequent grids.
-        lock = 0;
-      }
-      else {
-        // Otherwise, the semaphore is incremented
-        lock = threadblock_tile_offset.k() + 1;
-      }
-      
-      semaphore.release(lock);
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/tile_scheduler.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/tile_scheduler.hpp
deleted file mode 100755
index 2d9b63ffe..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/tile_scheduler.hpp
+++ /dev/null
@@ -1,149 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#pragma once
-
-/*! \file
-    \brief Utilities for selecting default tile schedulers
-*/
-
-#include "cutlass/arch/arch.h"
-#include "cutlass/detail/dependent_false.hpp"
-#include "cutlass/gemm/kernel/sm90_tile_scheduler.hpp"
-#include "cutlass/gemm/kernel/sm90_tile_scheduler_stream_k.hpp"
-#include "cutlass/gemm/kernel/sm90_tile_scheduler_group.hpp"
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm {
-
-////////////////////////////////////////////////////////////////////////////////
-
-//
-// Tags for specifying tile schedulers
-//
-
-struct PersistentScheduler { };
-
-struct StreamKScheduler { };
-
-struct GroupScheduler { }; // Only used for Grouped GEMMs
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::gemm
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm::kernel::detail {
-
-//
-// Selectors mapping tile scheduler tag and arch tag to a tile scheduler class
-//
-
-template <
-  class TileSchedulerTag,
-  class ArchTag,
-  class TileShape,
-  class ClusterShape
-  , class ProblemShapeType = void
->
-struct TileSchedulerSelector {
-  static_assert(cutlass::detail::dependent_false<ArchTag>,
-      "Could not select a tile scheduler for given parameters.");
-};
-
-template <
-  class ArchTag,
-  class TileShape,
-  class ClusterShape
->
-struct TileSchedulerSelector<
-    PersistentScheduler,
-    ArchTag,
-    TileShape,
-    ClusterShape
-  > {
-  using Scheduler = PersistentTileSchedulerSm90;
-};
-
-// Default (void) for Sm90 maps to PersistentTileSchedulerSm90
-template <
-  class ArchTag,
-  class TileShape,
-  class ClusterShape
->
-struct TileSchedulerSelector<
-    void,
-    ArchTag,
-    TileShape,
-    ClusterShape
-  > {
-  using Scheduler = typename TileSchedulerSelector<
-      PersistentScheduler,
-      ArchTag,
-      TileShape,
-      ClusterShape
-  >::Scheduler;
-};
-
-template <
-  class TileShape,
-  class ClusterShape
->
-struct TileSchedulerSelector<
-    StreamKScheduler,
-    arch::Sm90,
-    TileShape,
-    ClusterShape
-  > {
-  using Scheduler = PersistentTileSchedulerSm90StreamK<TileShape, ClusterShape>;
-};
-
-template <
-  class TileShape,
-  class ClusterShape
-  , class GroupProblemShape
->
-struct TileSchedulerSelector<
-    GroupScheduler,
-    arch::Sm90,
-    TileShape,
-    ClusterShape
-    , GroupProblemShape
-  > {
-  using Scheduler = PersistentTileSchedulerSm90Group<GroupProblemShape>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::gemm::kernel::detail
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/tile_scheduler_params.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/tile_scheduler_params.h
deleted file mode 100755
index 0972731c2..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/tile_scheduler_params.h
+++ /dev/null
@@ -1,1535 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#pragma once
-
-/*! \file
-    \brief Parameters structures for persistent tile schedulers
-*/
-
-#include "cutlass/coord.h"
-#include "cutlass/kernel_hardware_info.h"
-#include "cutlass/workspace.h"
-#include "cutlass/platform/platform.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/gemm_coord.h"
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-namespace detail {
-
-////////////////////////////////////////////////////////////////////////////////
-
-//
-// Parameters for SM90 tile schedulers
-//
-
-// Parameters for SM90 persistent tile scheduler
-struct PersistentTileSchedulerSm90Params {
-
-  enum class RasterOrder {
-    AlongM,
-    AlongN
-  };
-
-  enum class RasterOrderOptions {
-    Heuristic,
-    AlongM,
-    AlongN
-  };
-
-  FastDivmodU64Pow2 divmod_cluster_shape_major_{};
-  FastDivmodU64Pow2 divmod_cluster_shape_minor_{};
-  FastDivmodU64 divmod_batch_{};
-  FastDivmodU64 divmod_cluster_blk_major_{};
-
-  uint64_t blocks_per_problem_ = 0;
-  int32_t log_swizzle_size_ = 0;
-  RasterOrder raster_order_ = RasterOrder::AlongN;
-
-  uint32_t problem_tiles_m_ = 0;
-  uint32_t problem_tiles_n_ = 0;
-  uint32_t problem_tiles_l_ = 0;
-  uint32_t cluster_shape_m_ = 0;
-  uint32_t cluster_shape_n_ = 0;
-
-  // Initializes members. This variant of the method should only be used when
-  // problem_shape and tile_shape contain modes of only rank 1.
-  void
-  initialize(
-    BatchedGemmCoord problem_shape,
-    GemmCoord tile_shape,
-    GemmCoord cluster_shape,
-    KernelHardwareInfo const& hw_info,
-    int max_swizzle_size,
-    RasterOrderOptions raster_order_option
-  ) {
-    dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape, tile_shape, cluster_shape);
-    return initialize(
-      problem_blocks,
-      cluster_shape,
-      hw_info,
-      max_swizzle_size,
-      raster_order_option
-    );
-  }
-
-  // Version of initialize that takes in as input the number of CTAs in the M and N and L dimensions.
-  // This is useful for calculating the tiled shape when a mode of problem and/or CTA shape has rank > 1,
-  // for which using CuTe algebra for calculating tile shapes is easiest.
-  void
-  initialize(
-    dim3 problem_blocks,
-    GemmCoord cluster_shape,
-    KernelHardwareInfo const& hw_info,
-    int max_swizzle_size,
-    RasterOrderOptions raster_order_option
-  ) {
-
-    CUTLASS_UNUSED(hw_info);
-
-    // Round up to nearest multiple of swizzle_size along each mode
-    auto log_swizzle_size = get_log_swizzle_size(problem_blocks.x, problem_blocks.y, max_swizzle_size);
-    auto problem_blocks_m = round_up(problem_blocks.x, (1 << log_swizzle_size) * cluster_shape.m());
-    auto problem_blocks_n = round_up(problem_blocks.y, (1 << log_swizzle_size) * cluster_shape.n());
-
-    problem_tiles_m_ = problem_blocks_m / cluster_shape.m();
-    problem_tiles_n_ = problem_blocks_n / cluster_shape.n();
-    problem_tiles_l_ = problem_blocks.z;
-    cluster_shape_m_ = cluster_shape.m();
-    cluster_shape_n_ = cluster_shape.n();
-
-    RasterOrder raster_order = get_rasterization_order(
-      problem_blocks_m,
-      problem_blocks_n,
-      raster_order_option
-    );
-
-    //
-    // Set members
-    //
-
-    blocks_per_problem_ = problem_blocks_m * problem_blocks_n * problem_blocks.z;
-    log_swizzle_size_ = log_swizzle_size;
-    raster_order_ = raster_order;
-    divmod_batch_ = FastDivmodU64(problem_blocks_m * problem_blocks_n);
-
-    if (raster_order == RasterOrder::AlongN) {
-      divmod_cluster_shape_major_ = FastDivmodU64Pow2(cluster_shape.n());
-      divmod_cluster_shape_minor_ = FastDivmodU64Pow2(cluster_shape.m());
-      divmod_cluster_blk_major_ = FastDivmodU64(problem_blocks_n / cluster_shape.n());
-    }
-    else {
-      divmod_cluster_shape_major_ = FastDivmodU64Pow2(cluster_shape.m());
-      divmod_cluster_shape_minor_ = FastDivmodU64Pow2(cluster_shape.n());
-      divmod_cluster_blk_major_ = FastDivmodU64(problem_blocks_m / cluster_shape.m());
-    }
-  }
-
-  // Given the inputs, computes the physical grid we should launch.
-  // This variant of the method should only be used when
-  // problem_shape and tile_shape contain modes of only rank 1.
-  CUTLASS_HOST_DEVICE static
-  dim3
-  get_grid_shape(
-    BatchedGemmCoord problem_shape,
-    GemmCoord cta_shape,
-    GemmCoord cluster_shape,
-    KernelHardwareInfo hw_info,
-    int max_swizzle_size,
-    RasterOrderOptions raster_order_option,
-    bool truncate_by_problem_size=true
-    ) {
-
-    dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape, cta_shape, cluster_shape);
-    return get_grid_shape(
-      problem_blocks,
-      cluster_shape,
-      hw_info,
-      max_swizzle_size,
-      raster_order_option,
-      truncate_by_problem_size
-    );
-  }
-
-  // Version of get_grid_shape that takes in as input the number of CTAs in the M and N and L dimensions.
-  // This is useful for calculating the tiled shape when a mode of problem and/or CTA shape has rank > 1,
-  // for which using CuTe algebra for calculating tile shapes is easiest.
-  CUTLASS_HOST_DEVICE static
-  dim3
-  get_grid_shape(
-    dim3 problem_blocks,
-    GemmCoord cluster_shape,
-    KernelHardwareInfo hw_info,
-    int max_swizzle_size,
-    RasterOrderOptions raster_order_option,
-    bool truncate_by_problem_size=true
-    ) {
-
-    int const sm_count = hw_info.sm_count;
-
-    // Round up to nearest multiple of swizzle_size along each mode
-    auto log_swizzle_size = get_log_swizzle_size(problem_blocks.x, problem_blocks.y, max_swizzle_size);
-    auto problem_blocks_m = round_up(problem_blocks.x, (1 << log_swizzle_size) * cluster_shape.m());
-    auto problem_blocks_n = round_up(problem_blocks.y, (1 << log_swizzle_size) * cluster_shape.n());
-
-    int problem_blocks_total = problem_blocks_m * problem_blocks_n * problem_blocks.z;
-
-    RasterOrder raster_order = get_rasterization_order(
-      problem_blocks_m,
-      problem_blocks_n,
-      raster_order_option
-    );
-
-    dim3 launch_grid;
-
-    if (raster_order == RasterOrder::AlongN) {
-      launch_grid = dim3(cluster_shape.m(), 1, 1);
-    }
-    else {
-      launch_grid = dim3(1, cluster_shape.n(), 1);
-    }
-
-    auto possibly_truncate = [&](int x, int y) {
-      if (truncate_by_problem_size) {
-        return platform::min(x, y);
-      }
-      else {
-        return x;
-      }
-    };
-
-    // The else path is generic, however, we can avoid some divs if we know cluster size is 1
-    auto cluster_size = cluster_shape.m() * cluster_shape.n();
-    if (cluster_size == 1) {
-      if (raster_order == RasterOrder::AlongN) {
-        launch_grid.y = possibly_truncate(sm_count, problem_blocks_total);
-      }
-      else {
-        launch_grid.x = possibly_truncate(sm_count, problem_blocks_total);
-      }
-    }
-    else {
-      int cta_per_device = sm_count;
-      /*
-      * Optimal grid size calculation is based on
-      * GH100: 8 GPCs, 72 TPCs (9 TPCs/GPC), 2 SMs/TPC, 144 SMs per full GPU
-      * Hence, maximum SMs per GPC = 18
-      */
-      constexpr int max_sm_per_gpc = 18;
-      // Provided SM count could possibly be less than the assumed maximum SMs per GPC
-      auto cluster_size = cluster_shape.m() * cluster_shape.n();
-      int const min_num_gpc = sm_count < max_sm_per_gpc ? 1 : sm_count / max_sm_per_gpc;
-      int const max_cta_occupancy_per_gpc = max_sm_per_gpc - (max_sm_per_gpc % cluster_size);
-      cta_per_device = min_num_gpc * max_cta_occupancy_per_gpc;
-
-      // The calculation below allows for larger grid size launch for different GPUs.
-      int const num_gpc_residual = sm_count < max_sm_per_gpc ? 0 : sm_count % max_sm_per_gpc;
-      int const max_cta_occupancy_per_residual_gpc = num_gpc_residual - (num_gpc_residual % cluster_size);
-      cta_per_device += max_cta_occupancy_per_residual_gpc;
-
-      if (sm_count < cta_per_device) {
-        cta_per_device = sm_count;
-      }
-      if (raster_order == RasterOrder::AlongN) {
-        launch_grid.y = possibly_truncate(
-            cta_per_device       / cluster_shape.m(),
-            problem_blocks_total / cluster_shape.m());
-      }
-      else {
-        launch_grid.x = possibly_truncate(
-            cta_per_device       / cluster_shape.n(),
-            problem_blocks_total / cluster_shape.n());
-      }
-    }
-    return launch_grid;
-  }
-
-  CUTLASS_HOST_DEVICE
-  static int32_t
-  get_log_swizzle_size(int problem_ctas_m, int problem_ctas_n, int max_swizzle_size) {
-    int min_cta_dim = platform::min(problem_ctas_m, problem_ctas_n);
-    if (max_swizzle_size >= 8 && min_cta_dim >= 6) {
-      return 3;
-    }
-    else if (max_swizzle_size >= 4 && min_cta_dim >= 3) {
-      return 2;
-    }
-    else if (max_swizzle_size >= 2 && min_cta_dim >= 2) {
-      return 1;
-    }
-    else {
-      return 0;
-    }
-  }
-
-  CUTLASS_HOST_DEVICE
-  static RasterOrder
-  get_rasterization_order(
-    uint32_t tiles_m,
-    uint32_t tiles_n,
-    RasterOrderOptions raster_order_option
-  ) {
-
-    if (raster_order_option == RasterOrderOptions::Heuristic) {
-      if (tiles_n > tiles_m) {
-        return RasterOrder::AlongM;
-      }
-      else {
-        return RasterOrder::AlongN;
-      }
-    }
-    else {
-      switch (raster_order_option) {
-        case RasterOrderOptions::AlongN:
-          return RasterOrder::AlongN;
-          break;
-        default:
-          return RasterOrder::AlongM;
-      }
-    }
-  }
-
-  // Get the number of CTA tiles in this problem. This variant of the method should only be used when
-  // problem_shape and tile_shape contain modes of only rank 1.
-  CUTLASS_HOST_DEVICE
-  static dim3
-  get_tiled_cta_shape_mnl(BatchedGemmCoord problem_shape, GemmCoord cta_shape, GemmCoord cluster_shape) {
-    auto cta_m = (problem_shape.m() + cta_shape.m() - 1) / cta_shape.m();
-    auto cta_n = (problem_shape.n() + cta_shape.n() - 1) / cta_shape.n();
-
-    return get_tiled_cta_shape_mnl(problem_shape, cluster_shape, cta_m, cta_n);
-  }
-
-  // Version of get_tiled_cta_shape_mnl that takes in as input the number of CTAs in the M and N dimensions.
-  // This is useful for calculating the tiled shape when a mode of problem and/or CTA shape has rank > 1,
-  // for which using CuTe algebra for calculating tile shapes is easiest.
-  CUTLASS_HOST_DEVICE
-  static dim3
-  get_tiled_cta_shape_mnl(BatchedGemmCoord problem_shape, GemmCoord cluster_shape, uint32_t cta_m, uint32_t cta_n) {
-
-    // Round up to nearest multiple of cluster dim along each mode
-    auto problem_blocks_m = ((cta_m + cluster_shape.m() - 1) / cluster_shape.m()) * cluster_shape.m();
-    auto problem_blocks_n = ((cta_n + cluster_shape.n() - 1) / cluster_shape.n()) * cluster_shape.n();
-
-    return {
-      static_cast<uint32_t>(problem_blocks_m),
-      static_cast<uint32_t>(problem_blocks_n),
-      static_cast<uint32_t>(problem_shape.batch())
-    };
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-// Parameters for SM90 persistent stream-K scheduler
-struct PersistentTileSchedulerSm90StreamKParams {
-
-  // Strategies for computing reductions between CTAs computing portions of a given output tile
-  enum class ReductionMode {
-    // Participating CTAs perform reduction in a turnstile fashion in order of the K extent
-    // covered by each CTA. This requires a lock to be held exclusively be the CTA that is
-    // currently accumulating.
-    //
-    // Turnstile accumulation ensures deterministic numeric behavior when using this mode.
-    Deterministic,
-
-    // Participating CTAs perform reduction atomically to the same workspace (mostly) without locking.
-    // Locks are used only to wait for the first CTA to write its partial values (to initialize the
-    // workspace), and for all but the final CTA to have accumulated (so that the final CTA can load
-    // the accumulated value and accumulate it into registers on top of which the epilogue will
-    // be performed).
-    //
-    // Due to the nondeterminsitic ordering of accumulation, deterministic numeric behavior cannot
-    // be guaranteed with this mode (e.g., floating-point rounding error will depend on the order
-    // of accumulation)
-    Nondeterministic
-  };
-
-  // Strategies for decomposing the problem
-  enum class DecompositionMode {
-    // Use a heuristic to determine whether data-parallel, split-K, or stream-K decomposition should be performed
-    Heuristic,
-    // Force a data-parallel decomposition
-    DataParallel,
-    // Force a split-K decomposition. This should be paired with setting the `splits` parameter
-    SplitK,
-    // Force a stream-K decomposition
-    StreamK
-  };
-
-  using UnderlyingParams = PersistentTileSchedulerSm90Params;
-  using RasterOrder = UnderlyingParams::RasterOrder;
-  using RasterOrderOptions = UnderlyingParams::RasterOrderOptions;
-
-  // Cluster dimensions are typically always a power of 2, so use
-  // the power-of-two variants of FastDivmod for these.
-  FastDivmodU64Pow2 divmod_cluster_shape_major_{};
-  FastDivmodU64Pow2 divmod_cluster_shape_minor_{};
-
-  FastDivmodU64 divmod_batch_{};
-  FastDivmodU64 divmod_cluster_blk_major_{};
-
-  // Total number of cluster-sized output tiles (i.e., not including any
-  // splitting factors). This is primarily used for split-K decompositions,
-  // and may be overridden in other decompositions.
-  FastDivmodU64 divmod_clusters_mnl_{};
-
-  // We divide up the number of stream-K tiles amongst G groups of stream-K units.
-  // The stream-K units within a group collaborate to comptue over the `sk_tiles / G`
-  // tiles assigned to that group. Non-unit group sizes can help to preserve L2 locality of
-  // partial chunks computed by stream-K units -- units 0 in each group will compute identical K extents
-  // of tiles that would be assigned in the same wave according to the rasterization order of the
-  // data-parallel formulation of the problem.
-  FastDivmodU64 divmod_sk_groups_{};
-
-  // Number of stream-K units in each group
-  FastDivmodU64 divmod_sk_units_per_group_{};
-
-  uint64_t units_per_problem_ = 0;
-  FastDivmod divmod_tiles_per_output_tile_{};
-  int32_t log_swizzle_size_ = 0;
-  RasterOrder raster_order_ = RasterOrder::AlongN;
-
-  // The splitting factor to be used in a split-K decomposition of the problem.
-  // If this is set to a value greater than 1, stream-K decomposition logic
-  // is bypassed in favor of a split-K decomposition.
-  FastDivmod divmod_splits_{};
-
-  // Number of stream-K or split-K work units that compute an extra k iteration.
-  // This is done to handle residuals in dividing up the k iteration space.
-  // For stream-K, since the actual assignment of work to stream-K units will be done
-  // at the granularity of a cluster, we store only the number of big clusters.
-  uint32_t big_units_ = 0;
-
-  // The number of groups of stream-K units that will process an extra stream-K tile cluster.
-  uint32_t big_groups_ = 0;
-
-  // Workspace for holding partial accumulators to be reduced across stream-K/split-K units
-  void* reduction_workspace_ = nullptr;
-
-  // Number of tiles covered by stream-K work units
-  uint32_t sk_tiles_ = 0;
-
-  // Number of work units computing stream-K tiles
-  uint32_t sk_units_ = 0;
-
-  // Number of tiled k iterations computed by each stream-K work unit. This
-  // can potentially cover more than one output tile.
-  FastDivmod divmod_k_tiles_per_sk_unit_{};
-  // Number of tiled k iterations computed by each "big" stream-K units, which
-  // processes one more K chunk than a "normal" stream-K unit.
-  FastDivmod divmod_k_tiles_per_sk_big_unit_{};
-
-  // Strategy to use when reducing between collaborating CTAs
-  ReductionMode reduction_mode_ = ReductionMode::Deterministic;
-
-  // The number of sub blocks in the kernel epilogue
-  FastDivmodU64 divmod_epilogue_subtile_{};
-
-  // The number of blocks that launched for doing separate reduction
-  uint32_t separate_reduction_units_ = 0;
-
-  // Minimum number of k tiles that can be assigned to a stream-K unit
-  static constexpr uint32_t min_iters_per_sk_unit_ = 8u;
-
-  // Maximum number of groups of stream-K units
-  static constexpr uint32_t max_sk_groups_ = 8u;
-
-  // ktile start from even for each cta
-  uint32_t ktile_start_alignment_count { 1u };
-
-  // Divides dividend by the cluster size
-  CUTLASS_HOST_DEVICE
-  uint64_t
-  div_cluster_size(uint64_t dividend) const {
-    // Use each underlying fast divmod rather than performing integer division
-    // by the multiplication of major.divisor * minor.divisor
-    return divmod_cluster_shape_minor_.divide(
-      divmod_cluster_shape_major_.divide(dividend)
-    );
-  }
-
-  CUTLASS_HOST_DEVICE
-  uint64_t
-  get_cluster_size() const {
-    return divmod_cluster_shape_minor_.divisor * divmod_cluster_shape_major_.divisor;
-  }
-
-  // Returns whether the kernel uses separate reduction
-  CUTLASS_HOST_DEVICE
-  bool
-  requires_separate_reduction() const {
-    return separate_reduction_units_ > 0;
-  }
-
-  // Returns the maximum number of peers that can collaborate on a given output tile
-  CUTLASS_HOST_DEVICE
-  static uint32_t
-  max_peers_per_tile(uint64_t sk_units, uint64_t sk_tiles) {
-    // When we can divide up our SK units to SK tiles evenly, the number of peers
-    // per SK tile is exactly (sk_units_ / sk_tiles_). In cases where this division
-    // is not exact, some tiles will need to be covered by additional SK units. Because
-    // the extra work can occur at both the beginning and the end of the SK tile, at
-    // most 2 extra peers will be needed.
-    return static_cast<uint32_t>(sk_units / sk_tiles + 2);
-  }
-
-  // Initializes members. This variant of the method should only be used when
-  // problem_shape and tile_shape contain modes of only rank 1.
-  void
-  initialize(
-    BatchedGemmCoord problem_shape,
-    GemmCoord tile_shape,
-    GemmCoord cluster_shape,
-    KernelHardwareInfo hw_info,
-    int splits,
-    int max_swizzle,
-    RasterOrderOptions raster_order_option,
-    ReductionMode reduction_mode,
-    DecompositionMode decomposition_mode,
-    void* workspace,
-    const uint32_t epilogue_subtile = 1
-  ) {
-    dim3 problem_blocks = UnderlyingParams::get_tiled_cta_shape_mnl(
-      problem_shape, tile_shape, cluster_shape);
-
-    // Number of k tiles in each output tile
-    uint32_t k_tiles_per_output_tile = (problem_shape.k() + tile_shape.k() - 1) / tile_shape.k();
-
-    initialize(
-      problem_blocks,
-      k_tiles_per_output_tile,
-      cluster_shape,
-      hw_info,
-      splits,
-      max_swizzle,
-      raster_order_option,
-      reduction_mode,
-      decomposition_mode,
-      workspace,
-      epilogue_subtile
-    );
-  }
-
-  // Version of initialize that takes in as input the number of CTAs in the M and N and L dimensions.
-  // This is useful for calculating the tiled shape when a mode of problem and/or CTA shape has rank > 1,
-  // for which using CuTe algebra for calculating tile shapes is easiest.
-  void
-  initialize(
-    dim3 problem_blocks,
-    uint32_t k_tiles_per_output_tile,
-    GemmCoord cluster_shape,
-    KernelHardwareInfo hw_info,
-    int splits,
-    int max_swizzle,
-    RasterOrderOptions raster_order_option,
-    ReductionMode reduction_mode,
-    DecompositionMode decomposition_mode,
-    void* workspace,
-    const uint32_t epilogue_subtile = 1
-  ) {
-    UnderlyingParams underlying_params;
-    underlying_params.initialize(
-      problem_blocks,
-      cluster_shape,
-      hw_info,
-      max_swizzle,
-      raster_order_option
-    );
-
-    auto problem_blocks_l = problem_blocks.z;
-
-    auto problem_blocks_m = round_up(problem_blocks.x, (1 << underlying_params.log_swizzle_size_) * cluster_shape.m());
-    auto problem_blocks_n = round_up(problem_blocks.y, (1 << underlying_params.log_swizzle_size_) * cluster_shape.n());
-    uint64_t output_tiles = problem_blocks_m * problem_blocks_n * problem_blocks_l;
-
-    // Reduction workspace is at the beginning of the workspace. Lock workspace follows.
-    void* reduction_workspace = workspace;
-
-    if (decomposition_mode == DecompositionMode::SplitK ||
-        (decomposition_mode == DecompositionMode::Heuristic && splits > 1)) {
-      // Short circuit to basic split-K decomposition
-
-      // Don't split by more than the available number of SMs
-      if (splits > hw_info.sm_count) {
-        splits = hw_info.sm_count;
-      }
-
-      // Don't split by more than the K tile iterations
-      //
-      // splits is almost certainly nonnegative here (e.g., hw_info.sm_count,
-      // despite being an int, is a count), so it can safely be converted to unsigned
-      // in the comparison to avoid a signed-unsigned comparison warning-as-error.
-      if (static_cast<decltype(k_tiles_per_output_tile)>(splits) > k_tiles_per_output_tile) {
-        splits = k_tiles_per_output_tile;
-      }
-
-      // If splits == k_tiles_per_output_tiles, there will be one k_tile per cta
-      //   and this violate k_tile start from even requirements. Thus we need to
-      //   reduce the number of splits.
-      if (ktile_start_alignment_count > 1u &&
-           static_cast<decltype(k_tiles_per_output_tile)>(splits) == k_tiles_per_output_tile) { 
-        splits = k_tiles_per_output_tile / ktile_start_alignment_count;
-      }
-
-      set_params_basic(
-        underlying_params,
-        problem_blocks_m,
-        problem_blocks_n,
-        problem_blocks_l,
-        splits,
-        k_tiles_per_output_tile,
-        reduction_workspace,
-        reduction_mode
-      );
-      return;
-    }
-
-    // Calculate the maximum number of blocks from clusters of shape cluster_shape that we
-    // can fit within sm_count SMs.
-    dim3 grid = get_grid_shape(
-      problem_blocks,
-      cluster_shape,
-      hw_info,
-      max_swizzle,
-      raster_order_option
-    );
-
-    uint64_t ctas_per_wave = grid.x * grid.y;
-    auto cluster_size = cluster_shape.m() * cluster_shape.n();
-    // The number of output tiles to be computed in stream-K and data-parallel fashion, respectively.
-    uint32_t sk_tiles = get_num_sk_tiles(
-      output_tiles,
-      ctas_per_wave,
-      cluster_size,
-      k_tiles_per_output_tile,
-      decomposition_mode
-    );
-    uint64_t dp_tiles = output_tiles - sk_tiles;
-
-    // Calculate the number of work units covering the data-parallel and stream-K tiles.
-    // A "work unit" is a single index in the linearized ID space used by the scheduler.
-    // We distinguish it from a "block," which is typically tied to a hardware unit
-    // (e.g., the callers into this scheduler will be persistent thread blocks).
-    // A work unit can encompass multiple output tiles worth of work (as will be the
-    // case for stream-K blocks).
-    // Since splitting is not required for data-parallel tiles, only one data-parallel unit
-    // is needed per data-parallel tile.
-    uint64_t dp_units = dp_tiles;
-
-    uint64_t ctas_per_sk_wave = ctas_per_wave;
-    uint64_t sk_units = get_num_sk_units(cluster_shape, ctas_per_sk_wave, sk_tiles, k_tiles_per_output_tile);
-
-    if (decomposition_mode == DecompositionMode::DataParallel ||
-        (decomposition_mode == DecompositionMode::Heuristic && sk_tiles == 0) ||
-        sk_units == 0) {
-      // Short circuit to basic data-parallel decomposition
-      set_params_basic(
-        underlying_params,
-        problem_blocks_m,
-        problem_blocks_n,
-        problem_blocks_l,
-        /* splits = */ 1,
-        k_tiles_per_output_tile,
-        reduction_workspace,
-        reduction_mode
-      );
-      return;
-    }
-
-    bool do_separate_reduction = should_perform_separate_reduction(
-      epilogue_subtile, sk_units, sk_tiles, dp_tiles, ctas_per_wave);
-
-    // Determine the number of stream-K groups that will be used. We currently use
-    // max_sk_groups_ unless this extends beyond the extent of the dimension over
-    // which the problem is rasterized. For example, if the tiled problem shape
-    // (in CTA_M x CTA_N representation) when using 1x1 clusters is 4x16,
-    // and we rasterize along the M dimension, we choose 4 groups, rather than 8.
-    // If the cluster shape is 2x1, we choose 2 groups (CTA_M / CLUSTER_M).
-    uint32_t max_groups_problem;
-    if (underlying_params.raster_order_ == RasterOrder::AlongM) {
-      max_groups_problem = problem_blocks_m / cluster_shape.m();
-    }
-    else {
-      max_groups_problem = problem_blocks_n / cluster_shape.n();
-    }
-
-    // Select the number of groups that will be use. We start with the maximum
-    // number of potential groups, and iterate down looking for a group size that
-    // evenly divides the stream-K units and tiles, and for which the resulting
-    // number of K tiles per stream-K unit remains above min_iters_per_sk_unit_
-
-    uint32_t groups = platform::min(max_groups_problem, uint32_t(max_sk_groups_));
-
-    // Grouping is disabled when separate reduction is used
-    if (do_separate_reduction) {
-      groups = 1;
-    }
-
-    uint32_t fallback_groups = 0;
-    auto sk_cluster_tiles = sk_tiles / cluster_size;
-    auto sk_cluster_units = sk_units / cluster_size;
-
-    auto sk_splits_too_small = [&](uint32_t g) {
-      // Check whether the number of K tiles computed per stream-K unit is less
-      // than min_iters_per_sk_unit_
-      auto total_sk_cluster_tiles = (sk_cluster_tiles / g) * cluster_size;
-      auto total_sk_k_tiles = total_sk_cluster_tiles * k_tiles_per_output_tile;
-      auto k_tiles_per_sk_unit = total_sk_k_tiles / (sk_units / g);
-      return k_tiles_per_sk_unit < min_iters_per_sk_unit_;
-    };
-
-    auto is_ideal_grouping = [&](uint32_t g) {
-      // An ideal grouping will evenly divide stream-K clusters, evenly divide
-      // stream-K tiles, and not result in stream-K splits that are too small.
-      return (sk_cluster_units % g == 0) && (sk_cluster_tiles % g == 0) && !sk_splits_too_small(g);
-    };
-
-    auto is_valid_grouping = [&](uint32_t g) {
-      // A grouping is valid, but not ideal, if it evenly divides the
-      // stream-K clusters and does not result in stream-K splits that are
-      // too small. Such a setting can be used as a fallback option in the
-      // case that an ideal grouping is not achievable
-      return sk_cluster_units % g == 0 && !sk_splits_too_small(g);
-    };
-
-    while (groups > 1 && !is_ideal_grouping(groups)) {
-      if (fallback_groups == 0 && is_valid_grouping(groups)) {
-        // Set fallback groups once in preference for a larger number of groups.
-        fallback_groups = groups;
-      }
-      --groups;
-    }
-
-    // If groups == 1, we did not find a group count that satisfies all criteria. If we have
-    // found a fallback group count, use this instead.
-    if (groups == 1 && fallback_groups > 0) {
-      groups = fallback_groups;
-    }
-
-    auto sk_units_per_group = sk_units / groups;
-
-    // sk_tiles is guaranteed to be divisible by cluster_size because it is calculated as:
-    //    sk_tiles = (waves <= 2) ? total_tiles : (sm_count + (total_tiles % sm_count))
-    // Both total_tiles and sm_count are multiples of cluster size due to padding added
-    // prior to kernel launch.
-    uint64_t sk_cluster_tiles_per_group = sk_cluster_tiles / groups;
-    uint64_t sk_tiles_per_group = sk_cluster_tiles_per_group * cluster_size;
-
-    // Groups that will process an extra stream-K tile cluster. These differ from "big_units," which
-    // are stream-K units within a group that process an extra K chunk.
-    uint64_t sk_big_groups = sk_cluster_tiles % groups;
-
-    uint64_t k_tiles_per_group = k_tiles_per_output_tile * sk_tiles_per_group;
-
-    // Number of k tiles computed per stream-K unit
-    uint64_t k_tiles_per_sk_unit = k_tiles_per_group / sk_units_per_group;
-
-    uint32_t reduction_units = 0;
-
-    // Use separate reduction when we have less than one wave of output tiles (dp_tiles == 0)
-    // and when each tile will be operated on by at least two stream-K units (sk_units > 2 * sk_tiles)
-    if (do_separate_reduction) {
-      // Each reduction unit will reduce the partials of an epilogue subtile for
-      // a given output tile and compute the epilogue. Thus, there are as many reduction
-      // units as there are epilogue subtiles.
-      reduction_units = sk_tiles * epilogue_subtile;
-    }
-    else if (decomposition_mode == DecompositionMode::Heuristic && sk_tiles < sk_units && sk_units % sk_tiles == 0) {
-      // If the number of stream-K units is a multiple of the number of stream-K tiles, then
-      // the problem can leverage a basic split-K decomposition for the stream-K tiles.
-      // This case happens when separate reduction is disable.
-      uint32_t sk_splits = static_cast<uint32_t>(sk_units / sk_tiles);
-      set_params_basic(
-        underlying_params,
-        problem_blocks_m,
-        problem_blocks_n,
-        problem_blocks_l,
-        sk_splits,
-        k_tiles_per_output_tile,
-        reduction_workspace,
-        reduction_mode
-      );
-      return;
-    }
-    divmod_cluster_shape_major_ = underlying_params.divmod_cluster_shape_major_;
-    divmod_cluster_shape_minor_ = underlying_params.divmod_cluster_shape_minor_;
-    divmod_batch_ = underlying_params.divmod_batch_;
-    divmod_tiles_per_output_tile_ = FastDivmod(k_tiles_per_output_tile);
-    divmod_cluster_blk_major_ = underlying_params.divmod_cluster_blk_major_;
-    divmod_sk_groups_ = FastDivmodU64(static_cast<uint64_t>(groups));
-    divmod_sk_units_per_group_ = FastDivmodU64(static_cast<uint64_t>(sk_units / groups));
-
-    // Override divmod_clusters_mnl_ to be the number of cluster-sized stream-K units.
-    // This setting ensures that the use of this divmod for stream-K decompositions
-    // is essentially a no-op.
-    divmod_clusters_mnl_ = FastDivmodU64(sk_units / cluster_size);
-    divmod_splits_ = FastDivmod(1);
-    log_swizzle_size_ = underlying_params.log_swizzle_size_;
-    units_per_problem_ = static_cast<uint32_t>(dp_units + sk_units);
-    raster_order_ = underlying_params.raster_order_;
-
-    // Assign big_units_ assuming that group count == 1. This is unused by stream-K
-    // when group count > 1.
-    big_units_ = static_cast<uint32_t>(k_tiles_per_group % k_tiles_per_sk_unit);
-
-    big_groups_ = static_cast<uint32_t>(sk_big_groups);
-    reduction_workspace_ = reduction_workspace;
-    sk_tiles_ = sk_tiles;
-    sk_units_ = static_cast<uint32_t>(sk_units);
-    divmod_k_tiles_per_sk_unit_ = FastDivmod(static_cast<uint32_t>(k_tiles_per_sk_unit));
-    divmod_k_tiles_per_sk_big_unit_ = FastDivmod(static_cast<uint32_t>(k_tiles_per_sk_unit + 1));
-    reduction_mode_ = reduction_mode;
-    divmod_epilogue_subtile_ = FastDivmodU64(epilogue_subtile);
-    separate_reduction_units_ = reduction_units;
-  }
-
-  // Given the inputs, computes the physical grid we should launch.
-  // This variant of the method should only be used when
-  // problem_shape and tile_shape contain modes of only rank 1.
-  CUTLASS_HOST_DEVICE
-  static dim3
-  get_grid_shape(
-    BatchedGemmCoord problem_shape,
-    GemmCoord cta_shape,
-    GemmCoord cluster_shape,
-    KernelHardwareInfo hw_info,
-    int max_swizzle_size,
-    RasterOrderOptions raster_order_option
-  ) {
-
-    dim3 problem_blocks = UnderlyingParams::get_tiled_cta_shape_mnl(problem_shape, cta_shape, cluster_shape);
-
-    return get_grid_shape(
-      problem_blocks,
-      cluster_shape,
-      hw_info,
-      max_swizzle_size,
-      raster_order_option
-    );
-  }
-
-  // Version of get_grid_shape that takes in as input the number of CTAs in the M and N and L dimensions.
-  // This is useful for calculating the tiled shape when a mode of problem and/or CTA shape has rank > 1,
-  // for which using CuTe algebra for calculating tile shapes is easiest.
-  CUTLASS_HOST_DEVICE
-  static dim3
-  get_grid_shape(
-    dim3 problem_blocks,
-    GemmCoord cluster_shape,
-    KernelHardwareInfo hw_info,
-    int max_swizzle_size,
-    RasterOrderOptions raster_order_option
-  ) {
-
-    // Call into the underlying get_grid_shape method, but do not allow the grid shape returned
-    // to be truncated based on the number of output tiles in the problem.
-    return UnderlyingParams::get_grid_shape(
-      problem_blocks,
-      cluster_shape,
-      hw_info,
-      max_swizzle_size,
-      raster_order_option,
-      /* truncate_by_problem_size = */false
-    );
-  }
-
-  // Returns the number of stream-K tiles that will be computed amongst `output_tiles` total
-  // output tiles on a device with `ctas_per_wave` CTAs in each wave.
-  static uint32_t
-  get_num_sk_tiles(
-    uint64_t output_tiles,
-    uint64_t ctas_per_wave,
-    uint64_t cluster_size,
-    uint32_t k_tiles_per_output_tile,
-    DecompositionMode decomposition_mode
-  ) {
-    uint32_t full_waves = static_cast<uint32_t>(output_tiles / ctas_per_wave);
-    uint32_t total_waves = static_cast<uint32_t>((output_tiles + ctas_per_wave - 1) / ctas_per_wave);
-
-    if (decomposition_mode == DecompositionMode::DataParallel ||
-        decomposition_mode == DecompositionMode::SplitK) {
-      return 0;
-    }
-
-    // If there is wave quantization, assign the first two waves worth of tiles to be
-    // covered by stream-K work and the remainder to be data-parallel. Since we know
-    // that full_waves == total_waves - 1 in this case, the number of data-parallel
-    // waves is simply full_waves-1 (unless full_waves == 0).
-    uint32_t dp_waves = full_waves > 1 ? full_waves - 1 : 0;
-    uint64_t dp_tiles = dp_waves * ctas_per_wave;
-    uint64_t sk_tiles = output_tiles - dp_tiles;
-
-    if (decomposition_mode == DecompositionMode::Heuristic) {
-      if (full_waves == total_waves || k_tiles_per_output_tile <= min_iters_per_sk_unit_) {
-        // All tiles will be data-parallel tiles if there is either no quantization
-        // or if there is no work to be split.
-        return 0;
-      }
-
-      //
-      // The final wave is not full. Perform some stream-K work.
-      //
-
-      // Rudimentary heuristic: prefer data-parallel decomposition if we have more than
-      // one wave and the tail wave is more than half full. This is subject to change.
-      uint64_t tail_tiles = output_tiles - (full_waves * ctas_per_wave);
-      if (2 * tail_tiles >= ctas_per_wave) {
-        return 0;
-      }
-    }
-
-    return static_cast<uint32_t>(sk_tiles);
-  }
-
-  CUTLASS_HOST_DEVICE
-  static uint64_t
-  get_num_sk_units(GemmCoord cluster_shape, uint64_t ctas_per_sk_wave, uint32_t sk_tiles, uint32_t k_tiles_per_output_tile) {
-    // If there are stream-K tiles to compute and a sufficiently large number of k iterations
-    // across them, they will be covered by a single wave of persistent threadblocks. Thus, there
-    // will be as many work units as there are threadblocks in a single wave.
-    //
-    // When the total k iterations across stream-K tiles is too small to justify distributing
-    // across an entire wave of blocks, we instead distribute the iterations over a smaller
-    // set of blocks.
-
-    // Calculate the number of stream-K units that would be needed if each stream-K unit
-    // computed the minimum allowable k iterations. Truncate this to be in units of clusters.
-
-    // Number of k iterations computed by the stream-K units as a whole
-    uint64_t k_tiles_sk_total = k_tiles_per_output_tile * sk_tiles;
-
-    // Calculate the number of stream-K units that would be needed if each stream-K unit
-    // computed the minimum allowable k iterations. Truncate this to be in units of clusters.
-    auto cluster_size = cluster_shape.m() * cluster_shape.n();
-    uint64_t min_sized_sk_units = (k_tiles_sk_total / min_iters_per_sk_unit_);
-    min_sized_sk_units = (min_sized_sk_units / cluster_size) * cluster_size;
-
-    uint64_t sk_units = platform::min(ctas_per_sk_wave, min_sized_sk_units);
-    return sk_units;
-  }
-
-  // Calculates the size of the workspace needed for holding reduction barriers
-  CUTLASS_HOST_DEVICE
-  static size_t
-  get_barrier_workspace_size(uint64_t num_tiles, uint32_t mma_warp_groups, uint32_t barrier_bits) {
-    size_t workspace_bits = num_tiles * static_cast<size_t>(mma_warp_groups) * static_cast<size_t>(barrier_bits);
-    return round_up_to_l2_alignment(bits_to_bytes<size_t>(workspace_bits));
-  }
-
-  // Calculates the size of the workspace needed for holding partial outputs from splits
-  CUTLASS_HOST_DEVICE
-  static size_t
-  get_reduction_workspace_size(uint64_t num_tiles, GemmCoord tile_shape, uint32_t accumulator_bits, uint32_t num_accumulator_mtxs = 1) {
-    size_t output_tile_size = tile_shape.m() * tile_shape.n();
-    size_t workspace_bits = accumulator_bits * output_tile_size * num_tiles * num_accumulator_mtxs;
-    return round_up_to_l2_alignment(bits_to_bytes<size_t>(workspace_bits));
-  }
-
-  #if !defined(__CUDACC_RTC__)
-  static void
-  get_workspace_component_sizes(
-    dim3 problem_blocks,
-    uint32_t k_tiles_per_output_tile,
-    GemmCoord tile_shape,
-    GemmCoord cluster_shape,
-    size_t& barrier_workspace_size,
-    size_t& reduction_workspace_size,
-    KernelHardwareInfo const& hw_info,
-    int splits,
-    int max_swizzle,
-    RasterOrderOptions raster_order_option,
-    DecompositionMode decomposition_mode,
-    uint32_t mma_warp_groups,
-    uint32_t barrier_bits,
-    uint32_t accumulator_bits,
-    uint32_t epilogue_subtile = 1,
-    uint32_t num_accumulator_mtxs = 1) {
-
-    auto log_swizzle_size = UnderlyingParams::get_log_swizzle_size(problem_blocks.x, problem_blocks.y, max_swizzle);
-    problem_blocks.x = round_up(problem_blocks.x, (1 << log_swizzle_size) * cluster_shape.m());
-    problem_blocks.y = round_up(problem_blocks.y, (1 << log_swizzle_size) * cluster_shape.n());
-
-    // Workspace is needed only for output tiles that will be split. Thus, we first determine the number
-    // of output tiles that will be split, and then calculate the workspace needed to cover these.
-    uint64_t output_tiles = problem_blocks.x * problem_blocks.y * problem_blocks.z;
-
-    if (decomposition_mode == DecompositionMode::DataParallel) {
-      barrier_workspace_size = 0;
-      reduction_workspace_size = 0;
-    }
-    else if (splits > 1 &&
-             (decomposition_mode == DecompositionMode::SplitK || decomposition_mode == DecompositionMode::Heuristic)) {
-      // Basic split-K variant requires workspace for all output tiles
-      barrier_workspace_size = get_barrier_workspace_size(output_tiles, mma_warp_groups, barrier_bits);
-      reduction_workspace_size = get_reduction_workspace_size(output_tiles, tile_shape, accumulator_bits, num_accumulator_mtxs);
-    }
-    else {
-      KernelHardwareInfo new_hw_info;
-      new_hw_info.device_id = hw_info.device_id;
-      new_hw_info.sm_count = hw_info.sm_count;
-      if (new_hw_info.sm_count <= 0) {
-        CUTLASS_TRACE_HOST("  WARNING: Arguments do not include a valid SM count.\n"
-            "  For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count.");
-        new_hw_info.sm_count = KernelHardwareInfo::query_device_multiprocessor_count(new_hw_info.device_id);
-      }
-
-      dim3 grid = get_grid_shape(
-        problem_blocks,
-        cluster_shape,
-        new_hw_info,
-        max_swizzle,
-        raster_order_option
-      );
-      uint64_t ctas_per_wave = grid.x * grid.y;
-      uint64_t cluster_size = cluster_shape.m() * cluster_shape.n();
-      uint32_t sk_tiles = get_num_sk_tiles(
-        output_tiles,
-        ctas_per_wave,
-        cluster_size,
-        static_cast<uint32_t>(k_tiles_per_output_tile),
-        decomposition_mode
-      );
-      uint64_t ctas_per_sk_wave = ctas_per_wave;
-      uint64_t sk_units = get_num_sk_units(cluster_shape, ctas_per_sk_wave, sk_tiles, k_tiles_per_output_tile);
-      uint64_t dp_tiles = output_tiles - sk_tiles;
-
-      uint64_t reduction_tiles = sk_tiles;
-      if (should_perform_separate_reduction(epilogue_subtile, sk_units, sk_tiles, dp_tiles, ctas_per_wave)) {
-        // In separate reduction, each peer writes to its own location in scratch space.
-        // Thus, for separate reduction, we need as many reduction tiles per output tile
-        // as there are the maximum number of peers that can collaborate on an output tile.
-        reduction_tiles *= max_peers_per_tile(sk_units, sk_tiles);
-      }
-
-      // Though separate reduction requires a larger reduction workspace, only one barrier
-      // is needed per output tile. Each peer will increment the barrier by one once the peer has
-      // written its accumulator to scratch space. The separate reduction unit will only begin
-      // performing the reduction when the barrier has reached the number of peers for the output tile.
-      barrier_workspace_size = get_barrier_workspace_size(sk_tiles, mma_warp_groups, barrier_bits);
-      reduction_workspace_size = get_reduction_workspace_size(reduction_tiles, tile_shape, accumulator_bits, num_accumulator_mtxs);
-    }
-  }
-  #endif // !defined(__CUDACC_RTC__)
-
-  // Returns whether the kernel is configured in a manner for which separate reduction should be used
-  CUTLASS_HOST_DEVICE
-  static bool
-  should_perform_separate_reduction(uint32_t, uint64_t, uint64_t, uint64_t, uint64_t) {
-    // Separate reduction is temporarily disabled, pending fixes
-    return false;
-  }
-
-  // Get the amount of scratch workspace needed for the kernel. This variant of the method should only be used when
-  // problem_shape and tile_shape contain modes of only rank 1.
-  static size_t
-  get_workspace_size(
-    BatchedGemmCoord problem_shape,
-    GemmCoord tile_shape,
-    GemmCoord cluster_shape,
-    KernelHardwareInfo const& hw_info,
-    int splits,
-    int max_swizzle,
-    RasterOrderOptions raster_order_option,
-    DecompositionMode decomposition_mode,
-    uint32_t mma_warp_groups,
-    uint32_t barrier_bits,
-    uint32_t element_accumulator_bits,
-    uint32_t epilogue_subtile,
-    uint32_t num_accumulator_mtxs) {
-
-    dim3 problem_blocks = UnderlyingParams::get_tiled_cta_shape_mnl(problem_shape, tile_shape, cluster_shape);
-    uint32_t k_tiles_per_output_tile = (problem_shape.k() + tile_shape.k() - 1) / tile_shape.k();
-
-    return get_workspace_size(
-      problem_blocks,
-      k_tiles_per_output_tile,
-      tile_shape,
-      cluster_shape,
-      hw_info,
-      splits,
-      max_swizzle,
-      raster_order_option,
-      decomposition_mode,
-      mma_warp_groups,
-      barrier_bits,
-      element_accumulator_bits,
-      epilogue_subtile,
-      num_accumulator_mtxs
-    );
-  }
-
-  // Version of get_workspace_size that takes in as input the number of CTAs in the M and N dimensions.
-  // This is useful for calculating the tiled shape when a mode of problem and/or CTA shape has rank > 1,
-  // for which using CuTe algebra for calculating tile shapes is easiest.
-  static size_t
-  get_workspace_size(
-    dim3 problem_blocks,
-    uint32_t k_tiles_per_output_tile,
-    GemmCoord tile_shape,
-    GemmCoord cluster_shape,
-    KernelHardwareInfo const& hw_info,
-    int splits,
-    int max_swizzle,
-    RasterOrderOptions raster_order_option,
-    DecompositionMode decomposition_mode,
-    uint32_t mma_warp_groups,
-    uint32_t barrier_bits,
-    uint32_t element_accumulator_bits,
-    uint32_t epilogue_subtile = 1,
-    uint32_t num_accumulator_mtxs = 1) {
-
-    size_t barrier_workspace_size = 0;
-    size_t reduction_workspace_size = 0;
-
-    #if !defined(__CUDACC_RTC__)
-      get_workspace_component_sizes(
-        problem_blocks,
-        k_tiles_per_output_tile,
-        tile_shape,
-        cluster_shape,
-        barrier_workspace_size,
-        reduction_workspace_size,
-        hw_info,
-        splits,
-        max_swizzle,
-        raster_order_option,
-        decomposition_mode,
-        mma_warp_groups,
-        barrier_bits,
-        element_accumulator_bits,
-        epilogue_subtile,
-        num_accumulator_mtxs
-      );
-    #endif
-
-    return barrier_workspace_size + reduction_workspace_size;
-  }
-
-  // Initialize the workspace to be used for the kernel. This variant of the method should only be used when
-  // problem_shape and tile_shape contain modes of only rank 1.
-  static cutlass::Status
-  initialize_workspace(
-    void* workspace,
-    cudaStream_t stream,
-    BatchedGemmCoord problem_shape,
-    GemmCoord tile_shape,
-    GemmCoord cluster_shape,
-    KernelHardwareInfo const& hw_info,
-    int splits,
-    int max_swizzle,
-    RasterOrderOptions raster_order_option,
-    DecompositionMode decomposition_mode,
-    uint32_t mma_warp_groups,
-    uint32_t barrier_bits,
-    uint32_t element_accumulator_bits,
-    uint32_t epilogue_subtile,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-
-    dim3 problem_blocks = UnderlyingParams::get_tiled_cta_shape_mnl(problem_shape, tile_shape, cluster_shape);
-    uint32_t k_tiles_per_output_tile = (problem_shape.k() + tile_shape.k() - 1) / tile_shape.k();
-
-    return initialize_workspace(
-      workspace,
-      stream,
-      problem_blocks,
-      k_tiles_per_output_tile,
-      tile_shape,
-      cluster_shape,
-      hw_info,
-      splits,
-      max_swizzle,
-      raster_order_option,
-      decomposition_mode,
-      mma_warp_groups,
-      barrier_bits,
-      element_accumulator_bits,
-      epilogue_subtile,
-      1,
-      cuda_adapter
-    );
-  }
-
-  // Version of initialize_workspace that takes in as input the number of CTAs in the M and N dimensions.
-  // This is useful for calculating the tiled shape when a mode of problem and/or CTA shape has rank > 1,
-  // for which using CuTe algebra for calculating tile shapes is easiest.
-  static cutlass::Status
-  initialize_workspace(
-    void* workspace,
-    cudaStream_t stream,
-    dim3 problem_blocks,
-    uint32_t k_tiles_per_output_tile,
-    GemmCoord tile_shape,
-    GemmCoord cluster_shape,
-    KernelHardwareInfo const& hw_info,
-    int splits,
-    int max_swizzle,
-    RasterOrderOptions raster_order_option,
-    DecompositionMode decomposition_mode,
-    uint32_t mma_warp_groups,
-    uint32_t barrier_bits,
-    uint32_t element_accumulator_bits,
-    uint32_t epilogue_subtile = 1,
-    uint32_t num_accumulator_mtxs = 1,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-
-    #if !defined(__CUDACC_RTC__)
-      uint64_t barrier_workspace_size = 0;
-      uint64_t reduction_workspace_size = 0;
-
-      get_workspace_component_sizes(
-        problem_blocks,
-        k_tiles_per_output_tile,
-        tile_shape,
-        cluster_shape,
-        barrier_workspace_size,
-        reduction_workspace_size,
-        hw_info,
-        splits,
-        max_swizzle,
-        raster_order_option,
-        decomposition_mode,
-        mma_warp_groups,
-        barrier_bits,
-        element_accumulator_bits,
-        epilogue_subtile,
-        num_accumulator_mtxs
-      );
-
-      if (barrier_workspace_size > 0) {
-        if (workspace == nullptr) {
-          return Status::kErrorWorkspaceNull;
-        }
-
-        // Only the barrier workspace needs to be cleared for stream-K.
-        // Barrier workspace follows reduction workspace.
-        uint8_t* barrier_workspace = reinterpret_cast<uint8_t*>(workspace) + reduction_workspace_size;
-        return zero_workspace(static_cast<void*>(barrier_workspace), barrier_workspace_size, stream, cuda_adapter);
-      }
-    #endif // !defined(__CUDACC_RTC__)
-
-    return Status::kSuccess;
-  }
-
-  void
-  set_params_basic(
-    UnderlyingParams const& underlying_params,
-    uint32_t blocks_m,
-    uint32_t blocks_n,
-    uint32_t blocks_l,
-    uint32_t splits,
-    uint32_t k_tiles_per_output_tile,
-    void* reduction_workspace,
-    ReductionMode reduction_mode) {
-
-    divmod_cluster_shape_major_ = underlying_params.divmod_cluster_shape_major_;
-    divmod_cluster_shape_minor_ = underlying_params.divmod_cluster_shape_minor_;
-    divmod_batch_ = FastDivmodU64(blocks_m * blocks_n);
-    divmod_tiles_per_output_tile_ = FastDivmod(k_tiles_per_output_tile);
-    divmod_sk_groups_ = FastDivmodU64(1u);
-    auto cluster_size = underlying_params.divmod_cluster_shape_major_.divisor * underlying_params.divmod_cluster_shape_minor_.divisor;
-    divmod_clusters_mnl_ = FastDivmodU64((blocks_m * blocks_n * blocks_l) / cluster_size);
-    divmod_splits_ = FastDivmod(splits);
-    divmod_cluster_blk_major_ = underlying_params.divmod_cluster_blk_major_;
-    log_swizzle_size_ = underlying_params.log_swizzle_size_;
-    units_per_problem_ = blocks_m * blocks_n * blocks_l;
-    raster_order_ = underlying_params.raster_order_;
-    big_units_ = k_tiles_per_output_tile % splits;
-    reduction_workspace_ = reduction_workspace;
-    reduction_mode_ = reduction_mode;
-    divmod_k_tiles_per_sk_unit_ = FastDivmod(k_tiles_per_output_tile / splits);
-    divmod_k_tiles_per_sk_big_unit_ = FastDivmod(k_tiles_per_output_tile / splits + 1);
-
-    // No stream-K work is performed for "basic" data-parallel and split-K decompositions
-    sk_tiles_ = 0;
-    sk_units_ = 0;
-    divmod_sk_units_per_group_ = FastDivmodU64(1u);
-    separate_reduction_units_ = 0;
-  }
-
-  private:
-  // Round up number of bytes to the nearest multiple of L2 cache line alignment
-  CUTLASS_HOST_DEVICE
-  static size_t
-  round_up_to_l2_alignment(size_t bytes) {
-    constexpr size_t L2CacheLineSizeBytes = 128u;
-    return (bytes + L2CacheLineSizeBytes - 1) / L2CacheLineSizeBytes * L2CacheLineSizeBytes;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-// Parameters for SM90 persistent group scheduler (only used for Grouped Gemms)
-template<class ProblemShape>
-struct PersistentTileSchedulerSm90GroupParams {
-
-  enum class RasterOrder {
-    AlongM,
-    AlongN
-  };
-
-  enum class RasterOrderOptions {
-    Heuristic,
-    AlongM,
-    AlongN
-  };
-
-  FastDivmodU64Pow2 divmod_cluster_shape_major_{};
-  FastDivmodU64Pow2 divmod_cluster_shape_minor_{};
-  FastDivmodU64 divmod_cta_shape_m_{};
-  FastDivmodU64 divmod_cta_shape_n_{};
-
-  uint64_t blocks_across_problem_ = 0;
-  bool pre_processed_problem_shapes = true;
-  int32_t log_swizzle_size_ = 0;
-  RasterOrder raster_order_ = RasterOrder::AlongN;
-
-  int32_t groups_ = 0;
-  ProblemShape* problem_shapes_ = nullptr;
-  GemmCoord cta_shape_;
-  GemmCoord cluster_shape_;
-
-  // Version of initialize that takes in as input the number of CTAs in the M and N and L dimensions.
-  // This is useful for calculating the tiled shape when a mode of problem and/or CTA shape has rank > 1,
-  // for which using CuTe algebra for calculating tile shapes is easiest.
-  void
-  initialize(
-    dim3 problem_blocks,
-    int32_t groups,
-    ProblemShape* problem_shapes,
-    ProblemShape const* host_problem_shapes,
-    GemmCoord cta_shape,
-    GemmCoord cluster_shape,
-    KernelHardwareInfo const& hw_info,
-    int max_swizzle_size,
-    RasterOrderOptions raster_order_option
-  ) {
-
-    CUTLASS_UNUSED(hw_info);
-
-    // Round up to nearest multiple of swizzle_size along each mode
-    auto log_swizzle_size = get_log_swizzle_size(problem_blocks.x, problem_blocks.y, max_swizzle_size);
-    auto problem_blocks_m = round_up(problem_blocks.x, (1 << log_swizzle_size) * cluster_shape.m());
-    auto problem_blocks_n = round_up(problem_blocks.y, (1 << log_swizzle_size) * cluster_shape.n());
-
-    RasterOrder raster_order = get_rasterization_order(
-      problem_blocks_m,
-      problem_blocks_n,
-      raster_order_option
-    );
-
-    //
-    // Set members
-    //
-    groups_ = groups;
-    problem_shapes_ = problem_shapes;
-    cta_shape_ = cta_shape;
-    cluster_shape_ = cluster_shape;
-
-    blocks_across_problem_ = problem_blocks.x * problem_blocks.y * problem_blocks.z;
-    pre_processed_problem_shapes = (host_problem_shapes == nullptr) ? false : true;
-    log_swizzle_size_ = log_swizzle_size;
-    raster_order_ = raster_order;
-
-    if (raster_order == RasterOrder::AlongN) {
-      divmod_cluster_shape_major_ = FastDivmodU64Pow2(cluster_shape.n());
-      divmod_cluster_shape_minor_ = FastDivmodU64Pow2(cluster_shape.m());
-    }
-    else {
-      divmod_cluster_shape_major_ = FastDivmodU64Pow2(cluster_shape.m());
-      divmod_cluster_shape_minor_ = FastDivmodU64Pow2(cluster_shape.n());
-    }
-
-    divmod_cta_shape_m_ = FastDivmodU64(cta_shape_.m());
-    divmod_cta_shape_n_ = FastDivmodU64(cta_shape_.n());
-  }
-
-  // Version of get_tiled_cta_shape_mnl that takes in as input the number of CTAs in the M and N dimensions.
-  // This is useful for calculating the tiled shape when a mode of problem and/or CTA shape has rank > 1,
-  // for which using CuTe algebra for calculating tile shapes is easiest.
-  CUTLASS_HOST_DEVICE
-  static dim3
-  get_tiled_cta_shape_mnl(GemmCoord cluster_shape, uint32_t cta_m, uint32_t cta_n) {
-    // Round up to nearest multiple of cluster dim along each mode
-    auto problem_blocks_m = ((cta_m + cluster_shape.m() - 1) / cluster_shape.m()) * cluster_shape.m();
-    auto problem_blocks_n = ((cta_n + cluster_shape.n() - 1) / cluster_shape.n()) * cluster_shape.n();
-
-    return {
-      static_cast<uint32_t>(cta_m),
-      static_cast<uint32_t>(cta_n),
-      static_cast<uint32_t>(1) // Only a single batch per group is currently supported
-    };
-  }
-
-  // Version of get_grid_shape that takes in as input the number of CTAs in the M and N and L dimensions.
-  // This is useful for calculating the tiled shape when a mode of problem and/or CTA shape has rank > 1,
-  // for which using CuTe algebra for calculating tile shapes is easiest.
-  CUTLASS_HOST_DEVICE static
-  dim3
-  get_grid_shape(
-    dim3 problem_blocks,
-    GemmCoord cluster_shape,
-    KernelHardwareInfo hw_info,
-    int max_swizzle_size,
-    RasterOrderOptions raster_order_option,
-    bool truncate_by_problem_size=true) {
-
-    int const sm_count = hw_info.sm_count;
-
-    // Round up to nearest multiple of swizzle_size along each mode
-    auto log_swizzle_size = get_log_swizzle_size(problem_blocks.x, problem_blocks.y, max_swizzle_size);
-    auto problem_blocks_m = round_up(problem_blocks.x, (1 << log_swizzle_size) * cluster_shape.m());
-    auto problem_blocks_n = round_up(problem_blocks.y, (1 << log_swizzle_size) * cluster_shape.n());
-
-    int problem_blocks_total = problem_blocks_m * problem_blocks_n * problem_blocks.z;
-
-    RasterOrder raster_order = get_rasterization_order(
-      problem_blocks_m,
-      problem_blocks_n,
-      raster_order_option
-    );
-
-    dim3 launch_grid;
-
-    if (raster_order == RasterOrder::AlongN) {
-      launch_grid = dim3(cluster_shape.m(), 1, 1);
-    }
-    else {
-      launch_grid = dim3(1, cluster_shape.n(), 1);
-    }
-
-    auto possibly_truncate = [&](int x, int y) {
-      if (truncate_by_problem_size) {
-        return platform::min(x, y);
-      }
-      else {
-        return x;
-      }
-    };
-
-    // The else path is generic, however, we can avoid some divs if we know cluster size is 1
-    auto cluster_size = cluster_shape.m() * cluster_shape.n();
-    if (cluster_size == 1) {
-      if (raster_order == RasterOrder::AlongN) {
-        launch_grid.y = possibly_truncate(sm_count, problem_blocks_total);
-      }
-      else {
-        launch_grid.x = possibly_truncate(sm_count, problem_blocks_total);
-      }
-    }
-    else {
-      // Optimal grid size calculation is based on
-      // GH100: 8 GPCs, 72 TPCs (9 TPCs/GPC), 2 SMs/TPC, 144 SMs per full GPU
-      // Hence, maximum SMs per GPC = 18
-      constexpr int max_sm_per_gpc = 18;
-      // Provided SM count could possibly be less than the assumed maximum SMs per GPC
-      auto cluster_size = cluster_shape.m() * cluster_shape.n();
-      int const min_num_gpc = sm_count < max_sm_per_gpc ? 1 : sm_count / max_sm_per_gpc;
-      int const max_cta_occupancy_per_gpc = max_sm_per_gpc - (max_sm_per_gpc % cluster_size);
-      int cta_per_device = min_num_gpc * max_cta_occupancy_per_gpc;
-
-      // The calculation below allows for larger grid size launch for different GPUs.
-      int const num_gpc_residual = sm_count < max_sm_per_gpc ? 0 : sm_count % max_sm_per_gpc;
-      int const max_cta_occupancy_per_residual_gpc = num_gpc_residual - (num_gpc_residual % cluster_size);
-      cta_per_device += max_cta_occupancy_per_residual_gpc;
-
-      cta_per_device = sm_count < cta_per_device ? sm_count : cta_per_device;
-
-      if (raster_order == RasterOrder::AlongN) {
-        launch_grid.y = possibly_truncate(
-            cta_per_device       / cluster_shape.m(),
-            problem_blocks_total / cluster_shape.m());
-      }
-      else {
-        launch_grid.x = possibly_truncate(
-            cta_per_device       / cluster_shape.n(),
-            problem_blocks_total / cluster_shape.n());
-      }
-    }
-    return launch_grid;
-  }
-
-  CUTLASS_HOST_DEVICE
-  static int32_t
-  get_log_swizzle_size(int problem_ctas_m, int problem_ctas_n, int max_swizzle_size) {
-    int min_cta_dim = platform::min(problem_ctas_m, problem_ctas_n);
-    if (max_swizzle_size >= 8 && min_cta_dim >= 6) {
-      return 3;
-    }
-    else if (max_swizzle_size >= 4 && min_cta_dim >= 3) {
-      return 2;
-    }
-    else if (max_swizzle_size >= 2 && min_cta_dim >= 2) {
-      return 1;
-    }
-    else {
-      return 0;
-    }
-  }
-
-  CUTLASS_HOST_DEVICE
-  static RasterOrder
-  get_rasterization_order(
-    uint32_t tiles_m,
-    uint32_t tiles_n,
-    RasterOrderOptions raster_order_option
-  ) {
-
-    if (raster_order_option == RasterOrderOptions::Heuristic) {
-      if (tiles_n > tiles_m) {
-        return RasterOrder::AlongM;
-      }
-      else {
-        return RasterOrder::AlongN;
-      }
-    }
-    else {
-      switch (raster_order_option) {
-        case RasterOrderOptions::AlongN:
-          return RasterOrder::AlongN;
-          break;
-        default:
-          return RasterOrder::AlongM;
-      }
-    }
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-} // namespace detail
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/trmm_universal.h b/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/trmm_universal.h
deleted file mode 100755
index 50b33eab7..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/kernel/trmm_universal.h
+++ /dev/null
@@ -1,580 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief 
-
-*/
-
-#pragma once
-
-#include "cutlass/blas3.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/complex.h"
-#include "cutlass/semaphore.h"
-#include "cutlass/core_io.h"
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate 
-  typename Epilogue_,             ///! Epilogue
-  typename ThreadblockSwizzle_,   ///! Threadblock swizzling function
-  SideMode SideMode_,             ///! Side Mode for the kernel (kLeft or kRight)
-  FillMode FillMode_,             ///! Fill Mode for triangular matrix (kLower or kUpper)
-  DiagType DiagType_              ///! Diag Type for triangular matrix (kNonUnit or kUnit)
->
-struct TrmmUniversal {
-public:
-
-  using Mma = Mma_;
-  using Epilogue = Epilogue_;
-  using EpilogueOutputOp = typename Epilogue::OutputOp;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-
-  using ElementA = typename Mma::IteratorA::Element;
-  using LayoutA = typename Mma::IteratorA::Layout;
-  using ElementB = typename Mma::IteratorB::Element;
-  using LayoutB = typename Mma::IteratorB::Layout;
-  using ElementC = typename Epilogue::OutputTileIterator::Element;
-  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
-  static SideMode const kSideMode = SideMode_;
-  static FillMode const kFillMode = FillMode_;
-  static DiagType const kDiagType = DiagType_;
-
-  static ComplexTransform const kTransformA = Mma::kTransformA;
-  static ComplexTransform const kTransformB = Mma::kTransformB;
-  using Operator = typename Mma::Operator;
-
-  using OperatorClass = typename Mma::Operator::OperatorClass;
-  using ThreadblockShape = typename Mma::Shape;
-  using WarpShape = typename Mma::Operator::Shape;
-  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
-  using ArchTag = typename Mma::ArchTag;
-
-  static int const kStages = Mma::kStages;
-  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
-  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
-  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
-
-  /// Warp count (concept: GemmShape)
-  using WarpCount = typename Mma::WarpCount;
-  static int const kThreadCount = 32 * WarpCount::kCount;
-
-  /// Split-K preserves splits that are 128b aligned
-  static int const kSplitKAlignment = const_max(128 / sizeof_bits<ElementA>::value, 128 / sizeof_bits<ElementB>::value);
-
-  //
-  // Structures
-  //
-
-  /// Argument structure
-  struct Arguments {
-
-    //
-    // Data members
-    //
-
-    GemmUniversalMode mode{GemmUniversalMode::kGemm};
-    GemmCoord problem_size{};
-    int batch_count{1};
-
-    typename EpilogueOutputOp::Params epilogue{};
-
-    void const * ptr_A{nullptr};
-    void const * ptr_B{nullptr};
-    void * ptr_D{nullptr};
-
-    int64_t batch_stride_A{0};
-    int64_t batch_stride_B{0};
-    int64_t batch_stride_D{0};
-
-    typename LayoutA::Stride::Index lda{0};
-    typename LayoutB::Stride::Index ldb{0};
-    typename LayoutC::Stride::Index ldd{0};
-
-    //
-    // Methods
-    //
-
-    Arguments() = default;
-
-    /// constructs an arguments structure
-    Arguments(
-      GemmUniversalMode mode,
-      GemmCoord problem_size,
-      int batch_count,
-      typename EpilogueOutputOp::Params epilogue,
-      void const * ptr_A,
-      void const * ptr_B,
-      void * ptr_D,
-      int64_t batch_stride_A,
-      int64_t batch_stride_B,
-      int64_t batch_stride_D,
-      typename LayoutA::Stride::Index lda,
-      typename LayoutB::Stride::Index ldb,
-      typename LayoutC::Stride::Index ldd
-    ):
-      mode(mode), 
-      problem_size(problem_size),
-      batch_count(batch_count),
-      epilogue(epilogue), 
-      ptr_A(ptr_A), ptr_B(ptr_B), ptr_D(ptr_D), 
-      batch_stride_A(batch_stride_A), batch_stride_B(batch_stride_B), batch_stride_D(batch_stride_D), 
-      lda(lda), ldb(ldb), ldd(ldd) {
-      }
-    
-    /// Returns arguments for the transposed problem sizes
-    Arguments transposed_problem_size() const {
-      Arguments args(*this);
-
-      std::swap(args.problem_size.m(), args.problem_size.n());
-
-      return args;
-    }
-
-    /// Returns arguments for the transposed matrices
-    Arguments swapped_matrices() const {
-      Arguments args(*this);
-
-      std::swap(args.ptr_A, args.ptr_B);
-      std::swap(args.lda, args.ldb);
-      std::swap(args.batch_stride_A, args.batch_stride_B);
-
-      return args;
-    }
-  };
-
-  //
-  // Structure for precomputing values in host memory and passing to kernels
-  //
-
-  /// Parameters structure
-  struct Params {
-
-    cutlass::gemm::GemmCoord problem_size{};
-    cutlass::gemm::GemmCoord grid_tiled_shape{};
-    int swizzle_log_tile{0};
-   
-    typename Mma::IteratorA::Params params_A{};
-    typename Mma::IteratorB::Params params_B{};
-    typename Epilogue::OutputTileIterator::Params params_D{};
-    
-    typename EpilogueOutputOp::Params output_op{};
-
-    GemmUniversalMode mode = cutlass::gemm::GemmUniversalMode::kGemm;
-    int batch_count {0};
-    int gemm_k_size {0};
-
-    void * ptr_A{nullptr};
-    void * ptr_B{nullptr};
-    void * ptr_D{nullptr};
-
-    int64_t batch_stride_A {0};
-    int64_t batch_stride_B {0};
-    int64_t batch_stride_D {0};
-
-    int *semaphore{nullptr};
-
-    //
-    // Methods
-    //
-    Params() = default;
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      Arguments const &args,
-      cutlass::gemm::GemmCoord const & grid_tiled_shape,
-      int gemm_k_size,
-      void *workspace = nullptr
-    ):
-      problem_size(args.problem_size),
-      grid_tiled_shape(grid_tiled_shape),
-      swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)),
-      params_A(args.lda),
-      params_B(args.ldb),
-      params_D(args.ldd),
-      output_op(args.epilogue),
-      mode(args.mode),
-      batch_count(args.batch_count),
-      gemm_k_size(gemm_k_size),
-      ptr_A(const_cast<void *>(args.ptr_A)),
-      ptr_B(const_cast<void *>(args.ptr_B)),
-      ptr_D(args.ptr_D),
-      batch_stride_A(args.batch_stride_A),
-      batch_stride_B(args.batch_stride_B),
-      batch_stride_D(args.batch_stride_D),
-      semaphore(static_cast<int *>(workspace)) {
-    }
-
-    CUTLASS_HOST_DEVICE
-    void update(
-      Arguments const &args,
-      void *workspace = nullptr) {
-
-      ptr_A = const_cast<void *>(args.ptr_A);
-      ptr_B = const_cast<void *>(args.ptr_B);
-      ptr_D = args.ptr_D;
-
-      batch_stride_A = args.batch_stride_A;
-      batch_stride_B = args.batch_stride_B;
-      batch_stride_D = args.batch_stride_D;
-
-      output_op = args.epilogue;
-
-      semaphore = static_cast<int *>(workspace);
-    }
-
-  };
-
-  /// Shared memory storage structure
-  union SharedStorage {
-    typename Mma::SharedStorage main_loop;
-    typename Epilogue::SharedStorage epilogue;
-  };
-
-public:
-
-  //
-  // Methods
-  //
-
-  CUTLASS_DEVICE
-  TrmmUniversal() { } 
-
-  /// Determines whether kernel satisfies alignment
-  static Status can_implement(
-    cutlass::gemm::GemmCoord const & problem_size) {
-
-    static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
-    static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
-    static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
-
-    if ((problem_size.m() % kAlignmentA) || (problem_size.k() % kAlignmentA) ||
-      (problem_size.n() % kAlignmentB) || (problem_size.k() % kAlignmentB) ||
-      (problem_size.m() % kAlignmentC) || (problem_size.n() % kAlignmentC)) {
-
-      return Status::kErrorMisalignedOperand;
-    }
-
-    return Status::kSuccess;
-  }
-
-  static Status can_implement(Arguments const &args) {
-    return can_implement(args.problem_size);
-  }
-
-  /// Executes one GEMM
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-
-    // Compute threadblock location
-    ThreadblockSwizzle threadblock_swizzle;
-
-    cutlass::gemm::GemmCoord threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    // Early exit if CTA is out of range
-    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
-      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
-
-      return;
-    }
-
-    int offset_k = 0;
-    int problem_size_k = params.problem_size.k();
-
-    ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A); 
-    ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);
-
-    //
-    // Fetch pointers based on mode.
-    //
-    if (params.mode == GemmUniversalMode::kGemm || 
-      params.mode == GemmUniversalMode::kGemmSplitKParallel) {
-
-      if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {
-
-        problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size; 
-      }
-
-      offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
-    }
-    else if (params.mode == GemmUniversalMode::kBatched) {
-      ptr_A += threadblock_tile_offset.k() * params.batch_stride_A;
-      ptr_B += threadblock_tile_offset.k() * params.batch_stride_B;
-    }
-    else if (params.mode == GemmUniversalMode::kArray) {
-      ptr_A = static_cast<ElementA * const *>(params.ptr_A)[threadblock_tile_offset.k()];
-      ptr_B = static_cast<ElementB * const *>(params.ptr_B)[threadblock_tile_offset.k()];
-    }
-
-    __syncthreads();
-
-    // Compute initial location in logical coordinates
-    cutlass::MatrixCoord tb_offset_A{
-      threadblock_tile_offset.m() * Mma::Shape::kM,
-      offset_k,
-    };
-
-    cutlass::MatrixCoord tb_offset_B{
-      offset_k,
-      threadblock_tile_offset.n() * Mma::Shape::kN
-    };
-
-    // Compute position within threadblock
-    int thread_idx = threadIdx.x;
-
-    // Broadcast the warp_id computed by lane 0 to ensure dependent code
-    // is compiled as warp-uniform.
-    int warp_idx = canonical_warp_idx_sync();
-
-    int lane_idx = threadIdx.x % 32;
-
-    //
-    // Main loop
-    //
-
-    // Construct thread-scoped matrix multiply
-    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
-
-    typename Mma::FragmentC accumulators;
-
-    accumulators.clear();
-
-    // Compute threadblock-scoped matrix multiply-add
-    int gemm_k_iterations = (problem_size_k - offset_k + Mma::Shape::kK - 1) / Mma::Shape::kK;
-    
-    /******************************************************************************************************
-      First two cases: (Left Side, Lower Fill) and (Right Side, Upper Fill) are transpose of each other
-        - (Left Side, Lower Fill): calculate bottom of the CTA tile,  then find the k-iterations 
-                                    needed to process all elements till that coordinate.
-        - (Right Side, Upper Fill): calculate right end of the CTA tile,  then find the k-iterations 
-                                    needed to process all elements till that coordinate.
-
-      Last two cases: (Left Side, Upper Fill) and (Right Side, Lower Fill) are transpose of each other
-        - (Left Side, Upper Fill): calculate the top of the CTA tile, then find k-iterations 
-                                   that can be skipped for all elements of this tile.
-        - (Right Side, Lower Fill): calculate the left start of the CTA tile, then find k-iterations 
-                                    that can be skipped for all elements of this tile.
-    ********************************************************************************************************/
- 
-    if (kSideMode == SideMode::kLeft && kFillMode == FillMode::kLower) {
-
-      int k_iterations_till_diagonal = ((threadblock_tile_offset.m() + 1) * Mma::Shape::kM + Mma::Shape::kK - 1) / Mma::Shape::kK;
-      if (k_iterations_till_diagonal < gemm_k_iterations) {
-        gemm_k_iterations = k_iterations_till_diagonal;
-      }
-
-    } else if (kSideMode == SideMode::kRight && kFillMode == FillMode::kUpper) {
-
-      int k_iterations_till_diagonal = ((threadblock_tile_offset.n() + 1) * Mma::Shape::kN + Mma::Shape::kK - 1) / Mma::Shape::kK;
-      if (k_iterations_till_diagonal < gemm_k_iterations) {
-        gemm_k_iterations = k_iterations_till_diagonal;
-      }
-
-    } else if (kSideMode == SideMode::kLeft && kFillMode == FillMode::kUpper) {
-
-      int k_iterations_till_diagonal = ((threadblock_tile_offset.m()) * Mma::Shape::kM) / Mma::Shape::kK;
-
-      if (k_iterations_till_diagonal != 0) {
-        tb_offset_A += cutlass::MatrixCoord({0, k_iterations_till_diagonal * Mma::Shape::kK});
-        tb_offset_B += cutlass::MatrixCoord({k_iterations_till_diagonal * Mma::Shape::kK, 0});
-        gemm_k_iterations -= k_iterations_till_diagonal;
-      }
-
-    } else if (kSideMode == SideMode::kRight && kFillMode == FillMode::kLower) {
-
-      int k_iterations_till_diagonal = ((threadblock_tile_offset.n()) * Mma::Shape::kN) / Mma::Shape::kK;
-
-      if (k_iterations_till_diagonal != 0) {
-        tb_offset_A += cutlass::MatrixCoord({0, k_iterations_till_diagonal * Mma::Shape::kK});
-        tb_offset_B += cutlass::MatrixCoord({k_iterations_till_diagonal * Mma::Shape::kK, 0});
-        gemm_k_iterations -= k_iterations_till_diagonal;
-      }
-
-    }
-
-    // Construct iterators to A and B operands
-    typename Mma::IteratorA iterator_A(
-      params.params_A,
-      ptr_A,
-      {params.problem_size.m(), problem_size_k},
-      thread_idx,
-      tb_offset_A);
-
-    typename Mma::IteratorB iterator_B(
-      params.params_B,
-      ptr_B,
-      {problem_size_k, params.problem_size.n()},
-      thread_idx,
-      tb_offset_B);
-
-    // Compute threadblock-scoped matrix multiply-add
-    mma(
-      gemm_k_iterations, 
-      accumulators, 
-      iterator_A, 
-      iterator_B, 
-      accumulators);
-
-    //
-    // Epilogue
-    //
-
-    EpilogueOutputOp output_op(params.output_op);
-
-    //
-    // Masked tile iterators constructed from members
-    //
-
-    threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-
-    //assume identity swizzle
-    MatrixCoord threadblock_offset(
-      threadblock_tile_offset.m() * Mma::Shape::kM,
-      threadblock_tile_offset.n() * Mma::Shape::kN
-    );
-
-    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
-
-    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
-
-    //
-    // Fetch pointers based on mode.
-    //
-    
-    // Construct the semaphore.
-    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
-
-    if (params.mode == GemmUniversalMode::kGemm) {
-
-      // If performing a reduction via split-K, fetch the initial synchronization
-      if (params.grid_tiled_shape.k() > 1) {
-        
-        // Fetch the synchronization lock initially but do not block.
-        semaphore.fetch();
-
-        // Indicate which position in a serial reduction the output operator is currently updating
-        output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
-      }
-    }
-    else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) {
-      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
-    }
-    else if (params.mode == GemmUniversalMode::kBatched) {
-      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
-    }
-    else if (params.mode == GemmUniversalMode::kArray) {
-      ptr_D = static_cast<ElementC * const *>(params.ptr_D)[threadblock_tile_offset.k()];
-    }
-
-    
-    // Tile iterator loading from source tensor (although irrelevant to this kernel as beta is zero).
-    typename Epilogue::OutputTileIterator iterator_C(
-      params.params_D,
-      ptr_D,
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset
-    );
-
-    // Tile iterator writing to destination tensor.
-    typename Epilogue::OutputTileIterator iterator_D(
-      params.params_D,
-      ptr_D,
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset
-    );
-
-    Epilogue epilogue(
-      shared_storage.epilogue, 
-      thread_idx, 
-      warp_idx, 
-      lane_idx);
-
-    // Wait on the semaphore - this latency may have been covered by iterator construction
-    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
-        
-      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
-      if (threadblock_tile_offset.k()) {
-        iterator_C = iterator_D;
-      }
-
-      semaphore.wait(threadblock_tile_offset.k());
-
-      __threadfence();
-    }
-
-
-    // Execute the epilogue operator to update the destination tensor.
-    epilogue(
-      output_op, 
-      iterator_D, 
-      accumulators, 
-      iterator_C); 
-    
-    //
-    // Release the semaphore
-    //
-
-    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) { 
-
-      int lock = 0;
-      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
-
-        // The final threadblock resets the semaphore for subsequent grids.
-        lock = 0;
-      }
-      else {
-        // Otherwise, the semaphore is incremented
-        lock = threadblock_tile_offset.k() + 1;
-      }
-      
-      semaphore.release(lock);
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/thread/mma.h b/lightllm-kernel/cutlass/include/cutlass/gemm/thread/mma.h
deleted file mode 100755
index 2e3798b15..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/thread/mma.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates exposing architecture support for warp-level multiply-add operations
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/arch/mma.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace thread {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Structure to compute the matrix product
-template <
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  typename Shape,
-  /// Data type of A elements
-  typename ElementA,
-  /// Layout of A matrix (concept: MatrixLayout)
-  typename LayoutA,
-  /// Data type of B elements
-  typename ElementB,
-  /// Layout of B matrix (concept: MatrixLayout)
-  typename LayoutB,
-  /// Element type of C matrix
-  typename ElementC,
-  /// Layout of C matrix (concept: MatrixLayout)
-  typename LayoutC,
-  /// Concept: arch::OpMultiplyAdd or arch::Mma<>
-  typename Operator = arch::OpMultiplyAdd,
-  /// Used for partial specialization
-  typename Enable = bool
->
-struct Mma;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace thread
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-//
-// Overloads specialized for existing architectures
-//
-
-#include "cutlass/gemm/thread/mma_sm50.h"
-#include "cutlass/gemm/thread/mma_sm60.h"
-#include "cutlass/gemm/thread/mma_sm61.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/thread/mma_sm50.h b/lightllm-kernel/cutlass/include/cutlass/gemm/thread/mma_sm50.h
deleted file mode 100755
index c778832bf..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/thread/mma_sm50.h
+++ /dev/null
@@ -1,538 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates exposing architecture support for multiply-add operations
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/arch/mma.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/thread/mma.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace thread {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Gemplate that handles all packed matrix layouts
-template <
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  typename Shape_,
-  /// Data type of A elements
-  typename ElementA_,
-  /// Layout of A matrix (concept: layout::MapFunc)
-  typename LayoutA_,
-  /// Data type of B elements
-  typename ElementB_,
-  /// Layout of B matrix (concept: layout::MapFunc)
-  typename LayoutB_,
-  /// Element type of C matrix
-  typename ElementC_,
-  /// Layout of C matrix (concept: layout::MapFunc)
-  typename LayoutC_,
-  /// Operator used to compute GEMM
-  typename Operator_
->
-struct MmaGeneric {
-
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-
-  /// Data type of operand A
-  using ElementA = ElementA_;
-
-  /// Layout of A matrix (concept: layout::MapFunc)
-  using LayoutA = LayoutA_;
-
-  /// Data type of operand B
-  using ElementB = ElementB_;
-
-  /// Layout of B matrix (concept: layout::MapFunc)
-  using LayoutB = LayoutB_;
-
-  /// Element type of operand C
-  using ElementC = ElementC_;
-
-  /// Layout of C matrix (concept: layout::MapFunc)
-  using LayoutC = LayoutC_;
-
-  /// Underlying mathematical operator
-  using Operator = Operator_;
-
-  /// A operand storage
-  using FragmentA = Array<ElementA, Shape::kMK>;
-
-  /// B operand storage
-  using FragmentB = Array<ElementB, Shape::kKN>;
-
-  /// C operand storage
-  using FragmentC = Array<ElementC, Shape::kMN>;
-
-  /// Instruction
-  using MmaOp = arch::Mma<
-    gemm::GemmShape<1,1,1>,
-    1,
-    ElementA, LayoutA,
-    ElementB, LayoutB,
-    ElementC, LayoutC,
-    Operator>;
-
-  static bool const kMultipleOf2 = ((Shape::kM % 2 == 0) && (Shape::kN % 2 == 0));
-
-  static bool const kAllFp32 = platform::is_same<ElementA, float>::value &&
-      platform::is_same<ElementB, float>::value &&
-      platform::is_same<ElementC, float>::value;
-  //
-  // Methods
-  //
-
-  /// Computes a matrix product D = A * B + C
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC & D,
-    FragmentA const & A,
-    FragmentB const & B,
-    FragmentC const & C) {
-
-    TensorRef<ElementA const, LayoutA> a_ref(
-      reinterpret_cast<ElementA const *>(&A), LayoutA::packed({Shape::kM, Shape::kK}));
-
-    TensorRef<ElementB const, LayoutB> b_ref(
-      reinterpret_cast<ElementB const *>(&B), LayoutB::packed({Shape::kK, Shape::kN}));
-
-    TensorRef<ElementC, LayoutC> d_ref(
-      reinterpret_cast<ElementC *>(&D), LayoutC::packed(make_Coord(Shape::kM, Shape::kN)));
-
-    MmaOp mma_op;
-
-    // Copy accumulators
-    D = C;
-
-    // Compute matrix product
-    CUTLASS_PRAGMA_UNROLL
-    for (int k = 0; k < Shape::kK; ++k) {
-      #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 860)
-      if (kMultipleOf2 && kAllFp32) {
-        //2x2 zigzag - m and n loops to increment by 2. Inner loop to process 4 multiply-adds in a 2x2 tile.
-        CUTLASS_PRAGMA_UNROLL
-        for (int n = 0; n < Shape::kN; n+=2) {
-  
-          CUTLASS_PRAGMA_UNROLL
-          for (int m = 0; m < Shape::kM; m+=2) {
-  
-            int m_serpentine = (n % 4) ? (Shape::kM - 2 - m) : m;
-
-            //top-left element in 2x2 tile
-            {
-              MatrixCoord mn(m_serpentine, n);
-              MatrixCoord mk(m_serpentine, k);
-              MatrixCoord kn(k, n);
-              Array<ElementC, 1> d;
-              Array<ElementA, 1> a;
-              Array<ElementB, 1> b;
-              d[0] = d_ref.at(mn);
-              a[0] = a_ref.at(mk);
-              b[0] = b_ref.at(kn);
-              mma_op(d, a, b, d);
-              d_ref.at(mn) = d[0];
-            }
-  
-            //bottom-left element in 2x2 tile
-            {
-              MatrixCoord mn(m_serpentine+1, n);
-              MatrixCoord mk(m_serpentine+1, k);
-              MatrixCoord kn(k, n);
-              Array<ElementC, 1> d;
-              Array<ElementA, 1> a;
-              Array<ElementB, 1> b;
-              d[0] = d_ref.at(mn);
-              a[0] = a_ref.at(mk);
-              b[0] = b_ref.at(kn);
-              mma_op(d, a, b, d);
-              d_ref.at(mn) = d[0];
-            }
-  
-            //bottom-right element in 2x2 tile
-            {
-              MatrixCoord mn(m_serpentine+1, n+1);
-              MatrixCoord mk(m_serpentine+1, k);
-              MatrixCoord kn(k, n+1);
-              Array<ElementC, 1> d;
-              Array<ElementA, 1> a;
-              Array<ElementB, 1> b;
-              d[0] = d_ref.at(mn);
-              a[0] = a_ref.at(mk);
-              b[0] = b_ref.at(kn);
-              mma_op(d, a, b, d);
-              d_ref.at(mn) = d[0];
-            }
-  
-            //top-right element in 2x2 tile
-            {
-              MatrixCoord mn(m_serpentine, n+1);
-              MatrixCoord mk(m_serpentine, k);
-              MatrixCoord kn(k, n+1);
-              Array<ElementC, 1> d;
-              Array<ElementA, 1> a;
-              Array<ElementB, 1> b;
-              d[0] = d_ref.at(mn);
-              a[0] = a_ref.at(mk);
-              b[0] = b_ref.at(kn);
-              mma_op(d, a, b, d);
-              d_ref.at(mn) = d[0];
-            }
-          }
-        }
-      } else 
-      #endif
-      {
-        CUTLASS_PRAGMA_UNROLL
-        for (int n = 0; n < Shape::kN; ++n) {
-  
-          CUTLASS_PRAGMA_UNROLL
-          for (int m = 0; m < Shape::kM; ++m) {
-  
-            int m_serpentine = (n % 2) ? (Shape::kM - 1 - m) : m;
-  
-            MatrixCoord mn(m_serpentine, n);
-            MatrixCoord mk(m_serpentine, k);
-            MatrixCoord kn(k, n);
-  
-            Array<ElementC, 1> d;
-            Array<ElementA, 1> a;
-            Array<ElementB, 1> b;
-  
-            d[0] = d_ref.at(mn);
-            a[0] = a_ref.at(mk);
-            b[0] = b_ref.at(kn);
-  
-            mma_op(d, a, b, d);
-  
-            d_ref.at(mn) = d[0];
-          }
-        }
-      }
-    }
-  }
-};
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace detail {
-
-/// Matrix multiply-add operation - assumes operand B is not changing
-struct MmaComplexF32_Column {
-
-  using Shape = gemm::GemmShape<1, 1, 1>;
-  using ElementC = complex<float>;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    Array<complex<float>, 1> &d,
-    Array<complex<float>, 1> const &a,
-    Array<complex<float>, 1> const &b,
-    Array<complex<float>, 1> const &c
-  ) {
-
-    d[0].real() =  a[0].real() * b[0].real() + c[0].real();
-    d[0].imag() =  a[0].real() * b[0].imag() + d[0].imag();
-    d[0].real() = -a[0].imag() * b[0].imag() + d[0].real();
-    d[0].imag() =  a[0].imag() * b[0].real() + c[0].imag();
-  }
-};
-
-/// Matrix multiply-add operation - assumes operand A is not changing
-struct MmaComplexF32_Corner {
-
-  using Shape = gemm::GemmShape<1, 1, 1>;
-  using ElementC = complex<float>;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    Array<complex<float>, 1> &d,
-    Array<complex<float>, 1> const &a,
-    Array<complex<float>, 1> const &b,
-    Array<complex<float>, 1> const &c
-  ) {
-
-    d[0].real() = -a[0].imag() * b[0].imag() + d[0].real();
-    d[0].imag() =  a[0].real() * b[0].imag() + d[0].imag();
-    d[0].real() =  a[0].real() * b[0].real() + c[0].real();
-    d[0].imag() =  a[0].imag() * b[0].real() + c[0].imag();
-  }
-};
-
-} // namespace detail
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Gemplate that handles all packed matrix layouts
-template <
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  typename Shape_,
-  /// Layout of A matrix (concept: layout::MapFunc)
-  typename LayoutA_,
-  /// Layout of B matrix (concept: layout::MapFunc)
-  typename LayoutB_,
-  /// Layout of C matrix (concept: layout::MapFunc)
-  typename LayoutC_
->
-struct MmaGeneric<
-  Shape_,
-  complex<float>,
-  LayoutA_,
-  complex<float>,
-  LayoutB_,
-  complex<float>,
-  LayoutC_,
-  arch::OpMultiplyAdd> {
-
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-
-  /// Data type of operand A
-  using ElementA = complex<float>;
-
-  /// Layout of A matrix (concept: layout::MapFunc)
-  using LayoutA = LayoutA_;
-
-  /// Data type of operand B
-  using ElementB = complex<float>;
-
-  /// Layout of B matrix (concept: layout::MapFunc)
-  using LayoutB = LayoutB_;
-
-  /// Element type of operand C
-  using ElementC = complex<float>;
-
-  /// Layout of C matrix (concept: layout::MapFunc)
-  using LayoutC = LayoutC_;
-
-  /// Underlying mathematical operator
-  using Operator = arch::OpMultiplyAdd;
-
-  /// A operand storage
-  using FragmentA = Array<ElementA, Shape::kMK>;
-
-  /// B operand storage
-  using FragmentB = Array<ElementB, Shape::kKN>;
-
-  /// C operand storage
-  using FragmentC = Array<ElementC, Shape::kMN>;
-
-  /// Instruction
-  using MmaOp = arch::Mma<
-    gemm::GemmShape<1,1,1>,
-    1,
-    ElementA, LayoutA,
-    ElementB, LayoutB,
-    ElementC, LayoutC,
-    Operator>;
-
-  //
-  // Methods
-  //
-
-  /// Computes a matrix product D = A * B + C
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC & D,
-    FragmentA const & A,
-    FragmentB const & B,
-    FragmentC const & C) {
-
-    TensorRef<ElementA const, LayoutA> a_ref(
-      reinterpret_cast<ElementA const *>(&A), LayoutA::packed({Shape::kM, Shape::kK}));
-
-    TensorRef<ElementB const, LayoutB> b_ref(
-      reinterpret_cast<ElementB const *>(&B), LayoutB::packed({Shape::kK, Shape::kN}));
-
-    TensorRef<ElementC, LayoutC> d_ref(
-      reinterpret_cast<ElementC *>(&D), LayoutC::packed(make_Coord(Shape::kM, Shape::kN)));
-
-    detail::MmaComplexF32_Column mma_column;
-    detail::MmaComplexF32_Corner mma_corner;
-
-    // Copy accumulators
-    D = C;
-
-    // Compute matrix product
-    CUTLASS_PRAGMA_UNROLL
-    for (int k = 0; k < Shape::kK; ++k) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < Shape::kN; ++n) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int m = 0; m < Shape::kM; ++m) {
-
-          int m_serpentine = (n % 2) ? (Shape::kM - 1 - m) : m;
-
-          MatrixCoord mn(m_serpentine, n);
-          MatrixCoord mk(m_serpentine, k);
-          MatrixCoord kn(k, n);
-
-          Array<ElementC, 1> d;
-          Array<ElementA, 1> a;
-          Array<ElementB, 1> b;
-
-          d[0] = d_ref.at(mn);
-          a[0] = a_ref.at(mk);
-          b[0] = b_ref.at(kn);
-
-          if ((m == 0 && n) || m == Shape::kM - 1) {
-            mma_corner(d, a, b, d);
-          }
-          else {
-            mma_column(d, a, b, d);
-          }
-
-          d_ref.at(mn) = d[0];
-        }
-      }
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Gemplate that handles conventional layouts for FFMA and DFMA GEMM
-template <
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  typename Shape_,
-  /// Data type of A elements
-  typename ElementA_,
-  /// Layout of A matrix (concept: layout::MapFunc)
-  typename LayoutA_,
-  /// Data type of B elements
-  typename ElementB_,
-  /// Layout of B matrix (concept: layout::MapFunc)
-  typename LayoutB_,
-  /// Element type of C matrix
-  typename ElementC_,
-  /// Layout of C matrix (concept: layout::MapFunc)
-  typename LayoutC_
->
-struct Mma<
-  Shape_,
-  ElementA_,
-  LayoutA_,
-  ElementB_,
-  LayoutB_,
-  ElementC_,
-  LayoutC_,
-  arch::OpMultiplyAdd,
-  bool> {
-
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-
-  /// Data type of operand A
-  using ElementA = ElementA_;
-
-  /// Layout of A matrix (concept: layout::MapFunc)
-  using LayoutA = LayoutA_;
-
-  /// Data type of operand B
-  using ElementB = ElementB_;
-
-  /// Layout of B matrix (concept: layout::MapFunc)
-  using LayoutB = LayoutB_;
-
-  /// Element type of operand C
-  using ElementC = ElementC_;
-
-  /// Layout of C matrix (concept: layout::MapFunc)
-  using LayoutC = LayoutC_;
-
-  /// Underlying mathematical operator
-  using Operator = arch::OpMultiplyAdd;
-
-  /// A operand storage
-  using FragmentA = Array<ElementA, Shape::kMK>;
-
-  /// B operand storage
-  using FragmentB = Array<ElementB, Shape::kKN>;
-
-  /// C operand storage
-  using FragmentC = Array<ElementC, Shape::kMN>;
-
-  /// Underlying matrix multiply operator (concept: arch::Mma)
-  using ArchMmaOperator = typename MmaGeneric<
-                                    Shape,
-                                    ElementA,
-                                    LayoutA,
-                                    ElementB,
-                                    LayoutB,
-                                    ElementC,
-                                    LayoutC,
-                                    Operator>::MmaOp;
-  //
-  // Methods
-  //
-
-  /// Computes a matrix product D = A * B + C
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC & D,
-    FragmentA const & A,
-    FragmentB const & B,
-    FragmentC const & C) {
-
-    MmaGeneric<
-      Shape,
-      ElementA,
-      LayoutA,
-      ElementB,
-      LayoutB,
-      ElementC,
-      LayoutC,
-      Operator> mma;
-
-    mma(D, A, B, C);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace thread
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/thread/mma_sm60.h b/lightllm-kernel/cutlass/include/cutlass/gemm/thread/mma_sm60.h
deleted file mode 100755
index 5e2178982..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/thread/mma_sm60.h
+++ /dev/null
@@ -1,1161 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates exposing architecture support for multiply-add operations
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/thread/mma.h"
-#include "cutlass/functional.h"
-#include "cutlass/reduction/thread/reduce.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace thread {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace detail {
-
-/// Structure to compute the matrix product for HFMA
-template <
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  typename Shape,
-
-  /// Layout of A matrix (concept: MatrixLayout)
-  typename LayoutA,
-
-  /// Layout of B matrix (concept: MatrixLayout)
-  typename LayoutB,
-
-  /// Layout of C matrix (concept: MatrixLayout)
-  typename LayoutC,
-
-  /// Type of GEMM inner vs outer product
-  bool
->
-struct Mma_HFMA2;
-
-
-/////////////////////////////
-// Specialization for NNN  //
-/////////////////////////////
-
-template <typename Shape_>
-struct Mma_HFMA2 <
-  Shape_,
-  layout::ColumnMajor,
-  layout::ColumnMajor,
-  layout::ColumnMajor,
-  true
-  > {
-
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-
-   /// A operand storage
-  using FragmentA = Array<half_t, Shape::kMK>;
-
-  /// B operand storage
-  using FragmentB = Array<half_t, Shape::kKN>;
-
-  /// C operand storage
-  using FragmentC = Array<half_t, Shape::kMN>;
-
-  /// Underlying mathematical operator
-  using Operator = arch::OpMultiplyAdd;
-
-  static_assert(
-    !(Shape::kM % 2),
-    "Mma_HFMA2 requires the M dimension to be divisible by 2."
-  );
-
-  //
-  // Methods
-  //
-
-  /// Computes a matrix product D = A * B + C
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC & D,
-    FragmentA const & A,
-    FragmentB const & B,
-    FragmentC const & C) {
-
-    /// Initialize output with input
-    D = C;
-
-    /// Use 1x1x1 HFMA2 sequence for bulk of computation
-    using Mma = arch::Mma<
-      gemm::GemmShape<2,1,1>,
-      1,
-      half_t,
-      layout::ColumnMajor,
-      half_t,
-      layout::ColumnMajor,
-      half_t,
-      layout::ColumnMajor,
-      arch::OpMultiplyAdd>;
-
-    Array<half_t, 2> *ptr_D = reinterpret_cast<Array<half_t, 2> *>(&D);
-    Array<half_t, 2> const *ptr_A = reinterpret_cast<Array<half_t, 2> const *>(&A);
-    Array<half_t, 1> const *ptr_B = reinterpret_cast<Array<half_t, 1> const *>(&B);
-
-    Mma mma;
-
-    CUTLASS_PRAGMA_UNROLL
-    for(auto k=0; k <  Shape::kK / Mma::Shape::kK; k++){
-
-      CUTLASS_PRAGMA_UNROLL
-      for(auto m=0; m < Shape::kM / Mma::Shape::kM; m++){
-
-        CUTLASS_PRAGMA_UNROLL
-        for(auto n=0; n < Shape::kN / Mma::Shape::kN; n++){
-
-            Array<half_t, 2> tmp { ptr_D[n*Shape::kM/2 + m] };
-
-            mma(
-                tmp,
-                ptr_A[k*Shape::kM/2 + m],
-                ptr_B[n*Shape::kK + k],
-                tmp);
-
-            ptr_D[n*Shape::kM/2 + m] = tmp;
-        }
-      }
-    }
-  }
-};
-
-/////////////////////////////
-// Specialization for NNT  //
-/////////////////////////////
-
-template <typename Shape_>
-struct Mma_HFMA2<
-  Shape_,
-  layout::ColumnMajor,
-  layout::ColumnMajor,
-  layout::RowMajor,
-  true
-  > {
-
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-
-   /// A operand storage
-  using FragmentA = Array<half_t, Shape::kMK>;
-
-  /// B operand storage
-  using FragmentB = Array<half_t, Shape::kKN>;
-
-  /// C operand storage
-  using FragmentC = Array<half_t, Shape::kMN>;
-
-  /// Underlying mathematical operator
-  using Operator = arch::OpMultiplyAdd;
-
-  static_assert(
-    !(Shape::kN % 2),
-    "Mma_HFMA2 requires the N dimension to be divisible by 2."
-  );
-
-  //
-  // Methods
-  //
-
-  /// Computes a matrix product D = A * B + C
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC & D,
-    FragmentA const & A,
-    FragmentB const & B,
-    FragmentC const & C) {
-
-    /// Initialize output with input
-    D = C;
-
-    /// Use 1x2x1 HFMA2 sequence for bulk of computation
-    using Mma = arch::Mma<
-      gemm::GemmShape<1,2,1>,
-      1,
-      half_t,
-      layout::ColumnMajor,
-      half_t,
-      layout::ColumnMajor,
-      half_t,
-      layout::RowMajor,
-      arch::OpMultiplyAdd>;
-
-    Array<half_t, 2> *ptr_D = reinterpret_cast<Array<half_t, 2> *>(&D);
-    Array<half_t, 1> const *ptr_A = reinterpret_cast<Array<half_t, 1> const *>(&A);
-    Array<half_t, 2> const *ptr_B = reinterpret_cast<Array<half_t, 2> const *>(&B);
-
-    Mma mma;
-
-    CUTLASS_PRAGMA_UNROLL
-    for(auto k=0; k <  Shape::kK / Mma::Shape::kK; k++){
-
-        CUTLASS_PRAGMA_UNROLL
-        for(auto n=0; n < Shape::kN / Mma::Shape::kN; n++){
-
-          CUTLASS_PRAGMA_UNROLL
-          for(auto m=0; m < Shape::kM / Mma::Shape::kM; m++){
-
-            Array<half_t, 2> tmp { ptr_D[m*Shape::kN/2 + n] };
-
-            Array<half_t, 2> tmp_B;
-            tmp_B[0] = ptr_B->at(2*n*Shape::kK + k);
-            tmp_B[1] = ptr_B->at((2*n+1)*Shape::kK + k);
-
-            mma(
-                tmp,
-                ptr_A[k*Shape::kM + m],
-                tmp_B,
-                tmp);
-
-            ptr_D[m*Shape::kN/2 + n] = tmp;
-        }
-      }
-    }
-  }
-};
-
-
-/////////////////////////////
-// Specialization for NTN  //
-/////////////////////////////
-
-template <typename Shape_>
-struct Mma_HFMA2 <
-  Shape_,
-  layout::ColumnMajor,
-  layout::RowMajor,
-  layout::ColumnMajor,
-  true
-  > {
-
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-
-  /// A operand storage
-  using FragmentA = Array<half_t, Shape::kMK>;
-
-  /// B operand storage
-  using FragmentB = Array<half_t, Shape::kKN>;
-
-  /// C operand storage
-  using FragmentC = Array<half_t, Shape::kMN>;
-
-  /// Underlying mathematical operator
-  using Operator = arch::OpMultiplyAdd;
-
-  static_assert(
-    !(Shape::kM % 2),
-    "Mma_HFMA2 requires the GEMM M dimension to be divisible by 2."
-  );
-
-  //
-  // Methods
-  //
-
-  /// Computes a matrix product D = A * B + C
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC & D,
-    FragmentA const & A,
-    FragmentB const & B,
-    FragmentC const & C) {
-
-    /// Initialize output with input
-    D = C;
-
-    using Mma = arch::Mma<
-      gemm::GemmShape<2,1,1>,
-      1,
-      half_t,
-      layout::ColumnMajor,
-      half_t,
-      layout::RowMajor,
-      half_t,
-      layout::ColumnMajor,
-      arch::OpMultiplyAdd>;
-
-    Array<half_t, 2> *ptr_D = reinterpret_cast<Array<half_t, 2> *>(&D);
-    Array<half_t, 2> const *ptr_A = reinterpret_cast<Array<half_t, 2> const *>(&A);
-    Array<half_t, 1> const *ptr_B = reinterpret_cast<Array<half_t, 1> const *>(&B);
-
-    Mma mma;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int k = 0; k < Shape::kK / Mma::Shape::kK; ++k) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int m = 0; m < Shape::kM / Mma::Shape::kM; ++m) {
-
-          CUTLASS_PRAGMA_UNROLL
-          for (int n = 0; n < Shape::kN / Mma::Shape::kN; ++n) {
-
-          Array<half_t, 2> tmp { ptr_D[m + n * Shape::kM/2] };
-
-          mma(
-            tmp,
-            ptr_A[m + k * Shape::kM/2],
-            ptr_B[k * Shape::kN + n],
-            tmp);
-
-          ptr_D[m + n * Shape::kM/2] = tmp;
-        }
-      }
-    }
-  }
-};
-
-/////////////////////////////
-// Specialization for NTT  //
-/////////////////////////////
-
-template <typename Shape_>
-struct Mma_HFMA2<
-  Shape_,
-  layout::ColumnMajor,
-  layout::RowMajor,
-  layout::RowMajor,
-  true
-  > {
-
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-
-  /// A operand storage
-  using FragmentA = Array<half_t, Shape::kMK>;
-
-  /// B operand storage
-  using FragmentB = Array<half_t, Shape::kKN>;
-
-  /// C operand storage
-  using FragmentC = Array<half_t, Shape::kMN>;
-
-  /// Underlying mathematical operator
-  using Operator = arch::OpMultiplyAdd;
-
-  static_assert(
-    !(Shape::kN % 2),
-    "Mma_HFMA2 requires the N dimension to be divisible by 2."
-  );
-
-  //
-  // Methods
-  //
-
-  /// Computes a matrix product D = A * B + C
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC & D,
-    FragmentA const & A,
-    FragmentB const & B,
-    FragmentC const & C) {
-
-    /// Initialize output with input
-    D = C;
-
-    /// Use 1x2x1 HFMA2 sequence for bulk of computation
-    using Mma = arch::Mma<
-      gemm::GemmShape<1,2,1>,
-      1,
-      half_t,
-      layout::ColumnMajor,
-      half_t,
-      layout::RowMajor,
-      half_t,
-      layout::RowMajor,
-      arch::OpMultiplyAdd>;
-
-    Array<half_t, 2> *ptr_D = reinterpret_cast<Array<half_t, 2> *>(&D);
-    Array<half_t, 1> const *ptr_A = reinterpret_cast<Array<half_t, 1> const *>(&A);
-    Array<half_t, 2> const *ptr_B = reinterpret_cast<Array<half_t, 2> const *>(&B);
-
-    Mma mma;
-
-    CUTLASS_PRAGMA_UNROLL
-    for(auto k=0; k <  Shape::kK / Mma::Shape::kK; k++){
-
-        CUTLASS_PRAGMA_UNROLL
-        for(auto n=0; n < Shape::kN / Mma::Shape::kN; n++){
-
-          CUTLASS_PRAGMA_UNROLL
-          for(auto m=0; m < Shape::kM / Mma::Shape::kM; m++){
-
-            Array<half_t, 2> tmp { ptr_D[m*Shape::kN/2 + n] };
-
-            mma(
-                tmp,
-                ptr_A[k*Shape::kM + m],
-                ptr_B[k*Shape::kN/2 + n],
-                tmp);
-
-            ptr_D[m*Shape::kN/2 + n] = tmp;
-        }
-      }
-    }
-  }
-};
-
-
-/////////////////////////////
-// Specialization for TNN  //
-/////////////////////////////
-
-template <typename Shape_>
-struct Mma_HFMA2 <
-  Shape_,
-  layout::RowMajor,
-  layout::ColumnMajor,
-  layout::ColumnMajor,
-  true
-  > {
-
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-
-  /// A operand storage
-  using FragmentA = Array<half_t, Shape::kMK>;
-
-  /// B operand storage
-  using FragmentB = Array<half_t, Shape::kKN>;
-
-  /// C operand storage
-  using FragmentC = Array<half_t, Shape::kMN>;
-
-  /// Underlying mathematical operator
-  using Operator = arch::OpMultiplyAdd;
-
-  static_assert(
-    !(Shape::kM % 2),
-    "Mma_HFMA2 requires the M dimension to be divisible by 2."
-  );
-
-  //
-  // Methods
-  //
-
-  /// Computes a matrix product D = A * B + C
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC & D,
-    FragmentA const & A,
-    FragmentB const & B,
-    FragmentC const & C) {
-
-    /// Initialize output with input
-    D = C;
-
-    /// Use 1x1x1 HFMA2 sequence for bulk of computation
-    using Mma = arch::Mma<
-      gemm::GemmShape<2,1,1>,
-      1,
-      half_t,
-      layout::RowMajor,
-      half_t,
-      layout::ColumnMajor,
-      half_t,
-      layout::ColumnMajor,
-      arch::OpMultiplyAdd>;
-
-    Array<half_t, 2> *ptr_D = reinterpret_cast<Array<half_t, 2> *>(&D);
-    Array<half_t, 2> const *ptr_A = reinterpret_cast<Array<half_t, 2> const *>(&A);
-    Array<half_t, 1> const *ptr_B = reinterpret_cast<Array<half_t, 1> const *>(&B);
-
-    Mma mma;
-
-    CUTLASS_PRAGMA_UNROLL
-    for(auto k=0; k <  Shape::kK / Mma::Shape::kK; k++){
-
-      CUTLASS_PRAGMA_UNROLL
-      for(auto m=0; m < Shape::kM / Mma::Shape::kM; m++){
-
-        CUTLASS_PRAGMA_UNROLL
-        for(auto n=0; n < Shape::kN / Mma::Shape::kN; n++){
-
-            Array<half_t, 2> tmp { ptr_D[n*Shape::kM/2 + m] };
-
-            Array<half_t, 2> tmp_A;
-            tmp_A[0] = ptr_A->at(2*m*Shape::kK + k);
-            tmp_A[1] = ptr_A->at((2*m+1)*Shape::kK + k);
-
-            mma(
-                tmp,
-                tmp_A,
-                ptr_B[n*Shape::kK + k],
-                tmp);
-
-            ptr_D[n*Shape::kM/2 + m] = tmp;
-        }
-      }
-    }
-  }
-};
-
-/////////////////////////////
-// Specialization for TNT  //
-/////////////////////////////
-
-template <typename Shape_>
-struct Mma_HFMA2 <
-  Shape_,
-  layout::RowMajor,
-  layout::ColumnMajor,
-  layout::RowMajor,
-  true
-  > {
-
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-
-   /// A operand storage
-  using FragmentA = Array<half_t, Shape::kMK>;
-
-  /// B operand storage
-  using FragmentB = Array<half_t, Shape::kKN>;
-
-  /// C operand storage
-  using FragmentC = Array<half_t, Shape::kMN>;
-
-  /// Underlying mathematical operator
-  using Operator = arch::OpMultiplyAdd;
-
-  static_assert(
-    !(Shape::kN % 2),
-    "Mma_HFMA2 requires the N dimension to be divisible by 2."
-  );
-
-  //
-  // Methods
-  //
-
-  /// Computes a matrix product D = A * B + C
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC & D,
-    FragmentA const & A,
-    FragmentB const & B,
-    FragmentC const & C) {
-
-    /// Initialize output with input
-    D = C;
-
-    /// Use 1x2x1 HFMA2 sequence for bulk of computation
-    using Mma = arch::Mma<
-      gemm::GemmShape<1,2,1>,
-      1,
-      half_t,
-      layout::RowMajor,
-      half_t,
-      layout::ColumnMajor,
-      half_t,
-      layout::RowMajor,
-      arch::OpMultiplyAdd>;
-
-    Array<half_t, 2> *ptr_D = reinterpret_cast<Array<half_t, 2> *>(&D);
-    Array<half_t, 1> const *ptr_A = reinterpret_cast<Array<half_t, 1> const *>(&A);
-    Array<half_t, 2> const *ptr_B = reinterpret_cast<Array<half_t, 2> const *>(&B);
-
-    Mma mma;
-
-    CUTLASS_PRAGMA_UNROLL
-    for(auto k=0; k <  Shape::kK / Mma::Shape::kK; k++){
-
-        CUTLASS_PRAGMA_UNROLL
-        for(auto n=0; n < Shape::kN / Mma::Shape::kN; n++){
-
-          CUTLASS_PRAGMA_UNROLL
-          for(auto m=0; m < Shape::kM / Mma::Shape::kM; m++){
-
-            Array<half_t, 2> tmp { ptr_D[m*Shape::kN/2 + n] };
-
-            Array<half_t, 2> tmp_B;
-            tmp_B[0] = ptr_B->at(2*n*Shape::kK + k);
-            tmp_B[1] = ptr_B->at((2*n+1)*Shape::kK + k);
-
-            mma(
-                tmp,
-                ptr_A[m*Shape::kK + k],
-                tmp_B,
-                tmp);
-
-            ptr_D[m*Shape::kN/2 + n] = tmp;
-        }
-      }
-    }
-  }
-};
-
-/////////////////////////////
-// Specialization for TTN  //
-/////////////////////////////
-
-template <typename Shape_>
-struct Mma_HFMA2 <
-  Shape_,
-  layout::RowMajor,
-  layout::RowMajor,
-  layout::ColumnMajor,
-  true
-  > {
-
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-
-   /// A operand storage
-  using FragmentA = Array<half_t, Shape::kMK>;
-
-  /// B operand storage
-  using FragmentB = Array<half_t, Shape::kKN>;
-
-  /// C operand storage
-  using FragmentC = Array<half_t, Shape::kMN>;
-
-  /// Underlying mathematical operator
-  using Operator = arch::OpMultiplyAdd;
-
-  static_assert(
-    !(Shape::kM % 2),
-    "Mma_HFMA2 requires the M dimension to be divisible by 2."
-  );
-
-  //
-  // Methods
-  //
-
-  /// Computes a matrix product D = A * B + C
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC & D,
-    FragmentA const & A,
-    FragmentB const & B,
-    FragmentC const & C) {
-
-    /// Initialize output with input
-    D = C;
-
-    /// Use 1x2x1 HFMA2 sequence for bulk of computation
-    using Mma = arch::Mma<
-      gemm::GemmShape<2,1,1>,
-      1,
-      half_t,
-      layout::RowMajor,
-      half_t,
-      layout::RowMajor,
-      half_t,
-      layout::ColumnMajor,
-      arch::OpMultiplyAdd>;
-
-    Array<half_t, 2> *ptr_D = reinterpret_cast<Array<half_t, 2> *>(&D);
-    Array<half_t, 2> const *ptr_A = reinterpret_cast<Array<half_t, 2> const *>(&A);
-    Array<half_t, 1> const *ptr_B = reinterpret_cast<Array<half_t, 1> const *>(&B);
-
-    Mma mma;
-
-    CUTLASS_PRAGMA_UNROLL
-    for(auto k=0; k <  Shape::kK / Mma::Shape::kK; k++){
-
-      CUTLASS_PRAGMA_UNROLL
-      for(auto m=0; m < Shape::kM / Mma::Shape::kM; m++){
-
-        CUTLASS_PRAGMA_UNROLL
-        for(auto n=0; n < Shape::kN / Mma::Shape::kN; n++){
-
-            Array<half_t, 2> tmp { ptr_D[n*Shape::kM/2 + m] };
-
-            Array<half_t, 2> tmp_A;
-            tmp_A[0] = ptr_A->at(2*m*Shape::kK + k);
-            tmp_A[1] = ptr_A->at((2*m+1)*Shape::kK + k);
-
-            mma(
-                tmp,
-                tmp_A,
-                ptr_B[k*Shape::kN + n],
-                tmp);
-
-            ptr_D[n*Shape::kM/2 + m] = tmp;
-        }
-      }
-    }
-  }
-};
-
-
-/////////////////////////////
-// Specialization for TTT  //
-/////////////////////////////
-
-template <typename Shape_>
-struct Mma_HFMA2<
-  Shape_,
-  layout::RowMajor,
-  layout::RowMajor,
-  layout::RowMajor,
-  true
-  > {
-
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-
-  /// A operand storage
-  using FragmentA = Array<half_t, Shape::kMK>;
-
-  /// B operand storage
-  using FragmentB = Array<half_t, Shape::kKN>;
-
-  /// C operand storage
-  using FragmentC = Array<half_t, Shape::kMN>;
-
-  /// Underlying mathematical operator
-  using Operator = arch::OpMultiplyAdd;
-
-  static_assert(
-    !(Shape::kN % 2),
-    "Mma_HFMA2 requires the N dimension to be divisible by 2."
-  );
-
-  //
-  // Methods
-  //
-
-  /// Computes a matrix product D = A * B + C
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC & D,
-    FragmentA const & A,
-    FragmentB const & B,
-    FragmentC const & C) {
-
-    /// Initialize output with input
-    D = C;
-
-    /// Use 1x2x1 HFMA2 sequence for bulk of computation
-    using Mma = arch::Mma<
-      gemm::GemmShape<1,2,1>,
-      1,
-      half_t,
-      layout::RowMajor,
-      half_t,
-      layout::RowMajor,
-      half_t,
-      layout::RowMajor,
-      arch::OpMultiplyAdd>;
-
-    Array<half_t, 2> *ptr_D = reinterpret_cast<Array<half_t, 2> *>(&D);
-    Array<half_t, 1> const *ptr_A = reinterpret_cast<Array<half_t, 1> const *>(&A);
-    Array<half_t, 2> const *ptr_B = reinterpret_cast<Array<half_t, 2> const *>(&B);
-
-    Mma mma;
-
-    CUTLASS_PRAGMA_UNROLL
-    for(auto k=0; k <  Shape::kK / Mma::Shape::kK; k++){
-
-        CUTLASS_PRAGMA_UNROLL
-        for(auto n=0; n < Shape::kN / Mma::Shape::kN; n++){
-
-          CUTLASS_PRAGMA_UNROLL
-          for(auto m=0; m < Shape::kM / Mma::Shape::kM; m++){
-
-            Array<half_t, 2> tmp { ptr_D[m*Shape::kN/2 + n] };
-
-            mma(
-                tmp,
-                ptr_A[m*Shape::kK + k],
-                ptr_B[k*Shape::kN/2 + n],
-                tmp);
-
-            ptr_D[m*Shape::kN/2 + n] = tmp;
-        }
-      }
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////
-// Specialization for TNT + Inner Product  or 1x1x2K + LayoutC = T //
-/////////////////////////////////////////////////////////////////////
-
-template <typename Shape_, typename LayoutA, typename LayoutB>
-struct Mma_HFMA2<
-  Shape_,
-  LayoutA,
-  LayoutB,
-  layout::RowMajor,
-  false
-  > {
-
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-
-  /// A operand storage
-  using FragmentA = Array<half_t, Shape::kMK>;
-
-  /// B operand storage
-  using FragmentB = Array<half_t, Shape::kKN>;
-
-  /// C operand storage
-  using FragmentC = Array<half_t, Shape::kMN>;
-
-  /// Underlying mathematical operator
-  using Operator = arch::OpMultiplyAdd;
-
-  static_assert(
-    !(Shape::kK % 2),
-    "Mma_HFMA2 requires the K dimension to be divisible by 2."
-  );
-
-  //
-  // Methods
-  //
-
-  /// Computes a matrix product D = A * B + C
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC & D,
-    FragmentA const & A,
-    FragmentB const & B,
-    FragmentC const & C) {
-
-    /// Initialize output with input
-    D = C;
-
-    /// Use 1x1x2 HFMA2 sequence for bulk of computation
-    using GemmShape = gemm::GemmShape<1,1,2>;
-
-    Array<half_t, 1> *ptr_D = reinterpret_cast<Array<half_t, 1> *>(&D);
-    Array<half_t, 2> const *ptr_A = reinterpret_cast<Array<half_t, 2> const *>(&A);
-    Array<half_t, 2> const *ptr_B = reinterpret_cast<Array<half_t, 2> const *>(&B);
-
-    // Inner product is calculated using MACs, followed by final reduction
-    multiply_add<Array<half_t, 2>> mac;
-    cutlass::reduction::thread::Reduce< plus<half_t>, Array<half_t, 2> > reduce;
-
-    CUTLASS_PRAGMA_UNROLL
-    for(auto n=0; n < Shape::kN / GemmShape::kN; n++){ 
-
-      CUTLASS_PRAGMA_UNROLL
-      for(auto m=0; m < Shape::kM / GemmShape::kM; m++){
-
-        Array<half_t, 2> tmp_C;
-        tmp_C.clear();
-        Array<half_t, 1> *ptr_tmp_C = reinterpret_cast<Array<half_t, 1> *>(&tmp_C);
-        ptr_tmp_C[0] = ptr_D[n*Shape::kM + m];
-
-        CUTLASS_PRAGMA_UNROLL
-        for(auto k=0; k <  Shape::kK / GemmShape::kK; k++){ 
-          tmp_C = mac(ptr_A[m*Shape::kK/2 + k], ptr_B[n*Shape::kK/2 + k], tmp_C);
-        }
-
-        Array<half_t, 1> res;
-        Array<half_t, 1> *ptr_res = &res;
-        res = reduce(tmp_C);
-
-        ptr_D[m*Shape::kN + n] = ptr_res[0];
-      }
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////
-// Specialization for TNN + Inner Product  or 1x1x2K + LayoutC = N //
-/////////////////////////////////////////////////////////////////////
-
-template <typename Shape_, typename LayoutA, typename LayoutB>
-struct Mma_HFMA2<
-  Shape_,
-  LayoutA,
-  LayoutB,
-  layout::ColumnMajor,
-  false
-  > {
-
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-
-  /// A operand storage
-  using FragmentA = Array<half_t, Shape::kMK>;
-
-  /// B operand storage
-  using FragmentB = Array<half_t, Shape::kKN>;
-
-  /// C operand storage
-  using FragmentC = Array<half_t, Shape::kMN>;
-
-  /// Underlying mathematical operator
-  using Operator = arch::OpMultiplyAdd;
-
-  static_assert(
-    !(Shape::kK % 2),
-    "Mma_HFMA2 requires the K dimension to be divisible by 2."
-  );
-
-  //
-  // Methods
-  //
-
-  /// Computes a matrix product D = A * B + C
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC & D,
-    FragmentA const & A,
-    FragmentB const & B,
-    FragmentC const & C) {
-
-    /// Initialize output with input
-    D = C;
-
-    /// Use 1x1x2 HFMA2 sequence for bulk of computation
-    using GemmShape= gemm::GemmShape<1,1,2>;
-
-    Array<half_t, 1> *ptr_D = reinterpret_cast<Array<half_t, 1> *>(&D);
-    Array<half_t, 2> const *ptr_A = reinterpret_cast<Array<half_t, 2> const *>(&A);
-    Array<half_t, 2> const *ptr_B = reinterpret_cast<Array<half_t, 2> const *>(&B);
-
-    // Inner product is calculated using MACs, followed by final reduction
-    multiply_add<Array<half_t, 2>> mac;
-    cutlass::reduction::thread::Reduce< plus<half_t>, Array<half_t, 2> > reduce;
-
-    CUTLASS_PRAGMA_UNROLL
-    for(auto n=0; n < Shape::kN / GemmShape::kN; n++){ 
-
-      CUTLASS_PRAGMA_UNROLL
-      for(auto m=0; m < Shape::kM / GemmShape::kM; m++){
-
-        Array<half_t, 2> tmp_C;
-        tmp_C.clear();
-        Array<half_t, 1> *ptr_tmp_C = reinterpret_cast<Array<half_t, 1> *>(&tmp_C);
-        ptr_tmp_C[0] = ptr_D[n*Shape::kM + m];
-
-        CUTLASS_PRAGMA_UNROLL
-        for(auto k=0; k <  Shape::kK / GemmShape::kK; k++){ 
-
-          tmp_C = mac(ptr_A[m*Shape::kK/2 + k], ptr_B[n*Shape::kK/2 + k], tmp_C);
-
-        }
-
-        Array<half_t, 1> res;
-        Array<half_t, 1> *ptr_res = &res;
-        res = reduce(tmp_C);
-
-        ptr_D[n*Shape::kM + m] = ptr_res[0];
-      }
-    }
-  }
-};
-
-} // namespace detail
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Structure to compute the matrix product
-template <
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  typename Shape_, typename LayoutA, typename LayoutB, typename LayoutC
->
-struct Mma<
-  Shape_,
-  half_t,
-  LayoutA,
-  half_t,
-  LayoutB,
-  half_t,
-  LayoutC,
-  arch::OpMultiplyAdd
-  > {
-
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-
-  /// Data type of operand A
-  using ElementA = half_t;
-
-  /// Data type of operand B
-  using ElementB = half_t;
-
-  /// Element type of operand C
-  using ElementC = half_t;
-
-  /// Underlying mathematical operator
-  using Operator = arch::OpMultiplyAdd;
-
-  /// A operand storage
-  using FragmentA = Array<ElementA, Shape::kMK>;
-
-  /// B operand storage
-  using FragmentB = Array<ElementB, Shape::kKN>;
-
-  /// C operand storage
-  using FragmentC = Array<ElementC, Shape::kMN>;
-
-  static bool const a_row_major = platform::is_same< LayoutA, layout::RowMajor>::value;
-  static bool const b_column_major = platform::is_same< LayoutB, layout::ColumnMajor>::value;
-  static bool const c_row_major = platform::is_same< LayoutC, layout::RowMajor>::value;
-  static bool const c_column_major = platform::is_same< LayoutC, layout::ColumnMajor>::value;
-
-  static bool const m_mod2 = !(Shape::kM % 2);
-  static bool const n_mod2 = !(Shape::kN % 2);
-  static bool const k_mod2 = !(Shape::kK % 2);
-
-  // HFMA based MMA optimizations are of 2 types :
-  // 1. Inner product 
-  // 2. Outer product
-  // It is chosen based on LayoutC (for outer product gemm) or
-  // Using LayoutA and LayoutB or shape=1x1x2K (for inner product gemms)
-  // If all fails, we choose the generic MMA
-  static bool const use_outer_prod = (c_column_major && m_mod2) || (c_row_major && n_mod2);
-  static bool const use_inner_prod = (a_row_major && b_column_major && k_mod2) || (Shape::kM==1 && Shape::kN==1 && k_mod2);
-  static bool const use_optimized =  (use_outer_prod || use_inner_prod);
-
-  using ArchMmaOperator = typename platform::conditional< use_optimized, 
-    detail::Mma_HFMA2<Shape, LayoutA, LayoutB, LayoutC, use_outer_prod>, 
-    MmaGeneric <Shape, ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, Operator> 
-  >::type;
-
-  //
-  // Methods
-  //
-
-  /// Computes a matrix product D = A * B + C
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC & D,
-    FragmentA const & A,
-    FragmentB const & B,
-    FragmentC const & C) {
-
-    ArchMmaOperator mma;
-
-    mma(D, A, B, C);
-
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace detail {
-
-  /// Determines whether to enable thread::Gemm<> specializations compatible with SM50
-  template <
-    typename LayoutA,
-    /// Layout of B matrix (concept: MatrixLayout)
-    typename LayoutB>
-  struct EnableMma_Crow_SM60 {
-
-    static bool const kIsConventionalLayout =
-      (platform::is_same<LayoutA, layout::RowMajor>::value ||
-        platform::is_same<LayoutA, layout::ColumnMajor>::value) &&
-      (platform::is_same<LayoutB, layout::RowMajor>::value ||
-        platform::is_same<LayoutB, layout::ColumnMajor>::value);
-
-    static bool const value = kIsConventionalLayout;
-  };
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Computes matrix product when C is row-major
-template <
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  typename Shape_,
-  typename LayoutA_,
-  typename LayoutB_
->
-struct Mma<
-  Shape_,
-  half_t,
-  LayoutA_,
-  half_t,
-  LayoutB_,
-  half_t,
-  layout::RowMajor,
-  arch::OpMultiplyAdd,
-  typename platform::enable_if<detail::EnableMma_Crow_SM60<
-    LayoutA_,
-    LayoutB_
-    >::value>::type>{
-
-  using Shape = Shape_;
-  using ElementA = half_t;
-  using LayoutA = LayoutA_;
-  using ElementB = half_t;
-  using LayoutB = LayoutB_;
-  using ElementC = half_t;
-  using LayoutC = layout::RowMajor;
-  using Operator = arch::OpMultiplyAdd;
-
-  using TransposeMma = Mma<
-    GemmShapeTranspose<Shape>,
-    half_t,
-    typename layout::LayoutTranspose<LayoutB>::type,
-    half_t,
-    typename layout::LayoutTranspose<LayoutA>::type,
-    half_t,
-    layout::ColumnMajor,
-    arch::OpMultiplyAdd,
-    bool>;
-
-  using FragmentA = Array<ElementA, Shape::kMK>;
-  using FragmentB = Array<ElementB, Shape::kKN>;
-  using FragmentC = Array<ElementC, Shape::kMN>;
-
-  using ArchMmaOperator = typename TransposeMma::ArchMmaOperator;
-
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC & D,
-    FragmentA const & A,
-    FragmentB const & B,
-    FragmentC const & C) {
-
-    TransposeMma mma;
-
-    mma(D, B, A, C);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace thread
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/thread/mma_sm61.h b/lightllm-kernel/cutlass/include/cutlass/gemm/thread/mma_sm61.h
deleted file mode 100755
index a1abb05f6..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/thread/mma_sm61.h
+++ /dev/null
@@ -1,284 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates exposing architecture support for multiply-add operations
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/thread/mma.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace thread {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Gemplate that handles conventional layouts for IDP4A
-template <
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  typename Shape_,
-  /// Layout of C matrix (concept: MatrixLayout)
-  typename LayoutC_
->
-struct Mma<
-  Shape_,
-  int8_t,
-  layout::RowMajor,
-  int8_t,
-  layout::ColumnMajor,
-  int32_t,
-  LayoutC_,
-  arch::OpMultiplyAdd,
-  bool> {
-
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-
-  /// Data type of operand A
-  using ElementA = int8_t;
-
-  /// Layout of A matrix (concept: layout::MapFunc)
-  using LayoutA = layout::RowMajor;
-
-  /// Data type of operand B
-  using ElementB = int8_t;
-
-  /// Layout of B matrix (concept: layout::MapFunc)
-  using LayoutB = layout::ColumnMajor;
-
-  /// Element type of operand C
-  using ElementC = int32_t;
-
-  /// Layout of C matrix (concept: layout::MapFunc)
-  using LayoutC = LayoutC_;
-
-  /// Underlying mathematical operator
-  using Operator = arch::OpMultiplyAdd;
-
-  /// A operand storage
-  using FragmentA = Array<ElementA, Shape::kMK>;
-
-  /// B operand storage
-  using FragmentB = Array<ElementB, Shape::kKN>;
-
-  /// C operand storage
-  using FragmentC = Array<ElementC, Shape::kMN>;
-
-  /// Underlying matrix multiply operator (concept: arch::Mma)
-  //  Use 1x1x4 IDP4A sequence for bulk of computation
-  using ArchMmaOperator = arch::Mma<
-      gemm::GemmShape<1,1,4>,
-      1,
-      ElementA,
-      LayoutA,
-      ElementB,
-      LayoutB,
-      ElementC,
-      LayoutC,
-      arch::OpMultiplyAdd>; 
-
-  //
-  // Methods
-  //
-
-  /// Computes a matrix product D = A * B + C
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC & D,
-    FragmentA const & A,
-    FragmentB const & B,
-    FragmentC const & C) {
-
-    TensorRef<ElementC, LayoutC> d(
-      reinterpret_cast<ElementC *>(&D), LayoutC::packed({ Shape::kM, Shape::kN }));
-    
-    // Copy accumulators
-    D = C;
-
-    /// Use 1x1x4 IDP4A sequence for bulk of computation
-    ArchMmaOperator mma;
-
-    // Compute matrix product
-    CUTLASS_PRAGMA_UNROLL
-    for (int k = 0; k < Shape::kK / ArchMmaOperator::Shape::kK; ++k) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < Shape::kN; ++n) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int m = 0; m < Shape::kM; ++m) {
-          MatrixCoord mn(m, n);
-
-          Array<int8_t, 4> const *ptr_A = reinterpret_cast<Array<int8_t, 4> const *>(&A);
-          Array<int8_t, 4> const *ptr_B = reinterpret_cast<Array<int8_t, 4> const *>(&B);
-
-          Array<int32_t, 1> tmp = reinterpret_cast<Array<int32_t, 1> &>(d.at(mn));
-
-          mma(
-            tmp,
-            ptr_A[m * Shape::kK / ArchMmaOperator::Shape::kK + k],
-            ptr_B[n * Shape::kK / ArchMmaOperator::Shape::kK + k],
-            tmp);
-
-          d.at(mn) = reinterpret_cast<int32_t &>(tmp);
-        }
-      }
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Gemplate that handles conventional layouts for IDP4A
-template <
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  typename Shape_,
-  /// Layout of C matrix (concept: MatrixLayout)
-  typename LayoutC_
->
-struct Mma<
-  Shape_,
-  int8_t,
-  layout::ColumnMajor,
-  int8_t,
-  layout::RowMajor,
-  int32_t,
-  LayoutC_,
-  arch::OpMultiplyAdd,
-  int8_t> {
-
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-
-  /// Data type of operand A
-  using ElementA = int8_t;
-
-  /// Layout of A matrix (concept: layout::MapFunc)
-  using LayoutA = layout::ColumnMajor;
-
-  /// Data type of operand B
-  using ElementB = int8_t;
-
-  /// Layout of B matrix (concept: layout::MapFunc)
-  using LayoutB = layout::RowMajor;
-
-  /// Element type of operand C
-  using ElementC = int32_t;
-
-  /// Layout of C matrix (concept: layout::MapFunc)
-  using LayoutC = LayoutC_;
-
-  /// Underlying mathematical operator
-  using Operator = arch::OpMultiplyAdd;
-
-  /// A operand storage
-  using FragmentA = Array<ElementA, Shape::kMK>;
-
-  /// B operand storage
-  using FragmentB = Array<ElementB, Shape::kKN>;
-
-  /// C operand storage
-  using FragmentC = Array<ElementC, Shape::kMN>;
-
-  /// Underlying matrix multiply operator (concept: arch::Mma)
-  /// Use 1x1x4 IDP4A sequence for bulk of computation
-  using ArchMmaOperator = arch::Mma<
-      gemm::GemmShape<1,1,4>,
-      1,
-      ElementA,
-      LayoutA,
-      ElementB,
-      LayoutB,
-      ElementC,
-      LayoutC,
-      arch::OpMultiplyAdd>; 
-
-  //
-  // Methods
-  //
-
-  /// Computes a matrix product D = A * B + C
-  CUTLASS_HOST_DEVICE
-  void operator()(
-    FragmentC & D,
-    FragmentA const & A,
-    FragmentB const & B,
-    FragmentC const & C) {
-
-    TensorRef<ElementC, LayoutC> d(
-      reinterpret_cast<ElementC *>(&D), LayoutC::packed({ Shape::kM, Shape::kN }));
-    
-    // Copy accumulators
-    D = C;
-
-    /// Underlying matrix multiply operator
-    ArchMmaOperator mma;
-    
-    Array<int8_t, 4> const *ptr_A = reinterpret_cast<Array<int8_t, 4> const *>(&A);
-    Array<int8_t, 4> const *ptr_B = reinterpret_cast<Array<int8_t, 4> const *>(&B);
-
-    // Compute matrix product
-    CUTLASS_PRAGMA_UNROLL
-    for (int k = 0; k < Shape::kK / ArchMmaOperator::Shape::kK; ++k) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < Shape::kN; ++n) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int m = 0; m < Shape::kM; ++m) {
-          MatrixCoord mn(m, n);
-
-          Array<int32_t, 1> tmp = reinterpret_cast<Array<int32_t, 1> &>(d.at(mn));
-
-          mma(
-            tmp,
-            ptr_A[m + k * Shape::kM],
-            ptr_B[n + k * Shape::kN],
-            tmp);
-
-          d.at(mn) = reinterpret_cast<int32_t &>(tmp);
-        }
-      }
-    }
-  }
-};
-
-} // namespace thread
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_ell_mma.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_ell_mma.h
deleted file mode 100755
index fba281264..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_ell_mma.h
+++ /dev/null
@@ -1,734 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Default template for a Blocked-Ell MMA.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/arch/wmma.h"
-
-#include "cutlass/layout/matrix.h"
-#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
-#include "cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
-
-#if defined(CUTLASS_ARCH_WMMA_ENABLED)
-#include "cutlass/gemm/threadblock/default_mma_core_wmma.h"
-#endif //CUTLASS_ARCH_WMMA_ENABLED
-
-#include "cutlass/gemm/threadblock/ell_mma_pipelined.h"
-#include "cutlass/gemm/threadblock/ell_mma_multistage.h"
-#include "cutlass/transform/threadblock/ell_predicated_tile_iterator.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Operator class tag
-    typename OperatorClass_,
-    /// Tag indicating architecture to tune for
-    typename ArchTag_,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Operation perfomed by GEMM
-    typename Operator,
-    /// Store the accumulators in row major or column major.  Row major is used
-    /// when output layout is interleaved.
-    bool AccumulatorsInRowMajor = false
-    >
-struct DefaultEllMma;
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization for row-major output (OperatorClass Simt)
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Operation performed by GEMM
-    typename Operator>
-struct DefaultEllMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
-                  kAlignmentB, ElementAccumulator, layout::RowMajor,
-                  arch::OpClassSimt, ArchTag, ThreadblockShape, WarpShape,
-                  InstructionShape, 2, Operator, false> {
-  // Define the MmaCore components
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
-      ElementB, LayoutB, ElementAccumulator, layout::RowMajor,
-      arch::OpClassSimt, 2, Operator>;
-
-  // Define iterators over tiles from the A operand
-  using IteratorA =
-      cutlass::transform::threadblock::EllPredicatedTileIterator<
-          cutlass::MatrixShape<MmaCore::Shape::kM, MmaCore::Shape::kK>,
-          ElementA, LayoutA, 1, typename MmaCore::IteratorThreadMapA, kAlignmentA>;
-
-  // Define iterators over tiles from the B operand
-  using IteratorB =
-      cutlass::transform::threadblock::EllPredicatedTileIterator<
-          cutlass::MatrixShape<MmaCore::Shape::kK, MmaCore::Shape::kN>,
-          ElementB, LayoutB, 0, typename MmaCore::IteratorThreadMapB, kAlignmentB>;
-
-  // Define the threadblock-scoped pipelined matrix multiply
-  using ThreadblockMma = cutlass::gemm::threadblock::EllMmaPipelined<
-      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
-      IteratorB, typename MmaCore::SmemIteratorB, ElementAccumulator,
-      layout::RowMajor, typename MmaCore::MmaPolicy>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization for row-major output (OperatorClass TensorOp)
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Operation performed by GEMM
-    typename Operator
-    >
-struct DefaultEllMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
-                  kAlignmentB, ElementAccumulator, layout::RowMajor,
-                  arch::OpClassTensorOp, ArchTag, ThreadblockShape, WarpShape,
-                  InstructionShape, 2, Operator, false> {
-  // Define the MmaCore components
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
-      ElementB, LayoutB, ElementAccumulator, layout::RowMajor,
-      arch::OpClassTensorOp, 2, Operator>;
-
-  // Define iterators over tiles from the A operand
-  using IteratorA =
-      cutlass::transform::threadblock::EllPredicatedTileIterator<
-          cutlass::MatrixShape<MmaCore::Shape::kM, MmaCore::Shape::kK>,
-          ElementA, LayoutA, 1, typename MmaCore::IteratorThreadMapA, kAlignmentA>;
-
-  // Define iterators over tiles from the B operand
-  using IteratorB =
-      cutlass::transform::threadblock::EllPredicatedTileIterator<
-          cutlass::MatrixShape<MmaCore::Shape::kK, MmaCore::Shape::kN>,
-          ElementB, LayoutB, 0, typename MmaCore::IteratorThreadMapB, kAlignmentB>;
-
-  // Define the threadblock-scoped pipelined matrix multiply
-  using ThreadblockMma = cutlass::gemm::threadblock::EllMmaPipelined<
-      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
-      IteratorB, typename MmaCore::SmemIteratorB, ElementAccumulator,
-      layout::RowMajor, typename MmaCore::MmaPolicy>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-/// Specialization for row-major output (OperatorClass TensorOp)
-template <
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Operation performed by GEMM
-    typename Operator
-    >
-struct DefaultEllMma<float, LayoutA, kAlignmentA, float, LayoutB,
-                  kAlignmentB, float, layout::RowMajor,
-                  arch::OpClassTensorOp, ArchTag, ThreadblockShape, WarpShape,
-                  InstructionShape, 2, Operator, false> {
-  // Define the MmaCore components
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, float, LayoutA, float,
-      LayoutB, float, layout::RowMajor, arch::OpClassTensorOp, 2,
-      arch::OpMultiplyAddFastF16>;
-
-  // Define iterators over tiles from the A operand
-  using IteratorA =
-      cutlass::transform::threadblock::EllPredicatedTileIterator<
-          cutlass::MatrixShape<MmaCore::Shape::kM, MmaCore::Shape::kK>,
-          float, LayoutA, 1, typename MmaCore::IteratorThreadMapA, kAlignmentA>;
-
-  // Define iterators over tiles from the B operand
-  using IteratorB =
-      cutlass::transform::threadblock::EllPredicatedTileIterator<
-          cutlass::MatrixShape<MmaCore::Shape::kK, MmaCore::Shape::kN>,
-          float, LayoutB, 0, typename MmaCore::IteratorThreadMapB, kAlignmentB>;
-
-  // Define the threadblock-scoped pipelined matrix multiply
-  using ThreadblockMma = cutlass::gemm::threadblock::EllMmaPipelined<
-      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
-      IteratorB, typename MmaCore::SmemIteratorB, float,
-      layout::RowMajor, typename MmaCore::MmaPolicy>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization for column-major-interleaved output
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Tag indicating architecture to tune for
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Number of Interleaved K
-    int InterleavedK>
-struct DefaultEllMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
-                  kAlignmentB, ElementAccumulator,
-                  layout::ColumnMajorInterleaved<InterleavedK>, OperatorClass,
-                  ArchTag, ThreadblockShape, WarpShape, InstructionShape, 2,
-                  Operator, true> {
-  // Define the MmaCore components
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
-      ElementB, LayoutB, ElementAccumulator,
-      layout::ColumnMajorInterleaved<InterleavedK>, OperatorClass, 2, Operator,
-      true>;
-
-  static_assert(kAlignmentA == 128 / sizeof_bits<ElementA>::value, 
-    "Alignment must match thread data map's vector length");
-
-  static_assert(kAlignmentB ==128 / sizeof_bits<ElementB>::value,
-    "Alignment must match thread data map's vector length");
-
-  // Define iterators over tiles from the A operand
-  using IteratorA = cutlass::transform::threadblock::EllPredicatedTileIterator<
-      cutlass::MatrixShape<MmaCore::Shape::kM, MmaCore::Shape::kK>, ElementA,
-      LayoutA, 1, typename MmaCore::IteratorThreadMapA>;
-
-  // Define iterators over tiles from the B operand
-  using IteratorB = cutlass::transform::threadblock::EllPredicatedTileIterator<
-      cutlass::MatrixShape<MmaCore::Shape::kK, MmaCore::Shape::kN>, ElementB,
-      LayoutB, 0, typename MmaCore::IteratorThreadMapB>;
-
-  // Define the threadblock-scoped pipelined matrix multiply
-  using ThreadblockMma = cutlass::gemm::threadblock::EllMmaPipelined<
-      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
-      IteratorB, typename MmaCore::SmemIteratorB, ElementAccumulator,
-      layout::ColumnMajorInterleaved<InterleavedK>,
-      typename MmaCore::MmaPolicy>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization for row-major output
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Number of stages used in the multistage mainloop
-    int Stages,
-    /// Operation perfomed by GEMM
-    typename Operator
-    >
-struct DefaultEllMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
-                  kAlignmentB, ElementAccumulator, layout::RowMajor,
-                  arch::OpClassSimt, ArchTag, ThreadblockShape, WarpShape,
-                  InstructionShape, Stages, Operator, false> {
-  // Define the MmaCore components
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
-      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-      Stages, Operator>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::Array<ElementA, kAlignmentA>;
-  using IteratorA =
-      cutlass::transform::threadblock::EllPredicatedTileAccessIterator<
-          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-          ElementA, LayoutA, 1, ThreadMapA, AccessTypeA>;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::Array<ElementB, kAlignmentB>;
-  using IteratorB =
-      cutlass::transform::threadblock::EllPredicatedTileAccessIterator<
-          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-          ElementB, LayoutB, 0, ThreadMapB, AccessTypeB>;
-
-  // Define the threadblock-scoped multistage matrix multiply
-  using ThreadblockMma = cutlass::gemm::threadblock::EllMmaMultistage<
-      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
-      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
-      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
-      typename MmaCore::MmaPolicy, Stages>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization for row-major output (OperatorClass TensorOp)
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Number of stages used in the multistage mainloop
-    int Stages,
-    /// Operation perfomed by GEMM
-    typename Operator
-    >
-struct DefaultEllMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
-                  kAlignmentB, ElementAccumulator, layout::RowMajor,
-                  arch::OpClassTensorOp, ArchTag, ThreadblockShape, WarpShape,
-                  InstructionShape, Stages, Operator, false> {
-  static cutlass::arch::CacheOperation::Kind const CacheOpA =
-      ((sizeof_bits<ElementA>::value * kAlignmentA) == 128)
-          ? cutlass::arch::CacheOperation::Global
-          : cutlass::arch::CacheOperation::Always;
-
-  static cutlass::arch::CacheOperation::Kind const CacheOpB =
-      ((sizeof_bits<ElementB>::value * kAlignmentB) == 128)
-          ? cutlass::arch::CacheOperation::Global
-          : cutlass::arch::CacheOperation::Always;
-
-  // Define the MmaCore components
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
-      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      Stages, Operator, false, CacheOpA, CacheOpB>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::Array<ElementA, kAlignmentA>;
-  using IteratorA =
-      cutlass::transform::threadblock::EllPredicatedTileAccessIterator<
-          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-          ElementA, LayoutA, 1, ThreadMapA, AccessTypeA>;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::Array<ElementB, kAlignmentB>;
-  using IteratorB =
-      cutlass::transform::threadblock::EllPredicatedTileAccessIterator<
-          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-          ElementB, LayoutB, 0, ThreadMapB, AccessTypeB>;
-
-  // Define the threadblock-scoped multistage matrix multiply
-  using ThreadblockMma = cutlass::gemm::threadblock::EllMmaMultistage<
-      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
-      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
-      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
-      typename MmaCore::MmaPolicy, Stages>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization for column-major-interleaved output
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Tag indicating architecture to tune for
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Number of stages used in the multistage mainloop
-    int Stages,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Number of Interleaved K
-    int InterleavedK>
-struct DefaultEllMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
-                  kAlignmentB, ElementAccumulator,
-                  layout::ColumnMajorInterleaved<InterleavedK>, OperatorClass,
-                  ArchTag, ThreadblockShape, WarpShape, InstructionShape,
-                  Stages, Operator, true> {
-  // Define the MmaCore components
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
-      ElementB, LayoutB, ElementAccumulator,
-      layout::ColumnMajorInterleaved<InterleavedK>, OperatorClass, Stages,
-      Operator, true>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::Array<ElementA, kAlignmentA>;
-  using IteratorA =
-      cutlass::transform::threadblock::EllPredicatedTileAccessIterator<
-          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-          ElementA, LayoutA, 1, ThreadMapA, AccessTypeA>;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::Array<ElementB, kAlignmentB>;
-  using IteratorB =
-      cutlass::transform::threadblock::EllPredicatedTileAccessIterator<
-          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-          ElementB, LayoutB, 0, ThreadMapB, AccessTypeB>;
-
-  // Define the threadblock-scoped multistage matrix multiply
-  using ThreadblockMma = cutlass::gemm::threadblock::EllMmaMultistage<
-      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
-      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
-      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
-      typename MmaCore::MmaPolicy, Stages>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization for SIMT IDP4A Kernels
-template <
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape>
-struct DefaultEllMma<int8_t, LayoutA, kAlignmentA, int8_t, LayoutB, kAlignmentB,
-                  ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-                  ArchTag, ThreadblockShape, WarpShape, GemmShape<1, 1, 4>, 2,
-                  Operator, false> {
-  using InstructionShape = GemmShape<1, 1, 4>;
-  using ElementA = int8_t;
-  using ElementB = int8_t;
-  using OperatorClass =  arch::OpClassSimt;
-
-  static const bool transposeA =  cutlass::platform::is_same< LayoutA, layout::ColumnMajor >::value;
-  static const bool transposeB =  cutlass::platform::is_same< LayoutB, layout::RowMajor >::value;
-
-  // Define the MmaCore components
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
-      ElementB, LayoutB, ElementAccumulator, layout::RowMajor,
-      OperatorClass, 2, Operator>;
-
-  // Define iterators over tiles from the A operand
-  using IteratorA =
-      cutlass::transform::threadblock::PredicatedTileIterator2dThreadTile<
-          cutlass::MatrixShape<MmaCore::Shape::kM, MmaCore::Shape::kK>,
-          ElementA, LayoutA, 1, typename MmaCore::IteratorThreadMapA, transposeA>;
-
-  // Define iterators over tiles from the B operand
-  using IteratorB =
-      cutlass::transform::threadblock::PredicatedTileIterator2dThreadTile<
-          cutlass::MatrixShape<MmaCore::Shape::kK, MmaCore::Shape::kN>,
-          ElementB, LayoutB, 0, typename MmaCore::IteratorThreadMapB, transposeB>;
-
-  // Define the threadblock-scoped pipelined matrix multiply
-  using ThreadblockMma = cutlass::gemm::threadblock::EllMmaPipelined<
-      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
-      IteratorB, typename MmaCore::SmemIteratorB, ElementAccumulator,
-      layout::RowMajor, typename MmaCore::MmaPolicy>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-#if defined(CUTLASS_ARCH_WMMA_ENABLED)
-/// Specialization for Wmma TensorOp operator with 2 staged pipeline
-template <
-    ///< Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Layout type for C and D matrix operands
-    typename LayoutC,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Operation performed by GEMM
-    typename Operator>
-struct DefaultEllMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
-                  kAlignmentB, ElementAccumulator, LayoutC,
-                  arch::OpClassWmmaTensorOp, ArchTag, ThreadblockShape, WarpShape,
-                  InstructionShape, 2, Operator, false> {
-  // Define the MmaCore components
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
-      ElementB, LayoutB, ElementAccumulator, LayoutC,
-      arch::OpClassWmmaTensorOp, 2, Operator>;
-
-  // Define iterators over tiles from the A operand
-  using IteratorA =
-      cutlass::transform::threadblock::EllPredicatedTileIterator<
-          cutlass::MatrixShape<MmaCore::Shape::kM, MmaCore::Shape::kK>,
-          ElementA, LayoutA, 1, typename MmaCore::IteratorThreadMapA, kAlignmentA>;
-
-  // Define iterators over tiles from the B operand
-  using IteratorB =
-      cutlass::transform::threadblock::EllPredicatedTileIterator<
-          cutlass::MatrixShape<MmaCore::Shape::kK, MmaCore::Shape::kN>,
-          ElementB, LayoutB, 0, typename MmaCore::IteratorThreadMapB, kAlignmentB>;
-
-  // Define the threadblock-scoped pipelined matrix multiply
-  using ThreadblockMma = cutlass::gemm::threadblock::EllMmaPipelined<
-      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
-      IteratorB, typename MmaCore::SmemIteratorB, ElementAccumulator,
-      LayoutC, typename MmaCore::MmaPolicy>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization for Wmma TensorOp operator with 1 staged pipeline
-template <
-    ///< Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Layout type for C and D matrix operands
-    typename LayoutC,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Operation performed by GEMM
-    typename Operator>
-struct DefaultEllMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
-                  kAlignmentB, ElementAccumulator, LayoutC,
-                  arch::OpClassWmmaTensorOp, ArchTag, ThreadblockShape, WarpShape,
-                  InstructionShape, 1, Operator, false> {
-  // Define the MmaCore components
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
-      ElementB, LayoutB, ElementAccumulator, LayoutC,
-      arch::OpClassWmmaTensorOp, 1, Operator>; 
-
-  // Define iterators over tiles from the A operand
-  using IteratorA =
-      cutlass::transform::threadblock::EllPredicatedTileIterator<
-          cutlass::MatrixShape<MmaCore::Shape::kM, MmaCore::Shape::kK>,
-          ElementA, LayoutA, 1, typename MmaCore::IteratorThreadMapA, kAlignmentA>;
-
-  // Define iterators over tiles from the B operand
-  using IteratorB =
-      cutlass::transform::threadblock::EllPredicatedTileIterator<
-          cutlass::MatrixShape<MmaCore::Shape::kK, MmaCore::Shape::kN>,
-          ElementB, LayoutB, 0, typename MmaCore::IteratorThreadMapB, kAlignmentB>;
-
-  // Define the threadblock-scoped singlestage matrix multiply
-  using ThreadblockMma = cutlass::gemm::threadblock::MmaSingleStage<
-      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
-      IteratorB, typename MmaCore::SmemIteratorB, ElementAccumulator,
-      LayoutC, typename MmaCore::MmaPolicy>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-#endif //CUTLASS_ARCH_WMMA_ENABLED
-
-} // namespace threadblock
-} // namespace gemm
-} // namespace cutlass 
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_gemv_core.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_gemv_core.h
deleted file mode 100755
index 404e18919..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_gemv_core.h
+++ /dev/null
@@ -1,151 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines basic properties needed by CTA-level batched GEMV assuming expectations about data
-      layout of the global memory fragments, data types, and internal tile sizes.
-
-      Partial specializations for threadblock::Mma operations targeting SIMT instructions.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-
-#include "cutlass/layout/matrix.h"
-
-#include "cutlass/platform/platform.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/thread/mma.h"
-
-#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
-#include "cutlass/transform/pitch_linear_thread_map.h"
-
-#include "cutlass/gemm/threadblock/gemv.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-/// Template defininng default vector-matrix multiply operators inferred from threadblock tile size,
-/// global memory data layout.
-template <
-  typename Shape_,            /// Shape of the threadblock vector-matrix multiply operator
-  typename ThreadShape_,      /// Shape of per-thread vector-matrix multiply operator
-  typename ElementA_,         /// Element data type of A operand
-  typename LayoutA_,          /// Layout of operand A
-  typename ElementB_,         /// Element data type of B operand
-  typename LayoutB_,          /// Layout of operand B
-  typename ElementC_,         /// Data type of accumulator
-  typename LayoutC_           /// Layout of accumulator
->
-struct DefaultGemvCore {
-
-  using Shape = Shape_;
-  using ThreadShape = ThreadShape_;
-
-  using LayoutA = LayoutA_;
-  using LayoutB = LayoutB_;
-  using LayoutC = LayoutC_;
-  
-  using ElementA = ElementA_;
-  using ElementB = ElementB_;
-  using ElementC = ElementC_;
-
-  static int const kThreadsPerN = Shape::kN / ThreadShape::kN;
-
-  using IteratorPolicyA = typename platform::conditional<
-                            platform::is_same<LayoutA, layout::RowMajor>::value,
-                            cutlass::transform::PitchLinearTilePolicyStripminedThreadContiguous<
-                              layout::PitchLinearShape<Shape::kK, Shape::kM>, 1, ThreadShape::kK>,
-                            cutlass::transform::PitchLinearTilePolicyStripminedThreadStrided<
-                              layout::PitchLinearShape<Shape::kM, Shape::kK>, 1, ThreadShape::kM>>::type;
-
-  using IteratorA = cutlass::transform::threadblock::PredicatedTileIterator<
-                          cutlass::MatrixShape<Shape::kM, Shape::kK>, ElementA, LayoutA, 1, IteratorPolicyA>;
-
-  using IteratorPolicyB = typename platform::conditional<
-                            platform::is_same<LayoutB, layout::RowMajor>::value,
-                            cutlass::transform::PitchLinearTilePolicyStripminedThreadContiguous<
-                              layout::PitchLinearShape<Shape::kN, Shape::kK>, kThreadsPerN, ThreadShape::kN>,
-                            cutlass::transform::PitchLinearTilePolicyStripminedThreadStrided<
-                              layout::PitchLinearShape<Shape::kK, Shape::kN>, kThreadsPerN, ThreadShape::kK>>::type;
-
-  using IteratorB = cutlass::transform::threadblock::PredicatedTileIterator<
-                            cutlass::MatrixShape<Shape::kK, Shape::kN>, ElementB, LayoutB, 0, IteratorPolicyB>;
-
-  using IteratorPolicyC = typename platform::conditional<
-                            platform::is_same<LayoutC, layout::RowMajor>::value,
-                            cutlass::transform::PitchLinearTilePolicyStripminedThreadContiguous<
-                              layout::PitchLinearShape<Shape::kN, Shape::kM>, kThreadsPerN, ThreadShape::kN>,
-                            cutlass::transform::PitchLinearTilePolicyStripminedThreadStrided<
-                              layout::PitchLinearShape<Shape::kM, Shape::kN>, kThreadsPerN, ThreadShape::kM>>::type;
-
-  using IteratorC = cutlass::transform::threadblock::PredicatedTileIterator<
-                             cutlass::MatrixShape<Shape::kM, Shape::kN>, ElementC, LayoutC, 0, IteratorPolicyC>;
-
-  using MmaSimtOp = typename cutlass::gemm::thread::Mma<
-    cutlass::gemm::GemmShape<ThreadShape::kM, ThreadShape::kN, Shape::kK>,
-    ElementA,
-    LayoutA,
-    ElementB,
-    LayoutB,
-    ElementC,
-    LayoutC>;
-
-  using Operator = MmaSimtOp;
-
-  // Assertions for correctness
-  static_assert((Shape::kM == 1), "M=1 is required for GEMV");
-  
-  static_assert((ThreadShape::kM == 1), "M=1 is required for GEMV");
-
-  static_assert(Shape::kK % ThreadShape::kK == 0, "Shape::K must be a multiple of ThreadShape::K");
-
-  static_assert(((ThreadShape::kK == 1) ||
-                (ThreadShape::kK == 2) || 
-                (ThreadShape::kK == 4) ||
-                (ThreadShape::kK == 8) ||
-                (ThreadShape::kK == 16) ||
-                (ThreadShape::kK == 32)
-               ),
-              "ThreadShape::K must be a 1, 2, 4, 8, 16 or 32");
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace gemm
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma.h
deleted file mode 100755
index 8885d1ffc..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma.h
+++ /dev/null
@@ -1,823 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/arch/wmma.h"
-
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/permute.h"
-#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
-#include "cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
-
-#if defined(CUTLASS_ARCH_WMMA_ENABLED)
-#include "cutlass/gemm/threadblock/default_mma_core_wmma.h"
-#endif //CUTLASS_ARCH_WMMA_ENABLED
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Operator class tag
-    typename OperatorClass_,
-    /// Tag indicating architecture to tune for
-    typename ArchTag_,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Operation perfomed by GEMM
-    typename Operator,
-    /// Store the accumulators in row major or column major.  Row major is used
-    /// when output layout is interleaved.
-    bool AccumulatorsInRowMajor = false,
-    /// Use zfill or predicate for out-of-bound cp.async
-    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone,
-    /// Gather operand A by using an index array
-    bool GatherA = false,
-    /// Gather operand B by using an index array
-    bool GatherB = false,
-    /// Permute operand A
-    typename PermuteALayout = layout::NoPermute,
-    /// Permute operand B
-    typename PermuteBLayout = layout::NoPermute
-    >
-struct DefaultMma;
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization for row-major output (OperatorClass Simt)
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Layout type for C and D matrix operand
-    typename LayoutC,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Gather operand A by using an index array
-    bool GatherA,
-    /// Gather operand B by using an index array
-    bool GatherB,
-    /// Permute operand A
-    typename PermuteALayout,
-    /// Permute operand B
-    typename PermuteBLayout
-    >
-struct DefaultMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
-                  kAlignmentB, ElementAccumulator, LayoutC,
-                  arch::OpClassSimt, ArchTag, ThreadblockShape, WarpShape,
-                  InstructionShape, 2, Operator, false, SharedMemoryClearOption::kNone,
-                  GatherA, GatherB, PermuteALayout, PermuteBLayout> {
-
-  static_assert(platform::is_same<LayoutC, layout::RowMajor>::value
-             || platform::is_same<LayoutC, layout::AffineRankN<2>>::value,
-             "simt epilogue must be row major");
-
-  // Define the MmaCore components
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
-      ElementB, LayoutB, ElementAccumulator, LayoutC,
-      arch::OpClassSimt, 2, Operator>;
-
-  // Define iterators over tiles from the A operand
-  using IteratorA =
-      cutlass::transform::threadblock::PredicatedTileIterator<
-          cutlass::MatrixShape<MmaCore::Shape::kM, MmaCore::Shape::kK>,
-          ElementA, LayoutA, 1, typename MmaCore::IteratorThreadMapA, kAlignmentA,
-          GatherA, PermuteALayout>;
-
-  // Define iterators over tiles from the B operand
-  using IteratorB =
-      cutlass::transform::threadblock::PredicatedTileIterator<
-          cutlass::MatrixShape<MmaCore::Shape::kK, MmaCore::Shape::kN>,
-          ElementB, LayoutB, 0, typename MmaCore::IteratorThreadMapB, kAlignmentB,
-          GatherB, PermuteBLayout>;
-
-  // Define the threadblock-scoped pipelined matrix multiply
-  using ThreadblockMma = cutlass::gemm::threadblock::MmaPipelined<
-      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
-      IteratorB, typename MmaCore::SmemIteratorB, ElementAccumulator,
-      LayoutC, typename MmaCore::MmaPolicy>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization for row-major output (OperatorClass TensorOp)
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Use zfill or predicate for out-of-bound cp.async
-    SharedMemoryClearOption SharedMemoryClear,
-    /// Gather operand A by using an index array
-    bool GatherA,
-    /// Gather operand B by using an index array
-    bool GatherB,
-    /// Permute operand A
-    typename PermuteALayout,
-    /// Permute operand B
-    typename PermuteBLayout
-    >
-struct DefaultMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
-                  kAlignmentB, ElementAccumulator, layout::RowMajor,
-                  arch::OpClassTensorOp, ArchTag, ThreadblockShape, WarpShape,
-                  InstructionShape, 2, Operator, false, SharedMemoryClear,
-                  GatherA, GatherB, PermuteALayout, PermuteBLayout> {
-  // Define the MmaCore components
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
-      ElementB, LayoutB, ElementAccumulator, layout::RowMajor,
-      arch::OpClassTensorOp, 2, Operator>;
-
-  // Define iterators over tiles from the A operand
-  using IteratorA =
-      cutlass::transform::threadblock::PredicatedTileIterator<
-          cutlass::MatrixShape<MmaCore::Shape::kM, MmaCore::Shape::kK>,
-          ElementA, LayoutA, 1, typename MmaCore::IteratorThreadMapA, kAlignmentA,
-          GatherA, PermuteALayout>;
-
-  // Define iterators over tiles from the B operand
-  using IteratorB =
-      cutlass::transform::threadblock::PredicatedTileIterator<
-          cutlass::MatrixShape<MmaCore::Shape::kK, MmaCore::Shape::kN>,
-          ElementB, LayoutB, 0, typename MmaCore::IteratorThreadMapB, kAlignmentB,
-          GatherB, PermuteBLayout>;
-
-  // Define the threadblock-scoped pipelined matrix multiply
-  using ThreadblockMma = cutlass::gemm::threadblock::MmaPipelined<
-      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
-      IteratorB, typename MmaCore::SmemIteratorB, ElementAccumulator,
-      layout::RowMajor, typename MmaCore::MmaPolicy>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-/// Specialization for row-major output (OperatorClass TensorOp)
-template <
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Gather operand A by using an index array
-    bool GatherA,
-    /// Gather operand B by using an index array
-    bool GatherB,
-    /// Permute operand A
-    typename PermuteALayout,
-    /// Permute operand B
-    typename PermuteBLayout
-    >
-struct DefaultMma<float, LayoutA, kAlignmentA, float, LayoutB,
-                  kAlignmentB, float, layout::RowMajor,
-                  arch::OpClassTensorOp, ArchTag, ThreadblockShape, WarpShape,
-                  InstructionShape, 2, Operator, false, SharedMemoryClearOption::kNone,
-                  GatherA, GatherB, PermuteALayout, PermuteBLayout> {
-  // Define the MmaCore components
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, float, LayoutA, float,
-      LayoutB, float, layout::RowMajor, arch::OpClassTensorOp, 2,
-      arch::OpMultiplyAddFastF16>;
-
-  // Define iterators over tiles from the A operand
-  using IteratorA =
-      cutlass::transform::threadblock::PredicatedTileIterator<
-          cutlass::MatrixShape<MmaCore::Shape::kM, MmaCore::Shape::kK>,
-          float, LayoutA, 1, typename MmaCore::IteratorThreadMapA, kAlignmentA,
-          GatherA, PermuteALayout>;
-
-  // Define iterators over tiles from the B operand
-  using IteratorB =
-      cutlass::transform::threadblock::PredicatedTileIterator<
-          cutlass::MatrixShape<MmaCore::Shape::kK, MmaCore::Shape::kN>,
-          float, LayoutB, 0, typename MmaCore::IteratorThreadMapB, kAlignmentB,
-          GatherB, PermuteBLayout>;
-
-  // Define the threadblock-scoped pipelined matrix multiply
-  using ThreadblockMma = cutlass::gemm::threadblock::MmaPipelined<
-      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
-      IteratorB, typename MmaCore::SmemIteratorB, float,
-      layout::RowMajor, typename MmaCore::MmaPolicy>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization for column-major-interleaved output
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Tag indicating architecture to tune for
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Number of Interleaved K
-    int InterleavedK>
-struct DefaultMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
-                  kAlignmentB, ElementAccumulator,
-                  layout::ColumnMajorInterleaved<InterleavedK>, OperatorClass,
-                  ArchTag, ThreadblockShape, WarpShape, InstructionShape, 2,
-                  Operator, true, SharedMemoryClearOption::kNone, false, false,
-                  layout::NoPermute, layout::NoPermute> {
-  // Define the MmaCore components
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
-      ElementB, LayoutB, ElementAccumulator,
-      layout::ColumnMajorInterleaved<InterleavedK>, OperatorClass, 2, Operator,
-      true>;
-
-  static_assert(kAlignmentA == 128 / sizeof_bits<ElementA>::value, 
-    "Alignment must match thread data map's vector length");
-
-  static_assert(kAlignmentB ==128 / sizeof_bits<ElementB>::value,
-    "Alignment must match thread data map's vector length");
-
-  // Define iterators over tiles from the A operand
-  using IteratorA = cutlass::transform::threadblock::PredicatedTileIterator<
-      cutlass::MatrixShape<MmaCore::Shape::kM, MmaCore::Shape::kK>, ElementA,
-      LayoutA, 1, typename MmaCore::IteratorThreadMapA>;
-
-  // Define iterators over tiles from the B operand
-  using IteratorB = cutlass::transform::threadblock::PredicatedTileIterator<
-      cutlass::MatrixShape<MmaCore::Shape::kK, MmaCore::Shape::kN>, ElementB,
-      LayoutB, 0, typename MmaCore::IteratorThreadMapB>;
-
-  // Define the threadblock-scoped pipelined matrix multiply
-  using ThreadblockMma = cutlass::gemm::threadblock::MmaPipelined<
-      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
-      IteratorB, typename MmaCore::SmemIteratorB, ElementAccumulator,
-      layout::ColumnMajorInterleaved<InterleavedK>,
-      typename MmaCore::MmaPolicy>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization for row-major output
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Layout type for C and D matrix operand
-    typename LayoutC,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Number of stages used in the multistage mainloop
-    int Stages,
-    /// Operation perfomed by GEMM
-    typename Operator,
-    /// Gather operand A by using an index array
-    bool GatherA,
-    /// Gather operand B by using an index array
-    bool GatherB,
-    /// Permute operand A
-    typename PermuteALayout,
-    /// Permute operand B
-    typename PermuteBLayout
-    >
-struct DefaultMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
-                  kAlignmentB, ElementAccumulator, LayoutC,
-                  arch::OpClassSimt, ArchTag, ThreadblockShape, WarpShape,
-                  InstructionShape, Stages, Operator, false, SharedMemoryClearOption::kNone,
-                  GatherA, GatherB, PermuteALayout, PermuteBLayout> {
-
-  static_assert(platform::is_same<LayoutC, layout::RowMajor>::value
-             || platform::is_same<LayoutC, layout::AffineRankN<2>>::value,
-             "simt epilogue must be row major");
-
-  // Define the MmaCore components
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
-      ElementB, LayoutB, ElementAccumulator, LayoutC, arch::OpClassSimt,
-      Stages, Operator>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::Array<ElementA, kAlignmentA>;
-  using IteratorA =
-      cutlass::transform::threadblock::PredicatedTileAccessIterator<
-          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-          ElementA, LayoutA, 1, ThreadMapA, AccessTypeA, GatherA, PermuteALayout>;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::Array<ElementB, kAlignmentB>;
-  using IteratorB =
-      cutlass::transform::threadblock::PredicatedTileAccessIterator<
-          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-          ElementB, LayoutB, 0, ThreadMapB, AccessTypeB, GatherB, PermuteBLayout>;
-
-  // Define the threadblock-scoped multistage matrix multiply
-  using ThreadblockMma = cutlass::gemm::threadblock::MmaMultistage<
-      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
-      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
-      MmaCore::kCacheOpB, ElementAccumulator, LayoutC,
-      typename MmaCore::MmaPolicy, Stages>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization for row-major output (OperatorClass TensorOp)
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Layout type for C and D matrix operand
-    typename LayoutC,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Number of stages used in the multistage mainloop
-    int Stages,
-    /// Operation perfomed by GEMM
-    typename Operator,
-    /// Use zfill or predicate for out-of-bound cp.async
-    SharedMemoryClearOption SharedMemoryClear,
-    /// Gather operand A by using an index array
-    bool GatherA,
-    /// Gather operand B by using an index array
-    bool GatherB,
-    /// Permute operand A
-    typename PermuteALayout,
-    /// Permute operand B
-    typename PermuteBLayout
-    >
-struct DefaultMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
-                  kAlignmentB, ElementAccumulator, LayoutC,
-                  arch::OpClassTensorOp, ArchTag, ThreadblockShape, WarpShape,
-                  InstructionShape, Stages, Operator, false, SharedMemoryClear,
-                  GatherA, GatherB, PermuteALayout, PermuteBLayout> {
-
-  static_assert(platform::is_same<LayoutC, layout::RowMajor>::value
-             || platform::is_same<LayoutC, layout::AffineRankN<2>>::value,
-             "simt epilogue must be row major");
-
-  static cutlass::arch::CacheOperation::Kind const CacheOpA =
-      ((sizeof_bits<ElementA>::value * kAlignmentA) == 128)
-          ? cutlass::arch::CacheOperation::Global
-          : cutlass::arch::CacheOperation::Always;
-
-  static cutlass::arch::CacheOperation::Kind const CacheOpB =
-      ((sizeof_bits<ElementB>::value * kAlignmentB) == 128)
-          ? cutlass::arch::CacheOperation::Global
-          : cutlass::arch::CacheOperation::Always;
-
-  // Define the MmaCore components
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
-      ElementB, LayoutB, ElementAccumulator, LayoutC, arch::OpClassTensorOp,
-      Stages, Operator, false, CacheOpA, CacheOpB>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::Array<ElementA, kAlignmentA>;
-  using IteratorA =
-      cutlass::transform::threadblock::PredicatedTileAccessIterator<
-          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-          ElementA, LayoutA, 1, ThreadMapA, AccessTypeA, GatherA, PermuteALayout>;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::Array<ElementB, kAlignmentB>;
-  using IteratorB =
-      cutlass::transform::threadblock::PredicatedTileAccessIterator<
-          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-          ElementB, LayoutB, 0, ThreadMapB, AccessTypeB, GatherB, PermuteBLayout>;
-
-  // Define the threadblock-scoped multistage matrix multiply
-  using ThreadblockMma = cutlass::gemm::threadblock::MmaMultistage<
-      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
-      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
-      MmaCore::kCacheOpB, ElementAccumulator, LayoutC,
-      typename MmaCore::MmaPolicy, Stages, SharedMemoryClear>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization for column-major-interleaved output
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Tag indicating architecture to tune for
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Number of stages used in the multistage mainloop
-    int Stages,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Number of Interleaved K
-    int InterleavedK>
-struct DefaultMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
-                  kAlignmentB, ElementAccumulator,
-                  layout::ColumnMajorInterleaved<InterleavedK>, OperatorClass,
-                  ArchTag, ThreadblockShape, WarpShape, InstructionShape,
-                  Stages, Operator, true, SharedMemoryClearOption::kNone, 
-                  false, false, layout::NoPermute, layout::NoPermute> {
-  // Define the MmaCore components
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
-      ElementB, LayoutB, ElementAccumulator,
-      layout::ColumnMajorInterleaved<InterleavedK>, OperatorClass, Stages,
-      Operator, true>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::Array<ElementA, kAlignmentA>;
-  using IteratorA =
-      cutlass::transform::threadblock::PredicatedTileAccessIterator<
-          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-          ElementA, LayoutA, 1, ThreadMapA, AccessTypeA>;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::Array<ElementB, kAlignmentB>;
-  using IteratorB =
-      cutlass::transform::threadblock::PredicatedTileAccessIterator<
-          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-          ElementB, LayoutB, 0, ThreadMapB, AccessTypeB>;
-
-  // Define the threadblock-scoped multistage matrix multiply
-  using ThreadblockMma = cutlass::gemm::threadblock::MmaMultistage<
-      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
-      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
-      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
-      typename MmaCore::MmaPolicy, Stages>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization for SIMT IDP4A Kernels
-template <
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Operation performed by GEMM
-    typename Operator,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape>
-struct DefaultMma<int8_t, LayoutA, kAlignmentA, int8_t, LayoutB, kAlignmentB,
-                  ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
-                  ArchTag, ThreadblockShape, WarpShape, GemmShape<1, 1, 4>, 2,
-                  Operator, false, SharedMemoryClearOption::kNone,
-                  false, false, layout::NoPermute, layout::NoPermute> {
-  using InstructionShape = GemmShape<1, 1, 4>;
-  using ElementA = int8_t;
-  using ElementB = int8_t;
-  using OperatorClass =  arch::OpClassSimt;
-
-  static const bool transposeA = platform::is_same< LayoutA, layout::ColumnMajor >::value;
-  static const bool transposeB = platform::is_same< LayoutB, layout::RowMajor >::value;
-
-  // Define the MmaCore components
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
-      ElementB, LayoutB, ElementAccumulator, layout::RowMajor,
-      OperatorClass, 2, Operator>;
-
-  // Define iterators over tiles from the A operand
-  using IteratorA =
-      cutlass::transform::threadblock::PredicatedTileIterator2dThreadTile<
-          cutlass::MatrixShape<MmaCore::Shape::kM, MmaCore::Shape::kK>,
-          ElementA, LayoutA, 1, typename MmaCore::IteratorThreadMapA, transposeA>;
-
-  // Define iterators over tiles from the B operand
-  using IteratorB =
-      cutlass::transform::threadblock::PredicatedTileIterator2dThreadTile<
-          cutlass::MatrixShape<MmaCore::Shape::kK, MmaCore::Shape::kN>,
-          ElementB, LayoutB, 0, typename MmaCore::IteratorThreadMapB, transposeB>;
-
-  // Define the threadblock-scoped pipelined matrix multiply
-  using ThreadblockMma = cutlass::gemm::threadblock::MmaPipelined<
-      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
-      IteratorB, typename MmaCore::SmemIteratorB, ElementAccumulator,
-      layout::RowMajor, typename MmaCore::MmaPolicy>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-#if defined(CUTLASS_ARCH_WMMA_ENABLED)
-/// Specialization for Wmma TensorOp operator with 2 staged pipeline
-template <
-    ///< Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Layout type for C and D matrix operands
-    typename LayoutC,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Operation performed by GEMM
-    typename Operator>
-struct DefaultMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
-                  kAlignmentB, ElementAccumulator, LayoutC,
-                  arch::OpClassWmmaTensorOp, ArchTag, ThreadblockShape, WarpShape,
-                  InstructionShape, 2, Operator, false, SharedMemoryClearOption::kNone,
-                  false, false, layout::NoPermute, layout::NoPermute> {
-  // Define the MmaCore components
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
-      ElementB, LayoutB, ElementAccumulator, LayoutC,
-      arch::OpClassWmmaTensorOp, 2, Operator>;
-
-  // Define iterators over tiles from the A operand
-  using IteratorA =
-      cutlass::transform::threadblock::PredicatedTileIterator<
-          cutlass::MatrixShape<MmaCore::Shape::kM, MmaCore::Shape::kK>,
-          ElementA, LayoutA, 1, typename MmaCore::IteratorThreadMapA, kAlignmentA>;
-
-  // Define iterators over tiles from the B operand
-  using IteratorB =
-      cutlass::transform::threadblock::PredicatedTileIterator<
-          cutlass::MatrixShape<MmaCore::Shape::kK, MmaCore::Shape::kN>,
-          ElementB, LayoutB, 0, typename MmaCore::IteratorThreadMapB, kAlignmentB>;
-
-  // Define the threadblock-scoped pipelined matrix multiply
-  using ThreadblockMma = cutlass::gemm::threadblock::MmaPipelined<
-      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
-      IteratorB, typename MmaCore::SmemIteratorB, ElementAccumulator,
-      LayoutC, typename MmaCore::MmaPolicy>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization for Wmma TensorOp operator with 1 staged pipeline
-template <
-    ///< Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Layout type for C and D matrix operands
-    typename LayoutC,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Operation performed by GEMM
-    typename Operator>
-struct DefaultMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
-                  kAlignmentB, ElementAccumulator, LayoutC,
-                  arch::OpClassWmmaTensorOp, ArchTag, ThreadblockShape, WarpShape,
-                  InstructionShape, 1, Operator, false, SharedMemoryClearOption::kNone,
-                  false, false, layout::NoPermute, layout::NoPermute> {
-  // Define the MmaCore components
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
-      ElementB, LayoutB, ElementAccumulator, LayoutC,
-      arch::OpClassWmmaTensorOp, 1, Operator>; 
-
-  // Define iterators over tiles from the A operand
-  using IteratorA =
-      cutlass::transform::threadblock::PredicatedTileIterator<
-          cutlass::MatrixShape<MmaCore::Shape::kM, MmaCore::Shape::kK>,
-          ElementA, LayoutA, 1, typename MmaCore::IteratorThreadMapA, kAlignmentA>;
-
-  // Define iterators over tiles from the B operand
-  using IteratorB =
-      cutlass::transform::threadblock::PredicatedTileIterator<
-          cutlass::MatrixShape<MmaCore::Shape::kK, MmaCore::Shape::kN>,
-          ElementB, LayoutB, 0, typename MmaCore::IteratorThreadMapB, kAlignmentB>;
-
-  // Define the threadblock-scoped singlestage matrix multiply
-  using ThreadblockMma = cutlass::gemm::threadblock::MmaSingleStage<
-      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
-      IteratorB, typename MmaCore::SmemIteratorB, ElementAccumulator,
-      LayoutC, typename MmaCore::MmaPolicy>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-#endif //CUTLASS_ARCH_WMMA_ENABLED
-
-} // namespace threadblock
-} // namespace gemm
-} // namespace cutlass 
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core.h
deleted file mode 100755
index da83982f4..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines basic properties needed by CTA-level GEMMs assuming expectations about data
-      layout of the global memory fragments, data types, and internal tile sizes.
-
-      Partial specializations for threadblock::Mma operations targeting TensorOp instructions.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-
-#include "cutlass/gemm/warp/mma.h"
-#include "cutlass/gemm/threadblock/mma_pipelined.h"
-#include "cutlass/gemm/threadblock/mma_singlestage.h"
-#include "cutlass/arch/cache_operation.h" 
-#include "cutlass/arch/mma.h" 
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Template defininng default matrix multiply operators inferred from threadblock tile size,
-/// global memory data layout, and target math instruction.
-template <
-    /// Shape of threadblock-scoped matrix multiply operator
-    typename Shape,
-    /// Shape of warp-level matrix multiply operator
-    typename WarpShape,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape,
-    /// Element data type of A operand
-    typename ElementA,
-    /// Layout of operand A
-    typename LayoutA,
-    /// Element data type of B operand
-    typename ElementB,
-    /// Layout of operand B
-    typename LayoutB,
-    /// Data type of accumulator
-    typename ElementC,
-    /// Layout of accumulator
-    typename LayoutC,
-    /// Indicates type of math operator (arch::OpClassSimt or arch::OpClassTensorOp)
-    typename OperatorClass,
-    /// Number of stages
-    int Stages = 2,
-    /// Operation performed by MMA
-    typename Operator = typename platform::conditional<
-        (platform::is_same<OperatorClass,
-                           cutlass::arch::OpClassTensorOp>::value) &&
-            (platform::is_same<ElementA, int8_t>::value ||
-             platform::is_same<ElementA, int4b_t>::value ||
-             platform::is_same<ElementA, uint8_t>::value ||
-             platform::is_same<ElementA, uint4b_t>::value),
-        cutlass::arch::OpMultiplyAddSaturate,
-        cutlass::arch::OpMultiplyAdd>::type,
-    /// Store the accumulators in row major or column major.  Row major is used
-    /// when output layout is interleaved.
-    bool AccumulatorsInRowMajor = false,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA =
-        cutlass::arch::CacheOperation::Global,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB =
-        cutlass::arch::CacheOperation::Global,
-    /// per-element transformation for elements of A
-    ComplexTransform TransformA = ComplexTransform::kNone,
-    /// per-element transformation for elements of B
-    ComplexTransform TransformB = ComplexTransform::kNone,
-    bool IsComplex = false // (is_complex<ElementA>::value || is_complex<ElementB>::value)
->
-struct DefaultMmaCore;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace gemm
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_simt.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_simt.h
deleted file mode 100755
index 91f4710ed..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_simt.h
+++ /dev/null
@@ -1,1723 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines basic properties needed by CTA-level GEMMs assuming expectations about data
-      layout of the global memory fragments, data types, and internal tile sizes.
-
-      Partial specializations for threadblock::Mma operations targeting simt instructions.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/fast_math.h"
-
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-
-
-#include "cutlass/transform/pitch_linear_thread_map.h"
-#include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h"
-#include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear_2dthreadtile.h"
-
-#include "cutlass/gemm/warp/mma_simt_policy.h"
-#include "cutlass/gemm/warp/mma_simt.h"
-#include "cutlass/gemm/threadblock/default_mma_core.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-namespace detail {
-
-// convert a WarpShape which is the whole tile of elements into warp num threads.
-// The goal is for each thread's tile of elements to be as square as possible
-// for performance (4x4 will be faster than 2x8).
-template<typename WarpShape>
-constexpr int simt_get_warp_threads_m() {
-    return (WarpShape::kM > WarpShape::kN) ? 8 : 4;
-}
-
-/// Computes padding in shared memory to perform efficient transpose without bank conflicts.
-constexpr int simt_transpose_padding(int threads, int crosswise, int size_in_bits) {
-  return (size_in_bits >= 32 ?
-      threads / crosswise / (size_in_bits / 32) :
-      threads / crosswise * (32 / size_in_bits)
-  );
-}
-
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: column-major
-///   B: row-major
-///   Operator: simt class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Operation performed by GEMM
-    typename Operator_>
-struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<1, 1, 1>, ElementA_,
-                      layout::ColumnMajor, ElementB_, layout::RowMajor,
-                      ElementC_, LayoutC_, arch::OpClassSimt, 2, Operator_
-                     > {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = GemmShape<1, 1, 1>;
-  using ElementA = ElementA_;
-  using LayoutA = layout::ColumnMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::RowMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using OperatorClass = arch::OpClassSimt;
-  static int const PartitionsK = Shape::kK / WarpShape::kK;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<
-    Shape::kM / WarpShape::kM,
-    Shape::kN / WarpShape::kN,
-    PartitionsK
-  >;
-
-  // Divisility requirements
-  static_assert(
-    !(Shape::kM % WarpShape::kM) &&
-    !(Shape::kN % WarpShape::kN),
-    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
-  );
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassSimt>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  static int const kElementsPerAccess = 1;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::ColumnMajor;
-  using SmemLayoutB = layout::RowMajor;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kM, Shape::kK>,
-    kThreads,
-    kElementsPerAccess
-  >;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileIterator<
-    MatrixShape<Shape::kM, Shape::kK>, 
-    ElementA, 
-    SmemLayoutA,
-    1,
-    IteratorThreadMapA
-  >;
-
-  /// Policy of iterator B
-  using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kN, Shape::kK>,
-    kThreads,
-    kElementsPerAccess
-  >;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileIterator<
-    MatrixShape<Shape::kK, Shape::kN>, 
-    ElementB, 
-    SmemLayoutB,
-    0,
-    IteratorThreadMapB
-  >;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level op
-  static const int WarpNumThreadsM = detail::simt_get_warp_threads_m<WarpShape>();
-  static const int WarpNumThreadsN = kWarpSize / WarpNumThreadsM;
-  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
-  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
-  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
-      "WarpShape must be divisible by ThreadTile shape.");
-  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
-  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;
-  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;
-  static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM);
-  static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN);
-  // these should have max of thread tile also
-  using LaneMmaShape = cutlass::gemm::GemmShape<
-      LaneM,
-      LaneN,
-      1>;
-  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
-      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
-      cutlass::layout::RowMajorInterleaved<LaneLayout>,         // LaneLayout
-      LaneMmaShape
-  >;
-
-  using MmaWarpSimt = cutlass::gemm::warp::MmaSimt<
-    WarpShape,    /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8
-    ElementA,     /// Data type of A elements
-    SmemLayoutA,  /// Layout of A matrix (concept: MatrixLayout)
-    ElementB,     /// Data type of B elements
-    SmemLayoutB,  /// Layout of B matrix (concept: MatrixLayout)
-    ElementC,     /// Element type of C matrix
-    LayoutC,      /// Layout of C matrix (concept: MatrixLayout)
-    Policy        /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy)
-    >;            /// Used for partial specialization
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = MmaPolicy<
-    MmaWarpSimt,
-    MatrixShape<0, 0>,
-    MatrixShape<0, 0>,
-    WarpCount::kK
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: row-major
-///   B: column-major
-///   Operator: simt class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Operation performed by GEMM
-    typename Operator_>
-struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<1, 1, 1>, ElementA_,
-                      layout::RowMajor, ElementB_, layout::ColumnMajor,
-                      ElementC_, LayoutC_, arch::OpClassSimt, 2, Operator_
-                     > {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = GemmShape<1, 1, 1>;
-  using ElementA = ElementA_;
-  using LayoutA = layout::RowMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::ColumnMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using OperatorClass = arch::OpClassSimt;
-  static int const PartitionsK = Shape::kK / WarpShape::kK;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<
-    Shape::kM / WarpShape::kM,
-    Shape::kN / WarpShape::kN,
-    PartitionsK
-  >;
-
-  // Divisility requirements
-  static_assert(
-    !(Shape::kM % WarpShape::kM) &&
-    !(Shape::kN % WarpShape::kN),
-    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
-  );
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassSimt>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-  
-  static int const kElementsPerAccess = 1;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::ColumnMajor;
-  using SmemLayoutB = layout::RowMajor;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kK, Shape::kM>,
-    kThreads,
-    kElementsPerAccess
-  >;
-
-  /// Transpose the ThreadMap of iterator A
-  using SmemThreadMapA = transform::TransposePitchLinearThreadMapSimt<IteratorThreadMapA>;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileIterator<
-    MatrixShape<Shape::kM, Shape::kK>, 
-    ElementA, 
-    SmemLayoutA,
-    1,
-    SmemThreadMapA // was IteratorThreadMapA
-  >;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kK, Shape::kN>,
-    kThreads,
-    kElementsPerAccess
-  >;
-
-  /// Transpose the ThreadMap of iterator A
-  using SmemThreadMapB = transform::TransposePitchLinearThreadMapSimt<IteratorThreadMapB>;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileIterator<
-    MatrixShape<Shape::kK, Shape::kN>, 
-    ElementB, 
-    SmemLayoutB,
-    0,
-    SmemThreadMapB // was IteratorThreadMapA
-  >;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level op
-  static const int WarpNumThreadsM = detail::simt_get_warp_threads_m<WarpShape>();
-  static const int WarpNumThreadsN = kWarpSize / WarpNumThreadsM;
-  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
-  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
-  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
-      "WarpShape must be divisible by ThreadTile shape.");
-  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
-  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;
-  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;
-  static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM);
-  static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN);
-
-  static int const kPaddingM = detail::simt_transpose_padding(kWarpSize, Shape::kK, sizeof_bits<ElementA>::value);
-  static int const kPaddingN = detail::simt_transpose_padding(kWarpSize, Shape::kK, sizeof_bits<ElementB>::value);
-
-  static_assert(!(kPaddingM % LaneM) && !(kPaddingN % LaneN),
-                "Padding must be divisible by Lane");
-
-  // these should have max of thread tile also
-  using LaneMmaShape = cutlass::gemm::GemmShape<
-      LaneM,
-      LaneN,
-      1>;
-  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
-      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
-      cutlass::layout::RowMajorInterleaved<LaneLayout>,         // LaneLayout
-      LaneMmaShape
-  >;
-
-  using MmaWarpSimt = cutlass::gemm::warp::MmaSimt<
-      WarpShape,      /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8
-      ElementA,       /// Data type of A elements
-      SmemLayoutA,    /// Layout of A matrix (concept: MatrixLayout)
-      ElementB,       /// Data type of B elements
-      SmemLayoutB,    /// Layout of B matrix (concept: MatrixLayout)
-      ElementC,       /// Element type of C matrix
-      LayoutC,        /// Layout of C matrix (concept: MatrixLayout)
-      Policy          /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy)
-  >;
-
-  /// Policy used to define MmaPipelined 
-  using MmaPolicy = MmaPolicy<
-    MmaWarpSimt,
-    MatrixShape<kPaddingM, 0>,    // skew for A matrix to avoid SMEM bank conflicts
-    MatrixShape<0, kPaddingN>,    // skew for B matrix to avoid SMEM bank conflicts
-    WarpCount::kK
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: row-major
-///   B: row-major
-///   Operator: simt class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Operation performed by GEMM
-    typename Operator_>
-struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<1, 1, 1>, ElementA_,
-                      layout::RowMajor, ElementB_, layout::RowMajor, ElementC_,
-                      LayoutC_, arch::OpClassSimt, 2, Operator_
-                     > {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = GemmShape<1, 1, 1>;
-  using ElementA = ElementA_;
-  using LayoutA = layout::RowMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::RowMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using OperatorClass = arch::OpClassSimt;
-  static int const PartitionsK = Shape::kK / WarpShape::kK;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<
-    Shape::kM / WarpShape::kM,
-    Shape::kN / WarpShape::kN,
-    PartitionsK
-  >;
-
-  // Divisility requirements
-  static_assert(
-    !(Shape::kM % WarpShape::kM) &&
-    !(Shape::kN % WarpShape::kN),
-    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
-  );
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassSimt>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  static int const kElementsPerAccess = 1;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::ColumnMajor;
-  using SmemLayoutB = layout::RowMajor;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kK, Shape::kM>,
-    kThreads,
-    kElementsPerAccess
-  >;
-
-  /// Transpose the ThreadMap of iterator A
-  using SmemThreadMapA = transform::TransposePitchLinearThreadMapSimt<IteratorThreadMapA>;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileIterator<
-    MatrixShape<Shape::kM, Shape::kK>, 
-    ElementA, 
-    SmemLayoutA,
-    1,
-    SmemThreadMapA
-  >;
-
-  /// Policy of iterator B
-  using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kN, Shape::kK>,
-    kThreads,
-    kElementsPerAccess
-  >;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileIterator<
-    MatrixShape<Shape::kK, Shape::kN>, 
-    ElementB, 
-    SmemLayoutB,
-    0,
-    IteratorThreadMapB
-  >;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level op
-  static const int WarpNumThreadsM = detail::simt_get_warp_threads_m<WarpShape>();
-  static const int WarpNumThreadsN = kWarpSize / WarpNumThreadsM;
-  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
-  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
-  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
-      "WarpShape must be divisible by ThreadTile shape.");
-  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
-  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;
-  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;
-  static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM);
-  static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN);
-
-  static int const kPaddingM = detail::simt_transpose_padding(kWarpSize, Shape::kK, sizeof_bits<ElementA>::value);
-
-  static_assert(!(kPaddingM % LaneM),
-                "Padding must be divisible by Lane");
-
-  // these should have max of thread tile also
-  using LaneMmaShape = cutlass::gemm::GemmShape<
-      LaneM,
-      LaneN,
-      1>;
-  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
-      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
-      cutlass::layout::RowMajorInterleaved<LaneLayout>,         // LaneLayout
-      LaneMmaShape
-  >;
-
-  using MmaWarpSimt = cutlass::gemm::warp::MmaSimt<
-      WarpShape,    /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8
-      ElementA,     /// Data type of A elements
-      SmemLayoutA,  /// Layout of A matrix (concept: MatrixLayout)
-      ElementB,     /// Data type of B elements
-      SmemLayoutB,  /// Layout of B matrix (concept: MatrixLayout)
-      ElementC,     /// Element type of C matrix
-      LayoutC,      /// Layout of C matrix (concept: MatrixLayout)
-      Policy        /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy)
-  >;
-
-  /// Policy used to define MmaPipelined 
-  using MmaPolicy = MmaPolicy<
-    MmaWarpSimt,
-    MatrixShape<kPaddingM, 0>,    // skew for A matrix to avoid SMEM bank conflicts
-    MatrixShape<0, 0>,
-    WarpCount::kK
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: column-major
-///   B: column-major
-///   Operator: simt class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Operation performed by GEMM
-    typename Operator_>
-struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<1, 1, 1>, ElementA_,
-                      layout::ColumnMajor, ElementB_, layout::ColumnMajor,
-                      ElementC_, LayoutC_, arch::OpClassSimt, 2, Operator_
-                     > {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = GemmShape<1, 1, 1>;
-  using ElementA = ElementA_;
-  using LayoutA = layout::ColumnMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::ColumnMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using OperatorClass = arch::OpClassSimt;
-  static int const PartitionsK = Shape::kK / WarpShape::kK;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<
-    Shape::kM / WarpShape::kM,
-    Shape::kN / WarpShape::kN,
-    PartitionsK
-  >;
-
-  // Divisility requirements
-  static_assert(
-    !(Shape::kM % WarpShape::kM) &&
-    !(Shape::kN % WarpShape::kN),
-    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
-  );
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassSimt>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  static int const kElementsPerAccess = 1;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::ColumnMajor;
-  using SmemLayoutB = layout::RowMajor;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kM, Shape::kK>,
-    kThreads,
-    kElementsPerAccess
-  >;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileIterator<
-    MatrixShape<Shape::kM, Shape::kK>, 
-    ElementA,
-    SmemLayoutA,
-    1,
-    IteratorThreadMapA
-  >;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB =  transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kK, Shape::kN>,
-    kThreads,
-    kElementsPerAccess
-  >;
-
-  /// Transpose the ThreadMap of iterator A
-  using SmemThreadMapB = transform::TransposePitchLinearThreadMapSimt<IteratorThreadMapB>;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileIterator<
-    MatrixShape<Shape::kK, Shape::kN>, 
-    ElementB,
-    SmemLayoutB,
-    0,
-    SmemThreadMapB
-  >;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level op
-  static const int WarpNumThreadsM = detail::simt_get_warp_threads_m<WarpShape>();
-  static const int WarpNumThreadsN = kWarpSize / WarpNumThreadsM;
-  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
-  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
-  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
-      "WarpShape must be divisible by ThreadTile shape.");
-  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
-  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;
-  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;
-  static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM);
-  static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN);
-
-  static int const kPaddingN = detail::simt_transpose_padding(kWarpSize, Shape::kK, sizeof_bits<ElementB>::value);
-
-  static_assert(!(kPaddingN % LaneN),
-                "Padding must be divisible by Lane");
-
-  // these should have max of thread tile also
-  using LaneMmaShape = cutlass::gemm::GemmShape<
-      LaneM,
-      LaneN,
-      1>;
-  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
-      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
-      cutlass::layout::RowMajorInterleaved<LaneLayout>,         // LaneLayout
-      LaneMmaShape
-  >;
-
-  using MmaWarpSimt = cutlass::gemm::warp::MmaSimt<
-      WarpShape,    /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8
-      ElementA,     /// Data type of A elements
-      SmemLayoutA,  /// Layout of A matrix (concept: MatrixLayout)
-      ElementB,     /// Data type of B elements
-      SmemLayoutB,  /// Layout of B matrix (concept: MatrixLayout)
-      ElementC,     /// Element type of C matrix
-      LayoutC,      /// Layout of C matrix (concept: MatrixLayout)
-      Policy        /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy)
-  >;
-
-  /// Policy used to define MmaPipelined 
-  using MmaPolicy = MmaPolicy<
-    MmaWarpSimt,
-    MatrixShape<0, 0>,
-    MatrixShape<0, kPaddingN>, // skew for B matrix to avoid SMEM bank conflicts
-    WarpCount::kK
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: column-major
-///   B: row-major
-///   Operator: simt class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Operation performed by GEMM
-    typename Operator_>
-struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<1, 1, 1>, ElementA_,
-                      layout::AffineRank2ColumnMajor, ElementB_, layout::AffineRank2RowMajor,
-                      ElementC_, LayoutC_, arch::OpClassSimt, 2, Operator_
-                     > {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = GemmShape<1, 1, 1>;
-  using ElementA = ElementA_;
-  using LayoutA = layout::AffineRank2ColumnMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::AffineRank2RowMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using OperatorClass = arch::OpClassSimt;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  using Base = DefaultMmaCore<Shape,
-                              WarpShape,
-                              InstructionShape,
-                              ElementA,
-                              layout::ColumnMajor,
-                              ElementB,
-                              layout::RowMajor,
-                              ElementC,
-                              LayoutC,
-                              OperatorClass,
-                              2,
-                              Operator>;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = typename Base::SmemLayoutA;
-  using SmemLayoutB = typename Base::SmemLayoutB;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = typename Base::IteratorThreadMapA;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = typename Base::SmemIteratorA;
-
-  /// Policy of iterator B
-  using IteratorThreadMapB = typename Base::IteratorThreadMapB;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = typename Base::SmemIteratorB;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = typename Base::MmaPolicy;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: row-major
-///   B: column-major
-///   Operator: simt class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Operation performed by GEMM
-    typename Operator_>
-struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<1, 1, 1>, ElementA_,
-                      layout::AffineRank2RowMajor, ElementB_, layout::AffineRank2ColumnMajor,
-                      ElementC_, LayoutC_, arch::OpClassSimt, 2, Operator_
-                     > {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = GemmShape<1, 1, 1>;
-  using ElementA = ElementA_;
-  using LayoutA = layout::AffineRank2RowMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::AffineRank2ColumnMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using OperatorClass = arch::OpClassSimt;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  using Base = DefaultMmaCore<Shape,
-                              WarpShape,
-                              InstructionShape,
-                              ElementA,
-                              layout::RowMajor,
-                              ElementB,
-                              layout::ColumnMajor,
-                              ElementC,
-                              LayoutC,
-                              OperatorClass,
-                              2,
-                              Operator>;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = typename Base::SmemLayoutA;
-  using SmemLayoutB = typename Base::SmemLayoutB;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = typename Base::IteratorThreadMapA;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = typename Base::SmemIteratorA;
-
-  /// Policy of iterator B
-  using IteratorThreadMapB = typename Base::IteratorThreadMapB;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = typename Base::SmemIteratorB;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = typename Base::MmaPolicy;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: row-major
-///   B: row-major
-///   Operator: simt class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Operation performed by GEMM
-    typename Operator_>
-struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<1, 1, 1>, ElementA_,
-                      layout::AffineRank2RowMajor, ElementB_, layout::AffineRank2RowMajor, ElementC_,
-                      LayoutC_, arch::OpClassSimt, 2, Operator_
-                     > {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = GemmShape<1, 1, 1>;
-  using ElementA = ElementA_;
-  using LayoutA = layout::AffineRank2RowMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::AffineRank2RowMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using OperatorClass = arch::OpClassSimt;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  using Base = DefaultMmaCore<Shape,
-                              WarpShape,
-                              InstructionShape,
-                              ElementA,
-                              layout::RowMajor,
-                              ElementB,
-                              layout::RowMajor,
-                              ElementC,
-                              LayoutC,
-                              OperatorClass,
-                              2,
-                              Operator>;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = typename Base::SmemLayoutA;
-  using SmemLayoutB = typename Base::SmemLayoutB;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = typename Base::IteratorThreadMapA;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = typename Base::SmemIteratorA;
-
-  /// Policy of iterator B
-  using IteratorThreadMapB = typename Base::IteratorThreadMapB;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = typename Base::SmemIteratorB;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = typename Base::MmaPolicy;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: column-major
-///   B: column-major
-///   Operator: simt class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Operation performed by GEMM
-    typename Operator_>
-struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<1, 1, 1>, ElementA_,
-                      layout::AffineRank2ColumnMajor, ElementB_, layout::AffineRank2ColumnMajor,
-                      ElementC_, LayoutC_, arch::OpClassSimt, 2, Operator_
-                     > {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = GemmShape<1, 1, 1>;
-  using ElementA = ElementA_;
-  using LayoutA = layout::AffineRank2ColumnMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::AffineRank2ColumnMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using OperatorClass = arch::OpClassSimt;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  using Base = DefaultMmaCore<Shape,
-                              WarpShape,
-                              InstructionShape,
-                              ElementA,
-                              layout::ColumnMajor,
-                              ElementB,
-                              layout::ColumnMajor,
-                              ElementC,
-                              LayoutC,
-                              OperatorClass,
-                              2,
-                              Operator>;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = typename Base::SmemLayoutA;
-  using SmemLayoutB = typename Base::SmemLayoutB;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = typename Base::IteratorThreadMapA;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = typename Base::SmemIteratorA;
-
-  /// Policy of iterator B
-  using IteratorThreadMapB = typename Base::IteratorThreadMapB;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = typename Base::SmemIteratorB;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = typename Base::MmaPolicy;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: column-major
-///   B: row-major
-///   Operator: simt class, for dp4a
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Operation performed by GEMM
-    typename Operator_>
-struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<1, 1, 4>, int8_t,
-                      layout::ColumnMajor, int8_t, layout::RowMajor, ElementC_,
-                      LayoutC_, arch::OpClassSimt, 2, Operator_
-                    > {
-
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = GemmShape<1, 1, 4>;
-  using ElementA = int8_t;
-  using LayoutA = layout::ColumnMajor;
-  using ElementB = int8_t;
-  using LayoutB = layout::RowMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using OperatorClass = arch::OpClassSimt;
-  static int const PartitionsK = Shape::kK / WarpShape::kK;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<
-    Shape::kM / WarpShape::kM,
-    Shape::kN / WarpShape::kN,
-    PartitionsK
-  >;
-
-  // Divisility requirements
-  static_assert(
-    !(Shape::kM % WarpShape::kM) &&
-    !(Shape::kN % WarpShape::kN),
-    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
-  );
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassSimt>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::ColumnMajorInterleaved<4>;
-  using SmemLayoutB = layout::RowMajorInterleaved<4>;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinear2DThreadTileStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kM, Shape::kK>,
-    kThreads,
-    layout::PitchLinearShape<4, 4>
-  >;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileIterator2dThreadTile<
-    MatrixShape<Shape::kM, Shape::kK>, 
-    ElementA, 
-    SmemLayoutA,
-    1,
-    IteratorThreadMapA
-  >;
-  
-
-  /// Policy of iterator B
-  using IteratorThreadMapB = transform::PitchLinear2DThreadTileStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kN, Shape::kK>,
-    kThreads,
-    layout::PitchLinearShape<4, 4>
-  >;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileIterator2dThreadTile<
-    MatrixShape<Shape::kK, Shape::kN>, 
-    ElementB, 
-    SmemLayoutB,
-    0,
-    IteratorThreadMapB
-  >;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level op
-  static const int WarpNumThreadsM = detail::simt_get_warp_threads_m<WarpShape>();
-  static const int WarpNumThreadsN = kWarpSize / WarpNumThreadsM;
-  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
-  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
-  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
-      "WarpShape must be divisible by ThreadTile shape.");
-  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
-  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;
-  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;
-  static const int LaneM = cutlass::const_min(4, ThreadTileM);
-  static const int LaneN = cutlass::const_min(4, ThreadTileN);
-  // these should have max of thread tile also
-  using LaneMmaShape = cutlass::gemm::GemmShape<
-      LaneM,
-      LaneN,
-      4>;
-
-  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
-      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
-      cutlass::layout::ColumnMajorInterleaved<LaneLayout>,         // LaneLayout
-      LaneMmaShape
-  >;
-
-  using MmaWarpSimt = cutlass::gemm::warp::MmaSimt<
-    WarpShape,    /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8
-    ElementA,     /// Data type of A elements
-    SmemLayoutA,  /// Layout of A matrix (concept: MatrixLayout)
-    ElementB,     /// Data type of B elements
-    SmemLayoutB,  /// Layout of B matrix (concept: MatrixLayout)
-    ElementC,     /// Element type of C matrix
-    LayoutC,      /// Layout of C matrix (concept: MatrixLayout)
-    Policy,       /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy)
-    PartitionsK   /// Number of partitions along K dimension
-    >;
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = MmaPolicy<
-    MmaWarpSimt,
-    MatrixShape<0, 0>,
-    MatrixShape<0, 0>,
-    WarpCount::kK
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Partial specialization:
-//
-///
-///   A: Row-major
-///   B: Column-major
-///   Operator: simt class, for dp4a
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Operation performed by GEMM
-    typename Operator_>
-struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<1, 1, 4>, int8_t,
-                      layout::RowMajor, int8_t, layout::ColumnMajor, ElementC_,
-                      LayoutC_, arch::OpClassSimt, 2, Operator_
-                      > {
-
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = GemmShape<1, 1, 4>;
-  using ElementA = int8_t;
-  using LayoutA = layout::RowMajor;
-  using ElementB = int8_t;
-  using LayoutB = layout::ColumnMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using OperatorClass = arch::OpClassSimt;
-  static int const PartitionsK = Shape::kK / WarpShape::kK;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<
-    Shape::kM / WarpShape::kM,
-    Shape::kN / WarpShape::kN,
-    PartitionsK
-  >;
-
-  // Divisility requirements
-  static_assert(
-    !(Shape::kM % WarpShape::kM) &&
-    !(Shape::kN % WarpShape::kN),
-    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
-  );
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassSimt>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::ColumnMajorInterleaved<4>;
-  using SmemLayoutB = layout::RowMajorInterleaved<4>;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinear2DThreadTileStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kK, Shape::kM>,
-    kThreads,
-    layout::PitchLinearShape<4, 4>
-  >;
-
-  /// Transpose the ThreadMap of iterator A
-  using SmemThreadMapA = transform::TransposePitchLinearThreadMap2DThreadTile<IteratorThreadMapA>;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileIterator2dThreadTile<
-    MatrixShape<Shape::kM, Shape::kK>, 
-    ElementA, 
-    SmemLayoutA,
-    1,
-    SmemThreadMapA
-  >;
-  
-
-  /// Policy of iterator B
-  using IteratorThreadMapB = transform::PitchLinear2DThreadTileStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kK, Shape::kN>,
-    kThreads,
-    layout::PitchLinearShape<4, 4>
-  >;
-
-  /// Transpose the ThreadMap of iterator A
-  using SmemThreadMapB = transform::TransposePitchLinearThreadMap2DThreadTile<IteratorThreadMapB>;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileIterator2dThreadTile<
-    MatrixShape<Shape::kK, Shape::kN>, 
-    ElementB, 
-    SmemLayoutB,
-    0,
-    SmemThreadMapB
-  >;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level op
-  static const int WarpNumThreadsM = detail::simt_get_warp_threads_m<WarpShape>();
-  static const int WarpNumThreadsN = kWarpSize / WarpNumThreadsM;
-  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
-  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
-  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
-      "WarpShape must be divisible by ThreadTile shape.");
-  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
-  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;
-  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;
-  static const int LaneM = cutlass::const_min(4, ThreadTileM);
-  static const int LaneN = cutlass::const_min(4, ThreadTileN);
-  // these should have max of thread tile also
-  using LaneMmaShape = cutlass::gemm::GemmShape<
-      LaneM,
-      LaneN,
-      4>;
-
-  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
-      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
-      cutlass::layout::ColumnMajorInterleaved<LaneLayout>,         // LaneLayout
-      LaneMmaShape
-  >;
-
-  using MmaWarpSimt = cutlass::gemm::warp::MmaSimt<
-    WarpShape,    /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8
-    ElementA,     /// Data type of A elements
-    SmemLayoutA,  /// Layout of A matrix (concept: MatrixLayout)
-    ElementB,     /// Data type of B elements
-    SmemLayoutB,  /// Layout of B matrix (concept: MatrixLayout)
-    ElementC,     /// Element type of C matrix
-    LayoutC,      /// Layout of C matrix (concept: MatrixLayout)
-    Policy,       /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy)
-    PartitionsK   /// Number of partitions along K dimension
-    >;
-
-  static int const kPaddingM = detail::simt_transpose_padding(kWarpSize, Shape::kK, sizeof_bits<ElementA>::value);
-  static int const kPaddingN = detail::simt_transpose_padding(kWarpSize, Shape::kK, sizeof_bits<ElementB>::value);
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = MmaPolicy<
-    MmaWarpSimt,
-    MatrixShape<kPaddingM, 0>,
-    MatrixShape<0, kPaddingN>,
-    WarpCount::kK
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Partial specialization:
-//
-///
-///   A: Row-major
-///   B: Row-major
-///   Operator: simt class, for dp4a
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Operation performed by GEMM
-    typename Operator_>
-struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<1, 1, 4>, int8_t,
-                      layout::RowMajor, int8_t, layout::RowMajor, ElementC_,
-                      LayoutC_, arch::OpClassSimt, 2, Operator_
-                      > {
-
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = GemmShape<1, 1, 4>;
-  using ElementA = int8_t;
-  using LayoutA = layout::RowMajor;
-  using ElementB = int8_t;
-  using LayoutB = layout::RowMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using OperatorClass = arch::OpClassSimt;
-  static int const PartitionsK = Shape::kK / WarpShape::kK;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<
-    Shape::kM / WarpShape::kM,
-    Shape::kN / WarpShape::kN,
-    PartitionsK
-  >;
-
-  // Divisility requirements
-  static_assert(
-    !(Shape::kM % WarpShape::kM) &&
-    !(Shape::kN % WarpShape::kN),
-    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
-  );
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassSimt>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::ColumnMajorInterleaved<4>;
-  using SmemLayoutB = layout::RowMajorInterleaved<4>;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinear2DThreadTileStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kK, Shape::kM>,
-    kThreads,
-    layout::PitchLinearShape<4, 4>
-  >;
-
-  /// Transpose the ThreadMap of iterator A
-  using SmemThreadMapA = transform::TransposePitchLinearThreadMap2DThreadTile<IteratorThreadMapA>;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileIterator2dThreadTile<
-    MatrixShape<Shape::kM, Shape::kK>, 
-    ElementA, 
-    SmemLayoutA,
-    1,
-    SmemThreadMapA
-  >;
-  
-  /// Policy of iterator B
-  using IteratorThreadMapB = transform::PitchLinear2DThreadTileStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kN, Shape::kK>,
-    kThreads,
-    layout::PitchLinearShape<4, 4>
-  >;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileIterator2dThreadTile<
-    MatrixShape<Shape::kK, Shape::kN>, 
-    ElementB, 
-    SmemLayoutB,
-    0,
-    IteratorThreadMapB
-  >;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level op
-  static const int WarpNumThreadsM = detail::simt_get_warp_threads_m<WarpShape>();
-  static const int WarpNumThreadsN = kWarpSize / WarpNumThreadsM;
-  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
-  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
-  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
-      "WarpShape must be divisible by ThreadTile shape.");
-  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
-  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;
-  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;
-  static const int LaneM = cutlass::const_min(4, ThreadTileM);
-  static const int LaneN = cutlass::const_min(4, ThreadTileN);
-  // these should have max of thread tile also
-  using LaneMmaShape = cutlass::gemm::GemmShape<
-      LaneM,
-      LaneN,
-      4>;
-
-  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
-      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
-      cutlass::layout::ColumnMajorInterleaved<LaneLayout>,         // LaneLayout
-      LaneMmaShape
-  >;
-
-  using MmaWarpSimt = cutlass::gemm::warp::MmaSimt<
-    WarpShape,    /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8
-    ElementA,     /// Data type of A elements
-    SmemLayoutA,  /// Layout of A matrix (concept: MatrixLayout)
-    ElementB,     /// Data type of B elements
-    SmemLayoutB,  /// Layout of B matrix (concept: MatrixLayout)
-    ElementC,     /// Element type of C matrix
-    LayoutC,      /// Layout of C matrix (concept: MatrixLayout)
-    Policy,       /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy)
-    PartitionsK   /// Number of partitions along K dimension
-    >;
-
-  static int const kPaddingM = detail::simt_transpose_padding(kWarpSize, Shape::kK, sizeof_bits<ElementA>::value);
-  static int const kPaddingN = detail::simt_transpose_padding(kWarpSize, Shape::kK, sizeof_bits<ElementB>::value);
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = MmaPolicy<
-    MmaWarpSimt,
-    MatrixShape<kPaddingM, 0>,
-    MatrixShape<0, 0>,
-    WarpCount::kK
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Partial specialization:
-//
-///
-///   A: Column-major
-///   B: Column-major
-///   Operator: simt class, for dp4a
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Operation performed by GEMM
-    typename Operator_>
-struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<1, 1, 4>, int8_t,
-                      layout::ColumnMajor, int8_t, layout::ColumnMajor, ElementC_,
-                      LayoutC_, arch::OpClassSimt, 2, Operator_
-                      > {
-
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = GemmShape<1, 1, 4>;
-  using ElementA = int8_t;
-  using LayoutA = layout::ColumnMajor;
-  using ElementB = int8_t;
-  using LayoutB = layout::ColumnMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using OperatorClass = arch::OpClassSimt;
-  static int const PartitionsK = Shape::kK / WarpShape::kK;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<
-    Shape::kM / WarpShape::kM,
-    Shape::kN / WarpShape::kN,
-    PartitionsK
-  >;
-
-  // Divisility requirements
-  static_assert(
-    !(Shape::kM % WarpShape::kM) &&
-    !(Shape::kN % WarpShape::kN),
-    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
-  );
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassSimt>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::ColumnMajorInterleaved<4>;
-  using SmemLayoutB = layout::RowMajorInterleaved<4>;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinear2DThreadTileStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kM, Shape::kK>,
-    kThreads,
-    layout::PitchLinearShape<4, 4>
-  >;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileIterator2dThreadTile<
-    MatrixShape<Shape::kM, Shape::kK>, 
-    ElementA, 
-    SmemLayoutA,
-    1,
-    IteratorThreadMapA
-  >;
-  
-
-  /// Policy of iterator B
-  using IteratorThreadMapB = transform::PitchLinear2DThreadTileStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kK, Shape::kN>,
-    kThreads,
-    layout::PitchLinearShape<4, 4>
-  >;
-
-  /// Transpose the ThreadMap of iterator A
-  using SmemThreadMapB = transform::TransposePitchLinearThreadMap2DThreadTile<IteratorThreadMapB>;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileIterator2dThreadTile<
-    MatrixShape<Shape::kK, Shape::kN>, 
-    ElementB, 
-    SmemLayoutB,
-    0,
-    SmemThreadMapB
-  >;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level op
-  static const int WarpNumThreadsM = detail::simt_get_warp_threads_m<WarpShape>();
-  static const int WarpNumThreadsN = kWarpSize / WarpNumThreadsM;
-  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
-  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
-  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
-      "WarpShape must be divisible by ThreadTile shape.");
-  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
-  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;
-  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;
-  static const int LaneM = cutlass::const_min(4, ThreadTileM);
-  static const int LaneN = cutlass::const_min(4, ThreadTileN);
-  // these should have max of thread tile also
-  using LaneMmaShape = cutlass::gemm::GemmShape<
-      LaneM,
-      LaneN,
-      4>;
-
-  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
-      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
-      cutlass::layout::ColumnMajorInterleaved<LaneLayout>,         // LaneLayout
-      LaneMmaShape
-  >;
-
-  using MmaWarpSimt = cutlass::gemm::warp::MmaSimt<
-    WarpShape,    /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8
-    ElementA,     /// Data type of A elements
-    SmemLayoutA,  /// Layout of A matrix (concept: MatrixLayout)
-    ElementB,     /// Data type of B elements
-    SmemLayoutB,  /// Layout of B matrix (concept: MatrixLayout)
-    ElementC,     /// Element type of C matrix
-    LayoutC,      /// Layout of C matrix (concept: MatrixLayout)
-    Policy,       /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy)
-    PartitionsK   /// Number of partitions along K dimension
-    >;
-
-  static int const kPaddingM = detail::simt_transpose_padding(kWarpSize, Shape::kK, sizeof_bits<ElementA>::value);
-  static int const kPaddingN = detail::simt_transpose_padding(kWarpSize, Shape::kK, sizeof_bits<ElementB>::value);
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = MmaPolicy<
-    MmaWarpSimt,
-    MatrixShape<0, 0>,
-    MatrixShape<0, kPaddingN>,
-    WarpCount::kK
-  >;
-};
-
-} // namespace threadblock
-} // namespace gemm
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_sm70.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_sm70.h
deleted file mode 100755
index 41000dc18..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_sm70.h
+++ /dev/null
@@ -1,682 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines basic properties needed by CTA-level GEMMs assuming expectations about data
-      layout of the global memory fragments, data types, and internal tile sizes.
-
-      Partial specializations for threadblock::Mma operations targeting TensorOp instructions.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-
-
-#include "cutlass/layout/tensor_op_multiplicand_sm70.h"
-#include "cutlass/transform/pitch_linear_thread_map.h"
-#include "cutlass/transform/threadblock/regular_tile_iterator_tensor_op_sm70.h"
-
-#include "cutlass/gemm/warp/mma_tensor_op_sm70.h"
-#include "cutlass/gemm/threadblock/default_mma_core.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: column-major
-///   B: row-major
-///   Operator: tensor op class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Operation performed by GEMM
-    typename Operator_>
-struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<8, 8, 4>, ElementA_,
-                      layout::ColumnMajor, ElementB_, layout::RowMajor,
-                      ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_
-                      > {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = GemmShape<8, 8, 4>;
-  using ElementA = ElementA_;
-  using LayoutA = layout::ColumnMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::RowMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using OperatorClass = arch::OpClassTensorOp;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<
-    Shape::kM / WarpShape::kM,
-    Shape::kN / WarpShape::kN,
-    Shape::kK / WarpShape::kK
-  >;
-
-  // Divisility requirements
-  static_assert(
-    !(Shape::kM % WarpShape::kM) &&
-    !(Shape::kN % WarpShape::kN),
-    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
-  );
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped access
-  static int const kAccessSizeInBits = 128;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = 
-    layout::ColumnMajorVoltaTensorOpMultiplicandCongruous<
-      sizeof_bits<ElementA>::value>;
-
-  // Shared memory layout
-  using SmemLayoutB = 
-    layout::RowMajorVoltaTensorOpMultiplicandBCongruous<
-      sizeof_bits<ElementB>::value>;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
-    layout::PitchLinearShape<Shape::kM, Shape::kK>,
-    kThreads,
-    layout::PitchLinearShape<8, 4>,
-    kAccessSizeInBits / sizeof_bits<ElementA>::value
-  >;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileIterator<
-    MatrixShape<Shape::kM, Shape::kK>, 
-    ElementA, 
-    SmemLayoutA,
-    1,
-    IteratorThreadMapA
-  >;
-
-  /// Policy of iterator B
-  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
-    layout::PitchLinearShape<Shape::kN, Shape::kK>,
-    kThreads,
-    layout::PitchLinearShape<8, 4>,
-    kAccessSizeInBits / sizeof_bits<ElementB>::value
-  >;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileIterator<
-    MatrixShape<Shape::kK, Shape::kN>, 
-    ElementB, 
-    SmemLayoutB,
-    0,
-    IteratorThreadMapB
-  >;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
-    cutlass::arch::Mma<
-      cutlass::gemm::GemmShape<16, 16, 4>,
-      32,
-      ElementA,
-      LayoutA,
-      ElementB,
-      LayoutB,
-      ElementC,
-      cutlass::layout::RowMajor,
-      cutlass::arch::OpMultiplyAdd
-    >,
-    cutlass::MatrixShape<1, 1>
-  >;
-
-  using MmaTensorOp = cutlass::gemm::warp::MmaVoltaTensorOp<
-    WarpShape,
-    ElementA,
-    SmemLayoutA,
-    ElementB,
-    SmemLayoutB,
-    ElementC,
-    LayoutC,
-    Policy
-  >;
-
-  /// Policy used to define MmaPipelined 
-  using MmaPolicy = MmaPolicy<
-    MmaTensorOp,
-    MatrixShape<0, 0>,
-    MatrixShape<0, 0>,
-    WarpCount::kK
-  >;
-};
-
-/// Partial specialization:
-///
-///   A: row-major
-///   B: column-major
-///   Operator: tensor op class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Operation performed by GEMM
-    typename Operator_>
-struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<8, 8, 4>, ElementA_,
-                      layout::RowMajor, ElementB_, layout::ColumnMajor,
-                      ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_
-                      > {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = GemmShape<8, 8, 4>;
-  using ElementA = ElementA_;
-  using LayoutA = layout::RowMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::ColumnMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using OperatorClass = arch::OpClassTensorOp;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<
-    Shape::kM / WarpShape::kM,
-    Shape::kN / WarpShape::kN,
-    Shape::kK / WarpShape::kK
-  >;
-
-  // Divisility requirements
-  static_assert(
-    !(Shape::kM % WarpShape::kM) &&
-    !(Shape::kN % WarpShape::kN),
-    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
-  );
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped access
-  static int const kAccessSizeInBits = 128;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::RowMajorVoltaTensorOpMultiplicandCrosswise<
-      sizeof_bits<ElementA>::value, Shape::kK>;
-
-  // Shared memory layout
-  using SmemLayoutB = layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise<
-      sizeof_bits<ElementB>::value, Shape::kK>;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
-    layout::PitchLinearShape<Shape::kK, Shape::kM>,
-    kThreads,
-    layout::PitchLinearShape<4, 8>,
-    kAccessSizeInBits / sizeof_bits<ElementA>::value
-  >;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileIterator<
-    MatrixShape<Shape::kM, Shape::kK>, 
-    ElementA, 
-    SmemLayoutA,
-    0,
-    IteratorThreadMapA
-  >;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
-    layout::PitchLinearShape<Shape::kK, Shape::kN>,
-    kThreads,
-    layout::PitchLinearShape<4, 8>,
-    kAccessSizeInBits / sizeof_bits<ElementB>::value
-  >;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileIterator<
-    MatrixShape<Shape::kK, Shape::kN>, 
-    ElementB, 
-    SmemLayoutB,
-    1,
-    IteratorThreadMapB
-  >;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
-    cutlass::arch::Mma<
-      cutlass::gemm::GemmShape<16, 16, 4>,
-      32,
-      ElementA,
-      LayoutA,
-      ElementB,
-      LayoutB,
-      ElementC,
-      cutlass::layout::RowMajor,
-      cutlass::arch::OpMultiplyAdd
-    >,
-    cutlass::MatrixShape<1, 1>
-  >;
-
-  using MmaTensorOp = cutlass::gemm::warp::MmaVoltaTensorOp<
-    WarpShape,
-    ElementA,
-    SmemLayoutA,
-    ElementB,
-    SmemLayoutB,
-    ElementC,
-    LayoutC,
-    Policy
-  >;
-
-  /// Policy used to define MmaPipelined 
-  using MmaPolicy = MmaPolicy<
-    MmaTensorOp,
-    MatrixShape<0, 0>,
-    MatrixShape<0, 0>,
-    WarpCount::kK
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: row-major
-///   B: row-major
-///   Operator: tensor op class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Operation performed by GEMM
-    typename Operator_>
-struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<8, 8, 4>, ElementA_,
-                      layout::RowMajor, ElementB_, layout::RowMajor, ElementC_,
-                      LayoutC_, arch::OpClassTensorOp, 2, Operator_
-                      > {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = GemmShape<8, 8, 4>;
-  using ElementA = ElementA_;
-  using LayoutA = layout::RowMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::RowMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using OperatorClass = arch::OpClassTensorOp;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<
-    Shape::kM / WarpShape::kM,
-    Shape::kN / WarpShape::kN,
-    Shape::kK / WarpShape::kK
-  >;
-
-  // Divisility requirements
-  static_assert(
-    !(Shape::kM % WarpShape::kM) &&
-    !(Shape::kN % WarpShape::kN),
-    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
-  );
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped access
-  static int const kAccessSizeInBits = 128;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::RowMajorVoltaTensorOpMultiplicandCrosswise<
-      sizeof_bits<ElementA>::value, Shape::kK>;
-
-  // Shared memory layout
-  using SmemLayoutB = layout::RowMajorVoltaTensorOpMultiplicandBCongruous<
-      sizeof_bits<ElementB>::value>;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
-    layout::PitchLinearShape<Shape::kK, Shape::kM>,
-    kThreads,
-    layout::PitchLinearShape<4, 8>,
-    kAccessSizeInBits / sizeof_bits<ElementA>::value
-  >;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileIterator<
-    MatrixShape<Shape::kM, Shape::kK>, 
-    ElementA, 
-    SmemLayoutA,
-    0,
-    IteratorThreadMapA
-  >;
-
-  /// Policy of iterator B
-  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
-    layout::PitchLinearShape<Shape::kN, Shape::kK>,
-    kThreads,
-    layout::PitchLinearShape<8, 4>,
-    kAccessSizeInBits / sizeof_bits<ElementB>::value
-  >;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileIterator<
-    MatrixShape<Shape::kK, Shape::kN>, 
-    ElementB, 
-    SmemLayoutB,
-    0,
-    IteratorThreadMapB
-  >;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
-    cutlass::arch::Mma<
-      cutlass::gemm::GemmShape<16, 16, 4>,
-      32,
-      ElementA,
-      LayoutA,
-      ElementB,
-      LayoutB,
-      ElementC,
-      cutlass::layout::RowMajor,
-      cutlass::arch::OpMultiplyAdd
-    >,
-    cutlass::MatrixShape<1, 1>
-  >;
-
-  using MmaTensorOp = cutlass::gemm::warp::MmaVoltaTensorOp<
-    WarpShape,
-    ElementA,
-    SmemLayoutA,
-    ElementB,
-    SmemLayoutB,
-    ElementC,
-    LayoutC,
-    Policy
-  >;
-
-  /// Policy used to define MmaPipelined 
-  using MmaPolicy = MmaPolicy<
-    MmaTensorOp,
-    MatrixShape<0, 0>,
-    MatrixShape<0, 0>,
-    WarpCount::kK
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: column-major
-///   B: column-major
-///   Operator: tensor op class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Operation performed by GEMM
-    typename Operator_>
-struct DefaultMmaCore<Shape_, WarpShape_, GemmShape<8, 8, 4>, ElementA_,
-                      layout::ColumnMajor, ElementB_, layout::ColumnMajor,
-                      ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_
-                      > {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = GemmShape<8, 8, 4>;
-  using ElementA = ElementA_;
-  using LayoutA = layout::ColumnMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::ColumnMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using OperatorClass = arch::OpClassTensorOp;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<
-    Shape::kM / WarpShape::kM,
-    Shape::kN / WarpShape::kN,
-    Shape::kK / WarpShape::kK
-  >;
-
-  // Divisility requirements
-  static_assert(
-    !(Shape::kM % WarpShape::kM) &&
-    !(Shape::kN % WarpShape::kN),
-    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
-  );
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped access
-  static int const kAccessSizeInBits = 128;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::ColumnMajorVoltaTensorOpMultiplicandCongruous<
-      sizeof_bits<ElementA>::value>;
-
-  // Shared memory layout
-  using SmemLayoutB = layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise<
-      sizeof_bits<ElementB>::value, Shape::kK>;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
-    layout::PitchLinearShape<Shape::kM, Shape::kK>,
-    kThreads,
-    layout::PitchLinearShape<8, 4>,
-    kAccessSizeInBits / sizeof_bits<ElementA>::value
-  >;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileIterator<
-    MatrixShape<Shape::kM, Shape::kK>, 
-    ElementA, 
-    SmemLayoutA,
-    1,
-    IteratorThreadMapA
-  >;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
-    layout::PitchLinearShape<Shape::kK, Shape::kN>,
-    kThreads,
-    layout::PitchLinearShape<4, 8>,
-    kAccessSizeInBits / sizeof_bits<ElementB>::value
-  >;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileIterator<
-    MatrixShape<Shape::kK, Shape::kN>, 
-    ElementB, 
-    SmemLayoutB,
-    1,
-    IteratorThreadMapB
-  >;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
-    cutlass::arch::Mma<
-      cutlass::gemm::GemmShape<16, 16, 4>,
-      32,
-      ElementA,
-      LayoutA,
-      ElementB,
-      LayoutB,
-      ElementC,
-      cutlass::layout::RowMajor,
-      cutlass::arch::OpMultiplyAdd
-    >,
-    cutlass::MatrixShape<1, 1>
-  >;
-
-  using MmaTensorOp = cutlass::gemm::warp::MmaVoltaTensorOp<
-    WarpShape,
-    ElementA,
-    SmemLayoutA,
-    ElementB,
-    SmemLayoutB,
-    ElementC,
-    LayoutC,
-    Policy
-  >;
-
-  /// Policy used to define MmaPipelined 
-  using MmaPolicy = MmaPolicy<
-    MmaTensorOp,
-    MatrixShape<0, 0>,
-    MatrixShape<0, 0>,
-    WarpCount::kK
-  >;
-};
-
-} // namespace threadblock
-} // namespace gemm
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_sm75.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_sm75.h
deleted file mode 100755
index 0162ef0df..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_sm75.h
+++ /dev/null
@@ -1,1315 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines basic properties needed by CTA-level GEMMs assuming expectations about data
-      layout of the global memory fragments, data types, and internal tile sizes.
-
-      Partial specializations for threadblock::Mma operations targeting TensorOp instructions.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/platform/platform.h"
-
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-
-#include "cutlass/layout/tensor_op_multiplicand_sm75.h"
-#include "cutlass/transform/pitch_linear_thread_map.h"
-#include "cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h"
-
-#include "cutlass/gemm/warp/default_mma_tensor_op.h"
-#include "cutlass/gemm/threadblock/default_mma_core.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: column-major
-///   B: row-major
-///   Operator: tensor op class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Operation performed by GEMM
-    typename Operator_>
-struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
-                      layout::ColumnMajor, ElementB_, layout::RowMajor,
-                      ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_
-                      > {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = ElementA_;
-  using LayoutA = layout::ColumnMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::RowMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using OperatorClass = arch::OpClassTensorOp;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<
-    Shape::kM / WarpShape::kM,
-    Shape::kN / WarpShape::kN,
-    Shape::kK / WarpShape::kK
-  >;
-
-  // Divisility requirements
-  static_assert(
-    !(Shape::kM % WarpShape::kM) &&
-    !(Shape::kN % WarpShape::kN),
-    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
-  );
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped access
-  static int const kAccessSizeInBits = 128;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  // Warp thread arrangement
-  static int const kWarpThreadArrangementContiguousA =
-      platform::min(Shape::kM / (kAccessSizeInBits / sizeof_bits<ElementA>::value), 8);
-
-  static int const kWarpThreadArrangementStridedA =
-      kWarpSize / kWarpThreadArrangementContiguousA;
-
-  static int const kWarpThreadArrangementContiguousB =
-      platform::min(Shape::kN / (kAccessSizeInBits / sizeof_bits<ElementB>::value), 8);
-
-  static int const kWarpThreadArrangementStridedB =
-      kWarpSize / kWarpThreadArrangementContiguousB;
-
-  //
-  // Shared memory layouts
-  //
-  static int const Crosswise_A = platform::min(int(128 / sizeof(ElementA)),
-                                               Shape::kM);
-  using SmemLayoutA = 
-    layout::ColumnMajorTensorOpMultiplicandCongruous<
-      sizeof_bits<ElementA>::value, Crosswise_A>;
-
-  // Shared memory layout
-  static int const Crosswise_B = platform::min(int(128 / sizeof(ElementB)),
-                                               Shape::kN);
-  using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous<
-    sizeof_bits<ElementB>::value, Crosswise_B>;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
-    layout::PitchLinearShape<Shape::kM, Shape::kK>,
-    kThreads,
-    layout::PitchLinearShape<kWarpThreadArrangementContiguousA,
-                             kWarpThreadArrangementStridedA>,
-    kAccessSizeInBits / sizeof_bits<ElementA>::value
-  >;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileIterator<
-    MatrixShape<Shape::kM, Shape::kK>, 
-    ElementA, 
-    SmemLayoutA,
-    1,
-    IteratorThreadMapA
-  >;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
-    layout::PitchLinearShape<Shape::kN, Shape::kK>,
-    kThreads,
-    layout::PitchLinearShape<kWarpThreadArrangementContiguousB,
-                             kWarpThreadArrangementStridedB>,
-    kAccessSizeInBits / sizeof_bits<ElementB>::value
-  >;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileIterator<
-    MatrixShape<Shape::kK, Shape::kN>, 
-    ElementB, 
-    SmemLayoutB,
-    0,
-    IteratorThreadMapB
-  >;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
-      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
-      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
-
-  /// Policy used to define MmaPipelined 
-  using MmaPolicy = MmaPolicy<
-    MmaTensorOp,
-    MatrixShape<0, 0>,
-    MatrixShape<0, 0>,
-    WarpCount::kK
-  >;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: row-major
-///   B: column-major
-///   Operator: tensor op class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Operation performed by MMA
-    typename Operator_>
-struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
-                      layout::RowMajor, ElementB_, layout::ColumnMajor,
-                      ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_
-                      > {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = ElementA_;
-  using LayoutA = layout::RowMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::ColumnMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using OperatorClass = arch::OpClassTensorOp;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<
-    Shape::kM / WarpShape::kM,
-    Shape::kN / WarpShape::kN,
-    Shape::kK / WarpShape::kK
-  >;
-
-  // Divisility requirements
-  static_assert(
-    !(Shape::kM % WarpShape::kM) &&
-    !(Shape::kN % WarpShape::kN),
-    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
-  );
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped access
-  static int const kAccessSizeInBits = 128;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  // Warp thread arrangement 
-  static int const kWarpThreadArrangementContiguousA =
-      Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementA>::value);
-
-  static int const kWarpThreadArrangementStridedA =
-      kWarpSize / kWarpThreadArrangementContiguousA;
-
-  static int const kWarpThreadArrangementContiguousB =
-      Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementB>::value);
-
-  static int const kWarpThreadArrangementStridedB =
-      kWarpSize / kWarpThreadArrangementContiguousB;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise<
-      sizeof_bits<ElementA>::value, Shape::kK>;
-
-  // Shared memory layout
-  using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise<
-      sizeof_bits<ElementB>::value, Shape::kK>;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kK, Shape::kM>, kThreads,
-      layout::PitchLinearShape<kWarpThreadArrangementContiguousA,
-                               kWarpThreadArrangementStridedA>,
-      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileIterator<
-    MatrixShape<Shape::kM, Shape::kK>, 
-    ElementA, 
-    SmemLayoutA,
-    0,
-    IteratorThreadMapA
-  >;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kK, Shape::kN>, kThreads,
-      layout::PitchLinearShape<kWarpThreadArrangementContiguousB,
-                               kWarpThreadArrangementStridedB>,
-      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileIterator<
-    MatrixShape<Shape::kK, Shape::kN>, 
-    ElementB, 
-    SmemLayoutB,
-    1,
-    IteratorThreadMapB
-  >;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
-      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
-      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
-
-  /// Policy used to define MmaPipelined 
-  using MmaPolicy = MmaPolicy<
-    MmaTensorOp,
-    MatrixShape<0, 0>,
-    MatrixShape<0, 0>,
-    WarpCount::kK
-  >;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: row-major
-///   B: row-major
-///   Operator: tensor op class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Operation performed by MMA
-    typename Operator_>
-struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
-                      layout::RowMajor, ElementB_, layout::RowMajor, ElementC_,
-                      LayoutC_, arch::OpClassTensorOp, 2, Operator_
-                      > {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = ElementA_;
-  using LayoutA = layout::RowMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::RowMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using OperatorClass = arch::OpClassTensorOp;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<
-    Shape::kM / WarpShape::kM,
-    Shape::kN / WarpShape::kN,
-    Shape::kK / WarpShape::kK
-  >;
-
-  // Divisility requirements
-  static_assert(
-    !(Shape::kM % WarpShape::kM) &&
-    !(Shape::kN % WarpShape::kN),
-    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
-  );
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped access
-  static int const kAccessSizeInBits = 128;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  // Warp thread arrangement 
-  static int const kWarpThreadArrangementContiguousA =
-      Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementA>::value);
-
-  static int const kWarpThreadArrangementStridedA =
-      kWarpSize / kWarpThreadArrangementContiguousA;
-
-  static int const kWarpThreadArrangementContiguousB =
-      platform::min(Shape::kN / (kAccessSizeInBits / sizeof_bits<ElementB>::value), 8);
-
-  static int const kWarpThreadArrangementStridedB =
-      kWarpSize / kWarpThreadArrangementContiguousB;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise<
-      sizeof_bits<ElementA>::value, Shape::kK>;
-
-  // Shared memory layout
-  static int const Crosswise_B = platform::min(int(128 / sizeof(ElementB)),
-                                               Shape::kN);
-
-  using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous<
-      sizeof_bits<ElementB>::value, Crosswise_B>;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kK, Shape::kM>, kThreads,
-      layout::PitchLinearShape<kWarpThreadArrangementContiguousA,
-                               kWarpThreadArrangementStridedA>,
-      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileIterator<
-    MatrixShape<Shape::kM, Shape::kK>, 
-    ElementA, 
-    SmemLayoutA,
-    0,
-    IteratorThreadMapA
-  >;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
-    layout::PitchLinearShape<Shape::kN, Shape::kK>,
-    kThreads,
-    layout::PitchLinearShape<kWarpThreadArrangementContiguousB,
-                             kWarpThreadArrangementStridedB>,
-    kAccessSizeInBits / sizeof_bits<ElementB>::value
-  >;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileIterator<
-    MatrixShape<Shape::kK, Shape::kN>, 
-    ElementB, 
-    SmemLayoutB,
-    0,
-    IteratorThreadMapB
-  >;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
-      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
-      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
-
-  /// Policy used to define MmaPipelined 
-  using MmaPolicy = MmaPolicy<
-    MmaTensorOp,
-    MatrixShape<0, 0>,
-    MatrixShape<0, 0>,
-    WarpCount::kK
-  >;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: column-major
-///   B: column-major
-///   Operator: tensor op class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Operation performed by MMA
-    typename Operator_>
-struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
-                      layout::ColumnMajor, ElementB_, layout::ColumnMajor,
-                      ElementC_, LayoutC_, arch::OpClassTensorOp, 2, Operator_
-                      > {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = ElementA_;
-  using LayoutA = layout::ColumnMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::ColumnMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using OperatorClass = arch::OpClassTensorOp;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
-                              Shape::kN / WarpShape::kN, 
-                              Shape::kK / WarpShape::kK>;
-
-  // Divisility requirements
-  static_assert(
-      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
-      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped access
-  static int const kAccessSizeInBits = 128;
-
-  /// Default Operator
-  using Operator = Operator_; 
-
-  // Warp thread arrangement 
-  static int const kWarpThreadArrangementContiguousA =
-      platform::min(Shape::kM / (kAccessSizeInBits / sizeof_bits<ElementA>::value), 8);
-
-  static int const kWarpThreadArrangementStridedA =
-      kWarpSize / kWarpThreadArrangementContiguousA;
-
-  static int const kWarpThreadArrangementContiguousB =
-      Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementA>::value);
-
-  static int const kWarpThreadArrangementStridedB =
-      kWarpSize / kWarpThreadArrangementContiguousB;
-
-  //
-  // Shared memory layouts
-  //
-  static int const Crosswise_A = platform::min(int(128 / sizeof(ElementA)),
-                                               Shape::kM);
-  using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous<
-      sizeof_bits<ElementA>::value, Crosswise_A>;
-
-  // Shared memory layout
-  using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise<
-      sizeof_bits<ElementB>::value, Shape::kK>;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kM, Shape::kK>, kThreads,
-      layout::PitchLinearShape<kWarpThreadArrangementContiguousA,
-                               kWarpThreadArrangementStridedA>,
-      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileIterator<
-      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 1,
-      IteratorThreadMapA>;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kK, Shape::kN>, kThreads,
-      layout::PitchLinearShape<kWarpThreadArrangementContiguousB,
-                               kWarpThreadArrangementStridedB>,
-      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileIterator<
-      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 1,
-      IteratorThreadMapB>;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
-      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
-      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
-                                       MatrixShape<0, 0>, WarpCount::kK>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-/// Below is for arch::OpMultiplyAddFastF16
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: column-major
-///   B: row-major
-///   Operator: tensor op class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Layout of accumulator
-    typename LayoutC_>
-struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, float,
-                      layout::ColumnMajor, float, layout::RowMajor, float,
-                      LayoutC_, arch::OpClassTensorOp, 2,
-                      arch::OpMultiplyAddFastF16> {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = float;
-  using LayoutA = layout::ColumnMajor;
-  using ElementB = float;
-  using LayoutB = layout::RowMajor;
-  using ElementC = float;
-  using LayoutC = LayoutC_;
-  using OperatorClass = arch::OpClassTensorOp;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<
-    Shape::kM / WarpShape::kM,
-    Shape::kN / WarpShape::kN,
-    Shape::kK / WarpShape::kK
-  >;
-
-  // Divisility requirements
-  static_assert(
-    !(Shape::kM % WarpShape::kM) &&
-    !(Shape::kN % WarpShape::kN),
-    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
-  );
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped access
-  static int const kAccessSizeInBits = 256;
-
-  /// Default Operator
-  using Operator = arch::OpMultiplyAdd;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous<
-      sizeof_bits<half_t>::value, int(128 / sizeof(half_t))>;
-
-  // Shared memory layout
-  using SmemLayoutB =
-      layout::RowMajorTensorOpMultiplicandCongruous<sizeof_bits<half_t>::value,
-                                                    int(128 / sizeof(half_t))>;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
-    layout::PitchLinearShape<Shape::kM, Shape::kK>,
-    kThreads,
-    layout::PitchLinearShape<8, 4>,
-    kAccessSizeInBits / sizeof_bits<ElementA>::value
-  >;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileIterator<
-    MatrixShape<Shape::kM, Shape::kK>, 
-    half_t, 
-    SmemLayoutA,
-    1,
-    IteratorThreadMapA
-  >;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
-    layout::PitchLinearShape<Shape::kN, Shape::kK>,
-    kThreads,
-    layout::PitchLinearShape<8, 4>,
-    kAccessSizeInBits / sizeof_bits<ElementB>::value
-  >;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileIterator<
-    MatrixShape<Shape::kK, Shape::kN>, 
-    half_t, 
-    SmemLayoutB,
-    0,
-    IteratorThreadMapB
-  >;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
-      WarpShape, InstructionShape, half_t, SmemLayoutA, half_t, SmemLayoutB,
-      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
-
-  /// Policy used to define MmaPipelined 
-  using MmaPolicy = MmaPolicy<
-    MmaTensorOp,
-    MatrixShape<0, 0>,
-    MatrixShape<0, 0>,
-    WarpCount::kK
-  >;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: row-major
-///   B: column-major
-///   Operator: tensor op class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Layout of accumulator
-    typename LayoutC_>
-struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, float,
-                      layout::RowMajor, float, layout::ColumnMajor, float,
-                      LayoutC_, arch::OpClassTensorOp, 2,
-                      arch::OpMultiplyAddFastF16> {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = float;
-  using LayoutA = layout::RowMajor;
-  using ElementB = float;
-  using LayoutB = layout::ColumnMajor;
-  using ElementC = float;
-  using LayoutC = LayoutC_;
-  using OperatorClass = arch::OpClassTensorOp;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<
-    Shape::kM / WarpShape::kM,
-    Shape::kN / WarpShape::kN,
-    Shape::kK / WarpShape::kK
-  >;
-
-  // Divisility requirements
-  static_assert(
-    !(Shape::kM % WarpShape::kM) &&
-    !(Shape::kN % WarpShape::kN),
-    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
-  );
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped access
-  static int const kAccessSizeInBits = 256;
-
-  /// Default Operator
-  using Operator = arch::OpMultiplyAdd;
-
-  // Warp thread arrangement 
-  static int const kWarpThreadArrangementContiguousA =
-      Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementA>::value);
-
-  static int const kWarpThreadArrangementStridedA =
-      kWarpSize / kWarpThreadArrangementContiguousA;
-
-  static int const kWarpThreadArrangementContiguousB =
-      Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementA>::value);
-
-  static int const kWarpThreadArrangementStridedB =
-      kWarpSize / kWarpThreadArrangementContiguousB;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA =
-      layout::RowMajorTensorOpMultiplicandCrosswise<sizeof_bits<half_t>::value,
-                                                    Shape::kK>;
-
-  // Shared memory layout
-  using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise<
-      sizeof_bits<half_t>::value, Shape::kK>;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kK, Shape::kM>, kThreads,
-      layout::PitchLinearShape<kWarpThreadArrangementContiguousA,
-                               kWarpThreadArrangementStridedA>,
-      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileIterator<
-    MatrixShape<Shape::kM, Shape::kK>, 
-    half_t, 
-    SmemLayoutA,
-    0,
-    IteratorThreadMapA
-  >;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kK, Shape::kN>, kThreads,
-      layout::PitchLinearShape<kWarpThreadArrangementContiguousB,
-                               kWarpThreadArrangementStridedB>,
-      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileIterator<
-    MatrixShape<Shape::kK, Shape::kN>, 
-    half_t, 
-    SmemLayoutB,
-    1,
-    IteratorThreadMapB
-  >;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
-      WarpShape, InstructionShape, half_t, SmemLayoutA, half_t, SmemLayoutB,
-      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
-
-  /// Policy used to define MmaPipelined 
-  using MmaPolicy = MmaPolicy<
-    MmaTensorOp,
-    MatrixShape<0, 0>,
-    MatrixShape<0, 0>,
-    WarpCount::kK
-  >;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: row-major
-///   B: row-major
-///   Operator: tensor op class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Layout of accumulator
-    typename LayoutC_>
-struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, float,
-                      layout::RowMajor, float, layout::RowMajor, float,
-                      LayoutC_, arch::OpClassTensorOp, 2,
-                      arch::OpMultiplyAddFastF16> {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = float;
-  using LayoutA = layout::RowMajor;
-  using ElementB = float;
-  using LayoutB = layout::RowMajor;
-  using ElementC = float;
-  using LayoutC = LayoutC_;
-  using OperatorClass = arch::OpClassTensorOp;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<
-    Shape::kM / WarpShape::kM,
-    Shape::kN / WarpShape::kN,
-    Shape::kK / WarpShape::kK
-  >;
-
-  // Divisility requirements
-  static_assert(
-    !(Shape::kM % WarpShape::kM) &&
-    !(Shape::kN % WarpShape::kN),
-    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
-  );
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped access
-  static int const kAccessSizeInBits = 256;
-
-  /// Default Operator
-  using Operator = arch::OpMultiplyAdd;
-
-  // Warp thread arrangement 
-  static int const kWarpThreadArrangementContiguousA =
-      Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementA>::value);
-
-  static int const kWarpThreadArrangementStridedA =
-      kWarpSize / kWarpThreadArrangementContiguousA;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise<
-      sizeof_bits<half_t>::value, Shape::kK>;
-
-  // Shared memory layout
-  using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous<
-      sizeof_bits<half_t>::value, int(128 / sizeof(half_t))>;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kK, Shape::kM>, kThreads,
-      layout::PitchLinearShape<kWarpThreadArrangementContiguousA,
-                               kWarpThreadArrangementStridedA>,
-      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileIterator<
-    MatrixShape<Shape::kM, Shape::kK>, 
-    half_t,
-    SmemLayoutA,
-    0,
-    IteratorThreadMapA
-  >;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
-    layout::PitchLinearShape<Shape::kN, Shape::kK>,
-    kThreads,
-    layout::PitchLinearShape<8, 4>,
-    kAccessSizeInBits / sizeof_bits<ElementB>::value
-  >;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileIterator<
-    MatrixShape<Shape::kK, Shape::kN>, 
-    half_t, 
-    SmemLayoutB,
-    0,
-    IteratorThreadMapB
-  >;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
-      WarpShape, InstructionShape, half_t, SmemLayoutA, half_t, SmemLayoutB,
-      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
-
-  /// Policy used to define MmaPipelined 
-  using MmaPolicy = MmaPolicy<
-    MmaTensorOp,
-    MatrixShape<0, 0>,
-    MatrixShape<0, 0>,
-    WarpCount::kK
-  >;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: column-major
-///   B: column-major
-///   Operator: tensor op class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Layout of accumulator
-    typename LayoutC_>
-struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, float,
-                      layout::ColumnMajor, float, layout::ColumnMajor, float,
-                      LayoutC_, arch::OpClassTensorOp, 2,
-                      arch::OpMultiplyAddFastF16> {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = float;
-  using LayoutA = layout::ColumnMajor;
-  using ElementB = float;
-  using LayoutB = layout::ColumnMajor;
-  using ElementC = float;
-  using LayoutC = LayoutC_;
-  using OperatorClass = arch::OpClassTensorOp;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
-                              Shape::kN / WarpShape::kN, 
-                              Shape::kK / WarpShape::kK>;
-
-  // Divisility requirements
-  static_assert(
-      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
-      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped access
-  static int const kAccessSizeInBits = 256;
-
-  /// Default Operator
-  using Operator = arch::OpMultiplyAdd; 
-
-  // Warp thread arrangement 
-  static int const kWarpThreadArrangementContiguousB =
-      Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementA>::value);
-
-  static int const kWarpThreadArrangementStridedB =
-      kWarpSize / kWarpThreadArrangementContiguousB;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous<
-      sizeof_bits<half_t>::value, int(128 / sizeof(half_t))>;
-
-  // Shared memory layout
-  using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise<
-      sizeof_bits<half_t>::value, Shape::kK>;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kM, Shape::kK>, kThreads,
-      layout::PitchLinearShape<8, 4>,
-      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileIterator<
-      MatrixShape<Shape::kM, Shape::kK>, half_t, SmemLayoutA, 1,
-      IteratorThreadMapA>;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kK, Shape::kN>, kThreads,
-      layout::PitchLinearShape<kWarpThreadArrangementContiguousB,
-                               kWarpThreadArrangementStridedB>,
-      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileIterator<
-      MatrixShape<Shape::kK, Shape::kN>, half_t, SmemLayoutB, 1,
-      IteratorThreadMapB>;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
-      WarpShape, InstructionShape, half_t, SmemLayoutA, half_t, SmemLayoutB,
-      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>, MatrixShape<0, 0>,
-                              WarpCount::kK>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: column-major-interleave
-///   B: row-major-interleave
-///   Operator: tensor op class
-///
-/// This uses the default warp-level operator given tile sizes
-///
-/// Column/RowMajorInterleved<InterleavedK>(m, n) is mapped to Column/RowMajor(m
-/// x InterleavedK, n / InterleavedK) so that Column/RowMajor global iterators
-/// can be reused. The shared store iterator is the same as the crosswise shared
-/// store iterator. So, the only thing we need to do is to swap the coordinates
-/// (contiguous <=> strided) used by the global iterator and the shared store
-/// iterator.
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Operation performed by MMA
-    typename Operator_,
-    /// Store the accumulators in row major or column major.  Row major is used
-    /// when output layout is interleaved.
-    bool AccumulatorsInRowMajor,
-    /// Number of interleaved k
-    int InterleavedK>
-struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
-                      layout::ColumnMajorInterleaved<InterleavedK>, ElementB_,
-                      layout::RowMajorInterleaved<InterleavedK>, ElementC_,
-                      LayoutC_, arch::OpClassTensorOp, 2, Operator_,
-                      AccumulatorsInRowMajor> {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = ElementA_;
-  using LayoutA = layout::ColumnMajorInterleaved<InterleavedK>;
-  using ElementB = ElementB_;
-  using LayoutB = layout::RowMajorInterleaved<InterleavedK>;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using OperatorClass = arch::OpClassTensorOp;
-  static int const kInterleavedK = InterleavedK;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
-                              Shape::kN / WarpShape::kN, 
-                              Shape::kK / WarpShape::kK>;
-
-  // Divisility requirements
-  static_assert(
-      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
-      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped access
-  static int const kAccessSizeInBits = 128;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  // Warp thread arrangement
-  static int const kElementsPerAccess =
-      kAccessSizeInBits / sizeof_bits<ElementA>::value;
-
-  static int const kWarpThreadArrangementContiguous =
-      kInterleavedK / kElementsPerAccess;
-
-  static int const kWarpThreadArrangementStrided =
-      kWarpSize / kWarpThreadArrangementContiguous;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise<
-      sizeof_bits<ElementA>::value, kInterleavedK>;
-
-  // Shared memory layout
-  using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise<
-      sizeof_bits<ElementB>::value, kInterleavedK>;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kM * kInterleavedK,
-                               Shape::kK / kInterleavedK>,
-      kThreads, layout::PitchLinearShape<32, 1>, kElementsPerAccess>;
-
-  /// Transpose the ThreadMap of iterator A
-  using SmemThreadMapA = transform::TransposePitchLinearThreadMap<
-      IteratorThreadMapA,
-      layout::PitchLinearShape<kWarpThreadArrangementContiguous,
-                               kWarpThreadArrangementStrided>>;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileIterator<
-      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 0,
-      SmemThreadMapA>;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kN * kInterleavedK,
-                               Shape::kK / kInterleavedK>,
-      kThreads, layout::PitchLinearShape<32, 1>, kElementsPerAccess>;
-
-  /// Transpose the ThreadMap of iterator A
-  using SmemThreadMapB = transform::TransposePitchLinearThreadMap<
-      IteratorThreadMapB,
-      layout::PitchLinearShape<kWarpThreadArrangementContiguous,
-                               kWarpThreadArrangementStrided>>;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileIterator<
-      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 1,
-      SmemThreadMapB>;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
-      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
-      ElementC, LayoutC, Operator, WarpCount::kK, AccumulatorsInRowMajor>::Type;
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
-                                       MatrixShape<0, 0>, WarpCount::kK>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace gemm
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_sm80.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_sm80.h
deleted file mode 100755
index ae21ee8bc..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_sm80.h
+++ /dev/null
@@ -1,2951 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Defines basic properties needed by CTA-level GEMMs assuming
-   expectations about data layout of the global memory fragments, data types,
-   and internal tile sizes.
-
-      Partial specializations for threadblock::Mma operations targeting TensorOp
-   instructions.
-
-      SM80 Multi stage kernel expects stage number to be larger or equal to 3
-   to use asyncronous copy.
-*/
-
-#pragma once
-
-#include "cutlass/array.h"
-#include "cutlass/cutlass.h"
-
-#include "cutlass/layout/tensor_op_multiplicand_sm75.h"
-#include "cutlass/layout/tensor_op_multiplicand_sm80.h"
-
-#include "cutlass/gemm/warp/mma_simt_policy.h"
-#include "cutlass/gemm/warp/mma_simt.h"
-#include "cutlass/gemm/warp/default_mma_tensor_op.h"
-#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h"
-
-#include "cutlass/gemm/threadblock/default_mma_core.h"
-#include "cutlass/gemm/threadblock/default_multistage_mma_complex_core.h"
-#include "cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h"
-
-#include "cutlass/matrix_shape.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/transform/pitch_linear_thread_map.h"
-#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h"
-#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h"
-#include "cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h"
-#include "cutlass/gemm/threadblock/mma_multistage.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for double-precision
-///
-///   A: column-major
-///   B: column-major
-///   Operator: tensor op class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Operation performed by MMA
-    typename Operator_,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB>
-struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, double,
-                      layout::ColumnMajor, double, layout::ColumnMajor, double,
-                      LayoutC_, arch::OpClassTensorOp, Stages, Operator_,
-                      false, CacheOpA, CacheOpB> {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = double;
-  using LayoutA = layout::ColumnMajor;
-  using ElementB = double;
-  using LayoutB = layout::ColumnMajor;
-  using ElementC = double;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
-                              Shape::kN / WarpShape::kN, 
-                              Shape::kK / WarpShape::kK>; 
-
-  // Divisility requirements
-  static_assert(
-      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
-      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
-
-  static_assert(WarpCount::kCount > 1,
-    "This specialization requires at least two warps.");
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped access
-  static int const kAccessSizeInBits = 64;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous64b;
-
-  using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicand64bCrosswise;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearWarpStripedThreadMap<
-      layout::PitchLinearShape<Shape::kM, Shape::kK>, kThreads,
-      layout::PitchLinearShape<16, 2>,
-      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 1,
-      IteratorThreadMapA>;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kK, Shape::kN>, kThreads,
-      layout::PitchLinearShape<16, 2>,
-      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 0,
-      IteratorThreadMapB>;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
-      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
-      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
-                                        MatrixShape<0, 0>, WarpCount::kK>;
-};
-
-/// Partial specialization for double-precision
-///
-///   A: column-major
-///   B: row-major
-///   Operator: tensor op class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Operation performed by MMA
-    typename Operator_,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB>
-struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, double,
-                      layout::ColumnMajor, double, layout::RowMajor, double,
-                      LayoutC_, arch::OpClassTensorOp, Stages, Operator_,
-                      false, CacheOpA, CacheOpB> {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = double;
-  using LayoutA = layout::ColumnMajor;
-  using ElementB = double;
-  using LayoutB = layout::RowMajor;
-  using ElementC = double;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
-                              Shape::kN / WarpShape::kN, 
-                              Shape::kK / WarpShape::kK>; 
-
-  // Divisility requirements
-  static_assert(
-      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
-      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
-
-  static_assert(WarpCount::kCount > 1,
-    "This specialization requires at least two warps.");
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped access
-  static int const kAccessSizeInBits = 64;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous64b;
-
-  // Shared memory layout
-  using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous64b;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearWarpStripedThreadMap<
-      layout::PitchLinearShape<Shape::kM, Shape::kK>, kThreads,
-      layout::PitchLinearShape<16, 2>,
-      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 1,
-      IteratorThreadMapA>;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = transform::PitchLinearWarpStripedThreadMap<
-      layout::PitchLinearShape<Shape::kN, Shape::kK>, kThreads,
-      layout::PitchLinearShape<16, 2>,
-      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 0,
-      IteratorThreadMapB>;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
-      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
-      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
-                                        MatrixShape<0, 0>, WarpCount::kK>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for double-precision
-///
-///   A: row-major
-///   B: column-major
-///   Operator: tensor op class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Operation performed by MMA
-    typename Operator_,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB>
-struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, double,
-                      layout::RowMajor, double, layout::ColumnMajor, double,
-                      LayoutC_, arch::OpClassTensorOp, Stages, Operator_,
-                      false, CacheOpA, CacheOpB> {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = double;
-  using LayoutA = layout::RowMajor;
-  using ElementB = double;
-  using LayoutB = layout::ColumnMajor;
-  using ElementC = double;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
-                              Shape::kN / WarpShape::kN, 
-                              Shape::kK / WarpShape::kK>;
-
-  // Divisility requirements
-  static_assert(
-      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
-      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped access
-  static int const kAccessSizeInBits = 64;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::RowMajorTensorOpMultiplicand64bCrosswise;
-
-  using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicand64bCrosswise;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kK, Shape::kM>, kThreads,
-      layout::PitchLinearShape<16, 2>,
-      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 1,
-      IteratorThreadMapA>;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kK, Shape::kN>, kThreads,
-      layout::PitchLinearShape<16, 2>,
-      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 0,
-      IteratorThreadMapB>;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
-      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
-      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
-                                        MatrixShape<0, 0>, WarpCount::kK>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-///
-/// Partial specialization for double-precision
-///
-///   A: row-major
-///   B: row-major
-///   Operator: tensor op class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Operation performed by MMA
-    typename Operator_,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB>
-struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, double,
-                      layout::RowMajor, double, layout::RowMajor, double,
-                      LayoutC_, arch::OpClassTensorOp, Stages, Operator_,
-                      false, CacheOpA, CacheOpB> {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = double;
-  using LayoutA = layout::RowMajor;
-  using ElementB = double;
-  using LayoutB = layout::RowMajor;
-  using ElementC = double;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
-                              Shape::kN / WarpShape::kN, 
-                              Shape::kK / WarpShape::kK>;
-
-  // Divisility requirements
-  static_assert(
-      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
-      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
-
-  static_assert(WarpCount::kCount > 1,
-    "This specialization requires at least two warps.");
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped access
-  static int const kAccessSizeInBits = 64;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::RowMajorTensorOpMultiplicand64bCrosswise;
-
-  using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous64b;
-
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kK, Shape::kM>, kThreads,
-      layout::PitchLinearShape<16, 2>,
-      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 1,
-      IteratorThreadMapA>;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = transform::PitchLinearWarpStripedThreadMap<
-      layout::PitchLinearShape<Shape::kN, Shape::kK>, kThreads,
-      layout::PitchLinearShape<16, 2>,
-      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 0,
-      IteratorThreadMapB>;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
-      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
-      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
-                                        MatrixShape<0, 0>, WarpCount::kK>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for double-precision
-///
-///   A: column-major
-///   B: column-major
-///   Operator: tensor op class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Operation performed by MMA
-    typename Operator_,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB>
-struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, double,
-                      layout::AffineRank2ColumnMajor, double, layout::AffineRank2ColumnMajor, double,
-                      LayoutC_, arch::OpClassTensorOp, Stages, Operator_,
-                      false, CacheOpA, CacheOpB> {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = double;
-  using LayoutA = layout::AffineRank2ColumnMajor;
-  using ElementB = double;
-  using LayoutB = layout::AffineRank2ColumnMajor;
-  using ElementC = double;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  using Base = DefaultMmaCore<Shape,
-                              WarpShape,
-                              InstructionShape,
-                              ElementA,
-                              layout::ColumnMajor,
-                              ElementB,
-                              layout::ColumnMajor,
-                              ElementC,
-                              LayoutC,
-                              arch::OpClassTensorOp,
-                              kStages,
-                              Operator,
-                              false,
-                              kCacheOpA,
-                              kCacheOpB>;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = typename Base::SmemLayoutA;
-  using SmemLayoutB = typename Base::SmemLayoutB;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = typename Base::IteratorThreadMapA;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = typename Base::SmemIteratorA;
-
-  /// Policy of iterator B
-  using IteratorThreadMapB = typename Base::IteratorThreadMapB;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = typename Base::SmemIteratorB;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = typename Base::MmaPolicy;
-};
-
-/// Partial specialization for double-precision
-///
-///   A: column-major
-///   B: row-major
-///   Operator: tensor op class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Operation performed by MMA
-    typename Operator_,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB>
-struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, double,
-                      layout::AffineRank2ColumnMajor, double, layout::AffineRank2RowMajor, double,
-                      LayoutC_, arch::OpClassTensorOp, Stages, Operator_,
-                      false, CacheOpA, CacheOpB> {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = double;
-  using LayoutA = layout::AffineRank2ColumnMajor;
-  using ElementB = double;
-  using LayoutB = layout::AffineRank2RowMajor;
-  using ElementC = double;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  using Base = DefaultMmaCore<Shape,
-                              WarpShape,
-                              InstructionShape,
-                              ElementA,
-                              layout::ColumnMajor,
-                              ElementB,
-                              layout::RowMajor,
-                              ElementC,
-                              LayoutC,
-                              arch::OpClassTensorOp,
-                              kStages,
-                              Operator,
-                              false,
-                              kCacheOpA,
-                              kCacheOpB>;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = typename Base::SmemLayoutA;
-  using SmemLayoutB = typename Base::SmemLayoutB;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = typename Base::IteratorThreadMapA;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = typename Base::SmemIteratorA;
-
-  /// Policy of iterator B
-  using IteratorThreadMapB = typename Base::IteratorThreadMapB;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = typename Base::SmemIteratorB;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = typename Base::MmaPolicy;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for double-precision
-///
-///   A: row-major
-///   B: column-major
-///   Operator: tensor op class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Operation performed by MMA
-    typename Operator_,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB>
-struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, double,
-                      layout::AffineRank2RowMajor, double, layout::AffineRank2ColumnMajor, double,
-                      LayoutC_, arch::OpClassTensorOp, Stages, Operator_,
-                      false, CacheOpA, CacheOpB> {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = double;
-  using LayoutA = layout::AffineRank2RowMajor;
-  using ElementB = double;
-  using LayoutB = layout::AffineRank2ColumnMajor;
-  using ElementC = double;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  using Base = DefaultMmaCore<Shape,
-                              WarpShape,
-                              InstructionShape,
-                              ElementA,
-                              layout::RowMajor,
-                              ElementB,
-                              layout::ColumnMajor,
-                              ElementC,
-                              LayoutC,
-                              arch::OpClassTensorOp,
-                              kStages,
-                              Operator,
-                              false,
-                              kCacheOpA,
-                              kCacheOpB>;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = typename Base::SmemLayoutA;
-  using SmemLayoutB = typename Base::SmemLayoutB;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = typename Base::IteratorThreadMapA;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = typename Base::SmemIteratorA;
-
-  /// Policy of iterator B
-  using IteratorThreadMapB = typename Base::IteratorThreadMapB;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = typename Base::SmemIteratorB;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = typename Base::MmaPolicy;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-///
-/// Partial specialization for double-precision
-///
-///   A: row-major
-///   B: row-major
-///   Operator: tensor op class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Operation performed by MMA
-    typename Operator_,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB>
-struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, double,
-                      layout::AffineRank2RowMajor, double, layout::AffineRank2RowMajor, double,
-                      LayoutC_, arch::OpClassTensorOp, Stages, Operator_,
-                      false, CacheOpA, CacheOpB> {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = double;
-  using LayoutA = layout::AffineRank2RowMajor;
-  using ElementB = double;
-  using LayoutB = layout::AffineRank2RowMajor;
-  using ElementC = double;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  using Base = DefaultMmaCore<Shape,
-                              WarpShape,
-                              InstructionShape,
-                              ElementA,
-                              layout::RowMajor,
-                              ElementB,
-                              layout::RowMajor,
-                              ElementC,
-                              LayoutC,
-                              arch::OpClassTensorOp,
-                              kStages,
-                              Operator,
-                              false,
-                              kCacheOpA,
-                              kCacheOpB>;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = typename Base::SmemLayoutA;
-  using SmemLayoutB = typename Base::SmemLayoutB;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = typename Base::IteratorThreadMapA;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = typename Base::SmemIteratorA;
-
-  /// Policy of iterator B
-  using IteratorThreadMapB = typename Base::IteratorThreadMapB;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = typename Base::SmemIteratorB;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = typename Base::MmaPolicy;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for float-precision
-///
-///   ElementA: complex<float>
-///   ElementB: complex<float>
-///   ElementC: complex<float>
-///   Operator: tensor op class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Layout for A operand
-    typename LayoutA_,
-    /// Layout for B operand
-    typename LayoutB_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Operation performed by MMA
-    typename Operator_,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB,
-    /// per-element transformation for elements of A
-    ComplexTransform TransformA_,
-    /// per-element transformation for elements of B
-    ComplexTransform TransformB_
-    >
-struct DefaultMmaCore<
-  Shape_, WarpShape_, GemmShape<16, 8, 8>, 
-  complex<float>, LayoutA_, 
-  complex<float>, LayoutB_, 
-  complex<float>, LayoutC_, 
-  arch::OpClassTensorOp, 
-  Stages, 
-  Operator_, 
-  false, 
-  CacheOpA, 
-  CacheOpB,
-  TransformA_, TransformB_, true> {
-
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = GemmShape<16, 8, 8>;
-  using ElementA = complex<float>;
-  using LayoutA = LayoutA_;
-  using ElementB = complex<float>;
-  using LayoutB = LayoutB_;
-  using ElementC = complex<float>;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
-  static const ComplexTransform TransformA = TransformA_;
-  static const ComplexTransform TransformB = TransformB_;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
-                              Shape::kN / WarpShape::kN, 
-                              Shape::kK / WarpShape::kK>; 
-
-  // Divisility requirements
-  static_assert(
-      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
-      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
-
-  static_assert(WarpCount::kCount > 1,
-    "This specialization requires at least two warps.");
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped access
-  static int const kAccessSizeInBits = 128;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  static_assert(
-    platform::is_same<Operator, arch::OpMultiplyAddComplex>::value ||
-    platform::is_same<Operator, arch::OpMultiplyAddGaussianComplex>::value ||
-    platform::is_same<Operator, arch::OpMultiplyAddComplexFastF32>::value,
-    "The operator tag must indicate complex multiplication.");
-
-  //
-  // Underlying template
-  //
-
-  using MmaComplexCore = DefaultMultistageMmaComplexCore<
-    Shape, WarpShape, InstructionShape,
-    ElementA, LayoutA,
-    ElementB, LayoutB,
-    ElementC, LayoutC,
-    arch::OpClassTensorOp,
-    kStages, 
-    TransformA,
-    TransformB,
-    Operator,
-    kCacheOpA,
-    kCacheOpB
-  >;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = typename MmaComplexCore::SmemLayoutA;
-
-  // Shared memory layout
-  using SmemLayoutB = typename MmaComplexCore::SmemLayoutB;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = typename MmaComplexCore::IteratorThreadMapA;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = typename MmaComplexCore::SmemIteratorA;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = typename MmaComplexCore::IteratorThreadMapB;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = typename MmaComplexCore::SmemIteratorB;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using MmaTensorOp = typename MmaComplexCore::MmaTensorOp;
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = typename MmaComplexCore::MmaPolicy;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for double-precision
-///
-///   ElementA: complex<double>
-///   ElementB: complex<double>
-///   ElementC: complex<double>
-///   Operator: tensor op class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Layout for A operand
-    typename LayoutA_,
-    /// Layout for B operand
-    typename LayoutB_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Operation performed by MMA
-    typename Operator_,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB,
-    /// per-element transformation for elements of A
-    ComplexTransform TransformA_,
-    /// per-element transformation for elements of B
-    ComplexTransform TransformB_
-    >
-struct DefaultMmaCore<
-  Shape_, WarpShape_, InstructionShape_, 
-  complex<double>, LayoutA_, 
-  complex<double>, LayoutB_, 
-  complex<double>, LayoutC_, 
-  arch::OpClassTensorOp, 
-  Stages, 
-  Operator_, 
-  false, 
-  CacheOpA, 
-  CacheOpB,
-  TransformA_, TransformB_, true> {
-
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = complex<double>;
-  using LayoutA = LayoutA_;
-  using ElementB = complex<double>;
-  using LayoutB = LayoutB_;
-  using ElementC = complex<double>;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
-  static const ComplexTransform TransformA = TransformA_;
-  static const ComplexTransform TransformB = TransformB_;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
-                              Shape::kN / WarpShape::kN, 
-                              Shape::kK / WarpShape::kK>; 
-
-  // Divisility requirements
-  static_assert(
-      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
-      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
-
-  static_assert(WarpCount::kCount > 1,
-    "This specialization requires at least two warps.");
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped access
-  static int const kAccessSizeInBits = 64;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  static_assert(
-    platform::is_same<Operator, arch::OpMultiplyAddComplex>::value ||
-    platform::is_same<Operator, arch::OpMultiplyAddGaussianComplex>::value,
-    "The operator tag must indicate complex multiplication.");
-
-  //
-  // Underlying template
-  //
-
-  using MmaComplexCore = DefaultMultistageMmaComplexCore<
-    Shape, WarpShape, InstructionShape,
-    ElementA, LayoutA,
-    ElementB, LayoutB,
-    ElementC, LayoutC,
-    arch::OpClassTensorOp,
-    kStages, 
-    TransformA,
-    TransformB,
-    Operator,
-    kCacheOpA,
-    kCacheOpB
-  >;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = typename MmaComplexCore::SmemLayoutA;
-
-  // Shared memory layout
-  using SmemLayoutB = typename MmaComplexCore::SmemLayoutB;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = typename MmaComplexCore::IteratorThreadMapA;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = typename MmaComplexCore::SmemIteratorA;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = typename MmaComplexCore::IteratorThreadMapB;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = typename MmaComplexCore::SmemIteratorB;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using MmaTensorOp = typename MmaComplexCore::MmaTensorOp;
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = typename MmaComplexCore::MmaPolicy;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: column-major
-///   B: row-major
-///   Operator: tensor op class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Operation performed by MMA
-    typename Operator_,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB>
-struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
-                      layout::ColumnMajor, ElementB_, layout::RowMajor,
-                      ElementC_, LayoutC_, arch::OpClassTensorOp, Stages,
-                      Operator_, false, CacheOpA, CacheOpB> {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = ElementA_;
-  using LayoutA = layout::ColumnMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::RowMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
-                              Shape::kN / WarpShape::kN, 
-                              Shape::kK / WarpShape::kK>;
-
-  // Divisility requirements
-  static_assert(
-      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
-      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped access
-  static int const kAccessSizeInBits = 128;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  // Warp thread arrangement
-  static int const kWarpThreadArrangementContiguousA =
-      platform::min(Shape::kM / (kAccessSizeInBits / sizeof_bits<ElementA>::value), 8);
-
-  static int const kWarpThreadArrangementStridedA =
-      kWarpSize / kWarpThreadArrangementContiguousA;
-
-  static int const kWarpThreadArrangementContiguousB =
-      platform::min(Shape::kN / (kAccessSizeInBits / sizeof_bits<ElementB>::value), 8);
-
-  static int const kWarpThreadArrangementStridedB =
-      kWarpSize / kWarpThreadArrangementContiguousB;
-
-  //
-  // Shared memory layouts
-  //
-  static int const Crosswise_A = platform::min(int(128 / sizeof(ElementA)),
-                                               Shape::kM);
-  using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous<
-      sizeof_bits<ElementA>::value, Crosswise_A>;
-
-  // Shared memory layout
-  static int const Crosswise_B = platform::min(int(128 / sizeof(ElementB)),
-                                               Shape::kN);
-  using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous<
-      sizeof_bits<ElementB>::value, Crosswise_B>;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kM, Shape::kK>, kThreads,
-      layout::PitchLinearShape<kWarpThreadArrangementContiguousA,
-                               kWarpThreadArrangementStridedA>,
-      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 1,
-      IteratorThreadMapA>;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kN, Shape::kK>, kThreads,
-      layout::PitchLinearShape<kWarpThreadArrangementContiguousB,
-                               kWarpThreadArrangementStridedB>,
-      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 0,
-      IteratorThreadMapB>;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
-      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
-      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
-                                        MatrixShape<0, 0>, WarpCount::kK>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: row-major
-///   B: column-major
-///   Operator: tensor op class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Operation performed by MMA
-    typename Operator_,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB>
-struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
-                      layout::RowMajor, ElementB_, layout::ColumnMajor,
-                      ElementC_, LayoutC_, arch::OpClassTensorOp, Stages,
-                      Operator_, false, CacheOpA, CacheOpB> {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = ElementA_;
-  using LayoutA = layout::RowMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::ColumnMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
-                              Shape::kN / WarpShape::kN, 
-                              Shape::kK / WarpShape::kK>;
-
-  // Divisility requirements
-  static_assert(
-      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
-      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped access
-  static int const kAccessSizeInBits = 128;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  // Warp thread arrangement
-  static int const kWarpThreadArrangementContiguousA =
-      Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementA>::value);
-
-  static int const kWarpThreadArrangementStridedA =
-      kWarpSize / kWarpThreadArrangementContiguousA;
-
-  static int const kWarpThreadArrangementContiguousB =
-      Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementB>::value);
-
-  static int const kWarpThreadArrangementStridedB =
-      kWarpSize / kWarpThreadArrangementContiguousB;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise<
-      sizeof_bits<ElementA>::value, Shape::kK>;
-
-  // Shared memory layout
-  using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise<
-      sizeof_bits<ElementB>::value, Shape::kK>;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kK, Shape::kM>, kThreads,
-      layout::PitchLinearShape<kWarpThreadArrangementContiguousA,
-                               kWarpThreadArrangementStridedA>,
-      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 0,
-      IteratorThreadMapA>;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kK, Shape::kN>, kThreads,
-      layout::PitchLinearShape<kWarpThreadArrangementContiguousB,
-                               kWarpThreadArrangementStridedB>,
-      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 1,
-      IteratorThreadMapB>;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
-      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
-      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
-                                        MatrixShape<0, 0>, WarpCount::kK>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: column-major
-///   B: column-major
-///   Operator: tensor op class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Operation performed by MMA
-    typename Operator_,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB>
-struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
-                      layout::ColumnMajor, ElementB_, layout::ColumnMajor,
-                      ElementC_, LayoutC_, arch::OpClassTensorOp, Stages,
-                      Operator_, false, CacheOpA, CacheOpB> {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = ElementA_;
-
-  using LayoutA = layout::ColumnMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::ColumnMajor;
-
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
-                              Shape::kN / WarpShape::kN, 
-                              Shape::kK / WarpShape::kK>;
-
-  // Divisility requirements
-  static_assert(
-      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
-      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped access
-  static int const kAccessSizeInBits = 128;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  // Warp thread arrangement
-  static int const kWarpThreadArrangementContiguousA =
-      platform::min(Shape::kM / (kAccessSizeInBits / sizeof_bits<ElementA>::value), 8);
-
-  static int const kWarpThreadArrangementStridedA =
-      kWarpSize / kWarpThreadArrangementContiguousA;
-
-  static int const kWarpThreadArrangementContiguousB =
-      Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementA>::value);
-
-  static int const kWarpThreadArrangementStridedB =
-      kWarpSize / kWarpThreadArrangementContiguousB;
-
-  //
-  // Shared memory layouts
-  //
-  static int const Crosswise_A = platform::min(int(128 / sizeof(ElementA)),
-                                               Shape::kM);
-  using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous<
-      sizeof_bits<ElementA>::value, Crosswise_A>;
-
-  // Shared memory layout
-  using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise<
-      sizeof_bits<ElementB>::value, Shape::kK>;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kM, Shape::kK>, kThreads,
-      layout::PitchLinearShape<kWarpThreadArrangementContiguousA,
-                               kWarpThreadArrangementStridedA>,
-      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 1,
-      IteratorThreadMapA>;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kK, Shape::kN>, kThreads,
-      layout::PitchLinearShape<kWarpThreadArrangementContiguousB,
-                               kWarpThreadArrangementStridedB>,
-      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 1,
-      IteratorThreadMapB>;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
-      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
-      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
-                                        MatrixShape<0, 0>, WarpCount::kK>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: row-major
-///   B: row-major
-///   Operator: tensor op class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Operation performed by MMA
-    typename Operator_,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB>
-struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
-                      layout::RowMajor, ElementB_, layout::RowMajor, ElementC_,
-                      LayoutC_, arch::OpClassTensorOp, Stages, Operator_,
-                      false, CacheOpA, CacheOpB> {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = ElementA_;
-  using LayoutA = layout::RowMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::RowMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
-                              Shape::kN / WarpShape::kN, 
-                              Shape::kK / WarpShape::kK>;
-
-  // Divisility requirements
-  static_assert(
-      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
-      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped access
-  static int const kAccessSizeInBits = 128;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  // Warp thread arrangement
-  static int const kWarpThreadArrangementContiguousA =
-      Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementA>::value);
-
-  static int const kWarpThreadArrangementStridedA =
-      kWarpSize / kWarpThreadArrangementContiguousA;
-
-  static int const kWarpThreadArrangementContiguousB =
-      platform::min(Shape::kN / (kAccessSizeInBits / sizeof_bits<ElementB>::value), 8);
-
-  static int const kWarpThreadArrangementStridedB =
-      kWarpSize / kWarpThreadArrangementContiguousB;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise<
-      sizeof_bits<ElementA>::value, Shape::kK>;
-
-  // Shared memory layout
-  static int const Crosswise_B = platform::min(int(128 / sizeof(ElementB)),
-                                               Shape::kN);
-  using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous<
-      sizeof_bits<ElementB>::value, Crosswise_B>;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kK, Shape::kM>, kThreads,
-      layout::PitchLinearShape<kWarpThreadArrangementContiguousA,
-                               kWarpThreadArrangementStridedA>,
-      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 0,
-      IteratorThreadMapA>;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kN, Shape::kK>, kThreads,
-      layout::PitchLinearShape<kWarpThreadArrangementContiguousB,
-                               kWarpThreadArrangementStridedB>,
-      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 0,
-      IteratorThreadMapB>;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
-      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
-      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
-                                        MatrixShape<0, 0>, WarpCount::kK>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: column-major-interleaved
-///   B: row-major-interleaved
-///   Operator: tensor op class
-///
-/// This uses the default warp-level operator given tile sizes
-///
-/// Column/RowMajorInterleved<InterleavedK>(m, n) is mapped to Column/RowMajor(m
-/// x InterleavedK, n / InterleavedK) so that Column/RowMajor global iterators
-/// can be reused. The shared store iterator is the same as the crosswise shared
-/// store iterator. So, the only thing we need to do is to swap the coordinates
-/// (contiguous <=> strided) used by the global iterator and the shared store
-/// iterator.
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Operation performed by MMA
-    typename Operator_,
-    /// Store the accumulators in row major or column major.  Row major is used
-    /// when output layout is interleaved.
-    bool AccumulatorsInRowMajor,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB,
-    /// Number of interleaved K
-    int InterleavedK>
-struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
-                      layout::ColumnMajorInterleaved<InterleavedK>, ElementB_,
-                      layout::RowMajorInterleaved<InterleavedK>, ElementC_,
-                      LayoutC_, arch::OpClassTensorOp, Stages, Operator_,
-                      AccumulatorsInRowMajor, CacheOpA, CacheOpB> {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = ElementA_;
-  using LayoutA = layout::ColumnMajorInterleaved<InterleavedK>;
-  using ElementB = ElementB_;
-  using LayoutB = layout::RowMajorInterleaved<InterleavedK>;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
-  static int const kInterleavedK = InterleavedK;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
-                              Shape::kN / WarpShape::kN, 
-                              Shape::kK / WarpShape::kK>; 
-
-  // Divisility requirements
-  static_assert(
-      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
-      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped access
-  static int const kAccessSizeInBits = 128;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  // Warp thread arrangement
-  static int const kElementsPerAccess =
-      kAccessSizeInBits / sizeof_bits<ElementA>::value;
-
-  static int const kWarpThreadArrangementContiguous =
-      kInterleavedK / kElementsPerAccess;
-
-  static int const kWarpThreadArrangementStrided =
-      kWarpSize / kWarpThreadArrangementContiguous;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise<
-      sizeof_bits<ElementA>::value, kInterleavedK>;
-
-  // Shared memory layout
-  using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise<
-      sizeof_bits<ElementB>::value, kInterleavedK>;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kM * kInterleavedK,
-                               Shape::kK / kInterleavedK>,
-      kThreads, layout::PitchLinearShape<32, 1>, kElementsPerAccess>;
-
-  /// Transpose the ThreadMap of iterator A
-  using SmemThreadMapA = transform::TransposePitchLinearThreadMap<
-      IteratorThreadMapA,
-      layout::PitchLinearShape<kWarpThreadArrangementContiguous,
-                               kWarpThreadArrangementStrided>>;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 0,
-      SmemThreadMapA>;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kN * kInterleavedK,
-                               Shape::kK / kInterleavedK>,
-      kThreads, layout::PitchLinearShape<32, 1>, kElementsPerAccess>;
-
-  /// Transpose the ThreadMap of iterator A
-  using SmemThreadMapB = transform::TransposePitchLinearThreadMap<
-      IteratorThreadMapB,
-      layout::PitchLinearShape<kWarpThreadArrangementContiguous,
-                               kWarpThreadArrangementStrided>>;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 1,
-      SmemThreadMapB>;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
-      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
-      ElementC, LayoutC, Operator, WarpCount::kK, AccumulatorsInRowMajor>::Type;
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
-                                        MatrixShape<0, 0>, WarpCount::kK>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for SIMT GEMMs using multistage pipeline.
-///
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Operation performed by Simt
-    typename Operator_,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB>
-struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
-                      layout::ColumnMajor, ElementB_, layout::ColumnMajor,
-                      ElementC_, LayoutC_, arch::OpClassSimt, Stages, Operator_,
-                      false, CacheOpA, CacheOpB> {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = ElementA_;
-  using LayoutA = layout::ColumnMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::ColumnMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
-                              Shape::kN / WarpShape::kN, 
-                              Shape::kK / WarpShape::kK>;
-
-  // Divisility requirements
-  static_assert(
-      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
-      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassSimt>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  // Warp thread arrangement
-  static int const kElementsPerAccess = 1;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::ColumnMajor;
-
-  // Shared memory layout
-  using SmemLayoutB = layout::RowMajor;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kM, Shape::kK>,
-    kThreads,
-    kElementsPerAccess
-  >;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 0,
-      IteratorThreadMapA>;
-
-  /// Policy of iterator B
-  using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kK, Shape::kN>,
-    kThreads,
-    kElementsPerAccess
-  >;
-
-  /// Transpose the ThreadMap of iterator B 
-  using SmemThreadMapB = transform::TransposePitchLinearThreadMapSimt<IteratorThreadMapB>;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 1,
-      SmemThreadMapB>;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level op
-  static const int WarpNumThreadsM = 4;
-  static const int WarpNumThreadsN = 8;
-  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
-      "WarpShape must be divisible by ThreadTile shape.");
-  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
-  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
-  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
-  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;
-  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;
-  static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM);
-  static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN);
-
-  static_assert(!((Shape::kK / 32) % LaneN),
-                "Padding must be divisible by Lane");
-
-  // these should have max of thread tile also
-  using LaneMmaShape = cutlass::gemm::GemmShape<
-      LaneM,
-      LaneN,
-      1>;
-  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
-      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
-      cutlass::layout::RowMajorInterleaved<LaneLayout>,         // LaneLayout
-      LaneMmaShape
-  >;
-
-  using MmaWarpSimt = cutlass::gemm::warp::MmaSimt<
-    WarpShape, /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8
-    ElementA,  /// Data type of A elements
-    SmemLayoutA,   /// Layout of A matrix (concept: MatrixLayout)
-    ElementB,  /// Data type of B elements
-    SmemLayoutB,   /// Layout of B matrix (concept: MatrixLayout)
-    ElementC,  /// Element type of C matrix
-    LayoutC,   /// Layout of C matrix (concept: MatrixLayout)
-    Policy     /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
-    >;         /// Used for partial specialization
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = MmaPolicy<
-    MmaWarpSimt,
-    MatrixShape<0, 0>,
-    MatrixShape<0, Shape::kK / 32>,
-    WarpCount::kK>;
-};
-
-/// Partial specialization for SIMT GEMMs using multistage pipeline.
-///
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Operation performed by Simt
-    typename Operator_,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB>
-struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
-                      layout::ColumnMajor, ElementB_, layout::RowMajor,
-                      ElementC_, LayoutC_, arch::OpClassSimt, Stages, Operator_,
-                      false, CacheOpA, CacheOpB> {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = ElementA_;
-  using LayoutA = layout::ColumnMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::RowMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
-                              Shape::kN / WarpShape::kN, 
-                              Shape::kK / WarpShape::kK>;
-
-  // Divisility requirements
-  static_assert(
-      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
-      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassSimt>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  // Warp thread arrangement
-  static int const kElementsPerAccess = 1;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::ColumnMajor;
-
-  // Shared memory layout
-  using SmemLayoutB = layout::RowMajor;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kM, Shape::kK>,
-    kThreads,
-    kElementsPerAccess
-  >;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 0,
-      IteratorThreadMapA>;
-
-  /// Policy of iterator B
-  using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kN, Shape::kK>,
-    kThreads,
-    kElementsPerAccess
-  >;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 1,
-      IteratorThreadMapB>;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level op
-  static const int WarpNumThreadsM = 4;
-  static const int WarpNumThreadsN = 8;
-  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
-      "WarpShape must be divisible by ThreadTile shape.");
-  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
-  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
-  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
-  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;
-  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;
-  static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM);
-  static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN);
-  // these should have max of thread tile also
-  using LaneMmaShape = cutlass::gemm::GemmShape<
-      LaneM,
-      LaneN,
-      1>;
-  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
-      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
-      cutlass::layout::RowMajorInterleaved<LaneLayout>,         // LaneLayout
-      LaneMmaShape
-  >;
-
-  using MmaWarpSimt = cutlass::gemm::warp::MmaSimt<
-    WarpShape, /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8
-    ElementA,  /// Data type of A elements
-    SmemLayoutA,   /// Layout of A matrix (concept: MatrixLayout)
-    ElementB,  /// Data type of B elements
-    SmemLayoutB,   /// Layout of B matrix (concept: MatrixLayout)
-    ElementC,  /// Element type of C matrix
-    LayoutC,   /// Layout of C matrix (concept: MatrixLayout)
-    Policy     /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
-    >;         /// Used for partial specialization
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = MmaPolicy<
-    MmaWarpSimt,
-    MatrixShape<0, 0>,
-    MatrixShape<0, 0>,
-    WarpCount::kK>;
-};
-
-/// Partial specialization for SIMT GEMMs using multistage pipeline.
-///
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Operation performed by Simt
-    typename Operator_,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB>
-struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
-                      layout::RowMajor, ElementB_, layout::ColumnMajor,
-                      ElementC_, LayoutC_, arch::OpClassSimt, Stages, Operator_,
-                      false, CacheOpA, CacheOpB> {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = ElementA_;
-  using LayoutA = layout::RowMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::ColumnMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
-                              Shape::kN / WarpShape::kN, 
-                              Shape::kK / WarpShape::kK>;
-
-  // Divisility requirements
-  static_assert(
-      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
-      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassSimt>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  // Warp thread arrangement
-  static int const kElementsPerAccess = 1;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::ColumnMajor;
-
-  // Shared memory layout
-  using SmemLayoutB = layout::RowMajor;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kK, Shape::kM>,
-    kThreads,
-    kElementsPerAccess
-  >;
-
-  /// Transpose the ThreadMap of iterator A
-  using SmemThreadMapA = transform::TransposePitchLinearThreadMapSimt<IteratorThreadMapA>;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 0,
-      SmemThreadMapA>;
-
-  /// Policy of iterator B
-  using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kK, Shape::kN>,
-    kThreads,
-    kElementsPerAccess
-  >;
-
-  /// Transpose the ThreadMap of iterator B 
-  using SmemThreadMapB = transform::TransposePitchLinearThreadMapSimt<IteratorThreadMapB>;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 1,
-      SmemThreadMapB>;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level op
-  static const int WarpNumThreadsM = 4;
-  static const int WarpNumThreadsN = 8;
-  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
-      "WarpShape must be divisible by ThreadTile shape.");
-  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
-  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
-  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
-  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;
-  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;
-  static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM);
-  static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN);
-
-  static_assert(!((Shape::kK / 32) % LaneM) && !((Shape::kK / 32) % LaneN),
-                "Padding must be divisible by Lane");
-
-  // these should have max of thread tile also
-  using LaneMmaShape = cutlass::gemm::GemmShape<
-      LaneM,
-      LaneN,
-      1>;
-  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
-      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
-      cutlass::layout::RowMajorInterleaved<LaneLayout>,         // LaneLayout
-      LaneMmaShape
-  >;
-
-  using MmaWarpSimt = cutlass::gemm::warp::MmaSimt<
-    WarpShape, /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8
-    ElementA,  /// Data type of A elements
-    SmemLayoutA,   /// Layout of A matrix (concept: MatrixLayout)
-    ElementB,  /// Data type of B elements
-    SmemLayoutB,   /// Layout of B matrix (concept: MatrixLayout)
-    ElementC,  /// Element type of C matrix
-    LayoutC,   /// Layout of C matrix (concept: MatrixLayout)
-    Policy     /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
-    >;         /// Used for partial specialization
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = MmaPolicy<
-    MmaWarpSimt,
-    MatrixShape<Shape::kK / 32, 0>,
-    MatrixShape<0, Shape::kK / 32>,
-    WarpCount::kK>;
-};
-
-/// Partial specialization for SIMT GEMMs using multistage pipeline.
-///
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Operation performed by Simt
-    typename Operator_,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB>
-struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
-                      layout::RowMajor, ElementB_, layout::RowMajor, ElementC_,
-                      LayoutC_, arch::OpClassSimt, Stages, Operator_,
-                      false, CacheOpA, CacheOpB> {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = ElementA_;
-  using LayoutA = layout::RowMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::RowMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
-                              Shape::kN / WarpShape::kN, 
-                              Shape::kK / WarpShape::kK>;
-
-  // Divisility requirements
-  static_assert(
-      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
-      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassSimt>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  // Warp thread arrangement
-  static int const kElementsPerAccess = 1;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::ColumnMajor;
-
-  // Shared memory layout
-  using SmemLayoutB = layout::RowMajor;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kK, Shape::kM>,
-    kThreads,
-    kElementsPerAccess
-  >;
-
-  /// Transpose the ThreadMap of iterator A
-  using SmemThreadMapA = transform::TransposePitchLinearThreadMapSimt<IteratorThreadMapA>;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 0,
-      SmemThreadMapA>;
-
-  /// Policy of iterator B
-  using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kN, Shape::kK>,
-    kThreads,
-    kElementsPerAccess
-  >;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 1,
-      IteratorThreadMapB>;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level op
-  static const int WarpNumThreadsM = 4;
-  static const int WarpNumThreadsN = 8;
-  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
-      "WarpShape must be divisible by ThreadTile shape.");
-  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
-  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
-  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
-  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;
-  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;
-  static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM);
-  static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN);
-
-  static_assert(!((Shape::kK / 32) % LaneM),
-                "Padding must be divisible by Lane");
-
-  // these should have max of thread tile also
-  using LaneMmaShape = cutlass::gemm::GemmShape<
-      LaneM,
-      LaneN,
-      1>;
-  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
-      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
-      cutlass::layout::RowMajorInterleaved<LaneLayout>,         // LaneLayout
-      LaneMmaShape
-  >;
-
-  using MmaWarpSimt = cutlass::gemm::warp::MmaSimt<
-    WarpShape, /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8
-    ElementA,  /// Data type of A elements
-    SmemLayoutA,   /// Layout of A matrix (concept: MatrixLayout)
-    ElementB,  /// Data type of B elements
-    SmemLayoutB,   /// Layout of B matrix (concept: MatrixLayout)
-    ElementC,  /// Element type of C matrix
-    LayoutC,   /// Layout of C matrix (concept: MatrixLayout)
-    Policy     /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
-    >;         /// Used for partial specialization
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = MmaPolicy<
-    MmaWarpSimt,
-    MatrixShape<Shape::kK / 32, 0>,
-    MatrixShape<0, 0>,
-    WarpCount::kK>;
-};
-
-/// Partial specialization for SIMT GEMMs using multistage pipeline.
-///
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Operation performed by Simt
-    typename Operator_,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB>
-struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
-                      layout::AffineRank2ColumnMajor, ElementB_, layout::AffineRank2RowMajor,
-                      ElementC_, LayoutC_, arch::OpClassSimt, Stages, Operator_,
-                      false, CacheOpA, CacheOpB> {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = ElementA_;
-  using LayoutA = layout::AffineRank2ColumnMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::AffineRank2RowMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  using Base = DefaultMmaCore<Shape,
-                              WarpShape,
-                              InstructionShape,
-                              ElementA,
-                              layout::ColumnMajor,
-                              ElementB,
-                              layout::RowMajor,
-                              ElementC,
-                              LayoutC,
-                              arch::OpClassSimt,
-                              kStages,
-                              Operator,
-                              false,
-                              kCacheOpA,
-                              kCacheOpB>;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = typename Base::SmemLayoutA;
-  using SmemLayoutB = typename Base::SmemLayoutB;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = typename Base::IteratorThreadMapA;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = typename Base::SmemIteratorA;
-
-  /// Policy of iterator B
-  using IteratorThreadMapB = typename Base::IteratorThreadMapB;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = typename Base::SmemIteratorB;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = typename Base::MmaPolicy;
-};
-
-/// Partial specialization for SIMT GEMMs using multistage pipeline.
-///
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Operation performed by Simt
-    typename Operator_,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB>
-struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
-                      layout::AffineRank2RowMajor, ElementB_, layout::AffineRank2ColumnMajor,
-                      ElementC_, LayoutC_, arch::OpClassSimt, Stages, Operator_,
-                      false, CacheOpA, CacheOpB> {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = ElementA_;
-  using LayoutA = layout::AffineRank2RowMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::AffineRank2ColumnMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  using Base = DefaultMmaCore<Shape,
-                              WarpShape,
-                              InstructionShape,
-                              ElementA,
-                              layout::RowMajor,
-                              ElementB,
-                              layout::ColumnMajor,
-                              ElementC,
-                              LayoutC,
-                              arch::OpClassSimt,
-                              kStages,
-                              Operator,
-                              false,
-                              kCacheOpA,
-                              kCacheOpB>;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = typename Base::SmemLayoutA;
-  using SmemLayoutB = typename Base::SmemLayoutB;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = typename Base::IteratorThreadMapA;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = typename Base::SmemIteratorA;
-
-  /// Policy of iterator B
-  using IteratorThreadMapB = typename Base::IteratorThreadMapB;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = typename Base::SmemIteratorB;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = typename Base::MmaPolicy;
-};
-
-/// Partial specialization for SIMT GEMMs using multistage pipeline.
-///
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Operation performed by Simt
-    typename Operator_,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB>
-struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
-                      layout::AffineRank2ColumnMajor, ElementB_, layout::AffineRank2ColumnMajor,
-                      ElementC_, LayoutC_, arch::OpClassSimt, Stages, Operator_,
-                      false, CacheOpA, CacheOpB> {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = ElementA_;
-  using LayoutA = layout::AffineRank2ColumnMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::AffineRank2ColumnMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  using Base = DefaultMmaCore<Shape,
-                              WarpShape,
-                              InstructionShape,
-                              ElementA,
-                              layout::ColumnMajor,
-                              ElementB,
-                              layout::ColumnMajor,
-                              ElementC,
-                              LayoutC,
-                              arch::OpClassSimt,
-                              kStages,
-                              Operator,
-                              false,
-                              kCacheOpA,
-                              kCacheOpB>;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = typename Base::SmemLayoutA;
-  using SmemLayoutB = typename Base::SmemLayoutB;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = typename Base::IteratorThreadMapA;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = typename Base::SmemIteratorA;
-
-  /// Policy of iterator B
-  using IteratorThreadMapB = typename Base::IteratorThreadMapB;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = typename Base::SmemIteratorB;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = typename Base::MmaPolicy;
-
-};
-
-/// Partial specialization for SIMT GEMMs using multistage pipeline.
-///
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Operation performed by Simt
-    typename Operator_,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB>
-struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
-                      layout::AffineRank2RowMajor, ElementB_, layout::AffineRank2RowMajor, ElementC_,
-                      LayoutC_, arch::OpClassSimt, Stages, Operator_,
-                      false, CacheOpA, CacheOpB> {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = ElementA_;
-  using LayoutA = layout::AffineRank2RowMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::AffineRank2RowMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  using Base = DefaultMmaCore<Shape,
-                              WarpShape,
-                              InstructionShape,
-                              ElementA,
-                              layout::RowMajor,
-                              ElementB,
-                              layout::RowMajor,
-                              ElementC,
-                              LayoutC,
-                              arch::OpClassSimt,
-                              kStages,
-                              Operator,
-                              false,
-                              kCacheOpA,
-                              kCacheOpB>;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = typename Base::SmemLayoutA;
-  using SmemLayoutB = typename Base::SmemLayoutB;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = typename Base::IteratorThreadMapA;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = typename Base::SmemIteratorA;
-
-  /// Policy of iterator B
-  using IteratorThreadMapB = typename Base::IteratorThreadMapB;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = typename Base::SmemIteratorB;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = typename Base::MmaPolicy;
-
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace gemm
-}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_sparse_sm80.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_sparse_sm80.h
deleted file mode 100755
index 985693ce6..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_sparse_sm80.h
+++ /dev/null
@@ -1,876 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Defines basic properties needed by CTA-level GEMMs assuming
-   expectations about data layout of the global memory fragments, data types,
-   and internal tile sizes.
-
-      Partial specializations for threadblock::Mma operations targeting sparse
-   TensorOp instructions.
-*/
-
-#pragma once
-
-#include "cutlass/array.h"
-#include "cutlass/cutlass.h"
-
-#include "cutlass/layout/tensor_op_multiplicand_sm75.h"
-#include "cutlass/layout/tensor_op_multiplicand_sm80.h"
-
-#include "cutlass/gemm/warp/mma_simt_policy.h"
-#include "cutlass/gemm/warp/mma_simt.h"
-#include "cutlass/gemm/warp/default_mma_sparse_tensor_op.h"
-#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h"
-
-#include "cutlass/gemm/threadblock/default_mma_core.h"
-
-#include "cutlass/matrix_shape.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/transform/pitch_linear_thread_map.h"
-#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h"
-#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h"
-#include "cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h"
-#include "cutlass/gemm/threadblock/mma_sparse_multistage.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Template defininng default matrix multiply operators inferred from threadblock tile size,
-/// global memory data layout, and target math instruction.
-template <
-    /// Shape of threadblock-scoped matrix multiply operator
-    typename Shape,
-    /// Shape of warp-level matrix multiply operator
-    typename WarpShape,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape,
-    /// Element data type of A operand
-    typename ElementA,
-    /// Layout of operand A
-    typename LayoutA,
-    /// Element data type of B operand
-    typename ElementB,
-    /// Layout of operand B
-    typename LayoutB,
-    /// Data type of accumulator
-    typename ElementC,
-    /// Layout of accumulator
-    typename LayoutC,
-    /// Indicates type of math operator (arch::OpClassSimt or arch::OpClassTensorOp)
-    typename OperatorClass,
-    /// Number of stages
-    int Stages,
-    /// Operation performed by MMA
-    typename Operator = typename platform::conditional<
-        (platform::is_same<OperatorClass,
-                           cutlass::arch::OpClassTensorOp>::value) &&
-            (platform::is_same<ElementA, int8_t>::value ||
-             platform::is_same<ElementA, int4b_t>::value ||
-             platform::is_same<ElementA, uint8_t>::value ||
-             platform::is_same<ElementA, uint4b_t>::value),
-        cutlass::arch::OpMultiplyAddSaturate,
-        cutlass::arch::OpMultiplyAdd>::type,
-    /// Store the accumulators in row major or column major.  Row major is used
-    /// when output layout is interleaved.
-    bool AccumulatorsInRowMajor = false
-    /// Cache operation of operand A
-    , cutlass::arch::CacheOperation::Kind CacheOpA =
-        cutlass::arch::CacheOperation::Global,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB =
-        cutlass::arch::CacheOperation::Global
->
-struct DefaultSparseMmaCore;
-
-////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: column-major
-///   B: row-major
-///   Operator: tensor op class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Operation performed by MMA
-    typename Operator_,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB>
-struct DefaultSparseMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
-                      layout::ColumnMajor, ElementB_, layout::RowMajor,
-                      ElementC_, LayoutC_, arch::OpClassTensorOp, Stages,
-                      Operator_, false, CacheOpA, CacheOpB> {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = ElementA_;
-  using LayoutA = layout::ColumnMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::RowMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
-
-  static int const kSparse = 2;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
-                              Shape::kN / WarpShape::kN, 
-                              Shape::kK / WarpShape::kK>;
-
-  // Divisility requirements
-  static_assert(
-      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
-      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
- 
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped access
-  static int const kAccessSizeInBits = 128;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  // Warp thread arrangement
-  static int const kWarpThreadArrangementContiguousA =
-      platform::min(Shape::kM / (kAccessSizeInBits / sizeof_bits<ElementA>::value), 8);
-
-  static int const kWarpThreadArrangementStridedA =
-      kWarpSize / kWarpThreadArrangementContiguousA;
-
-  static int const kWarpThreadArrangementContiguousB =
-      platform::min(Shape::kN / (kAccessSizeInBits / sizeof_bits<ElementB>::value), 8);
-
-  static int const kWarpThreadArrangementStridedB =
-      kWarpSize / kWarpThreadArrangementContiguousB;
-
-  //
-  // Shared memory layouts
-  //
-  static int const Crosswise_A = platform::min(int(128 / sizeof(ElementA)),
-                                               Shape::kM);
-
-  using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous<
-      sizeof_bits<ElementA>::value, Crosswise_A>;
-
-  // Shared memory layout
-  static int const Crosswise_B = platform::min(int(128 / sizeof(ElementB)),
-                                               Shape::kN);
-
-  using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous<
-      sizeof_bits<ElementB>::value, Crosswise_B>;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kM, Shape::kK / kSparse>, kThreads,
-      layout::PitchLinearShape<kWarpThreadArrangementContiguousA,
-                               kWarpThreadArrangementStridedA>,
-      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kM, Shape::kK / kSparse>, ElementA, SmemLayoutA, 1,
-      IteratorThreadMapA>;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kN, Shape::kK>, kThreads,
-      layout::PitchLinearShape<kWarpThreadArrangementContiguousB,
-                               kWarpThreadArrangementStridedB>,
-      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 0,
-      IteratorThreadMapB>;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using MmaTensorOp = typename cutlass::gemm::warp::DefaultSparseMmaTensorOp<
-      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
-      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
-
-  /// Cache operation of operand E
-  static cutlass::arch::CacheOperation::Kind const kCacheOpE =
-      cutlass::arch::CacheOperation::Global;
-
-  static int const kInterleavedE = MmaTensorOp::kInterleaved;
-  static int const kMetaSizeInBits = MmaTensorOp::kMetaSizeInBits;
-  static int const kMaxID2 = MmaTensorOp::kMaxID2;
-  static int const kElementsPerElementE = MmaTensorOp::kElementsPerElementE;
-
-  using ElementE = typename MmaTensorOp::ElementE;
-  using GmemLayoutE = cutlass::layout::ColumnMajorInterleaved<kInterleavedE>;
-
-  // Shared memory layout.  Interleaved layout is mapped to PitchLinear layout.
-  using SmemLayoutE = typename MmaTensorOp::LayoutE;
-
-  /// ThreadMap of iterator E
-  static int const kElementsPerAccessE =
-      kAccessSizeInBits / sizeof_bits<ElementE>::value;
-
-  /// E is tiny.  Not all warps are needed.
-  static int const kThreadsE =
-      (Shape::kM * Shape::kK / kSparse / kElementsPerElementE /
-           (kAccessSizeInBits / sizeof_bits<ElementE>::value) >
-       kThreads)
-          ? kThreads
-          : (Shape::kM * Shape::kK / kSparse / kElementsPerElementE /
-             (kAccessSizeInBits / sizeof_bits<ElementE>::value));
-
-  using IteratorThreadMapE = transform::PitchLinearStripminedThreadMap<
-      layout::PitchLinearShape<Shape::kM * kInterleavedE,
-                               Shape::kK / kSparse / kElementsPerElementE /
-                                   kInterleavedE>,
-      kThreadsE, kElementsPerAccessE>;
-
-  /// Shared memory iterator to E operand
-  using SmemIteratorE = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kM * kInterleavedE,
-                  Shape::kK / kSparse / kElementsPerElementE / kInterleavedE>,
-      ElementE, SmemLayoutE, 0, IteratorThreadMapE>;
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy =
-      SparseMmaPolicy<MmaTensorOp, MatrixShape<0, 0>, MatrixShape<0, 0>,
-                      MatrixShape<0, 0>, WarpCount::kK>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: row-major
-///   B: column-major
-///   Operator: tensor op class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Operation performed by MMA
-    typename Operator_,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB>
-struct DefaultSparseMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
-                      layout::RowMajor, ElementB_, layout::ColumnMajor,
-                      ElementC_, LayoutC_, arch::OpClassTensorOp, Stages,
-                      Operator_, false, CacheOpA, CacheOpB> {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = ElementA_;
-  using LayoutA = layout::RowMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::ColumnMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
-
-  static int const kSparse = 2;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
-                              Shape::kN / WarpShape::kN, 
-                              Shape::kK / WarpShape::kK>;
-
-  // Divisility requirements
-  static_assert(
-      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
-      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped access
-  static int const kAccessSizeInBits = 128;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  // Warp thread arrangement
-  static int const kWarpThreadArrangementContiguousA =
-      Shape::kK / kSparse / (kAccessSizeInBits / sizeof_bits<ElementA>::value);
-
-  static int const kWarpThreadArrangementStridedA =
-      kWarpSize / kWarpThreadArrangementContiguousA;
-
-  // crosswise cannot be larger than 1024 bit.
-  static int const kCrosswiseB =
-      (Shape::kK > (1024 / sizeof_bits<ElementB>::value))
-          ? (1024 / sizeof_bits<ElementB>::value)
-          : Shape::kK;
-
-  static int const kWarpThreadArrangementContiguousB =
-      kCrosswiseB / (kAccessSizeInBits / sizeof_bits<ElementB>::value);
-
-  static int const kWarpThreadArrangementStridedB =
-      kWarpSize / kWarpThreadArrangementContiguousB;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise<
-      sizeof_bits<ElementA>::value, Shape::kK / kSparse>;
-
-  // Shared memory layout
-  using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise<
-      sizeof_bits<ElementB>::value, kCrosswiseB>;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kK / kSparse, Shape::kM>, kThreads,
-      layout::PitchLinearShape<kWarpThreadArrangementContiguousA,
-                               kWarpThreadArrangementStridedA>,
-      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kM, Shape::kK / kSparse>, ElementA, SmemLayoutA, 0,
-      IteratorThreadMapA>;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kK, Shape::kN>, kThreads,
-      layout::PitchLinearShape<kWarpThreadArrangementContiguousB,
-                               kWarpThreadArrangementStridedB>,
-      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 1,
-      IteratorThreadMapB>;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using MmaTensorOp = typename cutlass::gemm::warp::DefaultSparseMmaTensorOp<
-      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
-      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
-
-  /// Cache operation of operand E
-  static cutlass::arch::CacheOperation::Kind const kCacheOpE =
-      cutlass::arch::CacheOperation::Global;
-
-  static int const kInterleavedE = MmaTensorOp::kInterleaved;
-  static int const kMetaSizeInBits = MmaTensorOp::kMetaSizeInBits;
-  static int const kMaxID2 = MmaTensorOp::kMaxID2;
-  static int const kElementsPerElementE = MmaTensorOp::kElementsPerElementE;
-
-  using ElementE = typename MmaTensorOp::ElementE;
-  using GmemLayoutE = cutlass::layout::ColumnMajorInterleaved<kInterleavedE>;
-
-  // Shared memory layout.  Interleaved layout is mapped to PitchLinear layout.
-  using SmemLayoutE = typename MmaTensorOp::LayoutE;
-
-  /// ThreadMap of iterator E
-  static int const kElementsPerAccessE =
-      kAccessSizeInBits / sizeof_bits<ElementE>::value;
-
-  /// E is tiny.  Not all warps are needed.
-  static int const kThreadsE =
-      (Shape::kM * Shape::kK / kSparse / kElementsPerElementE /
-           (kAccessSizeInBits / sizeof_bits<ElementE>::value) >
-       kThreads)
-          ? kThreads
-          : (Shape::kM * Shape::kK / kSparse / kElementsPerElementE /
-             (kAccessSizeInBits / sizeof_bits<ElementE>::value));
-
-  using IteratorThreadMapE = transform::PitchLinearStripminedThreadMap<
-      layout::PitchLinearShape<Shape::kM * kInterleavedE,
-                               Shape::kK / kSparse / kElementsPerElementE /
-                                   kInterleavedE>,
-      kThreadsE, kElementsPerAccessE>;
-
-
-  /// Shared memory iterator to E operand
-  using SmemIteratorE = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kM * kInterleavedE,
-                  Shape::kK / kSparse / kElementsPerElementE / kInterleavedE>,
-      ElementE, SmemLayoutE, 0, IteratorThreadMapE>;
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy =
-      SparseMmaPolicy<MmaTensorOp, MatrixShape<0, 0>, MatrixShape<0, 0>,
-                      MatrixShape<0, 0>, WarpCount::kK>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: column-major
-///   B: column-major
-///   Operator: tensor op class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Operation performed by MMA
-    typename Operator_,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB>
-struct DefaultSparseMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
-                      layout::ColumnMajor, ElementB_, layout::ColumnMajor,
-                      ElementC_, LayoutC_, arch::OpClassTensorOp, Stages,
-                      Operator_, false, CacheOpA, CacheOpB> {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = ElementA_;
-
-  using LayoutA = layout::ColumnMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::ColumnMajor;
-
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
-
-  static int const kSparse = 2;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
-                              Shape::kN / WarpShape::kN, 
-                              Shape::kK / WarpShape::kK>;
-
-  // Divisility requirements
-  static_assert(
-      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
-      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped access
-  static int const kAccessSizeInBits = 128;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  // Warp thread arrangement
-  static int const Crosswise_A = platform::min(int(128 / sizeof(ElementA)),
-                                               Shape::kM);
-
-  static int const kWarpThreadArrangementContiguousA =
-      platform::min(Shape::kM / (kAccessSizeInBits / sizeof_bits<ElementA>::value), 8);
-
-  static int const kWarpThreadArrangementStridedA =
-      kWarpSize / kWarpThreadArrangementContiguousA;
-
-  // Warp thread arrangement
-  // crosswise cannot be larger than 1024 bit.
-  static int const kCrosswiseB =
-      (Shape::kK > (1024 / sizeof_bits<ElementB>::value))
-          ? (1024 / sizeof_bits<ElementB>::value)
-          : Shape::kK;
-
-  static int const kWarpThreadArrangementContiguousB =
-      kCrosswiseB / (kAccessSizeInBits / sizeof_bits<ElementB>::value);
-
-  static int const kWarpThreadArrangementStridedB =
-      kWarpSize / kWarpThreadArrangementContiguousB;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous<
-      sizeof_bits<ElementA>::value, Crosswise_A>;
-
-  // Shared memory layout
-  using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise<
-      sizeof_bits<ElementB>::value, kCrosswiseB>;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kM, Shape::kK / kSparse>, kThreads,
-      layout::PitchLinearShape<kWarpThreadArrangementContiguousA,
-                               kWarpThreadArrangementStridedA>,
-      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kM, Shape::kK / kSparse>, ElementA, SmemLayoutA, 1,
-      IteratorThreadMapA>;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kK, Shape::kN>, kThreads,
-      layout::PitchLinearShape<kWarpThreadArrangementContiguousB,
-                               kWarpThreadArrangementStridedB>,
-      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 1,
-      IteratorThreadMapB>;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using MmaTensorOp = typename cutlass::gemm::warp::DefaultSparseMmaTensorOp<
-      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
-      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
-
-  /// Cache operation of operand E
-  static cutlass::arch::CacheOperation::Kind const kCacheOpE =
-      cutlass::arch::CacheOperation::Global;
-
-  static int const kInterleavedE = MmaTensorOp::kInterleaved;
-  static int const kMetaSizeInBits = MmaTensorOp::kMetaSizeInBits;
-  static int const kMaxID2 = MmaTensorOp::kMaxID2;
-  static int const kElementsPerElementE = MmaTensorOp::kElementsPerElementE;
-
-  using ElementE = typename MmaTensorOp::ElementE;
-  using GmemLayoutE = cutlass::layout::ColumnMajorInterleaved<kInterleavedE>;
-
-  // Shared memory layout.  Interleaved layout is mapped to PitchLinear layout.
-  using SmemLayoutE = typename MmaTensorOp::LayoutE;
-
-  /// ThreadMap of iterator E
-  static int const kElementsPerAccessE =
-      kAccessSizeInBits / sizeof_bits<ElementE>::value;
-
-  /// E is tiny.  Not all warps are needed.
-  static int const kThreadsE =
-      (Shape::kM * Shape::kK / kSparse / kElementsPerElementE /
-           (kAccessSizeInBits / sizeof_bits<ElementE>::value) >
-       kThreads)
-          ? kThreads
-          : (Shape::kM * Shape::kK / kSparse / kElementsPerElementE /
-             (kAccessSizeInBits / sizeof_bits<ElementE>::value));
-
-  using IteratorThreadMapE = transform::PitchLinearStripminedThreadMap<
-      layout::PitchLinearShape<Shape::kM * kInterleavedE,
-                               Shape::kK / kSparse / kElementsPerElementE /
-                                   kInterleavedE>,
-      kThreadsE, kElementsPerAccessE>;
-
-  /// Shared memory iterator to E operand
-  using SmemIteratorE = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kM * kInterleavedE,
-                  Shape::kK / kSparse / kElementsPerElementE / kInterleavedE>,
-      ElementE, SmemLayoutE, 0, IteratorThreadMapE>;
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy =
-      SparseMmaPolicy<MmaTensorOp, MatrixShape<0, 0>, MatrixShape<0, 0>,
-                      MatrixShape<0, 0>, WarpCount::kK>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: row-major
-///   B: row-major
-///   Operator: tensor op class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Operation performed by MMA
-    typename Operator_,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB>
-struct DefaultSparseMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
-                      layout::RowMajor, ElementB_, layout::RowMajor, ElementC_,
-                      LayoutC_, arch::OpClassTensorOp, Stages, Operator_,
-                      false, CacheOpA, CacheOpB> {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = ElementA_;
-  using LayoutA = layout::RowMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::RowMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
-
-  static int const kSparse = 2;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
-                              Shape::kN / WarpShape::kN, 
-                              Shape::kK / WarpShape::kK>;
-
-  // Divisility requirements
-  static_assert(
-      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
-      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped access
-  static int const kAccessSizeInBits = 128;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  // Warp thread arrangement
-  static int const kWarpThreadArrangementContiguousA =
-      Shape::kK / kSparse / (kAccessSizeInBits / sizeof_bits<ElementA>::value);
-
-  static int const kWarpThreadArrangementStridedA =
-      kWarpSize / kWarpThreadArrangementContiguousA;
-
-  static int const kWarpThreadArrangementContiguousB =
-      platform::min(Shape::kN / (kAccessSizeInBits / sizeof_bits<ElementB>::value), 8);
-
-  static int const kWarpThreadArrangementStridedB =
-      kWarpSize / kWarpThreadArrangementContiguousB;
-
-  static int const Crosswise_B = platform::min(int(128 / sizeof(ElementB)),
-                                               Shape::kN);
-
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise<
-      sizeof_bits<ElementA>::value, Shape::kK / kSparse>;
-
-  // Shared memory layout
-  using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous<
-      sizeof_bits<ElementB>::value, Crosswise_B>;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kK / kSparse, Shape::kM>, kThreads,
-      layout::PitchLinearShape<kWarpThreadArrangementContiguousA,
-                               kWarpThreadArrangementStridedA>,
-      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kM, Shape::kK / kSparse>, ElementA, SmemLayoutA, 0,
-      IteratorThreadMapA>;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kN, Shape::kK>, kThreads,
-      layout::PitchLinearShape<kWarpThreadArrangementContiguousB,
-                               kWarpThreadArrangementStridedB>,
-      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 0,
-      IteratorThreadMapB>;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using MmaTensorOp = typename cutlass::gemm::warp::DefaultSparseMmaTensorOp<
-      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
-      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
-
-  /// Cache operation of operand E
-  static cutlass::arch::CacheOperation::Kind const kCacheOpE =
-      cutlass::arch::CacheOperation::Global;
-
-  static int const kInterleavedE = MmaTensorOp::kInterleaved;
-  static int const kMetaSizeInBits = MmaTensorOp::kMetaSizeInBits;
-  static int const kMaxID2 = MmaTensorOp::kMaxID2;
-  static int const kElementsPerElementE = MmaTensorOp::kElementsPerElementE;
-
-  using ElementE = typename MmaTensorOp::ElementE;
-  using GmemLayoutE = cutlass::layout::ColumnMajorInterleaved<kInterleavedE>;
-
-  // Shared memory layout.  Interleaved layout is mapped to PitchLinear layout.
-  using SmemLayoutE = typename MmaTensorOp::LayoutE;
-
-  /// ThreadMap of iterator E
-  static int const kElementsPerAccessE =
-      kAccessSizeInBits / sizeof_bits<ElementE>::value;
-
-  /// E is tiny.  Not all warps are needed.
-  static int const kThreadsE =
-      (Shape::kM * Shape::kK / kSparse / kElementsPerElementE /
-           (kAccessSizeInBits / sizeof_bits<ElementE>::value) >
-       kThreads)
-          ? kThreads
-          : (Shape::kM * Shape::kK / kSparse / kElementsPerElementE /
-             (kAccessSizeInBits / sizeof_bits<ElementE>::value));
-
-  using IteratorThreadMapE = transform::PitchLinearStripminedThreadMap<
-      layout::PitchLinearShape<Shape::kM * kInterleavedE,
-                               Shape::kK / kSparse / kElementsPerElementE /
-                                   kInterleavedE>,
-      kThreadsE, kElementsPerAccessE>;
-
-  /// Shared memory iterator to E operand
-  using SmemIteratorE = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kM * kInterleavedE,
-                  Shape::kK / kSparse / kElementsPerElementE / kInterleavedE>,
-      ElementE, SmemLayoutE, 0, IteratorThreadMapE>;
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy =
-      SparseMmaPolicy<MmaTensorOp, MatrixShape<0, 0>, MatrixShape<0, 0>,
-                      MatrixShape<0, 0>, WarpCount::kK>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace gemm
-}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_with_access_size.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_with_access_size.h
deleted file mode 100755
index 665010741..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_with_access_size.h
+++ /dev/null
@@ -1,328 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines basic properties needed by CTA-level GEMMs assuming expectations about data
-      layout of the global memory fragments, data types, and internal tile sizes.
-
-      Partial specializations for threadblock::Mma operations targeting simt instructions.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-
-#include "cutlass/gemm/warp/mma.h"
-#include "cutlass/gemm/threadblock/mma_pipelined.h"
-#include "cutlass/gemm/threadblock/mma_singlestage.h"
-#include "cutlass/arch/cache_operation.h" 
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-template <
-    /// Shape of threadblock-scoped matrix multiply operator
-    typename Shape,
-    /// Shape of warp-level matrix multiply operator
-    typename WarpShape,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape,
-    /// Element data type of A operand
-    typename ElementA,
-    /// Layout of operand A
-    typename LayoutA,
-    /// Element data type of B operand
-    typename ElementB,
-    /// Layout of operand B
-    typename LayoutB,
-    /// Data type of accumulator
-    typename ElementC,
-    /// Layout of accumulator
-    typename LayoutC,
-    /// Indicates type of math operator (arch::OpClassSimt or arch::OpClassTensorOp)
-    typename OperatorClass,
-    /// Size of a threadblock-scoped access
-    int kAccessSizeInBits = -1, // -1 denoting the default
-    /// Number of stages
-    int Stages = 2,
-    /// Operation performed by MMA
-    typename Operator = typename platform::conditional<
-        (platform::is_same<OperatorClass,
-                           cutlass::arch::OpClassTensorOp>::value) &&
-            (platform::is_same<ElementA, int8_t>::value ||
-             platform::is_same<ElementA, int4b_t>::value ||
-             platform::is_same<ElementA, uint8_t>::value ||
-             platform::is_same<ElementA, uint4b_t>::value),
-        cutlass::arch::OpMultiplyAddSaturate,
-        cutlass::arch::OpMultiplyAdd>::type,
-    /// Store the accumulators in row major or column major.  Row major is used
-    /// when output layout is interleaved.
-    bool AccumulatorsInRowMajor = false,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA =
-        cutlass::arch::CacheOperation::Global,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB =
-        cutlass::arch::CacheOperation::Global,
-    /// per-element transformation for elements of A
-    ComplexTransform TransformA = ComplexTransform::kNone,
-    /// per-element transformation for elements of B
-    ComplexTransform TransformB = ComplexTransform::kNone,
-    bool IsComplex = false // (is_complex<ElementA>::value || is_complex<ElementB>::value)
->
-struct DefaultMmaCoreWithAccessSize;
-
-template <
-    /// Shape of threadblock-scoped matrix multiply operator
-    typename Shape,
-    /// Shape of warp-level matrix multiply operator
-    typename WarpShape,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape,
-    /// Element data type of A operand
-    typename ElementA,
-    /// Layout of operand A
-    typename LayoutA,
-    /// Element data type of B operand
-    typename ElementB,
-    /// Layout of operand B
-    typename LayoutB,
-    /// Data type of accumulator
-    typename ElementC,
-    /// Layout of accumulator
-    typename LayoutC,
-    /// Indicates type of math operator (arch::OpClassSimt or arch::OpClassTensorOp)
-    typename OperatorClass,
-    /// Number of stages
-    int Stages,
-    /// Operation performed by MMA
-    typename Operator,
-    /// Store the accumulators in row major or column major.  Row major is used
-    /// when output layout is interleaved.
-    bool AccumulatorsInRowMajor,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB,
-    /// per-element transformation for elements of A
-    ComplexTransform TransformA,
-    /// per-element transformation for elements of B
-    ComplexTransform TransformB,
-    bool IsComplex
->
-struct DefaultMmaCoreWithAccessSize<
-    Shape, WarpShape, InstructionShape,
-    ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
-    OperatorClass, -1, Stages, Operator, AccumulatorsInRowMajor,
-    CacheOpA, CacheOpB, TransformA, TransformB, IsComplex
-> : DefaultMmaCore<
-    Shape, WarpShape, InstructionShape,
-    ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
-    OperatorClass, Stages, Operator, AccumulatorsInRowMajor,
-    CacheOpA, CacheOpB, TransformA, TransformB, IsComplex
-> {};
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: column-major
-///   B: row-major
-///   Operator: simt class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Size of a threadblock-scoped access (a value of -1 indicates the default)
-    int kAccessSizeInBits_,
-    /// Operation performed by GEMM
-    typename Operator_>
-struct DefaultMmaCoreWithAccessSize<Shape_, WarpShape_, typename platform::enable_if<kAccessSizeInBits_ != -1, GemmShape<1, 1, 1>>::type, ElementA_,
-                      layout::ColumnMajor, ElementB_, layout::RowMajor,
-                      ElementC_, LayoutC_, arch::OpClassSimt, kAccessSizeInBits_, 2, Operator_
-                     > {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = GemmShape<1, 1, 1>;
-  using ElementA = ElementA_;
-  using LayoutA = layout::ColumnMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::RowMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using OperatorClass = arch::OpClassSimt;
-  static int const PartitionsK = Shape::kK / WarpShape::kK;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<
-    Shape::kM / WarpShape::kM,
-    Shape::kN / WarpShape::kN,
-    PartitionsK
-  >;
-
-  // Divisility requirements
-  static_assert(
-    !(Shape::kM % WarpShape::kM) &&
-    !(Shape::kN % WarpShape::kN),
-    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
-  );
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassSimt>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  static int const kElementsPerAccessDefault = 1;
-  static_assert(kAccessSizeInBits_ == -1 ||
-          sizeof_bits<ElementA>::value == sizeof_bits<ElementB>::value ||
-          kAccessSizeInBits_ / sizeof_bits<ElementA>::value == kElementsPerAccessDefault,
-          "Non-default value for kAccessSizeInBits_ is only allowed if size(elementA) == sizeof(elementB)");
-  static int const kElementsPerAccess = (kAccessSizeInBits_ != -1) ? kAccessSizeInBits_ / sizeof_bits<ElementA>::value : kElementsPerAccessDefault;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::ColumnMajor;
-  using SmemLayoutB = layout::RowMajor;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kM, Shape::kK>,
-    kThreads,
-    kElementsPerAccess
-  >;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileIterator<
-    MatrixShape<Shape::kM, Shape::kK>, 
-    ElementA, 
-    SmemLayoutA,
-    1,
-    IteratorThreadMapA
-  >;
-
-  /// Policy of iterator B
-  using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kN, Shape::kK>,
-    kThreads,
-    kElementsPerAccess
-  >;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileIterator<
-    MatrixShape<Shape::kK, Shape::kN>, 
-    ElementB, 
-    SmemLayoutB,
-    0,
-    IteratorThreadMapB
-  >;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level op
-  static const int WarpNumThreadsM = detail::simt_get_warp_threads_m<WarpShape>();
-  static const int WarpNumThreadsN = kWarpSize / WarpNumThreadsM;
-  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
-  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
-  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
-      "WarpShape must be divisible by ThreadTile shape.");
-  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
-  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;
-  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;
-  static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM);
-  static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN);
-  // these should have max of thread tile also
-  using LaneMmaShape = cutlass::gemm::GemmShape<
-      LaneM,
-      LaneN,
-      1>;
-  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
-      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
-      cutlass::layout::RowMajorInterleaved<LaneLayout>,         // LaneLayout
-      LaneMmaShape
-  >;
-
-  using MmaWarpSimt = cutlass::gemm::warp::MmaSimt<
-    WarpShape,    /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8
-    ElementA,     /// Data type of A elements
-    SmemLayoutA,  /// Layout of A matrix (concept: MatrixLayout)
-    ElementB,     /// Data type of B elements
-    SmemLayoutB,  /// Layout of B matrix (concept: MatrixLayout)
-    ElementC,     /// Element type of C matrix
-    LayoutC,      /// Layout of C matrix (concept: MatrixLayout)
-    Policy        /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy)
-    >;            /// Used for partial specialization
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = MmaPolicy<
-    MmaWarpSimt,
-    MatrixShape<0, 0>,
-    MatrixShape<0, 0>,
-    WarpCount::kK
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-} // namespace threadblock
-} // namespace gemm
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_with_reduction.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_with_reduction.h
deleted file mode 100755
index 9f45601a8..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_with_reduction.h
+++ /dev/null
@@ -1,167 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Defines basic properties needed by CTA-level GEMMs assuming
-   expectations about data layout of the global memory fragments, data types,
-   and internal tile sizes.
-
-      Partial specializations for threadblock::Mma operations targeting TensorOp
-   instructions.
-*/
-
-#pragma once
-
-#include "cutlass/array.h"
-#include "cutlass/cutlass.h"
-
-#include "cutlass/layout/tensor_op_multiplicand_sm75.h"
-#include "cutlass/layout/tensor_op_multiplicand_sm80.h"
-
-#include "cutlass/gemm/warp/default_mma_with_reduction_tensor_op.h"
-#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h"
-
-#include "cutlass/gemm/threadblock/default_mma_core.h"
-
-#include "cutlass/matrix_shape.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/transform/pitch_linear_thread_map.h"
-#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h"
-#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h"
-#include "cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h"
-#include "cutlass/gemm/threadblock/mma_with_reduction_multistage.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Template defininng default matrix multiply operators inferred from threadblock tile size,
-/// global memory data layout, and target math instruction.
-template <
-    /// Shape of threadblock-scoped matrix multiply operator
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator
-    typename WarpShape,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape,
-    /// Element data type of A operand
-    typename ElementA,
-    /// Layout of operand A
-    typename LayoutA,
-    /// Element data type of B operand
-    typename ElementB,
-    /// Layout of operand B
-    typename LayoutB,
-    /// Data type of accumulator
-    typename ElementC,
-    /// Layout of accumulator
-    typename LayoutC,
-    /// Indicates type of math operator (arch::OpClassSimt or arch::OpClassTensorOp)
-    typename OperatorClass,
-    /// Reduce operand A or B along K dimension
-    bool ReduceKForA_,
-    /// Number of stages
-    int Stages = 2,
-    /// Operation performed by MMA
-    typename Operator = typename platform::conditional<
-        (platform::is_same<OperatorClass,
-                           cutlass::arch::OpClassTensorOp>::value) &&
-            (platform::is_same<ElementA, int8_t>::value ||
-             platform::is_same<ElementA, int4b_t>::value ||
-             platform::is_same<ElementA, uint8_t>::value ||
-             platform::is_same<ElementA, uint4b_t>::value),
-        cutlass::arch::OpMultiplyAddSaturate,
-        cutlass::arch::OpMultiplyAdd>::type,
-    /// Store the accumulators in row major or column major.  Row major is used
-    /// when output layout is interleaved.
-    bool AccumulatorsInRowMajor = false,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA =
-        cutlass::arch::CacheOperation::Global,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB =
-        cutlass::arch::CacheOperation::Global,
-    /// per-element transformation for elements of A
-    ComplexTransform TransformA = ComplexTransform::kNone,
-    /// per-element transformation for elements of B
-    ComplexTransform TransformB = ComplexTransform::kNone,
-    bool IsComplex = false// (is_complex<ElementA>::value || is_complex<ElementB>::value)
->
-struct DefaultMmaWithReductionCore {
-  using Base = DefaultMmaCore<Shape_,
-                              WarpShape,
-                              InstructionShape,
-                              ElementA,
-                              LayoutA,
-                              ElementB,
-                              LayoutB,
-                              ElementC,
-                              LayoutC,
-                              OperatorClass,
-                              Stages,
-                              Operator,
-                              AccumulatorsInRowMajor,
-                              CacheOpA,
-                              CacheOpB,
-                              TransformA,
-                              TransformB,
-                              IsComplex>;
-  using Shape = Shape_;
-  using IteratorThreadMapA = typename Base::IteratorThreadMapA;
-  using IteratorThreadMapB = typename Base::IteratorThreadMapB;
-  using SmemIteratorA = typename Base::SmemIteratorA;
-  using SmemIteratorB = typename Base::SmemIteratorB;
-  using SmemLayoutA = typename Base::SmemLayoutA;
-  using SmemLayoutB = typename Base::SmemLayoutB;
-  using WarpCount = typename Base::WarpCount;
-
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
-   
-  // Define the warp-level tensor op
-  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaWithReductionTensorOp<
-      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
-      ElementC, LayoutC, Operator, ReduceKForA_, WarpCount::kK>::Type;
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
-                                        MatrixShape<0, 0>, WarpCount::kK>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace gemm
-}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_wmma.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_wmma.h
deleted file mode 100755
index 5f8e3e339..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_wmma.h
+++ /dev/null
@@ -1,712 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines basic properties needed by CTA-level GEMMs assuming expectations about data
-      layout of the global memory fragments, data types, and internal tile sizes.
-
-      Partial specializations for threadblock::Mma operations targeting TensorOp instructions.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/arch/wmma.h"
-
-#if defined(CUTLASS_ARCH_WMMA_ENABLED)
-
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-
-#include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h"
-
-#include "cutlass/gemm/warp/mma_tensor_op_wmma.h"
-
-#include "cutlass/gemm/warp/mma_tensor_op_policy.h"
-#include "cutlass/gemm/threadblock/default_mma_core.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: column-major
-///   B: row-major
-///   Operator: wmma tensor op class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    ///< Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Operation performed by GEMM
-    typename Operator_,
-    /// Number of stages
-    int Stages>
-struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
-                      layout::ColumnMajor, ElementB_, layout::RowMajor,
-                      ElementC_, LayoutC_, arch::OpClassWmmaTensorOp, Stages,
-                      Operator_> {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = ElementA_;
-  using LayoutA = layout::ColumnMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::RowMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using OperatorClass = arch::OpClassWmmaTensorOp;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<
-    Shape::kM / WarpShape::kM,
-    Shape::kN / WarpShape::kN,
-    Shape::kK / WarpShape::kK
-  >;
-
-  // Divisility requirements
-  static_assert(
-    !(Shape::kM % WarpShape::kM) &&
-    !(Shape::kN % WarpShape::kN),
-    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
-  );
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassWmmaTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped access
-  static int const kAccessSizeInBits = 128;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  //
-  // Shared memory layouts
-  //
-  // NOTE: shared memory layout for wmma is same as the operands' layout in the global memory
-  using SmemLayoutA = LayoutA;
-  using SmemLayoutB = LayoutB;
-
-  // Pad shared memory to avoid bank conflicts
-  static int const kPaddingA = 128 / sizeof_bits<ElementA>::value;
-  static int const kPaddingB = 128 / sizeof_bits<ElementB>::value;
-
-  //
-  // Iterators to write to shared memory
-  //
-  
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kM, Shape::kK>,
-    kThreads,
-    kAccessSizeInBits / sizeof_bits<ElementB>::value
-  >;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileIterator<
-    MatrixShape<Shape::kM, Shape::kK>, 
-    ElementA, 
-    SmemLayoutA,
-    1,
-    IteratorThreadMapA
-  >;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kN, Shape::kK>,
-    kThreads,
-    kAccessSizeInBits / sizeof_bits<ElementB>::value
-  >;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileIterator<
-    MatrixShape<Shape::kK, Shape::kN>, 
-    ElementB, 
-    SmemLayoutB,
-    0,
-    IteratorThreadMapB
-  >;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
-    cutlass::arch::Wmma<
-      InstructionShape,
-      ElementA,
-      LayoutA,
-      ElementB,
-      LayoutB,
-      ElementC,
-      LayoutC,
-      Operator
-    >,
-    cutlass::MatrixShape<1, 1>
-  >;
-
-  using MmaTensorOp = cutlass::gemm::warp::MmaTensorOpWmma<
-    WarpShape,
-    ElementA,
-    SmemLayoutA,
-    ElementB,
-    SmemLayoutB,
-    ElementC,
-    LayoutC,
-    Policy
-  >;
-
-  /// Policy used to define MmaPipelined 
-  using MmaPolicy = MmaPolicy<
-    MmaTensorOp,
-    MatrixShape<kPaddingA, 0>,
-    MatrixShape<0, kPaddingB>,
-    WarpCount::kK
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: row-major
-///   B: column-major
-///   Operator: wmma tensorop class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    ///< Shape of threadblock-scoped matrix multiply operator
-    ///< (concept:GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape) [allowed
-    /// wmma instruction shapes, e.g., 16x16x16, 32x8x16, 8x32x16,...]
-    typename InstructionShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Operation performed by GEMM
-    typename Operator_,
-    /// Number of stages
-    int Stages>
-struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
-                      layout::RowMajor, ElementB_, layout::ColumnMajor,
-                      ElementC_, LayoutC_, arch::OpClassWmmaTensorOp, Stages,
-                      Operator_> {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = ElementA_;
-  using LayoutA = layout::RowMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::ColumnMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using OperatorClass = arch::OpClassWmmaTensorOp;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<
-    Shape::kM / WarpShape::kM,
-    Shape::kN / WarpShape::kN,
-    Shape::kK / WarpShape::kK
-  >;
-
-  // Divisility requirements
-  static_assert(
-    !(Shape::kM % WarpShape::kM) &&
-    !(Shape::kN % WarpShape::kN),
-    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
-  );
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassWmmaTensorOp>::value;
-
-  /// Number of threads per threadblock
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-
-  /// Size of a threadblock-scoped access
-  static int const kAccessSizeInBits = 128;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  // Warp thread arrangement 
-  static int const kWarpThreadArrangementContiguousA =
-      Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementA>::value);
-
-  static int const kWarpThreadArrangementStridedA =
-      kWarpSize / kWarpThreadArrangementContiguousA;
-
-  static int const kWarpThreadArrangementContiguousB =
-      Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementA>::value);
-
-  static int const kWarpThreadArrangementStridedB =
-      kWarpSize / kWarpThreadArrangementContiguousB;
-
-  //
-  // Shared memory layouts
-  //
-
-  // shared memory layout for wmma is same as the operands' layout in global memory
-  using SmemLayoutA = LayoutA;
-  using SmemLayoutB = LayoutB;
-  
-  // Pad shared memory to avoid bank conflicts
-  static int const kPaddingA = 128 / sizeof_bits<ElementA>::value;
-  static int const kPaddingB = 128 / sizeof_bits<ElementB>::value;
-
-  //
-  // Iterators to write to shared memory 
-  //
-  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kK, Shape::kM>,
-    kThreads,
-    kAccessSizeInBits / sizeof_bits<ElementA>::value
-  >;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileIterator<
-    MatrixShape<Shape::kM, Shape::kK>, 
-    ElementA, 
-    SmemLayoutA,
-    1,
-    IteratorThreadMapA 
-  >;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kK, Shape::kN>,
-    kThreads,
-    kAccessSizeInBits / sizeof_bits<ElementB>::value
-  >;  
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileIterator<
-    MatrixShape<Shape::kK, Shape::kN>, 
-    ElementB, 
-    SmemLayoutB,
-    0,
-    IteratorThreadMapB // SmemThreadMapB 
-  >;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
-    cutlass::arch::Wmma<
-      InstructionShape,
-      ElementA,
-      LayoutA,
-      ElementB,
-      LayoutB,
-      ElementC,
-      LayoutC,
-      Operator
-    >,
-    cutlass::MatrixShape<1, 1>
-  >;
-
-  using MmaTensorOp = cutlass::gemm::warp::MmaTensorOpWmma<
-    WarpShape,
-    ElementA,
-    SmemLayoutA,
-    ElementB,
-    SmemLayoutB,
-    ElementC,
-    LayoutC,
-    Policy
-  >;
-
-  /// Policy used to define MmaPipelined 
-  using MmaPolicy = MmaPolicy<
-    MmaTensorOp,
-    MatrixShape<0, kPaddingA>,
-    MatrixShape<kPaddingB, 0>,
-    WarpCount::kK
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: row-major
-///   B: row-major
-///   Operator: tensor op class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Operation performed by MMA
-    typename Operator_,
-    /// Number of stages
-    int Stages>
-struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
-                      layout::RowMajor, ElementB_, layout::RowMajor, ElementC_,
-                      LayoutC_, arch::OpClassWmmaTensorOp, Stages, Operator_> {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = ElementA_;
-  using LayoutA = layout::RowMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::RowMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using OperatorClass = arch::OpClassWmmaTensorOp;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<
-    Shape::kM / WarpShape::kM,
-    Shape::kN / WarpShape::kN,
-    Shape::kK / WarpShape::kK
-  >;
-
-  // Divisility requirements
-  static_assert(
-    !(Shape::kM % WarpShape::kM) &&
-    !(Shape::kN % WarpShape::kN),
-    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
-  );
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassWmmaTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped access
-  static int const kAccessSizeInBits = 128;
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  // Warp thread arrangement 
-  static int const kWarpThreadArrangementContiguousA =
-      Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementA>::value);
-
-  static int const kWarpThreadArrangementStridedA =
-      kWarpSize / kWarpThreadArrangementContiguousA;
-
-  //
-  // Shared memory layouts
-  //
-
-  // shared memory layout for wmma is same as the operands' layout in global memory
-  using SmemLayoutA = LayoutA;
-  using SmemLayoutB = LayoutB;
-
-  // Pad shared memory to avoid bank conflicts
-  static int const kPaddingA = 128 / sizeof_bits<ElementA>::value;
-  static int const kPaddingB = 128 / sizeof_bits<ElementB>::value;
-  
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kK, Shape::kM>,
-    kThreads,
-    kAccessSizeInBits / sizeof_bits<ElementA>::value
-  >;
-
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileIterator<
-    MatrixShape<Shape::kM, Shape::kK>, 
-    ElementA, 
-    SmemLayoutA,
-    1,
-    IteratorThreadMapA
-  >;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kN, Shape::kK>,
-    kThreads,
-    kAccessSizeInBits / sizeof_bits<ElementB>::value
-  >;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileIterator<
-    MatrixShape<Shape::kK, Shape::kN>, 
-    ElementB, 
-    SmemLayoutB,
-    0,
-    IteratorThreadMapB
-  >;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
-    cutlass::arch::Wmma<
-      InstructionShape,
-      ElementA,
-      LayoutA,
-      ElementB,
-      LayoutB,
-      ElementC,
-      LayoutC,
-      Operator
-    >,
-    cutlass::MatrixShape<1, 1>
-  >;
-
-  using MmaTensorOp = cutlass::gemm::warp::MmaTensorOpWmma<
-    WarpShape,
-    ElementA,
-    SmemLayoutA,
-    ElementB,
-    SmemLayoutB,
-    ElementC,
-    LayoutC,
-    Policy
-  >;
-
-  /// Policy used to define MmaPipelined 
-  using MmaPolicy = MmaPolicy<
-    MmaTensorOp,
-    MatrixShape<0, kPaddingA>,
-    MatrixShape<0, kPaddingB>,
-    WarpCount::kK
-  >;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: column-major
-///   B: column-major
-///   Operator: tensor op class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Operation performed by MMA
-    typename Operator_,
-    /// Number of stages
-    int Stages>
-struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
-                      layout::ColumnMajor, ElementB_, layout::ColumnMajor,
-                      ElementC_, LayoutC_, arch::OpClassWmmaTensorOp, Stages,
-                      Operator_> {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = ElementA_;
-  using LayoutA = layout::ColumnMajor;
-  using ElementB = ElementB_;
-  using LayoutB = layout::ColumnMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using OperatorClass = arch::OpClassWmmaTensorOp;
-
-  /// Number of warps present
-  using WarpCount =
-      GemmShape<Shape::kM / WarpShape::kM, Shape::kN / WarpShape::kN,
-                Shape::kK / WarpShape::kK>;
-
-  // Divisility requirements
-  static_assert(
-      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
-      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassWmmaTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped access
-  static int const kAccessSizeInBits = 128;
-
-  /// Default Operator
-  using Operator = Operator_; 
-
-  // Warp thread arrangement 
-  static int const kWarpThreadArrangementContiguousB =
-      Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementA>::value);
-
-  static int const kWarpThreadArrangementStridedB =
-      kWarpSize / kWarpThreadArrangementContiguousB;
-
-  //
-  // Shared memory layouts
-  //
-
-  // shared memory layout for wmma is same as the operands' layout in global memory
-  using SmemLayoutA = LayoutA;
-  using SmemLayoutB = LayoutB;
-
-  // Pad shared memory to avoid bank conflicts
-  static int const kPaddingA = 128 / sizeof_bits<ElementA>::value;
-  static int const kPaddingB = 128 / sizeof_bits<ElementB>::value;
-  
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kM, Shape::kK>,
-    kThreads,
-    kAccessSizeInBits / sizeof_bits<ElementA>::value
-  >;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileIterator<
-      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 1,
-      IteratorThreadMapA>;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB =  transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kK, Shape::kN>,
-    kThreads,
-    kAccessSizeInBits / sizeof_bits<ElementB>::value
-  >;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileIterator<
-      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 0,
-      IteratorThreadMapB>;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
-    cutlass::arch::Wmma<
-      InstructionShape,
-      ElementA,
-      LayoutA,
-      ElementB,
-      LayoutB,
-      ElementC,
-      LayoutC,
-      Operator
-    >,
-    cutlass::MatrixShape<1, 1>
-  >;
-
-  using MmaTensorOp = cutlass::gemm::warp::MmaTensorOpWmma<
-    WarpShape,
-    ElementA,
-    SmemLayoutA,
-    ElementB,
-    SmemLayoutB,
-    ElementC,
-    LayoutC,
-    Policy
-  >;
-
-  /// Policy used to define MmaPipelined 
-  using MmaPolicy = MmaPolicy<
-    MmaTensorOp,
-    MatrixShape<kPaddingA, 0>,
-    MatrixShape<kPaddingB, 0>,
-    WarpCount::kK
-  >;
-};
-
-} // namespace threadblock
-} // namespace gemm
-} // namespace cutlass
-
-#endif // defined(CUTLASS_ARCH_WMMA_ENABLED)
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_layernorm_mainloop_fusion.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_layernorm_mainloop_fusion.h
deleted file mode 100755
index 5dd3dbc3a..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_layernorm_mainloop_fusion.h
+++ /dev/null
@@ -1,178 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/arch.h"
-
-#include "cutlass/layout/matrix.h"
-#include "cutlass/gemm/threadblock/default_mma_core.h"
-#include "cutlass/gemm/threadblock/mma_layernorm_mainloop_fusion_multistage.h"
-#include "cutlass/transform/threadblock/predicated_scale_bias_vector_iterator.h"
-#include "cutlass/transform/threadblock/predicated_scale_bias_vector_access_iterator.h"
-#include "cutlass/transform/threadblock/regular_scale_bias_vector_access_iterator.h"
-#include "cutlass/gemm/warp/scale_bias_tile_iterator.h"
-#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for Scale/Bias vectors
-    typename ElementScaleBias,
-    /// Layout type for Scale/Bias vectors
-    typename LayoutScaleBias,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Layout type for C and D matrix operands
-    typename LayoutC,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Operation perfomed by GEMM
-    typename Operator,
-    /// Store the accumulators in row major or column major.  Row major is used
-    /// when output layout is interleaved.
-    bool AccumulatorsInRowMajor = false,
-    /// Use zfill or predicate for SM80 out-of-bound cp.async 
-    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone
-    >
-struct DefaultMmaLayernormMainloopFusion {
-
-  static cutlass::arch::CacheOperation::Kind const CacheOpA =
-      ((sizeof_bits<ElementA>::value * kAlignmentA) == 128)
-          ? cutlass::arch::CacheOperation::Global
-          : cutlass::arch::CacheOperation::Always;
-
-  static cutlass::arch::CacheOperation::Kind const CacheOpB =
-      ((sizeof_bits<ElementB>::value * kAlignmentB) == 128)
-          ? cutlass::arch::CacheOperation::Global
-          : cutlass::arch::CacheOperation::Always;
-
-  static cutlass::arch::CacheOperation::Kind const CacheOpGammaBeta = CacheOpA;
-
-  // Define the MmaCore components
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
-      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      Stages, Operator, false, CacheOpA, CacheOpB>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::Array<ElementA, kAlignmentA>;
-  using IteratorA =
-      cutlass::transform::threadblock::PredicatedTileAccessIterator<
-          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-          ElementA, LayoutA, 1, ThreadMapA, AccessTypeA>;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::Array<ElementB, kAlignmentB>;
-  using IteratorB =
-      cutlass::transform::threadblock::PredicatedTileAccessIterator<
-          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-          ElementB, LayoutB, 0, ThreadMapB, AccessTypeB>;
-
-  /// Define iterators over tiles from scale/bias vectors
-  using IteratorVarMean =
-      cutlass::transform::threadblock::PredicatedScaleBiasVectorIterator<
-          cutlass::MatrixShape<1, WarpShape::kN>,
-          ElementScaleBias,
-          LayoutScaleBias>;
-
-  /// Define iterators over tiles from scale/bias vectors
-  using IteratorGammaBeta =
-      cutlass::transform::threadblock::PredicatedScaleBiasVectorAccessIterator<
-          cutlass::MatrixShape<1, ThreadblockShape::kK>, ElementScaleBias,
-          LayoutScaleBias>;
-
-  using SmemIteratorGammaBeta =
-      cutlass::transform::threadblock::RegularScaleBiasVectorAccessIterator<
-          cutlass::MatrixShape<1, ThreadblockShape::kK>, ElementScaleBias,
-          LayoutScaleBias>;
-
-  static int const kThreadCount = 32;
-
-  // Warp-level iterators to load scale and bias vectors
-  using WarpIteratorGammaBeta = cutlass::gemm::warp::ScaleBiasTileIterator<
-      MatrixShape<WarpShape::kM, WarpShape::kK>, ElementScaleBias,
-      LayoutScaleBias, MatrixShape<InstructionShape::kM, InstructionShape::kK>,
-      typename MmaCore::MmaTensorOp::IteratorA::Base::Policy, kThreadCount,
-      MmaCore::WarpCount::kK>;
-
-  // Define the threadblock-scoped multistage matrix multiply
-  using ThreadblockMma = cutlass::gemm::threadblock::MmaLayernormMainloopFusionMultistage<
-      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
-      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
-      MmaCore::kCacheOpB, IteratorVarMean, IteratorGammaBeta, SmemIteratorGammaBeta,
-      CacheOpGammaBeta,
-      ElementAccumulator, layout::RowMajor,
-      typename MmaCore::MmaPolicy, WarpIteratorGammaBeta, Stages, SharedMemoryClear>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace gemm
-} // namespace cutlass 
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_planar_complex_multistage.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_planar_complex_multistage.h
deleted file mode 100755
index 1895962a7..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_planar_complex_multistage.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Template for a multistage GEMM kernel. Does not compute batching or support split-K.
-*/
-
-#pragma once
-
-#include "cutlass/arch/arch.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
-#include "cutlass/gemm/threadblock/default_mma.h"
-#include "cutlass/gemm/threadblock/mma_planar_complex_multistage.h"
-
-#include "cutlass/numeric_types.h"
-#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Operator class tag
-    typename OperatorClass_,
-    /// Tag indicating architecture to tune for
-    typename ArchTag_,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Complex transformation on operand A
-    ComplexTransform TransformA = ComplexTransform::kNone,
-    /// Complex transformation on operand B
-    ComplexTransform TransformB = ComplexTransform::kNone,
-    /// Math operator tag (e.g. arch::OpMultiplyAdd)
-    typename Operator = arch::OpMultiplyAdd
->
-struct DefaultMmaPlanarComplexMultistage {
-
-    // Construct a planar complex variant from the real-valued variant
-    using RealMmaMultistage = typename DefaultMma<
-        ElementA_,
-        LayoutA_,
-        kAlignmentA,
-        ElementB_,
-        LayoutB_,
-        kAlignmentB,
-        ElementAccumulator_,
-        LayoutC_,
-        OperatorClass_,
-        ArchTag_,
-        ThreadblockShape_,
-        WarpShape_,
-        InstructionShape_,
-        Stages,
-        Operator
-    >::ThreadblockMma;
-
-    using ThreadblockMma = MmaPlanarComplexMultistage<
-      ThreadblockShape_,
-      typename RealMmaMultistage::IteratorA,
-      typename RealMmaMultistage::SmemIteratorA,
-      cutlass::arch::CacheOperation::Global,
-      typename RealMmaMultistage::IteratorB,
-      typename RealMmaMultistage::SmemIteratorB,
-      cutlass::arch::CacheOperation::Global,
-      ElementAccumulator_,
-      LayoutC_,
-      typename RealMmaMultistage::Policy,
-      Stages,
-      TransformA,
-      TransformB
-    >;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-}   // namespace threadblock
-}   // namespace gemm
-}   // namespace cutlass
-
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_planar_complex_pipelined.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_planar_complex_pipelined.h
deleted file mode 100755
index e800ba44d..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_planar_complex_pipelined.h
+++ /dev/null
@@ -1,130 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief 
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
-
-#include "cutlass/gemm/warp/mma_planar_complex.h"
-#include "cutlass/gemm/threadblock/default_mma.h"
-#include "cutlass/gemm/threadblock/mma_planar_complex_pipelined.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-  /// Element type for A matrix operand
-  typename ElementA_,
-  /// Layout type for A matrix operand
-  typename LayoutA_,
-  /// Access granularity of A matrix in units of elements
-  int kAlignmentA,
-  /// Element type for B matrix operand
-  typename ElementB_,
-  /// Layout type for B matrix operand
-  typename LayoutB_,
-  /// Access granularity of B matrix in units of elements
-  int kAlignmentB,
-  /// Element type for internal accumulation
-  typename ElementAccumulator_,
-  /// Layout type for C and D matrix operands
-  typename LayoutC_,
-  /// Operator class tag
-  typename OperatorClass_,
-  /// Tag indicating architecture to tune for
-  typename ArchTag_,
-  /// Threadblock-level tile size (concept: GemmShape)
-  typename ThreadblockShape_,
-  /// Warp-level tile size (concept: GemmShape)
-  typename WarpShape_,
-  /// Instruction-level tile size (concept: GemmShape)
-  typename InstructionShape_,
-  /// Number of stages used in the pipelined mainloop
-  int Stages,
-  /// Complex transformation on operand A
-  ComplexTransform TransformA = ComplexTransform::kNone,
-  /// Complex transformation on operand B
-  ComplexTransform TransformB = ComplexTransform::kNone,
-  /// Math operator tag (e.g. arch::OpMultiplyAdd)
-  typename Operator = arch::OpMultiplyAdd
->
-struct DefaultMmaPlanarComplexPipelined {
-
-  // Construct a planar complex variant from the real-valued variant
-  using RealMma = typename DefaultMma<
-    ElementA_,
-    LayoutA_,
-    kAlignmentA,
-    ElementB_,
-    LayoutB_,
-    kAlignmentB,
-    ElementAccumulator_,
-    LayoutC_,
-    OperatorClass_,
-    ArchTag_,
-    ThreadblockShape_,
-    WarpShape_,
-    InstructionShape_,
-    Stages,
-    Operator
-  >::ThreadblockMma;
-
-  using ThreadblockMma = MmaPlanarComplexPipelined<
-    ThreadblockShape_,
-    typename RealMma::IteratorA,
-    typename RealMma::SmemIteratorA,
-    typename RealMma::IteratorB,
-    typename RealMma::SmemIteratorB,
-    ElementAccumulator_,
-    LayoutC_,
-    typename RealMma::Policy,
-    Stages,
-    TransformA,
-    TransformB
-  >;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace gemm
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_softmax_mainloop_fusion.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_softmax_mainloop_fusion.h
deleted file mode 100755
index f50d36a4b..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_softmax_mainloop_fusion.h
+++ /dev/null
@@ -1,160 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a pipelined softmax-GEMM kernel.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/arch.h"
-
-#include "cutlass/layout/matrix.h"
-#include "cutlass/gemm/threadblock/default_mma_core.h"
-#include "cutlass/gemm/threadblock/mma_softmax_mainloop_fusion_multistage.h"
-#include "cutlass/transform/threadblock/predicated_scale_bias_vector_iterator.h"
-#include "cutlass/transform/threadblock/predicated_scale_bias_vector_access_iterator.h"
-#include "cutlass/transform/threadblock/regular_scale_bias_vector_access_iterator.h"
-#include "cutlass/gemm/warp/scale_bias_tile_iterator.h"
-#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for Scale/Bias vectors
-    typename ElementScaleBias,
-    /// Layout type for Scale/Bias vectors
-    typename LayoutScaleBias,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Layout type for C and D matrix operands
-    typename LayoutC,
-    /// Operator class tag
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Whether problem has been transformed. This determines to which operand
-    /// the softmax is applied.
-    bool InternalTranspose,
-    /// Operation perfomed by GEMM
-    typename Operator,
-    /// Store the accumulators in row major or column major.  Row major is used
-    /// when output layout is interleaved.
-    bool AccumulatorsInRowMajor = false,
-    /// Use zfill or predicate for SM80 out-of-bound cp.async 
-    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone
-    >
-struct DefaultMmaSoftmaxMainloopFusion {
-
-  static cutlass::arch::CacheOperation::Kind const CacheOpA =
-      ((sizeof_bits<ElementA>::value * kAlignmentA) == 128)
-          ? cutlass::arch::CacheOperation::Global
-          : cutlass::arch::CacheOperation::Always;
-
-  static cutlass::arch::CacheOperation::Kind const CacheOpB =
-      ((sizeof_bits<ElementB>::value * kAlignmentB) == 128)
-          ? cutlass::arch::CacheOperation::Global
-          : cutlass::arch::CacheOperation::Always;
-
-  static cutlass::arch::CacheOperation::Kind const CacheOpGammaBeta = CacheOpA;
-
-  // Define the MmaCore components
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
-      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      Stages, Operator, false, CacheOpA, CacheOpB>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::Array<ElementA, kAlignmentA>;
-  using IteratorA =
-      cutlass::transform::threadblock::PredicatedTileAccessIterator<
-          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-          ElementA, LayoutA, 1, ThreadMapA, AccessTypeA>;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::Array<ElementB, kAlignmentB>;
-  using IteratorB =
-      cutlass::transform::threadblock::PredicatedTileAccessIterator<
-          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-          ElementB, LayoutB, 0, ThreadMapB, AccessTypeB>;
-
-  /// Define iterators over tiles from scale/bias vectors
-  using IteratorNormSum =
-      cutlass::transform::threadblock::PredicatedScaleBiasVectorIterator<
-          cutlass::MatrixShape<1, WarpShape::kN>,
-          ElementScaleBias,
-          LayoutScaleBias>;
-
-  // Define the threadblock-scoped multistage matrix multiply
-  using ThreadblockMma = cutlass::gemm::threadblock::MmaSoftmaxMainloopFusionMultistage<
-      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
-      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
-      MmaCore::kCacheOpB, IteratorNormSum,
-      ElementAccumulator, layout::RowMajor,
-      typename MmaCore::MmaPolicy, Stages, InternalTranspose, SharedMemoryClear>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace gemm
-} // namespace cutlass 
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_with_reduction.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_with_reduction.h
deleted file mode 100755
index 677c11443..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_with_reduction.h
+++ /dev/null
@@ -1,141 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/arch.h"
-
-#include "cutlass/layout/matrix.h"
-#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
-#include "cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h"
-#include "cutlass/gemm/threadblock/default_mma_core_with_reduction.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Layout type for C and D matrix operands
-    typename LayoutC,
-    /// Operator class tag
-    typename OperatorClass,
-    ///                                                                                               
-    bool ReduceKForA_,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Operation perfomed by GEMM
-    typename Operator,
-    /// Store the accumulators in row major or column major.  Row major is used
-    /// when output layout is interleaved.
-    bool AccumulatorsInRowMajor = false,
-    /// Use zfill or predicate for SM80 out-of-bound cp.async 
-    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone
-    >
-struct DefaultMmaWithReduction {
-
-  static cutlass::arch::CacheOperation::Kind const CacheOpA =
-      ((sizeof_bits<ElementA>::value * kAlignmentA) == 128)
-          ? cutlass::arch::CacheOperation::Global
-          : cutlass::arch::CacheOperation::Always;
-
-  static cutlass::arch::CacheOperation::Kind const CacheOpB =
-      ((sizeof_bits<ElementB>::value * kAlignmentB) == 128)
-          ? cutlass::arch::CacheOperation::Global
-          : cutlass::arch::CacheOperation::Always;
-
-  // Define the MmaCore components
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaWithReductionCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
-      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      ReduceKForA_,  Stages, Operator, false, CacheOpA, CacheOpB>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::Array<ElementA, kAlignmentA>;
-  using IteratorA =
-      cutlass::transform::threadblock::PredicatedTileAccessIterator<
-          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-          ElementA, LayoutA, 1, ThreadMapA, AccessTypeA>;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::Array<ElementB, kAlignmentB>;
-  using IteratorB =
-      cutlass::transform::threadblock::PredicatedTileAccessIterator<
-          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-          ElementB, LayoutB, 0, ThreadMapB, AccessTypeB>;
-
-  // Define the threadblock-scoped multistage matrix multiply
-  using ThreadblockMma = cutlass::gemm::threadblock::MmaWithReductionMultistage<
-      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
-      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
-      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
-      typename MmaCore::MmaPolicy, Stages, SharedMemoryClear>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace gemm
-} // namespace cutlass 
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h
deleted file mode 100755
index 7f249780d..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h
+++ /dev/null
@@ -1,159 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Template for a multistage GEMM kernel. Does not compute batching or support split-K.
-*/
-
-#pragma once
-
-#include "cutlass/arch/arch.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
-#include "cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Operator class tag
-    typename OperatorClass_,
-    /// Tag indicating architecture to tune for
-    typename ArchTag_,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Complex transformation on operand A
-    ComplexTransform TransformA = ComplexTransform::kNone,
-    /// Complex transformation on operand B
-    ComplexTransform TransformB = ComplexTransform::kNone,
-    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
-    typename Operator = arch::OpMultiplyAddComplex,
-    /// Store the accumulators in row major or column major.  Row major is used
-    /// when output layout is interleaved.
-    bool AccumulatorsInRowMajor = false>
-struct DefaultMultistageMmaComplex;
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization for row-major output
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Tag indicating architecture to tune for
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Number of stages used in the multistage mainloop
-    int Stages,
-    /// Complex transformation on operand A
-    ComplexTransform TransformA,
-    /// Complex transformation on operand B
-    ComplexTransform TransformB,
-    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
-    typename Operator>
-struct DefaultMultistageMmaComplex<ElementA, LayoutA, ElementB, LayoutB,
-                            ElementAccumulator, layout::RowMajor, OperatorClass,
-                            ArchTag, ThreadblockShape, WarpShape,
-                            InstructionShape, Stages, TransformA, TransformB, Operator> {
-  // Define the MmaCore components
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplexCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, 
-      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, OperatorClass,
-      Stages, TransformA, TransformB, Operator>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::Array<ElementA, ThreadMapA::kElementsPerAccess>;
-  using IteratorA =
-      cutlass::transform::threadblock::PredicatedTileAccessIterator<
-          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-          ElementA, LayoutA, 1, ThreadMapA, AccessTypeA>;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::Array<ElementB, ThreadMapB::kElementsPerAccess>;
-  using IteratorB =
-      cutlass::transform::threadblock::PredicatedTileAccessIterator<
-          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-          ElementB, LayoutB, 0, ThreadMapB, AccessTypeB>;
-
-  // Define the threadblock-scoped multistage matrix multiply
-  using ThreadblockMma = cutlass::gemm::threadblock::MmaMultistage<
-      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
-      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
-      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
-      typename MmaCore::MmaPolicy, Stages>;
-};
-
-}  // namespace threadblock
-}  // namespace gemm
-}  // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core.h
deleted file mode 100755
index cab2a96ae..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines basic properties needed by CTA-level GEMMs assuming
-   expectations about data layout of the global memory fragments, data types,
-   and internal tile sizes.
-
-      Partial specializations for threadblock::Mma operations targeting TensorOp
-   instructions.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/complex.h"
-
-#include "cutlass/layout/tensor_op_multiplicand_sm75.h"
-#include "cutlass/layout/tensor_op_multiplicand_sm80.h"
-
-#include "cutlass/gemm/warp/mma_simt_policy.h"
-#include "cutlass/gemm/warp/mma_simt.h"
-#include "cutlass/gemm/warp/default_mma_tensor_op.h"
-#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h"
-
-#include "cutlass/gemm/threadblock/default_mma_core.h"
-
-#include "cutlass/matrix_shape.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/transform/pitch_linear_thread_map.h"
-
-#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h"
-#include "cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h"
-#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Template defininng default matrix multiply operators inferred from
-/// threadblock tile size, global memory data layout, and target math
-/// instruction.
-template <
-    /// Shape of threadblock-scoped matrix multiply operator
-    typename Shape,
-    /// Shape of warp-level matrix multiply operator
-    typename WarpShape,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape,
-    /// Element data type of A operand
-    typename ElementA,
-    /// Layout of operand A
-    typename LayoutA,
-    /// Element data type of B operand
-    typename ElementB,
-    /// Layout of operand B
-    typename LayoutB,
-    /// Data type of accumulator
-    typename ElementC,
-    /// Layout of accumulator
-    typename LayoutC,
-    /// Indicates type of math operator (arch::OpClassSimt or arch::OpClassTensorOp)
-    typename OperatorClass,
-    /// Number of stages
-    int Stages,
-    /// Complex transformation on operand A
-    ComplexTransform TransformA,
-    /// Complex transformation on operand B
-    ComplexTransform TransformB,
-    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
-    typename Operator = arch::OpMultiplyAddComplex,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA =
-        cutlass::arch::CacheOperation::Global,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB =
-        cutlass::arch::CacheOperation::Global>
-struct DefaultMultistageMmaComplexCore;
-
-
-////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace gemm
-}  // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h
deleted file mode 100755
index 33150314a..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h
+++ /dev/null
@@ -1,1808 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines basic properties needed by CTA-level GEMMs assuming
-   expectations about data layout of the global memory fragments, data types,
-   and internal tile sizes.
-
-      Partial specializations for threadblock::Mma operations targeting TensorOp
-   instructions.
-*/
-
-#pragma once
-
-#include "cutlass/array.h"
-#include "cutlass/cutlass.h"
-
-#include "cutlass/layout/tensor_op_multiplicand_sm75.h"
-#include "cutlass/layout/tensor_op_multiplicand_sm80.h"
-
-#include "cutlass/gemm/warp/mma_simt_policy.h"
-#include "cutlass/gemm/warp/mma_simt.h"
-#include "cutlass/gemm/warp/default_mma_complex_tensor_op.h"
-#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h"
-
-#include "cutlass/gemm/threadblock/default_multistage_mma_complex_core.h"
-
-#include "cutlass/matrix_shape.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/transform/pitch_linear_thread_map.h"
-#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h"
-#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h"
-#include "cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h"
-#include "cutlass/gemm/threadblock/mma_multistage.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for complex double-precision
-///
-///   A: column-major
-///   B: row-major
-///   Operator: arch::OpMultiplyAddComplex or arch::OpMultiplyGaussianComplex
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Complex transformation on operand A
-    ComplexTransform TransformA,
-    /// Complex transformation on operand B
-    ComplexTransform TransformB,
-    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
-    typename Operator_,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB>
-struct DefaultMultistageMmaComplexCore<
-    Shape_, WarpShape_, InstructionShape_, 
-    complex<double>, layout::ColumnMajor,
-    complex<double>, layout::RowMajor,
-    complex<double>, LayoutC_, 
-    arch::OpClassTensorOp,
-    Stages,
-    TransformA, TransformB,
-    Operator_,
-    CacheOpA, CacheOpB> {
-
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = complex<double>;
-  using LayoutA = layout::ColumnMajor;
-  using ElementB = complex<double>;
-  using LayoutB = layout::RowMajor;
-  using ElementC = complex<double>;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  static ComplexTransform const kTransformA = TransformA;
-  static ComplexTransform const kTransformB = TransformB;
-  using Operator = Operator_;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
-                              Shape::kN / WarpShape::kN, 
-                              Shape::kK / WarpShape::kK>;
-
-  // Divisility requirements
-  static_assert(
-      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
-      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
-
-  static_assert(WarpCount::kCount > 1,
-    "This specialization requires at least two warps.");
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped 128
-  static int const kAccessSizeInBits = 128;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous128b;
-
-  using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous128b;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kM, Shape::kK>, kThreads,
-      layout::PitchLinearShape<8, 4>,
-      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 1,
-      IteratorThreadMapA>;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kN, Shape::kK>, kThreads,
-      layout::PitchLinearShape<8, 4>,
-      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 0,
-      IteratorThreadMapB>;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp<
-      WarpShape, InstructionShape, 
-      ElementA, SmemLayoutA, 
-      ElementB, SmemLayoutB,
-      ElementC, LayoutC, 
-      kTransformA, kTransformB,
-      Operator>::Type;
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
-                                        MatrixShape<0, 0>, WarpCount::kK>;
-};
-
-
-/// Partial specialization for complex double-precision
-///
-///   A: column-major
-///   B: row-major
-///   Operator: arch::OpMultiplyAddComplex or arch::OpMultiplyGaussianComplex
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Complex transformation on operand A
-    ComplexTransform TransformA,
-    /// Complex transformation on operand B
-    ComplexTransform TransformB,
-    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
-    typename Operator_,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB>
-struct DefaultMultistageMmaComplexCore<
-    Shape_, WarpShape_, InstructionShape_, 
-    complex<double>, layout::ColumnMajor,
-    complex<double>, layout::ColumnMajor,
-    complex<double>, LayoutC_, 
-    arch::OpClassTensorOp,
-    Stages, 
-    TransformA, TransformB,
-    Operator_, 
-    CacheOpA, CacheOpB> {
-
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = complex<double>;
-  using LayoutA = layout::ColumnMajor;
-  using ElementB = complex<double>;
-  using LayoutB = layout::ColumnMajor;
-  using ElementC = complex<double>;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  using Operator = Operator_;
-  static ComplexTransform const kTransformA = TransformA;
-  static ComplexTransform const kTransformB = TransformB;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
-                              Shape::kN / WarpShape::kN, 
-                              Shape::kK / WarpShape::kK>;
-
-  // Divisility requirements
-  static_assert(
-      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
-      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
-
-  static_assert(WarpCount::kCount > 1,
-    "This specialization requires at least two warps.");
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped 128
-  static int const kAccessSizeInBits = 128;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous128b;
-  using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise128x4;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kM, Shape::kK>, kThreads,
-      layout::PitchLinearShape<8, 4>,
-      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 1,
-      IteratorThreadMapA>;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kK, Shape::kN>, kThreads,
-      layout::PitchLinearShape<8, 4>,
-      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 0,
-      IteratorThreadMapB>;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp<
-      WarpShape, InstructionShape, 
-      ElementA, SmemLayoutA, 
-      ElementB, SmemLayoutB,
-      ElementC, LayoutC, 
-      kTransformA, kTransformB,
-      Operator>::Type;
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
-                                        MatrixShape<0, 0>, WarpCount::kK>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for complex double-precision
-///
-///   A: row-major
-///   B: column-major
-///   Operator: arch::OpMultiplyAddComplex or arch::OpMultiplyGaussianComplex
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Complex transformation on operand A
-    ComplexTransform TransformA,
-    /// Complex transformation on operand B
-    ComplexTransform TransformB,
-    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
-    typename Operator_,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB>
-struct DefaultMultistageMmaComplexCore<
-    Shape_, WarpShape_, InstructionShape_, 
-    complex<double>, layout::RowMajor,
-    complex<double>, layout::ColumnMajor,
-    complex<double>, LayoutC_, 
-    arch::OpClassTensorOp,
-    Stages,
-    TransformA, TransformB,
-    Operator_, 
-    CacheOpA, CacheOpB> {
-
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = complex<double>;
-  using LayoutA = layout::RowMajor;
-  using ElementB = complex<double>;
-  using LayoutB = layout::ColumnMajor;
-  using ElementC = complex<double>;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  static ComplexTransform const kTransformA = TransformA;
-  static ComplexTransform const kTransformB = TransformB;
-  using Operator = Operator_;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
-                              Shape::kN / WarpShape::kN, 
-                              Shape::kK / WarpShape::kK>;
-
-  // Divisility requirements
-  static_assert(
-      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
-      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
-  
-  static_assert(WarpCount::kCount > 1,
-    "This specialization requires at least two warps.");
-
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped 128
-  static int const kAccessSizeInBits = 128;
-
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise128x4;
-  using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise128x4;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kK, Shape::kM>, kThreads,
-      layout::PitchLinearShape<8, 4>,
-      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 1,
-      IteratorThreadMapA>;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kK, Shape::kN>, kThreads,
-      layout::PitchLinearShape<8, 4>,
-      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 0,
-      IteratorThreadMapB>;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp<
-      WarpShape, InstructionShape, 
-      ElementA, SmemLayoutA, 
-      ElementB, SmemLayoutB,
-      ElementC, LayoutC, 
-      kTransformA, kTransformB,
-      Operator>::Type;
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
-                                        MatrixShape<0, 0>, WarpCount::kK>;
-};
-
-
-/// Partial specialization for complex double-precision
-///
-///   A: row-major
-///   B: row-major
-///   Operator: arch::OpMultiplyAddComplex or arch::OpMultiplyGaussianComplex
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Complex transformation on operand A
-    ComplexTransform TransformA,
-    /// Complex transformation on operand B
-    ComplexTransform TransformB,
-    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
-    typename Operator_,    
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB>
-struct DefaultMultistageMmaComplexCore<
-    Shape_, WarpShape_, InstructionShape_, 
-    complex<double>, layout::RowMajor,
-    complex<double>, layout::RowMajor,
-    complex<double>, LayoutC_, 
-    arch::OpClassTensorOp,
-    Stages, 
-    TransformA, TransformB, 
-    Operator_,
-    CacheOpA, CacheOpB> {
-
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = complex<double>;
-  using LayoutA = layout::RowMajor;
-  using ElementB = complex<double>;
-  using LayoutB = layout::RowMajor;
-  using ElementC = complex<double>;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  static ComplexTransform const kTransformA = TransformA;
-  static ComplexTransform const kTransformB = TransformB;
-  using Operator = Operator_;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
-                              Shape::kN / WarpShape::kN, 
-                              Shape::kK / WarpShape::kK>;
-
-  // Divisility requirements
-  static_assert(
-      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
-      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
-  
-  static_assert(WarpCount::kCount > 1,
-    "This specialization requires at least two warps.");
-
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped 128
-  static int const kAccessSizeInBits = 128;
-
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise128x4;
-  using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous128b;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kK, Shape::kM>, kThreads,
-      layout::PitchLinearShape<8, 4>,
-      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 1,
-      IteratorThreadMapA>;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kN, Shape::kK>, kThreads,
-      layout::PitchLinearShape<8, 4>,
-      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 0,
-      IteratorThreadMapB>;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp<
-      WarpShape, InstructionShape, 
-      ElementA, SmemLayoutA, 
-      ElementB, SmemLayoutB,
-      ElementC, LayoutC, 
-      kTransformA, kTransformB,
-      Operator>::Type;
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
-                                        MatrixShape<0, 0>, WarpCount::kK>;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for complex floating-point
-///
-///   A: column-major
-///   B: column-major
-///   Operator: arch::OpMultiplyAddComplex
-///   Math Instruction: mma.sync.aligned.m16n8k8.f32.tf32.tf32.f32
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Complex transformation on operand A
-    ComplexTransform TransformA,
-    /// Complex transformation on operand B
-    ComplexTransform TransformB,
-    /// Multiply-add operator (arch::OpMultiplyAddComplex)
-    typename Operator_,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB>
-struct DefaultMultistageMmaComplexCore<
-    Shape_, WarpShape_, GemmShape<16, 8, 8>, 
-    complex<float>, layout::ColumnMajor,
-    complex<float>, layout::ColumnMajor,
-    complex<float>, LayoutC_, 
-    arch::OpClassTensorOp,
-    Stages,
-    TransformA, TransformB,
-    Operator_,
-    CacheOpA, CacheOpB> {
-
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = GemmShape<16, 8, 8>;
-  using ElementA = complex<float>;
-  using LayoutA = layout::ColumnMajor;
-  using ElementB = complex<float>;
-  using LayoutB = layout::ColumnMajor;
-  using ElementC = complex<float>;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  static ComplexTransform const kTransformA = TransformA;
-  static ComplexTransform const kTransformB = TransformB;
-  using Operator = Operator_;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
-                              Shape::kN / WarpShape::kN, 
-                              Shape::kK / WarpShape::kK>;
-
-  // Divisility requirements
-  static_assert(
-      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
-      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
-
-  static_assert(WarpCount::kCount > 1,
-    "This specialization requires at least two warps.");
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped
-  static int const kAccessSizeInBits = 64;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous64b;
-
-  using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicand64bCrosswise;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearWarpStripedThreadMap<
-      layout::PitchLinearShape<Shape::kM, Shape::kK>, kThreads,
-      layout::PitchLinearShape<16, 2>,
-      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 1,
-      IteratorThreadMapA>;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kK, Shape::kN>, kThreads,
-      layout::PitchLinearShape<16, 2>,
-      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 0,
-      IteratorThreadMapB>;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp<
-      WarpShape, InstructionShape, 
-      ElementA, SmemLayoutA, 
-      ElementB, SmemLayoutB,
-      ElementC, LayoutC, 
-      kTransformA, kTransformB,
-      Operator>::Type;
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
-                                        MatrixShape<0, 0>, WarpCount::kK>;
-};
-
-
-/// Partial specialization for complex floating-point
-///
-///   A: column-major
-///   B: row-major
-///   Operator: arch::OpMultiplyAddComplex
-///   Math Instruction: mma.sync.aligned.m16n8k8.f32.tf32.tf32.f32
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Complex transformation on operand A
-    ComplexTransform TransformA,
-    /// Complex transformation on operand B
-    ComplexTransform TransformB,
-    /// Multiply-add operator (arch::OpMultiplyAddComplex)
-    typename Operator_,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB>
-struct DefaultMultistageMmaComplexCore<
-    Shape_, WarpShape_, GemmShape<16, 8, 8>, 
-    complex<float>, layout::ColumnMajor,
-    complex<float>, layout::RowMajor,
-    complex<float>, LayoutC_, 
-    arch::OpClassTensorOp,
-    Stages,
-    TransformA, TransformB,
-    Operator_,
-    CacheOpA, CacheOpB> {
-
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = GemmShape<16, 8, 8>;
-  using ElementA = complex<float>;
-  using LayoutA = layout::ColumnMajor;
-  using ElementB = complex<float>;
-  using LayoutB = layout::RowMajor;
-  using ElementC = complex<float>;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  static ComplexTransform const kTransformA = TransformA;
-  static ComplexTransform const kTransformB = TransformB;
-  using Operator = Operator_;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
-                              Shape::kN / WarpShape::kN, 
-                              Shape::kK / WarpShape::kK>;
-
-  // Divisility requirements
-  static_assert(
-      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
-      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
-
-  static_assert(WarpCount::kCount > 1,
-    "This specialization requires at least two warps.");
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped
-  static int const kAccessSizeInBits = 64;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous64b;
-
-  using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous64b;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearWarpStripedThreadMap<
-      layout::PitchLinearShape<Shape::kM, Shape::kK>, kThreads,
-      layout::PitchLinearShape<16, 2>,
-      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 1,
-      IteratorThreadMapA>;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = transform::PitchLinearWarpStripedThreadMap<
-      layout::PitchLinearShape<Shape::kN, Shape::kK>, kThreads,
-      layout::PitchLinearShape<16, 2>,
-      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 0,
-      IteratorThreadMapB>;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp<
-      WarpShape, InstructionShape, 
-      ElementA, SmemLayoutA, 
-      ElementB, SmemLayoutB,
-      ElementC, LayoutC, 
-      kTransformA, kTransformB,
-      Operator>::Type;
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
-                                        MatrixShape<0, 0>, WarpCount::kK>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for complex floating-point
-///
-///   A: row-major
-///   B: column-major
-///   Operator: arch::OpMultiplyAddComplex
-///   Math Instruction: mma.sync.aligned.m16n8k8.f32.tf32.tf32.f32
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Complex transformation on operand A
-    ComplexTransform TransformA,
-    /// Complex transformation on operand B
-    ComplexTransform TransformB,
-    /// Multiply-add operator (arch::OpMultiplyAddComplex)
-    typename Operator_,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB>
-struct DefaultMultistageMmaComplexCore<
-    Shape_, WarpShape_, GemmShape<16, 8, 8>, 
-    complex<float>, layout::RowMajor,
-    complex<float>, layout::ColumnMajor,
-    complex<float>, LayoutC_, 
-    arch::OpClassTensorOp,
-    Stages,
-    TransformA, TransformB,
-    Operator_,
-    CacheOpA, CacheOpB> {
-
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = GemmShape<16, 8, 8>;
-  using ElementA = complex<float>;
-  using LayoutA = layout::RowMajor;
-  using ElementB = complex<float>;
-  using LayoutB = layout::ColumnMajor;
-  using ElementC = complex<float>;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  static ComplexTransform const kTransformA = TransformA;
-  static ComplexTransform const kTransformB = TransformB;
-  using Operator = Operator_;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
-                              Shape::kN / WarpShape::kN, 
-                              Shape::kK / WarpShape::kK>;
-
-  // Divisility requirements
-  static_assert(
-      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
-      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
-
-  static_assert(WarpCount::kCount > 1,
-    "This specialization requires at least two warps.");
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped
-  static int const kAccessSizeInBits = 64;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::RowMajorTensorOpMultiplicand64bCrosswise;
-
-  using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicand64bCrosswise;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kK, Shape::kM>, kThreads,
-      layout::PitchLinearShape<16, 2>,
-      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 1,
-      IteratorThreadMapA>;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kK, Shape::kN>, kThreads,
-      layout::PitchLinearShape<16, 2>,
-      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 0,
-      IteratorThreadMapB>;
-      
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp<
-      WarpShape, InstructionShape, 
-      ElementA, SmemLayoutA, 
-      ElementB, SmemLayoutB,
-      ElementC, LayoutC, 
-      kTransformA, kTransformB,
-      Operator>::Type;
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
-                                        MatrixShape<0, 0>, WarpCount::kK>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for complex floating-point
-///
-///   A: row-major
-///   B: row-major
-///   Operator: arch::OpMultiplyAddComplex
-///   Math Instruction: mma.sync.aligned.m16n8k8.f32.tf32.tf32.f32
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Complex transformation on operand A
-    ComplexTransform TransformA,
-    /// Complex transformation on operand B
-    ComplexTransform TransformB,
-    /// Multiply-add operator (arch::OpMultiplyAddComplex)
-    typename Operator_,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB>
-struct DefaultMultistageMmaComplexCore<
-    Shape_, WarpShape_, GemmShape<16, 8, 8>, 
-    complex<float>, layout::RowMajor,
-    complex<float>, layout::RowMajor,
-    complex<float>, LayoutC_, 
-    arch::OpClassTensorOp,
-    Stages,
-    TransformA, TransformB,
-    Operator_,
-    CacheOpA, CacheOpB> {
-
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = GemmShape<16, 8, 8>;
-  using ElementA = complex<float>;
-  using LayoutA = layout::RowMajor;
-  using ElementB = complex<float>;
-  using LayoutB = layout::RowMajor;
-  using ElementC = complex<float>;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  static ComplexTransform const kTransformA = TransformA;
-  static ComplexTransform const kTransformB = TransformB;
-  using Operator = Operator_;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
-                              Shape::kN / WarpShape::kN, 
-                              Shape::kK / WarpShape::kK>;
-
-  // Divisility requirements
-  static_assert(
-      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
-      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
-
-  static_assert(WarpCount::kCount > 1,
-    "This specialization requires at least two warps.");
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped
-  static int const kAccessSizeInBits = 64;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::RowMajorTensorOpMultiplicand64bCrosswise;
-
-  using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous64b;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
-      layout::PitchLinearShape<Shape::kK, Shape::kM>, kThreads,
-      layout::PitchLinearShape<16, 2>,
-      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 1,
-      IteratorThreadMapA>;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = transform::PitchLinearWarpStripedThreadMap<
-      layout::PitchLinearShape<Shape::kN, Shape::kK>, kThreads,
-      layout::PitchLinearShape<16, 2>,
-      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 0,
-      IteratorThreadMapB>;
-      
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp<
-      WarpShape, InstructionShape, 
-      ElementA, SmemLayoutA, 
-      ElementB, SmemLayoutB,
-      ElementC, LayoutC, 
-      kTransformA, kTransformB,
-      Operator>::Type;
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
-                                        MatrixShape<0, 0>, WarpCount::kK>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for complex SIMT operation
-///
-///   A: column-major
-///   B: column-major
-///   Operator: arch::OpMultiplyAddComplex or arch::OpMultiplyGaussianComplex
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    typename RealA,
-    typename RealB,
-    typename RealC,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Complex transformation on operand A
-    ComplexTransform TransformA,
-    /// Complex transformation on operand B
-    ComplexTransform TransformB,
-    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
-    typename Operator_,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB>
-struct DefaultMultistageMmaComplexCore<
-    Shape_, WarpShape_, GemmShape<1, 1, 1>, 
-    complex<RealA>, layout::ColumnMajor,
-    complex<RealB>, layout::ColumnMajor,
-    complex<RealC>, LayoutC_, 
-    arch::OpClassSimt,
-    Stages,
-    TransformA, TransformB,
-    Operator_,
-    CacheOpA, CacheOpB> {
-
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = GemmShape<1, 1, 1>;
-  using ElementA = complex<RealA>;
-  using LayoutA = layout::ColumnMajor;
-  using ElementB = complex<RealB>;
-  using LayoutB = layout::ColumnMajor;
-  using ElementC = complex<RealC>;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  static ComplexTransform const kTransformA = TransformA;
-  static ComplexTransform const kTransformB = TransformB;
-  using Operator = Operator_;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
-                              Shape::kN / WarpShape::kN, 
-                              Shape::kK / WarpShape::kK>;
-
-  // Divisility requirements
-  static_assert(
-      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
-      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
-
-  static_assert(WarpCount::kCount > 1,
-    "This specialization requires at least two warps.");
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of access
-  static int const kAccessSizeInBits = sizeof_bits<ElementA>::value;
-
-  /// No vectorized accesses
-  static int const kElementsPerAccess = 1;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::ColumnMajor;
-
-  using SmemLayoutB = layout::RowMajor;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kM, Shape::kK>,
-    kThreads,
-    kElementsPerAccess
-  >;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 0,
-      IteratorThreadMapA>;
-
-  /// Policy of iterator B
-  using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kK, Shape::kN>,
-    kThreads,
-    kElementsPerAccess
-  >;
-
-  /// Transpose the ThreadMap of iterator B 
-  using SmemThreadMapB = transform::TransposePitchLinearThreadMapSimt<IteratorThreadMapB>;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 1,
-      SmemThreadMapB>;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level op
-  static const int WarpNumThreadsM = 4;
-  static const int WarpNumThreadsN = 8;
-  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
-      "WarpShape must be divisible by ThreadTile shape.");
-  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
-  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
-  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
-  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;
-  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;
-  static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM);
-  static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN);
-  // these should have max of thread tile also
-  using LaneMmaShape = cutlass::gemm::GemmShape<
-      LaneM,
-      LaneN,
-      1>;
-  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
-      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
-      cutlass::layout::RowMajorInterleaved<LaneLayout>,         // LaneLayout
-      LaneMmaShape
-  >;
-
-  using MmaWarpSimt = cutlass::gemm::warp::MmaSimt<
-    WarpShape,    /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8
-    ElementA,     /// Data type of A elements
-    SmemLayoutA,  /// Layout of A matrix (concept: MatrixLayout)
-    ElementB,     /// Data type of B elements
-    SmemLayoutB,  /// Layout of B matrix (concept: MatrixLayout)
-    ElementC,     /// Element type of C matrix
-    LayoutC,      /// Layout of C matrix (concept: MatrixLayout)
-    Policy,       /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
-    1,            /// 1 partition along K dimension
-    kTransformA,  /// Transform for A
-    kTransformB   /// Transform for B
-    >;            /// Used for partial specialization
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = MmaPolicy<
-    MmaWarpSimt,
-    MatrixShape<0, 0>,
-    MatrixShape<0, Shape::kK / 32>,
-    WarpCount::kK>;
-};
-
-/// Partial specialization for complex SIMT operation
-///
-///   A: column-major
-///   B: row-major
-///   Operator: arch::OpMultiplyAddComplex or arch::OpMultiplyGaussianComplex
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    typename RealA,
-    typename RealB,
-    typename RealC,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Complex transformation on operand A
-    ComplexTransform TransformA,
-    /// Complex transformation on operand B
-    ComplexTransform TransformB,
-    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
-    typename Operator_,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB>
-struct DefaultMultistageMmaComplexCore<
-    Shape_, WarpShape_, GemmShape<1, 1, 1>, 
-    complex<RealA>, layout::ColumnMajor,
-    complex<RealB>, layout::RowMajor,
-    complex<RealC>, LayoutC_, 
-    arch::OpClassSimt,
-    Stages,
-    TransformA, TransformB,
-    Operator_,
-    CacheOpA, CacheOpB> {
-
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = GemmShape<1, 1, 1>;
-  using ElementA = complex<RealA>;
-  using LayoutA = layout::ColumnMajor;
-  using ElementB = complex<RealB>;
-  using LayoutB = layout::RowMajor;
-  using ElementC = complex<RealC>;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  static ComplexTransform const kTransformA = TransformA;
-  static ComplexTransform const kTransformB = TransformB;
-  using Operator = Operator_;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
-                              Shape::kN / WarpShape::kN, 
-                              Shape::kK / WarpShape::kK>;
-
-  // Divisility requirements
-  static_assert(
-      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
-      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
-
-  static_assert(WarpCount::kCount > 1,
-    "This specialization requires at least two warps.");
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of access
-  static int const kAccessSizeInBits = sizeof_bits<ElementA>::value;
-
-  /// No vectorized accesses
-  static int const kElementsPerAccess = 1;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::ColumnMajor;
-
-  using SmemLayoutB = layout::RowMajor;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kM, Shape::kK>,
-    kThreads,
-    kElementsPerAccess
-  >;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 0,
-      IteratorThreadMapA>;
-
-  /// Policy of iterator B
-  using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kN, Shape::kK>,
-    kThreads,
-    kElementsPerAccess
-  >;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 1,
-      IteratorThreadMapB>;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level op
-  static const int WarpNumThreadsM = 4;
-  static const int WarpNumThreadsN = 8;
-  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
-      "WarpShape must be divisible by ThreadTile shape.");
-  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
-  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
-  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
-  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;
-  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;
-  static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM);
-  static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN);
-  // these should have max of thread tile also
-  using LaneMmaShape = cutlass::gemm::GemmShape<
-      LaneM,
-      LaneN,
-      1>;
-  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
-      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
-      cutlass::layout::RowMajorInterleaved<LaneLayout>,         // LaneLayout
-      LaneMmaShape
-  >;
-
-  using MmaWarpSimt = cutlass::gemm::warp::MmaSimt<
-    WarpShape,    /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8
-    ElementA,     /// Data type of A elements
-    SmemLayoutA,  /// Layout of A matrix (concept: MatrixLayout)
-    ElementB,     /// Data type of B elements
-    SmemLayoutB,  /// Layout of B matrix (concept: MatrixLayout)
-    ElementC,     /// Element type of C matrix
-    LayoutC,      /// Layout of C matrix (concept: MatrixLayout)
-    Policy,       /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
-    1,            /// 1 partition along K dimension
-    kTransformA,  /// Transform for A
-    kTransformB   /// Transform for B
-    >;            /// Used for partial specialization
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = MmaPolicy<
-    MmaWarpSimt,
-    MatrixShape<0, 0>,
-    MatrixShape<0, 0>,    // or Shape::kK / 32
-    WarpCount::kK>;
-};
-
-/// Partial specialization for complex SIMT operation
-///
-///   A: row-major
-///   B: column-major
-///   Operator: arch::OpMultiplyAddComplex or arch::OpMultiplyGaussianComplex
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    typename RealA,
-    typename RealB,
-    typename RealC,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Complex transformation on operand A
-    ComplexTransform TransformA,
-    /// Complex transformation on operand B
-    ComplexTransform TransformB,
-    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
-    typename Operator_,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB>
-struct DefaultMultistageMmaComplexCore<
-    Shape_, WarpShape_, GemmShape<1, 1, 1>, 
-    complex<RealA>, layout::RowMajor,
-    complex<RealB>, layout::ColumnMajor,
-    complex<RealC>, LayoutC_, 
-    arch::OpClassSimt,
-    Stages,
-    TransformA, TransformB,
-    Operator_,
-    CacheOpA, CacheOpB> {
-
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = GemmShape<1, 1, 1>;
-  using ElementA = complex<RealA>;
-  using LayoutA = layout::RowMajor;
-  using ElementB = complex<RealB>;
-  using LayoutB = layout::ColumnMajor;
-  using ElementC = complex<RealC>;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  static ComplexTransform const kTransformA = TransformA;
-  static ComplexTransform const kTransformB = TransformB;
-  using Operator = Operator_;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
-                              Shape::kN / WarpShape::kN, 
-                              Shape::kK / WarpShape::kK>;
-
-  // Divisility requirements
-  static_assert(
-      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
-      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
-
-  static_assert(WarpCount::kCount > 1,
-    "This specialization requires at least two warps.");
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of access
-  static int const kAccessSizeInBits = sizeof_bits<ElementA>::value;
-
-  /// No vectorized accesses
-  static int const kElementsPerAccess = 1;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::ColumnMajor;
-
-  using SmemLayoutB = layout::RowMajor;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kK, Shape::kM>,
-    kThreads,
-    kElementsPerAccess
-  >;
-
-  /// Transpose the ThreadMap of iterator A
-  using SmemThreadMapA = transform::TransposePitchLinearThreadMapSimt<IteratorThreadMapA>;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 0,
-      SmemThreadMapA>;
-
-  /// Policy of iterator B
-  using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kK, Shape::kN>,
-    kThreads,
-    kElementsPerAccess
-  >;
-
-  /// Transpose the ThreadMap of iterator B 
-  using SmemThreadMapB = transform::TransposePitchLinearThreadMapSimt<IteratorThreadMapB>;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 1,
-      SmemThreadMapB>;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level op
-  static const int WarpNumThreadsM = 4;
-  static const int WarpNumThreadsN = 8;
-  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
-      "WarpShape must be divisible by ThreadTile shape.");
-  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
-  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
-  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
-  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;
-  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;
-  static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM);
-  static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN);
-  // these should have max of thread tile also
-  using LaneMmaShape = cutlass::gemm::GemmShape<
-      LaneM,
-      LaneN,
-      1>;
-  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
-      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
-      cutlass::layout::RowMajorInterleaved<LaneLayout>,         // LaneLayout
-      LaneMmaShape
-  >;
-
-  using MmaWarpSimt = cutlass::gemm::warp::MmaSimt<
-    WarpShape,    /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8
-    ElementA,     /// Data type of A elements
-    SmemLayoutA,  /// Layout of A matrix (concept: MatrixLayout)
-    ElementB,     /// Data type of B elements
-    SmemLayoutB,  /// Layout of B matrix (concept: MatrixLayout)
-    ElementC,     /// Element type of C matrix
-    LayoutC,      /// Layout of C matrix (concept: MatrixLayout)
-    Policy,       /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
-    1,            /// 1 partition along K dimension
-    kTransformA,  /// Transform for A
-    kTransformB   /// Transform for B
-    >;            /// Used for partial specialization
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = MmaPolicy<
-    MmaWarpSimt,
-    MatrixShape<Shape::kK / 32, 0>,
-    MatrixShape<0, Shape::kK / 32>,
-    WarpCount::kK>;
-};
-
-/// Partial specialization for complex SIMT operation
-///
-///   A: row-major
-///   B: row-major
-///   Operator: arch::OpMultiplyAddComplex or arch::OpMultiplyGaussianComplex
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    typename RealA,
-    typename RealB,
-    typename RealC,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Number of stages
-    int Stages,
-    /// Complex transformation on operand A
-    ComplexTransform TransformA,
-    /// Complex transformation on operand B
-    ComplexTransform TransformB,
-    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
-    typename Operator_,
-    /// Cache operation of operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Cache operation of operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB>
-struct DefaultMultistageMmaComplexCore<
-    Shape_, WarpShape_, GemmShape<1, 1, 1>, 
-    complex<RealA>, layout::RowMajor,
-    complex<RealB>, layout::RowMajor,
-    complex<RealC>, LayoutC_, 
-    arch::OpClassSimt,
-    Stages,
-    TransformA, TransformB,
-    Operator_,
-    CacheOpA, CacheOpB> {
-
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = GemmShape<1, 1, 1>;
-  using ElementA = complex<RealA>;
-  using LayoutA = layout::RowMajor;
-  using ElementB = complex<RealB>;
-  using LayoutB = layout::RowMajor;
-  using ElementC = complex<RealC>;
-  using LayoutC = LayoutC_;
-  static int const kStages = Stages;
-  static ComplexTransform const kTransformA = TransformA;
-  static ComplexTransform const kTransformB = TransformB;
-  using Operator = Operator_;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always;
-
-  /// Number of warps present
-  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
-                              Shape::kN / WarpShape::kN, 
-                              Shape::kK / WarpShape::kK>;
-
-  // Divisility requirements
-  static_assert(
-      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
-      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
-
-  static_assert(WarpCount::kCount > 1,
-    "This specialization requires at least two warps.");
-
-  /// Number of threads per warp
-  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of access
-  static int const kAccessSizeInBits = sizeof_bits<ElementA>::value;
-
-  /// No vectorized accesses
-  static int const kElementsPerAccess = 1;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = layout::ColumnMajor;
-
-  using SmemLayoutB = layout::RowMajor;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kK, Shape::kM>,
-    kThreads,
-    kElementsPerAccess
-  >;
-
-  /// Transpose the ThreadMap of iterator A
-  using SmemThreadMapA = transform::TransposePitchLinearThreadMapSimt<IteratorThreadMapA>;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 0,
-      SmemThreadMapA>;
-
-  /// Policy of iterator B
-  using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
-    layout::PitchLinearShape<Shape::kN, Shape::kK>,
-    kThreads,
-    kElementsPerAccess
-  >;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
-      MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 1,
-      IteratorThreadMapB>;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level op
-  static const int WarpNumThreadsM = 4;
-  static const int WarpNumThreadsN = 8;
-  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
-      "WarpShape must be divisible by ThreadTile shape.");
-  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
-  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
-  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
-  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;
-  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;
-  static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM);
-  static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN);
-  // these should have max of thread tile also
-  using LaneMmaShape = cutlass::gemm::GemmShape<
-      LaneM,
-      LaneN,
-      1>;
-  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
-      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
-      cutlass::layout::RowMajorInterleaved<LaneLayout>,         // LaneLayout
-      LaneMmaShape
-  >;
-
-  using MmaWarpSimt = cutlass::gemm::warp::MmaSimt<
-    WarpShape,    /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8
-    ElementA,     /// Data type of A elements
-    SmemLayoutA,  /// Layout of A matrix (concept: MatrixLayout)
-    ElementB,     /// Data type of B elements
-    SmemLayoutB,  /// Layout of B matrix (concept: MatrixLayout)
-    ElementC,     /// Element type of C matrix
-    LayoutC,      /// Layout of C matrix (concept: MatrixLayout)
-    Policy,       /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
-    1,            /// 1 partition along K dimension
-    kTransformA,  /// Transform for A
-    kTransformB   /// Transform for B
-    >;            /// Used for partial specialization
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = MmaPolicy<
-    MmaWarpSimt,
-    MatrixShape<Shape::kK / 32, 0>,
-    MatrixShape<0, 0>,    // or Shape::kK / 32
-    WarpCount::kK>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-
-}  // namespace threadblock
-}  // namespace gemm
-}  // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_multistage_trmm_complex.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_multistage_trmm_complex.h
deleted file mode 100755
index abcb063e3..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_multistage_trmm_complex.h
+++ /dev/null
@@ -1,556 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Template for a multistage GEMM kernel. Does not compute batching or support split-K.
-
-  
-*/
-
-#pragma once
-
-#include "cutlass/blas3.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/transform/threadblock/predicated_tile_iterator_triangular_matrix.h"
-#include "cutlass/gemm/threadblock/mma_blas3_multistage.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Side Mode for the kernel
-    SideMode kSideMode,
-    /// Fill Mode for the triangular matrix
-    FillMode kFillMode,
-    /// Diag Type for the triangular matrix
-    DiagType kDiagType,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Operator class tag
-    typename OperatorClass_,
-    /// Tag indicating architecture to tune for
-    typename ArchTag_,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Complex transformation on operand A
-    ComplexTransform TransformA = ComplexTransform::kNone,
-    /// Complex transformation on operand B
-    ComplexTransform TransformB = ComplexTransform::kNone,
-    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
-    typename Operator = arch::OpMultiplyAddComplex,
-    /// Blas3 computation mode
-    BlasMode BlasMode_ = BlasMode::kTriangular,
-    /// Store the accumulators in row major or column major.  Row major is used
-    /// when output layout is interleaved.
-    bool AccumulatorsInRowMajor = false>
-struct DefaultMultistageTrmmComplex;
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization for row-major output
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Side Mode for the kernel
-    SideMode kSideMode,
-    /// Fill Mode for the triangular matrix
-    FillMode kFillMode,
-    /// Diag Type for the triangular matrix
-    DiagType kDiagType,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Tag indicating architecture to tune for
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Number of stages used in the multistage mainloop
-    int Stages,
-    /// Complex transformation on operand A
-    ComplexTransform TransformA,
-    /// Complex transformation on operand B
-    ComplexTransform TransformB,
-    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
-    typename Operator>
-struct DefaultMultistageTrmmComplex<ElementA, LayoutA, ElementB, LayoutB,
-                            kSideMode, kFillMode, kDiagType,
-                            ElementAccumulator, layout::RowMajor, OperatorClass, ArchTag, ThreadblockShape, WarpShape,
-                            InstructionShape, Stages, TransformA, TransformB, Operator> {
-  // Define the MmaCore components
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplexCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, 
-      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, OperatorClass,
-      Stages, TransformA, TransformB, Operator>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::Array<ElementA, ThreadMapA::kElementsPerAccess>;
-  using IteratorA =
-      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
-          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-          ElementA, LayoutA, 1, ThreadMapA, 
-          kSideMode, kFillMode, kDiagType, 
-          AccessTypeA>;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::Array<ElementB, ThreadMapB::kElementsPerAccess>;
-  using IteratorB =
-      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
-          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-          ElementB, LayoutB, 0, ThreadMapB, 
-          kSideMode, FillMode::kFull, DiagType::kInvalid,
-          AccessTypeB>;
-
-  // Define the threadblock-scoped multistage matrix multiply
-  using ThreadblockMma = cutlass::gemm::threadblock::MmaMultistage<
-      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
-      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
-      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
-      typename MmaCore::MmaPolicy, Stages, SharedMemoryClearOption::kZfill>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization for row-major output and right-side mode
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Fill Mode for the triangular matrix
-    FillMode kFillMode,
-    /// Diag Type for the triangular matrix
-    DiagType kDiagType,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Tag indicating architecture to tune for
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Number of stages used in the multistage mainloop
-    int Stages,
-    /// Complex transformation on operand A
-    ComplexTransform TransformA,
-    /// Complex transformation on operand B
-    ComplexTransform TransformB,
-    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
-    typename Operator>
-struct DefaultMultistageTrmmComplex<ElementA, LayoutA, ElementB, LayoutB,
-                            SideMode::kRight, kFillMode, kDiagType,
-                            ElementAccumulator, layout::RowMajor, OperatorClass, ArchTag, ThreadblockShape, WarpShape,
-                            InstructionShape, Stages, TransformA, TransformB, Operator> {
-  // Define the MmaCore components
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplexCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, 
-      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, OperatorClass,
-      Stages, TransformA, TransformB, Operator>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::Array<ElementA, ThreadMapA::kElementsPerAccess>;
-  using IteratorA =
-      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
-          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-          ElementA, LayoutA, 1, ThreadMapA, 
-          SideMode::kRight, FillMode::kFull, DiagType::kInvalid, 
-          AccessTypeA>;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::Array<ElementB, ThreadMapB::kElementsPerAccess>;
-  using IteratorB =
-      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
-          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-          ElementB, LayoutB, 0, ThreadMapB, 
-          SideMode::kRight, kFillMode, kDiagType,
-          AccessTypeB>;
-
-  // Define the threadblock-scoped multistage matrix multiply
-  using ThreadblockMma = cutlass::gemm::threadblock::MmaMultistage<
-      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
-      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
-      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
-      typename MmaCore::MmaPolicy, Stages, SharedMemoryClearOption::kZfill>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization for row-major output with unit diagonal
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Side Mode for the kernel
-    SideMode kSideMode,
-    /// Fill Mode for the triangular matrix
-    FillMode kFillMode,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Tag indicating architecture to tune for
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Number of stages used in the multistage mainloop
-    int Stages,
-    /// Complex transformation on operand A
-    ComplexTransform TransformA,
-    /// Complex transformation on operand B
-    ComplexTransform TransformB,
-    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
-    typename Operator>
-struct DefaultMultistageTrmmComplex<ElementA, LayoutA, ElementB, LayoutB,
-                            kSideMode, kFillMode, DiagType::kUnit,
-                            ElementAccumulator, layout::RowMajor, OperatorClass, ArchTag, ThreadblockShape, WarpShape,
-                            InstructionShape, Stages, TransformA, TransformB, Operator> {
-  // Define the MmaCore components
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplexCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, 
-      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, OperatorClass,
-      Stages, TransformA, TransformB, Operator>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::Array<ElementA, ThreadMapA::kElementsPerAccess>;
-  using IteratorA =
-      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
-          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-          ElementA, LayoutA, 1, ThreadMapA, 
-          kSideMode, kFillMode, DiagType::kUnit, 
-          AccessTypeA>;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::Array<ElementB, ThreadMapB::kElementsPerAccess>;
-  using IteratorB =
-      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
-          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-          ElementB, LayoutB, 0, ThreadMapB, 
-          kSideMode, FillMode::kFull, DiagType::kInvalid,
-          AccessTypeB>;
-
-  // Define the threadblock-scoped multistage matrix multiply
-  using ThreadblockMma = cutlass::gemm::threadblock::MmaBlas3Multistage<
-      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
-      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
-      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
-      typename MmaCore::MmaPolicy, Stages, SharedMemoryClearOption::kZfill>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization for row-major output and right-side mode, unit diagonal
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Fill Mode for the triangular matrix
-    FillMode kFillMode,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Tag indicating architecture to tune for
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Number of stages used in the multistage mainloop
-    int Stages,
-    /// Complex transformation on operand A
-    ComplexTransform TransformA,
-    /// Complex transformation on operand B
-    ComplexTransform TransformB,
-    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
-    typename Operator>
-struct DefaultMultistageTrmmComplex<ElementA, LayoutA, ElementB, LayoutB,
-                            SideMode::kRight, kFillMode, DiagType::kUnit,
-                            ElementAccumulator, layout::RowMajor, OperatorClass, ArchTag, ThreadblockShape, WarpShape,
-                            InstructionShape, Stages, TransformA, TransformB, Operator> {
-  // Define the MmaCore components
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplexCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, 
-      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, OperatorClass,
-      Stages, TransformA, TransformB, Operator>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::Array<ElementA, ThreadMapA::kElementsPerAccess>;
-  using IteratorA =
-      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
-          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-          ElementA, LayoutA, 1, ThreadMapA, 
-          SideMode::kRight, FillMode::kFull, DiagType::kInvalid, 
-          AccessTypeA>;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::Array<ElementB, ThreadMapB::kElementsPerAccess>;
-  using IteratorB =
-      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
-          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-          ElementB, LayoutB, 0, ThreadMapB, 
-          SideMode::kRight, kFillMode, DiagType::kUnit,
-          AccessTypeB>;
-
-  // Define the threadblock-scoped multistage matrix multiply
-  using ThreadblockMma = cutlass::gemm::threadblock::MmaBlas3Multistage<
-      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
-      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
-      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
-      typename MmaCore::MmaPolicy, Stages, SharedMemoryClearOption::kZfill>;
-};
-
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization for row-major output (for TRMM where diagonal imag part is ignored - used by HEMM)
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Side Mode for the kernel
-    SideMode kSideMode,
-    /// Fill Mode for the triangular matrix
-    FillMode kFillMode,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Tag indicating architecture to tune for
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Number of stages used in the multistage mainloop
-    int Stages,
-    /// Complex transformation on operand A
-    ComplexTransform TransformA,
-    /// Complex transformation on operand B
-    ComplexTransform TransformB,
-    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
-    typename Operator>
-struct DefaultMultistageTrmmComplex<ElementA, LayoutA, ElementB, LayoutB,
-                            kSideMode, kFillMode, DiagType::kNonUnit,
-                            ElementAccumulator, layout::RowMajor, OperatorClass, ArchTag, ThreadblockShape, WarpShape,
-                            InstructionShape, Stages, TransformA, TransformB, Operator, BlasMode::kHermitian> {
-
-  // Define the MmaCore components
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplexCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, 
-      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, OperatorClass,
-      Stages, TransformA, TransformB, Operator>;
-
-  // Define iterators over tiles from the A operand
-  // PredicatedTileAccessIteratorTriangularMatrix only tracks diagonal elements,
-  // when DiagType is kUnit
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::Array<ElementA, ThreadMapA::kElementsPerAccess>;
-  using IteratorA =
-      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
-          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-          ElementA, LayoutA, 1, ThreadMapA, 
-          kSideMode, kFillMode, DiagType::kUnit, 
-          AccessTypeA>;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::Array<ElementB, ThreadMapB::kElementsPerAccess>;
-  using IteratorB =
-      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
-          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-          ElementB, LayoutB, 0, ThreadMapB, 
-          kSideMode, FillMode::kFull, DiagType::kInvalid,
-          AccessTypeB>;
-
-  // Define the threadblock-scoped multistage matrix multiply
-  using ThreadblockMma = cutlass::gemm::threadblock::MmaBlas3Multistage<
-      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
-      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
-      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
-      typename MmaCore::MmaPolicy, Stages, SharedMemoryClearOption::kZfill,
-      BlasMode::kHermitian>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization for row-major output and right-side mode (for TRMM where diagonal imag part is ignored - used by HEMM)
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Fill Mode for the triangular matrix
-    FillMode kFillMode,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Tag indicating architecture to tune for
-    typename OperatorClass,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Number of stages used in the multistage mainloop
-    int Stages,
-    /// Complex transformation on operand A
-    ComplexTransform TransformA,
-    /// Complex transformation on operand B
-    ComplexTransform TransformB,
-    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
-    typename Operator>
-struct DefaultMultistageTrmmComplex<ElementA, LayoutA, ElementB, LayoutB,
-                            SideMode::kRight, kFillMode, DiagType::kNonUnit,
-                            ElementAccumulator, layout::RowMajor, OperatorClass, ArchTag, ThreadblockShape, WarpShape,
-                            InstructionShape, Stages, TransformA, TransformB, Operator, BlasMode::kHermitian> {
-
-  // Define the MmaCore components
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplexCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, 
-      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, OperatorClass,
-      Stages, TransformA, TransformB, Operator>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::Array<ElementA, ThreadMapA::kElementsPerAccess>;
-  using IteratorA =
-      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
-          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-          ElementA, LayoutA, 1, ThreadMapA, 
-          SideMode::kRight, FillMode::kFull, DiagType::kInvalid, 
-          AccessTypeA>;
-
-  // Define iterators over tiles from the B operand
-  // PredicatedTileAccessIteratorTriangularMatrix only tracks diagonal elements,
-  // when DiagType is kUnit
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::Array<ElementB, ThreadMapB::kElementsPerAccess>;
-  using IteratorB =
-      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
-          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-          ElementB, LayoutB, 0, ThreadMapB, 
-          SideMode::kRight, kFillMode, DiagType::kUnit,
-          AccessTypeB>;
-
-  // Define the threadblock-scoped multistage matrix multiply
-  using ThreadblockMma = cutlass::gemm::threadblock::MmaBlas3Multistage<
-      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
-      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
-      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
-      typename MmaCore::MmaPolicy, Stages, SharedMemoryClearOption::kZfill,
-      BlasMode::kHermitian>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace gemm
-}  // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_sparse_mma.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_sparse_mma.h
deleted file mode 100755
index 388b9c476..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_sparse_mma.h
+++ /dev/null
@@ -1,196 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/arch/wmma.h"
-
-#include "cutlass/layout/matrix.h"
-#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
-#include "cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sparse_sm80.h"
-#if defined(CUTLASS_ARCH_WMMA_ENABLED)
-#include "cutlass/gemm/threadblock/default_mma_core_wmma.h"
-#endif //CUTLASS_ARCH_WMMA_ENABLED
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Operator class tag
-    typename OperatorClass_,
-    /// Tag indicating architecture to tune for
-    typename ArchTag_,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Operation perfomed by GEMM
-    typename Operator,
-    /// Store the accumulators in row major or column major.  Row major is used
-    /// when output layout is interleaved.
-    bool AccumulatorsInRowMajor = false
-    >
-struct DefaultSparseMma;
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization for row-major output (OperatorClass TensorOp)
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Number of stages used in the multistage mainloop
-    int Stages,
-    /// Operation perfomed by GEMM
-    typename Operator
-    >
-struct DefaultSparseMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
-                  kAlignmentB, ElementAccumulator, layout::RowMajor,
-                  arch::OpClassTensorOp, ArchTag, ThreadblockShape, WarpShape,
-                  InstructionShape, Stages, Operator, false> {
-  static cutlass::arch::CacheOperation::Kind const CacheOpA =
-      ((sizeof_bits<ElementA>::value * kAlignmentA) == 128)
-          ? cutlass::arch::CacheOperation::Global
-          : cutlass::arch::CacheOperation::Always;
-
-  static cutlass::arch::CacheOperation::Kind const CacheOpB =
-      ((sizeof_bits<ElementB>::value * kAlignmentB) == 128)
-          ? cutlass::arch::CacheOperation::Global
-          : cutlass::arch::CacheOperation::Always;
-  
-
-  // Define the MmaCore components
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultSparseMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
-      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      Stages, Operator, false, CacheOpA, CacheOpB>;
-
-  static int const kSparse = MmaCore::kSparse;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::Array<ElementA, kAlignmentA>;
-  using IteratorA =
-      cutlass::transform::threadblock::PredicatedTileAccessIterator<
-          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK / kSparse>,
-          ElementA, LayoutA, 1, ThreadMapA, AccessTypeA>;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::Array<ElementB, kAlignmentB>;
-  using IteratorB =
-      cutlass::transform::threadblock::PredicatedTileAccessIterator<
-          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-          ElementB, LayoutB, 0, ThreadMapB, AccessTypeB>;
-
-  // Define iterators over tiles from the E operand
-  using ElementE = typename MmaCore::ElementE;
-  using LayoutE = typename MmaCore::GmemLayoutE;
-  using ThreadMapE = typename MmaCore::IteratorThreadMapE;
-  using AccessTypeE =
-      cutlass::Array<ElementE, 128 / sizeof_bits<ElementE>::value>;
-  using IteratorE =
-      cutlass::transform::threadblock::PredicatedTileAccessIterator<
-          cutlass::MatrixShape<ThreadblockShape::kM,
-                               ThreadblockShape::kK / kSparse /
-                                   MmaCore::kElementsPerElementE>,
-          ElementE, LayoutE, 1, ThreadMapE, AccessTypeE>;
-
-  // Define the threadblock-scoped multistage matrix multiply
-  using ThreadblockMma = cutlass::gemm::threadblock::SparseMmaMultistage<
-      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
-      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
-      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
-      IteratorE, typename MmaCore::SmemIteratorE, MmaCore::kCacheOpE,
-      typename MmaCore::MmaPolicy, Stages>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace gemm
-} // namespace cutlass 
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_trmm.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_trmm.h
deleted file mode 100755
index 5e90f25c2..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_trmm.h
+++ /dev/null
@@ -1,445 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-// 
-/*! \file
-    \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K.
-*/
-
-#pragma once
-
-#include "cutlass/blas3.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/arch/wmma.h"
-
-#include "cutlass/layout/matrix.h"
-#include "cutlass/transform/threadblock/predicated_tile_iterator_triangular_matrix.h"
-#include "cutlass/gemm/threadblock/mma_blas3_multistage.h"
-#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
-#include "cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h"
-#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
-
-#if defined(CUTLASS_ARCH_WMMA_ENABLED)
-#include "cutlass/gemm/threadblock/default_mma_core_wmma.h"
-#endif //CUTLASS_ARCH_WMMA_ENABLED
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B matrix operand
-    typename LayoutB_,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Side Mode for the kernel
-    SideMode kSideMode,
-    /// Fill Mode for the triangular matrix
-    FillMode kFillMode,
-    /// Diag Type for the triangular matrix
-    DiagType kDiagType,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Operator class tag
-    typename OperatorClass_,
-    /// Tag indicating architecture to tune for
-    typename ArchTag_,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_,
-    /// Number of stages used in the pipelined mainloop
-    int Stages,
-    /// Operation perfomed by GEMM
-    typename Operator,
-    /// Store the accumulators in row major or column major.  Row major is used
-    /// when output layout is interleaved.
-    bool AccumulatorsInRowMajor = false
-    >
-struct DefaultTrmm;
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization for row-major output (OperatorClass TensorOp)
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Side Mode for the kernel
-    SideMode kSideMode,
-    /// Fill Mode for the triangular matrix
-    FillMode kFillMode,
-    /// Diag Type for the triangular matrix
-    DiagType kDiagType,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Number of stages used in the multistage mainloop
-    int Stages,
-    /// Operation perfomed by GEMM
-    typename Operator
-    >
-struct DefaultTrmm<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, 
-                  kSideMode, kFillMode, kDiagType, 
-                  ElementAccumulator, layout::RowMajor,
-                  arch::OpClassTensorOp, ArchTag, ThreadblockShape, WarpShape,
-                  InstructionShape, Stages, Operator, false> {
-
-  static cutlass::arch::CacheOperation::Kind const CacheOpA =
-      ((sizeof_bits<ElementA>::value * kAlignmentA) == 128)
-          ? cutlass::arch::CacheOperation::Global
-          : cutlass::arch::CacheOperation::Always;
-
-  static cutlass::arch::CacheOperation::Kind const CacheOpB =
-      ((sizeof_bits<ElementB>::value * kAlignmentB) == 128)
-          ? cutlass::arch::CacheOperation::Global
-          : cutlass::arch::CacheOperation::Always;
-
-  // Define the MmaCore components
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
-      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      Stages, Operator, false, CacheOpA, CacheOpB>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::Array<ElementA, kAlignmentA>;
-
-  using IteratorA =
-      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
-          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-          ElementA, LayoutA, 1, ThreadMapA, kSideMode, kFillMode, kDiagType, AccessTypeA>;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::Array<ElementB, kAlignmentB>;
-
-  using IteratorB =
-      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
-          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-          ElementB, LayoutB, 0, ThreadMapB, kSideMode, FillMode::kFull, DiagType::kInvalid, AccessTypeB>;
-  
-  // Define the threadblock-scoped multistage matrix multiply
-  using ThreadblockMma = cutlass::gemm::threadblock::MmaMultistage<
-      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
-      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
-      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
-      typename MmaCore::MmaPolicy, Stages, SharedMemoryClearOption::kZfill>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization for row-major output, right side mode (OperatorClass TensorOp)
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Fill Mode for the triangular matrix
-    FillMode kFillMode,
-    /// Diag Type for the triangular matrix
-    DiagType kDiagType,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Number of stages used in the multistage mainloop
-    int Stages,
-    /// Operation perfomed by GEMM
-    typename Operator
-    >
-struct DefaultTrmm<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, 
-                  SideMode::kRight, kFillMode, kDiagType, 
-                  ElementAccumulator, layout::RowMajor,
-                  arch::OpClassTensorOp, ArchTag, ThreadblockShape, WarpShape,
-                  InstructionShape, Stages, Operator, false> {
-
-  static cutlass::arch::CacheOperation::Kind const CacheOpA =
-      ((sizeof_bits<ElementA>::value * kAlignmentA) == 128)
-          ? cutlass::arch::CacheOperation::Global
-          : cutlass::arch::CacheOperation::Always;
-
-  static cutlass::arch::CacheOperation::Kind const CacheOpB =
-      ((sizeof_bits<ElementB>::value * kAlignmentB) == 128)
-          ? cutlass::arch::CacheOperation::Global
-          : cutlass::arch::CacheOperation::Always;
-
-  // Define the MmaCore components
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
-      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      Stages, Operator, false, CacheOpA, CacheOpB>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::Array<ElementA, kAlignmentA>;
-
-  using IteratorA =
-      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
-          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-          ElementA, LayoutA, 1, ThreadMapA, SideMode::kRight, FillMode::kFull, DiagType::kInvalid, AccessTypeA>;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::Array<ElementB, kAlignmentB>;
-
-  using IteratorB =
-      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
-          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-          ElementB, LayoutB, 0, ThreadMapB, SideMode::kRight, kFillMode, kDiagType, AccessTypeB>;
-
-  // Define the threadblock-scoped multistage matrix multiply
-  using ThreadblockMma = cutlass::gemm::threadblock::MmaMultistage<
-      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
-      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
-      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
-      typename MmaCore::MmaPolicy, Stages, SharedMemoryClearOption::kZfill>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization for row-major output with unit diagonal (OperatorClass TensorOp)
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Side Mode for the kernel
-    SideMode kSideMode,
-    /// Fill Mode for the triangular matrix
-    FillMode kFillMode,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Number of stages used in the multistage mainloop
-    int Stages,
-    /// Operation perfomed by GEMM
-    typename Operator
-    >
-struct DefaultTrmm<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, 
-                  kSideMode, kFillMode, DiagType::kUnit, 
-                  ElementAccumulator, layout::RowMajor,
-                  arch::OpClassTensorOp, ArchTag, ThreadblockShape, WarpShape,
-                  InstructionShape, Stages, Operator, false> {
-
-  static cutlass::arch::CacheOperation::Kind const CacheOpA =
-      ((sizeof_bits<ElementA>::value * kAlignmentA) == 128)
-          ? cutlass::arch::CacheOperation::Global
-          : cutlass::arch::CacheOperation::Always;
-
-  static cutlass::arch::CacheOperation::Kind const CacheOpB =
-      ((sizeof_bits<ElementB>::value * kAlignmentB) == 128)
-          ? cutlass::arch::CacheOperation::Global
-          : cutlass::arch::CacheOperation::Always;
-
-  // Define the MmaCore components
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
-      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      Stages, Operator, false, CacheOpA, CacheOpB>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::Array<ElementA, kAlignmentA>;
-
-  using IteratorA =
-      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
-          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-          ElementA, LayoutA, 1, ThreadMapA, kSideMode, kFillMode, DiagType::kUnit, AccessTypeA>;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::Array<ElementB, kAlignmentB>;
-
-  using IteratorB =
-      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
-          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-          ElementB, LayoutB, 0, ThreadMapB, kSideMode, FillMode::kFull, DiagType::kInvalid, AccessTypeB>;
-  
-  // Define the threadblock-scoped multistage matrix multiply
-  using ThreadblockMma = cutlass::gemm::threadblock::MmaBlas3Multistage<
-      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
-      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
-      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
-      typename MmaCore::MmaPolicy, Stages, SharedMemoryClearOption::kZfill>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization for row-major output, right side mode, unit diagonal (OperatorClass TensorOp)
-template <
-    /// Element type for A matrix operand
-    typename ElementA,
-    /// Layout type for A matrix operand
-    typename LayoutA,
-    /// Access granularity of A matrix in units of elements
-    int kAlignmentA,
-    /// Element type for B matrix operand
-    typename ElementB,
-    /// Layout type for B matrix operand
-    typename LayoutB,
-    /// Access granularity of B matrix in units of elements
-    int kAlignmentB,
-    /// Fill Mode for the triangular matrix
-    FillMode kFillMode,
-    /// Element type for internal accumulation
-    typename ElementAccumulator,
-    /// Tag indicating architecture to tune for
-    typename ArchTag,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape,
-    /// Number of stages used in the multistage mainloop
-    int Stages,
-    /// Operation perfomed by GEMM
-    typename Operator
-    >
-struct DefaultTrmm<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, 
-                  SideMode::kRight, kFillMode, DiagType::kUnit, 
-                  ElementAccumulator, layout::RowMajor,
-                  arch::OpClassTensorOp, ArchTag, ThreadblockShape, WarpShape,
-                  InstructionShape, Stages, Operator, false> {
-
-  static cutlass::arch::CacheOperation::Kind const CacheOpA =
-      ((sizeof_bits<ElementA>::value * kAlignmentA) == 128)
-          ? cutlass::arch::CacheOperation::Global
-          : cutlass::arch::CacheOperation::Always;
-
-  static cutlass::arch::CacheOperation::Kind const CacheOpB =
-      ((sizeof_bits<ElementB>::value * kAlignmentB) == 128)
-          ? cutlass::arch::CacheOperation::Global
-          : cutlass::arch::CacheOperation::Always;
-
-  // Define the MmaCore components
-  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
-      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      Stages, Operator, false, CacheOpA, CacheOpB>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
-  using AccessTypeA = cutlass::Array<ElementA, kAlignmentA>;
-
-  using IteratorA =
-      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
-          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-          ElementA, LayoutA, 1, ThreadMapA, SideMode::kRight, FillMode::kFull, DiagType::kInvalid, AccessTypeA>;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
-  using AccessTypeB = cutlass::Array<ElementB, kAlignmentB>;
-
-  using IteratorB =
-      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
-          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-          ElementB, LayoutB, 0, ThreadMapB, SideMode::kRight, kFillMode, DiagType::kUnit, AccessTypeB>;
-
-  // Define the threadblock-scoped multistage matrix multiply
-  using ThreadblockMma = cutlass::gemm::threadblock::MmaBlas3Multistage<
-      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
-      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
-      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
-      typename MmaCore::MmaPolicy, Stages, SharedMemoryClearOption::kZfill>;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace gemm
-} // namespace cutlass 
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/ell_mma_multistage.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/ell_mma_multistage.h
deleted file mode 100755
index 27f410ccd..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/ell_mma_multistage.h
+++ /dev/null
@@ -1,648 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a multistage threadblock-scoped Blocked-Ell MMA.
-*/
-
-#pragma once
-
-
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/arch/memory.h"
-#include "cutlass/array.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/numeric_types.h"
-
-#include "cutlass/gemm/threadblock/mma_base.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Structure to compute the matrix product targeting CUDA cores and SIMT math
-/// instructions.
-template <
-    /// Size of the Gemm problem - concept: gemm::GemmShape<>
-    typename Shape_,
-    /// Iterates over tiles of A operand in global memory
-    //  (concept: ReadableTileIterator | ForwardTileIterator |
-    //  MaskedTileIterator)
-    typename IteratorA_,
-    /// Iterates over tiles of A operand in shared memory
-    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-    typename SmemIteratorA_,
-    /// Cache operation for operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Iterates over tiles of B operand in global memory
-    //  (concept: ReadableTileIterator | ForwardTileIterator |
-    //  MaskedTileIterator)
-    typename IteratorB_,
-    /// Iterates over tiles of B operand in shared memory
-    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-    typename SmemIteratorB_,
-    /// Cache operation for operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB,
-    /// Data type of accumulator matrix
-    typename ElementC_,
-    /// Data type of accumulator matrix
-    typename LayoutC_,
-    /// Policy describing tuning details (concept: MmaPolicy)
-    typename Policy_,
-    /// Number of stages,
-    int Stages,
-    /// Used for partial specialization
-    typename Enable = bool>
-class EllMmaMultistage : 
-  public MmaBase<Shape_, Policy_, Stages> {
-public:
-  ///< Base class
-  using Base = MmaBase<Shape_, Policy_, Stages>;
-  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-  ///< Iterates over tiles of A operand in global memory
-  using IteratorA = IteratorA_;
-  ///< Iterates over tiles of B operand in global memory
-  using IteratorB = IteratorB_;
-  ///< Data type of accumulator matrix
-  using ElementC = ElementC_;
-  ///< Layout of accumulator matrix
-  using LayoutC = LayoutC_;
-  ///< Policy describing tuning details
-  using Policy = Policy_;
-
-  using SmemIteratorA = SmemIteratorA_;
-  using SmemIteratorB = SmemIteratorB_;
-
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
-
-  using EllIterator = typename cutlass::transform::threadblock::ell::Iterator;
-
-  //
-  // Dependent types
-  //
-
-  /// Fragment of accumulator tile
-  using FragmentC = typename Policy::Operator::FragmentC;
-
-  /// Warp-level Mma
-  using Operator = typename Policy::Operator;
-
-  /// Minimum architecture is Sm80 to support cp.async
-  using ArchTag = arch::Sm80;
-  
-  /// Complex transform on A operand
-  static ComplexTransform const kTransformA = Operator::kTransformA;
-
-  /// Complex transform on B operand
-  static ComplexTransform const kTransformB = Operator::kTransformB;
-
-  /// Internal structure exposed for introspection.
-  struct Detail {
-
-    static_assert(Base::kWarpGemmIterations > 1,
-                  "The pipelined structure requires at least two warp-level "
-                  "GEMM operations.");
-
-    /// Number of cp.async instructions to load one stage of operand A
-    static int const AsyncCopyIterationsPerStageA =
-        IteratorA::ThreadMap::Iterations::kCount;
-
-    /// Number of cp.async instructions to load one stage of operand B
-    static int const AsyncCopyIterationsPerStageB =
-        IteratorB::ThreadMap::Iterations::kCount;
-
-    /// Number of stages
-    static int const kStages = Stages;
-
-    /// Number of cp.async instructions to load on group of operand A
-    static int const kAccessesPerGroupA =
-        (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
-
-    /// Number of cp.async instructions to load on group of operand B
-    static int const kAccessesPerGroupB =
-        (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
-  };
-
- private:
-
-  using WarpLoadedFragmentA = typename Operator::FragmentA;
-  using WarpLoadedFragmentB = typename Operator::FragmentB;
-  using WarpTransformedFragmentA = typename Operator::TransformedFragmentA;
-  using WarpTransformedFragmentB = typename Operator::TransformedFragmentB;
-
- private:
-
-  //
-  // Data members
-  //
-
-  /// Iterator to write threadblock-scoped tile of A operand to shared memory
-  SmemIteratorA smem_iterator_A_;
-
-  /// Iterator to write threadblock-scoped tile of B operand to shared memory
-  SmemIteratorB smem_iterator_B_;
-
-public:
-
-  /// Construct from tensor references
-  CUTLASS_DEVICE
-  EllMmaMultistage(
-      ///< Shared storage needed for internal use by threadblock-scoped GEMM
-      typename Base::SharedStorage &shared_storage,
-      ///< ID within the threadblock
-      int thread_idx,
-      ///< ID of warp
-      int warp_idx,
-      ///< ID of each thread within a warp
-      int lane_idx
-    ):
-      Base(shared_storage, thread_idx, warp_idx, lane_idx),
-      smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
-      smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx)
-  {
-    // Compute warp location within threadblock tile by mapping the warp_id to
-    // three coordinates:
-    //   _m: the warp's position within the threadblock along the M dimension
-    //   _n: the warp's position within the threadblock along the N dimension
-    //   _k: the warp's position within the threadblock along the K dimension
-
-    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
-    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
-
-    int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
-    int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
-
-    // Add per-warp offsets in units of warp-level tiles
-    this->warp_tile_iterator_A_.add_tile_offset(
-        {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
-    this->warp_tile_iterator_B_.add_tile_offset(
-        {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
-  }
-
-  template<bool is_A_sparse, bool is_offset_constant>
-  CUTLASS_DEVICE
-  void copy_tiles_and_advance(IteratorA &iterator_A, IteratorB &iterator_B, EllIterator &ell_iter,
-                              int group_start_A = 0, int group_start_B = 0) {
-    iterator_A.set_iteration_index(group_start_A *
-                                   IteratorA::kAccessesPerVector);
-    this->smem_iterator_A_.set_iteration_index(group_start_A);
-
-    // Async Copy for operand A
-    CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) {
-      if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) {
-        typename IteratorA::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorA::AccessType *>(
-                this->smem_iterator_A_.get());
-
-        int const kSrcBytes = sizeof_bits<typename IteratorA::Element>::value *
-                              IteratorA::ThreadMap::kElementsPerAccess /
-                              IteratorA::kAccessesPerVector / 8;
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
-          auto gmem_ptr = iterator_A.get();
-          bool is_valid = iterator_A.valid();
-
-          if (!is_A_sparse){
-            if (is_offset_constant){
-              auto ell_offset = ell_iter.get_offset_fast();
-              is_valid = is_valid && (ell_offset >= 0);
-              gmem_ptr +=  ell_offset * sizeof(IteratorA::Element) / kSrcBytes;
-            } else {
-              int k_offset = iterator_A.get_k();
-              auto ell_offset = ell_iter.get_offset(k_offset);
-              is_valid = is_valid && (ell_offset >= 0);
-              gmem_ptr += (ell_offset * sizeof(IteratorA::Element)) / kSrcBytes;
-            }
-          }
-
-          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
-              dst_ptr + v, gmem_ptr, is_valid);
-
-          ++iterator_A;
-        }
-
-        ++this->smem_iterator_A_;
-      }
-    }
-
-    iterator_B.set_iteration_index(group_start_B *
-                                   IteratorB::kAccessesPerVector);
-    this->smem_iterator_B_.set_iteration_index(group_start_B);
-
-    // Async Copy for operand B
-    CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) {
-      if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) {
-        typename IteratorB::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorB::AccessType *>(
-                this->smem_iterator_B_.get());
-
-        int const kSrcBytes = sizeof_bits<typename IteratorB::Element>::value *
-                              IteratorB::ThreadMap::kElementsPerAccess /
-                              IteratorB::kAccessesPerVector / 8;
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
-          auto gmem_ptr = iterator_B.get();
-          bool is_valid = iterator_B.valid();
-
-          if (is_A_sparse){
-            if (is_offset_constant){
-              auto ell_offset = ell_iter.get_offset_fast();
-              is_valid = is_valid && (ell_offset >= 0);
-              gmem_ptr += ell_offset * sizeof(IteratorB::Element) / kSrcBytes;
-            } else {
-              int k_offset = iterator_B.get_k();
-              auto ell_offset = ell_iter.get_offset(k_offset);
-              is_valid = is_valid && (ell_offset >= 0);
-              gmem_ptr += ( ell_offset * sizeof(IteratorB::Element)) / kSrcBytes;
-            }
-          }
-
-          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
-              dst_ptr + v, gmem_ptr, is_valid);
-
-          ++iterator_B;
-        }
-        ++this->smem_iterator_B_;
-      }
-    }
-  }
-
-
-  /// Perform a threadblock-scoped matrix multiply-accumulate
-  template<bool is_A_sparse, bool is_offset_constant>
-  CUTLASS_DEVICE
-  void operator()(
-      ///< problem size of GEMM
-      int gemm_k_iterations,
-      ///< destination accumulator tile
-      FragmentC &accum,
-      ///< iterator over A operand in global memory
-      IteratorA iterator_A,
-      ///< iterator over B operand in global memory
-      IteratorB iterator_B,
-      ///< initial value of accumulator
-      FragmentC const &src_accum,
-      EllIterator &ell_iterator
-      ) {
-    //
-    // Prologue
-    //
-
-    // Issue several complete stages
-    CUTLASS_PRAGMA_UNROLL
-    for (int stage = 0; stage < Base::kStages - 1;
-         ++stage, --gemm_k_iterations) {
-
-      iterator_A.clear_mask(gemm_k_iterations == 0);
-      iterator_B.clear_mask(gemm_k_iterations == 0);
-
-      iterator_A.set_iteration_index(0);
-      this->smem_iterator_A_.set_iteration_index(0);
-
-      // Async Copy for operand A
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) {
-        typename IteratorA::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorA::AccessType *>(
-                this->smem_iterator_A_.get());
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
-          int const kSrcBytes =
-              sizeof_bits<typename IteratorA::Element>::value *
-              IteratorA::ThreadMap::kElementsPerAccess /
-              IteratorA::kAccessesPerVector / 8;
-
-          auto gmem_ptr = iterator_A.get();
-          bool is_valid = iterator_A.valid();
-
-          if (!is_A_sparse){
-            if (is_offset_constant){
-              auto ell_offset = ell_iterator.get_offset_fast();
-              is_valid = is_valid && (ell_offset >= 0);
-              gmem_ptr +=  ell_offset * sizeof(IteratorA::Element) / kSrcBytes;
-            } else {
-              int k_offset = iterator_A.get_k();
-              auto ell_offset = ell_iterator.get_offset(k_offset);
-              is_valid = is_valid && (ell_offset >= 0);
-              gmem_ptr += (ell_offset * sizeof(IteratorA::Element)) / kSrcBytes;
-            }
-          }
-
-          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
-              dst_ptr + v, gmem_ptr, is_valid);
-
-          ++iterator_A;
-        }
-
-        ++this->smem_iterator_A_;
-      }
-
-      iterator_B.set_iteration_index(0);
-      this->smem_iterator_B_.set_iteration_index(0);
-
-      // Async Copy for operand B
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) {
-        typename IteratorB::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorB::AccessType *>(
-                this->smem_iterator_B_.get());
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
-          int const kSrcBytes =
-              sizeof_bits<typename IteratorB::Element>::value *
-              IteratorB::ThreadMap::kElementsPerAccess /
-              IteratorB::kAccessesPerVector / 8;
-          
-          auto gmem_ptr = iterator_B.get();
-          bool is_valid = iterator_B.valid();
-          
-          if (is_A_sparse){
-            if (is_offset_constant){
-              auto ell_offset = ell_iterator.get_offset_fast();
-              is_valid = is_valid && (ell_offset >= 0);
-              gmem_ptr += ell_offset * sizeof(IteratorB::Element) / kSrcBytes;
-            } else {
-              int k_offset = iterator_B.get_k();
-              auto ell_offset = ell_iterator.get_offset(k_offset);
-              is_valid = is_valid && (ell_offset >= 0);
-              gmem_ptr += ( ell_offset * sizeof(IteratorB::Element)) / kSrcBytes;
-            }
-          }
-
-          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
-              dst_ptr + v, gmem_ptr, is_valid);
-
-          ++iterator_B;
-        }
-
-        ++this->smem_iterator_B_;
-      }
-
-      // Move to the next stage
-      iterator_A.add_tile_offset({0, 1});
-      iterator_B.add_tile_offset({1, 0});
-      ++ell_iterator;
-      
-      this->smem_iterator_A_.add_tile_offset({0, 1});
-      this->smem_iterator_B_.add_tile_offset({1, 0});
-
-      // Defines the boundary of a stage of cp.async.
-      cutlass::arch::cp_async_fence();
-    }
-
-    // Perform accumulation in the 'd' output operand
-    accum = src_accum;
-
-    // Waits until kStages-2 stages have committed.
-    cutlass::arch::cp_async_wait<Base::kStages - 2>();
-    __syncthreads();
-
-    // Pair of fragments used to overlap shared memory loads and math
-    // instructions
-    WarpLoadedFragmentA warp_loaded_frag_A[2];
-    WarpLoadedFragmentB warp_loaded_frag_B[2];
-    WarpTransformedFragmentA warp_transformed_frag_A[2];
-    WarpTransformedFragmentB warp_transformed_frag_B[2];
-
-    Operator warp_mma;
-
-    this->warp_tile_iterator_A_.set_kgroup_index(0);
-    this->warp_tile_iterator_B_.set_kgroup_index(0);
-
-    this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]);
-    this->warp_tile_iterator_B_.load(warp_loaded_frag_B[0]);
-
-    ++this->warp_tile_iterator_A_;
-    ++this->warp_tile_iterator_B_;
-
-    iterator_A.clear_mask(gemm_k_iterations == 0);
-    iterator_B.clear_mask(gemm_k_iterations == 0);
-
-    if (is_A_sparse){
-      iterator_A.ell_add_mask(ell_iterator.get_blocksize());
-    }
-    else {
-      iterator_B.ell_add_mask(ell_iterator.get_blocksize());
-    }
-
-    int smem_write_stage_idx = Base::kStages - 1;
-    int smem_read_stage_idx = 0;
-
-    warp_mma.transform(warp_transformed_frag_A[0], warp_transformed_frag_B[0],
-                       warp_loaded_frag_A[0], warp_loaded_frag_B[0]);
-
-    // tf32x3 kernels use staging accumulation. warp_mma uses a temporary
-    // accumulator and this temporary accumulator is added to the final
-    // accumulator once in every mainloop iteration.
-    plus<FragmentC> plus_accum;
-
-    FragmentC tmp_accum;
-
-    if (platform::is_same<typename Operator::MathOperator,
-                          arch::OpMultiplyAddFastF32>::value
-      || platform::is_same<typename Operator::MathOperator,
-                           arch::OpMultiplyAddComplexFastF32>::value) {
-
-      tmp_accum.clear();
-    }
-
-    //
-    // Mainloop
-    //
-
-    CUTLASS_GEMM_LOOP
-    for (; gemm_k_iterations > (-Base::kStages + 1);) {
-      //
-      // Loop over GEMM K dimension
-      //
-
-      // Computes a warp-level GEMM on data held in shared memory
-      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
-      CUTLASS_PRAGMA_UNROLL
-      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations;
-           ++warp_mma_k) {
-
-        // Load warp-level tiles from shared memory, wrapping to k offset if
-        // this is the last group as the case may be.
-
-        this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
-        this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
-        
-        this->warp_tile_iterator_A_.load(warp_loaded_frag_A[(warp_mma_k + 1) % 2]);
-        this->warp_tile_iterator_B_.load(warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
-
-        ++this->warp_tile_iterator_A_;
-        ++this->warp_tile_iterator_B_;
-
-        if (warp_mma_k > 0)
-          warp_mma.transform(warp_transformed_frag_A[warp_mma_k % 2],
-                             warp_transformed_frag_B[warp_mma_k % 2],
-                             warp_loaded_frag_A[warp_mma_k % 2],
-                             warp_loaded_frag_B[warp_mma_k % 2]);
-
-        if (platform::is_same<typename Operator::MathOperator,
-                              arch::OpMultiplyAddFastF32>::value
-          || platform::is_same<typename Operator::MathOperator,
-                               arch::OpMultiplyAddComplexFastF32>::value) {
-
-          warp_mma(
-            tmp_accum, 
-            warp_transformed_frag_A[warp_mma_k % 2],
-            warp_transformed_frag_B[warp_mma_k % 2], 
-            tmp_accum
-          );
-
-          if (warp_mma_k == 0) {
-            accum = plus_accum(accum, tmp_accum);
-            tmp_accum.clear();
-          }
-        } else {
-          warp_mma(
-            accum, 
-            warp_transformed_frag_A[warp_mma_k % 2],
-            warp_transformed_frag_B[warp_mma_k % 2], 
-            accum
-          );
-        }
-
-        // Issue global->shared copies for the this stage
-        if (warp_mma_k < Base::kWarpGemmIterations - 1) {
-          int group_start_iteration_A, group_start_iteration_B;
-
-          group_start_iteration_A = warp_mma_k * Detail::kAccessesPerGroupA;
-          group_start_iteration_B = warp_mma_k * Detail::kAccessesPerGroupB;
-
-          copy_tiles_and_advance<is_A_sparse, is_offset_constant>(
-              iterator_A, iterator_B, ell_iterator, group_start_iteration_A, 
-                               group_start_iteration_B);
-        }
-
-        if (warp_mma_k + 2 == Base::kWarpGemmIterations) {
-          int group_start_iteration_A, group_start_iteration_B;
-          group_start_iteration_A =
-              (warp_mma_k + 1) * Detail::kAccessesPerGroupA;
-          group_start_iteration_B =
-              (warp_mma_k + 1) * Detail::kAccessesPerGroupB;
-
-          copy_tiles_and_advance<is_A_sparse, is_offset_constant>(
-              iterator_A, iterator_B, ell_iterator, group_start_iteration_A, 
-                               group_start_iteration_B);
-
-          // Inserts a memory fence between stages of cp.async instructions.
-          cutlass::arch::cp_async_fence();
-
-          // Waits until kStages-2 stages have committed.
-          arch::cp_async_wait<Base::kStages - 2>();
-          __syncthreads();
-
-          // Move to the next stage
-          iterator_A.add_tile_offset({0, 1});
-          iterator_B.add_tile_offset({1, 0});
-          ++ell_iterator;
-
-          this->smem_iterator_A_.add_tile_offset({0, 1});
-          this->smem_iterator_B_.add_tile_offset({1, 0});
-
-          // Add negative offsets to return iterators to the 'start' of the
-          // circular buffer in shared memory
-          if (smem_write_stage_idx == (Base::kStages - 1)) {
-            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
-            this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
-            smem_write_stage_idx = 0;
-          } else {
-            ++smem_write_stage_idx;
-          }
-
-          if (smem_read_stage_idx == (Base::kStages - 1)) {
-            this->warp_tile_iterator_A_.add_tile_offset(
-                {0, -Base::kStages * Policy::kPartitionsK *
-                        Base::kWarpGemmIterations});
-            this->warp_tile_iterator_B_.add_tile_offset(
-                {-Base::kStages * Policy::kPartitionsK *
-                     Base::kWarpGemmIterations,
-                 0});
-            smem_read_stage_idx = 0;
-          } else {
-            ++smem_read_stage_idx;
-          }
-
-          --gemm_k_iterations;
-          iterator_A.clear_mask(gemm_k_iterations == 0);
-          iterator_B.clear_mask(gemm_k_iterations == 0);
-        }
-
-        // Do any conversions feeding the first stage at the end of the loop so
-        // we can start right away on mma instructions
-        if (warp_mma_k + 1 == Base::kWarpGemmIterations)
-          warp_mma.transform(warp_transformed_frag_A[(warp_mma_k + 1) % 2],
-                             warp_transformed_frag_B[(warp_mma_k + 1) % 2],
-                             warp_loaded_frag_A[(warp_mma_k + 1) % 2],
-                             warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
-      }
-
-    }
-
-    if (platform::is_same<typename Operator::MathOperator,
-                          arch::OpMultiplyAddFastF32>::value
-      || platform::is_same<typename Operator::MathOperator,
-                           arch::OpMultiplyAddComplexFastF32>::value) {
-      accum = plus_accum(accum, tmp_accum); 
-    }
-
-
-    // Commit and drain all pending and predicated cp.async pnz from the GEMM mainloop
-    cutlass::arch::cp_async_fence();
-    cutlass::arch::cp_async_wait<0>();
-    __syncthreads();
-
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace gemm
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/ell_mma_pipelined.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/ell_mma_pipelined.h
deleted file mode 100755
index 55a951e1d..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/ell_mma_pipelined.h
+++ /dev/null
@@ -1,376 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a double-buffered threadblock-scoped Blocked-Ell MMA.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/numeric_conversion.h"
-
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/threadblock/mma_base.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions.
-template <
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  typename Shape_,
-  /// Iterates over tiles of A operand in global memory 
-  //  (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
-  typename IteratorA_,
-  /// Iterates over tiles of A operand in shared memory
-  /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-  typename SmemIteratorA_,
-  /// Iterates over tiles of B operand in global memory
-  //  (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
-  typename IteratorB_,
-  /// Iterates over tiles of B operand in shared memory
-  /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-  typename SmemIteratorB_,
-  /// Data type of accumulator matrix
-  typename ElementC_,
-  /// Data type of accumulator matrix
-  typename LayoutC_,
-  /// Policy describing tuning details (concept: MmaPolicy)
-  typename Policy_,
-  /// Transformation applied to A operand
-  typename TransformA_ = NumericArrayConverter<
-    typename SmemIteratorA_::Element, 
-    typename IteratorA_::Element, 
-    IteratorA_::Fragment::kElements>,
-  ///
-  /// Transformation applied to B operand
-  typename TransformB_ = NumericArrayConverter<
-    typename SmemIteratorB_::Element, 
-    typename IteratorB_::Element, 
-    IteratorB_::Fragment::kElements>,
-  /// Used for partial specialization
-  typename Enable = bool
->
-class EllMmaPipelined : public MmaBase<Shape_, Policy_, 2> {
-public:
-
-  ///< Base class
-  using Base = MmaBase<Shape_, Policy_, 2>;
-
-  using Shape = Shape_;             ///< Size of the Gemm problem - concept: gemm::GemmShape<>
-  using IteratorA = IteratorA_;     ///< Iterates over tiles of A operand in global memory
-  using IteratorB = IteratorB_;     ///< Iterates over tiles of B operand in global memory
-  using ElementC = ElementC_;       ///< Data type of accumulator matrix
-  using LayoutC = LayoutC_;         ///< Layout of accumulator matrix
-  using Policy = Policy_;           ///< Policy describing tuning details
-
-  using SmemIteratorA = SmemIteratorA_;
-  using SmemIteratorB = SmemIteratorB_;
-
-  using TransformA = TransformA_;
-  using TransformB = TransformB_;
-
-  //
-  // Dependent types
-  //
-
-  /// Fragment of operand A loaded from global memory
-  using FragmentA = typename IteratorA::Fragment;
-
-  /// Fragment of operand B loaded from global memory
-  using FragmentB = typename IteratorB::Fragment;
-
-  /// Fragment of accumulator tile
-  using FragmentC = typename Policy::Operator::FragmentC;
-
-  /// Warp-level Mma
-  using Operator = typename Policy::Operator;
-
-  /// Obtain the arch tag from the warp-level operator
-  using ArchTag = typename Policy::Operator::ArchTag;
-
-  /// Complex transform on A operand
-  static ComplexTransform const kTransformA = Operator::kTransformA;
-
-  /// Complex transform on B operand
-  static ComplexTransform const kTransformB = Operator::kTransformB;
-
-  // staticaly assert kStages for EllMmaPipelined is two (Double-buffered pipeline)
-  static_assert((Base::kStages==2), "EllMmaPipelined requires kStages set to value 2");
-
-private:
-
-  using WarpFragmentA = typename Operator::FragmentA;
-  using WarpFragmentB = typename Operator::FragmentB;
-
-protected:
-
-  /// Iterator to write threadblock-scoped tile of A operand to shared memory
-  SmemIteratorA smem_iterator_A_;
-
-  /// Iterator to write threadblock-scoped tile of B operand to shared memory
-  SmemIteratorB smem_iterator_B_;
-
-  using EllIterator = typename cutlass::transform::threadblock::ell::Iterator;
-
-public:
-  /// Construct from tensor references
-  CUTLASS_DEVICE
-  EllMmaPipelined(
-    typename Base::SharedStorage &shared_storage,       ///< Shared storage needed for internal use by threadblock-scoped GEMM
-    int thread_idx,                                     ///< ID within the threadblock
-    int warp_idx,                                       ///< ID of warp
-    int lane_idx                                        ///< ID of each thread within a warp
-  ):
-    Base(shared_storage, thread_idx, warp_idx, lane_idx),
-    smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
-    smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) {
-
-    // Compute warp location within threadblock tile by mapping the warp_id to
-    // three coordinates:
-    //   _m: the warp's position within the threadblock along the M dimension
-    //   _n: the warp's position within the threadblock along the N dimension
-    //   _k: the warp's position within the threadblock along the K dimension
-
-    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
-    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
-
-    int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
-    int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
-
-    // Add per-warp offsets in units of warp-level tiles
-    this->warp_tile_iterator_A_.add_tile_offset({warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
-    this->warp_tile_iterator_B_.add_tile_offset({Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
-    
-  }
-
-  /// Perform a threadblock-scoped matrix multiply-accumulate
-  template<bool is_A_sparse, bool is_offset_constant>
-  CUTLASS_DEVICE
-  void operator()(
-    int gemm_k_iterations,                            ///< number of iterations of the mainloop
-    FragmentC &accum,                                 ///< destination accumulator tile
-    IteratorA iterator_A,                             ///< iterator over A operand in global memory
-    IteratorB iterator_B,                             ///< iterator over B operand in global memory
-    FragmentC const &src_accum,                       ///< source accumulator tile
-    EllIterator &ell_iterator,
-    TransformA transform_A = TransformA(),            ///< transformation applied to A fragment
-    TransformB transform_B = TransformB()) {          ///< transformation applied to B fragment
-
-    //
-    // Prologue
-    //
-
-    // Perform accumulation in the 'd' output operand
-    accum = src_accum;
-
-    FragmentA tb_frag_A;
-    FragmentB tb_frag_B;
-
-    tb_frag_A.clear();
-    tb_frag_B.clear();
-
-    // load sparse matrix  
-    if (is_A_sparse){
-      iterator_A.load(tb_frag_A);
-    } else {
-      iterator_B.load(tb_frag_B);
-    }
-    
-    // load dense matrix
-    if (is_offset_constant){
-      if (is_A_sparse){
-        iterator_B.load_with_ell_index_fast(tb_frag_B, ell_iterator);
-      } else {
-        iterator_A.load_with_ell_index_fast(tb_frag_A, ell_iterator);
-      }
-    } else {
-      if (is_A_sparse){
-        iterator_B.load_with_ell_index(tb_frag_B, ell_iterator);
-      } else {
-        iterator_A.load_with_ell_index(tb_frag_A, ell_iterator);
-      }
-    }
-
-    ++iterator_A;
-    ++iterator_B;
-    ++ell_iterator;
-
-    this->smem_iterator_A_.store(transform_A(tb_frag_A));
-    this->smem_iterator_B_.store(transform_B(tb_frag_B));
-
-    ++this->smem_iterator_A_;
-    ++this->smem_iterator_B_;
-
-    __syncthreads();
-
-    // Pair of fragments used to overlap shared memory loads and math instructions
-    WarpFragmentA warp_frag_A[2];
-    WarpFragmentB warp_frag_B[2];
-
-    this->warp_tile_iterator_A_.set_kgroup_index(0);
-    this->warp_tile_iterator_B_.set_kgroup_index(0);
-
-    this->warp_tile_iterator_A_.load(warp_frag_A[0]);
-    this->warp_tile_iterator_B_.load(warp_frag_B[0]);
-
-    ++this->warp_tile_iterator_A_;
-    ++this->warp_tile_iterator_B_;
-
-    Operator warp_mma;
-
-    int smem_write_stage_idx = 1;
-
-    // Avoid reading out of bounds
-    iterator_A.clear_mask(gemm_k_iterations <= 1);
-    iterator_B.clear_mask(gemm_k_iterations <= 1);
-
-    if (is_A_sparse){
-      iterator_A.ell_add_mask(ell_iterator.get_blocksize());
-    }
-    else {
-      iterator_B.ell_add_mask(ell_iterator.get_blocksize());
-    }
-
-    // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing 
-    // shared memory loads (which have the tightest latency requirement).
-
-    //
-    // Mainloop
-    //
-
-    // Note: The main loop does not support Base::kWarpGemmIterations == 2.
-    CUTLASS_GEMM_LOOP
-    for (; gemm_k_iterations > 0; --gemm_k_iterations) {
-      //
-      // Loop over GEMM K dimension
-      //
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) {
-
-        // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group
-        // as the case may be.
-
-        if (warp_mma_k == Base::kWarpGemmIterations - 1) {
-
-          // Write fragments to shared memory
-          this->smem_iterator_A_.store(transform_A(tb_frag_A));
-
-          this->smem_iterator_B_.store(transform_B(tb_frag_B));
-
-          __syncthreads();
-          
-          ++this->smem_iterator_A_;
-          ++this->smem_iterator_B_;
-
-          // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
-          if (smem_write_stage_idx == 1) {
-            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
-            this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
-          }
-          else {
-            this->warp_tile_iterator_A_.add_tile_offset(
-                {0, -Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations});
-            this->warp_tile_iterator_B_.add_tile_offset(
-                {-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations,
-                 0});
-          }
-
-          smem_write_stage_idx ^= 1;
-        }
-
-        this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
-        this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
-        
-        this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k + 1) % 2]);
-        this->warp_tile_iterator_B_.load(warp_frag_B[(warp_mma_k + 1) % 2]);
-
-        ++this->warp_tile_iterator_A_;
-        ++this->warp_tile_iterator_B_;
-
-        if (warp_mma_k == 0) {
-          // load sparse matrix  
-          if (is_A_sparse){
-            iterator_A.load(tb_frag_A);
-          } else {
-            iterator_B.load(tb_frag_B);
-          }
-
-          // load dense matrix
-          if (is_offset_constant){
-            if (is_A_sparse){
-              iterator_B.load_with_ell_index_fast(tb_frag_B, ell_iterator);
-            } else {
-              iterator_A.load_with_ell_index_fast(tb_frag_A, ell_iterator);
-            }
-          } else {
-            if (is_A_sparse){
-              iterator_B.load_with_ell_index(tb_frag_B, ell_iterator);
-            } else {
-              iterator_A.load_with_ell_index(tb_frag_A, ell_iterator);
-            }
-          }
-
-          ++iterator_A;
-          ++iterator_B;
-          ++ell_iterator;
-
-          // Avoid reading out of bounds if this was the last loop iteration
-          iterator_A.clear_mask(gemm_k_iterations <= 2);
-          iterator_B.clear_mask(gemm_k_iterations <= 2);
-        }
-
-        warp_mma(accum, warp_frag_A[warp_mma_k % 2],
-                 warp_frag_B[warp_mma_k % 2], accum);
-      }
-    }
-
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/gemv.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/gemv.h
deleted file mode 100755
index e246ddce6..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/gemv.h
+++ /dev/null
@@ -1,147 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Template for a threadblock-scoped GEMV kernel.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-
-#include "cutlass/gemm/gemm.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Structure to compute the matrix-vector product using SIMT math instructions.
-template <
-  class Core_ //< GemvCore
->
-class Gemv {
-public:
-  using Shape = typename Core_::Shape;
-
-  /// The MMA operator that computes GEMV 
-  using Operator = typename Core_::Operator;
-
-  /// Iterates over A in global memory
-  using IteratorA = typename Core_::IteratorA;
-
-  /// Iterates over B in global memory
-  using IteratorB = typename Core_::IteratorB;
-
-  /// Fragment of operand C loaded from global memory
-  using IteratorC = typename Core_::IteratorC;
-
-  /// Fragment of operand A loaded from global memory
-  using FragmentA = typename IteratorA::Fragment;
-
-  /// Fragment of operand B loaded from global memory
-  using FragmentB = typename IteratorB::Fragment;
-
-  /// Fragment of operand accumulator loaded/stored to global memory
-  using FragmentC = typename Operator::FragmentC;
-
-  /// Shape of the per-thread GEMV operation
-  using ThreadShape = typename Core_::ThreadShape;
-
-public:
-  CUTLASS_DEVICE
-  Gemv() { }
-
-  CUTLASS_DEVICE
-  void operator()(
-    GemmCoord const &problem_size,    ///< problem size of batched GEMV
-    FragmentC &accum,                 ///< destination accumulator tile
-    IteratorA iterator_A,             ///< iterator over A operand in global memory
-    IteratorB iterator_B,             ///< iterator over B operand in global memory
-    FragmentC const &src_accum) {     ///< source accumualtor tile
-
-    //
-    // Prologue
-    //
-
-    FragmentA frag_A;
-    FragmentB frag_B;
-    frag_A.clear();
-    frag_B.clear();
-
-    iterator_A.load(frag_A);
-    iterator_B.load(frag_B);
-    ++iterator_A;
-    ++iterator_B;
-
-    //
-    // Mainloop
-    //
-    Operator thread_mma;
-    int gemm_k = problem_size.k();
-
-    if (gemm_k < Shape::kK)
-    {
-      iterator_A.clear_mask();
-      iterator_B.clear_mask();
-    }
-
-    // iterate over K to accumulate result
-    CUTLASS_GEMM_LOOP
-    for (; gemm_k > 0; gemm_k -= Shape::kK) {
-      thread_mma(accum, frag_A, frag_B, accum);
-
-      iterator_A.load(frag_A);
-      iterator_B.load(frag_B);
-      ++iterator_A;
-      ++iterator_B;
-
-      if (gemm_k < Shape::kK)
-      {
-        iterator_A.clear_mask();
-        iterator_B.clear_mask();
-      }
-    }
-
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace gemm
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/index_remat.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/index_remat.h
deleted file mode 100755
index 8370f6145..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/index_remat.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Helpers for rematerializing indices/dimensions in the thread hierarchy from special registers
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Helper to rematerialize block Idx. Reduces register liveness.
-CUTLASS_DEVICE
-int RematerializeThreadIdxX() {
-  return threadIdx.x;
-}
-
-/// Helper to rematerialize block Idx. Reduces register liveness.
-CUTLASS_DEVICE
-int RematerializeThreadIdxY() {
-  return threadIdx.y;
-}
-
-/// Helper to rematerialize block Idx. Reduces register liveness.
-CUTLASS_DEVICE
-int RematerializeThreadIdxZ() {
-  return threadIdx.z;
-}
-
-/// Helper to rematerialize block Idx. Reduces register liveness.
-CUTLASS_DEVICE
-int RematerializeBlockIdxX() {
-  return blockIdx.x;
-}
-
-/// Helper to rematerialize block Idx. Reduces register liveness.
-CUTLASS_DEVICE
-int RematerializeBlockIdxY() {
-  return blockIdx.y;
-}
-
-/// Helper to rematerialize block Idx. Reduces register liveness.
-CUTLASS_DEVICE
-int RematerializeBlockIdxZ() {
-  return blockIdx.z;
-}
-
-/// Helper to rematerialize block Dim. Reduces register liveness.
-CUTLASS_DEVICE
-int RematerializeBlockDimX() {
-  return blockDim.x;
-}
-
-/// Helper to rematerialize block Dim. Reduces register liveness.
-CUTLASS_DEVICE
-int RematerializeBlockDimY() {
-  return blockDim.y;
-}
-
-/// Helper to rematerialize block Dim. Reduces register liveness.
-CUTLASS_DEVICE
-int RematerializeBlockDimZ() {
-  return blockDim.z;
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace gemm
-} // namespace cutlass
-
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_base.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_base.h
deleted file mode 100755
index 16ec65688..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_base.h
+++ /dev/null
@@ -1,236 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
-*/
-
-#pragma once
-
-#include "cutlass/tensor_ref.h"
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/arch/memory.h"
-#include "cutlass/array.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/numeric_types.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Policy object describing MmaTensorOp
-template <
-    /// Warp-level GEMM operator (concept: gemm::warp::Mma)
-    typename Operator_,
-    /// Padding used for A operand in shared memory (concept: MatrixShape)
-    typename SmemPaddingA_,
-    /// Padding used for B operand in shared memory (concept: MatrixShape)
-    typename SmemPaddingB_,
-    /// Number of partitions of K dimension of GEMM
-    int PartitionsK = 1>
-struct MmaPolicy {
-  /// Warp-level GEMM operator (concept: gemm::warp::MmaTensorOp or gemm::warp::MmaSimt)
-  using Operator = Operator_;
-
-  /// Padding used for A operand in shared memory
-  using SmemPaddingA = SmemPaddingA_;
-
-  /// Padding used for B operand in shared memory
-  using SmemPaddingB = SmemPaddingB_;
-
-  /// Number of partitions of K dimension
-  static int const kPartitionsK = PartitionsK;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Structure to compute the matrix product targeting CUDA cores and SIMT math
-/// instructions.
-template <
-    /// Size of the Gemm problem - concept: gemm::GemmShape<>
-    typename Shape_,
-    /// Policy describing tuning details (concept: MmaPolicy)
-    typename Policy_,
-    /// Number of stages,
-    int Stages,
-    /// Used for partial specialization
-    typename Enable = bool>
-class MmaBase {
- public:
-  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-
-  ///< Policy describing tuning details
-  using Policy = Policy_;
-
-  //
-  // Dependent types
-  //
-
-  /// Warp-level Mma
-  using Operator = typename Policy::Operator;
-
-  /// Shape describing the overall GEMM computed from shared memory
-  /// by each warp.
-  using WarpGemm = typename Policy::Operator::Shape;
-
-  /// Shape describing the number of warps filling the CTA
-  using WarpCount = GemmShape<Shape::kM / WarpGemm::kM,
-                              Shape::kN / WarpGemm::kN,
-                              Shape::kK / WarpGemm::kK>;
-
-  /// Number of warp-level GEMM oeprations
-  static int const kWarpGemmIterations =
-      (WarpGemm::kK / Operator::Policy::MmaShape::kK);
-
-  /// Number of stages
-  static int const kStages = Stages;
-
-  /// Tensor reference to the A operand
-  using TensorRefA = TensorRef<typename Operator::ElementA, typename Operator::LayoutA>;
-
-  /// Tensor reference to the B operand
-  using TensorRefB = TensorRef<typename Operator::ElementB, typename Operator::LayoutB>;
-
-  static_assert(kWarpGemmIterations > 1,
-                "The pipelined structure requires at least two warp-level "
-                "GEMM operations.");
-
-  static_assert((kWarpGemmIterations % 2) == 0,
-                "Inner loop iteration must be an even number.");
-
-  //
-  // Nested structs
-  //
-
-  /// Shared storage object needed by threadblock-scoped GEMM
-  class SharedStorage {
-   public:
-    //
-    // Type definitions
-    //
-
-    /// Shape of the A matrix operand in shared memory
-    using ShapeA = MatrixShape<Shape::kM + Policy::SmemPaddingA::kRow,
-                               Shape::kK * kStages +
-                                   Policy::SmemPaddingA::kColumn>;
-
-    /// Shape of the B matrix operand in shared memory
-    using ShapeB =
-        MatrixShape<Shape::kK * kStages + Policy::SmemPaddingB::kRow,
-                    Shape::kN + Policy::SmemPaddingB::kColumn>;
-
-   public:
-    //
-    // Data members
-    //
-
-    /// Buffer for A operand
-    AlignedBuffer<typename Operator::ElementA, ShapeA::kCount> operand_A;
-
-    /// Buffer for B operand
-    AlignedBuffer<typename Operator::ElementB, ShapeB::kCount> operand_B;
-
-   public:
-
-    //
-    // Methods
-    //
-
-    /// Returns a layout object for the A matrix
-    CUTLASS_DEVICE
-    static typename Operator::LayoutA LayoutA() {
-      return Operator::LayoutA::packed({ShapeA::kRow, ShapeA::kColumn});
-    }
-
-    /// Returns a layout object for the B matrix
-    CUTLASS_HOST_DEVICE
-    static typename Operator::LayoutB LayoutB() {
-      return Operator::LayoutB::packed({ShapeB::kRow, ShapeB::kColumn});
-    }
-
-    /// Returns a TensorRef to the A operand
-    CUTLASS_HOST_DEVICE
-    TensorRefA operand_A_ref() {
-      return TensorRefA{operand_A.data(), LayoutA()};
-    }
-
-    /// Returns a TensorRef to the B operand
-    CUTLASS_HOST_DEVICE
-    TensorRefB operand_B_ref() {
-      return TensorRefB{operand_B.data(), LayoutB()};
-    }
-  };
-
- protected:
-
-  //
-  // Data members
-  //
-
-  /// Iterator to load a warp-scoped tile of A operand from shared memory
-  typename Operator::IteratorA warp_tile_iterator_A_;
-
-  /// Iterator to load a warp-scoped tile of B operand from shared memory
-  typename Operator::IteratorB warp_tile_iterator_B_;
-
-public:
-
-  /// Construct from tensor references
-  CUTLASS_DEVICE
-  MmaBase(
-      ///< Shared storage needed for internal use by threadblock-scoped GEMM
-      SharedStorage &shared_storage,
-      ///< ID within the threadblock
-      int thread_idx,
-      ///< ID of warp
-      int warp_idx,
-      ///< ID of each thread within a warp
-      int lane_idx
-    ):
-      warp_tile_iterator_A_(shared_storage.operand_A_ref(), lane_idx),
-      warp_tile_iterator_B_(shared_storage.operand_B_ref(), lane_idx) {
-
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace gemm
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_blas3_multistage.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_blas3_multistage.h
deleted file mode 100755
index 11eb20adb..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_blas3_multistage.h
+++ /dev/null
@@ -1,707 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
-    Used by BLAS3 kernels that need to treat diagonal elements of a input iterator as a special case.
-  
-*/
-
-#pragma once
-
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/arch/memory.h"
-#include "cutlass/array.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/numeric_types.h"
-
-#include "cutlass/gemm/threadblock/mma_base.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Structure to compute the matrix product targeting CUDA cores and SIMT math
-/// instructions.
-template <
-    /// Size of the Gemm problem - concept: gemm::GemmShape<>
-    typename Shape_,
-    /// Iterates over tiles of A operand in global memory
-    //  (concept: ReadableTileIterator | ForwardTileIterator |
-    //  MaskedTileIterator)
-    typename IteratorA_,
-    /// Iterates over tiles of A operand in shared memory
-    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-    typename SmemIteratorA_,
-    /// Cache operation for operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Iterates over tiles of B operand in global memory
-    //  (concept: ReadableTileIterator | ForwardTileIterator |
-    //  MaskedTileIterator)
-    typename IteratorB_,
-    /// Iterates over tiles of B operand in shared memory
-    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-    typename SmemIteratorB_,
-    /// Cache operation for operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB,
-    /// Data type of accumulator matrix
-    typename ElementC_,
-    /// Data type of accumulator matrix
-    typename LayoutC_,
-    /// Policy describing tuning details (concept: MmaPolicy)
-    typename Policy_,
-    /// Number of stages,
-    int Stages,
-    /// Use zfill or predicate for out-of-bound cp.async
-    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kZfill,
-    /// Blas3 computation mode
-    BlasMode BlasMode_ = BlasMode::kTriangular,
-    /// Used for partial specialization
-    typename Enable = bool>
-class MmaBlas3Multistage : 
-  public MmaBase<Shape_, Policy_, Stages> {
-public:
-  ///< Base class
-  using Base = MmaBase<Shape_, Policy_, Stages>;
-  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-  ///< Iterates over tiles of A operand in global memory
-  using IteratorA = IteratorA_;
-  ///< Iterates over tiles of B operand in global memory
-  using IteratorB = IteratorB_;
-  ///< Data type of accumulator matrix
-  using ElementC = ElementC_;
-  ///< Layout of accumulator matrix
-  using LayoutC = LayoutC_;
-  ///< Policy describing tuning details
-  using Policy = Policy_;
-  ///< Blas Mode
-  static BlasMode const kBlasMode = BlasMode_;
-
-  using SmemIteratorA = SmemIteratorA_;
-  using SmemIteratorB = SmemIteratorB_;
-
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
-
-  //
-  // Dependent types
-  //
-
-  /// Fragment of accumulator tile
-  using FragmentC = typename Policy::Operator::FragmentC;
-
-  /// Warp-level Mma
-  using Operator = typename Policy::Operator;
-
-  /// Minimum architecture is Sm80 to support cp.async
-  using ArchTag = arch::Sm80;
-  
-  /// Complex transform on A operand
-  static ComplexTransform const kTransformA = Operator::kTransformA;
-
-  /// Complex transform on B operand
-  static ComplexTransform const kTransformB = Operator::kTransformB;
-
-  /// Internal structure exposed for introspection.
-  struct Detail {
-
-    /// Number of cp.async instructions to load one stage of operand A
-    static int const AsyncCopyIterationsPerStageA =
-        IteratorA::ThreadMap::Iterations::kCount;
-
-    /// Number of cp.async instructions to load one stage of operand B
-    static int const AsyncCopyIterationsPerStageB =
-        IteratorB::ThreadMap::Iterations::kCount;
-
-    /// Number of stages
-    static int const kStages = Stages;
-
-    /// Number of cp.async instructions to load on group of operand A
-    static int const kAccessesPerGroupA =
-        (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
-
-    /// Number of cp.async instructions to load on group of operand B
-    static int const kAccessesPerGroupB =
-        (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
-  };
-
- private:
-
-  using WarpLoadedFragmentA = typename Operator::FragmentA;
-  using WarpLoadedFragmentB = typename Operator::FragmentB;
-  using WarpTransformedFragmentA = typename Operator::TransformedFragmentA;
-  using WarpTransformedFragmentB = typename Operator::TransformedFragmentB;
-
- private:
-
-  //
-  // Data members
-  //
-
-  /// Iterator to write threadblock-scoped tile of A operand to shared memory
-  SmemIteratorA smem_iterator_A_;
-
-  /// Iterator to write threadblock-scoped tile of B operand to shared memory
-  SmemIteratorB smem_iterator_B_;
-
-public:
-
-  /// Construct from tensor references
-  CUTLASS_DEVICE
-  MmaBlas3Multistage(
-      ///< Shared storage needed for internal use by threadblock-scoped GEMM
-      typename Base::SharedStorage &shared_storage,
-      ///< ID within the threadblock
-      int thread_idx,
-      ///< ID of warp
-      int warp_idx,
-      ///< ID of each thread within a warp
-      int lane_idx
-    ):
-      Base(shared_storage, thread_idx, warp_idx, lane_idx),
-      smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
-      smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx)
-  {
-    // Compute warp location within threadblock tile by mapping the warp_id to
-    // three coordinates:
-    //   _m: the warp's position within the threadblock along the M dimension
-    //   _n: the warp's position within the threadblock along the N dimension
-    //   _k: the warp's position within the threadblock along the K dimension
-
-    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
-    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
-
-    int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
-    int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
-
-    // Add per-warp offsets in units of warp-level tiles
-    this->warp_tile_iterator_A_.add_tile_offset(
-        {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
-    this->warp_tile_iterator_B_.add_tile_offset(
-        {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
-  }
-
-  CUTLASS_DEVICE
-  void copy_tiles_and_advance(IteratorA &iterator_A, IteratorB &iterator_B,
-                              int group_start_A = 0, int group_start_B = 0) {
-    iterator_A.set_iteration_index(group_start_A *
-                                   IteratorA::kAccessesPerVector);
-    this->smem_iterator_A_.set_iteration_index(group_start_A);
-
-    // Async Copy for operand A
-    CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) {
-      if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) {
-        typename IteratorA::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorA::AccessType *>(
-                this->smem_iterator_A_.get());
-
-        int const kSrcBytes = sizeof_bits<typename IteratorA::Element>::value *
-                              IteratorA::ThreadMap::kElementsPerAccess /
-                              IteratorA::kAccessesPerVector / 8;
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
-          auto gmem_ptr = iterator_A.get();
-          bool isvalid = iterator_A.valid();
-
-          if (isvalid && iterator_A.getOnDiag()) {
-            // Elements that are on diagonal
-            if (kBlasMode == BlasMode::kHermitian && cutlass::is_complex<typename IteratorA::Element>::value) {
-              /* Copy real part from gmem, write zero for imag part in smem */
-              /* The following logic to determine kSizeRealBytes is so that compiler doesn't complain when
-               * compiling for not complex datatype and using half the size for cp_async_zfill */
-              int const kSizeRealBytes = (platform::is_same<typename IteratorA::Element,
-                                          complex<double>>::value) ? 8 : 4;
-              cutlass::arch::cp_async_zfill<kSizeRealBytes, cutlass::arch::CacheOperation::Always>(
-                dst_ptr + v, gmem_ptr, true);
-              cutlass::arch::cp_async_diag<typename IteratorA::Element, true>(
-                reinterpret_cast<char *> (dst_ptr + v) + kSizeRealBytes);
-            } else {
-              /* Write one (1) directly to smem*/
-              cutlass::arch::cp_async_diag<typename IteratorA::Element>(dst_ptr + v);
-            }
-          } else {
-            // Elements that are not of diagonal
-            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
-                dst_ptr + v, gmem_ptr, isvalid);
-          }
-
-          ++iterator_A;
-        }
-
-        ++this->smem_iterator_A_;
-      }
-    }
-
-    iterator_B.set_iteration_index(group_start_B *
-                                   IteratorB::kAccessesPerVector);
-    this->smem_iterator_B_.set_iteration_index(group_start_B);
-
-    // Async Copy for operand B
-    CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) {
-      if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) {
-        typename IteratorB::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorB::AccessType *>(
-                this->smem_iterator_B_.get());
-
-        int const kSrcBytes = sizeof_bits<typename IteratorB::Element>::value *
-                              IteratorB::ThreadMap::kElementsPerAccess /
-                              IteratorB::kAccessesPerVector / 8;
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
-          auto gmem_ptr = iterator_B.get();
-          bool isvalid = iterator_B.valid();
-
-          if (isvalid && iterator_B.getOnDiag()) {
-            // Elements that are on diagonal
-            if (kBlasMode == BlasMode::kHermitian && cutlass::is_complex<typename IteratorB::Element>::value) {
-              /* Copy real part from gmem, write zero for imag part in smem */
-              int const kSizeRealBytes = (platform::is_same<typename IteratorB::Element,
-                                          complex<double>>::value) ? 8 : 4;
-              cutlass::arch::cp_async_zfill<kSizeRealBytes, cutlass::arch::CacheOperation::Always>(
-                dst_ptr + v, gmem_ptr, true);
-              cutlass::arch::cp_async_diag<typename IteratorB::Element, true>(
-                reinterpret_cast<char *> (dst_ptr + v) + kSizeRealBytes);
-            } else {
-              /* Write one (1) directly to smem*/
-              cutlass::arch::cp_async_diag<typename IteratorB::Element>(dst_ptr + v);
-            }
-          } else {
-            // Elements that are not of diagonal
-            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
-                dst_ptr + v, gmem_ptr, isvalid);
-          }
-
-          ++iterator_B;
-        }
-        ++this->smem_iterator_B_;
-      }
-    }
-  }
-
-  /// Perform a threadblock-scoped matrix multiply-accumulate
-  CUTLASS_DEVICE
-  void operator()(
-      ///< problem size of GEMM
-      int gemm_k_iterations,
-      ///< destination accumulator tile
-      FragmentC &accum,
-      ///< iterator over A operand in global memory
-      IteratorA iterator_A,
-      ///< iterator over B operand in global memory
-      IteratorB iterator_B,
-      ///< initial value of accumulator
-      FragmentC const &src_accum) {
-
-    //
-    // Prologue
-    //
-
-    // Issue several complete stages
-    CUTLASS_PRAGMA_UNROLL
-    for (int stage = 0; stage < Base::kStages - 1;
-         ++stage, --gemm_k_iterations) {
-
-      iterator_A.clear_mask(gemm_k_iterations == 0);
-      iterator_B.clear_mask(gemm_k_iterations == 0);
-
-      iterator_A.set_iteration_index(0);
-      this->smem_iterator_A_.set_iteration_index(0);
-
-      // Async Copy for operand A
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) {
-        typename IteratorA::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorA::AccessType *>(
-                this->smem_iterator_A_.get());
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
-          int const kSrcBytes =
-              sizeof_bits<typename IteratorA::Element>::value *
-              IteratorA::ThreadMap::kElementsPerAccess /
-              IteratorA::kAccessesPerVector / 8;
-
-          auto gmem_ptr = iterator_A.get();
-          bool isvalid = iterator_A.valid();
-
-          if (isvalid && iterator_A.getOnDiag()) {
-            // Elements that are on diagonal
-            if (kBlasMode == BlasMode::kHermitian && cutlass::is_complex<typename IteratorA::Element>::value) {
-              /* Copy real part from gmem, write zero for imag part in smem */
-              int const kSizeRealBytes = (platform::is_same<typename IteratorA::Element,
-                                          complex<double>>::value) ? 8 : 4;
-              cutlass::arch::cp_async_zfill<kSizeRealBytes, cutlass::arch::CacheOperation::Always>(
-                dst_ptr + v, gmem_ptr, true);
-              cutlass::arch::cp_async_diag<typename IteratorA::Element, true>(
-                reinterpret_cast<char *> (dst_ptr + v) + kSizeRealBytes);
-            } else {
-              /* Write one (1) directly to smem*/
-              cutlass::arch::cp_async_diag<typename IteratorA::Element>(dst_ptr + v);
-            }
-          } else {
-            // Elements that are not of diagonal
-            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
-                dst_ptr + v, gmem_ptr, isvalid);
-          }
-
-          ++iterator_A;
-        }
-
-        ++this->smem_iterator_A_;
-      }
-
-      iterator_B.set_iteration_index(0);
-      this->smem_iterator_B_.set_iteration_index(0);
-
-      // Async Copy for operand B
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) {
-        typename IteratorB::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorB::AccessType *>(
-                this->smem_iterator_B_.get());
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
-          int const kSrcBytes =
-              sizeof_bits<typename IteratorB::Element>::value *
-              IteratorB::ThreadMap::kElementsPerAccess /
-              IteratorB::kAccessesPerVector / 8;
-
-          auto gmem_ptr = iterator_B.get();
-          bool isvalid = iterator_B.valid();
-
-          if (isvalid && iterator_B.getOnDiag()) {
-            // Elements that are on diagonal
-            if (kBlasMode == BlasMode::kHermitian && cutlass::is_complex<typename IteratorB::Element>::value) {
-              /* Copy real part from gmem, write zero for imag part in smem */
-              int const kSizeRealBytes = (platform::is_same<typename IteratorB::Element,
-                                          complex<double>>::value) ? 8 : 4;
-              cutlass::arch::cp_async_zfill<kSizeRealBytes, cutlass::arch::CacheOperation::Always>(
-                dst_ptr + v, gmem_ptr, true);
-              cutlass::arch::cp_async_diag<typename IteratorB::Element, true>(
-                reinterpret_cast<char *> (dst_ptr + v) + kSizeRealBytes);
-            } else {
-              /* Write one (1) directly to smem*/
-              cutlass::arch::cp_async_diag<typename IteratorB::Element>(dst_ptr + v);
-            }
-          } else {
-            // Elements that are not of diagonal
-            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
-                dst_ptr + v, gmem_ptr, isvalid);
-          }
-
-          ++iterator_B;
-        }
-
-        ++this->smem_iterator_B_;
-      }
-
-      // Move to the next stage
-      iterator_A.add_tile_offset({0, 1});
-      iterator_B.add_tile_offset({1, 0});
-
-      this->smem_iterator_A_.add_tile_offset({0, 1});
-      this->smem_iterator_B_.add_tile_offset({1, 0});
-
-      // Defines the boundary of a stage of cp.async.
-      cutlass::arch::cp_async_fence();
-    }
-
-    // Perform accumulation in the 'd' output operand
-    accum = src_accum;
-
-    //
-    // Clear the remaining tiles of SMEM. This is a functional requirement for some kernels
-    // so that all accumulator elements outside the GEMM footprint are zero.
-    //
-
-    if (SharedMemoryClear == SharedMemoryClearOption::kClearLastStage) {
-
-      /// Iterator to write threadblock-scoped tile of A operand to shared memory
-      SmemIteratorA last_smem_iterator_A(this->smem_iterator_A_);
-
-      typename IteratorA::AccessType zero_A;
-      zero_A.clear();
-
-      last_smem_iterator_A.set_iteration_index(0);
-
-      // Async Copy for operand A
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) {
-
-        typename IteratorA::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorA::AccessType *>(
-                last_smem_iterator_A.get());
-
-        *dst_ptr = zero_A;
-
-        ++last_smem_iterator_A;
-      }
-
-      /// Iterator to write threadblock-scoped tile of B operand to shared memory
-      SmemIteratorB last_smem_iterator_B(this->smem_iterator_B_);
-      typename IteratorB::AccessType zero_B;
-
-      zero_B.clear();
-      last_smem_iterator_B.set_iteration_index(0);
-
-      // Async Copy for operand B
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) {
-
-        typename IteratorB::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorB::AccessType *>(
-                last_smem_iterator_B.get());
-
-        *dst_ptr = zero_B;
-
-        ++last_smem_iterator_B;
-      }
-    }
-
-    // Waits until kStages-2 stages have committed.
-    cutlass::arch::cp_async_wait<Base::kStages - 2>();
-    __syncthreads();
-
-    // Pair of fragments used to overlap shared memory loads and math
-    // instructions
-    WarpLoadedFragmentA warp_loaded_frag_A[2];
-    WarpLoadedFragmentB warp_loaded_frag_B[2];
-    WarpTransformedFragmentA warp_transformed_frag_A[2];
-    WarpTransformedFragmentB warp_transformed_frag_B[2];
-
-    Operator warp_mma;
-
-    this->warp_tile_iterator_A_.set_kgroup_index(0);
-    this->warp_tile_iterator_B_.set_kgroup_index(0);
-
-    this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]);
-    this->warp_tile_iterator_B_.load(warp_loaded_frag_B[0]);
-
-    ++this->warp_tile_iterator_A_;
-    ++this->warp_tile_iterator_B_;
-
-    iterator_A.clear_mask(gemm_k_iterations == 0);
-    iterator_B.clear_mask(gemm_k_iterations == 0);
-
-    int smem_write_stage_idx = Base::kStages - 1;
-    int smem_read_stage_idx = 0;
-
-    warp_mma.transform(warp_transformed_frag_A[0], warp_transformed_frag_B[0],
-                       warp_loaded_frag_A[0], warp_loaded_frag_B[0]);
-
-    // tf32x3 kernels use staging accumulation. warp_mma uses a temporary
-    // accumulator and this temporary accumulator is added to the final
-    // accumulator once in every mainloop iteration.
-    plus<FragmentC> plus_accum;
-
-    FragmentC tmp_accum;
-
-    if (platform::is_same<typename Operator::MathOperator,
-                          arch::OpMultiplyAddFastF32>::value
-      || platform::is_same<typename Operator::MathOperator,
-                           arch::OpMultiplyAddComplexFastF32>::value) {
-
-      tmp_accum.clear();
-    }
-
-    //
-    // Mainloop
-    //
-
-    CUTLASS_GEMM_LOOP
-    for (; gemm_k_iterations > (-Base::kStages + 1);) {
-      //
-      // Loop over GEMM K dimension
-      //
-
-      // Computes a warp-level GEMM on data held in shared memory
-      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
-      CUTLASS_PRAGMA_UNROLL
-      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations;
-           ++warp_mma_k) {
-
-        // Load warp-level tiles from shared memory, wrapping to k offset if
-        // this is the last group as the case may be.
-
-        this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
-        this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
-        
-        this->warp_tile_iterator_A_.load(warp_loaded_frag_A[(warp_mma_k + 1) % 2]);
-        this->warp_tile_iterator_B_.load(warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
-
-        ++this->warp_tile_iterator_A_;
-        ++this->warp_tile_iterator_B_;
-
-        if (warp_mma_k > 0)
-          warp_mma.transform(warp_transformed_frag_A[warp_mma_k % 2],
-                             warp_transformed_frag_B[warp_mma_k % 2],
-                             warp_loaded_frag_A[warp_mma_k % 2],
-                             warp_loaded_frag_B[warp_mma_k % 2]);
-
-        if (platform::is_same<typename Operator::MathOperator,
-                              arch::OpMultiplyAddFastF32>::value
-          || platform::is_same<typename Operator::MathOperator,
-                               arch::OpMultiplyAddComplexFastF32>::value) {
-
-          warp_mma(
-            tmp_accum, 
-            warp_transformed_frag_A[warp_mma_k % 2],
-            warp_transformed_frag_B[warp_mma_k % 2], 
-            tmp_accum
-          );
-
-          if (warp_mma_k == 0) {
-            accum = plus_accum(accum, tmp_accum);
-            tmp_accum.clear();
-          }
-        } else {
-          warp_mma(
-            accum, 
-            warp_transformed_frag_A[warp_mma_k % 2],
-            warp_transformed_frag_B[warp_mma_k % 2], 
-            accum
-          );
-        }
-
-        // Issue global->shared copies for the this stage
-        if (warp_mma_k < Base::kWarpGemmIterations - 1) {
-          int group_start_iteration_A, group_start_iteration_B;
-
-          group_start_iteration_A = warp_mma_k * Detail::kAccessesPerGroupA;
-          group_start_iteration_B = warp_mma_k * Detail::kAccessesPerGroupB;
-
-          copy_tiles_and_advance(iterator_A, iterator_B, group_start_iteration_A, 
-                               group_start_iteration_B);
-        }
-
-        if (warp_mma_k + 2 == Base::kWarpGemmIterations) {
-          int group_start_iteration_A, group_start_iteration_B;
-          group_start_iteration_A =
-              (warp_mma_k + 1) * Detail::kAccessesPerGroupA;
-          group_start_iteration_B =
-              (warp_mma_k + 1) * Detail::kAccessesPerGroupB;
-
-          copy_tiles_and_advance(iterator_A, iterator_B, group_start_iteration_A, 
-                               group_start_iteration_B);
-
-          // Inserts a memory fence between stages of cp.async instructions.
-          cutlass::arch::cp_async_fence();
-
-          // Waits until kStages-2 stages have committed.
-          arch::cp_async_wait<Base::kStages - 2>();
-          __syncthreads();
-
-          // Move to the next stage
-          iterator_A.add_tile_offset({0, 1});
-          iterator_B.add_tile_offset({1, 0});
-
-          this->smem_iterator_A_.add_tile_offset({0, 1});
-          this->smem_iterator_B_.add_tile_offset({1, 0});
-
-          // Add negative offsets to return iterators to the 'start' of the
-          // circular buffer in shared memory
-          if (smem_write_stage_idx == (Base::kStages - 1)) {
-            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
-            this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
-            smem_write_stage_idx = 0;
-          } else {
-            ++smem_write_stage_idx;
-          }
-
-          if (smem_read_stage_idx == (Base::kStages - 1)) {
-            this->warp_tile_iterator_A_.add_tile_offset(
-                {0, -Base::kStages * Policy::kPartitionsK *
-                        Base::kWarpGemmIterations});
-            this->warp_tile_iterator_B_.add_tile_offset(
-                {-Base::kStages * Policy::kPartitionsK *
-                     Base::kWarpGemmIterations,
-                 0});
-            smem_read_stage_idx = 0;
-          } else {
-            ++smem_read_stage_idx;
-          }
-
-          --gemm_k_iterations;
-          iterator_A.clear_mask(gemm_k_iterations == 0);
-          iterator_B.clear_mask(gemm_k_iterations == 0);
-        }
-
-        // Do any conversions feeding the first stage at the end of the loop so
-        // we can start right away on mma instructions
-        if (warp_mma_k + 1 == Base::kWarpGemmIterations)
-          warp_mma.transform(warp_transformed_frag_A[(warp_mma_k + 1) % 2],
-                             warp_transformed_frag_B[(warp_mma_k + 1) % 2],
-                             warp_loaded_frag_A[(warp_mma_k + 1) % 2],
-                             warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
-      }
-
-    }
-
-    if (platform::is_same<typename Operator::MathOperator,
-                          arch::OpMultiplyAddFastF32>::value
-      || platform::is_same<typename Operator::MathOperator,
-                           arch::OpMultiplyAddComplexFastF32>::value) {
-      accum = plus_accum(accum, tmp_accum); 
-    }
- 
-    if (SharedMemoryClear == SharedMemoryClearOption::kZfill) {
-      // commit and drain all pending and predicated cp.async pnz from the GEMM mainloop
-      cutlass::arch::cp_async_fence();
-      cutlass::arch::cp_async_wait<0>();
-      __syncthreads();
-    }
-
-    // Commit and drain all pending and predicated cp.async pnz from the GEMM mainloop
-    cutlass::arch::cp_async_fence();
-    cutlass::arch::cp_async_wait<0>();
-    __syncthreads();
-
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace gemm
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_layernorm_mainloop_fusion_multistage.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_layernorm_mainloop_fusion_multistage.h
deleted file mode 100755
index 11ad54446..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_layernorm_mainloop_fusion_multistage.h
+++ /dev/null
@@ -1,863 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
-
-    It loads two loop invariant vectors, mean and var, in the prologue and
-    stores them in the register file.  In the mainloop, it loads two loop
-    variant vectors, gamma and beta, by using cp.async.  We will call
-    elementwise operation to apply var, mean, gamma, beta between ldmatrix and
-    warp mma.
-*/
-
-#pragma once
-
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/arch/memory.h"
-#include "cutlass/array.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/transform/threadblock/predicated_scale_bias_vector_iterator.h"
-#include "cutlass/gemm/threadblock/mma_base.h"
-#include "cutlass/gemm/warp/layernorm_scale_bias_transform.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Structure to compute the matrix product targeting CUDA cores and SIMT math
-/// instructions.
-template <
-    /// Size of the Gemm problem - concept: gemm::GemmShape<>
-    typename Shape_,
-    /// Element type of scale and bias vectors 
-    typename ElementScaleBias_,
-    /// Layout of scale and bias vectors
-    typename LayoutScaleBias_,
-    /// Policy describing tuning details (concept: MmaPolicy)
-    typename Policy_,
-    /// WarpIterator to load Scale or Bias vector from the shared memory
-    typename WarpIteratorGammaBeta_,
-    /// Number of stages,
-    int Stages,
-    /// Used for partial specialization
-    typename Enable = bool>
-class MmaMainloopFusionBase {
- public:
-  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-
-  ///< Element type of scale and bias vectors 
-  using ElementScaleBias = ElementScaleBias_;
-
-  /// Layout of scale and bias vectors
-  using LayoutScaleBias = LayoutScaleBias_;
-
-  ///< Policy describing tuning details
-  using Policy = Policy_;
-
-  ///< WarpIterator to load Scale or Bias vector from the shared memory
-  using WarpIteratorGammaBeta = WarpIteratorGammaBeta_;
-
-  //
-  // Dependent types
-  //
-
-  /// Warp-level Mma
-  using Operator = typename Policy::Operator;
-
-  /// Shape describing the overall GEMM computed from shared memory
-  /// by each warp.
-  using WarpGemm = typename Policy::Operator::Shape;
-
-  /// Shape describing the number of warps filling the CTA
-  using WarpCount = cutlass::gemm::GemmShape<Shape::kM / WarpGemm::kM,
-                                             Shape::kN / WarpGemm::kN,
-                                             Shape::kK / WarpGemm::kK>;
-
-  /// Number of warp-level GEMM oeprations
-  static int const kWarpGemmIterations =
-      (WarpGemm::kK / Operator::Policy::MmaShape::kK);
-
-  /// Number of stages
-  static int const kStages = Stages;
-
-  /// Tensor reference to the A operand
-  using TensorRefA = TensorRef<typename Operator::ElementA, typename Operator::LayoutA>;
-
-  /// Tensor reference to the scale and bias vectors
-  using TensorRefGammaBeta = TensorRef<ElementScaleBias, LayoutScaleBias>;
-
-  /// Tensor reference to the B operand
-  using TensorRefB = TensorRef<typename Operator::ElementB, typename Operator::LayoutB>;
-
-  //
-  // Nested structs
-  //
-
-  /// Shared storage object needed by threadblock-scoped GEMM
-  class SharedStorage {
-   public:
-    //
-    // Type definitions
-    //
-
-    /// Shape of the A matrix operand in shared memory
-    using ShapeA = MatrixShape<Shape::kM + Policy::SmemPaddingA::kRow,
-                               Shape::kK * kStages +
-                                   Policy::SmemPaddingA::kColumn>;
-
-    /// Shape of the A scale and bias vectors in shared memory
-    using ShapeGammaBeta =
-        MatrixShape<1 + Policy::SmemPaddingA::kRow,
-                    2 * Shape::kK * kStages + Policy::SmemPaddingA::kColumn>;
-
-    /// Shape of the B matrix operand in shared memory
-    using ShapeB =
-        MatrixShape<Shape::kK * kStages + Policy::SmemPaddingB::kRow,
-                    Shape::kN + Policy::SmemPaddingB::kColumn>;
-
-   public:
-    //
-    // Data members
-    //
-
-    /// Buffer for A operand
-    AlignedBuffer<typename Operator::ElementA, ShapeA::kCount> operand_A;
-
-    /// Buffer for B operand
-    AlignedBuffer<typename Operator::ElementB, ShapeB::kCount> operand_B;
-
-    /// Buffer for A operand Scale and Bias
-    AlignedBuffer<ElementScaleBias, ShapeGammaBeta::kCount> operand_A_gamma_beta;
-
-   public:
-
-    //
-    // Methods
-    //
-
-    /// Returns a layout object for the A matrix
-    CUTLASS_DEVICE
-    static typename Operator::LayoutA LayoutA() {
-      return Operator::LayoutA::packed({ShapeA::kRow, ShapeA::kColumn});
-    }
-
-    /// Returns a layout object for the B matrix
-    CUTLASS_HOST_DEVICE
-    static typename Operator::LayoutB LayoutB() {
-      return Operator::LayoutB::packed({ShapeB::kRow, ShapeB::kColumn});
-    }
-
-    /// Returns a layout object for the A scale and bias vectors
-    CUTLASS_DEVICE
-    static LayoutScaleBias LayoutScaleBias() {
-      return LayoutScaleBias::packed(
-          {ShapeGammaBeta::kRow, ShapeGammaBeta::kColumn});
-    }
-
-    /// Returns a TensorRef to the A operand
-    CUTLASS_HOST_DEVICE
-    TensorRefA operand_A_ref() {
-      return TensorRefA{operand_A.data(), LayoutA()};
-    }
-
-    /// Returns a TensorRef to the B operand
-    CUTLASS_HOST_DEVICE
-    TensorRefB operand_B_ref() {
-      return TensorRefB{operand_B.data(), LayoutB()};
-    }
-
-    /// Returns a TensorRef to the A operand Scale vector
-    CUTLASS_HOST_DEVICE
-    TensorRefGammaBeta operand_A_gamma_beta_ref() {
-      return TensorRefGammaBeta{operand_A_gamma_beta.data(), LayoutScaleBias()};
-    }
-  };
-
- protected:
-
-  //
-  // Data members
-  //
-
-  /// Iterator to load a warp-scoped tile of A operand from shared memory
-  typename Operator::IteratorA warp_tile_iterator_A_;
-
-  /// Iterator to load a warp-scoped tile of A operand scale and bias vector
-  /// from shared memory
-  WarpIteratorGammaBeta warp_tile_iterator_A_gamma_beta_;
-
-  /// Iterator to load a warp-scoped tile of B operand from shared memory
-  typename Operator::IteratorB warp_tile_iterator_B_;
-
-public:
-
-  /// Construct from tensor references
-  CUTLASS_DEVICE
-  MmaMainloopFusionBase(
-      ///< Shared storage needed for internal use by threadblock-scoped GEMM
-      SharedStorage &shared_storage,
-      ///< ID within the threadblock
-      int thread_idx,
-      ///< ID of warp
-      int warp_idx,
-      ///< ID of each thread within a warp
-      int lane_idx)
-      : warp_tile_iterator_A_(shared_storage.operand_A_ref(), lane_idx),
-        warp_tile_iterator_A_gamma_beta_(
-            shared_storage.operand_A_gamma_beta_ref(), lane_idx),
-        warp_tile_iterator_B_(shared_storage.operand_B_ref(), lane_idx) {}
-};
-
-
-/// Structure to compute the matrix product targeting CUDA cores and SIMT math
-/// instructions.
-template <
-    /// Size of the Gemm problem - concept: gemm::GemmShape<>
-    typename Shape_,
-    /// Iterates over tiles of A operand in global memory
-    //  (concept: ReadableTileIterator | ForwardTileIterator |
-    //  MaskedTileIterator)
-    typename IteratorA_,
-    /// Iterates over tiles of A operand in shared memory
-    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-    typename SmemIteratorA_,
-    /// Cache operation for operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Iterates over tiles of B operand in global memory
-    //  (concept: ReadableTileIterator | ForwardTileIterator |
-    //  MaskedTileIterator)
-    typename IteratorB_,
-    /// Iterates over tiles of B operand in shared memory
-    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-    typename SmemIteratorB_,
-    /// Cache operation for operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB,
-    /// Iterates over vectors of var and mean vector in global memory
-    //  (concept: ReadableTileIterator | ForwardTileIterator |
-    //  MaskedTileIterator)
-    typename IteratorVarMean_,
-    /// Iterates over vectors of scale and bias vector in global memory
-    //  (concept: ReadableTileIterator | ForwardTileIterator |
-    //  MaskedTileIterator)
-    typename IteratorGammaBeta_,
-    /// Iterates over vectors of scale and bias vector in shared memory
-    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-    typename SmemIteratorGammaBeta_,
-    /// Cache operation for scale/bias operand 
-    cutlass::arch::CacheOperation::Kind CacheOpGammaBeta,
-    /// Data type of accumulator matrix
-    typename ElementC_,
-    /// Data type of accumulator matrix
-    typename LayoutC_,
-    /// Policy describing tuning details (concept: MmaPolicy)
-    typename Policy_,
-    /// WarpIterator to load Scale or Bias vector from the shared memory
-    typename WarpIteratorGammaBeta_,
-    /// Number of stages,
-    int Stages,
-    /// Use zfill or predicate for out-of-bound cp.async
-    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone,
-    /// Used for partial specialization
-    typename Enable = bool>
-class MmaLayernormMainloopFusionMultistage : 
-  public MmaMainloopFusionBase<Shape_, typename IteratorGammaBeta_::Element,
-                       typename IteratorGammaBeta_::Layout, Policy_, WarpIteratorGammaBeta_, Stages> {
-public:
-  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-  ///< Iterates over tiles of A operand in global memory
-  using IteratorA = IteratorA_;
-  ///< Iterates over tiles of B operand in global memory
-  using IteratorB = IteratorB_;
-  ///< Iterates over tiles of the var and mean vectors in global memory
-  using IteratorVarMean = IteratorVarMean_;
-  ///< Iterates over tiles of the scale and bias vectors in global memory
-  using IteratorGammaBeta = IteratorGammaBeta_;
-  ///< WarpIterator to load Scale or Bias vector from the shared memory
-  using WarpIteratorGammaBeta = WarpIteratorGammaBeta_;
-  ///< Policy describing tuning details
-  using Policy = Policy_;
-
-  ///< Base class
-  using Base = MmaMainloopFusionBase<Shape_, typename IteratorGammaBeta::Element, 
-                                     typename IteratorGammaBeta::Layout, Policy,
-                                     WarpIteratorGammaBeta, Stages>;
-
-  ///< Data type of accumulator matrix
-  using ElementC = ElementC_;
-  ///< Layout of accumulator matrix
-  using LayoutC = LayoutC_;
-
-  using SmemIteratorA = SmemIteratorA_;
-  using SmemIteratorB = SmemIteratorB_;
-  using SmemIteratorGammaBeta = SmemIteratorGammaBeta_;
-
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpGammaBeta =
-      CacheOpGammaBeta;
-
-  //
-  // Dependent types
-  //
-
-  /// Fragment of accumulator tile
-  using FragmentC = typename Policy::Operator::FragmentC;
-
-  /// Warp-level Mma
-  using Operator = typename Policy::Operator;
-
-  /// Minimum architecture is Sm80 to support cp.async
-  using ArchTag = arch::Sm80;
-  
-  /// Complex transform on A operand
-  static ComplexTransform const kTransformA = Operator::kTransformA;
-
-  /// Complex transform on B operand
-  static ComplexTransform const kTransformB = Operator::kTransformB;
-
-  /// Internal structure exposed for introspection.
-  struct Detail {
-
-    static_assert(Base::kWarpGemmIterations > 1,
-                  "The pipelined structure requires at least two warp-level "
-                  "GEMM operations.");
-
-    /// Number of cp.async instructions to load one stage of operand A
-    static int const AsyncCopyIterationsPerStageA =
-        IteratorA::ThreadMap::Iterations::kCount;
-
-    /// Number of cp.async instructions to load one stage of operand B
-    static int const AsyncCopyIterationsPerStageB =
-        IteratorB::ThreadMap::Iterations::kCount;
-
-    /// Number of stages
-    static int const kStages = Stages;
-
-    /// Number of cp.async instructions to load on group of operand A
-    static int const kAccessesPerGroupA =
-        (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
-
-    /// Number of cp.async instructions to load on group of operand B
-    static int const kAccessesPerGroupB =
-        (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
-  };
-
- private:
-
-  using WarpLoadedFragmentA = typename Operator::FragmentA;
-  using WarpLoadedFragmentB = typename Operator::FragmentB;
-  using WarpTransformedFragmentA = typename Operator::TransformedFragmentA;
-  using WarpTransformedFragmentB = typename Operator::TransformedFragmentB;
-
-  using WarpLoadedFragmentVarMean = typename IteratorVarMean::Fragment;
-  using WarpLoadedFragmentGammaBeta =
-      typename WarpIteratorGammaBeta::Fragment;
-
-
- private:
-
-  //
-  // Data members
-  //
-
-  /// Iterator to write threadblock-scoped tile of A operand to shared memory
-  SmemIteratorA smem_iterator_A_;
-
-  /// Iterator to write threadblock-scoped tile of A operand scale vector to shared memory
-  SmemIteratorGammaBeta smem_iterator_A_gamma_beta_;
-
-  /// Iterator to write threadblock-scoped tile of B operand to shared memory
-  SmemIteratorB smem_iterator_B_;
-
-  int warp_idx_m_;
-
-  int warp_idx_n_;
-
-public:
-
-  /// Construct from tensor references
-  CUTLASS_DEVICE
-  MmaLayernormMainloopFusionMultistage(
-      ///< Shared storage needed for internal use by threadblock-scoped GEMM
-      typename Base::SharedStorage &shared_storage,
-      ///< ID within the threadblock
-      int thread_idx,
-      ///< ID of warp
-      int warp_idx,
-      ///< ID of each thread within a warp
-      int lane_idx
-    ):
-      Base(shared_storage, thread_idx, warp_idx, lane_idx),
-      smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
-      smem_iterator_A_gamma_beta_(shared_storage.operand_A_gamma_beta_ref(),
-                                  thread_idx),
-      smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx)
-  {
-    // Compute warp location within threadblock tile by mapping the warp_id to
-    // three coordinates:
-    //   _m: the warp's position within the threadblock along the M dimension
-    //   _n: the warp's position within the threadblock along the N dimension
-    //   _k: the warp's position within the threadblock along the K dimension
-
-    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
-    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
-
-    warp_idx_m_ = warp_idx_mn % Base::WarpCount::kM;
-    warp_idx_n_ = warp_idx_mn / Base::WarpCount::kM;
-
-    // Add per-warp offsets in units of warp-level tiles
-    this->warp_tile_iterator_A_.add_tile_offset(
-        {warp_idx_m_, Base::kWarpGemmIterations * warp_idx_k});
-    this->warp_tile_iterator_A_gamma_beta_.add_tile_offset(
-        {warp_idx_m_, Base::kWarpGemmIterations * warp_idx_k});
-    this->warp_tile_iterator_B_.add_tile_offset(
-        {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n_});
-  }
-
-  CUTLASS_DEVICE
-  void copy_tiles_and_advance(IteratorA &iterator_A,
-                              IteratorGammaBeta &iterator_A_gamma_beta,
-                              IteratorB &iterator_B,
-                              int group_start_A = 0, int group_start_B = 0) {
-    iterator_A.set_iteration_index(group_start_A *
-                                   IteratorA::kAccessesPerVector);
-    this->smem_iterator_A_.set_iteration_index(group_start_A);
-
-    // Async Copy for operand A
-    CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) {
-      if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) {
-        typename IteratorA::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorA::AccessType *>(
-                this->smem_iterator_A_.get());
-
-        int const kSrcBytes = sizeof_bits<typename IteratorA::Element>::value *
-                              IteratorA::ThreadMap::kElementsPerAccess /
-                              IteratorA::kAccessesPerVector / 8;
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
-          auto gmem_ptr = iterator_A.get();
-
-          if (SharedMemoryClear == SharedMemoryClearOption::kZfill) {
-            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
-                dst_ptr + v, gmem_ptr, iterator_A.valid());
-          } else {
-            cutlass::arch::cp_async<kSrcBytes, kCacheOpA>(
-                dst_ptr + v, gmem_ptr, iterator_A.valid());
-          }
-
-          ++iterator_A;
-        }
-
-        ++this->smem_iterator_A_;
-      }
-    }
-
-    // Async Copy for operand A scale and bias vector.  Scale and bias vectors
-    // are small.  One iteration is enough.
-    if (group_start_A == 0) {
-      typename IteratorGammaBeta::AccessType *dst_ptr =
-          reinterpret_cast<typename IteratorGammaBeta::AccessType *>(
-              this->smem_iterator_A_gamma_beta_.get());
-
-      int const kSrcBytes =
-          sizeof_bits<typename IteratorGammaBeta::Element>::value *
-          IteratorGammaBeta::kElementsPerAccess / 8;
-
-      cutlass::arch::cp_async<kSrcBytes, kCacheOpGammaBeta>(
-          dst_ptr, iterator_A_gamma_beta.get(), iterator_A_gamma_beta.valid());
-    }
-
-    iterator_B.set_iteration_index(group_start_B *
-                                   IteratorB::kAccessesPerVector);
-    this->smem_iterator_B_.set_iteration_index(group_start_B);
-
-    // Async Copy for operand B
-    CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) {
-      if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) {
-        typename IteratorB::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorB::AccessType *>(
-                this->smem_iterator_B_.get());
-
-        int const kSrcBytes = sizeof_bits<typename IteratorB::Element>::value *
-                              IteratorB::ThreadMap::kElementsPerAccess /
-                              IteratorB::kAccessesPerVector / 8;
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
-          auto gmem_ptr = iterator_B.get();
-
-          if (SharedMemoryClear == SharedMemoryClearOption::kZfill) {
-            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
-                dst_ptr + v, gmem_ptr, iterator_B.valid());
-          } else {
-            cutlass::arch::cp_async<kSrcBytes, kCacheOpB>(
-                dst_ptr + v, gmem_ptr, iterator_B.valid());
-          }
-
-          ++iterator_B;
-        }
-        ++this->smem_iterator_B_;
-      }
-    }
-  }
-
-  /// Perform a threadblock-scoped matrix multiply-accumulate
-  CUTLASS_DEVICE
-  void operator()(
-      ///< problem size of GEMM
-      int gemm_k_iterations,
-      ///< destination accumulator tile
-      FragmentC &accum,
-      ///< iterator over A operand in global memory
-      IteratorA iterator_A,
-      ///< iterator over B operand in global memory
-      IteratorB iterator_B,
-      ///< iterator over B operand in global memory
-      IteratorVarMean iterator_var_mean,
-      ///< iterator over scale and bias vectors in global memory
-      IteratorGammaBeta iterator_A_gamma_beta,
-      ///< initial value of accumulator
-      FragmentC const &src_accum) {
-
-    //
-    // Prologue
-    //
-    // Issue several complete stages
-
-    WarpLoadedFragmentVarMean warp_loaded_frag_var_mean;
-    iterator_var_mean.add_tile_offset({0, warp_idx_m_});
-    iterator_var_mean.load(warp_loaded_frag_var_mean);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int stage = 0; stage < Base::kStages - 1;
-         ++stage, --gemm_k_iterations) {
-
-      iterator_A.clear_mask(gemm_k_iterations == 0);
-      iterator_A_gamma_beta.clear_mask(gemm_k_iterations == 0);
-      iterator_B.clear_mask(gemm_k_iterations == 0);
-
-      iterator_A.set_iteration_index(0);
-      this->smem_iterator_A_.set_iteration_index(0);
-
-      // Async Copy for operand A
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) {
-        typename IteratorA::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorA::AccessType *>(
-                this->smem_iterator_A_.get());
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
-          int const kSrcBytes =
-              sizeof_bits<typename IteratorA::Element>::value *
-              IteratorA::ThreadMap::kElementsPerAccess /
-              IteratorA::kAccessesPerVector / 8;
-
-          int src_bytes = (iterator_A.valid() ? kSrcBytes : 0);
-
-          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
-              dst_ptr + v, iterator_A.get(), iterator_A.valid());
-
-          ++iterator_A;
-        }
-
-        ++this->smem_iterator_A_;
-      }
-
-      // Async Copy for operand A scale and bias vectors.  Scale and bias
-      // vectors are small.  One iteration is enough.
-      {
-        typename IteratorGammaBeta::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorGammaBeta::AccessType *>(
-                this->smem_iterator_A_gamma_beta_.get());
-
-        int const kSrcBytes =
-            sizeof_bits<typename IteratorGammaBeta::Element>::value *
-            IteratorGammaBeta::kElementsPerAccess / 8;
-
-        cutlass::arch::cp_async<kSrcBytes, kCacheOpGammaBeta>(
-            dst_ptr, iterator_A_gamma_beta.get(), iterator_A_gamma_beta.valid());
-      }
-
-      iterator_B.set_iteration_index(0);
-      this->smem_iterator_B_.set_iteration_index(0);
-
-      // Async Copy for operand B
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) {
-        typename IteratorB::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorB::AccessType *>(
-                this->smem_iterator_B_.get());
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
-          int const kSrcBytes =
-              sizeof_bits<typename IteratorB::Element>::value *
-              IteratorB::ThreadMap::kElementsPerAccess /
-              IteratorB::kAccessesPerVector / 8;
-
-          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
-              dst_ptr + v, iterator_B.get(), iterator_B.valid());
-
-          ++iterator_B;
-        }
-
-        ++this->smem_iterator_B_;
-      }
-
-      // Move to the next stage
-      iterator_A.add_tile_offset({0, 1});
-      iterator_A_gamma_beta.add_tile_offset({0, 1});
-      iterator_B.add_tile_offset({1, 0});
-
-      this->smem_iterator_A_.add_tile_offset({0, 1});
-      this->smem_iterator_A_gamma_beta_.add_tile_offset({0, 1});
-      this->smem_iterator_B_.add_tile_offset({1, 0});
-
-      // Defines the boundary of a stage of cp.async.
-      cutlass::arch::cp_async_fence();
-    }
-
-    // Perform accumulation in the 'd' output operand
-    accum = src_accum;
-
-    // Waits until kStages-2 stages have committed.
-    cutlass::arch::cp_async_wait<Base::kStages - 2>();
-    __syncthreads();
-
-    // Pair of fragments used to overlap shared memory loads and math
-    // instructions
-    WarpLoadedFragmentA warp_loaded_frag_A[2];
-    WarpLoadedFragmentB warp_loaded_frag_B[2];
-    WarpLoadedFragmentGammaBeta warp_loaded_frag_A_gamma_beta[2];
-    WarpTransformedFragmentA warp_transformed_frag_A[2];
-    WarpTransformedFragmentB warp_transformed_frag_B[2];
-
-    Operator warp_mma;
-    cutlass::gemm::warp::LayernormScaleBiasTransform<WarpTransformedFragmentA,
-                                            WarpLoadedFragmentVarMean,
-                                            WarpLoadedFragmentGammaBeta>
-                         elementwise_transform;
- 
-    this->warp_tile_iterator_A_.set_kgroup_index(0);
-    this->warp_tile_iterator_A_gamma_beta_.set_kgroup_index(0);
-    this->warp_tile_iterator_B_.set_kgroup_index(0);
-
-    this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]);
-    this->warp_tile_iterator_A_gamma_beta_.load(
-        warp_loaded_frag_A_gamma_beta[0]);
-    this->warp_tile_iterator_B_.load(warp_loaded_frag_B[0]);
-
-    ++this->warp_tile_iterator_A_;
-    ++this->warp_tile_iterator_A_gamma_beta_;
-    ++this->warp_tile_iterator_B_;
-
-    iterator_A.clear_mask(gemm_k_iterations == 0);
-    iterator_A_gamma_beta.clear_mask(gemm_k_iterations == 0);
-    iterator_B.clear_mask(gemm_k_iterations == 0);
-
-    int smem_write_stage_idx = Base::kStages - 1;
-    int smem_read_stage_idx = 0;
-
-    warp_mma.transform(warp_transformed_frag_A[0], warp_transformed_frag_B[0],
-                       warp_loaded_frag_A[0], warp_loaded_frag_B[0]);
-
-    elementwise_transform(warp_transformed_frag_A[0],
-                         warp_loaded_frag_var_mean,
-                         warp_loaded_frag_A_gamma_beta[0]);
-
-    //
-    // Mainloop
-    //
-
-    CUTLASS_GEMM_LOOP
-    for (; gemm_k_iterations > (-Base::kStages + 1);) {
-      //
-      // Loop over GEMM K dimension
-      //
-
-      // Computes a warp-level GEMM on data held in shared memory
-      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
-      CUTLASS_PRAGMA_UNROLL
-      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations;
-           ++warp_mma_k) {
-
-        // Load warp-level tiles from shared memory, wrapping to k offset if
-        // this is the last group as the case may be.
-
-        this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
-        this->warp_tile_iterator_A_gamma_beta_.set_kgroup_index(
-            (warp_mma_k + 1) % Base::kWarpGemmIterations);
-        this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
-        
-        this->warp_tile_iterator_A_.load(warp_loaded_frag_A[(warp_mma_k + 1) % 2]);
-        this->warp_tile_iterator_A_gamma_beta_.load(
-            warp_loaded_frag_A_gamma_beta[(warp_mma_k + 1) % 2]);
-        this->warp_tile_iterator_B_.load(warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
-
-        ++this->warp_tile_iterator_A_;
-        ++this->warp_tile_iterator_A_gamma_beta_;
-        ++this->warp_tile_iterator_B_;
-
-        if (warp_mma_k > 0) {
-          warp_mma.transform(warp_transformed_frag_A[warp_mma_k % 2],
-                             warp_transformed_frag_B[warp_mma_k % 2],
-                             warp_loaded_frag_A[warp_mma_k % 2],
-                             warp_loaded_frag_B[warp_mma_k % 2]);
-
-          elementwise_transform(warp_transformed_frag_A[warp_mma_k % 2],
-                               warp_loaded_frag_var_mean,
-                               warp_loaded_frag_A_gamma_beta[warp_mma_k % 2]);
-        }
-
-        warp_mma(
-          accum, 
-          warp_transformed_frag_A[warp_mma_k % 2],
-          warp_transformed_frag_B[warp_mma_k % 2], 
-          accum
-        );
-
-        // Issue global->shared copies for the this stage
-        if (warp_mma_k < Base::kWarpGemmIterations - 1) {
-          int group_start_iteration_A, group_start_iteration_B;
-
-          group_start_iteration_A = warp_mma_k * Detail::kAccessesPerGroupA;
-          group_start_iteration_B = warp_mma_k * Detail::kAccessesPerGroupB;
-
-          copy_tiles_and_advance(iterator_A, iterator_A_gamma_beta, iterator_B,
-	  		       group_start_iteration_A, 
-                               group_start_iteration_B);
-        }
-
-        if (warp_mma_k + 2 == Base::kWarpGemmIterations) {
-          int group_start_iteration_A, group_start_iteration_B;
-          group_start_iteration_A =
-              (warp_mma_k + 1) * Detail::kAccessesPerGroupA;
-          group_start_iteration_B =
-              (warp_mma_k + 1) * Detail::kAccessesPerGroupB;
-
-          copy_tiles_and_advance(iterator_A, iterator_A_gamma_beta, iterator_B,
-	                               group_start_iteration_A, 
-                                 group_start_iteration_B);
-
-          // Inserts a memory fence between stages of cp.async instructions.
-          cutlass::arch::cp_async_fence();
-
-          // Waits until kStages-2 stages have committed.
-          arch::cp_async_wait<Base::kStages - 2>();
-          __syncthreads();
-
-          // Move to the next stage
-          iterator_A.add_tile_offset({0, 1});
-          iterator_A_gamma_beta.add_tile_offset({0, 1});
-          iterator_B.add_tile_offset({1, 0});
-
-          this->smem_iterator_A_.add_tile_offset({0, 1});
-          this->smem_iterator_A_gamma_beta_.add_tile_offset({0, 1});
-          this->smem_iterator_B_.add_tile_offset({1, 0});
-
-          // Add negative offsets to return iterators to the 'start' of the
-          // circular buffer in shared memory
-          if (smem_write_stage_idx == (Base::kStages - 1)) {
-            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
-            this->smem_iterator_A_gamma_beta_.add_tile_offset({0, -Base::kStages});
-            this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
-            smem_write_stage_idx = 0;
-          } else {
-            ++smem_write_stage_idx;
-          }
-
-          if (smem_read_stage_idx == (Base::kStages - 1)) {
-            this->warp_tile_iterator_A_.add_tile_offset(
-                {0, -Base::kStages * Policy::kPartitionsK *
-                        Base::kWarpGemmIterations});
-            this->warp_tile_iterator_A_gamma_beta_.add_tile_offset(
-                {0, -Base::kStages * Policy::kPartitionsK *
-                        Base::kWarpGemmIterations});
-            this->warp_tile_iterator_B_.add_tile_offset(
-                {-Base::kStages * Policy::kPartitionsK *
-                     Base::kWarpGemmIterations,
-                 0});
-            smem_read_stage_idx = 0;
-          } else {
-            ++smem_read_stage_idx;
-          }
-
-          --gemm_k_iterations;
-          iterator_A.clear_mask(gemm_k_iterations == 0);
-          iterator_A_gamma_beta.clear_mask(gemm_k_iterations == 0);
-          iterator_B.clear_mask(gemm_k_iterations == 0);
-        }
-
-        // Do any conversions feeding the first stage at the end of the loop so
-        // we can start right away on mma instructions
-        if (warp_mma_k + 1 == Base::kWarpGemmIterations) {
-          warp_mma.transform(warp_transformed_frag_A[(warp_mma_k + 1) % 2],
-                             warp_transformed_frag_B[(warp_mma_k + 1) % 2],
-                             warp_loaded_frag_A[(warp_mma_k + 1) % 2],
-                             warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
-
-          elementwise_transform(
-              warp_transformed_frag_A[(warp_mma_k + 1) % 2],
-              warp_loaded_frag_var_mean,
-              warp_loaded_frag_A_gamma_beta[(warp_mma_k + 1) % 2]);
-        }
-      }
-
-    }
-    
-    // commit and drain all pending and predicated cp.async pnz from the GEMM mainloop
-    cutlass::arch::cp_async_fence();
-    cutlass::arch::cp_async_wait<0>();
-    __syncthreads();
-
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace gemm
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_multistage.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_multistage.h
deleted file mode 100755
index ef5513170..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_multistage.h
+++ /dev/null
@@ -1,741 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
-*/
-
-#pragma once
-
-
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/arch/memory.h"
-#include "cutlass/array.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/numeric_types.h"
-
-#include "cutlass/gemm/threadblock/mma_base.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Structure to compute the matrix product targeting CUDA cores and SIMT math
-/// instructions.
-template <
-    /// Size of the Gemm problem - concept: gemm::GemmShape<>
-    typename Shape_,
-    /// Iterates over tiles of A operand in global memory
-    //  (concept: ReadableTileIterator | ForwardTileIterator |
-    //  MaskedTileIterator)
-    typename IteratorA_,
-    /// Iterates over tiles of A operand in shared memory
-    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-    typename SmemIteratorA_,
-    /// Cache operation for operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Iterates over tiles of B operand in global memory
-    //  (concept: ReadableTileIterator | ForwardTileIterator |
-    //  MaskedTileIterator)
-    typename IteratorB_,
-    /// Iterates over tiles of B operand in shared memory
-    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-    typename SmemIteratorB_,
-    /// Cache operation for operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB,
-    /// Data type of accumulator matrix
-    typename ElementC_,
-    /// Data type of accumulator matrix
-    typename LayoutC_,
-    /// Policy describing tuning details (concept: MmaPolicy)
-    typename Policy_,
-    /// Number of stages,
-    int Stages,
-    /// Use zfill or predicate for out-of-bound cp.async
-    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone,
-    /// Used for partial specialization
-    typename Enable = bool>
-class MmaMultistage : 
-  public MmaBase<Shape_, Policy_, Stages> {
-public:
-  ///< Base class
-  using Base = MmaBase<Shape_, Policy_, Stages>;
-  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-  ///< Iterates over tiles of A operand in global memory
-  using IteratorA = IteratorA_;
-  ///< Iterates over tiles of B operand in global memory
-  using IteratorB = IteratorB_;
-  ///< Data type of accumulator matrix
-  using ElementC = ElementC_;
-  ///< Layout of accumulator matrix
-  using LayoutC = LayoutC_;
-  ///< Policy describing tuning details
-  using Policy = Policy_;
-
-  using SmemIteratorA = SmemIteratorA_;
-  using SmemIteratorB = SmemIteratorB_;
-
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
-
-  //
-  // Dependent types
-  //
-
-  /// Fragment of accumulator tile
-  using FragmentC = typename Policy::Operator::FragmentC;
-
-  /// Warp-level Mma
-  using Operator = typename Policy::Operator;
-
-  /// Minimum architecture is Sm80 to support cp.async
-  using ArchTag = arch::Sm80;
-
-  /// Complex transform on A operand
-  static ComplexTransform const kTransformA = Operator::kTransformA;
-
-  /// Complex transform on B operand
-  static ComplexTransform const kTransformB = Operator::kTransformB;
-
-  /// Internal structure exposed for introspection.
-  struct Detail {
-
-    /// Number of cp.async instructions to load one stage of operand A
-    static int const AsyncCopyIterationsPerStageA =
-        IteratorA::ThreadMap::Iterations::kCount;
-
-    /// Number of cp.async instructions to load one stage of operand B
-    static int const AsyncCopyIterationsPerStageB =
-        IteratorB::ThreadMap::Iterations::kCount;
-
-    /// Number of stages
-    static int const kStages = Stages;
-
-    /// Number of cp.async instructions to load on group of operand A
-    static int const kAccessesPerGroupA =
-        (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
-
-    /// Number of cp.async instructions to load on group of operand B
-    static int const kAccessesPerGroupB =
-        (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
-
-    // Optional staged-accumulation (e.g., tf32x3 kernels) for improved numerical
-    // accuracy, where each mainloop iteration first accumulates into a temporary
-    // set of freshly-cleared accumulators, which are subsequently added to the
-    // final accumulator set.
-    static bool const kStagedAccumulation = arch::detail::UseStagedAccumulation<Operator>::value;
-  };
-
- private:
-
-
-  // Structure encapsulating pipeline state live from one iteration to the next
-  struct PipeState {
-
-    using WarpLoadedFragmentA = typename Operator::FragmentA;
-    using WarpLoadedFragmentB = typename Operator::FragmentB;
-    using WarpTransformedFragmentA = typename Operator::TransformedFragmentA;
-    using WarpTransformedFragmentB = typename Operator::TransformedFragmentB;
-
-    /// Temporary accumulator to facilitate staged-accumulation
-    FragmentC tmp_accum_;
-
-    /// Pair of A fragments used to overlap shared memory loads and math instructions
-    WarpLoadedFragmentA warp_loaded_frag_A_[2];
-    WarpTransformedFragmentA warp_transformed_frag_A_[2];
-
-    /// Pair of B fragments used to overlap shared memory loads and math instructions
-    WarpLoadedFragmentB warp_loaded_frag_B_[2];
-    WarpTransformedFragmentB warp_transformed_frag_B_[2];
-  };
-
-
- private:
-
-  //
-  // Data members
-  //
-
-  /// Warp-level MMA operator
-  Operator warp_mma_;
-
-  /// Iterator to write threadblock-scoped tile of A operand to shared memory
-  SmemIteratorA smem_iterator_A_;
-
-  /// Iterator to write threadblock-scoped tile of B operand to shared memory
-  SmemIteratorB smem_iterator_B_;
-
-  /// Shared memory write stage index
-  int smem_write_stage_idx_;
-
-  /// Shared memory read stage index
-  int smem_read_stage_idx_;
-
-
-public:
-
-  /// Construct from tensor references
-  CUTLASS_DEVICE
-  MmaMultistage(
-      ///< Shared storage needed for internal use by threadblock-scoped GEMM
-      typename Base::SharedStorage &shared_storage,
-      ///< ID within the threadblock
-      int thread_idx,
-      ///< ID of warp
-      int warp_idx,
-      ///< ID of each thread within a warp
-      int lane_idx
-    ):
-      Base(shared_storage, thread_idx, warp_idx, lane_idx),
-      smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
-      smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx),
-      smem_write_stage_idx_(0),
-      smem_read_stage_idx_(0)
-  {
-    // Compute warp location within threadblock tile by mapping the warp_id to
-    // three coordinates:
-    //   _m: the warp's position within the threadblock along the M dimension
-    //   _n: the warp's position within the threadblock along the N dimension
-    //   _k: the warp's position within the threadblock along the K dimension
-
-    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
-    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
-
-    int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
-    int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
-
-    // Add per-warp offsets in units of warp-level tiles
-    this->warp_tile_iterator_A_.add_tile_offset(
-        {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
-    this->warp_tile_iterator_B_.add_tile_offset(
-        {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
-  }
-
-  /// Advance shared memory read-iterators to the next stage
-  CUTLASS_DEVICE
-  void advance_smem_read_stage()
-  {
-    ++smem_read_stage_idx_;
-
-    if (smem_read_stage_idx_ == Base::kStages) {
-      // Wrap back around to the 'start' of the circular buffer in shared memory
-      this->warp_tile_iterator_A_.add_tile_offset({0, -Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations});
-      this->warp_tile_iterator_B_.add_tile_offset({-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations, 0});
-      smem_read_stage_idx_ = 0;
-    }
-  }
-
-  /// Advance global memory read-iterators and shared memory write-iterators to the stage
-  CUTLASS_DEVICE
-  void advance_smem_write_stage(
-    IteratorA &iterator_A,
-    IteratorB &iterator_B)
-  {
-    // Advance global iterators
-    iterator_A.add_tile_offset({0, 1});
-    iterator_B.add_tile_offset({1, 0});
-
-    // Advance shared iterators
-    smem_iterator_A_.add_tile_offset({0, 1});
-    smem_iterator_B_.add_tile_offset({1, 0});
-
-    // Increment shared memory write stage index
-    ++smem_write_stage_idx_;
-
-    if (smem_write_stage_idx_ == Base::kStages) {
-      // Wrap back around to the 'start' of the circular buffer in shared memory
-      smem_iterator_A_.add_tile_offset({0, -Base::kStages});
-      smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
-      smem_write_stage_idx_ = 0;
-    }
-  }
-
-  CUTLASS_DEVICE
-  void copy_tiles_and_advance(IteratorA &iterator_A, IteratorB &iterator_B,
-                              int group_start_A = 0, int group_start_B = 0) {
-    iterator_A.set_iteration_index(group_start_A *
-                                   IteratorA::kAccessesPerVector);
-    this->smem_iterator_A_.set_iteration_index(group_start_A);
-
-    // Async Copy for operand A
-    CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) {
-      if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) {
-        typename IteratorA::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorA::AccessType *>(
-                this->smem_iterator_A_.get());
-
-        int const kSrcBytes = sizeof_bits<typename IteratorA::Element>::value *
-                              IteratorA::ThreadMap::kElementsPerAccess /
-                              IteratorA::kAccessesPerVector / 8;
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
-          auto gmem_ptr = iterator_A.get();
-
-          if (SharedMemoryClear == SharedMemoryClearOption::kZfill) {
-            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
-                dst_ptr + v, gmem_ptr, iterator_A.valid());
-          } else {
-            cutlass::arch::cp_async<kSrcBytes, kCacheOpA>(
-                dst_ptr + v, gmem_ptr, iterator_A.valid());
-          }
-
-          ++iterator_A;
-        }
-
-        ++this->smem_iterator_A_;
-      }
-    }
-
-    iterator_B.set_iteration_index(group_start_B *
-                                   IteratorB::kAccessesPerVector);
-    this->smem_iterator_B_.set_iteration_index(group_start_B);
-
-    // Async Copy for operand B
-    CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) {
-      if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) {
-        typename IteratorB::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorB::AccessType *>(
-                this->smem_iterator_B_.get());
-
-        int const kSrcBytes = sizeof_bits<typename IteratorB::Element>::value *
-                              IteratorB::ThreadMap::kElementsPerAccess /
-                              IteratorB::kAccessesPerVector / 8;
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
-          auto gmem_ptr = iterator_B.get();
-
-          if (SharedMemoryClear == SharedMemoryClearOption::kZfill) {
-            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
-                dst_ptr + v, gmem_ptr, iterator_B.valid());
-          } else {
-            cutlass::arch::cp_async<kSrcBytes, kCacheOpB>(
-                dst_ptr + v, gmem_ptr, iterator_B.valid());
-          }
-
-          ++iterator_B;
-        }
-        ++this->smem_iterator_B_;
-      }
-    }
-  }
-
-  /// GEMM prologue.  Bootstrap the global->shared memory pipeline by fetching
-  /// the global fragments needed by the first kStages-1 threadblock mainloop iterations
-  CUTLASS_DEVICE
-  void prologue(
-    IteratorA &iterator_A,      ///< [in|out] iterator over A operand in global memory
-    IteratorB &iterator_B,      ///< [in|out] iterator over B operand in global memory
-    int &gemm_k_iterations)     ///< [in|out] number of threadblock mainloop iterations remaining
-  {
-    // Issue several complete stages
-    CUTLASS_PRAGMA_UNROLL
-    for (int stage = 0; stage < Base::kStages - 1; ++stage, --gemm_k_iterations) {
-
-      // Disable global fetching if done with global fetch iterations
-      iterator_A.clear_mask(gemm_k_iterations == 0);
-      iterator_B.clear_mask(gemm_k_iterations == 0);
-
-      iterator_A.set_iteration_index(0);
-      this->smem_iterator_A_.set_iteration_index(0);
-
-      // Async Copy for operand A
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) {
-        typename IteratorA::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorA::AccessType *>(
-                this->smem_iterator_A_.get());
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
-          int const kSrcBytes =
-              sizeof_bits<typename IteratorA::Element>::value *
-              IteratorA::ThreadMap::kElementsPerAccess /
-              IteratorA::kAccessesPerVector / 8;
-
-          int src_bytes = (iterator_A.valid() ? kSrcBytes : 0);
-
-          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
-              dst_ptr + v, iterator_A.get(), iterator_A.valid());
-
-          ++iterator_A;
-        }
-
-        ++this->smem_iterator_A_;
-      }
-
-      iterator_B.set_iteration_index(0);
-      this->smem_iterator_B_.set_iteration_index(0);
-
-      // Async Copy for operand B
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) {
-        typename IteratorB::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorB::AccessType *>(
-                this->smem_iterator_B_.get());
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
-          int const kSrcBytes =
-              sizeof_bits<typename IteratorB::Element>::value *
-              IteratorB::ThreadMap::kElementsPerAccess /
-              IteratorB::kAccessesPerVector / 8;
-
-          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
-              dst_ptr + v, iterator_B.get(), iterator_B.valid());
-
-          ++iterator_B;
-        }
-
-        ++this->smem_iterator_B_;
-      }
-
-      // Move to the next write stage
-      advance_smem_write_stage(iterator_A, iterator_B);
-
-      // Defines the boundary of a stage of cp.async.
-      cutlass::arch::cp_async_fence();
-    }
-
-    // Optionally clear the remaining stages of SMEM. This is a functional requirement for
-    // some kernels so that all accumulator elements outside the GEMM footprint are zero.
-    if (SharedMemoryClear == SharedMemoryClearOption::kClearLastStage) {
-
-      /// Iterator to write threadblock-scoped tile of A operand to shared memory
-      SmemIteratorA last_smem_iterator_A(this->smem_iterator_A_);
-      typename IteratorA::AccessType zero_A;
-
-      zero_A.clear();
-      last_smem_iterator_A.set_iteration_index(0);
-
-      // Async Copy for operand A
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) {
-
-        typename IteratorA::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorA::AccessType *>(
-                last_smem_iterator_A.get());
-
-        *dst_ptr = zero_A;
-
-        ++last_smem_iterator_A;
-      }
-
-      /// Iterator to write threadblock-scoped tile of B operand to shared memory
-      SmemIteratorB last_smem_iterator_B(this->smem_iterator_B_);
-      typename IteratorB::AccessType zero_B;
-
-      zero_B.clear();
-      last_smem_iterator_B.set_iteration_index(0);
-
-      // Async Copy for operand B
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) {
-
-        typename IteratorB::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorB::AccessType *>(
-                last_smem_iterator_B.get());
-
-        *dst_ptr = zero_B;
-
-        ++last_smem_iterator_B;
-      }
-    }
-  }
-
-
-  /// Wait until we have at least one completed global fetch stage
-  CUTLASS_DEVICE
-  void gmem_wait()
-  {
-    // Wait until we have at least one committed global fetch stage. (#uncommitted = Base::kStages - 1 - #committed)
-    cutlass::arch::cp_async_wait<Base::kStages - 2>();
-    __syncthreads();
-  }
-
-
-  /// Perform a threadblock mainloop iteration of matrix multiply-accumulate
-  CUTLASS_DEVICE
-  void mac_loop_iter(
-    PipeState &pipe_state,          ///< [in|out] loop-carried pipeline state
-    FragmentC &accum,               ///< [in|out] destination accumulator tile
-    IteratorA &iterator_A,          ///< [in|out] iterator over A operand in global memory
-    IteratorB &iterator_B,          ///< [in|out] iterator over B operand in global memory
-    int &gemm_k_iterations)         ///< [in|out] number of threadblock mainloop iterations remaining
-  {
-    // Unroll the warp-level MMA tiles of a threadblock's mainloop iteration
-    CUTLASS_PRAGMA_UNROLL
-    for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) {
-
-      // Load the next warp-tile's A fragment from shared memory
-      this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
-      this->warp_tile_iterator_A_.load(pipe_state.warp_loaded_frag_A_[(warp_mma_k + 1) % 2]);
-      ++this->warp_tile_iterator_A_;
-
-      // Load the next warp-tile's B fragment from shared memory
-      this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
-      this->warp_tile_iterator_B_.load(pipe_state.warp_loaded_frag_B_[(warp_mma_k + 1) % 2]);
-      ++this->warp_tile_iterator_B_;
-
-      // Except for the first warp-tile, all warp-tiles convert their incoming shared memory fragments as necessary
-      if (warp_mma_k > 0) {
-        warp_mma_.transform(
-          pipe_state.warp_transformed_frag_A_[warp_mma_k % 2],
-          pipe_state.warp_transformed_frag_B_[warp_mma_k % 2],
-          pipe_state.warp_loaded_frag_A_[warp_mma_k % 2],
-          pipe_state.warp_loaded_frag_B_[warp_mma_k % 2]);
-      }
-
-      // Execute the current warp-tile of MMA operations
-      if (Detail::kStagedAccumulation) {
-        warp_mma_(
-          pipe_state.tmp_accum_,
-          pipe_state.warp_transformed_frag_A_[warp_mma_k % 2],
-          pipe_state.warp_transformed_frag_B_[warp_mma_k % 2],
-          pipe_state.tmp_accum_
-        );
-
-        if (warp_mma_k == 0) {
-          plus<FragmentC> plus_accum;
-          accum = plus_accum(accum, pipe_state.tmp_accum_);
-          pipe_state.tmp_accum_.clear();
-        }
-      } else {
-        warp_mma_(
-          accum,
-          pipe_state.warp_transformed_frag_A_[warp_mma_k % 2],
-          pipe_state.warp_transformed_frag_B_[warp_mma_k % 2],
-          accum
-        );
-      }
-
-      // Except for the last warp-tile, all warp-tiles issue their share of
-      // global->shared fragment copies
-      if (warp_mma_k < Base::kWarpGemmIterations - 1) {
-
-        int group_start_iteration_A, group_start_iteration_B;
-        group_start_iteration_A = warp_mma_k * Detail::kAccessesPerGroupA;
-        group_start_iteration_B = warp_mma_k * Detail::kAccessesPerGroupB;
-
-        copy_tiles_and_advance(
-            iterator_A,
-            iterator_B,
-            group_start_iteration_A,
-            group_start_iteration_B);
-      }
-
-      // The second-to-last warp-tile also:
-      //   - performs the last warp-tile's share of global->shared fragment copies
-      //   - moves to the next global fetch stage
-      if (warp_mma_k + 2 == Base::kWarpGemmIterations) {
-
-        // Performs the last warp-tile's share of global->shared fragment copies
-        int group_start_iteration_A = (warp_mma_k + 1) * Detail::kAccessesPerGroupA;
-        int group_start_iteration_B = (warp_mma_k + 1) * Detail::kAccessesPerGroupB;
-
-        copy_tiles_and_advance(
-          iterator_A,
-          iterator_B,
-          group_start_iteration_A,
-          group_start_iteration_B);
-
-        // Inserts a memory fence between stages of cp.async instructions.
-        cutlass::arch::cp_async_fence();
-
-        // Wait until we have at least one completed global fetch stage
-        gmem_wait();
-
-        // Move to the next global fetch stage
-        advance_smem_write_stage(iterator_A, iterator_B);
-        advance_smem_read_stage();
-
-        // Disable global fetching when done with global fetch iterations
-        --gemm_k_iterations;
-        iterator_A.clear_mask(gemm_k_iterations == 0);
-        iterator_B.clear_mask(gemm_k_iterations == 0);
-      }
-
-      // The last warp-tile also converts the shared memory fragments used by
-      // the first warp-tile of the next iteration, if necessary (so we can
-      // immediately start issuing MMA instructions at the top of the loop )
-      if (warp_mma_k + 1 == Base::kWarpGemmIterations) {
-
-        warp_mma_.transform(
-          pipe_state.warp_transformed_frag_A_[(warp_mma_k + 1) % 2],
-          pipe_state.warp_transformed_frag_B_[(warp_mma_k + 1) % 2],
-          pipe_state.warp_loaded_frag_A_[(warp_mma_k + 1) % 2],
-          pipe_state.warp_loaded_frag_B_[(warp_mma_k + 1) % 2]);
-      }
-
-    }
-  }
-
-
-  /// Perform the specified number of threadblock mainloop iterations of matrix
-  /// multiply-accumulate.  Assumes prologue has been initiated.
-  CUTLASS_DEVICE
-  void gemm_iters(
-      int gemm_k_iterations,        ///< number of threadblock mainloop iterations
-      FragmentC &accum,             ///< [in|out] accumulator tile
-      IteratorA &iterator_A,        ///< [in|out] iterator over A operand in global memory
-      IteratorB &iterator_B)        ///< [in|out] iterator over B operand in global memory
-  {
-    PipeState pipe_state;
-
-    // Disable global fetching if done with global fetch iterations
-    iterator_A.clear_mask(gemm_k_iterations == 0);
-    iterator_B.clear_mask(gemm_k_iterations == 0);
-
-    // Load first warp-tile's A fragment from shared memory
-    this->warp_tile_iterator_A_.set_kgroup_index(0);
-    this->warp_tile_iterator_A_.load(pipe_state.warp_loaded_frag_A_[0]);
-    ++this->warp_tile_iterator_A_;
-
-    // Load first warp-tile's B fragment from shared memory
-    this->warp_tile_iterator_B_.set_kgroup_index(0);
-    this->warp_tile_iterator_B_.load(pipe_state.warp_loaded_frag_B_[0]);
-    ++this->warp_tile_iterator_B_;
-
-    // Transform, if necessary, the first warp-tile's shared memory fragments
-    warp_mma_.transform(
-      pipe_state.warp_transformed_frag_A_[0],
-      pipe_state.warp_transformed_frag_B_[0],
-      pipe_state.warp_loaded_frag_A_[0],
-      pipe_state.warp_loaded_frag_B_[0]);
-
-    if (Detail::kStagedAccumulation) {
-      pipe_state.tmp_accum_.clear();
-    }
-
-    // Mainloop
-    CUTLASS_GEMM_LOOP
-    for (; gemm_k_iterations > (-Base::kStages + 1);) {
-      mac_loop_iter(
-        pipe_state,
-        accum,
-        iterator_A,
-        iterator_B,
-        gemm_k_iterations);
-    }
-
-    if (Detail::kStagedAccumulation) {
-      plus<FragmentC> plus_accum;
-      accum = plus_accum(accum, pipe_state.tmp_accum_);
-    }
-
-    // Commit and drain all pending and predicated cp.async pnz from the GEMM mainloop
-    cutlass::arch::cp_async_fence();
-    cutlass::arch::cp_async_wait<0>();
-    __syncthreads();
-
-  }
-
-
-  /// Prepares the class for another prologue.
-  CUTLASS_DEVICE
-  void wind_down()
-  {
-    // Catch-up the smem-read iterator to the smem-write iterator (so this class can be reused for another tile's prologue)
-
-    // First, increment remaining warp tiles to get to the next full stage.  (Ideally we would
-    // just decrement one tile, but not all iterators implement --() decrement.)
-    #pragma unroll
-    for (int warp_mma_k = 1; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k)
-    {
-      this->warp_tile_iterator_A_.set_kgroup_index(warp_mma_k);
-      this->warp_tile_iterator_B_.set_kgroup_index(warp_mma_k);
-
-      ++this->warp_tile_iterator_A_;
-      ++this->warp_tile_iterator_B_;
-    }
-    smem_read_stage_idx_++;
-
-    // Then wrap back two full stages (one for the tile advancing we just did, and one to catch the write iterators)
-    static const int kStageIters = Policy::kPartitionsK * Base::kWarpGemmIterations;
-    if (smem_read_stage_idx_ > 1)
-    {
-      this->warp_tile_iterator_A_.add_tile_offset({0, (-2 * kStageIters)});
-      this->warp_tile_iterator_B_.add_tile_offset({(-2 * kStageIters), 0});
-    }
-    else
-    {
-      this->warp_tile_iterator_A_.add_tile_offset({0, ((Base::kStages - 2) * kStageIters)});
-      this->warp_tile_iterator_B_.add_tile_offset({((Base::kStages - 2) * kStageIters), 0});
-    }
-    smem_read_stage_idx_ = smem_write_stage_idx_;
-  }
-
-
-  /// Perform a threadblock-scoped matrix multiply-accumulate
-  CUTLASS_DEVICE
-  void operator()(
-      ///< problem size of GEMM
-      int gemm_k_iterations,
-      ///< destination accumulator tile
-      FragmentC &accum,
-      ///< iterator over A operand in global memory
-      IteratorA iterator_A,
-      ///< iterator over B operand in global memory
-      IteratorB iterator_B,
-      ///< initial value of accumulator
-      FragmentC const &src_accum) {
-
-    // Prologue (start fetching iterations of global fragments into shared memory)
-    prologue(iterator_A, iterator_B, gemm_k_iterations);
-
-    // Wait until we have at least one completed global fetch stage
-    gmem_wait();
-
-    // Initialize destination accumulators with source accumulators
-    accum = src_accum;
-
-    // Perform the MAC-iterations
-    gemm_iters(gemm_k_iterations, accum, iterator_A, iterator_B);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace gemm
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_pipelined.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_pipelined.h
deleted file mode 100755
index 89681ebce..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_pipelined.h
+++ /dev/null
@@ -1,439 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/numeric_conversion.h"
-
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/threadblock/mma_base.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions.
-template <
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  typename Shape_,
-  /// Iterates over tiles of A operand in global memory 
-  //  (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
-  typename IteratorA_,
-  /// Iterates over tiles of A operand in shared memory
-  /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-  typename SmemIteratorA_,
-  /// Iterates over tiles of B operand in global memory
-  //  (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
-  typename IteratorB_,
-  /// Iterates over tiles of B operand in shared memory
-  /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-  typename SmemIteratorB_,
-  /// Data type of accumulator matrix
-  typename ElementC_,
-  /// Data type of accumulator matrix
-  typename LayoutC_,
-  /// Policy describing tuning details (concept: MmaPolicy)
-  typename Policy_,
-  /// Transformation applied to A operand
-  typename TransformA_ = NumericArrayConverter<
-    typename SmemIteratorA_::Element, 
-    typename IteratorA_::Element, 
-    IteratorA_::Fragment::kElements>,
-  ///
-  /// Transformation applied to B operand
-  typename TransformB_ = NumericArrayConverter<
-    typename SmemIteratorB_::Element, 
-    typename IteratorB_::Element, 
-    IteratorB_::Fragment::kElements>,
-  /// Used for partial specialization
-  typename Enable = bool
->
-class MmaPipelined : public MmaBase<Shape_, Policy_, 2> {
-public:
-
-  ///< Base class
-  using Base = MmaBase<Shape_, Policy_, 2>;
-
-  using Shape = Shape_;             ///< Size of the Gemm problem - concept: gemm::GemmShape<>
-  using IteratorA = IteratorA_;     ///< Iterates over tiles of A operand in global memory
-  using IteratorB = IteratorB_;     ///< Iterates over tiles of B operand in global memory
-  using ElementC = ElementC_;       ///< Data type of accumulator matrix
-  using LayoutC = LayoutC_;         ///< Layout of accumulator matrix
-  using Policy = Policy_;           ///< Policy describing tuning details
-
-  using SmemIteratorA = SmemIteratorA_;
-  using SmemIteratorB = SmemIteratorB_;
-
-  using TransformA = TransformA_;
-  using TransformB = TransformB_;
-
-  //
-  // Dependent types
-  //
-
-  /// Fragment of operand A loaded from global memory
-  using FragmentA = typename IteratorA::Fragment;
-
-  /// Fragment of operand B loaded from global memory
-  using FragmentB = typename IteratorB::Fragment;
-
-  /// Fragment of accumulator tile
-  using FragmentC = typename Policy::Operator::FragmentC;
-
-  /// Warp-level Mma
-  using Operator = typename Policy::Operator;
-
-  /// Obtain the arch tag from the warp-level operator
-  using ArchTag = typename Policy::Operator::ArchTag;
-
-  /// Complex transform on A operand
-  static ComplexTransform const kTransformA = Operator::kTransformA;
-
-  /// Complex transform on B operand
-  static ComplexTransform const kTransformB = Operator::kTransformB;
-
-  // staticaly assert kStages for MmaPipelined is two (Double-buffered pipeline)
-  static_assert((Base::kStages==2), "MmaPipelined requires kStages set to value 2");
-
-protected:
-
-  //
-  // Data members
-  //
-
-  /// Warp-level MMA operator
-  Operator warp_mma;
-
-  /// Iterator to write threadblock-scoped tile of A operand to shared memory
-  SmemIteratorA smem_iterator_A_;
-
-  /// Iterator to write threadblock-scoped tile of B operand to shared memory
-  SmemIteratorB smem_iterator_B_;
-
-  ///< transformation applied to A fragment
-  TransformA transform_A_;
-
-  ///< transformation applied to B fragment
-  TransformB transform_B_;
-
-  /// Shared memory write stage index
-  int smem_write_stage_idx;
-
-public:
-
-  /// Construct from tensor references
-  CUTLASS_DEVICE
-  MmaPipelined(
-    typename Base::SharedStorage &shared_storage,       ///< Shared storage needed for internal use by threadblock-scoped GEMM
-    int thread_idx,                                     ///< ID within the threadblock
-    int warp_idx,                                       ///< ID of warp
-    int lane_idx,                                       ///< ID of each thread within a warp
-    TransformA transform_A = TransformA(),              ///< transformation applied to A fragment
-    TransformB transform_B = TransformB()               ///< transformation applied to B fragment
-  ):
-    Base(shared_storage, thread_idx, warp_idx, lane_idx),
-    smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
-    smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx),
-    transform_A_(transform_A),
-    transform_B_(transform_B),
-    smem_write_stage_idx(0)
-  {
-
-    // Compute warp location within threadblock tile by mapping the warp_id to
-    // three coordinates:
-    //   _m: the warp's position within the threadblock along the M dimension
-    //   _n: the warp's position within the threadblock along the N dimension
-    //   _k: the warp's position within the threadblock along the K dimension
-
-    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
-    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
-
-    int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
-    int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
-
-    // Add per-warp offsets in units of warp-level tiles
-    this->warp_tile_iterator_A_.add_tile_offset({warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
-    this->warp_tile_iterator_B_.add_tile_offset({Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
-  }
-
-
-  /// Advance shared memory write-iterators to the next stage
-  CUTLASS_DEVICE
-  void advance_smem_write_stage()
-  {
-    ++this->smem_iterator_A_;
-    ++this->smem_iterator_B_;
-
-    // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
-    if (smem_write_stage_idx == 1) {
-      this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
-      this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
-    }
-
-    smem_write_stage_idx ^= 1;
-  }
-
-  /// Advance shared memory read- and write-iterators to the next stage
-  CUTLASS_DEVICE
-  void advance_smem_stages()
-  {
-    ++this->smem_iterator_A_;
-    ++this->smem_iterator_B_;
-
-    // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
-    if (smem_write_stage_idx == 1) {
-      // wrap write stage
-      this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
-      this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
-    }
-    else
-    {
-      // wrap read stage
-      this->warp_tile_iterator_A_.add_tile_offset(
-        {0, -Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations});
-      this->warp_tile_iterator_B_.add_tile_offset(
-        {-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations, 0});
-    }
-
-    smem_write_stage_idx ^= 1;
-  }
-
-
-  /// GEMM prologue.  Bootstrap the global->shared memory pipeline by fetching
-  /// the global fragments needed by the first kStages-1 threadblock mainloop iterations
-  CUTLASS_DEVICE
-  void prologue(
-    IteratorA &iterator_A,      ///< [in|out] iterator over A operand in global memory
-    IteratorB &iterator_B,      ///< [in|out] iterator over B operand in global memory
-    int &gemm_k_iterations)     ///< [in|out] number of threadblock mainloop iterations remaining
-  {
-    // The last kblock is loaded in the prolog
-
-    // Load A fragment from global A
-    FragmentA tb_frag_A;
-    tb_frag_A.clear();
-    iterator_A.load(tb_frag_A);
-    ++iterator_A;
-
-    // Load B fragment from global B
-    FragmentB tb_frag_B;
-    tb_frag_B.clear();
-    iterator_B.load(tb_frag_B);
-    ++iterator_B;
-
-    // Store A and B fragments to shared
-    this->smem_iterator_A_.store(transform_A_(tb_frag_A));
-    this->smem_iterator_B_.store(transform_B_(tb_frag_B));
-
-    // Advance write stage
-    advance_smem_write_stage();
-  }
-
-  /// Wait until we have at least one completed global fetch stage
-  CUTLASS_DEVICE
-  void gmem_wait()
-  {
-    __syncthreads();
-  }
-
-
-  /// Perform the specified number of threadblock mainloop iterations of matrix
-  /// multiply-accumulate.  Assumes prologue has been initiated.
-  CUTLASS_DEVICE
-  void gemm_iters(
-    int gemm_k_iterations,        ///< number of threadblock mainloop iterations
-    FragmentC &accum,             ///< [in|out] accumulator tile
-    IteratorA &iterator_A,        ///< [in|out] iterator over A operand in global memory
-    IteratorB &iterator_B)        ///< [in|out] iterator over B operand in global memory
-  {
-    using WarpFragmentA = typename Operator::FragmentA;
-    using WarpFragmentB = typename Operator::FragmentB;
-
-    // Pair of fragments used to overlap shared memory loads and math instructions
-    WarpFragmentA warp_frag_A[2];
-    WarpFragmentB warp_frag_B[2];
-
-    // Load A fragment from shared A
-    this->warp_tile_iterator_A_.set_kgroup_index(0);
-    this->warp_tile_iterator_A_.load(warp_frag_A[0]);
-    ++this->warp_tile_iterator_A_;
-
-    // Load B fragment from shared B
-    this->warp_tile_iterator_B_.set_kgroup_index(0);
-    this->warp_tile_iterator_B_.load(warp_frag_B[0]);
-    ++this->warp_tile_iterator_B_;
-
-    // Pair of fragments used to overlap global memory loads and math instructions;
-    FragmentA tb_frag_A;
-    FragmentB tb_frag_B;
-
-    // Avoid reading out of bounds
-    iterator_A.clear_mask(gemm_k_iterations <= 1);
-    iterator_B.clear_mask(gemm_k_iterations <= 1);
-
-    //
-    // Mainloop
-    //
-
-    // Note: The main loop does not support Base::kWarpGemmIterations == 2.
-    CUTLASS_GEMM_LOOP
-    for (; gemm_k_iterations > 0; --gemm_k_iterations) {
-      //
-      // Loop over GEMM K dimension
-      //
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) {
-
-        // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group
-        // as the case may be.
-
-        if (warp_mma_k == Base::kWarpGemmIterations - 1) {
-
-          // Write fragments to shared memory
-          this->smem_iterator_A_.store(transform_A_(tb_frag_A));
-
-          this->smem_iterator_B_.store(transform_B_(tb_frag_B));
-
-          // Wait until we have at least one completed global fetch stage
-          gmem_wait();
-
-          // Advance smem read and write stages
-          advance_smem_stages();
-        }
-
-        this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
-        this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
-
-        this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k + 1) % 2]);
-        this->warp_tile_iterator_B_.load(warp_frag_B[(warp_mma_k + 1) % 2]);
-
-        ++this->warp_tile_iterator_A_;
-        ++this->warp_tile_iterator_B_;
-
-        if (warp_mma_k == 0) {
-
-          // Load fragment from global A
-          tb_frag_A.clear();
-          iterator_A.load(tb_frag_A);
-          ++iterator_A;
-
-          // Load fragment from global B
-          tb_frag_B.clear();
-          iterator_B.load(tb_frag_B);
-          ++iterator_B;
-
-          // Avoid reading out of bounds if this was the last loop iteration
-          iterator_A.clear_mask(gemm_k_iterations <= 2);
-          iterator_B.clear_mask(gemm_k_iterations <= 2);
-        }
-
-        warp_mma(
-          accum,
-          warp_frag_A[warp_mma_k % 2],
-          warp_frag_B[warp_mma_k % 2],
-          accum);
-      }
-    }
-
-  }
-
-
-  /// Prepares the class for another prologue.
-  CUTLASS_DEVICE
-  void wind_down()
-  {
-    // First, increment remaining warp tiles to catch it up with the write stage.
-    #pragma unroll
-    for (int warp_mma_k = 1; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k)
-    {
-      this->warp_tile_iterator_A_.set_kgroup_index(warp_mma_k);
-      this->warp_tile_iterator_B_.set_kgroup_index(warp_mma_k);
-
-      ++this->warp_tile_iterator_A_;
-      ++this->warp_tile_iterator_B_;
-    }
-
-    // If we bumped the read iterators to the end of the circular buffer, wrap them around to
-    // align them with the write iterators
-    if (smem_write_stage_idx == 0)
-    {
-      this->warp_tile_iterator_A_.add_tile_offset(
-        {0, -Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations});
-      this->warp_tile_iterator_B_.add_tile_offset(
-        {-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations, 0});
-    }
-  }
-
-  /// Perform a threadblock-scoped matrix multiply-accumulate
-  CUTLASS_DEVICE
-  void operator()(
-    int gemm_k_iterations,                            ///< number of iterations of the mainloop
-    FragmentC &accum,                                 ///< destination accumulator tile
-    IteratorA iterator_A,                             ///< iterator over A operand in global memory
-    IteratorB iterator_B,                             ///< iterator over B operand in global memory
-    FragmentC const &src_accum)                       ///< source accumulator tile
-  {
-    // Prologue
-    prologue(iterator_A, iterator_B, gemm_k_iterations);
-
-    // Wait until we have at least one completed global fetch stage
-    gmem_wait();
-
-    // Perform accumulation in the 'd' output operand
-    accum = src_accum;
-
-    // Perform the MAC-iterations
-    gemm_iters(gemm_k_iterations, accum, iterator_A, iterator_B);
-  }
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_planar_complex_base.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_planar_complex_base.h
deleted file mode 100755
index e8616cc90..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_planar_complex_base.h
+++ /dev/null
@@ -1,208 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
-*/
-
-#pragma once
-
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/arch/memory.h"
-#include "cutlass/array.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/numeric_types.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Structure to compute the matrix product targeting CUDA cores and SIMT math
-/// instructions.
-template <
-    /// Size of the Gemm problem - concept: gemm::GemmShape<>
-    typename Shape_,
-    /// Policy describing tuning details (concept: MmaPolicy)
-    typename Policy_,
-    /// Number of stages,
-    int Stages,
-    /// Used for partial specialization
-    typename Enable = bool>
-class MmaPlanarComplexBase {
- public:
-  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-
-  ///< Policy describing tuning details
-  using Policy = Policy_;
-
-  //
-  // Dependent types
-  //
-
-  /// Warp-level Mma
-  using Operator = typename Policy::Operator;
-
-  /// Shape describing the overall GEMM computed from shared memory
-  /// by each warp.
-  using WarpGemm = typename Policy::Operator::Shape;
-
-  /// Shape describing the number of warps filling the CTA
-  using WarpCount = GemmShape<Shape::kM / WarpGemm::kM,
-                              Shape::kN / WarpGemm::kN,
-                              Shape::kK / WarpGemm::kK>;
-
-  /// Number of warp-level GEMM oeprations
-  static int const kWarpGemmIterations =
-      (WarpGemm::kK / Operator::Policy::MmaShape::kK);
-
-  /// Number of stages
-  static int const kStages = Stages;
-
-  /// Tensor reference to the A operand
-  using TensorRefA = TensorRef<typename Operator::ElementA, typename Operator::LayoutA>;
-
-  /// Tensor reference to the B operand
-  using TensorRefB = TensorRef<typename Operator::ElementB, typename Operator::LayoutB>;
-
-  //
-  // Nested structs
-  //
-
-  /// Shared storage object needed by threadblock-scoped GEMM
-  class SharedStorage {
-   public:
-    //
-    // Type definitions
-    //
-
-    /// Shape of the A matrix operand in shared memory
-    using ShapeA = MatrixShape<Shape::kM + Policy::SmemPaddingA::kRow,
-                               Shape::kK * kStages +
-                                   Policy::SmemPaddingA::kColumn>;
-
-    /// Stride to the imaginary part of the A operand
-    static int const kImaginaryStrideA = ShapeA::kCount;
-
-    /// Shape of the B matrix operand in shared memory
-    using ShapeB =
-        MatrixShape<Shape::kK * kStages + Policy::SmemPaddingB::kRow,
-                    Shape::kN + Policy::SmemPaddingB::kColumn>;
-
-    /// Stride to the imaginary part of the A operand
-    static int const kImaginaryStrideB = ShapeB::kCount;
-
-   public:
-    //
-    // Data members
-    //
-
-    /// Buffer for A operand
-    AlignedBuffer<typename Operator::ElementA, ShapeA::kCount + kImaginaryStrideA> operand_A;
-
-    /// Buffer for B operand
-    AlignedBuffer<typename Operator::ElementB, ShapeB::kCount + kImaginaryStrideB> operand_B;
-
-   public:
-
-    //
-    // Methods
-    //
-
-    /// Returns a layout object for the A matrix
-    CUTLASS_DEVICE
-    static typename Operator::LayoutA LayoutA() {
-      return Operator::LayoutA::packed({ShapeA::kRow, ShapeA::kColumn});
-    }
-
-    /// Returns a layout object for the B matrix
-    CUTLASS_HOST_DEVICE
-    static typename Operator::LayoutB LayoutB() {
-      return Operator::LayoutB::packed({ShapeB::kRow, ShapeB::kColumn});
-    }
-
-    /// Returns a TensorRef to the A operand
-    CUTLASS_HOST_DEVICE
-    TensorRefA operand_A_ref() {
-      return TensorRefA{operand_A.data(), LayoutA()};
-    }
-
-    /// Returns a TensorRef to the B operand
-    CUTLASS_HOST_DEVICE
-    TensorRefB operand_B_ref() {
-      return TensorRefB{operand_B.data(), LayoutB()};
-    }
-  };
-
- protected:
-
-  //
-  // Data members
-  //
-
-  /// Iterator to load a warp-scoped tile of A operand from shared memory
-  typename Operator::IteratorA warp_tile_iterator_A_;
-
-  /// Iterator to load a warp-scoped tile of B operand from shared memory
-  typename Operator::IteratorB warp_tile_iterator_B_;
-
-public:
-
-  /// Construct from tensor references
-  CUTLASS_DEVICE
-  MmaPlanarComplexBase(
-      ///< Shared storage needed for internal use by threadblock-scoped GEMM
-      SharedStorage &shared_storage,
-      ///< ID within the threadblock
-      int thread_idx,
-      ///< ID of warp
-      int warp_idx,
-      ///< ID of each thread within a warp
-      int lane_idx
-    ):
-      warp_tile_iterator_A_(shared_storage.operand_A_ref(), lane_idx),
-      warp_tile_iterator_B_(shared_storage.operand_B_ref(), lane_idx) {
-
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace gemm
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_planar_complex_multistage.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_planar_complex_multistage.h
deleted file mode 100755
index b9deb6320..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_planar_complex_multistage.h
+++ /dev/null
@@ -1,646 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/arch/memory.h"
-#include "cutlass/array.h"
-#include "cutlass/array_planar_complex.h"
-#include "cutlass/functional.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/numeric_types.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/threadblock/mma_planar_complex_base.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Structure to compute the matrix product targeting CUDA cores and SIMT math
-/// instructions.
-template <
-    /// Size of the Gemm problem - concept: gemm::GemmShape<>
-    typename Shape_,
-    /// Iterates over tiles of A operand in global memory
-    //  (concept: ReadableTileIterator | ForwardTileIterator |
-    //  MaskedTileIterator)
-    typename IteratorA_,
-    /// Iterates over tiles of A operand in shared memory
-    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-    typename SmemIteratorA_,
-    /// Cache operation for operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Iterates over tiles of B operand in global memory
-    //  (concept: ReadableTileIterator | ForwardTileIterator |
-    //  MaskedTileIterator)
-    typename IteratorB_,
-    /// Iterates over tiles of B operand in shared memory
-    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-    typename SmemIteratorB_,
-    /// Cache operation for operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB,
-    /// Data type of accumulator matrix
-    typename ElementC_,
-    /// Data type of accumulator matrix
-    typename LayoutC_,
-    /// Policy describing tuning details (concept: MmaPolicy)
-    typename Policy_,
-    /// Number of stages,
-    int Stages,
-    /// Transformation applied to A
-    ComplexTransform TransformA = ComplexTransform::kNone,
-    /// Transformation applied to B
-    ComplexTransform TransformB = ComplexTransform::kNone
->
-class MmaPlanarComplexMultistage : 
-  public MmaPlanarComplexBase<Shape_, Policy_, Stages> {
-public:
-  ///< Base class
-  using Base = MmaPlanarComplexBase<Shape_, Policy_, Stages>;
-
-  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-
-  ///< Iterates over tiles of A operand in global memory
-  using IteratorA = IteratorA_;
-
-  ///< Iterates over tiles of B operand in global memory
-  using IteratorB = IteratorB_;
-
-  ///< Data type of accumulator matrix
-  using ElementC = ElementC_;
-
-  ///< Layout of accumulator matrix
-  using LayoutC = LayoutC_;
-
-  ///< Policy describing tuning details
-  using Policy = Policy_;
-
-  ///< Archtecture tag
-  using ArchTag = arch::Sm80;
-
-  using SmemIteratorA = SmemIteratorA_;
-  using SmemIteratorB = SmemIteratorB_;
-
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
-
-  /// Transformation applied to A
-  static ComplexTransform const kTransformA = TransformA;
-
-  /// Transformation applied to B
-  static ComplexTransform const kTransformB = TransformB;
-
-  //
-  // Dependent types
-  //
-
-  /// Fragment of accumulator tile
-  using FragmentC = ArrayPlanarComplex<
-    typename Policy::Operator::FragmentC::Element,
-    Policy::Operator::FragmentC::kElements
-  >;
-
-  /// Warp-level Mma
-  using Operator = typename Policy::Operator;
-
-  /// Internal structure exposed for introspection.
-  struct Detail {
-
-    static_assert(Base::kWarpGemmIterations > 1,
-                  "The pipelined structure requires at least two warp-level "
-                  "GEMM operations.");
-
-    /// Number of cp.async instructions to load one stage of operand A
-    static int const TBLoadIterationsA =
-        IteratorA::ThreadMap::Iterations::kCount;
-
-    /// Number of cp.async instructions to load one stage of operand B
-    static int const TBLoadIterationsB =
-        IteratorB::ThreadMap::Iterations::kCount;
-
-    /// Number of stages
-    static int const kStages = Stages;
-
-    static int const kAccessesPerGroupA =
-        (TBLoadIterationsA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
-
-    static int const kAccessesPerGroupB =
-        (TBLoadIterationsB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
-  };
-
- private:
-
-  using WarpFragmentA = typename Operator::FragmentA;
-  using WarpFragmentB = typename Operator::FragmentB;
-
- private:
-
-  //
-  // Data members
-  //
-
-  /// Iterator to write threadblock-scoped tile of A operand to shared memory
-  SmemIteratorA smem_iterator_A_;
-
-  /// Iterator to write threadblock-scoped tile of B operand to shared memory
-  SmemIteratorB smem_iterator_B_;
-
-public:
-
-  /// Construct from tensor references
-  CUTLASS_DEVICE
-  MmaPlanarComplexMultistage(
-      ///< Shared storage needed for internal use by threadblock-scoped GEMM
-      typename Base::SharedStorage &shared_storage,
-      ///< ID within the threadblock
-      int thread_idx,
-      ///< ID of warp
-      int warp_idx,
-      ///< ID of each thread within a warp
-      int lane_idx
-    ):
-      Base(shared_storage, thread_idx, warp_idx, lane_idx),
-      smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
-      smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx)
-  {
-    // Compute warp location within threadblock tile by mapping the warp_id to
-    // three coordinates:
-    //   _m: the warp's position within the threadblock along the M dimension
-    //   _n: the warp's position within the threadblock along the N dimension
-    //   _k: the warp's position within the threadblock along the K dimension
-
-    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
-    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
-
-    int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
-    int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
-
-    // Add per-warp offsets in units of warp-level tiles
-    this->warp_tile_iterator_A_.add_tile_offset({warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
-    this->warp_tile_iterator_B_.add_tile_offset({Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
-  }
-
-private:
-
-  CUTLASS_DEVICE
-  void copy_tiles_and_advance(
-    IteratorA &iterator_A_real,
-    IteratorA &iterator_A_imag,
-    
-    IteratorB &iterator_B_real, 
-    IteratorB &iterator_B_imag, 
-    
-    int group_start_A = 0, 
-    int group_start_B = 0) {
-
-    iterator_A_real.set_iteration_index(group_start_A * IteratorA::kAccessesPerVector);
-    iterator_A_imag.set_iteration_index(group_start_A * IteratorA::kAccessesPerVector);
-    this->smem_iterator_A_.set_iteration_index(group_start_A);
-
-    // Load for operand A
-    CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) {
-        
-      typename IteratorA::AccessType *dst_ptr = 
-        reinterpret_cast<typename IteratorA::AccessType *>(this->smem_iterator_A_.get());
-          
-      int const kSrcBytes = 
-        sizeof_bits<typename IteratorA::Element>::value * 
-        IteratorA::ThreadMap::kElementsPerAccess / IteratorA::kAccessesPerVector / 8;
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
-
-        auto gmem_ptr_real = iterator_A_real.get();
-        auto gmem_ptr_imag = iterator_A_imag.get();
-
-        bool pred_guard = iterator_A_real.valid();
-        cutlass::arch::cp_async<kSrcBytes, kCacheOpA>(
-            dst_ptr + v,
-            gmem_ptr_real,
-            pred_guard);
-        cutlass::arch::cp_async<kSrcBytes, kCacheOpA>(
-            dst_ptr + v + (Base::SharedStorage::kImaginaryStrideA / IteratorA::ThreadMap::kElementsPerAccess),
-            reinterpret_cast<char const *>(gmem_ptr_imag),
-            pred_guard);
-
-        ++iterator_A_real;
-        ++iterator_A_imag;
-      }
-
-      ++this->smem_iterator_A_;
-    }
-
-    iterator_B_real.set_iteration_index(group_start_B * IteratorB::kAccessesPerVector);
-    iterator_B_imag.set_iteration_index(group_start_B * IteratorB::kAccessesPerVector);
-    this->smem_iterator_B_.set_iteration_index(group_start_B);
-
-    // Load for operand B
-    CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) {
-      typename IteratorB::AccessType *dst_ptr = 
-        reinterpret_cast<typename IteratorB::AccessType *>(this->smem_iterator_B_.get());
-      
-      int const kSrcBytes = 
-        sizeof_bits<typename IteratorB::Element>::value * 
-        IteratorB::ThreadMap::kElementsPerAccess / IteratorB::kAccessesPerVector / 8;
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
-        auto gmem_ptr_real = iterator_B_real.get();
-        auto gmem_ptr_imag = iterator_B_imag.get();
-
-        bool pred_guard = iterator_B_real.valid();
-        cutlass::arch::cp_async<kSrcBytes, kCacheOpB>(
-            dst_ptr + v,
-            gmem_ptr_real,
-            pred_guard);
-        cutlass::arch::cp_async<kSrcBytes, kCacheOpB>(
-            dst_ptr + v + (Base::SharedStorage::kImaginaryStrideB / IteratorB::ThreadMap::kElementsPerAccess),
-            reinterpret_cast<char const *>(gmem_ptr_imag),
-            pred_guard);
-
-        ++iterator_B_real;
-        ++iterator_B_imag;
-      }
-      ++this->smem_iterator_B_;
-    }
-  }
-
-  CUTLASS_DEVICE
-  void warp_mma_planar_complex(
-    Operator & warp_mma, 
-    FragmentC &accum,
-    WarpFragmentA const & real_A, 
-    WarpFragmentA const & imag_A, 
-    WarpFragmentB const & real_B, 
-    WarpFragmentB const & imag_B) {
-
-    cutlass::negate<Array<typename WarpFragmentB::Element, WarpFragmentB::kElements>> neg_op_B;
-
-    WarpFragmentB neg_real_B = neg_op_B(real_B);
-    WarpFragmentB neg_imag_B = neg_op_B(imag_B);
-
-    warp_mma(accum.real, real_A, real_B, accum.real);  
-
-    if (kTransformB == ComplexTransform::kNone) {
-      warp_mma(accum.imag, real_A, imag_B, accum.imag);
-    }
-    else {
-      warp_mma(accum.imag, real_A, neg_imag_B, accum.imag);
-    }
-
-    if (kTransformA == ComplexTransform::kNone) {
-      warp_mma(accum.imag, imag_A, real_B, accum.imag);
-    }
-    else {
-      warp_mma(accum.imag, imag_A, neg_real_B, accum.imag);
-    }
-
-    if (kTransformA == ComplexTransform::kNone ^ kTransformB == ComplexTransform::kNone) {
-      warp_mma(accum.real, imag_A, imag_B, accum.real);
-    }
-    else {
-      warp_mma(accum.real, imag_A, neg_imag_B, accum.real);
-    }
-  }
-
-public:
-  
-  /// Perform a threadblock-scoped matrix multiply-accumulate
-  CUTLASS_DEVICE
-  void operator()(
-      ///< problem size of GEMM
-      int gemm_k_iterations,
-      ///< destination accumulator tile
-      FragmentC &accum,
-      ///< iterator over A operand in global memory
-      IteratorA iterator_A_real,
-      ///< iterator over A operand in global memory
-      IteratorA iterator_A_imag,
-      ///< iterator over B operand in global memory
-      IteratorB iterator_B_real,
-      ///< iterator over B operand in global memory
-      IteratorB iterator_B_imag,
-      ///< initial value of accumulator
-      FragmentC const &src_accum) {
-
-    //
-    // Prologue
-    //
-
-    // Issue several complete stages
-    CUTLASS_PRAGMA_UNROLL
-    for (int stage = 0; stage < Base::kStages - 1;
-         ++stage, --gemm_k_iterations) {
-
-      iterator_A_real.clear_mask(gemm_k_iterations == 0);
-      iterator_A_imag.clear_mask(gemm_k_iterations == 0);
-      iterator_B_real.clear_mask(gemm_k_iterations == 0);
-      iterator_B_imag.clear_mask(gemm_k_iterations == 0);
-
-      iterator_A_real.set_iteration_index(0);
-      iterator_A_imag.set_iteration_index(0);
-
-      this->smem_iterator_A_.set_iteration_index(0);
-
-      // Load for operand A
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < Detail::TBLoadIterationsA; ++j) {
-
-        typename IteratorA::AccessType *dst_ptr = 
-          reinterpret_cast<typename IteratorA::AccessType *>(this->smem_iterator_A_.get());
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
-
-          int const kSrcBytes = 
-            sizeof_bits<typename IteratorA::Element>::value * 
-            IteratorA::ThreadMap::kElementsPerAccess / IteratorA::kAccessesPerVector / 8;
-
-          bool pred_guard = iterator_A_real.valid();
-
-          auto src_ptr_real = iterator_A_real.get();
-          auto src_ptr_imag = iterator_A_imag.get();
-
-          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
-              dst_ptr + v, src_ptr_real, pred_guard);
-
-          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
-              dst_ptr + v +
-                  Base::SharedStorage::kImaginaryStrideA /
-                      IteratorA::ThreadMap::kElementsPerAccess,
-              reinterpret_cast<char const *>(src_ptr_imag),
-              pred_guard);
-
-          ++iterator_A_real;
-          ++iterator_A_imag;
-        }
-
-        ++this->smem_iterator_A_;
-      }
-
-      iterator_B_real.set_iteration_index(0);
-      iterator_B_imag.set_iteration_index(0);
-
-      this->smem_iterator_B_.set_iteration_index(0);
-
-      // Load for operand B
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < Detail::TBLoadIterationsB; ++j) {
-
-        typename IteratorB::AccessType *dst_ptr = 
-          reinterpret_cast<typename IteratorB::AccessType *>(this->smem_iterator_B_.get());
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
-
-          int const kSrcBytes = 
-            sizeof_bits<typename IteratorB::Element>::value * 
-            IteratorB::ThreadMap::kElementsPerAccess / IteratorB::kAccessesPerVector / 8;
-
-          bool pred_guard = iterator_B_real.valid();
-
-          auto src_ptr_real = iterator_B_real.get();
-          auto src_ptr_imag = iterator_B_imag.get();
-
-          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
-            dst_ptr + v, src_ptr_real, pred_guard);
-
-          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
-              dst_ptr + v +
-                  Base::SharedStorage::kImaginaryStrideB /
-                      IteratorB::ThreadMap::kElementsPerAccess,
-              reinterpret_cast<char const *>(src_ptr_imag),
-              pred_guard);
-
-          ++iterator_B_real;
-          ++iterator_B_imag;
-        }
-
-        ++this->smem_iterator_B_;
-      }
-
-      // Move to the next stage
-      iterator_A_real.add_tile_offset({0, 1});
-      iterator_A_imag.add_tile_offset({0, 1});
-
-      iterator_B_real.add_tile_offset({1, 0});
-      iterator_B_imag.add_tile_offset({1, 0});
-
-      this->smem_iterator_A_.add_tile_offset({0, 1});
-      this->smem_iterator_B_.add_tile_offset({1, 0});
-
-      // Inserts a memory fence between stages of cp.async instructions
-      cutlass::arch::cp_async_fence();
-    }
-
-    // Perform accumulation in the 'd' output operand
-    accum = src_accum;
-
-    // Blocks until all but kStages-2 cp.async stages have committed.
-    cutlass::arch::cp_async_wait<Base::kStages - 2>();
-    __syncthreads();
-
-    // Pair of fragments used to overlap shared memory loads and math
-    // instructions
-
-    WarpFragmentA warp_frag_real_A[2];
-    WarpFragmentA warp_frag_imag_A[2];
-
-    WarpFragmentB warp_frag_real_B[2];
-    WarpFragmentB warp_frag_imag_B[2];
-
-    this->warp_tile_iterator_A_.set_kgroup_index(0);
-    this->warp_tile_iterator_B_.set_kgroup_index(0);
-
-    this->warp_tile_iterator_A_.load(warp_frag_real_A[0]);
-    this->warp_tile_iterator_A_.load_with_pointer_offset(warp_frag_imag_A[0], Base::SharedStorage::kImaginaryStrideA);
-
-    this->warp_tile_iterator_B_.load(warp_frag_real_B[0]);
-    this->warp_tile_iterator_B_.load_with_pointer_offset(warp_frag_imag_B[0], Base::SharedStorage::kImaginaryStrideB);
-
-    ++this->warp_tile_iterator_A_;
-    ++this->warp_tile_iterator_B_;
-
-    iterator_A_real.clear_mask(gemm_k_iterations == 0);
-    iterator_A_imag.clear_mask(gemm_k_iterations == 0);
-    iterator_B_real.clear_mask(gemm_k_iterations == 0);
-    iterator_B_imag.clear_mask(gemm_k_iterations == 0);
-
-    // Start issuing the first group of the next stage outside of the mainloop
-    copy_tiles_and_advance(iterator_A_real, iterator_A_imag, iterator_B_real, iterator_B_imag);
-
-    Operator warp_mma;
-
-    int smem_write_stage_idx = Base::kStages - 1;
-    int smem_read_stage_idx = 0;
-
-    //
-    // Mainloop
-    //
-
-    CUTLASS_GEMM_LOOP
-    for (; gemm_k_iterations > (-Base::kStages + 1);) {
-      //
-      // Loop over GEMM K dimension
-      //
-
-      // Computes a warp-level GEMM on data held in shared memory
-      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
-      CUTLASS_PRAGMA_UNROLL
-      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations;
-           ++warp_mma_k) {
-
-        // Load warp-level tiles from shared memory, wrapping to k offset if
-        // this is the last group as the case may be.
-
-        this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
-        this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
-        
-        this->warp_tile_iterator_A_.load(warp_frag_real_A[(warp_mma_k + 1) % 2]);
-        this->warp_tile_iterator_A_.load_with_pointer_offset(warp_frag_imag_A[(warp_mma_k + 1) % 2], Base::SharedStorage::kImaginaryStrideA);
-        
-        this->warp_tile_iterator_B_.load(warp_frag_real_B[(warp_mma_k + 1) % 2]);
-        this->warp_tile_iterator_B_.load_with_pointer_offset(warp_frag_imag_B[(warp_mma_k + 1) % 2], Base::SharedStorage::kImaginaryStrideB);
-
-        ++this->warp_tile_iterator_A_;
-        ++this->warp_tile_iterator_B_;
-
-        // Issue global->shared copies for the next stage
-        int group_start_iteration_A, group_start_iteration_B;
-
-        if (warp_mma_k + 1 == Base::kWarpGemmIterations) {
-          group_start_iteration_A = 0;
-          group_start_iteration_B = 0;
-        }
-        else {
-          group_start_iteration_A = (warp_mma_k + 1) * Detail::kAccessesPerGroupA;
-          group_start_iteration_B = (warp_mma_k + 1) * Detail::kAccessesPerGroupB;
-        }
-    
-        copy_tiles_and_advance(
-          iterator_A_real, 
-          iterator_A_imag,
-          iterator_B_real, 
-          iterator_B_imag,
-          group_start_iteration_A, 
-          group_start_iteration_B);
-
-        if (warp_mma_k + 2 == Base::kWarpGemmIterations) {
-          // Inserts a memory fence between stages of cp.async instructions
-          cutlass::arch::cp_async_fence();
-
-          // Blocks until all but kStages-2 cp.async stages have committed.
-          arch::cp_async_wait<Base::kStages - 2>();
-          __syncthreads();
-
-          // Move to the next stage
-          iterator_A_real.add_tile_offset({0, 1});
-          iterator_A_imag.add_tile_offset({0, 1});
-          
-          iterator_B_real.add_tile_offset({1, 0});
-          iterator_B_imag.add_tile_offset({1, 0});
-
-          this->smem_iterator_A_.add_tile_offset({0, 1});
-          this->smem_iterator_B_.add_tile_offset({1, 0});
-
-          // Add negative offsets to return iterators to the 'start' of the
-          // circular buffer in shared memory
-          if (smem_write_stage_idx == (Base::kStages - 1)) {
-            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
-            this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
-            smem_write_stage_idx = 0;
-          } else {
-            ++smem_write_stage_idx;
-          }
-
-          if (smem_read_stage_idx == (Base::kStages - 1)) {
-
-            this->warp_tile_iterator_A_.add_tile_offset(
-                {0, -Base::kStages * Policy::kPartitionsK *
-                        Base::kWarpGemmIterations});
-
-            this->warp_tile_iterator_B_.add_tile_offset(
-                {-Base::kStages * Policy::kPartitionsK *
-                     Base::kWarpGemmIterations,
-                 0});
-            smem_read_stage_idx = 0;
-          } else {
-            ++smem_read_stage_idx;
-          }
-
-          --gemm_k_iterations;
-          iterator_A_real.clear_mask(gemm_k_iterations == 0);
-          iterator_A_imag.clear_mask(gemm_k_iterations == 0);
-          iterator_B_real.clear_mask(gemm_k_iterations == 0);
-          iterator_B_imag.clear_mask(gemm_k_iterations == 0);
-        }
-
-        warp_mma_planar_complex(
-          warp_mma, 
-          accum, 
-          warp_frag_real_A[warp_mma_k % 2], 
-          warp_frag_imag_A[warp_mma_k % 2],
-          warp_frag_real_B[warp_mma_k % 2], 
-          warp_frag_imag_B[warp_mma_k % 2]);
-      }
-
-    }
-
-
-    // Commit and drain all pending and predicated cp.async pnz from the GEMM mainloop
-    cutlass::arch::cp_async_fence();
-    cutlass::arch::cp_async_wait<0>();
-    __syncthreads();
-
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace gemm
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_planar_complex_pipelined.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_planar_complex_pipelined.h
deleted file mode 100755
index 0e36a6dc6..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_planar_complex_pipelined.h
+++ /dev/null
@@ -1,424 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/aligned_buffer.h"
-
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/threadblock/mma_planar_complex_base.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Structure to compute the matrix product targeting CUDA cores and SIMT math
-/// instructions.
-template <
-    /// Size of the Gemm problem - concept: gemm::GemmShape<>
-    typename Shape_,
-    /// Iterates over tiles of A operand in global memory
-    //  (concept: ReadableTileIterator | ForwardTileIterator |
-    //  MaskedTileIterator)
-    typename IteratorA_,
-    /// Iterates over tiles of A operand in shared memory
-    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-    typename SmemIteratorA_,
-    /// Iterates over tiles of B operand in global memory
-    //  (concept: ReadableTileIterator | ForwardTileIterator |
-    //  MaskedTileIterator)
-    typename IteratorB_,
-    /// Iterates over tiles of B operand in shared memory
-    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-    typename SmemIteratorB_,
-    /// Data type of accumulator matrix
-    typename ElementC_,
-    /// Data type of accumulator matrix
-    typename LayoutC_,
-    /// Policy describing tuning details (concept: MmaPolicy)
-    typename Policy_,
-    /// Number of stages,
-    int Stages,
-    /// Transformation applied to A
-    ComplexTransform TransformA = ComplexTransform::kNone,
-    /// Transformation applied to B
-    ComplexTransform TransformB = ComplexTransform::kNone
->
-class MmaPlanarComplexPipelined : 
-  public MmaPlanarComplexBase<Shape_, Policy_, Stages> {
-public:
-  ///< Base class
-  using Base = MmaPlanarComplexBase<Shape_, Policy_, Stages>;
-
-  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-
-  ///< Iterates over tiles of A operand in global memory
-  using IteratorA = IteratorA_;
-
-  ///< Iterates over tiles of B operand in global memory
-  using IteratorB = IteratorB_;
-
-  ///< Data type of accumulator matrix
-  using ElementC = ElementC_;
-
-  ///< Layout of accumulator matrix
-  using LayoutC = LayoutC_;
-
-  ///< Policy describing tuning details
-  using Policy = Policy_;
-
-  using ArchTag = typename Policy::Operator::ArchTag;
-
-  using SmemIteratorA = SmemIteratorA_;
-  using SmemIteratorB = SmemIteratorB_;
-
-  /// Transformation applied to A
-  static ComplexTransform const kTransformA = TransformA;
-
-  /// Transformation applied to B
-  static ComplexTransform const kTransformB = TransformB;
-
-  //
-  // Dependent types
-  //
-
-  /// Fragment of accumulator tile
-  using FragmentC = ArrayPlanarComplex<
-    typename Policy::Operator::FragmentC::Element,
-    Policy::Operator::FragmentC::kElements
-  >;
-
-  /// Warp-level Mma
-  using Operator = typename Policy::Operator;
-
- private:
-
-  using FragmentA = typename IteratorA::Fragment;
-  using FragmentB = typename IteratorB::Fragment;
-  using WarpFragmentA = typename Operator::FragmentA;
-  using WarpFragmentB = typename Operator::FragmentB;
-
- private:
-
-  //
-  // Data members
-  //
-
-  /// Iterator to write threadblock-scoped tile of A operand to shared memory
-  SmemIteratorA smem_iterator_A_;
-
-  /// Iterator to write threadblock-scoped tile of B operand to shared memory
-  SmemIteratorB smem_iterator_B_;
-
-public:
-
-  /// Construct from tensor references
-  CUTLASS_DEVICE
-  MmaPlanarComplexPipelined(
-      ///< Shared storage needed for internal use by threadblock-scoped GEMM
-      typename Base::SharedStorage &shared_storage,
-      ///< ID within the threadblock
-      int thread_idx,
-      ///< ID of warp
-      int warp_idx,
-      ///< ID of each thread within a warp
-      int lane_idx
-    ):
-      Base(shared_storage, thread_idx, warp_idx, lane_idx),
-      smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
-      smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx)
-  {
-    // Compute warp location within threadblock tile by mapping the warp_id to
-    // three coordinates:
-    //   _m: the warp's position within the threadblock along the M dimension
-    //   _n: the warp's position within the threadblock along the N dimension
-    //   _k: the warp's position within the threadblock along the K dimension
-
-    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
-    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
-
-    int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
-    int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
-
-    // Add per-warp offsets in units of warp-level tiles
-    this->warp_tile_iterator_A_.add_tile_offset({warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
-    this->warp_tile_iterator_B_.add_tile_offset({Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
-  }
-
-private:
-
-  CUTLASS_DEVICE
-  void warp_mma_planar_complex(
-    Operator & warp_mma, 
-    FragmentC &accum,
-    WarpFragmentA const & real_A, 
-    WarpFragmentA const & imag_A, 
-    WarpFragmentB const & real_B, 
-    WarpFragmentB const & imag_B) {
-
-    cutlass::negate<Array<typename WarpFragmentB::Element, WarpFragmentB::kElements>> neg_op_B;
-
-    WarpFragmentB neg_real_B = neg_op_B(real_B);
-    WarpFragmentB neg_imag_B = neg_op_B(imag_B);
-
-    warp_mma(accum.real, real_A, real_B, accum.real);  
-
-    if (kTransformB == ComplexTransform::kNone) {
-      warp_mma(accum.imag, real_A, imag_B, accum.imag);
-    }
-    else {
-      warp_mma(accum.imag, real_A, neg_imag_B, accum.imag);
-    }
-
-    if (kTransformA == ComplexTransform::kNone) {
-      warp_mma(accum.imag, imag_A, real_B, accum.imag);
-    }
-    else {
-      warp_mma(accum.imag, imag_A, neg_real_B, accum.imag);
-    }
-
-    if (kTransformA == ComplexTransform::kNone ^ kTransformB == ComplexTransform::kNone) {
-      warp_mma(accum.real, imag_A, imag_B, accum.real);
-    }
-    else {
-      warp_mma(accum.real, imag_A, neg_imag_B, accum.real);
-    }
-  }
-
-public:
-  
-  /// Perform a threadblock-scoped matrix multiply-accumulate
-  CUTLASS_DEVICE
-  void operator()(
-      ///< problem size of GEMM
-      int gemm_k_iterations,
-      ///< destination accumulator tile
-      FragmentC &accum,
-      ///< iterator over A operand in global memory
-      IteratorA iterator_A_real,
-      ///< iterator over A operand in global memory
-      IteratorA iterator_A_imag,
-      ///< iterator over B operand in global memory
-      IteratorB iterator_B_real,
-      ///< iterator over B operand in global memory
-      IteratorB iterator_B_imag,
-      ///< initial value of accumulator
-      FragmentC const &src_accum) {
-
-    //
-    // Prologue
-    //
-
-    // Perform accumulation in the 'd' output operand
-    accum = src_accum;
-
-    FragmentA tb_frag_A_real;
-    FragmentA tb_frag_A_imag;
-
-    FragmentB tb_frag_B_real;
-    FragmentB tb_frag_B_imag;
-
-    tb_frag_A_real.clear();
-    tb_frag_A_imag.clear();
-
-    tb_frag_B_real.clear();
-    tb_frag_B_imag.clear();
-
-    // The last kblock is loaded in the prolog
-    iterator_A_real.load(tb_frag_A_real);
-    iterator_A_imag.load(tb_frag_A_imag);
-
-    iterator_B_real.load(tb_frag_B_real);
-    iterator_B_imag.load(tb_frag_B_imag);
-
-    ++iterator_A_real;
-    ++iterator_A_imag;
-
-    ++iterator_B_real;
-    ++iterator_B_imag;
-
-    this->smem_iterator_A_.store(tb_frag_A_real);
-    this->smem_iterator_A_.store_with_pointer_offset(tb_frag_A_imag, Base::SharedStorage::kImaginaryStrideA);
-
-    this->smem_iterator_B_.store(tb_frag_B_real);
-    this->smem_iterator_B_.store_with_pointer_offset(tb_frag_B_imag, Base::SharedStorage::kImaginaryStrideB);
-
-    ++this->smem_iterator_A_;
-    ++this->smem_iterator_B_;
-
-    __syncthreads();
-
-    // Pair of fragments used to overlap shared memory loads and math instructions
-    WarpFragmentA warp_frag_real_A[2];
-    WarpFragmentA warp_frag_imag_A[2];
-
-    WarpFragmentB warp_frag_real_B[2];
-    WarpFragmentB warp_frag_imag_B[2];
-
-    this->warp_tile_iterator_A_.set_kgroup_index(0);
-    this->warp_tile_iterator_B_.set_kgroup_index(0);
-
-    this->warp_tile_iterator_A_.load(warp_frag_real_A[0]);
-    this->warp_tile_iterator_A_.load_with_pointer_offset(warp_frag_imag_A[0], Base::SharedStorage::kImaginaryStrideA);
-
-    this->warp_tile_iterator_B_.load(warp_frag_real_B[0]);
-    this->warp_tile_iterator_B_.load_with_pointer_offset(warp_frag_imag_B[0], Base::SharedStorage::kImaginaryStrideB);
-
-
-    ++this->warp_tile_iterator_A_;
-    ++this->warp_tile_iterator_B_;
-
-    Operator warp_mma;
-
-    int smem_write_stage_idx = 1;
-
-    // Avoid reading out of bounds
-    iterator_A_real.clear_mask(gemm_k_iterations <= 1);
-    iterator_A_imag.clear_mask(gemm_k_iterations <= 1);
-    
-    iterator_B_real.clear_mask(gemm_k_iterations <= 1);
-    iterator_B_imag.clear_mask(gemm_k_iterations <= 1);
-
-    // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing 
-    // shared memory loads (which have the tightest latency requirement).
-
-    //
-    // Mainloop
-    //
-
-    // Note: The main loop does not support Base::kWarpGemmIterations == 2.
-    CUTLASS_GEMM_LOOP
-    for (; gemm_k_iterations > 0; --gemm_k_iterations) {
-      //
-      // Loop over GEMM K dimension
-      //
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) {
-
-        // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group
-        // as the case may be.
-
-        if (warp_mma_k == Base::kWarpGemmIterations - 1) {
-
-          // Write fragments to shared memory
-          this->smem_iterator_A_.store(tb_frag_A_real);
-          this->smem_iterator_A_.store_with_pointer_offset(tb_frag_A_imag, Base::SharedStorage::kImaginaryStrideA);
-
-          this->smem_iterator_B_.store(tb_frag_B_real);
-          this->smem_iterator_B_.store_with_pointer_offset(tb_frag_B_imag, Base::SharedStorage::kImaginaryStrideB);
-
-          __syncthreads();
-          
-          ++this->smem_iterator_B_;
-          ++this->smem_iterator_A_;
-
-          // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
-          if (smem_write_stage_idx == 1) {
-            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
-            this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
-          }
-          else {
-            this->warp_tile_iterator_A_.add_tile_offset(
-                {0, -Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations});
-            this->warp_tile_iterator_B_.add_tile_offset(
-                {-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations,
-                 0});
-          }
-
-          smem_write_stage_idx ^= 1;
-        }
-
-        this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
-        this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
-        
-        this->warp_tile_iterator_A_.load(warp_frag_real_A[(warp_mma_k + 1) % 2]);
-        this->warp_tile_iterator_A_.load_with_pointer_offset(warp_frag_imag_A[(warp_mma_k + 1) % 2], Base::SharedStorage::kImaginaryStrideA);
-        
-        this->warp_tile_iterator_B_.load(warp_frag_real_B[(warp_mma_k + 1) % 2]);
-        this->warp_tile_iterator_B_.load_with_pointer_offset(warp_frag_imag_B[(warp_mma_k + 1) % 2], Base::SharedStorage::kImaginaryStrideB);
-
-        ++this->warp_tile_iterator_A_;
-        ++this->warp_tile_iterator_B_;
-
-        if (warp_mma_k == 0) {
-
-          iterator_A_real.load(tb_frag_A_real);
-          iterator_A_imag.load(tb_frag_A_imag);
-
-          iterator_B_real.load(tb_frag_B_real);
-          iterator_B_imag.load(tb_frag_B_imag);
-
-          ++iterator_A_real;
-          ++iterator_A_imag;
-          ++iterator_B_real;
-          ++iterator_B_imag;
-
-          // Avoid reading out of bounds if this was the last loop iteration
-          iterator_A_real.clear_mask(gemm_k_iterations <= 2);
-          iterator_A_imag.clear_mask(gemm_k_iterations <= 2);
-          iterator_B_real.clear_mask(gemm_k_iterations <= 2);
-          iterator_B_imag.clear_mask(gemm_k_iterations <= 2);
-        }
-
-        warp_mma_planar_complex(
-          warp_mma, 
-          accum, 
-          warp_frag_real_A[warp_mma_k % 2], 
-          warp_frag_imag_A[warp_mma_k % 2],
-          warp_frag_real_B[warp_mma_k % 2], 
-          warp_frag_imag_B[warp_mma_k % 2]);
-      }
-    }
-
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_singlestage.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_singlestage.h
deleted file mode 100755
index 311562865..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_singlestage.h
+++ /dev/null
@@ -1,265 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/aligned_buffer.h"
-
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/threadblock/mma_base.h"
-
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions.
-template <
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  typename Shape_,
-  /// Iterates over tiles of A operand in global memory 
-  //  (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
-  typename IteratorA_,
-  /// Iterates over tiles of A operand in shared memory
-  /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-  typename SmemIteratorA_,
-  /// Iterates over tiles of B operand in global memory
-  //  (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
-  typename IteratorB_,
-  /// Iterates over tiles of B operand in shared memory
-  /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-  typename SmemIteratorB_,
-  /// Data type of accumulator matrix
-  typename ElementC_,
-  /// Data type of accumulator matrix
-  typename LayoutC_,
-  /// Policy describing tuning details (concept: MmaPolicy)
-  typename Policy_,
-  /// Used for partial specialization
-  typename Enable = bool
->
-class MmaSingleStage : public MmaBase<Shape_, Policy_, 1> {
-public:
-
-  ///< Base class
-  using Base = MmaBase<Shape_, Policy_, 1>;
-
-  using Shape = Shape_;             ///< Size of the Gemm problem - concept: gemm::GemmShape<>
-  using IteratorA = IteratorA_;     ///< Iterates over tiles of A operand in global memory
-  using IteratorB = IteratorB_;     ///< Iterates over tiles of B operand in global memory
-  using ElementC = ElementC_;       ///< Data type of accumulator matrix
-  using LayoutC = LayoutC_;         ///< Layout of accumulator matrix
-  using Policy = Policy_;           ///< Policy describing tuning details
-
-  using SmemIteratorA = SmemIteratorA_;
-  using SmemIteratorB = SmemIteratorB_;
-
-  //
-  // Dependent types
-  //
-
-  /// Fragment of operand A loaded from global memory
-  using FragmentA = typename IteratorA::Fragment;
-
-  /// Fragment of operand B loaded from global memory
-  using FragmentB = typename IteratorB::Fragment;
-
-  /// Fragment of accumulator tile
-  using FragmentC = typename Policy::Operator::FragmentC;
-
-  /// Warp-level Mma
-  using Operator = typename Policy::Operator;
-
-  using ArchTag = arch::Sm70;
-
-  /// Complex transform on A operand
-  static ComplexTransform const kTransformA = Operator::kTransformA;
-
-  /// Complex transform on B operand
-  static ComplexTransform const kTransformB = Operator::kTransformB;
-
-  // staticaly assert kStages for MmaSingleStage is 1 (single stage mma pipeline)
-  static_assert((Base::kStages==1), "MmaSingleStage requires kStages set to value 1");
-private:
-
-  using WarpFragmentA = typename Operator::FragmentA;
-  using WarpFragmentB = typename Operator::FragmentB;
-
-protected:
-
-  /// Iterator to write threadblock-scoped tile of A operand to shared memory
-  SmemIteratorA smem_iterator_A_;
-
-  /// Iterator to write threadblock-scoped tile of B operand to shared memory
-  SmemIteratorB smem_iterator_B_;
-
-public:
-
-  /// Construct from tensor references
-  CUTLASS_DEVICE
-  MmaSingleStage(
-    typename Base::SharedStorage &shared_storage,       ///< Shared storage needed for internal use by threadblock-scoped GEMM
-    int thread_idx,                                     ///< ID within the threadblock
-    int warp_idx,                                       ///< ID of warp
-    int lane_idx                                        ///< ID of each thread within a warp
-  ):
-    Base(shared_storage, thread_idx, warp_idx, lane_idx),
-    smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
-    smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) {
-
-    // Compute warp location within threadblock tile by mapping the warp_id to
-    // three coordinates:
-    //   _m: the warp's position within the threadblock along the M dimension
-    //   _n: the warp's position within the threadblock along the N dimension
-    //   _k: the warp's position within the threadblock along the K dimension
-
-    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
-    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
-
-    int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
-    int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
-
-    // Add per-warp offsets in units of warp-level tiles
-    this->warp_tile_iterator_A_.add_tile_offset({warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
-    this->warp_tile_iterator_B_.add_tile_offset({Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
-
-  }
-
-  /// Perform a threadblock-scoped matrix multiply-accumulate
-  CUTLASS_DEVICE
-  void operator()(
-    int gemm_k_iterations,            ///< number of iterations of the mainloop
-    FragmentC &accum,                 ///< destination accumulator tile
-    IteratorA iterator_A,             ///< iterator over A operand in global memory
-    IteratorB iterator_B,             ///< iterator over B operand in global memory
-    FragmentC const &src_accum) {     ///< source accumualtor tile
-
-    //
-    // Prologue
-    //
-
-    // Perform accumulation in the 'd' output operand
-    accum = src_accum;
-
-    FragmentA tb_frag_A;
-    FragmentB tb_frag_B;
-
-    tb_frag_A.clear();
-    tb_frag_B.clear();
-
-    // The last kblock is loaded in the prolog
-    iterator_A.load(tb_frag_A);
-    iterator_B.load(tb_frag_B);
-
-    ++iterator_A;
-    ++iterator_B;
-
-    // Pair of fragments used to overlap shared memory loads and math instructions
-    WarpFragmentA warp_frag_A;
-    WarpFragmentB warp_frag_B;
-
-    Operator warp_mma;
-
-    // Avoid reading out of bounds
-    iterator_A.clear_mask(gemm_k_iterations <= 1);
-    iterator_B.clear_mask(gemm_k_iterations <= 1);
-
-    //
-    // Mainloop
-    //
-
-    CUTLASS_GEMM_LOOP
-    for (; gemm_k_iterations > 0; --gemm_k_iterations) {
-      this->smem_iterator_A_.store(tb_frag_A);
-      this->smem_iterator_B_.store(tb_frag_B);
-
-      __syncthreads();
-
-      //
-      // Loop over GEMM K dimension
-      //
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) {
-
-        // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group
-        // as the case may be.
-        
-        this->warp_tile_iterator_A_.set_kgroup_index(warp_mma_k % Base::kWarpGemmIterations);
-        this->warp_tile_iterator_B_.set_kgroup_index(warp_mma_k % Base::kWarpGemmIterations);
-
-        this->warp_tile_iterator_A_.load(warp_frag_A);
-        this->warp_tile_iterator_B_.load(warp_frag_B);
-
-        ++this->warp_tile_iterator_A_;
-        ++this->warp_tile_iterator_B_;
-
-        warp_mma(accum, warp_frag_A, warp_frag_B, accum);
-      }
-
-      // Add negative offsets to return smem load iterators to the 'start' of the shared memory
-      this->warp_tile_iterator_A_.add_tile_offset({0, -Policy::kPartitionsK * Base::kWarpGemmIterations});
-      this->warp_tile_iterator_B_.add_tile_offset({-Policy::kPartitionsK * Base::kWarpGemmIterations, 0});
-
-      __syncthreads();
-
-      iterator_A.load(tb_frag_A);
-      iterator_B.load(tb_frag_B);
-
-      ++iterator_A;
-      ++iterator_B;
-
-      // Avoid reading out of bounds if this was the last loop iteration
-      iterator_A.clear_mask(gemm_k_iterations <= 2);
-      iterator_B.clear_mask(gemm_k_iterations <= 2);
-    }
-
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace gemm
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_softmax_mainloop_fusion_multistage.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_softmax_mainloop_fusion_multistage.h
deleted file mode 100755
index bd793fc84..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_softmax_mainloop_fusion_multistage.h
+++ /dev/null
@@ -1,756 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
-
-    It loads two loop invariant vectors, norm and sum, in the prologue and
-    stores them in the register file.  We will call elementwise operation to
-    apply norm and sum between ldmatrix and warp mma.
-*/
-
-#pragma once
-
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/arch/memory.h"
-#include "cutlass/array.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/transform/threadblock/predicated_scale_bias_vector_iterator.h"
-#include "cutlass/gemm/threadblock/mma_base.h"
-#include "cutlass/gemm/warp/softmax_scale_bias_transform.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Structure to compute the matrix product targeting CUDA cores and SIMT math
-/// instructions.
-template <
-    /// Size of the Gemm problem - concept: gemm::GemmShape<>
-    typename Shape_,
-    /// Policy describing tuning details (concept: MmaPolicy)
-    typename Policy_,
-    /// Number of stages,
-    int Stages,
-    /// Used for partial specialization
-    typename Enable = bool>
-class MmaMainloopFusionBase {
- public:
-  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-
-  ///< Policy describing tuning details
-  using Policy = Policy_;
-
-  //
-  // Dependent types
-  //
-
-  /// Warp-level Mma
-  using Operator = typename Policy::Operator;
-
-  /// Shape describing the overall GEMM computed from shared memory
-  /// by each warp.
-  using WarpGemm = typename Policy::Operator::Shape;
-
-  /// Shape describing the number of warps filling the CTA
-  using WarpCount = cutlass::gemm::GemmShape<Shape::kM / WarpGemm::kM,
-                                             Shape::kN / WarpGemm::kN,
-                                             Shape::kK / WarpGemm::kK>;
-
-  /// Number of warp-level GEMM oeprations
-  static int const kWarpGemmIterations =
-      (WarpGemm::kK / Operator::Policy::MmaShape::kK);
-
-  /// Number of stages
-  static int const kStages = Stages;
-
-  /// Tensor reference to the A operand
-  using TensorRefA = TensorRef<typename Operator::ElementA, typename Operator::LayoutA>;
-
-  /// Tensor reference to the B operand
-  using TensorRefB = TensorRef<typename Operator::ElementB, typename Operator::LayoutB>;
-
-  //
-  // Nested structs
-  //
-
-  /// Shared storage object needed by threadblock-scoped GEMM
-  class SharedStorage {
-   public:
-    //
-    // Type definitions
-    //
-
-    /// Shape of the A matrix operand in shared memory
-    using ShapeA = MatrixShape<Shape::kM + Policy::SmemPaddingA::kRow,
-                               Shape::kK * kStages +
-                                   Policy::SmemPaddingA::kColumn>;
-
-    /// Shape of the B matrix operand in shared memory
-    using ShapeB =
-        MatrixShape<Shape::kK * kStages + Policy::SmemPaddingB::kRow,
-                    Shape::kN + Policy::SmemPaddingB::kColumn>;
-
-   public:
-    //
-    // Data members
-    //
-
-    /// Buffer for A operand
-    AlignedBuffer<typename Operator::ElementA, ShapeA::kCount> operand_A;
-
-    /// Buffer for B operand
-    AlignedBuffer<typename Operator::ElementB, ShapeB::kCount> operand_B;
-
-   public:
-
-    //
-    // Methods
-    //
-
-    /// Returns a layout object for the A matrix
-    CUTLASS_DEVICE
-    static typename Operator::LayoutA LayoutA() {
-      return Operator::LayoutA::packed({ShapeA::kRow, ShapeA::kColumn});
-    }
-
-    /// Returns a layout object for the B matrix
-    CUTLASS_HOST_DEVICE
-    static typename Operator::LayoutB LayoutB() {
-      return Operator::LayoutB::packed({ShapeB::kRow, ShapeB::kColumn});
-    }
-
-    /// Returns a TensorRef to the A operand
-    CUTLASS_HOST_DEVICE
-    TensorRefA operand_A_ref() {
-      return TensorRefA{operand_A.data(), LayoutA()};
-    }
-
-    /// Returns a TensorRef to the B operand
-    CUTLASS_HOST_DEVICE
-    TensorRefB operand_B_ref() {
-      return TensorRefB{operand_B.data(), LayoutB()};
-    }
-  };
-
- protected:
-
-  //
-  // Data members
-  //
-
-  /// Iterator to load a warp-scoped tile of A operand from shared memory
-  typename Operator::IteratorA warp_tile_iterator_A_;
-
-  /// Iterator to load a warp-scoped tile of B operand from shared memory
-  typename Operator::IteratorB warp_tile_iterator_B_;
-
-public:
-
-  /// Construct from tensor references
-  CUTLASS_DEVICE
-  MmaMainloopFusionBase(
-      ///< Shared storage needed for internal use by threadblock-scoped GEMM
-      SharedStorage &shared_storage,
-      ///< ID within the threadblock
-      int thread_idx,
-      ///< ID of warp
-      int warp_idx,
-      ///< ID of each thread within a warp
-      int lane_idx)
-      : warp_tile_iterator_A_(shared_storage.operand_A_ref(), lane_idx),
-        warp_tile_iterator_B_(shared_storage.operand_B_ref(), lane_idx) {}
-};
-
-
-/// Structure to compute the matrix product targeting CUDA cores and SIMT math
-/// instructions.
-template <
-    /// Size of the Gemm problem - concept: gemm::GemmShape<>
-    typename Shape_,
-    /// Iterates over tiles of A operand in global memory
-    //  (concept: ReadableTileIterator | ForwardTileIterator |
-    //  MaskedTileIterator)
-    typename IteratorA_,
-    /// Iterates over tiles of A operand in shared memory
-    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-    typename SmemIteratorA_,
-    /// Cache operation for operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Iterates over tiles of B operand in global memory
-    //  (concept: ReadableTileIterator | ForwardTileIterator |
-    //  MaskedTileIterator)
-    typename IteratorB_,
-    /// Iterates over tiles of B operand in shared memory
-    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-    typename SmemIteratorB_,
-    /// Cache operation for operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB,
-    /// Iterates over vectors of var and mean vector in global memory
-    //  (concept: ReadableTileIterator | ForwardTileIterator |
-    //  MaskedTileIterator)
-    typename IteratorNormSum_,
-    /// Data type of accumulator matrix
-    typename ElementC_,
-    /// Data type of accumulator matrix
-    typename LayoutC_,
-    /// Policy describing tuning details (concept: MmaPolicy)
-    typename Policy_,
-    /// Number of stages,
-    int Stages,
-    /// Whether problem has been transformed. This determines to which operand
-    /// the softmax is applied.
-    bool InternalTranspose,
-    /// Use zfill or predicate for out-of-bound cp.async
-    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone,
-    /// Used for partial specialization
-    typename Enable = bool>
-class MmaSoftmaxMainloopFusionMultistage : 
-  public MmaMainloopFusionBase<Shape_, Policy_, Stages> {
-public:
-  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-  ///< Iterates over tiles of A operand in global memory
-  using IteratorA = IteratorA_;
-  ///< Iterates over tiles of B operand in global memory
-  using IteratorB = IteratorB_;
-  ///< Iterates over tiles of the var and mean vectors in global memory
-  using IteratorNormSum = IteratorNormSum_;
-  ///< Policy describing tuning details
-  using Policy = Policy_;
-
-  ///< Base class
-  using Base = MmaMainloopFusionBase<Shape_, Policy, Stages>;
-
-  ///< Data type of accumulator matrix
-  using ElementC = ElementC_;
-  ///< Layout of accumulator matrix
-  using LayoutC = LayoutC_;
-
-  using SmemIteratorA = SmemIteratorA_;
-  using SmemIteratorB = SmemIteratorB_;
-
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
-
-  //
-  // Dependent types
-  //
-
-  /// Fragment of accumulator tile
-  using FragmentC = typename Policy::Operator::FragmentC;
-
-  /// Warp-level Mma
-  using Operator = typename Policy::Operator;
-
-  /// Minimum architecture is Sm80 to support cp.async
-  using ArchTag = arch::Sm80;
-  
-  /// Complex transform on A operand
-  static ComplexTransform const kTransformA = Operator::kTransformA;
-
-  /// Complex transform on B operand
-  static ComplexTransform const kTransformB = Operator::kTransformB;
-
-  /// Internal structure exposed for introspection.
-  struct Detail {
-
-    static_assert(Base::kWarpGemmIterations > 1,
-                  "The pipelined structure requires at least two warp-level "
-                  "GEMM operations.");
-
-    /// Number of cp.async instructions to load one stage of operand A
-    static int const AsyncCopyIterationsPerStageA =
-        IteratorA::ThreadMap::Iterations::kCount;
-
-    /// Number of cp.async instructions to load one stage of operand B
-    static int const AsyncCopyIterationsPerStageB =
-        IteratorB::ThreadMap::Iterations::kCount;
-
-    /// Number of stages
-    static int const kStages = Stages;
-
-    /// Number of cp.async instructions to load on group of operand A
-    static int const kAccessesPerGroupA =
-        (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
-
-    /// Number of cp.async instructions to load on group of operand B
-    static int const kAccessesPerGroupB =
-        (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
-  };
-
- private:
-
-  using WarpLoadedFragmentA = typename Operator::FragmentA;
-  using WarpLoadedFragmentB = typename Operator::FragmentB;
-  using WarpTransformedFragmentA = typename Operator::TransformedFragmentA;
-  using WarpTransformedFragmentB = typename Operator::TransformedFragmentB;
-
-  using WarpLoadedFragmentNormSum = typename IteratorNormSum::Fragment;
-
-  static bool const kInternalTranspose = InternalTranspose;
-
-  using SoftmaxFragment = typename platform::conditional<kInternalTranspose,
-                                                         WarpTransformedFragmentB,
-                                                         WarpTransformedFragmentA>::type;
-
-
- private:
-
-  //
-  // Data members
-  //
-
-  /// Iterator to write threadblock-scoped tile of A operand to shared memory
-  SmemIteratorA smem_iterator_A_;
-
-  /// Iterator to write threadblock-scoped tile of B operand to shared memory
-  SmemIteratorB smem_iterator_B_;
-
-  int warp_idx_m_;
-
-  int warp_idx_n_;
-
-public:
-
-  /// Construct from tensor references
-  CUTLASS_DEVICE
-  MmaSoftmaxMainloopFusionMultistage(
-      ///< Shared storage needed for internal use by threadblock-scoped GEMM
-      typename Base::SharedStorage &shared_storage,
-      ///< ID within the threadblock
-      int thread_idx,
-      ///< ID of warp
-      int warp_idx,
-      ///< ID of each thread within a warp
-      int lane_idx
-    ):
-      Base(shared_storage, thread_idx, warp_idx, lane_idx),
-      smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
-      smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx)
-  {
-    // Compute warp location within threadblock tile by mapping the warp_id to
-    // three coordinates:
-    //   _m: the warp's position within the threadblock along the M dimension
-    //   _n: the warp's position within the threadblock along the N dimension
-    //   _k: the warp's position within the threadblock along the K dimension
-
-    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
-    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
-
-    warp_idx_m_ = warp_idx_mn % Base::WarpCount::kM;
-    warp_idx_n_ = warp_idx_mn / Base::WarpCount::kM;
-
-    // Add per-warp offsets in units of warp-level tiles
-    this->warp_tile_iterator_A_.add_tile_offset(
-        {warp_idx_m_, Base::kWarpGemmIterations * warp_idx_k});
-    this->warp_tile_iterator_B_.add_tile_offset(
-        {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n_});
-  }
-
-  CUTLASS_DEVICE
-  void copy_tiles_and_advance(IteratorA &iterator_A,
-                              IteratorB &iterator_B,
-                              int group_start_A = 0, int group_start_B = 0) {
-    iterator_A.set_iteration_index(group_start_A *
-                                   IteratorA::kAccessesPerVector);
-    this->smem_iterator_A_.set_iteration_index(group_start_A);
-
-    // Async Copy for operand A
-    CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) {
-      if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) {
-        typename IteratorA::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorA::AccessType *>(
-                this->smem_iterator_A_.get());
-
-        int const kSrcBytes = sizeof_bits<typename IteratorA::Element>::value *
-                              IteratorA::ThreadMap::kElementsPerAccess /
-                              IteratorA::kAccessesPerVector / 8;
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
-          auto gmem_ptr = iterator_A.get();
-
-          if (SharedMemoryClear == SharedMemoryClearOption::kZfill) {
-            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
-                dst_ptr + v, gmem_ptr, iterator_A.valid());
-          } else {
-            cutlass::arch::cp_async<kSrcBytes, kCacheOpA>(
-                dst_ptr + v, gmem_ptr, iterator_A.valid());
-          }
-
-          ++iterator_A;
-        }
-
-        ++this->smem_iterator_A_;
-      }
-    }
-
-    iterator_B.set_iteration_index(group_start_B *
-                                   IteratorB::kAccessesPerVector);
-    this->smem_iterator_B_.set_iteration_index(group_start_B);
-
-    // Async Copy for operand B
-    CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) {
-      if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) {
-        typename IteratorB::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorB::AccessType *>(
-                this->smem_iterator_B_.get());
-
-        int const kSrcBytes = sizeof_bits<typename IteratorB::Element>::value *
-                              IteratorB::ThreadMap::kElementsPerAccess /
-                              IteratorB::kAccessesPerVector / 8;
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
-          auto gmem_ptr = iterator_B.get();
-
-          if (SharedMemoryClear == SharedMemoryClearOption::kZfill) {
-            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
-                dst_ptr + v, gmem_ptr, iterator_B.valid());
-          } else {
-            cutlass::arch::cp_async<kSrcBytes, kCacheOpB>(
-                dst_ptr + v, gmem_ptr, iterator_B.valid());
-          }
-
-          ++iterator_B;
-        }
-        ++this->smem_iterator_B_;
-      }
-    }
-  }
-
-  /// Perform a threadblock-scoped matrix multiply-accumulate
-  CUTLASS_DEVICE
-  void operator()(
-      ///< problem size of GEMM
-      int gemm_k_iterations,
-      ///< destination accumulator tile
-      FragmentC &accum,
-      ///< iterator over A operand in global memory
-      IteratorA iterator_A,
-      ///< iterator over B operand in global memory
-      IteratorB iterator_B,
-      ///< iterator over B operand in global memory
-      IteratorNormSum iterator_norm_sum,
-      ///< initial value of accumulator
-      FragmentC const &src_accum) {
-
-    //
-    // Prologue
-    //
-    // Issue several complete stages
-
-    WarpLoadedFragmentNormSum warp_loaded_frag_norm_sum;
-    iterator_norm_sum.add_tile_offset({0, warp_idx_m_});
-    iterator_norm_sum.load(warp_loaded_frag_norm_sum);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int stage = 0; stage < Base::kStages - 1;
-         ++stage, --gemm_k_iterations) {
-
-      iterator_A.clear_mask(gemm_k_iterations == 0);
-      iterator_B.clear_mask(gemm_k_iterations == 0);
-
-      iterator_A.set_iteration_index(0);
-      this->smem_iterator_A_.set_iteration_index(0);
-
-      // Async Copy for operand A
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) {
-        typename IteratorA::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorA::AccessType *>(
-                this->smem_iterator_A_.get());
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
-          int const kSrcBytes =
-              sizeof_bits<typename IteratorA::Element>::value *
-              IteratorA::ThreadMap::kElementsPerAccess /
-              IteratorA::kAccessesPerVector / 8;
-
-          int src_bytes = (iterator_A.valid() ? kSrcBytes : 0);
-
-          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
-              dst_ptr + v, iterator_A.get(), iterator_A.valid());
-
-          ++iterator_A;
-        }
-
-        ++this->smem_iterator_A_;
-      }
-
-      iterator_B.set_iteration_index(0);
-      this->smem_iterator_B_.set_iteration_index(0);
-
-      // Async Copy for operand B
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) {
-        typename IteratorB::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorB::AccessType *>(
-                this->smem_iterator_B_.get());
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
-          int const kSrcBytes =
-              sizeof_bits<typename IteratorB::Element>::value *
-              IteratorB::ThreadMap::kElementsPerAccess /
-              IteratorB::kAccessesPerVector / 8;
-
-          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
-              dst_ptr + v, iterator_B.get(), iterator_B.valid());
-
-          ++iterator_B;
-        }
-
-        ++this->smem_iterator_B_;
-      }
-
-      // Move to the next stage
-      iterator_A.add_tile_offset({0, 1});
-      iterator_B.add_tile_offset({1, 0});
-
-      this->smem_iterator_A_.add_tile_offset({0, 1});
-      this->smem_iterator_B_.add_tile_offset({1, 0});
-
-      // Defines the boundary of a stage of cp.async.
-      cutlass::arch::cp_async_fence();
-    }
-
-    // Perform accumulation in the 'd' output operand
-    accum = src_accum;
-
-    // Waits until kStages-2 stages have committed.
-    cutlass::arch::cp_async_wait<Base::kStages - 2>();
-    __syncthreads();
-
-    // Pair of fragments used to overlap shared memory loads and math
-    // instructions
-    WarpLoadedFragmentA warp_loaded_frag_A[2];
-    WarpLoadedFragmentB warp_loaded_frag_B[2];
-    WarpTransformedFragmentA warp_transformed_frag_A[2];
-    WarpTransformedFragmentB warp_transformed_frag_B[2];
-
-    Operator warp_mma;
-    cutlass::gemm::warp::SoftmaxScaleBiasTransform<
-        SoftmaxFragment, WarpLoadedFragmentNormSum> elementwise_transform;
-
-    this->warp_tile_iterator_A_.set_kgroup_index(0);
-    this->warp_tile_iterator_B_.set_kgroup_index(0);
-
-    this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]);
-    this->warp_tile_iterator_B_.load(warp_loaded_frag_B[0]);
-
-    ++this->warp_tile_iterator_A_;
-    ++this->warp_tile_iterator_B_;
-
-    iterator_A.clear_mask(gemm_k_iterations == 0);
-    iterator_B.clear_mask(gemm_k_iterations == 0);
-
-    // Start issuing the first group of the next stage outside of the mainloop
-    copy_tiles_and_advance(iterator_A, iterator_B);
-
-    int smem_write_stage_idx = Base::kStages - 1;
-    int smem_read_stage_idx = 0;
-
-    warp_mma.transform(warp_transformed_frag_A[0], warp_transformed_frag_B[0],
-                       warp_loaded_frag_A[0], warp_loaded_frag_B[0]);
-
-    if (kInternalTranspose) {
-      elementwise_transform(warp_transformed_frag_B[0],
-                         warp_loaded_frag_norm_sum);
-    } else {
-      elementwise_transform(warp_transformed_frag_A[0],
-                         warp_loaded_frag_norm_sum);
-    }
-
-    //
-    // Mainloop
-    //
-
-    CUTLASS_GEMM_LOOP
-    for (; gemm_k_iterations > (-Base::kStages + 1);) {
-      //
-      // Loop over GEMM K dimension
-      //
-
-      // Computes a warp-level GEMM on data held in shared memory
-      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
-      CUTLASS_PRAGMA_UNROLL
-      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations;
-           ++warp_mma_k) {
-
-        // Load warp-level tiles from shared memory, wrapping to k offset if
-        // this is the last group as the case may be.
-
-        this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
-        this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
-        
-        this->warp_tile_iterator_A_.load(warp_loaded_frag_A[(warp_mma_k + 1) % 2]);
-        this->warp_tile_iterator_B_.load(warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
-
-        ++this->warp_tile_iterator_A_;
-        ++this->warp_tile_iterator_B_;
-
-        if (warp_mma_k > 0) {
-          warp_mma.transform(warp_transformed_frag_A[warp_mma_k % 2],
-                             warp_transformed_frag_B[warp_mma_k % 2],
-                             warp_loaded_frag_A[warp_mma_k % 2],
-                             warp_loaded_frag_B[warp_mma_k % 2]);
-
-              if (kInternalTranspose) {
-                elementwise_transform(warp_transformed_frag_B[warp_mma_k % 2],
-                                  warp_loaded_frag_norm_sum);
-              } else {
-                elementwise_transform(warp_transformed_frag_A[warp_mma_k % 2],
-                                  warp_loaded_frag_norm_sum);
-              }
-        }
-
-        // Issue global->shared copies for the next stage
-        int group_start_iteration_A, group_start_iteration_B;
-
-        if (warp_mma_k + 1 == Base::kWarpGemmIterations) {
-          group_start_iteration_A = 0;
-          group_start_iteration_B = 0;
-        } else {
-          group_start_iteration_A =
-              (warp_mma_k + 1) * Detail::kAccessesPerGroupA;
-          group_start_iteration_B =
-              (warp_mma_k + 1) * Detail::kAccessesPerGroupB;
-        }
-
-        copy_tiles_and_advance(iterator_A, iterator_B,
-                               group_start_iteration_A,
-                               group_start_iteration_B);
-
-        warp_mma(
-          accum, 
-          warp_transformed_frag_A[warp_mma_k % 2],
-          warp_transformed_frag_B[warp_mma_k % 2], 
-          accum
-        );
-
-        if (warp_mma_k + 2 == Base::kWarpGemmIterations) {
-
-          // Inserts a memory fence between stages of cp.async instructions.
-          cutlass::arch::cp_async_fence();
-
-          // Waits until kStages-2 stages have committed.
-          arch::cp_async_wait<Base::kStages - 2>();
-          __syncthreads();
-
-          // Move to the next stage
-          iterator_A.add_tile_offset({0, 1});
-          iterator_B.add_tile_offset({1, 0});
-
-          this->smem_iterator_A_.add_tile_offset({0, 1});
-          this->smem_iterator_B_.add_tile_offset({1, 0});
-
-          // Add negative offsets to return iterators to the 'start' of the
-          // circular buffer in shared memory
-          if (smem_write_stage_idx == (Base::kStages - 1)) {
-            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
-            this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
-            smem_write_stage_idx = 0;
-          } else {
-            ++smem_write_stage_idx;
-          }
-
-          if (smem_read_stage_idx == (Base::kStages - 1)) {
-            this->warp_tile_iterator_A_.add_tile_offset(
-                {0, -Base::kStages * Policy::kPartitionsK *
-                        Base::kWarpGemmIterations});
-            this->warp_tile_iterator_B_.add_tile_offset(
-                {-Base::kStages * Policy::kPartitionsK *
-                     Base::kWarpGemmIterations,
-                 0});
-            smem_read_stage_idx = 0;
-          } else {
-            ++smem_read_stage_idx;
-          }
-
-          --gemm_k_iterations;
-          iterator_A.clear_mask(gemm_k_iterations == 0);
-          iterator_B.clear_mask(gemm_k_iterations == 0);
-        }
-
-        // Do any conversions feeding the first stage at the end of the loop so
-        // we can start right away on mma instructions
-        if (warp_mma_k + 1 == Base::kWarpGemmIterations) {
-          warp_mma.transform(warp_transformed_frag_A[(warp_mma_k + 1) % 2],
-                             warp_transformed_frag_B[(warp_mma_k + 1) % 2],
-                             warp_loaded_frag_A[(warp_mma_k + 1) % 2],
-                             warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
-
-              if (kInternalTranspose) {
-                elementwise_transform(warp_transformed_frag_B[(warp_mma_k + 1) % 2],
-                                  warp_loaded_frag_norm_sum);
-              } else {
-                elementwise_transform(warp_transformed_frag_A[(warp_mma_k + 1) % 2],
-                                  warp_loaded_frag_norm_sum);
-              }
-        }
-      }
-
-    }
-    
-    if (SharedMemoryClear == SharedMemoryClearOption::kZfill) {
-      // commit and drain all pending and predicated cp.async pnz from the GEMM mainloop
-      cutlass::arch::cp_async_fence();
-      cutlass::arch::cp_async_wait<0>();
-      __syncthreads();
-    }
-
-    // Commit and drain all pending and predicated cp.async pnz from the GEMM mainloop
-    cutlass::arch::cp_async_fence();
-    cutlass::arch::cp_async_wait<0>();
-    __syncthreads();
-
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace gemm
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_sparse_base.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_sparse_base.h
deleted file mode 100755
index bb10c0a8f..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_sparse_base.h
+++ /dev/null
@@ -1,273 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
-*/
-
-#pragma once
-
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/arch/memory.h"
-#include "cutlass/array.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/numeric_types.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Policy object describing MmaTensorOp
-template <
-    /// Warp-level GEMM operator (concept: gemm::warp::Mma)
-    typename Operator_,
-    /// Padding used for A operand in shared memory (concept: MatrixShape)
-    typename SmemPaddingA_,
-    /// Padding used for B operand in shared memory (concept: MatrixShape)
-    typename SmemPaddingB_,
-    /// Padding used for E operand in shared memory (concept: MatrixShape)
-    typename SmemPaddingE_,
-    /// Number of partitions of K dimension of GEMM
-    int PartitionsK = 1>
-struct SparseMmaPolicy {
-  /// Warp-level GEMM operator (concept: gemm::warp::MmaTensorOp or gemm::warp::MmaSimt)
-  using Operator = Operator_;
-
-  /// Padding used for A operand in shared memory
-  using SmemPaddingA = SmemPaddingA_;
-
-  /// Padding used for B operand in shared memory
-  using SmemPaddingB = SmemPaddingB_;
-
-  /// Padding used for B operand in shared memory
-  using SmemPaddingE = SmemPaddingE_;
-
-  /// Number of partitions of K dimension
-  static int const kPartitionsK = PartitionsK;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Structure to compute the matrix product targeting CUDA cores and SIMT math
-/// instructions.
-template <
-    /// Size of the Gemm problem - concept: gemm::GemmShape<>
-    typename Shape_,
-    /// Policy describing tuning details (concept: MmaPolicy)
-    typename Policy_,
-    /// Number of stages,
-    int Stages,
-    /// Used for partial specialization
-    typename Enable = bool>
-class SparseMmaBase {
- public:
-  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-
-  ///< Policy describing tuning details
-  using Policy = Policy_;
-
-  //
-  // Dependent types
-  //
-
-  /// Warp-level Mma
-  using Operator = typename Policy::Operator;
-
-  /// Shape describing the overall GEMM computed from shared memory
-  /// by each warp.
-  using WarpGemm = typename Policy::Operator::Shape;
-
-  /// Shape describing the number of warps filling the CTA
-  using WarpCount = GemmShape<Shape::kM / WarpGemm::kM,
-                              Shape::kN / WarpGemm::kN,
-                              Shape::kK / WarpGemm::kK>;
-
-  /// Number of warp-level GEMM oeprations
-  static int const kWarpGemmIterations =
-      (WarpGemm::kK / Operator::Policy::MmaShape::kK);
-
-  static_assert(kWarpGemmIterations > 1,
-                "The pipelined structure requires at least two warp-level "
-                "GEMM operations.");
-
-  static_assert((kWarpGemmIterations % 2) == 0,
-                "Inner loop iteration must be an even number.");
-
-  /// Number of stages
-  static int const kStages = Stages;
-
-  static int const kSparse = Operator::kSparse;
-
-  static int const kElementsPerElementE = Operator::kElementsPerElementE;
-
-  /// Tensor reference to the A operand
-  using TensorRefA = TensorRef<typename Operator::ElementA, typename Operator::LayoutA>;
-
-  /// Tensor reference to the B operand
-  using TensorRefB = TensorRef<typename Operator::ElementB, typename Operator::LayoutB>;
-
-  /// Tensor reference to the E operand
-  using TensorRefE = TensorRef<typename Operator::ElementE, typename Operator::LayoutE>;
-
-  //
-  // Nested structs
-  //
-
-  /// Shared storage object needed by threadblock-scoped GEMM
-  class SharedStorage {
-   public:
-    //
-    // Type definitions
-    //
-
-    /// Shape of the A matrix operand in shared memory
-    using ShapeA = MatrixShape<Shape::kM + Policy::SmemPaddingA::kRow,
-                               Shape::kK / kSparse * kStages +
-                                   Policy::SmemPaddingA::kColumn>;
-
-    /// Shape of the B matrix operand in shared memory
-    using ShapeB =
-        MatrixShape<Shape::kK * kStages + Policy::SmemPaddingB::kRow,
-                    Shape::kN + Policy::SmemPaddingB::kColumn>;
-
-    /// Shape of the E matrix operand in shared memory
-    using ShapeE =
-        MatrixShape<Shape::kM * 2 + Policy::SmemPaddingE::kRow,
-                    Shape::kK / kSparse / kElementsPerElementE / 2 * kStages +
-                        Policy::SmemPaddingE::kColumn>;
-
-   public:
-    //
-    // Data members
-    //
-
-    /// Buffer for A operand
-    AlignedBuffer<typename Operator::ElementA, ShapeA::kCount> operand_A;
-
-    /// Buffer for B operand
-    AlignedBuffer<typename Operator::ElementB, ShapeB::kCount> operand_B;
-
-    /// Buffer for E operand
-    AlignedBuffer<typename Operator::ElementE, ShapeE::kCount> operand_E;
-
-   public:
-
-    //
-    // Methods
-    //
-
-    /// Returns a layout object for the A matrix
-    CUTLASS_DEVICE
-    static typename Operator::LayoutA LayoutA() {
-      return Operator::LayoutA::packed({ShapeA::kRow, ShapeA::kColumn});
-    }
-
-    /// Returns a layout object for the B matrix
-    CUTLASS_HOST_DEVICE
-    static typename Operator::LayoutB LayoutB() {
-      return Operator::LayoutB::packed({ShapeB::kRow, ShapeB::kColumn});
-    }
-
-    /// Returns a layout object for the E matrix
-    CUTLASS_HOST_DEVICE
-    static typename Operator::LayoutE LayoutE() {
-      return Operator::LayoutE::packed({ShapeE::kRow, ShapeE::kColumn});
-    }
-
-    /// Returns a TensorRef to the A operand
-    CUTLASS_HOST_DEVICE
-    TensorRefA operand_A_ref() {
-      return TensorRefA{operand_A.data(), LayoutA()};
-    }
-
-    /// Returns a TensorRef to the B operand
-    CUTLASS_HOST_DEVICE
-    TensorRefB operand_B_ref() {
-      return TensorRefB{operand_B.data(), LayoutB()};
-    }
-
-    /// Returns a TensorRef to the E operand
-    CUTLASS_HOST_DEVICE
-    TensorRefE operand_E_ref() {
-      return TensorRefE{operand_E.data(), LayoutE()};
-    }
-  };
-
- protected:
-
-  //
-  // Data members
-  //
-
-  /// Iterator to load a warp-scoped tile of A operand from shared memory
-  typename Operator::IteratorA warp_tile_iterator_A_;
-
-  /// Iterator to load a warp-scoped tile of B operand from shared memory
-  typename Operator::IteratorB warp_tile_iterator_B_;
-
-  /// Iterator to load a warp-scoped tile of E operand from shared memory
-  typename Operator::IteratorE warp_tile_iterator_E_;
-
-
-public:
-
-  /// Construct from tensor references
-  CUTLASS_DEVICE
-  SparseMmaBase(
-      ///< Shared storage needed for internal use by threadblock-scoped GEMM
-      SharedStorage &shared_storage,
-      ///< ID within the threadblock
-      int thread_idx,
-      ///< ID of warp
-      int warp_idx,
-      ///< ID of each thread within a warp
-      int lane_idx
-    ):
-      warp_tile_iterator_A_(shared_storage.operand_A_ref(), lane_idx),
-      warp_tile_iterator_B_(shared_storage.operand_B_ref(), lane_idx),
-      warp_tile_iterator_E_(shared_storage.operand_E_ref(), lane_idx) {
-
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace gemm
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_sparse_multistage.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_sparse_multistage.h
deleted file mode 100755
index 8113583d6..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_sparse_multistage.h
+++ /dev/null
@@ -1,668 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
-*/
-
-#pragma once
-
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/arch/memory.h"
-#include "cutlass/array.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/numeric_types.h"
-
-#include "cutlass/gemm/threadblock/mma_sparse_base.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Structure to compute the matrix product targeting CUDA cores and SIMT math
-/// instructions.
-template <
-    /// Size of the Gemm problem - concept: gemm::GemmShape<>
-    typename Shape_,
-    /// Iterates over tiles of A operand in global memory
-    //  (concept: ReadableTileIterator | ForwardTileIterator |
-    //  MaskedTileIterator)
-    typename IteratorA_,
-    /// Iterates over tiles of A operand in shared memory
-    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-    typename SmemIteratorA_,
-    /// Cache operation for operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Iterates over tiles of B operand in global memory
-    //  (concept: ReadableTileIterator | ForwardTileIterator |
-    //  MaskedTileIterator)
-    typename IteratorB_,
-    /// Iterates over tiles of B operand in shared memory
-    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-    typename SmemIteratorB_,
-    /// Cache operation for operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB,
-    /// Data type of accumulator matrix
-    typename ElementC_,
-    /// Data type of accumulator matrix
-    typename LayoutC_,
-    /// Iterates over tiles of E operand in global memory
-    //  (concept: ReadableTileIterator | ForwardTileIterator |
-    //  MaskedTileIterator)
-    typename IteratorE_,
-    /// Iterates over tiles of E operand in shared memory
-    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-    typename SmemIteratorE_,
-    /// Cache operation for operand E
-    cutlass::arch::CacheOperation::Kind CacheOpE,
-    /// Policy describing tuning details (concept: MmaPolicy)
-    typename Policy_,
-    /// Number of stages,
-    int Stages,
-    /// Used for partial specialization
-    typename Enable = bool>
-class SparseMmaMultistage : 
-  public SparseMmaBase<Shape_, Policy_, Stages> {
-public:
-  ///< Base class
-  using Base = SparseMmaBase<Shape_, Policy_, Stages>;
-  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-  ///< Iterates over tiles of A operand in global memory
-  using IteratorA = IteratorA_;
-  ///< Iterates over tiles of B operand in global memory
-  using IteratorB = IteratorB_;
-  ///< Iterates over tiles of E operand in global memory
-  using IteratorE = IteratorE_;
-  ///< Data type of accumulator matrix
-  using ElementC = ElementC_;
-  ///< Layout of accumulator matrix
-  using LayoutC = LayoutC_;
-  ///< Policy describing tuning details
-  using Policy = Policy_;
-
-  using SmemIteratorA = SmemIteratorA_;
-  using SmemIteratorB = SmemIteratorB_;
-  using SmemIteratorE = SmemIteratorE_;
-
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpE = CacheOpE;
-
-  static int const kSparse = Policy::Operator::kSparse;
-  static int const kMetaSizeInBits = Policy::Operator::kMetaSizeInBits;
-  static int const kMaxID2 = Policy::Operator::kMaxID2;
-  static int const kElementsPerElementE =
-      Policy::Operator::kElementsPerElementE;
-
-  //
-  // Dependent types
-  //
-
-  /// Fragment of accumulator tile
-  using FragmentC = typename Policy::Operator::FragmentC;
-
-  /// Warp-level Mma
-  using Operator = typename Policy::Operator;
-
-  /// ElementE
-  using ElementE = typename IteratorE::Element;
-
-  /// LayoutE
-  using LayoutE = typename IteratorE::Layout; 
-
-  /// Minimum architecture is Sm80 to support cp.async
-  using ArchTag = arch::Sm80;
-  
-  /// Complex transform on A operand
-  static ComplexTransform const kTransformA = Operator::kTransformA;
-
-  /// Complex transform on B operand
-  static ComplexTransform const kTransformB = Operator::kTransformB;
-
-  /// Internal structure exposed for introspection.
-  struct Detail {
-
-    /// Number of async copies to load one stage of operand A
-    static int const TBLoadIterationsA =
-        IteratorA::ThreadMap::Iterations::kCount;
-
-    /// Number of async copies to load one stage of operand B
-    static int const TBLoadIterationsB =
-        IteratorB::ThreadMap::Iterations::kCount;
-
-    /// Number of async copies to load one stage of operand E
-    static int const TBLoadIterationsE =
-        IteratorE::ThreadMap::Iterations::kCount;
-
-    /// Number of stages
-    static int const kStages = Stages;
-
-    /// Number of async copies to load one group of operand A
-    static int const kAccessesPerGroupA =
-        (TBLoadIterationsA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
-
-    /// Number of async copies to load one group of operand B
-    static int const kAccessesPerGroupB =
-        (TBLoadIterationsB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
-
-    /// Number of async copies to load one group of operand E
-    static int const kAccessesPerGroupE =
-        (TBLoadIterationsE + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
-
-    /// E operand is tiny.  For the most of time, not all the warps are needed
-    /// to load it from the global memory.
-    static int const kValidWarps = IteratorE::ThreadMap::kThreads / 32;
-
-    /// B operand is twice as big as A which brings very high register pressure.
-    /// We have to sacrifice the double buffer when the warp tile size is big.
-    static int const kBBufferSize =
-        ((sizeof(typename Operator::ElementC) == 4) &&
-         ((platform::is_same<typename Operator::Policy::Operator::ElementA,
-                             typename Operator::ElementA>::value &&
-           platform::is_same<typename Operator::Policy::Operator::ElementB,
-                             typename Operator::ElementB>::value)) &&
-         (Operator::Shape::kM >= 64 && Operator::Shape::kN >= 64))
-            ? 1
-            : 2;
-  };
-
- private:
-
-  using WarpLoadedFragmentA = typename Operator::FragmentA;
-  using WarpLoadedFragmentB = typename Operator::FragmentB;
-  using WarpTransformedFragmentA = typename Operator::TransformedFragmentA;
-  using WarpTransformedFragmentB = typename Operator::TransformedFragmentB;
-  using WarpFragmentE = typename Operator::FragmentE;
-
- private:
-
-  //
-  // Data members
-  //
-
-  /// Iterator to write threadblock-scoped tile of A operand to shared memory
-  SmemIteratorA smem_iterator_A_;
-
-  /// Iterator to write threadblock-scoped tile of B operand to shared memory
-  SmemIteratorB smem_iterator_B_;
-
-  /// Iterator to write threadblock-scoped tile of E operand to shared memory
-  SmemIteratorE smem_iterator_E_;
-
-  /// Warp id
-  bool is_warp_valid_;
-
-public:
-
-  /// Construct from tensor references
-  CUTLASS_DEVICE
-  SparseMmaMultistage(
-      ///< Shared storage needed for internal use by threadblock-scoped GEMM
-      typename Base::SharedStorage &shared_storage,
-      ///< ID within the threadblock
-      int thread_idx,
-      ///< ID of warp
-      int warp_idx,
-      ///< ID of each thread within a warp
-      int lane_idx
-    ):
-      Base(shared_storage, thread_idx, warp_idx, lane_idx),
-      smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
-      smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx),
-      smem_iterator_E_(shared_storage.operand_E_ref(), thread_idx)
-  {
-    is_warp_valid_ = warp_idx < Detail::kValidWarps;
-
-    // Compute warp location within threadblock tile by mapping the warp_id to
-    // three coordinates:
-    //   _m: the warp's position within the threadblock along the M dimension
-    //   _n: the warp's position within the threadblock along the N dimension
-    //   _k: the warp's position within the threadblock along the K dimension
-
-    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
-    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
-
-    int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
-    int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
-
-    // Add per-warp offsets in units of warp-level tiles
-    this->warp_tile_iterator_A_.add_tile_offset(
-        {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
-    this->warp_tile_iterator_B_.add_tile_offset(
-        {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
-    this->warp_tile_iterator_E_.add_tile_offset(
-        {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
-  }
-
-  CUTLASS_DEVICE
-  void copy_tiles_and_advance(IteratorA &iterator_A, IteratorB &iterator_B,
-                              IteratorE &iterator_E, int group_start_A = 0,
-                              int group_start_B = 0, int group_start_E = 0) {
-    iterator_A.set_iteration_index(group_start_A *
-                                   IteratorA::kAccessesPerVector);
-    this->smem_iterator_A_.set_iteration_index(group_start_A);
-
-    // async copy for operand A
-    CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) {
-      if (group_start_A + j < Detail::TBLoadIterationsA) {
-        typename IteratorA::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorA::AccessType *>(
-                this->smem_iterator_A_.get());
-
-        int const kSrcBytes = sizeof_bits<typename IteratorA::Element>::value *
-                              IteratorA::ThreadMap::kElementsPerAccess /
-                              IteratorA::kAccessesPerVector / 8;
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
-          auto gmem_ptr = iterator_A.get();
-
-          cutlass::arch::cp_async<kSrcBytes, kCacheOpA>(
-              dst_ptr + v, gmem_ptr, iterator_A.valid());
-
-          ++iterator_A;
-        }
-
-        ++this->smem_iterator_A_;
-      }
-    }
-
-    iterator_B.set_iteration_index(group_start_B *
-                                   IteratorB::kAccessesPerVector);
-    this->smem_iterator_B_.set_iteration_index(group_start_B);
-
-    // async copy for operand B
-    CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) {
-      if (group_start_B + j < Detail::TBLoadIterationsB) {
-        typename IteratorB::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorB::AccessType *>(
-                this->smem_iterator_B_.get());
-
-        int const kSrcBytes = sizeof_bits<typename IteratorB::Element>::value *
-                              IteratorB::ThreadMap::kElementsPerAccess /
-                              IteratorB::kAccessesPerVector / 8;
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
-          auto gmem_ptr = iterator_B.get();
-
-          cutlass::arch::cp_async<kSrcBytes, kCacheOpB>(
-              dst_ptr + v, gmem_ptr, iterator_B.valid());
-
-          ++iterator_B;
-        }
-        ++this->smem_iterator_B_;
-      }
-    }
-
-    iterator_E.set_iteration_index(group_start_E);
-    this->smem_iterator_E_.set_iteration_index(group_start_E);
-
-    // async copy for operand E
-    CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < Detail::kAccessesPerGroupE; ++j) {
-      if (group_start_E + j < Detail::TBLoadIterationsE) {
-        typename IteratorE::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorE::AccessType *>(
-                this->smem_iterator_E_.get());
-
-        int const kSrcBytes = sizeof_bits<typename IteratorE::Element>::value *
-                              IteratorE::ThreadMap::kElementsPerAccess / 8;
-
-        auto gmem_ptr = iterator_E.get();
-
-        cutlass::arch::cp_async<kSrcBytes, kCacheOpE>(
-            dst_ptr, gmem_ptr, iterator_E.valid() && is_warp_valid_);
-
-        ++iterator_E;
-        ++this->smem_iterator_E_;
-      }
-    }
-  }
-
-  /// Perform a threadblock-scoped matrix multiply-accumulate
-  CUTLASS_DEVICE
-  void operator()(
-      ///< problem size of GEMM
-      int gemm_k_iterations,
-      ///< destination accumulator tile
-      FragmentC &accum,
-      ///< iterator over A operand in global memory
-      IteratorA iterator_A,
-      ///< iterator over B operand in global memory
-      IteratorB iterator_B,
-      ///< iterator over E operand in global memory
-      IteratorE iterator_E,
-      ///< initial value of accumulator
-      FragmentC const &src_accum) {
-
-    //
-    // Prologue
-    //
-
-    // Issue several complete stages
-    CUTLASS_PRAGMA_UNROLL
-    for (int stage = 0; stage < Base::kStages - 1;
-         ++stage, --gemm_k_iterations) {
-
-      iterator_A.clear_mask(gemm_k_iterations == 0);
-      iterator_B.clear_mask(gemm_k_iterations == 0);
-      iterator_E.clear_mask(gemm_k_iterations == 0);
-
-      iterator_A.set_iteration_index(0);
-      this->smem_iterator_A_.set_iteration_index(0);
-
-      // async copy for operand A
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < Detail::TBLoadIterationsA; ++j) {
-        typename IteratorA::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorA::AccessType *>(
-                this->smem_iterator_A_.get());
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
-          int const kSrcBytes =
-              sizeof_bits<typename IteratorA::Element>::value *
-              IteratorA::ThreadMap::kElementsPerAccess /
-              IteratorA::kAccessesPerVector / 8;
-
-          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
-              dst_ptr + v, iterator_A.get(), iterator_A.valid());
-
-          ++iterator_A;
-        }
-
-        ++this->smem_iterator_A_;
-      }
-
-      iterator_B.set_iteration_index(0);
-      this->smem_iterator_B_.set_iteration_index(0);
-
-      // async copy for operand B
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < Detail::TBLoadIterationsB; ++j) {
-        typename IteratorB::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorB::AccessType *>(
-                this->smem_iterator_B_.get());
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
-          int const kSrcBytes =
-              sizeof_bits<typename IteratorB::Element>::value *
-              IteratorB::ThreadMap::kElementsPerAccess /
-              IteratorB::kAccessesPerVector / 8;
-
-          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
-              dst_ptr + v, iterator_B.get(), iterator_B.valid());
-
-          ++iterator_B;
-        }
-
-        ++this->smem_iterator_B_;
-      }
-
-      iterator_E.set_iteration_index(0);
-      this->smem_iterator_E_.set_iteration_index(0);
-
-      // async copy for operand E
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < Detail::TBLoadIterationsE; ++j) {
-        typename IteratorE::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorE::AccessType *>(
-                this->smem_iterator_E_.get());
-
-        int const kSrcBytes = sizeof_bits<typename IteratorE::Element>::value *
-                              IteratorE::ThreadMap::kElementsPerAccess / 8;
-        if (is_warp_valid_)
-          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpE>(
-              dst_ptr, iterator_E.get(), iterator_E.valid());
-
-        ++iterator_E;
-
-        ++this->smem_iterator_E_;
-      }
-
-      // Move to the next stage
-      iterator_A.add_tile_offset({0, 1});
-      iterator_B.add_tile_offset({1, 0});
-      iterator_E.add_tile_offset({0, 1});
-
-      this->smem_iterator_A_.add_tile_offset({0, 1});
-      this->smem_iterator_B_.add_tile_offset({1, 0});
-      this->smem_iterator_E_.add_tile_offset({0, 1});
-
-      // cp.async.commit_group - completes a stage
-      cutlass::arch::cp_async_fence();
-    }
-
-    // Perform accumulation in the 'd' output operand
-    accum = src_accum;
-
-    cutlass::arch::cp_async_wait<Base::kStages - 2>();
-    __syncthreads();
-
-    // Pair of fragments used to overlap shared memory loads and math
-    // instructions
-    WarpLoadedFragmentA warp_loaded_frag_A[2];
-    WarpLoadedFragmentB warp_loaded_frag_B[Detail::kBBufferSize];
-    WarpTransformedFragmentA warp_transformed_frag_A[2];
-    WarpTransformedFragmentB warp_transformed_frag_B[Detail::kBBufferSize];
-    WarpFragmentE warp_frag_E[2];
-
-    Operator warp_mma;
-
-    this->warp_tile_iterator_A_.set_kgroup_index(0);
-    this->warp_tile_iterator_B_.set_kgroup_index(0);
-    this->warp_tile_iterator_E_.set_kgroup_index(0);
-
-    this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]);
-    this->warp_tile_iterator_B_.load(warp_loaded_frag_B[0]);
-    this->warp_tile_iterator_E_.load(warp_frag_E[0]);
-
-    ++this->warp_tile_iterator_A_;
-    ++this->warp_tile_iterator_B_;
-    ++this->warp_tile_iterator_E_;
-
-    iterator_A.clear_mask(gemm_k_iterations == 0);
-    iterator_B.clear_mask(gemm_k_iterations == 0);
-    iterator_E.clear_mask(gemm_k_iterations == 0);
-
-    int smem_write_stage_idx = Base::kStages - 1;
-    int smem_read_stage_idx = 0;
-
-    warp_mma.transform(warp_transformed_frag_A[0], warp_transformed_frag_B[0],
-                       warp_loaded_frag_A[0], warp_loaded_frag_B[0]);
-
-    //
-    // Mainloop
-    //
-
-    CUTLASS_GEMM_LOOP
-    for (; gemm_k_iterations > (-Base::kStages + 1);) {
-      //
-      // Loop over GEMM K dimension
-      //
-
-      // Computes a warp-level GEMM on data held in shared memory
-      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
-      CUTLASS_PRAGMA_UNROLL
-      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations;
-           ++warp_mma_k) {
-
-        // Load warp-level tiles from shared memory, wrapping to k offset if
-        // this is the last group as the case may be.
-
-        this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
-        this->warp_tile_iterator_E_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
-        
-        this->warp_tile_iterator_A_.load(warp_loaded_frag_A[(warp_mma_k + 1) % 2]);
-        this->warp_tile_iterator_E_.load(warp_frag_E[(warp_mma_k + 1) % 2]);
-
-        ++this->warp_tile_iterator_A_;
-        ++this->warp_tile_iterator_E_;
-
-       if (Detail::kBBufferSize == 2) {
-          this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
-          this->warp_tile_iterator_B_.load(
-              warp_loaded_frag_B[(warp_mma_k + 1) % Detail::kBBufferSize]);
-          ++this->warp_tile_iterator_B_;
-        }
-
-        if (warp_mma_k > 0)
-          warp_mma.transform(warp_transformed_frag_A[warp_mma_k % 2],
-                             warp_transformed_frag_B[warp_mma_k % Detail::kBBufferSize],
-                             warp_loaded_frag_A[warp_mma_k % 2],
-                             warp_loaded_frag_B[warp_mma_k % Detail::kBBufferSize]);
-
-        warp_mma(
-          accum,
-          warp_transformed_frag_A[warp_mma_k % 2],
-          warp_transformed_frag_B[warp_mma_k % Detail::kBBufferSize], accum,
-          warp_frag_E[warp_mma_k % 2]
-        );
-
-        if (Detail::kBBufferSize == 1) {
-          this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
-          this->warp_tile_iterator_B_.load(warp_loaded_frag_B[0]);
-          ++this->warp_tile_iterator_B_;
-  
-        }
-
-        // Issue global->shared copies for the this stage
-        if (warp_mma_k < Base::kWarpGemmIterations - 1) {
-          int group_start_iteration_A, group_start_iteration_B, group_start_iteration_E;
-
-          group_start_iteration_A = warp_mma_k * Detail::kAccessesPerGroupA;
-          group_start_iteration_B = warp_mma_k * Detail::kAccessesPerGroupB;
-          group_start_iteration_E = warp_mma_k * Detail::kAccessesPerGroupE;
-
-          copy_tiles_and_advance(
-              iterator_A, iterator_B, iterator_E, group_start_iteration_A,
-              group_start_iteration_B, group_start_iteration_E);
-        }
-
-        if (warp_mma_k + 2 == Base::kWarpGemmIterations) {
-          int group_start_iteration_A, group_start_iteration_B, group_start_iteration_E;
-          group_start_iteration_A =
-              (warp_mma_k + 1) * Detail::kAccessesPerGroupA;
-          group_start_iteration_B =
-              (warp_mma_k + 1) * Detail::kAccessesPerGroupB;
-          group_start_iteration_E =
-              (warp_mma_k + 1) * Detail::kAccessesPerGroupE;
-
-          copy_tiles_and_advance(
-              iterator_A, iterator_B, iterator_E, group_start_iteration_A,
-              group_start_iteration_B, group_start_iteration_E);
-
-          // Inserts a memory fence between stages of cp.async instructions.
-          cutlass::arch::cp_async_fence();
-
-          // Waits until kStages-2 stages have committed. 
-          arch::cp_async_wait<Base::kStages - 2>();
-          __syncthreads();
-
-          // Move to the next stage
-          iterator_A.add_tile_offset({0, 1});
-          iterator_B.add_tile_offset({1, 0});
-          iterator_E.add_tile_offset({0, 1});
-
-          this->smem_iterator_A_.add_tile_offset({0, 1});
-          this->smem_iterator_B_.add_tile_offset({1, 0});
-          this->smem_iterator_E_.add_tile_offset({0, 1});
-
-          // Add negative offsets to return iterators to the 'start' of the
-          // circular buffer in shared memory
-          if (smem_write_stage_idx == (Base::kStages - 1)) {
-            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
-            this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
-            this->smem_iterator_E_.add_tile_offset({0, -Base::kStages});
-            smem_write_stage_idx = 0;
-          } else {
-            ++smem_write_stage_idx;
-          }
-
-          if (smem_read_stage_idx == (Base::kStages - 1)) {
-            this->warp_tile_iterator_A_.add_tile_offset(
-                {0, -Base::kStages * Policy::kPartitionsK *
-                        Base::kWarpGemmIterations});
-            this->warp_tile_iterator_B_.add_tile_offset(
-                {-Base::kStages * Policy::kPartitionsK *
-                     Base::kWarpGemmIterations,
-                 0});
-            this->warp_tile_iterator_E_.add_tile_offset(
-                {0, -Base::kStages * Policy::kPartitionsK *
-                        Base::kWarpGemmIterations});
-            smem_read_stage_idx = 0;
-          } else {
-            ++smem_read_stage_idx;
-          }
-
-          --gemm_k_iterations;
-          iterator_A.clear_mask(gemm_k_iterations == 0);
-          iterator_B.clear_mask(gemm_k_iterations == 0);
-          iterator_E.clear_mask(gemm_k_iterations == 0);
-        }
-
-        // Do any conversions feeding the first stage at the end of the loop so
-        // we can start right away on mma instructions
-        if (warp_mma_k + 1 == Base::kWarpGemmIterations)
-          warp_mma.transform(warp_transformed_frag_A[(warp_mma_k + 1) % 2],
-                             warp_transformed_frag_B[(warp_mma_k + 1) % Detail::kBBufferSize],
-                             warp_loaded_frag_A[(warp_mma_k + 1) % 2],
-                             warp_loaded_frag_B[(warp_mma_k + 1) % Detail::kBBufferSize]);
-      }
-
-    }
-
-
-    // Commit and drain all pending and predicated cp.async pnz from the GEMM mainloop
-    cutlass::arch::cp_async_fence();
-    cutlass::arch::cp_async_wait<0>();
-    __syncthreads();
-
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace gemm
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_with_reduction_multistage.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_with_reduction_multistage.h
deleted file mode 100755
index fa95dd7d2..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_with_reduction_multistage.h
+++ /dev/null
@@ -1,545 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
-*/
-
-#pragma once
-
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/arch/memory.h"
-#include "cutlass/array.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/numeric_types.h"
-
-#include "cutlass/gemm/threadblock/mma_base.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Structure to compute the matrix product targeting CUDA cores and SIMT math
-/// instructions.
-template <
-    /// Size of the Gemm problem - concept: gemm::GemmShape<>
-    typename Shape_,
-    /// Iterates over tiles of A operand in global memory
-    //  (concept: ReadableTileIterator | ForwardTileIterator |
-    //  MaskedTileIterator)
-    typename IteratorA_,
-    /// Iterates over tiles of A operand in shared memory
-    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-    typename SmemIteratorA_,
-    /// Cache operation for operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Iterates over tiles of B operand in global memory
-    //  (concept: ReadableTileIterator | ForwardTileIterator |
-    //  MaskedTileIterator)
-    typename IteratorB_,
-    /// Iterates over tiles of B operand in shared memory
-    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-    typename SmemIteratorB_,
-    /// Cache operation for operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB,
-    /// Data type of accumulator matrix
-    typename ElementC_,
-    /// Data type of accumulator matrix
-    typename LayoutC_,
-    /// Policy describing tuning details (concept: MmaPolicy)
-    typename Policy_,
-    /// Number of stages,
-    int Stages,
-    /// Use zfill or predicate for out-of-bound cp.async
-    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone,
-    /// Used for partial specialization
-    typename Enable = bool>
-class MmaWithReductionMultistage : 
-  public MmaBase<Shape_, Policy_, Stages> {
-public:
-  ///< Base class
-  using Base = MmaBase<Shape_, Policy_, Stages>;
-  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-  ///< Iterates over tiles of A operand in global memory
-  using IteratorA = IteratorA_;
-  ///< Iterates over tiles of B operand in global memory
-  using IteratorB = IteratorB_;
-  ///< Data type of accumulator matrix
-  using ElementC = ElementC_;
-  ///< Layout of accumulator matrix
-  using LayoutC = LayoutC_;
-  ///< Policy describing tuning details
-  using Policy = Policy_;
-
-  using SmemIteratorA = SmemIteratorA_;
-  using SmemIteratorB = SmemIteratorB_;
-
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
-
-  //
-  // Dependent types
-  //
-
-  /// Fragment of accumulator tile
-  using FragmentC = typename Policy::Operator::FragmentC;
-
-  /// Warp-level Mma
-  using Operator = typename Policy::Operator;
-
-  using FragmentReduction = typename Operator::FragmentReduction;
-
-  /// Minimum architecture is Sm80 to support cp.async
-  using ArchTag = arch::Sm80;
-  
-  /// Complex transform on A operand
-  static ComplexTransform const kTransformA = Operator::kTransformA;
-
-  /// Complex transform on B operand
-  static ComplexTransform const kTransformB = Operator::kTransformB;
-
-  static int const kReduceKForA = Operator::kReduceKForA;
-
-  /// Internal structure exposed for introspection.
-  struct Detail {
-
-    /// Number of cp.async instructions to load one stage of operand A
-    static int const AsyncCopyIterationsPerStageA =
-        IteratorA::ThreadMap::Iterations::kCount;
-
-    /// Number of cp.async instructions to load one stage of operand B
-    static int const AsyncCopyIterationsPerStageB =
-        IteratorB::ThreadMap::Iterations::kCount;
-
-    /// Number of stages
-    static int const kStages = Stages;
-
-    /// Number of cp.async instructions to load on group of operand A
-    static int const kAccessesPerGroupA =
-        (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
-
-    /// Number of cp.async instructions to load on group of operand B
-    static int const kAccessesPerGroupB =
-        (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
-  };
-
- private:
-
-  using WarpLoadedFragmentA = typename Operator::FragmentA;
-  using WarpLoadedFragmentB = typename Operator::FragmentB;
-  using WarpTransformedFragmentA = typename Operator::TransformedFragmentA;
-  using WarpTransformedFragmentB = typename Operator::TransformedFragmentB;
-
- private:
-
-  //
-  // Data members
-  //
-
-  /// Iterator to write threadblock-scoped tile of A operand to shared memory
-  SmemIteratorA smem_iterator_A_;
-
-  /// Iterator to write threadblock-scoped tile of B operand to shared memory
-  SmemIteratorB smem_iterator_B_;
-
-public:
-
-  /// Construct from tensor references
-  CUTLASS_DEVICE
-  MmaWithReductionMultistage(
-      ///< Shared storage needed for internal use by threadblock-scoped GEMM
-      typename Base::SharedStorage &shared_storage,
-      ///< ID within the threadblock
-      int thread_idx,
-      ///< ID of warp
-      int warp_idx,
-      ///< ID of each thread within a warp
-      int lane_idx
-    ):
-      Base(shared_storage, thread_idx, warp_idx, lane_idx),
-      smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
-      smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx)
-  {
-    // Compute warp location within threadblock tile by mapping the warp_id to
-    // three coordinates:
-    //   _m: the warp's position within the threadblock along the M dimension
-    //   _n: the warp's position within the threadblock along the N dimension
-    //   _k: the warp's position within the threadblock along the K dimension
-
-    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
-    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
-
-    int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
-    int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
-
-    // Add per-warp offsets in units of warp-level tiles
-    this->warp_tile_iterator_A_.add_tile_offset(
-        {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
-    this->warp_tile_iterator_B_.add_tile_offset(
-        {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
-  }
-
-  CUTLASS_DEVICE
-  void copy_tiles_and_advance(IteratorA &iterator_A, IteratorB &iterator_B,
-                              int group_start_A = 0, int group_start_B = 0) {
-    iterator_A.set_iteration_index(group_start_A *
-                                   IteratorA::kAccessesPerVector);
-    this->smem_iterator_A_.set_iteration_index(group_start_A);
-
-    // Async Copy for operand A
-    CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) {
-      if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) {
-        typename IteratorA::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorA::AccessType *>(
-                this->smem_iterator_A_.get());
-
-        int const kSrcBytes = sizeof_bits<typename IteratorA::Element>::value *
-                              IteratorA::ThreadMap::kElementsPerAccess /
-                              IteratorA::kAccessesPerVector / 8;
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
-          auto gmem_ptr = iterator_A.get();
-
-          if (SharedMemoryClear == SharedMemoryClearOption::kZfill) {
-            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
-                dst_ptr + v, gmem_ptr, iterator_A.valid());
-          } else {
-            cutlass::arch::cp_async<kSrcBytes, kCacheOpA>(
-                dst_ptr + v, gmem_ptr, iterator_A.valid());
-          }
-
-          ++iterator_A;
-        }
-
-        ++this->smem_iterator_A_;
-      }
-    }
-
-    iterator_B.set_iteration_index(group_start_B *
-                                   IteratorB::kAccessesPerVector);
-    this->smem_iterator_B_.set_iteration_index(group_start_B);
-
-    // Async Copy for operand B
-    CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) {
-      if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) {
-        typename IteratorB::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorB::AccessType *>(
-                this->smem_iterator_B_.get());
-
-        int const kSrcBytes = sizeof_bits<typename IteratorB::Element>::value *
-                              IteratorB::ThreadMap::kElementsPerAccess /
-                              IteratorB::kAccessesPerVector / 8;
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
-          auto gmem_ptr = iterator_B.get();
-
-          if (SharedMemoryClear == SharedMemoryClearOption::kZfill) {
-            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
-                dst_ptr + v, gmem_ptr, iterator_B.valid());
-          } else {
-            cutlass::arch::cp_async<kSrcBytes, kCacheOpB>(
-                dst_ptr + v, gmem_ptr, iterator_B.valid());
-          }
-
-          ++iterator_B;
-        }
-        ++this->smem_iterator_B_;
-      }
-    }
-  }
-
-  /// Perform a threadblock-scoped matrix multiply-accumulate
-  CUTLASS_DEVICE
-  void operator()(
-      ///< problem size of GEMM
-      int gemm_k_iterations,
-      ///< destination accumulator tile
-      FragmentC &accum,
-      ///< iterator over A operand in global memory
-      IteratorA iterator_A,
-      ///< iterator over B operand in global memory
-      IteratorB iterator_B,
-      ///< initial value of accumulator
-      FragmentC const &src_accum,
-      FragmentReduction &gemm_k_reduction_accum) {
-
-    //
-    // Prologue
-    //
-    // Issue several complete stages
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int stage = 0; stage < Base::kStages - 1;
-         ++stage, --gemm_k_iterations) {
-
-      iterator_A.clear_mask(gemm_k_iterations == 0);
-      iterator_B.clear_mask(gemm_k_iterations == 0);
-
-      iterator_A.set_iteration_index(0);
-      this->smem_iterator_A_.set_iteration_index(0);
-
-      // Async Copy for operand A
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) {
-        typename IteratorA::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorA::AccessType *>(
-                this->smem_iterator_A_.get());
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
-          int const kSrcBytes =
-              sizeof_bits<typename IteratorA::Element>::value *
-              IteratorA::ThreadMap::kElementsPerAccess /
-              IteratorA::kAccessesPerVector / 8;
-
-          int src_bytes = (iterator_A.valid() ? kSrcBytes : 0);
-
-          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
-              dst_ptr + v, iterator_A.get(), iterator_A.valid());
-
-          ++iterator_A;
-        }
-
-        ++this->smem_iterator_A_;
-      }
-
-      iterator_B.set_iteration_index(0);
-      this->smem_iterator_B_.set_iteration_index(0);
-
-      // Async Copy for operand B
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) {
-        typename IteratorB::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorB::AccessType *>(
-                this->smem_iterator_B_.get());
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
-          int const kSrcBytes =
-              sizeof_bits<typename IteratorB::Element>::value *
-              IteratorB::ThreadMap::kElementsPerAccess /
-              IteratorB::kAccessesPerVector / 8;
-
-          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
-              dst_ptr + v, iterator_B.get(), iterator_B.valid());
-
-          ++iterator_B;
-        }
-
-        ++this->smem_iterator_B_;
-      }
-
-      // Move to the next stage
-      iterator_A.add_tile_offset({0, 1});
-      iterator_B.add_tile_offset({1, 0});
-
-      this->smem_iterator_A_.add_tile_offset({0, 1});
-      this->smem_iterator_B_.add_tile_offset({1, 0});
-
-      // Defines the boundary of a stage of cp.async.
-      cutlass::arch::cp_async_fence();
-    }
-
-    // Perform accumulation in the 'd' output operand
-    accum = src_accum;
-
-    // Waits until kStages-2 stages have committed.
-    cutlass::arch::cp_async_wait<Base::kStages - 2>();
-    __syncthreads();
-
-    // Pair of fragments used to overlap shared memory loads and math
-    // instructions
-    WarpLoadedFragmentA warp_loaded_frag_A[2];
-    WarpLoadedFragmentB warp_loaded_frag_B[2];
-    WarpTransformedFragmentA warp_transformed_frag_A[2];
-    WarpTransformedFragmentB warp_transformed_frag_B[2];
-
-    Operator warp_mma;
-
-    this->warp_tile_iterator_A_.set_kgroup_index(0);
-    this->warp_tile_iterator_B_.set_kgroup_index(0);
-
-    this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]);
-    this->warp_tile_iterator_B_.load(warp_loaded_frag_B[0]);
-
-    ++this->warp_tile_iterator_A_;
-    ++this->warp_tile_iterator_B_;
-
-    iterator_A.clear_mask(gemm_k_iterations == 0);
-    iterator_B.clear_mask(gemm_k_iterations == 0);
-
-    int smem_write_stage_idx = Base::kStages - 1;
-    int smem_read_stage_idx = 0;
-
-    warp_mma.transform(warp_transformed_frag_A[0], warp_transformed_frag_B[0],
-                       warp_loaded_frag_A[0], warp_loaded_frag_B[0]);
-
-    //
-    // Mainloop
-    //
-
-    CUTLASS_GEMM_LOOP
-    for (; gemm_k_iterations > (-Base::kStages + 1);) {
-      //
-      // Loop over GEMM K dimension
-      //
-
-      // Computes a warp-level GEMM on data held in shared memory
-      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
-      CUTLASS_PRAGMA_UNROLL
-      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations;
-           ++warp_mma_k) {
-
-        // Load warp-level tiles from shared memory, wrapping to k offset if
-        // this is the last group as the case may be.
-
-        this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
-        this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
-        
-        this->warp_tile_iterator_A_.load(warp_loaded_frag_A[(warp_mma_k + 1) % 2]);
-        this->warp_tile_iterator_B_.load(warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
-
-        ++this->warp_tile_iterator_A_;
-        ++this->warp_tile_iterator_B_;
-
-        if (warp_mma_k > 0)
-          warp_mma.transform(warp_transformed_frag_A[warp_mma_k % 2],
-                             warp_transformed_frag_B[warp_mma_k % 2],
-                             warp_loaded_frag_A[warp_mma_k % 2],
-                             warp_loaded_frag_B[warp_mma_k % 2]);
-
-        warp_mma(
-          accum, 
-          warp_transformed_frag_A[warp_mma_k % 2],
-          warp_transformed_frag_B[warp_mma_k % 2], 
-          accum,
-          gemm_k_reduction_accum
-        );
-
-        // Issue global->shared copies for the this stage
-        if (warp_mma_k < Base::kWarpGemmIterations - 1) {
-          int group_start_iteration_A, group_start_iteration_B;
-
-          group_start_iteration_A = warp_mma_k * Detail::kAccessesPerGroupA;
-          group_start_iteration_B = warp_mma_k * Detail::kAccessesPerGroupB;
-
-          copy_tiles_and_advance(iterator_A, iterator_B, group_start_iteration_A, 
-                               group_start_iteration_B);
-        }
-
-        if (warp_mma_k + 2 == Base::kWarpGemmIterations) {
-          int group_start_iteration_A, group_start_iteration_B;
-          group_start_iteration_A =
-              (warp_mma_k + 1) * Detail::kAccessesPerGroupA;
-          group_start_iteration_B =
-              (warp_mma_k + 1) * Detail::kAccessesPerGroupB;
-
-          copy_tiles_and_advance(iterator_A, iterator_B, group_start_iteration_A, 
-                               group_start_iteration_B);
-
-          // Inserts a memory fence between stages of cp.async instructions.
-          cutlass::arch::cp_async_fence();
-
-          // Waits until kStages-2 stages have committed.
-          arch::cp_async_wait<Base::kStages - 2>();
-          __syncthreads();
-
-          // Move to the next stage
-          iterator_A.add_tile_offset({0, 1});
-          iterator_B.add_tile_offset({1, 0});
-
-          this->smem_iterator_A_.add_tile_offset({0, 1});
-          this->smem_iterator_B_.add_tile_offset({1, 0});
-
-          // Add negative offsets to return iterators to the 'start' of the
-          // circular buffer in shared memory
-          if (smem_write_stage_idx == (Base::kStages - 1)) {
-            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
-            this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
-            smem_write_stage_idx = 0;
-          } else {
-            ++smem_write_stage_idx;
-          }
-
-          if (smem_read_stage_idx == (Base::kStages - 1)) {
-            this->warp_tile_iterator_A_.add_tile_offset(
-                {0, -Base::kStages * Policy::kPartitionsK *
-                        Base::kWarpGemmIterations});
-            this->warp_tile_iterator_B_.add_tile_offset(
-                {-Base::kStages * Policy::kPartitionsK *
-                     Base::kWarpGemmIterations,
-                 0});
-            smem_read_stage_idx = 0;
-          } else {
-            ++smem_read_stage_idx;
-          }
-
-          --gemm_k_iterations;
-          iterator_A.clear_mask(gemm_k_iterations == 0);
-          iterator_B.clear_mask(gemm_k_iterations == 0);
-        }
-
-        // Do any conversions feeding the first stage at the end of the loop so
-        // we can start right away on mma instructions
-        if (warp_mma_k + 1 == Base::kWarpGemmIterations)
-          warp_mma.transform(warp_transformed_frag_A[(warp_mma_k + 1) % 2],
-                             warp_transformed_frag_B[(warp_mma_k + 1) % 2],
-                             warp_loaded_frag_A[(warp_mma_k + 1) % 2],
-                             warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
-      }
-
-    }
-    
-    // commit and drain all pending and predicated cp.async pnz from the GEMM mainloop
-    cutlass::arch::cp_async_fence();
-    cutlass::arch::cp_async_wait<0>();
-    __syncthreads();
-
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace gemm
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/threadblock_swizzle.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/threadblock_swizzle.h
deleted file mode 100755
index 1a4948d07..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/threadblock_swizzle.h
+++ /dev/null
@@ -1,459 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Implements several possible threadblock-swizzling functions mapping blockIdx to 
-      GEMM problems.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/platform/platform.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-#include "cutlass/conv/conv3d_problem_size.h"
-#include "cutlass/gemm/threadblock/index_remat.h"
-#include "cutlass/gemm/threadblock/threadblock_swizzle_streamk.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Threadblock swizzling function for GEMMs
-template <int N = 1>
-struct GemmIdentityThreadblockSwizzle {
-
-  CUTLASS_HOST_DEVICE
-  GemmIdentityThreadblockSwizzle() { }
-
-  /// Returns the shape of the problem in units of logical tiles
-  /// *Gemm* problem size: gemm(M, N, K)
-  CUTLASS_HOST_DEVICE
-  static GemmCoord get_tiled_shape(
-    GemmCoord problem_size,
-    GemmCoord tile_size,
-    int split_k_slices) {
-
-    return GemmCoord(
-      (problem_size.m() + tile_size.m() - 1) / tile_size.m(),
-      (problem_size.n() + tile_size.n() - 1) / tile_size.n(),
-      split_k_slices);
-  }
-
-  /// Returns the shape of the problem in units of logical tiles
-  /// *ImplicitGemm* Conv2d problem size: conv_operator(NPQK, NHWC, KRSC)
-  CUTLASS_HOST_DEVICE
-  static GemmCoord get_tiled_shape(
-    cutlass::conv::Operator conv_operator,
-    cutlass::conv::Conv2dProblemSize const &problem_size,
-    GemmCoord tile_size,
-    int split_k_slices) {
-
-    gemm::GemmCoord implicit_gemm_problem_size = 
-    cutlass::conv::implicit_gemm_problem_size(conv_operator, problem_size);
-
-    return get_tiled_shape(
-      implicit_gemm_problem_size, tile_size, split_k_slices);
-  }
-
-  /// Returns the shape of the problem in units of logical tiles
-  /// *ImplicitGemm* Conv3d problem size: conv_operator(NZPQK, NDHWC, KTRSC)
-  CUTLASS_HOST_DEVICE
-  static GemmCoord get_tiled_shape(
-    cutlass::conv::Operator conv_operator,
-    cutlass::conv::Conv3dProblemSize const &problem_size,
-    GemmCoord tile_size,
-    int split_k_slices) {
-
-    gemm::GemmCoord implicit_gemm_problem_size = 
-    cutlass::conv::implicit_gemm_problem_size(conv_operator, problem_size);
-
-    return get_tiled_shape(
-      implicit_gemm_problem_size, tile_size, split_k_slices);
-  }
-
-  /// Computes CUDA grid dimensions given a size in units of logical tiles
-  CUTLASS_HOST_DEVICE
-  static dim3 get_grid_shape(GemmCoord tiled_shape) {
-    int tile = 1 << get_log_tile(tiled_shape);
-    return dim3(tiled_shape.m() * tile, (tiled_shape.n() + tile - 1) / tile, tiled_shape.k());
-  }
-
-  /// Calculates optimal swizzle width
-  CUTLASS_HOST_DEVICE
-  static int get_log_tile(GemmCoord tiled_shape) {
-    auto n = tiled_shape.n();
-    // Thresholds picked so that it doesn't cause too many no-op CTAs
-    if (N >= 8 && n >= 6)
-      return 3;
-    else if (N >= 4 && n >= 3)
-      return 2;
-    else if (N >= 2 && n >= 2)
-      return 1;
-    else
-      return 0;
-  }
-
-  /// Obtains the threadblock offset (in units of threadblock-scoped tiles)
-  CUTLASS_DEVICE
-  static GemmCoord get_tile_offset(int log_tile) {
-    int block_idx_x = RematerializeBlockIdxX();
-    int block_idx_y = RematerializeBlockIdxY();
-    int block_idx_z = RematerializeBlockIdxZ();
-
-    return GemmCoord{(block_idx_x >> log_tile),  //
-                     (block_idx_y << log_tile) + ((block_idx_x) & ((1 << (log_tile)) - 1)),
-                     block_idx_z};
-  }
-
-  /// Obtains the threadblock offset (in units of threadblock-scoped tiles)
-  CUTLASS_DEVICE
-  static GemmCoord get_tile_offset(GemmCoord tiled_shape) {
-
-    int const kTile = N;
-    int block_idx_x = RematerializeBlockIdxX();
-    int block_idx_y = RematerializeBlockIdxY();
-
-    if ((tiled_shape.m() < kTile) || (tiled_shape.n() < kTile))
-      return GemmCoord{block_idx_x, block_idx_y, RematerializeBlockIdxZ()};
-
-    return GemmCoord{
-      (block_idx_x / kTile),
-      (block_idx_y * kTile) + (block_idx_x % kTile),
-      RematerializeBlockIdxZ()
-    };
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Threadblock swizzling function for GEMMs
-struct GemmHorizontalThreadblockSwizzle {
-
-  CUTLASS_HOST_DEVICE
-  GemmHorizontalThreadblockSwizzle() { }
-
-  /// Returns the shape of the problem in units of logical tiles
-  CUTLASS_HOST_DEVICE
-  static GemmCoord get_tiled_shape(
-    GemmCoord problem_size,
-    GemmCoord tile_size,
-    int split_k_slices) {
-
-    return GemmCoord(
-      (problem_size.m() + tile_size.m() - 1) / tile_size.m(),
-      (problem_size.n() + tile_size.n() - 1) / tile_size.n(),
-      split_k_slices);
-  }
-
-  /// Computes CUDA grid dimensions given a size in units of logical tiles
-  CUTLASS_HOST_DEVICE
-  static dim3 get_grid_shape(GemmCoord tiled_shape) {
-    return dim3(tiled_shape.n(), tiled_shape.m(), tiled_shape.k());
-  }
-
-  /// Calculates optimal swizzle width
-  CUTLASS_HOST_DEVICE
-  static int get_log_tile(GemmCoord tiled_shape) {
-    return 0;
-  }
-
-  /// Obtains the threadblock offset (in units of threadblock-scoped tiles)
-  CUTLASS_DEVICE
-  static GemmCoord get_tile_offset(GemmCoord tiled_shape) {
-    return GemmCoord{
-      RematerializeBlockIdxY(),
-      RematerializeBlockIdxX(),
-      RematerializeBlockIdxZ()
-    };
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Threadblock swizzling function for batched GEMMs
-struct GemmBatchedIdentityThreadblockSwizzle {
-
-  /// Returns the shape of the problem in units of logical tiles
-  CUTLASS_HOST_DEVICE
-  static GemmCoord get_tiled_shape(
-    GemmCoord problem_size,
-    GemmCoord tile_size,
-    int batch_count) {
-
-    return GemmCoord(
-      (problem_size.m() + tile_size.m() - 1) / tile_size.m(),
-      (problem_size.n() + tile_size.n() - 1) / tile_size.n(),
-      batch_count % (1 << 16));
-  }
-
-  /// Computes CUDA grid dimensions given a size in units of logical tiles
-  CUTLASS_HOST_DEVICE
-  static dim3 get_grid_shape(GemmCoord tiled_shape) {
-    return dim3(tiled_shape.m(), tiled_shape.n(), tiled_shape.k());
-  }
-
-  /// Calculates optimal swizzle width
-  CUTLASS_HOST_DEVICE
-  static int get_log_tile(GemmCoord tiled_shape) {
-    return 0;
-  }
-
-  /// Obtains the threadblock offset (in units of threadblock-scoped tiles)
-  CUTLASS_DEVICE
-  static GemmCoord get_tile_offset(GemmCoord tiled_shape) {
-    return GemmCoord{
-      RematerializeBlockIdxX(),
-      RematerializeBlockIdxY(),
-      RematerializeBlockIdxZ()
-    };
-  }
-
-  /// Obtains the threadblock offset (in units of threadblock-scoped tiles)
-  CUTLASS_DEVICE
-  static GemmCoord get_tile_offset(int log_tile) {
-    int block_idx_x = RematerializeBlockIdxX();
-    int block_idx_y = RematerializeBlockIdxY();
-    int block_idx_z = RematerializeBlockIdxZ();
-
-    return GemmCoord{(block_idx_x >> log_tile),  //
-                     (block_idx_y << log_tile) + ((block_idx_x) & ((1 << (log_tile)) - 1)),
-                     block_idx_z};
-  }
-
-  /// Gets the batch index
-  CUTLASS_DEVICE
-  static int get_batch_idx() {
-    return RematerializeBlockIdxZ();
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Threadblock swizzling function for split-K GEMMs
-template <int N = 1>
-struct GemmSplitKIdentityThreadblockSwizzle {
-
-  int const kTile = N;
-
-  /// Returns the shape of the problem in units of logical tiles
-  CUTLASS_HOST_DEVICE
-  static GemmCoord get_tiled_shape(
-    GemmCoord problem_size,
-    GemmCoord tile_size,
-    int partitions) {
-
-    return GemmCoord(
-      (problem_size.m() + tile_size.m() - 1) / tile_size.m(),
-      (problem_size.n() + tile_size.n() - 1) / tile_size.n(),
-      partitions);
-  }
-
-  /// Calculates optimal swizzle width
-  CUTLASS_HOST_DEVICE
-  static int get_log_tile(GemmCoord tiled_shape) {
-    auto n = tiled_shape.n();
-    // Thresholds picked so that it doesn't cause too many no-op CTAs
-    if (N >= 8 && n >= 6)
-      return 3;
-    else if (N >= 4 && n >= 3)
-      return 2;
-    else if (N >= 2 && n >= 2)
-      return 1;
-    else
-      return 0;
-  }
-
-  /// Computes CUDA grid dimensions given a size in units of logical tiles
-  CUTLASS_HOST_DEVICE
-  static dim3 get_grid_shape(GemmCoord tiled_shape) {
-    int tile = 1 << get_log_tile(tiled_shape);
-    return dim3(tiled_shape.m() * tile, (tiled_shape.n() + tile - 1) / tile, tiled_shape.k());
-  }
-
-  /// Obtains the threadblock offset (in units of threadblock-scoped tiles)
-  CUTLASS_DEVICE
-  static GemmCoord get_tile_offset(int log_tile) {
-    int block_idx_x = RematerializeBlockIdxX();
-    int block_idx_y = RematerializeBlockIdxY();
-    int block_idx_z = RematerializeBlockIdxZ();
-
-    return GemmCoord{(block_idx_x >> log_tile),  //
-                     (block_idx_y << log_tile) + ((block_idx_x) & ((1 << (log_tile)) - 1)),
-                     block_idx_z};
-  }
-
-  /// Obtains the threadblock offset (in units of threadblock-scoped tiles)
-  CUTLASS_DEVICE
-  static GemmCoord get_tile_offset(GemmCoord tiled_shape) {
-
-    int const kTile = N;
-    int block_idx_x = RematerializeBlockIdxX();
-    int block_idx_y = RematerializeBlockIdxY();
-
-    if ((tiled_shape.m() < kTile) || (tiled_shape.n() < kTile))
-      return GemmCoord{block_idx_x, block_idx_y, RematerializeBlockIdxZ()};
-
-    return GemmCoord{
-      (block_idx_x / kTile),
-      (block_idx_y * kTile) + (block_idx_x % kTile),
-      RematerializeBlockIdxZ()
-    };
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Threadblock swizzling function for split-K GEMMs
-struct GemmSplitKHorizontalThreadblockSwizzle {
-
-  /// Returns the shape of the problem in units of logical tiles
-  CUTLASS_HOST_DEVICE
-  static GemmCoord get_tiled_shape(
-    GemmCoord problem_size,
-    GemmCoord tile_size,
-    int partitions) {
-
-    return GemmCoord(
-      (problem_size.m() + tile_size.m() - 1) / tile_size.m(),
-      (problem_size.n() + tile_size.n() - 1) / tile_size.n(),
-      partitions);
-  }
-
-  /// Computes CUDA grid dimensions given a size in units of logical tiles
-  CUTLASS_HOST_DEVICE
-  static dim3 get_grid_shape(GemmCoord tiled_shape) {
-    return dim3(tiled_shape.n(), tiled_shape.m(), tiled_shape.k());
-  }
-
-  /// Calculates optimal swizzle width
-  CUTLASS_HOST_DEVICE
-  static int get_log_tile(GemmCoord tiled_shape) {
-    return 0;
-  }
-
-  /// Obtains the threadblock offset (in units of threadblock-scoped tiles)
-  CUTLASS_DEVICE
-  static GemmCoord get_tile_offset(int log_tile) {
-    return GemmCoord{
-      RematerializeBlockIdxY(),
-      RematerializeBlockIdxX(),
-      RematerializeBlockIdxZ()
-    };
-  }
-
-  /// Obtains the threadblock offset (in units of threadblock-scoped tiles)
-  CUTLASS_DEVICE
-  static GemmCoord get_tile_offset(GemmCoord tiled_shape) {
-    return GemmCoord{
-      RematerializeBlockIdxY(),
-      RematerializeBlockIdxX(),
-      RematerializeBlockIdxZ()
-    };
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Threadblock swizzling function for batched GEMVs
-struct GemvBatchedStridedThreadblockDefaultSwizzle {
-
-  /// Returns the shape of the problem in units of logical tiles
-  CUTLASS_HOST_DEVICE
-  static BatchedGemmCoord get_tiled_shape(
-    BatchedGemmCoord problem_size,
-    BatchedGemmCoord tile_size) {
-
-    return BatchedGemmCoord(
-      1, // M is always 1
-      (problem_size.n() + tile_size.n() - 1) / tile_size.n(),
-      (problem_size.k() + tile_size.k() - 1) / tile_size.k(),
-      (problem_size.batch() + tile_size.batch() - 1) / tile_size.batch());
-  }
-
-  /// Computes CUDA grid dimensions given a size in units of logical tiles
-  CUTLASS_HOST_DEVICE
-  static dim3 get_grid_shape(BatchedGemmCoord tiled_shape) {
-    return dim3(tiled_shape.n(), tiled_shape.batch(), tiled_shape.k());
-  }
-
-  /// Calculates optimal swizzle width
-  CUTLASS_HOST_DEVICE
-  static int get_log_tile(GemmCoord tiled_shape) {
-    return 0;
-  }
-
-  /// Obtains the threadblock offset (in units of threadblock-scoped tiles)
-  CUTLASS_DEVICE
-  static BatchedGemmCoord get_tile_offset(int log_tile) {
-    return BatchedGemmCoord{
-      0, // M is always 1
-      RematerializeBlockIdxX(),
-      RematerializeBlockIdxZ(),
-      RematerializeBlockIdxY(),
-    };
-  }
-
-  /// Obtains the threadblock offset (in units of threadblock-scoped tiles)
-  CUTLASS_DEVICE
-  static BatchedGemmCoord get_tile_offset() {
-    return BatchedGemmCoord{
-      0, // M is always 1
-      RematerializeBlockIdxX(),
-      RematerializeBlockIdxZ(),
-      RematerializeBlockIdxY(),
-    };
-  }
-
-  /// Gets the batch tile index
-  CUTLASS_DEVICE
-  static int get_batch_tile_idx() {
-    return RematerializeBlockIdxY();
-  }
-
-  /// Gets the absolute batch index
-  CUTLASS_DEVICE
-  static int get_batch_idx() {
-    return RematerializeBlockDimY()*RematerializeBlockIdxY() + RematerializeThreadIdxY();
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace gemm
-} // namespace cutlass
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/threadblock_swizzle_streamk.h b/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/threadblock_swizzle_streamk.h
deleted file mode 100755
index b79e587d7..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/threadblock_swizzle_streamk.h
+++ /dev/null
@@ -1,801 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Implements streamk threadblock mapping blockIdx to GEMM problems.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/platform/platform.h"
-#include "cutlass/gemm/gemm_enumerated_types.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-#include "cutlass/conv/conv3d_problem_size.h"
-#include "cutlass/gemm/threadblock/index_remat.h"
-
-#if !defined(__CUDACC_RTC__)
-#include <iostream>
-#include "cutlass/core_io.h"
-#include "cutlass/trace.h"
-#endif
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Threadblock mapping control for GEMMs
-struct ThreadblockSwizzleStreamK {
-
-  /// Advertise StreamkFeature
-  using StreamkFeature = void;
-
-
-  /// Kernel traits
-  template <typename GemmKernel>
-  struct KernelTraits {};
-
-
-  /// Reduction strategy
-  enum ReductionStrategy
-  {
-    kNone,      // Data-parallel strategy (no seams, fixup, etc.)
-
-    kAtomic,    // Non-deterministic reduction of SK-block partials using atomic aggregation in L2
-
-    kMixed,     // Deterministic reduction of SK-block partials employing either:
-                //   (a) A separate wave of reduction thread blocks" (for scenarios with lots of
-                //       SK-blocks per SK-tile)
-                //   (b) Turnstile-ordered atomic aggregation in L2 (for scenarios with few
-                //       SK-blocks per SK-tile)
-  };
-
-  static ReductionStrategy const kReductionStrategy = kMixed;
-
-
-  //
-  // Heuristics
-  //
-
-  /// Data-parallel wave-quantization efficiency threshold (above which we go data-parallel)
-  static float constexpr kDpEfficiencyThreshold = 0.92f;
-
-  /// Minimum number of MAC-iterations per streamk block
-  static int const kMinItersPerSkBlock = 2;
-
-  /// Height in CTAs of a grid rasterization cohort
-  static int const kCohortCtasM = 8;
-
-  /// Width in CTAs of a grid rasterization cohort
-  static int const kCohortCtasN = 4;
-
-  /// Number of CTAs per cohort
-  static int const kCtasPerCohort = kCohortCtasN * kCohortCtasM;
-
-  /// Cost-equivalent number of SM-iterations for fixup I/O
-  static int const kFixupStartupIterEquiv = 10;
-  static int const kFixupPeerIterEquiv = 3;
-
-
-  //
-  // Member state
-  //
-
-
-  /// The 3D value-extents of the GEMM computation volume (m,n,k)
-  GemmCoord problem_size;
-
-  /// Div/mod accelerators
-  FastDivmod div_mod_tiled_shape_m;
-  FastDivmod div_mod_tiled_shape_n;
-  FastDivmod div_mod_tiled_cohort_shape_n;
-  FastDivmod div_mod_iters_per_tile;
-
-  /// Whether to perform cohort CTA rasterization
-  bool cohort_raster;
-
-  // Whether to pad and remap block indices
-  bool remap_block_indices;
-
-  /// CTA occupancy per SM
-  int sm_occupancy;
-
-  /// Number of SMs for dispatch heuristics to load-balance using Stream-K CTAs (wave size)
-  int avail_sms;
-
-  int dp_blocks;                            /// Number of data-parallel thread blocks in the grid
-  int dp_first_wave_tiles;                  /// Number of output tiles each CTA in the first DP wave will produce
-
-  /// Number of reduction blocks in the grid
-  int reduction_blocks;
-
-  int sk_waves;
-  int sk_tiles;
-  int sk_big_blocks_per_region;
-  int sk_iters_per_region;
-
-  /// Div/mod accelerators
-  FastDivmod div_mod_sk_iters_per_normal_block;
-  FastDivmod div_mod_sk_iters_per_big_block;
-  FastDivmod div_mod_sk_iters_per_region;
-  FastDivmod div_mod_sk_regions;                      //!! used in block map
-  FastDivmod div_mod_sk_blocks_per_region;            //!! used in block map
-
-  /// The batch count
-  int batch_count;
-
-
-  //
-  // Host+device interface
-  //
-
-  /// Constructor
-  ThreadblockSwizzleStreamK() = default;
-
-  /// Returns the GEMM volume in thread block tiles
-  CUTLASS_HOST_DEVICE
-  GemmCoord tiled_shape() const
-  {
-    return GemmCoord(
-        static_cast<int>(div_mod_tiled_shape_m),
-        static_cast<int>(div_mod_tiled_shape_n),
-        batch_count);
-  }
-
-  /// Number of iterations per output tile
-  CUTLASS_HOST_DEVICE
-  int iters_per_tile() const
-  {
-    return static_cast<int>(div_mod_iters_per_tile);
-  }
-
-  /// Number of iterations for normal SK-blocks
-  CUTLASS_HOST_DEVICE
-  int sk_iters_per_normal_block() const
-  {
-    return static_cast<int>(div_mod_sk_iters_per_normal_block);
-  }
-
-  /// Number of SK regions
-  CUTLASS_HOST_DEVICE
-  int sk_regions() const
-  {
-    return static_cast<int>(div_mod_sk_regions);
-  }
-
-  /// Number of SK blocks per region (splitting factor)
-  CUTLASS_HOST_DEVICE
-  int sk_blocks_per_region() const
-  {
-    return static_cast<int>(div_mod_sk_blocks_per_region);
-  }
-
-
-  //
-  // Host-side interface
-  //
-
-  /// Debug print
-  void Print()
-  {
-#ifndef __CUDA_ARCH__
-    auto tiles = tiled_shape().mn().product();
-    std::cout <<
-        "problem_size: (" << problem_size.m() << "," << problem_size.n() << ")" <<
-        ", tiled_shape: (" << tiled_shape().m() << "," << tiled_shape().n() << ")" <<
-        ", tiles: " << tiles <<
-        ", dp_tiles: " << tiles - sk_tiles <<
-        ", sk_tiles: " << sk_tiles <<
-        ", iters_per_tile: " << iters_per_tile() <<
-        ", reduction_blocks: " << reduction_blocks <<
-        ", dp_blocks: " << dp_blocks <<
-        ", dp_waves: " << dp_blocks / avail_sms <<
-        ", dp_first_wave_tiles: " << dp_first_wave_tiles <<
-        ", sk_blocks_per_region: " << sk_blocks_per_region() <<
-        ", sk_regions: " << sk_regions() <<
-        ", sk_waves: " << sk_waves <<
-        ", sk_iters_per_normal_block: " << sk_iters_per_normal_block() <<
-        ", sk_big_blocks_per_region: " << sk_big_blocks_per_region <<
-        ", remap_block_indices: " << remap_block_indices <<
-        ", cohort_raster: " << cohort_raster <<
-        ", sm_occupancy: " << sm_occupancy <<
-        ", avail_sms: " << avail_sms <<
-        ", num_blocks: " << get_num_blocks() <<
-        "\n\n";
-#endif
-  }
-
-
-  // Compute sk_blocks to dispatch for a given number of sk_tiles
-  static void get_sk_blocks(
-    int &sk_blocks,     /// [out]
-    int &savings_iters, /// [out]
-    int sk_tiles,
-    int iters_per_tile,
-    int avail_sms,
-    int max_sk_occupancy,
-    bool allow_partial_wave)
-  {
-    savings_iters = INT_MIN;
-    sk_blocks = 0;
-
-    if (sk_tiles == 0) {
-      return;
-    }
-
-    int sk_iters = sk_tiles * iters_per_tile;
-
-    int dp_equiv_waves = (sk_tiles + avail_sms - 1) / avail_sms;
-    int dp_equiv_iters = iters_per_tile * dp_equiv_waves;
-
-    int min_sk_blocks = (allow_partial_wave) ? fast_min(avail_sms, sk_tiles + 1) : avail_sms;
-    int max_sk_blocks = fast_min(avail_sms * max_sk_occupancy, sk_iters / kMinItersPerSkBlock);
-
-    for (int trial_sk_blocks = min_sk_blocks; trial_sk_blocks <= max_sk_blocks; ++trial_sk_blocks)
-    {
-      int sk_waves = (trial_sk_blocks + avail_sms - 1) / avail_sms;
-      int max_sk_iters_per_block = (sk_iters + trial_sk_blocks - 1) / trial_sk_blocks;
-      int sk_iter_equiv = max_sk_iters_per_block * sk_waves;
-
-      int num_peers = ((trial_sk_blocks + sk_tiles - 1) / sk_tiles) + 1;        // add one for alignment skew
-
-      float iter_cost = 0.02f * float(num_peers) * float(sk_iter_equiv);
-
-      if (trial_sk_blocks % sk_tiles == 0)
-      {
-        // aligned
-        num_peers = (trial_sk_blocks / sk_tiles);
-
-        iter_cost = 0.0f;
-      }
-
-      float peer_cost = 2.0f * float(num_peers);
-
-      float base_cost = 2.0f * float(sk_waves);
-
-      int fixup_iter_equiv = int(base_cost + iter_cost + peer_cost);
-
-      int trial_savings_iters = dp_equiv_iters - sk_iter_equiv - fixup_iter_equiv;
-
-      if (trial_savings_iters >= savings_iters) {
-          savings_iters = trial_savings_iters;
-          sk_blocks = trial_sk_blocks;
-      }
-    }
-  }
-
-
-  /// Determine the populations of DP and SK blocks to invoke for the given number of output tiles
-  static void get_blocks(
-    int &dp_tiles,      /// [out]
-    int &sk_blocks,     /// [out]
-    int output_tiles,
-    int iters_per_tile,
-    int avail_sms,
-    int sm_occupancy)
-  {
-    int full_waves = output_tiles / avail_sms;
-    int full_wave_tiles = full_waves * avail_sms;
-    int partial_wave_tiles = output_tiles - full_wave_tiles;
-
-    int score = -1;
-    dp_tiles = output_tiles;
-    sk_blocks = 0;
-
-    if (partial_wave_tiles == 0)
-    {
-      // Perfect quantization
-      return;
-    }
-
-    if (full_waves < sm_occupancy)
-    {
-        // We're less than full GPU occupancy
-
-        // Form the SK wave from the partial wave to get us up to full GPU occupancy
-        int max_sk_occupancy = sm_occupancy - full_waves;
-
-        dp_tiles = full_wave_tiles;
-
-        get_sk_blocks(
-          sk_blocks,
-          score,
-          partial_wave_tiles,
-          iters_per_tile,
-          avail_sms,
-          max_sk_occupancy,
-          true);                 // we can run with less than a full wave of SK-blocks
-
-        if (score < 0) {
-          // not profitable
-          sk_blocks = 0;
-          dp_tiles = output_tiles;
-        }
-
-        return;
-    }
-
-    // We're at (or greater) than GPU occupancy
-
-    if ((sm_occupancy > 1 ) && (full_waves % sm_occupancy == sm_occupancy - 1))
-    {
-        // If occupancy is more than one CTA per SM, form the SK wave from the partial
-        // wave to get us to full GPU occupancy
-        int max_sk_occupancy = 1;
-
-        dp_tiles = full_wave_tiles;
-
-        get_sk_blocks(
-          sk_blocks,
-          score,
-          partial_wave_tiles,
-          iters_per_tile,
-          avail_sms,
-          max_sk_occupancy,
-          true);                 // we can run with less than a full wave of SK-blocks
-
-        if (score >= 0) {
-            return;
-        }
-    }
-
-    // Form the SK wave by combining the last full wave and the partial wave
-    // We're less than full GPU occupancy
-    dp_tiles = full_wave_tiles - avail_sms;
-
-    int max_sk_occupancy = sm_occupancy - ((full_waves - 1) % sm_occupancy);
-
-    get_sk_blocks(
-      sk_blocks,
-      score,
-      partial_wave_tiles + avail_sms,
-      iters_per_tile,
-      avail_sms,
-      max_sk_occupancy,
-      false);                 // we cannot run with less than a full wave of SK-blocks
-
-    if (score < 0) {
-      // not profitable
-      sk_blocks = 0;
-      dp_tiles = output_tiles;
-    }
-
-  }
-
-  /// Constructor: *Gemm* problem size (m, n, k)
-  ThreadblockSwizzleStreamK(
-    GemmUniversalMode const mode_,
-    GemmCoord const problem_size_,
-    GemmCoord const tile_size_,
-    int const batch_split_,                        /// Either (mode == GemmUniversalMode::kBatched) the batch count, or (mode == GemmUniversalMode::kGemm) the tile-splitting factor (1 defaults to StreamK, >1 emulates Split-K)
-    int const sm_occupancy_,
-    int const device_sms_,
-    int const avail_sms_,                          /// The number of SMs that StreamK dispatch heuristics will attempt to load-balance across (-1 defaults to device width, 1 implies classic data-parallel scheduling)
-    size_t const element_A_bytes_,
-    size_t const element_B_bytes_,
-    size_t const element_C_bytes_,
-    int const epilogue_acc_fragments_)
-  :
-    problem_size(problem_size_),
-    batch_count((mode_ == GemmUniversalMode::kBatched || mode_ == GemmUniversalMode::kArray) ? batch_split_ : 1),
-    reduction_blocks(0),
-    dp_blocks(0),
-    dp_first_wave_tiles(1),     // Default: one tile per DP-block in the first wave of DP blocks
-    sk_tiles(0),
-    sk_big_blocks_per_region(0),
-    sk_iters_per_region(0),
-    sk_waves(0),
-    sm_occupancy(sm_occupancy_),
-    remap_block_indices(false),
-    avail_sms(fast_max(1, avail_sms_)),
-    cohort_raster(false)
-  {
-    int gpu_occupancy = device_sms_ * sm_occupancy;
-    int iters_per_tile = (problem_size.k() + tile_size_.k() - 1) / tile_size_.k();
-    int sk_iters_per_normal_block = 0;
-
-    int sk_regions = 1;              // Default: a single region of iteration space (across all SK tiles)
-    int sk_blocks_per_region = 0;
-
-    GemmCoord tiled_shape(
-      (problem_size.m() + tile_size_.m() - 1) / tile_size_.m(),
-      (problem_size.n() + tile_size_.n() - 1) / tile_size_.n(),
-      batch_count);
-
-    size_t problem_bytes =
-              (element_C_bytes_ * problem_size.m() * problem_size.n()) +
-              (element_A_bytes_ * problem_size.m() * problem_size.k()) +
-              (element_B_bytes_ * problem_size.k() * problem_size.n());
-
-    size_t problem_flops = size_t(problem_size.m()) * size_t(problem_size.n()) * size_t(problem_size.k()) * 2;
-
-    [[maybe_unused]] float flops_per_byte = float(problem_flops) / float(problem_bytes);
-
-    int output_tiles = tiled_shape.m() * tiled_shape.n();
-    int waves = (output_tiles + avail_sms - 1) / avail_sms;
-    [[maybe_unused]] float dp_efficiency = float(output_tiles) / float(waves * avail_sms);
-
-    //
-    // Determine dispatch composition of DP-tiles and SK-blocks
-    //
-
-    // Start with a DP-only configuration
-    int dp_tiles = output_tiles;    // Number of data-parallel tiles
-    int sk_blocks = 0;              // Number of thread blocks to produce the remaining SK tiles
-
-    // Only kGemm mode allows for SK load balancing
-    if (mode_ == GemmUniversalMode::kGemm)
-    {
-      int split_factor = batch_split_;
-      if (split_factor > 1)
-      {
-        // Split-K override
-        dp_tiles = 0;
-        sk_blocks = output_tiles * split_factor;
-      }
-      else if ((kReductionStrategy != kNone) &&   // Load-balancing strategy statically enabled
-        (avail_sms > 1))                         // Plurality of SMs to load balance across
-      {
-        // Use heuristics
-        get_blocks(
-          dp_tiles,      /// [out]
-          sk_blocks,     /// [out]
-          output_tiles,
-          iters_per_tile,
-          avail_sms,
-          sm_occupancy);
-      }
-    }
-
-    sk_tiles = output_tiles - dp_tiles;
-
-
-    // Compute SK block iteration details
-    if (sk_blocks > 0)
-    {
-      sk_waves = (sk_blocks + avail_sms - 1) / avail_sms;
-
-      int sk_iters = sk_tiles * iters_per_tile;
-      sk_blocks = fast_min(sk_blocks, sk_iters);
-
-      sk_iters_per_normal_block = sk_iters / sk_blocks;
-      int extra_sk_iters = sk_iters - (sk_iters_per_normal_block * sk_blocks);
-      int sk_big_blocks = extra_sk_iters;
-
-      if ((sk_blocks > sk_tiles) && (sk_blocks % sk_tiles == 0))
-      {
-        // Split-K decomposition
-        sk_regions = sk_tiles;
-      }
-
-      sk_blocks_per_region = sk_blocks / sk_regions;
-      sk_big_blocks_per_region = sk_big_blocks / sk_regions;
-      sk_iters_per_region = sk_iters / sk_regions;
-
-      // Use a separate reduction wave when all of:
-      // - Non-atomic reduction stratgy
-      // - The number of SK waves won't fully occupy the GPU (Otherwise we don't have
-      //   a strong-scaling case for more parallel reduction)
-      // - More than three peers working on an SK tile.  (This occurs when the ratio of
-      //   SK-blocks to SK-tiles > 2, as a single tile may be covered by four SK-blocks,
-      //   e.g.:[partial-block | block | block | partial-block] ).  With three or
-      //   less peers, the two non-finishing SK-blocks are not expexted to contend.
-      if ((kReductionStrategy == kMixed) &&
-          (sk_waves < sm_occupancy) &&
-          (sk_blocks > 2 * sk_tiles))
-      {
-        // Launch a reduction block for every accumulator fragment in each SK-tile
-        reduction_blocks = sk_tiles * epilogue_acc_fragments_;
-
-      }
-
-      // When we have a multi-occupancy kernel and at least two waves of active blocks (where
-      // at least one wave is SK blocks), we need to (1) dispatch at least four waves, and (2)
-      // remap the block indices so that we can reliably spread the SK blocks evenly across the
-      // device's first SM occupancy valence. Also see get_num_blocks() and get_block_idx().
-      remap_block_indices = (
-          (sm_occupancy > 1) &&
-          (device_sms_ == avail_sms) &&
-          (get_num_active_blocks() > avail_sms * 2));
-
-      // Initialize fast div/mod members related to SK
-      div_mod_sk_iters_per_normal_block = FastDivmod(sk_iters_per_normal_block);
-      div_mod_sk_iters_per_big_block = FastDivmod(sk_iters_per_normal_block + 1);
-      div_mod_sk_iters_per_region = FastDivmod(sk_iters_per_region);
-      div_mod_sk_regions = FastDivmod(sk_regions);
-      div_mod_sk_blocks_per_region = FastDivmod(sk_blocks_per_region);
-    }
-
-    //
-    // Compute DP blocks
-    //
-
-    dp_blocks = dp_tiles;
-
-    cutlass::gemm::GemmCoord tiled_cohort_shape(
-        (tiled_shape.m() + kCohortCtasM - 1) / kCohortCtasM,
-        (tiled_shape.n() + kCohortCtasN - 1) / kCohortCtasN,
-        tiled_shape.k());
-    int cohort_blocks = (tiled_cohort_shape.m() * tiled_cohort_shape.n()) * kCtasPerCohort;
-    float cohort_efficiency = float(dp_blocks) / float(cohort_blocks);
-
-    // Check if the SK tiles would be in cohorts that are in-bounds
-    bool sk_in_range = true;
-    if (sk_tiles > 0)
-    {
-      int last_sk_tile = sk_tiles - 1;
-      int cohort_tile_idx = last_sk_tile / kCtasPerCohort;
-      int cohort_grid_m = cohort_tile_idx / tiled_cohort_shape.n();
-      int cohort_grid_n = (cohort_grid_m > 0) ?
-        tiled_cohort_shape.n() - 1 :
-        cohort_tile_idx % tiled_cohort_shape.n();
-
-      if ((((cohort_grid_m + 1) * kCohortCtasM) >= tiled_shape.m()) ||
-          (((cohort_grid_n + 1) * kCohortCtasN) >= tiled_shape.n()))
-      {
-        sk_in_range = false;
-      }
-
-    }
-
-    // Decide if we're going to be doing cohort raster
-    if (sk_in_range &&
-        (dp_blocks >= gpu_occupancy * 2) &&
-        (cohort_efficiency > 0.85f))
-    {
-      cohort_raster = true;
-      dp_blocks = cohort_blocks;
-    }
-    else if (sk_waves > 0)
-    {
-      // Update semi-persistence of first DP wave to ensure full grid wavesets
-      // (Only applies when there's an SK component and we're not doing blocked cohort rasterization)
-      int dp_tile_waves = (dp_tiles + avail_sms - 1) / avail_sms;
-      int full_dp_tile_waves = dp_tiles / avail_sms;
-      int waveset_excess = (sk_waves + dp_tile_waves) % sm_occupancy;
-
-      if (dp_first_wave_tiles + waveset_excess <= full_dp_tile_waves)
-      {
-        dp_first_wave_tiles += waveset_excess;
-        dp_blocks -= (waveset_excess * avail_sms);
-      }
-    }
-
-    // Setup fast-div/mod for device-side usage
-    div_mod_tiled_shape_m = FastDivmod(tiled_shape.m());
-    div_mod_tiled_shape_n = FastDivmod(tiled_shape.n());
-    div_mod_tiled_cohort_shape_n = FastDivmod(tiled_cohort_shape.n());
-    div_mod_iters_per_tile = FastDivmod(iters_per_tile);
-
-  }
-
-  /// Number of blocks performing useful work
-  int get_num_active_blocks() const
-  {
-    return (sk_waves * avail_sms) + dp_blocks + reduction_blocks;
-  }
-
-  /// Obtains number of threadblocks per GEMM
-  int get_num_blocks() const
-  {
-    int active_blocks = get_num_active_blocks();
-    if (remap_block_indices)
-    {
-      // Add padding blocks if we are performing remapping in order to dispatch a grid of at least four waves
-      return fast_max(active_blocks, avail_sms * 4);
-    }
-
-    return active_blocks;
-  }
-
-
-  /// Obtains grid extents in CTAs
-  dim3 get_grid_dims() const
-  {
-    return dim3(get_num_blocks(), 1, batch_count);
-  }
-
-
-  //
-  // Device-side interface
-  //
-
-  /// Obtains number of threadblocks per GEMM
-  CUTLASS_DEVICE
-  int device_num_blocks() const
-  {
-    return gridDim.x;
-  }
-
-  /// Obtains tile index for the given sk iteration
-  CUTLASS_DEVICE
-  int get_sk_tile_idx(int iter) const
-  {
-    int tile_idx = div_mod_iters_per_tile.div(iter);
-    return tile_idx;
-  }
-
-  /// Obtains the batch index
-  CUTLASS_DEVICE
-  int get_batch_idx() const
-  {
-    return RematerializeBlockIdxZ();
-  }
-
-  /// Obtains the calling threadblock's tiled coordinates for the given tile index
-  CUTLASS_DEVICE
-  GemmCoord get_tile_offset(int tile_idx) const
-  {
-    int m, n;
-
-    // row-major raster
-    div_mod_tiled_shape_n(m, n, tile_idx);
-
-    if (tiled_shape().m() < tiled_shape().n())
-    {
-      // column-major raster
-      div_mod_tiled_shape_m(n, m, tile_idx);
-    }
-
-    if (cohort_raster)
-    {
-      // tiled cohort raster
-      int cohort_tile_idx = tile_idx / kCtasPerCohort;
-      int cohort_grid_m, cohort_grid_n;
-      div_mod_tiled_cohort_shape_n(cohort_grid_m, cohort_grid_n, cohort_tile_idx);
-
-      int block_idx_cohort = tile_idx % kCtasPerCohort;
-      int block_cohort_m = block_idx_cohort / kCohortCtasN;
-      int block_cohort_n = block_idx_cohort % kCohortCtasN;
-
-      m = (cohort_grid_m * kCohortCtasM) + block_cohort_m;
-      n = (cohort_grid_n * kCohortCtasN) + block_cohort_n;
-    }
-
-    return GemmCoord(m, n, get_batch_idx());
-  }
-
-  /// Obtains the calling threadblock's tiled coordinates for the given tile index (row-major rasterization)
-  CUTLASS_DEVICE
-  GemmCoord get_tile_offset_row_major(int tile_idx) const
-  {
-    // row-major raster
-    int m, n;
-    div_mod_tiled_shape_n(m, n, tile_idx);
-    return GemmCoord(m, n, get_batch_idx());
-  }
-
-  /// Obtains calling threadblock's linear threadblock index
-  CUTLASS_DEVICE
-  int get_block_idx() const
-  {
-    int block_idx = RematerializeBlockIdxX();
-
-    // Remap the block indices for the first two waves of thread blocks if
-    // we have multi-occupancy and the grid constitutes four or more waves
-    if (remap_block_indices && (block_idx < avail_sms * 2))
-    {
-      int dest_sm = block_idx / 2;
-      int dest_wave = block_idx % 2;
-      int remapped_block_idx = dest_sm + (dest_wave * avail_sms);
-      block_idx = remapped_block_idx;
-    }
-
-    // Remap block indices to interleave SK regions to limit intra-region waiting
-    if (block_idx < sk_regions() * sk_blocks_per_region())
-    {
-      int block_in_region;
-      int region;
-      div_mod_sk_regions(block_in_region, region, block_idx);
-      block_idx = (region * sk_blocks_per_region()) + block_in_region;
-    }
-
-    return block_idx;
-  }
-
-
-  /// Obtains calling linear threadblock index of the first block to work on the given tile
-  CUTLASS_DEVICE
-  int get_sk_block_idx(int iter) const
-  {
-    int region_idx;
-    int iter_in_region;
-    div_mod_sk_iters_per_region(region_idx, iter_in_region, iter);
-
-    int big_block_iters = (sk_big_blocks_per_region * sk_iters_per_normal_block()) + sk_big_blocks_per_region;   // number of iterations in the region's big blocks
-    int normal_block_iters = iter_in_region - big_block_iters;                                                 // number of iterations in the region's normal blocks
-
-    int big_block_idx_in_region = div_mod_sk_iters_per_big_block.div(iter_in_region);
-    int normal_block_idx_in_region = sk_big_blocks_per_region + div_mod_sk_iters_per_normal_block.div(normal_block_iters);
-
-    int block_idx_in_region = (big_block_idx_in_region < sk_big_blocks_per_region) ?
-        big_block_idx_in_region :
-        normal_block_idx_in_region;
-
-    int owning_block_idx = (sk_blocks_per_region() * region_idx) + block_idx_in_region;
-
-    return owning_block_idx;
-  }
-
-  /// Obtains iteration extends for the given SK block index
-  CUTLASS_DEVICE
-  void get_iter_extents(
-      int sk_block_idx,
-      int &block_iter_begin,
-      int &block_iter_end) const
-  {
-    int region_idx;
-    int block_idx_in_region;
-    div_mod_sk_blocks_per_region(region_idx, block_idx_in_region, sk_block_idx);
-
-    block_iter_begin = (region_idx * sk_iters_per_region) + (block_idx_in_region * sk_iters_per_normal_block());
-
-    // Adjust extents for the first "num_big_blocks" blocks that get one extra iteration
-    int block_iters = sk_iters_per_normal_block();
-    if (block_idx_in_region < sk_big_blocks_per_region) {
-      // This is a +1 iteration block
-      block_iter_begin += block_idx_in_region;
-      block_iters++;
-    } else {
-      // This is a regular block
-      block_iter_begin += sk_big_blocks_per_region;
-    }
-    block_iter_end = block_iter_begin + block_iters;
-  }
-
-
-  /// Obtains calling linear threadblock index of the first block to work on the given tile
-  CUTLASS_DEVICE
-  int get_first_block_idx(int tile_idx, int block_idx) const
-  {
-    if (tile_idx >= sk_tiles) {
-      // DP tile
-      return block_idx;
-    }
-
-    int iter = tile_idx * iters_per_tile();
-    return get_sk_block_idx(iter);
-  }
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace gemm
-} // namespace cutlass
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_complex_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_complex_tensor_op.h
deleted file mode 100755
index 92e698f8a..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_complex_tensor_op.h
+++ /dev/null
@@ -1,612 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Default warp-level GEMM operators selected by data type, size, and layouts of operands.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/warp/mma_complex_tensor_op.h"
-#include "cutlass/gemm/warp/mma_complex_tensor_op_fast_f32.h"
-#include "cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h"
-#include "cutlass/layout/tensor_op_multiplicand_sm80.h"
-
-namespace cutlass {
-namespace gemm {
-namespace warp {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Size of the Gemm problem - concept: gemm::GemmShape<>
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Data type of A elements
-    typename ElementA_,
-    /// Layout of A matrix (concept: MatrixLayout)
-    typename LayoutA_,
-    /// Data type of B elements
-    typename ElementB_,
-    /// Layout of B matrix (concept: MatrixLayout)
-    typename LayoutB_,
-    /// Element type of C matrix
-    typename ElementC_,
-    /// Layout of C matrix (concept: MatrixLayout)
-    typename LayoutC_,
-    /// Complex transform on A operand
-    ComplexTransform TransformA = ComplexTransform::kNone,
-    /// Complex transform on B operand
-    ComplexTransform TransformB = ComplexTransform::kNone,
-    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
-    typename Operator_ = arch::OpMultiplyAddComplex>
-struct DefaultMmaComplexTensorOp;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for complex<T>*complex<T> case
-//  4 real-valued mma operations
-//  A = (ar + j ai), B (br +j bi), D = AB
-//  D = dr + j di = (ar*br - ai*bi) + j (ar*bi + ai*br) 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-template <
-    /// Size of the Gemm problem - concept: gemm::GemmShape<>
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Real-valued underlying type of complex-valued A operand
-    typename RealElementA,
-    /// Layout of A matrix (concept: MatrixLayout)
-    typename LayoutA,
-    /// Real-valued underlying type of complex-valued B operand
-    typename RealElementB,
-    /// Layout of B matrix (concept: MatrixLayout)
-    typename LayoutB,
-    /// Real-valued underlying type of complex-valued C operand
-    typename RealElementC,
-    /// Layout of C matrix (concept: MatrixLayout)
-    typename LayoutC,
-    /// Complex transform on A operand
-    ComplexTransform TransformA,
-    /// Complex transform on B operand
-    ComplexTransform TransformB>
-struct DefaultMmaComplexTensorOp<
-    WarpShape_,
-    InstructionShape_,
-    complex<RealElementA>,
-    LayoutA,
-    complex<RealElementB>,
-    LayoutB,
-    complex<RealElementC>,
-    LayoutC,
-    TransformA,
-    TransformB,
-    arch::OpMultiplyAddComplex> {
-
-  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
-      cutlass::arch::Mma<
-        InstructionShape_, 
-        32, 
-        RealElementA,
-        cutlass::layout::RowMajor,
-        RealElementB,
-        cutlass::layout::ColumnMajor,
-        RealElementC,
-        cutlass::layout::RowMajor, 
-        arch::OpMultiplyAdd>,
-      cutlass::MatrixShape<1, 1>
-    >;
-
-  // Define the warp-level tensor op
-  using Type = cutlass::gemm::warp::MmaComplexTensorOp<
-    WarpShape_,
-    complex<RealElementA>,
-    LayoutA,
-    complex<RealElementB>,
-    LayoutB,
-    complex<RealElementC>,
-    LayoutC, 
-    Policy,
-    TransformA,
-    TransformB>;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for complex<T>*complex<T> case using GaussianComplex operation
-//  3 real-valued mma operations
-//  A  = (ar + j ai), B = (br +j bi), D = AB
-//  P1 = (ar + ai) * br, P2 = - ar * (br - bi), P3 = ai * (br + bi) 
-//  D  = dr + j di = (P1 - P3) + j (P1 + P2)
-/////////////////////////////////////////////////////////////////////////////////////////////////
-template <
-    /// Size of the Gemm problem - concept: gemm::GemmShape<>
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Real-valued underlying type of complex-valued A operand
-    typename RealElementA,
-    /// Layout of A matrix (concept: MatrixLayout)
-    typename LayoutA,
-    /// Real-valued underlying type of complex-valued B operand
-    typename RealElementB,
-    /// Layout of B matrix (concept: MatrixLayout)
-    typename LayoutB,
-    /// Real-valued underlying type of complex-valued C operand
-    typename RealElementC,
-    /// Layout of C matrix (concept: MatrixLayout)
-    typename LayoutC,
-    /// Complex transform on A operand
-    ComplexTransform TransformA,
-    /// Complex transform on B operand
-    ComplexTransform TransformB>
-struct DefaultMmaComplexTensorOp<
-    WarpShape_,
-    InstructionShape_,
-    complex<RealElementA>,
-    LayoutA,
-    complex<RealElementB>,
-    LayoutB,
-    complex<RealElementC>,
-    LayoutC,
-    TransformA,
-    TransformB,
-    arch::OpMultiplyAddGaussianComplex> {
-
-  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
-      cutlass::arch::Mma<
-        InstructionShape_, 
-        32, 
-        RealElementA,
-        cutlass::layout::RowMajor,
-        RealElementB,
-        cutlass::layout::ColumnMajor,
-        RealElementC,
-        cutlass::layout::RowMajor, 
-        arch::OpMultiplyAdd>,
-      cutlass::MatrixShape<1, 1>
-    >;
-
-  // Define the warp-level tensor op
-  using Type = cutlass::gemm::warp::MmaGaussianComplexTensorOp<
-    WarpShape_,
-    complex<RealElementA>,
-    LayoutA,
-    complex<RealElementB>,
-    LayoutB,
-    complex<RealElementC>,
-    LayoutC, 
-    Policy,
-    TransformA,
-    TransformB>;
-};
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Partial specialization - input and output types are complex<float>*complex<float> 
-//  Use TF32 tensor operation internally
-//  4 real-valued mma.sync.aligned.m16n8k8.f32.tf32.tf32.f32 operations on TF32 
-//  A = (ar + j ai), B (br +j bi), D = AB
-//  D = dr + j di = (ar*br - ai*bi) + j (ar*bi + ai*br) 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-template <
-    /// Size of the Gemm problem - concept: gemm::GemmShape<>
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Layout of A matrix (concept: MatrixLayout)
-    typename LayoutA,
-    /// Layout of B matrix (concept: MatrixLayout)
-    typename LayoutB,
-    /// Layout of C matrix (concept: MatrixLayout)
-    typename LayoutC,
-    /// Complex transform on A operand
-    ComplexTransform TransformA,
-    /// Complex transform on B operand
-    ComplexTransform TransformB>
-struct DefaultMmaComplexTensorOp<
-    WarpShape_,
-    InstructionShape_,
-    complex<float>,
-    LayoutA,
-    complex<float>,
-    LayoutB,
-    complex<float>,
-    LayoutC,
-    TransformA,
-    TransformB,
-    arch::OpMultiplyAddComplex> {
-
-  // Complex floating point tensor operation use mma.sync.aligned.m16n8k8.f32.tf32.tf32.f32 mma instruction
-  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
-      cutlass::arch::Mma<
-        InstructionShape_, 
-        32, 
-        tfloat32_t,
-        cutlass::layout::RowMajor,
-        tfloat32_t,
-        cutlass::layout::ColumnMajor,
-        float,
-        cutlass::layout::RowMajor, 
-        arch::OpMultiplyAdd>,
-      cutlass::MatrixShape<1, 1>
-    >;
-
-  // Define the warp-level tensor op
-  using Type = cutlass::gemm::warp::MmaComplexTensorOp<
-    WarpShape_,
-    complex<float>,
-    LayoutA,
-    complex<float>,
-    LayoutB,
-    complex<float>,
-    LayoutC, 
-    Policy,
-    TransformA,
-    TransformB>;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Partial specialization - input and output types are complex<float>*complex<float> 
-//  Use BF16 tensor operation internally
-//  4 real-valued mma.sync.aligned.m16n8k8.f32.bf16.bf16.f32 operations on BF16
-//  A = (ar + j ai), B (br +j bi), D = AB
-//  D = dr + j di = (ar*br - ai*bi) + j (ar*bi + ai*br) 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-template <
-    /// Size of the Gemm problem - concept: gemm::GemmShape<>
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Layout of A matrix (concept: MatrixLayout)
-    typename LayoutA,
-    /// Layout of B matrix (concept: MatrixLayout)
-    typename LayoutB,
-    /// Layout of C matrix (concept: MatrixLayout)
-    typename LayoutC,
-    /// Complex transform on A operand
-    ComplexTransform TransformA,
-    /// Complex transform on B operand
-    ComplexTransform TransformB>
-struct DefaultMmaComplexTensorOp<
-    WarpShape_,
-    InstructionShape_,
-    complex<float>,
-    LayoutA,
-    complex<float>,
-    LayoutB,
-    complex<float>,
-    LayoutC,
-    TransformA,
-    TransformB,
-    arch::OpMultiplyAddFastBF16> {
-
-  // Complex floating point tensor operation use mma.sync.aligned.m16n8k8.f32.bf16.bf16.f32 mma instruction
-  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
-      cutlass::arch::Mma<
-        InstructionShape_, 
-        32, 
-        bfloat16_t,
-        cutlass::layout::RowMajor,
-        bfloat16_t,
-        cutlass::layout::ColumnMajor,
-        float,
-        cutlass::layout::RowMajor, 
-        arch::OpMultiplyAdd>,
-      cutlass::MatrixShape<1, 1>
-    >;
-
-  // Define the warp-level tensor op
-  using Type = cutlass::gemm::warp::MmaComplexTensorOp<
-    WarpShape_,
-    complex<float>,
-    LayoutA,
-    complex<float>,
-    LayoutB,
-    complex<float>,
-    LayoutC, 
-    Policy,
-    TransformA,
-    TransformB>;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Partial specialization - input and output types are complex<float>*complex<float> 
-//  Use F16 tensor operation internally
-//  4 real-valued mma.sync.aligned.m16n8k8.f32.f16.f16.f32 operations on F16
-//  A = (ar + j ai), B (br +j bi), D = AB
-//  D = dr + j di = (ar*br - ai*bi) + j (ar*bi + ai*br) 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-template <
-    /// Size of the Gemm problem - concept: gemm::GemmShape<>
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Layout of A matrix (concept: MatrixLayout)
-    typename LayoutA,
-    /// Layout of B matrix (concept: MatrixLayout)
-    typename LayoutB,
-    /// Layout of C matrix (concept: MatrixLayout)
-    typename LayoutC,
-    /// Complex transform on A operand
-    ComplexTransform TransformA,
-    /// Complex transform on B operand
-    ComplexTransform TransformB>
-struct DefaultMmaComplexTensorOp<
-    WarpShape_,
-    InstructionShape_,
-    complex<float>,
-    LayoutA,
-    complex<float>,
-    LayoutB,
-    complex<float>,
-    LayoutC,
-    TransformA,
-    TransformB,
-    arch::OpMultiplyAddFastF16> {
-
-  // Complex floating point tensor operation use mma.sync.aligned.m16n8k8.f32.f16.f16.f32 mma instruction
-  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
-      cutlass::arch::Mma<
-        InstructionShape_, 
-        32, 
-        half_t,
-        cutlass::layout::RowMajor,
-        half_t,
-        cutlass::layout::ColumnMajor,
-        float,
-        cutlass::layout::RowMajor, 
-        arch::OpMultiplyAdd>,
-      cutlass::MatrixShape<1, 1>
-    >;
-
-  // Define the warp-level tensor op
-  using Type = cutlass::gemm::warp::MmaComplexTensorOp<
-    WarpShape_,
-    complex<float>,
-    LayoutA,
-    complex<float>,
-    LayoutB,
-    complex<float>,
-    LayoutC, 
-    Policy,
-    TransformA,
-    TransformB>;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// 3xTF32 or 4xTF32 (fast and accurate complex<float> operation)
-/// Partial specialization - input and output types are complex<float> * complex<float> 
-//  Use 3xTF32 or 4xTF32 tensor operation internally
-//  4 real-valued mma.sync.aligned.m16n8k8.f32.tf32.tf32.f32 operations on TF32 
-//  A = (ar + j ai), B (br +j bi), D = AB
-//  D = dr + j di = 3x[(ar*br - ai*bi) + j (ar*bi + ai*br)]
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Size of the Gemm problem - concept: gemm::GemmShape<>
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Layout of A matrix (concept: MatrixLayout)
-    typename LayoutA,
-    /// Layout of B matrix (concept: MatrixLayout)
-    typename LayoutB,
-    /// Layout of C matrix (concept: MatrixLayout)
-    typename LayoutC,
-    /// Complex transform on A operand
-    ComplexTransform TransformA,
-    /// Complex transform on B operand
-    ComplexTransform TransformB>
-struct DefaultMmaComplexTensorOp<
-    WarpShape_,
-    InstructionShape_,
-    complex<float>,
-    LayoutA,
-    complex<float>,
-    LayoutB,
-    complex<float>,
-    LayoutC,
-    TransformA,
-    TransformB,
-    arch::OpMultiplyAddComplexFastF32> {
-
-  // Complex floating point tensor operation use mma.sync.aligned.m16n8k8.f32.tf32.tf32.f32 mma instruction
-  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
-      cutlass::arch::Mma<
-        InstructionShape_, 
-        32, 
-        tfloat32_t,
-        cutlass::layout::RowMajor,
-        tfloat32_t,
-        cutlass::layout::ColumnMajor,
-        float,
-        cutlass::layout::RowMajor, 
-        arch::OpMultiplyAdd>,
-      cutlass::MatrixShape<1, 1>
-    >;
-
-  // Define the warp-level tensor op
-  using Type = cutlass::gemm::warp::MmaComplexTensorOpFastF32<
-    WarpShape_,
-    complex<float>,
-    LayoutA,
-    complex<float>,
-    LayoutB,
-    complex<float>,
-    LayoutC, 
-    Policy,
-    TransformA,
-    TransformB>;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for complex<double>*complex<double> case
-//  4 real-valued mma.sync.aligned.m16n8k4.f64.f64.f64.f64 operations
-//  A = (ar + j ai), B (br +j bi), D = AB
-//  D = dr + j di = (ar*br - ai*bi) + j (ar*bi + ai*br) 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-template <
-    /// Size of the Gemm problem - concept: gemm::GemmShape<>
-    typename WarpShape_,
-    /// Real-valued underlying type of complex-valued A operand
-    typename RealElementA,
-    /// Layout of A matrix (concept: MatrixLayout)
-    typename LayoutA,
-    /// Real-valued underlying type of complex-valued B operand
-    typename RealElementB,
-    /// Layout of B matrix (concept: MatrixLayout)
-    typename LayoutB,
-    /// Real-valued underlying type of complex-valued C operand
-    typename RealElementC,
-    /// Layout of C matrix (concept: MatrixLayout)
-    typename LayoutC,
-    /// Complex transform on A operand
-    ComplexTransform TransformA,
-    /// Complex transform on B operand
-    ComplexTransform TransformB>
-struct DefaultMmaComplexTensorOp<
-    WarpShape_,
-    GemmShape<16, 8, 4>,
-    complex<RealElementA>,
-    LayoutA,
-    complex<RealElementB>,
-    LayoutB,
-    complex<RealElementC>,
-    LayoutC,
-    TransformA,
-    TransformB,
-    arch::OpMultiplyAddComplex> {
-
-  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
-      cutlass::arch::Mma<
-        GemmShape<16, 8, 4>,
-        32, 
-        RealElementA,
-        cutlass::layout::RowMajor,
-        RealElementB,
-        cutlass::layout::ColumnMajor,
-        RealElementC,
-        cutlass::layout::RowMajor, 
-        arch::OpMultiplyAdd>,
-      cutlass::MatrixShape<1, 1>
-    >;
-
-  // Define the warp-level tensor op
-  using Type = cutlass::gemm::warp::MmaComplexTensorOp<
-    WarpShape_,
-    complex<RealElementA>,
-    LayoutA,
-    complex<RealElementB>,
-    LayoutB,
-    complex<RealElementC>,
-    LayoutC, 
-    Policy,
-    TransformA,
-    TransformB,
-    true>;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Partial specialization for complex<T>*complex<T> case using GaussianComplex operation
-//  3 real-valued mma.sync.aligned.m16n8k4.f64.f64.f64.f64 operations 
-//  A  = (ar + j ai), B = (br +j bi), D = AB
-//  P1 = (ar + ai) * br, P2 = - ar * (br - bi), P3 = ai * (br + bi) 
-//  D  = dr + j di = (P1 - P3) + j (P1 + P2)
-/////////////////////////////////////////////////////////////////////////////////////////////////
-template <
-    /// Size of the Gemm problem - concept: gemm::GemmShape<>
-    typename WarpShape_,
-    /// Real-valued underlying type of complex-valued A operand
-    typename RealElementA,
-    /// Layout of A matrix (concept: MatrixLayout)
-    typename LayoutA,
-    /// Real-valued underlying type of complex-valued B operand
-    typename RealElementB,
-    /// Layout of B matrix (concept: MatrixLayout)
-    typename LayoutB,
-    /// Real-valued underlying type of complex-valued C operand
-    typename RealElementC,
-    /// Layout of C matrix (concept: MatrixLayout)
-    typename LayoutC,
-    /// Complex transform on A operand
-    ComplexTransform TransformA,
-    /// Complex transform on B operand
-    ComplexTransform TransformB>
-struct DefaultMmaComplexTensorOp<
-    WarpShape_,
-    GemmShape<16, 8, 4>,
-    complex<RealElementA>,
-    LayoutA,
-    complex<RealElementB>,
-    LayoutB,
-    complex<RealElementC>,
-    LayoutC,
-    TransformA,
-    TransformB,
-    arch::OpMultiplyAddGaussianComplex> {
-
-  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
-      cutlass::arch::Mma<
-        GemmShape<16, 8, 4>,
-        32, 
-        RealElementA,
-        cutlass::layout::RowMajor,
-        RealElementB,
-        cutlass::layout::ColumnMajor,
-        RealElementC,
-        cutlass::layout::RowMajor, 
-        arch::OpMultiplyAdd>,
-      cutlass::MatrixShape<1, 1>
-    >;
-
-  // Define the warp-level tensor op
-  using Type = cutlass::gemm::warp::MmaGaussianComplexTensorOp<
-    WarpShape_,
-    complex<RealElementA>,
-    LayoutA,
-    complex<RealElementB>,
-    LayoutB,
-    complex<RealElementC>,
-    LayoutC, 
-    Policy,
-    TransformA,
-    TransformB,
-    true>;
-};
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace gemm
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_sparse_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_sparse_tensor_op.h
deleted file mode 100755
index 223426544..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_sparse_tensor_op.h
+++ /dev/null
@@ -1,165 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Default warp-level GEMM operators selected by data type, size, and layouts of operands.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/warp/mma_sparse_tensor_op.h"
-
-namespace cutlass {
-namespace gemm {
-namespace warp {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Size of the Gemm problem - concept: gemm::GemmShape<>
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Data type of A elements
-    typename ElementA_,
-    /// Layout of A matrix (concept: MatrixLayout)
-    typename LayoutA_,
-    /// Data type of B elements
-    typename ElementB_,
-    /// Layout of B matrix (concept: MatrixLayout)
-    typename LayoutB_,
-    /// Element type of C matrix
-    typename ElementC_,
-    /// Layout of C matrix (concept: MatrixLayout)
-    typename LayoutC_,
-    /// Operator describing the tensor operation
-    typename Operator_ = arch::OpMultiplyAdd,
-    /// Number of partitions along K dimension
-    int PartitionsK = 1,
-    /// Store the accumulators in row major or column major.  Row major is used
-    /// when output layout is interleaved.
-    bool AccumulatorsInRowMajor = false
->
-struct DefaultSparseMmaTensorOp;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial Specialization - inputs and output types are float - uses TF32 internally
-template <
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of target matrix multiply instruction (concept: GemmShape)
-    typename InstructionShape_,
-    /// Layout of A matrix (concept: MatrixLayout)
-    typename LayoutA,
-    /// Layout of B matrix (concept: MatrixLayout)
-    typename LayoutB,
-    /// Layout of C matrix (concept: MatrixLayout)
-    typename LayoutC,
-    /// Number of partitions along K dimension
-    int PartitionsK,
-    /// Store the accumulators in row major or column major.  Row major is used
-    /// when output layout is interleaved.
-    bool AccumulatorsInRowMajor>
-struct DefaultSparseMmaTensorOp<
-  WarpShape_, 
-  InstructionShape_, 
-  float, LayoutA, 
-  float, LayoutB, 
-  float, LayoutC, 
-  arch::OpMultiplyAdd, PartitionsK, AccumulatorsInRowMajor> {
-
-  // Uses TF32 internally
-  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
-      cutlass::arch::SparseMma<
-        InstructionShape_, 
-        32, 
-        tfloat32_t, cutlass::layout::RowMajor, 
-        tfloat32_t, cutlass::layout::ColumnMajor,
-        float, cutlass::layout::RowMajor, 
-        arch::OpMultiplyAdd
-      >,
-      cutlass::MatrixShape<1, 1> >;
-
-  // Define the warp-level tensor op
-  using Type = cutlass::gemm::warp::SparseMmaTensorOp<
-      WarpShape_, float, LayoutA, float, LayoutB, float, LayoutC,
-      Policy, PartitionsK, AccumulatorsInRowMajor>;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for m-by-n-by-kgroup
-template <
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Data type of A elements
-    typename ElementA,
-    /// Layout of A matrix (concept: MatrixLayout)
-    typename LayoutA,
-    /// Data type of B elements
-    typename ElementB,
-    /// Layout of B matrix (concept: MatrixLayout)
-    typename LayoutB,
-    /// Element type of C matrix
-    typename ElementC,
-    /// Layout of C matrix (concept: MatrixLayout)
-    typename LayoutC,
-    /// Operator describing the tensor operation
-    typename Operator_,
-    /// Number of partitions along K dimension
-    int PartitionsK,
-    /// Store the accumulators in row major or column major.  Row major is used
-    /// when output layout is interleaved.
-    bool AccumulatorsInRowMajor>
-struct DefaultSparseMmaTensorOp {
-  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
-      cutlass::arch::SparseMma<InstructionShape_, 32, ElementA,
-                               cutlass::layout::RowMajor, ElementB,
-                               cutlass::layout::ColumnMajor, ElementC,
-                               cutlass::layout::RowMajor, Operator_>,
-      cutlass::MatrixShape<1, 1> >;
-
-  // Define the warp-level tensor op
-  using Type = cutlass::gemm::warp::SparseMmaTensorOp<
-      WarpShape_, ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
-      Policy, PartitionsK, AccumulatorsInRowMajor>;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_tensor_op.h
deleted file mode 100755
index 3a8cacd3d..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_tensor_op.h
+++ /dev/null
@@ -1,123 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Default warp-level GEMM operators selected by data type, size, and layouts of operands.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/warp/mma_tensor_op.h"
-
-namespace cutlass {
-namespace gemm {
-namespace warp {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Size of the Gemm problem - concept: gemm::GemmShape<>
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Data type of A elements
-    typename ElementA_,
-    /// Layout of A matrix (concept: MatrixLayout)
-    typename LayoutA_,
-    /// Data type of B elements
-    typename ElementB_,
-    /// Layout of B matrix (concept: MatrixLayout)
-    typename LayoutB_,
-    /// Element type of C matrix
-    typename ElementC_,
-    /// Layout of C matrix (concept: MatrixLayout)
-    typename LayoutC_,
-    /// Operator describing the tensor operation
-    typename Operator_ = arch::OpMultiplyAdd,
-    /// Number of partitions along K dimension
-    int PartitionsK = 1,
-    /// Store the accumulators in row major or column major.  Row major is used
-    /// when output layout is interleaved.
-    bool AccumulatorsInRowMajor = false>
-struct DefaultMmaTensorOp;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for m-by-n-by-kgroup
-template <
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Data type of A elements
-    typename ElementA,
-    /// Layout of A matrix (concept: MatrixLayout)
-    typename LayoutA,
-    /// Data type of B elements
-    typename ElementB,
-    /// Layout of B matrix (concept: MatrixLayout)
-    typename LayoutB,
-    /// Element type of C matrix
-    typename ElementC,
-    /// Layout of C matrix (concept: MatrixLayout)
-    typename LayoutC,
-    /// Operator describing the tensor operation
-    typename Operator_,
-    /// Number of partitions along K dimension
-    int PartitionsK,
-    /// Store the accumulators in row major or column major.  Row major is used
-    /// when output layout is interleaved.
-    bool AccumulatorsInRowMajor>
-struct DefaultMmaTensorOp {
-  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
-      cutlass::arch::Mma<InstructionShape_, 32, ElementA,
-                         cutlass::layout::RowMajor, ElementB,
-                         cutlass::layout::ColumnMajor, ElementC,
-                         cutlass::layout::RowMajor, Operator_>,
-      cutlass::MatrixShape<1, 1> >;
-
-  // Define the warp-level tensor op
-  using Type = cutlass::gemm::warp::MmaTensorOp<
-      WarpShape_, ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
-      Policy, PartitionsK, AccumulatorsInRowMajor>;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-#include "cutlass/gemm/warp/default_mma_tensor_op_sm80.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_tensor_op_sm80.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_tensor_op_sm80.h
deleted file mode 100755
index 67fcde77e..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_tensor_op_sm80.h
+++ /dev/null
@@ -1,375 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Default warp-level GEMM operators selected by data type, size, and layouts of operands.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/mma.h"
-#include "cutlass/gemm/warp/mma_tensor_op.h"
-#include "cutlass/gemm/warp/mma_mixed_input_tensor_op.h"
-#include "cutlass/gemm/warp/mma_tensor_op_fast_f32.h"
-#include "cutlass/gemm/warp/default_mma_tensor_op.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace warp {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial Specialization - inputs and output types are float - uses BF16 internally
-template <
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename WarpShape_,
-    /// Layout of A matrix (concept: MatrixLayout)
-    typename LayoutA,
-    /// Layout of B matrix (concept: MatrixLayout)
-    typename LayoutB,
-    /// Layout of C matrix (concept: MatrixLayout)
-    typename LayoutC,
-    /// Number of partitions along K dimension
-    int PartitionsK,
-    /// Store the accumulators in row major or column major.  Row major is used
-    /// when output layout is interleaved.
-    bool AccumulatorsInRowMajor>
-struct DefaultMmaTensorOp<
-  WarpShape_, 
-  GemmShape<16, 8, 8>, 
-  float, LayoutA, 
-  float, LayoutB, 
-  float, LayoutC, 
-  arch::OpMultiplyAddFastBF16, 
-  PartitionsK, AccumulatorsInRowMajor> {
-
-  // Uses BF16 internally
-  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
-      cutlass::arch::Mma<
-        GemmShape<16, 8, 8>, 
-        32, 
-        bfloat16_t, cutlass::layout::RowMajor, 
-        bfloat16_t, cutlass::layout::ColumnMajor,
-        float, cutlass::layout::RowMajor, 
-        arch::OpMultiplyAdd
-      >,
-      cutlass::MatrixShape<1, 1> >;
-
-  // Define the warp-level tensor op
-  using Type = cutlass::gemm::warp::MmaTensorOp<
-      WarpShape_, float, LayoutA, float, LayoutB, float, LayoutC,
-      Policy, PartitionsK, AccumulatorsInRowMajor>;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial Specialization - inputs and output types are float - uses F16 internally
-template <
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename WarpShape_,
-    /// Layout of A matrix (concept: MatrixLayout)
-    typename LayoutA,
-    /// Layout of B matrix (concept: MatrixLayout)
-    typename LayoutB,
-    /// Layout of C matrix (concept: MatrixLayout)
-    typename LayoutC,
-    /// Number of partitions along K dimension
-    int PartitionsK,
-    /// Store the accumulators in row major or column major.  Row major is used
-    /// when output layout is interleaved.
-    bool AccumulatorsInRowMajor>
-struct DefaultMmaTensorOp<
-  WarpShape_, 
-  GemmShape<16, 8, 8>, 
-  float, LayoutA, 
-  float, LayoutB, 
-  float, LayoutC, 
-  arch::OpMultiplyAddFastF16, 
-  PartitionsK, AccumulatorsInRowMajor> {
-
-  // Uses F16 internally
-  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
-      cutlass::arch::Mma<
-        GemmShape<16, 8, 8>, 
-        32, 
-        half_t, cutlass::layout::RowMajor, 
-        half_t, cutlass::layout::ColumnMajor,
-        float, cutlass::layout::RowMajor, 
-        arch::OpMultiplyAdd
-      >,
-      cutlass::MatrixShape<1, 1> >;
-
-  // Define the warp-level tensor op
-  using Type = cutlass::gemm::warp::MmaTensorOp<
-      WarpShape_, float, LayoutA, float, LayoutB, float, LayoutC,
-      Policy, PartitionsK, AccumulatorsInRowMajor>;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial Specialization - inputs and output types are float - uses TF32 internally
-template <
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of target matrix multiply instruction (concept: GemmShape)
-    typename InstructionShape_,
-    /// Layout of A matrix (concept: MatrixLayout)
-    typename LayoutA,
-    /// Layout of B matrix (concept: MatrixLayout)
-    typename LayoutB,
-    /// Layout of C matrix (concept: MatrixLayout)
-    typename LayoutC,
-    /// Number of partitions along K dimension
-    int PartitionsK,
-    /// Store the accumulators in row major or column major.  Row major is used
-    /// when output layout is interleaved.
-    bool AccumulatorsInRowMajor>
-struct DefaultMmaTensorOp<
-  WarpShape_, 
-  InstructionShape_, 
-  float, LayoutA, 
-  float, LayoutB, 
-  float, LayoutC, 
-  arch::OpMultiplyAdd, PartitionsK, AccumulatorsInRowMajor> {
-
-  // Uses TF32 internally
-  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
-      cutlass::arch::Mma<
-        InstructionShape_, 
-        32, 
-        tfloat32_t, cutlass::layout::RowMajor, 
-        tfloat32_t, cutlass::layout::ColumnMajor,
-        float, cutlass::layout::RowMajor, 
-        arch::OpMultiplyAdd
-      >,
-      cutlass::MatrixShape<1, 1> >;
-
-  // Define the warp-level tensor op
-  using Type = cutlass::gemm::warp::MmaTensorOp<
-      WarpShape_, float, LayoutA, float, LayoutB, float, LayoutC,
-      Policy, PartitionsK, AccumulatorsInRowMajor>;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial Specialization - inputs and output types are float - uses TF32 for Fast Accurate FP32
-template <
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of target matrix multiply instruction (concept: GemmShape)
-    typename InstructionShape_,
-    /// Layout of A matrix (concept: MatrixLayout)
-    typename LayoutA,
-    /// Layout of B matrix (concept: MatrixLayout)
-    typename LayoutB,
-    /// Layout of C matrix (concept: MatrixLayout)
-    typename LayoutC,
-    /// Number of partitions along K dimension
-    int PartitionsK,
-    /// Store the accumulators in row major or column major.  Row major is used
-    /// when output layout is interleaved.
-    bool AccumulatorsInRowMajor>
-struct DefaultMmaTensorOp<
-  WarpShape_, 
-  InstructionShape_, 
-  float, LayoutA, 
-  float, LayoutB, 
-  float, LayoutC, 
-  arch::OpMultiplyAddFastF32, PartitionsK, AccumulatorsInRowMajor> {
-
-  // Uses TF32 internally
-  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
-      cutlass::arch::Mma<
-        InstructionShape_, 
-        32, 
-        cutlass::tfloat32_t, cutlass::layout::RowMajor, 
-        cutlass::tfloat32_t, cutlass::layout::ColumnMajor,
-        float, cutlass::layout::RowMajor, 
-        arch::OpMultiplyAdd
-      >,
-      cutlass::MatrixShape<1, 1> >;
-
-  // Define the warp-level tensor op
-  using Type = cutlass::gemm::warp::MmaTensorOpFastF32<
-      WarpShape_, float, LayoutA, float, LayoutB, float, LayoutC,
-      Policy, PartitionsK, AccumulatorsInRowMajor>;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial Specialization - inputs are mixed types  - uses wider datatype internally.
-/// (e.g. F16 <= F16 x S8 + F16, F16 <= BF16 x S8 + F32)
-template <
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename WarpShape_,
-    /// Element type of A matrix
-    typename ElementA,
-    /// Layout of A matrix (concept: MatrixLayout)
-    typename LayoutA,
-    /// Element type of B matrix
-    typename ElementB,
-    /// Layout of B matrix (concept: MatrixLayout)
-    typename LayoutB,
-    /// Element type of C matrix
-    typename ElementC,
-    /// Layout of C matrix (concept: MatrixLayout)
-    typename LayoutC,
-    /// Number of partitions along K dimension
-    int PartitionsK,
-    /// Store the accumulators in row major or column major.  Row major is used
-    /// when output layout is interleaved.
-    bool AccumulatorsInRowMajor>
-struct DefaultMmaTensorOp<
-  WarpShape_,
-  GemmShape<16, 8, 16>,                 // InstructionShape
-  ElementA,                             // Element type of A matrix in Global Memory
-  LayoutA,                              // Layout of A matrix in Global Memory
-  ElementB,                             // Element type of B matrix in Global Memory
-  LayoutB,                              // Layout of B matrix in Global Memory
-  ElementC,                             // Element type of C matrix in Global Memory
-  LayoutC,                              // Layout of C matrix in Global Memory
-  arch::OpMultiplyAddMixedInputUpcast,  // Tag to indicate mixed-input datatype, where narrower datatype is upcasted to wider datatype
-  PartitionsK, AccumulatorsInRowMajor> {
-
-
-  // Check if the ElementA and ElementB are of different data types
-  static_assert(!platform::is_same<ElementA, ElementB>::value,
-    "DefaultMmaTensorOp with arch::OpMultiplyAddMixedInputUpcast ElementA and ElementB cannot be of the same data type");
-
-  // Data type used for internal computation - use the wider of the two data types for mma.sync operands
-  using ElementOperand = typename platform::conditional<(sizeof_bits<ElementA>::value > sizeof_bits<ElementB>::value),
-                                                    ElementA, ElementB>::type;
-
-  // Operand datatypes in the internal MMA instruction - use the wider of the two data types
-  using ElementAMma = ElementOperand;
-  using ElementBMma = ElementOperand;
-  using MmaElementC = ElementC;
-
-  // Uses
-  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
-      cutlass::arch::Mma<
-        GemmShape<16, 8, 16>,
-        32,
-        ElementAMma, cutlass::layout::RowMajor,
-        ElementBMma, cutlass::layout::ColumnMajor,
-        MmaElementC, cutlass::layout::RowMajor,
-        arch::OpMultiplyAdd
-      >,
-      cutlass::MatrixShape<1, 1> >;
-
-  // Define the warp-level tensor op
-  using Type = cutlass::gemm::warp::MmaMixedInputTensorOp<
-      WarpShape_, ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
-      Policy, PartitionsK, AccumulatorsInRowMajor>;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial Specialization - inputs are mixed types  - uses wider datatype internally.
-/// (e.g. S32 <= S4 x S8 + S32, S32 <= S8 x S4 + S32)
-template <
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename WarpShape_,
-    /// Element type of A matrix
-    typename ElementA,
-    /// Layout of A matrix (concept: MatrixLayout)
-    typename LayoutA,
-    /// Element type of B matrix
-    typename ElementB,
-    /// Layout of B matrix (concept: MatrixLayout)
-    typename LayoutB,
-    /// Element type of C matrix
-    typename ElementC,
-    /// Layout of C matrix (concept: MatrixLayout)
-    typename LayoutC,
-    /// Number of partitions along K dimension
-    int PartitionsK,
-    /// Store the accumulators in row major or column major.  Row major is used
-    /// when output layout is interleaved.
-    bool AccumulatorsInRowMajor>
-struct DefaultMmaTensorOp<
-  WarpShape_,
-  GemmShape<16, 8, 32>,                 // InstructionShape
-  ElementA,                             // Element type of A matrix in Global Memory
-  LayoutA,                              // Layout of A matrix in Global Memory
-  ElementB,                             // Element type of B matrix in Global Memory
-  LayoutB,                              // Layout of B matrix in Global Memory
-  ElementC,                             // Element type of C matrix in Global Memory
-  LayoutC,                              // Layout of C matrix in Global Memory
-  arch::OpMultiplyAddMixedInputUpcast,  // Tag to indicate mixed-input datatype, where narrower datatype is upcasted to wider datatype
-  PartitionsK, AccumulatorsInRowMajor> {
-
-
-  // Check if the ElementA and ElementB are of different data types
-  static_assert(!platform::is_same<ElementA, ElementB>::value,
-    "DefaultMmaTensorOp with arch::OpMultiplyAddMixedInputUpcast ElementA and ElementB cannot be of the same data type");
-
-  // Data type used for internal computation - use the wider of the two data types for mma.sync operands
-  using ElementOperand = typename platform::conditional<(sizeof_bits<ElementA>::value > sizeof_bits<ElementB>::value),
-                                                    ElementA, ElementB>::type;
-
-  // Operand datatypes in the internal MMA instruction - use the wider of the two data types
-  using MmaElementA = ElementOperand;
-  using MmaElementB = ElementOperand;
-  using MmaElementC = ElementC;
-
-  // Uses
-  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
-      cutlass::arch::Mma<
-        GemmShape<16, 8, 32>,
-        32,
-        MmaElementA, cutlass::layout::RowMajor,
-        MmaElementB, cutlass::layout::ColumnMajor,
-        MmaElementC, cutlass::layout::RowMajor,
-        arch::OpMultiplyAddSaturate
-      >,
-      cutlass::MatrixShape<1, 1> >;
-
-  // Define the warp-level tensor op
-  using Type = cutlass::gemm::warp::MmaMixedInputTensorOp<
-      WarpShape_, ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
-      Policy, PartitionsK, AccumulatorsInRowMajor>;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-#include "cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_with_reduction_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_with_reduction_tensor_op.h
deleted file mode 100755
index db6713cb4..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_with_reduction_tensor_op.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Default warp-level GEMM operators selected by data type, size, and layouts of operands.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/warp/mma_with_reduction_tensor_op.h"
-
-namespace cutlass {
-namespace gemm {
-namespace warp {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Size of the Gemm problem - concept: gemm::GemmShape<>
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Data type of A elements
-    typename ElementA,
-    /// Layout of A matrix (concept: MatrixLayout)
-    typename LayoutA,
-    /// Data type of B elements
-    typename ElementB,
-    /// Layout of B matrix (concept: MatrixLayout)
-    typename LayoutB,
-    /// Element type of C matrix
-    typename ElementC,
-    /// Layout of C matrix (concept: MatrixLayout)
-    typename LayoutC,
-    /// Operator describing the tensor operation
-    typename Operator_,
-    /// Reduce operand A or B along K dimension
-    bool ReduceKForA_,
-    /// Number of partitions along K dimension
-    int PartitionsK = 1,
-    /// Store the accumulators in row major or column major.  Row major is used
-    /// when output layout is interleaved.
-    bool AccumulatorsInRowMajor = false>
-struct DefaultMmaWithReductionTensorOp {
-  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
-      cutlass::arch::Mma<InstructionShape_, 32, ElementA,
-                         cutlass::layout::RowMajor, ElementB,
-                         cutlass::layout::ColumnMajor, ElementC,
-                         cutlass::layout::RowMajor, Operator_>,
-      cutlass::MatrixShape<1, 1> >;
-
-  // Define the warp-level tensor op
-  using Type = cutlass::gemm::warp::MmaWithReductionTensorOp<
-      WarpShape_, ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
-      Policy, ReduceKForA_, PartitionsK, AccumulatorsInRowMajor>;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_wmma_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_wmma_tensor_op.h
deleted file mode 100755
index 145e4be7c..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_wmma_tensor_op.h
+++ /dev/null
@@ -1,130 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Default warp-level GEMM operators selected by data type, size, and layouts of operands.
-*/
-
-#pragma once
-
-#include "cutlass/arch/wmma.h"
-
-#if defined(CUTLASS_ARCH_WMMA_ENABLED)
-
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/warp/mma_tensor_op_wmma.h"
-
-namespace cutlass {
-namespace gemm {
-namespace warp {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-    ///< Size of the Gemm problem (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Data type of A elements
-    typename ElementA_,
-    /// Layout of A matrix (concept: MatrixLayout)
-    typename LayoutA_,
-    /// Data type of B elements
-    typename ElementB_,
-    /// Layout of B matrix (concept: MatrixLayout)
-    typename LayoutB_,
-    /// Element type of C matrix
-    typename ElementC_,
-    /// Layout of C matrix (concept: MatrixLayout)
-    typename LayoutC_,
-    /// Operator describing the tensor operation
-    typename Operator_ = arch::OpMultiplyAdd,
-    /// Number of partitions along K dimension
-    int PartitionsK = 1
->
-struct DefaultMmaTensorOpWmma;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for m-by-n-by-kgroup
-template <
-    ///< Shape of one matrix production operation (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Data type of A elements
-    typename ElementA,
-    /// Layout of A matrix (concept: MatrixLayout)
-    typename LayoutA,
-    /// Data type of B elements
-    typename ElementB,
-    /// Layout of B matrix (concept: MatrixLayout)
-    typename LayoutB,
-    /// Element type of C matrix
-    typename ElementC,
-    /// Layout of C matrix (concept: MatrixLayout)
-    typename LayoutC,
-    /// Operator describing the tensor operation
-    typename Operator_,
-    /// Number of partitions along K dimension
-    int PartitionsK>
-struct DefaultMmaTensorOpWmma {
-  using Policy = cutlass::gemm::warp::MmaTensorOpPolicy<
-      cutlass::arch::Wmma<
-          InstructionShape_, 
-          ElementA,
-          LayoutA, 
-          ElementB,
-          LayoutB, 
-          ElementC,
-          LayoutC, 
-          Operator_>,
-      cutlass::MatrixShape<1, 1> >;
-
-  // Define the warp-level tensor op
-  using Type = cutlass::gemm::warp::MmaTensorOpWmma<
-        WarpShape_,
-        ElementA, 
-        LayoutA, 
-        ElementB, 
-        LayoutB,
-        ElementC, 
-        LayoutC, 
-        Policy, 
-        PartitionsK>;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace gemm
-} // namespace cutlass
-
-#endif
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/layernorm_scale_bias_transform.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/layernorm_scale_bias_transform.h
deleted file mode 100755
index bbf0090b0..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/layernorm_scale_bias_transform.h
+++ /dev/null
@@ -1,139 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing warp-level per channel scale+bias+relu before
-   matrix multiply-accumulate operations targeting Tensor Cores.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/platform/platform.h"
-
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-
-#include "cutlass/arch/memory_sm75.h"
-#include "cutlass/arch/mma_sm75.h" 
-#include "cutlass/arch/mma_sm80.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/warp/mma.h"
-
-#include "cutlass/gemm/warp/mma_tensor_op_policy.h"
-
-#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h"
-#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace warp {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename FragmentActivations, typename FragmentVarMean, typename FragmentGammaBeta>
-struct LayernormScaleBiasTransform {
-
-  using T = typename FragmentActivations::Element;
-
-  static int const NumActivations = FragmentActivations::kElements;
-  static int const NumVarMean = FragmentVarMean::kElements;
-  static int const NumGammaBeta = FragmentGammaBeta::kElements;
-  static int const MmaElements = 2;
-  // One element has one scale and one bias
-  static int const MmaScaleBiasPair = 2;
-  // 16816 has 2 columns and 2 rows
-  static int const MmaCols = 2;
-  static int const MmaRows = 2;
-
-  using MmaOperand = Array<T, MmaElements>;
-  using VarMeanOperand = Array<__half2, MmaScaleBiasPair>;
-  using GammaBetaOperand = Array<T, MmaElements * MmaScaleBiasPair>;
-
-  CUTLASS_DEVICE
-  void transform(MmaOperand &activations,
-                 VarMeanOperand const &var_mean,
-                 GammaBetaOperand const &gamma_beta) {
-
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
-    uint32_t *ptr_activations = reinterpret_cast<uint32_t *>(&activations);
-    uint32_t const *ptr_var_mean = reinterpret_cast<uint32_t const *>(&var_mean);
-    uint32_t const *ptr_gamma_beta = reinterpret_cast<uint32_t const *>(&gamma_beta);
-
-    // Apply per channel scale+bias+relu if the data is not a special NaN
-    // (0x7eff).  If it is a special NaN (0x7eff), hard code the output to 0.
-
-    // We assumes the pair of FP16 are either both inbound or both out-of-bound.
-    // It requires C to be an even number.
-    asm volatile(
-        "{\n\t"
-        " fma.rn.f16x2 %0, %1, %2, %3;\n"
-        " fma.rn.f16x2 %0, %4, %0, %5;\n"
-        "}\n"
-        : "=r"(ptr_activations[0])
-        : "r"(ptr_var_mean[0]), "r"(ptr_activations[0]),
-          "r"(ptr_var_mean[1]),
-          "r"(ptr_gamma_beta[0]), "r"(ptr_gamma_beta[1]));
-#else
-    assert(0);
-#endif
-  }
-
-  CUTLASS_DEVICE
-  void operator()(FragmentActivations &activations,
-                  FragmentVarMean const &var_mean,
-                  FragmentGammaBeta const &gamma_beta) {
-    MmaOperand *ptr_activations = reinterpret_cast<MmaOperand *>(&activations);
-    VarMeanOperand const *ptr_var_mean =
-        reinterpret_cast<VarMeanOperand const *>(&var_mean);
-    GammaBetaOperand const *ptr_gamma_beta =
-        reinterpret_cast<GammaBetaOperand const *>(&gamma_beta);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < (NumActivations / MmaElements); ++i) {
-      transform(ptr_activations[i],
-                ptr_var_mean[i / (MmaCols * MmaRows) * MmaRows + i % MmaRows],
-                ptr_gamma_beta[(i / MmaScaleBiasPair) % MmaCols]);
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace gemm 
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma.h
deleted file mode 100755
index dc210b025..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates exposing architecture support for warp-level multiply-add operations
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace warp {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Query the number of threads per warp
-template <typename OperatorClass>
-struct WarpSize {
-  static int const value = 32;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_complex_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_complex_tensor_op.h
deleted file mode 100755
index 2ef8bb42d..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_complex_tensor_op.h
+++ /dev/null
@@ -1,1168 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing warp-level matrix multiply-accumulate operations targeting
-      Tensor Cores.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/array.h"
-#include "cutlass/complex.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/functional.h"
-
-#include "cutlass/arch/memory_sm75.h"
-#include "cutlass/arch/mma_sm75.h"
-#include "cutlass/arch/mma_sm80.h"
-#include "cutlass/arch/mma_sm90.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/warp/mma.h"
-
-#include "cutlass/gemm/warp/mma_tensor_op_policy.h"
-#include "cutlass/gemm/warp/mma_tensor_op.h"
-
-#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h"
-#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h"
-#include "cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace warp {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace detail {
-
-template <
-  /// Data type of real & imag members of complex numbers in the SourceFragment
-  typename RealElement,
-  /// Destination fragment required by the mma operation 
-  typename DestinationFragment,
-  /// Source fragment holding complex<RealElement> elements
-  typename SourceFragment,
-  /// Number of mma operations performed
-  typename MmaIterations,
-  /// Shape of operand elements
-  typename MmaOperandShape,
-  /// Complex transform on A operand
-  ComplexTransform Transform_,
-  /// Operand A or Operand B
-  Operand Operand_,
-  /// Floating-point rounding style
-  FloatRoundStyle Round_>
-struct UnpackComplexConvertAndPackForMma;
-
-// Partial specialization for OperandA and Congruous smem layout
-template <
-  typename RealElement,
-  typename DestinationFragment, 
-  typename SourceFragment,
-  typename MmaIterations,
-  typename MmaOperandShape,
-  ComplexTransform Transform_,
-  FloatRoundStyle Round_>
-struct UnpackComplexConvertAndPackForMma <
-  RealElement,
-  DestinationFragment,
-  SourceFragment,
-  MmaIterations,
-  MmaOperandShape,
-  Transform_,
-  Operand::kA,
-  Round_> {
-  
-  //
-  // Type definitions
-  //
-  static Operand const kOperand = Operand::kA;
-  static ComplexTransform const kTransform = Transform_;
-  static FloatRoundStyle const kRound = Round_;
-
-  // Data type of elements in the destination fragment
-  using MmaElement = typename DestinationFragment::Element;
-
-  // Numeric convertor MmaElement <= RealElement
-  using Converter = NumericConverter<MmaElement, RealElement, kRound>;
-
-  // Operand layout parameters
-  using SourceFragmentLayout = layout::ColumnMajor;
-  static int const kLdm = MmaIterations::kRow * MmaOperandShape::kRow;
-
-  /// Ctor
-  CUTLASS_DEVICE
-  UnpackComplexConvertAndPackForMma() {}
-
-  CUTLASS_DEVICE
-  void operator()(DestinationFragment *dest, SourceFragment const &source) {
-    
-    Converter convert_op;
-    SourceFragmentLayout layout(kLdm);
-
-    CUTLASS_PRAGMA_UNROLL
-    for(int i=0; i<MmaIterations::kRow; i++) {
-      int pos = 0;
-      CUTLASS_PRAGMA_UNROLL
-      for(int c=0; c<MmaOperandShape::kColumn; c++) {
-        CUTLASS_PRAGMA_UNROLL
-        for(int r=0; r<MmaOperandShape::kRow; r++) {
-          // Logical position of element in source fragment
-          int row = r + i * MmaOperandShape::kRow;
-          int col = c;
-
-          // Access complex<RealElement> and apply rounding on real and imag parts
-          MmaElement a = convert_op(source[layout(MatrixCoord{row,col})].real());
-          MmaElement b = convert_op(source[layout(MatrixCoord{row,col})].imag());
-
-          // Unpack rounded complex<MmaElement> and pack into DestinationFragment for mma operation
-          dest[i][pos] = a;
-          dest[i+MmaIterations::kRow][pos++] = (kTransform == ComplexTransform::kConjugate ? -b : b);
-
-        }
-      }
-    }
-  }
-};
-
-// Partial specialization for OperandB and Congruous smem layout
-template <
-  typename RealElement,
-  typename DestinationFragment, 
-  typename SourceFragment,
-  typename MmaIterations,
-  typename MmaOperandShape,
-  ComplexTransform Transform_,
-  FloatRoundStyle Round_>
-struct UnpackComplexConvertAndPackForMma <
-  RealElement,
-  DestinationFragment,
-  SourceFragment,
-  MmaIterations,
-  MmaOperandShape,
-  Transform_,
-  Operand::kB,
-  Round_> {
-  
-  //
-  // Type definitions
-  //
-  static Operand const kOperand = Operand::kB;
-  static ComplexTransform const kTransform = Transform_;
-  static FloatRoundStyle const kRound = Round_;
-
-  // Data type of elements in the destination fragment
-  using MmaElement = typename DestinationFragment::Element;
-
-  // Numeric convertor MmaElement <= RealElement
-  using Converter = NumericConverter<MmaElement, RealElement, kRound>;
-
-  // Operand layout parameters
-  using SourceFragmentLayout = layout::RowMajor;
-  static int const kLdm = MmaIterations::kColumn * MmaOperandShape::kColumn;
-
-  /// Ctor
-  CUTLASS_DEVICE
-  UnpackComplexConvertAndPackForMma() {}
-
-  CUTLASS_HOST_DEVICE
-  void operator()(DestinationFragment *dest, SourceFragment const &source) {
-    
-    Converter convert_op;
-    SourceFragmentLayout layout(kLdm);
-
-    CUTLASS_PRAGMA_UNROLL
-    for(int i=0; i<MmaIterations::kColumn; i++) {
-      int pos = 0;
-      CUTLASS_PRAGMA_UNROLL
-      for(int c=0; c<MmaOperandShape::kColumn; c++) {
-        CUTLASS_PRAGMA_UNROLL
-        for(int r=0; r<MmaOperandShape::kRow; r++) {
-          // Logical position of element in source fragment
-          int row = r;
-          int col = c + i * MmaOperandShape::kColumn;
-
-          // Access complex<RealElement> apply rounding on real and imag parts
-          MmaElement a = convert_op(source[layout(MatrixCoord{row,col})].real());
-          MmaElement b = convert_op(source[layout(MatrixCoord{row,col})].imag());
-
-          // Unpack rounded complex<MmaElement> and pack into DestinationFragment for mma operation
-          dest[i][pos] = a;
-          dest[i+MmaIterations::kColumn][pos++] = (kTransform == ComplexTransform::kConjugate ? -b : b);
-        }
-      }
-    }
-  }
-};
-} // namespace detail 
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  typename Shape_,
-  /// Data type of A elements
-  typename RealElementA,
-  /// Layout of A matrix (concept: MatrixLayout)
-  typename LayoutA_,
-  /// Data type of B elements
-  typename RealElementB,
-  /// Layout of B matrix (concept: MatrixLayout)
-  typename LayoutB_,
-  /// Element type of C matrix
-  typename RealElementC,
-  /// Layout of C matrix (concept: MatrixLayout)
-  typename LayoutC_,
-  /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
-  typename Policy_,
-  /// Complex transform on A operand
-  ComplexTransform TransformA = ComplexTransform::kNone,
-  /// Complex transform on B operand
-  ComplexTransform TransformB = ComplexTransform::kNone,
-  /// Do source operands need more than one elements
-  bool GeneralizedOperatorElements = false,
-  /// Used for partial specialization
-  typename Enable = bool
->
-class MmaComplexTensorOp;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for complex*complex+complex => complex using real-valued TensorOps
-template <
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  typename Shape_,
-  /// Data type of A elements
-  typename RealElementA,
-  /// Layout of A matrix (concept: MatrixLayout)
-  typename LayoutA_,
-  /// Data type of B elements
-  typename RealElementB,
-  /// Layout of B matrix (concept: MatrixLayout)
-  typename LayoutB_,
-  /// Element type of C matrix
-  typename RealElementC,
-  /// Layout of C matrix (concept: MatrixLayout)
-  typename LayoutC_,
-  /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
-  typename Policy_,
-  /// Complex transform on A operand
-  ComplexTransform TransformA,
-  /// Complex transform on B operand
-  ComplexTransform TransformB
->
-class MmaComplexTensorOp<
-  Shape_, 
-  complex<RealElementA>, 
-  LayoutA_, 
-  complex<RealElementB>,
-  LayoutB_,
-  complex<RealElementC>,
-  LayoutC_,
-  Policy_,
-  TransformA,
-  TransformB>  {
-public:
-  /// Shape of warp-level matrix operation (concept: GemmShape)
-  using Shape = Shape_;
-
-  /// Data type of multiplicand A
-  using ElementA = complex<RealElementA>;
-
-  /// Layout of multiplicand A
-  using LayoutA = LayoutA_;
-
-  /// Data type of multiplicand B
-  using ElementB = complex<RealElementB>;
-
-  /// Layout of multiplicand B
-  using LayoutB = LayoutB_;
-
-  /// Data type of accumulator matrix C
-  using ElementC = complex<RealElementC>;
-
-  /// Layout of accumulator matrix C
-  using LayoutC = LayoutC_;
-
-  /// Shape of the warp in units of thread (concept: MmaLanePolicyTensorOp)
-  using Policy = Policy_;
-
-  /// Underlying matrix multiply operator (concept: arch::Mma)
-  using ArchMmaOperator = typename Policy::Operator;
-
-  /// Architecture tag from underlying instruction
-  using ArchTag = typename ArchMmaOperator::ArchTag;
-
-  /// Indicates class of matrix operator
-  using OperatorClass = arch::OpClassTensorOp;
-
-  /// Shape of underlying instruction
-  using InstructionShape = typename ArchMmaOperator::Shape;
-
-  /// Indicates math operator 
-  using MathOperator = arch::OpMultiplyAddComplex;
-
-  /// Complex transform on A operand
-  static ComplexTransform const kTransformA = TransformA;
-
-  /// Complex transform on B operand
-  static ComplexTransform const kTransformB = TransformB;
-
-  /// Number of threads participating in warp-level matrix product
-  static int const kThreadCount = 32;
-
-public:
-
-  /// Iterates over the A operand in memory
-  using IteratorA = MmaTensorOpMultiplicandTileIterator<
-    MatrixShape<Shape::kM, Shape::kK>,
-    Operand::kA,
-    ElementA,
-    LayoutA,
-    MatrixShape<ArchMmaOperator::Shape::kM, ArchMmaOperator::Shape::kK>,
-    Policy::OpDelta::kRow,
-    32,
-    1
-  >;
-
-  /// Storage for A tile
-  using FragmentA = typename IteratorA::Fragment;
-
-  /// Storage for transformed A tile
-  using TransformedFragmentA = FragmentA;
-
-  /// Iterates over the B operand in memory
-  using IteratorB = MmaTensorOpMultiplicandTileIterator<
-    MatrixShape<Shape::kK, Shape::kN>,
-    Operand::kB,
-    ElementB,
-    LayoutB,
-    MatrixShape<ArchMmaOperator::Shape::kK, ArchMmaOperator::Shape::kN>,
-    Policy::OpDelta::kColumn,
-    32,
-    1
-  >;
-
-  /// Storage for B tile
-  using FragmentB = typename IteratorB::Fragment;
-
-  /// Storage for transformed B tile
-  using TransformedFragmentB = FragmentB;
-
-  static_assert(
-    !(Shape::kM % ArchMmaOperator::Shape::kM) && 
-    !(Shape::kN % ArchMmaOperator::Shape::kN),
-    "Shape of warp-level Mma must be divisible by operator shape.");
-
-  /// Number of mma operations performed
-  using MmaIterations = MatrixShape<
-    Shape::kM / ArchMmaOperator::Shape::kM,
-    Shape::kN / ArchMmaOperator::Shape::kN
-  >;
-
-  /// Iterates over the C operand in memory
-  using IteratorC = MmaTensorOpAccumulatorTileIterator<
-     MatrixShape<Shape::kM, Shape::kN>, 
-     ElementC, 
-     LayoutC,
-     typename ArchMmaOperator::Shape, 
-     typename Policy::OpDelta>;
-
-  /// Storage for C tile, the accumulator. Note, regardless of multiplicand type, this
-  /// storage arrangement is to be considered 'planar complex' in the sense that all real-valued
-  /// parts are stored consecutively followed by all imaginary parts. This matches the structure
-  /// of Tensor Cores which are always real-valued matrix multiplies.
-  using FragmentC = typename IteratorC::Fragment;
-
-  static_assert(
-    FragmentC::kElements == 2 * MmaIterations::kCount * ArchMmaOperator::FragmentC::kElements,
-    "Unexpected planar complex fragment length.");
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Underlying real-valued matrix multiply operator (concept: arch::Mma)
-  ArchMmaOperator mma;
-
-public:
-
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_DEVICE
-  MmaComplexTensorOp() {}
-
-  /// Performs a warp-level matrix multiply-accumulate operation
-  CUTLASS_DEVICE
-  void operator()(
-    FragmentC &D, 
-    FragmentA const &A, 
-    FragmentB const &B, 
-    FragmentC const &C
-  ) const {
-
-    // Alias types for underlying real-valued matrix multiply operator
-    using MmaOperandA = typename ArchMmaOperator::FragmentA;
-    using MmaOperandB = typename ArchMmaOperator::FragmentB;
-    using MmaOperandC = typename ArchMmaOperator::FragmentC;
-
-    static_assert(MmaOperandA::kElements == 1, 
-      "This implementation only supports math instructions in which exactly one element is needed for the A operand."
-      "We can geneneralize later.");
-
-    static_assert(MmaOperandB::kElements == 1, 
-      "This implementation only supports math instructions in which exactly one element is needed for the B operand."
-      "We can geneneralize later.");
-
-    D = C;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int m = 0; m < MmaIterations::kRow; ++m) {
-
-      // mma(accum.real(), a.real(), b.real(), accum.real());
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < MmaIterations::kColumn; ++n) {
-
-        // Pack operands together. This may result in actual MOVs 
-        MmaOperandA operand_A;
-        MmaOperandB operand_B;
-
-        operand_A[0] = A[m].real();
-        operand_B[0] = B[n].real();
-
-        // Real-valued accumulator part
-        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
-          (m + n * MmaIterations::kRow);
-
-          mma(*accum, operand_A, operand_B, *accum);
-      }
-
-      // mma(accum.imag(), a.real(), b.imag(), accum.imag()); 
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = MmaIterations::kColumn - 1; n >= 0; --n) {
-
-        // Pack operands together. This may result in actual MOVs 
-        MmaOperandA operand_A;
-        MmaOperandB operand_B;
-
-        operand_A[0] = A[m].real();
-        operand_B[0] = (kTransformB == ComplexTransform::kConjugate ? -B[n].imag() : B[n].imag());
-
-        // Complex-valued accumulator part
-        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
-          (m + n * MmaIterations::kRow) + MmaIterations::kCount;
-
-        mma(*accum, operand_A, operand_B, *accum);
-      }
-
-      // mma(accum.real(), -a.imag(), b.imag(), accum.real())
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < MmaIterations::kColumn; ++n) {
-
-        // Pack operands together. This may result in actual MOVs 
-        MmaOperandA operand_A;
-        MmaOperandB operand_B;
-
-        // A imaginary part is intentionally negated
-        operand_A[0] = (kTransformA == ComplexTransform::kConjugate ? A[m].imag() : -A[m].imag());
-        operand_B[0] = (kTransformB == ComplexTransform::kConjugate ? -B[n].imag() : B[n].imag());
-
-        // Real-valued accumulator part
-        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
-          (m + n * MmaIterations::kRow);
-
-        mma(*accum, operand_A, operand_B, *accum);
-      }
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = MmaIterations::kColumn - 1; n >= 0; --n) {
-
-        // Pack operands together. This may result in actual MOVs 
-        MmaOperandA operand_A;
-        MmaOperandB operand_B;
-
-        operand_A[0] = (kTransformA == ComplexTransform::kConjugate ? -A[m].imag() : A[m].imag());
-        operand_B[0] = B[n].real();
-
-        // Complex-valued accumulator part
-        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
-          (m + n * MmaIterations::kRow) + MmaIterations::kCount;
-
-        mma(*accum, operand_A, operand_B, *accum);
-      }
-    }
-  }
-
-  /// Transform the mma operands to the required types
-  CUTLASS_DEVICE
-  void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B,
-                 FragmentA const &A, FragmentB const &B) const {
-    dst_A = A;
-    dst_B = B;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for complex*complex+complex => complex:
-//  Operands data type: complex<float>
-//  Rounding: float -> tfloat32_t (round half_ulp_truncate nearest)
-//  Math instruction: mma.sync.aligned.m16n8k8.f32.tf32.tf32.f32
-//  Output data type: complex<float>
-// 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-template <
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  typename Shape_,
-  /// Layout of A matrix (concept: MatrixLayout)
-  typename LayoutA_,
-  /// Layout of B matrix (concept: MatrixLayout)
-  typename LayoutB_,
-  /// Layout of C matrix (concept: MatrixLayout)
-  typename LayoutC_,
-  /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
-  typename Policy_,
-  /// Complex transform on A operand
-  ComplexTransform TransformA,
-  /// Complex transform on B operand
-  ComplexTransform TransformB
->
-class MmaComplexTensorOp<
-  Shape_, 
-  complex<float>, 
-  LayoutA_, 
-  complex<float>,
-  LayoutB_,
-  complex<float>,
-  LayoutC_,
-  Policy_,
-  TransformA,
-  TransformB>  {
-public:
-  /// Shape of warp-level matrix operation (concept: GemmShape)
-  using Shape = Shape_;
-
-  /// Data type of members of complex multiplicand A
-  using RealElementA = float;
-
-  /// Data type of multiplicand A
-  using ElementA = complex<RealElementA>;
-
-  /// Layout of multiplicand A
-  using LayoutA = LayoutA_;
-
-  /// Data type of members of complex multiplicand B
-  using RealElementB = float;
-
-  /// Data type of multiplicand B
-  using ElementB = complex<RealElementB>;
-
-  /// Layout of multiplicand B
-  using LayoutB = LayoutB_;
-
-  /// Data type of members of complex accumulator matrix C
-  using RealElementC = float;
-
-  /// Data type of accumulator matrix C
-  using ElementC = complex<RealElementC>;
-
-  /// Layout of accumulator matrix C
-  using LayoutC = LayoutC_;
-
-  /// Shape of the warp in units of thread (concept: MmaLanePolicySimt)
-  using Policy = Policy_;
-
-  /// Underlying matrix multiply operator (concept: arch::Mma)
-  using ArchMmaOperator = typename Policy::Operator;
-
-  /// Shape of underlying instruction
-  using InstructionShape = typename ArchMmaOperator::Shape;
-
-  /// Underlying arch tag
-  using ArchTag = typename ArchMmaOperator::ArchTag;
-
-  /// Indicates class of matrix operator
-  using OperatorClass = arch::OpClassTensorOp;
-
-  /// Indicates math operator 
-  using MathOperator = typename arch::OpMultiplyAddComplex;
-  
-  /// Complex transform on A operand
-  static ComplexTransform const kTransformA = TransformA;
-
-  /// Complex transform on B operand
-  static ComplexTransform const kTransformB = TransformB;
-
-  /// Number of threads participating in warp-level matrix product
-  static int const kThreadCount = 32;
-
-public:
-
-  /// Iterates over the A operand in memory
-  using IteratorA = MmaTensorOpMultiplicandTileIterator<
-    MatrixShape<Shape::kM, Shape::kK>,
-    Operand::kA,
-    ElementA,
-    LayoutA,
-    MatrixShape<ArchMmaOperator::Shape::kM, ArchMmaOperator::Shape::kK>,
-    Policy::OpDelta::kRow,
-    32,
-    1
-  >;
-
-  /// Storage for A tile
-  using FragmentA = typename IteratorA::Fragment;
-
-  /// Storage for transformed A tile
-  using TransformedFragmentA =
-      Array<typename ArchMmaOperator::ElementA, FragmentA::kElements * 2>;
-
-  /// Iterates over the B operand in memory
-  using IteratorB = MmaTensorOpMultiplicandTileIterator<
-    MatrixShape<Shape::kK, Shape::kN>,
-    Operand::kB,
-    ElementB,
-    LayoutB,
-    MatrixShape<ArchMmaOperator::Shape::kK, ArchMmaOperator::Shape::kN>,
-    Policy::OpDelta::kColumn,
-    32,
-    1
-  >;
-
-  /// Storage for B tile
-  using FragmentB = typename IteratorB::Fragment;
-
-  /// Storage for transformed B tile
-  using TransformedFragmentB =
-      Array<typename ArchMmaOperator::ElementB, FragmentB::kElements * 2>;
-
-  static_assert(
-    !(Shape::kM % ArchMmaOperator::Shape::kM) && 
-    !(Shape::kN % ArchMmaOperator::Shape::kN),
-    "Shape of warp-level Mma must be divisible by operator shape.");
-
-  /// Number of complex products operations performed (one complex product needs four mma instructions)
-  using MmaIterations = MatrixShape<
-    Shape::kM / ArchMmaOperator::Shape::kM,
-    Shape::kN / ArchMmaOperator::Shape::kN
-  >;
-
-  /// Iterates over the C operand in memory
-  using IteratorC = MmaTensorOpAccumulatorTileIterator<
-     MatrixShape<Shape::kM, Shape::kN>, 
-     ElementC, 
-     LayoutC,
-     typename ArchMmaOperator::Shape, 
-     typename Policy::OpDelta>;
-
-  /// Storage for C tile, the accumulator. Note, regardless of multiplicand type, this
-  /// storage arrangement is to be considered 'planar complex' in the sense that all real-valued
-  /// parts are stored consecutively followed by all imaginary parts. This matches the structure
-  /// of Tensor Cores which are always real-valued matrix multiplies.
-  using FragmentC = typename IteratorC::Fragment;
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Underlying real-valued matrix multiply operator (concept: arch::Mma)
-  ArchMmaOperator mma;
-
-public:
-
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_DEVICE
-  MmaComplexTensorOp() {}
-
-  /// Performs a warp-level matrix multiply-accumulate operation
-  CUTLASS_DEVICE
-  void operator()(
-    FragmentC &D, 
-    TransformedFragmentA const &A, 
-    TransformedFragmentB const &B, 
-    FragmentC const &C
-  ) const {
-
-    // Alias types for underlying real-valued matrix multiply operator
-    using InstMmaOperandA = typename ArchMmaOperator::FragmentA;
-    using InstMmaOperandB = typename ArchMmaOperator::FragmentB;
-    using MmaOperandC = typename ArchMmaOperator::FragmentC;
-
-    static_assert(platform::is_same<cutlass::gemm::GemmShape<16, 8, 8>, typename ArchMmaOperator::Shape>::value, 
-      "This implementation only supports mma.m16n8k8 math instructions.");
-
-    static_assert(InstMmaOperandA::kElements == 4, 
-      "This implementation only supports math instructions in which exactly four element is needed for the A operand."
-      "We can geneneralize later.");
-
-    static_assert(InstMmaOperandB::kElements == 2, 
-      "This implementation only supports math instructions in which exactly two element is needed for the B operand."
-      "We can geneneralize later.");
-
-    // Instruction Operands A & B holding real part followed by imaginary part for mma operations
-    InstMmaOperandA const *operand_A = reinterpret_cast<InstMmaOperandA const *>(&A);
-    InstMmaOperandB const *operand_B = reinterpret_cast<InstMmaOperandB const *>(&B);
-
-    //
-    // Accumulate in place
-    //
-    D = C;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int m = 0; m < MmaIterations::kRow; ++m) {
-
-      // mma(accum.real(), a.real(), b.real(), accum.real());
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < MmaIterations::kColumn; ++n) {
-
-        // Real-valued accumulator part
-        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
-          (m + n * MmaIterations::kRow);
-
-          mma(*accum, operand_A[m], operand_B[n], *accum);
-      }
-
-      // mma(accum.imag(), a.real(), b.imag(), accum.imag()); 
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = MmaIterations::kColumn - 1; n >= 0; --n) {
-
-        // Complex-valued accumulator part
-        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
-          (m + n * MmaIterations::kRow) + MmaIterations::kCount;
-
-        mma(*accum, operand_A[m], operand_B[n+MmaIterations::kColumn], *accum);
-      }
-
-      // mma(accum.real(), a.imag(), -b.imag(), accum.real())
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < MmaIterations::kColumn; ++n) {
-
-        // negate OperandB to accumulate  -(a.imag()*b.imag())
-        // negating OperandB emits less instrucitons than negating OperandA as OperandB has less elements
-        negate<InstMmaOperandB> negate_op;
-
-        // Real-valued accumulator part
-        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
-          (m + n * MmaIterations::kRow);
-
-        mma(*accum, operand_A[m+MmaIterations::kRow], negate_op(operand_B[n+MmaIterations::kColumn]), *accum);
-      }
-
-      // mma(accum.imag(), a.imag(), b.real(), accum.imag())
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = MmaIterations::kColumn - 1; n >= 0; --n) {
-
-        // Complex-valued accumulator part
-        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
-          (m + n * MmaIterations::kRow) + MmaIterations::kCount;
-
-        mma(*accum, operand_A[m+MmaIterations::kRow], operand_B[n], *accum);
-      }
-    }
-  }
-
-  /// Transform the mma operands to the required types
-  CUTLASS_DEVICE
-  void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B,
-                 FragmentA const &A, FragmentB const &B) const {
-    // Alias types for underlying real-valued matrix multiply operator
-    using InstMmaOperandA = typename ArchMmaOperator::FragmentA;
-    using InstMmaOperandB = typename ArchMmaOperator::FragmentB;
-
-    //
-    // Define conversions from source type to instruction operands' type
-    //
-
-    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
-    FloatRoundStyle const kRoundA = FloatRoundStyle::round_to_nearest;
-    FloatRoundStyle const kRoundB = FloatRoundStyle::round_to_nearest;
-    #else
-    FloatRoundStyle const kRoundA = FloatRoundStyle::round_half_ulp_trunc_dntz; 
-    FloatRoundStyle const kRoundB = FloatRoundStyle::round_half_ulp_trunc_dntz;
-    #endif
-
-    detail::UnpackComplexConvertAndPackForMma <
-      RealElementA,
-      InstMmaOperandA,
-      FragmentA,
-      MmaIterations,
-      MatrixShape<2, 2>,
-      kTransformA,
-      Operand::kA,
-      kRoundA> convert_A;
-
-    detail::UnpackComplexConvertAndPackForMma <
-      RealElementB,
-      InstMmaOperandB,
-      FragmentB,
-      MmaIterations,
-      MatrixShape<2, 1>,
-      kTransformB,
-      Operand::kB,
-      kRoundB> convert_B;
-
-    // Convert Fragment[A|B] holding complex<RealElement[A|B]> to InstMmaOperand[A|B] holding InstMmaOperand[A|B]::Element
-    convert_A(reinterpret_cast<InstMmaOperandA *>(&dst_A), A); 
-    convert_B(reinterpret_cast<InstMmaOperandB *>(&dst_B), B); 
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Partial specialization for complex*complex+complex => complex:
-//  Operands data type: complex<double>
-//  Math instruction: mma.sync.aligned.m16n8k4.f64.f64.f64.f64
-//  Output data type: complex<double>
-// 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-template <
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  typename Shape_,
-  /// Layout of A matrix (concept: MatrixLayout)
-  typename LayoutA_,
-  /// Layout of B matrix (concept: MatrixLayout)
-  typename LayoutB_,
-  /// Layout of C matrix (concept: MatrixLayout)
-  typename LayoutC_,
-  /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
-  typename Policy_,
-  /// Complex transform on A operand
-  ComplexTransform TransformA,
-  /// Complex transform on B operand
-  ComplexTransform TransformB
->
-class MmaComplexTensorOp<
-  Shape_, 
-  complex<double>, 
-  LayoutA_, 
-  complex<double>,
-  LayoutB_,
-  complex<double>,
-  LayoutC_,
-  Policy_,
-  TransformA,
-  TransformB,
-  true>  {
-public:
-  /// Shape of warp-level matrix operation (concept: GemmShape)
-  using Shape = Shape_;
-
-  /// Data type of members of complex multiplicand A
-  using RealElementA = double;
-
-  /// Data type of multiplicand A
-  using ElementA = complex<RealElementA>;
-
-  /// Layout of multiplicand A
-  using LayoutA = LayoutA_;
-
-  /// Data type of members of complex multiplicand B
-  using RealElementB = double;
-
-  /// Data type of multiplicand B
-  using ElementB = complex<RealElementB>;
-
-  /// Layout of multiplicand B
-  using LayoutB = LayoutB_;
-
-  /// Data type of members of complex accumulator matrix C
-  using RealElementC = double;
-
-  /// Data type of accumulator matrix C
-  using ElementC = complex<RealElementC>;
-
-  /// Layout of accumulator matrix C
-  using LayoutC = LayoutC_;
-
-  /// Shape of the warp in units of thread (concept: MmaLanePolicyTensorOp)
-  using Policy = Policy_;
-
-  /// Underlying matrix multiply operator (concept: arch::Mma)
-  using ArchMmaOperator = typename Policy::Operator;
-
-  /// Shape of underlying instruction
-  using InstructionShape = typename ArchMmaOperator::Shape;
-
-  /// Underlying arch tag
-  using ArchTag = typename ArchMmaOperator::ArchTag;
-
-  /// Indicates class of matrix operator
-  using OperatorClass = arch::OpClassTensorOp;
-
-  /// Indicates math operator 
-  using MathOperator = typename arch::OpMultiplyAddComplex;
-
-  /// Complex transform on A operand
-  static ComplexTransform const kTransformA = TransformA;
-
-  /// Complex transform on B operand
-  static ComplexTransform const kTransformB = TransformB;
-
-  /// Number of threads participating in warp-level matrix product
-  static int const kThreadCount = 32;
-
-public:
-
-  /// Iterates over the A operand in memory
-  using IteratorA = MmaTensorOpMultiplicandTileIterator<
-    MatrixShape<Shape::kM, Shape::kK>,
-    Operand::kA,
-    ElementA,
-    LayoutA,
-    MatrixShape<ArchMmaOperator::Shape::kM, ArchMmaOperator::Shape::kK>,
-    Policy::OpDelta::kRow,
-    32,
-    1
-  >;
-
-  /// Storage for A tile
-  using FragmentA = typename IteratorA::Fragment;
-
-  /// Storage for transformed A tile
-  using TransformedFragmentA = FragmentA;
-
-  /// Iterates over the B operand in memory
-  using IteratorB = MmaTensorOpMultiplicandTileIterator<
-    MatrixShape<Shape::kK, Shape::kN>,
-    Operand::kB,
-    ElementB,
-    LayoutB,
-    MatrixShape<ArchMmaOperator::Shape::kK, ArchMmaOperator::Shape::kN>,
-    Policy::OpDelta::kColumn,
-    32,
-    1
-  >;
-
-  /// Storage for B tile
-  using FragmentB = typename IteratorB::Fragment;
-
-  /// Storage for transformed B tile
-  using TransformedFragmentB = FragmentB;
-
-  static_assert(
-    !(Shape::kM % ArchMmaOperator::Shape::kM) && 
-    !(Shape::kN % ArchMmaOperator::Shape::kN),
-    "Shape of warp-level Mma must be divisible by operator shape.");
-
-  /// Number of mma operations performed
-  using MmaIterations = MatrixShape<
-    Shape::kM / ArchMmaOperator::Shape::kM,
-    Shape::kN / ArchMmaOperator::Shape::kN
-  >;
-
-  /// Iterates over the C operand in memory
-  using IteratorC = MmaTensorOpAccumulatorTileIterator<
-     MatrixShape<Shape::kM, Shape::kN>, 
-     ElementC, 
-     LayoutC,
-     typename ArchMmaOperator::Shape, 
-     typename Policy::OpDelta>;
-
-  /// Storage for C tile, the accumulator. Note, regardless of multiplicand type, this
-  /// storage arrangement is to be considered 'planar complex' in the sense that all real-valued
-  /// parts are stored consecutively followed by all imaginary parts. This matches the structure
-  /// of Tensor Cores which are always real-valued matrix multiplies.
-  using FragmentC = typename IteratorC::Fragment;
-
-  static_assert(
-    FragmentC::kElements == 2 * MmaIterations::kCount * ArchMmaOperator::FragmentC::kElements,
-    "Unexpected planar complex fragment length.");
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Underlying real-valued matrix multiply operator (concept: arch::Mma)
-  ArchMmaOperator mma;
-
-public:
-
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_DEVICE
-  MmaComplexTensorOp() {}
-
-  /// Performs a warp-level matrix multiply-accumulate operation
-  CUTLASS_DEVICE
-  void operator()(
-    FragmentC &D, 
-    FragmentA const &A, 
-    FragmentB const &B, 
-    FragmentC const &C
-  ) const {
-
-    // Alias types for underlying real-valued matrix multiply operator
-    using MmaOperandA = typename ArchMmaOperator::FragmentA;
-    using MmaOperandB = typename ArchMmaOperator::FragmentB;
-    using MmaOperandC = typename ArchMmaOperator::FragmentC;
-
-    D = C;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int m = 0; m < MmaIterations::kRow; ++m) {
-
-      // mma(accum.real(), a.real(), b.real(), accum.real());
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < MmaIterations::kColumn; ++n) {
-
-        // Pack operands together. This may result in actual MOVs 
-        MmaOperandA operand_A;
-        MmaOperandB operand_B;
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int mk = 0; mk < MmaOperandA::kElements; ++mk)
-          operand_A[mk] = A[m*MmaOperandA::kElements + mk].real();
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int nk = 0; nk < MmaOperandB::kElements; ++nk)
-          operand_B[nk] = B[n*MmaOperandB::kElements + nk].real();
-
-        // Real-valued accumulator part
-        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
-          (m + n * MmaIterations::kRow);
-
-          mma(*accum, operand_A, operand_B, *accum);
-      }
-
-      // mma(accum.imag(), a.real(), b.imag(), accum.imag()); 
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = MmaIterations::kColumn - 1; n >= 0; --n) {
-
-        // Pack operands together. This may result in actual MOVs 
-        MmaOperandA operand_A;
-        MmaOperandB operand_B;
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int mk = 0; mk < MmaOperandA::kElements; ++mk)
-          operand_A[mk] = A[m*MmaOperandA::kElements + mk].real();
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int nk = 0; nk < MmaOperandB::kElements; ++nk)
-          operand_B[nk] = (kTransformB == ComplexTransform::kConjugate ? 
-                          -B[n*MmaOperandB::kElements + nk].imag() : B[n*MmaOperandB::kElements + nk].imag());
-
-        // Complex-valued accumulator part
-        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
-          (m + n * MmaIterations::kRow) + MmaIterations::kCount;
-
-        mma(*accum, operand_A, operand_B, *accum);
-      }
-
-      // mma(accum.real(), -a.imag(), b.imag(), accum.real())
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < MmaIterations::kColumn; ++n) {
-
-        // Pack operands together. This may result in actual MOVs 
-        MmaOperandA operand_A;
-        MmaOperandB operand_B;
-
-        // A imaginary part is intentionally negated
-        CUTLASS_PRAGMA_UNROLL
-        for (int mk = 0; mk < MmaOperandA::kElements; ++mk)
-          operand_A[mk] = (kTransformA == ComplexTransform::kConjugate ?
-                          A[m*MmaOperandA::kElements + mk].imag() : -A[m*MmaOperandA::kElements + mk].imag());
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int nk = 0; nk < MmaOperandB::kElements; ++nk)
-            operand_B[nk] = (kTransformB == ComplexTransform::kConjugate ?
-                            -B[n*MmaOperandB::kElements + nk].imag() : B[n*MmaOperandB::kElements + nk].imag());
-
-        // Real-valued accumulator part
-        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
-          (m + n * MmaIterations::kRow);
-
-        mma(*accum, operand_A, operand_B, *accum);
-      }
-
-      // mma(accum.imag(), a.imag(), b.real(), accum.imag())
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = MmaIterations::kColumn - 1; n >= 0; --n) {
-
-        // Pack operands together. This may result in actual MOVs 
-        MmaOperandA operand_A;
-        MmaOperandB operand_B;
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int mk = 0; mk < MmaOperandA::kElements; ++mk)
-          operand_A[mk] = (kTransformA == ComplexTransform::kConjugate ?
-                          -A[m*MmaOperandA::kElements + mk].imag() : A[m*MmaOperandA::kElements + mk].imag());
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int nk = 0; nk < MmaOperandB::kElements; ++nk)
-          operand_B[nk] = B[n*MmaOperandB::kElements + nk].real();
-
-        // Complex-valued accumulator part
-        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
-          (m + n * MmaIterations::kRow) + MmaIterations::kCount;
-
-        mma(*accum, operand_A, operand_B, *accum);
-      }
-    }
-  }
-
-  /// Transform the mma operands to the required types
-  CUTLASS_DEVICE
-  void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B,
-                 FragmentA const &A, FragmentB const &B) const {
-    dst_A = A;
-    dst_B = B;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_complex_tensor_op_fast_f32.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_complex_tensor_op_fast_f32.h
deleted file mode 100755
index d52c5e24b..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_complex_tensor_op_fast_f32.h
+++ /dev/null
@@ -1,663 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Templates implementing warp-level matrix multiply-accumulate operations targeting
-      Tensor Cores.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/array.h"
-#include "cutlass/complex.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/functional.h"
-
-#include "cutlass/arch/memory_sm75.h"
-#include "cutlass/arch/mma_sm75.h"
-#include "cutlass/arch/mma_sm80.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/warp/mma.h"
-
-#include "cutlass/gemm/warp/mma_tensor_op_policy.h"
-#include "cutlass/gemm/warp/mma_tensor_op.h"
-
-#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h"
-#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h"
-#include "cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace warp {
-
-namespace detail {
-
-template <
-  /// Data type of real & imag members of complex numbers in the SourceFragment
-  typename RealElement,
-  /// Destination fragment required by the mma operation 
-  typename DestinationFragment,
-  /// Source fragment holding complex<RealElement> elements
-  typename SourceFragment,
-  /// Number of mma operations performed
-  typename MmaIterations,
-  /// Shape of operand elements
-  typename MmaOperandShape,
-  /// Complex transform on A operand
-  ComplexTransform Transform_,
-  /// Operand A or Operand B
-  Operand Operand_,
-  /// Floating-point rounding style for big part
-  FloatRoundStyle RoundBig_,
-  /// Floating-point rounding style for small part
-  FloatRoundStyle RoundSmall_>
-struct UnpackComplexConvertAndPackForMmaFastF32;
-
-// Partial specialization for OperandA and Congruous smem layout
-template <
-  typename RealElement,
-  typename DestinationFragment, 
-  typename SourceFragment,
-  typename MmaIterations,
-  typename MmaOperandShape,
-  ComplexTransform Transform_,
-  FloatRoundStyle RoundBig_,
-  FloatRoundStyle RoundSmall_>
-struct UnpackComplexConvertAndPackForMmaFastF32 <
-  RealElement,
-  DestinationFragment,
-  SourceFragment,
-  MmaIterations,
-  MmaOperandShape,
-  Transform_,
-  Operand::kA,
-  RoundBig_,
-  RoundSmall_> {
-  
-  //
-  // Type definitions
-  //
-  static Operand const kOperand = Operand::kA;
-  static ComplexTransform const kTransform = Transform_;
-  static FloatRoundStyle const kRoundBig = RoundBig_;
-  static FloatRoundStyle const kRoundSmall = RoundSmall_;
-
-  // Data type of elements in the destination fragment
-  using MmaElement = typename DestinationFragment::Element;
-
-  // Numeric convertor MmaElementBig, MmaElementSmall <= RealElement
-  using Converter = NumericConverterFastF32<kRoundBig, kRoundSmall>;
-
-  // Operand layout parameters
-  using SourceFragmentLayout = layout::ColumnMajor;
-  static int const kLdm = MmaIterations::kRow * MmaOperandShape::kRow;
-
-  // BigSmall Fragment holding two TF32 elements (big, small) for every float
-  using BigSmallFragment = Array<MmaElement, 2>;
-
-  /// Index in fargments for the big and small part
-  static int const kBigIndex = 0;
-  static int const kSmallIndex = 1;
-
-  /// Ctor
-  CUTLASS_DEVICE
-  UnpackComplexConvertAndPackForMmaFastF32() {}
-
-  CUTLASS_DEVICE
-  void operator()(DestinationFragment *dest, SourceFragment const &source) {
-    
-    Converter convert_op;
-    SourceFragmentLayout layout(kLdm);
-
-    DestinationFragment *dest_big_ = reinterpret_cast<DestinationFragment*>(dest);
-    DestinationFragment *dest_small_ = reinterpret_cast<DestinationFragment*>(&dest[MmaIterations::kRow * 2]);
-
-    CUTLASS_PRAGMA_UNROLL
-    for(int i=0; i<MmaIterations::kRow; i++) {
-      int pos = 0;
-      CUTLASS_PRAGMA_UNROLL
-      for(int c=0; c<MmaOperandShape::kColumn; c++) {
-        CUTLASS_PRAGMA_UNROLL
-        for(int r=0; r<MmaOperandShape::kRow; r++) {
-          // Logical position of element in source fragment
-          int row = r + i * MmaOperandShape::kRow;
-          int col = c;
-
-          // Access complex<RealElement> and apply rounding on real and imag parts
-          BigSmallFragment a = convert_op(source[layout(MatrixCoord{row,col})].real());
-          BigSmallFragment b = convert_op(source[layout(MatrixCoord{row,col})].imag());
-
-          // Unpack rounded complex<MmaElement> and pack into DestinationFragment for mma operation
-          dest_big_[i][pos] = a[kBigIndex];
-          dest_big_[i+MmaIterations::kRow][pos] = (kTransform == ComplexTransform::kConjugate ? -b[kBigIndex] : b[kBigIndex]);
-
-          // Unpack rounded complex<MmaElement> and pack into DestinationFragment for mma operation
-          dest_small_[i][pos] = a[kSmallIndex];
-          dest_small_[i+MmaIterations::kRow][pos] = (kTransform == ComplexTransform::kConjugate ? -b[kSmallIndex] : b[kSmallIndex]);
-
-          // Next position
-          pos++;
-        }
-      }
-    }
-  }
-};
-
-// Partial specialization for OperandB and Congruous smem layout
-template <
-  typename RealElement,
-  typename DestinationFragment, 
-  typename SourceFragment,
-  typename MmaIterations,
-  typename MmaOperandShape,
-  ComplexTransform Transform_,
-  FloatRoundStyle RoundBig_,
-  FloatRoundStyle RoundSmall_>
-struct UnpackComplexConvertAndPackForMmaFastF32 <
-  RealElement,
-  DestinationFragment,
-  SourceFragment,
-  MmaIterations,
-  MmaOperandShape,
-  Transform_,
-  Operand::kB,
-  RoundBig_,
-  RoundSmall_> {
-  
-  //
-  // Type definitions
-  //
-  static Operand const kOperand = Operand::kB;
-  static ComplexTransform const kTransform = Transform_;
-  static FloatRoundStyle const kRoundBig = RoundBig_;
-  static FloatRoundStyle const kRoundSmall = RoundSmall_;
-
-  // Data type of elements in the destination fragment
-  using MmaElement = typename DestinationFragment::Element;
-
-  // Numeric convertor MmaElementBig, MmaElementSmall <= RealElement
-  using Converter = NumericConverterFastF32<kRoundBig, kRoundSmall>;
-
-  // Operand layout parameters
-  using SourceFragmentLayout = layout::RowMajor;
-  static int const kLdm = MmaIterations::kColumn * MmaOperandShape::kColumn;
-
-  // BigSmall Fragment holding two TF32 elements (big, small) for every float
-  using BigSmallFragment = Array<MmaElement, 2>;
-
-  /// Index in fargments for the big and small part
-  static int const kBigIndex = 0;
-  static int const kSmallIndex = 1;
-
-  /// Ctor
-  CUTLASS_DEVICE
-  UnpackComplexConvertAndPackForMmaFastF32() {}
-
-  CUTLASS_HOST_DEVICE
-  void operator()(DestinationFragment *dest, SourceFragment const &source) {
-    
-    Converter convert_op;
-    SourceFragmentLayout layout(kLdm);
-
-    DestinationFragment *dest_big_ = reinterpret_cast<DestinationFragment*>(dest);
-    DestinationFragment *dest_small_ = reinterpret_cast<DestinationFragment*>(&dest[MmaIterations::kColumn * 2]);
-
-    CUTLASS_PRAGMA_UNROLL
-    for(int i=0; i<MmaIterations::kColumn; i++) {
-      int pos = 0;
-      CUTLASS_PRAGMA_UNROLL
-      for(int c=0; c<MmaOperandShape::kColumn; c++) {
-        CUTLASS_PRAGMA_UNROLL
-        for(int r=0; r<MmaOperandShape::kRow; r++) {
-          // Logical position of element in source fragment
-          int row = r;
-          int col = c + i * MmaOperandShape::kColumn;
-
-          // Access complex<RealElement> apply rounding on real and imag parts
-          BigSmallFragment a = convert_op(source[layout(MatrixCoord{row,col})].real());
-          BigSmallFragment b = convert_op(source[layout(MatrixCoord{row,col})].imag());
-
-          // Unpack rounded complex<MmaElement> and pack into DestinationFragment for mma operation
-          dest_big_[i][pos] = a[kBigIndex];
-          dest_big_[i+MmaIterations::kColumn][pos] = (kTransform == ComplexTransform::kConjugate ? -b[kBigIndex] : b[kBigIndex]);
-
-          // Unpack rounded complex<MmaElement> and pack into DestinationFragment for mma operation
-          dest_small_[i][pos] = a[kSmallIndex];
-          dest_small_[i+MmaIterations::kColumn][pos] = (kTransform == ComplexTransform::kConjugate ? -b[kSmallIndex] : b[kSmallIndex]);
-
-          // next position
-          pos++;       
-        }
-      }
-    }
-  }
-};
-} // namespace detail 
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  typename Shape_,
-  /// Data type of A elements
-  typename RealElementA,
-  /// Layout of A matrix (concept: MatrixLayout)
-  typename LayoutA_,
-  /// Data type of B elements
-  typename RealElementB,
-  /// Layout of B matrix (concept: MatrixLayout)
-  typename LayoutB_,
-  /// Element type of C matrix
-  typename RealElementC,
-  /// Layout of C matrix (concept: MatrixLayout)
-  typename LayoutC_,
-  /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
-  typename Policy_,
-  /// Complex transform on A operand
-  ComplexTransform TransformA = ComplexTransform::kNone,
-  /// Complex transform on B operand
-  ComplexTransform TransformB = ComplexTransform::kNone,
-  /// Used for partial specialization
-  typename Enable = bool
->
-class MmaComplexTensorOpFastF32;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for complex*complex+complex => complex:
-//  Operands data type: complex<float>
-//  Rounding: float -> tfloat32_t (round half_ulp_truncate nearest)
-//  Math instruction: mma.sync.aligned.m16n8k8.f32.tf32.tf32.f32
-//  Output data type: complex<float>
-// 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-template <
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  typename Shape_,
-  /// Layout of A matrix (concept: MatrixLayout)
-  typename LayoutA_,
-  /// Layout of B matrix (concept: MatrixLayout)
-  typename LayoutB_,
-  /// Layout of C matrix (concept: MatrixLayout)
-  typename LayoutC_,
-  /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
-  typename Policy_,
-  /// Complex transform on A operand
-  ComplexTransform TransformA,
-  /// Complex transform on B operand
-  ComplexTransform TransformB,
-  /// Used for partial specialization
-  typename Enable
->
-class MmaComplexTensorOpFastF32<
-  Shape_, 
-  complex<float>, 
-  LayoutA_, 
-  complex<float>,
-  LayoutB_,
-  complex<float>,
-  LayoutC_,
-  Policy_,
-  TransformA,
-  TransformB,
-  Enable>  {
-public:
-  /// Shape of warp-level matrix operation (concept: GemmShape)
-  using Shape = Shape_;
-
-  /// Data type of members of complex multiplicand A
-  using RealElementA = float;
-
-  /// Data type of multiplicand A
-  using ElementA = complex<RealElementA>;
-
-  /// Layout of multiplicand A
-  using LayoutA = LayoutA_;
-
-  /// Data type of members of complex multiplicand B
-  using RealElementB = float;
-
-  /// Data type of multiplicand B
-  using ElementB = complex<RealElementB>;
-
-  /// Layout of multiplicand B
-  using LayoutB = LayoutB_;
-
-  /// Data type of members of complex accumulator matrix C
-  using RealElementC = float;
-
-  /// Data type of accumulator matrix C
-  using ElementC = complex<RealElementC>;
-
-  /// Layout of accumulator matrix C
-  using LayoutC = LayoutC_;
-
-  /// Shape of the warp in units of thread (concept: MmaLanePolicySimt)
-  using Policy = Policy_;
-
-  /// Underlying matrix multiply operator (concept: arch::Mma)
-  using ArchMmaOperator = typename Policy::Operator;
-
-  /// Shape of underlying instruction
-  using InstructionShape = typename ArchMmaOperator::Shape;
-
-  /// Underlying arch tag
-  using ArchTag = typename ArchMmaOperator::ArchTag;
-
-  /// Indicates class of matrix operator
-  using OperatorClass = arch::OpClassTensorOp;
-
-  /// Indicates math operator 
-  using MathOperator = arch::OpMultiplyAddComplexFastF32;
-  
-  /// Complex transform on A operand
-  static ComplexTransform const kTransformA = TransformA;
-
-  /// Complex transform on B operand
-  static ComplexTransform const kTransformB = TransformB;
-
-  /// Number of threads participating in warp-level matrix product
-  static int const kThreadCount = 32;
-
-
-  /// Tune F32 to TF32 big small conversion for complex<float> operation
-  /// Different combination of big small conversin can cause different tradeoff
-  /// between speed and accuracy.  Generally, use round_half_ulp_truncate can
-  /// improve the performance but hur the accuracy.
-  using ComplexFastF32 = FastF32 <
-    FloatRoundStyle::round_toward_zero,        // kRoundBigA
-    FloatRoundStyle::round_half_ulp_truncate,  // kRoundSmallA
-    FloatRoundStyle::round_toward_zero,        // kRoundBigB
-    FloatRoundStyle::round_half_ulp_truncate,  // kRoundSmallB
-    TensorFloat32Op::k3xTF32                   // Number of TF32 operations 
-  >;
-
-  /// Index in fargments for the big and small part
-  static int const kBigIndex = 0;
-  static int const kSmallIndex = 1;
-
-public:
-
-  /// Iterates over the A operand in memory
-  using IteratorA = MmaTensorOpMultiplicandTileIterator<
-    MatrixShape<Shape::kM, Shape::kK>,
-    Operand::kA,
-    ElementA,
-    LayoutA,
-    MatrixShape<ArchMmaOperator::Shape::kM, ArchMmaOperator::Shape::kK>,
-    Policy::OpDelta::kRow,
-    32,
-    1
-  >;
-
-  /// Storage for A tile
-  using FragmentA = typename IteratorA::Fragment;
-
-  /// Storage for transformed A tile
-  // (4 times the original FragmentA::kElements)
-  // (real_big), (imag_big), (real_small), (imag_small)
-  using TransformedFragmentA = Array<typename ArchMmaOperator::ElementA, 
-                                              FragmentA::kElements * 2 * 2>;
-
-  // Fragment bisecting big and small sections
-  // (real_big, imag_big), (real_small, imag_small)
-  using AccessTypeFragmentA = Array<typename ArchMmaOperator::ElementA, 
-                                                    FragmentA::kElements * 2>;
-
-  /// Iterates over the B operand in memory
-  using IteratorB = MmaTensorOpMultiplicandTileIterator<
-    MatrixShape<Shape::kK, Shape::kN>,
-    Operand::kB,
-    ElementB,
-    LayoutB,
-    MatrixShape<ArchMmaOperator::Shape::kK, ArchMmaOperator::Shape::kN>,
-    Policy::OpDelta::kColumn,
-    32,
-    1
-  >;
-
-  /// Storage for B tile
-  using FragmentB = typename IteratorB::Fragment;
-
-  /// Storage for transformed B tile 
-  // (4 times the original FragmentB::kElements)
-  // (real_big), (imag_big), (real_small), (imag_small)
-  using TransformedFragmentB = Array<typename ArchMmaOperator::ElementB, 
-                                              FragmentB::kElements * 2 * 2>;
-
-  // Fragment bisecting big and small sections
-  // (real_big, imag_big), (real_small, imag_small)
-  using AccessTypeFragmentB = Array<typename ArchMmaOperator::ElementB, 
-                                                    FragmentB::kElements * 2>;
-
-  static_assert(
-    !(Shape::kM % ArchMmaOperator::Shape::kM) && 
-    !(Shape::kN % ArchMmaOperator::Shape::kN),
-    "Shape of warp-level Mma must be divisible by operator shape.");
-
-  /// Number of complex products operations performed (one complex product needs four mma instructions)
-  using MmaIterations = MatrixShape<
-    Shape::kM / ArchMmaOperator::Shape::kM,
-    Shape::kN / ArchMmaOperator::Shape::kN
-  >;
-
-  /// Iterates over the C operand in memory
-  using IteratorC = MmaTensorOpAccumulatorTileIterator<
-     MatrixShape<Shape::kM, Shape::kN>, 
-     ElementC, 
-     LayoutC,
-     typename ArchMmaOperator::Shape, 
-     typename Policy::OpDelta>;
-
-  /// Storage for C tile, the accumulator. Note, regardless of multiplicand type, this
-  /// storage arrangement is to be considered 'planar complex' in the sense that all real-valued
-  /// parts are stored consecutively followed by all imaginary parts. This matches the structure
-  /// of Tensor Cores which are always real-valued matrix multiplies.
-  using FragmentC = typename IteratorC::Fragment;
-
-  //
-  // Alias types for underlying real-valued matrix multiply operator
-  //
-  using InstMmaOperandA = typename ArchMmaOperator::FragmentA;
-  using InstMmaOperandB = typename ArchMmaOperator::FragmentB;
-  using MmaOperandC = typename ArchMmaOperator::FragmentC;
-
-  static_assert(platform::is_same<cutlass::gemm::GemmShape<16, 8, 8>, typename ArchMmaOperator::Shape>::value, 
-    "This implementation only supports mma.m16n8k8 math instructions.");
-
-  static_assert(InstMmaOperandA::kElements == 4, 
-    "This implementation only supports math instructions in which exactly four element is needed for the A operand."
-    "We can geneneralize later.");
-
-  static_assert(InstMmaOperandB::kElements == 2, 
-    "This implementation only supports math instructions in which exactly two element is needed for the B operand."
-    "We can geneneralize later.");
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Underlying real-valued matrix multiply operator (concept: arch::Mma)
-  ArchMmaOperator mma;
-
-public:
-
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_DEVICE
-  MmaComplexTensorOpFastF32() {}
-
-  /// Performs a warp-level matrix multiply-accumulate operation
-  CUTLASS_DEVICE
-  void operator()(
-    FragmentC &D, 
-    TransformedFragmentA const &A, 
-    TransformedFragmentB const &B, 
-    FragmentC const &C
-  ) const {
-
-    AccessTypeFragmentA const *complex_A = reinterpret_cast<AccessTypeFragmentA const*>(&A);
-    AccessTypeFragmentB const *complex_B = reinterpret_cast<AccessTypeFragmentB const*>(&B);
-
-    //
-    // Accumulate in place
-    //
-    D = C;
-
-
-    complex_mma_operator(D, complex_A[kSmallIndex], complex_B[kBigIndex], D);
-
-    complex_mma_operator(D, complex_A[kBigIndex], complex_B[kSmallIndex], D);
-
-    complex_mma_operator(D, complex_A[kBigIndex], complex_B[kBigIndex], D);
-
-    if (ComplexFastF32::kPrecision == TensorFloat32Op::k4xTF32)
-      complex_mma_operator(D, complex_A[kSmallIndex], complex_B[kSmallIndex], D);
-  }
-
-  /// Performs a warp-level matrix multiply-accumulate operation
-  CUTLASS_DEVICE
-  void complex_mma_operator(
-    FragmentC &D, 
-    AccessTypeFragmentA const &complex_A, 
-    AccessTypeFragmentB const &complex_B, 
-    FragmentC const &C
-  ) const {
-
-    // Instruction Operands A & B holding real part followed by imaginary part for mma operations
-    InstMmaOperandA const *operand_A = reinterpret_cast<InstMmaOperandA const *>(&complex_A);
-    InstMmaOperandB const *operand_B = reinterpret_cast<InstMmaOperandB const *>(&complex_B);
-
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int m = 0; m < MmaIterations::kRow; ++m) {
-
-      // mma(accum.real(), a.real(), b.real(), accum.real());
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < MmaIterations::kColumn; ++n) {
-
-        // Real-valued accumulator part
-        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
-          (m + n * MmaIterations::kRow);
-
-          mma(*accum, operand_A[m], operand_B[n], *accum);
-      }
-
-      // mma(accum.imag(), a.real(), b.imag(), accum.imag()); 
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = MmaIterations::kColumn - 1; n >= 0; --n) {
-
-        // Complex-valued accumulator part
-        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
-          (m + n * MmaIterations::kRow) + MmaIterations::kCount;
-
-        mma(*accum, operand_A[m], operand_B[n+MmaIterations::kColumn], *accum);
-      }
-
-      // mma(accum.real(), a.imag(), -b.imag(), accum.real())
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < MmaIterations::kColumn; ++n) {
-
-        // negate OperandB to accumulate  -(a.imag()*b.imag())
-        // negating OperandB emits less instrucitons than negating OperandA as OperandB has less elements
-        negate<InstMmaOperandB> negate_op;
-
-        // Real-valued accumulator part
-        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
-          (m + n * MmaIterations::kRow);
-
-         mma(*accum, operand_A[m+MmaIterations::kRow], negate_op(operand_B[n+MmaIterations::kColumn]), *accum);
-      }
-
-      // mma(accum.imag(), a.imag(), b.real(), accum.imag())
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = MmaIterations::kColumn - 1; n >= 0; --n) {
-
-        // Complex-valued accumulator part
-        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
-          (m + n * MmaIterations::kRow) + MmaIterations::kCount;
-
-        mma(*accum, operand_A[m+MmaIterations::kRow], operand_B[n], *accum);
-      }
-    }
-  }
-
-  /// Transform the mma operands to the required types
-  CUTLASS_DEVICE
-  void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B,
-                 FragmentA const &A, FragmentB const &B) const {
-
-    detail::UnpackComplexConvertAndPackForMmaFastF32 <
-      RealElementA,
-      InstMmaOperandA,
-      FragmentA,
-      MmaIterations,
-      MatrixShape<2, 2>,
-      kTransformA,
-      Operand::kA,
-      ComplexFastF32::kRoundBigA,
-      ComplexFastF32::kRoundSmallA> convert_A;
-
-    detail::UnpackComplexConvertAndPackForMmaFastF32 <
-      RealElementB,
-      InstMmaOperandB,
-      FragmentB,
-      MmaIterations,
-      MatrixShape<2, 1>,
-      kTransformB,
-      Operand::kB,
-      ComplexFastF32::kRoundBigB,
-      ComplexFastF32::kRoundSmallB> convert_B;
-
-    // Convert Fragment[A|B] holding complex<RealElement[A|B]> to InstMmaOperand[A|B] holding InstMmaOperand[A|B]::Element
-    convert_A(reinterpret_cast<InstMmaOperandA *>(&dst_A), A); 
-    convert_B(reinterpret_cast<InstMmaOperandB *>(&dst_B), B); 
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h
deleted file mode 100755
index bc51bca09..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h
+++ /dev/null
@@ -1,2485 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines iterators used by warp-level matrix multiply operations targeting Tensor Cores.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/array.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/matrix_shape.h"
-
-#include "cutlass/arch/memory_sm75.h"
-#include "cutlass/gemm/gemm.h"
-
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor_op_multiplicand_sm80.h"
-
-#include "cutlass/platform/platform.h"
-#include "cutlass/fast_math.h"
-
-#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace warp {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// This tile iterator is specialized for loading 128b vectors of 128b elements.
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: PitchLinearShape)
-    typename Shape_,
-    /// Identifies A or B multiplicand
-    Operand Operand_,
-    /// Data type of elements
-    typename Element_,
-    /// Shape of one matrix product operation (concept: PitchLinearShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions)
-    int OpDelta_,
-    /// Number of partitions along K dimension
-    int PartitionsK_>
-class MmaTensorOpMultiplicandTileIterator<
-    Shape_, Operand_, Element_,
-    cutlass::layout::TensorOpMultiplicandCongruous128b,
-    InstructionShape_, OpDelta_, 32, PartitionsK_> {
- public:
-
-  /// Shape of tile to load (concept: PitchLinearShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand_;
-
-  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
-    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
-
-  static_assert(!(Shape::kContiguous % 8) && !(Shape::kStrided % 4), "Divisibility.");
-
-  static_assert(sizeof_bits<Element_>::value == 128, "This is specialized for 128b accesses.");
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::TensorOpMultiplicandCongruous128b;
-
-  /// Shape of one matrix product operation (concept: GemmShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
-  static int const kOpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// Number of partitions along K dimension
-  static int const kPartitionsK = PartitionsK_;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Long Index type
-  using StrideIndex = typename TensorRef::Layout::Stride::Index;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Load two elements per access
-  static int const kElementsPerAccess = 1;
-
-  /// Policy defining internal details of tile iterator
-  struct Policy {
-
-    /// Shape of one access
-    using Delta = layout::PitchLinearShape<8, 4>;
-
-    /// Number of iterations to load
-    using Iterations = layout::PitchLinearShape<
-      Shape::kContiguous / Delta::kContiguous,
-      InstructionShape::kStrided / Delta::kStrided
-    >;
-  };
-
-private:
-
-  /// Not working on this feature at the moment.
-  static_assert(kOpDelta == 1,
-    "Alternative arrangements not supported at present.");
-
-  /// Pointer type used for accesses
-  using AccessType = AlignedArray<Element, kElementsPerAccess, 16>;
-
-public:
-
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
- using Fragment =
-     Array<Element, Shape::kContiguous * InstructionShape::kStrided / kThreads>;
-
-private:
-
-  /// Layout object storing stride values
-  StrideIndex stride_;
-
-  /// Shared memory base pointers - not advanced
-  AccessType const *pointer_;
-
-  /// Byte offset incremented as iterator advances
-  Index byte_offset_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator(): stride_(0), byte_offset_(0) { }
-
-  /// Constructor from TensorRef
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator(
-    TensorRef const &ref, 
-    int lane_id
-  ):
-    stride_(ref.stride(0) / kElementsPerAccess), byte_offset_(0) {
-
-    int quad_pair = lane_id / 8;
-    int quad = lane_id / 4;
-    int lane = lane_id % 4;
-
-    int row = (quad & 1) * 4 + (lane ^ quad_pair);
-    
-    byte_offset_ = (row + quad_pair * stride_) * sizeof(AccessType);
-
-    pointer_= reinterpret_cast<AccessType const *>(ref.data());
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
-
-    pointer_ += offset;
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
-
-    int offset =
-      (tile_offset.contiguous() * Shape::kContiguous) +
-      (tile_offset.strided() * InstructionShape::kStrided * stride_);
-
-    add_pointer_offset(offset);
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator++() {
-
-    pointer_ += stride_ * InstructionShape::kStrided;
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-
-    load_with_byte_offset(frag, 0);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset in units of bytes
-      Index byte_offset) const {
-
-    AccessType *fetch_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < Policy::Iterations::kStrided; ++s) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < Policy::Iterations::kContiguous; ++c) {
-
-        int access_idx = c + s * Policy::Iterations::kContiguous;
-
-        AccessType const *source_ptr = pointer_ +
-            Policy::Delta::kContiguous * c +
-            Policy::Delta::kStrided * s * stride_;
-
-        char const *source_byte_ptr = reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;
-
-        AccessType const *source = reinterpret_cast<AccessType const *>(source_byte_ptr);
-
-        fetch_ptr[access_idx] = *source;
-      }
-    }
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-
-    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-
-    load_with_byte_offset(frag, tile_offset, 0);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-
-    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index byte_offset) const {
-    Index pointer_offset =
-        tile_offset.contiguous() * Shape::kContiguous +
-        tile_offset.strided() * InstructionShape::kStrided * stride_;
-
-    byte_offset += sizeof(AccessType) * pointer_offset;
-
-    load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Identifies A or B multiplicand
-    Operand Operand_,
-    /// Data type of elements
-    typename Element_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions)
-    int OpDelta_,
-    /// Number of partitions along K dimension
-    int PartitionsK_>
-class MmaTensorOpMultiplicandTileIterator<
-    Shape_, Operand_, Element_,
-    cutlass::layout::RowMajorTensorOpMultiplicandCongruous128b,
-    InstructionShape_, OpDelta_, 32, PartitionsK_> {
- public:
-
-  /// Shape of tile to load (concept: PitchLinearShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand_;
-
-  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
-    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::RowMajorTensorOpMultiplicandCongruous128b;
-
-  /// Shape of one matrix product operation (concept: MatrixShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
-  static int const kOpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Long Index type
-  using StrideIndex = typename TensorRef::Layout::Stride::Index;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Underlying tile iterator implementation
-  using Base = MmaTensorOpMultiplicandTileIterator<
-      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, kOperand, Element,
-      layout::TensorOpMultiplicandCongruous128b,
-      layout::PitchLinearShape<InstructionShape::kColumn,
-                               InstructionShape::kRow>,
-      kOpDelta, kThreads, PartitionsK_>;
-
- public:
-
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = typename Base::Fragment;
-
-private:
-
-  /// Underlying tile iterator
-  Base iterator_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator() { }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator(
-    TensorRef const &ref, 
-    int lane_id
-  ): iterator_({ref.data(), ref.stride()}, lane_id) {
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
-
-    iterator_.add_pointer_offset(offset);
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
-
-    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator++() {
-
-    ++iterator_;
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator--() {
-
-    --iterator_;
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(layout::PitchLinearCoord(tile_offset.column(), tile_offset.row()));
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
-    add_tile_offset(layout::PitchLinearCoord(-tile_offset.column(), -tile_offset.row()));
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-
-    iterator_.load(frag);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index byte_offset) const {
-    iterator_.load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index byte_offset) const {
-    iterator_.load_with_byte_offset(
-      frag,
-      {tile_offset.strided(), tile_offset.contiguous()},
-      byte_offset);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    iterator_.set_kgroup_index(k_group);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Identifies A or B multiplicand
-    Operand Operand_,
-    /// Data type of elements
-    typename Element_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions)
-    int OpDelta_,
-    /// Number of partitions along K dimension
-    int PartitionsK_>
-class MmaTensorOpMultiplicandTileIterator<
-    Shape_, Operand_, Element_,
-    cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous128b,
-    InstructionShape_, OpDelta_, 32, PartitionsK_> {
- public:
-
-  /// Shape of tile to load (concept: PitchLinearShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand_;
-
-  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
-    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous128b;
-
-  /// Shape of one matrix product operation (concept: MatrixShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
-  static int const kOpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Long Index type
-  using StrideIndex = typename TensorRef::Layout::Stride::Index;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Underlying tile iterator implementation
-  using Base = MmaTensorOpMultiplicandTileIterator<
-      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, kOperand, Element,
-      layout::TensorOpMultiplicandCongruous128b,
-      layout::PitchLinearShape<InstructionShape::kRow,
-                               InstructionShape::kColumn>,
-      kOpDelta, kThreads, PartitionsK_>;
-
- public:
-
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = typename Base::Fragment;
-
-private:
-
-  /// Underlying tile iterator
-  Base iterator_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator() { }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator(
-    TensorRef const &ref, 
-    int lane_id
-  ): iterator_({ref.data(), ref.stride()}, lane_id) {
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
-
-    iterator_.add_pointer_offset(offset);
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
-
-    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator++() {
-
-    ++iterator_;
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator--() {
-
-    --iterator_;
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(layout::PitchLinearCoord(tile_offset.row(), tile_offset.column()));
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
-    add_tile_offset(layout::PitchLinearCoord(-tile_offset.row(), -tile_offset.column()));
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-
-    iterator_.load(frag);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index byte_offset) const {
-    iterator_.load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index byte_offset) const {
-    iterator_.load_with_byte_offset(
-      frag,
-      {tile_offset.contiguous(), tile_offset.strided()},
-      byte_offset);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    iterator_.set_kgroup_index(k_group);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// 
-/// Partial specialization for complex<T>
-///
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Data type of underlying field of reals.
-    typename RealElement,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions, concept: MatrixShape)
-    typename OpDelta_>
-class MmaTensorOpAccumulatorTileIterator<
-    Shape_, complex<RealElement>, cutlass::layout::RowMajor, InstructionShape_, OpDelta_> {
- public:
-
-  /// Shape of tile to load (concept: MatrixShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand::kC;
-
-  /// Element type
-  using Element = complex<RealElement>;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::RowMajor;
-
-  /// Shape of one matrix product operation (concept: MatrixShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
-  using OpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Long Index type
-  using StrideIndex = typename TensorRef::Layout::Stride::Index;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Internal structure of iterator - made public to enable introspection
-  struct Policy {
-    static_assert(
-        !(Shape::kRow % InstructionShape::kM) &&
-            !(Shape::kColumn % InstructionShape::kN),
-        "Shape of warp-level Mma must be divisible by operator shape.");
-
-    static_assert(platform::is_same<TensorCoord, MatrixCoord>::value,
-      "Layouts must be defined for logical MatrixCoord coordinate space.");
-
-    /// Number of mma operations performed
-    using MmaIterations = MatrixShape<Shape::kRow / InstructionShape::kM,
-                                      Shape::kColumn / InstructionShape::kN>;
-  };
-
-private:
-
-  // Assume accumulator tile is an arrangement of 8-by-8 tiles replicated over the entire
-  // shape, with each quad mapped to one row and each thread mapped to 1/4 of the elements
-  // of that row. The accumulators within one row are assumed to be consecutive.
- static int const kElementsPerAccess = InstructionShape::kN / 4;
- static int const kRowsPerTile = 8;
- static int const kAccumulatorRows = InstructionShape::kM / kRowsPerTile;
-
-public:
-
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile. It is assumed that the accumulators
-  /// are stored in a planar complex arrangement with the real parts as entirely contiguous
-  /// followed by the imaginary parts.
-  using Fragment = Array<RealElement, Shape::kCount / kThreads * 2>;
-
-  static int const kRealIndex = 0;
-  static int const kImaginaryIndex = Shape::kCount / kThreads;
-
-private:
-
-  /// Reference to output tensor
-  TensorRef ref_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpAccumulatorTileIterator() { }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpAccumulatorTileIterator(
-    TensorRef const &ref, 
-    int lane_id
-  ):
-    ref_(ref) {
-
-    int quad = (lane_id >> 2);
-    int lane_in_quad = (lane_id & 3);
-
-    MatrixCoord lane_offset(quad, lane_in_quad * kElementsPerAccess);
-
-    ref_.add_coord_offset(lane_offset);
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpAccumulatorTileIterator &add_pointer_offset(LongIndex offset) {
-    ref_.add_pointer_offset(offset);
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpAccumulatorTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
-
-    ref_.add_coord_offset(tile_offset * make_Coord(Shape::kRow, Shape::kColumn));
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpAccumulatorTileIterator & operator++() {
-    // deliberate no-op
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpAccumulatorTileIterator & operator--() {
-    // deliberate no-op
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpAccumulatorTileIterator & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpAccumulatorTileIterator & operator-=(TensorCoord const &tile_offset) {
-    add_tile_offset(-tile_offset);
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-    load_with_pointer_offset(frag, 0);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-    Fragment &frag,                             ///< fragment to load from the tensor
-    Index pointer_offset) const {               ///< loads a tile with a linear offset
-  
-    TensorRef offset_ref(ref_);
-    offset_ref.add_pointer_offset(pointer_offset);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
-        
-        int mma_accum_start = kAccumulatorRows * kElementsPerAccess * 
-          (mma_n * Policy::MmaIterations::kRow + mma_m);
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int row = 0; row < kAccumulatorRows; ++row) {
-          CUTLASS_PRAGMA_UNROLL
-          for (int col = 0; col < kElementsPerAccess; ++col) {
-            int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
-                          row * kRowsPerTile;
-            int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col;
-
-            Element z = offset_ref.at({accum_m, accum_n});
-
-            frag[mma_accum_start + row * kElementsPerAccess + col + kRealIndex] = z.real();
-            frag[mma_accum_start + row * kElementsPerAccess + col + kImaginaryIndex] = z.imag();
-          }
-        }
-      }
-    }
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-    Fragment &frag,                             ///< fragment to load from the tensor
-    Index byte_offset) const {                  ///< loads a tile with a linear offset
-
-    load_with_pointer_offset(byte_offset / sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-    Fragment &frag,                             ///< fragment to load from the tensor
-    TensorCoord const &tile_offset) const {     ///< loads a tile with a logical offset in units of whole tiles
-
-    load(frag, tile_offset, 0);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-    Fragment &frag,                             ///< fragment to load from the tensor
-    TensorCoord const &tile_offset,             ///< loads a tile with a logical offset in units of whole tiles
-    Index pointer_offset) const {               ///< loads a tile with a logical offset AND a pointer offset
-
-    load_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
-  }
-
-  /// Stores a fragment to memory
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag) const {
-    store_with_pointer_offset(frag, 0);
-  }
-
-  /// Stores a fragment to memory with additional pointer offset
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(
-    Fragment const &frag,                       ///< fragment to store from the tensor
-    Index pointer_offset) const {               ///< store a tile with a linear offset
-  
-    TensorRef offset_ref(ref_);
-    offset_ref.add_pointer_offset(pointer_offset);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
-        
-        int mma_accum_start = kAccumulatorRows * kElementsPerAccess * 
-          (mma_n * Policy::MmaIterations::kRow + mma_m);
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int row = 0; row < kAccumulatorRows; ++row) {
-          CUTLASS_PRAGMA_UNROLL
-          for (int col = 0; col < kElementsPerAccess; ++col) {
-            int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
-                          row * kRowsPerTile;
-            int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col;
-            int idx = mma_accum_start + row * kElementsPerAccess + col;
-
-            Element z(frag[kRealIndex + idx], frag[kImaginaryIndex + idx]);
-
-            offset_ref.at({accum_m, accum_n}) = z;
-          }
-        }
-      }
-    }
-  }
-
-  /// Stores a fragment to memory with additional pointer offset
-  CUTLASS_DEVICE
-  void store_with_byte_offset(
-    Fragment const &frag,                       ///< fragment to store from the tensor
-    Index byte_offset) const {                  ///< store a tile with a linear offset
-
-    store_with_pointer_offset(byte_offset / sizeof(Element));
-  }
-
-  /// Stores a fragment to memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void store(
-    Fragment &frag,                             ///< fragment to store to the tensor
-    TensorCoord const &tile_offset) const {     ///< stores a tile with a logical offset in units of whole tiles
-
-    store(frag, tile_offset, 0);
-  }
-
-  /// Stores a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void store(
-      /// fragment to store to the tensor
-      Fragment const &frag,
-      /// stores a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// stores a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-    store_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// This tile iterator is specialized for loading 128b vectors of 128b elements.
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: PitchLinearShape)
-    typename Shape_,
-    /// Identifies A or B multiplicand
-    Operand Operand_,
-    /// Data type of elements
-    typename Element_,
-    /// Shape of one matrix product operation (concept: PitchLinearShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions)
-    int OpDelta_,
-    /// Number of partitions along K dimension
-    int PartitionsK_>
-class MmaTensorOpMultiplicandTileIterator<
-    Shape_, Operand_, Element_,
-    cutlass::layout::TensorOpMultiplicandCrosswise128x4,
-    InstructionShape_, OpDelta_, 32, PartitionsK_> {
- public:
-
-  /// Shape of tile to load (concept: PitchLinearShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand_;
-
-  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
-    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
-
-  static_assert(!(Shape::kContiguous % 4) && !(Shape::kStrided % 8), "Divisibility.");
-
-  static_assert(sizeof_bits<Element_>::value == 128, "This is specialized for 128b accesses.");
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::TensorOpMultiplicandCrosswise128x4;
-
-  /// Shape of one matrix product operation (concept: GemmShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
-  static int const kOpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// Number of partitions along K dimension
-  static int const kPartitionsK = PartitionsK_;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Long Index type
-  using StrideIndex = typename TensorRef::Layout::Stride::Index;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Load two elements per access
-  static int const kElementsPerAccess = 1;
-
-  /// Policy defining internal details of tile iterator
-  struct Policy {
-
-    /// Shape of one access
-    using Delta = layout::PitchLinearShape<4, 8>;
-
-    /// Number of iterations to load
-    using Iterations = layout::PitchLinearShape<
-      InstructionShape::kContiguous / Delta::kContiguous,
-      Shape::kStrided / Delta::kStrided
-    >;
-  };
-
-private:
-
-  /// Not working on this feature at the moment.
-  static_assert(kOpDelta == 1,
-    "Alternative arrangements not supported at present.");
-
-  /// Pointer type used for accesses
-  using AccessType = AlignedArray<Element, kElementsPerAccess, 16>;
-
-public:
-
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
- using Fragment =
-     Array<Element, Shape::kStrided * InstructionShape::kContiguous / kThreads>;
-
-private:
-
-  /// Layout object storing stride values
-  StrideIndex stride_;
-
-  /// Shared memory base pointers - not advanced
-  AccessType const *pointer_;
-
-  /// Byte offset incremented as iterator advances
-  Index byte_offset_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator(): stride_(0), byte_offset_(0) { }
-
-  /// Constructor from TensorRef
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator(
-    TensorRef const &ref, 
-    int lane_id
-  ):
-    stride_(ref.stride(0) / kElementsPerAccess), byte_offset_(0) {
-
-    int quad = lane_id / 4;
-    int liq = lane_id % 4;
-
-    int c = liq + (quad & 1) * 4;
-    int s = (quad / 2);
-
-    byte_offset_ = (c + s * stride_) * sizeof(AccessType);
-
-    pointer_= reinterpret_cast<AccessType const *>(ref.data());
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
-
-    pointer_ += offset;
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
-
-    // Compute the offset in units of elements. Note, the external coordinate system is
-    // approximately transposed with respect to the tiled internal structure
-    int offset =
-      (tile_offset.contiguous() * InstructionShape::kContiguous) * stride_ +
-      (tile_offset.strided() * Shape::kStrided);
-
-    add_pointer_offset(offset);
-
-    byte_offset_ ^= (tile_offset.contiguous() & 1) * 4 * sizeof(AccessType);
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator++() {
-
-    pointer_ += stride_ * InstructionShape::kContiguous;
-
-    byte_offset_ ^= 4 * sizeof(AccessType);
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(tile_offset);
-
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-
-    load_with_byte_offset(frag, 0);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset in units of bytes
-      Index byte_offset) const {
-
-    AccessType *fetch_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int c = 0; c < Policy::Iterations::kContiguous; ++c) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int s = 0; s < Policy::Iterations::kStrided; ++s) {
-
-        int access_idx = s + c * Policy::Iterations::kStrided;
-
-        AccessType const *source_ptr = pointer_ +
-            Policy::Delta::kContiguous * c * stride_ +
-            Policy::Delta::kStrided * s;
-
-        char const *source_byte_ptr = reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;
-
-        AccessType const *source = reinterpret_cast<AccessType const *>(source_byte_ptr);
-
-        fetch_ptr[access_idx] = *source;
-      }
-    }
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-
-    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-
-    load_with_byte_offset(frag, tile_offset, 0);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-
-    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index byte_offset) const {
-    Index pointer_offset =
-        tile_offset.contiguous() * InstructionShape::kContiguous * stride_ +
-        tile_offset.strided() * Shape::kStrided;
-
-    byte_offset += sizeof(AccessType) * pointer_offset;
-
-    load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-
-  }
-};
-
-
-////////////////////////////////////////////////////////////////////////////////
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Identifies A or B multiplicand
-    Operand Operand_,
-    /// Data type of elements
-    typename Element_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions)
-    int OpDelta_,
-    /// Number of partitions along K dimension
-    int PartitionsK_>
-class MmaTensorOpMultiplicandTileIterator<
-    Shape_, Operand_, Element_,
-    cutlass::layout::RowMajorTensorOpMultiplicandCrosswise128x4,
-    InstructionShape_, OpDelta_, 32, PartitionsK_> {
- public:
-
-  /// Shape of tile to load (concept: PitchLinearShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand_;
-
-  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
-    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise128x4;
-
-  /// Shape of one matrix product operation (concept: MatrixShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
-  static int const kOpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Long Index type
-  using StrideIndex = typename TensorRef::Layout::Stride::Index;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Underlying tile iterator implementation
-  using Base = MmaTensorOpMultiplicandTileIterator<
-      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, kOperand, Element,
-      layout::TensorOpMultiplicandCrosswise128x4,
-      layout::PitchLinearShape<InstructionShape::kColumn,
-                               InstructionShape::kRow>,
-      kOpDelta, kThreads, PartitionsK_>;
-
- public:
-
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = typename Base::Fragment;
-
-private:
-
-  /// Underlying tile iterator
-  Base iterator_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator() { }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator(
-    TensorRef const &ref, 
-    int lane_id
-  ): iterator_({ref.data(), ref.stride()}, lane_id) {
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
-
-    iterator_.add_pointer_offset(offset);
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
-
-    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator++() {
-
-    ++iterator_;
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator--() {
-
-    --iterator_;
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(layout::PitchLinearCoord(tile_offset.column(), tile_offset.row()));
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
-    add_tile_offset(layout::PitchLinearCoord(-tile_offset.column(), -tile_offset.row()));
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-
-    iterator_.load(frag);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index byte_offset) const {
-    iterator_.load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index byte_offset) const {
-    iterator_.load_with_byte_offset(
-      frag,
-      {tile_offset.strided(), tile_offset.contiguous()},
-      byte_offset);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    iterator_.set_kgroup_index(k_group);
-  }
-};
-
-
-////////////////////////////////////////////////////////////////////////////////
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Identifies A or B multiplicand
-    Operand Operand_,
-    /// Data type of elements
-    typename Element_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions)
-    int OpDelta_,
-    /// Number of partitions along K dimension
-    int PartitionsK_>
-class MmaTensorOpMultiplicandTileIterator<
-    Shape_, Operand_, Element_,
-    cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise128x4,
-    InstructionShape_, OpDelta_, 32, PartitionsK_> {
- public:
-
-  /// Shape of tile to load (concept: PitchLinearShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand_;
-
-  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
-    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise128x4;
-
-  /// Shape of one matrix product operation (concept: MatrixShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
-  static int const kOpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Long Index type
-  using StrideIndex = typename TensorRef::Layout::Stride::Index;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Underlying tile iterator implementation
-  using Base = MmaTensorOpMultiplicandTileIterator<
-      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, kOperand, Element,
-      layout::TensorOpMultiplicandCrosswise128x4,
-      layout::PitchLinearShape<InstructionShape::kRow,
-                               InstructionShape::kColumn>,
-      kOpDelta, kThreads, PartitionsK_>;
-
- public:
-
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = typename Base::Fragment;
-
-private:
-
-  /// Underlying tile iterator
-  Base iterator_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator() { }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator(
-    TensorRef const &ref, 
-    int lane_id
-  ): iterator_({ref.data(), ref.stride()}, lane_id) {
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
-
-    iterator_.add_pointer_offset(offset);
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
-
-    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator++() {
-
-    ++iterator_;
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator--() {
-
-    --iterator_;
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(layout::PitchLinearCoord(tile_offset.row(), tile_offset.column()));
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
-    add_tile_offset(layout::PitchLinearCoord(-tile_offset.row(), -tile_offset.column()));
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-
-    iterator_.load(frag);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index byte_offset) const {
-    iterator_.load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index byte_offset) const {
-    iterator_.load_with_byte_offset(
-      frag,
-      {tile_offset.contiguous(), tile_offset.strided()},
-      byte_offset);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    iterator_.set_kgroup_index(k_group);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Congruous shared memory layout
-// Warp-level iterators for complex<float>*complex<float> + complex<float> => complex<float>
-// The underlying iterators are similar to that for MMA f64*f64 + f64 = f64 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// This tile iterator is specialized for loading 128b vectors of 64b elements.
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: PitchLinearShape)
-    typename Shape_,
-    /// Identifies A or B multiplicand
-    Operand Operand_,
-    /// Shape of one matrix product operation (concept: PitchLinearShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions)
-    int OpDelta_,
-    /// Number of partitions along K dimension
-    int PartitionsK_>
-class MmaTensorOpMultiplicandTileIterator<
-    Shape_, Operand_, cutlass::complex<float>,
-    cutlass::layout::TensorOpMultiplicandCongruous64b,
-    InstructionShape_, OpDelta_, 32, PartitionsK_> {
- public:
-
-  /// Shape of tile to load (concept: PitchLinearShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand_;
-
-  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
-    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
-
-  static_assert(!(Shape::kContiguous % 16) && !(Shape::kStrided % 8), "Divisibility.");
-
-  /// Element type
-  using Element = cutlass::complex<float>;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::TensorOpMultiplicandCongruous64b;
-
-  /// Shape of one matrix product operation (concept: GemmShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
-  static int const kOpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// Number of partitions along K dimension
-  static int const kPartitionsK = PartitionsK_;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Long Index type
-  using StrideIndex = typename TensorRef::Layout::Stride::Index;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Load two elements per access
-  static int const kElementsPerAccess = 2;
-
-  /// Policy defining internal details of tile iterator
-  struct Policy {
-
-    /// Shape of one access
-    using Delta = layout::PitchLinearShape<8, 4>;
-
-    /// Number of iterations to load
-    using Iterations = layout::PitchLinearShape<
-      Shape::kContiguous / kElementsPerAccess / Delta::kContiguous,
-      InstructionShape::kStrided / Delta::kStrided
-    >;
-
-  };
-
-private:
-
-  /// Not working on this feature at the moment.
-  static_assert(kOpDelta == 1,
-    "Alternative arrangements not supported at present.");
-
-  /// Pointer type used for accesses
-  using AccessType = AlignedArray<Element, kElementsPerAccess, 16>;
-
-  /// Internal counter used to jump to next K partition
-  int k_group_idx_;
-
-public:
-
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
- using Fragment =
-     Array<Element, Shape::kContiguous * InstructionShape::kStrided / kThreads>;
-
-private:
-
-  /// Layout object storing stride values
-  StrideIndex stride_;
-
-  /// Shared memory base pointers - not advanced
-  AccessType const *pointer_;
-
-  /// Byte offset incremented as iterator advances
-  Index byte_offset_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator(): stride_(0), byte_offset_(0) { }
-
-  /// Constructor from TensorRef
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator(
-    TensorRef const &ref, 
-    int lane_id
-  ):
-    stride_(ref.stride(0) / kElementsPerAccess), byte_offset_(0),
-    k_group_idx_(0) {
-
-    int access_strided = lane_id / Policy::Delta::kContiguous;
-    int access_contiguous = (lane_id  % Policy::Delta::kContiguous) ^ access_strided;
-
-    pointer_= reinterpret_cast<AccessType const *>(ref.data()) +
-      access_contiguous + access_strided * stride_;
-
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
-
-    byte_offset_ += offset * sizeof(Element);
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
-
-    int offset = 
-      (tile_offset.strided() * InstructionShape::kStrided) * stride_ * kElementsPerAccess + 
-      tile_offset.contiguous() * Shape::kContiguous;
-
-    add_pointer_offset(offset);
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator++() {
-
-    add_tile_offset({0, 1});
-
-    return *this;
-  }
-
-  /// Advances the iterator along the opposite of the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator--() {
-    
-    add_tile_offset({0, -1});
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
-    add_tile_offset(-tile_offset);
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-
-    load_with_byte_offset(frag, 0);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset in units of bytes
-      Index byte_offset) const {
-
-    AccessType *fetch_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < Policy::Iterations::kStrided; ++s) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < Policy::Iterations::kContiguous; ++c) {
-
-        int access_idx = c + s * Policy::Iterations::kContiguous;
-
-        AccessType const *source_ptr = pointer_ +
-            Policy::Delta::kContiguous * c +
-            Policy::Delta::kStrided * s * stride_;
-
-        char const *source_byte_ptr = reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;
-
-        AccessType const *source = reinterpret_cast<AccessType const *>(source_byte_ptr);
-
-        fetch_ptr[access_idx] = *source;
-      }
-    }
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-
-    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-
-    load_with_byte_offset(frag, tile_offset, 0);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-
-    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index byte_offset) const {
-
-    Index pointer_offset = 
-      tile_offset.contiguous() * Shape::kContiguous / Layout::kElementsPerAccess + 
-      tile_offset.strided() * InstructionShape::kStrided * stride_;
-
-    byte_offset += sizeof(AccessType) * pointer_offset;
-
-    load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Crosswise shared memory layout
-// Warp-level iterators for complex<float>*complex<float> + complex<float> => complex<float>
-// The underlying iterators are similar to that for f64*f64 + f64 = f64 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// This tile iterator is specialized for loading 128b vectors of 64b elements.
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: PitchLinearShape)
-    typename Shape_,
-    /// Identifies A or B multiplicand
-    Operand Operand_,
-    /// Shape of one matrix product operation (concept: PitchLinearShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions)
-    int OpDelta_,
-    /// Number of partitions along K dimension
-    int PartitionsK_>
-class MmaTensorOpMultiplicandTileIterator<
-    Shape_, Operand_, complex<float>,
-    cutlass::layout::TensorOpMultiplicand64bCrosswise,
-    InstructionShape_, OpDelta_, 32, PartitionsK_> {
- public:
-
-  /// Shape of tile to load (concept: PitchLinearShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand_;
-
-  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
-    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
-
-  static_assert(!(Shape::kContiguous % 4) && !(Shape::kStrided % 16), "Divisibility.");
-
-  static_assert(sizeof_bits<complex<float>>::value == 64, "This is specialized for 64b accesses.");
-
-  /// Element type
-  using Element = complex<float>;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::TensorOpMultiplicand64bCrosswise;
-
-  /// Shape of one matrix product operation (concept: GemmShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
-  static int const kOpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// Number of partitions along K dimension
-  static int const kPartitionsK = PartitionsK_;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Long Index type
-  using StrideIndex = typename TensorRef::Layout::Stride::Index;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Load two elements per access
-  static int const kElementsPerAccess = 2;
-
-  /// Policy defining internal details of tile iterator
-  struct Policy {
-
-    /// Shape of one access
-    using Delta = layout::PitchLinearShape<4, 16>;
-
-    /// Number of iterations to load
-    using Iterations = layout::PitchLinearShape<
-      InstructionShape::kContiguous / Delta::kContiguous,
-      Shape::kStrided / Delta::kStrided
-    >;
-
-  };
-
-private:
-
-  /// Not working on this feature at the moment.
-  static_assert(kOpDelta == 1,
-    "Alternative arrangements not supported at present.");
-
-  /// Pointer type used for accesses
-  using AccessType = AlignedArray<Element, kElementsPerAccess, 16>;
-
-public:
-
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
- using Fragment =
-     Array<Element, Shape::kStrided * InstructionShape::kContiguous / kThreads>;
-
-private:
-
-  /// Layout object storing stride values
-  StrideIndex stride_;
-
-  /// Shared memory base pointers - not advanced
-  AccessType const *pointer_;
-
-  /// Byte offset incremented as iterator advances
-  Index byte_offset_;
-
-  /// Internal counter for tracking K-group
-  Index k_group_idx_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator(): stride_(0), byte_offset_(0) { }
-
-  /// Constructor from TensorRef
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator(
-    TensorRef const &ref, 
-    int lane_id
-  ):
-    stride_(ref.stride(0) / kElementsPerAccess), byte_offset_(0),
-    k_group_idx_(0) {
-
-    int access_strided = lane_id / 8;
-    int access_contiguous = (lane_id  % 8);
-
-    byte_offset_ = (access_contiguous + access_strided * stride_) * sizeof(AccessType);
-
-    pointer_= reinterpret_cast<AccessType const *>(ref.data());
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
-
-    pointer_ += offset / kElementsPerAccess;
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
-    int offset = (tile_offset.contiguous() * InstructionShape::kContiguous) *
-                     stride_ * kElementsPerAccess +
-                 tile_offset.strided() * Shape::kStrided;
-
-    add_pointer_offset(offset);
-    
-    
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_tile_offset_negative(TensorCoord const &tile_offset) {
-
-    add_tile_offset(tile_offset);
-
-    if (k_group_idx_ & 1)
-      byte_offset_ ^= 0x40;
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator++() {
-
-    pointer_ += stride_ * InstructionShape::kContiguous;
-    
-    // xor ptr
-    byte_offset_ ^= 0x40;
-
-    ++k_group_idx_;
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-
-    load_with_byte_offset(frag, 0);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset in units of bytes
-      Index byte_offset) const {
-
-    AccessType *fetch_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int c = 0; c < Policy::Iterations::kContiguous; ++c) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int s = 0; s < Policy::Iterations::kStrided; ++s) {
-
-        int access_idx = c * Policy::Iterations::kStrided + s;
-
-        AccessType const *source_ptr = pointer_ +
-            Policy::Delta::kContiguous * c * stride_ +
-            Policy::Delta::kStrided * s / kElementsPerAccess;
-
-        char const *source_byte_ptr = reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;
-
-        AccessType const *source = reinterpret_cast<AccessType const *>(source_byte_ptr);
-
-        fetch_ptr[access_idx] = *source;
-      }
-    }
-
-    Element *exchange_ptr = reinterpret_cast<Element *>(&frag);
-
-    // exchange on 64b granularity only for fragments held in k=8/2 to k=8 
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = Fragment::kElements/2; i < Fragment::kElements; i += 2) {
-      Element tmp = exchange_ptr[i];
-      exchange_ptr[i] = exchange_ptr[i + 1];
-      exchange_ptr[i + 1] = tmp;
-    }
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-
-    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-
-    load_with_byte_offset(frag, tile_offset, 0);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-
-    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index byte_offset) const {
-    Index pointer_offset = tile_offset.contiguous() *
-                               InstructionShape::kContiguous /
-                               Layout::kElementsPerAccess +
-                           tile_offset.strided() * Shape::kStrided * stride_;
-
-    byte_offset += sizeof(AccessType) * pointer_offset;
-
-    load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    k_group_idx_ = k_group;
-  }
-};
-
-} // namespace warp
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h
deleted file mode 100755
index 5a02417aa..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h
+++ /dev/null
@@ -1,642 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing warp-level matrix multiply-accumulate operations targeting
-      Tensor Cores.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/array.h"
-#include "cutlass/complex.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-
-#include "cutlass/arch/memory_sm75.h"
-#include "cutlass/arch/mma_sm75.h"
-#include "cutlass/arch/mma_sm80.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/warp/mma.h"
-
-#include "cutlass/gemm/warp/mma_tensor_op_policy.h"
-#include "cutlass/gemm/warp/mma_tensor_op.h"
-
-#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h"
-#include "cutlass/gemm/warp/mma_gaussian_complex_tensor_op_tile_iterator_sm80.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace warp {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  typename Shape_,
-  /// Data type of A elements
-  typename RealElementA,
-  /// Layout of A matrix (concept: MatrixLayout)
-  typename LayoutA_,
-  /// Data type of B elements
-  typename RealElementB,
-  /// Layout of B matrix (concept: MatrixLayout)
-  typename LayoutB_,
-  /// Element type of C matrix
-  typename RealElementC,
-  /// Layout of C matrix (concept: MatrixLayout)
-  typename LayoutC_,
-  /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
-  typename Policy_,
-  /// Complex transform on A operand
-  ComplexTransform TransformA = ComplexTransform::kNone,
-  /// Complex transform on B operand
-  ComplexTransform TransformB = ComplexTransform::kNone,
-  /// Do source operands need more than one elements
-  bool GeneralizedOperatorElements = false,
-  /// Used for partial specialization
-  typename Enable = bool
->
-class MmaGaussianComplexTensorOp;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for complex*complex+complex => complex using real-valued TensorOps
-template <
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  typename Shape_,
-  /// Data type of A elements
-  typename RealElementA,
-  /// Layout of A matrix (concept: MatrixLayout)
-  typename LayoutA_,
-  /// Data type of B elements
-  typename RealElementB,
-  /// Layout of B matrix (concept: MatrixLayout)
-  typename LayoutB_,
-  /// Element type of C matrix
-  typename RealElementC,
-  /// Layout of C matrix (concept: MatrixLayout)
-  typename LayoutC_,
-  /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
-  typename Policy_,
-  /// Complex transform on A operand
-  ComplexTransform TransformA,
-  /// Complex transform on B operand
-  ComplexTransform TransformB
->
-class MmaGaussianComplexTensorOp<
-  Shape_, 
-  complex<RealElementA>, 
-  LayoutA_, 
-  complex<RealElementB>,
-  LayoutB_,
-  complex<RealElementC>,
-  LayoutC_,
-  Policy_,
-  TransformA,
-  TransformB>  {
-public:
-  /// Shape of warp-level matrix operation (concept: GemmShape)
-  using Shape = Shape_;
-
-  /// Data type of multiplicand A
-  using ElementA = complex<RealElementA>;
-
-  /// Layout of multiplicand A
-  using LayoutA = LayoutA_;
-
-  /// Data type of multiplicand B
-  using ElementB = complex<RealElementB>;
-
-  /// Layout of multiplicand B
-  using LayoutB = LayoutB_;
-
-  /// Data type of accumulator matrix C
-  using ElementC = complex<RealElementC>;
-
-  /// Layout of accumulator matrix C
-  using LayoutC = LayoutC_;
-
-  /// Shape of the warp in units of thread (concept: MmaLanePolicySimt)
-  using Policy = Policy_;
-
-  /// Underlying matrix multiply operator (concept: arch::Mma)
-  using ArchMmaOperator = typename Policy::Operator;
-
-  /// Shape of underlying instruction
-  using InstructionShape = typename ArchMmaOperator::Shape;
-
-  /// Underlying arch tag
-  using ArchTag = typename ArchMmaOperator::ArchTag;
-
-  /// Indicates class of matrix operator
-  using OperatorClass = arch::OpClassTensorOp;
-
-  /// Indicates math operator 
-  using MathOperator = arch::OpMultiplyAddGaussianComplex;
-  
-  /// Complex transform on A operand
-  static ComplexTransform const kTransformA = TransformA;
-
-  /// Complex transform on B operand
-  static ComplexTransform const kTransformB = TransformB;
-
-
-  /// Number of threads participating in warp-level matrix product
-  static int const kThreadCount = 32;
-
-public:
-
-  /// Iterates over the A operand in memory
-  using IteratorA = MmaTensorOpMultiplicandTileIterator<
-    MatrixShape<Shape::kM, Shape::kK>,
-    Operand::kA,
-    ElementA,
-    LayoutA,
-    MatrixShape<ArchMmaOperator::Shape::kM, ArchMmaOperator::Shape::kK>,
-    Policy::OpDelta::kRow,
-    32,
-    1
-  >;
-
-  /// Storage for A tile
-  using FragmentA = typename IteratorA::Fragment;
-
-  /// Storage for transformed A tile
-  using TransformedFragmentA = FragmentA;
-
-  /// Iterates over the B operand in memory
-  using IteratorB = MmaTensorOpMultiplicandTileIterator<
-    MatrixShape<Shape::kK, Shape::kN>,
-    Operand::kB,
-    ElementB,
-    LayoutB,
-    MatrixShape<ArchMmaOperator::Shape::kK, ArchMmaOperator::Shape::kN>,
-    Policy::OpDelta::kColumn,
-    32,
-    1
-  >;
-
-  /// Storage for B tile
-  using FragmentB = typename IteratorB::Fragment;
-
-  /// Storage for transformed B tile
-  using TransformedFragmentB = FragmentB;
-
-  static_assert(
-    !(Shape::kM % ArchMmaOperator::Shape::kM) && 
-    !(Shape::kN % ArchMmaOperator::Shape::kN),
-    "Shape of warp-level Mma must be divisible by operator shape.");
-
-  /// Number of mma operations performed
-  using MmaIterations = MatrixShape<
-    Shape::kM / ArchMmaOperator::Shape::kM,
-    Shape::kN / ArchMmaOperator::Shape::kN
-  >;
-
-  /// Iterates over the C operand in memory
-  using IteratorC = MmaTensorOpGaussianComplexAccumulatorTileIterator<
-     MatrixShape<Shape::kM, Shape::kN>, 
-     ElementC, 
-     LayoutC,
-     typename ArchMmaOperator::Shape, 
-     typename Policy::OpDelta>;
-
-  /// Storage for C tile, the accumulator. Note, regardless of multiplicand type, this
-  /// storage arrangement is to be considered 'gaussian complex' in the sense that the accumulation is
-  /// done in three parts namely part1, part2, and part3. The parts 1, 2, and 3 are stored consecutively 
-  /// in InteratorC::Frament. This matches the structure of Tensor Cores which are always real-valued matrix multiplies.
-  using FragmentC = typename IteratorC::Fragment;
-
-  static_assert(
-    FragmentC::kElements == 3 * MmaIterations::kCount * ArchMmaOperator::FragmentC::kElements,
-    "Unexpected gaussian complex fragment length.");
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Underlying real-valued matrix multiply operator (concept: arch::Mma)
-  ArchMmaOperator mma;
-
-public:
-
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_DEVICE
-  MmaGaussianComplexTensorOp() {}
-
-  /// Performs a warp-level matrix multiply-accumulate operation
-  CUTLASS_DEVICE
-  void operator()(
-    FragmentC &D, 
-    FragmentA const &A, 
-    FragmentB const &B, 
-    FragmentC const &C
-  ) const {
-
-    // Alias types for underlying real-valued matrix multiply operator
-    using MmaOperandA = typename ArchMmaOperator::FragmentA;
-    using MmaOperandB = typename ArchMmaOperator::FragmentB;
-    using MmaOperandC = typename ArchMmaOperator::FragmentC;
-
-    static_assert(MmaOperandA::kElements == 1, 
-      "This implementation only supports math instructions in which exactly one element is needed for the A operand."
-      "We can geneneralize later.");
-
-    static_assert(MmaOperandB::kElements == 1, 
-      "This implementation only supports math instructions in which exactly one element is needed for the B operand."
-      "We can geneneralize later.");
-
-    D = C;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int m = 0; m < MmaIterations::kRow; ++m) {
-
-      // mma(accum.part1(), (a.real() + a.imag()), b.real(), accum.part1());
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < MmaIterations::kColumn; ++n) {
-
-        // Pack operands together. This may result in actual MOVs 
-        MmaOperandA operand_Asum;
-        MmaOperandB operand_Br;
-
-        operand_Asum[0] = A[m].real() + ((kTransformA == ComplexTransform::kConjugate) ? -A[m].imag() : +A[m].imag());
-        operand_Br[0] = B[n].real();
-
-        // accumulator part1
-        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
-          (m + n * MmaIterations::kRow);
-
-        mma(*accum, operand_Asum, operand_Br, *accum);
-      }
-
-      // mma(accum.part2(), -a.real(), (b.real() - b.imag()), accum.part2()); 
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = MmaIterations::kColumn - 1; n >= 0; --n) {
-
-        // Pack operands together. This may result in actual MOVs 
-        MmaOperandA operand_Ar;
-        MmaOperandB operand_Bdiff;
-
-        operand_Ar[0] = -A[m].real();
-        operand_Bdiff[0] = B[n].real() - ((kTransformB == ComplexTransform::kConjugate) ? -B[n].imag() : +B[n].imag());
-
-        // accumulator part2
-        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
-          (m + n * MmaIterations::kRow) + MmaIterations::kCount;
-
-        mma(*accum, operand_Ar, operand_Bdiff, *accum);
-      }
-
-      // mma(accum.part3(), a.imag(), (b.real() + b.imag()), accum.part3())
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < MmaIterations::kColumn; ++n) {
-
-        // Pack operands together. This may result in actual MOVs 
-        MmaOperandA operand_Ai;
-        MmaOperandB operand_Bsum;
-
-        operand_Ai[0] = (kTransformA == ComplexTransform::kConjugate) ? -A[m].imag() : +A[m].imag();
-        operand_Bsum[0] = B[n].real() + ((kTransformB == ComplexTransform::kConjugate) ? -B[n].imag() : +B[n].imag());
-
-        // accumulator part3
-        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
-          (m + n * MmaIterations::kRow) + 2 * MmaIterations::kCount;
-
-        mma(*accum, operand_Ai, operand_Bsum, *accum);
-      }
-    }
-  }
-
-  /// Transform the mma operands to the required types
-  CUTLASS_DEVICE
-  void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B,
-                 FragmentA const &A, FragmentB const &B) const {
-    dst_A = A;
-    dst_B = B;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for complex*complex+complex => complex using real-valued TensorOps
-template <
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  typename Shape_,
-  /// Data type of A elements
-  typename RealElementA,
-  /// Layout of A matrix (concept: MatrixLayout)
-  typename LayoutA_,
-  /// Data type of B elements
-  typename RealElementB,
-  /// Layout of B matrix (concept: MatrixLayout)
-  typename LayoutB_,
-  /// Element type of C matrix
-  typename RealElementC,
-  /// Layout of C matrix (concept: MatrixLayout)
-  typename LayoutC_,
-  /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
-  typename Policy_,
-  /// Complex transform on A operand
-  ComplexTransform TransformA,
-  /// Complex transform on B operand
-  ComplexTransform TransformB
->
-class MmaGaussianComplexTensorOp<
-  Shape_, 
-  complex<RealElementA>, 
-  LayoutA_, 
-  complex<RealElementB>,
-  LayoutB_,
-  complex<RealElementC>,
-  LayoutC_,
-  Policy_,
-  TransformA,
-  TransformB,
-  true>  {
-public:
-  /// Shape of warp-level matrix operation (concept: GemmShape)
-  using Shape = Shape_;
-
-  /// Data type of multiplicand A
-  using ElementA = complex<RealElementA>;
-
-  /// Layout of multiplicand A
-  using LayoutA = LayoutA_;
-
-  /// Data type of multiplicand B
-  using ElementB = complex<RealElementB>;
-
-  /// Layout of multiplicand B
-  using LayoutB = LayoutB_;
-
-  /// Data type of accumulator matrix C
-  using ElementC = complex<RealElementC>;
-
-  /// Layout of accumulator matrix C
-  using LayoutC = LayoutC_;
-
-  /// Shape of the warp in units of thread (concept: MmaLanePolicySimt)
-  using Policy = Policy_;
-
-  /// Underlying matrix multiply operator (concept: arch::Mma)
-  using ArchMmaOperator = typename Policy::Operator;
-
-  /// Shape of underlying instruction
-  using InstructionShape = typename ArchMmaOperator::Shape;
-
-  /// Underlying arch tag
-  using ArchTag = typename ArchMmaOperator::ArchTag;
-
-  /// Indicates class of matrix operator
-  using OperatorClass = arch::OpClassTensorOp;
-
-  /// Indicates math operator 
-  using MathOperator = arch::OpMultiplyAddGaussianComplex;
-  
-  /// Complex transform on A operand
-  static ComplexTransform const kTransformA = TransformA;
-
-  /// Complex transform on B operand
-  static ComplexTransform const kTransformB = TransformB;
-
-
-  /// Number of threads participating in warp-level matrix product
-  static int const kThreadCount = 32;
-
-public:
-
-  /// Iterates over the A operand in memory
-  using IteratorA = MmaTensorOpMultiplicandTileIterator<
-    MatrixShape<Shape::kM, Shape::kK>,
-    Operand::kA,
-    ElementA,
-    LayoutA,
-    MatrixShape<ArchMmaOperator::Shape::kM, ArchMmaOperator::Shape::kK>,
-    Policy::OpDelta::kRow,
-    32,
-    1
-  >;
-
-  /// Storage for A tile
-  using FragmentA = typename IteratorA::Fragment;
-
-  /// Storage for transformed A tile
-  using TransformedFragmentA = FragmentA;
-
-  /// Iterates over the B operand in memory
-  using IteratorB = MmaTensorOpMultiplicandTileIterator<
-    MatrixShape<Shape::kK, Shape::kN>,
-    Operand::kB,
-    ElementB,
-    LayoutB,
-    MatrixShape<ArchMmaOperator::Shape::kK, ArchMmaOperator::Shape::kN>,
-    Policy::OpDelta::kColumn,
-    32,
-    1
-  >;
-
-  /// Storage for B tile
-  using FragmentB = typename IteratorB::Fragment;
-
-  /// Storage for transformed B tile
-  using TransformedFragmentB = FragmentB;
-
-  static_assert(
-    !(Shape::kM % ArchMmaOperator::Shape::kM) && 
-    !(Shape::kN % ArchMmaOperator::Shape::kN),
-    "Shape of warp-level Mma must be divisible by operator shape.");
-
-  /// Number of mma operations performed
-  using MmaIterations = MatrixShape<
-    Shape::kM / ArchMmaOperator::Shape::kM,
-    Shape::kN / ArchMmaOperator::Shape::kN
-  >;
-
-  /// Iterates over the C operand in memory
-  using IteratorC = MmaTensorOpGaussianComplexAccumulatorTileIterator<
-     MatrixShape<Shape::kM, Shape::kN>, 
-     ElementC, 
-     LayoutC,
-     typename ArchMmaOperator::Shape, 
-     typename Policy::OpDelta>;
-
-  /// Storage for C tile, the accumulator. Note, regardless of multiplicand type, this
-  /// storage arrangement is to be considered 'gaussian complex' in the sense that the accumulation is
-  /// done in three parts namely part1, part2, and part3. The parts 1, 2, and 3 are stored consecutively 
-  /// in InteratorC::Frament. This matches the structure of Tensor Cores which are always real-valued matrix multiplies.
-  using FragmentC = typename IteratorC::Fragment;
-
-  static_assert(
-    FragmentC::kElements == 3 * MmaIterations::kCount * ArchMmaOperator::FragmentC::kElements,
-    "Unexpected gaussian complex fragment length.");
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Underlying real-valued matrix multiply operator (concept: arch::Mma)
-  ArchMmaOperator mma;
-
-public:
-
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_DEVICE
-  MmaGaussianComplexTensorOp() {}
-
-  /// Performs a warp-level matrix multiply-accumulate operation
-  CUTLASS_DEVICE
-  void operator()(
-    FragmentC &D, 
-    FragmentA const &A, 
-    FragmentB const &B, 
-    FragmentC const &C
-  ) const {
-
-    // Alias types for underlying real-valued matrix multiply operator
-    using MmaOperandA = typename ArchMmaOperator::FragmentA;
-    using MmaOperandB = typename ArchMmaOperator::FragmentB;
-    using MmaOperandC = typename ArchMmaOperator::FragmentC;
-
-    D = C;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int m = 0; m < MmaIterations::kRow; ++m) {
-
-      // mma(accum.part1(), (a.real() + a.imag()), b.real(), accum.part1());
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < MmaIterations::kColumn; ++n) {
-
-        // Pack operands together. This may result in actual MOVs 
-        MmaOperandA operand_Asum;
-        MmaOperandB operand_Br;
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int mk = 0; mk < MmaOperandA::kElements; ++mk)
-          operand_Asum[mk] = A[m*MmaOperandA::kElements + mk].real() + ((kTransformA == ComplexTransform::kConjugate) ?
-                            -A[m*MmaOperandA::kElements + mk].imag() : +A[m*MmaOperandA::kElements + mk].imag());
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int nk = 0; nk < MmaOperandB::kElements; ++nk)
-          operand_Br[nk] = B[n*MmaOperandB::kElements + nk].real();
-
-        // accumulator part1
-        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
-          (m + n * MmaIterations::kRow);
-
-        mma(*accum, operand_Asum, operand_Br, *accum);
-      }
-
-      // mma(accum.part2(), -a.real(), (b.real() - b.imag()), accum.part2()); 
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = MmaIterations::kColumn - 1; n >= 0; --n) {
-
-        // Pack operands together. This may result in actual MOVs 
-        MmaOperandA operand_Ar;
-        MmaOperandB operand_Bdiff;
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int mk = 0; mk < MmaOperandA::kElements; ++mk)
-          operand_Ar[mk] = -A[m*MmaOperandA::kElements + mk].real();
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int nk = 0; nk < MmaOperandB::kElements; ++nk)
-          operand_Bdiff[nk] = B[n*MmaOperandB::kElements + nk].real() - ((kTransformB == ComplexTransform::kConjugate) ?
-                              -B[n*MmaOperandB::kElements + nk].imag() : +B[n*MmaOperandB::kElements + nk].imag());
-
-        // accumulator part2
-        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
-          (m + n * MmaIterations::kRow) + MmaIterations::kCount;
-
-        mma(*accum, operand_Ar, operand_Bdiff, *accum);
-      }
-
-      // mma(accum.part3(), a.imag(), (b.real() + b.imag()), accum.part3())
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < MmaIterations::kColumn; ++n) {
-
-        // Pack operands together. This may result in actual MOVs 
-        MmaOperandA operand_Ai;
-        MmaOperandB operand_Bsum;
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int mk = 0; mk < MmaOperandA::kElements; ++mk)
-          operand_Ai[mk] = (kTransformA == ComplexTransform::kConjugate) ?
-                           -A[m*MmaOperandA::kElements + mk].imag() : +A[m*MmaOperandA::kElements + mk].imag();
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int nk = 0; nk < MmaOperandB::kElements; ++nk)
-          operand_Bsum[nk] = B[n*MmaOperandB::kElements + nk].real() + ((kTransformB == ComplexTransform::kConjugate) ?
-                             -B[n*MmaOperandB::kElements + nk].imag() : +B[n*MmaOperandB::kElements + nk].imag());
-
-        // accumulator part3
-        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) + 
-          (m + n * MmaIterations::kRow) + 2 * MmaIterations::kCount;
-
-        mma(*accum, operand_Ai, operand_Bsum, *accum);
-      }
-    }
-  }
-
-  /// Transform the mma operands to the required types
-  CUTLASS_DEVICE
-  void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B,
-                 FragmentA const &A, FragmentB const &B) const {
-    dst_A = A;
-    dst_B = B;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op_tile_iterator_sm80.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op_tile_iterator_sm80.h
deleted file mode 100755
index fe785f8d3..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op_tile_iterator_sm80.h
+++ /dev/null
@@ -1,390 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines iterators used by warp-level matrix multiply operations targeting Tensor Cores.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/array.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/matrix_shape.h"
-
-#include "cutlass/arch/memory_sm75.h"
-#include "cutlass/gemm/gemm.h"
-
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor_op_multiplicand_sm80.h"
-#include "cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h"
-
-#include "cutlass/platform/platform.h"
-#include "cutlass/fast_math.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace warp {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Element type
-    typename Element_,
-    /// Layout of operand in memory
-    typename Layout_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions, concept: MatrixShape)
-    typename OpDelta_>
-class MmaTensorOpGaussianComplexAccumulatorTileIterator;
-
-////////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// 
-/// Partial specialization for complex<T>
-///
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Data type of underlying field of reals.
-    typename RealElement,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions, concept: MatrixShape)
-    typename OpDelta_>
-class MmaTensorOpGaussianComplexAccumulatorTileIterator<
-    Shape_, complex<RealElement>, cutlass::layout::RowMajor, InstructionShape_, OpDelta_> {
- public:
-
-  /// Shape of tile to load (concept: MatrixShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand::kC;
-
-  /// Element type
-  using Element = complex<RealElement>;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::RowMajor;
-
-  /// Shape of one matrix product operation (concept: MatrixShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
-  using OpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Internal structure of iterator - made public to enable introspection
-  struct Policy {
-    static_assert(
-        !(Shape::kRow % InstructionShape::kM) &&
-            !(Shape::kColumn % InstructionShape::kN),
-        "Shape of warp-level Mma must be divisible by operator shape.");
-
-    static_assert(platform::is_same<TensorCoord, MatrixCoord>::value,
-      "Layouts must be defined for logical MatrixCoord coordinate space.");
-
-    /// Number of mma operations performed
-    using MmaIterations = MatrixShape<Shape::kRow / InstructionShape::kM,
-                                      Shape::kColumn / InstructionShape::kN>;
-  };
-
-private:
-
-  // Assume accumulator tile is an arrangement of 8-by-8 tiles replicated over the entire
-  // shape, with each quad mapped to one row and each thread mapped to 1/4 of the elements
-  // of that row. The accumulators within one row are assumed to be consecutive.
- static int const kElementsPerAccess = InstructionShape::kN / 4;
- static int const kRowsPerTile = 8;
- static int const kAccumulatorRows = InstructionShape::kM / kRowsPerTile;
-
-public:
-
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile. It is assumed that the accumulators
-  /// are stored in a gaussian complex arrangement with parts 1, 2, and 3 as entirely contiguous
-  /// arranged as [part1, part2, part3]
-  using Fragment = Array<RealElement, (Shape::kCount / kThreads) * 3>;
-
-  static int const kPart1Index = (Shape::kCount / kThreads) * 0;
-  static int const kPart2Index = (Shape::kCount / kThreads) * 1;
-  static int const kPart3Index = (Shape::kCount / kThreads) * 2;
-
-private:
-
-  /// Reference to output tensor
-  TensorRef ref_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpGaussianComplexAccumulatorTileIterator() { }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpGaussianComplexAccumulatorTileIterator(
-    TensorRef const &ref, 
-    int lane_id
-  ):
-    ref_(ref) {
-
-    int quad = (lane_id >> 2);
-    int lane_in_quad = (lane_id & 3);
-
-    MatrixCoord lane_offset(quad, lane_in_quad * kElementsPerAccess);
-
-    ref_.add_coord_offset(lane_offset);
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpGaussianComplexAccumulatorTileIterator &add_pointer_offset(LongIndex offset) {
-    ref_.add_pointer_offset(offset);
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpGaussianComplexAccumulatorTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
-
-    ref_.add_coord_offset(tile_offset * make_Coord(Shape::kRow, Shape::kColumn));
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpGaussianComplexAccumulatorTileIterator & operator++() {
-    // deliberate no-op
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpGaussianComplexAccumulatorTileIterator & operator--() {
-    // deliberate no-op
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpGaussianComplexAccumulatorTileIterator & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpGaussianComplexAccumulatorTileIterator & operator-=(TensorCoord const &tile_offset) {
-    add_tile_offset(-tile_offset);
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-    load_with_pointer_offset(frag, 0);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-    Fragment &frag,                             ///< fragment to load from the tensor
-    Index pointer_offset) const {               ///< loads a tile with a linear offset
-  
-    TensorRef offset_ref(ref_);
-    offset_ref.add_pointer_offset(pointer_offset);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
-        
-        int mma_accum_start = kAccumulatorRows * kElementsPerAccess * 
-          (mma_n * Policy::MmaIterations::kRow + mma_m);
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int row = 0; row < kAccumulatorRows; ++row) {
-          CUTLASS_PRAGMA_UNROLL
-          for (int col = 0; col < kElementsPerAccess; ++col) {
-            int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
-                          row * kRowsPerTile;
-            int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col;
-
-            Element z = offset_ref.at({accum_m, accum_n});
-
-            frag[mma_accum_start + row * kElementsPerAccess + col + kPart1Index] = z.real() + z.imag();
-            frag[mma_accum_start + row * kElementsPerAccess + col + kPart2Index] = -z.real();
-            frag[mma_accum_start + row * kElementsPerAccess + col + kPart3Index] = z.imag();
-          }
-        }
-      }
-    }
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-    Fragment &frag,                             ///< fragment to load from the tensor
-    Index byte_offset) const {                  ///< loads a tile with a linear offset
-
-    load_with_pointer_offset(byte_offset / sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-    Fragment &frag,                             ///< fragment to load from the tensor
-    TensorCoord const &tile_offset) const {     ///< loads a tile with a logical offset in units of whole tiles
-
-    load(frag, tile_offset, 0);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-    Fragment &frag,                             ///< fragment to load from the tensor
-    TensorCoord const &tile_offset,             ///< loads a tile with a logical offset in units of whole tiles
-    Index pointer_offset) const {               ///< loads a tile with a logical offset AND a pointer offset
-
-    load_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
-  }
-
-  /// Stores a fragment to memory
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag) const {
-    store_with_pointer_offset(frag, 0);
-  }
-
-  /// Stores a fragment to memory with additional pointer offset
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(
-    Fragment const &frag,                       ///< fragment to store from the tensor
-    Index pointer_offset) const {               ///< store a tile with a linear offset
-  
-    TensorRef offset_ref(ref_);
-    offset_ref.add_pointer_offset(pointer_offset);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
-        
-        int mma_accum_start = kAccumulatorRows * kElementsPerAccess * 
-          (mma_n * Policy::MmaIterations::kRow + mma_m);
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int row = 0; row < kAccumulatorRows; ++row) {
-          CUTLASS_PRAGMA_UNROLL
-          for (int col = 0; col < kElementsPerAccess; ++col) {
-            int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
-                          row * kRowsPerTile;
-            int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col;
-            int idx = mma_accum_start + row * kElementsPerAccess + col;
-
-            Element z(frag[kPart1Index + idx] - frag[kPart3Index + idx], 
-                      frag[kPart1Index + idx] + frag[kPart2Index + idx]);
-
-            offset_ref.at({accum_m, accum_n}) = z;
-          }
-        }
-      }
-    }
-  }
-
-  /// Stores a fragment to memory with additional pointer offset
-  CUTLASS_DEVICE
-  void store_with_byte_offset(
-    Fragment const &frag,                       ///< fragment to store from the tensor
-    Index byte_offset) const {                  ///< store a tile with a linear offset
-
-    store_with_pointer_offset(byte_offset / sizeof(Element));
-  }
-
-  /// Stores a fragment to memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void store(
-    Fragment &frag,                             ///< fragment to store to the tensor
-    TensorCoord const &tile_offset) const {     ///< stores a tile with a logical offset in units of whole tiles
-
-    store(frag, tile_offset, 0);
-  }
-
-  /// Stores a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void store(
-      /// fragment to store to the tensor
-      Fragment const &frag,
-      /// stores a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// stores a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-    store_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_mixed_input_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_mixed_input_tensor_op.h
deleted file mode 100755
index f553fbde9..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_mixed_input_tensor_op.h
+++ /dev/null
@@ -1,566 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing warp-level matrix multiply-accumulate operations targeting
-      Tensor Cores.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/platform/platform.h"
-
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-
-#include "cutlass/arch/memory_sm75.h"
-#include "cutlass/arch/mma_sm75.h" 
-#include "cutlass/arch/mma_sm80.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/warp/mma.h"
-
-#include "cutlass/gemm/warp/mma_tensor_op_policy.h"
-
-#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h"
-#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace warp {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace detail {
-
-////////////////////////////////////////////////////////////////////////////////
-// Shuffle registers for layout conversion
-////////////////////////////////////////////////////////////////////////////////
-template <
-  /// Element type for the operand in registers for the mma.sync
-  typename ElementMma_, 
-  /// Element type for the operand in shared memory for ldmatrix
-  typename ElementLoad_,
-  /// Number of mma.sync operations performed along rows or columns         
-  int NumMmaInstructions,
-  /// Number of elements in warp fragment
-  int NumElementsInWarpFragment,
-  /// Number of elements in mma fragment
-  int NumElementsInMmaFragment,
-  /// Identifies A or B multiplicand
-  Operand Operand_,
-  ///
-  typename Enable = void >
-struct FragmentShuffler {
-  public:
-  using ElementMma = ElementMma_;
-  using ElementLoad = ElementLoad_;
-
-  static int const kNumMmaInstructions = NumMmaInstructions;
-  static int const kNumElementsInWarpFragment = NumElementsInWarpFragment;
-  static int const kNumElementsInMmaFragment = NumElementsInMmaFragment;
-  static Operand const kOperand = Operand_;
-
-  using WarpFragment = Array<ElementLoad, kNumElementsInWarpFragment>;
-  using MmaFragment = Array<ElementLoad, kNumElementsInMmaFragment>;
-
-  CUTLASS_DEVICE
-  WarpFragment operator()(WarpFragment const &src) {
-    return src;
-  }
-};
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for `mma.sync` on 16b (F16/BF16) and `ldmatrix` on 8b (S8/U8)
-/// or for `mma.sync` on 8b (S8/U8) and `ldmatrix` on 4b (S4/U4)
-/// for operand A multiplicand going through upcasting. 
-template <
-  /// Element type for the operand in registers for the mma.sync
-  typename ElementMma_, 
-  /// Element type for the operand in shared memory for ldmatrix
-  typename ElementLoad_,
-  /// Number of mma.sync operations performed along rows or columns         
-  int NumMmaInstructions,
-  /// Number of elements in warp fragment
-  int NumElementsInWarpFragment,
-  /// Number of elements in mma fragment
-  int NumElementsInMmaFragment
-> 
-struct FragmentShuffler <ElementMma_, ElementLoad_,
-                         NumMmaInstructions, 
-                         NumElementsInWarpFragment, 
-                         NumElementsInMmaFragment,
-                         Operand::kA,
-                         typename platform::enable_if<(sizeof_bits<ElementMma_>::value /
-                                                 sizeof_bits<ElementLoad_>::value == 2)>::type> {
-public:
-  using ElementMma = ElementMma_;
-  using ElementLoad = ElementLoad_;
-
-  static int const kNumMmaInstructions = NumMmaInstructions;
-  static int const kNumElementsInWarpFragment = NumElementsInWarpFragment;
-  static int const kNumElementsInMmaFragment = NumElementsInMmaFragment;
-  static Operand const kOperand = Operand::kA;
-
-  using WarpFragment = Array<ElementLoad, kNumElementsInWarpFragment>;
-  using MmaFragment = Array<ElementLoad, kNumElementsInMmaFragment>;
-
-  static uint32_t const kSelectBytesEvenThread = 0x5410;
-  static uint32_t const kSelectBytesOddThread = 0x7632;
-
-private:
-  int delta_up_;
-  int delta_down_;
-  int odd_even_lane_id_;
-  uint32_t byte_selector_;
-
-public:
-  CUTLASS_DEVICE
-  FragmentShuffler() {
-    int lane_id = cutlass::arch::LaneId();
-    delta_up_ = (lane_id & 1) + ((lane_id & 2) >> 1);
-    delta_down_ = 2 - delta_up_;
-    odd_even_lane_id_ = static_cast<int>(lane_id & 1);
-    byte_selector_ = odd_even_lane_id_ * kSelectBytesOddThread +
-                    (1 - odd_even_lane_id_) * kSelectBytesEvenThread;
-  }
-
-  CUTLASS_DEVICE
-  WarpFragment operator()(WarpFragment const &src) {
-
-    WarpFragment result;
-    MmaFragment const* mma_frag_src_ptr = reinterpret_cast<MmaFragment const*>(&src);
-    MmaFragment* mma_frag_dst_ptr = reinterpret_cast<MmaFragment*>(&result);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < kNumMmaInstructions; n++) {
-
-        uint32_t const* src_ptr = reinterpret_cast<uint32_t const *>(&mma_frag_src_ptr[n]);
-        uint32_t *dst_ptr = reinterpret_cast<uint32_t *>(&mma_frag_dst_ptr[n]);
-
-        // Shuffle data within the warp, pull from other threads within the warp
-        uint32_t tmp0 = __shfl_up_sync(0xFFFFFFFF, src_ptr[0], delta_up_);
-        uint32_t tmp1 = __shfl_down_sync(0xFFFFFFFF, src_ptr[0], delta_down_);
-        uint32_t tmp2 = __shfl_up_sync(0xFFFFFFFF, src_ptr[1], delta_up_);
-        uint32_t tmp3 = __shfl_down_sync(0xFFFFFFFF, src_ptr[1], delta_down_);
-
-        // Reorder the data within the 32-bit word (4x8b) required for mma.sync
-        dst_ptr[0] = __byte_perm(tmp0, tmp2, byte_selector_);
-        dst_ptr[1] = __byte_perm(tmp1, tmp3, byte_selector_);
-    }
-
-    return result;
-  }
-
-};
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for `mma.sync` on 16b (F16/BF16) and `ldmatrix` on 8b (S8/U8)
-/// or for `mma.sync` on 8b (S8/U8) and `ldmatrix` on 4b (S4/U4)
-/// for operand B multiplicand going through upcasting. 
-template <
-  /// Element type for the operand in registers for the mma.sync
-  typename ElementMma_, 
-  /// Element type for the operand in shared memory for ldmatrix
-  typename ElementLoad_,
-  /// Number of mma.sync operations performed along rows or columns         
-  int NumMmaInstructions,
-  /// Number of elements in warp fragment
-  int NumElementsInWarpFragment,
-  /// Number of elements in mma fragment
-  int NumElementsInMmaFragment
-> 
-struct FragmentShuffler <ElementMma_, ElementLoad_,
-                         NumMmaInstructions, 
-                         NumElementsInWarpFragment, 
-                         NumElementsInMmaFragment,
-                         Operand::kB,
-                         typename platform::enable_if<(sizeof_bits<ElementMma_>::value /
-                                                 sizeof_bits<ElementLoad_>::value == 2)>::type> {
-public:
-  using ElementMma = ElementMma_;
-  using ElementLoad = ElementLoad_;
-
-  static int const kNumMmaInstructions = NumMmaInstructions;
-  static int const kNumElementsInWarpFragment = NumElementsInWarpFragment;
-  static int const kNumElementsInMmaFragment = NumElementsInMmaFragment;
-  static Operand const kOperand = Operand::kB;
-
-  using WarpFragment = Array<ElementLoad, kNumElementsInWarpFragment>;
-  using MmaFragment = Array<ElementLoad, kNumElementsInMmaFragment>;
-
-  static uint32_t const kSelectBytesEvenThread = 0x5410;
-  static uint32_t const kSelectBytesOddThread = 0x7632;
-
-private:
-  int delta_up_;
-  int delta_down_;
-  int odd_even_lane_id_;
-  uint32_t byte_selector_;
-
-public:
-  CUTLASS_DEVICE
-  FragmentShuffler() {
-    int lane_id = cutlass::arch::LaneId();
-    delta_up_ = (lane_id & 1) + ((lane_id & 2) >> 1);
-    delta_down_ = 2 - delta_up_;
-    odd_even_lane_id_ = static_cast<int>(lane_id & 1);
-    byte_selector_ = odd_even_lane_id_ * kSelectBytesOddThread +
-                    (1 - odd_even_lane_id_) * kSelectBytesEvenThread;
-  }
-
-  CUTLASS_DEVICE
-  WarpFragment operator()(WarpFragment const &src) {
-
-    WarpFragment result;
-
-    MmaFragment const* mma_frag_src_ptr = reinterpret_cast<MmaFragment const *>(&src);
-    MmaFragment* mma_frag_dst_ptr = reinterpret_cast<MmaFragment *>(&result);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < kNumMmaInstructions; n++) {
-
-        uint32_t const* src_ptr = reinterpret_cast<uint32_t const*>(&mma_frag_src_ptr[n]);
-        uint32_t* dst_ptr = reinterpret_cast<uint32_t*>(&mma_frag_dst_ptr[n]);
-
-        // Shuffle data within the warp, pull from other threads within the warp
-        uint32_t tmp0 = __shfl_up_sync(0xFFFFFFFF, src_ptr[0], delta_up_);
-        uint32_t tmp1 = __shfl_down_sync(0xFFFFFFFF, src_ptr[0], delta_down_);
-
-        // Reorder the data within the 32-bit word (4x8b) required for mma.sync
-        dst_ptr[0] = __byte_perm(tmp0, tmp1, byte_selector_);
-    }
-
-    return result;
-  }
-
-};
-
-////////////////////////////////////////////////////////////////////////////////
-// Data type conversion
-////////////////////////////////////////////////////////////////////////////////
-template <
-  /// Destination type
-  typename ElementDst_, 
-  /// Source type
-  typename ElementSrc_,
-  /// Number of elements
-  int N,
-  ///
-  typename Enable = void> 
-struct FragmentConverter {
-
-  using ElementDst = ElementDst_;
-  using ElementSrc = ElementSrc_;
-
-  // Operand fragment registers in destination and source types
-  using DestinationFragment = Array<ElementDst, N>;
-  using SourceFragment = Array<ElementSrc, N>;
-
-  FastNumericArrayConverter<ElementDst, ElementSrc, N> convert;
-
-  CUTLASS_DEVICE
-  DestinationFragment operator()(SourceFragment const &src) const {
-    return convert(src);
-  }
-};
-////////////////////////////////////////////////////////////////////////////////
-
-// Partial specialization for when Destination type is the *same* as 
-// Source type
-template <
-  /// Data type
-  typename Element,
-  /// Number of elements
-  int N,
-  /// 
-  typename Enable>
-struct FragmentConverter<Element, Element, N, Enable> {
-
-  using DestinationFragment = Array<Element, N>;
-  using SourceFragment = Array<Element, N>;
-
-  CUTLASS_DEVICE
-  DestinationFragment operator()(SourceFragment const &src) const {
-    return src;
-  }
-};
-
-} // namespace detail
-
-/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions.
-template <
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  typename Shape_,
-  /// Data type of A elements
-  typename ElementA_,
-  /// Layout of A matrix (concept: MatrixLayout)
-  typename LayoutA_,
-  /// Data type of B elements
-  typename ElementB_,
-  /// Layout of B matrix (concept: MatrixLayout)
-  typename LayoutB_,
-  /// Element type of C matrix
-  typename ElementC_,
-  /// Layout of C matrix (concept: MatrixLayout)
-  typename LayoutC_,
-  /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
-  typename Policy_,
-  /// Number of partitions along K dimension
-  int PartitionsK_ = 1,
-  /// Store the accumulators in row major or column major.  Row major is used
-  /// when output layout is interleaved.
-  bool AccumulatorsInRowMajor = false,
-  /// Used for partial specialization
-  typename Enable = bool
->
-class MmaMixedInputTensorOp {
-public:
-  /// Shape of warp-level matrix operation (concept: GemmShape)
-  using Shape = Shape_;
-
-  /// Data type of multiplicand A
-  using ElementA = ElementA_;
-
-  /// Layout of multiplicand A
-  using LayoutA = LayoutA_;
-
-  /// Data type of multiplicand B
-  using ElementB = ElementB_;
-
-  /// Layout of multiplicand B
-  using LayoutB = LayoutB_;
-
-  /// Data type of accumulator matrix C
-  using ElementC = ElementC_;
-
-  /// Layout of accumulator matrix C
-  using LayoutC = LayoutC_;
-
-  /// Shape of the warp in units of thread (concept: MmaLanePolicySimt)
-  using Policy = Policy_;
-
-  /// Underlying matrix multiply operator (concept: arch::Mma)
-  using ArchMmaOperator = typename Policy::Operator;
-
-  /// Underlying arch::Mma instruction datatype for A operand
-  using ElementAMma = typename ArchMmaOperator::ElementA;
-
-  /// Underlying arch::Mma instruction datatype for B operand
-  using ElementBMma = typename ArchMmaOperator::ElementB;
-
-  /// Underlying arch::Mma instruction datatype for C operand
-  using MmaElementC = typename ArchMmaOperator::ElementC;
-
-  /// Indicates math operator 
-  using MathOperator = typename ArchMmaOperator::Operator;
-
-  /// Architecture tag from underlying instruction
-  using ArchTag = typename ArchMmaOperator::ArchTag;
-
-  /// Indicates class of matrix operator
-  using OperatorClass = arch::OpClassTensorOp;
-
-  /// Shape of underlying instruction
-  using InstructionShape = typename ArchMmaOperator::Shape;
-
-  /// Complex transform on A operand
-  static ComplexTransform const kTransformA = ComplexTransform::kNone;
-
-  /// Complex transform on B operand
-  static ComplexTransform const kTransformB = ComplexTransform::kNone;
-
-  /// Number of threads participating in warp-level matrix product
-  static int const kThreadCount = 32;
-
-  /// Number of partitions along K dimension
-  static int const kPartitionsK = PartitionsK_;
-
-  /// 
-  // static int const kLoadShapeK = InstructionShape::kK * 
-  //  (sizeof_bits<ElementAMma>::value / sizeof_bits<ElementB>::value);
-
-public:
-
-  /// Iterates over the A operand in Shared Memory
-  using IteratorA = MmaTensorOpMultiplicandTileIterator<
-     MatrixShape<Shape::kM, Shape::kK>, Operand::kA, ElementA, LayoutA,
-     MatrixShape<ArchMmaOperator::Shape::kM, ArchMmaOperator::Shape::kK>,
-     Policy::OpDelta::kRow, kThreadCount, kPartitionsK>;
-
-  /// Storage for A tile in registers (loaded from Shared Memory)
-  using FragmentA = typename IteratorA::Fragment;
-
-  /// Storage for transformed A tile in registers (for use in Mma instruction)
-  using TransformedFragmentA =
-      Array<ElementAMma, FragmentA::kElements>;
-
-  /// Underlying arch::Mma instruction operand fragement for matrix A
-  using MmaOperandA = typename ArchMmaOperator::FragmentA;
-
-  /// Iterates over the B operand in Shared Memory
-  using IteratorB = MmaTensorOpMultiplicandTileIterator<
-      MatrixShape<Shape::kK, Shape::kN>, Operand::kB, ElementB, LayoutB,
-      MatrixShape<ArchMmaOperator::Shape::kK, ArchMmaOperator::Shape::kN>,
-      Policy::OpDelta::kRow, kThreadCount, kPartitionsK>;
-
-  /// Storage for B tile in registers (loaded from Shared Memory)
-  using FragmentB = typename IteratorB::Fragment;
-
-  /// Storage for transformed B tile in registers (for use in Mma instruction)
-  using TransformedFragmentB =
-      Array<ElementBMma, FragmentB::kElements>;
-
-  /// Underlying arch::Mma instruction operand fragement for matrix B
-  using MmaOperandB = typename ArchMmaOperator::FragmentB;
-
-  /// Iterates over the C operand in memory
-  using IteratorC = MmaTensorOpAccumulatorTileIterator<
-     MatrixShape<Shape::kM, Shape::kN>, ElementC, LayoutC,
-     typename ArchMmaOperator::Shape, typename Policy::OpDelta>;
-
-  /// Storage for C tile
-  using FragmentC = typename IteratorC::Fragment;
-
-  /// Underlying arch::Mma instruction operand fragement for matrix C
-  using MmaOperandC = typename ArchMmaOperator::FragmentC;
-
-  /// Number of mma operations performed
-  using MmaIterations = MatrixShape<
-    (Shape::kM + ArchMmaOperator::Shape::kM - 1) / ArchMmaOperator::Shape::kM,
-    (Shape::kN + ArchMmaOperator::Shape::kN - 1) / ArchMmaOperator::Shape::kN
-  >;
-
-
-public:
-
-  /// Underlying matrix multiply operator (concept: arch::Mma)
-  ArchMmaOperator mma;
-
-public:
-
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_DEVICE
-  MmaMixedInputTensorOp() {}
-
-    /// Performs a warp-level matrix multiply-accumulate operation
-  CUTLASS_DEVICE
-  void operator()(
-    FragmentC &D, 
-    TransformedFragmentA const &A, 
-    TransformedFragmentB const &B, 
-    FragmentC const &C
-  ) const {
-
-    D = C;
-
-    MmaOperandA const *ptr_A = reinterpret_cast<MmaOperandA const *>(&A);
-    MmaOperandB const *ptr_B = reinterpret_cast<MmaOperandB const *>(&B);
-    MmaOperandC *ptr_D = reinterpret_cast<MmaOperandC *>(&D);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int m = 0; m < MmaIterations::kRow; ++m) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < MmaIterations::kColumn; ++n) {
-
-        int n_serpentine = ((m % 2) ? (MmaIterations::kColumn - 1 - n) : n);
-
-        if (AccumulatorsInRowMajor) {  // matrix B is reordered
-          mma(
-            ptr_D[n_serpentine + m * MmaIterations::kColumn],
-            ptr_A[m],
-            ptr_B[n_serpentine],
-            ptr_D[n_serpentine + m * MmaIterations::kColumn]);
-        } else {
-          mma(ptr_D[m + n_serpentine * MmaIterations::kRow],
-              ptr_A[m],
-              ptr_B[n_serpentine],
-              ptr_D[m + n_serpentine * MmaIterations::kRow]);
-        }
-      }
-    }
-  }
-
-  /// Transform the operand warp fragment register to the required data types and layout 
-  /// for the `cultass::arch::Mma`
-  CUTLASS_DEVICE
-  void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B,
-                 FragmentA const &A, FragmentB const &B) const {
-
-    // Shuffle data within warp to obtain the mma.sync operand layout
-    detail::FragmentShuffler<ElementBMma, ElementB, MmaIterations::kColumn, 
-             FragmentB::kElements, MmaOperandB::kElements, Operand::kB> shuffler_B;
-    FragmentB tmp_B; 
-    tmp_B = shuffler_B(B);
-
-    // Convert the B operand to the Mma Instruction operand type
-    detail::FragmentConverter<ElementBMma, ElementB, FragmentB::kElements> convert_B;
-    dst_B = convert_B(tmp_B);
-
-    FragmentA tmp_A;
-
-    Array<ElementA, FragmentA::kElements / 2> *
-        ptr_tmp_A = reinterpret_cast<Array<ElementA,
-                                             FragmentA::kElements / 2> *>(&tmp_A);
-    Array<ElementAMma, FragmentA::kElements / 2> *
-        ptr_dst_A = reinterpret_cast<Array<ElementAMma,
-                                             FragmentA::kElements / 2> *>(&dst_A);
-
-    // Shuffle data within warp to obtain the mma.sync operand layout
-    detail::FragmentShuffler<ElementAMma, ElementA, MmaIterations::kRow,
-             FragmentA::kElements, MmaOperandA::kElements, Operand::kA> shuffler_A;
-
-    // Convert the A operand to the Mma Instruction operand type
-    detail::FragmentConverter<ElementAMma, ElementA, FragmentA::kElements / 2> convert_A;
-
-    tmp_A = shuffler_A(A);
-    ptr_dst_A[0] = convert_A(ptr_tmp_A[0]);
-
-    ptr_dst_A[1] = convert_A(ptr_tmp_A[1]);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_planar_complex.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_planar_complex.h
deleted file mode 100755
index c5dcfb7c0..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_planar_complex.h
+++ /dev/null
@@ -1,182 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing warp-level matrix multiply-accumulate operations.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/complex.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/gemm/gemm.h"
-
-#include "cutlass/array_planar_complex.h"
-#include "cutlass/gemm/warp/tile_iterator_planar_complex.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace warp {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  /// Underlying real-valued warp-level matrix multiply
-  typename Operator_,
-  /// Transformation applied to A operand (typically folded into math instruction)
-  ComplexTransform TransformA = ComplexTransform::kNone,
-  /// Transformation applied to B operand (typically folded into math instruction)
-  ComplexTransform TransformB = ComplexTransform::kNone
->
-class MmaPlanarComplex {
-public:
-
-  /// Underlying real-valued warp-level matrix multiply
-  using Operator = Operator_;
-
-  /// Shape of warp-level matrix multipy
-  using Shape = typename Operator::Shape;
-
-  /// Transformation applied to A operand (typically folded into math instruction)
-  static ComplexTransform const kTransformA = TransformA;
-
-  /// Transformation applied to B operand (typically folded into math instruction)
-  static ComplexTransform const kTransformB = TransformB;
-
-  /// Fragment of elements
-  using FragmentA = ArrayPlanarComplex<typename Operator::ElementA, Operator::FragmentA::kElements>;
-
-  /// Iterator into planar complex
-  using IteratorA = TileIteratorPlanarComplex<typename Operator::IteratorA>;
-
-  /// Layout in memory of the A operand
-  using LayoutA = typename Operator::LayoutA;
-
-  using FragmentB = ArrayPlanarComplex<typename Operator::ElementB, Operator::FragmentB::kElements>;
-
-  /// Iterator into planar complex
-  using IteratorB = TileIteratorPlanarComplex<typename Operator::IteratorB>;
-
-  /// Layout in memory of the B operand
-  using LayoutB = typename Operator::LayoutB;
-
-  /// Tile iterator for accumulator
-  using IteratorC = TileIteratorPlanarComplex<typename Operator::IteratorC>;
-
-  /// Accumulator fragment
-  using FragmentC = ArrayPlanarComplex<typename Operator::ElementC, Operator::FragmentC::kElements>;
-
-  /// Layout of accumulator fragment in memory
-  using LayoutC = typename Operator::LayoutC;
-
-private:
-
-    /// Number of mma operations performed
-  using MmaIterations = MatrixShape<
-    Operator::Shape::kM / Operator::Policy::Operator::Shape::kM,
-    Operator::Shape::kN / Operator::Policy::Operator::Shape::kN
-  >;
-
-public:
-  /// Ctor
-  CUTLASS_DEVICE
-  MmaPlanarComplex() {}
-
-  /// Performs a warp-level matrix multiply-accumulate operation
-  CUTLASS_DEVICE
-  void operator()(
-    FragmentC &D, 
-    FragmentA const &A_in, 
-    FragmentB const &B_in, 
-    FragmentC const &C) const {
-
-    D.real = C.real;
-    D.imag = C.imag;
-
-    //
-    // Transform fragments based on conjugate operations.
-    //
-
-    negate<typename FragmentA::ArrayReal> neg_A;
-
-    FragmentA frag_A;
-    frag_A.real = A_in.real;
-
-    if (kTransformA == ComplexTransform::kConjugate) {
-      frag_A.imag = neg_A(frag_A.imag);
-    }
-    else {
-      frag_A.imag = frag_A.imag;
-    }
-
-    FragmentB frag_B;
-    frag_B.real = B_in.real;
-
-    if (kTransformB == ComplexTransform::kConjugate) {
-      negate<typename FragmentB::ArrayReal> neg;
-      frag_B.imag = neg(frag_B.imag);
-    }
-    else {
-      frag_B.imag = frag_B.imag;
-    }
-
-    //
-    // Accumulated real-valued matrix multiplies
-    //
-
-    Operator real_mma;
-
-    // D.i += A.i * B.r
-    real_mma(D.imag, frag_A.imag, frag_B.real, D.imag);
-
-    // D.r += A.r * B.r
-    real_mma(D.real, frag_A.real, frag_B.real, D.real);
-
-    // D.i += A.r * B.i
-    real_mma(D.imag, frag_A.real, frag_B.imag, D.imag);
-
-    // D.r += -A.i * B.i
-    frag_A.imag = neg_A(frag_A.imag);
-    real_mma(D.real, frag_A.imag, frag_B.imag, D.real);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_simt.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_simt.h
deleted file mode 100755
index f5f2f063f..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_simt.h
+++ /dev/null
@@ -1,263 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing warp-level matrix multiply-accumulate operations.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/warp/mma.h"
-
-#include "cutlass/gemm/thread/mma.h"
-
-#include "cutlass/gemm/warp/mma_simt_tile_iterator.h"
-#include "cutlass/gemm/warp/mma_simt_policy.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace warp {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions.
-template <
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  typename Shape_,
-  /// Data type of A elements
-  typename ElementA_,
-  /// Layout of A matrix (concept: MatrixLayout)
-  typename LayoutA_,
-  /// Data type of B elements
-  typename ElementB_,
-  /// Layout of B matrix (concept: MatrixLayout)
-  typename LayoutB_,
-  /// Element type of C matrix
-  typename ElementC_,
-  /// Layout of C matrix (concept: MatrixLayout)
-  typename LayoutC_,
-  /// Shape of the warp in units of thread (concept: MmaSimtPolicy)
-  typename Policy_,
-  /// Number of partitions along K dimension
-  int PartitionsK = 1,
-  /// Complex transformation on operand A
-  ComplexTransform TransformA = ComplexTransform::kNone,
-  /// Complex transformation on operand B
-  ComplexTransform TransformB = ComplexTransform::kNone,
-  /// Used for partial specialization
-  typename Enable = bool
->
-class MmaSimt {
-public:
-  /// Shape of warp-level matrix operation (concept: GemmShape)
-  using Shape = Shape_;
-
-  /// Data type of multiplicand A
-  using ElementA = ElementA_;
-
-  /// Layout of multiplicand A
-  using LayoutA = LayoutA_;
-
-  /// Data type of multiplicand B
-  using ElementB = ElementB_;
-
-  /// Layout of multiplicand B
-  using LayoutB = LayoutB_;
-
-  /// Data type of accumulator matrix C
-  using ElementC = ElementC_;
-
-  /// Layout of accumulator matrix C
-  using LayoutC = LayoutC_;
-
-  /// Shape of the warp in units of thread (concept: MmaLanePolicySimt)
-  using Policy = Policy_;
-
-  /// Indicates class of matrix operator
-  using OperatorClass = arch::OpClassSimt;
-
-  /// Hard-coded for now
-  using ArchTag = arch::Sm50;
-
-  /// Complex transform on A operand
-  static ComplexTransform const kTransformA = TransformA;
-
-  /// Complex transform on B operand
-  static ComplexTransform const kTransformB = TransformB;
-
-  /// Layout of threads
-  using ThreadLayoutA = typename platform::conditional< platform::is_same< layout::ColumnMajorInterleaved<4>, LayoutA >::value,
-                  layout::ColumnMajor,
-                  typename platform::conditional < platform::is_same< layout::RowMajorInterleaved<4>, LayoutA >::value,
-                      layout::RowMajor,
-                      LayoutA>::type
-                 >::type;
-  
-  using ThreadLayoutB = typename platform::conditional< platform::is_same< layout::ColumnMajorInterleaved<4>, LayoutB >::value,
-                  layout::ColumnMajor,
-                  typename platform::conditional < platform::is_same< layout::RowMajorInterleaved<4>, LayoutB >::value,
-                      layout::RowMajor,
-                      LayoutB>::type
-                 >::type;
-
-  static constexpr bool use_dp4a = (platform::is_same< layout::ColumnMajorInterleaved<4>, LayoutA>::value || 
-                                    platform::is_same< layout::RowMajorInterleaved<4>, LayoutA >::value) && 
-                                    platform::is_same< ElementA, int8_t >::value && 
-                                    platform::is_same< ElementB, int8_t >::value;
-
-  using dp4a_type = typename platform::conditional< use_dp4a , int8_t, bool >::type;
-
-  /// Thread-level matrix multiply accumulate operator
-  using ThreadMma = thread::Mma<
-    GemmShape<
-      Shape::kM / Policy::WarpShape::kRow,
-      Shape::kN / Policy::WarpShape::kColumn,
-      Policy::LaneMmaShape::kK>,
-    ElementA,
-    ThreadLayoutA,
-    ElementB,
-    ThreadLayoutB,
-    ElementC,
-    LayoutC,
-    arch::OpMultiplyAdd,
-    dp4a_type
-  >;
-
-  /// Underlying matrix multiply operator (concept: arch::Mma)
-  using ArchMmaOperator = typename ThreadMma::ArchMmaOperator;
-
-  /// Indicates math operator 
-  using MathOperator = typename ArchMmaOperator::Operator;
-  
-  /// Shape of the underlying instruction
-  using InstructionShape = GemmShape<1,1,use_dp4a ? 4 : 1>;
-
-public:
-
-  /// Iterates over the A operand in memory
-  using IteratorA = MmaSimtTileIterator<
-    MatrixShape<Shape::kM, Policy::LaneMmaShape::kK>,
-    Operand::kA,
-    ElementA,
-    LayoutA,
-    Policy,
-    PartitionsK,
-    Shape::kK
-  >;
-
-  /// Storage for A tile
-  using FragmentA = typename IteratorA::Fragment;
-
-  /// Storage for transformed A tile
-  using TransformedFragmentA = FragmentA;
-
-  /// Iterates over the B operand in memory
-  using IteratorB = MmaSimtTileIterator<
-    MatrixShape<Policy::LaneMmaShape::kK, Shape::kN>,
-    Operand::kB,
-    ElementB,
-    LayoutB,
-    Policy,
-    PartitionsK,
-    Shape::kK
-  >;
-
-  /// Storage for B tile
-  using FragmentB = typename IteratorB::Fragment;
-
-  /// Storage for transformed A tile
-  using TransformedFragmentB = FragmentB;
-
-  /// Iterates over the C operand in memory
-  using IteratorC = MmaSimtTileIterator<
-    MatrixShape<Shape::kM, Shape::kN>,
-    Operand::kC,
-    ElementC,
-    LayoutC,
-    Policy
-  >;
-
-  /// Storage for C tile
-  using FragmentC = typename ThreadMma::FragmentC;
-
-public:
-
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_DEVICE
-  MmaSimt() {}
-
-  /// Performs a warp-level matrix multiply-accumulate operation
-  CUTLASS_DEVICE
-  void operator()(
-    FragmentC &d, 
-    FragmentA a, 
-    FragmentB b, 
-    FragmentC const &c, int group_idx = 0) const {
-
-    ThreadMma mma;
-
-    if (kTransformA == ComplexTransform::kConjugate) {
-      conjugate<FragmentA> conj_a;
-      a = conj_a(a);
-    }
-
-    if (kTransformB == ComplexTransform::kConjugate) {
-      conjugate<FragmentB> conj_b;
-      b = conj_b(b);
-    }
-
-    mma(d, a, b, c);
-  }
-
-  /// Transform the mma operands to the required types
-  CUTLASS_DEVICE
-  void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B,
-                 FragmentA const &A, FragmentB const &B) const {
-    dst_A = A;
-    dst_B = B;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace gemm
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_simt_policy.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_simt_policy.h
deleted file mode 100755
index 8da3b9f86..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_simt_policy.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Describes the lane policy used by warp-level matrix multiply operators targeting SIMT
-      instructions
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-namespace cutlass {
-namespace gemm {
-namespace warp {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Describes the arrangement and configuration of per-lane operations in warp-level matrix multiply 
-template <
-  typename WarpShape_,              ///< shape of the warp in lanes (concept: MatrixShape)
-  typename LaneLayout_,             ///< layout function of lanes
-  typename LaneMmaShape_            ///< size of each lane's thread-level matrix product (concept: GemmShape)
->
-struct MmaSimtPolicy {
-  using WarpShape = WarpShape_;
-  using LaneLayout = LaneLayout_;
-  using LaneMmaShape = LaneMmaShape_;
-  using MmaShape = LaneMmaShape;
-
-  /// Returns a layout functor mapping lane position in the warp to thread ID
-  CUTLASS_HOST_DEVICE
-  static LaneLayout get_lane_layout() {
-    return LaneLayout::packed({WarpShape::kRow, WarpShape::kColumn});
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace gemm
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_simt_tile_iterator.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_simt_tile_iterator.h
deleted file mode 100755
index 6b0647ffd..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_simt_tile_iterator.h
+++ /dev/null
@@ -1,1890 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Describes the lane policy used by warp-level matrix multiply operators targeting SIMT
-      instructions
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/matrix_shape.h"
-
-#include "cutlass/arch/memory_sm75.h"
-
-#include "cutlass/layout/matrix.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/warp/mma_simt_policy.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace warp {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Iterates over operands to warp-level matrix multiply operations targeting SIMT instructions
-///
-/// concept: MutableRandomAccessContiguousTileIteratorConcept
-///
-template <
-  /// Size of the matrix to load (concept: MatrixShape)
-  typename Shape_,
-  /// Operand identity
-  Operand Operand,
-  /// Data type of A elements
-  typename Element_,
-  /// Layout of operand
-  typename Layout_,
-  /// Shape of the warp in units of thread (concept: MmaSimtPolicy)
-  typename Policy_,
-  /// Number of partitions along K dimension - used in sliced-K
-  int PartitionsK = 1,
-  /// Group Size along kPartition - used in sliced-K
-  int PartitionGroupSize = 1
->
-class MmaSimtTileIterator;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization for A operands of column-major layouts
-///
-/// Concept: MutableRandomAccessContiguousTileIteratorConcept
-///
-template <
-  /// Size of the matrix to load (concept: MatrixShape)
-  typename Shape_,
-  /// Data type of A elements
-  typename Element_,
-  /// Shape of the warp in units of thread (concept: MmaSimtPolicy)
-  typename Policy_,
-  /// Number of partitions along K dimension - used in sliced-K
-  int PartitionsK,
-  /// Group Size along kPartition - used in sliced-K
-  int PartitionGroupSize
->
-class MmaSimtTileIterator<Shape_, Operand::kA, Element_, layout::ColumnMajor, Policy_, PartitionsK, PartitionGroupSize> {
-public:
-
-  /// Shape of tile to load (concept: MatrixShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand::kA;
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of policy
-  using Layout = layout::ColumnMajor;
-
-  /// Decomposition of elements among threads
-  using Policy = Policy_;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  //
-  // Derived quantities
-  //
-
-  static_assert(!(Shape::kRow % Policy::WarpShape::kRow), 
-    "The warp-level GEMM M size must be divisible by the number of threads arranged along the M dimension.");
-
-  static_assert(Shape::kRow > 0, "Shape::kRow must be greater than zero.");
-  static_assert(Shape::kColumn > 0, "Shape::kColumn must be greater than zero.");
-  static_assert(Policy::WarpShape::kRow > 0, "Policy::WarpShape::kRow must be greater than zero.");
-  static_assert(Shape::kRow / Policy::WarpShape::kRow > 0, "Shape::kRow / Policy::WarpShape::kRow must be greater than zero.");
-
-  /// Thread-level shape of a fragment
-  using ThreadShape = MatrixShape<
-    Shape::kRow / Policy::WarpShape::kRow,
-    Shape::kColumn
-  >;
-
-  static_assert(!(ThreadShape::kRow % Policy::LaneMmaShape::kM), 
-    "Thread-level GEMM must be divisible by Policy::LaneMmaShape.");
-
-  /// Number of individual loads
-  using Iterations = MatrixShape<
-    ThreadShape::kRow / Policy::LaneMmaShape::kM,
-    ThreadShape::kColumn
-  >;
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = Array<Element, ThreadShape::kCount>;
-
-private:
-
-  /// Internal reference
-  cutlass::TensorRef<Array<Element, Policy::LaneMmaShape::kM>, layout::ColumnMajor> ref_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator() { }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator(
-    TensorRef ref, 
-    int lane_id
-  ) {
-
-    // compute offset based on thread ID and lane layout
-    typename Policy::LaneLayout lane_layout = Policy::get_lane_layout();
-
-    MatrixCoord lane_offset = lane_layout.inverse(lane_id) * 
-      MatrixCoord(Policy::LaneMmaShape::kM, 0);
-
-    ref.add_coord_offset(lane_offset);
-
-    ref_.reset(
-      reinterpret_cast<Array<Element, Policy::LaneMmaShape::kM> *>(ref.data()),
-      ref.stride(0) / Policy::LaneMmaShape::kM);
-  }
-  
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator &add_pointer_offset(LongIndex offset) {
-    ref_.add_pointer_offset(offset);
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator &add_tile_offset(TensorCoord const &coord) {
-
-    ref_.add_coord_offset({
-      coord.row() * Shape::kRow / Policy::LaneMmaShape::kM, 
-      coord.column() * Shape::kColumn});
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator & operator++() {
-
-    ref_.add_coord_offset({0, Shape::kColumn});
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator & operator--() {
-
-    ref_.add_coord_offset({0, -Shape::kColumn});
-
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator. (vector loads)
-  CUTLASS_HOST_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
-    Array<Element, Policy::LaneMmaShape::kM> *dst_ptr = 
-      reinterpret_cast<Array<Element, Policy::LaneMmaShape::kM> *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int k = 0; k < Iterations::kColumn; ++k) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int m = 0; m < Iterations::kRow; ++m) {
-
-        // This logic has been replaced with calls to inline PTX to guarantee vectorization.
-        #if 0
-        dst_ptr[m + k * Iterations::kRow] = 
-          *(ref_.data() + ref_.offset({m * Policy::WarpShape::kRow, k}) + pointer_offset / Policy::LaneMmaShape::kM);
-        #endif
-
-        auto ptr = ref_.data() + ref_.offset({m * Policy::WarpShape::kRow, k}) + pointer_offset / Policy::LaneMmaShape::kM;
-        arch::shared_load(dst_ptr[m + k * Iterations::kRow], ptr);
-      }
-    }
-  }
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-    load_with_pointer_offset(frag, 0);
-  }
-    
-  /// Stores a fragment to memory at the location pointed to by the iterator
-  CUTLASS_HOST_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const {
-    
-    Array<Element, Policy::LaneMmaShape::kM> const *src_ptr = 
-      reinterpret_cast<Array<Element, Policy::LaneMmaShape::kM> *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int k = 0; k < Iterations::kN; ++k) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int m = 0; m < Iterations::kM; ++m) {
-        *(ref_.data() + ref_.offset(m * Policy::WarpShape::kM, k) + pointer_offset / Policy::LaneMmaShape::kM) = 
-          src_ptr[m + k * Iterations::kM];
-      }
-    }
-  }
-
-  /// Stores a fragment to memory at the location pointed to by the iterator
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag) const {
-    store_with_pointer_offset(frag, 0);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    // no operation here
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization for A operands of row-major layouts
-///
-/// Concept: MutableRandomAccessContiguousTileIteratorConcept
-///
-template <
-  /// Size of the matrix to load (concept: MatrixShape)
-  typename Shape_,
-  /// Data type of A elements
-  typename Element_,
-  /// Shape of the warp in units of thread (concept: MmaSimtPolicy)
-  typename Policy_,
-  /// Number of partitions along K dimension - used in sliced-K
-  int PartitionsK,
-  /// Group Size along kPartition - used in sliced-K
-  int PartitionGroupSize
->
-class MmaSimtTileIterator<Shape_, Operand::kA, Element_, layout::RowMajor, Policy_, PartitionsK, PartitionGroupSize> {
-public:
-
-  /// Shape of tile to load (concept: MatrixShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand::kA;
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of policy
-  using Layout = layout::RowMajor;
-
-  /// Decomposition of elements among threads
-  using Policy = Policy_;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  //
-  // Derived quantities
-  //
-
-  static_assert(!(Shape::kRow % Policy::WarpShape::kRow), 
-    "The warp-level GEMM M size must be divisible by the number of threads arranged along the M dimension.");
-
-  static_assert(Shape::kRow > 0, "Shape::kRow must be greater than zero.");
-  static_assert(Shape::kColumn > 0, "Shape::kColumn must be greater than zero.");
-  static_assert(Policy::WarpShape::kRow > 0, "Policy::WarpShape::kRow must be greater than zero.");
-  static_assert(Shape::kRow / Policy::WarpShape::kRow > 0, "Shape::kRow / Policy::WarpShape::kRow must be greater than zero.");
-
-  /// Thread-level shape of a fragment
-  using ThreadShape = MatrixShape<
-    Shape::kRow / Policy::WarpShape::kRow,
-    Shape::kColumn
-  >;
-
-  static_assert(!(ThreadShape::kRow % Policy::LaneMmaShape::kM), 
-    "Thread-level GEMM must be divisible by Policy::LaneMmaShape.");
-
-  /// Number of individual loads (scalar loads)
-  using Iterations = MatrixShape<
-    ThreadShape::kRow / Policy::LaneMmaShape::kM,
-    ThreadShape::kColumn
-  >;
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = Array<Element, ThreadShape::kCount>;
-
-private:
-
-  /// Internal reference
-  cutlass::TensorRef<Element, layout::RowMajor> ref_;
-
-  /// Extent of tensor
-  MatrixCoord extent_;
-
-  /// Origin
-  MatrixCoord origin_;
-
-  /// Used to conditionally enable extents checking
-  bool divisible_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator() : divisible_(true) { }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator(
-    TensorRef ref, 
-    int lane_id
-  ) : extent_(Shape::kRow, Shape::kColumn), divisible_ (true) {
-
-    // compute offset based on thread ID and lane layout
-    typename Policy::LaneLayout lane_layout = Policy::get_lane_layout();
-
-    MatrixCoord lane_offset = lane_layout.inverse(lane_id) * 
-      MatrixCoord(Policy::LaneMmaShape::kM, 0);
-
-    origin_ = lane_offset;
-
-    ref.add_coord_offset(lane_offset);
-
-    ref_.reset(ref.data(), ref.stride(0));
-
-  }
-  
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator(
-    TensorRef ref,
-    TensorCoord extent, 
-    int lane_id
-  ) : extent_(extent), divisible_ (false) {
-
-    // compute offset based on thread ID and lane layout
-    typename Policy::LaneLayout lane_layout = Policy::get_lane_layout();
-
-    MatrixCoord lane_offset = lane_layout.inverse(lane_id) * 
-      MatrixCoord(Policy::LaneMmaShape::kM, 0);
-
-    origin_ = lane_offset;
-    
-    ref.add_coord_offset(lane_offset);
-
-    ref_.reset(ref.data(), ref.stride(0));
-
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator &add_pointer_offset(LongIndex offset) {
-    ref_.add_pointer_offset(offset);
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator &add_tile_offset(TensorCoord const &coord) {
-
-    TensorCoord coord_offset(
-      coord.row() * Shape::kRow, 
-      coord.column() * Shape::kColumn);
-    
-    origin_ += coord_offset;
-
-    ref_.add_coord_offset(coord_offset);
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator & operator++() {
-
-    ref_.add_coord_offset({0, Shape::kColumn});
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator & operator--() {
-
-    ref_.add_coord_offset({0, -Shape::kColumn});
-
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator. (scalar loads)
-  CUTLASS_HOST_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int k = 0; k < Iterations::kColumn; ++k) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int m = 0; m < Iterations::kRow; ++m) {
-        CUTLASS_PRAGMA_UNROLL
-        for (int i = 0; i < Policy::LaneMmaShape::kM; i++) {
-          
-          MatrixCoord offset(m * Policy::WarpShape::kRow * Policy::LaneMmaShape::kM + i, k);
-            
-          MatrixCoord access_coord = origin_ + offset;
-
-          int frag_idx = m * Policy::LaneMmaShape::kM + i + k * Iterations::kRow;
-
-          if (divisible_ || 
-              (access_coord.row() < extent_.row() && access_coord.column() < extent_.column())) {
-          
-            frag[frag_idx] = *(ref_.data() + ref_.offset(offset) + pointer_offset);
-          }
-          else {
-            frag[frag_idx] = Element();
-          }
-        }
-      }
-    }
-  }
-  /// Loads a fragment from memory at the location pointed to by the iterator. 
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-    load_with_pointer_offset(frag, 0);
-  }
-    
-  /// Stores a fragment to memory at the location pointed to by the iterator
-  CUTLASS_HOST_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const {
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int k = 0; k < Iterations::kColumn; ++k) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int m = 0; m < Iterations::kRow; ++m) {
-        CUTLASS_PRAGMA_UNROLL
-        for (int i = 0; i < Policy::LaneMmaShape::kM; i++) {
-
-          *(ref_.data() + ref_.offset(m * Policy::WarpShape::kM * Policy::LaneMmaShape::kM + i, k) + pointer_offset) = 
-            frag[m * Policy::LaneMmaShape::kM + i + k * Iterations::kM];
-        }
-      }
-    }
-  }
-
-  /// Stores a fragment to memory at the location pointed to by the iterator
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag) const {
-    store_with_pointer_offset(frag, 0);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    // no operation here
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization for B operands of row-major layouts
-///
-/// Concept: MutableRandomAccessContiguousTileIteratorConcept
-///
-template <
-  /// Size of the matrix to load (concept: MatrixShape)
-  typename Shape_,
-  /// Data type of A elements
-  typename Element_,
-  /// Shape of the warp in units of thread (concept: MmaSimtPolicy)
-  typename Policy_,
-  /// Number of partitions along K dimension
-  int PartitionsK,
-  /// Group Size along kPartition - used in sliced-K
-  int PartitionGroupSize
->
-class MmaSimtTileIterator<Shape_, Operand::kB, Element_, layout::RowMajor, Policy_, PartitionsK, PartitionGroupSize> {
-public:
-
-  /// Shape of tile to load (concept: MatrixShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand::kB;
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of policy
-  using Layout = layout::RowMajor;
-
-  /// Decomposition of elements among threads
-  using Policy = Policy_;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  //
-  // Derived quantities
-  //
-
-  static_assert(!(Shape::kColumn % Policy::WarpShape::kColumn), 
-    "The warp-level GEMM N size must be divisible by the number of threads arranged along the N dimension.");
-  
-  static_assert(Shape::kRow > 0, "Shape::kRow must be greater than zero.");
-  static_assert(Shape::kColumn > 0, "Shape::kColumn must be greater than zero.");
-  static_assert(Policy::WarpShape::kColumn > 0, "Policy::WarpShape::kColumn must be greater than zero.");
-  static_assert(Shape::kColumn / Policy::WarpShape::kColumn > 0, "Shape::kColumn / Policy::WarpShape::kColumn must be greater than zero.");
-
-  /// Thread-level shape of a fragment
-  using ThreadShape = MatrixShape<
-    Shape::kRow,
-    Shape::kColumn / Policy::WarpShape::kColumn
-  >;
-
-  static_assert(!(ThreadShape::kColumn % Policy::LaneMmaShape::kN), 
-    "Thread-level GEMM must be divisible by Policy::LaneMmaShape.");
-
-  /// Number of individual loads
-  using Iterations = MatrixShape<
-    ThreadShape::kRow,
-    ThreadShape::kColumn / Policy::LaneMmaShape::kN
-  >;
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = Array<Element, ThreadShape::kCount>;
-
-protected:
-
-  /// Internal reference
-  cutlass::TensorRef<Array<Element, Policy::LaneMmaShape::kN>, layout::RowMajor> ref_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator() { }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator(
-    TensorRef ref, 
-    int lane_id
-  ) {
-
-    // compute offset based on thread ID and lane layout
-    typename Policy::LaneLayout lane_layout = Policy::get_lane_layout();
-
-    MatrixCoord lane_offset = lane_layout.inverse(lane_id) * 
-      MatrixCoord(0, Policy::LaneMmaShape::kN);
-
-    ref.add_coord_offset(lane_offset);
-
-    ref_.reset(
-      reinterpret_cast<Array<Element, Policy::LaneMmaShape::kN> *>(ref.data()),
-      ref.stride(0) / Policy::LaneMmaShape::kN);
-  }
-  
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator &add_pointer_offset(LongIndex offset) {
-    ref_.add_pointer_offset(offset);
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator &add_tile_offset(TensorCoord const &coord) {
-
-    ref_.add_coord_offset({
-      coord.row() * Shape::kRow, 
-      coord.column() * Shape::kColumn / Policy::LaneMmaShape::kN});
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator & operator++() {
-
-    ref_.add_coord_offset({Shape::kRow, 0});
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator & operator--() {
-
-    ref_.add_coord_offset({-Shape::kRow, 0});
-
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator. (vector loads)
-  CUTLASS_HOST_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
-
-    Array<Element, Policy::LaneMmaShape::kN> *dst_ptr = 
-      reinterpret_cast<Array<Element, Policy::LaneMmaShape::kN> *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int k = 0; k < Iterations::kRow; ++k) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < Iterations::kColumn; ++n) {
-
-        #if 0
-        dst_ptr[n + k * Iterations::kColumn] = 
-          *(ref_.data() + ref_.offset({k, n * Policy::WarpShape::kColumn}) + pointer_offset / Policy::LaneMmaShape::kN);
-        #endif
-
-        void const *ptr = ref_.data() + ref_.offset({k, n * Policy::WarpShape::kColumn}) + pointer_offset / Policy::LaneMmaShape::kN;
-        arch::shared_load(dst_ptr[n + k * Iterations::kColumn], ptr);
-      }
-    }
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-    load_with_pointer_offset(frag, 0);
-  }
-  
-  /// Stores a fragment to memory at the location pointed to by the iterator
-  CUTLASS_HOST_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const {
-
-    Array<Element, Policy::LaneMmaShape::kN> const *src_ptr = 
-      reinterpret_cast<Array<Element, Policy::LaneMmaShape::kN> *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int k = 0; k < Iterations::kM; ++k) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < Iterations::kN; ++n) {
-        *(ref_.data() + ref_.offset({k, n * Policy::WarpShape::kN}) + pointer_offset / Policy::LaneMmaShape::kN) = 
-          src_ptr[n + k * Iterations::kN];
-      }
-    }
-  }
-
-  /// Stores a fragment to memory at the location pointed to by the iterator
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag, Index pointer_offset) const {
-    store_with_pointer_offset(frag, 0);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    // no operation here
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization for B operands of column-major layouts
-///
-/// Concept: MutableRandomAccessContiguousTileIteratorConcept
-///
-template <
-  /// Size of the matrix to load (concept: MatrixShape)
-  typename Shape_,
-  /// Data type of A elements
-  typename Element_,
-  /// Shape of the warp in units of thread (concept: MmaSimtPolicy)
-  typename Policy_,
-  /// Number of partitions along K dimension
-  int PartitionsK,
-  /// Group Size along kPartition - used in sliced-K
-  int PartitionGroupSize
->
-class MmaSimtTileIterator<Shape_, Operand::kB, Element_, layout::ColumnMajor, Policy_, PartitionsK, PartitionGroupSize> {
-public:
-
-  /// Shape of tile to load (concept: MatrixShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand::kB;
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of policy
-  using Layout = layout::ColumnMajor;
-
-  /// Decomposition of elements among threads
-  using Policy = Policy_;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  //
-  // Derived quantities
-  //
-
-  static_assert(!(Shape::kColumn % Policy::WarpShape::kColumn), 
-    "The warp-level GEMM N size must be divisible by the number of threads arranged along the N dimension.");
-  
-  static_assert(Shape::kRow > 0, "Shape::kRow must be greater than zero.");
-  static_assert(Shape::kColumn > 0, "Shape::kColumn must be greater than zero.");
-  static_assert(Policy::WarpShape::kColumn > 0, "Policy::WarpShape::kColumn must be greater than zero.");
-  static_assert(Shape::kColumn / Policy::WarpShape::kColumn > 0, "Shape::kColumn / Policy::WarpShape::kColumn must be greater than zero.");
-
-  /// Thread-level shape of a fragment
-  using ThreadShape = MatrixShape<
-    Shape::kRow,
-    Shape::kColumn / Policy::WarpShape::kColumn
-  >;
-
-  static_assert(!(ThreadShape::kColumn % Policy::LaneMmaShape::kN), 
-    "Thread-level GEMM must be divisible by Policy::LaneMmaShape.");
-
-  /// Number of individual loads
-  using Iterations = MatrixShape<
-    ThreadShape::kRow,
-    ThreadShape::kColumn / Policy::LaneMmaShape::kN
-  >;
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = Array<Element, ThreadShape::kCount>;
-
-private:
-
-  /// Internal reference
-  cutlass::TensorRef<Element, layout::ColumnMajor> ref_;
-
-  /// Extent of tensor
-  MatrixCoord extent_;
-
-  /// Origin
-  MatrixCoord origin_;
-
-  /// Used to conditionally enable extents checking
-  bool divisible_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator(): divisible_(true) { }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator(
-    TensorRef ref, 
-    int lane_id
-  ): extent_(Shape::kRow, Shape::kColumn), divisible_(true) {
-
-    // compute offset based on thread ID and lane layout
-    typename Policy::LaneLayout lane_layout = Policy::get_lane_layout();
-
-    MatrixCoord lane_offset = lane_layout.inverse(lane_id) * 
-      MatrixCoord(0, Policy::LaneMmaShape::kN);
-
-    origin_ = lane_offset;
-
-    ref.add_coord_offset(lane_offset);
-
-    ref_.reset(ref.data(), ref.stride(0));
-  }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator(
-    TensorRef ref,
-    TensorCoord extent, 
-    int lane_id
-  ): extent_(extent), divisible_(false) {
-
-    // compute offset based on thread ID and lane layout
-    typename Policy::LaneLayout lane_layout = Policy::get_lane_layout();
-
-    MatrixCoord lane_offset = lane_layout.inverse(lane_id) * 
-      MatrixCoord(0, Policy::LaneMmaShape::kN);
-
-    origin_ = lane_offset;
-
-    ref.add_coord_offset(lane_offset);
-
-    ref_.reset(ref.data(), ref.stride(0));
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator &add_pointer_offset(LongIndex offset) {
-    ref_.add_pointer_offset(offset);
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator &add_tile_offset(TensorCoord const &coord) {
-
-    TensorCoord coord_offset(
-      coord.row() * Shape::kRow, 
-      coord.column() * Shape::kColumn);
-
-    origin_ += coord_offset;
-
-    ref_.add_coord_offset(coord_offset);
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator & operator++() {
-
-    ref_.add_coord_offset({Shape::kRow, 0});
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator & operator--() {
-
-    ref_.add_coord_offset({-Shape::kRow, 0});
-
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator. (scalar loads)
-  CUTLASS_HOST_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int k = 0; k < Iterations::kRow; ++k) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < Iterations::kColumn; ++n) {
-        CUTLASS_PRAGMA_UNROLL
-        for (int i = 0; i < Policy::LaneMmaShape::kN; ++i) {
-
-          MatrixCoord offset(k, n * Policy::WarpShape::kColumn * Policy::LaneMmaShape::kN + i);
-            
-          MatrixCoord access_coord = origin_ + offset;
-
-          int frag_idx = n * Policy::LaneMmaShape::kN + i + k * Iterations::kColumn;
-
-          if (divisible_ || 
-              (access_coord.row() < extent_.row() && access_coord.column() < extent_.column())) {
-
-            frag[frag_idx] = *(ref_.data() + ref_.offset(offset) + pointer_offset);
-          }
-          else {
-            frag[frag_idx] = Element();
-          }
-        }
-      }
-    }
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-    load_with_pointer_offset(frag, 0);
-  }
-  
-  /// Stores a fragment to memory at the location pointed to by the iterator
-  CUTLASS_HOST_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const {
-
-    Array<Element, Policy::LaneMmaShape::kN> const *src_ptr = 
-      reinterpret_cast<Array<Element, Policy::LaneMmaShape::kN> *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int k = 0; k < Iterations::kM; ++k) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < Iterations::kN; ++n) {
-        *(ref_.data() + ref_.offset({k, n * Policy::WarpShape::kN}) + pointer_offset / Policy::LaneMmaShape::kN) = 
-          src_ptr[n + k * Iterations::kN];
-      }
-    }
-  }
-
-  /// Stores a fragment to memory at the location pointed to by the iterator
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag, Index pointer_offset) const {
-    store_with_pointer_offset(frag, 0);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    // no operation here
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization for C operands of column-major layouts
-///
-/// Concept: MutableRandomAccessContiguousTileIteratorConcept
-///
-template <
-  /// Size of the matrix to load (concept: MatrixShape)
-  typename Shape_,
-  /// Data type of A elements
-  typename Element_,
-  /// Shape of the warp in units of thread (concept: MmaSimtPolicy)
-  typename Policy_
->
-class MmaSimtTileIterator<Shape_, Operand::kC, Element_, layout::ColumnMajor, Policy_> {
-public:
-
-  /// Shape of tile to load (concept: MatrixShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand::kC;
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of accumulators in memory
-  using Layout = layout::ColumnMajor;
-
-  /// Decomposition of elements among threads
-  using Policy = Policy_;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  //
-  // Derived quantities
-  //
-
-  static_assert(
-    (!(Shape::kRow % Policy::WarpShape::kRow)) && (!(Shape::kColumn % Policy::WarpShape::kColumn)),
-    "Warp-level GEMM shape must be divisible by the arrangement of threads in the warp.");
-
-  static_assert(Shape::kRow > 0, "Shape::kRow must be greater than zero.");
-  static_assert(Shape::kColumn > 0, "Shape::kColumn must be greater than zero.");
-  static_assert(Policy::WarpShape::kRow > 0, "Policy::WarpShape::kRow must be greater than zero.");
-  static_assert(Policy::WarpShape::kColumn > 0, "Policy::WarpShape::kColumn must be greater than zero.");
-  static_assert(Shape::kRow / Policy::WarpShape::kRow > 0, "Shape::kRow / Policy::WarpShape::kRow must be greater than zero.");
-  static_assert(Shape::kColumn / Policy::WarpShape::kColumn > 0, "Shape::kColumn / Policy::WarpShape::kColumn must be greater than zero.");
-
-  /// Thraed-level shape of a fragment
-  using ThreadShape = MatrixShape<
-    Shape::kRow / Policy::WarpShape::kRow,
-    Shape::kColumn / Policy::WarpShape::kColumn
-  >;
-
-  static_assert(
-    (!(ThreadShape::kRow % Policy::LaneMmaShape::kM)) && (!(ThreadShape::kColumn % Policy::LaneMmaShape::kN)),
-    "Warp-level GEMM shape must be divisible by the arrangement of threads in the warp.");
-  
-  /// Number of individual loads
-  using Iterations = MatrixShape<
-    ThreadShape::kRow / Policy::LaneMmaShape::kM,
-    ThreadShape::kColumn / Policy::LaneMmaShape::kN
-  >;
-
-  using Delta = MatrixShape<
-    Policy::WarpShape::kRow * Policy::LaneMmaShape::kM,
-    Policy::WarpShape::kColumn * Policy::LaneMmaShape::kN
-  >;
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = Array<Element, ThreadShape::kCount>;
-
-private:
-
-  TensorRef ref_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator() { }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator(
-    TensorRef const &ref, 
-    int lane_id
-  ):
-    ref_(ref) {
-
-    // compute offset based on thread ID and lane layout
-    typename Policy::LaneLayout lane_layout = Policy::get_lane_layout();
-
-    MatrixCoord lane_offset = lane_layout.inverse(lane_id) * 
-      MatrixCoord(Policy::LaneMmaShape::kM, Policy::LaneMmaShape::kN);
-
-    ref_.add_coord_offset(lane_offset);
-  }
-  
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator &add_pointer_offset(LongIndex offset) {
-    ref_.add_pointer_offset(offset);
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator &add_tile_offset(TensorCoord const &coord) {
-
-    ref_.add_coord_offset({
-      coord.row() * Shape::kRow, 
-      coord.column() * Shape::kColumn});
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator & operator++() {
-
-    ref_.add_coord_offset({Shape::kRow, 0});
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator & operator--() {
-
-    ref_.add_coord_offset({-Shape::kRow, 0});
-
-    return *this;
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_HOST_DEVICE
-  void load_with_pointer_offset(
-    Fragment &frag,                             ///< fragment to be loaded from memory
-    Index pointer_offset) const {               ///< linear offset (in units of Element) when loading
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int mma_n = 0; mma_n < Iterations::kN; ++mma_n) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < Policy::LaneMmaShape::kN; ++n) {
-
-        Array<Element, Policy::LaneMmaShape::kM> const *src_ptr = 
-          reinterpret_cast<Array<Element, Policy::LaneMmaShape::kM> const *>(
-            ref_.data() + pointer_offset + ref_.offset({0, mma_n * Delta::kN + n}));
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int mma_m = 0; mma_m < Iterations::kM; ++mma_m) {
-
-          Array<Element, Policy::LaneMmaShape::kM> *dst_ptr = 
-            reinterpret_cast<Array<Element, Policy::LaneMmaShape::kM> *>(&frag) + 
-            mma_m + Iterations::kM * (n + mma_n * Policy::LaneMmaShape::kN);
-
-          *dst_ptr = src_ptr[mma_m * Policy::WarpShape::kM];
-        }
-      }
-    }
-  }
-    
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-    load_with_pointer_offset(frag, 0);
-  }
-
-  /// Stores a fragment to memory at the location pointed to by the iterator
-  CUTLASS_HOST_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const {
-    
-    CUTLASS_PRAGMA_UNROLL
-    for (int mma_n = 0; mma_n < Iterations::kColumn; ++mma_n) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < Policy::LaneMmaShape::kN; ++n) {
-
-        Array<Element, Policy::LaneMmaShape::kM> *dst_ptr= 
-          reinterpret_cast<Array<Element, Policy::LaneMmaShape::kM> *>(
-            ref_.data() + pointer_offset + ref_.offset({0, mma_n * Delta::kColumn + n}));
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int mma_m = 0; mma_m < Iterations::kRow; ++mma_m) {
-
-          Array<Element, Policy::LaneMmaShape::kM> const *src_ptr = 
-            reinterpret_cast<Array<Element, Policy::LaneMmaShape::kM> const *>(&frag) + 
-            mma_m + Iterations::kRow * (n + mma_n * Policy::LaneMmaShape::kN);
-
-          dst_ptr[mma_m * Policy::WarpShape::kRow] = *src_ptr;
-        }
-      }
-    }
-  }
-  /// Stores a fragment to memory at the location pointed to by the iterator
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag) const {
-    store_with_pointer_offset(frag, 0);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization for C operands of row-major layouts
-///
-/// Concept: MutableRandomAccessContiguousTileIteratorConcept
-///
-template <
-  /// Size of the matrix to load (concept: MatrixShape)
-  typename Shape_,
-  /// Data type of A elements
-  typename Element_,
-  /// Shape of the warp in units of thread (concept: MmaSimtPolicy)
-  typename Policy_
->
-class MmaSimtTileIterator<Shape_, Operand::kC, Element_, layout::RowMajor, Policy_> {
-public:
-
-  /// Shape of tile to load (concept: MatrixShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand::kC;
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of accumulators in memory
-  using Layout = layout::RowMajor;
-
-  /// Decomposition of elements among threads
-  using Policy = Policy_;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  //
-  // Derived quantities
-  //
-
-  static_assert(
-    (!(Shape::kRow % Policy::WarpShape::kRow)) && (!(Shape::kColumn % Policy::WarpShape::kColumn)),
-    "Warp-level GEMM shape must be divisible by the arrangement of threads in the warp.");
-
-  static_assert(Shape::kRow > 0, "Shape::kRow must be greater than zero.");
-  static_assert(Shape::kColumn > 0, "Shape::kColumn must be greater than zero.");
-  static_assert(Policy::WarpShape::kRow > 0, "Policy::WarpShape::kRow must be greater than zero.");
-  static_assert(Policy::WarpShape::kColumn > 0, "Policy::WarpShape::kColumn must be greater than zero.");
-  static_assert(Shape::kRow / Policy::WarpShape::kRow > 0, "Shape::kRow / Policy::WarpShape::kRow must be greater than zero.");
-  static_assert(Shape::kColumn / Policy::WarpShape::kColumn > 0, "Shape::kColumn / Policy::WarpShape::kColumn must be greater than zero.");
-
-  /// Thraed-level shape of a fragment
-  using ThreadShape = MatrixShape<
-    Shape::kRow / Policy::WarpShape::kRow,
-    Shape::kColumn / Policy::WarpShape::kColumn
-  >;
-
-  static_assert(
-    (!(ThreadShape::kRow % Policy::LaneMmaShape::kM)) && (!(ThreadShape::kColumn % Policy::LaneMmaShape::kN)),
-    "Warp-level GEMM shape must be divisible by the arrangement of threads in the warp.");
-  
-  /// Number of individual loads
-  using Iterations = MatrixShape<
-    ThreadShape::kRow / Policy::LaneMmaShape::kM,
-    ThreadShape::kColumn / Policy::LaneMmaShape::kN
-  >;
-
-  using Delta = MatrixShape<
-    Policy::WarpShape::kRow * Policy::LaneMmaShape::kM,
-    Policy::WarpShape::kColumn * Policy::LaneMmaShape::kN
-  >;
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = Array<Element, ThreadShape::kCount>;
-
-private:
-
-  TensorRef ref_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator() { }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator(
-    TensorRef const &ref, 
-    int lane_id
-  ):
-    ref_(ref) {
-
-    // compute offset based on thread ID and lane layout
-    typename Policy::LaneLayout lane_layout = Policy::get_lane_layout();
-
-    MatrixCoord lane_offset = lane_layout.inverse(lane_id) * 
-      MatrixCoord(Policy::LaneMmaShape::kM, Policy::LaneMmaShape::kN);
-    
-    ref_.add_coord_offset(lane_offset);
-  }
-  
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator &add_pointer_offset(LongIndex offset) {
-    ref_.add_pointer_offset(offset);
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator &add_tile_offset(TensorCoord const &coord) {
-
-    ref_.add_coord_offset({
-      coord.row() * Shape::kRow, 
-      coord.column() * Shape::kColumn});
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator & operator++() {
-
-    ref_.add_coord_offset({Shape::kRow, 0});
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator & operator--() {
-
-    ref_.add_coord_offset({-Shape::kRow, 0});
-
-    return *this;
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_HOST_DEVICE
-  void load_with_pointer_offset(
-    Fragment &frag,                             ///< fragment to be loaded from memory
-    Index pointer_offset) const {               ///< linear offset (in units of Element) when loading
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int mma_m = 0; mma_m < Iterations::kRow; ++mma_m) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int m = 0; m < Policy::LaneMmaShape::kM; ++m) {
-
-        Array<Element, Policy::LaneMmaShape::kN> const *src_ptr = 
-          reinterpret_cast<Array<Element, Policy::LaneMmaShape::kN> const *>(
-            ref_.data() + pointer_offset + ref_.offset({mma_m * Delta::kRow + m, 0}));
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int mma_n = 0; mma_n < Iterations::kColumn; ++mma_n) {
-
-          Array<Element, Policy::LaneMmaShape::kN> *dst_ptr = 
-            reinterpret_cast<Array<Element, Policy::LaneMmaShape::kN> *>(&frag) + 
-            mma_n + Iterations::kColumn * (m + mma_m * Policy::LaneMmaShape::kM);
-
-          *dst_ptr = src_ptr[mma_n * Policy::WarpShape::kColumn];
-        }
-      }
-    }
-  }
-    
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-    load_with_pointer_offset(frag, 0);
-  }
-
-  /// Stores a fragment to memory at the location pointed to by the iterator
-  CUTLASS_HOST_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const {
-    
-    CUTLASS_PRAGMA_UNROLL
-    for (int mma_m = 0; mma_m < Iterations::kRow; ++mma_m) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int m = 0; m < Policy::LaneMmaShape::kM; ++m) {
-
-        Array<Element, Policy::LaneMmaShape::kN> *dst_ptr = 
-          reinterpret_cast<Array<Element, Policy::LaneMmaShape::kN> *>(
-            ref_.data() + pointer_offset + ref_.offset({mma_m * Delta::kRow + m, 0}));
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int mma_n = 0; mma_n < Iterations::kColumn; ++mma_n) {
-
-          Array<Element, Policy::LaneMmaShape::kN> const *src_ptr = 
-            reinterpret_cast<Array<Element, Policy::LaneMmaShape::kN> const *>(&frag) + 
-            mma_n + Iterations::kColumn * (m + mma_m * Policy::LaneMmaShape::kM);
-
-          dst_ptr[mma_n * Policy::WarpShape::kColumn] = *src_ptr;
-        }
-      }
-    }
-  }
-  
-  /// Stores a fragment to memory at the location pointed to by the iterator
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag) const {
-    store_with_pointer_offset(frag, 0);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization for A operands of column-major-K interleaved layouts
-///
-/// Concept: MutableRandomAccessContiguousTileIteratorConcept
-///
-template <
-  /// Size of the matrix to load (concept: MatrixShape)
-  typename Shape_,
-  /// Data type of A elements
-  typename Element_,
-  /// Shape of the warp in units of thread (concept: MmaSimtPolicy)
-  typename Policy_,
-  /// Number of partitions along K dimension
-  int PartitionsK,
-  /// Number of KGroups per kPartition
-  int PartitionGroupSize
->
-class MmaSimtTileIterator<Shape_, Operand::kA, Element_, layout::ColumnMajorInterleaved<4>, Policy_, PartitionsK, PartitionGroupSize> {
-public:
-
-  /// Shape of tile to load (concept: MatrixShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand::kA;
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of policy
-  using Layout = layout::ColumnMajorInterleaved<4> ;
-
-  /// Decomposition of elements among threads
-  using Policy = Policy_;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Iterleave factor
-  static const int kInterleave = 4;
-  
-  /// Number of partitions along K dimension
-  static const int kPartitionsK = PartitionsK;
-
-  /// Number of KGroups per kPartition
-  static const int kGroupPerTile = PartitionGroupSize / Shape::kColumn;
-
-  //
-  // Derived quantities
-  //
-
-  static_assert(!(Shape::kRow % Policy::WarpShape::kRow), 
-    "The warp-level GEMM M size must be divisible by the number of threads arranged along the M dimension.");
-
-  static_assert(Shape::kRow > 0, "Shape::kRow must be greater than zero.");
-  static_assert(Shape::kColumn > 0, "Shape::kColumn must be greater than zero.");
-  static_assert(Policy::WarpShape::kRow > 0, "Policy::WarpShape::kRow must be greater than zero.");
-  static_assert(Shape::kRow / Policy::WarpShape::kRow > 0, "Shape::kRow / Policy::WarpShape::kRow must be greater than zero.");
-
-  /// Thread-level shape of a fragment
-  using ThreadShape = MatrixShape<
-    Shape::kRow / Policy::WarpShape::kRow,
-    Shape::kColumn
-  >;
-
-  static_assert(!(ThreadShape::kRow % Policy::LaneMmaShape::kM) && !(ThreadShape::kColumn % Policy::LaneMmaShape::kK), 
-    "Thread-level GEMM must be divisible by Policy::LaneMmaShape.");
-
-  /// Number of individual loads
-  using Iterations = MatrixShape<
-    ThreadShape::kRow / Policy::LaneMmaShape::kM,
-    ThreadShape::kColumn / Policy::LaneMmaShape::kK
-  >;
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = Array<Element, ThreadShape::kCount>;
-
-private:
-
-  /// Internal reference
-  cutlass::TensorRef<Array<Element, Policy::LaneMmaShape::kMK>, layout::ColumnMajorInterleaved<4>> ref_;
-
-  /// group index within tile
-  int k_group_idx_;
-
-public:
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator() { }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator(
-    TensorRef ref, 
-    int lane_id
-  ) {
-
-    // compute offset based on thread ID and lane layout
-    typename Policy::LaneLayout lane_layout = Policy::get_lane_layout();
-
-    MatrixCoord lane_offset = lane_layout.inverse(lane_id) * 
-      MatrixCoord(Policy::LaneMmaShape::kM, 0);
-
-    ref.add_coord_offset(lane_offset);
-
-    k_group_idx_ = 0;
-    ref_.reset(reinterpret_cast<Array<Element, Policy::LaneMmaShape::kMK> *>(ref.data()), ref.stride(0)/Policy::LaneMmaShape::kMK);
-  }
-  
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator &add_pointer_offset(LongIndex offset) {
-    ref_.add_pointer_offset(offset);
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator &add_tile_offset(TensorCoord const &coord) {
-
-    ref_.add_coord_offset({
-      coord.row() * Shape::kRow / Policy::LaneMmaShape::kMK, 
-      coord.column() * Shape::kColumn});
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator & operator++() {
-
-    add_tile_offset({0, 1});
-
-    if (kPartitionsK > 1) {
-      ++k_group_idx_;
-      // Jump to next stage
-      if (k_group_idx_ == kGroupPerTile) {
-        k_group_idx_ = 0;
-        add_tile_offset({0, kGroupPerTile * (kPartitionsK-1)});
-      }
-    }
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator & operator--() {
-
-    ref_.add_coord_offset({0, -Shape::kColumn});
-
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
-
-    Array<Element, Policy::LaneMmaShape::kMK > *dst_ptr = 
-      reinterpret_cast<Array<Element, Policy::LaneMmaShape::kMK> *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int k = 0; k < Iterations::kColumn; ++k) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int m = 0; m < Iterations::kRow; ++m) {
-
-        dst_ptr[m + k * Iterations::kRow] = 
-          *((ref_.data() + ref_.offset({m * Policy::WarpShape::kRow / kInterleave, 
-                  k*Policy::LaneMmaShape::kK}) + pointer_offset / Policy::LaneMmaShape::kM));
-      }
-    }
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-    load_with_pointer_offset(frag, 0);
-  }
-    
-  /// Stores a fragment to memory at the location pointed to by the iterator
-  CUTLASS_HOST_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const {
-    
-    Array<Element, Policy::LaneMmaShape::kMK> const *src_ptr = 
-      reinterpret_cast<Array<Element, Policy::LaneMmaShape::kMK > *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int k = 0; k < Iterations::kN; ++k) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int m = 0; m < Iterations::kM; ++m) {
-        *(ref_.data() + ref_.offset(m * Policy::WarpShape::kM, k) + pointer_offset / Policy::LaneMmaShape::kM) = 
-          src_ptr[m + k * Iterations::kM];
-      }
-    }
-  }
-
-  /// Stores a fragment to memory at the location pointed to by the iterator
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag) const {
-    store_with_pointer_offset(frag, 0);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    // no operation here
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization for B operands of row-major k-interleaved layouts
-///
-/// Concept: MutableRandomAccessContiguousTileIteratorConcept
-///
-template <
-  /// Size of the matrix to load (concept: MatrixShape)
-  typename Shape_,
-  /// Data type of A elements
-  typename Element_,
-  /// Shape of the warp in units of thread (concept: MmaSimtPolicy)
-  typename Policy_,
-  /// Number of partitions along K dimension
-  int PartitionsK,
-  /// Number of KGroups per kPartition
-  int PartitionGroupSize
->
-class MmaSimtTileIterator<Shape_, Operand::kB, Element_, layout::RowMajorInterleaved<4>, Policy_, PartitionsK, PartitionGroupSize> {
-public:
-
-  /// Shape of tile to load (concept: MatrixShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand::kB;
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of policy
-  using Layout = layout::RowMajorInterleaved<4>;
-
-  /// Decomposition of elements among threads
-  using Policy = Policy_;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Interleave factor
-  static const int kInterleave = 4;
-
-  /// Number of partitions along K dimension
-  static const int kPartitionsK = PartitionsK;
-
-  /// Number of KGroups per kPartition
-  static const int kGroupPerTile = PartitionGroupSize / Shape::kRow;
-
-  //
-  // Derived quantities
-  //
-
-  static_assert(!(Shape::kColumn % Policy::WarpShape::kColumn), 
-    "The warp-level GEMM N size must be divisible by the number of threads arranged along the N dimension.");
-
-  static_assert(Shape::kRow > 0, "Shape::kRow must be greater than zero.");
-  static_assert(Shape::kColumn > 0, "Shape::kColumn must be greater than zero.");
-  static_assert(Policy::WarpShape::kColumn > 0, "Policy::WarpShape::kColumn must be greater than zero.");
-  static_assert(Shape::kColumn / Policy::WarpShape::kColumn > 0, "Shape::kColumn / Policy::WarpShape::kColumn must be greater than zero.");
-
-  /// Thread-level shape of a fragment
-  using ThreadShape = MatrixShape<
-    Shape::kRow,
-    Shape::kColumn / Policy::WarpShape::kColumn
-  >;
-
-  static_assert(!(ThreadShape::kColumn % Policy::LaneMmaShape::kN) && !(ThreadShape::kRow % Policy::LaneMmaShape::kK), 
-    "Thread-level GEMM must be divisible by Policy::LaneMmaShape.");
-
-  /// Number of individual loads
-  using Iterations = MatrixShape<
-    ThreadShape::kRow / Policy::LaneMmaShape::kK,
-    ThreadShape::kColumn / Policy::LaneMmaShape::kN
-  >;
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = Array<Element, ThreadShape::kCount>;
-
-
-private:
-
-  /// Internal reference
-  cutlass::TensorRef<Array<Element, Policy::LaneMmaShape::kKN>, layout::RowMajorInterleaved<4>> ref_;
-
-  /// group index within tile
-  int k_group_idx_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator() { }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator(
-    TensorRef ref, 
-    int lane_id
-  ) {
-
-    // compute offset based on thread ID and lane layout
-    typename Policy::LaneLayout lane_layout = Policy::get_lane_layout();
-
-    MatrixCoord lane_offset = lane_layout.inverse(lane_id) * 
-      MatrixCoord(0, Policy::LaneMmaShape::kN);
-
-    ref.add_coord_offset(lane_offset);
-
-    k_group_idx_ = 0;
-
-    ref_.reset(
-      reinterpret_cast<Array<Element, Policy::LaneMmaShape::kKN> *>(ref.data()),
-      ref.stride(0) / Policy::LaneMmaShape::kKN);
-  }
-  
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator &add_pointer_offset(LongIndex offset) {
-    ref_.add_pointer_offset(offset);
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator &add_tile_offset(TensorCoord const &coord) {
-
-    ref_.add_coord_offset({
-      coord.row() * Shape::kRow, 
-      coord.column() * Shape::kColumn / Policy::LaneMmaShape::kKN});
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator & operator++() {
-
-    add_tile_offset({1, 0});
-
-    if (kPartitionsK > 1) {
-      ++k_group_idx_;
-      // Jump to next stage
-      if (k_group_idx_ == kGroupPerTile) {
-        k_group_idx_ = 0;
-        add_tile_offset({kGroupPerTile * (kPartitionsK-1), 0});
-      }
-    }
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaSimtTileIterator & operator--() {
-
-    ref_.add_coord_offset({-Shape::kRow, 0});
-
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
-
-    Array<Element, Policy::LaneMmaShape::kKN> *dst_ptr = 
-      reinterpret_cast<Array<Element, Policy::LaneMmaShape::kKN> *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int k = 0; k < Iterations::kRow; ++k) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < Iterations::kColumn; ++n) {
-        dst_ptr[n + k * Iterations::kColumn] = 
-          *(ref_.data() + ref_.offset({k * Policy::LaneMmaShape::kK, 
-                n * Policy::WarpShape::kColumn / kInterleave}) + pointer_offset / Policy::LaneMmaShape::kN);
-      }
-    }
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-    load_with_pointer_offset(frag, 0);
-  }
-  
-  /// Stores a fragment to memory at the location pointed to by the iterator
-  CUTLASS_HOST_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const {
-
-    Array<Element, Policy::LaneMmaShape::kN> const *src_ptr = 
-      reinterpret_cast<Array<Element, Policy::LaneMmaShape::kN> *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int k = 0; k < Iterations::kM; ++k) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < Iterations::kN; ++n) {
-        *(ref_.data() + ref_.offset({k, n * Policy::WarpShape::kN}) + pointer_offset / Policy::LaneMmaShape::kN) = 
-          src_ptr[n + k * Iterations::kN];
-      }
-    }
-  }
-
-  /// Stores a fragment to memory at the location pointed to by the iterator
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag, Index pointer_offset) const {
-    store_with_pointer_offset(frag, 0);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    // no operation here
-  }
-};
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace gemm
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_sparse_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_sparse_tensor_op.h
deleted file mode 100755
index 1ce1051c4..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_sparse_tensor_op.h
+++ /dev/null
@@ -1,382 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing warp-level matrix multiply-accumulate
-   operations targeting sparse Tensor Cores.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/platform/platform.h"
-
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-
-#include "cutlass/arch/memory_sm75.h"
-#include "cutlass/arch/mma_sm75.h" 
-#include "cutlass/arch/mma_sm80.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/warp/mma.h"
-
-#include "cutlass/gemm/warp/mma_tensor_op_policy.h"
-#include "cutlass/gemm/warp/mma_tensor_op.h"
-
-#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h"
-#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h"
-#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sparse.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace warp {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions.
-template <
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  typename Shape_,
-  /// Data type of A elements
-  typename ElementA_,
-  /// Layout of A matrix (concept: MatrixLayout)
-  typename LayoutA_,
-  /// Data type of B elements
-  typename ElementB_,
-  /// Layout of B matrix (concept: MatrixLayout)
-  typename LayoutB_,
-  /// Element type of C matrix
-  typename ElementC_,
-  /// Layout of C matrix (concept: MatrixLayout)
-  typename LayoutC_,
-  /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
-  typename Policy_,
-  /// Number of partitions along K dimension
-  int PartitionsK_ = 1,
-  /// Store the accumulators in row major or column major.  Row major is used
-  /// when output layout is interleaved.
-  bool AccumulatorsInRowMajor = false,
-  /// Used for partial specialization
-  typename Enable = bool
->
-class SparseMmaTensorOp {
-public:
-  /// Shape of warp-level matrix operation (concept: GemmShape)
-  using Shape = Shape_;
-
-  /// Data type of multiplicand A
-  using ElementA = ElementA_;
-
-  /// Layout of multiplicand A
-  using LayoutA = LayoutA_;
-
-  /// Data type of multiplicand B
-  using ElementB = ElementB_;
-
-  /// Layout of multiplicand B
-  using LayoutB = LayoutB_;
-
-  /// Data type of accumulator matrix C
-  using ElementC = ElementC_;
-
-  /// Layout of accumulator matrix C
-  using LayoutC = LayoutC_;
-
-  /// Shape of the warp in units of thread (concept: MmaLanePolicySimt)
-  using Policy = Policy_;
-
-  /// Equivalant base dense mma
-  using Base = MmaTensorOp<Shape, ElementA, LayoutA, ElementB, LayoutB,
-                           ElementC, LayoutC, Policy, PartitionsK_,
-                           AccumulatorsInRowMajor, Enable>;
-
-  /// Underlying matrix multiply operator (concept: arch::Mma)
-  using ArchMmaOperator = typename Base::ArchMmaOperator;
-
-  /// Indicates math operator 
-  using MathOperator = typename ArchMmaOperator::Operator;
-  
-  /// Architecture tag from underlying instruction
-  using ArchTag = typename Base::ArchTag;
-
-  /// Indicates class of matrix operator
-  using OperatorClass = typename Base::OperatorClass;
-
-  /// Shape of underlying instruction
-  using InstructionShape = typename Base::InstructionShape;
-
-  /// Complex transform on A operand
-  static ComplexTransform const kTransformA = Base::kTransformA;
-
-  /// Complex transform on B operand
-  static ComplexTransform const kTransformB = Base::kTransformB;
-
-  /// Number of threads participating in warp-level matrix product
-  static int const kThreadCount = 32;
-
-  /// Number of partitions along K dimension
-  static int const kPartitionsK = PartitionsK_;
-
-  /// Sparsity in Operand A
-  static int const kSparse = Policy::Operator::kSparse;
-
-  /// Meta data size in bits 
-  static int const kMetaSizeInBits = Policy::Operator::kMetaSizeInBits;
-
-  /// Max ID2
-  static int const kMaxID2 = Policy::Operator::kMaxID2;
-
-    static int const kVerticalVisit = false;
-  /// Data type of meta E that is moved at the same time
-  using ElementE =
-      typename cutlass::platform::conditional<kMaxID2 == 1, uint32_t,
-                                              uint16_t>::type;
-
-  /// Number of ElementA that is associated with one ElementE
-  static int const kElementsPerElementE =
-      128 / cutlass::sizeof_bits<ElementA>::value;
-
-  /// Meta data is essentially interleaved but mapped to ColumnMajor internally
-  static int const kInterleaved = 2;
-
-  /// Layout of meta E 
-  using LayoutE = cutlass::layout::ColumnMajor;
-
- public:
-
-  /// Iterates over the A operand in memory
- using IteratorA = MmaTensorOpMultiplicandTileIterator<
-     MatrixShape<Shape::kM, Shape::kK / kSparse>, Operand::kA, ElementA,
-     LayoutA,
-     MatrixShape<Policy::Operator::Shape::kM,
-                 Policy::Operator::Shape::kK / kSparse>,
-     Policy::OpDelta::kRow, kThreadCount, kPartitionsK>;
-
- /// Storage for A tile
- using FragmentA = typename IteratorA::Fragment;
-
- /// Storage for transformed A tile
- using TransformedFragmentA =
-     Array<typename Policy::Operator::ElementA, FragmentA::kElements>;
-
- /// Iterates over the B operand in memory
- using IteratorB = typename Base::IteratorB;
-
- /// Storage for B tile
- using FragmentB = typename Base::FragmentB;
-
- /// Storage for transformed B tile
- using TransformedFragmentB = typename Base::TransformedFragmentB;
-
- /// Iterates over the C operand in memory
- using IteratorC = typename Base::IteratorC;
-
- /// Storage for C tile
- using FragmentC = typename Base::FragmentC;
-
- /// Iterates over the E operand in memory
- using IteratorE = SparseMmaTensorOpMetaTileIterator<
-     MatrixShape<Shape::kM * kInterleaved,
-                 Shape::kK / kSparse / kElementsPerElementE / kInterleaved>,
-     ElementE, LayoutE,
-     MatrixShape<Policy::Operator::Shape::kM,
-                 Policy::Operator::Shape::kK / kSparse / kElementsPerElementE /
-                     kInterleaved>,
-     Policy::OpDelta::kRow, kThreadCount, kPartitionsK>;
-
- /// Storage for E tile
- using FragmentE = typename IteratorE::Fragment;
-
- /// Number of mma operations performed
- using MmaIterations = typename Base::MmaIterations;
-
-public:
-
-  /// Underlying matrix multiply operator (concept: arch::Mma)
-  ArchMmaOperator mma;
-
-public:
-
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_DEVICE
-  SparseMmaTensorOp() {}
-
-  /// Performs a warp-level matrix multiply-accumulate operation
-  CUTLASS_DEVICE
-  void operator()(
-    FragmentC &D, 
-    TransformedFragmentA const &A, 
-    TransformedFragmentB const &B, 
-    FragmentC const &C,
-    FragmentE const &E
-  ) const {
-
-    using MmaOperandA = typename Policy::Operator::FragmentA;
-    using MmaOperandB = typename Policy::Operator::FragmentB;
-    using MmaOperandC = typename Policy::Operator::FragmentC;
-    using MmaOperandE = typename Policy::Operator::FragmentE;
-
-    D = C;
-
-    MmaOperandA const *ptr_A = reinterpret_cast<MmaOperandA const *>(&A);
-    MmaOperandB const *ptr_B = reinterpret_cast<MmaOperandB const *>(&B);
-    MmaOperandC *ptr_D = reinterpret_cast<MmaOperandC *>(&D);
-    MmaOperandE const *ptr_E = reinterpret_cast<MmaOperandE const *>(&E);
-
-    if (kVerticalVisit) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < MmaIterations::kColumn; ++n) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int m = 0; m < MmaIterations::kRow; ++m) {
-
-          int m_serpentine = ((n % 2) ? (MmaIterations::kRow - 1 - m) : m);
-          int id2 = m_serpentine % kMaxID2;
-
-          if (AccumulatorsInRowMajor) {  // matrix B is reordered
-            mma(
-              ptr_D[n + m_serpentine * MmaIterations::kColumn],
-              ptr_A[m_serpentine],
-              ptr_B[n],
-              ptr_D[n + m_serpentine * MmaIterations::kColumn],
-              ptr_E[(m_serpentine / kMaxID2)],
-              id2);
-          } else {
-            mma(
-              ptr_D[m_serpentine + n * MmaIterations::kRow],
-              ptr_A[m_serpentine],
-              ptr_B[n],
-              ptr_D[m_serpentine + n * MmaIterations::kRow],
-              ptr_E[(m_serpentine / kMaxID2)],
-              id2);
-          }
-        }
-      }
-    } else {
-      CUTLASS_PRAGMA_UNROLL
-      for (int m = 0; m < MmaIterations::kRow; ++m) {
-
-        int id2 = m % kMaxID2;
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int n = 0; n < MmaIterations::kColumn; ++n) {
-
-          int n_serpentine = ((m % 2) ? (MmaIterations::kColumn - 1 - n) : n);
-
-          if (AccumulatorsInRowMajor) {  // matrix B is reordered
-            mma(
-              ptr_D[n_serpentine + m * MmaIterations::kColumn],
-              ptr_A[m],
-              ptr_B[n_serpentine],
-              ptr_D[n_serpentine + m * MmaIterations::kColumn],
-              ptr_E[(m / kMaxID2)],
-              id2);
-          } else {
-            mma(ptr_D[m + n_serpentine * MmaIterations::kRow],
-                ptr_A[m],
-                ptr_B[n_serpentine],
-                ptr_D[m + n_serpentine * MmaIterations::kRow],
-                ptr_E[(m / kMaxID2)],
-                id2);
-          }
-        }
-      }
-    }
-  }
-
-  /// Transform the mma operands to the required types
-  CUTLASS_DEVICE
-  void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B,
-                 FragmentA const &A, FragmentB const &B) const {
-
-    //
-    // Define conversions from source type to instruction type
-    //
-    FloatRoundStyle const kRoundA =
-        PreferredRoundingMode<typename ArchMmaOperator::ElementA,
-                              ElementA>::kRound;
-    FloatRoundStyle const kRoundB =
-        PreferredRoundingMode<typename ArchMmaOperator::ElementB,
-                              ElementB>::kRound;
-
-    if (kVerticalVisit) {
-      detail::ConvertAndPack<typename ArchMmaOperator::ElementA, ElementA,
-                            FragmentA::kElements, kRoundA>
-          convert_A;
-      NumericArrayConverter<typename ArchMmaOperator::ElementB, ElementB,
-                            FragmentB::kElements / 2, kRoundB>
-          convert_B;
-      Array<ElementB, FragmentB::kElements / 2> const *ptr_B =
-          reinterpret_cast<Array<ElementB, FragmentB::kElements / 2> const *>(&B);
-      Array<typename ArchMmaOperator::ElementB, FragmentB::kElements / 2> *
-          ptr_dst_B = reinterpret_cast<Array<typename ArchMmaOperator::ElementB,
-                                             FragmentB::kElements / 2> *>(&dst_B);
-  
-      dst_A = convert_A(A);
-  
-      ptr_dst_B[0] = convert_B(ptr_B[0]);
-      ptr_dst_B[1] = convert_B(ptr_B[1]);
-    } else {
-      detail::ConvertAndPack<typename ArchMmaOperator::ElementA, ElementA,
-                             FragmentA::kElements / 2, kRoundA>
-          convert_A;
-      NumericArrayConverter<typename ArchMmaOperator::ElementB, ElementB,
-                            FragmentB::kElements, kRoundB>
-          convert_B;
-      Array<ElementA, FragmentA::kElements / 2> const *ptr_A =
-          reinterpret_cast<Array<ElementA, FragmentA::kElements / 2> const *>(&A);
-      Array<typename ArchMmaOperator::ElementA, FragmentA::kElements / 2> *
-          ptr_dst_A = reinterpret_cast<Array<typename ArchMmaOperator::ElementA,
-                                             FragmentA::kElements / 2> *>(&dst_A);
-  
-      dst_B = convert_B(B);
-  
-      ptr_dst_A[0] = convert_A(ptr_A[0]);
-      ptr_dst_A[1] = convert_A(ptr_A[1]);
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op.h
deleted file mode 100755
index d4aaf5be1..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op.h
+++ /dev/null
@@ -1,415 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing warp-level matrix multiply-accumulate operations targeting
-      Tensor Cores.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/platform/platform.h"
-
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-
-#include "cutlass/arch/memory_sm75.h"
-#include "cutlass/arch/mma_sm75.h" 
-#include "cutlass/arch/mma_sm80.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/warp/mma.h"
-
-#include "cutlass/gemm/warp/mma_tensor_op_policy.h"
-
-#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h"
-#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace warp {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace detail {
-
-template <typename T, typename S, int N, FloatRoundStyle Round>
-struct ConvertAndPack {
-
-  using Converter = NumericArrayConverter<T, S, N, Round>;
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<S, N> const &source) {
-    Converter converter;
-
-    return converter(source);
-  }
-};
-
-template <typename T, int N, FloatRoundStyle Round>
-struct ConvertAndPack<T, T, N, Round> {
-
-  CUTLASS_HOST_DEVICE
-  Array<T, N> operator()(Array<T, N> const &source) {
-		return source;
-  }
-};
-
-template <int N, FloatRoundStyle Round>
-struct ConvertAndPack<bfloat16_t, float, N, Round> {
-
-  using Converter = NumericArrayConverter<bfloat16_t, float, N, Round>;
-
-  CUTLASS_HOST_DEVICE
-  Array<bfloat16_t, N> operator()(Array<float, N> const &source) {
-    Converter converter;
-
-    Array<float, N> tmp;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      int idx = (((i << 1) & 2) | ((i >> 1) & 1) | (i & 0xfffffffc));
-      tmp[i] = source[idx];
-    }
-
-    return converter(tmp);
-  }
-};
-
-template <int N, FloatRoundStyle Round>
-struct ConvertAndPack<half_t, float, N, Round> {
-
-  using Converter = NumericArrayConverter<half_t, float, N, Round>;
-
-  CUTLASS_HOST_DEVICE
-  Array<half_t, N> operator()(Array<float, N> const &source) {
-    Converter converter;
-
-    Array<float, N> tmp;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      int idx = (((i << 1) & 2) | ((i >> 1) & 1) | (i & 0xfffffffc));
-      tmp[i] = source[idx];
-    }
-
-    return converter(tmp);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace detail
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Structure to compute the matrix product targeting Tensor Cores.
-template <
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  typename Shape_,
-  /// Data type of A elements
-  typename ElementA_,
-  /// Layout of A matrix (concept: MatrixLayout)
-  typename LayoutA_,
-  /// Data type of B elements
-  typename ElementB_,
-  /// Layout of B matrix (concept: MatrixLayout)
-  typename LayoutB_,
-  /// Element type of C matrix
-  typename ElementC_,
-  /// Layout of C matrix (concept: MatrixLayout)
-  typename LayoutC_,
-  /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
-  typename Policy_,
-  /// Number of partitions along K dimension
-  int PartitionsK_ = 1,
-  /// Store the accumulators in row major or column major.  Row major is used
-  /// when output layout is interleaved.
-  bool AccumulatorsInRowMajor = false,
-  /// Used for partial specialization
-  typename Enable = bool
->
-class MmaTensorOp {
-public:
-  /// Shape of warp-level matrix operation (concept: GemmShape)
-  using Shape = Shape_;
-
-  /// Data type of multiplicand A
-  using ElementA = ElementA_;
-
-  /// Layout of multiplicand A
-  using LayoutA = LayoutA_;
-
-  /// Data type of multiplicand B
-  using ElementB = ElementB_;
-
-  /// Layout of multiplicand B
-  using LayoutB = LayoutB_;
-
-  /// Data type of accumulator matrix C
-  using ElementC = ElementC_;
-
-  /// Layout of accumulator matrix C
-  using LayoutC = LayoutC_;
-
-  /// Shape of the warp in units of thread (concept: MmaLanePolicySimt)
-  using Policy = Policy_;
-
-  /// Underlying matrix multiply operator (concept: arch::Mma)
-  using ArchMmaOperator = typename Policy::Operator;
-
-  /// Indicates math operator 
-  using MathOperator = typename ArchMmaOperator::Operator;
-
-  /// Architecture tag from underlying instruction
-  using ArchTag = typename ArchMmaOperator::ArchTag;
-
-  /// Indicates class of matrix operator
-  using OperatorClass = arch::OpClassTensorOp;
-
-  /// Shape of underlying instruction
-  using InstructionShape = typename ArchMmaOperator::Shape;
-
-  /// Complex transform on A operand
-  static ComplexTransform const kTransformA = ComplexTransform::kNone;
-
-  /// Complex transform on B operand
-  static ComplexTransform const kTransformB = ComplexTransform::kNone;
-
-  /// Number of threads participating in warp-level matrix product
-  static int const kThreadCount = 32;
-
-  /// Number of partitions along K dimension
-  static int const kPartitionsK = PartitionsK_;
-
-  #if defined(__CUDA_ARCH__) && ((__CUDA_ARCH__ < 800) || (__CUDA_ARCH__ == 890)) 
-    static int const kVerticalVisit = true;
-  #else
-    static int const kVerticalVisit = false;
-  #endif
-
-public:
-
-  /// Iterates over the A operand in memory
-  using IteratorA = MmaTensorOpMultiplicandTileIterator<
-     MatrixShape<Shape::kM, Shape::kK>, Operand::kA, ElementA, LayoutA,
-     MatrixShape<ArchMmaOperator::Shape::kM, ArchMmaOperator::Shape::kK>,
-     Policy::OpDelta::kRow, kThreadCount, kPartitionsK>;
-
-  /// Storage for A tile
-  using FragmentA = typename IteratorA::Fragment;
-
-  /// Storage for transformed A tile
-  using TransformedFragmentA =
-      Array<typename ArchMmaOperator::ElementA, FragmentA::kElements>;
-
-  /// Iterates over the B operand in memory
-  using IteratorB = MmaTensorOpMultiplicandTileIterator<
-      MatrixShape<Shape::kK, Shape::kN>, Operand::kB, ElementB, LayoutB,
-      MatrixShape<ArchMmaOperator::Shape::kK, ArchMmaOperator::Shape::kN>,
-      Policy::OpDelta::kRow, kThreadCount, kPartitionsK>;
-
-  /// Storage for B tile
-  using FragmentB = typename IteratorB::Fragment;
-
-  /// Storage for transformed B tile
-  using TransformedFragmentB =
-      Array<typename ArchMmaOperator::ElementB, FragmentB::kElements>;
-
-  /// Iterates over the C operand in memory
-  using IteratorC = MmaTensorOpAccumulatorTileIterator<
-     MatrixShape<Shape::kM, Shape::kN>, ElementC, LayoutC,
-     typename ArchMmaOperator::Shape, typename Policy::OpDelta>;
-
-  /// Storage for C tile
-  using FragmentC = typename IteratorC::Fragment;
-
-  /// Number of mma operations performed
-  using MmaIterations = MatrixShape<
-    (Shape::kM + ArchMmaOperator::Shape::kM - 1) / ArchMmaOperator::Shape::kM,
-    (Shape::kN + ArchMmaOperator::Shape::kN - 1) / ArchMmaOperator::Shape::kN
-  >;
-
-public:
-
-  /// Underlying matrix multiply operator (concept: arch::Mma)
-  ArchMmaOperator mma;
-
-public:
-
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_DEVICE
-  MmaTensorOp() {}
-
-  /// Performs a warp-level matrix multiply-accumulate operation
-  CUTLASS_DEVICE
-  void operator()(
-    FragmentC &D, 
-    TransformedFragmentA const &A, 
-    TransformedFragmentB const &B, 
-    FragmentC const &C
-  ) const {
-
-    using MmaOperandA = typename ArchMmaOperator::FragmentA;
-    using MmaOperandB = typename ArchMmaOperator::FragmentB;
-    using MmaOperandC = typename ArchMmaOperator::FragmentC;
-
-    D = C;
-
-    MmaOperandA const *ptr_A = reinterpret_cast<MmaOperandA const *>(&A);
-    MmaOperandB const *ptr_B = reinterpret_cast<MmaOperandB const *>(&B);
-    MmaOperandC *ptr_D = reinterpret_cast<MmaOperandC *>(&D);
-
-      
-    if (kVerticalVisit) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < MmaIterations::kColumn; ++n) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int m = 0; m < MmaIterations::kRow; ++m) {
-
-          int m_serpentine = ((n % 2) ? (MmaIterations::kRow - 1 - m) : m);
-
-          if (AccumulatorsInRowMajor) {  // matrix B is reordered
-            mma(
-              ptr_D[n + m_serpentine * MmaIterations::kColumn],
-              ptr_A[m_serpentine],
-              ptr_B[n],
-              ptr_D[n + m_serpentine * MmaIterations::kColumn]);
-          } else {
-            mma(
-              ptr_D[m_serpentine + n * MmaIterations::kRow],
-              ptr_A[m_serpentine],
-              ptr_B[n],
-              ptr_D[m_serpentine + n * MmaIterations::kRow]);
-          }
-        }
-      }
-    } else {
-      CUTLASS_PRAGMA_UNROLL
-      for (int m = 0; m < MmaIterations::kRow; ++m) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int n = 0; n < MmaIterations::kColumn; ++n) {
-
-          int n_serpentine = ((m % 2) ? (MmaIterations::kColumn - 1 - n) : n);
-
-          if (AccumulatorsInRowMajor) {  // matrix B is reordered
-            mma(
-              ptr_D[n_serpentine + m * MmaIterations::kColumn],
-              ptr_A[m],
-              ptr_B[n_serpentine],
-              ptr_D[n_serpentine + m * MmaIterations::kColumn]);
-          } else {
-            mma(ptr_D[m + n_serpentine * MmaIterations::kRow],
-                ptr_A[m],
-                ptr_B[n_serpentine],
-                ptr_D[m + n_serpentine * MmaIterations::kRow]);
-          }
-        }
-      }
-    }
-  }
-
-  /// Transform the mma operands to the required types
-  CUTLASS_DEVICE
-  void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B,
-                 FragmentA const &A, FragmentB const &B) const {
-
-    //
-    // Define conversions from source type to instruction type
-    //
-    FloatRoundStyle const kRoundA =
-        PreferredRoundingMode<typename ArchMmaOperator::ElementA,
-                              ElementA>::kRound;
-    FloatRoundStyle const kRoundB =
-        PreferredRoundingMode<typename ArchMmaOperator::ElementB,
-                              ElementB>::kRound;
-    if (kVerticalVisit) {    
-      detail::ConvertAndPack<typename ArchMmaOperator::ElementA, ElementA,
-                            FragmentA::kElements, kRoundA>
-          convert_A;
-      NumericArrayConverter<typename ArchMmaOperator::ElementB, ElementB,
-                            FragmentB::kElements / 2, kRoundB>
-          convert_B;
-      Array<ElementB, FragmentB::kElements / 2> const *ptr_B =
-          reinterpret_cast<Array<ElementB, FragmentB::kElements / 2> const *>(&B);
-      Array<typename ArchMmaOperator::ElementB, FragmentB::kElements / 2> *
-          ptr_dst_B = reinterpret_cast<Array<typename ArchMmaOperator::ElementB,
-                                             FragmentB::kElements / 2> *>(&dst_B);
-  
-      dst_A = convert_A(A);
-  
-      ptr_dst_B[0] = convert_B(ptr_B[0]);
-      ptr_dst_B[1] = convert_B(ptr_B[1]);
-    } else {
-      detail::ConvertAndPack<typename ArchMmaOperator::ElementA, ElementA,
-                            FragmentA::kElements / 2, kRoundA>
-          convert_A;
-      NumericArrayConverter<typename ArchMmaOperator::ElementB, ElementB,
-                            FragmentB::kElements, kRoundB>
-          convert_B;
-      Array<ElementA, FragmentA::kElements / 2> const *ptr_A =
-          reinterpret_cast<Array<ElementA, FragmentA::kElements / 2> const *>(&A);
-      Array<typename ArchMmaOperator::ElementA, FragmentA::kElements / 2> *
-          ptr_dst_A = reinterpret_cast<Array<typename ArchMmaOperator::ElementA,
-                                             FragmentA::kElements / 2> *>(&dst_A);
-  
-      dst_B = convert_B(B);
-  
-      ptr_dst_A[0] = convert_A(ptr_A[0]);
-      ptr_dst_A[1] = convert_A(ptr_A[1]);
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-#include "cutlass/gemm/warp/mma_tensor_op_fast_f32.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_fast_f32.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_fast_f32.h
deleted file mode 100755
index 148e71226..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_fast_f32.h
+++ /dev/null
@@ -1,471 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Templates implementing warp-level matrix multiply-accumulate operations targeting
-      Tensor Cores.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/platform/platform.h"
-
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-
-#include "cutlass/arch/mma_sm80.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/warp/mma.h"
-
-#include "cutlass/gemm/warp/mma_tensor_op_policy.h"
-#include "cutlass/gemm/warp/mma_tensor_op.h"
-
-#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h"
-#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace warp {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-enum class TensorFloat32Op {
-  k3xTF32, 
-  k4xTF32 
-}; 
-
-template <
-  /// Floating-point rounding style
-  FloatRoundStyle RoundBigA_,
-  /// Floating-point rounding style
-  FloatRoundStyle RoundSmallA_,
-  /// Floating-point rounding style
-  FloatRoundStyle RoundBigB_ = RoundBigA_,
-  /// Floating-point rounding style
-  FloatRoundStyle RoundSmallB_ = RoundSmallA_,
-  /// Precision for TensorFloat32Op 
-  // (k3xTF32: BigxBig, BigxSmall, SmallxBig)
-  // (k4xTF32: BigxBig, BigxSmall, SmallxBig, SmallxSmall)
-  TensorFloat32Op Precision_ = TensorFloat32Op::k3xTF32
-  >
-struct FastF32 {
-
-  static FloatRoundStyle const kRoundBigA = RoundBigA_;
-  static FloatRoundStyle const kRoundSmallA = RoundSmallA_;
-  static FloatRoundStyle const kRoundBigB = RoundBigB_;
-  static FloatRoundStyle const kRoundSmallB = RoundSmallB_;
-  static TensorFloat32Op const kPrecision = Precision_;
-};
-
-
-namespace detail {
-
-  template<
-    int N,
-    FloatRoundStyle RoundBig = FloatRoundStyle::round_toward_zero,
-    FloatRoundStyle RoundSmall = FloatRoundStyle::round_half_ulp_truncate
-  >
-  struct ConvertAndPackAccurateF32 {
-  
-    /// Rounding styles for big and small part
-    static FloatRoundStyle const kRoundBig = RoundBig;
-    static FloatRoundStyle const kRoundSmall = RoundSmall;
-
-    /// Converter type
-    using Converter = NumericConverterFastF32<kRoundBig, kRoundSmall>;
-
-    /// Source fragement
-    using SourceFragment = Array<float, N>;
-
-    /// Destination fragment
-    using DestinationFragment = Array<tfloat32_t, N>;
-
-    /// Converter Fragment holding two tfloat32_t elements for every float
-    using ConverterFragment = Array<tfloat32_t, 2>;
-
-    /// Index in fargments for the big and small part
-    static int const kBigIndex = 0;
-    static int const kSmallIndex = 1;
-
-    CUTLASS_HOST_DEVICE
-    void operator()(SourceFragment const &source,
-                    DestinationFragment &dst_big,
-                    DestinationFragment &dst_small) {
-      
-      Converter convert_;
-      ConverterFragment result_;
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < N; ++i) {
-        // convert source to result fragment
-        result_ = convert_(source[i]);
-
-        // store converted result fragments to destination fragment
-        dst_big[i] = result_[kBigIndex];
-        dst_small[i] = result_[kSmallIndex];
-      }
-    }
-  };
-} // namespace detail
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions.
-template <
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  typename Shape_,
-  /// Data type of A elements
-  typename ElementA_,
-  /// Layout of A matrix (concept: MatrixLayout)
-  typename LayoutA_,
-  /// Data type of B elements
-  typename ElementB_,
-  /// Layout of B matrix (concept: MatrixLayout)
-  typename LayoutB_,
-  /// Element type of C matrix
-  typename ElementC_,
-  /// Layout of C matrix (concept: MatrixLayout)
-  typename LayoutC_,
-  /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
-  typename Policy_,
-  /// Number of partitions along K dimension
-  int PartitionsK_ = 1,
-  /// Store the accumulators in row major or column major.  Row major is used
-  /// when output layout is interleaved.
-  bool AccumulatorsInRowMajor = false,
-  /// Used for partial specialization
-  typename Enable = bool
->
-class MmaTensorOpFastF32;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for float*float+float => float using TF32 TensorOps
-template <
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  typename Shape_,
-  /// Layout of A matrix (concept: MatrixLayout)
-  typename LayoutA_,
-  /// Layout of B matrix (concept: MatrixLayout)
-  typename LayoutB_,
-  /// Layout of C matrix (concept: MatrixLayout)
-  typename LayoutC_,
-  /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
-  typename Policy_,
-  /// Number of partitions along K dimension
-  int PartitionsK_,
-  /// Store the accumulators in row major or column major.  Row major is used
-  /// when output layout is interleaved.
-  bool AccumulatorsInRowMajor,
-  /// Used for partial specialization
-  typename Enable
->
-class MmaTensorOpFastF32<
-  Shape_,
-  float, LayoutA_,
-  float, LayoutB_,
-  float, LayoutC_,
-  Policy_, PartitionsK_,
-  AccumulatorsInRowMajor, Enable> {
-public:
-  /// Shape of warp-level matrix operation (concept: GemmShape)
-  using Shape = Shape_;
-
-  /// Data type of multiplicand A
-  using ElementA = float;
-
-  /// Layout of multiplicand A
-  using LayoutA = LayoutA_;
-
-  /// Data type of multiplicand B
-  using ElementB = float;
-
-  /// Layout of multiplicand B
-  using LayoutB = LayoutB_;
-
-  /// Data type of accumulator matrix C
-  using ElementC = float;
-
-  /// Layout of accumulator matrix C
-  using LayoutC = LayoutC_;
-
-  /// Shape of the warp in units of thread (concept: MmaLanePolicySimt)
-  using Policy = Policy_;
-
-  /// Underlying matrix multiply operator (concept: arch::Mma)
-  using ArchMmaOperator = typename Policy::Operator;
-
-  /// Indicates math operator 
-  using MathOperator = arch::OpMultiplyAddFastF32;
-
-  /// Architecture tag from underlying instruction
-  using ArchTag = typename ArchMmaOperator::ArchTag;
-
-  /// Indicates class of matrix operator
-  using OperatorClass = arch::OpClassTensorOp;
-
-  /// Shape of underlying instruction
-  using InstructionShape = typename ArchMmaOperator::Shape;
-
-  /// Complex transform on A operand
-  static ComplexTransform const kTransformA = ComplexTransform::kNone;
-
-  /// Complex transform on B operand
-  static ComplexTransform const kTransformB = ComplexTransform::kNone;
-
-  /// Number of threads participating in warp-level matrix product
-  static int const kThreadCount = 32;
-
-  /// Number of partitions along K dimension
-  static int const kPartitionsK = PartitionsK_;
-
-  /// Tune F32 to TF32 big small conversion for float operation
-  /// Different combination of big small conversin can cause different tradeoff
-  /// between speed and accuracy.  Generally, use round_half_ulp_truncate can
-  /// improve the performance but hur the accuracy.
-  using MmaFastF32 = FastF32 <
-    FloatRoundStyle::round_toward_zero,        // kRoundBigA
-    FloatRoundStyle::round_half_ulp_truncate,  // kRoundSmallA
-    FloatRoundStyle::round_toward_zero,        // kRoundBigB
-    FloatRoundStyle::round_half_ulp_truncate,  // kRoundSmallB
-    TensorFloat32Op::k3xTF32                   // Number of TF32 operations 
-  >;
-
-public:
-
-  /// Iterates over the A operand in memory
-  using IteratorA = MmaTensorOpMultiplicandTileIterator<
-      MatrixShape<Shape::kM, Shape::kK>, 
-      Operand::kA, 
-      ElementA, 
-      LayoutA,
-      MatrixShape<ArchMmaOperator::Shape::kM, ArchMmaOperator::Shape::kK>,
-      Policy::OpDelta::kRow, 
-      kThreadCount, 
-      kPartitionsK
-  >;
-
-  /// Storage for A tile
-  using FragmentA = typename IteratorA::Fragment;
-
-  /// Storage for transformed A tile
-  using TransformedFragmentA =
-      Array<typename ArchMmaOperator::ElementA, FragmentA::kElements * 2>;
-
-  /// Fragment bisecting big and small sections
-  using AccessTypeFragmentA = 
-      Array<typename ArchMmaOperator::ElementA, FragmentA::kElements>;
-
-  /// Iterates over the B operand in memory
-  using IteratorB = MmaTensorOpMultiplicandTileIterator<
-      MatrixShape<Shape::kK, Shape::kN>, 
-      Operand::kB, 
-      ElementB, 
-      LayoutB,
-      MatrixShape<ArchMmaOperator::Shape::kK, ArchMmaOperator::Shape::kN>,
-      Policy::OpDelta::kRow, 
-      kThreadCount, 
-      kPartitionsK
-  >;
-
-  /// Storage for B tile
-  using FragmentB = typename IteratorB::Fragment;
-
-  /// Storage for transformed B tile
-  using TransformedFragmentB =
-      Array<typename ArchMmaOperator::ElementB, FragmentB::kElements * 2>;
-
-  /// Fragment bisecting big and small sections
-  using AccessTypeFragmentB = 
-      Array<typename ArchMmaOperator::ElementB, FragmentB::kElements>;
-
-  /// Index in fargments for the big and small part
-  static int const kBigIndex = 0;
-  static int const kSmallIndex = 1;
-
-  /// Iterates over the C operand in memory
-  using IteratorC = MmaTensorOpAccumulatorTileIterator<
-     MatrixShape<Shape::kM, Shape::kN>, ElementC, LayoutC,
-     typename ArchMmaOperator::Shape, typename Policy::OpDelta>;
-
-  /// Storage for C tile
-  using FragmentC = typename IteratorC::Fragment;
-
-  /// Number of mma operations performed
-  using MmaIterations = MatrixShape<
-    (Shape::kM + ArchMmaOperator::Shape::kM - 1) / ArchMmaOperator::Shape::kM,
-    (Shape::kN + ArchMmaOperator::Shape::kN - 1) / ArchMmaOperator::Shape::kN
-  >;
-
-public:
-
-  /// Underlying matrix multiply operator (concept: arch::Mma)
-  ArchMmaOperator mma;
-
-public:
-
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_DEVICE
-  MmaTensorOpFastF32() {}
-
-  /// Performs a warp-level matrix multiply-accumulate operation
-  CUTLASS_DEVICE
-  void operator()(
-    FragmentC &D, 
-    TransformedFragmentA const &A, 
-    TransformedFragmentB const &B, 
-    FragmentC const &C
-  ) const {
-
-    AccessTypeFragmentA const *ptr_A = reinterpret_cast<AccessTypeFragmentA const*>(&A);
-    AccessTypeFragmentB const *ptr_B = reinterpret_cast<AccessTypeFragmentB const*>(&B);
-
-    //
-    // Accumulate in place
-    //
-    D = C;
-    
-    mma_operator(D, ptr_A[kSmallIndex], ptr_B[kBigIndex], D);
-
-    mma_operator(D, ptr_A[kBigIndex], ptr_B[kSmallIndex], D);
-
-    mma_operator(D, ptr_A[kBigIndex], ptr_B[kBigIndex], D);
-
-    if (MmaFastF32::kPrecision == TensorFloat32Op::k4xTF32)
-      mma_operator(D, ptr_A[kSmallIndex], ptr_B[kSmallIndex], D);
-  }
-
-  /// Performs a warp-level matrix multiply-accumulate operation
-  CUTLASS_DEVICE
-  void mma_operator(
-    FragmentC &D, 
-    AccessTypeFragmentA const &A, 
-    AccessTypeFragmentB const &B, 
-    FragmentC const &C
-  ) const {
-
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-
-      using MmaOperandA = typename ArchMmaOperator::FragmentA;
-      using MmaOperandB = typename ArchMmaOperator::FragmentB;
-      using MmaOperandC = typename ArchMmaOperator::FragmentC;
-
-      MmaOperandA const *ptr_A = reinterpret_cast<MmaOperandA const *>(&A);
-      MmaOperandB const *ptr_B = reinterpret_cast<MmaOperandB const *>(&B);
-      MmaOperandC *ptr_D = reinterpret_cast<MmaOperandC *>(&D);
-
-      // Serpentine visitation order maximizing reuse of Ra
-      CUTLASS_PRAGMA_UNROLL
-      for (int m = 0; m < MmaIterations::kRow; ++m) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int n = 0; n < MmaIterations::kColumn; ++n) {
-
-          // This allows to reuse of Rb when at serpentine turns
-          int n_serpentine = ((m % 2) ? (MmaIterations::kColumn - 1 - n) : n);
-
-          if (AccumulatorsInRowMajor) {  // matrix B is reordered
-            mma(
-              ptr_D[n_serpentine + m * MmaIterations::kColumn],
-              ptr_A[m],
-              ptr_B[n_serpentine],
-              ptr_D[n_serpentine + m * MmaIterations::kColumn]);
-          } else {
-            mma(
-              ptr_D[m + n_serpentine * MmaIterations::kRow],
-              ptr_A[m],
-              ptr_B[n_serpentine],
-              ptr_D[m + n_serpentine * MmaIterations::kRow]);
-          }
-        } // end n loop
-      } // end m loop
-    #else
-      assert(0);
-    #endif
-  }
-
-  /// Transform the mma operands to the required types
-  CUTLASS_DEVICE
-  void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B,
-                 FragmentA const &A, FragmentB const &B) const {
-
-    //
-    // Define conversions from source type to instruction type
-    //
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-      
-      detail::ConvertAndPackAccurateF32<
-        FragmentA::kElements / 2,
-        MmaFastF32::kRoundBigA,
-        MmaFastF32::kRoundSmallA> convert_A;
-      
-      detail::ConvertAndPackAccurateF32<
-        FragmentB::kElements,
-        MmaFastF32::kRoundBigB,
-        MmaFastF32::kRoundSmallB> convert_B;
-      
-      Array<typename ArchMmaOperator::ElementB, FragmentB::kElements> *ptr_dst_B = 
-        reinterpret_cast<Array<typename ArchMmaOperator::ElementB, FragmentB::kElements> *>(&dst_B);
-      
-      convert_B(B, ptr_dst_B[0], ptr_dst_B[1]);
-
-      Array<typename ArchMmaOperator::ElementA, FragmentA::kElements / 2> *ptr_dst_A =
-        reinterpret_cast<Array<typename ArchMmaOperator::ElementA, FragmentA::kElements / 2> *>(&dst_A);
-      
-      Array<ElementA, FragmentA::kElements / 2> const *ptr_A = 
-        reinterpret_cast<Array<ElementA, FragmentA::kElements / 2> const *>(&A);
-      
-      convert_A(ptr_A[0], ptr_dst_A[0], ptr_dst_A[2]);
-      
-      convert_A(ptr_A[1], ptr_dst_A[1], ptr_dst_A[3]);
-    #else
-      assert(0);
-    #endif
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h
deleted file mode 100755
index 32460b629..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h
+++ /dev/null
@@ -1,559 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief This defines a "fragment" iterator for visiting the fragments of a warp tile
-      that participate in one warp-level mma operation.
-
-      Typically, this is used to access the accumulator tile/fragement of a warp-level mma operation.
-      The accumulator tile is then partitioned into smaller tiles/fragments that can be fed into 
-      next warp-level mma operation. 
-
-      This iterator is necessary to accomplish warp-level mma fusion where the accumulator tile is 
-      reused as multiplicand tile for the next mma.
-
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/array.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/numeric_conversion.h"
-
-namespace cutlass {
-namespace gemm {
-namespace warp {
-
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Size of the accumulation tile shape (concept: MatrixShape)
-    typename AccumulatorShape_,
-    /// KBlocks columns to compute residual
-    int KBlocksColumn_,
-    /// Accumulator Element type
-    typename ElementAccumulator_,    
-    /// Element type
-    typename Element_,
-    /// Layout of operand in memory
-    typename Layout_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Output operation on the fragment
-    typename OutputOp_>
-class MmaTensorOpFragmentIterator;
-
-
-// Partial specialization for col-major accumulator tile
-
-template <
-    /// Shape of warp tile to load (concept: MatrixShape)
-    typename Shape_,
-    /// Shape of the warp accumulation tile (concept: MatrixShape)
-    typename AccumulatorShape_,
-    /// KBlocks columns to compute residual
-    int KBlocksColumn_,    
-    /// Accumulator Element type
-    typename ElementAccumulator_,
-    /// Element type
-    typename Element_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Output operation on fragment
-    typename OutputOp_>
-class MmaTensorOpFragmentIterator<Shape_, AccumulatorShape_, KBlocksColumn_, ElementAccumulator_, Element_,
-                                         cutlass::layout::ColumnMajor,
-                                         InstructionShape_, OutputOp_> {
- public:
-
-  /// Shape of warp tile to load (concept: MatrixShape)
-  using Shape = Shape_;
-    
-  /// Shape of the warp accumulation tile (concept: MatrixShape)
-  using AccumulatorShape = AccumulatorShape_;
-
-  /// KBlocks columns to compute residual
-  static int const kKBlockColumn = KBlocksColumn_;
-
-  /// Accumulator Element type
-  using ElementAccumulator = ElementAccumulator_;
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::ColumnMajor;
-
-  /// Shape of one matrix product operation (concept: MatrixShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Output operation on fragment
-  using OutputOp = OutputOp_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// Internal structure of iterator - made public to enable introspection
-  struct Policy {
-    static_assert(
-        !(Shape::kRow % InstructionShape::kM) &&
-            !(Shape::kColumn % InstructionShape::kN),
-        "Shape of warp-level Mma must be divisible by operator shape.");
-    static_assert(
-        AccumulatorShape::kRow == Shape::kRow, 
-        "Rows of Warp Accumulator must be the same as rows of warp");
-    static_assert(
-        !(AccumulatorShape::kColumn % Shape::kColumn),
-        "Shape of Warp Accumulator must be divisible by warp shape.");
-    static_assert(
-        !(kKBlockColumn % Shape::kColumn),
-        "KBlock size must be divisible by warp shape.");
-
-    /// Number of times this iterator can be incremented
-    static int const kIterations = AccumulatorShape::kCount / Shape::kCount;
-  };
-
-private:
-
-  static int const kElementsPerAccess = InstructionShape::kM * InstructionShape::kN / kThreads;
-
-  /// Number of mma operations performed by a warp
-  using MmaIterations = MatrixShape<Shape::kRow / InstructionShape::kM,
-                                    Shape::kColumn / InstructionShape::kN>;
-  /// Number of mma operations performed by the entire accumulator
-  using AccumulatorIterations = MatrixShape<AccumulatorShape::kRow / InstructionShape::kM,
-                                              AccumulatorShape::kColumn / InstructionShape::kN>;
-
-  /// Number of K iterations    
-  static int const kKBlockIterations = (AccumulatorShape::kColumn + kKBlockColumn - 1) / kKBlockColumn;
-  static int const kResidualColumn = AccumulatorShape::kColumn - (kKBlockIterations - 1) * kKBlockColumn;
-  static int const kKBlockColumnIterations = kKBlockColumn / Shape::kColumn 
-                                     * (AccumulatorShape::kRow / Shape::kRow);
-  static int const kResidualIndex = kResidualColumn / Shape::kColumn
-                                     * (AccumulatorShape::kRow / Shape::kRow);
-
-public:
-
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
-  /// This is the fragment size produced by one access of the iterator.
-  using Fragment = Array<Element, Shape::kCount / kThreads>;
-
-  /// Accumulator Fragment object
-  using AccumulatorFragment = Array<ElementAccumulator, AccumulatorShape::kCount / kThreads>;
-
-  /// Scale Bias Element Type
-  using ElementScaleBias = typename OutputOp::ElementCompute;
-
-  /// Scale Bias Fragment object
-  using ScaleBiasFragment = Array<ElementScaleBias, InstructionShape::kM * InstructionShape::kK / kThreads>;
-
-
-private:
-
-  /// Internal access type
-  using AccessType = Array<ElementAccumulator, kElementsPerAccess>;
-  using FragmentAccessType = Array<Element, kElementsPerAccess>;
-
-  using ScaleBiasAccessType = Array<ElementScaleBias, kElementsPerAccess>;
-
-private:
-  //
-  // Data members
-  //
-
-  /// Accumulator tile
-  AccessType const *accumulators_;
-
-  /// Internal index
-  int index_;
-
-  /// Used to access residual tile first
-  bool is_residual_tile_;
-
-public:
-  /// Constructs an iterator
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpFragmentIterator(AccumulatorFragment const &accum)
-      : accumulators_(reinterpret_cast<AccessType const *>(&accum)),
-        index_(0), is_residual_tile_(true) {}
-
-  /// Add offset
-  CUTLASS_HOST_DEVICE
-  void add_offset(int index_offset) {
-    index_ += index_offset; 
-    if(is_residual_tile_ && index_ >= kKBlockColumnIterations) {
-      index_ = index_ - kKBlockColumnIterations + kResidualIndex;
-      is_residual_tile_ = false;
-    }
-  }
-
-  /// Increments
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpFragmentIterator &operator++() {
-    add_offset(1);
-    return *this;
-  }
-
-  /// Decrements
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpFragmentIterator &operator--() {
-    add_offset(-1);
-    return *this;
-  }
-
-  /// Loads a fragment from the referenced part of the accumulator tile
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag, OutputOp output_op) const {
-
-    if (output_op.is_source_needed()) //beta must be zero
-      assert(0);
-
-    FragmentAccessType *frag_ptr = reinterpret_cast<FragmentAccessType *>(&frag);
-
-    int index = index_ * MmaIterations::kCount;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < MmaIterations::kColumn; n++) {
-      for (int m = 0; m < MmaIterations::kRow; m++) {
-        int accumulator_access_offset = 
-            n * AccumulatorIterations::kRow + m + index;
-            
-        frag_ptr[m * MmaIterations::kColumn + n].clear();
-        if(!(is_residual_tile_ && index_ >= kResidualIndex))
-            frag_ptr[m * MmaIterations::kColumn + n] = output_op(accumulators_[accumulator_access_offset]);
-      }
-    }
-  }
-
-  /// Loads a fragment from the referenced part of the accumulator tile
-  /// Then apply per-channel scale and bias
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag, ScaleBiasFragment &scale, 
-        ScaleBiasFragment &bias, OutputOp output_op) const {
-
-    if (output_op.is_source_needed()) //beta must be zero
-      assert(0);
-
-    FragmentAccessType *frag_ptr = reinterpret_cast<FragmentAccessType *>(&frag);
-    ScaleBiasAccessType * scale_ptr = reinterpret_cast<ScaleBiasAccessType *>(&scale);
-    ScaleBiasAccessType * bias_ptr = reinterpret_cast<ScaleBiasAccessType *>(&bias);
-
-    int index = index_ * MmaIterations::kCount;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < MmaIterations::kColumn; n++) {
-      for (int m = 0; m < MmaIterations::kRow; m++) {
-        int accumulator_access_offset = 
-            n * AccumulatorIterations::kRow + m + index;
-            
-        frag_ptr[m * MmaIterations::kColumn + n].clear();
-        if(!(is_residual_tile_ && index_ >= kResidualIndex))
-            frag_ptr[m * MmaIterations::kColumn + n] = 
-                output_op(accumulators_[accumulator_access_offset], 
-                    scale_ptr[n] /*scale*/, bias_ptr[n] /*bias*/);
-      }
-    }
-  }
-
-
-
-};
-
-// Partial specialization for row-major accumulator tile
-
-template <
-    /// Shape of warp tile to load (concept: MatrixShape)
-    typename Shape_,
-    /// Shape of the warp accumulation tile (concept: MatrixShape)
-    typename AccumulatorShape_,
-    /// KBlocks columns to compute residual
-    int KBlocksColumn_,    
-    /// Accumulator Element type
-    typename ElementAccumulator_,    
-    /// Element type
-    typename Element_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Output operation on fragment
-    typename OutputOp_>
-class MmaTensorOpFragmentIterator<Shape_, AccumulatorShape_, KBlocksColumn_, ElementAccumulator_, Element_,
-                                         cutlass::layout::RowMajor,
-                                         InstructionShape_, OutputOp_> {
- public:
-
-  /// Shape of warp tile to load (concept: MatrixShape)
-  using Shape = Shape_;
-    
-  /// Shape of the warp accumulation tile (concept: MatrixShape)
-  using AccumulatorShape = AccumulatorShape_;
-
-  /// KBlocks columns to compute residual
-  static int const kKBlockColumn = KBlocksColumn_;
-
-  /// Accumulator Element type
-  using ElementAccumulator = ElementAccumulator_;
-
-  /// Element type
-  using Element = Element_;
-  
-  /// Layout of source tile
-  using Layout = cutlass::layout::RowMajor;
-
-  /// Shape of one matrix product operation (concept: MatrixShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Output operation on fragment
-  using OutputOp = OutputOp_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// Internal structure of iterator - made public to enable introspection
-  struct Policy {
-    static_assert(
-        !(Shape::kRow % InstructionShape::kM) &&
-            !(Shape::kColumn % InstructionShape::kN),
-        "Shape of warp-level Mma must be divisible by operator shape.");
-    static_assert(
-        AccumulatorShape::kRow == Shape::kRow, 
-        "Rows of Warp Accumulator must be the same as rows of warp");
-    static_assert(
-        !(AccumulatorShape::kColumn % Shape::kColumn),
-        "Shape of Warp Accumulator must be divisible by warp shape.");
-    static_assert(
-        !(kKBlockColumn % Shape::kColumn),
-        "KBlock size must be divisible by warp shape.");
-
-    /// Number of times this iterator can be incremented
-    static int const kIterations = AccumulatorShape::kCount / Shape::kCount;
-  };
-
-private:
-
-  static int const kRowsPerIteration = 8;
-  static int const kColumnsPerIteration = 16;
-  static int const kElementsPerIteration = kRowsPerIteration * InstructionShape::kN / kThreads;
-  static int const kElementsPerAccess = kRowsPerIteration * kColumnsPerIteration / kThreads;
-  static int const kIterationsPerAccess = kElementsPerAccess / kElementsPerIteration;
-  
-  // Number of iterations per actual instruction
-  static int const kIterationsPerInstruction = InstructionShape::kM / kRowsPerIteration;
-
-  static int const kAccessStride = kIterationsPerInstruction;
-
-  /// Number of mma operations performed by a warp
-  using MmaIterations = MatrixShape<Shape::kRow / InstructionShape::kM,
-                                    Shape::kColumn / InstructionShape::kN>;
-  /// Number of mma operations performed by the entire accumulator
-  using AccumulatorIterations = MatrixShape<AccumulatorShape::kRow / InstructionShape::kM,
-                                              AccumulatorShape::kColumn / InstructionShape::kN>;
-
-  /// Number of Accesses in a warp
-  using AccessIterations = MatrixShape<MmaIterations::kRow * kIterationsPerInstruction, 
-                                        MmaIterations::kColumn / kIterationsPerAccess>;
-
-  /// Number of K iterations    
-  static int const kKBlockIterations = (AccumulatorShape::kColumn + kKBlockColumn - 1) / kKBlockColumn;
-  static int const kResidualColumn = AccumulatorShape::kColumn - (kKBlockIterations - 1) * kKBlockColumn;
-  static int const kKBlockColumnIterations = kKBlockColumn / Shape::kColumn;
-  static int const kResidualIndex = kResidualColumn / Shape::kColumn;
-
-public:
-
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
-  /// This is the fragment size produced by one access of the iterator.
-  using Fragment = Array<Element, Shape::kCount / kThreads>;
-
-  /// Accumulator Fragment object
-  using AccumulatorFragment = Array<ElementAccumulator, AccumulatorShape::kCount / kThreads>;
-
-  /// Scale Bias Element Type
-  using ElementScaleBias = typename OutputOp::ElementCompute;
-
-  /// Scale Bias Fragment object
-  using ScaleBiasFragment = Array<ElementScaleBias, InstructionShape::kM * InstructionShape::kK / kThreads>;
-
-
-private:
-
-  /// Internal access type
-  using AccessType = Array<ElementAccumulator, kElementsPerIteration>;
-  using FragmentAccessType = Array<Element, kElementsPerIteration>;
-  using ScaleBiasAccessType = Array<ElementScaleBias, kElementsPerIteration>;
-
-private:
-  //
-  // Data members
-  //
-
-  /// Accumulator tile
-  AccessType const *accumulators_;
-
-  /// Internal index
-  int index_;
-
-  /// Used to access residual tile first
-  bool is_residual_tile_;
-
-public:
-  /// Constructs an iterator
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpFragmentIterator(AccumulatorFragment const &accum)
-      : accumulators_(reinterpret_cast<AccessType const *>(&accum)),
-        index_(0), is_residual_tile_(true) {}
-
-  /// Add offset
-  CUTLASS_HOST_DEVICE
-  void add_offset(int index_offset) {
-    index_ += index_offset; 
-    if(is_residual_tile_ && index_ >= kKBlockColumnIterations) {
-      index_ = index_ - kKBlockColumnIterations + kResidualIndex;
-      is_residual_tile_ = false;
-    }
-  }
-
-  /// Increments
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpFragmentIterator &operator++() {
-    add_offset(1);
-    return *this;
-  }
-
-  /// Decrements
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpFragmentIterator &operator--() {
-    add_offset(-1);
-    return *this;
-  }
-
-  CUTLASS_HOST_DEVICE
-  void set_index(int idx) {
-    index_ = idx;
-  }
-
-  /// Loads a fragment from the referenced part of the accumulator tile
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag, OutputOp output_op) const {
-
-    if (output_op.is_source_needed()) //beta must be zero
-      assert(0);
-
-    FragmentAccessType *frag_ptr = reinterpret_cast<FragmentAccessType *>(&frag);
-
-    int index = index_ * AccessIterations::kCount;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < AccessIterations::kCount; i++) {
-
-      int accumulator_access_offset = index / AccessIterations::kCount * (MmaIterations::kColumn * kIterationsPerInstruction) +
-                                    (index % AccessIterations::kCount) / (AccessIterations::kColumn * kIterationsPerInstruction) *
-                                    AccumulatorIterations::kColumn * kIterationsPerInstruction +
-                                    (index % (AccessIterations::kColumn * kIterationsPerInstruction)) / kIterationsPerInstruction *
-                                    (kIterationsPerInstruction * kIterationsPerAccess) +
-                                    (index % kIterationsPerInstruction);
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < kIterationsPerAccess; j++) {
-  
-        frag_ptr[i*kIterationsPerAccess + j].clear();
-        if(!(is_residual_tile_ && index_ >= kResidualIndex))
-              frag_ptr[i*kIterationsPerAccess + j] = output_op(accumulators_[accumulator_access_offset + j * kAccessStride]);
-      }
-      index++;
-    }
-  }
-
-  /// Loads a fragment from the referenced part of the accumulator tile
-  /// Then apply per-channel scale and bias
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag, ScaleBiasFragment &scale, 
-        ScaleBiasFragment & bias, OutputOp output_op) const {
-
-    if (output_op.is_source_needed()) //beta must be zero
-      assert(0);
-
-    FragmentAccessType *frag_ptr = reinterpret_cast<FragmentAccessType *>(&frag);
-    ScaleBiasAccessType * scale_ptr = reinterpret_cast<ScaleBiasAccessType *>(&scale);
-    ScaleBiasAccessType * bias_ptr = reinterpret_cast<ScaleBiasAccessType *>(&bias);
-
-    int index = index_ * AccessIterations::kCount;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < AccessIterations::kCount; i++) {
-
-      int accumulator_access_offset = index / AccessIterations::kCount * (MmaIterations::kColumn * kIterationsPerInstruction) +
-                                    (index % AccessIterations::kCount) / (AccessIterations::kColumn * kIterationsPerInstruction) *
-                                    AccumulatorIterations::kColumn * kIterationsPerInstruction +
-                                    (index % (AccessIterations::kColumn * kIterationsPerInstruction)) / kIterationsPerInstruction *
-                                    (kIterationsPerInstruction * kIterationsPerAccess) +
-                                    (index % kIterationsPerInstruction);
-
-      int scale_bias_offset = (index 
-                    % (kIterationsPerInstruction * AccessIterations::kColumn))
-                    * kIterationsPerAccess;
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < kIterationsPerAccess; j++) {
-
-  
-        frag_ptr[i*kIterationsPerAccess + j].clear();
-        if(!(is_residual_tile_ && index_ >= kResidualIndex))
-              frag_ptr[i*kIterationsPerAccess + j] = output_op(
-                    accumulators_[accumulator_access_offset + j * kAccessStride], 
-                    scale_ptr[scale_bias_offset + j], bias_ptr[scale_bias_offset + j]);
-      }
-      index++;
-    }
-  }
-
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace gemm
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_policy.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_policy.h
deleted file mode 100755
index 0a768caef..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_policy.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Policy describing implementation details of warp-level GEMM targeting Tensor Cores.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/gemm/gemm.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace warp {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Policy 
-template <
-  typename Operator_,        ///< hardware instruction(s) performing TensorOp (concept: arch::Mma)
-  typename OpDelta_          ///< distance between operations (concept: MatrixShape)
->
-struct MmaTensorOpPolicy {
-
-  using Operator = Operator_;    ///< hardware instruction(s) performing TensorOp (concept: arch::Mma)
-  using OpDelta = OpDelta_;      ///< distance between operations (concept: MatrixShape)
-  using MmaShape = typename Operator::Shape;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace gemm
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_sm70.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_sm70.h
deleted file mode 100755
index c40790fa8..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_sm70.h
+++ /dev/null
@@ -1,280 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing warp-level matrix multiply-accumulate operations targeting
-      Tensor Cores.
-
-    This is a work in progress.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-
-#include "cutlass/arch/mma.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/warp/mma.h"
-
-#include "cutlass/gemm/warp/mma_tensor_op_policy.h"
-#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace warp {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions.
-template <
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  typename Shape_,
-  /// Data type of A elements
-  typename ElementA_,
-  /// Layout of A matrix (concept: MatrixLayout)
-  typename LayoutA_,
-  /// Data type of B elements
-  typename ElementB_,
-  /// Layout of B matrix (concept: MatrixLayout)
-  typename LayoutB_,
-  /// Element type of C matrix
-  typename ElementC_,
-  /// Layout of C matrix (concept: MatrixLayout)
-  typename LayoutC_,
-  /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
-  typename Policy_,
-  /// Used for partial specialization
-  typename Enable = bool
->
-class MmaVoltaTensorOp {
-public:
-  /// Shape of warp-level matrix operation (concept: GemmShape)
-  using Shape = Shape_;
-
-  /// Data type of multiplicand A
-  using ElementA = ElementA_;
-
-  /// Layout of multiplicand A
-  using LayoutA = LayoutA_;
-
-  /// Data type of multiplicand B
-  using ElementB = ElementB_;
-
-  /// Layout of multiplicand B
-  using LayoutB = LayoutB_;
-
-  /// Data type of accumulator matrix C
-  using ElementC = ElementC_;
-
-  /// Layout of accumulator matrix C
-  using LayoutC = LayoutC_;
-
-  /// Shape of the warp in units of thread (concept: MmaLanePolicySimt)
-  using Policy = Policy_;
-
-  /// Indicates class of matrix operator
-  using OperatorClass = arch::OpClassTensorOp;
-
-  /// Architecture tag
-  using ArchTag = arch::Sm70;
-
-  /// Underlying matrix multiply operator (concept: arch::Mma)
-  using ArchMmaOperator = typename Policy::Operator;
-
-  /// Indicates math operator 
-  using MathOperator = typename ArchMmaOperator::Operator;
-  
-  /// Underlying instruction shape
-  using InstructionShape = typename ArchMmaOperator::Shape;
-
-  /// Complex transform on A operand
-  static ComplexTransform const kTransformA = ComplexTransform::kNone;
-
-  /// Complex transform on B operand
-  static ComplexTransform const kTransformB = ComplexTransform::kNone;
-
-  /// Number of threads participating in warp-level matrix product
-  static int const kThreadCount = 32;
-
-  /// interleaved 32x32 tiles
-  using InterleavedTileShape = GemmShape<32, 32, 4>;
-
-  static_assert(!(Shape::kM % InterleavedTileShape::kM) &&
-                !(Shape::kN % InterleavedTileShape::kN),
-                "Shape must be a multiple of InterleavedTileShape.");
-public:
-
-  /// Iterates over the A operand in memory
-  using IteratorA = MmaVoltaTensorOpMultiplicandTileIterator<
-    MatrixShape<Shape::kM, Shape::kK>,
-    Operand::kA,
-    ElementA,
-    LayoutA,
-    MatrixShape<
-      ArchMmaOperator::Shape::kM,
-      ArchMmaOperator::Shape::kK
-    >,
-    Policy::OpDelta::kRow,
-    kThreadCount
-  >;
-
-  /// Storage for A tile
-  using FragmentA = typename IteratorA::Fragment;
-
-  /// Iterates over the B operand in memory
-  using IteratorB = MmaVoltaTensorOpMultiplicandTileIterator<
-    MatrixShape<Shape::kK, Shape::kN>,
-    Operand::kB,
-    ElementB,
-    LayoutB,
-    MatrixShape<
-      ArchMmaOperator::Shape::kK,
-      ArchMmaOperator::Shape::kN
-    >,
-    Policy::OpDelta::kRow,
-    kThreadCount
-  >;
-
-  /// Storage for B tile
-  using FragmentB = typename IteratorB::Fragment;
-
-  /// Iterates over the C operand in memory
-  using IteratorC = MmaVoltaTensorOpAccumulatorTileIterator<
-    MatrixShape<Shape::kM, Shape::kN>,
-    ElementC,
-    LayoutC,
-    typename ArchMmaOperator::Shape,
-    typename Policy::OpDelta
-  >;
-
-  /// Storage for C tile
-  using FragmentC = typename IteratorC::Fragment;
-
-private:
-
-  static_assert(
-    !(Shape::kM % ArchMmaOperator::Shape::kM) && 
-    !(Shape::kN % ArchMmaOperator::Shape::kN),
-    "Shape of warp-level Mma must be divisible by operator shape.");
-
-  /// Number of mma operations performed
-  using MmaIterations = MatrixShape<
-    InterleavedTileShape::kM / ArchMmaOperator::Shape::kM,
-    InterleavedTileShape::kN / ArchMmaOperator::Shape::kN
-  >;
-  using TileIterations = MatrixShape<
-    Shape::kM / InterleavedTileShape::kM,
-    Shape::kN / InterleavedTileShape::kN
-  >;
-
-  // Whether matrix B is reordered
-  bool reorder_B_;
-
-public:
-
-  /// Underlying matrix multiply operator (concept: arch::Mma)
-  ArchMmaOperator mma;
-
-public:
-
-  //
-  // Methods
-  //
-  
-  /// Ctor
-  CUTLASS_DEVICE
-  MmaVoltaTensorOp() {}
-
-  /// Performs a warp-level matrix multiply-accumulate operation
-  CUTLASS_DEVICE
-  void operator()(
-    FragmentC &D, 
-    FragmentA const &A, 
-    FragmentB const &B, 
-    FragmentC const &C)  {
-
-    using MmaOperandA = typename ArchMmaOperator::FragmentA;
-    using MmaOperandB = typename ArchMmaOperator::FragmentB;
-    using MmaOperandC = typename ArchMmaOperator::FragmentC;
-
-    D = C;
-
-    MmaOperandA const *ptr_A = reinterpret_cast<MmaOperandA const *>(&A);
-    MmaOperandB const *ptr_B = reinterpret_cast<MmaOperandB const *>(&B);
-    MmaOperandC *ptr_D = reinterpret_cast<MmaOperandC *>(&D);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int outer_col = 0; outer_col < TileIterations::kColumn; ++outer_col) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int inner_col = 0; inner_col < MmaIterations::kColumn; ++inner_col) {
-        CUTLASS_PRAGMA_UNROLL
-        for (int outer_row = 0; outer_row < TileIterations::kRow; ++outer_row) {
-          CUTLASS_PRAGMA_UNROLL
-
-          for (int inner_row = 0; inner_row < MmaIterations::kRow; ++inner_row) {
-      
-            int op_col = inner_col + MmaIterations::kColumn * outer_col;
-
-            // Column-major serpentine sequence to maximize reuse of A operand.
-            int inner_row_serp = inner_row;
-            int outer_row_serp = outer_row;
-            if (op_col & 1) {
-              inner_row_serp = MmaIterations::kRow - inner_row - 1;
-              outer_row_serp = TileIterations::kRow - outer_row - 1;
-            }
-            int op_row = inner_row_serp + MmaIterations::kRow * outer_row_serp;
-            int op_idx = inner_row_serp + MmaIterations::kRow * 
-                         (inner_col + MmaIterations::kColumn * 
-                          (outer_row_serp + TileIterations::kRow * outer_col));
-            mma(
-              ptr_D[op_idx],
-              ptr_A[op_row],
-              ptr_B[op_col],
-              ptr_D[op_idx]);
-
-          }
-        }
-      }
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace gemm
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_access_iterator.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_access_iterator.h
deleted file mode 100755
index 4588efb98..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_access_iterator.h
+++ /dev/null
@@ -1,362 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines iterators used by warp-level matrix multiply operations targeting Tensor Cores.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/array.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/matrix_shape.h"
-
-#include "cutlass/arch/memory_sm75.h"
-#include "cutlass/gemm/gemm.h"
-
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor_op_multiplicand_sm80.h"
-
-#include "cutlass/platform/platform.h"
-#include "cutlass/fast_math.h"
-
-#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace warp {
-
-
-/// Tile access iterator
-/// Each iteration acess in the tile is
-/// used as multiplicand for one
-/// warp-level matrix multiplication
-template <
-    /// Size of the tile (concept: MatrixShape)
-    typename Shape_,
-    /// Operand identity
-    Operand Operand_,
-    /// Data type of A elements
-    typename Element_,
-    /// Layout of operand
-    typename Layout_,
-    /// Shape of one matrix production operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Delta between *MMA operations (in units of *MMA operations, concept:
-    /// MatrixShape)
-    int OpDelta_,
-    /// Number of threads participating in one matrix operation
-    int Threads = 32,
-    /// Enable Residual Support
-    bool EnableResidual = false,
-    /// Number of partitions along K dimension
-    int PartitionsK_ = 1
->
-class MmaTensorOpMultiplicandTileAccessIterator {
- public:
-
-  /// Shape of tile to load (concept: MatrixShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand_;
-
-  /// Basic check
-  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
-    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of source tile
-  using Layout = Layout_;
-
-  /// Shape of one matrix product operation (concept: MatrixShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
-  static int const kOpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Number of elements accessed per Shared Memory load
-  static int const kElementsPerAccess = 
-    (sizeof_bits<Element>::value >= 32 ? 1 : 32 / sizeof_bits<Element>::value);
-
-  using InstructionCount = MatrixShape<
-    Shape::kRow / InstructionShape::kRow,
-    Shape::kColumn / InstructionShape::kColumn
-  >;
-
-  static int const kIterations = (kOperand == Operand::kA) ? 
-    InstructionCount::kColumn : InstructionCount::kRow;
-
-
-public:
-
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = Array<
-    Element, 
-    (kOperand == Operand::kA) ? 
-      (Shape::kRow * InstructionShape::kColumn / kThreads) : 
-      (Shape::kColumn * InstructionShape::kRow / kThreads)
-  >;
-
-  /// Memory access type
-  using AccessType = AlignedArray<Element, kElementsPerAccess>;
-
-private:
-
-  /// Underlying tensor reference
-  TensorRef ref_;
-
-  /// Extent of tensor
-  MatrixCoord extent_;
-
-  /// Origin
-  MatrixCoord origin_;
-
-  /// Used to load residual tile
-  bool is_residual_;
-  
-  /// residual offset of each thread
-  TensorCoord residual_offset_;
-
-  /// Iterations in a tile
-  int iterations_;
-
-public:
-  
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileAccessIterator(
-    TensorRef const &ref, 
-    TensorCoord extent,
-    int lane_id
-  ): ref_(ref), extent_(extent), is_residual_(false), iterations_(0) {
-  
-    if (kOperand == Operand::kA) {
-      origin_ = MatrixCoord(lane_id / 4, (lane_id % 4) * kElementsPerAccess);
-    }
-    else {
-      origin_ = MatrixCoord((lane_id % 4) * kElementsPerAccess, lane_id / 4);
-    }
-
-    ref_.add_coord_offset(origin_);
-
-    if(EnableResidual) {
-      // compute residual offset
-      if (kOperand == Operand::kA) {
-        typename TensorCoord::Index residual_size = 
-          extent_.column() % Shape::kColumn;
-        if(residual_size) {
-          is_residual_ = true;
-          residual_offset_ = make_Coord(0, residual_size);
-        }
-      }
-      else {
-        typename TensorCoord::Index residual_size = 
-          extent_.row() % Shape::kRow;
-        if(residual_size) {
-          is_residual_ = true;
-          residual_offset_ = make_Coord(residual_size, 0);
-        }
-      }
-    }
-  }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileAccessIterator(
-    TensorRef const &ref, 
-    int lane_id
-  ): MmaTensorOpMultiplicandTileAccessIterator(ref,
-    {Shape::kRow, Shape::kColumn}, lane_id) {
-  }
- 
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileAccessIterator &add_tile_offset(TensorCoord const &tile_offset) {
-
-    TensorCoord coord_offset(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn);
-    origin_ += coord_offset;
-
-    ref_.add_coord_offset(coord_offset);
-
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_DEVICE
-  void advance() {
-
-    if(EnableResidual && is_residual_) {
-      is_residual_ = false;
-
-      origin_ += residual_offset_;
-      ref_.add_coord_offset(residual_offset_);
-
-    }
-
-    else {
-      if (kOperand == Operand::kA) {
-        add_tile_offset({0, 1});
-      }
-      else {
-        add_tile_offset({1, 0});
-      }
-    }
-
-    iterations_ = 0;
-  }
-
-  /// increase iterations in a tile
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileAccessIterator & operator++() {
-
-    iterations_++;
-
-    if(iterations_ >= kIterations)
-      advance();
-    
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-
-    int const kWarpShapeDivisibleInner =
-      (kOperand == Operand::kA ? InstructionShape::kColumn : InstructionShape::kRow);
-
-    // Take advantage of Tensor Op's 8 x 4T access pattern
-    int const kAccessesInner = (kWarpShapeDivisibleInner / kElementsPerAccess) / 4;
-
-    AccessType *access_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    if (kOperand == Operand::kA) {
-      int const kTilesPerInstruction = InstructionShape::kRow / 8;
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int inst_m_idx = 0; inst_m_idx < InstructionCount::kRow; ++inst_m_idx) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int inner_idx = 0; inner_idx < kAccessesInner; ++inner_idx) {
-
-          CUTLASS_PRAGMA_UNROLL
-          for (int access_m_idx = 0; access_m_idx < kTilesPerInstruction; ++access_m_idx) {
-            int access_idx = 
-              access_m_idx + kTilesPerInstruction * (inner_idx + kAccessesInner * inst_m_idx);
-            
-            MatrixCoord offset(
-              access_m_idx * 8 + inst_m_idx * InstructionShape::kRow, 
-              inner_idx * 4 * kElementsPerAccess + iterations_ * InstructionShape::kColumn);
-
-            MatrixCoord access_coord = origin_ + offset;
-
-//            if(access_coord.row() < extent_.row() && access_coord.column() < extent_.column()) {
-
-              access_ptr[access_idx] = *reinterpret_cast<AccessType const *>(
-                ref_.data() + ref_.offset(offset));
-//            }
-//            else {
-//              AccessType zero;
-//              zero.clear();
-//              access_ptr[access_idx] = zero;
-//            }
-          }
-        }
-      }
-    }
-    else {
-      CUTLASS_PRAGMA_UNROLL
-      for (int inst_n_idx = 0; inst_n_idx < InstructionCount::kColumn; ++inst_n_idx) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int inner_idx = 0; inner_idx < kAccessesInner; ++inner_idx) {
-          int access_idx = inner_idx + kAccessesInner * inst_n_idx;
-
-          MatrixCoord offset(
-            inner_idx * 4 * kElementsPerAccess + iterations_ * InstructionShape::kRow,
-            inst_n_idx * 8);
-
-          MatrixCoord access_coord = origin_ + offset;
-
-//          if(access_coord.row() < extent_.row() && access_coord.column() < extent_.column()) {
-              
-            access_ptr[access_idx] = *reinterpret_cast<AccessType const *>(
-              ref_.data() + ref_.offset(offset));
-//          }
-//          else {
-//              AccessType zero;
-//              zero.clear();
-//              access_ptr[access_idx] = zero;
-//          }
-        }
-      } 
-    }
-  }
-
-};
-
-
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace gemm
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h
deleted file mode 100755
index e6e6d70f3..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h
+++ /dev/null
@@ -1,4803 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines iterators used by warp-level matrix multiply operations targeting Tensor Cores.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/array.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/matrix_shape.h"
-
-#include "cutlass/arch/memory_sm75.h"
-#include "cutlass/gemm/gemm.h"
-
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor_op_multiplicand_sm75.h"
-
-#include "cutlass/platform/platform.h"
-#include "cutlass/fast_math.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace warp {
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Operand identity
-    Operand Operand,
-    /// Data type of A elements
-    typename Element_,
-    /// Layout of operand
-    typename Layout_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Delta between *MMA operations (in units of *MMA operations, concept:
-    /// MatrixShape)
-    int OpDelta_,
-    /// Number of threads participating in one matrix operation
-    int Threads,
-    /// Number of partitions along K dimension
-    int PartitionsK_ = 1>
-class MmaTensorOpMultiplicandTileIterator;
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// This tile iterator is specialized for 32-thread TensorOps. It uses LDSM to load from shared
-/// memory and therefore must be initialized with a TensorRef to shared memory. 
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: PitchLinearShape)
-    typename Shape_,
-    /// Identifies A or B multiplicand
-    Operand Operand_,
-    /// Data type of elements
-    typename Element_,
-    /// Shape of one matrix product operation (concept: PitchLinearShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions)
-    int OpDelta_,
-    /// Number of partitions along K dimension
-    int PartitionsK_>
-class MmaTensorOpMultiplicandTileIterator<
-    Shape_, Operand_, Element_,
-    cutlass::layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
-                                                   64>,
-    InstructionShape_, OpDelta_, 32, PartitionsK_> {
- public:
-
-  /// Shape of tile to load (concept: PitchLinearShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand_;
-
-  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
-    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::TensorOpMultiplicandCongruous<
-      sizeof_bits<Element_>::value, 64>;
-
-  /// Shape of one matrix product operation (concept: GemmShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
-  static int const kOpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// Number of partitions along K dimension
-  static int const kPartitionsK = PartitionsK_;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Long Index type
-  using StrideIndex = typename TensorRef::Layout::Stride::Index;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Internal structure of iterator - made public to enable introspection
-  struct Policy {
-    static_assert(
-        !(Shape::kContiguous % InstructionShape::kContiguous),
-        "Shape of warp-level Mma must be divisible by operator shape.");
-
-    // Determine number of elements along outer dimension per individual LDSM op
-    static int const kLdsmOpOuter = Layout::kElementsPerAccess;
-    static int const kLdsmOpInner = 8;
-
-    static_assert(!(Shape::kContiguous % kLdsmOpOuter),
-      "Shape of warp-level mma must be divisible by LDSM's fundamental tile size.");
-
-    static_assert(!(Shape::kStrided % kLdsmOpInner), 
-      "Shape of warp-level mma must be divisible by LDSM's fundamental tile size.");
-
-    /// Shape of one individual LDSM instruction
-    static int const LdsmShapeStrided =
-        InstructionShape::kStrided / kLdsmOpInner;
-    static int const LdsmShapeContiguous = 4 / LdsmShapeStrided;
-    using LdsmShape =
-        layout::PitchLinearShape<LdsmShapeContiguous, LdsmShapeStrided>;
-
-    /// Number and arrangement of LDSM instructions
-    using LdsmIterations = layout::PitchLinearShape<
-        Shape::kContiguous / Layout::kElementsPerAccess / LdsmShapeContiguous,
-        1>;
-
-    /// Number of groups for each tile
-    static int const kGroupsPerTile =
-        Shape::kStrided / InstructionShape::kStrided;
-  };
-
-private:
-
-  /// Not working on this feature at the moment.
-  static_assert(kOpDelta == 1,
-    "Alternative arrangements not supported at present.");
-
-  /// Number of internal pointers needed to reference shared memory
-  static int const kPointerCount =
-      Layout::TileShape::kContiguous / Policy::LdsmShape::kContiguous;
-
-  /// Pointer type used for accesses
-  using AccessType = Array<Element, Layout::kElementsPerAccess>;
-
-  /// Internal counter used to jump to next K partition
-  int k_group_idx_;
-
-public:
-
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
- using Fragment =
-     Array<Element, Shape::kContiguous * InstructionShape::kStrided / kThreads>;
-
-private:
-
-  /// Layout object storing stride values
-  StrideIndex stride_;
-
-  /// Shared memory base pointers - not advanced
-  AccessType const *pointer_[kPointerCount];
-
-  /// Byte offset incremented as iterator advances
-  Index byte_offset_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator(): stride_(0), byte_offset_(0) { }
-
-  /// Constructor from TensorRef
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator(
-    TensorRef const &ref, 
-    int lane_id
-  ):
-    stride_(ref.stride(0) / Layout::kElementsPerAccess),
-    byte_offset_(0),
-    k_group_idx_(0) {
-      
-    int quad_pair = (lane_id >> 3);
-    int quad_quad = (lane_id >> 4);
-    int lane_in_quad = (lane_id & 3);
-    int lane_in_quad_pair = (lane_id & 7);
-    int lane_in_quad_quad = (lane_id & 15);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kPointerCount; ++i) {
-      int partition_contiguous_idx = -1;
-      int access_contiguous_idx = -1;
-      int access_strided_idx = -1;
-
-      if (Policy::LdsmShape::kContiguous == 4) {
-        // Matrix multiply 1688 A/B
-        // Q0 Q1 Q2 Q3 (Q stands for 1 8x128bit block).
-        // Four blocks are next to each other in the contiguous dimension.
-        partition_contiguous_idx = ((lane_in_quad_pair >> 2) ^ i);
-        access_contiguous_idx = (quad_pair ^ lane_in_quad);
-        access_strided_idx = lane_in_quad_pair;
-      } else if (Policy::LdsmShape::kContiguous == 2 &&
-                 kOperand == Operand::kA) {
-        // Matrix multiply 16816 A
-        // Q0 Q1
-        // Q2 Q3
-        partition_contiguous_idx = ((lane_in_quad_pair >> 2) ^ (i >> 1));
-        access_contiguous_idx =
-            (((quad_pair & 1) + ((i & 1) << 1)) ^ lane_in_quad);
-        access_strided_idx = lane_in_quad_pair + (lane_id >> 4 << 3);
-      } else if (Policy::LdsmShape::kContiguous == 2 &&
-                 kOperand == Operand::kB) {
-        // Matrix multiply 16816 B
-        // Q0 Q2
-        // Q1 Q3
-        partition_contiguous_idx = ((lane_in_quad_pair >> 2) ^ (i >> 1));
-        access_contiguous_idx = ((quad_quad + ((i & 1) << 1)) ^ lane_in_quad);
-        access_strided_idx = lane_in_quad_quad;
-      } else if (Policy::LdsmShape::kContiguous == 1) {
-        // Matrix multiply 16832.SP B
-        // Q0
-        // Q1
-        // Q2
-        // Q3
-        partition_contiguous_idx = ((lane_in_quad_pair >> 2) ^ (i >> 2));
-        access_contiguous_idx = ((i & 3) ^ lane_in_quad);
-        access_strided_idx = lane_id;
-      }
-
-      int access_contiguous =
-          partition_contiguous_idx * Layout::PartitionShape::kContiguous +
-          access_contiguous_idx;
-
-      int access_strided = access_strided_idx;
-
-      pointer_[i] = reinterpret_cast<AccessType const *>(ref.data()) +
-                    access_contiguous + access_strided * stride_;
-    }
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
-
-    byte_offset_ += offset * sizeof(Element);
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
-
-    int contiguous_offset = tile_offset.contiguous();
-    if (Shape::kContiguous ==
-        Layout::PartitionShape::kContiguous * Layout::kElementsPerAccess) {
-      if (tile_offset.contiguous() % 2) {
-        CUTLASS_PRAGMA_UNROLL
-        for (int i = 0; i < kPointerCount / 2; ++i) {
-          AccessType const *tmp_pointer = pointer_[i];
-          pointer_[i] = pointer_[i + kPointerCount / 2];
-          pointer_[i + kPointerCount / 2] = tmp_pointer;
-        }
-      }
-      contiguous_offset = (tile_offset.contiguous() >> 1) << 1;
-    }
-
-    int offset = (tile_offset.strided() * InstructionShape::kStrided) *
-                     stride_ * Layout::kElementsPerAccess +
-                 contiguous_offset * Shape::kContiguous;
-
-    add_pointer_offset(offset);
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator++() {
-
-    add_tile_offset({0, 1});
-
-    if (kPartitionsK > 1) {
-      ++k_group_idx_;
-      // Jump to next stage
-      if (k_group_idx_ == Policy::kGroupsPerTile) {
-        k_group_idx_ = 0;
-        add_tile_offset(
-            {0, ((kPartitionsK - 1) * Policy::kGroupsPerTile)});
-      }
-    }
-
-    return *this;
-  }
-
-  /// Advances the iterator along the opposite of the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator--() {
-    byte_offset_ -= stride_ * InstructionShape::kStrided * sizeof(Element) *
-                    Layout::kElementsPerAccess;
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
-    add_tile_offset(-tile_offset);
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-
-    load_with_byte_offset(frag, 0);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset in units of bytes
-      Index byte_offset) const {
-
-    Array<unsigned, Policy::LdsmShape::kCount> *fetch_ptr = 
-      reinterpret_cast<Array<unsigned, Policy::LdsmShape::kCount> *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < Policy::LdsmIterations::kStrided; ++s) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < Policy::LdsmIterations::kContiguous; ++c) {
-
-        int access_idx = c + s * Policy::LdsmIterations::kContiguous;
-
-        AccessType const *source_ptr =
-            pointer_[c % kPointerCount] +
-            Layout::TileShape::kContiguous * (c / kPointerCount) +
-            Policy::kLdsmOpInner * Policy::LdsmShape::kStrided * s * stride_;
-
-        char const *source_byte_ptr = reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;
-
-        cutlass::arch::ldsm<layout::ColumnMajor, Policy::LdsmShape::kCount>(
-          fetch_ptr[access_idx],
-          source_byte_ptr
-        );
-      }
-    }
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-    load_with_byte_offset(frag, tile_offset, 0);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index byte_offset) const {
-    Index pointer_offset = 
-      tile_offset.contiguous() * Shape::kContiguous / Layout::kElementsPerAccess + 
-      tile_offset.strided() * InstructionShape::kStrided * stride_;
-
-    byte_offset += sizeof(AccessType) * pointer_offset;
-
-    load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    // no op
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// This tile iterator is specialized for 32-thread MMA.TF32 NT TensorOps. It
-/// uses LDS.32 to load from shared memory and therefore must be initialized
-/// with a TensorRef to shared memory.
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: PitchLinearShape)
-    typename Shape_,
-    /// Identifies A or B multiplicand
-    Operand Operand_,
-    /// Data type of elements
-    typename Element_,
-    /// Shape of one matrix product operation (concept: PitchLinearShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions)
-    int OpDelta_,
-    /// Number of partitions along K dimension
-    int PartitionsK_>
-class MmaTensorOpMultiplicandTileIterator<
-    Shape_, Operand_, Element_,
-    cutlass::layout::TensorOpMultiplicandCongruous<32, 32>, InstructionShape_,
-    OpDelta_, 32, PartitionsK_> {
- public:
-  /// Shape of tile to load (concept: PitchLinearShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand_;
-
-  static_assert(kOperand == Operand::kA || kOperand == Operand::kB,
-                "MmaTensorOpMultiplicandIterator may only be instantiated for "
-                "A or B operands to warp-level Mma.");
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::TensorOpMultiplicandCongruous<32, 32>;
-
-  /// Shape of one matrix product operation (concept: GemmShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept:
-  /// MatrixShape)
-  static int const kOpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// Number of partitions along K dimension
-  static int const kPartitionsK = PartitionsK_;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Long Index type
-  using StrideIndex = typename TensorRef::Layout::Stride::Index;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Internal structure of iterator - made public to enable introspection
-  struct Policy {
-    static_assert(
-        !(Shape::kContiguous % InstructionShape::kContiguous),
-        "Shape of warp-level Mma must be divisible by operator shape.");
-
-    // Determine number of elements along outer dimension per individual 32bit
-    // shared memory load op.  Every one warp of 32bit shared memory load loads
-    // 8x4 elements
-    static int const kLdsOpInner = Layout::TileShape::kStrided;
-    static int const kLdsOpOuter = kThreads / kLdsOpInner;
-
-    static_assert(!(Shape::kContiguous % kLdsOpOuter),
-                  "Shape of warp-level mma must be divisible by 32bit "
-                  "fundamental tile size.");
-
-    static_assert(!(Shape::kStrided % kLdsOpInner),
-                  "Shape of warp-level mma must be divisible by 32bit "
-                  "fundamental tile size.");
-
-    /// Number of 32 bit shared memory load instructions needed by one MMA instruction
-    /// 1688  A 2x2
-    /// 1688  B 1x2
-    /// 16816 B 1x4
-    static int const LdsShapeContiguous =
-        InstructionShape::kContiguous / kLdsOpOuter;
-    static int const LdsShapeStrided = InstructionShape::kStrided / kLdsOpInner;
-    using LdsShape =
-        layout::PitchLinearShape<LdsShapeContiguous, LdsShapeStrided>;
-
-    /// Number and arrangement of LDS instructions
-    using LdsIterations = layout::PitchLinearShape<
-        Shape::kContiguous / LdsShapeContiguous / kLdsOpOuter, 1>;
-
-    /// Number of groups for each tile
-    static int const kGroupsPerTile =
-        Shape::kStrided / InstructionShape::kStrided;
-  };
-
- private:
-  /// Not working on this feature at the moment.
-  static_assert(kOpDelta == 1,
-                "Alternative arrangements not supported at present.");
-
-  /// Number of internal pointers needed to reference shared memory
-  static int const kPointerCount = Layout::TileShape::kContiguous *
-                                   Layout::kElementsPerAccess /
-                                   Policy::kLdsOpOuter;
-
-  /// Vectorized access is not used
-  static int const kElementsPerAccess = 1;
-
-  /// Pointer type used for accesses
-  using AccessType = Element;
-
-  /// Internal counter used to jump to next K partition
-  int k_group_idx_;
-
- public:
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment =
-     Array<Element, Shape::kContiguous * InstructionShape::kStrided / kThreads>;
-
- private:
-  /// Layout object storing stride values
-  StrideIndex stride_;
-
-  /// Shared memory base pointers - not advanced
-  AccessType const *pointer_[kPointerCount];
-
-  /// Byte offset incremented as iterator advances
-  Index byte_offset_;
-
- public:
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator() : stride_(0), byte_offset_(0) {}
-
-  /// Constructor from TensorRef
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
-      : stride_(ref.stride(0)), byte_offset_(0), k_group_idx_(0) {
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kPointerCount; ++i) {
-      int access_strided = lane_id % Policy::kLdsOpInner;
-      int access_contiguous = (lane_id / Policy::kLdsOpInner) +
-                              (access_strided ^ i) * Policy::kLdsOpOuter;
-
-      pointer_[i] = reinterpret_cast<AccessType const *>(ref.data()) +
-                    access_contiguous + access_strided * stride_;
-    }
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
-    byte_offset_ += offset * sizeof(Element);
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole
-  /// tiles
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_tile_offset(
-      TensorCoord const &tile_offset) {
-    int contiguous_offset = tile_offset.contiguous();
-    if (Shape::kContiguous ==
-        Layout::TileShape::kContiguous * Layout::kElementsPerAccess / 2) {
-      if (tile_offset.contiguous() % 2) {
-        // Matrix multiply 1688 pointer_[0] <=> pointer_[4] pointer_[1] <=> pointer_[5]
-        //           pointer_[2] <=> pointer_[6] pointer_[3] <=> pointer_[7]
-        CUTLASS_PRAGMA_UNROLL
-        for (int i = 0; i < kPointerCount / 2; ++i) {
-          AccessType const *tmp_pointer = pointer_[i];
-          pointer_[i] = pointer_[i + kPointerCount / 2];
-          pointer_[i + kPointerCount / 2] = tmp_pointer;
-        }
-      }
-      contiguous_offset = (tile_offset.contiguous() >> 1) << 1;
-    }
-
-    int offset = (tile_offset.strided() * InstructionShape::kStrided) * stride_ +
-                 contiguous_offset * Shape::kContiguous;
-
-    add_pointer_offset(offset);
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator &operator++() {
-    add_tile_offset({0, 1});
-
-    if (kPartitionsK > 1) {
-      ++k_group_idx_;
-      // Jump to next stage
-      if (k_group_idx_ == Policy::kGroupsPerTile) {
-        k_group_idx_ = 0;
-        add_tile_offset(
-            {0, ((kPartitionsK - 1) * Policy::kGroupsPerTile)});
-      }
-    }
-
-    return *this;
-  }
-
-  /// Advances the iterator along the opposite of the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &operator--() {
-    byte_offset_ -= stride_ * InstructionShape::kStrided * sizeof(Element) *
-                    kElementsPerAccess;
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of
-  ///< the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator &operator+=(
-      TensorCoord const &tile_offset) {
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of
-  ///< the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator &operator-=(
-      TensorCoord const &tile_offset) {
-    add_tile_offset(-tile_offset);
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const { load_with_byte_offset(frag, 0); }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset in units of bytes
-      Index byte_offset) const {
-    Element *fetch_ptr = reinterpret_cast<Element *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < Policy::LdsIterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < Policy::LdsIterations::kContiguous; ++c) {
-        CUTLASS_PRAGMA_UNROLL
-        for (int ss = 0; ss < Policy::LdsShape::kStrided; ++ss) {
-          CUTLASS_PRAGMA_UNROLL
-          for (int cc = 0; cc < Policy::LdsShape::kContiguous; ++cc) {
-            int access_idx =
-                cc + (ss + (c + s * Policy::LdsIterations::kContiguous) *
-                               Policy::LdsShape::kStrided) *
-                         Policy::LdsShape::kContiguous;
-            int access_idx_contiguous = cc + c * Policy::LdsShape::kContiguous;
-            int access_idx_strided =
-                (ss + s * Policy::LdsShape::kStrided) * Policy::kLdsOpInner;
-
-            AccessType const *source_ptr =
-                pointer_[access_idx_contiguous % kPointerCount] +
-                Layout::TileShape::kContiguous * Layout::kElementsPerAccess *
-                    (access_idx_contiguous / kPointerCount) +
-                access_idx_strided * stride_;
-
-            char const *source_byte_ptr =
-                reinterpret_cast<char const *>(source_ptr) + byte_offset +
-                byte_offset_;
-
-            fetch_ptr[access_idx] =
-                *reinterpret_cast<Element const *>(source_byte_ptr);
-          }
-        }
-      }
-    }
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-    load_with_byte_offset(frag, tile_offset, 0);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index byte_offset) const {
-    Index pointer_offset =
-        tile_offset.contiguous() * Shape::kContiguous /
-            Layout::kElementsPerAccess +
-        tile_offset.strided() * InstructionShape::kStrided * stride_;
-
-    byte_offset += sizeof(AccessType) * pointer_offset;
-
-    load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    // no op
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// This tile iterator is specialized for 32-thread TensorOps with 64B warp tile
-/// the contiguous dimension. This assumes Threadblock contiguous dimension has
-/// the same size as the warp tile.  It uses LDSM to load from shared
-/// memory and therefore must be initialized with a TensorRef to shared memory.
-///
-/// This specialization can be merged into the general one.  Most code is the same.
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: PitchLinearShape)
-    typename Shape_,
-    /// Identifies A or B multiplicand
-    Operand Operand_,
-    /// Data type of elements
-    typename Element_,
-    /// Shape of one matrix product operation (concept: PitchLinearShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions)
-    int OpDelta_,
-    /// Number of partitions along K dimension
-    int PartitionsK_>
-class MmaTensorOpMultiplicandTileIterator<
-    Shape_, Operand_, Element_,
-    cutlass::layout::TensorOpMultiplicandCongruous<16, 32>,
-    InstructionShape_, OpDelta_, 32, PartitionsK_> {
- public:
-
-  /// Shape of tile to load (concept: PitchLinearShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand_;
-
-  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
-    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
-
-  /// Element type
-  using Element = Element_;
-
-  /// Element number when the layout crosses
-  static int const kCrosswise = 32;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::TensorOpMultiplicandCongruous<
-      sizeof_bits<Element_>::value, kCrosswise>;
-
-  /// Shape of one matrix product operation (concept: GemmShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
-  static int const kOpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// Number of partitions along K dimension
-  static int const kPartitionsK = PartitionsK_;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Long Index type
-  using StrideIndex = typename TensorRef::Layout::Stride::Index;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Internal structure of iterator - made public to enable introspection
-  struct Policy {
-    static_assert(
-        !(Shape::kContiguous % InstructionShape::kContiguous),
-        "Shape of warp-level Mma must be divisible by operator shape.");
-
-    // Determine number of elements along outer dimension per individual LDSM op
-    static int const kLdsmOpOuter = Layout::kElementsPerAccess;
-    static int const kLdsmOpInner = 8;
-
-    static_assert(!(Shape::kContiguous % kLdsmOpOuter),
-      "Shape of warp-level mma must be divisible by LDSM's fundamental tile size.");
-
-    static_assert(!(Shape::kStrided % kLdsmOpInner),
-      "Shape of warp-level mma must be divisible by LDSM's fundamental tile size.");
-
-    /// Shape of one individual LDSM instruction
-    static int const LdsmShapeStrided =
-        InstructionShape::kStrided / kLdsmOpInner;
-    static int const LdsmShapeContiguous = 4 / LdsmShapeStrided;
-    using LdsmShape =
-        layout::PitchLinearShape<LdsmShapeContiguous, LdsmShapeStrided>;
-
-    /// Number and arrangement of LDSM instructions
-    using LdsmIterations = layout::PitchLinearShape<
-        Shape::kContiguous / Layout::kElementsPerAccess / LdsmShapeContiguous,
-        1>;
-
-    /// Number of groups for each tile
-    static int const kGroupsPerTile =
-        Shape::kStrided / InstructionShape::kStrided;
-  };
-
-private:
-
-  /// Not working on this feature at the moment.
-  static_assert(kOpDelta == 1,
-    "Alternative arrangements not supported at present.");
-
-  /// Number of internal pointers needed to reference shared memory
-  static int const kPointerCount =
-      Layout::TileShape::kContiguous / Policy::LdsmShape::kContiguous / Layout::kFactor;
-
-  /// Pointer type used for accesses
-  using AccessType = Array<Element, Layout::kElementsPerAccess>;
-
-  /// Internal counter used to jump to next K partition
-  int k_group_idx_;
-
-public:
-
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
- using Fragment =
-     Array<Element, Shape::kContiguous * InstructionShape::kStrided / kThreads>;
-
-private:
-
-  /// Layout object storing stride values
-  StrideIndex stride_;
-
-  /// Shared memory base pointers - not advanced
-  AccessType const *pointer_[kPointerCount];
-
-  /// Byte offset incremented as iterator advances
-  Index byte_offset_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator(): stride_(0), byte_offset_(0) { }
-
-  /// Constructor from TensorRef
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator(
-    TensorRef const &ref, 
-    int lane_id
-  ):
-    stride_(ref.stride(0) * Layout::kFactor / Layout::kElementsPerAccess),
-    byte_offset_(0),
-    k_group_idx_(0) {
-      
-    int quad_pair = (lane_id >> 3);
-    int quad_quad = (lane_id >> 4);
-    //int lane_in_quad = (lane_id & 3);
-    int lane_in_quad_pair = (lane_id & 7);
-    int lane_in_quad_quad = (lane_id & 15);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kPointerCount; ++i) {
-      int partition_contiguous_idx = -1;
-      int access_contiguous_idx = -1;
-      int access_strided_idx = -1;
-
-      if (Policy::LdsmShape::kContiguous == 4) {
-        // Matrix multiply 1688 A/B
-        // Q0 Q1 Q2 Q3 (Q stands for 1 8x128bit block).
-        // Four blocks are next to each other in the contiguous dimension.
-        partition_contiguous_idx = (lane_id % Layout::kFactor);
-        access_contiguous_idx = quad_pair ^ (lane_in_quad_pair / Layout::kFactor);
-        access_strided_idx = lane_in_quad_pair / Layout::kFactor;
-      } else if (Policy::LdsmShape::kContiguous == 2 &&
-          kOperand == Operand::kA) {
-        // Matrix multiply 16816 A
-        // Q0 Q1
-        // Q2 Q3
-        partition_contiguous_idx = (lane_id % Layout::kFactor);
-        access_contiguous_idx =
-            (((quad_pair & 1) + i * 2) ^ (lane_in_quad_pair / Layout::kFactor));
-        access_strided_idx = (lane_in_quad_pair + (lane_id >> 4 << 3)) / 2;
-      } else if (Policy::LdsmShape::kContiguous == 2 &&
-                 kOperand == Operand::kB) {
-        // Matrix multiply 16816 B
-        // Q0 Q2
-        // Q1 Q3
-        partition_contiguous_idx = (lane_id % Layout::kFactor);
-        access_contiguous_idx = (quad_quad + i * 2) ^ (lane_in_quad_pair / Layout::kFactor);
-        access_strided_idx = (lane_in_quad_quad / Layout::kFactor);
-      } else if (Policy::LdsmShape::kContiguous == 1) {
-        // Matrix multiply 16832.SP B
-        // Q0
-        // Q1
-        // Q2
-        // Q3
-        partition_contiguous_idx = (lane_id % Layout::kFactor);
-        access_contiguous_idx = (lane_in_quad_pair / Layout::kFactor) ^ i;
-        access_strided_idx = lane_id / Layout::kFactor;
-      }
-
-      int access_contiguous =
-          partition_contiguous_idx * Layout::PartitionShape::kContiguous +
-          access_contiguous_idx;
-
-      int access_strided = access_strided_idx;
-
-      pointer_[i] = reinterpret_cast<AccessType const *>(ref.data()) +
-                    access_contiguous + access_strided * stride_;
-    }
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
-
-    byte_offset_ += offset * sizeof(Element);
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
-
-    int contiguous_offset = tile_offset.contiguous();
-    if (Shape::kContiguous ==
-        Layout::PartitionShape::kContiguous * Layout::kElementsPerAccess) {
-      if (tile_offset.contiguous() % 2) {
-        CUTLASS_PRAGMA_UNROLL
-        for (int i = 0; i < kPointerCount / 2; ++i) {
-          AccessType const *tmp_pointer = pointer_[i];
-          pointer_[i] = pointer_[i + kPointerCount / 2];
-          pointer_[i + kPointerCount / 2] = tmp_pointer;
-        }
-      }
-      contiguous_offset = (tile_offset.contiguous() >> 1) << 1;
-    }
-
-    int offset = (tile_offset.strided() * InstructionShape::kStrided) *
-                     stride_ * Layout::kElementsPerAccess / Layout::kFactor +
-                 contiguous_offset * Shape::kContiguous;
-
-    add_pointer_offset(offset);
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator++() {
-
-    add_tile_offset({0, 1});
-
-    if (kPartitionsK > 1) {
-      ++k_group_idx_;
-      // Jump to next stage
-      if (k_group_idx_ == Policy::kGroupsPerTile) {
-        k_group_idx_ = 0;
-        add_tile_offset(
-            {0, ((kPartitionsK - 1) * Policy::kGroupsPerTile)});
-      }
-    }
-
-    return *this;
-  }
-
-  /// Advances the iterator along the opposite of the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator--() {
-    byte_offset_ -= stride_ * InstructionShape::kStrided * sizeof(Element) *
-                    Layout::kElementsPerAccess;
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
-    add_tile_offset(-tile_offset);
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-
-    load_with_byte_offset(frag, 0);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset in units of bytes
-      Index byte_offset) const {
-
-    Array<unsigned, Policy::LdsmShape::kCount> *fetch_ptr = 
-      reinterpret_cast<Array<unsigned, Policy::LdsmShape::kCount> *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < Policy::LdsmIterations::kStrided; ++s) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < Policy::LdsmIterations::kContiguous; ++c) {
-
-        int access_idx = c + s * Policy::LdsmIterations::kContiguous;
-
-        AccessType const *source_ptr =
-            pointer_[c % kPointerCount] +
-            Layout::TileShape::kContiguous * (c / kPointerCount) +
-            Policy::kLdsmOpInner * Policy::LdsmShape::kStrided * s * stride_ / Layout::kFactor;
-
-        char const *source_byte_ptr = reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;
-
-        cutlass::arch::ldsm<layout::ColumnMajor, Policy::LdsmShape::kCount>(
-          fetch_ptr[access_idx],
-          source_byte_ptr
-        );
-      }
-    }
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-    load_with_byte_offset(frag, tile_offset, 0);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index byte_offset) const {
-    Index pointer_offset = 
-      tile_offset.contiguous() * Shape::kContiguous / Layout::kElementsPerAccess + 
-      tile_offset.strided() * InstructionShape::kStrided * stride_ / Layout::kFactor;
-
-    byte_offset += sizeof(AccessType) * pointer_offset;
-
-    load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    // no op
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// This tile iterator is specialized for 32-thread TensorOps with 32B warp tile
-/// the contiguous dimension. This assumes Threadblock contiguous dimension has
-/// the same size as the warp tile.  It uses LDSM to load from shared
-/// memory and therefore must be initialized with a TensorRef to shared memory.
-///
-/// This specialization can be merged into the general one.  Most code is the same.
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: PitchLinearShape)
-    typename Shape_,
-    /// Identifies A or B multiplicand
-    Operand Operand_,
-    /// Data type of elements
-    typename Element_,
-    /// Shape of one matrix product operation (concept: PitchLinearShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions)
-    int OpDelta_,
-    /// Number of partitions along K dimension
-    int PartitionsK_>
-class MmaTensorOpMultiplicandTileIterator<
-    Shape_, Operand_, Element_,
-    cutlass::layout::TensorOpMultiplicandCongruous<16, 16>,
-    InstructionShape_, OpDelta_, 32, PartitionsK_> {
- public:
-
-  /// Shape of tile to load (concept: PitchLinearShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand_;
-
-  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
-    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
-
-  /// Element type
-  using Element = Element_;
-
-  /// Element number when the layout crosses
-  static int const kCrosswise = 16;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::TensorOpMultiplicandCongruous<
-      sizeof_bits<Element_>::value, kCrosswise>;
-
-  /// Shape of one matrix product operation (concept: GemmShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
-  static int const kOpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// Number of partitions along K dimension
-  static int const kPartitionsK = PartitionsK_;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Long Index type
-  using StrideIndex = typename TensorRef::Layout::Stride::Index;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Internal structure of iterator - made public to enable introspection
-  struct Policy {
-    static_assert(
-        !(Shape::kContiguous % InstructionShape::kContiguous),
-        "Shape of warp-level Mma must be divisible by operator shape.");
-
-    // Determine number of elements along outer dimension per individual LDSM op
-    static int const kLdsmOpOuter = Layout::kElementsPerAccess;
-    static int const kLdsmOpInner = 8;
-
-    static_assert(!(Shape::kContiguous % kLdsmOpOuter),
-      "Shape of warp-level mma must be divisible by LDSM's fundamental tile size.");
-
-    static_assert(!(Shape::kStrided % kLdsmOpInner),
-      "Shape of warp-level mma must be divisible by LDSM's fundamental tile size.");
-
-    /// Shape of one individual LDSM instruction
-    static int const LdsmShapeStrided =
-        InstructionShape::kStrided / kLdsmOpInner;
-    static int const LdsmShapeContiguous = 4 / LdsmShapeStrided;
-    using LdsmShape =
-        layout::PitchLinearShape<LdsmShapeContiguous, LdsmShapeStrided>;
-
-    /// Number and arrangement of LDSM instructions
-    using LdsmIterations = layout::PitchLinearShape<
-        Shape::kContiguous / Layout::kElementsPerAccess / LdsmShapeContiguous,
-        1>;
-
-    /// Number of groups for each tile
-    static int const kGroupsPerTile =
-        Shape::kStrided / InstructionShape::kStrided;
-  };
-
-private:
-
-  /// Not working on this feature at the moment.
-  static_assert(kOpDelta == 1,
-    "Alternative arrangements not supported at present.");
-
-  /// Number of internal pointers needed to reference shared memory
-  static int const kPointerCount =
-      Layout::TileShape::kContiguous / Policy::LdsmShape::kContiguous / Layout::kFactor;
-
-  /// Pointer type used for accesses
-  using AccessType = Array<Element, Layout::kElementsPerAccess>;
-
-  /// Internal counter used to jump to next K partition
-  int k_group_idx_;
-
-public:
-
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
- using Fragment =
-     Array<Element, Shape::kContiguous * InstructionShape::kStrided / kThreads>;
-
-private:
-
-  /// Layout object storing stride values
-  StrideIndex stride_;
-
-  /// Shared memory base pointers - not advanced
-  AccessType const *pointer_[kPointerCount];
-
-  /// Byte offset incremented as iterator advances
-  Index byte_offset_;
-
-public:
-
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator(): stride_(0), byte_offset_(0) { }
-
-  /// Constructor from TensorRef
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator(
-    TensorRef const &ref,
-    int lane_id
-  ):
-    stride_(ref.stride(0) * Layout::kFactor / Layout::kElementsPerAccess),
-    byte_offset_(0),
-    k_group_idx_(0) {
-
-    //int quad_pair = (lane_id >> 3);
-    int quad_quad = (lane_id >> 4);
-    int lane_in_pair = (lane_id & 1);
-    int lane_in_quad = (lane_id & 3);
-    int lane_in_quad_pair = (lane_id & 7);
-    int lane_in_quad_quad = (lane_id & 15);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kPointerCount; ++i) {
-      int partition_contiguous_idx = -1;
-      int access_contiguous_idx = -1;
-      int access_strided_idx = -1;
-
-      if (Policy::LdsmShape::kContiguous == 2 &&
-          kOperand == Operand::kA) {
-        // Matrix multiply 16816 A
-        // Q0 Q1
-        // Q2 Q3
-        partition_contiguous_idx = lane_in_quad / 2;
-        access_strided_idx = lane_in_quad_pair / Layout::kFactor + quad_quad * 2;
-        access_contiguous_idx =
-            ((lane_in_pair * 2 + ((lane_id & 8) >> 3)) ^
-             access_strided_idx);
-      } else if (Policy::LdsmShape::kContiguous == 2 &&
-                 kOperand == Operand::kB) {
-        // Matrix multiply 16816 B
-        // Q0 Q2
-        // Q1 Q3
-        partition_contiguous_idx = lane_in_quad / 2;
-        access_strided_idx = lane_in_quad_quad / Layout::kFactor;
-        access_contiguous_idx =
-            ((lane_in_pair * 2 + quad_quad) ^
-             access_strided_idx);
-      } else if (Policy::LdsmShape::kContiguous == 1) {
-        // Matrix multiply 16832.SP B
-        // Q0
-        // Q1
-        // Q2
-        // Q3
-        int factor_in_partition =
-            (Layout::PartitionShape::kContiguous * Layout::kFactor /
-             Layout::TileShape::kContiguous);
-
-        partition_contiguous_idx = lane_in_quad / factor_in_partition;
-        access_contiguous_idx = ((lane_in_pair * factor_in_partition) ^
-                                 (lane_in_quad_quad / Layout::kFactor) ^ i);
-        access_strided_idx = lane_id / Layout::kFactor;
-      } 
-
-      int access_contiguous =
-          partition_contiguous_idx * Layout::PartitionShape::kContiguous +
-          access_contiguous_idx;
-
-      int access_strided = access_strided_idx;
-
-      pointer_[i] = reinterpret_cast<AccessType const *>(ref.data()) +
-                    access_contiguous + access_strided * stride_;
-    }
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
-
-    byte_offset_ += offset * sizeof(Element);
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
-
-    int contiguous_offset = tile_offset.contiguous();
-    if (Shape::kContiguous ==
-        Layout::PartitionShape::kContiguous * Layout::kElementsPerAccess) {
-      if (tile_offset.contiguous() % 2) {
-        CUTLASS_PRAGMA_UNROLL
-        for (int i = 0; i < kPointerCount / 2; ++i) {
-          AccessType const *tmp_pointer = pointer_[i];
-          pointer_[i] = pointer_[i + kPointerCount / 2];
-          pointer_[i + kPointerCount / 2] = tmp_pointer;
-        }
-      }
-      contiguous_offset = (tile_offset.contiguous() >> 1) << 1;
-    }
-
-    int offset = (tile_offset.strided() * InstructionShape::kStrided) *
-                     stride_ * Layout::kElementsPerAccess / Layout::kFactor +
-                 contiguous_offset * Shape::kContiguous;
-
-    add_pointer_offset(offset);
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator++() {
-
-    add_tile_offset({0, 1});
-
-    if (kPartitionsK > 1) {
-      ++k_group_idx_;
-      // Jump to next stage
-      if (k_group_idx_ == Policy::kGroupsPerTile) {
-        k_group_idx_ = 0;
-        add_tile_offset(
-            {0, ((kPartitionsK - 1) * Policy::kGroupsPerTile)});
-      }
-    }
-
-    return *this;
-  }
-
-  /// Advances the iterator along the opposite of the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator--() {
-    byte_offset_ -= stride_ * InstructionShape::kStrided * sizeof(Element) *
-                    Layout::kElementsPerAccess;
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
-    add_tile_offset(-tile_offset);
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-
-    load_with_byte_offset(frag, 0);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset in units of bytes
-      Index byte_offset) const {
-
-    Array<unsigned, Policy::LdsmShape::kCount> *fetch_ptr =
-      reinterpret_cast<Array<unsigned, Policy::LdsmShape::kCount> *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < Policy::LdsmIterations::kStrided; ++s) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < Policy::LdsmIterations::kContiguous; ++c) {
-
-        int access_idx = c + s * Policy::LdsmIterations::kContiguous;
-
-        AccessType const *source_ptr =
-            pointer_[c % kPointerCount] +
-            Layout::TileShape::kContiguous * (c / kPointerCount) +
-            Policy::kLdsmOpInner * Policy::LdsmShape::kStrided * s * stride_ / Layout::kFactor;
-
-        char const *source_byte_ptr = reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;
-
-        cutlass::arch::ldsm<layout::ColumnMajor, Policy::LdsmShape::kCount>(
-          fetch_ptr[access_idx],
-          source_byte_ptr
-        );
-      }
-    }
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-    load_with_byte_offset(frag, tile_offset, 0);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index byte_offset) const {
-    Index pointer_offset =
-      tile_offset.contiguous() * Shape::kContiguous / Layout::kElementsPerAccess +
-      tile_offset.strided() * InstructionShape::kStrided * stride_ / Layout::kFactor;
-
-    byte_offset += sizeof(AccessType) * pointer_offset;
-
-    load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    // no op
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// This tile iterator is specialized for 32-thread TensorOps. It uses LDSM to load from shared
-/// memory and therefore must be initialized with a TensorRef to shared memory. 
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Identifies A or B multiplicand
-    Operand Operand_,
-    /// Data type of elements
-    typename Element_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions)
-    int OpDelta_,
-    /// Element number when the layout crosses (in units of elements)
-    int Crosswise,
-    /// Number of partitions along K dimension
-    int PartitionsK_>
-class MmaTensorOpMultiplicandTileIterator<
-    Shape_, Operand_, Element_,
-    cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous<
-        sizeof_bits<Element_>::value, Crosswise>,
-    InstructionShape_, OpDelta_, 32, PartitionsK_> {
- public:
-
-  /// Shape of tile to load (concept: PitchLinearShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand_;
-
-  static_assert(kOperand == Operand::kA,
-                "MmaTensorOpMultiplicandIterator for ColumnMajor Congruous may "
-                "only be instantiated for A operand to warp-level Mma.");
-
-  /// Element type
-  using Element = Element_;
-
-  /// MBlock or NBlock size
-  static int const kCrosswise = Crosswise;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous<
-      sizeof_bits<Element_>::value, kCrosswise>;
-
-  /// Shape of one matrix product operation (concept: MatrixShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
-  static int const kOpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Long Index type
-  using StrideIndex = typename TensorRef::Layout::Stride::Index;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Underlying tile iterator implementation
-  using Base = MmaTensorOpMultiplicandTileIterator<
-      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, kOperand, Element,
-      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
-                                            kCrosswise>,
-      layout::PitchLinearShape<InstructionShape::kRow,
-                               InstructionShape::kColumn>,
-      kOpDelta, kThreads, PartitionsK_>;
-
- public:
-
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = typename Base::Fragment;
-
-private:
-
-  /// Underlying tile iterator
-  Base iterator_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator() { }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator(
-    TensorRef const &ref, 
-    int lane_id
-  ): iterator_({ref.data(), ref.stride()}, lane_id) {
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
-
-    iterator_.add_pointer_offset(offset);
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
-
-    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator++() {
-
-    ++iterator_;
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator--() {
-
-    --iterator_;
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(PitchLinearCoord(tile_offset.row(), tile_offset.column()));
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
-    add_tile_offset(-PitchLinearCoord(tile_offset.row(), tile_offset.column()));
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-
-    iterator_.load(frag);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index byte_offset) const {
-    iterator_.load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index byte_offset) const {
-    iterator_.load_with_byte_offset(
-      frag,
-      {tile_offset.contiguous(), tile_offset.strided()},
-      byte_offset);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    iterator_.set_kgroup_index(k_group); 
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// This tile iterator is specialized for 32-thread TensorOps. It uses LDSM to load from shared
-/// memory and therefore must be initialized with a TensorRef to shared memory. 
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Identifies A or B multiplicand
-    Operand Operand_,
-    /// Data type of elements
-    typename Element_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions)
-    int OpDelta_,
-    /// Element number when the layout crosses (in units of elements)
-    int Crosswise,
-    /// Number of partitions along K dimension
-    int PartitionsK_>
-class MmaTensorOpMultiplicandTileIterator<
-    Shape_, Operand_, Element_,
-    cutlass::layout::RowMajorTensorOpMultiplicandCongruous<
-        sizeof_bits<Element_>::value, Crosswise>,
-    InstructionShape_, OpDelta_, 32, PartitionsK_> {
- public:
-
-  /// Shape of tile to load (concept: PitchLinearShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand_;
-
-  static_assert(kOperand == Operand::kB,
-                "MmaTensorOpMultiplicandIterator for RowMajor Congruous may "
-                "only be instantiated for B operand to warp-level Mma.");
-
-  /// Element type
-  using Element = Element_;
-
-  /// Element number when the layout crosses
-  static int const kCrosswise = Crosswise;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::RowMajorTensorOpMultiplicandCongruous<
-      sizeof_bits<Element_>::value, kCrosswise>;
-
-  /// Shape of one matrix product operation (concept: MatrixShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
-  static int const kOpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Underlying tile iterator implementation
-  using Base = MmaTensorOpMultiplicandTileIterator<
-      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, kOperand, Element,
-      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
-                                            kCrosswise>,
-      layout::PitchLinearShape<InstructionShape::kColumn,
-                               InstructionShape::kRow>,
-      kOpDelta, kThreads, PartitionsK_>;
-
- public:
-
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = typename Base::Fragment;
-
-private:
-
-  /// Underlying tile iterator
-  Base iterator_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator() { }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator(
-    TensorRef const &ref, 
-    int lane_id
-  ): iterator_({ref.data(), ref.stride()}, lane_id) {
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
-
-    iterator_.add_pointer_offset(offset);
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
-
-    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator++() {
-
-    ++iterator_;
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator--() {
-
-    --iterator_;
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(PitchLinearCoord(tile_offset.column(), tile_offset.row()));
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
-    add_tile_offset(-PitchLinearCoord(tile_offset.column(), tile_offset.row()));
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-
-    iterator_.load(frag);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index byte_offset) const {
-    iterator_.load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index byte_offset) const {
-    iterator_.load_with_byte_offset(
-      frag,
-      {tile_offset.strided(), tile_offset.contiguous()},
-      byte_offset);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    iterator_.set_kgroup_index(k_group); 
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// This tile iterator is specialized for 32-thread TensorOps. It uses LDSM to
-/// load from shared memory and therefore must be initialized with a TensorRef
-/// to shared memory.
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: PitchLinearShape)
-    typename Shape_,
-    /// Identifies A or B multiplicand
-    Operand Operand_,
-    /// Data type of elements
-    typename Element_,
-    /// Shape of one matrix product operation (concept: PitchLinearShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions)
-    int OpDelta_,
-    /// Element number when the layout crosses (in units of elements)
-    int Crosswise,
-    /// Number of partitions along K dimension
-    int PartitionsK_>
-class MmaTensorOpMultiplicandTileIterator<
-    Shape_, Operand_, Element_,
-    cutlass::layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
-                                                   Crosswise>,
-    InstructionShape_, OpDelta_, 32, PartitionsK_> {
- public:
-  /// Shape of tile to load (concept: PitchLinearShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand_;
-
-  static_assert(kOperand == Operand::kA || kOperand == Operand::kB,
-                "MmaTensorOpMultiplicandIterator may only be instantiated for "
-                "A or B operands to warp-level Mma.");
-
-  /// Element type
-  using Element = Element_;
-
-  /// Element number when the layout crosses
-  static int const kCrosswise = Crosswise;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::TensorOpMultiplicandCrosswise<
-      sizeof_bits<Element_>::value, kCrosswise>;
-
-  /// Shape of one matrix product operation (concept: GemmShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept:
-  /// MatrixShape)
-  static int const kOpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// Number of partitions along K dimension
-  static int const kPartitionsK = PartitionsK_;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Long Index type
-  using StrideIndex = typename TensorRef::Layout::Stride::Index;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Internal structure of iterator - made public to enable introspection
-  struct Policy {
-    static_assert(
-        !(Shape::kContiguous % InstructionShape::kContiguous),
-        "Shape of warp-level Mma must be divisible by operator shape.");
-
-    // Determine number of elements along outer dimension per individual LDSM op
-    static int const kLdsmOpOuter = Layout::kElementsPerAccess;
-    static int const kLdsmOpInner = 8;
-
-    static_assert(!(Shape::kContiguous % kLdsmOpOuter),
-                  "Shape of warp-level mma must be divisible by LDSM's "
-                  "fundamental tile size.");
-
-    static_assert(!(Shape::kStrided % kLdsmOpInner),
-                  "Shape of warp-level mma must be divisible by LDSM's "
-                  "fundamental tile size.");
-
-    /// Shape of one individual LDSM instruction
-    static int const LdsmShapeContiguous =
-        InstructionShape::kContiguous / kLdsmOpOuter;
-    static int const LdsmShapeStrided =
-        ((4 / LdsmShapeContiguous * kLdsmOpInner) > Shape::kStrided)
-            ? (Shape::kStrided / kLdsmOpInner)
-            : (4 / LdsmShapeContiguous);
-    using LdsmShape =
-        layout::PitchLinearShape<LdsmShapeContiguous, LdsmShapeStrided>;
-
-    /// Number and arrangement of LDSM instructions
-    using LdsmIterations =
-        layout::PitchLinearShape<1, Shape::kStrided / kLdsmOpInner /
-                                        LdsmShape::kStrided>;
-
-    ///
-    static int const kGroupsPerTile = Layout::TileShape::kContiguous /
-                                      Layout::kFactor / LdsmShape::kContiguous;
-  };
-
- private:
-  /// Not working on this feature at the moment.
-  static_assert(kOpDelta == 1,
-                "Alternative arrangements not supported at present.");
-
-  /// Pointer type used for accesses
-  using AccessType = Array<Element, Layout::kElementsPerAccess>;
-
- public:
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = Array<Element, Shape::kStrided *
-                                      InstructionShape::kContiguous / kThreads>;
-
- private:
-
-  /// Total number of sections.  The memory is divided into stages.  One stage
-  /// can store one tile.  Stage is divided into sections.  Interleaved layout
-  /// can have multiple sections in a stage.  The rest layout only has one section
-  /// in a stage.
-  int sections_;
-
-  /// Layout object storing stride values
-  StrideIndex stride_;
-
-  /// Shared memory base pointers - not advanced
-  AccessType const *pointer_;
-
-  /// Byte offset incremented as iterator advances
-  Index byte_offset_;
-
-  /// Internal counter used to determine when to increment byte offset and when
-  /// to XOR it
-  int k_group_idx_;
-
- public:
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator()
-      : pointer_(nullptr),
-        sections_(0),
-        stride_(0),
-        byte_offset_(0),
-        k_group_idx_(0) {}
-
-  /// Constructor from TensorRef
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
-      : pointer_(reinterpret_cast<AccessType const *>(ref.data())),
-        sections_(ref.stride(0) / kCrosswise),
-        // stride_ = kCrosswise x sections_ x kFactor
-        stride_(ref.stride(0) * Layout::kFactor / Layout::kElementsPerAccess),
-        byte_offset_(0),
-        k_group_idx_(0) {
-    // Warp level iterator at most use double buffer to hide latency.  If there
-    // are more than 2 sections, every stage should have more than 1 section.
-
-    // Turing silicon requires all 32 threads in a warp provide valid addresses
-    // even for LDSM.1 and LDSM.2
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 750))
-    lane_id = lane_id % (Policy::LdsmShape::kCount * Policy::kLdsmOpInner);
-#endif
-
-    int quad_quad = (lane_id >> 4);
-    int quad_pair = (lane_id >> 3);
-    int lane_in_pair = (lane_id & 1);
-    int lane_in_quad = (lane_id & 3);
-    int lane_in_quad_pair = (lane_id & 7);
-    int lane_in_quad_quad = (lane_id & 15);
-
-    int partition_contiguous_idx = -1;
-    int access_contiguous_idx = -1;
-    int access_strided_idx = -1;
-
-    if (Layout::kFactor == 8) {
-      int factor_in_partition =
-          (Layout::PartitionShape::kContiguous * Layout::kFactor /
-           Layout::TileShape::kContiguous);
-
-      if (Policy::LdsmShape::kStrided == Policy::LdsmShape::kCount) {
-        partition_contiguous_idx = lane_in_quad_pair / factor_in_partition;
-        access_contiguous_idx = ((lane_in_quad) ^ (lane_id / Layout::kFactor));
-        access_strided_idx = lane_id / Layout::kFactor;
-      }
-    } else if (Layout::kFactor == 4) {
-      // Super Integer matrix multiply Interleaved-32
-
-      int factor_in_partition =
-          (Layout::PartitionShape::kContiguous * Layout::kFactor /
-           Layout::TileShape::kContiguous);
-
-      if (Policy::LdsmShape::kStrided == Policy::LdsmShape::kCount) {
-        // Integer matrix multiply 8816  A/B
-        partition_contiguous_idx = lane_in_quad / factor_in_partition;
-        access_contiguous_idx = ((lane_in_pair * factor_in_partition) ^
-                                 (lane_in_quad_quad / Layout::kFactor));
-        access_strided_idx = lane_id / Layout::kFactor;
-      }
-      else if (Policy::LdsmShape::kStrided ==
-                     (Policy::LdsmShape::kCount / 2) &&
-                 kOperand == Operand::kA) {
-        // Integer matrix multiply 16832 A
-        partition_contiguous_idx = lane_in_quad / factor_in_partition;
-        access_strided_idx = lane_in_quad_quad / Layout::kFactor;
-        access_contiguous_idx =
-            ((lane_in_pair * factor_in_partition + quad_quad) ^
-             access_strided_idx);
-      }
-      else if (Policy::LdsmShape::kStrided ==
-                     (Policy::LdsmShape::kCount / 2) &&
-                 kOperand == Operand::kB) {
-        // Integer matrix multiply 16832 B
-        partition_contiguous_idx = lane_in_quad / factor_in_partition;
-        access_strided_idx = lane_in_quad_pair / Layout::kFactor + quad_quad * 2;
-        access_contiguous_idx =
-            ((lane_in_pair * factor_in_partition + ((lane_id & 8) >> 3)) ^
-             access_strided_idx);
-      }
-    } else if (Layout::kFactor == 2) {
-      // Super Matrix multiply kBlock = 32
-      if (Policy::LdsmShape::kStrided == Policy::LdsmShape::kCount) {
-        // Matrix multiply 1688 A/B
-        // (Q stands for 1 8x128bit block).
-        // Q0
-        // Q1
-        // Q2
-        // Q3
-        // Four blocks are next to each other in the strided dimension.
-        partition_contiguous_idx = (lane_id % Layout::kFactor);
-        access_contiguous_idx = (lane_in_quad_pair / Layout::kFactor);
-        access_strided_idx = lane_id / Layout::kFactor;
-      } else if (Policy::LdsmShape::kStrided ==
-                     (Policy::LdsmShape::kCount / 2) &&
-                 kOperand == Operand::kA) {
-        // Matrix multiply 16816|1688.TF32 A
-        // Q0 Q2
-        // Q1 Q3
-        partition_contiguous_idx = (lane_id % Layout::kFactor);
-        access_contiguous_idx =
-            (quad_quad ^ (lane_in_quad_pair / Layout::kFactor));
-        access_strided_idx = (lane_in_quad_quad / Layout::kFactor);
-      } else if (Policy::LdsmShape::kStrided ==
-                     (Policy::LdsmShape::kCount / 2) &&
-                 kOperand == Operand::kB) {
-        // Matrix multiply 16816|1688.TF32 B
-        // Q0 Q1
-        // Q2 Q3
-        partition_contiguous_idx = (lane_id % Layout::kFactor);
-        access_contiguous_idx =
-            ((quad_pair & 1) ^ (lane_in_quad_pair / Layout::kFactor));
-        access_strided_idx =
-            (lane_in_quad_pair + (lane_id >> 4 << 3)) / Layout::kFactor;
-      } 
-      else if (Policy::LdsmShape::kContiguous == Policy::LdsmShape::kCount) {
-        // Matrix multiply 16832.SP B
-        // Q0 Q1 Q2 Q3
-        partition_contiguous_idx = (lane_id % Layout::kFactor);
-        access_contiguous_idx =
-            (quad_pair ^ (lane_in_quad_pair / Layout::kFactor));
-        access_strided_idx = lane_in_quad_pair / Layout::kFactor;
-      }
-    } else if (Layout::kFactor == 1) {
-      // Super Matrix multiply kBlock = 64
-      if (Policy::LdsmShape::kStrided == Policy::LdsmShape::kCount) {
-        // Q0
-        // Q1
-        // Q2
-        // Q3
-        partition_contiguous_idx = (lane_in_quad_pair >> 2);
-        access_contiguous_idx = lane_in_quad;
-        access_strided_idx = lane_id;
-      }
-      else if (Policy::LdsmShape::kStrided ==
-                     (Policy::LdsmShape::kCount / 2) &&
-                 kOperand == Operand::kA) {
-        // Matrix multiply 16816|1688.TF32 A
-        // Q0 Q2
-        // Q1 Q3
-        partition_contiguous_idx = (lane_in_quad_pair >> 2);
-        access_contiguous_idx = (quad_quad ^ lane_in_quad);
-        access_strided_idx = lane_in_quad_quad;
-      } else if (Policy::LdsmShape::kStrided ==
-                     (Policy::LdsmShape::kCount / 2) &&
-                 kOperand == Operand::kB) {
-        // Matrix multiply 16816|1688.TF32 B
-        // Q0 Q1
-        // Q2 Q3
-        partition_contiguous_idx = (lane_in_quad_pair >> 2);
-        access_contiguous_idx = ((quad_pair & 1) ^ lane_in_quad);
-        access_strided_idx = lane_in_quad_pair + (lane_id >> 4 << 3);
-      } 
-      else if (Policy::LdsmShape::kContiguous == Policy::LdsmShape::kCount) {
-        // Matrix multiply 16832.SP B
-        // Q0 Q1 Q2 Q3
-        partition_contiguous_idx = (lane_in_quad_pair >> 2);
-        access_contiguous_idx = (quad_pair ^ lane_in_quad);
-        access_strided_idx = lane_in_quad_pair;
-      }
-    }
-
-    int access_contiguous =
-        partition_contiguous_idx * Layout::PartitionShape::kContiguous +
-        access_contiguous_idx;
-
-    int access_strided = access_strided_idx;
-
-    byte_offset_ = (access_contiguous + access_strided * stride_) *
-                   sizeof_bits<Element>::value * Layout::kElementsPerAccess / 8;
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
-    byte_offset_ += offset * sizeof_bits<Element>::value / 8;
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole
-  /// tiles
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_tile_offset(
-      TensorCoord const &tile_offset) {
-    int whole_tiles = tile_offset.contiguous() / Policy::kGroupsPerTile;
-    int k_groups_delta = tile_offset.contiguous() % Policy::kGroupsPerTile;
-
-    byte_offset_ ^= k_groups_delta * sizeof_bits<Element>::value *
-                    Layout::kElementsPerAccess *
-                    Policy::LdsmShape::kContiguous / 8;
-    pointer_ +=
-        tile_offset.strided() * stride_ * Shape::kStrided / Layout::kFactor +
-        whole_tiles * stride_ / sections_;
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole
-  /// tiles
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_tile_offset_negative(
-      TensorCoord const &tile_offset) {
-
-    int whole_tiles = tile_offset.contiguous() / Policy::kGroupsPerTile;
-    int k_groups_delta = tile_offset.contiguous() % Policy::kGroupsPerTile;
-    if (k_groups_delta < 0) {
-        whole_tiles -= 1;
-        k_groups_delta += Policy::kGroupsPerTile;
-    }
-
-    if ((Policy::kGroupsPerTile / kPartitionsK) >= 2) {
-      byte_offset_ ^= (k_groups_delta & 1) * Policy::LdsmShape::kContiguous *
-                        sizeof_bits<Element>::value *
-                        Layout::kElementsPerAccess / 8;
-    }
-    if ((Policy::kGroupsPerTile / kPartitionsK) >= 4) {
-      byte_offset_ ^= ((k_groups_delta + (k_group_idx_ & 1)) & 2) * 
-                        Policy::LdsmShape::kContiguous *
-                        sizeof_bits<Element>::value *
-                        Layout::kElementsPerAccess / 8;
-    }
-    if ((Policy::kGroupsPerTile / kPartitionsK) == 8) {
-      byte_offset_ ^= ((k_groups_delta + (k_group_idx_ & 3)) & 4) * 
-                        Policy::LdsmShape::kContiguous *
-                        sizeof_bits<Element>::value *
-                        Layout::kElementsPerAccess / 8;
-    }
-
-    k_group_idx_ += k_groups_delta;
-    whole_tiles += k_group_idx_ / (Policy::kGroupsPerTile / kPartitionsK);
-    k_group_idx_ = k_group_idx_ % (Policy::kGroupsPerTile / kPartitionsK);
-
-    pointer_ +=
-        tile_offset.strided() * stride_ * Shape::kStrided / Layout::kFactor +
-        whole_tiles * stride_ / sections_;
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator &operator++() {
-
-    // Integer matrix multiply 16832 Interleaved-32
-    //   NONE
-    // Integer matrix multiply 16816 Interleaved-32 || Integer matrix multiply 16816 kblock=32
-
-    // Integer matrix multiply 8816  Interleaved-32
-    //   ^1 ^1
-    // Matrix multiply 1684.TF32 kblock=16 || Integer matrix multiply 16816 kblock=64
-    // Matrix multiply 1688 kblock=32 || Integer matrix multiply 8816 kblock=64
-    //   ^1 ^3 ^1 ^3
-    // Matrix multiply 1688 kblock=64
-    //   ^1 ^3 ^1 ^7 ^1 ^3 ^1 ^7
-
-    // Matrix multiply 16816 kblock=32 | 1688.TF32 kblock=16 || Integer matrix multiply 16832 kblock=64
-    //   ^2 ^2
-    // Matrix multiply 16816 kblock=64 | 1688.TF32 kblock=32 || Integer matrix multiply 16832 kblock=128
-    //   ^2 ^6 ^2 ^6
-
-    if ((Policy::kGroupsPerTile / kPartitionsK) > 1) {
-      int mask = ((Policy::kGroupsPerTile / kPartitionsK) == 8)
-                     ? 3
-                     : (((Policy::kGroupsPerTile / kPartitionsK) == 4) ? 1 : 0);
-
-      if (((k_group_idx_ & mask) % 2) == 0)
-        byte_offset_ ^= 1 * Policy::LdsmShape::kContiguous *
-                        sizeof_bits<Element>::value *
-                        Layout::kElementsPerAccess / 8;
-      else if ((k_group_idx_ & mask) == 1)
-        byte_offset_ ^= 3 * Policy::LdsmShape::kContiguous *
-                        sizeof_bits<Element>::value *
-                        Layout::kElementsPerAccess / 8;
-      else if ((k_group_idx_ & mask) == 3)
-        byte_offset_ ^= 7 * Policy::LdsmShape::kContiguous *
-                        sizeof_bits<Element>::value *
-                        Layout::kElementsPerAccess / 8;
-    }
-
-    k_group_idx_++;
-
-    if (k_group_idx_ == (Policy::kGroupsPerTile / kPartitionsK)) {
-      k_group_idx_ = 0;
-      add_tile_offset({Policy::kGroupsPerTile, 0});
-    }
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &operator--() { assert(0); }
-
-  ///< advances in units of whole tiles along the logical coordinate space of
-  ///< the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator &operator+=(
-      TensorCoord const &tile_offset) {
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of
-  ///< the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator &operator-=(
-      TensorCoord const &tile_offset) {
-    add_tile_offset(-tile_offset);
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const { load_with_byte_offset(frag, 0); }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset in units of bytes
-      Index byte_offset) const {
-    Array<unsigned, Policy::LdsmShape::kCount> *fetch_ptr =
-        reinterpret_cast<Array<unsigned, Policy::LdsmShape::kCount> *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < Policy::LdsmIterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < Policy::LdsmIterations::kContiguous; ++c) {
-        int access_idx = c + s * Policy::LdsmIterations::kContiguous;
-
-        AccessType const *source_ptr =
-            pointer_ + Policy::LdsmShape::kContiguous * c +
-            Policy::kLdsmOpInner / Layout::kFactor *
-                Policy::LdsmShape::kStrided * s * stride_;
-
-        char const *source_byte_ptr =
-            reinterpret_cast<char const *>(source_ptr) + byte_offset +
-            byte_offset_;
-
-        cutlass::arch::ldsm<layout::RowMajor, Policy::LdsmShape::kCount>(
-            fetch_ptr[access_idx], source_byte_ptr);
-      }
-    }
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-    load_with_byte_offset(frag, tile_offset, 0);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index byte_offset) const {
-    Index pointer_offset = tile_offset.contiguous() *
-                               InstructionShape::kContiguous /
-                               Layout::kElementsPerAccess +
-                           tile_offset.strided() * Shape::kStrided * stride_;
-
-    byte_offset += sizeof_bits<AccessType>::value * pointer_offset / 8;
-
-    load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    k_group_idx_ = k_group % (Policy::kGroupsPerTile / kPartitionsK);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// This tile iterator is specialized for 32-thread TensorOps. It uses LDSM to
-/// load from shared memory and therefore must be initialized with a TensorRef
-/// to shared memory.
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Identifies A or B multiplicand
-    Operand Operand_,
-    /// Data type of elements
-    typename Element_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions)
-    int OpDelta_,
-    /// Element number when the layout crosses (in units of elements)
-    int Crosswise,
-    /// Number of partitions along K dimension
-    int PartitionsK_>
-class MmaTensorOpMultiplicandTileIterator<
-    Shape_, Operand_, Element_,
-    cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise<
-        sizeof_bits<Element_>::value, Crosswise>,
-    InstructionShape_, OpDelta_, 32, PartitionsK_> {
- public:
-  /// Shape of tile to load (concept: PitchLinearShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand_;
-
-  static_assert(kOperand == Operand::kB,
-                "MmaTensorOpMultiplicandIterator for ColumnMajor Crosswise may "
-                "only be instantiated for B operand to warp-level Mma.");
-
-  /// Element type
-  using Element = Element_;
-
-  /// KBlock size
-  static int const kCrosswise = Crosswise;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise<
-      sizeof_bits<Element_>::value, kCrosswise>;
-
-  /// Shape of one matrix product operation (concept: MatrixShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept:
-  /// MatrixShape)
-  static int const kOpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Underlying tile iterator implementation
-  using Base = MmaTensorOpMultiplicandTileIterator<
-      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, kOperand, Element,
-      layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
-                                            kCrosswise>,
-      layout::PitchLinearShape<InstructionShape::kRow,
-                               InstructionShape::kColumn>,
-      kOpDelta, kThreads, PartitionsK_>;
-
- public:
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = typename Base::Fragment;
-
- private:
-  /// Underlying tile iterator
-  Base iterator_;
-
- public:
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator() {}
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
-      : iterator_({ref.data(), ref.stride()}, lane_id) {}
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
-    iterator_.add_pointer_offset(offset);
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole
-  /// tiles
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_tile_offset(
-      TensorCoord const &tile_offset) {
-    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole
-  /// tiles
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_tile_offset_negative(
-      TensorCoord const &tile_offset) {
-    iterator_.add_tile_offset_negative({tile_offset.row(), tile_offset.column()});
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &operator++() {
-    ++iterator_;
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &operator--() {
-    --iterator_;
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of
-  ///< the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator &operator+=(
-      TensorCoord const &tile_offset) {
-    add_tile_offset(PitchLinearCoord(tile_offset.row(), tile_offset.column()));
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of
-  ///< the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator &operator-=(
-      TensorCoord const &tile_offset) {
-    add_tile_offset(-PitchLinearCoord(tile_offset.row(), tile_offset.column()));
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const { iterator_.load(frag); }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index byte_offset) const {
-    iterator_.load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-    assert(0);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-    assert(0);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index byte_offset) const {
-    iterator_.load_with_byte_offset(
-        frag, {tile_offset.contiguous(), tile_offset.strided()}, byte_offset);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    iterator_.set_kgroup_index(k_group); 
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// This tile iterator is specialized for 32-thread TensorOps. It uses LDSM to
-/// load from shared memory and therefore must be initialized with a TensorRef
-/// to shared memory.
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Identifies A or B multiplicand
-    Operand Operand_,
-    /// Data type of elements
-    typename Element_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions)
-    int OpDelta_,
-    /// Element number when the layout crosses (in units of elements)
-    int Crosswise,
-    /// Number of partitions along K dimension
-    int PartitionsK_>
-class MmaTensorOpMultiplicandTileIterator<
-    Shape_, Operand_, Element_,
-    cutlass::layout::RowMajorTensorOpMultiplicandCrosswise<
-        sizeof_bits<Element_>::value, Crosswise>,
-    InstructionShape_, OpDelta_, 32, PartitionsK_> {
- public:
-  /// Shape of tile to load (concept: PitchLinearShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand_;
-
-  static_assert(kOperand == Operand::kA,
-                "MmaTensorOpMultiplicandIterator for RowMajor Crosswise may "
-                "only be instantiated for A operand to warp-level Mma.");
-
-  /// Element type
-  using Element = Element_;
-
-  /// Element number when the layout crosses
-  static int const kCrosswise = Crosswise;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise<
-      sizeof_bits<Element_>::value, kCrosswise>;
-
-  /// Shape of one matrix product operation (concept: MatrixShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept:
-  /// MatrixShape)
-  static int const kOpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Underlying tile iterator implementation
-  using Base = MmaTensorOpMultiplicandTileIterator<
-      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, kOperand, Element,
-      layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
-                                            kCrosswise>,
-      layout::PitchLinearShape<InstructionShape::kColumn,
-                               InstructionShape::kRow>,
-      kOpDelta, kThreads, PartitionsK_>;
-
- public:
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = typename Base::Fragment;
-
- private:
-  /// Underlying tile iterator
-  Base iterator_;
-
- public:
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator() {}
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
-      : iterator_({ref.data(), ref.stride()}, lane_id) {}
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
-    iterator_.add_pointer_offset(offset);
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole
-  /// tiles
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_tile_offset(
-      TensorCoord const &tile_offset) {
-    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole
-  /// tiles
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_tile_offset_negative(
-      TensorCoord const &tile_offset) {
-    iterator_.add_tile_offset_negative({tile_offset.column(), tile_offset.row()});
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &operator++() {
-    ++iterator_;
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &operator--() {
-    --iterator_;
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of
-  ///< the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator &operator+=(
-      TensorCoord const &tile_offset) {
-    add_tile_offset(PitchLinearCoord(tile_offset.column(), tile_offset.row()));
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of
-  ///< the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator &operator-=(
-      TensorCoord const &tile_offset) {
-    add_tile_offset(-PitchLinearCoord(tile_offset.column(), tile_offset.row()));
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const { iterator_.load(frag); }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index byte_offset) const {
-    iterator_.load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-    assert(0);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-    assert(0);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index byte_offset) const {
-    iterator_.load_with_byte_offset(
-        frag, {tile_offset.strided(), tile_offset.contiguous()}, byte_offset);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    iterator_.set_kgroup_index(k_group); 
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Element type
-    typename Element_,
-    /// Layout of operand in memory
-    typename Layout_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions, concept: MatrixShape)
-    typename OpDelta_>
-class MmaTensorOpAccumulatorTileIterator;
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// This tile iterator is specialized for 32-thread TensorOps. It is used to load or store
-/// accumulators from memory and is agnostic to layout. It could be faster if it assumed row-major
-/// accumulator layout.
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept |
-///   WriteableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Element type
-    typename Element_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions, concept: MatrixShape)
-    typename OpDelta_>
-class MmaTensorOpAccumulatorTileIterator<
-    Shape_, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_> {
- public:
-
-  /// Shape of tile to load (concept: MatrixShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand::kC;
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::RowMajor;
-
-  /// Shape of one matrix product operation (concept: MatrixShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
-  using OpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Internal structure of iterator - made public to enable introspection
-  struct Policy {
-    static bool const kDivisible =
-        !(Shape::kRow % InstructionShape::kM) &&
-            !(Shape::kColumn % InstructionShape::kN);
-
-    static_assert(platform::is_same<TensorCoord, MatrixCoord>::value,
-      "Layouts must be defined for logical MatrixCoord coordinate space.");
-
-    /// Number of mma operations performed
-    using MmaIterations = MatrixShape<
-      (Shape::kRow + InstructionShape::kM - 1) / InstructionShape::kM,
-      (Shape::kColumn + InstructionShape::kN - 1) / InstructionShape::kN
-    >;
-  };
-
-private:
-
-  // Assume accumulator tile is an arrangement of 8-by-8 tiles replicated over the entire
-  // shape, with each quad mapped to one row and each thread mapped to 1/4 of the elements
-  // of that row. The accumulators within one row are assumed to be consecutive.
- static int const kElementsPerAccess = InstructionShape::kN / 4;
- static int const kRowsPerTile = 8;
- static int const kAccumulatorRows = InstructionShape::kM / kRowsPerTile;
-
-public:
-
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = Array<
-    Element, 
-    Policy::MmaIterations::kCount * InstructionShape::kMN / kThreads>;
-
-private:
-
-  /// Reference to output tensor
-  TensorRef ref_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpAccumulatorTileIterator() { }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpAccumulatorTileIterator(
-    TensorRef const &ref, 
-    int lane_id
-  ):
-    ref_(ref) {
-
-    int quad = (lane_id >> 2);
-    int lane_in_quad = (lane_id & 3);
-
-    MatrixCoord lane_offset(quad, lane_in_quad * kElementsPerAccess);
-
-    ref_.add_coord_offset(lane_offset);
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpAccumulatorTileIterator &add_pointer_offset(LongIndex offset) {
-    ref_.add_pointer_offset(offset);
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpAccumulatorTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
-
-    ref_.add_coord_offset(tile_offset * make_Coord(Shape::kRow, Shape::kColumn));
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpAccumulatorTileIterator & operator++() {
-    // deliberate no-op
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpAccumulatorTileIterator & operator--() {
-    // deliberate no-op
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpAccumulatorTileIterator & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpAccumulatorTileIterator & operator-=(TensorCoord const &tile_offset) {
-    add_tile_offset(-tile_offset);
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-    load_with_pointer_offset(frag, 0);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-    Fragment &frag,                             ///< fragment to load from the tensor
-    Index pointer_offset) const {               ///< loads a tile with a linear offset
-  
-    TensorRef offset_ref(ref_);
-    offset_ref.add_pointer_offset(pointer_offset);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
-        
-        int mma_accum_start = kAccumulatorRows * kElementsPerAccess * 
-          (mma_n * Policy::MmaIterations::kRow + mma_m);
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int row = 0; row < kAccumulatorRows; ++row) {
-          CUTLASS_PRAGMA_UNROLL
-          for (int col = 0; col < kElementsPerAccess; ++col) {
-            int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
-                          row * kRowsPerTile;
-            int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col;
-
-            frag[mma_accum_start + row * kElementsPerAccess + col] = offset_ref.at({accum_m, accum_n});
-          }
-        }
-      }
-    }
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-    Fragment &frag,                             ///< fragment to load from the tensor
-    Index byte_offset) const {                  ///< loads a tile with a linear offset
-
-    load_with_pointer_offset(byte_offset / sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-    Fragment &frag,                             ///< fragment to load from the tensor
-    TensorCoord const &tile_offset) const {     ///< loads a tile with a logical offset in units of whole tiles
-
-    load(frag, tile_offset, 0);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-    Fragment &frag,                             ///< fragment to load from the tensor
-    TensorCoord const &tile_offset,             ///< loads a tile with a logical offset in units of whole tiles
-    Index pointer_offset) const {               ///< loads a tile with a logical offset AND a pointer offset
-
-    load_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
-  }
-
-  /// Stores a fragment to memory
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag) const {
-    store_with_pointer_offset(frag, 0);
-  }
-
-  /// Stores a fragment to memory with additional pointer offset
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(
-    Fragment const &frag,                       ///< fragment to store from the tensor
-    Index pointer_offset) const {               ///< store a tile with a linear offset
-  
-    TensorRef offset_ref(ref_);
-    offset_ref.add_pointer_offset(pointer_offset);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
-        
-        int mma_accum_start = kAccumulatorRows * kElementsPerAccess * 
-          (mma_n * Policy::MmaIterations::kRow + mma_m);
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int row = 0; row < kAccumulatorRows; ++row) {
-          CUTLASS_PRAGMA_UNROLL
-          for (int col = 0; col < kElementsPerAccess; ++col) {
-            int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
-                          row * kRowsPerTile;
-            int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col;
-            int idx = mma_accum_start + row * kElementsPerAccess + col;
-
-            offset_ref.at({accum_m, accum_n}) = frag[idx];
-          }
-        }
-      }
-    }
-  }
-
-  /// Stores a fragment to memory with additional pointer offset
-  CUTLASS_DEVICE
-  void store_with_byte_offset(
-    Fragment const &frag,                       ///< fragment to store from the tensor
-    Index byte_offset) const {                  ///< store a tile with a linear offset
-
-    store_with_pointer_offset(byte_offset / sizeof(Element));
-  }
-
-  /// Stores a fragment to memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void store(
-    Fragment &frag,                             ///< fragment to store to the tensor
-    TensorCoord const &tile_offset) const {     ///< stores a tile with a logical offset in units of whole tiles
-
-    store(frag, tile_offset, 0);
-  }
-
-  /// Stores a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void store(
-      /// fragment to store to the tensor
-      Fragment const &frag,
-      /// stores a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// stores a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-    store_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// This tile iterator is specialized for 32-thread TensorOps. It is used to load or store
-/// accumulators from memory and is agnostic to layout.
-///
-/// This iterator is not tested.
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept |
-///   WriteableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Element type
-    typename Element_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions, concept: MatrixShape)
-    typename OpDelta_>
-class MmaTensorOpAccumulatorTileIterator<
-    Shape_, Element_, cutlass::layout::AffineRankN<2>, InstructionShape_, OpDelta_> {
- public:
-
-  /// Shape of tile to load (concept: MatrixShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand::kC;
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::RowMajor;
-
-  /// Shape of one matrix product operation (concept: MatrixShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
-  using OpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Internal structure of iterator - made public to enable introspection
-  struct Policy {
-    static bool const kDivisible =
-        !(Shape::kRow % InstructionShape::kM) &&
-            !(Shape::kColumn % InstructionShape::kN);
-
-    static_assert(platform::is_same<TensorCoord, MatrixCoord>::value,
-      "Layouts must be defined for logical MatrixCoord coordinate space.");
-
-    /// Number of mma operations performed
-    using MmaIterations = MatrixShape<
-      (Shape::kRow + InstructionShape::kM - 1) / InstructionShape::kM,
-      (Shape::kColumn + InstructionShape::kN - 1) / InstructionShape::kN
-    >;
-  };
-
-private:
-
-  // Assume accumulator tile is an arrangement of 8-by-8 tiles replicated over the entire
-  // shape, with each quad mapped to one row and each thread mapped to 1/4 of the elements
-  // of that row. The accumulators within one row are assumed to be consecutive.
- static int const kElementsPerAccess = InstructionShape::kN / 4;
- static int const kRowsPerTile = 8;
- static int const kAccumulatorRows = InstructionShape::kM / kRowsPerTile;
-
-public:
-
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = Array<
-    Element, 
-    Policy::MmaIterations::kCount * InstructionShape::kMN / kThreads>;
-
-private:
-
-  /// Reference to output tensor
-  TensorRef ref_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpAccumulatorTileIterator() { }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpAccumulatorTileIterator(
-    TensorRef const &ref, 
-    int lane_id
-  ):
-    ref_(ref) {
-
-    int quad = (lane_id >> 2);
-    int lane_in_quad = (lane_id & 3);
-
-    MatrixCoord lane_offset(quad, lane_in_quad * kElementsPerAccess);
-
-    ref_.add_coord_offset(lane_offset);
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpAccumulatorTileIterator &add_pointer_offset(LongIndex offset) {
-    ref_.add_pointer_offset(offset);
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpAccumulatorTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
-
-    ref_.add_coord_offset(tile_offset * make_Coord(Shape::kRow, Shape::kColumn));
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpAccumulatorTileIterator & operator++() {
-    // deliberate no-op
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpAccumulatorTileIterator & operator--() {
-    // deliberate no-op
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpAccumulatorTileIterator & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpAccumulatorTileIterator & operator-=(TensorCoord const &tile_offset) {
-    add_tile_offset(-tile_offset);
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-    load_with_pointer_offset(frag, 0);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-    Fragment &frag,                             ///< fragment to load from the tensor
-    Index pointer_offset) const {               ///< loads a tile with a linear offset
-  
-    TensorRef offset_ref(ref_);
-    offset_ref.add_pointer_offset(pointer_offset);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
-        
-        int mma_accum_start = kAccumulatorRows * kElementsPerAccess * 
-          (mma_n * Policy::MmaIterations::kRow + mma_m);
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int row = 0; row < kAccumulatorRows; ++row) {
-          CUTLASS_PRAGMA_UNROLL
-          for (int col = 0; col < kElementsPerAccess; ++col) {
-            int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
-                          row * kRowsPerTile;
-            int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col;
-
-            frag[mma_accum_start + row * kElementsPerAccess + col] = offset_ref.at({accum_m, accum_n});
-          }
-        }
-      }
-    }
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-    Fragment &frag,                             ///< fragment to load from the tensor
-    Index byte_offset) const {                  ///< loads a tile with a linear offset
-
-    load_with_pointer_offset(byte_offset / sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-    Fragment &frag,                             ///< fragment to load from the tensor
-    TensorCoord const &tile_offset) const {     ///< loads a tile with a logical offset in units of whole tiles
-
-    load(frag, tile_offset, 0);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-    Fragment &frag,                             ///< fragment to load from the tensor
-    TensorCoord const &tile_offset,             ///< loads a tile with a logical offset in units of whole tiles
-    Index pointer_offset) const {               ///< loads a tile with a logical offset AND a pointer offset
-
-    load_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
-  }
-
-  /// Stores a fragment to memory
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag) const {
-    store_with_pointer_offset(frag, 0);
-  }
-
-  /// Stores a fragment to memory with additional pointer offset
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(
-    Fragment const &frag,                       ///< fragment to store from the tensor
-    Index pointer_offset) const {               ///< store a tile with a linear offset
-  
-    TensorRef offset_ref(ref_);
-    offset_ref.add_pointer_offset(pointer_offset);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
-        
-        int mma_accum_start = kAccumulatorRows * kElementsPerAccess * 
-          (mma_n * Policy::MmaIterations::kRow + mma_m);
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int row = 0; row < kAccumulatorRows; ++row) {
-          CUTLASS_PRAGMA_UNROLL
-          for (int col = 0; col < kElementsPerAccess; ++col) {
-            int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
-                          row * kRowsPerTile;
-            int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col;
-            int idx = mma_accum_start + row * kElementsPerAccess + col;
-
-            offset_ref.at({accum_m, accum_n}) = frag[idx];
-          }
-        }
-      }
-    }
-  }
-
-  /// Stores a fragment to memory with additional pointer offset
-  CUTLASS_DEVICE
-  void store_with_byte_offset(
-    Fragment const &frag,                       ///< fragment to store from the tensor
-    Index byte_offset) const {                  ///< store a tile with a linear offset
-
-    store_with_pointer_offset(byte_offset / sizeof(Element));
-  }
-
-  /// Stores a fragment to memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void store(
-    Fragment &frag,                             ///< fragment to store to the tensor
-    TensorCoord const &tile_offset) const {     ///< stores a tile with a logical offset in units of whole tiles
-
-    store(frag, tile_offset, 0);
-  }
-
-  /// Stores a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void store(
-      /// fragment to store to the tensor
-      Fragment const &frag,
-      /// stores a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// stores a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-    store_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// This tile iterator is specialized for 32-thread TensorOps. It is used to load or store
-/// accumulators from memory and is agnostic to layout. It could be faster if it assumed row-major
-/// accumulator layout.
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept |
-///   WriteableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Element type
-    typename Element_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions, concept: MatrixShape)
-    typename OpDelta_>
-class MmaTensorOpAccumulatorTileIterator<Shape_, Element_,
-                                         cutlass::layout::ColumnMajor,
-                                         InstructionShape_, OpDelta_> {
- public:
-
-  /// Shape of tile to load (concept: MatrixShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand::kC;
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::ColumnMajor;
-
-  /// Shape of one matrix product operation (concept: MatrixShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
-  using OpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Internal structure of iterator - made public to enable introspection
-  struct Policy {
-    static bool const kDivisible = 
-        !(Shape::kRow % InstructionShape::kM) &&
-            !(Shape::kColumn % InstructionShape::kN);
-
-    static_assert(platform::is_same<TensorCoord, MatrixCoord>::value,
-      "Layouts must be defined for logical MatrixCoord coordinate space.");
-
-    /// Number of mma operations performed
-    using MmaIterations = MatrixShape<
-      (Shape::kRow + InstructionShape::kM - 1) / InstructionShape::kM,
-      (Shape::kColumn + InstructionShape::kN - 1) / InstructionShape::kN
-    >;
-  };
-
-private:
-
-  // Assume accumulator tile is an arrangement of 8-by-8 tiles replicated over the entire
-  // shape, with each quad mapped to one row and each thread mapped to 1/4 of the elements
-  // of that row. The accumulators within one row are assumed to be consecutive.
- static int const kElementsPerAccess = InstructionShape::kN / 4;
- static int const kRowsPerTile = 8;
- static int const kAccumulatorRows = InstructionShape::kM / kRowsPerTile;
-
-public:
-
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = Array<Element, 
-    Policy::MmaIterations::kCount * InstructionShape::kMN / kThreads>;
-
-private:
-
-  /// Reference to output tensor
-  TensorRef ref_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpAccumulatorTileIterator() { }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpAccumulatorTileIterator(
-    TensorRef const &ref, 
-    int lane_id
-  ):
-    ref_(ref) {
-
-    int quad = (lane_id >> 2);
-    int lane_in_quad = (lane_id & 3);
-
-    MatrixCoord lane_offset(quad, lane_in_quad * kElementsPerAccess);
-
-    ref_.add_coord_offset(lane_offset);
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpAccumulatorTileIterator &add_pointer_offset(LongIndex offset) {
-    ref_.add_pointer_offset(offset);
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpAccumulatorTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
-
-    ref_.add_coord_offset(tile_offset * make_Coord(Shape::kRow, Shape::kColumn));
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpAccumulatorTileIterator & operator++() {
-    // deliberate no-op
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpAccumulatorTileIterator & operator--() {
-    // deliberate no-op
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpAccumulatorTileIterator & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpAccumulatorTileIterator & operator-=(TensorCoord const &tile_offset) {
-    add_tile_offset(-tile_offset);
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-    load_with_pointer_offset(frag, 0);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-    Fragment &frag,                             ///< fragment to load from the tensor
-    Index pointer_offset) const {               ///< loads a tile with a linear offset
-  
-    TensorRef offset_ref(ref_);
-    offset_ref.add_pointer_offset(pointer_offset);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
-        
-        int mma_accum_start = kAccumulatorRows * kElementsPerAccess * 
-          (mma_n * Policy::MmaIterations::kRow + mma_m);
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int row = 0; row < kAccumulatorRows; ++row) {
-          CUTLASS_PRAGMA_UNROLL
-          for (int col = 0; col < kElementsPerAccess; ++col) {
-            int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
-                          row * kRowsPerTile;
-            int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col;
-            int idx = mma_accum_start + row * kElementsPerAccess + col;
-
-            frag[idx] = offset_ref.at({accum_m, accum_n});
-          }
-        }
-      }
-    }
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-    Fragment &frag,                             ///< fragment to load from the tensor
-    Index byte_offset) const {                  ///< loads a tile with a linear offset
-
-    load_with_pointer_offset(byte_offset / sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-    Fragment &frag,                             ///< fragment to load from the tensor
-    TensorCoord const &tile_offset) const {     ///< loads a tile with a logical offset in units of whole tiles
-
-    load(frag, tile_offset, 0);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-    Fragment &frag,                             ///< fragment to load from the tensor
-    TensorCoord const &tile_offset,             ///< loads a tile with a logical offset in units of whole tiles
-    Index pointer_offset) const {               ///< loads a tile with a logical offset AND a pointer offset
-
-    load_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
-  }
-
-  /// Stores a fragment to memory
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag) const {
-    store_with_pointer_offset(frag, 0);
-  }
-
-  /// Stores a fragment to memory with additional pointer offset
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(
-    Fragment const &frag,                       ///< fragment to store from the tensor
-    Index pointer_offset) const {               ///< store a tile with a linear offset
-  
-    TensorRef offset_ref(ref_);
-    offset_ref.add_pointer_offset(pointer_offset);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
-        
-        int mma_accum_start = kAccumulatorRows * kElementsPerAccess * 
-          (mma_n * Policy::MmaIterations::kRow + mma_m);
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int row = 0; row < kAccumulatorRows; ++row) {
-          CUTLASS_PRAGMA_UNROLL
-          for (int col = 0; col < kElementsPerAccess; ++col) {
-            int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
-                          row * kRowsPerTile;
-            int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col;
-            int idx = mma_accum_start + row * kElementsPerAccess + col;
-            
-            offset_ref.at({accum_m, accum_n}) = frag[idx];
-          }
-        }
-      }
-    }
-  }
-
-  /// Stores a fragment to memory with additional pointer offset
-  CUTLASS_DEVICE
-  void store_with_byte_offset(
-    Fragment const &frag,                       ///< fragment to store from the tensor
-    Index byte_offset) const {                  ///< store a tile with a linear offset
-
-    store_with_pointer_offset(byte_offset / sizeof(Element));
-  }
-
-  /// Stores a fragment to memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void store(
-    Fragment &frag,                             ///< fragment to store to the tensor
-    TensorCoord const &tile_offset) const {     ///< stores a tile with a logical offset in units of whole tiles
-
-    store(frag, tile_offset, 0);
-  }
-
-  /// Stores a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void store(
-      /// fragment to store to the tensor
-      Fragment const &frag,
-      /// stores a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// stores a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-    store_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// This tile iterator is specialized for 32-thread TensorOps. It is used to load or store
-/// accumulators from memory and is agnostic to layout. It could be faster if it assumed row-major
-/// accumulator layout.
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept |
-///   WriteableRandomAccessContiguousTileIteratorConcept
-///
-
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Element typ
-    typename Element_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions, concept: MatrixShape)
-    typename OpDelta_,
-    /// Interleaved N
-    int InterleavedN>
-class MmaTensorOpAccumulatorTileIterator<
-    Shape_, Element_, cutlass::layout::ColumnMajorInterleaved<InterleavedN>,
-    InstructionShape_, OpDelta_> {
- public:
-
-  /// Shape of tile to load (concept: MatrixShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand::kC;
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::ColumnMajorInterleaved<InterleavedN>;
-
-  /// Shape of one matrix product operation (concept: MatrixShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
-  using OpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Internal structure of iterator - made public to enable introspection
-  struct Policy {
-    static_assert(
-        !(Shape::kRow % InstructionShape::kM) &&
-            !(Shape::kColumn % InstructionShape::kN),
-        "Shape of warp-level Mma must be divisible by operator shape.");
-
-    static_assert(platform::is_same<TensorCoord, MatrixCoord>::value,
-      "Layouts must be defined for logical MatrixCoord coordinate space.");
-
-    /// Number of mma operations performed
-    using MmaIterations = MatrixShape<Shape::kRow / InstructionShape::kM,
-                                      Shape::kColumn / InstructionShape::kN>;
-  };
-
-private:
-
-  static int const kElementsPerAccess = 2;
-
-public:
-
-  //
-  // Derived quantities
-  //
-
-  using AccessType = Array<Element, kElementsPerAccess>;
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = Array<Element, Shape::kCount / kThreads>;
-
-private:
-
-  /// Reference to output tensor
-  TensorRef ref_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpAccumulatorTileIterator() { }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpAccumulatorTileIterator(
-    TensorRef const &ref, 
-    int lane_id
-  ):
-    ref_(ref) {
-
-    int quad = (lane_id >> 2);
-    int lane_in_quad = (lane_id & 3);
-
-    MatrixCoord lane_offset(quad, lane_in_quad * kElementsPerAccess);
-
-    ref_.add_coord_offset(lane_offset);
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpAccumulatorTileIterator &add_pointer_offset(LongIndex offset) {
-    ref_.add_pointer_offset(offset);
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpAccumulatorTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
-
-    ref_.add_coord_offset(tile_offset * make_Coord(Shape::kRow, Shape::kColumn));
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpAccumulatorTileIterator & operator++() {
-    // deliberate no-op
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpAccumulatorTileIterator & operator--() {
-    // deliberate no-op
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpAccumulatorTileIterator & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpAccumulatorTileIterator & operator-=(TensorCoord const &tile_offset) {
-    add_tile_offset(-tile_offset);
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-    load_with_pointer_offset(frag, 0);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-    Fragment &frag,                             ///< fragment to load from the tensor
-    Index pointer_offset) const {               ///< loads a tile with a linear offset
-  
-    TensorRef offset_ref(ref_);
-    offset_ref.add_pointer_offset(pointer_offset);
-
-    AccessType* frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
-        int accum_m = mma_m * InstructionShape::kM;
-        int accum_n = mma_n * InstructionShape::kN;
-
-        int idx = mma_m + mma_n * Policy::MmaIterations::kRow;
-
-        AccessType* access_ptr = reinterpret_cast<AccessType *>(offset_ref.data() +
-          offset_ref.offset(TensorCoord(accum_m, accum_n)));
-
-        frag_ptr[idx] = access_ptr[0];
-      }
-    }
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-    Fragment &frag,                             ///< fragment to load from the tensor
-    Index byte_offset) const {                  ///< loads a tile with a linear offset
-
-    load_with_pointer_offset(byte_offset / sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-    Fragment &frag,                             ///< fragment to load from the tensor
-    TensorCoord const &tile_offset) const {     ///< loads a tile with a logical offset in units of whole tiles
-
-    load(frag, tile_offset, 0);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-    Fragment &frag,                             ///< fragment to load from the tensor
-    TensorCoord const &tile_offset,             ///< loads a tile with a logical offset in units of whole tiles
-    Index pointer_offset) const {               ///< loads a tile with a logical offset AND a pointer offset
-
-    load_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
-  }
-
-  /// Stores a fragment to memory
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag) const {
-    store_with_pointer_offset(frag, 0);
-  }
-
-  /// Stores a fragment to memory with additional pointer offset
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(
-    Fragment const &frag,                       ///< fragment to store from the tensor
-    Index pointer_offset) const {               ///< store a tile with a linear offset
-  
-    TensorRef offset_ref(ref_);
-    offset_ref.add_pointer_offset(pointer_offset);
-
-    AccessType const *frag_ptr = reinterpret_cast<AccessType const*>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
-        int accum_m = mma_m * InstructionShape::kM;
-        int accum_n = mma_n * InstructionShape::kN;
-
-        int idx = mma_m + mma_n * Policy::MmaIterations::kRow;
-
-        AccessType* access_ptr = reinterpret_cast<AccessType *>(offset_ref.data() +
-                                 offset_ref.offset(TensorCoord(accum_m, accum_n)));
-
-        access_ptr[0] = frag_ptr[idx];               
-      }
-    }
-  }
-
-  /// Stores a fragment to memory with additional pointer offset
-  CUTLASS_DEVICE
-  void store_with_byte_offset(
-    Fragment const &frag,                       ///< fragment to store from the tensor
-    Index byte_offset) const {                  ///< store a tile with a linear offset
-
-    store_with_pointer_offset(byte_offset / sizeof(Element));
-  }
-
-  /// Stores a fragment to memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void store(
-    Fragment &frag,                             ///< fragment to store to the tensor
-    TensorCoord const &tile_offset) const {     ///< stores a tile with a logical offset in units of whole tiles
-
-    store(frag, tile_offset, 0);
-  }
-
-  /// Stores a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void store(
-      /// fragment to store to the tensor
-      Fragment const &frag,
-      /// stores a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// stores a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-    store_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// This tile iterator is specialized for 32-thread TensorOps. It is used to load or store
-/// accumulators from memory and is agnostic to layout. It could be faster if it assumed row-major
-/// accumulator layout.
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept |
-///   WriteableRandomAccessContiguousTileIteratorConcept
-///
-
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Element typ
-    typename Element_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions, concept: MatrixShape)
-    typename OpDelta_,
-    /// Interleaved N
-    int InterleavedN>
-class MmaTensorOpAccumulatorTileIterator<
-    Shape_, Element_, cutlass::layout::TensorNCxHWx<InterleavedN>,
-    InstructionShape_, OpDelta_> {
- public:
-
-  /// Shape of tile to load (concept: MatrixShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand::kC;
-
-  /// Element type
-  using Element = int8_t;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::TensorNCxHWx<InterleavedN>;
-
-  /// Shape of one matrix product operation (concept: MatrixShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
-  using OpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Long Index type
-  using StrideIndex = typename TensorRef::Layout::Stride::Index;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Internal structure of iterator - made public to enable introspection
-  struct Policy {
-    static_assert(
-        !(Shape::kRow % InstructionShape::kM) &&
-            !(Shape::kColumn % InstructionShape::kN),
-        "Shape of warp-level Mma must be divisible by operator shape.");
-
-    /// Number of elements in strided dimension that each STG writes
-    static int const kStridedPerSTG = 8;
-
-    /// Factor to calculate reorder index to pack accumulator.
-    static int const kPackedFactor = Shape::kColumn / 32;
-
-    /// Number of mma operations performed
-    using MmaIterations = MatrixShape<Shape::kRow / kStridedPerSTG,
-                                      Shape::kColumn / InterleavedN>;
-  };
-
-private:
-
-  static int const kElementsPerAccess = InterleavedN / 4;
-
-public:
-
-  //
-  // Derived quantities
-  //
-
-  struct alignas((kElementsPerAccess * sizeof_bits<Element>::value / 8)) AccessType {
-      Array<Element, kElementsPerAccess> storage;
-  };
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = Array<int32_t, Shape::kCount / kThreads>;
-
-private:
-
-  /// Reference to output tensor
-  TensorRef ref_;
-
-  /// Row offset index globally
-  LongIndex global_offset_row_;
-
-  /// Column offset index globally
-  LongIndex global_offset_col_;
-
-  /// Output tensor size
-  TensorCoord extent_;
-
-  /// Alpha 
-  float alpha_;
-
-  /// Beta
-  float beta_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpAccumulatorTileIterator() { }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpAccumulatorTileIterator(
-    TensorRef const &ref,
-    int const lane_id,
-    TensorCoord extent,
-    float alpha = 1.0f,
-    float beta = 0.0f
-  ):
-    ref_(ref),
-    extent_(extent),
-    alpha_(alpha),
-    beta_(beta) {
-
-    int quad = (lane_id >> 2);
-    int lane_in_quad = (lane_id & 3);
-
-    global_offset_row_ = quad;
-
-    global_offset_col_ = lane_in_quad * kElementsPerAccess;
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpAccumulatorTileIterator &add_pointer_offset(LongIndex offset) {
-    ref_.add_pointer_offset(offset);
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpAccumulatorTileIterator &add_tile_offset(MatrixCoord const &tile_offset) {
-
-    global_offset_row_ += tile_offset.row() * Shape::kRow;
-
-    global_offset_col_ += tile_offset.column() * Shape::kColumn;
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpAccumulatorTileIterator & operator++() {
-    // deliberate no-op
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpAccumulatorTileIterator & operator--() {
-    // deliberate no-op
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpAccumulatorTileIterator & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpAccumulatorTileIterator & operator-=(TensorCoord const &tile_offset) {
-    add_tile_offset(-tile_offset);
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-    load_with_pointer_offset(frag);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-    Fragment &frag,                             ///< fragment to load from the tensor
-    Index pointer_offset) const {               ///< loads a tile with a linear offset
-  
-    TensorRef offset_ref(ref_);
-    offset_ref.add_pointer_offset(pointer_offset);
-
-    AccessType* frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int mma_n = 0; mma_n < Policy::MmaIterations::kN; ++mma_n) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int mma_m = 0; mma_m < Policy::MmaIterations::kM; ++mma_m) {
-        int accum_m = mma_m * InstructionShape::kM;
-        int accum_n = mma_n * InstructionShape::kN;
-
-        int idx = mma_m + mma_n * Policy::MmaIterations::kM;
-
-        AccessType* access_ptr = reinterpret_cast<AccessType *>(offset_ref.data() +
-                                 accum_m * offset_ref.stride(0) + accum_n);
-
-        frag_ptr[idx] = access_ptr[0];
-      }
-    }
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-    Fragment &frag,                             ///< fragment to load from the tensor
-    Index byte_offset) const {                  ///< loads a tile with a linear offset
-
-    load_with_pointer_offset(byte_offset / sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-    Fragment &frag,                             ///< fragment to load from the tensor
-    TensorCoord const &tile_offset) const {     ///< loads a tile with a logical offset in units of whole tiles
-
-    load(frag, tile_offset, 0);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-    Fragment &frag,                             ///< fragment to load from the tensor
-    TensorCoord const &tile_offset,             ///< loads a tile with a logical offset in units of whole tiles
-    Index pointer_offset) const {               ///< loads a tile with a logical offset AND a pointer offset
-
-    load_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
-  }
-
-  /// Stores a fragment to memory
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag) const {
-    store_with_pointer_offset(frag, 0);
-  }
-
-  /// Stores a fragment to memory with additional pointer offset
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(
-    Fragment const &frag,                       ///< fragment to store from the tensor
-    Index pointer_offset) const {               ///< store a tile with a linear offset
-  
-    TensorRef offset_ref(ref_);
-    offset_ref.add_pointer_offset(pointer_offset);
-
-    Array<float, Shape::kCount / kThreads> output_frag_f;
-    Array<Element, Shape::kCount / kThreads> output_frag;
-
-    LongIndex pq = extent_.h() * extent_.w();
-
-    LongIndex extent_row = extent_.n() * pq;
-    LongIndex extent_col = extent_.c();
-
-    LongIndex k_major = (global_offset_col_ / InterleavedN) * pq;
-    Index k_minor = global_offset_col_ % InterleavedN;
-    LongIndex k_offset = k_major * InterleavedN + k_minor;
-    LongIndex k_offset_delta = pq * InterleavedN;
-
-    LongIndex stride_n = pq * extent_.c();
-
-    Index n;
-    LongIndex pq_rem;
-
-    unsigned int pq_mul, pq_shr;
-    find_divisor(pq_mul, pq_shr, pq);
-
-    if(beta_ == 0.0f) {
-      CUTLASS_PRAGMA_UNROLL
-      for(int i = 0; i < int(frag.size()); ++i) {
-        output_frag_f[i] = frag[i];
-      }
-
-      if(InstructionShape::kM == Policy::kStridedPerSTG) {
-        CUTLASS_PRAGMA_UNROLL
-        for(int i = 0; i < int(frag.size()); ++i) {
-          output_frag[i] = (Element)(output_frag_f[i] * alpha_);
-        }
-      } else {
-        CUTLASS_PRAGMA_UNROLL
-        for(int i = 0; i < int(frag.size()); ++i) {
-          int map_i = (i / (16 * Policy::kPackedFactor)) * (16 * Policy::kPackedFactor)
-                    + (i % (8 * Policy::kPackedFactor)) / 2 * 4
-                    + (i % (8 * Policy::kPackedFactor)) % 2
-                    + (i / (8 * Policy::kPackedFactor)) % 2 * 2;
-          output_frag[i] = (Element)(output_frag_f[map_i] * alpha_);
-        }
-      }
-
-      AccessType const *frag_ptr = reinterpret_cast<AccessType const*>(&output_frag);
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
-        int accum_m = mma_m * Policy::kStridedPerSTG;
-
-        fast_divmod(n, pq_rem, global_offset_row_ + accum_m, pq, pq_mul, pq_shr);
-        LongIndex offset_m = n * stride_n + k_offset + pq_rem * InterleavedN;
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
-       
-          int accum_n = mma_n * InterleavedN;
-
-          int idx = mma_n + mma_m * Policy::MmaIterations::kColumn;
-         
-          if((global_offset_row_ + accum_m < extent_row) && (global_offset_col_ + accum_n < extent_col)) {
-            AccessType* access_ptr = reinterpret_cast<AccessType *>(offset_ref.data() +
-                                                                    offset_m + mma_n * k_offset_delta);
-
-            access_ptr[0] = frag_ptr[idx];
-          }
-        }
-      }
-    } else {
-      if(InstructionShape::kM == Policy::kStridedPerSTG) {
-        CUTLASS_PRAGMA_UNROLL
-        for(int i = 0; i < int(frag.size()); ++i) {
-          output_frag_f[i] = frag[i];
-        }
-      } else {
-        CUTLASS_PRAGMA_UNROLL
-        for(int i = 0; i < int(frag.size()); ++i) {
-          int map_i = (i / (16 * Policy::kPackedFactor)) * (16 * Policy::kPackedFactor)
-                    + (i % (8 * Policy::kPackedFactor)) / 2 * 4
-                    + (i % (8 * Policy::kPackedFactor)) % 2
-                    + (i / (8 * Policy::kPackedFactor)) % 2 * 2;
-          output_frag_f[i] = frag[map_i];
-        }
-      }
-
-      AccessType const *frag_ptr = reinterpret_cast<AccessType const*>(&output_frag);
-
-      Array<Element, kElementsPerAccess> ref_frag;
-      AccessType *ref_frag_ptr = reinterpret_cast<AccessType *>(&ref_frag);
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
-        int accum_m = mma_m * Policy::kStridedPerSTG;
-
-        fast_divmod(n, pq_rem, global_offset_row_ + accum_m, pq, pq_mul, pq_shr);
-        LongIndex offset_m = n * stride_n + k_offset + pq_rem * InterleavedN;
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
-       
-          int accum_n = mma_n * InterleavedN;
-
-          int idx = mma_n + mma_m * Policy::MmaIterations::kColumn;
-         
-          if((global_offset_row_ + accum_m < extent_row) && (global_offset_col_ + accum_n < extent_col)) {
-            AccessType* access_ptr = reinterpret_cast<AccessType *>(offset_ref.data() +
-                                                                    offset_m + mma_n * k_offset_delta);
-
-            ref_frag_ptr[0] = access_ptr[0];
-
-            CUTLASS_PRAGMA_UNROLL
-            for(int i = 0; i < kElementsPerAccess; ++i) {
-              output_frag[idx * kElementsPerAccess + i] = Element(alpha_ * output_frag_f[idx * kElementsPerAccess + i]
-                                                                + beta_ * ref_frag[i]);
-            }
-
-            access_ptr[0] = frag_ptr[idx];
-          }
-        }
-      }
-    }
-  }
-
-  /// Stores a fragment to memory with additional pointer offset
-  CUTLASS_DEVICE
-  void store_with_byte_offset(
-    Fragment const &frag,                       ///< fragment to store from the tensor
-    Index byte_offset) const {                  ///< store a tile with a linear offset
-
-    store_with_pointer_offset(byte_offset / sizeof(Element));
-  }
-
-  /// Stores a fragment to memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void store(
-    Fragment &frag,                             ///< fragment to store to the tensor
-    TensorCoord const &tile_offset) const {     ///< stores a tile with a logical offset in units of whole tiles
-
-    store(frag, tile_offset, 0);
-  }
-
-  /// Stores a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void store(
-      /// fragment to store to the tensor
-      Fragment const &frag,
-      /// stores a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// stores a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-    store_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace gemm
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h
deleted file mode 100755
index bcac335f2..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h
+++ /dev/null
@@ -1,3098 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines iterators used by warp-level matrix multiply operations targeting Tensor Cores.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/array.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/matrix_shape.h"
-
-#include "cutlass/gemm/gemm.h"
-
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor_op_multiplicand_sm70.h"
-
-#include "cutlass/platform/platform.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace warp {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Operand identity
-    Operand Operand,
-    /// Data type of A elements
-    typename Element_,
-    /// Layout of operand
-    typename Layout_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Delta between *MMA operations (in units of *MMA operations, concept:
-    /// MatrixShape)
-    int OpDelta_,
-    /// Number of threads participating in one matrix operation
-    int Threads>
-class MmaVoltaTensorOpMultiplicandTileIterator;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// This tile iterator is specialized for 32-thread TensorOps.
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: PitchLinearShape)
-    typename Shape_,
-    /// Data type of elements
-    typename Element_,
-    /// Shape of one matrix product operation (concept: PitchLinearShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions)
-    int OpDelta_>
-class MmaVoltaTensorOpMultiplicandTileIterator<
-    Shape_, Operand::kA, Element_,
-    cutlass::layout::VoltaTensorOpMultiplicandCongruous<
-        sizeof_bits<Element_>::value>,
-    InstructionShape_, OpDelta_, 32> {
- public:
-
-  /// Shape of tile to load (concept: PitchLinearShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand::kA;
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::VoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>;
-
-  /// Shape of one matrix product operation (concept: GemmShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
-  static int const kOpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Long Index type
-  using StrideIndex = typename TensorRef::Layout::Stride::Index;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Internal structure of iterator - made public to enable introspection
-  struct Policy {
-    static_assert(
-        !(Shape::kContiguous % InstructionShape::kContiguous),
-        "Shape of warp-level Mma must be divisible by operator shape.");
-
-    // Shape of one individual LDS.128
-    // TODO: 32 and 4 are hardcoded, 32-by-4 is logical shape
-    using LdsShape = layout::PitchLinearShape<
-      32,
-      4
-    >;
-
-    // LdsShapes are arranged in the strided direction in SMEM
-    using LdsIterations = layout::PitchLinearShape<
-      InstructionShape::kStrided / LdsShape::kStrided,
-      Shape::kContiguous / LdsShape::kContiguous
-    >;
-  };
-
-private:
-
-  /// Not working on this feature at the moment.
-  static_assert(kOpDelta == 1,
-    "Alternative arrangements not supported at present.");
-
-  /// Number of internal pointers needed to reference shared memory
-  static int const kPointerCount = 2;
-
-  /// Pointer type used for accesses
-  using AccessType = AlignedArray<Element, Layout::kElementsPerAccess>;
-
-public:
-
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
- using Fragment = Array<Element, Shape::kContiguous *
-                                     InstructionShape::kStrided / kThreads * 2>;
-
-private:
-
-  /// Layout object storing stride values
-  StrideIndex stride_;
-
-  /// Shared memory base pointers - not advanced
-  AccessType const *pointer_[kPointerCount];
-
-  /// Byte offset incremented as iterator advances
-  Index byte_offset_;
-
-public:
-
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator(): stride_(0), byte_offset_(0) { }
-
-  /// Constructor from TensorRef
-  CUTLASS_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator(
-    TensorRef const &ref,
-    int lane_id
-  ):
-    stride_(ref.stride(0) / Layout::kElementsPerAccess), byte_offset_(0) {
-    // swizzle patterns for operandA LDS are
-    // 1. (tid[4] << 3) | (tid[2:0] ^ tid[4])
-    // 2. (tid[4] << 3) | (tid[2:0] ^ tid[4] ^ 0b10010)
-
-    int vec_row = (lane_id >> 4); // tid[4]
-    int vec_col = ((lane_id & 4) >> 2); // tid[2]
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kPointerCount; ++i) {
-
-      if(i == 1) {
-        vec_row |= 2;
-      }
-      int access_contiguous_idx = (vec_col << 2) | ((lane_id & 3) ^ vec_row);
-      int access_contiguous = access_contiguous_idx;
-
-      int access_strided = vec_row;
-      pointer_[i] = reinterpret_cast<AccessType const *>(ref.data()) +
-        access_contiguous + access_strided * stride_;
-    }
-
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
-
-    byte_offset_ += offset * sizeof(Element);
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
-
-    int contiguous_offset = tile_offset.contiguous();
-    int strided_offset = tile_offset.strided();
-
-    // To support 32x32 tile size
-    if (Shape::kContiguous == Policy::LdsShape::kContiguous) {
-      if (contiguous_offset % 2) {
-        AccessType const *tmp_pointer = pointer_[0];
-        pointer_[0] = pointer_[1];
-        pointer_[1] = tmp_pointer;
-      }
-      contiguous_offset = contiguous_offset / 2 * 2;
-    }
-
-    int offset = (strided_offset * InstructionShape::kStrided) * stride_ *
-                     Layout::kElementsPerAccess +
-                 contiguous_offset * Shape::kContiguous;
-
-    add_pointer_offset(offset);
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator & operator++() {
-    byte_offset_ += stride_ * InstructionShape::kStrided * sizeof(Element) *
-                    Layout::kElementsPerAccess;
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator & operator--() {
-    byte_offset_ -= stride_ * InstructionShape::kStrided * sizeof(Element) *
-                    Layout::kElementsPerAccess;
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
-    add_tile_offset(-tile_offset);
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-
-    load_with_byte_offset(frag, 0);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset in units of bytes
-      Index byte_offset) const {
-
-    AccessType * fetch_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < Policy::LdsIterations::kStrided; ++s) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < Policy::LdsIterations::kContiguous; ++c) {
-
-        int access_idx = c + s * Policy::LdsIterations::kContiguous;
-
-        AccessType const *source_ptr = pointer_[s & 1] +
-          Policy::LdsShape::kContiguous * c +
-          Policy::LdsShape::kStrided * (s / 2) * stride_;
-
-        char const *source_byte_ptr = reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;
-        fetch_ptr[access_idx] = *(reinterpret_cast<AccessType const*> (source_byte_ptr));
-      }
-    }
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-    load_with_byte_offset(frag, tile_offset, 0);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index byte_offset) const {
-    Index pointer_offset =
-        tile_offset.contiguous() * Shape::kContiguous /
-            Layout::kElementsPerAccess +
-        tile_offset.strided() * InstructionShape::kStrided * stride_;
-
-    byte_offset += sizeof(AccessType) * pointer_offset;
-
-    load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    // no operation here
-  }
-};
-
-//////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// This tile iterator is specialized for 32-thread TensorOps.
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: PitchLinearShape)
-    typename Shape_,
-    /// Data type of elements
-    typename Element_,
-    /// Shape of one matrix product operation (concept: PitchLinearShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions)
-    int OpDelta_>
-
-class MmaVoltaTensorOpMultiplicandTileIterator<
-    Shape_, Operand::kB, Element_,
-    cutlass::layout::VoltaTensorOpMultiplicandBCongruous<
-        sizeof_bits<Element_>::value>,
-    InstructionShape_, OpDelta_, 32> {
- public:
-
-  /// Shape of tile to load (concept: PitchLinearShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand::kB;
-
-    /// Element type
-  using Element = Element_;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::VoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>;
-
-  /// Shape of one matrix product operation (concept: GemmShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
-  static int const kOpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Long Index type
-  using StrideIndex = typename TensorRef::Layout::Stride::Index;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Internal structure of iterator - made public to enable introspection
-  struct Policy {
-    static_assert(
-        !(Shape::kContiguous % InstructionShape::kContiguous),
-        "Shape of warp-level Mma must be divisible by operator shape.");
-
-    // Shape of one individual LDS
-    // TODO: remove hardcoded 32 and 4
-    using LdsShape = layout::PitchLinearShape<
-      32,
-      4
-    >;
-
-    using LdsIterations = layout::PitchLinearShape<
-      Shape::kContiguous / LdsShape::kContiguous,
-      InstructionShape::kStrided / LdsShape::kStrided
-    >;
-  };
-
-private:
-
-  /// Not working on this feature at the moment.
-  static_assert(kOpDelta == 1,
-    "Alternative arrangements not supported at present.");
-
-  /// Pointer type used for accesses
-  using AccessType = AlignedArray<Element, Layout::kElementsPerAccess>;
-
-public:
-
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile, needs on more time number of registers
- using Fragment = Array<Element, Shape::kContiguous *
-                                     InstructionShape::kStrided / kThreads * 2>;
-
-private:
-
-  /// Layout object storing stride values
-  StrideIndex stride_;
-
-  /// Shared memory base pointers - not advanced
-  AccessType const *pointer_;
-
-  /// Byte offset incremented as iterator advances
-  Index byte_offset_;
-
-public:
-
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator(): stride_(0), byte_offset_(0) { }
-
-  /// Constructor from TensorRef
-  CUTLASS_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator(
-    TensorRef const &ref,
-    int lane_id
-  ):
-    stride_(ref.stride(0) / Layout::kElementsPerAccess), byte_offset_(0) {
-
-    // swizzle pattern is (tid & (3 << 3) | (tid[1:0] ^ tid[4:3]))
-    int access_strided = (lane_id >> 3) & 0x3;
-    int access_contiguous = ((lane_id ^ (lane_id >> 3)) & 0x3);
-
-    pointer_ = reinterpret_cast<AccessType const *>(ref.data()) +
-                access_contiguous + access_strided * stride_;
-
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
-
-    byte_offset_ += offset * sizeof(Element);
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
-
-    int contiguous_offset = tile_offset.contiguous();
-    int strided_offset = tile_offset.strided();
-
-    int offset = (strided_offset * InstructionShape::kStrided) * stride_ *
-                     Layout::kElementsPerAccess +
-                 contiguous_offset * Shape::kContiguous;
-
-    add_pointer_offset(offset);
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator & operator++() {
-    byte_offset_ += stride_ * InstructionShape::kStrided * sizeof(Element) *
-                    Layout::kElementsPerAccess;
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator & operator--() {
-    byte_offset_ += stride_ * InstructionShape::kStrided * sizeof(Element) *
-                    Layout::kElementsPerAccess;
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
-    add_tile_offset(-tile_offset);
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-
-    load_with_byte_offset(frag, 0);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset in units of bytes
-      Index byte_offset) const {
-
-    AccessType * fetch_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < Policy::LdsIterations::kStrided; ++s) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < Policy::LdsIterations::kContiguous; ++c) {
-
-        int access_idx = c + s * Policy::LdsIterations::kContiguous;
-
-        AccessType const *source_ptr = pointer_ +
-          Policy::LdsShape::kContiguous / Layout::kElementsPerAccess * c +
-          Policy::LdsShape::kStrided * s * stride_;
-
-        char const *source_byte_ptr = reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;
-        fetch_ptr[access_idx] = *(reinterpret_cast<AccessType const*> (source_byte_ptr));
-      }
-    }
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-    load_with_byte_offset(frag, tile_offset, 0);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index byte_offset) const {
-    Index pointer_offset =
-        tile_offset.contiguous() * Shape::kContiguous /
-            Layout::kElementsPerAccess +
-        tile_offset.strided() * InstructionShape::kStrided * stride_;
-
-    byte_offset += sizeof(AccessType) * pointer_offset;
-
-    load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    // no operation here
-  }
-};
-
-//////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Warp-level tile iterator for the A multiplicand stored with the
-/// ColumnMajorVoltaTensorOpMultiplicandCongruous layout, specialized for 32 threads.
-///
-/// This class is a thin adapter: it maps matrix (row, column) coordinates onto the
-/// pitch-linear (contiguous, strided) coordinates of the underlying Base iterator, with
-/// the row index as the contiguous dimension, and forwards every operation to it.
-/// It must be initialized with a TensorRef to shared memory.
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Data type of elements
-    typename Element_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions)
-    int OpDelta_>
-class MmaVoltaTensorOpMultiplicandTileIterator<
-    Shape_, Operand::kA, Element_,
-    cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCongruous<
-        sizeof_bits<Element_>::value>,
-    InstructionShape_, OpDelta_, 32> {
- public:
-
-  /// Shape of tile to load (concept: MatrixShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand::kA;
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>;
-
-  /// Shape of one matrix product operation (concept: MatrixShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
-  static int const kOpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Underlying tile iterator implementation (rows are the contiguous dimension)
-  using Base = MmaVoltaTensorOpMultiplicandTileIterator<
-      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, kOperand, Element,
-      layout::VoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>,
-      layout::PitchLinearShape<InstructionShape::kRow,
-                               InstructionShape::kColumn>,
-      kOpDelta, kThreads>;
-
- public:
-
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = typename Base::Fragment;
-
-private:
-
-  /// Underlying tile iterator
-  Base iterator_;
-
-public:
-
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator() { }
-
-  /// Constructor from TensorRef; the pointer and stride are handed to the base iterator
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator(
-    TensorRef const &ref,
-    int lane_id
-  ): iterator_({ref.data(), ref.stride()}, lane_id) {
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
-
-    iterator_.add_pointer_offset(offset);
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
-
-    // (row, column) -> (contiguous, strided)
-    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator & operator++() {
-
-    ++iterator_;
-
-    return *this;
-  }
-
-  /// Applies the base iterator's operator-- (see Base for its exact semantics)
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator & operator--() {
-
-    --iterator_;
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
-    // add_tile_offset() already performs the (row, column) -> pitch-linear mapping.
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  ///< retreats in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
-    // Negate component-wise so this does not rely on TensorCoord offering unary minus.
-    add_tile_offset({-tile_offset.row(), -tile_offset.column()});
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-
-    iterator_.load(frag);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index byte_offset) const {
-    iterator_.load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-    // Previously an empty body that left `frag` untouched.
-    load_with_byte_offset(frag, tile_offset, 0);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-    // Previously an empty body; pointer offsets are in elements, so scale to bytes.
-    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index byte_offset) const {
-    // Use the same (row, column) -> (contiguous, strided) mapping as add_tile_offset().
-    iterator_.load_with_byte_offset(
-      frag,
-      {tile_offset.row(), tile_offset.column()},
-      byte_offset);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    iterator_.set_kgroup_index(k_group);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Warp-level tile iterator for the B multiplicand stored with the
-/// RowMajorVoltaTensorOpMultiplicandBCongruous layout, specialized for 32 threads.
-///
-/// This class is a thin adapter: it maps matrix (row, column) coordinates onto the
-/// pitch-linear (contiguous, strided) coordinates of the underlying Base iterator, with
-/// the column index as the contiguous dimension, and forwards every operation to it.
-/// It must be initialized with a TensorRef to shared memory.
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Data type of elements
-    typename Element_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions)
-    int OpDelta_>
-class MmaVoltaTensorOpMultiplicandTileIterator<
-    Shape_, Operand::kB, Element_,
-    cutlass::layout::RowMajorVoltaTensorOpMultiplicandBCongruous<
-        sizeof_bits<Element_>::value>,
-    InstructionShape_, OpDelta_, 32> {
- public:
-
-  /// Shape of tile to load (concept: MatrixShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand::kB;
-
-  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
-    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::RowMajorVoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>;
-
-  /// Shape of one matrix product operation (concept: MatrixShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
-  static int const kOpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Underlying tile iterator implementation (columns are the contiguous dimension)
-  using Base = MmaVoltaTensorOpMultiplicandTileIterator<
-      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, kOperand, Element,
-      layout::VoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>,
-      layout::PitchLinearShape<InstructionShape::kColumn,
-                               InstructionShape::kRow>,
-      kOpDelta, kThreads>;
-
- public:
-
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = typename Base::Fragment;
-
-private:
-
-  /// Underlying tile iterator
-  Base iterator_;
-
-public:
-
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator() { }
-
-  /// Constructor from TensorRef; the pointer and stride are handed to the base iterator
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator(
-    TensorRef const &ref,
-    int lane_id
-  ): iterator_({ref.data(), ref.stride()}, lane_id) {
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
-
-    iterator_.add_pointer_offset(offset);
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
-
-    // (row, column) -> (contiguous = column, strided = row)
-    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator & operator++() {
-
-    ++iterator_;
-
-    return *this;
-  }
-
-  /// Applies the base iterator's operator-- (see Base for its exact semantics)
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator & operator--() {
-
-    --iterator_;
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
-    // add_tile_offset() already swaps (row, column) into pitch-linear order; swapping
-    // here as well would hand the base iterator a transposed offset.
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  ///< retreats in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
-    // Negate component-wise in matrix order; add_tile_offset() performs the swap.
-    add_tile_offset({-tile_offset.row(), -tile_offset.column()});
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-
-    iterator_.load(frag);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index byte_offset) const {
-    iterator_.load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-    // Previously an empty body that left `frag` untouched.
-    load_with_byte_offset(frag, tile_offset, 0);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-    // Previously an empty body; pointer offsets are in elements, so scale to bytes.
-    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index byte_offset) const {
-    // Use the same (row, column) -> (contiguous, strided) mapping as add_tile_offset().
-    iterator_.load_with_byte_offset(
-      frag,
-      {tile_offset.column(), tile_offset.row()},
-      byte_offset);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    iterator_.set_kgroup_index(k_group);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////
-
-/// This tile iterator is specialized for 32-thread TensorOps. It is used to load or store
-/// accumulators from memory and is agnostic to layout. It could be faster if it assumed row-major
-/// accumulator layout.
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept |
-///   WriteableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Data type of elements
-    typename Element_,
-    /// Layout of operand in memory
-    typename Layout_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions, concept: MatrixShape)
-    typename OpDelta_>
-class MmaVoltaTensorOpAccumulatorTileIterator {
- public:
-
-  /// Shape of tile to load (concept: MatrixShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand::kC;
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of source tile
-  using Layout = Layout_;
-
-  /// Shape of one matrix product operation (concept: MatrixShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
-  using OpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Internal structure of iterator - made public to enable introspection
-  struct Policy {
-
-    /// Volta Tensor Op uses 32x32 interleaved tile
-    using InterleavedTile = MatrixShape<32, 32>;
-
-    static_assert(!(Shape::kRow % InterleavedTile::kRow) && !(Shape::kColumn % InterleavedTile::kColumn),
-      "Shape of warp-level Mma must be divisible by operator shape.");
-
-    static_assert(platform::is_same<TensorCoord, MatrixCoord>::value,
-      "Layouts must be defined for logical MatrixCoord coordinate space.");
-
-    /// Number of mma operations performed
-    using TileIterations = MatrixShape<
-      Shape::kRow / InterleavedTile::kRow,
-      Shape::kColumn / InterleavedTile::kColumn
-    >;
-
-    using MmaIterations =
-        MatrixShape<InterleavedTile::kRow / InstructionShape::kM,
-                    InterleavedTile::kColumn / InstructionShape::kN>;
-  };
-
-private:
-
-  // Assume accumulator tile is multipile interleaved 32x32 tile.
-  static int const kElementsPerPartial = 4;
-  using EleShapePerPatial = typename platform::conditional<
-                              platform::is_same<Element, float>::value,
-                              MatrixShape<2, 2>,
-                              MatrixShape<1, 4> >::type;
-  static int const kElementsPerMma = 8;
-  static int const kAccumulatorPatials = 2;
-  using QuadShapePerPatialMma = MatrixShape<4, 4>;
-
-public:
-
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = Array<Element, Shape::kCount / kThreads>;
-
-private:
-
-  /// Reference to output tensor
-  TensorRef ref_;
-
-public:
-
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpAccumulatorTileIterator() { }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpAccumulatorTileIterator(
-    TensorRef const &ref,
-    int lane_id
-  ):
-    ref_(ref) {
-
-    int quad = (lane_id >> 2);
-    int lane_in_quad = (lane_id & 3);
-    int accum_m, accum_n;
-
-    if (platform::is_same<Element, float>::value) {
-      // (quad[2],quad[0])+lane_in_quad[0]
-      accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + (lane_in_quad & 1);
-      // (quad[1])+lane_in_quad[1]
-      accum_n = ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials +
-                  (lane_in_quad & 2);
-    } else {
-      accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + lane_in_quad; // (quad[2],quad[0])
-      accum_n = ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials;
-    }
-    MatrixCoord lane_offset(accum_m, accum_n);
-
-    ref_.add_coord_offset(lane_offset);
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpAccumulatorTileIterator &add_pointer_offset(LongIndex offset) {
-    ref_.add_pointer_offset(offset);
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpAccumulatorTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
-
-    ref_.add_coord_offset(tile_offset * make_Coord(Shape::kRow, Shape::kColumn));
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpAccumulatorTileIterator & operator++() {
-    // deliberate no-op
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpAccumulatorTileIterator & operator--() {
-    // deliberate no-op
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaVoltaTensorOpAccumulatorTileIterator & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaVoltaTensorOpAccumulatorTileIterator & operator-=(TensorCoord const &tile_offset) {
-    add_tile_offset(-tile_offset);
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-    load_with_pointer_offset(frag, 0);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_HOST_DEVICE
-  void load_with_pointer_offset(
-    Fragment &frag,                             ///< fragment to load from the tensor
-    Index pointer_offset) const {               ///< loads a tile with a linear offset
-
-    TensorRef offset_ref(ref_);
-    offset_ref.add_pointer_offset(pointer_offset);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int tile_n = 0; tile_n < Policy::TileIterations::kColumn; ++tile_n) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int tile_m = 0; tile_m < Policy::TileIterations::kRow; ++tile_m) {
-        CUTLASS_PRAGMA_UNROLL
-        for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
-          CUTLASS_PRAGMA_UNROLL
-          for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
-
-            int mma_accum_start =
-                (((tile_n * Policy::TileIterations::kRow + tile_m) *
-                    Policy::MmaIterations::kColumn + mma_n) *
-                     Policy::MmaIterations::kRow + mma_m) * 
-                    kElementsPerMma;
-
-           CUTLASS_PRAGMA_UNROLL
-            for (int p = 0; p < kAccumulatorPatials; ++p) {
-              CUTLASS_PRAGMA_UNROLL
-              for (int m = 0; m < EleShapePerPatial::kRow; ++m) {
-                CUTLASS_PRAGMA_UNROLL
-                for (int n = 0; n < EleShapePerPatial::kColumn; ++n) {
-                  int accum_m = tile_m * Policy::InterleavedTile::kRow +
-                                mma_m * QuadShapePerPatialMma::kRow + m * 2;
-                  int accum_n = tile_n * Policy::InterleavedTile::kColumn + 
-                                mma_n * QuadShapePerPatialMma::kColumn +
-                                p * Policy::InterleavedTile::kColumn/2 + n;
-                  int idx = mma_accum_start + p * kElementsPerPartial + 
-                            m * EleShapePerPatial::kColumn + n;
-                frag[idx] = offset_ref.at({accum_m, accum_n});
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-    Fragment &frag,                             ///< fragment to load from the tensor
-    Index byte_offset) const {                  ///< loads a tile with a linear offset
-
-    load_with_pointer_offset(byte_offset / sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_HOST_DEVICE
-  void load(
-    Fragment &frag,                             ///< fragment to load from the tensor
-    TensorCoord const &tile_offset) const {     ///< loads a tile with a logical offset in units of whole tiles
-
-    load(frag, tile_offset, 0);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_HOST_DEVICE
-  void load(
-    Fragment &frag,                             ///< fragment to load from the tensor
-    TensorCoord const &tile_offset,             ///< loads a tile with a logical offset in units of whole tiles
-    Index pointer_offset) const {               ///< loads a tile with a logical offset AND a pointer offset
-
-    load_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
-  }
-
-  /// Stores a fragment to memory
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag) const {
-    store_with_pointer_offset(frag, 0);
-  }
-
-  /// Stores a fragment to memory with additional pointer offset.
-  ///
-  /// Every element of the fragment is written through a copy of the tensor
-  /// reference that has been advanced by `pointer_offset`; the element's
-  /// (row, column) coordinate is derived from its tile / MMA / partial /
-  /// element indices.
-  CUTLASS_HOST_DEVICE
-  void store_with_pointer_offset(
-    Fragment const &frag,                       ///< fragment to store from the tensor
-    Index pointer_offset) const {               ///< store a tile with a linear offset
-
-    // Work on a copy so the iterator's own reference is left untouched.
-    TensorRef dst(ref_);
-    dst.add_pointer_offset(pointer_offset);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int tn = 0; tn < Policy::TileIterations::kColumn; ++tn) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int tm = 0; tm < Policy::TileIterations::kRow; ++tm) {
-
-        // Linear index of the interleaved tile (column-major over tiles).
-        int const tile_linear = tn * Policy::TileIterations::kRow + tm;
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int in = 0; in < Policy::MmaIterations::kColumn; ++in) {
-          CUTLASS_PRAGMA_UNROLL
-          for (int im = 0; im < Policy::MmaIterations::kRow; ++im) {
-
-            // First fragment element belonging to this MMA.
-            int const mma_linear =
-                (tile_linear * Policy::MmaIterations::kColumn + in) *
-                    Policy::MmaIterations::kRow + im;
-            int const frag_base = mma_linear * kElementsPerMma;
-
-            // Coordinate contributions that do not depend on (p, m, n).
-            int const row_base = tm * Policy::InterleavedTile::kRow +
-                                 im * QuadShapePerPatialMma::kRow;
-            int const col_base = tn * Policy::InterleavedTile::kColumn +
-                                 in * QuadShapePerPatialMma::kColumn;
-
-            CUTLASS_PRAGMA_UNROLL
-            for (int p = 0; p < kAccumulatorPatials; ++p) {
-
-              // Each partial is shifted by half an interleaved tile of columns.
-              int const partial_col = p * Policy::InterleavedTile::kColumn / 2;
-              int const partial_base = frag_base + p * kElementsPerPartial;
-
-              CUTLASS_PRAGMA_UNROLL
-              for (int m = 0; m < EleShapePerPatial::kRow; ++m) {
-                CUTLASS_PRAGMA_UNROLL
-                for (int n = 0; n < EleShapePerPatial::kColumn; ++n) {
-                  int const row = row_base + m * 2;
-                  int const col = col_base + partial_col + n;
-                  int const src = partial_base +
-                                  m * EleShapePerPatial::kColumn + n;
-                  dst.at({row, col}) = frag[src];
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-
-  /// Stores a fragment to memory with an additional offset given in bytes.
-  ///
-  /// The byte offset is converted to a number of elements by integer division
-  /// by sizeof(Element) and then forwarded to store_with_pointer_offset().
-  CUTLASS_HOST_DEVICE
-  void store_with_byte_offset(
-    Fragment const &frag,                       ///< fragment to store from the tensor
-    Index byte_offset) const {                  ///< store a tile with a linear offset in units of bytes
-
-    // store_with_pointer_offset() takes (fragment, offset); the fragment has
-    // to be forwarded along with the converted offset. The divisor is cast to
-    // Index so a negative byte offset is not promoted to an unsigned value.
-    store_with_pointer_offset(
-        frag, byte_offset / static_cast<Index>(sizeof(Element)));
-  }
-
-  /// Stores a fragment to memory with logical offset in units of whole tiles.
-  ///
-  /// Forwards to the three-argument overload with a pointer offset of zero.
-  /// The fragment is only read, so it is accepted by const reference like the
-  /// sibling store overloads (non-const fragments still bind to it).
-  CUTLASS_HOST_DEVICE
-  void store(
-    Fragment const &frag,                       ///< fragment to store to the tensor
-    TensorCoord const &tile_offset) const {     ///< stores a tile with a logical offset in units of whole tiles
-
-    store(frag, tile_offset, 0);
-  }
-
-  /// Stores a fragment to memory with a logical offset in units of whole tiles
-  /// plus an additional pointer offset.
-  CUTLASS_HOST_DEVICE
-  void store(
-      /// fragment to store to the tensor
-      Fragment const &frag,
-      /// stores a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// stores a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-    // Linear offset the tensor reference computes for `tile_offset`, combined
-    // with the caller-supplied pointer offset.
-    auto const total_offset = ref_.offset(tile_offset) + pointer_offset;
-    store_with_pointer_offset(frag, total_offset);
-  }
-};
-
-/// This tile iterator is specialized for 32-thread TensorOps. It uses LDS to
-/// load from shared memory and therefore must be initialized with a TensorRef
-/// to shared memory.
-///
-/// This specialization handles the pitch-linear
-/// VoltaTensorOpMultiplicandCrosswise layout.
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: PitchLinearShape)
-    typename Shape_,
-    /// Identifies A or B multiplicand
-    Operand Operand_,
-    /// Data type of elements
-    typename Element_,
-    /// Shape of one matrix product operation (concept: PitchLinearShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions)
-    int OpDelta_,
-    /// KBlock size (in units of elements)
-    int KBlock>
-class MmaVoltaTensorOpMultiplicandTileIterator<
-    Shape_, Operand_, Element_,
-    cutlass::layout::VoltaTensorOpMultiplicandCrosswise<
-        sizeof_bits<Element_>::value, KBlock>,
-    InstructionShape_, OpDelta_, 32> {
- public:
-  /// Shape of tile to load (concept: PitchLinearShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand_;
-
-  static_assert(kOperand == Operand::kA || kOperand == Operand::kB,
-                "MmaVoltaTensorOpMultiplicandIterator may only be instantiated for "
-                "A or B operands to warp-level Mma.");
-
-  /// Element type
-  using Element = Element_;
-
-  /// KBlock size
-  static int const kKBlock = KBlock;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::VoltaTensorOpMultiplicandCrosswise<
-      sizeof_bits<Element_>::value, kKBlock>;
-
-  /// Shape of one matrix product operation (concept: PitchLinearShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept:
-  /// MatrixShape)
-  static int const kOpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Index type of the layout's stride
-  using StrideIndex = typename TensorRef::Layout::Stride::Index;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Internal structure of iterator - made public to enable introspection
-  struct Policy {
-
-    /// Shape of one individual LDS instruction
-    using LdsShape = layout::PitchLinearShape<1, 32>;
-
-    /// Number and arrangement of LDS instructions
-    using LdsIterations = layout::PitchLinearShape<1, Shape::kStrided / 32>;
-
-    /// Using LDS.128
-    static int const kElementsPerAccess = 8;
-
-    /// Contiguous elements per line
-    static int const kContiguousElementsPerLine = 4;
-  };
-
- private:
-  /// Not working on this feature at the moment.
-  static_assert(kOpDelta == 1,
-                "Alternative arrangements not supported at present.");
-
-  /// Pointer type used for accesses
-  using AccessType = AlignedArray<Element, Policy::kElementsPerAccess>;
-
- public:
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment =
-      Array<Element,
-            Shape::kStrided * InstructionShape::kContiguous / kThreads * 2>;
-
- private:
-
-  /// Layout object storing stride values
-  StrideIndex stride_;
-
-  /// Shared memory base pointers - not advanced
-  AccessType const *pointer_;
-
-  /// Byte offset incremented as iterator advances
-  Index byte_offset_;
-
-  /// Crosswised elements are arranged in a SMEM line
-  /// in units of AccessType
-  Index line_size;
-
-  /// Internal counter used to determine load addr offset
-  /// and when to swap higher 64bit with lower 64bit
-  int k_group_idx_;
-
- public:
-  /// Default ctor constructs null iterator.
-  ///
-  /// Initializers are listed in member declaration order, which is the order
-  /// in which they are actually executed.
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator()
-      : stride_(0),
-        pointer_(nullptr),
-        byte_offset_(0),
-        line_size(0),
-        k_group_idx_(0) {}
-
-  /// Constructor from TensorRef.
-  ///
-  /// `lane_id` selects the per-thread starting access: it is split into a
-  /// quad index (lane_id / 4) and a lane within the quad (lane_id % 4), from
-  /// which the swizzled access index and the initial byte offset are derived.
-  CUTLASS_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
-      : stride_(ref.stride(0) * Policy::kElementsPerAccess),
-        pointer_(reinterpret_cast<AccessType const *>(ref.data())),
-        byte_offset_(0),
-        line_size((ref.stride(0) * Policy::kContiguousElementsPerLine) /
-                  Policy::kElementsPerAccess),
-        k_group_idx_(0) {
-
-    // quad bit 0 == tid[2], quad bit 1 == tid[3], quad bit 2 == tid[4]
-    int quad = (lane_id / 4);
-    int lane_in_quad = (lane_id % 4);
-    int access_contiguous;
-
-    if(kOperand == Operand::kA) {
-
-      // swizzle id: tid[4]|tid[1:0]|(tid[2]^tid[4])
-      access_contiguous = ((quad & 0x4) << 1) + ((lane_in_quad) << 1) +
-                            ((quad & 0x1) ^ ((quad & 0x4) >> 2));
-    } else {
-
-      // swizzle id: tid[4]|tid[1:0]|(tid[3]^tid[4])
-      access_contiguous = ((quad & 0x4) << 1) + (lane_in_quad << 1) +
-                            (((quad & 0x2) >> 1) ^ ((quad & 0x4) >> 2));
-    }
-
-    byte_offset_ = access_contiguous *
-                   sizeof(Element) * Policy::kElementsPerAccess;
-  }
-
-  /// Adds a pointer offset (in units of Element) to the internal byte offset
-  CUTLASS_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
-    byte_offset_ += offset * sizeof(Element);
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole
-  /// tiles. Also resets the k-group counter to zero.
-  CUTLASS_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator &add_tile_offset(
-      TensorCoord const &tile_offset) {
-
-    int contiguous_offset = tile_offset.contiguous();
-    int strided_offset = tile_offset.strided();
-    k_group_idx_ = 0;
-
-    pointer_ += contiguous_offset *
-                    (InstructionShape::kContiguous /
-                     Policy::kContiguousElementsPerLine) *
-                    line_size +
-                strided_offset * Shape::kStrided / 2;
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension.
-  ///
-  /// The k-group counter wraps modulo 8; whenever it reaches 0 or 4 the
-  /// byte offset is toggled by the size of one access.
-  CUTLASS_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator &operator++() {
-    k_group_idx_ = (k_group_idx_ + 1) % 8;
-
-    if (k_group_idx_ == 4 || k_group_idx_ == 0) {
-      byte_offset_ ^= 1 * sizeof(Element) * Policy::kElementsPerAccess;
-    }
-
-    pointer_ += line_size;
-    return *this;
-  }
-
-  /// Moving backwards is not implemented: asserts unconditionally.
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator &operator--() {
-    assert(0);
-    // Reached only when assertions are disabled; a reference-returning
-    // function must still return a value on every path.
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of
-  ///< the tensor
-  CUTLASS_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator &operator+=(
-      TensorCoord const &tile_offset) {
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of
-  ///< the tensor
-  CUTLASS_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator &operator-=(
-      TensorCoord const &tile_offset) {
-    add_tile_offset(-tile_offset);
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const { load_with_byte_offset(frag, 0); }
-
-  /// Loads a fragment from memory with an additional offset in bytes.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset in units of bytes
-      Index byte_offset) const {
-
-    // The fragment is filled one AccessType-sized vector at a time.
-    AccessType * fetch_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < Policy::LdsIterations::kStrided; ++s) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < Policy::LdsIterations::kContiguous; ++c) {
-
-        int access_idx = c + s * Policy::LdsIterations::kContiguous;
-
-        AccessType const *source_ptr = pointer_ +
-          Policy::LdsShape::kContiguous * c * line_size +
-          Policy::LdsShape::kStrided * s / 2;
-
-        // Apply the caller's offset and the iterator's own running offset,
-        // both expressed in bytes.
-        char const *source_byte_ptr = reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;
-        fetch_ptr[access_idx] = *(reinterpret_cast<AccessType const*> (source_byte_ptr));
-
-        // swap higher 64bit and lower 64bit
-        // (each access is treated as two consecutive 64-bit halves)
-        if (k_group_idx_ &  0x2) {
-            uint64_t *low = reinterpret_cast<uint64_t *>(&frag) + access_idx * 2;
-            uint64_t *high = reinterpret_cast<uint64_t *>(&frag) + access_idx * 2 + 1;
-            uint64_t tmp = *low;
-            *low = *high;
-            *high = tmp;
-        }
-      }
-    }
-  }
-
-  /// Loads a fragment from memory with an additional offset in units of
-  /// Element
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-    load_with_byte_offset(frag, tile_offset, 0);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles
-  /// plus an additional offset in bytes.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a byte offset
-      Index byte_offset) const {
-    // Tile offset expressed in units of AccessType.
-    // NOTE(review): this formula differs from the one in add_tile_offset()
-    // (which scales by line_size) - confirm both are intended.
-    Index pointer_offset = tile_offset.contiguous() *
-                               InstructionShape::kContiguous /
-                               Policy::kElementsPerAccess +
-                           tile_offset.strided() * Shape::kStrided * stride_;
-
-    byte_offset += sizeof(AccessType) * pointer_offset;
-
-    load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    k_group_idx_ = k_group;
-  }
-};
-
-/// This tile iterator is specialized for 32-thread TensorOps. It uses LDS to
-/// load from shared memory and therefore must be initialized with a TensorRef
-/// to shared memory.
-///
-/// This specialization handles the
-/// ColumnMajorVoltaTensorOpMultiplicandCrosswise layout by wrapping the
-/// pitch-linear VoltaTensorOpMultiplicandCrosswise iterator; (row, column)
-/// coordinates are forwarded to it as (contiguous, strided).
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Identifies A or B multiplicand
-    Operand Operand_,
-    /// Data type of elements
-    typename Element_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions)
-    int OpDelta_,
-    /// KBlock size (in units of elements)
-    int KBlock>
-class MmaVoltaTensorOpMultiplicandTileIterator<
-    Shape_, Operand_, Element_,
-    cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise<
-        sizeof_bits<Element_>::value, KBlock>,
-    InstructionShape_, OpDelta_, 32> {
- public:
-  /// Shape of tile to load (concept: MatrixShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand_;
-
-  static_assert(kOperand == Operand::kA || kOperand == Operand::kB,
-                "MmaTensorOpMultiplicandIterator may only be instantiated for "
-                "A or B operands to warp-level Mma.");
-
-  /// Element type
-  using Element = Element_;
-
-  /// KBlock size
-  static int const kKBlock = KBlock;
-
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise<
-      sizeof_bits<Element_>::value, kKBlock>;
-
-  /// Shape of one matrix product operation (concept: MatrixShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept:
-  /// MatrixShape)
-  static int const kOpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Underlying tile iterator implementation: rows map to the contiguous
-  /// dimension and columns to the strided dimension.
-  using Base = MmaVoltaTensorOpMultiplicandTileIterator<
-      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, kOperand, Element,
-      layout::VoltaTensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
-                                                 kKBlock>,
-      layout::PitchLinearShape<InstructionShape::kRow,
-                               InstructionShape::kColumn>,
-      kOpDelta, kThreads>;
-
- public:
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = typename Base::Fragment;
-
- private:
-  /// Underlying tile iterator
-  Base iterator_;
-
- public:
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator() {}
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
-      : iterator_({ref.data(), ref.stride()}, lane_id) {}
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
-    iterator_.add_pointer_offset(offset);
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole
-  /// tiles
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator &add_tile_offset(
-      TensorCoord const &tile_offset) {
-    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator &operator++() {
-    ++iterator_;
-
-    return *this;
-  }
-
-  /// Moves the iterator backwards along the advance dimension by delegating
-  /// to the underlying iterator's operator--.
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator &operator--() {
-    --iterator_;
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of
-  ///< the tensor
-  CUTLASS_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator &operator+=(
-      TensorCoord const &tile_offset) {
-    // add_tile_offset() already takes a TensorCoord and performs the mapping
-    // onto the underlying iterator, so the offset is forwarded unchanged.
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of
-  ///< the tensor
-  CUTLASS_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator &operator-=(
-      TensorCoord const &tile_offset) {
-    // Same as operator+= with both components negated.
-    add_tile_offset(TensorCoord(-tile_offset.row(), -tile_offset.column()));
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const { iterator_.load(frag); }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory with an additional offset in bytes
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index byte_offset) const {
-    iterator_.load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  /// Not implemented: asserts unconditionally and leaves `frag` untouched.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-    assert(0);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  /// Not implemented: asserts unconditionally and leaves `frag` untouched.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-    assert(0);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index byte_offset) const {
-    // NOTE(review): this reads contiguous()/strided() from the TensorCoord
-    // while add_tile_offset() reads row()/column() from the same type -
-    // confirm the coordinate type provides both accessors.
-    iterator_.load_with_byte_offset(
-        frag, {tile_offset.contiguous(), tile_offset.strided()}, byte_offset);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    iterator_.set_kgroup_index(k_group);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// This tile iterator is specialized for 32-thread TensorOps. It uses LDS to
-/// load from shared memory and therefore must be initialized with a TensorRef
-/// to shared memory.
-///
-/// This specialization handles the
-/// RowMajorVoltaTensorOpMultiplicandCrosswise layout by wrapping the
-/// pitch-linear VoltaTensorOpMultiplicandCrosswise iterator; (row, column)
-/// coordinates are forwarded to it swapped, as (column, row).
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Identifies A or B multiplicand
-    Operand Operand_,
-    /// Data type of elements
-    typename Element_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions)
-    int OpDelta_,
-    /// KBlock size (in units of elements)
-    int KBlock>
-class MmaVoltaTensorOpMultiplicandTileIterator<
-    Shape_, Operand_, Element_,
-    cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise<
-        sizeof_bits<Element_>::value, KBlock>,
-    InstructionShape_, OpDelta_, 32> {
- public:
-  /// Shape of tile to load (concept: MatrixShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand_;
-
-  static_assert(kOperand == Operand::kA || kOperand == Operand::kB,
-                "MmaTensorOpMultiplicandIterator may only be instantiated for "
-                "A or B operands to warp-level Mma.");
-
-  /// Element type
-  using Element = Element_;
-
-  /// KBlock size
-  static int const kKBlock = KBlock;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise<
-      sizeof_bits<Element_>::value, kKBlock>;
-
-  /// Shape of one matrix product operation (concept: MatrixShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept:
-  /// MatrixShape)
-  static int const kOpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Underlying tile iterator implementation: columns map to the contiguous
-  /// dimension and rows to the strided dimension.
-  using Base = MmaVoltaTensorOpMultiplicandTileIterator<
-      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, kOperand, Element,
-      layout::VoltaTensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
-                                                 kKBlock>,
-      layout::PitchLinearShape<InstructionShape::kColumn,
-                               InstructionShape::kRow>,
-      kOpDelta, kThreads>;
-
- public:
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = typename Base::Fragment;
-
- private:
-  /// Underlying tile iterator
-  Base iterator_;
-
- public:
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator() {}
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
-      : iterator_({ref.data(), ref.stride()}, lane_id) {}
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
-    iterator_.add_pointer_offset(offset);
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole
-  /// tiles. The (row, column) offset is handed to the underlying iterator as
-  /// (column, row).
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator &add_tile_offset(
-      TensorCoord const &tile_offset) {
-    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator &operator++() {
-    ++iterator_;
-
-    return *this;
-  }
-
-  /// Moves the iterator backwards along the advance dimension by delegating
-  /// to the underlying iterator's operator--.
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator &operator--() {
-    --iterator_;
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of
-  ///< the tensor
-  CUTLASS_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator &operator+=(
-      TensorCoord const &tile_offset) {
-    // add_tile_offset() performs the row/column swap itself; swapping here as
-    // well would undo it, so the offset is forwarded unchanged.
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of
-  ///< the tensor
-  CUTLASS_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator &operator-=(
-      TensorCoord const &tile_offset) {
-    // Same as operator+= with both components negated.
-    add_tile_offset(TensorCoord(-tile_offset.row(), -tile_offset.column()));
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const { iterator_.load(frag); }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory with an additional offset in bytes
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index byte_offset) const {
-    iterator_.load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  /// Not implemented: asserts unconditionally and leaves `frag` untouched.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-    assert(0);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  /// Not implemented: asserts unconditionally and leaves `frag` untouched.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-    assert(0);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index byte_offset) const {
-    // NOTE(review): this reads strided()/contiguous() from the TensorCoord
-    // while add_tile_offset() reads column()/row() from the same type -
-    // confirm the coordinate type provides both accessors.
-    iterator_.load_with_byte_offset(
-        frag, {tile_offset.strided(), tile_offset.contiguous()}, byte_offset);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    iterator_.set_kgroup_index(k_group);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Tile iterator specialized for 'TN' arrangement
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Operand identity
-    Operand Operand_,
-    /// Data type of A elements
-    typename Element_,
-    /// Layout of matrix operand
-    typename Layout_,
-    /// Shape of one matrix production operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Delta between *MMA operations (in units of *MMA operations, concept:
-    /// MatrixShape)
-    int OpDelta_,
-    /// Number of threads participating in one matrix operation
-    int Threads = 32,
-    /// Number of partitions along K dimension
-    int PartitionsK_ = 1>
-class MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner {
- public:
-
-  /// Shape of tile to load (concept: MatrixShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand_;
-
-  /// Basic check
-  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
-    "MmaVoltaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of source tile
-  using Layout = Layout_;
-
-  /// Shape of one matrix product operation (concept: MatrixShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
-  static int const kOpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Number of elements accessed per Shared Memory load
-  static int const kElementsPerAccess = 4;
-
-private:
-
-  static int const kInterleavedTileRows = 32;
-  static int const kInterleavedTileColumns = 32;
-  static int const kInstructionsPerTile = 2;
-  
-  /// Rounded up instruction counts
-  using TileCount = MatrixShape<
-    Shape::kRow / kInterleavedTileRows,
-    Shape::kColumn / kInterleavedTileColumns
-  >;
-
-  using FragmentCount = MatrixShape<
-    TileCount::kRow * kInstructionsPerTile,
-    TileCount::kColumn * kInstructionsPerTile
-  >;
-
-public:
-
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = Array<
-    Element, 
-    (kOperand == Operand::kA ? FragmentCount::kRow : FragmentCount::kColumn) * kElementsPerAccess
-  >;
-
-  /// Memory access type
-  using AccessType = AlignedArray<Element, kElementsPerAccess>;
-
-private:
-
-  /// Underlying tensor reference
-  TensorRef ref_;
-
-  /// Extent of tensor
-  MatrixCoord extent_;
-
-  /// Origin
-  MatrixCoord origin_;
-
-  /// Used to conditionally enable extents checking
-  bool divisible_;
-
-public:
-  
-  /// Default constructor: builds a null iterator with extent checking
-  /// disabled (divisible_ is set to true).
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner()
-      : divisible_(true) {}
-
-  /// Constructs the iterator from a tensor reference and the calling thread's
-  /// lane index. The extent is taken from Shape and divisible_ is set to true.
-  ///
-  /// The lane index determines the thread's origin inside the tile: operand A
-  /// threads are offset along rows, operand B threads along columns. The
-  /// tensor reference is advanced to that origin.
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner(
-    TensorRef const &ref,
-    int lane_id
-  ):
-    ref_(ref), extent_(Shape::kRow, Shape::kColumn), divisible_(true) {
-
-    int const quad = lane_id / 4;
-    int const lane_within_quad = lane_id % 4;
-
-    if (kOperand == Operand::kA) {
-      // Quad bits 0 and 2 select the row group.
-      int const row_group = (quad & 1) + ((quad & 4) / 2);
-      origin_ = MatrixCoord(
-        row_group * 4 * kInstructionsPerTile + lane_within_quad, 0);
-    }
-    else {
-      // The upper quad bits (quad / 2) select the column group.
-      int const column_group = quad / 2;
-      origin_ = MatrixCoord(
-        0, column_group * 4 * kInstructionsPerTile + lane_within_quad);
-    }
-
-    ref_.add_coord_offset(origin_);
-  }
-  
-  /// Constructs the iterator from a tensor reference, an explicit extent and
-  /// the calling thread's lane index. divisible_ is set to false.
-  ///
-  /// The per-thread origin is computed exactly as in the two-argument
-  /// constructor. When compiled for the device, a __syncthreads() is issued
-  /// before the tensor reference is advanced to the origin.
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner(
-    TensorRef const &ref,
-    TensorCoord extent,
-    int lane_id
-  ): ref_(ref), extent_(extent), divisible_(false) {
-
-    int const quad = lane_id / 4;
-    int const lane_within_quad = lane_id % 4;
-
-    if (kOperand == Operand::kA) {
-      // Quad bits 0 and 2 select the row group.
-      int const row_group = (quad & 1) + ((quad & 4) / 2);
-      origin_ = MatrixCoord(
-        row_group * 4 * kInstructionsPerTile + lane_within_quad, 0);
-    }
-    else {
-      // The upper quad bits (quad / 2) select the column group.
-      int const column_group = quad / 2;
-      origin_ = MatrixCoord(
-        0, column_group * 4 * kInstructionsPerTile + lane_within_quad);
-    }
-
-    #if defined(__CUDA_ARCH__)
-    __syncthreads();
-    #endif
-
-    ref_.add_coord_offset(origin_);
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner &add_pointer_offset(LongIndex offset) {
-
-    ref_.add_pointer_offset(offset);
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner &add_tile_offset(TensorCoord const &tile_offset) {
-
-    TensorCoord coord_offset(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn);
-    origin_ += coord_offset;
-
-    ref_.add_coord_offset(coord_offset);
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner & operator++() {
-
-    if (kOperand == Operand::kA) {
-      add_tile_offset({0, 1});
-    }
-    else {
-      add_tile_offset({1, 0});
-    }    
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner & operator--() {
-    
-    if (kOperand == Operand::kA) {
-      add_tile_offset({0, -1});
-    }
-    else {
-      add_tile_offset({-1, 0});
-    }    
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner & operator-=(TensorCoord const &tile_offset) {
-    add_tile_offset(-tile_offset);
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-
-    load_with_pointer_offset(frag, 0);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-    AccessType const *access_ptr = reinterpret_cast<AccessType const *>(ref_.data());
-    int ldm = ref_.stride()[0];
-
-    if (kOperand == Operand::kA) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int idx = 0; idx < FragmentCount::kRow; ++idx) {
-        
-        int tile_idx = idx / 2;
-        int quad_idx = idx % 2;
-
-        int row_offset = tile_idx * kInterleavedTileRows + quad_idx * 4;
-        frag_ptr[idx] = access_ptr[row_offset * ldm / kElementsPerAccess];
-      } 
-    }
-    else {
-      CUTLASS_PRAGMA_UNROLL
-      for (int idx = 0; idx < FragmentCount::kColumn; ++idx) {
-
-        int tile_idx = idx / 2;
-        int quad_idx = idx % 2;
-
-        int col_offset = tile_idx * kInterleavedTileColumns + quad_idx * 4;
-        frag_ptr[idx] = access_ptr[col_offset * ldm / kElementsPerAccess];
-      } 
-    }
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index byte_offset) const {
-
-    load_with_pointer_offset(frag, byte_offset * 8 / sizeof_bits<Element>::value);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-    
-    TensorCoord coord_offset(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn);
-  
-    load_with_pointer_offset(frag, ref_.offset(coord_offset));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-
-    TensorCoord coord_offset(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn);
-  
-    load_with_pointer_offset(frag, ref_.offset(coord_offset) + pointer_offset);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index byte_offset) const {
-
-    TensorCoord coord_offset(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn);
-  
-    load_with_pointer_offset(frag, ref_.offset(coord_offset) + byte_offset * 8 / sizeof_bits<Element>::value);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    // no operation
-  }
-};
-
-
-/// Tile iterator specialized for 'NT' arrangement
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Operand identity
-    Operand Operand_,
-    /// Data type of A elements
-    typename Element_,
-    /// Layout of matrix operand
-    typename Layout_,
-    /// Shape of one matrix production operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Delta between *MMA operations (in units of *MMA operations, concept:
-    /// MatrixShape)
-    int OpDelta_,
-    /// Number of threads participating in one matrix operation
-    int Threads = 32,
-    /// Number of partitions along K dimension
-    int PartitionsK_ = 1>
-class MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter {
- public:
-
-  /// Shape of tile to load (concept: MatrixShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand_;
-
-  /// Basic check
-  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
-    "MmaVoltaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of source tile
-  using Layout = Layout_;
-
-  /// Shape of one matrix product operation (concept: MatrixShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
-  static int const kOpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Number of elements accessed per Shared Memory load
-  static int const kElementsPerAccess = 4;
-
-private:
-
-  static int const kInterleavedTileRows = 32;
-  static int const kInterleavedTileColumns = 32;
-  static int const kInstructionsPerTile = 2;
-  
-  /// Rounded up instruction counts
-  using TileCount = MatrixShape<
-    Shape::kRow / kInterleavedTileRows,
-    Shape::kColumn / kInterleavedTileColumns
-  >;
-
-  using FragmentCount = MatrixShape<
-    TileCount::kRow * kInstructionsPerTile,
-    TileCount::kColumn * kInstructionsPerTile
-  >;
-
-public:
-
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = Array<
-    Element, 
-    (kOperand == Operand::kA ? FragmentCount::kRow : FragmentCount::kColumn) * kElementsPerAccess
-  >;
-
-  /// Memory access type
-  using AccessType = AlignedArray<Element, kElementsPerAccess>;
-
-private:
-
-  /// Underlying tensor reference
-  TensorRef ref_;
-
-  /// Extent of tensor
-  MatrixCoord extent_;
-
-  /// Origin
-  MatrixCoord origin_;
-
-  /// Used to conditionally enable extents checking
-  bool divisible_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter(): divisible_(true) { }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter(
-    TensorRef const &ref, 
-    int lane_id
-  ): 
-    ref_(ref), extent_(Shape::kRow, Shape::kColumn), divisible_(true) {
-
-    int quad_id = lane_id / 4;
-    int lane_in_quad = (lane_id % 4);
-  
-    if (kOperand == Operand::kA) {
-      
-      int row_idx = ((quad_id & 1) + ((quad_id & 4) / 2)) * 4 * kInstructionsPerTile;
-      int col_idx = lane_in_quad;
-
-      origin_ = MatrixCoord(row_idx, col_idx);
-    }
-    else {
-
-      int row_idx = lane_in_quad;
-      int col_idx = (quad_id / 2) * 4 * kInstructionsPerTile;
-
-      origin_ = MatrixCoord(row_idx, col_idx); 
-    }
-
-    ref_.add_coord_offset(origin_);
-  }
-  
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter(
-    TensorRef const &ref, 
-    TensorCoord extent,
-    int lane_id
-  ): ref_(ref), extent_(extent), divisible_(false) {
-  
-    int quad_id = lane_id / 4;
-    int lane_in_quad = (lane_id % 4);
-  
-    if (kOperand == Operand::kA) {
-      
-      int row_idx = ((quad_id & 1) + ((quad_id & 4) / 2)) * 4 * kInstructionsPerTile;
-      int col_idx = lane_in_quad;
-
-      origin_ = MatrixCoord(row_idx, col_idx);
-    }
-    else {
-
-      int row_idx = lane_in_quad;
-      int col_idx = (quad_id / 2) * 4 * kInstructionsPerTile;
-
-      origin_ = MatrixCoord(row_idx, col_idx); 
-    }
-
-    #if defined(__CUDA_ARCH__)
-    __syncthreads();
-    #endif
-
-    ref_.add_coord_offset(origin_);
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter &add_pointer_offset(LongIndex offset) {
-
-    ref_.add_pointer_offset(offset);
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter &add_tile_offset(TensorCoord const &tile_offset) {
-
-    TensorCoord coord_offset(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn);
-    origin_ += coord_offset;
-
-    ref_.add_coord_offset(coord_offset);
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter & operator++() {
-
-    if (kOperand == Operand::kA) {
-      add_tile_offset({0, 1});
-    }
-    else {
-      add_tile_offset({1, 0});
-    }    
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter & operator--() {
-    
-    if (kOperand == Operand::kA) {
-      add_tile_offset({0, -1});
-    }
-    else {
-      add_tile_offset({-1, 0});
-    }    
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter & operator-=(TensorCoord const &tile_offset) {
-    add_tile_offset(-tile_offset);
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-
-    load_with_pointer_offset(frag, 0);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-    AccessType const *access_ptr = reinterpret_cast<AccessType const *>(ref_.data());
-    int ldm = ref_.stride()[0];
-
-    if (kOperand == Operand::kA) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int idx = 0; idx < FragmentCount::kRow; ++idx) {
-        
-        int tile_idx = idx / 2;
-        int quad_idx = idx % 2;
-
-        int row_offset = tile_idx * kInterleavedTileRows;
-        frag_ptr[idx] = access_ptr[row_offset / kElementsPerAccess + quad_idx];
-      }
-    }
-    else {
-      CUTLASS_PRAGMA_UNROLL
-      for (int idx = 0; idx < FragmentCount::kColumn; ++idx) {
-
-        int tile_idx = idx / 2;
-        int quad_idx = idx % 2;
-
-        int col_offset = tile_idx * kInterleavedTileColumns;
-        frag_ptr[idx] = access_ptr[col_offset / kElementsPerAccess + quad_idx];
-      } 
-    }
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index byte_offset) const {
-
-    load_with_pointer_offset(frag, byte_offset * 8 / sizeof_bits<Element>::value);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-    
-    TensorCoord coord_offset(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn);
-  
-    load_with_pointer_offset(frag, ref_.offset(coord_offset));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-
-    TensorCoord coord_offset(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn);
-  
-    load_with_pointer_offset(frag, ref_.offset(coord_offset) + pointer_offset);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index byte_offset) const {
-
-    TensorCoord coord_offset(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn);
-  
-    load_with_pointer_offset(frag, ref_.offset(coord_offset) + byte_offset * 8 / sizeof_bits<Element>::value);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    // no operation
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Data type of elements
-    typename Element_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions)
-    int OpDelta_>
-class MmaVoltaTensorOpMultiplicandTileIterator<
-  Shape_, 
-  Operand::kA, 
-  Element_,
-  cutlass::layout::RowMajor,
-  InstructionShape_, 
-  OpDelta_,
-  32
-> : public MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner<
-  Shape_, Operand::kA, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_> {
-
-public:
-  using Base = MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner<
-  Shape_, Operand::kA, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_> ;
-
-  using TensorRef = typename Base::TensorRef;
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator(
-    TensorRef const &ref, 
-    int lane_id
-  ): Base(ref, lane_id) { }
-
-};
-
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Data type of elements
-    typename Element_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions)
-    int OpDelta_>
-class MmaVoltaTensorOpMultiplicandTileIterator<
-  Shape_, 
-  Operand::kA, 
-  Element_,
-  cutlass::layout::ColumnMajor,
-  InstructionShape_, 
-  OpDelta_,
-  32
-> : public MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter<
-  Shape_, Operand::kA, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_> {
-
-public:
-  using Base = MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter<
-  Shape_, Operand::kA, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_> ;
-
-  using TensorRef = typename Base::TensorRef;
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator(
-    TensorRef const &ref, 
-    int lane_id
-  ): Base(ref, lane_id) { }
-
-};
-
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Data type of elements
-    typename Element_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions)
-    int OpDelta_>
-class MmaVoltaTensorOpMultiplicandTileIterator<
-    Shape_, Operand::kB, Element_,
-    cutlass::layout::ColumnMajor,
-    InstructionShape_, OpDelta_, 32
-> : public MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner<
-  Shape_, Operand::kB, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_> {
-
-public:
-  using Base = MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner<
-  Shape_, Operand::kB, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_>;
-
-  using TensorRef = typename Base::TensorRef;
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator(
-    TensorRef const &ref, 
-    int lane_id
-  ): Base(ref, lane_id) { }
-};
-
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Data type of elements
-    typename Element_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions)
-    int OpDelta_>
-class MmaVoltaTensorOpMultiplicandTileIterator<
-    Shape_, Operand::kB, Element_,
-    cutlass::layout::RowMajor,
-    InstructionShape_, OpDelta_, 32
-> : public MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter<
-  Shape_, Operand::kB, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_> {
-
-public:
-  using Base = MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter<
-  Shape_, Operand::kB, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_>;
-
-  using TensorRef = typename Base::TensorRef;
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaVoltaTensorOpMultiplicandTileIterator(
-    TensorRef const &ref, 
-    int lane_id
-  ): Base(ref, lane_id) { }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h
deleted file mode 100755
index 4ccf0b580..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h
+++ /dev/null
@@ -1,2441 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines iterators used by warp-level matrix multiply operations targeting Tensor Cores.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/array.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/matrix_shape.h"
-
-#include "cutlass/arch/memory_sm75.h"
-#include "cutlass/gemm/gemm.h"
-
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor_op_multiplicand_sm80.h"
-
-#include "cutlass/platform/platform.h"
-#include "cutlass/fast_math.h"
-
-#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace warp {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// This tile iterator is specialized for loading 128b vectors of 64b elements.
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: PitchLinearShape)
-    typename Shape_,
-    /// Identifies A or B multiplicand
-    Operand Operand_,
-    /// Data type of elements
-    typename Element_,
-    /// Shape of one matrix product operation (concept: PitchLinearShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions)
-    int OpDelta_,
-    /// Number of partitions along K dimension
-    int PartitionsK_>
-class MmaTensorOpMultiplicandTileIterator<
-    Shape_, Operand_, Element_,
-    cutlass::layout::TensorOpMultiplicandCongruous64b,
-    InstructionShape_, OpDelta_, 32, PartitionsK_> {
- public:
-
-  /// Shape of tile to load (concept: PitchLinearShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand_;
-
-  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
-    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
-
-  static_assert(!(Shape::kContiguous % 16) && !(Shape::kStrided % 4), "Divisibility.");
-
-  static_assert(sizeof_bits<Element_>::value == 64, "This is specialized for 64b accesses.");
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::TensorOpMultiplicandCongruous64b;
-
-  /// Shape of one matrix product operation (concept: GemmShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
-  static int const kOpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// Number of partitions along K dimension
-  static int const kPartitionsK = PartitionsK_;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Long Index type
-  using StrideIndex = typename TensorRef::Layout::Stride::Index;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Load two elements per access
-  static int const kElementsPerAccess = 2;
-
-  /// Policy defining internal details of tile iterator
-  struct Policy {
-
-    /// Shape of one access
-    using Delta = layout::PitchLinearShape<8, 4>;
-
-    /// Number of iterations to load
-    using Iterations = layout::PitchLinearShape<
-      Shape::kContiguous / kElementsPerAccess / Delta::kContiguous,
-      InstructionShape::kStrided / Delta::kStrided
-    >;
-
-  };
-
-private:
-
-  /// Not working on this feature at the moment.
-  static_assert(kOpDelta == 1,
-    "Alternative arrangements not supported at present.");
-
-  /// Pointer type used for accesses
-  using AccessType = AlignedArray<Element, kElementsPerAccess, 16>;
-
-  /// Internal counter used to jump to next K partition
-  int k_group_idx_;
-
-public:
-
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
- using Fragment =
-     Array<Element, Shape::kContiguous * InstructionShape::kStrided / kThreads>;
-
-private:
-
-  /// Layout object storing stride values
-  StrideIndex stride_;
-
-  /// Shared memory base pointers - not advanced
-  AccessType const *pointer_;
-
-  /// Byte offset incremented as iterator advances
-  Index byte_offset_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator(): stride_(0), byte_offset_(0) { }
-
-  /// Constructor from TensorRef
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator(
-    TensorRef const &ref, 
-    int lane_id
-  ):
-    stride_(ref.stride(0) / kElementsPerAccess), byte_offset_(0),
-    k_group_idx_(0) {
-
-    int access_strided = lane_id / Policy::Delta::kContiguous;
-    int access_contiguous = (lane_id  % Policy::Delta::kContiguous) ^ access_strided;
-
-    pointer_= reinterpret_cast<AccessType const *>(ref.data()) +
-      access_contiguous + access_strided * stride_;
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
-
-    byte_offset_ += offset * sizeof(Element);
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
-
-    int offset = 
-      (tile_offset.strided() * InstructionShape::kStrided) * stride_ * kElementsPerAccess + 
-      tile_offset.contiguous() * Shape::kContiguous;
-
-    add_pointer_offset(offset);
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator++() {
-
-    add_tile_offset({0, 1});
-
-    return *this;
-  }
-
-  /// Advances the iterator along the opposite of the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator--() {
-    
-    add_tile_offset({0, -1});
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
-    add_tile_offset(-tile_offset);
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-
-    load_with_byte_offset(frag, 0);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset in units of bytes
-      Index byte_offset) const {
-
-    AccessType *fetch_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < Policy::Iterations::kStrided; ++s) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < Policy::Iterations::kContiguous; ++c) {
-
-        int access_idx = c + s * Policy::Iterations::kContiguous;
-
-        AccessType const *source_ptr = pointer_ +
-            Policy::Delta::kContiguous * c +
-            Policy::Delta::kStrided * s * stride_;
-
-        char const *source_byte_ptr = reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;
-
-        AccessType const *source = reinterpret_cast<AccessType const *>(source_byte_ptr);
-
-        fetch_ptr[access_idx] = *source;
-      }
-    }
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-
-    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-
-    load_with_byte_offset(frag, tile_offset, 0);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-
-    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index byte_offset) const {
-
-    Index pointer_offset = 
-      tile_offset.contiguous() * Shape::kContiguous / Layout::kElementsPerAccess + 
-      tile_offset.strided() * InstructionShape::kStrided * stride_;
-
-    byte_offset += sizeof(AccessType) * pointer_offset;
-
-    load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Identifies A or B multiplicand
-    Operand Operand_,
-    /// Data type of elements
-    typename Element_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions)
-    int OpDelta_,
-    /// Number of partitions along K dimension
-    int PartitionsK_>
-class MmaTensorOpMultiplicandTileIterator<
-    Shape_, Operand_, Element_,
-    cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b,
-    InstructionShape_, OpDelta_, 32, PartitionsK_> {
- public:
-
-  /// Shape of tile to load (concept: PitchLinearShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand_;
-
-  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
-    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b;
-
-  /// Shape of one matrix product operation (concept: MatrixShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
-  static int const kOpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Underlying tile iterator implementation
-  using Base = MmaTensorOpMultiplicandTileIterator<
-      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, kOperand, Element,
-      layout::TensorOpMultiplicandCongruous64b,
-      layout::PitchLinearShape<InstructionShape::kColumn,
-                               InstructionShape::kRow>,
-      kOpDelta, kThreads, PartitionsK_>;
-
- public:
-
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = typename Base::Fragment;
-
-private:
-
-  /// Underlying tile iterator
-  Base iterator_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator() { }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator(
-    TensorRef const &ref, 
-    int lane_id
-  ): iterator_({ref.data(), ref.stride()}, lane_id) {
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
-
-    iterator_.add_pointer_offset(offset);
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
-
-    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator++() {
-
-    ++iterator_;
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator--() {
-
-    --iterator_;
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(PitchLinearCoord(tile_offset.column(), tile_offset.row()));
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
-    add_tile_offset(-PitchLinearCoord(tile_offset.column(), tile_offset.row()));
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-
-    iterator_.load(frag);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index byte_offset) const {
-    iterator_.load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index byte_offset) const {
-    iterator_.load_with_byte_offset(
-      frag,
-      {tile_offset.strided(), tile_offset.contiguous()},
-      byte_offset);
-  }
-
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    iterator_.set_kgroup_index(k_group);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// This tile iterator is specialized for 32-thread TensorOps. It uses LDSM to load from shared
-/// memory and therefore must be initialized with a TensorRef to shared memory. 
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Identifies A or B multiplicand
-    Operand Operand_,
-    /// Data type of elements
-    typename Element_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions)
-    int OpDelta_,
-    /// Number of partitions along K dimension
-    int PartitionsK_>
-class MmaTensorOpMultiplicandTileIterator<
-    Shape_, Operand_, Element_,
-    cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b,
-    InstructionShape_, OpDelta_, 32, PartitionsK_> {
- public:
-
-  /// Shape of tile to load (concept: PitchLinearShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand_;
-
-  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
-    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b;
-
-  /// Shape of one matrix product operation (concept: MatrixShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
-  static int const kOpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Underlying tile iterator implementation
-  using Base = MmaTensorOpMultiplicandTileIterator<
-      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, kOperand, Element,
-      layout::TensorOpMultiplicandCongruous64b,
-      layout::PitchLinearShape<InstructionShape::kRow,
-                               InstructionShape::kColumn>,
-      kOpDelta, kThreads, PartitionsK_>;
-
- public:
-
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = typename Base::Fragment;
-
-private:
-
-  /// Underlying tile iterator
-  Base iterator_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator() { }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator(
-    TensorRef const &ref, 
-    int lane_id
-  ): iterator_({ref.data(), ref.stride()}, lane_id) {
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
-
-    iterator_.add_pointer_offset(offset);
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
-
-    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator++() {
-
-    ++iterator_;
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator--() {
-
-    --iterator_;
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(PitchLinearCoord(tile_offset.row(), tile_offset.column()));
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
-    add_tile_offset(-PitchLinearCoord(tile_offset.row(), tile_offset.column()));
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-
-    iterator_.load(frag);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index byte_offset) const {
-    iterator_.load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index byte_offset) const {
-    iterator_.load_with_byte_offset(
-      frag,
-      {tile_offset.contiguous(), tile_offset.strided()},
-      byte_offset);
-  }
-
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    iterator_.set_kgroup_index(k_group);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////
-
-/// This tile iterator is specialized for loading 128b vectors of 64b elements.
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: PitchLinearShape)
-    typename Shape_,
-    /// Identifies A or B multiplicand
-    Operand Operand_,
-    /// Data type of elements
-    typename Element_,
-    /// Shape of one matrix product operation (concept: PitchLinearShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions)
-    int OpDelta_,
-    /// Number of partitions along K dimension
-    int PartitionsK_>
-class MmaTensorOpMultiplicandTileIterator<
-    Shape_, Operand_, Element_,
-    cutlass::layout::TensorOpMultiplicand64bCrosswise,
-    InstructionShape_, OpDelta_, 32, PartitionsK_> {
- public:
-
-  /// Shape of tile to load (concept: PitchLinearShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand_;
-
-  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
-    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
-
-  static_assert(!(Shape::kContiguous % 4) && !(Shape::kStrided % 16), "Divisibility.");
-
-  static_assert(sizeof_bits<Element_>::value == 64, "This is specialized for 64b accesses.");
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::TensorOpMultiplicand64bCrosswise;
-
-  /// Shape of one matrix product operation (concept: GemmShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
-  static int const kOpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// Number of partitions along K dimension
-  static int const kPartitionsK = PartitionsK_;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Long Index type
-  using StrideIndex = typename TensorRef::Layout::Stride::Index;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Load two elements per access
-  static int const kElementsPerAccess = 2;
-
-  /// Policy defining internal details of tile iterator
-  struct Policy {
-
-    /// Shape of one access
-    using Delta = layout::PitchLinearShape<4, 16>;
-
-    /// Number of iterations to load
-    using Iterations = layout::PitchLinearShape<
-      InstructionShape::kContiguous / Delta::kContiguous,
-      Shape::kStrided / Delta::kStrided
-    >;
-
-  };
-
-private:
-
-  /// Not working on this feature at the moment.
-  static_assert(kOpDelta == 1,
-    "Alternative arrangements not supported at present.");
-
-  /// Pointer type used for accesses
-  using AccessType = AlignedArray<Element, kElementsPerAccess, 16>;
-
-public:
-
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
- using Fragment =
-     Array<Element, Shape::kStrided * InstructionShape::kContiguous / kThreads>;
-
-private:
-
-  /// Layout object storing stride values
-  StrideIndex stride_;
-
-  /// Shared memory base pointers - not advanced
-  AccessType const *pointer_;
-
-  /// Byte offset incremented as iterator advances
-  Index byte_offset_;
-
-  /// Internal counter for tracking K-group
-  Index k_group_idx_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator(): stride_(0), byte_offset_(0) { }
-
-  /// Constructor from TensorRef
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator(
-    TensorRef const &ref, 
-    int lane_id
-  ):
-    stride_(ref.stride(0) / kElementsPerAccess), byte_offset_(0),
-    k_group_idx_(0) {
-
-    int access_strided = lane_id / 8;
-    int access_contiguous = (lane_id  % 8);
-
-    byte_offset_ = (access_contiguous + access_strided * stride_) * sizeof(AccessType);
-
-    pointer_= reinterpret_cast<AccessType const *>(ref.data());
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
-
-    pointer_ += offset / kElementsPerAccess;
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
-    int offset = (tile_offset.contiguous() * InstructionShape::kContiguous) *
-                     stride_ * kElementsPerAccess +
-                 tile_offset.strided() * Shape::kStrided;
-
-    add_pointer_offset(offset);
-    
-    int old_k_group_idx = k_group_idx_;
-
-    k_group_idx_ += tile_offset.contiguous();
-
-    if ((k_group_idx_ & 2) ^ (old_k_group_idx & 2)) {
-      byte_offset_ ^= 0x40;
-    }
-
-    return *this;
-  }
-
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_tile_offset_negative(TensorCoord const &tile_offset) {
-
-    // TODO: fix this if it becomes an issue during warp it reset
-    add_tile_offset(tile_offset);
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator++() {
-
-    pointer_ += stride_ * InstructionShape::kContiguous;
-
-    if (k_group_idx_ & 0x1) {
-      // xor ptr
-      byte_offset_ ^= 0x40;
-    }
-
-    ++k_group_idx_;
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-
-    load_with_byte_offset(frag, 0);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset in units of bytes
-      Index byte_offset) const {
-
-    AccessType *fetch_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int c = 0; c < Policy::Iterations::kContiguous; ++c) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int s = 0; s < Policy::Iterations::kStrided; ++s) {
-
-        int access_idx = c + s * Policy::Iterations::kContiguous;
-
-        AccessType const *source_ptr = pointer_ +
-            Policy::Delta::kContiguous * c * stride_ +
-            Policy::Delta::kStrided * s / kElementsPerAccess;
-
-        char const *source_byte_ptr = reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;
-
-        AccessType const *source = reinterpret_cast<AccessType const *>(source_byte_ptr);
-
-        fetch_ptr[access_idx] = *source;
-      }
-    }
-
-    Element *exchange_ptr = reinterpret_cast<Element *>(&frag);
-
-    if (k_group_idx_ & 1) {
-      // exchange on 64b granularity
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < Fragment::kElements; i += 2) {
-        Element tmp = exchange_ptr[i];
-        exchange_ptr[i] = exchange_ptr[i + 1];
-        exchange_ptr[i + 1] = tmp;
-      }
-    }
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-
-    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-
-    load_with_byte_offset(frag, tile_offset, 0);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-
-    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index byte_offset) const {
-    Index pointer_offset = tile_offset.contiguous() *
-                               InstructionShape::kContiguous /
-                               Layout::kElementsPerAccess +
-                           tile_offset.strided() * Shape::kStrided * stride_;
-
-    byte_offset += sizeof(AccessType) * pointer_offset;
-
-    load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    k_group_idx_ = k_group;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Identifies A or B multiplicand
-    Operand Operand_,
-    /// Data type of elements
-    typename Element_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions)
-    int OpDelta_,
-    /// Number of partitions along K dimension
-    int PartitionsK_>
-class MmaTensorOpMultiplicandTileIterator<
-    Shape_, Operand_, Element_,
-    cutlass::layout::RowMajorTensorOpMultiplicand64bCrosswise,
-    InstructionShape_, OpDelta_, 32, PartitionsK_> {
- public:
-
-  /// Shape of tile to load (concept: PitchLinearShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand_;
-
-  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
-    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::RowMajorTensorOpMultiplicand64bCrosswise;
-
-  /// Shape of one matrix product operation (concept: MatrixShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
-  static int const kOpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Underlying tile iterator implementation
-  using Base = MmaTensorOpMultiplicandTileIterator<
-      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, kOperand, Element,
-      layout::TensorOpMultiplicand64bCrosswise,
-      layout::PitchLinearShape<InstructionShape::kColumn,
-                               InstructionShape::kRow>,
-      kOpDelta, kThreads, PartitionsK_>;
-
- public:
-
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = typename Base::Fragment;
-
-private:
-
-  /// Underlying tile iterator
-  Base iterator_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator() { }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator(
-    TensorRef const &ref, 
-    int lane_id
-  ): iterator_({ref.data(), ref.stride()}, lane_id) {
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
-
-    iterator_.add_pointer_offset(offset);
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
-
-    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_tile_offset_negative(TensorCoord const &tile_offset) {
-
-    iterator_.add_tile_offset_negative({tile_offset.column(), tile_offset.row()});
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator++() {
-
-    ++iterator_;
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator--() {
-
-    --iterator_;
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(PitchLinearCoord(tile_offset.column(), tile_offset.row()));
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
-    add_tile_offset(-PitchLinearCoord(tile_offset.column(), tile_offset.row()));
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-
-    iterator_.load(frag);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index byte_offset) const {
-    iterator_.load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index byte_offset) const {
-    iterator_.load_with_byte_offset(
-      frag,
-      {tile_offset.strided(), tile_offset.contiguous()},
-      byte_offset);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    iterator_.set_kgroup_index(k_group);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Identifies A or B multiplicand
-    Operand Operand_,
-    /// Data type of elements
-    typename Element_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions)
-    int OpDelta_,
-    /// Number of partitions along K dimension
-    int PartitionsK_>
-class MmaTensorOpMultiplicandTileIterator<
-    Shape_, Operand_, Element_,
-    cutlass::layout::ColumnMajorTensorOpMultiplicand64bCrosswise,
-    InstructionShape_, OpDelta_, 32, PartitionsK_> {
- public:
-
-  /// Shape of tile to load (concept: PitchLinearShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand_;
-
-  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
-    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::ColumnMajorTensorOpMultiplicand64bCrosswise;
-
-  /// Shape of one matrix product operation (concept: MatrixShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
-  static int const kOpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Underlying tile iterator implementation
-  using Base = MmaTensorOpMultiplicandTileIterator<
-      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, kOperand, Element,
-      layout::TensorOpMultiplicand64bCrosswise,
-      layout::PitchLinearShape<InstructionShape::kRow,
-                               InstructionShape::kColumn>,
-      kOpDelta, kThreads, PartitionsK_>;
-
- public:
-
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = typename Base::Fragment;
-
-private:
-
-  /// Underlying tile iterator
-  Base iterator_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator() { }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator(
-    TensorRef const &ref, 
-    int lane_id
-  ): iterator_({ref.data(), ref.stride()}, lane_id) {
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
-
-    iterator_.add_pointer_offset(offset);
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
-
-    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_tile_offset_negative(TensorCoord const &tile_offset) {
-
-    iterator_.add_tile_offset_negative({tile_offset.row(), tile_offset.column()});
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator++() {
-
-    ++iterator_;
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator--() {
-
-    --iterator_;
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(PitchLinearCoord(tile_offset.row(), tile_offset.column()));
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
-    add_tile_offset(-PitchLinearCoord(tile_offset.row(), tile_offset.column()));
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-
-    iterator_.load(frag);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index byte_offset) const {
-    iterator_.load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index byte_offset) const {
-    iterator_.load_with_byte_offset(
-      frag,
-      {tile_offset.contiguous(), tile_offset.strided()},
-      byte_offset);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    iterator_.set_kgroup_index(k_group);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-
-/// Tile iterator specialized for canonical matrix layouts
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Operand identity
-    Operand Operand_,
-    /// Data type of A elements
-    typename Element_,
-    /// Layout of operand
-    typename Layout_,
-    /// Shape of one matrix production operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Delta between *MMA operations (in units of *MMA operations, concept:
-    /// MatrixShape)
-    int OpDelta_,
-    /// Number of threads participating in one matrix operation
-    int Threads = 32,
-    /// Number of partitions along K dimension
-    int PartitionsK_ = 1>
-class MmaTensorOpMultiplicandTileIteratorCanonical {
- public:
-
-  /// Shape of tile to load (concept: MatrixShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand_;
-
-  /// Basic check
-  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
-    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of source tile
-  using Layout = Layout_;
-
-  /// Shape of one matrix product operation (concept: MatrixShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
-  static int const kOpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Number of elements accessed per Shared Memory load
-  static int const kElementsPerAccess = 
-    (sizeof_bits<Element>::value >= 32 ? 1 : 32 / sizeof_bits<Element>::value);
-
-private:
-
-  static int const kWarpShapeOuter = 
-    (kOperand == Operand::kA ? Shape::kRow : Shape::kColumn);
-
-  static int const kWarpShapeInner =
-    (kOperand == Operand::kA ? Shape::kColumn : Shape::kRow);
-
-  
-  /// Rounded up instruction counts
-  using InstructionCount = MatrixShape<
-    Shape::kRow / InstructionShape::kRow,
-    Shape::kColumn / InstructionShape::kColumn
-  >;
-
-  /// Rounded up tile dimensions
-  using WarpShapeDivisible = MatrixShape<
-    InstructionCount::kRow * InstructionShape::kRow,
-    InstructionCount::kColumn * InstructionShape::kColumn
-  >;
-
-public:
-
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = Array<
-    Element, 
-    WarpShapeDivisible::kRow * WarpShapeDivisible::kColumn / kThreads
-  >;
-
-  /// Memory access type
-  using AccessType = AlignedArray<Element, kElementsPerAccess>;
-
-private:
-
-  /// Underlying tensor reference
-  TensorRef ref_;
-
-  /// Extent of tensor
-  MatrixCoord extent_;
-
-  /// Origin
-  MatrixCoord origin_;
-
-  /// Used to conditionally enable extents checking
-  bool divisible_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIteratorCanonical(): divisible_(true) { }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIteratorCanonical(
-    TensorRef const &ref, 
-    int lane_id
-  ): ref_(ref), extent_(Shape::kRow, Shape::kColumn), divisible_(true) {
-  
-    if (kOperand == Operand::kA) {
-      origin_ = MatrixCoord(lane_id / 4, (lane_id % 4) * kElementsPerAccess);
-    }
-    else {
-      origin_ = MatrixCoord((lane_id % 4) * kElementsPerAccess, lane_id / 4);
-    }
-
-    ref_.add_coord_offset(origin_);
-  }
-  
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIteratorCanonical(
-    TensorRef const &ref, 
-    TensorCoord extent,
-    int lane_id
-  ): ref_(ref), extent_(extent), divisible_(false) {
-  
-    if (kOperand == Operand::kA) {
-      origin_ = MatrixCoord(lane_id / 4, (lane_id % 4) * kElementsPerAccess);
-    }
-    else {
-      origin_ = MatrixCoord((lane_id % 4) * kElementsPerAccess, lane_id / 4);
-    }
-
-    ref_.add_coord_offset(origin_);
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIteratorCanonical &add_pointer_offset(LongIndex offset) {
-
-    ref_.add_pointer_offset(offset);
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIteratorCanonical &add_tile_offset(TensorCoord const &tile_offset) {
-
-    TensorCoord coord_offset(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn);
-    origin_ += coord_offset;
-
-    ref_.add_coord_offset(coord_offset);
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIteratorCanonical & operator++() {
-
-    if (kOperand == Operand::kA) {
-      add_tile_offset({0, 1});
-    }
-    else {
-      add_tile_offset({1, 0});
-    }    
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIteratorCanonical & operator--() {
-    
-    if (kOperand == Operand::kA) {
-      add_tile_offset({0, -1});
-    }
-    else {
-      add_tile_offset({-1, 0});
-    }    
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIteratorCanonical & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIteratorCanonical & operator-=(TensorCoord const &tile_offset) {
-    add_tile_offset(-tile_offset);
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-
-    load_with_pointer_offset(frag, 0);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-
-    int const kWarpShapeDivisibleInner =
-      (kOperand == Operand::kA ? WarpShapeDivisible::kColumn : WarpShapeDivisible::kRow);
-
-    // Take advantage of Tensor Op's 8 x 4T access pattern
-    int const kAccessesInner = (kWarpShapeDivisibleInner / kElementsPerAccess) / 4;
-
-    AccessType *access_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    if (kOperand == Operand::kA) {
-      int const kTilesPerInstruction = InstructionShape::kRow / 8;
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int inst_m_idx = 0; inst_m_idx < InstructionCount::kRow; ++inst_m_idx) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int inner_idx = 0; inner_idx < kAccessesInner; ++inner_idx) {
-
-          CUTLASS_PRAGMA_UNROLL
-          for (int access_m_idx = 0; access_m_idx < kTilesPerInstruction; ++access_m_idx) {
-            int access_idx = 
-              access_m_idx + kTilesPerInstruction * (inner_idx + kAccessesInner * inst_m_idx);
-            
-            MatrixCoord offset(
-              access_m_idx * 8 + inst_m_idx * InstructionShape::kRow, 
-              inner_idx * 4 * kElementsPerAccess);
-
-            MatrixCoord access_coord = origin_ + offset;
-
-            if (divisible_ || 
-              (access_coord.row() < extent_.row() && access_coord.column() < extent_.column())) {
-
-              access_ptr[access_idx] = *reinterpret_cast<AccessType const *>(
-                ref_.data() + ref_.offset(offset));
-            }
-            else {
-              AccessType zero;
-              zero.clear();
-              access_ptr[access_idx] = zero;
-            }
-          }
-        }
-      }
-    }
-    else {
-      CUTLASS_PRAGMA_UNROLL
-      for (int inst_n_idx = 0; inst_n_idx < InstructionCount::kColumn; ++inst_n_idx) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int inner_idx = 0; inner_idx < kAccessesInner; ++inner_idx) {
-          int access_idx = inner_idx + kAccessesInner * inst_n_idx;
-
-          MatrixCoord offset(
-            inner_idx * 4 * kElementsPerAccess,
-            inst_n_idx * 8);
-
-          MatrixCoord access_coord = origin_ + offset;
-
-          if (divisible_ ||
-            (access_coord.row() < extent_.row() && access_coord.column() < extent_.column())) {
-              
-            access_ptr[access_idx] = *reinterpret_cast<AccessType const *>(
-              ref_.data() + ref_.offset(offset));
-          }
-          else {
-              AccessType zero;
-              zero.clear();
-              access_ptr[access_idx] = zero;
-          }
-        }
-      } 
-    }
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index byte_offset) const {
-
-    load_with_pointer_offset(frag, byte_offset * 8 / sizeof_bits<Element>::value);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-    
-    TensorCoord coord_offset(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn);
-  
-    load_with_pointer_offset(frag, ref_.offset(coord_offset));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-
-    TensorCoord coord_offset(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn);
-  
-    load_with_pointer_offset(frag, ref_.offset(coord_offset) + pointer_offset);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index byte_offset) const {
-
-    TensorCoord coord_offset(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn);
-  
-    load_with_pointer_offset(frag, ref_.offset(coord_offset) + byte_offset * 8 / sizeof_bits<Element>::value);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    // no operation
-  }
-};
-
-/// Wrapper for ColumnMajor
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Identifies A or B multiplicand
-    Operand Operand_,
-    /// Data type of elements
-    typename Element_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions)
-    int OpDelta_,
-    /// Number of partitions along K dimension
-    int PartitionsK_>
-class MmaTensorOpMultiplicandTileIterator<
-    Shape_, Operand_, Element_,
-    cutlass::layout::ColumnMajor,
-    InstructionShape_, OpDelta_, 32, PartitionsK_> {
- public:
-
-  /// Shape of tile to load (concept: PitchLinearShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand_;
-
-  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
-    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::ColumnMajor;
-
-  /// Shape of one matrix product operation (concept: MatrixShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
-  static int const kOpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Underlying tile iterator implementation
-  using Base = MmaTensorOpMultiplicandTileIteratorCanonical<
-      Shape, kOperand, Element,
-      layout::ColumnMajor,
-      InstructionShape,
-      kOpDelta, kThreads, PartitionsK_>;
-
- public:
-
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = typename Base::Fragment;
-
-private:
-
-  /// Underlying tile iterator
-  Base iterator_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator() { }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator(
-    TensorRef const &ref, 
-    int lane_id
-  ): iterator_({ref.data(), ref.stride()}, lane_id) {
-  }
-  
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator(
-    TensorRef const &ref, 
-    TensorCoord const & extent,
-    int lane_id
-  ): iterator_({ref.data(), ref.stride()}, extent, lane_id) {
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
-
-    iterator_.add_pointer_offset(offset);
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
-
-    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator++() {
-
-    ++iterator_;
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator--() {
-
-    --iterator_;
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(PitchLinearCoord(tile_offset.row(), tile_offset.column()));
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
-    add_tile_offset(-PitchLinearCoord(tile_offset.row(), tile_offset.column()));
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-
-    iterator_.load(frag);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index byte_offset) const {
-    iterator_.load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index byte_offset) const {
-    iterator_.load_with_byte_offset(
-      frag,
-      {tile_offset.contiguous(), tile_offset.strided()},
-      byte_offset);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    iterator_.set_kgroup_index(k_group);
-  }
-};
-
-
-/// Wrapper for RowMajor
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Identifies A or B multiplicand
-    Operand Operand_,
-    /// Data type of elements
-    typename Element_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Interval between adjacent *MMA instructions (in units of MMA
-    /// instructions)
-    int OpDelta_,
-    /// Number of partitions along K dimension
-    int PartitionsK_>
-class MmaTensorOpMultiplicandTileIterator<
-    Shape_, Operand_, Element_,
-    cutlass::layout::RowMajor,
-    InstructionShape_, OpDelta_, 32, PartitionsK_> {
- public:
-
-  /// Shape of tile to load (concept: PitchLinearShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand_;
-
-  static_assert(kOperand == Operand::kA || kOperand== Operand::kB,
-    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::RowMajor;
-
-  /// Shape of one matrix product operation (concept: MatrixShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
-  static int const kOpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Underlying tile iterator implementation
-  using Base = MmaTensorOpMultiplicandTileIteratorCanonical<
-      Shape, kOperand, Element,
-      layout::RowMajor,
-      InstructionShape,
-      kOpDelta, kThreads, PartitionsK_>;
-
- public:
-
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = typename Base::Fragment;
-
-private:
-
-  /// Underlying tile iterator
-  Base iterator_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator() { }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator(
-    TensorRef const &ref, 
-    int lane_id
-  ): iterator_({ref.data(), ref.stride()}, lane_id) {
-  }
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator(
-    TensorRef const &ref, 
-    TensorCoord const &extent,
-    int lane_id
-  ): iterator_({ref.data(), ref.stride()}, extent, lane_id) {
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
-
-    iterator_.add_pointer_offset(offset);
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
-
-    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator++() {
-
-    ++iterator_;
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator--() {
-
-    --iterator_;
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(PitchLinearCoord(tile_offset.row(), tile_offset.column()));
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
-    add_tile_offset(-PitchLinearCoord(tile_offset.row(), tile_offset.column()));
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-
-    iterator_.load(frag);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index byte_offset) const {
-    iterator_.load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index byte_offset) const {
-    iterator_.load_with_byte_offset(
-      frag,
-      {tile_offset.contiguous(), tile_offset.strided()},
-      byte_offset);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    iterator_.set_kgroup_index(k_group);
-  }
-};
-
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace gemm
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sparse.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sparse.h
deleted file mode 100755
index c4ed8bc98..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sparse.h
+++ /dev/null
@@ -1,380 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines iterators to load sparse meta data used by warp-level matrix multiply operations
-   targeting Sparse Tensor Cores.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/array.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/matrix_shape.h"
-
-#include "cutlass/arch/memory_sm75.h"
-#include "cutlass/gemm/gemm.h"
-
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor_op_multiplicand_sm75.h"
-
-#include "cutlass/platform/platform.h"
-#include "cutlass/fast_math.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace warp {
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Data type of A elements
-    typename Element_,
-    /// Layout of operand
-    typename Layout_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Delta between *MMA operations (in units of *MMA operations, concept:
-    /// MatrixShape)
-    int OpDelta_,
-    /// Number of threads participating in one matrix operation
-    int Threads,
-    /// Number of partitions along K dimension
-    int PartitionsK_ = 1>
-class SparseMmaTensorOpMetaTileIterator {
- public:
-  /// Shape of tile to load (concept: PitchLinearShape)
-  using Shape = Shape_;
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of source tile
-  using Layout = Layout_;
-
-  /// Shape of one matrix product operation (concept: GemmShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept:
-  /// MatrixShape)
-  static int const kOpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// Number of partitions along K dimension
-  static int const kPartitionsK = PartitionsK_;
-
-  static int const kSparse = 2;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Internal structure of iterator - made public to enable introspection
-  struct Policy {
-    static_assert(
-        !(Shape::kColumn % InstructionShape::kColumn),
-        "Shape of warp-level Mma must be divisible by operator shape.");
-    
-    static int const kElementsPerAccess = 128 / sizeof_bits<Element>::value;
-
-    // Determine number of elements along outer dimension per individual LDSM op
-    static int const kLdsmOpOuter = InstructionShape::kColumn;
-    static int const kLdsmOpInner = 8 * kElementsPerAccess / kLdsmOpOuter;
-
-    static_assert(!(Shape::kColumn % kLdsmOpOuter),
-                  "Shape of warp-level mma must be divisible by LDSM's "
-                  "fundamental tile size.");
-
-    static_assert(!(Shape::kRow % kLdsmOpInner),
-                  "Shape of warp-level mma must be divisible by LDSM's "
-                  "fundamental tile size.");
-
-    /// Shape of one individual LDSM instruction
-    static int const LdsmShapeColumn =
-        InstructionShape::kColumn / kLdsmOpOuter;
-    static int const LdsmShapeRow =
-        ((4 / LdsmShapeColumn * kLdsmOpInner) > Shape::kRow)
-            ? (Shape::kRow / kLdsmOpInner)
-            : (4 / LdsmShapeColumn);
-    using LdsmShape =
-        layout::PitchLinearShape<LdsmShapeRow, LdsmShapeColumn>;
-
-    /// Number and arrangement of LDSM instructions
-    using LdsmIterations = layout::PitchLinearShape<
-        Shape::kRow / kLdsmOpInner / LdsmShapeRow,
-        1>;
-
-    /// Number of groups for each tile
-    static int const kGroupsPerTile =
-        Shape::kColumn / InstructionShape::kColumn;
-  };
-
- private:
-  /// Not working on this feature at the moment.
-  static_assert(kOpDelta == 1,
-                "Alternative arrangements not supported at present.");
-
-  /// Pointer type used for accesses
-  using AccessType = Array<Element, Policy::kElementsPerAccess>;
-
- public:
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment =
-      Array<Element, Shape::kRow * InstructionShape::kColumn / kThreads>;
-
- private:
-
-  /// Layout object storing stride values
-  Index stride_;
-
-  /// Shared memory base pointers - not advanced
-  AccessType const *pointer_;
-
-  /// Byte offset incremented as iterator advances
-  Index byte_offset_;
-
-  /// Internal counter used to determine when to increment byte offset and when
-  /// to XOR it
-  int k_group_idx_;
-
- public:
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  SparseMmaTensorOpMetaTileIterator()
-      : pointer_(nullptr),
-        stride_(0),
-        byte_offset_(0),
-        k_group_idx_(0) {}
-
-  /// Constructor from TensorRef
-  CUTLASS_DEVICE
-  SparseMmaTensorOpMetaTileIterator(TensorRef const &ref, int lane_id)
-      : pointer_(reinterpret_cast<AccessType const *>(ref.data())),
-        stride_(ref.stride(0) / Policy::kElementsPerAccess),
-        byte_offset_(0),
-        k_group_idx_(0) {
-
-    int access_contiguous = (lane_id % (Shape::kRow / Policy::kElementsPerAccess));
-    int access_strided = (lane_id / (Shape::kRow / Policy::kElementsPerAccess));
-
-    byte_offset_ = (access_contiguous + access_strided * stride_) *
-                   sizeof_bits<Element>::value * Policy::kElementsPerAccess / 8;
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_DEVICE
-  SparseMmaTensorOpMetaTileIterator &add_pointer_offset(LongIndex offset) {
-    byte_offset_ += offset * sizeof_bits<Element>::value / 8;
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole
-  /// tiles
-  CUTLASS_DEVICE
-  SparseMmaTensorOpMetaTileIterator &add_tile_offset(
-      TensorCoord const &tile_offset) {
-    int offset = tile_offset.row() * Shape::kRow +
-                 tile_offset.column() * InstructionShape::kColumn * stride_ *
-                     Policy::kElementsPerAccess;
-
-    add_pointer_offset(offset);
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_DEVICE
-  SparseMmaTensorOpMetaTileIterator &operator++() {
-    add_tile_offset({0, 1});
-
-    if (kPartitionsK > 1) {
-      ++k_group_idx_;
-      // Jump to next stage
-      if (k_group_idx_ == Policy::kGroupsPerTile) {
-        k_group_idx_ = 0;
-        add_tile_offset(
-            {0, ((kPartitionsK - 1) * Policy::kGroupsPerTile)});
-      }
-    }
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  SparseMmaTensorOpMetaTileIterator &operator--(){
-    byte_offset_ -= stride_ * InstructionShape::kColumn *
-                    sizeof_bits<Element>::value * Policy::kElementsPerAccess /
-                    8;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of
-  ///< the tensor
-  CUTLASS_DEVICE SparseMmaTensorOpMetaTileIterator &
-  operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of
-  ///< the tensor
-  CUTLASS_DEVICE
-  SparseMmaTensorOpMetaTileIterator &operator-=(
-      TensorCoord const &tile_offset) {
-    add_tile_offset(-tile_offset);
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const { load_with_byte_offset(frag, 0); }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset in units of bytes
-      Index byte_offset) const {
-    Array<unsigned, Policy::LdsmShape::kCount> *fetch_ptr =
-        reinterpret_cast<Array<unsigned, Policy::LdsmShape::kCount> *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < Policy::LdsmIterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < Policy::LdsmIterations::kContiguous; ++c) {
-
-        int access_idx = c + s * Policy::LdsmIterations::kContiguous;
-
-        AccessType const *source_ptr =
-            pointer_ +
-            Policy::LdsmShape::kContiguous * Policy::kLdsmOpInner * c +
-            Policy::LdsmShape::kStrided * s * stride_;
-
-        char const *source_byte_ptr = reinterpret_cast<char const *>(source_ptr) +
-                                      byte_offset + byte_offset_;
-
-        cutlass::arch::ldsm<layout::RowMajor, Policy::LdsmShape::kCount>(
-            fetch_ptr[access_idx], source_byte_ptr);
-      }
-    }
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-    load_with_byte_offset(frag, tile_offset, 0);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index byte_offset) const {
-    Index pointer_offset = 
-      tile_offset.contiguous() * Shape::kRow / Layout::kElementsPerAccess + 
-      tile_offset.strided() * InstructionShape::kColumn * stride_;
-
-    byte_offset += sizeof(AccessType) * pointer_offset;
-
-    load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    // no op
-  }
-};
-
-} // namespace warp
-} // namespace gemm
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_wmma.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_wmma.h
deleted file mode 100755
index 0da043e67..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_wmma.h
+++ /dev/null
@@ -1,805 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines iterators used by warp-level matrix multiply operations targeting Tensor Cores.
-*/
-
-#pragma once
-
-
-#include "cutlass/cutlass.h"
-#include "cutlass/arch/wmma.h"
-
-#if defined(CUTLASS_ARCH_WMMA_ENABLED)
-
-#include "cutlass/wmma_array.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/matrix_shape.h"
-
-#include "cutlass/arch/memory_sm75.h"
-#include "cutlass/gemm/gemm.h"
-
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor_op_multiplicand_sm75.h"
-
-#include "cutlass/platform/platform.h"
-#include "cutlass/fast_math.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace warp {
-
-////////////////////////////////////////////////////////////////////////////////
-template <
-    ///< Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Operand identity (A or B)
-    Operand Operand,
-    /// Data type of operand
-    typename Element_,
-    /// Layout of operand
-    typename Layout_,
-    /// Delta between *MMA operations (in units of *WMMA operations, concept:MatrixShape)
-    int OpDelta_,
-    /// Number of threads participating in one matrix operation
-    int Threads,
-    /// Shape of the warp in units of thread (concept: MmaTensorOpPolicy)
-    typename Policy_>
-class MmaTensorOpWmmaMultiplicandTileIterator;
-
-
-////////////////////////////////////////////////////////////////////////////////
-/// This tile iterator is specialized for 32-thread WMMA operation. 
-/// It uses nvcuda::wmma::load_matrix_sync to load from shared
-/// memory and therefore must be initialized with a TensorRef to shared memory. 
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept
-////////////////////////////////////////////////////////////////////////////////
-template <
-    ///< Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Data type of elements
-    typename Element_,
-    /// Layout of operand
-    typename Layout_,
-    /// Interval between adjacent *WMMA instructions (in units of WMMA instructions)
-    int OpDelta_,    
-    /// Shape of the warp in units of thread (concept: MmaTensorOpPolicy)
-    typename Policy_>
-class MmaTensorOpWmmaMultiplicandTileIterator<
-    Shape_, Operand::kA, Element_, Layout_,
-    OpDelta_, 32, Policy_> {
- public:
-
-  /// Shape of tile to load (concept: MatrixShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand::kA;
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of source tile
-  using Layout = Layout_;
-
-  /// Delta between *WMMA operations
-  static int const kOpDelta = OpDelta_;
-
-  /// Wmma Operator information and operation delta
-  using Policy = Policy_;
-
-
-  //
-  // Derived quantities
-  //
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Stride Index type
-  using StrideIndex = typename TensorRef::Layout::Stride::Index;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Native Wmma shape for operand A (concept MatrixShape)
-  using WmmaShape = MatrixShape<
-    Policy::Operator::Shape::kM, 
-    Policy::Operator::Shape::kK
-  >;
-
-  /// Map cutlass dataype to nvcuda::wmma datatype
-  using WmmaDataType = typename cutlass::arch::CutlassToWmmaDataType<Element>::Type;
-
-  /// Shape of individual WMMA load / stores for operand A
-  using Iterations = MatrixShape<
-    Shape::kRow / WmmaShape::kRow,
-    1 
-  >;
-
-  /// Fragment object holding a warps part 
-  using Fragment = WmmaFragmentArray<typename Policy::Operator::FragmentA, Iterations::kCount>;
-
-
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  /// statically assert this specialization
-  /////////////////////////////////////////////////////////////////////////////////////////////////////
-  /// This iterator is specalized for Operand A
-  static_assert(kOperand == Operand::kA,
-    "MmaTensorOpWmmaMultiplicandTileIterator may only be instantiated for A operands to warp-level Mma.");
-
-  /// Supported memory layouts
-  static_assert(
-    platform::is_same<cutlass::layout::RowMajor, Layout>::value ||
-    platform::is_same<cutlass::layout::ColumnMajor, Layout>::value,
-    "Supported list of memory layouts for WMMA are: RowMajor, ColumnMajor");
-
-  /// Not working on this feature at the moment.
-  static_assert(kOpDelta == 1,
-    "Alternative arrangements not supported at present.");
-
-  /////////////////////////////////////////////////////////////////////////////////////////////////////
-
-private:
-
-  /// Shared memory base pointers - not advanced
-  char const *pointer_;
-  
-  /// Byte offset into shared memory - advanced
-  Index byte_offset_;
-  
-  /// Stride in units of number of elements
-  StrideIndex stride_;
-
-  /// Layout of shared memory
-  Layout layout_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpWmmaMultiplicandTileIterator() { }
-
-  /// Constructor from TensorRef
-  CUTLASS_DEVICE
-  MmaTensorOpWmmaMultiplicandTileIterator(
-    TensorRef const &ref, 
-    int lane_id
-  ): pointer_(reinterpret_cast<char const*>(ref.data())), byte_offset_(0), stride_(ref.stride(0)), layout_(ref.stride(0)) { 
-  
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_DEVICE
-  MmaTensorOpWmmaMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
-    byte_offset_ += (offset * sizeof_bits<Element>::value) / 8;
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpWmmaMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
-
-    Index elements_offset = layout_({tile_offset.row() * Shape::kRow, tile_offset.column() * WmmaShape::kColumn});
-    
-    byte_offset_ += (elements_offset * sizeof_bits<Element>::value) / 8;
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_DEVICE
-  MmaTensorOpWmmaMultiplicandTileIterator & operator++() {
-    
-    Index elements_offset = layout_({0, WmmaShape::kColumn});
-
-    byte_offset_ += (elements_offset * sizeof_bits<Element>::value) / 8;
-
-    return *this;
-  }
-
-  /// Advances the iterator along the opposite of the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpWmmaMultiplicandTileIterator & operator--() {
-    
-    Index elements_offset = layout_({0, WmmaShape::kColumn});
-
-    byte_offset_ -= (elements_offset * sizeof_bits<Element>::value) / 8;
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpWmmaMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpWmmaMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
-    add_tile_offset(-tile_offset);
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load_with_byte_offset(Fragment &frag, Index byte_offset) const {
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int k = 0; k < Iterations::kColumn; ++k) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int m = 0; m < Iterations::kRow; ++m) {
-
-        Index load_byte_offset = layout_({m * WmmaShape::kRow, k * WmmaShape::kColumn}) * sizeof_bits<Element>::value / 8;
-
-        const WmmaDataType *ptr = reinterpret_cast<const WmmaDataType *>(pointer_ + byte_offset_ + load_byte_offset + byte_offset); 
-
-        nvcuda::wmma::load_matrix_sync(frag[m], ptr, stride_); 
-      
-      }
-    }
-  }
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-    load_with_byte_offset(frag, 0);
-  }
-    
-  /// Stores a fragment to memory at the location pointed to by the iterator
-  CUTLASS_HOST_DEVICE
-  void store_with_byte_offset(Fragment const &frag, Index byte_offset) const {
-    
-    CUTLASS_PRAGMA_UNROLL
-    for (int k = 0; k < Iterations::kColumn; ++k) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int m = 0; m < Iterations::kRow; ++m) {
-
-        Index store_byte_offset = layout_({m * WmmaShape::kRow, k * WmmaShape::kColumn}) * sizeof_bits<Element>::value / 8;
-
-        WmmaDataType *ptr = reinterpret_cast<WmmaDataType *>(pointer_ + byte_offset_ + store_byte_offset + byte_offset);
-
-        nvcuda::wmma::store_matrix_sync(ptr, frag[m], stride_); 
-      
-      }
-    }
-  }
-
-  /// Stores a fragment to memory at the location pointed to by the iterator
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag) const {
-    store_with_byte_offset(frag, 0);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    // no operation here
-  }
-};
-
-
-////////////////////////////////////////////////////////////////////////////////
-/// This tile iterator is specialized for 32-thread WMMA operation. 
-/// It uses nvcuda::wmma::load_matrix_sync to load from shared
-/// memory and therefore must be initialized with a TensorRef to shared memory. 
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept
-///
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-    ///< Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Data type of elements
-    typename Element_,
-    /// Layout of operand
-    typename Layout_,
-    /// Interval between adjacent *WMMA instructions (in units of WMMA instructions)
-    int OpDelta_,    
-    /// Shape of the warp in units of thread (concept: MmaTensorOpPolicy)
-    typename Policy_>
-class MmaTensorOpWmmaMultiplicandTileIterator<
-    Shape_, Operand::kB, Element_, Layout_,
-    OpDelta_, 32, Policy_> {
- public:
-
-  /// Shape of tile to load (concept: MatrixShape)
-  using Shape = Shape_;
-
-  /// Operand tag
-  static Operand const kOperand = Operand::kB;
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of source tile
-  using Layout = Layout_;
-
-  /// Delta between *WMMA operations
-  static int const kOpDelta = OpDelta_;
-
-  /// Wmma Operator information and operation delta
-  using Policy = Policy_;
-
-
-  //
-  // Derived quantities
-  //
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Stride Index type
-  using StrideIndex = typename TensorRef::Layout::Stride::Index;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Native Wmma shape (concept MatrixShape)
-  using WmmaShape = MatrixShape<
-    Policy::Operator::Shape::kK, 
-    Policy::Operator::Shape::kN
-  >;
-
-  /// Map cutlass dataype to nvcuda::wmma datatype
-  using WmmaDataType = typename cutlass::arch::CutlassToWmmaDataType<Element>::Type;
-
-  /// Shape of individual WMMA load / stores for operand B
-  using Iterations = MatrixShape<
-    1,
-    Shape::kColumn / WmmaShape::kColumn
-  >;
-
-  /// Fragment object holding a warps part
-  using Fragment = WmmaFragmentArray<typename Policy::Operator::FragmentB, Iterations::kCount>;
-
-
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  /// statically asserts this specialization
-  /////////////////////////////////////////////////////////////////////////////////////////////////////
-  /// This iterator is specalized for Operand B
-  static_assert(kOperand == Operand::kB,
-    "MmaTensorOpWmmaMultiplicandTileIterator may only be instantiated for B operands to warp-level Mma.");
-
-  /// Supported memory layouts
-  static_assert(
-    platform::is_same<cutlass::layout::RowMajor, Layout>::value ||
-    platform::is_same<cutlass::layout::ColumnMajor, Layout>::value,
-    "Supported list of memory layouts for WMMA are: RowMajor, ColumnMajor");
-
-  /// Not working on this feature at the moment.
-  static_assert(kOpDelta == 1,
-    "Alternative arrangements not supported at present.");
-
-  /////////////////////////////////////////////////////////////////////////////////////////////////////
-
-private:
-
-  /// Shared memory base pointers - not advanced
-  char const *pointer_;
-  
-  /// Byte offset into shared memory - advanced
-  Index byte_offset_;
-  
-  /// Stride in units of number of elements
-  StrideIndex stride_;
-
-  /// Layout of shared memory
-  Layout layout_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpWmmaMultiplicandTileIterator() { }
-
-  /// Constructor from TensorRef
-  CUTLASS_DEVICE
-  MmaTensorOpWmmaMultiplicandTileIterator(
-    TensorRef const &ref, 
-    int lane_id
-  ): pointer_(reinterpret_cast<char const*>(ref.data())), byte_offset_(0), stride_(ref.stride(0)), layout_(ref.stride(0)) {
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_DEVICE
-  MmaTensorOpWmmaMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
-    
-    byte_offset_ += (offset * sizeof_bits<Element>::value) / 8;
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpWmmaMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
-    
-    Index elements_offset = layout_({tile_offset.row() * WmmaShape::kRow, tile_offset.column() * Shape::kColumn});
-    
-    byte_offset_ += (elements_offset * sizeof_bits<Element>::value) / 8;
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_DEVICE
-  MmaTensorOpWmmaMultiplicandTileIterator & operator++() {
-    
-    Index elements_offset = layout_({WmmaShape::kRow, 0});
-
-    byte_offset_ += (elements_offset * sizeof_bits<Element>::value) / 8;
-    
-    return *this;
-  }
-
-  /// Advances the iterator along the opposite of the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpWmmaMultiplicandTileIterator & operator--() {
-
-    Index elements_offset = layout_({WmmaShape::kRow, 0});
-
-    byte_offset_ -= (elements_offset * sizeof_bits<Element>::value) / 8;
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpWmmaMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpWmmaMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
-    add_tile_offset(-tile_offset);
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load_with_byte_offset(Fragment &frag, Index byte_offset) const {
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int k = 0; k < Iterations::kRow; ++k) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < Iterations::kColumn; ++n) {
-        
-        Index load_byte_offset = layout_({k * WmmaShape::kRow, n * WmmaShape::kColumn}) * sizeof_bits<Element>::value / 8;
-
-        const WmmaDataType *ptr = reinterpret_cast<const WmmaDataType *>(pointer_ + byte_offset_ + load_byte_offset + byte_offset);
-
-        nvcuda::wmma::load_matrix_sync(frag[n], ptr, stride_);        
-      }
-    }
-  }
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-    load_with_byte_offset(frag, 0);
-  }
-    
-  /// Stores a fragment to memory at the location pointed to by the iterator
-  CUTLASS_HOST_DEVICE
-  void store_with_byte_offset(Fragment const &frag, Index byte_offset) const {
-    
-    CUTLASS_PRAGMA_UNROLL
-    for (int k = 0; k < Iterations::kRow; ++k) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < Iterations::kColumn; ++n) {
-
-        Index store_byte_offset = layout_({k * WmmaShape::kRow, n * WmmaShape::kColumn}) * sizeof_bits<Element>::value / 8;
-
-        WmmaDataType *ptr = reinterpret_cast<WmmaDataType *>(pointer_ + byte_offset_ + store_byte_offset + byte_offset);
-        
-        nvcuda::wmma::store_matrix_sync(ptr, frag[n], stride_);        
-      }
-    }
-  }
-
-  /// Stores a fragment to memory at the location pointed to by the iterator
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag) const {
-    store_with_byte_offset(frag, 0);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    // no operation here
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-template <
-    ///< Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Element type
-    typename Element_,
-    /// Layout of operand in memory
-    typename Layout_,
-    /// Interval between adjacent *WMMA instructions (in units of WMMA instructions, concept: MatrixShape)
-    typename OpDelta_,
-    /// Shape of the warp in units of thread (concept: MmaTensorOpPolicy)
-    typename Policy_>
-class MmaTensorOpWmmaAccumulatorTileIterator;
-
-////////////////////////////////////////////////////////////////////////////////
-/// This tile iterator is specialized for 32-thread WMMA operation. 
-/// It uses nvcuda::wmma::store_matrix_sync to load from shared
-/// memory and therefore must be initialized with a TensorRef to shared memory. 
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept |
-///   WriteableRandomAccessContiguousTileIteratorConcept
-///
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-    ///< Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Data type of elements
-    typename Element_,
-    /// Layout of operand in memory
-    typename Layout_,
-    /// Interval between adjacent *WMMA instructions (in units of WMMA instructions)
-    typename OpDelta_,    
-    /// Shape of the warp in units of thread (concept: MmaTensorOpPolicy)
-    typename Policy_>
-class MmaTensorOpWmmaAccumulatorTileIterator
-{
- public:
-
-  /// Shape of tile to load (concept: MatrixShape)
-  using Shape = Shape_;
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of source tile
-  using Layout = Layout_;
-
-  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
-  using OpDelta = OpDelta_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// Wmma Operator information and operation delta
-  using Policy = Policy_;
-
-
-  //
-  // Derived quantities
-  //
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Native Wmma shape (concept MatrixShape)
-  using WmmaShape = MatrixShape<
-    Policy::Operator::Shape::kM, 
-    Policy::Operator::Shape::kN
-  >;
-  
-  /// Map cutlass dataype to nvcuda::wmma datatype
-  using WmmaDataType = typename cutlass::arch::CutlassToWmmaDataType<Element>::Type;
-
-  /// Map cutlass::layout to nvuda::wmma::layout_t enum
-  static nvcuda::wmma::layout_t const WmmaLayout = cutlass::arch::CutlassToWmmaLayout<Layout>::value;
-
-  /// Shape of individual WMMA load / stores for accumulator
-  using Iterations = MatrixShape<
-    Shape::kRow / WmmaShape::kRow,
-    Shape::kColumn / WmmaShape::kColumn
-  >;
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = WmmaFragmentArray<typename Policy::Operator::FragmentC, Iterations::kCount>;
-
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  /// statically asserts this specialization
-  /////////////////////////////////////////////////////////////////////////////////////////////////////
-  /// Supported layouts
-  static_assert(
-    platform::is_same<cutlass::layout::RowMajor, Layout>::value ||
-    platform::is_same<cutlass::layout::ColumnMajor, Layout>::value,
-    "Supported list of memory layouts for WMMA are: RowMajor, ColumnMajor");
-
-private:
-  
-  /// Internal reference
-  cutlass::TensorRef<Element, Layout> ref_;
-
-public:
-  
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpWmmaAccumulatorTileIterator() { }
-
-  /// Constructor from TensorRef
-  CUTLASS_DEVICE
-  MmaTensorOpWmmaAccumulatorTileIterator(
-    TensorRef const &ref, 
-    int lane_id
-  ): ref_(ref) { }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_DEVICE
-  MmaTensorOpWmmaAccumulatorTileIterator &add_pointer_offset(LongIndex offset) {
-    ref_.add_pointer_offset(offset);
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpWmmaAccumulatorTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
-    ref_.add_coord_offset({tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn});
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_DEVICE
-  MmaTensorOpWmmaAccumulatorTileIterator & operator++() {
-    ref_.add_coord_offset({Shape::kRow, 0});
-    return *this;
-  }
-
-  /// Advances the iterator along the opposite of the advance dimension
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpWmmaAccumulatorTileIterator & operator--() {
-    ref_.add_coord_offset({-Shape::kRow, 0});
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpWmmaAccumulatorTileIterator & operator+=(TensorCoord const &tile_offset) {
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  MmaTensorOpWmmaAccumulatorTileIterator & operator-=(TensorCoord const &tile_offset) {
-    add_tile_offset(-tile_offset);
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
-    
-    CUTLASS_PRAGMA_UNROLL
-    for (int m = 0; m < Iterations::kRow; ++m) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < Iterations::kColumn; ++n) {
-
-        const WmmaDataType * ptr = reinterpret_cast<const WmmaDataType*> (ref_.data() + ref_.offset({m * WmmaShape::kRow, n * WmmaShape::kColumn}) + pointer_offset);
-        
-        nvcuda::wmma::load_matrix_sync(frag[m * Iterations::kColumn + n], ptr, ref_.stride()[0], WmmaLayout); 
-
-      }
-    }
-  }
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-    load_with_pointer_offset(frag, 0);
-  }
-    
-  /// Stores a fragment to memory at the location pointed to by the iterator
-  CUTLASS_HOST_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const {
-    
-    CUTLASS_PRAGMA_UNROLL
-    for (int m = 0; m < Iterations::kRow; ++m) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < Iterations::kColumn; ++n) {
-
-        WmmaDataType * ptr = reinterpret_cast<WmmaDataType*> (ref_.data() + ref_.offset({m * WmmaShape::kRow, n * WmmaShape::kColumn}) + pointer_offset);
-
-        nvcuda::wmma::store_matrix_sync(ptr, frag[m * Iterations::kColumn + n], ref_.stride()[0], WmmaLayout); 
-      }
-    }
-  }
-
-  /// Stores a fragment to memory at the location pointed to by the iterator
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag) const {
-    store_with_pointer_offset(frag, 0);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    // no operation here
-  }
-};
-
-
-
-} // namespace warp
-} // namespace gemm
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
-
-#endif // if defined(CUTLASS_ARCH_WMMA_ENABLED)
-
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_wmma.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_wmma.h
deleted file mode 100755
index 971ad3b81..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_wmma.h
+++ /dev/null
@@ -1,223 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing warp-level matrix multiply-accumulate operations targeting
-      Tensor Cores.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/arch/wmma.h"
-
-#if defined(CUTLASS_ARCH_WMMA_ENABLED)
-
-#include "cutlass/wmma_array.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-
-#include "cutlass/arch/memory_sm75.h"
-#include "cutlass/arch/mma_sm75.h"
-#include "cutlass/arch/mma_sm80.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/warp/mma.h"
-
-#include "cutlass/gemm/warp/mma_tensor_op_policy.h"
-
-#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_wmma.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace warp {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-///< Structure to compute the matrix product targeting CUDA cores via WMMA.
-template < 
-  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
-  typename Shape_,
-  ///< Data type of A elements
-  typename ElementA_,
-  ///< Layout of A matrix (concept: MatrixLayout)
-  typename LayoutA_,
-  ///< Data type of B elements
-  typename ElementB_,
-  /// Layout of B matrix (concept: MatrixLayout)
-  typename LayoutB_,
-  ///< Element type of C matrix
-  typename ElementC_,
-  ///< Layout of C matrix (concept: MatrixLayout)
-  typename LayoutC_,
-  ///< Policy describing warp-level Wmma operation (concept: MmaTensorOpPolicy)
-  typename Policy_,
-  ///< Number of partitions along K dimension
-  int PartitionsK_ = 1,
-  ///< Used for partial specialization
-  typename Enable = bool
->
-class MmaTensorOpWmma {
-public:
-  ///< Shape of warp-level matrix operation (concept: GemmShape)
-  using Shape = Shape_;
-
-  ///< Data type of multiplicand A
-  using ElementA = ElementA_;
-
-  ///< Layout of multiplicand A
-  using LayoutA = LayoutA_;
-
-  ///< Data type of multiplicand B
-  using ElementB = ElementB_;
-
-  ///< Layout of multiplicand B
-  using LayoutB = LayoutB_;
-
-  ///< Data type of accumulator matrix C
-  using ElementC = ElementC_;
-
-  ///< Layout of accumulator matrix C
-  using LayoutC = LayoutC_;
-
-  /// Shape of the warp in units of thread (concept: MmaTensorOpPolicy)
-  using Policy = Policy_;
-
-  /// Underlying instruction shape
-  using InstructionShape = typename Policy::Operator::Shape;
-
-  /// Underlying matrix multiply operator (concept: arch::Mma)
-  using ArchMmaOperator = typename Policy::Operator;
-
-  /// Indicates math operator 
-  using MathOperator = typename ArchMmaOperator::Operator;
-  
-  /// Underlying architecture tag
-  using ArchTag = typename Policy::Operator::ArchTag;
-
-  /// Complex transform on A operand
-  static ComplexTransform const kTransformA = ComplexTransform::kNone;
-
-  /// Complex transform on B operand
-  static ComplexTransform const kTransformB = ComplexTransform::kNone;
-
-  /// Indicates class of matrix operator
-  using OperatorClass = arch::OpClassWmmaTensorOp;
-
-  /// Number of threads participating in warp-level matrix product
-  static int const kThreadCount = 32;
-
-  /// Number of partitions along K dimension
-  static int const kPartitionsK = PartitionsK_;
-
-public:
-
-  /// Iterates over the A operand in memory
-  using IteratorA = MmaTensorOpWmmaMultiplicandTileIterator<
-     MatrixShape<Shape::kM, Shape::kK>, Operand::kA, ElementA, LayoutA,
-     Policy::OpDelta::kRow, kThreadCount, Policy>;
-
-  /// Storage for A tile
-  using FragmentA = typename IteratorA::Fragment;
-
-  /// Iterates over the B operand in memory
-  using IteratorB = MmaTensorOpWmmaMultiplicandTileIterator<
-     MatrixShape<Shape::kK, Shape::kN>, Operand::kB, ElementB, LayoutB,
-     Policy::OpDelta::kRow, kThreadCount, Policy>;
-
-  /// Storage for B tile
-  using FragmentB = typename IteratorB::Fragment;
-
-  /// Iterates over the C operand in memory
-  using IteratorC = MmaTensorOpWmmaAccumulatorTileIterator<
-     MatrixShape<Shape::kM, Shape::kN>, ElementC, LayoutC,
-    typename Policy::OpDelta, Policy>;
-
-  /// Storage for C tile
-  using FragmentC = typename IteratorC::Fragment;
-
-private:
-
-  static_assert(
-    !(Shape::kM % Policy::Operator::Shape::kM) && 
-    !(Shape::kN % Policy::Operator::Shape::kN),
-    "Shape of warp-level Wmma must be divisible by operator shape (wmma native size)");
-
-  /// Number of wmma operations performed
-  using WmmaIterations = MatrixShape<
-    Shape::kM / Policy::Operator::Shape::kM,
-    Shape::kN / Policy::Operator::Shape::kN 
-  >;
-
-public:
-
-  /// Underlying matrix multiply operator (concept: cutlass::arch::Wmma)
-  typename Policy::Operator wmma;
-
-public:
-
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_DEVICE
-  MmaTensorOpWmma() {}
-
-  /// Performs a warp-level matrix multiply-accumulate operation
-  CUTLASS_DEVICE
-  void operator()(
-    FragmentC &D, 
-    FragmentA const &A, 
-    FragmentB const &B, 
-    FragmentC const &C) const {
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < WmmaIterations::kColumn; ++n) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int m = 0; m < WmmaIterations::kRow; ++m) {
-
-        // accumulate wmma mma
-        wmma(D[m * WmmaIterations::kColumn + n], A[m], B[n], C[m * WmmaIterations::kColumn + n]);
-      }
-    }  
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace gemm
-} // namespace cutlass
-
-#endif // if defined(CUTLASS_ARCH_WMMA_ENABLED)
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_with_reduction_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_with_reduction_tensor_op.h
deleted file mode 100755
index 67231d35a..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_with_reduction_tensor_op.h
+++ /dev/null
@@ -1,449 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing warp-level matrix multiply-accumulate operations targeting
-      Tensor Cores.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/platform/platform.h"
-
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-
-#include "cutlass/arch/memory_sm75.h"
-#include "cutlass/arch/mma_sm75.h"
-#include "cutlass/arch/mma_sm80.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/warp/mma.h"
-
-#include "cutlass/gemm/warp/mma_tensor_op_policy.h"
-#include "cutlass/gemm/warp/mma_tensor_op.h"
-#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h"
-#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace warp {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions.
-template <
-  /// Size of the Gemm problem - concept: gemm::GemmShape<>
-  typename Shape_,
-  /// Data type of A elements
-  typename ElementA_,
-  /// Layout of A matrix (concept: MatrixLayout)
-  typename LayoutA_,
-  /// Data type of B elements
-  typename ElementB_,
-  /// Layout of B matrix (concept: MatrixLayout)
-  typename LayoutB_,
-  /// Element type of C matrix
-  typename ElementC_,
-  /// Layout of C matrix (concept: MatrixLayout)
-  typename LayoutC_,
-  /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
-  typename Policy_,
-  /// Reduce operand A or B along K dimension
-  bool ReduceKForA_,
-  /// Number of partitions along K dimension
-  int PartitionsK_ = 1,
-  /// Store the accumulators in row major or column major.  Row major is used
-  /// when output layout is interleaved.
-  bool AccumulatorsInRowMajor = false,
-  /// Used for partial specialization
-  typename Enable = bool
->
-class MmaWithReductionTensorOp {
-public:
-  /// Shape of warp-level matrix operation (concept: GemmShape)
-  using Shape = Shape_;
-
-  /// Data type of multiplicand A
-  using ElementA = ElementA_;
-
-  /// Layout of multiplicand A
-  using LayoutA = LayoutA_;
-
-  /// Data type of multiplicand B
-  using ElementB = ElementB_;
-
-  /// Layout of multiplicand B
-  using LayoutB = LayoutB_;
-
-  /// Data type of accumulator matrix C
-  using ElementC = ElementC_;
-
-  /// Layout of accumulator matrix C
-  using LayoutC = LayoutC_;
-
-  /// Shape of the warp in units of thread (concept: MmaLanePolicySimt)
-  using Policy = Policy_;
-
-  /// Underlying matrix multiply operator (concept: arch::Mma)
-  using ArchMmaOperator = typename Policy::Operator;
-
-  /// Indicates math operator
-  using MathOperator = typename ArchMmaOperator::Operator;
-
-  /// Architecture tag from underlying instruction
-  using ArchTag = typename ArchMmaOperator::ArchTag;
-
-  /// Indicates class of matrix operator
-  using OperatorClass = arch::OpClassTensorOp;
-
-  /// Shape of underlying instruction
-  using InstructionShape = typename ArchMmaOperator::Shape;
-
-  /// Complex transform on A operand
-  static ComplexTransform const kTransformA = ComplexTransform::kNone;
-
-  /// Complex transform on B operand
-  static ComplexTransform const kTransformB = ComplexTransform::kNone;
-
-  /// Number of threads participating in warp-level matrix product
-  static int const kThreadCount = 32;
-
-  /// Number of partitions along K dimension
-  static int const kPartitionsK = PartitionsK_;
-
-  static bool const kReduceKForA = ReduceKForA_;
-
-  static_assert(platform::is_same<ElementA, cutlass::half_t>::value ||
-                platform::is_same<ElementA, cutlass::bfloat16_t>::value,
-                "ElementA needs to be fp16 or bf16.");
-
-  static_assert(platform::is_same<ElementB, cutlass::half_t>::value ||
-                platform::is_same<ElementB, cutlass::bfloat16_t>::value,
-                "ElementB needs to be fp16 or bf16.");
-
-  static_assert(platform::is_same<InstructionShape,
-                                  cutlass::gemm::GemmShape<16, 8, 16>>::value,
-                "Only supports 16x8x16 tensor core instruction.");
-
-  static_assert(!AccumulatorsInRowMajor,
-                "Only calls tensor core instructions in column major.");
-
-public:
-
-  /// Iterates over the A operand in memory
-  using IteratorA = MmaTensorOpMultiplicandTileIterator<
-     MatrixShape<Shape::kM, Shape::kK>, Operand::kA, ElementA, LayoutA,
-     MatrixShape<ArchMmaOperator::Shape::kM, ArchMmaOperator::Shape::kK>,
-     Policy::OpDelta::kRow, kThreadCount, kPartitionsK>;
-
-  /// Storage for A tile
-  using FragmentA = typename IteratorA::Fragment;
-
-  /// Storage for transformed A tile
-  using TransformedFragmentA =
-      Array<typename ArchMmaOperator::ElementA, FragmentA::kElements>;
-
-  /// Iterates over the B operand in memory
-  using IteratorB = MmaTensorOpMultiplicandTileIterator<
-      MatrixShape<Shape::kK, Shape::kN>, Operand::kB, ElementB, LayoutB,
-      MatrixShape<ArchMmaOperator::Shape::kK, ArchMmaOperator::Shape::kN>,
-      Policy::OpDelta::kRow, kThreadCount, kPartitionsK>;
-
-  /// Storage for B tile
-  using FragmentB = typename IteratorB::Fragment;
-
-  /// Storage for transformed B tile
-  using TransformedFragmentB =
-      Array<typename ArchMmaOperator::ElementB, FragmentB::kElements>;
-
-  /// Iterates over the C operand in memory
-  using IteratorC = MmaTensorOpAccumulatorTileIterator<
-     MatrixShape<Shape::kM, Shape::kN>, ElementC, LayoutC,
-     typename ArchMmaOperator::Shape, typename Policy::OpDelta>;
-
-  /// Storage for C tile
-  using FragmentC = typename IteratorC::Fragment;
-
-  /// Number of mma operations performed
-  using MmaIterations = MatrixShape<
-    (Shape::kM + ArchMmaOperator::Shape::kM - 1) / ArchMmaOperator::Shape::kM,
-    (Shape::kN + ArchMmaOperator::Shape::kN - 1) / ArchMmaOperator::Shape::kN
-  >;
-
-  using FragmentReduction = Array<ElementC, kReduceKForA ? (Shape::kM / 8) : (Shape::kN / 8)>;
-
-public:
-
-  /// Underlying matrix multiply operator (concept: arch::Mma)
-  ArchMmaOperator mma;
-
-public:
-
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_DEVICE
-  MmaWithReductionTensorOp() {}
-
-  /// Performs a warp-level matrix multiply-accumulate operation
-  CUTLASS_DEVICE
-  void operator()(
-    FragmentC &D,
-    TransformedFragmentA const &A,
-    TransformedFragmentB const &B,
-    FragmentC const &C,
-    FragmentReduction &gemm_k_reduction
-  ) const {
-
-    using MmaOperandA = typename ArchMmaOperator::FragmentA;
-    using MmaOperandB = typename ArchMmaOperator::FragmentB;
-    using MmaOperandC = typename ArchMmaOperator::FragmentC;
-
-    D = C;
-
-    [[maybe_unused]] MmaOperandA const *ptr_A = reinterpret_cast<MmaOperandA const *>(&A);
-    [[maybe_unused]] MmaOperandB const *ptr_B = reinterpret_cast<MmaOperandB const *>(&B);
-    [[maybe_unused]] MmaOperandC *ptr_D = reinterpret_cast<MmaOperandC *>(&D);
-
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)
-      assert(0);
-    #elif defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-      // Serpentine visitation order maximizing reuse of Ra
-      CUTLASS_PRAGMA_UNROLL
-      for (int m = 0; m < MmaIterations::kRow; ++m) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int n = 0; n < MmaIterations::kColumn; ++n) {
-
-          int n_serpentine = ((m % 2) ? (MmaIterations::kColumn - 1 - n) : n);
-
-          mma(ptr_D[m + n_serpentine * MmaIterations::kRow],
-              ptr_A[m],
-              ptr_B[n_serpentine],
-              ptr_D[m + n_serpentine * MmaIterations::kRow]);
-
-          if (!kReduceKForA && m == 0) {
-            #if 0
-            gemm_k_reduction[n_serpentine] += float(B[n_serpentine * 4]);
-            gemm_k_reduction[n_serpentine] += float(B[n_serpentine * 4 + 1]);
-            gemm_k_reduction[n_serpentine] += float(B[n_serpentine * 4 + 2]);
-            gemm_k_reduction[n_serpentine] += float(B[n_serpentine * 4 + 3]);
-            #else
-            uint32_t const *tmp = reinterpret_cast<uint32_t const *>(&B);
-
-            if (platform::is_same<ElementB, cutlass::half_t>::value) {
-              asm volatile(
-                "{\n\t"
-                " .reg .f16 low, high;\n\t"
-                " .reg .f32 tmp;\n\t"
-                " mov.b32 {low, high}, %1;\n\t"
-                " cvt.f32.f16 tmp, low;\n\t"
-                " add.f32 %0, tmp, %0;\n\t"
-                " cvt.f32.f16 tmp, high;\n\t"
-                " add.f32 %0, tmp, %0;\n\t"
-                " mov.b32 {low, high}, %2;\n\t"
-                " cvt.f32.f16 tmp, low;\n\t"
-                " add.f32 %0, tmp, %0;\n\t"
-                " cvt.f32.f16 tmp, high;\n\t"
-                " add.f32 %0, tmp, %0;\n\t"
-                "}\n\t"
-                : "+f"(gemm_k_reduction[n_serpentine])
-                : "r"(tmp[n_serpentine * 2]), "r"(tmp[n_serpentine * 2 + 1]));
-            } else if (platform::is_same<ElementB, cutlass::bfloat16_t>::value) {
-              asm volatile(
-                "{\n\t"
-                " .reg .f32 tmp;\n\t"
-                " shl.b32 tmp, %1, 16;\n\t"
-                " add.f32 %0, tmp, %0;\n\t"
-                " and.b32 tmp, %1, 0xffff0000;\n\t"
-                " add.f32 %0, tmp, %0;\n\t"
-                " shl.b32 tmp, %2, 16;\n\t"
-                " add.f32 %0, tmp, %0;\n\t"
-                " and.b32 tmp, %2, 0xffff0000;\n\t"
-                " add.f32 %0, tmp, %0;\n\t"
-                "}\n\t"
-                : "+f"(gemm_k_reduction[n_serpentine])
-              : "r"(tmp[n_serpentine * 2]), "r"(tmp[n_serpentine * 2 + 1]));
-            } else {
-                assert(0);
-            }
-            #endif
-          }
-
-          if (kReduceKForA && (n == 0)) {
-            #if 0
-            gemm_k_reduction[m * 2] += float(A[m * 8]);
-            gemm_k_reduction[m * 2] += float(A[m * 8 + 1]);
-            gemm_k_reduction[m * 2] += float(A[m * 8 + 4]);
-            gemm_k_reduction[m * 2] += float(A[m * 8 + 5]);
-
-            gemm_k_reduction[m * 2 + 1] += float(A[m * 8 + 2]);
-            gemm_k_reduction[m * 2 + 1] += float(A[m * 8 + 3]);
-            gemm_k_reduction[m * 2 + 1] += float(A[m * 8 + 6]);
-            gemm_k_reduction[m * 2 + 1] += float(A[m * 8 + 7]);
-            #else
-            uint32_t const *tmp = reinterpret_cast<uint32_t const *>(&A);
-
-            if (platform::is_same<ElementA, cutlass::half_t>::value) {
-              asm volatile(
-                "{\n\t"
-                " .reg .f16 low, high;\n\t"
-                " .reg .f32 tmp;\n\t"
-                " mov.b32 {low, high}, %2;\n\t"
-                " cvt.f32.f16 tmp, low;\n\t"
-                " add.f32 %0, tmp, %0;\n\t"
-                " cvt.f32.f16 tmp, high;\n\t"
-                " add.f32 %0, tmp, %0;\n\t"
-                " mov.b32 {low, high}, %3;\n\t"
-                " cvt.f32.f16 tmp, low;\n\t"
-                " add.f32 %1, tmp, %1;\n\t"
-                " cvt.f32.f16 tmp, high;\n\t"
-                " add.f32 %1, tmp, %1;\n\t"
-                " mov.b32 {low, high}, %4;\n\t"
-                " cvt.f32.f16 tmp, low;\n\t"
-                " add.f32 %0, tmp, %0;\n\t"
-                " cvt.f32.f16 tmp, high;\n\t"
-                " add.f32 %0, tmp, %0;\n\t"
-                " mov.b32 {low, high}, %5;\n\t"
-                " cvt.f32.f16 tmp, low;\n\t"
-                " add.f32 %1, tmp, %1;\n\t"
-                " cvt.f32.f16 tmp, high;\n\t"
-                " add.f32 %1, tmp, %1;\n\t"
-                "}\n\t"
-                : "+f"(gemm_k_reduction[m * 2]), "+f"(gemm_k_reduction[m * 2 + 1])
-                : "r"(tmp[m * 4]), "r"(tmp[m * 4 + 1]),"r"(tmp[m * 4 + 2]), "r"(tmp[m * 4 + 3]));
-
-            } else if (platform::is_same<ElementA, cutlass::bfloat16_t>::value) {
-
-              asm volatile(
-                "{\n\t"
-                " .reg .f32 tmp;\n\t"
-                " shl.b32 tmp, %2, 16;\n\t"
-                " add.f32 %0, tmp, %0;\n\t"
-                " and.b32 tmp, %2, 0xffff0000;\n\t"
-                " add.f32 %0, tmp, %0;\n\t"
-                " shl.b32 tmp, %3, 16;\n\t"
-                " add.f32 %1, tmp, %1;\n\t"
-                " and.b32 tmp, %3, 0xffff0000;\n\t"
-                " add.f32 %1, tmp, %1;\n\t"
-                " shl.b32 tmp, %4, 16;\n\t"
-                " add.f32 %0, tmp, %0;\n\t"
-                " and.b32 tmp, %4, 0xffff0000;\n\t"
-                " add.f32 %0, tmp, %0;\n\t"
-                " shl.b32 tmp, %5, 16;\n\t"
-                " add.f32 %1, tmp, %1;\n\t"
-                " and.b32 tmp, %5, 0xffff0000;\n\t"
-                " add.f32 %1, tmp, %1;\n\t"
-                "}\n\t"
-                : "+f"(gemm_k_reduction[m * 2]), "+f"(gemm_k_reduction[m * 2 + 1])
-                : "r"(tmp[m * 4]), "r"(tmp[m * 4 + 1]),"r"(tmp[m * 4 + 2]), "r"(tmp[m * 4 + 3]));
-
-            } else {
-              assert(0);
-            }
-            #endif
-          }
-        }
-      }
-    #else
-      assert(0);
-    #endif
-  }
-
-  /// Transform the mma operands to the required types
-  CUTLASS_DEVICE
-  void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B,
-                 FragmentA const &A, FragmentB const &B) const {
-
-    //
-    // Define conversions from source type to instruction type
-    //
-    FloatRoundStyle const kRoundA =
-        PreferredRoundingMode<typename ArchMmaOperator::ElementA,
-                              ElementA>::kRound;
-    FloatRoundStyle const kRoundB =
-        PreferredRoundingMode<typename ArchMmaOperator::ElementB,
-                              ElementB>::kRound;
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)
-      detail::ConvertAndPack<typename ArchMmaOperator::ElementA, ElementA,
-                            FragmentA::kElements, kRoundA>
-          convert_A;
-      NumericArrayConverter<typename ArchMmaOperator::ElementB, ElementB,
-                            FragmentB::kElements / 2, kRoundB>
-          convert_B;
-      Array<ElementB, FragmentB::kElements / 2> const *ptr_B =
-          reinterpret_cast<Array<ElementB, FragmentB::kElements / 2> const *>(&B);
-      Array<typename ArchMmaOperator::ElementB, FragmentB::kElements / 2> *
-          ptr_dst_B = reinterpret_cast<Array<typename ArchMmaOperator::ElementB,
-                                             FragmentB::kElements / 2> *>(&dst_B);
-
-      dst_A = convert_A(A);
-
-      ptr_dst_B[0] = convert_B(ptr_B[0]);
-      ptr_dst_B[1] = convert_B(ptr_B[1]);
-
-    #elif defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-      detail::ConvertAndPack<typename ArchMmaOperator::ElementA, ElementA,
-                            FragmentA::kElements / 2, kRoundA>
-          convert_A;
-      NumericArrayConverter<typename ArchMmaOperator::ElementB, ElementB,
-                            FragmentB::kElements, kRoundB>
-          convert_B;
-      Array<ElementA, FragmentA::kElements / 2> const *ptr_A =
-          reinterpret_cast<Array<ElementA, FragmentA::kElements / 2> const *>(&A);
-      Array<typename ArchMmaOperator::ElementA, FragmentA::kElements / 2> *
-          ptr_dst_A = reinterpret_cast<Array<typename ArchMmaOperator::ElementA,
-                                             FragmentA::kElements / 2> *>(&dst_A);
-
-      dst_B = convert_B(B);
-
-      ptr_dst_A[0] = convert_A(ptr_A[0]);
-      ptr_dst_A[1] = convert_A(ptr_A[1]);
-    #else
-      assert(0);
-    #endif
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/scale_bias_tile_iterator.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/scale_bias_tile_iterator.h
deleted file mode 100755
index 7d74ac8cf..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/scale_bias_tile_iterator.h
+++ /dev/null
@@ -1,572 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Defines iterators used by warp-level loading scale and bias vectors.
-   Every scale/bias data only needs to be loaded once for every channel.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/array.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/matrix_shape.h"
-
-#include "cutlass/arch/memory_sm75.h"
-#include "cutlass/gemm/gemm.h"
-
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor_op_multiplicand_sm75.h"
-
-#include "cutlass/platform/platform.h"
-#include "cutlass/fast_math.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace warp {
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Data type of A elements
-    typename Element_,
-    /// Layout of operand
-    typename Layout_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Policy of the details of LDSM shape and iterations
-    typename Policy_,
-    /// Number of threads participating in one matrix operation
-    int Threads,
-    /// Number of partitions along K dimension
-    int PartitionsK_ = 1>
-class ScaleBiasTileIterator;
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// This tile iterator is specialized for 32-thread TensorOps. It uses LDSM to
-/// load from shared memory and therefore must be initialized with a TensorRef
-/// to shared memory.
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: PitchLinearShape)
-    typename Shape_,
-    /// Data type of elements
-    typename Element_,
-    /// Shape of one matrix product operation (concept: PitchLinearShape)
-    typename InstructionShape_,
-    /// Policy of the details of LDSM shape and iterations
-    typename Policy_,
-    /// Number of partitions along K dimension
-    int PartitionsK_>
-class ScaleBiasTileIterator<Shape_, Element_, cutlass::layout::PitchLinear,
-                             InstructionShape_, Policy_, 32, PartitionsK_> {
- public:
-  /// Shape of tile to load (concept: PitchLinearShape)
-  using Shape = Shape_;
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::PitchLinear;
-
-  /// Shape of one matrix product operation (concept: GemmShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// Number of partitions along K dimension
-  static int const kPartitionsK = PartitionsK_;
-
-  /// Number of partitions along K dimension
-  static int const kElementsPerAccess = 128 / sizeof_bits<Element>::value;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Internal structure of iterator - made public to enable introspection
-  using Policy = Policy_;
-
- private:
-
-  /// Pointer type used for accesses
-  using AccessType = Array<Element, kElementsPerAccess>;
-
- public:
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = Array<Element, 2 * Policy::kLdsmOpInner *
-                                      InstructionShape::kContiguous / kThreads>;
-
- private:
-
-  /// Shared memory base pointers - not advanced
-  AccessType const *pointer_;
-
-  /// Byte offset incremented as iterator advances
-  Index byte_offset_;
-
-  /// Internal counter used to determine when to increment byte offset and when
-  /// to XOR it
-  int k_group_idx_;
-
- public:
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  ScaleBiasTileIterator()
-      : pointer_(nullptr),
-        byte_offset_(0),
-        k_group_idx_(0) {}
-
-  /// Constructor from TensorRef
-  CUTLASS_DEVICE
-  ScaleBiasTileIterator(TensorRef const &ref_scale_bias,
-                         int lane_id)
-      : byte_offset_(0), k_group_idx_(0) {
-    /// 16816 only
-    pointer_ = reinterpret_cast<AccessType const *>(ref_scale_bias.data()) +
-               ((lane_id >> 3) & 1) * Shape::kContiguous / kElementsPerAccess +
-               (lane_id >> 4);
-  }
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_DEVICE
-  ScaleBiasTileIterator &add_pointer_offset(LongIndex offset) {
-    byte_offset_ += offset * sizeof_bits<Element>::value / 8;
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole
-  /// tiles
-  CUTLASS_DEVICE
-  ScaleBiasTileIterator &add_tile_offset(
-      TensorCoord const &tile_offset) {
-    int whole_tiles = tile_offset.contiguous() / Policy::kGroupsPerTile;
-    int k_groups_delta = tile_offset.contiguous() % Policy::kGroupsPerTile;
-
-    byte_offset_ += k_groups_delta * sizeof_bits<Element>::value *
-                    kElementsPerAccess * Policy::LdsmShape::kContiguous / 8;
-
-    // Multiply by 2 because scale and bias belonging to the same stage are next
-    // to each other in the shared memory.
-    pointer_ += (2 * whole_tiles * Shape::kContiguous / kElementsPerAccess);
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_DEVICE
-  ScaleBiasTileIterator &operator++() {
-    byte_offset_ += Policy::LdsmShape::kContiguous *
-                    sizeof_bits<Element>::value * kElementsPerAccess / 8;
-
-    k_group_idx_++;
-
-    if (k_group_idx_ == (Policy::kGroupsPerTile / kPartitionsK)) {
-      k_group_idx_ = 0;
-      byte_offset_ -= (Policy::kGroupsPerTile / kPartitionsK) *
-                      Policy::LdsmShape::kContiguous *
-                      sizeof_bits<Element>::value * kElementsPerAccess / 8;
-      add_tile_offset({Policy::kGroupsPerTile, 0});
-    }
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  ScaleBiasTileIterator &operator--() { assert(0); }
-
-  ///< advances in units of whole tiles along the logical coordinate space of
-  ///< the tensor
-  CUTLASS_DEVICE
-  ScaleBiasTileIterator &operator+=(
-      TensorCoord const &tile_offset) {
-    add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of
-  ///< the tensor
-  CUTLASS_DEVICE
-  ScaleBiasTileIterator &operator-=(
-      TensorCoord const &tile_offset) {
-    add_tile_offset(-tile_offset);
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const { load_with_byte_offset(frag, 0); }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset in units of bytes
-      Index byte_offset) const {
-    Array<unsigned, 4> *fetch_ptr =
-        reinterpret_cast<Array<unsigned, 4> *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < 1; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < Policy::LdsmIterations::kContiguous; ++c) {
-        int access_idx = c + s * Policy::LdsmIterations::kContiguous;
-
-        AccessType const *source_ptr =
-            pointer_ + Policy::LdsmShape::kContiguous * c;
-
-        char const *source_byte_ptr =
-            reinterpret_cast<char const *>(source_ptr) + byte_offset +
-            byte_offset_;
-
-        cutlass::arch::ldsm<layout::RowMajor, 4>(
-            fetch_ptr[access_idx], source_byte_ptr);
-      }
-    }
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-    load_with_byte_offset(frag, tile_offset, 0);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index byte_offset) const {
-    Index pointer_offset = tile_offset.contiguous() *
-                               InstructionShape::kContiguous /
-                               kElementsPerAccess;
-
-    byte_offset += sizeof_bits<AccessType>::value * pointer_offset / 8;
-
-    load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    k_group_idx_ = k_group % (Policy::kGroupsPerTile / kPartitionsK);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// This tile iterator is specialized for 32-thread TensorOps. It uses LDSM to
-/// load from shared memory and therefore must be initialized with a TensorRef
-/// to shared memory.
-///
-/// Satisfies:
-///   ReadableRandomAccessContiguousTileIteratorConcept
-///
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Data type of elements
-    typename Element_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Policy of the details of LDSM shape and iterations
-    typename Policy_,
-    /// Number of partitions along K dimension
-    int PartitionsK_>
-class ScaleBiasTileIterator<Shape_, Element_, cutlass::layout::RowMajor,
-                             InstructionShape_, Policy_, 32, PartitionsK_> {
- public:
-  /// Shape of tile to load (concept: PitchLinearShape)
-  using Shape = Shape_;
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::RowMajor;
-
-  /// Shape of one matrix product operation (concept: MatrixShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Internal structure of iterator - made public to enable introspection
-  using Policy = Policy_;
-
-  /// Underlying tile iterator implementation
-  using Base = ScaleBiasTileIterator<
-      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
-      layout::PitchLinear,
-      layout::PitchLinearShape<InstructionShape::kColumn,
-                               InstructionShape::kRow>,
-      Policy, kThreads, PartitionsK_>;
-
- public:
-  //
-  // Derived quantities
-  //
-
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = typename Base::Fragment;
-
- private:
-  /// Underlying tile iterator
-  Base iterator_;
-
- public:
-  /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  ScaleBiasTileIterator() {}
-
-  /// Constructor from TensorRef
-  CUTLASS_HOST_DEVICE
-  ScaleBiasTileIterator(TensorRef const &ref_scale_bias, int lane_id)
-      : iterator_({ref_scale_bias.data(), ref_scale_bias.stride()}, lane_id) {}
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_HOST_DEVICE
-  ScaleBiasTileIterator &add_pointer_offset(LongIndex offset) {
-    iterator_.add_pointer_offset(offset);
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole
-  /// tiles
-  CUTLASS_HOST_DEVICE
-  ScaleBiasTileIterator &add_tile_offset(
-      TensorCoord const &tile_offset) {
-    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole
-  /// tiles
-  CUTLASS_DEVICE
-  ScaleBiasTileIterator &add_tile_offset_negative(
-      TensorCoord const &tile_offset) {
-    iterator_.add_tile_offset_negative({tile_offset.column(), tile_offset.row()});
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  ScaleBiasTileIterator &operator++() {
-    ++iterator_;
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_HOST_DEVICE
-  ScaleBiasTileIterator &operator--() {
-    --iterator_;
-
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of
-  ///< the tensor
-  CUTLASS_DEVICE
-  ScaleBiasTileIterator &operator+=(
-      TensorCoord const &tile_offset) {
-    add_tile_offset(PitchLinearCoord(tile_offset.column(), tile_offset.row()));
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of
-  ///< the tensor
-  CUTLASS_DEVICE
-  ScaleBiasTileIterator &operator-=(
-      TensorCoord const &tile_offset) {
-    add_tile_offset(-PitchLinearCoord(tile_offset.column(), tile_offset.row()));
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const { iterator_.load(frag); }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index byte_offset) const {
-    iterator_.load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-    assert(0);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-    assert(0);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index byte_offset) const {
-    iterator_.load_with_byte_offset(
-        frag, {tile_offset.strided(), tile_offset.contiguous()}, byte_offset);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    iterator_.set_kgroup_index(k_group); 
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace gemm 
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/softmax_scale_bias_transform.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/softmax_scale_bias_transform.h
deleted file mode 100755
index d8d99d675..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/softmax_scale_bias_transform.h
+++ /dev/null
@@ -1,117 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing warp-level per-channel softmax before
-   matrix multiply-accumulate operations targeting Tensor Cores.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/platform/platform.h"
-
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-
-#include "cutlass/arch/memory_sm75.h"
-#include "cutlass/arch/mma_sm75.h"
-#include "cutlass/arch/mma_sm80.h"
-
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/warp/mma.h"
-
-#include "cutlass/gemm/warp/mma_tensor_op_policy.h"
-
-#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h"
-#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace warp {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename FragmentActivations, typename FragmentNormSum>
-struct SoftmaxScaleBiasTransform {
-
-  using T = typename FragmentActivations::Element;
-
-  static int const NumActivations = FragmentActivations::kElements;
-  static int const NumNormSum = FragmentNormSum::kElements;
-  static int const MmaElements = 2;
-  // One element has one scale and one bias
-  static int const MmaScaleBiasPair = 2;
-  // 16816 has 2 columns and 2 rows
-  static int const MmaCols = 2;
-  static int const MmaRows = 2;
-
-  using MmaOperand = Array<T, MmaElements>;
-  using NormSumOperand = Array<__half2, MmaScaleBiasPair>;
-
-  CUTLASS_DEVICE
-  void transform(MmaOperand &activations,
-                 NormSumOperand const &norm_sum) {
-
-    __half2* packed_activations = reinterpret_cast<__half2*>(&activations);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < MmaElements / 2; ++i) {
-      __half2 out = ::h2exp(__hsub2(packed_activations[i], norm_sum[2*i]));
-      packed_activations[i] = __hmul2(out, norm_sum[2*i + 1]);
-    }
-  }
-
-  CUTLASS_DEVICE
-  void operator()(FragmentActivations &activations,
-                  FragmentNormSum const &norm_sum) {
-    MmaOperand *ptr_activations = reinterpret_cast<MmaOperand *>(&activations);
-    NormSumOperand const *ptr_norm_sum =
-        reinterpret_cast<NormSumOperand const *>(&norm_sum);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < (NumActivations / MmaElements); ++i) {
-      transform(ptr_activations[i],
-                ptr_norm_sum[i / (MmaCols * MmaRows) * MmaRows + i % MmaRows]);
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/tile_iterator_planar_complex.h b/lightllm-kernel/cutlass/include/cutlass/gemm/warp/tile_iterator_planar_complex.h
deleted file mode 100755
index 42c6728bc..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm/warp/tile_iterator_planar_complex.h
+++ /dev/null
@@ -1,250 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing warp-level matrix multiply-accumulate operations.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/gemm/gemm.h"
-
-#include "cutlass/array_planar_complex.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace gemm {
-namespace warp {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename TileIterator_>
-class TileIteratorPlanarComplex {
-public:
-
-  /// Underlying iterator over real-valued tiles
-  using TileIterator = TileIterator_;
-
-  /// Underlying element type
-  using Element = typename TileIterator::Element;
-
-  /// Underlying layout type
-  using Layout = typename TileIterator::Layout;
-
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = typename TileIterator::TensorRef;
-
-  /// Index type
-  using Index = typename TensorRef::Index;
-
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-
-  /// Planar complex fragment
-  using Fragment = ArrayPlanarComplex<Element, TileIterator::Fragment::kElements>;
-
-public:
-
-  /// Underlying tile iterator
-  TileIterator tile_iterator_;
-
-  /// Offset (in units of bytes) to the imaginary part of the planar complex matrix
-  LongIndex imaginary_offset_;
-
-public:
-    /// Default ctor constructs null iterator
-  CUTLASS_HOST_DEVICE
-  TileIteratorPlanarComplex(): imaginary_offset_(0) { }
-
-  /// Constructor from TensorRef
-  CUTLASS_DEVICE
-  TileIteratorPlanarComplex(
-    TensorRef const &ref, 
-    int lane_id,
-    LongIndex imaginary_offset
-  ):
-    tile_iterator_(ref, lane_id),
-    imaginary_offset_((imaginary_offset * sizeof_bits<Element>::value) / 8) { }
-
-
-  /// Adds a pointer offset to internal pointer(s) to advance through memory
-  CUTLASS_DEVICE
-  TileIteratorPlanarComplex &add_pointer_offset(LongIndex offset) {
-
-    tile_iterator_.add_pointer_offset(offset);
-
-    return *this;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_HOST_DEVICE
-  TileIteratorPlanarComplex &add_tile_offset(TensorCoord const &tile_offset) {
-
-    tile_iterator_.add_tile_offset(tile_offset);
-
-    return *this;
-  }
-
-  /// Advances the iterator along the advance dimension
-  CUTLASS_DEVICE
-  TileIteratorPlanarComplex & operator++() {
-    ++tile_iterator_;
-    return *this;
-  }
-
-  //
-  // WIP
-  //
-
-  /// Advances the iterator along the opposite of the advance dimension
-  CUTLASS_HOST_DEVICE
-  TileIteratorPlanarComplex & operator--() {
-    --tile_iterator_;
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  TileIteratorPlanarComplex & operator+=(TensorCoord const &tile_offset) {
-    tile_iterator_.add_tile_offset(tile_offset);
-    return *this;
-  }
-
-  ///< advances in units of whole tiles along the logical coordinate space of the tensor
-  CUTLASS_DEVICE
-  TileIteratorPlanarComplex & operator-=(TensorCoord const &tile_offset) {
-    tile_iterator_.add_tile_offset(-tile_offset);
-    return *this;
-  }
-
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-
-    tile_iterator_.load_with_byte_offset(frag.real, 0);
-    tile_iterator_.load_with_byte_offset(frag.imag, imaginary_offset_);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset in units of bytes
-      Index byte_offset) const {
-
-    tile_iterator_.load_with_byte_offset(frag.real, byte_offset);
-    tile_iterator_.load_with_byte_offset(frag.imag, byte_offset + imaginary_offset_);
-  }
-
-  /// Loads a fragment from memory with additional logical offset
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a linear offset
-      Index pointer_offset) const {
-
-    Index byte_offset = (pointer_offset * sizeof_bits<Element>::value)/8;
-
-    tile_iterator_.load_with_byte_offset(frag.real, byte_offset);
-    tile_iterator_.load_with_byte_offset(frag.imag, byte_offset + imaginary_offset_);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset) const {
-
-    tile_iterator_.load_with_byte_offset(frag.real, tile_offset, 0);
-    tile_iterator_.load_with_byte_offset(frag.imag, tile_offset, imaginary_offset_);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index pointer_offset) const {
-
-    Index byte_offset = (pointer_offset * sizeof_bits<Element>::value)/8;
-
-    tile_iterator_.load_with_byte_offset(frag.real, tile_offset, byte_offset);
-    tile_iterator_.load_with_byte_offset(frag.real, tile_offset, byte_offset + imaginary_offset_);
-  }
-
-  /// Loads a fragment from memory with logical offset in units of whole tiles.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(
-      /// fragment to load from the tensor
-      Fragment &frag,
-      /// loads a tile with a logical offset in units of whole tiles
-      TensorCoord const &tile_offset,
-      /// loads a tile with a logical offset AND a pointer offset
-      Index byte_offset) const {
-
-    tile_iterator_.load_with_byte_offset(frag.real, tile_offset, byte_offset);
-    tile_iterator_.load_with_byte_offset(frag.imag, tile_offset, byte_offset + imaginary_offset_);
-  }
-
-  /// Notify the iterator which k-group it is currently pointing to.
-  ///
-  /// This does not advance the iterator. Rather, it overrides its internal
-  /// tracking with constant-valued k-group index to enable the compiler to
-  /// fold constants and achieve more efficient code.
-  ///
-  /// This is used by some nontrivial permuted layouts.
-  CUTLASS_DEVICE
-  void set_kgroup_index(int k_group) {
-    tile_iterator_.set_kgroup_index(k_group);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm_coord.h b/lightllm-kernel/cutlass/include/cutlass/gemm_coord.h
deleted file mode 100755
index 61b97a1e1..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm_coord.h
+++ /dev/null
@@ -1,394 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#pragma once
-
-#include "cutlass/coord.h"
-
-namespace cutlass {
-namespace gemm {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Shape of a matrix multiply-add operation
-template <
-  /// Rows of matrix product
-  int M = 1,
-  /// Columns of matrix product
-  int N = 1,
-  /// Inner dimension of matrix product
-  int K = 1
->
-struct GemmShape {
-  static int const kM = M;
-  static int const kN = N;
-  static int const kK = K;
-
-  static int const kMN = M * N;
-  static int const kMK = M * K;
-  static int const kKN = N * K;
-  static int const kMNK = M * N * K;
-
-  static int const kCount = kMNK;
-
-  //
-  // Static member functions
-  //
-
-  /// Returns a Coord object
-  CUTLASS_HOST_DEVICE
-  static Coord<3> toCoord() {
-    return make_Coord(kM, kN, kK);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Type alias of the transpose of a GemmShape
-template <
-  /// concept: GemmShape
-  typename Shape
->
-using GemmShapeTranspose = GemmShape<Shape::kN, Shape::kM, Shape::kK>;
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// GemmCoord is a structure derived from Coord<3> that specifies a location within the
-/// coordinate space of a GEMM problem.
-struct GemmCoord : public Coord<3, int> {
-
-  /// Integer-valued index
-  typedef int Index;
-
-  /// Base type is a Coord of rank=3
-  typedef Coord<3, Index> Base;
-
-  /// GEMM M dimension - rows of the output C matrix
-  static int const kM = 0;
-
-  /// GEMM N dimension - columns of the output C matrix
-  static int const kN = 1;
-
-  /// GEMM K dimension - inner dimension of the GEMM problem
-  static int const kK = 2;
-
-  //
-  // Methods
-  //
-
-  /// Default ctor
-  CUTLASS_HOST_DEVICE
-  GemmCoord() { }
-
-  /// Constructs from Coord<3> and a batch
-  CUTLASS_HOST_DEVICE
-  GemmCoord(Coord<3, Index> const& coord): Base(make_Coord(coord[0], coord[1], coord[2])) { }
-
-  /// Helper to construct from a K, N, M, batch variables
-  CUTLASS_HOST_DEVICE
-  GemmCoord(Index m, Index n, Index k): Base(make_Coord(m, n, k)) { }
-
-  /// Returns the GEMM M coordinate
-  CUTLASS_HOST_DEVICE
-  Index const&  m() const { return this->at(kM); }
-
-  /// Returns reference to the GEMM M coordinate
-  CUTLASS_HOST_DEVICE
-  Index & m() { return this->at(kM); }
-
-  /// Returns the GEMM N coordinate
-  CUTLASS_HOST_DEVICE
-  Index const&  n() const { return this->at(kN); }
-
-  /// Returns reference to the GEMM N coordinate
-  CUTLASS_HOST_DEVICE
-  Index & n() { return this->at(kN); }
-
-  /// Returns the GEMM K coordinate
-  CUTLASS_HOST_DEVICE
-  Index const&  k() const { return this->at(kK); }
-
-  /// Returns reference to the GEMM K coordinate
-  CUTLASS_HOST_DEVICE
-  Index & k() { return this->at(kK); }
-
-  /// Obtains a Coord<3> from GemmCoord
-  CUTLASS_HOST_DEVICE
-  Coord<3> mnk() const {
-    return make_Coord(m(), n(), k());
-  }
-
-  /// Obtains a Coord<3> from GemmCoord
-  CUTLASS_HOST_DEVICE
-  Coord<3> knm() const {
-    return make_Coord(k(), n(), m());
-  }
-
-  /// Obtains a Coord<2> from GemmCoord
-  CUTLASS_HOST_DEVICE
-  Coord<2> nm() const {
-    return make_Coord(n(), m());
-  }
-
-  /// Obtains a Coord<2> from GemmCoord
-  CUTLASS_HOST_DEVICE
-  Coord<2> mn() const {
-    return make_Coord(m(), n());
-  }
-
-  /// Obtains a Coord<2> from GemmCoord
-  CUTLASS_HOST_DEVICE
-  Coord<2> mk() const {
-    return make_Coord(m(), k());
-  }
-
-  /// Obtains a Coord<2> from GemmCoord
-  CUTLASS_HOST_DEVICE
-  Coord<2> km() const {
-    return make_Coord(k(), m());
-  }
-
-  /// Obtains a Coord<2> from GemmCoord
-  CUTLASS_HOST_DEVICE
-  Coord<2> nk() const {
-    return make_Coord(n(), k());
-  }
-
-  /// Obtains a Coord<2> from GemmCoord
-  CUTLASS_HOST_DEVICE
-  Coord<2> kn() const {
-    return make_Coord(k(), n());
-  }
-
-  //
-  // Coord operators
-  //
-
-  /// Element-wise addition
-  CUTLASS_HOST_DEVICE
-  GemmCoord operator+(Base const& b) const {
-    return GemmCoord(Base::operator+(b));
-  }
-
-  /// Element-wise subtraction
-  CUTLASS_HOST_DEVICE
-  GemmCoord operator-(Base const& b) const {
-    return GemmCoord(Base::operator-(b));
-  }
-
-  /// Element-wise multiplication
-  CUTLASS_HOST_DEVICE
-  GemmCoord operator*(Base const& b) const {
-    return GemmCoord(Base::operator*(b));
-  }
-
-  /// Element-wise division
-  CUTLASS_HOST_DEVICE
-  GemmCoord operator/(Base const& b) const {
-    return GemmCoord(Base::operator/(b));
-  }
-
-  /// In-place addition
-  CUTLASS_HOST_DEVICE
-  GemmCoord& operator+=(Base const& b) {
-    Base::operator+=(b);
-    return *this;
-  }
-
-  /// In-place subtraction
-  CUTLASS_HOST_DEVICE
-  GemmCoord& operator-=(Base const& b) {
-    Base::operator-=(b);
-    return *this;
-  }
-
-  /// In-place multiplication
-  CUTLASS_HOST_DEVICE
-  GemmCoord& operator*=(Base const& b) {
-    Base::operator*=(b);
-    return *this;
-  }
-
-  /// In-place division
-  CUTLASS_HOST_DEVICE
-  GemmCoord& operator/=(Base const& b) {
-    Base::operator/=(b);
-    return *this;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// BatchedGemmCoord is a structure derived from Coord<4> that specifies a location within the
-/// coordinate space of a batched GEMM problem.
-struct BatchedGemmCoord : public Coord<4, int> {
-
-  /// Integer-valued index
-  typedef int Index;
-
-  /// Base type is a Coord of rank=4
-  typedef Coord<4, Index> Base;
-
-  /// GEMM M dimension - rows of the output C matrix
-  static int const kM = 0;
-
-  /// GEMM N dimension - columns of the output C matrix
-  static int const kN = 1;
-
-  /// GEMM K dimension - inner dimension of the GEMM problem
-  static int const kK = 2;
-
-  /// GEMM Batch dimension - inner dimension of the GEMM problem
-  static int const kBatch = 3;
-
-  //
-  // Methods
-  //
-
-  /// Default ctor
-  CUTLASS_HOST_DEVICE
-  BatchedGemmCoord() { }
-
-  /// Constructs from Coord<4>
-  CUTLASS_HOST_DEVICE
-  BatchedGemmCoord(Base const& coord): Base(coord) { }
-
-  /// Helper to construct from a K, N, M, and batch variables
-  CUTLASS_HOST_DEVICE
-  BatchedGemmCoord(Index m, Index n, Index k, Index b): Base(make_Coord(m, n, k, b)) { }
-
-  /// Returns the GEMM M coordinate
-  CUTLASS_HOST_DEVICE
-  Index const&  m() const { return this->at(kM); }
-
-  /// Returns reference to the GEMM M coordinate
-  CUTLASS_HOST_DEVICE
-  Index & m() { return this->at(kM); }
-
-  /// Returns the GEMM N coordinate
-  CUTLASS_HOST_DEVICE
-  Index const&  n() const { return this->at(kN); }
-
-  /// Returns reference to the GEMM N coordinate
-  CUTLASS_HOST_DEVICE
-  Index & n() { return this->at(kN); }
-
-  /// Returns the GEMM K coordinate
-  CUTLASS_HOST_DEVICE
-  Index const&  k() const { return this->at(kK); }
-
-  /// Returns reference to the GEMM K coordinate
-  CUTLASS_HOST_DEVICE
-  Index & k() { return this->at(kK); }
-
-  /// Returns the GEMM batch coordinate
-  CUTLASS_HOST_DEVICE
-  Index const&  batch() const { return this->at(kBatch); }
-
-  /// Returns reference to the GEMM batch coordinate
-  CUTLASS_HOST_DEVICE
-  Index & batch() { return this->at(kBatch); }
-
-  /// Obtains a GemmCoord from BatchedGemmCoord
-  CUTLASS_HOST_DEVICE
-  GemmCoord mnk() const {
-    return GemmCoord(m(), n(), k());
-  }
-
-  /// Obtains a Coord<4> from BatchedGemmCoord
-  CUTLASS_HOST_DEVICE
-  Coord<4> mnkb() const {
-    return make_Coord(m(), n(), k(), batch());
-  }
-
-  //
-  // Coord operators
-  //
-
-  /// Element-wise addition
-  CUTLASS_HOST_DEVICE
-  BatchedGemmCoord operator+(Base const& b) const {
-    return BatchedGemmCoord(Base::operator+(b));
-  }
-
-  /// Element-wise subtraction
-  CUTLASS_HOST_DEVICE
-  BatchedGemmCoord operator-(Base const& b) const {
-    return BatchedGemmCoord(Base::operator-(b));
-  }
-
-  /// Element-wise multiplication
-  CUTLASS_HOST_DEVICE
-  BatchedGemmCoord operator*(Base const& b) const {
-    return BatchedGemmCoord(Base::operator*(b));
-  }
-
-  /// Element-wise division
-  CUTLASS_HOST_DEVICE
-  BatchedGemmCoord operator/(Base const& b) const {
-    return BatchedGemmCoord(Base::operator/(b));
-  }
-
-  /// In-place addition
-  CUTLASS_HOST_DEVICE
-  BatchedGemmCoord& operator+=(Base const& b) {
-    Base::operator+=(b);
-    return *this;
-  }
-
-  /// In-place subtraction
-  CUTLASS_HOST_DEVICE
-  BatchedGemmCoord& operator-=(Base const& b) {
-    Base::operator-=(b);
-    return *this;
-  }
-
-  /// In-place multiplication
-  CUTLASS_HOST_DEVICE
-  BatchedGemmCoord& operator*=(Base const& b) {
-    Base::operator*=(b);
-    return *this;
-  }
-
-  /// In-place division
-  CUTLASS_HOST_DEVICE
-  BatchedGemmCoord& operator/=(Base const& b) {
-    Base::operator/=(b);
-    return *this;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/gemm_coord.hpp b/lightllm-kernel/cutlass/include/cutlass/gemm_coord.hpp
deleted file mode 100755
index a979241ef..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/gemm_coord.hpp
+++ /dev/null
@@ -1,66 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Utilities to convert a CuTe tuple to a GemmCoord or BatchedGemmCoord
-*/
-
-#pragma once
-
-#include "cute/layout.hpp"
-#include "cutlass/gemm_coord.h"
-
-namespace cutlass {
-namespace gemm {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <class Tuple>
-CUTLASS_HOST_DEVICE
-auto
-to_gemm_coord(Tuple tuple) {
-  static_assert(cute::rank(tuple) <= 4, "Can only convert tuples of rank <= 4.");
-
-  if constexpr (cute::rank(tuple) <= 3) {
-    auto tuple_mnk = cute::append<3>(tuple, cute::Int<0>{});
-    return GemmCoord(cute::size<0>(tuple_mnk), cute::size<1>(tuple_mnk), cute::size<2>(tuple_mnk));
-  }
-  else {
-    return BatchedGemmCoord(cute::size<0>(tuple), cute::size<1>(tuple), cute::size<2>(tuple), cute::size<3>(tuple));
-  }
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace gemm
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/half.h b/lightllm-kernel/cutlass/include/cutlass/half.h
deleted file mode 100755
index a0f398284..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/half.h
+++ /dev/null
@@ -1,930 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*!
-    \file
-    \brief Defines a class for using IEEE half-precision floating-point types in host or
-      device code.
-*/
-
-#pragma once
-
-#ifndef CUTLASS_ENABLE_F16C
-#define CUTLASS_ENABLE_F16C 0
-#endif
-
-#if defined(__CUDACC_RTC__)
-
-#include "cutlass/floating_point_nvrtc.h"
-
-// F16C extensions are not meaningful when compiling for NVRTC which only accommodates device code.
-#undef CUTLASS_ENABLE_F16C
-#define CUTLASS_ENABLE_F16C 0
-
-#else
-//
-// Standard Library headers belong here to avoid conflicts with NVRTC.
-//
-#include <cmath>
-#include <limits>
-#include <cstdint>
-#include <cstring>
-#endif
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-#include <cuda_fp16.h>
-
-#include "cutlass/cutlass.h"
-#include "cutlass/float8.h"
-#include "cutlass/platform/platform.h"
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Optionally target F16C extentions to accelerate half-precision conversion.
-#if !defined(__CUDA_ARCH__) && (CUTLASS_ENABLE_F16C)
-#if defined(_MSC_VER)
-
-#include <immintrin.h>
-
-#if defined(__i386__) || defined(__x86_64__)
-#include <intrin.h>
-#endif
-
-#define F16C_ROUND_NEAREST 0
-
-#if !defined(__CUDA_ARCH__)
-extern __inline float _cvtsh_ss (unsigned short __S) {
-  __m128i packed;
-  std::memcpy(&packed, &__S, sizeof(__S));
-
-  __m128 result = _mm_cvtph_ps(packed);
-
-  float flt;
-  std::memcpy(&flt, &result, sizeof(flt));
-
-  return flt;
-}
-
-__inline unsigned short _cvtss_sh (float __F, const int) {
-  __m128 packed;
-  std::memcpy(&packed, &__F, sizeof(__F));
-
-  __m128i result = _mm_cvtps_ph(packed, F16C_ROUND_NEAREST);
-
-  unsigned short u;
-  std::memcpy(&u, &result, sizeof(u));
-
-  return u;
-}
-#endif
-
-#else
-
-// Linux
-#include <x86intrin.h>
-
-#if defined(__i386__) || defined(__x86_64__)
-#include <cpuid.h>
-#endif
-
-#define F16C_ROUND_NEAREST (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)
-
-#endif // _MSC_VER
-
-class CpuId {
-
-  bool f16c_enabled;
-
-  CpuId() {
-  #if defined(__i386__) || defined(__x86_64__)
-    #if defined(_MSC_VER)
-      int exx[4];
-
-      __cpuid (exx, 1); 
-      f16c_enabled = exx[2] & 0x20000000;
-
-    #else 
-    // GCC / Clang
-       int eax, ebx, ecx, edx;
-
-      __cpuid (1 , eax, ebx, ecx, edx); 
-      f16c_enabled = ecx & 0x20000000;
-    #endif
-  #else 
-  // Arm / PowerPC etc.
-    f16c_enabled = false;
-  #endif
-  }
-
-public:
-
-  bool is_f16c_supported() const {
-    return f16c_enabled;
-  } 
-
-  static const CpuId& instance() {
-      static CpuId cpu;
-      return cpu;
-  }
-};
-#endif // !defined(__CUDA_ARCH__) && CUTLASS_ENABLE_F16C
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// IEEE half-precision floating-point type
-struct alignas(2) half_t {
-
-  //
-  // Data members
-  //
-
-  /// Storage type
-  uint16_t storage;
-
-  //
-  // Static conversion operators
-  //
-
-  /// Constructs from an unsigned short
-  CUTLASS_HOST_DEVICE
-  static half_t bitcast(uint16_t x) {
-    half_t h;
-    h.storage = x;
-    return h;
-  }
-
-  /// FP32 -> FP16 conversion - rounds to nearest even
-  #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 530)
-    // Avoid inlining in device code if no hardware support
-    __device__ __noinline__
-  #else
-    CUTLASS_HOST_DEVICE
-  #endif  
-  static half_t convert(float const& flt) {
-  #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-    return half_t(__float2half_rn(flt));
-  #else
-
-    #if !defined(__CUDA_ARCH__) && CUTLASS_ENABLE_F16C
-      if( CpuId::instance().is_f16c_supported() ) {
-        unsigned short u = _cvtss_sh(flt, F16C_ROUND_NEAREST);
-        return bitcast(u);
-      }
-    #endif
-
-    // software implementation rounds toward nearest even
-    unsigned s;
-
-    #if defined(__CUDA_ARCH__)
-    s = reinterpret_cast<unsigned const &>(flt);
-    #else
-    std::memcpy(&s, &flt, sizeof(s));
-    #endif
-
-    uint16_t sign = uint16_t((s >> 16) & 0x8000);
-    int16_t exp = uint16_t(((s >> 23) & 0xff) - 127);
-    int mantissa = s & 0x7fffff;
-    uint16_t u = 0;
-
-    if ((s & 0x7fffffff) == 0) {
-      // sign-preserving zero
-      return bitcast(sign);
-    }
-
-    if (exp > 15) {
-      if (exp == 128 && mantissa) {
-        // not a number
-        u = 0x7fff;
-      } else {
-        // overflow to infinity
-        u = sign | 0x7c00;
-      }
-      return bitcast(u);
-    }
-
-    int sticky_bit = 0;
-
-    if (exp >= -14) {
-      // normal fp32 to normal fp16
-      exp = uint16_t(exp + uint16_t(15));
-      u = uint16_t(((exp & 0x1f) << 10));
-      u = uint16_t(u | (mantissa >> 13));
-    } else {
-      // normal single-precision to subnormal half_t-precision representation
-      int rshift = (-14 - exp);
-      if (rshift < 32) {
-        mantissa |= (1 << 23);
-
-        sticky_bit = ((mantissa & ((1 << rshift) - 1)) != 0);
-
-        mantissa = (mantissa >> rshift);
-        u = (uint16_t(mantissa >> 13) & 0x3ff);
-      } else {
-        mantissa = 0;
-        u = 0;
-      }
-    }
-
-    // round to nearest even
-    int round_bit = ((mantissa >> 12) & 1);
-    sticky_bit |= ((mantissa & ((1 << 12) - 1)) != 0);
-
-    if ((round_bit && sticky_bit) || (round_bit && (u & 1))) {
-      u = uint16_t(u + 1);
-    }
-
-    u |= sign;
-
-    return bitcast(u);
-  #endif
-  }
-
-  /// FP32 -> FP16 conversion - rounds to nearest even
-  CUTLASS_HOST_DEVICE
-  static half_t convert(int const& n) {
-  #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-    return half_t(__int2half_rn(n));
-  #else
-    return convert(float(n));
-  #endif
-  }
-
-  /// FP32 -> FP16 conversion - rounds to nearest even
-  CUTLASS_HOST_DEVICE
-  static half_t convert(unsigned const& n) {
-  #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-    return half_t(__uint2half_rn(n));
-  #else
-    return convert(float(n));
-  #endif
-  }
-
-  /// Converts a half-precision value stored as a uint16_t to a float
-  #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 530)
-    // Avoid inlining in device code if no hardware support
-    __device__ __noinline__
-  #else
-    CUTLASS_HOST_DEVICE
-  #endif
-  static float convert(half_t const& x) {
-  #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-    return __half2float(x.to_half());
-  #else
-
-    #if !defined(__CUDA_ARCH__) && CUTLASS_ENABLE_F16C
-      if( CpuId::instance().is_f16c_supported() ) {
-        unsigned short u = x.storage;
-        return _cvtsh_ss(u);
-      }
-    #endif
-
-    uint16_t const &h = x.storage;
-    uint32_t sign = ((h >> 15) & 1);
-    uint32_t exp = ((h >> 10) & 0x1f);
-    uint32_t mantissa = (h & 0x3ff);
-    unsigned f = 0;
-
-    if (exp > 0 && exp < 31) {
-      // normal
-      exp += 112;
-      f = (sign << 31) | (exp << 23) | (mantissa << 13);
-    } else if (exp == 0) {
-      if (mantissa) {
-        // subnormal
-        exp += 113;
-        while ((mantissa & (1 << 10)) == 0) {
-          mantissa <<= 1;
-          exp--;
-        }
-        mantissa &= 0x3ff;
-        f = (sign << 31) | (exp << 23) | (mantissa << 13);
-      } else {
-        // sign-preserving zero
-        f = (sign << 31);
-      }
-    } else if (exp == 31) {
-      if (mantissa) {
-        f = 0x7fffffff;  // not a number
-      } else {
-        f = (0xff << 23) | (sign << 31);  //  inf
-      }
-    }
-    #if defined(__CUDA_ARCH__)
-    return reinterpret_cast<float const&>(f);
-    #else
-    float flt;
-    std::memcpy(&flt, &f, sizeof(flt));
-    return flt;
-    #endif
-  #endif
-  }
-
-  //
-  // Methods
-  //
-
-  /// Default constructor
-  half_t() = default;
-
-  /// Reinterpret cast from CUDA's half type
-  CUTLASS_HOST_DEVICE
-  explicit half_t(half const & x) {
-    #if defined(__CUDA_ARCH__)
-    storage = reinterpret_cast<uint16_t const &>(x);
-    #else
-    __half_raw raw(x);
-    std::memcpy(&storage, &raw.x, sizeof(storage));
-    #endif
-  }
-
-  /// Floating point conversion
-  CUTLASS_HOST_DEVICE
-  explicit half_t(float x) {
-    storage = convert(x).storage;
-  }
-
-  /// Floating point conversion
-  CUTLASS_HOST_DEVICE
-  explicit half_t(double x): half_t(float(x)) {
-
-  }
-
-  /// float_e4m3_t conversion
-  CUTLASS_HOST_DEVICE
-  explicit half_t(float_e4m3_t x): half_t(float(x)) {
-
-  }
-
-  /// float_e5m2_t conversion
-  CUTLASS_HOST_DEVICE
-  explicit half_t(float_e5m2_t x): half_t(float(x)) {
-
-  }
-
-  /// Integer conversion - round to nearest even
-  CUTLASS_HOST_DEVICE
-  explicit half_t(int x) {
-    storage = convert(x).storage;
-  }
-
-  /// Integer conversion - round toward zero
-  CUTLASS_HOST_DEVICE
-  explicit half_t(unsigned x) {
-    storage = convert(x).storage;
-  }
-
-  /// Assignment
-  CUTLASS_HOST_DEVICE
-  half_t & operator=(half const &x) {
-    #if defined(__CUDA_ARCH__)
-    storage = reinterpret_cast<uint16_t const &>(x);
-    #else
-    __half_raw raw(x);
-    std::memcpy(&storage, &raw.x, sizeof(storage));
-    #endif
-    return *this;
-  }
-
-  /// Converts to float
-  CUTLASS_HOST_DEVICE
-  operator float() const {
-    return convert(*this);
-  }
-
-  /// Converts to float
-  CUTLASS_HOST_DEVICE
-  explicit operator double() const {
-    return double(convert(*this));
-  }
-
-  /// Converts to float
-  CUTLASS_HOST_DEVICE
-  explicit operator int() const {
-    return int(convert(*this));
-  }
-
-  /// Casts to bool
-  CUTLASS_HOST_DEVICE
-  explicit operator bool() const {
-    return (convert(*this) != 0.0f);
-  }
-
-  /// Bitcasts to CUDA's half type
-  CUTLASS_HOST_DEVICE
-  half to_half() const {
-    #if defined(__CUDA_ARCH__)
-    return reinterpret_cast<half const &>(storage);
-    #else
-    __half_raw raw;
-    std::memcpy(&raw.x, &storage, sizeof(raw.x));
-    return half(raw);
-    #endif
-  }
-
-  /// Accesses raw internal state
-  CUTLASS_HOST_DEVICE
-  uint16_t& raw() {
-    return storage;
-  }
-
-  /// Accesses raw internal state
-  CUTLASS_HOST_DEVICE
-  uint16_t raw() const {
-    return storage;
-  }
-
-  /// Returns the sign bit
-  CUTLASS_HOST_DEVICE
-  bool signbit() const {
-    return ((storage & 0x8000) != 0);
-  }
-
-  /// Returns the biased exponent
-  CUTLASS_HOST_DEVICE
-  int exponent_biased() const {
-    return int((storage >> 10) & 0x1f);
-  }
-
-  /// Returns the unbiased exponent
-  CUTLASS_HOST_DEVICE
-  int exponent() const {
-    return exponent_biased() - 15;
-  }
-
-  /// Returns the mantissa
-  CUTLASS_HOST_DEVICE
-  int mantissa() const {
-    return int(storage & 0x3ff);
-  }
-};
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-CUTLASS_HOST_DEVICE
-bool signbit(cutlass::half_t const& h) {
-  return ((h.raw() & 0x8000) != 0);
-}
-
-CUTLASS_HOST_DEVICE
-cutlass::half_t abs(cutlass::half_t const& h) {
-  return cutlass::half_t::bitcast(h.raw() & 0x7fff);
-}
-
-CUTLASS_HOST_DEVICE
-bool isnan(cutlass::half_t const& h) {
-  return (h.exponent_biased() == 0x1f) && h.mantissa();
-}
-
-CUTLASS_HOST_DEVICE
-bool isfinite(cutlass::half_t const& h) {
-  return (h.exponent_biased() != 0x1f);
-}
-
-CUTLASS_HOST_DEVICE
-cutlass::half_t nanh(const char*) {
-  // NVIDIA canonical NaN
-  return cutlass::half_t::bitcast(0x7fff);
-}
-
-CUTLASS_HOST_DEVICE
-bool isinf(cutlass::half_t const& h) {
-  return (h.exponent_biased() == 0x1f) && !h.mantissa();
-}
-
-CUTLASS_HOST_DEVICE
-bool isnormal(cutlass::half_t const& h) {
-  return h.exponent_biased() && h.exponent_biased() != 0x1f;
-}
-
-CUTLASS_HOST_DEVICE
-int fpclassify(cutlass::half_t const& h) {
-  int exp = h.exponent_biased();
-  int mantissa = h.mantissa();
-  if (exp == 0x1f) {
-    if (mantissa) {
-      return FP_NAN;
-    }
-    else {
-      return FP_INFINITE;
-    }
-  }
-  else if (!exp) {
-    if (mantissa) {
-      return FP_SUBNORMAL;
-    }
-    else {
-      return FP_ZERO;
-    }
-  }
-  return FP_NORMAL;
-}
-
-CUTLASS_HOST_DEVICE
-cutlass::half_t sqrt(cutlass::half_t const& h) {
-#if defined(__CUDACC_RTC__)
-  return cutlass::half_t(sqrtf(float(h)));
-#else
-  return cutlass::half_t(std::sqrt(float(h)));
-#endif
-}
-
-CUTLASS_HOST_DEVICE
-half_t copysign(half_t const& a, half_t const& b) {
-
-  uint16_t a_mag = (a.raw() & 0x7fff);  
-  uint16_t b_sign = (b.raw() & 0x8000);
-  uint16_t result = (a_mag | b_sign);
-
-  return half_t::bitcast(result);
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Standard Library operations and definitions
-//
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-#if !defined(__CUDACC_RTC__)
-namespace std {
-
-/// Numeric limits
-template <>
-struct numeric_limits<cutlass::half_t> {
-  static bool const is_specialized = true;
-  static bool const is_signed = true;
-  static bool const is_integer = false;
-  static bool const is_exact = false;
-  static bool const has_infinity = true;
-  static bool const has_quiet_NaN = true;
-  static bool const has_signaling_NaN = false;
-  static std::float_denorm_style const has_denorm = std::denorm_present;
-  static bool const has_denorm_loss = true;
-  static std::float_round_style const round_style = std::round_to_nearest;
-  static bool const is_iec559 = true;
-  static bool const is_bounded = true;
-  static bool const is_modulo = false;
-  static int const digits = 10;
-
-  /// Least positive value
-  CUTLASS_HOST_DEVICE
-  static cutlass::half_t min() { return cutlass::half_t::bitcast(0x0001); }
-
-  /// Minimum finite value
-  CUTLASS_HOST_DEVICE
-  static cutlass::half_t lowest() { return cutlass::half_t::bitcast(0xfbff); }
-
-  /// Maximum finite value
-  CUTLASS_HOST_DEVICE
-  static cutlass::half_t max() { return cutlass::half_t::bitcast(0x7bff); }
-
-  /// Returns smallest finite value
-  CUTLASS_HOST_DEVICE
-  static cutlass::half_t epsilon() { return cutlass::half_t::bitcast(0x1800); }
-
-  /// Returns maximum rounding error
-  CUTLASS_HOST_DEVICE
-  static cutlass::half_t round_error() { return cutlass::half_t(0.5f); }
-
-  /// Returns positive infinity value
-  CUTLASS_HOST_DEVICE
-  static cutlass::half_t infinity() { return cutlass::half_t::bitcast(0x7c00); }
-
-  /// Returns quiet NaN value
-  CUTLASS_HOST_DEVICE
-  static cutlass::half_t quiet_NaN() { return cutlass::half_t::bitcast(0x7fff); }
-
-  /// Returns signaling NaN value
-  CUTLASS_HOST_DEVICE
-  static cutlass::half_t signaling_NaN() { return cutlass::half_t::bitcast(0x7fff); }
-
-  /// Returns smallest positive subnormal value
-  CUTLASS_HOST_DEVICE
-  static cutlass::half_t denorm_min() { return cutlass::half_t::bitcast(0x0001); }
-};
-}  // namespace std
-#endif
-
-namespace cutlass {
-namespace platform {
-
-/// Forward Declaration
-template <class T>
-struct numeric_limits;
-
-/// Numeric limits
-template <>
-struct numeric_limits<cutlass::half_t> {
-  static bool const is_specialized = true;
-  static bool const is_signed = true;
-  static bool const is_integer = false;
-  static bool const is_exact = false;
-  static bool const has_infinity = true;
-  static bool const has_quiet_NaN = true;
-  static bool const has_signaling_NaN = false;
-#if !defined(__CUDACC_RTC__)
-  static std::float_denorm_style const has_denorm = std::denorm_present;
-#endif
-  static bool const has_denorm_loss = true;
-#if !defined(__CUDACC_RTC__)
-  static std::float_round_style const round_style = std::round_to_nearest;
-#endif
-  static bool const is_iec559 = true;
-  static bool const is_bounded = true;
-  static bool const is_modulo = false;
-  static int const digits = 10;
-
-  /// Least positive value
-  CUTLASS_HOST_DEVICE
-  static cutlass::half_t min() { return cutlass::half_t::bitcast(0x0001); }
-
-  /// Minimum finite value
-  CUTLASS_HOST_DEVICE
-  static cutlass::half_t lowest() { return cutlass::half_t::bitcast(0xfbff); }
-
-  /// Maximum finite value
-  CUTLASS_HOST_DEVICE
-  static cutlass::half_t max() { return cutlass::half_t::bitcast(0x7bff); }
-
-  /// Returns smallest finite value
-  CUTLASS_HOST_DEVICE
-  static cutlass::half_t epsilon() { return cutlass::half_t::bitcast(0x1800); }
-
-  /// Returns maximum rounding error
-  CUTLASS_HOST_DEVICE
-  static cutlass::half_t round_error() { return cutlass::half_t(0.5f); }
-
-  /// Returns positive infinity value
-  CUTLASS_HOST_DEVICE
-  static cutlass::half_t infinity() { return cutlass::half_t::bitcast(0x7c00); }
-
-  /// Returns quiet NaN value
-  CUTLASS_HOST_DEVICE
-  static cutlass::half_t quiet_NaN() { return cutlass::half_t::bitcast(0x7fff); }
-
-  /// Returns signaling NaN value
-  CUTLASS_HOST_DEVICE
-  static cutlass::half_t signaling_NaN() { return cutlass::half_t::bitcast(0x7fff); }
-
-  /// Returns smallest positive subnormal value
-  CUTLASS_HOST_DEVICE
-  static cutlass::half_t denorm_min() { return cutlass::half_t::bitcast(0x0001); }
-};
-}  // namespace platform 
-}  // namespace cutlass
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Arithmetic operators
-//
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-CUTLASS_HOST_DEVICE
-bool operator==(half_t const& lhs, half_t const& rhs) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-  return __heq(lhs.to_half(), rhs.to_half());
-#else
-  return float(lhs) == float(rhs);
-#endif
-}
-
-CUTLASS_HOST_DEVICE
-bool operator!=(half_t const& lhs, half_t const& rhs) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-  return __hne(lhs.to_half(), rhs.to_half());
-#else
-  return float(lhs) != float(rhs);
-#endif
-}
-
-CUTLASS_HOST_DEVICE
-bool operator<(half_t const& lhs, half_t const& rhs) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-  return __hlt(lhs.to_half(), rhs.to_half());
-#else
-  return float(lhs) < float(rhs);
-#endif
-}
-
-CUTLASS_HOST_DEVICE
-bool operator<=(half_t const& lhs, half_t const& rhs) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-  return __hle(lhs.to_half(), rhs.to_half());
-#else
-  return float(lhs) <= float(rhs);
-#endif
-}
-
-CUTLASS_HOST_DEVICE
-bool operator>(half_t const& lhs, half_t const& rhs) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-  return __hgt(lhs.to_half(), rhs.to_half());
-#else
-  return float(lhs) > float(rhs);
-#endif
-}
-
-CUTLASS_HOST_DEVICE
-bool operator>=(half_t const& lhs, half_t const& rhs) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-  return __hge(lhs.to_half(), rhs.to_half());
-#else
-  return float(lhs) >= float(rhs);
-#endif
-}
-
-CUTLASS_HOST_DEVICE
-half_t operator+(half_t const& lhs, half_t const& rhs) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-  return half_t(__hadd(lhs.to_half(), rhs.to_half()));
-#else
-  return half_t(float(lhs) + float(rhs));
-#endif
-}
-
-CUTLASS_HOST_DEVICE
-half_t operator-(half_t const& lhs) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-  return half_t(__hneg(lhs.to_half()));
-#else
-  return half_t(-float(lhs));
-#endif
-}
-
-CUTLASS_HOST_DEVICE
-half_t operator-(half_t const& lhs, half_t const& rhs) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-  return half_t(__hsub(lhs.to_half(), rhs.to_half()));
-#else
-  return half_t(float(lhs) - float(rhs));
-#endif
-}
-
-CUTLASS_HOST_DEVICE
-half_t operator*(half_t const& lhs, half_t const& rhs) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-  return half_t(__hmul(lhs.to_half(), rhs.to_half()));
-#else
-  return half_t(float(lhs) * float(rhs));
-#endif
-}
-
-CUTLASS_HOST_DEVICE
-half_t operator/(half_t const& lhs, half_t const& rhs) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-  return half_t(__hdiv(lhs.to_half(), rhs.to_half()));
-#else
-  return half_t(float(lhs) / float(rhs));
-#endif
-}
-
-CUTLASS_HOST_DEVICE
-half_t& operator+=(half_t & lhs, half_t const& rhs) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-  lhs = half_t(__hadd(lhs.to_half(), rhs.to_half()));
-#else
-  lhs = half_t(float(lhs) + float(rhs));
-#endif
-  return lhs;
-}
-
-CUTLASS_HOST_DEVICE
-half_t& operator-=(half_t & lhs, half_t const& rhs) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-  lhs = half_t(__hsub(lhs.to_half(), rhs.to_half()));
-#else
-  lhs = half_t(float(lhs) - float(rhs));
-#endif
-  return lhs;
-}
-
-CUTLASS_HOST_DEVICE
-half_t& operator*=(half_t & lhs, half_t const& rhs) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-  lhs = half_t(__hmul(lhs.to_half(), rhs.to_half()));
-#else
-  lhs = half_t(float(lhs) * float(rhs));
-#endif
-  return lhs;
-}
-
-CUTLASS_HOST_DEVICE
-half_t& operator/=(half_t & lhs, half_t const& rhs) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-  lhs = half_t(__hdiv(lhs.to_half(), rhs.to_half()));
-#else
-  lhs = half_t(float(lhs) / float(rhs));
-#endif
-  return lhs;
-}
-
-CUTLASS_HOST_DEVICE
-half_t& operator++(half_t & lhs) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-  lhs = half_t(__hadd(lhs.to_half(), half_t(1.0f).to_half()));
-#else
-  float tmp(lhs);
-  ++tmp;
-  lhs = half_t(tmp);
-#endif
-  return lhs;
-}
-
-CUTLASS_HOST_DEVICE
-half_t& operator--(half_t & lhs) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-  lhs = half_t(__hsub(lhs.to_half(), half_t(1.0f).to_half()));
-#else
-  float tmp(lhs);
-  --tmp;
-  lhs = half_t(tmp);
-#endif
-  return lhs;
-}
-
-CUTLASS_HOST_DEVICE
-half_t operator++(half_t & lhs, int) {
-  half_t ret(lhs);
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-  lhs = half_t(__hadd(lhs.to_half(), half_t(1.0f).to_half()));
-#else
-  float tmp(lhs);
-  tmp++;
-  lhs = half_t(tmp);
-#endif
-  return ret;
-}
-
-CUTLASS_HOST_DEVICE
-half_t operator--(half_t & lhs, int) {
-  half_t ret(lhs);
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-  lhs = half_t(__hsub(lhs.to_half(), half_t(1.0f).to_half()));
-#else
-  float tmp(lhs);
-  tmp--;
-  lhs = half_t(tmp);
-#endif
-  return ret;
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-//
-// User-defined literals
-//
-
-CUTLASS_HOST_DEVICE
-cutlass::half_t operator "" _hf(long double x) {
-  return cutlass::half_t(float(x));
-}
-
-CUTLASS_HOST_DEVICE
-cutlass::half_t operator "" _hf(unsigned long long int x) {
-  return cutlass::half_t(int(x));
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/integer_subbyte.h b/lightllm-kernel/cutlass/include/cutlass/integer_subbyte.h
deleted file mode 100755
index b84d322db..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/integer_subbyte.h
+++ /dev/null
@@ -1,280 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*!
-    \file
-    \brief Defines a class for using integer types smaller than one byte in host or
-      device code.
-*/
-
-#pragma once
-
-#if defined(__CUDACC_RTC__)
-#include <cuda/std/cstdint>
-#else
-#include <cstdint>
-#endif
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_size.h"
-#include "cutlass/platform/platform.h"
-
-namespace cutlass {
-
-template <int Bits, bool Signed = true>
-struct integer_subbyte {
-  using Storage = uint8_t;
-
-  static_assert(Bits <= 8*sizeof(Storage), "Require a subbyte of bits in integer_subbyte");
-
-  // "External type"; the integer type for which
-  // integer_subbyte has a conversion-to operator
-  using xint_t = typename cutlass::platform::conditional<Signed, int, unsigned>::type;
-
-  // Bitmask for truncation from larger integers
-  static constexpr Storage bits_mask_ = Storage(Storage(-1) >> (8 - Bits));
-  // Bitmask for the sign bit
-  static constexpr Storage sign_mask_ = Storage((Signed ? 1 : 0) << (Bits - 1));
-
-  // Where the bits are stored
-  Storage storage;
-
-  // Default construction does NOT zero-initialize
-  integer_subbyte() = default;
-
-  // Implicit conversion is DEPRECATED.
-  // Please use one of the two explicit constructors below.
-  template<class T,
-    class Enable = cutlass::platform::enable_if_t<cutlass::platform::is_convertible_v<T, int>>
-  >
-  [[deprecated("Implicit conversion is deprecated; please use explicit construction instead")]]
-  CUTLASS_HOST_DEVICE
-  integer_subbyte(T value)
-      : integer_subbyte(static_cast<xint_t>(value)) {}
-
-  // CUTLASS code commonly converts both signed and unsigned integers
-  // into integer_subbyte, so the class provides both explicit
-  // conversions.
-
-  // Precondition: If the external type is unsigned int, then value
-  // fits in unsigned int (is nonnegative).
-  CUTLASS_HOST_DEVICE explicit
-  integer_subbyte(int value)
-      : storage(reinterpret_cast<Storage const&>(value) & bits_mask_)
-  {
-    if constexpr (Signed) {
-      [[maybe_unused]] constexpr int lower_bound = -(1 << (Bits - 1));
-      [[maybe_unused]] constexpr int upper_bound = (1 << (Bits - 1)) - 1;
-      assert(value >= lower_bound);
-      assert(value < upper_bound);
-    }
-    else {
-      [[maybe_unused]] constexpr unsigned upper_bound = 1u << Bits;
-      assert(value >= 0);
-      assert(value < static_cast<int>(upper_bound));
-    }
-  }
-
-  // Precondition: If the external type is (signed) int, then value
-  // fits in int.
-  CUTLASS_HOST_DEVICE explicit
-  integer_subbyte(unsigned value)
-      : storage(reinterpret_cast<Storage const&>(value) & bits_mask_)
-  {
-    if constexpr (Signed) {
-      [[maybe_unused]] constexpr int lower_bound = -(1 << (Bits - 1));
-      [[maybe_unused]] constexpr int upper_bound = (1 << (Bits - 1)) - 1;
-      assert(value >= lower_bound);
-      assert(value < upper_bound);
-    }
-    else {
-      [[maybe_unused]] constexpr unsigned upper_bound = 1u << Bits;
-      assert(value < upper_bound);
-    }
-  }
-
-  // Convert to the "external" integer type (int or unsigned)
-  CUTLASS_HOST_DEVICE
-  operator xint_t() const {
-    if (sign_mask_ & storage) {  // Sign extend
-      return xint_t(storage) | ~xint_t(bits_mask_);
-    } else {
-      return xint_t(storage);
-    }
-  }
-
-  CUTLASS_HOST_DEVICE
-  bool operator==(integer_subbyte const& rhs) const {
-    return storage == rhs.storage;
-  }
-
-  CUTLASS_HOST_DEVICE
-  bool operator!=(integer_subbyte const& rhs) const {
-    return storage != rhs.storage;
-  }
-
-  CUTLASS_HOST_DEVICE
-  bool operator<(integer_subbyte const& rhs) const {
-    if ((sign_mask_ & storage) == (sign_mask_ & rhs.storage)) {
-      // If both *this and rhs have the same sign, compare storage directly.
-      return storage < rhs.storage;
-    }
-    else {
-      // If *this and rhs don't have the same sign,
-      // then return whether *this is negative.
-      return sign_mask_ & storage;
-    }
-  }
-
-  CUTLASS_HOST_DEVICE
-  bool operator<=(integer_subbyte const& rhs) const {
-    if ((sign_mask_ & storage) == (sign_mask_ & rhs.storage)) {
-      // If both *this and rhs have the same sign, compare storage directly.
-      return storage <= rhs.storage;
-    }
-    else {
-      // If *this and rhs don't have the same sign,
-      // then return whether *this is negative.
-      return sign_mask_ & storage;
-    }
-  }
-
-  CUTLASS_HOST_DEVICE
-  bool operator>=(integer_subbyte const& rhs) const {
-    return !(*this < rhs);
-  }
-
-  CUTLASS_HOST_DEVICE
-  bool operator>(integer_subbyte const& rhs) const {
-    return !(*this <= rhs);
-  }
-
-  CUTLASS_HOST_DEVICE friend integer_subbyte
-  conj(integer_subbyte const& x) {
-    return x;
-  }
-};
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// 1-bit Unsigned integer type
-using uint1b_t = integer_subbyte<1, false>;
-
-/// 2-bit Integer type
-using int2b_t = integer_subbyte<2, true>;
-
-/// 2-bit Unsigned integer type
-using uint2b_t = integer_subbyte<2, false>;
-
-/// 4-bit Integer type
-using int4b_t = integer_subbyte<4, true>;
-
-/// 4-bit Unsigned integer type
-using uint4b_t = integer_subbyte<4, false>;
-
-/// 1-bit binary type
-using bin1_t = bool;
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <int Bits, bool Signed>
-struct sizeof_bits<integer_subbyte<Bits,Signed>> {
-  static constexpr int value = Bits;
-};
-
-/// Defines the size of an element in bits - specialized for bin1_t
-template <>
-struct sizeof_bits<bin1_t> {
-  static constexpr int value = 1;
-};
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace platform {
-
-/// Forward Declaration
-template <class T>
-struct numeric_limits;
-
-// Specialization for signed integer_subbyte
-template<int NumBits>
-struct numeric_limits<cutlass::integer_subbyte<NumBits, true>> {
-private:
-  using value_type = cutlass::integer_subbyte<NumBits, true>;
-
-public:
-  CUTLASS_HOST_DEVICE static value_type lowest() noexcept {
-    return value_type{
-      -(1 << (NumBits - 1))
-    };
-  }
-
-  CUTLASS_HOST_DEVICE static value_type max() noexcept {
-    return value_type{
-      (1 << (NumBits - 1)) - 1
-    };
-  }
-
-  CUTLASS_HOST_DEVICE static value_type const min() noexcept {
-    return lowest();
-  }
-
-  static constexpr bool is_integer = true;
-  static constexpr bool is_signed = true;
-  static constexpr bool has_infinity = false;
-};
-
-// Specialization for unsigned integer_subbyte
-template<int NumBits>
-struct numeric_limits<cutlass::integer_subbyte<NumBits, false>> {
-private:
-  using value_type = cutlass::integer_subbyte<NumBits, false>;
-
-public:
-  CUTLASS_HOST_DEVICE static value_type lowest() noexcept {
-    return value_type{0u};
-  }
-
-  CUTLASS_HOST_DEVICE static value_type max() noexcept {
-    return value_type{
-      (1u << NumBits) - 1u
-    };
-  }
-
-  CUTLASS_HOST_DEVICE static value_type const min() noexcept {
-    return lowest();
-  }
-
-  static constexpr bool is_integer = true;
-  static constexpr bool is_signed = false;
-};
-
-} // namespace platform
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/kernel_hardware_info.h b/lightllm-kernel/cutlass/include/cutlass/kernel_hardware_info.h
deleted file mode 100755
index 62dcb8b45..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/kernel_hardware_info.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#if !defined(__CUDACC_RTC__)
-#include "cuda_runtime.h"
-
-#include "cutlass/trace.h"
-#endif
-
-namespace cutlass {
-
-struct KernelHardwareInfo {
-  //
-  // Data members
-  //
-  int device_id = 0;
-  int sm_count  = 0;
-
-  //
-  // Methods
-  //
-
-#if !defined(__CUDACC_RTC__)
-  static inline int
-  query_device_multiprocessor_count(int device_id = 0) {
-    cudaError_t result = cudaGetDevice(&device_id);
-    if (result != cudaSuccess) {
-      CUTLASS_TRACE_HOST(
-        "  cudaGetDevice() returned error "
-        << cudaGetErrorString(result));
-      return 0;
-    }
-    int multiprocessor_count;
-    result = cudaDeviceGetAttribute(&multiprocessor_count,
-      cudaDevAttrMultiProcessorCount, device_id);
-    if (result != cudaSuccess) {
-      CUTLASS_TRACE_HOST(
-        "  cudaDeviceGetAttribute() returned error "
-        << cudaGetErrorString(result));
-      return 0;
-    }
-    return multiprocessor_count;
-  }
-#endif
-};
-
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/kernel_hardware_info.hpp b/lightllm-kernel/cutlass/include/cutlass/kernel_hardware_info.hpp
deleted file mode 100755
index 876aacc6b..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/kernel_hardware_info.hpp
+++ /dev/null
@@ -1,35 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-// Simply import .h version of header so as to avoid breaking any existing CUTLASS builds
-// after .hpp was changed to .h
-#include "cutlass/kernel_hardware_info.h"
diff --git a/lightllm-kernel/cutlass/include/cutlass/kernel_launch.h b/lightllm-kernel/cutlass/include/cutlass/kernel_launch.h
deleted file mode 100755
index ca3380a2a..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/kernel_launch.h
+++ /dev/null
@@ -1,141 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Defines structures and helpers to launch CUDA kernels within CUTLASS.
-*/
-
-#pragma once
-
-#include <cuda_runtime_api.h>
-#include "cutlass/cutlass.h"
-#include "cutlass/trace.h"
-
-namespace cutlass {
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Structure containing the basic launch configuration of a CUDA kernel.
-struct KernelLaunchConfiguration {
-
-  /// CUDA grid dimensions
-  dim3 grid;
-
-  /// CUDA threablock dimensions
-  dim3 block;
-
-  /// Bytes of dynamically allocated SMEM in addition to static SMEM
-  size_t dynamic_smem;
-
-  //
-  // Methods
-  //
-
-  /// Constructs a KernellaunchConfiguration object
-  CUTLASS_HOST_DEVICE
-  KernelLaunchConfiguration(
-    dim3 _grid = dim3(1,1,1),
-    dim3 _block = dim3(1,1,1),
-    size_t _dynamic_smem = 0
-  ):
-    grid(_grid),
-    block(_block),
-    dynamic_smem(_dynamic_smem) { }
-};
-
-
-template <typename GemmKernel, typename Params>
-Status kernel_launch(
-    dim3 const grid_dims,
-    dim3 const block_dims,
-    size_t const smem_size,
-    cudaStream_t cuda_stream,
-    const Params &kernel_params,
-    bool launch_with_pdl) {
-#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
-  CUTLASS_TRACE_HOST("cutlass::kernel_launch");
-#endif
-
-  if (not launch_with_pdl) {
-#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
-    CUTLASS_TRACE_HOST("cutlass::kernel_launch: No PDL");
-#endif
-    device_kernel<GemmKernel><<<grid_dims, block_dims, smem_size, cuda_stream>>>(kernel_params);
-  }
-  else {
-#if ((__CUDACC_VER_MAJOR__ >= 12) || ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 8)))
-    if constexpr (GemmKernel::ArchTag::kMinComputeCapability < 90) {
-      CUTLASS_TRACE_HOST("  Programmatic dependent launch (PDL) is only supported for SM90.");
-      return Status::kInvalid;
-    }
-
-    cudaLaunchConfig_t config;
-    cudaLaunchAttribute attrs[1];
-
-    config.gridDim = grid_dims;
-    config.blockDim = block_dims;
-    config.dynamicSmemBytes = smem_size;
-    config.stream = cuda_stream;
-
-    config.attrs = attrs;
-    attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
-    attrs[0].val.programmaticStreamSerializationAllowed = 1;
-    config.numAttrs = 1;
-
-#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
-    CUTLASS_TRACE_HOST("cutlass::kernel_launch: Calling cudaLaunchKernelEx");
-#endif
-    cudaError_t launch_result = cudaLaunchKernelEx(&config, &device_kernel<GemmKernel>, kernel_params);
-    if (cudaSuccess != launch_result) {
-      CUTLASS_TRACE_HOST("cutlass::kernel_launch: cudaLaunchKernelEx failed with error: " << cudaGetErrorString(launch_result));
-      return Status::kErrorInternal;
-    }
-#else
-    CUTLASS_TRACE_HOST("  Programmatic dependent launch (PDL) is only supported starting CUDA 11.8.");
-    return Status::kInvalid;
-#endif
-  }
-
-  cudaError_t result = cudaGetLastError();
-  if (cudaSuccess == result) {
-#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
-    CUTLASS_TRACE_HOST("cutlass::kernel_launch: cudaGetLastError reports success");
-#endif
-    return Status::kSuccess;
-  }
-  else {
-    CUTLASS_TRACE_HOST("  Kernel launch failed. Reason: " << result);
-    return Status::kErrorInternal;
-  }
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/layout/layout.h b/lightllm-kernel/cutlass/include/cutlass/layout/layout.h
deleted file mode 100755
index 1089add39..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/layout/layout.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines layout functions used by TensorRef and derived classes. 
-
-    Layout functions map logical coordinates to linear memory. They often require additional
-    data to describe strides between elements.
-
-    Layout functions must implement all members in the public interface of IdentityTensorLayout<>
-    defined in cutlass/tensor_ref.h.
-*/
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/layout/vector.h"
-
-#include "cutlass/layout/tensor_op_multiplicand_sm70.h"
-#include "cutlass/layout/tensor_op_multiplicand_sm75.h"
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace layout {
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace layout
-} // namespace cutlass
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/layout/matrix.h b/lightllm-kernel/cutlass/include/cutlass/layout/matrix.h
deleted file mode 100755
index 32aa17a5d..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/layout/matrix.h
+++ /dev/null
@@ -1,1349 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines layout functions used by TensorRef and derived classes. 
-
-    Layout functions map logical coordinates to linear memory. They often require additional
-    data to describe strides between elements.
-
-    Layout functions must implement all members in the public interface of IdentityTensorLayout<>
-    defined in cutlass/tensor_ref.h.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/pitch_linear_coord.h"
-
-namespace cutlass {
-namespace layout {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Defines data layouts of various matrix formats usable by TensorRef and other classes.
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Mapping function for row-major matrices.
-class RowMajor {
-public:
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = MatrixCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, LongIndex>;
-
-private:
-  //
-  // Data members
-  //
-
-  /// Stride data member
-  Stride stride_;
-
-public:
-  //
-  // Methods
-  //
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  RowMajor(LongIndex ldm = 0): stride_(ldm) { }
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  RowMajor(Stride stride): stride_(stride) { }
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static RowMajor packed(MatrixCoord const &extent) {
-    return RowMajor(extent.column());
-  }
-
-  /// Returns the offset of a coordinate in linear memory. 
-  /// Assumes coordinate has convention (row, column)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(MatrixCoord const &coord) const {
-    return LongIndex(coord.row()) * LongIndex(stride_[0]) + coord.column();
-  }
-
-  /// Inverse of layout function, mapping linear offset to logical coordinate
-  CUTLASS_HOST_DEVICE
-  MatrixCoord inverse(LongIndex offset) const {
-    return MatrixCoord(Index(offset / stride_[0]), Index(offset % stride_[0]));
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const {
-    return stride_;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride & stride() {
-    return stride_;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  typename Stride::Index stride(int idx) const {
-    return stride_[idx];
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  typename Stride::Index & stride(int idx) {
-    return stride_[idx];
-  }
-
-  /// Compute the number of contiguous elements needed to store a tensor with the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(MatrixCoord const &extent) const {
-    return LongIndex(extent.row()) * LongIndex(stride_[0]);
-  }
-};
-
-/// Mapping function for column-major matrices.
-class ColumnMajor {
-public:
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = MatrixCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, LongIndex>;
-
-private:
-  //
-  // Data members
-  //
-
-  /// Stride data member
-  Stride stride_;
-
-public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  ColumnMajor(LongIndex ldm = 0): stride_(ldm) { }
-  
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  ColumnMajor(Stride stride): stride_(stride) { }
-
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static ColumnMajor packed(MatrixCoord const &extent) {
-    return ColumnMajor(extent.row());
-  }
-
-  /// Returns the offset of a coordinate in linear memory. 
-  /// Assumes coordinate has convention (row, column)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(MatrixCoord const &coord) const {
-    return LongIndex(coord.column()) * LongIndex(stride_[0]) + coord.row();
-  }
-
-  /// Inverse of layout function, mapping linear offset to logical coordinate
-  CUTLASS_HOST_DEVICE
-  MatrixCoord inverse(LongIndex offset) const {
-    return MatrixCoord(Index(offset % stride_[0]), Index(offset / stride_[0]));
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const {
-    return stride_;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride & stride() {
-    return stride_;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  typename Stride::Index stride(int idx) const {
-    return stride_[idx];
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  typename Stride::Index & stride(int idx) {
-    return stride_[idx];
-  }
-
-  /// Compute the number of contiguous elements needed to store a tensor with the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(MatrixCoord const &extent) const {
-    return LongIndex(extent.column()) * LongIndex(stride_[0]);
-  }
-};
-
-/// Mapping function for interleaved matrices. Matrix is structured
-/// as row-major arrangement of fixed-size columns.
-template <int Interleave>
-struct RowMajorInterleaved {
-  
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = MatrixCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, LongIndex>;
-
-  /// Size of interleaved columns
-  static int const kInterleave = Interleave;
-
-private:
-  //
-  // Data members
-  //
-
-  /// Stride data member
-  Stride stride_;
-
-public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  RowMajorInterleaved(LongIndex ldm = 0): stride_(ldm) { }
-  
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  RowMajorInterleaved(Stride stride): stride_(stride) { }
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static RowMajorInterleaved packed(MatrixCoord const &extent) {
-    return RowMajorInterleaved(extent.column() * kInterleave);
-  }
-
-  /// Returns the offset of a coordinate in linear memory. 
-  /// Assumes coordinate has convention (row, column)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(MatrixCoord const &coord) const {
-    Index row_major = coord.row() / kInterleave;
-    Index row_minor = coord.row() % kInterleave;
-    return LongIndex(row_major) * LongIndex(stride_[0]) + LongIndex(coord.column()) * kInterleave + row_minor;
-  }
-
-  /// Inverse of layout function, mapping linear offset to logical coordinate
-  CUTLASS_HOST_DEVICE
-  MatrixCoord inverse(LongIndex offset) const {
-
-    Index row_major = Index(offset / stride_[0]);
-    Index residual = Index(offset % stride_[0]);
-
-    Index column = residual / kInterleave;
-    Index row_minor =  residual % kInterleave;
-
-    return MatrixCoord(row_major * kInterleave + row_minor, column);
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const {
-    return stride_;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride & stride() {
-    return stride_;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  typename Stride::Index stride(int idx) const {
-    return stride_[idx];
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  typename Stride::Index & stride(int idx) {
-    return stride_[idx];
-  }
-
-  /// Compute the number of contiguous elements needed to store a tensor with the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(MatrixCoord const &extent) const {
-    return (extent.row() + kInterleave - 1) / kInterleave * stride_[0];
-  }
-};
-
-/// Mapping function for interleaved matrices. Matrix is structured
-/// as column-major arrangement of fixed-size rows.
-template <int Interleave>
-struct ColumnMajorInterleaved {
-  
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = MatrixCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, LongIndex>;
-
-  /// Size of interleaved columns
-  static int const kInterleave = Interleave;
-
-private:
-  //
-  // Data members
-  //
-
-  /// Stride data member
-  Stride stride_;
-
-public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  ColumnMajorInterleaved(LongIndex ldm = 0): stride_(ldm) { }
-  
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  ColumnMajorInterleaved(Stride stride): stride_(stride) { }
-
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static ColumnMajorInterleaved packed(MatrixCoord const &extent) {
-    return ColumnMajorInterleaved(extent.row() * kInterleave);
-  }
-
-  /// Returns the offset of a coordinate in linear memory. 
-  /// Assumes coordinate has convention (row, column)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(MatrixCoord const &coord) const {
-    Index column_major = coord.column() / kInterleave;
-    Index column_minor = coord.column() % kInterleave;
-    return LongIndex(column_major) * LongIndex(stride_[0]) + LongIndex(coord.row()) * kInterleave + column_minor;
-  }
-
-  /// Inverse of layout function, mapping linear offset to logical coordinate
-  CUTLASS_HOST_DEVICE
-  MatrixCoord inverse(LongIndex offset) const {
-
-    Index column_major = Index(offset / stride_[0]);
-    Index residual = Index(offset % stride_[0]);
-
-    Index row = residual / kInterleave;
-    Index column_minor =  residual % kInterleave;
-
-    return MatrixCoord(row, column_major * kInterleave + column_minor);
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const {
-    return stride_;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride & stride() {
-    return stride_;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  typename Stride::Index stride(int idx) const {
-    return stride_[idx];
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  typename Stride::Index & stride(int idx) {
-    return stride_[idx];
-  }
-
-  /// Compute the number of contiguous elements needed to store a tensor with the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(MatrixCoord const &extent) const {
-    return (extent.column() + kInterleave - 1) / kInterleave * stride_[0];
-  }
-};
-
-/// Enumerated type for canonical pitch-linear matrix layouts
-enum class Matrix {
-  kColumnMajor,       ///< leading dimension refers to stride between columns; stride along rows is 1
-  kRowMajor           ///< leading dimension refers to stride between rows; stride along columns is 1
-};
-
-/// Mapping function for scenario in which layout is row-major or column-major but this information
-/// is only available at runtime.
-struct ContiguousMatrix {
-
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = MatrixCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, LongIndex>;
-
-private:
-  //
-  // Data members
-  //
-
-  /// Stride data member
-  Stride stride_;
-
-  /// Enumerated type indicating canonical matrix layout
-  Matrix layout_;
-
-public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  ContiguousMatrix(
-    Index ldm = 0, 
-    Matrix layout = Matrix::kColumnMajor
-  ):
-    stride_(ldm), layout_(layout) { }
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static ContiguousMatrix packed(
-    MatrixCoord const &extent, 
-    Matrix layout = Matrix::kColumnMajor) {
-
-    Index ldm = 0;
-    if (layout == Matrix::kColumnMajor) {
-      ldm = extent.row();
-    }
-    else if (layout == Matrix::kRowMajor) {
-      ldm = extent.column();
-    }
-    return ContiguousMatrix(ldm, layout);
-  }
-
-  /// Returns the offset of a coordinate in linear memory. 
-  /// Assumes coordinate has convention (row, column)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(MatrixCoord const &coord) const {
-    if (layout_ == Matrix::kColumnMajor) {
-      return coord.row() + coord.column() * stride_[0];
-    }
-    else if (layout_ == Matrix::kRowMajor) {
-      return coord.row() * stride_[0] + coord.column();
-    }
-    else {
-      // degenerate case
-      return 0;
-    }
-  }
-
-  /// Inverse of layout function, mapping linear offset to logical coordinate
-  CUTLASS_HOST_DEVICE
-  MatrixCoord inverse(LongIndex offset) const {
-    CUTLASS_UNUSED(offset);
-    return MatrixCoord(0, 0);
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const {
-    return stride_;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride & stride() {
-    return stride_;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  typename Stride::Index stride(int idx) const {
-    return stride_[idx];
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  typename Stride::Index & stride(int idx) {
-    return stride_[idx];
-  }
-
-  /// Compute the number of contiguous elements needed to store a tensor with the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(MatrixCoord const &extent) const {
-    if (layout_ == Matrix::kColumnMajor) {
-      return stride_[0] * extent.column();
-    }
-    else if (layout_ == Matrix::kRowMajor) {
-      return stride_[0] * extent.row();
-    }
-    else {
-      // degenerate case
-      return 0;
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Mapping function for scenario in which both rows and columns are separated by a stride.
-template <int Rank>
-struct AffineRankN {
-
-  /// Logical rank of tensor
-  static int const kRank = Rank;
-
-  /// Rank of stride vector
-  static int const kStrideRank = kRank;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = Coord<kRank, Index>;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, LongIndex>;
-
-private:
-  //
-  // Data members
-  //
-
-  /// Stride data member
-  Stride stride_;
-
-public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  AffineRankN(
-    Stride const &stride = Stride()
-  ):
-    stride_(stride) { }
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  AffineRankN(
-    Coord<kRank/2, LongIndex> const &stride_m,
-    Coord<kRank/2, LongIndex> const &stride_n
-  ) { 
-
-    // Concatenate the strides
-    CUTLASS_PRAGMA_UNROLL
-    for (int m = 0; m < kRank/2; ++m) {
-      stride_[m] = stride_m[m];
-    }
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < kRank/2; ++n) {
-      stride_[n + kRank/2] = stride_n[n];
-    }
-  }
-
-  /// Ctor for N = 2
-  CUTLASS_HOST_DEVICE
-  AffineRankN(
-    LongIndex const &stride_m,
-    LongIndex const &stride_n
-  ) { 
-      stride_[0] = stride_m;
-      stride_[1] = stride_n;
-  }
-
-  /// Ctor for N = 2
-  CUTLASS_HOST_DEVICE
-  AffineRankN(
-    LongIndex const &stride
-  ) { 
-      stride_[0] = stride;
-      stride_[1] = 1;
-  }
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static AffineRankN packed(TensorCoord const &extent) {
-    
-    AffineRankN layout;
-    layout.stride_[kRank - 1] = 1;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = kRank - 1; i > 0; --i) {
-      layout.stride_[i - 1] = layout.stride_[i] * extent[i];
-    }
-
-    return layout;
-  }
-
-  /// Returns the offset of a coordinate in linear memory. 
-  /// Assumes coordinate has convention (row, column)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-    return dot(coord, stride_);
-  }
-
-  /// Inverse of layout function, mapping linear offset to logical coordinate
-  CUTLASS_HOST_DEVICE
-  TensorCoord inverse(LongIndex offset) const {
-    return TensorCoord();
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const {
-    return stride_;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride & stride() {
-    return stride_;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  typename Stride::Index stride(int idx) const {
-    return stride_[idx];
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  typename Stride::Index & stride(int idx) {
-    return stride_[idx];
-  }
-
-  /// Compute the number of contiguous elements needed to store a tensor with the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &extent) const {
-    int idx = stride_.max_dim_index();
-    return extent[idx] * stride_[idx];
-  }
-};
-
-/// Mapping function for scenario in which both rows and columns are separated by a stride.
-/// Row stride is smaller than column stride in AffineRank2ColumnMajor.
-struct AffineRank2ColumnMajor {
-
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 2;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = MatrixCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, LongIndex>;
-
-private:
-  //
-  // Data members
-  //
-
-  /// Stride data member
-  Stride stride_;
-
-public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  AffineRank2ColumnMajor(
-    Stride const &stride = Stride()
-  ):
-    stride_(stride) { }
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  AffineRank2ColumnMajor(
-    LongIndex row_stride,           ///< stride between elements in consecutive rows
-    LongIndex column_stride         ///< stride between elements in consecutive columns
-  )
-    { stride_[0] = row_stride; stride_[1] = column_stride;}
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  AffineRank2ColumnMajor(
-    LongIndex stride
-  )
-    { stride_[0] = 1; stride_[1] = stride;}
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static AffineRank2ColumnMajor packed(MatrixCoord const &extent) {
-    return AffineRank2ColumnMajor(1, extent.row());
-  }
-
-  /// Returns the offset of a coordinate in linear memory. 
-  /// Assumes coordinate has convention (row, column)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(MatrixCoord const &coord) const {
-    return dot(coord, stride_);
-  }
-
-  /// Inverse of layout function, mapping linear offset to logical coordinate
-  CUTLASS_HOST_DEVICE
-  MatrixCoord inverse(LongIndex offset) const {
-    CUTLASS_UNUSED(offset);
-    return MatrixCoord(0, 0);
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const {
-    return stride_;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride & stride() {
-    return stride_;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  typename Stride::Index stride(int idx) const {
-    return stride_[idx];
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  typename Stride::Index & stride(int idx) {
-    return stride_[idx];
-  }
-
-  /// Compute the number of contiguous elements needed to store a tensor with the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(MatrixCoord const &extent) const {
-    return extent.column() * stride_[1];
-  }
-};
-
-/// Mapping function for scenario in which both rows and columns are separated by a stride.
-/// Column stride is smaller than row stride in AffineRank2RowMajor.
-struct AffineRank2RowMajor {
-
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 2;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = MatrixCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, LongIndex>;
-
-private:
-  //
-  // Data members
-  //
-
-  /// Stride data member
-  Stride stride_;
-
-public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  AffineRank2RowMajor(
-    Stride const &stride = Stride()
-  ):
-    stride_(stride) { }
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  AffineRank2RowMajor(
-    LongIndex row_stride,           ///< stride between elements in consecutive rows
-    LongIndex column_stride         ///< stride between elements in consecutive columns
-  ) { stride_[0] = row_stride; stride_[1] = column_stride;}
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  AffineRank2RowMajor(
-    LongIndex stride
-  ) { stride_[0] = stride; stride_[1] = 1;}
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static AffineRank2RowMajor packed(MatrixCoord const &extent) {
-    return AffineRank2RowMajor(1, extent.row());
-  }
-
-  /// Returns the offset of a coordinate in linear memory. 
-  /// Assumes coordinate has convention (row, column)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(MatrixCoord const &coord) const {
-    return dot(coord, stride_);
-  }
-
-  /// Inverse of layout function, mapping linear offset to logical coordinate
-  CUTLASS_HOST_DEVICE
-  MatrixCoord inverse(LongIndex offset) const {
-    CUTLASS_UNUSED(offset);
-    return MatrixCoord(0, 0);
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const {
-    return stride_;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride & stride() {
-    return stride_;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  typename Stride::Index stride(int idx) const {
-    return stride_[idx];
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  typename Stride::Index & stride(int idx) {
-    return stride_[idx];
-  }
-
-  /// Compute the number of contiguous elements needed to store a tensor with the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(MatrixCoord const &extent) const {
-    return extent.row() * stride_[0];
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Utility functions to convert stride_factor to the strides used by the Affine2 layout.
-//
-// stride_factor is the logical distance between two coorinates.
-//
-// All Coodinates used here are matrix coordinates.  stride[0] and extent[0] are for the
-// rows.  stride[1] and extent[1] are for the columns.
-template <typename Affine2Layout>
-  struct Affine2Layout_Factory {
-  CUTLASS_HOST_DEVICE
-  static Affine2Layout layout_factory(cutlass::Coord<2> const &extent, typename Affine2Layout::Stride stride_factor) {
-    return Affine2Layout::packed(extent);
-  }
-};
-
-template <>
-struct Affine2Layout_Factory<cutlass::layout::AffineRank2ColumnMajor> {
-CUTLASS_HOST_DEVICE
-static cutlass::layout::AffineRank2ColumnMajor layout_factory(
-  cutlass::Coord<2> const &extent,
-  typename cutlass::layout::AffineRank2ColumnMajor::Stride stride_factor) {
-    return cutlass::layout::AffineRank2ColumnMajor({ stride_factor[0], stride_factor[0] * stride_factor[1] * extent[0] });
-  }
-};
-
-template <>
-struct Affine2Layout_Factory<cutlass::layout::AffineRank2RowMajor> {
-CUTLASS_HOST_DEVICE
-static cutlass::layout::AffineRank2RowMajor layout_factory(
-  cutlass::Coord<2> const &extent,
-  typename cutlass::layout::AffineRank2RowMajor::Stride stride_factor) {
-    return cutlass::layout::AffineRank2RowMajor({ stride_factor[0] * stride_factor[1] * extent[1], stride_factor[1] });
-  }
-};
-
-// The base layout cutlass::layout::AffineRankN<2> is similar to AffineRank2ColumnMajor
-template <>
-struct Affine2Layout_Factory<cutlass::layout::AffineRankN<2>> {
-CUTLASS_HOST_DEVICE
-static cutlass::layout::AffineRankN<2> layout_factory(
-  cutlass::Coord<2> const &extent,
-  typename cutlass::layout::AffineRankN<2>::Stride stride_factor) {
-    return cutlass::layout::AffineRankN<2>({ stride_factor[0], stride_factor[0] * stride_factor[1] * extent[0] });
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Mapping function for block-linear matrices. Matrix is structured
-/// as column-major arrangement of 2D tiles (that are column-major).
-template <int BlockRows, int BlockColumns>
-struct ColumnMajorBlockLinear {
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = MatrixCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, LongIndex>;
-
-  /// Size of a block in rows
-  static int const kBlockRows = BlockRows;
-
-  /// Size of a block in columns
-  static int const kBlockColumns = BlockColumns;
-
-private:
-  //
-  // Data members
-  //
-
-  /// Stride data member
-  Stride stride_;
-
-public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  ColumnMajorBlockLinear(Index ldm = 0): stride_(ldm) { }
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static ColumnMajorBlockLinear packed(MatrixCoord const &extent) {
-    return ColumnMajorBlockLinear(extent.row() * kBlockRows * kBlockColumns);
-  }
-
-  /// Returns the offset of a coordinate in linear memory. 
-  /// Assumes coordinate has convention (row, column)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(MatrixCoord const &coord) const {
-    return 
-      (coord.row() % kBlockRows) + 
-      (coord.column() % kBlockColumns) * kBlockRows +
-      (coord.row() / kBlockRows) * kBlockRows * kBlockColumns +
-      (coord.column() / kBlockColumns) * stride_[0];
-  }
-
-  /// Inverse of layout function, mapping linear offset to logical coordinate
-  CUTLASS_HOST_DEVICE
-  MatrixCoord inverse(LongIndex offset) const {
-
-    return MatrixCoord(0, 0);
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const {
-    return stride_;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride & stride() {
-    return stride_;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  typename Stride::Index stride(int idx) const {
-    return stride_[idx];
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  typename Stride::Index & stride(int idx) {
-    return stride_[idx];
-  }
-
-  /// Compute the number of contiguous elements needed to store a tensor with the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(MatrixCoord const &extent) const {
-    return (extent.column() + kBlockColumns - 1) / kBlockColumns * stride_[0];
-  }
-};
-
-/// Mapping function for block-linear matrices. Matrix is structured
-/// as row-major arrangement of 2D tiles (that are row-major)
-template <int BlockRows, int BlockColumns>
-struct RowMajorBlockLinear {
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = MatrixCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, LongIndex>;
-
-  /// Size of a block in rows
-  static int const kBlockRows = BlockRows;
-
-  /// Size of a block in columns
-  static int const kBlockColumns = BlockColumns;
-
-private:
-  //
-  // Data members
-  //
-
-  /// Stride data member
-  Stride stride_;
-
-public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  RowMajorBlockLinear(Index ldm = 0): stride_(ldm) { }
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static RowMajorBlockLinear packed(MatrixCoord const &extent) {
-    return RowMajorBlockLinear(extent.column() * kBlockRows * kBlockColumns);
-  }
-
-  /// Returns the offset of a coordinate in linear memory. 
-  /// Assumes coordinate has convention (row, column)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(MatrixCoord const &coord) const {
-    return 
-      (coord.column() % kBlockColumns) +
-      (coord.row() % kBlockRows) * kBlockColumns +
-      (coord.column() / kBlockColumns) * kBlockRows * kBlockColumns +
-      (coord.row() / kBlockRows) * stride_[0];
-  }
-
-  /// Inverse of layout function, mapping linear offset to logical coordinate
-  CUTLASS_HOST_DEVICE
-  MatrixCoord inverse(LongIndex offset) const {
-    return MatrixCoord(0, 0);
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const {
-    return stride_;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride & stride() {
-    return stride_;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  typename Stride::Index stride(int idx) const {
-    return stride_[idx];
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  typename Stride::Index & stride(int idx) {
-    return stride_[idx];
-  }
-  
-  /// Compute the number of contiguous elements needed to store a tensor with the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(MatrixCoord const &extent) const {
-    return (extent.row() + kBlockRows - 1) / kBlockRows * stride_[0];
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-struct GeneralMatrix {
-
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 2;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = MatrixCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, Index>;
-
-private:
-  //
-  // Data members
-  //
-
-  Matrix layout_id_;
-
-  /// Stride data member
-  Stride stride_;
-
-public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  GeneralMatrix(): layout_id_(Matrix::kColumnMajor), stride_(make_Coord(0, 1)) { }
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  GeneralMatrix(
-    Matrix layout_id, 
-    Index ldm, 
-    Index interleave): layout_id_(layout_id), stride_(make_Coord(ldm, interleave)) { }
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static GeneralMatrix packed(
-    MatrixCoord const &extent, 
-    Matrix layout_id = Matrix::kColumnMajor, 
-    Index interleave = 1) {
-
-    Index c;
-    if (layout_id == Matrix::kRowMajor) {
-      c = extent.column();
-    }
-    else {
-      c = extent.row();
-    }
-
-    Index ldm = c * interleave;
-
-    return GeneralMatrix(layout_id, ldm, interleave);
-  }
-
-  /// Returns the offset of a coordinate in linear memory. 
-  /// Assumes coordinate has convention (row, column)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(MatrixCoord const &coord) const {
-    Index c, s;
-    if (layout_id_ == Matrix::kRowMajor) {
-      c = coord.column();
-      s = coord.row();
-    }
-    else {
-      s = coord.column();
-      c = coord.row();
-    }
-
-    Index v = s / stride_[1];
-    Index residual = (s % stride_[1]);
-
-    return LongIndex(c) * LongIndex(stride_[1]) + LongIndex(v) * LongIndex(stride_[0]) + residual;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const {
-    return stride_;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Matrix layout_id() const {
-    return layout_id_;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride & stride() {
-    return stride_;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Matrix & layout_id() {
-    return layout_id_;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  typename Stride::Index stride(int idx) const {
-    return stride_[idx];
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  typename Stride::Index & stride(int idx) {
-    return stride_[idx];
-  }
-  
-  /// Compute the number of contiguous elements needed to store a tensor with the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(MatrixCoord const &extent) const {
-    Index s;
-    if (layout_id_ == Matrix::kRowMajor) {
-      s = extent.row();
-    }
-    else {
-      s = extent.column();
-    }
-
-    Index v = Index((s + stride_[1] - 1) / stride_[1]);
-    return LongIndex(v) * LongIndex(stride_[0]);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines transposes of matrix layouts
-template <typename Layout>
-struct LayoutTranspose;
-
-/// Transpose of row-major is column-major
-template <>
-struct LayoutTranspose<layout::RowMajor> {
-  using type = layout::ColumnMajor;
-};
-
-/// Transpose of column-major is row-major
-template <>
-struct LayoutTranspose<layout::ColumnMajor> {
-  using type = layout::RowMajor;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace layout
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/layout/permute.h b/lightllm-kernel/cutlass/include/cutlass/layout/permute.h
deleted file mode 100755
index 912eb2c8c..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/layout/permute.h
+++ /dev/null
@@ -1,828 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines layout functions used by GEMM+permute path for common tensor or matrix formats.
-
-    Like Layout functions, permute layout functions map logical coordinates to linear memory. They often require additional
-    data to describe strides between elements.
-
-    Permute layout functions must implement all members in the interface of NoPermute<> defined in this file. Address offset
-    computation lies in operator() with private member variables  {col_permute_, row_permute_ and stride_} as new addresses after permute op.
-*/
-#pragma once
-#if defined(__CUDACC_RTC__)
-#include <cuda/std/cassert>
-#else
-#include "assert.h"
-#endif
-#include "cutlass/cutlass.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/coord.h"
-#include "cutlass/tensor_coord.h"
-
-namespace cutlass {
-namespace layout {
-
-// template<PermuteTag, typename Layout, bool Inverse>
-// struct PermuteSelect {
-//   // Try to give a reasonable error message to the user
-//   static_assert(!platform::is_same<Permute, Permute>::value, // aka always_false<T>
-//                 "You've tried to use a layout permutation for which the implementation is not availble. "
-//                 "In order to provide an implementation for a particular combination of matrix layout "
-//                 "and direction (direct/inverse), please specialize PermuteSelect trait.");
-// };
-
-// Base template for defining specializations of permutation inverses
-template<typename Permute>
-struct InversePermute
-{
-  // Try to give a reasonable error message to the user
-  static_assert(!platform::is_same<Permute, Permute>::value, // aka always_false<T>
-                "To apply permutation to a GEMM input operand (A or B), an inverse permutation for the desired "
-                "permute class must be defined and enabled by specializing cutlass::layout::InversePermute trait.");
-};
-
-class PermuteBase {
-public:
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-};
-
-class NoPermute : public PermuteBase {
-public:
-  //
-  // Methods
-  //
-
-  /// Constructor from matrix extent
-  CUTLASS_HOST_DEVICE
-  NoPermute(MatrixCoord extent, Index stride) { };
-
-  /// Constructor from pitch-linear extent
-  CUTLASS_HOST_DEVICE
-  NoPermute(PitchLinearCoord extent, Index stride) { };
-
-  /// Computes the offset after Permute Op in logical elements
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(MatrixCoord coord) const { return 0; } // not correct but should never be called
-
-  /// Computes the offset after Permute Op in logical elements
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(PitchLinearCoord coord) const { return 0; } // not correct but should never be called
-};
-
-template<>
-struct InversePermute<NoPermute> {
-  using type = NoPermute;
-};
-
-/// Helper trait to detect if permute operation is a noop
-template<typename Permute>
-inline bool constexpr is_trivial_permute = platform::is_same<Permute, cutlass::layout::NoPermute>::value;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Defines permute layouts of various tensor formats.
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//  Tensor4DPermute0213
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Permute layout function for 4-D permuted tensors with matrix (dimensions [M, N]) reshaped
-/// as [M/D1, D1, D2, N/D2]. Then perform permute([0, 2, 1, 3]) on the corresponding tensor.
-template <int D1, int D2>
-class Tensor4DPermute0213RowMajor : public PermuteBase {
-private:
-  //
-  // Data members
-  //
-
-  Index D3_;
-
-  Index stride_;
-  
-public:
-  //
-  // Methods
-  //
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  Tensor4DPermute0213RowMajor(MatrixCoord extent, Index stride) {
-
-    assert(extent.row() % D1 == 0);
-    assert(extent.column() % D2 == 0);
-
-    D3_ = extent.column() / D2;
-
-    stride_ = stride * D1 / D2;
-  }
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  Tensor4DPermute0213RowMajor(PitchLinearCoord extent, Index stride)
-  : Tensor4DPermute0213RowMajor(MatrixCoord(extent.strided(), extent.contiguous()), stride) {}
-  
-  /// Computes the offset after Permute Op in logical elements
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(MatrixCoord coord) const {
-
-    // [i,j,k,l] -> [i,k,j,l]
-    Index l = coord.column() % D3_;
-    Index k = coord.column() / D3_;
-    Index j = coord.row() % D1;
-    Index i = coord.row() / D1;
-
-    MatrixCoord permuted{k + i * D2, l + j * D3_};
-
-    return LongIndex(permuted.row()) * LongIndex(stride_) + LongIndex(permuted.column());
-  }
-
-  /// Computes the offset after Permute Op in logical elements
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(PitchLinearCoord coord) const { 
-    return operator()(MatrixCoord(coord.strided(), coord.contiguous()));
-  }
-};
-
-// Inverse for Tensor4DPermute0213 can be implemented by simply swapping D1 and D2
-template <int D1, int D2>
-class Tensor4DPermute0213RowMajorInverse : public Tensor4DPermute0213RowMajor<D2, D1> {
-public:
-  using Base = Tensor4DPermute0213RowMajor<D2, D1>;
-  using Base::Base;
-};
-
-template<int D1, int D2>
-struct InversePermute<Tensor4DPermute0213RowMajor<D1, D2>> {
-  using type = Tensor4DPermute0213RowMajorInverse<D1, D2>;
-};
-
-template<int D1, int D2>
-struct InversePermute<Tensor4DPermute0213RowMajorInverse<D1, D2>> {
-  using type = Tensor4DPermute0213RowMajor<D1, D2>;
-};
-
-/// Permute layout function for 4-D permuted tensors with matrix (dimensions [M, N]) reshaped
-/// as [M/D1, D1, D2, N/D2]. Then perform permute([0, 2, 1, 3]) on the corresponding tensor.
-template <int D1, int D2>
-class Tensor4DPermute0213ColumnMajor : public PermuteBase {
-private:
-  //
-  // Data members
-  //
-
-  Index D0_;
-
-  Index stride_;
-  
-public:
-  //
-  // Methods
-  //
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  Tensor4DPermute0213ColumnMajor(MatrixCoord extent, Index stride) {
-
-    assert(extent.row() % D1 == 0);
-    assert(extent.column() % D2 == 0);
-
-    D0_ = extent.row() / D1;
-
-    stride_ = stride * D2 / D1;
-  }
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  Tensor4DPermute0213ColumnMajor(PitchLinearCoord extent, Index stride)
-  : Tensor4DPermute0213ColumnMajor(MatrixCoord(extent.contiguous(), extent.strided()), stride) {}
-  
-  /// Computes the offset after Permute Op in logical elements
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(MatrixCoord coord) const {
-
-    // [i,j,k,l] -> [i,k,j,l]
-    Index l = coord.column() / D2;
-    Index k = coord.column() % D2;
-    Index j = coord.row() / D0_;
-    Index i = coord.row() % D0_;
-
-    MatrixCoord permuted{i + k * D0_, j + l * D1};
-
-    return LongIndex(permuted.row()) + LongIndex(permuted.column()) * LongIndex(stride_);
-  }
-
-  /// Computes the offset after Permute Op in logical elements
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(PitchLinearCoord coord) const { 
-    return operator()(MatrixCoord(coord.contiguous(), coord.strided()));
-  }
-};
-
-// Inverse for Tensor4DPermute0213 can be implemented by simply swapping D1 and D2
-template <int D1, int D2>
-class Tensor4DPermute0213ColumnMajorInverse : public Tensor4DPermute0213ColumnMajor<D2, D1> {
-public:
-  using Base = Tensor4DPermute0213ColumnMajor<D2, D1>;
-  using Base::Base;
-};
-
-template<int D1, int D2>
-struct InversePermute<Tensor4DPermute0213ColumnMajor<D1, D2>> {
-  using type = Tensor4DPermute0213ColumnMajorInverse<D1, D2>;
-};
-
-template<int D1, int D2>
-struct InversePermute<Tensor4DPermute0213ColumnMajorInverse<D1, D2>> {
-  using type = Tensor4DPermute0213ColumnMajor<D1, D2>;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//  Tensor4DPermuteBMM0213
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Permute layout function for 4-D permuted tensors for BMM with BMM tensor (dimensions [B, M, N]) reshaped
-/// as [B/D1, D1, M, N]. Then perform permute([0, 2, 1, 3]) on the corresponding whole BMM tensor.
-template <int D1>
-class Tensor4DPermuteBMM0213RowMajor : public PermuteBase {
-private:
-  //
-  // Data members
-  //
-
-  Index D3_;
-
-  Index stride_;
-
-  Index batch_stride_;
-  
-public:
-  //
-  // Methods
-  //
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  Tensor4DPermuteBMM0213RowMajor(MatrixCoord extent, Index stride) {
-
-    Index D2 = extent.row();
-    D3_ = extent.column();
-
-    stride_ = stride * D1;
-    batch_stride_ = D2 * stride_;
-  }
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  Tensor4DPermuteBMM0213RowMajor(PitchLinearCoord extent, Index stride)
-  : Tensor4DPermuteBMM0213RowMajor(MatrixCoord(extent.strided(), extent.contiguous()), stride) {}
-  
-  /// Computes the offset after Permute Op in logical elements
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(MatrixCoord coord) const {
-
-    // The batch index for BMM
-    Index BMM_batch_idx = blockIdx.z;
-    
-    // [i,j,k,l] -> [i,k,j,l]
-    Index l = coord.column();
-    Index k = coord.row();
-    Index j = BMM_batch_idx % D1;
-    Index i = BMM_batch_idx / D1;
-
-    Index pbatch = i;
-    MatrixCoord pcoord{k, l + j * D3_};
-
-    return pbatch * LongIndex(batch_stride_) + pcoord.row() * LongIndex(stride_) + pcoord.column();
-  }
-
-  /// Computes the offset after Permute Op in logical elements
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(PitchLinearCoord coord) const { 
-    return operator()(MatrixCoord(coord.strided(), coord.contiguous()));
-  }
-};
-
-template <int D1>
-class Tensor4DPermuteBMM0213RowMajorInverse : public PermuteBase {
-private:
-  //
-  // Data members
-  //
-
-  Index D3_;
-
-  Index stride_;
-
-  Index batch_stride_;
-  
-public:
-  //
-  // Methods
-  //
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  Tensor4DPermuteBMM0213RowMajorInverse(MatrixCoord extent, Index stride) {
-
-    assert(extent.column() % D1 == 0);
-
-    Index D2 = extent.row();
-    D3_ = extent.column() / D1;
-
-    stride_ = stride / D1;
-
-    batch_stride_ = D2 * stride_;
-  }
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  Tensor4DPermuteBMM0213RowMajorInverse(PitchLinearCoord extent, Index stride)
-  : Tensor4DPermuteBMM0213RowMajorInverse(MatrixCoord(extent.strided(), extent.contiguous()), stride) {}
-  
-  /// Computes the offset after Permute Op in logical elements
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(MatrixCoord coord) const {
-
-    // The batch index for BMM
-    Index BMM_batch_idx = blockIdx.z;
-    
-    // The following assumes grouping [(D0)->batch, (D2)->row, (D1,D3)->col]
-    Index l = coord.column() % D3_;
-    Index j = coord.column() / D3_;
-    Index k = coord.row();
-    Index i = BMM_batch_idx;
-
-    // compute original [batch, row, col] index
-    Index pbatch = j + i * D1;
-    MatrixCoord pcoord{k, l};
-
-    return pbatch * LongIndex(batch_stride_) + pcoord.row() * LongIndex(stride_) + pcoord.column();
-  }
-
-  /// Computes the offset after Permute Op in logical elements
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(PitchLinearCoord coord) const { 
-    return operator()(MatrixCoord(coord.strided(), coord.contiguous()));
-  }
-};
-
-template<int D1>
-struct InversePermute<Tensor4DPermuteBMM0213RowMajor<D1>> {
-  using type = Tensor4DPermuteBMM0213RowMajorInverse<D1>;
-};
-
-template<int D1>
-struct InversePermute<Tensor4DPermuteBMM0213RowMajorInverse<D1>> {
-  using type = Tensor4DPermuteBMM0213RowMajor<D1>;
-};
-
-/// Permute layout function for 4-D permuted tensors for BMM with BMM tensor (dimensions [B, M, N]) reshaped
-/// as [B/D1, D1, M, N]. Then perform permute([0, 3, 2, 1]) on the corresponding whole BMM tensor.
-template <int D1>
-class Tensor4DPermuteBMM0321ColumnMajor : public PermuteBase {
-private:
-  //
-  // Data members
-  //
-
-  Index D2_;
-
-  Index stride_;
-
-  Index batch_stride_;
-  
-public:
-  //
-  // Methods
-  //
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  Tensor4DPermuteBMM0321ColumnMajor(MatrixCoord extent, Index stride) {
-
-    D2_ = extent.row();
-    Index D3 = extent.column();
-
-    stride_ = stride * D1;
-    batch_stride_ = stride_ * D3;
-  }
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  Tensor4DPermuteBMM0321ColumnMajor(PitchLinearCoord extent, Index stride)
-  : Tensor4DPermuteBMM0321ColumnMajor(MatrixCoord(extent.contiguous(), extent.strided()), stride) {}
-  
-  /// Computes the offset after Permute Op in logical elements
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(MatrixCoord coord) const {
-
-    Index BMM_batch_idx = blockIdx.z;
-    
-    // [i,j,k,l] -> [i,k,j,l]
-    Index l = coord.column();
-    Index k = coord.row();
-    Index j = BMM_batch_idx % D1;
-    Index i = BMM_batch_idx / D1;
-
-    Index pbatch = i;
-    MatrixCoord pcoord{k + j * D2_, l};
-
-    return pbatch * LongIndex(batch_stride_) + pcoord.row() + pcoord.column() * LongIndex(stride_);
-  }
-
-  /// Computes the offset after Permute Op in logical elements
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(PitchLinearCoord coord) const { 
-    return operator()(MatrixCoord(coord.contiguous(), coord.strided()));
-  }
-};
-
-template <int D1>
-class Tensor4DPermuteBMM0321ColumnMajorInverse : public PermuteBase {
-private:
-  //
-  // Data members
-  //
-
-  Index D2_;
-
-  Index stride_;
-
-  Index batch_stride_;
-  
-public:
-  //
-  // Methods
-  //
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  Tensor4DPermuteBMM0321ColumnMajorInverse(MatrixCoord extent, Index stride) {
-
-    assert(extent.row() % D1 == 0);
-
-    D2_ = extent.row() / D1;
-    Index D3 = extent.column();
-
-    stride_ = stride / D1;
-    batch_stride_ = stride_ * D3;
-  }
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  Tensor4DPermuteBMM0321ColumnMajorInverse(PitchLinearCoord extent, Index stride)
-  : Tensor4DPermuteBMM0321ColumnMajorInverse(MatrixCoord(extent.contiguous(), extent.strided()), stride) {}
-  
-  /// Computes the offset after Permute Op in logical elements
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(MatrixCoord coord) const {
-
-    Index BMM_batch_idx = blockIdx.z;
-    
-    // The following assumes grouping [(D0)->batch, (D1,D2)->row, (D3)->col]
-    Index l = coord.column();
-    Index k = coord.row() % D2_;
-    Index j = coord.row() / D2_;
-    Index i = BMM_batch_idx;
-
-    Index pbatch = i * D1 + j;
-    MatrixCoord pcoord{k, l};
-
-    return pbatch * LongIndex(batch_stride_) + pcoord.row() + pcoord.column() * LongIndex(stride_);
-  }
-
-  /// Computes the offset after Permute Op in logical elements
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(PitchLinearCoord coord) const { 
-    return operator()(MatrixCoord(coord.contiguous(), coord.strided()));
-  }
-};
-
-template<int D1>
-struct InversePermute<Tensor4DPermuteBMM0321ColumnMajor<D1>> {
-  using type = Tensor4DPermuteBMM0321ColumnMajorInverse<D1>;
-};
-
-template<int D1>
-struct InversePermute<Tensor4DPermuteBMM0321ColumnMajorInverse<D1>> {
-  using type = Tensor4DPermuteBMM0321ColumnMajor<D1>;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//  Tensor5DPermute20314
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Permute layout function for 5-D permuted tensors with output matrix (dimension as [M, N]) reshaped
-/// as [M/T1, T1, T2, T3, N/T2/T3]. Then perform permute([2, 0, 3, 1, 4]) on the corresponding output tensor.
-template <int T1, int T2, int T3>
-class Tensor5DPermute20314RowMajor : public PermuteBase {
-private:
-  //
-  // Data members
-  //
-
-  Index T0_;
-
-  Index T4_;
-
-  Index stride_;
-  
-public:
-  //
-  // Methods
-  //
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  Tensor5DPermute20314RowMajor(MatrixCoord extent, Index stride) {
-
-    assert(extent.row() % T1 == 0);
-    assert(extent.column() % (T2 * T3) == 0);
-
-    T0_ = extent.row() / T1;
-    T4_ = extent.column() / (T2 * T3);
-
-    /// Update stride_permute with stride
-    stride_ = stride / T2 * T1; // stride in Elements
-  }
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  Tensor5DPermute20314RowMajor(PitchLinearCoord extent, Index stride)
-  : Tensor5DPermute20314RowMajor(MatrixCoord(extent.strided(), extent.contiguous()), stride) {}
-  
-  
-  /// Computes the offset after Permute Op in logical elements
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(MatrixCoord coord) const {
-
-    // Permute as torch.permute(X1, [2, 0, 3, 1, 4]) -> 5D Tensor indices as [i,j,k,l,m], the dimension of X 
-    // is [T0, T1, T2, T3, T4], after permutation the dim of X1 is [T2, T0, T3, T1, T4].
-
-    Index m = coord.column() % T4_;
-    Index l = (coord.column() / T4_) % T3;
-    Index k = (coord.column() / T4_) / T3;
-    Index j = coord.row() % T1;
-    Index i = coord.row() / T1;
-
-    MatrixCoord permuted{i + k * T0_, m + j * T4_ + l * T1 * T4_};
-
-    return LongIndex(permuted.row()) * LongIndex(stride_) + LongIndex(permuted.column());
-  }
-
-  /// Computes the offset after Permute Op in logical elements
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(PitchLinearCoord coord) const { 
-    return operator()(MatrixCoord(coord.strided(), coord.contiguous()));
-  }
-};
-
-/// Inverse for Tensor5DPermute20314 (could also be given a proper name, e.g. Tensor5DPermute13024).
-template <int T1, int T2, int T3>
-class Tensor5DPermute20314RowMajorInverse : public PermuteBase {
-private:
-  //
-  // Data members
-  //
-
-  Index T0_;
-
-  Index T4_;
-
-  // Permuted stride in units of elements
-  Index stride_;
-  
-public:
-  //
-  // Methods
-  //
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  Tensor5DPermute20314RowMajorInverse(MatrixCoord extent, Index stride) {
-
-    assert(extent.row() % T2 == 0);
-    assert(extent.column() % (T1 * T3) == 0);
-
-    T0_ = extent.row() / T2;
-    T4_ = extent.column() / (T1 * T3);
-
-    stride_ = stride / T1 * T2;
-  }
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  Tensor5DPermute20314RowMajorInverse(PitchLinearCoord extent, Index stride)
-  : Tensor5DPermute20314RowMajorInverse(MatrixCoord(extent.strided(), extent.contiguous()), stride) {}
-
-  /// Computes the offset after the inverse of permute operation in logical elements
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(MatrixCoord coord) const {
-
-    Index m = coord.column() % T4_;
-    Index j = (coord.column() / T4_) % T1;
-    Index l = (coord.column() / T4_) / T1;
-    Index i = coord.row() % T0_;
-    Index k = coord.row() / T0_;
-
-    MatrixCoord permuted{j + i * T1, m + l * T4_ + k * T3 * T4_};
-
-    return LongIndex(permuted.row()) * LongIndex(stride_) + LongIndex(permuted.column());
-  }
-
-  /// Computes the offset after Permute Op in logical elements
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(PitchLinearCoord coord) const { 
-    return operator()(MatrixCoord(coord.strided(), coord.contiguous()));
-  }
-};
-
-template<int T1, int T2, int T3>
-struct InversePermute<Tensor5DPermute20314RowMajor<T1, T2, T3>> {
-  using type = Tensor5DPermute20314RowMajorInverse<T1, T2, T3>;
-};
-
-template<int T1, int T2, int T3>
-struct InversePermute<Tensor5DPermute20314RowMajorInverse<T1, T2, T3>> {
-  using type = Tensor5DPermute20314RowMajor<T1, T2, T3>;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Tensor5DPermute02413
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Permute layout function for 5-D permuted tensors with matrix (dimensions [M, N]) reshaped
-/// as [M/T1, T1, T2, T3, N/T2/T3]. Then perform permute([0, 2, 4, 1, 3]) on the corresponding tensor.
-template <int T1, int T2, int T3>
-class Tensor5DPermute02413ColumnMajor : public PermuteBase {
-private:
-  //
-  // Data members
-  //
-
-  Index T0_;
-
-  Index T4_;
-
-  Index stride_;
-  
-public:
-  //
-  // Methods
-  //
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  Tensor5DPermute02413ColumnMajor(MatrixCoord extent, Index stride) {
-
-    assert(extent.row() % T1 == 0);
-    assert(extent.column() % (T2 * T3) == 0);
-
-    T0_ = extent.row() / T1;
-    T4_ = extent.column() / (T2 * T3);
-
-    /// Update stride_permute with stride
-    stride_ = stride / T1 * T2; // stride in Elements
-  }
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  Tensor5DPermute02413ColumnMajor(PitchLinearCoord extent, Index stride)
-  : Tensor5DPermute02413ColumnMajor(MatrixCoord(extent.contiguous(), extent.strided()), stride) {}
-  
-  /// Computes the offset after Permute Op in logical elements
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(MatrixCoord coord) const {
-
-    // Permute as torch.permute(X1, [2, 0, 3, 1, 4]) -> 5D Tensor indices as [i,j,k,l,m], the dimension of X 
-    // is [T0, T1, T2, T3, T4], after permutation the dim of X1 is [T0, T2, T4, T1, T3].
-
-    Index m = (coord.column() / T2) / T3;
-    Index l = (coord.column() / T2) % T3;
-    Index k = coord.column() % T2;
-    Index j = coord.row() / T0_;
-    Index i = coord.row() % T0_;
-
-    MatrixCoord permuted{i + k * T0_, m + j * T4_ + l * T4_ * T1};
-
-    return LongIndex(permuted.row()) + LongIndex(permuted.column()) * LongIndex(stride_);
-  }
-
-  /// Computes the offset after Permute Op in logical elements
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(PitchLinearCoord coord) const { 
-    return operator()(MatrixCoord(coord.contiguous(), coord.strided()));
-  }
-};
-
-/// Inverse for Tensor5DPermute02413ColumnMajor
-template <int T1, int T2, int T3>
-class Tensor5DPermute02413ColumnMajorInverse : public PermuteBase {
-private:
-  //
-  // Data members
-  //
-
-  Index T0_;
-
-  Index T4_;
-
-  // Permuted stride in units of elements
-  Index stride_;
-  
-public:
-  //
-  // Methods
-  //
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  Tensor5DPermute02413ColumnMajorInverse(MatrixCoord extent, Index stride) {
-
-    assert(extent.row() % T2 == 0);
-    assert(extent.column() % (T1 * T3) == 0);
-
-    T0_ = extent.row() / T2;
-    T4_ = extent.column() / (T1 * T3);
-
-    stride_ = stride / T2 * T1;
-  }
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  Tensor5DPermute02413ColumnMajorInverse(PitchLinearCoord extent, Index stride)
-  : Tensor5DPermute02413ColumnMajorInverse(MatrixCoord(extent.contiguous(), extent.strided()), stride) {}
-
-  /// Computes the offset after the inverse of permute operation in logical elements
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(MatrixCoord coord) const {
-
-    Index m = coord.column() % T4_;
-    Index j = (coord.column() / T4_) % T1;
-    Index l = (coord.column() / T4_) / T1;
-    Index i = coord.row() % T0_;
-    Index k = coord.row() / T0_;
-
-    MatrixCoord permuted{i + j * T0_, k + l * T2 + m * T2 * T3};
-
-    return LongIndex(permuted.row()) + LongIndex(permuted.column()) * LongIndex(stride_);
-  }
-
-  /// Computes the offset after Permute Op in logical elements
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(PitchLinearCoord coord) const { 
-    return operator()(MatrixCoord(coord.contiguous(), coord.strided()));
-  }
-};
-
-template<int T1, int T2, int T3>
-struct InversePermute<Tensor5DPermute02413ColumnMajor<T1, T2, T3>> {
-  using type = Tensor5DPermute02413ColumnMajorInverse<T1, T2, T3>;
-};
-
-template<int T1, int T2, int T3>
-struct InversePermute<Tensor5DPermute02413ColumnMajorInverse<T1, T2, T3>> {
-  using type = Tensor5DPermute02413ColumnMajor<T1, T2, T3>;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace layout
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/layout/pitch_linear.h b/lightllm-kernel/cutlass/include/cutlass/layout/pitch_linear.h
deleted file mode 100755
index 8c9540f40..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/layout/pitch_linear.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines layout functions used by TensorRef and derived classes for pitch-linear memory.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/coord.h"
-#include "cutlass/pitch_linear_coord.h"
-
-namespace cutlass {
-namespace layout {
-
-template <int Contiguous, int Strided>
-  using PitchLinearShape = cutlass::PitchLinearShape < Contiguous, Strided >;
-  using PitchLinearCoord = PitchLinearCoord;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Mapping function for pitch-linear memory
-class PitchLinear {
-public:
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = PitchLinearCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, LongIndex>;
-
-private:
-  //
-  // Data members
-  //
-
-  /// Stride data member
-  Stride stride_;
-
-public:
-  //
-  // Methods
-  //
-  
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  PitchLinear(LongIndex ldm = 0): stride_(ldm) { }
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  PitchLinear(Stride _stride): stride_(_stride) { }
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static PitchLinear packed(TensorCoord const &extent) {
-    return PitchLinear(extent.contiguous());
-  }
-
-  /// Returns the offset of a coordinate in linear memory. 
-  /// Assumes coordinate has convention (contiguous, strided)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-    return LongIndex(coord.contiguous()) + LongIndex(coord.strided()) * LongIndex(stride_[0]);
-  }
-
-  /// Returns the logical coordinate given an offset.
-  CUTLASS_HOST_DEVICE
-  TensorCoord inverse(LongIndex index) const {
-    return make_Coord(
-      TensorCoord::Index(index % stride_[0]),
-      TensorCoord::Index(index / stride_[0])
-    );
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const {
-    return stride_;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride & stride() {
-    return stride_;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  LongIndex stride(int rank) const {
-    return stride_[rank];
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  LongIndex & stride(int rank) {
-    return stride_[rank];
-  }
-
-  /// Compute the number of contiguous elements needed to store a tensor with the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &extent) const {
-    return extent.strided() * stride_[0];
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace layout
-} // namespace cutlass
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/layout/tensor.h b/lightllm-kernel/cutlass/include/cutlass/layout/tensor.h
deleted file mode 100755
index 8374fe31d..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/layout/tensor.h
+++ /dev/null
@@ -1,648 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines layout functions used by TensorRef and derived classes for common 4-D and 5-D
-      tensor formats.
-
-    Layout functions map logical coordinates to linear memory. They often require additional
-    data to describe strides between elements.
-
-    Layout functions must implement all members in the public interface of IdentityTensorLayout<>
-    defined in cutlass/tensor_ref.h.
-*/
-#pragma once
-#if defined(__CUDACC_RTC__)
-#include <cuda/std/cassert>
-#else
-#include "assert.h"
-#endif
-#include "cutlass/cutlass.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/coord.h"
-#include "cutlass/tensor_coord.h"
-
-namespace cutlass {
-namespace layout {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Defines data layouts of various tensor formats usable by TensorRef and other classes.
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Tag used for 3-D NWC tensors for 1-D convolutions; only used in 3.x API
-class TensorNWC {};
-
-/// Tag used for n-D KCSRT tensors for n-D convolutions; only used in 3.x API for wgrad output layouts
-class TensorKCS {};
-class TensorKCSR {};
-class TensorKCSRT {};
-
-/// Tag used for n-D CSRTK tensors for n-D convolutions; only used in 3.x API for wgrad output layouts
-class TensorCSK {};
-class TensorCSRK {};
-class TensorCSRTK {};
-
-/// Mapping function for 4-D NHWC tensors.
-class TensorNHWC {
-public:
-  /// Logical rank of tensor
-  static int const kRank = 4;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 3;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate (n, h, w, c)
-  using TensorCoord = Tensor4DCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank>;
-
-private:
-  //
-  // Data members
-  //
-
-  /// Stride data member - [stride_w, stride_h, stride_n]
-  Stride stride_;
-
-public:
-  //
-  // Methods
-  //
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  TensorNHWC(Stride const &stride = Stride(0)): stride_(stride) { }
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  TensorNHWC(
-    typename Stride::Index stride_w,    ///< number of elements between adjacent W coordinates
-    typename Stride::Index stride_h,    ///< number of elements between adjacent H coordinates
-    typename Stride::Index stride_n     ///< number of elements between adjacent N coordinates
-  ): 
-    stride_(make_Coord(stride_w, stride_h, stride_n)) { }
-
-  /// Constructor
-  // Once convolutions implement 64b stride this ctor can be deleted
-  CUTLASS_HOST_DEVICE
-  TensorNHWC(Coord<kStrideRank, LongIndex> const &stride): 
-    stride_(make_Coord(
-      static_cast<typename Stride::Index>(stride[0]), 
-      static_cast<typename Stride::Index>(stride[1]), 
-      static_cast<typename Stride::Index>(stride[2]))
-    ) { }
-
-  /// Helper returns a layout to a tightly packed NHWC tensor.
-  CUTLASS_HOST_DEVICE
-  static TensorNHWC packed(TensorCoord const &extent) {
-    return TensorNHWC(
-      make_Coord(
-        extent.c(), 
-        extent.w() * extent.c(),
-        extent.h() * extent.w() * extent.c()
-      )
-    );
-  }
-  
-  /// Returns the offset of a coordinate (n, h, w, c) in linear memory. 
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-    return coord.c() + 
-      LongIndex(stride_[0] * coord.w()) + 
-      LongIndex(stride_[1] * coord.h()) +
-      LongIndex(stride_[2] * coord.n());
-  }
-  
-  /// Returns the offset of a pitchlinear coordinate in linear memory. 
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(PitchLinearCoord coord) const {
-    return coord.contiguous() + LongIndex(coord.strided() * stride_[2]);
-  }
-
-  /// Returns the logical coordinate (n, h, w, c) from a given offset in linear memory.
-  CUTLASS_HOST_DEVICE
-  TensorCoord inverse(LongIndex index) const {
-
-    int n = 0, h = 0, w = 0, c = 0;
-
-    #if defined(__CUDA_ARCH__)
-    int tmp = 0;
-    c = int(index % static_cast<int>(stride_[0]));
-
-    unsigned int hw_mul, hw_shr, w_mul, w_shr, c_mul, c_shr;
-
-    find_divisor(hw_mul, hw_shr, stride_[2]);
-    find_divisor(w_mul, w_shr, stride_[1]);
-    find_divisor(c_mul, c_shr, stride_[0]);
-
-    fast_divmod(n, tmp, index, int(stride_[2]), hw_mul, hw_shr);
-    fast_divmod(h, w, tmp, int(stride_[1]), w_mul, w_shr);
-    fast_divmod(w, tmp, w, int(stride_[0]), c_mul, c_shr);
-    #else
-
-    n = int(index / stride_[2]);
-    LongIndex residual = index % stride_[2];
-
-    h = int(residual / stride_[1]);
-    residual = (residual % stride_[1]);
-
-    w = int(residual / stride_[0]);
-    c = int(residual % stride_[0]);
-
-    #endif
-    return TensorCoord(n, h, w, c);
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const {
-    return stride_;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride & stride() {
-    return stride_;
-  }
-
-  /// Compute the number of contiguous elements needed to store a tensor with the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &extent) const {
-    // it does not make sense if the extent is larger than stride
-    // and we could not rely on the capacity calculation in such cases
-    // we could move this checkers to debug code only
-    if ((extent.c() > stride_[0])
-        || (extent.w() * stride_[0] > stride_[1]) 
-        || (extent.h() * stride_[1] > stride_[2])) {
-      assert(0);
-    }
-    return extent.n() * stride_[2];
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Mapping function for 4-D NCHW tensors.
-class TensorNCHW {
-public:
-  /// Logical rank of tensor
-  static int const kRank = 4;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 3;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = Tensor4DCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank>;
-
-private:
-  //
-  // Data members
-  //
-
-  /// Stride data member - [w, hw, chw]
-  Stride stride_;
-
-public:
-  //
-  // Methods
-  //
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  TensorNCHW(Stride const &stride = Stride(0)): stride_(stride) { }
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static TensorNCHW packed(TensorCoord const &extent) {
-    return TensorNCHW(
-      make_Coord(
-        extent.w(),
-        extent.w() * extent.h(),
-        extent.h() * extent.w() * extent.c()
-      )
-    );
-  }
-
-  /// Returns the offset of a coordinate in linear memory. 
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-    return coord.w() + 
-      LongIndex(stride_[0] * coord.h()) + 
-      LongIndex(stride_[1] * coord.c()) + 
-      LongIndex(stride_[2] * coord.n());
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const {
-    return stride_;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride & stride() {
-    return stride_;
-  }
-
-  /// Compute the number of contiguous elements needed to store a tensor with the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &extent) const {
-    return extent.n() * stride_[2];
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Mapping function for 4-D NC/xHWx tensors.
-template <int Interleave>
-class TensorNCxHWx {
-public:
-
-  /// Interleaving quantity
-  static int const kInterleave = Interleave;
-
-  /// Logical rank of tensor
-  static int const kRank = 4;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 3;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = Tensor4DCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank>;
-
-private:
-  //
-  // Data members
-  //
-
-  /// Stride data member - [Interleave x w, Interleave x wh, hwc]
-  Stride stride_;
-
-public:
-  //
-  // Methods
-  //
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  TensorNCxHWx(Stride const &stride = Stride(0)): stride_(stride) { }
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  TensorNCxHWx(
-    typename Stride::Index stride_w,    ///< number of elements between adjacent W coordinates
-    typename Stride::Index stride_h,    ///< number of elements between adjacent H coordinates
-    typename Stride::Index stride_n     ///< number of elements between adjacent N coordinates
-  ):
-    stride_(make_Coord(stride_w, stride_h, stride_n)) { }
-
-  /// Constructor
-  // Once convolutions implement 64b stride this ctor can be deleted
-  CUTLASS_HOST_DEVICE
-  TensorNCxHWx(Coord<kStrideRank, LongIndex> const &stride): 
-    stride_(make_Coord(
-      static_cast<typename Stride::Index>(stride[0]), 
-      static_cast<typename Stride::Index>(stride[1]), 
-      static_cast<typename Stride::Index>(stride[2]))
-    ) { }
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static TensorNCxHWx packed(TensorCoord const &extent) {
-    return TensorNCxHWx(
-      make_Coord(
-        kInterleave * extent.w(),
-        kInterleave * extent.w() * extent.h(),
-        extent.h() * extent.w() * extent.c()
-      )
-    );
-  }
-
-  /// Returns the offset of a coordinate in linear memory. 
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-
-    Index c_minor = (coord.c() % kInterleave);
-    Index c_major = (coord.c() / kInterleave);
-
-    return c_minor + 
-      LongIndex(kInterleave * coord.w()) + 
-      LongIndex(stride_[0] * coord.h()) + 
-      LongIndex(stride_[1] * c_major) + 
-      LongIndex(stride_[2] * coord.n());
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const {
-    return stride_;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride & stride() {
-    return stride_;
-  }
-
-  /// Compute the number of contiguous elements needed to store a tensor with the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &extent) const {
-    return extent.n() * stride_[2];
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Mapping function for 4-D CxRSKx tensors.
-template <int Interleave>
-class TensorCxRSKx {
-public:
-
-  /// Interleaving quantity
-  static int const kInterleave = Interleave;
-
-  /// Logical rank of tensor
-  static int const kRank = 4;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 3;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = Tensor4DCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank>;
-
-private:
-  //
-  // Data members
-  //
-
-  /// Stride data member - [Interleave x n, Interleave x nw, Interleave x nwh]
-  Stride stride_;
-
-public:
-  //
-  // Methods
-  //
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  TensorCxRSKx(Stride const &stride = Stride(0)): stride_(stride) { }
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  TensorCxRSKx(
-    typename Stride::Index stride_w,    ///< number of elements between adjacent W coordinates
-    typename Stride::Index stride_h,    ///< number of elements between adjacent H coordinates
-    typename Stride::Index stride_n     ///< number of elements between adjacent N coordinates
-  ):
-    stride_(make_Coord(stride_w, stride_h, stride_n)) { }
-
-  /// Constructor
-  // Once convolutions implement 64b stride this ctor can be deleted
-  CUTLASS_HOST_DEVICE
-  TensorCxRSKx(Coord<kStrideRank, LongIndex> const &stride): 
-    stride_(make_Coord(
-      static_cast<typename Stride::Index>(stride[0]), 
-      static_cast<typename Stride::Index>(stride[1]), 
-      static_cast<typename Stride::Index>(stride[2]))
-    ) { }
-
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static TensorCxRSKx packed(TensorCoord const &extent) {
-    return TensorCxRSKx(
-      make_Coord(
-        kInterleave * extent.n(),
-        kInterleave * extent.n() * extent.w(),
-        kInterleave * extent.n() * extent.w() * extent.h()
-      )
-    );
-  }
-
-  /// Returns the offset of a coordinate in linear memory. 
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-
-    Index c_minor = (coord.c() % kInterleave);
-    Index c_major = (coord.c() / kInterleave);
-
-    return c_minor + 
-      LongIndex(kInterleave * coord.n()) + 
-      LongIndex(stride_[0] * coord.w()) + 
-      LongIndex(stride_[1] * coord.h()) + 
-      LongIndex(stride_[2] * c_major);
-  }
-
-  /// Returns the offset of a pitchlinear coordinate in linear memory. 
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(PitchLinearCoord const &coord) const {
-    return (coord.contiguous() % kInterleave) +
-      LongIndex((coord.contiguous() / kInterleave) * stride_[2]) +
-      LongIndex(coord.strided() * kInterleave);
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const {
-    return stride_;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride & stride() {
-    return stride_;
-  }
-
-  /// Compute the number of contiguous elements needed to store a tensor with the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &extent) const {
-    return (extent.c() / kInterleave * stride_[2]);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Mapping function for 5-D NDHWC tensors.
-class TensorNDHWC {
-public:
-  /// Logical rank of tensor
-  static int const kRank = 5;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 4;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate (n, d, h, w, c)
-  using TensorCoord = Tensor5DCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank>;
-
-private:
-  //
-  // Data members
-  //
-
-  /// Stride data member - [c, wc, hwc, dhwc]
-  Stride stride_;
-
-public:
-  //
-  // Methods
-  //
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  TensorNDHWC(Stride const &stride = Stride(0)): stride_(stride) { }
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  TensorNDHWC(
-    typename Stride::Index c, 
-    typename Stride::Index wc, 
-    typename Stride::Index hwc, 
-    typename Stride::Index dhwc): 
-  stride_(make_Coord(c, wc, hwc, dhwc)) { }
-
-  /// Constructor
-  // Once convolutions implement 64b stride this ctor can be deleted
-  CUTLASS_HOST_DEVICE
-  TensorNDHWC(Coord<kStrideRank, LongIndex> const &stride): 
-    stride_(make_Coord(
-      static_cast<typename Stride::Index>(stride[0]), 
-      static_cast<typename Stride::Index>(stride[1]), 
-      static_cast<typename Stride::Index>(stride[2]),
-      static_cast<typename Stride::Index>(stride[3]))
-    ) { }
-
-  /// Helper returns a layout to a tightly packed NHWC tensor.
-  CUTLASS_HOST_DEVICE
-  static TensorNDHWC packed(TensorCoord const &extent) {
-    return TensorNDHWC(
-      make_Coord(
-        extent.c(), 
-        extent.w() * extent.c(),
-        extent.h() * extent.w() * extent.c(),
-        extent.d() * extent.h() * extent.w() * extent.c()
-      )
-    );
-  }
-  
-  /// Returns the offset of a coordinate (n, d, h, w, c) in linear memory. 
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-    return coord.c() + 
-      LongIndex(stride_[0] * coord.w()) + 
-      LongIndex(stride_[1] * coord.h()) +
-      LongIndex(stride_[2] * coord.d()) +
-      LongIndex(stride_[3] * coord.n());
-  }
-
-  /// Returns the offset of a pitchlinear coordinate in linear memory. 
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(PitchLinearCoord coord) const {
-    return coord.contiguous() + LongIndex(coord.strided() * stride_[3]);
-  }
-  
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const {
-    return stride_;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride & stride() {
-    return stride_;
-  }
-
-  /// Compute the number of contiguous elements needed to store a tensor with the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &extent) const {
-    // it does not make sense if the extent is larger than stride
-    // and we could not rely on the capacity calculation in such cases
-    // we could move this checkers to debug code only
-    if ((extent.c() > stride_[0])
-        || (extent.w() * stride_[0] > stride_[1]) 
-        || (extent.h() * stride_[1] > stride_[2])
-        || (extent.d() * stride_[2] > stride_[3])) {
-      assert(0);
-    }
-    return extent.n() * stride_[3];
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace layout
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/layout/tensor_op_multiplicand_sm70.h b/lightllm-kernel/cutlass/include/cutlass/layout/tensor_op_multiplicand_sm70.h
deleted file mode 100755
index 4691b9829..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/layout/tensor_op_multiplicand_sm70.h
+++ /dev/null
@@ -1,1044 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief 
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/coord.h"
-#include "cutlass/layout/pitch_linear.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace layout {
-
-// template <
-//   int ElementSize,
-//   gemm::Operand Operand
-// >
-// struct VoltaTensorOpMultiplicandCongruous;
-
-// template <
-//   int ElementSize,
-//   gemm::Operand Operand
-// >
-// struct ColumnMajorVoltaTensorOpMultiplicandCongruous;
-// template <
-//   int ElementSize,
-//   gemm::Operand Operand
-// >
-// struct RowMajorVoltaTensorOpMultiplicandCongruous;
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Template based on element size (in bits) - defined in terms of pitch-linear memory.
-template <int ElementSize>
-struct VoltaTensorOpMultiplicandCongruous {
-
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = PitchLinearCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, Index, LongIndex>;
-
-  //
-  // Invariants
-  //
-
-  /// This layout is optimized for 128b accesses
-  static int const kAccessSize = 128;
-
-  /// Fundamental tile shape in units of vectors
-  using TileShape = PitchLinearShape<8, 4>;
-
-  /// Fundamental partition shape in units of vectors
-  using PartitionShape = PitchLinearShape<8, 2>;
-
-  //
-  // Static constants
-  //
-
-  static int const kElementSize = ElementSize;
-  static int const kElementsPerAccess = kAccessSize / kElementSize;
-  
-  using PartitionCount = PitchLinearShape<
-    TileShape::kContiguous / PartitionShape::kContiguous,
-    TileShape::kStrided / PartitionShape::kStrided
-  >;
-
-  using AccessCount = PitchLinearShape<
-    PartitionShape::kContiguous,
-    PartitionShape::kStrided
-  >;
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Stride data member
-  Stride stride_;
-
-public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  VoltaTensorOpMultiplicandCongruous(Index ldm = 0): stride_(ldm) { }
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  VoltaTensorOpMultiplicandCongruous(Stride stride): stride_(stride) { }
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static VoltaTensorOpMultiplicandCongruous packed(TensorCoord const &extent) {
-    return VoltaTensorOpMultiplicandCongruous(extent[0]);
-  }
-
-  /// Returns the offset of a coordinate in linear memory. 
-  /// Assumes coordinate has convention (contiguous, strided)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-    
-    // First, compute c and s of vector within source (in units of vector accesses)
-    int vec_contiguous_idx = coord.contiguous() / kElementsPerAccess;
-    int vec_strided_idx = coord.strided();
-
-    // Compute the fundamental tile being accessed
-    int tile_contiguous_idx = vec_contiguous_idx / TileShape::kContiguous;
-    int tile_strided_idx = vec_strided_idx / TileShape::kStrided;
-
-    int tile_contiguous_residual = vec_contiguous_idx % TileShape::kContiguous;
-    int tile_strided_residual = vec_strided_idx % TileShape::kStrided;
-
-    // Then swizzle in a tile
-    // Swizzle pattern is (tid[2:0] << 2)|(tid[4:3] ^ tid[2:1])
-    int permuted_strided_within_tile = (tile_contiguous_residual >> 1);
-    int permuted_contiguous_within_tile = (tile_strided_residual ^ permuted_strided_within_tile) |
-                                       ((tile_contiguous_residual & 1) << 2);
-    // Compute final element location
-    int element_contiguous = (tile_contiguous_idx * TileShape::kContiguous +
-        permuted_contiguous_within_tile) * kElementsPerAccess + (coord.contiguous() % kElementsPerAccess);
-
-    int element_strided = tile_strided_idx * TileShape::kStrided + permuted_strided_within_tile;
-
-    return element_contiguous + element_strided * stride_[0];
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const {
-    return stride_;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride & stride() {
-    return stride_;
-  }
-
-  /// Compute the number of contiguous elements needed to store a tensor with the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &extent) const {
-    return extent[1] * stride_[0];
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Template mapping a column-major view of pitch-linear memory to VoltaTensorOpMultiplicandCongruous
-template <int ElementSize>
-struct ColumnMajorVoltaTensorOpMultiplicandCongruous {
-
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = MatrixCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, Index, LongIndex>;
-
-  //
-  // Invariants
-  //
-
-  using Base = VoltaTensorOpMultiplicandCongruous<ElementSize>;
-
-  /// This layout is optimized for 128b accesses
-  static int const kAccessSize = Base::kAccessSize;
-  using TileShape = typename Base::TileShape;
-  using PartitionShape = typename Base::PartitionShape;
-
-  //
-  // Static constants
-  //
-
-  static int const kElementSize = Base::kElementSize;
-  static int const kElementsPerAccess = Base::kElementsPerAccess;
-  using PartitionCount =  typename Base::PartitionCount;
-  using AccessCount = typename Base::AccessCount;
-
-private:
-
-  //
-  // Data members
-  //
-
-  Base layout_;
-
-public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  ColumnMajorVoltaTensorOpMultiplicandCongruous(Index ldm = 0): layout_(ldm) { }
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  ColumnMajorVoltaTensorOpMultiplicandCongruous(Stride stride): layout_(stride) { }
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static ColumnMajorVoltaTensorOpMultiplicandCongruous packed(TensorCoord const &extent) {
-    return ColumnMajorVoltaTensorOpMultiplicandCongruous(extent.row());
-  }
-
-  /// Returns the offset of a coordinate in linear memory. 
-  /// Assumes coordinate has convention (contiguous, strided)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-    return layout_(PitchLinearCoord(coord.row(), coord.column()));
-  }
-
-  /// Inverse of layout function, mapping linear offset to logical coordinate
-  CUTLASS_HOST_DEVICE
-  TensorCoord inverse(LongIndex offset) const {
-    PitchLinearCoord coord = layout_.inverse(offset);
-    return MatrixCoord(coord.contiguous(), coord.strided());
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const {
-    return layout_.stride();
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride & stride() {
-    return layout_.stride();
-  }
-
-  /// Compute the number of contiguous elements needed to store a tensor with the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &extent) const {
-    return layout_.capacity(PitchLinearCoord(extent.row(), extent.column()));
-  }
-};
-
-/// Template mapping a row-major view of pitch-linear memory to VoltaTensorOpMultiplicandCongruous
-template <int ElementSize>
-struct RowMajorVoltaTensorOpMultiplicandCongruous {
-
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = MatrixCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, Index, LongIndex>;
-
-  //
-  // Invariants
-  //
-
-  using Base = VoltaTensorOpMultiplicandCongruous<ElementSize>;
-
-  /// This layout is optimized for 128b accesses
-  static int const kAccessSize = Base::kAccessSize;
-  using TileShape = typename Base::TileShape;
-  using PartitionShape = typename Base::PartitionShape;
-
-  //
-  // Static constants
-  //
-
-  static int const kElementSize = Base::kElementSize;
-  static int const kElementsPerAccess = Base::kElementsPerAccess;
-  using PartitionCount =  typename Base::PartitionCount;
-  using AccessCount = typename Base::AccessCount;
-
-private:
-
-  //
-  // Data members
-  //
-
-  Base layout_;
-
-public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  RowMajorVoltaTensorOpMultiplicandCongruous(Index ldm = 0): layout_(ldm) { }
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  RowMajorVoltaTensorOpMultiplicandCongruous(Stride stride): layout_(stride) { }
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static RowMajorVoltaTensorOpMultiplicandCongruous packed(TensorCoord const &extent) {
-    return RowMajorVoltaTensorOpMultiplicandCongruous(extent.column());
-  }
-
-  /// Returns the offset of a coordinate in linear memory. 
-  /// Assumes coordinate has convention (contiguous, strided)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-    return layout_(PitchLinearCoord(coord.column(), coord.row()));
-  }
-
-  /// Inverse of layout function, mapping linear offset to logical coordinate
-  CUTLASS_HOST_DEVICE
-  TensorCoord inverse(LongIndex offset) const {
-    PitchLinearCoord coord = layout_.inverse(offset);
-    return MatrixCoord(coord.strided(), coord.contiguous());
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const {
-    return layout_.stride();
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride & stride() {
-    return layout_.stride();
-  }
-
-  /// Compute the number of contiguous elements needed to store a tensor with the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &extent) const {
-    return layout_.capacity(PitchLinearCoord(extent.column(), extent.row()));
-  }
-};
-
-
-/// Template based on element size (in bits) - defined in terms of pitch-linear memory.
-// template <int ElementSize, Operand Operand>
-template <int ElementSize>
-struct VoltaTensorOpMultiplicandBCongruous {
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = PitchLinearCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, Index, LongIndex>;
-
-  //
-  // Invariants
-  //
-
-  /// This layout is optimized for 128b accesses
-  static int const kAccessSize = 128;
-
-  /// Fundamental tile shape in units of vectors
-  using TileShape = PitchLinearShape<8, 4>;
-
-  /// Fundamental partition shape in units of vectors
-  using PartitionShape = PitchLinearShape<4, 4>;
-
-  //
-  // Static constants
-  //
-
-  static int const kElementSize = ElementSize;
-  static int const kElementsPerAccess = kAccessSize / kElementSize;
-  
-  using PartitionCount = PitchLinearShape<
-    TileShape::kContiguous / PartitionShape::kContiguous,
-    TileShape::kStrided / PartitionShape::kStrided
-  >;
-
-  using AccessCount = PitchLinearShape<
-    PartitionShape::kContiguous,
-    PartitionShape::kStrided
-  >;
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Stride data member
-  Stride stride_;
-
-public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  VoltaTensorOpMultiplicandBCongruous(Index ldm = 0): stride_(ldm) { }
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  VoltaTensorOpMultiplicandBCongruous(Stride stride): stride_(stride) { }
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static VoltaTensorOpMultiplicandBCongruous packed(TensorCoord const &extent) {
-    return VoltaTensorOpMultiplicandBCongruous(extent[0]);
-  }
-
-  /// Returns the offset of a coordinate in linear memory. 
-  /// Assumes coordinate has convention (contiguous, strided)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-    
-    // First, compute c and s of vector within source (in units of vector accesses)
-    int vec_contiguous_idx = coord.contiguous() / kElementsPerAccess;
-    int vec_strided_idx = coord.strided();
-
-    // Compute the fundamental tile being accessed
-    int tile_contiguous_idx = vec_contiguous_idx / TileShape::kContiguous;
-    int tile_strided_idx = vec_strided_idx / TileShape::kStrided;
-
-    int tile_contiguous_residual = vec_contiguous_idx % TileShape::kContiguous;
-    int tile_strided_residual = vec_strided_idx % TileShape::kStrided;
-
-    // Then swizzle in a tile
-    // Swizzle pattern is (tid[1:0] << 3)|(tid & 0x4)|(tid[1:0])
-    int permuted_strided_within_tile = (tile_contiguous_residual & 0x3);
-    int permuted_contiguous_within_tile = (tile_strided_residual ^ permuted_strided_within_tile) |
-                                       (tile_contiguous_residual & 0x4);
-  
-    // Compute final element location
-    int element_contiguous = (tile_contiguous_idx * TileShape::kContiguous +
-        permuted_contiguous_within_tile) * kElementsPerAccess + (coord.contiguous() % kElementsPerAccess);
-
-    int element_strided = tile_strided_idx * TileShape::kStrided + permuted_strided_within_tile;
-
-    return element_contiguous + element_strided * stride_[0];
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE 
-  Stride stride() const {
-    return stride_;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride & stride() {
-    return stride_;
-  }
-
-  /// Compute the number of contiguous elements needed to store a tensor with the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &extent) const {
-    return extent[1] * stride_[0];
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Template mapping a column-major view of pitch-linear memory to VoltaTensorOpMultiplicandCongruous
-template <int ElementSize>
-struct ColumnMajorVoltaTensorOpMultiplicandBCongruous {
-
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = MatrixCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, Index, LongIndex>;
-
-  //
-  // Invariants
-  //
-
-  using Base = VoltaTensorOpMultiplicandBCongruous<ElementSize>;
-
-  /// This layout is optimized for 128b accesses
-  static int const kAccessSize = Base::kAccessSize;
-  using TileShape = typename Base::TileShape;
-  using PartitionShape = typename Base::PartitionShape;
-
-  //
-  // Static constants
-  //
-
-  static int const kElementSize = Base::kElementSize;
-  static int const kElementsPerAccess = Base::kElementsPerAccess;
-  using PartitionCount =  typename Base::PartitionCount;
-  using AccessCount = typename Base::AccessCount;
-
-private:
-
-  //
-  // Data members
-  //
-
-  Base layout_;
-
-public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  ColumnMajorVoltaTensorOpMultiplicandBCongruous(Index ldm = 0): layout_(ldm) { }
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  ColumnMajorVoltaTensorOpMultiplicandBCongruous(Stride stride): layout_(stride) { }
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static ColumnMajorVoltaTensorOpMultiplicandBCongruous packed(TensorCoord const &extent) {
-    return ColumnMajorVoltaTensorOpMultiplicandBCongruous(extent.row());
-  }
-
-  /// Returns the offset of a coordinate in linear memory. 
-  /// Assumes coordinate has convention (contiguous, strided)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-    return layout_(PitchLinearCoord(coord.row(), coord.column()));
-  }
-
-  /// Inverse of layout function, mapping linear offset to logical coordinate
-  CUTLASS_HOST_DEVICE
-  TensorCoord inverse(LongIndex offset) const {
-    PitchLinearCoord coord = layout_.inverse(offset);
-    return MatrixCoord(coord.contiguous(), coord.strided());
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const {
-    return layout_.stride();
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride & stride() {
-    return layout_.stride();
-  }
-
-  /// Compute the number of contiguous elements needed to store a tensor with the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &extent) const {
-    return layout_.capacity(PitchLinearCoord(extent.row(), extent.column()));
-  }
-};
-
-/// Template mapping a row-major view of pitch-linear memory to VoltaTensorOpMultiplicandCongruous
-template <int ElementSize>
-struct RowMajorVoltaTensorOpMultiplicandBCongruous {
-
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = MatrixCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, Index, LongIndex>;
-
-  //
-  // Invariants
-  //
-
-  using Base = VoltaTensorOpMultiplicandBCongruous<ElementSize>;
-
-  /// This layout is optimized for 128b accesses
-  static int const kAccessSize = Base::kAccessSize;
-  using TileShape = typename Base::TileShape;
-  using PartitionShape = typename Base::PartitionShape;
-
-  //
-  // Static constants
-  //
-
-  static int const kElementSize = Base::kElementSize;
-  static int const kElementsPerAccess = Base::kElementsPerAccess;
-  using PartitionCount =  typename Base::PartitionCount;
-  using AccessCount = typename Base::AccessCount;
-
-private:
-
-  //
-  // Data members
-  //
-
-  Base layout_;
-
-public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  RowMajorVoltaTensorOpMultiplicandBCongruous(Index ldm = 0): layout_(ldm) { }
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  RowMajorVoltaTensorOpMultiplicandBCongruous(Stride stride): layout_(stride) { }
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static RowMajorVoltaTensorOpMultiplicandBCongruous packed(TensorCoord const &extent) {
-    return RowMajorVoltaTensorOpMultiplicandBCongruous(extent.column());
-  }
-
-  /// Returns the offset of a coordinate in linear memory. 
-  /// Assumes coordinate has convention (contiguous, strided)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-    return layout_(PitchLinearCoord(coord.column(), coord.row()));
-  }
-
-  /// Inverse of layout function, mapping linear offset to logical coordinate
-  CUTLASS_HOST_DEVICE
-  TensorCoord inverse(LongIndex offset) const {
-    PitchLinearCoord coord = layout_.inverse(offset);
-    return MatrixCoord(coord.strided(), coord.contiguous());
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const {
-    return layout_.stride();
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride & stride() {
-    return layout_.stride();
-  }
-
-  /// Compute the number of contiguous elements needed to store a tensor with the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &extent) const {
-    return layout_.capacity(PitchLinearCoord(extent.column(), extent.row()));
-  }
-};
-
-/// Template based on element size (in bits) - defined in terms of pitch-linear
-/// memory and KBlock size (in elements).
-template <int ElementSize, int KBlock>
-struct VoltaTensorOpMultiplicandCrosswise {
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = PitchLinearCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, Index, LongIndex>;
-
-  //
-  // Invariants
-  //
-
-  /// This layout is optimized for 64b accesses
-  static int const kAccessSize = 64;
-
-  //
-  // Static constants
-  //
-
-  static int const kElementSize = ElementSize;
-  static int const kElementsPerAccess = kAccessSize / kElementSize;
-  static int const kKBlock = KBlock;
-
- private:
-  //
-  // Data members
-  //
-
-  /// Stride data member. For GEMM, it equals to KBlock x stage.
-  Stride stride_;
- public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  VoltaTensorOpMultiplicandCrosswise(Index ldm = 0) : stride_(ldm) {}
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  VoltaTensorOpMultiplicandCrosswise(Stride stride) : stride_(stride) {}
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static VoltaTensorOpMultiplicandCrosswise packed(TensorCoord const &extent) {
-    return VoltaTensorOpMultiplicandCrosswise(extent[1]);
-  }
-
-  /// Returns the offset of a coordinate in linear memory.
-  /// Assumes coordinate has convention (contiguous, strided)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-
-    //
-    // First, compute c and s of vector within source (in units of vector
-    // accesses)
-    //
-    int vec_contiguous_idx = coord.contiguous() / kElementsPerAccess;
-    int vec_strided_idx = coord.strided();
-
-    //
-    // Then swizzle
-    // The mapping is like this:
-    // id[1:0]|(id[3]^id[4])|id[2]
-
-    int vec_strided_within_tile = vec_contiguous_idx & 0x7;
-    int permuted_vec_contiguous =
-        (vec_strided_idx & (~0xF)) + (vec_strided_idx & 0x3) * 4 +
-        (((vec_strided_idx >> 2) ^ ((vec_strided_idx & 0x10) >> 3)) & 0x3);
-
-    permuted_vec_contiguous ^= ((vec_strided_within_tile >> 1) & 0x3);
-
-    int permuted_vec_strided = vec_contiguous_idx;
-
-    //
-    // Compute final element location
-    //
-
-    int element_contiguous = permuted_vec_contiguous *  kElementsPerAccess + 
-                             (coord.contiguous() % kElementsPerAccess);
-    
-    return element_contiguous + permuted_vec_strided * (stride_[0] * kElementsPerAccess);
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const { return stride_; }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride &stride() { return stride_; }
-
-  /// Compute the number of contiguous elements needed to store a tensor with
-  /// the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &extent) const {
-    return extent[0] * stride_[0];
-  }
-};
-
-/// Template mapping a column-major view of pitch-linear memory to
-/// VoltaTensorOpMultiplicandCrosswise
-template <int ElementSize, int KBlock>
-struct ColumnMajorVoltaTensorOpMultiplicandCrosswise {
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = MatrixCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, Index, LongIndex>;
-
-  //
-  // Invariants
-  //
-
-  using Base = VoltaTensorOpMultiplicandCrosswise<ElementSize, KBlock>;
-
-  /// This layout is optimized for 64b accesses
-  static int const kAccessSize = Base::kAccessSize;
-
-  //
-  // Static constants
-  //
-
-  static int const kElementSize = Base::kElementSize;
-  static int const kElementsPerAccess = Base::kElementsPerAccess;
-
- private:
-  //
-  // Data members
-  //
-
-  Base layout_;
-
- public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  ColumnMajorVoltaTensorOpMultiplicandCrosswise(Index ldm = 0) : layout_(ldm) {}
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  ColumnMajorVoltaTensorOpMultiplicandCrosswise(Stride stride) : layout_(stride) {}
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static ColumnMajorVoltaTensorOpMultiplicandCrosswise packed(
-      TensorCoord const &extent) {
-    return ColumnMajorVoltaTensorOpMultiplicandCrosswise(extent.column());
-  }
-
-  /// Returns the offset of a coordinate in linear memory.
-  /// Assumes coordinate has convention (contiguous, strided)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-    return layout_(PitchLinearCoord(coord.row(), coord.column()));
-  }
-
-  /// Inverse of layout function, mapping linear offset to logical coordinate
-  CUTLASS_HOST_DEVICE
-  TensorCoord inverse(LongIndex offset) const {
-    PitchLinearCoord coord = layout_.inverse(offset);
-    return MatrixCoord(coord.contiguous(), coord.strided());
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const { return layout_.stride(); }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride &stride() { return layout_.stride(); }
-
-  /// Compute the number of contiguous elements needed to store a tensor with
-  /// the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &extent) const {
-    return layout_.capacity(PitchLinearCoord(extent.row(), extent.column()));
-  }
-};
-
-/// Template mapping a row-major view of pitch-linear memory to
-/// TensorOpMultiplicandCrosswise
-template <int ElementSize, int KBlock>
-struct RowMajorVoltaTensorOpMultiplicandCrosswise {
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = MatrixCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, Index, LongIndex>;
-
-  //
-  // Invariants
-  //
-
-  using Base = VoltaTensorOpMultiplicandCrosswise<ElementSize, KBlock>;
-
-  /// This layout is optimized for 64b accesses
-  static int const kAccessSize = Base::kAccessSize;
-
-  //
-  // Static constants
-  //
-
-  static int const kElementSize = Base::kElementSize;
-  static int const kElementsPerAccess = Base::kElementsPerAccess;
-
- private:
-  //
-  // Data members
-  //
-
-  Base layout_;
-
- public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  RowMajorVoltaTensorOpMultiplicandCrosswise(Index ldm = 0) : layout_(ldm) {}
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  RowMajorVoltaTensorOpMultiplicandCrosswise(Stride stride) : layout_(stride) {}
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static RowMajorVoltaTensorOpMultiplicandCrosswise packed(
-      TensorCoord const &extent) {
-    return RowMajorVoltaTensorOpMultiplicandCrosswise(extent.row());
-  }
-
-  /// Returns the offset of a coordinate in linear memory.
-  /// Assumes coordinate has convention (contiguous, strided)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-    return layout_(PitchLinearCoord(coord.column(), coord.row()));
-  }
-
-  /// Inverse of layout function, mapping linear offset to logical coordinate
-  CUTLASS_HOST_DEVICE
-  TensorCoord inverse(LongIndex offset) const {
-    PitchLinearCoord coord = layout_.inverse(offset);
-    return MatrixCoord(coord.strided(), coord.contiguous());
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const { return layout_.stride(); }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride &stride() { return layout_.stride(); }
-
-  /// Compute the number of contiguous elements needed to store a tensor with
-  /// the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &extent) const {
-    return layout_.capacity(PitchLinearCoord(extent.column(), extent.row()));
-  }
-};
-
-} // namespace layout
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/layout/tensor_op_multiplicand_sm75.h b/lightllm-kernel/cutlass/include/cutlass/layout/tensor_op_multiplicand_sm75.h
deleted file mode 100755
index 1cda44286..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/layout/tensor_op_multiplicand_sm75.h
+++ /dev/null
@@ -1,1169 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief 
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/coord.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/layout/pitch_linear.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace layout {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Template based on element size (in bits) - defined in terms of pitch-linear
-/// memory and Crosswise size (in elements).
-/// This one is the base class of all Ampere/Turing fp16/bf16/int8/int4/int1
-/// tensor core kernels.  tf32 TN uses this too.
-template <int ElementSize, int Crosswise>
-struct TensorOpMultiplicand {
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = PitchLinearCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, Index, LongIndex>;
-
-  //
-  // Static constants
-  //
-
-  /// This layout is optimized for 128b accesses
-  static int const kAccessSize = 128;
-
-  static int const kElementSize = ElementSize;
-  static int const kElementsPerAccess = kAccessSize / kElementSize;
-  static int const kCrosswise = Crosswise;
-
-  /// Contiguous dimension of the tile shape matches one shared memory cache
-  /// line - 128B.  For 128bit access size, it equals to 8 accesses.
-  static int const kTileShapeContiguous = 128 / (kAccessSize / 8);
-
-  /// Number of kblocks to store PartitionShape::kContiguous Elements
-  static int const kFactor =
-      kTileShapeContiguous * kElementsPerAccess / kCrosswise;
-
-  static_assert(
-      (kFactor > 0),
-      "kCrosswise should be no large than one shared memory cache line.");
-
-  /// The strided dimension needs to be at least (WarpSize(32) /
-  /// kTileShapeContiguous) for a warp to access.  To ensure conflict free
-  /// access, it also needs to be at least (kTileShapeContiguous / kFactor).
-  /// See comments below
-  static int const kTileShapeStride =
-      ((kTileShapeContiguous / kFactor) > (32 / kTileShapeContiguous))
-          ? (kTileShapeContiguous / kFactor)
-          : (32 / kTileShapeContiguous);
-
-  /// Fundamental tile shape in units of vectors to guarantee bank conflict free
-  /// shared memory load/store.
-  /// For kFactor = 1, TileShape = <8, 8> 
-  /// For kFactor > 1, TileShape = <8, 4>
-  using TileShape = PitchLinearShape<kTileShapeContiguous, kTileShapeStride>;
-
-  /// Fundamental partition shape in units of vectors
-  using PartitionShape = PitchLinearShape<4, 4>;
-
-  using PartitionCount =
-      PitchLinearShape<TileShape::kContiguous / PartitionShape::kContiguous,
-                       TileShape::kStrided / PartitionShape::kStrided>;
-
-  using AccessCount =
-      PitchLinearShape<PartitionShape::kContiguous, PartitionShape::kStrided>;
-
- private:
-  //
-  // Data members
-  //
-
-  /// Stride data member. For GEMM, it equals to kCrosswise x stage.
-  Stride stride_;
-
- public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  TensorOpMultiplicand(Index ldm = 0) : stride_(ldm) {}
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  TensorOpMultiplicand(Stride stride) : stride_(stride) {}
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static TensorOpMultiplicand packed(TensorCoord const &extent) {
-    return TensorOpMultiplicand(extent[0]);
-  }
-
-  /// Returns the offset of a coordinate in linear memory.
-  /// Assumes coordinate has convention (contiguous, strided)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-    //
-    // First, compute c and s of vector within source (in units of vector
-    // accesses)
-    //
-
-    int vec_contiguous_idx = coord.contiguous() / kElementsPerAccess;
-    int vec_strided_idx = coord.strided() / kFactor;
-
-    // Compute the fundamental tile being accessed
-    int tile_contiguous_idx =
-        vec_contiguous_idx / (TileShape::kContiguous / kFactor);
-
-    int tile_contiguous_residual =
-        vec_contiguous_idx % (TileShape::kContiguous / kFactor) +
-        ((coord.strided() % kFactor) * (TileShape::kContiguous / kFactor));
-    int tile_strided_residual = vec_strided_idx % TileShape::kStrided;
-
-    // Compute the 'partition' within the fundamental tile
-    int partition_contiguous_idx =
-        tile_contiguous_residual / PartitionShape::kContiguous;
-    int partition_strided_idx =
-        tile_strided_residual / PartitionShape::kStrided;
-
-    int partition_contiguous_residual =
-        tile_contiguous_residual % PartitionShape::kContiguous;
-    int partition_strided_residual =
-        tile_strided_residual % PartitionShape::kStrided;
-
-    //
-    // Then swizzle
-    //
-
-    int permuted_vec_contiguous_within_partition =
-        partition_contiguous_residual ^ (partition_strided_residual % 4);
-
-    int permuted_partition_contiguous_within_tile =
-        partition_contiguous_idx ^ (partition_strided_idx % 2);
-
-    //
-    // Compute final element location
-    //
-
-    int element_contiguous = (tile_contiguous_idx * TileShape::kContiguous +
-                              permuted_partition_contiguous_within_tile *
-                                  PartitionShape::kContiguous +
-                              permuted_vec_contiguous_within_partition) *
-                                 kElementsPerAccess +
-                             (coord.contiguous() % kElementsPerAccess);
-
-    int element_strided = vec_strided_idx;
-
-    return element_contiguous + element_strided * stride_[0] * kFactor;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const { return stride_; }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride &stride() { return stride_; }
-
-  /// Compute the number of contiguous elements needed to store a tensor with
-  /// the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &extent) const {
-    return extent[1] * stride_[0];
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Template based on element size (in bits) - defined in terms of pitch-linear
-/// memory and Crosswise size (in elements).
-template <int ElementSize, int Crosswise>
-struct TensorOpMultiplicandCongruous {
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = PitchLinearCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, Index, LongIndex>;
-
-  //
-  // Invariants
-  //
-
-  using Base = TensorOpMultiplicand<ElementSize, Crosswise>;
-
-  /// This layout is optimized for 128b accesses
-  static int const kAccessSize = Base::kAccessSize;
-  using TileShape = typename Base::TileShape;
-  using PartitionShape = typename Base::PartitionShape;
-
-  //
-  // Static constants
-  //
-
-  static int const kElementSize = Base::kElementSize;
-  static int const kElementsPerAccess = Base::kElementsPerAccess;
-  static int const kCrosswise = Base::kCrosswise;
-  static int const kFactor = Base::kFactor;
-  using PartitionCount =  typename Base::PartitionCount;
-  using AccessCount = typename Base::AccessCount;
-
- private:
-  //
-  // Data members
-  //
-
-  Base layout_;
-
- public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  TensorOpMultiplicandCongruous(Index ldm = 0) : layout_(ldm) {}
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  TensorOpMultiplicandCongruous(Stride stride) : layout_(stride) {}
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static TensorOpMultiplicandCongruous packed(TensorCoord const &extent) {
-    return TensorOpMultiplicandCongruous(extent[0]);
-  }
-
-  /// Returns the offset of a coordinate in linear memory.
-  /// Assumes coordinate has convention (contiguous, strided)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-    return layout_(coord);
-  }
-
-  /// Inverse of layout function, mapping linear offset to logical coordinate
-  CUTLASS_HOST_DEVICE
-  TensorCoord inverse(LongIndex offset) const {
-    PitchLinearCoord coord = layout_.inverse(offset);
-    return coord;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const { return layout_.stride(); }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride &stride() { return layout_.stride(); }
-
-  /// Compute the number of contiguous elements needed to store a tensor with
-  /// the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &extent) const {
-    return layout_.capacity(extent);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Template based on element size (in bits) - defined in terms of pitch-linear
-/// memory and Crosswise size (in elements).
-/// This one is just for TF32 NT kernel.
-template <int Crosswise>
-struct TensorOpMultiplicandCongruous<32, Crosswise> {
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = PitchLinearCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, Index, LongIndex>;
-
-  //
-  // Invariants
-  //
-
-  /// This layout is optimized for 128b accesses
-  static int const kAccessSize = 128;
-
-  /// Fundamental tile shape in units of vectors
-  using TileShape = PitchLinearShape<8, 4>;
-
-  /// Partitionshape is the same as TileShape for this layout
-  using PartitionShape = PitchLinearShape<8, 4>;
-
-  using PartitionCount =
-      PitchLinearShape<TileShape::kContiguous / PartitionShape::kContiguous,
-                       TileShape::kStrided / PartitionShape::kStrided>;
-
-  using AccessCount =
-      PitchLinearShape<PartitionShape::kContiguous, PartitionShape::kStrided>;
-
-  //
-  // Static constants
-  //
-  static int const kElementSize = 32;
-  static int const kElementsPerAccess = kAccessSize / kElementSize;
-  static int const kCrosswise = Crosswise;
-  static int const kFactor = 1;
-
- private:
-  //
-  // Data members
-  //
-
-  /// Stride data member.
-  Stride stride_;
-
- public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  TensorOpMultiplicandCongruous(Index ldm = 0) : stride_(ldm) {}
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  TensorOpMultiplicandCongruous(Stride stride) : stride_(stride) {}
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static TensorOpMultiplicandCongruous packed(TensorCoord const &extent) {
-    return TensorOpMultiplicandCongruous(extent[0]);
-  }
-
-  /// Returns the offset of a coordinate in linear memory.
-  /// Assumes coordinate has convention (contiguous, strided)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-    int tc = coord.contiguous() / 32;
-    int ts = coord.strided() / 4;
-
-    int c = (coord.contiguous() % 32) / kElementsPerAccess;
-    int s = coord.strided() % 4;
-
-    LongIndex offset = (c ^ (2 * s)) * kElementsPerAccess + s * stride_[0] +
-                       tc * 32 + ts * stride_[0] * 4 + coord.contiguous() % 4;
-
-    return offset;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const { return stride_; }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride &stride() { return stride_; }
-
-  /// Compute the number of contiguous elements needed to store a tensor with
-  /// the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &extent) const {
-    return extent[1] * stride_[0];
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Template mapping a column-major view of pitch-linear memory to
-/// TensorOpMultiplicand
-template <int ElementSize, int Crosswise>
-struct ColumnMajorTensorOpMultiplicandCongruous {
-
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = MatrixCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, Index, LongIndex>;
-
-  //
-  // Invariants
-  //
-
-  using Base = TensorOpMultiplicandCongruous<ElementSize, Crosswise>;
-
-  /// This layout is optimized for 128b accesses
-  static int const kAccessSize = Base::kAccessSize;
-  using TileShape = typename Base::TileShape;
-  using PartitionShape = typename Base::PartitionShape;
-
-  //
-  // Static constants
-  //
-
-  static int const kElementSize = Base::kElementSize;
-  static int const kElementsPerAccess = Base::kElementsPerAccess;
-  static int const kCrosswise = Base::kCrosswise;
-  static int const kFactor = Base::kFactor;
-  using PartitionCount =  typename Base::PartitionCount;
-  using AccessCount = typename Base::AccessCount;
-
-private:
-
-  //
-  // Data members
-  //
-
-  Base layout_;
-
-public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  ColumnMajorTensorOpMultiplicandCongruous(Index ldm = 0): layout_(ldm) { }
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  ColumnMajorTensorOpMultiplicandCongruous(Stride stride): layout_(stride) { }
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static ColumnMajorTensorOpMultiplicandCongruous packed(TensorCoord const &extent) {
-    return ColumnMajorTensorOpMultiplicandCongruous(extent.row());
-  }
-
-  /// Returns the offset of a coordinate in linear memory. 
-  /// Assumes coordinate has convention (contiguous, strided)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-    return layout_(PitchLinearCoord(coord.row(), coord.column()));
-  }
-
-  /// Inverse of layout function, mapping linear offset to logical coordinate
-  CUTLASS_HOST_DEVICE
-  TensorCoord inverse(LongIndex offset) const {
-    PitchLinearCoord coord = layout_.inverse(offset);
-    return MatrixCoord(coord.contiguous(), coord.strided());    
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const {
-    return layout_.stride();
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride & stride() {
-    return layout_.stride();
-  }
-
-  /// Compute the number of contiguous elements needed to store a tensor with the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &extent) const {
-    return layout_.capacity(PitchLinearCoord(extent.row(), extent.column()));
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Template mapping a row-major view of pitch-linear memory to
-/// TensorOpMultiplicand
-template <int ElementSize, int Crosswise>
-struct RowMajorTensorOpMultiplicandCongruous {
-
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = MatrixCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, Index, LongIndex>;
-
-  //
-  // Invariants
-  //
-
-  using Base = TensorOpMultiplicandCongruous<ElementSize, Crosswise>;
-
-  /// This layout is optimized for 128b accesses
-  static int const kAccessSize = Base::kAccessSize;
-  using TileShape = typename Base::TileShape;
-  using PartitionShape = typename Base::PartitionShape;
-
-  //
-  // Static constants
-  //
-
-  static int const kElementSize = Base::kElementSize;
-  static int const kElementsPerAccess = Base::kElementsPerAccess;
-  static int const kCrosswise = Base::kCrosswise;
-  static int const kFactor = Base::kFactor;
-  using PartitionCount =  typename Base::PartitionCount;
-  using AccessCount = typename Base::AccessCount;
-
-private:
-
-  //
-  // Data members
-  //
-
-  Base layout_;
-
-public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  RowMajorTensorOpMultiplicandCongruous(Index ldm = 0): layout_(ldm) { }
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  RowMajorTensorOpMultiplicandCongruous(Stride stride): layout_(stride) { }
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static RowMajorTensorOpMultiplicandCongruous packed(TensorCoord const &extent) {
-    return RowMajorTensorOpMultiplicandCongruous(extent.column());
-  }
-
-  /// Returns the offset of a coordinate in linear memory. 
-  /// Assumes coordinate has convention (contiguous, strided)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-    return layout_(PitchLinearCoord(coord.column(), coord.row()));
-  }
-
-  /// Inverse of layout function, mapping linear offset to logical coordinate
-  CUTLASS_HOST_DEVICE
-  TensorCoord inverse(LongIndex offset) const {
-    PitchLinearCoord coord = layout_.inverse(offset);
-    return MatrixCoord(coord.strided(), coord.contiguous());
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const {
-    return layout_.stride();
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride & stride() {
-    return layout_.stride();
-  }
-
-  /// Compute the number of contiguous elements needed to store a tensor with the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &extent) const {
-    return layout_.capacity(PitchLinearCoord(extent.column(), extent.row()));
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Template based on element size (in bits) - defined in terms of pitch-linear
-/// memory and Crosswise size (in elements).
-template <int ElementSize, int Crosswise>
-struct TensorOpMultiplicandCrosswise {
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = PitchLinearCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, Index, LongIndex>;
-
-  //
-  // Invariants
-  //
-
-  using Base = TensorOpMultiplicand<ElementSize, Crosswise>;
-
-  /// This layout is optimized for 128b accesses
-  static int const kAccessSize = Base::kAccessSize;
-  using TileShape = typename Base::TileShape;
-  using PartitionShape = typename Base::PartitionShape;
-
-  //
-  // Static constants
-  //
-
-  static int const kElementSize = Base::kElementSize;
-  static int const kElementsPerAccess = Base::kElementsPerAccess;
-  static int const kCrosswise = Base::kCrosswise;
-  static int const kFactor = Base::kFactor;
-  using PartitionCount =  typename Base::PartitionCount;
-  using AccessCount = typename Base::AccessCount;
-
- private:
-  //
-  // Data members
-  //
-
-  Base layout_;
-
- public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  TensorOpMultiplicandCrosswise(Index ldm = 0) : layout_(ldm) {}
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  TensorOpMultiplicandCrosswise(Stride stride) : layout_(stride) {}
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static TensorOpMultiplicandCrosswise packed(TensorCoord const &extent) {
-    return TensorOpMultiplicandCrosswise(extent[0]);
-  }
-
-  /// Returns the offset of a coordinate in linear memory.
-  /// Assumes coordinate has convention (contiguous, strided)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-    return layout_(coord);
-  }
-
-  /// Inverse of layout function, mapping linear offset to logical coordinate
-  CUTLASS_HOST_DEVICE
-  TensorCoord inverse(LongIndex offset) const {
-    PitchLinearCoord coord = layout_.inverse(offset);
-    return coord;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const { return layout_.stride(); }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride &stride() { return layout_.stride(); }
-
-  /// Compute the number of contiguous elements needed to store a tensor with
-  /// the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &extent) const {
-    return layout_.capacity(extent);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Template mapping a column-major view of pitch-linear memory to
-/// TensorOpMultiplicandCrosswise
-template <int ElementSize, int Crosswise>
-struct ColumnMajorTensorOpMultiplicandCrosswise {
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = MatrixCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, Index, LongIndex>;
-
-  //
-  // Invariants
-  //
-
-  using Base = TensorOpMultiplicandCrosswise<ElementSize, Crosswise>;
-
-  /// This layout is optimized for 128b accesses
-  static int const kAccessSize = Base::kAccessSize;
-  using TileShape = typename Base::TileShape;
-  using PartitionShape = typename Base::PartitionShape;
-
-  //
-  // Static constants
-  //
-
-  static int const kElementSize = Base::kElementSize;
-  static int const kElementsPerAccess = Base::kElementsPerAccess;
-  using PartitionCount = typename Base::PartitionCount;
-  using AccessCount = typename Base::AccessCount;
-
- private:
-  //
-  // Data members
-  //
-
-  Base layout_;
-
- public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  ColumnMajorTensorOpMultiplicandCrosswise(Index ldm = 0) : layout_(ldm) {}
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  ColumnMajorTensorOpMultiplicandCrosswise(Stride stride) : layout_(stride) {}
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static ColumnMajorTensorOpMultiplicandCrosswise packed(
-      TensorCoord const &extent) {
-    return ColumnMajorTensorOpMultiplicandCrosswise(extent.row());
-  }
-
-  /// Returns the offset of a coordinate in linear memory.
-  /// Assumes coordinate has convention (contiguous, strided)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-    return layout_(PitchLinearCoord(coord.row(), coord.column()));
-  }
-
-  /// Inverse of layout function, mapping linear offset to logical coordinate
-  CUTLASS_HOST_DEVICE
-  TensorCoord inverse(LongIndex offset) const {
-    PitchLinearCoord coord = layout_.inverse(offset);
-    return MatrixCoord(coord.contiguous(), coord.strided());
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const { return layout_.stride(); }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride &stride() { return layout_.stride(); }
-
-  /// Compute the number of contiguous elements needed to store a tensor with
-  /// the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &extent) const {
-    return layout_.capacity(PitchLinearCoord(extent.row(), extent.column()));
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Template mapping a row-major view of pitch-linear memory to
-/// TensorOpMultiplicandCrosswise
-template <int ElementSize, int Crosswise>
-struct RowMajorTensorOpMultiplicandCrosswise {
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = MatrixCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, Index, LongIndex>;
-
-  //
-  // Invariants
-  //
-
-  using Base = TensorOpMultiplicandCrosswise<ElementSize, Crosswise>;
-
-  /// This layout is optimized for 128b accesses
-  static int const kAccessSize = Base::kAccessSize;
-  using TileShape = typename Base::TileShape;
-  using PartitionShape = typename Base::PartitionShape;
-
-  //
-  // Static constants
-  //
-
-  static int const kElementSize = Base::kElementSize;
-  static int const kElementsPerAccess = Base::kElementsPerAccess;
-  using PartitionCount = typename Base::PartitionCount;
-  using AccessCount = typename Base::AccessCount;
-
- private:
-  //
-  // Data members
-  //
-
-  Base layout_;
-
- public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  RowMajorTensorOpMultiplicandCrosswise(Index ldm = 0) : layout_(ldm) {}
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  RowMajorTensorOpMultiplicandCrosswise(Stride stride) : layout_(stride) {}
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static RowMajorTensorOpMultiplicandCrosswise packed(
-      TensorCoord const &extent) {
-    return RowMajorTensorOpMultiplicandCrosswise(extent.column());
-  }
-
-  /// Returns the offset of a coordinate in linear memory.
-  /// Assumes coordinate has convention (contiguous, strided)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-    return layout_(PitchLinearCoord(coord.column(), coord.row()));
-  }
-
-  /// Inverse of layout function, mapping linear offset to logical coordinate
-  CUTLASS_HOST_DEVICE
-  TensorCoord inverse(LongIndex offset) const {
-    PitchLinearCoord coord = layout_.inverse(offset);
-    return MatrixCoord(coord.strided(), coord.contiguous());
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const { return layout_.stride(); }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride &stride() { return layout_.stride(); }
-
-  /// Compute the number of contiguous elements needed to store a tensor with
-  /// the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &extent) const {
-    return layout_.capacity(PitchLinearCoord(extent.column(), extent.row()));
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Template based on element size (in bits) - defined in terms of pitch-linear memory.
-template <int ElementSize, int InterleavedK>
-struct TensorOpMultiplicandColumnMajorInterleaved {
-
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = PitchLinearCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, Index, LongIndex>;
-
-  //
-  // Invariants
-  //
-
-  /// This layout is optimized for 128b accesses
-  static int const kAccessSize = 128;
-
-  //
-  // Static constants
-  //
-
-  static int const kElementSize = ElementSize;
-  static int const kElementsPerAccess = kAccessSize / kElementSize;
-
-  //static int const kThreadBlockStrided = ThreadBlockStrided;
-  static int const kInterleavedK = InterleavedK;
-  
-private:
-
-  //
-  // Data members
-  //
-
-  /// Stride data member
-  Stride stride_;
-
-public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  TensorOpMultiplicandColumnMajorInterleaved(Index ldm = 0): stride_(ldm) { }
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  TensorOpMultiplicandColumnMajorInterleaved(Stride stride): stride_(stride) { }
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static TensorOpMultiplicandColumnMajorInterleaved packed(TensorCoord const &extent) {
-    return TensorOpMultiplicandColumnMajorInterleaved(extent[0] * kInterleavedK);
-  }
-
-  /// Returns the offset of a coordinate in linear memory. 
-  /// Assumes coordinate has convention (contiguous, strided)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-    int const rows_per_smem_cache_line = 128 / kInterleavedK;
-
-    int row_id = coord.strided() / rows_per_smem_cache_line;
-    int col_id = (coord.strided() % rows_per_smem_cache_line) * kInterleavedK + coord.contiguous();
-
-    int access_block_id = col_id >> 4;
-    int swizzle_access_block_id = access_block_id ^ (row_id & 1);
-
-    int swizzle_col_id = swizzle_access_block_id << 4;
-
-    return row_id * 128 + swizzle_col_id;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const {
-    return stride_;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride & stride() {
-    return stride_;
-  }
-
-  /// Compute the number of contiguous elements needed to store a tensor with the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &extent) const {
-    return (extent[1] / kInterleavedK) * stride_[0];
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Template based on element size (in bits) - defined in terms of pitch-linear memory.
-template <int ElementSize, int InterleavedK>
-struct TensorOpMultiplicandRowMajorInterleaved {
-
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = PitchLinearCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, Index, LongIndex>;
-
-  //
-  // Invariants
-  //
-
-  /// This layout is optimized for 128b accesses
-  static int const kAccessSize = 128;
-
-  //
-  // Static constants
-  //
-
-  static int const kElementSize = ElementSize;
-  static int const kElementsPerAccess = kAccessSize / kElementSize;
-
-  //static int const kThreadBlockStrided = ThreadBlockStrided;
-  static int const kInterleavedK = InterleavedK;
-  
-private:
-
-  //
-  // Data members
-  //
-
-  /// Stride data member
-  Stride stride_;
-
-public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  TensorOpMultiplicandRowMajorInterleaved(Index ldm = 0): stride_(ldm) { }
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  TensorOpMultiplicandRowMajorInterleaved(Stride stride): stride_(stride) { }
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static TensorOpMultiplicandRowMajorInterleaved packed(TensorCoord const &extent) {
-    return TensorOpMultiplicandRowMajorInterleaved(extent[1] * kInterleavedK);
-  }
-
-  /// Returns the offset of a coordinate in linear memory. 
-  /// Assumes coordinate has convention (contiguous, strided)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-    int const rows_per_smem_cache_line = 128 / kInterleavedK;
-
-    int row_id = coord.strided() / rows_per_smem_cache_line;
-    int col_id = (coord.strided() % rows_per_smem_cache_line) * kInterleavedK + coord.contiguous();
-
-    int access_block_id = col_id >> 4;
-    int swizzle_access_block_id = access_block_id ^ (row_id & 1);
-
-    int swizzle_col_id = swizzle_access_block_id << 4;
-
-    return row_id * 128 + swizzle_col_id;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const {
-    return stride_;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride & stride() {
-    return stride_;
-  }
-
-  /// Compute the number of contiguous elements needed to store a tensor with the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &extent) const {
-    return (extent[0] / kInterleavedK) * stride_[0];
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace layout
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/layout/tensor_op_multiplicand_sm80.h b/lightllm-kernel/cutlass/include/cutlass/layout/tensor_op_multiplicand_sm80.h
deleted file mode 100755
index 15d528399..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/layout/tensor_op_multiplicand_sm80.h
+++ /dev/null
@@ -1,1139 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief layouts needed by Ampere fp64 tensor core kernels.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor_op_multiplicand_sm75.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace layout {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Template based on element size (in bits) - defined in terms of pitch-linear
-/// memory and Crosswise size (in elements).
-struct TensorOpMultiplicandCongruous64b {
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = PitchLinearCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, Index, LongIndex>;
-
-  //
-  // Static constants
-  //
-
-  static int const kElementSize = 64;
-  static int const kElementsPerAccess = 1;
-
- private:
-
-  //
-  // Data members
-  //
-
-  /// Stride data member.
-  Stride stride_;
-
- public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  TensorOpMultiplicandCongruous64b(Index ldm = 0) : stride_(ldm) {}
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  TensorOpMultiplicandCongruous64b(Stride stride) : stride_(stride) {}
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static TensorOpMultiplicandCongruous64b packed(TensorCoord const &extent) {
-    return TensorOpMultiplicandCongruous64b(extent[0]);
-  }
-
-  /// Returns the offset of a coordinate in linear memory.
-  /// Assumes coordinate has convention (contiguous, strided)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-
-    int tc = coord.contiguous() / 16;
-    int ts = coord.strided() / 4;
-
-    int c = coord.contiguous() % 16;
-    int s = coord.strided() % 4;
-
-
-    int bank = ((((c & 1) * 4 + (c & 6) / 2)) ^ (s & 1)) * 2 + (c / 8);
-    int row = (c & 6) / 2;
-
-    bank ^= ((s & 2) * 2);
-
-    LongIndex offset = tc * 16 + bank + (ts * 4 + row) * stride_[0];
-
-    return offset;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const { return stride_; }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride &stride() { return stride_; }
-
-  /// Compute the number of contiguous elements needed to store a tensor with
-  /// the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &extent) const {
-    return extent[1] * stride_[0];
-  }
-
-  CUTLASS_HOST_DEVICE
-  TensorCoord inverse(LongIndex offset) const {
-    return TensorCoord();
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Template mapping a column-major view of pitch-linear memory to
-/// TensorOpMultiplicand
-struct ColumnMajorTensorOpMultiplicandCongruous64b {
-
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = MatrixCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, Index, LongIndex>;
-
-  //
-  // Invariants
-  //
-
-  using Base = TensorOpMultiplicandCongruous64b;
-
-private:
-
-  //
-  // Data members
-  //
-
-  Base layout_;
-
-public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  ColumnMajorTensorOpMultiplicandCongruous64b(Index ldm = 0): layout_(ldm) { }
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  ColumnMajorTensorOpMultiplicandCongruous64b(Stride stride): layout_(stride) { }
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static ColumnMajorTensorOpMultiplicandCongruous64b packed(TensorCoord const &extent) {
-    return ColumnMajorTensorOpMultiplicandCongruous64b(extent.row());
-  }
-
-  /// Returns the offset of a coordinate in linear memory. 
-  /// Assumes coordinate has convention (contiguous, strided)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-    return layout_(PitchLinearCoord(coord.row(), coord.column()));
-  }
-
-  /// Inverse of layout function, mapping linear offset to logical coordinate
-  CUTLASS_HOST_DEVICE
-  TensorCoord inverse(LongIndex offset) const {
-    PitchLinearCoord coord = layout_.inverse(offset);
-    return MatrixCoord(coord.contiguous(), coord.strided());    
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const {
-    return layout_.stride();
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride & stride() {
-    return layout_.stride();
-  }
-
-  /// Compute the number of contiguous elements needed to store a tensor with the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &extent) const {
-    return layout_.capacity(PitchLinearCoord(extent.row(), extent.column()));
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Template mapping a row-major view of pitch-linear memory to
-/// TensorOpMultiplicand
-struct RowMajorTensorOpMultiplicandCongruous64b {
-
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = MatrixCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, Index, LongIndex>;
-
-  //
-  // Invariants
-  //
-
-  using Base = TensorOpMultiplicandCongruous64b;
-
-private:
-
-  //
-  // Data members
-  //
-
-  Base layout_;
-
-public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  RowMajorTensorOpMultiplicandCongruous64b(Index ldm = 0): layout_(ldm) { }
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  RowMajorTensorOpMultiplicandCongruous64b(Stride stride): layout_(stride) { }
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static RowMajorTensorOpMultiplicandCongruous64b packed(TensorCoord const &extent) {
-    return RowMajorTensorOpMultiplicandCongruous64b(extent.column());
-  }
-
-  /// Returns the offset of a coordinate in linear memory. 
-  /// Assumes coordinate has convention (contiguous, strided)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-    return layout_(PitchLinearCoord(coord.column(), coord.row()));
-  }
-
-  /// Inverse of layout function, mapping linear offset to logical coordinate
-  CUTLASS_HOST_DEVICE
-  TensorCoord inverse(LongIndex offset) const {
-    PitchLinearCoord coord = layout_.inverse(offset);
-    return MatrixCoord(coord.strided(), coord.contiguous());
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const {
-    return layout_.stride();
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride & stride() {
-    return layout_.stride();
-  }
-
-  /// Compute the number of contiguous elements needed to store a tensor with the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &extent) const {
-    return layout_.capacity(PitchLinearCoord(extent.column(), extent.row()));
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Template based on element size (in bits) - defined in terms of pitch-linear
-/// memory and Crosswise size (in elements).
-struct TensorOpMultiplicand64bCrosswise {
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = PitchLinearCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, Index, LongIndex>;
-
-  //
-  // Static constants
-  //
-
-  static int const kElementSize = 64;
-  static int const kElementsPerAccess = 1;
-
- private:
-
-  //
-  // Data members
-  //
-
-  /// Stride data member.
-  Stride stride_;
-
- public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  TensorOpMultiplicand64bCrosswise(Index ldm = 0) : stride_(ldm) {}
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  TensorOpMultiplicand64bCrosswise(Stride stride) : stride_(stride) {}
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static TensorOpMultiplicand64bCrosswise packed(TensorCoord const &extent) {
-    return TensorOpMultiplicand64bCrosswise(extent[0]);
-  }
-
-  /// Returns the offset of a coordinate in linear memory.
-  /// Assumes coordinate has convention (contiguous, strided)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-
-    int tc = coord.contiguous() / 16;
-    int ts = coord.strided() / 16;
-
-    int c = coord.contiguous() % 16;
-    int s = coord.strided() % 16;
-
-    int k_group = c / 4;
-    int access_s = s / 2;
-
-    int row = access_s % 4;
-    int bank = ((k_group & 2) << 2) ^ ((s % 2) << 3) + (c % 4) * 2 + (access_s / 4) ^ (k_group & 1);
-
-    int smem_row = (k_group * 4 + row) + tc * 16;
-    int smem_col = ts * 16 + bank;
-
-    LongIndex offset = smem_row * stride_[0] + smem_col;
-
-    return offset;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const { return stride_; }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride &stride() { return stride_; }
-
-  /// Compute the number of contiguous elements needed to store a tensor with
-  /// the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &extent) const {
-    return extent[1] * stride_[0];
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Template based on element size (in bits) - defined in terms of pitch-linear
-/// memory and Crosswise size (in elements).
-struct ColumnMajorTensorOpMultiplicand64bCrosswise {
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = MatrixCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, Index, LongIndex>;
-
-  //
-  // Invariants
-  //
-
-  using Base = TensorOpMultiplicand64bCrosswise;
-
-private:
-
-  //
-  // Data members
-  //
-
-  Base layout_;
-
-public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  ColumnMajorTensorOpMultiplicand64bCrosswise(Index ldm = 0): layout_(ldm) { }
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  ColumnMajorTensorOpMultiplicand64bCrosswise(Stride stride): layout_(stride) { }
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static ColumnMajorTensorOpMultiplicand64bCrosswise packed(TensorCoord const &extent) {
-    return ColumnMajorTensorOpMultiplicand64bCrosswise(extent.column());
-  }
-
-  /// Returns the offset of a coordinate in linear memory. 
-  /// Assumes coordinate has convention (contiguous, strided)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-    return layout_(PitchLinearCoord(coord.row(), coord.column()));
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const {
-    return layout_.stride();
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride & stride() {
-    return layout_.stride();
-  }
-
-  /// Compute the number of contiguous elements needed to store a tensor with the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &extent) const {
-    return layout_.capacity(PitchLinearCoord(extent.row(), extent.column()));
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Template based on element size (in bits) - defined in terms of pitch-linear
-/// memory and Crosswise size (in elements).
-struct RowMajorTensorOpMultiplicand64bCrosswise {
-
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = MatrixCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, Index, LongIndex>;
-
-  //
-  // Invariants
-  //
-
-  using Base = TensorOpMultiplicand64bCrosswise;
-
-private:
-
-  //
-  // Data members
-  //
-
-  Base layout_;
-
-public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  RowMajorTensorOpMultiplicand64bCrosswise(Index ldm = 0): layout_(ldm) { }
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  RowMajorTensorOpMultiplicand64bCrosswise(Stride stride): layout_(stride) { }
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static RowMajorTensorOpMultiplicand64bCrosswise packed(TensorCoord const &extent) {
-    return RowMajorTensorOpMultiplicand64bCrosswise(extent.row());
-  }
-
-  /// Returns the offset of a coordinate in linear memory. 
-  /// Assumes coordinate has convention (contiguous, strided)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-    return layout_(PitchLinearCoord(coord.column(), coord.row()));
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const {
-    return layout_.stride();
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride & stride() {
-    return layout_.stride();
-  }
-
-  /// Compute the number of contiguous elements needed to store a tensor with the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &extent) const {
-    return layout_.capacity(PitchLinearCoord(extent.column(), extent.row()));
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Template based on element size (in bits) - defined in terms of pitch-linear
-/// memory and Crosswise size (in elements).
-struct TensorOpMultiplicandCongruous128b {
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = PitchLinearCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, Index, LongIndex>;
-
-  //
-  // Static constants
-  //
-
-  static int const kElementSize = 128;
-  static int const kElementsPerAccess = 1;
-
- private:
-
-  //
-  // Data members
-  //
-
-  /// Stride data member.
-  Stride stride_;
-
- public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  TensorOpMultiplicandCongruous128b(Index ldm = 0) : stride_(ldm) {}
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  TensorOpMultiplicandCongruous128b(Stride stride) : stride_(stride) {}
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static TensorOpMultiplicandCongruous128b packed(TensorCoord const &extent) {
-    return TensorOpMultiplicandCongruous128b(extent[0]);
-  }
-
-  /// Returns the offset of a coordinate in linear memory.
-  /// Assumes coordinate has convention (contiguous, strided)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-
-    Index tc = coord.contiguous() / 8;
-    Index ts = coord.strided() / 4;
-
-    Index c = coord.contiguous() % 8;
-    Index s = coord.strided() % 4;
-
-    Index k_index = (c / 2);
-
-    Index bank = (((c & 1) * 4) | (s ^ k_index));
-
-    LongIndex offset = tc * 8 + bank + (ts * 4 + k_index) * stride_[0];
-
-    return offset;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const { return stride_; }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride &stride() { return stride_; }
-
-  /// Compute the number of contiguous elements needed to store a tensor with
-  /// the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &extent) const {
-    return extent[1] * stride_[0];
-  }
-
-  /// Inverse of layout function, mapping linear offset to logical coordinate
-  CUTLASS_HOST_DEVICE
-  TensorCoord inverse(LongIndex offset) const {
-    return TensorCoord();   
-  }
-};
-
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Template mapping a column-major view of pitch-linear memory to
-/// TensorOpMultiplicand
-struct ColumnMajorTensorOpMultiplicandCongruous128b {
-
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = MatrixCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, Index, LongIndex>;
-
-  //
-  // Invariants
-  //
-
-  using Base = TensorOpMultiplicandCongruous128b;
-
-private:
-
-  //
-  // Data members
-  //
-
-  Base layout_;
-
-public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  ColumnMajorTensorOpMultiplicandCongruous128b(Index ldm = 0): layout_(ldm) { }
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  ColumnMajorTensorOpMultiplicandCongruous128b(Stride stride): layout_(stride) { }
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static ColumnMajorTensorOpMultiplicandCongruous128b packed(TensorCoord const &extent) {
-    return ColumnMajorTensorOpMultiplicandCongruous128b(extent.row());
-  }
-
-  /// Returns the offset of a coordinate in linear memory. 
-  /// Assumes coordinate has convention (contiguous, strided)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-    return layout_(PitchLinearCoord(coord.row(), coord.column()));
-  }
-
-  /// Inverse of layout function, mapping linear offset to logical coordinate
-  CUTLASS_HOST_DEVICE
-  TensorCoord inverse(LongIndex offset) const {
-    PitchLinearCoord coord = layout_.inverse(offset);
-    return MatrixCoord(coord.contiguous(), coord.strided());    
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const {
-    return layout_.stride();
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride & stride() {
-    return layout_.stride();
-  }
-
-  /// Compute the number of contiguous elements needed to store a tensor with the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &extent) const {
-    return layout_.capacity(PitchLinearCoord(extent.row(), extent.column()));
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Template mapping a row-major view of pitch-linear memory to
-/// TensorOpMultiplicand
-struct RowMajorTensorOpMultiplicandCongruous128b {
-
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = MatrixCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, Index, LongIndex>;
-
-  //
-  // Invariants
-  //
-
-  using Base = TensorOpMultiplicandCongruous128b;
-
-private:
-
-  //
-  // Data members
-  //
-
-  Base layout_;
-
-public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  RowMajorTensorOpMultiplicandCongruous128b(Index ldm = 0): layout_(ldm) { }
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  RowMajorTensorOpMultiplicandCongruous128b(Stride stride): layout_(stride) { }
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static RowMajorTensorOpMultiplicandCongruous128b packed(TensorCoord const &extent) {
-    return RowMajorTensorOpMultiplicandCongruous128b(extent.column());
-  }
-
-  /// Returns the offset of a coordinate in linear memory. 
-  /// Assumes coordinate has convention (contiguous, strided)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-    return layout_(PitchLinearCoord(coord.column(), coord.row()));
-  }
-
-  /// Inverse of layout function, mapping linear offset to logical coordinate
-  CUTLASS_HOST_DEVICE
-  TensorCoord inverse(LongIndex offset) const {
-    PitchLinearCoord coord = layout_.inverse(offset);
-    return MatrixCoord(coord.strided(), coord.contiguous());
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const {
-    return layout_.stride();
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride & stride() {
-    return layout_.stride();
-  }
-
-  /// Compute the number of contiguous elements needed to store a tensor with the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &extent) const {
-    return layout_.capacity(PitchLinearCoord(extent.column(), extent.row()));
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Template based on element size (in bits) - defined in terms of pitch-linear
-/// memory and Crosswise size (in elements).
-struct TensorOpMultiplicandCrosswise128x4 {
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = PitchLinearCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, Index, LongIndex>;
-
-  //
-  // Static constants
-  //
-
-  static int const kElementSize = 128;
-  static int const kElementsPerAccess = 1;
-
- private:
-
-  //
-  // Data members
-  //
-
-  /// Stride data member.
-  Stride stride_;
-
- public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  TensorOpMultiplicandCrosswise128x4(Index ldm = 0) : stride_(ldm) {}
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  TensorOpMultiplicandCrosswise128x4(Stride stride) : stride_(stride) {}
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static TensorOpMultiplicandCrosswise128x4 packed(TensorCoord const &extent) {
-    return TensorOpMultiplicandCrosswise128x4(extent[0]);
-  }
-
-  /// Returns the offset of a coordinate in linear memory.
-  /// Assumes coordinate has convention (contiguous, strided)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-
-    Index tc = coord.contiguous() / 8;
-    Index ts = coord.strided() / 8;
-
-    Index c = coord.contiguous() % 8;
-    Index s = coord.strided() % 8;
-
-    Index liq = c % 4;
-
-    Index bank = liq + ((s & 1) * 4) ^ (c & 4);
-
-    Index k_index = (c & 4) + (s / 4) * 2 + ((s & 2) / 2);
-
-    LongIndex offset = (tc * 8 + k_index) * stride_[0] + ts * 8 + bank;
-
-    return offset;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const { return stride_; }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride &stride() { return stride_; }
-
-  /// Compute the number of contiguous elements needed to store a tensor with
-  /// the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &extent) const {
-    return extent[1] * stride_[0];
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Template mapping a column-major view of pitch-linear memory to
-/// TensorOpMultiplicand
-struct ColumnMajorTensorOpMultiplicandCrosswise128x4 {
-
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = MatrixCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, Index, LongIndex>;
-
-  //
-  // Invariants
-  //
-
-  using Base = TensorOpMultiplicandCrosswise128x4;
-
-private:
-
-  //
-  // Data members
-  //
-
-  Base layout_;
-
-public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  ColumnMajorTensorOpMultiplicandCrosswise128x4(Index ldm = 0): layout_(ldm) { }
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  ColumnMajorTensorOpMultiplicandCrosswise128x4(Stride stride): layout_(stride) { }
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static ColumnMajorTensorOpMultiplicandCrosswise128x4 packed(TensorCoord const &extent) {
-    return ColumnMajorTensorOpMultiplicandCrosswise128x4(extent.column());
-  }
-
-  /// Returns the offset of a coordinate in linear memory. 
-  /// Assumes coordinate has convention (contiguous, strided)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-    return layout_(PitchLinearCoord(coord.row(), coord.column()));
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const {
-    return layout_.stride();
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride & stride() {
-    return layout_.stride();
-  }
-
-  /// Compute the number of contiguous elements needed to store a tensor with the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &extent) const {
-    return layout_.capacity(PitchLinearCoord(extent.row(), extent.column()));
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Template mapping a row-major view of pitch-linear memory to
-/// TensorOpMultiplicand
-struct RowMajorTensorOpMultiplicandCrosswise128x4 {
-
-  /// Logical rank of tensor
-  static int const kRank = 2;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = MatrixCoord;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, Index, LongIndex>;
-
-  //
-  // Invariants
-  //
-
-  using Base = TensorOpMultiplicandCrosswise128x4;
-
-private:
-
-  //
-  // Data members
-  //
-
-  Base layout_;
-
-public:
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  RowMajorTensorOpMultiplicandCrosswise128x4(Index ldm = 0): layout_(ldm) { }
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  RowMajorTensorOpMultiplicandCrosswise128x4(Stride stride): layout_(stride) { }
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static RowMajorTensorOpMultiplicandCrosswise128x4 packed(TensorCoord const &extent) {
-    return RowMajorTensorOpMultiplicandCrosswise128x4(extent.row());
-  }
-
-  /// Returns the offset of a coordinate in linear memory. 
-  /// Assumes coordinate has convention (contiguous, strided)
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-    return layout_(PitchLinearCoord(coord.column(), coord.row()));
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const {
-    return layout_.stride();
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride & stride() {
-    return layout_.stride();
-  }
-
-  /// Compute the number of contiguous elements needed to store a tensor with the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &extent) const {
-    return layout_.capacity(PitchLinearCoord(extent.column(), extent.row()));
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace layout
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/layout/vector.h b/lightllm-kernel/cutlass/include/cutlass/layout/vector.h
deleted file mode 100755
index 56506feab..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/layout/vector.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines layout functions used for rank=1 vectors.
-*/
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/coord.h"
-
-namespace cutlass {
-namespace layout {
-
-/// Tensor layout for densely packed vectors.
-class PackedVectorLayout {
-public:
-  /// Logical rank of tensor
-  static int const kRank = 1;
-
-  /// Rank of stride vector
-  static int const kStrideRank = 1;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = Coord<kRank, Index>;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, Index>;
-
-private:
-
-  //
-  // No actual stride vector stored
-  //
-
-public:
-
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  PackedVectorLayout() { }
-
-  /// Helper returns a layout to a tightly packed tensor
-  CUTLASS_HOST_DEVICE
-  static PackedVectorLayout packed(TensorCoord const &size) {
-    CUTLASS_UNUSED(size);
-    return PackedVectorLayout();
-  }
-
-  /// Returns the offset of a coordinate in linear memory
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(TensorCoord const &coord) const {
-    return coord[0];
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const {
-    return make_Coord(1);
-  }
-
-  /// Compute the number of contiguous elements needed to store a tensor with the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &size) const {
-    return size[0];
-  }
-};
-
-} // namespace layout
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/matrix.h b/lightllm-kernel/cutlass/include/cutlass/matrix.h
deleted file mode 100755
index 5d8ccb3c1..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/matrix.h
+++ /dev/null
@@ -1,14129 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*  
-  \file
-  \brief Matrix classes with value semantics.
-*/
-
-#pragma once
-
-#if !defined(__CUDACC_RTC__)
-#include <iosfwd>
-#include <cmath>
-#endif
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/layout/matrix.h"
-
-namespace cutlass {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Primary template with partial specializations to follow
-template <typename Element, int Rows, int Columns> struct Matrix;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// 1-by-2 matrix template class definition
-template <typename Element_>
-struct Matrix<Element_, 1, 2> {
-
-  //
-  // Type definitions
-  //
-
-  /// Element data type
-  using Element = Element_;
-
-  /// Number of rows in matrix
-  static int const kRows = 1;
-
-  /// Number of columns in matrix
-  static int const kColumns = 2;
-
-  /// Layout of matrix in underlying array
-  using Layout = layout::RowMajor;
-
-  /// Number of elements in matrix
-  static int const kCount = 2;
-
-  //
-  // Data members
-  //
-
-  /// Elements of the matrix in row-major layout
-  Array<Element, kCount> data;
-
-  //
-  // Methods
-  //
-
-  /// Constructs a zero matrix
-  CUTLASS_HOST_DEVICE
-  Matrix() {
-    data.clear();
-  }
-  
-  /// Copy constructor for a 1-by-2 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix(Matrix const &rhs) {
-    data = rhs.data;
-  }
-    
-  /// Constucts a 1-by-2 matrix from scalar elements
-  CUTLASS_HOST_DEVICE
-  Matrix(
-    Element _0_0, Element _0_1
-  ) {
-
-    data[0] = _0_0;  data[1] = _0_1;
-  }
-    
-  /// Constructs a matrix from a uniform element
-  CUTLASS_HOST_DEVICE
-  static Matrix uniform(Element s) {
-    Matrix m;
-    
-    m.data[0] = s;
-    m.data[1] = s;
-
-    return m;
-  }
-
-  /// Constructs a matrix from a uniform element 1
-  CUTLASS_HOST_DEVICE
-  static Matrix ones() {
-    return uniform(Element(1));
-  }
-
-  /// Constructs a matrix from a uniform element 0
-  CUTLASS_HOST_DEVICE
-  static Matrix zero() {
-    return Matrix();
-  }
-  
-  /// Returns a transposed matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 1> transpose() const {
-    Matrix<Element, 2, 1> mt;
-    
-    mt.data[0] = data[0];
-    mt.data[1] = data[1];
-
-    return mt;
-  }
-    
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element at(int i, int j) const {
-    return data[i * 1 + j];
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element & at(int i, int j) {
-    return data[i * 1 + j];
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element at(Coord<2> const &coord) const {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element & at(Coord<2> const &coord) {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element &at(int offset) {
-    return data[offset];
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element at(int offset) const {
-    return data[offset];
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element operator[](Coord<2> const &coord) const {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element & operator[](Coord<2> const &coord) {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element & operator[](int offset) {
-    return data[offset];
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element operator[](int offset) const {
-    return data[offset];
-  }
-  
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 2> slice_1x2(int i = 0, int j = 0) const {
-    Matrix<Element, 1, 2> m;
-    
-    m.data[0] = data[i * 2 + j + 0];
-    m.data[1] = data[i * 2 + j + 1];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_1x2(Matrix<Element, 1, 2> const &m, int i = 0, int j = 0) {
-    
-    data[i * 2 + j + 0] = m.data[0];
-    data[i * 2 + j + 1] = m.data[1];
-
-    return *this;
-  }
-    
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 2> row(int i) const {
-    return slice_1x2(i, 0);
-  }
-
-  CUTLASS_HOST_DEVICE
-  Matrix &set_row(Matrix<Element, 1, 2> const &v, int i = 0) {
-    return set_slice_1x2(v, i, 0);
-  }
-    
-  /// Forms a 1-by-2 matrix by horizontally concatenating an Element with an Element
-  CUTLASS_HOST_DEVICE
-  static Matrix hcat(Element lhs, Element rhs) {
-    return Matrix(
-      lhs, rhs);
-  }
-  
-  /// Concatenates this matrix with a an Element to form a 1-by-3 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 3> hcat(Element rhs) const {
-    return Matrix<Element, 1, 3>::hcat(*this, rhs);
-  }
-    
-  /// Concatenates this matrix with a a 1-by-2 matrix to form a 1-by-4 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 4> hcat(Matrix<Element, 1, 2> const & rhs) const {
-    return Matrix<Element, 1, 4>::hcat(*this, rhs);
-  }
-    
-  /// Concatenates this matrix with a a 1-by-2 matrix to form a 2-by-2 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 2> vcat(Matrix<Element, 1, 2> const & rhs) const {
-    return Matrix<Element, 2, 2>::vcat(*this, rhs);
-  }
-    
-  /// Concatenates this matrix with a a 2-by-2 matrix to form a 3-by-2 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 2> vcat(Matrix<Element, 2, 2> const & rhs) const {
-    return Matrix<Element, 3, 2>::vcat(*this, rhs);
-  }
-    
-  /// Concatenates this matrix with a a 3-by-2 matrix to form a 4-by-2 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 2> vcat(Matrix<Element, 3, 2> const & rhs) const {
-    return Matrix<Element, 4, 2>::vcat(*this, rhs);
-  }
-    
-  /// Elementwise add operator (1-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix add(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] + rhs.data[0];
-    result.data[1] = data[1] + rhs.data[1];
-
-    return result;
-  }
-      
-  /// Elementwise add operator (1-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix operator +(Matrix const &rhs) const {
-    return add(rhs);
-  }
-
-  /// Elementwise add operator (1-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator +=(Matrix const &rhs) {
-    
-    data[0] += rhs.data[0];
-    data[1] += rhs.data[1];
-
-    return *this;
-  }
-        
-  /// Elementwise subtract operator (1-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix subtract(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] - rhs.data[0];
-    result.data[1] = data[1] - rhs.data[1];
-
-    return result;
-  }
-      
-  /// Elementwise subtract operator (1-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix operator -(Matrix const &rhs) const {
-    return subtract(rhs);
-  }
-
-  /// Elementwise subtract operator (1-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator -=(Matrix const &rhs) {
-    
-    data[0] -= rhs.data[0];
-    data[1] -= rhs.data[1];
-
-    return *this;
-  }
-        
-  /// Elementwise multiply operator (1-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix multiply(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] * rhs.data[0];
-    result.data[1] = data[1] * rhs.data[1];
-
-    return result;
-  }
-      
-  /// Scalar multiply operator (1-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix multiply(Element const &s) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] * s;
-    result.data[1] = data[1] * s;
-
-    return result;
-  }
-
-  /// Scalar multiply operator (1-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix operator *(Element const &s) const {
-    return multiply(s);
-  }
-
-  /// Scalar multiply operator (1-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator *=(Element const &s) {
-    
-    data[0] *= s;
-    data[1] *= s;
-
-    return *this;
-  }
-        
-  /// Elementwise divide operator (1-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix divide(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] / rhs.data[0];
-    result.data[1] = data[1] / rhs.data[1];
-
-    return result;
-  }
-      
-  /// Scalar divide operator (1-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix divide(Element const &s) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] / s;
-    result.data[1] = data[1] / s;
-
-    return result;
-  }
-
-  /// Scalar divide operator (1-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix operator /(Element const &s) const {
-    return divide(s);
-  }
-
-  /// Scalar divide operator (1-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator /=(Element const &s) {
-    
-    data[0] /= s;
-    data[1] /= s;
-
-    return *this;
-  }
-        
-  /// Elementwise divide operator (1-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix operator /(Matrix const &rhs) const {
-    return divide(rhs);
-  }
-
-  /// Elementwise divide operator (1-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator /=(Matrix const &rhs) {
-    
-    data[0] /= rhs.data[0];
-    data[1] /= rhs.data[1];
-
-    return *this;
-  }
-        
-  /// Negates each element of the matrix
-  CUTLASS_HOST_DEVICE
-  Matrix operator-() const {
-    Matrix m;
-    
-    m.data[0] = -m.data[0];
-    m.data[1] = -m.data[1];
-
-    return m;
-  }
-  
-  /// Matrix product of size 1-by-1-by-2
-  CUTLASS_HOST_DEVICE
-  Element product(Matrix<Element, 2, 1> const &rhs, Element accum = Element()) const {
-    
-    // k=0
-    accum += data[0] * rhs.data[0];
-
-    // k=1
-    accum += data[1] * rhs.data[1];
-
-    return accum;
-  }
-
-  /// Matrix product of size 1-by-1-by-2
-  CUTLASS_HOST_DEVICE
-  Element operator*(Matrix<Element, 2, 1> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 1-by-2-by-2
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 2> product(
-    Matrix<Element, 2, 2> const &rhs,
-    Matrix<Element, 1, 2> accum = Matrix<Element, 1, 2>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[0] * rhs.data[1];
-
-    // k=1
-    accum.data[0] += data[1] * rhs.data[2];
-    accum.data[1] += data[1] * rhs.data[3];
-
-    return accum;
-  }
-
-  /// Matrix product of size 1-by-2-by-2
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 2> operator*(Matrix<Element, 2, 2> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 1-by-2-by-2
-  CUTLASS_HOST_DEVICE
-  Matrix & operator*=(Matrix<Element, 2, 2> const &rhs) {
-    *this = product(rhs);
-    return *this;
-  }
-    
-  /// Matrix product of size 1-by-3-by-2
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 3> product(
-    Matrix<Element, 2, 3> const &rhs,
-    Matrix<Element, 1, 3> accum = Matrix<Element, 1, 3>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[0] * rhs.data[1];
-    accum.data[2] += data[0] * rhs.data[2];
-
-    // k=1
-    accum.data[0] += data[1] * rhs.data[3];
-    accum.data[1] += data[1] * rhs.data[4];
-    accum.data[2] += data[1] * rhs.data[5];
-
-    return accum;
-  }
-
-  /// Matrix product of size 1-by-3-by-2
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 3> operator*(Matrix<Element, 2, 3> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 1-by-4-by-2
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 4> product(
-    Matrix<Element, 2, 4> const &rhs,
-    Matrix<Element, 1, 4> accum = Matrix<Element, 1, 4>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[0] * rhs.data[1];
-    accum.data[2] += data[0] * rhs.data[2];
-    accum.data[3] += data[0] * rhs.data[3];
-
-    // k=1
-    accum.data[0] += data[1] * rhs.data[4];
-    accum.data[1] += data[1] * rhs.data[5];
-    accum.data[2] += data[1] * rhs.data[6];
-    accum.data[3] += data[1] * rhs.data[7];
-
-    return accum;
-  }
-
-  /// Matrix product of size 1-by-4-by-2
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 4> operator*(Matrix<Element, 2, 4> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Dot product of vectors with extent 2
-  CUTLASS_HOST_DEVICE
-  Element dot(Matrix<Element, 2, 1> const &rhs, Element accum = Element()) const {
-    
-    accum += data[0] * rhs.data[0];
-    accum += data[1] * rhs.data[1];
-    return accum;
-  }
-
-  /// Dot product of vectors with extent 2
-  CUTLASS_HOST_DEVICE
-  Element dot(Matrix<Element, 1, 2> const &rhs, Element accum = Element()) const {
-    
-    accum += data[0] * rhs.data[0];
-    accum += data[1] * rhs.data[1];
-    return accum;
-  }
-  
-  /// Returns the sum of elements
-  CUTLASS_HOST_DEVICE
-  Element sum(Element accum = Element()) const {
-    
-    accum += data[0];
-    accum += data[1];
-
-    return accum;
-  }  
-
-  /// Returns the sum of squared elements
-  CUTLASS_HOST_DEVICE
-  Element norm(Element accum = Element()) const {
-    
-    accum += data[0] * data[0];
-    accum += data[1] * data[1];
-
-    return accum;
-  }
-
-  /// Returns square root of the norm
-  CUTLASS_HOST_DEVICE
-  Element magnitude() const {
-    return fast_sqrt(norm());
-  }
-
-  /// Returns the sum of diagonal elements
-  CUTLASS_HOST_DEVICE
-  Element trace(Element accum = Element()) const {
-    
-    accum += data[0];
-
-    return accum;
-  }
-    
-};
-
-/// Template alias for 1-by-2 matrix
-template <typename Element>
-using Matrix1x2 = Matrix<Element, 1, 2>;
-
-
-/// Free funciton to infer element type from template arguments
-template <typename Element>
-CUTLASS_HOST_DEVICE Matrix1x2<Element> make_Matrix1x2(
-    Element _0_0, Element _0_1
-) {
-  return Matrix1x2<Element>(
-  _0_0, _0_1 
-  );
-}
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// 1-by-3 matrix template class definition
-template <typename Element_>
-struct Matrix<Element_, 1, 3> {
-
-  //
-  // Type definitions
-  //
-
-  /// Element data type
-  using Element = Element_;
-
-  /// Number of rows in matrix
-  static int const kRows = 1;
-
-  /// Number of columns in matrix
-  static int const kColumns = 3;
-
-  /// Layout of matrix in underlying array
-  using Layout = layout::RowMajor;
-
-  /// Number of elements in matrix
-  static int const kCount = 3;
-
-  //
-  // Data members
-  //
-
-  /// Elements of the matrix in row-major layout
-  Array<Element, kCount> data;
-
-  //
-  // Methods
-  //
-
-  /// Constructs a zero matrix
-  CUTLASS_HOST_DEVICE
-  Matrix() {
-    data.clear();
-  }
-  
-  /// Copy constructor for a 1-by-3 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix(Matrix const &rhs) {
-    data = rhs.data;
-  }
-    
-  /// Constucts a 1-by-3 matrix from scalar elements
-  CUTLASS_HOST_DEVICE
-  Matrix(
-    Element _0_0, Element _0_1, Element _0_2
-  ) {
-
-    data[0] = _0_0;  data[1] = _0_1;  data[2] = _0_2;
-  }
-    
-  /// Constructs a matrix from a uniform element
-  CUTLASS_HOST_DEVICE
-  static Matrix uniform(Element s) {
-    Matrix m;
-    
-    m.data[0] = s;
-    m.data[1] = s;
-    m.data[2] = s;
-
-    return m;
-  }
-
-  /// Constructs a matrix from a uniform element 1
-  CUTLASS_HOST_DEVICE
-  static Matrix ones() {
-    return uniform(Element(1));
-  }
-
-  /// Constructs a matrix from a uniform element 0
-  CUTLASS_HOST_DEVICE
-  static Matrix zero() {
-    return Matrix();
-  }
-  
-  /// Returns a transposed matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 1> transpose() const {
-    Matrix<Element, 3, 1> mt;
-    
-    mt.data[0] = data[0];
-    mt.data[1] = data[1];
-    mt.data[2] = data[2];
-
-    return mt;
-  }
-    
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element at(int i, int j) const {
-    return data[i * 1 + j];
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element & at(int i, int j) {
-    return data[i * 1 + j];
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element at(Coord<2> const &coord) const {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element & at(Coord<2> const &coord) {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element &at(int offset) {
-    return data[offset];
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element at(int offset) const {
-    return data[offset];
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element operator[](Coord<2> const &coord) const {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element & operator[](Coord<2> const &coord) {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element & operator[](int offset) {
-    return data[offset];
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element operator[](int offset) const {
-    return data[offset];
-  }
-  
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 2> slice_1x2(int i = 0, int j = 0) const {
-    Matrix<Element, 1, 2> m;
-    
-    m.data[0] = data[i * 3 + j + 0];
-    m.data[1] = data[i * 3 + j + 1];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_1x2(Matrix<Element, 1, 2> const &m, int i = 0, int j = 0) {
-    
-    data[i * 3 + j + 0] = m.data[0];
-    data[i * 3 + j + 1] = m.data[1];
-
-    return *this;
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 3> slice_1x3(int i = 0, int j = 0) const {
-    Matrix<Element, 1, 3> m;
-    
-    m.data[0] = data[i * 3 + j + 0];
-    m.data[1] = data[i * 3 + j + 1];
-    m.data[2] = data[i * 3 + j + 2];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_1x3(Matrix<Element, 1, 3> const &m, int i = 0, int j = 0) {
-    
-    data[i * 3 + j + 0] = m.data[0];
-    data[i * 3 + j + 1] = m.data[1];
-    data[i * 3 + j + 2] = m.data[2];
-
-    return *this;
-  }
-    
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 3> row(int i) const {
-    return slice_1x3(i, 0);
-  }
-
-  CUTLASS_HOST_DEVICE
-  Matrix &set_row(Matrix<Element, 1, 3> const &v, int i = 0) {
-    return set_slice_1x3(v, i, 0);
-  }
-    
-  /// Forms a 1-by-3 matrix by horizontally concatenating an Element with a 1-by-2 matrix
-  CUTLASS_HOST_DEVICE
-  static Matrix hcat(Element lhs, Matrix<Element, 1, 2> const & rhs) {
-    return Matrix(
-      lhs, rhs.at(0, 0), rhs.at(0, 1));
-  }
-  
-  /// Forms a 1-by-3 matrix by horizontally concatenating a 1-by-2 matrix with an Element
-  CUTLASS_HOST_DEVICE
-  static Matrix hcat(Matrix<Element, 1, 2> const & lhs, Element rhs) {
-    return Matrix(
-      lhs.at(0, 0), lhs.at(0, 1), rhs);
-  }
-  
-  /// Concatenates this matrix with a an Element to form a 1-by-4 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 4> hcat(Element rhs) const {
-    return Matrix<Element, 1, 4>::hcat(*this, rhs);
-  }
-    
-  /// Concatenates this matrix with a a 1-by-3 matrix to form a 2-by-3 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 3> vcat(Matrix<Element, 1, 3> const & rhs) const {
-    return Matrix<Element, 2, 3>::vcat(*this, rhs);
-  }
-    
-  /// Concatenates this matrix with a a 2-by-3 matrix to form a 3-by-3 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 3> vcat(Matrix<Element, 2, 3> const & rhs) const {
-    return Matrix<Element, 3, 3>::vcat(*this, rhs);
-  }
-    
-  /// Concatenates this matrix with a a 3-by-3 matrix to form a 4-by-3 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 3> vcat(Matrix<Element, 3, 3> const & rhs) const {
-    return Matrix<Element, 4, 3>::vcat(*this, rhs);
-  }
-    
-  /// Elementwise add operator (1-by-3)
-  CUTLASS_HOST_DEVICE
-  Matrix add(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] + rhs.data[0];
-    result.data[1] = data[1] + rhs.data[1];
-    result.data[2] = data[2] + rhs.data[2];
-
-    return result;
-  }
-      
-  /// Elementwise add operator (1-by-3)
-  CUTLASS_HOST_DEVICE
-  Matrix operator +(Matrix const &rhs) const {
-    return add(rhs);
-  }
-
-  /// Elementwise add operator (1-by-3)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator +=(Matrix const &rhs) {
-    
-    data[0] += rhs.data[0];
-    data[1] += rhs.data[1];
-    data[2] += rhs.data[2];
-
-    return *this;
-  }
-        
-  /// Elementwise subtract operator (1-by-3)
-  CUTLASS_HOST_DEVICE
-  Matrix subtract(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] - rhs.data[0];
-    result.data[1] = data[1] - rhs.data[1];
-    result.data[2] = data[2] - rhs.data[2];
-
-    return result;
-  }
-      
-  /// Elementwise subtract operator (1-by-3)
-  CUTLASS_HOST_DEVICE
-  Matrix operator -(Matrix const &rhs) const {
-    return subtract(rhs);
-  }
-
-  /// Elementwise subtract operator (1-by-3)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator -=(Matrix const &rhs) {
-    
-    data[0] -= rhs.data[0];
-    data[1] -= rhs.data[1];
-    data[2] -= rhs.data[2];
-
-    return *this;
-  }
-        
-  /// Elementwise multiply operator (1-by-3)
-  CUTLASS_HOST_DEVICE
-  Matrix multiply(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] * rhs.data[0];
-    result.data[1] = data[1] * rhs.data[1];
-    result.data[2] = data[2] * rhs.data[2];
-
-    return result;
-  }
-      
-  /// Scalar multiply operator (1-by-3)
-  CUTLASS_HOST_DEVICE
-  Matrix multiply(Element const &s) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] * s;
-    result.data[1] = data[1] * s;
-    result.data[2] = data[2] * s;
-
-    return result;
-  }
-
-  /// Scalar multiply operator (1-by-3)
-  CUTLASS_HOST_DEVICE
-  Matrix operator *(Element const &s) const {
-    return multiply(s);
-  }
-
-  /// Scalar multiply operator (1-by-3)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator *=(Element const &s) {
-    
-    data[0] *= s;
-    data[1] *= s;
-    data[2] *= s;
-
-    return *this;
-  }
-        
-  /// Elementwise divide operator (1-by-3)
-  CUTLASS_HOST_DEVICE
-  Matrix divide(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] / rhs.data[0];
-    result.data[1] = data[1] / rhs.data[1];
-    result.data[2] = data[2] / rhs.data[2];
-
-    return result;
-  }
-      
-  /// Scalar divide operator (1-by-3)
-  CUTLASS_HOST_DEVICE
-  Matrix divide(Element const &s) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] / s;
-    result.data[1] = data[1] / s;
-    result.data[2] = data[2] / s;
-
-    return result;
-  }
-
-  /// Scalar divide operator (1-by-3)
-  CUTLASS_HOST_DEVICE
-  Matrix operator /(Element const &s) const {
-    return divide(s);
-  }
-
-  /// Scalar divide operator (1-by-3)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator /=(Element const &s) {
-    
-    data[0] /= s;
-    data[1] /= s;
-    data[2] /= s;
-
-    return *this;
-  }
-        
-  /// Elementwise divide operator (1-by-3)
-  CUTLASS_HOST_DEVICE
-  Matrix operator /(Matrix const &rhs) const {
-    return divide(rhs);
-  }
-
-  /// Elementwise divide operator (1-by-3)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator /=(Matrix const &rhs) {
-    
-    data[0] /= rhs.data[0];
-    data[1] /= rhs.data[1];
-    data[2] /= rhs.data[2];
-
-    return *this;
-  }
-        
-  /// Negates each element of the matrix
-  CUTLASS_HOST_DEVICE
-  Matrix operator-() const {
-    Matrix m;
-    
-    m.data[0] = -m.data[0];
-    m.data[1] = -m.data[1];
-    m.data[2] = -m.data[2];
-
-    return m;
-  }
-  
-  /// Matrix product of size 1-by-1-by-3
-  CUTLASS_HOST_DEVICE
-  Element product(Matrix<Element, 3, 1> const &rhs, Element accum = Element()) const {
-    
-    // k=0
-    accum += data[0] * rhs.data[0];
-
-    // k=1
-    accum += data[1] * rhs.data[1];
-
-    // k=2
-    accum += data[2] * rhs.data[2];
-
-    return accum;
-  }
-
-  /// Matrix product of size 1-by-1-by-3
-  CUTLASS_HOST_DEVICE
-  Element operator*(Matrix<Element, 3, 1> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 1-by-2-by-3
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 2> product(
-    Matrix<Element, 3, 2> const &rhs,
-    Matrix<Element, 1, 2> accum = Matrix<Element, 1, 2>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[0] * rhs.data[1];
-
-    // k=1
-    accum.data[0] += data[1] * rhs.data[2];
-    accum.data[1] += data[1] * rhs.data[3];
-
-    // k=2
-    accum.data[0] += data[2] * rhs.data[4];
-    accum.data[1] += data[2] * rhs.data[5];
-
-    return accum;
-  }
-
-  /// Matrix product of size 1-by-2-by-3
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 2> operator*(Matrix<Element, 3, 2> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 1-by-3-by-3
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 3> product(
-    Matrix<Element, 3, 3> const &rhs,
-    Matrix<Element, 1, 3> accum = Matrix<Element, 1, 3>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[0] * rhs.data[1];
-    accum.data[2] += data[0] * rhs.data[2];
-
-    // k=1
-    accum.data[0] += data[1] * rhs.data[3];
-    accum.data[1] += data[1] * rhs.data[4];
-    accum.data[2] += data[1] * rhs.data[5];
-
-    // k=2
-    accum.data[0] += data[2] * rhs.data[6];
-    accum.data[1] += data[2] * rhs.data[7];
-    accum.data[2] += data[2] * rhs.data[8];
-
-    return accum;
-  }
-
-  /// Matrix product of size 1-by-3-by-3
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 3> operator*(Matrix<Element, 3, 3> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 1-by-3-by-3
-  CUTLASS_HOST_DEVICE
-  Matrix & operator*=(Matrix<Element, 3, 3> const &rhs) {
-    *this = product(rhs);
-    return *this;
-  }
-    
-  /// Matrix product of size 1-by-4-by-3
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 4> product(
-    Matrix<Element, 3, 4> const &rhs,
-    Matrix<Element, 1, 4> accum = Matrix<Element, 1, 4>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[0] * rhs.data[1];
-    accum.data[2] += data[0] * rhs.data[2];
-    accum.data[3] += data[0] * rhs.data[3];
-
-    // k=1
-    accum.data[0] += data[1] * rhs.data[4];
-    accum.data[1] += data[1] * rhs.data[5];
-    accum.data[2] += data[1] * rhs.data[6];
-    accum.data[3] += data[1] * rhs.data[7];
-
-    // k=2
-    accum.data[0] += data[2] * rhs.data[8];
-    accum.data[1] += data[2] * rhs.data[9];
-    accum.data[2] += data[2] * rhs.data[10];
-    accum.data[3] += data[2] * rhs.data[11];
-
-    return accum;
-  }
-
-  /// Matrix product of size 1-by-4-by-3
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 4> operator*(Matrix<Element, 3, 4> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Dot product of vectors with extent 3
-  CUTLASS_HOST_DEVICE
-  Element dot(Matrix<Element, 3, 1> const &rhs, Element accum = Element()) const {
-    
-    accum += data[0] * rhs.data[0];
-    accum += data[1] * rhs.data[1];
-    accum += data[2] * rhs.data[2];
-    return accum;
-  }
-
-  /// Dot product of vectors with extent 3
-  CUTLASS_HOST_DEVICE
-  Element dot(Matrix<Element, 1, 3> const &rhs, Element accum = Element()) const {
-    
-    accum += data[0] * rhs.data[0];
-    accum += data[1] * rhs.data[1];
-    accum += data[2] * rhs.data[2];
-    return accum;
-  }
-  
-  /// Returns the sum of elements
-  CUTLASS_HOST_DEVICE
-  Element sum(Element accum = Element()) const {
-    
-    accum += data[0];
-    accum += data[1];
-    accum += data[2];
-
-    return accum;
-  }  
-
-  /// Returns the sum of squared elements
-  CUTLASS_HOST_DEVICE
-  Element norm(Element accum = Element()) const {
-    
-    accum += data[0] * data[0];
-    accum += data[1] * data[1];
-    accum += data[2] * data[2];
-
-    return accum;
-  }
-
-  /// Returns square root of the norm
-  CUTLASS_HOST_DEVICE
-  Element magnitude() const {
-    return fast_sqrt(norm());
-  }
-
-  /// Returns the sum of diagonal elements
-  CUTLASS_HOST_DEVICE
-  Element trace(Element accum = Element()) const {
-    
-    accum += data[0];
-
-    return accum;
-  }
-    
-  /// Cross product of two 3-vectors: (a1*b2 - a2*b1, a2*b0 - a0*b2, a0*b1 - a1*b0)
-  CUTLASS_HOST_DEVICE
-  Matrix cross(Matrix const &rhs) const {
-    return Matrix(
-      data[1] * rhs.data[2] - data[2] * rhs.data[1],
-      data[2] * rhs.data[0] - data[0] * rhs.data[2],
-      data[0] * rhs.data[1] - data[1] * rhs.data[0]
-    );
-  }
-  
-};
-
-/// Template alias for 1-by-3 matrix
-template <typename Element>
-using Matrix1x3 = Matrix<Element, 1, 3>;
-
-
-/// Free function that builds a 1-by-3 matrix, deducing the element type from its arguments
-template <typename Element>
-CUTLASS_HOST_DEVICE
-Matrix1x3<Element> make_Matrix1x3(Element _0_0, Element _0_1, Element _0_2) {
-  // Hand the three scalars, in argument order, to the element-wise constructor.
-  Matrix1x3<Element> result(_0_0, _0_1, _0_2);
-
-  return result;
-}
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// 1-by-4 matrix template class definition
-template <typename Element_>
-struct Matrix<Element_, 1, 4> {
-
-  //
-  // Type definitions
-  //
-
-  /// Element data type
-  using Element = Element_;
-
-  /// Number of rows in matrix
-  static int const kRows = 1;
-
-  /// Number of columns in matrix
-  static int const kColumns = 4;
-
-  /// Layout of matrix in underlying array
-  using Layout = layout::RowMajor;
-
-  /// Number of elements in matrix
-  static int const kCount = 4;
-
-  //
-  // Data members
-  //
-
-  /// Elements of the matrix in row-major layout
-  Array<Element, kCount> data;
-
-  //
-  // Methods
-  //
-
-  /// Constructs a zero matrix
-  CUTLASS_HOST_DEVICE
-  Matrix() {
-    data.clear();
-  }
-  
-  /// Copy constructor for a 1-by-4 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix(Matrix const &rhs) {
-    data = rhs.data;
-  }
-    
-  /// Constructs a 1-by-4 matrix from four scalars; argument k is stored at offset k
-  CUTLASS_HOST_DEVICE
-  Matrix(Element _0_0, Element _0_1, Element _0_2, Element _0_3) {
-    data[0] = _0_0;
-    data[1] = _0_1;
-    data[2] = _0_2;
-    data[3] = _0_3;
-  }
-    
-  /// Constructs a matrix from a uniform element
-  CUTLASS_HOST_DEVICE
-  static Matrix uniform(Element s) {
-    Matrix m;
-    
-    m.data[0] = s;
-    m.data[1] = s;
-    m.data[2] = s;
-    m.data[3] = s;
-
-    return m;
-  }
-
-  /// Constructs a matrix from a uniform element 1
-  CUTLASS_HOST_DEVICE
-  static Matrix ones() {
-    return uniform(Element(1));
-  }
-
-  /// Constructs a matrix from a uniform element 0
-  CUTLASS_HOST_DEVICE
-  static Matrix zero() {
-    return Matrix();
-  }
-  
-  /// Returns a transposed matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 1> transpose() const {
-    Matrix<Element, 4, 1> mt;
-    
-    mt.data[0] = data[0];
-    mt.data[1] = data[1];
-    mt.data[2] = data[2];
-    mt.data[3] = data[3];
-
-    return mt;
-  }
-    
-  /// Returns element (i, j); the row stride is 4, matching the slice accessors of this 1-by-4 matrix
-  CUTLASS_HOST_DEVICE
-  Element at(int i, int j) const {
-    return data[i * 4 + j];
-  }
-
-  /// Returns a reference to element (i, j); the row stride is 4, matching the slice accessors
-  CUTLASS_HOST_DEVICE
-  Element & at(int i, int j) {
-    return data[i * 4 + j];
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element at(Coord<2> const &coord) const {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element & at(Coord<2> const &coord) {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element &at(int offset) {
-    return data[offset];
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element at(int offset) const {
-    return data[offset];
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element operator[](Coord<2> const &coord) const {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element & operator[](Coord<2> const &coord) {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element & operator[](int offset) {
-    return data[offset];
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element operator[](int offset) const {
-    return data[offset];
-  }
-  
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 2> slice_1x2(int i = 0, int j = 0) const {
-    Matrix<Element, 1, 2> m;
-    
-    m.data[0] = data[i * 4 + j + 0];
-    m.data[1] = data[i * 4 + j + 1];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_1x2(Matrix<Element, 1, 2> const &m, int i = 0, int j = 0) {
-    
-    data[i * 4 + j + 0] = m.data[0];
-    data[i * 4 + j + 1] = m.data[1];
-
-    return *this;
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 3> slice_1x3(int i = 0, int j = 0) const {
-    Matrix<Element, 1, 3> m;
-    
-    m.data[0] = data[i * 4 + j + 0];
-    m.data[1] = data[i * 4 + j + 1];
-    m.data[2] = data[i * 4 + j + 2];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_1x3(Matrix<Element, 1, 3> const &m, int i = 0, int j = 0) {
-    
-    data[i * 4 + j + 0] = m.data[0];
-    data[i * 4 + j + 1] = m.data[1];
-    data[i * 4 + j + 2] = m.data[2];
-
-    return *this;
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 4> slice_1x4(int i = 0, int j = 0) const {
-    Matrix<Element, 1, 4> m;
-    
-    m.data[0] = data[i * 4 + j + 0];
-    m.data[1] = data[i * 4 + j + 1];
-    m.data[2] = data[i * 4 + j + 2];
-    m.data[3] = data[i * 4 + j + 3];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_1x4(Matrix<Element, 1, 4> const &m, int i = 0, int j = 0) {
-    
-    data[i * 4 + j + 0] = m.data[0];
-    data[i * 4 + j + 1] = m.data[1];
-    data[i * 4 + j + 2] = m.data[2];
-    data[i * 4 + j + 3] = m.data[3];
-
-    return *this;
-  }
-    
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 4> row(int i) const {
-    return slice_1x4(i, 0);
-  }
-
-  CUTLASS_HOST_DEVICE
-  Matrix &set_row(Matrix<Element, 1, 4> const &v, int i = 0) {
-    return set_slice_1x4(v, i, 0);
-  }
-    
-  /// Forms a 1-by-4 matrix by horizontally concatenating an Element with a 1-by-3 matrix
-  CUTLASS_HOST_DEVICE
-  static Matrix hcat(Element lhs, Matrix<Element, 1, 3> const & rhs) {
-    return Matrix(
-      lhs, rhs.at(0, 0), rhs.at(0, 1), rhs.at(0, 2));
-  }
-  
-  /// Forms a 1-by-4 matrix by horizontally concatenating a 1-by-2 matrix with a 1-by-2 matrix
-  CUTLASS_HOST_DEVICE
-  static Matrix hcat(Matrix<Element, 1, 2> const & lhs, Matrix<Element, 1, 2> const & rhs) {
-    return Matrix(
-      lhs.at(0, 0), lhs.at(0, 1), rhs.at(0, 0), rhs.at(0, 1));
-  }
-  
-  /// Forms a 1-by-4 matrix by horizontally concatenating a 1-by-3 matrix with an Element
-  CUTLASS_HOST_DEVICE
-  static Matrix hcat(Matrix<Element, 1, 3> const & lhs, Element rhs) {
-    return Matrix(
-      lhs.at(0, 0), lhs.at(0, 1), lhs.at(0, 2), rhs);
-  }
-  
-  /// Concatenates this matrix with a 1-by-4 matrix to form a 2-by-4 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 4> vcat(Matrix<Element, 1, 4> const & rhs) const {
-    return Matrix<Element, 2, 4>::vcat(*this, rhs);
-  }
-    
-  /// Concatenates this matrix with a 2-by-4 matrix to form a 3-by-4 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 4> vcat(Matrix<Element, 2, 4> const & rhs) const {
-    return Matrix<Element, 3, 4>::vcat(*this, rhs);
-  }
-    
-  /// Concatenates this matrix with a 3-by-4 matrix to form a 4-by-4 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 4> vcat(Matrix<Element, 3, 4> const & rhs) const {
-    return Matrix<Element, 4, 4>::vcat(*this, rhs);
-  }
-    
-  /// Elementwise add operator (1-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix add(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] + rhs.data[0];
-    result.data[1] = data[1] + rhs.data[1];
-    result.data[2] = data[2] + rhs.data[2];
-    result.data[3] = data[3] + rhs.data[3];
-
-    return result;
-  }
-      
-  /// Elementwise add operator (1-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix operator +(Matrix const &rhs) const {
-    return add(rhs);
-  }
-
-  /// Elementwise add operator (1-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator +=(Matrix const &rhs) {
-    
-    data[0] += rhs.data[0];
-    data[1] += rhs.data[1];
-    data[2] += rhs.data[2];
-    data[3] += rhs.data[3];
-
-    return *this;
-  }
-        
-  /// Elementwise subtract operator (1-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix subtract(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] - rhs.data[0];
-    result.data[1] = data[1] - rhs.data[1];
-    result.data[2] = data[2] - rhs.data[2];
-    result.data[3] = data[3] - rhs.data[3];
-
-    return result;
-  }
-      
-  /// Elementwise subtract operator (1-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix operator -(Matrix const &rhs) const {
-    return subtract(rhs);
-  }
-
-  /// Elementwise subtract operator (1-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator -=(Matrix const &rhs) {
-    
-    data[0] -= rhs.data[0];
-    data[1] -= rhs.data[1];
-    data[2] -= rhs.data[2];
-    data[3] -= rhs.data[3];
-
-    return *this;
-  }
-        
-  /// Elementwise multiply operator (1-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix multiply(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] * rhs.data[0];
-    result.data[1] = data[1] * rhs.data[1];
-    result.data[2] = data[2] * rhs.data[2];
-    result.data[3] = data[3] * rhs.data[3];
-
-    return result;
-  }
-      
-  /// Scalar multiply operator (1-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix multiply(Element const &s) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] * s;
-    result.data[1] = data[1] * s;
-    result.data[2] = data[2] * s;
-    result.data[3] = data[3] * s;
-
-    return result;
-  }
-
-  /// Scalar multiply operator (1-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix operator *(Element const &s) const {
-    return multiply(s);
-  }
-
-  /// Scalar multiply operator (1-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator *=(Element const &s) {
-    
-    data[0] *= s;
-    data[1] *= s;
-    data[2] *= s;
-    data[3] *= s;
-
-    return *this;
-  }
-        
-  /// Elementwise divide operator (1-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix divide(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] / rhs.data[0];
-    result.data[1] = data[1] / rhs.data[1];
-    result.data[2] = data[2] / rhs.data[2];
-    result.data[3] = data[3] / rhs.data[3];
-
-    return result;
-  }
-      
-  /// Scalar divide operator (1-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix divide(Element const &s) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] / s;
-    result.data[1] = data[1] / s;
-    result.data[2] = data[2] / s;
-    result.data[3] = data[3] / s;
-
-    return result;
-  }
-
-  /// Scalar divide operator (1-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix operator /(Element const &s) const {
-    return divide(s);
-  }
-
-  /// Scalar divide operator (1-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator /=(Element const &s) {
-    
-    data[0] /= s;
-    data[1] /= s;
-    data[2] /= s;
-    data[3] /= s;
-
-    return *this;
-  }
-        
-  /// Elementwise divide operator (1-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix operator /(Matrix const &rhs) const {
-    return divide(rhs);
-  }
-
-  /// Elementwise divide operator (1-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator /=(Matrix const &rhs) {
-    
-    data[0] /= rhs.data[0];
-    data[1] /= rhs.data[1];
-    data[2] /= rhs.data[2];
-    data[3] /= rhs.data[3];
-
-    return *this;
-  }
-        
-  /// Returns a new matrix holding the negation of each element of this matrix
-  CUTLASS_HOST_DEVICE
-  Matrix operator-() const {
-    Matrix m;
-    // Read from this matrix's data: m starts out zeroed, so negating m itself would return zero.
-    m.data[0] = -data[0];
-    m.data[1] = -data[1];
-    m.data[2] = -data[2];
-    m.data[3] = -data[3];
-
-    return m;
-  }
-  
-  /// Matrix product of size 1-by-1-by-4
-  CUTLASS_HOST_DEVICE
-  Element product(Matrix<Element, 4, 1> const &rhs, Element accum = Element()) const {
-    
-    // k=0
-    accum += data[0] * rhs.data[0];
-
-    // k=1
-    accum += data[1] * rhs.data[1];
-
-    // k=2
-    accum += data[2] * rhs.data[2];
-
-    // k=3
-    accum += data[3] * rhs.data[3];
-
-    return accum;
-  }
-
-  /// Matrix product of size 1-by-1-by-4
-  CUTLASS_HOST_DEVICE
-  Element operator*(Matrix<Element, 4, 1> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 1-by-2-by-4
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 2> product(
-    Matrix<Element, 4, 2> const &rhs,
-    Matrix<Element, 1, 2> accum = Matrix<Element, 1, 2>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[0] * rhs.data[1];
-
-    // k=1
-    accum.data[0] += data[1] * rhs.data[2];
-    accum.data[1] += data[1] * rhs.data[3];
-
-    // k=2
-    accum.data[0] += data[2] * rhs.data[4];
-    accum.data[1] += data[2] * rhs.data[5];
-
-    // k=3
-    accum.data[0] += data[3] * rhs.data[6];
-    accum.data[1] += data[3] * rhs.data[7];
-
-    return accum;
-  }
-
-  /// Matrix product of size 1-by-2-by-4
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 2> operator*(Matrix<Element, 4, 2> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 1-by-3-by-4
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 3> product(
-    Matrix<Element, 4, 3> const &rhs,
-    Matrix<Element, 1, 3> accum = Matrix<Element, 1, 3>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[0] * rhs.data[1];
-    accum.data[2] += data[0] * rhs.data[2];
-
-    // k=1
-    accum.data[0] += data[1] * rhs.data[3];
-    accum.data[1] += data[1] * rhs.data[4];
-    accum.data[2] += data[1] * rhs.data[5];
-
-    // k=2
-    accum.data[0] += data[2] * rhs.data[6];
-    accum.data[1] += data[2] * rhs.data[7];
-    accum.data[2] += data[2] * rhs.data[8];
-
-    // k=3
-    accum.data[0] += data[3] * rhs.data[9];
-    accum.data[1] += data[3] * rhs.data[10];
-    accum.data[2] += data[3] * rhs.data[11];
-
-    return accum;
-  }
-
-  /// Matrix product of size 1-by-3-by-4
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 3> operator*(Matrix<Element, 4, 3> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 1-by-4-by-4
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 4> product(
-    Matrix<Element, 4, 4> const &rhs,
-    Matrix<Element, 1, 4> accum = Matrix<Element, 1, 4>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[0] * rhs.data[1];
-    accum.data[2] += data[0] * rhs.data[2];
-    accum.data[3] += data[0] * rhs.data[3];
-
-    // k=1
-    accum.data[0] += data[1] * rhs.data[4];
-    accum.data[1] += data[1] * rhs.data[5];
-    accum.data[2] += data[1] * rhs.data[6];
-    accum.data[3] += data[1] * rhs.data[7];
-
-    // k=2
-    accum.data[0] += data[2] * rhs.data[8];
-    accum.data[1] += data[2] * rhs.data[9];
-    accum.data[2] += data[2] * rhs.data[10];
-    accum.data[3] += data[2] * rhs.data[11];
-
-    // k=3
-    accum.data[0] += data[3] * rhs.data[12];
-    accum.data[1] += data[3] * rhs.data[13];
-    accum.data[2] += data[3] * rhs.data[14];
-    accum.data[3] += data[3] * rhs.data[15];
-
-    return accum;
-  }
-
-  /// Matrix product of size 1-by-4-by-4
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 4> operator*(Matrix<Element, 4, 4> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 1-by-4-by-4
-  CUTLASS_HOST_DEVICE
-  Matrix & operator*=(Matrix<Element, 4, 4> const &rhs) {
-    *this = product(rhs);
-    return *this;
-  }
-    
-  /// Dot product of vectors with extent 4
-  CUTLASS_HOST_DEVICE
-  Element dot(Matrix<Element, 4, 1> const &rhs, Element accum = Element()) const {
-    
-    accum += data[0] * rhs.data[0];
-    accum += data[1] * rhs.data[1];
-    accum += data[2] * rhs.data[2];
-    accum += data[3] * rhs.data[3];
-    return accum;
-  }
-
-  /// Dot product of vectors with extent 4
-  CUTLASS_HOST_DEVICE
-  Element dot(Matrix<Element, 1, 4> const &rhs, Element accum = Element()) const {
-    
-    accum += data[0] * rhs.data[0];
-    accum += data[1] * rhs.data[1];
-    accum += data[2] * rhs.data[2];
-    accum += data[3] * rhs.data[3];
-    return accum;
-  }
-  
-  /// Returns the sum of elements
-  CUTLASS_HOST_DEVICE
-  Element sum(Element accum = Element()) const {
-    
-    accum += data[0];
-    accum += data[1];
-    accum += data[2];
-    accum += data[3];
-
-    return accum;
-  }  
-
-  /// Returns the sum of squared elements
-  CUTLASS_HOST_DEVICE
-  Element norm(Element accum = Element()) const {
-    
-    accum += data[0] * data[0];
-    accum += data[1] * data[1];
-    accum += data[2] * data[2];
-    accum += data[3] * data[3];
-
-    return accum;
-  }
-
-  /// Returns square root of the norm
-  CUTLASS_HOST_DEVICE
-  Element magnitude() const {
-    return fast_sqrt(norm());
-  }
-
-  /// Returns the sum of diagonal elements
-  CUTLASS_HOST_DEVICE
-  Element trace(Element accum = Element()) const {
-    
-    accum += data[0];
-
-    return accum;
-  }
-    
-};
-
-/// Template alias for 1-by-4 matrix
-template <typename Element>
-using Matrix1x4 = Matrix<Element, 1, 4>;
-
-
-/// Free function that builds a 1-by-4 matrix, deducing the element type from its arguments
-template <typename Element>
-CUTLASS_HOST_DEVICE
-Matrix1x4<Element> make_Matrix1x4(Element _0_0, Element _0_1, Element _0_2, Element _0_3) {
-  // Hand the four scalars, in argument order, to the element-wise constructor.
-  Matrix1x4<Element> result(_0_0, _0_1, _0_2, _0_3);
-
-  return result;
-}
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// 2-by-1 matrix template class definition
-template <typename Element_>
-struct Matrix<Element_, 2, 1> {
-
-  //
-  // Type definitions
-  //
-
-  /// Element data type
-  using Element = Element_;
-
-  /// Number of rows in matrix
-  static int const kRows = 2;
-
-  /// Number of columns in matrix
-  static int const kColumns = 1;
-
-  /// Layout of matrix in underlying array
-  using Layout = layout::RowMajor;
-
-  /// Number of elements in matrix
-  static int const kCount = 2;
-
-  //
-  // Data members
-  //
-
-  /// Elements of the matrix in row-major layout
-  Array<Element, kCount> data;
-
-  //
-  // Methods
-  //
-
-  /// Constructs a zero matrix
-  CUTLASS_HOST_DEVICE
-  Matrix() {
-    data.clear();
-  }
-  
-  /// Copy constructor for a 2-by-1 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix(Matrix const &rhs) {
-    data = rhs.data;
-  }
-    
-  /// Constructs a 2-by-1 matrix from two scalars; argument k is stored at offset k
-  CUTLASS_HOST_DEVICE
-  Matrix(Element _0_0, Element _1_0) {
-
-    // First argument goes to offset 0.
-    data[0] = _0_0;
-
-    // Second argument goes to offset 1.
-    data[1] = _1_0;
-  }
-    
-  /// Constructs a matrix from a uniform element
-  CUTLASS_HOST_DEVICE
-  static Matrix uniform(Element s) {
-    Matrix m;
-    
-    m.data[0] = s;
-    m.data[1] = s;
-
-    return m;
-  }
-
-  /// Constructs a matrix from a uniform element 1
-  CUTLASS_HOST_DEVICE
-  static Matrix ones() {
-    return uniform(Element(1));
-  }
-
-  /// Constructs a matrix from a uniform element 0
-  CUTLASS_HOST_DEVICE
-  static Matrix zero() {
-    return Matrix();
-  }
-  
-  /// Returns a transposed matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 2> transpose() const {
-    Matrix<Element, 1, 2> mt;
-    
-    mt.data[0] = data[0];
-    mt.data[1] = data[1];
-
-    return mt;
-  }
-    
-  /// Returns element (i, j); the row stride is 1, the column count of this 2-by-1 matrix
-  CUTLASS_HOST_DEVICE
-  Element at(int i, int j) const {
-    return data[i * 1 + j];
-  }
-
-  /// Returns a reference to element (i, j); the row stride is 1, the column count of this 2-by-1 matrix
-  CUTLASS_HOST_DEVICE
-  Element & at(int i, int j) {
-    return data[i * 1 + j];
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element at(Coord<2> const &coord) const {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element & at(Coord<2> const &coord) {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element &at(int offset) {
-    return data[offset];
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element at(int offset) const {
-    return data[offset];
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element operator[](Coord<2> const &coord) const {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element & operator[](Coord<2> const &coord) {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element & operator[](int offset) {
-    return data[offset];
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element operator[](int offset) const {
-    return data[offset];
-  }
-  
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 1> slice_2x1(int i = 0, int j = 0) const {
-    Matrix<Element, 2, 1> m;
-    
-    m.data[0] = data[i * 1 + j + 0];
-    m.data[1] = data[i * 1 + j + 1];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_2x1(Matrix<Element, 2, 1> const &m, int i = 0, int j = 0) {
-    
-    data[i * 1 + j + 0] = m.data[0];
-    data[i * 1 + j + 1] = m.data[1];
-
-    return *this;
-  }
-    
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 1> column(int j) const {
-    return slice_2x1(0, j);
-  }
-
-  CUTLASS_HOST_DEVICE
-  Matrix &set_column(Matrix<Element, 2, 1> const &v, int j =0) {
-    return set_slice_2x1(v, 0, j);
-  }
-    
-  /// Concatenates this matrix with a 2-by-1 matrix to form a 2-by-2 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 2> hcat(Matrix<Element, 2, 1> const & rhs) const {
-    return Matrix<Element, 2, 2>::hcat(*this, rhs);
-  }
-    
-  /// Concatenates this matrix with a 2-by-2 matrix to form a 2-by-3 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 3> hcat(Matrix<Element, 2, 2> const & rhs) const {
-    return Matrix<Element, 2, 3>::hcat(*this, rhs);
-  }
-    
-  /// Concatenates this matrix with a 2-by-3 matrix to form a 2-by-4 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 4> hcat(Matrix<Element, 2, 3> const & rhs) const {
-    return Matrix<Element, 2, 4>::hcat(*this, rhs);
-  }
-    
-  /// Forms a 2-by-1 matrix by vertically concatenating an Element with an Element
-  CUTLASS_HOST_DEVICE
-  static Matrix vcat(Element upper, Element lower) {
-    return Matrix(
-      upper
-      , lower);
-  }
-  
-  /// Concatenates this matrix with an Element to form a 3-by-1 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 1> vcat(Element rhs) const {
-    return Matrix<Element, 3, 1>::vcat(*this, rhs);
-  }
-    
-  /// Concatenates this matrix with a 2-by-1 matrix to form a 4-by-1 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 1> vcat(Matrix<Element, 2, 1> const & rhs) const {
-    return Matrix<Element, 4, 1>::vcat(*this, rhs);
-  }
-    
-  /// Elementwise add operator (2-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix add(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] + rhs.data[0];
-
-    result.data[1] = data[1] + rhs.data[1];
-
-    return result;
-  }
-      
-  /// Elementwise add operator (2-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix operator +(Matrix const &rhs) const {
-    return add(rhs);
-  }
-
-  /// Elementwise add operator (2-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator +=(Matrix const &rhs) {
-    
-    data[0] += rhs.data[0];
-
-    data[1] += rhs.data[1];
-
-    return *this;
-  }
-        
-  /// Elementwise subtract operator (2-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix subtract(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] - rhs.data[0];
-
-    result.data[1] = data[1] - rhs.data[1];
-
-    return result;
-  }
-      
-  /// Elementwise subtract operator (2-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix operator -(Matrix const &rhs) const {
-    return subtract(rhs);
-  }
-
-  /// Elementwise subtract operator (2-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator -=(Matrix const &rhs) {
-    
-    data[0] -= rhs.data[0];
-
-    data[1] -= rhs.data[1];
-
-    return *this;
-  }
-        
-  /// Elementwise multiply operator (2-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix multiply(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] * rhs.data[0];
-
-    result.data[1] = data[1] * rhs.data[1];
-
-    return result;
-  }
-      
-  /// Scalar multiply operator (2-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix multiply(Element const &s) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] * s;
-
-    result.data[1] = data[1] * s;
-
-    return result;
-  }
-
-  /// Scalar multiply operator (2-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix operator *(Element const &s) const {
-    return multiply(s);
-  }
-
-  /// Scalar multiply operator (2-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator *=(Element const &s) {
-    
-    data[0] *= s;
-
-    data[1] *= s;
-
-    return *this;
-  }
-        
-  /// Elementwise divide operator (2-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix divide(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] / rhs.data[0];
-
-    result.data[1] = data[1] / rhs.data[1];
-
-    return result;
-  }
-      
-  /// Scalar divide operator (2-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix divide(Element const &s) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] / s;
-
-    result.data[1] = data[1] / s;
-
-    return result;
-  }
-
-  /// Scalar divide operator (2-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix operator /(Element const &s) const {
-    return divide(s);
-  }
-
-  /// Scalar divide operator (2-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator /=(Element const &s) {
-    
-    data[0] /= s;
-
-    data[1] /= s;
-
-    return *this;
-  }
-        
-  /// Elementwise divide operator (2-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix operator /(Matrix const &rhs) const {
-    return divide(rhs);
-  }
-
-  /// Elementwise divide operator (2-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator /=(Matrix const &rhs) {
-    
-    data[0] /= rhs.data[0];
-
-    data[1] /= rhs.data[1];
-
-    return *this;
-  }
-        
-  /// Returns a new matrix holding the negation of each element of this matrix
-  CUTLASS_HOST_DEVICE
-  Matrix operator-() const {
-    Matrix m;
-    // Read from this matrix's data: m starts out zeroed, so negating m itself would return zero.
-    m.data[0] = -data[0];
-    m.data[1] = -data[1];
-
-    return m;
-  }
-  
-  /// Matrix product of size 2-by-1-by-1
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 1> product(
-    Matrix<Element, 1, 1> const &rhs,
-    Matrix<Element, 2, 1> accum = Matrix<Element, 2, 1>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[1] * rhs.data[0];
-
-    return accum;
-  }
-
-  /// Matrix product of size 2-by-1-by-1
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 1> operator*(Matrix<Element, 1, 1> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 2-by-1-by-1
-  CUTLASS_HOST_DEVICE
-  Matrix & operator*=(Matrix<Element, 1, 1> const &rhs) {
-    *this = product(rhs);
-    return *this;
-  }
-    
-  /// Matrix product of size 2-by-2-by-1
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 2> product(
-    Matrix<Element, 1, 2> const &rhs,
-    Matrix<Element, 2, 2> accum = Matrix<Element, 2, 2>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[0] * rhs.data[1];
-    accum.data[2] += data[1] * rhs.data[0];
-    accum.data[3] += data[1] * rhs.data[1];
-
-    return accum;
-  }
-
-  /// Matrix product of size 2-by-2-by-1
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 2> operator*(Matrix<Element, 1, 2> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 2-by-3-by-1
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 3> product(
-    Matrix<Element, 1, 3> const &rhs,
-    Matrix<Element, 2, 3> accum = Matrix<Element, 2, 3>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[0] * rhs.data[1];
-    accum.data[2] += data[0] * rhs.data[2];
-    accum.data[3] += data[1] * rhs.data[0];
-    accum.data[4] += data[1] * rhs.data[1];
-    accum.data[5] += data[1] * rhs.data[2];
-
-    return accum;
-  }
-
-  /// Matrix product of size 2-by-3-by-1
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 3> operator*(Matrix<Element, 1, 3> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 2-by-4-by-1
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 4> product(
-    Matrix<Element, 1, 4> const &rhs,
-    Matrix<Element, 2, 4> accum = Matrix<Element, 2, 4>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[0] * rhs.data[1];
-    accum.data[2] += data[0] * rhs.data[2];
-    accum.data[3] += data[0] * rhs.data[3];
-    accum.data[4] += data[1] * rhs.data[0];
-    accum.data[5] += data[1] * rhs.data[1];
-    accum.data[6] += data[1] * rhs.data[2];
-    accum.data[7] += data[1] * rhs.data[3];
-
-    return accum;
-  }
-
-  /// Matrix product of size 2-by-4-by-1
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 4> operator*(Matrix<Element, 1, 4> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Dot product of vectors with extent 2
-  CUTLASS_HOST_DEVICE
-  Element dot(Matrix<Element, 2, 1> const &rhs, Element accum = Element()) const {
-    
-    accum += data[0] * rhs.data[0];
-    accum += data[1] * rhs.data[1];
-    return accum;
-  }
-
-  /// Dot product of vectors with extent 2
-  CUTLASS_HOST_DEVICE
-  Element dot(Matrix<Element, 1, 2> const &rhs, Element accum = Element()) const {
-    
-    accum += data[0] * rhs.data[0];
-    accum += data[1] * rhs.data[1];
-    return accum;
-  }
-  
-  /// Returns the sum of elements
-  CUTLASS_HOST_DEVICE
-  Element sum(Element accum = Element()) const {
-    
-    accum += data[0];
-    accum += data[1];
-
-    return accum;
-  }  
-
-  /// Returns the sum of squared elements
-  CUTLASS_HOST_DEVICE
-  Element norm(Element accum = Element()) const {
-    
-    accum += data[0] * data[0];
-    accum += data[1] * data[1];
-
-    return accum;
-  }
-
-  /// Returns square root of the norm
-  CUTLASS_HOST_DEVICE
-  Element magnitude() const {
-    return fast_sqrt(norm());
-  }
-
-  /// Returns the sum of diagonal elements
-  CUTLASS_HOST_DEVICE
-  Element trace(Element accum = Element()) const {
-    
-    accum += data[0];
-
-    return accum;
-  }
-    
-};
-
-/// Template alias for 2-by-1 matrix
-template <typename Element>
-using Matrix2x1 = Matrix<Element, 2, 1>;
-
-
-/// Free funciton to infer element type from template arguments
-template <typename Element>
-CUTLASS_HOST_DEVICE Matrix2x1<Element> make_Matrix2x1(
-    Element _0_0, 
-    Element _1_0
-) {
-  return Matrix2x1<Element>(
-  _0_0, 
-  _1_0 
-  );
-}
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// 2-by-2 matrix template class definition
-template <typename Element_>
-struct Matrix<Element_, 2, 2> {
-
-  //
-  // Type definitions
-  //
-
-  /// Element data type
-  using Element = Element_;
-
-  /// Number of rows in matrix
-  static int const kRows = 2;
-
-  /// Number of columns in matrix
-  static int const kColumns = 2;
-
-  /// Layout of matrix in underlying array
-  using Layout = layout::RowMajor;
-
-  /// Number of elements in matrix
-  static int const kCount = 4;
-
-  //
-  // Data members
-  //
-
-  /// Elements of the matrix in row-major layout
-  Array<Element, kCount> data;
-
-  //
-  // Methods
-  //
-
-  /// Constructs a zero matrix
-  CUTLASS_HOST_DEVICE
-  Matrix() {
-    data.clear();
-  }
-  
-  /// Copy constructor for a 2-by-2 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix(Matrix const &rhs) {
-    data = rhs.data;
-  }
-    
-  /// Constucts a 2-by-2 matrix from scalar elements
-  CUTLASS_HOST_DEVICE
-  Matrix(
-    Element _0_0, Element _0_1, 
-    Element _1_0, Element _1_1
-  ) {
-
-    data[0] = _0_0;  data[1] = _0_1;
-    data[2] = _1_0;  data[3] = _1_1;
-  }
-    
-  /// Constucts a 2-by-2 matrix from row vectors
-  CUTLASS_HOST_DEVICE
-  Matrix(
-    Matrix<Element, 1, 2> const &row_0,
-    Matrix<Element, 1, 2> const &row_1
-  ) { 
-    data[0] = row_0.data[0];
-    data[1] = row_0.data[1];
-    data[2] = row_1.data[0];
-    data[3] = row_1.data[1];
-  }
-    
-  /// Static method to construct a 2-by-2 matrix from column vectors
-  CUTLASS_HOST_DEVICE
-  static Matrix from_columns(
-    Matrix<Element, 2, 1> const &column_0,
-    Matrix<Element, 2, 1> const &column_1
-  ) { 
-    Matrix result;
-    
-    result.data[0] = column_0.data[0];
-    result.data[1] = column_1.data[0];
-    result.data[2] = column_0.data[1];
-    result.data[3] = column_1.data[1];
-    return result;
-  }
-    
-  /// Constructs an identity matrix
-  CUTLASS_HOST_DEVICE
-  static Matrix identity() {
-    Matrix m;
-    
-    m.data[0] = Element(1);
-    m.data[3] = Element(1);
-
-    return m;
-  }
-    
-  /// Constructs a matrix from a uniform element
-  CUTLASS_HOST_DEVICE
-  static Matrix uniform(Element s) {
-    Matrix m;
-    
-    m.data[0] = s;
-    m.data[1] = s;
-    m.data[2] = s;
-    m.data[3] = s;
-
-    return m;
-  }
-
-  /// Constructs a matrix from a uniform element 1
-  CUTLASS_HOST_DEVICE
-  static Matrix ones() {
-    return uniform(Element(1));
-  }
-
-  /// Constructs a matrix from a uniform element 0
-  CUTLASS_HOST_DEVICE
-  static Matrix zero() {
-    return Matrix();
-  }
-  
-  /// Constructs a matrix from elements along its diagonal
-  CUTLASS_HOST_DEVICE
-  static Matrix from_diagonal(Matrix<Element, 2, 1> const &diag) {
-    Matrix m;
-    
-    m.data[0] = diag.data[0];
-    m.data[3] = diag.data[1];
-
-    return m;
-  }
-
-  /// Constructs a matrix from elements along its diagonal
-  CUTLASS_HOST_DEVICE
-  static Matrix from_diagonal(Matrix<Element, 1, 2> const &diag) {
-    Matrix m;
-    
-    m.data[0] = diag.data[0];
-    m.data[3] = diag.data[1];
-
-    return m;
-  }
-
-  /// Gets an array of diagonal elements
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 1> diagonal() const {
-    Matrix<Element, 2, 1> diag;
-    
-    diag.data[0] = data[0];
-    diag.data[1] = data[3];
-
-    return diag;
-  }
-    
-  /// Returns a transposed matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 2> transpose() const {
-    Matrix<Element, 2, 2> mt;
-    
-    mt.data[0] = data[0];
-    mt.data[2] = data[1];
-    mt.data[1] = data[2];
-    mt.data[3] = data[3];
-
-    return mt;
-  }
-    
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element at(int i, int j) const {
-    return data[i * 2 + j];
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element & at(int i, int j) {
-    return data[i * 2 + j];
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element at(Coord<2> const &coord) const {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element & at(Coord<2> const &coord) {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element &at(int offset) {
-    return data[offset];
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element at(int offset) const {
-    return data[offset];
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element operator[](Coord<2> const &coord) const {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element & operator[](Coord<2> const &coord) {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element & operator[](int offset) {
-    return data[offset];
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element operator[](int offset) const {
-    return data[offset];
-  }
-  
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 2> slice_1x2(int i = 0, int j = 0) const {
-    Matrix<Element, 1, 2> m;
-    
-    m.data[0] = data[i * 2 + j + 0];
-    m.data[1] = data[i * 2 + j + 1];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_1x2(Matrix<Element, 1, 2> const &m, int i = 0, int j = 0) {
-    
-    data[i * 2 + j + 0] = m.data[0];
-    data[i * 2 + j + 1] = m.data[1];
-
-    return *this;
-  }
-    
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 2> row(int i) const {
-    return slice_1x2(i, 0);
-  }
-
-  CUTLASS_HOST_DEVICE
-  Matrix &set_row(Matrix<Element, 1, 2> const &v, int i = 0) {
-    return set_slice_1x2(v, i, 0);
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 1> slice_2x1(int i = 0, int j = 0) const {
-    Matrix<Element, 2, 1> m;
-    
-    m.data[0] = data[i * 2 + j + 0];
-    m.data[1] = data[i * 2 + j + 2];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_2x1(Matrix<Element, 2, 1> const &m, int i = 0, int j = 0) {
-    
-    data[i * 2 + j + 0] = m.data[0];
-    data[i * 2 + j + 2] = m.data[1];
-
-    return *this;
-  }
-    
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 1> column(int j) const {
-    return slice_2x1(0, j);
-  }
-
-  CUTLASS_HOST_DEVICE
-  Matrix &set_column(Matrix<Element, 2, 1> const &v, int j =0) {
-    return set_slice_2x1(v, 0, j);
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 2> slice_2x2(int i = 0, int j = 0) const {
-    Matrix<Element, 2, 2> m;
-    
-    m.data[0] = data[i * 2 + j + 0];
-    m.data[1] = data[i * 2 + j + 1];
-    m.data[2] = data[i * 2 + j + 2];
-    m.data[3] = data[i * 2 + j + 3];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_2x2(Matrix<Element, 2, 2> const &m, int i = 0, int j = 0) {
-    
-    data[i * 2 + j + 0] = m.data[0];
-    data[i * 2 + j + 1] = m.data[1];
-    data[i * 2 + j + 2] = m.data[2];
-    data[i * 2 + j + 3] = m.data[3];
-
-    return *this;
-  }
-    
-  /// Forms a 2-by-2 matrix by horizontally concatenating a 2-by-1 matrix with a 2-by-1 matrix
-  CUTLASS_HOST_DEVICE
-  static Matrix hcat(Matrix<Element, 2, 1> const & lhs, Matrix<Element, 2, 1> const & rhs) {
-    return Matrix(
-      lhs.at(0, 0), rhs.at(0, 0)
-      , lhs.at(1, 0), rhs.at(1, 0));
-  }
-  
-  /// Concatenates this matrix with a a 2-by-1 matrix to form a 2-by-3 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 3> hcat(Matrix<Element, 2, 1> const & rhs) const {
-    return Matrix<Element, 2, 3>::hcat(*this, rhs);
-  }
-    
-  /// Concatenates this matrix with a a 2-by-2 matrix to form a 2-by-4 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 4> hcat(Matrix<Element, 2, 2> const & rhs) const {
-    return Matrix<Element, 2, 4>::hcat(*this, rhs);
-  }
-    
-  /// Forms a 2-by-2 matrix by vertically concatenating a 1-by-2 matrix with a 1-by-2 matrix
-  CUTLASS_HOST_DEVICE
-  static Matrix vcat(Matrix<Element, 1, 2> const & upper, Matrix<Element, 1, 2> const & lower) {
-    return Matrix(
-      upper.at(0, 0), upper.at(0, 1)
-      , lower.at(0, 0), lower.at(0, 1));
-  }
-  
-  /// Concatenates this matrix with a a 1-by-2 matrix to form a 3-by-2 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 2> vcat(Matrix<Element, 1, 2> const & rhs) const {
-    return Matrix<Element, 3, 2>::vcat(*this, rhs);
-  }
-    
-  /// Concatenates this matrix with a a 2-by-2 matrix to form a 4-by-2 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 2> vcat(Matrix<Element, 2, 2> const & rhs) const {
-    return Matrix<Element, 4, 2>::vcat(*this, rhs);
-  }
-    
-  /// Forms a 2-by-2 matrix by concatenating four components
-  CUTLASS_HOST_DEVICE
-  static Matrix block(
-    Element                         A, Element                         B,
-    Element                         C, Element                         D) {
-    return Matrix(
-      A, B
-      , C, D
-    );
-  }
-  
-  /// Elementwise add operator (2-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix add(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] + rhs.data[0];
-    result.data[1] = data[1] + rhs.data[1];
-
-    result.data[2] = data[2] + rhs.data[2];
-    result.data[3] = data[3] + rhs.data[3];
-
-    return result;
-  }
-      
-  /// Elementwise add operator (2-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix operator +(Matrix const &rhs) const {
-    return add(rhs);
-  }
-
-  /// Elementwise add operator (2-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator +=(Matrix const &rhs) {
-    
-    data[0] += rhs.data[0];
-    data[1] += rhs.data[1];
-
-    data[2] += rhs.data[2];
-    data[3] += rhs.data[3];
-
-    return *this;
-  }
-        
-  /// Elementwise subtract operator (2-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix subtract(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] - rhs.data[0];
-    result.data[1] = data[1] - rhs.data[1];
-
-    result.data[2] = data[2] - rhs.data[2];
-    result.data[3] = data[3] - rhs.data[3];
-
-    return result;
-  }
-      
-  /// Elementwise subtract operator (2-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix operator -(Matrix const &rhs) const {
-    return subtract(rhs);
-  }
-
-  /// Elementwise subtract operator (2-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator -=(Matrix const &rhs) {
-    
-    data[0] -= rhs.data[0];
-    data[1] -= rhs.data[1];
-
-    data[2] -= rhs.data[2];
-    data[3] -= rhs.data[3];
-
-    return *this;
-  }
-        
-  /// Elementwise multiply operator (2-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix multiply(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] * rhs.data[0];
-    result.data[1] = data[1] * rhs.data[1];
-
-    result.data[2] = data[2] * rhs.data[2];
-    result.data[3] = data[3] * rhs.data[3];
-
-    return result;
-  }
-      
-  /// Scalar multiply operator (2-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix multiply(Element const &s) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] * s;
-    result.data[1] = data[1] * s;
-
-    result.data[2] = data[2] * s;
-    result.data[3] = data[3] * s;
-
-    return result;
-  }
-
-  /// Scalar multiply operator (2-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix operator *(Element const &s) const {
-    return multiply(s);
-  }
-
-  /// Scalar multiply operator (2-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator *=(Element const &s) {
-    
-    data[0] *= s;
-    data[1] *= s;
-
-    data[2] *= s;
-    data[3] *= s;
-
-    return *this;
-  }
-        
-  /// Elementwise divide operator (2-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix divide(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] / rhs.data[0];
-    result.data[1] = data[1] / rhs.data[1];
-
-    result.data[2] = data[2] / rhs.data[2];
-    result.data[3] = data[3] / rhs.data[3];
-
-    return result;
-  }
-      
-  /// Scalar divide operator (2-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix divide(Element const &s) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] / s;
-    result.data[1] = data[1] / s;
-
-    result.data[2] = data[2] / s;
-    result.data[3] = data[3] / s;
-
-    return result;
-  }
-
-  /// Scalar divide operator (2-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix operator /(Element const &s) const {
-    return divide(s);
-  }
-
-  /// Scalar divide operator (2-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator /=(Element const &s) {
-    
-    data[0] /= s;
-    data[1] /= s;
-
-    data[2] /= s;
-    data[3] /= s;
-
-    return *this;
-  }
-        
-  /// Elementwise divide operator (2-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix operator /(Matrix const &rhs) const {
-    return divide(rhs);
-  }
-
-  /// Elementwise divide operator (2-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator /=(Matrix const &rhs) {
-    
-    data[0] /= rhs.data[0];
-    data[1] /= rhs.data[1];
-
-    data[2] /= rhs.data[2];
-    data[3] /= rhs.data[3];
-
-    return *this;
-  }
-        
-  /// Negates each element of the matrix
-  CUTLASS_HOST_DEVICE
-  Matrix operator-() const {
-    Matrix m;
-    
-    m.data[0] = -m.data[0];
-    m.data[1] = -m.data[1];
-    m.data[2] = -m.data[2];
-    m.data[3] = -m.data[3];
-
-    return m;
-  }
-  
-  /// Matrix product of size 2-by-1-by-2
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 1> product(
-    Matrix<Element, 2, 1> const &rhs,
-    Matrix<Element, 2, 1> accum = Matrix<Element, 2, 1>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[2] * rhs.data[0];
-
-    // k=1
-    accum.data[0] += data[1] * rhs.data[1];
-    accum.data[1] += data[3] * rhs.data[1];
-
-    return accum;
-  }
-
-  /// Matrix product of size 2-by-1-by-2
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 1> operator*(Matrix<Element, 2, 1> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 2-by-2-by-2
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 2> product(
-    Matrix<Element, 2, 2> const &rhs,
-    Matrix<Element, 2, 2> accum = Matrix<Element, 2, 2>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[0] * rhs.data[1];
-    accum.data[2] += data[2] * rhs.data[0];
-    accum.data[3] += data[2] * rhs.data[1];
-
-    // k=1
-    accum.data[0] += data[1] * rhs.data[2];
-    accum.data[1] += data[1] * rhs.data[3];
-    accum.data[2] += data[3] * rhs.data[2];
-    accum.data[3] += data[3] * rhs.data[3];
-
-    return accum;
-  }
-
-  /// Matrix product of size 2-by-2-by-2
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 2> operator*(Matrix<Element, 2, 2> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 2-by-2-by-2
-  CUTLASS_HOST_DEVICE
-  Matrix & operator*=(Matrix<Element, 2, 2> const &rhs) {
-    *this = product(rhs);
-    return *this;
-  }
-    
-  /// Matrix product of size 2-by-3-by-2
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 3> product(
-    Matrix<Element, 2, 3> const &rhs,
-    Matrix<Element, 2, 3> accum = Matrix<Element, 2, 3>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[0] * rhs.data[1];
-    accum.data[2] += data[0] * rhs.data[2];
-    accum.data[3] += data[2] * rhs.data[0];
-    accum.data[4] += data[2] * rhs.data[1];
-    accum.data[5] += data[2] * rhs.data[2];
-
-    // k=1
-    accum.data[0] += data[1] * rhs.data[3];
-    accum.data[1] += data[1] * rhs.data[4];
-    accum.data[2] += data[1] * rhs.data[5];
-    accum.data[3] += data[3] * rhs.data[3];
-    accum.data[4] += data[3] * rhs.data[4];
-    accum.data[5] += data[3] * rhs.data[5];
-
-    return accum;
-  }
-
-  /// Matrix product of size 2-by-3-by-2
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 3> operator*(Matrix<Element, 2, 3> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 2-by-4-by-2
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 4> product(
-    Matrix<Element, 2, 4> const &rhs,
-    Matrix<Element, 2, 4> accum = Matrix<Element, 2, 4>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[0] * rhs.data[1];
-    accum.data[2] += data[0] * rhs.data[2];
-    accum.data[3] += data[0] * rhs.data[3];
-    accum.data[4] += data[2] * rhs.data[0];
-    accum.data[5] += data[2] * rhs.data[1];
-    accum.data[6] += data[2] * rhs.data[2];
-    accum.data[7] += data[2] * rhs.data[3];
-
-    // k=1
-    accum.data[0] += data[1] * rhs.data[4];
-    accum.data[1] += data[1] * rhs.data[5];
-    accum.data[2] += data[1] * rhs.data[6];
-    accum.data[3] += data[1] * rhs.data[7];
-    accum.data[4] += data[3] * rhs.data[4];
-    accum.data[5] += data[3] * rhs.data[5];
-    accum.data[6] += data[3] * rhs.data[6];
-    accum.data[7] += data[3] * rhs.data[7];
-
-    return accum;
-  }
-
-  /// Matrix product of size 2-by-4-by-2
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 4> operator*(Matrix<Element, 2, 4> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Returns the sum of elements
-  CUTLASS_HOST_DEVICE
-  Element sum(Element accum = Element()) const {
-    
-    accum += data[0];
-    accum += data[1];
-    accum += data[2];
-    accum += data[3];
-
-    return accum;
-  }  
-
-  /// Returns the sum of squared elements
-  CUTLASS_HOST_DEVICE
-  Element norm(Element accum = Element()) const {
-    
-    accum += data[0] * data[0];
-    accum += data[1] * data[1];
-    accum += data[2] * data[2];
-    accum += data[3] * data[3];
-
-    return accum;
-  }
-
-  /// Returns square root of the norm
-  CUTLASS_HOST_DEVICE
-  Element magnitude() const {
-    return fast_sqrt(norm());
-  }
-
-  /// Returns the sum of diagonal elements
-  CUTLASS_HOST_DEVICE
-  Element trace(Element accum = Element()) const {
-    
-    accum += data[0];
-    accum += data[3];
-
-    return accum;
-  }
-    
-  /// Returns 2-by-2 rotation matrix
-  CUTLASS_HOST_DEVICE
-  static Matrix rotation(Element theta) {
-    Element c = fast_cos(theta);
-    Element s = fast_sin(theta);
-
-    return Matrix(
-      c, -s,
-      s,  c
-    );
-  }
-    
-  /// Computes the determinant of a 2-by-2 matrix
-  CUTLASS_HOST_DEVICE
-  Element determinant(Element accum = Element()) const {
-        accum += data[0] * data[3] - data[1] * data[2];
-
-    return accum;
-  }
-  
-  /// Computes the inverse of a 2-by-2 matrix given
-  /// the matrix's determinant
-  CUTLASS_HOST_DEVICE
-  Matrix inverse(Element det) const {
-    return Matrix(
-      data[3], -data[1],
-      -data[2], data[0]
-    ) * (Element(1) / det); 
-  }
-
-  /// Computes the inverse of a 2-by-2 matrix.
-  CUTLASS_HOST_DEVICE
-  Matrix inverse() const {
-    return inverse(determinant());
-  }
-    
-};
-
-/// Template alias for 2-by-2 matrix
-template <typename Element>
-using Matrix2x2 = Matrix<Element, 2, 2>;
-
-
-/// Free funciton to infer element type from template arguments
-template <typename Element>
-CUTLASS_HOST_DEVICE Matrix2x2<Element> make_Matrix2x2(
-    Element _0_0, Element _0_1, 
-    Element _1_0, Element _1_1
-) {
-  return Matrix2x2<Element>(
-  _0_0, _0_1, 
-  _1_0, _1_1 
-  );
-}
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// 2-by-3 matrix template class definition
-template <typename Element_>
-struct Matrix<Element_, 2, 3> {
-
-  //
-  // Type definitions
-  //
-
-  /// Element data type
-  using Element = Element_;
-
-  /// Number of rows in matrix
-  static int const kRows = 2;
-
-  /// Number of columns in matrix
-  static int const kColumns = 3;
-
-  /// Layout of matrix in underlying array
-  using Layout = layout::RowMajor;
-
-  /// Number of elements in matrix
-  static int const kCount = 6;
-
-  //
-  // Data members
-  //
-
-  /// Elements of the matrix in row-major layout
-  Array<Element, kCount> data;
-
-  //
-  // Methods
-  //
-
-  /// Constructs a zero matrix
-  CUTLASS_HOST_DEVICE
-  Matrix() {
-    data.clear();
-  }
-  
-  /// Copy constructor for a 2-by-3 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix(Matrix const &rhs) {
-    data = rhs.data;
-  }
-    
-  /// Constucts a 2-by-3 matrix from scalar elements
-  CUTLASS_HOST_DEVICE
-  Matrix(
-    Element _0_0, Element _0_1, Element _0_2, 
-    Element _1_0, Element _1_1, Element _1_2
-  ) {
-
-    data[0] = _0_0;  data[1] = _0_1;  data[2] = _0_2;
-    data[3] = _1_0;  data[4] = _1_1;  data[5] = _1_2;
-  }
-    
-  /// Constucts a 2-by-3 matrix from row vectors
-  CUTLASS_HOST_DEVICE
-  Matrix(
-    Matrix<Element, 1, 3> const &row_0,
-    Matrix<Element, 1, 3> const &row_1
-  ) { 
-    data[0] = row_0.data[0];
-    data[1] = row_0.data[1];
-    data[2] = row_0.data[2];
-    data[3] = row_1.data[0];
-    data[4] = row_1.data[1];
-    data[5] = row_1.data[2];
-  }
-    
-  /// Static method to construct a 2-by-3 matrix from column vectors
-  CUTLASS_HOST_DEVICE
-  static Matrix from_columns(
-    Matrix<Element, 3, 1> const &column_0,
-    Matrix<Element, 3, 1> const &column_1,
-    Matrix<Element, 3, 1> const &column_2
-  ) { 
-    Matrix result;
-    
-    result.data[0] = column_0.data[0];
-    result.data[1] = column_1.data[0];
-    result.data[2] = column_2.data[0];
-    result.data[3] = column_0.data[1];
-    result.data[4] = column_1.data[1];
-    result.data[5] = column_2.data[1];
-    return result;
-  }
-    
-  /// Constructs a matrix from a uniform element
-  CUTLASS_HOST_DEVICE
-  static Matrix uniform(Element s) {
-    Matrix m;
-    
-    m.data[0] = s;
-    m.data[1] = s;
-    m.data[2] = s;
-    m.data[3] = s;
-    m.data[4] = s;
-    m.data[5] = s;
-
-    return m;
-  }
-
-  /// Constructs a matrix from a uniform element 1
-  CUTLASS_HOST_DEVICE
-  static Matrix ones() {
-    return uniform(Element(1));
-  }
-
-  /// Constructs a matrix from a uniform element 0
-  CUTLASS_HOST_DEVICE
-  static Matrix zero() {
-    return Matrix();
-  }
-  
-  /// Constructs a matrix from elements along its diagonal
-  CUTLASS_HOST_DEVICE
-  static Matrix from_diagonal(Matrix<Element, 2, 1> const &diag) {
-    Matrix m;
-    
-    m.data[0] = diag.data[0];
-    m.data[3] = diag.data[1];
-
-    return m;
-  }
-
-  /// Constructs a matrix from elements along its diagonal
-  CUTLASS_HOST_DEVICE
-  static Matrix from_diagonal(Matrix<Element, 1, 2> const &diag) {
-    Matrix m;
-    
-    m.data[0] = diag.data[0];
-    m.data[3] = diag.data[1];
-
-    return m;
-  }
-
-  /// Gets an array of diagonal elements
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 1> diagonal() const {
-    Matrix<Element, 2, 1> diag;
-    
-    diag.data[0] = data[0];
-    diag.data[1] = data[3];
-
-    return diag;
-  }
-    
-  /// Returns a transposed matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 2> transpose() const {
-    Matrix<Element, 3, 2> mt;
-    
-    mt.data[0] = data[0];
-    mt.data[2] = data[1];
-    mt.data[4] = data[2];
-    mt.data[1] = data[3];
-    mt.data[3] = data[4];
-    mt.data[5] = data[5];
-
-    return mt;
-  }
-    
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element at(int i, int j) const {
-    return data[i * 2 + j];
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element & at(int i, int j) {
-    return data[i * 2 + j];
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element at(Coord<2> const &coord) const {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element & at(Coord<2> const &coord) {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element &at(int offset) {
-    return data[offset];
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element at(int offset) const {
-    return data[offset];
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element operator[](Coord<2> const &coord) const {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element & operator[](Coord<2> const &coord) {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element & operator[](int offset) {
-    return data[offset];
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element operator[](int offset) const {
-    return data[offset];
-  }
-  
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 2> slice_1x2(int i = 0, int j = 0) const {
-    Matrix<Element, 1, 2> m;
-    
-    m.data[0] = data[i * 3 + j + 0];
-    m.data[1] = data[i * 3 + j + 1];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_1x2(Matrix<Element, 1, 2> const &m, int i = 0, int j = 0) {
-    
-    data[i * 3 + j + 0] = m.data[0];
-    data[i * 3 + j + 1] = m.data[1];
-
-    return *this;
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 3> slice_1x3(int i = 0, int j = 0) const {
-    Matrix<Element, 1, 3> m;
-    
-    m.data[0] = data[i * 3 + j + 0];
-    m.data[1] = data[i * 3 + j + 1];
-    m.data[2] = data[i * 3 + j + 2];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_1x3(Matrix<Element, 1, 3> const &m, int i = 0, int j = 0) {
-    
-    data[i * 3 + j + 0] = m.data[0];
-    data[i * 3 + j + 1] = m.data[1];
-    data[i * 3 + j + 2] = m.data[2];
-
-    return *this;
-  }
-    
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 3> row(int i) const {
-    return slice_1x3(i, 0);
-  }
-
-  CUTLASS_HOST_DEVICE
-  Matrix &set_row(Matrix<Element, 1, 3> const &v, int i = 0) {
-    return set_slice_1x3(v, i, 0);
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 1> slice_2x1(int i = 0, int j = 0) const {
-    Matrix<Element, 2, 1> m;
-    
-    m.data[0] = data[i * 3 + j + 0];
-    m.data[1] = data[i * 3 + j + 3];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_2x1(Matrix<Element, 2, 1> const &m, int i = 0, int j = 0) {
-    
-    data[i * 3 + j + 0] = m.data[0];
-    data[i * 3 + j + 3] = m.data[1];
-
-    return *this;
-  }
-    
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 1> column(int j) const {
-    return slice_2x1(0, j);
-  }
-
-  CUTLASS_HOST_DEVICE
-  Matrix &set_column(Matrix<Element, 2, 1> const &v, int j =0) {
-    return set_slice_2x1(v, 0, j);
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 2> slice_2x2(int i = 0, int j = 0) const {
-    Matrix<Element, 2, 2> m;
-    
-    m.data[0] = data[i * 3 + j + 0];
-    m.data[1] = data[i * 3 + j + 1];
-    m.data[2] = data[i * 3 + j + 3];
-    m.data[3] = data[i * 3 + j + 4];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_2x2(Matrix<Element, 2, 2> const &m, int i = 0, int j = 0) {
-    
-    data[i * 3 + j + 0] = m.data[0];
-    data[i * 3 + j + 1] = m.data[1];
-    data[i * 3 + j + 3] = m.data[2];
-    data[i * 3 + j + 4] = m.data[3];
-
-    return *this;
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 3> slice_2x3(int i = 0, int j = 0) const {
-    Matrix<Element, 2, 3> m;
-    
-    m.data[0] = data[i * 3 + j + 0];
-    m.data[1] = data[i * 3 + j + 1];
-    m.data[2] = data[i * 3 + j + 2];
-    m.data[3] = data[i * 3 + j + 3];
-    m.data[4] = data[i * 3 + j + 4];
-    m.data[5] = data[i * 3 + j + 5];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_2x3(Matrix<Element, 2, 3> const &m, int i = 0, int j = 0) {
-    
-    data[i * 3 + j + 0] = m.data[0];
-    data[i * 3 + j + 1] = m.data[1];
-    data[i * 3 + j + 2] = m.data[2];
-    data[i * 3 + j + 3] = m.data[3];
-    data[i * 3 + j + 4] = m.data[4];
-    data[i * 3 + j + 5] = m.data[5];
-
-    return *this;
-  }
-    
-  /// Forms a 2-by-3 matrix by horizontally concatenating a 2-by-1 matrix with a 2-by-2 matrix
-  CUTLASS_HOST_DEVICE
-  static Matrix hcat(Matrix<Element, 2, 1> const & lhs, Matrix<Element, 2, 2> const & rhs) {
-    return Matrix(
-      lhs.at(0, 0), rhs.at(0, 0), rhs.at(0, 1)
-      , lhs.at(1, 0), rhs.at(1, 0), rhs.at(1, 1));
-  }
-  
-  /// Forms a 2-by-3 matrix by horizontally concatenating a 2-by-2 matrix with a 2-by-1 matrix
-  CUTLASS_HOST_DEVICE
-  static Matrix hcat(Matrix<Element, 2, 2> const & lhs, Matrix<Element, 2, 1> const & rhs) {
-    return Matrix(
-      lhs.at(0, 0), lhs.at(0, 1), rhs.at(0, 0)
-      , lhs.at(1, 0), lhs.at(1, 1), rhs.at(1, 0));
-  }
-  
-  /// Concatenates this matrix with a a 2-by-1 matrix to form a 2-by-4 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 4> hcat(Matrix<Element, 2, 1> const & rhs) const {
-    return Matrix<Element, 2, 4>::hcat(*this, rhs);
-  }
-    
-  /// Forms a 2-by-3 matrix by vertically concatenating a 1-by-3 matrix with a 1-by-3 matrix
-  CUTLASS_HOST_DEVICE
-  static Matrix vcat(Matrix<Element, 1, 3> const & upper, Matrix<Element, 1, 3> const & lower) {
-    return Matrix(
-      upper.at(0, 0), upper.at(0, 1), upper.at(0, 2)
-      , lower.at(0, 0), lower.at(0, 1), lower.at(0, 2));
-  }
-  
-  /// Concatenates this matrix with a a 1-by-3 matrix to form a 3-by-3 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 3> vcat(Matrix<Element, 1, 3> const & rhs) const {
-    return Matrix<Element, 3, 3>::vcat(*this, rhs);
-  }
-    
-  /// Concatenates this matrix with a a 2-by-3 matrix to form a 4-by-3 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 3> vcat(Matrix<Element, 2, 3> const & rhs) const {
-    return Matrix<Element, 4, 3>::vcat(*this, rhs);
-  }
-    
-  /// Forms a 2-by-3 matrix by concatenating four components
-  CUTLASS_HOST_DEVICE
-  static Matrix block(
-    Element                         A, Matrix<Element, 1, 2> const & B,
-    Element                         C, Matrix<Element, 1, 2> const & D) {
-    return Matrix(
-      A, B.at(0, 0), B.at(0, 1)
-      , C, D.at(0, 0), D.at(0, 1)
-    );
-  }
-  
-  /// Forms a 2-by-3 matrix by concatenating four components
-  CUTLASS_HOST_DEVICE
-  static Matrix block(
-    Matrix<Element, 1, 2> const & A, Element                         B,
-    Matrix<Element, 1, 2> const & C, Element                         D) {
-    return Matrix(
-      A.at(0, 0), A.at(0, 1), B
-      , C.at(0, 0), C.at(0, 1), D
-    );
-  }
-  
-  /// Returns a new 2-by-3 matrix holding the elementwise sum of this matrix and `rhs`.
-  CUTLASS_HOST_DEVICE
-  Matrix add(Matrix const &rhs) const {
-    Matrix total;
-
-    for (int idx = 0; idx < 6; ++idx) {
-      total.data[idx] = data[idx] + rhs.data[idx];
-    }
-
-    return total;
-  }
-
-  /// Operator form of add(): elementwise sum (2-by-3).
-  CUTLASS_HOST_DEVICE
-  Matrix operator +(Matrix const &rhs) const {
-    return add(rhs);
-  }
-
-  /// Adds `rhs` to this matrix element by element, in place, and returns *this (2-by-3).
-  CUTLASS_HOST_DEVICE
-  Matrix & operator +=(Matrix const &rhs) {
-    for (int idx = 0; idx < 6; ++idx) {
-      data[idx] += rhs.data[idx];
-    }
-
-    return *this;
-  }
-        
-  /// Returns a new 2-by-3 matrix holding the elementwise difference `*this - rhs`.
-  CUTLASS_HOST_DEVICE
-  Matrix subtract(Matrix const &rhs) const {
-    Matrix difference;
-
-    for (int idx = 0; idx < 6; ++idx) {
-      difference.data[idx] = data[idx] - rhs.data[idx];
-    }
-
-    return difference;
-  }
-
-  /// Operator form of subtract(): elementwise difference (2-by-3).
-  CUTLASS_HOST_DEVICE
-  Matrix operator -(Matrix const &rhs) const {
-    return subtract(rhs);
-  }
-
-  /// Subtracts `rhs` from this matrix element by element, in place, and returns *this (2-by-3).
-  CUTLASS_HOST_DEVICE
-  Matrix & operator -=(Matrix const &rhs) {
-    for (int idx = 0; idx < 6; ++idx) {
-      data[idx] -= rhs.data[idx];
-    }
-
-    return *this;
-  }
-        
-  /// Returns the elementwise (Hadamard) product of this matrix and `rhs` (2-by-3).
-  /// This is not a matrix product; see the product() overloads for that.
-  CUTLASS_HOST_DEVICE
-  Matrix multiply(Matrix const &rhs) const {
-    Matrix scaled;
-
-    for (int idx = 0; idx < 6; ++idx) {
-      scaled.data[idx] = data[idx] * rhs.data[idx];
-    }
-
-    return scaled;
-  }
-
-  /// Returns a new 2-by-3 matrix with every element multiplied by the scalar `s`.
-  CUTLASS_HOST_DEVICE
-  Matrix multiply(Element const &s) const {
-    Matrix scaled;
-
-    for (int idx = 0; idx < 6; ++idx) {
-      scaled.data[idx] = data[idx] * s;
-    }
-
-    return scaled;
-  }
-
-  /// Operator form of the scalar multiply() (2-by-3).
-  CUTLASS_HOST_DEVICE
-  Matrix operator *(Element const &s) const {
-    return multiply(s);
-  }
-
-  /// Multiplies every element by the scalar `s` in place and returns *this (2-by-3).
-  CUTLASS_HOST_DEVICE
-  Matrix & operator *=(Element const &s) {
-    for (int idx = 0; idx < 6; ++idx) {
-      data[idx] *= s;
-    }
-
-    return *this;
-  }
-        
-  /// Returns the elementwise quotient `*this / rhs` (2-by-3). No check is made for
-  /// zero elements in `rhs`.
-  CUTLASS_HOST_DEVICE
-  Matrix divide(Matrix const &rhs) const {
-    Matrix quotient;
-
-    for (int idx = 0; idx < 6; ++idx) {
-      quotient.data[idx] = data[idx] / rhs.data[idx];
-    }
-
-    return quotient;
-  }
-
-  /// Returns a new 2-by-3 matrix with every element divided by the scalar `s`.
-  CUTLASS_HOST_DEVICE
-  Matrix divide(Element const &s) const {
-    Matrix quotient;
-
-    for (int idx = 0; idx < 6; ++idx) {
-      quotient.data[idx] = data[idx] / s;
-    }
-
-    return quotient;
-  }
-
-  /// Operator form of the scalar divide() (2-by-3).
-  CUTLASS_HOST_DEVICE
-  Matrix operator /(Element const &s) const {
-    return divide(s);
-  }
-
-  /// Divides every element by the scalar `s` in place and returns *this (2-by-3).
-  CUTLASS_HOST_DEVICE
-  Matrix & operator /=(Element const &s) {
-    for (int idx = 0; idx < 6; ++idx) {
-      data[idx] /= s;
-    }
-
-    return *this;
-  }
-
-  /// Operator form of the elementwise divide() (2-by-3).
-  CUTLASS_HOST_DEVICE
-  Matrix operator /(Matrix const &rhs) const {
-    return divide(rhs);
-  }
-
-  /// Divides this matrix by `rhs` element by element, in place, and returns *this (2-by-3).
-  CUTLASS_HOST_DEVICE
-  Matrix & operator /=(Matrix const &rhs) {
-    for (int idx = 0; idx < 6; ++idx) {
-      data[idx] /= rhs.data[idx];
-    }
-
-    return *this;
-  }
-        
-  /// Returns a new 2-by-3 matrix whose elements are the negated elements of this matrix.
-  /// This matrix is left unmodified.
-  CUTLASS_HOST_DEVICE
-  Matrix operator-() const {
-    Matrix m;
-
-    // Negate the elements of *this. Reading back from the newly constructed `m`
-    // (as the previous code did) ignores this matrix's values entirely.
-    m.data[0] = -data[0];
-    m.data[1] = -data[1];
-    m.data[2] = -data[2];
-    m.data[3] = -data[3];
-    m.data[4] = -data[4];
-    m.data[5] = -data[5];
-
-    return m;
-  }
-  
-  /// Matrix product of this 2-by-3 matrix with the 3-by-1 column vector `rhs`.
-  /// The products are accumulated onto `accum` (passed by value), and the updated
-  /// accumulator is returned.
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 1> product(
-    Matrix<Element, 3, 1> const &rhs,
-    Matrix<Element, 2, 1> accum = Matrix<Element, 2, 1>()
-  ) const {
-
-    // k is the reduction index; it stays outermost so each output element is
-    // accumulated in ascending-k order.
-    for (int k = 0; k < 3; ++k) {
-      for (int row = 0; row < 2; ++row) {
-        accum.data[row] += data[row * 3 + k] * rhs.data[k];
-      }
-    }
-
-    return accum;
-  }
-
-  /// Operator form of product() with a 3-by-1 right-hand side, using the default accumulator.
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 1> operator*(Matrix<Element, 3, 1> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of this 2-by-3 matrix with the 3-by-2 matrix `rhs`.
-  /// The products are accumulated onto `accum` (passed by value), and the updated
-  /// accumulator is returned.
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 2> product(
-    Matrix<Element, 3, 2> const &rhs,
-    Matrix<Element, 2, 2> accum = Matrix<Element, 2, 2>()
-  ) const {
-
-    // k is the reduction index; it stays outermost so each output element is
-    // accumulated in ascending-k order.
-    for (int k = 0; k < 3; ++k) {
-      for (int row = 0; row < 2; ++row) {
-        for (int col = 0; col < 2; ++col) {
-          accum.data[row * 2 + col] += data[row * 3 + k] * rhs.data[k * 2 + col];
-        }
-      }
-    }
-
-    return accum;
-  }
-
-  /// Operator form of product() with a 3-by-2 right-hand side, using the default accumulator.
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 2> operator*(Matrix<Element, 3, 2> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of this 2-by-3 matrix with the 3-by-3 matrix `rhs`.
-  /// The products are accumulated onto `accum` (passed by value), and the updated
-  /// accumulator is returned.
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 3> product(
-    Matrix<Element, 3, 3> const &rhs,
-    Matrix<Element, 2, 3> accum = Matrix<Element, 2, 3>()
-  ) const {
-
-    // k is the reduction index; it stays outermost so each output element is
-    // accumulated in ascending-k order.
-    for (int k = 0; k < 3; ++k) {
-      for (int row = 0; row < 2; ++row) {
-        for (int col = 0; col < 3; ++col) {
-          accum.data[row * 3 + col] += data[row * 3 + k] * rhs.data[k * 3 + col];
-        }
-      }
-    }
-
-    return accum;
-  }
-
-  /// Operator form of product() with a 3-by-3 right-hand side, using the default accumulator.
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 3> operator*(Matrix<Element, 3, 3> const &rhs) const {
-    return product(rhs);
-  }
-
-  /// Replaces this matrix with its product with the 3-by-3 matrix `rhs` and returns *this.
-  CUTLASS_HOST_DEVICE
-  Matrix & operator*=(Matrix<Element, 3, 3> const &rhs) {
-    // product() reads *this and returns a separate matrix, so assigning the result
-    // back cannot clobber inputs mid-computation.
-    *this = product(rhs);
-    return *this;
-  }
-    
-  /// Matrix product of this 2-by-3 matrix with the 3-by-4 matrix `rhs`.
-  /// The products are accumulated onto `accum` (passed by value), and the updated
-  /// accumulator is returned.
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 4> product(
-    Matrix<Element, 3, 4> const &rhs,
-    Matrix<Element, 2, 4> accum = Matrix<Element, 2, 4>()
-  ) const {
-
-    // k is the reduction index; it stays outermost so each output element is
-    // accumulated in ascending-k order.
-    for (int k = 0; k < 3; ++k) {
-      for (int row = 0; row < 2; ++row) {
-        for (int col = 0; col < 4; ++col) {
-          accum.data[row * 4 + col] += data[row * 3 + k] * rhs.data[k * 4 + col];
-        }
-      }
-    }
-
-    return accum;
-  }
-
-  /// Operator form of product() with a 3-by-4 right-hand side, using the default accumulator.
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 4> operator*(Matrix<Element, 3, 4> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Adds all six elements onto `accum` (default-constructed if omitted) and returns the total.
-  CUTLASS_HOST_DEVICE
-  Element sum(Element accum = Element()) const {
-
-    for (int idx = 0; idx < 6; ++idx) {
-      accum += data[idx];
-    }
-
-    return accum;
-  }
-
-  /// Adds the square of every element onto `accum` and returns the total.
-  /// Note that no square root is taken here; see magnitude().
-  CUTLASS_HOST_DEVICE
-  Element norm(Element accum = Element()) const {
-
-    for (int idx = 0; idx < 6; ++idx) {
-      accum += data[idx] * data[idx];
-    }
-
-    return accum;
-  }
-
-  /// Returns fast_sqrt() applied to norm(), i.e. to the sum of squared elements.
-  CUTLASS_HOST_DEVICE
-  Element magnitude() const {
-    Element const squared_sum = norm();
-    return fast_sqrt(squared_sum);
-  }
-
-  /// Adds the diagonal elements (0,0) and (1,1) onto `accum` and returns the total.
-  CUTLASS_HOST_DEVICE
-  Element trace(Element accum = Element()) const {
-
-    // In the 3-column row-major layout, element (d, d) lives at offset d * 3 + d.
-    for (int d = 0; d < 2; ++d) {
-      accum += data[d * 3 + d];
-    }
-
-    return accum;
-  }
-    
-};
-
-/// Template alias: `Matrix2x3<Element>` names the `Matrix<Element, 2, 3>` specialization
-template <typename Element>
-using Matrix2x3 = Matrix<Element, 2, 3>;
-
-
-/// Free function that builds a Matrix2x3 from six scalars, letting the element type be
-/// deduced from the arguments. Argument `_r_c` is passed as element (r, c).
-template <typename Element>
-CUTLASS_HOST_DEVICE Matrix2x3<Element> make_Matrix2x3(
-    Element _0_0, Element _0_1, Element _0_2, 
-    Element _1_0, Element _1_1, Element _1_2
-) {
-  Matrix2x3<Element> built(
-    _0_0, _0_1, _0_2,
-    _1_0, _1_1, _1_2);
-
-  return built;
-}
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// 2-by-4 matrix template class definition
-template <typename Element_>
-struct Matrix<Element_, 2, 4> {
-
-  //
-  // Type definitions
-  //
-
-  /// Element data type
-  using Element = Element_;
-
-  /// Number of rows in matrix
-  static int const kRows = 2;
-
-  /// Number of columns in matrix
-  static int const kColumns = 4;
-
-  /// Layout of matrix in underlying array
-  using Layout = layout::RowMajor;
-
-  /// Number of elements in matrix
-  static int const kCount = 8;
-
-  //
-  // Data members
-  //
-
-  /// Elements of the matrix in row-major layout
-  Array<Element, kCount> data;
-
-  //
-  // Methods
-  //
-
-  /// Default constructor: clears the underlying element array via `data.clear()`,
-  /// which this file describes as constructing a zero matrix.
-  CUTLASS_HOST_DEVICE
-  Matrix() {
-    data.clear();
-  }
-  
-  /// Copy constructor for a 2-by-4 matrix: copies the whole element array from `rhs`.
-  CUTLASS_HOST_DEVICE
-  Matrix(Matrix const &rhs) {
-    data = rhs.data;
-  }
-    
-  /// Constructs a 2-by-4 matrix from eight scalars given in row-major order;
-  /// argument `_r_c` becomes element (r, c).
-  CUTLASS_HOST_DEVICE
-  Matrix(
-    Element _0_0, Element _0_1, Element _0_2, Element _0_3, 
-    Element _1_0, Element _1_1, Element _1_2, Element _1_3
-  ) {
-
-    // Row 0 occupies offsets 0..3
-    data[0] = _0_0;
-    data[1] = _0_1;
-    data[2] = _0_2;
-    data[3] = _0_3;
-
-    // Row 1 occupies offsets 4..7
-    data[4] = _1_0;
-    data[5] = _1_1;
-    data[6] = _1_2;
-    data[7] = _1_3;
-  }
-    
-  /// Constructs a 2-by-4 matrix from two 1-by-4 row vectors: `row_0` becomes row 0
-  /// and `row_1` becomes row 1.
-  CUTLASS_HOST_DEVICE
-  Matrix(
-    Matrix<Element, 1, 4> const &row_0,
-    Matrix<Element, 1, 4> const &row_1
-  ) { 
-    for (int col = 0; col < 4; ++col) {
-      data[col] = row_0.data[col];
-      data[4 + col] = row_1.data[col];
-    }
-  }
-    
-  /// Builds a 2-by-4 matrix whose column c is taken from `column_c`.
-  /// The parameters are declared as 4-by-1 vectors, but only their first two
-  /// elements (`data[0]` for row 0, `data[1]` for row 1) are read.
-  CUTLASS_HOST_DEVICE
-  static Matrix from_columns(
-    Matrix<Element, 4, 1> const &column_0,
-    Matrix<Element, 4, 1> const &column_1,
-    Matrix<Element, 4, 1> const &column_2,
-    Matrix<Element, 4, 1> const &column_3
-  ) { 
-    Matrix<Element, 4, 1> const *columns[4] = {&column_0, &column_1, &column_2, &column_3};
-
-    Matrix result;
-
-    for (int row = 0; row < 2; ++row) {
-      for (int col = 0; col < 4; ++col) {
-        result.data[row * 4 + col] = columns[col]->data[row];
-      }
-    }
-
-    return result;
-  }
-    
-  /// Returns a 2-by-4 matrix with every element set to `s`.
-  CUTLASS_HOST_DEVICE
-  static Matrix uniform(Element s) {
-    Matrix filled;
-
-    for (int idx = 0; idx < kCount; ++idx) {
-      filled.data[idx] = s;
-    }
-
-    return filled;
-  }
-
-  /// Returns a 2-by-4 matrix with every element set to `Element(1)`.
-  CUTLASS_HOST_DEVICE
-  static Matrix ones() {
-    Element const one = Element(1);
-    return uniform(one);
-  }
-
-  /// Returns a default-constructed (cleared) 2-by-4 matrix.
-  CUTLASS_HOST_DEVICE
-  static Matrix zero() {
-    Matrix cleared;
-    return cleared;
-  }
-  
-  /// Returns a 2-by-4 matrix whose diagonal elements (0,0) and (1,1) are taken from
-  /// the 2-by-1 vector `diag`; all other elements keep their default-constructed value.
-  CUTLASS_HOST_DEVICE
-  static Matrix from_diagonal(Matrix<Element, 2, 1> const &diag) {
-    Matrix m;
-    
-    // Row-major with kColumns == 4: element (1,1) is at offset 1 * 4 + 1 == 5.
-    // Offset 3 (used previously) is element (0,3), which is not on the diagonal.
-    m.data[0] = diag.data[0];
-    m.data[1 * kColumns + 1] = diag.data[1];
-
-    return m;
-  }
-
-  /// Returns a 2-by-4 matrix whose diagonal elements (0,0) and (1,1) are taken from
-  /// the 1-by-2 vector `diag`; all other elements keep their default-constructed value.
-  CUTLASS_HOST_DEVICE
-  static Matrix from_diagonal(Matrix<Element, 1, 2> const &diag) {
-    Matrix m;
-    
-    // Row-major with kColumns == 4: element (1,1) is at offset 1 * 4 + 1 == 5.
-    m.data[0] = diag.data[0];
-    m.data[1 * kColumns + 1] = diag.data[1];
-
-    return m;
-  }
-
-  /// Returns the diagonal elements (0,0) and (1,1) as a 2-by-1 vector.
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 1> diagonal() const {
-    Matrix<Element, 2, 1> diag;
-    
-    // Element (1,1) is at offset 1 * kColumns + 1 == 5 in the row-major array.
-    diag.data[0] = data[0];
-    diag.data[1] = data[1 * kColumns + 1];
-
-    return diag;
-  }
-    
-  /// Returns the 4-by-2 transpose of this matrix: element (row, col) here becomes
-  /// element (col, row) of the result.
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 2> transpose() const {
-    Matrix<Element, 4, 2> mt;
-
-    for (int row = 0; row < 2; ++row) {
-      for (int col = 0; col < 4; ++col) {
-        // Source is row-major with 4 columns; destination is row-major with 2 columns.
-        mt.data[col * 2 + row] = data[row * 4 + col];
-      }
-    }
-
-    return mt;
-  }
-    
-  /// Returns a copy of the element at row `i`, column `j`. No bounds checking is performed.
-  CUTLASS_HOST_DEVICE
-  Element at(int i, int j) const {
-    // The row stride of the row-major array is kColumns (4), matching the slice_*
-    // accessors. The previous stride of 2 made at(1, 0) alias at(0, 2).
-    return data[i * kColumns + j];
-  }
-
-  /// Returns a mutable reference to the element at row `i`, column `j`.
-  /// No bounds checking is performed.
-  CUTLASS_HOST_DEVICE
-  Element & at(int i, int j) {
-    // The row stride of the row-major array is kColumns (4), not 2.
-    return data[i * kColumns + j];
-  }
-
-  /// Returns a copy of the element addressed by `coord`, where coord[0] is the row
-  /// and coord[1] is the column. Forwards to at(int, int).
-  CUTLASS_HOST_DEVICE
-  Element at(Coord<2> const &coord) const {
-    int const row = coord[0];
-    int const col = coord[1];
-    return at(row, col);
-  }
-
-  /// Returns a mutable reference to the element addressed by `coord`
-  /// (coord[0] = row, coord[1] = column). Forwards to at(int, int).
-  CUTLASS_HOST_DEVICE
-  Element & at(Coord<2> const &coord) {
-    int const row = coord[0];
-    int const col = coord[1];
-    return at(row, col);
-  }
-
-  /// Returns a mutable reference to the element at linear `offset` in the row-major array.
-  CUTLASS_HOST_DEVICE
-  Element &at(int offset) {
-    Element &element = data[offset];
-    return element;
-  }
-
-  /// Returns a copy of the element at linear `offset` in the row-major array.
-  CUTLASS_HOST_DEVICE
-  Element at(int offset) const {
-    Element const element = data[offset];
-    return element;
-  }
-
-  /// Subscript by coordinate (coord[0] = row, coord[1] = column); returns a copy.
-  CUTLASS_HOST_DEVICE
-  Element operator[](Coord<2> const &coord) const {
-    int const row = coord[0];
-    int const col = coord[1];
-    return at(row, col);
-  }
-
-  /// Subscript by coordinate (coord[0] = row, coord[1] = column); returns a mutable reference.
-  CUTLASS_HOST_DEVICE
-  Element & operator[](Coord<2> const &coord) {
-    int const row = coord[0];
-    int const col = coord[1];
-    return at(row, col);
-  }
-
-  /// Subscript by linear `offset` in the row-major array; returns a mutable reference.
-  CUTLASS_HOST_DEVICE
-  Element & operator[](int offset) {
-    Element &element = data[offset];
-    return element;
-  }
-
-  /// Subscript by linear `offset` in the row-major array; returns a copy.
-  CUTLASS_HOST_DEVICE
-  Element operator[](int offset) const {
-    Element const element = data[offset];
-    return element;
-  }
-  
-  /// Copies out the 1-by-2 submatrix whose first element is at row `i`, column `j`.
-  /// No bounds checking is performed.
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 2> slice_1x2(int i = 0, int j = 0) const {
-    int const origin = i * 4 + j;
-    Matrix<Element, 1, 2> m;
-
-    for (int col = 0; col < 2; ++col) {
-      m.data[col] = data[origin + col];
-    }
-
-    return m;
-  }
-
-  /// Overwrites the 1-by-2 submatrix starting at row `i`, column `j` with `m`
-  /// and returns *this. No bounds checking is performed.
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_1x2(Matrix<Element, 1, 2> const &m, int i = 0, int j = 0) {
-    int const origin = i * 4 + j;
-
-    for (int col = 0; col < 2; ++col) {
-      data[origin + col] = m.data[col];
-    }
-
-    return *this;
-  }
-    
-  /// Copies out the 1-by-3 submatrix whose first element is at row `i`, column `j`.
-  /// No bounds checking is performed.
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 3> slice_1x3(int i = 0, int j = 0) const {
-    int const origin = i * 4 + j;
-    Matrix<Element, 1, 3> m;
-
-    for (int col = 0; col < 3; ++col) {
-      m.data[col] = data[origin + col];
-    }
-
-    return m;
-  }
-
-  /// Overwrites the 1-by-3 submatrix starting at row `i`, column `j` with `m`
-  /// and returns *this. No bounds checking is performed.
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_1x3(Matrix<Element, 1, 3> const &m, int i = 0, int j = 0) {
-    int const origin = i * 4 + j;
-
-    for (int col = 0; col < 3; ++col) {
-      data[origin + col] = m.data[col];
-    }
-
-    return *this;
-  }
-    
-  /// Copies out the 1-by-4 submatrix whose first element is at row `i`, column `j`.
-  /// No bounds checking is performed.
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 4> slice_1x4(int i = 0, int j = 0) const {
-    int const origin = i * 4 + j;
-    Matrix<Element, 1, 4> m;
-
-    for (int col = 0; col < 4; ++col) {
-      m.data[col] = data[origin + col];
-    }
-
-    return m;
-  }
-
-  /// Overwrites the 1-by-4 submatrix starting at row `i`, column `j` with `m`
-  /// and returns *this. No bounds checking is performed.
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_1x4(Matrix<Element, 1, 4> const &m, int i = 0, int j = 0) {
-    int const origin = i * 4 + j;
-
-    for (int col = 0; col < 4; ++col) {
-      data[origin + col] = m.data[col];
-    }
-
-    return *this;
-  }
-
-  /// Returns row `i` as a 1-by-4 vector (a 1-by-4 slice starting at column 0).
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 4> row(int i) const {
-    int const first_column = 0;
-    return slice_1x4(i, first_column);
-  }
-
-  /// Overwrites row `i` with the 1-by-4 vector `v` and returns *this.
-  CUTLASS_HOST_DEVICE
-  Matrix &set_row(Matrix<Element, 1, 4> const &v, int i = 0) {
-    int const first_column = 0;
-    return set_slice_1x4(v, i, first_column);
-  }
-    
-  /// Copies out the 2-by-1 submatrix whose first element is at row `i`, column `j`.
-  /// No bounds checking is performed.
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 1> slice_2x1(int i = 0, int j = 0) const {
-    int const origin = i * 4 + j;
-    Matrix<Element, 2, 1> m;
-
-    // Consecutive rows are 4 elements apart in the row-major array.
-    for (int row = 0; row < 2; ++row) {
-      m.data[row] = data[origin + row * 4];
-    }
-
-    return m;
-  }
-
-  /// Overwrites the 2-by-1 submatrix starting at row `i`, column `j` with `m`
-  /// and returns *this. No bounds checking is performed.
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_2x1(Matrix<Element, 2, 1> const &m, int i = 0, int j = 0) {
-    int const origin = i * 4 + j;
-
-    for (int row = 0; row < 2; ++row) {
-      data[origin + row * 4] = m.data[row];
-    }
-
-    return *this;
-  }
-
-  /// Returns column `j` as a 2-by-1 vector (a 2-by-1 slice starting at row 0).
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 1> column(int j) const {
-    int const first_row = 0;
-    return slice_2x1(first_row, j);
-  }
-
-  /// Overwrites column `j` with the 2-by-1 vector `v` and returns *this.
-  CUTLASS_HOST_DEVICE
-  Matrix &set_column(Matrix<Element, 2, 1> const &v, int j =0) {
-    int const first_row = 0;
-    return set_slice_2x1(v, first_row, j);
-  }
-    
-  /// Copies out the 2-by-2 submatrix whose top-left element is at row `i`, column `j`.
-  /// No bounds checking is performed.
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 2> slice_2x2(int i = 0, int j = 0) const {
-    int const origin = i * 4 + j;
-    Matrix<Element, 2, 2> m;
-
-    for (int row = 0; row < 2; ++row) {
-      for (int col = 0; col < 2; ++col) {
-        m.data[row * 2 + col] = data[origin + row * 4 + col];
-      }
-    }
-
-    return m;
-  }
-
-  /// Overwrites the 2-by-2 submatrix whose top-left element is at row `i`, column `j`
-  /// with `m` and returns *this. No bounds checking is performed.
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_2x2(Matrix<Element, 2, 2> const &m, int i = 0, int j = 0) {
-    int const origin = i * 4 + j;
-
-    for (int row = 0; row < 2; ++row) {
-      for (int col = 0; col < 2; ++col) {
-        data[origin + row * 4 + col] = m.data[row * 2 + col];
-      }
-    }
-
-    return *this;
-  }
-    
-  /// Copies out the 2-by-3 submatrix whose top-left element is at row `i`, column `j`.
-  /// No bounds checking is performed.
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 3> slice_2x3(int i = 0, int j = 0) const {
-    int const origin = i * 4 + j;
-    Matrix<Element, 2, 3> m;
-
-    for (int row = 0; row < 2; ++row) {
-      for (int col = 0; col < 3; ++col) {
-        m.data[row * 3 + col] = data[origin + row * 4 + col];
-      }
-    }
-
-    return m;
-  }
-
-  /// Overwrites the 2-by-3 submatrix whose top-left element is at row `i`, column `j`
-  /// with `m` and returns *this. No bounds checking is performed.
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_2x3(Matrix<Element, 2, 3> const &m, int i = 0, int j = 0) {
-    int const origin = i * 4 + j;
-
-    for (int row = 0; row < 2; ++row) {
-      for (int col = 0; col < 3; ++col) {
-        data[origin + row * 4 + col] = m.data[row * 3 + col];
-      }
-    }
-
-    return *this;
-  }
-    
-  /// Copies eight consecutive elements starting at offset `i * 4 + j` into a 2-by-4 matrix.
-  /// No bounds checking is performed; since this matrix holds exactly eight elements,
-  /// any non-zero offset indexes past offset 7.
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 4> slice_2x4(int i = 0, int j = 0) const {
-    int const origin = i * 4 + j;
-    Matrix<Element, 2, 4> m;
-
-    for (int idx = 0; idx < 8; ++idx) {
-      m.data[idx] = data[origin + idx];
-    }
-
-    return m;
-  }
-
-  /// Overwrites eight consecutive elements starting at offset `i * 4 + j` with `m`
-  /// and returns *this. No bounds checking is performed; since this matrix holds
-  /// exactly eight elements, any non-zero offset indexes past offset 7.
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_2x4(Matrix<Element, 2, 4> const &m, int i = 0, int j = 0) {
-    int const origin = i * 4 + j;
-
-    for (int idx = 0; idx < 8; ++idx) {
-      data[origin + idx] = m.data[idx];
-    }
-
-    return *this;
-  }
-    
-  /// Joins a 2-by-1 column `lhs` (column 0) and a 2-by-3 matrix `rhs` (columns 1..3)
-  /// side by side into a 2-by-4 matrix.
-  CUTLASS_HOST_DEVICE
-  static Matrix hcat(Matrix<Element, 2, 1> const & lhs, Matrix<Element, 2, 3> const & rhs) {
-    Matrix joined;
-
-    for (int row = 0; row < 2; ++row) {
-      joined.data[row * 4] = lhs.at(row, 0);
-      for (int col = 0; col < 3; ++col) {
-        joined.data[row * 4 + 1 + col] = rhs.at(row, col);
-      }
-    }
-
-    return joined;
-  }
-
-  /// Joins two 2-by-2 matrices side by side into a 2-by-4 matrix: `lhs` fills
-  /// columns 0..1 and `rhs` fills columns 2..3.
-  CUTLASS_HOST_DEVICE
-  static Matrix hcat(Matrix<Element, 2, 2> const & lhs, Matrix<Element, 2, 2> const & rhs) {
-    Matrix joined;
-
-    for (int row = 0; row < 2; ++row) {
-      for (int col = 0; col < 2; ++col) {
-        joined.data[row * 4 + col] = lhs.at(row, col);
-        joined.data[row * 4 + 2 + col] = rhs.at(row, col);
-      }
-    }
-
-    return joined;
-  }
-
-  /// Joins a 2-by-3 matrix `lhs` (columns 0..2) and a 2-by-1 column `rhs` (column 3)
-  /// side by side into a 2-by-4 matrix.
-  CUTLASS_HOST_DEVICE
-  static Matrix hcat(Matrix<Element, 2, 3> const & lhs, Matrix<Element, 2, 1> const & rhs) {
-    Matrix joined;
-
-    for (int row = 0; row < 2; ++row) {
-      for (int col = 0; col < 3; ++col) {
-        joined.data[row * 4 + col] = lhs.at(row, col);
-      }
-      joined.data[row * 4 + 3] = rhs.at(row, 0);
-    }
-
-    return joined;
-  }
-  
-  /// Stacks two 1-by-4 row vectors into a 2-by-4 matrix: `upper` fills row 0 and
-  /// `lower` fills row 1.
-  CUTLASS_HOST_DEVICE
-  static Matrix vcat(Matrix<Element, 1, 4> const & upper, Matrix<Element, 1, 4> const & lower) {
-    Matrix stacked;
-
-    for (int col = 0; col < 4; ++col) {
-      stacked.data[col] = upper.at(0, col);
-      stacked.data[4 + col] = lower.at(0, col);
-    }
-
-    return stacked;
-  }
-
-  /// Appends the 1-by-4 row `rhs` below this matrix, producing a 3-by-4 matrix.
-  /// Delegates to the static `vcat` of the 3-by-4 specialization.
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 4> vcat(Matrix<Element, 1, 4> const & rhs) const {
-    using Tall = Matrix<Element, 3, 4>;
-    return Tall::vcat(*this, rhs);
-  }
-
-  /// Appends the 2-by-4 matrix `rhs` below this matrix, producing a 4-by-4 matrix.
-  /// Delegates to the static `vcat` of the 4-by-4 specialization.
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 4> vcat(Matrix<Element, 2, 4> const & rhs) const {
-    using Tall = Matrix<Element, 4, 4>;
-    return Tall::vcat(*this, rhs);
-  }
-    
-  /// Assembles a 2-by-4 matrix from the blocks [A B; C D], where A and C are scalars
-  /// (column 0) and B and D are 1-by-3 row vectors (columns 1..3).
-  CUTLASS_HOST_DEVICE
-  static Matrix block(
-    Element                         A, Matrix<Element, 1, 3> const & B,
-    Element                         C, Matrix<Element, 1, 3> const & D) {
-    Matrix assembled;
-
-    assembled.data[0] = A;
-    assembled.data[4] = C;
-    for (int col = 0; col < 3; ++col) {
-      assembled.data[1 + col] = B.at(0, col);
-      assembled.data[5 + col] = D.at(0, col);
-    }
-
-    return assembled;
-  }
-
-  /// Assembles a 2-by-4 matrix from the blocks [A B; C D], where all four blocks
-  /// are 1-by-2 row vectors (A, C in columns 0..1; B, D in columns 2..3).
-  CUTLASS_HOST_DEVICE
-  static Matrix block(
-    Matrix<Element, 1, 2> const & A, Matrix<Element, 1, 2> const & B,
-    Matrix<Element, 1, 2> const & C, Matrix<Element, 1, 2> const & D) {
-    Matrix assembled;
-
-    for (int col = 0; col < 2; ++col) {
-      assembled.data[col] = A.at(0, col);
-      assembled.data[2 + col] = B.at(0, col);
-      assembled.data[4 + col] = C.at(0, col);
-      assembled.data[6 + col] = D.at(0, col);
-    }
-
-    return assembled;
-  }
-
-  /// Assembles a 2-by-4 matrix from the blocks [A B; C D], where A and C are 1-by-3
-  /// row vectors (columns 0..2) and B and D are scalars (column 3).
-  CUTLASS_HOST_DEVICE
-  static Matrix block(
-    Matrix<Element, 1, 3> const & A, Element                         B,
-    Matrix<Element, 1, 3> const & C, Element                         D) {
-    Matrix assembled;
-
-    for (int col = 0; col < 3; ++col) {
-      assembled.data[col] = A.at(0, col);
-      assembled.data[4 + col] = C.at(0, col);
-    }
-    assembled.data[3] = B;
-    assembled.data[7] = D;
-
-    return assembled;
-  }
-  
-  /// Returns a new 2-by-4 matrix holding the elementwise sum of this matrix and `rhs`.
-  CUTLASS_HOST_DEVICE
-  Matrix add(Matrix const &rhs) const {
-    Matrix total;
-
-    for (int idx = 0; idx < kCount; ++idx) {
-      total.data[idx] = data[idx] + rhs.data[idx];
-    }
-
-    return total;
-  }
-
-  /// Operator form of add(): elementwise sum (2-by-4).
-  CUTLASS_HOST_DEVICE
-  Matrix operator +(Matrix const &rhs) const {
-    return add(rhs);
-  }
-
-  /// Adds `rhs` to this matrix element by element, in place, and returns *this (2-by-4).
-  CUTLASS_HOST_DEVICE
-  Matrix & operator +=(Matrix const &rhs) {
-    for (int idx = 0; idx < kCount; ++idx) {
-      data[idx] += rhs.data[idx];
-    }
-
-    return *this;
-  }
-        
-  /// Returns a new 2-by-4 matrix holding the elementwise difference `*this - rhs`.
-  CUTLASS_HOST_DEVICE
-  Matrix subtract(Matrix const &rhs) const {
-    Matrix difference;
-
-    for (int idx = 0; idx < kCount; ++idx) {
-      difference.data[idx] = data[idx] - rhs.data[idx];
-    }
-
-    return difference;
-  }
-
-  /// Operator form of subtract(): elementwise difference (2-by-4).
-  CUTLASS_HOST_DEVICE
-  Matrix operator -(Matrix const &rhs) const {
-    return subtract(rhs);
-  }
-
-  /// Subtracts `rhs` from this matrix element by element, in place, and returns *this (2-by-4).
-  CUTLASS_HOST_DEVICE
-  Matrix & operator -=(Matrix const &rhs) {
-    for (int idx = 0; idx < kCount; ++idx) {
-      data[idx] -= rhs.data[idx];
-    }
-
-    return *this;
-  }
-        
-  /// Returns the elementwise (Hadamard) product of this matrix and `rhs` (2-by-4).
-  /// This is not a matrix product; see the product() overloads for that.
-  CUTLASS_HOST_DEVICE
-  Matrix multiply(Matrix const &rhs) const {
-    Matrix scaled;
-
-    for (int idx = 0; idx < kCount; ++idx) {
-      scaled.data[idx] = data[idx] * rhs.data[idx];
-    }
-
-    return scaled;
-  }
-
-  /// Returns a new 2-by-4 matrix with every element multiplied by the scalar `s`.
-  CUTLASS_HOST_DEVICE
-  Matrix multiply(Element const &s) const {
-    Matrix scaled;
-
-    for (int idx = 0; idx < kCount; ++idx) {
-      scaled.data[idx] = data[idx] * s;
-    }
-
-    return scaled;
-  }
-
-  /// Operator form of the scalar multiply() (2-by-4).
-  CUTLASS_HOST_DEVICE
-  Matrix operator *(Element const &s) const {
-    return multiply(s);
-  }
-
-  /// Multiplies every element by the scalar `s` in place and returns *this (2-by-4).
-  CUTLASS_HOST_DEVICE
-  Matrix & operator *=(Element const &s) {
-    for (int idx = 0; idx < kCount; ++idx) {
-      data[idx] *= s;
-    }
-
-    return *this;
-  }
-        
-  /// Returns the elementwise quotient `*this / rhs` (2-by-4). No check is made for
-  /// zero elements in `rhs`.
-  CUTLASS_HOST_DEVICE
-  Matrix divide(Matrix const &rhs) const {
-    Matrix quotient;
-
-    for (int idx = 0; idx < kCount; ++idx) {
-      quotient.data[idx] = data[idx] / rhs.data[idx];
-    }
-
-    return quotient;
-  }
-
-  /// Returns a new 2-by-4 matrix with every element divided by the scalar `s`.
-  CUTLASS_HOST_DEVICE
-  Matrix divide(Element const &s) const {
-    Matrix quotient;
-
-    for (int idx = 0; idx < kCount; ++idx) {
-      quotient.data[idx] = data[idx] / s;
-    }
-
-    return quotient;
-  }
-
-  /// Operator form of the scalar divide() (2-by-4).
-  CUTLASS_HOST_DEVICE
-  Matrix operator /(Element const &s) const {
-    return divide(s);
-  }
-
-  /// Divides every element by the scalar `s` in place and returns *this (2-by-4).
-  CUTLASS_HOST_DEVICE
-  Matrix & operator /=(Element const &s) {
-    for (int idx = 0; idx < kCount; ++idx) {
-      data[idx] /= s;
-    }
-
-    return *this;
-  }
-
-  /// Operator form of the elementwise divide() (2-by-4).
-  CUTLASS_HOST_DEVICE
-  Matrix operator /(Matrix const &rhs) const {
-    return divide(rhs);
-  }
-
-  /// Divides this matrix by `rhs` element by element, in place, and returns *this (2-by-4).
-  CUTLASS_HOST_DEVICE
-  Matrix & operator /=(Matrix const &rhs) {
-    for (int idx = 0; idx < kCount; ++idx) {
-      data[idx] /= rhs.data[idx];
-    }
-
-    return *this;
-  }
-        
-  /// Returns a new 2-by-4 matrix whose elements are the negated elements of this matrix.
-  /// This matrix is left unmodified.
-  CUTLASS_HOST_DEVICE
-  Matrix operator-() const {
-    Matrix m;
-
-    // Negate the elements of *this. The default constructor clears `m`, so negating
-    // `m`'s own elements (as the previous code did) discarded this matrix's values.
-    m.data[0] = -data[0];
-    m.data[1] = -data[1];
-    m.data[2] = -data[2];
-    m.data[3] = -data[3];
-    m.data[4] = -data[4];
-    m.data[5] = -data[5];
-    m.data[6] = -data[6];
-    m.data[7] = -data[7];
-
-    return m;
-  }
-  
-  /// Matrix product of this 2-by-4 matrix with the 4-by-1 column vector `rhs`.
-  /// The products are accumulated onto `accum` (passed by value), and the updated
-  /// accumulator is returned.
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 1> product(
-    Matrix<Element, 4, 1> const &rhs,
-    Matrix<Element, 2, 1> accum = Matrix<Element, 2, 1>()
-  ) const {
-
-    // k is the reduction index; it stays outermost so each output element is
-    // accumulated in ascending-k order.
-    for (int k = 0; k < 4; ++k) {
-      for (int row = 0; row < 2; ++row) {
-        accum.data[row] += data[row * 4 + k] * rhs.data[k];
-      }
-    }
-
-    return accum;
-  }
-
-  /// Operator form of product() with a 4-by-1 right-hand side, using the default accumulator.
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 1> operator*(Matrix<Element, 4, 1> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of this 2-by-4 matrix with the 4-by-2 matrix `rhs`.
-  /// The products are accumulated onto `accum` (passed by value), and the updated
-  /// accumulator is returned.
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 2> product(
-    Matrix<Element, 4, 2> const &rhs,
-    Matrix<Element, 2, 2> accum = Matrix<Element, 2, 2>()
-  ) const {
-
-    // k is the reduction index; it stays outermost so each output element is
-    // accumulated in ascending-k order.
-    for (int k = 0; k < 4; ++k) {
-      for (int row = 0; row < 2; ++row) {
-        for (int col = 0; col < 2; ++col) {
-          accum.data[row * 2 + col] += data[row * 4 + k] * rhs.data[k * 2 + col];
-        }
-      }
-    }
-
-    return accum;
-  }
-
-  /// Operator form of product() with a 4-by-2 right-hand side, using the default accumulator.
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 2> operator*(Matrix<Element, 4, 2> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 2-by-3-by-4
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 3> product(
-    Matrix<Element, 4, 3> const &rhs,
-    Matrix<Element, 2, 3> accum = Matrix<Element, 2, 3>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[0] * rhs.data[1];
-    accum.data[2] += data[0] * rhs.data[2];
-    accum.data[3] += data[4] * rhs.data[0];
-    accum.data[4] += data[4] * rhs.data[1];
-    accum.data[5] += data[4] * rhs.data[2];
-
-    // k=1
-    accum.data[0] += data[1] * rhs.data[3];
-    accum.data[1] += data[1] * rhs.data[4];
-    accum.data[2] += data[1] * rhs.data[5];
-    accum.data[3] += data[5] * rhs.data[3];
-    accum.data[4] += data[5] * rhs.data[4];
-    accum.data[5] += data[5] * rhs.data[5];
-
-    // k=2
-    accum.data[0] += data[2] * rhs.data[6];
-    accum.data[1] += data[2] * rhs.data[7];
-    accum.data[2] += data[2] * rhs.data[8];
-    accum.data[3] += data[6] * rhs.data[6];
-    accum.data[4] += data[6] * rhs.data[7];
-    accum.data[5] += data[6] * rhs.data[8];
-
-    // k=3
-    accum.data[0] += data[3] * rhs.data[9];
-    accum.data[1] += data[3] * rhs.data[10];
-    accum.data[2] += data[3] * rhs.data[11];
-    accum.data[3] += data[7] * rhs.data[9];
-    accum.data[4] += data[7] * rhs.data[10];
-    accum.data[5] += data[7] * rhs.data[11];
-
-    return accum;
-  }
-
-  /// Matrix product of size 2-by-3-by-4
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 3> operator*(Matrix<Element, 4, 3> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 2-by-4-by-4
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 4> product(
-    Matrix<Element, 4, 4> const &rhs,
-    Matrix<Element, 2, 4> accum = Matrix<Element, 2, 4>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[0] * rhs.data[1];
-    accum.data[2] += data[0] * rhs.data[2];
-    accum.data[3] += data[0] * rhs.data[3];
-    accum.data[4] += data[4] * rhs.data[0];
-    accum.data[5] += data[4] * rhs.data[1];
-    accum.data[6] += data[4] * rhs.data[2];
-    accum.data[7] += data[4] * rhs.data[3];
-
-    // k=1
-    accum.data[0] += data[1] * rhs.data[4];
-    accum.data[1] += data[1] * rhs.data[5];
-    accum.data[2] += data[1] * rhs.data[6];
-    accum.data[3] += data[1] * rhs.data[7];
-    accum.data[4] += data[5] * rhs.data[4];
-    accum.data[5] += data[5] * rhs.data[5];
-    accum.data[6] += data[5] * rhs.data[6];
-    accum.data[7] += data[5] * rhs.data[7];
-
-    // k=2
-    accum.data[0] += data[2] * rhs.data[8];
-    accum.data[1] += data[2] * rhs.data[9];
-    accum.data[2] += data[2] * rhs.data[10];
-    accum.data[3] += data[2] * rhs.data[11];
-    accum.data[4] += data[6] * rhs.data[8];
-    accum.data[5] += data[6] * rhs.data[9];
-    accum.data[6] += data[6] * rhs.data[10];
-    accum.data[7] += data[6] * rhs.data[11];
-
-    // k=3
-    accum.data[0] += data[3] * rhs.data[12];
-    accum.data[1] += data[3] * rhs.data[13];
-    accum.data[2] += data[3] * rhs.data[14];
-    accum.data[3] += data[3] * rhs.data[15];
-    accum.data[4] += data[7] * rhs.data[12];
-    accum.data[5] += data[7] * rhs.data[13];
-    accum.data[6] += data[7] * rhs.data[14];
-    accum.data[7] += data[7] * rhs.data[15];
-
-    return accum;
-  }
-
-  /// Matrix product of size 2-by-4-by-4
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 4> operator*(Matrix<Element, 4, 4> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 2-by-4-by-4
-  CUTLASS_HOST_DEVICE
-  Matrix & operator*=(Matrix<Element, 4, 4> const &rhs) {
-    *this = product(rhs);
-    return *this;
-  }
-    
-  /// Returns the sum of elements
-  CUTLASS_HOST_DEVICE
-  Element sum(Element accum = Element()) const {
-    
-    accum += data[0];
-    accum += data[1];
-    accum += data[2];
-    accum += data[3];
-    accum += data[4];
-    accum += data[5];
-    accum += data[6];
-    accum += data[7];
-
-    return accum;
-  }  
-
-  /// Returns the sum of squared elements
-  CUTLASS_HOST_DEVICE
-  Element norm(Element accum = Element()) const {
-    
-    accum += data[0] * data[0];
-    accum += data[1] * data[1];
-    accum += data[2] * data[2];
-    accum += data[3] * data[3];
-    accum += data[4] * data[4];
-    accum += data[5] * data[5];
-    accum += data[6] * data[6];
-    accum += data[7] * data[7];
-
-    return accum;
-  }
-
-  /// Returns square root of the norm
-  CUTLASS_HOST_DEVICE
-  Element magnitude() const {
-    return fast_sqrt(norm());
-  }
-
-  /// Returns the sum of diagonal elements
-  CUTLASS_HOST_DEVICE
-  Element trace(Element accum = Element()) const {
-    
-    accum += data[0];
-    accum += data[5];
-
-    return accum;
-  }
-    
-};
-
-/// Template alias for 2-by-4 matrix
-template <typename Element>
-using Matrix2x4 = Matrix<Element, 2, 4>;
-
-
-/// Free function to infer element type from template arguments
-///
-/// Forwards the eight scalars, given row by row (_<row>_<column>), to the
-/// Matrix2x4<Element> constructor and returns the resulting matrix.
-template <typename Element>
-CUTLASS_HOST_DEVICE Matrix2x4<Element> make_Matrix2x4(
-    Element _0_0, Element _0_1, Element _0_2, Element _0_3, 
-    Element _1_0, Element _1_1, Element _1_2, Element _1_3
-) {
-  return Matrix2x4<Element>(
-  _0_0, _0_1, _0_2, _0_3, 
-  _1_0, _1_1, _1_2, _1_3 
-  );
-}
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// 3-by-1 matrix template class definition
-template <typename Element_>
-struct Matrix<Element_, 3, 1> {
-
-  //
-  // Type definitions
-  //
-
-  /// Element data type
-  using Element = Element_;
-
-  /// Number of rows in matrix
-  static int const kRows = 3;
-
-  /// Number of columns in matrix
-  static int const kColumns = 1;
-
-  /// Layout of matrix in underlying array
-  using Layout = layout::RowMajor;
-
-  /// Number of elements in matrix
-  static int const kCount = 3;
-
-  //
-  // Data members
-  //
-
-  /// Elements of the matrix in row-major layout
-  Array<Element, kCount> data;
-
-  //
-  // Methods
-  //
-
-  /// Constructs a zero matrix
-  CUTLASS_HOST_DEVICE
-  Matrix() {
-    data.clear();
-  }
-  
-  /// Copy constructor for a 3-by-1 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix(Matrix const &rhs) {
-    data = rhs.data;
-  }
-    
-  /// Constucts a 3-by-1 matrix from scalar elements
-  CUTLASS_HOST_DEVICE
-  Matrix(
-    Element _0_0, 
-    Element _1_0, 
-    Element _2_0
-  ) {
-
-    data[0] = _0_0;
-    data[1] = _1_0;
-    data[2] = _2_0;
-  }
-    
-  /// Constructs a matrix from a uniform element
-  CUTLASS_HOST_DEVICE
-  static Matrix uniform(Element s) {
-    Matrix m;
-    
-    m.data[0] = s;
-    m.data[1] = s;
-    m.data[2] = s;
-
-    return m;
-  }
-
-  /// Constructs a matrix from a uniform element 1
-  CUTLASS_HOST_DEVICE
-  static Matrix ones() {
-    return uniform(Element(1));
-  }
-
-  /// Constructs a matrix from a uniform element 0
-  CUTLASS_HOST_DEVICE
-  static Matrix zero() {
-    return Matrix();
-  }
-  
-  /// Returns a transposed matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 3> transpose() const {
-    Matrix<Element, 1, 3> mt;
-    
-    mt.data[0] = data[0];
-    mt.data[1] = data[1];
-    mt.data[2] = data[2];
-
-    return mt;
-  }
-    
-  /// Accesses an element by coordinate
-  ///
-  /// Storage is row-major, so element (i, j) lives at offset
-  /// i * kColumns + j (kColumns is 1 for this 3-by-1 matrix).
-  CUTLASS_HOST_DEVICE
-  Element at(int i, int j) const {
-    return data[i * kColumns + j];
-  }
-
-  /// Accesses an element by coordinate (mutable reference)
-  ///
-  /// Storage is row-major, so element (i, j) lives at offset
-  /// i * kColumns + j (kColumns is 1 for this 3-by-1 matrix).
-  CUTLASS_HOST_DEVICE
-  Element & at(int i, int j) {
-    return data[i * kColumns + j];
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element at(Coord<2> const &coord) const {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element & at(Coord<2> const &coord) {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element &at(int offset) {
-    return data[offset];
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element at(int offset) const {
-    return data[offset];
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element operator[](Coord<2> const &coord) const {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element & operator[](Coord<2> const &coord) {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element & operator[](int offset) {
-    return data[offset];
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element operator[](int offset) const {
-    return data[offset];
-  }
-  
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 1> slice_2x1(int i = 0, int j = 0) const {
-    Matrix<Element, 2, 1> m;
-    
-    m.data[0] = data[i * 1 + j + 0];
-    m.data[1] = data[i * 1 + j + 1];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_2x1(Matrix<Element, 2, 1> const &m, int i = 0, int j = 0) {
-    
-    data[i * 1 + j + 0] = m.data[0];
-    data[i * 1 + j + 1] = m.data[1];
-
-    return *this;
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 1> slice_3x1(int i = 0, int j = 0) const {
-    Matrix<Element, 3, 1> m;
-    
-    m.data[0] = data[i * 1 + j + 0];
-    m.data[1] = data[i * 1 + j + 1];
-    m.data[2] = data[i * 1 + j + 2];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_3x1(Matrix<Element, 3, 1> const &m, int i = 0, int j = 0) {
-    
-    data[i * 1 + j + 0] = m.data[0];
-    data[i * 1 + j + 1] = m.data[1];
-    data[i * 1 + j + 2] = m.data[2];
-
-    return *this;
-  }
-    
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 1> column(int j) const {
-    return slice_3x1(0, j);
-  }
-
-  CUTLASS_HOST_DEVICE
-  Matrix &set_column(Matrix<Element, 3, 1> const &v, int j =0) {
-    return set_slice_3x1(v, 0, j);
-  }
-    
-  /// Concatenates this matrix with a a 3-by-1 matrix to form a 3-by-2 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 2> hcat(Matrix<Element, 3, 1> const & rhs) const {
-    return Matrix<Element, 3, 2>::hcat(*this, rhs);
-  }
-    
-  /// Concatenates this matrix with a a 3-by-2 matrix to form a 3-by-3 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 3> hcat(Matrix<Element, 3, 2> const & rhs) const {
-    return Matrix<Element, 3, 3>::hcat(*this, rhs);
-  }
-    
-  /// Concatenates this matrix with a a 3-by-3 matrix to form a 3-by-4 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 4> hcat(Matrix<Element, 3, 3> const & rhs) const {
-    return Matrix<Element, 3, 4>::hcat(*this, rhs);
-  }
-    
-  /// Forms a 3-by-1 matrix by vertically concatenating an Element with a 2-by-1 matrix
-  CUTLASS_HOST_DEVICE
-  static Matrix vcat(Element upper, Matrix<Element, 2, 1> const & lower) {
-    return Matrix(
-      upper
-      , lower.at(0, 0)
-      , lower.at(1, 0));
-  }
-  
-  /// Forms a 3-by-1 matrix by vertically concatenating a 2-by-1 matrix with an Element
-  CUTLASS_HOST_DEVICE
-  static Matrix vcat(Matrix<Element, 2, 1> const & upper, Element lower) {
-    return Matrix(
-      upper.at(0, 0)
-      , upper.at(1, 0)
-      , lower);
-  }
-  
-  /// Concatenates this matrix with a an Element to form a 4-by-1 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 1> vcat(Element rhs) const {
-    return Matrix<Element, 4, 1>::vcat(*this, rhs);
-  }
-    
-  /// Elementwise add operator (3-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix add(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] + rhs.data[0];
-
-    result.data[1] = data[1] + rhs.data[1];
-
-    result.data[2] = data[2] + rhs.data[2];
-
-    return result;
-  }
-      
-  /// Elementwise add operator (3-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix operator +(Matrix const &rhs) const {
-    return add(rhs);
-  }
-
-  /// Elementwise add operator (3-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator +=(Matrix const &rhs) {
-    
-    data[0] += rhs.data[0];
-
-    data[1] += rhs.data[1];
-
-    data[2] += rhs.data[2];
-
-    return *this;
-  }
-        
-  /// Elementwise subtract operator (3-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix subtract(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] - rhs.data[0];
-
-    result.data[1] = data[1] - rhs.data[1];
-
-    result.data[2] = data[2] - rhs.data[2];
-
-    return result;
-  }
-      
-  /// Elementwise subtract operator (3-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix operator -(Matrix const &rhs) const {
-    return subtract(rhs);
-  }
-
-  /// Elementwise subtract operator (3-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator -=(Matrix const &rhs) {
-    
-    data[0] -= rhs.data[0];
-
-    data[1] -= rhs.data[1];
-
-    data[2] -= rhs.data[2];
-
-    return *this;
-  }
-        
-  /// Elementwise multiply operator (3-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix multiply(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] * rhs.data[0];
-
-    result.data[1] = data[1] * rhs.data[1];
-
-    result.data[2] = data[2] * rhs.data[2];
-
-    return result;
-  }
-      
-  /// Scalar multiply operator (3-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix multiply(Element const &s) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] * s;
-
-    result.data[1] = data[1] * s;
-
-    result.data[2] = data[2] * s;
-
-    return result;
-  }
-
-  /// Scalar multiply operator (3-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix operator *(Element const &s) const {
-    return multiply(s);
-  }
-
-  /// Scalar multiply operator (3-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator *=(Element const &s) {
-    
-    data[0] *= s;
-
-    data[1] *= s;
-
-    data[2] *= s;
-
-    return *this;
-  }
-        
-  /// Elementwise divide operator (3-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix divide(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] / rhs.data[0];
-
-    result.data[1] = data[1] / rhs.data[1];
-
-    result.data[2] = data[2] / rhs.data[2];
-
-    return result;
-  }
-      
-  /// Scalar divide operator (3-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix divide(Element const &s) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] / s;
-
-    result.data[1] = data[1] / s;
-
-    result.data[2] = data[2] / s;
-
-    return result;
-  }
-
-  /// Scalar divide operator (3-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix operator /(Element const &s) const {
-    return divide(s);
-  }
-
-  /// Scalar divide operator (3-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator /=(Element const &s) {
-    
-    data[0] /= s;
-
-    data[1] /= s;
-
-    data[2] /= s;
-
-    return *this;
-  }
-        
-  /// Elementwise divide operator (3-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix operator /(Matrix const &rhs) const {
-    return divide(rhs);
-  }
-
-  /// Elementwise divide operator (3-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator /=(Matrix const &rhs) {
-    
-    data[0] /= rhs.data[0];
-
-    data[1] /= rhs.data[1];
-
-    data[2] /= rhs.data[2];
-
-    return *this;
-  }
-        
-  /// Negates each element of the matrix
-  ///
-  /// Returns a new 3-by-1 matrix holding the negated entries of this matrix;
-  /// this matrix itself is left unchanged.
-  CUTLASS_HOST_DEVICE
-  Matrix operator-() const {
-    Matrix m;
-    
-    // Source operand is this->data; m was just cleared by the default
-    // constructor, so negating m.data would always yield zeros.
-    m.data[0] = -data[0];
-    m.data[1] = -data[1];
-    m.data[2] = -data[2];
-
-    return m;
-  }
-  
-  /// Matrix product of size 3-by-1-by-1
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 1> product(
-    Matrix<Element, 1, 1> const &rhs,
-    Matrix<Element, 3, 1> accum = Matrix<Element, 3, 1>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[1] * rhs.data[0];
-    accum.data[2] += data[2] * rhs.data[0];
-
-    return accum;
-  }
-
-  /// Matrix product of size 3-by-1-by-1
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 1> operator*(Matrix<Element, 1, 1> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 3-by-1-by-1
-  CUTLASS_HOST_DEVICE
-  Matrix & operator*=(Matrix<Element, 1, 1> const &rhs) {
-    *this = product(rhs);
-    return *this;
-  }
-    
-  /// Matrix product of size 3-by-2-by-1
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 2> product(
-    Matrix<Element, 1, 2> const &rhs,
-    Matrix<Element, 3, 2> accum = Matrix<Element, 3, 2>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[0] * rhs.data[1];
-    accum.data[2] += data[1] * rhs.data[0];
-    accum.data[3] += data[1] * rhs.data[1];
-    accum.data[4] += data[2] * rhs.data[0];
-    accum.data[5] += data[2] * rhs.data[1];
-
-    return accum;
-  }
-
-  /// Matrix product of size 3-by-2-by-1
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 2> operator*(Matrix<Element, 1, 2> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 3-by-3-by-1
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 3> product(
-    Matrix<Element, 1, 3> const &rhs,
-    Matrix<Element, 3, 3> accum = Matrix<Element, 3, 3>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[0] * rhs.data[1];
-    accum.data[2] += data[0] * rhs.data[2];
-    accum.data[3] += data[1] * rhs.data[0];
-    accum.data[4] += data[1] * rhs.data[1];
-    accum.data[5] += data[1] * rhs.data[2];
-    accum.data[6] += data[2] * rhs.data[0];
-    accum.data[7] += data[2] * rhs.data[1];
-    accum.data[8] += data[2] * rhs.data[2];
-
-    return accum;
-  }
-
-  /// Matrix product of size 3-by-3-by-1
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 3> operator*(Matrix<Element, 1, 3> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 3-by-4-by-1
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 4> product(
-    Matrix<Element, 1, 4> const &rhs,
-    Matrix<Element, 3, 4> accum = Matrix<Element, 3, 4>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[0] * rhs.data[1];
-    accum.data[2] += data[0] * rhs.data[2];
-    accum.data[3] += data[0] * rhs.data[3];
-    accum.data[4] += data[1] * rhs.data[0];
-    accum.data[5] += data[1] * rhs.data[1];
-    accum.data[6] += data[1] * rhs.data[2];
-    accum.data[7] += data[1] * rhs.data[3];
-    accum.data[8] += data[2] * rhs.data[0];
-    accum.data[9] += data[2] * rhs.data[1];
-    accum.data[10] += data[2] * rhs.data[2];
-    accum.data[11] += data[2] * rhs.data[3];
-
-    return accum;
-  }
-
-  /// Matrix product of size 3-by-4-by-1
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 4> operator*(Matrix<Element, 1, 4> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Dot product of vectors with extent 3
-  CUTLASS_HOST_DEVICE
-  Element dot(Matrix<Element, 3, 1> const &rhs, Element accum = Element()) const {
-    
-    accum += data[0] * rhs.data[0];
-    accum += data[1] * rhs.data[1];
-    accum += data[2] * rhs.data[2];
-    return accum;
-  }
-
-  /// Dot product of vectors with extent 3
-  CUTLASS_HOST_DEVICE
-  Element dot(Matrix<Element, 1, 3> const &rhs, Element accum = Element()) const {
-    
-    accum += data[0] * rhs.data[0];
-    accum += data[1] * rhs.data[1];
-    accum += data[2] * rhs.data[2];
-    return accum;
-  }
-  
-  /// Returns the sum of elements
-  CUTLASS_HOST_DEVICE
-  Element sum(Element accum = Element()) const {
-    
-    accum += data[0];
-    accum += data[1];
-    accum += data[2];
-
-    return accum;
-  }  
-
-  /// Returns the sum of squared elements
-  CUTLASS_HOST_DEVICE
-  Element norm(Element accum = Element()) const {
-    
-    accum += data[0] * data[0];
-    accum += data[1] * data[1];
-    accum += data[2] * data[2];
-
-    return accum;
-  }
-
-  /// Returns square root of the norm
-  CUTLASS_HOST_DEVICE
-  Element magnitude() const {
-    return fast_sqrt(norm());
-  }
-
-  /// Returns the sum of diagonal elements
-  CUTLASS_HOST_DEVICE
-  Element trace(Element accum = Element()) const {
-    
-    accum += data[0];
-
-    return accum;
-  }
-    
-  /// Cross product
-  ///
-  /// Returns this x rhs for two 3-element column vectors:
-  ///   (a1*b2 - a2*b1, a2*b0 - a0*b2, a0*b1 - a1*b0)
-  CUTLASS_HOST_DEVICE
-  Matrix cross(Matrix const &rhs) const {
-    return Matrix(
-      data[1] * rhs.data[2] - data[2] * rhs.data[1],
-      data[2] * rhs.data[0] - data[0] * rhs.data[2],
-      data[0] * rhs.data[1] - data[1] * rhs.data[0]
-    );
-  }
-  
-};
-
-/// Template alias for 3-by-1 matrix
-template <typename Element>
-using Matrix3x1 = Matrix<Element, 3, 1>;
-
-
-/// Free function to infer element type from template arguments
-///
-/// Forwards the three scalars, given top to bottom (_<row>_<column>), to the
-/// Matrix3x1<Element> constructor and returns the resulting matrix.
-template <typename Element>
-CUTLASS_HOST_DEVICE Matrix3x1<Element> make_Matrix3x1(
-    Element _0_0, 
-    Element _1_0, 
-    Element _2_0
-) {
-  return Matrix3x1<Element>(
-  _0_0, 
-  _1_0, 
-  _2_0 
-  );
-}
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// 3-by-2 matrix template class definition
-template <typename Element_>
-struct Matrix<Element_, 3, 2> {
-
-  //
-  // Type definitions
-  //
-
-  /// Element data type
-  using Element = Element_;
-
-  /// Number of rows in matrix
-  static int const kRows = 3;
-
-  /// Number of columns in matrix
-  static int const kColumns = 2;
-
-  /// Layout of matrix in underlying array
-  using Layout = layout::RowMajor;
-
-  /// Number of elements in matrix
-  static int const kCount = 6;
-
-  //
-  // Data members
-  //
-
-  /// Elements of the matrix in row-major layout
-  Array<Element, kCount> data;
-
-  //
-  // Methods
-  //
-
-  /// Constructs a zero matrix
-  CUTLASS_HOST_DEVICE
-  Matrix() {
-    data.clear();
-  }
-  
-  /// Copy constructor for a 3-by-2 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix(Matrix const &rhs) {
-    data = rhs.data;
-  }
-    
-  /// Constucts a 3-by-2 matrix from scalar elements
-  CUTLASS_HOST_DEVICE
-  Matrix(
-    Element _0_0, Element _0_1, 
-    Element _1_0, Element _1_1, 
-    Element _2_0, Element _2_1
-  ) {
-
-    data[0] = _0_0;  data[1] = _0_1;
-    data[2] = _1_0;  data[3] = _1_1;
-    data[4] = _2_0;  data[5] = _2_1;
-  }
-    
-  /// Constucts a 3-by-2 matrix from row vectors
-  CUTLASS_HOST_DEVICE
-  Matrix(
-    Matrix<Element, 1, 2> const &row_0,
-    Matrix<Element, 1, 2> const &row_1,
-    Matrix<Element, 1, 2> const &row_2
-  ) { 
-    data[0] = row_0.data[0];
-    data[1] = row_0.data[1];
-    data[2] = row_1.data[0];
-    data[3] = row_1.data[1];
-    data[4] = row_2.data[0];
-    data[5] = row_2.data[1];
-  }
-    
-  /// Static method to construct a 3-by-2 matrix from column vectors
-  ///
-  /// Interleaves the two columns into row-major storage: result(r, 0) comes
-  /// from column_0.data[r] and result(r, 1) from column_1.data[r], r = 0..2.
-  ///
-  /// NOTE(review): both parameters are declared as 2-by-1 matrices, yet
-  /// index 2 of each is read below. A column of a 3-by-2 matrix has three
-  /// entries, so the parameter type presumably should be
-  /// Matrix<Element, 3, 1> -- confirm before relying on this method.
-  CUTLASS_HOST_DEVICE
-  static Matrix from_columns(
-    Matrix<Element, 2, 1> const &column_0,
-    Matrix<Element, 2, 1> const &column_1
-  ) { 
-    Matrix result;
-    
-    result.data[0] = column_0.data[0];
-    result.data[1] = column_1.data[0];
-    result.data[2] = column_0.data[1];
-    result.data[3] = column_1.data[1];
-    result.data[4] = column_0.data[2];
-    result.data[5] = column_1.data[2];
-    return result;
-  }
-    
-  /// Constructs a matrix from a uniform element
-  CUTLASS_HOST_DEVICE
-  static Matrix uniform(Element s) {
-    Matrix m;
-    
-    m.data[0] = s;
-    m.data[1] = s;
-    m.data[2] = s;
-    m.data[3] = s;
-    m.data[4] = s;
-    m.data[5] = s;
-
-    return m;
-  }
-
-  /// Constructs a matrix from a uniform element 1
-  CUTLASS_HOST_DEVICE
-  static Matrix ones() {
-    return uniform(Element(1));
-  }
-
-  /// Constructs a matrix from a uniform element 0
-  CUTLASS_HOST_DEVICE
-  static Matrix zero() {
-    return Matrix();
-  }
-  
-  /// Constructs a matrix from elements along its diagonal
-  ///
-  /// A 3-by-2 matrix has two diagonal entries, (0,0) and (1,1), stored at
-  /// row-major offsets 0 and 3. All other entries keep the value given by
-  /// the default constructor.
-  CUTLASS_HOST_DEVICE
-  static Matrix from_diagonal(Matrix<Element, 2, 1> const &diag) {
-    Matrix m;
-    
-    m.data[0] = diag.data[0];
-    m.data[3] = diag.data[1];
-
-    return m;
-  }
-
-  /// Constructs a matrix from elements along its diagonal
-  ///
-  /// A 3-by-2 matrix has two diagonal entries, (0,0) and (1,1), stored at
-  /// row-major offsets 0 and 3. All other entries keep the value given by
-  /// the default constructor.
-  CUTLASS_HOST_DEVICE
-  static Matrix from_diagonal(Matrix<Element, 1, 2> const &diag) {
-    Matrix m;
-    
-    m.data[0] = diag.data[0];
-    m.data[3] = diag.data[1];
-
-    return m;
-  }
-
-  /// Gets an array of diagonal elements
-  ///
-  /// Returns the two diagonal entries (0,0) and (1,1) of this 3-by-2 matrix,
-  /// which sit at row-major offsets 0 and 3.
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 1> diagonal() const {
-    Matrix<Element, 2, 1> diag;
-    
-    diag.data[0] = data[0];
-    diag.data[1] = data[3];
-
-    return diag;
-  }
-    
-  /// Returns a transposed matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 3> transpose() const {
-    Matrix<Element, 2, 3> mt;
-    
-    mt.data[0] = data[0];
-    mt.data[3] = data[1];
-    mt.data[1] = data[2];
-    mt.data[4] = data[3];
-    mt.data[2] = data[4];
-    mt.data[5] = data[5];
-
-    return mt;
-  }
-    
-  /// Accesses an element by coordinate
-  ///
-  /// Storage is row-major, so element (i, j) lives at offset
-  /// i * kColumns + j (kColumns is 2 for this 3-by-2 matrix).
-  CUTLASS_HOST_DEVICE
-  Element at(int i, int j) const {
-    return data[i * kColumns + j];
-  }
-
-  /// Accesses an element by coordinate (mutable reference)
-  ///
-  /// Storage is row-major, so element (i, j) lives at offset
-  /// i * kColumns + j (kColumns is 2 for this 3-by-2 matrix).
-  CUTLASS_HOST_DEVICE
-  Element & at(int i, int j) {
-    return data[i * kColumns + j];
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element at(Coord<2> const &coord) const {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element & at(Coord<2> const &coord) {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element &at(int offset) {
-    return data[offset];
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element at(int offset) const {
-    return data[offset];
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element operator[](Coord<2> const &coord) const {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element & operator[](Coord<2> const &coord) {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element & operator[](int offset) {
-    return data[offset];
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element operator[](int offset) const {
-    return data[offset];
-  }
-  
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 2> slice_1x2(int i = 0, int j = 0) const {
-    Matrix<Element, 1, 2> m;
-    
-    m.data[0] = data[i * 2 + j + 0];
-    m.data[1] = data[i * 2 + j + 1];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_1x2(Matrix<Element, 1, 2> const &m, int i = 0, int j = 0) {
-    
-    data[i * 2 + j + 0] = m.data[0];
-    data[i * 2 + j + 1] = m.data[1];
-
-    return *this;
-  }
-    
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 2> row(int i) const {
-    return slice_1x2(i, 0);
-  }
-
-  CUTLASS_HOST_DEVICE
-  Matrix &set_row(Matrix<Element, 1, 2> const &v, int i = 0) {
-    return set_slice_1x2(v, i, 0);
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 1> slice_2x1(int i = 0, int j = 0) const {
-    Matrix<Element, 2, 1> m;
-    
-    m.data[0] = data[i * 2 + j + 0];
-    m.data[1] = data[i * 2 + j + 2];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_2x1(Matrix<Element, 2, 1> const &m, int i = 0, int j = 0) {
-    
-    data[i * 2 + j + 0] = m.data[0];
-    data[i * 2 + j + 2] = m.data[1];
-
-    return *this;
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 2> slice_2x2(int i = 0, int j = 0) const {
-    Matrix<Element, 2, 2> m;
-    
-    m.data[0] = data[i * 2 + j + 0];
-    m.data[1] = data[i * 2 + j + 1];
-    m.data[2] = data[i * 2 + j + 2];
-    m.data[3] = data[i * 2 + j + 3];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_2x2(Matrix<Element, 2, 2> const &m, int i = 0, int j = 0) {
-    
-    data[i * 2 + j + 0] = m.data[0];
-    data[i * 2 + j + 1] = m.data[1];
-    data[i * 2 + j + 2] = m.data[2];
-    data[i * 2 + j + 3] = m.data[3];
-
-    return *this;
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 1> slice_3x1(int i = 0, int j = 0) const {
-    Matrix<Element, 3, 1> m;
-    
-    m.data[0] = data[i * 2 + j + 0];
-    m.data[1] = data[i * 2 + j + 2];
-    m.data[2] = data[i * 2 + j + 4];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_3x1(Matrix<Element, 3, 1> const &m, int i = 0, int j = 0) {
-    
-    data[i * 2 + j + 0] = m.data[0];
-    data[i * 2 + j + 2] = m.data[1];
-    data[i * 2 + j + 4] = m.data[2];
-
-    return *this;
-  }
-    
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 1> column(int j) const {
-    return slice_3x1(0, j);
-  }
-
-  CUTLASS_HOST_DEVICE
-  Matrix &set_column(Matrix<Element, 3, 1> const &v, int j =0) {
-    return set_slice_3x1(v, 0, j);
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 2> slice_3x2(int i = 0, int j = 0) const {
-    Matrix<Element, 3, 2> m;
-    
-    m.data[0] = data[i * 2 + j + 0];
-    m.data[1] = data[i * 2 + j + 1];
-    m.data[2] = data[i * 2 + j + 2];
-    m.data[3] = data[i * 2 + j + 3];
-    m.data[4] = data[i * 2 + j + 4];
-    m.data[5] = data[i * 2 + j + 5];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_3x2(Matrix<Element, 3, 2> const &m, int i = 0, int j = 0) {
-    
-    data[i * 2 + j + 0] = m.data[0];
-    data[i * 2 + j + 1] = m.data[1];
-    data[i * 2 + j + 2] = m.data[2];
-    data[i * 2 + j + 3] = m.data[3];
-    data[i * 2 + j + 4] = m.data[4];
-    data[i * 2 + j + 5] = m.data[5];
-
-    return *this;
-  }
-    
-  /// Forms a 3-by-2 matrix by horizontally concatenating a 3-by-1 matrix with a 3-by-1 matrix
-  CUTLASS_HOST_DEVICE
-  static Matrix hcat(Matrix<Element, 3, 1> const & lhs, Matrix<Element, 3, 1> const & rhs) {
-    return Matrix(
-      lhs.at(0, 0), rhs.at(0, 0)
-      , lhs.at(1, 0), rhs.at(1, 0)
-      , lhs.at(2, 0), rhs.at(2, 0));
-  }
-  
-  /// Concatenates this matrix with a a 3-by-1 matrix to form a 3-by-3 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 3> hcat(Matrix<Element, 3, 1> const & rhs) const {
-    return Matrix<Element, 3, 3>::hcat(*this, rhs);
-  }
-    
-  /// Concatenates this matrix with a a 3-by-2 matrix to form a 3-by-4 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 4> hcat(Matrix<Element, 3, 2> const & rhs) const {
-    return Matrix<Element, 3, 4>::hcat(*this, rhs);
-  }
-    
-  /// Forms a 3-by-2 matrix by vertically concatenating a 1-by-2 matrix with a 2-by-2 matrix.
-  ///
-  /// The single row of upper is passed first, followed by the two rows of lower.
-  CUTLASS_HOST_DEVICE
-  static Matrix vcat(Matrix<Element, 1, 2> const & upper, Matrix<Element, 2, 2> const & lower) {
-    return Matrix(
-      upper.at(0, 0), upper.at(0, 1)
-      , lower.at(0, 0), lower.at(0, 1)
-      , lower.at(1, 0), lower.at(1, 1));
-  }
-  
-  /// Forms a 3-by-2 matrix by vertically concatenating a 2-by-2 matrix with a 1-by-2 matrix.
-  ///
-  /// The two rows of upper are passed first, followed by the single row of lower.
-  CUTLASS_HOST_DEVICE
-  static Matrix vcat(Matrix<Element, 2, 2> const & upper, Matrix<Element, 1, 2> const & lower) {
-    return Matrix(
-      upper.at(0, 0), upper.at(0, 1)
-      , upper.at(1, 0), upper.at(1, 1)
-      , lower.at(0, 0), lower.at(0, 1));
-  }
-  
-  /// Concatenates this matrix with a 1-by-2 matrix to form a 4-by-2 matrix.
-  /// Delegates to Matrix<Element, 4, 2>::vcat with this matrix as the upper operand.
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 2> vcat(Matrix<Element, 1, 2> const & rhs) const {
-    return Matrix<Element, 4, 2>::vcat(*this, rhs);
-  }
-    
-  /// Forms a 3-by-2 matrix by concatenating four components.
-  ///
-  /// Scalars A and B are passed as the first pair of elements; the 2-by-1 matrices
-  /// C and D supply the remaining two pairs, one element each per pair.
-  CUTLASS_HOST_DEVICE
-  static Matrix block(
-    Element                         A, Element                         B,
-    Matrix<Element, 2, 1> const & C, Matrix<Element, 2, 1> const & D) {
-    return Matrix(
-      A, B
-      , C.at(0, 0), D.at(0, 0)
-      , C.at(1, 0), D.at(1, 0)
-    );
-  }
-  
-  /// Forms a 3-by-2 matrix by concatenating four components.
-  ///
-  /// The 2-by-1 matrices A and B supply the first two pairs of elements, one element
-  /// each per pair; scalars C and D are passed as the final pair.
-  CUTLASS_HOST_DEVICE
-  static Matrix block(
-    Matrix<Element, 2, 1> const & A, Matrix<Element, 2, 1> const & B,
-    Element                         C, Element                         D) {
-    return Matrix(
-      A.at(0, 0), B.at(0, 0)
-      , A.at(1, 0), B.at(1, 0)
-      , C, D
-    );
-  }
-  
-  /// Returns a new 3-by-2 matrix holding the elementwise sum of this matrix and rhs
-  CUTLASS_HOST_DEVICE
-  Matrix add(Matrix const &rhs) const {
-
-    Matrix total;
-
-    for (int idx = 0; idx < 6; ++idx) {
-      total.data[idx] = data[idx] + rhs.data[idx];
-    }
-
-    return total;
-  }
-
-  /// Elementwise sum (3-by-2); forwards to add()
-  CUTLASS_HOST_DEVICE
-  Matrix operator +(Matrix const &rhs) const {
-    return this->add(rhs);
-  }
-
-  /// In-place elementwise sum (3-by-2); returns a reference to this matrix
-  CUTLASS_HOST_DEVICE
-  Matrix & operator +=(Matrix const &rhs) {
-
-    for (int idx = 0; idx < 6; ++idx) {
-      data[idx] += rhs.data[idx];
-    }
-
-    return *this;
-  }
-        
-  /// Returns a new 3-by-2 matrix holding the elementwise difference of this matrix and rhs
-  CUTLASS_HOST_DEVICE
-  Matrix subtract(Matrix const &rhs) const {
-
-    Matrix difference;
-
-    for (int idx = 0; idx < 6; ++idx) {
-      difference.data[idx] = data[idx] - rhs.data[idx];
-    }
-
-    return difference;
-  }
-
-  /// Elementwise difference (3-by-2); forwards to subtract()
-  CUTLASS_HOST_DEVICE
-  Matrix operator -(Matrix const &rhs) const {
-    return this->subtract(rhs);
-  }
-
-  /// In-place elementwise difference (3-by-2); returns a reference to this matrix
-  CUTLASS_HOST_DEVICE
-  Matrix & operator -=(Matrix const &rhs) {
-
-    for (int idx = 0; idx < 6; ++idx) {
-      data[idx] -= rhs.data[idx];
-    }
-
-    return *this;
-  }
-        
-  /// Returns a new 3-by-2 matrix holding the elementwise product of this matrix and rhs
-  CUTLASS_HOST_DEVICE
-  Matrix multiply(Matrix const &rhs) const {
-
-    Matrix scaled;
-
-    for (int idx = 0; idx < 6; ++idx) {
-      scaled.data[idx] = data[idx] * rhs.data[idx];
-    }
-
-    return scaled;
-  }
-
-  /// Returns a new 3-by-2 matrix with every element multiplied by the scalar s
-  CUTLASS_HOST_DEVICE
-  Matrix multiply(Element const &s) const {
-
-    Matrix scaled;
-
-    for (int idx = 0; idx < 6; ++idx) {
-      scaled.data[idx] = data[idx] * s;
-    }
-
-    return scaled;
-  }
-
-  /// Scalar multiply (3-by-2); forwards to multiply(Element)
-  CUTLASS_HOST_DEVICE
-  Matrix operator *(Element const &s) const {
-    return this->multiply(s);
-  }
-
-  /// In-place scalar multiply (3-by-2); returns a reference to this matrix
-  CUTLASS_HOST_DEVICE
-  Matrix & operator *=(Element const &s) {
-
-    for (int idx = 0; idx < 6; ++idx) {
-      data[idx] *= s;
-    }
-
-    return *this;
-  }
-        
-  /// Returns a new 3-by-2 matrix holding the elementwise quotient of this matrix and rhs
-  CUTLASS_HOST_DEVICE
-  Matrix divide(Matrix const &rhs) const {
-
-    Matrix quotient;
-
-    for (int idx = 0; idx < 6; ++idx) {
-      quotient.data[idx] = data[idx] / rhs.data[idx];
-    }
-
-    return quotient;
-  }
-
-  /// Returns a new 3-by-2 matrix with every element divided by the scalar s
-  CUTLASS_HOST_DEVICE
-  Matrix divide(Element const &s) const {
-
-    Matrix quotient;
-
-    for (int idx = 0; idx < 6; ++idx) {
-      quotient.data[idx] = data[idx] / s;
-    }
-
-    return quotient;
-  }
-
-  /// Scalar divide (3-by-2); forwards to divide(Element)
-  CUTLASS_HOST_DEVICE
-  Matrix operator /(Element const &s) const {
-    return this->divide(s);
-  }
-
-  /// In-place scalar divide (3-by-2); returns a reference to this matrix
-  CUTLASS_HOST_DEVICE
-  Matrix & operator /=(Element const &s) {
-
-    for (int idx = 0; idx < 6; ++idx) {
-      data[idx] /= s;
-    }
-
-    return *this;
-  }
-
-  /// Elementwise divide (3-by-2); forwards to divide(Matrix)
-  CUTLASS_HOST_DEVICE
-  Matrix operator /(Matrix const &rhs) const {
-    return this->divide(rhs);
-  }
-
-  /// In-place elementwise divide (3-by-2); returns a reference to this matrix
-  CUTLASS_HOST_DEVICE
-  Matrix & operator /=(Matrix const &rhs) {
-
-    for (int idx = 0; idx < 6; ++idx) {
-      data[idx] /= rhs.data[idx];
-    }
-
-    return *this;
-  }
-        
-  /// Negates each element of the matrix.
-  ///
-  /// Returns a new 3-by-2 matrix whose element k is -data[k]; this matrix is not modified.
-  CUTLASS_HOST_DEVICE
-  Matrix operator-() const {
-    Matrix m;
-
-    // Read from this matrix's data. The previous code negated the freshly constructed
-    // local m itself, so the operand's values never reached the result.
-    m.data[0] = -data[0];
-    m.data[1] = -data[1];
-    m.data[2] = -data[2];
-    m.data[3] = -data[3];
-    m.data[4] = -data[4];
-    m.data[5] = -data[5];
-
-    return m;
-  }
-  
-  /// Matrix product of size 3-by-1-by-2: accumulates (this * rhs) into accum and returns it
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 1> product(
-    Matrix<Element, 2, 1> const &rhs,
-    Matrix<Element, 3, 1> accum = Matrix<Element, 3, 1>()
-  ) const {
-
-    // Outer loop walks the reduction dimension so each output element sees k=0 before k=1.
-    for (int k = 0; k < 2; ++k) {
-      for (int row = 0; row < 3; ++row) {
-        accum.data[row] += data[row * 2 + k] * rhs.data[k];
-      }
-    }
-
-    return accum;
-  }
-
-  /// Matrix product of size 3-by-1-by-2 starting from a default-constructed accumulator
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 1> operator*(Matrix<Element, 2, 1> const &rhs) const {
-    return this->product(rhs);
-  }
-  
-  /// Matrix product of size 3-by-2-by-2: accumulates (this * rhs) into accum and returns it
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 2> product(
-    Matrix<Element, 2, 2> const &rhs,
-    Matrix<Element, 3, 2> accum = Matrix<Element, 3, 2>()
-  ) const {
-
-    // Outer loop walks the reduction dimension so each output element sees k=0 before k=1.
-    for (int k = 0; k < 2; ++k) {
-      for (int row = 0; row < 3; ++row) {
-        for (int col = 0; col < 2; ++col) {
-          accum.data[row * 2 + col] += data[row * 2 + k] * rhs.data[k * 2 + col];
-        }
-      }
-    }
-
-    return accum;
-  }
-
-  /// Matrix product of size 3-by-2-by-2 starting from a default-constructed accumulator
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 2> operator*(Matrix<Element, 2, 2> const &rhs) const {
-    return this->product(rhs);
-  }
-
-  /// Replaces this matrix with its product with a 2-by-2 matrix and returns a reference to it
-  CUTLASS_HOST_DEVICE
-  Matrix & operator*=(Matrix<Element, 2, 2> const &rhs) {
-    return (*this = product(rhs));
-  }
-    
-  /// Matrix product of size 3-by-3-by-2: accumulates (this * rhs) into accum and returns it
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 3> product(
-    Matrix<Element, 2, 3> const &rhs,
-    Matrix<Element, 3, 3> accum = Matrix<Element, 3, 3>()
-  ) const {
-
-    // Outer loop walks the reduction dimension so each output element sees k=0 before k=1.
-    for (int k = 0; k < 2; ++k) {
-      for (int row = 0; row < 3; ++row) {
-        for (int col = 0; col < 3; ++col) {
-          accum.data[row * 3 + col] += data[row * 2 + k] * rhs.data[k * 3 + col];
-        }
-      }
-    }
-
-    return accum;
-  }
-
-  /// Matrix product of size 3-by-3-by-2 starting from a default-constructed accumulator
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 3> operator*(Matrix<Element, 2, 3> const &rhs) const {
-    return this->product(rhs);
-  }
-  
-  /// Matrix product of size 3-by-4-by-2: accumulates (this * rhs) into accum and returns it
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 4> product(
-    Matrix<Element, 2, 4> const &rhs,
-    Matrix<Element, 3, 4> accum = Matrix<Element, 3, 4>()
-  ) const {
-
-    // Outer loop walks the reduction dimension so each output element sees k=0 before k=1.
-    for (int k = 0; k < 2; ++k) {
-      for (int row = 0; row < 3; ++row) {
-        for (int col = 0; col < 4; ++col) {
-          accum.data[row * 4 + col] += data[row * 2 + k] * rhs.data[k * 4 + col];
-        }
-      }
-    }
-
-    return accum;
-  }
-
-  /// Matrix product of size 3-by-4-by-2 starting from a default-constructed accumulator
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 4> operator*(Matrix<Element, 2, 4> const &rhs) const {
-    return this->product(rhs);
-  }
-  
-  /// Adds all six elements, in storage order, onto accum and returns the total
-  CUTLASS_HOST_DEVICE
-  Element sum(Element accum = Element()) const {
-
-    for (int idx = 0; idx < 6; ++idx) {
-      accum += data[idx];
-    }
-
-    return accum;
-  }
-
-  /// Adds the square of each element, in storage order, onto accum and returns the total
-  CUTLASS_HOST_DEVICE
-  Element norm(Element accum = Element()) const {
-
-    for (int idx = 0; idx < 6; ++idx) {
-      accum += data[idx] * data[idx];
-    }
-
-    return accum;
-  }
-
-  /// Returns fast_sqrt() of the value produced by norm()
-  CUTLASS_HOST_DEVICE
-  Element magnitude() const {
-    Element const squared = norm();
-    return fast_sqrt(squared);
-  }
-
-  /// Adds the diagonal elements (offsets 0 and 3) onto accum and returns the total
-  CUTLASS_HOST_DEVICE
-  Element trace(Element accum = Element()) const {
-
-    // Element (d, d) sits at offset d * 2 + d in this layout.
-    for (int d = 0; d < 2; ++d) {
-      accum += data[d * 2 + d];
-    }
-
-    return accum;
-  }
-    
-};
-
-/// Template alias for 3-by-2 matrix
-template <typename Element>
-using Matrix3x2 = Matrix<Element, 3, 2>;
-
-
-/// Free function that infers the element type from its arguments.
-///
-/// Forwards the six scalars, in the order given, to the Matrix3x2<Element> constructor.
-template <typename Element>
-CUTLASS_HOST_DEVICE Matrix3x2<Element> make_Matrix3x2(
-    Element _0_0, Element _0_1, 
-    Element _1_0, Element _1_1, 
-    Element _2_0, Element _2_1
-) {
-  return Matrix3x2<Element>(
-  _0_0, _0_1, 
-  _1_0, _1_1, 
-  _2_0, _2_1 
-  );
-}
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// 3-by-3 matrix template class definition
-template <typename Element_>
-struct Matrix<Element_, 3, 3> {
-
-  //
-  // Type definitions
-  //
-
-  /// Element data type
-  using Element = Element_;
-
-  /// Number of rows in matrix
-  static int const kRows = 3;
-
-  /// Number of columns in matrix
-  static int const kColumns = 3;
-
-  /// Layout of matrix in underlying array
-  using Layout = layout::RowMajor;
-
-  /// Number of elements in matrix
-  static int const kCount = 9;
-
-  //
-  // Data members
-  //
-
-  /// Elements of the matrix in row-major layout
-  Array<Element, kCount> data;
-
-  //
-  // Methods
-  //
-
-  /// Constructs a zero matrix by clearing the underlying element array
-  CUTLASS_HOST_DEVICE
-  Matrix() {
-    data.clear();
-  }
-  
-  /// Copy constructor for a 3-by-3 matrix; copies the element array of rhs
-  CUTLASS_HOST_DEVICE
-  Matrix(Matrix const &rhs) {
-    data = rhs.data;
-  }
-    
-  /// Constructs a 3-by-3 matrix from scalar elements.
-  ///
-  /// Argument _r_c is stored at offset r * 3 + c, i.e. arguments are given in row-major order.
-  CUTLASS_HOST_DEVICE
-  Matrix(
-    Element _0_0, Element _0_1, Element _0_2, 
-    Element _1_0, Element _1_1, Element _1_2, 
-    Element _2_0, Element _2_1, Element _2_2
-  ) {
-
-    data[0] = _0_0;  data[1] = _0_1;  data[2] = _0_2;
-    data[3] = _1_0;  data[4] = _1_1;  data[5] = _1_2;
-    data[6] = _2_0;  data[7] = _2_1;  data[8] = _2_2;
-  }
-    
-  /// Constructs a 3-by-3 matrix from three 1-by-3 row vectors
-  CUTLASS_HOST_DEVICE
-  Matrix(
-    Matrix<Element, 1, 3> const &row_0,
-    Matrix<Element, 1, 3> const &row_1,
-    Matrix<Element, 1, 3> const &row_2
-  ) {
-    // Each row occupies three consecutive slots: 0-2, 3-5 and 6-8.
-    for (int col = 0; col < 3; ++col) {
-      data[col]     = row_0.data[col];
-      data[3 + col] = row_1.data[col];
-      data[6 + col] = row_2.data[col];
-    }
-  }
-    
-  /// Builds a 3-by-3 matrix whose columns are the three given 3-by-1 vectors
-  CUTLASS_HOST_DEVICE
-  static Matrix from_columns(
-    Matrix<Element, 3, 1> const &column_0,
-    Matrix<Element, 3, 1> const &column_1,
-    Matrix<Element, 3, 1> const &column_2
-  ) {
-    Matrix assembled;
-
-    // Row r of the result takes element r from each column vector.
-    for (int r = 0; r < 3; ++r) {
-      assembled.data[r * 3 + 0] = column_0.data[r];
-      assembled.data[r * 3 + 1] = column_1.data[r];
-      assembled.data[r * 3 + 2] = column_2.data[r];
-    }
-
-    return assembled;
-  }
-    
-  /// Returns the 3-by-3 identity: a zero matrix with Element(1) on the diagonal
-  CUTLASS_HOST_DEVICE
-  static Matrix identity() {
-    Matrix eye;
-
-    // Diagonal offsets in row-major 3-by-3 storage are 0, 4 and 8.
-    for (int d = 0; d < 3; ++d) {
-      eye.data[d * 4] = Element(1);
-    }
-
-    return eye;
-  }
-
-  /// Returns a matrix with every element set to s
-  CUTLASS_HOST_DEVICE
-  static Matrix uniform(Element s) {
-    Matrix filled;
-
-    for (int idx = 0; idx < kCount; ++idx) {
-      filled.data[idx] = s;
-    }
-
-    return filled;
-  }
-
-  /// Returns a matrix with every element set to Element(1)
-  CUTLASS_HOST_DEVICE
-  static Matrix ones() {
-    Element const one = Element(1);
-    return uniform(one);
-  }
-
-  /// Returns a default-constructed (zero) matrix
-  CUTLASS_HOST_DEVICE
-  static Matrix zero() {
-    Matrix cleared;
-    return cleared;
-  }
-  
-  /// Returns a zero matrix whose diagonal is taken from a 3-by-1 vector
-  CUTLASS_HOST_DEVICE
-  static Matrix from_diagonal(Matrix<Element, 3, 1> const &diag) {
-    Matrix out;
-
-    // Diagonal offsets in row-major 3-by-3 storage are 0, 4 and 8.
-    for (int d = 0; d < 3; ++d) {
-      out.data[d * 4] = diag.data[d];
-    }
-
-    return out;
-  }
-
-  /// Returns a zero matrix whose diagonal is taken from a 1-by-3 vector
-  CUTLASS_HOST_DEVICE
-  static Matrix from_diagonal(Matrix<Element, 1, 3> const &diag) {
-    Matrix out;
-
-    // Diagonal offsets in row-major 3-by-3 storage are 0, 4 and 8.
-    for (int d = 0; d < 3; ++d) {
-      out.data[d * 4] = diag.data[d];
-    }
-
-    return out;
-  }
-
-  /// Returns the three diagonal elements as a 3-by-1 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 1> diagonal() const {
-    Matrix<Element, 3, 1> picked;
-
-    // Diagonal offsets in row-major 3-by-3 storage are 0, 4 and 8.
-    for (int d = 0; d < 3; ++d) {
-      picked.data[d] = data[d * 4];
-    }
-
-    return picked;
-  }
-    
-  /// Returns a new matrix in which element (c, r) equals this matrix's element (r, c)
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 3> transpose() const {
-    Matrix<Element, 3, 3> flipped;
-
-    for (int r = 0; r < 3; ++r) {
-      for (int c = 0; c < 3; ++c) {
-        flipped.data[c * 3 + r] = data[r * 3 + c];
-      }
-    }
-
-    return flipped;
-  }
-    
-  /// Returns, by value, the element at row i, column j (stored at offset i * 3 + j).
-  /// No bounds checking is performed here.
-  CUTLASS_HOST_DEVICE
-  Element at(int i, int j) const {
-    return data[i * 3 + j];
-  }
-
-  /// Returns a mutable reference to the element at row i, column j (offset i * 3 + j).
-  /// No bounds checking is performed here.
-  CUTLASS_HOST_DEVICE
-  Element & at(int i, int j) {
-    return data[i * 3 + j];
-  }
-
-  /// Returns, by value, the element at coordinate (coord[0], coord[1])
-  CUTLASS_HOST_DEVICE
-  Element at(Coord<2> const &coord) const {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Returns a mutable reference to the element at coordinate (coord[0], coord[1])
-  CUTLASS_HOST_DEVICE
-  Element & at(Coord<2> const &coord) {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Returns a mutable reference to the element at the given linear offset into data
-  CUTLASS_HOST_DEVICE
-  Element &at(int offset) {
-    return data[offset];
-  }
-
-  /// Returns, by value, the element at the given linear offset into data
-  CUTLASS_HOST_DEVICE
-  Element at(int offset) const {
-    return data[offset];
-  }
-
-  /// Returns, by value, the element at coordinate (coord[0], coord[1])
-  CUTLASS_HOST_DEVICE
-  Element operator[](Coord<2> const &coord) const {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Returns a mutable reference to the element at coordinate (coord[0], coord[1])
-  CUTLASS_HOST_DEVICE
-  Element & operator[](Coord<2> const &coord) {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Returns a mutable reference to the element at the given linear offset into data
-  CUTLASS_HOST_DEVICE
-  Element & operator[](int offset) {
-    return data[offset];
-  }
-
-  /// Returns, by value, the element at the given linear offset into data
-  CUTLASS_HOST_DEVICE
-  Element operator[](int offset) const {
-    return data[offset];
-  }
-  
-  /// Extracts the 1-by-2 submatrix whose first element is at row i, column j
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 2> slice_1x2(int i = 0, int j = 0) const {
-    int const base = i * 3 + j;
-    Matrix<Element, 1, 2> out;
-
-    for (int c = 0; c < 2; ++c) {
-      out.data[c] = data[base + c];
-    }
-
-    return out;
-  }
-
-  /// Writes the 1-by-2 matrix m into this matrix starting at row i, column j
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_1x2(Matrix<Element, 1, 2> const &m, int i = 0, int j = 0) {
-    int const base = i * 3 + j;
-
-    for (int c = 0; c < 2; ++c) {
-      data[base + c] = m.data[c];
-    }
-
-    return *this;
-  }
-
-  /// Extracts the 1-by-3 submatrix whose first element is at row i, column j
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 3> slice_1x3(int i = 0, int j = 0) const {
-    int const base = i * 3 + j;
-    Matrix<Element, 1, 3> out;
-
-    for (int c = 0; c < 3; ++c) {
-      out.data[c] = data[base + c];
-    }
-
-    return out;
-  }
-
-  /// Writes the 1-by-3 matrix m into this matrix starting at row i, column j
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_1x3(Matrix<Element, 1, 3> const &m, int i = 0, int j = 0) {
-    int const base = i * 3 + j;
-
-    for (int c = 0; c < 3; ++c) {
-      data[base + c] = m.data[c];
-    }
-
-    return *this;
-  }
-
-  /// Returns row i as a 1-by-3 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 3> row(int i) const {
-    int const first_column = 0;
-    return slice_1x3(i, first_column);
-  }
-
-  /// Overwrites row i with the 1-by-3 matrix v and returns a reference to this matrix
-  CUTLASS_HOST_DEVICE
-  Matrix &set_row(Matrix<Element, 1, 3> const &v, int i = 0) {
-    int const first_column = 0;
-    return set_slice_1x3(v, i, first_column);
-  }
-    
-  /// Extracts the 2-by-1 submatrix whose first element is at row i, column j
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 1> slice_2x1(int i = 0, int j = 0) const {
-    int const base = i * 3 + j;
-    Matrix<Element, 2, 1> out;
-
-    // Consecutive rows of this matrix are 3 elements apart.
-    for (int r = 0; r < 2; ++r) {
-      out.data[r] = data[base + r * 3];
-    }
-
-    return out;
-  }
-
-  /// Writes the 2-by-1 matrix m into this matrix starting at row i, column j
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_2x1(Matrix<Element, 2, 1> const &m, int i = 0, int j = 0) {
-    int const base = i * 3 + j;
-
-    for (int r = 0; r < 2; ++r) {
-      data[base + r * 3] = m.data[r];
-    }
-
-    return *this;
-  }
-
-  /// Extracts the 2-by-2 submatrix whose first element is at row i, column j
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 2> slice_2x2(int i = 0, int j = 0) const {
-    int const base = i * 3 + j;
-    Matrix<Element, 2, 2> out;
-
-    for (int r = 0; r < 2; ++r) {
-      for (int c = 0; c < 2; ++c) {
-        out.data[r * 2 + c] = data[base + r * 3 + c];
-      }
-    }
-
-    return out;
-  }
-
-  /// Writes the 2-by-2 matrix m into this matrix starting at row i, column j
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_2x2(Matrix<Element, 2, 2> const &m, int i = 0, int j = 0) {
-    int const base = i * 3 + j;
-
-    for (int r = 0; r < 2; ++r) {
-      for (int c = 0; c < 2; ++c) {
-        data[base + r * 3 + c] = m.data[r * 2 + c];
-      }
-    }
-
-    return *this;
-  }
-    
-  /// Extracts the 2-by-3 submatrix whose first element is at row i, column j
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 3> slice_2x3(int i = 0, int j = 0) const {
-    int const base = i * 3 + j;
-    Matrix<Element, 2, 3> out;
-
-    // Both matrices have three columns, so the six elements are contiguous.
-    for (int idx = 0; idx < 6; ++idx) {
-      out.data[idx] = data[base + idx];
-    }
-
-    return out;
-  }
-
-  /// Writes the 2-by-3 matrix m into this matrix starting at row i, column j
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_2x3(Matrix<Element, 2, 3> const &m, int i = 0, int j = 0) {
-    int const base = i * 3 + j;
-
-    for (int idx = 0; idx < 6; ++idx) {
-      data[base + idx] = m.data[idx];
-    }
-
-    return *this;
-  }
-    
-  /// Extracts the 3-by-1 submatrix whose first element is at row i, column j
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 1> slice_3x1(int i = 0, int j = 0) const {
-    int const base = i * 3 + j;
-    Matrix<Element, 3, 1> out;
-
-    // Consecutive rows of this matrix are 3 elements apart.
-    for (int r = 0; r < 3; ++r) {
-      out.data[r] = data[base + r * 3];
-    }
-
-    return out;
-  }
-
-  /// Writes the 3-by-1 matrix m into this matrix starting at row i, column j
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_3x1(Matrix<Element, 3, 1> const &m, int i = 0, int j = 0) {
-    int const base = i * 3 + j;
-
-    for (int r = 0; r < 3; ++r) {
-      data[base + r * 3] = m.data[r];
-    }
-
-    return *this;
-  }
-
-  /// Returns column j as a 3-by-1 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 1> column(int j) const {
-    int const first_row = 0;
-    return slice_3x1(first_row, j);
-  }
-
-  /// Overwrites column j with the 3-by-1 matrix v and returns a reference to this matrix
-  CUTLASS_HOST_DEVICE
-  Matrix &set_column(Matrix<Element, 3, 1> const &v, int j =0) {
-    int const first_row = 0;
-    return set_slice_3x1(v, first_row, j);
-  }
-    
-  /// Extracts the 3-by-2 submatrix whose first element is at row i, column j
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 2> slice_3x2(int i = 0, int j = 0) const {
-    int const base = i * 3 + j;
-    Matrix<Element, 3, 2> out;
-
-    // Source rows are 3 elements apart; destination rows are 2 elements apart.
-    for (int r = 0; r < 3; ++r) {
-      for (int c = 0; c < 2; ++c) {
-        out.data[r * 2 + c] = data[base + r * 3 + c];
-      }
-    }
-
-    return out;
-  }
-
-  /// Writes the 3-by-2 matrix m into this matrix starting at row i, column j
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_3x2(Matrix<Element, 3, 2> const &m, int i = 0, int j = 0) {
-    int const base = i * 3 + j;
-
-    for (int r = 0; r < 3; ++r) {
-      for (int c = 0; c < 2; ++c) {
-        data[base + r * 3 + c] = m.data[r * 2 + c];
-      }
-    }
-
-    return *this;
-  }
-    
-  /// Extracts the 3-by-3 submatrix whose first element is at row i, column j
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 3> slice_3x3(int i = 0, int j = 0) const {
-    int const base = i * 3 + j;
-    Matrix<Element, 3, 3> out;
-
-    // Both matrices have three columns, so the nine elements are contiguous.
-    for (int idx = 0; idx < 9; ++idx) {
-      out.data[idx] = data[base + idx];
-    }
-
-    return out;
-  }
-
-  /// Writes the 3-by-3 matrix m into this matrix starting at row i, column j
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_3x3(Matrix<Element, 3, 3> const &m, int i = 0, int j = 0) {
-    int const base = i * 3 + j;
-
-    for (int idx = 0; idx < 9; ++idx) {
-      data[base + idx] = m.data[idx];
-    }
-
-    return *this;
-  }
-    
-  /// Forms a 3-by-3 matrix by horizontally concatenating a 3-by-1 matrix with a 3-by-2 matrix.
-  ///
-  /// Each row of the result is one element of lhs followed by the two elements of the same row of rhs.
-  CUTLASS_HOST_DEVICE
-  static Matrix hcat(Matrix<Element, 3, 1> const & lhs, Matrix<Element, 3, 2> const & rhs) {
-    return Matrix(
-      lhs.at(0, 0), rhs.at(0, 0), rhs.at(0, 1)
-      , lhs.at(1, 0), rhs.at(1, 0), rhs.at(1, 1)
-      , lhs.at(2, 0), rhs.at(2, 0), rhs.at(2, 1));
-  }
-  
-  /// Forms a 3-by-3 matrix by horizontally concatenating a 3-by-2 matrix with a 3-by-1 matrix.
-  ///
-  /// Each row of the result is the two elements of a row of lhs followed by one element of rhs.
-  CUTLASS_HOST_DEVICE
-  static Matrix hcat(Matrix<Element, 3, 2> const & lhs, Matrix<Element, 3, 1> const & rhs) {
-    return Matrix(
-      lhs.at(0, 0), lhs.at(0, 1), rhs.at(0, 0)
-      , lhs.at(1, 0), lhs.at(1, 1), rhs.at(1, 0)
-      , lhs.at(2, 0), lhs.at(2, 1), rhs.at(2, 0));
-  }
-  
-  /// Concatenates this matrix with a 3-by-1 matrix to form a 3-by-4 matrix.
-  /// Delegates to Matrix<Element, 3, 4>::hcat with this matrix as the left operand.
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 4> hcat(Matrix<Element, 3, 1> const & rhs) const {
-    return Matrix<Element, 3, 4>::hcat(*this, rhs);
-  }
-    
-  /// Forms a 3-by-3 matrix by vertically concatenating a 1-by-3 matrix with a 2-by-3 matrix.
-  ///
-  /// The single row of upper becomes row 0; the two rows of lower become rows 1 and 2.
-  CUTLASS_HOST_DEVICE
-  static Matrix vcat(Matrix<Element, 1, 3> const & upper, Matrix<Element, 2, 3> const & lower) {
-    return Matrix(
-      upper.at(0, 0), upper.at(0, 1), upper.at(0, 2)
-      , lower.at(0, 0), lower.at(0, 1), lower.at(0, 2)
-      , lower.at(1, 0), lower.at(1, 1), lower.at(1, 2));
-  }
-  
-  /// Forms a 3-by-3 matrix by vertically concatenating a 2-by-3 matrix with a 1-by-3 matrix.
-  ///
-  /// The two rows of upper become rows 0 and 1; the single row of lower becomes row 2.
-  CUTLASS_HOST_DEVICE
-  static Matrix vcat(Matrix<Element, 2, 3> const & upper, Matrix<Element, 1, 3> const & lower) {
-    return Matrix(
-      upper.at(0, 0), upper.at(0, 1), upper.at(0, 2)
-      , upper.at(1, 0), upper.at(1, 1), upper.at(1, 2)
-      , lower.at(0, 0), lower.at(0, 1), lower.at(0, 2));
-  }
-  
-  /// Concatenates this matrix with a 1-by-3 matrix to form a 4-by-3 matrix.
-  /// Delegates to Matrix<Element, 4, 3>::vcat with this matrix as the upper operand.
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 3> vcat(Matrix<Element, 1, 3> const & rhs) const {
-    return Matrix<Element, 4, 3>::vcat(*this, rhs);
-  }
-    
-  /// Forms a 3-by-3 matrix by concatenating four components arranged as [A B; C D].
-  ///
-  /// A is a scalar (top-left), B is 1-by-2 (top-right), C is 2-by-1 (bottom-left)
-  /// and D is 2-by-2 (bottom-right).
-  CUTLASS_HOST_DEVICE
-  static Matrix block(
-    Element                         A, Matrix<Element, 1, 2> const & B,
-    Matrix<Element, 2, 1> const & C, Matrix<Element, 2, 2> const & D) {
-    return Matrix(
-      A, B.at(0, 0), B.at(0, 1)
-      , C.at(0, 0), D.at(0, 0), D.at(0, 1)
-      , C.at(1, 0), D.at(1, 0), D.at(1, 1)
-    );
-  }
-  
-  /// Forms a 3-by-3 matrix by concatenating four components arranged as [A B; C D].
-  ///
-  /// A is 1-by-2 (top-left), B is a scalar (top-right), C is 2-by-2 (bottom-left)
-  /// and D is 2-by-1 (bottom-right).
-  CUTLASS_HOST_DEVICE
-  static Matrix block(
-    Matrix<Element, 1, 2> const & A, Element                         B,
-    Matrix<Element, 2, 2> const & C, Matrix<Element, 2, 1> const & D) {
-    return Matrix(
-      A.at(0, 0), A.at(0, 1), B
-      , C.at(0, 0), C.at(0, 1), D.at(0, 0)
-      , C.at(1, 0), C.at(1, 1), D.at(1, 0)
-    );
-  }
-  
-  /// Forms a 3-by-3 matrix by concatenating four components arranged as [A B; C D].
-  ///
-  /// A is 2-by-1 (top-left), B is 2-by-2 (top-right), C is a scalar (bottom-left)
-  /// and D is 1-by-2 (bottom-right).
-  CUTLASS_HOST_DEVICE
-  static Matrix block(
-    Matrix<Element, 2, 1> const & A, Matrix<Element, 2, 2> const & B,
-    Element                         C, Matrix<Element, 1, 2> const & D) {
-    return Matrix(
-      A.at(0, 0), B.at(0, 0), B.at(0, 1)
-      , A.at(1, 0), B.at(1, 0), B.at(1, 1)
-      , C, D.at(0, 0), D.at(0, 1)
-    );
-  }
-  
-  /// Forms a 3-by-3 matrix by concatenating four components arranged as [A B; C D].
-  ///
-  /// A is 2-by-2 (top-left), B is 2-by-1 (top-right), C is 1-by-2 (bottom-left)
-  /// and D is a scalar (bottom-right).
-  CUTLASS_HOST_DEVICE
-  static Matrix block(
-    Matrix<Element, 2, 2> const & A, Matrix<Element, 2, 1> const & B,
-    Matrix<Element, 1, 2> const & C, Element                         D) {
-    return Matrix(
-      A.at(0, 0), A.at(0, 1), B.at(0, 0)
-      , A.at(1, 0), A.at(1, 1), B.at(1, 0)
-      , C.at(0, 0), C.at(0, 1), D
-    );
-  }
-  
-  /// Returns a new 3-by-3 matrix holding the elementwise sum of this matrix and rhs
-  CUTLASS_HOST_DEVICE
-  Matrix add(Matrix const &rhs) const {
-
-    Matrix total;
-
-    for (int idx = 0; idx < kCount; ++idx) {
-      total.data[idx] = data[idx] + rhs.data[idx];
-    }
-
-    return total;
-  }
-
-  /// Elementwise sum (3-by-3); forwards to add()
-  CUTLASS_HOST_DEVICE
-  Matrix operator +(Matrix const &rhs) const {
-    return this->add(rhs);
-  }
-
-  /// In-place elementwise sum (3-by-3); returns a reference to this matrix
-  CUTLASS_HOST_DEVICE
-  Matrix & operator +=(Matrix const &rhs) {
-
-    for (int idx = 0; idx < kCount; ++idx) {
-      data[idx] += rhs.data[idx];
-    }
-
-    return *this;
-  }
-        
-  /// Returns a new 3-by-3 matrix holding the elementwise difference of this matrix and rhs
-  CUTLASS_HOST_DEVICE
-  Matrix subtract(Matrix const &rhs) const {
-
-    Matrix difference;
-
-    for (int idx = 0; idx < kCount; ++idx) {
-      difference.data[idx] = data[idx] - rhs.data[idx];
-    }
-
-    return difference;
-  }
-
-  /// Elementwise difference (3-by-3); forwards to subtract()
-  CUTLASS_HOST_DEVICE
-  Matrix operator -(Matrix const &rhs) const {
-    return this->subtract(rhs);
-  }
-
-  /// In-place elementwise difference (3-by-3); returns a reference to this matrix
-  CUTLASS_HOST_DEVICE
-  Matrix & operator -=(Matrix const &rhs) {
-
-    for (int idx = 0; idx < kCount; ++idx) {
-      data[idx] -= rhs.data[idx];
-    }
-
-    return *this;
-  }
-        
-  /// Returns a new 3-by-3 matrix holding the elementwise product of this matrix and rhs
-  CUTLASS_HOST_DEVICE
-  Matrix multiply(Matrix const &rhs) const {
-
-    Matrix scaled;
-
-    for (int idx = 0; idx < kCount; ++idx) {
-      scaled.data[idx] = data[idx] * rhs.data[idx];
-    }
-
-    return scaled;
-  }
-
-  /// Returns a new 3-by-3 matrix with every element multiplied by the scalar s
-  CUTLASS_HOST_DEVICE
-  Matrix multiply(Element const &s) const {
-
-    Matrix scaled;
-
-    for (int idx = 0; idx < kCount; ++idx) {
-      scaled.data[idx] = data[idx] * s;
-    }
-
-    return scaled;
-  }
-
-  /// Scalar multiply (3-by-3); forwards to multiply(Element)
-  CUTLASS_HOST_DEVICE
-  Matrix operator *(Element const &s) const {
-    return this->multiply(s);
-  }
-
-  /// In-place scalar multiply (3-by-3); returns a reference to this matrix
-  CUTLASS_HOST_DEVICE
-  Matrix & operator *=(Element const &s) {
-
-    for (int idx = 0; idx < kCount; ++idx) {
-      data[idx] *= s;
-    }
-
-    return *this;
-  }
-        
-  /// Elementwise divide operator (3-by-3)
-  CUTLASS_HOST_DEVICE
-  Matrix divide(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] / rhs.data[0];
-    result.data[1] = data[1] / rhs.data[1];
-    result.data[2] = data[2] / rhs.data[2];
-
-    result.data[3] = data[3] / rhs.data[3];
-    result.data[4] = data[4] / rhs.data[4];
-    result.data[5] = data[5] / rhs.data[5];
-
-    result.data[6] = data[6] / rhs.data[6];
-    result.data[7] = data[7] / rhs.data[7];
-    result.data[8] = data[8] / rhs.data[8];
-
-    return result;
-  }
-      
-  /// Scalar divide operator (3-by-3)
-  CUTLASS_HOST_DEVICE
-  Matrix divide(Element const &s) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] / s;
-    result.data[1] = data[1] / s;
-    result.data[2] = data[2] / s;
-
-    result.data[3] = data[3] / s;
-    result.data[4] = data[4] / s;
-    result.data[5] = data[5] / s;
-
-    result.data[6] = data[6] / s;
-    result.data[7] = data[7] / s;
-    result.data[8] = data[8] / s;
-
-    return result;
-  }
-
-  /// Scalar divide operator (3-by-3)
-  CUTLASS_HOST_DEVICE
-  Matrix operator /(Element const &s) const {
-    return divide(s);
-  }
-
-  /// Scalar divide operator (3-by-3)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator /=(Element const &s) {
-    
-    data[0] /= s;
-    data[1] /= s;
-    data[2] /= s;
-
-    data[3] /= s;
-    data[4] /= s;
-    data[5] /= s;
-
-    data[6] /= s;
-    data[7] /= s;
-    data[8] /= s;
-
-    return *this;
-  }
-        
-  /// Elementwise divide operator (3-by-3)
-  CUTLASS_HOST_DEVICE
-  Matrix operator /(Matrix const &rhs) const {
-    return divide(rhs);
-  }
-
-  /// Elementwise divide operator (3-by-3)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator /=(Matrix const &rhs) {
-    
-    data[0] /= rhs.data[0];
-    data[1] /= rhs.data[1];
-    data[2] /= rhs.data[2];
-
-    data[3] /= rhs.data[3];
-    data[4] /= rhs.data[4];
-    data[5] /= rhs.data[5];
-
-    data[6] /= rhs.data[6];
-    data[7] /= rhs.data[7];
-    data[8] /= rhs.data[8];
-
-    return *this;
-  }
-        
-  /// Negates each element of the matrix
-  CUTLASS_HOST_DEVICE
-  Matrix operator-() const {
-    Matrix m;
-    
-    m.data[0] = -m.data[0];
-    m.data[1] = -m.data[1];
-    m.data[2] = -m.data[2];
-    m.data[3] = -m.data[3];
-    m.data[4] = -m.data[4];
-    m.data[5] = -m.data[5];
-    m.data[6] = -m.data[6];
-    m.data[7] = -m.data[7];
-    m.data[8] = -m.data[8];
-
-    return m;
-  }
-  
-  /// Matrix product of size 3-by-1-by-3
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 1> product(
-    Matrix<Element, 3, 1> const &rhs,
-    Matrix<Element, 3, 1> accum = Matrix<Element, 3, 1>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[3] * rhs.data[0];
-    accum.data[2] += data[6] * rhs.data[0];
-
-    // k=1
-    accum.data[0] += data[1] * rhs.data[1];
-    accum.data[1] += data[4] * rhs.data[1];
-    accum.data[2] += data[7] * rhs.data[1];
-
-    // k=2
-    accum.data[0] += data[2] * rhs.data[2];
-    accum.data[1] += data[5] * rhs.data[2];
-    accum.data[2] += data[8] * rhs.data[2];
-
-    return accum;
-  }
-
-  /// Matrix product of size 3-by-1-by-3
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 1> operator*(Matrix<Element, 3, 1> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 3-by-2-by-3
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 2> product(
-    Matrix<Element, 3, 2> const &rhs,
-    Matrix<Element, 3, 2> accum = Matrix<Element, 3, 2>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[0] * rhs.data[1];
-    accum.data[2] += data[3] * rhs.data[0];
-    accum.data[3] += data[3] * rhs.data[1];
-    accum.data[4] += data[6] * rhs.data[0];
-    accum.data[5] += data[6] * rhs.data[1];
-
-    // k=1
-    accum.data[0] += data[1] * rhs.data[2];
-    accum.data[1] += data[1] * rhs.data[3];
-    accum.data[2] += data[4] * rhs.data[2];
-    accum.data[3] += data[4] * rhs.data[3];
-    accum.data[4] += data[7] * rhs.data[2];
-    accum.data[5] += data[7] * rhs.data[3];
-
-    // k=2
-    accum.data[0] += data[2] * rhs.data[4];
-    accum.data[1] += data[2] * rhs.data[5];
-    accum.data[2] += data[5] * rhs.data[4];
-    accum.data[3] += data[5] * rhs.data[5];
-    accum.data[4] += data[8] * rhs.data[4];
-    accum.data[5] += data[8] * rhs.data[5];
-
-    return accum;
-  }
-
-  /// Matrix product of size 3-by-2-by-3
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 2> operator*(Matrix<Element, 3, 2> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 3-by-3-by-3
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 3> product(
-    Matrix<Element, 3, 3> const &rhs,
-    Matrix<Element, 3, 3> accum = Matrix<Element, 3, 3>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[0] * rhs.data[1];
-    accum.data[2] += data[0] * rhs.data[2];
-    accum.data[3] += data[3] * rhs.data[0];
-    accum.data[4] += data[3] * rhs.data[1];
-    accum.data[5] += data[3] * rhs.data[2];
-    accum.data[6] += data[6] * rhs.data[0];
-    accum.data[7] += data[6] * rhs.data[1];
-    accum.data[8] += data[6] * rhs.data[2];
-
-    // k=1
-    accum.data[0] += data[1] * rhs.data[3];
-    accum.data[1] += data[1] * rhs.data[4];
-    accum.data[2] += data[1] * rhs.data[5];
-    accum.data[3] += data[4] * rhs.data[3];
-    accum.data[4] += data[4] * rhs.data[4];
-    accum.data[5] += data[4] * rhs.data[5];
-    accum.data[6] += data[7] * rhs.data[3];
-    accum.data[7] += data[7] * rhs.data[4];
-    accum.data[8] += data[7] * rhs.data[5];
-
-    // k=2
-    accum.data[0] += data[2] * rhs.data[6];
-    accum.data[1] += data[2] * rhs.data[7];
-    accum.data[2] += data[2] * rhs.data[8];
-    accum.data[3] += data[5] * rhs.data[6];
-    accum.data[4] += data[5] * rhs.data[7];
-    accum.data[5] += data[5] * rhs.data[8];
-    accum.data[6] += data[8] * rhs.data[6];
-    accum.data[7] += data[8] * rhs.data[7];
-    accum.data[8] += data[8] * rhs.data[8];
-
-    return accum;
-  }
-
-  /// Matrix product of size 3-by-3-by-3
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 3> operator*(Matrix<Element, 3, 3> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 3-by-3-by-3
-  CUTLASS_HOST_DEVICE
-  Matrix & operator*=(Matrix<Element, 3, 3> const &rhs) {
-    *this = product(rhs);
-    return *this;
-  }
-    
-  /// Matrix product of size 3-by-4-by-3
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 4> product(
-    Matrix<Element, 3, 4> const &rhs,
-    Matrix<Element, 3, 4> accum = Matrix<Element, 3, 4>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[0] * rhs.data[1];
-    accum.data[2] += data[0] * rhs.data[2];
-    accum.data[3] += data[0] * rhs.data[3];
-    accum.data[4] += data[3] * rhs.data[0];
-    accum.data[5] += data[3] * rhs.data[1];
-    accum.data[6] += data[3] * rhs.data[2];
-    accum.data[7] += data[3] * rhs.data[3];
-    accum.data[8] += data[6] * rhs.data[0];
-    accum.data[9] += data[6] * rhs.data[1];
-    accum.data[10] += data[6] * rhs.data[2];
-    accum.data[11] += data[6] * rhs.data[3];
-
-    // k=1
-    accum.data[0] += data[1] * rhs.data[4];
-    accum.data[1] += data[1] * rhs.data[5];
-    accum.data[2] += data[1] * rhs.data[6];
-    accum.data[3] += data[1] * rhs.data[7];
-    accum.data[4] += data[4] * rhs.data[4];
-    accum.data[5] += data[4] * rhs.data[5];
-    accum.data[6] += data[4] * rhs.data[6];
-    accum.data[7] += data[4] * rhs.data[7];
-    accum.data[8] += data[7] * rhs.data[4];
-    accum.data[9] += data[7] * rhs.data[5];
-    accum.data[10] += data[7] * rhs.data[6];
-    accum.data[11] += data[7] * rhs.data[7];
-
-    // k=2
-    accum.data[0] += data[2] * rhs.data[8];
-    accum.data[1] += data[2] * rhs.data[9];
-    accum.data[2] += data[2] * rhs.data[10];
-    accum.data[3] += data[2] * rhs.data[11];
-    accum.data[4] += data[5] * rhs.data[8];
-    accum.data[5] += data[5] * rhs.data[9];
-    accum.data[6] += data[5] * rhs.data[10];
-    accum.data[7] += data[5] * rhs.data[11];
-    accum.data[8] += data[8] * rhs.data[8];
-    accum.data[9] += data[8] * rhs.data[9];
-    accum.data[10] += data[8] * rhs.data[10];
-    accum.data[11] += data[8] * rhs.data[11];
-
-    return accum;
-  }
-
-  /// Matrix product of size 3-by-4-by-3
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 4> operator*(Matrix<Element, 3, 4> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Returns the sum of elements
-  CUTLASS_HOST_DEVICE
-  Element sum(Element accum = Element()) const {
-    
-    accum += data[0];
-    accum += data[1];
-    accum += data[2];
-    accum += data[3];
-    accum += data[4];
-    accum += data[5];
-    accum += data[6];
-    accum += data[7];
-    accum += data[8];
-
-    return accum;
-  }  
-
-  /// Returns the sum of squared elements
-  CUTLASS_HOST_DEVICE
-  Element norm(Element accum = Element()) const {
-    
-    accum += data[0] * data[0];
-    accum += data[1] * data[1];
-    accum += data[2] * data[2];
-    accum += data[3] * data[3];
-    accum += data[4] * data[4];
-    accum += data[5] * data[5];
-    accum += data[6] * data[6];
-    accum += data[7] * data[7];
-    accum += data[8] * data[8];
-
-    return accum;
-  }
-
-  /// Returns square root of the norm
-  CUTLASS_HOST_DEVICE
-  Element magnitude() const {
-    return fast_sqrt(norm());
-  }
-
-  /// Returns the sum of diagonal elements
-  CUTLASS_HOST_DEVICE
-  Element trace(Element accum = Element()) const {
-    
-    accum += data[0];
-    accum += data[4];
-    accum += data[8];
-
-    return accum;
-  }
-    
-  /// Returns 3-by-3 rotation matrix around the X axis
-  CUTLASS_HOST_DEVICE
-  static Matrix rotation_X(Element theta) {
-    Matrix m = identity();
-
-    Element c = fast_cos(theta);
-    Element s = fast_sin(theta);
-
-    m.at(1, 1) = c;
-    m.at(1, 2) = -s;
-    m.at(2, 1) = s;
-    m.at(2, 2) = c;
-
-    return m;
-  }
-
-  /// Returns 3-by-3 rotation matrix around the Y axis
-  CUTLASS_HOST_DEVICE
-  static Matrix rotation_Y(Element theta) {
-    Matrix m = identity();
-
-    Element c = fast_cos(theta);
-    Element s = fast_sin(theta);
-
-    m.at(0, 0) = c;
-    m.at(2, 0) = -s;
-    m.at(0, 2) = s;
-    m.at(2, 2) = c;
-
-    return m;
-  }
-
-  /// Returns 3-by-3 rotation matrix around the Z axis
-  CUTLASS_HOST_DEVICE
-  static Matrix rotation_Z(Element theta) {
-    Matrix m = Matrix::identity();
-
-    Element c = fast_cos(theta);
-    Element s = fast_sin(theta);
-
-    m.at(0, 0) = c;
-    m.at(0, 1) = -s;
-    m.at(1, 0) = s;
-    m.at(1, 1) = c;
-
-    return m;
-  }
-
-  /// Returns a 3-by-3 rotation matrix around a unit-length axis
-  CUTLASS_HOST_DEVICE
-  static Matrix rotation(Element theta, Matrix<Element, 3, 1> const &u) {
-    Element x = u.data[0];
-    Element y = u.data[1];
-    Element z = u.data[2];
-
-    Element c = fast_cos(theta);
-    Element s = fast_sin(theta);
-
-    Element one_minus_cos = Element(1) - fast_cos(theta);
-
-    Matrix m;
-
-    m.set_slice_3x3({
-      c + x * x * one_minus_cos, x * y * one_minus_cos - z * s, x * z * one_minus_cos + y * s,
-      y * x * one_minus_cos * z * s, c + y * y * one_minus_cos, y * z * one_minus_cos - x * s,
-      z * x * one_minus_cos - y * s, z * y * one_minus_cos + x * s, c + z * z * one_minus_cos
-    });
-
-    return m;
-  }
-
-  /// Returns a 3-by-3 reflection about the plane specified by the 
-  /// unit-length normal vector n_unit
-  CUTLASS_HOST_DEVICE
-  static Matrix reflection(Matrix<Element, 3, 1> const &n_unit) {
-
-    Element a = n_unit.data[0];
-    Element b = n_unit.data[1];
-    Element c = n_unit.data[2];
-
-    Matrix m = Matrix::identity();
-
-    m.set_slice_3x3({
-      Element(1) - Element(2) * a * a, Element(-2) * a * b, Element(-2) * a * c,
-      Element(-2) * a * b, Element(1) - Element(2) * b * b, Element(-2) * b * c,
-      Element(-2) * a * c, Element(-2) * b * c, Element(1) - Element(2) * c * c
-    });
-
-    return m;
-  }
-
-  /// Computes the determinant of a 3-by-3 matrix
-  CUTLASS_HOST_DEVICE
-  Element determinant(Element accum = Element()) const {
-    
-    accum += at(0, 0) * Matrix<Element, 2, 2>({ at(1, 1), at(1, 2), at(2, 1), at(2, 2) }).determinant();
-    accum -= at(0, 1) * Matrix<Element, 2, 2>({ at(1, 0), at(1, 2), at(2, 0), at(2, 2) }).determinant();
-    accum += at(0, 2) * Matrix<Element, 2, 2>({ at(1, 0), at(1, 1), at(2, 0), at(2, 1) }).determinant();
-
-    return accum;
-  }
-  
-  /// Computes the inverse of a 3-by-3 matrix given
-  /// the matrix's determinant
-  CUTLASS_HOST_DEVICE
-  Matrix inverse(Element det) const {
-    return Matrix(
-      at(1, 1) * at(2, 2) - at(1, 2) * at(2, 1),
-      at(0, 2) * at(2, 1) - at(0, 1) * at(2, 2),
-      at(0, 1) * at(1, 2) - at(0, 2) * at(1, 1),
-
-      at(1, 2) * at(2, 0) - at(1, 0) * at(2, 2),
-      at(0, 0) * at(2, 2) - at(0, 2) * at(2, 0),
-      at(0, 2) * at(1, 0) - at(0, 0) * at(1, 2),
-
-      at(1, 0) * at(2, 1) - at(1, 1) * at(2, 0),
-      at(0, 1) * at(2, 0) - at(0, 0) * at(2, 1),
-      at(0, 0) * at(1, 1) - at(0, 1) * at(1, 0)
-    ) * (Element(1) / det);
-  }
-  /// Computes the inverse of a 3-by-3 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix inverse() const {
-    return inverse(determinant());
-  }
-    
-};
-
-/// Template alias for 3-by-3 matrix
-template <typename Element>
-using Matrix3x3 = Matrix<Element, 3, 3>;
-
-
-/// Free funciton to infer element type from template arguments
-template <typename Element>
-CUTLASS_HOST_DEVICE Matrix3x3<Element> make_Matrix3x3(
-    Element _0_0, Element _0_1, Element _0_2, 
-    Element _1_0, Element _1_1, Element _1_2, 
-    Element _2_0, Element _2_1, Element _2_2
-) {
-  return Matrix3x3<Element>(
-  _0_0, _0_1, _0_2, 
-  _1_0, _1_1, _1_2, 
-  _2_0, _2_1, _2_2 
-  );
-}
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// 3-by-4 matrix template class definition
-template <typename Element_>
-struct Matrix<Element_, 3, 4> {
-
-  //
-  // Type definitions
-  //
-
-  /// Element data type
-  using Element = Element_;
-
-  /// Number of rows in matrix
-  static int const kRows = 3;
-
-  /// Number of columns in matrix
-  static int const kColumns = 4;
-
-  /// Layout of matrix in underlying array
-  using Layout = layout::RowMajor;
-
-  /// Number of elements in matrix
-  static int const kCount = 12;
-
-  //
-  // Data members
-  //
-
-  /// Elements of the matrix in row-major layout
-  Array<Element, kCount> data;
-
-  //
-  // Methods
-  //
-
-  /// Constructs a zero matrix
-  CUTLASS_HOST_DEVICE
-  Matrix() {
-    data.clear();
-  }
-  
-  /// Copy constructor for a 3-by-4 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix(Matrix const &rhs) {
-    data = rhs.data;
-  }
-    
-  /// Constucts a 3-by-4 matrix from scalar elements
-  CUTLASS_HOST_DEVICE
-  Matrix(
-    Element _0_0, Element _0_1, Element _0_2, Element _0_3, 
-    Element _1_0, Element _1_1, Element _1_2, Element _1_3, 
-    Element _2_0, Element _2_1, Element _2_2, Element _2_3
-  ) {
-
-    data[0] = _0_0;  data[1] = _0_1;  data[2] = _0_2;  data[3] = _0_3;
-    data[4] = _1_0;  data[5] = _1_1;  data[6] = _1_2;  data[7] = _1_3;
-    data[8] = _2_0;  data[9] = _2_1;  data[10] = _2_2;  data[11] = _2_3;
-  }
-    
-  /// Constucts a 3-by-4 matrix from row vectors
-  CUTLASS_HOST_DEVICE
-  Matrix(
-    Matrix<Element, 1, 4> const &row_0,
-    Matrix<Element, 1, 4> const &row_1,
-    Matrix<Element, 1, 4> const &row_2
-  ) { 
-    data[0] = row_0.data[0];
-    data[1] = row_0.data[1];
-    data[2] = row_0.data[2];
-    data[3] = row_0.data[3];
-    data[4] = row_1.data[0];
-    data[5] = row_1.data[1];
-    data[6] = row_1.data[2];
-    data[7] = row_1.data[3];
-    data[8] = row_2.data[0];
-    data[9] = row_2.data[1];
-    data[10] = row_2.data[2];
-    data[11] = row_2.data[3];
-  }
-    
-  /// Static method to construct a 3-by-4 matrix from column vectors
-  CUTLASS_HOST_DEVICE
-  static Matrix from_columns(
-    Matrix<Element, 4, 1> const &column_0,
-    Matrix<Element, 4, 1> const &column_1,
-    Matrix<Element, 4, 1> const &column_2,
-    Matrix<Element, 4, 1> const &column_3
-  ) { 
-    Matrix result;
-    
-    result.data[0] = column_0.data[0];
-    result.data[1] = column_1.data[0];
-    result.data[2] = column_2.data[0];
-    result.data[3] = column_3.data[0];
-    result.data[4] = column_0.data[1];
-    result.data[5] = column_1.data[1];
-    result.data[6] = column_2.data[1];
-    result.data[7] = column_3.data[1];
-    result.data[8] = column_0.data[2];
-    result.data[9] = column_1.data[2];
-    result.data[10] = column_2.data[2];
-    result.data[11] = column_3.data[2];
-    return result;
-  }
-    
-  /// Constructs a matrix from a uniform element
-  CUTLASS_HOST_DEVICE
-  static Matrix uniform(Element s) {
-    Matrix m;
-    
-    m.data[0] = s;
-    m.data[1] = s;
-    m.data[2] = s;
-    m.data[3] = s;
-    m.data[4] = s;
-    m.data[5] = s;
-    m.data[6] = s;
-    m.data[7] = s;
-    m.data[8] = s;
-    m.data[9] = s;
-    m.data[10] = s;
-    m.data[11] = s;
-
-    return m;
-  }
-
-  /// Constructs a matrix from a uniform element 1
-  CUTLASS_HOST_DEVICE
-  static Matrix ones() {
-    return uniform(Element(1));
-  }
-
-  /// Constructs a matrix from a uniform element 0
-  CUTLASS_HOST_DEVICE
-  static Matrix zero() {
-    return Matrix();
-  }
-  
-  /// Constructs a matrix from elements along its diagonal
-  CUTLASS_HOST_DEVICE
-  static Matrix from_diagonal(Matrix<Element, 3, 1> const &diag) {
-    Matrix m;
-    
-    m.data[0] = diag.data[0];
-    m.data[4] = diag.data[1];
-    m.data[8] = diag.data[2];
-
-    return m;
-  }
-
-  /// Constructs a matrix from elements along its diagonal
-  CUTLASS_HOST_DEVICE
-  static Matrix from_diagonal(Matrix<Element, 1, 3> const &diag) {
-    Matrix m;
-    
-    m.data[0] = diag.data[0];
-    m.data[4] = diag.data[1];
-    m.data[8] = diag.data[2];
-
-    return m;
-  }
-
-  /// Gets an array of diagonal elements
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 1> diagonal() const {
-    Matrix<Element, 3, 1> diag;
-    
-    diag.data[0] = data[0];
-    diag.data[1] = data[4];
-    diag.data[2] = data[8];
-
-    return diag;
-  }
-    
-  /// Returns a transposed matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 3> transpose() const {
-    Matrix<Element, 4, 3> mt;
-    
-    mt.data[0] = data[0];
-    mt.data[3] = data[1];
-    mt.data[6] = data[2];
-    mt.data[9] = data[3];
-    mt.data[1] = data[4];
-    mt.data[4] = data[5];
-    mt.data[7] = data[6];
-    mt.data[10] = data[7];
-    mt.data[2] = data[8];
-    mt.data[5] = data[9];
-    mt.data[8] = data[10];
-    mt.data[11] = data[11];
-
-    return mt;
-  }
-    
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element at(int i, int j) const {
-    return data[i * 3 + j];
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element & at(int i, int j) {
-    return data[i * 3 + j];
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element at(Coord<2> const &coord) const {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element & at(Coord<2> const &coord) {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element &at(int offset) {
-    return data[offset];
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element at(int offset) const {
-    return data[offset];
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element operator[](Coord<2> const &coord) const {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element & operator[](Coord<2> const &coord) {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element & operator[](int offset) {
-    return data[offset];
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element operator[](int offset) const {
-    return data[offset];
-  }
-  
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 2> slice_1x2(int i = 0, int j = 0) const {
-    Matrix<Element, 1, 2> m;
-    
-    m.data[0] = data[i * 4 + j + 0];
-    m.data[1] = data[i * 4 + j + 1];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_1x2(Matrix<Element, 1, 2> const &m, int i = 0, int j = 0) {
-    
-    data[i * 4 + j + 0] = m.data[0];
-    data[i * 4 + j + 1] = m.data[1];
-
-    return *this;
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 3> slice_1x3(int i = 0, int j = 0) const {
-    Matrix<Element, 1, 3> m;
-    
-    m.data[0] = data[i * 4 + j + 0];
-    m.data[1] = data[i * 4 + j + 1];
-    m.data[2] = data[i * 4 + j + 2];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_1x3(Matrix<Element, 1, 3> const &m, int i = 0, int j = 0) {
-    
-    data[i * 4 + j + 0] = m.data[0];
-    data[i * 4 + j + 1] = m.data[1];
-    data[i * 4 + j + 2] = m.data[2];
-
-    return *this;
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 4> slice_1x4(int i = 0, int j = 0) const {
-    Matrix<Element, 1, 4> m;
-    
-    m.data[0] = data[i * 4 + j + 0];
-    m.data[1] = data[i * 4 + j + 1];
-    m.data[2] = data[i * 4 + j + 2];
-    m.data[3] = data[i * 4 + j + 3];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_1x4(Matrix<Element, 1, 4> const &m, int i = 0, int j = 0) {
-    
-    data[i * 4 + j + 0] = m.data[0];
-    data[i * 4 + j + 1] = m.data[1];
-    data[i * 4 + j + 2] = m.data[2];
-    data[i * 4 + j + 3] = m.data[3];
-
-    return *this;
-  }
-    
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 4> row(int i) const {
-    return slice_1x4(i, 0);
-  }
-
-  CUTLASS_HOST_DEVICE
-  Matrix &set_row(Matrix<Element, 1, 4> const &v, int i = 0) {
-    return set_slice_1x4(v, i, 0);
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 1> slice_2x1(int i = 0, int j = 0) const {
-    Matrix<Element, 2, 1> m;
-    
-    m.data[0] = data[i * 4 + j + 0];
-    m.data[1] = data[i * 4 + j + 4];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_2x1(Matrix<Element, 2, 1> const &m, int i = 0, int j = 0) {
-    
-    data[i * 4 + j + 0] = m.data[0];
-    data[i * 4 + j + 4] = m.data[1];
-
-    return *this;
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 2> slice_2x2(int i = 0, int j = 0) const {
-    Matrix<Element, 2, 2> m;
-    
-    m.data[0] = data[i * 4 + j + 0];
-    m.data[1] = data[i * 4 + j + 1];
-    m.data[2] = data[i * 4 + j + 4];
-    m.data[3] = data[i * 4 + j + 5];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_2x2(Matrix<Element, 2, 2> const &m, int i = 0, int j = 0) {
-    
-    data[i * 4 + j + 0] = m.data[0];
-    data[i * 4 + j + 1] = m.data[1];
-    data[i * 4 + j + 4] = m.data[2];
-    data[i * 4 + j + 5] = m.data[3];
-
-    return *this;
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 3> slice_2x3(int i = 0, int j = 0) const {
-    Matrix<Element, 2, 3> m;
-    
-    m.data[0] = data[i * 4 + j + 0];
-    m.data[1] = data[i * 4 + j + 1];
-    m.data[2] = data[i * 4 + j + 2];
-    m.data[3] = data[i * 4 + j + 4];
-    m.data[4] = data[i * 4 + j + 5];
-    m.data[5] = data[i * 4 + j + 6];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_2x3(Matrix<Element, 2, 3> const &m, int i = 0, int j = 0) {
-    
-    data[i * 4 + j + 0] = m.data[0];
-    data[i * 4 + j + 1] = m.data[1];
-    data[i * 4 + j + 2] = m.data[2];
-    data[i * 4 + j + 4] = m.data[3];
-    data[i * 4 + j + 5] = m.data[4];
-    data[i * 4 + j + 6] = m.data[5];
-
-    return *this;
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 4> slice_2x4(int i = 0, int j = 0) const {
-    Matrix<Element, 2, 4> m;
-    
-    m.data[0] = data[i * 4 + j + 0];
-    m.data[1] = data[i * 4 + j + 1];
-    m.data[2] = data[i * 4 + j + 2];
-    m.data[3] = data[i * 4 + j + 3];
-    m.data[4] = data[i * 4 + j + 4];
-    m.data[5] = data[i * 4 + j + 5];
-    m.data[6] = data[i * 4 + j + 6];
-    m.data[7] = data[i * 4 + j + 7];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_2x4(Matrix<Element, 2, 4> const &m, int i = 0, int j = 0) {
-    
-    data[i * 4 + j + 0] = m.data[0];
-    data[i * 4 + j + 1] = m.data[1];
-    data[i * 4 + j + 2] = m.data[2];
-    data[i * 4 + j + 3] = m.data[3];
-    data[i * 4 + j + 4] = m.data[4];
-    data[i * 4 + j + 5] = m.data[5];
-    data[i * 4 + j + 6] = m.data[6];
-    data[i * 4 + j + 7] = m.data[7];
-
-    return *this;
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 1> slice_3x1(int i = 0, int j = 0) const {
-    Matrix<Element, 3, 1> m;
-    
-    m.data[0] = data[i * 4 + j + 0];
-    m.data[1] = data[i * 4 + j + 4];
-    m.data[2] = data[i * 4 + j + 8];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_3x1(Matrix<Element, 3, 1> const &m, int i = 0, int j = 0) {
-    
-    data[i * 4 + j + 0] = m.data[0];
-    data[i * 4 + j + 4] = m.data[1];
-    data[i * 4 + j + 8] = m.data[2];
-
-    return *this;
-  }
-    
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 1> column(int j) const {
-    return slice_3x1(0, j);
-  }
-
-  CUTLASS_HOST_DEVICE
-  Matrix &set_column(Matrix<Element, 3, 1> const &v, int j =0) {
-    return set_slice_3x1(v, 0, j);
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 2> slice_3x2(int i = 0, int j = 0) const {
-    Matrix<Element, 3, 2> m;
-    
-    m.data[0] = data[i * 4 + j + 0];
-    m.data[1] = data[i * 4 + j + 1];
-    m.data[2] = data[i * 4 + j + 4];
-    m.data[3] = data[i * 4 + j + 5];
-    m.data[4] = data[i * 4 + j + 8];
-    m.data[5] = data[i * 4 + j + 9];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_3x2(Matrix<Element, 3, 2> const &m, int i = 0, int j = 0) {
-    
-    data[i * 4 + j + 0] = m.data[0];
-    data[i * 4 + j + 1] = m.data[1];
-    data[i * 4 + j + 4] = m.data[2];
-    data[i * 4 + j + 5] = m.data[3];
-    data[i * 4 + j + 8] = m.data[4];
-    data[i * 4 + j + 9] = m.data[5];
-
-    return *this;
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 3> slice_3x3(int i = 0, int j = 0) const {
-    Matrix<Element, 3, 3> m;
-    
-    m.data[0] = data[i * 4 + j + 0];
-    m.data[1] = data[i * 4 + j + 1];
-    m.data[2] = data[i * 4 + j + 2];
-    m.data[3] = data[i * 4 + j + 4];
-    m.data[4] = data[i * 4 + j + 5];
-    m.data[5] = data[i * 4 + j + 6];
-    m.data[6] = data[i * 4 + j + 8];
-    m.data[7] = data[i * 4 + j + 9];
-    m.data[8] = data[i * 4 + j + 10];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_3x3(Matrix<Element, 3, 3> const &m, int i = 0, int j = 0) {
-    
-    data[i * 4 + j + 0] = m.data[0];
-    data[i * 4 + j + 1] = m.data[1];
-    data[i * 4 + j + 2] = m.data[2];
-    data[i * 4 + j + 4] = m.data[3];
-    data[i * 4 + j + 5] = m.data[4];
-    data[i * 4 + j + 6] = m.data[5];
-    data[i * 4 + j + 8] = m.data[6];
-    data[i * 4 + j + 9] = m.data[7];
-    data[i * 4 + j + 10] = m.data[8];
-
-    return *this;
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 4> slice_3x4(int i = 0, int j = 0) const {
-    Matrix<Element, 3, 4> m;
-    
-    m.data[0] = data[i * 4 + j + 0];
-    m.data[1] = data[i * 4 + j + 1];
-    m.data[2] = data[i * 4 + j + 2];
-    m.data[3] = data[i * 4 + j + 3];
-    m.data[4] = data[i * 4 + j + 4];
-    m.data[5] = data[i * 4 + j + 5];
-    m.data[6] = data[i * 4 + j + 6];
-    m.data[7] = data[i * 4 + j + 7];
-    m.data[8] = data[i * 4 + j + 8];
-    m.data[9] = data[i * 4 + j + 9];
-    m.data[10] = data[i * 4 + j + 10];
-    m.data[11] = data[i * 4 + j + 11];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_3x4(Matrix<Element, 3, 4> const &m, int i = 0, int j = 0) {
-    
-    data[i * 4 + j + 0] = m.data[0];
-    data[i * 4 + j + 1] = m.data[1];
-    data[i * 4 + j + 2] = m.data[2];
-    data[i * 4 + j + 3] = m.data[3];
-    data[i * 4 + j + 4] = m.data[4];
-    data[i * 4 + j + 5] = m.data[5];
-    data[i * 4 + j + 6] = m.data[6];
-    data[i * 4 + j + 7] = m.data[7];
-    data[i * 4 + j + 8] = m.data[8];
-    data[i * 4 + j + 9] = m.data[9];
-    data[i * 4 + j + 10] = m.data[10];
-    data[i * 4 + j + 11] = m.data[11];
-
-    return *this;
-  }
-    
-  /// Forms a 3-by-4 matrix by horizontally concatenating a 3-by-1 matrix with a 3-by-3 matrix
-  CUTLASS_HOST_DEVICE
-  static Matrix hcat(Matrix<Element, 3, 1> const & lhs, Matrix<Element, 3, 3> const & rhs) {
-    return Matrix(
-      lhs.at(0, 0), rhs.at(0, 0), rhs.at(0, 1), rhs.at(0, 2)
-      , lhs.at(1, 0), rhs.at(1, 0), rhs.at(1, 1), rhs.at(1, 2)
-      , lhs.at(2, 0), rhs.at(2, 0), rhs.at(2, 1), rhs.at(2, 2));
-  }
-  
-  /// Forms a 3-by-4 matrix by horizontally concatenating a 3-by-2 matrix with a 3-by-2 matrix
-  CUTLASS_HOST_DEVICE
-  static Matrix hcat(Matrix<Element, 3, 2> const & lhs, Matrix<Element, 3, 2> const & rhs) {
-    return Matrix(
-      lhs.at(0, 0), lhs.at(0, 1), rhs.at(0, 0), rhs.at(0, 1)
-      , lhs.at(1, 0), lhs.at(1, 1), rhs.at(1, 0), rhs.at(1, 1)
-      , lhs.at(2, 0), lhs.at(2, 1), rhs.at(2, 0), rhs.at(2, 1));
-  }
-  
-  /// Forms a 3-by-4 matrix by horizontally concatenating a 3-by-3 matrix with a 3-by-1 matrix
-  CUTLASS_HOST_DEVICE
-  static Matrix hcat(Matrix<Element, 3, 3> const & lhs, Matrix<Element, 3, 1> const & rhs) {
-    return Matrix(
-      lhs.at(0, 0), lhs.at(0, 1), lhs.at(0, 2), rhs.at(0, 0)
-      , lhs.at(1, 0), lhs.at(1, 1), lhs.at(1, 2), rhs.at(1, 0)
-      , lhs.at(2, 0), lhs.at(2, 1), lhs.at(2, 2), rhs.at(2, 0));
-  }
-  
-  /// Forms a 3-by-4 matrix by vertically concatenating a 1-by-4 matrix with a 2-by-4 matrix
-  CUTLASS_HOST_DEVICE
-  static Matrix vcat(Matrix<Element, 1, 4> const & upper, Matrix<Element, 2, 4> const & lower) {
-    return Matrix(
-      upper.at(0, 0), upper.at(0, 1), upper.at(0, 2), upper.at(0, 3)
-      , lower.at(0, 0), lower.at(0, 1), lower.at(0, 2), lower.at(0, 3)
-      , lower.at(1, 0), lower.at(1, 1), lower.at(1, 2), lower.at(1, 3));
-  }
-  
-  /// Forms a 3-by-4 matrix by vertically concatenating a 2-by-4 matrix with a 1-by-4 matrix
-  CUTLASS_HOST_DEVICE
-  static Matrix vcat(Matrix<Element, 2, 4> const & upper, Matrix<Element, 1, 4> const & lower) {
-    return Matrix(
-      upper.at(0, 0), upper.at(0, 1), upper.at(0, 2), upper.at(0, 3)
-      , upper.at(1, 0), upper.at(1, 1), upper.at(1, 2), upper.at(1, 3)
-      , lower.at(0, 0), lower.at(0, 1), lower.at(0, 2), lower.at(0, 3));
-  }
-  
-  /// Concatenates this matrix with a a 1-by-4 matrix to form a 4-by-4 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 4> vcat(Matrix<Element, 1, 4> const & rhs) const {
-    return Matrix<Element, 4, 4>::vcat(*this, rhs);
-  }
-    
-  /// Forms a 3-by-4 matrix by concatenating four components
-  CUTLASS_HOST_DEVICE
-  static Matrix block(
-    Element                         A, Matrix<Element, 1, 3> const & B,
-    Matrix<Element, 2, 1> const & C, Matrix<Element, 2, 3> const & D) {
-    return Matrix(
-      A, B.at(0, 0), B.at(0, 1), B.at(0, 2)
-      , C.at(0, 0), D.at(0, 0), D.at(0, 1), D.at(0, 2)
-      , C.at(1, 0), D.at(1, 0), D.at(1, 1), D.at(1, 2)
-    );
-  }
-  
-  /// Forms a 3-by-4 matrix by concatenating four components
-  CUTLASS_HOST_DEVICE
-  static Matrix block(
-    Matrix<Element, 1, 2> const & A, Matrix<Element, 1, 2> const & B,
-    Matrix<Element, 2, 2> const & C, Matrix<Element, 2, 2> const & D) {
-    return Matrix(
-      A.at(0, 0), A.at(0, 1), B.at(0, 0), B.at(0, 1)
-      , C.at(0, 0), C.at(0, 1), D.at(0, 0), D.at(0, 1)
-      , C.at(1, 0), C.at(1, 1), D.at(1, 0), D.at(1, 1)
-    );
-  }
-  
-  /// Forms a 3-by-4 matrix by concatenating four components
-  CUTLASS_HOST_DEVICE
-  static Matrix block(
-    Matrix<Element, 1, 3> const & A, Element                         B,
-    Matrix<Element, 2, 3> const & C, Matrix<Element, 2, 1> const & D) {
-    return Matrix(
-      A.at(0, 0), A.at(0, 1), A.at(0, 2), B
-      , C.at(0, 0), C.at(0, 1), C.at(0, 2), D.at(0, 0)
-      , C.at(1, 0), C.at(1, 1), C.at(1, 2), D.at(1, 0)
-    );
-  }
-  
-  /// Forms a 3-by-4 matrix by concatenating four components
-  CUTLASS_HOST_DEVICE
-  static Matrix block(
-    Matrix<Element, 2, 1> const & A, Matrix<Element, 2, 3> const & B,
-    Element                         C, Matrix<Element, 1, 3> const & D) {
-    return Matrix(
-      A.at(0, 0), B.at(0, 0), B.at(0, 1), B.at(0, 2)
-      , A.at(1, 0), B.at(1, 0), B.at(1, 1), B.at(1, 2)
-      , C, D.at(0, 0), D.at(0, 1), D.at(0, 2)
-    );
-  }
-  
-  /// Forms a 3-by-4 matrix by concatenating four components
-  CUTLASS_HOST_DEVICE
-  static Matrix block(
-    Matrix<Element, 2, 2> const & A, Matrix<Element, 2, 2> const & B,
-    Matrix<Element, 1, 2> const & C, Matrix<Element, 1, 2> const & D) {
-    return Matrix(
-      A.at(0, 0), A.at(0, 1), B.at(0, 0), B.at(0, 1)
-      , A.at(1, 0), A.at(1, 1), B.at(1, 0), B.at(1, 1)
-      , C.at(0, 0), C.at(0, 1), D.at(0, 0), D.at(0, 1)
-    );
-  }
-  
-  /// Forms a 3-by-4 matrix by concatenating four components
-  CUTLASS_HOST_DEVICE
-  static Matrix block(
-    Matrix<Element, 2, 3> const & A, Matrix<Element, 2, 1> const & B,
-    Matrix<Element, 1, 3> const & C, Element                         D) {
-    return Matrix(
-      A.at(0, 0), A.at(0, 1), A.at(0, 2), B.at(0, 0)
-      , A.at(1, 0), A.at(1, 1), A.at(1, 2), B.at(1, 0)
-      , C.at(0, 0), C.at(0, 1), C.at(0, 2), D
-    );
-  }
-  
-  /// Elementwise add operator (3-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix add(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] + rhs.data[0];
-    result.data[1] = data[1] + rhs.data[1];
-    result.data[2] = data[2] + rhs.data[2];
-    result.data[3] = data[3] + rhs.data[3];
-
-    result.data[4] = data[4] + rhs.data[4];
-    result.data[5] = data[5] + rhs.data[5];
-    result.data[6] = data[6] + rhs.data[6];
-    result.data[7] = data[7] + rhs.data[7];
-
-    result.data[8] = data[8] + rhs.data[8];
-    result.data[9] = data[9] + rhs.data[9];
-    result.data[10] = data[10] + rhs.data[10];
-    result.data[11] = data[11] + rhs.data[11];
-
-    return result;
-  }
-      
-  /// Elementwise add operator (3-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix operator +(Matrix const &rhs) const {
-    return add(rhs);
-  }
-
-  /// Elementwise add operator (3-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator +=(Matrix const &rhs) {
-    
-    data[0] += rhs.data[0];
-    data[1] += rhs.data[1];
-    data[2] += rhs.data[2];
-    data[3] += rhs.data[3];
-
-    data[4] += rhs.data[4];
-    data[5] += rhs.data[5];
-    data[6] += rhs.data[6];
-    data[7] += rhs.data[7];
-
-    data[8] += rhs.data[8];
-    data[9] += rhs.data[9];
-    data[10] += rhs.data[10];
-    data[11] += rhs.data[11];
-
-    return *this;
-  }
-        
-  /// Elementwise subtract operator (3-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix subtract(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] - rhs.data[0];
-    result.data[1] = data[1] - rhs.data[1];
-    result.data[2] = data[2] - rhs.data[2];
-    result.data[3] = data[3] - rhs.data[3];
-
-    result.data[4] = data[4] - rhs.data[4];
-    result.data[5] = data[5] - rhs.data[5];
-    result.data[6] = data[6] - rhs.data[6];
-    result.data[7] = data[7] - rhs.data[7];
-
-    result.data[8] = data[8] - rhs.data[8];
-    result.data[9] = data[9] - rhs.data[9];
-    result.data[10] = data[10] - rhs.data[10];
-    result.data[11] = data[11] - rhs.data[11];
-
-    return result;
-  }
-      
-  /// Elementwise subtract operator (3-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix operator -(Matrix const &rhs) const {
-    return subtract(rhs);
-  }
-
-  /// Elementwise subtract operator (3-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator -=(Matrix const &rhs) {
-    
-    data[0] -= rhs.data[0];
-    data[1] -= rhs.data[1];
-    data[2] -= rhs.data[2];
-    data[3] -= rhs.data[3];
-
-    data[4] -= rhs.data[4];
-    data[5] -= rhs.data[5];
-    data[6] -= rhs.data[6];
-    data[7] -= rhs.data[7];
-
-    data[8] -= rhs.data[8];
-    data[9] -= rhs.data[9];
-    data[10] -= rhs.data[10];
-    data[11] -= rhs.data[11];
-
-    return *this;
-  }
-        
-  /// Elementwise multiply operator (3-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix multiply(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] * rhs.data[0];
-    result.data[1] = data[1] * rhs.data[1];
-    result.data[2] = data[2] * rhs.data[2];
-    result.data[3] = data[3] * rhs.data[3];
-
-    result.data[4] = data[4] * rhs.data[4];
-    result.data[5] = data[5] * rhs.data[5];
-    result.data[6] = data[6] * rhs.data[6];
-    result.data[7] = data[7] * rhs.data[7];
-
-    result.data[8] = data[8] * rhs.data[8];
-    result.data[9] = data[9] * rhs.data[9];
-    result.data[10] = data[10] * rhs.data[10];
-    result.data[11] = data[11] * rhs.data[11];
-
-    return result;
-  }
-      
-  /// Scalar multiply operator (3-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix multiply(Element const &s) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] * s;
-    result.data[1] = data[1] * s;
-    result.data[2] = data[2] * s;
-    result.data[3] = data[3] * s;
-
-    result.data[4] = data[4] * s;
-    result.data[5] = data[5] * s;
-    result.data[6] = data[6] * s;
-    result.data[7] = data[7] * s;
-
-    result.data[8] = data[8] * s;
-    result.data[9] = data[9] * s;
-    result.data[10] = data[10] * s;
-    result.data[11] = data[11] * s;
-
-    return result;
-  }
-
-  /// Scalar multiply operator (3-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix operator *(Element const &s) const {
-    return multiply(s);
-  }
-
-  /// Scalar multiply operator (3-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator *=(Element const &s) {
-    
-    data[0] *= s;
-    data[1] *= s;
-    data[2] *= s;
-    data[3] *= s;
-
-    data[4] *= s;
-    data[5] *= s;
-    data[6] *= s;
-    data[7] *= s;
-
-    data[8] *= s;
-    data[9] *= s;
-    data[10] *= s;
-    data[11] *= s;
-
-    return *this;
-  }
-        
-  /// Elementwise divide operator (3-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix divide(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] / rhs.data[0];
-    result.data[1] = data[1] / rhs.data[1];
-    result.data[2] = data[2] / rhs.data[2];
-    result.data[3] = data[3] / rhs.data[3];
-
-    result.data[4] = data[4] / rhs.data[4];
-    result.data[5] = data[5] / rhs.data[5];
-    result.data[6] = data[6] / rhs.data[6];
-    result.data[7] = data[7] / rhs.data[7];
-
-    result.data[8] = data[8] / rhs.data[8];
-    result.data[9] = data[9] / rhs.data[9];
-    result.data[10] = data[10] / rhs.data[10];
-    result.data[11] = data[11] / rhs.data[11];
-
-    return result;
-  }
-      
-  /// Scalar divide operator (3-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix divide(Element const &s) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] / s;
-    result.data[1] = data[1] / s;
-    result.data[2] = data[2] / s;
-    result.data[3] = data[3] / s;
-
-    result.data[4] = data[4] / s;
-    result.data[5] = data[5] / s;
-    result.data[6] = data[6] / s;
-    result.data[7] = data[7] / s;
-
-    result.data[8] = data[8] / s;
-    result.data[9] = data[9] / s;
-    result.data[10] = data[10] / s;
-    result.data[11] = data[11] / s;
-
-    return result;
-  }
-
-  /// Scalar divide operator (3-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix operator /(Element const &s) const {
-    return divide(s);
-  }
-
-  /// Scalar divide operator (3-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator /=(Element const &s) {
-    
-    data[0] /= s;
-    data[1] /= s;
-    data[2] /= s;
-    data[3] /= s;
-
-    data[4] /= s;
-    data[5] /= s;
-    data[6] /= s;
-    data[7] /= s;
-
-    data[8] /= s;
-    data[9] /= s;
-    data[10] /= s;
-    data[11] /= s;
-
-    return *this;
-  }
-        
-  /// Elementwise divide operator (3-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix operator /(Matrix const &rhs) const {
-    return divide(rhs);
-  }
-
-  /// Elementwise divide operator (3-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator /=(Matrix const &rhs) {
-    
-    data[0] /= rhs.data[0];
-    data[1] /= rhs.data[1];
-    data[2] /= rhs.data[2];
-    data[3] /= rhs.data[3];
-
-    data[4] /= rhs.data[4];
-    data[5] /= rhs.data[5];
-    data[6] /= rhs.data[6];
-    data[7] /= rhs.data[7];
-
-    data[8] /= rhs.data[8];
-    data[9] /= rhs.data[9];
-    data[10] /= rhs.data[10];
-    data[11] /= rhs.data[11];
-
-    return *this;
-  }
-        
-  /// Negates each element of the matrix
-  CUTLASS_HOST_DEVICE
-  Matrix operator-() const {
-    Matrix m;
-    
-    m.data[0] = -m.data[0];
-    m.data[1] = -m.data[1];
-    m.data[2] = -m.data[2];
-    m.data[3] = -m.data[3];
-    m.data[4] = -m.data[4];
-    m.data[5] = -m.data[5];
-    m.data[6] = -m.data[6];
-    m.data[7] = -m.data[7];
-    m.data[8] = -m.data[8];
-    m.data[9] = -m.data[9];
-    m.data[10] = -m.data[10];
-    m.data[11] = -m.data[11];
-
-    return m;
-  }
-  
-  /// Matrix product of size 3-by-1-by-4
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 1> product(
-    Matrix<Element, 4, 1> const &rhs,
-    Matrix<Element, 3, 1> accum = Matrix<Element, 3, 1>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[4] * rhs.data[0];
-    accum.data[2] += data[8] * rhs.data[0];
-
-    // k=1
-    accum.data[0] += data[1] * rhs.data[1];
-    accum.data[1] += data[5] * rhs.data[1];
-    accum.data[2] += data[9] * rhs.data[1];
-
-    // k=2
-    accum.data[0] += data[2] * rhs.data[2];
-    accum.data[1] += data[6] * rhs.data[2];
-    accum.data[2] += data[10] * rhs.data[2];
-
-    // k=3
-    accum.data[0] += data[3] * rhs.data[3];
-    accum.data[1] += data[7] * rhs.data[3];
-    accum.data[2] += data[11] * rhs.data[3];
-
-    return accum;
-  }
-
-  /// Matrix product of size 3-by-1-by-4
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 1> operator*(Matrix<Element, 4, 1> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 3-by-2-by-4
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 2> product(
-    Matrix<Element, 4, 2> const &rhs,
-    Matrix<Element, 3, 2> accum = Matrix<Element, 3, 2>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[0] * rhs.data[1];
-    accum.data[2] += data[4] * rhs.data[0];
-    accum.data[3] += data[4] * rhs.data[1];
-    accum.data[4] += data[8] * rhs.data[0];
-    accum.data[5] += data[8] * rhs.data[1];
-
-    // k=1
-    accum.data[0] += data[1] * rhs.data[2];
-    accum.data[1] += data[1] * rhs.data[3];
-    accum.data[2] += data[5] * rhs.data[2];
-    accum.data[3] += data[5] * rhs.data[3];
-    accum.data[4] += data[9] * rhs.data[2];
-    accum.data[5] += data[9] * rhs.data[3];
-
-    // k=2
-    accum.data[0] += data[2] * rhs.data[4];
-    accum.data[1] += data[2] * rhs.data[5];
-    accum.data[2] += data[6] * rhs.data[4];
-    accum.data[3] += data[6] * rhs.data[5];
-    accum.data[4] += data[10] * rhs.data[4];
-    accum.data[5] += data[10] * rhs.data[5];
-
-    // k=3
-    accum.data[0] += data[3] * rhs.data[6];
-    accum.data[1] += data[3] * rhs.data[7];
-    accum.data[2] += data[7] * rhs.data[6];
-    accum.data[3] += data[7] * rhs.data[7];
-    accum.data[4] += data[11] * rhs.data[6];
-    accum.data[5] += data[11] * rhs.data[7];
-
-    return accum;
-  }
-
-  /// Matrix product of size 3-by-2-by-4
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 2> operator*(Matrix<Element, 4, 2> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 3-by-3-by-4
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 3> product(
-    Matrix<Element, 4, 3> const &rhs,
-    Matrix<Element, 3, 3> accum = Matrix<Element, 3, 3>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[0] * rhs.data[1];
-    accum.data[2] += data[0] * rhs.data[2];
-    accum.data[3] += data[4] * rhs.data[0];
-    accum.data[4] += data[4] * rhs.data[1];
-    accum.data[5] += data[4] * rhs.data[2];
-    accum.data[6] += data[8] * rhs.data[0];
-    accum.data[7] += data[8] * rhs.data[1];
-    accum.data[8] += data[8] * rhs.data[2];
-
-    // k=1
-    accum.data[0] += data[1] * rhs.data[3];
-    accum.data[1] += data[1] * rhs.data[4];
-    accum.data[2] += data[1] * rhs.data[5];
-    accum.data[3] += data[5] * rhs.data[3];
-    accum.data[4] += data[5] * rhs.data[4];
-    accum.data[5] += data[5] * rhs.data[5];
-    accum.data[6] += data[9] * rhs.data[3];
-    accum.data[7] += data[9] * rhs.data[4];
-    accum.data[8] += data[9] * rhs.data[5];
-
-    // k=2
-    accum.data[0] += data[2] * rhs.data[6];
-    accum.data[1] += data[2] * rhs.data[7];
-    accum.data[2] += data[2] * rhs.data[8];
-    accum.data[3] += data[6] * rhs.data[6];
-    accum.data[4] += data[6] * rhs.data[7];
-    accum.data[5] += data[6] * rhs.data[8];
-    accum.data[6] += data[10] * rhs.data[6];
-    accum.data[7] += data[10] * rhs.data[7];
-    accum.data[8] += data[10] * rhs.data[8];
-
-    // k=3
-    accum.data[0] += data[3] * rhs.data[9];
-    accum.data[1] += data[3] * rhs.data[10];
-    accum.data[2] += data[3] * rhs.data[11];
-    accum.data[3] += data[7] * rhs.data[9];
-    accum.data[4] += data[7] * rhs.data[10];
-    accum.data[5] += data[7] * rhs.data[11];
-    accum.data[6] += data[11] * rhs.data[9];
-    accum.data[7] += data[11] * rhs.data[10];
-    accum.data[8] += data[11] * rhs.data[11];
-
-    return accum;
-  }
-
-  /// Matrix product of size 3-by-3-by-4
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 3> operator*(Matrix<Element, 4, 3> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 3-by-4-by-4
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 4> product(
-    Matrix<Element, 4, 4> const &rhs,
-    Matrix<Element, 3, 4> accum = Matrix<Element, 3, 4>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[0] * rhs.data[1];
-    accum.data[2] += data[0] * rhs.data[2];
-    accum.data[3] += data[0] * rhs.data[3];
-    accum.data[4] += data[4] * rhs.data[0];
-    accum.data[5] += data[4] * rhs.data[1];
-    accum.data[6] += data[4] * rhs.data[2];
-    accum.data[7] += data[4] * rhs.data[3];
-    accum.data[8] += data[8] * rhs.data[0];
-    accum.data[9] += data[8] * rhs.data[1];
-    accum.data[10] += data[8] * rhs.data[2];
-    accum.data[11] += data[8] * rhs.data[3];
-
-    // k=1
-    accum.data[0] += data[1] * rhs.data[4];
-    accum.data[1] += data[1] * rhs.data[5];
-    accum.data[2] += data[1] * rhs.data[6];
-    accum.data[3] += data[1] * rhs.data[7];
-    accum.data[4] += data[5] * rhs.data[4];
-    accum.data[5] += data[5] * rhs.data[5];
-    accum.data[6] += data[5] * rhs.data[6];
-    accum.data[7] += data[5] * rhs.data[7];
-    accum.data[8] += data[9] * rhs.data[4];
-    accum.data[9] += data[9] * rhs.data[5];
-    accum.data[10] += data[9] * rhs.data[6];
-    accum.data[11] += data[9] * rhs.data[7];
-
-    // k=2
-    accum.data[0] += data[2] * rhs.data[8];
-    accum.data[1] += data[2] * rhs.data[9];
-    accum.data[2] += data[2] * rhs.data[10];
-    accum.data[3] += data[2] * rhs.data[11];
-    accum.data[4] += data[6] * rhs.data[8];
-    accum.data[5] += data[6] * rhs.data[9];
-    accum.data[6] += data[6] * rhs.data[10];
-    accum.data[7] += data[6] * rhs.data[11];
-    accum.data[8] += data[10] * rhs.data[8];
-    accum.data[9] += data[10] * rhs.data[9];
-    accum.data[10] += data[10] * rhs.data[10];
-    accum.data[11] += data[10] * rhs.data[11];
-
-    // k=3
-    accum.data[0] += data[3] * rhs.data[12];
-    accum.data[1] += data[3] * rhs.data[13];
-    accum.data[2] += data[3] * rhs.data[14];
-    accum.data[3] += data[3] * rhs.data[15];
-    accum.data[4] += data[7] * rhs.data[12];
-    accum.data[5] += data[7] * rhs.data[13];
-    accum.data[6] += data[7] * rhs.data[14];
-    accum.data[7] += data[7] * rhs.data[15];
-    accum.data[8] += data[11] * rhs.data[12];
-    accum.data[9] += data[11] * rhs.data[13];
-    accum.data[10] += data[11] * rhs.data[14];
-    accum.data[11] += data[11] * rhs.data[15];
-
-    return accum;
-  }
-
-  /// Matrix product of size 3-by-4-by-4
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 4> operator*(Matrix<Element, 4, 4> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 3-by-4-by-4
-  CUTLASS_HOST_DEVICE
-  Matrix & operator*=(Matrix<Element, 4, 4> const &rhs) {
-    *this = product(rhs);
-    return *this;
-  }
-    
-  /// Returns the sum of elements
-  CUTLASS_HOST_DEVICE
-  Element sum(Element accum = Element()) const {
-    
-    accum += data[0];
-    accum += data[1];
-    accum += data[2];
-    accum += data[3];
-    accum += data[4];
-    accum += data[5];
-    accum += data[6];
-    accum += data[7];
-    accum += data[8];
-    accum += data[9];
-    accum += data[10];
-    accum += data[11];
-
-    return accum;
-  }  
-
-  /// Returns the sum of squared elements
-  CUTLASS_HOST_DEVICE
-  Element norm(Element accum = Element()) const {
-    
-    accum += data[0] * data[0];
-    accum += data[1] * data[1];
-    accum += data[2] * data[2];
-    accum += data[3] * data[3];
-    accum += data[4] * data[4];
-    accum += data[5] * data[5];
-    accum += data[6] * data[6];
-    accum += data[7] * data[7];
-    accum += data[8] * data[8];
-    accum += data[9] * data[9];
-    accum += data[10] * data[10];
-    accum += data[11] * data[11];
-
-    return accum;
-  }
-
-  /// Returns square root of the norm
-  CUTLASS_HOST_DEVICE
-  Element magnitude() const {
-    return fast_sqrt(norm());
-  }
-
-  /// Returns the sum of diagonal elements
-  CUTLASS_HOST_DEVICE
-  Element trace(Element accum = Element()) const {
-    
-    accum += data[0];
-    accum += data[5];
-    accum += data[10];
-
-    return accum;
-  }
-    
-};
-
-/// Template alias for 3-by-4 matrix
-template <typename Element>
-using Matrix3x4 = Matrix<Element, 3, 4>;
-
-
-/// Free funciton to infer element type from template arguments
-template <typename Element>
-CUTLASS_HOST_DEVICE Matrix3x4<Element> make_Matrix3x4(
-    Element _0_0, Element _0_1, Element _0_2, Element _0_3, 
-    Element _1_0, Element _1_1, Element _1_2, Element _1_3, 
-    Element _2_0, Element _2_1, Element _2_2, Element _2_3
-) {
-  return Matrix3x4<Element>(
-  _0_0, _0_1, _0_2, _0_3, 
-  _1_0, _1_1, _1_2, _1_3, 
-  _2_0, _2_1, _2_2, _2_3 
-  );
-}
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// 4-by-1 matrix template class definition
-template <typename Element_>
-struct Matrix<Element_, 4, 1> {
-
-  //
-  // Type definitions
-  //
-
-  /// Element data type
-  using Element = Element_;
-
-  /// Number of rows in matrix
-  static int const kRows = 4;
-
-  /// Number of columns in matrix
-  static int const kColumns = 1;
-
-  /// Layout of matrix in underlying array
-  using Layout = layout::RowMajor;
-
-  /// Number of elements in matrix
-  static int const kCount = 4;
-
-  //
-  // Data members
-  //
-
-  /// Elements of the matrix in row-major layout
-  Array<Element, kCount> data;
-
-  //
-  // Methods
-  //
-
-  /// Constructs a zero matrix
-  CUTLASS_HOST_DEVICE
-  Matrix() {
-    data.clear();
-  }
-  
-  /// Copy constructor for a 4-by-1 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix(Matrix const &rhs) {
-    data = rhs.data;
-  }
-    
-  /// Constucts a 4-by-1 matrix from scalar elements
-  CUTLASS_HOST_DEVICE
-  Matrix(
-    Element _0_0, 
-    Element _1_0, 
-    Element _2_0, 
-    Element _3_0
-  ) {
-
-    data[0] = _0_0;
-    data[1] = _1_0;
-    data[2] = _2_0;
-    data[3] = _3_0;
-  }
-    
-  /// Constructs a matrix from a uniform element
-  CUTLASS_HOST_DEVICE
-  static Matrix uniform(Element s) {
-    Matrix m;
-    
-    m.data[0] = s;
-    m.data[1] = s;
-    m.data[2] = s;
-    m.data[3] = s;
-
-    return m;
-  }
-
-  /// Constructs a matrix from a uniform element 1
-  CUTLASS_HOST_DEVICE
-  static Matrix ones() {
-    return uniform(Element(1));
-  }
-
-  /// Constructs a matrix from a uniform element 0
-  CUTLASS_HOST_DEVICE
-  static Matrix zero() {
-    return Matrix();
-  }
-  
-  /// Returns a transposed matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 4> transpose() const {
-    Matrix<Element, 1, 4> mt;
-    
-    mt.data[0] = data[0];
-    mt.data[1] = data[1];
-    mt.data[2] = data[2];
-    mt.data[3] = data[3];
-
-    return mt;
-  }
-    
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element at(int i, int j) const {
-    return data[i * 4 + j];
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element & at(int i, int j) {
-    return data[i * 4 + j];
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element at(Coord<2> const &coord) const {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element & at(Coord<2> const &coord) {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element &at(int offset) {
-    return data[offset];
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element at(int offset) const {
-    return data[offset];
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element operator[](Coord<2> const &coord) const {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element & operator[](Coord<2> const &coord) {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element & operator[](int offset) {
-    return data[offset];
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element operator[](int offset) const {
-    return data[offset];
-  }
-  
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 1> slice_2x1(int i = 0, int j = 0) const {
-    Matrix<Element, 2, 1> m;
-    
-    m.data[0] = data[i * 1 + j + 0];
-    m.data[1] = data[i * 1 + j + 1];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_2x1(Matrix<Element, 2, 1> const &m, int i = 0, int j = 0) {
-    
-    data[i * 1 + j + 0] = m.data[0];
-    data[i * 1 + j + 1] = m.data[1];
-
-    return *this;
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 1> slice_3x1(int i = 0, int j = 0) const {
-    Matrix<Element, 3, 1> m;
-    
-    m.data[0] = data[i * 1 + j + 0];
-    m.data[1] = data[i * 1 + j + 1];
-    m.data[2] = data[i * 1 + j + 2];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_3x1(Matrix<Element, 3, 1> const &m, int i = 0, int j = 0) {
-    
-    data[i * 1 + j + 0] = m.data[0];
-    data[i * 1 + j + 1] = m.data[1];
-    data[i * 1 + j + 2] = m.data[2];
-
-    return *this;
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 1> slice_4x1(int i = 0, int j = 0) const {
-    Matrix<Element, 4, 1> m;
-    
-    m.data[0] = data[i * 1 + j + 0];
-    m.data[1] = data[i * 1 + j + 1];
-    m.data[2] = data[i * 1 + j + 2];
-    m.data[3] = data[i * 1 + j + 3];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_4x1(Matrix<Element, 4, 1> const &m, int i = 0, int j = 0) {
-    
-    data[i * 1 + j + 0] = m.data[0];
-    data[i * 1 + j + 1] = m.data[1];
-    data[i * 1 + j + 2] = m.data[2];
-    data[i * 1 + j + 3] = m.data[3];
-
-    return *this;
-  }
-    
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 1> column(int j) const {
-    return slice_4x1(0, j);
-  }
-
-  CUTLASS_HOST_DEVICE
-  Matrix &set_column(Matrix<Element, 4, 1> const &v, int j =0) {
-    return set_slice_4x1(v, 0, j);
-  }
-    
-  /// Concatenates this matrix with a a 4-by-1 matrix to form a 4-by-2 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 2> hcat(Matrix<Element, 4, 1> const & rhs) const {
-    return Matrix<Element, 4, 2>::hcat(*this, rhs);
-  }
-    
-  /// Concatenates this matrix with a a 4-by-2 matrix to form a 4-by-3 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 3> hcat(Matrix<Element, 4, 2> const & rhs) const {
-    return Matrix<Element, 4, 3>::hcat(*this, rhs);
-  }
-    
-  /// Concatenates this matrix with a a 4-by-3 matrix to form a 4-by-4 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 4> hcat(Matrix<Element, 4, 3> const & rhs) const {
-    return Matrix<Element, 4, 4>::hcat(*this, rhs);
-  }
-    
-  /// Forms a 4-by-1 matrix by vertically concatenating an Element with a 3-by-1 matrix
-  CUTLASS_HOST_DEVICE
-  static Matrix vcat(Element upper, Matrix<Element, 3, 1> const & lower) {
-    return Matrix(
-      upper
-      , lower.at(0, 0)
-      , lower.at(1, 0)
-      , lower.at(2, 0));
-  }
-  
-  /// Forms a 4-by-1 matrix by vertically concatenating a 2-by-1 matrix with a 2-by-1 matrix
-  CUTLASS_HOST_DEVICE
-  static Matrix vcat(Matrix<Element, 2, 1> const & upper, Matrix<Element, 2, 1> const & lower) {
-    return Matrix(
-      upper.at(0, 0)
-      , upper.at(1, 0)
-      , lower.at(0, 0)
-      , lower.at(1, 0));
-  }
-  
-  /// Forms a 4-by-1 matrix by vertically concatenating a 3-by-1 matrix with an Element
-  CUTLASS_HOST_DEVICE
-  static Matrix vcat(Matrix<Element, 3, 1> const & upper, Element lower) {
-    return Matrix(
-      upper.at(0, 0)
-      , upper.at(1, 0)
-      , upper.at(2, 0)
-      , lower);
-  }
-  
-  /// Elementwise add operator (4-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix add(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] + rhs.data[0];
-
-    result.data[1] = data[1] + rhs.data[1];
-
-    result.data[2] = data[2] + rhs.data[2];
-
-    result.data[3] = data[3] + rhs.data[3];
-
-    return result;
-  }
-      
-  /// Elementwise add operator (4-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix operator +(Matrix const &rhs) const {
-    return add(rhs);
-  }
-
-  /// Elementwise add operator (4-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator +=(Matrix const &rhs) {
-    
-    data[0] += rhs.data[0];
-
-    data[1] += rhs.data[1];
-
-    data[2] += rhs.data[2];
-
-    data[3] += rhs.data[3];
-
-    return *this;
-  }
-        
-  /// Elementwise subtract operator (4-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix subtract(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] - rhs.data[0];
-
-    result.data[1] = data[1] - rhs.data[1];
-
-    result.data[2] = data[2] - rhs.data[2];
-
-    result.data[3] = data[3] - rhs.data[3];
-
-    return result;
-  }
-      
-  /// Elementwise subtract operator (4-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix operator -(Matrix const &rhs) const {
-    return subtract(rhs);
-  }
-
-  /// Elementwise subtract operator (4-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator -=(Matrix const &rhs) {
-    
-    data[0] -= rhs.data[0];
-
-    data[1] -= rhs.data[1];
-
-    data[2] -= rhs.data[2];
-
-    data[3] -= rhs.data[3];
-
-    return *this;
-  }
-        
-  /// Elementwise multiply operator (4-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix multiply(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] * rhs.data[0];
-
-    result.data[1] = data[1] * rhs.data[1];
-
-    result.data[2] = data[2] * rhs.data[2];
-
-    result.data[3] = data[3] * rhs.data[3];
-
-    return result;
-  }
-      
-  /// Scalar multiply operator (4-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix multiply(Element const &s) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] * s;
-
-    result.data[1] = data[1] * s;
-
-    result.data[2] = data[2] * s;
-
-    result.data[3] = data[3] * s;
-
-    return result;
-  }
-
-  /// Scalar multiply operator (4-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix operator *(Element const &s) const {
-    return multiply(s);
-  }
-
-  /// Scalar multiply operator (4-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator *=(Element const &s) {
-    
-    data[0] *= s;
-
-    data[1] *= s;
-
-    data[2] *= s;
-
-    data[3] *= s;
-
-    return *this;
-  }
-        
-  /// Elementwise divide operator (4-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix divide(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] / rhs.data[0];
-
-    result.data[1] = data[1] / rhs.data[1];
-
-    result.data[2] = data[2] / rhs.data[2];
-
-    result.data[3] = data[3] / rhs.data[3];
-
-    return result;
-  }
-      
-  /// Scalar divide operator (4-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix divide(Element const &s) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] / s;
-
-    result.data[1] = data[1] / s;
-
-    result.data[2] = data[2] / s;
-
-    result.data[3] = data[3] / s;
-
-    return result;
-  }
-
-  /// Scalar divide operator (4-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix operator /(Element const &s) const {
-    return divide(s);
-  }
-
-  /// Scalar divide operator (4-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator /=(Element const &s) {
-    
-    data[0] /= s;
-
-    data[1] /= s;
-
-    data[2] /= s;
-
-    data[3] /= s;
-
-    return *this;
-  }
-        
-  /// Elementwise divide operator (4-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix operator /(Matrix const &rhs) const {
-    return divide(rhs);
-  }
-
-  /// Elementwise divide operator (4-by-1)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator /=(Matrix const &rhs) {
-    
-    data[0] /= rhs.data[0];
-
-    data[1] /= rhs.data[1];
-
-    data[2] /= rhs.data[2];
-
-    data[3] /= rhs.data[3];
-
-    return *this;
-  }
-        
-  /// Negates each element of the matrix
-  CUTLASS_HOST_DEVICE
-  Matrix operator-() const {
-    Matrix m;
-    
-    m.data[0] = -m.data[0];
-    m.data[1] = -m.data[1];
-    m.data[2] = -m.data[2];
-    m.data[3] = -m.data[3];
-
-    return m;
-  }
-  
-  /// Matrix product of size 4-by-1-by-1
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 1> product(
-    Matrix<Element, 1, 1> const &rhs,
-    Matrix<Element, 4, 1> accum = Matrix<Element, 4, 1>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[1] * rhs.data[0];
-    accum.data[2] += data[2] * rhs.data[0];
-    accum.data[3] += data[3] * rhs.data[0];
-
-    return accum;
-  }
-
-  /// Matrix product of size 4-by-1-by-1
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 1> operator*(Matrix<Element, 1, 1> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 4-by-1-by-1
-  CUTLASS_HOST_DEVICE
-  Matrix & operator*=(Matrix<Element, 1, 1> const &rhs) {
-    *this = product(rhs);
-    return *this;
-  }
-    
-  /// Matrix product of size 4-by-2-by-1
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 2> product(
-    Matrix<Element, 1, 2> const &rhs,
-    Matrix<Element, 4, 2> accum = Matrix<Element, 4, 2>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[0] * rhs.data[1];
-    accum.data[2] += data[1] * rhs.data[0];
-    accum.data[3] += data[1] * rhs.data[1];
-    accum.data[4] += data[2] * rhs.data[0];
-    accum.data[5] += data[2] * rhs.data[1];
-    accum.data[6] += data[3] * rhs.data[0];
-    accum.data[7] += data[3] * rhs.data[1];
-
-    return accum;
-  }
-
-  /// Matrix product of size 4-by-2-by-1
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 2> operator*(Matrix<Element, 1, 2> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 4-by-3-by-1
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 3> product(
-    Matrix<Element, 1, 3> const &rhs,
-    Matrix<Element, 4, 3> accum = Matrix<Element, 4, 3>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[0] * rhs.data[1];
-    accum.data[2] += data[0] * rhs.data[2];
-    accum.data[3] += data[1] * rhs.data[0];
-    accum.data[4] += data[1] * rhs.data[1];
-    accum.data[5] += data[1] * rhs.data[2];
-    accum.data[6] += data[2] * rhs.data[0];
-    accum.data[7] += data[2] * rhs.data[1];
-    accum.data[8] += data[2] * rhs.data[2];
-    accum.data[9] += data[3] * rhs.data[0];
-    accum.data[10] += data[3] * rhs.data[1];
-    accum.data[11] += data[3] * rhs.data[2];
-
-    return accum;
-  }
-
-  /// Matrix product of size 4-by-3-by-1
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 3> operator*(Matrix<Element, 1, 3> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 4-by-4-by-1
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 4> product(
-    Matrix<Element, 1, 4> const &rhs,
-    Matrix<Element, 4, 4> accum = Matrix<Element, 4, 4>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[0] * rhs.data[1];
-    accum.data[2] += data[0] * rhs.data[2];
-    accum.data[3] += data[0] * rhs.data[3];
-    accum.data[4] += data[1] * rhs.data[0];
-    accum.data[5] += data[1] * rhs.data[1];
-    accum.data[6] += data[1] * rhs.data[2];
-    accum.data[7] += data[1] * rhs.data[3];
-    accum.data[8] += data[2] * rhs.data[0];
-    accum.data[9] += data[2] * rhs.data[1];
-    accum.data[10] += data[2] * rhs.data[2];
-    accum.data[11] += data[2] * rhs.data[3];
-    accum.data[12] += data[3] * rhs.data[0];
-    accum.data[13] += data[3] * rhs.data[1];
-    accum.data[14] += data[3] * rhs.data[2];
-    accum.data[15] += data[3] * rhs.data[3];
-
-    return accum;
-  }
-
-  /// Matrix product of size 4-by-4-by-1
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 4> operator*(Matrix<Element, 1, 4> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Dot product of vectors with extent 4
-  CUTLASS_HOST_DEVICE
-  Element dot(Matrix<Element, 4, 1> const &rhs, Element accum = Element()) const {
-    
-    accum += data[0] * rhs.data[0];
-    accum += data[1] * rhs.data[1];
-    accum += data[2] * rhs.data[2];
-    accum += data[3] * rhs.data[3];
-    return accum;
-  }
-
-  /// Dot product of vectors with extent 4
-  CUTLASS_HOST_DEVICE
-  Element dot(Matrix<Element, 1, 4> const &rhs, Element accum = Element()) const {
-    
-    accum += data[0] * rhs.data[0];
-    accum += data[1] * rhs.data[1];
-    accum += data[2] * rhs.data[2];
-    accum += data[3] * rhs.data[3];
-    return accum;
-  }
-  
-  /// Returns the sum of elements
-  CUTLASS_HOST_DEVICE
-  Element sum(Element accum = Element()) const {
-    
-    accum += data[0];
-    accum += data[1];
-    accum += data[2];
-    accum += data[3];
-
-    return accum;
-  }  
-
-  /// Returns the sum of squared elements
-  CUTLASS_HOST_DEVICE
-  Element norm(Element accum = Element()) const {
-    
-    accum += data[0] * data[0];
-    accum += data[1] * data[1];
-    accum += data[2] * data[2];
-    accum += data[3] * data[3];
-
-    return accum;
-  }
-
-  /// Returns square root of the norm
-  CUTLASS_HOST_DEVICE
-  Element magnitude() const {
-    return fast_sqrt(norm());
-  }
-
-  /// Returns the sum of diagonal elements
-  CUTLASS_HOST_DEVICE
-  Element trace(Element accum = Element()) const {
-    
-    accum += data[0];
-
-    return accum;
-  }
-    
-};
-
-/// Template alias for 4-by-1 matrix
-template <typename Element>
-using Matrix4x1 = Matrix<Element, 4, 1>;
-
-
-/// Free funciton to infer element type from template arguments
-template <typename Element>
-CUTLASS_HOST_DEVICE Matrix4x1<Element> make_Matrix4x1(
-    Element _0_0, 
-    Element _1_0, 
-    Element _2_0, 
-    Element _3_0
-) {
-  return Matrix4x1<Element>(
-  _0_0, 
-  _1_0, 
-  _2_0, 
-  _3_0 
-  );
-}
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// 4-by-2 matrix template class definition
-template <typename Element_>
-struct Matrix<Element_, 4, 2> {
-
-  //
-  // Type definitions
-  //
-
-  /// Element data type
-  using Element = Element_;
-
-  /// Number of rows in matrix
-  static int const kRows = 4;
-
-  /// Number of columns in matrix
-  static int const kColumns = 2;
-
-  /// Layout of matrix in underlying array
-  using Layout = layout::RowMajor;
-
-  /// Number of elements in matrix
-  static int const kCount = 8;
-
-  //
-  // Data members
-  //
-
-  /// Elements of the matrix in row-major layout
-  Array<Element, kCount> data;
-
-  //
-  // Methods
-  //
-
-  /// Constructs a zero matrix
-  CUTLASS_HOST_DEVICE
-  Matrix() {
-    data.clear();
-  }
-  
-  /// Copy constructor for a 4-by-2 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix(Matrix const &rhs) {
-    data = rhs.data;
-  }
-    
-  /// Constucts a 4-by-2 matrix from scalar elements
-  CUTLASS_HOST_DEVICE
-  Matrix(
-    Element _0_0, Element _0_1, 
-    Element _1_0, Element _1_1, 
-    Element _2_0, Element _2_1, 
-    Element _3_0, Element _3_1
-  ) {
-
-    data[0] = _0_0;  data[1] = _0_1;
-    data[2] = _1_0;  data[3] = _1_1;
-    data[4] = _2_0;  data[5] = _2_1;
-    data[6] = _3_0;  data[7] = _3_1;
-  }
-    
-  /// Constucts a 4-by-2 matrix from row vectors
-  CUTLASS_HOST_DEVICE
-  Matrix(
-    Matrix<Element, 1, 2> const &row_0,
-    Matrix<Element, 1, 2> const &row_1,
-    Matrix<Element, 1, 2> const &row_2,
-    Matrix<Element, 1, 2> const &row_3
-  ) { 
-    data[0] = row_0.data[0];
-    data[1] = row_0.data[1];
-    data[2] = row_1.data[0];
-    data[3] = row_1.data[1];
-    data[4] = row_2.data[0];
-    data[5] = row_2.data[1];
-    data[6] = row_3.data[0];
-    data[7] = row_3.data[1];
-  }
-    
-  /// Static method to construct a 4-by-2 matrix from column vectors
-  CUTLASS_HOST_DEVICE
-  static Matrix from_columns(
-    Matrix<Element, 2, 1> const &column_0,
-    Matrix<Element, 2, 1> const &column_1
-  ) { 
-    Matrix result;
-    
-    result.data[0] = column_0.data[0];
-    result.data[1] = column_1.data[0];
-    result.data[2] = column_0.data[1];
-    result.data[3] = column_1.data[1];
-    result.data[4] = column_0.data[2];
-    result.data[5] = column_1.data[2];
-    result.data[6] = column_0.data[3];
-    result.data[7] = column_1.data[3];
-    return result;
-  }
-    
-  /// Constructs a matrix from a uniform element
-  CUTLASS_HOST_DEVICE
-  static Matrix uniform(Element s) {
-    Matrix m;
-    
-    m.data[0] = s;
-    m.data[1] = s;
-    m.data[2] = s;
-    m.data[3] = s;
-    m.data[4] = s;
-    m.data[5] = s;
-    m.data[6] = s;
-    m.data[7] = s;
-
-    return m;
-  }
-
-  /// Constructs a matrix from a uniform element 1
-  CUTLASS_HOST_DEVICE
-  static Matrix ones() {
-    return uniform(Element(1));
-  }
-
-  /// Constructs a matrix from a uniform element 0
-  CUTLASS_HOST_DEVICE
-  static Matrix zero() {
-    return Matrix();
-  }
-  
-  /// Constructs a matrix from elements along its diagonal
-  CUTLASS_HOST_DEVICE
-  static Matrix from_diagonal(Matrix<Element, 2, 1> const &diag) {
-    Matrix m;
-    
-    m.data[0] = diag.data[0];
-    m.data[5] = diag.data[1];
-    m.data[10] = diag.data[2];
-    m.data[15] = diag.data[3];
-
-    return m;
-  }
-
-  /// Constructs a matrix from elements along its diagonal
-  CUTLASS_HOST_DEVICE
-  static Matrix from_diagonal(Matrix<Element, 1, 2> const &diag) {
-    Matrix m;
-    
-    m.data[0] = diag.data[0];
-    m.data[5] = diag.data[1];
-    m.data[10] = diag.data[2];
-    m.data[15] = diag.data[3];
-
-    return m;
-  }
-
-  /// Gets an array of diagonal elements
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 1> diagonal() const {
-    Matrix<Element, 2, 1> diag;
-    
-    diag.data[0] = data[0];
-    diag.data[1] = data[5];
-    diag.data[2] = data[10];
-    diag.data[3] = data[15];
-
-    return diag;
-  }
-    
-  /// Returns a transposed matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 4> transpose() const {
-    Matrix<Element, 2, 4> mt;
-    
-    mt.data[0] = data[0];
-    mt.data[4] = data[1];
-    mt.data[1] = data[2];
-    mt.data[5] = data[3];
-    mt.data[2] = data[4];
-    mt.data[6] = data[5];
-    mt.data[3] = data[6];
-    mt.data[7] = data[7];
-
-    return mt;
-  }
-    
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element at(int i, int j) const {
-    return data[i * 4 + j];
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element & at(int i, int j) {
-    return data[i * 4 + j];
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element at(Coord<2> const &coord) const {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element & at(Coord<2> const &coord) {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element &at(int offset) {
-    return data[offset];
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element at(int offset) const {
-    return data[offset];
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element operator[](Coord<2> const &coord) const {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element & operator[](Coord<2> const &coord) {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element & operator[](int offset) {
-    return data[offset];
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element operator[](int offset) const {
-    return data[offset];
-  }
-  
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 2> slice_1x2(int i = 0, int j = 0) const {
-    Matrix<Element, 1, 2> m;
-    
-    m.data[0] = data[i * 2 + j + 0];
-    m.data[1] = data[i * 2 + j + 1];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_1x2(Matrix<Element, 1, 2> const &m, int i = 0, int j = 0) {
-    
-    data[i * 2 + j + 0] = m.data[0];
-    data[i * 2 + j + 1] = m.data[1];
-
-    return *this;
-  }
-    
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 2> row(int i) const {
-    return slice_1x2(i, 0);
-  }
-
-  CUTLASS_HOST_DEVICE
-  Matrix &set_row(Matrix<Element, 1, 2> const &v, int i = 0) {
-    return set_slice_1x2(v, i, 0);
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 1> slice_2x1(int i = 0, int j = 0) const {
-    Matrix<Element, 2, 1> m;
-    
-    m.data[0] = data[i * 2 + j + 0];
-    m.data[1] = data[i * 2 + j + 2];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_2x1(Matrix<Element, 2, 1> const &m, int i = 0, int j = 0) {
-    
-    data[i * 2 + j + 0] = m.data[0];
-    data[i * 2 + j + 2] = m.data[1];
-
-    return *this;
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 2> slice_2x2(int i = 0, int j = 0) const {
-    Matrix<Element, 2, 2> m;
-    
-    m.data[0] = data[i * 2 + j + 0];
-    m.data[1] = data[i * 2 + j + 1];
-    m.data[2] = data[i * 2 + j + 2];
-    m.data[3] = data[i * 2 + j + 3];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_2x2(Matrix<Element, 2, 2> const &m, int i = 0, int j = 0) {
-    
-    data[i * 2 + j + 0] = m.data[0];
-    data[i * 2 + j + 1] = m.data[1];
-    data[i * 2 + j + 2] = m.data[2];
-    data[i * 2 + j + 3] = m.data[3];
-
-    return *this;
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 1> slice_3x1(int i = 0, int j = 0) const {
-    Matrix<Element, 3, 1> m;
-    
-    m.data[0] = data[i * 2 + j + 0];
-    m.data[1] = data[i * 2 + j + 2];
-    m.data[2] = data[i * 2 + j + 4];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_3x1(Matrix<Element, 3, 1> const &m, int i = 0, int j = 0) {
-    
-    data[i * 2 + j + 0] = m.data[0];
-    data[i * 2 + j + 2] = m.data[1];
-    data[i * 2 + j + 4] = m.data[2];
-
-    return *this;
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 2> slice_3x2(int i = 0, int j = 0) const {
-    Matrix<Element, 3, 2> m;
-    
-    m.data[0] = data[i * 2 + j + 0];
-    m.data[1] = data[i * 2 + j + 1];
-    m.data[2] = data[i * 2 + j + 2];
-    m.data[3] = data[i * 2 + j + 3];
-    m.data[4] = data[i * 2 + j + 4];
-    m.data[5] = data[i * 2 + j + 5];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_3x2(Matrix<Element, 3, 2> const &m, int i = 0, int j = 0) {
-    
-    data[i * 2 + j + 0] = m.data[0];
-    data[i * 2 + j + 1] = m.data[1];
-    data[i * 2 + j + 2] = m.data[2];
-    data[i * 2 + j + 3] = m.data[3];
-    data[i * 2 + j + 4] = m.data[4];
-    data[i * 2 + j + 5] = m.data[5];
-
-    return *this;
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 1> slice_4x1(int i = 0, int j = 0) const {
-    Matrix<Element, 4, 1> m;
-    
-    m.data[0] = data[i * 2 + j + 0];
-    m.data[1] = data[i * 2 + j + 2];
-    m.data[2] = data[i * 2 + j + 4];
-    m.data[3] = data[i * 2 + j + 6];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_4x1(Matrix<Element, 4, 1> const &m, int i = 0, int j = 0) {
-    
-    data[i * 2 + j + 0] = m.data[0];
-    data[i * 2 + j + 2] = m.data[1];
-    data[i * 2 + j + 4] = m.data[2];
-    data[i * 2 + j + 6] = m.data[3];
-
-    return *this;
-  }
-    
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 1> column(int j) const {
-    return slice_4x1(0, j);
-  }
-
-  CUTLASS_HOST_DEVICE
-  Matrix &set_column(Matrix<Element, 4, 1> const &v, int j =0) {
-    return set_slice_4x1(v, 0, j);
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 2> slice_4x2(int i = 0, int j = 0) const {
-    Matrix<Element, 4, 2> m;
-    
-    m.data[0] = data[i * 2 + j + 0];
-    m.data[1] = data[i * 2 + j + 1];
-    m.data[2] = data[i * 2 + j + 2];
-    m.data[3] = data[i * 2 + j + 3];
-    m.data[4] = data[i * 2 + j + 4];
-    m.data[5] = data[i * 2 + j + 5];
-    m.data[6] = data[i * 2 + j + 6];
-    m.data[7] = data[i * 2 + j + 7];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_4x2(Matrix<Element, 4, 2> const &m, int i = 0, int j = 0) {
-    
-    data[i * 2 + j + 0] = m.data[0];
-    data[i * 2 + j + 1] = m.data[1];
-    data[i * 2 + j + 2] = m.data[2];
-    data[i * 2 + j + 3] = m.data[3];
-    data[i * 2 + j + 4] = m.data[4];
-    data[i * 2 + j + 5] = m.data[5];
-    data[i * 2 + j + 6] = m.data[6];
-    data[i * 2 + j + 7] = m.data[7];
-
-    return *this;
-  }
-    
-  /// Forms a 4-by-2 matrix by horizontally concatenating a 4-by-1 matrix with a 4-by-1 matrix
-  CUTLASS_HOST_DEVICE
-  static Matrix hcat(Matrix<Element, 4, 1> const & lhs, Matrix<Element, 4, 1> const & rhs) {
-    return Matrix(
-      lhs.at(0, 0), rhs.at(0, 0)
-      , lhs.at(1, 0), rhs.at(1, 0)
-      , lhs.at(2, 0), rhs.at(2, 0)
-      , lhs.at(3, 0), rhs.at(3, 0));
-  }
-  
-  /// Concatenates this matrix with a a 4-by-1 matrix to form a 4-by-3 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 3> hcat(Matrix<Element, 4, 1> const & rhs) const {
-    return Matrix<Element, 4, 3>::hcat(*this, rhs);
-  }
-    
-  /// Concatenates this matrix with a a 4-by-2 matrix to form a 4-by-4 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 4> hcat(Matrix<Element, 4, 2> const & rhs) const {
-    return Matrix<Element, 4, 4>::hcat(*this, rhs);
-  }
-    
-  /// Forms a 4-by-2 matrix by vertically concatenating a 1-by-2 matrix with a 3-by-2 matrix
-  CUTLASS_HOST_DEVICE
-  static Matrix vcat(Matrix<Element, 1, 2> const & upper, Matrix<Element, 3, 2> const & lower) {
-    return Matrix(
-      upper.at(0, 0), upper.at(0, 1)
-      , lower.at(0, 0), lower.at(0, 1)
-      , lower.at(1, 0), lower.at(1, 1)
-      , lower.at(2, 0), lower.at(2, 1));
-  }
-  
-  /// Forms a 4-by-2 matrix by vertically concatenating a 2-by-2 matrix with a 2-by-2 matrix
-  CUTLASS_HOST_DEVICE
-  static Matrix vcat(Matrix<Element, 2, 2> const & upper, Matrix<Element, 2, 2> const & lower) {
-    return Matrix(
-      upper.at(0, 0), upper.at(0, 1)
-      , upper.at(1, 0), upper.at(1, 1)
-      , lower.at(0, 0), lower.at(0, 1)
-      , lower.at(1, 0), lower.at(1, 1));
-  }
-  
-  /// Forms a 4-by-2 matrix by vertically concatenating a 3-by-2 matrix with a 1-by-2 matrix
-  CUTLASS_HOST_DEVICE
-  static Matrix vcat(Matrix<Element, 3, 2> const & upper, Matrix<Element, 1, 2> const & lower) {
-    return Matrix(
-      upper.at(0, 0), upper.at(0, 1)
-      , upper.at(1, 0), upper.at(1, 1)
-      , upper.at(2, 0), upper.at(2, 1)
-      , lower.at(0, 0), lower.at(0, 1));
-  }
-  
-  /// Forms a 4-by-2 matrix by concatenating four components
-  CUTLASS_HOST_DEVICE
-  static Matrix block(
-    Element                         A, Element                         B,
-    Matrix<Element, 3, 1> const & C, Matrix<Element, 3, 1> const & D) {
-    return Matrix(
-      A, B
-      , C.at(0, 0), D.at(0, 0)
-      , C.at(1, 0), D.at(1, 0)
-      , C.at(2, 0), D.at(2, 0)
-    );
-  }
-  
-  /// Forms a 4-by-2 matrix by concatenating four components
-  CUTLASS_HOST_DEVICE
-  static Matrix block(
-    Matrix<Element, 2, 1> const & A, Matrix<Element, 2, 1> const & B,
-    Matrix<Element, 2, 1> const & C, Matrix<Element, 2, 1> const & D) {
-    return Matrix(
-      A.at(0, 0), B.at(0, 0)
-      , A.at(1, 0), B.at(1, 0)
-      , C.at(0, 0), D.at(0, 0)
-      , C.at(1, 0), D.at(1, 0)
-    );
-  }
-  
-  /// Forms a 4-by-2 matrix by concatenating four components
-  CUTLASS_HOST_DEVICE
-  static Matrix block(
-    Matrix<Element, 3, 1> const & A, Matrix<Element, 3, 1> const & B,
-    Element                         C, Element                         D) {
-    return Matrix(
-      A.at(0, 0), B.at(0, 0)
-      , A.at(1, 0), B.at(1, 0)
-      , A.at(2, 0), B.at(2, 0)
-      , C, D
-    );
-  }
-  
-  /// Elementwise add operator (4-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix add(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] + rhs.data[0];
-    result.data[1] = data[1] + rhs.data[1];
-
-    result.data[2] = data[2] + rhs.data[2];
-    result.data[3] = data[3] + rhs.data[3];
-
-    result.data[4] = data[4] + rhs.data[4];
-    result.data[5] = data[5] + rhs.data[5];
-
-    result.data[6] = data[6] + rhs.data[6];
-    result.data[7] = data[7] + rhs.data[7];
-
-    return result;
-  }
-      
-  /// Elementwise add operator (4-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix operator +(Matrix const &rhs) const {
-    return add(rhs);
-  }
-
-  /// Elementwise add operator (4-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator +=(Matrix const &rhs) {
-    
-    data[0] += rhs.data[0];
-    data[1] += rhs.data[1];
-
-    data[2] += rhs.data[2];
-    data[3] += rhs.data[3];
-
-    data[4] += rhs.data[4];
-    data[5] += rhs.data[5];
-
-    data[6] += rhs.data[6];
-    data[7] += rhs.data[7];
-
-    return *this;
-  }
-        
-  /// Elementwise subtract operator (4-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix subtract(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] - rhs.data[0];
-    result.data[1] = data[1] - rhs.data[1];
-
-    result.data[2] = data[2] - rhs.data[2];
-    result.data[3] = data[3] - rhs.data[3];
-
-    result.data[4] = data[4] - rhs.data[4];
-    result.data[5] = data[5] - rhs.data[5];
-
-    result.data[6] = data[6] - rhs.data[6];
-    result.data[7] = data[7] - rhs.data[7];
-
-    return result;
-  }
-      
-  /// Elementwise subtract operator (4-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix operator -(Matrix const &rhs) const {
-    return subtract(rhs);
-  }
-
-  /// Elementwise subtract operator (4-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator -=(Matrix const &rhs) {
-    
-    data[0] -= rhs.data[0];
-    data[1] -= rhs.data[1];
-
-    data[2] -= rhs.data[2];
-    data[3] -= rhs.data[3];
-
-    data[4] -= rhs.data[4];
-    data[5] -= rhs.data[5];
-
-    data[6] -= rhs.data[6];
-    data[7] -= rhs.data[7];
-
-    return *this;
-  }
-        
-  /// Elementwise multiply operator (4-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix multiply(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] * rhs.data[0];
-    result.data[1] = data[1] * rhs.data[1];
-
-    result.data[2] = data[2] * rhs.data[2];
-    result.data[3] = data[3] * rhs.data[3];
-
-    result.data[4] = data[4] * rhs.data[4];
-    result.data[5] = data[5] * rhs.data[5];
-
-    result.data[6] = data[6] * rhs.data[6];
-    result.data[7] = data[7] * rhs.data[7];
-
-    return result;
-  }
-      
-  /// Scalar multiply operator (4-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix multiply(Element const &s) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] * s;
-    result.data[1] = data[1] * s;
-
-    result.data[2] = data[2] * s;
-    result.data[3] = data[3] * s;
-
-    result.data[4] = data[4] * s;
-    result.data[5] = data[5] * s;
-
-    result.data[6] = data[6] * s;
-    result.data[7] = data[7] * s;
-
-    return result;
-  }
-
-  /// Scalar multiply operator (4-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix operator *(Element const &s) const {
-    return multiply(s);
-  }
-
-  /// Scalar multiply operator (4-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator *=(Element const &s) {
-    
-    data[0] *= s;
-    data[1] *= s;
-
-    data[2] *= s;
-    data[3] *= s;
-
-    data[4] *= s;
-    data[5] *= s;
-
-    data[6] *= s;
-    data[7] *= s;
-
-    return *this;
-  }
-        
-  /// Elementwise divide operator (4-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix divide(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] / rhs.data[0];
-    result.data[1] = data[1] / rhs.data[1];
-
-    result.data[2] = data[2] / rhs.data[2];
-    result.data[3] = data[3] / rhs.data[3];
-
-    result.data[4] = data[4] / rhs.data[4];
-    result.data[5] = data[5] / rhs.data[5];
-
-    result.data[6] = data[6] / rhs.data[6];
-    result.data[7] = data[7] / rhs.data[7];
-
-    return result;
-  }
-      
-  /// Scalar divide operator (4-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix divide(Element const &s) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] / s;
-    result.data[1] = data[1] / s;
-
-    result.data[2] = data[2] / s;
-    result.data[3] = data[3] / s;
-
-    result.data[4] = data[4] / s;
-    result.data[5] = data[5] / s;
-
-    result.data[6] = data[6] / s;
-    result.data[7] = data[7] / s;
-
-    return result;
-  }
-
-  /// Scalar divide operator (4-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix operator /(Element const &s) const {
-    return divide(s);
-  }
-
-  /// Scalar divide operator (4-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator /=(Element const &s) {
-    
-    data[0] /= s;
-    data[1] /= s;
-
-    data[2] /= s;
-    data[3] /= s;
-
-    data[4] /= s;
-    data[5] /= s;
-
-    data[6] /= s;
-    data[7] /= s;
-
-    return *this;
-  }
-        
-  /// Elementwise divide operator (4-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix operator /(Matrix const &rhs) const {
-    return divide(rhs);
-  }
-
-  /// Elementwise divide operator (4-by-2)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator /=(Matrix const &rhs) {
-    
-    data[0] /= rhs.data[0];
-    data[1] /= rhs.data[1];
-
-    data[2] /= rhs.data[2];
-    data[3] /= rhs.data[3];
-
-    data[4] /= rhs.data[4];
-    data[5] /= rhs.data[5];
-
-    data[6] /= rhs.data[6];
-    data[7] /= rhs.data[7];
-
-    return *this;
-  }
-        
-  /// Negates each element of the matrix
-  CUTLASS_HOST_DEVICE
-  Matrix operator-() const {
-    Matrix m;
-    
-    m.data[0] = -m.data[0];
-    m.data[1] = -m.data[1];
-    m.data[2] = -m.data[2];
-    m.data[3] = -m.data[3];
-    m.data[4] = -m.data[4];
-    m.data[5] = -m.data[5];
-    m.data[6] = -m.data[6];
-    m.data[7] = -m.data[7];
-
-    return m;
-  }
-  
-  /// Matrix product of size 4-by-1-by-2
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 1> product(
-    Matrix<Element, 2, 1> const &rhs,
-    Matrix<Element, 4, 1> accum = Matrix<Element, 4, 1>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[2] * rhs.data[0];
-    accum.data[2] += data[4] * rhs.data[0];
-    accum.data[3] += data[6] * rhs.data[0];
-
-    // k=1
-    accum.data[0] += data[1] * rhs.data[1];
-    accum.data[1] += data[3] * rhs.data[1];
-    accum.data[2] += data[5] * rhs.data[1];
-    accum.data[3] += data[7] * rhs.data[1];
-
-    return accum;
-  }
-
-  /// Matrix product of size 4-by-1-by-2
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 1> operator*(Matrix<Element, 2, 1> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 4-by-2-by-2
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 2> product(
-    Matrix<Element, 2, 2> const &rhs,
-    Matrix<Element, 4, 2> accum = Matrix<Element, 4, 2>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[0] * rhs.data[1];
-    accum.data[2] += data[2] * rhs.data[0];
-    accum.data[3] += data[2] * rhs.data[1];
-    accum.data[4] += data[4] * rhs.data[0];
-    accum.data[5] += data[4] * rhs.data[1];
-    accum.data[6] += data[6] * rhs.data[0];
-    accum.data[7] += data[6] * rhs.data[1];
-
-    // k=1
-    accum.data[0] += data[1] * rhs.data[2];
-    accum.data[1] += data[1] * rhs.data[3];
-    accum.data[2] += data[3] * rhs.data[2];
-    accum.data[3] += data[3] * rhs.data[3];
-    accum.data[4] += data[5] * rhs.data[2];
-    accum.data[5] += data[5] * rhs.data[3];
-    accum.data[6] += data[7] * rhs.data[2];
-    accum.data[7] += data[7] * rhs.data[3];
-
-    return accum;
-  }
-
-  /// Matrix product of size 4-by-2-by-2
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 2> operator*(Matrix<Element, 2, 2> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 4-by-2-by-2
-  CUTLASS_HOST_DEVICE
-  Matrix & operator*=(Matrix<Element, 2, 2> const &rhs) {
-    *this = product(rhs);
-    return *this;
-  }
-    
-  /// Matrix product of size 4-by-3-by-2
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 3> product(
-    Matrix<Element, 2, 3> const &rhs,
-    Matrix<Element, 4, 3> accum = Matrix<Element, 4, 3>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[0] * rhs.data[1];
-    accum.data[2] += data[0] * rhs.data[2];
-    accum.data[3] += data[2] * rhs.data[0];
-    accum.data[4] += data[2] * rhs.data[1];
-    accum.data[5] += data[2] * rhs.data[2];
-    accum.data[6] += data[4] * rhs.data[0];
-    accum.data[7] += data[4] * rhs.data[1];
-    accum.data[8] += data[4] * rhs.data[2];
-    accum.data[9] += data[6] * rhs.data[0];
-    accum.data[10] += data[6] * rhs.data[1];
-    accum.data[11] += data[6] * rhs.data[2];
-
-    // k=1
-    accum.data[0] += data[1] * rhs.data[3];
-    accum.data[1] += data[1] * rhs.data[4];
-    accum.data[2] += data[1] * rhs.data[5];
-    accum.data[3] += data[3] * rhs.data[3];
-    accum.data[4] += data[3] * rhs.data[4];
-    accum.data[5] += data[3] * rhs.data[5];
-    accum.data[6] += data[5] * rhs.data[3];
-    accum.data[7] += data[5] * rhs.data[4];
-    accum.data[8] += data[5] * rhs.data[5];
-    accum.data[9] += data[7] * rhs.data[3];
-    accum.data[10] += data[7] * rhs.data[4];
-    accum.data[11] += data[7] * rhs.data[5];
-
-    return accum;
-  }
-
-  /// Matrix product of size 4-by-3-by-2
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 3> operator*(Matrix<Element, 2, 3> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 4-by-4-by-2
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 4> product(
-    Matrix<Element, 2, 4> const &rhs,
-    Matrix<Element, 4, 4> accum = Matrix<Element, 4, 4>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[0] * rhs.data[1];
-    accum.data[2] += data[0] * rhs.data[2];
-    accum.data[3] += data[0] * rhs.data[3];
-    accum.data[4] += data[2] * rhs.data[0];
-    accum.data[5] += data[2] * rhs.data[1];
-    accum.data[6] += data[2] * rhs.data[2];
-    accum.data[7] += data[2] * rhs.data[3];
-    accum.data[8] += data[4] * rhs.data[0];
-    accum.data[9] += data[4] * rhs.data[1];
-    accum.data[10] += data[4] * rhs.data[2];
-    accum.data[11] += data[4] * rhs.data[3];
-    accum.data[12] += data[6] * rhs.data[0];
-    accum.data[13] += data[6] * rhs.data[1];
-    accum.data[14] += data[6] * rhs.data[2];
-    accum.data[15] += data[6] * rhs.data[3];
-
-    // k=1
-    accum.data[0] += data[1] * rhs.data[4];
-    accum.data[1] += data[1] * rhs.data[5];
-    accum.data[2] += data[1] * rhs.data[6];
-    accum.data[3] += data[1] * rhs.data[7];
-    accum.data[4] += data[3] * rhs.data[4];
-    accum.data[5] += data[3] * rhs.data[5];
-    accum.data[6] += data[3] * rhs.data[6];
-    accum.data[7] += data[3] * rhs.data[7];
-    accum.data[8] += data[5] * rhs.data[4];
-    accum.data[9] += data[5] * rhs.data[5];
-    accum.data[10] += data[5] * rhs.data[6];
-    accum.data[11] += data[5] * rhs.data[7];
-    accum.data[12] += data[7] * rhs.data[4];
-    accum.data[13] += data[7] * rhs.data[5];
-    accum.data[14] += data[7] * rhs.data[6];
-    accum.data[15] += data[7] * rhs.data[7];
-
-    return accum;
-  }
-
-  /// Matrix product of size 4-by-4-by-2
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 4> operator*(Matrix<Element, 2, 4> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Returns the sum of elements
-  CUTLASS_HOST_DEVICE
-  Element sum(Element accum = Element()) const {
-    
-    accum += data[0];
-    accum += data[1];
-    accum += data[2];
-    accum += data[3];
-    accum += data[4];
-    accum += data[5];
-    accum += data[6];
-    accum += data[7];
-
-    return accum;
-  }  
-
-  /// Returns the sum of squared elements
-  CUTLASS_HOST_DEVICE
-  Element norm(Element accum = Element()) const {
-    
-    accum += data[0] * data[0];
-    accum += data[1] * data[1];
-    accum += data[2] * data[2];
-    accum += data[3] * data[3];
-    accum += data[4] * data[4];
-    accum += data[5] * data[5];
-    accum += data[6] * data[6];
-    accum += data[7] * data[7];
-
-    return accum;
-  }
-
-  /// Returns square root of the norm
-  CUTLASS_HOST_DEVICE
-  Element magnitude() const {
-    return fast_sqrt(norm());
-  }
-
-  /// Returns the sum of the diagonal elements (storage offsets 0 and 3),
-  /// added onto the seed value `accum`
-  CUTLASS_HOST_DEVICE
-  Element trace(Element accum = Element()) const {
-
-    // Visit offset 0, then offset 3.
-    for (int offset = 0; offset <= 3; offset += 3) {
-      accum += data[offset];
-    }
-
-    return accum;
-  }
-    
-};
-
-/// Template alias for 4-by-2 matrix: Matrix4x2<Element> is shorthand for
-/// Matrix<Element, 4, 2>.
-template <typename Element>
-using Matrix4x2 = Matrix<Element, 4, 2>;
-
-
-/// Free function to infer the element type from its arguments. Forwards the
-/// eight scalars, in the order given, to the Matrix4x2<Element> constructor.
-template <typename Element>
-CUTLASS_HOST_DEVICE Matrix4x2<Element> make_Matrix4x2(
-    Element _0_0, Element _0_1,
-    Element _1_0, Element _1_1,
-    Element _2_0, Element _2_1,
-    Element _3_0, Element _3_1
-) {
-  Matrix4x2<Element> result(
-    _0_0, _0_1,
-    _1_0, _1_1,
-    _2_0, _2_1,
-    _3_0, _3_1
-  );
-  return result;
-}
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// 4-by-3 matrix template class definition
-template <typename Element_>
-struct Matrix<Element_, 4, 3> {
-
-  //
-  // Type definitions
-  //
-
-  /// Element data type
-  using Element = Element_;
-
-  /// Number of rows in matrix
-  static int const kRows = 4;
-
-  /// Number of columns in matrix
-  static int const kColumns = 3;
-
-  /// Layout of matrix in underlying array
-  using Layout = layout::RowMajor;
-
-  /// Number of elements in matrix
-  static int const kCount = 12;
-
-  //
-  // Data members
-  //
-
-  /// Elements of the matrix in row-major layout
-  Array<Element, kCount> data;
-
-  //
-  // Methods
-  //
-
-  /// Constructs a zero matrix
-  /// (calls clear() on the underlying element array).
-  CUTLASS_HOST_DEVICE
-  Matrix() {
-    data.clear();
-  }
-  
-  /// Copy constructor for a 4-by-3 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix(Matrix const &rhs) {
-    // Copies the whole element array from rhs in a single assignment.
-    data = rhs.data;
-  }
-    
-  /// Constructs a 4-by-3 matrix from twelve scalar elements; argument _r_c is
-  /// stored at offset r * 3 + c of the underlying array.
-  CUTLASS_HOST_DEVICE
-  Matrix(
-    Element _0_0, Element _0_1, Element _0_2,
-    Element _1_0, Element _1_1, Element _1_2,
-    Element _2_0, Element _2_1, Element _2_2,
-    Element _3_0, Element _3_1, Element _3_2
-  ) {
-    // Row 0
-    data[0] = _0_0;
-    data[1] = _0_1;
-    data[2] = _0_2;
-
-    // Row 1
-    data[3] = _1_0;
-    data[4] = _1_1;
-    data[5] = _1_2;
-
-    // Row 2
-    data[6] = _2_0;
-    data[7] = _2_1;
-    data[8] = _2_2;
-
-    // Row 3
-    data[9] = _3_0;
-    data[10] = _3_1;
-    data[11] = _3_2;
-  }
-    
-  /// Constructs a 4-by-3 matrix from four 1-by-3 row vectors; row_r fills
-  /// offsets r * 3 through r * 3 + 2.
-  CUTLASS_HOST_DEVICE
-  Matrix(
-    Matrix<Element, 1, 3> const &row_0,
-    Matrix<Element, 1, 3> const &row_1,
-    Matrix<Element, 1, 3> const &row_2,
-    Matrix<Element, 1, 3> const &row_3
-  ) {
-    for (int c = 0; c < 3; ++c) {
-      data[0 + c] = row_0.data[c];
-      data[3 + c] = row_1.data[c];
-      data[6 + c] = row_2.data[c];
-      data[9 + c] = row_3.data[c];
-    }
-  }
-    
-  /// Static method to construct a 4-by-3 matrix from three 4-element column
-  /// vectors. Element r of column_c is stored at offset r * kColumns + c.
-  CUTLASS_HOST_DEVICE
-  static Matrix from_columns(
-    Matrix<Element, 4, 1> const &column_0,
-    Matrix<Element, 4, 1> const &column_1,
-    Matrix<Element, 4, 1> const &column_2
-  ) {
-    Matrix result;
-
-    for (int r = 0; r < kRows; ++r) {
-      result.data[r * kColumns + 0] = column_0.data[r];
-      result.data[r * kColumns + 1] = column_1.data[r];
-      result.data[r * kColumns + 2] = column_2.data[r];
-    }
-
-    return result;
-  }
-
-  /// Overload kept for source compatibility with callers that pass 3-element
-  /// columns. A 3-by-1 column cannot supply the fourth row, so only rows 0..2
-  /// are copied and row 3 keeps the zeros written by the default constructor.
-  /// The previous implementation read column_N.data[3], which is past the end
-  /// of a 3-element array.
-  CUTLASS_HOST_DEVICE
-  static Matrix from_columns(
-    Matrix<Element, 3, 1> const &column_0,
-    Matrix<Element, 3, 1> const &column_1,
-    Matrix<Element, 3, 1> const &column_2
-  ) {
-    Matrix result;
-
-    for (int r = 0; r < 3; ++r) {
-      result.data[r * kColumns + 0] = column_0.data[r];
-      result.data[r * kColumns + 1] = column_1.data[r];
-      result.data[r * kColumns + 2] = column_2.data[r];
-    }
-
-    return result;
-  }
-    
-  /// Constructs a matrix in which every element equals `s`
-  CUTLASS_HOST_DEVICE
-  static Matrix uniform(Element s) {
-    Matrix filled;
-
-    for (int idx = 0; idx < 12; ++idx) {
-      filled.data[idx] = s;
-    }
-
-    return filled;
-  }
-
-  /// Constructs a matrix whose elements all equal Element(1)
-  CUTLASS_HOST_DEVICE
-  static Matrix ones() {
-    Element const one = Element(1);
-    return uniform(one);
-  }
-
-  /// Returns a default-constructed matrix (the default constructor clears the
-  /// element array)
-  CUTLASS_HOST_DEVICE
-  static Matrix zero() {
-    Matrix cleared;
-    return cleared;
-  }
-  
-  /// Constructs a matrix from elements along its diagonal.
-  ///
-  /// A 4-by-3 matrix has three diagonal elements; element (k, k) lives at
-  /// offset k * kColumns + k, i.e. 0, 4 and 8. All other elements keep the
-  /// value given by the default constructor.
-  CUTLASS_HOST_DEVICE
-  static Matrix from_diagonal(Matrix<Element, 3, 1> const &diag) {
-    Matrix m;
-
-    // The earlier offsets 5/10/15 used a 4x4 stride: they hit off-diagonal
-    // elements, wrote past the 12-element array, and read diag.data[3].
-    for (int k = 0; k < 3; ++k) {
-      m.data[k * kColumns + k] = diag.data[k];
-    }
-
-    return m;
-  }
-
-  /// Constructs a matrix from elements along its diagonal (row-vector input).
-  ///
-  /// A 4-by-3 matrix has three diagonal elements; element (k, k) lives at
-  /// offset k * kColumns + k, i.e. 0, 4 and 8. All other elements keep the
-  /// value given by the default constructor.
-  CUTLASS_HOST_DEVICE
-  static Matrix from_diagonal(Matrix<Element, 1, 3> const &diag) {
-    Matrix m;
-
-    // The earlier offsets 5/10/15 used a 4x4 stride: they hit off-diagonal
-    // elements, wrote past the 12-element array, and read diag.data[3].
-    for (int k = 0; k < 3; ++k) {
-      m.data[k * kColumns + k] = diag.data[k];
-    }
-
-    return m;
-  }
-
-  /// Gets the diagonal elements as a 3-by-1 matrix.
-  ///
-  /// Element (k, k) of this 4-by-3 matrix lives at offset k * kColumns + k,
-  /// i.e. 0, 4 and 8.
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 1> diagonal() const {
-    Matrix<Element, 3, 1> diag;
-
-    // The earlier offsets 5/10/15 used a 4x4 stride: they read off-diagonal
-    // elements and data[15] (past the 12-element array), and wrote diag.data[3].
-    for (int k = 0; k < 3; ++k) {
-      diag.data[k] = data[k * kColumns + k];
-    }
-
-    return diag;
-  }
-    
-  /// Returns the 3-by-4 transpose: source offset r * 3 + c is copied to
-  /// destination offset c * 4 + r.
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 4> transpose() const {
-    Matrix<Element, 3, 4> flipped;
-
-    for (int r = 0; r < 4; ++r) {
-      for (int c = 0; c < 3; ++c) {
-        flipped.data[c * 4 + r] = data[r * 3 + c];
-      }
-    }
-
-    return flipped;
-  }
-    
-  /// Accesses an element by coordinate (row i, column j); returns a copy.
-  CUTLASS_HOST_DEVICE
-  Element at(int i, int j) const {
-    // Consecutive rows are kColumns (3) elements apart. The previous stride of
-    // 4 addressed the wrong element for every i > 0 and could run past the
-    // 12-element array.
-    return data[i * kColumns + j];
-  }
-
-  /// Accesses an element by coordinate (row i, column j); returns a mutable
-  /// reference.
-  CUTLASS_HOST_DEVICE
-  Element & at(int i, int j) {
-    // Consecutive rows are kColumns (3) elements apart. The previous stride of
-    // 4 addressed the wrong element for every i > 0 and could run past the
-    // 12-element array.
-    return data[i * kColumns + j];
-  }
-
-  /// Accesses an element by coordinate: coord[0] is passed as i and coord[1]
-  /// as j to at(i, j)
-  CUTLASS_HOST_DEVICE
-  Element at(Coord<2> const &coord) const {
-    int const row_idx = coord[0];
-    int const col_idx = coord[1];
-    return at(row_idx, col_idx);
-  }
-
-  /// Accesses an element by coordinate, returning a mutable reference:
-  /// coord[0] is passed as i and coord[1] as j to at(i, j)
-  CUTLASS_HOST_DEVICE
-  Element & at(Coord<2> const &coord) {
-    int const row_idx = coord[0];
-    int const col_idx = coord[1];
-    return at(row_idx, col_idx);
-  }
-
-  /// Accesses an element by offset into the underlying array; returns a
-  /// mutable reference. The offset is passed to the array unchecked here.
-  CUTLASS_HOST_DEVICE
-  Element &at(int offset) {
-    return data[offset];
-  }
-
-  /// Accesses an element by offset into the underlying array; returns a copy.
-  /// The offset is passed to the array unchecked here.
-  CUTLASS_HOST_DEVICE
-  Element at(int offset) const {
-    return data[offset];
-  }
-
-  /// Accesses an element by coordinate; same as at(coord[0], coord[1])
-  CUTLASS_HOST_DEVICE
-  Element operator[](Coord<2> const &coord) const {
-    int const row_idx = coord[0];
-    int const col_idx = coord[1];
-    return at(row_idx, col_idx);
-  }
-
-  /// Accesses an element by coordinate, returning a mutable reference; same
-  /// as at(coord[0], coord[1])
-  CUTLASS_HOST_DEVICE
-  Element & operator[](Coord<2> const &coord) {
-    int const row_idx = coord[0];
-    int const col_idx = coord[1];
-    return at(row_idx, col_idx);
-  }
-
-  /// Accesses an element by offset into the underlying array; returns a
-  /// mutable reference. The offset is passed to the array unchecked here.
-  CUTLASS_HOST_DEVICE
-  Element & operator[](int offset) {
-    return data[offset];
-  }
-
-  /// Accesses an element by offset into the underlying array; returns a copy.
-  /// The offset is passed to the array unchecked here.
-  CUTLASS_HOST_DEVICE
-  Element operator[](int offset) const {
-    return data[offset];
-  }
-  
-  /// Gets the 1-by-2 submatrix whose first element is at offset i * 3 + j
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 2> slice_1x2(int i = 0, int j = 0) const {
-    Matrix<Element, 1, 2> sub;
-    int const base = i * 3 + j;
-
-    for (int c = 0; c < 2; ++c) {
-      sub.data[c] = data[base + c];
-    }
-
-    return sub;
-  }
-
-  /// Overwrites the 1-by-2 submatrix whose first element is at offset i * 3 + j
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_1x2(Matrix<Element, 1, 2> const &m, int i = 0, int j = 0) {
-    int const base = i * 3 + j;
-
-    for (int c = 0; c < 2; ++c) {
-      data[base + c] = m.data[c];
-    }
-
-    return *this;
-  }
-    
-  /// Gets the 1-by-3 submatrix whose first element is at offset i * 3 + j
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 3> slice_1x3(int i = 0, int j = 0) const {
-    Matrix<Element, 1, 3> sub;
-    int const base = i * 3 + j;
-
-    for (int c = 0; c < 3; ++c) {
-      sub.data[c] = data[base + c];
-    }
-
-    return sub;
-  }
-
-  /// Overwrites the 1-by-3 submatrix whose first element is at offset i * 3 + j
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_1x3(Matrix<Element, 1, 3> const &m, int i = 0, int j = 0) {
-    int const base = i * 3 + j;
-
-    for (int c = 0; c < 3; ++c) {
-      data[base + c] = m.data[c];
-    }
-
-    return *this;
-  }
-    
-  /// Gets row i as a 1-by-3 matrix (slice_1x3 starting at column 0)
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 3> row(int i) const {
-    Matrix<Element, 1, 3> picked = slice_1x3(i, 0);
-    return picked;
-  }
-
-  /// Overwrites row i with the 1-by-3 matrix v and returns *this
-  CUTLASS_HOST_DEVICE
-  Matrix &set_row(Matrix<Element, 1, 3> const &v, int i = 0) {
-    Matrix &self = set_slice_1x3(v, i, 0);
-    return self;
-  }
-    
-  /// Gets the 2-by-1 submatrix whose first element is at offset i * 3 + j
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 1> slice_2x1(int i = 0, int j = 0) const {
-    Matrix<Element, 2, 1> sub;
-    int const base = i * 3 + j;
-
-    // Successive rows of the source are 3 elements apart.
-    for (int r = 0; r < 2; ++r) {
-      sub.data[r] = data[base + r * 3];
-    }
-
-    return sub;
-  }
-
-  /// Overwrites the 2-by-1 submatrix whose first element is at offset i * 3 + j
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_2x1(Matrix<Element, 2, 1> const &m, int i = 0, int j = 0) {
-    int const base = i * 3 + j;
-
-    // Successive rows of the destination are 3 elements apart.
-    for (int r = 0; r < 2; ++r) {
-      data[base + r * 3] = m.data[r];
-    }
-
-    return *this;
-  }
-    
-  /// Gets the 2-by-2 submatrix whose first element is at offset i * 3 + j
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 2> slice_2x2(int i = 0, int j = 0) const {
-    Matrix<Element, 2, 2> sub;
-    int const base = i * 3 + j;
-
-    for (int r = 0; r < 2; ++r) {
-      for (int c = 0; c < 2; ++c) {
-        sub.data[r * 2 + c] = data[base + r * 3 + c];
-      }
-    }
-
-    return sub;
-  }
-
-  /// Overwrites the 2-by-2 submatrix whose first element is at offset i * 3 + j
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_2x2(Matrix<Element, 2, 2> const &m, int i = 0, int j = 0) {
-    int const base = i * 3 + j;
-
-    for (int r = 0; r < 2; ++r) {
-      for (int c = 0; c < 2; ++c) {
-        data[base + r * 3 + c] = m.data[r * 2 + c];
-      }
-    }
-
-    return *this;
-  }
-    
-  /// Gets the 2-by-3 submatrix whose first element is at offset i * 3 + j
-  /// (six consecutive elements of the underlying array)
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 3> slice_2x3(int i = 0, int j = 0) const {
-    Matrix<Element, 2, 3> sub;
-    int const base = i * 3 + j;
-
-    for (int k = 0; k < 6; ++k) {
-      sub.data[k] = data[base + k];
-    }
-
-    return sub;
-  }
-
-  /// Overwrites the 2-by-3 submatrix whose first element is at offset i * 3 + j
-  /// (six consecutive elements of the underlying array)
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_2x3(Matrix<Element, 2, 3> const &m, int i = 0, int j = 0) {
-    int const base = i * 3 + j;
-
-    for (int k = 0; k < 6; ++k) {
-      data[base + k] = m.data[k];
-    }
-
-    return *this;
-  }
-    
-  /// Gets the 3-by-1 submatrix whose first element is at offset i * 3 + j
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 1> slice_3x1(int i = 0, int j = 0) const {
-    Matrix<Element, 3, 1> sub;
-    int const base = i * 3 + j;
-
-    // Successive rows of the source are 3 elements apart.
-    for (int r = 0; r < 3; ++r) {
-      sub.data[r] = data[base + r * 3];
-    }
-
-    return sub;
-  }
-
-  /// Overwrites the 3-by-1 submatrix whose first element is at offset i * 3 + j
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_3x1(Matrix<Element, 3, 1> const &m, int i = 0, int j = 0) {
-    int const base = i * 3 + j;
-
-    // Successive rows of the destination are 3 elements apart.
-    for (int r = 0; r < 3; ++r) {
-      data[base + r * 3] = m.data[r];
-    }
-
-    return *this;
-  }
-    
-  /// Gets the 3-by-2 submatrix whose first element is at offset i * 3 + j
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 2> slice_3x2(int i = 0, int j = 0) const {
-    Matrix<Element, 3, 2> sub;
-    int const base = i * 3 + j;
-
-    for (int r = 0; r < 3; ++r) {
-      for (int c = 0; c < 2; ++c) {
-        sub.data[r * 2 + c] = data[base + r * 3 + c];
-      }
-    }
-
-    return sub;
-  }
-
-  /// Overwrites the 3-by-2 submatrix whose first element is at offset i * 3 + j
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_3x2(Matrix<Element, 3, 2> const &m, int i = 0, int j = 0) {
-    int const base = i * 3 + j;
-
-    for (int r = 0; r < 3; ++r) {
-      for (int c = 0; c < 2; ++c) {
-        data[base + r * 3 + c] = m.data[r * 2 + c];
-      }
-    }
-
-    return *this;
-  }
-    
-  /// Gets the 3-by-3 submatrix whose first element is at offset i * 3 + j
-  /// (nine consecutive elements of the underlying array)
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 3> slice_3x3(int i = 0, int j = 0) const {
-    Matrix<Element, 3, 3> sub;
-    int const base = i * 3 + j;
-
-    for (int k = 0; k < 9; ++k) {
-      sub.data[k] = data[base + k];
-    }
-
-    return sub;
-  }
-
-  /// Overwrites the 3-by-3 submatrix whose first element is at offset i * 3 + j
-  /// (nine consecutive elements of the underlying array)
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_3x3(Matrix<Element, 3, 3> const &m, int i = 0, int j = 0) {
-    int const base = i * 3 + j;
-
-    for (int k = 0; k < 9; ++k) {
-      data[base + k] = m.data[k];
-    }
-
-    return *this;
-  }
-    
-  /// Gets the 4-by-1 submatrix whose first element is at offset i * 3 + j
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 1> slice_4x1(int i = 0, int j = 0) const {
-    Matrix<Element, 4, 1> sub;
-    int const base = i * 3 + j;
-
-    // Successive rows of the source are 3 elements apart.
-    for (int r = 0; r < 4; ++r) {
-      sub.data[r] = data[base + r * 3];
-    }
-
-    return sub;
-  }
-
-  /// Overwrites the 4-by-1 submatrix whose first element is at offset i * 3 + j
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_4x1(Matrix<Element, 4, 1> const &m, int i = 0, int j = 0) {
-    int const base = i * 3 + j;
-
-    // Successive rows of the destination are 3 elements apart.
-    for (int r = 0; r < 4; ++r) {
-      data[base + r * 3] = m.data[r];
-    }
-
-    return *this;
-  }
-    
-  /// Gets column j as a 4-by-1 matrix (slice_4x1 starting at row 0)
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 1> column(int j) const {
-    Matrix<Element, 4, 1> picked = slice_4x1(0, j);
-    return picked;
-  }
-
-  /// Overwrites column j with the 4-by-1 matrix v and returns *this
-  CUTLASS_HOST_DEVICE
-  Matrix &set_column(Matrix<Element, 4, 1> const &v, int j = 0) {
-    Matrix &self = set_slice_4x1(v, 0, j);
-    return self;
-  }
-    
-  /// Gets the 4-by-2 submatrix whose first element is at offset i * 3 + j
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 2> slice_4x2(int i = 0, int j = 0) const {
-    Matrix<Element, 4, 2> sub;
-    int const base = i * 3 + j;
-
-    for (int r = 0; r < 4; ++r) {
-      for (int c = 0; c < 2; ++c) {
-        sub.data[r * 2 + c] = data[base + r * 3 + c];
-      }
-    }
-
-    return sub;
-  }
-
-  /// Overwrites the 4-by-2 submatrix whose first element is at offset i * 3 + j
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_4x2(Matrix<Element, 4, 2> const &m, int i = 0, int j = 0) {
-    int const base = i * 3 + j;
-
-    for (int r = 0; r < 4; ++r) {
-      for (int c = 0; c < 2; ++c) {
-        data[base + r * 3 + c] = m.data[r * 2 + c];
-      }
-    }
-
-    return *this;
-  }
-    
-  /// Gets the 4-by-3 submatrix whose first element is at offset i * 3 + j
-  /// (twelve consecutive elements of the underlying array)
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 3> slice_4x3(int i = 0, int j = 0) const {
-    Matrix<Element, 4, 3> sub;
-    int const base = i * 3 + j;
-
-    for (int k = 0; k < 12; ++k) {
-      sub.data[k] = data[base + k];
-    }
-
-    return sub;
-  }
-
-  /// Overwrites the 4-by-3 submatrix whose first element is at offset i * 3 + j
-  /// (twelve consecutive elements of the underlying array)
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_4x3(Matrix<Element, 4, 3> const &m, int i = 0, int j = 0) {
-    int const base = i * 3 + j;
-
-    // Ascending order, one element at a time.
-    for (int k = 0; k < 12; ++k) {
-      data[base + k] = m.data[k];
-    }
-
-    return *this;
-  }
-    
-  /// Forms a 4-by-3 matrix by horizontally concatenating a 4-by-1 matrix with a 4-by-2 matrix
-  CUTLASS_HOST_DEVICE
-  static Matrix hcat(Matrix<Element, 4, 1> const & lhs, Matrix<Element, 4, 2> const & rhs) {
-    Matrix joined;
-
-    // Each output row is [lhs(r, 0), rhs(r, 0), rhs(r, 1)].
-    for (int r = 0; r < 4; ++r) {
-      joined.data[r * 3 + 0] = lhs.at(r, 0);
-      joined.data[r * 3 + 1] = rhs.at(r, 0);
-      joined.data[r * 3 + 2] = rhs.at(r, 1);
-    }
-
-    return joined;
-  }
-  
-  /// Forms a 4-by-3 matrix by horizontally concatenating a 4-by-2 matrix with a 4-by-1 matrix
-  CUTLASS_HOST_DEVICE
-  static Matrix hcat(Matrix<Element, 4, 2> const & lhs, Matrix<Element, 4, 1> const & rhs) {
-    Matrix joined;
-
-    // Each output row is [lhs(r, 0), lhs(r, 1), rhs(r, 0)].
-    for (int r = 0; r < 4; ++r) {
-      joined.data[r * 3 + 0] = lhs.at(r, 0);
-      joined.data[r * 3 + 1] = lhs.at(r, 1);
-      joined.data[r * 3 + 2] = rhs.at(r, 0);
-    }
-
-    return joined;
-  }
-  
-  /// Concatenates this matrix with a 4-by-1 matrix to form a 4-by-4 matrix;
-  /// delegates to Matrix<Element, 4, 4>::hcat
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 4> hcat(Matrix<Element, 4, 1> const & rhs) const {
-    Matrix<Element, 4, 4> widened = Matrix<Element, 4, 4>::hcat(*this, rhs);
-    return widened;
-  }
-    
-  /// Forms a 4-by-3 matrix by vertically concatenating a 1-by-3 matrix with a 3-by-3 matrix
-  CUTLASS_HOST_DEVICE
-  static Matrix vcat(Matrix<Element, 1, 3> const & upper, Matrix<Element, 3, 3> const & lower) {
-    Matrix stacked;
-
-    // Row 0 comes from `upper`.
-    for (int c = 0; c < 3; ++c) {
-      stacked.data[c] = upper.at(0, c);
-    }
-
-    // Rows 1..3 come from rows 0..2 of `lower`.
-    for (int r = 0; r < 3; ++r) {
-      for (int c = 0; c < 3; ++c) {
-        stacked.data[(r + 1) * 3 + c] = lower.at(r, c);
-      }
-    }
-
-    return stacked;
-  }
-  
-  /// Forms a 4-by-3 matrix by vertically concatenating a 2-by-3 matrix with a 2-by-3 matrix
-  CUTLASS_HOST_DEVICE
-  static Matrix vcat(Matrix<Element, 2, 3> const & upper, Matrix<Element, 2, 3> const & lower) {
-    Matrix stacked;
-
-    // Rows 0..1 come from `upper`, rows 2..3 from `lower`.
-    for (int r = 0; r < 2; ++r) {
-      for (int c = 0; c < 3; ++c) {
-        stacked.data[r * 3 + c] = upper.at(r, c);
-        stacked.data[(r + 2) * 3 + c] = lower.at(r, c);
-      }
-    }
-
-    return stacked;
-  }
-  
-  /// Forms a 4-by-3 matrix by vertically concatenating a 3-by-3 matrix with a 1-by-3 matrix
-  CUTLASS_HOST_DEVICE
-  static Matrix vcat(Matrix<Element, 3, 3> const & upper, Matrix<Element, 1, 3> const & lower) {
-    Matrix stacked;
-
-    // Rows 0..2 come from `upper`.
-    for (int r = 0; r < 3; ++r) {
-      for (int c = 0; c < 3; ++c) {
-        stacked.data[r * 3 + c] = upper.at(r, c);
-      }
-    }
-
-    // Row 3 comes from `lower`.
-    for (int c = 0; c < 3; ++c) {
-      stacked.data[9 + c] = lower.at(0, c);
-    }
-
-    return stacked;
-  }
-  
-  /// Forms a 4-by-3 matrix from four components laid out as [A B; C D], where
-  /// A is a scalar, B is 1-by-2, C is 3-by-1 and D is 3-by-2
-  CUTLASS_HOST_DEVICE
-  static Matrix block(
-    Element                         A, Matrix<Element, 1, 2> const & B,
-    Matrix<Element, 3, 1> const & C, Matrix<Element, 3, 2> const & D) {
-    Matrix assembled;
-
-    // Top row: [A, B]
-    assembled.data[0] = A;
-    assembled.data[1] = B.at(0, 0);
-    assembled.data[2] = B.at(0, 1);
-
-    // Remaining three rows: [C, D]
-    for (int r = 0; r < 3; ++r) {
-      assembled.data[(r + 1) * 3 + 0] = C.at(r, 0);
-      assembled.data[(r + 1) * 3 + 1] = D.at(r, 0);
-      assembled.data[(r + 1) * 3 + 2] = D.at(r, 1);
-    }
-
-    return assembled;
-  }
-  
-  /// Forms a 4-by-3 matrix from four components laid out as [A B; C D], where
-  /// A is 1-by-2, B is a scalar, C is 3-by-2 and D is 3-by-1
-  CUTLASS_HOST_DEVICE
-  static Matrix block(
-    Matrix<Element, 1, 2> const & A, Element                         B,
-    Matrix<Element, 3, 2> const & C, Matrix<Element, 3, 1> const & D) {
-    Matrix assembled;
-
-    // Top row: [A, B]
-    assembled.data[0] = A.at(0, 0);
-    assembled.data[1] = A.at(0, 1);
-    assembled.data[2] = B;
-
-    // Remaining three rows: [C, D]
-    for (int r = 0; r < 3; ++r) {
-      assembled.data[(r + 1) * 3 + 0] = C.at(r, 0);
-      assembled.data[(r + 1) * 3 + 1] = C.at(r, 1);
-      assembled.data[(r + 1) * 3 + 2] = D.at(r, 0);
-    }
-
-    return assembled;
-  }
-  
-  /// Forms a 4-by-3 matrix from four components laid out as [A B; C D], where
-  /// A and C are 2-by-1 and B and D are 2-by-2
-  CUTLASS_HOST_DEVICE
-  static Matrix block(
-    Matrix<Element, 2, 1> const & A, Matrix<Element, 2, 2> const & B,
-    Matrix<Element, 2, 1> const & C, Matrix<Element, 2, 2> const & D) {
-    Matrix assembled;
-
-    for (int r = 0; r < 2; ++r) {
-      // Upper half: [A, B]
-      assembled.data[r * 3 + 0] = A.at(r, 0);
-      assembled.data[r * 3 + 1] = B.at(r, 0);
-      assembled.data[r * 3 + 2] = B.at(r, 1);
-
-      // Lower half: [C, D]
-      assembled.data[(r + 2) * 3 + 0] = C.at(r, 0);
-      assembled.data[(r + 2) * 3 + 1] = D.at(r, 0);
-      assembled.data[(r + 2) * 3 + 2] = D.at(r, 1);
-    }
-
-    return assembled;
-  }
-  
-  /// Forms a 4-by-3 matrix from four components laid out as [A B; C D], where
-  /// A and C are 2-by-2 and B and D are 2-by-1
-  CUTLASS_HOST_DEVICE
-  static Matrix block(
-    Matrix<Element, 2, 2> const & A, Matrix<Element, 2, 1> const & B,
-    Matrix<Element, 2, 2> const & C, Matrix<Element, 2, 1> const & D) {
-    Matrix assembled;
-
-    for (int r = 0; r < 2; ++r) {
-      // Upper half: [A, B]
-      assembled.data[r * 3 + 0] = A.at(r, 0);
-      assembled.data[r * 3 + 1] = A.at(r, 1);
-      assembled.data[r * 3 + 2] = B.at(r, 0);
-
-      // Lower half: [C, D]
-      assembled.data[(r + 2) * 3 + 0] = C.at(r, 0);
-      assembled.data[(r + 2) * 3 + 1] = C.at(r, 1);
-      assembled.data[(r + 2) * 3 + 2] = D.at(r, 0);
-    }
-
-    return assembled;
-  }
-  
-  /// Forms a 4-by-3 matrix from four components laid out as [A B; C D], where
-  /// A is 3-by-1, B is 3-by-2, C is a scalar and D is 1-by-2
-  CUTLASS_HOST_DEVICE
-  static Matrix block(
-    Matrix<Element, 3, 1> const & A, Matrix<Element, 3, 2> const & B,
-    Element                         C, Matrix<Element, 1, 2> const & D) {
-    Matrix assembled;
-
-    // First three rows: [A, B]
-    for (int r = 0; r < 3; ++r) {
-      assembled.data[r * 3 + 0] = A.at(r, 0);
-      assembled.data[r * 3 + 1] = B.at(r, 0);
-      assembled.data[r * 3 + 2] = B.at(r, 1);
-    }
-
-    // Bottom row: [C, D]
-    assembled.data[9] = C;
-    assembled.data[10] = D.at(0, 0);
-    assembled.data[11] = D.at(0, 1);
-
-    return assembled;
-  }
-  
-  /// Forms a 4-by-3 matrix from four components laid out as [A B; C D], where
-  /// A is 3-by-2, B is 3-by-1, C is 1-by-2 and D is a scalar
-  CUTLASS_HOST_DEVICE
-  static Matrix block(
-    Matrix<Element, 3, 2> const & A, Matrix<Element, 3, 1> const & B,
-    Matrix<Element, 1, 2> const & C, Element                         D) {
-    Matrix assembled;
-
-    // First three rows: [A, B]
-    for (int r = 0; r < 3; ++r) {
-      assembled.data[r * 3 + 0] = A.at(r, 0);
-      assembled.data[r * 3 + 1] = A.at(r, 1);
-      assembled.data[r * 3 + 2] = B.at(r, 0);
-    }
-
-    // Bottom row: [C, D]
-    assembled.data[9] = C.at(0, 0);
-    assembled.data[10] = C.at(0, 1);
-    assembled.data[11] = D;
-
-    return assembled;
-  }
-  
-  /// Elementwise add (4-by-3): returns a new matrix holding data[k] + rhs.data[k]
-  CUTLASS_HOST_DEVICE
-  Matrix add(Matrix const &rhs) const {
-
-    Matrix total;
-
-    for (int k = 0; k < 12; ++k) {
-      total.data[k] = data[k] + rhs.data[k];
-    }
-
-    return total;
-  }
-      
-  /// Elementwise add operator (4-by-3); delegates to add()
-  CUTLASS_HOST_DEVICE
-  Matrix operator +(Matrix const &rhs) const {
-    Matrix total = add(rhs);
-    return total;
-  }
-
-  /// In-place elementwise add (4-by-3): data[k] += rhs.data[k]; returns *this
-  CUTLASS_HOST_DEVICE
-  Matrix & operator +=(Matrix const &rhs) {
-
-    for (int k = 0; k < 12; ++k) {
-      data[k] += rhs.data[k];
-    }
-
-    return *this;
-  }
-        
-  /// Elementwise subtract (4-by-3): returns a new matrix holding
-  /// data[k] - rhs.data[k]
-  CUTLASS_HOST_DEVICE
-  Matrix subtract(Matrix const &rhs) const {
-
-    Matrix difference;
-
-    for (int k = 0; k < 12; ++k) {
-      difference.data[k] = data[k] - rhs.data[k];
-    }
-
-    return difference;
-  }
-      
-  /// Elementwise subtract operator (4-by-3); delegates to subtract()
-  CUTLASS_HOST_DEVICE
-  Matrix operator -(Matrix const &rhs) const {
-    Matrix difference = subtract(rhs);
-    return difference;
-  }
-
-  /// In-place elementwise subtract (4-by-3): data[k] -= rhs.data[k];
-  /// returns *this
-  CUTLASS_HOST_DEVICE
-  Matrix & operator -=(Matrix const &rhs) {
-
-    for (int k = 0; k < 12; ++k) {
-      data[k] -= rhs.data[k];
-    }
-
-    return *this;
-  }
-        
-  /// Elementwise multiply (4-by-3): returns a new matrix holding
-  /// data[k] * rhs.data[k] (not a matrix product)
-  CUTLASS_HOST_DEVICE
-  Matrix multiply(Matrix const &rhs) const {
-
-    Matrix scaled;
-
-    for (int k = 0; k < 12; ++k) {
-      scaled.data[k] = data[k] * rhs.data[k];
-    }
-
-    return scaled;
-  }
-      
-  /// Scalar multiply (4-by-3): returns a new matrix holding data[k] * s
-  CUTLASS_HOST_DEVICE
-  Matrix multiply(Element const &s) const {
-
-    Matrix scaled;
-
-    for (int k = 0; k < 12; ++k) {
-      scaled.data[k] = data[k] * s;
-    }
-
-    return scaled;
-  }
-
-  /// Scalar multiply operator (4-by-3); delegates to multiply(s)
-  CUTLASS_HOST_DEVICE
-  Matrix operator *(Element const &s) const {
-    Matrix scaled = multiply(s);
-    return scaled;
-  }
-
-  /// In-place scalar multiply (4-by-3): data[k] *= s; returns *this
-  CUTLASS_HOST_DEVICE
-  Matrix & operator *=(Element const &s) {
-
-    for (int k = 0; k < 12; ++k) {
-      data[k] *= s;
-    }
-
-    return *this;
-  }
-        
-  /// Elementwise divide (4-by-3): returns a new matrix holding
-  /// data[k] / rhs.data[k]
-  CUTLASS_HOST_DEVICE
-  Matrix divide(Matrix const &rhs) const {
-
-    Matrix quotient;
-
-    for (int k = 0; k < 12; ++k) {
-      quotient.data[k] = data[k] / rhs.data[k];
-    }
-
-    return quotient;
-  }
-      
-  /// Scalar divide (4-by-3): returns a new matrix holding data[k] / s
-  CUTLASS_HOST_DEVICE
-  Matrix divide(Element const &s) const {
-
-    Matrix quotient;
-
-    for (int k = 0; k < 12; ++k) {
-      quotient.data[k] = data[k] / s;
-    }
-
-    return quotient;
-  }
-
-  /// Scalar divide operator (4-by-3); delegates to divide(s)
-  CUTLASS_HOST_DEVICE
-  Matrix operator /(Element const &s) const {
-    Matrix quotient = divide(s);
-    return quotient;
-  }
-
-  /// In-place scalar divide (4-by-3): data[k] /= s; returns *this
-  CUTLASS_HOST_DEVICE
-  Matrix & operator /=(Element const &s) {
-
-    for (int k = 0; k < 12; ++k) {
-      data[k] /= s;
-    }
-
-    return *this;
-  }
-        
-  /// Elementwise divide operator (4-by-3); delegates to divide(rhs)
-  CUTLASS_HOST_DEVICE
-  Matrix operator /(Matrix const &rhs) const {
-    Matrix quotient = divide(rhs);
-    return quotient;
-  }
-
-  /// In-place elementwise divide (4-by-3): data[k] /= rhs.data[k];
-  /// returns *this
-  CUTLASS_HOST_DEVICE
-  Matrix & operator /=(Matrix const &rhs) {
-
-    for (int k = 0; k < 12; ++k) {
-      data[k] /= rhs.data[k];
-    }
-
-    return *this;
-  }
-        
-  /// Negates each element of the matrix; returns a new matrix and leaves
-  /// *this unmodified.
-  CUTLASS_HOST_DEVICE
-  Matrix operator-() const {
-    Matrix m;
-
-    // Read from this matrix's `data`. The previous code negated m.data[k],
-    // i.e. the freshly cleared storage of `m`, so the values of *this never
-    // reached the result.
-    for (int k = 0; k < kCount; ++k) {
-      m.data[k] = -data[k];
-    }
-
-    return m;
-  }
-  
-  /// Matrix product of size 4-by-1-by-3: adds (*this) * rhs onto `accum` and
-  /// returns the result
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 1> product(
-    Matrix<Element, 3, 1> const &rhs,
-    Matrix<Element, 4, 1> accum = Matrix<Element, 4, 1>()
-  ) const {
-
-    // Outer loop walks the shared dimension k; inner loop walks the four rows.
-    for (int k = 0; k < 3; ++k) {
-      for (int r = 0; r < 4; ++r) {
-        accum.data[r] += data[r * 3 + k] * rhs.data[k];
-      }
-    }
-
-    return accum;
-  }
-
-  /// Matrix product of size 4-by-1-by-3; delegates to product() with its
-  /// default accumulator
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 1> operator*(Matrix<Element, 3, 1> const &rhs) const {
-    Matrix<Element, 4, 1> result = product(rhs);
-    return result;
-  }
-  
-  /// Matrix product of size 4-by-2-by-3: adds (*this) * rhs onto `accum` and
-  /// returns the result
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 2> product(
-    Matrix<Element, 3, 2> const &rhs,
-    Matrix<Element, 4, 2> accum = Matrix<Element, 4, 2>()
-  ) const {
-
-    // Outer loop walks the shared dimension k; the inner loops visit the
-    // output elements in row-major order.
-    for (int k = 0; k < 3; ++k) {
-      for (int r = 0; r < 4; ++r) {
-        for (int c = 0; c < 2; ++c) {
-          accum.data[r * 2 + c] += data[r * 3 + k] * rhs.data[k * 2 + c];
-        }
-      }
-    }
-
-    return accum;
-  }
-
-  /// Matrix product of size 4-by-2-by-3; delegates to product() with its
-  /// default accumulator
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 2> operator*(Matrix<Element, 3, 2> const &rhs) const {
-    Matrix<Element, 4, 2> result = product(rhs);
-    return result;
-  }
-  
-  /// Matrix product of size 4-by-3-by-3: adds (*this) * rhs onto `accum` and
-  /// returns the result
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 3> product(
-    Matrix<Element, 3, 3> const &rhs,
-    Matrix<Element, 4, 3> accum = Matrix<Element, 4, 3>()
-  ) const {
-
-    // Outer loop walks the shared dimension k; the inner loops visit the
-    // output elements in row-major order.
-    for (int k = 0; k < 3; ++k) {
-      for (int r = 0; r < 4; ++r) {
-        for (int c = 0; c < 3; ++c) {
-          accum.data[r * 3 + c] += data[r * 3 + k] * rhs.data[k * 3 + c];
-        }
-      }
-    }
-
-    return accum;
-  }
-
-  /// Matrix product of size 4-by-3-by-3
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 3> operator*(Matrix<Element, 3, 3> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 4-by-3-by-3
-  CUTLASS_HOST_DEVICE
-  Matrix & operator*=(Matrix<Element, 3, 3> const &rhs) {
-    *this = product(rhs);
-    return *this;
-  }
-    
-  /// Matrix product of size 4-by-4-by-3
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 4> product(
-    Matrix<Element, 3, 4> const &rhs,
-    Matrix<Element, 4, 4> accum = Matrix<Element, 4, 4>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[0] * rhs.data[1];
-    accum.data[2] += data[0] * rhs.data[2];
-    accum.data[3] += data[0] * rhs.data[3];
-    accum.data[4] += data[3] * rhs.data[0];
-    accum.data[5] += data[3] * rhs.data[1];
-    accum.data[6] += data[3] * rhs.data[2];
-    accum.data[7] += data[3] * rhs.data[3];
-    accum.data[8] += data[6] * rhs.data[0];
-    accum.data[9] += data[6] * rhs.data[1];
-    accum.data[10] += data[6] * rhs.data[2];
-    accum.data[11] += data[6] * rhs.data[3];
-    accum.data[12] += data[9] * rhs.data[0];
-    accum.data[13] += data[9] * rhs.data[1];
-    accum.data[14] += data[9] * rhs.data[2];
-    accum.data[15] += data[9] * rhs.data[3];
-
-    // k=1
-    accum.data[0] += data[1] * rhs.data[4];
-    accum.data[1] += data[1] * rhs.data[5];
-    accum.data[2] += data[1] * rhs.data[6];
-    accum.data[3] += data[1] * rhs.data[7];
-    accum.data[4] += data[4] * rhs.data[4];
-    accum.data[5] += data[4] * rhs.data[5];
-    accum.data[6] += data[4] * rhs.data[6];
-    accum.data[7] += data[4] * rhs.data[7];
-    accum.data[8] += data[7] * rhs.data[4];
-    accum.data[9] += data[7] * rhs.data[5];
-    accum.data[10] += data[7] * rhs.data[6];
-    accum.data[11] += data[7] * rhs.data[7];
-    accum.data[12] += data[10] * rhs.data[4];
-    accum.data[13] += data[10] * rhs.data[5];
-    accum.data[14] += data[10] * rhs.data[6];
-    accum.data[15] += data[10] * rhs.data[7];
-
-    // k=2
-    accum.data[0] += data[2] * rhs.data[8];
-    accum.data[1] += data[2] * rhs.data[9];
-    accum.data[2] += data[2] * rhs.data[10];
-    accum.data[3] += data[2] * rhs.data[11];
-    accum.data[4] += data[5] * rhs.data[8];
-    accum.data[5] += data[5] * rhs.data[9];
-    accum.data[6] += data[5] * rhs.data[10];
-    accum.data[7] += data[5] * rhs.data[11];
-    accum.data[8] += data[8] * rhs.data[8];
-    accum.data[9] += data[8] * rhs.data[9];
-    accum.data[10] += data[8] * rhs.data[10];
-    accum.data[11] += data[8] * rhs.data[11];
-    accum.data[12] += data[11] * rhs.data[8];
-    accum.data[13] += data[11] * rhs.data[9];
-    accum.data[14] += data[11] * rhs.data[10];
-    accum.data[15] += data[11] * rhs.data[11];
-
-    return accum;
-  }
-
-  /// Matrix product of size 4-by-4-by-3
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 4> operator*(Matrix<Element, 3, 4> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Returns the sum of elements
-  CUTLASS_HOST_DEVICE
-  Element sum(Element accum = Element()) const {
-    
-    accum += data[0];
-    accum += data[1];
-    accum += data[2];
-    accum += data[3];
-    accum += data[4];
-    accum += data[5];
-    accum += data[6];
-    accum += data[7];
-    accum += data[8];
-    accum += data[9];
-    accum += data[10];
-    accum += data[11];
-
-    return accum;
-  }  
-
-  /// Returns the sum of squared elements
-  CUTLASS_HOST_DEVICE
-  Element norm(Element accum = Element()) const {
-    
-    accum += data[0] * data[0];
-    accum += data[1] * data[1];
-    accum += data[2] * data[2];
-    accum += data[3] * data[3];
-    accum += data[4] * data[4];
-    accum += data[5] * data[5];
-    accum += data[6] * data[6];
-    accum += data[7] * data[7];
-    accum += data[8] * data[8];
-    accum += data[9] * data[9];
-    accum += data[10] * data[10];
-    accum += data[11] * data[11];
-
-    return accum;
-  }
-
-  /// Returns square root of the norm
-  CUTLASS_HOST_DEVICE
-  Element magnitude() const {
-    return fast_sqrt(norm());
-  }
-
-  /// Returns the sum of diagonal elements
-  CUTLASS_HOST_DEVICE
-  Element trace(Element accum = Element()) const {
-    
-    accum += data[0];
-    accum += data[4];
-    accum += data[8];
-
-    return accum;
-  }
-    
-};
-
-/// Template alias for 4-by-3 matrix
-template <typename Element>
-using Matrix4x3 = Matrix<Element, 4, 3>;
-
-
-/// Free funciton to infer element type from template arguments
-template <typename Element>
-CUTLASS_HOST_DEVICE Matrix4x3<Element> make_Matrix4x3(
-    Element _0_0, Element _0_1, Element _0_2, 
-    Element _1_0, Element _1_1, Element _1_2, 
-    Element _2_0, Element _2_1, Element _2_2, 
-    Element _3_0, Element _3_1, Element _3_2
-) {
-  return Matrix4x3<Element>(
-  _0_0, _0_1, _0_2, 
-  _1_0, _1_1, _1_2, 
-  _2_0, _2_1, _2_2, 
-  _3_0, _3_1, _3_2 
-  );
-}
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// 4-by-4 matrix template class definition
-template <typename Element_>
-struct Matrix<Element_, 4, 4> {
-
-  //
-  // Type definitions
-  //
-
-  /// Element data type
-  using Element = Element_;
-
-  /// Number of rows in matrix
-  static int const kRows = 4;
-
-  /// Number of columns in matrix
-  static int const kColumns = 4;
-
-  /// Layout of matrix in underlying array
-  using Layout = layout::RowMajor;
-
-  /// Number of elements in matrix
-  static int const kCount = 16;
-
-  //
-  // Data members
-  //
-
-  /// Elements of the matrix in row-major layout
-  Array<Element, kCount> data;
-
-  //
-  // Methods
-  //
-
-  /// Constructs a zero matrix
-  CUTLASS_HOST_DEVICE
-  Matrix() {
-    data.clear();
-  }
-  
-  /// Copy constructor for a 4-by-4 matrix
-  CUTLASS_HOST_DEVICE
-  Matrix(Matrix const &rhs) {
-    data = rhs.data;
-  }
-    
-  /// Constucts a 4-by-4 matrix from scalar elements
-  CUTLASS_HOST_DEVICE
-  Matrix(
-    Element _0_0, Element _0_1, Element _0_2, Element _0_3, 
-    Element _1_0, Element _1_1, Element _1_2, Element _1_3, 
-    Element _2_0, Element _2_1, Element _2_2, Element _2_3, 
-    Element _3_0, Element _3_1, Element _3_2, Element _3_3
-  ) {
-
-    data[0] = _0_0;  data[1] = _0_1;  data[2] = _0_2;  data[3] = _0_3;
-    data[4] = _1_0;  data[5] = _1_1;  data[6] = _1_2;  data[7] = _1_3;
-    data[8] = _2_0;  data[9] = _2_1;  data[10] = _2_2;  data[11] = _2_3;
-    data[12] = _3_0;  data[13] = _3_1;  data[14] = _3_2;  data[15] = _3_3;
-  }
-    
-  /// Constucts a 4-by-4 matrix from row vectors
-  CUTLASS_HOST_DEVICE
-  Matrix(
-    Matrix<Element, 1, 4> const &row_0,
-    Matrix<Element, 1, 4> const &row_1,
-    Matrix<Element, 1, 4> const &row_2,
-    Matrix<Element, 1, 4> const &row_3
-  ) { 
-    data[0] = row_0.data[0];
-    data[1] = row_0.data[1];
-    data[2] = row_0.data[2];
-    data[3] = row_0.data[3];
-    data[4] = row_1.data[0];
-    data[5] = row_1.data[1];
-    data[6] = row_1.data[2];
-    data[7] = row_1.data[3];
-    data[8] = row_2.data[0];
-    data[9] = row_2.data[1];
-    data[10] = row_2.data[2];
-    data[11] = row_2.data[3];
-    data[12] = row_3.data[0];
-    data[13] = row_3.data[1];
-    data[14] = row_3.data[2];
-    data[15] = row_3.data[3];
-  }
-    
-  /// Static method to construct a 4-by-4 matrix from column vectors
-  CUTLASS_HOST_DEVICE
-  static Matrix from_columns(
-    Matrix<Element, 4, 1> const &column_0,
-    Matrix<Element, 4, 1> const &column_1,
-    Matrix<Element, 4, 1> const &column_2,
-    Matrix<Element, 4, 1> const &column_3
-  ) { 
-    Matrix result;
-    
-    result.data[0] = column_0.data[0];
-    result.data[1] = column_1.data[0];
-    result.data[2] = column_2.data[0];
-    result.data[3] = column_3.data[0];
-    result.data[4] = column_0.data[1];
-    result.data[5] = column_1.data[1];
-    result.data[6] = column_2.data[1];
-    result.data[7] = column_3.data[1];
-    result.data[8] = column_0.data[2];
-    result.data[9] = column_1.data[2];
-    result.data[10] = column_2.data[2];
-    result.data[11] = column_3.data[2];
-    result.data[12] = column_0.data[3];
-    result.data[13] = column_1.data[3];
-    result.data[14] = column_2.data[3];
-    result.data[15] = column_3.data[3];
-    return result;
-  }
-    
-  /// Constructs an identity matrix
-  CUTLASS_HOST_DEVICE
-  static Matrix identity() {
-    Matrix m;
-    
-    m.data[0] = Element(1);
-    m.data[5] = Element(1);
-    m.data[10] = Element(1);
-    m.data[15] = Element(1);
-
-    return m;
-  }
-    
-  /// Constructs a matrix from a uniform element
-  CUTLASS_HOST_DEVICE
-  static Matrix uniform(Element s) {
-    Matrix m;
-    
-    m.data[0] = s;
-    m.data[1] = s;
-    m.data[2] = s;
-    m.data[3] = s;
-    m.data[4] = s;
-    m.data[5] = s;
-    m.data[6] = s;
-    m.data[7] = s;
-    m.data[8] = s;
-    m.data[9] = s;
-    m.data[10] = s;
-    m.data[11] = s;
-    m.data[12] = s;
-    m.data[13] = s;
-    m.data[14] = s;
-    m.data[15] = s;
-
-    return m;
-  }
-
-  /// Constructs a matrix from a uniform element 1
-  CUTLASS_HOST_DEVICE
-  static Matrix ones() {
-    return uniform(Element(1));
-  }
-
-  /// Constructs a matrix from a uniform element 0
-  CUTLASS_HOST_DEVICE
-  static Matrix zero() {
-    return Matrix();
-  }
-  
-  /// Constructs a matrix from elements along its diagonal
-  CUTLASS_HOST_DEVICE
-  static Matrix from_diagonal(Matrix<Element, 4, 1> const &diag) {
-    Matrix m;
-    
-    m.data[0] = diag.data[0];
-    m.data[5] = diag.data[1];
-    m.data[10] = diag.data[2];
-    m.data[15] = diag.data[3];
-
-    return m;
-  }
-
-  /// Constructs a matrix from elements along its diagonal
-  CUTLASS_HOST_DEVICE
-  static Matrix from_diagonal(Matrix<Element, 1, 4> const &diag) {
-    Matrix m;
-    
-    m.data[0] = diag.data[0];
-    m.data[5] = diag.data[1];
-    m.data[10] = diag.data[2];
-    m.data[15] = diag.data[3];
-
-    return m;
-  }
-
-  /// Gets an array of diagonal elements
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 1> diagonal() const {
-    Matrix<Element, 4, 1> diag;
-    
-    diag.data[0] = data[0];
-    diag.data[1] = data[5];
-    diag.data[2] = data[10];
-    diag.data[3] = data[15];
-
-    return diag;
-  }
-    
-  /// Returns a transposed matrix
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 4> transpose() const {
-    Matrix<Element, 4, 4> mt;
-    
-    mt.data[0] = data[0];
-    mt.data[4] = data[1];
-    mt.data[8] = data[2];
-    mt.data[12] = data[3];
-    mt.data[1] = data[4];
-    mt.data[5] = data[5];
-    mt.data[9] = data[6];
-    mt.data[13] = data[7];
-    mt.data[2] = data[8];
-    mt.data[6] = data[9];
-    mt.data[10] = data[10];
-    mt.data[14] = data[11];
-    mt.data[3] = data[12];
-    mt.data[7] = data[13];
-    mt.data[11] = data[14];
-    mt.data[15] = data[15];
-
-    return mt;
-  }
-    
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element at(int i, int j) const {
-    return data[i * 4 + j];
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element & at(int i, int j) {
-    return data[i * 4 + j];
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element at(Coord<2> const &coord) const {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element & at(Coord<2> const &coord) {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element &at(int offset) {
-    return data[offset];
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element at(int offset) const {
-    return data[offset];
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element operator[](Coord<2> const &coord) const {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by coordinate
-  CUTLASS_HOST_DEVICE
-  Element & operator[](Coord<2> const &coord) {
-    return at(coord[0], coord[1]);
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element & operator[](int offset) {
-    return data[offset];
-  }
-
-  /// Accesses an element by offset
-  CUTLASS_HOST_DEVICE
-  Element operator[](int offset) const {
-    return data[offset];
-  }
-  
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 2> slice_1x2(int i = 0, int j = 0) const {
-    Matrix<Element, 1, 2> m;
-    
-    m.data[0] = data[i * 4 + j + 0];
-    m.data[1] = data[i * 4 + j + 1];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_1x2(Matrix<Element, 1, 2> const &m, int i = 0, int j = 0) {
-    
-    data[i * 4 + j + 0] = m.data[0];
-    data[i * 4 + j + 1] = m.data[1];
-
-    return *this;
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 3> slice_1x3(int i = 0, int j = 0) const {
-    Matrix<Element, 1, 3> m;
-    
-    m.data[0] = data[i * 4 + j + 0];
-    m.data[1] = data[i * 4 + j + 1];
-    m.data[2] = data[i * 4 + j + 2];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_1x3(Matrix<Element, 1, 3> const &m, int i = 0, int j = 0) {
-    
-    data[i * 4 + j + 0] = m.data[0];
-    data[i * 4 + j + 1] = m.data[1];
-    data[i * 4 + j + 2] = m.data[2];
-
-    return *this;
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 4> slice_1x4(int i = 0, int j = 0) const {
-    Matrix<Element, 1, 4> m;
-    
-    m.data[0] = data[i * 4 + j + 0];
-    m.data[1] = data[i * 4 + j + 1];
-    m.data[2] = data[i * 4 + j + 2];
-    m.data[3] = data[i * 4 + j + 3];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_1x4(Matrix<Element, 1, 4> const &m, int i = 0, int j = 0) {
-    
-    data[i * 4 + j + 0] = m.data[0];
-    data[i * 4 + j + 1] = m.data[1];
-    data[i * 4 + j + 2] = m.data[2];
-    data[i * 4 + j + 3] = m.data[3];
-
-    return *this;
-  }
-    
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 1, 4> row(int i) const {
-    return slice_1x4(i, 0);
-  }
-
-  CUTLASS_HOST_DEVICE
-  Matrix &set_row(Matrix<Element, 1, 4> const &v, int i = 0) {
-    return set_slice_1x4(v, i, 0);
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 1> slice_2x1(int i = 0, int j = 0) const {
-    Matrix<Element, 2, 1> m;
-    
-    m.data[0] = data[i * 4 + j + 0];
-    m.data[1] = data[i * 4 + j + 4];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_2x1(Matrix<Element, 2, 1> const &m, int i = 0, int j = 0) {
-    
-    data[i * 4 + j + 0] = m.data[0];
-    data[i * 4 + j + 4] = m.data[1];
-
-    return *this;
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 2> slice_2x2(int i = 0, int j = 0) const {
-    Matrix<Element, 2, 2> m;
-    
-    m.data[0] = data[i * 4 + j + 0];
-    m.data[1] = data[i * 4 + j + 1];
-    m.data[2] = data[i * 4 + j + 4];
-    m.data[3] = data[i * 4 + j + 5];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_2x2(Matrix<Element, 2, 2> const &m, int i = 0, int j = 0) {
-    
-    data[i * 4 + j + 0] = m.data[0];
-    data[i * 4 + j + 1] = m.data[1];
-    data[i * 4 + j + 4] = m.data[2];
-    data[i * 4 + j + 5] = m.data[3];
-
-    return *this;
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 3> slice_2x3(int i = 0, int j = 0) const {
-    Matrix<Element, 2, 3> m;
-    
-    m.data[0] = data[i * 4 + j + 0];
-    m.data[1] = data[i * 4 + j + 1];
-    m.data[2] = data[i * 4 + j + 2];
-    m.data[3] = data[i * 4 + j + 4];
-    m.data[4] = data[i * 4 + j + 5];
-    m.data[5] = data[i * 4 + j + 6];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_2x3(Matrix<Element, 2, 3> const &m, int i = 0, int j = 0) {
-    
-    data[i * 4 + j + 0] = m.data[0];
-    data[i * 4 + j + 1] = m.data[1];
-    data[i * 4 + j + 2] = m.data[2];
-    data[i * 4 + j + 4] = m.data[3];
-    data[i * 4 + j + 5] = m.data[4];
-    data[i * 4 + j + 6] = m.data[5];
-
-    return *this;
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 2, 4> slice_2x4(int i = 0, int j = 0) const {
-    Matrix<Element, 2, 4> m;
-    
-    m.data[0] = data[i * 4 + j + 0];
-    m.data[1] = data[i * 4 + j + 1];
-    m.data[2] = data[i * 4 + j + 2];
-    m.data[3] = data[i * 4 + j + 3];
-    m.data[4] = data[i * 4 + j + 4];
-    m.data[5] = data[i * 4 + j + 5];
-    m.data[6] = data[i * 4 + j + 6];
-    m.data[7] = data[i * 4 + j + 7];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_2x4(Matrix<Element, 2, 4> const &m, int i = 0, int j = 0) {
-    
-    data[i * 4 + j + 0] = m.data[0];
-    data[i * 4 + j + 1] = m.data[1];
-    data[i * 4 + j + 2] = m.data[2];
-    data[i * 4 + j + 3] = m.data[3];
-    data[i * 4 + j + 4] = m.data[4];
-    data[i * 4 + j + 5] = m.data[5];
-    data[i * 4 + j + 6] = m.data[6];
-    data[i * 4 + j + 7] = m.data[7];
-
-    return *this;
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 1> slice_3x1(int i = 0, int j = 0) const {
-    Matrix<Element, 3, 1> m;
-    
-    m.data[0] = data[i * 4 + j + 0];
-    m.data[1] = data[i * 4 + j + 4];
-    m.data[2] = data[i * 4 + j + 8];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_3x1(Matrix<Element, 3, 1> const &m, int i = 0, int j = 0) {
-    
-    data[i * 4 + j + 0] = m.data[0];
-    data[i * 4 + j + 4] = m.data[1];
-    data[i * 4 + j + 8] = m.data[2];
-
-    return *this;
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 2> slice_3x2(int i = 0, int j = 0) const {
-    Matrix<Element, 3, 2> m;
-    
-    m.data[0] = data[i * 4 + j + 0];
-    m.data[1] = data[i * 4 + j + 1];
-    m.data[2] = data[i * 4 + j + 4];
-    m.data[3] = data[i * 4 + j + 5];
-    m.data[4] = data[i * 4 + j + 8];
-    m.data[5] = data[i * 4 + j + 9];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_3x2(Matrix<Element, 3, 2> const &m, int i = 0, int j = 0) {
-    
-    data[i * 4 + j + 0] = m.data[0];
-    data[i * 4 + j + 1] = m.data[1];
-    data[i * 4 + j + 4] = m.data[2];
-    data[i * 4 + j + 5] = m.data[3];
-    data[i * 4 + j + 8] = m.data[4];
-    data[i * 4 + j + 9] = m.data[5];
-
-    return *this;
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 3> slice_3x3(int i = 0, int j = 0) const {
-    Matrix<Element, 3, 3> m;
-    
-    m.data[0] = data[i * 4 + j + 0];
-    m.data[1] = data[i * 4 + j + 1];
-    m.data[2] = data[i * 4 + j + 2];
-    m.data[3] = data[i * 4 + j + 4];
-    m.data[4] = data[i * 4 + j + 5];
-    m.data[5] = data[i * 4 + j + 6];
-    m.data[6] = data[i * 4 + j + 8];
-    m.data[7] = data[i * 4 + j + 9];
-    m.data[8] = data[i * 4 + j + 10];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_3x3(Matrix<Element, 3, 3> const &m, int i = 0, int j = 0) {
-    
-    data[i * 4 + j + 0] = m.data[0];
-    data[i * 4 + j + 1] = m.data[1];
-    data[i * 4 + j + 2] = m.data[2];
-    data[i * 4 + j + 4] = m.data[3];
-    data[i * 4 + j + 5] = m.data[4];
-    data[i * 4 + j + 6] = m.data[5];
-    data[i * 4 + j + 8] = m.data[6];
-    data[i * 4 + j + 9] = m.data[7];
-    data[i * 4 + j + 10] = m.data[8];
-
-    return *this;
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 3, 4> slice_3x4(int i = 0, int j = 0) const {
-    Matrix<Element, 3, 4> m;
-    
-    m.data[0] = data[i * 4 + j + 0];
-    m.data[1] = data[i * 4 + j + 1];
-    m.data[2] = data[i * 4 + j + 2];
-    m.data[3] = data[i * 4 + j + 3];
-    m.data[4] = data[i * 4 + j + 4];
-    m.data[5] = data[i * 4 + j + 5];
-    m.data[6] = data[i * 4 + j + 6];
-    m.data[7] = data[i * 4 + j + 7];
-    m.data[8] = data[i * 4 + j + 8];
-    m.data[9] = data[i * 4 + j + 9];
-    m.data[10] = data[i * 4 + j + 10];
-    m.data[11] = data[i * 4 + j + 11];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_3x4(Matrix<Element, 3, 4> const &m, int i = 0, int j = 0) {
-    
-    data[i * 4 + j + 0] = m.data[0];
-    data[i * 4 + j + 1] = m.data[1];
-    data[i * 4 + j + 2] = m.data[2];
-    data[i * 4 + j + 3] = m.data[3];
-    data[i * 4 + j + 4] = m.data[4];
-    data[i * 4 + j + 5] = m.data[5];
-    data[i * 4 + j + 6] = m.data[6];
-    data[i * 4 + j + 7] = m.data[7];
-    data[i * 4 + j + 8] = m.data[8];
-    data[i * 4 + j + 9] = m.data[9];
-    data[i * 4 + j + 10] = m.data[10];
-    data[i * 4 + j + 11] = m.data[11];
-
-    return *this;
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 1> slice_4x1(int i = 0, int j = 0) const {
-    Matrix<Element, 4, 1> m;
-    
-    m.data[0] = data[i * 4 + j + 0];
-    m.data[1] = data[i * 4 + j + 4];
-    m.data[2] = data[i * 4 + j + 8];
-    m.data[3] = data[i * 4 + j + 12];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_4x1(Matrix<Element, 4, 1> const &m, int i = 0, int j = 0) {
-    
-    data[i * 4 + j + 0] = m.data[0];
-    data[i * 4 + j + 4] = m.data[1];
-    data[i * 4 + j + 8] = m.data[2];
-    data[i * 4 + j + 12] = m.data[3];
-
-    return *this;
-  }
-    
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 1> column(int j) const {
-    return slice_4x1(0, j);
-  }
-
-  CUTLASS_HOST_DEVICE
-  Matrix &set_column(Matrix<Element, 4, 1> const &v, int j =0) {
-    return set_slice_4x1(v, 0, j);
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 2> slice_4x2(int i = 0, int j = 0) const {
-    Matrix<Element, 4, 2> m;
-    
-    m.data[0] = data[i * 4 + j + 0];
-    m.data[1] = data[i * 4 + j + 1];
-    m.data[2] = data[i * 4 + j + 4];
-    m.data[3] = data[i * 4 + j + 5];
-    m.data[4] = data[i * 4 + j + 8];
-    m.data[5] = data[i * 4 + j + 9];
-    m.data[6] = data[i * 4 + j + 12];
-    m.data[7] = data[i * 4 + j + 13];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_4x2(Matrix<Element, 4, 2> const &m, int i = 0, int j = 0) {
-    
-    data[i * 4 + j + 0] = m.data[0];
-    data[i * 4 + j + 1] = m.data[1];
-    data[i * 4 + j + 4] = m.data[2];
-    data[i * 4 + j + 5] = m.data[3];
-    data[i * 4 + j + 8] = m.data[4];
-    data[i * 4 + j + 9] = m.data[5];
-    data[i * 4 + j + 12] = m.data[6];
-    data[i * 4 + j + 13] = m.data[7];
-
-    return *this;
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 3> slice_4x3(int i = 0, int j = 0) const {
-    Matrix<Element, 4, 3> m;
-    
-    m.data[0] = data[i * 4 + j + 0];
-    m.data[1] = data[i * 4 + j + 1];
-    m.data[2] = data[i * 4 + j + 2];
-    m.data[3] = data[i * 4 + j + 4];
-    m.data[4] = data[i * 4 + j + 5];
-    m.data[5] = data[i * 4 + j + 6];
-    m.data[6] = data[i * 4 + j + 8];
-    m.data[7] = data[i * 4 + j + 9];
-    m.data[8] = data[i * 4 + j + 10];
-    m.data[9] = data[i * 4 + j + 12];
-    m.data[10] = data[i * 4 + j + 13];
-    m.data[11] = data[i * 4 + j + 14];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_4x3(Matrix<Element, 4, 3> const &m, int i = 0, int j = 0) {
-    
-    data[i * 4 + j + 0] = m.data[0];
-    data[i * 4 + j + 1] = m.data[1];
-    data[i * 4 + j + 2] = m.data[2];
-    data[i * 4 + j + 4] = m.data[3];
-    data[i * 4 + j + 5] = m.data[4];
-    data[i * 4 + j + 6] = m.data[5];
-    data[i * 4 + j + 8] = m.data[6];
-    data[i * 4 + j + 9] = m.data[7];
-    data[i * 4 + j + 10] = m.data[8];
-    data[i * 4 + j + 12] = m.data[9];
-    data[i * 4 + j + 13] = m.data[10];
-    data[i * 4 + j + 14] = m.data[11];
-
-    return *this;
-  }
-    
-  /// Gets a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 4> slice_4x4(int i = 0, int j = 0) const {
-    Matrix<Element, 4, 4> m;
-    
-    m.data[0] = data[i * 4 + j + 0];
-    m.data[1] = data[i * 4 + j + 1];
-    m.data[2] = data[i * 4 + j + 2];
-    m.data[3] = data[i * 4 + j + 3];
-    m.data[4] = data[i * 4 + j + 4];
-    m.data[5] = data[i * 4 + j + 5];
-    m.data[6] = data[i * 4 + j + 6];
-    m.data[7] = data[i * 4 + j + 7];
-    m.data[8] = data[i * 4 + j + 8];
-    m.data[9] = data[i * 4 + j + 9];
-    m.data[10] = data[i * 4 + j + 10];
-    m.data[11] = data[i * 4 + j + 11];
-    m.data[12] = data[i * 4 + j + 12];
-    m.data[13] = data[i * 4 + j + 13];
-    m.data[14] = data[i * 4 + j + 14];
-    m.data[15] = data[i * 4 + j + 15];
-
-    return m;
-  }
-
-  /// Overwrites a submatrix with optional offset
-  CUTLASS_HOST_DEVICE
-  Matrix & set_slice_4x4(Matrix<Element, 4, 4> const &m, int i = 0, int j = 0) {
-    
-    data[i * 4 + j + 0] = m.data[0];
-    data[i * 4 + j + 1] = m.data[1];
-    data[i * 4 + j + 2] = m.data[2];
-    data[i * 4 + j + 3] = m.data[3];
-    data[i * 4 + j + 4] = m.data[4];
-    data[i * 4 + j + 5] = m.data[5];
-    data[i * 4 + j + 6] = m.data[6];
-    data[i * 4 + j + 7] = m.data[7];
-    data[i * 4 + j + 8] = m.data[8];
-    data[i * 4 + j + 9] = m.data[9];
-    data[i * 4 + j + 10] = m.data[10];
-    data[i * 4 + j + 11] = m.data[11];
-    data[i * 4 + j + 12] = m.data[12];
-    data[i * 4 + j + 13] = m.data[13];
-    data[i * 4 + j + 14] = m.data[14];
-    data[i * 4 + j + 15] = m.data[15];
-
-    return *this;
-  }
-    
-  /// Forms a 4-by-4 matrix by horizontally concatenating a 4-by-1 matrix with a 4-by-3 matrix
-  CUTLASS_HOST_DEVICE
-  static Matrix hcat(Matrix<Element, 4, 1> const & lhs, Matrix<Element, 4, 3> const & rhs) {
-    return Matrix(
-      lhs.at(0, 0), rhs.at(0, 0), rhs.at(0, 1), rhs.at(0, 2)
-      , lhs.at(1, 0), rhs.at(1, 0), rhs.at(1, 1), rhs.at(1, 2)
-      , lhs.at(2, 0), rhs.at(2, 0), rhs.at(2, 1), rhs.at(2, 2)
-      , lhs.at(3, 0), rhs.at(3, 0), rhs.at(3, 1), rhs.at(3, 2));
-  }
-  
-  /// Forms a 4-by-4 matrix by horizontally concatenating a 4-by-2 matrix with a 4-by-2 matrix
-  CUTLASS_HOST_DEVICE
-  static Matrix hcat(Matrix<Element, 4, 2> const & lhs, Matrix<Element, 4, 2> const & rhs) {
-    return Matrix(
-      lhs.at(0, 0), lhs.at(0, 1), rhs.at(0, 0), rhs.at(0, 1)
-      , lhs.at(1, 0), lhs.at(1, 1), rhs.at(1, 0), rhs.at(1, 1)
-      , lhs.at(2, 0), lhs.at(2, 1), rhs.at(2, 0), rhs.at(2, 1)
-      , lhs.at(3, 0), lhs.at(3, 1), rhs.at(3, 0), rhs.at(3, 1));
-  }
-  
-  /// Forms a 4-by-4 matrix by horizontally concatenating a 4-by-3 matrix with a 4-by-1 matrix
-  CUTLASS_HOST_DEVICE
-  static Matrix hcat(Matrix<Element, 4, 3> const & lhs, Matrix<Element, 4, 1> const & rhs) {
-    return Matrix(
-      lhs.at(0, 0), lhs.at(0, 1), lhs.at(0, 2), rhs.at(0, 0)
-      , lhs.at(1, 0), lhs.at(1, 1), lhs.at(1, 2), rhs.at(1, 0)
-      , lhs.at(2, 0), lhs.at(2, 1), lhs.at(2, 2), rhs.at(2, 0)
-      , lhs.at(3, 0), lhs.at(3, 1), lhs.at(3, 2), rhs.at(3, 0));
-  }
-  
-  /// Forms a 4-by-4 matrix by vertically concatenating a 1-by-4 matrix with a 3-by-4 matrix
-  CUTLASS_HOST_DEVICE
-  static Matrix vcat(Matrix<Element, 1, 4> const & upper, Matrix<Element, 3, 4> const & lower) {
-    return Matrix(
-      upper.at(0, 0), upper.at(0, 1), upper.at(0, 2), upper.at(0, 3)
-      , lower.at(0, 0), lower.at(0, 1), lower.at(0, 2), lower.at(0, 3)
-      , lower.at(1, 0), lower.at(1, 1), lower.at(1, 2), lower.at(1, 3)
-      , lower.at(2, 0), lower.at(2, 1), lower.at(2, 2), lower.at(2, 3));
-  }
-  
-  /// Forms a 4-by-4 matrix by vertically concatenating a 2-by-4 matrix with a 2-by-4 matrix
-  CUTLASS_HOST_DEVICE
-  static Matrix vcat(Matrix<Element, 2, 4> const & upper, Matrix<Element, 2, 4> const & lower) {
-    return Matrix(
-      upper.at(0, 0), upper.at(0, 1), upper.at(0, 2), upper.at(0, 3)
-      , upper.at(1, 0), upper.at(1, 1), upper.at(1, 2), upper.at(1, 3)
-      , lower.at(0, 0), lower.at(0, 1), lower.at(0, 2), lower.at(0, 3)
-      , lower.at(1, 0), lower.at(1, 1), lower.at(1, 2), lower.at(1, 3));
-  }
-  
-  /// Forms a 4-by-4 matrix by vertically concatenating a 3-by-4 matrix with a 1-by-4 matrix
-  CUTLASS_HOST_DEVICE
-  static Matrix vcat(Matrix<Element, 3, 4> const & upper, Matrix<Element, 1, 4> const & lower) {
-    return Matrix(
-      upper.at(0, 0), upper.at(0, 1), upper.at(0, 2), upper.at(0, 3)
-      , upper.at(1, 0), upper.at(1, 1), upper.at(1, 2), upper.at(1, 3)
-      , upper.at(2, 0), upper.at(2, 1), upper.at(2, 2), upper.at(2, 3)
-      , lower.at(0, 0), lower.at(0, 1), lower.at(0, 2), lower.at(0, 3));
-  }
-  
-  /// Forms a 4-by-4 matrix by concatenating four components
-  CUTLASS_HOST_DEVICE
-  static Matrix block(
-    Element                         A, Matrix<Element, 1, 3> const & B,
-    Matrix<Element, 3, 1> const & C, Matrix<Element, 3, 3> const & D) {
-    return Matrix(
-      A, B.at(0, 0), B.at(0, 1), B.at(0, 2)
-      , C.at(0, 0), D.at(0, 0), D.at(0, 1), D.at(0, 2)
-      , C.at(1, 0), D.at(1, 0), D.at(1, 1), D.at(1, 2)
-      , C.at(2, 0), D.at(2, 0), D.at(2, 1), D.at(2, 2)
-    );
-  }
-  
-  /// Forms a 4-by-4 matrix by concatenating four components
-  CUTLASS_HOST_DEVICE
-  static Matrix block(
-    Matrix<Element, 1, 2> const & A, Matrix<Element, 1, 2> const & B,
-    Matrix<Element, 3, 2> const & C, Matrix<Element, 3, 2> const & D) {
-    return Matrix(
-      A.at(0, 0), A.at(0, 1), B.at(0, 0), B.at(0, 1)
-      , C.at(0, 0), C.at(0, 1), D.at(0, 0), D.at(0, 1)
-      , C.at(1, 0), C.at(1, 1), D.at(1, 0), D.at(1, 1)
-      , C.at(2, 0), C.at(2, 1), D.at(2, 0), D.at(2, 1)
-    );
-  }
-  
-  /// Forms a 4-by-4 matrix by concatenating four components
-  CUTLASS_HOST_DEVICE
-  static Matrix block(
-    Matrix<Element, 1, 3> const & A, Element                         B,
-    Matrix<Element, 3, 3> const & C, Matrix<Element, 3, 1> const & D) {
-    return Matrix(
-      A.at(0, 0), A.at(0, 1), A.at(0, 2), B
-      , C.at(0, 0), C.at(0, 1), C.at(0, 2), D.at(0, 0)
-      , C.at(1, 0), C.at(1, 1), C.at(1, 2), D.at(1, 0)
-      , C.at(2, 0), C.at(2, 1), C.at(2, 2), D.at(2, 0)
-    );
-  }
-  
-  /// Forms a 4-by-4 matrix by concatenating four components
-  CUTLASS_HOST_DEVICE
-  static Matrix block(
-    Matrix<Element, 2, 1> const & A, Matrix<Element, 2, 3> const & B,
-    Matrix<Element, 2, 1> const & C, Matrix<Element, 2, 3> const & D) {
-    return Matrix(
-      A.at(0, 0), B.at(0, 0), B.at(0, 1), B.at(0, 2)
-      , A.at(1, 0), B.at(1, 0), B.at(1, 1), B.at(1, 2)
-      , C.at(0, 0), D.at(0, 0), D.at(0, 1), D.at(0, 2)
-      , C.at(1, 0), D.at(1, 0), D.at(1, 1), D.at(1, 2)
-    );
-  }
-  
-  /// Forms a 4-by-4 matrix by concatenating four components
-  CUTLASS_HOST_DEVICE
-  static Matrix block(
-    Matrix<Element, 2, 2> const & A, Matrix<Element, 2, 2> const & B,
-    Matrix<Element, 2, 2> const & C, Matrix<Element, 2, 2> const & D) {
-    return Matrix(
-      A.at(0, 0), A.at(0, 1), B.at(0, 0), B.at(0, 1)
-      , A.at(1, 0), A.at(1, 1), B.at(1, 0), B.at(1, 1)
-      , C.at(0, 0), C.at(0, 1), D.at(0, 0), D.at(0, 1)
-      , C.at(1, 0), C.at(1, 1), D.at(1, 0), D.at(1, 1)
-    );
-  }
-  
-  /// Forms a 4-by-4 matrix by concatenating four components
-  CUTLASS_HOST_DEVICE
-  static Matrix block(
-    Matrix<Element, 2, 3> const & A, Matrix<Element, 2, 1> const & B,
-    Matrix<Element, 2, 3> const & C, Matrix<Element, 2, 1> const & D) {
-    return Matrix(
-      A.at(0, 0), A.at(0, 1), A.at(0, 2), B.at(0, 0)
-      , A.at(1, 0), A.at(1, 1), A.at(1, 2), B.at(1, 0)
-      , C.at(0, 0), C.at(0, 1), C.at(0, 2), D.at(0, 0)
-      , C.at(1, 0), C.at(1, 1), C.at(1, 2), D.at(1, 0)
-    );
-  }
-  
-  /// Forms a 4-by-4 matrix by concatenating four components
-  CUTLASS_HOST_DEVICE
-  static Matrix block(
-    Matrix<Element, 3, 1> const & A, Matrix<Element, 3, 3> const & B,
-    Element                         C, Matrix<Element, 1, 3> const & D) {
-    return Matrix(
-      A.at(0, 0), B.at(0, 0), B.at(0, 1), B.at(0, 2)
-      , A.at(1, 0), B.at(1, 0), B.at(1, 1), B.at(1, 2)
-      , A.at(2, 0), B.at(2, 0), B.at(2, 1), B.at(2, 2)
-      , C, D.at(0, 0), D.at(0, 1), D.at(0, 2)
-    );
-  }
-  
-  /// Forms a 4-by-4 matrix by concatenating four components
-  CUTLASS_HOST_DEVICE
-  static Matrix block(
-    Matrix<Element, 3, 2> const & A, Matrix<Element, 3, 2> const & B,
-    Matrix<Element, 1, 2> const & C, Matrix<Element, 1, 2> const & D) {
-    return Matrix(
-      A.at(0, 0), A.at(0, 1), B.at(0, 0), B.at(0, 1)
-      , A.at(1, 0), A.at(1, 1), B.at(1, 0), B.at(1, 1)
-      , A.at(2, 0), A.at(2, 1), B.at(2, 0), B.at(2, 1)
-      , C.at(0, 0), C.at(0, 1), D.at(0, 0), D.at(0, 1)
-    );
-  }
-  
-  /// Forms a 4-by-4 matrix by concatenating four components
-  CUTLASS_HOST_DEVICE
-  static Matrix block(
-    Matrix<Element, 3, 3> const & A, Matrix<Element, 3, 1> const & B,
-    Matrix<Element, 1, 3> const & C, Element                         D) {
-    return Matrix(
-      A.at(0, 0), A.at(0, 1), A.at(0, 2), B.at(0, 0)
-      , A.at(1, 0), A.at(1, 1), A.at(1, 2), B.at(1, 0)
-      , A.at(2, 0), A.at(2, 1), A.at(2, 2), B.at(2, 0)
-      , C.at(0, 0), C.at(0, 1), C.at(0, 2), D
-    );
-  }
-  
-  /// Elementwise add operator (4-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix add(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] + rhs.data[0];
-    result.data[1] = data[1] + rhs.data[1];
-    result.data[2] = data[2] + rhs.data[2];
-    result.data[3] = data[3] + rhs.data[3];
-
-    result.data[4] = data[4] + rhs.data[4];
-    result.data[5] = data[5] + rhs.data[5];
-    result.data[6] = data[6] + rhs.data[6];
-    result.data[7] = data[7] + rhs.data[7];
-
-    result.data[8] = data[8] + rhs.data[8];
-    result.data[9] = data[9] + rhs.data[9];
-    result.data[10] = data[10] + rhs.data[10];
-    result.data[11] = data[11] + rhs.data[11];
-
-    result.data[12] = data[12] + rhs.data[12];
-    result.data[13] = data[13] + rhs.data[13];
-    result.data[14] = data[14] + rhs.data[14];
-    result.data[15] = data[15] + rhs.data[15];
-
-    return result;
-  }
-      
-  /// Elementwise add operator (4-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix operator +(Matrix const &rhs) const {
-    return add(rhs);
-  }
-
-  /// Elementwise add operator (4-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator +=(Matrix const &rhs) {
-    
-    data[0] += rhs.data[0];
-    data[1] += rhs.data[1];
-    data[2] += rhs.data[2];
-    data[3] += rhs.data[3];
-
-    data[4] += rhs.data[4];
-    data[5] += rhs.data[5];
-    data[6] += rhs.data[6];
-    data[7] += rhs.data[7];
-
-    data[8] += rhs.data[8];
-    data[9] += rhs.data[9];
-    data[10] += rhs.data[10];
-    data[11] += rhs.data[11];
-
-    data[12] += rhs.data[12];
-    data[13] += rhs.data[13];
-    data[14] += rhs.data[14];
-    data[15] += rhs.data[15];
-
-    return *this;
-  }
-        
-  /// Elementwise subtract operator (4-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix subtract(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] - rhs.data[0];
-    result.data[1] = data[1] - rhs.data[1];
-    result.data[2] = data[2] - rhs.data[2];
-    result.data[3] = data[3] - rhs.data[3];
-
-    result.data[4] = data[4] - rhs.data[4];
-    result.data[5] = data[5] - rhs.data[5];
-    result.data[6] = data[6] - rhs.data[6];
-    result.data[7] = data[7] - rhs.data[7];
-
-    result.data[8] = data[8] - rhs.data[8];
-    result.data[9] = data[9] - rhs.data[9];
-    result.data[10] = data[10] - rhs.data[10];
-    result.data[11] = data[11] - rhs.data[11];
-
-    result.data[12] = data[12] - rhs.data[12];
-    result.data[13] = data[13] - rhs.data[13];
-    result.data[14] = data[14] - rhs.data[14];
-    result.data[15] = data[15] - rhs.data[15];
-
-    return result;
-  }
-      
-  /// Elementwise subtract operator (4-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix operator -(Matrix const &rhs) const {
-    return subtract(rhs);
-  }
-
-  /// Elementwise subtract operator (4-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator -=(Matrix const &rhs) {
-    
-    data[0] -= rhs.data[0];
-    data[1] -= rhs.data[1];
-    data[2] -= rhs.data[2];
-    data[3] -= rhs.data[3];
-
-    data[4] -= rhs.data[4];
-    data[5] -= rhs.data[5];
-    data[6] -= rhs.data[6];
-    data[7] -= rhs.data[7];
-
-    data[8] -= rhs.data[8];
-    data[9] -= rhs.data[9];
-    data[10] -= rhs.data[10];
-    data[11] -= rhs.data[11];
-
-    data[12] -= rhs.data[12];
-    data[13] -= rhs.data[13];
-    data[14] -= rhs.data[14];
-    data[15] -= rhs.data[15];
-
-    return *this;
-  }
-        
-  /// Elementwise multiply operator (4-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix multiply(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] * rhs.data[0];
-    result.data[1] = data[1] * rhs.data[1];
-    result.data[2] = data[2] * rhs.data[2];
-    result.data[3] = data[3] * rhs.data[3];
-
-    result.data[4] = data[4] * rhs.data[4];
-    result.data[5] = data[5] * rhs.data[5];
-    result.data[6] = data[6] * rhs.data[6];
-    result.data[7] = data[7] * rhs.data[7];
-
-    result.data[8] = data[8] * rhs.data[8];
-    result.data[9] = data[9] * rhs.data[9];
-    result.data[10] = data[10] * rhs.data[10];
-    result.data[11] = data[11] * rhs.data[11];
-
-    result.data[12] = data[12] * rhs.data[12];
-    result.data[13] = data[13] * rhs.data[13];
-    result.data[14] = data[14] * rhs.data[14];
-    result.data[15] = data[15] * rhs.data[15];
-
-    return result;
-  }
-      
-  /// Scalar multiply operator (4-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix multiply(Element const &s) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] * s;
-    result.data[1] = data[1] * s;
-    result.data[2] = data[2] * s;
-    result.data[3] = data[3] * s;
-
-    result.data[4] = data[4] * s;
-    result.data[5] = data[5] * s;
-    result.data[6] = data[6] * s;
-    result.data[7] = data[7] * s;
-
-    result.data[8] = data[8] * s;
-    result.data[9] = data[9] * s;
-    result.data[10] = data[10] * s;
-    result.data[11] = data[11] * s;
-
-    result.data[12] = data[12] * s;
-    result.data[13] = data[13] * s;
-    result.data[14] = data[14] * s;
-    result.data[15] = data[15] * s;
-
-    return result;
-  }
-
-  /// Scalar multiply operator (4-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix operator *(Element const &s) const {
-    return multiply(s);
-  }
-
-  /// Scalar multiply operator (4-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator *=(Element const &s) {
-    
-    data[0] *= s;
-    data[1] *= s;
-    data[2] *= s;
-    data[3] *= s;
-
-    data[4] *= s;
-    data[5] *= s;
-    data[6] *= s;
-    data[7] *= s;
-
-    data[8] *= s;
-    data[9] *= s;
-    data[10] *= s;
-    data[11] *= s;
-
-    data[12] *= s;
-    data[13] *= s;
-    data[14] *= s;
-    data[15] *= s;
-
-    return *this;
-  }
-        
-  /// Elementwise divide operator (4-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix divide(Matrix const &rhs) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] / rhs.data[0];
-    result.data[1] = data[1] / rhs.data[1];
-    result.data[2] = data[2] / rhs.data[2];
-    result.data[3] = data[3] / rhs.data[3];
-
-    result.data[4] = data[4] / rhs.data[4];
-    result.data[5] = data[5] / rhs.data[5];
-    result.data[6] = data[6] / rhs.data[6];
-    result.data[7] = data[7] / rhs.data[7];
-
-    result.data[8] = data[8] / rhs.data[8];
-    result.data[9] = data[9] / rhs.data[9];
-    result.data[10] = data[10] / rhs.data[10];
-    result.data[11] = data[11] / rhs.data[11];
-
-    result.data[12] = data[12] / rhs.data[12];
-    result.data[13] = data[13] / rhs.data[13];
-    result.data[14] = data[14] / rhs.data[14];
-    result.data[15] = data[15] / rhs.data[15];
-
-    return result;
-  }
-      
-  /// Scalar divide operator (4-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix divide(Element const &s) const {
-
-    Matrix result;
-    
-    result.data[0] = data[0] / s;
-    result.data[1] = data[1] / s;
-    result.data[2] = data[2] / s;
-    result.data[3] = data[3] / s;
-
-    result.data[4] = data[4] / s;
-    result.data[5] = data[5] / s;
-    result.data[6] = data[6] / s;
-    result.data[7] = data[7] / s;
-
-    result.data[8] = data[8] / s;
-    result.data[9] = data[9] / s;
-    result.data[10] = data[10] / s;
-    result.data[11] = data[11] / s;
-
-    result.data[12] = data[12] / s;
-    result.data[13] = data[13] / s;
-    result.data[14] = data[14] / s;
-    result.data[15] = data[15] / s;
-
-    return result;
-  }
-
-  /// Scalar divide operator (4-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix operator /(Element const &s) const {
-    return divide(s);
-  }
-
-  /// Scalar divide operator (4-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator /=(Element const &s) {
-    
-    data[0] /= s;
-    data[1] /= s;
-    data[2] /= s;
-    data[3] /= s;
-
-    data[4] /= s;
-    data[5] /= s;
-    data[6] /= s;
-    data[7] /= s;
-
-    data[8] /= s;
-    data[9] /= s;
-    data[10] /= s;
-    data[11] /= s;
-
-    data[12] /= s;
-    data[13] /= s;
-    data[14] /= s;
-    data[15] /= s;
-
-    return *this;
-  }
-        
-  /// Elementwise divide operator (4-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix operator /(Matrix const &rhs) const {
-    return divide(rhs);
-  }
-
-  /// Elementwise divide operator (4-by-4)
-  CUTLASS_HOST_DEVICE
-  Matrix & operator /=(Matrix const &rhs) {
-    
-    data[0] /= rhs.data[0];
-    data[1] /= rhs.data[1];
-    data[2] /= rhs.data[2];
-    data[3] /= rhs.data[3];
-
-    data[4] /= rhs.data[4];
-    data[5] /= rhs.data[5];
-    data[6] /= rhs.data[6];
-    data[7] /= rhs.data[7];
-
-    data[8] /= rhs.data[8];
-    data[9] /= rhs.data[9];
-    data[10] /= rhs.data[10];
-    data[11] /= rhs.data[11];
-
-    data[12] /= rhs.data[12];
-    data[13] /= rhs.data[13];
-    data[14] /= rhs.data[14];
-    data[15] /= rhs.data[15];
-
-    return *this;
-  }
-        
-  /// Negates each element of the matrix
-  CUTLASS_HOST_DEVICE
-  Matrix operator-() const {
-    Matrix m;
-    
-    m.data[0] = -m.data[0];
-    m.data[1] = -m.data[1];
-    m.data[2] = -m.data[2];
-    m.data[3] = -m.data[3];
-    m.data[4] = -m.data[4];
-    m.data[5] = -m.data[5];
-    m.data[6] = -m.data[6];
-    m.data[7] = -m.data[7];
-    m.data[8] = -m.data[8];
-    m.data[9] = -m.data[9];
-    m.data[10] = -m.data[10];
-    m.data[11] = -m.data[11];
-    m.data[12] = -m.data[12];
-    m.data[13] = -m.data[13];
-    m.data[14] = -m.data[14];
-    m.data[15] = -m.data[15];
-
-    return m;
-  }
-  
-  /// Matrix product of size 4-by-1-by-4
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 1> product(
-    Matrix<Element, 4, 1> const &rhs,
-    Matrix<Element, 4, 1> accum = Matrix<Element, 4, 1>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[4] * rhs.data[0];
-    accum.data[2] += data[8] * rhs.data[0];
-    accum.data[3] += data[12] * rhs.data[0];
-
-    // k=1
-    accum.data[0] += data[1] * rhs.data[1];
-    accum.data[1] += data[5] * rhs.data[1];
-    accum.data[2] += data[9] * rhs.data[1];
-    accum.data[3] += data[13] * rhs.data[1];
-
-    // k=2
-    accum.data[0] += data[2] * rhs.data[2];
-    accum.data[1] += data[6] * rhs.data[2];
-    accum.data[2] += data[10] * rhs.data[2];
-    accum.data[3] += data[14] * rhs.data[2];
-
-    // k=3
-    accum.data[0] += data[3] * rhs.data[3];
-    accum.data[1] += data[7] * rhs.data[3];
-    accum.data[2] += data[11] * rhs.data[3];
-    accum.data[3] += data[15] * rhs.data[3];
-
-    return accum;
-  }
-
-  /// Matrix product of size 4-by-1-by-4
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 1> operator*(Matrix<Element, 4, 1> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 4-by-2-by-4
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 2> product(
-    Matrix<Element, 4, 2> const &rhs,
-    Matrix<Element, 4, 2> accum = Matrix<Element, 4, 2>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[0] * rhs.data[1];
-    accum.data[2] += data[4] * rhs.data[0];
-    accum.data[3] += data[4] * rhs.data[1];
-    accum.data[4] += data[8] * rhs.data[0];
-    accum.data[5] += data[8] * rhs.data[1];
-    accum.data[6] += data[12] * rhs.data[0];
-    accum.data[7] += data[12] * rhs.data[1];
-
-    // k=1
-    accum.data[0] += data[1] * rhs.data[2];
-    accum.data[1] += data[1] * rhs.data[3];
-    accum.data[2] += data[5] * rhs.data[2];
-    accum.data[3] += data[5] * rhs.data[3];
-    accum.data[4] += data[9] * rhs.data[2];
-    accum.data[5] += data[9] * rhs.data[3];
-    accum.data[6] += data[13] * rhs.data[2];
-    accum.data[7] += data[13] * rhs.data[3];
-
-    // k=2
-    accum.data[0] += data[2] * rhs.data[4];
-    accum.data[1] += data[2] * rhs.data[5];
-    accum.data[2] += data[6] * rhs.data[4];
-    accum.data[3] += data[6] * rhs.data[5];
-    accum.data[4] += data[10] * rhs.data[4];
-    accum.data[5] += data[10] * rhs.data[5];
-    accum.data[6] += data[14] * rhs.data[4];
-    accum.data[7] += data[14] * rhs.data[5];
-
-    // k=3
-    accum.data[0] += data[3] * rhs.data[6];
-    accum.data[1] += data[3] * rhs.data[7];
-    accum.data[2] += data[7] * rhs.data[6];
-    accum.data[3] += data[7] * rhs.data[7];
-    accum.data[4] += data[11] * rhs.data[6];
-    accum.data[5] += data[11] * rhs.data[7];
-    accum.data[6] += data[15] * rhs.data[6];
-    accum.data[7] += data[15] * rhs.data[7];
-
-    return accum;
-  }
-
-  /// Matrix product of size 4-by-2-by-4
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 2> operator*(Matrix<Element, 4, 2> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 4-by-3-by-4
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 3> product(
-    Matrix<Element, 4, 3> const &rhs,
-    Matrix<Element, 4, 3> accum = Matrix<Element, 4, 3>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[0] * rhs.data[1];
-    accum.data[2] += data[0] * rhs.data[2];
-    accum.data[3] += data[4] * rhs.data[0];
-    accum.data[4] += data[4] * rhs.data[1];
-    accum.data[5] += data[4] * rhs.data[2];
-    accum.data[6] += data[8] * rhs.data[0];
-    accum.data[7] += data[8] * rhs.data[1];
-    accum.data[8] += data[8] * rhs.data[2];
-    accum.data[9] += data[12] * rhs.data[0];
-    accum.data[10] += data[12] * rhs.data[1];
-    accum.data[11] += data[12] * rhs.data[2];
-
-    // k=1
-    accum.data[0] += data[1] * rhs.data[3];
-    accum.data[1] += data[1] * rhs.data[4];
-    accum.data[2] += data[1] * rhs.data[5];
-    accum.data[3] += data[5] * rhs.data[3];
-    accum.data[4] += data[5] * rhs.data[4];
-    accum.data[5] += data[5] * rhs.data[5];
-    accum.data[6] += data[9] * rhs.data[3];
-    accum.data[7] += data[9] * rhs.data[4];
-    accum.data[8] += data[9] * rhs.data[5];
-    accum.data[9] += data[13] * rhs.data[3];
-    accum.data[10] += data[13] * rhs.data[4];
-    accum.data[11] += data[13] * rhs.data[5];
-
-    // k=2
-    accum.data[0] += data[2] * rhs.data[6];
-    accum.data[1] += data[2] * rhs.data[7];
-    accum.data[2] += data[2] * rhs.data[8];
-    accum.data[3] += data[6] * rhs.data[6];
-    accum.data[4] += data[6] * rhs.data[7];
-    accum.data[5] += data[6] * rhs.data[8];
-    accum.data[6] += data[10] * rhs.data[6];
-    accum.data[7] += data[10] * rhs.data[7];
-    accum.data[8] += data[10] * rhs.data[8];
-    accum.data[9] += data[14] * rhs.data[6];
-    accum.data[10] += data[14] * rhs.data[7];
-    accum.data[11] += data[14] * rhs.data[8];
-
-    // k=3
-    accum.data[0] += data[3] * rhs.data[9];
-    accum.data[1] += data[3] * rhs.data[10];
-    accum.data[2] += data[3] * rhs.data[11];
-    accum.data[3] += data[7] * rhs.data[9];
-    accum.data[4] += data[7] * rhs.data[10];
-    accum.data[5] += data[7] * rhs.data[11];
-    accum.data[6] += data[11] * rhs.data[9];
-    accum.data[7] += data[11] * rhs.data[10];
-    accum.data[8] += data[11] * rhs.data[11];
-    accum.data[9] += data[15] * rhs.data[9];
-    accum.data[10] += data[15] * rhs.data[10];
-    accum.data[11] += data[15] * rhs.data[11];
-
-    return accum;
-  }
-
-  /// Matrix product of size 4-by-3-by-4
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 3> operator*(Matrix<Element, 4, 3> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 4-by-4-by-4
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 4> product(
-    Matrix<Element, 4, 4> const &rhs,
-    Matrix<Element, 4, 4> accum = Matrix<Element, 4, 4>()
-  ) const {
-    
-    // k=0
-    accum.data[0] += data[0] * rhs.data[0];
-    accum.data[1] += data[0] * rhs.data[1];
-    accum.data[2] += data[0] * rhs.data[2];
-    accum.data[3] += data[0] * rhs.data[3];
-    accum.data[4] += data[4] * rhs.data[0];
-    accum.data[5] += data[4] * rhs.data[1];
-    accum.data[6] += data[4] * rhs.data[2];
-    accum.data[7] += data[4] * rhs.data[3];
-    accum.data[8] += data[8] * rhs.data[0];
-    accum.data[9] += data[8] * rhs.data[1];
-    accum.data[10] += data[8] * rhs.data[2];
-    accum.data[11] += data[8] * rhs.data[3];
-    accum.data[12] += data[12] * rhs.data[0];
-    accum.data[13] += data[12] * rhs.data[1];
-    accum.data[14] += data[12] * rhs.data[2];
-    accum.data[15] += data[12] * rhs.data[3];
-
-    // k=1
-    accum.data[0] += data[1] * rhs.data[4];
-    accum.data[1] += data[1] * rhs.data[5];
-    accum.data[2] += data[1] * rhs.data[6];
-    accum.data[3] += data[1] * rhs.data[7];
-    accum.data[4] += data[5] * rhs.data[4];
-    accum.data[5] += data[5] * rhs.data[5];
-    accum.data[6] += data[5] * rhs.data[6];
-    accum.data[7] += data[5] * rhs.data[7];
-    accum.data[8] += data[9] * rhs.data[4];
-    accum.data[9] += data[9] * rhs.data[5];
-    accum.data[10] += data[9] * rhs.data[6];
-    accum.data[11] += data[9] * rhs.data[7];
-    accum.data[12] += data[13] * rhs.data[4];
-    accum.data[13] += data[13] * rhs.data[5];
-    accum.data[14] += data[13] * rhs.data[6];
-    accum.data[15] += data[13] * rhs.data[7];
-
-    // k=2
-    accum.data[0] += data[2] * rhs.data[8];
-    accum.data[1] += data[2] * rhs.data[9];
-    accum.data[2] += data[2] * rhs.data[10];
-    accum.data[3] += data[2] * rhs.data[11];
-    accum.data[4] += data[6] * rhs.data[8];
-    accum.data[5] += data[6] * rhs.data[9];
-    accum.data[6] += data[6] * rhs.data[10];
-    accum.data[7] += data[6] * rhs.data[11];
-    accum.data[8] += data[10] * rhs.data[8];
-    accum.data[9] += data[10] * rhs.data[9];
-    accum.data[10] += data[10] * rhs.data[10];
-    accum.data[11] += data[10] * rhs.data[11];
-    accum.data[12] += data[14] * rhs.data[8];
-    accum.data[13] += data[14] * rhs.data[9];
-    accum.data[14] += data[14] * rhs.data[10];
-    accum.data[15] += data[14] * rhs.data[11];
-
-    // k=3
-    accum.data[0] += data[3] * rhs.data[12];
-    accum.data[1] += data[3] * rhs.data[13];
-    accum.data[2] += data[3] * rhs.data[14];
-    accum.data[3] += data[3] * rhs.data[15];
-    accum.data[4] += data[7] * rhs.data[12];
-    accum.data[5] += data[7] * rhs.data[13];
-    accum.data[6] += data[7] * rhs.data[14];
-    accum.data[7] += data[7] * rhs.data[15];
-    accum.data[8] += data[11] * rhs.data[12];
-    accum.data[9] += data[11] * rhs.data[13];
-    accum.data[10] += data[11] * rhs.data[14];
-    accum.data[11] += data[11] * rhs.data[15];
-    accum.data[12] += data[15] * rhs.data[12];
-    accum.data[13] += data[15] * rhs.data[13];
-    accum.data[14] += data[15] * rhs.data[14];
-    accum.data[15] += data[15] * rhs.data[15];
-
-    return accum;
-  }
-
-  /// Matrix product of size 4-by-4-by-4
-  CUTLASS_HOST_DEVICE
-  Matrix<Element, 4, 4> operator*(Matrix<Element, 4, 4> const &rhs) const {
-    return product(rhs);
-  }
-  
-  /// Matrix product of size 4-by-4-by-4
-  CUTLASS_HOST_DEVICE
-  Matrix & operator*=(Matrix<Element, 4, 4> const &rhs) {
-    *this = product(rhs);
-    return *this;
-  }
-    
-  /// Returns the sum of elements
-  CUTLASS_HOST_DEVICE
-  Element sum(Element accum = Element()) const {
-    
-    accum += data[0];
-    accum += data[1];
-    accum += data[2];
-    accum += data[3];
-    accum += data[4];
-    accum += data[5];
-    accum += data[6];
-    accum += data[7];
-    accum += data[8];
-    accum += data[9];
-    accum += data[10];
-    accum += data[11];
-    accum += data[12];
-    accum += data[13];
-    accum += data[14];
-    accum += data[15];
-
-    return accum;
-  }  
-
-  /// Returns the sum of squared elements
-  CUTLASS_HOST_DEVICE
-  Element norm(Element accum = Element()) const {
-    
-    accum += data[0] * data[0];
-    accum += data[1] * data[1];
-    accum += data[2] * data[2];
-    accum += data[3] * data[3];
-    accum += data[4] * data[4];
-    accum += data[5] * data[5];
-    accum += data[6] * data[6];
-    accum += data[7] * data[7];
-    accum += data[8] * data[8];
-    accum += data[9] * data[9];
-    accum += data[10] * data[10];
-    accum += data[11] * data[11];
-    accum += data[12] * data[12];
-    accum += data[13] * data[13];
-    accum += data[14] * data[14];
-    accum += data[15] * data[15];
-
-    return accum;
-  }
-
-  /// Returns square root of the norm
-  CUTLASS_HOST_DEVICE
-  Element magnitude() const {
-    return fast_sqrt(norm());
-  }
-
-  /// Returns the sum of diagonal elements
-  CUTLASS_HOST_DEVICE
-  Element trace(Element accum = Element()) const {
-    
-    accum += data[0];
-    accum += data[5];
-    accum += data[10];
-    accum += data[15];
-
-    return accum;
-  }
-    
-  /// Returns 4-by-4 rotation matrix around the X axis
-  CUTLASS_HOST_DEVICE
-  static Matrix rotation_X(Element theta) {
-    Matrix m = identity();
-
-    Element c = fast_cos(theta);
-    Element s = fast_sin(theta);
-
-    m.at(1, 1) = c;
-    m.at(1, 2) = -s;
-    m.at(2, 1) = s;
-    m.at(2, 2) = c;
-
-    return m;
-  }
-
-  /// Returns 4-by-4 rotation matrix around the Y axis
-  CUTLASS_HOST_DEVICE
-  static Matrix rotation_Y(Element theta) {
-    Matrix m = identity();
-
-    Element c = fast_cos(theta);
-    Element s = fast_sin(theta);
-
-    m.at(0, 0) = c;
-    m.at(2, 0) = -s;
-    m.at(0, 2) = s;
-    m.at(2, 2) = c;
-
-    return m;
-  }
-
-  /// Returns 4-by-4 rotation matrix around the Z axis
-  CUTLASS_HOST_DEVICE
-  static Matrix rotation_Z(Element theta) {
-    Matrix m = Matrix::identity();
-
-    Element c = fast_cos(theta);
-    Element s = fast_sin(theta);
-
-    m.at(0, 0) = c;
-    m.at(0, 1) = -s;
-    m.at(1, 0) = s;
-    m.at(1, 1) = c;
-
-    return m;
-  }
-
-  /// Returns a 4-by-4 rotation matrix around a unit-length axis
-  CUTLASS_HOST_DEVICE
-  static Matrix rotation(Element theta, Matrix<Element, 3, 1> const &u) {
-    Element x = u.data[0];
-    Element y = u.data[1];
-    Element z = u.data[2];
-
-    Element c = fast_cos(theta);
-    Element s = fast_sin(theta);
-
-    Element one_minus_cos = Element(1) - fast_cos(theta);
-
-    Matrix m;
-
-    m.set_slice_3x3({
-      c + x * x * one_minus_cos, x * y * one_minus_cos - z * s, x * z * one_minus_cos + y * s,
-      y * x * one_minus_cos * z * s, c + y * y * one_minus_cos, y * z * one_minus_cos - x * s,
-      z * x * one_minus_cos - y * s, z * y * one_minus_cos + x * s, c + z * z * one_minus_cos
-    });
-
-    return m;
-  }
-
-  /// Returns a 4-by-4 reflection about the plane specified by the 
-  /// unit-length normal vector n_unit
-  CUTLASS_HOST_DEVICE
-  static Matrix reflection(Matrix<Element, 3, 1> const &n_unit) {
-
-    Element a = n_unit.data[0];
-    Element b = n_unit.data[1];
-    Element c = n_unit.data[2];
-
-    Matrix m = Matrix::identity();
-
-    m.set_slice_3x3({
-      Element(1) - Element(2) * a * a, Element(-2) * a * b, Element(-2) * a * c,
-      Element(-2) * a * b, Element(1) - Element(2) * b * b, Element(-2) * b * c,
-      Element(-2) * a * c, Element(-2) * b * c, Element(1) - Element(2) * c * c
-    });
-
-    return m;
-  }
-
-  /// Returns a perspective projection matrix typical of OpenGL applications
-  CUTLASS_HOST_DEVICE
-  static Matrix perspective(Element near_plane, Element far_plane, Element fovH, Element fovV) {
-    Element aspect = fovH / fovV;
-    Element f = Element(cos(fovV)) / Element(fovH);
-    Element Q = near_plane - far_plane;
-
-    return Matrix(
-      f / aspect, 0,                0,                           0,
-      0,          f,                0,                           0,
-      0,          0, (near_plane + far_plane) / Q, Element(2) * far_plane * near_plane / Q,
-      0,          0,                -1,                          0
-    );
-  }
-
-  CUTLASS_HOST_DEVICE
-  static Matrix translation(Matrix<Element, 3, 1> const &v) {
-    return Matrix(
-      1, 0, 0, v.data[0],
-      0, 1, 0, v.data[1],
-      0, 0, 1, v.data[2],
-      0, 0, 0, 1
-    );
-  }
-  
-  /// Computes the determinant of a 4-by-4 matrix
-  CUTLASS_HOST_DEVICE
-  Element determinant(Element accum = Element()) const {
-    
-    accum += at(0, 0) * Matrix<Element, 3, 3>({ at(1, 1), at(1, 2), at(1, 3), at(2, 1), at(2, 2), at(2, 3), at(3, 1), at(3, 2), at(3, 3) }).determinant();
-    accum -= at(0, 1) * Matrix<Element, 3, 3>({ at(1, 0), at(1, 2), at(1, 3), at(2, 0), at(2, 2), at(2, 3), at(3, 0), at(3, 2), at(3, 3) }).determinant();
-    accum += at(0, 2) * Matrix<Element, 3, 3>({ at(1, 0), at(1, 1), at(1, 3), at(2, 0), at(2, 1), at(2, 3), at(3, 0), at(3, 1), at(3, 3) }).determinant();
-    accum -= at(0, 3) * Matrix<Element, 3, 3>({ at(1, 0), at(1, 1), at(1, 2), at(2, 0), at(2, 1), at(2, 2), at(3, 0), at(3, 1), at(3, 2) }).determinant();
-
-    return accum;
-  }
-  
-  /// Computes the inverse of a 4-by-4 matrix (ignores the optional argument)
-  CUTLASS_HOST_DEVICE
-  Matrix inverse(Element ignore = 1) const {
-    Matrix<Element, 2, 2> B = slice_2x2(0, 2);
-    Matrix<Element, 2, 2> A = slice_2x2(0, 0);
-    Matrix<Element, 2, 2> C = slice_2x2(2, 0);
-    Matrix<Element, 2, 2> D = slice_2x2(2, 2);
-
-    Matrix<Element, 2, 2> D_inv = D.inverse();
-
-    Matrix<Element, 2, 2> E = (A - B * D_inv * C).inverse();
-
-    return Matrix::block(
-      E,              -E * B * D_inv,
-      -D_inv * C * E, D_inv + D_inv * C * E * B * D_inv
-    );
-  }
-    
-};
-
-/// Template alias for 4-by-4 matrix
-template <typename Element>
-using Matrix4x4 = Matrix<Element, 4, 4>;
-
-
-/// Free funciton to infer element type from template arguments
-template <typename Element>
-CUTLASS_HOST_DEVICE Matrix4x4<Element> make_Matrix4x4(
-    Element _0_0, Element _0_1, Element _0_2, Element _0_3, 
-    Element _1_0, Element _1_1, Element _1_2, Element _1_3, 
-    Element _2_0, Element _2_1, Element _2_2, Element _2_3, 
-    Element _3_0, Element _3_1, Element _3_2, Element _3_3
-) {
-  return Matrix4x4<Element>(
-  _0_0, _0_1, _0_2, _0_3, 
-  _1_0, _1_1, _1_2, _1_3, 
-  _2_0, _2_1, _2_2, _2_3, 
-  _3_0, _3_1, _3_2, _3_3 
-  );
-}
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Elementwise scalar multiplication
-template <typename Element, int Rows, int Columns>
-CUTLASS_HOST_DEVICE
-Matrix<Element, Rows, Columns> operator*(Element s, Matrix<Element, Rows, Columns> const &rhs) {
-  return rhs.multiply(s);
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/matrix_coord.h b/lightllm-kernel/cutlass/include/cutlass/matrix_coord.h
deleted file mode 100755
index 719575d59..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/matrix_coord.h
+++ /dev/null
@@ -1,164 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines a canonical coordinate for rank=2 matrices offering named indices.
-*/
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/coord.h"
-
-namespace cutlass {
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// MatrixCoord wraps Coord<2, int> to provide a helper for accessing named dimensions. Classes
-/// expecting a coordinate in the rank=2 index space of a matrix should use MatrixCoord.
-struct MatrixCoord : public Coord<2, int> {
-
-public:
-
-  /// Integer-valued index
-  using Index = int;
-
-  /// Base type is a Coord of rank=2
-  using Base = Coord<2, Index>;
-
-  /// LongIndex type
-  using LongIndex = typename Base::LongIndex;
-
-private:
-
-  /// Rows dimension
-  static int const kRow = 0;
-
-  /// Columns dimension
-  static int const kColumn = 1;
-
-public:
-
-  //
-  // Methods
-  //
-
-  /// Default ctor
-  CUTLASS_HOST_DEVICE
-  MatrixCoord() { }
-
-  /// Constructs from Coord<2>
-  CUTLASS_HOST_DEVICE
-  MatrixCoord(Coord<2, Index> const &coord): Base(coord) { }
-
-  /// Helper to construct from a row and column
-  CUTLASS_HOST_DEVICE
-  MatrixCoord(Index row, Index column): Base(make_Coord(row, column)) { }
-
-  /// Helper to construct from a row and column, which are LongIndex based
-  CUTLASS_HOST_DEVICE
-  MatrixCoord(LongIndex row, LongIndex column): Base(make_Coord(Index(row), Index(column))) { }
-
-  /// Returns the row of the coordinate
-  CUTLASS_HOST_DEVICE
-  Index const & row() const { return this->at(kRow); }
-
-  /// Returns the row of the coordinate
-  CUTLASS_HOST_DEVICE
-  Index & row() { return this->at(kRow); }
-
-  /// Returns the column of the coordinate
-  CUTLASS_HOST_DEVICE
-  Index const & column() const { return this->at(kColumn); }
-
-  /// Returns the column of the coordinate
-  CUTLASS_HOST_DEVICE
-  Index & column() { return this->at(kColumn); }
-
-  //
-  // Coord operators
-  //
-
-  /// Element-wise addition
-  CUTLASS_HOST_DEVICE
-  MatrixCoord operator+(Base const& b) const {
-    return MatrixCoord(Base::operator+(b));
-  }
-
-  /// Element-wise subtraction
-  CUTLASS_HOST_DEVICE
-  MatrixCoord operator-(Base const& b) const {
-    return MatrixCoord(Base::operator-(b));
-  }
-
-  /// Element-wise multiplication
-  CUTLASS_HOST_DEVICE
-  MatrixCoord operator*(Base const& b) const {
-    return MatrixCoord(Base::operator*(b));
-  }
-
-  /// Element-wise division
-  CUTLASS_HOST_DEVICE
-  MatrixCoord operator/(Base const& b) const {
-    return MatrixCoord(Base::operator/(b));
-  }
-
-  /// In-place addition
-  CUTLASS_HOST_DEVICE
-  MatrixCoord& operator+=(Base const& b) {
-    Base::operator+=(b);
-    return *this;
-  }
-
-  /// In-place subtraction
-  CUTLASS_HOST_DEVICE
-  MatrixCoord& operator-=(Base const& b) {
-    Base::operator-=(b);
-    return *this;
-  }
-
-  /// In-place multiplication
-  CUTLASS_HOST_DEVICE
-  MatrixCoord& operator*=(Base const& b) {
-    Base::operator*=(b);
-    return *this;
-  }
-
-  /// In-place division
-  CUTLASS_HOST_DEVICE
-  MatrixCoord& operator/=(Base const& b) {
-    Base::operator/=(b);
-    return *this;
-  }
-
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/matrix_shape.h b/lightllm-kernel/cutlass/include/cutlass/matrix_shape.h
deleted file mode 100755
index 66623a431..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/matrix_shape.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines a Shape template for matrix tiles
-*/
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/coord.h"
-
-namespace cutlass {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Describes the size of a matrix tile
-template <
-  int Row_,     ///< rows of a matrix
-  int Column_      ///< columns of a matrix
->
-struct MatrixShape {
-  static int const kRow = Row_;           ///< rows of a matrix
-  static int const kColumn = Column_;           ///< columns of a matrix
-  static int const kCount = Row_ * Column_;  ///< total number of elements in a matrix
-
-  //
-  // Static member functions
-  //
-
-  CUTLASS_HOST_DEVICE
-  static Coord<2> toCoord() {
-    return make_Coord(kRow, kColumn);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/numeric_conversion.h b/lightllm-kernel/cutlass/include/cutlass/numeric_conversion.h
deleted file mode 100755
index 17c1ac14d..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/numeric_conversion.h
+++ /dev/null
@@ -1,4547 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*!
-    \file
-    \brief Boost-like numeric conversion operator for CUTLASS numeric types
-*/
-
-#pragma once
-
-#if !defined(__CUDACC_RTC__)
-#include <cfenv>
-#endif
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/transform/thread/unary_op.h"
-
-#include "cutlass/array.h"
-#include "cutlass/half.h"
-#include "cutlass/bfloat16.h"
-
-namespace cutlass {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Floating-point rounding style similare to Standard Library's formats but supporting
-/// additional rounding options.
-enum class FloatRoundStyle {
-  round_indeterminate,          ///< rounding mode unknown
-  round_toward_zero,            ///< round toward zero
-  round_to_nearest,             ///< round to nearest even
-  round_to_nearest_satfinite,   ///< round to nearest even, capping value to min and max of destination type
-  round_toward_infinity,        ///< round toward infinity
-  round_toward_neg_infinity,    ///< round toward negative infinity
-  round_half_ulp_truncate,      ///< add 0.5ulp to integer representation then round toward zero
-  round_half_ulp_trunc_dntz     ///< like round_half_ulp_truncate, except denorms are rounded *toward* zero
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename T,
-  typename S,
-  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
->
-struct NumericConverter {
-
-  using result_type = T;
-  using source_type = S;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & s) {
-
-    return static_cast<result_type>(s);
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Partial specializations for float => int32_t
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-#if defined(__CUDA_ARCH__)
-template <>
-struct NumericConverter<int32_t, float, FloatRoundStyle::round_to_nearest> {
-
-  using result_type = int32_t;
-  using source_type = float;
-  static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const & s) {
-    return __float2int_rn(s);
-  }
-
-  CUTLASS_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-template <>
-struct NumericConverter<int32_t, float, FloatRoundStyle::round_toward_zero> {
-
-  using result_type = int32_t;
-  using source_type = float;
-  static FloatRoundStyle const round_style = FloatRoundStyle::round_toward_zero;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const & s) {
-
-    return __float2int_rz(s);
-  }
-
-  CUTLASS_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-#elif !defined(__CUDACC_RTC__)
-
-template <>
-struct NumericConverter<int32_t, float, FloatRoundStyle::round_to_nearest> {
-
-  using result_type = int32_t;
-  using source_type = float;
-  static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest;
-
-  static result_type convert(source_type const & s) {
-    std::fesetround(FE_TONEAREST);
-    return (result_type)std::nearbyint(s);
-  }
-
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-template <>
-struct NumericConverter<int32_t, float, FloatRoundStyle::round_toward_zero> {
-
-  using result_type = int32_t;
-  using source_type = float;
-  static FloatRoundStyle const round_style = FloatRoundStyle::round_toward_zero;
-
-  static result_type convert(source_type const & s) {
-    std::fesetround(FE_TOWARDZERO);
-    return (result_type)std::nearbyint(s);
-  }
-
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-#endif
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Partial specializations for float => int8_t
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-#if defined(__CUDA_ARCH__)
-template <>
-struct NumericConverter<int8_t, float, FloatRoundStyle::round_to_nearest> {
-
-  using result_type = int8_t;
-  using source_type = float;
-  static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const & s) {
-
-    int32_t intermediate;
-    asm volatile("cvt.rni.sat.s8.f32 %0, %1;" : "=r"(intermediate) : "f"(s));
-
-    return static_cast<result_type>(intermediate);
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-template <>
-struct NumericConverter<int8_t, float, FloatRoundStyle::round_toward_zero> {
-
-  using result_type = int8_t;
-  using source_type = float;
-  static FloatRoundStyle const round_style =  FloatRoundStyle::round_toward_zero;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const & s) {
-
-    int32_t intermediate;
-    asm volatile("cvt.rzi.sat.s8.f32 %0, %1;" : "=r"(intermediate) : "f"(s));
-
-    return static_cast<result_type>(intermediate);
-  }
-
-  CUTLASS_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-template <>
-struct NumericConverter<uint8_t, float, FloatRoundStyle::round_to_nearest> {
-
-  using result_type = uint8_t;
-  using source_type = float;
-  static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const & s) {
-
-    int32_t intermediate;
-    asm volatile("cvt.rni.sat.u8.f32 %0, %1;" : "=r"(intermediate) : "f"(s));
-
-    return static_cast<result_type>(intermediate);
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-template <>
-struct NumericConverter<uint8_t, float, FloatRoundStyle::round_toward_zero> {
-
-  using result_type = uint8_t;
-  using source_type = float;
-  static FloatRoundStyle const round_style =  FloatRoundStyle::round_toward_zero;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const & s) {
-
-    int32_t intermediate;
-    asm volatile("cvt.rzi.sat.u8.f32 %0, %1;" : "=r"(intermediate) : "f"(s));
-
-    return static_cast<result_type>(intermediate);
-  }
-
-  CUTLASS_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-#elif !defined(__CUDACC_RTC__)
-
-template <>
-struct NumericConverter<int8_t, float, FloatRoundStyle::round_to_nearest> {
-
-  using result_type = int8_t;
-  using source_type = float;
-  static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest;
-
-  static result_type convert(source_type const & s) {
-    std::fesetround(FE_TONEAREST);
-    int32_t intermediate = (int32_t)std::nearbyint(s);
-
-    // Low-end saturation
-    intermediate = std::max(intermediate, (int32_t)std::numeric_limits<int8_t>::lowest());
-
-    // High-end saturation
-    intermediate = std::min(intermediate, (int32_t)std::numeric_limits<int8_t>::max());
-
-    return static_cast<result_type>(intermediate);
-  }
-
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-template <>
-struct NumericConverter<int8_t, float, FloatRoundStyle::round_toward_zero> {
-
-  using result_type = int8_t;
-  using source_type = float;
-  static FloatRoundStyle const round_style =  FloatRoundStyle::round_toward_zero;
-
-  static result_type convert(source_type const & s) {
-    std::fesetround(FE_TOWARDZERO);
-    int32_t intermediate = (int32_t)std::nearbyint(s);
-
-    // Low-end saturation
-    intermediate = std::max(intermediate, (int32_t)std::numeric_limits<int8_t>::lowest());
-
-    // High-end saturation
-    intermediate = std::min(intermediate, (int32_t)std::numeric_limits<int8_t>::max());
-
-    return static_cast<result_type>(intermediate);
-  }
-
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-template <>
-struct NumericConverter<uint8_t, float, FloatRoundStyle::round_to_nearest> {
-
-  using result_type = uint8_t;
-  using source_type = float;
-  static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest;
-
-  static result_type convert(source_type const & s) {
-    std::fesetround(FE_TONEAREST);
-    int32_t intermediate = (int32_t)std::nearbyint(s);
-
-    // Low-end saturation
-    intermediate = std::max(intermediate, (int32_t)std::numeric_limits<uint8_t>::lowest());
-
-    // High-end saturation
-    intermediate = std::min(intermediate, (int32_t)std::numeric_limits<uint8_t>::max());
-
-    return static_cast<result_type>(intermediate);
-  }
-
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-template <>
-struct NumericConverter<uint8_t, float, FloatRoundStyle::round_toward_zero> {
-
-  using result_type = uint8_t;
-  using source_type = float;
-  static FloatRoundStyle const round_style =  FloatRoundStyle::round_toward_zero;
-
-  static result_type convert(source_type const & s) {
-    std::fesetround(FE_TOWARDZERO);
-    int32_t intermediate = (int32_t)std::nearbyint(s);
-
-    // Low-end saturation
-    intermediate = std::max(intermediate, (int32_t)std::numeric_limits<uint8_t>::lowest());
-
-    // High-end saturation
-    intermediate = std::min(intermediate, (int32_t)std::numeric_limits<uint8_t>::max());
-
-    return static_cast<result_type>(intermediate);
-  }
-
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-#endif
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Partial specializations for float => integer_subbyte
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template<int Bits, FloatRoundStyle Round>
-struct NumericConverter<integer_subbyte<Bits, /* Signed = */ true>, float, Round> {
-private:
-  static constexpr bool result_is_signed = true;
-
-public:
-  using result_type = integer_subbyte<Bits, result_is_signed>;
-  using source_type = float;
-  static constexpr FloatRoundStyle round_style = Round;
-
-  CUTLASS_HOST_DEVICE static result_type
-  convert(source_type const& src) {
-    using middle_type = int;
-    static_assert(8 * sizeof(middle_type) > Bits, "This conversion "
-      "requires that integer_subbyte have fewer representation bits "
-      "than the number of bits in int.");
-
-    auto middle = NumericConverter<middle_type, source_type, Round>::convert(src);
-    return NumericConverter<result_type, middle_type, Round>::convert(middle);
-  }
-
-  CUTLASS_HOST_DEVICE result_type
-  operator()(source_type const& s) const {
-    return convert(s);
-  }
-};
-
-template<int Bits, FloatRoundStyle Round>
-struct NumericConverter<integer_subbyte<Bits, /* Signed = */ false>, float, Round> {
-private:
-  static constexpr bool result_is_signed = false;
-
-public:
-  using result_type = integer_subbyte<Bits, result_is_signed>;
-  using source_type = float;
-  static constexpr FloatRoundStyle round_style = Round;
-
-  CUTLASS_HOST_DEVICE static result_type
-  convert(source_type const& src) {
-    using middle_type = unsigned;
-    static_assert(8 * sizeof(middle_type) > Bits, "This conversion "
-      "requires that integer_subbyte have fewer representation bits "
-      "than the number of bits in unsigned int.");
-
-    auto middle = NumericConverter<middle_type, source_type, Round>::convert(src);
-    return NumericConverter<result_type, middle_type, Round>::convert(middle);
-  }
-
-  CUTLASS_HOST_DEVICE result_type  
-  operator()(source_type const& s) const {
-    return convert(s);
-  }
-};
-  
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for float <= cutlass::half_t
-template <typename T, FloatRoundStyle Round>
-struct NumericConverter<T, T, Round> {
-
-  using result_type = T;
-  using source_type = T;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & s) {
-
-    return s;
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Partial specializations for float <=> cutlass::half_t
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for float <= cutlass::half_t
-template <FloatRoundStyle Round>
-struct NumericConverter<float, cutlass::half_t, Round> {
-
-  using result_type = float;
-  using source_type = cutlass::half_t;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & s) {
-
-    result_type result = static_cast<float>(s);
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Specialization for round-to-nearest
-template <>
-struct NumericConverter<cutlass::half_t, float, FloatRoundStyle::round_to_nearest> {
-
-  using result_type = cutlass::half_t;
-  using source_type = float;
-  static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest;
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & s) {
-
-    result_type result = static_cast<cutlass::half_t>(s);
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Specialization for round-toward-zero
-template <>
-struct NumericConverter<cutlass::half_t, float, FloatRoundStyle::round_toward_zero> {
-
-  using result_type = cutlass::half_t;
-  using source_type = float;
-  static FloatRoundStyle const round_style = FloatRoundStyle::round_toward_zero;
-
-  /// Round toward zero
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & flt) {
-
-  #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-    return cutlass::half_t(__float2half_rz(flt));
-  #else
-    // software implementation rounds toward nearest even
-    unsigned const& s = reinterpret_cast<unsigned const &>(flt);
-    uint16_t sign = uint16_t((s >> 16) & 0x8000);
-    int32_t exp = int32_t((s >> 23) & 0xff) - 127;
-    int mantissa = s & 0x7fffff;
-    uint16_t u = 0;
-
-    if ((s & 0x7fffffff) == 0) {
-      // sign-preserving zero
-      return cutlass::half_t::bitcast(sign);
-    }
-
-    if (exp > 15) {
-      if (exp == 128 && mantissa) {
-        // not a number
-        u = 0x7fff;
-      } else {
-        // overflow to infinity
-        u = sign | 0x7c00;
-      }
-      return cutlass::half_t::bitcast(u);
-    }
-
-    if (exp >= -14) {
-      // normal fp32 to normal fp16
-      u = uint16_t((uint32_t(exp + 15) & 0x1f) << 10);
-      u = uint16_t(u | (mantissa >> 13));
-    } else {
-      // normal single-precision to subnormal cutlass::half_t-precision representation
-      int rshift = (-14 - exp);
-      if (rshift < 32) {
-        mantissa |= (1 << 23);
-        mantissa = (mantissa >> rshift);
-        u = (uint16_t(mantissa >> 13) & 0x3ff);
-      } else {
-        mantissa = 0;
-        u = 0;
-      }
-    }
-
-    u |= sign;
-
-    return cutlass::half_t::bitcast(u);
-
-  #endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Partial specializations for float <=> cutlass::bfloat16_t
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for float <= cutlass::bfloat16_t
-template <FloatRoundStyle Round>
-struct NumericConverter<float, cutlass::bfloat16_t, Round> {
-
-  using result_type = float;
-  using source_type = cutlass::bfloat16_t;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & s) {
-
-    return static_cast<float>(s);
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-template <>
-struct NumericConverter<cutlass::bfloat16_t, float, FloatRoundStyle::round_to_nearest> {
-  using result_type = cutlass::bfloat16_t;
-  using source_type = float;
-  static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest;
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & s) {
-    return static_cast<cutlass::bfloat16_t>(s);
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-template <>
-struct NumericConverter<cutlass::bfloat16_t, float, FloatRoundStyle::round_half_ulp_truncate> {
-  using result_type = cutlass::bfloat16_t;
-  using source_type = float;
-  static FloatRoundStyle const round_style = FloatRoundStyle::round_half_ulp_truncate;
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & s) {
-    uint32_t x32 = reinterpret_cast<uint32_t const &>(s);
-
-    #if defined(__CUDA_ARCH__)
-    if (::isfinite(s)) {
-      x32 += 0x8000;
-    }
-    #else
-    if (std::isfinite(s)) {
-      x32 += 0x8000;
-    }
-    #endif
-
-    uint16_t x16 = uint16_t((x32 >> 16) & 0xffff);
-    return cutlass::bfloat16_t::bitcast(x16);
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-template <>
-struct NumericConverter<cutlass::bfloat16_t, float, FloatRoundStyle::round_toward_zero> {
-  using result_type = cutlass::bfloat16_t;
-  using source_type = float;
-  static FloatRoundStyle const round_style = FloatRoundStyle::round_toward_zero;
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & s) {
-
-    uint32_t x32 = reinterpret_cast<uint32_t const &>(s);
-    uint16_t x16 = uint16_t(x32 >> 16);
-
-    return cutlass::bfloat16_t::bitcast(x16);
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Partial specializations for float <=> cutlass::tfloat32_t
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for float <= cutlass::tfloat32_t
-template <FloatRoundStyle Round>
-struct NumericConverter<float, cutlass::tfloat32_t, Round> {
-
-  using result_type = float;
-  using source_type = cutlass::tfloat32_t;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & s) {
-
-    return static_cast<float>(s);
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-template <>
-struct NumericConverter<cutlass::tfloat32_t, float, FloatRoundStyle::round_to_nearest> {
-  using result_type = cutlass::tfloat32_t;
-  using source_type = float;
-  static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest;
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & s) {
-
-    unsigned storage = reinterpret_cast<unsigned const &>(s);
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
-    asm volatile("cvt.rn.tf32.f32 %0, %1;" : "=r"(storage) : "r"(storage));
-#else
-    if ((storage & 0x7f800000) != 0x7f800000) {
-
-      bool mantissa_bit = ((storage & (1 << 13)) != 0);
-      bool round_bit = ((storage & (1 << 12)) != 0);
-      bool sticky_bit = ((storage & ((1 << 12) - 1)) != 0);
-
-      if ((round_bit && sticky_bit) || (round_bit && mantissa_bit)) {
-        storage += uint32_t(1 << 13);
-      }
-
-      // Note, the following is intentionally commented out. TF32
-      // does not define the low order bits, so they may be left in
-      // an undefined state.
-      //
-      // By not truncating these bit explicitly, we avoid an extra logical
-      // operation.
-      //
-      // TF32 may be implicitly converted to float by performing this
-      // operation as needed.
-      //
-      // storage = (storage & ~0x1fff);
-    }
-    else if (storage & ~0xff800000) {
-      storage = 0x7fffffff;
-    }
-#endif
-
-    return cutlass::tfloat32_t::bitcast(storage);
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-template <>
-struct NumericConverter<cutlass::tfloat32_t, float, FloatRoundStyle::round_half_ulp_truncate> {
-  using result_type = cutlass::tfloat32_t;
-  using source_type = float;
-  static FloatRoundStyle const round_style = FloatRoundStyle::round_half_ulp_truncate;
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & s) {
-    return cutlass::tfloat32_t::round_half_ulp_truncate(s);
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// This rounding operation is similar to half_ulp_truncate except it rounds denorms toward zero.
-/// It avoids predicated code, though it requires a temporary register.
-template <>
-struct NumericConverter<cutlass::tfloat32_t, float, FloatRoundStyle::round_half_ulp_trunc_dntz> {
-  using result_type = cutlass::tfloat32_t;
-  using source_type = float;
-  static FloatRoundStyle const round_style = FloatRoundStyle::round_half_ulp_trunc_dntz;
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & s) {
-
-    unsigned y = reinterpret_cast<unsigned const &>(s);
-    y = y & 0xff800000;
-    float d = reinterpret_cast<float const &>(y);
-    float z = d / float(1 << 11) + s;
-
-    return reinterpret_cast<result_type const &>(z);
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-template <>
-struct NumericConverter<cutlass::tfloat32_t, float, FloatRoundStyle::round_toward_zero> {
-  using result_type = cutlass::tfloat32_t;
-  using source_type = float;
-  static FloatRoundStyle const round_style = FloatRoundStyle::round_toward_zero;
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & s) {
-    uint32_t x = reinterpret_cast<uint32_t const &>(s);
-    return cutlass::tfloat32_t::bitcast(x & 0xffffe000);
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Conversion operator for float to cutlass::tfloat32_t big and small values
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-template <
-  FloatRoundStyle RoundBig = FloatRoundStyle::round_toward_zero,
-  FloatRoundStyle RoundSmall = FloatRoundStyle::round_half_ulp_truncate
->
-struct NumericConverterFastF32 {
-
-  // result_type holds big cutlass::tfloat32_t at idx(0) and small cutlass::tfloat32_t at idx(1)
-  using result_type = Array<cutlass::tfloat32_t, 2>;
-
-  // source data type
-  using source_type = float;
-
-  // rounding styles for big and small part
-  static FloatRoundStyle const kRoundBig = RoundBig;
-  static FloatRoundStyle const kRoundSmall = RoundSmall;
-
-  CUTLASS_HOST_DEVICE
-    static result_type convert(source_type const & source) {
-
-    result_type result;
-    NumericConverter<cutlass::tfloat32_t, float, kRoundBig> convert_big_;
-    NumericConverter<cutlass::tfloat32_t, float, kRoundSmall> convert_small_;
-
-    // convert and fill cutlass::tfloat32_t big at idx 0
-    result[0] = convert_big_(source);
-
-    // convert and fill cutlass::tfloat32_t small at idx 1
-    result[1] = convert_small_(source - static_cast<float>(result[0]));
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-    result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Conversion and Clamp operator for Integers
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename T,
-  typename S
->
-struct NumericConverterClamp {
-
-  using result_type = T;
-  using source_type = S;
-
-  CUTLASS_HOST_DEVICE
-    static result_type convert(source_type const & s) {
-    NumericConverter<result_type, source_type> convert_op;
-    result_type const kClamp_max = cutlass::platform::numeric_limits<result_type>::max();
-    result_type const kClamp_min = cutlass::platform::numeric_limits<result_type>::lowest();
-    if (s < (source_type)kClamp_min)
-      return kClamp_min;
-    if (s > (source_type)kClamp_max)
-      return kClamp_max;
-    return convert_op(s);
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-// This converter is needed to enable cutlass::half_t output types when using int32_t accumulators.
-// Since floating-point types do not require a clamp, this converter simply casts from
-// the source type to cutlass::half_t.
-template <
-  typename S
->
-struct NumericConverterClamp<cutlass::half_t, S> {
-
-  using result_type = cutlass::half_t;
-  using source_type = S;
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const &source) {
-    return static_cast<cutlass::half_t>(source);
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Conversion operator for Array
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Conversion operator for Array
-template <
-  typename T,
-  typename S,
-  int N,
-  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest,
-  typename Transform = cutlass::transform::thread::UnaryTransform::Identity
->
-struct NumericArrayConverter {
-
-  using result_type = Array<T, N>;
-  using source_type = Array<S, N>;
-  static FloatRoundStyle const round_style = Round;
-
-  static_assert(platform::is_same<Transform, cutlass::transform::thread::UnaryTransform::Identity>::value ||
-                platform::is_same<Transform, cutlass::transform::thread::UnaryTransform::Conjugate>::value,
-                  "Unary Operator not supported.");
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & s) {
-
-    result_type result;
-    NumericConverter<T, S, Round> convert_;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      if (platform::is_same<Transform, cutlass::transform::thread::UnaryTransform::Identity>::value) {
-        result[i] = convert_(s[i]);
-      } else { // conjugate
-        result[i] = conj(convert_(s[i]));
-      }
-    }
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-template <
-  typename T,
-  int N,
-  FloatRoundStyle Round,
-  typename Transform
->
-struct NumericArrayConverter<T, T, N, Round, Transform> {
-
-  using result_type = Array<T, N>;
-  using source_type = Array<T, N>;
-  static FloatRoundStyle const round_style = Round;
-
-  static_assert(platform::is_same<Transform, cutlass::transform::thread::UnaryTransform::Identity>::value ||
-                platform::is_same<Transform, cutlass::transform::thread::UnaryTransform::Conjugate>::value,
-                  "Unary Operator not supported.");
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const &source) {
-    if (platform::is_same<Transform, cutlass::transform::thread::UnaryTransform::Identity>::value) {
-      return source;
-    } else {
-      result_type result;
-      for (int i = 0; i < N; ++i) {
-        result[i] = conj(static_cast<typename source_type::Element>(source[i]));
-      }
-      return result;
-    }
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Array<half, 2> <= Array<float, 2>, round to nearest
-template <>
-struct NumericArrayConverter<cutlass::half_t, float, 2, FloatRoundStyle::round_to_nearest> {
-
-  using result_type = Array<cutlass::half_t, 2>;
-  using source_type = Array<float, 2>;
-  static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest;
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & source) {
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-      Array<cutlass::half_t, 2> result;
-      reinterpret_cast<__half2 &>(result) = __float22half2_rn(reinterpret_cast<float2 const &>(source));
-      return result;
-    #else
-      NumericConverter<cutlass::half_t, float, round_style> convert_;
-      // NOTE: cutlass::Array<half, N> is NOT an aggregate type and
-      //  below `{}` does NOT conduct zero initialization. Below `{}` will 
-      //  conduct default initialization (calling default ctr). We use this syntax
-      //  to resolve compiler warning on uninitialized member variable.
-      Array<cutlass::half_t, 2> result{};
-      result[0] = convert_(source[0]);
-      result[1] = convert_(source[1]);
-      return result;
-    #endif
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Partial specialization for Array<float, 2> <= Array<cutlass::half_t, 2>, round to nearest
-template <FloatRoundStyle Round>
-struct NumericArrayConverter<float, cutlass::half_t, 2, Round> {
-
-  using result_type = Array<float, 2>;
-  using source_type = Array<cutlass::half_t, 2>;
-  static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest;
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & source) {
-
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
-      float2 result2 = __half22float2(reinterpret_cast<__half2 const &>(source));
-      return {
-        float{result2.x},
-        float{result2.y}
-      };
-    #else
-      NumericConverter<float, cutlass::half_t, round_style> convert_;
-      return {
-        convert_(source[0]),
-        convert_(source[1])
-      };
-    #endif
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Array<half> <= Array<float>
-template <
-  int N,
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<cutlass::half_t, float, N, Round> {
-
-  using result_type = Array<cutlass::half_t, N>;
-  using source_type = Array<float, N>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & source) {
-
-    NumericArrayConverter<cutlass::half_t, float, 2, Round> convert_vector_;
-    NumericConverter<cutlass::half_t, float, Round> convert_element_;
-
-    result_type result;
-
-    Array<cutlass::half_t, 2> *result_ptr = reinterpret_cast<Array<cutlass::half_t, 2> *>(&result);
-    Array<float, 2> const *source_ptr = reinterpret_cast<Array<float, 2> const *>(&source);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 2; ++i) {
-      result_ptr[i] = convert_vector_(source_ptr[i]);
-    }
-
-    if (N % 2) {
-      result[N - 1] = convert_element_(source[N - 1]);
-    }
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-
-/// Partial specialization for Array<half> <= Array<float>
-template <
-  int N,
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<float, cutlass::half_t, N, Round> {
-
-  using result_type = Array<float, N>;
-  using source_type = Array<cutlass::half_t, N>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & source) {
-
-    NumericArrayConverter<float, cutlass::half_t, 2, Round> convert_vector_;
-    NumericConverter<float, cutlass::half_t, Round> convert_element_;
-
-    result_type result;
-
-    Array<float, 2> *result_ptr = reinterpret_cast<Array<float, 2> *>(&result);
-    Array<cutlass::half_t, 2> const *source_ptr = reinterpret_cast<Array<cutlass::half_t, 2> const *>(&source);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 2; ++i) {
-      result_ptr[i] = convert_vector_(source_ptr[i]);
-    }
-
-    if (N % 2) {
-      result[N - 1] = convert_element_(source[N - 1]);
-    }
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Array<cutlass::bfloat16_t, 2> <= Array<float, 2>, round to nearest
-template <>
-struct NumericArrayConverter<cutlass::bfloat16_t, float, 2, FloatRoundStyle::round_to_nearest> {
-
-  using result_type = Array<cutlass::bfloat16_t, 2>;
-  using source_type = Array<float, 2>;
-  static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest;
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & source) {
-
-    unsigned d;
-
-    asm("cvt.rn.bf16x2.f32 %0, %1, %2;\n" : "=r"(d) : "f"(source[1]), "f"(source[0]) );
-
-    return reinterpret_cast<result_type const &>(d);
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Partial specialization for Array<cutlass::bfloat16_t> <= Array<float>
-template <
-  int N,
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<cutlass::bfloat16_t, float, N, Round> {
-
-  using result_type = Array<cutlass::bfloat16_t, N>;
-  using source_type = Array<float, N>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & source) {
-
-    NumericArrayConverter<cutlass::bfloat16_t, float, 2, Round> convert_vector_;
-    NumericConverter<cutlass::bfloat16_t, float, Round> convert_element_;
-
-    result_type result;
-
-    Array<cutlass::bfloat16_t, 2> *result_ptr = reinterpret_cast<Array<cutlass::bfloat16_t, 2> *>(&result);
-    Array<float, 2> const *source_ptr = reinterpret_cast<Array<float, 2> const *>(&source);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 2; ++i) {
-      result_ptr[i] = convert_vector_(source_ptr[i]);
-    }
-
-    if (N % 2) {
-      result[N - 1] = convert_element_(source[N - 1]);
-    }
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-#endif // if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Conditional guards to enable partial specialization for packed integers
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 720) && \
-    ((__CUDACC_VER_MAJOR__ > 10) ||                     \
-     ((__CUDACC_VER_MAJOR__ >= 10) && (__CUDACC_VER_MINOR__ >= 2)))
-
-/// Partial specialization for Array<int8_t, 1> <= Array<int, 1>
-template <
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<int8_t, int, 1, Round> {
-
-  using result_type = Array<int8_t, 1>;
-  using source_type = Array<int, 1>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & source) {
-    NumericConverter<int8_t, int, Round> convert_element_;
-
-    result_type result;
-
-    result[0] = convert_element_(source[0]);
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Partial specialization for Array<int8_t, 2> <= Array<int, 2>
-template <
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<int8_t, int, 2, Round> {
-
-  using result_type = Array<int8_t, 2>;
-  using source_type = Array<int, 2>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & source) {
-
-    uint32_t tmp;
-
-    asm volatile(
-      "cvt.pack.sat.s8.s32.b32   %0, %2, %1, 0;\n"
-      : "=r"(tmp) : "r"(source[0]), "r"(source[1]));
-
-    uint16_t out = (tmp & 0xffff);
-    return reinterpret_cast<result_type const &>(out);
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Partial specialization for Array<int8_t, 4> <= Array<int, 4>
-template <
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<int8_t, int, 4, Round> {
-
-  using result_type = Array<int8_t, 4>;
-  using source_type = Array<int, 4>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & source) {
-
-    unsigned out;
-
-    asm volatile(
-      "{ .reg .u32 r4;"
-      "cvt.pack.sat.s8.s32.b32   r4, %4, %3, 0;"
-      "cvt.pack.sat.s8.s32.b32   %0, %2, %1, r4;"
-      "}"
-      : "=r"(out) : "r"(source[0]), "r"(source[1]), "r"(source[2]), "r"(source[3]));
-
-    return reinterpret_cast<result_type const &>(out);
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Partial specialization for Array<int8_t> <= Array<int>
-template <
-  int N,
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<int8_t, int, N, Round> {
-  static_assert(!(N % 4), "N must be multiple of 4.");
-
-  using result_type = Array<int8_t, N>;
-  using source_type = Array<int, N>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & source) {
-
-    NumericArrayConverter<int8_t, int, 4, Round> convert_vector_;
-
-    result_type result;
-
-    Array<int8_t, 4> *result_ptr = reinterpret_cast<Array<int8_t, 4> *>(&result);
-    Array<int, 4> const *source_ptr = reinterpret_cast<Array<int, 4> const *>(&source);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 4; ++i) {
-      result_ptr[i] = convert_vector_(source_ptr[i]);
-    }
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Partial specialization for Array<uint8_t, 1> <= Array<int, 1>
-template <
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<uint8_t, int, 1, Round> {
-
-  using result_type = Array<uint8_t, 1>;
-  using source_type = Array<int, 1>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & source) {
-    NumericConverter<uint8_t, int, Round> convert_element_;
-
-    result_type result;
-
-    result[0] = convert_element_(source[0]);
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Partial specialization for Array<uint8_t, 2> <= Array<int, 2>
-template <
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<uint8_t, int, 2, Round> {
-
-  using result_type = Array<uint8_t, 2>;
-  using source_type = Array<int, 2>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & source) {
-
-    uint32_t tmp;
-
-    asm volatile(
-      "cvt.pack.sat.u8.s32.b32   %0, %2, %1, 0;\n"
-      : "=r"(tmp) : "r"(source[0]), "r"(source[1]));
-
-    uint16_t out = (tmp & 0xffff);
-    return reinterpret_cast<result_type const &>(out);
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Partial specialization for Array<uint8_t, 4> <= Array<int, 4>
-template <
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<uint8_t, int, 4, Round> {
-
-  using result_type = Array<uint8_t, 4>;
-  using source_type = Array<int, 4>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & source) {
-
-    unsigned out;
-
-    asm volatile(
-      "{ .reg .u32 r4;"
-      "cvt.pack.sat.u8.s32.b32   r4, %4, %3, 0;"
-      "cvt.pack.sat.u8.s32.b32   %0, %2, %1, r4;"
-      "}"
-      : "=r"(out) : "r"(source[0]), "r"(source[1]), "r"(source[2]), "r"(source[3]));
-
-    return reinterpret_cast<result_type const &>(out);
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Partial specialization for Array<int8_t> <= Array<int>
-template <
-  int N,
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<uint8_t, int, N, Round> {
-  static_assert(!(N % 4), "N must be multiple of 4.");
-
-  using result_type = Array<uint8_t, N>;
-  using source_type = Array<int, N>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & source) {
-
-    NumericArrayConverter<uint8_t, int, 4, Round> convert_vector_;
-
-    result_type result;
-
-    Array<uint8_t, 4> *result_ptr = reinterpret_cast<Array<uint8_t, 4> *>(&result);
-    Array<int, 4> const *source_ptr = reinterpret_cast<Array<int, 4> const *>(&source);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 4; ++i) {
-      result_ptr[i] = convert_vector_(source_ptr[i]);
-    }
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-#endif
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Partial specializations for Array<float, N> <=> Array<float_e4m3_t, N>
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Array<float, 2> <= Array<float_e4m3_t, 2>
-template <
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<float, cutlass::float_e4m3_t, 2, Round> {
-  using result_element = float;
-  using source_element = cutlass::float_e4m3_t;
-
-  using result_type = Array<result_element, 2>;
-  using source_type = Array<source_element, 2>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const & source) {
-
-  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
-    uint32_t out_fp16;
-    uint16_t const& src_packed = reinterpret_cast<uint16_t const&>(source);
-
-    asm volatile( \
-        "{\n" \
-        "cvt.rn.f16x2.e4m3x2 %0, %1;\n" \
-        "}\n" : "=r"(out_fp16): "h"(src_packed));
-
-    float2 res0 = __half22float2(reinterpret_cast<__half2 &>(out_fp16));
-
-    result_type out;
-    out[0] = res0.x;
-    out[1] = res0.y;
-    return out;
-  #else
-    result_type result;
-    NumericConverter<result_element, source_element, Round> converter;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 2; ++i) {
-      result[i] = converter(source[i]);
-    }
-
-    return result;
-  #endif
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Partial specialization for Array<float_e4m3_t, 2> <= Array<float, 2>
-template <
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<float_e4m3_t, float, 2, Round> {
-  using result_element = cutlass::float_e4m3_t;
-  using source_element = float;
-
-  using result_type = Array<result_element, 2>;
-  using source_type = Array<source_element, 2>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const & source) {
-
-  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
-    uint16_t out;
-
-    asm volatile( \
-        "{\n" \
-        "cvt.rn.satfinite.e4m3x2.f32   %0, %2, %1;\n" \
-        "}" \
-        : "=h"(out) : "f"(source[0]), "f"(source[1]));
-
-    return reinterpret_cast<result_type const &>(out);
-  #else
-    result_type result;
-    NumericConverter<result_element, source_element, Round> converter;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 2; ++i) {
-      result[i] = converter(source[i]);
-    }
-
-    return result;
-  #endif
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Partial specialization for Array<float, 2> <= Array<float_e5m2_t, 2>
-template <
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<float, cutlass::float_e5m2_t, 2, Round> {
-  using result_element = float;
-  using source_element = cutlass::float_e5m2_t;
-
-  using result_type = Array<result_element, 2>;
-  using source_type = Array<source_element, 2>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const & source) {
-
-  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
-    uint32_t out_fp16;
-    uint16_t const& src_packed = reinterpret_cast<uint16_t const&>(source);
-
-    asm volatile( \
-        "{\n" \
-        "cvt.rn.f16x2.e5m2x2 %0, %1;\n" \
-        "}\n" : "=r"(out_fp16): "h"(src_packed));
-
-    float2 res0 = __half22float2(reinterpret_cast<__half2 &>(out_fp16));
-
-    result_type out;
-    out[0] = res0.x;
-    out[1] = res0.y;
-    return out;
-  #else
-    result_type result;
-    NumericConverter<result_element, source_element, Round> converter;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 2; ++i) {
-      result[i] = converter(source[i]);
-    }
-
-    return result;
-  #endif
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Partial specialization for Array<float_e5m2_t, 2> <= Array<float, 2>
-template <
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<float_e5m2_t, float, 2, Round> {
-  using result_element = cutlass::float_e5m2_t;
-  using source_element = float;
-
-  using result_type = Array<result_element, 2>;
-  using source_type = Array<source_element, 2>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const & source) {
-
-  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
-    uint16_t out;
-
-    asm volatile( \
-        "{\n" \
-        "cvt.rn.satfinite.e5m2x2.f32   %0, %2, %1;\n" \
-        "}" \
-        : "=h"(out) : "f"(source[0]), "f"(source[1]));
-
-    return reinterpret_cast<result_type const &>(out);
-  #else
-    result_type result;
-    NumericConverter<result_element, source_element, Round> converter;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 2; ++i) {
-      result[i] = converter(source[i]);
-    }
-
-    return result;
-  #endif
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Partial specializations for Array<half, N> <=> Array<float_e4m3_t, N>
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Array<half, 2> <= Array<float_e4m3_t, 2>
-template <
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<cutlass::half_t, cutlass::float_e4m3_t, 2, Round> {
-  using result_element = cutlass::half_t;
-  using source_element = cutlass::float_e4m3_t;
-
-  using result_type = Array<result_element, 2>;
-  using source_type = Array<source_element, 2>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const & source) {
-
-  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
-    result_type out;
-    uint32_t& reg = reinterpret_cast<uint32_t&>(out);
-    uint16_t const& src_packed = reinterpret_cast<uint16_t const&>(source);
-
-    asm volatile( \
-        "{\n" \
-        "cvt.rn.f16x2.e4m3x2 %0, %1;\n" \
-        "}\n" : "=r"(reg): "h"(src_packed));
-
-    return out;
-  #else
-    result_type result;
-    NumericConverter<result_element, source_element, Round> converter;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 2; ++i) {
-      result[i] = converter(source[i]);
-    }
-
-    return result;
-  #endif
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Partial specialization for Array<float_e4m3_t, 2> <= Array<half, 2>
-template <
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<float_e4m3_t, cutlass::half_t, 2, Round> {
-  using result_element = cutlass::float_e4m3_t;
-  using source_element = cutlass::half_t;
-
-  using result_type = Array<result_element, 2>;
-  using source_type = Array<source_element, 2>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const & source) {
-
-  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
-    uint16_t out;
-
-    asm volatile( \
-        "{\n" \
-        "cvt.rn.satfinite.e4m3x2.f16x2   %0, %1;\n" \
-        "}" \
-        : "=h"(out) : "r"(reinterpret_cast<uint32_t const&>(source)));
-
-    return reinterpret_cast<result_type const &>(out);
-  #else
-    result_type result;
-    NumericConverter<result_element, source_element, Round> converter;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 2; ++i) {
-      result[i] = converter(source[i]);
-    }
-
-    return result;
-  #endif
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Partial specialization for Array<half, 2> <= Array<float_e5m2_t, 2>
-template <
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<cutlass::half_t, cutlass::float_e5m2_t, 2, Round> {
-  using result_element = cutlass::half_t;
-  using source_element = cutlass::float_e5m2_t;
-
-  using result_type = Array<result_element, 2>;
-  using source_type = Array<source_element, 2>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const & source) {
-
-  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
-    result_type out;
-    uint32_t& reg = reinterpret_cast<uint32_t&>(out);
-    uint16_t const& src_packed = reinterpret_cast<uint16_t const&>(source);
-
-    asm volatile( \
-        "{\n" \
-        "cvt.rn.f16x2.e5m2x2 %0, %1;\n" \
-        "}\n" : "=r"(reg): "h"(src_packed));
-
-    return out;
-  #else
-    result_type result;
-    NumericConverter<result_element, source_element, Round> converter;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 2; ++i) {
-      result[i] = converter(source[i]);
-    }
-
-    return result;
-  #endif
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Partial specialization for Array<float_e5m2_t, 2> <= Array<half, 2>
-template <
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<float_e5m2_t, cutlass::half_t, 2, Round> {
-  using result_element = cutlass::float_e5m2_t;
-  using source_element = cutlass::half_t;
-
-  using result_type = Array<result_element, 2>;
-  using source_type = Array<source_element, 2>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const & source) {
-
-  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
-    uint16_t out;
-
-    asm volatile( \
-        "{\n" \
-        "cvt.rn.satfinite.e5m2x2.f16x2   %0, %1;\n" \
-        "}" \
-        : "=h"(out) : "r"(reinterpret_cast<uint32_t const&>(source)));
-
-    return reinterpret_cast<result_type const &>(out);
-  #else
-    result_type result;
-    NumericConverter<result_element, source_element, Round> converter;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 2; ++i) {
-      result[i] = converter(source[i]);
-    }
-
-    return result;
-  #endif
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Partial specializations for Array<bfloat16_t, N> <=> Array<float_e4m3_t, N>
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Array<bfloat16_t, 2> <= Array<float_e4m3_t, 2>
-template <
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<cutlass::bfloat16_t, cutlass::float_e4m3_t, 2, Round> {
-  using result_element = cutlass::bfloat16_t;
-  using source_element = cutlass::float_e4m3_t;
-
-  using result_type = Array<result_element, 2>;
-  using source_type = Array<source_element, 2>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const & source) {
-
-  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
-    uint32_t res_half;
-    uint16_t const& src_packed = reinterpret_cast<uint16_t const&>(source);
-
-    asm volatile( \
-        "{\n" \
-        "cvt.rn.f16x2.e4m3x2 %0, %1;\n" \
-        "}\n" : "=r"(res_half): "h"(src_packed));
-    float2 res_float = __half22float2(reinterpret_cast<__half2 &>(res_half));
-    NumericArrayConverter<cutlass::bfloat16_t, float, 2, Round> converter;
-    return converter(reinterpret_cast<Array<float, 2> const&>(res_float));
-  #else
-    result_type result;
-    NumericConverter<result_element, source_element, Round> converter;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 2; ++i) {
-      result[i] = converter(source[i]);
-    }
-
-    return result;
-  #endif
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Partial specialization for Array<float_e4m3_t, 2> <= Array<bfloat16_t, 2>
-template <
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<float_e4m3_t, cutlass::bfloat16_t, 2, Round> {
-  using result_element = cutlass::float_e4m3_t;
-  using source_element = cutlass::bfloat16_t;
-
-  using result_type = Array<result_element, 2>;
-  using source_type = Array<source_element, 2>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const & source) {
-
-  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
-    NumericArrayConverter<float, cutlass::bfloat16_t, 2, Round> converter;
-    Array<float, 2> res_float = converter(source);
-    uint16_t out;
-
-    asm volatile( \
-        "{\n" \
-        "cvt.rn.satfinite.e4m3x2.f32   %0, %2, %1;\n" \
-        "}" \
-        : "=h"(out) : "f"(res_float[0]), "f"(res_float[1]));
-
-    return reinterpret_cast<result_type const &>(out);
-  #else
-    result_type result;
-    NumericConverter<result_element, source_element, Round> converter;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 2; ++i) {
-      result[i] = converter(source[i]);
-    }
-
-    return result;
-  #endif
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Partial specialization for Array<bfloat16_t, 2> <= Array<float_e5m2_t, 2>
-template <
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<cutlass::bfloat16_t, cutlass::float_e5m2_t, 2, Round> {
-  using result_element = cutlass::bfloat16_t;
-  using source_element = cutlass::float_e5m2_t;
-
-  using result_type = Array<result_element, 2>;
-  using source_type = Array<source_element, 2>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const & source) {
-
-  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
-    uint32_t res_half;
-    uint16_t const& src_packed = reinterpret_cast<uint16_t const&>(source);
-
-    asm volatile( \
-        "{\n" \
-        "cvt.rn.f16x2.e5m2x2 %0, %1;\n" \
-        "}\n" : "=r"(res_half): "h"(src_packed));
-    float2 res_float = __half22float2(reinterpret_cast<__half2 &>(res_half));
-    NumericArrayConverter<cutlass::bfloat16_t, float, 2, Round> converter;
-    return converter(reinterpret_cast<Array<float, 2> const&>(res_float));
-  #else
-    result_type result;
-    NumericConverter<result_element, source_element, Round> converter;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 2; ++i) {
-      result[i] = converter(source[i]);
-    }
-
-    return result;
-  #endif
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Partial specialization for Array<float_e5m2_t, 2> <= Array<bfloat16_t, 2>
-template <
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<float_e5m2_t, cutlass::bfloat16_t, 2, Round> {
-  using result_element = cutlass::float_e5m2_t;
-  using source_element = cutlass::bfloat16_t;
-
-  using result_type = Array<result_element, 2>;
-  using source_type = Array<source_element, 2>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const & source) {
-
-  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
-    NumericArrayConverter<float, cutlass::bfloat16_t, 2, Round> converter;
-    Array<float, 2> res_float = converter(source);
-    uint16_t out;
-
-    asm volatile( \
-        "{\n" \
-        "cvt.rn.satfinite.e5m2x2.f32   %0, %2, %1;\n" \
-        "}" \
-        : "=h"(out) : "f"(res_float[0]), "f"(res_float[1]));
-
-    return reinterpret_cast<result_type const &>(out);
-  #else
-    result_type result;
-    NumericConverter<result_element, source_element, Round> converter;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 2; ++i) {
-      result[i] = converter(source[i]);
-    }
-
-    return result;
-  #endif
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-namespace detail {
-
-/// Special converters that can be used with 4 8-bit elements packed in a register.
-/// Common use is for fast FP8 converters.
-template <
-  typename T,
-  typename S,
-  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest,
-  typename Transform = cutlass::transform::thread::UnaryTransform::Identity
->
-struct NumericArrayConverterPacked4Element {
-  using result_type = Array<T, 4>;
-  using source_type = Array<S, 4>;
-  static FloatRoundStyle const round_style = Round;
-
-  static_assert(platform::is_same<Transform, cutlass::transform::thread::UnaryTransform::Identity>::value ||
-                platform::is_same<Transform, cutlass::transform::thread::UnaryTransform::Conjugate>::value,
-                  "Unary Operator not supported.");
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & s) {
-
-    result_type result;
-    NumericConverter<T, S, Round> convert_;
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 4; ++i) {
-      if (platform::is_same<Transform, cutlass::transform::thread::UnaryTransform::Identity>::value) {
-        result[i] = convert_(s[i]);
-      }
-      else { // conjugate
-        result[i] = conj(convert_(s[i]));
-      }
-    }
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Partial specialization for Array<float, 4> <= Array<float_e4m3_t, 4>
-template <
-  FloatRoundStyle Round
->
-struct NumericArrayConverterPacked4Element<float, cutlass::float_e4m3_t, Round> {
-  using result_element = float;
-  using source_element = cutlass::float_e4m3_t;
-
-  using result_type = Array<result_element, 4>;
-  using source_type = Array<source_element, 4>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const & source) {
-
-  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
-    uint32_t out_fp16[2];
-    uint32_t const& src_packed = reinterpret_cast<uint32_t const&>(source);
-
-    asm volatile( \
-        "{\n" \
-        ".reg .b16 lo, hi;\n" \
-        "mov.b32 {lo, hi}, %2;\n" \
-        "cvt.rn.f16x2.e4m3x2 %0, lo;\n" \
-        "cvt.rn.f16x2.e4m3x2 %1, hi;\n" \
-        "}\n" : "=r"(out_fp16[0]), "=r"(out_fp16[1]) : "r"(src_packed));
-
-    float2 res0 = __half22float2(reinterpret_cast<__half2 &>(out_fp16[0]));
-    float2 res1 = __half22float2(reinterpret_cast<__half2 &>(out_fp16[1]));
-
-    result_type out;
-    out[0] = res0.x;
-    out[1] = res0.y;
-    out[2] = res1.x;
-    out[3] = res1.y;
-    return out;
-  #else
-    result_type result;
-    NumericConverter<result_element, source_element, Round> converter;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 4; ++i) {
-      result[i] = converter(source[i]);
-    }
-
-    return result;
-  #endif
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Partial specialization for Array<float_e4m3_t, 4> <= Array<float, 4>
-template <
-  FloatRoundStyle Round
->
-struct NumericArrayConverterPacked4Element<float_e4m3_t, float, Round> {
-  using result_element = cutlass::float_e4m3_t;
-  using source_element = float;
-
-  using result_type = Array<result_element, 4>;
-  using source_type = Array<source_element, 4>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const & source) {
-
-  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
-    uint32_t out;
-
-    asm volatile( \
-        "{\n" \
-        ".reg .b16 lo;\n" \
-        ".reg .b16 hi;\n" \
-        "cvt.rn.satfinite.e4m3x2.f32   lo, %2, %1;\n" \
-        "cvt.rn.satfinite.e4m3x2.f32   hi, %4, %3;\n" \
-        "mov.b32 %0, {lo, hi};\n" \
-        "}" \
-        : "=r"(out) : "f"(source[0]), "f"(source[1]), "f"(source[2]), "f"(source[3]));
-
-    return reinterpret_cast<result_type const &>(out);
-  #else
-    result_type result;
-    NumericConverter<result_element, source_element, Round> converter;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 4; ++i) {
-      result[i] = converter(source[i]);
-    }
-
-    return result;
-  #endif
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Partial specializations for Array<float, 4> <=> Array<float_e5m2_t, 4>
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Array<float, 4> <= Array<float_e5m2_t, 4>
-template <
-  FloatRoundStyle Round
->
-struct NumericArrayConverterPacked4Element<float, cutlass::float_e5m2_t, Round> {
-  using result_element = float;
-  using source_element = cutlass::float_e5m2_t;
-
-  using result_type = Array<result_element, 4>;
-  using source_type = Array<source_element, 4>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const & source) {
-
-  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
-    uint32_t out_fp16[2];
-    uint32_t const& src_packed = reinterpret_cast<uint32_t const&>(source);
-
-    asm volatile( \
-        "{\n" \
-        ".reg .b16 lo, hi;\n" \
-        "mov.b32 {lo, hi}, %2;\n" \
-        "cvt.rn.f16x2.e5m2x2 %0, lo;\n" \
-        "cvt.rn.f16x2.e5m2x2 %1, hi;\n" \
-        "}\n" : "=r"(out_fp16[0]), "=r"(out_fp16[1]) : "r"(src_packed));
-
-    float2 res0 = __half22float2(reinterpret_cast<__half2 &>(out_fp16[0]));
-    float2 res1 = __half22float2(reinterpret_cast<__half2 &>(out_fp16[1]));
-
-    result_type out;
-    out[0] = res0.x;
-    out[1] = res0.y;
-    out[2] = res1.x;
-    out[3] = res1.y;
-    return out;
-  #else
-    result_type result;
-    NumericConverter<result_element, source_element, Round> converter;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 4; ++i) {
-      result[i] = converter(source[i]);
-    }
-
-    return result;
-  #endif
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Partial specialization for Array<float_e5m2_t, 4> <= Array<float, 4>
-template <
-  FloatRoundStyle Round
->
-struct NumericArrayConverterPacked4Element<float_e5m2_t, float, Round> {
-  using result_element = cutlass::float_e5m2_t;
-  using source_element = float;
-
-  using result_type = Array<result_element, 4>;
-  using source_type = Array<source_element, 4>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const & source) {
-
-  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
-    uint32_t out;
-
-    asm volatile( \
-        "{\n" \
-        ".reg .b16 lo;\n" \
-        ".reg .b16 hi;\n" \
-        "cvt.rn.satfinite.e5m2x2.f32   lo, %2, %1;\n" \
-        "cvt.rn.satfinite.e5m2x2.f32   hi, %4, %3;\n" \
-        "mov.b32 %0, {lo, hi};\n" \
-        "}" \
-        : "=r"(out) : "f"(source[0]), "f"(source[1]), "f"(source[2]), "f"(source[3]));
-
-    return reinterpret_cast<result_type const &>(out);
-  #else
-    result_type result;
-    NumericConverter<result_element, source_element, Round> converter;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 4; ++i) {
-      result[i] = converter(source[i]);
-    }
-
-    return result;
-  #endif
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Partial specializations for Array<cutlass::half_t, 4> <=> Array<float_e4m3_t, 4>
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Array<cutlass::half_t, 4> <= Array<float_e4m3_t, 4>
-template <
-  FloatRoundStyle Round
->
-struct NumericArrayConverterPacked4Element<cutlass::half_t, cutlass::float_e4m3_t, Round> {
-  using result_element = cutlass::half_t;
-  using source_element = cutlass::float_e4m3_t;
-
-  using result_type = Array<result_element, 4>;
-  using source_type = Array<source_element, 4>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const & source) {
-
-  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
-    uint32_t out[2];
-    uint32_t const& src_packed = reinterpret_cast<uint32_t const&>(source);
-    asm volatile( \
-        "{\n" \
-        ".reg .b16 lo, hi;\n" \
-        "mov.b32 {lo, hi}, %2;\n" \
-        "cvt.rn.f16x2.e4m3x2 %0, lo;\n" \
-        "cvt.rn.f16x2.e4m3x2 %1, hi;\n" \
-        "}\n" : "=r"(out[0]), "=r"(out[1]) : "r"(src_packed));
-    return reinterpret_cast<result_type const &>(out);
-  #else
-    result_type result;
-    NumericConverter<result_element, source_element, Round> converter;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 4; ++i) {
-      result[i] = converter(source[i]);
-    }
-
-    return result;
-  #endif
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Partial specialization for Array<float_e4m3_t, 4> <= Array<cutlass::half_t, 4>
-template <
-  FloatRoundStyle Round
->
-struct NumericArrayConverterPacked4Element<float_e4m3_t, cutlass::half_t, Round> {
-  using result_element = cutlass::float_e4m3_t;
-  using source_element = cutlass::half_t;
-
-  using result_type = Array<result_element, 4>;
-  using source_type = Array<source_element, 4>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const & source) {
-
-  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
-    uint32_t out;
-    uint32_t const* src_packed = reinterpret_cast<uint32_t const*>(&source);
-
-    asm volatile( \
-        "{\n" \
-        ".reg .b16 lo;\n" \
-        ".reg .b16 hi;\n" \
-        "cvt.rn.satfinite.e4m3x2.f16x2   lo, %1;\n" \
-        "cvt.rn.satfinite.e4m3x2.f16x2   hi, %2;\n" \
-        "mov.b32 %0, {lo, hi};\n" \
-        "}" \
-        : "=r"(out) : "r"(src_packed[0]), "r"(src_packed[1]));
-
-    return reinterpret_cast<result_type const &>(out);
-  #else
-    result_type result;
-    NumericConverter<result_element, source_element, Round> converter;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 4; ++i) {
-      result[i] = converter(source[i]);
-    }
-
-    return result;
-  #endif
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Partial specializations for Array<cutlass::half_t, 4> <=> Array<float_e5m2_t, 4>
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Array<cutlass::half_t, 4> <= Array<float_e5m2_t, 4>
-template <
-  FloatRoundStyle Round
->
-struct NumericArrayConverterPacked4Element<cutlass::half_t, cutlass::float_e5m2_t, Round> {
-  using result_element = cutlass::half_t;
-  using source_element = cutlass::float_e5m2_t;
-
-  using result_type = Array<result_element, 4>;
-  using source_type = Array<source_element, 4>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const & source) {
-
-  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
-    uint32_t out[2];
-    uint32_t const& src_packed = reinterpret_cast<uint32_t const&>(source);
-    asm volatile( \
-        "{\n" \
-        ".reg .b16 lo, hi;\n" \
-        "mov.b32 {lo, hi}, %2;\n" \
-        "cvt.rn.f16x2.e5m2x2 %0, lo;\n" \
-        "cvt.rn.f16x2.e5m2x2 %1, hi;\n" \
-        "}\n" : "=r"(out[0]), "=r"(out[1]) : "r"(src_packed));
-    return reinterpret_cast<result_type const &>(out);
-  #else
-    result_type result;
-    NumericConverter<result_element, source_element, Round> converter;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 4; ++i) {
-      result[i] = converter(source[i]);
-    }
-
-    return result;
-  #endif
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Partial specialization for Array<float_e5m2_t, 4> <= Array<cutlass::half_t, 4>
-template <
-  FloatRoundStyle Round
->
-struct NumericArrayConverterPacked4Element<float_e5m2_t, cutlass::half_t, Round> {
-  using result_element = cutlass::float_e5m2_t;
-  using source_element = cutlass::half_t;
-
-  using result_type = Array<result_element, 4>;
-  using source_type = Array<source_element, 4>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const & source) {
-
-  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
-    uint32_t out;
-    uint32_t const* src_packed = reinterpret_cast<uint32_t const*>(&source);
-
-    asm volatile( \
-        "{\n" \
-        ".reg .b16 lo;\n" \
-        ".reg .b16 hi;\n" \
-        "cvt.rn.satfinite.e5m2x2.f16x2   lo, %1;\n" \
-        "cvt.rn.satfinite.e5m2x2.f16x2   hi, %2;\n" \
-        "mov.b32 %0, {lo, hi};\n" \
-        "}" \
-        : "=r"(out) : "r"(src_packed[0]), "r"(src_packed[1]));
-
-    return reinterpret_cast<result_type const &>(out);
-  #else
-    result_type result;
-    NumericConverter<result_element, source_element, Round> converter;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 4; ++i) {
-      result[i] = converter(source[i]);
-    }
-
-    return result;
-  #endif
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Partial specializations for Array<cutlass::bfloat16_t, 4> <=> Array<float_e4m3_t, 4>
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Array<cutlass::bfloat16_t, 4> <= Array<float_e4m3_t, 4>
-template <
-  FloatRoundStyle Round
->
-struct NumericArrayConverterPacked4Element<cutlass::bfloat16_t, cutlass::float_e4m3_t, Round> {
-  using result_element = cutlass::bfloat16_t;
-  using source_element = cutlass::float_e4m3_t;
-
-  using result_type = Array<result_element, 4>;
-  using source_type = Array<source_element, 4>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const & source) {
-
-  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
-    // Convert f8 to float
-    NumericArrayConverterPacked4Element<float, source_element, Round> src2float;
-    Array<float, 4> tmp_floats = src2float(source);
-
-    // Convert float to bf16
-    result_type out;
-    Array<float, 2>* packed_tmp = reinterpret_cast<Array<float, 2>*>(&tmp_floats);
-    Array<result_element, 2>* packed_out = reinterpret_cast<Array<result_element, 2>*>(&out);
-    NumericArrayConverter<result_element, float, 2, Round> float2result;
-    packed_out[0] = float2result(packed_tmp[0]);
-    packed_out[1] = float2result(packed_tmp[1]);
-
-    return out;
-  #else
-    result_type result;
-    NumericConverter<result_element, source_element, Round> converter;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 4; ++i) {
-      result[i] = converter(source[i]);
-    }
-
-    return result;
-  #endif
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Partial specialization for Array<float_e4m3_t, 4> <= Array<cutlass::bfloat16_t, 4>
-template <
-  FloatRoundStyle Round
->
-struct NumericArrayConverterPacked4Element<float_e4m3_t, cutlass::bfloat16_t, Round> {
-  using result_element = cutlass::float_e4m3_t;
-  using source_element = cutlass::bfloat16_t;
-
-  using result_type = Array<result_element, 4>;
-  using source_type = Array<source_element, 4>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const & source) {
-
-  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
-    // Convert bf16 to float
-    Array<float, 4> tmp;
-    Array<float, 2>* packed_tmp = reinterpret_cast<Array<float, 2>*>(&tmp);
-    Array<source_element, 2> const* packed_source = reinterpret_cast<Array<source_element, 2> const*>(&source);
-    NumericArrayConverter<float, source_element, 2, Round> src2float;
-    packed_tmp[0] = src2float(packed_source[0]);
-    packed_tmp[1] = src2float(packed_source[1]);
-
-    // Convert float to f8
-    NumericArrayConverterPacked4Element<result_element, float, Round> float2result;
-    return float2result(tmp);
-  #else
-    result_type result;
-    NumericConverter<result_element, source_element, Round> converter;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 4; ++i) {
-      result[i] = converter(source[i]);
-    }
-
-    return result;
-  #endif
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Partial specializations for Array<cutlass::bfloat16_t, 4> <=> Array<float_e5m2_t, 4>
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Array<cutlass::bfloat16_t, 4> <= Array<float_e5m2_t, 4>
-template <
-  FloatRoundStyle Round
->
-struct NumericArrayConverterPacked4Element<cutlass::bfloat16_t, cutlass::float_e5m2_t, Round> {
-  using result_element = cutlass::bfloat16_t;
-  using source_element = cutlass::float_e5m2_t;
-
-  using result_type = Array<result_element, 4>;
-  using source_type = Array<source_element, 4>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const & source) {
-
-  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
-    // Convert f8 to float
-    NumericArrayConverterPacked4Element<float, source_element, Round> src2float;
-    Array<float, 4> tmp_floats = src2float(source);
-
-    // Convert float to bf16
-    result_type out;
-    Array<float, 2>* packed_tmp = reinterpret_cast<Array<float, 2>*>(&tmp_floats);
-    Array<result_element, 2>* packed_out = reinterpret_cast<Array<result_element, 2>*>(&out);
-    NumericArrayConverter<result_element, float, 2, Round> float2result;
-    packed_out[0] = float2result(packed_tmp[0]);
-    packed_out[1] = float2result(packed_tmp[1]);
-
-    return out;
-  #else
-    result_type result;
-    NumericConverter<result_element, source_element, Round> converter;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 4; ++i) {
-      result[i] = converter(source[i]);
-    }
-
-    return result;
-  #endif
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Partial specialization for Array<float_e5m2_t, 4> <= Array<cutlass::bfloat16_t, 4>
-template <
-  FloatRoundStyle Round
->
-struct NumericArrayConverterPacked4Element<float_e5m2_t, cutlass::bfloat16_t, Round> {
-  using result_element = cutlass::float_e5m2_t;
-  using source_element = cutlass::bfloat16_t;
-
-  using result_type = Array<result_element, 4>;
-  using source_type = Array<source_element, 4>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const & source) {
-
-  #if defined(CUDA_PTX_FP8_CVT_ENABLED)
-    // Convert bf16 to float
-    Array<float, 4> tmp;
-    Array<float, 2>* packed_tmp = reinterpret_cast<Array<float, 2>*>(&tmp);
-    Array<source_element, 2> const* packed_source = reinterpret_cast<Array<source_element, 2> const*>(&source);
-    NumericArrayConverter<float, source_element, 2, Round> src2float;
-    packed_tmp[0] = src2float(packed_source[0]);
-    packed_tmp[1] = src2float(packed_source[1]);
-
-    // Convert float to f8
-    NumericArrayConverterPacked4Element<result_element, float, Round> float2result;
-    return float2result(tmp);
-  #else
-    result_type result;
-    NumericConverter<result_element, source_element, Round> converter;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 4; ++i) {
-      result[i] = converter(source[i]);
-    }
-
-    return result;
-  #endif
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Partial specializations for Array<float_e4m3_t, 4> <=> Array<float_e5m2_t, 4>
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Array<float_e4m3_t, 4> <= Array<float_e5m2_t, 4>
-template <
-  FloatRoundStyle Round
->
-struct NumericArrayConverterPacked4Element<float_e4m3_t, cutlass::float_e5m2_t, Round> {
-  using result_element = cutlass::float_e4m3_t;
-  using source_element = cutlass::float_e5m2_t;
-
-  using result_type = Array<result_element, 4>;
-  using source_type = Array<source_element, 4>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const & source) {
-    result_type result;
-    NumericConverter<result_element, source_element, Round> converter;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 4; ++i) {
-      result[i] = converter(source[i]);
-    }
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Partial specialization for Array<float_e5m2_t, 4> <= Array<float_e4m3_t, 4>
-template <
-  FloatRoundStyle Round
->
-struct NumericArrayConverterPacked4Element<float_e5m2_t, cutlass::float_e4m3_t, Round> {
-  using result_element = cutlass::float_e5m2_t;
-  using source_element = cutlass::float_e4m3_t;
-
-  using result_type = Array<result_element, 4>;
-  using source_type = Array<source_element, 4>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const & source) {
-    result_type result;
-    NumericConverter<result_element, source_element, Round> converter;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 4; ++i) {
-      result[i] = converter(source[i]);
-    }
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Partial specializations for:
-//       Array<T, N> <=> Array<float_e4m3_t, N>
-//       Array<T, N> <=> Array<float_e5m2_t, N>
-// using packed converter under the hood
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename T,
-  typename S,
-  int N,
-  FloatRoundStyle Round
->
-struct PackedNumericArrayConverter {
-  using result_element = T;
-  using source_element = S;
-
-  using result_type = Array<result_element, N>;
-  using source_type = Array<source_element, N>;
-
-  static FloatRoundStyle const round_style = Round;
-
-private:
-  using packed_result_type = Array<result_element, 4>;
-  using packed_source_type = Array<source_element, 4>;
-
-public:
-  CUTLASS_DEVICE
-  static result_type convert(source_type const & source) {
-    result_type result;
-    packed_result_type* packed_result = reinterpret_cast<packed_result_type*>(&result);
-    const packed_source_type* packed_source = reinterpret_cast<const packed_source_type*>(&source);
-
-    detail::NumericArrayConverterPacked4Element<result_element, source_element, Round> packed_converter;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 4; ++i) {
-      packed_result[i] = packed_converter(packed_source[i]);
-    }
-
-    // Handle leftovers
-    NumericConverter<result_element, source_element, Round> converter;
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N % 4; ++i) {
-      int idx = ((N / 4) * 4) + i;
-      result[idx] = converter(source[idx]);
-    }
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const{
-    return convert(s);
-  }
-};
-
-/// Partial specialization for Array<T, N> <= Array<float_e4m3_t, N>
-template <
-  typename T,
-  int N,
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<T, cutlass::float_e4m3_t, N, Round> :
-  public PackedNumericArrayConverter<T, cutlass::float_e4m3_t, N, Round> {};
-
-/// Partial specialization for Array<T, N> <= Array<float_e5m2_t, N>
-template <
-  typename T,
-  int N,
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<T, cutlass::float_e5m2_t, N, Round> :
-  public PackedNumericArrayConverter<T, cutlass::float_e5m2_t, N, Round> {};
-
-/// Partial specialization for Array<float_e4m3_t, N> <= Array<S, N>
-template <
-  typename S,
-  int N,
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<float_e4m3_t, S, N, Round> :
-  public PackedNumericArrayConverter<float_e4m3_t, S, N, Round> {};
-
-/// Partial specialization for Array<float_e5m2_t, N> <= Array<S, N>
-template <
-  typename S,
-  int N,
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<float_e5m2_t, S, N, Round> :
-  public PackedNumericArrayConverter<float_e5m2_t, S, N, Round> {};
-
-/// Partial specialization for Array<float_e4m3_t, N> <= Array<float_e5m2_t, N>
-template <
-  int N,
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<float_e4m3_t, cutlass::float_e5m2_t, N, Round> :
-  public PackedNumericArrayConverter<float_e4m3_t, cutlass::float_e5m2_t, N, Round> {};
-
-/// Partial specialization for Array<float_e5m2_t, N> <= Array<float_e4m3_t, N>
-template <
-  int N,
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<float_e5m2_t, cutlass::float_e4m3_t, N, Round> :
-  public PackedNumericArrayConverter<float_e5m2_t, cutlass::float_e4m3_t, N, Round> {};
-
-/// Partial specialization for Array<float_e4m3_t, N> <= Array<float_e4m3_t, N>
-template <
-  int N,
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<float_e4m3_t, cutlass::float_e4m3_t, N, Round> :
-  public PackedNumericArrayConverter<float_e4m3_t, cutlass::float_e4m3_t, N, Round> {};
-
-/// Partial specialization for Array<float_e5m2_t, N> <= Array<float_e5m2_t, N>
-template <
-  int N,
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<float_e5m2_t, cutlass::float_e5m2_t, N, Round> :
-  public PackedNumericArrayConverter<float_e5m2_t, cutlass::float_e5m2_t, N, Round> {};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Array<int8_t> <= Array<float>
-/// Conversion is performed with saturation regardless of setting of
-/// the `Round` template parameter.
-template <
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<int8_t, float, 1, Round> {
-
-  using result_type = Array<int8_t, 1>;
-  using source_type = Array<float, 1>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & source) {
-    NumericConverter<int8_t, float, Round> destination_converter;
-    result_type result;
-    result[0] = destination_converter(source[0]);
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-template <
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<uint8_t, float, 1, Round> {
-
-  using result_type = Array<uint8_t, 1>;
-  using source_type = Array<float, 1>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & source) {
-    NumericConverter<uint8_t, float, Round> destination_converter;
-    result_type result;
-    result[0] = destination_converter(source[0]);
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-// To convert a FP32 to Int that has less than 32 bits, we need to convert it to int32 first.
-template <
-  typename T,
-  int N,
-  FloatRoundStyle Round
->
-struct NumericArrayFP32ToIntConverter {
-
-  using result_type = Array<T, N>;
-  using source_type = Array<float, N>;
-  static FloatRoundStyle const round_style = Round;
-
-  static_assert(cutlass::platform::numeric_limits<T>::is_integer, "the dest type has to be int.");
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & source) {
-    // Convert float to int
-    Array<int32_t, N> temporary;
-
-    NumericArrayConverter<int32_t, float, N, Round> compute_converter;
-    temporary = compute_converter(source);
-
-    // Convert to int to int8_t
-    NumericArrayConverter<T, int32_t, N, Round> destination_converter;
-    return destination_converter(temporary);
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-
-template <
-  int N,
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<int8_t, float, N, Round> {
-
-  using result_type = Array<int8_t, N>;
-  using source_type = Array<float, N>;
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & source) {
-    NumericArrayFP32ToIntConverter<int8_t, N, Round> converter;
-    return converter(source);
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-template <
-  int N,
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<uint8_t, float, N, Round> {
-
-  using result_type = Array<uint8_t, N>;
-  using source_type = Array<float, N>;
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & source) {
-    NumericArrayFP32ToIntConverter<uint8_t, N, Round> converter;
-    return converter(source);
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-template <
-  int N,
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<int4b_t, float, N, Round> {
-
-  using result_type = Array<int4b_t, N>;
-  using source_type = Array<float, N>;
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & source) {
-    NumericArrayFP32ToIntConverter<int4b_t, N, Round> converter;
-    return converter(source);
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-template <
-  int N,
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<uint4b_t, float, N, Round> {
-
-  using result_type = Array<uint4b_t, N>;
-  using source_type = Array<float, N>;
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & source) {
-    NumericArrayFP32ToIntConverter<uint4b_t, N, Round> converter;
-    return converter(source);
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750) && \
-    ((__CUDACC_VER_MAJOR__ > 10) ||                     \
-     ((__CUDACC_VER_MAJOR__ >= 10) && (__CUDACC_VER_MINOR__ >= 2)))
-
-/// Partial specialization for Array<int4b_t, 8> <= Array<int, 8>
-template <
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<int4b_t, int, 8, Round> {
-
-  using result_type = Array<int4b_t, 8>;
-  using source_type = Array<int, 8>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & source) {
-
-    unsigned out;
-
-    asm volatile(
-        "{ .reg .u32 r4;"
-        "cvt.pack.sat.s4.s32.b32   r4, %8, %7, 0;"
-        "cvt.pack.sat.s4.s32.b32   r4, %6, %5, r4;"
-        "cvt.pack.sat.s4.s32.b32   r4, %4, %3, r4;"
-        "cvt.pack.sat.s4.s32.b32   %0, %2, %1, r4;"
-        "}"
-        : "=r"(out)
-        : "r"(source[0]), "r"(source[1]), "r"(source[2]), "r"(source[3]),
-          "r"(source[4]), "r"(source[5]), "r"(source[6]), "r"(source[7]));
-
-    return reinterpret_cast<result_type const &>(out);
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Partial specialization for Array<int4b_t> <= Array<int>
-template <
-  int N,
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<int4b_t, int, N, Round> {
-  static_assert(!(N % 8), "N must be multiple of 8.");
-
-  using result_type = Array<int4b_t, N>;
-  using source_type = Array<int, N>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & source) {
-
-    NumericArrayConverter<int4b_t, int, 8, Round> convert_vector_;
-
-    result_type result;
-
-    Array<int4b_t, 8> *result_ptr = reinterpret_cast<Array<int4b_t, 8> *>(&result);
-    Array<int, 8> const *source_ptr = reinterpret_cast<Array<int, 8> const *>(&source);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 8; ++i) {
-      result_ptr[i] = convert_vector_(source_ptr[i]);
-    }
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Partial specialization for Array<uint4b_t, 8> <= Array<int, 8>
-template <
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<uint4b_t, int, 8, Round> {
-
-  using result_type = Array<uint4b_t, 8>;
-  using source_type = Array<int, 8>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & source) {
-
-    unsigned out;
-
-    asm volatile(
-        "{ .reg .u32 r4;"
-        "cvt.pack.sat.u4.s32.b32   r4, %8, %7, 0;"
-        "cvt.pack.sat.u4.s32.b32   r4, %6, %5, r4;"
-        "cvt.pack.sat.u4.s32.b32   r4, %4, %3, r4;"
-        "cvt.pack.sat.u4.s32.b32   %0, %2, %1, r4;"
-        "}"
-        : "=r"(out)
-        : "r"(source[0]), "r"(source[1]), "r"(source[2]), "r"(source[3]),
-          "r"(source[4]), "r"(source[5]), "r"(source[6]), "r"(source[7]));
-
-    return reinterpret_cast<result_type const &>(out);
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Partial specialization for Array<int4b_t> <= Array<int>
-template <
-  int N,
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<uint4b_t, int, N, Round> {
-  static_assert(!(N % 8), "N must be multiple of 8.");
-
-  using result_type = Array<uint4b_t, N>;
-  using source_type = Array<int, N>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_HOST_DEVICE
-  static result_type convert(source_type const & source) {
-
-    NumericArrayConverter<uint4b_t, int, 8, Round> convert_vector_;
-
-    result_type result;
-
-    Array<uint4b_t, 8> *result_ptr = reinterpret_cast<Array<uint4b_t, 8> *>(&result);
-    Array<int, 8> const *source_ptr = reinterpret_cast<Array<int, 8> const *>(&source);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 8; ++i) {
-      result_ptr[i] = convert_vector_(source_ptr[i]);
-    }
-
-    return result;
-  }
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-#endif  // Conditional guards to enable partial specialization for packed integers
-
-namespace detail {
-
-  /*
-      A helper class that can vectorize a numeric converter with implementation for several vector widths.
-
-      The vector widths must be giving in decreasing order or width, and must be a power of 2.
-
-      The vector converters must produce identical results to the scalar converters for consistency.
-    */
-  class VectorizedConverter {
-  private:
-    // Base case to handle remainder elements as scalars.
-    template <int Offset, size_t ParentWidth, typename ArrayConverter>
-    CUTLASS_DEVICE
-    static void convert_helper(
-      typename ArrayConverter::result_type& result,
-      typename ArrayConverter::source_type const& source) {
-
-      using ElementRes = typename ArrayConverter::result_type::Element;
-      using ElementSrc = typename ArrayConverter::source_type::Element;
-      // If no more converters, handle the remaining elements as scalars.
-      constexpr int total_elements = ArrayConverter::result_type::kElements;
-      constexpr int remainder = total_elements - Offset;
-      static_assert(remainder == (total_elements % ParentWidth), "Unexpected remainder.");
-
-      typename ArrayConverter::ScalarConverter scalar_converter;
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = Offset; i < ArrayConverter::result_type::kElements; ++i) {
-        result[i] = scalar_converter(ElementSrc(source[i]));
-      }
-    }
-
-    template <int Offset, size_t ParentWidth, typename ArrayConverter, typename ResultVectorArray, typename SourceVectorArray, typename... OtherVectorArrays>
-    CUTLASS_DEVICE
-    static void convert_helper(typename ArrayConverter::result_type& result, typename ArrayConverter::source_type const& source) {
-      static_assert(sizeof...(OtherVectorArrays) % 2 == 0, "Vector converters must come in {dst, src} pairs");
-      static_assert(ResultVectorArray::kElements == SourceVectorArray::kElements, "Vector converters must have the same vector width");
-      static_assert(cutlass::platform::is_same<typename ArrayConverter::result_type::Element, typename ResultVectorArray::Element>::value,
-        "ResultVectorArray must have the same type ArrayConverter::result_type");
-      static_assert(cutlass::platform::is_same<typename ArrayConverter::source_type::Element, typename SourceVectorArray::Element>::value,
-        "SourceVectorArray must have the same type ArrayConverter::result_type");
-      static_assert(Offset >= 0 && Offset <= ArrayConverter::result_type::kElements, "Offset must be between 0 and N");
-
-      static_assert(ParentWidth == 0 || ParentWidth > ResultVectorArray::kElements, "Vector arrays must be given in decreasing order of width");
-
-      constexpr int vector_width = ResultVectorArray::kElements;
-      static_assert(ispow2(vector_width), "Vector width must be a power of 2");
-
-      using ElementRes = typename ArrayConverter::result_type::Element;
-      using ElementSrc = typename ArrayConverter::source_type::Element;
-
-      constexpr int vector_bits_res = vector_width * cutlass::sizeof_bits<ElementRes>::value;
-      constexpr int vector_bits_src = vector_width * cutlass::sizeof_bits<ElementSrc>::value;
-
-      static_assert(vector_bits_res % 8 == 0, "Result vector type must be byte addressed.");
-      static_assert(vector_bits_src % 8 == 0, "Source vector type must be byte addressed.");
-
-      constexpr int vector_offset = Offset / vector_width;
-      ResultVectorArray* packed_result_vec = reinterpret_cast<ResultVectorArray*>(&result) + vector_offset;
-      SourceVectorArray const* packed_source_vec = reinterpret_cast<SourceVectorArray const*>(&source) + vector_offset;
-
-      // Convert the remaining elements as vectors.
-      constexpr int total_elements = ArrayConverter::result_type::kElements;
-      constexpr int groups_of_vec = (total_elements - Offset) / vector_width;
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < groups_of_vec; ++i) {
-        packed_result_vec[i] = ArrayConverter::template packed_convert<ResultVectorArray, SourceVectorArray>(packed_source_vec[i]);
-      }
-
-      constexpr int new_offset = Offset + vector_width * groups_of_vec;
-      // Recurse to handle other vector converters, or the scalar base case.
-      convert_helper<new_offset, ResultVectorArray::kElements, ArrayConverter, OtherVectorArrays...>(result, source);
-    }
-
-  public:
-    /*
-        A method to convert vectors of elements using the packed_convert method of the converter.
-
-        Converters using this class must implement packed convert and support 1 or more vector conversions.
-      */
-    template <typename ArrayConverter, typename ResultVectorArray, typename SourceVectorArray, typename... OtherVectorArrays>
-    CUTLASS_DEVICE
-    static void convert(typename ArrayConverter::result_type& result, typename ArrayConverter::source_type const& source) {
-      convert_helper<0, 0, ArrayConverter, ResultVectorArray, SourceVectorArray, OtherVectorArrays...>(result, source);
-    }
-  };
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-#if defined(__CUDA_ARCH__)
-/// Partial specialization for Array<int8_t, 8> <= Array<int4b_t, 8>
-template <
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<int8_t, int4b_t, 8, Round> {
-
-  using result_type = Array<int8_t, 8>;
-  using source_type = Array<int4b_t, 8>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const & source) {
-
-    unsigned const& storage = reinterpret_cast<unsigned const &>(source);
-    unsigned out[2];
-
-    asm volatile(
-        "{\n"
-        "  .reg .u32 tmp0, tmp1, tmp2;\n"
-        "  shl.b32 tmp0, %2, 4;\n"                // tmp0 = x1x2x3x4x5x6x7__
-        "  and.b32 tmp0, tmp0, 0xf0f0f0f0;\n"     // tmp0 = x1__x3__x5__x7__
-        "  prmt.b32 tmp1, tmp0, tmp0, 0xba98;\n"  // tmp1 = s1s3s5s7
-        "  and.b32 tmp1, tmp1, 0xf0f0f0f0;\n"     // tmp1 = s1__s3__s5__s7__
-        "  shr.u32 tmp0, tmp0, 4;\n"              // tmp0 = __x1__x3__x5__x7
-        "  or.b32 tmp2, tmp0, tmp1;\n"            // tmp2 = y1y3y5y7
-        "  and.b32 tmp0, %2, 0xf0f0f0f0;\n"       // tmp0 = x0__x2__x4__x6__
-        "  prmt.b32 tmp1, tmp0, tmp0, 0xba98;\n"  // tmp1 = s0s2s4s6
-        "  and.b32 tmp1, tmp1, 0xf0f0f0f0;\n"     // tmp1 = s0__s2__s4__s6__
-        "  shr.u32 tmp0, tmp0, 4;\n"              // tmp0 = __x0__x2__x4__x6
-        "  or.b32 tmp0, tmp0, tmp1;\n"            // tmp0 = y0y2y4y6
-        "  prmt.b32 %0, tmp2, tmp0, 0x5140;\n"    // %0 = y0y1y2y3
-        "  prmt.b32 %1, tmp2, tmp0, 0x7362;\n"    // %1 = y4y5y6y7
-        "}\n"
-        : "=r"(out[0]), "=r"(out[1])
-        : "r"(storage));
-
-    return reinterpret_cast<result_type const &>(out);
-  }
-
-  CUTLASS_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Partial specialization for Array<int8_t> <= Array<int4b_t>
-template <
-  int N,
-  FloatRoundStyle Round
->
-struct NumericArrayConverter<int8_t, int4b_t, N, Round> {
-  static_assert(!(N % 8), "N must be multiple of 8.");
-
-  using result_type = Array<int8_t, N>;
-  using source_type = Array<int4b_t, N>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const & source) {
-
-    NumericArrayConverter<int8_t, int4b_t, 8, Round> convert_vector_;
-
-    result_type result;
-
-    Array<int8_t, 8> *result_ptr = reinterpret_cast<Array<int8_t, 8> *>(&result);
-    Array<int4b_t, 8> const *source_ptr = reinterpret_cast<Array<int4b_t, 8> const *>(&source);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 8; ++i) {
-      result_ptr[i] = convert_vector_(source_ptr[i]);
-    }
-
-    return result;
-  }
-
-  CUTLASS_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-#endif // defined(__CUDA_ARCH__)
-
-/// Partial specialization for Array<cutlass::float_e4m3_t, N> <= Array<cutlass::int4b_t, N>
-template <FloatRoundStyle Round, int N>
-struct NumericArrayConverter<cutlass::float_e4m3_t, cutlass::int4b_t, N, Round> {
-  using result_type = Array<cutlass::float_e4m3_t, N>;
-  using source_type = Array<cutlass::int4b_t, N>;
-
-  static FloatRoundStyle const round_style = Round;
-
-private:
-  using result_type_packed_8 = Array<cutlass::float_e4m3_t, 8>;
-  using result_type_packed_4 = Array<cutlass::float_e4m3_t, 4>;
-  using source_type_packed_8 = Array<cutlass::int4b_t, 8>;
-  using source_type_packed_4 = Array<cutlass::int4b_t, 4>;
-
-  using ScalarConverter = NumericConverter<cutlass::float_e4m3_t, cutlass::int4b_t, Round>;
-
-  CUTLASS_DEVICE
-  static uint32_t to_reg(source_type_packed_4 const& source) {
-    return static_cast<uint32_t>(
-      reinterpret_cast<const uint16_t&>(source));
-  }
-
-  CUTLASS_DEVICE
-  static uint32_t to_reg(source_type_packed_8 const& source) {
-    return reinterpret_cast<const uint32_t&>(source);
-  }
-
-  // The core converter uses a lookup table to converts i4 -> e4m3.
-  template <typename PackedResultType, typename PackedSrcType>
-  CUTLASS_DEVICE
-  static PackedResultType packed_convert(PackedSrcType const &source) {
-
-    static_assert((platform::is_same<PackedSrcType, source_type_packed_4>::value &&
-                   platform::is_same<PackedResultType, result_type_packed_4>::value) ||
-                  (platform::is_same<PackedSrcType, source_type_packed_8>::value &&
-                   platform::is_same<PackedResultType, result_type_packed_8>::value),
-                  "Invalid PackedSrcType/PackedResultType must be 4 or 8 to use private convert dispatch.");
-
-    // Hold FP8 outputs in reg. We need 1 reg for every 4 outputs.
-    cutlass::AlignedArray<uint32_t, PackedResultType::kElements / 4, sizeof(PackedResultType)> r;
-
-    // View the input as reg
-    uint32_t reg = to_reg(source);
-
-    // Determines if to get from the signed or unsigned candidates
-    uint32_t sign = (reg & 0x88888888) >> 1;
-
-    // Ignore sign bit when indexing into LUT
-    uint32_t lut_idx = (reg & 0x77777777);
-
-    // Signed is OR'd with 0x32103210 to find the correct value in the LUT
-    const uint32_t final_prmt_base = 0x32103210;
-
-    // [0, 1, 2, 3] encoded as FP8
-    static constexpr uint32_t POS_E4M3s_REG1 = 0x44403800;
-    // [4, 5, 6, 7] encoded as FP8
-    static constexpr uint32_t POS_E4M3s_REG2 = 0x4E4C4A48;
-    // [-1, -2, -3, -4] encoded as FP8
-    static constexpr uint32_t NEG_E4M3s_REG1 = 0xCACCCED0;
-    // [-5, -6, -7, -7] encoded as FP8
-    static constexpr uint32_t NEG_E4M3s_REG2 = 0xB8C0C4C8;
-
-
-    const int iters = PackedSrcType::kElements / 4;
-    #pragma unroll
-    for (int ii = 0; ii < iters; ++ii, lut_idx >>=16, sign >>=16) {
-      uint32_t final_prmt_idx = final_prmt_base | sign;
-
-      // This uses a look up table to convert packed int4s to packed fp8s, using the int4 value
-      // as the index to prmt.
-      // It first select both the positive and negative candidates, then uses the sign bit to
-      // select the correct candidate.
-      asm volatile(
-          "{\n"
-          "  .reg .b32 pos_f8s, neg_f8s;\n"
-          "  prmt.b32 pos_f8s, %1, %2, %5;\n"
-          "  prmt.b32 neg_f8s, %3, %4, %5;\n"
-          "  prmt.b32 %0, pos_f8s, neg_f8s, %6;\n"
-          "}\n"
-          : "=r"(r[ii])
-          : "n"(POS_E4M3s_REG1), "n"(POS_E4M3s_REG2), "n"(NEG_E4M3s_REG1), "n"(NEG_E4M3s_REG2),
-            "r"(lut_idx), "r"(final_prmt_idx));
-    }
-    return reinterpret_cast<PackedResultType&>(r);
-  }
-
-  friend class detail::VectorizedConverter;
-
-public:
-  CUTLASS_DEVICE
-  static result_type convert(source_type const &source) {
-    result_type result;
-    using ConverterType = NumericArrayConverter<typename result_type::Element, typename source_type::Element, N, Round>;
-    detail::VectorizedConverter::convert<ConverterType,
-                                         result_type_packed_8, source_type_packed_8,
-                                         result_type_packed_4, source_type_packed_4>(result, source);
-
-    return result;
-  }
-
-
-  CUTLASS_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Partial specialization for Array<float, N> <= Array<cutlass::int4b_t, N>
-template <FloatRoundStyle Round, int N>
-struct NumericArrayConverter<float, cutlass::int4b_t, N, Round> {
-  using result_type = Array<float, N>;
-  using source_type = Array<cutlass::int4b_t, N>;
-
-  static FloatRoundStyle const round_style = Round;
-
-private:
-  using result_type_packed_8 = Array<float, 8>;
-  using result_type_packed_4 = Array<float, 4>;
-  using result_type_packed_2 = Array<float, 2>;
-  using source_type_packed_8 = Array<cutlass::int4b_t, 8>;
-  using source_type_packed_4 = Array<cutlass::int4b_t, 4>;
-  using source_type_packed_2 = Array<cutlass::int4b_t, 2>;
-
-  using ScalarConverter = NumericConverter<float, cutlass::int4b_t, Round>;
-
-  CUTLASS_DEVICE
-  static uint32_t to_reg(source_type_packed_2 const& source) {
-    return static_cast<uint32_t>(
-      reinterpret_cast<const uint8_t&>(source));
-  }
-
-  CUTLASS_DEVICE
-  static uint32_t to_reg(source_type_packed_4 const& source) {
-    return static_cast<uint32_t>(
-      reinterpret_cast<const uint16_t&>(source));
-  }
-
-  CUTLASS_DEVICE
-  static uint32_t to_reg(source_type_packed_8 const& source) {
-    return reinterpret_cast<const uint32_t&>(source);
-  }
-
-  template <int offset, int elements_to_convert, typename PackedResultType>
-  CUTLASS_DEVICE
-  static void packed_convert_vec(PackedResultType& result, uint32_t src_reg) {
-    static_assert(offset == 0 || offset == 4, "Invalid offset");
-    // Selects one of the bottom int4s and constructs:
-    // 8388608 + (x + 8)
-    // 8388608 + 16 * (x + 8)
-    // 8388608 + 256 * (x + 8)
-    // 8388608 + 4096 * (x + 8)
-    uint32_t const and_masks[4] = {0x0000000F, 0x000000F0, 0x00000F00, 0x0000F000};
-    uint32_t const xor_masks[4] = {0x4B000008, 0x4B000080, 0x4B000800, 0x4B008000};
-
-    float const scales[4] = {1.f, 1.f / 16.f, 1.f / 256.f, 1.f / 4096.f};
-    float const offsets[4] = {-8388616.f, -524296.f, -32776.f, -2056.f};
-
-    static constexpr uint32_t immLut = (0xf0 & 0xcc) ^ 0xaa;
-
-    uint32_t* result_as_int = reinterpret_cast<uint32_t*>(&result);
-
-    // For each operand, computes:
-    // r[i] = (r[i] & and_mask) ^ xor_mask
-    CUTLASS_PRAGMA_UNROLL
-    for (int ii = 0; ii < elements_to_convert; ++ii) {
-      asm volatile(
-          "{\n"
-          "  lop3.b32 %0, %1, %2, %3, %4;\n"
-          "}\n"
-          : "=r"(result_as_int[offset + ii])
-          : "r"(src_reg), "r"(and_masks[ii]), "r"(xor_masks[ii]), "n"(immLut));
-
-      result[offset + ii] = __fmaf_rn(result[offset + ii], scales[ii], offsets[ii]);
-    }
-  }
-
-  // The core converter uses bit tricks to construct a known FP16 number, then does a
-  // subtraction in FP16 for the final result.
-  template <typename PackedResultType, typename PackedSrcType>
-  CUTLASS_DEVICE
-  static PackedResultType packed_convert(PackedSrcType const &source) {
-
-    static_assert((platform::is_same<PackedSrcType, source_type_packed_2>::value &&
-                   platform::is_same<PackedResultType, result_type_packed_2>::value) ||
-                  (platform::is_same<PackedSrcType, source_type_packed_4>::value &&
-                   platform::is_same<PackedResultType, result_type_packed_4>::value) ||
-                  (platform::is_same<PackedSrcType, source_type_packed_8>::value &&
-                   platform::is_same<PackedResultType, result_type_packed_8>::value),
-                  "Invalid PackedSrcType/PackedResultType must be 1, 2, 4 or 8 to use private convert dispatch.");
-
-    // Hold output FP16s in reg. We need 1 reg for every 2 elements
-    PackedResultType r;
-
-    // View the input as reg
-    uint32_t src_reg = to_reg(source);
-    constexpr int total_elements = PackedResultType::kElements == 8 ? 4 : PackedResultType::kElements;
-    packed_convert_vec<0, total_elements>(r, src_reg);
-
-
-    if (PackedResultType::kElements == 8) {
-      uint32_t src_reg_shifted = src_reg >> 16;
-      packed_convert_vec<4, 4>(r, src_reg_shifted);
-    }
-    return r;
-  }
-
-  friend class detail::VectorizedConverter;
-
-public:
-  CUTLASS_DEVICE
-  static result_type convert(source_type const &source) {
-    result_type result;
-    using ConverterType = NumericArrayConverter<typename result_type::Element, typename source_type::Element, N, Round>;
-    detail::VectorizedConverter::convert<ConverterType,
-                                         result_type_packed_8, source_type_packed_8,
-                                         result_type_packed_4, source_type_packed_4,
-                                         result_type_packed_2, source_type_packed_2>(result, source);
-
-    return result;
-  }
-
-  CUTLASS_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Partial specialization for Array<float, N> <= Array<int8_t, N>
-template <FloatRoundStyle Round, int N>
-struct NumericArrayConverter<float, int8_t, N, Round> {
-  using result_type = Array<float, N>;
-  using source_type = Array<int8_t, N>;
-  static FloatRoundStyle const round_style = Round;
-
-private:
-  using result_type_packed_4 = Array<float, 4>;
-  using result_type_packed_2 = Array<float, 2>;
-  using source_type_packed_4 = Array<int8_t, 4>;
-  using source_type_packed_2 = Array<int8_t, 2>;
-
-  using ScalarConverter = NumericConverter<float, int8_t, Round>;
-
-  CUTLASS_DEVICE
-  static uint32_t to_reg(source_type_packed_2 const& source) {
-    return static_cast<uint32_t>(
-      reinterpret_cast<const uint16_t&>(source));
-  }
-
-  CUTLASS_DEVICE
-  static uint32_t to_reg(source_type_packed_4 const& source) {
-    return reinterpret_cast<const uint32_t&>(source);
-  }
-
-  CUTLASS_DEVICE
-  static int32_t to_int32(source_type_packed_2 const& source) {
-    return static_cast<int32_t>(reinterpret_cast<const int16_t&>(source));
-  }
-
-  CUTLASS_DEVICE
-  static int32_t to_int32(source_type_packed_4 const& source) {
-    return reinterpret_cast<const int32_t&>(source);
-  }
-
-  template <typename PackedResultType, typename PackedSrcType>
-  CUTLASS_DEVICE
-  static PackedResultType packed_convert(PackedSrcType const &source) {
-
-    static_assert((platform::is_same<PackedSrcType, source_type_packed_2>::value &&
-                   platform::is_same<PackedResultType, result_type_packed_2>::value) ||
-                  (platform::is_same<PackedSrcType, source_type_packed_4>::value &&
-                   platform::is_same<PackedResultType, result_type_packed_4>::value),
-                  "Invalid PackedSrcType/PackedResultType must be 2 or 4 to use private convert dispatch.");
-
-    PackedResultType r;
-  #if defined __CUDA_ARCH__ && __CUDA_ARCH__ <= 800
-    // View the input as reg
-    uint32_t src_reg = to_reg(source);
-    static constexpr int fp32_base = 0x4B400000;
-    uint32_t const prmt_indices[4] = {0x8880, 0x9991, 0xAAA2, 0xBBB3};
-
-    int* result_as_int = reinterpret_cast<int*>(&r);
-    CUTLASS_PRAGMA_UNROLL
-    for (int ii = 0; ii < PackedResultType::kElements; ++ii) {
-      asm volatile("prmt.b32 %0,%1,%1,%2;\n" : "=r"(result_as_int[ii]) : "r"(src_reg), "r"(prmt_indices[ii]));
-    }
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int ii = 0; ii < PackedResultType::kElements; ++ii)
-    {
-      result_as_int[ii] += fp32_base;
-      r[ii] -= reinterpret_cast<const float&>(fp32_base);
-    }
-  #else
-    int32_t x = to_int32(source);
-    int32_t t[4];
-    constexpr int32_t mask[4] = {0x00000001, 0x00000100, 0x00010000, 0x01000000};
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int ii = 0; ii < PackedResultType::kElements; ++ii) {
-      t[ii] = __dp4a(x, mask[ii], 0);
-      r[ii] = static_cast<float>(t[ii]);
-    }
-  #endif
-
-    return r;
-  }
-
-  friend class detail::VectorizedConverter;
-
-public:
-  CUTLASS_DEVICE
-  static result_type convert(source_type const &source) {
-    result_type result;
-
-    using ConverterType = NumericArrayConverter<typename result_type::Element, typename source_type::Element, N, Round>;
-    detail::VectorizedConverter::convert<ConverterType,
-                                         result_type_packed_4, source_type_packed_4,
-                                         result_type_packed_2, source_type_packed_2>(result, source);
-
-    return result;
-  }
-
-  CUTLASS_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Partial specialization for Array<float, N> <= Array<uint8_t, N>
-template <FloatRoundStyle Round, int N>
-struct NumericArrayConverter<float, uint8_t, N, Round> {
-  using result_type = Array<float, N>;
-  using source_type = Array<uint8_t, N>;
-  static FloatRoundStyle const round_style = Round;
-
-private:
-  using result_type_packed_4 = Array<float, 4>;
-  using result_type_packed_2 = Array<float, 2>;
-  using source_type_packed_4 = Array<uint8_t, 4>;
-  using source_type_packed_2 = Array<uint8_t, 2>;
-
-  using ScalarConverter = NumericConverter<float, uint8_t, Round>;
-
-  CUTLASS_DEVICE
-  static uint32_t to_reg(source_type_packed_2 const& source) {
-    return static_cast<uint32_t>(
-      reinterpret_cast<const uint16_t&>(source));
-  }
-
-  CUTLASS_DEVICE
-  static uint32_t to_reg(source_type_packed_4 const& source) {
-    return reinterpret_cast<const uint32_t&>(source);
-  }
-
-  template <typename PackedResultType, typename PackedSrcType>
-  CUTLASS_DEVICE
-  static PackedResultType packed_convert(PackedSrcType const &source) {
-
-    static_assert((platform::is_same<PackedSrcType, source_type_packed_2>::value &&
-                   platform::is_same<PackedResultType, result_type_packed_2>::value) ||
-                  (platform::is_same<PackedSrcType, source_type_packed_4>::value &&
-                   platform::is_same<PackedResultType, result_type_packed_4>::value),
-                  "Invalid PackedSrcType/PackedResultType must be 2 or 4 to use private convert dispatch.");
-
-    PackedResultType r;
-    // View the input as reg
-    uint32_t src_reg = to_reg(source);
-
-    // __byte_perm simulates the add.u32 0x4B000000 to every u8 element of u8x4 source and stores
-    // the result in r (without introducing extra cvt.u32.u8 instruction)
-    uint32_t const prmt_indices[4] = {0x7650, 0x7651, 0x7652, 0x7653};
-    uint32_t* result_as_int = reinterpret_cast<uint32_t*>(&r);
-    for (int ii = 0; ii < PackedResultType::kElements; ++ii) {
-      result_as_int[ii] = __byte_perm(src_reg, 0x4B000000, prmt_indices[ii]);
-      // Subtract the magic number 0x4B000000 from tmp in floating-point arithmetic to obtain final result
-      r[ii] -= 8388608.f;
-    }
-
-    return r;
-  }
-
-  friend class detail::VectorizedConverter;
-
-public:
-  CUTLASS_DEVICE
-  static result_type convert(source_type const &source) {
-    result_type result;
-    using ConverterType = NumericArrayConverter<typename result_type::Element, typename source_type::Element, N, Round>;
-    detail::VectorizedConverter::convert<ConverterType,
-                                         result_type_packed_4, source_type_packed_4,
-                                         result_type_packed_2, source_type_packed_2>(result, source);
-
-    return result;
-  }
-
-  CUTLASS_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Partial specialization for Array<cutlass::half_t, N> <= Array<cutlass::int4b_t, N>
-template <FloatRoundStyle Round, int N>
-struct NumericArrayConverter<cutlass::half_t, cutlass::int4b_t, N, Round> {
-  using result_type = Array<cutlass::half_t, N>;
-  using source_type = Array<cutlass::int4b_t, N>;
-
-  static FloatRoundStyle const round_style = Round;
-
-private:
-  using result_type_packed_8 = Array<cutlass::half_t, 8>;
-  using result_type_packed_4 = Array<cutlass::half_t, 4>;
-  using result_type_packed_2 = Array<cutlass::half_t, 2>;
-  using source_type_packed_8 = Array<cutlass::int4b_t, 8>;
-  using source_type_packed_4 = Array<cutlass::int4b_t, 4>;
-  using source_type_packed_2 = Array<cutlass::int4b_t, 2>;
-
-  using ScalarConverter = NumericConverter<cutlass::half_t, cutlass::int4b_t, Round>;
-
-  CUTLASS_DEVICE
-  static uint32_t to_reg(source_type_packed_2 const& source) {
-    return static_cast<uint32_t>(
-      reinterpret_cast<const uint8_t&>(source));
-  }
-
-  CUTLASS_DEVICE
-  static uint32_t to_reg(source_type_packed_4 const& source) {
-    return static_cast<uint32_t>(
-      reinterpret_cast<const uint16_t&>(source));
-  }
-
-  CUTLASS_DEVICE
-  static uint32_t to_reg(source_type_packed_8 const& source) {
-    return reinterpret_cast<const uint32_t&>(source);
-  }
-
-  // The core converter uses bit tricks to construct a known FP16 number, then does a
-  // subtraction in FP16 for the final result.
-  template <typename PackedResultType, typename PackedSrcType>
-  CUTLASS_DEVICE
-  static PackedResultType packed_convert(PackedSrcType const &source) {
-
-    static_assert((platform::is_same<PackedSrcType, source_type_packed_2>::value &&
-                   platform::is_same<PackedResultType, result_type_packed_2>::value) ||
-                  (platform::is_same<PackedSrcType, source_type_packed_4>::value &&
-                   platform::is_same<PackedResultType, result_type_packed_4>::value) ||
-                  (platform::is_same<PackedSrcType, source_type_packed_8>::value &&
-                   platform::is_same<PackedResultType, result_type_packed_8>::value),
-                  "Invalid PackedSrcType/PackedResultType must be 2, 4 or 8 to use private convert dispatch.");
-
-    // Hold output FP16s in reg. We need 1 reg for every 2 elements
-    using RegArray = cutlass::AlignedArray<uint32_t, PackedResultType::kElements / 2, sizeof(PackedResultType)>;
-    RegArray r;
-
-    // View the input as reg
-    uint32_t src_reg = to_reg(source);
-
-    // Below constructs the following temporary:
-    // fp16s_01 = {0x00, i4_01, 0x00, i4_01}
-    // fp16s_23 = {0x00, i4_23, 0x00, i4_23}
-    // fp16s_45 = {0x00, i4_45, 0x00, i4_45}
-    // fp16s_67 = {0x00, i4_67, 0x00, i4_67}
-    // We use inline asm instead of __byte_perm intrinsic since we don't want the documented (& 0x7) on the index. NVCC
-    // might be able to optimize it out since the index is a constexpr, but we choose to be safe about it here.
-    uint32_t prmt_indices[4] = {0x4040, 0x4141, 0x4242, 0x4343};
-    static_assert(RegArray::kElements <= 4, "Too many inputs for F16 -> I4 vector converter");
-    CUTLASS_PRAGMA_UNROLL
-    for (int ii = 0; ii < RegArray::kElements; ++ii) {
-      asm volatile(
-          "{\n"
-          "  prmt.b32 %0, %1, %2, %3;\n"
-          "}\n"
-          : "=r"(r[ii])
-          : "r"(src_reg), "n"(0), "r"(prmt_indices[ii]));
-    }
-
-    // The below XOR does the following:
-    // 1) Sets the exponent bits of the FP16 to the correct value for the FP16 magic_num. We will be constructing
-    //    1024 + x + 8 OR 1024 + 16 * (x + 8), then using hfma to subtract 1032 from that
-    // 2) Adds 8 to the int4 value that we will process in the FP16 (for uint4, we can simply avoid this step)
-    // The AND does the following:
-    // 1) Clear the set bits for the int4 we will ignore.
-    // We use lop3 so that we can use 1 instruction for AND and XOR.
-    static constexpr uint32_t xor_mask = 0x64806408;
-    static constexpr uint32_t and_mask = 0xFFF0FF0F;
-    static constexpr uint32_t immLut = (0xf0 & 0xcc) ^ 0xaa;
-
-    // For each operand, computes:
-    // r[i] = (r[i] & and_mask) ^ xor_mask
-    CUTLASS_PRAGMA_UNROLL
-    for (int ii = 0; ii < RegArray::kElements; ++ii) {
-      asm volatile(
-          "{\n"
-          "  lop3.b32 %0, %0, %1, %2, %3;\n"
-          "}\n"
-          : "+r"(r[ii])
-          : "n"(and_mask), "n"(xor_mask), "n"(immLut));
-    }
-
-    // We will issue 2 hfmas that do the following:
-    // For the high FP16:
-    //  Divide by 16 {packed as a operand} to get:
-    //    64 + (x + 8)
-    //    x + 72
-    //  Subtract 72 {packed as c operand} to get x
-    // For the low FP16:
-    //    1024 + (x + 8)
-    //    x + 1032
-    // So, we subtract 1032 {packed as c operand} to get x
-
-    // {-72, -1032}
-    static constexpr uint32_t hfma_bias_rep = 0xD480E408;
-    // {1 / 16, 1}
-    static constexpr uint32_t hfma_scale_rep = 0x2C003C00;
-
-    const half2& hfma_bias = reinterpret_cast<const half2&>(hfma_bias_rep);
-    const half2& hfma_scale = reinterpret_cast<const half2&>(hfma_scale_rep);
-    // Scale and subtract the FP16s to get the original int4 number as FP16.
-    CUTLASS_PRAGMA_UNROLL
-    for (int ii = 0; ii < RegArray::kElements; ++ii) {
-      half2& fp16x2_val = reinterpret_cast<__half2&>(r[ii]);
-      fp16x2_val = __hfma2(hfma_scale, fp16x2_val, hfma_bias);
-    }
-    return reinterpret_cast<PackedResultType&>(r);
-  }
-
-  friend class detail::VectorizedConverter;
-
-public:
-  CUTLASS_DEVICE
-  static result_type convert(source_type const &source) {
-    result_type result;
-    using ConverterType = NumericArrayConverter<typename result_type::Element, typename source_type::Element, N, Round>;
-    detail::VectorizedConverter::convert<ConverterType,
-                                         result_type_packed_8, source_type_packed_8,
-                                         result_type_packed_4, source_type_packed_4,
-                                         result_type_packed_2, source_type_packed_2>(result, source);
-
-    return result;
-  }
-
-  CUTLASS_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Partial specialization for Array<cutlass::half_t, N> <= Array<int8_t, N>
-template <FloatRoundStyle Round, int N>
-struct NumericArrayConverter<cutlass::half_t, int8_t, N, Round> {
-  using result_type = Array<cutlass::half_t, N>;
-  using source_type = Array<int8_t, N>;
-  static FloatRoundStyle const round_style = Round;
-
-private:
-  using result_type_packed_4 = Array<cutlass::half_t, 4>;
-  using result_type_packed_2 = Array<cutlass::half_t, 2>;
-  using source_type_packed_4 = Array<int8_t, 4>;
-  using source_type_packed_2 = Array<int8_t, 2>;
-
-  using ScalarConverter = NumericConverter<cutlass::half_t, int8_t, Round>;
-
-  CUTLASS_DEVICE
-  static uint32_t to_reg(source_type_packed_2 const& source) {
-    return static_cast<uint32_t>(
-      reinterpret_cast<const uint16_t&>(source));
-  }
-
-  CUTLASS_DEVICE
-  static uint32_t to_reg(source_type_packed_4 const& source) {
-    return reinterpret_cast<const uint32_t&>(source);
-  }
-
-  // The core converter uses bit tricks to construct a known FP16 number, then does a
-  // subtraction in FP16 for the final result.
-  template <typename PackedResultType, typename PackedSrcType>
-  CUTLASS_DEVICE
-  static PackedResultType packed_convert(PackedSrcType const &source) {
-
-    static_assert((platform::is_same<PackedSrcType, source_type_packed_2>::value &&
-                   platform::is_same<PackedResultType, result_type_packed_2>::value) ||
-                  (platform::is_same<PackedSrcType, source_type_packed_4>::value &&
-                   platform::is_same<PackedResultType, result_type_packed_4>::value),
-                  "Invalid PackedSrcType/PackedResultType must be 2 or 4 to use private convert dispatch.");
-
-    // Hold output FP16s in reg. We need 1 reg for every 2 elements
-    using RegArray = cutlass::AlignedArray<uint32_t, PackedResultType::kElements / 2, sizeof(PackedResultType)>;
-    RegArray r;
-
-    #if 0 // Scalar conversion (Please keep this code for reference for vectorized version below)
-    auto result = reinterpret_cast<PackedResultType&>(r);
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < PackedResultType::kElements; ++i) {
-      int16_t tmp = source[i] + 26112 /* 0x6600 */;
-      result[i] = reinterpret_cast<cutlass::half_t const &>(tmp) - 1536.0_hf;
-    }
-    #endif
-
-    // View the input as reg
-    uint32_t src_reg = to_reg(source);
-    uint32_t const prmt_indices[2] = {0x9180, 0xB3A2};
-
-    // Pack s8x2 (s8[1], s8[0]) -> s16x2 (sext.s8[1], sext.s8[0])
-    // (See https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt)
-    // The inline ptx below uses `msb=0` and `msb=1` from the above link to sign-extend the sign bit in 0, 1, 2, 3 bytes of s8x4
-    // into result_ptr[0] and result_ptr[1]'s 08-15 and 24-31 bits, respectively.
-    // Note that `__byte_perm(source_ptr[0], source_ptr[0], 0x9180);` won't achieve the same result and doesn't sign-extend the sign bit.
-    // Thus, we use inline ptx `prmt.b32` instruction for the desired sign extend from s8x2 to s16x2.
-    for (int ii = 0; ii < RegArray::kElements; ++ii) {
-      asm volatile("prmt.b32 %0,%1,%1,%2;\n" : "=r"(r[ii]) : "r"(src_reg), "r"(prmt_indices[ii]));
-    }
-
-    // In the absense of add.s16x2 instruction, use bit-wise operation to execute signed addition with magic numbers to achieve
-    // the same result as add.s16x2 instruction.
-    // (See https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-lop3)
-    // For a logical operation F(a, b, c) the value of kImmLut can be computed by applying the same operation to
-    // three predefined constant values as follows:
-    //                                        ta = 0xF0;
-    //                                        tb = 0xCC;
-    //                                        tc = 0xAA;
-    //                                   kImmLut = F(ta, tb, tc);
-    // If we want F = ((a & b) ^ c) then set kImmLut = (0xF0 & 0xCC) ^ 0xAA
-    static constexpr uint32_t kImmLut = (0xF0 & 0xCC) ^ 0xAA;
-
-    for (int ii = 0; ii < RegArray::kElements; ++ii) {
-      // The bit-wise operation executed below is `r[ii] = (r[ii] & 0x03FF03FF) ^ 0x66006600;`
-      asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" :
-                                "=r"(r[ii]) : "r"(r[ii]), "n"(0x03FF03FF), "n"(0x66006600), "n"(kImmLut));
-    }
-
-    static constexpr uint32_t bias_rep = 0x66006600;
-    const half2& bias = reinterpret_cast<const half2&>(bias_rep);
-    CUTLASS_PRAGMA_UNROLL
-    for (int ii = 0; ii < RegArray::kElements; ++ii) {
-      half2& fp16x2_val = reinterpret_cast<__half2&>(r[ii]);
-      fp16x2_val = __hsub2(fp16x2_val, bias);
-    }
-    return reinterpret_cast<PackedResultType&>(r);
-  }
-
-  friend class detail::VectorizedConverter;
-
-public:
-  CUTLASS_DEVICE
-  static result_type convert(source_type const &source) {
-    result_type result;
-
-    using ConverterType = NumericArrayConverter<typename result_type::Element, typename source_type::Element, N, Round>;
-    detail::VectorizedConverter::convert<ConverterType,
-                                         result_type_packed_4, source_type_packed_4,
-                                         result_type_packed_2, source_type_packed_2>(result, source);
-    return result;
-  }
-
-  CUTLASS_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Partial specialization for Array<cutlass::half_t, N> <= Array<uint8_t, N>
-template <FloatRoundStyle Round, int N>
-struct NumericArrayConverter<cutlass::half_t, uint8_t, N, Round> {
-  using result_type = Array<cutlass::half_t, N>;
-  using source_type = Array<uint8_t, N>;
-  static FloatRoundStyle const round_style = Round;
-
-private:
-  using result_type_packed_4 = Array<cutlass::half_t, 4>;
-  using result_type_packed_2 = Array<cutlass::half_t, 2>;
-  using source_type_packed_4 = Array<uint8_t, 4>;
-  using source_type_packed_2 = Array<uint8_t, 2>;
-
-  using ScalarConverter = NumericConverter<cutlass::half_t, uint8_t, Round>;
-
-  CUTLASS_DEVICE
-  static uint32_t to_reg(source_type_packed_2 const& source) {
-    return static_cast<uint32_t>(
-      reinterpret_cast<const uint16_t&>(source));
-  }
-
-  CUTLASS_DEVICE
-  static uint32_t to_reg(source_type_packed_4 const& source) {
-    return reinterpret_cast<const uint32_t&>(source);
-  }
-
-  template <typename PackedResultType, typename PackedSrcType>
-  CUTLASS_DEVICE
-  static PackedResultType packed_convert(PackedSrcType const &source) {
-
-    static_assert((platform::is_same<PackedSrcType, source_type_packed_2>::value &&
-                   platform::is_same<PackedResultType, result_type_packed_2>::value) ||
-                  (platform::is_same<PackedSrcType, source_type_packed_4>::value &&
-                   platform::is_same<PackedResultType, result_type_packed_4>::value),
-                  "Invalid PackedSrcType/PackedResultType must be 2 or 4 to use private convert dispatch.");
-
-    // Hold output FP16s in reg. We need 1 reg for every 2 elements
-    using RegArray = cutlass::AlignedArray<uint32_t, PackedResultType::kElements / 2, sizeof(PackedResultType)>;
-    RegArray r;
-
-    // View the input as reg
-    uint32_t src_reg = to_reg(source);
-    uint32_t const prmt_indices[2] = {0x5150, 0x5352};
-    static constexpr uint32_t start_byte_for_fp16 = 0x64646464;
-
-    for (int ii = 0; ii < RegArray::kElements; ++ii) {
-      asm volatile("prmt.b32 %0,%1,%2,%3;\n" : "=r"(r[ii]) : "r"(src_reg), "n"(start_byte_for_fp16), "r"(prmt_indices[ii]));
-    }
-
-    static constexpr uint32_t bias_rep = 0x64006400;
-    const half2& bias = reinterpret_cast<const half2&>(bias_rep);
-    CUTLASS_PRAGMA_UNROLL
-    for (int ii = 0; ii < RegArray::kElements; ++ii) {
-      half2& fp16x2_val = reinterpret_cast<__half2&>(r[ii]);
-      fp16x2_val = __hsub2(fp16x2_val, bias);
-    }
-
-    return reinterpret_cast<PackedResultType&>(r);
-  }
-
-  friend class detail::VectorizedConverter;
-
-public:
-  CUTLASS_DEVICE
-  static result_type convert(source_type const &source) {
-    result_type result;
-
-    using ConverterType = NumericArrayConverter<typename result_type::Element, typename source_type::Element, N, Round>;
-    detail::VectorizedConverter::convert<ConverterType,
-                                         result_type_packed_4, source_type_packed_4,
-                                         result_type_packed_2, source_type_packed_2>(result, source);
-
-    return result;
-  }
-
-  CUTLASS_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Partial specialization for Array<cutlass::bfloat16_t, N> <= Array<cutlass::int4b_t, N>
-template <FloatRoundStyle Round, int N>
-struct NumericArrayConverter<cutlass::bfloat16_t, cutlass::int4b_t, N, Round> {
-  using result_type = Array<cutlass::bfloat16_t, N>;
-  using source_type = Array<cutlass::int4b_t, N>;
-
-  static FloatRoundStyle const round_style = Round;
-
-private:
-  using result_type_packed_8 = Array<cutlass::bfloat16_t, 8>;
-  using result_type_packed_4 = Array<cutlass::bfloat16_t, 4>;
-  using result_type_packed_2 = Array<cutlass::bfloat16_t, 2>;
-  using source_type_packed_8 = Array<cutlass::int4b_t, 8>;
-  using source_type_packed_4 = Array<cutlass::int4b_t, 4>;
-  using source_type_packed_2 = Array<cutlass::int4b_t, 2>;
-
-  using ScalarConverter = NumericConverter<cutlass::bfloat16_t, cutlass::int4b_t, Round>;
-
-  CUTLASS_DEVICE
-  static uint32_t to_reg(source_type_packed_2 const& source) {
-    return static_cast<uint32_t>(
-      reinterpret_cast<const uint8_t&>(source));
-  }
-
-  CUTLASS_DEVICE
-  static uint32_t to_reg(source_type_packed_4 const& source) {
-    return static_cast<uint32_t>(
-      reinterpret_cast<const uint16_t&>(source));
-  }
-
-  CUTLASS_DEVICE
-  static uint32_t to_reg(source_type_packed_8 const& source) {
-    return reinterpret_cast<const uint32_t&>(source);
-  }
-
-  // The core converter uses bit tricks to construct a known FP16 number, then does a
-  // subtraction in FP16 for the final result.
-  template <typename PackedResultType, typename PackedSrcType>
-  CUTLASS_DEVICE
-  static PackedResultType packed_convert(PackedSrcType const &source) {
-
-    static_assert((platform::is_same<PackedSrcType, source_type_packed_2>::value &&
-                   platform::is_same<PackedResultType, result_type_packed_2>::value) ||
-                  (platform::is_same<PackedSrcType, source_type_packed_4>::value &&
-                   platform::is_same<PackedResultType, result_type_packed_4>::value) ||
-                  (platform::is_same<PackedSrcType, source_type_packed_8>::value &&
-                   platform::is_same<PackedResultType, result_type_packed_8>::value),
-                  "Invalid PackedSrcType/PackedResultType must be 2, 4 or 8 to use private convert dispatch.");
-
-    // Hold output FP16s in reg. We need 1 reg for every 2 elements
-    using RegArray = cutlass::AlignedArray<uint32_t, PackedResultType::kElements / 2, sizeof(PackedResultType)>;
-    RegArray r;
-
-    // View the input as reg
-    uint32_t src_reg = to_reg(source);
-    uint32_t src_reg_shifted = src_reg >> 4;
-
-    // Below constructs the following temporary:
-    uint32_t const prmt_indices[4] = {0xF4F0, 0xF5F1, 0xF6F2, 0xF7F3};
-    static_assert(RegArray::kElements <= 4, "Too many inputs for BF16 -> I4 vector converter");
-    CUTLASS_PRAGMA_UNROLL
-    for (int ii = 0; ii < RegArray::kElements; ++ii) {
-      asm volatile(
-          "{\n"
-          "  prmt.b32 %0, %1, %2, %3;\n"
-          "}\n"
-          : "=r"(r[ii])
-          : "r"(src_reg), "r"(src_reg_shifted), "r"(prmt_indices[ii]));
-    }
-
-    // The below XOR does the following:
-    // 1) Sets the exponent bits of the FP16 to the correct value for the FP16 magic_num. We will be constructing
-    //    128 + (x + 8) and subtracting 136 to get x
-    static constexpr uint32_t xor_mask = 0x43084308;
-    static constexpr uint32_t and_mask = 0x000F000F;
-    static constexpr uint32_t immLut = (0xf0 & 0xcc) ^ 0xaa;
-
-    // For each operand, computes:
-    // r[i] = (r[i] & and_mask) ^ xor_mask
-    CUTLASS_PRAGMA_UNROLL
-    for (int ii = 0; ii < RegArray::kElements; ++ii) {
-      asm volatile(
-          "{\n"
-          "  lop3.b32 %0, %0, %1, %2, %3;\n"
-          "}\n"
-          : "+r"(r[ii])
-          : "n"(and_mask), "n"(xor_mask), "n"(immLut));
-    }
-
-    // We will issue 2 bfmas that do the following:
-    // high BF16:
-    // hi_bf16 - 136, lo_bf16 - 136
-
-    // This is the BF16 {136, 136} represented as an integer.
-    static constexpr uint32_t bias_rep = 0x43084308;
-    const __nv_bfloat162& bias = reinterpret_cast<const __nv_bfloat162&>(bias_rep);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int ii = 0; ii < RegArray::kElements; ++ii) {
-      __nv_bfloat162& bf16x2_val = reinterpret_cast<__nv_bfloat162&>(r[ii]);
-      bf16x2_val = __hsub2(bf16x2_val, bias);
-    }
-
-    return reinterpret_cast<PackedResultType&>(r);
-  }
-
-  friend class detail::VectorizedConverter;
-
-public:
-  CUTLASS_DEVICE
-  static result_type convert(source_type const &source) {
-    result_type result;
-    using ConverterType = NumericArrayConverter<typename result_type::Element, typename source_type::Element, N, Round>;
-    detail::VectorizedConverter::convert<ConverterType,
-                                         result_type_packed_8, source_type_packed_8,
-                                         result_type_packed_4, source_type_packed_4,
-                                         result_type_packed_2, source_type_packed_2>(result, source);
-
-    return result;
-  }
-
-  CUTLASS_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Partial specialization for Array<cutlass::bfloat16_t, N> <= Array<int8_t, N>
-template <FloatRoundStyle Round, int N>
-struct NumericArrayConverter<cutlass::bfloat16_t, int8_t, N, Round> {
-  using result_type = Array<cutlass::bfloat16_t, N>;
-  using source_type = Array<int8_t, N>;
-  static FloatRoundStyle const round_style = Round;
-
-private:
-  using result_type_packed_4 = Array<cutlass::bfloat16_t, 4>;
-  using result_type_packed_2 = Array<cutlass::bfloat16_t, 2>;
-  using source_type_packed_4 = Array<int8_t, 4>;
-  using source_type_packed_2 = Array<int8_t, 2>;
-
-  using ScalarConverter = NumericConverter<cutlass::bfloat16_t, int8_t, Round>;
-
-  CUTLASS_DEVICE
-  static uint32_t to_reg(source_type_packed_2 const& source) {
-    return static_cast<uint32_t>(
-      reinterpret_cast<const uint16_t&>(source));
-  }
-
-  CUTLASS_DEVICE
-  static uint32_t to_reg(source_type_packed_4 const& source) {
-    return reinterpret_cast<const uint32_t&>(source);
-  }
-
-  template <typename PackedResultType, typename PackedSrcType>
-  CUTLASS_DEVICE
-  static PackedResultType packed_convert(PackedSrcType const &source) {
-
-    static_assert((platform::is_same<PackedSrcType, source_type_packed_2>::value &&
-                   platform::is_same<PackedResultType, result_type_packed_2>::value) ||
-                  (platform::is_same<PackedSrcType, source_type_packed_4>::value &&
-                   platform::is_same<PackedResultType, result_type_packed_4>::value),
-                  "Invalid PackedSrcType/PackedResultType must be 2 or 4 to use private convert dispatch.");
-
-    NumericArrayConverter<float, int8_t, PackedResultType::kElements, Round> convert_int8_to_f32;
-    Array<float, PackedResultType::kElements> tmp = convert_int8_to_f32(source);
-    NumericArrayConverter<cutlass::bfloat16_t, float, PackedResultType::kElements, Round> convert_f32_to_bf16;
-    return convert_f32_to_bf16(tmp);
-  }
-
-  friend class detail::VectorizedConverter;
-
-public:
-  CUTLASS_DEVICE
-  static result_type convert(source_type const &source) {
-    result_type result;
-
-    using ConverterType = NumericArrayConverter<typename result_type::Element, typename source_type::Element, N, Round>;
-    detail::VectorizedConverter::convert<ConverterType,
-                                         result_type_packed_4, source_type_packed_4,
-                                         result_type_packed_2, source_type_packed_2>(result, source);
-
-    return result;
-  }
-
-  CUTLASS_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-/// Partial specialization for Array<cutlass::bfloat16_t, N> <= Array<uint8_t, N>
-template <FloatRoundStyle Round, int N>
-struct NumericArrayConverter<cutlass::bfloat16_t, uint8_t, N, Round> {
-  using result_type = Array<cutlass::bfloat16_t, N>;
-  using source_type = Array<uint8_t, N>;
-  static FloatRoundStyle const round_style = Round;
-
-private:
-  using result_type_packed_4 = Array<cutlass::bfloat16_t, 4>;
-  using result_type_packed_2 = Array<cutlass::bfloat16_t, 2>;
-  using source_type_packed_4 = Array<uint8_t, 4>;
-  using source_type_packed_2 = Array<uint8_t, 2>;
-
-  using ScalarConverter = NumericConverter<cutlass::bfloat16_t, uint8_t, Round>;
-
-  CUTLASS_DEVICE
-  static uint32_t to_reg(source_type_packed_2 const& source) {
-    return static_cast<uint32_t>(
-      reinterpret_cast<const uint16_t&>(source));
-  }
-
-  CUTLASS_DEVICE
-  static uint32_t to_reg(source_type_packed_4 const& source) {
-    return reinterpret_cast<const uint32_t&>(source);
-  }
-
-  template <typename PackedResultType, typename PackedSrcType>
-  CUTLASS_DEVICE
-  static PackedResultType packed_convert(PackedSrcType const &source) {
-
-    static_assert((platform::is_same<PackedSrcType, source_type_packed_2>::value &&
-                   platform::is_same<PackedResultType, result_type_packed_2>::value) ||
-                  (platform::is_same<PackedSrcType, source_type_packed_4>::value &&
-                   platform::is_same<PackedResultType, result_type_packed_4>::value),
-                  "Invalid PackedSrcType/PackedResultType must be 2 or 4 to use private convert dispatch.");
-
-    NumericArrayConverter<float, uint8_t, PackedResultType::kElements, Round> convert_uint8_to_f32;
-    Array<float, PackedResultType::kElements> tmp = convert_uint8_to_f32(source);
-    NumericArrayConverter<cutlass::bfloat16_t, float, PackedResultType::kElements, Round> convert_f32_to_bf16_;
-    return convert_f32_to_bf16_(tmp);
-  }
-
-  friend class detail::VectorizedConverter;
-
-public:
-  CUTLASS_DEVICE
-  static result_type convert(source_type const &source) {
-    result_type result;
-    using ConverterType = NumericArrayConverter<typename result_type::Element, typename source_type::Element, N, Round>;
-    detail::VectorizedConverter::convert<ConverterType,
-                                         result_type_packed_4, source_type_packed_4,
-                                         result_type_packed_2, source_type_packed_2>(result, source);
-
-    return result;
-  }
-
-  CUTLASS_DEVICE
-  result_type operator()(source_type const &s) const {
-    return convert(s);
-  }
-};
-
-#endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// FastNumericArrayConverter only works when the source is within center range.
-/// Conversion operator for Array.  See the comments before
-/// FastLinearCombinationClamp.
-template <typename T, typename S, int N,
-          FloatRoundStyle Round = FloatRoundStyle::round_to_nearest,
-          typename Enable = void>
-struct FastNumericArrayConverter {
-  using result_type = Array<T, N>;
-  using source_type = Array<S, N>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const &s) {
-    NumericArrayConverter<T, S, N, Round> convert_;
-
-    return convert_(s);
-  }
-
-  CUTLASS_DEVICE
-  result_type operator()(source_type const &s) const { return convert(s); }
-};
-
-/// Partial specialization for Array<float> <= Array<int>
-template <int N, FloatRoundStyle Round>
-struct FastNumericArrayConverter<float, int, N, Round> {
-  using result_type = Array<float, N>;
-  using source_type = Array<int, N>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const &source) {
-    result_type result;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      int tmp = source[i] + 1262485504 /*0x4B400000*/;
-      result[i] = reinterpret_cast<float const &>(tmp) - 12582912.0f;
-    }
-
-    return result;
-  }
-
-  CUTLASS_DEVICE
-  result_type operator()(source_type const &s) const { return convert(s); }
-};
-
-/// Partial specialization for Array<int8_t, 4> <= Array<float, 4>
-template <FloatRoundStyle Round>
-struct FastNumericArrayConverter<int8_t, float, 4, Round> {
-  using result_type = Array<int8_t, 4>;
-  using source_type = Array<float, 4>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const &source) {
-    Array<int32_t, 4> result;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 4; ++i) {
-      float tmp = source[i] + 12582912.0f;
-      result[i] = reinterpret_cast<int32_t const &>(tmp);
-    }
-
-    result[0] = __byte_perm(result[0], result[1], 0x40);
-    result[2] = __byte_perm(result[2], result[3], 0x40);
-    result[0] = __byte_perm(result[0], result[2], 0x5410);
-
-    return reinterpret_cast<result_type const &>(result[0]);
-  }
-
-  CUTLASS_DEVICE
-  result_type operator()(source_type const &s) const { return convert(s); }
-};
-
-/// Partial specialization for Array<int8_t> <= Array<float>
-template <int N, FloatRoundStyle Round>
-struct FastNumericArrayConverter<int8_t, float, N, Round> {
-  static_assert(!(N % 4), "N must be multiple of 4.");
-
-  using result_type = Array<int8_t, N>;
-  using source_type = Array<float, N>;
-  static FloatRoundStyle const round_style = Round;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const &source) {
-    FastNumericArrayConverter<int8_t, float, 4, Round> convert_vector_;
-
-    result_type result;
-
-    Array<int8_t, 4> *result_ptr =
-        reinterpret_cast<Array<int8_t, 4> *>(&result);
-    Array<float, 4> const *source_ptr =
-        reinterpret_cast<Array<float, 4> const *>(&source);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N / 4; ++i) {
-      result_ptr[i] = convert_vector_(source_ptr[i]);
-    }
-
-    return result;
-  }
-
-  CUTLASS_DEVICE
-  result_type operator()(source_type const &s) const { return convert(s); }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines preferred rounding mode for a pair of types
-template <typename T, typename S>
-struct PreferredRoundingMode {
-  static FloatRoundStyle const kRound = FloatRoundStyle::round_to_nearest;
-};
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 900
-/// Defines preferred rounding mode for a pair of types
-template <>
-struct PreferredRoundingMode<cutlass::tfloat32_t, float> {
-  static FloatRoundStyle const kRound = FloatRoundStyle::round_half_ulp_truncate;
-};
-#endif
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Packs predicates into an array.
-template <int N>
-struct PackPredicates {
-  using result_type = Array<uint1b_t, N>;
-
-  static_assert(!(N % 4), "Must pack predicates in a count that is a multiple of 4");
-
-  CUTLASS_HOST_DEVICE
-  result_type operator()(bool const predicates[]) {
-
-    result_type packed;
-    packed.clear();
-
-    int const kWordSize = 8;
-    uint8_t *bytes = reinterpret_cast<uint8_t *>(packed.data());
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      int word_idx = (i / kWordSize);
-      int bit_idx = (i % kWordSize);
-
-      uint8_t mask = static_cast<uint8_t>((predicates[i] ? 1u : 0u) << bit_idx);
-      bytes[word_idx] = (bytes[word_idx] | mask);
-    }
-    return packed;
-  }
-};
-
-/// Packs predicates into an array
-template <int N>
-struct UnpackPredicates {
-  using result_type = Array<uint1b_t, N>;
-
-  static_assert(!(N % 4), "Must unpack predicates in a count that is a multiple of 4");
-
-  CUTLASS_HOST_DEVICE
-  void operator()(bool predicates[], result_type const &packed) {
-
-    int const kWordSize = 8;
-    uint8_t const *bytes = reinterpret_cast<uint8_t const *>(packed.data());
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      int word_idx = (i / kWordSize);
-      int bit_idx = (i % kWordSize);
-
-      predicates[i] = bool((bytes[word_idx] >> bit_idx) & 0x1);
-    }
-
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/numeric_size.h b/lightllm-kernel/cutlass/include/cutlass/numeric_size.h
deleted file mode 100755
index 4ff83bab8..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/numeric_size.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*!
-    \file
-    \brief Top-level include for all CUTLASS numeric types.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines the size of an element in bits
-template <typename T>
-struct sizeof_bits {
-  static constexpr int value = int(sizeof(T) * 8);
-};
-
-template <typename T>
-struct sizeof_bits<T const>: sizeof_bits<T> {};
-
-template <>
-struct sizeof_bits<void> {
-  static constexpr int value = 0;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Returns the number of bytes required to hold a specified number of bits
-template <class R = int, class T>
-CUTLASS_HOST_DEVICE
-constexpr
-R
-bits_to_bytes(T bits) {
-  return (R(bits) + R(7)) / R(8);
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <class T>
-struct is_subbyte {
-  static constexpr bool value = sizeof_bits<T>::value < 8;
-};
-
-template <class T>
-struct is_subbyte<T const> : is_subbyte<T> {};
-
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/numeric_types.h b/lightllm-kernel/cutlass/include/cutlass/numeric_types.h
deleted file mode 100755
index 5519fbe7c..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/numeric_types.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! 
-    \file
-    \brief Top-level include for all CUTLASS numeric types.
-*/
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/platform/platform.h"
-#include "cutlass/numeric_size.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <size_t... Seq>
-struct index_sequence;
-
-template <size_t N, size_t... Next>
-struct index_sequence_helper : index_sequence_helper<N - 1, N - 1, Next...> {};
-
-template <size_t... Next>
-struct index_sequence_helper<0, 0, Next...> {
-  using type = index_sequence<0, Next...>;
-};
-
-template <size_t N>
-using make_index_sequence = typename index_sequence_helper<N>::type;
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Get the register type used in kernel
-//
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace detail {
-
-template<typename T>
-struct get_unpacked_element_type {
-  using type = T;
-};
-
-} // namespace detail
-
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-#include "cutlass/integer_subbyte.h"
-#include "cutlass/half.h"
-#include "cutlass/bfloat16.h"
-#include "cutlass/tfloat32.h"
-#include "cutlass/float8.h"
-#include "cutlass/uint128.h"
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/pipeline/pipeline.hpp b/lightllm-kernel/cutlass/include/cutlass/pipeline/pipeline.hpp
deleted file mode 100755
index 0b5617976..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/pipeline/pipeline.hpp
+++ /dev/null
@@ -1,36 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-#include "cutlass/pipeline/sm90_pipeline.hpp"
-////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/pipeline/sm90_pipeline.hpp b/lightllm-kernel/cutlass/include/cutlass/pipeline/sm90_pipeline.hpp
deleted file mode 100755
index 96bb8db74..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/pipeline/sm90_pipeline.hpp
+++ /dev/null
@@ -1,1173 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cute/layout.hpp"
-#include "cute/layout_composed.hpp"  // cute::composition
-#include "cute/swizzle.hpp"             // cute::Swizzle
-#include "cute/swizzle_layout.hpp"      // cute::composition
-#include "cute/util/type_traits.hpp"
-#include "cute/arch/cluster_sm90.hpp"
-#include "cute/container/array.hpp"
-#include "cute/numeric/integral_constant.hpp"
-
-#include "cutlass/cutlass.h"
-#include "cutlass/arch/barrier.h"
-#include "cutlass/detail/dependent_false.hpp"
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-using namespace cute;
-
-enum class BarrierStatus : uint32_t {
-  WaitAgain = 0u,
-  WaitDone  = 1u,
-};
-
-class ArrivalToken {
-public:
-  CUTLASS_HOST_DEVICE
-  ArrivalToken(BarrierStatus barrier_status) : barrier_status_(barrier_status) {}
-
-  CUTLASS_HOST_DEVICE
-  ArrivalToken() = delete;
-
-  CUTLASS_HOST_DEVICE
-  BarrierStatus get() const {
-    return barrier_status_;
-  }
-
-  CUTLASS_HOST_DEVICE
-  bool operator==(ArrivalToken const& other) const {
-    return barrier_status_ == other.get();
-  }
-
-private:
-  BarrierStatus barrier_status_;
-
-  CUTLASS_HOST_DEVICE
-  friend bool operator==(const ArrivalToken& left, const BarrierStatus& right) {
-    return left.get() == right;
-  }
-
-  CUTLASS_HOST_DEVICE
-  friend bool operator==(const BarrierStatus& left, const ArrivalToken& right) {
-    return left == right.get();
-  }
-
-  CUTLASS_HOST_DEVICE
-  friend bool operator!=(const ArrivalToken& left, const BarrierStatus& right) {
-    return left.get() != right;
-  }
-
-  CUTLASS_HOST_DEVICE
-  friend bool operator!=(const BarrierStatus& left, const ArrivalToken& right) {
-    return left != right.get();
-  }
-};
-
-class ProducerToken : public ArrivalToken {
-  using ArrivalToken::ArrivalToken;
-};
-
-class ConsumerToken : public ArrivalToken {
-  using ArrivalToken::ArrivalToken;
-};
-
-// Circular Buffer Index + Associated Phase
-// Assumes only one operation possible - i.e., ++
-template<uint32_t Stages_>
-struct PipelineState {
-
-  static constexpr uint32_t Stages = Stages_;
-
-  int index_ = 0;
-  uint32_t phase_ = 0;
-  uint32_t count_ = 0;
-
-  CUTLASS_DEVICE
-  PipelineState(): index_{}, phase_{}, count_{} {}
-
-  CUTLASS_DEVICE
-  PipelineState(int index, uint32_t phase, uint32_t count)
-    : index_(index)
-    , phase_(phase)
-    , count_(count) {}
-
-  CUTLASS_DEVICE
-  int index() const {
-    return index_;
-  }
-
-  CUTLASS_DEVICE
-  uint32_t phase() const {
-    return phase_;
-  }
-
-  CUTLASS_DEVICE
-  uint32_t count() const {
-    return count_;
-  }
-
-  CUTLASS_DEVICE
-  void operator++() {
-    if constexpr (Stages > 0) {
-      ++index_;
-      ++count_;
-      if (index_ == Stages) {
-        index_ = 0;
-        phase_ ^= 1;
-      }
-    }
-  }
-
-  CUTLASS_DEVICE
-  PipelineState& operator+=(uint32_t num_iterations) {
-    return advance(num_iterations);
-  }
-
-  CUTLASS_DEVICE
-  PipelineState& operator=(PipelineState const& other) {
-    index_ = other.index();
-    phase_ = other.phase();
-    count_ = other.count();
-    return *this;
-  }
-
-  CUTLASS_DEVICE
-  PipelineState& advance(uint32_t num_iterations) {
-    if constexpr (Stages > 0) {
-      // Number of iterations cross over the stage boundary => flipped phase
-      if ((num_iterations < Stages) && (index_ + num_iterations) >= Stages ) {
-        phase_ ^= 1;
-      }
-      // How many times number of iterations cross over the stage boundary and
-      // end up on a odd number => flipped phase
-      if ((num_iterations >= Stages) && (((index_ + num_iterations) / Stages) % 2) == 1) {
-        phase_ ^= 1;
-      }
-      index_ = (index_ + num_iterations) % Stages;
-      count_ += num_iterations;
-    }
-    return *this;
-  }
-
-  CUTLASS_DEVICE
-  static PipelineState make_pipeline_state(PipelineState start_state, uint32_t num_iterations) {
-    return start_state.advance(num_iterations);
-  }
-};
-
-template<class Pipeline>
-CUTLASS_DEVICE
-PipelineState<Pipeline::Stages> make_producer_start_state() {
-  // Producer starts with an opposite phase as the buffers are initially empty
-  constexpr int InitialProducerStage = 0;
-  constexpr uint32_t InitialProducerPhase = 1;
-  constexpr uint32_t InitialProducerCount = 0;
-  return {InitialProducerStage, InitialProducerPhase, InitialProducerCount};
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// TMA load (producer) Async Pipeline class
-//
-///////////////////////////////////////////////////////////////////////////////////////////////////
-// Assumptions : Constructor is visible Cluster-wide (as it needs a Cluster-Sync)
-// We have exactly one thread elected in the Producer as the "leader"
-// Currently, it is optional to elect a leader for the Consumers
-template <int Stages_>
-class PipelineTmaAsync {
-public :
-  using FullBarrier = cutlass::arch::ClusterTransactionBarrier;
-  using EmptyBarrier = cutlass::arch::ClusterBarrier;
-  using ProducerBarrierType = FullBarrier::ValueType;
-  using ConsumerBarrierType = EmptyBarrier::ValueType;
-  static constexpr uint32_t Stages = Stages_;
-  using PipelineState = cutlass::PipelineState<Stages>;
-
-  struct SharedStorage {
-    FullBarrier full_barrier_[Stages];
-    EmptyBarrier empty_barrier_[Stages];
-  };
-
-  enum class ThreadCategory {
-    NonParticipant,
-    Producer,
-    Consumer,
-    ProducerConsumer
-  };
-
-  struct Params {
-    uint32_t transaction_bytes = 0;
-    ThreadCategory role = ThreadCategory::NonParticipant;
-    uint32_t is_leader = 0;
-    uint32_t num_consumers = 0;
-  };
-
-  // Constructor
-  template<class ClusterShape>
-  CUTLASS_DEVICE
-  PipelineTmaAsync(SharedStorage& storage, Params params, ClusterShape cluster_shape)
-      : params_(params)
-      , full_barrier_ptr_(&storage.full_barrier_[0])
-      , empty_barrier_ptr_(&storage.empty_barrier_[0]) {
-
-    int warp_idx = canonical_warp_idx_sync();
-    int lane_predicate = cute::elect_one_sync();
-
-    if (warp_idx == 0 && lane_predicate == 1) {
-      // Barrier FULL init
-      for (int i = 0; i < Stages; ++i) {
-        full_barrier_ptr_[i].init(1);
-      }
-      uint32_t const num_consumer_warpgroups_per_cluster = params_.num_consumers / NumThreadsPerWarpGroup;
-      uint32_t const multicast_consumer_arrival_count = (cute::size<0>(cluster_shape) + cute::size<1>(cluster_shape) - 1) *
-          num_consumer_warpgroups_per_cluster;
-      // Barrier EMPTY init
-      for (int i = 0; i < Stages; ++i) {
-        empty_barrier_ptr_[i].init(multicast_consumer_arrival_count);
-      }
-    }
-    cutlass::arch::fence_barrier_init();
-
-    // Logic to optimally schedule Empty Arrives
-    // Goal : To divide SYNCS Empty Arrival duty equally amongst the Warp-Group (128 threads)
-    dim3 block_id = cute::block_id_in_cluster();
-    auto cluster_size = cute::size(cluster_shape);
-    static constexpr int MaxClusterSize = 16;
-
-    // STEP 1 : Use Cute Layout function to generate an optimal dst block-id (0-15)
-    if (params_.num_consumers % NumThreadsPerWarpGroup == 0) {
-      int thread_idx = threadIdx.x % NumThreadsPerWarpGroup;
-      is_signalling_thread_ = (thread_idx % (NumThreadsPerWarpGroup / MaxClusterSize)) == 0;
-      auto layout = cute::composition(Swizzle<2,0,-2>{},
-                                      Layout<Shape<_4,_4>,Stride<_4,_1>>{});
-      uint32_t thread_row = warp_idx % 4;
-      uint32_t thread_col = (thread_idx / 8) % 4;
-      dst_blockid_ = layout(thread_row, thread_col);
-    }
-    else if (params_.num_consumers == 32) {
-      int thread_idx = threadIdx.x % 32;
-      is_signalling_thread_ = (thread_idx % (32 / MaxClusterSize)) == 0;
-      auto layout = Layout<Shape<_4,_4>,Stride<_4, _1>>{};
-      uint32_t thread_row = thread_idx / 8;
-      uint32_t thread_col = (thread_idx % 8) / 2;
-      dst_blockid_ = layout(thread_row, thread_col);
-    }
-    else {
-      is_signalling_thread_ = 0;
-      #ifndef NDEBUG
-        asm volatile ("brkpt;\n" ::);
-      #endif
-    }
-
-    // STEP 2: Find if this dst block-id needs an arrival for this problem
-    is_signalling_thread_ &= dst_blockid_ < cluster_size;
-    is_signalling_thread_ &= is_same_row_or_col(dst_blockid_, block_id, cluster_shape);
-  }
-
-  template <class ClusterShape>
-  CUTLASS_DEVICE
-  bool is_same_row_or_col(int dst_block_id, dim3 block_id, ClusterShape cluster_shape) {
-    return (((dst_block_id % cute::size<0>(cluster_shape)) == block_id.x) ||
-            (
-              ((dst_block_id / cute::size<0>(cluster_shape)) == block_id.y)
-            ));
-  }
-
-  ////////////////////
-  // Producer APIs
-  ////////////////////
-  // Four member functions are always used in pairs:
-  //
-  // * producer_try_acquire and producer_acquire, and
-  // * consumer_try_wait and consumer_wait.
-  //
-  // The two functions with "try" in their names are called "try" functions,
-  // and the other two are conceptually "finalize" functions.
-  // The "try" function in each pair starts the process of waiting on the barrier to flip.
-  // It opportunistically waits for an implementation-dependent timeout.
-  // Whether or not the barrier has flipped yet, the try function will return a token.
-  // If the token indicates that the barrier has not flipped,
-  // then the token must be passed into the corresponding "finalize" function.
-  // The finalize function will then block until the barrier has flipped.
-  // If the token indicates that the barrier _has_ flipped,
-  // then it is still correct to pass it into the finalize function.
-  // The finalize function will return immediately in that case.
-
-  CUTLASS_DEVICE
-  ProducerToken producer_try_acquire(PipelineState state, uint32_t skip_wait = false) {
-    return producer_try_acquire(state.index(), state.phase(), skip_wait);
-  }
-
-  CUTLASS_DEVICE
-  void producer_acquire(PipelineState state, ProducerToken barrier_token = {BarrierStatus::WaitAgain}) {
-    producer_acquire(state.index(), state.phase(), barrier_token);
-  }
-
-  CUTLASS_DEVICE
-  void producer_commit(PipelineState state, uint32_t bytes) {
-    producer_commit(state.index(), bytes);
-  }
-
-  // Prevents early exit of producer blocks in Cluster.
-  // This should be called once before kernel exits.
-  CUTLASS_DEVICE
-  void producer_tail(PipelineState state) {
-    for (int count = 0; count < Stages; ++count) {
-      empty_barrier_ptr_[state.index()].wait(state.phase());
-      ++state;
-    }
-  }
-
-  CUTLASS_DEVICE
-  ProducerBarrierType* producer_get_barrier(PipelineState state) {
-    return producer_get_barrier(state.index());
-  }
-
-  ////////////////////
-  // Consumer APIs
-  ////////////////////
-  CUTLASS_DEVICE
-  ConsumerToken consumer_try_wait(PipelineState state, uint32_t skip_wait = false) {
-    return consumer_try_wait(state.index(), state.phase(), skip_wait);
-  }
-
-  CUTLASS_DEVICE
-  ConsumerToken consumer_test_wait(PipelineState state, uint32_t skip_wait = false) {
-    return consumer_test_wait(state.index(), state.phase(), skip_wait);
-  }
-
-  CUTLASS_DEVICE
-  void consumer_wait(PipelineState state) {
-    consumer_wait(state.index(), state.phase());
-  }
-
-  CUTLASS_DEVICE
-  void consumer_wait(PipelineState state, ConsumerToken barrier_token) {
-    consumer_wait(state.index(), state.phase(), barrier_token);
-  }
-
-  CUTLASS_DEVICE
-  void consumer_release(PipelineState state) {
-    consumer_release(state.index());
-  }
-
-private :
-  uint32_t dst_blockid_ = 0;
-  uint32_t is_signalling_thread_ = 0;
-  FullBarrier *full_barrier_ptr_ = nullptr;
-  EmptyBarrier *empty_barrier_ptr_ = nullptr;
-  Params params_;
-
-  CUTLASS_DEVICE
-  ProducerToken producer_try_acquire(uint32_t stage, uint32_t phase, uint32_t skip_wait) {
-    if (skip_wait) {
-      return {BarrierStatus::WaitDone};
-    }
-    bool barrier_status = empty_barrier_ptr_[stage].try_wait(phase);
-    return {static_cast<BarrierStatus>(barrier_status)};
-  }
-
-  CUTLASS_DEVICE
-  void producer_acquire(uint32_t stage, uint32_t phase, ProducerToken barrier_token) {
-    if (barrier_token != BarrierStatus::WaitDone) {
-      empty_barrier_ptr_[stage].wait(phase);
-    }
-
-    if (params_.is_leader) {
-      full_barrier_ptr_[stage].arrive_and_expect_tx(params_.transaction_bytes);
-    }
-    #ifndef NDEBUG
-    if (params_.role == ThreadCategory::Consumer || params_.role == ThreadCategory::NonParticipant) {
-      asm volatile ("brkpt;\n" ::);
-    }
-
-    // Most likely you have elected more than one leader
-    if (params_.is_leader && (threadIdx.x % 32 != 0)) {
-      asm volatile ("brkpt;\n" ::);
-    }
-    #endif
-  }
-
-  // NOP for TMA based mainloop
-  CUTLASS_DEVICE
-  void producer_commit(uint32_t stage, uint32_t bytes) {
-    // Below code is used only for unit-testing (in the absence of TMA commit)
-    #if CUTLASS_UNIT_TEST_PIPELINE
-      if (params_.is_leader) {
-        // STEP 1 : Commit to self
-        full_barrier_ptr_[stage].complete_transaction(bytes);
-
-        // STEP 2 : Commit to other blocks in our cluster
-        auto cluster_shape = cute::cluster_shape();
-        Layout block_layout_in_cluster = make_layout(cluster_shape);
-        dim3 local_block_id = cute::block_id_in_cluster();
-
-        CUTLASS_PRAGMA_UNROLL
-        for(int n = 0; n < size<1>(block_layout_in_cluster); ++n) {
-          uint32_t dst_block_id = block_layout_in_cluster(local_block_id.x,n,Int<0>{});
-          full_barrier_ptr_[stage].complete_transaction(dst_block_id, bytes, n!=local_block_id.y);
-        }
-
-        CUTLASS_PRAGMA_UNROLL
-        for(int m = 0; m < size<0>(block_layout_in_cluster); ++m) {
-          uint32_t dst_block_id = block_layout_in_cluster(m,local_block_id.y,Int<0>{});
-          full_barrier_ptr_[stage].complete_transaction(dst_block_id, bytes, m!=local_block_id.x);
-        }
-      }
-    #endif
-  }
-
-  CUTLASS_DEVICE
-  ConsumerToken consumer_try_wait(uint32_t stage, uint32_t phase, uint32_t skip_wait) {
-    if (skip_wait) {
-      return {BarrierStatus::WaitDone};
-    }
-    bool barrier_status = full_barrier_ptr_[stage].try_wait(phase);
-    return {static_cast<BarrierStatus>(barrier_status)};
-  }
-
-  CUTLASS_DEVICE
-  ConsumerToken consumer_test_wait(uint32_t stage, uint32_t phase, uint32_t skip_wait) {
-    if (skip_wait) {
-      return {BarrierStatus::WaitDone};
-    }
-    bool barrier_status = full_barrier_ptr_[stage].test_wait(phase);
-    return {static_cast<BarrierStatus>(barrier_status)};
-  }
-
-  // Wait for producer to commit transactions (done by TMA)
-  CUTLASS_DEVICE
-  void consumer_wait(uint32_t stage, uint32_t phase) {
-    full_barrier_ptr_[stage].wait(phase);
-  }
-
-  // Wait for producer to commit transactions (done by TMA)
-  CUTLASS_DEVICE
-  void consumer_wait(uint32_t stage, uint32_t phase, ConsumerToken barrier_token) {
-    if (barrier_token == BarrierStatus::WaitAgain) {
-      full_barrier_ptr_[stage].wait(phase);
-    }
-  }
-
-  // Consumer signalling Producer of completion
-  // Ensures all blocks in the Same Row and Column get notifed.
-  CUTLASS_DEVICE
-  void consumer_release(uint32_t stage, uint32_t skip = false) {
-    empty_barrier_ptr_[stage].arrive(dst_blockid_, is_signalling_thread_ & (!skip));
-    #ifndef NDEBUG
-    if (params_.role == ThreadCategory::Producer || params_.role == ThreadCategory::NonParticipant) {
-      asm volatile ("brkpt;\n" ::);
-    }
-    #endif
-  }
-
-  CUTLASS_DEVICE
-  ProducerBarrierType* producer_get_barrier(uint32_t stage) {
-    return reinterpret_cast<ProducerBarrierType*>(&full_barrier_ptr_[stage]);
-  }
-};
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// TMA store pipeline class
-// producer-only class, no async barriers between threads because consumer is TMA unit
-//
-///////////////////////////////////////////////////////////////////////////////////////////////////
-template <
-  int Stages_,
-  // The number of committed TMA store batches that can be in flight upon return of producer acquire
-  int UnacquiredStages_ = Stages_-1
->
-class PipelineTmaStore {
-public:
-  static constexpr uint32_t Stages = Stages_;
-  static_assert(Stages_ > 0);
-  static_assert(UnacquiredStages_ >= 0);
-  static constexpr uint32_t UnacquiredStages = static_cast<uint32_t>(UnacquiredStages_);
-  using PipelineState = cutlass::PipelineState<Stages>;
-
-  struct Params {
-    bool always_wait = false;
-  };
-
-  CUTLASS_DEVICE
-  PipelineTmaStore(Params params = {}) : params_(params) {}
-
-  ////////////////////
-  // Producer APIs
-  ////////////////////
-  // Wait for the least recently committed batch of TMA stores to complete
-  CUTLASS_DEVICE
-  void producer_acquire(PipelineState state) {
-    producer_acquire(state.index(), state.count());
-  }
-
-  // Commit the most recently issued batch of TMA stores
-  CUTLASS_DEVICE
-  void producer_commit(PipelineState state) {
-    producer_commit(state.index(), state.count());
-  }
-
-  // Wait for all TMA stores to complete
-  CUTLASS_DEVICE
-  void producer_tail([[maybe_unused]] PipelineState state) {
-    tma_store_wait<0>();
-  }
-
-private:
-  Params params_;
-
-  // Wait for the least recently committed batch of TMA stores to complete
-  // or until at most UnacquiredStages TMA store batches are in-flight (if specified)
-  CUTLASS_DEVICE
-  void producer_acquire([[maybe_unused]] uint32_t stage, uint32_t count) {
-    if (params_.always_wait || count > UnacquiredStages) {
-      tma_store_wait<UnacquiredStages>();
-    }
-  }
-
-  // Commit the most recently issued batch of TMA stores
-  CUTLASS_DEVICE
-  void producer_commit([[maybe_unused]] uint32_t stage, [[maybe_unused]] uint32_t count) {
-    tma_store_arrive();
-  }
-};
-
-template <>
-class PipelineTmaStore< /* Stages_ = */ 0, /* UnacquiredStages = Stages_ - 1 = */ -1 > {
-public:
-  static constexpr uint32_t Stages = 0;
-  static constexpr uint32_t UnacquiredStages = 0;
-  using PipelineState = cutlass::PipelineState<Stages>;
-
-  struct Params {
-    bool always_wait = false;
-  };
-
-  PipelineTmaStore() = default;
-  CUTLASS_DEVICE
-    PipelineTmaStore(Params params) : params_(params) {}
-
-  ////////////////////
-  // Producer APIs
-  ////////////////////
-
-  template<class ThisTemplateParameterExistsOnlyForDependentFalse = int>
-  CUTLASS_DEVICE
-    void producer_acquire(PipelineState /* state */,
-      ThisTemplateParameterExistsOnlyForDependentFalse* /* unused */ = nullptr) {
-    static_assert(cutlass::detail::dependent_false<ThisTemplateParameterExistsOnlyForDependentFalse>,
-      "It is never valid to call PipelineTmaStore<0>::producer_acquire");
-  }
-
-  // Commit the most recently issued batch of TMA stores
-  CUTLASS_DEVICE
-    void producer_commit(PipelineState state) {
-    producer_commit(state.index(), state.count());
-  }
-
-  // Wait for all TMA stores to complete
-  CUTLASS_DEVICE
-    void producer_tail([[maybe_unused]] PipelineState state) {
-    tma_store_wait<0>();
-  }
-
-private:
-  Params params_;
-
-  // Commit the most recently issued batch of TMA stores
-  CUTLASS_DEVICE
-    void producer_commit([[maybe_unused]] uint32_t stage, [[maybe_unused]] uint32_t count) {
-    tma_store_arrive();
-  }
-};
-
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Simple producer-consumer async Pipeline class using producer transaction barriers
-//
-///////////////////////////////////////////////////////////////////////////////////////////////////
-template <int Stages_>
-class PipelineTransactionAsync {
-public :
-  using FullBarrier = cutlass::arch::ClusterTransactionBarrier;
-  using EmptyBarrier = cutlass::arch::ClusterBarrier;
-  using ProducerBarrierType = FullBarrier::ValueType;
-  using ConsumerBarrierType = EmptyBarrier::ValueType;
-  static constexpr uint32_t Stages = Stages_;
-  using PipelineState = cutlass::PipelineState<Stages>;
-
-  struct SharedStorage {
-    cute::array<FullBarrier, Stages> full_barrier_;
-    cute::array<EmptyBarrier, Stages> empty_barrier_;
-  };
-
-  enum class ThreadCategory {
-    NonParticipant,
-    Producer,
-    Consumer,
-    ProducerConsumer
-  };
-
-  struct Params {
-    ThreadCategory role = ThreadCategory::NonParticipant;
-    uint32_t transaction_bytes = 0;
-    uint32_t producer_arv_count = 1;
-    uint32_t consumer_arv_count = 1;
-    uint32_t dst_blockid = cute::block_rank_in_cluster();
-  };
-
-  // Constructor
-  CUTLASS_DEVICE
-  PipelineTransactionAsync(SharedStorage& storage, Params const& params)
-    : params_(params)
-    , full_barrier_ptr_(storage.full_barrier_.data())
-    , empty_barrier_ptr_(storage.empty_barrier_.data()) {
-    int warp_idx = canonical_warp_idx_sync();
-    int lane_predicate = cute::elect_one_sync();
-
-    // Barrier FULL, EMPTY init
-    // Init is done only by thread 0 of the block
-    if (warp_idx == 0 && lane_predicate) {
-      for (int i = 0; i < Stages; ++i) {
-        full_barrier_ptr_[i].init(params.producer_arv_count);
-        empty_barrier_ptr_[i].init(params.consumer_arv_count);
-      }
-    }
-    cutlass::arch::fence_barrier_init();
-  }
-
-  ////////////////////
-  // Producer APIs
-  ////////////////////
-  // Four member functions are always used in pairs:
-  //
-  // * producer_try_acquire and producer_acquire, and
-  // * consumer_try_wait and consumer_wait.
-  //
-  // The two functions with "try" in their names are called "try" functions,
-  // and the other two are conceptually "finalize" functions.
-  // The "try" function in each pair starts the process of waiting on the barrier to flip.
-  // It opportunistically waits for an implementation-dependent timeout.
-  // Whether or not the barrier has flipped yet, the try function will return a token.
-  // If the token indicates that the barrier has not flipped,
-  // then the token must be passed into the corresponding "finalize" function.
-  // The finalize function will then block until the barrier has flipped.
-  // If the token indicates that the barrier _has_ flipped,
-  // then it is still correct to pass it into the finalize function.
-  // The finalize function will return immediately in that case.
-  CUTLASS_DEVICE
-  ProducerToken producer_try_acquire(PipelineState state, uint32_t skip_wait = false) {
-    return producer_try_acquire(state.index(), state.phase(), skip_wait);
-  }
-
-  CUTLASS_DEVICE
-  void producer_acquire(PipelineState state, ProducerToken barrier_token = {BarrierStatus::WaitAgain}) {
-    producer_acquire(state.index(), state.phase(), barrier_token);
-  }
-
-  // Perform an expect-tx operation on the stage's full barrier. Must be called by 1 thread
-  CUTLASS_DEVICE
-  void producer_expect_transaction(PipelineState state) {
-    producer_expect_transaction(state.index());
-  }
-
-  CUTLASS_DEVICE
-  void producer_commit(PipelineState state) {
-    producer_commit(state.index());
-  }
-
-  // Prevents early exit of producer blocks in Cluster.
-  // This should be called once before kernel exits.
-  CUTLASS_DEVICE
-  void producer_tail(PipelineState state) {
-    for (int count = 0; count < Stages; ++count) {
-      producer_acquire(state);
-      ++state;
-    }
-  }
-
-  CUTLASS_DEVICE
-  ProducerBarrierType* producer_get_barrier(PipelineState state) {
-    return producer_get_barrier(state.index());
-  }
-
-  ////////////////////
-  // Consumer APIs
-  ////////////////////
-  CUTLASS_DEVICE
-  ConsumerToken consumer_try_wait(PipelineState state, uint32_t skip_wait = false) {
-    return consumer_try_wait(state.index(), state.phase(), skip_wait);
-  }
-
-  CUTLASS_DEVICE
-  ConsumerToken consumer_test_wait(PipelineState state, uint32_t skip_wait = false) {
-    return consumer_test_wait(state.index(), state.phase(), skip_wait);
-  }
-
-  CUTLASS_DEVICE
-  void consumer_wait(PipelineState state, ConsumerToken barrier_token = {BarrierStatus::WaitAgain}) {
-    consumer_wait(state.index(), state.phase(), barrier_token);
-  }
-
-  CUTLASS_DEVICE
-  void consumer_release(PipelineState state) {
-    consumer_release(state.index());
-  }
-
-private:
-  FullBarrier *full_barrier_ptr_ = nullptr;
-  EmptyBarrier *empty_barrier_ptr_ = nullptr;
-  Params params_;
-
-  CUTLASS_DEVICE
-  ProducerToken producer_try_acquire(uint32_t stage, uint32_t phase, uint32_t skip_wait) {
-    if (skip_wait) {
-      return {BarrierStatus::WaitDone};
-    }
-    bool barrier_status = empty_barrier_ptr_[stage].try_wait(phase);
-    return {static_cast<BarrierStatus>(barrier_status)};
-  }
-
-  CUTLASS_DEVICE
-  void producer_acquire(uint32_t stage, uint32_t phase, ProducerToken barrier_token) {
-    if (barrier_token == BarrierStatus::WaitAgain) {
-      empty_barrier_ptr_[stage].wait(phase);
-    }
-  }
-
-  // Perform an expect-tx operation on the stage's full barrier. Must be called by 1 thread
-  CUTLASS_DEVICE
-  void producer_expect_transaction(uint32_t stage) {
-    full_barrier_ptr_[stage].expect_transaction(params_.transaction_bytes);
-  }
-
-  CUTLASS_DEVICE
-  void producer_commit(uint32_t stage) {
-    full_barrier_ptr_[stage].arrive(params_.dst_blockid);
-  }
-
-  CUTLASS_DEVICE
-  ProducerBarrierType* producer_get_barrier(uint32_t stage) {
-    return reinterpret_cast<ProducerBarrierType*>(&full_barrier_ptr_[stage]);
-  }
-
-  CUTLASS_DEVICE
-  ConsumerToken consumer_try_wait(uint32_t stage, uint32_t phase, uint32_t skip_wait) {
-    if (skip_wait) {
-      return {BarrierStatus::WaitDone};
-    }
-    bool barrier_status = full_barrier_ptr_[stage].try_wait(phase);
-    return {static_cast<BarrierStatus>(barrier_status)};
-  }
-
-  CUTLASS_DEVICE
-  ConsumerToken consumer_test_wait(uint32_t stage, uint32_t phase, uint32_t skip_wait) {
-    if (skip_wait) {
-      return {BarrierStatus::WaitDone};
-    }
-    bool barrier_status = full_barrier_ptr_[stage].test_wait(phase);
-    return {static_cast<BarrierStatus>(barrier_status)};
-  }
-
-  CUTLASS_DEVICE
-  void consumer_wait(uint32_t stage, uint32_t phase, ConsumerToken barrier_token) {
-    if (barrier_token == BarrierStatus::WaitAgain) {
-      full_barrier_ptr_[stage].wait(phase);
-    }
-  }
-
-  CUTLASS_DEVICE
-  void consumer_release(uint32_t stage, uint32_t skip = false) {
-    empty_barrier_ptr_[stage].arrive(params_.dst_blockid, (not skip));
-  }
-};
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Simple producer-consumer async Pipeline class
-//
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace PipelineDetail {
-  template<int Stages>
-  using PipelineAsyncPipelineState = cutlass::PipelineState<Stages>;
-
-  template<int Stages>
-  struct PipelineAsyncSharedStorage {
-    using FullBarrier = cutlass::arch::ClusterBarrier;
-    using EmptyBarrier = cutlass::arch::ClusterBarrier;
-
-    FullBarrier full_barrier_[Stages];
-    EmptyBarrier empty_barrier_[Stages];
-  };
-};
-
-template <int Stages_>
-class PipelineAsync {
-public :
-  static constexpr uint32_t Stages = Stages_;
-  using SharedStorage = PipelineDetail::PipelineAsyncSharedStorage<Stages>;
-  using FullBarrier = typename SharedStorage::FullBarrier;
-  using EmptyBarrier = typename SharedStorage::EmptyBarrier;
-  using ProducerBarrierType = typename FullBarrier::ValueType;
-  using ConsumerBarrierType = typename EmptyBarrier::ValueType;
-  using PipelineState = PipelineDetail::PipelineAsyncPipelineState<Stages>;
-
-  enum class ThreadCategory {
-    NonParticipant,
-    Producer,
-    Consumer,
-    ProducerConsumer
-  };
-
-  struct Params {
-    ThreadCategory role = ThreadCategory::NonParticipant;
-    uint32_t producer_arv_count = 1;
-    uint32_t consumer_arv_count = 1;
-    uint32_t dst_blockid = cute::block_rank_in_cluster();
-  };
-
-  // Default assumption when only storage is passed is :
-  // => single producer, single consumer & they are in the same block (within the Cluster)
-  CUTLASS_DEVICE
-  PipelineAsync(SharedStorage& storage)
-    : PipelineAsync(storage, {}) {}
-
-  CUTLASS_DEVICE
-  PipelineAsync(
-    SharedStorage& storage,
-    Params const& params) :
-      params_(params),
-      full_barrier_ptr_(&storage.full_barrier_[0]),
-      empty_barrier_ptr_(&storage.empty_barrier_[0]) {
-
-    int warp_idx = canonical_warp_idx_sync();
-    int lane_predicate = cute::elect_one_sync();
-
-    // Barrier FULL, EMPTY init
-    // Init is done only by thread 0 of the block
-    if (warp_idx == 0 && lane_predicate == 1) {
-      for (int i = 0; i < Stages; ++i) {
-        full_barrier_ptr_[i].init(params.producer_arv_count);
-        empty_barrier_ptr_[i].init(params.consumer_arv_count);
-      }
-    }
-    cutlass::arch::fence_barrier_init();
-  }
-
-  ////////////////////
-  // Producer APIs
-  ////////////////////
-  // Four member functions are always used in pairs:
-  //
-  // * producer_try_acquire and producer_acquire, and
-  // * consumer_try_wait and consumer_wait.
-  //
-  // The two functions with "try" in their names are called "try" functions,
-  // and the other two are conceptually "finalize" functions.
-  // The "try" function in each pair starts the process of waiting on the barrier to flip.
-  // It opportunistically waits for an implementation-dependent timeout.
-  // Whether or not the barrier has flipped yet, the try function will return a token.
-  // If the token indicates that the barrier has not flipped,
-  // then the token must be passed into the corresponding "finalize" function.
-  // The finalize function will then block until the barrier has flipped.
-  // If the token indicates that the barrier _has_ flipped,
-  // then it is still correct to pass it into the finalize function.
-  // The finalize function will return immediately in that case.
-  CUTLASS_DEVICE
-  ProducerToken producer_try_acquire(PipelineState state, uint32_t skip_wait = false) {
-    return producer_try_acquire(state.index(), state.phase(), skip_wait);
-  }
-
-  CUTLASS_DEVICE
-  void producer_acquire(PipelineState state, ProducerToken barrier_token = {BarrierStatus::WaitAgain}) {
-    producer_acquire(state.index(), state.phase(), barrier_token);
-  }
-
-  CUTLASS_DEVICE
-  void producer_commit(PipelineState state) {
-    producer_commit(state.index());
-  }
-
-  template<class UserDefinedArriveOp>
-  CUTLASS_DEVICE
-  void producer_commit(PipelineState state, UserDefinedArriveOp&& user_defined_arrive_op) {
-    cute::forward<UserDefinedArriveOp>(user_defined_arrive_op)(producer_get_barrier(state.index()));
-    producer_commit(state);
-  }
-
-  // Prevents early exit of producer blocks in Cluster.
-  // This should be called once before kernel exits.
-  CUTLASS_DEVICE
-  void producer_tail(PipelineState state) {
-    for (int count = 0; count < Stages; ++count) {
-      producer_acquire(state);
-      ++state;
-    }
-  }
-
-  CUTLASS_DEVICE
-  ProducerBarrierType* producer_get_barrier(PipelineState state) {
-    return producer_get_barrier(state.index());
-  }
-
-  ////////////////////
-  // Consumer APIs
-  ////////////////////
-  CUTLASS_DEVICE
-  ConsumerToken consumer_try_wait(PipelineState state, uint32_t skip_wait = false) {
-    return consumer_try_wait(state.index(), state.phase(), skip_wait);
-  }
-
-  CUTLASS_DEVICE
-  ConsumerToken consumer_test_wait(PipelineState state, uint32_t skip_wait = false) {
-    return consumer_test_wait(state.index(), state.phase(), skip_wait);
-  }
-
-  CUTLASS_DEVICE
-  void consumer_wait(PipelineState state, ConsumerToken barrier_token = {BarrierStatus::WaitAgain}) {
-    consumer_wait(state.index(), state.phase(), barrier_token);
-  }
-
-  CUTLASS_DEVICE
-  void consumer_release(PipelineState state) {
-    consumer_release(state.index());
-  }
-
-  CUTLASS_DEVICE
-  ProducerBarrierType* producer_get_barrier(uint32_t stage) {
-    return reinterpret_cast<ProducerBarrierType*>(&full_barrier_ptr_[stage]);
-  }
-
-private:
-  Params params_;
-  FullBarrier *full_barrier_ptr_;
-  EmptyBarrier *empty_barrier_ptr_;
-
-  CUTLASS_DEVICE
-  ProducerToken producer_try_acquire(uint32_t stage, uint32_t phase, uint32_t skip_wait) {
-    if (skip_wait) {
-      return {BarrierStatus::WaitDone};
-    }
-    bool barrier_status = empty_barrier_ptr_[stage].try_wait(phase);
-    return {static_cast<BarrierStatus>(barrier_status)};
-  }
-
-  CUTLASS_DEVICE
-  void producer_acquire(uint32_t stage, uint32_t phase, ProducerToken barrier_token) {
-    if (barrier_token == BarrierStatus::WaitAgain) {
-      empty_barrier_ptr_[stage].wait(phase);
-    }
-  }
-
-  CUTLASS_DEVICE
-  void producer_commit(uint32_t stage) {
-    full_barrier_ptr_[stage].arrive();
-  }
-
-  CUTLASS_DEVICE
-  ConsumerToken consumer_try_wait(uint32_t stage, uint32_t phase, uint32_t skip_wait) {
-    if (skip_wait) {
-      return {BarrierStatus::WaitDone};
-    }
-    bool barrier_status = full_barrier_ptr_[stage].try_wait(phase);
-    return {static_cast<BarrierStatus>(barrier_status)};
-  }
-
-  CUTLASS_DEVICE
-  ConsumerToken consumer_test_wait(uint32_t stage, uint32_t phase, uint32_t skip_wait) {
-    if (skip_wait) {
-      return {BarrierStatus::WaitDone};
-    }
-    bool barrier_status = full_barrier_ptr_[stage].test_wait(phase);
-    return {static_cast<BarrierStatus>(barrier_status)};
-  }
-
-  CUTLASS_DEVICE
-  void consumer_wait(uint32_t stage, uint32_t phase) {
-    bool done = full_barrier_ptr_[stage].test_wait(phase);
-    if (!done) {
-      full_barrier_ptr_[stage].wait(phase);
-    }
-  }
-
-  CUTLASS_DEVICE
-  void consumer_wait(uint32_t stage, uint32_t phase, ConsumerToken barrier_token) {
-    if (barrier_token == BarrierStatus::WaitAgain) {
-      full_barrier_ptr_[stage].wait(phase);
-    }
-  }
-
-  CUTLASS_DEVICE
-  void consumer_release(uint32_t stage) {
-    empty_barrier_ptr_[stage].arrive(params_.dst_blockid);
-  }
-};
-
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Barrier to ensure an Ordered Sequence between
-// SequenceLength number of groups (each with group_size participants) executing SequenceDepth Stages
-// i.e., for all i < j - only after id "i" arrives at a particular stage "m"
-// will the wait() for id "j" succeed for the same stage
-//
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace PipelineDetail {
-
-template<int SequenceDepth, int SequenceLength>
-struct OrderedSequenceBarrierSharedStorage {
-  using Barrier = cutlass::arch::ClusterBarrier;
-  Barrier barrier_[SequenceDepth][SequenceLength];
-};
-
-} // namespace PipelineDetail
-
-template<int SequenceDepth_, int SequenceLength_>
-class OrderedSequenceBarrier {
-public:
-  static constexpr int SequenceDepth = SequenceDepth_;
-  static constexpr int SequenceLength = SequenceLength_;
-  using SharedStorage =
-    PipelineDetail::OrderedSequenceBarrierSharedStorage<SequenceDepth, SequenceLength>;
-  using Barrier = typename SharedStorage::Barrier;
-
-  struct Params {
-    uint32_t group_id;
-    uint32_t group_size;
-  };
-
-private :
-  // In future this Params object can be replaced easily with a CG object
-  Params params_;
-  Barrier *barrier_ptr_;
-  PipelineState<SequenceDepth> stage_;
-
-  static constexpr int Depth = SequenceDepth;
-  static constexpr int Length = SequenceLength;
-
-public:
-  OrderedSequenceBarrier() = delete;
-  OrderedSequenceBarrier(const OrderedSequenceBarrier&) = delete;
-  OrderedSequenceBarrier(OrderedSequenceBarrier&&) = delete;
-  OrderedSequenceBarrier& operator=(const OrderedSequenceBarrier&) = delete;
-  OrderedSequenceBarrier& operator=(OrderedSequenceBarrier&&) = delete;
-  ~OrderedSequenceBarrier() = default;
-
-  CUTLASS_DEVICE
-  OrderedSequenceBarrier(SharedStorage& storage, Params const& params) :
-      params_(params),
-      barrier_ptr_(&storage.barrier_[0][0]),
-      // Group 0 - starts with an opposite phase
-      stage_({0, params.group_id == 0, 0}) {
-    int warp_idx = canonical_warp_idx_sync();
-    int lane_predicate = cute::elect_one_sync();
-
-    // Barrier FULL, EMPTY init
-    // Init is done only by the one elected thread of the block
-    if (warp_idx == 0 && lane_predicate) {
-      for (int d = 0; d < Depth; ++d) {
-        for (int l = 0; l < Length; ++l) {
-          barrier_ptr_[d * Length + l].init(params.group_size);
-        }
-      }
-    }
-    cutlass::arch::fence_barrier_init();
-  }
-
-  // Wait on a stage to be unlocked
-  CUTLASS_DEVICE
-  void wait() {
-    get_barrier_for_current_stage(params_.group_id).wait(stage_.phase());
-  }
-
-  // Signal completion of Stage and move to the next stage
-  // (group_id) signals to (group_id+1)
-  CUTLASS_DEVICE
-  void arrive() {
-    int signalling_id = (params_.group_id + 1) % Length;
-    get_barrier_for_current_stage(signalling_id).arrive();
-    ++stage_;
-  }
-
-  CUTLASS_DEVICE
-  void advance() {
-    ++stage_;
-  }
-
-private:
-
-  CUTLASS_DEVICE
-  Barrier& get_barrier_for_current_stage(int group_id) {
-    return barrier_ptr_[stage_.index() * Length + group_id];
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Synchronization call. Blocks until barriers are initialized in shared memory.
-CUTLASS_DEVICE
-void
-pipeline_init_wait(int cluster_size) {
-  if (cluster_size > 1) {
-    cute::cluster_wait();
-  }
-  else {
-    __syncthreads();
-  }
-}
-
-// Used to guarantee that the Pipeline init is visible
-// to all producers and consumer threadblocks in the cluster
-CUTLASS_DEVICE
-void
-pipeline_init_arrive_relaxed(int cluster_size) {
-  if (cluster_size > 1) {
-    cute::cluster_arrive_relaxed();
-  }
-  else {
-    __syncthreads();
-  }
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // end namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/pitch_linear_coord.h b/lightllm-kernel/cutlass/include/cutlass/pitch_linear_coord.h
deleted file mode 100755
index 475229a25..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/pitch_linear_coord.h
+++ /dev/null
@@ -1,181 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines layout functions used by TensorRef and derived classes for pitch-linear memory.
-*/
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/coord.h"
-
-namespace cutlass {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Template defining a shape used by pitch-linear operators
-template <
-  int Contiguous,
-  int Strided
->
-struct PitchLinearShape {
-  static int const kContiguous = Contiguous;
-  static int const kStrided = Strided;
-  static int const kCount = Contiguous * Strided;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Coordinate in pitch-linear space
-struct PitchLinearCoord : public Coord<2, int> {
-public:
-
-  /// Integer-valued index
-  using Index = int;
-
-  /// Base type is a Coord of rank=2
-  using Base = Coord<2, Index>;
-
-  /// Long integer type
-  using LongIndex = typename Base::LongIndex;
-
-private:
-
-  /// Rows dimension
-  static int const kContiguous = 0;
-
-  /// Columns dimension
-  static int const kStrided = 1;
-
-public:
-
-  //
-  // Methods
-  //
-
-  /// Default ctor
-  CUTLASS_HOST_DEVICE
-  PitchLinearCoord() { }
-
-  /// Constructs from Coord<2>
-  CUTLASS_HOST_DEVICE
-  PitchLinearCoord(Coord<2, Index> const &coord): Base(coord) { }
-
-  /// Helper to construct from a row and column
-  CUTLASS_HOST_DEVICE
-  PitchLinearCoord(Index contiguous_, Index strided_): Base(make_Coord(contiguous_, strided_)) { }
-
-  /// Helper to construct from a row and column based on LongIndex
-  CUTLASS_HOST_DEVICE
-  PitchLinearCoord(LongIndex contiguous_, LongIndex strided_)
-    : Base(make_Coord(Index(contiguous_), Index(strided_))) { }
-
-  /// Returns the contiguous dimension
-  CUTLASS_HOST_DEVICE
-  Index const & contiguous() const { return this->at(kContiguous); }
-
-  /// Returns the contiguous dimension
-  CUTLASS_HOST_DEVICE
-  Index & contiguous() { return this->at(kContiguous); }
-
-  /// Returns the column of the coordinate
-  CUTLASS_HOST_DEVICE
-  Index const & strided() const { return this->at(kStrided); }
-
-  /// Returns the column of the coordinate
-  CUTLASS_HOST_DEVICE
-  Index & strided() { return this->at(kStrided); }
-
-  //
-  // Coord operators
-  //
-
-  /// Element-wise addition
-  CUTLASS_HOST_DEVICE
-  PitchLinearCoord operator+(Base const& b) const {
-    return PitchLinearCoord(Base::operator+(b));
-  }
-
-  /// Element-wise subtraction
-  CUTLASS_HOST_DEVICE
-  PitchLinearCoord operator-(Base const& b) const {
-    return PitchLinearCoord(Base::operator-(b));
-  }
-
-  CUTLASS_HOST_DEVICE
-  PitchLinearCoord operator-() const {
-    return PitchLinearCoord(-at(0), -at(1));
-  }
-
-  /// Element-wise multiplication
-  CUTLASS_HOST_DEVICE
-  PitchLinearCoord operator*(Base const& b) const {
-    return PitchLinearCoord(Base::operator*(b));
-  }
-
-  /// Element-wise division
-  CUTLASS_HOST_DEVICE
-  PitchLinearCoord operator/(Base const& b) const {
-    return PitchLinearCoord(Base::operator/(b));
-  }
-
-  /// In-place addition
-  CUTLASS_HOST_DEVICE
-  PitchLinearCoord& operator+=(Base const& b) {
-    Base::operator+=(b);
-    return *this;
-  }
-
-  /// In-place subtraction
-  CUTLASS_HOST_DEVICE
-  PitchLinearCoord& operator-=(Base const& b) {
-    Base::operator-=(b);
-    return *this;
-  }
-
-  /// In-place multiplication
-  CUTLASS_HOST_DEVICE
-  PitchLinearCoord& operator*=(Base const& b) {
-    Base::operator*=(b);
-    return *this;
-  }
-
-  /// In-place division
-  CUTLASS_HOST_DEVICE
-  PitchLinearCoord& operator/=(Base const& b) {
-    Base::operator/=(b);
-    return *this;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/platform/platform.h b/lightllm-kernel/cutlass/include/cutlass/platform/platform.h
deleted file mode 100755
index ba1f74011..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/platform/platform.h
+++ /dev/null
@@ -1,913 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#pragma once
-
-/**
- * \file
- * \brief C++ features that may be otherwise unimplemented for CUDA device functions.
- *
- * This file has three components:
- *
- *   (1) Macros:
- *       - Empty macro defines for C++ keywords not supported by the current
- *         version of C++. These simply allow compilation to proceed (but do
- *         not provide the added semantics).
- *           - \p noexcept
- *           - \p constexpr
- *           - \p nullptr
- *           - \p static_assert
- *
- *       - Macro functions that we need in constant expressions because the
- *         C++ equivalents require constexpr compiler support.  These are
- *         prefixed with \p __NV_STD_*
- *           - \p __NV_STD_MAX
- *           - \p __NV_STD_MIN
- *
- *   (2) Re-implementations of STL functions and types:
- *       - C++ features that need the \p __device__ annotation.  These are
- *         placed into the \p platform namespace.
- *           - \p abs
- *           - \p plus
- *           - \p less
- *           - \p greater
- *           - \p min
- *           - \p max
- *           - \p methods on std::pair (==, !=, <, <=, >, >=, and make_pair())
- *
- *   (3) Stop-gap implementations of unsupported STL functions and types:
- *       - STL functions and types defined by C++ 11/14/17/etc. that are not
- *         provided by the current version of C++. These are placed into the
- *         \p platform namespace
- *           - \p integral_constant
- *           - \p nullptr_t
- *           - \p true_type
- *           - \p false_type
- *           - \p bool_constant
- *           - \p enable_if
- *           - \p conditional
- *           - \p is_same
- *           - \p is_base_of
- *           - \p remove_const
- *           - \p remove_volatile
- *           - \p remove_cv
- *           - \p is_volatile
- *           - \p is_pointer
- *           - \p is_void
- *           - \p is_integral
- *           - \p is_floating_point
- *           - \p is_arithmetic
- *           - \p is_fundamental
- *           - \p is_trivially_copyable
- *           - \p alignment_of
- *           - \p aligned_storage
- *
- * The idea is that, as we drop support for older compilers, we can simply #define
- * the \p __NV_STD_XYZ macros and \p platform namespace to alias their C++
- * counterparts (or trivially find-and-replace their occurrences in code text).
- */
-
-//-----------------------------------------------------------------------------
-// Dependencies
-//-----------------------------------------------------------------------------
-
-#if defined(__CUDACC_RTC__)
-#include <cuda/std/type_traits>
-#include <cuda/std/utility>
-#include <cuda/std/cstddef>
-#include <cuda/std/cstdint>
-#include <cuda/std/limits>
-#else
-#include <stdint.h>
-#endif
-
-#if !defined(__CUDACC_RTC__)
-//-----------------------------------------------------------------------------
-// Include STL files that platform provides functionality for
-//-----------------------------------------------------------------------------
-
-#include <algorithm>   // Minimum/maximum operations
-#include <cstddef>     // nullptr_t
-#include <functional>  // Arithmetic operations
-#include <utility>     // For methods on std::pair
-#include <limits>      // float_round_style, float_denorm_style
-#if (!defined(_MSC_VER) && (__cplusplus >= 201103L)) || (defined(_MSC_VER) && (_MS_VER >= 1500))
-#include <type_traits>  // For integral constants, conditional metaprogramming, and type traits
-#endif
-
-#include <cutlass/cutlass.h>
-
-#endif
-
-//-----------------------------------------------------------------------------
-// OS
-//-----------------------------------------------------------------------------
-#if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__)
-#define CUTLASS_OS_WINDOWS
-#endif
-
-/******************************************************************************
- * Macros
- ******************************************************************************/
-/// std
-#if !defined(CUTLASS_STL_NAMESPACE)
-#if defined(__CUDACC_RTC__)
-#define CUTLASS_STL_NAMESPACE cuda::std
-#else
-#define CUTLASS_STL_NAMESPACE std
-#endif
-#endif
-
-/// builtin_unreachable
-#if !defined(CUTLASS_GCC_UNREACHABLE)
-#  if defined(__GNUC__)
-#    define CUTLASS_GCC_UNREACHABLE __builtin_unreachable()
-#  else
-#    define CUTLASS_GCC_UNREACHABLE
-#  endif
-#endif
-
-//-----------------------------------------------------------------------------
-// Keywords
-//-----------------------------------------------------------------------------
-
-/// noexcept, constexpr
-#if (!defined(_MSC_VER) && (__cplusplus < 201103L)) || (defined(_MSC_VER) && (_MSC_VER < 1900))
-#ifndef noexcept
-#define noexcept
-#endif
-#ifndef constexpr
-#define constexpr
-#endif
-#endif
-
-/// nullptr
-#if (!defined(_MSC_VER) && (__cplusplus < 201103L)) || (defined(_MSC_VER) && (_MSC_VER < 1310))
-#ifndef nullptr
-#define nullptr 0
-#endif
-#endif
-
-/// static_assert
-#if (!defined(_MSC_VER) && (__cplusplus < 201103L)) || (defined(_MSC_VER) && (_MSC_VER < 1600))
-#ifndef static_assert
-#define __platform_cat_(a, b) a##b
-#define __platform_cat(a, b) __platform_cat_(a, b)
-#define static_assert(__e, __m) typedef int __platform_cat(AsSeRt, __LINE__)[(__e) ? 1 : -1]
-#endif
-#endif
-
-//-----------------------------------------------------------------------------
-// Functions
-//-----------------------------------------------------------------------------
-
-/// Select maximum(a, b)
-#ifndef __NV_STD_MAX
-#define __NV_STD_MAX(a, b) (((b) > (a)) ? (b) : (a))
-#endif
-
-/// Select minimum(a, b)
-#ifndef __NV_STD_MIN
-#define __NV_STD_MIN(a, b) (((b) < (a)) ? (b) : (a))
-#endif
-
-/******************************************************************************
- * Re-implementations
- ******************************************************************************/
-namespace cutlass {
-namespace platform {
-
-//-----------------------------------------------------------------------------
-// Abs operations <algorithm>
-//-----------------------------------------------------------------------------
-
-#if defined(__CUDACC_RTC__)
-/// std::abs
-CUTLASS_HOST_DEVICE constexpr int abs(int a) {
-    return (a < 0) ? -a : a;
-}
-CUTLASS_HOST_DEVICE constexpr long long abs(long long a) {
-    return (a < 0) ? -a : a;
-}
-#else
-using std::abs;
-#endif
-
-//-----------------------------------------------------------------------------
-// Minimum/maximum operations <algorithm>
-//-----------------------------------------------------------------------------
-
-/// std::min
-template <typename T>
-CUTLASS_HOST_DEVICE constexpr const T& min(const T& a, const T& b) {
-  return (b < a) ? b : a;
-}
-
-/// std::max
-template <typename T>
-CUTLASS_HOST_DEVICE constexpr const T& max(const T& a, const T& b) {
-  return (a < b) ? b : a;
-}
-
-#if !defined(__CUDACC_RTC__)
-//-----------------------------------------------------------------------------
-// Methods on std::pair
-//-----------------------------------------------------------------------------
-
-using std::pair;
-
-template <class T1, class T2>
-CUTLASS_HOST_DEVICE constexpr bool operator==(const pair<T1, T2>& lhs, const pair<T1, T2>& rhs) {
-  return (lhs.first == rhs.first) && (lhs.second == rhs.second);
-}
-
-template <class T1, class T2>
-CUTLASS_HOST_DEVICE constexpr bool operator!=(const pair<T1, T2>& lhs, const pair<T1, T2>& rhs) {
-  return (lhs.first != rhs.first) && (lhs.second != rhs.second);
-}
-
-template <class T1, class T2>
-CUTLASS_HOST_DEVICE constexpr bool operator<(const pair<T1, T2>& lhs, const pair<T1, T2>& rhs) {
-  return (lhs.first < rhs.first) ? true : (rhs.first < lhs.first) ? false
-                                                                  : (lhs.second < rhs.second);
-}
-
-template <class T1, class T2>
-CUTLASS_HOST_DEVICE constexpr bool operator<=(const pair<T1, T2>& lhs, const pair<T1, T2>& rhs) {
-  return !(rhs < lhs);
-}
-
-template <class T1, class T2>
-CUTLASS_HOST_DEVICE constexpr bool operator>(const pair<T1, T2>& lhs, const pair<T1, T2>& rhs) {
-  return (rhs < lhs);
-}
-
-template <class T1, class T2>
-CUTLASS_HOST_DEVICE constexpr bool operator>=(const pair<T1, T2>& lhs, const pair<T1, T2>& rhs) {
-  return !(lhs < rhs);
-}
-
-template <class T1, class T2>
-CUTLASS_HOST_DEVICE std::pair<T1, T2> make_pair(T1 t, T2 u) {
-  std::pair<T1, T2> retval;
-  retval.first = t;
-  retval.second = u;
-  return retval;
-}
-#endif
-
-}  // namespace platform
-
-/******************************************************************************
- * Implementations of C++ 11/14/17/... STL features
- ******************************************************************************/
-
-namespace platform {
-
-//-----------------------------------------------------------------------------
-// Integral constant helper types <type_traits>
-//-----------------------------------------------------------------------------
-
-#if defined(__CUDACC_RTC__) || (!defined(_MSC_VER) && (__cplusplus < 201103L)) || (defined(_MSC_VER) && (_MSC_VER < 1500))
-
-/// std::integral_constant
-template <typename value_t, value_t V>
-struct integral_constant;
-
-/// std::integral_constant
-template <typename value_t, value_t V>
-struct integral_constant {
-  static const value_t value = V;
-
-  typedef value_t value_type;
-  typedef integral_constant<value_t, V> type;
-
-  CUTLASS_HOST_DEVICE operator value_type() const { return value; }
-
-  CUTLASS_HOST_DEVICE const value_type operator()() const { return value; }
-};
-
-#else
-
-using std::integral_constant;
-using std::pair;
-
-#endif
-
-using CUTLASS_STL_NAMESPACE::bool_constant;
-using CUTLASS_STL_NAMESPACE::true_type;
-using CUTLASS_STL_NAMESPACE::false_type;
-
-#if defined(__CUDACC_RTC__) || (!defined(_MSC_VER) && (__cplusplus < 201103L)) || (defined(_MSC_VER) && (_MSC_VER < 1700))
-
-/// std::nullptr_t
-struct nullptr_t {};
-
-#else
-
-using std::nullptr_t;
-
-#endif
-
-//-----------------------------------------------------------------------------
-// Conditional metaprogramming <type_traits>
-//-----------------------------------------------------------------------------
-
-using CUTLASS_STL_NAMESPACE::conditional;
-using CUTLASS_STL_NAMESPACE::conditional_t;
-using CUTLASS_STL_NAMESPACE::enable_if;
-using CUTLASS_STL_NAMESPACE::enable_if_t;
-using CUTLASS_STL_NAMESPACE::void_t;
-
-//-----------------------------------------------------------------------------
-// Const/volatility specifiers <type_traits>
-//-----------------------------------------------------------------------------
-
-using CUTLASS_STL_NAMESPACE::remove_const;
-using CUTLASS_STL_NAMESPACE::remove_const_t;
-using CUTLASS_STL_NAMESPACE::remove_cv;
-using CUTLASS_STL_NAMESPACE::remove_cv_t;
-using CUTLASS_STL_NAMESPACE::remove_reference;
-using CUTLASS_STL_NAMESPACE::remove_reference_t;
-using CUTLASS_STL_NAMESPACE::remove_volatile;
-using CUTLASS_STL_NAMESPACE::remove_volatile_t;
-
-// remove_cvref and remove_cvref_t are C++20 features,
-// but CUTLASS finds them useful enough to back-port.
-#if defined(__cpp_lib_remove_cvref)
-
-using CUTLASS_STL_NAMESPACE::remove_cvref;
-using CUTLASS_STL_NAMESPACE::remove_cvref_t;
-
-#else
-
-template <class T>
-struct remove_cvref {
-  using type = remove_cv_t<remove_reference_t<T>>;
-};
-
-template <class T>
-using remove_cvref_t = typename remove_cvref<T>::type;
-
-#endif
-
-//-----------------------------------------------------------------------------
-// Type relationships <type_traits>
-//-----------------------------------------------------------------------------
-
-using CUTLASS_STL_NAMESPACE::is_same;  
-using CUTLASS_STL_NAMESPACE::is_same_v;
-
-#if defined(__CUDACC_RTC__) || (!defined(_MSC_VER) && (__cplusplus < 201103L)) || (defined(_MSC_VER) && (_MSC_VER < 1500))
-
-/// Helper for std::is_base_of
-template <typename BaseT, typename DerivedT>
-struct is_base_of_helper {
-  typedef char (&yes)[1];
-  typedef char (&no)[2];
-
-  template <typename B, typename D>
-  struct dummy {
-    CUTLASS_HOST_DEVICE operator B*() const;
-    CUTLASS_HOST_DEVICE operator D*();
-  };
-
-  template <typename T>
-  CUTLASS_HOST_DEVICE static yes check(DerivedT*, T);
-
-  CUTLASS_HOST_DEVICE static no check(BaseT*, int);
-
-  static const bool value = sizeof(check(dummy<BaseT, DerivedT>(), int())) == sizeof(yes);
-};
-
-/// std::is_base_of
-template <typename BaseT, typename DerivedT>
-struct is_base_of
-    : integral_constant<bool,
-                        (is_base_of_helper<typename remove_cv<BaseT>::type,
-                                           typename remove_cv<DerivedT>::type>::value) ||
-                            (is_same<typename remove_cv<BaseT>::type,
-                                     typename remove_cv<DerivedT>::type>::value)> {};
-
-#else
-
-using std::is_base_of;
-
-#endif
-
-//-----------------------------------------------------------------------------
-// Type properties <type_traits>
-//-----------------------------------------------------------------------------
-
-using CUTLASS_STL_NAMESPACE::is_arithmetic;
-using CUTLASS_STL_NAMESPACE::is_arithmetic_v;
-using CUTLASS_STL_NAMESPACE::is_void;
-using CUTLASS_STL_NAMESPACE::is_void_v;
-
-#if defined(__CUDACC_RTC__) || (!defined(_MSC_VER) && (__cplusplus < 201103L)) || (defined(_MSC_VER) && (_MSC_VER < 1500))
-
-/// std::is_volatile
-template <typename T>
-struct is_volatile : false_type {};
-template <typename T>
-struct is_volatile<volatile T> : true_type {};
-
-/// Helper for std::is_pointer (false specialization)
-template <typename T>
-struct is_pointer_helper : false_type {};
-
-/// Helper for std::is_pointer (true specialization)
-template <typename T>
-struct is_pointer_helper<T*> : true_type {};
-
-/// std::is_pointer
-template <typename T>
-struct is_pointer : is_pointer_helper<typename remove_cv<T>::type> {};
-
-/// std::is_integral
-template <typename T>
-struct is_integral : false_type {};
-template <>
-struct is_integral<char> : true_type {};
-template <>
-struct is_integral<signed char> : true_type {};
-template <>
-struct is_integral<unsigned char> : true_type {};
-template <>
-struct is_integral<short> : true_type {};
-template <>
-struct is_integral<unsigned short> : true_type {};
-template <>
-struct is_integral<int> : true_type {};
-template <>
-struct is_integral<unsigned int> : true_type {};
-template <>
-struct is_integral<long> : true_type {};
-template <>
-struct is_integral<unsigned long> : true_type {};
-template <>
-struct is_integral<long long> : true_type {};
-template <>
-struct is_integral<unsigned long long> : true_type {};
-template <typename T>
-struct is_integral<volatile T> : is_integral<T> {};
-template <typename T>
-struct is_integral<const T> : is_integral<T> {};
-template <typename T>
-struct is_integral<const volatile T> : is_integral<T> {};
-
-/// std::is_floating_point
-template <typename T>
-struct is_floating_point
-    : integral_constant<bool,
-                        (is_same<float, typename remove_cv<T>::type>::value ||
-                         is_same<double, typename remove_cv<T>::type>::value)> {};
-
-/// std::is_fundamental
-template <typename T>
-struct is_fundamental
-    : integral_constant<bool,
-                        (is_arithmetic<T>::value || is_void<T>::value ||
-                         is_same<nullptr_t, typename remove_cv<T>::type>::value)> {};
-
-#else
-
-using std::is_volatile;
-using std::is_pointer;
-using std::is_integral;
-using std::is_floating_point;
-using std::is_fundamental;
-
-#endif
-
-#if defined(__CUDACC_RTC__) || (!defined(_MSC_VER) && (__cplusplus < 201103L)) || (defined(_MSC_VER) && (_MSC_VER < 1800)) || \
-    (defined(__GNUG__) && (__GNUC__ < 5))
-
-/**
-     * std::is_trivially_copyable
-     *
-     * This implementation only evaluates true if T is fundamental or pointer
-     *
-     * Without help from partial template specializations provided by the user for
-     * a specific class or struct, this trait will never report that the specified
-     * class or struct  is trivially-copyable ; this is always safe,
-     * if possibly sub-optimal.
-     */
-template <typename T>
-struct is_trivially_copyable
-    : integral_constant<bool, (is_fundamental<T>::value || is_pointer<T>::value)> {};
-
-#else
-
-using std::is_trivially_copyable;
-
-#endif
-
-#if (201703L <=__cplusplus)
-
-/// std::is_unsigned_v
-using CUTLASS_STL_NAMESPACE::is_integral_v;
-/// std::is_unsigned_v
-using CUTLASS_STL_NAMESPACE::is_unsigned_v;
-
-#endif
-
-//-----------------------------------------------------------------------------
-// <utility>
-//-----------------------------------------------------------------------------
-
-using CUTLASS_STL_NAMESPACE::declval;
-  
-//-----------------------------------------------------------------------------
-// bit_cast <bit>
-//-----------------------------------------------------------------------------
-
-template< class To, class From >
-constexpr To CUTLASS_HOST_DEVICE bit_cast(const From& from ) noexcept;
-
-template <class To, class From>
-constexpr To CUTLASS_HOST_DEVICE bit_cast(const From& src) noexcept
-{
-  static_assert(sizeof(To) == sizeof(From), "sizes must match");
-  return reinterpret_cast<To const &>(src);
-}
-
-//-----------------------------------------------------------------------------
-// Convertable
-//-----------------------------------------------------------------------------
-using CUTLASS_STL_NAMESPACE::is_convertible;
-using CUTLASS_STL_NAMESPACE::is_convertible_v;
-
-//-----------------------------------------------------------------------------
-// Alignment and layout utilities
-//-----------------------------------------------------------------------------
-
-#if defined(__CUDACC_RTC__) || (!defined(_MSC_VER) && (__cplusplus < 201103L)) || (defined(_MSC_VER) && (_MSC_VER < 1500))
-
-/// std::alignment_of
-template <typename value_t>
-struct alignment_of {
-  struct pad {
-    value_t val;
-    char byte;
-  };
-
-  enum { value = sizeof(pad) - sizeof(value_t) };
-};
-
-#else
-
-template <typename value_t>
-struct alignment_of : std::alignment_of<value_t> {};
-
-#endif
-
-/* 16B specializations where 32-bit Win32 host compiler disagrees with device compiler */
-template <>
-struct alignment_of<int4> {
-  enum { value = 16 };
-};
-template <>
-struct alignment_of<uint4> {
-  enum { value = 16 };
-};
-template <>
-struct alignment_of<float4> {
-  enum { value = 16 };
-};
-template <>
-struct alignment_of<long4> {
-  enum { value = 16 };
-};
-template <>
-struct alignment_of<ulong4> {
-  enum { value = 16 };
-};
-template <>
-struct alignment_of<longlong2> {
-  enum { value = 16 };
-};
-template <>
-struct alignment_of<ulonglong2> {
-  enum { value = 16 };
-};
-template <>
-struct alignment_of<double2> {
-  enum { value = 16 };
-};
-template <>
-struct alignment_of<longlong4> {
-  enum { value = 16 };
-};
-template <>
-struct alignment_of<ulonglong4> {
-  enum { value = 16 };
-};
-template <>
-struct alignment_of<double4> {
-  enum { value = 16 };
-};
-
-// Specializations for volatile/const qualified types
-template <typename value_t>
-struct alignment_of<volatile value_t> : alignment_of<value_t> {};
-template <typename value_t>
-struct alignment_of<const value_t> : alignment_of<value_t> {};
-template <typename value_t>
-struct alignment_of<const volatile value_t> : alignment_of<value_t> {};
-
-#if defined(__CUDACC_RTC__) || (!defined(_MSC_VER) && (__cplusplus < 201103L)) || (defined(_MSC_VER) && (_MSC_VER < 1800))
-
-template <size_t Align>
-struct aligned_chunk;
-template <>
-struct __align__(1) aligned_chunk<1> {
-  uint8_t buff;
-};
-template <>
-struct __align__(2) aligned_chunk<2> {
-  uint16_t buff;
-};
-template <>
-struct __align__(4) aligned_chunk<4> {
-  uint32_t buff;
-};
-template <>
-struct __align__(8) aligned_chunk<8> {
-  uint32_t buff[2];
-};
-template <>
-struct __align__(16) aligned_chunk<16> {
-  uint32_t buff[4];
-};
-template <>
-struct __align__(32) aligned_chunk<32> {
-  uint32_t buff[8];
-};
-template <>
-struct __align__(64) aligned_chunk<64> {
-  uint32_t buff[16];
-};
-template <>
-struct __align__(128) aligned_chunk<128> {
-  uint32_t buff[32];
-};
-template <>
-struct __align__(256) aligned_chunk<256> {
-  uint32_t buff[64];
-};
-template <>
-struct __align__(512) aligned_chunk<512> {
-  uint32_t buff[128];
-};
-template <>
-struct __align__(1024) aligned_chunk<1024> {
-  uint32_t buff[256];
-};
-template <>
-struct __align__(2048) aligned_chunk<2048> {
-  uint32_t buff[512];
-};
-template <>
-struct __align__(4096) aligned_chunk<4096> {
-  uint32_t buff[1024];
-};
-
-/// std::aligned_storage
-template <size_t Len, size_t Align>
-struct aligned_storage {
-  typedef aligned_chunk<Align> type[Len / sizeof(aligned_chunk<Align>)];
-};
-
-#else
-
-using std::aligned_storage;
-
-#endif
-
-#if !defined(__CUDACC_RTC__)
-/// Default deleter
-template <typename T>
-struct default_delete {
-  void operator()(T* ptr) const { delete ptr; }
-};
-
-/// Partial specialization for deleting array types
-template <typename T>
-struct default_delete<T[]> {
-  void operator()(T* ptr) const { delete[] ptr; }
-};
-
-/// std::unique_ptr
-template <class T, class Deleter = default_delete<T> >
-class unique_ptr {
- public:
-  typedef T* pointer;
-  typedef T element_type;
-  typedef Deleter deleter_type;
-
- private:
-  /// Pointer to memory
-  pointer _ptr;
-
-  /// Deleter
-  deleter_type _deleter;
-
- public:
-  unique_ptr() : _ptr(nullptr) {}
-  unique_ptr(pointer p) : _ptr(p) {}
-
-  ~unique_ptr() {
-    if (_ptr) {
-      _deleter(_ptr);
-    }
-  }
-  /// Returns a pointer to the managed object or nullptr if no object is owned.
-  pointer get() const noexcept { return _ptr; }
-
-  /// Releases ownership of the managed object, if any
-  pointer release() noexcept {
-    pointer p(_ptr);
-    _ptr = nullptr;
-    return p;
-  }
-
-  /// Replaces the managed object, deleting the old object.
-  void reset(pointer p = pointer()) noexcept {
-    pointer old_ptr = _ptr;
-    _ptr = p;
-    if (old_ptr != nullptr) {
-      get_deleter()(old_ptr);
-    }
-  }
-
-  /// Swaps the managed objects with *this and another unique_ptr
-  void swap(unique_ptr& other) noexcept { std::swap(_ptr, other._ptr); }
-
-  /// Returns the deleter object
-  Deleter& get_deleter() noexcept { return _deleter; }
-
-  /// Returns the deleter object
-  Deleter const& get_deleter() const noexcept { return _deleter; }
-
-  /// Checks whether an object is owned
-  operator bool() const noexcept { return _ptr != nullptr; }
-
-  /// Dereferences the unique_ptr
-  T& operator*() const { return *_ptr; }
-
-  /// Returns a pointer to the managed object
-  pointer operator->() const noexcept { return _ptr; }
-
-  /// Array access to managed object
-  T& operator[](size_t i) const { return _ptr[i]; }
-};
-
-/// Specializes the swap algorithm
-template <typename T, typename Deleter>
-void swap(unique_ptr<T, Deleter>& lhs, unique_ptr<T, Deleter>& rhs) noexcept {
-  lhs.swap(rhs);
-}
-#endif
-
-/// std::numeric_limits
-template <class T>
-struct numeric_limits;
-
-template <>
-struct numeric_limits<int32_t> {
-  CUTLASS_HOST_DEVICE
-  static constexpr int32_t lowest() noexcept { return -2147483647 - 1;}
-  CUTLASS_HOST_DEVICE
-  static constexpr int32_t max() noexcept { return 2147483647;}
-  static constexpr bool is_integer = true;
-  static constexpr bool has_infinity = false;
-};
-
-template <>
-struct numeric_limits<int16_t> {
-  CUTLASS_HOST_DEVICE
-  static constexpr int16_t lowest() noexcept { return -32768;}
-  CUTLASS_HOST_DEVICE
-  static constexpr int16_t max() noexcept { return 32767;}
-  static constexpr bool is_integer = true;
-  static constexpr bool has_infinity = false;
-};
-
-template <>
-struct numeric_limits<int8_t> {
-  CUTLASS_HOST_DEVICE
-  static constexpr int8_t lowest() noexcept { return -128;}
-  CUTLASS_HOST_DEVICE
-  static constexpr int8_t max() noexcept { return 127;}
-  static constexpr bool is_integer = true;
-  static constexpr bool has_infinity = false;
-};
-
-
-template <>
-struct numeric_limits<uint32_t> {
-  CUTLASS_HOST_DEVICE
-  static constexpr uint32_t lowest() noexcept { return 0;}
-  CUTLASS_HOST_DEVICE
-  static constexpr uint32_t max() noexcept { return 4294967295U;}
-  static constexpr bool is_integer = true;
-  static constexpr bool has_infinity = false;
-};
-
-template <>
-struct numeric_limits<uint16_t> {
-  CUTLASS_HOST_DEVICE
-  static constexpr uint16_t lowest() noexcept { return 0;}
-  CUTLASS_HOST_DEVICE
-  static constexpr uint16_t max() noexcept { return 65535U;}
-  static constexpr bool is_integer = true;
-  static constexpr bool has_infinity = false;
-};
-
-template <>
-struct numeric_limits<uint8_t> {
-  CUTLASS_HOST_DEVICE
-  static constexpr uint8_t lowest() noexcept { return 0;}
-  CUTLASS_HOST_DEVICE
-  static constexpr uint8_t max() noexcept { return 255U;}
-  static constexpr bool is_integer = true;
-  static constexpr bool has_infinity = false;
-};
-
-template <>
-struct numeric_limits<float> {
-  CUTLASS_HOST_DEVICE
-  static constexpr float infinity() noexcept { return bit_cast<float, int32_t>(0x7f800000);}
-  CUTLASS_HOST_DEVICE
-  static constexpr float max() noexcept { return bit_cast<float, int32_t>(0x7f7fffff);}
-  static constexpr bool is_integer = false;
-  static constexpr bool has_infinity = true;
-};
-
-/// Returns a value that curries the `std::maximum()` function into the identity
-/// function. No value will compare < than this value.
-template <typename T>
-constexpr T identity_for_maximum() {
-  if constexpr (numeric_limits<T>::has_infinity) {
-    return -numeric_limits<T>::infinity();
-  } else {
-    return numeric_limits<T>::lowest();
-  }
-}
-
-/// Returns a value that curries the `std::minimum()` function into the identity
-/// function. No value will compare > than this value.
-template <typename T>
-constexpr T identity_for_minimum() {
-  if constexpr (numeric_limits<T>::has_infinity) {
-    return numeric_limits<T>::infinity();
-  } else {
-    return numeric_limits<T>::max();
-  }
-}
-
-/// std::float_round_style
-using CUTLASS_STL_NAMESPACE::float_round_style;
-using CUTLASS_STL_NAMESPACE::round_indeterminate;
-using CUTLASS_STL_NAMESPACE::round_toward_zero;
-using CUTLASS_STL_NAMESPACE::round_to_nearest;
-using CUTLASS_STL_NAMESPACE::round_toward_infinity;
-using CUTLASS_STL_NAMESPACE::round_toward_neg_infinity;
-
-/// std::float_denorm_style
-using CUTLASS_STL_NAMESPACE::float_denorm_style;
-using CUTLASS_STL_NAMESPACE::denorm_indeterminate;
-using CUTLASS_STL_NAMESPACE::denorm_absent;
-using CUTLASS_STL_NAMESPACE::denorm_present;
-
-}  // namespace platform
-}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/predicate_vector.h b/lightllm-kernel/cutlass/include/cutlass/predicate_vector.h
deleted file mode 100755
index aa4e3f1a1..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/predicate_vector.h
+++ /dev/null
@@ -1,547 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines container classes and iterators for managing a statically sized vector
-      of boolean predicates.
-*/
-#pragma once
-
-#if defined(__CUDACC_RTC__)
-#include <cuda/std/cassert>
-#include <cuda/std/cstdint>
-#else
-#include <assert.h>
-#include <stdint.h>
-#endif
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/platform/platform.h"
-
-namespace cutlass {
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/*!@defgroup predicate_vector_concept Predicate Vector Concept
-@{
-
-Implementations of \ref predicate_vector_concept contain an ordered set of boolean predicates which
-may be used as conditionals in other device-side operations. Both random access and iterators
-offering sequential access are provided.
-
-@par Predicate Vector
-   A \ref predicate_vector_concept satisfies the following expressions
-  - <b>at(int idx)</b> - returns the value of the indexed predicate
-  - <b>set(int idx, bool value)</b> - sets the value of the indexed predicate
-  - <b>begin()</b> - returns a \ref predicate_iterator_concept pointing to the first predicate
-
-@}
-*/
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/*!@defgroup predicate_iterator_concept Predicate Iterator Concept
-@{
-
-Implementations of \ref predicate_iterator_concept enables accessing and traversing elements of a
-bit vector.
-
-@par Const Predicate Iterator
-  A const \ref predicate_iterator_concept satisfies the following expressions
- - <b>++it</b> increments the iterator to the next predicate
- - <b>*it</b> returns the value of the currently pointed-to predicate
-
-@par Mutable Predicate Iterator
- A \ref predicate_iterator_concept that is non-const <b>also</b> satisfies the following expressions
- - <b>it.set(bool value)</b> sets the value of the currently pointed-to predicate
-
-@}
-*/
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/*!@defgroup predicate_tile_adapter Predicate Tile Adapter Concept
-@{
-
-Implementations of \ref predicate_tile_adapter provide a mapping between a the elements of a \ref
-tile_traits_concept and a \ref predicate_vector_concept.
-
-@par Predicate Tile Adapter
-  A \ref predicate_tile_adapter satisfies the following expressions
- - <b>at(int d, int h, int w, int c)</b> - returns the value of a predicate corresponding to the
-   access (d, h, w, c) within the tile.
-
-@}
-*/
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Statically sized array of bits implementing @concept{predicate_vector_concept}.
-template <
-    /// Number of predicates contained in predicate vector
-    int kPredicates_,
-    /// Number of predicates contained in each byte of internal storage
-    int kPredicatesPerByte_ = 4,
-    /// Location of first predicate within byte of internal storage
-    int kPredicateStart_ = 0>
-struct PredicateVector {
-  /// Number of bits stored by the PredicateVector
-  static constexpr int kPredicates = kPredicates_;
-
-  /// Number of bits stored within each byte of the predicate bit vector
-  static constexpr int kPredicatesPerByte = kPredicatesPerByte_;
-
-  /// First bit within each byte containing predicates
-  static constexpr int kPredicateStart = kPredicateStart_;
-
-  // Make sure no one tries to put more than 8 bits in a byte :)
-  static_assert(kPredicatesPerByte <= 8, "kPredicatesPerByte must fit within an actual byte");
-  // Make sure the "offsetted" bits fit in one byte.
-  static_assert(kPredicateStart + kPredicatesPerByte <= 8,
-                "The offsetted predicates must fit within an actual byte.");
-
-  /// Storage type of individual elements
-  typedef uint32_t Storage;
-
-  /// Number of bytes needed
-  static constexpr int kBytes = (kPredicates + kPredicatesPerByte - 1) / kPredicatesPerByte;
-
-  /// Number of storage elements needed
-  static constexpr int kWordCount = (kBytes + int(sizeof(Storage)) - 1) / int(sizeof(Storage));
-
-  /// The byte mask corresponding to predicates
-  static constexpr Storage kByteMask = (((1 << kPredicatesPerByte) - 1) << kPredicateStart);
-
- private:
-  //
-  // Data members
-  //
-
-  /// Words of bit vector
-  Storage storageData[kWordCount];
-
-  //
-  // Methods
-  //
-
-  /// Computes the word and bit corresponding to a logical predicate index
-  CUTLASS_HOST_DEVICE void computeStorageOffset(int &word, int &bit, int idx) const {
-    CUTLASS_ASSERT(idx < kPredicates);
-
-    int byte = (idx / kPredicatesPerByte);
-    int bit_offset = (idx % kPredicatesPerByte);
-
-    word = byte / sizeof(Storage);
-    int byte_offset = (byte % sizeof(Storage));
-
-    bit = byte_offset * 8 + bit_offset + kPredicateStart;
-  }
-
-  /// Returns word mask.
-  CUTLASS_HOST_DEVICE static constexpr bool computeWordMask() {
-    Storage mask(0);
-    CUTLASS_PRAGMA_UNROLL
-    for (size_t byte = 0; byte < sizeof(Storage); ++byte) {
-      mask |= (kByteMask << (byte * 8));
-    }
-    return mask;
-  }
-
-  /// Returns mask of last word.
-  CUTLASS_HOST_DEVICE static constexpr bool computeLastWordMask() {
-    Storage mask(0);
-    CUTLASS_PRAGMA_UNROLL
-    for (int byte = 0; byte < kBytes % sizeof(Storage); ++byte) {
-      mask |= (kByteMask << (byte * 8));
-    }
-    return mask;
-  }
-
-  /// Accesses a given word with optional assertions
-  CUTLASS_HOST_DEVICE Storage &storage(int word) {
-    CUTLASS_ASSERT(word < kWordCount);
-    return storageData[word];
-  }
-
-  /// Accesses a given word with optional assertions
-  CUTLASS_HOST_DEVICE Storage const &storage(int word) const {
-    CUTLASS_ASSERT(word < kWordCount);
-    return storageData[word];
-  }
-
- public:
-  //
-  // Iterator
-  //
-
-  /**
-  * @brief An iterator implementing \ref predicate_iterator_concept enabling sequential
-  * read and write access to predicates.
-  * @concept{predicate_iterator_concept}
-  */
-  class Iterator {
-    /// Reference to PredicateVector instance
-    PredicateVector &vec_;
-
-    /// Index into PredicateVector
-    int bit_;
-
-   public:
-    /// Copy constructor
-    CUTLASS_HOST_DEVICE
-    Iterator(Iterator const &it) : vec_(it.vec_), bit_(it.bit_) {}
-
-    /// Constructs an iterator from a PredicateVector
-    CUTLASS_HOST_DEVICE
-    Iterator(PredicateVector &vec, int _start = 0) : vec_(vec), bit_(_start) {}
-
-    /// Pre-increment
-    CUTLASS_HOST_DEVICE
-    Iterator &operator++() {
-      ++bit_;
-      return *this;
-    }
-
-    /// Increment
-    CUTLASS_HOST_DEVICE
-    Iterator &operator+=(int offset) {
-      bit_ += offset;
-      return *this;
-    }
-
-    /// Pre-decrement
-    CUTLASS_HOST_DEVICE
-    Iterator &operator--() {
-      --bit_;
-      return *this;
-    }
-
-    /// Decrement
-    CUTLASS_HOST_DEVICE
-    Iterator &operator-=(int offset) {
-      bit_ -= offset;
-      return *this;
-    }
-
-    /// Post-increment
-    CUTLASS_HOST_DEVICE
-    Iterator operator++(int) {
-      Iterator ret(*this);
-      ret.bit_++;
-      return ret;
-    }
-
-    /// Post-decrement
-    CUTLASS_HOST_DEVICE
-    Iterator operator--(int) {
-      Iterator ret(*this);
-      ret.bit_--;
-      return ret;
-    }
-
-    /// Iterator advances by some amount
-    CUTLASS_HOST_DEVICE
-    Iterator operator+(int offset) {
-      Iterator ret(*this);
-      ret.bit_ += offset;
-      return ret;
-    }
-
-    /// Iterator recedes by some amount
-    CUTLASS_HOST_DEVICE
-    Iterator operator-(int offset) {
-      ConstIterator ret(*this);
-      ret.bit_ -= offset;
-      return ret;
-    }
-
-    /// Returns true if iterators point to the same bit
-    CUTLASS_HOST_DEVICE
-    bool operator==(Iterator const &it) const { return bit_ == it.bit_; }
-
-    /// Returns false if iterators point to the same bit
-    CUTLASS_HOST_DEVICE
-    bool operator!=(Iterator const &it) const { return bit_ != it.bit_; }
-
-    /// Gets the bit at the pointed to location
-    CUTLASS_HOST_DEVICE
-    bool get() { return vec_.at(bit_); }
-
-    /// Gets the bit at the pointed to location
-    CUTLASS_HOST_DEVICE
-    bool at() const { return vec_.at(bit_); }
-
-    /// Dereferences iterator
-    CUTLASS_HOST_DEVICE
-    bool operator*() const { return at(); }
-
-    /// Sets the bit at the pointed to location
-    CUTLASS_HOST_DEVICE
-    void set(bool value = true) { vec_.set(bit_, value); }
-  };
-
-  /**
-  * @brief An iterator implementing \ref predicate_iterator_concept enabling sequential
-  * read and write access to predicates.
-  * @concept{predicate_iterator_concept}
-  */
-  class ConstIterator {
-    /// Reference to PredicateVector instance
-    PredicateVector const &vec_;
-
-    /// Index into PredicateVector
-    int bit_;
-
-   public:
-    /// Copy constructor
-    CUTLASS_HOST_DEVICE
-    ConstIterator(ConstIterator const &it) : vec_(it.vec_), bit_(it.bit_) {}
-
-    /// Constructs an iterator from a PredicateVector
-    CUTLASS_HOST_DEVICE
-    ConstIterator(PredicateVector const &vec, int _start = 0) : vec_(vec), bit_(_start) {}
-
-    /// Pre-increment
-    CUTLASS_HOST_DEVICE
-    ConstIterator &operator++() {
-      ++bit_;
-      return *this;
-    }
-
-    /// Increment
-    CUTLASS_HOST_DEVICE
-    ConstIterator &operator+=(int offset) {
-      bit_ += offset;
-      return *this;
-    }
-
-    /// Pre-decrement
-    CUTLASS_HOST_DEVICE
-    ConstIterator &operator--() {
-      --bit_;
-      return *this;
-    }
-
-    /// Decrement
-    CUTLASS_HOST_DEVICE
-    ConstIterator &operator-=(int offset) {
-      bit_ -= offset;
-      return *this;
-    }
-
-    /// Post-increment
-    CUTLASS_HOST_DEVICE
-    ConstIterator operator++(int) {
-      ConstIterator ret(*this);
-      ret.bit_++;
-      return ret;
-    }
-
-    /// Post-decrement
-    CUTLASS_HOST_DEVICE
-    ConstIterator operator--(int) {
-      ConstIterator ret(*this);
-      ret.bit_--;
-      return ret;
-    }
-
-    /// Iterator advances by some amount
-    CUTLASS_HOST_DEVICE
-    ConstIterator operator+(int offset) {
-      ConstIterator ret(*this);
-      ret.bit_ += offset;
-      return ret;
-    }
-
-    /// Iterator recedes by some amount
-    CUTLASS_HOST_DEVICE
-    ConstIterator operator-(int offset) {
-      ConstIterator ret(*this);
-      ret.bit_ -= offset;
-      return ret;
-    }
-
-    /// Returns true if iterators point to the same bit
-    CUTLASS_HOST_DEVICE
-    bool operator==(ConstIterator const &it) const { return bit_ == it.bit_; }
-
-    /// Returns false if iterators point to the same bit
-    CUTLASS_HOST_DEVICE
-    bool operator!=(ConstIterator const &it) const { return bit_ != it.bit_; }
-
-    /// Gets the bit at the pointed to location
-    CUTLASS_HOST_DEVICE
-    bool get() { return vec_.at(bit_); }
-
-    /// Gets the bit at the pointed to location
-    CUTLASS_HOST_DEVICE
-    bool at() const { return vec_.at(bit_); }
-
-    /// Dereferences iterator
-    CUTLASS_HOST_DEVICE
-    bool operator*() const { return at(); }
-  };
-
-  /// Iterator that always returns true
-  struct TrivialIterator {
-    /// Constructor
-    CUTLASS_HOST_DEVICE
-    TrivialIterator() {}
-
-    /// Copy constructor
-    CUTLASS_HOST_DEVICE
-    TrivialIterator(Iterator const &it) {}
-
-    /// Constructs an iterator from a PredicateVector
-    CUTLASS_HOST_DEVICE
-    TrivialIterator(PredicateVector const &_vec) {}
-
-    /// Pre-increment
-    CUTLASS_HOST_DEVICE
-    TrivialIterator &operator++() { return *this; }
-
-    /// Post-increment
-    CUTLASS_HOST_DEVICE
-    TrivialIterator operator++(int) { return *this; }
-
-    /// Dereferences iterator
-    CUTLASS_HOST_DEVICE
-    bool operator*() const { return true; }
-  };
-
- public:
-  //
-  // Methods
-  //
-
-  /// Initialize the predicate vector
-  CUTLASS_HOST_DEVICE PredicateVector(bool value = true) { fill(value); }
-
-  /// Fills all predicates with a given value
-  CUTLASS_HOST_DEVICE void fill(bool value = true) {
-    Storage item = (value ? ~Storage(0) : Storage(0));
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kWordCount; ++i) {
-      storage(i) = item;
-    }
-  }
-
-  /// Clears all predicates
-  CUTLASS_HOST_DEVICE void clear() {
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kWordCount; ++i) {
-      storage(i) = 0;
-    }
-  }
-
-  /// Sets all predicates to true
-  CUTLASS_HOST_DEVICE void enable() {
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kWordCount; ++i) {
-      storage(i) = ~Storage(0);
-    }
-  }
-
-  /// Accesses a bit within the predicate vector.
-  CUTLASS_HOST_DEVICE bool operator[](int idx) const { return at(idx); }
-
-  /// Accesses a bit within the predicate vector.
-  CUTLASS_HOST_DEVICE bool at(int idx) const {
-    int bit, word;
-    computeStorageOffset(word, bit, idx);
-
-    return ((storage(word) >> bit) & 1);
-  }
-
-  /// Set a bit within the predicate vector.
-  CUTLASS_HOST_DEVICE void set(int idx, bool value = true) {
-    int bit, word;
-    computeStorageOffset(word, bit, idx);
-
-    Storage disable_mask = (~(Storage(1) << bit));
-    Storage enable_mask = (Storage(value) << bit);
-
-    storage(word) = ((storage(word) & disable_mask) | enable_mask);
-  }
-
-  /// Computes the intersection of two identical predicate vectors.
-  CUTLASS_HOST_DEVICE PredicateVector &operator&=(PredicateVector const &predicates) {
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kWordCount; ++i) {
-      storage(i) = (storage(i) & predicates.storage(i));
-    }
-    return *this;
-  }
-
-  /// Computes the union of two identical predicate vectors.
-  CUTLASS_HOST_DEVICE PredicateVector &operator|=(PredicateVector const &predicates) {
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kWordCount; ++i) {
-      storage(i) = (storage(i) | predicates.storage(i));
-    }
-    return *this;
-  }
-
-  /// Returns true if entire predicate array is zero.
-  CUTLASS_HOST_DEVICE bool is_zero() const {
-   constexpr Storage mask = computeWordMask();
-    Storage result = 0;
-    CUTLASS_PRAGMA_UNROLL
-    for (int word = 0; word < kWordCount - 1; ++word) {
-      result |= (storage(word) & mask);
-    }
-    constexpr Storage last_word_mask = computeLastWordMask();
-    result |= (storage(kWordCount - 1) & last_word_mask);
-    
-    return result == 0;
-  }
-
-  /// Returns an iterator to the start of the bit vector
-  CUTLASS_DEVICE
-  Iterator begin() { return Iterator(*this); }
-
-  /// Returns an iterator
-  CUTLASS_DEVICE
-  Iterator end() { return Iterator(*this, kPredicates); }
-
-  /// Returns a ConstIterator
-  CUTLASS_DEVICE
-  ConstIterator const_begin() const { return ConstIterator(*this); }
-
-  /// Returns a ConstIterator
-  CUTLASS_DEVICE
-  ConstIterator const_end() const { return ConstIterator(*this, kPredicates); }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/quaternion.h b/lightllm-kernel/cutlass/include/cutlass/quaternion.h
deleted file mode 100755
index b31df4557..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/quaternion.h
+++ /dev/null
@@ -1,752 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines a densely packed quaternion object intended for storing data in registers and
-    executing quaternion operations within a CUDA or host thread.
-*/
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/functional.h"
-#include "cutlass/array.h"
-#include "cutlass/real.h"
-#include "cutlass/coord.h"
-#include "cutlass/matrix.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/layout/vector.h"
-
-namespace cutlass {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Quaternion: xi + yj + zk + w
-template <
-  typename Element_ = float      ///< element type
->
-class Quaternion : public Array<Element_, 4> {
-public:
-
-  /// Logical rank of tensor index space
-  static int const kRank = 1;
-
-  /// Number of elements
-  static int const kExtent = 4;
-
-  /// Base class is a four-element array
-  using Base = Array<Element_, kExtent>;
-
-  /// Element type
-  using Element = typename Base::Element;
-
-  /// Reference type to an element
-  using Reference = typename Base::reference;
-
-  /// Index type
-  using Index = int;
-
-  /// Quaternion storage - imaginary part
-  static int const kX = 0;
-
-  /// Quaternion storage - imaginary part
-  static int const kY = 1;
-
-  /// Quaternion storage - imaginary part
-  static int const kZ = 2;
-
-  /// Quaternion storage - real part
-  static int const kW = 3;
-
-public:
-
-  //
-  // Methods
-  //
-
-  /// Constructs a quaternion q = 0
-  CUTLASS_HOST_DEVICE
-  Quaternion() {
-    Base::at(kX) = Element();
-    Base::at(kY) = Element();
-    Base::at(kZ) = Element();
-    Base::at(kW) = Element();
-  }
-
-  /// Constructs a quaternion q = w + 0*i + 0*j + 0*k
-  CUTLASS_HOST_DEVICE
-  Quaternion(
-    Element w_
-  ) {
-    Base::at(kX) = Element();
-    Base::at(kY) = Element();
-    Base::at(kZ) = Element();
-    Base::at(kW) = w_;
-  }
-
-  /// Constructs a quaternion q = w + x*i + y*j + z*k
-  CUTLASS_HOST_DEVICE
-  Quaternion(
-    Element x_,
-    Element y_,
-    Element z_,
-    Element w_
-  ) {
-    Base::at(kX) = x_;
-    Base::at(kY) = y_;
-    Base::at(kZ) = z_;
-    Base::at(kW) = w_;
-  }
-
-  /// Constructs a quaternion from a vector representing the imaginary part and a real number
-  CUTLASS_HOST_DEVICE
-  Quaternion(
-    Matrix3x1<Element> const &imag_,
-    Element w_ = Element()
-  ) {
-    Base::at(kX) = imag_[0];
-    Base::at(kY) = imag_[1];
-    Base::at(kZ) = imag_[2];
-    Base::at(kW) = w_;
-  }
-
-  /// Returns a reference to the element at a given Coord
-  CUTLASS_HOST_DEVICE
-  Reference at(Index idx) const {
-    return Base::at(idx);
-  }
-
-  /// Returns a reference to the element at a given Coord
-  CUTLASS_HOST_DEVICE
-  Reference at(Index idx) {
-    return Base::at(idx);
-  }
-
-  /// Accesses the x element of the imaginary part of the quaternion
-  CUTLASS_HOST_DEVICE
-  Element x() const {
-    return Base::at(kX);
-  }
-
-  /// Accesses the x element of the imaginary part of the quaternion
-  CUTLASS_HOST_DEVICE
-  Reference x() {
-    return Base::at(kX);
-  }
-
-  /// Accesses the y element of the imaginary part of the quaternion
-  CUTLASS_HOST_DEVICE
-  Element y() const {
-    return Base::at(kY);
-  }
-
-  /// Accesses the y element of the imaginary part of the quaternion
-  CUTLASS_HOST_DEVICE
-  Reference y() {
-    return Base::at(kY);
-  }
-
-  /// Accesses the z element of the imaginary part of the quaternion
-  CUTLASS_HOST_DEVICE
-  Element z() const {
-    return Base::at(kZ);
-  }
-
-  /// Accesses the z element of the imaginary part of the quaternion
-  CUTLASS_HOST_DEVICE
-  Reference z() {
-    return Base::at(kZ);
-  }
-
-  /// Accesses the real part of the quaternion
-  CUTLASS_HOST_DEVICE
-  Element w() const {
-    return Base::at(kW);
-  }
-
-  /// Accesses the real part of the quaternion
-  CUTLASS_HOST_DEVICE
-  Reference w() {
-    return Base::at(kW);
-  }
-
-  /// Returns the pure imaginary part of the quaternion as a 3-vector
-  CUTLASS_HOST_DEVICE
-  Matrix3x1<Element> pure() const {
-    return Matrix3x1<Element>(x(), y(), z());
-  }
-
-  /// Returns a quaternion representation of a spatial rotation given a unit-length axis and
-  /// a rotation in radians.
-  CUTLASS_HOST_DEVICE
-  static Quaternion<Element> rotation(
-    Matrix3x1<Element> const &axis_unit,    ///< axis of rotation (assumed to be unit length)
-    Element theta) {                        ///< angular rotation in radians
-
-    Element s = fast_sin(theta / Element(2));
-
-    return Quaternion(
-      s * axis_unit[0],
-      s * axis_unit[1],
-      s * axis_unit[2],
-      fast_cos(theta / Element(2))
-    );
-  }
-  
-  /// Returns a quaternion representation of a spatial rotation represented as a
-  /// unit-length rotation axis (r_x, r_y, r_z) and an angular rotation in radians
-  CUTLASS_HOST_DEVICE
-  static Quaternion<Element> rotation(
-    Element r_x,
-    Element r_y,
-    Element r_z,
-    Element theta) {                      ///< angular rotation in radians
-
-    return rotation({r_x, r_y, r_z}, theta);
-  }
-
-  /// Geometric rotation of a 3-element vector
-  CUTLASS_HOST_DEVICE
-  Matrix3x1<Element> rotate(Matrix3x1<Element> const &rhs) const {
-    return (*this * Quaternion<Element>(rhs, 0) * reciprocal(*this)).pure();
-  }
-
-  /// Inverse rotation operation
-  CUTLASS_HOST_DEVICE
-  Matrix3x1<Element> rotate_inv(Matrix3x1<Element> const &rhs) const {
-    return (reciprocal(*this) * Quaternion<Element>(rhs, 0) * *this).pure();
-  }
-
-  /// Rotates a 3-vector assuming this is a unit quaternion (a spinor)
-  CUTLASS_HOST_DEVICE
-  Matrix3x1<Element> spinor(Matrix3x1<Element> const &rhs) const {
-    return (*this * Quaternion<Element>(rhs, 0) * conj(*this)).pure();
-  }
-
-  /// Inverse rotation of 3-vector assuming this is a unit quaternion (a spinor)
-  CUTLASS_HOST_DEVICE
-  Matrix3x1<Element> spinor_inv(Matrix3x1<Element> const &rhs) const {
-    return (conj(*this) * Quaternion<Element>(rhs, 0) * *this).pure();
-  }
-
-  /// In-place addition
-  template <typename Element>
-  CUTLASS_HOST_DEVICE 
-  Quaternion<Element> &operator+=(Quaternion<Element> const &rhs) {
-    *this = (*this + rhs);
-    return *this;
-  }
-
-  /// In-place subtraction
-  template <typename Element>
-  CUTLASS_HOST_DEVICE
-  Quaternion<Element> &operator-=(Quaternion<Element> const &rhs) {
-    *this = (*this - rhs);
-    return *this;
-  }
-
-  /// In-place multiplication
-  template <typename T>
-  CUTLASS_HOST_DEVICE
-  Quaternion<Element> &operator*=(Quaternion<Element> const &rhs) {
-    *this = (*this * rhs);
-    return *this;
-  }
-
-  /// Scalar multiplication
-  template <typename T>
-  CUTLASS_HOST_DEVICE
-  Quaternion<Element> &operator*=(Element s) {
-    *this = (*this * s);
-    return *this;
-  }
-
-  /// In-place Division
-  template <typename T>
-  CUTLASS_HOST_DEVICE
-  Quaternion<Element> &operator/=(Quaternion<Element> const &rhs) {
-    *this = (*this / rhs);
-    return *this;
-  }
-
-  /// In-place Division
-  template <typename T>
-  CUTLASS_HOST_DEVICE
-  Quaternion<Element> &operator/=(Element s) {
-    *this = (*this / s);
-    return *this;
-  }
-
-  /// Computes a 3x3 rotation matrix (row-major representation)
-  CUTLASS_HOST_DEVICE
-  Matrix3x3<Element> as_rotation_matrix_3x3() const {
-    Matrix3x3<Element> m(
-      w() * w() + x() * x() - y() * y() - z() * z(),
-      2 * x() * y() - 2 * w() * z(),
-      2 * x() * z() + 2 * w() * y(),
-
-      2 * x() * y() + 2 * w() * z(),
-      w() * w() - x() * x() + y() * y() - z() * z(),
-      2 * y() * z() - 2 * w() * x(),
-
-      2 * x() * z() - 2 * w() * y(),
-      2 * y() * z() + 2 * w() * x(),
-      w() * w() - x() * x() - y() * y() + z() * z()
-    );
-    return m;
-  }
-
-  /// Computes a 4x4 rotation matrix (row-major representation)
-  CUTLASS_HOST_DEVICE
-  Matrix4x4<Element> as_rotation_matrix_4x4() const {
-    Matrix4x4<Element> m = Matrix4x4<Element>::identity();
-    m.set_slice_3x3(as_rotation_matrix_3x3());
-    return m;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Constructs a quaternion that is non-zero only in its real element.
-template <typename Element>
-CUTLASS_HOST_DEVICE
-Quaternion<Element> make_Quaternion(
-  Element w) {                                ///< real part
-
-  return Quaternion<Element>(w);
-}
-
-/// Constructs a quaternion from a vector and real
-template <typename Element>
-CUTLASS_HOST_DEVICE
-Quaternion<Element> make_Quaternion(
-  Matrix3x1<Element> const &imag,             ///< imaginary party as a vector
-  Element w) {                                ///< real part
-
-  return Quaternion<Element>(imag, w);
-}
-
-/// Constructs a quaternion from a unit-length rotation axis and a rotation 
-/// angle in radians
-template <typename Element>
-CUTLASS_HOST_DEVICE
-Quaternion<Element> make_QuaternionRotation(
-  Matrix3x1<Element> const &axis_unit,        ///< rotation axis (unit-length)
-  Element w) {                                ///< rotation angle in radians
-
-  return Quaternion<Element>::rotation(axis_unit, w);
-}
-
-/// Constructs a quaternion q = xi + yj + zk + w
-template <typename Element>
-CUTLASS_HOST_DEVICE
-Quaternion<Element> make_Quaternion(Element x, Element y, Element z, Element w) {
-  return Quaternion<Element>(x, y, z, w);
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Returns the real part of the quaternion number
-template <typename Element>
-CUTLASS_HOST_DEVICE 
-Element const &real(Quaternion<Element> const &q) {
-  return q.w();
-}
-
-/// Returns the real part of the quaternion number
-template <typename Element>
-CUTLASS_HOST_DEVICE
-Element &real(Quaternion<Element> &q) {
-  return q.w();
-}
-
-/// Returns the magnitude of the quaternion number
-template <typename Element>
-CUTLASS_HOST_DEVICE
-Element abs(Quaternion<Element> const &q) {
-  return fast_sqrt(norm(q));
-}
-
-/// Quaternion conjugate
-template <typename Element>
-CUTLASS_HOST_DEVICE
-Quaternion<Element> conj(Quaternion<Element> const &q) {
-  return make_Quaternion(
-    -q.x(),
-    -q.y(),
-    -q.z(),
-    q.w()
-  );
-}
-
-/// Computes the squared magnitude of the quaternion
-template <typename Element>
-CUTLASS_HOST_DEVICE
-Element norm(Quaternion<Element> const &q) {
-  return q.x() * q.x() + q.y() * q.y() + q.z() * q.z() + q.w() * q.w();
-}
-
-/// Quaternion reciprocal
-template <typename Element>
-CUTLASS_HOST_DEVICE
-Quaternion<Element> reciprocal(Quaternion<Element> const &q) {
-  
-  Element nsq = norm(q);
-  
-  return make_Quaternion(
-    -q.x() / nsq,
-    -q.y() / nsq,
-    -q.z() / nsq,
-    q.w() / nsq
-  );
-}
-
-/// Returns a unit-length quaternion
-template <typename Element>
-CUTLASS_HOST_DEVICE
-Quaternion<Element> unit(Quaternion<Element> const &q) {
-  
-  Element rcp_mag = Element(1) / abs(q);
-  
-  return make_Quaternion(
-    q.x() * rcp_mag,
-    q.y() * rcp_mag,
-    q.z() * rcp_mag,
-    q.w() * rcp_mag
-  );
-}
-
-/// Quaternion exponential
-template <typename Element>
-CUTLASS_HOST_DEVICE
-Quaternion<Element> exp(Quaternion<Element> const &q) {
-  
-  Element exp_ = fast_exp(q.w());
-  Element imag_norm = fast_sqrt(q.x() * q.x() + q.y() * q.y() + q.z() * q.z());
-  Element sin_norm = fast_sin(imag_norm);
-
-  return make_Quaternion(
-    exp_ * q.x() * sin_norm / imag_norm,
-    exp_ * q.y() * sin_norm / imag_norm,
-    exp_ * q.z() * sin_norm / imag_norm,
-    exp_ * fast_cos(imag_norm)
-  );
-}
-
-/// Quaternion natural logarithm
-template <typename Element>
-CUTLASS_HOST_DEVICE
-Quaternion<Element> log(Quaternion<Element> const &q) {
-  
-  Element v = fast_sqrt(q.x() * q.x() + q.y() * q.y() + q.z() * q.z());
-  Element s = fast_acos(q.w() / abs(q)) / v;
-  
-  return make_Quaternion(
-    q.x() * s,
-    q.y() * s,
-    q.z() * s,
-    fast_log(q.w())
-  );
-}
-
-/// Gets the rotation angle from a unit-length quaternion
-template <typename Element>
-CUTLASS_HOST_DEVICE
-Element get_rotation_angle(Quaternion<Element> const &q_unit) {
-  return fast_acos(q_unit.w()) * Element(2);
-}
-
-/// Gets the rotation axis from a unit-length quaternion
-template <typename Element>
-CUTLASS_HOST_DEVICE
-Matrix3x1<Element> get_rotation_axis(Quaternion<Element> const &q_unit) {
-  return q_unit.pure().unit();
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Equality operator
-template <typename Element>
-CUTLASS_HOST_DEVICE 
-bool operator==(Quaternion<Element> const &lhs, Quaternion<Element> const &rhs) {
-  return lhs.x() == rhs.x() &&
-    lhs.y() == rhs.y() &&
-    lhs.z() == rhs.z() &&
-    lhs.w() == rhs.w();
-}
-
-/// Inequality operator
-template <typename Element>
-CUTLASS_HOST_DEVICE 
-bool operator!=(Quaternion<Element> const &lhs, Quaternion<Element> const &rhs) {
-  return !(lhs == rhs);
-}
-
-/// Quaternion scalar multiplication
-template <typename Element>
-CUTLASS_HOST_DEVICE
-Quaternion<Element> operator*(Quaternion<Element> q, Element s) {
-  return make_Quaternion(
-    q.x() * s,
-    q.y() * s,
-    q.z() * s,
-    q.w() * s
-  );
-}
-
-/// Quaternion scalar multiplication
-template <typename Element>
-CUTLASS_HOST_DEVICE
-Quaternion<Element> operator*(Element s, Quaternion<Element> const &q) {
-  return make_Quaternion(
-    s * q.x(),
-    s * q.y(),
-    s * q.z(),
-    s * q.w()
-  );
-}
-
-/// Quaternion scalar division
-template <typename Element>
-CUTLASS_HOST_DEVICE
-Quaternion<Element> operator/(Quaternion<Element> const &q, Element s) {
-  return make_Quaternion(
-    q.x() / s,
-    q.y() / s,
-    q.z() / s,
-    q.w() / s
-  );
-}
-
-/// Quaternion unary negation
-template <typename Element>
-CUTLASS_HOST_DEVICE
-Quaternion<Element> operator-(Quaternion<Element> const &q) {
-  return make_Quaternion(
-    -q.x(),
-    -q.y(),
-    -q.z(),
-    -q.w()
-  );
-}
-
-/// Quaternion addition
-template <typename Element>
-CUTLASS_HOST_DEVICE
-Quaternion<Element> operator+(Quaternion<Element> const &lhs, Quaternion<Element> const &rhs) {
-  return make_Quaternion(
-    lhs.x() + rhs.x(), 
-    lhs.y() + rhs.y(), 
-    lhs.z() + rhs.z(), 
-    lhs.w() + rhs.w()
-  );
-}
-
-/// Quaternion subtraction
-template <typename Element>
-CUTLASS_HOST_DEVICE
-Quaternion<Element> operator-(Quaternion<Element> const &lhs, Quaternion<Element> const &rhs) {
-  return make_Quaternion(
-    lhs.x() - rhs.x(), 
-    lhs.y() - rhs.y(), 
-    lhs.z() - rhs.z(), 
-    lhs.w() - rhs.w()
-  );
-}
-
-/// Quaternion product
-template <typename Element>
-CUTLASS_HOST_DEVICE
-Quaternion<Element> operator*(Quaternion<Element> const &lhs, Quaternion<Element> const &rhs) {
-  return make_Quaternion(
-    lhs.w() * rhs.x() + rhs.w() * lhs.x() + lhs.y() * rhs.z() - lhs.z() * rhs.y(),
-    lhs.w() * rhs.y() + rhs.w() * lhs.y() + lhs.z() * rhs.x() - lhs.x() * rhs.z(),
-    lhs.w() * rhs.z() + rhs.w() * lhs.z() + lhs.x() * rhs.y() - lhs.y() * rhs.x(),
-    lhs.w() * rhs.w() - lhs.x() * rhs.x() - lhs.y() * rhs.y() - lhs.z() * rhs.z()
-  );
-}
-
-/// Quaternion division
-template <typename Element>
-CUTLASS_HOST_DEVICE
-Quaternion<Element> operator/(Quaternion<Element> const &lhs, Quaternion<Element> const &rhs) {
-  return lhs * reciprocal(rhs);
-}
-
-/// Quaternion scalar division
-template <typename Element>
-CUTLASS_HOST_DEVICE
-Quaternion<Element> operator/(Element s, Quaternion<Element> const &q) {
-  return s * reciprocal(q);
-}
-
-/// Comparison 
-template <typename Element>
-CUTLASS_HOST_DEVICE
-bool operator<(Quaternion<Element> const &lhs, Quaternion<Element> const &rhs) {
-  return true; 
-}
-
-/// Rotates a 3-vector assuming this is a unit quaternion (a spinor). This avoids computing
-/// a reciprocal.
-template <typename Element>
-CUTLASS_HOST_DEVICE
-Matrix3x1<Element> spinor_rotation(
-  Quaternion<Element> const &spinor,        /// unit-length quaternion
-  Matrix3x1<Element> const &rhs) {          /// arbitrary 3-vector
-
-  return (spinor * Quaternion<Element>(rhs, 0) * conj(spinor)).pure();
-}
-
-/// Inverse rotation of 3-vector assuming this is a unit quaternion (a spinor). This avoids computing
-/// a reciprocal.
-template <typename  Element>
-CUTLASS_HOST_DEVICE
-Matrix3x1<Element> spinor_rotation_inv(
-  Quaternion<Element> const &spinor,        /// unit-length quaternion
-  Matrix3x1<Element> const &rhs) {          /// arbitrary 3-vector
-
-  return (conj(spinor) * Quaternion<Element>(rhs, 0) * spinor).pure();
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization for Quaternion-valued type.
-template <typename T>
-struct RealType< Quaternion<T> > {
-  using Type = T;
-
-  /// Number of elements
-  static int const kExtent = Quaternion<T>::kExtent;
-
-CUTLASS_HOST_DEVICE
-  static Quaternion<T> from_real(double x) {
-    return Quaternion<T>(static_cast<T>(x));
-  }
-};
-
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Factories
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <>
-CUTLASS_HOST_DEVICE
-cutlass::Quaternion<half_t> from_real<cutlass::Quaternion<half_t> >(double r) {
-  return cutlass::Quaternion<half_t>(half_t(r));
-}
-
-template <>
-CUTLASS_HOST_DEVICE
-cutlass::Quaternion<float> from_real<cutlass::Quaternion<float> >(double r) {
-  return cutlass::Quaternion<float>(float(r));
-}
-
-template <>
-CUTLASS_HOST_DEVICE
-cutlass::Quaternion<double> from_real<cutlass::Quaternion<double> >(double r) {
-  return cutlass::Quaternion<double>(r);
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// functional.h numeric specializations
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename T>
-struct multiplies<Quaternion<T>> {
-  CUTLASS_HOST_DEVICE
-  Quaternion<T> operator()(Quaternion<T> lhs, Quaternion<T> const &rhs) const {
-    lhs = lhs * rhs;
-    return lhs;
-  }
-};
-
-/// Squares with optional conversion
-template <typename T, typename Output>
-struct magnitude_squared<Quaternion<T>, Output> {
-  CUTLASS_HOST_DEVICE
-  Output operator()(Quaternion<T> lhs) const {
-    multiplies<Output> mul_op;
-
-    Output y_w = Output(lhs.w());
-    Output y_x = Output(lhs.x());
-    Output y_y = Output(lhs.y());
-    Output y_z = Output(lhs.z());
-
-    return mul_op(y_w, y_w) + mul_op(y_x, y_x) + mul_op(y_y, y_y) + \
-           mul_op(y_z, y_z);
-  }
-};
-
-template <typename T>
-struct multiply_add<Quaternion<T>, Quaternion<T>, Quaternion<T>> {
-  CUTLASS_HOST_DEVICE
-  Quaternion<T> operator()(
-    Quaternion<T> const &a,
-    Quaternion<T> const &b,
-    Quaternion<T> const &c) const {
-
-    T x = c.x();
-    T y = c.y();
-    T z = c.z();
-    T w = c.w();
-
-    x += a.w() * b.x();
-    x += b.w() * a.x();
-    x += a.y() * b.z();
-    x += -a.z() * b.y(),
-
-    y += a.w() * b.y();
-    y += b.w() * a.y();
-    y += a.z() * b.x();
-    y += -a.x() * b.z();
-
-    z += a.w() * b.z();
-    z += b.w() * a.z();
-    z += a.x() * b.y();
-    z += -a.y() * b.x();
-
-    w += a.w() * b.w();
-    w += -a.x() * b.x();
-    w += -a.y() * b.y();
-    w += -a.z() * b.z();
-
-    return cutlass::make_Quaternion(x, y, z, w);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/real.h b/lightllm-kernel/cutlass/include/cutlass/real.h
deleted file mode 100755
index e53301b3f..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/real.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/**
-  \file
-  \brief This class provides helpers to support real<> and complex<> types in generic code.
-*/
-
-#pragma once
-
-namespace cutlass {
-
-/// Used to determine the real-valued underlying type of a numeric type T.
-template <typename T>
-struct RealType {
-  using Type = T;
-
-  /// Number of elements
-  static int const kExtent = 1;
-
-CUTLASS_HOST_DEVICE
-  static T from_real(double x) {
-    return static_cast<T>(x);
-  }
-};
-
-template <typename T>
-CUTLASS_HOST_DEVICE
-static T from_real(double r) {
-  return T(r);
-}
-
-
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/reduction/device/reduce_split_k.h b/lightllm-kernel/cutlass/include/cutlass/reduction/device/reduce_split_k.h
deleted file mode 100755
index 0b8ac7a56..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/reduction/device/reduce_split_k.h
+++ /dev/null
@@ -1,232 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Kernel performing a reduction over densely packed tensors in global memory
-*/
-
-#pragma once
-
-#include "cutlass/device_kernel.h"
-#include "cutlass/reduction/kernel/reduce_split_k.h"
-#include "cutlass/cuda_host_adapter.hpp"
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace reduction {
-namespace device {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename ReductionKernel_
->
-class ReduceSplitK {
-public:
-  using ReductionKernel = ReductionKernel_;
-
-  using Shape = typename ReductionKernel::Shape;
-  using ReductionOp = typename ReductionKernel::ReductionOp;
-  using OutputOp = typename ReductionKernel::OutputOp;
-
-  using ElementWorkspace = typename ReductionKernel::ElementWorkspace;
-  using ElementAccumulator = typename ReductionKernel::ElementAccumulator;
-  using ElementOutput = typename ReductionKernel::ElementOutput;
-
-  using WorkspaceTensorRef = typename ReductionKernel::WorkspaceTensorRef;
-  using OutputTensorRef = typename ReductionKernel::OutputTensorRef;
-
-  using StrideIndex = typename ReductionKernel::StrideIndex;
-
-  static bool const kEnableCudaHostAdapter = CUTLASS_ENABLE_CUDA_HOST_ADAPTER;
-
-  /// Argument structure
-  struct Arguments {
-
-    //
-    // Data members
-    //
-
-    MatrixCoord problem_size{0,0};
-    int partitions{1};
-    size_t partition_stride{0};
-    WorkspaceTensorRef workspace{};
-    OutputTensorRef destination{};
-    OutputTensorRef source{};
-    typename OutputOp::Params output{};
-    typename ReductionOp::Params reduction{};
-
-    //
-    // Methods
-    //
-
-    /// Default ctor
-    Arguments() = default;
-   
-    CUTLASS_HOST_DEVICE 
-    Arguments(
-      MatrixCoord const & problem_size
-    ):
-      problem_size(problem_size) { }
-
-    CUTLASS_HOST_DEVICE
-    Arguments(
-      MatrixCoord problem_size_,
-      int partitions_,
-      size_t partition_stride_,
-      WorkspaceTensorRef workspace_,
-      OutputTensorRef destination_,
-      OutputTensorRef source_,
-      typename OutputOp::Params output_ = typename OutputOp::Params(),
-      typename ReductionOp::Params reduction_ = typename ReductionOp::Params()
-    ):
-      problem_size(problem_size_),
-      partitions(partitions_),
-      partition_stride(partition_stride_),
-      workspace(workspace_),
-      destination(destination_),
-      source(source_),
-      output(output_),
-      reduction(reduction_)
-    {
-
-    }
-
-  };
-
-private:
-  /// Kernel parameters object
-  typename ReductionKernel::Params params_;
-
-public:
-  /// Constructs Reduction SplitK
-  ReduceSplitK() { }
-
-  /// Determines whether the ReduceSplitK can execute the given problem.
-  static Status can_implement(Arguments const &args) {
-
-    return Status::kSuccess;
-  }
-
-  /// Gets the workspace size
-  static size_t get_workspace_size(Arguments const &args) {
-    // needs no additional workspace
-    return 0;
-  }
-
-  /// Initializes Reduction state from arguments.
-  Status initialize(
-    Arguments const &args, 
-    void *workspace = nullptr, 
-    cudaStream_t stream = nullptr) {
-    
-    // initialize the params structure from the arguments
-    params_ = typename ReductionKernel::Params(
-      args.problem_size,
-      args.partitions,
-      args.partition_stride,
-      args.workspace,
-      args.destination,
-      args.source,
-      args.output,
-      args.reduction
-    );
-
-    return Status::kSuccess;
-
-   }
-
-  /// Initializes Reduction kernel state from arguments.
-  Status update(Arguments const &args, void *workspace = nullptr) {
-
-    // update the params structure from the arguments
-    params_.workspace.reset(args.workspace.non_const_ref().data());
-    params_.destination.reset(args.destination.non_const_ref().data());
-    params_.source.reset(args.source.non_const_ref().data());
-    params_.output = args.output;
-    params_.reduction = args.reduction;
-
-    return Status::kSuccess;
-  }
-
-  /// Runs the kernel using initialized state.
-  Status run(cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr, int32_t kernel_index = 0) {
-
-    //
-    // Launch reduction kernel
-    //
-    dim3 block = ReductionKernel::block_shape();
-    dim3 grid = ReductionKernel::grid_shape(params_.problem_size);
-
-    if constexpr (kEnableCudaHostAdapter) {
-        CUTLASS_ASSERT(cuda_adapter);
-        if (cuda_adapter) {
-          void* kernel_params[] = {&params_};
-          cuda_adapter->launch(
-              grid, dim3(1,1,1), block, 0, stream, kernel_params, kernel_index);
-        }
-    }
-    else {
-      cutlass::arch::synclog_setup();
-      Kernel<ReductionKernel><<< grid, block, 0, stream >>>(params_);
-    }
-
-    cudaError_t result = cudaGetLastError();
-    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
-  }
-
-
-  /// Runs the kernel using initialized state.
-  Status operator()(cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr, int32_t kernel_index = 0) {
-    return run(stream, cuda_adapter, kernel_index);
-  }
-
-  /// Runs the kernel using initialized state.
-  Status operator()(
-    Arguments const &args, 
-    void *workspace = nullptr, 
-    cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr, int32_t kernel_index = 0) {
-    
-    Status status = initialize(args, workspace, stream);
-    
-    if (status == Status::kSuccess) {
-      status = run(stream,cuda_adapter, kernel_index);
-    }
-
-    return status;
-  }
-  
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace reduction
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/reduction/device/tensor_reduce.h b/lightllm-kernel/cutlass/include/cutlass/reduction/device/tensor_reduce.h
deleted file mode 100755
index f36c72c92..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/reduction/device/tensor_reduce.h
+++ /dev/null
@@ -1,264 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Kernel performing a reduction over one or more ranks of an affine tensor
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/device_kernel.h"
-
-#include "cutlass/reduction/device/tensor_reduce_affine_strided.h"
-#include "cutlass/reduction/device/tensor_reduce_affine_contiguous.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace reduction {
-namespace device {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Tensor reduction operator on specific CUTLASS layouts over exactly one index
-template <
-  typename ElementOutput_,
-  typename ElementSource_,
-  typename Layout_,
-  typename ReductionOp_,
-  int VectorLength_  = 1,
-  typename ElementCompute_ = ElementOutput_
->
-struct TensorReduction {
-
-  using ElementOutput = ElementOutput_;
-  using ElementSource = ElementSource_;
-  using Layout = Layout_;
-  using ReductionOp = ReductionOp_;
-  static int const kVectorLength = VectorLength_;
-  using ElementCompute = ElementCompute_;
-
-  using TensorCoord = typename Layout::TensorCoord;
-
-  /// Reduction operator
-  using ReductionDeviceStridedOperator = TensorReductionAffineStrided<
-    4, 3, ElementOutput, ElementSource, ReductionOp, kVectorLength, ElementCompute
-  >;
-
-  using ReductionDeviceContiguousOperator = TensorReductionAffineContiguous<
-    4, 3, ElementOutput, ElementSource, ReductionOp, kVectorLength, ElementCompute
-  >;
-
-  //
-  // Data members
-  //
-
-  ReductionDeviceStridedOperator reduction_strided;
-  ReductionDeviceContiguousOperator reduction_contiguous;
-  int reduction_index;
-
-  //
-  // Methods
-  //
-
-  ///
-  TensorReduction(
-    TensorCoord extent, 
-    int reduction_index_
-  ): 
-    reduction_index(reduction_index_) {
-
-    Coord<4> extent_affine;
-
-    switch (reduction_index) {
-    case 0:
-      extent_affine[0] = extent[1];
-      extent_affine[1] = extent[2];
-      extent_affine[2] = extent[0];
-      extent_affine[3] = extent[3];
-      break;
-    case 1:
-      extent_affine[0] = extent[0];
-      extent_affine[1] = extent[2];
-      extent_affine[2] = extent[1];
-      extent_affine[3] = extent[3];
-      break;
-    case 2:
-      extent_affine[0] = extent[0];
-      extent_affine[1] = extent[1];
-      extent_affine[2] = extent[2];
-      extent_affine[3] = extent[3];
-      break;
-    case 3:
-      extent_affine[0] = extent[0];
-      extent_affine[1] = extent[1];
-      extent_affine[2] = extent[2];
-      extent_affine[3] = extent[3];
-      break;
-    default: break;
-    }
-
-    if (reduction_index == 3) {
-      reduction_contiguous = ReductionDeviceContiguousOperator(extent_affine);  
-    }
-    else {
-      reduction_strided = ReductionDeviceStridedOperator(extent_affine);  
-    }
-  }
-
-  /// Simple check to verify the object is initialized correctly
-  bool good() const {
-    if (reduction_index == 3) {
-      return reduction_contiguous.good();
-    }
-    return reduction_strided.good();
-  }
-
-  /// Size of one workspace
-  int64_t workspace_stride() const {
-    if (reduction_index == 3) {
-      return reduction_contiguous.workspace_stride();
-    }
-    else {
-      return reduction_strided.workspace_stride();
-    }
-  }
-
-  /// Returns the size (in bytes) of a temporary workspace needed for reduction across CTAs
-  int64_t workspace_size() const {
-    if (reduction_index == 3) {
-      return reduction_contiguous.workspace_size();
-    }
-    else {
-      return reduction_strided.workspace_size();
-    }
-  }
-
-  /// Helper to use overloaded function call operator
-  Status reduce(
-    TensorRef<ElementOutput, Layout> dst_ref,
-    TensorRef<ElementSource, Layout> src_ref,
-    void *device_workspace_ptr = nullptr,
-    ElementCompute reduction_identity = ElementCompute(),
-    ReductionOp reduction_op = ReductionOp(),
-    cudaStream_t stream = nullptr) {
-
-    int64_t src_stride[3];
-    int64_t dst_stride[3];
-
-    switch (reduction_index) {
-    case 0:
-      src_stride[0] = src_ref.stride()[1];
-      src_stride[1] = src_ref.stride()[0];
-      src_stride[2] = src_ref.stride()[2];
-      dst_stride[0] = dst_ref.stride()[1];
-      dst_stride[1] = dst_ref.stride()[0];
-      break;
-    case 1:
-      src_stride[0] = src_ref.stride()[2];
-      src_stride[1] = src_ref.stride()[0];
-      src_stride[2] = src_ref.stride()[1];
-      dst_stride[0] = dst_ref.stride()[2];
-      dst_stride[1] = dst_ref.stride()[0];
-      break;
-    case 2:
-      src_stride[0] = src_ref.stride()[2];
-      src_stride[1] = src_ref.stride()[1];
-      src_stride[2] = src_ref.stride()[0];
-      dst_stride[0] = dst_ref.stride()[2];
-      dst_stride[1] = dst_ref.stride()[1];
-      break;
-    case 3:
-      src_stride[0] = src_ref.stride()[2];
-      src_stride[1] = src_ref.stride()[1];
-      src_stride[2] = src_ref.stride()[0];
-
-      dst_stride[0] = dst_ref.stride()[2];
-      dst_stride[1] = dst_ref.stride()[1];
-      dst_stride[2] = dst_ref.stride()[0];
-
-    default: break;
-    }
-
-    if (reduction_index == 3) {
-      return reduction_contiguous(
-        dst_ref.data(),
-        dst_stride, 
-        src_ref.data(), 
-        src_stride, 
-        device_workspace_ptr, 
-        reduction_identity,
-        reduction_op, 
-        stream);
-    }
-    else {
-      return reduction_strided(
-        dst_ref.data(),
-        dst_stride, 
-        src_ref.data(), 
-        src_stride, 
-        device_workspace_ptr, 
-        reduction_identity,
-        reduction_op, 
-        stream);
-    }
-  }
-
-  Status operator()(
-    TensorRef<ElementOutput, Layout> dst_ref,
-    TensorRef<ElementSource, Layout> src_ref,
-    void *device_workspace_ptr = nullptr,
-    ElementCompute reduction_identity = ElementCompute(),
-    ReductionOp reduction_op = ReductionOp(),
-    cudaStream_t stream = nullptr) {
-
-    return reduce(
-      dst_ref, 
-      src_ref, 
-      device_workspace_ptr, 
-      reduction_identity,
-      reduction_op, 
-      stream);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace device
-} // namespace reduction
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/reduction/device/tensor_reduce_affine_contiguous.h b/lightllm-kernel/cutlass/include/cutlass/reduction/device/tensor_reduce_affine_contiguous.h
deleted file mode 100755
index 8d71aa9dd..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/reduction/device/tensor_reduce_affine_contiguous.h
+++ /dev/null
@@ -1,374 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Kernel performing a reduction over one or more ranks of an affine tensor
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/device_kernel.h"
-
-#include "cutlass/reduction/kernel/tensor_reduce_affine_contiguous.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace reduction {
-namespace device {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Tensor reduction operator on layouts which are affine
-template <
-  int Rank,                                   ///< Rank of source tensor (e.g. NDHWC => 5)
-  int ReducedRank,                            ///< Rank of reduced tensor (e.g. ND => 2)
-  typename ElementOutput_,
-  typename ElementSource_,
-  typename ReductionOp_,
-  int VectorLength  = 1,
-  typename ElementCompute_ = ElementOutput_,
-  int Threads = 256,                          ///< Number of participating threads
-  int BatchSize = 4                           ///< Number of elements to load per batch
->
-struct TensorReductionAffineContiguous {
-
-  static int const kRank = Rank;
-  static int const kReducedRank = ReducedRank;
-  static int const kVectorLength = VectorLength;
-  static int const kInnerRank = kRank - kReducedRank;
-  static int const kThreads = Threads;
-  static int const kBatchSize = BatchSize;
-
-  using ElementOutput = ElementOutput_;
-  using ElementSource = ElementSource_;
-  using ReductionOp = ReductionOp_;
-  using ElementCompute = ElementCompute_;
-
-  //
-  // Data members
-  //
-
-  /// Internal status field
-  Status status;
-
-  /// Extent of tensor in source layout
-  Coord<kRank> extent;
-
-  /// Number of points in the outer index space
-  int64_t outer_count;
-
-  /// Number of elements in the inner index space
-  int64_t inner_count;
-
-  /// Number of workspaces needed
-  int workspace_count;
-
-  /// CUDA Grid shape (.x => contiguous, .y => outer, .z => inner)
-  dim3 grid_shape;
-
-  /// CUDA Threadblock shape (.x => contiguous, .y => outer, .z => inner)
-  dim3 threadblock_shape;
-
-  /// CUDA grid shape for the final reduction step if needed
-  dim3 grid_final;
-
-  /// CUDA threadblock shape for the final reduction step if needed
-  dim3 threadblock_final;
-
-private:
-  //
-  // Methods
-  //
-
-  /// Helper to reshape 'count' such that it is less than 2 x 'ext'
-  static int reshape_pow2(int ext, int count) {
-    if (ext > count) {
-      return 1;
-    }
-    int x = 1;
-    for (; count >= ext * 2; ) {
-      count >>= 1;
-      x <<= 1;
-    }
-    return x;
-  }
-
-public:
-
-  /// Default ctor
-  TensorReductionAffineContiguous():
-    status(Status::kErrorInvalidProblem),
-    extent(),
-    outer_count(0),
-    inner_count(0),
-    workspace_count(0),
-    grid_shape(0, 0, 0),
-    threadblock_shape(0, 0, 0) { }
-
-  /// Constructor
-  TensorReductionAffineContiguous(
-    Coord<kRank> extent_,
-    int target_threadblock_count = 128
-  ):
-    status(Status::kSuccess),
-    extent(extent_), 
-    outer_count(0),
-    inner_count(0),
-    workspace_count(0) {
-
-    //
-    // Plan the parallel mapping strategy.
-    //
-
-    outer_count = 1;
-    inner_count = 1;
-
-    // Compute number of elements in strided ranks
-    for (int p = 0; p < kReducedRank; ++p) {
-      outer_count *= extent[p];
-    }
-
-    for (int p = 0; p < kInnerRank; ++p) {
-      inner_count *= extent[kReducedRank + p];
-    }
-
-    int cta_count_x = 1;
-    int cta_count_y = 1;
-    int cta_count_z = 1;
-
-    int cta_threads_x = kThreads;
-    int cta_threads_y = 1;
-    int cta_threads_z = 1;
-
-    // Determine CTA shape
-    int64_t inner_vector_count = inner_count / kVectorLength;
-
-    // Priority 1. Assign threadblocks to outer indices if possible
-    if (outer_count > target_threadblock_count) {
-      cta_count_x = 1;
-      cta_count_y = target_threadblock_count;
-      cta_count_z = 1;
-    }
-    else {
-
-      cta_count_y = int(outer_count);
-      int remaining_ctas = target_threadblock_count / cta_count_y;
-
-      // Priority 2. Assign inner dimensions to one CTA
-      if (inner_vector_count > cta_threads_x) {
-        int64_t cta_z_bound = inner_vector_count / cta_threads_x;
-        if (cta_z_bound > remaining_ctas) {
-          cta_count_z = remaining_ctas;
-        }
-        else {
-          cta_count_z = int(cta_z_bound);
-        }
-      }
-      else {
-        cta_threads_x = reshape_pow2(int(inner_vector_count), cta_threads_x);
-        cta_count_z = 1;
-      }
-    }
-
-    grid_shape = dim3(cta_count_x, cta_count_y, cta_count_z);
-    threadblock_shape = dim3(cta_threads_x, cta_threads_y, cta_threads_z);
-
-    workspace_count = (cta_count_z > 1 ? cta_count_z : 0);
-
-    // Determine shape of final reduction kernel if needed
-    if (workspace_count) {
-
-      int final_threads = kThreads;
-      int final_ctas = 1;
-
-      if (outer_count > kThreads) {
-        final_ctas = int(outer_count + kThreads - 1) / kThreads;
-      }
-      else {
-        final_threads = int(outer_count);
-      }
-
-      grid_final = dim3(final_ctas, 1, 1);
-      threadblock_final = dim3(final_threads, 1, 1); 
-    }
-    else {
-      grid_final = dim3(0, 0, 0);
-      threadblock_final = dim3(0, 0, 0);
-    }
-  }
-
-  /// Simple check to verify the object is initialized correctly
-  bool good() const {
-    return status == Status::kSuccess;
-  }
-
-  /// Size (in bytes) of <outer_count> workspace elements which are densely packed together
-  int64_t workspace_stride() const {
-    
-    // Error condition
-    if (!good()) {
-      return 0;
-    }
-
-    return outer_count * sizeof_bits<ElementCompute>::value / 8;
-  }
-
-  /// Returns the size (in bytes) of a temporary workspace needed for reduction across CTAs
-  int64_t workspace_size() const {
-
-    // Error condition
-    if (!good()) {
-      return 0;
-    }
-
-    // No reduction across CTAs
-    if (grid_shape.z == 1) {
-      return 0;
-    }
-
-    return workspace_stride() * grid_shape.z;
-  }
-
-  /// Performs a reduction
-  Status reduce(
-    ElementOutput *dst_ptr,                       ///< Pointer to destination tensor
-    int64_t dst_stride[],                         ///< Stride vector (of length kReducedRank - 1)
-    ElementSource const *src_ptr,                 ///< Pointer to source tensor
-    int64_t src_stride[],                         ///< Stride vector (of length kRank - 1)
-    void *device_workspace_ptr = nullptr,         ///< Device workspace
-    ElementCompute reduction_identity = ElementCompute(), ///< Reduction identity element
-    ReductionOp reduction_op = ReductionOp(),     ///< Reduction operator
-    cudaStream_t stream = nullptr) {              ///< CUDA Stream into which all kernels are launched
-
-    // Initial status check
-    if (!good()) {
-      return status;
-    }
-
-    // Guard against null workspace
-    if (workspace_count > 1 && device_workspace_ptr == nullptr) {
-      return Status::kErrorWorkspaceNull;
-    }
-
-    // Define reduction kernel
-    using ReductionKernel = kernel::TensorReductionAffineContiguous<
-      kRank,
-      kReducedRank,
-      ElementOutput, 
-      ElementSource, 
-      ReductionOp, 
-      kVectorLength,
-      ElementCompute,
-      kThreads>;
-
-    using FinalReductionKernel = kernel::TensorReductionAffineContiguousFinal<
-      kRank,
-      kReducedRank,
-      ElementOutput, 
-      ElementSource, 
-      ReductionOp, 
-      kVectorLength,
-      ElementCompute,
-      kThreads>;
-
-    using Params = typename ReductionKernel::Params;
-
-    // Construct the parameters
-    Params params(
-      extent, 
-      dst_ptr,
-      dst_stride, 
-      src_ptr,
-      src_stride,
-      static_cast<ElementCompute *>(device_workspace_ptr),
-      workspace_stride(),
-      workspace_count,
-      reduction_op,
-      reduction_identity);
-
-    // Shared memory size
-    int shared_mem_bytes = sizeof(typename ReductionKernel::SharedStorage);
-
-    // Launch the kernel
-    cutlass::arch::synclog_setup();
-    Kernel<ReductionKernel><<< grid_shape, threadblock_shape, shared_mem_bytes, stream >>>(params);
-
-    // Check error condition
-    if (cudaPeekAtLastError() == cudaSuccess) {
-      status = Status::kSuccess;
-    }
-    else {
-      status = Status::kErrorInternal;
-    }
-
-    // Final reduction kernel
-    if (workspace_count) {
-      Kernel<FinalReductionKernel><<< grid_final, threadblock_final, 0, stream >>>(params);
-    }
-
-    // Check error condition
-    if (cudaPeekAtLastError() == cudaSuccess) {
-      status = Status::kSuccess;
-    }
-    else {
-      status = Status::kErrorInternal;
-    }
-
-    return status;
-  }
-
-  /// Helper to use overloaded function call operator
-  Status operator()(
-    ElementOutput *dst_ptr,                       ///< Pointer to destination tensor
-    int64_t dst_stride[],                         ///< Stride vector (of length kReducedRank - 1)
-    ElementSource const *src_ptr,                 ///< Pointer to source tensor
-    int64_t src_stride[],                         ///< Stride vector (of length kRank - 1)
-    void *device_workspace_ptr = nullptr,         ///< Pointer to device workspace
-    ElementCompute reduction_identity = ElementCompute(), ///< Reduction identity element
-    ReductionOp reduction_op = ReductionOp(),     ///< Reduction operator
-    cudaStream_t stream = nullptr) {              ///< CUDA Stream into which all kernels are launched
-
-    return reduce(dst_ptr, dst_stride, src_ptr, src_stride, device_workspace_ptr, reduction_identity, reduction_op, stream);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace device
-} // namespace reduction
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/reduction/device/tensor_reduce_affine_strided.h b/lightllm-kernel/cutlass/include/cutlass/reduction/device/tensor_reduce_affine_strided.h
deleted file mode 100755
index 5ec7e6549..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/reduction/device/tensor_reduce_affine_strided.h
+++ /dev/null
@@ -1,362 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Kernel performing a reduction over one or more ranks of an affine tensor
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/device_kernel.h"
-
-#include "cutlass/reduction/kernel/tensor_reduce_affine_strided.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace reduction {
-namespace device {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Tensor reduction operator on layouts which are affine
-template <
-  int Rank,                                   ///< Rank of source tensor (e.g. NDHWC => 5)
-  int ReducedRank,                            ///< Rank of reduced tensor (includes contiguous, e.g. NC => 2)
-  typename ElementOutput_,
-  typename ElementSource_,
-  typename ReductionOp_,
-  int VectorLength  = 1,
-  typename ElementCompute_ = ElementOutput_,
-  int Threads = 256,                          ///< Number of participating threads
-  int BatchSize = 4                           ///< Number of elements to load per batch
->
-struct TensorReductionAffineStrided {
-
-  static int const kRank = Rank;
-  static int const kReducedRank = ReducedRank;
-  static int const kVectorLength = VectorLength;
-  static int const kInnerRank = kRank - kReducedRank;
-  static int const kThreads = Threads;
-  static int const kBatchSize = BatchSize;
-
-  using ElementOutput = ElementOutput_;
-  using ElementSource = ElementSource_;
-  using ReductionOp = ReductionOp_;
-  using ElementCompute = ElementCompute_;
-
-  //
-  // Data members
-  //
-
-  /// Internal status field
-  Status status;
-
-  /// Extent of tensor in source layout
-  Coord<kRank> extent;
-
-  /// Number of points in the outer index space
-  int64_t outer_count;
-
-  /// Number of elements in the inner index space
-  int64_t inner_count;
-
-  /// Number of workspaces needed
-  int workspace_count;
-
-  /// CUDA Grid shape (.x => contiguous, .y => outer, .z => inner)
-  dim3 grid_shape;
-
-  /// CUDA Threadblock shape (.x => contiguous, .y => outer, .z => inner)
-  dim3 threadblock_shape;
-
-  /// CUDA grid shape for the final reduction step if needed
-  dim3 grid_final;
-
-  /// CUDA threadblock shape for the final reduction step if needed
-  dim3 threadblock_final;
-
-private:
-  //
-  // Methods
-  //
-
-  /// Helper to reshape 'count' such that it is less than 2 x 'ext'
-  static int reshape_pow2(int ext, int count) {
-    if (ext > count) {
-      return 1;
-    }
-    int x = 1;
-    for (; count >= ext * 2; ) {
-      count >>= 1;
-      x <<= 1;
-    }
-    return x;
-  }
-
-public:
-
-  /// Default ctor
-  TensorReductionAffineStrided():
-    status(Status::kErrorInvalidProblem),
-    extent(),
-    outer_count(0),
-    inner_count(0),
-    workspace_count(0),
-    grid_shape(0, 0, 0),
-    threadblock_shape(0, 0, 0) { }
-
-  /// Constructor
-  TensorReductionAffineStrided(
-    Coord<kRank> extent_,
-    int target_threadblock_count = 128
-  ):
-    status(Status::kSuccess),
-    extent(extent_), 
-    outer_count(0),
-    inner_count(0),
-    workspace_count(0) {
-
-    //
-    // Plan the parallel mapping strategy.
-    //
-
-    outer_count = 1;
-    inner_count = 1;
-
-    // Compute number of elements in strided ranks
-    for (int p = 0; p < kReducedRank - 1; ++p) {
-      outer_count *= extent[p];
-    }
-
-    for (int p = 0; p < kInnerRank; ++p) {
-      inner_count *= extent[kReducedRank + p - 1];
-    }
-
-    // Compute plan for the reduction
-    int extent_c = extent[kRank - 1];
-    int vectors_c = (extent_c -1 + kVectorLength) / kVectorLength;
-
-    // Determine CTA shape
-    int cta_width = kThreads * kVectorLength;
-    int cta_ways = reshape_pow2(extent_c, cta_width);
-    int cta_threads_x = kThreads / cta_ways;
-
-    threadblock_shape = dim3(cta_threads_x, 1, std::min(cta_ways, 64));
-
-    // This leads to an error.
-    if (threadblock_shape.z > 1) {
-      if (threadblock_shape.y != 1) {
-        status = Status::kErrorInternal;
-        return;
-      }
-    }
-    
-    // Determine grid shape
-    int cta_count_x = (vectors_c + cta_threads_x - 1) / cta_threads_x;
-    int cta_count_y = std::max(1, target_threadblock_count / cta_count_x);
-
-    // Limit the number of CTAs assigned to outer dimension
-    if (int64_t(cta_count_y * threadblock_shape.y) > outer_count) {
-      cta_count_y = int(outer_count + threadblock_shape.y - 1) / threadblock_shape.y;
-    }
-
-    // Limit the number of CTAs assigned to inner dimension
-    int cta_count_z = std::max(1, target_threadblock_count / cta_count_y);
-    if (int64_t(cta_count_z * threadblock_shape.z) > inner_count) {
-      cta_count_z = int(inner_count + threadblock_shape.z - 1) / threadblock_shape.z;
-    }
-
-    grid_shape = dim3(cta_count_x, cta_count_y, cta_count_z);
-    workspace_count = (cta_count_z > 1 ? cta_count_z : 0);
-
-    // Determine shape of final reduction kernel if needed
-    grid_final = dim3(cta_count_x, int(outer_count));
-    threadblock_final = dim3(cta_threads_x, 1, 1);
-  }
-
-  /// Simple check to verify the object is initialized correctly
-  bool good() const {
-    return status == Status::kSuccess;
-  }
-
-  /// Size of one CTA's workspace
-  int64_t workspace_stride() const {
-    
-    // Error condition
-    if (!good()) {
-      return 0;
-    }
-
-    int vector_size_bytes = kVectorLength * sizeof_bits<ElementCompute>::value / 8;
-
-    return extent[kRank - 1] * vector_size_bytes;
-  }
-
-  /// Returns the size (in bytes) of a temporary workspace needed for reduction across CTAs
-  int64_t workspace_size() const {
-
-    // Error condition
-    if (!good()) {
-      return 0;
-    }
-
-    // No reduction across CTAs
-    if (grid_shape.z == 1) {
-      return 0;
-    }
-
-    return workspace_stride() * outer_count * grid_shape.z;
-  }
-
-  /// Performs a reduction
-  Status reduce(
-    ElementOutput *dst_ptr,                       ///< Pointer to destination tensor
-    int64_t dst_stride[],                         ///< Stride vector (of length kReducedRank - 1)
-    ElementSource const *src_ptr,                 ///< Pointer to source tensor
-    int64_t src_stride[],                         ///< Stride vector (of length kRank - 1)
-    void *device_workspace_ptr = nullptr,             ///< Device workspace
-    ElementCompute reduction_identity = ElementCompute(), ///< Reduciton identity
-    ReductionOp reduction_op = ReductionOp(),     ///< Reduction operator
-    cudaStream_t stream = nullptr) {              ///< CUDA Stream into which all kernels are launched
-
-    // Initial status check
-    if (!good()) {
-      return status;
-    }
-
-    // Guard against null workspace
-    if (workspace_count > 1 && device_workspace_ptr == nullptr) {
-      return Status::kErrorWorkspaceNull;
-    }
-
-    // Define reduction kernel
-    using ReductionKernel = kernel::TensorReductionAffineStrided<
-      kRank,
-      kReducedRank,
-      ElementOutput, 
-      ElementSource, 
-      ReductionOp, 
-      kVectorLength,
-      ElementCompute,
-      kThreads>;
-
-    using FinalReductionKernel = kernel::TensorReductionAffineStridedFinal<
-      kRank,
-      kReducedRank,
-      ElementOutput, 
-      ElementSource, 
-      ReductionOp, 
-      kVectorLength,
-      ElementCompute,
-      kThreads>;
-
-    using Params = typename ReductionKernel::Params;
-
-    // Construct the parameters
-    Params params(
-      extent, 
-      dst_ptr,
-      dst_stride, 
-      src_ptr,
-      src_stride,
-      static_cast<ElementCompute *>(device_workspace_ptr),
-      workspace_stride(),
-      workspace_count,
-      reduction_op,
-      reduction_identity);
-
-    // Shared memory size
-    int shared_mem_bytes = sizeof(typename ReductionKernel::SharedStorage);
-
-    // Launch the kernel
-    cutlass::arch::synclog_setup();
-    Kernel<ReductionKernel><<< grid_shape, threadblock_shape, shared_mem_bytes, stream >>>(params);
-
-    // Check error condition
-    if (cudaPeekAtLastError() == cudaSuccess) {
-      status = Status::kSuccess;
-    }
-    else {
-      status = Status::kErrorInternal;
-    }
-
-    // Final reduction kernel
-    if (workspace_count) {
-
-      Kernel<FinalReductionKernel><<< grid_final, threadblock_final, 0, stream >>>(params);
-
-      // Check error condition
-      if (cudaPeekAtLastError() == cudaSuccess) {
-        status = Status::kSuccess;
-      }
-      else {
-        status = Status::kErrorInternal;
-      }
-    }
-
-    return status;
-  }
-
-  /// Helper to use overloaded function call operator
-  Status operator()(
-    ElementOutput *dst_ptr,                       ///< Pointer to destination tensor
-    int64_t dst_stride[],                         ///< Stride vector (of length kReducedRank - 1)
-    ElementSource const *src_ptr,                 ///< Pointer to source tensor
-    int64_t src_stride[],                         ///< Stride vector (of length kRank - 1)
-    void *device_workspace_ptr = nullptr,         ///< Pointer to device workspace
-    ElementCompute reduction_identity = ElementCompute(), ///< Reduciton identity
-    ReductionOp reduction_op = ReductionOp(),     ///< Reduction operator
-    cudaStream_t stream = nullptr) {              ///< CUDA Stream into which all kernels are launched
-
-    return reduce(
-      dst_ptr, 
-      dst_stride, 
-      src_ptr, 
-      src_stride, 
-      device_workspace_ptr, 
-      reduction_identity, 
-      reduction_op, 
-      stream);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace device
-} // namespace reduction
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/reduction/kernel/reduce_softmax_final.h b/lightllm-kernel/cutlass/include/cutlass/reduction/kernel/reduce_softmax_final.h
deleted file mode 100755
index 9752b9b76..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/reduction/kernel/reduce_softmax_final.h
+++ /dev/null
@@ -1,267 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Kernel performing a final reduction for softmax
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/functional.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/arch/memory.h"
-#include "cutlass/arch/memory_sm75.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace reduction {
-namespace kernel {
-
-template <
-  typename ElementNorm_,
-  typename ElementSum_,
-  typename ElementSoftmaxCompute_,
-  typename ThreadblockShape_,
-  bool GroupedProblem = false
->
-class ApplySoftmaxFinalReduction {
-public:
-
-  using ElementNorm = ElementNorm_;
-  using ElementSum = ElementSum_;
-  using ElementSoftmaxCompute = ElementSoftmaxCompute_;
-  using ThreadblockShape = ThreadblockShape_;
-  static const bool isGroupedProblem = GroupedProblem;
-
-  //
-  // Arguments
-  //
-
-  struct Arguments {
-
-    cutlass::gemm::GemmCoord*  problem_sizes{nullptr};
-    cutlass::gemm::GemmCoord   problem_size{};
-    ElementNorm*               block_Norm{nullptr};
-    ElementSum*                block_Sum{nullptr};
-    int64_t*                   offset_Norm_Device{nullptr};
-    int64_t*                   offset_Sum_Device{nullptr};
-    int64_t                    batch_stride_Max{0};
-    int64_t                    batch_stride_Sum{0};
-
-    //
-    // Methods
-    //
-    Arguments() { }
-
-    // Non-grouped constructor without batching
-    Arguments(
-      cutlass::gemm::GemmCoord  problem_size,
-      ElementNorm*              block_Norm,
-      ElementSum*               block_Sum
-    ):
-      problem_size(problem_size),
-      block_Norm(block_Norm),
-      block_Sum(block_Sum),
-      problem_sizes(nullptr),
-      offset_Norm_Device(nullptr),
-      offset_Sum_Device(nullptr),
-      batch_stride_Max(0),
-      batch_stride_Sum(0)
-    {
-
-    }
-
-    // Non-grouped constructor with batching
-    Arguments(
-      cutlass::gemm::GemmCoord  problem_size,
-      ElementNorm*              block_Norm,
-      ElementSum*               block_Sum,
-      int64_t                   batch_stride_Max,
-      int64_t                   batch_stride_Sum
-    ):
-      problem_size(problem_size),
-      block_Norm(block_Norm),
-      block_Sum(block_Sum),
-      batch_stride_Max(batch_stride_Max),
-      batch_stride_Sum(batch_stride_Sum),
-      problem_sizes(nullptr),
-      offset_Norm_Device(nullptr),
-      offset_Sum_Device(nullptr)
-    {
-
-    }
-
-
-    // Grouped constructor
-    Arguments(
-      cutlass::gemm::GemmCoord  *problem_sizes,
-      ElementNorm*              block_Norm,
-      ElementSum*               block_Sum,
-      int64_t*                  offset_Norm_Device,
-      int64_t*                  offset_Sum_Device
-    ):
-      problem_sizes(problem_sizes),
-      problem_size(cutlass::gemm::GemmCoord(0, 0, 0)),
-      block_Norm(block_Norm),
-      block_Sum(block_Sum),
-      offset_Norm_Device(offset_Norm_Device),
-      offset_Sum_Device(offset_Sum_Device)
-    {
-
-    }
-  };
-
-  struct SharedStorage {
-
-
-  };
-
-  //
-  // Params struct
-  //
-
-  struct Params {
-    Arguments args;
-
-    //
-    // Methods
-    //
-    Params() { }
-
-    Params(Arguments const &args_): args(args_) { }
-  };
-
-private:
-
-public:
-
-  CUTLASS_DEVICE
-  ApplySoftmaxFinalReduction() { }
-
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-
-    apply(params, shared_storage);
-  }
-
-private:
-
-  /// Full reduction
-  CUTLASS_DEVICE
-  void apply(Params const &params, SharedStorage &shared_storage) {
-
-    int tid = threadIdx.x;
-    int bid = blockIdx.x;
-    int bdim = blockDim.x;
-    
-    int block_batch = blockIdx.z;
-
-    // defining three vars for a general reduction module
-    cutlass::gemm::GemmCoord problem_size = isGroupedProblem ? params.args.problem_sizes[bid] : params.args.problem_size;
-    int m_dim_in_loop = isGroupedProblem ? problem_size.m() : tid + bdim;
-    int access_offset = isGroupedProblem ? 0 : bid * bdim;
-
-    if (!isGroupedProblem && access_offset + tid >= problem_size.m()) return;
-
-    ElementNorm *curr_ptr_Max = isGroupedProblem ? \
-              params.args.block_Norm + params.args.offset_Norm_Device[bid] : \
-              params.args.block_Norm + block_batch * params.args.batch_stride_Max;
-    ElementSum *curr_ptr_Sum = isGroupedProblem ? \
-              params.args.block_Sum + params.args.offset_Sum_Device[bid] : \
-              params.args.block_Sum + block_batch * params.args.batch_stride_Sum;
-
-    int threadblock_num = (problem_size.n() + ThreadblockShape::kN - 1) / ThreadblockShape::kN;
-
-    using ConvertSumOutput = cutlass::NumericConverter<ElementSum, ElementSoftmaxCompute>;
-    using ConvertNormOutput = cutlass::NumericConverter<ElementNorm, ElementSoftmaxCompute>;
-
-    using ConvertSum = cutlass::NumericConverter<ElementSoftmaxCompute, ElementSum>;
-    using ConvertNorm = cutlass::NumericConverter<ElementSoftmaxCompute, ElementNorm>;
-
-    ConvertSum   convert_sum;
-    ConvertNorm  convert_norm;
-
-    ConvertSumOutput   convert_sum_output;
-    ConvertNormOutput  convert_norm_output;
-
-    uint32_t float_max_bits = 0xff7fffff;
-    float min_float = reinterpret_cast<float const &>(float_max_bits);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int idx_m = tid; idx_m < m_dim_in_loop; idx_m += bdim) {
-      ElementNorm *access_n = curr_ptr_Max + idx_m + access_offset;
-      ElementSum *access_s = curr_ptr_Sum + idx_m + access_offset;
-      ElementNorm *access_n_bak = access_n;
-      ElementSum *access_s_bak = access_s;
-      ElementSoftmaxCompute max_val = ElementSoftmaxCompute(min_float);
-      ElementSoftmaxCompute sum_val = ElementSoftmaxCompute(0);
-      ElementNorm fetch_n;
-      ElementSum fetch_s;
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int idx_n = 0; idx_n < threadblock_num; idx_n++) {
-        cutlass::arch::global_load<ElementNorm, sizeof(ElementNorm)>(fetch_n, access_n, true);
-        max_val = cutlass::fast_max(max_val, convert_norm(fetch_n));
-        access_n += problem_size.m();
-      }
-
-      access_n = access_n_bak;
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int idx_n = 0; idx_n < threadblock_num; idx_n++) {
-        cutlass::arch::global_load<ElementNorm, sizeof(ElementNorm)>(fetch_n, access_n, true);
-        cutlass::arch::global_load<ElementSum, sizeof(ElementSum)>(fetch_s, access_s, true);
-        sum_val += convert_sum(fetch_s) * cutlass::fast_exp(convert_norm(fetch_n) - max_val);
-        access_n += problem_size.m();
-        access_s += problem_size.m();
-      }
-
-      ElementSoftmaxCompute inv_sum = cutlass::constants::one<ElementSoftmaxCompute>() / sum_val;
-
-      access_n = access_n_bak;
-      access_s = access_s_bak;
-
-      access_n[0] = convert_norm_output(max_val);
-      access_s[0] = convert_sum_output(inv_sum);
-    }
-
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace reduction
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/reduction/kernel/reduce_split_k.h b/lightllm-kernel/cutlass/include/cutlass/reduction/kernel/reduce_split_k.h
deleted file mode 100755
index d9c701396..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/reduction/kernel/reduce_split_k.h
+++ /dev/null
@@ -1,248 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Kernel performing a reduction over densely packed tensors in global memory
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/functional.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/numeric_conversion.h"
-
-#include "cutlass/layout/matrix.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace reduction {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Shape_,              ///< shape of CTA        (concept: MatrixShape)
-  typename OutputOp_ ,          ///< output operator     (concept: epilogue::thread operator)
-  typename ReductionOp_,        ///< reduction operator  (concept: ReductionOperator)
-  int PartitionsPerStage = 4    ///< number of partitions to issue 
->
-class ReduceSplitK {
-public:
-
-  using Shape = Shape_;
-  using ReductionOp = ReductionOp_;
-  using OutputOp = OutputOp_;
-  static int const kElementsPerAccess = OutputOp::kCount;
-  static int const kPartitionsPerStage = PartitionsPerStage;
-
-  using ElementWorkspace = typename ReductionOp::Element;
-  using ElementAccumulator = typename ReductionOp::ElementAccumulator;
-  using ElementOutput = typename OutputOp::ElementOutput;
-
-  using WorkspaceTensorRef = TensorRef<ElementWorkspace, layout::RowMajor>;
-  using OutputTensorRef = TensorRef<ElementOutput, layout::RowMajor>;
-  using StrideIndex = typename WorkspaceTensorRef::Layout::Stride::Index;
-
-  using FragmentWorkspace = AlignedArray<ElementWorkspace, kElementsPerAccess>;
-  using FragmentAccumulator = Array<ElementAccumulator, kElementsPerAccess>;
-  using FragmentOutput = AlignedArray<ElementOutput, kElementsPerAccess>;
-
-  //
-  // Types
-  //
-
-  /// Params structure
-  struct Params {
-
-    MatrixCoord problem_size;
-    int partitions;
-    size_t partition_stride;
-    WorkspaceTensorRef workspace;
-    OutputTensorRef destination;
-    OutputTensorRef source;
-    typename OutputOp::Params output;
-    typename ReductionOp::Params reduction;
-
-    //
-    // Methods
-    //
-
-    CUTLASS_HOST_DEVICE
-    Params() { }
-
-    CUTLASS_HOST_DEVICE
-    Params(
-      MatrixCoord problem_size_,
-      int partitions_,
-      size_t partition_stride_,
-      WorkspaceTensorRef workspace_,
-      OutputTensorRef destination_,
-      OutputTensorRef source_,
-      typename OutputOp::Params output_ = typename OutputOp::Params(),
-      typename ReductionOp::Params reduction_ = typename ReductionOp::Params()
-    ):
-      problem_size(problem_size_),
-      partitions(partitions_),
-      partition_stride(sizeof(FragmentWorkspace) * partition_stride_ / kElementsPerAccess),
-      workspace(workspace_),
-      destination(destination_),
-      source(source_),
-      output(output_),
-      reduction(reduction_) {
-
-    }
-  };
-
-  struct SharedStorage { };
-
-
-public:
-
-  /// Computes the grid size given a chosen threadblock shape
-  CUTLASS_HOST_DEVICE
-  static dim3 grid_shape(
-    cutlass::MatrixCoord problem_size) {
-
-    return dim3(
-      (problem_size.row() + Shape::kRow - 1) / Shape::kRow,
-      (problem_size.column() + Shape::kColumn - 1) / Shape::kColumn);
-  }
-
-  /// Determines the threadblock shape
-  CUTLASS_HOST_DEVICE
-  static dim3 block_shape() {
-    return dim3(Shape::kColumn / kElementsPerAccess, Shape::kRow);
-  }
-
-  /// Perform a reduction
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &storage) {
-
-    // Determine CTA position
-    MatrixCoord thread_offset(
-      MatrixCoord::Index(int(blockIdx.x) * Shape::kRow + threadIdx.y),
-      MatrixCoord::Index(int(blockIdx.y) * Shape::kColumn + threadIdx.x * kElementsPerAccess)
-    );
-
-    // One guard conditional
-    if (!(thread_offset.row() < params.problem_size.row() && 
-          thread_offset.column() < params.problem_size.column())) {
-
-      return;
-    }
-
-
-    ReductionOp reduction_op(params.reduction);
-
-    FragmentAccumulator accumulator;
-
-    accumulator.clear();  
-    
-    //
-    // Load the first slice
-    //
-
-    char const *workspace_ptr = 
-      reinterpret_cast<char const *>(
-        params.workspace.data() + params.workspace.offset(thread_offset));
-
-    FragmentWorkspace workspace_frag[kPartitionsPerStage];
-    
-    //
-    // Construct the output operator
-    //
-    
-    OutputOp output_op(params.output);
-
-    //
-    // Load and accumulate with a simple batched loading sequence.
-    //
-
-    CUTLASS_PRAGMA_NO_UNROLL
-    for (int k = 0; k < params.partitions; k += kPartitionsPerStage) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kPartitionsPerStage; ++i) {
-        if (k + i < params.partitions) {
-          workspace_frag[i] = *reinterpret_cast<FragmentWorkspace const *>(workspace_ptr);
-          workspace_ptr += params.partition_stride;
-        }
-      }   
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kPartitionsPerStage; ++i) {
-        if (k + i < params.partitions) {
-          accumulator = reduction_op(accumulator, workspace_frag[i]);
-        }
-      }
-    }
-
-    //
-    // Conditionally load the source
-    //
-
-    FragmentOutput source_frag;
-
-    source_frag.clear();
-
-    FragmentOutput const *source_ptr = reinterpret_cast<FragmentOutput const *>(
-      params.source.data() + params.source.offset(thread_offset));
-
-    if (output_op.is_source_needed()) {
-      reinterpret_cast<FragmentOutput &>(source_frag) = *source_ptr;
-    }
-    
-    //
-    // Compute the output
-    //
-
-    typename OutputOp::FragmentOutput output_frag = output_op(accumulator, source_frag);
-
-    //
-    // Store
-    //
-
-    FragmentOutput *dest_ptr = reinterpret_cast<FragmentOutput *>(
-      params.destination.data() + params.destination.offset(thread_offset));
-
-    *dest_ptr = reinterpret_cast<FragmentOutput const &>(output_frag);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace reduction
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/reduction/kernel/tensor_reduce_affine_contiguous.h b/lightllm-kernel/cutlass/include/cutlass/reduction/kernel/tensor_reduce_affine_contiguous.h
deleted file mode 100755
index bffc956f2..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/reduction/kernel/tensor_reduce_affine_contiguous.h
+++ /dev/null
@@ -1,606 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Kernel performing a reduction over one or more ranks of an affine tensor
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/device_kernel.h"
-
-#include "cutlass/reduction/thread/reduction_operators.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace reduction {
-namespace kernel {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Parameters structure
-template <
-  int Rank,                                   ///< Rank of source tensor (e.g. NDHWC => 5)
-  int ReducedRank,                            ///< Rank of reduced tensor (i.e. number of outer ranks)
-  typename ElementOutput,                     ///< Data type of output tensor
-  typename ElementSource,                     ///< Data type of source tensor
-  typename ReductionOp,                       ///< Reduction operator
-  int VectorLength  = 1,                      ///< Vector length for memory
-  typename ElementCompute = ElementOutput,    ///< Internal compute type - input type of reduction operation
-  int Threads = 256,                          ///< Number of participating threads
-  int BatchSize = 4                           ///< Number of elements to load per batch
->
-struct TensorReductionAffineContiguousParams {
-
-  static int const kRank = Rank;
-  static int const kReducedRank = ReducedRank;
-  static int const kVectorLength = VectorLength;
-  static int const kInnerRank = kRank - kReducedRank;
-  static int const kThreads = Threads;
-  static int const kBatchSize = BatchSize;
-
-  Coord<kRank> extent;                          /// Extent of source tensor
-  FastDivmodU64 divmod[kRank - 1];              /// FastDivmod by each strided rank
-  int64_t dst_stride[kReducedRank];             /// stride (units of bytes) - I, J
-  int64_t src_stride[kRank - 1];                /// stride (units of bytes) - I, J, K
-  int64_t workspace_stride;                     /// stride (units of bytes) between workspace
-  int workspace_count;                          /// number of workspaces
-  
-  uint64_t inner_count;                          /// Number of elements in reduced index space
-  uint64_t outer_count;                          /// Number of elements in outer index space
-
-  ElementOutput * destination;                  /// Pointer to output tensor of rank kReducedRank
-  ElementSource const * source;                 /// Pointer to source pointer of rank kRank
-  ReductionOp reduction_op;                     /// Reduction operator
-  ElementCompute reduction_identity;            /// Identity element used by reduction operator
-  ElementCompute *device_workspace;             /// Pointer to device workspace for inter-CTA reductions
-
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  TensorReductionAffineContiguousParams() {
-
-  }
-
-  /// Ctor
-  TensorReductionAffineContiguousParams(
-    Coord<kRank> extent_,                       ///< Extent of source tensor
-    ElementOutput * dst_ptr_,                   ///< Output tensor data
-    int64_t dst_stride_[],                      ///< Stride (units of elements)
-    ElementSource const * src_ptr_,             ///< Source tensor data
-    int64_t src_stride_[],                      ///< Stride (units of elements)
-    ElementCompute *device_workspace_,          ///< Pointer to device workspace for inter-CTA reductions
-    int64_t workspace_stride_,                  ///< Stride between workspaces
-    int workspace_count_,                       ///< Number of workspaces
-    ReductionOp reduction_op_,                  ///< Reduction operator
-    ElementCompute reduction_identity_ = ElementCompute() ///< Identity element used by reduction operator
-  ):
-    extent(extent_),
-    inner_count(1),
-    outer_count(1),
-    destination(dst_ptr_),
-    source(src_ptr_),
-    device_workspace(device_workspace_),
-    workspace_stride(workspace_stride_),
-    workspace_count(workspace_count_),
-    reduction_op(reduction_op_),
-    reduction_identity(reduction_identity_) {
-
-    // Initialize divisors for fast div-mod
-    for (int p = 1; p < kRank; ++p) {
-      divmod[p - 1] = FastDivmodU64(uint64_t(extent[p]));
-    }
-
-    int input_size_bits = sizeof_bits<ElementSource>::value;
-    int output_size_bits = sizeof_bits<ElementOutput>::value;
-
-    // Compute strides in units of bytes
-    for (int p = 0; p < kReducedRank; ++p) {
-      dst_stride[p] = dst_stride_[p] * output_size_bits / 8;
-    }  
-
-    for (int p = 0; p < kRank - 1; ++p) {
-      src_stride[p] = src_stride_[p] * input_size_bits / 8;
-    }
-
-    // Compute number of elements in strided ranks
-    for (int p = 0; p < kReducedRank; ++p) {
-      outer_count *= uint64_t(extent[p]);
-    }
-
-    for (int p = 0; p < kInnerRank; ++p) {
-      inner_count *= uint64_t(extent[kRank - 1 - p]);
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Kernel to reduce a tensor with affine layout over a set of ranks *INCLUDING* the contiguous
-/// rank. This leads to favorable vectorized memory accesses over the contiguous rank.
-template <
-  int Rank,                                   ///< Rank of source tensor (e.g. NDHWC => 5)
-  int ReducedRank,                            ///< Rank of reduced tensor (includes contiguous, e.g. NC => 2)
-  typename ElementOutput,                     ///< Data type of output tensor
-  typename ElementSource,                     ///< Data type of source tensor
-  typename ReductionOp,                       ///< Reduction operator
-  int VectorLength  = 1,                      ///< Vector length for memory
-  typename ElementCompute = ElementOutput,    ///< Internal compute type - input type of reduction operation
-  int Threads = 256,                          ///< Number of participating threads
-  int BatchSize = 4                           ///< Number of elements to load per batch
->
-class TensorReductionAffineContiguous {
-public:
-
-  static int const kRank = Rank;
-  static int const kReducedRank = ReducedRank;
-  static int const kVectorLength = VectorLength;
-  static int const kInnerRank = kRank - kReducedRank;
-  static int const kThreads = Threads;
-  static int const kBatchSize = BatchSize;
-  using ComputeFragment = Array<ElementCompute, VectorLength>;
-  using SourceFragment = AlignedArray<ElementSource, VectorLength>;
-  using OutputFragment = AlignedArray<ElementOutput, VectorLength>;
-
-  /// Shared memory allocation used for reduction within the CTA
-  struct SharedStorage {
-    Array<ElementCompute, kThreads * kVectorLength> workspace;
-  };
-
-  /// Parameters structure
-  using Params = TensorReductionAffineContiguousParams<
-    Rank,
-    ReducedRank,
-    ElementOutput,
-    ElementSource,
-    ReductionOp,
-    VectorLength,
-    ElementCompute,
-    Threads,
-    BatchSize
-  >;
-
-private:
-
-  /// Computes the coordinate and offset of a given linear index
-  CUTLASS_DEVICE
-  void compute_inner_coord_and_offset_(
-    Params const &params, 
-    Coord<kInnerRank> & coord, 
-    int64_t &src_offset,
-    uint64_t linear_idx) const {
-
-    // Decompose into a coordinate of rank <kInnerRank>
-    coord = CoordinateDecomposition<kInnerRank>(linear_idx, &params.divmod[kRank - kInnerRank]);
-
-    // Compute an offset using the souce stride
-    src_offset = 0;
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kInnerRank - 1; ++i) {
-      src_offset += coord[i] * params.src_stride[kReducedRank + i];
-    }
-    src_offset += coord[kInnerRank - 1] * sizeof_bits<ElementSource>::value / 8;
-  }
-
-  /// Computes the coordinate and offset of a given linear index
-  CUTLASS_DEVICE
-  void compute_outer_coord_and_offset_(
-    Params const &params, 
-    Coord<kReducedRank> & coord, 
-    int64_t &dst_offset,
-    int64_t &src_offset,
-    uint64_t linear_idx) const {
-
-    // Decompose into coordinate of rank <kReducedRank>
-    coord = CoordinateDecomposition<kReducedRank>(linear_idx, params.divmod);
-
-    // Compute offsets using destination and source strides
-    dst_offset = 0;
-    src_offset = 0;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kReducedRank; ++i) {
-      dst_offset += params.dst_stride[i] * coord[i];
-      src_offset += params.src_stride[i] * coord[i];
-    }
-  }
-
-  /// Reduces over the reduction indices yielding a single element
-  CUTLASS_DEVICE
-  ElementCompute reduce_indices_(
-    Params const &params,
-    ElementCompute *threadblock_workspace,
-    char const *src_byte_ptr,
-    int coord_c) {
-
-    NumericArrayConverter<ElementCompute, ElementSource, VectorLength> convert_source;
-    ReductionOp reduction_op(params.reduction_op);
-
-    //
-    // Early exit or initialize to identity element
-    //
-    if (!params.inner_count) {
-      return params.reduction_identity;
-    }
-
-    ComputeFragment accumulator;
-    
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < int(accumulator.size()); ++i) {
-      accumulator[i] = params.reduction_identity;
-    }
-    
-    // Compute the coordinate of the first access    
-    int64_t src_byte_offset = 0;
-    Coord<kInnerRank> coord; 
-
-    uint64_t linear_idx = (threadIdx.x + blockDim.x * threadIdx.z + blockDim.x * blockIdx.z * blockDim.z) * kVectorLength;
-    compute_inner_coord_and_offset_(params, coord, src_byte_offset, linear_idx);
-
-    // Load the first vector
-    SourceFragment source_fragment[kBatchSize];
-    
-    bool not_done = true;
-
-    // Iterate over vectors in a linearized reduction index space
-    while (not_done) {
-
-      bool guards[kBatchSize];
-
-      // Issue a batch of loads
-      CUTLASS_PRAGMA_UNROLL
-      for (int b = 0; b < kBatchSize; ++b) {
-
-        if (linear_idx < params.inner_count) {
-          source_fragment[b] = *reinterpret_cast<SourceFragment const *>(src_byte_ptr + src_byte_offset);
-          guards[b] = true;
-        }
-        else {
-          guards[b] = false;
-          not_done = false;
-        }
-
-        linear_idx += (blockDim.z * gridDim.z * blockDim.x) * kVectorLength;
-        compute_inner_coord_and_offset_(params, coord, src_byte_offset, linear_idx);
-      }
-
-      // Perform a batch of reduction operations
-      CUTLASS_PRAGMA_UNROLL
-      for (int b = 0; b < kBatchSize; ++b) {
-        if (guards[b]) {
-          auto cvt = convert_source(source_fragment[b]);
-
-          accumulator = cutlass::reduction::thread::detail::ApplyArrayOperator(
-            reduction_op, 
-            accumulator, 
-            cvt);
-        }
-      }
-    };
-
-    //
-    // Reduction of vectors to scalar
-    //
-
-    ElementCompute reduced_accumulator = accumulator[0];
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 1; i < kVectorLength; ++i) {
-      reduced_accumulator = reduction_op(reduced_accumulator, accumulator[i]);
-    }
-
-    //
-    // Reduction within CTA across threadIdx.xz => threadIdx{.x = 0, .z = 0}
-    //
-    // This re-arranges data so threadIdx.y is effectively a row index and threadIdx.xz is a column
-    //
-
-    int thread_count = blockDim.x * blockDim.z;
-    int thread_j = threadIdx.x + blockDim.x * threadIdx.z;
-    int thread_i = threadIdx.y;
-
-    ElementCompute *frag_ptr = reinterpret_cast<ElementCompute *>(threadblock_workspace) + thread_i * thread_count;
-
-    frag_ptr[thread_j] = reduced_accumulator;
-
-    //
-    // Reduce
-    //
-    CUTLASS_PRAGMA_NO_UNROLL
-    while (thread_count > 1) {
-      thread_count /= 2;
-
-      __syncthreads();
-
-      if (thread_j < thread_count) {
-        ElementCompute other = frag_ptr[thread_j + thread_count];
-
-        reduced_accumulator = reduction_op(reduced_accumulator, other);
-
-        frag_ptr[thread_j] = reduced_accumulator;
-      }
-
-      __syncthreads();
-    }
-
-
-    return reduced_accumulator;
-  }
-
-public:
-
-  /// Perform a reduction
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-
-    int coord_c = (blockIdx.x * blockDim.x + threadIdx.x) * kVectorLength;
-
-    char const * src_byte_ptr = reinterpret_cast<char const *>(params.source);
-    char * dst_byte_ptr = nullptr;
-
-    // If performing a reduction across CTAs, redirect output to device workspace
-    if (gridDim.z == 1) {
-      dst_byte_ptr = reinterpret_cast<char *>(params.destination);
-    }
-    else {
-      dst_byte_ptr = reinterpret_cast<char *>(params.device_workspace);
-    }
-
-    uint64_t idx_linear = blockIdx.y * blockDim.y + threadIdx.y;
-
-    // Use modulo division to compute location
-    Coord<kReducedRank> outer_coord;
-    int64_t dst_byte_offset;
-    int64_t src_byte_offset;
-
-    compute_outer_coord_and_offset_(
-      params, 
-      outer_coord, 
-      dst_byte_offset, 
-      src_byte_offset, 
-      idx_linear);
-
-    if (gridDim.z == 1) {
-
-      /// Complete the reduction with no workspace
-      while (idx_linear < params.outer_count) {
-
-        ElementCompute result = reduce_indices_(
-          params, 
-          shared_storage.workspace.data(),
-          src_byte_ptr + src_byte_offset,
-          coord_c);
-
-        // Store the result after possible final reduction within the CTA
-        if (threadIdx.z == 0 && threadIdx.x == 0) {
-
-          // Convert to output type and store
-          NumericConverter<ElementOutput, ElementCompute> convert_output;
-          ElementOutput cvt = convert_output(result);
-
-          *reinterpret_cast<ElementOutput *>(dst_byte_ptr + dst_byte_offset) = cvt;
-        }
-
-        __syncthreads();
-
-        // Update indices and pointers
-        idx_linear += gridDim.y * blockDim.y;
-
-        compute_outer_coord_and_offset_(
-          params, 
-          outer_coord, 
-          dst_byte_offset, 
-          src_byte_offset, 
-          idx_linear);
-
-      } // while 
-    }
-    else {
-
-      /// Complete the reduction with workspace
-      while (idx_linear < params.outer_count) {
-
-        ElementCompute result = reduce_indices_(
-          params, 
-          shared_storage.workspace.data(),
-          src_byte_ptr + src_byte_offset,
-          coord_c);
-
-        int64_t byte_offset = 
-          blockIdx.z * params.workspace_stride + idx_linear * sizeof_bits<ElementCompute>::value / 8;
-
-        // Store the result for final reduction
-        if (threadIdx.z == 0 && threadIdx.x == 0) {
-          *reinterpret_cast<ElementCompute *>(dst_byte_ptr + byte_offset) = result;
-        }
-
-        __syncthreads();
-
-        // Update indices and pointers
-        idx_linear += gridDim.y * blockDim.y;
-
-        compute_outer_coord_and_offset_(
-          params, 
-          outer_coord, 
-          dst_byte_offset, 
-          src_byte_offset, 
-          idx_linear);
-      } // while
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Kernel to perform final reduction
-template <
-  int Rank,                                   ///< Rank of source tensor (e.g. NDHWC => 5)
-  int ReducedRank,                            ///< Rank of reduced tensor (includes contiguous, e.g. NC => 2)
-  typename ElementOutput,                     ///< Data type of output tensor
-  typename ElementSource,                     ///< Data type of source tensor
-  typename ReductionOp,                       ///< Reduction operator
-  int VectorLength  = 1,                      ///< Vector length for memory
-  typename ElementCompute = ElementOutput,    ///< Internal compute type - input type of reduction operation
-  int Threads = 256,                          ///< Number of participating threads
-  int BatchSize = 4                           ///< Number of elements to load per batch
->
-class TensorReductionAffineContiguousFinal {
-public:
-
-  static int const kRank = Rank;
-  static int const kReducedRank = ReducedRank;
-  static int const kVectorLength = VectorLength;
-  static int const kInnerRank = kRank - kReducedRank;
-  static int const kThreads = Threads;
-  static int const kBatchSize = BatchSize;
-
-  /// Shared memory
-  struct SharedStorage { };
-
-  /// Parameters structure
-  using Params = TensorReductionAffineContiguousParams<
-    Rank,
-    ReducedRank,
-    ElementOutput,
-    ElementSource,
-    ReductionOp,
-    VectorLength,
-    ElementCompute,
-    Threads,
-    BatchSize
-  >;
-
-private:
-
-  /// Computes the coordinate and offset of a given linear index
-  CUTLASS_DEVICE
-  void compute_outer_coord_and_offset_(
-    Params const &params, 
-    Coord<kReducedRank> & coord, 
-    int64_t &dst_offset,
-    uint64_t linear_idx) const {
-
-    // Decompose into coordinate of rank <kReducedRank>
-    coord = CoordinateDecomposition<kReducedRank>(linear_idx, params.divmod);
-
-    // Compute offsets using destination and source strides
-    dst_offset = 0;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kReducedRank; ++i) {
-      dst_offset += params.dst_stride[i] * coord[i];
-    }
-  }
-
-  /// Reduces over the reduction indices
-  CUTLASS_DEVICE
-  ElementCompute reduce_indices_(
-    Params const &params,
-    ElementCompute const *device_workspace) {
-
-    ReductionOp reduction_op(params.reduction_op);
-    char const *src_byte_ptr = reinterpret_cast<char const *>(device_workspace);
-
-    // Accumulated output
-    ElementCompute accumulator = params.reduction_identity;
-
-    for (int iter = 0; iter < params.workspace_count; ++iter) {
-      ElementCompute workspace_item = *reinterpret_cast<ElementCompute const *>(src_byte_ptr);
-      
-      accumulator = reduction_op(accumulator, workspace_item);
-
-      src_byte_ptr += params.workspace_stride;
-    }
-
-    return accumulator;
-  }
-
-public:
-
-  //
-  // Methods
-  //
-
-  /// Perform a reduction
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-
-    uint64_t idx_linear = blockIdx.x * blockDim.x + threadIdx.x;
-
-    char * dst_byte_ptr = reinterpret_cast<char *>(params.destination);
-
-    // Use modulo division to compute location
-    Coord<kReducedRank> outer_coord;
-    int64_t dst_byte_offset;
-
-    compute_outer_coord_and_offset_(
-      params, 
-      outer_coord, 
-      dst_byte_offset, 
-      idx_linear);
-
-    /// Complete the reduction
-    while (idx_linear < params.outer_count) {
-
-      ElementCompute result = reduce_indices_(params, params.device_workspace + idx_linear);
-
-      // Convert to output type and store
-      NumericConverter<ElementOutput, ElementCompute> convert_output;
-
-      *reinterpret_cast<ElementOutput *>(dst_byte_ptr + dst_byte_offset) = convert_output(result);
-
-      // Update indices and pointers
-      idx_linear += gridDim.x * blockDim.x;
-
-      compute_outer_coord_and_offset_(
-        params, 
-        outer_coord, 
-        dst_byte_offset, 
-        idx_linear);
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace reduction
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/reduction/kernel/tensor_reduce_affine_strided.h b/lightllm-kernel/cutlass/include/cutlass/reduction/kernel/tensor_reduce_affine_strided.h
deleted file mode 100755
index 0d449e687..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/reduction/kernel/tensor_reduce_affine_strided.h
+++ /dev/null
@@ -1,641 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Kernel performing a reduction over one or more ranks of an affine tensor
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/device_kernel.h"
-
-#include "cutlass/reduction/thread/reduction_operators.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace reduction {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace kernel {
-
-/// Parameters structure
-template <
-  int Rank,                                   ///< Rank of source tensor (e.g. NDHWC => 5)
-  int ReducedRank,                            ///< Rank of reduced tensor (includes contiguous, e.g. NC => 2)
-  typename ElementOutput,                     ///< Data type of output tensor
-  typename ElementSource,                     ///< Data type of source tensor
-  typename ReductionOp,                       ///< Reduction operator
-  int VectorLength  = 1,                      ///< Vector length for memory
-  typename ElementCompute = ElementOutput,    ///< Internal compute type - input type of reduction operation
-  int Threads = 256,                          ///< Number of participating threads
-  int BatchSize = 4                           ///< Number of elements to load per batch
->
-struct TensorReductionAffineStridedParams {
-
-  static int const kRank = Rank;
-  static int const kReducedRank = ReducedRank;
-  static int const kVectorLength = VectorLength;
-  static int const kInnerRank = kRank - kReducedRank;
-  static int const kThreads = Threads;
-  static int const kBatchSize = BatchSize;
-
-  Coord<kRank> extent;                          /// Extent of source tensor
-  FastDivmodU64 divmod[kRank - 1];              /// FastDivmod by each strided rank
-  int64_t dst_stride[kReducedRank - 1];         /// stride (units of bytes) - I, J
-  int64_t src_stride[kRank - 1];                /// stride (units of bytes) - I, J, K
-  int64_t workspace_stride;                     /// stride (units of bytes) between workspace
-  int64_t workspace_outer_stride;               /// stride (units of bytes) between 'rows' of the workspace
-  int workspace_count;                          /// number of workspaces
-  
-  uint64_t inner_count;                          /// Number of elements in reduced index space
-  uint64_t outer_count;                          /// Number of elements in outer index space
-
-  ElementOutput * destination;                  /// Pointer to output tensor of rank kReducedRank
-  ElementSource const * source;                 /// Pointer to source pointer of rank kRank
-  ReductionOp reduction_op;                     /// Reduction operator
-  ElementCompute reduction_identity;            /// Identity element for reduction operator
-  ElementCompute *device_workspace;             /// Pointer to device workspace for inter-CTA reductions
-
-  //
-  // Methods
-  //
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  TensorReductionAffineStridedParams() {
-
-  }
-
-  /// Ctor
-  TensorReductionAffineStridedParams(
-    Coord<kRank> extent_,                       ///< Extent of source tensor
-    ElementOutput * dst_ptr_,                   ///< Output tensor data
-    int64_t dst_stride_[],                      ///< Stride (units of elements)
-    ElementSource const * src_ptr_,             ///< Source tensor data
-    int64_t src_stride_[],                      ///< Stride (units of elements)
-    ElementCompute *device_workspace_,          ///< Pointer to device workspace for inter-CTA reductions
-    int64_t workspace_stride_,                  ///< Stride between workspaces
-    int workspace_count_,                       ///< Number of workspaces
-    ReductionOp reduction_op_,                  ///< Reduction operator
-    ElementCompute reduction_identity_  = ElementCompute() ///< Identity element for reduction operator
-  ):
-    extent(extent_),
-    inner_count(1),
-    outer_count(1),
-    destination(dst_ptr_),
-    source(src_ptr_),
-    device_workspace(device_workspace_),
-    workspace_outer_stride(0),
-    workspace_stride(workspace_stride_),
-    workspace_count(workspace_count_),
-    reduction_op(reduction_op_),
-    reduction_identity(reduction_identity_) {
-
-    // Initialize divisors for fast div-mod
-    for (int p = 1; p < kRank; ++p) {
-      divmod[p - 1] = FastDivmodU64(uint64_t(extent[p]));
-    }
-
-    int input_size_bits = sizeof_bits<ElementSource>::value;
-    int output_size_bits = sizeof_bits<ElementOutput>::value;
-
-    workspace_outer_stride = workspace_stride * workspace_count;
-
-    // Compute strides in units of bytes
-    for (int p = 0; p < kReducedRank - 1; ++p) {
-      dst_stride[p] = dst_stride_[p] * output_size_bits / 8;
-    }  
-
-    for (int p = 0; p < kRank - 1; ++p) {
-      src_stride[p] = src_stride_[p] * input_size_bits / 8;
-    }
-
-    // Compute number of elements in strided ranks
-    for (int p = 0; p < kReducedRank - 1; ++p) {
-      outer_count *= uint64_t(extent[p]);
-    }
-
-    for (int p = 0; p < kInnerRank; ++p) {
-      inner_count *= uint64_t(extent[kReducedRank + p - 1]);
-    }
-  }
-};
-
-/// Kernel to reduce a tensor with affine layout over a set of ranks *EXCLUDING* the contiguous
-/// rank. This leads to favorable vectorized memory accesses over the contiguous rank.
-template <
-  int Rank,                                   ///< Rank of source tensor (e.g. NDHWC => 5)
-  int ReducedRank,                            ///< Rank of reduced tensor (includes contiguous, e.g. NC => 2)
-  typename ElementOutput,                     ///< Data type of output tensor
-  typename ElementSource,                     ///< Data type of source tensor
-  typename ReductionOp,                       ///< Reduction operator
-  int VectorLength  = 1,                      ///< Vector length for memory
-  typename ElementCompute = ElementOutput,    ///< Internal compute type - input type of reduction operation
-  int Threads = 256,                          ///< Number of participating threads
-  int BatchSize = 4                           ///< Number of elements to load per batch
->
-class TensorReductionAffineStrided {
-public:
-
-  static int const kRank = Rank;
-  static int const kReducedRank = ReducedRank;
-  static int const kVectorLength = VectorLength;
-  static int const kInnerRank = kRank - kReducedRank;
-  static int const kThreads = Threads;
-  static int const kBatchSize = BatchSize;
-  using ComputeFragment = Array<ElementCompute, VectorLength>;
-  using SourceFragment = AlignedArray<ElementSource, VectorLength>;
-  using OutputFragment = AlignedArray<ElementOutput, VectorLength>;
-
-  /// Shared memory allocation used for reduction within the CTA
-  struct SharedStorage {
-    Array<ElementCompute, kThreads * kVectorLength> workspace;
-  };
-
-  /// Parameters structure
-  using Params = TensorReductionAffineStridedParams<
-    Rank,
-    ReducedRank,
-    ElementOutput,
-    ElementSource,
-    ReductionOp,
-    VectorLength,
-    ElementCompute,
-    Threads,
-    BatchSize
-  >;
-
-private:
-
-  /// Computes the coordinate and offset of a given linear index
-  CUTLASS_DEVICE
-  void compute_inner_coord_and_offset_(
-    Params const &params, 
-    Coord<kInnerRank> & coord, 
-    int64_t &src_offset,
-    uint64_t linear_idx) const {
-
-    // Decompose into coordinate
-    coord = CoordinateDecomposition<kInnerRank>(linear_idx, &params.divmod[kReducedRank - 1]);
-
-    // Compute linear offset
-    src_offset = 0;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kInnerRank; ++i) {
-      src_offset += params.src_stride[kReducedRank + i - 1] * coord[i];
-    }
-  }
-
-  /// Computes the coordinate and offset of a given linear index
-  CUTLASS_DEVICE
-  void compute_outer_coord_and_offset_(
-    Params const &params, 
-    Coord<kReducedRank - 1> & coord, 
-    int64_t &dst_offset,
-    int64_t &src_offset,
-    uint64_t linear_idx) const {
-
-    // Decompose linear coordinate
-    coord = CoordinateDecomposition<kReducedRank - 1>(linear_idx, params.divmod);
-
-    // Compute offset into tensors
-    dst_offset = 0;
-    src_offset = 0;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kReducedRank - 1; ++i) {
-      dst_offset += params.dst_stride[i] * coord[i];
-      src_offset += params.src_stride[i] * coord[i];
-    }
-  }
-
-  /// Reduces over the reduction indices
-  CUTLASS_DEVICE
-  ComputeFragment reduce_indices_(
-    Params const &params,
-    ElementCompute *threadblock_workspace,
-    char const *src_byte_ptr) {
-
-    NumericArrayConverter<ElementCompute, ElementSource, VectorLength> convert_source;
-    ReductionOp reduction_op(params.reduction_op);
-
-    // Accumulated output
-    ComputeFragment identity_frag;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < int(identity_frag.size()); ++i) {
-      identity_frag[i] = params.reduction_identity;
-    }
-
-    if (!params.inner_count) {
-      return identity_frag;
-    }
-    
-    ComputeFragment accumulator = identity_frag;
-
-    // Compute the coordinate of the first access    
-    int64_t src_byte_offset = 0;
-    Coord<kInnerRank> coord; 
-
-    uint64_t linear_idx = threadIdx.z + blockIdx.z * blockDim.z;
-    compute_inner_coord_and_offset_(params, coord, src_byte_offset, linear_idx);
-
-    // Load the first vector
-    SourceFragment source_fragment[kBatchSize];
-    
-    bool not_done = true;
-
-    // Iterate over vectors in a linearized reduction index space
-    while (not_done) {
-
-      bool guards[kBatchSize];
-
-      // Issue a batch of loads
-      CUTLASS_PRAGMA_UNROLL
-      for (int b = 0; b < kBatchSize; ++b) {
-
-        if (linear_idx < params.inner_count) {
-          source_fragment[b] = *reinterpret_cast<SourceFragment const *>(src_byte_ptr + src_byte_offset);
-          guards[b] = true;
-        }
-        else {
-          guards[b] = false;
-          not_done = false;
-        }
-
-        linear_idx += blockDim.z * gridDim.z;
-        compute_inner_coord_and_offset_(params, coord, src_byte_offset, linear_idx);
-      }
-
-      // Perform a batch of reduction operations
-      CUTLASS_PRAGMA_UNROLL
-      for (int b = 0; b < kBatchSize; ++b) {
-        if (guards[b]) {
-
-          auto cvt = convert_source(source_fragment[b]);
-
-          accumulator = cutlass::reduction::thread::detail::ApplyArrayOperator(
-            reduction_op,
-             accumulator, 
-             cvt);
-        }
-      }
-    };
-
-    // Optional reduction within a CTA
-    if (blockDim.z > 1) {
-
-      // Linearized thread ID
-      int thread_idx = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
-
-      // all threads store to workspace
-      ComputeFragment *frag_ptr = reinterpret_cast<ComputeFragment *>(threadblock_workspace);
-
-      frag_ptr[thread_idx] = accumulator;
-
-      __syncthreads();
-
-      if (threadIdx.z == 0) {
-        // Load all additional block indices
-        for (int z = 1; z < blockDim.z; ++z) {
-          ComputeFragment frag = frag_ptr[thread_idx + z * blockDim.x * blockDim.y];
-
-          accumulator = cutlass::reduction::thread::detail::ApplyArrayOperator(
-            reduction_op, 
-            accumulator, 
-            frag);
-        } 
-      }
-
-      __syncthreads();
-    }
-
-    return accumulator;
-  }
-
-public:
-
-  /// Perform a reduction
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-
-    int coord_c = (blockIdx.x * blockDim.x + threadIdx.x) * kVectorLength;
-
-    char const * src_byte_ptr = reinterpret_cast<char const *>(params.source + coord_c);
-    char * dst_byte_ptr = nullptr;
-
-    // If performing a reduction across CTAs, redirect output to device workspace
-    if (gridDim.z == 1) {
-      dst_byte_ptr = reinterpret_cast<char *>(params.destination + coord_c);
-    }
-    else {
-      dst_byte_ptr = reinterpret_cast<char *>(params.device_workspace + coord_c);
-    }
-
-    // If the C index is out of bounds, exit
-    if (coord_c >= params.extent[kRank - 1]) {
-      return;
-    }
-
-    int64_t idx_linear = blockIdx.y * blockDim.y + threadIdx.y;
-
-    // Use modulo division to compute location
-    Coord<kReducedRank - 1> outer_coord;
-    int64_t dst_byte_offset;
-    int64_t src_byte_offset;
-
-    compute_outer_coord_and_offset_(
-      params, 
-      outer_coord, 
-      dst_byte_offset, 
-      src_byte_offset, 
-      idx_linear);
-
-    if (gridDim.z == 1) {
-
-      /// Complete the reduction with no workspace
-      while (idx_linear < params.outer_count) {
-
-        ComputeFragment result;
-
-        result = reduce_indices_(
-          params, 
-          shared_storage.workspace.data(),
-          src_byte_ptr + src_byte_offset);
-
-        // Store the result after possible final reduction within the CTA
-        if (threadIdx.z == 0) {
-
-          // Convert to output type and store
-          NumericArrayConverter<ElementOutput, ElementCompute, VectorLength> convert_output;
-          auto cvt = convert_output(result);
-
-          *reinterpret_cast<OutputFragment *>(dst_byte_ptr + dst_byte_offset) = 
-            reinterpret_cast<OutputFragment const &>(cvt);
-        }
-
-        // Update indices and pointers
-        idx_linear += gridDim.y * blockDim.y;
-
-        compute_outer_coord_and_offset_(
-          params, 
-          outer_coord, 
-          dst_byte_offset, 
-          src_byte_offset, 
-          idx_linear);
-
-      } // while 
-    }
-    else {
-
-      /// Complete the reduction with a device workspace
-      while (idx_linear < params.outer_count) {
-
-        ComputeFragment result;
-
-        result = reduce_indices_(
-          params, 
-          shared_storage.workspace.data(),
-          src_byte_ptr + src_byte_offset);
-
-        // Store the result after possible final reduction within the CTA
-        if (threadIdx.z == 0) {
-
-          int64_t byte_offset = 
-            blockIdx.z * params.workspace_stride + idx_linear * params.workspace_outer_stride;
-
-          // No conversion - store in compute type
-          *reinterpret_cast<ComputeFragment *>(dst_byte_ptr + byte_offset) = 
-            reinterpret_cast<ComputeFragment const &>(result);
-        }
-
-        // Update indices and pointers
-        idx_linear += gridDim.y * blockDim.y;
-
-        compute_outer_coord_and_offset_(
-          params, 
-          outer_coord, 
-          dst_byte_offset, 
-          src_byte_offset, 
-          idx_linear);
-        
-      } // while (outer index)
-    } // if ()
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Kernel to perform final reduction
-template <
-  int Rank,                                   ///< Rank of source tensor (e.g. NDHWC => 5)
-  int ReducedRank,                            ///< Rank of reduced tensor (includes contiguous, e.g. NC => 2)
-  typename ElementOutput,                     ///< Data type of output tensor
-  typename ElementSource,                     ///< Data type of source tensor
-  typename ReductionOp,                       ///< Reduction operator
-  int VectorLength  = 1,                      ///< Vector length for memory
-  typename ElementCompute = ElementOutput,    ///< Internal compute type - input type of reduction operation
-  int Threads = 256,                          ///< Number of participating threads
-  int BatchSize = 4                           ///< Number of elements to load per batch
->
-class TensorReductionAffineStridedFinal {
-public:
-
-  static int const kRank = Rank;
-  static int const kReducedRank = ReducedRank;
-  static int const kVectorLength = VectorLength;
-  static int const kInnerRank = kRank - kReducedRank;
-  static int const kThreads = Threads;
-  static int const kBatchSize = BatchSize;
-  using ComputeFragment = Array<ElementCompute, VectorLength>;
-  using SourceFragment = AlignedArray<ElementSource, VectorLength>;
-  using OutputFragment = AlignedArray<ElementOutput, VectorLength>;
-
-  /// Shared memory
-  struct SharedStorage { };
-
-  /// Parameters structure
-  using Params = TensorReductionAffineStridedParams<
-    Rank,
-    ReducedRank,
-    ElementOutput,
-    ElementSource,
-    ReductionOp,
-    VectorLength,
-    ElementCompute,
-    Threads,
-    BatchSize
-  >;
-
-private:
-
-  /// Computes the coordinate and offset of a given linear index
-  CUTLASS_DEVICE
-  void compute_outer_coord_and_offset_(
-    Params const &params, 
-    Coord<kReducedRank - 1> & coord, 
-    int64_t &dst_offset,
-    uint64_t linear_idx) const {
-
-    // Decompose linear index
-    coord = CoordinateDecomposition<kReducedRank - 1>(linear_idx, params.divmod);
-
-    // Compute tensor offset
-    dst_offset = 0;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kReducedRank - 1; ++i) {
-      dst_offset += params.dst_stride[i] * coord[i];
-    }
-  }
-
-  /// Reduces over the reduction indices
-  CUTLASS_DEVICE
-  ComputeFragment reduce_indices_(
-    Params const &params,
-    char *src_byte_ptr) {
-
-    ReductionOp reduction_op(params.reduction_op);
-
-    // Accumulated output
-    ComputeFragment identity_frag;
-    
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < int(identity_frag.size()); ++i) {
-      identity_frag[i] = params.reduction_identity;
-    }
-
-    ComputeFragment accumulator = identity_frag;
-    ComputeFragment workspace_fragments[kBatchSize];
-
-    // Partially unrolled loop
-    for (int idx = 0; idx < params.workspace_count; idx += kBatchSize) {
-
-      // Issue a batch of loads
-      CUTLASS_PRAGMA_UNROLL
-      for (int b = 0; b < kBatchSize; ++b) {
-        if (idx + b < params.workspace_count) {
-          workspace_fragments[b] = 
-            *reinterpret_cast<ComputeFragment *>(src_byte_ptr);  
-        }
-        else {
-          workspace_fragments[b] = identity_frag;
-        }
-        src_byte_ptr += + params.workspace_stride;
-      }
-
-      // Perform a reduction
-      CUTLASS_PRAGMA_UNROLL
-      for (int b = 0; b < kBatchSize; ++b) {
-        CUTLASS_PRAGMA_UNROLL
-        for (int i = 0; i < kVectorLength; ++i) {
-          accumulator[i] = reduction_op(accumulator[i], workspace_fragments[b][i]);
-        }
-      }
-    }
-
-    return accumulator;
-  }
-
-public:
-
-  //
-  // Methods
-  //
-
-  /// Perform a reduction
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-
-    int coord_c = (blockIdx.x * blockDim.x + threadIdx.x) * kVectorLength;
-
-    char * src_byte_ptr = reinterpret_cast<char *>(params.device_workspace + coord_c);
-    char * dst_byte_ptr = reinterpret_cast<char *>(params.destination + coord_c);
-
-    // If the C index is out of bounds, exit
-    if (coord_c >= params.extent[kRank - 1]) {
-      return;
-    }
-
-    int64_t idx_linear = blockIdx.y * blockDim.y + threadIdx.y;
-
-    // Use modulo division to compute location
-    Coord<kReducedRank - 1> outer_coord;
-    int64_t dst_byte_offset;
-
-    compute_outer_coord_and_offset_(
-      params, 
-      outer_coord, 
-      dst_byte_offset, 
-      idx_linear);
-
-    /// Complete the reduction
-    while (idx_linear < params.outer_count) {
-
-      int64_t src_byte_offset = idx_linear * params.workspace_outer_stride;
-
-      ComputeFragment result = reduce_indices_(
-        params, 
-        src_byte_ptr + src_byte_offset);
-
-      // Convert to output type and store
-      NumericArrayConverter<ElementOutput, ElementCompute, VectorLength> convert_output;
-      auto cvt = convert_output(result);
-
-      *reinterpret_cast<OutputFragment *>(dst_byte_ptr + dst_byte_offset) = 
-        reinterpret_cast<OutputFragment const &>(cvt);
-
-      // Update indices and pointers
-      idx_linear += gridDim.y * blockDim.y;
-
-      compute_outer_coord_and_offset_(
-        params, 
-        outer_coord, 
-        dst_byte_offset, 
-        idx_linear);
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace kernel
-} // namespace reduction
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/reduction/thread/reduce.h b/lightllm-kernel/cutlass/include/cutlass/reduction/thread/reduce.h
deleted file mode 100755
index d2551f977..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/reduction/thread/reduce.h
+++ /dev/null
@@ -1,234 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines basic thread level reduction with specializations for Array<T, N>.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/half.h"
-#include "cutlass/functional.h"
-
-namespace cutlass {
-namespace reduction {
-namespace thread {
-
-/// Structure to compute the thread level reduction
-template <typename Op, typename T>
-struct Reduce;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial Specialization of Reduce for "plus" (a functional operator)
-template <typename T>
-struct Reduce< plus<T>, T > {
-
-  CUTLASS_HOST_DEVICE
-  T operator()(T lhs, T const &rhs) const {
-    plus<T> _op;
-    return _op(lhs, rhs);
-  } 
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization of Reduce for Array<T, N>
-template <typename T, int N>
-struct Reduce < plus<T>, Array<T, N>> {
-  
-  CUTLASS_HOST_DEVICE
-  Array<T, 1> operator()(Array<T, N> const &in) const {
-
-    Array<T, 1> result;
-    Reduce< plus<T>, T > scalar_reduce;
-    result.clear();
-
-    CUTLASS_PRAGMA_UNROLL
-    for (auto i = 0; i < N; ++i) {
-      result[0] = scalar_reduce(result[0], in[i]);
-    }
-
-    return result;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specializations of Reduce for Array<half_t, N>
-template <int N>
-struct Reduce < plus<half_t>, Array<half_t, N> > {
-  
-  CUTLASS_HOST_DEVICE
-  Array<half_t, 1> operator()(Array<half_t, N> const &input) {
-
-    Array<half_t, 1> result;
-
-    // If there is only 1 element - there is nothing to reduce
-    if( N ==1 ){
-
-      result[0] = input.front();
-
-    } else {
-    
-      #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)
-        
-        __half result_d;
-        Array<half_t, 1> const *in_ptr_half = reinterpret_cast<Array<half_t, 1> const *>(&input);
-        Array<half_t, 2> const *in_ptr_half2 = reinterpret_cast<Array<half_t, 2> const *>(&input);
-        __half2 const *x_in_half2 = reinterpret_cast<__half2 const *>(in_ptr_half2);
-
-        // Set initial result = first half2, in case N==2
-        __half2 tmp_result = x_in_half2[0];
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int i = 1; i < N/2; ++i) {
-
-          tmp_result = __hadd2(x_in_half2[i], tmp_result);
-
-        }
-        
-        result_d = __hadd(__low2half(tmp_result), __high2half(tmp_result));
-    
-        // One final step is needed for odd "N" (to add the (N-1)th element)
-        if( N%2 ){
-
-          __half last_element;
-          Array<half_t, 1> tmp_last;
-          Array<half_t, 1> *tmp_last_ptr = &tmp_last;
-          tmp_last_ptr[0] = in_ptr_half[N-1];
-          last_element = reinterpret_cast<__half  const &>(tmp_last);
-
-          result_d = __hadd(result_d, last_element);
-
-        } 
-
-        Array<half_t, 1> *result_ptr = &result;
-        *result_ptr = reinterpret_cast<Array<half_t, 1> &>(result_d);
-
-      #else
-        
-        Reduce< plus<half_t>, half_t > scalar_reduce;
-        result.clear();
-
-        CUTLASS_PRAGMA_UNROLL
-        for (auto i = 0; i < N; ++i) {
-
-          result[0] = scalar_reduce(result[0], input[i]);
-
-        }
-
-      #endif
-    }
-
-    return result;
-      
-  }
-};
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specializations of Reduce for AlignedArray<half_t, N>
-template <int N>
-struct Reduce < plus<half_t>, AlignedArray<half_t, N> > {
-  
-  CUTLASS_HOST_DEVICE
-  Array<half_t, 1> operator()(AlignedArray<half_t, N> const &input) {
-
-    Array<half_t, 1> result;
-
-    // If there is only 1 element - there is nothing to reduce
-    if( N ==1 ){
-
-      result[0] = input.front();
-
-    } else {
-    
-      #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)
-        
-        __half result_d;
-        AlignedArray<half_t, 1> const *in_ptr_half = reinterpret_cast<AlignedArray<half_t, 1> const *>(&input);
-        AlignedArray<half_t, 2> const *in_ptr_half2 = reinterpret_cast<AlignedArray<half_t, 2> const *>(&input);
-        __half2 const *x_in_half2 = reinterpret_cast<__half2 const *>(in_ptr_half2);
-
-        // Set initial result = first half2, in case N==2
-        __half2 tmp_result = x_in_half2[0];
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int i = 1; i < N/2; ++i) {
-
-          tmp_result = __hadd2(x_in_half2[i], tmp_result);
-
-        }
-        
-        result_d = __hadd(__low2half(tmp_result), __high2half(tmp_result));
-    
-        // One final step is needed for odd "N" (to add the (N-1)th element)
-        if( N%2 ){
-
-          __half last_element;
-          AlignedArray<half_t, 1> tmp_last;
-          AlignedArray<half_t, 1> *tmp_last_ptr = &tmp_last;
-          tmp_last_ptr[0] = in_ptr_half[N-1];
-          last_element = reinterpret_cast<__half  const &>(tmp_last);
-
-          result_d = __hadd(result_d, last_element);
-
-        } 
-
-        Array<half_t, 1> *result_ptr = &result;
-        *result_ptr = reinterpret_cast<Array<half_t, 1> &>(result_d);
-
-      #else
-        
-        Reduce< plus<half_t>, half_t > scalar_reduce;
-        result.clear();
-
-        CUTLASS_PRAGMA_UNROLL
-        for (auto i = 0; i < N; ++i) {
-
-          result[0] = scalar_reduce(result[0], input[i]);
-
-        }
-
-      #endif
-    }
-
-    return result;
-      
-  }
-};
-}
-}
-}
diff --git a/lightllm-kernel/cutlass/include/cutlass/reduction/thread/reduction_operators.h b/lightllm-kernel/cutlass/include/cutlass/reduction/thread/reduction_operators.h
deleted file mode 100755
index ba62c1b50..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/reduction/thread/reduction_operators.h
+++ /dev/null
@@ -1,235 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Kernel performing a reduction over densely packed tensors in global memory
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/functional.h"
-#include "cutlass/numeric_conversion.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace reduction {
-namespace thread {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Mixed-precision reduction
-template <
-  typename ElementAccumulator_,
-  typename Element_,
-  int Count = 1
->
-struct ReduceAdd {
-
-  //
-  // Type definitions
-  //
-
-  using ElementAccumulator = ElementAccumulator_;
-  using Element = Element_;
-  static int const kCount = Count;
-
-  using FragmentAccumulator = cutlass::Array<ElementAccumulator, kCount>;
-  using FragmentElement = cutlass::Array<Element, kCount>;
-
-  struct Params { };
-
-  //
-  // Data members
-  //
-
-  /// Parameters object
-  Params params;
-
-  //
-  // Methods
-  //
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  ReduceAdd(Params params_ = Params()): params(params_) { }
-
-  /// Operator
-  CUTLASS_HOST_DEVICE
-  FragmentAccumulator operator()(
-    FragmentAccumulator accumulator, 
-    FragmentElement element) const {
-
-    plus<FragmentAccumulator> op;
-
-    NumericArrayConverter<
-      ElementAccumulator, 
-      Element, 
-      kCount, 
-      PreferredRoundingMode<ElementAccumulator, Element>::kRound> converter;
-
-    return op(accumulator, converter(element));
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace detail {
-
-/// Special handling for binary operators
-template <typename ReductionOp, typename Element, int N>
-struct VectorizeArrayOperation {
-
-  using ValueType = Array<Element, N>;
-
-  CUTLASS_HOST_DEVICE
-  ValueType operator()(
-    ReductionOp const &reduction_op, 
-    ValueType const &lhs, 
-    ValueType const &rhs) const {
-
-    ValueType result;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < N; ++i) {
-      result[i] = reduction_op(lhs[i], rhs[i]);
-    }
-
-    return result;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename ReductionOp, typename Element, int N>
-struct ReduceArrayOperation {
-
-  using ArrayType = Array<Element, N>;
-
-  CUTLASS_HOST_DEVICE
-  Element operator()(
-    ReductionOp const &reduction_op, 
-    ArrayType const &array) const {
-
-    Element item = reduction_op(array[0], array[1]);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 2; i < N; ++i) {
-      item = reduction_op(item, array[i]);
-    }
-
-    return item;
-  }
-};
-
-template <int N>
-struct ReduceArrayOperation<logical_and<uint1b_t>, uint1b_t, N> {
-
-  using ArrayType = Array<uint1b_t, N>;
-
-  CUTLASS_HOST_DEVICE
-  uint1b_t operator()(
-    logical_and<uint1b_t> const &reduction_op, 
-    ArrayType const &array) const {
-
-    uint8_t const *ptr = reinterpret_cast<uint8_t const *>(&array);
-    bool item = false;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int byte = 0; byte < (N + 7) / 8; ++byte) {
-      uint8_t bits = ptr[byte];
-      item = (item || !bits);
-    }
-
-    return uint1b_t(!item);
-  }
-};
-
-template <int N>
-struct ReduceArrayOperation<logical_or<uint1b_t>, uint1b_t, N> {
-
-  using ArrayType = Array<uint1b_t, N>;
-
-  CUTLASS_HOST_DEVICE
-  uint1b_t operator()(
-    logical_and<uint1b_t> const &reduction_op, 
-    ArrayType const &array) const {
-
-    uint8_t const *ptr = reinterpret_cast<uint8_t const *>(&array);
-    bool item = true;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int byte = 0; byte < (N + 7) / 8; ++byte) {
-      uint8_t bits = ptr[byte];
-      item = (item || bits);
-    }
-
-    return uint1b_t(item);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Helper function to infer template argument types
-template <typename ReductionOp, typename Element, int N>
-CUTLASS_HOST_DEVICE
-Array<Element, N> ApplyArrayOperator(
-  ReductionOp const &reduction_op,
-  Array<Element, N> const &lhs, 
-  Array<Element, N> const &rhs) {
-
-  VectorizeArrayOperation<ReductionOp, Element, N> vectorize_op;
-
-  return vectorize_op(reduction_op, lhs, rhs);
-}
-
-/// Helper to reduce an array
-template <typename ReductionOp, typename Element, int N>
-Element ReduceArray(ReductionOp const &reduction_op, Array<Element, N> const &array) {
-  ReduceArrayOperation<ReductionOp, Element, N> reduce_array_op;
-
-  return reduce_array_op(reduction_op, array);
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace detail
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace thread
-} // namespace reduction
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/reduction/threadblock_swizzle.h b/lightllm-kernel/cutlass/include/cutlass/reduction/threadblock_swizzle.h
deleted file mode 100755
index ffb35dada..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/reduction/threadblock_swizzle.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*
-**************************************************************************************************/
-/*! \file
-\brief Defies functors for mapping blockIdx to partitions of the batched reduction computation.
-*/
-#pragma once
-#include "cutlass/coord.h"
-
-namespace cutlass {
-namespace reduction {
-struct DefaultBlockSwizzle {
-  /// Ctor
-  CUTLASS_HOST_DEVICE DefaultBlockSwizzle() {}
-
-  /// Swizzle the block index.
-  CUTLASS_DEVICE dim3 swizzle() { return blockIdx; }
-
-  /// 
-  CUTLASS_HOST_DEVICE dim3 get_grid_layout(Coord<3> const &problem_size,
-                                           Coord<3> const &OutputTile) {
-    assert(OutputTile[0] == 1 && OutputTile[1] == 1);
-    assert((problem_size[0] * problem_size[1] * problem_size[2]) % OutputTile[2] == 0);
-    dim3 grid;
-    grid.x = problem_size[0] * problem_size[1] * problem_size[2]
-      / OutputTile[2] ;
-    return grid;
-  }
-
-  ///
-  CUTLASS_DEVICE Coord<3> get_threadblock_offset(Coord<3> const &SubTile) {
-    assert(SubTile[0] == 1 && SubTile[1] == 1);
-    dim3 block = swizzle();
-    Coord<3> threadblock_offset =
-      make_Coord(0, 0, block.x * SubTile[2]);
-    return threadblock_offset;
-  }
-};
-} // namespace reduction
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/relatively_equal.h b/lightllm-kernel/cutlass/include/cutlass/relatively_equal.h
deleted file mode 100755
index 26b7c66b1..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/relatively_equal.h
+++ /dev/null
@@ -1,275 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/* \file
-  \brief Performs comparison between two elements with support for floating-point comparisons.
-*/
-
-#pragma once
-
-#include "numeric_types.h"
-#include "complex.h"
-
-namespace cutlass {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename T, typename U = T>
-CUTLASS_HOST_DEVICE
-bool relatively_equal(T a, T b, U epsilon, U nonzero_floor);
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace detail {
-
-// This floating-point comparison function implements the method described in
-//
-// https://floating-point-gui.de/errors/comparison/
-//
-template <typename T>
-CUTLASS_HOST_DEVICE
-bool relatively_equal_float(T a, T b, T epsilon, T nonzero_floor) {
-  
-#if defined(__CUDACC_RTC__)
-  using cuda::std::abs;
-#else
-  using std::abs;
-#endif
-
-  T abs_A = abs(a);
-  T abs_B = abs(b);
-  T diff = abs(a - b);
-  T zero = T(0);
-
-  if (a == b) {
-    return true;
-  }
-  else if (a == zero || b == zero || diff < nonzero_floor) {
-    return diff < epsilon * nonzero_floor;
-  }
-  
-  return diff < epsilon * (abs_A + abs_B);
-}
-
-} // namespace detail
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <>
-CUTLASS_HOST_DEVICE
-bool relatively_equal<bool>(bool a, bool b, bool, bool) {
-  return (a == b);
-}
-
-template <>
-CUTLASS_HOST_DEVICE
-bool relatively_equal<uint1b_t>(uint1b_t a, uint1b_t b, uint1b_t, uint1b_t) {
-  return (a == b);
-}
-
-template <>
-CUTLASS_HOST_DEVICE
-bool relatively_equal<int2b_t>(int2b_t a, int2b_t b, int2b_t, int2b_t) {
-  return (a == b);
-}
-
-template <>
-CUTLASS_HOST_DEVICE
-bool relatively_equal<uint2b_t>(uint2b_t a, uint2b_t b, uint2b_t, uint2b_t) {
-  return (a == b);
-}
-
-template <>
-CUTLASS_HOST_DEVICE
-bool relatively_equal<int4b_t>(int4b_t a, int4b_t b, int4b_t, int4b_t) {
-  return (a == b);
-}
-
-template <>
-CUTLASS_HOST_DEVICE
-bool relatively_equal<uint4b_t>(uint4b_t a, uint4b_t b, uint4b_t, uint4b_t) {
-  return (a == b);
-}
-
-template <>
-CUTLASS_HOST_DEVICE
-bool relatively_equal<int8_t>(int8_t a, int8_t b, int8_t, int8_t) {
-  return (a == b);
-}
-
-template <>
-CUTLASS_HOST_DEVICE
-bool relatively_equal<uint8_t>(uint8_t a, uint8_t b, uint8_t, uint8_t) {
-  return (a == b);
-}
-
-template <>
-CUTLASS_HOST_DEVICE
-bool relatively_equal<int16_t>(int16_t a, int16_t b, int16_t, int16_t) {
-  return (a == b);
-}
-
-template <>
-CUTLASS_HOST_DEVICE
-bool relatively_equal<uint16_t>(uint16_t a, uint16_t b, uint16_t, uint16_t) {
-  return (a == b);
-}
-
-template <>
-CUTLASS_HOST_DEVICE
-bool relatively_equal<int32_t>(int32_t a, int32_t b, int32_t, int32_t) {
-  return (a == b);
-}
-
-template <>
-CUTLASS_HOST_DEVICE
-bool relatively_equal<uint32_t>(uint32_t a, uint32_t b, uint32_t, uint32_t) {
-  return (a == b);
-}
-
-template <>
-CUTLASS_HOST_DEVICE
-bool relatively_equal<int64_t>(int64_t a, int64_t b, int64_t, int64_t) {
-  return (a == b);
-}
-
-template <>
-CUTLASS_HOST_DEVICE
-bool relatively_equal<uint64_t>(uint64_t a, uint64_t b, uint64_t, uint64_t) {
-  return (a == b);
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <>
-CUTLASS_HOST_DEVICE
-bool relatively_equal<float_e4m3_t>(float_e4m3_t a, float_e4m3_t b, float_e4m3_t epsilon, float_e4m3_t nonzero_floor) {
-  return detail::relatively_equal_float<float>(a, b, epsilon, nonzero_floor);
-}
-
-template <>
-CUTLASS_HOST_DEVICE
-bool relatively_equal<float_e5m2_t>(float_e5m2_t a, float_e5m2_t b, float_e5m2_t epsilon, float_e5m2_t nonzero_floor) {
-  return detail::relatively_equal_float<float>(a, b, epsilon, nonzero_floor);
-}
-
-template <>
-CUTLASS_HOST_DEVICE
-bool relatively_equal<half_t>(half_t a, half_t b, half_t epsilon, half_t nonzero_floor) {
-  return detail::relatively_equal_float(a, b, epsilon, nonzero_floor);
-}
-
-template <>
-CUTLASS_HOST_DEVICE
-bool relatively_equal<bfloat16_t>(
-  bfloat16_t a, 
-  bfloat16_t b, 
-  bfloat16_t epsilon, 
-  bfloat16_t nonzero_floor) {
-  
-  return detail::relatively_equal_float(a, b, epsilon, nonzero_floor);
-}
-
-template <>
-CUTLASS_HOST_DEVICE
-bool relatively_equal<tfloat32_t>(
-  tfloat32_t a, 
-  tfloat32_t b, 
-  tfloat32_t epsilon, 
-  tfloat32_t nonzero_floor) {
-  
-  return detail::relatively_equal_float(a, b, epsilon, nonzero_floor);
-}
-
-template <>
-CUTLASS_HOST_DEVICE
-bool relatively_equal<float>(float a, float b, float epsilon, float nonzero_floor) {
-  return detail::relatively_equal_float(a, b, epsilon, nonzero_floor);
-}
-
-
-template <>
-CUTLASS_HOST_DEVICE
-bool relatively_equal<double>(double a, double b, double epsilon, double nonzero_floor) {
-  return detail::relatively_equal_float(a, b, epsilon, nonzero_floor);
-}
-
-template<typename T>
-CUTLASS_HOST_DEVICE
-bool relatively_equal(complex<T> a, complex<T> b, T epsilon, T nonzero_floor) {
-#if defined(__CUDACC_RTC__)
-  using cuda::std::abs;
-#else
-  using std::abs;
-#endif
-
-  T abs_A = abs(a);
-  T abs_B = abs(b);
-  T diff = abs(a - b);
-  complex<T> zero = complex<T>{T{}, T{}};
-
-  if (a == b) {
-    return true;
-  }
-  else if (a == zero || b == zero || diff < nonzero_floor) {
-    return diff < epsilon * nonzero_floor;
-  }
-
-  return diff < epsilon * (abs_A + abs_B);
-}
-
-template <typename T>
-CUTLASS_HOST_DEVICE 
-bool relatively_equal(complex<T> a,  complex<T> b, complex<T> epsilon, complex<T> nonzero_floor) {
-#if defined(__CUDACC_RTC__)
-  using cuda::std::abs;
-#else
-  using std::abs;
-#endif
-
-  T abs_A = abs(a);
-  T abs_B = abs(b);
-  complex<T> diff = a - b;
-  T abs_diff = abs(diff);
-  complex<T> zero = complex<T>{T{}, T{}};
-
-  if (a == b) {
-    return true;
-  }
-  else if (a == zero || b == zero || abs_diff < abs(nonzero_floor)) {
-    return abs_diff < abs(epsilon * nonzero_floor);
-  }
-
-  return abs_diff < abs(epsilon) * (abs_A + abs_B);
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/semaphore.h b/lightllm-kernel/cutlass/include/cutlass/semaphore.h
deleted file mode 100755
index efcd9211c..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/semaphore.h
+++ /dev/null
@@ -1,118 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Implementation of a CTA-wide semaphore for inter-CTA synchronization.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/array.h"
-
-#include "cutlass/numeric_types.h"
-#include "cutlass/matrix_shape.h"
-
-#include "cutlass/gemm/gemm.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// CTA-wide semaphore for inter-CTA synchronization.
-class Semaphore { 
-public:
-
-  int *lock;
-  bool wait_thread;
-  int state;
-
-public:
-
-  /// Implements a semaphore to wait for a flag to reach a given value
-  CUTLASS_HOST_DEVICE
-  Semaphore(int *lock_, int thread_id): 
-    lock(lock_), 
-    wait_thread(thread_id < 0 || thread_id == 0),
-    state(-1) {
-
-  }
-
-  /// Permit fetching the synchronization mechanism early
-  CUTLASS_DEVICE
-  void fetch() {
-    if (wait_thread) {
-      #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-      asm volatile ("ld.global.acquire.gpu.b32 %0, [%1];\n" : "=r"(state) : "l"(lock));  
-      #else
-      asm volatile ("ld.global.cg.b32 %0, [%1];\n" : "=r"(state) : "l"(lock));  
-      #endif
-    }
-  }
-
-  /// Gets the internal state
-  CUTLASS_DEVICE
-  int get_state() const {
-    return state;
-  }
-
-  /// Waits until the semaphore is equal to the given value
-  CUTLASS_DEVICE
-  void wait(int status = 0) {
-    while( __syncthreads_and(state != status) ) {
-      fetch();
-    }
-
-    __syncthreads();
-  }
-
-  /// Updates the lock with the given result
-  CUTLASS_DEVICE
-  void release(int status = 0) {
-    __syncthreads();
-
-    if (wait_thread) {
-      #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-      asm volatile ("st.global.release.gpu.b32 [%0], %1;\n" : : "l"(lock), "r"(status));
-      #else
-      asm volatile ("st.global.cg.b32 [%0], %1;\n" : : "l"(lock), "r"(status));
-      #endif
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/subbyte_reference.h b/lightllm-kernel/cutlass/include/cutlass/subbyte_reference.h
deleted file mode 100755
index 8d43f503e..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/subbyte_reference.h
+++ /dev/null
@@ -1,1388 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Provides a mechanism for packing and unpacking elements smaller than one byte
-*/
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/integer_subbyte.h"
-#include "cutlass/fast_math.h"
-
-namespace cutlass {
-
-namespace detail {
-// This is an implementation detail of cutlass::SubbyteReference and.
-// cutlass::HostTensor.  For a given logical element type Element,
-// and its corresponding storage (physical) element type StorageUnit,
-// it computes quantities that help with managing allocations.
-//
-// CUTLASS uses a hidden "ContainerUnitType" or StorageUnit type to support
-// packed arrays of subbyte types such as int4.  Element is the "logical" type
-// for computations, while CUTLASS uses StorageUnit as the element type
-// of a packed array of Element.  If Element is not a subbyte type,
-// then the corresponding StorageUnit type is just Element itself.
-//
-// The ContainerType is always calculated as an array StorageUnit type (the StorageUnit
-// is always a byte for subbyte types),
-// and its number of bits is the lcm of the subbyte type's number of bits and 8.
-// Below are some examples for different subbyte types.
-//
-// * Subbyte Type=int2, ContainerType=StorageUnit[1] (StorageUnit=uint8_t)
-// * Subbyte Type=int4, ContainerType=StorageUnit[1] (StorageUnit=uint8_t)
-template<class Element, class StorageUnit>
-struct StorageContainerCalculator {
-  // kContainerTypeNumBits: The number of bits needed for ContainerType
-  static constexpr int kContainerTypeNumBits   = (sizeof_bits<Element>::value < 8) ? cutlass::lcm_cxx11(sizeof_bits<Element>::value, sizeof_bits<StorageUnit>::value) : sizeof_bits<Element>::value;
-  static_assert(kContainerTypeNumBits % sizeof_bits<Element>::value == 0, "The bits of ContainerType should be divisible by the element's number of bits");
-  // kContainerTypeNumLogicalElements: The number of logical Element instance(s) that can be stored per ContainerType instance
-  static constexpr int kContainerTypeNumLogicalElements = kContainerTypeNumBits / sizeof_bits<Element>::value;
-  /// 3. kContainerTypeNumBytes: The number of bytes per ContainerType instance
-  static constexpr int kContainerTypeNumBytes = kContainerTypeNumBits / 8;
-  /// 4. kContainerTypeNumBytes: The number of base StorageUnit in the ContainerType
-  static constexpr int kContainerTypeNumStorageUnit = kContainerTypeNumBits / sizeof_bits<StorageUnit>::value;
-
-  static_assert(kContainerTypeNumBits != 0, "kContainerTypeNumBits can not be zero");
-  static_assert(kContainerTypeNumLogicalElements != 0, "kContainerTypeNumLogicalElements can not be zero");
-  static_assert(kContainerTypeNumBytes != 0, "kContainerTypeNumBytes can not be zero");
-};
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// This class provides a mechanism for packing and unpacking elements smaller than one byte. It
-/// assumes these sub-byte elements are packed in a traditional C++ numeric type.
-///
-/// The intended application is to provide a mechanism to indirectly reference elements in
-/// memory or Array<> objects whose addresses cannot otherwise be taken since they are smaller
-/// than one byte.
-/// 
-/// Supports basic pointer arithmetic:
-///
-/// Example:
-///
-///   int4b_t *ptr = ...;
-///
-///   SubbyteReference<int4b_t> ref = ptr;
-///   ref += 15;
-///
-///   int4b_t x = ref;      // load an int4b_t
-///   ref = x + 2_s4;      // perform arithmetic on int4b_t and then store
-///
-template <
-  typename Element_,              /// CUTLASS numeric element type.
-  typename Storage_ = uint8_t,    /// Underlying storage type. Must be able to hold an integer 
-                                  ///   number of objects of type Element.
-  class = void
->
-class ConstSubbyteReference {
-public:
-
-  using Element = Element_;
-  using Storage = Storage_;
-  using StoragePointer = Storage const *;
-
-  static_assert(sizeof_bits<Element>::value <= sizeof_bits<Storage>::value,
-    "Size of Element must not be greater than Storage.");
-
-  static_assert(!(sizeof_bits<Storage>::value % sizeof_bits<Element>::value),
-    "Storage must be divisible by Element");
-
-private:
-
-  ///! Number of elements per storage vector
-  int const kElementsPerVector = sizeof_bits<Storage>::value / sizeof_bits<Element>::value;
-
-  ///! Bit mask 
-  Storage const kMask = 
-    ((sizeof_bits<Element>::value < sizeof_bits<Storage>::value) ? 
-      (Storage(1) << sizeof_bits<Element>::value) - Storage(1) :
-      ~Storage(0));
-
-private:
-
-  /// Pointer to array containing element
-  StoragePointer ptr_;
-
-  /// Offset (in units of elements) from pointer.
-  ///
-  /// Invariant: must always be in range [0, kElementsPerVector)
-  int offset_;
-
-public:
-
-  CUTLASS_HOST_DEVICE
-  ConstSubbyteReference(): ptr_(nullptr), offset_(0) { }
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  ConstSubbyteReference(
-    Element const *ptr,           /// pointer to memory
-    int64_t offset          /// logical offset in units of Element
-  ): 
-    ptr_(reinterpret_cast<StoragePointer>(ptr)),
-    offset_(0) {
-
-    int64_t offset_in_vectors = offset / kElementsPerVector;
-    int64_t offset_in_elements = offset % kElementsPerVector;
-
-    ptr_ += offset_in_vectors;
-    offset_ = int(offset_in_elements);
-  }
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  ConstSubbyteReference(
-    Element *ptr = nullptr
-  ): ConstSubbyteReference(ptr, 0) { }
-
-  /// Gets storage pointer
-  CUTLASS_HOST_DEVICE
-  StoragePointer storage_pointer() const {
-    return ptr_;
-  }
-
-  /// Gets element offset within storage vector
-  CUTLASS_HOST_DEVICE
-  int element_offset() const {
-    return offset_;
-  }
-
-  /// Unpacks an element from memory
-  CUTLASS_HOST_DEVICE
-  Element get() const {
-    Storage item = Storage((*ptr_ >> (offset_ * sizeof_bits<Element>::value)) & kMask);
-    return reinterpret_cast<Element const &>(item);
-  }
-
-  /// Unpacks an element from memory
-  CUTLASS_HOST_DEVICE
-  operator Element() const {
-    return get();
-  }
-
-  /// Adds an offset in units of elements to the reference
-  CUTLASS_HOST_DEVICE
-  ConstSubbyteReference &operator+=(int offset) {
-
-    offset += offset_;
-    
-    int offset_in_vectors = offset / kElementsPerVector;
-    int offset_in_elements = offset % kElementsPerVector;
-
-    ptr_ += offset_in_vectors;
-    offset_ = offset_in_elements;
-
-    return *this;
-  }
-
-  /// Adds an offset in units of elements to the reference
-  CUTLASS_HOST_DEVICE
-  ConstSubbyteReference &operator+=(long long offset) {
-
-    offset += offset_;
-    
-    long long offset_in_vectors = offset / kElementsPerVector;
-    int offset_in_elements = int(offset % kElementsPerVector);
-
-    ptr_ += offset_in_vectors;
-    offset_ = offset_in_elements;
-
-    return *this;
-  }
-
-  /// Adds an offset in units of elements to the reference
-  CUTLASS_HOST_DEVICE
-  ConstSubbyteReference &operator-=(int offset) {
-    
-    int offset_in_vectors = offset / kElementsPerVector;
-    int offset_in_elements = offset % kElementsPerVector;
-
-    ptr_ -= offset_in_vectors;
-    offset_ -= offset_in_elements;
-
-    if (offset_ < 0) {
-      offset_ += kElementsPerVector;
-      --ptr_;
-    }
-
-    return *this;
-  }
-
-  /// Adds an offset in units of elements to the reference
-  CUTLASS_HOST_DEVICE
-  ConstSubbyteReference &operator-=(long long offset) {
-    
-    long long offset_in_vectors = offset / kElementsPerVector;
-    int offset_in_elements = int(offset % kElementsPerVector);
-
-    ptr_ -= offset_in_vectors;
-    offset_ -= offset_in_elements;
-
-    if (offset_ < 0) {
-      offset_ += kElementsPerVector;
-      --ptr_;
-    }
-
-    return *this;
-  }
-
-  /// Returns a reference to an element with a given offset from the current reference
-  CUTLASS_HOST_DEVICE
-  ConstSubbyteReference operator+(int offset) const {
-
-    ConstSubbyteReference ref(ptr_, offset_);
-    ref += offset;
-
-    return ref;
-  }
-
-  /// Returns a reference to an element with a given offset from the current reference
-  CUTLASS_HOST_DEVICE
-  ConstSubbyteReference operator+(long long offset) const {
-    
-    ConstSubbyteReference ref(ptr_, offset_);
-    ref += offset;
-
-    return ref;
-  }
-
-  /// Returns a reference to an element with a given offset from the current reference
-  CUTLASS_HOST_DEVICE
-  ConstSubbyteReference operator-(int offset) const {
-
-    ConstSubbyteReference ref(ptr_, offset_);
-    ref -= offset;
-
-    return ref;
-  }
-
-  /// Returns a reference to an element with a given offset from the current reference
-  CUTLASS_HOST_DEVICE
-  ConstSubbyteReference operator-=(long long offset) const {
-
-    ConstSubbyteReference ref(ptr_, offset_);
-    ref -= offset;
-
-    return ref;
-  }
-
-  /// Computes the difference in elements between references
-  CUTLASS_HOST_DEVICE
-  ptrdiff_t operator-(ConstSubbyteReference ref) const {
-    return (ptr_ - ref.ptr_) * kElementsPerVector + (offset_ - ref.offset_);
-  }
-
-  /// Explicit cast to int
-  CUTLASS_HOST_DEVICE
-  explicit operator int() const {
-    return int(get());
-  }
-
-  /// Explicit cast to signed 64-bit integer
-  CUTLASS_HOST_DEVICE
-  explicit operator int64_t() const {
-    return int64_t(get());
-  }
-
-  /// Explicit cast to unsigned 64-bit integer
-  CUTLASS_HOST_DEVICE
-  explicit operator uint64_t() const {
-    return uint64_t(get());
-  }
-
-  /// Explicit cast to float
-  CUTLASS_HOST_DEVICE
-  explicit operator float() const {
-    return float(get());
-  }
-
-  /// Explicit cast to double
-  CUTLASS_HOST_DEVICE
-  explicit operator double() const {
-    return double(get());
-  }
-};
-
-template <
-  typename Element_,              /// CUTLASS numeric element type.
-  typename Storage_ =             /// Underlying storage type. Must be able to hold an integer
-                                  ///   number of objects of type Element.
-
-#if defined(__CUDA_ARCH__)        /// Default size depends on width of atomicCas() overloads.
-  #if (__CUDA_ARCH__ >= 700)      ///
-  uint16_t
-  #else
-  uint32_t
-  #endif
-#else
-  uint8_t
-#endif
-  ,
-  class = void
->
-class SubbyteReference {
-public:
-
-  using Element = Element_;
-  using Storage = Storage_;
-  using StoragePointer = Storage *;
-
-  static_assert(sizeof_bits<Element>::value <= sizeof_bits<Storage>::value,
-    "Size of Element must not be greater than Storage.");
-
-  static_assert(!(sizeof_bits<Storage>::value % sizeof_bits<Element>::value),
-    "Storage must be divisible by Element");
-
-private:
-
-  ///! Number of elements per storage vector
-  int const kElementsPerVector = sizeof_bits<Storage>::value / sizeof_bits<Element>::value;
-
-  ///! Bit mask 
-  Storage const kMask = 
-    ((sizeof_bits<Element>::value < sizeof_bits<Storage>::value) ? 
-      (Storage(1) << sizeof_bits<Element>::value) - Storage(1) :
-      ~Storage(0));
-
-private:
-
-  /// Pointer to array containing element
-  StoragePointer ptr_;
-
-  /// Offset (in units of elements) from pointer.
-  ///
-  /// Invariant: must always be in range [0, kElementsPerVector)
-  int offset_;
-
-public:
-
-  CUTLASS_HOST_DEVICE
-  SubbyteReference(): ptr_(nullptr), offset_(0) { }
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  SubbyteReference(
-    Element *ptr,           /// pointer to memory
-    int64_t offset          /// logical offset in units of Element
-  ): 
-    ptr_(reinterpret_cast<StoragePointer>(ptr)),
-    offset_(0) {
-
-    int64_t offset_in_vectors = offset / kElementsPerVector;
-    int64_t offset_in_elements = offset % kElementsPerVector;
-
-    ptr_ += offset_in_vectors;
-    offset_ = int(offset_in_elements);
-  }
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  SubbyteReference(
-    Element *ptr = nullptr
-  ): SubbyteReference(ptr, 0) { }
-
-  /// Gets storage pointer
-  CUTLASS_HOST_DEVICE
-  StoragePointer storage_pointer() const {
-    return ptr_;
-  }
-
-  /// Gets storage pointer
-  CUTLASS_HOST_DEVICE
-  Element * operator&() const {
-    return reinterpret_cast<Element *>(ptr_);
-  }
-
-  /// Gets element offset within storage vector
-  CUTLASS_HOST_DEVICE
-  int element_offset() const {
-    return offset_;
-  }
-
-  /// Unpacks an element from memory
-  CUTLASS_HOST_DEVICE
-  Element get() const {
-    uint8_t const* byte_ptr = reinterpret_cast<uint8_t const*>(ptr_);
-    // Convert offset in elements to offset in bytes
-    constexpr int elements_per_byte = cutlass::sizeof_bits<uint8_t>::value / cutlass::sizeof_bits<Element>::value;
-    byte_ptr += offset_ / elements_per_byte;
-    // Offset of element within a byte
-    int byte_offset = offset_ % elements_per_byte;
-    uint8_t item = uint8_t((*byte_ptr >> (byte_offset * cutlass::sizeof_bits<Element>::value)) & kMask);
-    return reinterpret_cast<Element const &>(item);
-  }
-
-  /// Stores an element to memory
-  CUTLASS_HOST_DEVICE
-  SubbyteReference & set(Element const &x) {
-
-    Storage item        = (reinterpret_cast<Storage const &>(x) & kMask);
-    Storage kUpdateMask = Storage(~(kMask << (offset_ * cutlass::sizeof_bits<Element>::value)));
-    Storage new_bits    = Storage(item << (offset_ * cutlass::sizeof_bits<Element>::value));
-
-#if defined(__CUDA_ARCH__)
-
-    //
-    // Homebrew read-modify-write
-    //
-    Storage original;
-    Storage updated;
-
-    do {
-
-      original = (*ptr_);
-
-      updated  = Storage((original & kUpdateMask) | new_bits);
-
-      original = atomicCAS(ptr_, original, updated);
-
-    } while (updated != original);
-
-#else
-
-    Storage original = (*ptr_);
-    Storage updated  = Storage((original & kUpdateMask) | new_bits);
-    *ptr_ = updated;
-
-#endif
-
-    return *this;
-  }
-
-  ////
-
-  /// Unpacks an element from memory
-  CUTLASS_HOST_DEVICE
-  operator Element() const {
-    return get();
-  }
-
-  /// Stores an element to memory
-  CUTLASS_HOST_DEVICE
-  SubbyteReference &operator=(Element const & x) {
-    return set(x);
-  }
-
-  /// Stores an element to memory
-  CUTLASS_HOST_DEVICE
-  SubbyteReference &operator=(SubbyteReference const & x) {
-    return set(x.get());
-  }
-
-  /// Stores an element to memory
-  CUTLASS_HOST_DEVICE
-  SubbyteReference &operator=(
-      ConstSubbyteReference<Element, Storage> const &x) {
-    return set(x.get());
-  }
-
-  /// Adds an offset in units of elements to the reference
-  CUTLASS_HOST_DEVICE
-  SubbyteReference &operator+=(int offset) {
-
-    offset += offset_;
-    
-    int offset_in_vectors = offset / kElementsPerVector;
-    int offset_in_elements = offset % kElementsPerVector;
-
-    ptr_ += offset_in_vectors;
-    offset_ = offset_in_elements;
-
-    return *this;
-  }
-
-  /// Adds an offset in units of elements to the reference
-  CUTLASS_HOST_DEVICE
-  SubbyteReference &operator+=(long long offset) {
-
-    offset += offset_;
-    
-    long long offset_in_vectors = offset / kElementsPerVector;
-    int offset_in_elements = int(offset % kElementsPerVector);
-
-    ptr_ += offset_in_vectors;
-    offset_ = offset_in_elements;
-
-    return *this;
-  }
-
-  /// Adds an offset in units of elements to the reference
-  CUTLASS_HOST_DEVICE
-  SubbyteReference &operator-=(int offset) {
-    
-    int offset_in_vectors = offset / kElementsPerVector;
-    int offset_in_elements = offset % kElementsPerVector;
-
-    ptr_ -= offset_in_vectors;
-    offset_ -= offset_in_elements;
-
-    if (offset_ < 0) {
-      offset_ += kElementsPerVector;
-      --ptr_;
-    }
-
-    return *this;
-  }
-
-  /// Adds an offset in units of elements to the reference
-  CUTLASS_HOST_DEVICE
-  SubbyteReference &operator-=(long long offset) {
-    
-    long long offset_in_vectors = offset / kElementsPerVector;
-    int offset_in_elements = int(offset % kElementsPerVector);
-
-    ptr_ -= offset_in_vectors;
-    offset_ -= offset_in_elements;
-
-    if (offset_ < 0) {
-      offset_ += kElementsPerVector;
-      --ptr_;
-    }
-
-    return *this;
-  }
-
-  /// Returns a reference to an element with a given offset from the current reference
-  CUTLASS_HOST_DEVICE
-  SubbyteReference operator+(int offset) const {
-
-    SubbyteReference ref(ptr_, offset_);
-    ref += offset;
-
-    return ref;
-  }
-
-  /// Returns a reference to an element with a given offset from the current reference
-  CUTLASS_HOST_DEVICE
-  SubbyteReference operator+(long long offset) const {
-    
-    SubbyteReference ref(ptr_, offset_);
-    ref += offset;
-
-    return ref;
-  }
-
-  /// Returns a reference to an element with a given offset from the current reference
-  CUTLASS_HOST_DEVICE
-  SubbyteReference operator-(int offset) const {
-
-    SubbyteReference ref(ptr_, offset_);
-    ref -= offset;
-
-    return ref;
-  }
-
-  /// Returns a reference to an element with a given offset from the current reference
-  CUTLASS_HOST_DEVICE
-  SubbyteReference operator-=(long long offset) const {
-
-    SubbyteReference ref(ptr_, offset_);
-    ref -= offset;
-
-    return ref;
-  }
-
-  /// Computes the difference in elements between references
-  CUTLASS_HOST_DEVICE
-  ptrdiff_t operator-(SubbyteReference ref) const {
-    return (ptr_ - ref.ptr_) * kElementsPerVector + (offset_ - ref.offset_);
-  }
-
-  /// Explicit cast to int
-  CUTLASS_HOST_DEVICE
-  explicit operator int() const {
-    return int(get());
-  }
-
-  /// Explicit cast to signed 64-bit integer
-  CUTLASS_HOST_DEVICE
-  explicit operator int64_t() const {
-    return int64_t(get());
-  }
-
-  /// Explicit cast to unsigned 64-bit integer
-  CUTLASS_HOST_DEVICE
-  explicit operator uint64_t() const {
-    return uint64_t(get());
-  }
-
-  /// Explicit cast to float
-  CUTLASS_HOST_DEVICE
-  explicit operator float() const {
-    return float(get());
-  }
-
-  /// Explicit cast to double
-  CUTLASS_HOST_DEVICE
-  explicit operator double() const {
-    return double(get());
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template<typename T> using _war = T;
-template <
-  typename Element_,              /// CUTLASS numeric element type.
-  typename Storage_               /// Underlying basic storage type.
->
-class SubbyteReference<Element_, Storage_, 
-    typename platform::enable_if<sizeof_bits<Storage_>::value % sizeof_bits<Element_>::value != 0>::type> {
-public:
-
-  using Element = Element_;
-  /// Note: It's possible that StorageUnit is not divisible by Element.
-  /// For example, an Element instance might be stored across 2 StorageUnit instances.
-  /// Thus, CUTLASS needs a storage vector to hold an integer number of Element instances.
-
-  using StorageUnit = Storage_;
-private:
-  using StorageContainerCalculator = cutlass::detail::StorageContainerCalculator<Element, StorageUnit>;
-public:
-  static int const kBitsStoredVec = StorageContainerCalculator::kContainerTypeNumBits; 
-  static int const kNumStorageUnitPerStoredVec = StorageContainerCalculator::kContainerTypeNumStorageUnit;
-
-  using StorageVec = StorageUnit[kNumStorageUnitPerStoredVec];
-  using StorageVecPointer = StorageVec *;
-  
-  using CudaAtomicType = typename platform::conditional<
-      sizeof_bits<StorageUnit>::value == 16,
-      uint32_t,
-      uint64_t
-    >::type;
-
-  static_assert(sizeof_bits<Element>::value <= sizeof_bits<StorageVec>::value,
-    "Size of Element must not be greater than StorageVec.");
-
-  static_assert(!(sizeof_bits<StorageVec>::value % sizeof_bits<Element>::value),
-    "StorageVec must be divisible by Element");
-
-private:
-
-  ///! Number of elements per storage vector
-  int const kElementsPerVector = sizeof_bits<StorageVec>::value / sizeof_bits<Element>::value;
-
-  ///! Bit mask for storage unit.
-  StorageUnit const kMask = (StorageUnit(1) << sizeof_bits<Element>::value) - StorageUnit(1);
-
-  /// Pointer to array containing element
-  _war<StorageVecPointer> ptr_;
-
-  /// Offset (in units of elements) from pointer.
-  ///
-  /// Invariant: must always be in range [0, kElementsPerVector)
-  int offset_;
-
-  /// Element may be stored across 2 storage unit.
-  ///   Low storage unit index in StorageVec
-  ///   High storage unit index in StorageVec
-  int low_storage_unit_idx_;
-  int high_storage_unit_idx_;
-
-  /// Full Mask to extract the entire element
-  uint64_t full_element_mask_;
-
-  /// Mask to extract the Element from Low storage unit and High storage unit.
-  StorageUnit low_storage_mask_;
-  StorageUnit high_storage_mask_;
-
-  /// Start bit index inside the storage unit.
-  int start_bit_idx_;
-
-private:
-
-  CUTLASS_HOST_DEVICE
-  void update_element_status() {
-    int num_bits = offset_ * sizeof_bits<Element>::value;
-
-    start_bit_idx_ = num_bits % sizeof_bits<StorageUnit>::value;
-    
-    low_storage_unit_idx_ = num_bits / sizeof_bits<StorageUnit>::value;
-    high_storage_unit_idx_ = sizeof_bits<StorageUnit>::value - (start_bit_idx_) < sizeof_bits<Element>::value 
-                              ? low_storage_unit_idx_ + 1 : low_storage_unit_idx_;
-    
-    full_element_mask_ = uint64_t(kMask) << start_bit_idx_;
-    low_storage_mask_ = StorageUnit(full_element_mask_ & ~StorageUnit(0));
-    high_storage_mask_ = StorageUnit((full_element_mask_ >> sizeof_bits<StorageUnit>::value) & ~StorageUnit(0));
-  }
-
-public:
-
-  CUTLASS_HOST_DEVICE
-  SubbyteReference(): ptr_(nullptr), offset_(0) { }
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  SubbyteReference(
-    Element *ptr,           /// pointer to memory
-    int64_t offset          /// logical offset in units of Element
-  ): 
-    ptr_(reinterpret_cast<StorageVecPointer>(ptr)),
-    offset_(0) {
-    int64_t offset_in_vectors = offset / kElementsPerVector;
-    int64_t offset_in_elements = offset % kElementsPerVector;
-
-    ptr_ += offset_in_vectors;
-    offset_ = int(offset_in_elements);
-
-    update_element_status();
-  }
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  SubbyteReference(
-    Element *ptr = nullptr
-  ): SubbyteReference(ptr, 0) { }
-
-  /// Gets StorageVec pointer
-  CUTLASS_HOST_DEVICE
-  StorageVecPointer storage_pointer() const {
-    return ptr_;
-  }
-
-  /// Gets StorageVec pointer
-  CUTLASS_HOST_DEVICE
-  Element * operator&() const {
-    return reinterpret_cast<Element *>(ptr_);
-  }
-
-  /// Gets element offset within StorageVec vector
-  CUTLASS_HOST_DEVICE
-  int element_offset() const {
-    return offset_;
-  }
-
-  /// Unpacks an element from memory
-  CUTLASS_HOST_DEVICE
-  Element get() const {
-    StorageUnit low_bits = (*ptr_)[low_storage_unit_idx_] & low_storage_mask_;
-    StorageUnit high_bits = low_storage_unit_idx_ != high_storage_unit_idx_ ? (*ptr_)[high_storage_unit_idx_] & high_storage_mask_ : 0;
-
-    uint64_t full_item = ((uint64_t)high_bits << sizeof_bits<StorageUnit>::value) | low_bits;
-    uint8_t result = uint8_t(full_item >> start_bit_idx_);
-
-    return reinterpret_cast<Element const &>(result);
-  }
-
-  /// Stores an element to memory
-  CUTLASS_HOST_DEVICE
-  SubbyteReference & set(Element const &x) {
-
-    uint64_t item = static_cast<uint64_t>((reinterpret_cast<uint8_t const &>(x) & kMask)) << start_bit_idx_;
-    
-    StorageUnit low_new_bits  = StorageUnit(item & ~StorageUnit(0));
-    StorageUnit high_new_bits = StorageUnit(item >> sizeof_bits<StorageUnit>::value);
-
-    StorageUnit const kLowUpdateMask  = StorageUnit((~full_element_mask_) & (~StorageUnit(0)));
-    StorageUnit const kHighUpdateMask = StorageUnit(((~full_element_mask_) >> sizeof_bits<StorageUnit>::value) & (~StorageUnit(0)));
-
-#if defined(__CUDA_ARCH__)
-    //
-    // Homebrew read-modify-write
-    //
-    if(high_storage_unit_idx_ != low_storage_unit_idx_){
-      /// Only need update 2 storage unit at once.
-      /// consider misaligned address issue, we need to do atomicCAS twice 
-      StorageUnit original_low_bits, original_high_bits, update_low_bits, update_high_bits;
-      do {
-        original_low_bits  = ((*ptr_)[low_storage_unit_idx_]);
-        update_low_bits  = (original_low_bits & kLowUpdateMask) | low_new_bits;
-        original_low_bits = atomicCAS(&((*ptr_)[low_storage_unit_idx_]), original_low_bits, update_low_bits);
-      } while (update_low_bits != original_low_bits);
-      do {
-        original_high_bits = ((*ptr_)[high_storage_unit_idx_]);
-        update_high_bits  = (original_high_bits & kHighUpdateMask) | high_new_bits;
-        original_high_bits = atomicCAS(&((*ptr_)[high_storage_unit_idx_]), original_high_bits, update_high_bits);
-      } while (update_high_bits != original_high_bits);
-    }
-    else {
-      /// Only need update 1 storage unit.
-      StorageUnit original, updated;
-      do {
-        original = ((*ptr_)[low_storage_unit_idx_]);
-
-        updated = (original & kLowUpdateMask) | low_new_bits;
-
-        original = atomicCAS(&((*ptr_)[low_storage_unit_idx_]), original, updated);
-
-      } while (updated != original);
-    }
-#else
-
-
-    StorageUnit update_low_bits  = ((*ptr_)[low_storage_unit_idx_] & kLowUpdateMask) | low_new_bits;
-    StorageUnit update_high_bits = ((*ptr_)[high_storage_unit_idx_] & kHighUpdateMask) | high_new_bits;
-
-    (*ptr_)[low_storage_unit_idx_] = update_low_bits;
-
-    if(low_storage_unit_idx_ != high_storage_unit_idx_)
-      (*ptr_)[high_storage_unit_idx_] = update_high_bits;
-#endif
-
-    return *this;
-  }
-
-  ////
-
-  /// Unpacks an element from memory
-  CUTLASS_HOST_DEVICE
-  operator Element() const {
-    return get();
-  }
-
-  /// Stores an element to memory
-  CUTLASS_HOST_DEVICE
-  SubbyteReference &operator=(Element const & x) {
-    return set(x);
-  }
-
-  /// Stores an element to memory
-  CUTLASS_HOST_DEVICE
-  SubbyteReference &operator=(SubbyteReference const & x) {
-    return set(x.get());
-  }
-
-  /// Stores an element to memory
-  CUTLASS_HOST_DEVICE
-  SubbyteReference &operator=(
-      ConstSubbyteReference<Element, StorageVec> const &x) {
-    return set(x.get());
-  }
-
-  /// Adds an offset in units of elements to the reference
-  CUTLASS_HOST_DEVICE
-  SubbyteReference &operator+=(int offset) {
-
-    offset += offset_;
-    
-    int offset_in_vectors = offset / kElementsPerVector;
-    int offset_in_elements = offset % kElementsPerVector;
-
-    ptr_ += offset_in_vectors;
-    offset_ = offset_in_elements;
-
-    update_element_status();
-
-    return *this;
-  }
-
-  /// Adds an offset in units of elements to the reference
-  CUTLASS_HOST_DEVICE
-  SubbyteReference &operator+=(long long offset) {
-
-    offset += offset_;
-    
-    long long offset_in_vectors = offset / kElementsPerVector;
-    int offset_in_elements = int(offset % kElementsPerVector);
-
-    ptr_ += offset_in_vectors;
-    offset_ = offset_in_elements;
-
-    update_element_status();
-
-    return *this;
-  }
-
-  /// Adds an offset in units of elements to the reference
-  CUTLASS_HOST_DEVICE
-  SubbyteReference &operator-=(int offset) {
-    
-    int offset_in_vectors = offset / kElementsPerVector;
-    int offset_in_elements = offset % kElementsPerVector;
-
-    ptr_ -= offset_in_vectors;
-    offset_ -= offset_in_elements;
-
-    if (offset_ < 0) {
-      offset_ += kElementsPerVector;
-      --ptr_;
-    }
-
-    update_element_status();
-    return *this;
-  }
-
-  /// Adds an offset in units of elements to the reference
-  CUTLASS_HOST_DEVICE
-  SubbyteReference &operator-=(long long offset) {
-    
-    long long offset_in_vectors = offset / kElementsPerVector;
-    int offset_in_elements = int(offset % kElementsPerVector);
-
-    ptr_ -= offset_in_vectors;
-    offset_ -= offset_in_elements;
-
-    if (offset_ < 0) {
-      offset_ += kElementsPerVector;
-      --ptr_;
-    }
-
-    update_element_status();
-    return *this;
-  }
-
-  /// Returns a reference to an element with a given offset from the current reference
-  CUTLASS_HOST_DEVICE
-  SubbyteReference operator+(int offset) const {
-
-    SubbyteReference ref(ptr_, offset_);
-    ref += offset;
-
-    return ref;
-  }
-
-  /// Returns a reference to an element with a given offset from the current reference
-  CUTLASS_HOST_DEVICE
-  SubbyteReference operator+(long long offset) const {
-    
-    SubbyteReference ref(ptr_, offset_);
-    ref += offset;
-
-    return ref;
-  }
-
-  /// Returns a reference to an element with a given offset from the current reference
-  CUTLASS_HOST_DEVICE
-  SubbyteReference operator-(int offset) const {
-
-    SubbyteReference ref(ptr_, offset_);
-    ref -= offset;
-
-    return ref;
-  }
-
-  /// Returns a reference to an element with a given offset from the current reference
-  CUTLASS_HOST_DEVICE
-  SubbyteReference operator-=(long long offset) const {
-
-    SubbyteReference ref(ptr_, offset_);
-    ref -= offset;
-
-    return ref;
-  }
-
-  /// Computes the difference in elements between references
-  CUTLASS_HOST_DEVICE
-  ptrdiff_t operator-(SubbyteReference ref) const {
-    return (ptr_ - ref.ptr_) * kElementsPerVector + (offset_ - ref.offset_);
-  }
-
-  /// Explicit cast to int
-  CUTLASS_HOST_DEVICE
-  explicit operator int() const {
-    return int(get());
-  }
-
-  /// Explicit cast to signed 64-bit integer
-  CUTLASS_HOST_DEVICE
-  explicit operator int64_t() const {
-    return int64_t(get());
-  }
-
-  /// Explicit cast to unsigned 64-bit integer
-  CUTLASS_HOST_DEVICE
-  explicit operator uint64_t() const {
-    return uint64_t(get());
-  }
-
-  /// Explicit cast to float
-  CUTLASS_HOST_DEVICE
-  explicit operator float() const {
-    return float(get());
-  }
-
-  /// Explicit cast to double
-  CUTLASS_HOST_DEVICE
-  explicit operator double() const {
-    return double(get());
-  }
-};
-
-template<typename T> using _war = T;
-template <
-  typename Element_,              /// CUTLASS numeric element type.
-  typename Storage_               /// Underlying storage type. Must be able to hold an integer 
->
-class ConstSubbyteReference<Element_, Storage_, 
-    typename platform::enable_if<sizeof_bits<Storage_>::value % sizeof_bits<Element_>::value != 0>::type> {
-public:
-
-  using Element = Element_;
-  ///! Note: Storage unit could not be divisibale by Element,   
-  ///   Type element may be stored across 2 storage units, so need a storage vector to hold integer
-  ///   number of objects of type Element.
-  using StorageUnit = Storage_;
-  static int const kBitsStoredVec = cutlass::lcm_cxx11(sizeof_bits<Element>::value, sizeof_bits<StorageUnit>::value); 
-  static int const kNumStorageUnitPerStoredVec = kBitsStoredVec / sizeof_bits<StorageUnit>::value;
-
-  using StorageVec = StorageUnit[kNumStorageUnitPerStoredVec];
-  using StorageVecPointer = StorageVec const *;
-  
-  using CudaAtomicType = typename platform::conditional<
-      sizeof_bits<StorageUnit>::value == 16,
-      uint32_t,
-      uint64_t
-    >::type;
-
-  static_assert(sizeof_bits<Element>::value <= sizeof_bits<StorageVec>::value,
-    "Size of Element must not be greater than StorageVec.");
-
-  static_assert(!(sizeof_bits<StorageVec>::value % sizeof_bits<Element>::value),
-    "StorageVec must be divisible by Element");
-
-private:
-
-  ///! Number of elements per storage vector
-  int const kElementsPerVector = sizeof_bits<StorageVec>::value / sizeof_bits<Element>::value;
-
-  ///! Bit mask for storage unit.
-  StorageUnit const kMask = (StorageUnit(1) << sizeof_bits<Element>::value) - StorageUnit(1);
-
-  /// Pointer to array containing element
-  _war<StorageVecPointer> ptr_;
-
-  /// Offset (in units of elements) from pointer.
-  ///
-  /// Invariant: must always be in range [0, kElementsPerVector)
-  int offset_;
-
-  /// Element may be stored across 2 storage unit.
-  ///   Low storage unit index in StorageVec
-  ///   High storage unit index in StorageVec
-  int low_storage_unit_idx_;
-  int high_storage_unit_idx_;
-
-  /// Full Mask to extract the entire element
-  uint64_t full_element_mask_;
-
-  /// Mask to extract the Element from Low storage unit and High storage unit.
-  StorageUnit low_storage_mask_;
-  StorageUnit high_storage_mask_;
-
-  /// Start bit index inside the storage unit.
-  int start_bit_idx_;
-
-private:
-
-  CUTLASS_HOST_DEVICE
-  void update_element_status() {
-    int num_bits = offset_ * sizeof_bits<Element>::value;
-
-    start_bit_idx_ = num_bits % sizeof_bits<StorageUnit>::value;
-    
-    low_storage_unit_idx_ = num_bits / sizeof_bits<StorageUnit>::value;
-    high_storage_unit_idx_ = sizeof_bits<StorageUnit>::value - (start_bit_idx_) < sizeof_bits<Element>::value 
-                              ? low_storage_unit_idx_ + 1 : low_storage_unit_idx_;
-    
-    full_element_mask_ = uint64_t(kMask) << start_bit_idx_;
-    low_storage_mask_ = StorageUnit(full_element_mask_ & ~StorageUnit(0));
-    high_storage_mask_ = StorageUnit((full_element_mask_ >> sizeof_bits<StorageUnit>::value) & ~StorageUnit(0));
-  }
-
-public:
-
-  CUTLASS_HOST_DEVICE
-  ConstSubbyteReference(): ptr_(nullptr), offset_(0) { }
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  ConstSubbyteReference(
-    Element const *ptr,           /// pointer to memory
-    int64_t offset          /// logical offset in units of Element
-  ): 
-    ptr_(reinterpret_cast<StorageVecPointer>(ptr)),
-    offset_(0) {
-
-    int64_t offset_in_vectors = offset / kElementsPerVector;
-    int64_t offset_in_elements = offset % kElementsPerVector;
-
-    ptr_ += offset_in_vectors;
-    offset_ = int(offset_in_elements);
-
-    update_element_status();
-  }
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  ConstSubbyteReference(
-    Element *ptr = nullptr
-  ): ConstSubbyteReference(ptr, 0) { }
-
-  /// Gets storage pointer
-  CUTLASS_HOST_DEVICE
-  StorageVecPointer storage_pointer() const {
-    return ptr_;
-  }
-
-  /// Gets element offset within storage vector
-  CUTLASS_HOST_DEVICE
-  int element_offset() const {
-    return offset_;
-  }
-
-  /// Unpacks an element from memory
-  CUTLASS_HOST_DEVICE
-  Element get() const {
-    StorageUnit low_bits = (*ptr_)[low_storage_unit_idx_] & low_storage_mask_;
-    StorageUnit high_bits = low_storage_unit_idx_ != high_storage_unit_idx_ ? (*ptr_)[high_storage_unit_idx_] & high_storage_mask_ : 0;
-
-    uint64_t full_item = ((uint64_t)high_bits << sizeof_bits<StorageUnit>::value) | low_bits;
-    uint8_t result = uint8_t(full_item >> start_bit_idx_);
-
-    return reinterpret_cast<Element const &>(result);
-  }
-
-  /// Unpacks an element from memory
-  CUTLASS_HOST_DEVICE
-  operator Element() const {
-    return get();
-  }
-
-  /// Adds an offset in units of elements to the reference
-  CUTLASS_HOST_DEVICE
-  ConstSubbyteReference &operator+=(int offset) {
-
-    offset += offset_;
-    
-    int offset_in_vectors = offset / kElementsPerVector;
-    int offset_in_elements = offset % kElementsPerVector;
-
-    ptr_ += offset_in_vectors;
-    offset_ = offset_in_elements;
-
-    update_element_status();
-
-    return *this;
-  }
-
-  /// Adds an offset in units of elements to the reference
-  CUTLASS_HOST_DEVICE
-  ConstSubbyteReference &operator+=(long long offset) {
-
-    offset += offset_;
-    
-    long long offset_in_vectors = offset / kElementsPerVector;
-    int offset_in_elements = int(offset % kElementsPerVector);
-
-    ptr_ += offset_in_vectors;
-    offset_ = offset_in_elements;
-
-    update_element_status();
-
-    return *this;
-  }
-
-  /// Adds an offset in units of elements to the reference
-  CUTLASS_HOST_DEVICE
-  ConstSubbyteReference &operator-=(int offset) {
-    
-    int offset_in_vectors = offset / kElementsPerVector;
-    int offset_in_elements = offset % kElementsPerVector;
-
-    ptr_ -= offset_in_vectors;
-    offset_ -= offset_in_elements;
-
-    if (offset_ < 0) {
-      offset_ += kElementsPerVector;
-      --ptr_;
-    }
-
-    update_element_status();
-
-    return *this;
-  }
-
-  /// Adds an offset in units of elements to the reference
-  CUTLASS_HOST_DEVICE
-  ConstSubbyteReference &operator-=(long long offset) {
-    
-    long long offset_in_vectors = offset / kElementsPerVector;
-    int offset_in_elements = int(offset % kElementsPerVector);
-
-    ptr_ -= offset_in_vectors;
-    offset_ -= offset_in_elements;
-
-    if (offset_ < 0) {
-      offset_ += kElementsPerVector;
-      --ptr_;
-    }
-
-    update_element_status();
-
-    return *this;
-  }
-
-  /// Returns a reference to an element with a given offset from the current reference
-  CUTLASS_HOST_DEVICE
-  ConstSubbyteReference operator+(int offset) const {
-
-    ConstSubbyteReference ref(ptr_, offset_);
-    ref += offset;
-
-    return ref;
-  }
-
-  /// Returns a reference to an element with a given offset from the current reference
-  CUTLASS_HOST_DEVICE
-  ConstSubbyteReference operator+(long long offset) const {
-    
-    ConstSubbyteReference ref(ptr_, offset_);
-    ref += offset;
-
-    return ref;
-  }
-
-  /// Returns a reference to an element with a given offset from the current reference
-  CUTLASS_HOST_DEVICE
-  ConstSubbyteReference operator-(int offset) const {
-
-    ConstSubbyteReference ref(ptr_, offset_);
-    ref -= offset;
-
-    return ref;
-  }
-
-  /// Returns a reference to an element with a given offset from the current reference
-  CUTLASS_HOST_DEVICE
-  ConstSubbyteReference operator-=(long long offset) const {
-
-    ConstSubbyteReference ref(ptr_, offset_);
-    ref -= offset;
-
-    return ref;
-  }
-
-  /// Computes the difference in elements between references
-  CUTLASS_HOST_DEVICE
-  ptrdiff_t operator-(ConstSubbyteReference ref) const {
-    return (ptr_ - ref.ptr_) * kElementsPerVector + (offset_ - ref.offset_);
-  }
-
-  /// Explicit cast to int
-  CUTLASS_HOST_DEVICE
-  explicit operator int() const {
-    return int(get());
-  }
-
-  /// Explicit cast to signed 64-bit integer
-  CUTLASS_HOST_DEVICE
-  explicit operator int64_t() const {
-    return int64_t(get());
-  }
-
-  /// Explicit cast to unsigned 64-bit integer
-  CUTLASS_HOST_DEVICE
-  explicit operator uint64_t() const {
-    return uint64_t(get());
-  }
-
-  /// Explicit cast to float
-  CUTLASS_HOST_DEVICE
-  explicit operator float() const {
-    return float(get());
-  }
-
-  /// Explicit cast to double
-  CUTLASS_HOST_DEVICE
-  explicit operator double() const {
-    return double(get());
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename Element, bool subbyte = (sizeof_bits<Element>::value < 8)>
-struct ReferenceFactory;
-
-template <typename Element>
-struct ReferenceFactory<Element, false> {
-
-  ///! Number of elements per storage vector
-  static int const kElementsPerVector = 1;
-
-  CUTLASS_HOST_DEVICE
-  static Element &get(Element *ptr, int64_t offset) {
-    return ptr[offset];
-  }
-
-  CUTLASS_HOST_DEVICE
-  static Element const &get(Element const *ptr, int64_t offset) {
-    return ptr[offset];
-  }
-
-  CUTLASS_HOST_DEVICE
-  static Element *add_pointer_offset(Element *ptr, int64_t offset) {
-    return ptr + offset;
-  }
-
-  CUTLASS_HOST_DEVICE
-  static Element const *add_pointer_offset(Element const *ptr, int64_t offset) {
-    return ptr + offset;
-  }
-};
-
-template <typename Element>
-struct ReferenceFactory<Element, true> {
-
-  //
-  // Static methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  static SubbyteReference<Element> get(Element *ptr, int64_t offset) {
-    return SubbyteReference<Element>(ptr, offset);
-  }
-
-  CUTLASS_HOST_DEVICE
-  static ConstSubbyteReference<Element> get(Element const *ptr,
-                                             int64_t offset) {
-    return ConstSubbyteReference<Element>(ptr, offset);
-  }
-
-  /// Helper to add an offset in number of elements, assuming this offset is divisible
-  /// by the vector size.
-  CUTLASS_HOST_DEVICE
-  static Element *add_pointer_offset(Element *ptr, int64_t offset_in_elements) {
-    return &SubbyteReference<Element>(ptr, offset_in_elements);
-  }
-
-  /// Helper to add an offset in number of elements, assuming this offset is divisible
-  /// by the vector size.
-  CUTLASS_HOST_DEVICE
-  static Element const *add_pointer_offset(Element const *ptr, int64_t offset_in_elements) {
-    return &ConstSubbyteReference<Element>(ptr, offset_in_elements);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/tensor_coord.h b/lightllm-kernel/cutlass/include/cutlass/tensor_coord.h
deleted file mode 100755
index 982ec4e03..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/tensor_coord.h
+++ /dev/null
@@ -1,326 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines a canonical coordinate for rank=4 tensors offering named indices.
-*/
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/coord.h"
-
-namespace cutlass {
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a canonical 4D coordinate used by tensor operations.
-struct Tensor4DCoord : public Coord<4> {
-
-  /// Base class
-  using Base = Coord<4>;
-
-  /// Index type
-  using Index = typename Base::Index;
-
-  /// LongIndex type
-  using LongIndex = typename Base::LongIndex;
-
-  /// Batch dimension
-  static int const kN = 0;
-
-  /// Height dimension
-  static int const kH = 1;
-
-  /// Width dimension
-  static int const kW = 2;
-
-  /// Channels dimension
-  static int const kC = 3;
-
-  //
-  // Methods
-  //
-
-  /// Default ctor
-  CUTLASS_HOST_DEVICE
-  Tensor4DCoord() { }
-
-  /// Constructs from Coord<4>
-  CUTLASS_HOST_DEVICE
-  Tensor4DCoord(Coord<4> const &coord): Base(coord) { }
-
-  /// Helper to construct from N, H, W, and C.
-  CUTLASS_HOST_DEVICE
-  Tensor4DCoord(Index n, Index h, Index w, Index c): Base(make_Coord(n, h, w, c)) { }
-
-  /// Helper to construct from N, H, W, and C, which are LongIndex type
-  CUTLASS_HOST_DEVICE
-  Tensor4DCoord(LongIndex n, LongIndex h, LongIndex w, LongIndex c)
-    : Base(make_Coord(Index(n), Index(h), Index(w), Index(c))) { }
-
-  /// Returns the batch of the coordinate
-  CUTLASS_HOST_DEVICE
-  Index const & n() const { return this->at(kN); }
-
-  /// Returns the batch of the coordinate
-  CUTLASS_HOST_DEVICE
-  Index & n() { return this->at(kN); }
-
-  /// Returns the row of the coordinate
-  CUTLASS_HOST_DEVICE
-  Index const & h() const { return this->at(kH); }
-
-  /// Returns the row of the coordinate
-  CUTLASS_HOST_DEVICE
-  Index & h() { return this->at(kH); }
-
-  /// Returns the column of the coordinate
-  CUTLASS_HOST_DEVICE
-  Index const & w() const { return this->at(kW); }
-
-  /// Returns the column of the coordinate
-  CUTLASS_HOST_DEVICE
-  Index & w() { return this->at(kW); }
-
-  /// Returns the channel of the coordinate
-  CUTLASS_HOST_DEVICE
-  Index const & c() const { return this->at(kC); }
-
-  /// Returns the channel of the coordinate
-  CUTLASS_HOST_DEVICE
-  Index & c() { return this->at(kC); }
-
-  //
-  // Coord operators
-  //
-
-  /// Element-wise addition
-  CUTLASS_HOST_DEVICE
-  Tensor4DCoord operator+(Base const& b) const {
-    return Tensor4DCoord(Base::operator+(b));
-  }
-
-  /// Element-wise subtraction
-  CUTLASS_HOST_DEVICE
-  Tensor4DCoord operator-(Base const& b) const {
-    return Tensor4DCoord(Base::operator-(b));
-  }
-
-  /// Element-wise multiplication
-  CUTLASS_HOST_DEVICE
-  Tensor4DCoord operator*(Base const& b) const {
-    return Tensor4DCoord(Base::operator*(b));
-  }
-
-  /// Element-wise division
-  CUTLASS_HOST_DEVICE
-  Tensor4DCoord operator/(Base const& b) const {
-    return Tensor4DCoord(Base::operator/(b));
-  }
-
-  /// In-place addition
-  CUTLASS_HOST_DEVICE
-  Tensor4DCoord& operator+=(Base const& b) {
-    Base::operator+=(b);
-    return *this;
-  }
-
-  /// In-place subtraction
-  CUTLASS_HOST_DEVICE
-  Tensor4DCoord& operator-=(Base const& b) {
-    Base::operator-=(b);
-    return *this;
-  }
-
-  /// In-place multiplication
-  CUTLASS_HOST_DEVICE
-  Tensor4DCoord& operator*=(Base const& b) {
-    Base::operator*=(b);
-    return *this;
-  }
-
-  /// In-place division
-  CUTLASS_HOST_DEVICE
-  Tensor4DCoord& operator/=(Base const& b) {
-    Base::operator/=(b);
-    return *this;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a canonical 5D coordinate used by tensor operations.
-struct Tensor5DCoord : public Coord<5> {
-
-  /// Base class
-  using Base = Coord<5>;
-
-  /// Index type
-  using Index = typename Base::Index;
-
-  /// LongIndex type
-  using LongIndex = typename Base::LongIndex;
-
-  /// Batch dimension
-  static int const kN = 0;
-
-  /// Depth dimension
-  static int const kD = 1;
-
-  /// Height dimension
-  static int const kH = 2;
-
-  /// Width dimension
-  static int const kW = 3;
-
-  /// Channels dimension
-  static int const kC = 4;
-
-  //
-  // Methods
-  //
-
-  /// Default ctor
-  CUTLASS_HOST_DEVICE
-  Tensor5DCoord() { }
-
-  /// Constructs from Coord<5>
-  CUTLASS_HOST_DEVICE
-  Tensor5DCoord(Coord<5> const &coord): Base(coord) { }
-
-  /// Helper to construct from N, D, H, W, and C.
-  CUTLASS_HOST_DEVICE
-  Tensor5DCoord(Index n, Index d, Index h, Index w, Index c): Base(make_Coord(n, d, h, w, c)) { }
-
-  /// Helper to construct from N, D, H, W, and C, which are LongIndex type
-  CUTLASS_HOST_DEVICE
-  Tensor5DCoord(LongIndex n, LongIndex d, LongIndex h, LongIndex w, LongIndex c)
-    : Base(make_Coord(Index(n), Index(d), Index(h), Index(w), Index(c))) { }
-
-  /// Returns the batch of the coordinate
-  CUTLASS_HOST_DEVICE
-  Index const & n() const { return this->at(kN); }
-
-  /// Returns the batch of the coordinate
-  CUTLASS_HOST_DEVICE
-  Index & n() { return this->at(kN); }
-
-  /// Returns the batch of the coordinate
-  CUTLASS_HOST_DEVICE
-  Index const & d() const { return this->at(kD); }
-
-  /// Returns the batch of the coordinate
-  CUTLASS_HOST_DEVICE
-  Index & d() { return this->at(kD); }
-
-  /// Returns the row of the coordinate
-  CUTLASS_HOST_DEVICE
-  Index const & h() const { return this->at(kH); }
-
-  /// Returns the row of the coordinate
-  CUTLASS_HOST_DEVICE
-  Index & h() { return this->at(kH); }
-
-  /// Returns the column of the coordinate
-  CUTLASS_HOST_DEVICE
-  Index const & w() const { return this->at(kW); }
-
-  /// Returns the column of the coordinate
-  CUTLASS_HOST_DEVICE
-  Index & w() { return this->at(kW); }
-
-  /// Returns the channel of the coordinate
-  CUTLASS_HOST_DEVICE
-  Index const & c() const { return this->at(kC); }
-
-  /// Returns the channel of the coordinate
-  CUTLASS_HOST_DEVICE
-  Index & c() { return this->at(kC); }
-
-  //
-  // Coord operators
-  //
-
-  /// Element-wise addition
-  CUTLASS_HOST_DEVICE
-  Tensor5DCoord operator+(Base const& b) const {
-    return Tensor5DCoord(Base::operator+(b));
-  }
-
-  /// Element-wise subtraction
-  CUTLASS_HOST_DEVICE
-  Tensor5DCoord operator-(Base const& b) const {
-    return Tensor5DCoord(Base::operator-(b));
-  }
-
-  /// Element-wise multiplication
-  CUTLASS_HOST_DEVICE
-  Tensor5DCoord operator*(Base const& b) const {
-    return Tensor5DCoord(Base::operator*(b));
-  }
-
-  /// Element-wise division
-  CUTLASS_HOST_DEVICE
-  Tensor5DCoord operator/(Base const& b) const {
-    return Tensor5DCoord(Base::operator/(b));
-  }
-
-  /// In-place addition
-  CUTLASS_HOST_DEVICE
-  Tensor5DCoord& operator+=(Base const& b) {
-    Base::operator+=(b);
-    return *this;
-  }
-
-  /// In-place subtraction
-  CUTLASS_HOST_DEVICE
-  Tensor5DCoord& operator-=(Base const& b) {
-    Base::operator-=(b);
-    return *this;
-  }
-
-  /// In-place multiplication
-  CUTLASS_HOST_DEVICE
-  Tensor5DCoord& operator*=(Base const& b) {
-    Base::operator*=(b);
-    return *this;
-  }
-
-  /// In-place division
-  CUTLASS_HOST_DEVICE
-  Tensor5DCoord& operator/=(Base const& b) {
-    Base::operator/=(b);
-    return *this;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/tensor_ref.h b/lightllm-kernel/cutlass/include/cutlass/tensor_ref.h
deleted file mode 100755
index 1191f651e..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/tensor_ref.h
+++ /dev/null
@@ -1,419 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines a structure containing strides, bounds, and a pointer to tensor data.
-*/
-#pragma once
-
-
-#include "cutlass/cutlass.h"
-#include "cutlass/coord.h"
-#include "cutlass/platform/platform.h"
-#include "cutlass/subbyte_reference.h"
-
-namespace cutlass {
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Default layout function from coordinates in a tensor's index space into the n-D array held
-/// in memory.
-///
-/// All layout functions must define at least the members shown in IdentityTensorLayout<>.
-template <int Rank>
-class IdentityTensorLayout {
-public:
-  /// Logical rank of tensor
-  static int const kRank = Rank;
-
-  /// Rank of stride vector
-  static int const kStrideRank = Rank;
-
-  /// Index type used for coordinates
-  using Index = int32_t;
-
-  /// Long index type used for offsets
-  using LongIndex = int64_t;
-
-  /// Logical coordinate
-  using TensorCoord = Coord<kRank, Index>;
-
-  /// Stride vector
-  using Stride = Coord<kStrideRank, Index>;
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Stride data member
-  Stride stride_;
-
-public:
-
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  IdentityTensorLayout(Stride const &stride = Stride()): stride_(stride) { }
-
-  /// Returns the offset of a coordinate in linear memory
-  CUTLASS_HOST_DEVICE
-  LongIndex operator()(Coord<Rank> const &coord) const {
-    return coord.dot(stride_);
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride stride() const {
-    return stride_;
-  }
-
-  /// Returns the stride of the layout
-  CUTLASS_HOST_DEVICE
-  Stride & stride() {
-    return stride_;
-  }
-
-  /// Compute the number of contiguous elements needed to store a tensor with the given size
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity(TensorCoord const &size) const {
-    int idx = stride_.max_dim_index();
-    return stride_[idx] * size[idx];
-  }
-};
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-/* \brief TensorRef is a template for objects pointing to the start of tensors of arbitrary rank
-          and layout within memory. A TensorRef combines a pointer and a Layout concept
-
-  Examples:
-
-  (These examples use helpers for matrix layouts defined in cutlass/layout/matrix.h)
-
-  1. Column-major matrix may be represented as a rank=2 tensor:
-
-    TensorRef<float, layout::ColumnMajor> A(ptr_A, ldm);
-
-  2. Row-major matrix may be represented as a rank=2 tensor:
-
-    TensorRef<float, layout::RowMajor> B(ptr_A, ldm);
-
-  3. An interleaved matrix may be represented as a rank=2 tensor:
-
-    TensorRef<int8_t, layout::ColumnMajorInterleaved<32> > C;
-
-  4. A helper exists to define a TensorRef for a contiguous matrix whose layout
-     is not known at compile time.
-
-    int ldm;                     // leading dimension
-    layout::Matrix kind;         // Could be layout::Matrix::kRowMajor or layout::Matrix::kColumnMajor
-    
-
-    TensorRef<int, layout::ContiguousMatrix> E(ptr_E, {ldm, kind});
-
-*/
-template <
-  /// Data type of element stored within tensor (concept: NumericType)
-  typename Element_,
-  /// Defines a mapping from logical coordinate to linear memory (concept: Layout)
-  typename Layout_
->
-class TensorRef {
- public:
-  /// Data type of individual access
-  using Element = Element_;
-
-  /// Mapping function from logical coordinate to linear memory
-  using Layout = Layout_;
-
-  /// Reference type to an element
-  using Reference = typename platform::conditional<
-    sizeof_bits<Element>::value >= 8,
-    Element &,
-    SubbyteReference<Element>
-    >::type;
-
-  /// Logical rank of tensor index space
-  static int const kRank = Layout::kRank;
-
-  /// Index type
-  using Index = typename Layout::Index;
-
-  /// Long index used for pointer offsets
-  using LongIndex = typename Layout::LongIndex;
-
-  /// Coordinate in logical tensor space
-  using TensorCoord = typename Layout::TensorCoord;
-
-  /// Layout's stride vector
-  using Stride = typename Layout::Stride;
-
-  /// TensorRef to constant data
-  using ConstTensorRef = TensorRef<
-    typename platform::remove_const<Element>::type const,
-    Layout>;
-
-  /// TensorRef to non-constant data
-  using NonConstTensorRef = TensorRef<
-    typename platform::remove_const<Element>::type,
-    Layout>;
-
-  /// Require at least rank=1. Mathematically, a rank=0 tensor would be considered to be a
-  /// scalar, but degenerate cases such as these are difficult to accommodate without
-  /// extensive C++ metaprogramming or support for zero-length arrays.
-  static_assert(kRank > 0, "Cannot define a zero-rank TensorRef");
-
- private:
-
-  /// Pointer
-  Element* ptr_;
-
-  /// Layout object maps logical coordinates to linear offsets
-  Layout layout_;
-
- public:
-
-  //
-  // Methods
-  //
-
-  /// Constructs a TensorRef with a pointer and layout object.
-  CUTLASS_HOST_DEVICE
-  TensorRef(): ptr_(nullptr) {
-  
-  }
-
-  /// Constructs a TensorRef with a pointer and layout object.
-  CUTLASS_HOST_DEVICE
-  TensorRef(
-    Element *ptr,                   ///< pointer to start of tensor
-    Layout const &layout            ///< layout object containing stride and mapping function
-  ):
-    ptr_(ptr), layout_(layout) {
-  
-  }
-
-  /// Converting constructor from TensorRef to non-constant data.
-  template<typename _Magic = int>
-  CUTLASS_HOST_DEVICE
-  TensorRef(
-    NonConstTensorRef const &ref,              ///< TensorRef to non-const data
-    ///SFINAE trick to avoid creating a copy-constructor when Element_ is already non-const
-    _Magic magic = (typename platform::enable_if< ! platform::is_same<NonConstTensorRef, TensorRef<Element_, Layout_> >::value, _Magic>::type)0
-  ):
-    ptr_(ref.data()), layout_(ref.layout()) { }
-
-  /// Returns a reference to constant-valued tensor.
-  CUTLASS_HOST_DEVICE
-  ConstTensorRef const_ref() const {
-    return ConstTensorRef(ptr_, layout_);
-  }
-
-  CUTLASS_HOST_DEVICE
-  NonConstTensorRef non_const_ref() const {
-    return NonConstTensorRef(const_cast<typename platform::remove_const<Element>::type *>(ptr_), layout_);
-  }
-
-  /// Updates only the pointer
-  CUTLASS_HOST_DEVICE
-  void reset(Element* ptr = nullptr) {
-    ptr_ = ptr;
-  }
-
-  /// Updates the pointer and layout object
-  CUTLASS_HOST_DEVICE
-  void reset(Element* ptr, Layout const &layout) {
-    ptr_ = ptr;
-    layout_ = layout;
-  }
-
-  /// Returns true if the TensorRef is non-null
-  CUTLASS_HOST_DEVICE
-  bool good() const {
-    return ptr_ != nullptr;
-  }
-
-  /// Returns the pointer to referenced data
-  CUTLASS_HOST_DEVICE
-  Element * data() const { return ptr_; }
-
-  /// Returns a reference to the element at a given linear index
-  CUTLASS_HOST_DEVICE
-  Reference data(LongIndex idx) const {
-    return ReferenceFactory<typename platform::remove_const<Element>::type,
-                            (sizeof_bits<Element>::value < 8)>::get(ptr_, idx);
-  }
-
-  /// Returns the layout object
-  CUTLASS_HOST_DEVICE
-  Layout & layout() {
-    return layout_;
-  }
-
-  /// Returns the layout object
-  CUTLASS_HOST_DEVICE
-  Layout layout() const {
-    return layout_;
-  }
-
-  /// Returns the layout object's stride vector
-  CUTLASS_HOST_DEVICE
-  Stride stride() const {
-    return layout_.stride();
-  }
-
-  /// Returns the layout object's stride vector
-  CUTLASS_HOST_DEVICE
-  Stride & stride() {
-    return layout_.stride();
-  }
-
-  /// Returns the layout object's stride in a given physical dimension
-  CUTLASS_HOST_DEVICE
-  typename Layout::Stride::Index stride(int dim) const {
-    return layout_.stride().at(dim);
-  }
-
-  /// Returns the layout object's stride in a given physical dimension
-  CUTLASS_HOST_DEVICE
-  typename Layout::Stride::Index & stride(int dim) {
-    return layout_.stride().at(dim);
-  }
-
-  /// Computes the offset of an index from the origin of the tensor
-  CUTLASS_HOST_DEVICE
-  LongIndex offset(TensorCoord const& coord) const {
-    return layout_(coord);
-  }
-
-  /// Returns a reference to the element at a given Coord
-  CUTLASS_HOST_DEVICE
-  Reference at(TensorCoord const& coord) const {
-    return data(offset(coord));
-  }
-
-  /// Returns a reference to the element at a given Coord
-  CUTLASS_HOST_DEVICE
-  Reference operator[](TensorCoord const& coord) const {
-    return data(offset(coord));
-  }
-
-  /// Adds an offset to each pointer
-  CUTLASS_HOST_DEVICE
-  TensorRef & add_pointer_offset(LongIndex offset_) {
-    ptr_ = ReferenceFactory<typename platform::remove_const<Element>::type,
-           (sizeof_bits<Element>::value < 8)>::add_pointer_offset(ptr_, offset_);
-    return *this;
-  }
-
-  /// Adds an offset to each pointer
-  CUTLASS_HOST_DEVICE
-  TensorRef & add_coord_offset(TensorCoord const &coord) {
-    add_pointer_offset(offset(coord));
-    return *this;
-  }
-
-  /// Returns a TensorRef offset by a given amount
-  CUTLASS_HOST_DEVICE
-  TensorRef operator+(TensorCoord const& b) const {
-    TensorRef result(*this);
-    result.add_coord_offset(b);
-    return result;
-  }
-
-  /// Returns a TensorRef offset by a given amount
-  CUTLASS_HOST_DEVICE
-  TensorRef & operator+=(TensorCoord const& b) {
-    add_coord_offset(b);
-    return *this;
-  }
-
-  /// Returns a TensorRef offset by a given amount
-  CUTLASS_HOST_DEVICE
-  TensorRef operator-(TensorCoord const& b) const {
-    TensorRef result(*this);
-    result.add_pointer_offset(-offset(b));
-    return result;
-  }
-
-  /// Returns a TensorRef offset by a given amount
-  CUTLASS_HOST_DEVICE
-  TensorRef & operator-=(TensorCoord const& b) {
-    add_pointer_offset(-offset(b));
-    return *this;
-  }
-};
-
-/// Constructs a TensorRef, deducing types from arguments.
-template <
-  typename Element,
-  typename Layout
->
-CUTLASS_HOST_DEVICE
-TensorRef<Element, Layout> make_TensorRef(Element *ptr, Layout const &layout) {
-  return TensorRef<Element, Layout>(ptr, layout);
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Partial specializations to handle degenerate and sub-byte cases.
-//
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Element,
-  typename Layout
->
-CUTLASS_HOST_DEVICE
-bool TensorRef_aligned(TensorRef<Element, Layout> const &ref, int alignment) {
-
-  int const kStrideRank = Layout::kStrideRank;
-
-  if (reinterpret_cast<uintptr_t>(ref.data()) % alignment) {
-    return false;
-  }
-
-  CUTLASS_PRAGMA_UNROLL
-  for (int i = 0; i < kStrideRank; ++i) {
-    if (ref.stride(i) % alignment) {
-      return false;
-    }
-  }
-
-  return true;
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/tensor_ref_planar_complex.h b/lightllm-kernel/cutlass/include/cutlass/tensor_ref_planar_complex.h
deleted file mode 100755
index ab354bbaf..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/tensor_ref_planar_complex.h
+++ /dev/null
@@ -1,374 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines a structure containing strides, bounds, and a pointer to tensor data.
-*/
-#pragma once
-
-#include <cstdint>
-#include "cutlass/cutlass.h"
-#include "cutlass/complex.h"
-#include "cutlass/tensor_ref.h"
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename Element_>
-struct PlanarComplexReference {
-
-  //
-  // Type definitions
-  //
-
-  using Element = Element_;
-  using ComplexElement = complex<Element>;
-
-  //
-  // Data members
-  //
-
-  Element *real;
-  Element *imag;
-
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  PlanarComplexReference(
-    Element *real_ = nullptr, 
-    Element *imag_ = nullptr
-  ):
-    real(real_), imag(imag_) { }
-
-  /// Loads the complex element
-  CUTLASS_HOST_DEVICE
-  operator complex<Element>() const {
-    return complex<Element>{*real, *imag};
-  }
-
-  /// Stores a complex element to the location pointed to by the reference 
-  CUTLASS_HOST_DEVICE
-  PlanarComplexReference &operator=(complex<Element> const &rhs) {
-    *real = rhs.real();
-    *imag = rhs.imag();
-    return *this;
-  }
-};
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-/* \brief TensorRef is a template for objects pointing to the start of tensors of arbitrary rank
-          and layout within memory. A TensorRef combines a pointer and a Layout concept
-
-*/
-template <
-  /// Data type of element stored within tensor (concept: NumericType)
-  typename Element_,
-  /// Defines a mapping from logical coordinate to linear memory (concept: Layout)
-  typename Layout_
->
-class TensorRefPlanarComplex {
- public:
-  /// Data type of individual access
-  using Element = Element_;
-
-  /// Complex element type
-  using ComplexElement = complex<Element>;
-
-  /// Mapping function from logical coordinate to linear memory
-  using Layout = Layout_;
-
-  static_assert(sizeof_bits<Element>::value >= 8,
-    "Planar complex not suitable for subbyte elements at this time");
-
-  /// Reference type to an element
-  using Reference = PlanarComplexReference<Element>;
-
-  /// Logical rank of tensor index space
-  static int const kRank = Layout::kRank;
-
-  /// Index type
-  using Index = typename Layout::Index;
-
-  /// Long index used for pointer offsets
-  using LongIndex = typename Layout::LongIndex;
-
-  /// Coordinate in logical tensor space
-  using TensorCoord = typename Layout::TensorCoord;
-
-  /// Layout's stride vector
-  using Stride = typename Layout::Stride;
-
-  /// TensorRef to constant data
-  using ConstTensorRef = TensorRefPlanarComplex<
-    typename platform::remove_const<Element>::type const,
-    Layout>;
-
-  /// TensorRef to non-constant data
-  using NonConstTensorRef = TensorRefPlanarComplex<
-    typename platform::remove_const<Element>::type,
-    Layout>;
-
-  /// Require at least rank=1. Mathematically, a rank=0 tensor would be considered to be a
-  /// scalar, but degenerate cases such as these are difficult to accommodate without
-  /// extensive C++ metaprogramming or support for zero-length arrays.
-  static_assert(kRank > 0, "Cannot define a zero-rank TensorRef");
-
- private:
-
-  /// Pointer
-  Element* ptr_;
-
-  /// Layout object maps logical coordinates to linear offsets
-  Layout layout_;
-
-  /// Offset to imaginary part
-  LongIndex imaginary_stride_;
-
- public:
-
-  //
-  // Methods
-  //
-
-  /// Constructs a TensorRef with a pointer and layout object.
-  CUTLASS_HOST_DEVICE
-  TensorRefPlanarComplex(
-    Element *ptr = nullptr,                   ///< pointer to start of tensor
-    Layout const &layout = Layout(),          ///< layout object containing stride and mapping function
-    LongIndex imaginary_stride = 0
-  ):
-    ptr_(ptr), layout_(layout), imaginary_stride_(imaginary_stride) {
-  
-  }
-
-  /// Converting constructor from TensorRef to non-constant data.
-  CUTLASS_HOST_DEVICE
-  TensorRefPlanarComplex(
-    NonConstTensorRef const &ref              ///< TensorRef to non-const data
-  ):
-    ptr_(ref.data()), layout_(ref.layout()), imaginary_stride_(ref.imaginary_stride_) { }
-
-  /// Returns a reference to constant-valued tensor.
-  CUTLASS_HOST_DEVICE
-  ConstTensorRef const_ref() const {
-    return ConstTensorRef(ptr_, layout_, imaginary_stride_);
-  }
-
-  CUTLASS_HOST_DEVICE
-  NonConstTensorRef non_const_ref() const {
-    return NonConstTensorRef(
-      const_cast<typename platform::remove_const<Element>::type *>(ptr_), 
-      layout_, 
-      imaginary_stride_);
-  }
-
-  /// Updates only the pointer
-  CUTLASS_HOST_DEVICE
-  void reset(Element* ptr = nullptr, LongIndex imaginary_stride = 0) {
-    ptr_ = ptr;
-    imaginary_stride_ = imaginary_stride;
-  }
-
-  /// Updates the pointer and layout object
-  CUTLASS_HOST_DEVICE
-  void reset(Element* ptr, Layout const &layout, LongIndex imaginary_stride) {
-    ptr_ = ptr;
-    layout_ = layout;
-    imaginary_stride_ = imaginary_stride;
-  }
-
-  /// Returns true if the TensorRef is non-null
-  CUTLASS_HOST_DEVICE
-  bool good() const {
-    return ptr_ != nullptr;
-  }
-
-  /// Returns the pointer to referenced data
-  CUTLASS_HOST_DEVICE
-  Element * data() const { return ptr_; }
-
-  /// Returns the pointer to referenced data
-  CUTLASS_HOST_DEVICE
-  Element * imaginary_data() const { return ptr_ + imaginary_stride_; }
-
-  /// Returns a reference to the element at a given linear index
-  CUTLASS_HOST_DEVICE
-  Reference data(LongIndex idx) const {
-    return Reference(ptr_ + idx, ptr_ + idx + imaginary_stride_);
-  }
-
-  /// Returns the layout object
-  CUTLASS_HOST_DEVICE
-  Layout & layout() {
-    return layout_;
-  }
-
-  /// Returns the layout object
-  CUTLASS_HOST_DEVICE
-  Layout layout() const {
-    return layout_;
-  }
-
-  /// Gets the stride to an imaginary element
-  LongIndex imaginary_stride() const {
-    return imaginary_stride_;
-  }
-
-  /// Gets the stride to an imaginary element
-  LongIndex &imaginary_stride() {
-    return imaginary_stride_;
-  }
-
-  /// Returns the layout object's stride vector
-  CUTLASS_HOST_DEVICE
-  Stride stride() const {
-    return layout_.stride();
-  }
-
-  /// Returns the layout object's stride vector
-  CUTLASS_HOST_DEVICE
-  Stride & stride() {
-    return layout_.stride();
-  }
-
-  /// Returns the layout object's stride in a given physical dimension
-  CUTLASS_HOST_DEVICE
-  Index stride(int dim) const {
-    return layout_.stride().at(dim);
-  }
-
-  /// Returns the layout object's stride in a given physical dimension
-  CUTLASS_HOST_DEVICE
-  Index & stride(int dim) {
-    return layout_.stride().at(dim);
-  }
-
-  /// Computes the offset of an index from the origin of the tensor
-  CUTLASS_HOST_DEVICE
-  LongIndex offset(TensorCoord const& coord) const {
-    return layout_(coord);
-  }
-
-  /// Returns a reference to the element at a given Coord
-  CUTLASS_HOST_DEVICE
-  Reference at(TensorCoord const& coord) const {
-    return data(offset(coord));
-  }
-
-  /// Returns a reference to the element at a given Coord
-  CUTLASS_HOST_DEVICE
-  Reference operator[](TensorCoord const& coord) const {
-    return data(offset(coord));
-  }
-
-  /// Adds an offset to each pointer
-  CUTLASS_HOST_DEVICE
-  TensorRefPlanarComplex & add_pointer_offset(LongIndex offset_) {
-    ptr_ += offset_;
-    return *this;
-  }
-
-  /// Adds an offset to each pointer
-  CUTLASS_HOST_DEVICE
-  TensorRefPlanarComplex & add_coord_offset(TensorCoord const &coord) {
-    add_pointer_offset(offset(coord));
-    return *this;
-  }
-
-  /// Returns a TensorRef offset by a given amount
-  CUTLASS_HOST_DEVICE
-  TensorRefPlanarComplex operator+(TensorCoord const& b) const {
-    TensorRefPlanarComplex result(*this);
-    result.add_coord_offset(b);
-    return result;
-  }
-
-  /// Returns a TensorRef offset by a given amount
-  CUTLASS_HOST_DEVICE
-  TensorRefPlanarComplex & operator+=(TensorCoord const& b) {
-    add_coord_offset(b);
-    return *this;
-  }
-
-  /// Returns a TensorRef offset by a given amount
-  CUTLASS_HOST_DEVICE
-  TensorRefPlanarComplex operator-(TensorCoord const& b) const {
-    TensorRefPlanarComplex result(*this);
-    result.add_pointer_offset(-offset(b));
-    return result;
-  }
-
-  /// Returns a TensorRef offset by a given amount
-  CUTLASS_HOST_DEVICE
-  TensorRefPlanarComplex & operator-=(TensorCoord const& b) {
-    add_pointer_offset(-offset(b));
-    return *this;
-  }
-
-  /// TensorRef to real-valued tensor
-  CUTLASS_HOST_DEVICE
-  cutlass::TensorRef<Element, Layout> ref_real() const {
-    return cutlass::TensorRef<Element, Layout>(data(), layout());
-  }
-
-  /// TensorRef to real-valued tensor
-  CUTLASS_HOST_DEVICE
-  cutlass::TensorRef<Element, Layout> ref_imag() const {
-    return cutlass::TensorRef<Element, Layout>(imaginary_data(), layout());
-  }
-};
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Constructs a TensorRef, deducing types from arguments.
-template <
-  typename Element,
-  typename Layout
->
-CUTLASS_HOST_DEVICE
-TensorRefPlanarComplex<Element, Layout> make_TensorRefPlanarComplex(
-  Element *ptr, 
-  Layout const &layout, 
-  int64_t imaginary_stride) {
-
-  return TensorRefPlanarComplex<Element, Layout>(ptr, layout, imaginary_stride);
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/tensor_view.h b/lightllm-kernel/cutlass/include/cutlass/tensor_view.h
deleted file mode 100755
index 7defcc24f..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/tensor_view.h
+++ /dev/null
@@ -1,297 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines a structure containing strides and a pointer to tensor data.
-
-    TensorView is derived from TensorRef and contributes bounds to the tensor's index space. Thus,
-    it is a complete mathematical object and may be used in tensor algorithms. It is decoupled from
-    data storage and is therefore lightweight and may be embedded in larger tensor objects or
-    memory structures.
-
-    See cutlass/tensor_ref.h for more details about the mapping of the logical tensor index space to
-    linear memory.
-*/
-
-#pragma once
-
-#if !defined(__CUDACC_RTC__)
-#include <cmath>
-#endif
-
-#include "cutlass/cutlass.h"
-#include "cutlass/tensor_ref.h"
-
-namespace cutlass {
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  /// Data type of element stored within tensor
-  typename Element_,
-  /// Maps a Coord<Rank_> in the logical tensor index space to the internal n-D array
-  typename Layout_
->
-class TensorView : public TensorRef<Element_, Layout_> {
- public:
-
-  /// Base tensor reference
-  using Base = cutlass::TensorRef<Element_, Layout_>;
-
-  /// Mapping function from logical coordinate to internal n-D array
-  using Layout = Layout_;
-
-  /// TensorRef pointing to constant memory
-  using ConstTensorRef = typename Base::ConstTensorRef;
-
-  /// Underlying TensorRef type
-  using TensorRef = Base;
-
-  /// Data type of individual access
-  using Element = Element_;
-
-  /// Reference type to an element
-  using Reference = Element &;
-
-  /// Logical rank of tensor index space
-  static int const kRank = Layout::kRank;
-
-  /// Index type
-  using Index = typename Layout::Index;
-
-  /// Long index used for pointer offsets
-  using LongIndex = typename Layout::LongIndex;
-
-  /// Coordinate in logical tensor space
-  using TensorCoord = typename Layout::TensorCoord;
-
-  /// Coordinate in storage n-D array
-  using Stride = typename Layout::Stride;
-
-  /// TensorView pointing to constant memory
-  using ConstTensorView = TensorView<
-    typename platform::remove_const<Element>::type const,
-    Layout>;
-
-  /// TensorView pointing to non-constant memory
-  using NonConstTensorView = TensorView<
-    typename platform::remove_const<Element>::type,
-    Layout>;
-
-  /// Require at least rank=1. Mathematically, a rank=0 tensor would be considered to be a
-  /// scalar, but degenerate cases such as these are difficult to accommodate without
-  /// extensive C++ metaprogramming or support for zero-length arrays.
-  static_assert(kRank > 0, "Cannot define a zero-rank TensorRef");
-
- private:
-
-  /// View extent
-  TensorCoord extent_;
-
- public:
-
-  //
-  // Methods
-  //
-
-  /// Constructs a TensorView object
-  CUTLASS_HOST_DEVICE
-  TensorView() { }
-
-  /// Constructs a TensorView object
-  CUTLASS_HOST_DEVICE
-  TensorView(
-    Element *ptr,                         ///< pointer to start of tensor
-    Layout const &layout,                 ///< layout object containing stride and mapping function
-    TensorCoord const &extent             ///< size of the view in logical coordinates
-  ):
-    Base(ptr, layout), extent_(extent) {
-  
-  }
-
-  /// Constructs a TensorView object
-  CUTLASS_HOST_DEVICE
-  TensorView(
-    TensorRef const &ref,                 ///< pointer and layout object referencing a tensor
-    TensorCoord const &extent             ///< logical size of tensor
-  ):
-    Base(ref), extent_(extent) {
-  
-  }
-
-  /// Converting constructor from TensorRef to non-constant data.
-  CUTLASS_HOST_DEVICE
-  TensorView(
-    NonConstTensorView const &view        ///< TensorView to non-const data
-  ):
-    Base(view), extent_(view.extent_) { }
-
-  /// Updates the pointer and layout object
-  CUTLASS_HOST_DEVICE
-  void reset(Element* ptr, Layout const &layout, TensorCoord const &extent) {
-    Base::reset(ptr, layout);
-    this->resize(extent);
-  }
-
-  /// Updates the pointer
-  CUTLASS_HOST_DEVICE
-  void reset(Element* ptr) {
-    Base::reset(ptr);
-  }
-
-  /// Changes the size of the view without affecting pointer or layout
-  CUTLASS_HOST_DEVICE
-  void resize(TensorCoord const &extent) {
-    this->extent_ = extent;
-  }
-
-  /// Returns the extent of the view (the size along each logical dimension).
-  CUTLASS_HOST_DEVICE
-  TensorCoord const& extent() const { return extent_; }
-
-  /// Returns the extent along a particular logical dimension.
-  CUTLASS_HOST_DEVICE
-  Index extent(int dim) const { return extent_.at(dim); }
-
-  /// Returns the number of logical elements
-  CUTLASS_HOST_DEVICE
-  LongIndex size() const {
-    return extent_.product();
-  }
-
-  /// Determines whether a location is within a tensor
-  CUTLASS_HOST_DEVICE
-  bool contains(TensorCoord const& coord) const {
-    CUTLASS_PRAGMA_UNROLL
-    for (int dim = 0; dim < kRank; ++dim) {
-      if (!(coord[dim] >= 0 && coord[dim] < extent(dim))) {
-        return false;
-      }
-    }
-    return true;
-  }
-
-  /// Returns a TensorRef pointing to the first element of the tensor.
-  CUTLASS_HOST_DEVICE
-  TensorRef ref() const {
-    return TensorRef(this->data(), this->layout());
-  }
-
-  /// Returns a TensorRef pointing to the first element of the tensor.
-  CUTLASS_HOST_DEVICE
-  ConstTensorRef const_ref() const {
-    return ConstTensorRef(this->data(), this->layout());
-  }
-
-  /// Returns a TensorView to const data
-  CUTLASS_HOST_DEVICE
-  ConstTensorView const_view() const {
-    return ConstTensorView(const_ref(), extent_);
-  }
-
-  /// Returns a Tensor_view given location and size quantities
-  CUTLASS_HOST_DEVICE
-  TensorView subview(
-    TensorCoord extent,                               ///< extent of the resulting view
-    TensorCoord const& location = TensorCoord()       ///< resulting view's origin within the old view
-  ) const {
-
-    TensorView result(this->ref(), extent.clamp(extent_ - location));
-    result.add_coord_offset(location);
-    return result;
-  }
-
-  /// Returns the number of scalar elements needed to store tensor.
-  CUTLASS_HOST_DEVICE
-  size_t capacity() const {
-    return Base::layout().capacity(extent_);
-  }
-
-  /// Returns a TensorView offset by a given amount
-  CUTLASS_HOST_DEVICE
-  TensorView operator+(
-    TensorCoord const& b            ///< offset in the logical coordinate space of the tensor
-  ) const {
-
-    TensorView result(*this);
-    result.add_pointer_offset(this->offset(b));
-    return result;
-  }
-
-  /// Returns a TensorRef offset by a given amount
-  CUTLASS_HOST_DEVICE
-  TensorView& operator+=(
-    TensorCoord const& b            ///< offset in the logical coordinate space of the tensor
-  ) {
-
-    this->add_pointer_offset(this->offset(b));
-    return *this;
-  }
-
-  /// Returns a TensorRef offset by a given amount
-  CUTLASS_HOST_DEVICE
-  TensorView operator-(
-    TensorCoord const& b            ///< offset in the logical coordinate space of the tensor
-  ) const {
-
-    TensorRef result(*this);
-    result.add_pointer_offset(-this->offset(b));
-    return result;
-  }
-
-  /// Returns a TensorRef offset by a given amount
-  CUTLASS_HOST_DEVICE
-  TensorView& operator-=(
-    TensorCoord const& b            ///< offset in the logical coordinate space of the tensor
-  ) {
-
-    this->add_pointer_offset(-this->offset(b));
-    return *this;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Constructs a TensorRef, deducing types from arguments.
-template <
-  typename Element,
-  typename Layout
->
-CUTLASS_HOST_DEVICE TensorView<Element, Layout> make_TensorView(
-  Element *ptr, 
-  Layout const &layout,
-  typename Layout::TensorCoord const &extent) {
-
-  return TensorView<Element, Layout>(ptr, layout, extent);
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/tensor_view_planar_complex.h b/lightllm-kernel/cutlass/include/cutlass/tensor_view_planar_complex.h
deleted file mode 100755
index c98de563f..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/tensor_view_planar_complex.h
+++ /dev/null
@@ -1,301 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines a structure containing strides and a pointer to tensor data.
-
-    TensorView is derived from TensorRef and contributes bounds to the tensor's index space. Thus,
-    it is a complete mathematical object and may be used in tensor algorithms. It is decoupled from
-    data storage and is therefore lightweight and may be embedded in larger tensor objects or
-    memory structures.
-
-    See cutlass/tensor_ref.h for more details about the mapping of the logical tensor index space to
-    linear memory.
-*/
-
-#pragma once
-
-#if !defined(__CUDACC_RTC__)
-#include <cmath>
-#endif
-
-#include "cutlass/cutlass.h"
-#include "cutlass/tensor_ref_planar_complex.h"
-
-namespace cutlass {
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-  /// Data type of element stored within tensor
-  typename Element_,
-  /// Maps a Coord<Rank_> in the logical tensor index space to the internal n-D array
-  typename Layout_
->
-class TensorViewPlanarComplex : public TensorRefPlanarComplex<Element_, Layout_> {
- public:
-
-  /// Base tensor reference
-  using Base = cutlass::TensorRefPlanarComplex<Element_, Layout_>;
-
-  /// Mapping function from logical coordinate to internal n-D array
-  using Layout = Layout_;
-
-  /// TensorRef pointing to constant memory
-  using ConstTensorRef = typename Base::ConstTensorRef;
-
-  /// Underlying TensorRef type
-  using TensorRef = Base;
-
-  /// Data type of individual access
-  using Element = Element_;
-
-  /// Reference type to an element
-  using Reference = Element &;
-
-  /// Logical rank of tensor index space
-  static int const kRank = Layout::kRank;
-
-  /// Index type
-  using Index = typename Layout::Index;
-
-  /// Long index used for pointer offsets
-  using LongIndex = typename Layout::LongIndex;
-
-  /// Coordinate in logical tensor space
-  using TensorCoord = typename Layout::TensorCoord;
-
-  /// Coordinate in storage n-D array
-  using Stride = typename Layout::Stride;
-
-  /// TensorView pointing to constant memory
-  using ConstTensorView = TensorViewPlanarComplex<
-    typename platform::remove_const<Element>::type const,
-    Layout>;
-
-  /// TensorView pointing to non-constant memory
-  using NonConstTensorView = TensorViewPlanarComplex<
-    typename platform::remove_const<Element>::type,
-    Layout>;
-
-  /// Require at least rank=1. Mathematically, a rank=0 tensor would be considered to be a
-  /// scalar, but degenerate cases such as these are difficult to accommodate without
-  /// extensive C++ metaprogramming or support for zero-length arrays.
-  static_assert(kRank > 0, "Cannot define a zero-rank TensorRef");
-
- private:
-
-  /// View extent
-  TensorCoord extent_;
-
- public:
-
-  //
-  // Methods
-  //
-
-  /// Constructs a TensorView object
-  CUTLASS_HOST_DEVICE
-  TensorViewPlanarComplex(TensorCoord const &extent = TensorCoord()): extent_(extent) {
-
-  }
-
-  /// Constructs a TensorView object
-  CUTLASS_HOST_DEVICE
-  TensorViewPlanarComplex(
-    Element *ptr,                         ///< pointer to start of tensor
-    Layout const &layout,                 ///< layout object containing stride and mapping function
-    LongIndex imaginary_stride,           ///< stride between real and imaginary part
-    TensorCoord const &extent             ///< size of the view in logical coordinates
-  ):
-    Base(ptr, layout, imaginary_stride), extent_(extent) {
-  
-  }
-
-  /// Constructs a TensorView object
-  CUTLASS_HOST_DEVICE
-  TensorViewPlanarComplex(
-    TensorRef const &ref,                 ///< pointer and layout object referencing a tensor
-    TensorCoord const &extent             ///< logical size of tensor
-  ):
-    Base(ref), extent_(extent) {
-  
-  }
-
-  /// Converting constructor from TensorRef to non-constant data.
-  CUTLASS_HOST_DEVICE
-  TensorViewPlanarComplex(
-    NonConstTensorView const &view        ///< TensorView to non-const data
-  ):
-    Base(view), extent_(view.extent_) { }
-
-  /// Updates the pointer and layout object
-  CUTLASS_HOST_DEVICE
-  void reset(Element* ptr, Layout const &layout, LongIndex imaginary_stride, TensorCoord size) {
-    Base::reset(ptr, layout, imaginary_stride);
-    this->resize(extent_);
-  }
-
-  /// Changes the size of the view without affecting pointer or layout
-  CUTLASS_HOST_DEVICE
-  void resize(TensorCoord extent) {
-    this->extent_ = extent;
-  }
-
-  /// Returns the extent of the view (the size along each logical dimension).
-  CUTLASS_HOST_DEVICE
-  TensorCoord const& extent() const { return extent_; }
-
-  /// Returns the extent along a particular logical dimension.
-  CUTLASS_HOST_DEVICE
-  Index extent(int dim) const { return extent_.at(dim); }
-
-  /// Determines whether a location is within a tensor
-  CUTLASS_HOST_DEVICE
-  bool contains(TensorCoord const& coord) const {
-    CUTLASS_PRAGMA_UNROLL
-    for (int dim = 0; dim < kRank; ++dim) {
-      if (!(coord[dim] >= 0 && coord[dim] < extent(dim))) {
-        return false;
-      }
-    }
-    return true;
-  }
-
-  /// Returns a TensorRef pointing to the first element of the tensor.
-  CUTLASS_HOST_DEVICE
-  Base ref() const {
-    return Base(this->data(), this->layout(), this->imaginary_stride());
-  }
-
-  /// Returns a TensorRef pointing to the first element of the tensor.
-  CUTLASS_HOST_DEVICE
-  ConstTensorRef const_ref() const {
-    return ConstTensorRef(this->data(), this->layout());
-  }
-
-  /// Returns a TensorView to const data
-  CUTLASS_HOST_DEVICE
-  ConstTensorView const_view() const {
-    return ConstTensorView(const_ref(), extent_);
-  }
-
-  /// Returns a Tensor_view given location and size quantities
-  CUTLASS_HOST_DEVICE
-  TensorViewPlanarComplex subview(
-    TensorCoord extent,                               ///< extent of the resulting view
-    TensorCoord const& location = TensorCoord()       ///< resulting view's origin within the old view
-  ) const {
-
-    TensorViewPlanarComplex result(this->ref(), extent.clamp(extent_ - location));
-    result.add_coord_offset(location);
-    return result; 
-  }
-
-  /// Returns the number of scalar elements needed to store tensor.
-  CUTLASS_HOST_DEVICE
-  size_t capacity() const {
-    return Base::layout().capacity(extent_);
-  }
-
-  /// Returns a TensorView offset by a given amount
-  CUTLASS_HOST_DEVICE
-  TensorViewPlanarComplex operator+(
-    TensorCoord const& b            ///< offset in the logical coordinate space of the tensor
-  ) const {
-
-    TensorViewPlanarComplex result(*this);
-    result.add_pointer_offset(this->offset(b));
-    return result;
-  }
-
-  /// Returns a TensorRef offset by a given amount
-  CUTLASS_HOST_DEVICE
-  TensorViewPlanarComplex& operator+=(
-    TensorCoord const& b            ///< offset in the logical coordinate space of the tensor
-  ) {
-
-    this->add_pointer_offset(this->offset(b));
-    return *this;
-  }
-
-  /// Returns a TensorRef offset by a given amount
-  CUTLASS_HOST_DEVICE
-  TensorViewPlanarComplex operator-(
-    TensorCoord const& b            ///< offset in the logical coordinate space of the tensor
-  ) const {
-
-    TensorRef result(*this);
-    result.add_pointer_offset(-this->offset(b));
-    return result;
-  }
-
-  /// Returns a TensorRef offset by a given amount
-  CUTLASS_HOST_DEVICE
-  TensorViewPlanarComplex& operator-=(
-    TensorCoord const& b            ///< offset in the logical coordinate space of the tensor
-  ) {
-
-    this->add_pointer_offset(-this->offset(b));
-    return *this;
-  }
-
-  /// TensorRef to real-valued tensor
-  CUTLASS_HOST_DEVICE
-  cutlass::TensorView<Element, Layout> view_real() const {
-    return cutlass::TensorView<Element, Layout>(this->data(), this->layout(), extent_);
-  }
-
-  /// TensorRef to real-valued tensor
-  CUTLASS_HOST_DEVICE
-  cutlass::TensorView<Element, Layout> view_imag() const {
-    return cutlass::TensorView<Element, Layout>(this->imaginary_data(), this->layout(), extent_);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Constructs a TensorRef, deducing types from arguments.
-template <
-  typename Element,
-  typename Layout
->
-CUTLASS_HOST_DEVICE TensorViewPlanarComplex<Element, Layout> make_TensorViewPlanarComplex(
-  Element *ptr, 
-  Layout const &layout,
-  typename Layout::LongIndex imaginary_stride,
-  typename Layout::TensorCoord const &extent) {
-
-  return TensorViewPlanarComplex<Element, Layout>(ptr, layout, imaginary_stride, extent);
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/tfloat32.h b/lightllm-kernel/cutlass/include/cutlass/tfloat32.h
deleted file mode 100755
index 8e7ab884c..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/tfloat32.h
+++ /dev/null
@@ -1,478 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*!
-    \file
-    \brief Defines a proxy class for storing Tensor Float 32 data type.
-*/
-#pragma once
-
-#if defined(__CUDACC_RTC__)
-#include "cutlass/floating_point_nvrtc.h"
-#else
-#include <cmath>
-#include <limits>
-#include <cstdint>
-#endif
-
-#include "cutlass/cutlass.h"
-
-namespace cutlass {
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Tensor Float 32 data type
-struct alignas(4) tfloat32_t {
-
-  //
-  // Data members
-  //
-
-  /// Storage type
-  uint32_t storage;
-
-  //
-  // Methods
-  //
-  private:
-    CUTLASS_HOST_DEVICE
-    static uint32_t float_to_storage(float s) {
-  #if defined(__CUDA_ARCH__)
-      uint32_t result = reinterpret_cast<uint32_t const &>(s);
-  #else
-      uint32_t result;
-      std::memcpy(&result, &s, sizeof(float));
-  #endif
-      return result;
-    }
-
-  public:
-  /// Constructs from an unsigned int
-  CUTLASS_HOST_DEVICE
-  static tfloat32_t bitcast(uint32_t x) {
-    tfloat32_t h;
-    h.storage = x;
-    return h;
-  }
-
-  /// Emulated rounding is fast in device code
-  CUTLASS_HOST_DEVICE
-  static tfloat32_t round_half_ulp_truncate(float const &s) {
-    uint32_t x = float_to_storage(s);
-
-    #if defined(__CUDA_ARCH__)
-    if (::isfinite(s)) {
-      x += 0x1000u;
-    }
-    #else
-    if (std::isfinite(s)) {
-      x += 0x1000u;
-    }
-    #endif
-
-    return tfloat32_t::bitcast(x);
-  }
-
-  tfloat32_t() = default;
-
-  /// Floating-point conversion - round toward nearest even
-  CUTLASS_HOST_DEVICE
-  explicit tfloat32_t(float x): storage(round_half_ulp_truncate(x).raw()) { }
-
-  // Conversion from double (this rounds twice)
-  CUTLASS_HOST_DEVICE
-  explicit tfloat32_t(double x): tfloat32_t(float(x)) { }
-
-  /// Integer conversion - round toward zero
-  CUTLASS_HOST_DEVICE
-  explicit tfloat32_t(int x) {
-    float flt = static_cast<float>(x);
-    #if defined(__CUDA_ARCH__)
-    storage = reinterpret_cast<uint32_t const &>(flt);
-    #else
-    std::memcpy(&storage, &flt, sizeof(storage));
-    #endif
-  }
-
-  // Conversion to float
-  CUTLASS_HOST_DEVICE
-  operator float() const {
-
-    // Conversions to IEEE single-precision requires clearing dont-care bits
-    // of the mantissa.
-    unsigned bits = (storage & ~0x1fffu);
-
-    #if defined(__CUDA_ARCH__)
-    return reinterpret_cast<float const &>(bits);
-    #else
-    float flt;
-    std::memcpy(&flt, &bits, sizeof(flt));
-    return flt;
-    #endif
-  }
-
-  /// Converts to double
-  CUTLASS_HOST_DEVICE
-  explicit operator double() const {
-    return double(float(*this));
-  }
-
-  /// Converts to int
-  CUTLASS_HOST_DEVICE
-  explicit operator int() const {
-    return int(float(*this));
-  }
-
-  /// Casts to bool
-  CUTLASS_HOST_DEVICE
-  explicit operator bool() const {
-    return (float(*this) != 0.0f);
-  }
-
-  /// Obtains raw bits
-  CUTLASS_HOST_DEVICE
-  uint32_t raw() const {
-    return storage;
-  }
-
-  /// Returns the sign bit
-  CUTLASS_HOST_DEVICE
-  bool signbit() const {
-    return ((raw() & 0x80000000) != 0);
-  }
-
-  /// Returns the biased exponent
-  CUTLASS_HOST_DEVICE
-  int exponent_biased() const {
-    return int((raw() >> 23) & 0x0ff);
-  }
-
-  /// Returns the unbiased exponent
-  CUTLASS_HOST_DEVICE
-  int exponent() const {
-    return exponent_biased() - 127;
-  }
-
-  /// Returns the mantissa
-  CUTLASS_HOST_DEVICE
-  int mantissa() const {
-    return int(raw() & 0x7fffff);
-  }
-};
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-CUTLASS_HOST_DEVICE
-bool signbit(cutlass::tfloat32_t const& h) {
-  return h.signbit();
-}
-
-CUTLASS_HOST_DEVICE
-cutlass::tfloat32_t abs(cutlass::tfloat32_t const& h) {
-  return cutlass::tfloat32_t::bitcast(h.raw() & 0x7fffffff);
-}
-
-CUTLASS_HOST_DEVICE
-bool isnan(cutlass::tfloat32_t const& h) {
-  return (h.exponent_biased() == 0x0ff) && h.mantissa();
-}
-
-CUTLASS_HOST_DEVICE
-bool isfinite(cutlass::tfloat32_t const& h) {
-  return (h.exponent_biased() != 0x0ff);
-}
-
-CUTLASS_HOST_DEVICE
-cutlass::tfloat32_t nan_tf32(const char*) {
-  // NVIDIA canonical NaN
-  return cutlass::tfloat32_t::bitcast(0x7fffffff);
-}
-
-CUTLASS_HOST_DEVICE
-bool isinf(cutlass::tfloat32_t const& h) {
-  return (h.exponent_biased() == 0x0ff) && !h.mantissa();
-}
-
-CUTLASS_HOST_DEVICE
-bool isnormal(cutlass::tfloat32_t const& h) {
-  return h.exponent_biased() && h.exponent_biased() != 0x0ff;
-}
-
-CUTLASS_HOST_DEVICE
-int fpclassify(cutlass::tfloat32_t const& h) {
-  int exp = h.exponent_biased();
-  int mantissa = h.mantissa();
-  if (exp == 0x0ff) {
-    if (mantissa) {
-      return FP_NAN;
-    }
-    else {
-      return FP_INFINITE;
-    }
-  }
-  else if (!exp) {
-    if (mantissa) {
-      return FP_SUBNORMAL;
-    }
-    else {
-      return FP_ZERO;
-    }
-  }
-  return FP_NORMAL;
-}
-
-CUTLASS_HOST_DEVICE
-cutlass::tfloat32_t sqrt(cutlass::tfloat32_t const& h) {
-#if defined(__CUDACC_RTC__)
-  return cutlass::tfloat32_t(sqrtf(float(h)));
-#else
-  return cutlass::tfloat32_t(std::sqrt(float(h)));
-#endif
-}
-
-CUTLASS_HOST_DEVICE
-tfloat32_t copysign(tfloat32_t const& a, tfloat32_t const& b) {
-
-  uint32_t a_mag = (a.raw() & 0x7fffffff);
-  uint32_t b_sign = (b.raw() & 0x80000000);
-  uint32_t result = (a_mag | b_sign);
-
-  return tfloat32_t::bitcast(result);
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Standard Library operations and definitions
-//
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace std {
-
-#if !defined(__CUDACC_RTC__)
-/// Numeric limits
-template <>
-struct numeric_limits<cutlass::tfloat32_t> {
-  static bool const is_specialized = true;
-  static bool const is_signed = true;
-  static bool const is_integer = false;
-  static bool const is_exact = false;
-  static bool const has_infinity = true;
-  static bool const has_quiet_NaN = true;
-  static bool const has_signaling_NaN = false;
-  static std::float_denorm_style const has_denorm = std::denorm_present;
-  static bool const has_denorm_loss = true;
-  static std::float_round_style const round_style = std::round_to_nearest;
-  static bool const is_iec559 = false;
-  static bool const is_bounded = true;
-  static bool const is_modulo = false;
-  static int const digits = 19;
-
-  /// Least positive value
-  static cutlass::tfloat32_t min() { return cutlass::tfloat32_t::bitcast(0x01); }
-
-  /// Minimum finite value
-  static cutlass::tfloat32_t lowest() { return cutlass::tfloat32_t::bitcast(0xff7fffff); }
-
-  /// Maximum finite value
-  static cutlass::tfloat32_t max() { return cutlass::tfloat32_t::bitcast(0x7f7fffff); }
-
-  /// Returns smallest finite value
-  static cutlass::tfloat32_t epsilon() { return cutlass::tfloat32_t::bitcast(0x1000); }
-
-  /// Returns smallest finite value
-  static cutlass::tfloat32_t round_error() { return cutlass::tfloat32_t(0.5f); }
-
-  /// Returns smallest finite value
-  static cutlass::tfloat32_t infinity() { return cutlass::tfloat32_t::bitcast(0x7f800000); }
-
-  /// Returns smallest finite value
-  static cutlass::tfloat32_t quiet_NaN() { return cutlass::tfloat32_t::bitcast(0x7fffffff); }
-
-  /// Returns smallest finite value
-  static cutlass::tfloat32_t signaling_NaN() { return cutlass::tfloat32_t::bitcast(0x7fffffff); }
-
-  /// Returns smallest finite value
-  static cutlass::tfloat32_t denorm_min() { return cutlass::tfloat32_t::bitcast(0x1); }
-};
-#endif
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace std
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Arithmetic operators
-//
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-CUTLASS_HOST_DEVICE
-bool operator==(tfloat32_t const& lhs, tfloat32_t const& rhs) {
-  return float(lhs) == float(rhs);
-}
-
-CUTLASS_HOST_DEVICE
-bool operator!=(tfloat32_t const& lhs, tfloat32_t const& rhs) {
-  return float(lhs) != float(rhs);
-}
-
-CUTLASS_HOST_DEVICE
-bool operator<(tfloat32_t const& lhs, tfloat32_t const& rhs) {
-  return float(lhs) < float(rhs);
-}
-
-CUTLASS_HOST_DEVICE
-bool operator<=(tfloat32_t const& lhs, tfloat32_t const& rhs) {
-  return float(lhs) <= float(rhs);
-}
-
-CUTLASS_HOST_DEVICE
-bool operator>(tfloat32_t const& lhs, tfloat32_t const& rhs) {
-  return float(lhs) > float(rhs);
-}
-
-CUTLASS_HOST_DEVICE
-bool operator>=(tfloat32_t const& lhs, tfloat32_t const& rhs) {
-  return float(lhs) >= float(rhs);
-}
-
-CUTLASS_HOST_DEVICE
-tfloat32_t operator+(tfloat32_t const& lhs, tfloat32_t const& rhs) {
-  return tfloat32_t(float(lhs) + float(rhs));
-}
-
-
-CUTLASS_HOST_DEVICE
-tfloat32_t operator-(tfloat32_t const& lhs) {
-  return tfloat32_t::bitcast(0x80000000 ^ lhs.raw());
-}
-
-CUTLASS_HOST_DEVICE
-tfloat32_t operator-(tfloat32_t const& lhs, tfloat32_t const& rhs) {
-  return tfloat32_t(float(lhs) - float(rhs));
-}
-
-CUTLASS_HOST_DEVICE
-tfloat32_t operator*(tfloat32_t const& lhs, tfloat32_t const& rhs) {
-  return tfloat32_t(float(lhs) * float(rhs));
-}
-
-CUTLASS_HOST_DEVICE
-tfloat32_t operator/(tfloat32_t const& lhs, tfloat32_t const& rhs) {
-  return tfloat32_t(float(lhs) / float(rhs));
-}
-
-CUTLASS_HOST_DEVICE
-tfloat32_t& operator+=(tfloat32_t & lhs, tfloat32_t const& rhs) {
-  lhs = tfloat32_t(float(lhs) + float(rhs));
-  return lhs;
-}
-
-CUTLASS_HOST_DEVICE
-tfloat32_t& operator-=(tfloat32_t & lhs, tfloat32_t const& rhs) {
-  lhs = tfloat32_t(float(lhs) - float(rhs));
-  return lhs;
-}
-
-CUTLASS_HOST_DEVICE
-tfloat32_t& operator*=(tfloat32_t & lhs, tfloat32_t const& rhs) {
-  lhs = tfloat32_t(float(lhs) * float(rhs));
-  return lhs;
-}
-
-CUTLASS_HOST_DEVICE
-tfloat32_t& operator/=(tfloat32_t & lhs, tfloat32_t const& rhs) {
-  lhs = tfloat32_t(float(lhs) / float(rhs));
-  return lhs;
-}
-
-CUTLASS_HOST_DEVICE
-tfloat32_t& operator++(tfloat32_t & lhs) {
-  float tmp(lhs);
-  ++tmp;
-  lhs = tfloat32_t(tmp);
-  return lhs;
-}
-
-CUTLASS_HOST_DEVICE
-tfloat32_t& operator--(tfloat32_t & lhs) {
-  float tmp(lhs);
-  --tmp;
-  lhs = tfloat32_t(tmp);
-  return lhs;
-}
-
-CUTLASS_HOST_DEVICE
-tfloat32_t operator++(tfloat32_t & lhs, int) {
-  tfloat32_t ret(lhs);
-  float tmp(lhs);
-  tmp++;
-  lhs = tfloat32_t(tmp);
-  return ret;
-}
-
-CUTLASS_HOST_DEVICE
-tfloat32_t operator--(tfloat32_t & lhs, int) {
-  tfloat32_t ret(lhs);
-  float tmp(lhs);
-  tmp--;
-  lhs = tfloat32_t(tmp);
-  return ret;
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-//
-// User-defined literals
-//
-
-CUTLASS_HOST_DEVICE
-cutlass::tfloat32_t operator "" _tf32(long double x) {
-  return cutlass::tfloat32_t(float(x));
-}
-
-CUTLASS_HOST_DEVICE
-cutlass::tfloat32_t operator "" _tf32(unsigned long long int x) {
-  return cutlass::tfloat32_t(int(x));
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/thread/matrix.h b/lightllm-kernel/cutlass/include/cutlass/thread/matrix.h
deleted file mode 100755
index f6b4b2b79..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/thread/matrix.h
+++ /dev/null
@@ -1,198 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines a matrix object intended for storing data in registers and operations within
-      a CUDA thread.
-*/
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/matrix_coord.h"
-
-namespace cutlass {
-namespace thread {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Per-thread matrix object storing a packed matrix
-template <
-  typename Element,
-  int Rows,
-  int Columns,
-  typename Layout = layout::RowMajor
->
-class Matrix : public Array<Element, Rows * Columns> {
-public:
-  
-  // Verify layout refers to a rank=2 matrix.
-  static_assert(
-    Layout::kRank == 2,
-    "Layout type must refer to a rank=2 matrix");
-
-  /// Base type
-  using Base = Array<Element, Rows * Columns>;
-
-  /// Element type
-  using Element = Element_;
-
-  /// Number of rows
-  static int const kRows = Rows;
-
-  /// Number of columns
-  static int const kColumns = Columns;
-
-  /// Layout within the array
-  using Layout = Layout_;
-
-  /// Reference type to an element
-  using Reference = Element &;
-
-  /// Logical rank of tensor index space
-  static int const kRank = 2;
-
-  /// Index type
-  using Index = typename Layout::Index;
-
-  /// Long index used for pointer offsets
-  using LongIndex = typename Layout::LongIndex;
-
-  /// Coordinate in logical tensor space
-  using TensorCoord = typename Layout::TensorCoord;
-
-  /// Stride type
-  using Stride = typename Layout::Stride;
-
-  /// TensorRef to matrix object
-  using TensorRef = TensorRef<Element, kRank, Layout>;
-
-  /// TensorRef to constant matrix object
-  using ConstTensorRef = typename TensorRef::ConstTensorRef;
-
-  /// TensorRef to matrix object
-  using TensorView = TensorView<Element, kRank, Layout>;
-
-  /// TensorRef to constant matrix object
-  using ConstTensorView = typename TensorView::ConstTensorView;
-
-  /// Diagonal vector
-  using Diagonal = Vector<Element, __NV_STD_MIN(kRows, kColumns)>;
-
-private:
-
-
-public:
-
-  //
-  // Methods
-  //
-
-  /// Returns the size of the object
-  CUTLASS_HOST_DEVICE
-  static MatrixCoord extent() {
-    return make_Coord(kRows, kColumns);
-  }
-
-  /// Returns the layout object
-  CUTLASS_HOST_DEVICE
-  static Layout layout() {
-    return Layout::packed(extent());
-  }
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  Matrix() { }
-
-  /// Ctor
-  CUTLASS_HOST_DEVICE
-  Matrix(Diagonal const &diag) {
-  }
-
-  /// Returns a TensorRef pointing to the first element of the tensor.
-  CUTLASS_HOST_DEVICE
-  TensorRef ref() {
-    return TensorRef(this->data(), layout());
-  }
-
-  /// Returns a TensorRef pointing to the first element of the tensor.
-  CUTLASS_HOST_DEVICE
-  ConstTensorRef const_ref() const {
-    return ConstTensorRef(this->data(), layout());
-  }
-
-  /// Returns a TensorRef pointing to the first element of the tensor.
-  CUTLASS_HOST_DEVICE
-  TensorView view() {
-    return TensorView(ref(), extent());
-  }
-
-  /// Returns a TensorView to const data
-  CUTLASS_HOST_DEVICE
-  ConstTensorView const_view() const {
-    return ConstTensorView(const_ref(), extent());
-  }
-
-  /// Returns a reference to the element at a given Coord
-  CUTLASS_HOST_DEVICE
-  Reference at(MatrixCoord const& coord) const {
-    typename Base::size_type offset_(layout().offset(coord));
-    return Base::at(offset_);
-  }
-
-  /// Returns the number of scalar elements needed to store tensor.
-  CUTLASS_HOST_DEVICE
-  LongIndex capacity() const {
-    return LongIndex(Base::size());
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Column vector defined as a matrix with exactly one column
-template <
-  typename Element,
-  int Rows,
-  typename Layout = layout::ColumnMajor
->
-using ColumnVector = Matrix<Element, Rows, 1, Layout>;
-
-/// Row vector defined as a matrix with exactly one row
-template <
-  typename Element,
-  int Columns,
-  typename Layout = layout::RowMajor
->
-using RowVector = Matrix<Element, 1, Columns, Layout>;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace thread
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/trace.h b/lightllm-kernel/cutlass/include/cutlass/trace.h
deleted file mode 100755
index 1b0c51126..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/trace.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Helpers for optionally tracing through code when debugging.
-
-    This file is to be included after all other headers.
-*/
-
-#pragma once
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Tracing options
-#ifndef CUTLASS_DEBUG_TRACE_LEVEL
-#define CUTLASS_DEBUG_TRACE_LEVEL 0
-#endif
-
-#if CUTLASS_DEBUG_TRACE_LEVEL
-#include <iostream>
-#include "cutlass/core_io.h"
-#if defined(__CUDA_ARCH__)
-#define CUTLASS_TRACE_HOST(x)
-#else
-#define CUTLASS_TRACE_HOST(x) { std::cout << __FILE__ << ":" << __LINE__ << "  " << x << std::endl; }
-#endif
-#else
-#define CUTLASS_TRACE_HOST(x)
-#endif
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/collective/sm90_wgmma_transpose.hpp b/lightllm-kernel/cutlass/include/cutlass/transform/collective/sm90_wgmma_transpose.hpp
deleted file mode 100755
index 430545e6d..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/transform/collective/sm90_wgmma_transpose.hpp
+++ /dev/null
@@ -1,754 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing how threads are mapped to a given tile.
-*/
-
-#pragma once
-
-#include "cute/arch/mma_sm90_gmma.hpp"
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace transform {
-namespace collective {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace detail {
-using namespace cute;
-
-template <bool Transpose, class SmemLayoutAtom, class ElementType>
-constexpr auto
-gmma_smem_transpose_or_passthrough() {
-  if constexpr (Transpose) {
-    if constexpr (cute::is_same_v<GMMA::Layout_MN_SW128_Atom<ElementType>, SmemLayoutAtom>) {
-      return GMMA::Layout_K_SW128_Atom<ElementType>{};
-    }
-    else if constexpr (cute::is_same_v<GMMA::Layout_MN_SW64_Atom<ElementType>, SmemLayoutAtom>) {
-      return GMMA::Layout_K_SW64_Atom<ElementType>{};
-    }
-    else if constexpr (cute::is_same_v<GMMA::Layout_MN_SW32_Atom<ElementType>, SmemLayoutAtom>) {
-      return GMMA::Layout_K_SW32_Atom<ElementType>{};
-    }
-    else if constexpr (cute::is_same_v<GMMA::Layout_MN_INTER_Atom<ElementType>, SmemLayoutAtom>) {
-      return GMMA::Layout_K_INTER_Atom<ElementType>{};
-    }
-    else {
-      static_assert(cutlass::detail::dependent_false<SmemLayoutAtom>, "Unsupported Layout_SW_Atom for B SMEM transposition");
-    }
-  }
-  else {
-    return SmemLayoutAtom{};
-  }
-}
-
-template <class SmemCopyAtom, class ElementType>
-constexpr auto
-use_universal_transposition() {
-  if constexpr (sizeof(ElementType) == 1) {
-    return !cute::is_same_v<GMMA::Layout_MN_SW128_Atom<ElementType>, SmemCopyAtom>;
-  }
-  else if constexpr (sizeof(ElementType) == 4){
-    // Only universal transposition can handle SW64 and Non swizzle SMEM layout
-    if constexpr (cute::is_same_v<GMMA::Layout_MN_SW64_Atom<ElementType>, SmemCopyAtom> ||
-                  cute::is_same_v<GMMA::Layout_MN_INTER_Atom<ElementType>, SmemCopyAtom>) {
-      return true;
-    }
-    else {
-      return false;
-    }
-  }
-  else {
-    static_assert(cutlass::detail::dependent_false<ElementType>, "Unsupported ElementType for B SMEM transposition");
-  }
-}
-
-template<
-  class TiledMma_,
-  class SmemLayoutB_,
-  class SmemLayoutAtomB_,
-  class ElementB_>
-class NoTranspositionOperandB {
-public:
-  using TiledMma = TiledMma_;
-  using SmemLayoutB = SmemLayoutB_;
-  using SmemLayoutAtomB = SmemLayoutAtomB_;
-  using ElementB = ElementB_;
-
-  constexpr CUTLASS_HOST_DEVICE
-  NoTranspositionOperandB(
-      int,
-      int,
-      TiledMma,
-      SmemLayoutB,
-      SmemLayoutAtomB,
-      ElementB) { }
-
-  template <
-    class TensorSmemB,
-    class TensorTransposedSmemB>
-  CUTLASS_DEVICE void operator()(
-    TensorSmemB const&,
-    TensorTransposedSmemB const&,
-    int, int) { }
-
-  CUTLASS_DEVICE void synchronize(int) { }
-
-  CUTLASS_DEVICE void synchronize() { }
-
-  template <
-    class TensorSmemB,
-    class TensorTransposedSmemB>
-  CUTLASS_DEVICE void transpose(
-    TensorSmemB const&,
-    TensorTransposedSmemB const&,
-    int) { }
-};
-
-template<
-  class TiledMma_,
-  class SmemLayoutB_,
-  class SmemLayoutAtomB_,
-  class ElementB_>
-class UniversalTranspositionOperandB {
-public:
-  using TiledMma = TiledMma_;
-  using SmemLayoutB = SmemLayoutB_;
-  using SmemLayoutAtomB = SmemLayoutAtomB_;
-  using ElementB = ElementB_;
-  
-  constexpr CUTLASS_HOST_DEVICE 
-  UniversalTranspositionOperandB(
-      int warp_idx_,
-      int warp_group_thread_idx_,
-      TiledMma,
-      SmemLayoutB,
-      SmemLayoutAtomB,
-      ElementB)
-      : warp_idx(warp_idx_)
-      , warp_group_thread_idx(warp_group_thread_idx_) { }
-
-  template <
-    class TensorSmemB,
-    class TensorTransposedSmemB>
-  CUTLASS_DEVICE void operator()(
-    TensorSmemB const& sB,
-    TensorTransposedSmemB const& gmma_sB,
-    int read_stage, int current_step) {
-      if (current_step > 0) {
-        return;
-      }
-
-      constexpr int NumMathWarpGroup = CUTE_STATIC_V(size(TiledMma{})) / NumThreadsPerWarpGroup;
-      static_assert(NumMathWarpGroup == 1 ||
-                    (!detail::use_universal_transposition<SmemLayoutAtomB, ElementB>() && NumMathWarpGroup == 2),
-                    "Wrong math warp group number for TransposeB");
-      constexpr int WarpgroupTileSize = size<1>(SmemLayoutB{});  // A warp group tile would process entire Smem K.
-
-      constexpr int BytesPerSmemSwizzleUnit = 16;
-      constexpr int WarpThreadShapeN = BytesPerSmemSwizzleUnit / sizeof(ElementB);
-      //////////////////////////////////////////////////////////////////////////////////////////////////////////////
-      /// Universal transposition, need warp_group sync between load and store.
-      /// The number of reg used depends on the input elementB.
-      //////////////////////////////////////////////////////////////////////////////////////////////////////////////
-      /*
-          In one copy step, a warp group would load WarpgroupTileSize * WarpgroupTileSize tile then store to transposed location.
-          In warp_group_tile, each warp holds Four WarpTileSize x WarpTileSize elements:
-                    K
-              ------------
-            | W0 W1 W2 W3  ---
-            | W0 W1 W2 W3    |
-            | W0 W1 W2 W3    | --> Copy Step 0
-            | W0 W1 W2 W3  ---
-                  ....
-            | W0 W1 W2 W3  ---
-            | W0 W1 W2 W3    |
-            | W0 W1 W2 W3    | --> Copy Step n
-            | W0 W1 W2 W3  ---
-      */
-      static_assert((NumThreadsPerWarpGroup % WarpThreadShapeN == 0), "Unsupported warp thread layout.");
-      constexpr auto WarpgroupThreadLayout = make_layout(make_shape(Int<WarpThreadShapeN>{}, Int<NumThreadsPerWarpGroup / WarpThreadShapeN>{}));
-
-      // Get copy tile and partition to each thread
-      auto sB_tiled_copy = make_tiled_copy(
-        Copy_Atom<DefaultCopy, ElementB>{},
-        WarpgroupThreadLayout,                           // thr_layout
-        Layout<_1>{}                                     // val_layout
-      );
-      static_assert(size(sB_tiled_copy) == size(TiledMma{}), "Wrong thread number in TiledCopy.");
-
-      auto sB_thr_copy        = sB_tiled_copy.get_thread_slice(warp_group_thread_idx);
-      Tensor tCsB             = sB_thr_copy.partition_S(     sB(_,_,read_stage)); // (CPY, CPY_N, CPY_K)
-      Tensor tCsB_transposed  = sB_thr_copy.partition_D(gmma_sB(_,_,read_stage)); // (CPY, CPY_N, CPY_K)
-
-      // Divide partitioned tile to limit register usage
-      constexpr int  CopySteps      = size<0>(SmemLayoutB{}) / WarpgroupTileSize;
-      constexpr auto CopyTileShape  = make_shape(size<0>(tCsB), Int< size<1>(tCsB) / CopySteps >{}, size<2>(tCsB));
-      static_assert(size<1>(tCsB) % CopySteps == 0, "CopySteps must evenly divide rank 1 size of partitioned SMEM.");
-
-      Tensor tCsB_copy_tile            = zipped_divide(tCsB, CopyTileShape);
-      Tensor tCsB_copy_tile_transposed = zipped_divide(tCsB_transposed, CopyTileShape);
-      auto   transpose_fragment        = make_fragment_like(tCsB_copy_tile(_,_0{}));
-
-      CUTLASS_PRAGMA_NO_UNROLL
-      for (int step = 0; step < CopySteps; ++step) {
-        copy(sB_tiled_copy, tCsB_copy_tile(_,step), transpose_fragment);
-
-        // Make sure all elements are read before being overwritten
-        __syncthreads();
-
-        copy(sB_tiled_copy, transpose_fragment, tCsB_copy_tile_transposed(_,step));
-      }
-  }
-
-  CUTLASS_DEVICE void synchronize(int step) {
-    if (step == 0) {
-      // SMEM fence to make sure B is transposed before math
-      cutlass::arch::fence_view_async_shared();
-      cutlass::arch::NamedBarrier::sync(size(TiledMma{}), cutlass::arch::ReservedNamedBarriers::TransposeBarrier);
-    }
-  }
-
-  CUTLASS_DEVICE void synchronize() {
-    // SMEM fence to make sure B is transposed before math
-    cutlass::arch::fence_view_async_shared();
-    cutlass::arch::NamedBarrier::sync(size(TiledMma{}), cutlass::arch::ReservedNamedBarriers::TransposeBarrier);
-  }
-
-  template <
-    class TensorSmemB,
-    class TensorTransposedSmemB>
-  CUTLASS_DEVICE void transpose(
-    TensorSmemB const& sB,
-    TensorTransposedSmemB const& gmma_sB,
-    int read_stage) {
-
-    this->operator()(sB, gmma_sB, read_stage, 0);
-    synchronize();
-
-  }
-
-private:
-  const int warp_idx;
-  const int warp_group_thread_idx;
-};
-
-template<
-  class TiledMma_,
-  class SmemLayoutB_,
-  class SmemLayoutAtomB_,
-  class ElementB_>
-class AsyncTranspositionOperandB {
-public:
-
-  using TiledMma = TiledMma_;
-  using SmemLayoutB = SmemLayoutB_;
-  using SmemLayoutAtomB = SmemLayoutAtomB_;
-  using ElementB = ElementB_;
-  
-  static constexpr int Steps             = 2;
-  static constexpr int NumMathWarpGroup  = CUTE_STATIC_V(size(TiledMma{})) / NumThreadsPerWarpGroup;
-  static constexpr int StepsPerWarpGroup = Steps / NumMathWarpGroup;
-  static_assert(NumMathWarpGroup <= 2,
-                    "Wrong math warp group number for TransposeB");
-  static constexpr int WarpgroupTileSize = size<1>(SmemLayoutB{});  // A warp group tile would process entire Smem K.
-  static constexpr int NumWarpsPerWarpGroup = NumThreadsPerWarpGroup / NumThreadsPerWarp;
-
-  static constexpr int BytesPerSmemSwizzleUnit = 16;
-  static constexpr int WarpThreadShapeN = BytesPerSmemSwizzleUnit / sizeof(ElementB);
-  static constexpr int WarpThreadShapeK = NumThreadsPerWarp / WarpThreadShapeN;
-  static constexpr int NumWarpTilePerWarpgroupTile = NumWarpsPerWarpGroup * (Steps == 8 ? 2 : 1);
-
-  static constexpr int WarpTileSize                = WarpgroupTileSize / NumWarpTilePerWarpgroupTile;
-  static_assert(WarpTileSize >= WarpThreadShapeN && WarpTileSize >= WarpThreadShapeK, "Invaild warp thread shape." );
-  static constexpr int TilesPerWarp                = 2;                     // Each Warp would process 2 warp_tiles in one step.
-  static constexpr int64_t WarpTileNCoordLUT = 06723763275316420;
-  static constexpr int64_t WarpTileKCoordLUT = 05410541064206420;
-  static constexpr int NumStepsEncoded       = 4;                             // Only encoding first 4 steps into LUT.
-  static constexpr int MaskPerStep           = 07;                            // Each step is encoded into 3bits,
-  static constexpr int NumBitsPerStep        = 3;
-  static constexpr int MaskPerWarp           = 07777;                         // Each warp has 4 steps(12 bits)
-  static constexpr int NumBitsPerWarp        = 12;
-  // Number of warp_group_tiles
-  static_assert(size<0>(SmemLayoutB{}) % WarpgroupTileSize == 0,
-    "Copy size must evenly divide SMEM tile.");
-  static constexpr int WarpgroupTileNum = size<0>(SmemLayoutB{}) / WarpgroupTileSize;
-
-  static_assert(size<2>(typename TiledMma::AtomShape_MNK{}) <= WarpThreadShapeK,
-      "Need to be able to transpose first k-block in the first step");
-
-  constexpr CUTLASS_HOST_DEVICE
-  AsyncTranspositionOperandB(
-      int warp_idx_,
-      int warp_group_thread_idx_,
-      TiledMma,
-      SmemLayoutB,
-      SmemLayoutAtomB,
-      ElementB)
-      : warp_idx(warp_idx_)
-      , warp_group_thread_idx(warp_group_thread_idx_)
-      , warp_idx_in_warp_group(warp_idx_ % NumWarpsPerWarpGroup)
-      , current_warp_tile_n_coord_LUT((WarpTileNCoordLUT >> ((warp_idx_
-            % NumWarpsPerWarpGroup) * NumBitsPerWarp)) & MaskPerWarp)
-      , current_warp_tile_k_coord_LUT((WarpTileKCoordLUT >> ((warp_idx_
-            % NumWarpsPerWarpGroup) * NumBitsPerWarp)) & MaskPerWarp) { }
-
-  template <
-    class TensorSmemB,
-    class TensorTransposedSmemB>
-  CUTLASS_DEVICE void operator()(
-      TensorSmemB const& sB,
-      TensorTransposedSmemB const& gmma_sB,
-      int read_stage, int current_step)
-  {
-      if (current_step >= StepsPerWarpGroup) {
-        return;
-      }
-
-      static constexpr auto WarpThreadLayout           = make_layout(make_shape(Int<WarpThreadShapeN>{}, Int<WarpThreadShapeK>{}));
-      //////////////////////////////////////////////////////////////////////////////////////////////////////////////
-      /// A warp group uses 2 steps to transpose the whole WarpgroupTileSize x WarpgroupTileSize.
-      /// In each step, one warp would hold two warp_tiles.
-      ///  Step 0:                Step 1:
-      ///  W0 W1 W2 W3            -- -- -- --
-      ///  W1 W0 -- --            -- -- W3 W2
-      ///  W2 -- -- --            -- W3 W0 W1
-      ///  W3 -- -- --            -- W2 W1 W0
-      ///
-      /////////////////////////////////////////////////////////////////////////////////////////////////////////////
-      ///
-      /// Fully static coord LUT to avoid extra register use.
-      /// [warp_id][step][warp_tile][n / k]
-      /// Step 0            Step 1         Step 2          Step 3          Step 4          Step 5         Step 6           Step 7
-      /// {{{0,0}, {1,1}}, {{2,2}, {3,3}}, {{4,4}, {5,5}}, {{6,6}, {7,7}}, {{4,0}, {0,4}}, {{4,1}, {1,4}}, {{4,2}, {2,4}}, {{4,3}, {3,4}}}, // W0
-      /// {{{1,0}, {0,1}}, {{3,2}, {2,3}}, {{5,4}, {4,5}}, {{7,6}, {6,7}}, {{5,0}, {0,5}}, {{5,1}, {1,5}}, {{5,2}, {2,5}}, {{5,3}, {3,5}}}, // W1
-      /// {{{2,0}, {0,2}}, {{3,1}, {1,3}}, {{6,4}, {4,6}}, {{7,5}, {5,7}}, {{6,0}, {0,6}}, {{6,1}, {1,6}}, {{6,2}, {2,6}}, {{6,3}, {3,6}}}, // W2
-      /// {{{3,0}, {0,3}}, {{2,1}, {1,2}}, {{7,4}, {4,7}}, {{6,5}, {5,6}}, {{7,0}, {0,7}}, {{7,1}, {1,7}}, {{7,2}, {2,7}}, {{7,3}, {3,7}}}, // W3
-      ///
-      /// Encoding the coord of warp tile0 into two int64_t values.
-      /// Only encoding Step 0 ~ Step 4, since Step 5 ~ Step 7 have a straightforward pattern.
-      /// Only encoding warp tile0, since the coords of warp tile1 could be easily deduced from warp tile0.
-      /// The 2-step transposition and the 8-step transposition share the same encoding.
-      ///
-      //////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-      // Divide entire SMEM to multiple warp_tiles
-      constexpr auto WarpTileShape = make_shape(Int<WarpTileSize>(), Int<WarpTileSize>());
-      Tensor s_tile                = zipped_divide(     sB(_,_,read_stage), WarpTileShape);
-      Tensor s_tile_transposed     = zipped_divide(gmma_sB(_,_,read_stage), WarpTileShape);
-
-      // Get copy tile
-      auto sB_tiled_copy = make_tiled_copy(
-        Copy_Atom<DefaultCopy, ElementB>{},
-        WarpThreadLayout,     // thr_layout
-        Layout<_1>{}          // val_layout
-      );
-
-      static_assert(size(sB_tiled_copy) * NumWarpsPerWarpGroup == size(TiledMma{}) / NumMathWarpGroup, "Wrong thread number in TiledCopy.");
-      auto sB_thr_copy = sB_tiled_copy.get_thread_slice(warp_group_thread_idx % NumThreadsPerWarp);  // slice based on lane_idx
-
-      // Construct fragments for transposition
-      Tensor tmp_tCsB = sB_thr_copy.partition_S(flatten(s_tile(_, make_coord(_0{}, _0{}))));
-      decltype(make_fragment_like(tmp_tCsB)) transpose_fragments[TilesPerWarp] = {
-        make_fragment_like(tmp_tCsB),
-        make_fragment_like(tmp_tCsB)
-      };
-
-      [[maybe_unused]] int step = current_step * NumMathWarpGroup;
-      if constexpr (NumMathWarpGroup == 2) {
-        // For 2 math warpgroup, warp idx4~7 is 1st warp group and 8~9 is 2nd, so decide if 2nd warpgroup need warp idx divide 8.
-        step += warp_idx / (NumWarpsPerWarpGroup * 2);
-      }
-
-      int tmp_warp_tile_n_coord_LUT = current_warp_tile_n_coord_LUT >> (NumBitsPerStep * current_step);
-      int tmp_warp_tile_k_coord_LUT = current_warp_tile_k_coord_LUT >> (NumBitsPerStep * current_step);
-
-      if constexpr (NumMathWarpGroup == 2) {
-        tmp_warp_tile_n_coord_LUT >>= NumBitsPerStep * (warp_idx / (NumWarpsPerWarpGroup * 2));
-        tmp_warp_tile_k_coord_LUT >>= NumBitsPerStep * (warp_idx / (NumWarpsPerWarpGroup * 2));
-      }
-
-      // decoding the warp tile coord.
-      int warp_tile0_n, warp_tile0_k;
-      if constexpr (StepsPerWarpGroup <= NumStepsEncoded) {
-        warp_tile0_n = tmp_warp_tile_n_coord_LUT & MaskPerStep;
-        warp_tile0_k = tmp_warp_tile_k_coord_LUT & MaskPerStep;
-      } else {
-        warp_tile0_n = step < NumStepsEncoded ? (tmp_warp_tile_n_coord_LUT & MaskPerStep) : 4 + warp_idx_in_warp_group;
-        warp_tile0_k = step < NumStepsEncoded ? (tmp_warp_tile_k_coord_LUT & MaskPerStep) : step - 4;
-      }
-
-      int warp_tile1_n = warp_tile0_n == warp_tile0_k ? warp_tile0_n + 1 : warp_tile0_k;
-      int warp_tile1_k = warp_tile0_n == warp_tile0_k ? warp_tile0_k + 1 : warp_tile0_n;
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int warp_group_tile = 0; warp_group_tile < WarpgroupTileNum; ++warp_group_tile) {
-
-        static_assert(TilesPerWarp == 2);
-
-        // [warp_tile][n/k]
-        const int warp_tile_coord[TilesPerWarp][2] = {
-          // n                                                           k
-          {warp_group_tile * NumWarpTilePerWarpgroupTile + warp_tile0_n, warp_tile0_k}, // warp_tile 0
-          {warp_group_tile * NumWarpTilePerWarpgroupTile + warp_tile1_n, warp_tile1_k}  // warp_tile 1
-        };
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int warp_tile = 0; warp_tile < TilesPerWarp; ++warp_tile) {
-          Tensor tCsB = sB_thr_copy.partition_S(
-            flatten(s_tile(_, make_coord(warp_tile_coord[warp_tile][0], warp_tile_coord[warp_tile][1])))
-          ); // (CPY, CPY_N, CPY_K)
-
-          copy(sB_tiled_copy, tCsB, transpose_fragments[warp_tile]);
-        }
-
-        // Make sure elements in two 8x8 warp tiles are all consumed
-        __syncwarp();
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int warp_tile = 0; warp_tile < TilesPerWarp; ++warp_tile) {
-          Tensor tCsB_transposed = sB_thr_copy.partition_D(
-            flatten(s_tile_transposed(_, make_coord(warp_tile_coord[warp_tile][0], warp_tile_coord[warp_tile][1])))
-          ); // (CPY, CPY_N, CPY_K)
-          copy(sB_tiled_copy, transpose_fragments[warp_tile], tCsB_transposed);
-        }
-
-      } // loop warp_group_tile
-  }
-
-  CUTLASS_DEVICE void synchronize(int step) {
-    if (step < StepsPerWarpGroup) {
-      // SMEM fence to make sure B is transposed before math
-      cutlass::arch::fence_view_async_shared();
-      cutlass::arch::NamedBarrier::sync(size(TiledMma{}), cutlass::arch::ReservedNamedBarriers::TransposeBarrier);
-    }
-  }
-
-  CUTLASS_DEVICE void synchronize() {
-    cutlass::arch::fence_view_async_shared();
-    cutlass::arch::NamedBarrier::sync(size(TiledMma{}), cutlass::arch::ReservedNamedBarriers::TransposeBarrier);
-  }
-
-  template <
-    class TensorSmemB,
-    class TensorTransposedSmemB>
-  CUTLASS_DEVICE void transpose(
-    TensorSmemB const& sB,
-    TensorTransposedSmemB const& gmma_sB,
-    int read_stage) {
-
-    CUTLASS_PRAGMA_UNROLL
-    for(int i = 0; i < StepsPerWarpGroup; ++i) {
-      this->operator()(sB, gmma_sB, read_stage, i);
-    }
-    synchronize();
-
-  }
-private:
-  const int warp_idx;
-  const int warp_group_thread_idx;
-  const int warp_idx_in_warp_group;
-  const int current_warp_tile_n_coord_LUT;
-  const int current_warp_tile_k_coord_LUT;
-};
-
-template<
-  class TiledMma_,
-  class SmemLayoutB_,
-  class SmemLayoutAtomB_,
-  class ElementB_>
-class AsyncTranspositionOperandB_1BElementB {
-public:
-
-  static_assert(sizeof(ElementB_) == 1);
-
-  using TiledMma = TiledMma_;
-  using SmemLayoutB = SmemLayoutB_;
-  using SmemLayoutAtomB = SmemLayoutAtomB_;
-  using ElementB = ElementB_;
-
-  static constexpr int Steps             = 8;
-  static constexpr int NumMathWarpGroup  = CUTE_STATIC_V(size(TiledMma{})) / NumThreadsPerWarpGroup;
-  static constexpr int StepsPerWarpGroup = Steps / NumMathWarpGroup;
-  static_assert(NumMathWarpGroup <= 2,
-                    "Wrong math warp group number for TransposeB");
-  static constexpr int WarpgroupTileSize = size<1>(SmemLayoutB{});  // A warp group tile would process entire Smem K.
-  static constexpr int NumWarpsPerWarpGroup = NumThreadsPerWarpGroup / NumThreadsPerWarp;
-
-  static constexpr int BytesPerSmemSwizzleUnit = 16;
-  static constexpr int WarpThreadShapeN = BytesPerSmemSwizzleUnit / sizeof(ElementB);
-  static constexpr int WarpThreadShapeK = NumThreadsPerWarp / WarpThreadShapeN;
-  static constexpr int NumWarpTilePerWarpgroupTile = NumWarpsPerWarpGroup * (Steps == 8 ? 2 : 1);
-
-  static constexpr int WarpTileSize                = WarpgroupTileSize / NumWarpTilePerWarpgroupTile;
-  static_assert(WarpTileSize >= WarpThreadShapeN && WarpTileSize >= WarpThreadShapeK, "Invaild warp thread shape." );
-  static constexpr int TilesPerWarp                = 2;                     // Each Warp would process 2 warp_tiles in one step.
-  static constexpr int64_t WarpTileNCoordLUT = 06723763275316420;
-  static constexpr int64_t WarpTileKCoordLUT = 05410541064206420;
-  static constexpr int NumStepsEncoded       = 4;                             // Only encoding first 4 steps into LUT.
-  static constexpr int MaskPerStep           = 07;                            // Each step is encoded into 3bits,
-  static constexpr int NumBitsPerStep        = 3;
-  static constexpr int MaskPerWarp           = 07777;                         // Each warp has 4 steps(12 bits)
-  static constexpr int NumBitsPerWarp        = 12;
-  // Number of warp_group_tiles
-  static_assert(size<0>(SmemLayoutB{}) % WarpgroupTileSize == 0,
-    "Copy size must evenly divide SMEM tile.");
-  static constexpr int WarpgroupTileNum = size<0>(SmemLayoutB{}) / WarpgroupTileSize;
-
-  constexpr CUTLASS_HOST_DEVICE
-  AsyncTranspositionOperandB_1BElementB(
-      int warp_idx_,
-      int warp_group_thread_idx_,
-      TiledMma,
-      SmemLayoutB,
-      SmemLayoutAtomB,
-      ElementB)
-      : warp_idx(warp_idx_)
-      , warp_group_thread_idx(warp_group_thread_idx_)
-      , warp_idx_in_warp_group(warp_idx_ % NumWarpsPerWarpGroup)
-      , current_warp_tile_n_coord_LUT((WarpTileNCoordLUT >> ((warp_idx_
-            % NumWarpsPerWarpGroup) * NumBitsPerWarp)) & MaskPerWarp)
-      , current_warp_tile_k_coord_LUT((WarpTileKCoordLUT >> ((warp_idx_
-            % NumWarpsPerWarpGroup) * NumBitsPerWarp)) & MaskPerWarp) { }
-
-  template <
-    class TensorSmemB,
-    class TensorTransposedSmemB>
-  CUTLASS_DEVICE void operator()(
-      TensorSmemB const& sB,
-      TensorTransposedSmemB const& gmma_sB,
-      int read_stage, int current_step)
-  {
-    if (current_step > 0) {
-      return;
-    }
-
-    constexpr auto WarpThreadLayout           = make_layout(make_shape(Int<WarpThreadShapeN>{}, Int<WarpThreadShapeK>{}));
-    //////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    /// A warp group uses 8 steps to transpose the whole WarpgroupTileSize x WarpgroupTileSize.
-    ///  Divide a warp_group_tile into 8x8 warp_tiles to further reduce the reg usage.
-    ///  Step 0:                   Step 1:                   Step 2:                   Step 3:
-    ///  W0 W1 W2 W3 -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --
-    ///  W1 W0 -- -- -- -- -- --   -- -- W3 W2 -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --
-    ///  W2 -- -- -- -- -- -- --   -- W3 W0 W1 -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --
-    ///  W3 -- -- -- -- -- -- --   -- W2 W1 W0 -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --
-    ///  -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- W0 W1 W2 W3   -- -- -- -- -- -- -- --
-    ///  -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- W1 W0 -- --   -- -- -- -- -- -- W3 W2
-    ///  -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- W2 -- -- --   -- -- -- -- -- W3 W0 W1
-    ///  -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- W3 -- -- --   -- -- -- -- -- W2 W1 W0
-    ///
-    ///  Step 4:                   Step 5:                   Step 6:                   Step 7:
-    ///  -- -- -- -- W0 W1 W2 W3   -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --
-    ///  -- -- -- -- -- -- -- --   -- -- -- -- W0 W1 W2 W3   -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --
-    ///  -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- W0 W1 W2 W3   -- -- -- -- -- -- -- --
-    ///  -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- W0 W1 W2 W3
-    ///  W0 -- -- -- -- -- -- --   -- W0 -- -- -- -- -- --   -- -- W0 -- -- -- -- --   -- -- -- W0 -- -- -- --
-    ///  W1 -- -- -- -- -- -- --   -- W1 -- -- -- -- -- --   -- -- W1 -- -- -- -- --   -- -- -- W1 -- -- -- --
-    ///  W2 -- -- -- -- -- -- --   -- W2 -- -- -- -- -- --   -- -- W2 -- -- -- -- --   -- -- -- W2 -- -- -- --
-    ///  W3 -- -- -- -- -- -- --   -- W3 -- -- -- -- -- --   -- -- W3 -- -- -- -- --   -- -- -- W3 -- -- -- --
-    ///
-    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    ///
-    /// Fully static coord LUT to avoid extra register use.
-    /// [warp_id][step][warp_tile][n / k]
-    /// Step 0            Step 1         Step 2          Step 3          Step 4          Step 5         Step 6           Step 7
-    /// {{{0,0}, {1,1}}, {{2,2}, {3,3}}, {{4,4}, {5,5}}, {{6,6}, {7,7}}, {{4,0}, {0,4}}, {{4,1}, {1,4}}, {{4,2}, {2,4}}, {{4,3}, {3,4}}}, // W0
-    /// {{{1,0}, {0,1}}, {{3,2}, {2,3}}, {{5,4}, {4,5}}, {{7,6}, {6,7}}, {{5,0}, {0,5}}, {{5,1}, {1,5}}, {{5,2}, {2,5}}, {{5,3}, {3,5}}}, // W1
-    /// {{{2,0}, {0,2}}, {{3,1}, {1,3}}, {{6,4}, {4,6}}, {{7,5}, {5,7}}, {{6,0}, {0,6}}, {{6,1}, {1,6}}, {{6,2}, {2,6}}, {{6,3}, {3,6}}}, // W2
-    /// {{{3,0}, {0,3}}, {{2,1}, {1,2}}, {{7,4}, {4,7}}, {{6,5}, {5,6}}, {{7,0}, {0,7}}, {{7,1}, {1,7}}, {{7,2}, {2,7}}, {{7,3}, {3,7}}}, // W3
-    ///
-    /// Encoding the coord of warp tile0 into two int64_t values.
-    /// Only encoding Step 0 ~ Step 4, since Step 5 ~ Step 7 have a straightforward pattern.
-    /// Only encoding warp tile0, since the coords of warp tile1 could be easily deduced from warp tile0.
-    /// The 2-step transposition and the 8-step transposition share the same encoding.
-    ///
-    //////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-    // Divide entire SMEM to multiple warp_tiles
-    constexpr auto WarpTileShape = make_shape(Int<WarpTileSize>(), Int<WarpTileSize>());
-    Tensor s_tile                = zipped_divide(     sB(_,_,read_stage), WarpTileShape);
-    Tensor s_tile_transposed     = zipped_divide(gmma_sB(_,_,read_stage), WarpTileShape);
-
-    // Get copy tile
-    auto sB_tiled_copy = make_tiled_copy(
-      Copy_Atom<DefaultCopy, ElementB>{},
-      WarpThreadLayout,     // thr_layout
-      Layout<_1>{}          // val_layout
-    );
-    static_assert(size(sB_tiled_copy) * NumWarpsPerWarpGroup == size(TiledMma{}) / NumMathWarpGroup, "Wrong thread number in TiledCopy.");
-    auto sB_thr_copy = sB_tiled_copy.get_thread_slice(warp_group_thread_idx % NumThreadsPerWarp);  // slice based on lane_idx
-
-    // Construct fragments for transposition
-    Tensor tmp_tCsB = sB_thr_copy.partition_S(flatten(s_tile(_, make_coord(_0{}, _0{}))));
-    decltype(make_fragment_like(tmp_tCsB)) transpose_fragments[TilesPerWarp] = {
-      make_fragment_like(tmp_tCsB),
-      make_fragment_like(tmp_tCsB)
-    };
-
-    CUTLASS_PRAGMA_NO_UNROLL
-    for (int warp_group_tile = 0; warp_group_tile < WarpgroupTileNum; ++warp_group_tile) {
-      int tmp_warp_tile_n_coord_LUT = current_warp_tile_n_coord_LUT;
-      int tmp_warp_tile_k_coord_LUT = current_warp_tile_k_coord_LUT;
-      constexpr int StepsPerWarpGroup = Steps / NumMathWarpGroup;
-
-      if constexpr (NumMathWarpGroup == 2) {
-        tmp_warp_tile_n_coord_LUT >>= NumBitsPerStep * (warp_idx / (NumWarpsPerWarpGroup * 2));
-        tmp_warp_tile_k_coord_LUT >>= NumBitsPerStep * (warp_idx / (NumWarpsPerWarpGroup * 2));
-      }
-
-      CUTLASS_PRAGMA_NO_UNROLL
-      for (int step_per_warp_group = 0; step_per_warp_group < StepsPerWarpGroup; ++step_per_warp_group) {
-        // For 2 math warpgroup, warp idx4~7 is 1st warp group and 8~9 is 2nd, so decide if 2nd warpgroup need warp idx divide 8.
-        int step = step_per_warp_group * NumMathWarpGroup + warp_idx / (NumWarpsPerWarpGroup * 2);
-        // decoding the warp tile coord.
-        int warp_tile0_n = step < NumStepsEncoded ? (tmp_warp_tile_n_coord_LUT & MaskPerStep) : 4 + warp_idx_in_warp_group;
-        int warp_tile0_k = step < NumStepsEncoded ? (tmp_warp_tile_k_coord_LUT & MaskPerStep) : step - 4;
-        int warp_tile1_n = warp_tile0_n == warp_tile0_k ? warp_tile0_n + 1 : warp_tile0_k;
-        int warp_tile1_k = warp_tile0_n == warp_tile0_k ? warp_tile0_k + 1 : warp_tile0_n;
-
-        tmp_warp_tile_n_coord_LUT >>= NumBitsPerStep;
-        tmp_warp_tile_k_coord_LUT >>= NumBitsPerStep;
-
-        static_assert(TilesPerWarp == 2);
-
-        // [warp_tile][n/k]
-        const int warp_tile_coord[TilesPerWarp][2] = {
-          // n                                                           k
-          {warp_group_tile * NumWarpTilePerWarpgroupTile + warp_tile0_n, warp_tile0_k}, // warp_tile 0
-          {warp_group_tile * NumWarpTilePerWarpgroupTile + warp_tile1_n, warp_tile1_k}  // warp_tile 1
-        };
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int warp_tile = 0; warp_tile < TilesPerWarp; ++warp_tile) {
-          Tensor tCsB = sB_thr_copy.partition_S(
-            flatten(s_tile(_, make_coord(warp_tile_coord[warp_tile][0], warp_tile_coord[warp_tile][1])))
-          ); // (CPY, CPY_N, CPY_K)
-
-          copy(sB_tiled_copy, tCsB, transpose_fragments[warp_tile]);
-        }
-
-        // Make sure elements in two 8x8 warp tiles are all consumed
-        __syncwarp();
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int warp_tile = 0; warp_tile < TilesPerWarp; ++warp_tile) {
-          Tensor tCsB_transposed = sB_thr_copy.partition_D(
-            flatten(s_tile_transposed(_, make_coord(warp_tile_coord[warp_tile][0], warp_tile_coord[warp_tile][1])))
-          ); // (CPY, CPY_N, CPY_K)
-          copy(sB_tiled_copy, transpose_fragments[warp_tile], tCsB_transposed);
-        }
-      } // lock step
-    } // loop warp_group_tile
-  }
-
-  CUTLASS_DEVICE void synchronize(int step) {
-    if (step == 0) {
-      // SMEM fence to make sure B is transposed before math
-      cutlass::arch::fence_view_async_shared();
-      cutlass::arch::NamedBarrier::sync(size(TiledMma{}), cutlass::arch::ReservedNamedBarriers::TransposeBarrier);
-    }
-  }
-
-  CUTLASS_DEVICE void synchronize() {
-    cutlass::arch::fence_view_async_shared();
-    cutlass::arch::NamedBarrier::sync(size(TiledMma{}), cutlass::arch::ReservedNamedBarriers::TransposeBarrier);
-  }
-
-  template <
-    class TensorSmemB,
-    class TensorTransposedSmemB>
-  CUTLASS_DEVICE void transpose(
-    TensorSmemB const& sB,
-    TensorTransposedSmemB const& gmma_sB,
-    int read_stage) {
-    this->operator()(sB, gmma_sB, read_stage, 0);
-    synchronize();
-  }
-
-private:
-  const int warp_idx;
-  const int warp_group_thread_idx;
-  const int warp_idx_in_warp_group;
-  const int current_warp_tile_n_coord_LUT;
-  const int current_warp_tile_k_coord_LUT;
-};
-
-
-template<
-  class TiledMma,
-  class SmemLayoutB,
-  class SmemLayoutAtomB,
-  class ElementB,
-  bool TransposeB
->
-constexpr CUTLASS_HOST_DEVICE
-auto
-make_transpose_operand_b(
-    int warp_idx,
-    int warp_group_thread_idx,
-    TiledMma,
-    SmemLayoutB,
-    SmemLayoutAtomB,
-    ElementB,
-    cute::bool_constant<TransposeB>)
-{
-  if constexpr (!TransposeB) {
-    return NoTranspositionOperandB(
-        warp_idx, warp_group_thread_idx, TiledMma{},
-        SmemLayoutB{}, SmemLayoutAtomB{}, ElementB{});
-  }
-  else if constexpr (use_universal_transposition<SmemLayoutAtomB, ElementB>()) {
-    return UniversalTranspositionOperandB(
-        warp_idx, warp_group_thread_idx, TiledMma{},
-        SmemLayoutB{}, SmemLayoutAtomB{}, ElementB{});
-  }
-  else if constexpr (sizeof(ElementB) == 1) {
-    return AsyncTranspositionOperandB_1BElementB(
-        warp_idx, warp_group_thread_idx, TiledMma{},
-        SmemLayoutB{}, SmemLayoutAtomB{}, ElementB{});
-  }
-  else {
-    return AsyncTranspositionOperandB(
-        warp_idx, warp_group_thread_idx, TiledMma{},
-        SmemLayoutB{}, SmemLayoutAtomB{}, ElementB{});
-  }
-}
-
-}; // namespace detail
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace collective
-} // namespace transform
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/device/transform_universal_adapter.hpp b/lightllm-kernel/cutlass/include/cutlass/transform/device/transform_universal_adapter.hpp
deleted file mode 100755
index c7ab0ceb0..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/transform/device/transform_universal_adapter.hpp
+++ /dev/null
@@ -1,303 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Transform Kernel Universal adapter
-*/
-
-#pragma once
-
-// common
-#include "cutlass/cutlass.h"
-#include "cutlass/device_kernel.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/detail/layout.hpp"
-#include "cutlass/detail/mma.hpp"
-#include "cutlass/cuda_host_adapter.hpp"
-
-#include "cutlass/kernel_launch.h"
-#if !defined(__CUDACC_RTC__)
-#include "cutlass/cluster_launch.hpp"
-#include "cutlass/trace.h"
-#endif // !defined(__CUDACC_RTC__)
-
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::transform::device {
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <class TransformKernel_>
-class TransformUniversalAdapter
-{
-public:
-  using TransformKernel = TransformKernel_;
-  using Arguments = typename TransformKernel::Arguments;
-  using Params = typename TransformKernel::Params;
-  static bool const kEnableCudaHostAdapter = CUTLASS_ENABLE_CUDA_HOST_ADAPTER;
-
-
-private:
-
-  /// Kernel API parameters object
-  Params params_;
-
-public:
-
-  /// Access the Params structure
-  Params const& params() const {
-    return params_;
-  }
-
-  /// Determines whether the GEMM can execute the given problem.
-  static Status
-  can_implement(Arguments const& args) {
-    return TransformKernel::can_implement(args);
-  }
-
-  /// Gets the workspace size
-  static size_t
-  get_workspace_size(Arguments const& args) {
-    size_t workspace_bytes = 0;
-    workspace_bytes += TransformKernel::get_workspace_size(args);
-
-    CUTLASS_TRACE_HOST("  workspace_bytes: " << workspace_bytes);
-
-    return workspace_bytes;
-  }
-
-  /// Computes the grid shape
-  static dim3
-  get_grid_shape(Arguments const& args, void* workspace = nullptr) {
-    auto tmp_params = TransformKernel::to_underlying_arguments(args, workspace);
-    return TransformKernel::get_grid_shape(tmp_params);
-  }
-
-  /// Computes the grid shape
-  static dim3
-  get_grid_shape(Params const& params) {
-    return TransformKernel::get_grid_shape(params);
-  }
-
-
-  /// Initializes GEMM state from arguments.
-  Status
-  initialize(
-    Arguments const& args,
-    void* workspace = nullptr,
-    cudaStream_t stream = nullptr,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-
-    CUTLASS_TRACE_HOST("TransformUniversalAdapter::initialize() - workspace "
-      << workspace << ", stream: " << (stream ? "non-null" : "null")
-      << ", EnableCudaHostAdapter: " << (kEnableCudaHostAdapter ? "True" : "false"));
-
-    // Initialize the workspace
-    Status status = TransformKernel::initialize_workspace(args, workspace, stream, cuda_adapter);
-    if (status != Status::kSuccess) {
-      return status;
-    }
-    // Initialize the Params structure
-    params_ = TransformKernel::to_underlying_arguments(args, workspace);
-    // Don't set the function attributes - require the CudaHostAdapter to set it.
-    if constexpr (kEnableCudaHostAdapter) {
-      CUTLASS_ASSERT(cuda_adapter);
-      return Status::kSuccess;
-    }
-    else {
-      //
-      // Account for dynamic smem capacity if needed
-      //
-      int smem_size = TransformKernel::SharedStorageSize;
-
-      CUTLASS_ASSERT(cuda_adapter == nullptr);
-
-      if (smem_size >= (48 << 10)) {
-        CUTLASS_TRACE_HOST("  Setting smem size to " << smem_size);
-        cudaError_t result = cudaFuncSetAttribute(
-            device_kernel<TransformKernel>,
-            cudaFuncAttributeMaxDynamicSharedMemorySize,
-            smem_size);
-        if (cudaSuccess != result) {
-          result = cudaGetLastError(); // to clear the error bit
-          CUTLASS_TRACE_HOST("  cudaFuncSetAttribute() returned error: " << cudaGetErrorString(result));
-          return Status::kErrorInternal;
-        }
-      }
-    }
-    return Status::kSuccess;
-  }
-
-  static Status
-  run(Params& params,
-      cudaStream_t stream = nullptr,
-      CudaHostAdapter *cuda_adapter = nullptr,
-      int32_t kernel_index = 0,
-      bool launch_with_pdl = false) {
-    CUTLASS_TRACE_HOST("TransformUniversalAdapter::run()");
-    dim3 const block = TransformKernel::get_block_shape();
-    dim3 const grid = get_grid_shape(params);
-
-    // configure smem size and carveout
-    int smem_size = TransformKernel::SharedStorageSize;
-
-    Status launch_result{ Status::kSuccess };
-    // Use extended launch API only for mainloops that use it
-    if constexpr (TransformKernel::ArchTag::kMinComputeCapability >= 90) {
-      // Currently only support 1x1x1 for transform kernel.
-      dim3 const cluster = {1,1,1};
-      void* kernel_params[] = {&params};
-
-      if constexpr (kEnableCudaHostAdapter) {
-        //
-        // Use the cuda host adapter
-        //
-        CUTLASS_ASSERT(cuda_adapter);
-        if (cuda_adapter) {
-
-          if (launch_with_pdl) {
-            CUTLASS_TRACE_HOST(
-              "TransformUniversalAdapter::run() does not support launching with PDL and a custom cuda adapter.");
-            return Status::kErrorInternal;
-          }
-          launch_result = cuda_adapter->launch(grid,
-                                               cluster,
-                                               block,
-                                               smem_size,
-                                               stream,
-                                               kernel_params,
-                                               kernel_index);
-          CUTLASS_TRACE_HOST("Kernel Launch Result" << cutlassGetStatusString(launch_result));
-        }
-        else {
-          return Status::kErrorInternal;
-        }
-      }
-      else {
-        CUTLASS_ASSERT(cuda_adapter == nullptr);
-        void const* kernel = (void const*) device_kernel<TransformKernel>;
-        if constexpr (TransformKernel::ArchTag::kMinComputeCapability == 90) {
-          launch_result = ClusterLauncher::launch(
-            grid, cluster, block, smem_size, stream, kernel, kernel_params, launch_with_pdl);
-        }
-      }
-    }
-    else {
-      launch_result = Status::kSuccess;
-      cutlass::arch::synclog_setup();
-
-      if constexpr (kEnableCudaHostAdapter) {
-        CUTLASS_ASSERT(cuda_adapter);
-        if (cuda_adapter) {
-          void* kernel_params[] = {&params};
-
-          launch_result = cuda_adapter->launch(
-            grid, block, smem_size, stream, kernel_params, 0
-          );
-
-        }
-        else {
-          return Status::kErrorInternal;
-        }
-      }
-      else {
-        CUTLASS_ASSERT(cuda_adapter == nullptr);
-        cutlass::kernel_launch<TransformKernel>(grid, block, smem_size, stream, params, launch_with_pdl);
-      }
-    }
-
-    cudaError_t result = cudaGetLastError();
-    if (cudaSuccess == result && Status::kSuccess == launch_result) {
-      return Status::kSuccess;
-    }
-    else if (cudaSuccess != result) {
-      CUTLASS_TRACE_HOST("  Kernel launch failed. Reason: " << cudaGetErrorString(result));
-    }
-    else if (Status::kSuccess != launch_result) {
-      CUTLASS_TRACE_HOST("  Kernel launch failed. Reason: " << cutlassGetStatusString(launch_result));
-    }
-    return Status::kErrorInternal;
-  }
-
-  //
-  // Non-static launch overloads that first create and set the internal params struct of this kernel handle.
-  //
-
-  /// Launches the kernel after first constructing Params internal state from supplied arguments.
-  Status
-  run(
-    Arguments const& args,
-    void* workspace = nullptr,
-    cudaStream_t stream = nullptr,
-    CudaHostAdapter *cuda_adapter = nullptr,
-    int32_t kernel_index = 0,
-    bool launch_with_pdl = false
-  ) {
-    Status status = initialize(args, workspace, stream, cuda_adapter);
-
-    if (Status::kSuccess == status) {
-      status = run(params_, stream, cuda_adapter, kernel_index, launch_with_pdl);
-    }
-    return status;
-  }
-
-  /// Launches the kernel after first constructing Params internal state from supplied arguments.
-  Status
-  operator()(
-    Arguments const& args,
-    void* workspace = nullptr,
-    cudaStream_t stream = nullptr,
-    CudaHostAdapter *cuda_adapter = nullptr,
-    bool launch_with_pdl = false) {
-    return run(args, workspace, stream, cuda_adapter, 0 /*kernel_index*/, launch_with_pdl);
-  }
-
-  /// Overload that allows a user to re-launch the same kernel without updating internal params struct.
-  Status
-  run(
-    cudaStream_t stream = nullptr,
-    CudaHostAdapter *cuda_adapter = nullptr,
-    bool launch_with_pdl = false) {
-    return run(params_, stream, cuda_adapter, 0 /*kernel_index*/, launch_with_pdl);
-  }
-
-  /// Overload that allows a user to re-launch the same kernel without updating internal params struct.
-  Status
-  operator()(cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr, bool launch_with_pdl = false) {
-    return run(params_, stream, cuda_adapter, 0 /*kernel_index*/, launch_with_pdl);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::transform::device
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/kernel/filter_format_transformer.hpp b/lightllm-kernel/cutlass/include/cutlass/transform/kernel/filter_format_transformer.hpp
deleted file mode 100755
index 9f54c93f1..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/transform/kernel/filter_format_transformer.hpp
+++ /dev/null
@@ -1,223 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/* \file
-   \brief Convolution filter format transformation kernel.
-*/
-
-#pragma once
-
-#include <algorithm>
-#include <random>
-
-#include "cutlass/coord.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/cuda_host_adapter.hpp"
-
-#include "cute/int_tuple.hpp"
-#include "cute/tensor.hpp"
-#include "cute/config.hpp"
-
-namespace cutlass::transform::kernel {
-
-using namespace cute;
-
-enum class FilterFormat {
-  CKTRS,
-  CTRSK,
-  KTRSC
-};
-
-template <
-  FilterFormat SrcFormat,
-  FilterFormat DstFormat,
-  int NumDimensions,
-  class Element_,
-  int AlignmentBytes = 16
->
-struct ConvFilterFormatTransformer {
-  
-  using Element = Element_;
-  static_assert(SrcFormat == FilterFormat::CKTRS, "Currently only source format of CKTRS is supported");
-  static_assert(DstFormat == FilterFormat::CTRSK || DstFormat == FilterFormat::KTRSC, "Currently only destination format of CTRSK/KTRSC is supported");
-  static_assert(AlignmentBytes > 0 && AlignmentBytes % static_cast<int>(sizeof(Element)) == 0, "Invalid alignment setting");
-
-  // In ktrsc order.
-  using FilterExtent = array<int, NumDimensions>;
-
-  // Default cta tile shape: 32x32
-  static constexpr auto CTATileShape = make_shape(Int<4 * AlignmentBytes / static_cast<int>(sizeof(Element))>{}, Int<32>{});
-  // Default thread layout: (4, 32)
-  static constexpr auto ThreadLayout = make_layout(make_shape(Int<4>{}, Int<32>{}));
-
-  static constexpr uint32_t MaxThreadsPerBlock = 128;
-  static constexpr uint32_t MinBlocksPerMultiprocessor = 1;
-
-  using ArchTag = arch::Sm90;
-
-  // Default ctor
-  CUTLASS_HOST_DEVICE
-  ConvFilterFormatTransformer() {}
-
-  struct Arguments {
-    const void *src_ptr;
-    void *dst_ptr;
-    FilterExtent filter_extent;
-  };
-
-  struct Params {
-    using TensorSrc = decltype(make_tensor(make_gmem_ptr(recast_ptr<const Element>(nullptr)), make_layout(take<0,NumDimensions>(FilterExtent{}))));
-    using TensorDst = decltype(make_tensor(make_gmem_ptr(recast_ptr<Element>(nullptr)), make_layout(make_shape(int32_t(0), int32_t(0)))));
-
-    TensorSrc src;
-    TensorDst dst; 
-  };
-
-  struct SharedStorage {
-    /* empty, no smem needed */
-  };
-
-  static constexpr int SharedStorageSize = sizeof(SharedStorage);
-
-  static Status
-  can_implement(Arguments const& args) {
-    bool implementable = true;
-    // alignment rule
-    {
-      int contiguous_dim = DstFormat == FilterFormat::CTRSK ? args.filter_extent[0] : args.filter_extent[NumDimensions - 1];
-      int align_element = AlignmentBytes / static_cast<int>(sizeof(Element));
-
-      implementable &= (contiguous_dim % align_element == 0);
-
-      if (!implementable) {
-        CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Alignment setting is invalid.\n");
-        return Status::kInvalid;
-      }
-    }
-
-    return Status::kSuccess;
-  }
-
-  static size_t
-  get_workspace_size(Arguments const& args) {
-    return 0;
-  }
-
-  static dim3
-  get_block_shape() {
-    return dim3(size(shape(ThreadLayout)), 1, 1);
-  }
-
-  static dim3
-  get_grid_shape(Params const& params) {
-    auto dim_m = ceil_div(size<0>(shape(params.dst)), get<0>(CTATileShape));
-    auto dim_n = ceil_div(size<1>(shape(params.dst)), get<1>(CTATileShape));
-
-    return dim3(dim_m, dim_n, 1);
-  }
-
-  static cutlass::Status
-  initialize_workspace(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr,
-    CudaHostAdapter *cuda_adapter = nullptr) {
-    return Status::kSuccess;
-  }
-
-  static Params
-  to_underlying_arguments(Arguments const& args, void* workspace) {
-    auto k = args.filter_extent[0];
-    auto c = args.filter_extent[NumDimensions - 1];
-    auto srt = reverse(take<1,NumDimensions - 1>(args.filter_extent));
-
-    // source shape (s,r,t,k,c)
-    auto shape_src = flatten(make_shape(srt, k, c));
-    auto shape_dst = DstFormat == FilterFormat::CTRSK ? make_shape(k, c * product(srt)) : make_shape(c, k * product(srt));
-
-    auto src = make_tensor(make_gmem_ptr(recast_ptr<const Element>(args.src_ptr)), make_layout(shape_src));
-    auto dst = make_tensor(make_gmem_ptr(recast_ptr<Element>(args.dst_ptr)), make_layout(shape_dst));
-
-    return Params{src, dst};
-  }
-
-  CUTLASS_DEVICE
-  void operator()(Params const& params, char *smem_buf) {
-    // Tile the input tensor into blocks
-    auto block_coord = make_coord(blockIdx.x, blockIdx.y);
-    auto block_shape = make_shape(Int<4 * AlignmentBytes / static_cast<int>(sizeof(Element))>{}, Int<32>{});
-    // Default thread layout: (4, 32)
-    auto thread_layout = make_layout(make_shape(Int<4>{}, Int<32>{}));
-    auto vec_layout = make_layout(make_shape(Int<AlignmentBytes / static_cast<int>(sizeof(Element))>{}, Int<1>{}));
-
-    Tensor tile_D = local_tile(params.dst, block_shape, block_coord);
-
-    // Construct tiled copy
-    using AccessType = cutlass::AlignedArray<Element, size(vec_layout)>;
-    using Atom = Copy_Atom<UniversalCopy<AccessType>, Element>;
-
-    auto tiled_copy = make_tiled_copy(Atom{}, thread_layout, vec_layout);
-    auto thr_copy = tiled_copy.get_thread_slice(threadIdx.x);
-    Tensor thr_tile_D = thr_copy.partition_D(tile_D);
-
-    // shape (s, r, t)
-    auto shape_trs = take<0, NumDimensions - 2>(shape(params.src));
-    // strided_c = c for format CTRSK, strided_c = k for format KTRSC
-    auto strided_c = DstFormat == FilterFormat::CTRSK ? get<NumDimensions - 1>(shape(params.src)) : get<NumDimensions - 2>(shape(params.src));
-    // shape (s, r, t, c) for format CTRSK and shape (s, r, t, k) for format KTRSC 
-    auto shape_ctrs = append<NumDimensions - 1>(shape_trs, strided_c);
-    auto srtc_coord = idx2crd(int(blockIdx.y * get<1>(block_shape) + threadIdx.x / size<0>(thread_layout)), shape_ctrs);
-    // index of k for format CTRSK and index of c for format KTRSC
-    auto n_layout = make_layout(make_shape(gridDim.x, size<0>(thread_layout)), make_stride(size<0>(block_shape), size<0>(vec_layout)));
-    int n_idx = n_layout(make_coord(blockIdx.x, threadIdx.x % size<0>(thread_layout)));
-
-    // Fragment to load from S and store to D
-    auto frag = make_fragment_like(thr_tile_D);
-    // Predicate tensor.
-    Tensor thr_tile_P = make_tensor<bool>(shape(thr_tile_D));
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < size(frag); ++i) {
-      auto srt_coord = take<0, NumDimensions - 2>(srtc_coord);
-      auto kc_coord = DstFormat == FilterFormat::CTRSK ?
-          make_coord(n_idx+i, get<NumDimensions - 2>(srtc_coord)) :
-          make_coord(get<NumDimensions - 2>(srtc_coord), n_idx+i);
-      auto coord = flatten(make_coord(srt_coord, kc_coord)); 
-      thr_tile_P(i) = elem_less(coord, shape(params.src));
-      if (thr_tile_P(i)) {
-        frag(i) = params.src(coord);
-      }
-    }
-
-    // Copy from RMEM to GMEM
-    copy_if(tiled_copy, thr_tile_P, frag, thr_tile_D);
-  }
-};
-
-} // namespace cutlass::transform::kernel
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/kernel/sm90_sparse_gemm_compressor.hpp b/lightllm-kernel/cutlass/include/cutlass/transform/kernel/sm90_sparse_gemm_compressor.hpp
deleted file mode 100755
index 0ae7bab06..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/transform/kernel/sm90_sparse_gemm_compressor.hpp
+++ /dev/null
@@ -1,578 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-  \brief Compress utils specific for SM90 structure sparse kernels
-*/
-
-#pragma once
-
-#include "cute/container/bit_field.hpp"    // cute::bit_field
-#include "cute/numeric/numeric_types.hpp"  // cute::sizeof_bits_v, cute::uint_bit_t
-#include "cute/tensor.hpp"                 // cute::Tensor, cute::make_tensor
-#include "cute/algorithm/cooperative_copy.hpp" // cute::cooperative_copy
-#include "cutlass/arch/arch.h"             // cutlass::arch::Sm90
-#include "cutlass/cuda_host_adapter.hpp"   // cutlass::CudaHostAdapter
-#include "cutlass/cutlass.h"               // cutlass::Status
-#include "cutlass/gemm/gemm.h"             // cutlass::TagToStrideA_t
-#include "cutlass/fast_math.h"             // cutlass::ceil_div, cutlass::round_up
-#include "cutlass/kernel_hardware_info.h"  // cutlass::KernelHardwareInfo
-#include "cutlass/numeric_size.h"          // cutlass::bits_to_bytes
-#include "cutlass/cuda_host_adapter.hpp"   // cutlass::CudaHostAdapter
-
-namespace cutlass::transform::kernel {
-
-using namespace cute;
-
-template<
-  class ProblemShape_,
-  class ElementA_,
-  class LayoutATag_,
-  class SparseConfig_
->
-class SM90StructuredSparseCompressor {
-public:
-  using SparseConfig = SparseConfig_;
-  using ProblemShape = ProblemShape_;
-
-  // * EltA
-  using ElementA = ElementA_;
-  using ElementAUint = cute::uint_bit_t<cute::sizeof_bits_v<ElementA>>;
-  using ElementAMma = typename SparseConfig::ElementAMma;
-  using ElementAMmaRaw = typename SparseConfig::ElementAMmaRaw;
-  using ElementAMmaRawUnit = cute::uint_bit_t<cute::sizeof_bits_v<ElementAMmaRaw>>;
-  using ElementASparsity = typename SparseConfig::ElementASparsity;
-  using ElementAMmaSparsity = typename SparseConfig::ElementAMmaSparsity;
-  using ElementAUintCompressed = cute::sparse_elem<ElementASparsity{}, ElementAUint>;
-  using LayoutATag = LayoutATag_;
-  using LayoutA = LayoutATag;
-  using StrideA = cutlass::gemm::TagToStrideA_t<LayoutATag>;
-
-  // * EltE
-  using ElementEMma = typename SparseConfig::ElementEMma;
-  using ElementEMmaRaw = typename SparseConfig::ElementEMmaRaw;
-  using ElementEMmaSparsity = typename SparseConfig::ElementEMmaSparsity;
-  // Data Type for storing one chunk's metadata
-  static constexpr int ElementEBitsPerChunk = typename SparseConfig::ElementEBitsPerChunk{};
-  CUTE_STATIC_ASSERT(ElementEBitsPerChunk == 4, "ElementEBitsPerChunk is 4 for SM90");
-  using ElementEChunk = cute::uint_bit_t<ElementEBitsPerChunk>;
-  CUTE_STATIC_ASSERT(cute::is_same_v<ElementEChunk, cute::uint4_t>, "ElementEChunk is uint4_t for SM90");
-  using ElementESparsityPerChunk = Int<ElementEMmaSparsity{} / (cute::sizeof_bits_v<ElementEMmaRaw> / ElementEBitsPerChunk)>;
-
-  // AtomE
-  using TensorEAtom = typename SparseConfig::TensorEAtom;
-  using TensorEAtomK = typename SparseConfig::TensorEAtomK;
-  using TensorEAtomM = typename SparseConfig::TensorEAtomM;
-
-  static constexpr int ElemsARawPerElementAMmaRaw = typename SparseConfig::ElemsARawPerElementAMmaRaw{};
-  static constexpr int LogicalElemsAPerChunk = typename SparseConfig::LogicalElemsAPerChunk{};
-  static constexpr int PhysicalElemsAPerChunk = typename SparseConfig::PhysicalElemsAPerChunk{};
-  static constexpr int LogicalElemsAMmaRawPerChunk = cutlass::ceil_div(LogicalElemsAPerChunk, ElemsARawPerElementAMmaRaw);
-  static constexpr int PhysicalElemsAMmaRawPerChunk = cutlass::ceil_div(PhysicalElemsAPerChunk, ElemsARawPerElementAMmaRaw);
-
-  // * Alignment
-  static constexpr int TensorEAlignmentM = typename SparseConfig::TensorEAlignmentM{};
-  static constexpr int TensorEAlignmentK = typename SparseConfig::TensorEAlignmentK{};
-  static constexpr int TensorAAlignmentK = typename SparseConfig::TensorAAlignmentK{};
-  static constexpr int TensorAAlignmentM = typename SparseConfig::TensorAAlignmentM{};
-
-  // Required by `device_kernel`
-  static constexpr int MaxThreadsPerBlock = TensorEAtomM{};
-  static constexpr int MinBlocksPerMultiprocessor = 1;
-  using ArchTag = arch::Sm90;
-
-  struct SharedStorage {
-    ElementEMma cEsE[cute::size(TensorEAtom{})];
-    ElementAUintCompressed cACsAC[cute::size(TensorEAtom{})];
-    ElementAUint cAsA[cute::size(TensorEAtom{})];
-  };
-
-  static constexpr int SharedStorageSize = sizeof(SharedStorage);
-
-  struct TransformArguments {
-    void const* ptr_A{nullptr};
-    StrideA dA{};
-    void* ptr_ACompress{nullptr};
-    void* ptr_E{nullptr};
-  };
-
-  using TransformParams = TransformArguments;
-
-  struct Arguments {
-    ProblemShape problem_shape{};
-    TransformArguments transform{};
-    KernelHardwareInfo hw_info{};
-  };
-
-  struct Params {
-    ProblemShape problem_shape{};
-    TransformParams transform{};
-    KernelHardwareInfo hw_info{};
-    void* workspace = nullptr;
-  };
-
-public:
-  static Params
-  to_underlying_arguments(Arguments const& args, void* workspace = nullptr) {
-    CUTLASS_TRACE_HOST("SM90StructuredSparseCompressor::to_underlying_arguments()");
-    return Params{{args.problem_shape},
-                  {args.transform.ptr_A, args.transform.dA, args.transform.ptr_ACompress, args.transform.ptr_E},
-                  {args.hw_info},
-                  workspace};
-  }
-
-  static Status
-  can_implement(Arguments const& args) {
-    auto [M, N, K, L] = args.problem_shape;
-    if (K % LogicalElemsAPerChunk != 0) {
-      CUTLASS_TRACE_HOST("SM90 Sparse Compressor CAN NOT IMPLEMENT: GemmK not multiplier of logical chunk size");
-      return Status::kErrorInvalidProblem;
-    }
-    CUTLASS_TRACE_HOST("SM90StructuredSparseCompressor::can_implement() (True)");
-    return Status::kSuccess;
-  }
-
-  static size_t
-  get_workspace_size(Arguments const& args) {
-    CUTLASS_UNUSED(args);
-    // Backward compatible with host compressor
-    CUTLASS_TRACE_HOST("SM90StructuredSparseCompressor::get_workspace_size() (" << SharedStorageSize << ")");
-    return SharedStorageSize;
-  }
-
-  static Status
-  initialize_workspace(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr,
-    CudaHostAdapter *cuda_adapter = nullptr) {
-    CUTLASS_UNUSED(args);
-    CUTLASS_UNUSED(workspace);
-    CUTLASS_UNUSED(stream);
-    CUTLASS_UNUSED(cuda_adapter);
-    CUTLASS_TRACE_HOST("SM90StructuredSparseCompressor::initialize_workspace()");
-    return Status::kSuccess;
-  }
-
-  static dim3
-  get_grid_shape(Params const& params) {
-    constexpr int MaxAlignmentM = cutlass::const_max(TensorEAlignmentM, TensorAAlignmentM);
-    constexpr int MaxAlignmentK = cutlass::const_max(TensorEAlignmentK, TensorAAlignmentK);
-    const auto [GemmM, GemmN, GemmK, GemmL] = params.problem_shape;
-
-    const int GemmMAlignedMax = cutlass::round_up(GemmM, MaxAlignmentM);
-    const int GemmKAlignedMax = cutlass::round_up(GemmK, MaxAlignmentK);
-
-    const int gridDim_X = cutlass::ceil_div(GemmMAlignedMax, TensorEAtomM{});
-    const int gridDim_Y = cutlass::ceil_div(GemmKAlignedMax, TensorEAtomK{});
-    const int gridDim_Z = GemmL;
-
-    CUTLASS_TRACE_HOST("SM90StructuredSparseCompressor::get_grid_shape() ("
-      << gridDim_X << ", "
-      << gridDim_Y << ", "
-      << gridDim_Z << ")");
-    return dim3(gridDim_X, gridDim_Y, gridDim_Z);
-  }
-
-  static dim3
-  get_block_shape() {
-    CUTLASS_TRACE_HOST("SM90StructuredSparseCompressor::get_block_shape() ("
-      << MaxThreadsPerBlock << ", "
-      << 1 << ", "
-      << 1 << ")");
-    return dim3(MaxThreadsPerBlock, 1, 1);
-  }
-
-  CUTE_DEVICE
-  void
-  operator()(Params params, void* smem_buf = nullptr) {
-    run(params, smem_buf);
-  }
-
-  CUTE_DEVICE
-  static void
-  run(Params params, void* smem_buf = nullptr) {
-    structure_sparse_compress(params, smem_buf);
-  }
-
-private:
-
-  struct MetadataOneChunk1to2 {
-
-    CUTE_DEVICE
-    void set_metadata_bits(int elt_log_idx, int elt_phy_idx) {
-      auto metadata_bits = [&]() -> uint8_t {
-        CUTLASS_ASSERT(elt_log_idx >= 0 && elt_log_idx < 2);
-        switch (elt_log_idx) {
-          case 0:
-            return 0b0100;
-          case 1:
-            return 0b1110;
-          default:
-            CUTE_GCC_UNREACHABLE;
-        }
-      };
-
-      storage_ |= (metadata_bits() << (4 * elt_phy_idx));
-    }
-
-
-    CUTE_DEVICE
-    ElementEChunk storage() const {
-      return ElementEChunk{storage_};
-    }
-
-  private:
-    uint8_t storage_ = 0b0000;
-  };
-
-  struct MetadataOneChunk2to4{
-
-    CUTE_DEVICE
-    void set_metadata_bits(int elt_log_idx, int elt_phy_idx) {
-      auto metadata_bits = [&]() -> uint8_t {
-        CUTLASS_ASSERT(elt_log_idx >= 0 && elt_log_idx < 4);
-        switch (elt_log_idx) {
-          case 0:
-            return 0b00;
-          case 1:
-            return 0b01;
-          case 2:
-            return 0b10;
-          case 3:
-            return 0b11;
-          default:
-            CUTE_GCC_UNREACHABLE;
-        }
-      };
-
-      storage_ |= (metadata_bits() << (2 * elt_phy_idx));
-    }
-
-    CUTE_DEVICE
-    ElementEChunk storage() const {
-      return ElementEChunk{storage_};
-    }
-
-  private:
-    uint8_t storage_ = 0b0000;
-  };
-
-  using MetadataOneChunk = cute::conditional_t<SparseConfig::IsTfmma,
-                                               MetadataOneChunk1to2,
-                                               MetadataOneChunk2to4>;
-
-private:
-
-  CUTE_DEVICE
-  static void
-  structure_sparse_compress(Params params, void* smem_buf) {
-    // * Input Params
-    auto [GemmM, GemmN, GemmK, GemmL] = params.problem_shape;
-    auto [ptr_A, dA, ptr_ACompress, ptr_E] = params.transform;
-    SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(smem_buf);
-
-    [[maybe_unused]] const int gridDim_X = gridDim.x;
-    [[maybe_unused]] const int gridDim_Y = gridDim.y;
-    [[maybe_unused]] const int gridDim_Z = gridDim.z;
-    [[maybe_unused]] const int blockDim_X = blockDim.x;
-
-    // * Global Tensor Layout
-    const cute::Layout layout_gA = make_layout(make_shape(GemmM, GemmK, GemmL), dA);
-    const cute::Layout layout_gAC = SparseConfig::fill_layoutA(params.problem_shape);
-    const cute::Layout layout_gE = SparseConfig::fill_layoutE(params.problem_shape);
-
-    // * Construct Global Tensor
-    const cute::Tensor gA   = make_tensor(make_gmem_ptr(cute::recast_ptr<ElementAUint>(ptr_A)), layout_gA);
-    cute::Tensor gAC_sparse = make_tensor(make_gmem_ptr(cute::recast_ptr<ElementAUintCompressed>(ptr_ACompress)), layout_gAC );
-    cute::Tensor gAC        = cute::recast<ElementAUint>(gAC_sparse);
-    cute::Tensor gE_sparse  = make_tensor(make_gmem_ptr(cute::recast_ptr<ElementEMma>(ptr_E)), layout_gE);
-    cute::Tensor gE         = cute::recast<ElementEMmaRaw>(gE_sparse);
-
-    // * CTA Tensor Layout
-    using cAsA_layout_row = decltype(make_layout(make_shape(TensorEAtomM{}, TensorEAtomK{}), LayoutRight{}));
-    using cAsA_layout_col = decltype(make_layout(make_shape(TensorEAtomM{}, TensorEAtomK{}), LayoutLeft{}));
-    using cAsA_layout     = cute::conditional_t<cute::is_same_v<LayoutATag, layout::RowMajor>, cAsA_layout_row, cAsA_layout_col>;
-    using cACsAC_layout   = decltype(make_layout(make_shape(TensorEAtomM{}, TensorEAtomK{} / ElementASparsity{}), LayoutRight{}));
-    using cEsE_layout     = decltype(make_layout(make_shape(TensorEAtomM{}, TensorEAtomK{} / ElementEMmaSparsity{}), LayoutRight{}));
-
-    CUTE_STATIC_ASSERT(cute::is_static_v<TensorEAtom>, "TensorEAtom needs to be static");
-    CUTE_STATIC_ASSERT(cute::is_static_v<cAsA_layout>, "cAsA_layout needs to be static");
-    CUTE_STATIC_ASSERT(cute::is_static_v<cACsAC_layout>, "cACsAC_layout needs to be static");
-    CUTE_STATIC_ASSERT(cute::is_static_v<cEsE_layout>, "cEsE_layout needs to be static");
-
-    const int blockIdx_X = blockIdx.x;
-    const int blockIdx_Y = blockIdx.y;
-    const int blockIdx_Z = blockIdx.z;
-    const int threadIdx_X = threadIdx.x;
-
-    // * Construct CTA Tensor
-    const auto cta_coord = make_coord(blockIdx_X, blockIdx_Y, blockIdx_Z);
-    cute::Tensor cAgA   = cute::recast<ElementAMmaRawUnit>(local_tile(gA, shape(cAsA_layout{}), cta_coord));
-    cute::Tensor cACgAC = cute::recast<ElementAMmaRawUnit>(local_tile(gAC, shape(cACsAC_layout{}), cta_coord));
-    cute::Tensor cEgE   = local_tile(gE, shape(cEsE_layout{}), cta_coord);
-
-    cute::Tensor cAsA   = cute::recast<ElementAMmaRawUnit>(make_tensor(make_smem_ptr(cute::recast_ptr<ElementAUint>(shared_storage.cAsA)), cAsA_layout{}));
-    cute::Tensor cACsAC = cute::recast<ElementAMmaRawUnit>(make_tensor(make_smem_ptr(cute::recast_ptr<ElementAUint>(shared_storage.cACsAC)), cACsAC_layout{}));
-    cute::Tensor cEsE   = make_tensor(make_smem_ptr(cute::recast_ptr<ElementEMmaRaw>(shared_storage.cEsE)), cEsE_layout{});
-    cute::Tensor cEsE_chunk = cute::recast<ElementEChunk>(cEsE);
-
-    // * Handle in unit of Chunk when compress
-    using OneChunkSizeA  = Int<LogicalElemsAMmaRawPerChunk>;
-    using OneChunkSizeAC = Int<PhysicalElemsAMmaRawPerChunk>;
-    using OneChunkSizeE  = Int<LogicalElemsAPerChunk / ElementESparsityPerChunk{}>;
-    using NumOneChunkK   = Int<cutlass::ceil_div(TensorEAtomK{}, LogicalElemsAPerChunk)>;
-
-    cute::Tensor cAsA_log_chunk   = logical_divide(cAsA, make_shape(_, OneChunkSizeA{}));
-    cute::Tensor cACsAC_log_chunk = logical_divide(cACsAC, make_shape(_, OneChunkSizeAC{}));
-    cute::Tensor cEsE_log_chunk   = logical_divide(cEsE_chunk, make_shape(_, OneChunkSizeE{}));
-
-    // * Corner Case Handle
-    const auto GemmM_within_Cta = (GemmM - blockIdx_X * TensorEAtomM{} > TensorEAtomM{}) ? TensorEAtomM{} : GemmM - blockIdx_X * TensorEAtomM{};
-    const auto GemmK_within_Cta = ( (GemmK - blockIdx_Y * TensorEAtomK{} > TensorEAtomK{}) ? TensorEAtomK{} : GemmK - blockIdx_Y * TensorEAtomK{} ) / ElemsARawPerElementAMmaRaw;
-    const auto GemmK_NumOneChunk_within_Cta = GemmK_within_Cta / LogicalElemsAMmaRawPerChunk;
-
-    const auto GemmMAlignedAC = cutlass::round_up(GemmM, TensorAAlignmentM);
-    const auto GemmKAlignedAC = cutlass::round_up(GemmK, TensorAAlignmentK);
-    const auto GemmMAlignedAC_within_Cta = (GemmMAlignedAC - blockIdx_X * TensorEAtomM{} > TensorEAtomM{}) ? TensorEAtomM{} : GemmMAlignedAC - blockIdx_X * TensorEAtomM{};
-    const auto GemmKAlignedAC_within_Cta = ( (GemmKAlignedAC - blockIdx_Y * TensorEAtomK{} > TensorEAtomK{}) ? TensorEAtomK{} : GemmKAlignedAC - blockIdx_Y * TensorEAtomK{} ) / ElemsARawPerElementAMmaRaw;
-
-    // * Clear CTA Smem Tensor
-    cooperative_clear<MaxThreadsPerBlock>(threadIdx_X, cACsAC);
-    cooperative_clear<MaxThreadsPerBlock>(threadIdx_X, cEsE);
-
-    // * Input CTA Tensor G to S
-    if (GemmM_within_Cta == TensorEAtomM{} && GemmK_within_Cta == TensorEAtomK{}) {
-      copy_vec_pred<false, LayoutATag>(cAgA, cAsA, threadIdx_X, GemmM_within_Cta, GemmK_within_Cta);
-    }
-    else {
-      copy_vec_pred<true, LayoutATag>(cAgA, cAsA, threadIdx_X, GemmM_within_Cta, GemmK_within_Cta);
-    }
-
-    // * Compress
-    // cACsAC is always row major order
-    // TensorEAtomM threads perform the compression, each thread compress one row
-    const int row_i = threadIdx_X;
-    if (row_i < GemmM_within_Cta) {
-
-      CUTE_UNROLL
-      for (int col_chunk_i = 0; col_chunk_i < NumOneChunkK{}; ++col_chunk_i) {
-        if (col_chunk_i < GemmK_NumOneChunk_within_Cta) {
-          // Compress is handled in unit of ElementAMmaRawUnit
-          cute::Tensor tAsA   = cAsA_log_chunk(row_i, make_coord(_, col_chunk_i));
-          cute::Tensor tACsAC = cACsAC_log_chunk(row_i, make_coord(_, col_chunk_i));
-          cute::Tensor tEsE   = cEsE_log_chunk(row_i, make_coord(_, col_chunk_i));
-
-          int non_zero_cnt = 0;
-          // None zero element indx
-          // e.g.
-          //  2:4 sparsity [x 0 0 x]
-          //  non_zero_elt_log_idx = [0, 3]
-          int non_zero_elt_log_idx[OneChunkSizeAC{}] = { 0 };
-
-          // * Find None Zero Element Idx within Chunk
-          CUTE_UNROLL
-          for (int elt_log_idx = 0; elt_log_idx < OneChunkSizeA{}; ++elt_log_idx) {
-            ElementAMmaRawUnit elem_A = tAsA[elt_log_idx];
-            if ( elem_A != ElementAMmaRawUnit{0} ) {
-              non_zero_elt_log_idx[non_zero_cnt] = elt_log_idx;
-              tACsAC[non_zero_cnt] = elem_A;
-              non_zero_cnt++;
-            }
-          }
-
-          // * Corner Case for 2:4 sparsity
-          if constexpr (cute::sizeof_bits_v<ElementAMmaRawUnit> < 32) {
-            // i.e. [0 0 0 x] -> [(0) 0 0 x]
-            if (non_zero_cnt == 1 && non_zero_elt_log_idx[0] == 3) {
-              tACsAC[1] = tACsAC[0];
-              tACsAC[0] = ElementAMmaRawUnit{0};
-              non_zero_elt_log_idx[0] = 0;
-              non_zero_elt_log_idx[1] = 3;
-            }
-            // i.e. [0 0 x 0] -> [0 0 x (0)]
-            // i.e. [0 x 0 0] -> [0 x 0 (0)]
-            // i.e. [x 0 0 0] -> [x 0 0 (0)]
-            else if (non_zero_cnt == 1) {
-              tACsAC[1] = ElementAMmaRawUnit{0};
-              non_zero_elt_log_idx[1] = 3;
-            }
-          }
-
-          // * Set Metadata Bits
-          MetadataOneChunk metadata_one_chunk;
-          CUTE_UNROLL
-          for (int elt_phy_idx = 0; elt_phy_idx < OneChunkSizeAC{}; elt_phy_idx++) {
-            metadata_one_chunk.set_metadata_bits(non_zero_elt_log_idx[elt_phy_idx], elt_phy_idx);
-          }
-          tEsE[0] = metadata_one_chunk.storage();
-
-        }
-        else {
-          break;
-        }
-      }
-    }
-
-    // * Sync after Compress
-    __syncthreads();
-
-    // * Output Cta Tensor S to G
-    if (GemmM_within_Cta > 0 && GemmK_within_Cta > 0) {
-      constexpr int MaxVecBits = 128; // STG.128
-      cute::cooperative_copy<MaxThreadsPerBlock, MaxVecBits>(threadIdx_X, cEsE, cEgE);
-    }
-
-    if (GemmMAlignedAC_within_Cta == TensorEAtomM{} && GemmKAlignedAC_within_Cta == TensorEAtomK{}) {
-      copy_vec_pred<false, LayoutATag>(cACsAC, cACgAC, threadIdx_X, GemmMAlignedAC_within_Cta, (GemmKAlignedAC_within_Cta / ElementASparsity::value));
-    }
-    else {
-      copy_vec_pred<true, LayoutATag>(cACsAC, cACgAC, threadIdx_X, GemmMAlignedAC_within_Cta, (GemmKAlignedAC_within_Cta / ElementASparsity::value));
-    }
-
-  } // end of structure_sparse_compress()
-
-  template<uint32_t NumThreads,
-           typename TensorSrc>
-  CUTE_DEVICE
-  static void
-  cooperative_clear(
-    uint32_t const& tid,
-    TensorSrc dSrc) {
-    
-    auto dSrctSrc = local_partition(dSrc, make_layout(make_shape(NumThreads, _1{})), tid);
-    cute::clear(dSrctSrc);
-
-    // Sync all thread data access
-    __syncthreads();
-  }
-
-  template <bool pred,
-            typename LayoutTag,
-            typename TensorSrc,
-            typename TensorDst>
-  CUTE_DEVICE
-  static void
-  copy_vec_pred(
-      TensorSrc dSrc,
-      TensorDst dDst,
-      int threadIdx_X,
-      int valid_rows,
-      int valid_cols) {
-
-    constexpr bool IsRowMajor = cute::is_same_v<LayoutTag, cutlass::layout::RowMajor>;
-    using Element = typename TensorSrc::element_type;
-    CUTE_STATIC_ASSERT(cute::is_static_v<decltype(shape(dSrc))>, "shape(dSrc) needs to be static");
-    CUTE_STATIC_ASSERT(cute::is_static_v<decltype(shape(dDst))>, "shape(dDst) needs to be static");
-    CUTE_STATIC_ASSERT(cute::sizeof_bits_v<typename TensorSrc::element_type> == cute::sizeof_bits_v<typename TensorDst::element_type>,
-      "dSrc and dDst need to have same element bit width");
-    CUTE_STATIC_ASSERT(cute::size(dSrc) == cute::size(dDst), "dSrc and dDst need to have same size");
-
-    // ValueShape
-    using ValueShape = 
-      cute::conditional_t<IsRowMajor,
-                          Shape<Int<1>, Int<128 / sizeof_bits_v<Element>>>,
-                          Shape<Int<128 / sizeof_bits_v<Element>>, Int<1>>>
-      ;
-
-    constexpr int ValueShapeRows = shape<0>(ValueShape{});
-    constexpr int ValueShapeCols = shape<1>(ValueShape{});
-
-    // ThreadShape
-    using ThreadShape = 
-      cute::conditional_t<IsRowMajor,
-                          Shape<Int<MaxThreadsPerBlock / (shape<1>(dSrc) / ValueShapeCols)>, Int<                     (shape<1>(dSrc) / ValueShapeCols)>>,
-                          Shape<Int<                     (shape<0>(dSrc) / ValueShapeRows)>, Int<MaxThreadsPerBlock / (shape<0>(dSrc) / ValueShapeRows)>>>
-      ;
-
-    constexpr int ThreadShapeRows = shape<0>(ThreadShape{});
-    constexpr int ThreadShapeCols = shape<1>(ThreadShape{});
-
-    const int threadIdx_X_row = threadIdx_X / ThreadShapeCols;
-    const int threadIdx_X_col = threadIdx_X % ThreadShapeCols;
-
-    // Row Major
-    if constexpr (IsRowMajor) {
-      CUTE_UNROLL
-      for (int iter_row_blk = 0; iter_row_blk < cutlass::ceil_div(shape<0>(dSrc), ThreadShapeRows * ValueShapeRows); ++iter_row_blk) {
-        CUTE_UNROLL
-        for (int col_chunk_i = 0; col_chunk_i < cutlass::ceil_div(shape<1>(dSrc) , ThreadShapeCols * ValueShapeCols); ++col_chunk_i) {
-          CUTE_UNROLL
-          for (int iter_row_thr = 0; iter_row_thr < ValueShapeRows; ++iter_row_thr) {
-            CUTE_UNROLL
-            for (int iter_col_thr = 0; iter_col_thr < ValueShapeCols; ++iter_col_thr) {
-              const int row_i = (iter_row_blk * ThreadShapeRows + threadIdx_X_row) * ValueShapeRows + iter_row_thr;
-              const int col_i = (col_chunk_i * ThreadShapeCols + threadIdx_X_col) * ValueShapeCols + iter_col_thr;
-              if constexpr ( (not pred) 
-              ) {
-                dDst(row_i, col_i) = dSrc(row_i, col_i);
-              }
-              else {
-                if (row_i < valid_rows && col_i < valid_cols) {
-                  dDst(row_i, col_i) = dSrc(row_i, col_i);
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-    // Col Major
-    else {
-      CUTE_UNROLL
-      for (int col_chunk_i = 0; col_chunk_i < cutlass::ceil_div(shape<1>(dSrc) , ThreadShapeCols * ValueShapeCols); ++col_chunk_i) {
-        CUTE_UNROLL
-        for (int iter_row_blk = 0; iter_row_blk < cutlass::ceil_div(shape<0>(dSrc), ThreadShapeRows * ValueShapeRows); ++iter_row_blk) {
-          CUTE_UNROLL
-          for (int iter_col_thr = 0; iter_col_thr < ValueShapeCols; ++iter_col_thr) {
-            CUTE_UNROLL
-            for (int iter_row_thr = 0; iter_row_thr < ValueShapeRows; ++iter_row_thr) {
-              const int row_i = (iter_row_blk * ThreadShapeRows + threadIdx_X_row) * ValueShapeRows + iter_row_thr;
-              const int col_i = (col_chunk_i * ThreadShapeCols + threadIdx_X_col) * ValueShapeCols + iter_col_thr;
-              if constexpr ( (not pred)
-              ) {
-                dDst(row_i, col_i) = dSrc(row_i, col_i);
-              }
-              else {
-                if (row_i < valid_rows && col_i < valid_cols) {
-                  dDst(row_i, col_i) = dSrc(row_i, col_i);
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  
-    // Sync all thread data access
-    __syncthreads();
-  } // end of copy_vec_pred()
-  
-};
-
-}  // namespace cutlass::transform::kernel
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/kernel/sparse_gemm_compressor.hpp b/lightllm-kernel/cutlass/include/cutlass/transform/kernel/sparse_gemm_compressor.hpp
deleted file mode 100755
index 51f42e9fd..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/transform/kernel/sparse_gemm_compressor.hpp
+++ /dev/null
@@ -1,284 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-  \brief Compress utils for structured sparse kernels
-*/
-
-#pragma once
-
-#include <algorithm>                       // std::fill
-#include <array>                           // std::array
-#include <random>                          // std::mt19937
-
-#include "cute/numeric/numeric_types.hpp"  // cute::sizeof_bits_v
-#include "cute/tensor.hpp"                 // cute::Tensor, cute::make_tensor
-#include "cutlass/arch/arch.h"             // cutlass::arch::SmXY
-#include "cutlass/gemm/gemm.h"             // cutlass::TagToStrideA_t
-#include "cutlass/fast_math.h"             // cutlass::ceil_div, cutlass::round_up
-#include "cutlass/numeric_size.h"          // cutlass::bits_to_bytes
-
-#include "cutlass/transform/kernel/sm90_sparse_gemm_compressor.hpp"
-
-namespace cutlass::transform::kernel {
-
-template<
-  class ProblemShape_,
-  class ElementA_,
-  class LayoutATag_,
-  class SparseConfig_
->
-class StructuredSparseCompressorUtility {
-public:
-  using SparseConfig = SparseConfig_;
-  using ProblemShape = ProblemShape_;
-
-  //* EltA
-  using ElementA = ElementA_;
-  using LayoutATag = LayoutATag_;
-  using StrideA = cutlass::gemm::TagToStrideA_t<LayoutATag>;
-  using ElementAMmaRaw = typename SparseConfig::ElementAMmaRaw;
-  using ElementASparsity = typename SparseConfig::ElementASparsity;
-  using ElementAMmaSparsity = typename SparseConfig::ElementAMmaSparsity;
-
-  //* EltE
-  using ElementEMmaRaw = typename SparseConfig::ElementEMmaRaw;
-  using ElementEMmaSparsity = typename SparseConfig::ElementEMmaSparsity;
-
-  //* AtomE
-  using TensorEAtom = typename SparseConfig::TensorEAtom;
-  using TensorEAtomK = typename SparseConfig::TensorEAtomK;
-  using TensorEAtomM = typename SparseConfig::TensorEAtomM;
-
-  static constexpr int ElemsARawPerElementAMmaRaw = typename SparseConfig::ElemsARawPerElementAMmaRaw{};
-  static constexpr int LogicalElemsAPerChunk = typename SparseConfig::LogicalElemsAPerChunk{};
-  static constexpr int PhysicalElemsAPerChunk = typename SparseConfig::PhysicalElemsAPerChunk{};
-  static constexpr int LogicalElemsAMmaRawPerChunk = cutlass::ceil_div(LogicalElemsAPerChunk, ElemsARawPerElementAMmaRaw);
-  static constexpr int PhysicalElemsAMmaRawPerChunk = cutlass::ceil_div(PhysicalElemsAPerChunk, ElemsARawPerElementAMmaRaw);
-
-  //* Alignment
-  static constexpr int TensorEAlignmentM = typename SparseConfig::TensorEAlignmentM{};
-  static constexpr int TensorEAlignmentK = typename SparseConfig::TensorEAlignmentK{};
-  static constexpr int TensorAAlignmentK = typename SparseConfig::TensorAAlignmentK{};
-  static constexpr int TensorAAlignmentM = typename SparseConfig::TensorAAlignmentM{};
-
-  StructuredSparseCompressorUtility() = default;
-
-  StructuredSparseCompressorUtility(ProblemShape problem, StrideA dA) {
-    set_problem_size(problem, dA);
-  }
-
-  void set_problem_size(ProblemShape problem, StrideA dA_) {
-    M = cute::size<0>(problem);
-    K = cute::size<2>(problem);
-    L = cute::size<3>(problem);
-
-    // The following three vars are logical elem count!
-    K_alignedA  = round_up(K, TensorAAlignmentK);
-    M_alignedA  = round_up(M, TensorAAlignmentM);
-    K_alignedE = round_up(K, TensorEAlignmentK);
-    M_alignedE = round_up(M, TensorEAlignmentM);
-
-    dA = dA_;
-  }
-
-  /**
-   * @brief Get the TensorE number of ElementE along K after alignment requirement
-   * 
-   * @return int : number of ElementE (uint8_t) along K-dim
-   */
-  int get_metadata_m_physical() const {
-    return M_alignedE;
-  }
-
-  /**
-   * @brief Get the TensorE number of ElementE along M after alignment requirement
-   * 
-   * @return int : number of ElementE (uint8_t) along M-dim
-   */
-  int get_metadata_k_physical() const {
-    return K_alignedE / ElementEMmaSparsity{};
-  }
-
-  /**
-   * @brief Get the TensorACompressed number of ElementA along K after alignment requirement
-   * 
-   * @return int : number of ElementA along K-dim
-   */
-  int get_tensorA_k_physical() const {
-    return K_alignedA / ElementASparsity{};
-  }
-
-  /**
-   * @brief Get the TensorACompressed number of ElementA along M after alignment requirement
-   * 
-   * @return int : number of ElementA along M-dim
-   */
-  int get_tensorA_m_physical() const {
-    return M_alignedA;
-  }
-
-  /**
-   * @brief Get the TensorACompressed Bytes
-   * 
-   * @return uint64_t bytes
-   */
-  uint64_t get_compressed_tensor_A_bytes() const {
-    const auto tensor_a_comp_num_elt_a = get_tensorA_m_physical() * get_tensorA_k_physical() * L;
-    const auto tensor_a_comp_bytes = cutlass::bits_to_bytes<uint64_t>(tensor_a_comp_num_elt_a * cute::sizeof_bits_v<ElementA>);
-    return tensor_a_comp_bytes;
-  }
-
-  /**
-   * @brief Get the TensorA Bytes
-   * 
-   * @return uint64_t bytes
-   */
-  uint64_t get_raw_tensor_A_bytes() const {
-    const auto tensor_a_num_elt_a = uint64_t(M) * uint64_t(K) * uint64_t(L);
-    const auto tensor_a_bytes = cutlass::bits_to_bytes<uint64_t>(tensor_a_num_elt_a * cute::sizeof_bits_v<ElementA>);
-    return tensor_a_bytes;
-  }
-
-  /**
-   * @brief Get the TensorE Bytes
-   * 
-   * @return uint64_t bytes
-   */
-  uint64_t get_tensor_E_bytes() const {
-    const auto tensor_e_num_elt_a = uint64_t(get_metadata_m_physical()) * uint64_t(get_metadata_k_physical()) * uint64_t(L);
-    const auto tensor_e_bytes = cutlass::bits_to_bytes<uint64_t>(tensor_e_num_elt_a * cute::sizeof_bits_v<ElementEMmaRaw>);
-    return tensor_e_bytes;
-  }
-
-  constexpr auto fill_layoutA_from_compressor() const {
-    return SparseConfig::fill_layoutA(cute::make_tuple(M,_1{},K,L));
-  }
-
-  constexpr auto fill_layoutE_from_compressor() const {
-    return SparseConfig::fill_layoutE(cute::make_tuple(M,_1{},K,L));
-  }
-
-  void structure_sparse_zero_mask_fill(void* host_a_ptr, uint64_t seed) {
-    
-    constexpr int ChunkSize = LogicalElemsAMmaRawPerChunk;
-    using ChunkElement = cute::uint_bit_t<cute::sizeof_bits_v<ElementAMmaRaw>>;
-
-    cute::Tensor gA_eltA = cute::make_tensor(
-        cute::recast_ptr<ElementA>(host_a_ptr),
-        cute::make_layout(make_shape(M, K, L), dA));
-
-    // Input TensorA is handled in unit of ElementAMmaRaw instead of ElementA
-    cute::Tensor gA = cute::recast<ChunkElement>(gA_eltA);
-
-    // Extract out the Chunk from K-mode
-    Tensor gA_chunk = cute::zipped_divide(gA, cute::Shape<_1,cute::Int<ChunkSize>>{}); // (Chunk, Rest)
-
-    // Half of the data is zero to indicate sparsityA = 2
-    std::array<int, ChunkSize> nnzb_indicator{};
-    for (size_t i = 1; i < nnzb_indicator.size(); i += 2) {
-      nnzb_indicator.at(i) = 1;
-    }
-
-    std::mt19937 rng(seed);
-    auto rest_shape = cute::shape<1>(gA_chunk);
-    for (auto iter = cute::make_coord_iterator(rest_shape); iter != cute::ForwardCoordIteratorSentinel{}; ++iter) {
-      std::shuffle(nnzb_indicator.begin(), nnzb_indicator.end(), rng);
-      for (int c = 0; c < size<0>(gA_chunk); ++c) {                        // for each elem within chunk
-        if (nnzb_indicator[c] == 0) {
-          gA_chunk(c, *iter) = ChunkElement{0};
-        }
-      }  // end of within chunk
-    }    // end of chunk_idx
-  }
-
-  int M{-1};
-  int K{-1};
-  int L{-1};
-  StrideA dA{};
-
-private:
-  int K_alignedA{-1};
-  int M_alignedA{-1};
-  int K_alignedE{-1};
-  int M_alignedE{-1};
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-template<
-  class ProblemShape,
-  class ElementA,
-  class LayoutATag,
-  class SparseConfig,
-  class ArchTag
->
-struct StructuredSparseCompressorSelector {
-  static_assert(cutlass::detail::dependent_false<ArchTag>,
-      "Could not select a structured sparse compressor for given parameters.");
-};
-
-template<
-  class ProblemShape,
-  class ElementA,
-  class LayoutATag,
-  class SparseConfig
->
-struct StructuredSparseCompressorSelector<
-    ProblemShape,
-    ElementA,
-    LayoutATag,
-    SparseConfig,
-    arch::Sm90> {
-  using Compressor = SM90StructuredSparseCompressor<
-    ProblemShape,
-    ElementA,
-    LayoutATag,
-    SparseConfig
-  >;
-};
-
-template<
-  class ProblemShape,
-  class ElementA,
-  class LayoutATag,
-  class SparseConfig,
-  class ArchTag
->
-using StructuredSparseCompressor = typename StructuredSparseCompressorSelector<
-    ProblemShape,
-    ElementA,
-    LayoutATag,
-    SparseConfig,
-    ArchTag
->::Compressor;
-
-} // End namespace cutlass::transform::kernel
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/pitch_linear_thread_map.h b/lightllm-kernel/cutlass/include/cutlass/transform/pitch_linear_thread_map.h
deleted file mode 100755
index 0fcb48e56..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/transform/pitch_linear_thread_map.h
+++ /dev/null
@@ -1,926 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing how threads are mapped to a given tile.
-
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/layout/pitch_linear.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace transform {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Strip-mines a pitch-linear tile among a given number of threads, first along
-/// the contiguous dimension then along the strided dimension.
-///
-/// The tile must be divisible by the thread count such that all threads may
-/// execute the same number of iterations with the same delta to exhaustively
-/// cover the tile.
-///
-/// This class satisfies the "RegularThreadMapping" concept.
-///
-/// This ThreadMap is used by SIMT kernels and operand E of the sparse tensor
-/// kernels.
-template <
-  typename Shape_,
-  int Threads,
-  int ElementsPerAccess = 1
->
-struct PitchLinearStripminedThreadMap {
-  
-  /// Tensor coordinate
-  using TensorCoord = layout::PitchLinearCoord;
-
-  /// Tile shape
-  using Shape = Shape_;
-
-  /// Number of threads total
-  static int const kThreads = Threads;
-
-  /// Extract vector length from Layout
-  static int const kElementsPerAccess = ElementsPerAccess;
-
-  /// Shape of access by each thread
-  using ThreadAccessShape = layout::PitchLinearShape<kElementsPerAccess, 1>;
-
-  /// Internal implementation details
-  struct Detail {
-
-    static_assert(!(Shape::kContiguous % kElementsPerAccess), "");
-
-    /// Shape of the tile in units of vectors
-    using ShapeVec = layout::PitchLinearShape<
-      Shape::kContiguous / kElementsPerAccess,
-      Shape::kStrided
-    >;
-
-    static_assert((Threads < ShapeVec::kContiguous && !(ShapeVec::kContiguous % kThreads)) ||
-                      (!(kThreads % ShapeVec::kContiguous)),
-                  "Shape must be divisible by number of iterations of each thread.");
-  };
-
-  /// Number of iterations by each thread
-  using Iterations = typename platform::conditional<
-      Threads >= Detail::ShapeVec::kContiguous,
-      layout::PitchLinearShape<
-          1,
-          // Redo the comparison here to work around divide by zero compiler
-          // error.  The compiler evaluates both path of platform::conditional.
-          (Threads >= Detail::ShapeVec::kContiguous
-               ? (Detail::ShapeVec::kStrided + (kThreads / Detail::ShapeVec::kContiguous - 1)) /
-                     (kThreads / Detail::ShapeVec::kContiguous)
-               : 0)>,
-      layout::PitchLinearShape<Detail::ShapeVec::kContiguous / kThreads,
-                               Detail::ShapeVec::kStrided>>::type;
-  
-
-  /// Interval between accesses along each dimension of the tensor's logical coordinate space
-  /// (in units of Elements)
-  using Delta = typename platform::conditional<
-    Threads >= Detail::ShapeVec::kContiguous,
-    layout::PitchLinearShape<
-      1,
-      kThreads / Detail::ShapeVec::kContiguous
-    >,
-    layout::PitchLinearShape<
-      kThreads * kElementsPerAccess,
-      1
-    >
-  >::type;
-
-  /// Shape of the tile in units of vectors
-  using StorageShape = typename platform::conditional<
-      Threads >= Detail::ShapeVec::kContiguous,
-      layout::PitchLinearShape<Shape::kContiguous,
-                               Iterations::kStrided*(kThreads / Detail::ShapeVec::kContiguous)>,
-      layout::PitchLinearShape<Shape::kContiguous, Shape::kStrided>>::type;
-
-  /// Maps thread ID to a coordinate offset within the tensor's logical coordinate space
-  /// (in units of Elements)
-  CUTLASS_HOST_DEVICE
-  static TensorCoord initial_offset(int thread_id) {
-    return TensorCoord(
-      (thread_id % Detail::ShapeVec::kContiguous) * kElementsPerAccess, 
-      thread_id / Detail::ShapeVec::kContiguous);
-  }
-};
-
-/// This ThreadMap is used by GEMV
-template <
-  typename Shape,
-  int Threads,
-  int ElementsPerAccess = 1
->
-struct PitchLinearTilePolicyStripminedThreadContiguous
-{
- static_assert((Shape::kContiguous % (Threads * ElementsPerAccess)) == 0,
-              "Contiguous shape must divide number of threads");
-
-  using TensorCoord = layout::PitchLinearCoord;
-
-  static int const kThreads = Threads;
-  static int const kElementsPerAccess = ElementsPerAccess;
-
-  using Iterations = layout::PitchLinearShape<
-                      Shape::kContiguous / (kThreads * kElementsPerAccess),
-                      Shape::kStrided>;
-
-  using Delta = layout::PitchLinearShape<1, 1>;
-
-  CUTLASS_HOST_DEVICE
-  static TensorCoord initial_offset(int thread_id)
-  {
-    return TensorCoord(thread_id * Iterations::kContiguous * kElementsPerAccess, 0);
-  }
-};
-
-template <
-  typename Shape,
-  int Threads,
-  int ElementsPerAccess = 1
->
-struct PitchLinearTilePolicyStripminedThreadStrided
-{
-  static_assert((Shape::kStrided % Threads == 0),
-                "Strided shape must divide number of threads");
-
-  using TensorCoord = layout::PitchLinearCoord;
-
-  static int const kThreads = Threads;
-  static int const kElementsPerAccess = ElementsPerAccess;
-
-  using Iterations = layout::PitchLinearShape<
-                      Shape::kContiguous / kElementsPerAccess,
-                      Shape::kStrided / kThreads>;
-
-  using Delta = layout::PitchLinearShape<1, 1>;
-
-  using ShapeVec = Shape;
-
-  CUTLASS_HOST_DEVICE
-  static TensorCoord initial_offset(int thread_id)
-  {
-
-    return TensorCoord(0, thread_id * Iterations::kStrided);
-  }
-};
-
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Policy defining a warp-raked arrangement in which a shape is partitioned into contiguous
-/// elements.
-///
-/// This ThreadMap is used by tensor core kernels.
-template <
-  typename Shape_,
-  int Threads,
-  typename WarpThreadArrangement_,
-  int ElementsPerAccess = 1
->
-struct PitchLinearWarpRakedThreadMap {
-
-  /// Tensor coordinate
-  using TensorCoord = layout::PitchLinearCoord;
-
-  /// Tile shape
-  using Shape = Shape_;
-
-  /// Number of threads total
-  static int const kThreads = Threads;
-
-  /// Extract vector length from Layout
-  static int const kElementsPerAccess = ElementsPerAccess;
-
-  /// Shape of access by each thread
-  using ThreadAccessShape = layout::PitchLinearShape<kElementsPerAccess, 1>;
-
-  /// Internal details made public to facilitate introspection
-  struct Detail {
-
-    /// Fixed arrangement of threads within a warp (units of threads).
-    using WarpThreadArrangement = WarpThreadArrangement_;
-
-    /// Number of threads per warp
-    static int const kWarpSize = WarpThreadArrangement::kCount;
-
-    /// Number of participating warps
-    static int const kWarpCount = kThreads / kWarpSize;
-
-    static_assert(
-      !(Shape::kContiguous % kElementsPerAccess),
-      "Shape must be divisible by vector length.");
-
-    /// Compute the 'shape' of the overall tile in units of vectors
-    using ShapeInAccesses = layout::PitchLinearShape<
-      Shape::kContiguous / kElementsPerAccess,
-      Shape::kStrided
-    >;
-
-    static_assert(
-      !(ShapeInAccesses::kContiguous % WarpThreadArrangement::kContiguous),
-      "ShapeInAccesses must be divisible by WarpThreadArrangement.");
-
-    static_assert(
-      !(ShapeInAccesses::kStrided % WarpThreadArrangement::kStrided),
-      "ShapeInAccesses must be divisible by WarpThreadArrangement.");
-
-    // compute number of warp-level accesses total
-    using WarpAccessIterations = layout::PitchLinearShape<
-      ShapeInAccesses::kContiguous / WarpThreadArrangement::kContiguous,
-      ShapeInAccesses::kStrided / WarpThreadArrangement::kStrided
-    >;
-
-    // Divide it into the number of warps, first partitioning the strided dimension then the
-    // contiguous.
-    static int const kWarpsStrided =
-        (WarpAccessIterations::kStrided >= kWarpCount
-             ? kWarpCount
-             : WarpAccessIterations::kStrided);
-
-    static int const kWarpsContiguous =
-        (kWarpCount > WarpAccessIterations::kStrided
-             ? kWarpCount / kWarpsStrided
-             : 1);
-
-    /// Arrangement of warps within a threadblock-scoped tile
-    using WarpArrangement = layout::PitchLinearShape<
-      kWarpsContiguous, kWarpsStrided
-    >;
-  };
-
-  ///< Iterations along each dimension (concept: PitchLinearShape)
-  using Iterations = layout::PitchLinearShape<
-    Detail::WarpAccessIterations::kContiguous / Detail::kWarpsContiguous,
-    Detail::WarpAccessIterations::kStrided / Detail::kWarpsStrided
-  >;
-
-  static_assert(Iterations::kCount,
-    "Number of iterations must be non-zero");
-
-  ///< Delta betweeen accesses (units of elements, concept: PitchLinearShape)
-  using Delta = layout::PitchLinearShape<
-    Detail::WarpThreadArrangement::kContiguous * kElementsPerAccess,
-    Detail::WarpThreadArrangement::kStrided
-  >;
-
-  /// Maps thread ID to a coordinate offset within the tensor's logical coordinate space
-  CUTLASS_HOST_DEVICE
-  static TensorCoord initial_offset(int thread_id) {
-
-    int warp_id = (thread_id / Detail::kWarpSize);
-    int lane_id = (thread_id % Detail::kWarpSize);
-
-    //
-    // compute warp-level offset
-    //
-
-    // This is the shape of the entire area covered by a warp's memory access (in units of vectors)
-    layout::PitchLinearCoord warp_footprint{
-      Detail::WarpThreadArrangement::kContiguous * Iterations::kContiguous,
-      Detail::WarpThreadArrangement::kStrided * Iterations::kStrided
-    };
-
-    // This is the offset of a specific warp (in units of vectors)
-    layout::PitchLinearCoord warp_offset{
-      (warp_id % Detail::kWarpsContiguous),
-      (warp_id / Detail::kWarpsContiguous)
-    };
-
-    // This is the offset of a specific thread within a warp (units of vectors)
-    layout::PitchLinearCoord thread_offset_in_warp{
-      lane_id % Detail::WarpThreadArrangement::kContiguous,
-      lane_id / Detail::WarpThreadArrangement::kContiguous
-    };
-
-    // This is the offset of a thread within a threadblock tile (units of vectors)
-    layout::PitchLinearCoord thread_offset_in_threadblock_tile_vec =
-      warp_footprint * warp_offset + thread_offset_in_warp;
-
-    // This is the offset of a thread within a threadblock tile (units of elements)
-    layout::PitchLinearCoord thread_offset_in_threadblock_tile_base{
-      thread_offset_in_threadblock_tile_vec.contiguous() * kElementsPerAccess,
-      thread_offset_in_threadblock_tile_vec.strided()
-    };
-
-    return thread_offset_in_threadblock_tile_base;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Policy defining a warp-raked arrangement in which a shape is partitioned into contiguous
-/// elements. Warps are arranged based on a stride.
-///
-/// This ThreadMap is used by tensor core kernels for NCxHWx layout.
-template <
-  typename Shape_,
-  int Threads,
-  typename WarpThreadArrangement_,
-  int ElementsPerAccess = 1
->
-struct PitchLinearStridedWarpRakedThreadMap {
-
-  /// Tensor coordinate
-  using TensorCoord = layout::PitchLinearCoord;
-
-  /// Tile shape
-  using Shape = Shape_;
-
-  /// Number of threads total
-  static int const kThreads = Threads;
-
-  using WarpThreadArrangement = WarpThreadArrangement_;
-
-  /// Extract vector length from Layout
-  static int const kElementsPerAccess = ElementsPerAccess;
-
-  /// Base ThreadMap
-  using BaseThreadMap = PitchLinearWarpRakedThreadMap<
-    Shape,
-    kThreads,
-    WarpThreadArrangement,
-    kElementsPerAccess
-  >;
-
-  /// Shape of access by each thread
-  using ThreadAccessShape = typename BaseThreadMap::ThreadAccessShape;
-
-
-  struct Detail {
-
-    using WarpThreadArrangement = WarpThreadArrangement_;
-
-    using WarpAccessIterations = typename BaseThreadMap::Detail::WarpAccessIterations;
-
-    static int const kWarpSize = BaseThreadMap::Detail::kWarpSize;
-
-    static int const kWarpCount = BaseThreadMap::Detail::kWarpCount;
-
-    using ShapeInAccesses = typename BaseThreadMap::Detail::ShapeInAccesses;
-
-    // Divide it into the number of warps, first partitioning the contiguous dimension then the
-    // stride.
-    static int const kWarpsContiguous =
-        (WarpAccessIterations::kContiguous >= kWarpCount
-             ? kWarpCount
-             : WarpAccessIterations::kContiguous);
-
-    static int const kWarpsStrided =
-        (kWarpCount > WarpAccessIterations::kContiguous
-             ? kWarpCount / kWarpsContiguous
-             : 1);
-
-    /// Arrangement of warps within a threadblock-scoped tile
-    using WarpArrangement = layout::PitchLinearShape<
-      kWarpsContiguous, kWarpsStrided
-    >;
-
-  };
-
-  ///< Iterations along each dimension (concept: PitchLinearShape)
-  using Iterations = layout::PitchLinearShape<
-    Detail::WarpAccessIterations::kContiguous / Detail::kWarpsContiguous,
-    Detail::WarpAccessIterations::kStrided / Detail::kWarpsStrided
-  >;
-
-  static_assert(Iterations::kCount,
-    "Number of iterations must be non-zero");
-
-  ///< Delta betweeen accesses (units of elements, concept: PitchLinearShape)
-  using Delta = typename BaseThreadMap::Delta;
-
-  /// Maps thread ID to a coordinate offset within the tensor's logical coordinate space
-  CUTLASS_HOST_DEVICE
-  static TensorCoord initial_offset(int thread_id) {
-
-    int warp_id = (thread_id / Detail::kWarpSize);
-    int lane_id = (thread_id % Detail::kWarpSize);
-
-    //
-    // compute warp-level offset
-    //
-
-    // This is the shape of the entire area covered by a warp's memory access (in units of vectors)
-    layout::PitchLinearCoord warp_footprint{
-      Detail::WarpThreadArrangement::kContiguous * Iterations::kContiguous,
-      Detail::WarpThreadArrangement::kStrided * Iterations::kStrided
-    };
-
-    // This is the offset of a specific warp (in units of vectors)
-    layout::PitchLinearCoord warp_offset{
-      (warp_id % Detail::kWarpsContiguous),
-      (warp_id / Detail::kWarpsContiguous)
-    };
-
-    // This is the offset of a specific thread within a warp (units of vectors)
-    layout::PitchLinearCoord thread_offset_in_warp{
-      lane_id % Detail::WarpThreadArrangement::kContiguous,
-      lane_id / Detail::WarpThreadArrangement::kContiguous
-    };
-
-    // This is the offset of a thread within a threadblock tile (units of vectors)
-    layout::PitchLinearCoord thread_offset_in_threadblock_tile_vec =
-      warp_footprint * warp_offset + thread_offset_in_warp;
-
-    // This is the offset of a thread within a threadblock tile (units of elements)
-    layout::PitchLinearCoord thread_offset_in_threadblock_tile_base{
-      thread_offset_in_threadblock_tile_vec.contiguous() * kElementsPerAccess,
-      thread_offset_in_threadblock_tile_vec.strided()
-    };
-
-    return thread_offset_in_threadblock_tile_base;
-  }
-
-
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Transpose the existing ThreadMap.  For example, interleaved layout is like
-/// congruous in the global memory and crosswise in the shared memory.  We need
-/// to transpose the coordinates between two.
-
-template <typename ThreadMap_, typename WarpThreadArrangement_>
-struct TransposePitchLinearThreadMap {
-  /// Underlying ThreadMap
-  using ThreadMap = ThreadMap_;
-
-  /// Tensor coordinate
-  using TensorCoord = typename ThreadMap::TensorCoord;
-
-  /// Tile shape
-  using Shape = typename ThreadMap::Shape;
-
-  /// Number of threads total
-  static int const kThreads = ThreadMap::kThreads;
-
-  /// Extract vector length from Layout
-  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
-
-  /// Shape of access by each thread
-  using ThreadAccessShape = layout::PitchLinearShape<kElementsPerAccess, 1>;
-
-  /// Internal details made public to facilitate introspection
-  struct Detail {
-    /// Fixed arrangement of threads within a warp (units of threads).
-    using WarpThreadArrangement = WarpThreadArrangement_;
-
-    /// Number of threads per warp
-    static int const kWarpSize = WarpThreadArrangement::kCount;
-
-    /// Number of participating warps
-    static int const kWarpCount = kThreads / kWarpSize;
-
-    static_assert(!(Shape::kContiguous % kElementsPerAccess),
-                  "Shape must be divisible by vector length.");
-
-    /// Arrangement of warps within a threadblock-scoped tile
-    using WarpArrangement =
-        layout::PitchLinearShape<ThreadMap::Detail::kWarpsStrided,
-                                 ThreadMap::Detail::kWarpsContiguous>;
-  };
-
-  ///< Iterations along each dimension (concept: PitchLinearShape)
-  using Iterations =
-      layout::PitchLinearShape<ThreadMap::Iterations::kStrided,
-                               ThreadMap::Iterations::kContiguous>;
-
-  static_assert(Iterations::kContiguous == 1,
-    "Contiguous iteration has to be one to reuse the same shared store function with those that don't need transpose");
-
-  static_assert(Iterations::kCount, "Number of iterations must be non-zero");
-
-  ///< Delta betweeen accesses (units of elements, concept: PitchLinearShape)
-  using Delta =
-      layout::PitchLinearShape<Detail::WarpThreadArrangement::kContiguous *
-                                   kElementsPerAccess,
-                               Detail::WarpThreadArrangement::kStrided>;
-
-  /// Maps thread ID to a coordinate offset within the tensor's logical
-  /// coordinate space Note this is slightly different from the one of
-  /// PitchLinearWarpRakedThreadMap.
-  CUTLASS_HOST_DEVICE
-  static TensorCoord initial_offset(int thread_id) {
-
-    int warp_id = (thread_id / Detail::kWarpSize);
-    int lane_id = (thread_id % Detail::kWarpSize);
-
-    //
-    // compute warp-level offset
-    //
-
-    // This is the shape of the entire area covered by a warp's memory access
-    // (in units of vectors)
-    layout::PitchLinearCoord warp_footprint{
-        Detail::WarpThreadArrangement::kContiguous * Iterations::kContiguous,
-        Detail::WarpThreadArrangement::kStrided * Iterations::kStrided};
-
-    // This is the offset of a specific warp (in units of vectors)
-    // Note the order of / and %. Also the 2nd operand is kStrided.
-    layout::PitchLinearCoord warp_offset{
-        (warp_id / Detail::WarpArrangement::kStrided),
-        (warp_id % Detail::WarpArrangement::kStrided)};
-
-    // This is the offset of a specific thread within a warp (units of vectors)
-    layout::PitchLinearCoord thread_offset_in_warp{
-        lane_id % Detail::WarpThreadArrangement::kContiguous,
-        lane_id / Detail::WarpThreadArrangement::kContiguous};
-
-    // This is the offset of a thread within a threadblock tile (units of
-    // vectors)
-    layout::PitchLinearCoord thread_offset_in_threadblock_tile_vec =
-        warp_footprint * warp_offset + thread_offset_in_warp;
-
-    // This is the offset of a thread within a threadblock tile (units of
-    // elements)
-    layout::PitchLinearCoord thread_offset_in_threadblock_tile_base{
-        thread_offset_in_threadblock_tile_vec.contiguous() * kElementsPerAccess,
-        thread_offset_in_threadblock_tile_vec.strided()};
-
-    return thread_offset_in_threadblock_tile_base;
-  }
-};
-
-template <typename ThreadMap_>
-struct TransposePitchLinearThreadMapSimt {
-    /// Underlying ThreadMap
-    using ThreadMap = ThreadMap_;
-
-    /// Tensor coordinate
-    using TensorCoord = typename ThreadMap::TensorCoord;
-
-    /// Tile shape
-    using Shape = typename ThreadMap::Shape;
-
-    /// Number of threads total
-    static int const kThreads = ThreadMap::kThreads;
-
-    /// Extract vector length from Layout
-    static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
-
-    static_assert(kElementsPerAccess == 1 , "Simt transpose requires elements per access to be 1");
-    ///< Iterations along each dimension (concept: PitchLinearShape)
-    using Iterations =
-        layout::PitchLinearShape<ThreadMap::Iterations::kStrided,
-        ThreadMap::Iterations::kContiguous>;
-
-    static_assert(Iterations::kCount, "Number of iterations must be non-zero");
-
-    static_assert(Iterations::kStrided == 1,
-      "Strided iteration has to be one to reuse the same shared store function with those that don't need transpose");
-
-    /// Shape of access by each thread
-    using ThreadAccessShape = typename ThreadMap::ThreadAccessShape;
-
-    ///< Delta betweeen accesses (units of elements, concept: PitchLinearShape)
-    using Delta =
-        layout::PitchLinearShape<ThreadMap::Delta::kStrided,
-        ThreadMap::Delta::kContiguous>;
-
-
-    /// Maps thread ID to a coordinate offset within the tensor's logical
-    /// coordinate space Note this is slightly different from the one of
-    /// PitchLinearWarpRakedThreadMap.
-    CUTLASS_HOST_DEVICE
-        static TensorCoord initial_offset(int thread_id) {
-
-        TensorCoord coord = ThreadMap::initial_offset(thread_id);
-
-        return TensorCoord(
-            coord.strided(),
-            coord.contiguous()
-        );
-    }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-
-/// Policy defining a warp-striped arrangement.  This partitions a tile into vectorized memory
-/// accesses performed by each warp then distributes warps across them. Warps are striped in the
-/// strided dimension and raked across the contiguous dimension.
-template <
-  typename Shape_,                          /// Overall shape to partition in units of elements
-  int Threads,                              /// Number of partiticipation threads
-  typename WarpThreadArrangement_,          /// Describes the shape of one memory access per warp
-  int ElementsPerAccess = 1                 /// Number of elements accessed by each thread per memory operation (i.e. vector size)
->
-struct PitchLinearWarpStripedThreadMap {
-
-  /// Tensor coordinate
-  using TensorCoord = layout::PitchLinearCoord;
-
-  /// Tile shape
-  using Shape = Shape_;
-
-  /// Number of threads total
-  static int const kThreads = Threads;
-
-  /// Extract vector length from Layout
-  static int const kElementsPerAccess = ElementsPerAccess;
-
-  /// Shape of access by each thread
-  using ThreadAccessShape = layout::PitchLinearShape<kElementsPerAccess, 1>;
-
-  /// Internal details made public to facilitate introspection
-  struct Detail {
-
-    /// Fixed arrangement of threads within a warp (units of threads).
-    using WarpThreadArrangement = WarpThreadArrangement_;
-
-    /// Number of threads per warp
-    static int const kWarpSize = WarpThreadArrangement::kCount;
-
-    /// Number of participating warps
-    static int const kWarpCount = kThreads / kWarpSize;
-
-    static_assert(
-      !(Shape::kContiguous % kElementsPerAccess),
-      "Shape must be divisible by vector length.");
-
-    /// Compute the 'shape' of the overall tile in units of vectors
-    using ShapeInAccesses = layout::PitchLinearShape<
-      Shape::kContiguous / kElementsPerAccess,
-      Shape::kStrided
-    >;
-
-    // compute number of warp-level accesses total
-    using WarpAccessIterations = layout::PitchLinearShape<
-      ShapeInAccesses::kContiguous / WarpThreadArrangement::kContiguous,
-      ShapeInAccesses::kStrided / WarpThreadArrangement::kStrided
-    >;
-
-    // Divide it into the number of warps, first partitioning the strided dimension then the
-    // contiguous.
-    static int const kWarpsStrided =
-      (WarpAccessIterations::kStrided >= kWarpCount
-        ? kWarpCount : (kWarpCount / WarpAccessIterations::kStrided));
-
-    static int const kWarpsContiguous =
-      (kWarpCount > WarpAccessIterations::kStrided ?
-        WarpAccessIterations::kContiguous / kWarpsStrided : 1);
-
-    /// Arrangement of warps within a threadblock-scoped tile
-    using WarpArrangement = layout::PitchLinearShape<
-      kWarpsContiguous, kWarpsStrided
-    >;
-  };
-
-  ///< Iterations along each dimension (concept: PitchLinearShape)
-  using Iterations = layout::PitchLinearShape<
-    Detail::WarpAccessIterations::kContiguous / Detail::kWarpsContiguous,
-    Detail::WarpAccessIterations::kStrided / Detail::kWarpsStrided
-  >;
-
-  static_assert(Iterations::kCount,
-    "Number of iterations must be non-zero");
-
-  ///< Delta betweeen accesses (units of elements, concept: PitchLinearShape)
-  using Delta = layout::PitchLinearShape<
-    Detail::WarpThreadArrangement::kContiguous * kElementsPerAccess,
-    Detail::WarpThreadArrangement::kStrided * Detail::WarpArrangement::kStrided
-  >;
-
-  /// Maps thread ID to a coordinate offset within the tensor's logical coordinate space
-  CUTLASS_HOST_DEVICE
-  static TensorCoord initial_offset(int thread_id) {
-
-    int warp_id = (thread_id / Detail::kWarpSize);
-    int lane_id = (thread_id % Detail::kWarpSize);
-
-    //
-    // compute warp-level offset
-    //
-
-    // This is the shape of the entire area covered by a warp's memory access (in units of vectors)
-    layout::PitchLinearCoord warp_footprint{
-      Detail::WarpThreadArrangement::kContiguous * Iterations::kContiguous,
-      Detail::WarpThreadArrangement::kStrided
-    };
-
-    // This is the offset of a specific warp (in units of vectors)
-    layout::PitchLinearCoord warp_offset{
-      (warp_id % Detail::kWarpsContiguous),
-      (warp_id / Detail::kWarpsContiguous)
-    };
-
-    // This is the offset of a specific thread within a warp (units of vectors)
-    layout::PitchLinearCoord thread_offset_in_warp{
-      lane_id % Detail::WarpThreadArrangement::kContiguous,
-      lane_id / Detail::WarpThreadArrangement::kContiguous
-    };
-
-    // This is the offset of a thread within a threadblock tile (units of vectors)
-    layout::PitchLinearCoord thread_offset_in_threadblock_tile_vec =
-      warp_footprint * warp_offset + thread_offset_in_warp;
-
-    // This is the offset of a thread within a threadblock tile (units of elements)
-    layout::PitchLinearCoord thread_offset_in_threadblock_tile_base{
-      thread_offset_in_threadblock_tile_vec.contiguous() * kElementsPerAccess,
-      thread_offset_in_threadblock_tile_vec.strided()
-    };
-
-    return thread_offset_in_threadblock_tile_base;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Strip-mines a pitch-linear tile among a given number of threads, first along the contiguous
-/// dimension then along the strided dimension, while each thread access a 2D thread-tile.
-///
-/// The tile must be divisible by the thread count such that all threads may execute the same
-/// number of iterations with the same delta to exhaustively cover the tile.
-///
-/// This class satisfies the "RegularThreadMapping" concept.
-template <
-  typename Shape_,
-  int Threads,
-        typename ThreadTileShape
->
-struct PitchLinear2DThreadTileStripminedThreadMap;
-
-
-template <
-  typename Shape_,
-  int Threads
->
-struct PitchLinear2DThreadTileStripminedThreadMap <Shape_, Threads, cutlass::layout::PitchLinearShape<4, 4>>{
-
-  /// Tensor coordinate
-  using TensorCoord = layout::PitchLinearCoord;
-
-  /// Tile shape
-  using Shape = Shape_;
-
-  /// Access Shape of each thread
-  using ThreadAccessShape = cutlass::layout::PitchLinearShape<4, 4>;
-  //using ThreadAccessShape = ThreadTileShape;
-
-  /// Number of threads total
-  static int const kThreads = Threads;
-
-  /// Extract length of each access from Layout
-  static int const kElementsPerAccess = ThreadAccessShape::kContiguous;
-
-  static_assert(!(kElementsPerAccess % 4) , "kElementsPerAccess, needs to be multiple of 4 (32bits)");
-
-  /// Internal implementation details
-  struct Detail {
-
-    static_assert(!(ThreadAccessShape::kContiguous % 4), "ThreadAccessShape, needs to be multiple of 4");
-
-    static_assert(!(Shape::kContiguous % ThreadAccessShape::kContiguous), "");
-
-    static_assert(!((Shape::kContiguous * Shape::kStrided) % (kThreads * ThreadAccessShape::kCount)),
-      "Shape must be divisible thread count * accesses per thread.");
-
-    /// Shape of the tile in units of vectors
-    using ShapeVec = layout::PitchLinearShape<
-      Shape::kContiguous / ThreadAccessShape::kContiguous,
-      Shape::kStrided / ThreadAccessShape::kStrided
-    >;
-
-    static_assert(
-      (Threads < ShapeVec::kContiguous && !(ShapeVec::kContiguous % kThreads)) ||
-      (!(kThreads % ShapeVec::kContiguous) && !(ShapeVec::kStrided % (kThreads / ShapeVec::kContiguous))),
-      "Shape must be divisible by number of iterations of each thread."
-    );
-  };
-
-  /// Number of iterations by each thread
-  using Iterations = typename platform::conditional<
-      Threads >= Detail::ShapeVec::kContiguous,
-      layout::PitchLinearShape<
-          1,
-          // Redo the comparison here to work around divide by zero compiler
-          // error.  The compiler evaluates both path of platform::conditional.
-          (Threads >= Detail::ShapeVec::kContiguous
-               ? Detail::ShapeVec::kStrided /
-                     (kThreads / Detail::ShapeVec::kContiguous)
-               : 0)>,
-      layout::PitchLinearShape<Detail::ShapeVec::kContiguous / kThreads,
-                               Detail::ShapeVec::kStrided>>::type;
-
-  /// Interval between accesses along each dimension of the tensor's logical coordinate space
-  /// (in units of Elements)
-  using Delta = typename platform::conditional<
-    Threads >= Detail::ShapeVec::kContiguous,
-    layout::PitchLinearShape<
-      Shape::kContiguous,
-      kThreads * ThreadAccessShape::kStrided / Detail::ShapeVec::kContiguous
-    >,
-    layout::PitchLinearShape<
-      kThreads * ThreadAccessShape::kContiguous,
-      1
-    >
-  >::type;
-
-  /// Maps thread ID to a coordinate offset within the tensor's logical coordinate space
-  /// (in units of Elements)
-  CUTLASS_HOST_DEVICE
-  static TensorCoord initial_offset(int thread_id) {
-
-    return TensorCoord(
-      (thread_id % Detail::ShapeVec::kContiguous) * ThreadAccessShape::kContiguous,
-      (thread_id / Detail::ShapeVec::kContiguous) * ThreadAccessShape::kStrided);
-  }
-};
-
-/// Thread Mapping a 2D threadtiled mapping as a transposed Pitchlinear2DThreadTile mapping
-template <typename ThreadMap_>
-struct TransposePitchLinearThreadMap2DThreadTile {
-    /// Underlying ThreadMap
-    using ThreadMap = ThreadMap_;
-
-    /// Tensor coordinate
-    using TensorCoord = typename ThreadMap::TensorCoord;
-
-    /// Tile shape
-    using Shape = typename ThreadMap::Shape;
-
-    /// Number of threads total
-    static int const kThreads = ThreadMap::kThreads;
-
-    /// Extract vector length from Layout
-    static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
-
-
-    static_assert(kElementsPerAccess > 1 , "Simt transpose requires elements per access to be 1");
-    ///< Iterations along each dimension (concept: PitchLinearShape)
-    using Iterations =
-        layout::PitchLinearShape<ThreadMap::Iterations::kStrided,
-        ThreadMap::Iterations::kContiguous>;
-
-    static_assert(Iterations::kCount, "Number of iterations must be non-zero");
-
-    /// Shape of access by each thread
-    using ThreadAccessShape = typename ThreadMap::ThreadAccessShape;
-
-    ///< Delta betweeen accesses (units of elements, concept: PitchLinearShape)
-    using Delta =
-        layout::PitchLinearShape<ThreadMap::Delta::kStrided,
-        ThreadMap::Delta::kContiguous>;
-
-
-    /// Maps thread ID to a coordinate offset within the tensor's logical
-    /// coordinate space Note this is slightly different from the one of
-    /// PitchLinearWarpRakedThreadMap.
-    CUTLASS_HOST_DEVICE
-        static TensorCoord initial_offset(int thread_id) {
-
-        TensorCoord coord = ThreadMap::initial_offset(thread_id);
-        return TensorCoord(
-            coord.strided(),
-            coord.contiguous()
-        );
-    }
-};
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace transform
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/thread/transpose.h b/lightllm-kernel/cutlass/include/cutlass/transform/thread/transpose.h
deleted file mode 100755
index 4d0b39073..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/transform/thread/transpose.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Basic copy routines for tensor views
-*/
-
-#pragma once
-
-namespace cutlass {
-namespace transform {
-namespace thread {
-
-/// Transforms a fragment by doing a transpose
-template <
-  int ElementCount, 
-  typename TransposeShape, 
-  typename Element
-> struct Transpose;
-
-/// Specialization for int8_t 4x4 transpose
-template <int ElementCount_>
-struct Transpose<ElementCount_, layout::PitchLinearShape<4,4> , int8_t> {
-
-    static const int kElementCount = ElementCount_;
-    using TransposeShape = layout::PitchLinearShape<4,4>;
-    using Element = int8_t;
-    using Fragment = cutlass::Array<Element, kElementCount>;
-
-    static_assert(!(kElementCount % TransposeShape::kCount), "Shape needs to be multiple of 16 elements to do a 4x4 transpose");
-
-    CUTLASS_DEVICE 
-    void transform(Fragment& dst, Fragment& src) {
-
-    // Expose src/dst as int arrays.
-    int* src_int = reinterpret_cast<int*>(&src);
-    int* dst_int = reinterpret_cast<int*>(&dst);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kElementCount / TransposeShape::kCount; i++){
-  
-      int const i0 = 4 * i + 0;
-      int const i1 = 4 * i + 1;
-      int const i2 = 4 * i + 2;
-      int const i3 = 4 * i + 3;
-
-      int a0 = src_int[i0];
-      int a1 = src_int[i1];
-      int a2 = src_int[i2];
-      int a3 = src_int[i3];
-
-      int b0, b1, b2, b3, c0;
-      b0 = __byte_perm(a0, a1, 0x0040);
-      c0 = __byte_perm(a2, a3, 0x0040);
-      b0 = __byte_perm(b0, c0, 0x5410);
-
-      b1 = __byte_perm(a0, a1, 0x0051);
-      c0 = __byte_perm(a2, a3, 0x0051);
-      b1 = __byte_perm(b1, c0, 0x5410);
-
-      b2 = __byte_perm(a0, a1, 0x0062);
-      c0 = __byte_perm(a2, a3, 0x0062);
-      b2 = __byte_perm(b2, c0, 0x5410);
-
-      b3 = __byte_perm(a0, a1, 0x0073);
-      c0 = __byte_perm(a2, a3, 0x0073);
-      b3 = __byte_perm(b3, c0, 0x5410);
-
-      dst_int[i0] = b0;
-      dst_int[i1] = b1;
-      dst_int[i2] = b2;
-      dst_int[i3] = b3;
-    }
-  }
-};
-
-}  // namespace thread
-}  // namespace layout
-}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/thread/unary_op.h b/lightllm-kernel/cutlass/include/cutlass/transform/thread/unary_op.h
deleted file mode 100755
index ce7cbbe8f..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/transform/thread/unary_op.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/complex.h"
-
-namespace cutlass {
-namespace transform {
-namespace thread {
-
-namespace UnaryTransform {
-    struct Identity;    ///< None (i.e., identity)
-    struct Conjugate;   ///< Complex conjugate
-}
-
-/// Element-wise unary operator that transforms one element of a fragment at a time
-template<
-    typename FragmentIn, ///< Input Fragment
-    typename FragmentOut,///< Output Fragment
-    typename Transform>  ///< Unary transform operator
-class UnaryOp
-{
-    public:
-        CUTLASS_DEVICE
-        static FragmentOut execute(FragmentIn &in)
-        {
-            static_assert(FragmentIn::kElements == FragmentOut::kElements, "Number of elements must match.");
-            static_assert(platform::is_same<Transform, UnaryTransform::Identity>::value ||
-                          platform::is_same<Transform, UnaryTransform::Conjugate>::value,
-                          "Unary Operator not supported.");
-
-            FragmentOut out;
-            if (platform::is_same<Transform, UnaryTransform::Identity>::value )
-            {
-                CUTLASS_PRAGMA_UNROLL
-                for (int i=0; i < FragmentIn::kElements; ++i){
-                   out[i] = static_cast<typename FragmentOut::Element>(in[i]);
-                }
-            }
-            else if (platform::is_same<Transform, UnaryTransform::Conjugate>::value )
-            {
-                for (int i=0; i < FragmentIn::kElements; ++i){
-                   out[i] = conj(static_cast<typename FragmentOut::Element>(in[i]));
-                }
-            }
-            return out;
-        }
-};
-
-template<typename FragmentIn, typename Transform>
-class UnaryOp<FragmentIn, FragmentIn, Transform>
-{
-    public:
-        CUTLASS_DEVICE
-        static FragmentIn execute(FragmentIn &in)
-        {
-            static_assert(platform::is_same<Transform, UnaryTransform::Identity>::value ||
-                          platform::is_same<Transform, UnaryTransform::Conjugate>::value,
-                          "Unary Operator not supported.");
-
-            if (platform::is_same<Transform, UnaryTransform::Identity>::value )
-            {
-                return in;
-            }
-            else if (platform::is_same<Transform, UnaryTransform::Conjugate>::value )
-            {
-                for(int i=0; i < FragmentIn::kElements; ++i){
-                   in[i] = conj(in[i]);
-                }
-            }
-            return in;
-        }
-      };
-    }
-  }
-}
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/ell_iterator.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/ell_iterator.h
deleted file mode 100755
index 026e4ced4..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/ell_iterator.h
+++ /dev/null
@@ -1,199 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Ell iterator for matrix of indices (ellColInd matrix) 
-*/
-
-#pragma once
-
-namespace cutlass {
-namespace transform {
-namespace threadblock {
-
-namespace ell{
-
-constexpr unsigned int SmemPow = 8;
-constexpr unsigned int SmemStages = 2;
-constexpr unsigned int SmemSize = 1 << SmemPow;
-constexpr unsigned int SmemMask = (SmemSize*SmemStages-1);
-
-class SharedStorage{
-  public:
-    Array<int, SmemSize*SmemStages> array;
-};
-
-class Iterator{
-  public:
-  using Layout = layout::PitchLinear;
-  using LongIndex = typename Layout::LongIndex;
-
-  private:
-    const int *gmem_col_idx_;
-    int *smem_col_idx_;
-    const int  block_size_;
-    const int  base_idx_;
-    const int  k_shape_;
-    const int  ell_increment_;
-    const int  array_length_;
-    int  col_idx_base_;
-    int  residue_;
-    int  counter_;
-
-    int  pow2_;
-    int  residue_shape_;
-
-    int  smem_offset_;
-    int  smem_stage_;
-    int  gmem_offset_;
-
-    int  lane_;
-
-    bool is_pow2_;
-    bool is_residue_tile_;
-
-  public:
-    CUTLASS_DEVICE
-    void load_ell_indices(){
-      for(int i=threadIdx.x; i<SmemSize; i+=blockDim.x){
-        int idx = (gmem_offset_+i < array_length_) ? gmem_offset_+i : array_length_-1;
-        int gmem_col_idx = gmem_col_idx_[idx] - base_idx_;
-        smem_col_idx_[i + smem_stage_ * SmemSize] = 
-          (gmem_col_idx >= 0) ? gmem_col_idx : -1;
-      }
-      gmem_offset_ += SmemSize;
-      smem_stage_ ^= 1;
-    }
-
-    CUTLASS_DEVICE
-    Iterator(
-        SharedStorage& shared_storage_base,
-        const int* col_idx,
-        const int& block_size,
-        const int& base_idx,
-        const int  k_shape,
-        const int& problem_size_k,
-        const int& ell_stride,
-        const int& thread_idx)
-        : residue_(0),
-          counter_(0),
-          smem_offset_(0),
-          smem_stage_(0),
-          gmem_offset_(0),
-          block_size_(block_size),
-          base_idx_(base_idx),
-          k_shape_(k_shape),
-          ell_increment_(ell_stride * block_size),
-          array_length_((problem_size_k + block_size_ - 1) / block_size_), 
-          residue_shape_(problem_size_k % k_shape_),
-          is_residue_tile_(residue_shape_ != 0),
-          smem_col_idx_(reinterpret_cast<int*>(&shared_storage_base.array)),
-          gmem_col_idx_(const_cast<int*>(col_idx)),
-          lane_(thread_idx % 32) {
-
-      load_ell_indices();
-      __syncthreads();
-          
-      is_pow2_ = ((block_size_ & (block_size_ - 1)) == 0);
-      if( is_pow2_ && k_shape <= block_size_ ) lane_ = 0;
-      
-      col_idx_base_ = smem_col_idx_[(smem_offset_ + lane_) & SmemMask] * ell_increment_;
-
-      pow2_ = 0;
-      while(block_size_ >> (pow2_ + 1)) ++pow2_;
-    }
-
-    CUTLASS_DEVICE
-    int get_blocksize(){
-      return block_size_;
-    }
-
-    CUTLASS_DEVICE
-    Iterator &operator++(){
-      if(is_residue_tile_){
-        residue_ += residue_shape_;
-        is_residue_tile_ = false;
-      } else {
-        residue_ += k_shape_;
-      }
-
-      if(residue_ < block_size_){
-        return *this;
-      }
-
-      if((array_length_ > SmemSize) && (((smem_offset_ >> SmemPow) & 1) != smem_stage_)) 
-        load_ell_indices();
-
-      if(residue_ == block_size_){
-        ++smem_offset_;
-        counter_ += ell_increment_;
-        residue_ = 0;
-        col_idx_base_ = smem_col_idx_[(smem_offset_ + lane_) & SmemMask] * ell_increment_ - counter_;
-        return *this;
-      }
-      
-      if(is_pow2_){
-        smem_offset_ += residue_ >> pow2_; 
-        counter_ += (residue_ >> pow2_) * ell_increment_;
-        residue_ = residue_ & ((1 << pow2_) - 1);
-      }
-      else {
-        smem_offset_ += residue_ / block_size_; 
-        counter_ += (residue_ / block_size_) * ell_increment_;
-        residue_ %= block_size_;
-      }
-      
-      col_idx_base_ = smem_col_idx_[(smem_offset_ + lane_) & SmemMask] * ell_increment_ - counter_;
-      
-      return *this;
-    }
-    
-    CUTLASS_DEVICE
-    LongIndex get_offset(const int& idx) {
-      int num_jump_tiles;
-      if(is_pow2_)
-        num_jump_tiles = (idx + residue_) >> pow2_;
-      else 
-        num_jump_tiles = (idx + residue_) / block_size_;
-
-      int tmp = __shfl_sync(0xffffffff, col_idx_base_, num_jump_tiles); 
-      return tmp - num_jump_tiles * ell_increment_;
-    }
-    
-    CUTLASS_DEVICE
-    LongIndex get_offset_fast() {
-      return col_idx_base_;
-    }
-};
-
-}
-}
-}
-}
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/ell_predicated_tile_access_iterator.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/ell_predicated_tile_access_iterator.h
deleted file mode 100755
index 2e9e3716a..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/ell_predicated_tile_access_iterator.h
+++ /dev/null
@@ -1,1350 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Ell iterator for Blocked-Ell matrix (ellValue matrix) used with EllMmaMultistage
-*/
-
-#pragma once
-
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace transform {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// EllPredicatedTileAccessIterator
-///
-template <typename Shape, typename Element, typename Layout, int AdvanceRank,
-          typename ThreadMap, typename AccessType>
-class EllPredicatedTileAccessIterator;
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of EllPredicatedTileAccessIterator for pitch-linear data.
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, typename AccessType_>
-class EllPredicatedTileAccessIterator<Shape_, Element_, layout::PitchLinear,
-                                   AdvanceRank, ThreadMap_, AccessType_> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::PitchLinear;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Pointer = Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
-
-  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
-    "Vectors implied by the thread map must be divisible by the access type.");
-
-  static int const kPredicatesPerByte = 4;
-  static int const kPredicatesPerWord = 4 * kPredicatesPerByte;
-
-  static int const kPredicateCount = ThreadMap::Iterations::kCount * kAccessesPerVector;
-
-  /// Number of 32b words containing predicates
-  static int const kPredicateByteCount =
-    (kPredicateCount + kPredicatesPerByte - 1) / kPredicatesPerByte;
-  static int const kPredicateWordCount = (kPredicateByteCount + 3) / 4;
-
-  static unsigned const kPredicateMask = (1u << kPredicatesPerByte) - 1u;
-
-  static_assert(kPredicateWordCount <= 4, "Too many predicates.");
-
-  /// Predicate vector stores mask to guard accesses
-  using Mask = Array<uint32_t, kPredicateWordCount>;
-
-  /// Parameters object is precomputed state and is host-constructible
-  class Params {
-   public:
-    friend EllPredicatedTileAccessIterator;
-
-   private:
-    /// stride of pitch-linear layout (units of Element)
-    LongIndex stride_;
-    /// amount (in byte) to increment pointer to move to next access along
-    /// strided dimension
-    LongIndex inc_strided_;
-    /// amount (in byte) to increment pointer from last access to first access
-    /// of next tile
-    LongIndex inc_next_;
-    /// amount (in byte) to increment pointer from first access of current tile
-    /// to first access of next tile
-    LongIndex inc_advance_;
-
-   public:
-
-    // Default ctor
-    CUTLASS_HOST_DEVICE
-    Params(): stride_(0), inc_strided_(0), inc_next_(0), inc_advance_(0) { }
-
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout) : stride_(layout.stride(0)) {
-      inc_strided_ = (LongIndex(stride_) * ThreadMap::Delta::kStrided) *
-                     sizeof_bits<Element>::value / 8;
-
-      if (kAdvanceRank) {
-        // advance along strided dimension
-        inc_advance_ =
-            Shape::kStrided * LongIndex(stride_) * sizeof_bits<Element>::value / 8;
-      } else {
-        // advance along contiguous dimension
-        inc_advance_ = Shape::kContiguous * sizeof_bits<Element>::value / 8;
-      }
-
-      inc_next_ = inc_advance_ - LongIndex(ThreadMap::Iterations::kStrided - 1) *
-                                     ThreadMap::Delta::kStrided * LongIndex(stride_) *
-                                     sizeof_bits<Element>::value / 8;
-    };
-  };
-
- private:
-  /// Internal pointer type permits fast address arithmetic
-  using BytePointer = char *;
-
- private:
-  //
-  // Data members
-  //
-
-  /// Parameters object with precomputed internal state
-  Params const &params_;
-
-  /// Internal pointer to first access of tile
-  BytePointer pointer_;
-
-  /// Guard predicates
-  uint32_t predicates_[kPredicateWordCount];
-
-  /// Size of tensor
-  TensorCoord extent_;
-
-  /// Initial offset for each thread
-  TensorCoord thread_offset_;
-
-  /// Offset to the first steady-state tile
-  TensorCoord residue_offset_;
-
-  /// Initial offset to define ELL block
-  TensorCoord ell_offset_;
-
-  /// Used for out-of-order visitation
-  bool is_residue_tile_;
-
-  /// Iteration along vectors implied by the thread map
-  int iteration_vector_;
-
-  /// Iteration in the contiguous dimension
-  int iteration_contiguous_;
-
-  /// Iteration in the strided dimension
-  int iteration_strided_;
-
- public:
-  /// Computes predicates based on internally tracked per-thread offset.
-  CUTLASS_DEVICE
-  void compute_predicates_(
-      /// Extent of the matrix window
-      TensorCoord extent,
-      /// optionally, simplify predicate calculation during 'steady state' phase
-      bool is_steady_state = false) {
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kPredicateWordCount; ++i) {
-      predicates_[i] = 0u;
-    }
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int access_idx = 0; access_idx < ThreadMap::Iterations::kCount * kAccessesPerVector; ++access_idx) {
-
-      int s = access_idx / (ThreadMap::Iterations::kContiguous * kAccessesPerVector);
-      
-      int access_residual = access_idx % (ThreadMap::Iterations::kContiguous * kAccessesPerVector);
-
-      int c = access_residual / kAccessesPerVector;
-      int v = access_residual % kAccessesPerVector;
-
-      TensorCoord iteration_coord(c * ThreadMap::Delta::kContiguous + v * AccessType::kElements,
-                                s * ThreadMap::Delta::kStrided);
-
-      TensorCoord coord = thread_offset_ + iteration_coord;
-
-      bool guard;
-
-      if (is_steady_state) {
-        if (kAdvanceRank == 0) {
-          guard = (coord.strided() < extent.strided());
-        } else {
-          guard = (coord.contiguous() < extent.contiguous());
-        }
-      } else {
-        guard = (coord.strided() < extent.strided() &&
-                 coord.contiguous() < extent.contiguous());
-      }
-
-      int pred_idx = v + kAccessesPerVector * (c + ThreadMap::Iterations::kContiguous * s);
-
-      int word_idx = pred_idx / kPredicatesPerWord;
-      int residual = pred_idx % kPredicatesPerWord;
-      int byte_idx = residual / kPredicatesPerByte;
-      int bit_idx = residual % kPredicatesPerByte;
-      
-      predicates_[word_idx] |= (unsigned(guard) << (byte_idx * 8 + bit_idx));
-
-    }
-
-  }
-
- public:
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileAccessIterator(
-      /// Precomputed parameters object
-      Params const &params,
-      /// Pointer to start of tensor
-      Pointer pointer,
-      /// Extent of tensor
-      TensorCoord extent,
-      /// ID of each participating thread
-      int thread_id,
-      /// Initial offset of threadblock
-      TensorCoord const &threadblock_offset)
-      : params_(params),
-        pointer_(reinterpret_cast<BytePointer>(
-            const_cast<NonConstPointer>(pointer))),
-        extent_(extent),
-        is_residue_tile_(true) {
-          
-    TensorCoord residue_extent;
-    if (kAdvanceRank) {
-
-      typename TensorCoord::Index residue_size = (extent_[kAdvanceRank] - threadblock_offset.strided()) % Shape::kStrided;
-      if (!residue_size) {
-        residue_size = Shape::kStrided;
-      }
-
-      residue_offset_ = make_Coord(0, residue_size);
-      residue_extent = make_Coord(
-        extent_.contiguous(), 
-        min(threadblock_offset.strided() + residue_size, extent_.strided())
-      );
-    } else {
-
-      typename TensorCoord::Index residue_size = (extent_[kAdvanceRank] - threadblock_offset.contiguous()) % Shape::kContiguous;
-      if (!residue_size) {
-        residue_size = Shape::kContiguous;
-      }
-
-      residue_offset_ = make_Coord(residue_size, 0);
-      
-      residue_extent = make_Coord(
-        min(extent_.contiguous(), threadblock_offset.contiguous() + residue_size),
-        extent_.strided()
-      );
-    }
-
-    // Per-thread offset in logical coordinates of tensor
-    ell_offset_ = ThreadMap::initial_offset(thread_id);
-    thread_offset_ = threadblock_offset + ThreadMap::initial_offset(thread_id);
-
-    // update internal pointers
-    Layout layout(params_.stride_);
-    add_pointer_offset(layout(thread_offset_));
-
-    compute_predicates_(residue_extent, false);
-
-    set_iteration_index(0);
-  }
-
-  /// Construct a EllPredicatedTileAccessIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileAccessIterator(
-      /// Precomputed parameters object
-      Params const &params,
-      /// Pointer to start of tensor
-      Pointer pointer,
-      /// Extent of tensor
-      TensorCoord extent,
-      ///< ID of each participating thread
-      int thread_id)
-      : EllPredicatedTileAccessIterator(params, pointer, extent, thread_id,
-                                     make_Coord(0, 0)) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) {
-
-    iteration_vector_ = index % kAccessesPerVector;
-    int residual_access = index / kAccessesPerVector;
-
-    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
-
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    pointer_ += sizeof_bits<Element>::value * pointer_offset / 8;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_DEVICE
-  void add_tile_offset(
-      TensorCoord const &tile_offset) {
-    if (is_residue_tile_) {
-
-      thread_offset_ += residue_offset_;
-
-      Layout layout(params_.stride_);
-      add_pointer_offset(layout(residue_offset_));
-
-      compute_predicates_(extent_, true);
-
-      if (kAdvanceRank) {
-        pointer_ += params_.inc_advance_ * LongIndex(tile_offset.strided() - 1);
-        pointer_ += Shape::kContiguous * tile_offset.contiguous();
-      } else {
-        pointer_ += params_.inc_advance_ * LongIndex(tile_offset.contiguous() - 1);
-        pointer_ += Shape::kStrided * tile_offset.strided();
-      }
-    } else {
-      if (kAdvanceRank) {
-        pointer_ += params_.inc_advance_ * LongIndex(tile_offset.strided());
-        pointer_ += Shape::kContiguous * tile_offset.contiguous();
-      } else {
-        pointer_ += params_.inc_advance_ * LongIndex(tile_offset.contiguous());
-        pointer_ += Shape::kStrided * tile_offset.strided();
-      }
-    }
-    is_residue_tile_ = false;
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return reinterpret_cast<AccessType *>(
-        pointer_ + 
-        iteration_contiguous_ * (ThreadMap::Delta::kContiguous * sizeof_bits<Element>::value) / 8) + iteration_vector_;
-  }
-  
-  /// Returns a k_location
-  CUTLASS_HOST_DEVICE
-  int get_k() const {
-    if(kAdvanceRank){ //strided
-      return ell_offset_.strided() + iteration_strided_ * ThreadMap::Delta::kStrided;
-    }else{
-      return ell_offset_.contiguous() + iteration_contiguous_ * ThreadMap::Delta::kContiguous + iteration_vector_ * AccessType::kElements;
-    }
-  }
-  
-  CUTLASS_HOST_DEVICE
-  int get_stride() const {
-    if(kAdvanceRank)
-      return params_.stride_;
-    else
-      return 1;
-  }
-  
-  /// Increment and return an instance to self.
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileAccessIterator &operator++() {
-
-    ++iteration_vector_;
-    if (iteration_vector_ < kAccessesPerVector) {
-      return *this;
-    }
-
-    iteration_vector_ = 0;
-    ++iteration_contiguous_;
-
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
-      return *this;
-    }
-
-    // Enter here only if (iteration_contiguous_ ==
-    // ThreadMap::Iteration::kContiguous)
-    iteration_contiguous_ = 0;
-    ++iteration_strided_;
-
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      pointer_ += params_.inc_strided_;
-      return *this;
-    }
-
-    // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided)
-    // which means we enter the next tile.
-    iteration_strided_ = 0;
-
-    // advance to next tile
-    pointer_ += params_.inc_next_;
-
-    // now return to start tile - if the iterator is subsequently advanced, this
-    // subtraction as well as the subsequent integer addition are both elided by
-    // the compiler.
-    pointer_ -= params_.inc_advance_;
-
-    return *this;
-  }
-
-  /// Increment and return an instance to self.
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileAccessIterator operator++(int) {
-    EllPredicatedTileAccessIterator self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) {
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kPredicateWordCount; ++i) {
-      predicates_[i] = enable ? 0u : predicates_[i];
-    }
-
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() {
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kPredicateWordCount; ++i) {
-      predicates_[i] = 0xffffffff;
-    }
-
-  }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) { 
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kPredicateWordCount; ++i) {
-      predicates_[i] = mask[i];
-    }
-
-  }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) {
-     CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kPredicateWordCount; ++i) {
-      mask[i] = predicates_[i];
-    }
-  }
-  
-  /// add mask for small tiles in ELL
-  CUTLASS_DEVICE
-  void ell_add_mask(int blocksize) {
-
-    Mask mask;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kPredicateWordCount; ++i) {
-      mask[i] = 0u;
-    }
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int access_idx = 0; access_idx < ThreadMap::Iterations::kCount * kAccessesPerVector; ++access_idx) {
-
-      int s = access_idx / (ThreadMap::Iterations::kContiguous * kAccessesPerVector);
-      
-      int access_residual = access_idx % (ThreadMap::Iterations::kContiguous * kAccessesPerVector);
-
-      int c = access_residual / kAccessesPerVector;
-      int v = access_residual % kAccessesPerVector;
-
-      TensorCoord iteration_coord(c * ThreadMap::Delta::kContiguous + v * AccessType::kElements,
-                                s * ThreadMap::Delta::kStrided);
-
-      TensorCoord coord = ell_offset_ + iteration_coord;
-
-      bool guard;
-
-      if (kAdvanceRank == 0) {
-        guard = (coord.strided() < blocksize);
-      } else {
-        guard = (coord.contiguous() < blocksize);
-      }
-
-      int pred_idx = v + kAccessesPerVector * (c + ThreadMap::Iterations::kContiguous * s);
-
-      int word_idx = pred_idx / kPredicatesPerWord;
-      int residual = pred_idx % kPredicatesPerWord;
-      int byte_idx = residual / kPredicatesPerByte;
-      int bit_idx = residual % kPredicatesPerByte;
-      
-      mask[word_idx] |= (unsigned(guard) << (byte_idx * 8 + bit_idx));
-
-    }
-    
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kPredicateWordCount; ++i) {
-      mask[i] &= predicates_[i];
-    }
-    set_mask(mask);
-  }
-
-  /// Returns whether access is valid or not
-  CUTLASS_HOST_DEVICE
-  bool valid() {
-
-    int pred_idx = 
-      iteration_vector_ + kAccessesPerVector * (iteration_contiguous_ + iteration_strided_ * ThreadMap::Iterations::kContiguous);
-
-    int word_idx = pred_idx / kPredicatesPerWord;
-    int residual = pred_idx % kPredicatesPerWord;
-    int byte_idx = residual / kPredicatesPerByte;
-    int bit_idx = residual % kPredicatesPerByte;
-    
-    bool pred = (predicates_[word_idx] & (1u << (byte_idx * 8 + bit_idx))) != 0;
-    return pred;
-    
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of EllPredicatedTileAccessIterator for pitch-linear data.
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, typename AccessType_>
-class EllPredicatedTileAccessIterator<Shape_, Element_, layout::ColumnMajor,
-                                   AdvanceRank, ThreadMap_, AccessType_> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::ColumnMajor;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Pointer = Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  using UnderlyingIterator = EllPredicatedTileAccessIterator<
-      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
-      layout::PitchLinear, (kAdvanceRank == 0 ? 0 : 1), ThreadMap, AccessType>;
-
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename UnderlyingIterator::Mask;
-
-  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
-
-  /// Parameters object is precomputed state and is host-constructible
-  class Params {
-   private:
-    friend EllPredicatedTileAccessIterator;
-
-    /// Parameters object
-    typename UnderlyingIterator::Params params_;
-
-   public:
-
-    /// Default ctor
-    CUTLASS_HOST_DEVICE
-    Params() { }
-
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout)
-        : params_(layout::PitchLinear(layout.stride(0))){};
-  };
-
- private:
-  //
-  // Data members
-  //
-
-  /// Underlying pitch-linear tile iterator
-  UnderlyingIterator iterator_;
-
- public:
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileAccessIterator(
-      ///< Precomputed parameters object
-      Params const &params,
-      ///< Pointer to start of tensor
-      Pointer pointer,
-      ///< Extent of tensor
-      TensorCoord extent,
-      ///< ID of each participating thread
-      int thread_id,
-      ///< Initial offset of threadblock
-      TensorCoord const &threadblock_offset)
-      : iterator_(params.params_, pointer,
-                  layout::PitchLinearCoord(extent.row(), extent.column()),
-                  thread_id,
-                  layout::PitchLinearCoord(threadblock_offset.row(),
-                                           threadblock_offset.column())) {}
-
-  /// Construct a EllPredicatedTileAccessIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileAccessIterator(
-      Params const &params,  ///< Precomputed parameters object
-      Pointer pointer,       ///< Pointer to start of tensor
-      TensorCoord extent,    ///< Extent of tensor
-      int thread_id          ///< ID of each participating thread
-      )
-      : EllPredicatedTileAccessIterator(params, pointer, extent, thread_id,
-                                     make_Coord(0, 0)) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole
-  /// tiles
-  CUTLASS_HOST_DEVICE
-  void add_tile_offset(TensorCoord const &tile_offset) {
-    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return reinterpret_cast<AccessType *>(iterator_.get());
-  }
-
-  CUTLASS_HOST_DEVICE
-  int get_k() const {
-    return iterator_.get_k();
-  }
-  
-  CUTLASS_HOST_DEVICE
-  int get_stride() const {
-    return iterator_.get_stride();
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileAccessIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileAccessIterator operator++(int) {
-    EllPredicatedTileAccessIterator self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() { iterator_.enable_mask(); }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
-
-  /// add mask for small tiles in ELL
-  CUTLASS_DEVICE
-  void ell_add_mask(int blocksize) {
-    iterator_.ell_add_mask(blocksize);
-  }
-
-  /// Returns whether access is valid or not
-  CUTLASS_HOST_DEVICE
-  bool valid() {
-    return iterator_.valid();
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of EllPredicatedTileAccessIterator for pitch-linear data.
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, typename AccessType_>
-class EllPredicatedTileAccessIterator<Shape_, Element_, layout::RowMajor,
-                                   AdvanceRank, ThreadMap_, AccessType_> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::RowMajor;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Pointer = Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  using UnderlyingIterator = EllPredicatedTileAccessIterator<
-      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
-      layout::PitchLinear, (kAdvanceRank == 0 ? 1 : 0), ThreadMap, AccessType>;
-
-  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
-
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename UnderlyingIterator::Mask;
-
-  /// Parameters object is precomputed state and is host-constructible
-  class Params {
-   private:
-    friend EllPredicatedTileAccessIterator;
-
-    /// Parameters object
-    typename UnderlyingIterator::Params params_;
-
-   public:
-
-    /// Default ctor
-    CUTLASS_HOST_DEVICE
-    Params() { }
-
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout)
-        : params_(layout::PitchLinear(layout.stride(0))){};
-  };
-
- private:
-  //
-  // Data members
-  //
-
-  /// Underlying pitch-linear tile iterator
-  UnderlyingIterator iterator_;
-
- public:
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileAccessIterator(
-      ///< Precomputed parameters object
-      Params const &params,
-      ///< Pointer to start of tensor
-      Pointer pointer,
-      ///< Extent of tensor
-      TensorCoord extent,
-      ///< ID of each participating thread
-      int thread_id,
-      ///< Initial offset of threadblock
-      TensorCoord const &threadblock_offset)
-      : iterator_(params.params_, pointer,
-                  layout::PitchLinearCoord(extent.column(), extent.row()),
-                  thread_id,
-                  layout::PitchLinearCoord(threadblock_offset.column(),
-                                           threadblock_offset.row())) {}
-
-  /// Construct a EllPredicatedTileAccessIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileAccessIterator(
-      Params const &params,  ///< Precomputed parameters object
-      Pointer pointer,       ///< Pointer to start of tensor
-      TensorCoord extent,    ///< Extent of tensor
-      int thread_id          ///< ID of each participating thread
-      )
-      : EllPredicatedTileAccessIterator(params, pointer, extent, thread_id,
-                                     make_Coord(0, 0)) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole
-  /// tiles
-  CUTLASS_HOST_DEVICE
-  void add_tile_offset(TensorCoord const &tile_offset) {
-    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return reinterpret_cast<AccessType *>(iterator_.get());
-  }
-
-  CUTLASS_HOST_DEVICE
-  int get_k() const {
-    return iterator_.get_k();
-  }
-  
-  CUTLASS_HOST_DEVICE
-  int get_stride() const {
-    return iterator_.get_stride();
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileAccessIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileAccessIterator operator++(int) {
-    EllPredicatedTileAccessIterator self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() { iterator_.enable_mask(); }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
-
-  /// add mask for small tiles in ELL
-  CUTLASS_DEVICE
-  void ell_add_mask(int blocksize) {
-    iterator_.ell_add_mask(blocksize);
-  }
-  
-  /// Returns whether access is valid or not
-  CUTLASS_HOST_DEVICE
-  bool valid() {
-    return iterator_.valid();
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of EllPredicatedTileAccessIterator for column-major interleaved data.
-/// It is mapped to the congruous layout.
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, typename AccessType_, int InterleavedK>
-class EllPredicatedTileAccessIterator<Shape_, Element_,
-                                   layout::ColumnMajorInterleaved<InterleavedK>,
-                                   AdvanceRank, ThreadMap_, AccessType_> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  static int const kInterleavedK = InterleavedK;
-  using Layout = layout::ColumnMajorInterleaved<kInterleavedK>;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Pointer = Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  using UnderlyingIterator = EllPredicatedTileAccessIterator<
-      layout::PitchLinearShape<Shape::kRow * kInterleavedK,
-                               Shape::kColumn / kInterleavedK>,
-      Element, layout::PitchLinear, (kAdvanceRank == 0 ? 0 : 1), ThreadMap,
-      AccessType>;
-
-  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
-
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename UnderlyingIterator::Mask;
-
-  /// Parameters object is precomputed state and is host-constructible
-  class Params {
-   private:
-    friend EllPredicatedTileAccessIterator;
-
-    /// Parameters object
-    typename UnderlyingIterator::Params params_;
-
-   public:
-    CUTLASS_HOST_DEVICE
-    Params() {}
-
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout)
-        : params_(layout::PitchLinear(layout.stride(0))) {}
-  };
-
- private:
-  //
-  // Data members
-  //
-
-  /// Underlying pitch-linear tile iterator
-  UnderlyingIterator iterator_;
-
- public:
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileAccessIterator(
-      /// Precomputed parameters object
-      Params const &params,
-      /// Pointer to start of tensor
-      Pointer pointer,
-      /// Extent of tensor
-      TensorCoord extent,
-      /// ID of each participating thread
-      int thread_id,
-      /// Initial offset of threadblock
-      TensorCoord const &threadblock_offset)
-      : iterator_(params.params_, pointer,
-                  layout::PitchLinearCoord(extent.row() * kInterleavedK,
-                                           extent.column() / kInterleavedK),
-                  thread_id,
-                  layout::PitchLinearCoord(
-                      threadblock_offset.row() * kInterleavedK,
-                      threadblock_offset.column() / kInterleavedK)) {}
-
-  /// Construct a EllPredicatedTileAccessIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileAccessIterator(
-      Params const &params,  ///< Precomputed parameters object
-      Pointer pointer,       ///< Pointer to start of tensor
-      TensorCoord extent,    ///< Extent of tensor
-      int thread_id          ///< ID of each participating thread
-      )
-      : EllPredicatedTileAccessIterator(params, pointer, extent, thread_id,
-                                     make_Coord(0, 0)) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole
-  /// tiles
-  CUTLASS_HOST_DEVICE
-  void add_tile_offset(TensorCoord const &tile_offset) {
-    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return reinterpret_cast<AccessType *>(iterator_.get());
-  }
-
-  CUTLASS_HOST_DEVICE
-  int get_k() const {
-    return iterator_.get_k();
-  }
-  
-  CUTLASS_HOST_DEVICE
-  int get_stride() const {
-    return iterator_.get_stride();
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileAccessIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileAccessIterator operator++(int) {
-    EllPredicatedTileAccessIterator self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() { iterator_.enable_mask(); }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
-  
-  /// add mask for small tiles in ELL
-  CUTLASS_DEVICE
-  void ell_add_mask(int blocksize) {
-    iterator_.ell_add_mask(blocksize);
-  }
-
-  /// Returns whether access is valid or not
-  CUTLASS_HOST_DEVICE
-  bool valid() { return iterator_.valid(); }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of EllPredicatedTileAccessIterator for row-major interleaved data.
-/// It is mapped to the congruous layout.
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, typename AccessType_, int InterleavedK>
-class EllPredicatedTileAccessIterator<Shape_, Element_,
-                                   layout::RowMajorInterleaved<InterleavedK>,
-                                   AdvanceRank, ThreadMap_, AccessType_> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  static int const kInterleavedK = InterleavedK;
-  using Layout = layout::RowMajorInterleaved<kInterleavedK>;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Pointer = Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  using UnderlyingIterator = EllPredicatedTileAccessIterator<
-      layout::PitchLinearShape<Shape::kColumn * kInterleavedK,
-                               Shape::kRow / kInterleavedK>,
-      Element, layout::PitchLinear, (kAdvanceRank == 0 ? 1 : 0), ThreadMap,
-      AccessType>;
-
-
-  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
-
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename UnderlyingIterator::Mask;
-
-  /// Parameters object is precomputed state and is host-constructible
-  class Params {
-   private:
-    friend EllPredicatedTileAccessIterator;
-
-    /// Parameters object
-    typename UnderlyingIterator::Params params_;
-
-   public:
-    CUTLASS_HOST_DEVICE
-    Params() {}
-
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout)
-        : params_(layout::PitchLinear(layout.stride(0))) {}
-  };
-
- private:
-  //
-  // Data members
-  //
-
-  /// Underlying pitch-linear tile iterator
-  UnderlyingIterator iterator_;
-
- public:
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileAccessIterator(
-      /// Precomputed parameters object
-      Params const &params,
-      /// Pointer to start of tensor
-      Pointer pointer,
-      /// Extent of tensor
-      TensorCoord extent,
-      /// ID of each participating thread
-      int thread_id,
-      /// Initial offset of threadblock
-      TensorCoord const &threadblock_offset)
-      : iterator_(params.params_, pointer,
-                  layout::PitchLinearCoord(extent.column() * kInterleavedK,
-                                           extent.row() / kInterleavedK),
-                  thread_id,
-                  layout::PitchLinearCoord(
-                      threadblock_offset.column() * kInterleavedK,
-                      threadblock_offset.row() / kInterleavedK)) {}
-
-  /// Construct a EllPredicatedTileAccessIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileAccessIterator(
-      Params const &params,  ///< Precomputed parameters object
-      Pointer pointer,       ///< Pointer to start of tensor
-      TensorCoord extent,    ///< Extent of tensor
-      int thread_id          ///< ID of each participating thread
-      )
-      : EllPredicatedTileAccessIterator(params, pointer, extent, thread_id,
-                                     make_Coord(0, 0)) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole
-  /// tiles
-  CUTLASS_HOST_DEVICE
-  void add_tile_offset(TensorCoord const &tile_offset) {
-    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return reinterpret_cast<AccessType *>(iterator_.get());
-  }
-  
-  CUTLASS_HOST_DEVICE
-  int get_k() const {
-    return iterator_.get_k();
-  }
-  
-  CUTLASS_HOST_DEVICE
-  int get_stride() const {
-    return iterator_.get_stride();
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileAccessIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileAccessIterator operator++(int) {
-    EllPredicatedTileAccessIterator self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() { iterator_.enable_mask(); }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
-
-  /// add mask for small tiles in ELL
-  CUTLASS_DEVICE
-  void ell_add_mask(int blocksize) {
-    iterator_.ell_add_mask(blocksize);
-  }
-
-  /// Returns whether access is valid or not
-  CUTLASS_HOST_DEVICE
-  bool valid() { return iterator_.valid(); }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace transform
-}  // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/ell_predicated_tile_iterator.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/ell_predicated_tile_iterator.h
deleted file mode 100755
index 7c1b27b3d..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/ell_predicated_tile_iterator.h
+++ /dev/null
@@ -1,1315 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Ell iterator for Blocked-Ell matrix (ellValue matrix) used with EllMmaPipelined
-*/
-
-#pragma once
-
-#include "cutlass/arch/memory.h"
-#include "cutlass/transform/threadblock/predicated_tile_access_iterator.h"
-
-#include "cutlass/transform/threadblock/ell_predicated_tile_access_iterator.h"
-#include "cutlass/transform/threadblock/ell_iterator.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace transform {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// EllPredicatedTileIterator
-///
-/// Satisfies: ForwardTileIteratorConcept | 
-///            ReadableContiguousTileIteratorConcept | 
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-/// Regular tile iterator using a precomputed control structure to minimize register liveness
-/// and integer arithmetic.
-///
-/// Layout is assumed to be invariant at the time the precomputed "Params" object is constructed.
-///
-/// Base pointer and tensor extents may be specified at the time the iterator is constructed.
-/// Subsequently, they are assumed to be immutable.
-///
-/// Adding a logical coordinate offset may be performed at the time the iterator is constructed.
-/// Subsequent additions to logical coordinate offset may be performed but are relatively expensive.
-///
-/// Visitation order is intended to first visit a "residual" tile that may be partially full in
-/// both the advance dimension and the steady-state dimension. This is assumed to be the last
-/// tile in the iteration sequence. Advancing an iterator that has just been constructed moves to
-/// the first tile that is full in the advance dimension and recomputes predicates. Subsequent
-/// accesses may be performed without updating internal predicates and are efficient in terms of
-/// live register state and pointer arithmetic instructions.
-///
-/// To be efficient, this assumes the iterator will be dereferenced and advanced at least once
-/// outside any looping structure to minimize integer arithmetic. 
-///
-/// Acceses out of bounds are safe so long as `clear_mask()` is called prior to dereferencing
-/// the iterator.
-///
-///
-/// Example:
-///
-/// An efficient pipeline structure may be constructed as follows:
-///
-// template <typename Iterator>
-// __global__ void kernel(
-//   typename Iterator::Params params, 
-//   typename Iterator::Element *ptr,
-//   TensorCoord extent) {
-//
-//   typename Iterator::Fragment fragment;
-//
-//   TensorCoord threadblock_offset(0, 0);
-//
-//   Iterator iter(params, ptr, extent, threadIdx.x, threadblock_offsets);
-//
-//
-//   fragment = *iter;        // load "residue" tile first
-//   ++iter;                  // advance to first "steady state" tile and update internal masks
-//
-//
-//   #pragma unroll
-//   for (int i = Remaining - 1; i >= 0; --i) {
-//
-//     f(fragment);
-//
-//     if (!i) {
-//       iter.clear_mask();   // light-weight operation to clear masks - subsequent loads become NO-OPs.
-//     }
-//  
-//     fragment = *iter;      // load tile during "steady state" phase
-//     ++iter;                // advance to next tile - lightweight due to steady-state masks
-//   }
-// }
-//
-// void host(TensorView<Element, 2, layout::PitchLinear> view) {
-//
-//   using Iterator = transform::threadblock::EllPredicatedTileIterator;
-//
-//   typename Iterator::Params params(view.layout());
-//
-//   kernel<Iterator>(params, view.data());
-// }
-///
-///
-template <
-  typename Shape,
-  typename Element,
-  typename Layout,
-  int AdvanceRank,
-  typename ThreadMap,
-  int AccessSize = ThreadMap::kElementsPerAccess
->
-class EllPredicatedTileIterator;
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of EllPredicatedTileIterator for pitch-linear data.
-///
-/// Satisfies: ForwardTileIteratorConcept | 
-///            ReadableContiguousTileIteratorConcept | 
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, int AccessSize>
-class EllPredicatedTileIterator<Shape_, Element_, layout::PitchLinear, AdvanceRank,
-                             ThreadMap_, AccessSize> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::PitchLinear;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Pointer = Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  /// Type used for internal memory accesses
-  using AccessType = AlignedArray<Element, AccessSize, (AccessSize * sizeof_bits<Element>::value / 8)>;
-
-  /// Underlying iterator to compute the addresses
-  using TileAccessIterator =
-      EllPredicatedTileAccessIterator<Shape, Element, Layout, kAdvanceRank,
-                                   ThreadMap, AccessType>;
-
-  static int const kAccessesPerVector = TileAccessIterator::kAccessesPerVector;
-
-  /// Fragment object to be loaded or stored
-  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount *
-                                               ThreadMap::kElementsPerAccess>;
-
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename TileAccessIterator::Mask;
-
-  /// Iterator for ELL storage
-  using EllIterator = typename cutlass::transform::threadblock::ell::Iterator; 
-
-  /// Parameters object is precomputed state and is host-constructible
-  class Params {
-   public:
-    friend EllPredicatedTileIterator;
-
-   private:
-    /// Parameters object
-    typename TileAccessIterator::Params params_;
-
-   public:
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout) : params_(layout) { }
-    
-    CUTLASS_HOST_DEVICE
-    Params() { }
-  };
-
- private:
-  /// Internal pointer type permits fast address arithmetic
-  using BytePointer = char *;
-
- private:
-  //
-  // Data members
-  //
-
-  /// Data member to the tile access iterator
-  TileAccessIterator address_iterator_;
-
- public:
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileIterator(
-      /// Precomputed parameters object
-      Params const &params,
-      /// Pointer to start of tensor
-      Pointer pointer,
-      /// Extent of tensor
-      TensorCoord extent,
-      /// ID of each participating thread
-      int thread_id,
-      /// Initial offset of threadblock
-      TensorCoord const &threadblock_offset)
-      : address_iterator_(params.params_, pointer, extent, thread_id,
-                          threadblock_offset) {}
-
-  /// Construct a EllPredicatedTileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileIterator(
-      Params const &params,  ///< Precomputed parameters object
-      Pointer pointer,       ///< Pointer to start of tensor
-      TensorCoord extent,    ///< Extent of tensor
-      int thread_id          ///< ID of each participating thread
-      )
-      : EllPredicatedTileIterator(params, pointer, extent, thread_id,
-                               make_Coord(0, 0)) {}
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    address_iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileIterator &operator++() {
-    if (kAdvanceRank)
-      address_iterator_.add_tile_offset({0, 1});
-    else
-      address_iterator_.add_tile_offset({1, 0});
-
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileIterator operator++(int) {
-    EllPredicatedTileIterator self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Returns a stride
-  CUTLASS_HOST_DEVICE
-  int get_stride() const { return address_iterator_.get_stride(); }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) { address_iterator_.clear_mask(enable); }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() { address_iterator_.enable_mask(); }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) { address_iterator_.set_mask(mask); }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) { address_iterator_.get_mask(mask); }
-
-  /// add mask for small tiles in ELL
-  CUTLASS_HOST_DEVICE
-  void ell_add_mask(int blocksize) { address_iterator_.ell_add_mask(blocksize); }
-
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    load_with_byte_offset(frag, pointer_offset * sizeof_bits<Element>::value / 8);
-  }
-
-  CUTLASS_DEVICE
-  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
-
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < kAccessesPerVector; ++v) {
-
-          int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
-          
-          address_iterator_.set_iteration_index(idx);
-          char const *byte_ptr = reinterpret_cast<char const *>(address_iterator_.get()) + byte_offset;
-
-          AccessType const *access_ptr = reinterpret_cast<AccessType const *>(byte_ptr);
-
-          cutlass::arch::global_load<AccessType,
-                                     sizeof(AccessType)
-                                    >(
-              frag_ptr[idx], access_ptr, address_iterator_.valid());
-
-          ++address_iterator_;
-        }
-      }
-    }
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) { load_with_byte_offset(frag, 0); }
-
-  CUTLASS_DEVICE
-  void load_with_ell_index(Fragment &frag, EllIterator &ell_iter) {
-
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-    
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < kAccessesPerVector; ++v) {
-
-          int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
-          address_iterator_.set_iteration_index(idx);
-          LongIndex ell_offset = 0;
-
-          int k_offset = address_iterator_.get_k();
-          ell_offset = ell_iter.get_offset(k_offset) * sizeof(Element);
-          
-          char const *byte_ptr = reinterpret_cast<char const *>(address_iterator_.get()) + ell_offset;
-
-          AccessType const *access_ptr = reinterpret_cast<AccessType const *>(byte_ptr);
-
-          bool is_valid = address_iterator_.valid();
-          is_valid = is_valid && (ell_offset >= 0);
-
-          cutlass::arch::global_load<AccessType,
-                                     sizeof(AccessType)
-                                    >(
-              frag_ptr[idx], access_ptr, is_valid);
-
-          ++address_iterator_;
-        }
-      }
-    }
-  }
-  
-  CUTLASS_DEVICE
-  void load_with_ell_index_fast(Fragment &frag, EllIterator &ell_iter) {
-
-    LongIndex ell_offset = ell_iter.get_offset_fast() * sizeof(Element);
-
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-    
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < kAccessesPerVector; ++v) {
-
-          int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
-
-          address_iterator_.set_iteration_index(idx);
-          char const *byte_ptr = reinterpret_cast<char const *>(address_iterator_.get()) + ell_offset;
-
-          AccessType const *access_ptr = reinterpret_cast<AccessType const *>(byte_ptr);
-
-          bool is_valid = address_iterator_.valid();
-          is_valid = is_valid && (ell_offset >= 0);
-
-          cutlass::arch::global_load<AccessType,
-                                     sizeof(AccessType)
-                                    >(
-              frag_ptr[idx], access_ptr, is_valid);
-
-          ++address_iterator_;
-        }
-      }
-    }
-  }
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-    store_with_byte_offset(frag, pointer_offset * sizeof_bits<Element>::value / 8);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
-    address_iterator_.set_iteration_index(0);
-    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < kAccessesPerVector; ++v) {
-
-          int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
-
-          char *byte_ptr = reinterpret_cast<char *>(address_iterator_.get()) + byte_offset;
-          AccessType *access_ptr = reinterpret_cast<AccessType *>(byte_ptr);
-
-          if (address_iterator_.valid()) {
-            *access_ptr = frag_ptr[idx];
-          }
-          ++address_iterator_;
-        }
-      }
-    }
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) { store_with_byte_offset(frag, 0); }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of EllPredicatedTileIterator for pitch-linear data.
-///
-/// Satisfies: ForwardTileIteratorConcept | 
-///            ReadableContiguousTileIteratorConcept | 
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <
-  typename Shape_,
-  typename Element_,
-  int AdvanceRank,
-  typename ThreadMap_,
-  int AccessSize
->
-class EllPredicatedTileIterator<Shape_, Element_, layout::ColumnMajor, AdvanceRank, ThreadMap_, AccessSize> {
-public:
-
-  static_assert(AdvanceRank == 0 || AdvanceRank == 1, 
-    "Specialization for pitch-linear iterator may along advance along the "
-    "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::ColumnMajor;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Pointer = Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  using UnderlyingIterator = EllPredicatedTileIterator<
-    layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
-    Element,
-    layout::PitchLinear,
-    (kAdvanceRank == 0 ? 0 : 1),
-    ThreadMap,
-    AccessSize
-  >;
-
-  using AccessType = typename UnderlyingIterator::AccessType;
-
-  /// Fragment object to be loaded or stored
-  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
-
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename UnderlyingIterator::Mask;
-
-  /// Iterator for ELL storage
-  using EllIterator = typename cutlass::transform::threadblock::ell::Iterator; 
-  
-  /// Parameters object is precomputed state and is host-constructible
-  class Params {
-  private:
-
-    friend EllPredicatedTileIterator;
-
-    /// Parameters object
-    typename UnderlyingIterator::Params params_;
-
-  public:
-    
-    CUTLASS_HOST_DEVICE
-    Params() { }
-
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout): params_(layout::PitchLinear(layout.stride(0))) {
-
-    }
-  };
-
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Underlying pitch-linear tile iterator
-  UnderlyingIterator iterator_;
-
-public:
-
-  /// Constructs a TileIterator from its precomputed state, threadblock offset, and thread ID
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileIterator(
-    Params const &params,                         ///< Precomputed parameters object 
-    Pointer pointer,                              ///< Pointer to start of tensor
-    TensorCoord extent,                           ///< Extent of tensor
-    int thread_id,                                ///< ID of each participating thread
-    TensorCoord const &threadblock_offset         ///< Initial offset of threadblock
-  ):
-    iterator_(
-      params.params_,
-      pointer,
-      layout::PitchLinearCoord(extent.row(), extent.column()),
-      thread_id,
-      layout::PitchLinearCoord(threadblock_offset.row(), threadblock_offset.column())
-    ) { }
-
-  /// Construct a EllPredicatedTileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileIterator(
-    Params const &params,                         ///< Precomputed parameters object
-    Pointer pointer,                              ///< Pointer to start of tensor
-    TensorCoord extent,                           ///< Extent of tensor
-    int thread_id                                 ///< ID of each participating thread
-  ): EllPredicatedTileIterator(params, pointer, extent, thread_id, make_Coord(0, 0)) { }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the iterator's
-  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
-  /// are lightweight and must only update the internal pointer.
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the iterator's
-  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
-  /// are lightweight and must only update the internal pointer.
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileIterator operator++(int) {
-    EllPredicatedTileIterator self(*this);
-    operator++();
-    return self;
-  }
-  
-  /// Returns a stride
-  CUTLASS_HOST_DEVICE
-  int get_stride() const { return iterator_.get_stride(); }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) {
-    iterator_.clear_mask(enable);
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() {
-    iterator_.enable_mask();
-  }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) {
-    iterator_.set_mask(mask);
-  }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) {
-    iterator_.get_mask(mask);
-  }
-
-  /// add mask for small tiles in ELL
-  CUTLASS_HOST_DEVICE
-  void ell_add_mask(int blocksize) { 
-    iterator_.ell_add_mask(blocksize); 
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
-    iterator_.load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) {
-    load_with_pointer_offset(frag, 0);
-  }
-
-  CUTLASS_DEVICE
-  void load_with_ell_index(Fragment &frag, EllIterator& ell_iter) {
-    iterator_.load_with_ell_index(frag, ell_iter);
-  }
-  
-  CUTLASS_DEVICE
-  void load_with_ell_index_fast(Fragment &frag, EllIterator& ell_iter) {
-    iterator_.load_with_ell_index_fast(frag, ell_iter);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-    iterator_.store_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
-    iterator_.store_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of EllPredicatedTileIterator for pitch-linear data.
-///
-/// Satisfies: ForwardTileIteratorConcept | 
-///            ReadableContiguousTileIteratorConcept | 
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <
-  typename Shape_,
-  typename Element_,
-  int AdvanceRank,
-  typename ThreadMap_,
-  int AccessSize
->
-class EllPredicatedTileIterator<Shape_, Element_, layout::RowMajor, AdvanceRank, ThreadMap_, AccessSize> {
-public:
-
-  static_assert(AdvanceRank == 0 || AdvanceRank == 1, 
-    "Specialization for pitch-linear iterator may along advance along the "
-    "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::RowMajor;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Pointer = Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  using UnderlyingIterator = EllPredicatedTileIterator<
-    layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
-    Element,
-    layout::PitchLinear,
-    (kAdvanceRank == 0 ? 1 : 0),
-    ThreadMap,
-    AccessSize
-  >;
-
-  using AccessType = typename UnderlyingIterator::AccessType;
-
-  /// Fragment object to be loaded or stored
-  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
-
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename UnderlyingIterator::Mask;
-
-  /// Iterator for ELL storage
-  using EllIterator = typename cutlass::transform::threadblock::ell::Iterator; 
-  
-  /// Parameters object is precomputed state and is host-constructible
-  class Params {
-  private:
-
-    friend EllPredicatedTileIterator;
-
-    /// Parameters object
-    typename UnderlyingIterator::Params params_;
-
-  public:
-    
-    CUTLASS_HOST_DEVICE
-    Params() { } 
-
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout): params_(layout::PitchLinear(layout.stride(0))) {
-
-    };
-  };
-
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Underlying pitch-linear tile iterator
-  UnderlyingIterator iterator_;
-
-public:
-
-  /// Constructs a TileIterator from its precomputed state, threadblock offset, and thread ID
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileIterator(
-    Params const &params,                         ///< Precomputed parameters object 
-    Pointer pointer,                              ///< Pointer to start of tensor
-    TensorCoord extent,                           ///< Extent of tensor
-    int thread_id,                                ///< ID of each participating thread
-    TensorCoord const &threadblock_offset         ///< Initial offset of threadblock
-  ):
-    iterator_(
-      params.params_,
-      pointer,
-      layout::PitchLinearCoord(extent.column(), extent.row()),
-      thread_id,
-      layout::PitchLinearCoord(threadblock_offset.column(), threadblock_offset.row())
-    ) { }
-
-  /// Construct a EllPredicatedTileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileIterator(
-    Params const &params,                         ///< Precomputed parameters object
-    Pointer pointer,                              ///< Pointer to start of tensor
-    TensorCoord extent,                           ///< Extent of tensor
-    int thread_id                                 ///< ID of each participating thread
-  ): EllPredicatedTileIterator(params, pointer, extent, thread_id, make_Coord(0, 0)) { }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the iterator's
-  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
-  /// are lightweight and must only update the internal pointer.
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the iterator's
-  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
-  /// are lightweight and must only update the internal pointer.
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileIterator operator++(int) {
-    EllPredicatedTileIterator self(*this);
-    operator++();
-    return self;
-  }
-  
-  /// Returns a stride
-  CUTLASS_HOST_DEVICE
-  int get_stride() const { return iterator_.get_stride(); }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) {
-    iterator_.clear_mask(enable);
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() {
-    iterator_.enable_mask();
-  }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) {
-    iterator_.set_mask(mask);
-  }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) {
-    iterator_.get_mask(mask);
-  }
-
-  /// add mask for small tiles in ELL
-  CUTLASS_HOST_DEVICE
-  void ell_add_mask(int blocksize) { 
-    iterator_.ell_add_mask(blocksize); 
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
-    iterator_.load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) {
-    load_with_pointer_offset(frag, 0);
-  }
-
-  CUTLASS_DEVICE
-  void load_with_ell_index(Fragment &frag, EllIterator& ell_iter) {
-    iterator_.load_with_ell_index(frag, ell_iter);
-  }
-
-  CUTLASS_DEVICE
-  void load_with_ell_index_fast(Fragment &frag, EllIterator& ell_iter) {
-    iterator_.load_with_ell_index_fast(frag, ell_iter);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-    iterator_.store_with_pointer_offset(frag, pointer_offset);
-  }
-  
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
-    iterator_.store_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of EllPredicatedTileIterator for interleaved data.  It is mapped
-/// to the congruous layout.
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, int AccessSize, int InterleavedK>
-class EllPredicatedTileIterator<Shape_, Element_,
-                             layout::ColumnMajorInterleaved<InterleavedK>,
-                             AdvanceRank, ThreadMap_, AccessSize> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  static int const kInterleavedK = InterleavedK;
-  using Layout = layout::ColumnMajorInterleaved<kInterleavedK>;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Pointer = Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  using UnderlyingIterator = EllPredicatedTileIterator<
-      layout::PitchLinearShape<Shape::kRow * kInterleavedK,
-                               Shape::kColumn / kInterleavedK>,
-      Element, layout::PitchLinear, (kAdvanceRank == 0 ? 0 : 1), ThreadMap, AccessSize>;
-
-
-  using AccessType = typename UnderlyingIterator::AccessType;
-
-  /// Fragment object to be loaded or stored
-  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount *
-                                               ThreadMap::kElementsPerAccess>;
-
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename UnderlyingIterator::Mask;
-
-  /// Iterator for ELL storage
-  using EllIterator = typename cutlass::transform::threadblock::ell::Iterator; 
-  
-  /// Parameters object is precomputed state and is host-constructible
-  class Params {
-   private:
-    friend EllPredicatedTileIterator;
-
-    /// Parameters object
-    typename UnderlyingIterator::Params params_;
-
-   public:
-    CUTLASS_HOST_DEVICE
-    Params() {}
-
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout)
-        : params_(layout::PitchLinear(layout.stride(0))) {}
-  };
-
- private:
-  //
-  // Data members
-  //
-
-  /// Underlying pitch-linear tile iterator
-  UnderlyingIterator iterator_;
-
- public:
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileIterator(
-      /// Precomputed parameters object
-      Params const &params,
-      /// Pointer to start of tensor
-      Pointer pointer,
-      /// Extent of tensor
-      TensorCoord extent,
-      /// ID of each participating thread
-      int thread_id,
-      /// Initial offset of threadblock
-      TensorCoord const &threadblock_offset)
-      : iterator_(params.params_, pointer,
-                  layout::PitchLinearCoord(extent.row() * kInterleavedK,
-                                           extent.column() / kInterleavedK),
-                  thread_id,
-                  layout::PitchLinearCoord(
-                      threadblock_offset.row() * kInterleavedK,
-                      threadblock_offset.column() / kInterleavedK)) {}
-
-  /// Construct a EllPredicatedTileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileIterator(
-      Params const &params,  ///< Precomputed parameters object
-      Pointer pointer,       ///< Pointer to start of tensor
-      TensorCoord extent,    ///< Extent of tensor
-      int thread_id          ///< ID of each participating thread
-      )
-      : EllPredicatedTileIterator(params, pointer, extent, thread_id,
-                               make_Coord(0, 0)) {}
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileIterator operator++(int) {
-    EllPredicatedTileIterator self(*this);
-    operator++();
-    return self;
-  }
-  
-  /// Returns a stride
-  CUTLASS_HOST_DEVICE
-  int get_stride() const { return iterator_.get_stride(); }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() { iterator_.enable_mask(); }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
-
-  /// add mask for small tiles in ELL
-  CUTLASS_HOST_DEVICE
-  void ell_add_mask(int blocksize) { iterator_.ell_add_mask(blocksize); }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  CUTLASS_DEVICE
-  void load_with_ell_index(Fragment &frag, EllIterator& ell_iter) {
-    iterator_.load_with_ell_index(frag, ell_iter);
-  }
-
-  CUTLASS_DEVICE
-  void load_with_ell_index_fast(Fragment &frag, EllIterator& ell_iter) {
-    iterator_.load_with_ell_index_fast(frag, ell_iter);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-    iterator_.store_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of EllPredicatedTileIterator for interleaved-32 data.  It is
-/// mapped to the congruous layout.
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, int AccessSize, int InterleavedK>
-class EllPredicatedTileIterator<Shape_, Element_,
-                             layout::RowMajorInterleaved<InterleavedK>,
-                             AdvanceRank, ThreadMap_, AccessSize> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  static int const kInterleavedK = InterleavedK;
-  using Layout = layout::RowMajorInterleaved<kInterleavedK>;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Pointer = Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  using UnderlyingIterator = EllPredicatedTileIterator<
-      layout::PitchLinearShape<Shape::kColumn * kInterleavedK,
-                               Shape::kRow / kInterleavedK>,
-      Element, layout::PitchLinear, (kAdvanceRank == 0 ? 1 : 0), ThreadMap, AccessSize>;
-
-
-  using AccessType = typename UnderlyingIterator::AccessType;
-  
-  /// Fragment object to be loaded or stored
-  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount *
-                                               ThreadMap::kElementsPerAccess>;
-
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename UnderlyingIterator::Mask;
-
-  /// Parameters object is precomputed state and is host-constructible
-  class Params {
-   private:
-    friend EllPredicatedTileIterator;
-
-    /// Parameters object
-    typename UnderlyingIterator::Params params_;
-
-   public:
-    CUTLASS_HOST_DEVICE
-    Params() {}
-
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout)
-        : params_(layout::PitchLinear(layout.stride(0))) {}
-  };
-
- private:
-  //
-  // Data members
-  //
-
-  /// Underlying pitch-linear tile iterator
-  UnderlyingIterator iterator_;
-
- public:
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileIterator(
-      /// Precomputed parameters object
-      Params const &params,
-      /// Pointer to start of tensor
-      Pointer pointer,
-      /// Extent of tensor
-      TensorCoord extent,
-      /// ID of each participating thread
-      int thread_id,
-      /// Initial offset of threadblock
-      TensorCoord const &threadblock_offset)
-      : iterator_(params.params_, pointer,
-                  layout::PitchLinearCoord(extent.column() * kInterleavedK,
-                                           extent.row() / kInterleavedK),
-                  thread_id,
-                  layout::PitchLinearCoord(
-                      threadblock_offset.column() * kInterleavedK,
-                      threadblock_offset.row() / kInterleavedK)) {}
-
-  /// Construct a EllPredicatedTileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileIterator(
-      Params const &params,  ///< Precomputed parameters object
-      Pointer pointer,       ///< Pointer to start of tensor
-      TensorCoord extent,    ///< Extent of tensor
-      int thread_id          ///< ID of each participating thread
-      )
-      : EllPredicatedTileIterator(params, pointer, extent, thread_id,
-                               make_Coord(0, 0)) {}
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  EllPredicatedTileIterator operator++(int) {
-    EllPredicatedTileIterator self(*this);
-    operator++();
-    return self;
-  }
-  
-  /// Returns a stride
-  CUTLASS_HOST_DEVICE
-  int get_stride() const { return iterator_.get_stride(); }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() { iterator_.enable_mask(); }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
-
-  /// add mask for small tiles in ELL
-  CUTLASS_HOST_DEVICE
-  void ell_add_mask(int blocksize) { iterator_.ell_add_mask(blocksize); }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-    iterator_.store_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace transform
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_scale_bias_vector_access_iterator.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_scale_bias_vector_access_iterator.h
deleted file mode 100755
index 366897c65..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_scale_bias_vector_access_iterator.h
+++ /dev/null
@@ -1,375 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Templates calculating the address and predicates to the load of scale and bias vectors.
-
-    This iterator uses masks to guard out-of-bounds accesses.
-
-    It can be used to load the gamma and beta vectors of layernorm which is loop variant.
-
-    A precomputed "Params" object minimizes the amount of state that must be
-   stored in registers, and integer addition is used to advance the pointer
-   through memory.
-*/
-
-#pragma once
-
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/conv/threadblock/conv2d_params.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace transform {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// PredicatedScaleBiasVectorAccessIterator
-///
-template <typename ThreadblockShape,
-          typename Element,
-          typename Layout>
-class PredicatedScaleBiasVectorAccessIterator;
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileAccessIterator for fprop pitch-linear data.
-///
-template <typename ThreadblockShape_, typename Element_>
-class PredicatedScaleBiasVectorAccessIterator<ThreadblockShape_,
-                                              Element_,
-                                              layout::PitchLinear> {
- public:
-
-  using ThreadblockShape = ThreadblockShape_;
-  using Element = Element_;
-  using Layout = layout::PitchLinear;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ConstPointer = const Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  static int const kElementsPerAccess = 128 / sizeof_bits<Element>::value;
-  static int const kThreads = ThreadblockShape::kContiguous / kElementsPerAccess;
-
-  using AccessType = AlignedArray<Element, kElementsPerAccess>;
-
- private:
-  /// Internal pointer type permits fast address arithmetic
-  using BytePointer = char *;
-
- private:
-  //
-  // Data members
-  //
-
-  /// Internal pointer to first access of tile
-  BytePointer pointer_;
-
-  TensorCoord thread_offset_;
-
-  int problem_size_k_;
-
-  /// Used for out-of-order visitation
-  bool is_residue_tile_;
-
-  bool guard_;
-
-  TensorCoord::Index residue_size_;
-
- public:
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedScaleBiasVectorAccessIterator(
-      /// Extent of tensor
-      int problem_size_k,
-      /// Pointer to the start of the scale vector
-      ConstPointer scale_pointer,
-      /// Pointer to the start of the bias vector
-      ConstPointer bias_pointer,
-      /// ID of each participating thread
-      int thread_id,
-      /// Initial offset of threadblock
-      TensorCoord const &threadblock_offset) {
-    pointer_ = (thread_id < kThreads)
-                   ? reinterpret_cast<BytePointer>(
-                         const_cast<NonConstPointer>(scale_pointer))
-                   : reinterpret_cast<BytePointer>(
-                         const_cast<NonConstPointer>(bias_pointer));
-
-    // Per-thread offset in logical coordinates of tensor
-    int thread_base = (thread_id < kThreads) ? 0 : kThreads;
-
-    problem_size_k_ = problem_size_k;
-
-    is_residue_tile_ = true;
-
-    residue_size_ = (problem_size_k_ - threadblock_offset.contiguous()) % ThreadblockShape::kContiguous;
-
-    if (residue_size_ == 0) {
-      residue_size_ = ThreadblockShape::kContiguous;
-    }
-
-    guard_ = ((thread_id - thread_base) * kElementsPerAccess) < residue_size_;
-
-    thread_offset_ =
-        threadblock_offset +
-        TensorCoord((thread_id - thread_base) * kElementsPerAccess, 0);
-
-    set_iteration_index(0);
-  }
-
-  /// Construct a PredicatedTileAccessIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  PredicatedScaleBiasVectorAccessIterator(
-      /// Extent of tensor
-      int problem_size_k,
-      /// Pointer to start of scale vector
-      ConstPointer scale_pointer,
-      /// Pointer to start of scale vector
-      ConstPointer bias_pointer,
-      ///< ID of each participating thread
-      int thread_id)
-      : PredicatedScaleBiasVectorAccessIterator(problem_size_k,
-                                                scale_pointer, bias_pointer,
-                                                thread_id, make_Coord(0, 0)) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) {}
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole threadblock tiles
-  CUTLASS_DEVICE
-  void add_tile_offset(
-      TensorCoord const &tile_offset) {
-
-    guard_ = threadIdx.x < kThreads * 2;
-
-    TensorCoord offset = is_residue_tile_ ?
-      TensorCoord(residue_size_ + ThreadblockShape::kContiguous * (tile_offset.contiguous() - 1), 0)
-      : TensorCoord(ThreadblockShape::kContiguous * tile_offset.contiguous(), 0);
-
-    thread_offset_ =
-        thread_offset_ +
-        offset;
-
-    is_residue_tile_ = false;
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-
-    return reinterpret_cast<AccessType *>(
-        pointer_ +
-        (thread_offset_.contiguous() * sizeof_bits<Element>::value / 8));
-  }
-
-  /// Increment and return an instance to self.
-  CUTLASS_HOST_DEVICE
-  PredicatedScaleBiasVectorAccessIterator &operator++() {
-    return *this;
-  }
-
-  /// Increment and return an instance to self.
-  CUTLASS_DEVICE
-  PredicatedScaleBiasVectorAccessIterator operator++(int) {
-    PredicatedScaleBiasVectorAccessIterator self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) {
-    guard_ &= (!enable);
-  }
-
-  /// Returns whether access is valid or not
-  CUTLASS_HOST_DEVICE
-  bool valid() {
-    return guard_;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileAccessIterator for row-major data.
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <typename ThreadblockShape_,
-          typename Element_>
-class PredicatedScaleBiasVectorAccessIterator<ThreadblockShape_,
-                                        Element_,
-                                        layout::RowMajor> {
- public:
-
-  using ThreadblockShape = ThreadblockShape_;
-  using Element = Element_;
-  using Layout = layout::RowMajor;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ConstPointer = const Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  using UnderlyingIterator = PredicatedScaleBiasVectorAccessIterator<
-      layout::PitchLinearShape<ThreadblockShape::kColumn, ThreadblockShape::kRow>,
-      Element,
-      layout::PitchLinear>;
-
-  using AccessType = typename UnderlyingIterator::AccessType;
-  static int const kElementsPerAccess = UnderlyingIterator::kElementsPerAccess;
-
- private:
-  //
-  // Data members
-  //
-
-  /// Underlying pitch-linear tile iterator
-  UnderlyingIterator iterator_;
-
- public:
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedScaleBiasVectorAccessIterator(
-      ///< Extent of tensor
-      int problem_size_k,
-      ///< Pointer to the start of the scale vector
-      ConstPointer scale_pointer,
-      ///< Pointer to the start of the bias vector
-      ConstPointer bias_pointer,
-      ///< ID of each participating thread
-      int thread_id,
-      ///< Initial offset of threadblock
-      TensorCoord const &threadblock_offset)
-      : iterator_(problem_size_k, scale_pointer, bias_pointer,
-                  thread_id,
-                  layout::PitchLinearCoord(threadblock_offset.column(),
-                                           threadblock_offset.row())) {}
-
-  /// Construct a PredicatedTileAccessIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  PredicatedScaleBiasVectorAccessIterator(
-      int problem_size_k,  ///< Extent of tensor
-      ConstPointer scale_pointer,  ///< Pointer to the start of the scale vector
-      ConstPointer bias_pointer,   ///< Pointer to the start of the bias vector
-      int thread_id                ///< ID of each participating thread
-      )
-      : PredicatedScaleBiasVectorAccessIterator(problem_size_k,
-                                                scale_pointer, bias_pointer,
-                                                thread_id, make_Coord(0, 0)) {}
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole
-  /// threadblock tiles
-  CUTLASS_HOST_DEVICE
-  void add_tile_offset(TensorCoord const &tile_offset) {
-    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return reinterpret_cast<AccessType *>(iterator_.get());
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedScaleBiasVectorAccessIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedScaleBiasVectorAccessIterator operator++(int) {
-    PredicatedScaleBiasVectorAccessIterator self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) {
-    iterator_.clear_mask(enable);
-  }
-
-  /// Returns whether access is valid or not
-  CUTLASS_HOST_DEVICE
-  bool valid() {
-    return iterator_.valid();
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace transform 
-}  // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_scale_bias_vector_iterator.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_scale_bias_vector_iterator.h
deleted file mode 100755
index 54b0ecf5e..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_scale_bias_vector_iterator.h
+++ /dev/null
@@ -1,328 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Templates calculating the address and predicates to the load of scale and bias vectors.
-
-    This iterator uses masks to guard out-of-bounds accesses.
-
-    This can be used to load var and mean vectors in layernorm which is loop invariant.
-
-    A precomputed "Params" object minimizes the amount of state that must be
-   stored in registers, and integer addition is used to advance the pointer
-   through memory.
-*/
-
-#pragma once
-
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace transform {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// PredicatedScaleBiasVectorIterator
-///
-template <typename WarpShape,
-          typename Element,
-          typename Layout>
-class PredicatedScaleBiasVectorIterator;
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileIterator for wgrad pitch-linear data.
-///
-template <typename WarpShape_, typename Element_>
-class PredicatedScaleBiasVectorIterator<WarpShape_,
-                                        Element_,
-                                        layout::PitchLinear> {
- public:
-
-  using WarpShape = WarpShape_;
-  using Element = Element_;
-  using Layout = layout::PitchLinear;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ConstPointer = const Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  static int const kElementsPerAccess = 1;
-
-  using AccessType = AlignedArray<Element, kElementsPerAccess>;
-
-  static int const kIterations = WarpShape::kContiguous / 8;
-
-  /// Fragment object to be loaded or stored
-  using Fragment = cutlass::Array<__half2, 2 * kIterations * kElementsPerAccess>;
-
- private:
-  //
-  // Data members
-  //
-
-  /// Internal pointer to first access of tile
-  ConstPointer scale_pointer_;
-  ConstPointer bias_pointer_;
-
-  /// Size of tensor
-  int problem_size_;
-
-  int32_t thread_offset_;
-
- public:
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedScaleBiasVectorIterator(
-      /// Extent of tensor
-      int problem_size,
-      /// Pointer to the start of the scale vector
-      ConstPointer scale_pointer,
-      /// Pointer to the start of the bias vector
-      ConstPointer bias_pointer,
-      /// ID of each participating thread
-      int thread_id,
-      /// Initial offset of threadblock
-      TensorCoord const &threadblock_offset)
-      : problem_size_(problem_size),
-        scale_pointer_(scale_pointer),
-        bias_pointer_(bias_pointer) {
-
-    thread_offset_ = threadblock_offset.contiguous() + (thread_id % 32) / 4;
-  }
-
-  /// Construct a PredicatedTileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  PredicatedScaleBiasVectorIterator(
-      /// Extent of tensor
-      int problem_size,
-      /// Pointer to start of scale vector
-      ConstPointer scale_pointer,
-      /// Pointer to start of scale vector
-      ConstPointer bias_pointer,
-      ///< ID of each participating thread
-      int thread_id)
-      : PredicatedScaleBiasVectorIterator(problem_size,
-                                          scale_pointer, bias_pointer,
-                                          thread_id, make_Coord(0, 0)) {}
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole warp tiles
-  CUTLASS_DEVICE
-  void add_tile_offset(
-      TensorCoord const &tile_offset) {
-
-    thread_offset_ += (WarpShape::kContiguous * tile_offset.contiguous());
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-
-    frag.fill(__float2half2_rn(0.0f));
-    __half2 *frag_ptr = reinterpret_cast<__half2 *>(&frag);
-
-    // load scale
-    CUTLASS_PRAGMA_UNROLL
-    for (int c = 0; c < kIterations; ++c) {
-
-      cutlass::arch::global_load<
-        __half,
-        sizeof(AccessType)
-      >(
-        frag_ptr[c * 2].x,
-        scale_pointer_ + thread_offset_ + c * 8,
-        (thread_offset_ + c * 8) < problem_size_ 
-      );
-    }
-
-    // load bias
-    CUTLASS_PRAGMA_UNROLL
-    for (int c = 0; c < kIterations; ++c) {
-
-      cutlass::arch::global_load<
-        __half,
-        sizeof(AccessType)
-      >(
-        frag_ptr[c * 2 + 1].x,
-        bias_pointer_ + thread_offset_ + c * 8,
-        (thread_offset_ + c * 8) < problem_size_ 
-      );
-    }
-
-    // duplicate scale
-    CUTLASS_PRAGMA_UNROLL
-    for (int c = 0; c < kIterations; ++c) {
-      frag_ptr[c * 2].y = frag_ptr[c * 2].x;
-    }
-
-    // duplicate bias
-    CUTLASS_PRAGMA_UNROLL
-    for (int c = 0; c < kIterations; ++c) {
-      frag_ptr[c * 2 + 1].y = frag_ptr[c * 2 + 1].x;
-    }
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) {
-    load_with_pointer_offset(frag, 0);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileIterator for row-major data.
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <typename WarpShape_,
-          typename Element_>
-class PredicatedScaleBiasVectorIterator<WarpShape_,
-                                        Element_,
-                                        layout::RowMajor> {
- public:
-
-  using WarpShape = WarpShape_;
-  using Element = Element_;
-  using Layout = layout::RowMajor;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ConstPointer = const Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  using UnderlyingIterator = PredicatedScaleBiasVectorIterator<
-      layout::PitchLinearShape<WarpShape::kColumn, WarpShape::kRow>,
-      Element,
-      layout::PitchLinear>;
-
-  using AccessType = typename UnderlyingIterator::AccessType;
-  static int const kElementsPerAccess = UnderlyingIterator::kElementsPerAccess;
-  using Fragment = typename UnderlyingIterator::Fragment;
-
-
- private:
-  //
-  // Data members
-  //
-
-  /// Underlying pitch-linear tile iterator
-  UnderlyingIterator iterator_;
-
- public:
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedScaleBiasVectorIterator(
-      ///< Extent of tensor
-      int problem_size,
-      ///< Pointer to the start of the scale vector
-      ConstPointer scale_pointer,
-      ///< Pointer to the start of the bias vector
-      ConstPointer bias_pointer,
-      ///< ID of each participating thread
-      int thread_id,
-      ///< Initial offset of threadblock
-      TensorCoord const &threadblock_offset)
-      : iterator_(problem_size, scale_pointer, bias_pointer,
-                  thread_id,
-                  layout::PitchLinearCoord(threadblock_offset.column(),
-                                           threadblock_offset.row())) {}
-
-  /// Construct a PredicatedTileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  PredicatedScaleBiasVectorIterator(
-      int problem_size,  ///< Extent of tensor
-      ConstPointer scale_pointer,  ///< Pointer to the start of the scale vector
-      ConstPointer bias_pointer,   ///< Pointer to the start of the bias vector
-      int thread_id                ///< ID of each participating thread
-      )
-      : PredicatedScaleBiasVectorIterator(problem_size,
-                                          scale_pointer, bias_pointer,
-                                          thread_id, make_Coord(0, 0)) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole
-  /// threadblock tiles
-  CUTLASS_HOST_DEVICE
-  void add_tile_offset(TensorCoord const &tile_offset) {
-    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) {
-    iterator_.load(frag);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace transform 
-}  // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h
deleted file mode 100755
index a99dae952..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h
+++ /dev/null
@@ -1,2118 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates calculating the address and predicates to the load of tiles
-    from pitch-linear rank=2 tensors.
-
-    This iterator uses masks to guard out-of-bounds accesses. The first tile this
-    iterator visits maybe partial, then the remaining tiles are complete. So, we 
-    only need to compute the predicates twice, once before the first tile and 
-    once for the remaining full tiles which can share the same predicates.
-
-    A precomputed "Params" object minimizes the amount of state that must be
-    stored in registers, and integer addition is used to advance the pointer
-    through memory.
-*/
-
-#pragma once
-
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/permute.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/transform/threadblock/predicated_tile_access_iterator_params.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace transform {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// PredicatedTileAccessIteratorPredicates
-///
-template <typename Shape_, typename Element_, typename Layout_, int AdvanceRank,
-          typename ThreadMap_, typename AccessType_>
-class PredicatedTileAccessIteratorPredicates {
- public:
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = Layout_;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorCoord = typename Layout::TensorCoord;
-
-  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
-
-  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
-    "Vectors implied by the thread map must be divisible by the access type.");
-
-  static int const kPredicatesPerByte = 4;
-  static int const kPredicatesPerWord = 4 * kPredicatesPerByte;
-
-  static int const kPredicateCount = ThreadMap::Iterations::kCount * kAccessesPerVector;
-
-  /// Number of 32b words containing predicates
-  static int const kPredicateByteCount =
-    (kPredicateCount + kPredicatesPerByte - 1) / kPredicatesPerByte;
-  static int const kPredicateWordCount = (kPredicateByteCount + 3) / 4;
-
-  static unsigned const kPredicateMask = (1u << kPredicatesPerByte) - 1u;
-
-  static_assert(kPredicateWordCount <= 4, "Too many predicates.");
-
-  /// Predicate vector stores mask to guard accesses
-  using Mask = Array<uint32_t, kPredicateWordCount>;
-
-// private:
-  /// Guard predicates
-  uint32_t predicates_[kPredicateWordCount];
-
-  /// Size of tensor
-  TensorCoord extent_;
-
-  /// Initial offset for each thread
-  TensorCoord thread_offset_;
-
-  /// Offset to the first steady-state tile
-  TensorCoord residue_offset_;
-
-  /// Iteration along vectors implied by the thread map
-  int iteration_vector_;
-
-  /// Iteration in the contiguous dimension
-  int iteration_contiguous_;
-
-  /// Iteration in the strided dimension
-  int iteration_strided_;
-
- public:
-  /// Computes predicates based on internally tracked per-thread offset.
-  CUTLASS_DEVICE
-  void compute_predicates_(
-      /// Extent of the matrix window
-      TensorCoord extent,
-      /// optionally, simplify predicate calculation during 'steady state' phase
-      bool is_steady_state = false) {
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kPredicateWordCount; ++i) {
-      predicates_[i] = 0u;
-    }
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int access_idx = 0; access_idx < ThreadMap::Iterations::kCount * kAccessesPerVector; ++access_idx) {
-
-      int s = access_idx / (ThreadMap::Iterations::kContiguous * kAccessesPerVector);
-      
-      int access_residual = access_idx % (ThreadMap::Iterations::kContiguous * kAccessesPerVector);
-
-      int c = access_residual / kAccessesPerVector;
-      int v = access_residual % kAccessesPerVector;
-
-      TensorCoord iteration_coord(c * ThreadMap::Delta::kContiguous + v * AccessType::kElements,
-                                s * ThreadMap::Delta::kStrided);
-
-      TensorCoord coord = thread_offset_ + iteration_coord;
-
-      bool guard;
-
-      if (is_steady_state) {
-        if (kAdvanceRank == 0) {
-          guard = (coord.strided() < extent.strided());
-        } else {
-          guard = (coord.contiguous() < extent.contiguous());
-        }
-      } else {
-        guard = (coord.strided() < extent.strided() &&
-                 coord.contiguous() < extent.contiguous());
-      }
-
-      int pred_idx = v + kAccessesPerVector * (c + ThreadMap::Iterations::kContiguous * s);
-
-      int word_idx = pred_idx / kPredicatesPerWord;
-      int residual = pred_idx % kPredicatesPerWord;
-      int byte_idx = residual / kPredicatesPerByte;
-      int bit_idx = residual % kPredicatesPerByte;
-      
-      predicates_[word_idx] |= (unsigned(guard) << (byte_idx * 8 + bit_idx));
-
-    }
-
-  }
-
-  CUTLASS_HOST_DEVICE
-  void set_predicates(int thread_id, TensorCoord const &threadblock_offset) {
-
-    TensorCoord residue_extent;
-    if (kAdvanceRank) {
-
-      typename TensorCoord::Index residue_size = (extent_[kAdvanceRank] - threadblock_offset.strided()) % Shape::kStrided;
-      if (!residue_size) {
-        residue_size = Shape::kStrided;
-      }
-
-      residue_offset_ = make_Coord(0, residue_size);
-      residue_extent = make_Coord(
-        extent_.contiguous(), 
-        min(threadblock_offset.strided() + residue_size, extent_.strided())
-      );
-    } else {
-
-      typename TensorCoord::Index residue_size = (extent_[kAdvanceRank] - threadblock_offset.contiguous()) % Shape::kContiguous;
-      if (!residue_size) {
-        residue_size = Shape::kContiguous;
-      }
-
-      residue_offset_ = make_Coord(residue_size, 0);
-      
-      residue_extent = make_Coord(
-        min(extent_.contiguous(), threadblock_offset.contiguous() + residue_size),
-        extent_.strided()
-      );
-    }
-
-    // Per-thread offset in logical coordinates of tensor
-    thread_offset_ = threadblock_offset + ThreadMap::initial_offset(thread_id);
-
-    compute_predicates_(residue_extent, false);
-
-    set_iteration_index(0);
-  }
-
-  /// Default constructor
-  PredicatedTileAccessIteratorPredicates() = default;
-
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIteratorPredicates(
-      /// Extent of tensor
-      TensorCoord extent)
-      : extent_(extent) {
-	}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) {
-
-    iteration_vector_ = index % kAccessesPerVector;
-    int residual_access = index / kAccessesPerVector;
-
-    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
-
-  }
-
-  /// Increment and return an instance to self.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIteratorPredicates &operator++() {
-
-    return *this;
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) {
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kPredicateWordCount; ++i) {
-      predicates_[i] = enable ? 0u : predicates_[i];
-    }
-
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() {
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kPredicateWordCount; ++i) {
-      predicates_[i] = 0xffffffff;
-    }
-  }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) { 
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kPredicateWordCount; ++i) {
-      predicates_[i] = mask[i];
-    }
-
-  }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) {
-     CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kPredicateWordCount; ++i) {
-      mask[i] = predicates_[i];
-    }
-  }
-
-  /// Returns whether access is valid or not
-  CUTLASS_HOST_DEVICE
-  bool valid() const {
-
-    
-    int pred_idx = 
-      iteration_vector_ + kAccessesPerVector * (iteration_contiguous_ + iteration_strided_ * ThreadMap::Iterations::kContiguous);
-
-    int word_idx = pred_idx / kPredicatesPerWord;
-    int residual = pred_idx % kPredicatesPerWord;
-    int byte_idx = residual / kPredicatesPerByte;
-    int bit_idx = residual % kPredicatesPerByte;
-    
-    bool pred = (predicates_[word_idx] & (1u << (byte_idx * 8 + bit_idx))) != 0;
-    return pred;
-    
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// PredicatedTileAccessIterator
-///
-template <typename Shape, typename Element, typename Layout, int AdvanceRank,
-          typename ThreadMap, typename AccessType, bool Gather = false,
-          typename PermuteLayout = layout::NoPermute>
-class PredicatedTileAccessIterator;
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileAccessIterator for pitch-linear data.
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, typename AccessType_, bool Gather,
-          typename PermuteLayout>
-class PredicatedTileAccessIterator<Shape_, Element_, layout::PitchLinear,
-                                   AdvanceRank, ThreadMap_, AccessType_, Gather,
-                                   PermuteLayout> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::PitchLinear;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Pointer = Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  using UnderlyingPredicates = PredicatedTileAccessIteratorPredicates<
-      Shape, Element, Layout, AdvanceRank, ThreadMap, AccessType>;
-
-  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
-  
-  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), 
-    "Vectors implied by the thread map must be divisible by the access type.");
-
-  static bool constexpr Permute = !platform::is_same<PermuteLayout, layout::NoPermute>::value
-                               && !platform::is_same<PermuteLayout, layout::InversePermute<layout::NoPermute>>::value;
-
-  using Mask = typename UnderlyingPredicates::Mask;
-
-  /// Uses a non-template class
-  struct Params : PredicatedTileAccessIteratorParams {
-    
-    using Base = PredicatedTileAccessIteratorParams;
-
-    /// Default constructor
-    Params() = default;
-
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout) : 
-      Base(layout.stride(0),
-            MakePredicatedTileAccessIteratorDesc<Shape, Element, Layout, kAdvanceRank, ThreadMap>()()
-        ) { }
-
-    CUTLASS_HOST_DEVICE
-    Params(Base const &base) : 
-      Base(base) { }
-  };
-
- private:
-  /// Internal pointer type permits fast address arithmetic
-  using BytePointer = char *;
-
- private:
-  //
-  // Data members
-  //
-
-  UnderlyingPredicates the_predicates;
-
-  /// Parameters object with precomputed internal state
-  Params params_;
-
-  /// Internal pointer to first access of tile
-  BytePointer pointer_;
-
-  /// Used for out-of-order visitation
-  bool is_residue_tile_;
-
-  /// Below is used when Gather is turned on.  We need to record strided_offset
-  /// and contiguous_offset separated to compute the offset by using
-  ///
-  /// offset = contiguous_offset + indices[strided_offset]
-
-  /// Gather indices
-  int const *indices_;
-
-  /// Function to perform layout permutation and offset computation
-  PermuteLayout permute_layout_;
-
-  /// Tracks thread's coordinate offset in the matrix for current tile.
-  /// This is only used in the following cases:
-  /// - when Gather is true, strided coordinate needed to access indices (contiguous offset is tracked via pointer_)
-  /// - when Permute is true, both coordinates are neeeded as input into permutation function (pointer_ is fixed)
-  TensorCoord coord_offset_;
-
- private:
-  /// Computes predicates based on internally tracked per-thread offset.
-  CUTLASS_DEVICE
-  void compute_predicates_(
-      /// Extent of the matrix window
-      TensorCoord extent,
-      /// optionally, simplify predicate calculation during 'steady state' phase
-      bool is_steady_state = false) {
-	  the_predicates.compute_predicates_(extent, is_steady_state);
-  }
-
- public:
-
-  /// Default constructor
-  PredicatedTileAccessIterator() = default;
-
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator(
-      /// Precomputed parameters object
-      Params const &params,
-      /// Pointer to start of tensor
-      Pointer pointer,
-      /// Extent of tensor
-      TensorCoord extent,
-      /// ID of each participating thread
-      int thread_id,
-      /// Initial offset of threadblock
-      TensorCoord const &threadblock_offset,
-      /// Gather indices
-      int const *indices = nullptr)
-      : params_(params),
-	      pointer_(reinterpret_cast<BytePointer>(
-                 const_cast<NonConstPointer>(pointer))),
-	      the_predicates(extent),
-        is_residue_tile_(true),
-        indices_(indices),
-        permute_layout_(TensorCoord(extent.contiguous(), extent.strided()), params.stride_) {
-
-    the_predicates.set_predicates(thread_id, threadblock_offset);
-          
-    if (Gather) {
-      assert(indices_);
-    }
-
-    // update internal pointers
-    Layout layout(params_.stride_);
-
-    if (!Gather && !Permute) {
-      add_pointer_offset(layout(the_predicates.thread_offset_));
-    } else {
-      coord_offset_ = the_predicates.thread_offset_;
-      if (!Permute) {
-        add_pointer_offset(layout(make_Coord(coord_offset_.contiguous(), 0)));
-      }
-    }
-  }
-
-  /// Construct a PredicatedTileAccessIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator(
-      /// Precomputed parameters object
-      Params const &params,
-      /// Pointer to start of tensor
-      Pointer pointer,
-      /// Extent of tensor
-      TensorCoord extent,
-      ///< ID of each participating thread
-      int thread_id)
-      : PredicatedTileAccessIterator(params, pointer, extent, thread_id,
-                                     make_Coord(0, 0)) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) {
-    the_predicates.set_iteration_index(index);
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    pointer_ += sizeof_bits<Element>::value * pointer_offset / 8;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_DEVICE
-  void add_tile_offset(
-      TensorCoord const &tile_offset) {
-    if (is_residue_tile_) {
-
-      the_predicates.thread_offset_ += the_predicates.residue_offset_;
-
-      the_predicates.compute_predicates_(the_predicates.extent_, true);
-
-      Layout layout(params_.stride_);
-
-      if (!Gather && !Permute) {
-        add_pointer_offset(layout(the_predicates.residue_offset_));
-
-        if (kAdvanceRank) {
-          pointer_ += params_.inc_advance_ * LongIndex(tile_offset.strided() - 1);
-          pointer_ += Shape::kContiguous * tile_offset.contiguous() * sizeof_bits<Element>::value / 8;
-        } else {
-          pointer_ += params_.inc_advance_ * LongIndex(tile_offset.contiguous() - 1);
-          pointer_ += Shape::kStrided * tile_offset.strided() * sizeof_bits<Element>::value / 8;
-        }
-      } else {
-        coord_offset_.strided() = the_predicates.thread_offset_.strided() + Shape::kStrided * (tile_offset.strided() - kAdvanceRank);
-        if (!Permute) {
-          add_pointer_offset(layout(make_Coord(the_predicates.residue_offset_.contiguous(), 0)));
-          add_pointer_offset(Shape::kContiguous * (tile_offset.contiguous() - (1 - kAdvanceRank)));
-        } else {
-          coord_offset_.contiguous() = the_predicates.thread_offset_.contiguous() + Shape::kContiguous * (tile_offset.contiguous() - (1 - kAdvanceRank));
-        }
-      }
-    } else {
-      if (!Gather && !Permute) {
-        if (kAdvanceRank) {
-          pointer_ += params_.inc_advance_ * LongIndex(tile_offset.strided());
-          pointer_ += Shape::kContiguous * tile_offset.contiguous();
-        } else {
-          pointer_ += params_.inc_advance_ * LongIndex(tile_offset.contiguous());
-          pointer_ += Shape::kStrided * tile_offset.strided();
-        }
-      } else {
-        coord_offset_.strided() += Shape::kStrided * tile_offset.strided();
-        if (!Permute) {
-          add_pointer_offset(Shape::kContiguous * tile_offset.contiguous());
-        } else {
-          coord_offset_.contiguous() += Shape::kContiguous * tile_offset.contiguous();
-        }
-      }
-    }
-
-    is_residue_tile_ = false;
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-
-    if (Gather || Permute)
-    {
-      if (!valid()) {
-        return nullptr;
-      }
-
-      Index coord_contig  = (Permute ? coord_offset_.contiguous() : 0) + the_predicates.iteration_contiguous_ * ThreadMap::Delta::kContiguous + the_predicates.iteration_vector_ * AccessType::kElements;
-      Index coord_strided = coord_offset_.strided() + the_predicates.iteration_strided_ * ThreadMap::Delta::kStrided;
-      if (Gather) {
-        coord_strided = indices_[coord_strided];
-      }
-
-      LongIndex offset = Permute ? permute_layout_(TensorCoord(coord_contig, coord_strided)) : (coord_strided * LongIndex(params_.stride_) + coord_contig);
-      return reinterpret_cast<AccessType *>(pointer_ + OffsetBytes<Element>(offset));
-    }
-
-    return reinterpret_cast<AccessType *>(
-        pointer_ + 
-        the_predicates.iteration_contiguous_ * (ThreadMap::Delta::kContiguous * sizeof_bits<Element>::value) / 8) + the_predicates.iteration_vector_;
-  }
-
-  /// Increment and return an instance to self.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator &operator++() {
-
-    the_predicates.operator++();
-
-    ++the_predicates.iteration_vector_;
-    if (the_predicates.iteration_vector_ < kAccessesPerVector) {
-      return *this;
-    }
-
-    the_predicates.iteration_vector_ = 0;
-    ++the_predicates.iteration_contiguous_;
-
-    if (the_predicates.iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
-      return *this;
-    }
-
-    // Enter here only if (iteration_contiguous_ == ThreadMap::Iteration::kContiguous)
-    the_predicates.iteration_contiguous_ = 0;
-    ++the_predicates.iteration_strided_;
-
-    if (the_predicates.iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      if (!Gather && !Permute) {
-        pointer_ += params_.inc_strided_;
-      }
-
-      return *this;
-    }
-
-    // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided)
-    // which means we enter the next tile.
-    the_predicates.iteration_strided_ = 0;
-
-    if (!Gather && !Permute) {
-      // advance to next tile
-      pointer_ += params_.inc_next_;
-  
-      // now return to start tile - if the iterator is subsequently advanced, this
-      // subtraction as well as the subsequent integer addition are both elided by
-      // the compiler.
-      pointer_ -= params_.inc_advance_;
-    }
-
-    return *this;
-  }
-
-  /// Increment and return an instance to self.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator operator++(int) {
-    PredicatedTileAccessIterator self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) {
-    the_predicates.clear_mask(enable);
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() {
-    the_predicates.enable_mask();
-  }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) { 
-    the_predicates.set_mask(mask);
-  }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) {
-    the_predicates.get_mask(mask);
-  }
-
-  /// Returns whether access is valid or not
-  CUTLASS_HOST_DEVICE
-  bool valid() const {
-    return the_predicates.valid();
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileAccessIterator for column-major data.
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, typename AccessType_, bool Gather,
-          typename PermuteLayout>
-class PredicatedTileAccessIterator<Shape_, Element_, layout::ColumnMajor,
-                                   AdvanceRank, ThreadMap_, AccessType_, Gather,
-                                   PermuteLayout> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::ColumnMajor;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Pointer = Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  using UnderlyingIterator = PredicatedTileAccessIterator<
-      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
-      layout::PitchLinear, (kAdvanceRank == 0 ? 0 : 1), ThreadMap, AccessType,
-      Gather, PermuteLayout>;
-
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename UnderlyingIterator::Mask;
-
-  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
-
-  /// Parameters object is precomputed state and is host-constructible
-  class Params {
-   private:
-    friend PredicatedTileAccessIterator;
-
-    /// Parameters object
-    typename UnderlyingIterator::Params params_;
-
-   public:
-
-    /// Default constructor
-    Params() = default;
-
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout)
-        : params_(layout::PitchLinear(layout.stride(0))){};
-
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(typename UnderlyingIterator::Params::Base const &base) 
-        : params_(base) {}
-  };
-
- private:
-  //
-  // Data members
-  //
-
-  /// Underlying pitch-linear tile iterator
-  UnderlyingIterator iterator_;
-
- public:
-
-  /// Default constructor
-  PredicatedTileAccessIterator() = default;
-
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator(
-      ///< Precomputed parameters object
-      Params const &params,
-      ///< Pointer to start of tensor
-      Pointer pointer,
-      ///< Extent of tensor
-      TensorCoord extent,
-      ///< ID of each participating thread
-      int thread_id,
-      ///< Initial offset of threadblock
-      TensorCoord const &threadblock_offset,
-      int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
-      )
-      : iterator_(params.params_, pointer,
-                  layout::PitchLinearCoord(extent.row(), extent.column()),
-                  thread_id,
-                  layout::PitchLinearCoord(threadblock_offset.row(),
-                                           threadblock_offset.column()),
-                  indices) {}
-
-  /// Construct a PredicatedTileAccessIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator(
-      Params const &params,  ///< Precomputed parameters object
-      Pointer pointer,       ///< Pointer to start of tensor
-      TensorCoord extent,    ///< Extent of tensor
-      int thread_id          ///< ID of each participating thread
-      )
-      : PredicatedTileAccessIterator(params, pointer, extent, thread_id,
-                                     make_Coord(0, 0)) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole
-  /// tiles
-  CUTLASS_HOST_DEVICE
-  void add_tile_offset(TensorCoord const &tile_offset) {
-    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return reinterpret_cast<AccessType *>(iterator_.get());
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator operator++(int) {
-    PredicatedTileAccessIterator self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() { iterator_.enable_mask(); }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
-
-  /// Returns whether access is valid or not
-  CUTLASS_HOST_DEVICE
-  bool valid() {
-    return iterator_.valid();
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileAccessIterator for row-major data.
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, typename AccessType_, bool Gather,
-          typename PermuteLayout>
-class PredicatedTileAccessIterator<Shape_, Element_, layout::RowMajor,
-                                   AdvanceRank, ThreadMap_, AccessType_, Gather,
-                                   PermuteLayout> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::RowMajor;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Pointer = Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  using UnderlyingIterator = PredicatedTileAccessIterator<
-      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
-      layout::PitchLinear, (kAdvanceRank == 0 ? 1 : 0), ThreadMap, AccessType, 
-      Gather, PermuteLayout>;
-
-  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
-
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename UnderlyingIterator::Mask;
-
-  /// Parameters object is precomputed state and is host-constructible
-  class Params {
-   private:
-    friend PredicatedTileAccessIterator;
-
-    /// Parameters object
-    typename UnderlyingIterator::Params params_;
-
-   public:
-
-    /// Default constructor
-    Params() = default;
-
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout)
-        : params_(layout::PitchLinear(layout.stride(0))){};
-
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(typename UnderlyingIterator::Params::Base const &base) 
-        : params_(base) {}
-  };
-
- private:
-  //
-  // Data members
-  //
-
-  /// Underlying pitch-linear tile iterator
-  UnderlyingIterator iterator_;
-
- public:
-
-  /// Default constructor
-  PredicatedTileAccessIterator() = default;
-
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator(
-      ///< Precomputed parameters object
-      Params const &params,
-      ///< Pointer to start of tensor
-      Pointer pointer,
-      ///< Extent of tensor
-      TensorCoord extent,
-      ///< ID of each participating thread
-      int thread_id,
-      ///< Initial offset of threadblock
-      TensorCoord const &threadblock_offset,
-      /// Gather indices
-      int const *indices = nullptr)
-      : iterator_(params.params_, pointer,
-                  layout::PitchLinearCoord(extent.column(), extent.row()),
-                  thread_id,
-                  layout::PitchLinearCoord(threadblock_offset.column(),
-                                           threadblock_offset.row()),
-                  indices) {}
-
-  /// Construct a PredicatedTileAccessIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator(
-      Params const &params,  ///< Precomputed parameters object
-      Pointer pointer,       ///< Pointer to start of tensor
-      TensorCoord extent,    ///< Extent of tensor
-      int thread_id          ///< ID of each participating thread
-      )
-      : PredicatedTileAccessIterator(params, pointer, extent, thread_id,
-                                     make_Coord(0, 0)) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole
-  /// tiles
-  CUTLASS_HOST_DEVICE
-  void add_tile_offset(TensorCoord const &tile_offset) {
-    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return reinterpret_cast<AccessType *>(iterator_.get());
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator operator++(int) {
-    PredicatedTileAccessIterator self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() { iterator_.enable_mask(); }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
-
-  /// Returns whether access is valid or not
-  CUTLASS_HOST_DEVICE
-  bool valid() {
-    return iterator_.valid();
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileAccessIterator for affine rank 2 data.
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, typename AccessType_>
-class PredicatedTileAccessIterator<Shape_, Element_, layout::AffineRankN<2>,
-                                   AdvanceRank, ThreadMap_, AccessType_, false,
-                                   layout::NoPermute> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::AffineRankN<2>;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Pointer = Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  using UnderlyingPredicates = PredicatedTileAccessIteratorPredicates<
-      Shape, Element, layout::PitchLinear, AdvanceRank, ThreadMap, AccessType>;
-
-  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
-
-  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
-    "Vectors implied by the thread map must be divisible by the access type.");
-
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename UnderlyingPredicates::Mask;
-
-  /// Parameters object is precomputed state and is host-constructible
-  class Params {
-   public:
-    friend PredicatedTileAccessIterator;
-
-   private:
-    /// stride of pitch-linear layout (units of Element)
-    Coord<Layout::kStrideRank, Layout::LongIndex> stride_;
-    /// amount (in byte) to increment pointer to move to next access along
-    /// contiguous dimension
-    LongIndex inc_contiguous_;
-    /// amount (in byte) to increment pointer from first access of current
-    /// contiguous dimension to first access of next one.
-    LongIndex inc_strided_;
-    /// amount (in byte) to increment pointer from last access of current
-    /// contiguous dimension to first access of next one.
-    LongIndex inc_next_strided_;
-    /// amount (in byte) to increment pointer from last access to first access
-    /// of next tile
-    LongIndex inc_next_;
-    /// amount (in byte) to increment pointer from first access of current tile
-    /// to first access of next tile
-    LongIndex inc_advance_;
-
-   public:
-
-    // Default ctor
-    CUTLASS_HOST_DEVICE
-    Params(): stride_(0), inc_contiguous_(0), inc_strided_(0), inc_next_(0), inc_advance_(0) { }
-
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout) : stride_({layout.stride(0), layout.stride(1)}) {
-      inc_contiguous_ = (LongIndex(stride_[0]) * ThreadMap::Delta::kContiguous) *
-                     sizeof_bits<Element>::value / 8;
-
-      inc_strided_ = (LongIndex(stride_[1]) * ThreadMap::Delta::kStrided) *
-                     sizeof_bits<Element>::value / 8;
-
-      inc_next_strided_ = inc_strided_ - LongIndex(ThreadMap::Iterations::kContiguous - 1) * inc_contiguous_;
-
-      if (kAdvanceRank) {
-        // advance along strided dimension
-        inc_advance_ =
-            Shape::kStrided * LongIndex(stride_[1]) * sizeof_bits<Element>::value / 8;
-      } else {
-        // advance along contiguous dimension
-        inc_advance_ = Shape::kContiguous * stride_[0] * sizeof_bits<Element>::value / 8;
-      }
-
-      inc_next_ = inc_advance_ - LongIndex(ThreadMap::Iterations::kContiguous - 1) * inc_contiguous_ - LongIndex(ThreadMap::Iterations::kStrided - 1) * inc_strided_;
-    };
-  };
-
- private:
-  /// Internal pointer type permits fast address arithmetic
-  using BytePointer = char *;
-
-  //
-  // Data members
-  //
-
-  /// Parameters object with precomputed internal state
-  Params params_;
-
-  /// Internal pointer to first access of tile
-  BytePointer pointer_;
-
-  UnderlyingPredicates the_predicates;
-
-  /// Used for out-of-order visitation
-  bool is_residue_tile_;
-
- private:
-  /// Computes predicates based on internally tracked per-thread offset.
-  CUTLASS_DEVICE
-  void compute_predicates_(
-      /// Extent of the matrix window
-      TensorCoord extent,
-      /// optionally, simplify predicate calculation during 'steady state' phase
-      bool is_steady_state = false) {
-          the_predicates.compute_predicates_(extent, is_steady_state);
-  }
-
- public:
-
-  /// Default constructor
-  PredicatedTileAccessIterator() = default;
-
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator(
-      ///< Precomputed parameters object
-      Params const &params,
-      ///< Pointer to start of tensor
-      Pointer pointer,
-      ///< Extent of tensor
-      TensorCoord extent,
-      ///< ID of each participating thread
-      int thread_id,
-      ///< Initial offset of threadblock
-      TensorCoord const &threadblock_offset,
-      int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
-      )
-      : params_(params),
-        pointer_(reinterpret_cast<BytePointer>(
-            const_cast<NonConstPointer>(pointer))),
-        the_predicates(extent),
-	is_residue_tile_(true) {
-
-    the_predicates.set_predicates(thread_id, threadblock_offset);
-
-    // update internal pointers
-    Layout layout(params_.stride_);
-    add_pointer_offset(layout(the_predicates.thread_offset_));
-  }
-
-  /// Construct a PredicatedTileAccessIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator(
-      Params const &params,  ///< Precomputed parameters object
-      Pointer pointer,       ///< Pointer to start of tensor
-      TensorCoord extent,    ///< Extent of tensor
-      int thread_id          ///< ID of each participating thread
-      )
-      : PredicatedTileAccessIterator(params, pointer, extent, thread_id,
-                                     make_Coord(0, 0)) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) { the_predicates.set_iteration_index(index); }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    pointer_ += sizeof_bits<Element>::value * pointer_offset / 8;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole
-  /// tiles
-  CUTLASS_HOST_DEVICE
-  void add_tile_offset(TensorCoord const &tile_offset) {
-    if (is_residue_tile_) {
-
-      the_predicates.thread_offset_ += the_predicates.residue_offset_;
-
-      Layout layout(params_.stride_);
-      add_pointer_offset(layout(the_predicates.residue_offset_));
-
-      the_predicates.compute_predicates_(the_predicates.extent_, true);
-
-      if (kAdvanceRank) {
-        pointer_ += params_.inc_advance_ * LongIndex(tile_offset[1] - 1);
-        pointer_ += Shape::kContiguous * tile_offset[0];
-      } else {
-        pointer_ += params_.inc_advance_ * LongIndex(tile_offset[0] - 1);
-        pointer_ += Shape::kStrided * tile_offset[1];
-      }
-    } else {
-      if (kAdvanceRank) {
-        pointer_ += params_.inc_advance_ * LongIndex(tile_offset[1]);
-        pointer_ += Shape::kContiguous * tile_offset[0];
-      } else {
-        pointer_ += params_.inc_advance_ * LongIndex(tile_offset[0]);
-        pointer_ += Shape::kStrided * tile_offset[1];
-      }
-    }
-    is_residue_tile_ = false;
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return reinterpret_cast<AccessType *>(pointer_) + the_predicates.iteration_vector_;
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator &operator++() {
-    the_predicates.operator++();
-    ++the_predicates.iteration_vector_;
-    if (the_predicates.iteration_vector_ < kAccessesPerVector) {
-      return *this;
-    }
-
-    the_predicates.iteration_vector_ = 0;
-    ++the_predicates.iteration_contiguous_;
-
-    if (the_predicates.iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
-      pointer_ += params_.inc_contiguous_;
-      return *this;
-    }
-
-    // Enter here only if (iteration_contiguous_ ==
-    // ThreadMap::Iteration::kContiguous)
-    the_predicates.iteration_contiguous_ = 0;
-    ++the_predicates.iteration_strided_;
-
-    if (the_predicates.iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      pointer_ += params_.inc_next_strided_;
-      return *this;
-    }
-
-    // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided)
-    // which means we enter the next tile.
-    the_predicates.iteration_strided_ = 0;
-
-    // advance to next tile
-    pointer_ += params_.inc_next_;
-
-    // now return to start tile - if the iterator is subsequently advanced, this
-    // subtraction as well as the subsequent integer addition are both elided by
-    // the compiler.
-    pointer_ -= params_.inc_advance_;
-
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator operator++(int) {
-    PredicatedTileAccessIterator self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) { the_predicates.clear_mask(enable); }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() { the_predicates.enable_mask(); }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) { the_predicates.set_mask(mask); }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) { the_predicates.get_mask(mask); }
-
-  /// Returns whether access is valid or not
-  CUTLASS_HOST_DEVICE
-  bool valid() {
-    return the_predicates.valid();
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileAccessIterator for affine rank 2 column-major data.
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, typename AccessType_>
-class PredicatedTileAccessIterator<Shape_, Element_, layout::AffineRank2ColumnMajor,
-                                   AdvanceRank, ThreadMap_, AccessType_, false,
-                                   layout::NoPermute> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::AffineRank2ColumnMajor;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Pointer = Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  // Map to the underlying AffineRankN<2> layout
-  using UnderlyingIterator = PredicatedTileAccessIterator<
-      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
-      layout::AffineRankN<2>, (kAdvanceRank == 0 ? 0 : 1), ThreadMap, AccessType>;
-
-  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
-
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename UnderlyingIterator::Mask;
-
-  /// Parameters object is precomputed state and is host-constructible
-  class Params {
-   private:
-    friend PredicatedTileAccessIterator;
-
-    /// Parameters object
-    typename UnderlyingIterator::Params params_;
-
-   public:
-
-    /// Default constructor
-    Params() = default;
-
-    /// Construct the Params object given an AffineRankN<2> tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout)
-        : params_(layout::AffineRankN<2>(layout.stride(0), layout.stride(1))){};
-  };
-
- private:
-  //
-  // Data members
-  //
-
-  /// Underlying AffineRankN<2> tile iterator
-  UnderlyingIterator iterator_;
-
- public:
-
-  /// Default constructor
-  PredicatedTileAccessIterator() = default;
-
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator(
-      ///< Precomputed parameters object
-      Params const &params,
-      ///< Pointer to start of tensor
-      Pointer pointer,
-      ///< Extent of tensor
-      TensorCoord extent,
-      ///< ID of each participating thread
-      int thread_id,
-      ///< Initial offset of threadblock
-      TensorCoord const &threadblock_offset,
-      int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
-      )
-      : iterator_(params.params_, pointer,
-                  layout::PitchLinearCoord(extent.row(), extent.column()),
-                  thread_id,
-                  layout::PitchLinearCoord(threadblock_offset.row(),
-                                           threadblock_offset.column())) {}
-
-  /// Construct a PredicatedTileAccessIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator(
-      Params const &params,  ///< Precomputed parameters object
-      Pointer pointer,       ///< Pointer to start of tensor
-      TensorCoord extent,    ///< Extent of tensor
-      int thread_id          ///< ID of each participating thread
-      )
-      : PredicatedTileAccessIterator(params, pointer, extent, thread_id,
-                                     make_Coord(0, 0)) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole
-  /// tiles
-  CUTLASS_HOST_DEVICE
-  void add_tile_offset(TensorCoord const &tile_offset) {
-    iterator_.add_tile_offset(make_Coord(tile_offset.row(), tile_offset.column()));
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return reinterpret_cast<AccessType *>(iterator_.get());
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator operator++(int) {
-    PredicatedTileAccessIterator self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() { iterator_.enable_mask(); }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
-
-  /// Returns whether access is valid or not
-  CUTLASS_HOST_DEVICE
-  bool valid() {
-    return iterator_.valid();
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileAccessIterator for affine rank-2 row-major data.
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, typename AccessType_>
-class PredicatedTileAccessIterator<Shape_, Element_, layout::AffineRank2RowMajor,
-                                   AdvanceRank, ThreadMap_, AccessType_, false,
-                                   layout::NoPermute> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::AffineRank2RowMajor;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Pointer = Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  // Map to the underlying AffineRankN<2> layout
-  using UnderlyingIterator = PredicatedTileAccessIterator<
-      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
-      layout::AffineRankN<2>, (kAdvanceRank == 0 ? 1 : 0), ThreadMap, AccessType>;
-
-  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
-
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename UnderlyingIterator::Mask;
-
-  /// Parameters object is precomputed state and is host-constructible
-  class Params {
-   private:
-    friend PredicatedTileAccessIterator;
-
-    /// Parameters object
-    typename UnderlyingIterator::Params params_;
-
-   public:
-
-    /// Default constructor
-    Params() = default;
-
-    /// Construct the Params object given an AffineRankN<2> tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout)
-        : params_(layout::AffineRankN<2>(layout.stride(1), layout.stride(0))){};
-  };
-
- private:
-  //
-  // Data members
-  //
-
-  /// Underlying AffineRankN<2> tile iterator
-  UnderlyingIterator iterator_;
-
- public:
-
-  /// Default constructor
-  PredicatedTileAccessIterator() = default;
-
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator(
-      ///< Precomputed parameters object
-      Params const &params,
-      ///< Pointer to start of tensor
-      Pointer pointer,
-      ///< Extent of tensor
-      TensorCoord extent,
-      ///< ID of each participating thread
-      int thread_id,
-      ///< Initial offset of threadblock
-      TensorCoord const &threadblock_offset,
-      int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
-      )
-      : iterator_(params.params_, pointer,
-                  layout::PitchLinearCoord(extent.column(), extent.row()),
-                  thread_id,
-                  layout::PitchLinearCoord(threadblock_offset.column(),
-                                           threadblock_offset.row())) {}
-
-  /// Construct a PredicatedTileAccessIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator(
-      Params const &params,  ///< Precomputed parameters object
-      Pointer pointer,       ///< Pointer to start of tensor
-      TensorCoord extent,    ///< Extent of tensor
-      int thread_id          ///< ID of each participating thread
-      )
-      : PredicatedTileAccessIterator(params, pointer, extent, thread_id,
-                                     make_Coord(0, 0)) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole
-  /// tiles
-  CUTLASS_HOST_DEVICE
-  void add_tile_offset(TensorCoord const &tile_offset) {
-    iterator_.add_tile_offset(make_Coord(tile_offset.column(), tile_offset.row()));
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return reinterpret_cast<AccessType *>(iterator_.get());
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator operator++(int) {
-    PredicatedTileAccessIterator self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() { iterator_.enable_mask(); }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
-
-  /// Returns whether access is valid or not
-  CUTLASS_HOST_DEVICE
-  bool valid() {
-    return iterator_.valid();
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileAccessIterator for column-major interleaved data.  
-/// It is mapped to the congruous layout.
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, typename AccessType_, int InterleavedK>
-class PredicatedTileAccessIterator<Shape_, Element_,
-                                   layout::ColumnMajorInterleaved<InterleavedK>,
-                                   AdvanceRank, ThreadMap_, AccessType_, false,
-                                   layout::NoPermute> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  static int const kInterleavedK = InterleavedK;
-  using Layout = layout::ColumnMajorInterleaved<kInterleavedK>;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Pointer = Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  using UnderlyingIterator = PredicatedTileAccessIterator<
-      layout::PitchLinearShape<Shape::kRow * kInterleavedK,
-                               Shape::kColumn / kInterleavedK>,
-      Element, layout::PitchLinear, (kAdvanceRank == 0 ? 0 : 1), ThreadMap,
-      AccessType>;
-
-  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
-
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename UnderlyingIterator::Mask;
-
-  /// Parameters object is precomputed state and is host-constructible
-  class Params {
-   private:
-    friend PredicatedTileAccessIterator;
-
-    /// Parameters object
-    typename UnderlyingIterator::Params params_;
-
-   public:
-
-    /// Default constructor
-    Params() = default;
-
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout)
-        : params_(layout::PitchLinear(layout.stride(0))) {}
-
-    CUTLASS_HOST_DEVICE
-    Params(typename UnderlyingIterator::Params::Base const &base) 
-        : params_(base) {}
-  };
-
- private:
-  //
-  // Data members
-  //
-
-  /// Underlying pitch-linear tile iterator
-  UnderlyingIterator iterator_;
-
- public:
-
-  /// Default constructor
-  PredicatedTileAccessIterator() = default;
-
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator(
-      /// Precomputed parameters object
-      Params const &params,
-      /// Pointer to start of tensor
-      Pointer pointer,
-      /// Extent of tensor
-      TensorCoord extent,
-      /// ID of each participating thread
-      int thread_id,
-      /// Initial offset of threadblock
-      TensorCoord const &threadblock_offset,
-      int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
-      )
-      : iterator_(params.params_, pointer,
-                  layout::PitchLinearCoord(extent.row() * kInterleavedK,
-                                           extent.column() / kInterleavedK),
-                  thread_id,
-                  layout::PitchLinearCoord(
-                      threadblock_offset.row() * kInterleavedK,
-                      threadblock_offset.column() / kInterleavedK)) {}
-
-  /// Construct a PredicatedTileAccessIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator(
-      Params const &params,  ///< Precomputed parameters object
-      Pointer pointer,       ///< Pointer to start of tensor
-      TensorCoord extent,    ///< Extent of tensor
-      int thread_id          ///< ID of each participating thread
-      )
-      : PredicatedTileAccessIterator(params, pointer, extent, thread_id,
-                                     make_Coord(0, 0)) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole
-  /// tiles
-  CUTLASS_HOST_DEVICE
-  void add_tile_offset(TensorCoord const &tile_offset) {
-    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return reinterpret_cast<AccessType *>(iterator_.get());
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator operator++(int) {
-    PredicatedTileAccessIterator self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() { iterator_.enable_mask(); }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
-
-  /// Returns whether access is valid or not
-  CUTLASS_HOST_DEVICE
-  bool valid() { return iterator_.valid(); }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileAccessIterator for row-major interleaved data.  
-//  It is mapped to the congruous layout.
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, typename AccessType_, int InterleavedK>
-class PredicatedTileAccessIterator<Shape_, Element_,
-                                   layout::RowMajorInterleaved<InterleavedK>,
-                                   AdvanceRank, ThreadMap_, AccessType_, false,
-                                   layout::NoPermute> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  static int const kInterleavedK = InterleavedK;
-  using Layout = layout::RowMajorInterleaved<kInterleavedK>;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Pointer = Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  using UnderlyingIterator = PredicatedTileAccessIterator<
-      layout::PitchLinearShape<Shape::kColumn * kInterleavedK,
-                               Shape::kRow / kInterleavedK>,
-      Element, layout::PitchLinear, (kAdvanceRank == 0 ? 1 : 0), ThreadMap,
-      AccessType>;
-
-
-  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
-
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename UnderlyingIterator::Mask;
-
-  /// Parameters object is precomputed state and is host-constructible
-  class Params {
-   private:
-    friend PredicatedTileAccessIterator;
-
-    /// Parameters object
-    typename UnderlyingIterator::Params params_;
-
-   public:
-
-    /// Default constructor
-    Params() = default;
-
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout)
-        : params_(layout::PitchLinear(layout.stride(0))) {}
-
-    CUTLASS_HOST_DEVICE
-    Params(typename UnderlyingIterator::Params::Base const &base) 
-        : params_(base) {}
-  };
-
- private:
-  //
-  // Data members
-  //
-
-  /// Underlying pitch-linear tile iterator
-  UnderlyingIterator iterator_;
-
- public:
-
-  /// Default constructor
-  PredicatedTileAccessIterator() = default;
-
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator(
-      /// Precomputed parameters object
-      Params const &params,
-      /// Pointer to start of tensor
-      Pointer pointer,
-      /// Extent of tensor
-      TensorCoord extent,
-      /// ID of each participating thread
-      int thread_id,
-      /// Initial offset of threadblock
-      TensorCoord const &threadblock_offset,
-      int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
-      )
-      : iterator_(params.params_, pointer,
-                  layout::PitchLinearCoord(extent.column() * kInterleavedK,
-                                           extent.row() / kInterleavedK),
-                  thread_id,
-                  layout::PitchLinearCoord(
-                      threadblock_offset.column() * kInterleavedK,
-                      threadblock_offset.row() / kInterleavedK)) {}
-
-  /// Construct a PredicatedTileAccessIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator(
-      Params const &params,  ///< Precomputed parameters object
-      Pointer pointer,       ///< Pointer to start of tensor
-      TensorCoord extent,    ///< Extent of tensor
-      int thread_id          ///< ID of each participating thread
-      )
-      : PredicatedTileAccessIterator(params, pointer, extent, thread_id,
-                                     make_Coord(0, 0)) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole
-  /// tiles
-  CUTLASS_HOST_DEVICE
-  void add_tile_offset(TensorCoord const &tile_offset) {
-    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return reinterpret_cast<AccessType *>(iterator_.get());
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator operator++(int) {
-    PredicatedTileAccessIterator self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() { iterator_.enable_mask(); }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
-
-  /// Returns whether access is valid or not
-  CUTLASS_HOST_DEVICE
-  bool valid() { return iterator_.valid(); }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace transform
-}  // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator_2dthreadtile.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator_2dthreadtile.h
deleted file mode 100755
index 4379bb0a6..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator_2dthreadtile.h
+++ /dev/null
@@ -1,834 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates calculating the address and predicates to the load of tiles
-   from pitch-linear rank=2 tensors.
-
-    This iterator uses masks to guard out-of-bounds accesses and visits the last
-   "residue" tile first, with the objective of minimizing predicate mask updates
-   during steady-state operation.
-
-    A precomputed "Params" object minimizes the amount of state that must be
-   stored in registers, and integer addition is used to advance the pointer
-   through memory.
-*/
-
-#pragma once
-
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/transform/threadblock/predicated_tile_access_iterator_params.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace transform {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// PredicatedTileAccessIterator2dThreadTile
-///
-template <typename Shape, typename Element, typename Layout, int AdvanceRank,
-          typename ThreadMap, typename AccessType>
-class PredicatedTileAccessIterator2dThreadTile;
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileAccessIterator2dThreadTile for pitch-linear data.
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, typename AccessType_>
-class PredicatedTileAccessIterator2dThreadTile<Shape_, Element_, layout::PitchLinear,
-                                   AdvanceRank, ThreadMap_, AccessType_> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::PitchLinear;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  using StrideIndex = typename Layout::Stride::Index;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Pointer = Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  static int const kPredicatesPerByte = 4;
-  static int const kPredicatesPerWord = 4 * kPredicatesPerByte;
-
-  /// Number of 32b words containing predicates
-  static int const kPredicateByteCount = (ThreadMap::Iterations::kCount * ThreadMap::ThreadAccessShape::kStrided + kPredicatesPerByte - 1) / kPredicatesPerByte;
-  static int const kPredicateWordCount = (kPredicateByteCount + 3) / 4;
-
-  static unsigned const kPredicateMask = (1u << kPredicatesPerByte) - 1u;
-
-  static_assert(kPredicateWordCount <= 4, "Too many predicates.");
-
-  /// Predicate vector stores mask to guard accesses
-  using Mask = Array<uint32_t, kPredicateWordCount>;
-
-  /// Uses a non-template class
-  struct Params : PredicatedTileAccessIteratorParams {
-
-   public:
-    friend PredicatedTileAccessIterator2dThreadTile;
-
-    using Base = PredicatedTileAccessIteratorParams;
-
-    // Default ctor
-    CUTLASS_HOST_DEVICE
-    Params() { }
-
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout) : 
-      Base(layout.stride(0),
-            MakePredicatedTileAccessIteratorDesc<Shape, Element, Layout, kAdvanceRank, ThreadMap>()()
-        ) { }
-
-    CUTLASS_HOST_DEVICE
-    Params(Base const &base) : 
-      Base(base) { }
-  };
-
-
- private:
-  /// Internal pointer type permits fast address arithmetic
-  using BytePointer = char *;
-
- private:
-  //
-  // Data members
-  //
-
-  /// Parameters object with precomputed internal state
-  Params const &params_;
-
-  /// Internal pointer to first access of tile
-  BytePointer pointer_;
-
-  /// Guard predicates
-  uint32_t predicates_[kPredicateWordCount];
-
-  /// Size of tensor
-  TensorCoord extent_;
-
-  /// Initial offset for each thread
-  TensorCoord thread_offset_;
-
-  /// Index of residue tile
-  int residue_tile_idx_;
-
-  /// Used for out-of-order visitation
-  bool is_residue_tile_;
-
-  /// Iteration in the contiguous dimension
-  int iteration_contiguous_;
-
-  /// Iteration in the strided dimension
-  int iteration_strided_;
-
-  /// Tracks iterations within the thread loop
-  int iteration_thread_;
-
- private:
-  /// Computes predicates based on internally tracked per-thread offset.
-  CUTLASS_HOST_DEVICE
-  void compute_predicates_(
-      /// optionally, simplify predicate calculation during 'steady state' phase
-      bool is_steady_state = false) {
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kPredicateWordCount; ++i) {
-      predicates_[i] = 0u;
-    }
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-        CUTLASS_PRAGMA_UNROLL
-        for (int ts = 0; ts < ThreadMap::ThreadAccessShape::kStrided; ts++) {
-
-          TensorCoord iteration_coord(c * ThreadMap::Delta::kContiguous,
-                                      ts + s * ThreadMap::Delta::kStrided);
-
-          TensorCoord coord = thread_offset_ + iteration_coord;
-
-          bool guard;
-
-          if (is_steady_state) {
-            if (kAdvanceRank == 0) {
-              guard = (coord.strided() < extent_.strided());
-            } else {
-              guard = (coord.contiguous() < extent_.contiguous());
-            }
-          } else {
-            guard = (coord.strided() < extent_.strided() &&
-                     coord.contiguous() < extent_.contiguous());
-          }
-
-          int pred_idx = ts + c *  ThreadMap::ThreadAccessShape::kStrided + s * ThreadMap::Iterations::kContiguous *  ThreadMap::ThreadAccessShape::kStrided;
-          int word_idx = pred_idx / kPredicatesPerWord;
-          int residual = pred_idx % kPredicatesPerWord;
-          int byte_idx = residual / kPredicatesPerByte;
-          int bit_idx = residual % kPredicatesPerByte;
-          
-          predicates_[word_idx] |= (unsigned(guard) << (byte_idx * 8 + bit_idx));
-
-        }
-      }
-    }
-
-  }
-
- public:
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator2dThreadTile(
-      /// Precomputed parameters object
-      Params const &params,
-      /// Pointer to start of tensor
-      Pointer pointer,
-      /// Extent of tensor
-      TensorCoord extent,
-      /// ID of each participating thread
-      int thread_id,
-      /// Initial offset of threadblock
-      TensorCoord const &threadblock_offset)
-      : params_(params),
-        pointer_(reinterpret_cast<BytePointer>(
-            const_cast<NonConstPointer>(pointer))),
-        extent_(extent),
-        is_residue_tile_(true) {
-          
-
-    TensorCoord residue_offset;
-    if (kAdvanceRank) {
-      residue_tile_idx_ =
-          (extent_[kAdvanceRank] - threadblock_offset[kAdvanceRank] - 1) /
-          Shape::kStrided;
-      residue_offset = make_Coord(0, residue_tile_idx_ * Shape::kStrided);
-    } else {
-      residue_tile_idx_ =
-          (extent_[kAdvanceRank] - threadblock_offset[kAdvanceRank] - 1) /
-          Shape::kContiguous;
-      residue_offset = make_Coord(residue_tile_idx_ * Shape::kContiguous, 0);
-    }
-
-    // Per-thread offset in logical coordinates of tensor
-    thread_offset_ = threadblock_offset + residue_offset +
-                     ThreadMap::initial_offset(thread_id);
-
-    // update internal pointers
-    Layout layout(params_.stride_);
-    add_pointer_offset(layout(thread_offset_));
-
-    compute_predicates_(false);
-
-    set_iteration_index(0);
-  }
-
-  /// Construct a PredicatedTileAccessIterator2dThreadTile with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator2dThreadTile(
-      /// Precomputed parameters object
-      Params const &params,
-      /// Pointer to start of tensor
-      Pointer pointer,
-      /// Extent of tensor
-      TensorCoord extent,
-      ///< ID of each participating thread
-      int thread_id)
-      : PredicatedTileAccessIterator2dThreadTile(params, pointer, extent, thread_id,
-                                     make_Coord(0, 0)) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) {
-
-    int residual = index % (ThreadMap::Iterations::kContiguous * ThreadMap::ThreadAccessShape::kStrided);
-    iteration_strided_ = index / (ThreadMap::Iterations::kContiguous * ThreadMap::ThreadAccessShape::kStrided);
-    
-    iteration_contiguous_ = residual / ThreadMap::ThreadAccessShape::kStrided;
-    iteration_thread_ = residual % ThreadMap::ThreadAccessShape::kStrided;
-
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    pointer_ += int(sizeof(Element)) * pointer_offset;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_DEVICE
-  void add_tile_offset(
-      TensorCoord const &tile_offset) {
-    if (is_residue_tile_) {
-      TensorCoord residue_offset;
-      if (kAdvanceRank) {
-        residue_offset = TensorCoord(0, residue_tile_idx_ * Shape::kStrided);
-      } else {
-        residue_offset = TensorCoord(residue_tile_idx_ * Shape::kContiguous, 0);
-      }
-
-      thread_offset_ -= residue_offset;
-
-      Layout layout(params_.stride_);
-      add_pointer_offset(-layout(residue_offset));
-
-      compute_predicates_(true);
-
-      if (kAdvanceRank) {
-        pointer_ += params_.inc_advance_ * (tile_offset.strided() - 1);
-        pointer_ += Shape::kContiguous * tile_offset.contiguous();
-      } else {
-        pointer_ += params_.inc_advance_ * (tile_offset.contiguous() - 1);
-        pointer_ += Shape::kStrided * tile_offset.strided();
-      }
-    } else {
-      if (kAdvanceRank) {
-        pointer_ += params_.inc_advance_ * tile_offset.strided();
-        pointer_ += Shape::kContiguous * tile_offset.contiguous();
-      } else {
-        pointer_ += params_.inc_advance_ * tile_offset.contiguous();
-        pointer_ += Shape::kStrided * tile_offset.strided();
-      }
-    }
-    is_residue_tile_ = false;
-  }
-
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-
-    AccessType *ret_val = reinterpret_cast<AccessType *>(
-                pointer_ + (iteration_thread_ * params_.stride_  + iteration_contiguous_ * ThreadMap::Delta::kContiguous) * int(sizeof(Element)));
-
-    return ret_val;
-  }
-
-  /// Increment and return an instance to self.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator2dThreadTile &operator++() {
-
-    iteration_thread_++;
-
-    if (iteration_thread_ < ThreadMap::ThreadAccessShape::kStrided)
-      return *this;
-
-    iteration_thread_ = 0;
-
-    ++iteration_contiguous_;
-
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
-      return *this;
-
-    // Enter here only if (iteration_contiguous_ ==
-    // ThreadMap::Iteration::kContiguous)
-    iteration_contiguous_ = 0;
-    ++iteration_strided_;
-
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      pointer_ += params_.inc_strided_;
-      return *this;
-    }
-
-    // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided)
-    // which means we enter the next tile.
-    iteration_strided_ = 0;
-
-    // advance to next tile
-    pointer_ += params_.inc_next_;
-
-    // now return to start tile - if the iterator is subsequently advanced, this
-    // subtraction as well as the subsequent integer addition are both elided by
-    // the compiler.
-    pointer_ -= params_.inc_advance_;
-
-    return *this;
-  }
-
-  /// Increment and return an instance to self.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator2dThreadTile operator++(int) {
-    PredicatedTileAccessIterator2dThreadTile self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) {
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kPredicateWordCount; ++i) {
-      predicates_[i] = enable ? 0u : predicates_[i];
-    }
-
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() {
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kPredicateWordCount; ++i) {
-      predicates_[i] = 0xffffffff;
-    }
-  }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) { 
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kPredicateWordCount; ++i) {
-      predicates_[i] = mask[i];
-    }
-
-  }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) {
-     CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kPredicateWordCount; ++i) {
-      mask[i] = predicates_[i];
-    }
-  }
-
-  /// Returns whether access is valid or not
-  CUTLASS_HOST_DEVICE
-  bool valid() {
-
-    int pred_idx = 
-      iteration_thread_ + 
-      iteration_contiguous_ * ThreadMap::ThreadAccessShape::kStrided + 
-      iteration_strided_ * ThreadMap::Iterations::kContiguous * ThreadMap::ThreadAccessShape::kStrided;
-
-    int word_idx = pred_idx / kPredicatesPerWord;
-    int residual = pred_idx % kPredicatesPerWord;
-    int byte_idx = residual / kPredicatesPerByte;
-    int bit_idx = residual % kPredicatesPerByte;
-    
-    bool pred = (predicates_[word_idx] & (1u << (byte_idx * 8 + bit_idx))) != 0;
-    
-    return pred;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileAccessIterator2dThreadTile for pitch-linear data.
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, typename AccessType_>
-class PredicatedTileAccessIterator2dThreadTile<Shape_, Element_, layout::ColumnMajor,
-                                   AdvanceRank, ThreadMap_, AccessType_> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::ColumnMajor;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Pointer = Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  using UnderlyingIterator = PredicatedTileAccessIterator2dThreadTile<
-      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
-      layout::PitchLinear, (kAdvanceRank == 0 ? 0 : 1), ThreadMap, AccessType>;
-
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename UnderlyingIterator::Mask;
-
-  /// Parameters object is precomputed state and is host-constructible
-  class Params {
-   private:
-    friend PredicatedTileAccessIterator2dThreadTile;
-
-    /// Parameters object
-    typename UnderlyingIterator::Params params_;
-
-   public:
-
-    /// Default ctor
-    CUTLASS_HOST_DEVICE
-    Params() { }
-
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout)
-        : params_(layout::PitchLinear(layout.stride(0))){}
-
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(typename UnderlyingIterator::Params::Base const &base) 
-        : params_(base) {}
-  };
-
- private:
-  //
-  // Data members
-  //
-
-  /// Underlying pitch-linear tile iterator
-  UnderlyingIterator iterator_;
-
- public:
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator2dThreadTile(
-      ///< Precomputed parameters object
-      Params const &params,
-      ///< Pointer to start of tensor
-      Pointer pointer,
-      ///< Extent of tensor
-      TensorCoord extent,
-      ///< ID of each participating thread
-      int thread_id,
-      ///< Initial offset of threadblock
-      TensorCoord const &threadblock_offset)
-      : iterator_(params.params_, pointer,
-                  layout::PitchLinearCoord(extent.row(), extent.column()),
-                  thread_id,
-                  layout::PitchLinearCoord(threadblock_offset.row(),
-                                           threadblock_offset.column())) {}
-
-  /// Construct a PredicatedTileAccessIterator2dThreadTile with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator2dThreadTile(
-      Params const &params,  ///< Precomputed parameters object
-      Pointer pointer,       ///< Pointer to start of tensor
-      TensorCoord extent,    ///< Extent of tensor
-      int thread_id          ///< ID of each participating thread
-      )
-      : PredicatedTileAccessIterator2dThreadTile(params, pointer, extent, thread_id,
-                                     make_Coord(0, 0)) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole
-  /// tiles
-  CUTLASS_HOST_DEVICE
-  void add_tile_offset(TensorCoord const &tile_offset) {
-    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return reinterpret_cast<AccessType *>(iterator_.get());
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator2dThreadTile &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator2dThreadTile operator++(int) {
-    PredicatedTileAccessIterator2dThreadTile self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() { iterator_.enable_mask(); }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
-
-  /// Returns whether access is valid or not
-  CUTLASS_HOST_DEVICE
-  bool valid() {
-    return iterator_.valid();
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileAccessIterator2dThreadTile for pitch-linear data.
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, typename AccessType_>
-class PredicatedTileAccessIterator2dThreadTile<Shape_, Element_, layout::RowMajor,
-                                   AdvanceRank, ThreadMap_, AccessType_> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::RowMajor;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Pointer = Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  using UnderlyingIterator = PredicatedTileAccessIterator2dThreadTile<
-      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
-      layout::PitchLinear, (kAdvanceRank == 0 ? 1 : 0), ThreadMap, AccessType>;
-
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename UnderlyingIterator::Mask;
-
-  /// Parameters object is precomputed state and is host-constructible
-  class Params {
-   private:
-    friend PredicatedTileAccessIterator2dThreadTile;
-
-    /// Parameters object
-    typename UnderlyingIterator::Params params_;
-
-   public:
-
-    /// Default ctor
-    CUTLASS_HOST_DEVICE
-    Params() { }
-
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout)
-        : params_(layout::PitchLinear(layout.stride(0))){}
-
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(typename UnderlyingIterator::Params::Base const &base) 
-        : params_(base) {}
-  };
-
- private:
-  //
-  // Data members
-  //
-
-  /// Underlying pitch-linear tile iterator
-  UnderlyingIterator iterator_;
-
- public:
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator2dThreadTile(
-      ///< Precomputed parameters object
-      Params const &params,
-      ///< Pointer to start of tensor
-      Pointer pointer,
-      ///< Extent of tensor
-      TensorCoord extent,
-      ///< ID of each participating thread
-      int thread_id,
-      ///< Initial offset of threadblock
-      TensorCoord const &threadblock_offset)
-      : iterator_(params.params_, pointer,
-                  layout::PitchLinearCoord(extent.column(), extent.row()),
-                  thread_id,
-                  layout::PitchLinearCoord(threadblock_offset.column(),
-                                           threadblock_offset.row())) {}
-
-  /// Construct a PredicatedTileAccessIterator2dThreadTile with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator2dThreadTile(
-      Params const &params,  ///< Precomputed parameters object
-      Pointer pointer,       ///< Pointer to start of tensor
-      TensorCoord extent,    ///< Extent of tensor
-      int thread_id          ///< ID of each participating thread
-      )
-      : PredicatedTileAccessIterator2dThreadTile(params, pointer, extent, thread_id,
-                                     make_Coord(0, 0)) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole
-  /// tiles
-  CUTLASS_HOST_DEVICE
-  void add_tile_offset(TensorCoord const &tile_offset) {
-    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return reinterpret_cast<AccessType *>(iterator_.get());
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator2dThreadTile &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIterator2dThreadTile operator++(int) {
-    PredicatedTileAccessIterator2dThreadTile self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() { iterator_.enable_mask(); }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
-
-  /// Returns whether access is valid or not
-  CUTLASS_HOST_DEVICE
-  bool valid() {
-    return iterator_.valid();
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace transform
-}  // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator_params.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator_params.h
deleted file mode 100755
index c67af387e..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator_params.h
+++ /dev/null
@@ -1,290 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief 
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/detail/helper_macros.hpp"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/pitch_linear.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace transform {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Predicated tile access iterator descriptor object containing template dependent state
-struct PredicatedTileAccessIteratorDesc {
-
-  int element_size_bits = -1;
-  int advance_rank = -1;
-  layout::PitchLinearCoord threadblock_shape;
-  layout::PitchLinearCoord threadmap_iterations;
-  layout::PitchLinearCoord threadmap_delta;
-
-  //
-  // Methods
-  //
-
-  PredicatedTileAccessIteratorDesc() = default;
-
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIteratorDesc(
-    int element_size_bits_,
-    int advance_rank_,
-    layout::PitchLinearCoord threadblock_shape_,
-    layout::PitchLinearCoord threadmap_iterations_,
-    layout::PitchLinearCoord threadmap_delta_
-  ):
-    element_size_bits(element_size_bits_),
-    advance_rank(advance_rank_),
-    threadblock_shape(threadblock_shape_),
-    threadmap_iterations(threadmap_iterations_),
-    threadmap_delta(threadmap_delta_)
-  {
-    #if 0
-    printf("PredicatedTileAccessIteratorDesc(%d, %d, {%d, %d}, {%d, %d}, {%d, %d}})\n",
-      element_size_bits,
-      advance_rank,
-      threadblock_shape.contiguous(), threadblock_shape.strided(),
-      threadmap_iterations.contiguous(), threadmap_iterations.strided(),
-      threadmap_delta.contiguous(), threadmap_delta.strided());
-    #endif
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Helper template to construct an PredicatedTileAccessIteratorDesc from a template 
-// dependent state
-template <
-  typename Shape, typename Element, typename Layout,
-  int AdvanceRank, typename ThreadMap>
-  struct MakePredicatedTileAccessIteratorDesc;
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileAccessIterator for pitch-linear data.
-template <
-  typename Shape, typename Element, int AdvanceRank, 
-  typename ThreadMap>
-struct MakePredicatedTileAccessIteratorDesc <
-    Shape, Element, layout::PitchLinear, AdvanceRank, ThreadMap> {
-
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIteratorDesc operator()() {
-
-    return PredicatedTileAccessIteratorDesc(
-      sizeof_bits<Element>::value,
-      AdvanceRank,
-      {Shape::kContiguous, Shape::kStrided},
-      {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided},
-      {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided}
-    );
-}
-
-};
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileAccessIterator for column-major data.
-template <
-  typename Shape, typename Element, int AdvanceRank, 
-  typename ThreadMap>
-struct MakePredicatedTileAccessIteratorDesc <
-    Shape, Element, layout::ColumnMajor, AdvanceRank, ThreadMap> {
-
-  static int const kAdvanceRank = AdvanceRank;
-
-  using UnderlyingMakeOperator = MakePredicatedTileAccessIteratorDesc<
-      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
-      layout::PitchLinear, (kAdvanceRank == 0 ? 0 : 1), ThreadMap>;
-
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIteratorDesc operator()() {
-
-    return UnderlyingMakeOperator()();
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileAccessIterator for row-major data.
-template <
-  typename Shape, typename Element, int AdvanceRank, 
-  typename ThreadMap>
-struct MakePredicatedTileAccessIteratorDesc <
-    Shape, Element, layout::RowMajor, AdvanceRank, ThreadMap> {
-
-  static int const kAdvanceRank = AdvanceRank;
-
-  using UnderlyingMakeOperator = MakePredicatedTileAccessIteratorDesc<
-      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
-      layout::PitchLinear, (kAdvanceRank == 0 ? 1 : 0), ThreadMap>;
-
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIteratorDesc operator()() {
-
-    return UnderlyingMakeOperator()();
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileAccessIterator for column-major interleaved data.
-template <
-  typename Shape, typename Element, int AdvanceRank, 
-  typename ThreadMap, int InterleavedK>
-struct MakePredicatedTileAccessIteratorDesc <
-    Shape, Element, layout::ColumnMajorInterleaved<InterleavedK>, AdvanceRank, ThreadMap> {
-
-  static int const kAdvanceRank = AdvanceRank;
-  static int const kInterleavedK = InterleavedK;
-
-  using UnderlyingMakeOperator = MakePredicatedTileAccessIteratorDesc<
-      layout::PitchLinearShape<Shape::kRow * kInterleavedK, Shape::kColumn / kInterleavedK>, Element,
-      layout::PitchLinear, (kAdvanceRank == 0 ? 0 : 1), ThreadMap>;
-
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIteratorDesc operator()() {
-
-    return UnderlyingMakeOperator()();
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileAccessIterator for roww-major interleaved data.
-template <
-  typename Shape, typename Element, int AdvanceRank, 
-  typename ThreadMap, int InterleavedK>
-struct MakePredicatedTileAccessIteratorDesc <
-    Shape, Element, layout::RowMajorInterleaved<InterleavedK>, AdvanceRank, ThreadMap> {
-
-  static int const kAdvanceRank = AdvanceRank;
-  static int const kInterleavedK = InterleavedK;
-
-  using UnderlyingMakeOperator = MakePredicatedTileAccessIteratorDesc<
-      layout::PitchLinearShape<Shape::kColumn * kInterleavedK, Shape::kRow / kInterleavedK>, Element,
-      layout::PitchLinear, (kAdvanceRank == 0 ? 1 : 0), ThreadMap>;
-
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIteratorDesc operator()() {
-
-    return UnderlyingMakeOperator()();
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-//
-// Parameters struct
-//
-
-struct PredicatedTileAccessIteratorParams {
-
-  using Index = int32_t;
-  using LongIndex = int64_t;
-
-  //
-  // Data members
-  //
-  /// stride of pitch-linear layout (units of Element)
-  LongIndex stride_ = 0;
-  /// amount (in byte) to increment pointer to move to next access along
-  /// strided dimension
-  LongIndex inc_strided_ = 0;
-  /// amount (in byte) to increment pointer from last access to first access
-  /// of next tile
-  LongIndex inc_next_ = 0;
-  /// amount (in byte) to increment pointer from first access of current tile
-  /// to first access of next tile
-  LongIndex inc_advance_ = 0;
-
-  //
-  // Methods
-  //
-
-  CUTLASS_HOST_DEVICE
-  Status initialize(LongIndex stride, PredicatedTileAccessIteratorDesc desc) {
-    CUTLASS_ASSERT(desc.element_size_bits > 0);
-    CUTLASS_ASSERT(desc.advance_rank == 0 || desc.advance_rank == 1);
-
-    stride_ = stride;
-
-    inc_strided_ = (LongIndex(stride_) * desc.threadmap_delta.strided()) *
-                     desc.element_size_bits / 8;
-
-    if (desc.advance_rank) {
-      // advance along strided dimension
-      inc_advance_ =
-          desc.threadblock_shape.strided() * LongIndex(stride_) * desc.element_size_bits / 8;
-    } else {
-      // advance along contiguous dimension
-      inc_advance_ = desc.threadblock_shape.contiguous() * desc.element_size_bits / 8;
-    }
-
-    inc_next_ = inc_advance_ - LongIndex(desc.threadmap_iterations.strided() - 1) *
-                                   desc.threadmap_delta.strided() * LongIndex(stride_) *
-                                   desc.element_size_bits / 8;    
-
-    return Status::kSuccess;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Status initialize(Index stride, PredicatedTileAccessIteratorDesc desc) {
-    return initialize(LongIndex(stride), desc);
-  }
-
-  PredicatedTileAccessIteratorParams() = default;
-
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIteratorParams(Index stride, PredicatedTileAccessIteratorDesc desc) {
-    initialize(stride, desc);
-  }
-
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIteratorParams(LongIndex stride, PredicatedTileAccessIteratorDesc desc) {
-    initialize(stride, desc);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace transform
-}  // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator_triangular_matrix.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator_triangular_matrix.h
deleted file mode 100755
index 24498843d..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator_triangular_matrix.h
+++ /dev/null
@@ -1,892 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates calculating the address and predicates to the load of tiles
-   from pitch-linear rank=2 tensors.
-
-    This iterator uses masks to guard out-of-bounds accesses and visits the last
-   "residue" tile first, with the objective of minimizing predicate mask updates
-   during steady-state operation.
-
-    A precomputed "Params" object minimizes the amount of state that must be
-   stored in registers, and integer addition is used to advance the pointer
-   through memory.
-
-  
-*/
-
-#pragma once
-
-#include "cutlass/blas3.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/predicate_vector.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace transform {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// PredicatedTileAccessIteratorTriangularMatrix
-///
-template <typename Shape, typename Element, typename Layout, 
-          int AdvanceRank, typename ThreadMap, 
-          SideMode kSideMode, FillMode kFillMode, DiagType kDiagType, 
-          typename AccessType>
-class PredicatedTileAccessIteratorTriangularMatrix;
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileAccessIteratorTriangularMatrix for pitch-linear data.
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, SideMode kSideMode, FillMode kFillMode, DiagType kDiagType, typename AccessType_>
-class PredicatedTileAccessIteratorTriangularMatrix<Shape_, Element_, layout::PitchLinear,
-                                   AdvanceRank, ThreadMap_, kSideMode, kFillMode, kDiagType, AccessType_> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::PitchLinear;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  using StrideIndex = typename Layout::Stride::Index;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Pointer = Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
-
-  using CompareOp = typename TrMatrixCompareOp<kFillMode, kDiagType>::Type;
-
-  static_assert( kFillMode == FillMode::kFull || 
-                 ((kFillMode == FillMode::kLower || kFillMode == FillMode::kUpper) && AccessType::kElements == 1), 
-                 "BLAS3 iterator for the triangular/symmetric matrix must use AccessType::kElements as 1");
-
-  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), 
-    "Vectors implied by the thread map must be divisible by the access type.");
-
-  static int const kPredicatesPerByte = 4;
-  static int const kPredicatesPerWord = 4 * kPredicatesPerByte;
-
-  static int const kPredicateCount = ThreadMap::Iterations::kCount * kAccessesPerVector;
-
-  /// Number of 32b words containing predicates
-  static int const kPredicateByteCount = 
-    (kPredicateCount + kPredicatesPerByte - 1) / kPredicatesPerByte;
-  static int const kPredicateWordCount = (kPredicateByteCount + 3) / 4;
-
-  static unsigned const kPredicateMask = (1u << kPredicatesPerByte) - 1u;
-
-  static_assert(kPredicateWordCount <= 4, "Too many predicates.");
-
-  /// Predicate vector stores mask to guard accesses
-  using Mask = Array<uint32_t, kPredicateWordCount>;
-
-  /// Parameters object is precomputed state and is host-constructible
-  class Params {
-   public:
-    friend PredicatedTileAccessIteratorTriangularMatrix;
-
-   private:
-    /// stride of pitch-linear layout (units of Element)
-    StrideIndex stride_;
-    /// (true)  pitch-linear layout is mapped to row-major matrix 
-    /// (false) pitch-linear layout is mapped to column-major matrix
-    bool is_row_major_;
-    /// for vectorized access across the diagonal boundary guard condition is
-    /// checked for the element on the boundary
-    int access_diagonal_boundary_;    
-    /// amount (in byte) to increment pointer to move to next access along
-    /// strided dimension
-    LongIndex inc_strided_;
-    /// amount (in byte) to increment pointer from last access to first access
-    /// of next tile
-    LongIndex inc_next_;
-    /// amount (in byte) to increment pointer from first access of current tile
-    /// to first access of next tile
-    LongIndex inc_advance_;
-
-   public:
-
-    // Default ctor
-    CUTLASS_HOST_DEVICE
-    Params(): stride_(0), inc_strided_(0), inc_next_(0), inc_advance_(0), is_row_major_(false), access_diagonal_boundary_(0) { }
-
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout, bool is_row_major, int access_diagonal_boundary) : 
-      stride_(layout.stride(0)), is_row_major_(is_row_major), access_diagonal_boundary_(access_diagonal_boundary) {
-
-      inc_strided_ = (LongIndex(stride_) * ThreadMap::Delta::kStrided) *
-                     sizeof_bits<Element>::value / 8;
-
-      if (kAdvanceRank) {
-        // advance along strided dimension
-        inc_advance_ =
-            Shape::kStrided * LongIndex(stride_) * sizeof_bits<Element>::value / 8;
-      } else {
-        // advance along contiguous dimension
-        inc_advance_ = Shape::kContiguous * sizeof_bits<Element>::value / 8;
-      }
-
-      inc_next_ = inc_advance_ - LongIndex(ThreadMap::Iterations::kStrided - 1) *
-                                     ThreadMap::Delta::kStrided * LongIndex(stride_) *
-                                     sizeof_bits<Element>::value / 8;
-
-    };
-
-
-  };
-
- private:
-  /// Internal pointer type permits fast address arithmetic
-  using BytePointer = char *;
-
- private:
-  //
-  // Data members
-  //
-
-  /// Parameters object with precomputed internal state
-  Params const &params_;
-
-  /// Internal pointer to first access of tile
-  BytePointer pointer_;
-
-  /// Guard predicates
-  uint32_t predicates_[kPredicateWordCount];
-
-  /// Track global memory addresses on the diagonal 
-  /// To ignore imag part for diagonal elements of hermitian matrices
-  uint32_t predicates_onDiag_[kPredicateWordCount];
-
-  /// Size of tensor
-  TensorCoord extent_;
-
-  /// Initial offset for each thread
-  TensorCoord thread_offset_;
-
-  /// Iteration along vectors implied by the thread map
-  int iteration_vector_;
-
-  /// Iteration in the contiguous dimension
-  int iteration_contiguous_;
-
-  /// Iteration in the strided dimension
-  int iteration_strided_;
-
- private:
-  /// Computes predicates based on internally tracked per-thread offset.
-  CUTLASS_DEVICE
-  void compute_predicates_(
-      /// Extent of the matrix window
-      TensorCoord extent) {
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kPredicateWordCount; ++i) {
-      predicates_[i] = 0u;
-      predicates_onDiag_[i] = 0u;
-    }
-
-    CompareOp compare_op;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int access_idx = 0; access_idx < ThreadMap::Iterations::kCount * kAccessesPerVector; ++access_idx) {
-
-      int s = access_idx / (ThreadMap::Iterations::kContiguous * kAccessesPerVector);
-      
-      int access_residual = access_idx % (ThreadMap::Iterations::kContiguous * kAccessesPerVector);
-
-      int c = access_residual / kAccessesPerVector;
-      int v = access_residual % kAccessesPerVector;
-
-      TensorCoord iteration_coord(c * ThreadMap::Delta::kContiguous + v * AccessType::kElements,
-                                s * ThreadMap::Delta::kStrided);
-
-      TensorCoord coord = thread_offset_ + iteration_coord;
-
-      bool guard;
-      bool onDiag = false;
-
-      guard = ((coord.strided() < extent.strided()) && 
-                (coord.contiguous() < extent.contiguous()));
-    
-
-      // guard access on the wrong side of the triagular matrix diagonal
-      if (kFillMode == FillMode::kLower || kFillMode == FillMode::kUpper) {
-        coord += TensorCoord{params_.access_diagonal_boundary_, 0};
-
-        bool triagular_guard_row_major = compare_op(coord.strided(), coord.contiguous()) | !params_.is_row_major_;
-        bool triagular_guard_col_major = compare_op(coord.contiguous(), coord.strided()) | params_.is_row_major_;
-        
-        guard = guard && triagular_guard_row_major && triagular_guard_col_major;
-
-        if (kDiagType == DiagType::kUnit) {
-          onDiag = (guard && coord.strided() == coord.contiguous()) ? true : false;
-        }
-      }
-
-      int pred_idx_onDiag = v + kAccessesPerVector * (c + ThreadMap::Iterations::kContiguous * s);
-      int word_idx_onDiag = pred_idx_onDiag / kPredicatesPerWord;
-      int residual_onDiag = pred_idx_onDiag % kPredicatesPerWord;
-      int byte_idx_onDiag = residual_onDiag / kPredicatesPerByte;
-      int bit_idx_onDiag = residual_onDiag % kPredicatesPerByte;
-      
-      predicates_onDiag_[word_idx_onDiag] |= (unsigned(onDiag) << (byte_idx_onDiag * 8 + bit_idx_onDiag));
-
-      int pred_idx = v + kAccessesPerVector * (c + ThreadMap::Iterations::kContiguous * s);
-
-      int word_idx = pred_idx / kPredicatesPerWord;
-      int residual = pred_idx % kPredicatesPerWord;
-      int byte_idx = residual / kPredicatesPerByte;
-      int bit_idx = residual % kPredicatesPerByte;
-      
-      predicates_[word_idx] |= (unsigned(guard) << (byte_idx * 8 + bit_idx));
-
-    }
-
-  }
-
- public:
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIteratorTriangularMatrix(
-      /// Precomputed parameters object
-      Params const &params,
-      /// Pointer to start of tensor
-      Pointer pointer,
-      /// Extent of tensor
-      TensorCoord extent,
-      /// ID of each participating thread
-      int thread_id,
-      /// Initial offset of threadblock
-      TensorCoord const &threadblock_offset)
-      : params_(params),
-        pointer_(reinterpret_cast<BytePointer>(const_cast<NonConstPointer>(pointer))),
-        extent_(extent) {
-
-
-    // Per-thread offset in logical coordinates of tensor
-    thread_offset_ = threadblock_offset + ThreadMap::initial_offset(thread_id);
-
-    // update internal pointers
-    Layout layout(params_.stride_);
-    add_pointer_offset(layout(thread_offset_));
-
-    compute_predicates_(extent_);
-
-    set_iteration_index(0);
-  }
-
-  /// Construct a PredicatedTileAccessIteratorTriangularMatrix with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIteratorTriangularMatrix(
-      /// Precomputed parameters object
-      Params const &params,
-      /// Pointer to start of tensor
-      Pointer pointer,
-      /// Extent of tensor
-      TensorCoord extent,
-      ///< ID of each participating thread
-      int thread_id)
-      : PredicatedTileAccessIteratorTriangularMatrix(params, pointer, extent, thread_id,
-                                     make_Coord(0, 0)) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) {
-
-    iteration_vector_ = index % kAccessesPerVector;
-    int residual_access = index / kAccessesPerVector;
-
-    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
-
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    pointer_ += sizeof_bits<Element>::value * pointer_offset / 8;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &tile_offset) {
-
-    if (kAdvanceRank) {
-      pointer_ += params_.inc_advance_ * LongIndex(tile_offset.strided());
-      pointer_ += Shape::kContiguous * tile_offset.contiguous();
-      thread_offset_ += TensorCoord{0, Shape::kStrided * tile_offset.strided()};
-    } else {
-      pointer_ += params_.inc_advance_ * LongIndex(tile_offset.contiguous());
-      pointer_ += Shape::kStrided * tile_offset.strided();
-      thread_offset_ += TensorCoord{Shape::kContiguous * tile_offset.contiguous(), 0};
-    }
-
-    compute_predicates_(extent_);
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return reinterpret_cast<AccessType *>(
-        pointer_ + 
-        iteration_contiguous_ * (ThreadMap::Delta::kContiguous * sizeof_bits<Element>::value) / 8) + iteration_vector_;
-  }
-
-  /// Increment and return an instance to self.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIteratorTriangularMatrix &operator++() {
-
-    ++iteration_vector_;
-    if (iteration_vector_ < kAccessesPerVector) {
-      return *this;
-    }
-
-    iteration_vector_ = 0;
-    ++iteration_contiguous_;
-
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
-      return *this;
-    }
-
-    // Enter here only if (iteration_contiguous_ ==
-    // ThreadMap::Iteration::kContiguous)
-    iteration_contiguous_ = 0;
-    ++iteration_strided_;
-
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      pointer_ += params_.inc_strided_;
-      return *this;
-    }
-
-    // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided)
-    // which means we enter the next tile.
-    iteration_strided_ = 0;
-
-    // advance to next tile
-    pointer_ += params_.inc_next_;
-
-    // now return to start tile - if the iterator is subsequently advanced, this
-    // subtraction as well as the subsequent integer addition are both elided by
-    // the compiler.
-    pointer_ -= params_.inc_advance_;
-
-    return *this;
-  }
-
-  /// Increment and return an instance to self.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIteratorTriangularMatrix operator++(int) {
-    PredicatedTileAccessIteratorTriangularMatrix self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) {
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kPredicateWordCount; ++i) {
-      predicates_[i] = enable ? 0u : predicates_[i];
-    }
-
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() {
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kPredicateWordCount; ++i) {
-      predicates_[i] = 0xffffffff;
-    }
-  }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) { 
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kPredicateWordCount; ++i) {
-      predicates_[i] = mask[i];
-    }
-
-  }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) {
-     CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kPredicateWordCount; ++i) {
-      mask[i] = predicates_[i];
-    }
-  }
-
-  /// Return if the address in on the diagonal
-  CUTLASS_HOST_DEVICE
-  bool getOnDiag() {
-    int pred_idx = 
-      iteration_vector_ + kAccessesPerVector * (iteration_contiguous_ + iteration_strided_ * ThreadMap::Iterations::kContiguous);
-
-    int word_idx = pred_idx / kPredicatesPerWord;
-    int residual = pred_idx % kPredicatesPerWord;
-    int byte_idx = residual / kPredicatesPerByte;
-    int bit_idx = residual % kPredicatesPerByte;
-    
-    bool pred = (predicates_onDiag_[word_idx] & (1u << (byte_idx * 8 + bit_idx))) != 0;
-    return pred;
-  }
-
-  /// Returns whether access is valid or not
-  CUTLASS_HOST_DEVICE
-  bool valid() {
-
-    
-    int pred_idx = 
-      iteration_vector_ + kAccessesPerVector * (iteration_contiguous_ + iteration_strided_ * ThreadMap::Iterations::kContiguous);
-
-    int word_idx = pred_idx / kPredicatesPerWord;
-    int residual = pred_idx % kPredicatesPerWord;
-    int byte_idx = residual / kPredicatesPerByte;
-    int bit_idx = residual % kPredicatesPerByte;
-    
-    bool pred = (predicates_[word_idx] & (1u << (byte_idx * 8 + bit_idx))) != 0;
-    return pred;
-    
-
-    //return true;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileAccessIteratorTriangularMatrix for column-major data.
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank, typename ThreadMap_, 
-            SideMode kSideMode, FillMode kFillMode, DiagType kDiagType, 
-            typename AccessType_>
-class PredicatedTileAccessIteratorTriangularMatrix<Shape_, Element_, layout::ColumnMajor,
-                                   AdvanceRank, ThreadMap_, kSideMode, kFillMode, kDiagType, 
-                                   AccessType_> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::ColumnMajor;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Pointer = Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  using UnderlyingIterator = PredicatedTileAccessIteratorTriangularMatrix<
-      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
-      layout::PitchLinear, (kAdvanceRank == 0 ? 0 : 1), ThreadMap, 
-      kSideMode, kFillMode, kDiagType, AccessType>;
-
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename UnderlyingIterator::Mask;
-
-  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
-
-  static int const kAccessDiagonalBoundary = 
-    (kFillMode == FillMode::kLower) ? (AccessType::kElements - 1) : 0;
-
-  /// Parameters object is precomputed state and is host-constructible
-  class Params {
-   private:
-    friend PredicatedTileAccessIteratorTriangularMatrix;
-
-    /// Parameters object
-    typename UnderlyingIterator::Params params_;
-
-   public:
-
-    /// Default ctor
-    CUTLASS_HOST_DEVICE
-    Params() { }
-
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout)
-        : params_(layout::PitchLinear(layout.stride(0)), false, kAccessDiagonalBoundary){};
-  };
-
- private:
-  //
-  // Data members
-  //
-
-  /// Underlying pitch-linear tile iterator
-  UnderlyingIterator iterator_;
-
- public:
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIteratorTriangularMatrix(
-      ///< Precomputed parameters object
-      Params const &params,
-      ///< Pointer to start of tensor
-      Pointer pointer,
-      ///< Extent of tensor
-      TensorCoord extent,
-      ///< ID of each participating thread
-      int thread_id,
-      ///< Initial offset of threadblock
-      TensorCoord const &threadblock_offset)
-      : iterator_(params.params_, pointer,
-                  layout::PitchLinearCoord(extent.row(), extent.column()),
-                  thread_id,
-                  layout::PitchLinearCoord(threadblock_offset.row(),
-                                           threadblock_offset.column())) {}
-
-  /// Construct a PredicatedTileAccessIteratorTriangularMatrix with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIteratorTriangularMatrix(
-      Params const &params,  ///< Precomputed parameters object
-      Pointer pointer,       ///< Pointer to start of tensor
-      TensorCoord extent,    ///< Extent of tensor
-      int thread_id          ///< ID of each participating thread
-      )
-      : PredicatedTileAccessIteratorTriangularMatrix(params, pointer, extent, thread_id,
-                                     make_Coord(0, 0)) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole
-  /// tiles
-  CUTLASS_HOST_DEVICE
-  void add_tile_offset(TensorCoord const &tile_offset) {
-    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return reinterpret_cast<AccessType *>(iterator_.get());
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIteratorTriangularMatrix &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIteratorTriangularMatrix operator++(int) {
-    PredicatedTileAccessIteratorTriangularMatrix self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() { iterator_.enable_mask(); }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
-
-  /// Return if the address in on the diagonal
-  CUTLASS_HOST_DEVICE
-  bool getOnDiag() {
-    return iterator_.getOnDiag();
-  }
-
-  /// Returns whether access is valid or not
-  CUTLASS_HOST_DEVICE
-  bool valid() {
-    return iterator_.valid();
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileAccessIteratorTriangularMatrix for row-major data.
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank, typename ThreadMap_, 
-          SideMode kSideMode, FillMode kFillMode, DiagType kDiagType, 
-          typename AccessType_>
-class PredicatedTileAccessIteratorTriangularMatrix<Shape_, Element_, layout::RowMajor, AdvanceRank, ThreadMap_, 
-                                                  kSideMode, kFillMode, kDiagType, AccessType_> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::RowMajor;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-  using AccessType = AccessType_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Pointer = Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  using UnderlyingIterator = PredicatedTileAccessIteratorTriangularMatrix<
-      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
-      layout::PitchLinear, (kAdvanceRank == 0 ? 1 : 0), ThreadMap, 
-      kSideMode, kFillMode, kDiagType, AccessType>;
-
-  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
-
-  static int const kAccessDiagonalBoundary = 
-    (kFillMode == FillMode::kUpper) ? (AccessType::kElements - 1) : 0;
-
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename UnderlyingIterator::Mask;
-
-  /// Parameters object is precomputed state and is host-constructible
-  class Params {
-   private:
-    friend PredicatedTileAccessIteratorTriangularMatrix;
-
-    /// Parameters object
-    typename UnderlyingIterator::Params params_;
-
-   public:
-
-    /// Default ctor
-    CUTLASS_HOST_DEVICE
-    Params() { }
-
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout)
-        : params_(layout::PitchLinear(layout.stride(0)), true, kAccessDiagonalBoundary){};
-  };
-
- private:
-  //
-  // Data members
-  //
-
-  /// Underlying pitch-linear tile iterator
-  UnderlyingIterator iterator_;
-
- public:
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIteratorTriangularMatrix(
-      ///< Precomputed parameters object
-      Params const &params,
-      ///< Pointer to start of tensor
-      Pointer pointer,
-      ///< Extent of tensor
-      TensorCoord extent,
-      ///< ID of each participating thread
-      int thread_id,
-      ///< Initial offset of threadblock
-      TensorCoord const &threadblock_offset)
-      : iterator_(params.params_, pointer,
-                  layout::PitchLinearCoord(extent.column(), extent.row()),
-                  thread_id,
-                  layout::PitchLinearCoord(threadblock_offset.column(),
-                                           threadblock_offset.row())) {}
-
-  /// Construct a PredicatedTileAccessIteratorTriangularMatrix with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIteratorTriangularMatrix(
-      Params const &params,  ///< Precomputed parameters object
-      Pointer pointer,       ///< Pointer to start of tensor
-      TensorCoord extent,    ///< Extent of tensor
-      int thread_id          ///< ID of each participating thread
-      )
-      : PredicatedTileAccessIteratorTriangularMatrix(params, pointer, extent, thread_id,
-                                     make_Coord(0, 0)) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole
-  /// tiles
-  CUTLASS_HOST_DEVICE
-  void add_tile_offset(TensorCoord const &tile_offset) {
-    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return reinterpret_cast<AccessType *>(iterator_.get());
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIteratorTriangularMatrix &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileAccessIteratorTriangularMatrix operator++(int) {
-    PredicatedTileAccessIteratorTriangularMatrix self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() { iterator_.enable_mask(); }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
-
-  /// Return if the address in on the diagonal
-  CUTLASS_HOST_DEVICE
-  bool getOnDiag() {
-    return iterator_.getOnDiag();
-  }
-
-  /// Returns whether access is valid or not
-  CUTLASS_HOST_DEVICE
-  bool valid() {
-    return iterator_.valid();
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace transform
-}  // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_iterator.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_iterator.h
deleted file mode 100755
index bdfb33fe5..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_iterator.h
+++ /dev/null
@@ -1,1887 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing loading of tiles from pitch-linear rank=2 tensors. 
-
-    This iterator uses masks to guard out-of-bounds accesses. The first tile this
-    iterator visits maybe partial, then the remaining tiles are complete. So, we 
-    only need to compute the predicates twice, once before the first tile and 
-    once for the remaining full tiles which can share the same predicates.
-
-    A precomputed "Params" object minimizes the amount of state that must be stored in registers,
-    and integer addition is used to advance the pointer through memory.
-*/
-
-#pragma once
-
-#include "cutlass/arch/memory.h"
-#include "cutlass/transform/threadblock/predicated_tile_access_iterator.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace transform {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// PredicatedTileIterator
-///
-/// Satisfies: ForwardTileIteratorConcept | 
-///            ReadableContiguousTileIteratorConcept | 
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-/// Regular tile iterator using a precomputed control structure to minimize register liveness
-/// and integer arithmetic.
-///
-/// Layout is assumed to be invariant at the time the precomputed "Params" object is constructed.
-///
-/// Base pointer and tensor extents may be specified at the time the iterator is constructed.
-/// Subsequently, they are assumed to be immutable.
-///
-/// Adding a logical coordinate offset may be performed at the time the iterator is constructed.
-/// Subsequent additions to logical coordinate offset may be performed but are relatively expensive.
-///
-/// Visitation order is intended to first visit a "residual" tile that may be partially full in
-/// both the advance dimension and the steady-state dimension. This is assumed to be the last
-/// tile in the iteration sequence. Advancing an iterator that has just been constructed moves to
-/// the first tile that is full in the advance dimension and recomputes predicates. Subsequent
-/// accesses may be performed without updating internal predicates and are efficient in terms of
-/// live register state and pointer arithmetic instructions.
-///
-/// To be efficient, this assumes the iterator will be dereferenced and advanced at least once
-/// outside any looping structure to minimize integer arithmetic. 
-///
-/// Acceses out of bounds are safe so long as `clear_mask()` is called prior to dereferencing
-/// the iterator.
-///
-///
-/// Example:
-///
-/// An efficient pipeline structure may be constructed as follows:
-///
-// template <typename Iterator>
-// __global__ void kernel(
-//   typename Iterator::Params params, 
-//   typename Iterator::Element *ptr,
-//   TensorCoord extent) {
-//
-//   typename Iterator::Fragment fragment;
-//
-//   TensorCoord threadblock_offset(0, 0);
-//
-//   Iterator iter(params, ptr, extent, threadIdx.x, threadblock_offsets);
-//
-//
-//   fragment = *iter;        // load "residue" tile first
-//   ++iter;                  // advance to first "steady state" tile and update internal masks
-//
-//
-//   #pragma unroll
-//   for (int i = Remaining - 1; i >= 0; --i) {
-//
-//     f(fragment);
-//
-//     if (!i) {
-//       iter.clear_mask();   // light-weight operation to clear masks - subsequent loads become NO-OPs.
-//     }
-//  
-//     fragment = *iter;      // load tile during "steady state" phase
-//     ++iter;                // advance to next tile - lightweight due to steady-state masks
-//   }
-// }
-//
-// void host(TensorView<Element, 2, layout::PitchLinear> view) {
-//
-//   using Iterator = transform::threadblock::PredicatedTileIterator;
-//
-//   typename Iterator::Params params(view.layout());
-//
-//   kernel<Iterator>(params, view.data());
-// }
-///
-///
-template <
-  typename Shape,
-  typename Element,
-  typename Layout,
-  int AdvanceRank,
-  typename ThreadMap,
-  int AccessSize = ThreadMap::kElementsPerAccess,
-  bool Gather = false,
-  typename PermuteLayout = layout::NoPermute
->
-class PredicatedTileIterator;
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileIterator for pitch-linear data.
-///
-/// Satisfies: ForwardTileIteratorConcept | 
-///            ReadableContiguousTileIteratorConcept | 
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, int AccessSize, bool Gather, typename PermuteLayout>
-class PredicatedTileIterator<Shape_, Element_, layout::PitchLinear, AdvanceRank,
-                             ThreadMap_, AccessSize, Gather, PermuteLayout> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::PitchLinear;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Pointer = Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  /// Type used for internal memory accesses
-  using AccessType = AlignedArray<Element, AccessSize, (AccessSize * sizeof_bits<Element>::value / 8)>;
-
-  /// Underlying iterator to compute the addresses
-  using TileAccessIterator =
-      PredicatedTileAccessIterator<Shape, Element, Layout, kAdvanceRank,
-                                   ThreadMap, AccessType, Gather, PermuteLayout>;
-
-  static int const kAccessesPerVector = TileAccessIterator::kAccessesPerVector;
-
-  /// Fragment object to be loaded or stored
-  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount *
-                                               ThreadMap::kElementsPerAccess>;
-
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename TileAccessIterator::Mask;
-
-  /// Parameters object is precomputed state and is host-constructible
-  class Params {
-   public:
-    using Base = typename TileAccessIterator::Params::Base;
-
-    friend PredicatedTileIterator;
-
-   private:
-    /// Parameters object
-    typename TileAccessIterator::Params params_;
-
-   public:
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout) : params_(layout) {}
-
-    /// Default constructor
-    Params() = default;
-
-    CUTLASS_HOST_DEVICE
-    Params(Base const &base)
-        : params_(base) {}
-  };
-
- private:
-  /// Internal pointer type permits fast address arithmetic
-  using BytePointer = char *;
-
- private:
-  //
-  // Data members
-  //
-
-  /// Data member to the tile access iterator
-  TileAccessIterator address_iterator_;
-
- public:
-
-  /// Default constructor
-  PredicatedTileIterator() = default;
-
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator(
-      /// Precomputed parameters object
-      Params const &params,
-      /// Pointer to start of tensor
-      Pointer pointer,
-      /// Extent of tensor
-      TensorCoord extent,
-      /// ID of each participating thread
-      int thread_id,
-      /// Initial offset of threadblock
-      TensorCoord const &threadblock_offset,
-      /// Gather indices
-      int const *indices = nullptr)
-      : address_iterator_(params.params_, pointer, extent, thread_id,
-                          threadblock_offset, indices) {}
-
-  /// Construct a PredicatedTileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator(
-      Params const &params,  ///< Precomputed parameters object
-      Pointer pointer,       ///< Pointer to start of tensor
-      TensorCoord extent,    ///< Extent of tensor
-      int thread_id          ///< ID of each participating thread
-      )
-      : PredicatedTileIterator(params, pointer, extent, thread_id,
-                               make_Coord(0, 0)) {}
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    address_iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator &operator++() {
-    if (kAdvanceRank)
-      address_iterator_.add_tile_offset({0, 1});
-    else
-      address_iterator_.add_tile_offset({1, 0});
-
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator operator++(int) {
-    PredicatedTileIterator self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) { address_iterator_.clear_mask(enable); }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() { address_iterator_.enable_mask(); }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) { address_iterator_.set_mask(mask); }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) { address_iterator_.get_mask(mask); }
-
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    load_with_byte_offset(frag, pointer_offset * sizeof_bits<Element>::value / 8);
-  }
-
-  CUTLASS_DEVICE
-  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
-
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < kAccessesPerVector; ++v) {
-
-          int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
-          
-          address_iterator_.set_iteration_index(idx);
-          char const *byte_ptr = reinterpret_cast<char const *>(address_iterator_.get()) + byte_offset;
-
-          AccessType const *access_ptr = reinterpret_cast<AccessType const *>(byte_ptr);
-
-          cutlass::arch::global_load<AccessType,
-                                     sizeof(AccessType)
-                                    >(
-              frag_ptr[idx], access_ptr, address_iterator_.valid());
-
-          ++address_iterator_;
-        }
-      }
-    }
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) { load_with_byte_offset(frag, 0); }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-    store_with_byte_offset(frag, pointer_offset * sizeof_bits<Element>::value / 8);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
-    address_iterator_.set_iteration_index(0);
-    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < kAccessesPerVector; ++v) {
-
-          int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
-
-          char *byte_ptr = reinterpret_cast<char *>(address_iterator_.get()) + byte_offset;
-          AccessType *access_ptr = reinterpret_cast<AccessType *>(byte_ptr);
-
-          if (address_iterator_.valid()) {
-            *access_ptr = frag_ptr[idx];
-          }
-          ++address_iterator_;
-        }
-      }
-    }
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) { store_with_byte_offset(frag, 0); }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileIterator for column-major data.
-///
-/// Satisfies: ForwardTileIteratorConcept | 
-///            ReadableContiguousTileIteratorConcept | 
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <
-  typename Shape_,
-  typename Element_,
-  int AdvanceRank,
-  typename ThreadMap_,
-  int AccessSize,
-  bool Gather,
-  typename PermuteLayout
->
-class PredicatedTileIterator<Shape_, Element_, layout::ColumnMajor, AdvanceRank, 
-                             ThreadMap_, AccessSize, Gather, PermuteLayout> {
-public:
-
-  static_assert(AdvanceRank == 0 || AdvanceRank == 1, 
-    "Specialization for pitch-linear iterator may along advance along the "
-    "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::ColumnMajor;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Pointer = Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  using UnderlyingIterator = PredicatedTileIterator<
-    layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
-    Element,
-    layout::PitchLinear,
-    (kAdvanceRank == 0 ? 0 : 1),
-    ThreadMap,
-    AccessSize,
-    Gather,
-    PermuteLayout
-  >;
-
-  using AccessType = typename UnderlyingIterator::AccessType;
-
-  /// Fragment object to be loaded or stored
-  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
-
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename UnderlyingIterator::Mask;
-
-  /// Parameters object is precomputed state and is host-constructible
-  class Params {
-  private:
-
-    friend PredicatedTileIterator;
-
-    /// Parameters object
-    typename UnderlyingIterator::Params params_;
-
-  public:
-
-    /// Default constructor
-    Params() = default;
-
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout): params_(layout::PitchLinear(layout.stride(0)))
-    {}
-
-    CUTLASS_HOST_DEVICE
-    Params(typename UnderlyingIterator::Params::Base const &base)
-        : params_(base) {}
-  };
-
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Underlying pitch-linear tile iterator
-  UnderlyingIterator iterator_;
-
-public:
-
-  /// Default constructor
-  PredicatedTileIterator() = default;
-
-  /// Constructs a TileIterator from its precomputed state, threadblock offset, and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator(
-    Params const &params,                         ///< Precomputed parameters object 
-    Pointer pointer,                              ///< Pointer to start of tensor
-    TensorCoord extent,                           ///< Extent of tensor
-    int thread_id,                                ///< ID of each participating thread
-    TensorCoord const &threadblock_offset,         ///< Initial offset of threadblock
-    int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
-  ):
-    iterator_(
-      params.params_,
-      pointer,
-      layout::PitchLinearCoord(extent.row(), extent.column()),
-      thread_id,
-      layout::PitchLinearCoord(threadblock_offset.row(), threadblock_offset.column()),
-      indices)
-    { }
-
-  /// Construct a PredicatedTileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator(
-    Params const &params,                         ///< Precomputed parameters object
-    Pointer pointer,                              ///< Pointer to start of tensor
-    TensorCoord extent,                           ///< Extent of tensor
-    int thread_id                                 ///< ID of each participating thread
-  ): PredicatedTileIterator(params, pointer, extent, thread_id, make_Coord(0, 0)) { }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the iterator's
-  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
-  /// are lightweight and must only update the internal pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the iterator's
-  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
-  /// are lightweight and must only update the internal pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator operator++(int) {
-    PredicatedTileIterator self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) {
-    iterator_.clear_mask(enable);
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() {
-    iterator_.enable_mask();
-  }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) {
-    iterator_.set_mask(mask);
-  }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) {
-    iterator_.get_mask(mask);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
-    iterator_.load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) {
-    load_with_pointer_offset(frag, 0);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-    iterator_.store_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
-    iterator_.store_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileIterator for row-major data.
-///
-/// Satisfies: ForwardTileIteratorConcept | 
-///            ReadableContiguousTileIteratorConcept | 
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <
-  typename Shape_,
-  typename Element_,
-  int AdvanceRank,
-  typename ThreadMap_,
-  int AccessSize,
-  bool Gather,
-  typename PermuteLayout
->
-class PredicatedTileIterator<Shape_, Element_, layout::RowMajor, AdvanceRank, 
-                             ThreadMap_, AccessSize, Gather, PermuteLayout> {
-public:
-
-  static_assert(AdvanceRank == 0 || AdvanceRank == 1, 
-    "Specialization for pitch-linear iterator may along advance along the "
-    "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::RowMajor;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Pointer = Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  using UnderlyingIterator = PredicatedTileIterator<
-    layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
-    Element,
-    layout::PitchLinear,
-    (kAdvanceRank == 0 ? 1 : 0),
-    ThreadMap,
-    AccessSize,
-    Gather,
-    PermuteLayout
-  >;
-
-  using AccessType = typename UnderlyingIterator::AccessType;
-
-  /// Fragment object to be loaded or stored
-  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
-
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename UnderlyingIterator::Mask;
-
-  /// Parameters object is precomputed state and is host-constructible
-  class Params {
-  private:
-
-    friend PredicatedTileIterator;
-
-    /// Parameters object
-    typename UnderlyingIterator::Params params_;
-
-  public:
-
-    /// Default constructor
-    Params() = default;
-
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout): params_(layout::PitchLinear(layout.stride(0))) {}
-
-    CUTLASS_HOST_DEVICE
-    Params(typename UnderlyingIterator::Params::Base const &base)
-        : params_(base) {}
-
-  };
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Underlying pitch-linear tile iterator
-  UnderlyingIterator iterator_;
-
-public:
-
-  /// Default constructor
-  PredicatedTileIterator() = default;
-
-  /// Constructs a TileIterator from its precomputed state, threadblock offset, and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator(
-    Params const &params,                         ///< Precomputed parameters object 
-    Pointer pointer,                              ///< Pointer to start of tensor
-    TensorCoord extent,                           ///< Extent of tensor
-    int thread_id,                                ///< ID of each participating thread
-    TensorCoord const &threadblock_offset,        ///< Initial offset of threadblock
-    int const *indices = nullptr                        ///< Gather indices
-  ):
-    iterator_(
-      params.params_,
-      pointer,
-      layout::PitchLinearCoord(extent.column(), extent.row()),
-      thread_id,
-      layout::PitchLinearCoord(threadblock_offset.column(), threadblock_offset.row()),
-      indices
-    ) { }
-
-  /// Construct a PredicatedTileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator(
-    Params const &params,                         ///< Precomputed parameters object
-    Pointer pointer,                              ///< Pointer to start of tensor
-    TensorCoord extent,                           ///< Extent of tensor
-    int thread_id                                 ///< ID of each participating thread
-  ): PredicatedTileIterator(params, pointer, extent, thread_id, make_Coord(0, 0)) { }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the iterator's
-  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
-  /// are lightweight and must only update the internal pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the iterator's
-  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
-  /// are lightweight and must only update the internal pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator operator++(int) {
-    PredicatedTileIterator self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) {
-    iterator_.clear_mask(enable);
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() {
-    iterator_.enable_mask();
-  }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) {
-    iterator_.set_mask(mask);
-  }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) {
-    iterator_.get_mask(mask);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
-    iterator_.load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) {
-    load_with_pointer_offset(frag, 0);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-    iterator_.store_with_pointer_offset(frag, pointer_offset);
-  }
-  
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
-    iterator_.store_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileIterator for affine rank-2 data.
-///
-/// Satisfies: ForwardTileIteratorConcept | 
-///            ReadableContiguousTileIteratorConcept | 
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, int AccessSize>
-class PredicatedTileIterator<Shape_, Element_, layout::AffineRankN<2>, AdvanceRank,
-                             ThreadMap_, AccessSize, false> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::AffineRankN<2>;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Pointer = Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  /// Type used for internal memory accesses
-  using AccessType = AlignedArray<Element, AccessSize, (AccessSize * sizeof_bits<Element>::value / 8)>;
-
-  /// Underlying iterator to compute the addresses
-  using TileAccessIterator =
-      PredicatedTileAccessIterator<Shape, Element, Layout, kAdvanceRank,
-                                   ThreadMap, AccessType>;
-
-  static int const kAccessesPerVector = TileAccessIterator::kAccessesPerVector;
-
-  /// Fragment object to be loaded or stored
-  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount *
-                                               ThreadMap::kElementsPerAccess>;
-
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename TileAccessIterator::Mask;
-
-  /// Parameters object is precomputed state and is host-constructible
-  class Params {
-   public:
-
-    friend PredicatedTileIterator;
-
-   private:
-    /// Parameters object
-    typename TileAccessIterator::Params params_;
-
-   public:
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout) : params_(layout) {}
-
-    /// Default constructor
-    Params() = default;
-  };
-
- private:
-  /// Internal pointer type permits fast address arithmetic
-  using BytePointer = char *;
-
- private:
-  //
-  // Data members
-  //
-
-  /// Data member to the tile access iterator
-  TileAccessIterator address_iterator_;
-
- public:
-
-  /// Default constructor
-  PredicatedTileIterator() = default;
-
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator(
-      /// Precomputed parameters object
-      Params const &params,
-      /// Pointer to start of tensor
-      Pointer pointer,
-      /// Extent of tensor
-      TensorCoord extent,
-      /// ID of each participating thread
-      int thread_id,
-      /// Initial offset of threadblock
-      TensorCoord const &threadblock_offset,
-      int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
-      )
-      : address_iterator_(params.params_, pointer, extent, thread_id,
-                          threadblock_offset) {}
-
-  /// Construct a PredicatedTileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator(
-      Params const &params,  ///< Precomputed parameters object
-      Pointer pointer,       ///< Pointer to start of tensor
-      TensorCoord extent,    ///< Extent of tensor
-      int thread_id          ///< ID of each participating thread
-      )
-      : PredicatedTileIterator(params, pointer, extent, thread_id,
-                               make_Coord(0, 0)) {}
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    address_iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator &operator++() {
-    if (kAdvanceRank)
-      address_iterator_.add_tile_offset(make_Coord(0, 1));
-    else
-      address_iterator_.add_tile_offset(make_Coord(1, 0));
-
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator operator++(int) {
-    PredicatedTileIterator self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) { address_iterator_.clear_mask(enable); }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() { address_iterator_.enable_mask(); }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) { address_iterator_.set_mask(mask); }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) { address_iterator_.get_mask(mask); }
-
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    load_with_byte_offset(frag, pointer_offset * sizeof_bits<Element>::value / 8);
-  }
-
-  CUTLASS_DEVICE
-  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
-
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < kAccessesPerVector; ++v) {
-
-          int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
-          
-          address_iterator_.set_iteration_index(idx);
-          char const *byte_ptr = reinterpret_cast<char const *>(address_iterator_.get()) + byte_offset;
-
-          AccessType const *access_ptr = reinterpret_cast<AccessType const *>(byte_ptr);
-
-          cutlass::arch::global_load<AccessType,
-                                     sizeof(AccessType)
-                                    >(
-              frag_ptr[idx], access_ptr, address_iterator_.valid());
-
-          ++address_iterator_;
-        }
-      }
-    }
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) { load_with_byte_offset(frag, 0); }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-    store_with_byte_offset(frag, pointer_offset * sizeof_bits<Element>::value / 8);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
-    address_iterator_.set_iteration_index(0);
-    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < kAccessesPerVector; ++v) {
-
-          int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
-
-          char *byte_ptr = reinterpret_cast<char *>(address_iterator_.get()) + byte_offset;
-          AccessType *access_ptr = reinterpret_cast<AccessType *>(byte_ptr);
-
-          if (address_iterator_.valid()) {
-            *access_ptr = frag_ptr[idx];
-          }
-          ++address_iterator_;
-        }
-      }
-    }
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) { store_with_byte_offset(frag, 0); }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileIterator for affine rank 2 column-major data.
-///
-/// Satisfies: ForwardTileIteratorConcept | 
-///            ReadableContiguousTileIteratorConcept | 
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <
-  typename Shape_,
-  typename Element_,
-  int AdvanceRank,
-  typename ThreadMap_,
-  int AccessSize
->
-class PredicatedTileIterator<Shape_, Element_, layout::AffineRank2ColumnMajor, AdvanceRank, ThreadMap_, AccessSize, false> {
-public:
-
-  static_assert(AdvanceRank == 0 || AdvanceRank == 1, 
-    "Specialization for pitch-linear iterator may along advance along the "
-    "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::AffineRank2ColumnMajor;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Pointer = Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  // Map to the underlying AffineRankN<2> layout
-  using UnderlyingIterator = PredicatedTileIterator<
-    layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
-    Element,
-    layout::AffineRankN<2>,
-    (kAdvanceRank == 0 ? 0 : 1),
-    ThreadMap,
-    AccessSize
-  >;
-
-  using AccessType = typename UnderlyingIterator::AccessType;
-
-  /// Fragment object to be loaded or stored
-  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
-
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename UnderlyingIterator::Mask;
-
-  /// Parameters object is precomputed state and is host-constructible
-  class Params {
-  private:
-
-    friend PredicatedTileIterator;
-
-    /// Parameters object
-    typename UnderlyingIterator::Params params_;
-
-  public:
-
-    /// Default constructor
-    Params() = default;
-
-    /// Construct the Params object given an AffineRankN<2> tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout): params_(layout::AffineRankN<2>(layout.stride(0), layout.stride(1)))
-    {}
-  };
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Underlying AffineRankN<2> tile iterator
-  UnderlyingIterator iterator_;
-
-public:
-
-  /// Default constructor
-  PredicatedTileIterator() = default;
-
-  /// Constructs a TileIterator from its precomputed state, threadblock offset, and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator(
-    Params const &params,                         ///< Precomputed parameters object 
-    Pointer pointer,                              ///< Pointer to start of tensor
-    TensorCoord extent,                           ///< Extent of tensor
-    int thread_id,                                ///< ID of each participating thread
-    TensorCoord const &threadblock_offset,         ///< Initial offset of threadblock
-    int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
-  ):
-    iterator_(
-      params.params_,
-      pointer,
-      layout::PitchLinearCoord(extent.row(), extent.column()),
-      thread_id,
-      layout::PitchLinearCoord(threadblock_offset.row(), threadblock_offset.column())
-    ) { }
-
-  /// Construct a PredicatedTileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator(
-    Params const &params,                         ///< Precomputed parameters object
-    Pointer pointer,                              ///< Pointer to start of tensor
-    TensorCoord extent,                           ///< Extent of tensor
-    int thread_id                                 ///< ID of each participating thread
-  ): PredicatedTileIterator(params, pointer, extent, thread_id, make_Coord(0, 0)) { }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the iterator's
-  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
-  /// are lightweight and must only update the internal pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the iterator's
-  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
-  /// are lightweight and must only update the internal pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator operator++(int) {
-    PredicatedTileIterator self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) {
-    iterator_.clear_mask(enable);
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() {
-    iterator_.enable_mask();
-  }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) {
-    iterator_.set_mask(mask);
-  }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) {
-    iterator_.get_mask(mask);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
-    iterator_.load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) {
-    load_with_pointer_offset(frag, 0);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-    iterator_.store_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
-    iterator_.store_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileIterator for affine rank 2 row-major data.
-///
-/// Satisfies: ForwardTileIteratorConcept | 
-///            ReadableContiguousTileIteratorConcept | 
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <
-  typename Shape_,
-  typename Element_,
-  int AdvanceRank,
-  typename ThreadMap_,
-  int AccessSize
->
-class PredicatedTileIterator<Shape_, Element_, layout::AffineRank2RowMajor, AdvanceRank, ThreadMap_, AccessSize, false> {
-public:
-
-  static_assert(AdvanceRank == 0 || AdvanceRank == 1, 
-    "Specialization for pitch-linear iterator may along advance along the "
-    "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::AffineRank2RowMajor;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Pointer = Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  // Map to the underlying AffineRankN<2> layout
-  using UnderlyingIterator = PredicatedTileIterator<
-    layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
-    Element,
-    layout::AffineRankN<2>,
-    (kAdvanceRank == 0 ? 1 : 0),
-    ThreadMap,
-    AccessSize
-  >;
-
-  using AccessType = typename UnderlyingIterator::AccessType;
-
-  /// Fragment object to be loaded or stored
-  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
-
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename UnderlyingIterator::Mask;
-
-  /// Parameters object is precomputed state and is host-constructible
-  class Params {
-  private:
-
-    friend PredicatedTileIterator;
-
-    /// Parameters object
-    typename UnderlyingIterator::Params params_;
-
-  public:
-
-    /// Default constructor
-    Params() = default;
-
-    /// Construct the Params object given an AffineRankN<2> tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout): params_(layout::AffineRankN<2>(layout.stride(1), layout.stride(0))) {}
-  };
-
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Underlying AffineRankN<2> tile iterator
-  UnderlyingIterator iterator_;
-
-public:
-
-  /// Default constructor
-  PredicatedTileIterator() = default;
-
-  /// Constructs a TileIterator from its precomputed state, threadblock offset, and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator(
-    Params const &params,                         ///< Precomputed parameters object 
-    Pointer pointer,                              ///< Pointer to start of tensor
-    TensorCoord extent,                           ///< Extent of tensor
-    int thread_id,                                ///< ID of each participating thread
-    TensorCoord const &threadblock_offset,         ///< Initial offset of threadblock
-    int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
-  ):
-    iterator_(
-      params.params_,
-      pointer,
-      layout::PitchLinearCoord(extent.column(), extent.row()),
-      thread_id,
-      layout::PitchLinearCoord(threadblock_offset.column(), threadblock_offset.row())
-    ) { }
-
-  /// Construct a PredicatedTileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator(
-    Params const &params,                         ///< Precomputed parameters object
-    Pointer pointer,                              ///< Pointer to start of tensor
-    TensorCoord extent,                           ///< Extent of tensor
-    int thread_id                                 ///< ID of each participating thread
-  ): PredicatedTileIterator(params, pointer, extent, thread_id, make_Coord(0, 0)) { }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the iterator's
-  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
-  /// are lightweight and must only update the internal pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the iterator's
-  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
-  /// are lightweight and must only update the internal pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator operator++(int) {
-    PredicatedTileIterator self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) {
-    iterator_.clear_mask(enable);
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() {
-    iterator_.enable_mask();
-  }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) {
-    iterator_.set_mask(mask);
-  }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) {
-    iterator_.get_mask(mask);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
-    iterator_.load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) {
-    load_with_pointer_offset(frag, 0);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-    iterator_.store_with_pointer_offset(frag, pointer_offset);
-  }
-  
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
-    iterator_.store_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileIterator for interleaved data.  It is mapped
-/// to the congruous layout.
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, int AccessSize, int InterleavedK>
-class PredicatedTileIterator<Shape_, Element_,
-                             layout::ColumnMajorInterleaved<InterleavedK>,
-                             AdvanceRank, ThreadMap_, AccessSize, false> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  static int const kInterleavedK = InterleavedK;
-  using Layout = layout::ColumnMajorInterleaved<kInterleavedK>;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Pointer = Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  using UnderlyingIterator = PredicatedTileIterator<
-      layout::PitchLinearShape<Shape::kRow * kInterleavedK,
-                               Shape::kColumn / kInterleavedK>,
-      Element, layout::PitchLinear, (kAdvanceRank == 0 ? 0 : 1), ThreadMap, AccessSize>;
-
-
-  using AccessType = typename UnderlyingIterator::AccessType;
-
-  /// Fragment object to be loaded or stored
-  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount *
-                                               ThreadMap::kElementsPerAccess>;
-
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename UnderlyingIterator::Mask;
-
-  /// Parameters object is precomputed state and is host-constructible
-  class Params {
-   private:
-    friend PredicatedTileIterator;
-
-    /// Parameters object
-    typename UnderlyingIterator::Params params_;
-
-   public:
-
-    /// Default constructor
-    Params() = default;
-
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout)
-        : params_(layout::PitchLinear(layout.stride(0))) {}
-
-    CUTLASS_HOST_DEVICE
-    Params(typename UnderlyingIterator::Params::Base const &base)
-        : params_(base) {}
-
-  };
-
- private:
-  //
-  // Data members
-  //
-
-  /// Underlying pitch-linear tile iterator
-  UnderlyingIterator iterator_;
-
- public:
-
-  /// Default constructor
-  PredicatedTileIterator() = default;
-
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator(
-      /// Precomputed parameters object
-      Params const &params,
-      /// Pointer to start of tensor
-      Pointer pointer,
-      /// Extent of tensor
-      TensorCoord extent,
-      /// ID of each participating thread
-      int thread_id,
-      /// Initial offset of threadblock
-      TensorCoord const &threadblock_offset,
-      int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
-      )
-      : iterator_(params.params_, pointer,
-                  layout::PitchLinearCoord(extent.row() * kInterleavedK,
-                                           extent.column() / kInterleavedK),
-                  thread_id,
-                  layout::PitchLinearCoord(
-                      threadblock_offset.row() * kInterleavedK,
-                      threadblock_offset.column() / kInterleavedK)) {}
-
-  /// Construct a PredicatedTileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator(
-      Params const &params,  ///< Precomputed parameters object
-      Pointer pointer,       ///< Pointer to start of tensor
-      TensorCoord extent,    ///< Extent of tensor
-      int thread_id          ///< ID of each participating thread
-      )
-      : PredicatedTileIterator(params, pointer, extent, thread_id,
-                               make_Coord(0, 0)) {}
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator operator++(int) {
-    PredicatedTileIterator self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() { iterator_.enable_mask(); }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-    iterator_.store_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileIterator for interleaved-32 data.  It is
-/// mapped to the congruous layout.
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, int AccessSize, int InterleavedK>
-class PredicatedTileIterator<Shape_, Element_,
-                             layout::RowMajorInterleaved<InterleavedK>,
-                             AdvanceRank, ThreadMap_, AccessSize, false> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  static int const kInterleavedK = InterleavedK;
-  using Layout = layout::RowMajorInterleaved<kInterleavedK>;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Pointer = Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  using UnderlyingIterator = PredicatedTileIterator<
-      layout::PitchLinearShape<Shape::kColumn * kInterleavedK,
-                               Shape::kRow / kInterleavedK>,
-      Element, layout::PitchLinear, (kAdvanceRank == 0 ? 1 : 0), ThreadMap, AccessSize>;
-
-
-  using AccessType = typename UnderlyingIterator::AccessType;
-
-  /// Fragment object to be loaded or stored
-  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount *
-                                               ThreadMap::kElementsPerAccess>;
-
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename UnderlyingIterator::Mask;
-
-  /// Parameters object is precomputed state and is host-constructible
-  class Params {
-   private:
-    friend PredicatedTileIterator;
-
-    /// Parameters object
-    typename UnderlyingIterator::Params params_;
-
-   public:
-
-    /// Default constructor
-    Params() = default;
-
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout)
-        : params_(layout::PitchLinear(layout.stride(0))) {}
-
-    CUTLASS_HOST_DEVICE
-    Params(typename UnderlyingIterator::Params::Base const &base)
-        : params_(base) {}
-  };
-
- private:
-  //
-  // Data members
-  //
-
-  /// Underlying pitch-linear tile iterator
-  UnderlyingIterator iterator_;
-
- public:
-
-  /// Default constructor
-  PredicatedTileIterator() = default;
-
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator(
-      /// Precomputed parameters object
-      Params const &params,
-      /// Pointer to start of tensor
-      Pointer pointer,
-      /// Extent of tensor
-      TensorCoord extent,
-      /// ID of each participating thread
-      int thread_id,
-      /// Initial offset of threadblock
-      TensorCoord const &threadblock_offset,
-      int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
-      )
-      : iterator_(params.params_, pointer,
-                  layout::PitchLinearCoord(extent.column() * kInterleavedK,
-                                           extent.row() / kInterleavedK),
-                  thread_id,
-                  layout::PitchLinearCoord(
-                      threadblock_offset.column() * kInterleavedK,
-                      threadblock_offset.row() / kInterleavedK)) {}
-
-  /// Construct a PredicatedTileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator(
-      Params const &params,  ///< Precomputed parameters object
-      Pointer pointer,       ///< Pointer to start of tensor
-      TensorCoord extent,    ///< Extent of tensor
-      int thread_id          ///< ID of each participating thread
-      )
-      : PredicatedTileIterator(params, pointer, extent, thread_id,
-                               make_Coord(0, 0)) {}
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator operator++(int) {
-    PredicatedTileIterator self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() { iterator_.enable_mask(); }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-    iterator_.store_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace transform
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h
deleted file mode 100755
index 422ac45c3..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h
+++ /dev/null
@@ -1,787 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing loading of tiles from pitch-linear rank=2 tensors. 
-
-    This iterator uses masks to guard out-of-bounds accesses and visits the last "residue" tile
-    first, with the objective of minimizing predicate mask updates during steady-state operation.
-
-    A precomputed "Params" object minimizes the amount of state that must be stored in registers,
-    and integer addition is used to advance the pointer through memory.
-*/
-
-#pragma once
-
-#include "cutlass/transform/threadblock/predicated_tile_access_iterator_2dthreadtile.h"
-#include "cutlass/transform/thread/transpose.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace transform {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// PredicatedTileIterator2dThreadTile
-///
-/// Satisfies: ForwardTileIteratorConcept | 
-///            ReadableContiguousTileIteratorConcept | 
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-/// Regular tile iterator using a precomputed control structure to minimize register liveness
-/// and integer arithmetic.
-///
-/// Layout is assumed to be invariant at the time the precomputed "Params" object is constructed.
-///
-/// Base pointer and tensor extents may be specified at the time the iterator is constructed.
-/// Subsequently, they are assumed to be immutable.
-///
-/// Adding a logical coordinate offset may be performed at the time the iterator is constructed.
-/// Subsequent additions to logical coordinate offset may be performed but are relatively expensive.
-///
-/// Vistitation order is intended to first visit a "residual" tile that may be partially full in
-/// both the advance dimension and the steady-state dimension. This is assumed to be the last
-/// tile in the iteration sequence. Advancing an iterator that has just been constructed moves to
-/// the first tile that is full in the advance dimension and recomputes predicates. Subsequent
-/// accesses may be performed without updating internal predicates and are efficient in terms of
-/// live register state and pointer arithmetic instructions.
-///
-/// To be efficient, this assumes the iteraor will be dereferenced and advanced at least once
-/// outside any looping structure to minimize integer arithmetic. 
-///
-/// Acceses out of bounds are safe so long as `clear_mask()` is called prior to dereferencing
-/// the iterator.
-///
-///
-/// Example:
-///
-/// An efficient pipeline structure may be constructed as follows:
-///
-// template <typename Iterator>
-// __global__ void kernel(
-//   typename Iterator::Params params, 
-//   typename Iterator::Element *ptr,
-//   TensorCoord extent) {
-//
-//   typename Iterator::Fragment fragment;
-//
-//   TensorCoord threadblock_offset(0, 0);
-//
-//   Iterator iter(params, ptr, extent, threadIdx.x, threadblock_offsets);
-//
-//
-//   fragment = *iter;        // load "residue" tile first
-//   ++iter;                  // advance to first "steady state" tile and update internal masks
-//
-//
-//   #pragma unroll
-//   for (int i = Remaining - 1; i >= 0; --i) {
-//
-//     f(fragment);
-//
-//     if (!i) {
-//       iter.clear_mask();   // light-weight operation to clear masks - subsequent loads become NO-OPs.
-//     }
-//  
-//     fragment = *iter;      // load tile during "steady state" phase
-//     ++iter;                // advance to next tile - lightweight due to steady-state masks
-//   }
-// }
-//
-// void host(TensorView<Element, 2, layout::PitchLinear> view) {
-//
-//   using Iterator = transform::threadblock::PredicatedTileIterator2dThreadTile;
-//
-//   typename Iterator::Params params(view.layout());
-//
-//   kernel<Iterator>(params, view.data());
-// }
-///
-///
-template <
-  typename Shape,
-  typename Element,
-  typename Layout,
-  int AdvanceRank,
-  typename ThreadMap,
-  bool Transpose = false
->
-class PredicatedTileIterator2dThreadTile;
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileIterator2dThreadTile for pitch-linear data.
-///
-/// Satisfies: ForwardTileIteratorConcept | 
-///            ReadableContiguousTileIteratorConcept | 
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank, typename ThreadMap_, bool Transpose_>
-class PredicatedTileIterator2dThreadTile<Shape_, Element_, layout::PitchLinear, AdvanceRank, ThreadMap_, Transpose_> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::PitchLinear;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Pointer = Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  /// Type used for internal memory accesses
-  /// extra set of parenthesis is needed for VS compiler
-  struct alignas((ThreadMap::kElementsPerAccess * sizeof_bits<Element>::value /
-                  8)) AccessType {
-
-    Array<Element, ThreadMap::kElementsPerAccess> storage;
-
-    static int const kElements = ThreadMap::kElementsPerAccess;
-  };
-
-  /// Optinally this fragment can be 4x4 transposed
-  using Transform = thread::Transpose< ThreadMap::Iterations::kCount * ThreadMap::ThreadAccessShape::kCount , layout::PitchLinearShape<4,4>, Element>;
-  static bool const transpose = Transpose_;
-
-  /// Underlying iterator to compute the addresses
-  using TileAccessIterator =
-      PredicatedTileAccessIterator2dThreadTile<Shape, Element, Layout, kAdvanceRank,
-                                   ThreadMap, AccessType>;
-
-  /// Fragment object to be loaded or stored
-  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount *
-                                               ThreadMap::ThreadAccessShape::kCount>;
-
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename TileAccessIterator::Mask;
-
-  /// Parameters object is precomputed state and is host-constructible
-  class Params {
-   public:
-    using Base = typename TileAccessIterator::Params::Base;
-
-    friend PredicatedTileIterator2dThreadTile;
-
-   private:
-    /// Parameters object
-    typename TileAccessIterator::Params params_;
-
-   public:
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout) : params_(layout) { }
-    
-    CUTLASS_HOST_DEVICE
-    Params() { }
-
-    CUTLASS_HOST_DEVICE
-    Params(Base const &base) 
-        : params_(base) {}
-  };
-
- private:
-  /// Internal pointer type permits fast address arithmetic
-  using BytePointer = char *;
-
- private:
-  //
-  // Data members
-  //
-
-  /// Data member to the tile access iterator
-  TileAccessIterator address_iterator_;
-
- public:
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator2dThreadTile(
-      /// Precomputed parameters object
-      Params const &params,
-      /// Pointer to start of tensor
-      Pointer pointer,
-      /// Extent of tensor
-      TensorCoord extent,
-      /// ID of each participating thread
-      int thread_id,
-      /// Initial offset of threadblock
-      TensorCoord const &threadblock_offset,
-      int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
-      )
-      : address_iterator_(params.params_, pointer, extent, thread_id,
-                          threadblock_offset) {}
-
-  /// Construct a PredicatedTileIterator2dThreadTile with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator2dThreadTile(
-      Params const &params,  ///< Precomputed parameters object
-      Pointer pointer,       ///< Pointer to start of tensor
-      TensorCoord extent,    ///< Extent of tensor
-      int thread_id          ///< ID of each participating thread
-      )
-      : PredicatedTileIterator2dThreadTile(params, pointer, extent, thread_id,
-                               make_Coord(0, 0)) {}
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    address_iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator2dThreadTile &operator++() {
-    if (kAdvanceRank)
-      address_iterator_.add_tile_offset({0, 1});
-    else
-      address_iterator_.add_tile_offset({1, 0});
-
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator2dThreadTile operator++(int) {
-    PredicatedTileIterator2dThreadTile self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) { address_iterator_.clear_mask(enable); }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() { address_iterator_.enable_mask(); }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) { address_iterator_.set_mask(mask); }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) { address_iterator_.get_mask(mask); }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-        CUTLASS_PRAGMA_UNROLL
-        for (int ts = 0; ts < ThreadMap::ThreadAccessShape::kStrided; ts++){
-
-          int access_idx = ts + c * ThreadMap::ThreadAccessShape::kStrided  + \
-              s * ThreadMap::Iterations::kContiguous * ThreadMap::ThreadAccessShape::kStrided;
-
-          address_iterator_.set_iteration_index(access_idx);
-          if (address_iterator_.valid()) {
-
-            frag_ptr[access_idx] =
-                *(address_iterator_.get() + pointer_offset);
-          }
-
-          ++address_iterator_;
-        }
-      }
-    }
-
-    if (transpose) {
-      Transform t;
-      t.transform(frag, frag);
-    }
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-    
-    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-        CUTLASS_PRAGMA_UNROLL
-        for (int ts = 0; ts < ThreadMap::ThreadAccessShape::kStrided; ts++){
-
-          int access_idx = ts + c * ThreadMap::ThreadAccessShape::kStrided  + \
-              s * ThreadMap::Iterations::kContiguous * ThreadMap::ThreadAccessShape::kStrided;
-
-          address_iterator_.set_iteration_index(access_idx);
-          if (address_iterator_.valid()) {
-            *(address_iterator_.get() + pointer_offset) = frag_ptr[access_idx];
-          }
-          ++address_iterator_;
-        }
-      }
-    }
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileIterator2dThreadTile for pitch-linear data.
-///
-/// Satisfies: ForwardTileIteratorConcept | 
-///            ReadableContiguousTileIteratorConcept | 
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <
-  typename Shape_,
-  typename Element_,
-  int AdvanceRank,
-  typename ThreadMap_,
-  bool Transpose_
->
-class PredicatedTileIterator2dThreadTile<Shape_, Element_, layout::ColumnMajor, AdvanceRank, ThreadMap_, Transpose_> {
-public:
-
-  static_assert(AdvanceRank == 0 || AdvanceRank == 1, 
-    "Specialization for pitch-linear iterator may along advance along the "
-    "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::ColumnMajor;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-  static bool const Transpose = Transpose_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Pointer = Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  using UnderlyingIterator = PredicatedTileIterator2dThreadTile<
-    layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
-    Element,
-    layout::PitchLinear,
-    (kAdvanceRank == 0 ? 0 : 1),
-    ThreadMap,
-    Transpose
-  >;
-
-  using AccessType = typename UnderlyingIterator::AccessType;
-
-  /// Fragment object to be loaded or stored
-  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount * ThreadMap::ThreadAccessShape::kCount>;
-
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename UnderlyingIterator::Mask;
-
-  /// Parameters object is precomputed state and is host-constructible
-  class Params {
-  private:
-
-    friend PredicatedTileIterator2dThreadTile;
-
-    /// Parameters object
-    typename UnderlyingIterator::Params params_;
-
-  public:
-    
-    CUTLASS_HOST_DEVICE
-    Params() { }
-
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout): params_(layout::PitchLinear(layout.stride(0))) {}
-
-    CUTLASS_HOST_DEVICE
-    Params(typename UnderlyingIterator::Params::Base const &base) 
-        : params_(base) {}
-  };
-
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Underlying pitch-linear tile iterator
-  UnderlyingIterator iterator_;
-
-public:
-
-  /// Constructs a TileIterator from its precomputed state, threadblock offset, and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator2dThreadTile(
-    Params const &params,                         ///< Precomputed parameters object 
-    Pointer pointer,                              ///< Pointer to start of tensor
-    TensorCoord extent,                           ///< Extent of tensor
-    int thread_id,                                ///< ID of each participating thread
-    TensorCoord const &threadblock_offset,         ///< Initial offset of threadblock
-    int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
-  ):
-    iterator_(
-      params.params_,
-      pointer,
-      layout::PitchLinearCoord(extent.row(), extent.column()),
-      thread_id,
-      layout::PitchLinearCoord(threadblock_offset.row(), threadblock_offset.column())
-    ) { }
-
-  /// Construct a PredicatedTileIterator2dThreadTile with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator2dThreadTile(
-    Params const &params,                         ///< Precomputed parameters object
-    Pointer pointer,                              ///< Pointer to start of tensor
-    TensorCoord extent,                           ///< Extent of tensor
-    int thread_id                                 ///< ID of each participating thread
-  ): PredicatedTileIterator2dThreadTile(params, pointer, extent, thread_id, make_Coord(0, 0)) { }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the iterator's
-  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
-  /// are lightweight and must only update the internal pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator2dThreadTile &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the iterator's
-  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
-  /// are lightweight and must only update the internal pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator2dThreadTile operator++(int) {
-    PredicatedTileIterator2dThreadTile self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) {
-    iterator_.clear_mask(enable);
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() {
-    iterator_.enable_mask();
-  }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) {
-    iterator_.set_mask(mask);
-  }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) {
-    iterator_.get_mask(mask);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) {
-    load_with_pointer_offset(frag, 0);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-    iterator_.store_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileIterator2dThreadTile for pitch-linear data.
-///
-/// Satisfies: ForwardTileIteratorConcept | 
-///            ReadableContiguousTileIteratorConcept | 
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <
-  typename Shape_,
-  typename Element_,
-  int AdvanceRank,
-  typename ThreadMap_,
-  bool Transpose_
->
-class PredicatedTileIterator2dThreadTile<Shape_, Element_, layout::RowMajor, AdvanceRank, ThreadMap_, Transpose_> {
-public:
-
-  static_assert(AdvanceRank == 0 || AdvanceRank == 1, 
-    "Specialization for pitch-linear iterator may along advance along the "
-    "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::RowMajor;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-  static bool const Transpose = Transpose_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Pointer = Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  using UnderlyingIterator = PredicatedTileIterator2dThreadTile<
-    layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
-    Element,
-    layout::PitchLinear,
-    (kAdvanceRank == 0 ? 1 : 0),
-    ThreadMap,
-    Transpose
-  >;
-
-  using AccessType = typename UnderlyingIterator::AccessType;
-
-  /// Fragment object to be loaded or stored
-  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount * ThreadMap::ThreadAccessShape::kCount>;
-
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename UnderlyingIterator::Mask;
-
-  /// Parameters object is precomputed state and is host-constructible
-  class Params {
-  private:
-
-    friend PredicatedTileIterator2dThreadTile;
-
-    /// Parameters object
-    typename UnderlyingIterator::Params params_;
-
-  public:
-    
-    CUTLASS_HOST_DEVICE
-    Params() { } 
-
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout): params_(layout::PitchLinear(layout.stride(0))) { }
-
-    CUTLASS_HOST_DEVICE
-    Params(typename UnderlyingIterator::Params::Base const &base) 
-        : params_(base) {}
-  };
-
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Underlying pitch-linear tile iterator
-  UnderlyingIterator iterator_;
-
-public:
-
-  /// Constructs a TileIterator from its precomputed state, threadblock offset, and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator2dThreadTile(
-    Params const &params,                         ///< Precomputed parameters object 
-    Pointer pointer,                              ///< Pointer to start of tensor
-    TensorCoord extent,                           ///< Extent of tensor
-    int thread_id,                                ///< ID of each participating thread
-    TensorCoord const &threadblock_offset,         ///< Initial offset of threadblock
-    int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
-  ):
-    iterator_(
-      params.params_,
-      pointer,
-      layout::PitchLinearCoord(extent.column(), extent.row()),
-      thread_id,
-      layout::PitchLinearCoord(threadblock_offset.column(), threadblock_offset.row())
-    ) { }
-
-  /// Construct a PredicatedTileIterator2dThreadTile with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator2dThreadTile(
-    Params const &params,                         ///< Precomputed parameters object
-    Pointer pointer,                              ///< Pointer to start of tensor
-    TensorCoord extent,                           ///< Extent of tensor
-    int thread_id                                 ///< ID of each participating thread
-  ): PredicatedTileIterator2dThreadTile(params, pointer, extent, thread_id, make_Coord(0, 0)) { }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the iterator's
-  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
-  /// are lightweight and must only update the internal pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator2dThreadTile &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the iterator's
-  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
-  /// are lightweight and must only update the internal pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIterator2dThreadTile operator++(int) {
-    PredicatedTileIterator2dThreadTile self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) {
-    iterator_.clear_mask(enable);
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() {
-    iterator_.enable_mask();
-  }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) {
-    iterator_.set_mask(mask);
-  }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) {
-    iterator_.get_mask(mask);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) {
-    load_with_pointer_offset(frag, 0);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-    iterator_.store_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace transform
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_iterator_triangular_matrix.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_iterator_triangular_matrix.h
deleted file mode 100755
index 8fea9ae02..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_iterator_triangular_matrix.h
+++ /dev/null
@@ -1,818 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing loading of tiles from pitch-linear rank=2 tensors. 
-
-    This iterator uses masks to guard out-of-bounds accesses and visits the last "residue" tile
-    first, with the objective of minimizing predicate mask updates during steady-state operation.
-
-    A precomputed "Params" object minimizes the amount of state that must be stored in registers,
-    and integer addition is used to advance the pointer through memory.
-*/
-
-#pragma once
-
-#include "cutlass/arch/memory.h"
-#include "cutlass/transform/threadblock/predicated_tile_access_iterator_triangular_matrix.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace transform {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// PredicatedTileIteratorTriangularMatrix
-///
-/// Satisfies: ForwardTileIteratorConcept | 
-///            ReadableContiguousTileIteratorConcept | 
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-/// Regular tile iterator using a precomputed control structure to minimize register liveness
-/// and integer arithmetic.
-///
-/// Layout is assumed to be invariant at the time the precomputed "Params" object is constructed.
-///
-/// Base pointer and tensor extents may be specified at the time the iterator is constructed.
-/// Subsequently, they are assumed to be immutable.
-///
-/// Adding a logical coordinate offset may be performed at the time the iterator is constructed.
-/// Subsequent additions to logical coordinate offset may be performed but are relatively expensive.
-///
-/// Vistitation order is intended to first visit a "residual" tile that may be partially full in
-/// both the advance dimension and the steady-state dimension. This is assumed to be the last
-/// tile in the iteration sequence. Advancing an iterator that has just been constructed moves to
-/// the first tile that is full in the advance dimension and recomputes predicates. Subsequent
-/// accesses may be performed without updating internal predicates and are efficient in terms of
-/// live register state and pointer arithmetic instructions.
-///
-/// To be efficient, this assumes the iteraor will be dereferenced and advanced at least once
-/// outside any looping structure to minimize integer arithmetic. 
-///
-/// Acceses out of bounds are safe so long as `clear_mask()` is called prior to dereferencing
-/// the iterator.
-///
-///
-/// Example:
-///
-/// An efficient pipeline structure may be constructed as follows:
-///
-// template <typename Iterator>
-// __global__ void kernel(
-//   typename Iterator::Params params, 
-//   typename Iterator::Element *ptr,
-//   TensorCoord extent) {
-//
-//   typename Iterator::Fragment fragment;
-//
-//   TensorCoord threadblock_offset(0, 0);
-//
-//   Iterator iter(params, ptr, extent, threadIdx.x, threadblock_offsets);
-//
-//
-//   fragment = *iter;        // load "residue" tile first
-//   ++iter;                  // advance to first "steady state" tile and update internal masks
-//
-//
-//   #pragma unroll
-//   for (int i = Remaining - 1; i >= 0; --i) {
-//
-//     f(fragment);
-//
-//     if (!i) {
-//       iter.clear_mask();   // light-weight operation to clear masks - subsequent loads become NO-OPs.
-//     }
-//  
-//     fragment = *iter;      // load tile during "steady state" phase
-//     ++iter;                // advance to next tile - lightweight due to steady-state masks
-//   }
-// }
-//
-// void host(TensorView<Element, 2, layout::PitchLinear> view) {
-//
-//   using Iterator = transform::threadblock::PredicatedTileIteratorTriangularMatrix;
-//
-//   typename Iterator::Params params(view.layout());
-//
-//   kernel<Iterator>(params, view.data());
-// }
-///
-///
-template <
-  typename Shape,
-  typename Element,
-  typename Layout,
-  int AdvanceRank,
-  typename ThreadMap,
-  SideMode kSideMode, 
-  FillMode kFillMode, 
-  DiagType kDiagType,
-  int AccessSize = ThreadMap::kElementsPerAccess
->
-class PredicatedTileIteratorTriangularMatrix;
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileIteratorTriangularMatrix for pitch-linear data.
-///
-/// Satisfies: ForwardTileIteratorConcept | 
-///            ReadableContiguousTileIteratorConcept | 
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank, typename ThreadMap_, 
-          SideMode kSideMode, FillMode kFillMode, DiagType kDiagType, 
-          int AccessSize>
-class PredicatedTileIteratorTriangularMatrix<Shape_, Element_, layout::PitchLinear, AdvanceRank, ThreadMap_, 
-                                             kSideMode, kFillMode, kDiagType,
-                                             AccessSize> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::PitchLinear;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Pointer = Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  /// Type used for internal memory accesses
-  using AccessType = AlignedArray<Element, AccessSize, (AccessSize * sizeof_bits<Element>::value / 8)>;
-
-  /// Underlying iterator to compute the addresses
-  using TileAccessIterator =
-      PredicatedTileAccessIteratorTriangularMatrix<Shape, Element, Layout, kAdvanceRank,
-                                   ThreadMap, kSideMode, kFillMode, kDiagType, AccessType>;
-
-  static int const kAccessesPerVector = TileAccessIterator::kAccessesPerVector;
-
-  /// Fragment object to be loaded or stored
-  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount *
-                                               ThreadMap::kElementsPerAccess>;
-
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename TileAccessIterator::Mask;
-
-  /// Parameters object is precomputed state and is host-constructible
-  class Params {
-   public:
-    friend PredicatedTileIteratorTriangularMatrix;
-
-   private:
-    /// Parameters object
-    typename TileAccessIterator::Params params_;
-
-   public:
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout) : params_(layout) { }
-    
-    CUTLASS_HOST_DEVICE
-    Params() { }
-  };
-
- private:
-  /// Internal pointer type permits fast address arithmetic
-  using BytePointer = char *;
-
- private:
-  //
-  // Data members
-  //
-
-  /// Data member to the tile access iterator
-  TileAccessIterator address_iterator_;
-
- public:
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorTriangularMatrix(
-      /// Precomputed parameters object
-      Params const &params,
-      /// Pointer to start of tensor
-      Pointer pointer,
-      /// Extent of tensor
-      TensorCoord extent,
-      /// ID of each participating thread
-      int thread_id,
-      /// Initial offset of threadblock
-      TensorCoord const &threadblock_offset)
-      : address_iterator_(params.params_, pointer, extent, thread_id,
-                          threadblock_offset) {}
-
-  /// Construct a PredicatedTileIteratorTriangularMatrix with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorTriangularMatrix(
-      Params const &params,  ///< Precomputed parameters object
-      Pointer pointer,       ///< Pointer to start of tensor
-      TensorCoord extent,    ///< Extent of tensor
-      int thread_id          ///< ID of each participating thread
-      )
-      : PredicatedTileIteratorTriangularMatrix(params, pointer, extent, thread_id,
-                               make_Coord(0, 0)) {}
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    address_iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorTriangularMatrix &operator++() {
-    if (kAdvanceRank)
-      address_iterator_.add_tile_offset({0, 1});
-    else
-      address_iterator_.add_tile_offset({1, 0});
-
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorTriangularMatrix operator++(int) {
-    PredicatedTileIteratorTriangularMatrix self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) { address_iterator_.clear_mask(enable); }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() { address_iterator_.enable_mask(); }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) { address_iterator_.set_mask(mask); }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) { address_iterator_.get_mask(mask); }
-
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    load_with_byte_offset(frag, pointer_offset * sizeof_bits<Element>::value / 8);
-  }
-
-  CUTLASS_DEVICE
-  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
-
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < kAccessesPerVector; ++v) {
-
-          int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
-          
-          address_iterator_.set_iteration_index(idx);
-          char const *byte_ptr = reinterpret_cast<char const *>(address_iterator_.get()) + byte_offset;
-
-          AccessType const *access_ptr = reinterpret_cast<AccessType const *>(byte_ptr);
-
-          cutlass::arch::global_load<AccessType,
-                                     sizeof(AccessType)
-                                    >(
-              frag_ptr[idx], access_ptr, address_iterator_.valid());
-
-          ++address_iterator_;
-        }
-      }
-    }
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) { load_with_byte_offset(frag, 0); }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-    store_with_byte_offset(frag, pointer_offset * sizeof_bits<Element>::value / 8);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
-    address_iterator_.set_iteration_index(0);
-    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < kAccessesPerVector; ++v) {
-
-          int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
-
-          char *byte_ptr = reinterpret_cast<char *>(address_iterator_.get()) + byte_offset;
-          AccessType *access_ptr = reinterpret_cast<AccessType *>(byte_ptr);
-
-          if (address_iterator_.valid()) {
-            *access_ptr = frag_ptr[idx];
-          }
-          ++address_iterator_;
-        }
-      }
-    }
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) { store_with_byte_offset(frag, 0); }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileIteratorTriangularMatrix for column-major data.
-///
-/// Satisfies: ForwardTileIteratorConcept | 
-///            ReadableContiguousTileIteratorConcept | 
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <
-  typename Shape_,
-  typename Element_,
-  int AdvanceRank,
-  typename ThreadMap_,
-  SideMode kSideMode, 
-  FillMode kFillMode, 
-  DiagType kDiagType,
-  int AccessSize
->
-class PredicatedTileIteratorTriangularMatrix<Shape_, Element_, layout::ColumnMajor, AdvanceRank, ThreadMap_, 
-                                              kSideMode, kFillMode, kDiagType,
-                                              AccessSize> {
-public:
-
-  static_assert(AdvanceRank == 0 || AdvanceRank == 1, 
-    "Specialization for pitch-linear iterator may along advance along the "
-    "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::ColumnMajor;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Pointer = Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  using UnderlyingIterator = PredicatedTileIteratorTriangularMatrix<
-    layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
-    Element,
-    layout::PitchLinear,
-    (kAdvanceRank == 0 ? 0 : 1),
-    ThreadMap,
-    kSideMode, 
-    kFillMode, 
-    kDiagType,
-    AccessSize
-  >;
-
-  using AccessType = typename UnderlyingIterator::AccessType;
-
-  /// Fragment object to be loaded or stored
-  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
-
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename UnderlyingIterator::Mask;
-
-  /// Parameters object is precomputed state and is host-constructible
-  class Params {
-  private:
-
-    friend PredicatedTileIteratorTriangularMatrix;
-
-    /// Parameters object
-    typename UnderlyingIterator::Params params_;
-
-  public:
-    
-    CUTLASS_HOST_DEVICE
-    Params() { }
-
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout): params_(layout::PitchLinear(layout.stride(0))) {
-
-    }
-  };
-
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Underlying pitch-linear tile iterator
-  UnderlyingIterator iterator_;
-
-public:
-
-  /// Constructs a TileIterator from its precomputed state, threadblock offset, and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorTriangularMatrix(
-    Params const &params,                         ///< Precomputed parameters object 
-    Pointer pointer,                              ///< Pointer to start of tensor
-    TensorCoord extent,                           ///< Extent of tensor
-    int thread_id,                                ///< ID of each participating thread
-    TensorCoord const &threadblock_offset         ///< Initial offset of threadblock
-  ):
-    iterator_(
-      params.params_,
-      pointer,
-      layout::PitchLinearCoord(extent.row(), extent.column()),
-      thread_id,
-      layout::PitchLinearCoord(threadblock_offset.row(), threadblock_offset.column())
-    ) { }
-
-  /// Construct a PredicatedTileIteratorTriangularMatrix with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorTriangularMatrix(
-    Params const &params,                         ///< Precomputed parameters object
-    Pointer pointer,                              ///< Pointer to start of tensor
-    TensorCoord extent,                           ///< Extent of tensor
-    int thread_id                                 ///< ID of each participating thread
-  ): PredicatedTileIteratorTriangularMatrix(params, pointer, extent, thread_id, make_Coord(0, 0)) { }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the iterator's
-  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
-  /// are lightweight and must only update the internal pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorTriangularMatrix &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the iterator's
-  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
-  /// are lightweight and must only update the internal pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorTriangularMatrix operator++(int) {
-    PredicatedTileIteratorTriangularMatrix self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) {
-    iterator_.clear_mask(enable);
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() {
-    iterator_.enable_mask();
-  }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) {
-    iterator_.set_mask(mask);
-  }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) {
-    iterator_.get_mask(mask);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
-    iterator_.load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) {
-    load_with_pointer_offset(frag, 0);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-    iterator_.store_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
-    iterator_.store_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedTileIteratorTriangularMatrix for row-major data.
-///
-/// Satisfies: ForwardTileIteratorConcept | 
-///            ReadableContiguousTileIteratorConcept | 
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <
-  typename Shape_,
-  typename Element_,
-  int AdvanceRank,
-  typename ThreadMap_,
-  SideMode kSideMode, 
-  FillMode kFillMode, 
-  DiagType kDiagType,
-  int AccessSize
->
-class PredicatedTileIteratorTriangularMatrix<Shape_, Element_, layout::RowMajor, AdvanceRank, ThreadMap_, 
-                                            kSideMode, kFillMode, kDiagType,
-                                            AccessSize> {
-public:
-
-  static_assert(AdvanceRank == 0 || AdvanceRank == 1, 
-    "Specialization for pitch-linear iterator may along advance along the "
-    "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::RowMajor;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Pointer = Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  using UnderlyingIterator = PredicatedTileIteratorTriangularMatrix<
-    layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
-    Element,
-    layout::PitchLinear,
-    (kAdvanceRank == 0 ? 1 : 0),
-    ThreadMap,
-    kSideMode, 
-    kFillMode, 
-    kDiagType,
-    AccessSize
-  >;
-
-  using AccessType = typename UnderlyingIterator::AccessType;
-
-  /// Fragment object to be loaded or stored
-  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
-
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename UnderlyingIterator::Mask;
-
-  /// Parameters object is precomputed state and is host-constructible
-  class Params {
-  private:
-
-    friend PredicatedTileIteratorTriangularMatrix;
-
-    /// Parameters object
-    typename UnderlyingIterator::Params params_;
-
-  public:
-    
-    CUTLASS_HOST_DEVICE
-    Params() { } 
-
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout): params_(layout::PitchLinear(layout.stride(0))) {
-
-    };
-  };
-
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Underlying pitch-linear tile iterator
-  UnderlyingIterator iterator_;
-
-public:
-
-  /// Constructs a TileIterator from its precomputed state, threadblock offset, and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorTriangularMatrix(
-    Params const &params,                         ///< Precomputed parameters object 
-    Pointer pointer,                              ///< Pointer to start of tensor
-    TensorCoord extent,                           ///< Extent of tensor
-    int thread_id,                                ///< ID of each participating thread
-    TensorCoord const &threadblock_offset         ///< Initial offset of threadblock
-  ):
-    iterator_(
-      params.params_,
-      pointer,
-      layout::PitchLinearCoord(extent.column(), extent.row()),
-      thread_id,
-      layout::PitchLinearCoord(threadblock_offset.column(), threadblock_offset.row())
-    ) { }
-
-  /// Construct a PredicatedTileIteratorTriangularMatrix with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorTriangularMatrix(
-    Params const &params,                         ///< Precomputed parameters object
-    Pointer pointer,                              ///< Pointer to start of tensor
-    TensorCoord extent,                           ///< Extent of tensor
-    int thread_id                                 ///< ID of each participating thread
-  ): PredicatedTileIteratorTriangularMatrix(params, pointer, extent, thread_id, make_Coord(0, 0)) { }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the iterator's
-  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
-  /// are lightweight and must only update the internal pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorTriangularMatrix &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the iterator's
-  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
-  /// are lightweight and must only update the internal pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorTriangularMatrix operator++(int) {
-    PredicatedTileIteratorTriangularMatrix self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) {
-    iterator_.clear_mask(enable);
-  }
-
-  /// Clears the predicate set efficiently
-  CUTLASS_HOST_DEVICE
-  void enable_mask() {
-    iterator_.enable_mask();
-  }
-
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const &mask) {
-    iterator_.set_mask(mask);
-  }
-
-  /// Gets the mask
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask &mask) {
-    iterator_.get_mask(mask);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
-    iterator_.load_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) {
-    load_with_pointer_offset(frag, 0);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-    iterator_.store_with_pointer_offset(frag, pointer_offset);
-  }
-  
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
-    iterator_.store_with_byte_offset(frag, byte_offset);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace transform
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_vector_access_iterator.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_vector_access_iterator.h
deleted file mode 100755
index 391f94b97..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_vector_access_iterator.h
+++ /dev/null
@@ -1,417 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Templates implementing computing the addresses of loading small
-    vectors from the global memory.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/coord.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/tensor_ref.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace transform {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// PredicatedVectorAccessIterator
-///
-template <
-    /// Shape of the vector accessed by the entire threadblock
-    typename Shape,
-    /// Shape of the vector accessed by the warp
-    typename WarpShape,
-    /// Type of Element
-    typename Element,
-    /// Layout of the vector
-    typename Layout,
-    /// Number of elements for each access
-    int ElementsPerAccess,
-    /// Support residual tile
-    bool EnableResidualAccess = false
->
-class PredicatedVectorAccessIterator;
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Vector access iterator specialized for vectors, e.g. scale and bias
-/// Thread arrangements are for TensorOps
-///
-template <
-  typename Shape_, 
-  typename WarpShape_, 
-  typename Element_, 
-  int ElementsPerAccess, 
-  bool EnableResidualAccess
->
-class PredicatedVectorAccessIterator <
-  Shape_,
-  WarpShape_,
-  Element_,
-  layout::PitchLinear,
-  ElementsPerAccess,
-  EnableResidualAccess
-> {
-  public:
-
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using Element = Element_;
-  using Layout = layout::PitchLinear;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ConstPointer = const Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-//  static int const kElementsPerAccess = 128 / sizeof_bits<Element>::value;
-  static int const kElementsPerAccess = ElementsPerAccess;
-  static int const kThreads = 32;
-  static int const kRowsPerIteration = 8;
-  static int const kThreadsPerRow = kThreads / kRowsPerIteration;
-  static int const kThreadsPerRowMask = 0x3;
-  static int const kIterations = WarpShape::kContiguous / (kThreadsPerRow * kElementsPerAccess); 
-  static int const kWarpCountStrided = Shape::kStrided / WarpShape::kStrided;
-
-  using AccessType = AlignedArray<Element, kElementsPerAccess>;
-
- private:
-  /// Internal pointer type permits fast address arithmetic
-  using BytePointer = char *;
-
- private:
-  //
-  // Data members
-  //
-
-  /// Internal pointer to first access of tile
-  BytePointer pointer_;
-
-  /// Extent of tensor
-  TensorCoord extent_;
-
-  /// pointer offset of each thread
-  TensorCoord thread_offset_;
-
-  /// iteration index
-  LongIndex iteration_;
-
-  /// residual access
-  bool is_residual_;
-
-  /// residual offset of each thread
-  TensorCoord residual_offset_;
-
- public:
-  /// Constructs a vector access iterator
-  CUTLASS_HOST_DEVICE
-  PredicatedVectorAccessIterator(
-    /// Pointer to the start of the vector
-    ConstPointer pointer,
-    /// Extent of vector
-    TensorCoord extent,
-    /// ID of each participating thread
-    int thread_id,
-    /// ID of each participating warp
-    int warp_id,
-    /// Initial offset of threadblock
-    TensorCoord const &threadblock_offset)
-    : pointer_(reinterpret_cast<BytePointer>(
-                       const_cast<NonConstPointer>(pointer))),
-      extent_(extent),
-      is_residual_(false) {
-
-
-    int warp_offset = (warp_id / kWarpCountStrided) * WarpShape::kContiguous;
-
-    // Per-thread offset in logical coordinates of tensor
-
-    thread_offset_ = threadblock_offset + TensorCoord(warp_offset, 0) +
-        TensorCoord((thread_id & kThreadsPerRowMask) * kElementsPerAccess, 0);
-
-    set_iteration_index(0);
-
-    if(EnableResidualAccess) {
-      // compute residual offset
-      typename TensorCoord::Index residual_size = extent_.contiguous() % WarpShape::kContiguous;
-      if (residual_size) {
-        is_residual_ = true;
-        residual_offset_ = make_Coord(residual_size, 0);
-      }
-    }
-  }
-
-  /// Construct a PredicatedVectorAccessIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  PredicatedVectorAccessIterator(
-    /// Pointer to start of vector
-    ConstPointer pointer,
-    /// Extent of vector
-    TensorCoord extent,
-    ///< ID of each participating thread
-    int thread_id,
-    /// ID of each participating warp
-    int warp_id)
-    : PredicatedVectorAccessIterator(pointer, extent, thread_id, warp_id,
-                                     make_Coord(0, 0)) {}
-
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) {
-    iteration_ = index;
-  }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
-  CUTLASS_DEVICE
-  void add_tile_offset(
-      TensorCoord const &tile_offset) {
-
-    thread_offset_ =
-        thread_offset_ +
-        TensorCoord(WarpShape::kContiguous * tile_offset.contiguous(), 0);
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-
-    return reinterpret_cast<AccessType *>(
-        pointer_ +
-        ((thread_offset_.contiguous() + iteration_ * kThreadsPerRow * kElementsPerAccess) 
-        * sizeof_bits<Element>::value / 8));
-  }
-
-  /// Increment and return an instance to self.
-  CUTLASS_HOST_DEVICE
-  PredicatedVectorAccessIterator &operator++() {
-    ++iteration_;
-    if(iteration_ >= kIterations)
-      iteration_ = 0; 
-
-    return *this;
-  }
-
-  /// Increment and return an instance to self.
-  CUTLASS_HOST_DEVICE
-  void advance() {
-    if(EnableResidualAccess && is_residual_) {
-      is_residual_ = false;
-      thread_offset_ += residual_offset_; 
-    }
-    else
-      add_tile_offset(TensorCoord(1, 0));
-  }
-
-  /// Increment and return an instance to self.
-  CUTLASS_HOST_DEVICE
-  PredicatedVectorAccessIterator operator++(int) {
-    PredicatedVectorAccessIterator self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Returns whether access is valid or not
-  CUTLASS_HOST_DEVICE
-  bool valid() {
-    return ((thread_offset_.contiguous() + 
-              iteration_ * kThreadsPerRow * kElementsPerAccess) < extent_.contiguous());
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Specialization of PredicatedVectorAccessIterator for row-major data.
-///
-template <
-  typename Shape_,
-  typename WarpShape_,
-  typename Element_,
-  int ElementsPerAccess,
-  bool EnableResidualAccess
->
-class PredicatedVectorAccessIterator<
-  Shape_,
-  WarpShape_,
-  Element_,
-  layout::RowMajor,
-  ElementsPerAccess,
-  EnableResidualAccess
-> {
- public:
-
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using Element = Element_;
-  using Layout = layout::RowMajor;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ConstPointer = const Element *;
-  using NonConstPointer = typename platform::remove_const<Element>::type *;
-
-  using UnderlyingIterator = PredicatedVectorAccessIterator<
-      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, 
-      layout::PitchLinearShape<WarpShape::kColumn, WarpShape::kRow>, 
-      Element,
-      layout::PitchLinear,
-      ElementsPerAccess,
-      EnableResidualAccess>;
-
-  using AccessType = typename UnderlyingIterator::AccessType;
-  static int const kElementsPerAccess = UnderlyingIterator::kElementsPerAccess;
-  static int const kRowsPerIteration = UnderlyingIterator::kRowsPerIteration;
-  static int const kThreads = UnderlyingIterator::kThreads;
-  static int const kIterations = UnderlyingIterator::kIterations;
-
- private:
-  //
-  // Data members
-  //
-
-  /// Underlying pitch-linear tile iterator
-  UnderlyingIterator iterator_;
-
- public:
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedVectorAccessIterator(
-      ///< Pointer to the start of the vector
-      ConstPointer pointer,
-      ///< Extent of tensor
-      TensorCoord extent,
-      ///< ID of each participating thread
-      int thread_id,
-      ///< ID of each participating warp
-      int warp_id,
-      ///< Initial offset of threadblock
-      TensorCoord const &threadblock_offset)
-      : iterator_(pointer, layout::PitchLinearCoord(extent.column(), extent.row()),
-                  thread_id, warp_id,
-                  layout::PitchLinearCoord(threadblock_offset.column(),
-                                           threadblock_offset.row())) {}
-
-  /// Construct a PredicatedVectorAccessIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  PredicatedVectorAccessIterator(
-      ConstPointer pointer,   ///< Pointer to the start of the vector
-      TensorCoord extent,     ///< Extent of tensor
-      int thread_id,          ///< ID of each participating thread
-      int warp_id             ///< ID of each participating warp
-      )
-      : PredicatedVectorAccessIterator(pointer, extent, thread_id, warp_id, 
-                                        make_Coord(0, 0)) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
-
-  /// Advances an iterator along logical dimensions of matrix in units of whole
-  /// tiles
-  CUTLASS_HOST_DEVICE
-  void add_tile_offset(TensorCoord const &tile_offset) {
-    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return reinterpret_cast<AccessType *>(iterator_.get());
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedVectorAccessIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedVectorAccessIterator operator++(int) {
-    PredicatedVectorAccessIterator self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Increment and return an instance to self.
-  CUTLASS_HOST_DEVICE
-  void advance() {
-    iterator_.advance();
-  }
-
-  /// Returns whether access is valid or not
-  CUTLASS_HOST_DEVICE
-  bool valid() {
-    return iterator_.valid();
-  }
-};
-
-
-////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace transform 
-}  // namespace cutlass
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_scale_bias_vector_access_iterator.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_scale_bias_vector_access_iterator.h
deleted file mode 100755
index f5906d828..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_scale_bias_vector_access_iterator.h
+++ /dev/null
@@ -1,253 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-/*! \file
-    \brief Templates implementing computing the addresses of storing of small
-   scale and bias vectors in the shared memory.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/tensor_ref.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace transform {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// RegularScaleBiasVectorAccessIterator
-///
-template <typename Shape, typename Element, typename Layout>
-class RegularScaleBiasVectorAccessIterator;
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile iterator specialized for congruous arrangements for TensorOps
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <typename Shape_, typename Element_>
-class RegularScaleBiasVectorAccessIterator<Shape_, Element_, layout::PitchLinear> {
- public:
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::PitchLinear;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  /// Element type per access
-  static int const kElementsPerAccess = 128 / sizeof_bits<Element>::value;
-  static int const kThreads = Shape::kContiguous / kElementsPerAccess;
-  using AccessType = Array<Element, kElementsPerAccess>;
-
- private:
-  //
-  // Data members
-  //
-
-  /// Internal pointer 
-  AccessType *pointer_;
-
-  /// Internal byte offset
-  Index byte_offset_;
-
- public:
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularScaleBiasVectorAccessIterator(
-      TensorRef scale_bias_ref,  ///< Pointer to the start of the scale and bias
-                                 ///< vector
-      int thread_id              ///< ID of each participating thread
-      )
-      : byte_offset_(0) {
-    // Per-thread offset in logical coordinates of tensor
-    int thread_offset = thread_id * kElementsPerAccess;
-
-    // initialize pointer
-    pointer_ =
-        reinterpret_cast<AccessType *>(scale_bias_ref.data() + thread_offset);
-
-    set_iteration_index(0);
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) {}
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    byte_offset_ += pointer_offset * sizeof(Element);
-  }
-
-  /// Returns a pointer
-  CUTLASS_DEVICE
-  AccessType *get() const {
-
-    char *access_byte_ptr =
-        reinterpret_cast<char *>(pointer_);
-
-    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_);
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularScaleBiasVectorAccessIterator &operator++() { return *this; }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularScaleBiasVectorAccessIterator operator++(int) {
-    RegularScaleBiasVectorAccessIterator prev(*this);
-    this->operator++();
-
-    return prev;
-  }
-
-  /// Adds a tile offset in the unit of tile.
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    // Multiply by 2 because we store scale and bias belong to the same stage
-    // next to each other.
-    add_pointer_offset(coord.contiguous() * Shape::kContiguous * 2);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile iterator specialized for row major layouts
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <typename Shape_, typename Element_>
-class RegularScaleBiasVectorAccessIterator<
-    Shape_, Element_,
-    layout::RowMajor> {
- public:
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::RowMajor;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  /// Underlying iterator type
-  using UnderlyingIterator = RegularScaleBiasVectorAccessIterator<
-      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
-      layout::PitchLinear>;
-
-  using AccessType = typename UnderlyingIterator::AccessType;
-
- private:
-
-  /// Underlying iterator
-  UnderlyingIterator iterator_;
-
- public:
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularScaleBiasVectorAccessIterator(
-      TensorRef scale_bias_ref,  ///< Pointer to the start of the scale and bias
-                                 ///< vector
-      int thread_id              ///< ID of each participating thread
-      )
-      : iterator_({scale_bias_ref.data(), scale_bias_ref.stride()}, thread_id) {
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return reinterpret_cast<AccessType *>(iterator_.get());
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    iterator_.add_tile_offset({coord.column(), coord.row()});
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularScaleBiasVectorAccessIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularScaleBiasVectorAccessIterator operator++(int) {
-    RegularScaleBiasVectorAccessIterator prev(*this);
-    ++iterator_;
-
-    return prev;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace transform 
-}  // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator.h
deleted file mode 100755
index d0992d441..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing the address computation of storing of tiles
-   from pitch-linear rank=2 tensors.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace transform {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <typename Shape, typename Element, typename Layout, int AdvanceRank,
-          typename ThreadMap,
-          int Alignment =
-              sizeof_bits<Element>::value* ThreadMap::kElementsPerAccess / 8>
-class RegularTileAccessIterator;
-
-////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace transform
-}  // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h
deleted file mode 100755
index fa02b008b..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h
+++ /dev/null
@@ -1,408 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing computing the addresses of storing of tiles
-   from pitch-linear rank=2 tensors.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/tensor_ref.h"
-
-#include "cutlass/transform/threadblock/regular_tile_access_iterator.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace transform {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile iterator specialized for congruous arrangements for TensorOps
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, int Alignment>
-class RegularTileAccessIterator<
-    Shape_, Element_,
-    layout::PitchLinear,
-    AdvanceRank, ThreadMap_, Alignment> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::PitchLinear;
-  static int const kAdvanceRank = AdvanceRank;
-  static int const kAlignment = Alignment;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  using StrideIndex = typename Layout::Stride::Index;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  /// Element type per access
-  using AccessType = Array<Element, ThreadMap::kElementsPerAccess>;
-
- private:
-  //
-  // Data members
-  //
-
-  /// Stride value
-  StrideIndex stride_;
-
-  /// Internal pointer to first access of tile
-  AccessType *pointer_;
-
-  /// Internal byte offset
-  Index byte_offset_;
-
-  /// Iteration in the contiguous dimension
-  int iteration_contiguous_;
-
-  /// Iteration in the strided dimension
-  int iteration_strided_;
-
- public:
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
-                            int thread_id   ///< ID of each participating thread
-                            )
-      : stride_(ref.stride(0) / ThreadMap::kElementsPerAccess),
-        byte_offset_(0) {
-
-    layout::PitchLinearCoord thread_offset_base = ThreadMap::initial_offset(thread_id);
-
-    // initialize pointer
-    pointer_ = reinterpret_cast<AccessType *>(ref.data() + ref.offset(thread_offset_base));
-
-    set_iteration_index(0);
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) {
-    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    byte_offset_ += pointer_offset * sizeof(Element);
-  }
-
-  /// Returns a pointer
-  CUTLASS_DEVICE
-  AccessType *get() const {
-
-    AccessType *access_ptr = pointer_;
-
-    int access_offset = iteration_strided_ * ThreadMap::Delta::kStrided * stride_ +
-                        iteration_contiguous_ * ThreadMap::Delta::kContiguous /
-                            ThreadMap::kElementsPerAccess;
-
-    char *access_byte_ptr =
-        reinterpret_cast<char *>(access_ptr + access_offset);
-
-    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_);
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator &operator++() {
-    ++iteration_contiguous_;
-
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
-      return *this;
-
-    // Enter here only if (iteration_contiguous_ ==
-    // ThreadMap::Iteration::kContiguous)
-    iteration_contiguous_ = 0;
-    ++iteration_strided_;
-
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      return *this;
-    }
-
-    // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided)
-    // which means we enter the next tile.
-    iteration_strided_ = 0;
-
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator operator++(int) {
-    RegularTileAccessIterator prev(*this);
-    this->operator++();
-
-    return prev;
-  }
-
-  /// Adds a tile offset in the unit of tile.
-  /// In GEMM/Conv implementation, this is used to move in the k dimension in the shared memory.
-  /// Below layouts are the shared memory layouts.  Current SM50 SIMT kernels only use col major A and row major B.
-  ///   For row major A operand, k dimension is contiguous dimension;
-  ///   For col major A operand, k dimension is strided dimension;
-  ///   For row major B operand, k dimension is strided dimension;
-  ///   For col major B operand, k dimension is contiguous dimension.
-  /// Below two classes map col/row major to the pitch linear coordinates used
-  /// in this base class.
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    add_pointer_offset(coord.contiguous() * Shape::kContiguous +
-                       coord.strided() * Shape::kStrided * stride_ *
-                           ThreadMap::kElementsPerAccess);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile iterator specialized for column major layouts
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, int Alignment>
-class RegularTileAccessIterator<
-    Shape_, Element_,
-    layout::ColumnMajor,
-    AdvanceRank, ThreadMap_, Alignment> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::ColumnMajor;
-  static int const kAdvanceRank = AdvanceRank;
-  static int const kAlignment = Alignment;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  /// Underlying iterator type
-  using UnderlyingIterator = RegularTileAccessIterator<
-      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
-      layout::PitchLinear,
-      (kAdvanceRank == 0 ? 0 : 1), 
-      ThreadMap_>;
-
-  using AccessType = typename UnderlyingIterator::AccessType;
-
- private:
-
-  /// Underlying iterator
-  UnderlyingIterator iterator_;
-
- public:
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
-                            int thread_id   ///< ID of each participating thread
-                            )
-      : iterator_({ref.data(), ref.stride()}, thread_id) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return reinterpret_cast<AccessType *>(iterator_.get());
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    iterator_.add_tile_offset({coord.row(), coord.column()});
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator operator++(int) {
-    RegularTileAccessIterator prev(*this);
-    ++iterator_;
-
-    return prev;
-  }
-};
-
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile iterator specialized for row major layouts
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, int Alignment>
-class RegularTileAccessIterator<
-    Shape_, Element_,
-    layout::RowMajor,
-    AdvanceRank, ThreadMap_, Alignment> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::RowMajor;
-  static int const kAdvanceRank = AdvanceRank;
-  static int const kAlignment = Alignment;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  /// Underlying iterator type
-  using UnderlyingIterator = RegularTileAccessIterator<
-      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
-      layout::PitchLinear,
-      (kAdvanceRank == 0 ? 1 : 0), 
-      ThreadMap_>;
-
-  using AccessType = typename UnderlyingIterator::AccessType;
-
- private:
-
-  /// Underlying iterator
-  UnderlyingIterator iterator_;
-
- public:
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
-                            int thread_id   ///< ID of each participating thread
-                            )
-      : iterator_({ref.data(), ref.stride()}, thread_id) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return reinterpret_cast<AccessType *>(iterator_.get());
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    iterator_.add_tile_offset({coord.column(), coord.row()});
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator operator++(int) {
-    RegularTileAccessIterator prev(*this);
-    ++iterator_;
-
-    return prev;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace transform
-}  // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear_direct_conv.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear_direct_conv.h
deleted file mode 100755
index a7b57bbe7..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear_direct_conv.h
+++ /dev/null
@@ -1,587 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing computing the addresses of storing of tiles
-   from pitch-linear rank=2 tensors.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/tensor_ref.h"
-
-#include "cutlass/transform/threadblock/regular_tile_access_iterator.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace transform {
-namespace threadblock {
-
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <typename Shape, typename Element, typename Layout, int AdvanceRank,
-          typename ThreadMap,
-           bool Dynamic_iterations = false,
-          int Alignment =
-              sizeof_bits<Element>::value* ThreadMap::kElementsPerAccess / 8
-          >
-class RegularTileAccessIteratorDirectConv;
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile iterator specialized for congruous arrangements for TensorOps with dynamic_iterations OFF
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, int Alignment>
-class RegularTileAccessIteratorDirectConv<
-    Shape_, Element_,
-    layout::PitchLinear,
-    AdvanceRank, ThreadMap_, false, Alignment> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::PitchLinear;
-  static int const kAdvanceRank = AdvanceRank;
-  static int const kAlignment = Alignment;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  using StrideIndex = typename Layout::Stride::Index;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  /// Element type per access
-  using AccessType = Array<Element, ThreadMap::kElementsPerAccess>;
-
- private:
-  //
-  // Data members
-  //
-
-  /// Stride value
-  StrideIndex stride_;
-
-  /// Internal pointer to first access of tile
-  AccessType *pointer_;
-
-  /// Internal byte offset
-  Index byte_offset_;
-
-  /// Iteration in the contiguous dimension
-  int iteration_contiguous_;
-
-  /// Iteration in the strided dimension
-  int iteration_strided_;
-
- public:
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIteratorDirectConv(TensorRef ref,  ///< Pointer to start of tensor
-                            int thread_id   ///< ID of each participating thread
-                            )
-      : stride_(ref.stride(0) / ThreadMap::kElementsPerAccess),
-        byte_offset_(0) {
-
-    layout::PitchLinearCoord thread_offset_base = ThreadMap::initial_offset(thread_id);
-
-    // initialize pointer
-    pointer_ = reinterpret_cast<AccessType *>(ref.data() + ref.offset(thread_offset_base));
-
-    set_iteration_index(0);
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) {
-    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_num(int num) {
-    //Do nothing
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    byte_offset_ += pointer_offset * sizeof(Element);
-  }
-
-  /// Returns a pointer
-  CUTLASS_DEVICE
-  AccessType *get() const {
-
-    AccessType *access_ptr = pointer_;
-
-    int access_offset = iteration_strided_ * ThreadMap::Delta::kStrided * stride_ +
-                        iteration_contiguous_ * ThreadMap::Delta::kContiguous /
-                            ThreadMap::kElementsPerAccess;
-
-    char *access_byte_ptr =
-        reinterpret_cast<char *>(access_ptr + access_offset);
-
-    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_);
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIteratorDirectConv &operator++() {
-    ++iteration_contiguous_;
-
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
-      return *this;
-
-    // Enter here only if (iteration_contiguous_ ==
-    // ThreadMap::Iteration::kContiguous)
-    iteration_contiguous_ = 0;
-    ++iteration_strided_;
-
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      return *this;
-    }
-
-    // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided)
-    // which means we enter the next tile.
-    iteration_strided_ = 0;
-
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIteratorDirectConv operator++(int) {
-    RegularTileAccessIteratorDirectConv prev(*this);
-    this->operator++();
-
-    return prev;
-  }
-
-  /// Adds a tile offset in the unit of tile.
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    add_pointer_offset(coord.contiguous() * Shape::kContiguous +
-                       coord.strided() * ThreadMap::Iterations::kStrided *
-                           ThreadMap::Delta::kStrided * stride_ * ThreadMap::kElementsPerAccess);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile iterator specialized for congruous arrangements for TensorOps with dynamic_iterations ON
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, int Alignment>
-class RegularTileAccessIteratorDirectConv<
-    Shape_, Element_,
-    layout::PitchLinear,
-    AdvanceRank, ThreadMap_,true, Alignment> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::PitchLinear;
-  static int const kAdvanceRank = AdvanceRank;
-  static int const kAlignment = Alignment;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  using StrideIndex = typename Layout::Stride::Index;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  /// Element type per access
-  using AccessType = Array<Element, ThreadMap::kElementsPerAccess>;
-
- private:
-  //
-  // Data members
-  //
-
-  /// Stride value
-  StrideIndex stride_;
-
-  /// Internal pointer to first access of tile
-  AccessType *pointer_;
-
-  /// Internal byte offset
-  Index byte_offset_;
-
-  /// Iteration in the contiguous dimension
-  int iteration_contiguous_;
-
-  /// Iteration in the strided dimension
-  int iteration_strided_;
-
-  /// Total iterattions in the strided dimension: Dynamic value
-  int total_iteration_strided_;
-
- public:
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIteratorDirectConv(TensorRef ref,  ///< Pointer to start of tensor
-                            int thread_id   ///< ID of each participating thread
-                            )
-      : stride_(ref.stride(0) / ThreadMap::kElementsPerAccess),
-        byte_offset_(0) {
-
-    layout::PitchLinearCoord thread_offset_base = ThreadMap::initial_offset(thread_id);
-
-    // initialize pointer
-    pointer_ = reinterpret_cast<AccessType *>(ref.data() + ref.offset(thread_offset_base));
-
-    set_iteration_index(0);
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) {
-    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_num(int num) {
-    total_iteration_strided_ = num;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    byte_offset_ += pointer_offset * sizeof(Element);
-  }
-
-  /// Returns a pointer
-  CUTLASS_DEVICE
-  AccessType *get() const {
-
-    AccessType *access_ptr = pointer_;
-
-    int access_offset = iteration_strided_ * ThreadMap::Delta::kStrided * stride_ +
-                        iteration_contiguous_ * ThreadMap::Delta::kContiguous /
-                            ThreadMap::kElementsPerAccess;
-
-    char *access_byte_ptr =
-        reinterpret_cast<char *>(access_ptr + access_offset);
-
-    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_);
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIteratorDirectConv &operator++() {
-    ++iteration_contiguous_;
-
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
-      return *this;
-
-    // Enter here only if (iteration_contiguous_ ==
-    // ThreadMap::Iteration::kContiguous)
-    iteration_contiguous_ = 0;
-    ++iteration_strided_;
-
-    if (iteration_strided_ < total_iteration_strided_) {
-      return *this;
-    }
-
-    // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided)
-    // which means we enter the next tile.
-    iteration_strided_ = 0;
-
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIteratorDirectConv operator++(int) {
-    RegularTileAccessIteratorDirectConv prev(*this);
-    this->operator++();
-
-    return prev;
-  }
-
-  /// Adds a tile offset in the unit of tile.
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    add_pointer_offset(coord.contiguous() * Shape::kContiguous +
-                       coord.strided() * total_iteration_strided_ * ThreadMap::Delta::kStrided * stride_ *
-                           ThreadMap::kElementsPerAccess);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile iterator specialized for column major layouts
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_,bool Dynamic_iterations, int Alignment >
-class RegularTileAccessIteratorDirectConv<
-    Shape_, Element_,
-    layout::ColumnMajor,
-    AdvanceRank, ThreadMap_, Dynamic_iterations , Alignment> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::ColumnMajor;
-  static int const kAdvanceRank = AdvanceRank;
-  static int const kAlignment = Alignment;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  /// Underlying iterator type
-  using UnderlyingIterator = RegularTileAccessIteratorDirectConv<
-      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
-      layout::PitchLinear,
-      (kAdvanceRank == 0 ? 0 : 1), 
-      ThreadMap_,
-      Dynamic_iterations>;
-
-  using AccessType = typename UnderlyingIterator::AccessType;
-
- private:
-
-  /// Underlying iterator
-  UnderlyingIterator iterator_;
-
- public:
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIteratorDirectConv(TensorRef ref,  ///< Pointer to start of tensor
-                            int thread_id   ///< ID of each participating thread
-                            )
-      : iterator_({ref.data(), ref.stride()}, thread_id) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
-  
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_num(int num) {
-    iterator_.set_iteration_num(num);
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return reinterpret_cast<AccessType *>(iterator_.get());
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    iterator_.add_tile_offset({coord.row(), coord.column()});
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIteratorDirectConv &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIteratorDirectConv operator++(int) {
-    RegularTileAccessIteratorDirectConv prev(*this);
-    ++iterator_;
-
-    return prev;
-  }
-};
-
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile iterator specialized for row major layouts
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_,bool Dynamic_iterations, int Alignment>
-class RegularTileAccessIteratorDirectConv<
-    Shape_, Element_,
-    layout::RowMajor,
-    AdvanceRank, ThreadMap_, Dynamic_iterations, Alignment> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::RowMajor;
-  static int const kAdvanceRank = AdvanceRank;
-  static int const kAlignment = Alignment;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  /// Underlying iterator type
-  using UnderlyingIterator = RegularTileAccessIteratorDirectConv<
-      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
-      layout::PitchLinear,
-      (kAdvanceRank == 0 ? 1 : 0), 
-      ThreadMap_,
-      Dynamic_iterations>;
-
-  using AccessType = typename UnderlyingIterator::AccessType;
-
- private:
-
-  /// Underlying iterator
-  UnderlyingIterator iterator_;
-
- public:
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIteratorDirectConv(TensorRef ref,  ///< Pointer to start of tensor
-                            int thread_id   ///< ID of each participating thread
-                            )
-      : iterator_({ref.data(), ref.stride()}, thread_id) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_num(int num) {
-    iterator_.set_iteration_num(num);
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return reinterpret_cast<AccessType *>(iterator_.get());
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    iterator_.add_tile_offset({coord.column(), coord.row()});
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIteratorDirectConv &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIteratorDirectConv operator++(int) {
-    RegularTileAccessIteratorDirectConv prev(*this);
-    ++iterator_;
-
-    return prev;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace transform
-}  // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h
deleted file mode 100755
index 96e3ee84b..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h
+++ /dev/null
@@ -1,821 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing computing the addresses of storing of tiles
-   from pitch-linear rank=2 tensors.
-*/
-
-#pragma once
-
-#include "cutlass/array.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor_op_multiplicand_sm75.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/transform/threadblock/regular_tile_access_iterator.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace transform {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile iterator specialized for congruous arrangements for TensorOps
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, int Alignment, int Crosswise>
-class RegularTileAccessIterator<
-    Shape_, Element_,
-    layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
-                                          Crosswise>,
-    AdvanceRank, ThreadMap_, Alignment> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout =
-      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
-                                            Crosswise>;
-  static int const kAdvanceRank = AdvanceRank;
-  static int const kAlignment = Alignment;
-  static int const kCrosswise = Crosswise;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  using StrideIndex = typename Layout::Stride::Index;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  /// Internal details made public to facilitate introspection
-  struct Detail {
-    /// This iterator is specialized for an access size that is 128 bits in
-    /// length.
-    static int const kAccessSizeInBits = 128;
-
-    static_assert(sizeof_bits<Element_>::value *
-                          ThreadMap::kElementsPerAccess ==
-                      kAccessSizeInBits,
-                  "This iterator requires a policy whose access size is 128bs");
-
-    ///< Number of pointers
-    static int const kPointerCount =
-        (ThreadMap::Iterations::kStrided > 1 ? 2 : 1);
-  };
-
-  /// Element type per access
-  using AccessType = Array<Element, Layout::kElementsPerAccess>;
-
- private:
-  //
-  // Data members
-  //
-
-  /// Stride value
-  StrideIndex stride_;
-
-  /// Internal pointer to first access of tile
-  AccessType *pointer_[Detail::kPointerCount];
-
-  /// Internal byte offset
-  Index byte_offset_;
-
-  /// Iteration in the contiguous dimension
-  int iteration_contiguous_;
-
-  /// Iteration in the strided dimension
-  int iteration_strided_;
-
- public:
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
-                            int thread_id   ///< ID of each participating thread
-                            )
-      : stride_(ref.stride(0) * Layout::kFactor / Layout::kElementsPerAccess),
-        byte_offset_(0) {
-    layout::PitchLinearCoord thread_offset_base =
-        ThreadMap::initial_offset(thread_id);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < Detail::kPointerCount; ++i) {
-      // This is the offset of a thread within a threadblock tile for a specific
-      // pointer (units of elements)
-      layout::PitchLinearCoord thread_offset_in_threadblock_tile =
-          thread_offset_base +
-          layout::PitchLinearCoord{
-              0, ThreadMap::Detail::WarpThreadArrangement::kStrided * i};
-
-      // initialize pointer
-      pointer_[i] = reinterpret_cast<AccessType *>(
-          ref.data() + ref.offset(thread_offset_in_threadblock_tile));
-    }
-
-    set_iteration_index(0);
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) {
-    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    byte_offset_ += pointer_offset * sizeof(Element);
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    AccessType *access_ptr = pointer_[iteration_strided_ & 1];
-    int stride_idx = (iteration_strided_ & ~1);
-
-    int access_offset = stride_idx * ThreadMap::Delta::kStrided * stride_ / Layout::kFactor +
-                        iteration_contiguous_ * ThreadMap::Delta::kContiguous /
-                            ThreadMap::kElementsPerAccess;
-
-    char *access_byte_ptr =
-        reinterpret_cast<char *>(access_ptr + access_offset);
-    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_);
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator &operator++() {
-    ++iteration_contiguous_;
-
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
-      return *this;
-
-    // Enter here only if (iteration_contiguous_ ==
-    // ThreadMap::Iteration::kContiguous)
-    iteration_contiguous_ = 0;
-    ++iteration_strided_;
-
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      return *this;
-    }
-
-    // Enter here only if (iteration_strided_ == ThreadMap::Iteration::kStrided)
-    // which means we enter the next tile.
-    iteration_strided_ = 0;
-
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator operator++(int) {
-    RegularTileAccessIterator prev(*this);
-    this->operator++();
-
-    return prev;
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    add_pointer_offset(coord.contiguous() * Shape::kContiguous * Layout::kFactor +
-                       coord.strided() * Shape::kStrided * stride_ *
-                           Layout::kElementsPerAccess / Layout::kFactor);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile Iterator specialized for column-major congruous TensorOp formats.
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, int Alignment, int Crosswise>
-class RegularTileAccessIterator<
-    Shape_, Element_,
-    layout::ColumnMajorTensorOpMultiplicandCongruous<
-        sizeof_bits<Element_>::value, Crosswise>,
-    AdvanceRank, ThreadMap_, Alignment> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for column-major iterator may along advance along the "
-      "columns(rank=0) or rows(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::ColumnMajorTensorOpMultiplicandCongruous<
-      sizeof_bits<Element_>::value, Crosswise>;
-  static int const kAdvanceRank = AdvanceRank;
-  static int const kAlignment = Alignment;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  /// Underlying iterator type
-  using UnderlyingIterator = RegularTileAccessIterator<
-      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
-      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
-                                            Crosswise>,
-      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;
-
-  using AccessType = typename UnderlyingIterator::AccessType;
-
- private:
-  /// Underlying iterator
-  UnderlyingIterator iterator_;
-
- public:
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
-                            int thread_id   ///< ID of each participating thread
-                            )
-      : iterator_({ref.data(), ref.stride()}, thread_id) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return reinterpret_cast<AccessType *>(iterator_.get());
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    iterator_.add_tile_offset({coord.row(), coord.column()});
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator operator++(int) {
-    RegularTileAccessIterator prev(*this);
-    ++iterator_;
-
-    return prev;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile Iterator specialized for row-major congruous TensorOp formats.
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, int Alignment, int Crosswise>
-class RegularTileAccessIterator<
-    Shape_, Element_,
-    layout::RowMajorTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
-                                                  Crosswise>,
-    AdvanceRank, ThreadMap_, Alignment> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for row-major iterator may along advance along the "
-      "columns(rank=0) or rows(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::RowMajorTensorOpMultiplicandCongruous<
-      sizeof_bits<Element_>::value, Crosswise>;
-  static int const kAdvanceRank = AdvanceRank;
-  static int const kAlignment = Alignment;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  /// Underlying iterator type
-  using UnderlyingIterator = RegularTileAccessIterator<
-      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
-      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
-                                            Crosswise>,
-      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;
-
-  using AccessType = typename UnderlyingIterator::AccessType;
-
- private:
-  /// Underlying iterator
-  UnderlyingIterator iterator_;
-
- public:
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
-                            int thread_id   ///< ID of each participating thread
-                            )
-      : iterator_({ref.data(), ref.stride()}, thread_id) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return reinterpret_cast<AccessType *>(iterator_.get());
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    iterator_.add_tile_offset({coord.column(), coord.row()});
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator operator++(int) {
-    RegularTileAccessIterator prev(*this);
-    ++iterator_;
-
-    return prev;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile iterator specialized for crosswise arrangements for TensorOps
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, int Alignment, int Crosswise>
-class RegularTileAccessIterator<Shape_, Element_,
-                                layout::TensorOpMultiplicandCrosswise<
-                                    sizeof_bits<Element_>::value, Crosswise>,
-                                AdvanceRank, ThreadMap_, Alignment> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout =
-      layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
-                                            Crosswise>;
-  static int const kAdvanceRank = AdvanceRank;
-  static int const kAlignment = Alignment;
-  static int const kCrosswise = Crosswise;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  using StrideIndex = typename Layout::Stride::Index;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  static_assert(!(ThreadMap::Delta::kContiguous % kCrosswise),
-                "kCrosswise is the smallest unit in the contiguous dimension "
-                "for shared memory swizzling.");
-
-  /// Internal details made public to facilitate introspection
-  struct Detail {
-    /// This iterator is specialized for an access size that is 128 bits in
-    /// length.
-    static int const kAccessSizeInBits = 128;
-
-    static_assert(sizeof_bits<Element_>::value *
-                          ThreadMap::kElementsPerAccess ==
-                      kAccessSizeInBits,
-                  "This iterator requires a policy whose access size is 128bs");
-
-    /// Number of pointers
-    ///
-    /// Note:TN kblock32 layouts only needs 1 pointer, but strangely
-    /// reducing pointer count hurts perfomrnace
-    static int const kPointerCount =
-        (ThreadMap::Iterations::kStrided > 1 ? 2 : 1);
-  };
-
-  /// Element type per access
-  using AccessType = Array<Element, Layout::kElementsPerAccess>;
-
- private:
-  //
-  // Data members
-  //
-
-  /// Total number of sections.  The memory is divided into stages.  One stage
-  /// can store one tile.  Stage is divided into sections.  Interleaved layout
-  /// can have multiple sections in a stage.  The rest layout only has one section
-  /// in a stage.
-  int sections_;
-
-  /// Sections that a stage has
-  int sections_per_stage_;
-
-  /// Stride value
-  StrideIndex stride_;
-
-  /// Internal pointer to first access of tile
-  AccessType *pointer_[Detail::kPointerCount];
-
-  /// Internal byte offset
-  Index byte_offset_;
-
-  /// Iteration in the contiguous dimension
-  int iteration_contiguous_;
-
-  /// Iteration in the strided dimension
-  int iteration_strided_;
-
- public:
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
-                            int thread_id   ///< ID of each participating thread
-                            )
-      : sections_(ref.stride(0) / kCrosswise),
-        sections_per_stage_(Shape::kContiguous / kCrosswise),
-        // stride_ = kCrosswise x sections_ x kFactor
-        stride_(ref.stride(0) * Layout::kFactor / Layout::kElementsPerAccess),
-        byte_offset_(0) {
-    layout::PitchLinearCoord thread_offset_base =
-        ThreadMap::initial_offset(thread_id);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < Detail::kPointerCount; ++i) {
-      // This is the offset of a thread within a threadblock tile for a specific
-      // pointer (units of elements)
-      layout::PitchLinearCoord thread_offset_in_threadblock_tile =
-          thread_offset_base +
-          layout::PitchLinearCoord{
-              0, ThreadMap::Detail::WarpThreadArrangement::kStrided * i};
-      // initialize pointer
-      pointer_[i] = reinterpret_cast<AccessType *>(ref.data()) +
-                    ref.offset(thread_offset_in_threadblock_tile) /
-                        Layout::kElementsPerAccess;
-    }
-
-    set_iteration_index(0);
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) {
-    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    byte_offset_ += pointer_offset * sizeof_bits<Element>::value / 8;
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    AccessType *access_ptr = pointer_[iteration_strided_ & 1];
-    int stride_idx = (iteration_strided_ & ~1);
-
-    int access_offset =
-        stride_idx * ThreadMap::Delta::kStrided * stride_ / Layout::kFactor +
-        // kCrosswise elements in the contiguous dimension would span to a
-        // shared memory cache line.
-        iteration_contiguous_ * (ThreadMap::Delta::kContiguous / kCrosswise) *
-            Layout::TileShape::kContiguous;
-    char *access_byte_ptr =
-        reinterpret_cast<char *>(access_ptr + access_offset);
-    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_);
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator &operator++() {
-    ++iteration_contiguous_;
-
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
-      return *this;
-
-    // Enter here only if (iteration_contiguous_ ==
-    // ThreadMap::Iteration::kContiguous)
-    iteration_contiguous_ = 0;
-    ++iteration_strided_;
-
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      return *this;
-    }
-
-    // Enter here only if (iteration_strided_ == ThreadMap::Iteration::kStrided)
-    // which means we enter the next section.
-    iteration_strided_ = 0;
-
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator operator++(int) {
-    RegularTileAccessIterator prev(*this);
-    this->operator++();
-
-    return prev;
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    add_pointer_offset(coord.contiguous() * sections_per_stage_ * stride_ *
-                           ThreadMap::kElementsPerAccess / sections_ +
-                       coord.strided() * Shape::kStrided * stride_ *
-                           Layout::kElementsPerAccess / Layout::kFactor);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile Iterator specialized for column-major crosswise TensorOp formats.
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, int Alignment, int Crosswise>
-class RegularTileAccessIterator<
-    Shape_, Element_,
-    layout::ColumnMajorTensorOpMultiplicandCrosswise<
-        sizeof_bits<Element_>::value, Crosswise>,
-    AdvanceRank, ThreadMap_, Alignment> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for column-major iterator may along advance along the "
-      "columns(rank=0) or rows(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::ColumnMajorTensorOpMultiplicandCrosswise<
-      sizeof_bits<Element_>::value, Crosswise>;
-  static int const kAdvanceRank = AdvanceRank;
-  static int const kAlignment = Alignment;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  /// Underlying iterator type
-  using UnderlyingIterator = RegularTileAccessIterator<
-      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
-      layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
-                                            Crosswise>,
-      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;
-
-  using AccessType = typename UnderlyingIterator::AccessType;
-
- private:
-  /// Underlying iterator
-  UnderlyingIterator iterator_;
-
- public:
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
-                            int thread_id   ///< ID of each participating thread
-                            )
-      : iterator_({ref.data(), ref.stride()}, thread_id) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return reinterpret_cast<AccessType *>(iterator_.get());
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    iterator_.add_tile_offset({coord.row(), coord.column()});
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator operator++(int) {
-    RegularTileAccessIterator prev(*this);
-    ++iterator_;
-
-    return prev;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile Iterator specialized for row-major crosswise TensorOp formats.
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, int Alignment, int Crosswise>
-class RegularTileAccessIterator<Shape_, Element_,
-                                layout::RowMajorTensorOpMultiplicandCrosswise<
-                                    sizeof_bits<Element_>::value, Crosswise>,
-                                AdvanceRank, ThreadMap_, Alignment> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for row-major iterator may along advance along the "
-      "columns(rank=0) or rows(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::RowMajorTensorOpMultiplicandCrosswise<
-      sizeof_bits<Element_>::value, Crosswise>;
-  static int const kAdvanceRank = AdvanceRank;
-  static int const kAlignment = Alignment;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  /// Underlying iterator type
-  using UnderlyingIterator = RegularTileAccessIterator<
-      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
-      layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
-                                            Crosswise>,
-      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;
-
-  using AccessType = typename UnderlyingIterator::AccessType;
-
- private:
-  /// Underlying iterator
-  UnderlyingIterator iterator_;
-
- public:
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
-                            int thread_id   ///< ID of each participating thread
-                            )
-      : iterator_({ref.data(), ref.stride()}, thread_id) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return reinterpret_cast<AccessType *>(iterator_.get());
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    iterator_.add_tile_offset({coord.column(), coord.row()});
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator operator++(int) {
-    RegularTileAccessIterator prev(*this);
-    ++iterator_;
-
-    return prev;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace transform
-}  // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h
deleted file mode 100755
index b424af445..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h
+++ /dev/null
@@ -1,1532 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing computing the addresses of storing of tiles
-   from pitch-linear rank=2 tensors.
-*/
-
-#pragma once
-
-#include "cutlass/array.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor_op_multiplicand_sm75.h"
-#include "cutlass/layout/tensor_op_multiplicand_sm80.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/transform/threadblock/regular_tile_access_iterator.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace transform {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile iterator specialized for congruous arrangements for TensorOps
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, int Alignment>
-class RegularTileAccessIterator<
-    Shape_, Element_,
-    layout::TensorOpMultiplicandCongruous64b,
-    AdvanceRank, ThreadMap_, Alignment> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::TensorOpMultiplicandCongruous64b;
-  static int const kAdvanceRank = AdvanceRank;
-  static int const kAlignment = Alignment;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  using StrideIndex = typename Layout::Stride::Index;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  static_assert(ThreadMap::kThreads / 32 > 1, 
-    "This tile iterator requires at least two warps.");
-
-  /// Internal details made public to facilitate introspection
-  struct Detail {
-    /// This iterator is specialized for an access size that is 128 bits in
-    /// length.
-    static int const kAccessSizeInBits = 64;
-
-    static_assert(sizeof_bits<Element_>::value *
-                          ThreadMap::kElementsPerAccess ==
-                      kAccessSizeInBits,
-                  "This iterator requires a policy whose access size is 64b");
-
-    ///< Number of pointers
-    static int const kPointerCount = 1;
-  };
-
-  /// Element type per access
-  using AccessType = Array<Element, Layout::kElementsPerAccess>;
-
- private:
-  //
-  // Data members
-  //
-
-  /// Stride value
-  StrideIndex stride_;
-
-  /// Internal pointer to first access of tile
-  AccessType *pointer_;
-
-  /// Internal byte offset
-  Index byte_offset_;
-
-  /// Iteration in the contiguous dimension
-  int iteration_contiguous_;
-
-  /// Iteration in the strided dimension
-  int iteration_strided_;
-
- public:
-
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator(
-    TensorRef ref,  ///< Pointer to start of tensor
-    int thread_id   ///< ID of each participating thread
-  ): 
-    stride_(ref.stride(0) / Layout::kElementsPerAccess),
-    byte_offset_(0) {
-
-    layout::PitchLinearCoord thread_offset_base = ThreadMap::initial_offset(thread_id);
-
-    // This is the offset of a thread within a threadblock tile for a specific
-    // pointer (units of elements)
-    layout::PitchLinearCoord thread_offset_in_threadblock_tile = thread_offset_base;
-
-    // initialize pointer
-    pointer_ = reinterpret_cast<AccessType *>(ref.data() + ref.offset(thread_offset_in_threadblock_tile));
-
-    set_iteration_index(0);
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) {
-
-    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-
-    byte_offset_ += pointer_offset * sizeof(Element);
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-
-    AccessType *access_ptr = pointer_;
-
-    int access_offset = iteration_strided_ * ThreadMap::Delta::kStrided * stride_ +
-                        iteration_contiguous_ * ThreadMap::Delta::kContiguous /
-                            ThreadMap::kElementsPerAccess;
-
-    char *access_byte_ptr =
-        reinterpret_cast<char *>(access_ptr + access_offset);
-
-    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_);
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator &operator++() {
-    ++iteration_contiguous_;
-
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
-      return *this;
-
-    // Enter here only if (iteration_contiguous_ ==
-    // ThreadMap::Iteration::kContiguous)
-    iteration_contiguous_ = 0;
-    ++iteration_strided_;
-
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      return *this;
-    }
-
-    // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided)
-    // which means we enter the next tile.
-    iteration_strided_ = 0;
-
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator operator++(int) {
-
-    RegularTileAccessIterator prev(*this);
-
-    this->operator++();
-
-    return prev;
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-
-    add_pointer_offset(
-      coord.contiguous() * Shape::kContiguous + 
-      coord.strided() * Shape::kStrided * stride_ * Layout::kElementsPerAccess);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile Iterator specialized for column-major congruous TensorOp formats.
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, int Alignment>
-class RegularTileAccessIterator<
-    Shape_, Element_,
-    layout::ColumnMajorTensorOpMultiplicandCongruous64b,
-    AdvanceRank, ThreadMap_, Alignment> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for column-major iterator may along advance along the "
-      "columns(rank=0) or rows(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::ColumnMajorTensorOpMultiplicandCongruous64b;
-  static int const kAdvanceRank = AdvanceRank;
-  static int const kAlignment = Alignment;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  /// Underlying iterator type
-  using UnderlyingIterator = RegularTileAccessIterator<
-      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
-      layout::TensorOpMultiplicandCongruous64b,
-      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;
-
-  using AccessType = typename UnderlyingIterator::AccessType;
-
- private:
-  /// Underlying iterator
-  UnderlyingIterator iterator_;
-
- public:
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
-                            int thread_id   ///< ID of each participating thread
-                            )
-      : iterator_({ref.data(), ref.stride()}, thread_id) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return reinterpret_cast<AccessType *>(iterator_.get());
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    iterator_.add_tile_offset({coord.row(), coord.column()});
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator operator++(int) {
-    RegularTileAccessIterator prev(*this);
-    ++iterator_;
-
-    return prev;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile Iterator specialized for row-major congruous TensorOp formats.
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, int Alignment>
-class RegularTileAccessIterator<Shape_, Element_,
-                                layout::RowMajorTensorOpMultiplicandCongruous64b,
-                                AdvanceRank, ThreadMap_, Alignment> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for row-major iterator may along advance along the "
-      "columns(rank=0) or rows(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::RowMajorTensorOpMultiplicandCongruous64b;
-  static int const kAdvanceRank = AdvanceRank;
-  static int const kAlignment = Alignment;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  /// Underlying iterator type
-  using UnderlyingIterator = RegularTileAccessIterator<
-      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
-      layout::TensorOpMultiplicandCongruous64b,
-      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;
-
-  using AccessType = typename UnderlyingIterator::AccessType;
-
- private:
-  /// Underlying iterator
-  UnderlyingIterator iterator_;
-
- public:
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
-                            int thread_id   ///< ID of each participating thread
-                            )
-      : iterator_({ref.data(), ref.stride()}, thread_id) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return reinterpret_cast<AccessType *>(iterator_.get());
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    iterator_.add_tile_offset({coord.column(), coord.row()});
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator operator++(int) {
-    RegularTileAccessIterator prev(*this);
-    ++iterator_;
-
-    return prev;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile iterator specialized for crosswise arrangements for TensorOps
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, int Alignment>
-class RegularTileAccessIterator<
-    Shape_, Element_,
-    layout::TensorOpMultiplicand64bCrosswise,
-    AdvanceRank, ThreadMap_, Alignment> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::TensorOpMultiplicand64bCrosswise;
-  static int const kAdvanceRank = AdvanceRank;
-  static int const kAlignment = Alignment;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  using StrideIndex = typename Layout::Stride::Index;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  static_assert(ThreadMap::kThreads / 32 > 1, 
-    "This tile iterator requires at least two warps.");
-
-  /// Internal details made public to facilitate introspection
-  struct Detail {
-    /// This iterator is specialized for an access size that is 128 bits in
-    /// length.
-    static int const kAccessSizeInBits = 64;
-
-    static_assert(sizeof_bits<Element_>::value *
-                          ThreadMap::kElementsPerAccess ==
-                      kAccessSizeInBits,
-                  "This iterator requires a policy whose access size is 64b");
-
-    ///< Number of pointers - two pointers are needed if making more than 4 iterations along
-    ///< strided dimension
-    static int const kPointerCount = (ThreadMap::Iterations::kStrided > 4 ? 2 : 1);
-  };
-
-  /// Element type per access
-  using AccessType = Array<Element, Layout::kElementsPerAccess>;
-
- private:
-  //
-  // Data members
-  //
-
-  /// Stride value
-  StrideIndex stride_;
-
-  /// Internal pointer to first access of tile
-  AccessType *pointer_;
-
-  /// Internal byte offset
-  Index byte_offset_[Detail::kPointerCount];
-
-  /// Iteration in the contiguous dimension
-  int iteration_contiguous_;
-
-  /// Iteration in the strided dimension
-  int iteration_strided_;
-
- public:
-
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_DEVICE
-  RegularTileAccessIterator(
-    TensorRef ref,  ///< Pointer to start of tensor
-    int thread_id   ///< ID of each participating thread
-  ): 
-    stride_(ref.stride(0) / ThreadMap::kElementsPerAccess) {
-
-    layout::PitchLinearCoord thread_offset_base = ThreadMap::initial_offset(thread_id);
-
-    // This is the offset of a thread within a threadblock tile for a specific
-    // pointer (units of elements)
-    layout::PitchLinearCoord thread_offset_in_threadblock_tile = thread_offset_base;
-
-    // initialize pointer
-    pointer_ = reinterpret_cast<AccessType *>(ref.data());
-
-    byte_offset_[0] = ref.offset(thread_offset_in_threadblock_tile) * sizeof(Element);
-    
-    if (Detail::kPointerCount == 2) {
-      byte_offset_[1] = byte_offset_[0] ^ 8;
-    }
-
-    set_iteration_index(0);
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) {
-
-    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-
-    pointer_ += pointer_offset / ThreadMap::kElementsPerAccess;
-  }
-
-  /// Returns a pointer
-  CUTLASS_DEVICE
-  AccessType *get() const {
-
-    // Map the logical contiguous and strided access to the internal swizzled structure.
-    int uniform_offset = (iteration_strided_ & 0x3) * stride_ + (iteration_strided_ >> 3) * 16 + stride_ * ThreadMap::Delta::kContiguous * iteration_contiguous_;
-
-    char *access_byte_ptr = reinterpret_cast<char *>(pointer_ + uniform_offset);
-
-    int byte_offset;
-
-    // This iterator may require two byte offsets if it must load more than 8 rows (or 2 iterations)
-    // in the strided dimension
-    if (Detail::kPointerCount == 2 && (iteration_strided_ & 0x4)) {
-      byte_offset = byte_offset_[1];
-    }
-    else {
-      byte_offset = byte_offset_[0];
-    }
-
-    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset);
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator &operator++() {
-    ++iteration_contiguous_;
-
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
-      return *this;
-
-    // Enter here only if (iteration_contiguous_ ==
-    // ThreadMap::Iteration::kContiguous)
-    iteration_contiguous_ = 0;
-    ++iteration_strided_;
-
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      return *this;
-    }
-
-    // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided)
-    // which means we enter the next tile.
-    iteration_strided_ = 0;
-
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator operator++(int) {
-
-    RegularTileAccessIterator prev(*this);
-
-    this->operator++();
-
-    return prev;
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-
-    add_pointer_offset(coord.strided() * Shape::kStrided + coord.contiguous() * Shape::kContiguous * stride_);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile Iterator specialized for column-major crosswise TensorOp formats.
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, int Alignment>
-class RegularTileAccessIterator<
-    Shape_, Element_,
-    layout::ColumnMajorTensorOpMultiplicand64bCrosswise,
-    AdvanceRank, ThreadMap_, Alignment> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for column-major iterator may along advance along the "
-      "columns(rank=0) or rows(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::ColumnMajorTensorOpMultiplicand64bCrosswise;
-  static int const kAdvanceRank = AdvanceRank;
-  static int const kAlignment = Alignment;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  /// Underlying iterator type
-  using UnderlyingIterator = RegularTileAccessIterator<
-      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
-      layout::TensorOpMultiplicand64bCrosswise,
-      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;
-
-  using AccessType = typename UnderlyingIterator::AccessType;
-
- private:
-  /// Underlying iterator
-  UnderlyingIterator iterator_;
-
- public:
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
-                            int thread_id   ///< ID of each participating thread
-                            )
-      : iterator_({ref.data(), ref.stride()}, thread_id) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return reinterpret_cast<AccessType *>(iterator_.get());
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    iterator_.add_tile_offset({coord.row(), coord.column()});
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator operator++(int) {
-    RegularTileAccessIterator prev(*this);
-    ++iterator_;
-
-    return prev;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile Iterator specialized for row-major crosswise TensorOp formats.
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, int Alignment>
-class RegularTileAccessIterator<Shape_, Element_,
-                                layout::RowMajorTensorOpMultiplicand64bCrosswise,
-                                AdvanceRank, ThreadMap_, Alignment> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for row-major iterator may along advance along the "
-      "columns(rank=0) or rows(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::RowMajorTensorOpMultiplicand64bCrosswise;
-  static int const kAdvanceRank = AdvanceRank;
-  static int const kAlignment = Alignment;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  /// Underlying iterator type
-  using UnderlyingIterator = RegularTileAccessIterator<
-      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
-      layout::TensorOpMultiplicand64bCrosswise,
-      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;
-
-  using AccessType = typename UnderlyingIterator::AccessType;
-
- private:
-  /// Underlying iterator
-  UnderlyingIterator iterator_;
-
- public:
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
-                            int thread_id   ///< ID of each participating thread
-                            )
-      : iterator_({ref.data(), ref.stride()}, thread_id) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return reinterpret_cast<AccessType *>(iterator_.get());
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    iterator_.add_tile_offset({coord.column(), coord.row()});
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator operator++(int) {
-    RegularTileAccessIterator prev(*this);
-    ++iterator_;
-
-    return prev;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Tile iterator specialized for congruous arrangements for TensorOps
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, int Alignment>
-class RegularTileAccessIterator<
-    Shape_, Element_,
-    layout::TensorOpMultiplicandCongruous128b,
-    AdvanceRank, ThreadMap_, Alignment> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::TensorOpMultiplicandCongruous128b;
-  static int const kAdvanceRank = AdvanceRank;
-  static int const kAlignment = Alignment;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  using StrideIndex = typename Layout::Stride::Index;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  static_assert(ThreadMap::kThreads / 32 > 1, 
-    "This tile iterator requires at least two warps.");
-
-  /// Internal details made public to facilitate introspection
-  struct Detail {
-    /// This iterator is specialized for an access size that is 128 bits in
-    /// length.
-    static int const kAccessSizeInBits = 128;
-
-    static_assert(sizeof_bits<Element_>::value *
-                          ThreadMap::kElementsPerAccess ==
-                      kAccessSizeInBits,
-                  "This iterator requires a policy whose access size is 128b");
-
-    ///< Number of pointers
-    static int const kPointerCount = 1;
-  };
-
-  /// Element type per access
-  using AccessType = Array<Element, Layout::kElementsPerAccess>;
-
- private:
-  //
-  // Data members
-  //
-
-  /// Stride value
-  StrideIndex stride_;
-
-  /// Internal pointer to first access of tile
-  AccessType *pointer_;
-
-  /// Internal byte offset
-  Index byte_offset_;
-
-  /// Iteration in the contiguous dimension
-  int iteration_contiguous_;
-
-  /// Iteration in the strided dimension
-  int iteration_strided_;
-
- public:
-
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator(
-    TensorRef ref,  ///< Pointer to start of tensor
-    int thread_id   ///< ID of each participating thread
-  ): 
-    stride_(ref.stride(0) / Layout::kElementsPerAccess),
-    byte_offset_(0) {
-
-    layout::PitchLinearCoord thread_offset_base = ThreadMap::initial_offset(thread_id);
-
-    // This is the offset of a thread within a threadblock tile for a specific
-    // pointer (units of elements)
-    layout::PitchLinearCoord thread_offset_in_threadblock_tile = thread_offset_base;
-
-    // initialize pointer
-    pointer_ = reinterpret_cast<AccessType *>(ref.data() + ref.offset(thread_offset_in_threadblock_tile));
-
-    set_iteration_index(0);
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) {
-
-    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-
-    byte_offset_ += pointer_offset * sizeof(Element);
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-
-    AccessType *access_ptr = pointer_;
-
-    int access_offset = iteration_strided_ * ThreadMap::Delta::kStrided * stride_ +
-                        iteration_contiguous_ * ThreadMap::Delta::kContiguous /
-                            ThreadMap::kElementsPerAccess;
-
-    char *access_byte_ptr =
-        reinterpret_cast<char *>(access_ptr + access_offset);
-
-    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_);
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator &operator++() {
-    ++iteration_contiguous_;
-
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
-      return *this;
-
-    // Enter here only if (iteration_contiguous_ ==
-    // ThreadMap::Iteration::kContiguous)
-    iteration_contiguous_ = 0;
-    ++iteration_strided_;
-
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      return *this;
-    }
-
-    // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided)
-    // which means we enter the next tile.
-    iteration_strided_ = 0;
-
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator operator++(int) {
-
-    RegularTileAccessIterator prev(*this);
-
-    this->operator++();
-
-    return prev;
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-
-    add_pointer_offset(
-      coord.contiguous() * Shape::kContiguous + 
-      coord.strided() * Shape::kStrided * stride_ * Layout::kElementsPerAccess);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile Iterator specialized for column-major congruous TensorOp formats.
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, int Alignment>
-class RegularTileAccessIterator<
-    Shape_, Element_,
-    layout::ColumnMajorTensorOpMultiplicandCongruous128b,
-    AdvanceRank, ThreadMap_, Alignment> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for column-major iterator may along advance along the "
-      "columns(rank=0) or rows(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::ColumnMajorTensorOpMultiplicandCongruous128b;
-  static int const kAdvanceRank = AdvanceRank;
-  static int const kAlignment = Alignment;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  /// Underlying iterator type
-  using UnderlyingIterator = RegularTileAccessIterator<
-      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
-      layout::TensorOpMultiplicandCongruous128b,
-      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;
-
-  using AccessType = typename UnderlyingIterator::AccessType;
-
- private:
-  /// Underlying iterator
-  UnderlyingIterator iterator_;
-
- public:
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
-                            int thread_id   ///< ID of each participating thread
-                            )
-      : iterator_({ref.data(), ref.stride()}, thread_id) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return reinterpret_cast<AccessType *>(iterator_.get());
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    iterator_.add_tile_offset({coord.row(), coord.column()});
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator operator++(int) {
-    RegularTileAccessIterator prev(*this);
-    ++iterator_;
-
-    return prev;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile Iterator specialized for row-major congruous TensorOp formats.
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, int Alignment>
-class RegularTileAccessIterator<Shape_, Element_,
-                                layout::RowMajorTensorOpMultiplicandCongruous128b,
-                                AdvanceRank, ThreadMap_, Alignment> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for row-major iterator may along advance along the "
-      "columns(rank=0) or rows(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::RowMajorTensorOpMultiplicandCongruous128b;
-  static int const kAdvanceRank = AdvanceRank;
-  static int const kAlignment = Alignment;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  /// Underlying iterator type
-  using UnderlyingIterator = RegularTileAccessIterator<
-      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
-      layout::TensorOpMultiplicandCongruous128b,
-      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;
-
-  using AccessType = typename UnderlyingIterator::AccessType;
-
- private:
-  /// Underlying iterator
-  UnderlyingIterator iterator_;
-
- public:
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator(
-    TensorRef ref,  ///< Pointer to start of tensor
-    int thread_id   ///< ID of each participating thread
-  ):
-    iterator_({ref.data(), ref.stride()}, thread_id) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return reinterpret_cast<AccessType *>(iterator_.get());
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    iterator_.add_tile_offset({coord.column(), coord.row()});
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator operator++(int) {
-    RegularTileAccessIterator prev(*this);
-    ++iterator_;
-
-    return prev;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Tile iterator specialized for congruous arrangements for TensorOps
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, int Alignment>
-class RegularTileAccessIterator<
-    Shape_, Element_,
-    layout::TensorOpMultiplicandCrosswise128x4,
-    AdvanceRank, ThreadMap_, Alignment> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::TensorOpMultiplicandCrosswise128x4;
-  static int const kAdvanceRank = AdvanceRank;
-  static int const kAlignment = Alignment;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  using StrideIndex = typename Layout::Stride::Index;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  static_assert(ThreadMap::kThreads / 32 > 1, 
-    "This tile iterator requires at least two warps.");
-
-  /// Internal details made public to facilitate introspection
-  struct Detail {
-    /// This iterator is specialized for an access size that is 128 bits in
-    /// length.
-    static int const kAccessSizeInBits = 128;
-
-    static_assert(sizeof_bits<Element_>::value *
-                          ThreadMap::kElementsPerAccess ==
-                      kAccessSizeInBits,
-                  "This iterator requires a policy whose access size is 128b");
-
-    ///< Number of pointers
-    static int const kPointerCount = 1;
-  };
-
-
-  static_assert(!(ThreadMap::Iterations::kStrided % 2), "This iterator requires at least two iterations along the strided dimension");
-
-  /// Element type per access
-  using AccessType = Array<Element, Layout::kElementsPerAccess>;
-
- private:
-  //
-  // Data members
-  //
-
-  /// Stride value
-  StrideIndex stride_;
-
-  /// Internal pointer to first access of tile
-  AccessType *pointer_;
-
-  /// Internal byte offset
-  Index byte_offset_;
-
-  /// Iteration in the contiguous dimension
-  int iteration_contiguous_;
-
-  /// Iteration in the strided dimension
-  int iteration_strided_;
-
- public:
-
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_DEVICE
-  RegularTileAccessIterator(
-    TensorRef ref,  ///< Pointer to start of tensor
-    int thread_id   ///< ID of each participating thread
-  ): 
-    stride_(ref.stride(0) / Layout::kElementsPerAccess),
-    byte_offset_(0) {
-
-    layout::PitchLinearCoord thread_offset_base = ThreadMap::initial_offset(thread_id);
-
-    // This is the offset of a thread within a threadblock tile for a specific
-    // pointer (units of elements)
-    layout::PitchLinearCoord thread_offset_in_threadblock_tile = thread_offset_base;
-
-    // initialize pointer
-    pointer_ = reinterpret_cast<AccessType *>(ref.data() + ref.offset(thread_offset_in_threadblock_tile));
-
-    set_iteration_index(0);
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) {
-
-    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
-    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-
-    byte_offset_ += pointer_offset * sizeof(Element);
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-
-    AccessType *access_ptr = pointer_;
-
-    int offset_c = (iteration_contiguous_ * ThreadMap::Delta::kContiguous + (iteration_strided_ & 1) * 2);
-    int offset_s = (iteration_strided_ / 2) * 8;
-
-    int access_offset = offset_c * stride_ + offset_s;
-
-    char *access_byte_ptr =
-        reinterpret_cast<char *>(access_ptr + access_offset);
-
-    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_);
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator &operator++() {
-    ++iteration_contiguous_;
-
-    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
-      return *this;
-
-    // Enter here only if (iteration_contiguous_ ==
-    // ThreadMap::Iteration::kContiguous)
-    iteration_contiguous_ = 0;
-    ++iteration_strided_;
-
-    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      return *this;
-    }
-
-    // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided)
-    // which means we enter the next tile.
-    iteration_strided_ = 0;
-
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator operator++(int) {
-
-    RegularTileAccessIterator prev(*this);
-
-    this->operator++();
-
-    return prev;
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-
-    add_pointer_offset(
-      coord.contiguous() * Shape::kContiguous * stride_ + 
-      coord.strided() * Shape::kStrided * Layout::kElementsPerAccess);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile Iterator specialized for column-major congruous TensorOp formats.
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, int Alignment>
-class RegularTileAccessIterator<
-    Shape_, Element_,
-    layout::ColumnMajorTensorOpMultiplicandCrosswise128x4,
-    AdvanceRank, ThreadMap_, Alignment> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for column-major iterator may along advance along the "
-      "columns(rank=0) or rows(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::ColumnMajorTensorOpMultiplicandCrosswise128x4;
-  static int const kAdvanceRank = AdvanceRank;
-  static int const kAlignment = Alignment;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  /// Underlying iterator type
-  using UnderlyingIterator = RegularTileAccessIterator<
-      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
-      layout::TensorOpMultiplicandCrosswise128x4,
-      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;
-
-  using AccessType = typename UnderlyingIterator::AccessType;
-
- private:
-  /// Underlying iterator
-  UnderlyingIterator iterator_;
-
- public:
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
-                            int thread_id   ///< ID of each participating thread
-                            )
-      : iterator_({ref.data(), ref.stride()}, thread_id) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return reinterpret_cast<AccessType *>(iterator_.get());
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    iterator_.add_tile_offset({coord.row(), coord.column()});
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator operator++(int) {
-    RegularTileAccessIterator prev(*this);
-    ++iterator_;
-
-    return prev;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile Iterator specialized for row-major congruous TensorOp formats.
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, int Alignment>
-class RegularTileAccessIterator<Shape_, Element_,
-                                layout::RowMajorTensorOpMultiplicandCrosswise128x4,
-                                AdvanceRank, ThreadMap_, Alignment> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for row-major iterator may along advance along the "
-      "columns(rank=0) or rows(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::RowMajorTensorOpMultiplicandCrosswise128x4;
-  static int const kAdvanceRank = AdvanceRank;
-  static int const kAlignment = Alignment;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  /// Underlying iterator type
-  using UnderlyingIterator = RegularTileAccessIterator<
-      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
-      layout::TensorOpMultiplicandCrosswise128x4,
-      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;
-
-  using AccessType = typename UnderlyingIterator::AccessType;
-
- private:
-  /// Underlying iterator
-  UnderlyingIterator iterator_;
-
- public:
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator(
-    TensorRef ref,  ///< Pointer to start of tensor
-    int thread_id   ///< ID of each participating thread
-  ):
-    iterator_({ref.data(), ref.stride()}, thread_id) {}
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return reinterpret_cast<AccessType *>(iterator_.get());
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    iterator_.add_tile_offset({coord.column(), coord.row()});
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileAccessIterator operator++(int) {
-    RegularTileAccessIterator prev(*this);
-    ++iterator_;
-
-    return prev;
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace threadblock
-}  // namespace transform
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator.h
deleted file mode 100755
index d09c23892..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing storing of tiles from pitch-linear rank=2 tensors. 
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace transform {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-  typename Shape,
-  typename Element,
-  typename Layout,
-  int AdvanceRank,
-  typename ThreadMap,
-  int Alignment = sizeof_bits<Element>::value * ThreadMap::kElementsPerAccess / 8
->
-class RegularTileIterator;
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace transform
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h
deleted file mode 100755
index 1e04c4262..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h
+++ /dev/null
@@ -1,552 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing loading of tiles from pitch-linear rank=2 tensors. 
-
-    This iterator uses masks to guard out-of-bounds accesses and visits the last "residue" tile
-    first, with the objective of minimizing predicate mask updates during steady-state operation.
-
-    A precomputed "Params" object minimizes the amount of state that must be stored in registers,
-    and integer addition is used to advance the pointer through memory.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/pitch_linear.h"
-
-#include "cutlass/transform/threadblock/regular_tile_iterator.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace transform {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Regular tile iterator specialized for pitch-linear.  This one is used by 2-stage SIMT kernels
-/// and sparse tensor core meta data.
-template <
-  typename Shape_,
-  typename Element_,
-  int AdvanceRank,
-  typename ThreadMap_,
-  int Alignment
->
-class RegularTileIterator<Shape_, Element_, layout::PitchLinear, AdvanceRank, ThreadMap_, Alignment> {
-public:
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::PitchLinear;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-  static int const kAlignment = Alignment;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  using StrideIndex = typename Layout::Stride::Index;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Fragment = Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
-  
-  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess, kAlignment>;
-
-  static_assert(kAdvanceRank == 0 || kAdvanceRank == 1, 
-    "Advance rank may only be along the contiguous or strided dimensions.");
-
-private:
-
-  //
-  // Types
-  //
-
-  //
-  // Data members
-  //
-
-  /// Pointer to memory
-  uint8_t *pointer_;
-
-  /// Stride quantity
-  StrideIndex stride_;
-
-  /// Amount to increment pointer along strided dimension
-  Index increment_strided_;
-
-  /// Amount to advance pointer between tiles
-  Index increment_advance_;
-
-public:
-
-  CUTLASS_DEVICE
-  RegularTileIterator(): pointer_(nullptr), increment_strided_(0), increment_advance_(0) { }
-
-  CUTLASS_DEVICE
-  RegularTileIterator(
-    TensorRef const &ref, 
-    int thread_idx
-  ): 
-    pointer_(reinterpret_cast<uint8_t *>(ref.data()) + (ref.offset(ThreadMap::initial_offset(thread_idx)) * sizeof_bits<Element>::value / 8)) {
-    
-    stride_ = ref.stride()[0];
-    increment_strided_ = (ref.stride()[0] * sizeof_bits<Element>::value) * ThreadMap::Delta::kStrided / 8;
-    
-    increment_advance_ = 
-      (kAdvanceRank == 0 ? 
-        Shape::kContiguous * sizeof_bits<Element>::value / 8 : 
-        Shape::kStrided * (ref.stride()[0] * sizeof_bits<Element>::value / 8));
-  }
-
-  /// Loads a fragment
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-    uint8_t const *byte_pointer = pointer_ + pointer_offset * sizeof_bits<Element>::value / 8;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-
-      AccessType const *access_ptr = reinterpret_cast<AccessType const *>(byte_pointer);
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-
-        int idx = c + s * ThreadMap::Iterations::kContiguous;
-        frag_ptr[idx] = access_ptr[c * ThreadMap::Delta::kContiguous /
-                                   ThreadMap::kElementsPerAccess];
-      }
-
-      if (s + 1 < ThreadMap::Iterations::kStrided) {
-        byte_pointer += increment_strided_;
-      }
-    }
-  }
-
-  /// Loads a fragment
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag, TensorCoord const & tile_offset) {
-    load_with_pointer_offset(
-      frag, 
-      tile_offset.contiguous() * Shape::kContiguous / ThreadMap::kElementsPerAccess + 
-        tile_offset.strided() * Shape::kStrided * stride_
-    );
-  }
-
-  /// Loads a fragment
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) {
-    load_with_pointer_offset(frag, 0);
-  }
-
-  /// Stores a fragment
-  CUTLASS_HOST_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-
-    AccessType const *frag_ptr = reinterpret_cast<AccessType const*>(&frag);
-    uint8_t *byte_pointer = pointer_ + pointer_offset * sizeof_bits<Element>::value / 8;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-
-      AccessType *access_ptr = reinterpret_cast<AccessType *>(byte_pointer);
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-
-        int idx = c + s * ThreadMap::Iterations::kContiguous;
-        access_ptr[c * ThreadMap::Delta::kContiguous /
-                   ThreadMap::kElementsPerAccess] = frag_ptr[idx];
-      }
-
-      if (s + 1 < ThreadMap::Iterations::kStrided) {
-        byte_pointer += increment_strided_;
-      }
-    }
-  }
-
-  /// Stores a fragment
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag, TensorCoord const & tile_offset) {
-    store_with_pointer_offset(
-      frag,
-      tile_offset.contiguous() * Shape::kContiguous + tile_offset.strided() * Shape::kStrided * stride_
-    );
-  }
-
-  /// Stores a fragment
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-
-  /// Advances the pointer
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator &operator++() {
-    pointer_ += increment_advance_;
-    return *this;
-  }
-
-  /// Advances the pointer
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator &operator--() {
-    pointer_ -= increment_advance_;
-    return *this;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    pointer_ += pointer_offset;
-  }
-
-  /// Adds a tile offset in the unit of tile.
-  /// In GEMM/Conv implementation, this is used to move in the k dimension in the shared memory.
-  /// Below layouts are the shared memory layouts.  Current SM50 SIMT kernels only use col major A and row major B.
-  ///   For row major A operand, k dimension is contiguous dimension;
-  ///   For col major A operand, k dimension is strided dimension;
-  ///   For row major B operand, k dimension is strided dimension;
-  ///   For col major B operand, k dimension is contiguous dimension.
-  /// Below two classes map col/row major to the pitch linear coordinates used
-  /// in this base class.
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    int offset = sizeof_bits<Element>::value *
-        (coord.contiguous() * Shape::kContiguous + coord.strided() * Shape::kStrided * stride_) / 8;
-    add_pointer_offset(offset);
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) {
-  }
-
-    /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-#if 0
-    AccessType *access_ptr = pointer_[iteration_strided_ & 1];
-    int stride_idx = (iteration_strided_ & ~1);
-
-    int access_offset = stride_idx * ThreadMap::Delta::kStrided * stride_ +
-                        iteration_contiguous_ * ThreadMap::Delta::kContiguous /
-                            ThreadMap::kElementsPerAccess;
-
-    char *access_byte_ptr =
-        reinterpret_cast<char *>(access_ptr + access_offset);
-    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_);
-#endif
-    return reinterpret_cast<AccessType *>(pointer_);
-  }
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Regular tile iterator specialized for row major 
-template <
-  typename Shape_,
-  typename Element_,
-  int AdvanceRank,
-  typename ThreadMap_,
-  int Alignment
->
-class RegularTileIterator<Shape_, Element_, layout::RowMajor, AdvanceRank, ThreadMap_, Alignment> {
-public:
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::RowMajor;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-  static int const kAlignment = Alignment;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Fragment = Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
-
-  using Underlying = RegularTileIterator<
-    layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
-    Element,
-    layout::PitchLinear,
-    (kAdvanceRank == 0 ? 1 : 0),
-    ThreadMap,
-    kAlignment
-  >;
-
-  using AccessType = typename Underlying::AccessType;
-
-  static_assert(kAdvanceRank == 0 || kAdvanceRank == 1, 
-    "Advance rank may only be along the row or column dimensions.");
-
-private:
-
-  Underlying iterator_;
-
-public:
-
-  CUTLASS_DEVICE
-  RegularTileIterator() { }
-
-  CUTLASS_DEVICE
-  RegularTileIterator(
-    TensorRef const &ref, 
-    int thread_idx
-  ):
-    iterator_({ref.data(), ref.stride()}, thread_idx) {
-
-  }
-
-  /// Loads a fragment
-  CUTLASS_HOST_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag, TensorCoord const & tile_offset) {
-    iterator_.load_with_pointer_offset(frag, {tile_offset.column(), tile_offset.row()});
-  }
-
-  /// Loads a fragment
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) {
-    iterator_.load_with_pointer_offset(frag, 0);
-  }
-
-  /// Stores a fragment
-  CUTLASS_HOST_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-    iterator_.store_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Stores a fragment
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag, TensorCoord const & tile_offset) {
-    iterator_.store_with_pointer_offset(frag, {tile_offset.column(), tile_offset.row()});
-  }
-
-  /// Stores a fragment
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag) {
-    iterator_.store_with_pointer_offset(frag, 0);
-  }
-
-  /// Advances the pointer
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances the pointer
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator &operator--() {
-    --iterator_;
-    return *this;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    iterator_.add_tile_offset({coord.column(), coord.row()});
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) {
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return iterator_.get();
-  }
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Regular tile iterator specialized for pitch-linear
-template <
-  typename Shape_,
-  typename Element_,
-  int AdvanceRank,
-  typename ThreadMap_,
-  int Alignment
->
-class RegularTileIterator<Shape_, Element_, layout::ColumnMajor, AdvanceRank, ThreadMap_, Alignment> {
-public:
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::ColumnMajor;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-  static int const kAlignment = Alignment;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Fragment = Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
-
-  using Underlying = RegularTileIterator<
-    layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
-    Element,
-    layout::PitchLinear,
-    (kAdvanceRank == 0 ? 0 : 1),
-    ThreadMap
-  >;
-
-  using AccessType = typename Underlying::AccessType;
-
-  static_assert(kAdvanceRank == 0 || kAdvanceRank == 1, 
-    "Advance rank may only be along the row or column dimensions.");
-
-private:
-
-  Underlying iterator_;
-
-public:
-
-  CUTLASS_DEVICE
-  RegularTileIterator() { }
-
-  CUTLASS_DEVICE
-  RegularTileIterator(
-    TensorRef const &ref, 
-    int thread_idx
-  ):
-    iterator_({ref.data(), ref.stride()}, thread_idx) {
-
-  }
-
-  /// Loads a fragment
-  CUTLASS_HOST_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag, TensorCoord const & tile_offset) {
-    iterator_.load_with_pointer_offset(frag, {tile_offset.row(), tile_offset.column()});
-  }
-
-  /// Loads a fragment
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) {
-    iterator_.load_with_pointer_offset(frag, 0);
-  }
-
-  /// Stores a fragment
-  CUTLASS_HOST_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-    iterator_.store_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Stores a fragment
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag, TensorCoord const & tile_offset) {
-    iterator_.store_with_pointer_offset(frag, {tile_offset.row(), tile_offset.column()});
-  }
-
-  /// Stores a fragment
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag) {
-    iterator_.store_with_pointer_offset(frag, 0);
-  }
-
-  /// Advances the pointer
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances the pointer
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator &operator--() {
-    --iterator_;
-    return *this;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    iterator_.add_tile_offset({coord.row(), coord.column()});
-  }
-
-  /// Overrides the internal iteration index
-  CUTLASS_HOST_DEVICE
-  void set_iteration_index(int index) {
-  }
-
-  /// Returns a pointer
-  CUTLASS_HOST_DEVICE
-  AccessType *get() const {
-    return iterator_.get();
-  }
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace transform
-} // namespace cutlass
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear_2dthreadtile.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear_2dthreadtile.h
deleted file mode 100755
index 7fd495984..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear_2dthreadtile.h
+++ /dev/null
@@ -1,509 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing loading of tiles from pitch-linear rank=2 tensors. 
-
-    This iterator uses masks to guard out-of-bounds accesses and visits the last "residue" tile
-    first, with the objective of minimizing predicate mask updates during steady-state operation.
-
-    A precomputed "Params" object minimizes the amount of state that must be stored in registers,
-    and integer addition is used to advance the pointer through memory.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/pitch_linear.h"
-
-#include "cutlass/transform/threadblock/regular_tile_iterator.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace transform {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-template <
-  typename Shape,
-  typename Element,
-  typename Layout,
-  int AdvanceRank,
-  typename ThreadMap,
-  int Alignment = sizeof_bits<Element>::value * ThreadMap::kElementsPerAccess / 8
->
-class RegularTileIterator2dThreadTile;
-
-
-/// Regular tile iterator specialized for pitch-linear + 2d thread-tiled threadmapping
-template <
-  typename Shape_,
-  typename Element_,
-  int AdvanceRank,
-  typename ThreadMap_,
-  int Alignment
->
-class RegularTileIterator2dThreadTile<Shape_, Element_, layout::PitchLinear, AdvanceRank, ThreadMap_, Alignment> {
-public:
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::PitchLinear;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-  static int const kAlignment = Alignment;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  using StrideIndex = typename Layout::Stride::Index;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Fragment = Array<Element, ThreadMap::Iterations::kCount * ThreadMap::ThreadAccessShape::kCount>;
-
-  static_assert(kAdvanceRank == 0 || kAdvanceRank == 1, 
-    "Advance rank may only be along the contiguous or strided dimensions.");
-
-private:
-
-  //
-  // Types
-  //
-  
-  using AccessType = AlignedArray<Element, ThreadMap::ThreadAccessShape::kCount, kAlignment>;
-
-  //
-  // Data members
-  //
-
-  /// Pointer to memory
-  uint8_t *pointer_;
-
-  /// Stride quantity
-  StrideIndex stride_;
-
-  /// Amount to increment pointer along strided dimension
-  LongIndex increment_strided_;
-
-  /// Amount to advance pointer between tiles
-  LongIndex increment_advance_;
-
-public:
-
-  CUTLASS_DEVICE
-  RegularTileIterator2dThreadTile(): pointer_(nullptr), increment_strided_(0), increment_advance_(0) { }
-
-  CUTLASS_DEVICE
-  RegularTileIterator2dThreadTile(
-    TensorRef const &ref, 
-    int thread_idx,
-    int interleave
-  ){ 
-    
-    TensorCoord t = ThreadMap::initial_offset(thread_idx);
-    long int offset = t[0] * interleave + t[1] * ref.stride()[0]/interleave;
-    pointer_ = reinterpret_cast<uint8_t *>(ref.data() + offset);
-
-    stride_ = ref.stride()[0] / interleave;
-    increment_strided_ = (ref.stride()[0] * sizeof_bits<Element>::value / 8) * ThreadMap::Delta::kStrided / interleave;
-
-    increment_advance_ = 
-      (kAdvanceRank == 0 ? 
-        Shape::kContiguous * sizeof_bits<Element>::value / 8 : 
-        Shape::kStrided * (ref.stride()[0] * sizeof_bits<Element>::value / 8) / interleave);
-  }
-
-  /// Loads a fragment
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-    uint8_t const *byte_pointer = pointer_ + pointer_offset * sizeof_bits<Element>::value / 8;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-
-      AccessType const *access_ptr = reinterpret_cast<AccessType const *>(byte_pointer);
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-
-          int idx = c + s * ThreadMap::Iterations::kContiguous;
-           frag_ptr[idx] = access_ptr[c * ThreadMap::Delta::kContiguous / ThreadMap::ThreadAccessShape::kStrided];
-        }
-
-      if (s + 1 < ThreadMap::Iterations::kStrided) {
-        byte_pointer += increment_strided_;
-      }
-    }
-  }
-
-  /// Loads a fragment
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag, TensorCoord const & tile_offset) {
-    load_with_pointer_offset(
-      frag, 
-      tile_offset.contiguous() * Shape::kContiguous / ThreadMap::kElementsPerAccess + 
-        tile_offset.strided() * Shape::kStrided * stride_
-    );
-  }
-
-  /// Loads a fragment
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) {
-    load_with_pointer_offset(frag, 0);
-  }
-
-  /// Stores a fragment
-  CUTLASS_HOST_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-
-    AccessType const *frag_ptr = reinterpret_cast<AccessType const*>(&frag);
-    uint8_t *byte_pointer = pointer_ + pointer_offset * sizeof_bits<Element>::value / 8;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-
-      AccessType *access_ptr = reinterpret_cast<AccessType *>(byte_pointer);
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-
-          int idx = c + s * ThreadMap::Iterations::kContiguous;
-          access_ptr[c * ThreadMap::Delta::kContiguous / ThreadMap::ThreadAccessShape::kStrided] = frag_ptr[idx];
-      }
-
-      if (s + 1 < ThreadMap::Iterations::kStrided) {
-        byte_pointer += increment_strided_;
-      }
-    }
-  }
-
-  /// Stores a fragment
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag, TensorCoord const & tile_offset) {
-    store_with_pointer_offset(
-      frag,
-      tile_offset.contiguous() * Shape::kContiguous + tile_offset.strided() * Shape::kStrided * stride_
-    );
-  }
-
-  /// Stores a fragment
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-
-  /// Advances the pointer
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator2dThreadTile &operator++() {
-    pointer_ += increment_advance_;
-    return *this;
-  }
-
-  /// Advances the pointer
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator2dThreadTile &operator--() {
-    pointer_ -= increment_advance_;
-    return *this;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    pointer_ += pointer_offset;
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    int offset = sizeof_bits<Element>::value *
-        (coord.contiguous() * Shape::kContiguous + coord.strided() * Shape::kStrided * stride_) / 8;
-    add_pointer_offset(offset);
-  }
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Regular tile iterator specialized for interleaved layout + 2d thread-tiled threadmapping
-template <
-  typename Shape_,
-  typename Element_,
-  int AdvanceRank,
-  typename ThreadMap_,
-  int Alignment
->
-class RegularTileIterator2dThreadTile<Shape_, Element_, layout::RowMajorInterleaved<4>, AdvanceRank, ThreadMap_, Alignment> {
-public:
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::RowMajorInterleaved<4>;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-  static int const kAlignment = Alignment;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Fragment = Array<Element, ThreadMap::Iterations::kCount * ThreadMap::ThreadAccessShape::kCount>;
-
-  using Underlying = RegularTileIterator2dThreadTile<
-    layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
-    Element,
-    layout::PitchLinear,
-    (kAdvanceRank == 0 ? 1 : 0),
-    ThreadMap,
-    kAlignment
-  >;
-
-  static_assert(kAdvanceRank == 0 || kAdvanceRank == 1, 
-    "Advance rank may only be along the row or column dimensions.");
-
-private:
-
-  Underlying iterator_;
-
-public:
-
-  CUTLASS_DEVICE
-  RegularTileIterator2dThreadTile() { }
-
-  CUTLASS_DEVICE
-  RegularTileIterator2dThreadTile(
-    TensorRef const &ref, 
-    int thread_idx
-  ):
-    iterator_({ref.data(), ref.stride()}, thread_idx, 4) {
-
-  }
-
-  /// Loads a fragment
-  CUTLASS_HOST_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag, TensorCoord const & tile_offset) {
-    iterator_.load_with_pointer_offset(frag, {tile_offset.column(), tile_offset.row()});
-  }
-
-  /// Loads a fragment
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) {
-    iterator_.load_with_pointer_offset(frag, 0);
-  }
-
-  /// Stores a fragment
-  CUTLASS_HOST_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-    iterator_.store_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Stores a fragment
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag, TensorCoord const & tile_offset) {
-    iterator_.store_with_pointer_offset(frag, {tile_offset.column(), tile_offset.row()});
-  }
-
-  /// Stores a fragment
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag) {
-    iterator_.store_with_pointer_offset(frag, 0);
-  }
-
-  /// Advances the pointer
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator2dThreadTile &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances the pointer
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator2dThreadTile &operator--() {
-    --iterator_;
-    return *this;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    iterator_.add_tile_offset({coord.column(), coord.row()});
-  }
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Regular tile iterator specialized for interleaved layout + 2d thread-tiled threadmapping
-template <
-  typename Shape_,
-  typename Element_,
-  int AdvanceRank,
-  typename ThreadMap_,
-  int Alignment
->
-class RegularTileIterator2dThreadTile<Shape_, Element_, layout::ColumnMajorInterleaved<4>, AdvanceRank, ThreadMap_, Alignment> {
-public:
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::ColumnMajorInterleaved<4>;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-  static int const kAlignment = Alignment;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using Fragment = Array<Element, ThreadMap::Iterations::kCount * ThreadMap::ThreadAccessShape::kCount>;
-  using PitchLinearThreadMap = PitchLinearStripminedThreadMap< layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, 
-                                  ThreadMap::kThreads, ThreadMap::ThreadAccessShape::kCount >;
-                        
-
-  using Underlying = RegularTileIterator2dThreadTile<
-    layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
-    Element,
-    layout::PitchLinear,
-    (kAdvanceRank == 0 ? 0 : 1),
-    ThreadMap
-  >;
-
-  static_assert(kAdvanceRank == 0 || kAdvanceRank == 1, 
-    "Advance rank may only be along the row or column dimensions.");
-
-private:
-
-  Underlying iterator_;
-
-public:
-
-  CUTLASS_DEVICE
-  RegularTileIterator2dThreadTile() { }
-
-  CUTLASS_DEVICE
-  RegularTileIterator2dThreadTile(
-    TensorRef const &ref, 
-    int thread_idx
-  ):
-    iterator_({ref.data(), ref.stride()}, thread_idx, 4) {
-
-  }
-
-  /// Loads a fragment
-  CUTLASS_HOST_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag, TensorCoord const & tile_offset) {
-    iterator_.load_with_pointer_offset(frag, {tile_offset.row(), tile_offset.column()});
-  }
-
-  /// Loads a fragment
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) {
-    iterator_.load_with_pointer_offset(frag, 0);
-  }
-
-  /// Stores a fragment
-  CUTLASS_HOST_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-    iterator_.store_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Stores a fragment
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag, TensorCoord const & tile_offset) {
-    iterator_.store_with_pointer_offset(frag, {tile_offset.row(), tile_offset.column()});
-  }
-
-  /// Stores a fragment
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag) {
-    iterator_.store_with_pointer_offset(frag, 0);
-  }
-
-  /// Advances the pointer
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator2dThreadTile &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances the pointer
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator2dThreadTile &operator--() {
-    --iterator_;
-    return *this;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    iterator_.add_tile_offset({coord.row(), coord.column()});
-  }
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace transform
-} // namespace cutlass
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h
deleted file mode 100755
index 1308f45eb..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h
+++ /dev/null
@@ -1,1107 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing storing of tiles from pitch-linear rank=2 tensors. 
-*/
-
-#pragma once
-
-#include "cutlass/transform/threadblock/regular_tile_iterator.h"
-#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h"
-
-////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace transform {
-namespace threadblock {
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile iterator specialized for congruous arrangements for TensorOps
-///
-///
-/// Satisfies: ForwardTileIteratorConcept | 
-///            ReadableContiguousTileIteratorConcept | 
-///            WriteableContiguousTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, int Alignment, int Crosswise>
-class RegularTileIterator<
-    Shape_, Element_,
-    layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
-                                          Crosswise>,
-    AdvanceRank, ThreadMap_, Alignment> {
- public:
-
-  static_assert(AdvanceRank == 0 || AdvanceRank == 1, 
-    "Specialization for pitch-linear iterator may along advance along the "
-    "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout =
-      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
-                                            Crosswise>;
-  static int const kAdvanceRank = AdvanceRank;
-  static int const kAlignment = Alignment;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  /// Internal details made public to facilitate introspection
-  struct Detail {
-
-    /// This iterator is specialized for an access size that is 128 bits in length.
-    static int const kAccessSizeInBits = 128;
-
-    static_assert(
-      sizeof_bits<Element_>::value * ThreadMap::kElementsPerAccess == kAccessSizeInBits,
-      "This iterator requires a policy whose access size is 128bs");
-  };
-
-private:
-
-  /// Element type per access
-  using AccessType = Array<Element, Layout::kElementsPerAccess>;
-
-public:
-
-  /// Fragment object to be loaded or stored
-  using Fragment = Array<Element, ThreadMap::Iterations::kCount * Layout::kElementsPerAccess>;
-
-  /// Underlying iterator to compute the addresses
-  using TileAccessIterator = RegularTileAccessIterator<Shape, Element, Layout,
-                                                       kAdvanceRank, ThreadMap>;
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Data member to the tile access iterator
-  TileAccessIterator address_iterator_;
-
-public:
-
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator(TensorRef ref,  ///< Pointer to start of tensor
-                      int thread_id   ///< ID of each participating thread
-                      )
-      : address_iterator_(ref, thread_id) {}
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    address_iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator &operator++() {
-    address_iterator_.add_tile_offset({0, 1});
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator operator++(int) {
-    RegularTileIterator prev(*this);
-    this->operator++();
-
-    return prev;
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    address_iterator_.add_tile_offset(coord);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    load_with_byte_offset(frag, pointer_offset * sizeof_bits<Element>::value / 8);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_byte_offset(Fragment &frag, Index byte_offset) {
-    address_iterator_.set_iteration_index(0);
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-        int access_idx = c + s * ThreadMap::Iterations::kContiguous;
-
-        char const *byte_ptr = reinterpret_cast<char const *>(address_iterator_.get()) + byte_offset;
-        AccessType const *access_ptr = reinterpret_cast<AccessType const *>(byte_ptr);
-
-        frag_ptr[access_idx] = *access_ptr;
-        ++address_iterator_;
-      }
-    }
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) {
-    load_with_pointer_offset(frag, 0);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-    store_with_byte_offset(frag, pointer_offset * sizeof_bits<Element>::value / 8);
-  }
-
-  CUTLASS_DEVICE
-  void store_with_byte_offset(Fragment const &frag, Index byte_offset) {  
-    address_iterator_.set_iteration_index(0);
-    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-        int access_idx = c + s * ThreadMap::Iterations::kContiguous;
-
-        char *byte_ptr = reinterpret_cast<char *>(address_iterator_.get()) + byte_offset;
-        AccessType *access_ptr = reinterpret_cast<AccessType *>(byte_ptr);
-
-        *access_ptr = frag_ptr[access_idx];
-        ++address_iterator_;
-      }
-    }
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) {
-    store_with_byte_offset(frag, 0);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile Iterator specialized for column-major congruous TensorOp formats.
-///
-///
-/// Satisfies: ForwardTileIteratorConcept | 
-///            ReadableContiguousTileIteratorConcept | 
-///            WriteableContiguousTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, int Alignment, int Crosswise>
-class RegularTileIterator<
-    Shape_, Element_,
-    layout::ColumnMajorTensorOpMultiplicandCongruous<
-        sizeof_bits<Element_>::value, Crosswise>,
-    AdvanceRank, ThreadMap_, Alignment> {
- public:
-
-  static_assert(AdvanceRank == 0 || AdvanceRank == 1, 
-    "Specialization for column-major iterator may along advance along the "
-    "columns(rank=0) or rows(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::ColumnMajorTensorOpMultiplicandCongruous<
-      sizeof_bits<Element_>::value, Crosswise>;
-  static int const kAdvanceRank = AdvanceRank;
-  static int const kAlignment = Alignment;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  /// Underlying iterator type
-  using UnderlyingIterator = RegularTileIterator<
-      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
-      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
-                                            Crosswise>,
-      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;
-
- public:
-
-  /// Fragment object to be loaded or stored
-  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
-
-private:
-
-  /// Underlying iterator
-  UnderlyingIterator iterator_;
-
-public:
-
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator(
-    TensorRef ref,                              ///< Pointer to start of tensor
-    int thread_id                               ///< ID of each participating thread
-  ): iterator_({ref.data(), ref.stride()}, thread_id) {
-
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    iterator_.add_tile_offset({coord.row(), coord.column()});
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator operator++(int) {
-    RegularTileIterator prev(*this);
-    ++iterator_;
-
-    return prev;
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) {
-    load_with_pointer_offset(frag, 0);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(
-    Fragment const &frag, 
-    Index pointer_offset) {
-    
-    iterator_.store_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile Iterator specialized for row-major congruous TensorOp formats.
-///
-///
-/// Satisfies: ForwardTileIteratorConcept | 
-///            ReadableContiguousTileIteratorConcept | 
-///            WriteableContiguousTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, int Alignment, int Crosswise>
-class RegularTileIterator<
-    Shape_, Element_,
-    layout::RowMajorTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
-                                                  Crosswise>,
-    AdvanceRank, ThreadMap_, Alignment> {
- public:
-
-  static_assert(AdvanceRank == 0 || AdvanceRank == 1, 
-    "Specialization for row-major iterator may along advance along the "
-    "columns(rank=0) or rows(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::RowMajorTensorOpMultiplicandCongruous<
-      sizeof_bits<Element_>::value, Crosswise>;
-  static int const kAdvanceRank = AdvanceRank;
-  static int const kAlignment = Alignment;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  /// Underlying iterator type
-  using UnderlyingIterator = RegularTileIterator<
-      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
-      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
-                                            Crosswise>,
-      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;
-
- public:
-
-  /// Fragment object to be loaded or stored
-  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
-
-private:
-
-  /// Underlying iterator
-  UnderlyingIterator iterator_;
-
-public:
-
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator(
-    TensorRef ref,                              ///< Pointer to start of tensor
-    int thread_id                               ///< ID of each participating thread
-  ): iterator_({ref.data(), ref.stride()}, thread_id) {
-
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-  
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    iterator_.add_tile_offset({coord.column(), coord.row()});
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator &operator++() {
-
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator operator++(int) {
-
-    RegularTileIterator prev(*this);
-    ++iterator_;
-
-    return prev;
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) {
-    load_with_pointer_offset(frag, 0);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(
-    Fragment const &frag, 
-    Index pointer_offset) {
-    
-    iterator_.store_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile iterator specialized for crosswise arrangements for TensorOps
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, int Alignment, int Crosswise>
-class RegularTileIterator<Shape_, Element_,
-                          layout::TensorOpMultiplicandCrosswise<
-                              sizeof_bits<Element_>::value, Crosswise>,
-                          AdvanceRank, ThreadMap_, Alignment> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout =
-      layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
-                                            Crosswise>;
-
-  static int const kAdvanceRank = AdvanceRank;
-  static int const kAlignment = Alignment;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  /// Internal details made public to facilitate introspection
-  struct Detail {
-    /// This iterator is specialized for an access size that is 128 bits in
-    /// length.
-    static int const kAccessSizeInBits = 128;
-
-    static_assert(sizeof_bits<Element_>::value * ThreadMap::kElementsPerAccess ==
-                      kAccessSizeInBits,
-                  "This iterator requires a policy whose access size is 128bs");
-  };
-
- private:
-  /// Element type per access
-  using AccessType = Array<Element, Layout::kElementsPerAccess>;
-
- public:
-  /// Fragment object to be loaded or stored
-  using Fragment =
-      Array<Element, ThreadMap::Iterations::kCount * Layout::kElementsPerAccess>;
-
-  /// Underlying iterator to compute the addresses
-  using TileAccessIterator = RegularTileAccessIterator<Shape, Element, Layout,
-                                                       kAdvanceRank, ThreadMap>;
-
- private:
-  //
-  // Data members
-  //
-
-  /// Data member to the tile access iterator
-  TileAccessIterator address_iterator_;
-
- public:
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator(TensorRef ref,  ///< Pointer to start of tensor
-                      int thread_id   ///< ID of each participating thread
-                      )
-      : address_iterator_(ref, thread_id) {}
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    address_iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator &operator++() {
-    address_iterator_.add_tile_offset({1, 0});
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator operator++(int) {
-    RegularTileIterator prev(*this);
-    this->operator++();
-
-    return prev;
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    address_iterator_.add_tile_offset(coord);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    address_iterator_.set_iteration_index(0);
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-        int access_idx = c + s * ThreadMap::Iterations::kContiguous;
-        frag_ptr[access_idx] = *(address_iterator_.get() + pointer_offset);
-        ++address_iterator_;
-      }
-    }
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-    store_with_byte_offset(frag, pointer_offset * sizeof_bits<Element>::value / 8);
-  }
-
-  CUTLASS_DEVICE
-  void store_with_byte_offset(Fragment const &frag, Index byte_offset) {  
-    address_iterator_.set_iteration_index(0);
-    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-        int access_idx = c + s * ThreadMap::Iterations::kContiguous;
-
-        char *byte_ptr = reinterpret_cast<char *>(address_iterator_.get()) + byte_offset;
-        AccessType *access_ptr = reinterpret_cast<AccessType *>(byte_ptr);
-
-        *access_ptr = frag_ptr[access_idx];
-        ++address_iterator_;
-      }
-    }
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile Iterator specialized for column-major crosswise TensorOp formats.
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, int Alignment, int Crosswise>
-class RegularTileIterator<Shape_, Element_,
-                          layout::ColumnMajorTensorOpMultiplicandCrosswise<
-                              sizeof_bits<Element_>::value, Crosswise>,
-                          AdvanceRank, ThreadMap_, Alignment> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for column-major iterator may along advance along the "
-      "columns(rank=0) or rows(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::ColumnMajorTensorOpMultiplicandCrosswise<
-      sizeof_bits<Element_>::value, Crosswise>;
-  static int const kAdvanceRank = AdvanceRank;
-  static int const kAlignment = Alignment;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  /// Underlying iterator type
-  using UnderlyingIterator = RegularTileIterator<
-      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
-      layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
-                                            Crosswise>,
-      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;
-
- public:
-  /// Fragment object to be loaded or stored
-  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
-
- private:
-  /// Underlying iterator
-  UnderlyingIterator iterator_;
-
- public:
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator(TensorRef ref,  ///< Pointer to start of tensor
-                      int thread_id   ///< ID of each participating thread
-                      )
-      : iterator_({ref.data(), ref.stride()}, thread_id) {}
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    iterator_.add_tile_offset({coord.row(), coord.column()});
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator operator++(int) {
-    RegularTileIterator prev(*this);
-    ++iterator_;
-
-    return prev;
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-    iterator_.store_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile Iterator specialized for row-major crosswise TensorOp formats.
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, int Alignment, int Crosswise>
-class RegularTileIterator<Shape_, Element_,
-                          layout::RowMajorTensorOpMultiplicandCrosswise<
-                              sizeof_bits<Element_>::value, Crosswise>,
-                          AdvanceRank, ThreadMap_, Alignment> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for row-major iterator may along advance along the "
-      "columns(rank=0) or rows(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::RowMajorTensorOpMultiplicandCrosswise<
-      sizeof_bits<Element_>::value, Crosswise>;
-  static int const kAdvanceRank = AdvanceRank;
-  static int const kAlignment = Alignment;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  /// Underlying iterator type
-  using UnderlyingIterator = RegularTileIterator<
-      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
-      layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
-                                            Crosswise>,
-      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;
-
- public:
-  /// Fragment object to be loaded or stored
-  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
-
- private:
-  /// Underlying iterator
-  UnderlyingIterator iterator_;
-
- public:
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator(TensorRef ref,  ///< Pointer to start of tensor
-                      int thread_id   ///< ID of each participating thread
-                      )
-      : iterator_({ref.data(), ref.stride()}, thread_id) {}
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    iterator_.add_tile_offset({coord.column(), coord.row()});
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator operator++(int) {
-    RegularTileIterator prev(*this);
-    ++iterator_;
-
-    return prev;
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-    iterator_.store_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile iterator specialized for k interleaved arrangements for TensorOps
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <typename Shape_, typename Element_, int AdvanceRank, typename ThreadMap_, int InterleavedK, int Alignment>
-class RegularTileIterator<
-    Shape_, Element_,
-    layout::TensorOpMultiplicandRowMajorInterleaved<sizeof_bits<Element_>::value,
-                                                    InterleavedK>,
-    AdvanceRank, ThreadMap_, Alignment> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout =
-      layout::TensorOpMultiplicandRowMajorInterleaved<sizeof_bits<Element_>::value,
-                                                      InterleavedK>;
-  static int const kAdvanceRank = AdvanceRank;
-  static int const kAlignment = Alignment;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  /// Internal details made public to facilitate introspection
-  struct Detail {
-    /// This iterator is specialized for an access size that is 128 bits in
-    /// length.
-    static int const kAccessSizeInBits = 128;
-
-    static_assert(sizeof_bits<Element_>::value * ThreadMap::kElementsPerAccess ==
-                      kAccessSizeInBits,
-                  "This iterator requires a policy whose access size is 128bs");
-  };
-
- private:
-
-  /// Element type per access
-  using AccessType = Array<Element, Layout::kElementsPerAccess>;
-
- public:
-  /// Fragment object to be loaded or stored
-  using Fragment =
-      Array<Element, ThreadMap::Iterations::kCount * Layout::kElementsPerAccess>;
-
-  /// Underlying iterator to compute the addresses
-  using TileAccessIterator = RegularTileAccessIterator<Shape, Element, Layout,
-                                                       kAdvanceRank, ThreadMap>;
-
- private:
-  //
-  // Data members
-  //
-
-  /// Data member to the tile access iterator
-  TileAccessIterator address_iterator_;
-
- public:
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator(TensorRef ref,  ///< Pointer to start of tensor
-                      int thread_id   ///< ID of each participating thread
-                      )
-       : address_iterator_(ref, thread_id) {}
- 
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    address_iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator &operator++() {
-    address_iterator_.add_pointer_offset(Shape::kCount);
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator operator++(int) {
-    RegularTileIterator prev(*this);
-    this->operator++();
-
-    return prev;
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    address_iterator_.add_pointer_offset(coord.contiguous() * Shape::kCount);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    address_iterator_.set_iteration_index(0);
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-        int access_idx = c + s * ThreadMap::Iterations::kContiguous;
-        frag_ptr[access_idx] = *(address_iterator_.get() + pointer_offset);
-        ++address_iterator_;
-      }
-    }
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-        int access_idx = c + s * ThreadMap::Iterations::kContiguous;
-        *(address_iterator_.get() + pointer_offset) = frag_ptr[access_idx];
-        ++address_iterator_;
-      }
-    }
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile iterator specialized for k interleaved arrangements for TensorOps
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-
-template <typename Shape_, typename Element_, int AdvanceRank, typename ThreadMap_, int InterleavedK, int Alignment>
-class RegularTileIterator<
-    Shape_, Element_,
-    layout::TensorOpMultiplicandColumnMajorInterleaved<sizeof_bits<Element_>::value,
-                                             InterleavedK>,
-    AdvanceRank, ThreadMap_, Alignment> {
-
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout =
-      layout::TensorOpMultiplicandColumnMajorInterleaved<sizeof_bits<Element_>::value,
-                                                         InterleavedK>;
-  static int const kAdvanceRank = AdvanceRank;
-  static int const kAlignment = Alignment;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  /// Underlying iterator type
-  using UnderlyingIterator = RegularTileIterator<
-    cutlass::MatrixShape<Shape::kColumn, Shape::kRow>,
-    Element,
-    layout::TensorOpMultiplicandRowMajorInterleaved<sizeof_bits<Element_>::value, InterleavedK>,
-    (kAdvanceRank == 1 ? 0 : 1),
-    ThreadMap
-  >;
-
- public:
-  /// Fragment object to be loaded or stored
-  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
-
- private:
-
-  /// Underlying iterator
-  UnderlyingIterator iterator_;
-
- public:
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator(TensorRef ref,  ///< Pointer to start of tensor
-                      int thread_id   ///< ID of each participating thread
-                      )
-       : iterator_({ref.data(), ref.stride()}, thread_id) {}
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator operator++(int) {
-    RegularTileIterator prev(*this);
-    ++iterator_;
-
-    return prev;
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    iterator_.add_tile_offset({coord.strided(), coord.contiguous()});
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-    iterator_.store_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace transform
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op_sm70.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op_sm70.h
deleted file mode 100755
index 81b774cf2..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op_sm70.h
+++ /dev/null
@@ -1,1460 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing loading of tiles from pitch-linear rank=2 tensors.
-
-    This iterator uses masks to guard out-of-bounds accesses and visits the last "residue" tile
-    first, with the objective of minimizing predicate mask updates during steady-state operation.
-
-    A precomputed "Params" object minimizes the amount of state that must be stored in registers,
-    and integer addition is used to advance the pointer through memory.
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/tensor_op_multiplicand_sm70.h"
-
-#include "cutlass/transform/threadblock/regular_tile_iterator.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace transform {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Tile iterator specialized for congruous arrangements for TensorOps
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <
-  typename Shape_,
-  typename Element_,
-  int AdvanceRank,
-  typename ThreadMap_,
-  int Alignment
->
-class RegularTileIterator<
-  Shape_,
-  Element_,
-  layout::VoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>,
-  AdvanceRank,
-  ThreadMap_,
-  Alignment> {
-public:
-
-  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
-    "Specialization for pitch-linear iterator may along advance along the "
-    "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::VoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>;
-  static int const kAdvanceRank = AdvanceRank;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  using StrideIndex = typename Layout::Stride::Index;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  /// Internal details made public to facilitate introspection
-  struct Detail {
-
-    /// This iterator is specialized for an access size that is 128 bits in length.
-    static int const kAccessSizeInBits = 128;
-
-    static_assert(
-      sizeof_bits<Element_>::value * ThreadMap::kElementsPerAccess == kAccessSizeInBits,
-      "This iterator requires a policy whose access size is 128bs");
-
-    ///< Number of pointers
-    static int const kPointerCount = (ThreadMap::Iterations::kStrided > 1 ? 2 : 1);
-  };
-
-
-private:
-
-  /// Element type per access
-  using AccessType = Array<Element, Layout::kElementsPerAccess>;
-
-public:
-
-  /// Fragment object to be loaded or stored
-  using Fragment = Array<Element, ThreadMap::Iterations::kCount * Layout::kElementsPerAccess>;
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Stride value
-  StrideIndex stride_;
-
-  /// Internal pointer to first access of tile
-  AccessType * pointer_[Detail::kPointerCount];
-
-  /// Internal byte offset
-  Index byte_offset_;
-
-public:
-
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator(
-    TensorRef ref,                              ///< Pointer to start of tensor
-    int thread_id                               ///< ID of each participating thread
-  ): stride_(ref.stride(0) / Layout::kElementsPerAccess), byte_offset_(0) {
-
-    layout::PitchLinearCoord thread_offset_base = ThreadMap::initial_offset(thread_id);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < Detail::kPointerCount; ++i) {
-
-      // This is the offset of a thread within a threadblock tile for a specific pointer
-      // (units of elements)
-      layout::PitchLinearCoord thread_offset_in_threadblock_tile =
-        thread_offset_base + layout::PitchLinearCoord{0, ThreadMap::Detail::WarpThreadArrangement::kStrided * i};
-
-      // initialize pointer
-      pointer_[i] = reinterpret_cast<AccessType *>(ref.data() + ref.offset(thread_offset_in_threadblock_tile));
-    }
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-
-    byte_offset_ += pointer_offset * sizeof(Element);
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator &operator++() {
-
-    add_pointer_offset((kAdvanceRank ? Shape::kStrided * stride_ * Layout::kElementsPerAccess : Shape::kContiguous));
-
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator operator++(int) {
-
-    RegularTileIterator prev(*this);
-    this->operator++();
-
-    return prev;
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    add_pointer_offset(
-      coord.contiguous() * Shape::kContiguous / ThreadMap::kElementsPerAccess +
-      coord.strided() * Shape::kStrided * stride_ * Layout::kElementsPerAccess
-    );
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    Index vec_pointer_offset = pointer_offset / ThreadMap::kElementsPerAccess;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-
-      AccessType *access_ptr = pointer_[s & 1];
-      int stride_idx = (s & ~1);
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-
-        int access_offset = stride_idx * ThreadMap::Delta::kStrided * stride_ +
-            c * ThreadMap::Delta::kContiguous / ThreadMap::kElementsPerAccess +
-            vec_pointer_offset;
-
-        int access_idx = c + s * ThreadMap::Iterations::kContiguous;
-
-        char const *access_byte_ptr = reinterpret_cast<char const *>(access_ptr + access_offset);
-
-        frag_ptr[access_idx] = *reinterpret_cast<AccessType const *>(access_byte_ptr + byte_offset_);
-      }
-    }
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) {
-    load_with_pointer_offset(frag, 0);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(
-    Fragment const &frag,
-    Index pointer_offset) {
-
-    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
-
-    Index vec_pointer_offset = pointer_offset / ThreadMap::kElementsPerAccess;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-
-      AccessType *access_ptr = pointer_[s & 1];
-      int stride_idx = (s & ~1);
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-
-        int access_offset = stride_idx * ThreadMap::Delta::kStrided * stride_ +
-          c * ThreadMap::Delta::kContiguous / ThreadMap::kElementsPerAccess +
-          vec_pointer_offset;
-
-        int access_idx = c + s * ThreadMap::Iterations::kContiguous;
-
-        char *access_byte_ptr = reinterpret_cast<char *>(access_ptr + access_offset);
-
-        *reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_) = frag_ptr[access_idx];
-      }
-    }
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Tile Iterator specialized for column-major congruous TensorOp formats.
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <
-  typename Shape_,
-  typename Element_,
-  int AdvanceRank,
-  typename ThreadMap_,
-  int Alignment
->
-class RegularTileIterator<
-  Shape_,
-  Element_,
-  layout::ColumnMajorVoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>,
-  AdvanceRank,
-  ThreadMap_,
-  Alignment> {
-public:
-
-  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
-    "Specialization for column-major iterator may along advance along the "
-    "columns(rank=0) or rows(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::ColumnMajorVoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>;
-  static int const kAdvanceRank = AdvanceRank;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  /// Underlying iterator type
-  using UnderlyingIterator = RegularTileIterator<
-    layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
-    Element,
-    layout::VoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>,
-    (kAdvanceRank == 0 ? 0 : 1),
-    ThreadMap_>;
-
-public:
-
-  /// Fragment object to be loaded or stored
-  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
-
-private:
-
-  /// Underlying iterator
-  UnderlyingIterator iterator_;
-
-public:
-
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator(
-    TensorRef ref,                              ///< Pointer to start of tensor
-    int thread_id                               ///< ID of each participating thread
-  ): iterator_({ref.data(), ref.stride()}, thread_id) {
-
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    iterator_.add_tile_offset({coord.row(), coord.column()});
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator &operator++() {
-
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator operator++(int) {
-
-    RegularTileIterator prev(*this);
-    ++iterator_;
-
-    return prev;
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) {
-    load_with_pointer_offset(frag, 0);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(
-    Fragment const &frag,
-    Index pointer_offset) {
-
-    iterator_.store_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-};
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Tile Iterator specialized for row-major congruous TensorOp formats.
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <
-  typename Shape_,
-  typename Element_,
-  int AdvanceRank,
-  typename ThreadMap_,
-  int Alignment
->
-class RegularTileIterator<
-  Shape_,
-  Element_,
-  layout::RowMajorVoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>,
-  AdvanceRank,
-  ThreadMap_,
-  Alignment> {
-public:
-
-  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
-    "Specialization for row-major iterator may along advance along the "
-    "columns(rank=0) or rows(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::RowMajorVoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>;
-  static int const kAdvanceRank = AdvanceRank;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  /// Underlying iterator type
-  using UnderlyingIterator = RegularTileIterator<
-    layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
-    Element,
-    layout::VoltaTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value>,
-    (kAdvanceRank == 0 ? 1 : 0),
-    ThreadMap_>;
-
-public:
-
-  /// Fragment object to be loaded or stored
-  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
-
-private:
-
-  /// Underlying iterator
-  UnderlyingIterator iterator_;
-
-public:
-
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator(
-    TensorRef ref,                              ///< Pointer to start of tensor
-    int thread_id                               ///< ID of each participating thread
-  ): iterator_({ref.data(), ref.stride()}, thread_id) {
-
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    iterator_.add_tile_offset({coord.column(), coord.row()});
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator &operator++() {
-
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator operator++(int) {
-
-    RegularTileIterator prev(*this);
-    ++iterator_;
-
-    return prev;
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) {
-    load_with_pointer_offset(frag, 0);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(
-    Fragment const &frag,
-    Index pointer_offset) {
-
-    iterator_.store_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-};
-/// Tile iterator specialized for congruous arrangements for TensorOps
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <
-  typename Shape_,
-  typename Element_,
-  int AdvanceRank,
-  typename ThreadMap_,
-  int Alignment
->
-class RegularTileIterator<
-  Shape_,
-  Element_,
-  layout::VoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>,
-  AdvanceRank,
-  ThreadMap_,
-  Alignment> {
-public:
-
-  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
-    "Specialization for pitch-linear iterator may along advance along the "
-    "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::VoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>;
-  static int const kAdvanceRank = AdvanceRank;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  using StrideIndex = typename Layout::Stride::Index;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  /// Internal details made public to facilitate introspection
-  struct Detail {
-
-    /// This iterator is specialized for an access size that is 128 bits in length.
-    static int const kAccessSizeInBits = 128;
-
-    static_assert(
-      sizeof_bits<Element_>::value * ThreadMap::kElementsPerAccess == kAccessSizeInBits,
-      "This iterator requires a policy whose access size is 128bs");
-
-    ///< Number of pointers
-    static int const kPointerCount = (ThreadMap::Iterations::kStrided > 1 ? 2 : 1);
-  };
-
-
-private:
-
-  /// Element type per access
-  using AccessType = Array<Element, Layout::kElementsPerAccess>;
-
-public:
-
-  /// Fragment object to be loaded or stored
-  using Fragment = Array<Element, ThreadMap::Iterations::kCount * Layout::kElementsPerAccess>;
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Stride value
-  StrideIndex stride_;
-
-  /// Internal pointer to first access of tile
-  AccessType * pointer_[Detail::kPointerCount];
-
-  /// Internal byte offset
-  Index byte_offset_;
-
-public:
-
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator(
-    TensorRef ref,                              ///< Pointer to start of tensor
-    int thread_id                               ///< ID of each participating thread
-  ): stride_(ref.stride(0) / Layout::kElementsPerAccess), byte_offset_(0) {
-
-    layout::PitchLinearCoord thread_offset_base = ThreadMap::initial_offset(thread_id);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < Detail::kPointerCount; ++i) {
-
-      // This is the offset of a thread within a threadblock tile for a specific pointer
-      // (units of elements)
-      layout::PitchLinearCoord thread_offset_in_threadblock_tile =
-        thread_offset_base + layout::PitchLinearCoord{0, ThreadMap::Detail::WarpThreadArrangement::kStrided * i};
-
-      // initialize pointer
-      pointer_[i] = reinterpret_cast<AccessType *>(ref.data() + ref.offset(thread_offset_in_threadblock_tile));
-    }
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-
-    byte_offset_ += pointer_offset * sizeof(Element);
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator &operator++() {
-
-    add_pointer_offset((kAdvanceRank ? Shape::kStrided * stride_ * Layout::kElementsPerAccess : Shape::kContiguous));
-
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator operator++(int) {
-
-    RegularTileIterator prev(*this);
-    this->operator++();
-
-    return prev;
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    add_pointer_offset(
-      coord.contiguous() * Shape::kContiguous / ThreadMap::kElementsPerAccess +
-      coord.strided() * Shape::kStrided * stride_ * Layout::kElementsPerAccess
-    );
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    Index vec_pointer_offset = pointer_offset / ThreadMap::kElementsPerAccess;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-
-      AccessType *access_ptr = pointer_[s & 1];
-      int stride_idx = (s & ~1);
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-
-        int access_offset = stride_idx * ThreadMap::Delta::kStrided * stride_ +
-            c * ThreadMap::Delta::kContiguous / ThreadMap::kElementsPerAccess +
-            vec_pointer_offset;
-
-        int access_idx = c + s * ThreadMap::Iterations::kContiguous;
-
-        char const *access_byte_ptr = reinterpret_cast<char const *>(access_ptr + access_offset);
-
-        frag_ptr[access_idx] = *reinterpret_cast<AccessType const *>(access_byte_ptr + byte_offset_);
-      }
-    }
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) {
-    load_with_pointer_offset(frag, 0);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(
-    Fragment const &frag,
-    Index pointer_offset) {
-
-    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
-
-    Index vec_pointer_offset = pointer_offset / ThreadMap::kElementsPerAccess;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-
-      AccessType *access_ptr = pointer_[s & 1];
-      int stride_idx = (s & ~1);
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-
-        int access_offset = stride_idx * ThreadMap::Delta::kStrided * stride_ +
-          c * ThreadMap::Delta::kContiguous / ThreadMap::kElementsPerAccess +
-          vec_pointer_offset;
-
-        int access_idx = c + s * ThreadMap::Iterations::kContiguous;
-
-        char *access_byte_ptr = reinterpret_cast<char *>(access_ptr + access_offset);
-
-        *reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_) = frag_ptr[access_idx];
-      }
-    }
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Tile Iterator specialized for column-major congruous TensorOp formats.
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <
-  typename Shape_,
-  typename Element_,
-  int AdvanceRank,
-  typename ThreadMap_,
-  int Alignment
->
-class RegularTileIterator<
-  Shape_,
-  Element_,
-  layout::ColumnMajorVoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>,
-  AdvanceRank,
-  ThreadMap_,
-  Alignment> {
-public:
-
-  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
-    "Specialization for column-major iterator may along advance along the "
-    "columns(rank=0) or rows(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::ColumnMajorVoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>;
-  static int const kAdvanceRank = AdvanceRank;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  /// Underlying iterator type
-  using UnderlyingIterator = RegularTileIterator<
-    layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
-    Element,
-    layout::VoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>,
-    (kAdvanceRank == 0 ? 0 : 1),
-    ThreadMap_>;
-
-public:
-
-  /// Fragment object to be loaded or stored
-  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
-
-private:
-
-  /// Underlying iterator
-  UnderlyingIterator iterator_;
-
-public:
-
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator(
-    TensorRef ref,                              ///< Pointer to start of tensor
-    int thread_id                               ///< ID of each participating thread
-  ): iterator_({ref.data(), ref.stride()}, thread_id) {
-
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    iterator_.add_tile_offset({coord.row(), coord.column()});
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator &operator++() {
-
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator operator++(int) {
-
-    RegularTileIterator prev(*this);
-    ++iterator_;
-
-    return prev;
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) {
-    load_with_pointer_offset(frag, 0);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(
-    Fragment const &frag,
-    Index pointer_offset) {
-
-    iterator_.store_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-};
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Tile Iterator specialized for row-major congruous TensorOp formats.
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <
-  typename Shape_,
-  typename Element_,
-  int AdvanceRank,
-  typename ThreadMap_,
-  int Alignment
->
-class RegularTileIterator<
-  Shape_,
-  Element_,
-  layout::RowMajorVoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>,
-  AdvanceRank,
-  ThreadMap_,
-  Alignment> {
-public:
-
-  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
-    "Specialization for row-major iterator may along advance along the "
-    "columns(rank=0) or rows(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::RowMajorVoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>;
-  static int const kAdvanceRank = AdvanceRank;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  /// Underlying iterator type
-  using UnderlyingIterator = RegularTileIterator<
-    layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
-    Element,
-    layout::VoltaTensorOpMultiplicandBCongruous<sizeof_bits<Element_>::value>,
-    (kAdvanceRank == 0 ? 1 : 0),
-    ThreadMap_>;
-
-public:
-
-  /// Fragment object to be loaded or stored
-  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
-
-private:
-
-  /// Underlying iterator
-  UnderlyingIterator iterator_;
-
-public:
-
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator(
-    TensorRef ref,                              ///< Pointer to start of tensor
-    int thread_id                               ///< ID of each participating thread
-  ): iterator_({ref.data(), ref.stride()}, thread_id) {
-
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    iterator_.add_tile_offset({coord.column(), coord.row()});
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator &operator++() {
-
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator operator++(int) {
-
-    RegularTileIterator prev(*this);
-    ++iterator_;
-
-    return prev;
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) {
-    load_with_pointer_offset(frag, 0);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(
-    Fragment const &frag,
-    Index pointer_offset) {
-
-    iterator_.store_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-};
-
-
-/// Tile iterator specialized for crosswise arrangements for TensorOps.
-///
-/// Volta TN SMEM layout is a little diffrent:
-/// Crosseised elements will be stored in a line, while contiguous elements
-/// sre stored in line-by-line.
-/// Padding is used to reduce SMEM bank conflicts.
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <
-  typename Shape_,
-  typename Element_,
-  int AdvanceRank,
-  typename ThreadMap_,
-  int Alignment
->
-class RegularTileIterator<
-    Shape_, Element_,
-    layout::VoltaTensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
-                                               Shape_::kContiguous>,
-    AdvanceRank, ThreadMap_, Alignment> {
-
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout =
-      layout::VoltaTensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
-                                                 Shape::kContiguous>;
-  static int const kAdvanceRank = AdvanceRank;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  /// Internal details made public to facilitate introspection
-  struct Detail {
-
-    ///< Number of pointers
-    static int const kPointerCount = (ThreadMap::Iterations::kStrided > 1 ? 2 : 1);
-
-    /// Iterations for the kElementsPerAccess of ThreadMap
-    static int const kIterarionsPerAccess =
-        ThreadMap::kElementsPerAccess / Layout::kElementsPerAccess;
-
-    /// Contiguous elements per line
-    static int const kContiguousElementsPerLine = 4;
-  };
-
- private:
-  /// Element type per access
-  using AccessType = Array<Element, Layout::kElementsPerAccess>;
-
- public:
-  /// Fragment object to be loaded or stored
-  using Fragment =
-      Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
-
- private:
-  //
-  // Data members
-  //
-
-  /// The crosswised elements will be stored in a line.
-  /// line_size is size of crosswised dimension plus padding.
-  /// in units of AccessType
-  Index line_size;
-
-  /// Internal pointer to first access of tile
-  AccessType *pointer_[Detail::kPointerCount];
-
-  /// Internal byte offset
-  Index byte_offset_;
-
-
- public:
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator(TensorRef ref,  ///< Pointer to start of tensor
-                      int thread_id   ///< ID of each participating thread
-                      )
-      : line_size(ref.stride(0) * Detail::kContiguousElementsPerLine / Layout::kElementsPerAccess),
-        byte_offset_(0) {
-
-    layout::PitchLinearCoord thread_offset_base =
-        ThreadMap::initial_offset(thread_id);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < Detail::kPointerCount; ++i) {
-      // This is the offset of a thread within a threadblock tile for a specific
-      // pointer (units of elements)
-      layout::PitchLinearCoord thread_offset_in_threadblock_tile =
-          thread_offset_base +
-          layout::PitchLinearCoord{
-              0, ThreadMap::Detail::WarpThreadArrangement::kStrided * i};
-
-      // initialize pointer
-      pointer_[i] = reinterpret_cast<AccessType *>(
-          ref.data() + ref.offset(thread_offset_in_threadblock_tile));
-    }
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    byte_offset_ += pointer_offset * sizeof(Element);
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator &operator++() {
-    // (Shape::kContiguous/Layout::kElementsPerAccess)*
-    //   line_size * Layout::kElementsPerAccess
-    add_pointer_offset(Shape::kContiguous * line_size);
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator operator++(int) {
-    RegularTileIterator prev(*this);
-    this->operator++();
-
-    return prev;
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    add_pointer_offset((coord.contiguous() * (Shape::kContiguous / Layout::kElementsPerAccess) *
-                       line_size + coord.strided() * Shape::kStrided) *
-                       Layout::kElementsPerAccess);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    Index vec_pointer_offset = pointer_offset / Layout::kElementsPerAccess;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      AccessType *access_ptr = pointer_[(s & 1) ^ (s / 2)];
-
-      access_ptr += 16 * (s / 2);
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for(int i = 0; i < Detail::kIterarionsPerAccess; ++i) {
-
-          int access_offset = 
-            c * ThreadMap::Delta::kContiguous / Detail::kContiguousElementsPerLine * line_size +
-            vec_pointer_offset + i * line_size;
-
-          int access_idx = (c + s * ThreadMap::Iterations::kContiguous) *
-            Detail::kIterarionsPerAccess + i;
-
-          char const *access_byte_ptr = reinterpret_cast<char const*>(access_ptr + access_offset);
-
-          frag_ptr[access_idx] = *reinterpret_cast<AccessType const *>(
-              access_byte_ptr + byte_offset_);
-        }
-      }
-    }
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
-
-    Index vec_pointer_offset = pointer_offset / Layout::kElementsPerAccess;
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-
-      AccessType *access_ptr = pointer_[(s & 1) ^ ((s >> 1) & 1)];
-
-      access_ptr += 16 * (s / 2) + vec_pointer_offset;
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-        CUTLASS_PRAGMA_UNROLL
-        for(int i = 0; i < Detail::kIterarionsPerAccess; ++i) {
-
-          int access_offset = 
-            c * ThreadMap::Delta::kContiguous / Detail::kContiguousElementsPerLine * line_size + i * line_size;
-
-          int access_idx = (c + s * ThreadMap::Iterations::kContiguous) *
-            Detail::kIterarionsPerAccess + i;
-
-          char *access_byte_ptr = reinterpret_cast<char *>(access_ptr + access_offset);
-
-          *reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_) =
-              frag_ptr[access_idx];
-        }
-      }
-    }
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Tile Iterator specialized for column-major crosswise TensorOp formats.
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <
-  typename Shape_,
-  typename Element_,
-  int AdvanceRank,
-  typename ThreadMap_,
-  int Alignment
->
-class RegularTileIterator<Shape_, Element_,
-                          layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise<
-                              sizeof_bits<Element_>::value, Shape_::kRow>,
-                          AdvanceRank, ThreadMap_, Alignment> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for column-major iterator may along advance along the "
-      "columns(rank=0) or rows(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise<
-      sizeof_bits<Element_>::value, Shape::kRow>;
-  static int const kAdvanceRank = AdvanceRank;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  /// Underlying iterator type
-  using UnderlyingIterator = RegularTileIterator<
-      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
-      layout::VoltaTensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
-                                            Shape::kRow>,
-      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;
-
- public:
-  /// Fragment object to be loaded or stored
-  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
-
- private:
-  /// Underlying iterator
-  UnderlyingIterator iterator_;
-
- public:
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator(TensorRef ref,  ///< Pointer to start of tensor
-                      int thread_id   ///< ID of each participating thread
-                      )
-      : iterator_({ref.data(), ref.stride()}, thread_id) {}
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    iterator_.add_tile_offset({coord.row(), coord.column()});
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator operator++(int) {
-    RegularTileIterator prev(*this);
-    ++iterator_;
-
-    return prev;
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-    iterator_.store_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Tile Iterator specialized for row-major crosswise TensorOp formats.
-///
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept
-///
-template <
-  typename Shape_,
-  typename Element_,
-  int AdvanceRank,
-  typename ThreadMap_,  
-  int Alignment
->
-class RegularTileIterator<Shape_, Element_,
-                          layout::RowMajorVoltaTensorOpMultiplicandCrosswise<
-                              sizeof_bits<Element_>::value, Shape_::kColumn>,
-                          AdvanceRank, ThreadMap_, Alignment> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for row-major iterator may along advance along the "
-      "columns(rank=0) or rows(rank=1) dimension.");
-
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::RowMajorVoltaTensorOpMultiplicandCrosswise<
-      sizeof_bits<Element_>::value, Shape::kColumn>;
-  static int const kAdvanceRank = AdvanceRank;
-  static int const kAlignment = Alignment;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-
-  using ThreadMap = ThreadMap_;
-
-  /// Underlying iterator type
-  using UnderlyingIterator = RegularTileIterator<
-      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
-      layout::VoltaTensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
-                                                 Shape::kColumn>,
-      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;
-
- public:
-  /// Fragment object to be loaded or stored
-  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
-
- private:
-  /// Underlying iterator
-  UnderlyingIterator iterator_;
-
- public:
-  /// Construct a TileIterator with zero threadblock offset
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator(TensorRef ref,  ///< Pointer to start of tensor
-                      int thread_id   ///< ID of each participating thread
-                      )
-      : iterator_({ref.data(), ref.stride()}, thread_id) {}
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-
-  /// Adds a tile offset
-  CUTLASS_DEVICE
-  void add_tile_offset(TensorCoord const &coord) {
-    iterator_.add_tile_offset({coord.column(), coord.row()});
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator &operator++() {
-    ++iterator_;
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  RegularTileIterator operator++(int) {
-    RegularTileIterator prev(*this);
-    ++iterator_;
-
-    return prev;
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
-    iterator_.store_with_pointer_offset(frag, pointer_offset);
-  }
-
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
-};
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace transform
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/vector_iterator.h b/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/vector_iterator.h
deleted file mode 100755
index f78e5e862..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/transform/threadblock/vector_iterator.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template wraps the vector access iterator concept to load whole vector from tensors in
-      memory. This is typically used for per-channel scale and bias in convolution kernels.
-*/
-
-#pragma once
-
-#include "cutlass/transform/threadblock/predicated_vector_access_iterator.h"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass {
-namespace transform {
-namespace threadblock {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename VectorAccessIterator_>
-class VectorIterator {
-public:
-  using VectorAccessIterator = VectorAccessIterator_;
-
-  using Shape = typename VectorAccessIterator::Shape;
-  using Element = typename VectorAccessIterator::Element;
-  using Layout = typename VectorAccessIterator::Layout;
-  using TensorCoord = typename Layout::TensorCoord;
-  using AccessType = typename VectorAccessIterator::AccessType;
-  using TensorRef = typename VectorAccessIterator::TensorRef;
-  using Index = typename VectorAccessIterator::Index;
-  using LongIndex = typename VectorAccessIterator::LongIndex;
-
-  static int const kElementsPerAccess = VectorAccessIterator::kElementsPerAccess;
-  static int const kRowsPerIteration = VectorAccessIterator::kRowsPerIteration;
-  static int const kThreads = VectorAccessIterator::kThreads;
-  static int const kIterations = VectorAccessIterator::kIterations;
-
-  /// Fragment object to be loaded or stored
-  using Fragment = cutlass::Array<
-    Element, kElementsPerAccess * kIterations>;
-
-private:
-
-  /// Internal state
-  VectorAccessIterator vector_access_iterator_;
-
-public:
-
-  /// Constructor
-  CUTLASS_HOST_DEVICE
-  VectorIterator(
-    Element const *ptr,
-    TensorCoord extent,
-    int thread_idx,
-    int warp_idx,
-    MatrixCoord const &threadblock_offset = MatrixCoord()
-  ):
-    vector_access_iterator_(ptr, extent, thread_idx, warp_idx, threadblock_offset) { }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  VectorIterator &operator++() {
-    vector_access_iterator_.advance();
-    return *this;
-  }
-
-  /// Advances to the next tile in memory.
-  CUTLASS_HOST_DEVICE
-  VectorIterator operator++(int) {
-    VectorIterator self(*this);
-    operator++();
-    return self;
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
-
-    frag.clear();
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < kIterations; ++c) {
-
-        cutlass::arch::global_load<
-          AccessType,
-          sizeof(AccessType)
-        >(
-          frag_ptr[c],
-          vector_access_iterator_.get() + pointer_offset,
-          vector_access_iterator_.valid()
-        );
-
-        ++vector_access_iterator_;
-      }
-//    }
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) {
-    vector_access_iterator_.set_iteration_index(0);
-    load_with_pointer_offset(frag, 0);
-  }
-
-  CUTLASS_DEVICE
-  void advance() {
-    vector_access_iterator_.advance();
-  }
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace threadblock
-} // namespace transform
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/transform/warp/vector_fragment_iterator.h b/lightllm-kernel/cutlass/include/cutlass/transform/warp/vector_fragment_iterator.h
deleted file mode 100755
index b8bfa57fd..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/transform/warp/vector_fragment_iterator.h
+++ /dev/null
@@ -1,283 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-
-/*! \file
-    \brief This defines a "fragment" iterator for visiting the fragments of a warp vector
-      that participate in one warp-level mma operation.
-
-      Typically, this is used to access the scale/bias fragement of a warp-level mma operation.
-      The scale/bias vector is then partitioned into smaller fragments that can be fed into 
-      next warp-level mma operation. 
-
-      This iterator is necessary to accomplish warp-level mma fusion where the scale/bias vector is 
-      applied to the multiplicand for the next mma.
-
-*/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-
-#include "cutlass/array.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/numeric_conversion.h"
-
-namespace cutlass {
-namespace transform {
-namespace warp {
-
-
-////////////////////////////////////////////////////////////////////////////////
-
-template <
-    /// Size of the input fragment tile shape (concept: MatrixShape)
-    typename Shape_,
-    /// Element type
-    typename Element_,
-    /// Layout of operand in memory
-    typename Layout_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    //// Number of elements per access when loading fragment
-    int ElementsPerAccess>
-class VectorFragmentIterator;
-
-
-// Partial specialization for PitchLinear layout tile
-
-template <
-    /// Size of the input fragment vector shape (concept: MatrixShape)
-    typename Shape_,
-    /// Element type
-    typename Element_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    //// Number of elements per access when loading fragment
-    int ElementsPerAccess>
-class VectorFragmentIterator<Shape_, Element_,
-                                         cutlass::layout::PitchLinear,
-                                         InstructionShape_, ElementsPerAccess> {
- public:
-    
-  /// Size of the input threadblock tile shape (concept: MatrixShape)
-  using Shape = Shape_;
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::PitchLinear;
-
-  /// Shape of one matrix product operation (concept: MatrixShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Number of participating threads
-  static int const kThreads = 32;
-
-  static int const kElementsPerAccess = ElementsPerAccess;
-  static int const kRowsPerIteration = 8;
-  static int const kColumnsPerAccess = 8;
-  static int const kElementsPerIteration = kRowsPerIteration * InstructionShape::kK / kThreads;
-  static int const kAccessPerIteration = kElementsPerIteration / kElementsPerAccess;
-  
-  /// Number of iterations
-  using Iterations = MatrixShape<InstructionShape::kM / kRowsPerIteration, Shape::kContiguous / kElementsPerIteration>;
-
-public:
-
-  //
-  // Derived quantities
-  //
-  // All fragments have kElementsPerAccess scale followed by bias
-
-  /// Fragment object holding a thread's part of a tile
-  /// This is the fragment size produced by one iteration of the iterator.
-  using Fragment = Array<Element, kElementsPerIteration * Iterations::kRow>;
-
-  /// Input threadblock fragment tile
-  using ThreadblockFragment = Array<Element, Shape::kContiguous >;
-
-private:
-
-  /// Internal access type
-  using AccessType = Array<Element, kElementsPerAccess>;
-
-private:
-  //
-  // Data members
-  //
-
-  /// Input threadblock fragment tile
-  AccessType const *iterator_;
-
-  /// Internal index
-  int index_;
-
-public:
-  /// Constructs an iterator
-  CUTLASS_HOST_DEVICE
-  VectorFragmentIterator(ThreadblockFragment const &threadblock_frag)
-      : iterator_(reinterpret_cast<AccessType const *>(&threadblock_frag)),
-        index_(0) {}
-
-  /// Add offset
-  CUTLASS_HOST_DEVICE
-  void add_offset(int index_offset) {
-    index_ += index_offset; 
-
-    if(index_ >= Iterations::kColumn)
-        index_ = 0;
-  }
-
-  /// Increments
-  CUTLASS_HOST_DEVICE
-  VectorFragmentIterator &operator++() {
-    add_offset(1);
-    return *this;
-  }
-
-  CUTLASS_HOST_DEVICE
-  void set_index(int idx) {
-    index_ = idx;
-  }
-
-  /// Loads a fragment from the referenced part of the accumulator tile
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int r = 0; r < Iterations::kRow; r++) {
-        CUTLASS_PRAGMA_UNROLL
-        for (int i = 0; i < kAccessPerIteration; i++) {
-    
-          frag_ptr[i * Iterations::kRow + r].clear();
-          frag_ptr[i * Iterations::kRow + r] = iterator_[index_ * kAccessPerIteration + i];
-        }
-    }
-  }
-
-};
-
-// Partial specialization for Row-Major layout tile
-
-template <
-    /// Size of the input fragment tile shape (concept: MatrixShape)
-    typename Shape_,
-    /// Element type
-    typename Element_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    //// Number of elements per access when loading fragment
-    int ElementsPerAccess>
-class VectorFragmentIterator<Shape_, Element_,
-                                         cutlass::layout::RowMajor,
-                                         InstructionShape_, ElementsPerAccess> {
- public:
-    
-  /// Size of the input threadblock tile shape (concept: MatrixShape)
-  using Shape = Shape_;
-
-  /// Element type
-  using Element = Element_;
-
-  /// Layout of source tile
-  using Layout = cutlass::layout::RowMajor;
-
-  /// Shape of one matrix product operation (concept: MatrixShape)
-  using InstructionShape = InstructionShape_;
-
-  /// Underlying iterator
-  using Base = VectorFragmentIterator<
-    layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
-    layout::PitchLinear, InstructionShape, ElementsPerAccess>;
-
-
- public:
-
-  //
-  // Derived quantities
-  //
-  /// Fragment object holding a thread's part of a tile
-  /// This is the fragment size produced by one iteration of the iterator.
-  using Fragment = typename Base::Fragment;
-
-  /// Input threadblock fragment tile
-  using ThreadblockFragment = typename Base::ThreadblockFragment;
-
- private:
-  /// Underlying iterator
-  Base iterator_;
-
-public:
-  /// Constructs an iterator
-  CUTLASS_HOST_DEVICE
-  VectorFragmentIterator(ThreadblockFragment const &threadblock_frag)
-      : iterator_(threadblock_frag) {}
-
-  /// Add offset
-  CUTLASS_HOST_DEVICE
-  void add_offset(int index_offset) {
-    iterator_.add_offset(index_offset);
-  }
-
-  /// Increments
-  CUTLASS_HOST_DEVICE
-  VectorFragmentIterator &operator++() {
-    add_offset(1);
-    return *this;
-  }
-
-  CUTLASS_HOST_DEVICE
-  void set_index(int idx) {
-    iterator_.set_index(idx);
-  }
-
-  /// Loads a fragment from the referenced part of the accumulator tile
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-    iterator_.load(frag);
-  }
-
-};
-
-
-////////////////////////////////////////////////////////////////////////////////
-
-} // namespace warp
-} // namespace conv
-} // namespace cutlass
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/uint128.h b/lightllm-kernel/cutlass/include/cutlass/uint128.h
deleted file mode 100755
index 6de3ba141..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/uint128.h
+++ /dev/null
@@ -1,270 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! 
-  \file
-  \brief Defines an unsigned 128b integer with several operators to support 64-bit integer division.
-*/
-#pragma once
-
-#if defined(__CUDACC_RTC__)
-#include <cuda/std/cstdint>
-#else
-#include <cstdint>
-#include <cstdlib>
-#include <cmath>
-#include <type_traits>
-#include <stdexcept>
-#endif
-
-#include "cutlass/cutlass.h"
-
-/// Optionally enable GCC's built-in type
-#if (defined(__x86_64) || defined (__aarch64__)) && !(defined(__CUDA_ARCH__) && ((__CUDACC_VER_MAJOR__ <= 10) || ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ <= 4)))) && defined(__GNUC__)
-#define CUTLASS_UINT128_NATIVE
-#elif !defined(__CUDA_ARCH__)
-// No custom support for 128b arithmetic on device
-#if defined(_MSC_VER) && defined(_M_AMD64)
-#define CUTLASS_INT128_ARITHMETIC
-#include <intrin.h>
-#if _MSC_VER >= 1920 && !defined(__CUDA_ARCH__)
-#define CUTLASS_INT128_ARITHMETIC_DIV
-#include <immintrin.h>
-#endif
-#endif
-#endif
-
-namespace cutlass {
-
-///! Unsigned 128b integer type
-struct alignas(16) uint128_t
-{
-  /// Size of one part of the uint's storage in bits
-  static constexpr int storage_bits_ = 64;
-
-  struct hilo
-  {
-    uint64_t lo;
-    uint64_t hi;
-  };
-
-  // Use a union to store either low and high parts or, if present, a built-in 128b integer type.
-  union {
-    struct hilo hilo_;
-
-#if defined(CUTLASS_UINT128_NATIVE)
-    unsigned __int128 native;
-#endif // defined(CUTLASS_UINT128_NATIVE)
-  };
-
-  //
-  // Methods
-  //
-
-  /// Default ctor
-  CUTLASS_HOST_DEVICE
-  uint128_t() : hilo_{0, 0} {}
-
-  /// Constructor from uint64
-  CUTLASS_HOST_DEVICE
-  uint128_t(uint64_t lo_) : hilo_{lo_, 0} {}
-
-  /// Constructor from two 64b unsigned integers
-  CUTLASS_HOST_DEVICE
-  uint128_t(uint64_t lo_, uint64_t hi_) : hilo_{lo_, hi_} {}
-
-  /// Optional constructor from native value
-#if defined(CUTLASS_UINT128_NATIVE)
-  uint128_t(unsigned __int128 value) : native(value) { }
-#endif
-
-  /// Lossily cast to uint64
-  CUTLASS_HOST_DEVICE
-  explicit operator uint64_t() const
-  {
-    return hilo_.lo;
-  }
-
-  CUTLASS_HOST_DEVICE
-  static void exception()
-  {
-#if defined(__CUDA_ARCH__)
-  asm volatile ("  brkpt;\n");
-#else
-  // throw std::runtime_error("Not yet implemented.");
-  abort();
-#endif
-  }
-
-  /// Add
-  CUTLASS_HOST_DEVICE
-  uint128_t operator+(uint128_t const& rhs) const
-  {
-    uint128_t y{};
-#if defined(CUTLASS_UINT128_NATIVE)
-    y.native = native + rhs.native;
-#else
-    y.hilo_.lo = hilo_.lo + rhs.hilo_.lo;
-    y.hilo_.hi = hilo_.hi + rhs.hilo_.hi + (y.hilo_.lo < hilo_.lo);
-#endif
-    return y;
-  }
-
-  /// Subtract
-  CUTLASS_HOST_DEVICE
-  uint128_t operator-(uint128_t const& rhs) const
-  {
-    uint128_t y{};
-#if defined(CUTLASS_UINT128_NATIVE)
-    y.native = native - rhs.native;
-#else
-    y.hilo_.lo = hilo_.lo - rhs.hilo_.lo;
-    y.hilo_.hi = hilo_.hi - rhs.hilo_.hi - (rhs.hilo_.lo && y.hilo_.lo > hilo_.lo);
-#endif
-    return y;
-  }
-
-  /// Multiply by unsigned 64b integer yielding 128b integer
-  CUTLASS_HOST_DEVICE
-  uint128_t operator*(uint64_t const& rhs) const
-  {
-    uint128_t y{};
-#if defined(CUTLASS_UINT128_NATIVE)
-    y.native = native * rhs;
-#elif defined(CUTLASS_INT128_ARITHMETIC)
-    // Multiply by the low part
-    y.hilo_.lo = _umul128(hilo_.lo, rhs, &y.hilo_.hi);
-
-    // Add the high part and ignore the overflow
-    uint64_t overflow{0};
-    y.hilo_.hi += _umul128(hilo_.hi, rhs, &overflow);
-#else
-    CUTLASS_UNUSED(rhs);
-    exception();
-#endif
-    return y;
-  }
-
-  /// Divide 128b operation by 64b operation yielding a 64b quotient
-  CUTLASS_HOST_DEVICE
-  uint64_t operator/(uint64_t const& divisor) const
-  {
-    uint64_t quotient{0};
-#if defined(CUTLASS_UINT128_NATIVE)
-    quotient = uint64_t(native / divisor);
-#elif defined(CUTLASS_INT128_ARITHMETIC_DIV)
-    // implemented using MSVC's arithmetic intrinsics
-    uint64_t remainder{0};
-    quotient = _udiv128(hilo_.hi, hilo_.lo, divisor, &remainder);
-#else
-    CUTLASS_UNUSED(divisor);
-    exception();
-#endif
-    return quotient;
-  }
-
-  /// Divide 128b operation by 64b operation yielding a 64b quotient
-  CUTLASS_HOST_DEVICE
-  uint64_t operator%(uint64_t const& divisor) const
-  {
-    uint64_t remainder{0};
-#if defined(CUTLASS_UINT128_NATIVE)
-    remainder = uint64_t(native % divisor);
-#elif defined(CUTLASS_INT128_ARITHMETIC_DIV)
-    // implemented using MSVC's arithmetic intrinsics
-    (void)_udiv128(hilo_.hi, hilo_.lo, divisor, &remainder);
-#else
-    CUTLASS_UNUSED(divisor);
-    exception();
-#endif
-    return remainder;
-  }
-
-  /// Computes the quotient and remainder in a single method.
-  CUTLASS_HOST_DEVICE
-  uint64_t divmod(uint64_t &remainder, uint64_t divisor) const
-  {
-    uint64_t quotient{0};
-#if defined(CUTLASS_UINT128_NATIVE)
-    quotient = uint64_t(native / divisor);
-    remainder = uint64_t(native % divisor);
-#elif defined(CUTLASS_INT128_ARITHMETIC_DIV)
-    // implemented using MSVC's arithmetic intrinsics
-    quotient = _udiv128(hilo_.hi, hilo_.lo, divisor, &remainder);
-#else
-    CUTLASS_UNUSED(remainder);
-    CUTLASS_UNUSED(divisor);
-    exception();
-#endif
-    return quotient;
-  }
-
-  /// Left-shifts a 128b unsigned integer
-  CUTLASS_HOST_DEVICE
-  uint128_t operator<<(int sh) const
-  {
-    if (sh == 0) {
-      return *this;
-    }
-    else if (sh >= storage_bits_) {
-      return uint128_t(0, hilo_.lo << (sh - storage_bits_));
-    }
-    else {
-      return uint128_t(
-        (hilo_.lo << sh),
-        (hilo_.hi << sh) | uint64_t(hilo_.lo >> (storage_bits_ - sh))
-      );
-    }
-  }
-
-  /// Right-shifts a 128b unsigned integer
-  CUTLASS_HOST_DEVICE
-  uint128_t operator>>(int sh) const
-  {
-    if (sh == 0) {
-      return *this;
-    }
-    else if (sh >= storage_bits_) {
-      return uint128_t((hilo_.hi >> (sh - storage_bits_)), 0);
-    }
-    else {
-      return uint128_t(
-        (hilo_.lo >> sh) | (hilo_.hi << (storage_bits_ - sh)),
-        (hilo_.hi >> sh)
-      );
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lightllm-kernel/cutlass/include/cutlass/version.h b/lightllm-kernel/cutlass/include/cutlass/version.h
deleted file mode 100755
index ff9aa1157..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/version.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#pragma once
-
-#include <cstdint>
-#include <string>
-
-#define CUTLASS_MAJOR 3
-#define CUTLASS_MINOR 6
-#define CUTLASS_PATCH 0
-
-#ifdef CUTLASS_VERSIONS_GENERATED
-#include "cutlass/version_extended.h"
-#else
-#define CUTLASS_BUILD 0
-#define CUTLASS_REVISION ""
-#endif
-
-#define CUTLASS_VERSION ((CUTLASS_MAJOR)*100 + (CUTLASS_MINOR)*10 + CUTLASS_PATCH)
-
-namespace cutlass {
-
-  inline constexpr uint32_t getVersion() {
-    return CUTLASS_VERSION;
-  }
-  inline constexpr uint32_t getVersionMajor() {
-    return CUTLASS_MAJOR;
-  }
-  inline constexpr uint32_t getVersionMinor() {
-    return CUTLASS_MINOR;
-  }
-  inline constexpr uint32_t getVersionPatch() {
-    return CUTLASS_PATCH;
-  }
-  inline constexpr uint32_t getVersionBuild() {
-    return CUTLASS_BUILD + 0;
-  }
-
-  inline std::string getVersionString() {
-    std::string version = "@CUTLASS_VERSION@";
-    if (getVersionBuild()) {
-      version += "." + std::to_string(getVersionBuild());
-    }
-    return version;
-  }
-  
-  inline std::string getGitRevision() {
-    return "@CUTLASS_REVISION@";
-  }
-
-} // namespace cutlass
diff --git a/lightllm-kernel/cutlass/include/cutlass/wmma_array.h b/lightllm-kernel/cutlass/include/cutlass/wmma_array.h
deleted file mode 100755
index 0f9b2b514..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/wmma_array.h
+++ /dev/null
@@ -1,133 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Statically sized array of elements that accommodates all CUTLASS-supported numeric types
-           and is safe to use in a union.
-*/
-
-#pragma once
-
-#include "cutlass/arch/wmma.h"
-
-#if defined(CUTLASS_ARCH_WMMA_ENABLED)
-
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/functional.h"
-
-namespace cutlass {
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Wmma array type (WmmaFragmentArray holds elements of type nvcuda::wmma::fragment)
-template <
-  /// Element type
-  typename T,
-  /// Number of elements in the array
-  int N,
-  /// Whether the element type of T is half_t or __half
-  bool IsHalfType = (platform::is_same<typename T::element_type, cutlass::half_t>::value ||
-                     platform::is_same<typename T::element_type, __half>::value)
->
-class WmmaFragmentArray: public Array<T, N, true> {
-public:
-
-  /// Efficient clear method (override Array::clear())
-  CUTLASS_HOST_DEVICE
-  void clear()
-  {
-    for(int i = 0; i < Array<T, N, true>::kElements; i++)
-    {
-      nvcuda::wmma::fill_fragment((*this)[i], (typename T::element_type)0);
-    }
-  }
-
-  CUTLASS_HOST_DEVICE
-  WmmaFragmentArray<T, N>& operator+=(const WmmaFragmentArray<T, N>& rhs)
-  {
-    using element_type = typename T::element_type;
-    plus<T> add;
-
-    for (int i = 0; i < Array<T, N, true>::kElements; i++)
-    {
-      (*this)[i] = add((*this)[i], rhs[i]);
-    }
-
-    return *this;
-  }
-};
-
-/// Partial specialization for the case in which T::element_type is
-/// half_t or __half. This is needed because the cast (typename T::element_type)0
-/// in the primary template flags as an error when __CUDA_NO_HALF_CONVERSIONS__
-/// is set.
-template <
-  /// Element type
-  typename T,
-  /// Number of elements in the array
-  int N
->
-class WmmaFragmentArray<T, N, true>: public Array<T, N, true> {
-public:
-
-  /// Efficient clear method (override Array::clear())
-  CUTLASS_HOST_DEVICE
-  void clear()
-  {
-    for(int i = 0; i < Array<T, N, true>::kElements; i++)
-    {
-      nvcuda::wmma::fill_fragment((*this)[i], __float2half(0.f));
-    }
-  }
-
-  CUTLASS_HOST_DEVICE
-  WmmaFragmentArray<T, N>& operator+=(const WmmaFragmentArray<T, N>& rhs)
-  {
-    using element_type = typename T::element_type;
-    plus<T> add;
-
-    for (int i = 0; i < Array<T, N, true>::kElements; i++)
-    {
-      (*this)[i] = add((*this)[i], rhs[i]);
-    }
-
-    return *this;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-#endif // if defined(CUTLASS_ARCH_WMMA_ENABLED)
-
diff --git a/lightllm-kernel/cutlass/include/cutlass/workspace.h b/lightllm-kernel/cutlass/include/cutlass/workspace.h
deleted file mode 100755
index 6f1c3254c..000000000
--- a/lightllm-kernel/cutlass/include/cutlass/workspace.h
+++ /dev/null
@@ -1,150 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Utilities for initializing workspaces
-*/
-
-#pragma once
-
-#if !defined(__CUDACC_RTC__)
-#include "cuda.h"
-#include "cuda_runtime.h"
-
-#include "cutlass/trace.h"
-#endif
-
-#include "cutlass.h"
-#include "cutlass/cuda_host_adapter.hpp"
-
-namespace cutlass {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-static constexpr int MinWorkspaceAlignment = 16;
-
-#if !defined(__CUDACC_RTC__)
-static Status
-zero_workspace(void* workspace, size_t workspace_size, cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr) {
-  if (workspace_size > 0) {
-    if (workspace == nullptr) {
-      CUTLASS_TRACE_HOST("  error: device workspace must not be null");
-      return Status::kErrorWorkspaceNull;
-    }
-
-    CUTLASS_TRACE_HOST("  clearing workspace");
-
-#if defined(CUTLASS_ENABLE_CUDA_HOST_ADAPTER) && CUTLASS_ENABLE_CUDA_HOST_ADAPTER
-    //
-    // Use the cuda host adapter
-    //
-    CUTLASS_ASSERT(cuda_adapter);
-    if (cuda_adapter) {
-      if (Status::kSuccess != cuda_adapter->memsetDevice(workspace, static_cast<uint8_t>(0), workspace_size, stream)) {
-        return Status::kErrorInternal;
-      }
-    }
-    else {
-      return Status::kErrorInternal;
-    }
-#else
-    cudaError_t result = cudaMemsetAsync(workspace, 0, workspace_size, stream);
-    if (cudaSuccess != result) {
-      result = cudaGetLastError(); // to clear the error bit
-      CUTLASS_TRACE_HOST("  cudaMemsetAsync() returned error " << cudaGetErrorString(result));
-      return Status::kErrorInternal;
-    }
-#endif
-  }
-
-  return Status::kSuccess;
-}
-#endif
-
-#if !defined(__CUDACC_RTC__)
-template <typename T>
-Status
-fill_workspace(void* workspace, T fill_value, size_t fill_count, cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr) {
-  static_assert(sizeof(T) == 4 || sizeof(T) == 2 || sizeof(T) == 1, "Unsupported fill type");
-  if (fill_count > 0) {
-    if (workspace == nullptr) {
-      CUTLASS_TRACE_HOST("  error: device workspace must not be null");
-      return Status::kErrorWorkspaceNull;
-    }
-
-    CUTLASS_TRACE_HOST("  filling workspace");
-
-#if defined(CUTLASS_ENABLE_CUDA_HOST_ADAPTER) && CUTLASS_ENABLE_CUDA_HOST_ADAPTER
-    //
-    // Use the cuda host adapter
-    //
-    CUTLASS_ASSERT(cuda_adapter);
-    if (cuda_adapter) {
-      if (Status::kSuccess != cuda_adapter->memsetDevice(workspace, fill_value, fill_count, stream)) {
-        return Status::kErrorInternal;
-      }
-    }
-    else {
-      return Status::kErrorInternal;
-    }
-#else
-    CUdeviceptr d_workspace = reinterpret_cast<CUdeviceptr>(workspace);
-    CUresult result = CUDA_SUCCESS;
-    if (sizeof(T) == 4) {
-      result = cuMemsetD32Async(d_workspace, reinterpret_cast<uint32_t&>(fill_value), fill_count, stream);
-    }
-    else if (sizeof(T) == 2) {
-      result = cuMemsetD16Async(d_workspace, reinterpret_cast<uint16_t&>(fill_value), fill_count, stream);
-    }
-    else if (sizeof(T) == 1) {
-      result = cuMemsetD8Async(d_workspace, reinterpret_cast<uint8_t&>(fill_value), fill_count, stream);
-    }
-
-    if (CUDA_SUCCESS != result) {
-      const char** error_string_ptr = nullptr;
-      (void) cuGetErrorString(result, error_string_ptr);
-      if (error_string_ptr != nullptr) {
-        CUTLASS_TRACE_HOST("  cuMemsetD" << sizeof(T) * 8 << "Async() returned error " << *error_string_ptr);
-      }
-      else {
-        CUTLASS_TRACE_HOST("  cuMemsetD" << sizeof(T) * 8 << "Async() returned unrecognized error");
-      }
-      return Status::kErrorInternal;
-    }
-#endif
-  }
-
-  return Status::kSuccess;
-}
-#endif
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass
diff --git a/lightllm-kernel/lightllm_kernel/ops/__init__.py b/lightllm-kernel/lightllm_kernel/ops/__init__.py
index 68bf434ca..790e08c62 100644
--- a/lightllm-kernel/lightllm_kernel/ops/__init__.py
+++ b/lightllm-kernel/lightllm_kernel/ops/__init__.py
@@ -7,9 +7,10 @@
 try:
     _C = importlib.import_module(f"{PKG}._C")
 except ImportError:
-    raise ImportError("Cannot import compiled extension 'lightllm_kernel.ops'")
-    repo_root = Path(__file__).resolve().parents[2]
-    csrc_dir = repo_root / "csrc"
+    # No prebuilt extension: do not raise here; continue with the source-tree fallback below (presumably a JIT build from csrc/ -- confirm).
+    repo_root = Path(__file__).resolve().parents[3]
+    kernels_root = Path(__file__).resolve().parents[2]
+    csrc_dir = kernels_root / "csrc"
     if not csrc_dir.exists():
         raise ImportError(
             "Cannot import compiled extension 'lightllm_kernel.ops' and no source "
@@ -20,7 +21,7 @@
     PROGRAM_NAME = "lightllm_kernel._C"
     EXTENSION_BUILD_DIR = "build"
     INCLUDE_DIR = "include"
-    CUTLASS_DIR = "cutlass/include"
+    CUTLASS_DIR = "third-party/cutlass/include"
 
     sources = []
     file_names = []  # Store file names for printing
@@ -40,11 +41,12 @@
         sources=sources,
         verbose=True,
         extra_include_paths=[
-            os.path.join(repo_root, INCLUDE_DIR),
+            os.path.join(kernels_root, INCLUDE_DIR),
             os.path.join(repo_root, CUTLASS_DIR),
         ],
-        build_directory=os.path.join(repo_root, EXTENSION_BUILD_DIR),
+        build_directory=os.path.join(kernels_root, EXTENSION_BUILD_DIR),
         with_cuda=True,
+        extra_ldflags=["-lcuda", "-L/usr/local/cuda/lib64"],
         extra_cuda_cflags=[
             "-DNDEBUG",
             "-O3",
diff --git a/lightllm-kernel/setup.py b/lightllm-kernel/setup.py
index 06d51d152..338089952 100644
--- a/lightllm-kernel/setup.py
+++ b/lightllm-kernel/setup.py
@@ -3,8 +3,9 @@
 from setuptools import setup
 from torch.utils.cpp_extension import BuildExtension, CUDAExtension
 
-repo_root = Path(__file__).resolve().parents[0]
-csrc_dir = repo_root / "csrc"
+repo_root = Path(__file__).resolve().parents[1]
+kernels_root = Path(__file__).resolve().parents[0]
+csrc_dir = kernels_root / "csrc"
 if not csrc_dir.exists():
     raise ImportError(
         "Cannot import compiled extension 'lightllm_kernel.ops' and no source "
@@ -14,7 +15,7 @@
 
 PROGRAM_NAME = "lightllm_kernel._C"
 INCLUDE_DIR = "include"
-CUTLASS_DIR = "cutlass/include"
+CUTLASS_DIR = "third-party/cutlass/include"
 
 sources = []
 file_names = []  # Store file names for printing
@@ -48,7 +49,7 @@
             ],
         },
         include_dirs=[
-            os.path.join(repo_root, INCLUDE_DIR),
+            os.path.join(kernels_root, INCLUDE_DIR),
             os.path.join(repo_root, CUTLASS_DIR),
         ],
     )
diff --git a/third-party/cutlass b/third-party/cutlass
new file mode 160000
index 000000000..bf9da7b76
--- /dev/null
+++ b/third-party/cutlass
@@ -0,0 +1 @@
+Subproject commit bf9da7b76c766d7ee7d536afc77880a4ef1f1156

From 49721cc47bb25e84a6149bb9df1285d2529ebd93 Mon Sep 17 00:00:00 2001
From: wangzaijun <wzjhelloworld@qq.com>
Date: Tue, 27 May 2025 20:30:37 +0800
Subject: [PATCH 06/14] add int8kv flashdecoding

---
 lightllm-kernel/CMakeLists.txt                |   2 +-
 lightllm-kernel/Makefile                      |   7 +-
 ...de_attention_kernel_in8kv_flashdecoding.cu | 650 ++++++++++++++++++
 lightllm-kernel/csrc/ops_bindings.cpp         |   1 +
 lightllm-kernel/include/ops_common.h          |  15 +
 .../lightllm_kernel/ops/__init__.py           |   2 +
 .../lightllm_kernel/ops/attention.py          |  36 +
 lightllm-kernel/setup.py                      |  10 +
 8 files changed, 721 insertions(+), 2 deletions(-)
 create mode 100644 lightllm-kernel/csrc/attention/decode_attention_kernel_in8kv_flashdecoding.cu

diff --git a/lightllm-kernel/CMakeLists.txt b/lightllm-kernel/CMakeLists.txt
index 5de3c6420..de01c346a 100644
--- a/lightllm-kernel/CMakeLists.txt
+++ b/lightllm-kernel/CMakeLists.txt
@@ -3,7 +3,7 @@ project(lightllm_kernel LANGUAGES CXX CUDA)
 
 # GPU 架构：缺省支持 A100(80)、Ampere(86)、Ada/L40s/4090(89)、Hopper(90)，
 if(NOT CMAKE_CUDA_ARCHITECTURES)
-  set(CMAKE_CUDA_ARCHITECTURES 80;86;89;90-virtual)
+  set(CMAKE_CUDA_ARCHITECTURES 80;86;89;90)
 endif()
 
 # 找 PyTorch & Python
diff --git a/lightllm-kernel/Makefile b/lightllm-kernel/Makefile
index c3fc05d52..c21cd0879 100644
--- a/lightllm-kernel/Makefile
+++ b/lightllm-kernel/Makefile
@@ -1,4 +1,9 @@
-.PHONY: build clean
+.PHONY: build clean submodule
+
+SUBMODULE_DIR = third-party/cutlass
+
+$(SUBMODULE_DIR)/.git:
+	git submodule update --init --recursive
 
 build:
 	# 8.0-> A100, 8.6-> A10, 8.9-> L40s/4090, 9.0+PTX-> Hopper
diff --git a/lightllm-kernel/csrc/attention/decode_attention_kernel_in8kv_flashdecoding.cu b/lightllm-kernel/csrc/attention/decode_attention_kernel_in8kv_flashdecoding.cu
new file mode 100644
index 000000000..c55eaaf6f
--- /dev/null
+++ b/lightllm-kernel/csrc/attention/decode_attention_kernel_in8kv_flashdecoding.cu
@@ -0,0 +1,650 @@
+
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include <float.h> // need for FLT_MAX
+#include <math.h>
+#include <memory>
+#include <assert.h>
+#include <torch/extension.h>
+#include <ATen/cuda/CUDAContext.h>
+#include "ops_common.h"
+# include <torch/extension.h>
+
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+
+namespace lightllm {
+namespace ops {
+
+template <typename T>  // Device-side conversion of a scalar of type T to float.
+__device__ inline float tofloat(T value) {
+    return static_cast<float>(value);  // generic path: plain cast, used when no specialization below matches
+}
+
+// __half specialization: converts through the __half2float intrinsic instead of a cast.
+template <>
+__device__ inline float tofloat<__half>(__half value) {
+    return __half2float(value);
+}
+
+// __nv_bfloat16 specialization: converts through the __bfloat162float intrinsic instead of a cast.
+template <>
+__device__ inline float tofloat<__nv_bfloat16>(__nv_bfloat16 value) {
+    return __bfloat162float(value);
+}
+
+#define LIGHT_DISPATCH_CASE_FLOATING_TYPES(...)              \
+  AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)       \
+  AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
+// The macro above lists the Half and BFloat16 dispatch cases; the macro below wraps them in an AT_DISPATCH_SWITCH on TYPE, labelled NAME.
+#define LIGHT_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...)             \
+  AT_DISPATCH_SWITCH(                                             \
+    TYPE, NAME, LIGHT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
+
+template <int VPT>  // Trait keyed by an integer that copy<Bytes>() passes as a byte count; ::type is the type used to move that many bytes.
+struct BytesToType;
+// The primary template is declared but never defined here, so only the values specialized below (2, 4, 8, 16) provide ::type.
+template <>
+struct BytesToType<2>
+{
+    using type = uint16_t;
+};
+template <>
+struct BytesToType<4>
+{
+    using type = uint32_t;
+};
+template <>
+struct BytesToType<8>
+{
+    using type = uint64_t;
+};
+template <>
+struct BytesToType<16>
+{
+    using type = float4;
+};
+
+template <int Bytes>  // Copies one BytesToType<Bytes>::type object from `local` (source) to `data` (destination).
+__device__ inline void copy(const void* local, void* data)
+{
+    using T = typename BytesToType<Bytes>::type;
+    // The whole chunk moves as a single assignment of T. NOTE(review): both pointers presumably must be aligned for T -- confirm at call sites.
+    const T* in = static_cast<const T*>(local);
+    T* out = static_cast<T*>(data);
+    *out = *in;
+}
+
+template<int32_t THREAD_GROUP_SIZE, int32_t ELEMENT_NUM, typename T>
+__device__ inline
+float attn_thread_group_dot(T* local_q, T* local_k)
+{
+    // QK dot-product helper: each thread accumulates its own ELEMENT_NUM products of local_q[i] * local_k[i] in fp32,
+    // then an XOR-shuffle butterfly (masks THREAD_GROUP_SIZE/2 .. 1) adds the partial sums across lanes and the result is returned.
+    // [TODO] (kept from the original) the per-thread accumulation should be optimized by type fp32x4.
+    float qk = 0.0f;
+# pragma unroll
+    for(int32_t i = 0; i < ELEMENT_NUM; i++) {
+        qk += tofloat(local_q[i]) * tofloat(local_k[i]);
+    }
+#pragma unroll
+    for (int32_t mask = THREAD_GROUP_SIZE / 2; mask >= 1; mask /= 2) {
+        qk += __shfl_xor_sync(uint32_t(-1), qk, mask);
+    }
+    return qk;
+}
+
+template<int32_t WPT>
+__device__ inline
+float attn_block_reduce_max(float reducing, float* shared_mem)
+{
+    // Max-reduction helper for the softmax qk max: combines `reducing` across warps through shared_mem[0 .. WPT-1] and returns lane 0's result.
+    constexpr int32_t WARP_SIZE = 32;
+    const int32_t lane_id = threadIdx.x % WARP_SIZE;
+    const int32_t warp_id = threadIdx.x / WARP_SIZE;
+    // Stage 1: XOR-shuffle butterfly over all 32 lanes (full mask) takes the maximum within each warp.
+# pragma unroll
+    for (int32_t mask = WARP_SIZE / 2; mask >= 1; mask /= 2) {
+        reducing = fmaxf(reducing, __shfl_xor_sync(uint32_t(-1), reducing, mask));
+    }
+    // Stage 2: lane 0 of each warp stores its warp maximum at shared_mem[warp_id]; the barrier makes the stores visible block-wide.
+    if (lane_id == 0) {
+        shared_mem[warp_id] = reducing;
+    }
+    __syncthreads();
+    // Stage 3: lanes below WPT reload one warp maximum each; the other lanes take -FLT_MAX so they cannot raise the maximum.
+    if (lane_id < WPT) reducing = shared_mem[lane_id];
+    else reducing = -FLT_MAX;
+    // NOTE(review): no barrier follows these reads; reusing the same shared_mem in an immediately following reduction may race -- confirm at call sites.
+# pragma unroll
+    for (int32_t mask = WPT / 2; mask >= 1; mask /= 2) {
+        reducing = fmaxf(reducing, __shfl_xor_sync(uint32_t(-1), reducing, mask));
+    }
+    // Broadcast lane 0's value to every lane of the warp. NOTE(review): the WPT/2 .. 1 butterfly presumably assumes WPT is a power of two -- confirm.
+    reducing = __shfl_sync(uint32_t(-1), reducing, 0);
+    return reducing;
+}
+
+template<int32_t WPT>
+__device__ inline
+float attn_block_reduce_sum(float reducing, float *shared_mem)
+{
+    // Sum-reduction helper for the softmax exp sum: combines `reducing` across warps through shared_mem[0 .. WPT-1] and returns lane 0's result.
+    constexpr int32_t WARP_SIZE = 32;
+    const int32_t lane_id = threadIdx.x % WARP_SIZE;
+    const int32_t warp_id = threadIdx.x / WARP_SIZE;
+    // Stage 1: XOR-shuffle butterfly over all 32 lanes (full mask) adds the values within each warp.
+# pragma unroll
+    for (int32_t mask = WARP_SIZE / 2; mask >= 1; mask /= 2) {
+        reducing += __shfl_xor_sync(uint32_t(-1), reducing, mask);
+    }
+    // Stage 2: lane 0 of each warp stores its warp sum at shared_mem[warp_id]; the barrier makes the stores visible block-wide.
+    if (lane_id == 0) shared_mem[warp_id] = reducing;
+    __syncthreads();
+    // Stage 3: lanes below WPT reload one warp sum each; lanes >= WPT keep their own warp sum (there is no else branch).
+    if (lane_id < WPT) reducing = shared_mem[lane_id];
+    // For power-of-two WPT the masks below only pair lane 0 with lanes inside [0, WPT), so stale lanes never reach lane 0. NOTE(review): confirm WPT is a power of two.
+# pragma unroll
+    for (int32_t mask = WPT / 2; mask >= 1; mask /= 2) {
+        reducing += __shfl_xor_sync(uint32_t(-1), reducing, mask);
+    }
+    reducing = __shfl_sync(uint32_t(-1), reducing, 0);
+    return reducing;
+}
+
+template<
+    int32_t HEAD_SIZE,                // head dimension handled by this instantiation
+    int32_t THREAD_GROUP_SIZE,        // how many threads inside a group (one group cooperates on one context token)
+    int32_t TPB,                      // threads per block
+    int32_t QUANT_GROUP,              // elements sharing one quantization scale; must be 8 (static_assert below)
+    typename T>
+__global__
+void dynamic_batching_flashdecoding_cache_attention_int8kv_kernel(
+    const int64_t seq_block_size,      // number of context tokens one block (blockIdx.z) processes
+
+    T* __restrict__ output_emb,        // partial attention output, addressed by (batch, head, seq_block, dim) strides
+    T* __restrict__ output_logexpsum,  // log(sum(exp)) + max of the block's logits, addressed by (batch, head, seq_block) strides
+    // Each block writes the softmax-normalised result of its own sequence block only.
+
+    const T* __restrict__ query,     // [seq_lens, num_heads..., head_size]
+    const int8_t* k_cache,                // [max_token, num_kv_heads, head_size]
+    const T* k_scale,                  // [max_token, num_kv_heads, head_size / quant_group(8)]
+    const int8_t* v_cache,                // [max_token, num_kv_heads, head_size]
+    const T* v_scale,                  // [max_token, num_kv_heads, head_size / quant_group(8)]
+
+    const float attn_scale,
+
+    const int64_t output_emb_stride_b,
+    const int64_t output_emb_stride_h,
+    const int64_t output_emb_stride_s,
+    const int64_t output_emb_stride_d,
+
+    const int64_t output_logexpsum_stride_b,
+    const int64_t output_logexpsum_stride_h,
+    const int64_t output_logexpsum_stride_s,
+
+    const int64_t query_stride_s,
+    const int64_t query_stride_h,
+
+    const int64_t kcache_stride_s,
+    const int64_t kcache_stride_h,
+
+    const int64_t vcache_stride_s,
+    const int64_t vcache_stride_h,
+
+    const int32_t * __restrict__ b_seq_len,
+    const int32_t * __restrict__ b_req_idx,
+    const int32_t * __restrict__ req_to_tokens,
+    const int64_t req_to_tokens_stride,
+    const int64_t max_len_in_batch,    // not read by the kernel; the launcher uses it to size gridDim.z
+    const int64_t gqa_group_size) {
+
+    /* --- Decoding Attention Kernel Implementation --- */
+    constexpr int64_t WARP_SIZE = 32;                              // warp size
+    constexpr int64_t WPT       = TPB / WARP_SIZE;                 // warps per thread block (TPB = threads per block)
+    constexpr int64_t GPW       = WARP_SIZE / THREAD_GROUP_SIZE;       // thread groups per warp
+    constexpr int64_t GPT       = WARP_SIZE / THREAD_GROUP_SIZE * WPT; // thread groups per thread block
+
+    // Grid layout: x = query head, y = batch entry, z = sequence block.
+    const int64_t head_idx      = blockIdx.x;
+    const int64_t batch_idx     = blockIdx.y;
+    const int64_t seq_block_idx = blockIdx.z;
+
+    const int64_t seq_len = b_seq_len[batch_idx];
+    const int64_t cur_req_idx = b_req_idx[batch_idx];
+    const int32_t * b_start_loc = req_to_tokens + cur_req_idx * req_to_tokens_stride + seq_block_idx * seq_block_size;
+
+    constexpr int64_t VEC_SIZE  = 16 / sizeof(T);  // elements of T in one 128-bit (16-byte) vector; 8 for a 16-bit T
+
+    // ------------------------------------------------ //
+    // Step 1. Load Q into Thread Reg.
+    constexpr int64_t VEC_LEN = (HEAD_SIZE / VEC_SIZE) / THREAD_GROUP_SIZE; // 128 / 8 / 8 = 2
+
+    static_assert((HEAD_SIZE / THREAD_GROUP_SIZE) % VEC_SIZE == 0);
+    static_assert(HEAD_SIZE % THREAD_GROUP_SIZE == 0);
+    static_assert(QUANT_GROUP == 8);
+
+    constexpr int64_t QUANT_GROUP_SHIFT = 3;  // log2(QUANT_GROUP): element index >> 3 gives the scale index
+
+    // The elements in Q, K, and V will be evenly distributed across each thread group.
+    T local_q[VEC_SIZE * VEC_LEN]; // 2 * 8
+
+    const int64_t warp_id       = threadIdx.x / WARP_SIZE;
+    const int64_t warp_lane_id  = threadIdx.x % WARP_SIZE;
+    const int64_t group_id      = warp_lane_id / THREAD_GROUP_SIZE;
+    const int64_t group_lane_id = warp_lane_id % THREAD_GROUP_SIZE;
+    const int64_t kv_head_idx     = head_idx / gqa_group_size;
+
+    if (seq_len <= seq_block_idx * seq_block_size) {
+        return;  // nothing to do for this sequence block; the condition is block-uniform, so no barrier below is partially skipped
+    }
+    const int64_t context_len = min(seq_len - seq_block_idx * seq_block_size, seq_block_size);
+
+    #pragma unroll
+    for (int64_t i = 0; i < VEC_LEN; i++) {
+        // copy 128(16 * 8) bits from Q to Local Q
+
+        // Each thread reads vectors interleaved across its group (stride THREAD_GROUP_SIZE * VEC_SIZE); contiguous per-thread reads were not evaluated.
+        copy<sizeof(T) * VEC_SIZE>(
+            &query[
+                batch_idx * query_stride_s +
+                head_idx * query_stride_h +
+                (group_lane_id + i * THREAD_GROUP_SIZE) * VEC_SIZE
+            ],
+            &local_q[i * VEC_SIZE]);
+    }
+    // ------------------------------------------------ //
+    // Step 2. Solve QK Dot
+
+    extern __shared__ float logits[];  // dynamic shared memory: first the logits of this block, later reused as the output accumulator
+    float qk_max = -FLT_MAX;
+
+    for (int64_t base_id = warp_id * GPW; base_id < context_len; base_id += GPT) {
+        int8_t local_k_quant[VEC_SIZE * VEC_LEN];
+        T local_k[VEC_SIZE * VEC_LEN];
+        T local_k_scale[VEC_LEN];
+        const int64_t context_id = base_id + group_id;
+        const int64_t mem_context_id = (context_id < context_len) ? *(b_start_loc + context_id) : 0;  // guarded: never read req_to_tokens past the valid tokens
+
+        // all thread groups within a warp must be launched together.
+        if (context_id >= context_len){
+            memset(local_k, 0, sizeof(local_k));  // padding group: zero K so the full-warp shuffle in the dot product still runs
+        } else {
+            const int64_t key_offset
+                            = (mem_context_id) * kcache_stride_s
+                            + kv_head_idx * kcache_stride_h
+                            + group_lane_id * VEC_SIZE;
+            #pragma unroll
+            for (int64_t i = 0; i < VEC_LEN; i++) {
+                // copy VEC_SIZE int8 values (VEC_SIZE bytes) from K to Local K
+                const int64_t key_idx = key_offset + i * THREAD_GROUP_SIZE * VEC_SIZE;
+                copy<sizeof(int8_t) * VEC_SIZE>(&k_cache[key_idx],  &local_k_quant[i * VEC_SIZE]);
+
+                const int64_t key_scale_idx = key_idx >> QUANT_GROUP_SHIFT;  // NOTE(review): assumes k_scale strides equal the k_cache strides / 8 - confirm
+                local_k_scale[i] = k_scale[key_scale_idx];
+            }
+
+            #pragma unroll
+            for (int64_t i = 0; i < VEC_LEN; i++) {
+                #pragma unroll
+                for (int64_t j = 0; j < VEC_SIZE; j++) {
+                    local_k[i * VEC_SIZE + j]
+                        = local_k_scale[i] * (T)local_k_quant[i * VEC_SIZE + j];
+                }
+            }
+        }
+
+        // Ready for QK Dot
+        const float qk_dot
+            = attn_scale
+            * attn_thread_group_dot<THREAD_GROUP_SIZE, VEC_LEN * VEC_SIZE>(local_q, local_k);
+
+        if (group_lane_id == 0 && context_id < context_len) {
+            logits[context_id] = qk_dot;
+            qk_max = fmaxf(qk_dot, qk_max);
+        }
+    }
+
+    // ------------------------------------------------ //
+    // Step 3. Softmax
+
+    __shared__ float red_smem[WPT];
+
+    qk_max = attn_block_reduce_max<WPT>(qk_max, red_smem);
+    __syncthreads();  // red_smem is reused by the sum reduction below: every warp must have read the max out of it first
+    float exp_sum = 0.0f;
+    for (int64_t context_id = threadIdx.x; context_id < context_len; context_id += TPB){
+        logits[context_id] -= qk_max;
+        logits[context_id] = exp(logits[context_id]);
+        exp_sum += logits[context_id];
+    }
+
+    static_assert(WPT == 2 || WPT == 4 || WPT == 8 || WPT == 16 || WPT == 32 || WPT == 64);
+    exp_sum = attn_block_reduce_sum<WPT>(exp_sum, red_smem);
+
+    const float inv_sum = __fdividef(1.f, exp_sum + 1e-6f);
+    for (int64_t context_id = threadIdx.x; context_id < context_len; context_id += TPB) {
+        logits[context_id] *= inv_sum;
+    }
+    __syncthreads(); // Must have this: Step 4 reads logits written by other threads.
+
+    // ------------------------------------------------ //
+    // Step 4. Solve logits * V
+
+    int8_t local_v_quant[VEC_SIZE * VEC_LEN];
+    float local_v[VEC_SIZE * VEC_LEN];
+    T local_v_scale[VEC_LEN];
+
+    #pragma unroll
+    for(int32_t i = 0; i < VEC_SIZE * VEC_LEN; i++) {
+        local_v[i] = 0;
+    }
+
+    for (int64_t base_id = warp_id * GPW; base_id < context_len; base_id += GPT) {
+        const int64_t context_id = base_id + group_id;
+        const int64_t mem_context_id = (context_id < context_len) ? *(b_start_loc + context_id) : 0;  // guarded: never read req_to_tokens past the valid tokens
+        // all thread groups within a warp must be launched together.
+        if (context_id < context_len){
+            const int64_t value_offset
+                            = (mem_context_id) * vcache_stride_s
+                            + kv_head_idx * vcache_stride_h
+                            + group_lane_id * VEC_SIZE;
+            #pragma unroll
+            for (int64_t i = 0; i < VEC_LEN; i++) {
+                // copy VEC_SIZE int8 values (VEC_SIZE bytes) from V to Local V
+                const int64_t value_idx = value_offset + i * THREAD_GROUP_SIZE * VEC_SIZE;
+                copy<sizeof(int8_t) * VEC_SIZE>(&v_cache[value_idx],  &local_v_quant[i * VEC_SIZE]);
+
+                const int64_t value_scale_idx = value_idx >> QUANT_GROUP_SHIFT;  // NOTE(review): assumes v_scale strides equal the v_cache strides / 8 - confirm
+                local_v_scale[i] = v_scale[value_scale_idx];
+            }
+
+            #pragma unroll
+            for (int64_t i = 0; i < VEC_LEN; i++) {
+                #pragma unroll
+                for (int64_t j = 0; j < VEC_SIZE; j++) {
+                    local_v[i * VEC_SIZE + j] += (tofloat(local_v_scale[i])
+                                                * (float)local_v_quant[i * VEC_SIZE + j]
+                                                * logits[context_id]);
+                }
+            }
+        }
+    }
+
+    #pragma unroll
+    for (int32_t i = 0; i < VEC_SIZE * VEC_LEN; i++) {
+        #pragma unroll
+        for (int32_t mask = THREAD_GROUP_SIZE; mask <= WARP_SIZE >> 1; mask = mask << 1) {
+            local_v[i] += __shfl_xor_sync(uint32_t(-1), local_v[i], mask);  // sum the same head slice across the groups of a warp
+        }
+    }
+
+    __syncthreads();  // all reads of the softmax weights are done before logits is overwritten
+
+    // Reuse the logits buffer as a HEAD_SIZE-wide float accumulator for the output.
+    for (int64_t i = threadIdx.x; i < HEAD_SIZE; i += TPB){
+        logits[i] = 0;
+    }
+
+    __syncthreads();
+
+    if (warp_lane_id < THREAD_GROUP_SIZE) {  // first group of each warp holds that warp's totals
+        #pragma unroll
+        for (int32_t i = 0; i < VEC_LEN; i++) {
+            #pragma unroll
+            for (int32_t j = 0; j < VEC_SIZE; j++) {
+                atomicAdd(
+                    logits + i * THREAD_GROUP_SIZE * VEC_SIZE + warp_lane_id * VEC_SIZE + j,
+                    local_v[i * VEC_SIZE + j]
+                );
+            }
+        }
+    }
+
+    __syncthreads();
+
+    for (int64_t i = threadIdx.x; i < HEAD_SIZE; i += TPB) {
+        output_emb[batch_idx * output_emb_stride_b + head_idx * output_emb_stride_h + seq_block_idx * output_emb_stride_s + i * output_emb_stride_d] = logits[i];
+    }
+
+    output_logexpsum[batch_idx * output_logexpsum_stride_b + head_idx * output_logexpsum_stride_h + seq_block_idx * output_logexpsum_stride_s] = logf(exp_sum) + qk_max;
+}
+
+
+template<typename T>  // Host-side launcher: picks the kernel instantiation for head_dim and launches it on the current CUDA stream.
+void run_group_int8kv_decode_flashattention_kernel(
+    const int64_t seq_block_size,
+    T* __restrict__ output_emb,
+    T* __restrict__ output_logexpsum,
+    const T* __restrict__ query,
+    const int8_t* k_cache,
+    const T* k_scale,
+    const int8_t* v_cache,
+    const T* v_scale,
+    const float attn_scale,
+
+    const int64_t output_emb_stride_b,
+    const int64_t output_emb_stride_h,
+    const int64_t output_emb_stride_s,
+    const int64_t output_emb_stride_d,
+
+    const int64_t output_logexpsum_stride_b,
+    const int64_t output_logexpsum_stride_h,
+    const int64_t output_logexpsum_stride_s,
+
+    const int64_t query_stride_s,
+    const int64_t query_stride_h,
+    const int64_t kcache_stride_s,
+    const int64_t kcache_stride_h,
+    const int64_t vcache_stride_s,
+    const int64_t vcache_stride_h,
+    const int32_t * __restrict__ b_seq_len,
+    const int32_t * __restrict__ b_req_idx,
+    const int32_t * __restrict__ req_to_tokens,
+    const int64_t req_to_tokens_stride,
+    const int64_t max_len_in_batch,
+
+    const int64_t batch_size,
+    const int64_t q_head_num,
+    const int64_t head_dim,
+    const int64_t gqa_group_size) {
+    // Unsupported configurations raise via TORCH_CHECK: plain assert() is compiled out by the -DNDEBUG build flag.
+    constexpr int64_t WARP_SIZE = 32;
+    constexpr int64_t TPB = 256;
+    constexpr int64_t MAX_SHM_SIZE = 48 * 1024;
+    // Shared memory per block: the kernel's static red_smem plus the dynamic logits buffer (also reused as the output accumulator).
+    constexpr int64_t reduce_shm_size = TPB / WARP_SIZE * sizeof(float);
+    const int64_t logits_size = max(seq_block_size * sizeof(float), head_dim * sizeof(float));
+    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+    // Grid: x = query head, y = batch entry, z = sequence block (ceil(max_len_in_batch / seq_block_size)).
+    if (reduce_shm_size + logits_size <= MAX_SHM_SIZE) {
+        const dim3 grid_size = {static_cast<unsigned int>(q_head_num), static_cast<unsigned int>(batch_size), static_cast<unsigned int>((max_len_in_batch + seq_block_size - 1) / seq_block_size)};
+        switch (head_dim){
+            case 64:
+                dynamic_batching_flashdecoding_cache_attention_int8kv_kernel<64, 4, 256, 8>
+                <<<grid_size, 256, logits_size, stream>>>
+                (
+                    seq_block_size,
+                    output_emb,
+                    output_logexpsum,
+                    query, k_cache, k_scale, v_cache, v_scale,
+                    attn_scale,
+                    output_emb_stride_b,
+                    output_emb_stride_h,
+                    output_emb_stride_s,
+                    output_emb_stride_d,
+                    output_logexpsum_stride_b,
+                    output_logexpsum_stride_h,
+                    output_logexpsum_stride_s,
+                    query_stride_s, query_stride_h,
+                    kcache_stride_s, kcache_stride_h,
+                    vcache_stride_s, vcache_stride_h,
+                    b_seq_len, b_req_idx, req_to_tokens,
+                    req_to_tokens_stride,
+                    max_len_in_batch,
+                    gqa_group_size
+                );
+                break;
+            case 96:
+                dynamic_batching_flashdecoding_cache_attention_int8kv_kernel<96, 4, 256, 8>
+                <<<grid_size, 256, logits_size, stream>>>
+                (
+                    seq_block_size,
+                    output_emb,
+                    output_logexpsum,
+                    query, k_cache, k_scale, v_cache, v_scale,
+                    attn_scale,
+                    output_emb_stride_b,
+                    output_emb_stride_h,
+                    output_emb_stride_s,
+                    output_emb_stride_d,
+                    output_logexpsum_stride_b,
+                    output_logexpsum_stride_h,
+                    output_logexpsum_stride_s,
+                    query_stride_s, query_stride_h,
+                    kcache_stride_s, kcache_stride_h,
+                    vcache_stride_s, vcache_stride_h,
+                    b_seq_len, b_req_idx, req_to_tokens,
+                    req_to_tokens_stride,
+                    max_len_in_batch,
+                    gqa_group_size
+                );
+                break;
+            case 128:
+                dynamic_batching_flashdecoding_cache_attention_int8kv_kernel<128, 8, 256, 8>
+                <<<grid_size, 256, logits_size, stream>>>
+                (
+                    seq_block_size,
+                    output_emb,
+                    output_logexpsum,
+                    query, k_cache, k_scale, v_cache, v_scale,
+                    attn_scale,
+                    output_emb_stride_b,
+                    output_emb_stride_h,
+                    output_emb_stride_s,
+                    output_emb_stride_d,
+                    output_logexpsum_stride_b,
+                    output_logexpsum_stride_h,
+                    output_logexpsum_stride_s,
+                    query_stride_s, query_stride_h,
+                    kcache_stride_s, kcache_stride_h,
+                    vcache_stride_s, vcache_stride_h,
+                    b_seq_len, b_req_idx, req_to_tokens,
+                    req_to_tokens_stride,
+                    max_len_in_batch,
+                    gqa_group_size
+                );
+                break;
+            case 256:
+                dynamic_batching_flashdecoding_cache_attention_int8kv_kernel<256, 16, 256, 8>
+                <<<grid_size, 256, logits_size, stream>>>
+                (
+                    seq_block_size,
+                    output_emb,
+                    output_logexpsum,
+                    query, k_cache, k_scale, v_cache, v_scale,
+                    attn_scale,
+                    output_emb_stride_b,
+                    output_emb_stride_h,
+                    output_emb_stride_s,
+                    output_emb_stride_d,
+                    output_logexpsum_stride_b,
+                    output_logexpsum_stride_h,
+                    output_logexpsum_stride_s,
+                    query_stride_s, query_stride_h,
+                    kcache_stride_s, kcache_stride_h,
+                    vcache_stride_s, vcache_stride_h,
+                    b_seq_len, b_req_idx, req_to_tokens,
+                    req_to_tokens_stride,
+                    max_len_in_batch,
+                    gqa_group_size
+                );
+                break;
+            default:
+                TORCH_CHECK(false, "group_int8kv_flashdecoding_attention: unsupported head_dim ", head_dim, " (supported: 64, 96, 128, 256)");
+        }
+    } else {
+        TORCH_CHECK(false, "group_int8kv_flashdecoding_attention: needs ", reduce_shm_size + logits_size, " bytes of shared memory per block, limit is ", MAX_SHM_SIZE, "; use a smaller seq_block_size");
+    }
+}
+
+void group_int8kv_flashdecoding_attention(const int seq_block_size, at::Tensor mid_o_emb, at::Tensor mid_o_logexpsum, float att_scale, at::Tensor q, at::Tensor k, at::Tensor k_s,  at::Tensor v,  at::Tensor v_s, at::Tensor req_to_tokens, at::Tensor b_req_idx, at::Tensor b_seq_len, int max_len_in_batch) {  // derives sizes/strides from the tensors and dispatches on q's dtype
+    int64_t batch_size = b_seq_len.sizes()[0];
+    int64_t head_num = q.sizes()[1];
+    int64_t head_dim = q.sizes()[2]; // q shape [batchsize, head_num, head_dim]
+    int64_t kv_head_num = k.sizes()[1];
+    TORCH_CHECK(kv_head_num > 0 && head_num % kv_head_num == 0, "q head count (", head_num, ") must be a positive multiple of kv head count (", kv_head_num, ")");  // assert() is a no-op under -DNDEBUG
+    int64_t gqa_group_size = head_num / kv_head_num;
+    TORCH_CHECK(seq_block_size > 0, "seq_block_size must be positive, got ", seq_block_size);  // used as a divisor when sizing the grid
+    LIGHT_DISPATCH_FLOATING_TYPES(q.scalar_type(), "group_int8kv_flashdecoding_attention", ([&] {
+        run_group_int8kv_decode_flashattention_kernel<scalar_t>(
+            seq_block_size,
+            mid_o_emb.data_ptr<scalar_t>(),
+            mid_o_logexpsum.data_ptr<scalar_t>(),
+            q.data_ptr<scalar_t>(),
+            k.data_ptr<int8_t>(), k_s.data_ptr<scalar_t>(),
+            v.data_ptr<int8_t>(), v_s.data_ptr<scalar_t>(),
+            att_scale,
+            // Strides below are passed in elements, exactly as Tensor::stride() reports them.
+            mid_o_emb.stride(0),
+            mid_o_emb.stride(1),
+            mid_o_emb.stride(2),
+            mid_o_emb.stride(3),
+            mid_o_logexpsum.stride(0),
+            mid_o_logexpsum.stride(1),
+            mid_o_logexpsum.stride(2),
+
+            q.stride(0),
+            q.stride(1),
+            k.stride(0),
+            k.stride(1),
+            v.stride(0),
+            v.stride(1),
+            b_seq_len.data_ptr<int32_t>(),
+            b_req_idx.data_ptr<int32_t>(),
+            req_to_tokens.data_ptr<int32_t>(),
+            req_to_tokens.stride(0),
+            max_len_in_batch,
+            batch_size,
+            head_num,
+            head_dim,
+            gqa_group_size
+        );
+    }));
+
+}
+
+void group_int8kv_flashdecoding_attention(  // int64_t / fp32_t overload: forwards every argument to the int-typed overload defined above
+    const int64_t seq_block_size, 
+    torch::Tensor mid_o_emb, 
+    torch::Tensor mid_o_logexpsum, 
+    fp32_t att_scale, 
+    torch::Tensor q, 
+    torch::Tensor k, 
+    torch::Tensor k_s,  
+    torch::Tensor v,  
+    torch::Tensor v_s, 
+    torch::Tensor req_to_tokens, 
+    torch::Tensor b_req_idx, 
+    torch::Tensor b_seq_len, 
+    int64_t max_len_in_batch)
+{
+    group_int8kv_flashdecoding_attention(
+        static_cast<int>(seq_block_size), // narrowed to int; NOTE(review): values above INT_MAX would not survive this cast
+        mid_o_emb, 
+        mid_o_logexpsum, 
+        att_scale, 
+        q, 
+        k, 
+        k_s, 
+        v, 
+        v_s, 
+        req_to_tokens, 
+        b_req_idx, 
+        b_seq_len, 
+        static_cast<int>(max_len_in_batch)  // narrowed to int, same caveat as seq_block_size
+    );
+}
+
+}
+}
\ No newline at end of file
diff --git a/lightllm-kernel/csrc/ops_bindings.cpp b/lightllm-kernel/csrc/ops_bindings.cpp
index 96672d2e8..308a9466d 100644
--- a/lightllm-kernel/csrc/ops_bindings.cpp
+++ b/lightllm-kernel/csrc/ops_bindings.cpp
@@ -16,6 +16,7 @@ PYBIND11_MODULE(_C, m) {
     m.def("cutlass_scaled_mm", &cutlass_scaled_mm, "CUTLASS SCALED MM (CUDA)");
     m.def("all_gather", &all_gather, "ALL GATHER (CUDA)");
     m.def("meta_size", &lightllm::ops::meta_size, "Size (in bytes) of vllm::Signal metadata");
+    m.def("group8_int8kv_flashdecoding_stage1", &group_int8kv_flashdecoding_attention, "decode attention");
 }
 
 } // namespace ops
diff --git a/lightllm-kernel/include/ops_common.h b/lightllm-kernel/include/ops_common.h
index 7087f086c..c5d6c50ec 100644
--- a/lightllm-kernel/include/ops_common.h
+++ b/lightllm-kernel/include/ops_common.h
@@ -73,5 +73,20 @@ void all_gather(
     int64_t reg_buffer_sz_bytes
 );
 
+void group_int8kv_flashdecoding_attention(
+    const int64_t seq_block_size, 
+    Tensor mid_o_emb, 
+    Tensor mid_o_logexpsum, 
+    fp32_t att_scale, 
+    Tensor q, 
+    Tensor k, 
+    Tensor k_s,  
+    Tensor v,  
+    Tensor v_s, 
+    Tensor req_to_tokens, 
+    Tensor b_req_idx, 
+    Tensor b_seq_len, 
+    int64_t max_len_in_batch);
+
 } // namespace ops
 } // namespace lightllm
\ No newline at end of file
diff --git a/lightllm-kernel/lightllm_kernel/ops/__init__.py b/lightllm-kernel/lightllm_kernel/ops/__init__.py
index 790e08c62..03dd40bb5 100644
--- a/lightllm-kernel/lightllm_kernel/ops/__init__.py
+++ b/lightllm-kernel/lightllm_kernel/ops/__init__.py
@@ -72,6 +72,7 @@
 from .quant import per_token_quant_bf16_fp8
 from .gemm import cutlass_scaled_mm_bias_ls
 from .moe import all_gather, grouped_topk
+from .attention import group8_int8kv_flashdecoding_stage1
 
 __all__ = [
     "rmsnorm_bf16",
@@ -83,4 +84,5 @@
     "cutlass_scaled_mm_bias_ls",
     "grouped_topk",
     "meta_size",
+    "group8_int8kv_flashdecoding_stage1",
 ]
diff --git a/lightllm-kernel/lightllm_kernel/ops/attention.py b/lightllm-kernel/lightllm_kernel/ops/attention.py
index e69de29bb..8cff1730e 100644
--- a/lightllm-kernel/lightllm_kernel/ops/attention.py
+++ b/lightllm-kernel/lightllm_kernel/ops/attention.py
@@ -0,0 +1,36 @@
+import torch
+from typing import Optional, Tuple
+from . import _C
+
+
+def group8_int8kv_flashdecoding_stage1(
+    seq_block_size: int,
+    mid_o_emb: torch.Tensor,
+    mid_o_logexpsum: torch.Tensor,
+    att_scale: float,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    k_s: torch.Tensor,
+    v: torch.Tensor,
+    v_s: torch.Tensor,
+    req_to_tokens: torch.Tensor,
+    b_req_idx: torch.Tensor,
+    b_seq_len: torch.Tensor,
+    max_len_in_batch: int,
+) -> None:
+    """Flash-decoding stage 1 over an int8 KV cache (k/v with scales k_s/v_s); forwards all arguments to the `_C` op, which fills mid_o_emb and mid_o_logexpsum."""
+    return _C.group8_int8kv_flashdecoding_stage1(
+        seq_block_size,
+        mid_o_emb,
+        mid_o_logexpsum,
+        att_scale,
+        q,
+        k,
+        k_s,
+        v,
+        v_s,
+        req_to_tokens,
+        b_req_idx,
+        b_seq_len,
+        max_len_in_batch,
+    )
diff --git a/lightllm-kernel/setup.py b/lightllm-kernel/setup.py
index 338089952..da6b69000 100644
--- a/lightllm-kernel/setup.py
+++ b/lightllm-kernel/setup.py
@@ -43,6 +43,16 @@
                 "-DNDEBUG",
                 "-O3",
                 "--use_fast_math",
+                # A100 (compute_80)
+                "-gencode=arch=compute_80,code=sm_80",
+                "-gencode=arch=compute_80,code=compute_80",
+                # A10 / other Ampere (compute_86)
+                "-gencode=arch=compute_86,code=sm_86",
+                "-gencode=arch=compute_86,code=compute_86",
+                # L40s / 4090 (compute_89)
+                "-gencode=arch=compute_89,code=sm_89",
+                "-gencode=arch=compute_89,code=compute_89",
+                # H100 (compute_90)
                 "-gencode=arch=compute_90,code=sm_90",
                 "-gencode=arch=compute_90,code=compute_90",
                 "-gencode=arch=compute_90a, code=sm_90a",

From 30d3eab329ff734f9e6c644b3017f19795d4b1d6 Mon Sep 17 00:00:00 2001
From: wangzaijun <wzjhelloworld@qq.com>
Date: Wed, 28 May 2025 20:32:55 +0800
Subject: [PATCH 07/14] 0528-2

---
 lightllm-kernel/Makefile                      |   4 +-
 .../csrc/attention/decode_attention_kernel.cu | 569 ++++++++++++++++++
 lightllm-kernel/csrc/ops_bindings.cpp         |   3 +-
 lightllm-kernel/include/ops_common.h          |  12 +
 .../lightllm_kernel/ops/__init__.py           |   3 +-
 .../lightllm_kernel/ops/attention.py          |  29 +-
 lightllm/common/quantization/w8a8_quant.py    |   6 +-
 .../ppl_int8kv_flash_decoding.py              |   4 +-
 lightllm/utils/light_utils.py                 |   2 +-
 9 files changed, 620 insertions(+), 12 deletions(-)
 create mode 100644 lightllm-kernel/csrc/attention/decode_attention_kernel.cu

diff --git a/lightllm-kernel/Makefile b/lightllm-kernel/Makefile
index c21cd0879..5b7100bb6 100644
--- a/lightllm-kernel/Makefile
+++ b/lightllm-kernel/Makefile
@@ -2,10 +2,10 @@
 
 SUBMODULE_DIR = third-party/cutlass
 
-$(SUBMODULE_DIR)/.git:
+submodule:
 	git submodule update --init --recursive
 
-build:
+build: submodule
 	# 8.0-> A100, 8.6-> A10, 8.9-> L40s/4090, 9.0+PTX-> Hopper
 	TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0+PTX" \
 	python -m pip install -v .
diff --git a/lightllm-kernel/csrc/attention/decode_attention_kernel.cu b/lightllm-kernel/csrc/attention/decode_attention_kernel.cu
new file mode 100644
index 000000000..3fd4ce336
--- /dev/null
+++ b/lightllm-kernel/csrc/attention/decode_attention_kernel.cu
@@ -0,0 +1,569 @@
+#include <cuda_fp16.h>
+#include <float.h> // need for FLT_MAX
+#include <math.h>
+#include <memory>
+#include <assert.h>
+#include "ops_common.h"
+#include <torch/extension.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+
+namespace lightllm {
+namespace ops {
+
+// Dtype dispatch helpers. <torch/extension.h> is included at file scope above; it must not be re-included inside this namespace.
+#define LIGHT_DISPATCH_CASE_FLOATING_TYPES(...)              \
+  AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)       \
+  AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
+// LIGHT_DISPATCH_FLOATING_TYPES(TYPE, NAME, body): runs body with scalar_t bound for Half or BFloat16 inputs.
+#define LIGHT_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...)             \
+  AT_DISPATCH_SWITCH(                                             \
+    TYPE, NAME, LIGHT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
+
+template <typename T>
+__device__ inline float tofloat(T value) {  // generic fallback: converts with a plain static_cast
+    return static_cast<float>(value);
+}
+
+// Specialization for __half: converts through the __half2float intrinsic
+template <>
+__device__ inline float tofloat<__half>(__half value) {
+    return __half2float(value);
+}
+
+// Specialization for __nv_bfloat16: converts through the __bfloat162float intrinsic
+template <>
+__device__ inline float tofloat<__nv_bfloat16>(__nv_bfloat16 value) {
+    return __bfloat162float(value);
+}
+
+template <int VPT>
+struct BytesToType;  // maps a byte count to a type of that size; primary template is left undefined, so only 2, 4, 8 and 16 are usable
+
+template <>
+struct BytesToType<2>
+{
+    using type = uint16_t;
+};
+template <>
+struct BytesToType<4>
+{
+    using type = uint32_t;
+};
+template <>
+struct BytesToType<8>
+{
+    using type = uint64_t;
+};
+template <>
+struct BytesToType<16>
+{
+    using type = float4;  // CUDA vector type of four floats (16 bytes)
+};
+
+template <int Bytes>
+__device__ inline void copy(const void* local, void* data)  // copies Bytes bytes FROM `local` (source) TO `data` (destination), despite the names
+{
+    using T = typename BytesToType<Bytes>::type;  // a single object whose size is Bytes
+
+    const T* in = static_cast<const T*>(local);
+    T* out = static_cast<T*>(data);
+    *out = *in;  // one typed load + store; NOTE(review): presumably both pointers must be aligned for T - confirm at the call sites
+}
+
+template<int32_t THREAD_GROUP_SIZE, int32_t ELEMENT_NUM, typename T>
+__device__ inline
+float attn_thread_group_dot(T* local_q, T* local_k)
+{
+    // QK dot helper: each thread multiplies its ELEMENT_NUM q/k elements in float, then the partial sums are added across the thread group.
+    // [TODO] It should be optimized by type fp32x4.
+
+    float qk = 0.0f;
+# pragma unroll
+    for(int32_t i = 0; i < ELEMENT_NUM; i++) {
+        qk += tofloat(local_q[i]) * tofloat(local_k[i]);
+    }
+#pragma unroll
+    for (int32_t mask = THREAD_GROUP_SIZE / 2; mask >= 1; mask /= 2) {
+        qk += __shfl_xor_sync(uint32_t(-1), qk, mask);  // butterfly over masks < THREAD_GROUP_SIZE; full-warp mask, so all 32 lanes must call this
+    }
+    return qk;  // NOTE(review): every lane of the group holds the group total, presumably assuming THREAD_GROUP_SIZE is a power of two - confirm
+}
+
+template<int32_t WPT>
+__device__ inline
+float attn_block_reduce_max(float reducing, float* shared_mem)
+{
+    // Block-wide maximum of `reducing` (used for the softmax qk max); every thread returns the result. shared_mem is indexed by warp id.
+    constexpr int32_t WARP_SIZE = 32;
+    const int32_t lane_id = threadIdx.x % WARP_SIZE;
+    const int32_t warp_id = threadIdx.x / WARP_SIZE;
+
+# pragma unroll
+    for (int32_t mask = WARP_SIZE / 2; mask >= 1; mask /= 2) {
+        reducing = fmaxf(reducing, __shfl_xor_sync(uint32_t(-1), reducing, mask));  // stage 1: butterfly max inside each warp
+    }
+
+    if (lane_id == 0) {
+        shared_mem[warp_id] = reducing;  // one partial max per warp; NOTE(review): presumably blockDim.x == WPT * 32 - confirm
+    }
+    __syncthreads();  // NOTE(review): nothing follows the reads below, so a caller reusing shared_mem afterwards needs its own barrier
+
+    if (lane_id < WPT) reducing = shared_mem[lane_id];
+    else reducing = -FLT_MAX;  // neutral element for lanes without a per-warp partial
+
+# pragma unroll
+    for (int32_t mask = WPT / 2; mask >= 1; mask /= 2) {
+        reducing = fmaxf(reducing, __shfl_xor_sync(uint32_t(-1), reducing, mask));  // stage 2: max over the per-warp partials
+    }
+
+    reducing = __shfl_sync(uint32_t(-1), reducing, 0);  // broadcast lane 0's result to the whole warp
+    return reducing;
+}
+
+template<int32_t WPT>
+__device__ inline
+float attn_block_reduce_sum(float reducing, float *shared_mem)
+{
+    // Block-wide sum of `reducing` (used for the softmax exp sum); every thread returns the total. shared_mem is indexed by warp id.
+    constexpr int32_t WARP_SIZE = 32;
+    const int32_t lane_id = threadIdx.x % WARP_SIZE;
+    const int32_t warp_id = threadIdx.x / WARP_SIZE;
+
+# pragma unroll
+    for (int32_t mask = WARP_SIZE / 2; mask >= 1; mask /= 2) {
+        reducing += __shfl_xor_sync(uint32_t(-1), reducing, mask);  // stage 1: butterfly sum inside each warp
+    }
+
+    if (lane_id == 0) shared_mem[warp_id] = reducing;  // one partial sum per warp; NOTE(review): presumably blockDim.x == WPT * 32 - confirm
+    __syncthreads();  // NOTE(review): no barrier precedes the write above, so reusing shared_mem right after another reduction needs one at the call site
+
+    if (lane_id < WPT) reducing = shared_mem[lane_id];  // lanes >= WPT keep a stale value; masks below are < WPT, so lane 0 never mixes with them
+
+# pragma unroll
+    for (int32_t mask = WPT / 2; mask >= 1; mask /= 2) {
+        reducing += __shfl_xor_sync(uint32_t(-1), reducing, mask);  // stage 2: sum the per-warp partials
+    }
+    reducing = __shfl_sync(uint32_t(-1), reducing, 0);  // broadcast lane 0's total to the whole warp
+    return reducing;
+}
+
+template<
+    int32_t HEAD_SIZE,
+    int32_t THREAD_GROUP_SIZE,        // how many threads inside a group
+    int32_t TPB,
+    int32_t QUANT_GROUP,
+    typename T>
+__global__
+void dynamic_batching_decoding_cache_attention_fp16_kernel(
+    T* __restrict__ output,          // [context_lens, num_heads..., head_size]
+    
+    const T* __restrict__ query,     // [seq_lens, num_heads..., head_size]
+    const int8_t* k_cache,                // [max_token, num_kv_heads, head_size]
+    const T* k_scale,                  // [max_token, num_kv_heads, head_size / quant_group(8)]
+    const int8_t* v_cache,                // [max_token, num_kv_heads, head_size]
+    const T* v_scale,                  // [max_token, num_kv_heads, head_size / quant_group(8)]
+
+    const float attn_scale,
+
+    const int64_t output_stride_s,
+    const int64_t output_stride_h,
+
+    const int64_t query_stride_s,
+    const int64_t query_stride_h,
+
+    const int64_t kcache_stride_s,
+    const int64_t kcache_stride_h,
+
+    const int64_t vcache_stride_s,
+    const int64_t vcache_stride_h,
+
+    const int32_t * __restrict__ b_seq_len,
+    const int32_t * __restrict__ b_req_idx,
+    const int32_t * __restrict__ req_to_tokens,
+    const int64_t req_to_tokens_stride,
+    const int64_t max_len_in_batch,
+    const int64_t gqa_group_size) { 
+
+    /* --- Decoding Attention Kernel Implementation --- */
+    constexpr int64_t WARP_SIZE = 32;                              // warp size
+    constexpr int64_t WPT       = TPB / WARP_SIZE;                 // warp per thread block， TPB for Thread per block 4, block_size
+    constexpr int64_t GPW       = WARP_SIZE / THREAD_GROUP_SIZE;       // thread group per warp 4
+    constexpr int64_t GPT       = WARP_SIZE / THREAD_GROUP_SIZE * WPT; // thread group per thread block 16
+
+    // const int64_t num_heads     = gridDim.x;
+    const int64_t head_idx      = blockIdx.x;
+    const int64_t batch_idx     = blockIdx.y;
+
+    const int64_t seq_len = b_seq_len[batch_idx];
+    const int64_t cur_req_idx = b_req_idx[batch_idx];
+    const int32_t * b_start_loc = req_to_tokens + cur_req_idx * req_to_tokens_stride;
+
+    constexpr int64_t VEC_SIZE  = 16 / sizeof(T);  // 128 bits, 这个是 cuda 能操作的最大的一个单位的数吧，8
+
+    // ------------------------------------------------ //
+    // Step 1. Load Q into Thread Reg.
+    constexpr int64_t VEC_LEN = (HEAD_SIZE / VEC_SIZE) / THREAD_GROUP_SIZE; // 128 / 8 / 8 = 2
+
+    static_assert((HEAD_SIZE / THREAD_GROUP_SIZE) % VEC_SIZE == 0);
+    static_assert(HEAD_SIZE % THREAD_GROUP_SIZE == 0);
+    static_assert(QUANT_GROUP == 8);
+
+    constexpr int64_t QUANT_GROUP_SHIFT = 3;
+
+    // The elements in Q, K, and V will be evenly distributed across each thread group.
+    T local_q[VEC_SIZE * VEC_LEN]; // 2 * 8
+
+    const int64_t warp_id       = threadIdx.x / WARP_SIZE;
+    const int64_t warp_lane_id  = threadIdx.x % WARP_SIZE;
+    const int64_t group_id      = warp_lane_id / THREAD_GROUP_SIZE;
+    const int64_t group_lane_id = warp_lane_id % THREAD_GROUP_SIZE;
+    const int64_t kv_head_idx     = head_idx / gqa_group_size;
+
+    #pragma unroll
+    for (int64_t i = 0; i < VEC_LEN; i++) {
+        // copy 128(16 * 8) bits from Q to Local Q
+
+        // 这个地方是错开间隔读取的，不知道如果设置成为连续位置读取会不会一样呢？
+        copy<sizeof(T) * VEC_SIZE>(
+            &query[
+                batch_idx * query_stride_s +
+                head_idx * query_stride_h +
+                (group_lane_id + i * THREAD_GROUP_SIZE) * VEC_SIZE
+            ],
+            &local_q[i * VEC_SIZE]);
+    }
+    // ------------------------------------------------ //
+    // Step 2. Solve QK Dot
+
+    const int64_t context_len = seq_len;
+    extern __shared__ float logits[];
+    float qk_max = -FLT_MAX;
+
+    for (int64_t base_id = warp_id * GPW; base_id < context_len; base_id += GPT) {
+        int8_t local_k_quant[VEC_SIZE * VEC_LEN];
+        T local_k[VEC_SIZE * VEC_LEN];
+        T local_k_scale[VEC_LEN];
+        const int64_t context_id = base_id + group_id;
+        const int64_t mem_context_id = *(b_start_loc + context_id);
+
+        // all thread groups within a warp must be launched together.
+        if (context_id >= context_len){
+            memset(local_k, 0, sizeof(local_k));
+        } else {
+            const int64_t key_offset
+                            = (mem_context_id) * kcache_stride_s
+                            + kv_head_idx * kcache_stride_h
+                            + group_lane_id * VEC_SIZE;
+            #pragma unroll
+            for (int64_t i = 0; i < VEC_LEN; i++) {
+                // copy 128(16 * 8) bits from K to Local K
+                const int64_t key_idx = key_offset + i * THREAD_GROUP_SIZE * VEC_SIZE;
+                copy<sizeof(int8_t) * VEC_SIZE>(&k_cache[key_idx],  &local_k_quant[i * VEC_SIZE]);
+
+                const int64_t key_scale_idx = key_idx >> QUANT_GROUP_SHIFT;
+                local_k_scale[i] = k_scale[key_scale_idx];
+            }
+
+            #pragma unroll
+            for (int64_t i = 0; i < VEC_LEN; i++) {
+                #pragma unroll
+                for (int64_t j = 0; j < VEC_SIZE; j++) {
+                    local_k[i * VEC_SIZE + j]
+                        = local_k_scale[i] * (T)local_k_quant[i * VEC_SIZE + j];
+                }
+            }
+        }
+
+        // Ready for QK Dot
+        const float qk_dot
+            = attn_scale
+            * attn_thread_group_dot<THREAD_GROUP_SIZE, VEC_LEN * VEC_SIZE>(local_q, local_k);
+
+        if (group_lane_id == 0 && context_id < context_len) {
+            logits[context_id] = qk_dot;
+            qk_max = fmaxf(qk_dot, qk_max);
+        }
+    }
+
+    // ------------------------------------------------ //
+    // Step 3. Softmax
+
+    __shared__ float red_smem[WPT];
+
+    qk_max = attn_block_reduce_max<WPT>(qk_max, red_smem);
+
+    float exp_sum = 0.0f;
+    for (int64_t context_id = threadIdx.x; context_id < context_len; context_id += TPB){
+        logits[context_id] -= qk_max;
+        logits[context_id] = exp(logits[context_id]);
+        exp_sum += logits[context_id];
+    }
+
+    static_assert(WPT == 2 || WPT == 4 || WPT == 8 || WPT == 16 || WPT == 32 || WPT == 64);
+    exp_sum = attn_block_reduce_sum<WPT>(exp_sum, red_smem);
+
+    const float inv_sum = __fdividef(1.f, exp_sum + 1e-6f);
+    for (int64_t context_id = threadIdx.x; context_id < context_len; context_id += TPB) {
+        logits[context_id] *= inv_sum;
+    }
+    __syncthreads(); // Must have this.
+
+    // ------------------------------------------------ //
+    // Step 4. Solve logits * V
+
+    int8_t local_v_quant[VEC_SIZE * VEC_LEN];
+    float local_v[VEC_SIZE * VEC_LEN];
+    T local_v_scale[VEC_LEN];
+
+    #pragma unroll
+    for(int32_t i = 0; i < VEC_SIZE * VEC_LEN; i++) {
+        local_v[i] = 0;
+    }
+
+    for (int64_t base_id = warp_id * GPW; base_id < context_len; base_id += GPT) {
+        const int64_t context_id = base_id + group_id;
+        const int64_t mem_context_id = *(b_start_loc + context_id);
+        // all thread groups within a warp must be launched together.
+        if (context_id < context_len){
+            const int64_t value_offset
+                            = (mem_context_id) * vcache_stride_s
+                            + kv_head_idx * vcache_stride_h
+                            + group_lane_id * VEC_SIZE;
+            #pragma unroll
+            for (int64_t i = 0; i < VEC_LEN; i++) {
+                // copy 128(16 * 8) bits from V to Local V
+                const int64_t value_idx = value_offset + i * THREAD_GROUP_SIZE * VEC_SIZE;
+                copy<sizeof(int8_t) * VEC_SIZE>(&v_cache[value_idx],  &local_v_quant[i * VEC_SIZE]);
+
+                const int64_t value_scale_idx = value_idx >> QUANT_GROUP_SHIFT;
+                local_v_scale[i] = v_scale[value_scale_idx];
+            }
+
+            #pragma unroll
+            for (int64_t i = 0; i < VEC_LEN; i++) {
+                #pragma unroll
+                for (int64_t j = 0; j < VEC_SIZE; j++) {
+                    local_v[i * VEC_SIZE + j] += (tofloat(local_v_scale[i])
+                                                * (float)local_v_quant[i * VEC_SIZE + j]
+                                                * logits[context_id]);
+                }
+            }
+        }
+    }
+
+    #pragma unroll
+    for (int32_t i = 0; i < VEC_SIZE * VEC_LEN; i++) {
+        #pragma unroll
+        for (int32_t mask = THREAD_GROUP_SIZE; mask <= WARP_SIZE >> 1; mask = mask << 1) {
+            local_v[i] += __shfl_xor_sync(uint32_t(-1), local_v[i], mask);
+        }
+    }
+
+    __syncthreads();
+
+    // do some reuse
+    for (int64_t i = threadIdx.x; i < HEAD_SIZE; i += TPB){
+        logits[i] = 0;
+    }
+
+    __syncthreads();
+
+    if (warp_lane_id < THREAD_GROUP_SIZE) {
+        #pragma unroll
+        for (int32_t i = 0; i < VEC_LEN; i++) {
+            #pragma unroll
+            for (int32_t j = 0; j < VEC_SIZE; j++) {
+                atomicAdd(
+                    logits + i * THREAD_GROUP_SIZE * VEC_SIZE + warp_lane_id * VEC_SIZE + j,
+                    local_v[i * VEC_SIZE + j]
+                );
+            }
+        }
+    }
+
+    __syncthreads();
+
+    for (int64_t i = threadIdx.x; i < HEAD_SIZE; i += TPB){
+        output[batch_idx * output_stride_s + head_idx * output_stride_h + i] = logits[i];
+    }
+}
+
+
+// Host-side launcher: one 256-thread block per (query head, batch entry); the
+// kernel instantiation is chosen from head_dim. Unsupported head_dim values or
+// oversized shared-memory requests raise via TORCH_CHECK (also in NDEBUG builds).
+template<typename T>
+void run_group_int8kv_decode_attention_kernel(
+    T* __restrict__ output,
+    const T* __restrict__ query,
+    const int8_t* k_cache,
+    const T* k_scale,
+    const int8_t* v_cache,
+    const T* v_scale,
+    const float attn_scale,
+    const int64_t output_stride_s, const int64_t output_stride_h,
+    const int64_t query_stride_s, const int64_t query_stride_h,
+    const int64_t kcache_stride_s, const int64_t kcache_stride_h,
+    const int64_t vcache_stride_s, const int64_t vcache_stride_h,
+    const int32_t * __restrict__ b_seq_len,
+    const int32_t * __restrict__ b_req_idx,
+    const int32_t * __restrict__ req_to_tokens,
+    const int64_t req_to_tokens_stride,
+    const int64_t max_len_in_batch,
+    const int64_t batch_size,
+    const int64_t q_head_num,
+    const int64_t head_dim,
+    const int64_t gqa_group_size) {
+
+    constexpr int64_t WARP_SIZE = 32;
+    constexpr int64_t TPB = 256;
+    constexpr int64_t MAX_SHM_SIZE = 48 * 1024;
+
+    // Budget for the kernel's static reduction scratch: one float per warp.
+    constexpr int64_t reduce_shm_size = TPB / WARP_SIZE * sizeof(float);
+    // Dynamic shared memory for `logits`; the kernel reuses it for the head_dim outputs.
+    const int64_t logits_size = max(max_len_in_batch * sizeof(float), head_dim * sizeof(float));
+    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+    TORCH_CHECK(reduce_shm_size + logits_size <= MAX_SHM_SIZE,
+        "group_int8kv_decode_attention: max_len_in_batch=", max_len_in_batch,
+        " with head_dim=", head_dim, " needs more than 48 KiB of shared memory");
+
+    const dim3 grid_size = {(unsigned int)q_head_num, (unsigned int)batch_size, 1};
+    switch (head_dim){
+        case 64:
+            dynamic_batching_decoding_cache_attention_fp16_kernel<64, 4, 256, 8>
+            <<<grid_size, 256, logits_size, stream>>>
+            (
+                output, query, k_cache, k_scale, v_cache, v_scale,
+                attn_scale,
+                output_stride_s, output_stride_h,
+                query_stride_s, query_stride_h,
+                kcache_stride_s, kcache_stride_h,
+                vcache_stride_s, vcache_stride_h,
+                b_seq_len, b_req_idx, req_to_tokens,
+                req_to_tokens_stride,
+                max_len_in_batch,
+                gqa_group_size
+            );
+            break;
+        case 96:
+            dynamic_batching_decoding_cache_attention_fp16_kernel<96, 4, 256, 8>
+            <<<grid_size, 256, logits_size, stream>>>
+            (
+                output, query, k_cache, k_scale, v_cache, v_scale,
+                attn_scale,
+                output_stride_s, output_stride_h,
+                query_stride_s, query_stride_h,
+                kcache_stride_s, kcache_stride_h,
+                vcache_stride_s, vcache_stride_h,
+                b_seq_len, b_req_idx, req_to_tokens,
+                req_to_tokens_stride,
+                max_len_in_batch,
+                gqa_group_size
+            );
+            break;
+        case 128:
+            dynamic_batching_decoding_cache_attention_fp16_kernel<128, 8, 256, 8>
+            <<<grid_size, 256, logits_size, stream>>>
+            (
+                output, query, k_cache, k_scale, v_cache, v_scale,
+                attn_scale,
+                output_stride_s, output_stride_h,
+                query_stride_s, query_stride_h,
+                kcache_stride_s, kcache_stride_h,
+                vcache_stride_s, vcache_stride_h,
+                b_seq_len, b_req_idx, req_to_tokens,
+                req_to_tokens_stride,
+                max_len_in_batch,
+                gqa_group_size
+            );
+            break;
+        case 256:
+            dynamic_batching_decoding_cache_attention_fp16_kernel<256, 16, 256, 8>
+            <<<grid_size, 256, logits_size, stream>>>
+            (
+                output, query, k_cache, k_scale, v_cache, v_scale,
+                attn_scale,
+                output_stride_s, output_stride_h,
+                query_stride_s, query_stride_h,
+                kcache_stride_s, kcache_stride_h,
+                vcache_stride_s, vcache_stride_h,
+                b_seq_len, b_req_idx, req_to_tokens,
+                req_to_tokens_stride,
+                max_len_in_batch,
+                gqa_group_size
+            );
+            break;
+        default:
+            TORCH_CHECK(false, "group_int8kv_decode_attention: unsupported head_dim ", head_dim, " (expected 64, 96, 128 or 256)");
+    }
+}
+
+// Decode attention over an int8-quantized KV cache; the result is written into `o`.
+// Derives the GQA group size and the 1/sqrt(head_dim) scale from the tensor shapes,
+// then dispatches the launcher on q's scalar type via LIGHT_DISPATCH_FLOATING_TYPES.
+void group_int8kv_decode_attention(
+    at::Tensor o, at::Tensor q, at::Tensor k, at::Tensor k_s, at::Tensor v, at::Tensor v_s,
+    at::Tensor req_to_tokens, at::Tensor b_req_idx, at::Tensor b_seq_len, int max_len_in_batch) {
+    int64_t batch_size = b_seq_len.sizes()[0];
+    int64_t head_num = q.sizes()[1];
+    int64_t head_dim = q.sizes()[2]; // q shape [batchsize, head_num, head_dim]
+    int64_t kv_head_num = k.sizes()[1];
+    // Unlike assert(), TORCH_CHECK survives NDEBUG; it also guards the modulo/division below.
+    TORCH_CHECK(kv_head_num > 0 && head_num % kv_head_num == 0,
+        "group_int8kv_decode_attention: head_num (", head_num,
+        ") must be divisible by a non-zero kv_head_num (", kv_head_num, ")");
+    int64_t gqa_group_size = head_num / kv_head_num;
+    float att_scale = 1.0 / std::sqrt(head_dim);
+    LIGHT_DISPATCH_FLOATING_TYPES(q.scalar_type(), "group_int8kv_decode_attention", ([&]{
+            run_group_int8kv_decode_attention_kernel<scalar_t>(
+                o.data_ptr<scalar_t>(), q.data_ptr<scalar_t>(),
+                k.data_ptr<int8_t>(), k_s.data_ptr<scalar_t>(),
+                v.data_ptr<int8_t>(), v_s.data_ptr<scalar_t>(),
+                att_scale,
+                o.stride(0), o.stride(1),
+                q.stride(0), q.stride(1),
+                k.stride(0), k.stride(1),
+                v.stride(0), v.stride(1),
+                b_seq_len.data_ptr<int32_t>(), b_req_idx.data_ptr<int32_t>(),
+                req_to_tokens.data_ptr<int32_t>(),
+                req_to_tokens.stride(0),
+                max_len_in_batch,
+                batch_size, head_num, head_dim, gqa_group_size
+            );
+        }
+    ));
+}
+
+// Overload taking max_len_in_batch as int64_t; it narrows the value to int and
+// forwards every argument to the at::Tensor/int overload.
+void group_int8kv_decode_attention(
+    torch::Tensor o,
+    torch::Tensor q,
+    torch::Tensor k,
+    torch::Tensor k_s,
+    torch::Tensor v,
+    torch::Tensor v_s,
+    torch::Tensor req_to_tokens,
+    torch::Tensor b_req_idx,
+    torch::Tensor b_seq_len,
+    int64_t max_len_in_batch)
+{
+    // An int argument selects the `int max_len_in_batch` overload rather than
+    // recursing into this one.
+    const int max_len = static_cast<int>(max_len_in_batch);
+    group_int8kv_decode_attention(
+        o, q,
+        k, k_s,
+        v, v_s,
+        req_to_tokens, b_req_idx, b_seq_len,
+        max_len
+    );
+}
+
+
+}
+}
\ No newline at end of file
diff --git a/lightllm-kernel/csrc/ops_bindings.cpp b/lightllm-kernel/csrc/ops_bindings.cpp
index 308a9466d..d54f29858 100644
--- a/lightllm-kernel/csrc/ops_bindings.cpp
+++ b/lightllm-kernel/csrc/ops_bindings.cpp
@@ -16,7 +16,8 @@ PYBIND11_MODULE(_C, m) {
     m.def("cutlass_scaled_mm", &cutlass_scaled_mm, "CUTLASS SCALED MM (CUDA)");
     m.def("all_gather", &all_gather, "ALL GATHER (CUDA)");
     m.def("meta_size", &lightllm::ops::meta_size, "Size (in bytes) of vllm::Signal metadata");
-    m.def("group8_int8kv_flashdecoding_stage1", &group_int8kv_flashdecoding_attention, "decode attention");
+    m.def("group8_int8kv_flashdecoding_stage1", &group_int8kv_flashdecoding_attention, "int8kv flashdecoding attention");
+    m.def("group_int8kv_decode_attention", &group_int8kv_decode_attention, "int8kv decode attention");
 }
 
 } // namespace ops
diff --git a/lightllm-kernel/include/ops_common.h b/lightllm-kernel/include/ops_common.h
index c5d6c50ec..9d0380ab8 100644
--- a/lightllm-kernel/include/ops_common.h
+++ b/lightllm-kernel/include/ops_common.h
@@ -88,5 +88,17 @@ void group_int8kv_flashdecoding_attention(
     Tensor b_seq_len, 
     int64_t max_len_in_batch);
 
+void group_int8kv_decode_attention(
+    Tensor o, 
+    Tensor q, 
+    Tensor k, 
+    Tensor k_s,  
+    Tensor v,  
+    Tensor v_s, 
+    Tensor req_to_tokens, 
+    Tensor b_req_idx, 
+    Tensor b_seq_len, 
+    int64_t max_len_in_batch);
+
 } // namespace ops
 } // namespace lightllm
\ No newline at end of file
diff --git a/lightllm-kernel/lightllm_kernel/ops/__init__.py b/lightllm-kernel/lightllm_kernel/ops/__init__.py
index 03dd40bb5..4e710f418 100644
--- a/lightllm-kernel/lightllm_kernel/ops/__init__.py
+++ b/lightllm-kernel/lightllm_kernel/ops/__init__.py
@@ -72,7 +72,7 @@
 from .quant import per_token_quant_bf16_fp8
 from .gemm import cutlass_scaled_mm_bias_ls
 from .moe import all_gather, grouped_topk
-from .attention import group8_int8kv_flashdecoding_stage1
+from .attention import group8_int8kv_flashdecoding_stage1, group_int8kv_decode_attention
 
 __all__ = [
     "rmsnorm_bf16",
@@ -85,4 +85,5 @@
     "grouped_topk",
     "meta_size",
     "group8_int8kv_flashdecoding_stage1",
+    "group_int8kv_decode_attention",
 ]
diff --git a/lightllm-kernel/lightllm_kernel/ops/attention.py b/lightllm-kernel/lightllm_kernel/ops/attention.py
index 8cff1730e..dc1ba99d5 100644
--- a/lightllm-kernel/lightllm_kernel/ops/attention.py
+++ b/lightllm-kernel/lightllm_kernel/ops/attention.py
@@ -18,7 +18,7 @@ def group8_int8kv_flashdecoding_stage1(
     b_seq_len: torch.Tensor,
     max_len_in_batch: int,
 ) -> None:
-    """Apply rmsnorm on given X, with weight W and eps"""
+
     return _C.group8_int8kv_flashdecoding_stage1(
         seq_block_size,
         mid_o_emb,
@@ -34,3 +34,30 @@ def group8_int8kv_flashdecoding_stage1(
         b_seq_len,
         max_len_in_batch,
     )
+
+
+def group_int8kv_decode_attention(
+    o: torch.Tensor,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    k_s: torch.Tensor,
+    v: torch.Tensor,
+    v_s: torch.Tensor,
+    req_to_tokens: torch.Tensor,
+    b_req_idx: torch.Tensor,
+    b_seq_len: torch.Tensor,
+    max_len_in_batch: int,
+) -> None:
+
+    return _C.group_int8kv_decode_attention(
+        o,
+        q,
+        k,
+        k_s,
+        v,
+        v_s,
+        req_to_tokens,
+        b_req_idx,
+        b_seq_len,
+        max_len_in_batch,
+    )
diff --git a/lightllm/common/quantization/w8a8_quant.py b/lightllm/common/quantization/w8a8_quant.py
index 6f1989325..dbefa4749 100644
--- a/lightllm/common/quantization/w8a8_quant.py
+++ b/lightllm/common/quantization/w8a8_quant.py
@@ -6,7 +6,7 @@
 from lightllm.common.quantization.triton_quant.fp8.fp8act_quant_kernel import per_token_group_quant_fp8
 from lightllm.common.quantization.triton_quant.fp8.fp8w8a8_block_gemm_kernel import w8a8_block_fp8_matmul
 from lightllm.utils.vllm_utils import HAS_VLLM, vllm_ops, cutlass_scaled_mm
-from lightllm.utils.light_utils import HAS_LIGHTLLM_KERNEL
+from lightllm.utils.light_utils import HAS_LIGHTLLM_KERNEL, light_ops
 
 
 class BaseQuantizationMethod(QuantizationMethod):
@@ -93,9 +93,7 @@ def quantize_moe(self, weight):
 
     def apply(self, input_tensor, weights, bias=None, out=None, workspace=None, use_custom_tensor_mananger=True):
         if HAS_LIGHTLLM_KERNEL:
-            from lightllm_kernel.ops import per_token_quant_bf16_fp8
-
-            x_q, x_scale = per_token_quant_bf16_fp8(input_tensor)
+            x_q, x_scale = light_ops.per_token_quant_bf16_fp8(input_tensor)
         else:
             x_q, x_scale = vllm_ops.scaled_fp8_quant(
                 input_tensor, scale=None, scale_ub=None, use_per_token_if_dynamic=True
diff --git a/lightllm/models/llama/triton_kernel/ppl_int8kv_flash_decoding.py b/lightllm/models/llama/triton_kernel/ppl_int8kv_flash_decoding.py
index efcc0fb42..88e39b82f 100644
--- a/lightllm/models/llama/triton_kernel/ppl_int8kv_flash_decoding.py
+++ b/lightllm/models/llama/triton_kernel/ppl_int8kv_flash_decoding.py
@@ -1,4 +1,5 @@
 import torch
+from lightllm.utils.light_utils import HAS_LIGHTLLM_KERNEL, light_ops
 
 
 def token_decode_attention_flash_decoding(
@@ -18,7 +19,6 @@ def token_decode_attention_flash_decoding(
     max_len_in_batch = infer_state.max_len_in_batch
     calcu_shape1 = (batch_size, q_head_num, head_dim)
 
-    from lightllm_ppl_int8kv_flashdecoding_kernel import group8_int8kv_flashdecoding_stage1
     from .flash_decoding_stage2 import flash_decode_stage2
 
     o_tensor = alloc_tensor_func(q.shape, q.dtype, q.device) if out is None else out
@@ -30,7 +30,7 @@ def token_decode_attention_flash_decoding(
         [batch_size, q_head_num, max_len_in_batch // BLOCK_SEQ + 1], dtype=q.dtype, device="cuda"
     )
 
-    group8_int8kv_flashdecoding_stage1(
+    light_ops.group8_int8kv_flashdecoding_stage1(
         BLOCK_SEQ,
         mid_o,
         mid_o_logexpsum,
diff --git a/lightllm/utils/light_utils.py b/lightllm/utils/light_utils.py
index 1aa901af4..944a0fe15 100644
--- a/lightllm/utils/light_utils.py
+++ b/lightllm/utils/light_utils.py
@@ -5,7 +5,7 @@
     # TODO: lightllm_kernel release
     import lightllm_kernel
 
-    light_ops = lightllm_kernel
+    light_ops = getattr(lightllm_kernel, "ops", lightllm_kernel)
     HAS_LIGHTLLM_KERNEL = True
 except:
     light_ops = None

From 51eebc361c28b0c2c4b20bd582adf398b22f9089 Mon Sep 17 00:00:00 2001
From: sangchengmeng <sangchengmeng@mail.ustc.edu.cn>
Date: Thu, 29 May 2025 11:59:57 +0800
Subject: [PATCH 08/14] 0529

---
 lightllm-kernel/lightllm_kernel/ops/__init__.py             | 1 +
 .../models/llama/layer_infer/transformer_layer_infer.py     | 6 ++----
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/lightllm-kernel/lightllm_kernel/ops/__init__.py b/lightllm-kernel/lightllm_kernel/ops/__init__.py
index 4e710f418..46bd1ebfc 100644
--- a/lightllm-kernel/lightllm_kernel/ops/__init__.py
+++ b/lightllm-kernel/lightllm_kernel/ops/__init__.py
@@ -84,6 +84,7 @@
     "cutlass_scaled_mm_bias_ls",
     "grouped_topk",
     "meta_size",
+    "all_gather",
     "group8_int8kv_flashdecoding_stage1",
     "group_int8kv_decode_attention",
 ]
diff --git a/lightllm/models/llama/layer_infer/transformer_layer_infer.py b/lightllm/models/llama/layer_infer/transformer_layer_infer.py
index e6975ebf7..cc7dc0cb8 100755
--- a/lightllm/models/llama/layer_infer/transformer_layer_infer.py
+++ b/lightllm/models/llama/layer_infer/transformer_layer_infer.py
@@ -5,7 +5,7 @@
 import numpy as np
 from typing import Tuple
 from functools import partial
-
+from lightllm.utils.light_utils import HAS_LIGHTLLM_KERNEL, light_ops
 from lightllm.models.llama.layer_weights.transformer_layer_weight import LlamaTransformerLayerWeight
 from lightllm.models.llama.triton_kernel.context_flashattention_nopad import (
     context_attention_fwd,
@@ -539,11 +539,9 @@ def _token_decode_attention_ppl_int8kv(self, q, infer_state: LlamaInferStateInfo
         calcu_shape1 = (batch_size, self.tp_q_head_num_, self.head_dim_)
         o_tensor = self.alloc_tensor(q.shape, q.dtype) if out is None else out
 
-        from lightllm_ppl_kernel import group8_int8kv_decode_attention
-
         # group_int8kv_decode_attention(at::Tensor o, at::Tensor q, at::Tensor k, at::Tensor k_s,  at::Tensor v,
         # at::Tensor v_s, at::Tensor b_loc, at::Tensor b_seq_len, int max_len_in_batch)
-        group8_int8kv_decode_attention(
+        light_ops.group_int8kv_decode_attention(
             o_tensor.view(calcu_shape1),
             q.view(calcu_shape1),
             infer_state.mem_manager.kv_buffer[self.layer_num_][:, 0 : self.tp_k_head_num_, :],

From 7703be13f862a28c0c97d204e4ecfec377210ece Mon Sep 17 00:00:00 2001
From: sangchengmeng <sangchengmeng@mail.ustc.edu.cn>
Date: Thu, 29 May 2025 15:33:22 +0800
Subject: [PATCH 09/14] 0529-add-benchmark

---
 .../bench_quant_per_token_bf16_fp8.py         | 71 +++++++++++++++++
 lightllm-kernel/benchmark/bench_rms_norm.py   | 78 ++++++++++++++++++
 .../benchmark/benchmark_all_gather.py         | 79 +++++++++++++++++++
 .../csrc/{moe => allgather}/all_gather.cu     | 11 ---
 .../csrc/{moe => allgather}/all_gather.cuh    |  0
 .../csrc/{moe => allgather}/all_reduce.cuh    |  0
 lightllm-kernel/csrc/ops_bindings.cpp         | 11 ++-
 lightllm-kernel/include/ops_common.h          | 27 +++++++
 .../lightllm_kernel/ops/__init__.py           | 15 +++-
 .../lightllm_kernel/ops/allgather.py          | 29 +++++++
 lightllm-kernel/lightllm_kernel/ops/moe.py    |  8 --
 11 files changed, 306 insertions(+), 23 deletions(-)
 create mode 100644 lightllm-kernel/benchmark/bench_quant_per_token_bf16_fp8.py
 create mode 100644 lightllm-kernel/benchmark/bench_rms_norm.py
 create mode 100644 lightllm-kernel/benchmark/benchmark_all_gather.py
 rename lightllm-kernel/csrc/{moe => allgather}/all_gather.cu (95%)
 rename lightllm-kernel/csrc/{moe => allgather}/all_gather.cuh (100%)
 rename lightllm-kernel/csrc/{moe => allgather}/all_reduce.cuh (100%)
 create mode 100644 lightllm-kernel/lightllm_kernel/ops/allgather.py

diff --git a/lightllm-kernel/benchmark/bench_quant_per_token_bf16_fp8.py b/lightllm-kernel/benchmark/bench_quant_per_token_bf16_fp8.py
new file mode 100644
index 000000000..cd2eb291f
--- /dev/null
+++ b/lightllm-kernel/benchmark/bench_quant_per_token_bf16_fp8.py
@@ -0,0 +1,71 @@
+import time
+import torch
+import itertools
+from typing import Optional, Tuple
+from vllm import _custom_ops as ops
+from sgl_kernel import sgl_per_token_quant_fp8
+
+try:
+    from lightllm_kernel.ops import per_token_quant_bf16_fp8
+except ImportError:
+    raise ImportError("lightllm-kernel op per_token_quant_bf16_fp8 not found.")
+
+fp8_type_ = torch.float8_e4m3fn
+
+
+def vllm_per_token_quant_fp8(
+    input: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    return ops.scaled_fp8_quant(input, use_per_token_if_dynamic=True)
+
+
+def sglang_per_token_quant_fp8(
+    input: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    scale = torch.zeros(input.size(0), device=input.device, dtype=torch.float32)
+    output = torch.empty_like(input, device=input.device, dtype=fp8_type_)
+    sgl_per_token_quant_fp8(input, output, scale)
+
+    return output, scale
+
+
+def lightllm_per_token_quant_fp8(
+    input: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    return per_token_quant_bf16_fp8(input)
+
+
+def dequantize(q: torch.Tensor, scale: torch.Tensor):
+    return q.to(torch.bfloat16) * scale.view(-1, *((1,) * (q.dim() - 1)))
+
+
+def benchmark(fn, name, inp, iterations=200):
+    for _ in range(20):
+        q, s = fn(inp)
+    torch.cuda.synchronize()
+
+    starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
+    starter.record()
+    for _ in range(iterations):
+        q, s = fn(inp)
+    ender.record()
+    torch.cuda.synchronize()
+    avg_ms = starter.elapsed_time(ender) / iterations
+
+    q, s = fn(inp)
+    recon = dequantize(q, s)
+    err = recon - inp.to(torch.bfloat16)
+    mse = err.pow(2).mean().item()
+    max_err = err.abs().max().item()
+
+    print(f"{name:20s} | latency: {avg_ms:7.3f} ms | MSE: {mse:.3e} | MaxErr: {max_err:.3e}")
+
+
+if __name__ == "__main__":
+    batch, seq_len = 64, 4096
+    device = "cuda"
+    inp = torch.randn(batch, seq_len, device=device, dtype=torch.bfloat16)
+
+    benchmark(vllm_per_token_quant_fp8, "vllm_ops", inp)
+    benchmark(sglang_per_token_quant_fp8, "sgl_kernel", inp)
+    benchmark(lightllm_per_token_quant_fp8, "lightllm_kernel", inp)
diff --git a/lightllm-kernel/benchmark/bench_rms_norm.py b/lightllm-kernel/benchmark/bench_rms_norm.py
new file mode 100644
index 000000000..c591c53cb
--- /dev/null
+++ b/lightllm-kernel/benchmark/bench_rms_norm.py
@@ -0,0 +1,78 @@
+import time
+import torch
+from typing import Optional, Tuple, Union
+
+from vllm import _custom_ops as vllm_ops
+from lightllm_kernel.ops import rmsnorm_bf16 as lightllm_rms_norm
+from lightllm.models.vit.triton_kernel.rms_norm_vit import rms_norm as triton_rms_norm
+
+
+def vllm_rmsnorm(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float = 1e-6,
+    residual: Optional[torch.Tensor] = None,
+):
+    orig_shape = x.shape
+    x = x.view(-1, x.shape[-1])
+    if residual is not None:
+        residual = residual.view(-1, residual.shape[-1])
+
+    if residual is not None:
+        vllm_ops.fused_add_rms_norm(x, residual, weight, eps)
+        output = (x, residual)
+    else:
+        out = torch.empty_like(x)
+        vllm_ops.rms_norm(out, x, weight, eps)
+        output = out
+
+    if isinstance(output, tuple):
+        output = (output[0].view(orig_shape), output[1].view(orig_shape))
+    else:
+        output = output.view(orig_shape)
+    return output
+
+
+def torch_rmsnorm(x: torch.Tensor, w: torch.Tensor, eps: float):
+    mean_sq = x.pow(2).mean(dim=-1, keepdim=True)
+    inv_std = torch.rsqrt(mean_sq + eps)
+    out = x * inv_std * w
+    return out
+
+
+def benchmark(fn, name, x, w, eps, iterations=200):
+    for _ in range(10):
+        _ = fn(x, w, eps)
+    torch.cuda.synchronize()
+
+    starter = torch.cuda.Event(enable_timing=True)
+    ender = torch.cuda.Event(enable_timing=True)
+    starter.record()
+    for _ in range(iterations):
+        _ = fn(x, w, eps)
+    ender.record()
+    torch.cuda.synchronize()
+    latency_ms = starter.elapsed_time(ender) / iterations
+
+    y_ref = torch_rmsnorm(x, w, eps)
+    y_out = fn(x, w, eps)
+    err = y_out - y_ref
+    mse = err.pow(2).mean().item()
+    max_err = err.abs().max().item()
+
+    print(f"{name:20s} | latency: {latency_ms:7.3f} ms | MSE: {mse:.3e} | MaxErr: {max_err:.3e}")
+
+
+if __name__ == "__main__":
+
+    batch, dim = 64, 1024
+    eps = 1e-6
+    device = "cuda"
+
+    x = torch.randn(batch, dim, device=device, dtype=torch.bfloat16)
+    w = torch.randn(dim, device=device, dtype=torch.bfloat16)
+
+    benchmark(torch_rmsnorm, "torch_rmsnorm", x, w, eps)
+    benchmark(lightllm_rms_norm, "lightllm_rms_norm", x, w, eps)
+    benchmark(triton_rms_norm, "triton_rms_norm", x, w, eps)
+    benchmark(vllm_rmsnorm, "vllm_rmsnorm", x, w, eps)
diff --git a/lightllm-kernel/benchmark/benchmark_all_gather.py b/lightllm-kernel/benchmark/benchmark_all_gather.py
new file mode 100644
index 000000000..facd7421a
--- /dev/null
+++ b/lightllm-kernel/benchmark/benchmark_all_gather.py
@@ -0,0 +1,79 @@
+# benchmark_all_gather.py
+import os
+import torch
+import torch.distributed as dist
+from torch.multiprocessing import spawn
+
+# Import the API exposed by the extension
+from lightllm_kernel.ops import (
+    init_custom_gather_ar,
+    all_gather as custom_all_gather,
+    allgather_dispose,
+    meta_size,
+)
+
+
+def run(rank, world):
+    os.environ["MASTER_ADDR"] = "127.0.0.1"
+    os.environ["MASTER_PORT"] = "29500"
+    dist.init_process_group("nccl", rank=rank, world_size=world)
+    torch.cuda.set_device(rank)
+
+    batch, dim = 32, 512
+    dtypes = [torch.float32, torch.float16, torch.bfloat16]
+
+    for dtype in dtypes:
+        local = torch.randn(batch, dim, device=rank, dtype=dtype)
+        fake_ptrs = [0] * world  # 简单占位
+        handle = init_custom_gather_ar(fake_ptrs, local, rank, full_nvlink=False)
+
+        out_custom = torch.empty(world * batch, dim, device=rank, dtype=dtype)
+
+        # 预热
+        for _ in range(10):
+            custom_all_gather(handle, local, out_custom, 0, 0)
+        torch.cuda.synchronize()
+
+        # 计时：自定义
+        start, end = torch.cuda.Event(True), torch.cuda.Event(True)
+        start.record()
+        for _ in range(100):
+            custom_all_gather(handle, local, out_custom, 0, 0)
+        end.record()
+        torch.cuda.synchronize()
+        t_custom = start.elapsed_time(end) / 100  # ms
+
+        # 计时：torch
+        gathered = [torch.empty_like(local) for _ in range(world)]
+        torch.cuda.synchronize()
+        start.record()
+        for _ in range(100):
+            dist.all_gather(gathered, local)
+        end.record()
+        torch.cuda.synchronize()
+        t_torch = start.elapsed_time(end) / 100  # ms
+
+        # 精度对比
+        custom_all_gather(handle, local, out_custom, 0, 0)
+        ref = torch.cat(gathered, dim=0).to(torch.float32)
+        diff = out_custom.to(torch.float32) - ref
+        mse = diff.pow(2).mean().item()
+        maxerr = diff.abs().max().item()
+
+        if rank == 0:
+            print(
+                f"dtype={dtype:<10}  custom {t_custom:7.3f} ms   "
+                f"torch {t_torch:7.3f} ms   "
+                f"MSE {mse:.3e}   MaxErr {maxerr:.3e}"
+            )
+
+        allgather_dispose(handle)
+
+    if rank == 0:
+        print(f"meta_size() 返回 {meta_size()} 字节")
+    dist.destroy_process_group()
+
+
+if __name__ == "__main__":
+    gpus = torch.cuda.device_count()
+    spawn(run, args=(gpus,), nprocs=gpus)
diff --git a/lightllm-kernel/csrc/moe/all_gather.cu b/lightllm-kernel/csrc/allgather/all_gather.cu
similarity index 95%
rename from lightllm-kernel/csrc/moe/all_gather.cu
rename to lightllm-kernel/csrc/allgather/all_gather.cu
index fb6fd5ac9..56e4a863d 100644
--- a/lightllm-kernel/csrc/moe/all_gather.cu
+++ b/lightllm-kernel/csrc/allgather/all_gather.cu
@@ -146,16 +146,5 @@ void allgather_register_graph_buffers(fptr_t _fa,
   fa->register_graph_buffers(bytes, offsets);
 }
 
-
-// torch::Tensor all_gather(
-//     int64_t _fa,
-//     torch::Tensor inp,
-//     torch::Tensor out,
-//     int64_t _reg_buffer,
-//     int64_t reg_buffer_sz_bytes) {
-
-//   all_gather_cuda(_fa, inp, out, _reg_buffer, reg_buffer_sz_bytes);
-//   return out;
-// }
   } // namespace ops
 } // namespace lightllm
\ No newline at end of file
diff --git a/lightllm-kernel/csrc/moe/all_gather.cuh b/lightllm-kernel/csrc/allgather/all_gather.cuh
similarity index 100%
rename from lightllm-kernel/csrc/moe/all_gather.cuh
rename to lightllm-kernel/csrc/allgather/all_gather.cuh
diff --git a/lightllm-kernel/csrc/moe/all_reduce.cuh b/lightllm-kernel/csrc/allgather/all_reduce.cuh
similarity index 100%
rename from lightllm-kernel/csrc/moe/all_reduce.cuh
rename to lightllm-kernel/csrc/allgather/all_reduce.cuh
diff --git a/lightllm-kernel/csrc/ops_bindings.cpp b/lightllm-kernel/csrc/ops_bindings.cpp
index d54f29858..cd4f2504c 100644
--- a/lightllm-kernel/csrc/ops_bindings.cpp
+++ b/lightllm-kernel/csrc/ops_bindings.cpp
@@ -6,7 +6,7 @@ namespace lightllm {
 namespace ops {
 
 PYBIND11_MODULE(_C, m) {
-    m.def("grouped_topk", &grouped_topk,"Grouped Top-K routing (CUDA)");
+    m.def("grouped_topk", &grouped_topk,"GROUPED TOP-K (CUDA)");
     m.def("rmsnorm_align16_bf16", &rmsnorm_align16_bf16, "RMSNORM (CUDA)");
     m.def("pre_tp_norm_bf16", &pre_tp_norm_bf16, "PRE TP NORM (CUDA)");
     m.def("post_tp_norm_bf16", &post_tp_norm_bf16, "POST TP NORM (CUDA)");
@@ -15,9 +15,14 @@ PYBIND11_MODULE(_C, m) {
     m.def("gelu_per_token_quant_bf16_fp8", &gelu_per_token_quant_bf16_fp8, "GELU QUANT FUSED (CUDA)");
     m.def("cutlass_scaled_mm", &cutlass_scaled_mm, "CUTLASS SCALED MM (CUDA)");
     m.def("all_gather", &all_gather, "ALL GATHER (CUDA)");
+    m.def("allgather_dispose", &allgather_dispose, "ALL GATHER DISPOSE (CUDA)");
+    m.def("init_custom_gather_ar", &init_custom_gather_ar, "INIT CUSTOM GATHER AR (CUDA)");
+    m.def("allgather_register_buffer", &allgather_register_buffer, "ALL GATHER REGISTER BUFFER (CUDA)");
+    m.def("allgather_register_graph_buffers", &allgather_register_graph_buffers, "ALL GATHER REGISTER GRAPH BUFFERS (CUDA)");
+    m.def("allgather_get_graph_buffer_ipc_meta", &allgather_get_graph_buffer_ipc_meta, "ALL GATHER GET GRAPH BUFFER IPC META (CUDA)");
     m.def("meta_size", &lightllm::ops::meta_size, "Size (in bytes) of vllm::Signal metadata");
-    m.def("group8_int8kv_flashdecoding_stage1", &group_int8kv_flashdecoding_attention, "int8kv flashdecoding attention");
-    m.def("group_int8kv_decode_attention", &group_int8kv_decode_attention, "int8kv decode attention");
+    m.def("group8_int8kv_flashdecoding_stage1", &group_int8kv_flashdecoding_attention, "INT8KV FLASHDECODING ATTENTION (CUDA)");
+    m.def("group_int8kv_decode_attention", &group_int8kv_decode_attention, "INT8KV DECODE ATTENTION (CUDA)");
 }
 
 } // namespace ops
diff --git a/lightllm-kernel/include/ops_common.h b/lightllm-kernel/include/ops_common.h
index 9d0380ab8..d7d6a454c 100644
--- a/lightllm-kernel/include/ops_common.h
+++ b/lightllm-kernel/include/ops_common.h
@@ -100,5 +100,32 @@ void group_int8kv_decode_attention(
     Tensor b_seq_len, 
     int64_t max_len_in_batch);
 
+int64_t init_custom_gather_ar(
+    const std::vector<int64_t>& fake_ipc_ptrs,
+    torch::Tensor& rank_data,
+    int64_t rank,
+    bool full_nvlink
+);
+
+void allgather_dispose(
+    int64_t _fa
+);
+
+void allgather_register_buffer(
+    int64_t _fa,
+    const std::vector<int64_t>& fake_ipc_ptrs
+);
+
+std::tuple<std::vector<int64_t>, std::vector<int64_t>>
+allgather_get_graph_buffer_ipc_meta(
+    int64_t _fa
+);
+
+void allgather_register_graph_buffers(
+    int64_t _fa,
+    const std::vector<std::vector<int64_t>>& handles,
+    const std::vector<std::vector<int64_t>>& offsets
+);
+
 } // namespace ops
 } // namespace lightllm
\ No newline at end of file
diff --git a/lightllm-kernel/lightllm_kernel/ops/__init__.py b/lightllm-kernel/lightllm_kernel/ops/__init__.py
index 46bd1ebfc..d061bea25 100644
--- a/lightllm-kernel/lightllm_kernel/ops/__init__.py
+++ b/lightllm-kernel/lightllm_kernel/ops/__init__.py
@@ -69,9 +69,17 @@
 # 向外暴露 Python 端接口
 from .fusion import pre_tp_norm_bf16, post_tp_norm_bf16, add_norm_quant_bf16_fp8, gelu_per_token_quant_bf16_fp8
 from .norm import rmsnorm_bf16
+from .allgather import (
+    all_gather,
+    allgather_dispose,
+    init_custom_gather_ar,
+    allgather_register_buffer,
+    allgather_register_graph_buffers,
+    allgather_get_graph_buffer_ipc_meta,
+)
 from .quant import per_token_quant_bf16_fp8
 from .gemm import cutlass_scaled_mm_bias_ls
-from .moe import all_gather, grouped_topk
+from .moe import grouped_topk
 from .attention import group8_int8kv_flashdecoding_stage1, group_int8kv_decode_attention
 
 __all__ = [
@@ -85,6 +93,11 @@
     "grouped_topk",
     "meta_size",
     "all_gather",
+    "allgather_dispose",
+    "init_custom_gather_ar",
+    "allgather_register_buffer",
+    "allgather_get_graph_buffer_ipc_meta",
+    "allgather_register_graph_buffers",
     "group8_int8kv_flashdecoding_stage1",
     "group_int8kv_decode_attention",
 ]
diff --git a/lightllm-kernel/lightllm_kernel/ops/allgather.py b/lightllm-kernel/lightllm_kernel/ops/allgather.py
new file mode 100644
index 000000000..f4d124eb5
--- /dev/null
+++ b/lightllm-kernel/lightllm_kernel/ops/allgather.py
@@ -0,0 +1,29 @@
+import torch
+from typing import Optional, List, Tuple
+from . import _C
+
+
+def all_gather(
+    _fa: int, inp: torch.Tensor, out: torch.Tensor, _reg_buffer: int, reg_buffer_sz_bytes: int
+) -> torch.Tensor:
+    return _C.all_gather(_fa, inp, out, _reg_buffer, reg_buffer_sz_bytes)
+
+
+def init_custom_gather_ar(fake_ipc_ptrs: List[int], rank_data: torch.Tensor, rank: int, full_nvlink: bool) -> int:
+    return _C.init_custom_gather_ar(fake_ipc_ptrs, rank_data, rank, full_nvlink)
+
+
+def allgather_dispose(_fa: int) -> None:
+    _C.allgather_dispose(_fa)
+
+
+def allgather_register_buffer(_fa: int, fake_ipc_ptrs: List[int]) -> None:
+    _C.allgather_register_buffer(_fa, fake_ipc_ptrs)
+
+
+def allgather_get_graph_buffer_ipc_meta(_fa: int) -> Tuple[List[int], List[int]]:
+    return _C.allgather_get_graph_buffer_ipc_meta(_fa)
+
+
+def allgather_register_graph_buffers(_fa: int, handles: List[List[int]], offsets: List[List[int]]) -> None:
+    _C.allgather_register_graph_buffers(_fa, handles, offsets)
diff --git a/lightllm-kernel/lightllm_kernel/ops/moe.py b/lightllm-kernel/lightllm_kernel/ops/moe.py
index 1f4eeeb84..ce02263df 100644
--- a/lightllm-kernel/lightllm_kernel/ops/moe.py
+++ b/lightllm-kernel/lightllm_kernel/ops/moe.py
@@ -3,13 +3,6 @@
 from . import _C
 
 
-def all_gather(
-    _fa: int, inp: torch.Tensor, out: torch.Tensor, _reg_buffer: int, reg_buffer_sz_bytes: int
-) -> torch.Tensor:
-    """Apply rmsnorm on given X, with weight W and eps"""
-    return _C.all_gather(_fa, inp, out, _reg_buffer, reg_buffer_sz_bytes)
-
-
 def grouped_topk(
     topk_weights: torch.Tensor,
     correction_bias: torch.Tensor,
@@ -23,7 +16,6 @@ def grouped_topk(
     scoring_func: str,
     group_scores: torch.Tensor,
 ) -> torch.Tensor:
-    """Apply rmsnorm on given X, with weight W and eps"""
     return _C.grouped_topk(
         topk_weights,
         correction_bias,

From 419dd0236d471642cff6b693a6a08e22f0463775 Mon Sep 17 00:00:00 2001
From: sangchengmeng <sangchengmeng@mail.ustc.edu.cn>
Date: Thu, 29 May 2025 18:23:00 +0800
Subject: [PATCH 10/14] 0529-3

---
 lightllm-kernel/benchmark/bench_tp_norm.py    | 86 +++++++++++++++++++
 .../benchmark/benchmark_all_gather.py         | 79 -----------------
 .../meta_weights/mm_weight/mm_weight.py       | 13 ++-
 lightllm/common/quantization/w8a8_quant.py    | 32 +++++--
 .../layer_infer/transformer_layer_infer.py    | 51 +++--------
 5 files changed, 134 insertions(+), 127 deletions(-)
 create mode 100644 lightllm-kernel/benchmark/bench_tp_norm.py
 delete mode 100644 lightllm-kernel/benchmark/benchmark_all_gather.py

diff --git a/lightllm-kernel/benchmark/bench_tp_norm.py b/lightllm-kernel/benchmark/bench_tp_norm.py
new file mode 100644
index 000000000..53599ebb3
--- /dev/null
+++ b/lightllm-kernel/benchmark/bench_tp_norm.py
@@ -0,0 +1,86 @@
+# bench_tp_norm.py
+import os
+import torch
+import torch.distributed as dist
+from types import SimpleNamespace
+
+from lightllm_kernel.ops import (
+    rmsnorm_bf16,
+    pre_tp_norm_bf16,
+    post_tp_norm_bf16,
+)
+
+
+def init_dist():
+    dist.init_process_group("nccl", init_method="env://")
+    rank = int(os.environ["LOCAL_RANK"])
+    torch.cuda.set_device(rank)
+    return rank, dist.get_world_size()
+
+
+def tp_norm_cuda(x, w, cfg):
+    if cfg.tp_world == 1:
+        return rmsnorm_bf16(x, w, cfg.eps)
+
+    var_local = pre_tp_norm_bf16(x)
+    dist.all_reduce(var_local, op=dist.ReduceOp.SUM)
+    return post_tp_norm_bf16(x, w, var_local, cfg.global_embed, cfg.eps)
+
+
+def tp_norm_ref(x, w, cfg):
+    x32 = x.to(torch.float32)
+    var = x32.pow(2).sum(-1, keepdim=True)
+    if cfg.tp_world > 1:
+        dist.all_reduce(var, op=dist.ReduceOp.SUM)
+    x32 = x32 * torch.rsqrt(var / cfg.global_embed + cfg.eps)
+    return (w.to(torch.float32) * x32).to(x.dtype)
+
+
+def bench(fn, tag, x, w, cfg, iters=200):
+    for _ in range(20):
+        fn(x, w, cfg)
+    torch.cuda.synchronize()
+    t0 = torch.cuda.Event(True)
+    t1 = torch.cuda.Event(True)
+    t0.record()
+    for _ in range(iters):
+        fn(x, w, cfg)
+    t1.record()
+    torch.cuda.synchronize()
+    ms = t0.elapsed_time(t1) / iters
+
+    ref = tp_norm_ref(x, w, cfg).to(torch.float32)
+    out = fn(x, w, cfg).to(torch.float32)
+    mse = (out - ref).pow(2).mean().item()
+    err = (out - ref).abs().max().item()
+
+    if dist.get_rank() == 0:
+        print(f"{tag:18s}| {ms:6.3f} ms | MSE {mse:.3e} | MaxErr {err:.3e}")
+
+
+if __name__ == "__main__":
+    rank, world = init_dist()
+
+    tp_world = world  # follow the launched process group so global_embed matches what all_reduce sums over
+    pad_heads, dim_h = 32, 1024
+    local_embed = pad_heads * dim_h
+    global_embed = local_embed * tp_world
+    tokens = 2048
+    eps = 1e-6
+
+    x = torch.randn(tokens, local_embed, device=f"cuda:{rank}", dtype=torch.bfloat16)
+    w = torch.randn(local_embed, device=f"cuda:{rank}", dtype=torch.bfloat16)
+
+    cfg = SimpleNamespace(tp_world=tp_world, global_embed=global_embed, eps=eps)
+
+    if rank == 0:
+        print(
+            f"tp={tp_world}, tokens={tokens}, local_embed={local_embed}, " f"global_embed={global_embed}, dtype=bf16\n"
+        )
+    dist.barrier()
+
+    bench(tp_norm_ref, "torch_ref", x, w, cfg)
+    bench(tp_norm_cuda, "cuda_kernel", x, w, cfg)
+
+    dist.destroy_process_group()
+# python -m torch.distributed.run --nproc_per_node=4 bench_tp_norm.py
diff --git a/lightllm-kernel/benchmark/benchmark_all_gather.py b/lightllm-kernel/benchmark/benchmark_all_gather.py
deleted file mode 100644
index facd7421a..000000000
--- a/lightllm-kernel/benchmark/benchmark_all_gather.py
+++ /dev/null
@@ -1,79 +0,0 @@
-# benchmark_custom_allgather.py
-import os
-import torch
-import torch.distributed as dist
-from torch.multiprocessing import spawn
-
-# 导入扩展里的 API
-from lightllm_kernel.ops import (
-    init_custom_gather_ar,
-    all_gather as custom_all_gather,
-    allgather_dispose,
-    meta_size,
-)
-
-
-def run(rank, world):
-    os.environ["MASTER_ADDR"] = "127.0.0.1"
-    os.environ["MASTER_PORT"] = "29500"
-    dist.init_process_group("nccl", rank=rank, world_size=world)
-    torch.cuda.set_device(rank)
-
-    batch, dim = 32, 512
-    dtypes = [torch.float32, torch.float16, torch.bfloat16]
-
-    for dtype in dtypes:
-        local = torch.randn(batch, dim, device=rank, dtype=dtype)
-        fake_ptrs = [0] * world  # 简单占位
-        handle = init_custom_gather_ar(fake_ptrs, local, rank, full_nvlink=False)
-
-        out_custom = torch.empty(world * batch, dim, device=rank, dtype=dtype)
-
-        # 预热
-        for _ in range(10):
-            custom_all_gather(handle, local, out_custom, 0, 0)
-        torch.cuda.synchronize()
-
-        # 计时：自定义
-        start, end = torch.cuda.Event(True), torch.cuda.Event(True)
-        start.record()
-        for _ in range(100):
-            custom_all_gather(handle, local, out_custom, 0, 0)
-        end.record()
-        torch.cuda.synchronize()
-        t_custom = start.elapsed_time(end) / 100  # ms
-
-        # 计时：torch
-        gathered = [torch.empty_like(local) for _ in range(world)]
-        torch.cuda.synchronize()
-        start.record()
-        for _ in range(100):
-            dist.all_gather(gathered, local)
-        end.record()
-        torch.cuda.synchronize()
-        t_torch = start.elapsed_time(end) / 100  # ms
-
-        # 精度对比
-        custom_all_gather(handle, local, out_custom, 0, 0)
-        ref = torch.cat(gathered, dim=0).to(torch.float32)
-        diff = out_custom.to(torch.float32) - ref
-        mse = diff.pow(2).mean().item()
-        maxerr = diff.abs().max().item()
-
-        if rank == 0:
-            print(
-                f"dtype={dtype:<10}  custom {t_custom:7.3f} ms   "
-                f"torch {t_torch:7.3f} ms   "
-                f"MSE {mse:.3e}   MaxErr {maxerr:.3e}"
-            )
-
-        allgather_dispose(handle)
-
-    if rank == 0:
-        print(f"meta_size() 返回 {meta_size()} 字节")
-    dist.destroy_process_group()
-
-
-if __name__ == "__main__":
-    gpus = torch.cuda.device_count()
-    spawn(run, args=(gpus,), nprocs=gpus)
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
index 706c328b8..23e458344 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py
@@ -41,11 +41,20 @@ def __init__(
         self.has_bias: bool = None
 
     def mm(
-        self, input_tensor: torch.Tensor, out: Optional[torch.Tensor] = None, use_custom_tensor_mananger: bool = True
+        self,
+        input_tensor: torch.Tensor,
+        out: Optional[torch.Tensor] = None,
+        ls_weight: Optional[torch.Tensor] = None,
+        use_custom_tensor_mananger: bool = True,
     ) -> torch.Tensor:
         if self.quant_method is not None:
             return self.quant_method.apply(
-                input_tensor, self.weight, self.bias, out, use_custom_tensor_mananger=use_custom_tensor_mananger
+                input_tensor,
+                self.weight,
+                self.bias,
+                out,
+                ls_weight=ls_weight,
+                use_custom_tensor_mananger=use_custom_tensor_mananger,
             )
         if out is None:
             shape = (input_tensor.shape[0], self.weight.shape[1])
diff --git a/lightllm/common/quantization/w8a8_quant.py b/lightllm/common/quantization/w8a8_quant.py
index dbefa4749..d00d492d8 100644
--- a/lightllm/common/quantization/w8a8_quant.py
+++ b/lightllm/common/quantization/w8a8_quant.py
@@ -8,6 +8,14 @@
 from lightllm.utils.vllm_utils import HAS_VLLM, vllm_ops, cutlass_scaled_mm
 from lightllm.utils.light_utils import HAS_LIGHTLLM_KERNEL, light_ops
 
+if HAS_LIGHTLLM_KERNEL:
+
+    def scaled_fp8_quant(tensor, *args, **kwargs):
+        return light_ops.per_token_quant_bf16_fp8(tensor)
+
+else:
+    scaled_fp8_quant = vllm_ops.scaled_fp8_quant
+
 
 class BaseQuantizationMethod(QuantizationMethod):
     def __init__(self):
@@ -72,7 +80,7 @@ def __init__(self):
     def quantize(self, weight: torch.Tensor):
         if self.is_moe:
             return self.quantize_moe(weight)
-        qweight, weight_scale = vllm_ops.scaled_fp8_quant(
+        qweight, weight_scale = scaled_fp8_quant(
             weight.contiguous().cuda(self.device_id_), scale=None, use_per_token_if_dynamic=True
         )
         return qweight.transpose(0, 1), weight_scale
@@ -83,7 +91,7 @@ def quantize_moe(self, weight):
         weight_scales = []
         qweights = torch.empty_like(weight, dtype=torch.float8_e4m3fn).cuda(self.device_id_)
         for i in range(num_experts):
-            qweight, weight_scale = vllm_ops.scaled_fp8_quant(
+            qweight, weight_scale = scaled_fp8_quant(
                 weight[i].contiguous().cuda(self.device_id_), scale=None, use_per_token_if_dynamic=False
             )
             qweights[i] = qweight
@@ -91,13 +99,20 @@ def quantize_moe(self, weight):
         weight_scale = torch.cat(weight_scales, dim=0).reshape(-1)
         return qweights, weight_scale
 
-    def apply(self, input_tensor, weights, bias=None, out=None, workspace=None, use_custom_tensor_mananger=True):
+    def apply(
+        self,
+        input_tensor,
+        weights,
+        bias=None,
+        out=None,
+        ls_weight=None,
+        workspace=None,
+        use_custom_tensor_mananger=True,
+    ):
         if HAS_LIGHTLLM_KERNEL:
             x_q, x_scale = light_ops.per_token_quant_bf16_fp8(input_tensor)
         else:
-            x_q, x_scale = vllm_ops.scaled_fp8_quant(
-                input_tensor, scale=None, scale_ub=None, use_per_token_if_dynamic=True
-            )
+            x_q, x_scale = scaled_fp8_quant(input_tensor, scale=None, scale_ub=None, use_per_token_if_dynamic=True)
 
         m = input_tensor.shape[0]
         n = weights[0].shape[1]
@@ -108,7 +123,15 @@ def apply(self, input_tensor, weights, bias=None, out=None, workspace=None, use_
                 )
             else:
                 out = torch.empty((m, n), dtype=input_tensor.dtype, device=input_tensor.device)
-        cutlass_scaled_mm(out, x_q, weights[0], x_scale, weights[1], bias)
+        if ls_weight is not None and HAS_LIGHTLLM_KERNEL:
+            # Fused GEMM + bias + layer-scale kernel; it is only reachable through lightllm_kernel.
+            light_ops.cutlass_scaled_mm_bias_ls(out, x_q, weights[0], x_scale, weights[1], bias, ls_weight)
+        else:
+            cutlass_scaled_mm(out, x_q, weights[0], x_scale, weights[1], bias)
+            if ls_weight is not None:
+                # No fused kernel available: scale in place after the GEMM, as callers used to do
+                # themselves. NOTE(review): assumes the fused kernel computes the same product - confirm.
+                out.mul_(ls_weight)
         return out
 
 
diff --git a/lightllm/models/vit/layer_infer/transformer_layer_infer.py b/lightllm/models/vit/layer_infer/transformer_layer_infer.py
index 5dc5d3d42..f51df0f66 100644
--- a/lightllm/models/vit/layer_infer/transformer_layer_infer.py
+++ b/lightllm/models/vit/layer_infer/transformer_layer_infer.py
@@ -14,30 +14,10 @@
 from lightllm.models.vit.triton_kernel.gelu_vit import gelu_fwd
 from lightllm.models.vit.triton_kernel.rms_norm_vit import rms_norm
 from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
+from lightllm.utils.light_utils import HAS_LIGHTLLM_KERNEL, light_ops
 
 
 class ViTTransformerLayerInfer:
-    """ """
-    # 类变量缓存导入的算子
-    _lightllm_kernels = None
-    
-    @classmethod
-    def _init_kernels(cls):
-        if cls._lightllm_kernels is None:
-            cls._lightllm_kernels = {}
-            try:
-                from lightllm_kernel.ops import (
-                    rmsnorm_bf16,
-                    pre_tp_norm_bf16, post_tp_norm_bf16
-                )
-                cls._lightllm_kernels.update({
-                    'rmsnorm_bf16': rmsnorm_bf16,
-                    'pre_tp_norm_bf16': pre_tp_norm_bf16,
-                    'post_tp_norm_bf16': post_tp_norm_bf16,
-                })
-            except ImportError as e:
-                print(f"Warning: Failed to load lightllm_kernel.ops: {e}")
-
     def __init__(self, layer_num, network_config, mode=[]):
         self.tp_rank_ = get_current_rank_in_dp()
         self.tp_world_size_ = get_dp_world_size()
@@ -52,9 +32,6 @@ def __init__(self, layer_num, network_config, mode=[]):
         self.network_config_ = network_config
         self.mode = mode
         self.layer_num_ = layer_num
-        self.use_lightllm_kernels = os.getenv("ENABLE_LIGHTLLM_KERNELS", "0").upper() in ["ON", "TRUE", "1"]
-        if self.use_lightllm_kernels:
-             self.__class__._init_kernels()  # 确保算子已初始化
         return
 
     def norm(self, input, weight):
@@ -68,15 +45,13 @@ def norm(self, input, weight):
         out = out.reshape(input_shape)
         return out
 
-    def tp_norm_optim(self, input, weight):
+    def tp_norm_cuda(self, input, weight):
         if self.tp_world_size_ == 1:
-            out =  self._lightllm_kernels['rmsnorm_bf16'](input, weight, self.eps_)
+            out = light_ops.rmsnorm_bf16(input, weight, self.eps_)
         else:
-            tp_variance = self._lightllm_kernels['pre_tp_norm_bf16'](input)
+            tp_variance = light_ops.pre_tp_norm_bf16(input)
             dist.all_reduce(tp_variance, op=dist.ReduceOp.SUM, async_op=False)
-            out =  self._lightllm_kernels['post_tp_norm_bf16'](
-                input, weight, tp_variance, self.embed_dim_, self.eps_
-            )
+            out = light_ops.post_tp_norm_bf16(input, weight, tp_variance, self.embed_dim_, self.eps_)
         return out
 
     def tp_norm(self, input, weight):
@@ -123,9 +98,9 @@ def _ffn_norm(self, input, layer_weight: ViTTransformerLayerWeight) -> torch.Ten
             )
 
     def _qk_norm(self, q, k, layer_weight: ViTTransformerLayerWeight) -> torch.Tensor:
-        if self.use_lightllm_kernels:
-            q_norm = self.tp_norm_optim(q, layer_weight.q_norm_weight_.weight)
-            k_norm = self.tp_norm_optim(k, layer_weight.k_norm_weight_.weight)
+        if HAS_LIGHTLLM_KERNEL:
+            q_norm = self.tp_norm_cuda(q, layer_weight.q_norm_weight_.weight)
+            k_norm = self.tp_norm_cuda(k, layer_weight.k_norm_weight_.weight)
         else:
             q_norm = self.tp_norm(q, layer_weight.q_norm_weight_.weight)
             k_norm = self.tp_norm(k, layer_weight.k_norm_weight_.weight)
@@ -150,10 +125,10 @@ def _get_o(self, input, layer_weight: ViTTransformerLayerWeight) -> torch.Tensor
         batch_size = input.shape[0]
         seq_len = input.shape[1]
         o_tensor = layer_weight.o_proj.mm(
-            input.view(-1, self.tp_padding_head_num * self.head_dim_), use_custom_tensor_mananger=True
+            input.view(-1, self.tp_padding_head_num * self.head_dim_),
+            ls_weight=layer_weight.ls1,
+            use_custom_tensor_mananger=True,
         )
-        if layer_weight.use_ls:
-            o_tensor.mul_(layer_weight.ls1)
         return o_tensor.reshape((batch_size, seq_len, -1))
 
     def _ffn(self, input, layer_weight: ViTTransformerLayerWeight) -> torch.Tensor:
@@ -161,10 +136,8 @@ def _ffn(self, input, layer_weight: ViTTransformerLayerWeight) -> torch.Tensor:
         input_shape = input.shape
         input = None
         ffn1_out = gelu_fwd(fc1, use_custom_tensor_mananger=True)
-        ffn2_out = layer_weight.ffn_2_proj_.mm(ffn1_out, use_custom_tensor_mananger=True)
+        ffn2_out = layer_weight.ffn_2_proj_.mm(ffn1_out, ls_weight=layer_weight.ls2, use_custom_tensor_mananger=True)
         ffn1_out = None
-        if layer_weight.use_ls:
-            ffn2_out.mul_(layer_weight.ls2)
         return ffn2_out.reshape(input_shape)
 
     def _context_attention(self, input_embding, layer_weight):

From 597dc1aeeefbe72c448567d332572722cd6c28df Mon Sep 17 00:00:00 2001
From: sangchengmeng <sangchengmeng@mail.ustc.edu.cn>
Date: Mon, 9 Jun 2025 15:14:53 +0800
Subject: [PATCH 11/14] add pyproject.toml

---
 lightllm-kernel/CMakeLists.txt |  61 +++++-----
 lightllm-kernel/LICENSE        | 202 +++++++++++++++++++++++++++++++++
 lightllm-kernel/pyproject.toml |  70 ++++++++++++
 3 files changed, 301 insertions(+), 32 deletions(-)
 create mode 100644 lightllm-kernel/LICENSE
 create mode 100644 lightllm-kernel/pyproject.toml

diff --git a/lightllm-kernel/CMakeLists.txt b/lightllm-kernel/CMakeLists.txt
index de01c346a..25a9855b6 100644
--- a/lightllm-kernel/CMakeLists.txt
+++ b/lightllm-kernel/CMakeLists.txt
@@ -16,44 +16,41 @@ file(GLOB_RECURSE SRC_CPP   CONFIGURE_DEPENDS "${PROJECT_SOURCE_DIR}/csrc/*.cpp"
 file(GLOB_RECURSE SRC_CUDA  CONFIGURE_DEPENDS "${PROJECT_SOURCE_DIR}/csrc/*.cu")
 
 # 编译生成 Python 扩展， _C.so
-add_library(_C SHARED ${SRC_CPP} ${SRC_CUDA})
+if (NOT TARGET _C)
+  add_library(_C SHARED ${SRC_CPP} ${SRC_CUDA})
 
-# C++17 更方便调度宏
-target_compile_features(_C PRIVATE cxx_std_17)
-target_include_directories(_C PRIVATE ${TORCH_INCLUDE_DIRS})
-target_link_libraries(_C
-    PRIVATE
-      ${TORCH_LIBRARIES}
-      Python::Python
-      CUDA::cudart
-      CUDA::cuda_driver)
-
-      
-# 输出文件名 _C.so，无前缀
-set_target_properties(_C PROPERTIES
-    PREFIX ""
-    OUTPUT_NAME "_C"
-    BUILD_RPATH "\$ORIGIN;\$ORIGIN/../torch/lib"
-    INSTALL_RPATH "\$ORIGIN;\$ORIGIN/../torch/lib"
-)
+  # C++17 更方便调度宏
+  target_compile_features(_C PRIVATE cxx_std_17)
+  target_include_directories(_C PRIVATE 
+    ${TORCH_INCLUDE_DIRS}
+    ${CUDAToolkit_INCLUDE_DIRS}
+    ${PROJECT_SOURCE_DIR}/include
+    ${PROJECT_SOURCE_DIR}/csrc
+    ${PROJECT_SOURCE_DIR}/../third-party/cutlass/include
+  )
+  target_link_libraries(_C
+      PRIVATE
+        ${TORCH_LIBRARIES}
+        Python::Python
+        CUDA::cudart
+        CUDA::cuda_driver)
 
+        
+  # 输出文件名 _C.so，无前缀
+  set_target_properties(_C PROPERTIES
+      PREFIX ""
+      OUTPUT_NAME "_C"
+      BUILD_RPATH "\$ORIGIN;\$ORIGIN/../torch/lib"
+      INSTALL_RPATH "\$ORIGIN;\$ORIGIN/../torch/lib"
+  )
+endif()
 # 安装：把 _C.so、Python 包和 csrc 一起拷到 site-packages
 include(GNUInstallDirs)
 
 # 1) 计算 Python site-packages 路径
-execute_process(
-  COMMAND ${Python_EXECUTABLE} - <<EOF
-import sysconfig, json
-print(json.dumps({
-  "arch": sysconfig.get_path("platlib"),
-  "pure": sysconfig.get_path("purelib")
-}))
-EOF
-  OUTPUT_VARIABLE _py_paths
-  OUTPUT_STRIP_TRAILING_WHITESPACE
-)
-string(JSON Python_SITEARCH GET "${_py_paths}" arch)
-string(JSON Python_SITELIB  GET "${_py_paths}" pure)
+
+message(STATUS "Installing to ARCH = ${Python_SITEARCH}")
+message(STATUS "Installing to PURE = ${Python_SITELIB}")
 
 # 2) 安装编译好的 _C.so 到 lightllm_kernel 目录
 install(TARGETS _C
diff --git a/lightllm-kernel/LICENSE b/lightllm-kernel/LICENSE
new file mode 100644
index 000000000..7a4a3ea24
--- /dev/null
+++ b/lightllm-kernel/LICENSE
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
\ No newline at end of file
diff --git a/lightllm-kernel/pyproject.toml b/lightllm-kernel/pyproject.toml
new file mode 100644
index 000000000..a7be590da
--- /dev/null
+++ b/lightllm-kernel/pyproject.toml
@@ -0,0 +1,70 @@
+[build-system]
+requires = [
+  "scikit-build-core>=0.10",
+  "cmake>=3.22",
+  "ninja",
+  "torch>=2.6.0",
+  "wheel",
+]
+build-backend = "scikit_build_core.build"
+
+[project]
+name = "lightllm-kernel"
+version = "0.1.0"
+description = "CUDA kernel library for LightLLM"
+readme = "README.md"
+requires-python = ">=3.9"
+license = { text = "Apache-2.0" }
+keywords = ["cuda", "lightllm"]
+classifiers = [
+  "Programming Language :: Python :: 3",
+  "Programming Language :: Python :: 3 :: Only",
+  "License :: OSI Approved :: Apache Software License",
+  "Environment :: GPU :: NVIDIA CUDA",
+  "Operating System :: POSIX :: Linux",
+]
+dependencies = [
+  "pybind11>=2.11",
+]
+
+[project.optional-dependencies]
+dev = [
+  "black",
+  "ruff",
+  "pre-commit",
+]
+test = [
+  "pytest",
+  "pytest-cov",
+]
+
+[project.urls]
+Homepage = "https://github.com/ModelTC/lightllm/tree/main/lightllm-kernel"
+Source = "https://github.com/ModelTC/lightllm/tree/main/lightllm-kernel"
+Issues = "https://github.com/ModelTC/lightllm/issues"
+
+[tool.wheel]
+
+exclude = ["dist*", "tests*"]
+
+[tool.scikit-build]
+cmake.minimum-version = "3.22"
+cmake.build-type = "Release"
+cmake.verbose = true
+
+cmake.args = ["-DCMAKE_CUDA_ARCHITECTURES=80;86;89;90"]
+
+wheel.py-api = "cp39"
+wheel.packages = ["lightllm_kernel"]
+wheel.license-files = ["LICENSE"]
+
+sdist.include = [
+  "CMakeLists.txt",
+  "csrc/**/*",
+  "third-party/cutlass/**/*",
+  "lightllm_kernel/**/*",
+  "LICENSE",
+  "README.md",
+]
+
+install.components = ["Python"]

From e54052088cc31525f1f6355c822895b6f77f6324 Mon Sep 17 00:00:00 2001
From: wangzaijun <wzjhelloworld@qq.com>
Date: Mon, 9 Jun 2025 15:41:19 +0800
Subject: [PATCH 12/14] 0609

---
 lightllm-kernel/lightllm_kernel/ops/fusion.py | 20 +++--
 lightllm-kernel/lightllm_kernel/ops/gemm.py   | 14 +++-
 lightllm-kernel/lightllm_kernel/ops/norm.py   |  4 +-
 lightllm-kernel/lightllm_kernel/ops/quant.py  |  3 +-
 .../test/fusion/add_norm_quant_test.py        | 82 ++++++++++++-------
 .../test/fusion/gelu_per_token_quant_test.py  | 18 ++--
 .../test/fusion/post_tp_norm_test.py          |  6 +-
 .../test/fusion/pre_tp_norm_test.py           |  8 +-
 .../test/gemm/cutlass_scaled_mm_test.py       | 50 ++++++++---
 lightllm-kernel/test/norm/rmsnorm_test.py     |  9 +-
 lightllm-kernel/test/quant/quant_test.py      | 10 ++-
 lightllm-kernel/test/utils.py                 | 28 ++++---
 12 files changed, 170 insertions(+), 82 deletions(-)

diff --git a/lightllm-kernel/lightllm_kernel/ops/fusion.py b/lightllm-kernel/lightllm_kernel/ops/fusion.py
index 6f3c8243b..a9131420f 100644
--- a/lightllm-kernel/lightllm_kernel/ops/fusion.py
+++ b/lightllm-kernel/lightllm_kernel/ops/fusion.py
@@ -2,20 +2,28 @@
 from typing import Optional, Tuple
 from . import _C
 
+
 def pre_tp_norm_bf16(input: torch.Tensor) -> torch.Tensor:
-    """ Calculate powersum along embedding dimension of the input """
+    """Calculate powersum along embedding dimension of the input"""
     return _C.pre_tp_norm_bf16(input)
 
-def post_tp_norm_bf16(input: torch.tensor, weight: torch.Tensor, tp_variance: torch.Tensor, embed_dim: int, eps: float) -> torch.Tensor:
-    """ Apply rmsnorm on given input, with weight and pre calculated powersum """
+
+def post_tp_norm_bf16(
+    input: torch.Tensor, weight: torch.Tensor, tp_variance: torch.Tensor, embed_dim: int, eps: float
+) -> torch.Tensor:
+    """Apply rmsnorm on given input, with weight and pre calculated powersum"""
     return _C.post_tp_norm_bf16(input, weight, tp_variance, embed_dim, eps)
 
-def add_norm_quant_bf16_fp8(input: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, eps: float) -> Tuple[torch.Tensor, torch.Tensor]:
-    """ Apply add_norm_quant on given input, with residual and weight """
+
+def add_norm_quant_bf16_fp8(
+    input: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, eps: float
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Apply add_norm_quant on given input, with residual and weight"""
     return _C.add_norm_quant_bf16_fp8(input, residual, weight, eps)
 
+
 def gelu_per_token_quant_bf16_fp8(input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-    """ Apply gelu on given input and quantize it from bf16 to fp8 using per token quant method """
+    """Apply gelu on given input and quantize it from bf16 to fp8 using per token quant method"""
     output = torch.empty_like(input, dtype=torch.float8_e4m3fn)
     scales = torch.empty(size=(input.shape[0], 1), device=input.device, dtype=torch.float32)
     _C.gelu_per_token_quant_bf16_fp8(output, input, scales)
diff --git a/lightllm-kernel/lightllm_kernel/ops/gemm.py b/lightllm-kernel/lightllm_kernel/ops/gemm.py
index 0fb569cfc..a3d3dfd4f 100644
--- a/lightllm-kernel/lightllm_kernel/ops/gemm.py
+++ b/lightllm-kernel/lightllm_kernel/ops/gemm.py
@@ -2,7 +2,15 @@
 from typing import Optional
 from . import _C
 
-def cutlass_scaled_mm_bias_ls(c: torch.Tensor, a: torch.Tensor, b: torch.Tensor,
-                      a_scales: torch.Tensor, b_scales: torch.Tensor, bias: Optional[torch.Tensor], ls: Optional[torch.Tensor]) -> None :
-    """ Apply scaled mm on the given input, with optional bias and ls weight """
+
+def cutlass_scaled_mm_bias_ls(
+    c: torch.Tensor,
+    a: torch.Tensor,
+    b: torch.Tensor,
+    a_scales: torch.Tensor,
+    b_scales: torch.Tensor,
+    bias: Optional[torch.Tensor],
+    ls: Optional[torch.Tensor],
+) -> None:
+    """Apply scaled mm on the given input, with optional bias and ls weight"""
     return _C.cutlass_scaled_mm(c, a, b, a_scales, b_scales, bias, ls)
diff --git a/lightllm-kernel/lightllm_kernel/ops/norm.py b/lightllm-kernel/lightllm_kernel/ops/norm.py
index d60013f42..8974308e5 100644
--- a/lightllm-kernel/lightllm_kernel/ops/norm.py
+++ b/lightllm-kernel/lightllm_kernel/ops/norm.py
@@ -2,6 +2,6 @@
 from typing import Optional
 from . import _C
 
-def rmsnorm_bf16(X: torch.Tensor, W: torch.Tensor, eps: float=1e-12) -> torch.Tensor:
-    """ Apply rmsnorm on given X, with weight W and eps """
+
+def rmsnorm_bf16(X: torch.Tensor, W: torch.Tensor, eps: float = 1e-12) -> torch.Tensor:
     return _C.rmsnorm_align16_bf16(X, W, eps)
diff --git a/lightllm-kernel/lightllm_kernel/ops/quant.py b/lightllm-kernel/lightllm_kernel/ops/quant.py
index 8889f11b1..05634f2f0 100644
--- a/lightllm-kernel/lightllm_kernel/ops/quant.py
+++ b/lightllm-kernel/lightllm_kernel/ops/quant.py
@@ -2,8 +2,9 @@
 from typing import Optional, Tuple
 from . import _C
 
+
 def per_token_quant_bf16_fp8(input: torch.tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-    """ Quantize the given input using per token quant method """
+    """Quantize the given input using per token quant method"""
     output = torch.empty_like(input, dtype=torch.float8_e4m3fn)
     scales = torch.empty(size=(input.shape[0], 1), device=input.device, dtype=torch.float32)
     _C.per_token_quant_bf16_fp8(output, input, scales)
diff --git a/lightllm-kernel/test/fusion/add_norm_quant_test.py b/lightllm-kernel/test/fusion/add_norm_quant_test.py
index a04d2b9f3..1af329677 100755
--- a/lightllm-kernel/test/fusion/add_norm_quant_test.py
+++ b/lightllm-kernel/test/fusion/add_norm_quant_test.py
@@ -10,12 +10,13 @@ def torch_add_norm_quant_bf16_fp8(X, R, W, eps=1e-6):
     # 1. Add residual
     X = X.add_(R)
     # 2. rmsnorm
-    normalized = torch.nn.functional.rms_norm(X, (N, ), W, eps=eps)
+    normalized = torch.nn.functional.rms_norm(X, (N,), W, eps=eps)
     # 3. per token quant
     quantized, scales = ops.scaled_fp8_quant(normalized, scale=None, use_per_token_if_dynamic=True)
 
     return quantized, scales
 
+
 class TestFusedAddNormQuantBF16(unittest.TestCase):
     def setUp(self):
         """Set up common test parameters."""
@@ -31,40 +32,65 @@ def test_accuracy(self):
         for batch in self.batchs:
             for seqLen in self.seqLens:
                 for embed_dim in self.embed_dims:
-                        with self.subTest(shape=[batch, seqLen, embed_dim]):
-                            X1 = torch.rand(size=[batch, seqLen, embed_dim], device=self.device, dtype=self.dtype) - 0.5
-                            X2 = X1.clone()
-                            R1 = torch.rand(size=[batch, seqLen, embed_dim], device=self.device, dtype=self.dtype) - 0.5
-                            R2 = R1.clone()
-                            W = torch.rand(size=[embed_dim], device=self.device, dtype=self.dtype) - 0.5
-                            output_real, scales_real = torch_add_norm_quant_bf16_fp8(X1.reshape(-1, X1.shape[2]), R1.reshape(-1, R1.shape[2]), W, self.eps)
-                            output_pred, scales_pred = add_norm_quant_bf16_fp8(X2.reshape(-1, X1.shape[2]), R2.reshape(-1, R2.shape[2]), W, self.eps)
+                    with self.subTest(shape=[batch, seqLen, embed_dim]):
+                        X1 = torch.rand(size=[batch, seqLen, embed_dim], device=self.device, dtype=self.dtype) - 0.5
+                        X2 = X1.clone()
+                        R1 = torch.rand(size=[batch, seqLen, embed_dim], device=self.device, dtype=self.dtype) - 0.5
+                        R2 = R1.clone()
+                        W = torch.rand(size=[embed_dim], device=self.device, dtype=self.dtype) - 0.5
+                        output_real, scales_real = torch_add_norm_quant_bf16_fp8(
+                            X1.reshape(-1, X1.shape[2]), R1.reshape(-1, R1.shape[2]), W, self.eps
+                        )
+                        output_pred, scales_pred = add_norm_quant_bf16_fp8(
+                            X2.reshape(-1, X1.shape[2]), R2.reshape(-1, R2.shape[2]), W, self.eps
+                        )
 
-                            self.assertTrue(
-                                error(output_real, output_pred) < 0.01,
-                                f"Accuracy test failed for size {batch}, {seqLen}, {embed_dim}. output_real={output_real}, output_pred={output_pred}"
-                            )
-                            self.assertTrue(
-                                error(scales_real, scales_pred) < 0.01,
-                                f"Accuracy test failed for size {batch}, {seqLen}, {embed_dim}. scales_real={scales_real}, scales_pred={scales_pred}"
-                            )
+                        self.assertTrue(
+                            error(output_real, output_pred) < 0.01,
+                            f"Accuracy test failed for size {batch}, {seqLen}, {embed_dim}. "
+                            f"output_real={output_real}, output_pred={output_pred}",
+                        )
+                        self.assertTrue(
+                            error(scales_real, scales_pred) < 0.01,
+                            f"Accuracy test failed for size {batch}, {seqLen}, {embed_dim}. "
+                            f"scales_real={scales_real}, scales_pred={scales_pred}",
+                        )
 
     def test_performance(self):
         """Test the performance of FusedAddNormQuant using benchmark."""
         for batch in self.batchs:
             for seqLen in self.seqLens:
                 for embed_dim in self.embed_dims:
-                        with self.subTest(shape=[batch, seqLen, embed_dim]):
-                            X1 = torch.rand(size=[batch, seqLen, embed_dim], device=self.device, dtype=self.dtype) - 0.5
-                            X2 = torch.rand(size=[batch, seqLen, embed_dim], device=self.device, dtype=self.dtype) - 0.5
-                            R1 = torch.rand(size=[batch, seqLen, embed_dim], device=self.device, dtype=self.dtype) - 0.5
-                            R2 = R1.clone()
-                            W = torch.rand(size=[embed_dim], device=self.device, dtype=self.dtype) - 0.5
+                    with self.subTest(shape=[batch, seqLen, embed_dim]):
+                        X1 = torch.rand(size=[batch, seqLen, embed_dim], device=self.device, dtype=self.dtype) - 0.5
+                        X2 = torch.rand(size=[batch, seqLen, embed_dim], device=self.device, dtype=self.dtype) - 0.5
+                        R1 = torch.rand(size=[batch, seqLen, embed_dim], device=self.device, dtype=self.dtype) - 0.5
+                        R2 = R1.clone()
+                        W = torch.rand(size=[embed_dim], device=self.device, dtype=self.dtype) - 0.5
+
+                        shape = [[batch, seqLen, embed_dim]]
+                        tflops = 0.0
+                        benchmark(
+                            torch_add_norm_quant_bf16_fp8,
+                            shape,
+                            tflops,
+                            100,
+                            X1.reshape(-1, X1.shape[2]),
+                            R1.reshape(-1, R1.shape[2]),
+                            W,
+                            self.eps,
+                        )
+                        benchmark(
+                            add_norm_quant_bf16_fp8,
+                            shape,
+                            tflops,
+                            100,
+                            X2.reshape(-1, X1.shape[2]),
+                            R2.reshape(-1, R2.shape[2]),
+                            W,
+                            self.eps,
+                        )
 
-                            shape = [[batch, seqLen, embed_dim]]
-                            tflops = 0.0
-                            benchmark(torch_add_norm_quant_bf16_fp8, shape, tflops, 100, X1.reshape(-1, X1.shape[2]), R1.reshape(-1, R1.shape[2]), W, self.eps)
-                            benchmark(add_norm_quant_bf16_fp8, shape, tflops, 100, X2.reshape(-1, X1.shape[2]), R2.reshape(-1, R2.shape[2]), W, self.eps)
 
 if __name__ == "__main__":
-    unittest.main()
\ No newline at end of file
+    unittest.main()
diff --git a/lightllm-kernel/test/fusion/gelu_per_token_quant_test.py b/lightllm-kernel/test/fusion/gelu_per_token_quant_test.py
index 90fc00025..66a605be3 100644
--- a/lightllm-kernel/test/fusion/gelu_per_token_quant_test.py
+++ b/lightllm-kernel/test/fusion/gelu_per_token_quant_test.py
@@ -4,10 +4,12 @@
 from lightllm_kernel.ops import per_token_quant_bf16_fp8, gelu_per_token_quant_bf16_fp8
 from test.utils import benchmark, error
 
+
 def gelu_quant(x):
     y = gelu_fwd(x)
     return per_token_quant_bf16_fp8(y)
 
+
 class TestGeluQuantBF16(unittest.TestCase):
     def setUp(self):
         """Set up common test parameters."""
@@ -21,20 +23,23 @@ def test_accuracy(self):
         for token in self.tokens:
             for hiddenDim in self.hiddenDims:
                 with self.subTest(shape=[token, hiddenDim]):
-                    input = torch.normal(mean=0.0, std=10, size=[token, hiddenDim], device=self.device, dtype=self.dtype)
+                    input = torch.normal(
+                        mean=0.0, std=10, size=[token, hiddenDim], device=self.device, dtype=self.dtype
+                    )
 
                     y_real, scales_real = gelu_quant(input)
                     y_pred, scales_pred = gelu_per_token_quant_bf16_fp8(input)
-                    
+
                     self.assertTrue(
                         error(scales_real, scales_pred) < 0.01,
-                        f"Accuracy test failed for size {token}, {hiddenDim}. scales_real={scales_real}, scales_pred={scales_pred}"
+                        f"Accuracy test failed for size {token}, {hiddenDim}. "
+                        f"scales_real={scales_real}, scales_pred={scales_pred}",
                     )
                     self.assertTrue(
                         error(y_real, y_pred) < 0.01,
-                        f"Accuracy test failed for size {token}, {hiddenDim}. y_real={y_real}, y_pred={y_pred}"
+                        f"Accuracy test failed for size {token}, {hiddenDim}. y_real={y_real}, y_pred={y_pred}",
                     )
-                    
+
     def test_performance(self):
         """Test the performance of gelu_per_token_quant using benchmark."""
         for token in self.tokens:
@@ -46,5 +51,6 @@ def test_performance(self):
                     benchmark(gelu_per_token_quant_bf16_fp8, shape, tflops, 100, input)
                     benchmark(gelu_quant, shape, tflops, 100, input)
 
+
 if __name__ == "__main__":
-    unittest.main()
\ No newline at end of file
+    unittest.main()
diff --git a/lightllm-kernel/test/fusion/post_tp_norm_test.py b/lightllm-kernel/test/fusion/post_tp_norm_test.py
index 4830112c3..0772aae3c 100755
--- a/lightllm-kernel/test/fusion/post_tp_norm_test.py
+++ b/lightllm-kernel/test/fusion/post_tp_norm_test.py
@@ -12,6 +12,7 @@ def post_tp_norm(input, weight, tp_variance, embed_dim, eps):
     out = weight * input.to(torch.bfloat16)
     return out
 
+
 class TestPostTpNormBF16(unittest.TestCase):
     def setUp(self):
         """Set up common test parameters."""
@@ -34,7 +35,7 @@ def test_accuracy(self):
                     y_pred = post_tp_norm_bf16(X, W, V, self.embed_dim, self.eps)
                     self.assertTrue(
                         error(y_pred, y_real) < 0.01,
-                        f"Accuracy test failed for size {batch}, {size}. y_real={y_real}, y_pred={y_pred}"
+                        f"Accuracy test failed for size {batch}, {size}. y_real={y_real}, y_pred={y_pred}",
                     )
 
     def test_performance(self):
@@ -50,5 +51,6 @@ def test_performance(self):
                     benchmark(post_tp_norm_bf16, shape, tflops, 100, X, W, V, self.embed_dim, self.eps)
                     benchmark(post_tp_norm, shape, tflops, 100, X, W, V, self.embed_dim, self.eps)
 
+
 if __name__ == "__main__":
-    unittest.main()
\ No newline at end of file
+    unittest.main()
diff --git a/lightllm-kernel/test/fusion/pre_tp_norm_test.py b/lightllm-kernel/test/fusion/pre_tp_norm_test.py
index baf0e52ea..5f82a189a 100755
--- a/lightllm-kernel/test/fusion/pre_tp_norm_test.py
+++ b/lightllm-kernel/test/fusion/pre_tp_norm_test.py
@@ -9,6 +9,7 @@ def pre_tp_norm(input):
     tp_variance = input.pow(2).sum(-1, keepdim=False)
     return tp_variance
 
+
 class TestPreTpNormBF16(unittest.TestCase):
     def setUp(self):
         """Set up common test parameters."""
@@ -27,7 +28,7 @@ def test_accuracy(self):
                     y_pred = pre_tp_norm_bf16(X)
                     self.assertTrue(
                         error(y_pred, y_real) < 0.01,
-                        f"Accuracy test failed for size {batch}, {size}. y_real={y_real}, y_pred={y_pred}"
+                        f"Accuracy test failed for size {batch}, {size}. y_real={y_real}, y_pred={y_pred}",
                     )
 
     def test_performance(self):
@@ -35,12 +36,13 @@ def test_performance(self):
             for size in self.sizes:
                 with self.subTest(shape=[batch, size]):
                     X = torch.rand(size=[batch, size], device=self.device, dtype=self.dtype) - 0.5
-                    W = torch.rand(size=[size], device=self.device, dtype=self.dtype) - 0.5
+                    # W = torch.rand(size=[size], device=self.device, dtype=self.dtype) - 0.5
 
                     shape = [[batch, size], [size], [batch, size]]
                     tflops = 0.0
                     benchmark(pre_tp_norm_bf16, shape, tflops, 100, X)
                     benchmark(pre_tp_norm, shape, tflops, 100, X)
 
+
 if __name__ == "__main__":
-    unittest.main()
\ No newline at end of file
+    unittest.main()
diff --git a/lightllm-kernel/test/gemm/cutlass_scaled_mm_test.py b/lightllm-kernel/test/gemm/cutlass_scaled_mm_test.py
index a9d0d014d..1ef8be74d 100644
--- a/lightllm-kernel/test/gemm/cutlass_scaled_mm_test.py
+++ b/lightllm-kernel/test/gemm/cutlass_scaled_mm_test.py
@@ -10,6 +10,7 @@ def torch_cutlass_scale_gemm_with_ls(x_q, w_q_t, x_scale, w_scale, out_dtype=tor
     y_pred = y_pred_tmp * ls
     return y_pred
 
+
 class TestQuantBF16(unittest.TestCase):
     def setUp(self):
         """Set up common test parameters."""
@@ -18,7 +19,6 @@ def setUp(self):
         self.device = "cuda"
         self.dtype = torch.bfloat16
 
-
     def test_accuracy(self):
         """Test the accuracy of cutlass_scaled_mm_bias_ls"""
         for token in self.tokens:
@@ -29,10 +29,11 @@ def test_accuracy(self):
                     input = torch.randn(size=[M, K], device=self.device, dtype=self.dtype)
                     x_q, x_scale = ops.scaled_fp8_quant(input, scale=None, scale_ub=None, use_per_token_if_dynamic=True)
 
-                    
                     # 生成权重张量w_q（N×K），转置后为K×N（列优先）
                     weight = torch.randn(size=[N, K], device=self.device, dtype=self.dtype)
-                    w_q, w_scale = ops.scaled_fp8_quant(weight, scale=None, scale_ub=None, use_per_token_if_dynamic=True)
+                    w_q, w_scale = ops.scaled_fp8_quant(
+                        weight, scale=None, scale_ub=None, use_per_token_if_dynamic=True
+                    )
 
                     # 转置，w_q_t为列优先
                     w_q_t = w_q.t()
@@ -43,11 +44,13 @@ def test_accuracy(self):
                     ls = torch.randn(size=[N], device=self.device, dtype=torch.bfloat16)
 
                     cutlass_scaled_mm_bias_ls(y_pred, x_q, w_q_t, x_scale, w_scale, bias=bias, ls=ls)
-                    y_real = torch_cutlass_scale_gemm_with_ls(x_q, w_q_t, x_scale, w_scale, out_dtype=torch.bfloat16, bias=bias, ls=ls)
+                    y_real = torch_cutlass_scale_gemm_with_ls(
+                        x_q, w_q_t, x_scale, w_scale, out_dtype=torch.bfloat16, bias=bias, ls=ls
+                    )
 
                     self.assertTrue(
                         error(y_pred, y_real) < 0.01,
-                        f"Accuracy test failed for size {token}, {hiddenDim}. y_pred={y_pred}, y_real={y_real}"
+                        f"Accuracy test failed for size {token}, {hiddenDim}. y_pred={y_pred}, y_real={y_real}",
                     )
 
     def test_performance(self):
@@ -62,7 +65,9 @@ def test_performance(self):
 
                     # 生成权重张量w_q（N×K），转置后为K×N（列优先）
                     weight = torch.randn(size=[N, K], device=self.device, dtype=self.dtype) - 0.5
-                    w_q, w_scale = ops.scaled_fp8_quant(weight, scale=None, scale_ub=None, use_per_token_if_dynamic=True)
+                    w_q, w_scale = ops.scaled_fp8_quant(
+                        weight, scale=None, scale_ub=None, use_per_token_if_dynamic=True
+                    )
 
                     bias = torch.randn(size=[N], device=self.device, dtype=torch.bfloat16)
                     ls = torch.randn(size=[N], device=self.device, dtype=torch.bfloat16)
@@ -72,9 +77,34 @@ def test_performance(self):
 
                     y_pred = torch.empty((M, N), dtype=input.dtype, device=input.device)
                     shape = [[token, hiddenDim]]
-                    tflops = 2 * token * (3 * hiddenDim) * hiddenDim / 1024**4
-                    benchmark(cutlass_scaled_mm_bias_ls, shape, tflops, 100, y_pred, x_q, w_q_t, x_scale, w_scale, bias=bias, ls=ls)
-                    benchmark(torch_cutlass_scale_gemm_with_ls, shape, tflops, 100, x_q, w_q_t, x_scale, w_scale, out_dtype=torch.bfloat16, bias=bias, ls=ls) # 无bias 495GB/s, 有bias 482GB/s
+                    tflops = 2 * token * (3 * hiddenDim) * hiddenDim / 1024 ** 4
+                    benchmark(
+                        cutlass_scaled_mm_bias_ls,
+                        shape,
+                        tflops,
+                        100,
+                        y_pred,
+                        x_q,
+                        w_q_t,
+                        x_scale,
+                        w_scale,
+                        bias=bias,
+                        ls=ls,
+                    )
+                    benchmark(
+                        torch_cutlass_scale_gemm_with_ls,
+                        shape,
+                        tflops,
+                        100,
+                        x_q,
+                        w_q_t,
+                        x_scale,
+                        w_scale,
+                        out_dtype=torch.bfloat16,
+                        bias=bias,
+                        ls=ls,
+                    )  # 无bias 495GB/s, 有bias 482GB/s
+
 
 if __name__ == "__main__":
-    unittest.main()
\ No newline at end of file
+    unittest.main()
diff --git a/lightllm-kernel/test/norm/rmsnorm_test.py b/lightllm-kernel/test/norm/rmsnorm_test.py
index aaccc1c92..c79951052 100755
--- a/lightllm-kernel/test/norm/rmsnorm_test.py
+++ b/lightllm-kernel/test/norm/rmsnorm_test.py
@@ -20,11 +20,11 @@ def test_accuracy(self):
                     X = torch.rand(size=[batch, size], device=self.device, dtype=self.dtype) - 0.5
                     W = torch.rand(size=[size], device=self.device, dtype=self.dtype) - 0.5
 
-                    y_real = torch.nn.functional.rms_norm(X, (size, ), W)
+                    y_real = torch.nn.functional.rms_norm(X, (size,), W)
                     y_pred = rmsnorm_bf16(X, W)
                     self.assertTrue(
                         error(y_pred, y_real) < 0.01,
-                        f"Accuracy test failed for size {batch}, {size}. y_real={y_real}, y_pred={y_pred}"
+                        f"Accuracy test failed for size {batch}, {size}. y_real={y_real}, y_pred={y_pred}",
                     )
                     print(f"{error(y_pred, y_real) = }")
 
@@ -39,7 +39,8 @@ def test_performance(self):
                     shape = [[batch, size], [size], [batch, size]]
                     tflops = 0.0
                     benchmark(rmsnorm_bf16, shape, tflops, 100, X, W)
-                    benchmark(torch.nn.functional.rms_norm, shape, tflops, 100, X, (size, ), W)
+                    benchmark(torch.nn.functional.rms_norm, shape, tflops, 100, X, (size,), W)
+
 
 if __name__ == "__main__":
-    unittest.main()
\ No newline at end of file
+    unittest.main()
diff --git a/lightllm-kernel/test/quant/quant_test.py b/lightllm-kernel/test/quant/quant_test.py
index a71d2f249..48f50f7d8 100755
--- a/lightllm-kernel/test/quant/quant_test.py
+++ b/lightllm-kernel/test/quant/quant_test.py
@@ -25,11 +25,12 @@ def test_accuracy(self):
                     y_pred, scales_pred = per_token_quant_bf16_fp8(input)
                     self.assertTrue(
                         error(scales_real, scales_pred) < 0.01,
-                        f"Accuracy test failed for size {token}, {hiddenDim}. scales_real={scales_real}, scales_pred={scales_pred}"
+                        f"Accuracy test failed for size {token}, {hiddenDim}. "
+                        f"scales_real={scales_real}, scales_pred={scales_pred}",
                     )
                     self.assertTrue(
                         error(y_real, y_pred) < 0.01,
-                        f"Accuracy test failed for size {token}, {hiddenDim}. y_real={y_real}, y_pred={y_pred}"
+                        f"Accuracy test failed for size {token}, {hiddenDim}. y_real={y_real}, y_pred={y_pred}",
                     )
 
     def test_performance(self):
@@ -39,9 +40,10 @@ def test_performance(self):
                 with self.subTest(shape=[token, size]):
                     input = torch.rand(size=[token, size], device=self.device, dtype=self.dtype) - 0.5
                     shape = [[token, size]]
-                    tflops = token * size / 1024**4
+                    tflops = token * size / 1024 ** 4
                     benchmark(per_token_quant_bf16_fp8, shape, tflops, 100, input)
                     benchmark(ops.scaled_fp8_quant, shape, tflops, 100, input, None, True)
 
+
 if __name__ == "__main__":
-    unittest.main()
\ No newline at end of file
+    unittest.main()
diff --git a/lightllm-kernel/test/utils.py b/lightllm-kernel/test/utils.py
index c87373178..ed79fdfec 100644
--- a/lightllm-kernel/test/utils.py
+++ b/lightllm-kernel/test/utils.py
@@ -6,15 +6,16 @@
 def error(y_pred: torch.Tensor, y_real: torch.Tensor) -> torch.Tensor:
     """
     Compute SNR between y_pred(tensor) and y_real(tensor)
-    
+
     SNR can be calcualted as following equation:
-    
+
         SNR(pred, real) = (pred - real) ^ 2 / (real) ^ 2
-    
+
     if x and y are matrixs, SNR error over matrix should be the mean value of SNR error over all elements.
-    
+
         SNR(pred, real) = mean((pred - real) ^ 2 / (real) ^ 2)
 
+
     Args:
         y_pred (torch.Tensor): _description_
         y_real (torch.Tensor): _description_
@@ -31,10 +32,11 @@ def error(y_pred: torch.Tensor, y_real: torch.Tensor) -> torch.Tensor:
     y_real = torch.flatten(y_real).float()
 
     if y_pred.shape != y_real.shape:
-        raise ValueError(f'Can not compute snr loss for tensors with different shape. '
-            f'({y_pred.shape} and {y_real.shape})')
+        raise ValueError(
+            f"Can not compute snr loss for tensors with different shape. " f"({y_pred.shape} and {y_real.shape})"
+        )
 
-    noise_power  = torch.pow(y_pred - y_real, 2).sum(dim=-1)
+    noise_power = torch.pow(y_pred - y_real, 2).sum(dim=-1)
     signal_power = torch.pow(y_real, 2).sum(dim=-1)
     snr = (noise_power) / (signal_power + 1e-7)
     return snr.item()
@@ -43,11 +45,11 @@ def error(y_pred: torch.Tensor, y_real: torch.Tensor) -> torch.Tensor:
 def benchmark(func: Callable, shape: List[int], tflops: float, steps: int, *args, **kwargs):
     """
     A decorator function to assist in performance testing of CUDA operations.
-    
+
     This function will:
-    1. Automatically determine whether any parameters in the argument list, 
+    1. Automatically determine whether any parameters in the argument list,
        or the output of the `func`, are of type `torch.Tensor`.
-    2. If so, calculate the memory usage of the input and output tensors 
+    2. If so, calculate the memory usage of the input and output tensors
        on the GPU (based on their data type and `torch.numel()`).
     3. Establish a CUDA graph and attempt to execute `func` repeatedly for `steps` iterations.
     4. Record the execution time during these iterations.
@@ -64,7 +66,7 @@ def benchmark(func: Callable, shape: List[int], tflops: float, steps: int, *args
     Returns:
         function result
     """
-    
+
     # Ensure CUDA is available
     if not torch.cuda.is_available():
         raise RuntimeError("CUDA is required for benchmarking.")
@@ -113,7 +115,7 @@ def calculate_memory(tensor: torch.Tensor):
     elapsed_time_s = elapsed_time_ms / 1000  # Convert to seconds
     avg_time_per_step = elapsed_time_s / steps
     compute_performance = tflops / avg_time_per_step  # TFLOPS
-    memory_throughput = (total_memory * steps / (1024**3)) / elapsed_time_s  # GB/s
+    memory_throughput = (total_memory * steps / (1024 ** 3)) / elapsed_time_s  # GB/s
 
     # Print performance metrics
     print(f"Function: {func.__name__}{shape}")
@@ -122,4 +124,4 @@ def calculate_memory(tensor: torch.Tensor):
     print(f"Average Time Per Step: {avg_time_per_step * 1000 :.3f} ms")
     print(f"Compute Performance: {compute_performance:.2f} TFLOPS")
     print(f"Memory Throughput: {memory_throughput:.2f} GB/s")
-    print("") # print a blank line.
\ No newline at end of file
+    print("")  # print a blank line.

From 13a1c3dc3e3507c575c230d1f27690566a64d292 Mon Sep 17 00:00:00 2001
From: Xtra <571889291@qq.com>
Date: Tue, 24 Jun 2025 18:48:28 +0800
Subject: [PATCH 13/14] add per_token_quant_bf16_int8 kernel (#939)

---
 lightllm-kernel/csrc/ops_bindings.cpp         |   3 +-
 ...bf16.cu => per_token_quantize_bf16_fp8.cu} |   0
 .../quant/per_token_quantize_bf16_int8.cu     | 338 ++++++++++++++++++
 lightllm-kernel/include/ops_common.h          |   6 +
 lightllm-kernel/include/utils.h               |   6 +
 .../lightllm_kernel/ops/__init__.py           |   3 +-
 lightllm-kernel/lightllm_kernel/ops/quant.py  |   7 +
 .../{quant_test.py => fp8_quant_test.py}      |   0
 lightllm-kernel/test/quant/int8_quant_test.py |  50 +++
 9 files changed, 411 insertions(+), 2 deletions(-)
 rename lightllm-kernel/csrc/quant/{per_token_quantize_bf16.cu => per_token_quantize_bf16_fp8.cu} (100%)
 create mode 100644 lightllm-kernel/csrc/quant/per_token_quantize_bf16_int8.cu
 rename lightllm-kernel/test/quant/{quant_test.py => fp8_quant_test.py} (100%)
 create mode 100644 lightllm-kernel/test/quant/int8_quant_test.py

diff --git a/lightllm-kernel/csrc/ops_bindings.cpp b/lightllm-kernel/csrc/ops_bindings.cpp
index cd4f2504c..40346e3ea 100644
--- a/lightllm-kernel/csrc/ops_bindings.cpp
+++ b/lightllm-kernel/csrc/ops_bindings.cpp
@@ -10,7 +10,8 @@ PYBIND11_MODULE(_C, m) {
     m.def("rmsnorm_align16_bf16", &rmsnorm_align16_bf16, "RMSNORM (CUDA)");
     m.def("pre_tp_norm_bf16", &pre_tp_norm_bf16, "PRE TP NORM (CUDA)");
     m.def("post_tp_norm_bf16", &post_tp_norm_bf16, "POST TP NORM (CUDA)");
-    m.def("per_token_quant_bf16_fp8", &per_token_quant_bf16_fp8, "PER TOKEN QUANT (CUDA)");
+    m.def("per_token_quant_bf16_fp8", &per_token_quant_bf16_fp8, "PER TOKEN QUANT FP8 (CUDA)");
+    m.def("per_token_quant_bf16_int8", &per_token_quant_bf16_int8, "PER TOKEN QUANT INT8 (CUDA)");
     m.def("add_norm_quant_bf16_fp8", &add_norm_quant_bf16_fp8, "ADD NORM QUANT FUSED (CUDA)");
     m.def("gelu_per_token_quant_bf16_fp8", &gelu_per_token_quant_bf16_fp8, "GELU QUANT FUSED (CUDA)");
     m.def("cutlass_scaled_mm", &cutlass_scaled_mm, "CUTLASS SCALED MM (CUDA)");
diff --git a/lightllm-kernel/csrc/quant/per_token_quantize_bf16.cu b/lightllm-kernel/csrc/quant/per_token_quantize_bf16_fp8.cu
similarity index 100%
rename from lightllm-kernel/csrc/quant/per_token_quantize_bf16.cu
rename to lightllm-kernel/csrc/quant/per_token_quantize_bf16_fp8.cu
diff --git a/lightllm-kernel/csrc/quant/per_token_quantize_bf16_int8.cu b/lightllm-kernel/csrc/quant/per_token_quantize_bf16_int8.cu
new file mode 100644
index 000000000..848f92580
--- /dev/null
+++ b/lightllm-kernel/csrc/quant/per_token_quantize_bf16_int8.cu
@@ -0,0 +1,338 @@
+#include "ops_common.h"
+#include "reduce/sm70.cuh"
+
+
+namespace lightllm {
+namespace ops {
+
+using namespace lightllm;
+
+// Per-token BF16 -> INT8 quantization, scalar path: one block per row, threads step over the N columns in strides of TPB
+template<int32_t TPB>
+__global__ void device_per_token_quant_bf16_to_int8_general(
+    const bf16_t* __restrict__ input,  // Input tensor in BF16 format
+    int8_t* __restrict__ output,   // Output tensor in INT8 format
+    fp32_t* __restrict__ scales,       // Output scales for each token
+    const int64_t M,                  // Number of rows in the input tensor
+    const int64_t N
+) {
+    const int32_t bid = blockIdx.x;
+    const int32_t tid = threadIdx.x;
+    constexpr fp32_t kINT8Max = 127.0f; // Maximum value representable in INT8 format
+    
+    const bf16_t* _input = input + bid * N; // Start of this block's row in the input (N is 64-bit, so the offset is too)
+    int8_t* _output  = output + bid * N; // Start of this block's row in the output
+
+    fp32_t* _scales;
+    _scales = scales + bid; // One scale slot per row
+
+    // Per-thread scalars for intermediate storage
+    int8_t local_int8;
+    bf16_t local_bf16;
+
+    extern __shared__ bf16_t workspace1[]; // Dynamic shared memory; indexed up to N - 1, so the launch must supply N * sizeof(bf16_t) bytes
+
+    fp32_t local_max = -FLT_MAX;
+    for (int32_t i = tid; i < N; i += TPB) { // Pass 1: stage the row in shared memory and track this thread's running max
+        local_bf16 = _input[i];
+        workspace1[i] = local_bf16;
+
+        fp32_t tmp = cvt_bf16_f32(local_bf16);
+        local_max = fmaxf(local_max, tmp);
+    }
+
+    // Reduce the maximum value across the block
+    const fp32_t reduced_max = lightllm::reduce::sm70::sync_block_reduce_max_f32<TPB>(local_max);
+
+    // scale = row max / 127; epsilon keeps the reciprocal finite when the scale is zero
+    constexpr fp32_t epsilon = 1e-7f;
+    const fp32_t scale = reduced_max / kINT8Max;
+    const fp32_t inv_scale = 1.0f / (scale + epsilon);
+
+    for (int32_t i = tid; i < N; i += TPB) { // Pass 2: each thread re-reads exactly the entries it staged in pass 1
+        local_bf16 = workspace1[i];
+        
+        fp32_t tmp = cvt_bf16_f32(local_bf16);
+        fp32_t x = tmp * inv_scale;
+        local_int8 = float_to_int8_rn(x);
+
+        _output[i] = local_int8;
+    }
+
+    if(tid == 0){ // A single thread publishes the row's scale
+        *_scales = scale;
+    }
+
+}
+
+// Per-token BF16 -> INT8 quantization, vectorized path: one block per row, each thread moves VPT (8) elements per step
+template<int32_t TPB>
+__global__ void device_per_token_quant_bf16_to_int8_vpt(
+    const bf16_t* __restrict__ input,  // Input tensor in BF16 format
+    int8_t* __restrict__ output,   // Output tensor in INT8 format
+    fp32_t* __restrict__ scales,       // Output scales for each token
+    const int64_t M,                  // Number of rows in the input tensor
+    const int32_t N
+) {
+    constexpr int32_t VPT = 8;
+
+    const int32_t bid = blockIdx.x;
+    const int32_t tid = threadIdx.x;
+    constexpr fp32_t kINT8Max = 127.0f; // Maximum value representable in INT8 format
+    
+    const bf16_t* _input = input + static_cast<int64_t>(bid) * N; // Row start; 64-bit offset so bid * N cannot overflow int32
+    int8_t* _output  = output + static_cast<int64_t>(bid) * N; // Row start in the output, same 64-bit offset
+
+    fp32_t* _scales;
+     _scales = scales + bid; // One scale slot per row
+
+    // Per-thread vectors: VPT quantized bytes and VPT BF16 values packed as VPT / 2 pairs
+    int8_t local_int8[VPT];
+    bf16x2_t local_bf16[VPT / 2];
+
+    extern __shared__ bf16x2_t workspace2[]; // Dynamic shared memory holding the row as BF16 pairs (N / 2 entries)
+
+    fp32_t local_max = -FLT_MAX;
+    for (int32_t i = tid * VPT; i < N; i += TPB * VPT) { // The host dispatch only takes this path when N % 8 == 0
+        // Load VPT BF16 elements from global memory (_input) into the local vector (local_bf16).
+        vec_copy<sizeof(bf16_t) * VPT>(_input + i, local_bf16);
+
+        vec_copy<sizeof(bf16_t) * VPT>(local_bf16, workspace2 + (i >> 1)); // Stage them in shared memory; i >> 1 converts an element index to a pair index
+
+        // Compute the max absolute value over the VPT elements.
+        #pragma unroll
+        for(int32_t j = 0; j< VPT/2; j++){
+            fp32x2_t tmp = bf16x2_to_fp32x2(local_bf16[j]);
+            fp32_t max = fmaxf(fabsf(tmp.x), fabsf(tmp.y));
+            local_max = fmaxf(local_max, max);
+        }
+    }
+
+    // Reduce the maximum value across the block
+    const fp32_t reduced_max = lightllm::reduce::sm70::sync_block_reduce_max_f32<TPB>(local_max);
+
+    // scale = row max / 127; epsilon keeps the reciprocal finite when the scale is zero
+    constexpr fp32_t epsilon = 1e-7f;
+    const fp32_t scale = reduced_max / kINT8Max;
+    const fp32_t inv_scale = 1.0f / (scale + epsilon);
+
+    for (int32_t i = tid * VPT; i < N; i += TPB * VPT) { // Each thread re-reads exactly the chunk it staged above
+        vec_copy<sizeof(bf16_t) * VPT>(workspace2 + (i >> 1), local_bf16);
+
+        #pragma unroll
+        for (int32_t j = 0; j < VPT/2; j++) {
+            fp32x2_t x = bf16x2_to_fp32x2(local_bf16[j]);
+
+            int8_t a = float_to_int8_rn(x.x * inv_scale);
+            int8_t b = float_to_int8_rn(x.y * inv_scale);
+            
+            local_int8[2 * j] = a;
+            local_int8[2 * j + 1] = b;
+        }
+
+        vec_copy<sizeof(int8_t) * VPT>(local_int8, _output + i); // Store the VPT quantized bytes in one vector copy
+    }
+
+    if(tid == 0){ // A single thread publishes the row's scale
+        *_scales = scale;
+    }
+}
+
+
+
+// Per-token BF16 -> INT8 quantization, vectorized path specialized for a compile-time row length N
+template<int32_t TPB, int32_t N>
+__global__ void device_per_token_quant_bf16_to_int8(
+    const bf16_t* __restrict__ input,  // Input tensor in BF16 format
+    int8_t* __restrict__ output,   // Output tensor in INT8 format
+    fp32_t* __restrict__ scales,       // Output scales for each token
+    const int64_t M                  // Number of rows in the input tensor
+) {
+    constexpr int32_t VPT = 8;
+
+    static_assert(N % 2 == 0, "N must be even.");
+    static_assert(N % VPT == 0, "N must be a multiple of VPT.");
+
+    const int32_t bid = blockIdx.x;
+    const int32_t tid = threadIdx.x;
+    constexpr fp32_t kINT8Max = 127.0f; // Maximum value representable in INT8 format
+    
+    const bf16_t* _input = input + static_cast<int64_t>(bid) * N; // Row start; 64-bit offset so bid * N cannot overflow int32
+    int8_t* _output  = output + static_cast<int64_t>(bid) * N; // Row start in the output, same 64-bit offset
+
+    fp32_t* _scales;
+    _scales = scales + bid; // One scale slot per row
+
+    // Per-thread vectors: VPT quantized bytes and VPT BF16 values packed as VPT / 2 pairs
+    int8_t local_int8[VPT];
+    bf16x2_t local_bf16[VPT / 2];
+
+    __shared__ bf16x2_t workspace[N / 2]; // Statically sized shared copy of the row, stored as BF16 pairs
+
+    fp32_t local_max = -FLT_MAX;
+    for (int32_t i = tid * VPT; i < N; i += TPB * VPT) {
+        // Load VPT BF16 elements from global memory (_input) into the local vector (local_bf16).
+        vec_copy<sizeof(bf16_t) * VPT>(_input + i, local_bf16);
+
+        vec_copy<sizeof(bf16_t) * VPT>(local_bf16, workspace + (i >> 1)); // Stage them in shared memory; i >> 1 converts an element index to a pair index
+
+        // Compute the max absolute value over the VPT elements.
+        #pragma unroll
+        for(int32_t j = 0; j< VPT/2; j++){
+            fp32x2_t tmp = bf16x2_to_fp32x2(local_bf16[j]);
+            fp32_t max = fmaxf(fabsf(tmp.x), fabsf(tmp.y));
+            local_max = fmaxf(local_max, max);
+        }
+    }
+
+    // Reduce the maximum value across the block
+    const fp32_t reduced_max = lightllm::reduce::sm70::sync_block_reduce_max_f32<TPB>(local_max);
+
+    // scale = row max / 127; epsilon keeps the reciprocal finite when the scale is zero
+    constexpr fp32_t epsilon = 1e-7f;
+    const fp32_t scale = reduced_max / kINT8Max;
+    const fp32_t inv_scale = 1.0f / (scale + epsilon);
+
+    for (int32_t i = tid * VPT; i < N; i += TPB * VPT) { // Each thread re-reads exactly the chunk it staged above
+        vec_copy<sizeof(bf16_t) * VPT>(workspace + (i >> 1), local_bf16);
+
+        #pragma unroll
+        for (int32_t j = 0; j < VPT/2; j++) {
+            fp32x2_t x = bf16x2_to_fp32x2(local_bf16[j]);
+
+            int8_t a = float_to_int8_rn(x.x * inv_scale);
+            int8_t b = float_to_int8_rn(x.y * inv_scale);
+
+            local_int8[2 * j] = a;
+            local_int8[2 * j + 1] = b;
+        }
+
+        vec_copy<sizeof(int8_t) * VPT>(local_int8, _output + i); // Store the VPT quantized bytes in one vector copy
+    }
+
+    if(tid == 0){ // A single thread publishes the row's scale
+        *_scales = scale;
+    }
+}
+
+
+void per_token_quant_bf16_int8 (  // Per-row BF16 -> INT8 quantization of a 2-D CUDA tensor
+    Tensor& output,       // Receives the INT8 values, written as a contiguous row-major buffer
+    const Tensor& input,  // 2-D BF16 CUDA tensor; copied to a contiguous buffer if needed
+    Tensor& scales        // Receives one FP32 scale per input row
+) {
+    TORCH_CHECK(input.is_cuda(), "Input must be a CUDA tensor");
+    TORCH_CHECK(input.dim() == 2, "Input must be 2-dimensional");
+    TORCH_CHECK(input.scalar_type() == c10::kBFloat16, "Input must be BF16 type");
+    TORCH_CHECK(output.is_contiguous() && output.numel() >= input.numel(), "Output must be contiguous and hold every input element");
+    Tensor contiguous_input = input.is_contiguous() ? input : input.contiguous();
+    Tensor contiguous_scales = scales.is_contiguous() ? scales : scales.contiguous();
+    TORCH_CHECK(scales.is_contiguous() && scales.numel() >= input.size(0), "Scales must be contiguous with one element per row");  // a .contiguous() copy would swallow the results
+    const int64_t M = input.size(0);
+    const int64_t N = input.size(1);
+    if (M == 0) return;  // No rows: a zero-block grid is not a valid launch configuration
+    const int32_t blocks = M;
+    TORCH_CHECK(static_cast<size_t>(N) * sizeof(bf16_t) <= 48u * 1024u, "Row is too wide for the 48 KB default shared memory limit");
+    switch (N) {
+        case 16:
+            device_per_token_quant_bf16_to_int8<128, 16>
+            <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(contiguous_input),
+                PTR<int8_t>(output),
+                PTR<fp32_t>(contiguous_scales),
+                M
+            );
+            break;
+        case 32:
+            device_per_token_quant_bf16_to_int8<128, 32>
+            <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(contiguous_input),
+                PTR<int8_t>(output),
+                PTR<fp32_t>(contiguous_scales),
+                M
+            );
+            break;
+        case 64:
+            device_per_token_quant_bf16_to_int8<128, 64>
+            <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(contiguous_input),
+                PTR<int8_t>(output),
+                PTR<fp32_t>(contiguous_scales),
+                M
+            );
+            break;
+        case 512:
+            device_per_token_quant_bf16_to_int8<128, 512>
+            <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(contiguous_input),
+                PTR<int8_t>(output),
+                PTR<fp32_t>(contiguous_scales),
+                M
+            );
+            break;
+        case 1024:
+            device_per_token_quant_bf16_to_int8<128, 1024>
+            <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(contiguous_input),
+                PTR<int8_t>(output),
+                PTR<fp32_t>(contiguous_scales),
+                M
+            );
+            break;
+        case 3200:
+            device_per_token_quant_bf16_to_int8<128, 3200>
+            <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(contiguous_input),
+                PTR<int8_t>(output),
+                PTR<fp32_t>(contiguous_scales),
+                M
+            );
+            break;
+        case 4096:
+            device_per_token_quant_bf16_to_int8<128, 4096>
+            <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(contiguous_input),
+                PTR<int8_t>(output),
+                PTR<fp32_t>(contiguous_scales),
+                M
+            );
+            break;
+        case 12800:
+            device_per_token_quant_bf16_to_int8<256, 12800>
+            <<<blocks, 256, 0, at::cuda::getCurrentCUDAStream()>>>(
+                PTR<bf16_t>(contiguous_input),
+                PTR<int8_t>(output),
+                PTR<fp32_t>(contiguous_scales),
+                M
+            );
+            break;
+        default: {
+            static constexpr int TPB = 128;
+            const int64_t shared_mem_size = N * sizeof(bf16_t);
+            if (N % 8 == 0) {
+                device_per_token_quant_bf16_to_int8_vpt<TPB>
+                <<<blocks, TPB, shared_mem_size, at::cuda::getCurrentCUDAStream()>>>(
+                    PTR<bf16_t>(contiguous_input),
+                    PTR<int8_t>(output),
+                    PTR<fp32_t>(contiguous_scales),
+                    M,
+                    N
+                );
+            } else {
+                device_per_token_quant_bf16_to_int8_general<TPB>
+                <<<blocks, TPB, shared_mem_size, at::cuda::getCurrentCUDAStream()>>>(
+                    PTR<bf16_t>(contiguous_input),
+                    PTR<int8_t>(output),
+                    PTR<fp32_t>(contiguous_scales),
+                    M,
+                    N
+                );
+            }
+        }
+    }
+
+    return;
+}
+
+} // namespace ops
+} // namespace lightllm
\ No newline at end of file
diff --git a/lightllm-kernel/include/ops_common.h b/lightllm-kernel/include/ops_common.h
index d7d6a454c..6f814a6c1 100644
--- a/lightllm-kernel/include/ops_common.h
+++ b/lightllm-kernel/include/ops_common.h
@@ -32,6 +32,12 @@ void per_token_quant_bf16_fp8(
     Tensor& scales
 );
 
+void per_token_quant_bf16_int8(
+    Tensor& output,
+    const Tensor& input,
+    Tensor& scales
+);
+
 std::tuple<Tensor, Tensor> add_norm_quant_bf16_fp8(
     Tensor& X, const Tensor &R, const Tensor &W,
     const fp32_t eps
diff --git a/lightllm-kernel/include/utils.h b/lightllm-kernel/include/utils.h
index 105dc89fc..882b5cea8 100644
--- a/lightllm-kernel/include/utils.h
+++ b/lightllm-kernel/include/utils.h
@@ -68,6 +68,12 @@ __device__ inline bf16x2_t _float22bf162_rn(fp32x2_t val) {
     return bf16x2_t(low, high);
 }
 
+__device__ inline int8_t float_to_int8_rn(fp32_t x) {  // fp32 -> int8 via PTX cvt: .rni rounds to nearest even, .sat saturates
+  uint32_t dst;  // 32-bit destination bound to the "=r" register constraint
+  asm volatile("cvt.rni.sat.s8.f32 %0, %1;" : "=r"(dst) : "f"(x));
+  return reinterpret_cast<const int8_t&>(dst);  // Reads a single byte of dst; NOTE(review): presumably the low byte carrying the s8 result - confirm
+}
+
 template <typename T>
 __host__ __device__ T Cdiv(T numerator, T denominator) {
     return (numerator + denominator - 1) / denominator;
diff --git a/lightllm-kernel/lightllm_kernel/ops/__init__.py b/lightllm-kernel/lightllm_kernel/ops/__init__.py
index d061bea25..fe6cfdde8 100644
--- a/lightllm-kernel/lightllm_kernel/ops/__init__.py
+++ b/lightllm-kernel/lightllm_kernel/ops/__init__.py
@@ -77,7 +77,7 @@
     allgather_register_graph_buffers,
     allgather_get_graph_buffer_ipc_meta,
 )
-from .quant import per_token_quant_bf16_fp8
+from .quant import per_token_quant_bf16_fp8, per_token_quant_bf16_int8
 from .gemm import cutlass_scaled_mm_bias_ls
 from .moe import grouped_topk
 from .attention import group8_int8kv_flashdecoding_stage1, group_int8kv_decode_attention
@@ -85,6 +85,7 @@
 __all__ = [
     "rmsnorm_bf16",
     "per_token_quant_bf16_fp8",
+    "per_token_quant_bf16_int8",
     "pre_tp_norm_bf16",
     "post_tp_norm_bf16",
     "add_norm_quant_bf16_fp8",
diff --git a/lightllm-kernel/lightllm_kernel/ops/quant.py b/lightllm-kernel/lightllm_kernel/ops/quant.py
index 05634f2f0..8d3f8fe9d 100644
--- a/lightllm-kernel/lightllm_kernel/ops/quant.py
+++ b/lightllm-kernel/lightllm_kernel/ops/quant.py
@@ -9,3 +9,10 @@ def per_token_quant_bf16_fp8(input: torch.tensor) -> Tuple[torch.Tensor, torch.T
     scales = torch.empty(size=(input.shape[0], 1), device=input.device, dtype=torch.float32)
     _C.per_token_quant_bf16_fp8(output, input, scales)
     return output, scales
+
+def per_token_quant_bf16_int8(input: torch.tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Quantize the given input using per token quant method"""
+    output = torch.empty_like(input, dtype=torch.int8)
+    scales = torch.empty(size=(input.shape[0], 1), device=input.device, dtype=torch.float32)
+    _C.per_token_quant_bf16_int8(output, input, scales)
+    return output, scales
diff --git a/lightllm-kernel/test/quant/quant_test.py b/lightllm-kernel/test/quant/fp8_quant_test.py
similarity index 100%
rename from lightllm-kernel/test/quant/quant_test.py
rename to lightllm-kernel/test/quant/fp8_quant_test.py
diff --git a/lightllm-kernel/test/quant/int8_quant_test.py b/lightllm-kernel/test/quant/int8_quant_test.py
new file mode 100644
index 000000000..5da9f08bf
--- /dev/null
+++ b/lightllm-kernel/test/quant/int8_quant_test.py
@@ -0,0 +1,50 @@
+import unittest
+import torch
+from lightllm.common.vllm_kernel import _custom_ops as ops
+from lightllm_kernel.ops import per_token_quant_bf16_int8
+from test.utils import benchmark, error
+
+
+class TestQuantBF16(unittest.TestCase):
+    def setUp(self):
+        """Set up common test parameters."""
+        self.tokens = [1024, 13325]
+        self.hiddenDims = [256, 257, 511, 1023, 1024, 1025, 1032, 3200, 3201, 3208, 12800]
+        self.device = "cuda:2"
+        self.dtype = torch.bfloat16
+        torch.cuda.set_device(self.device)
+
+    def test_accuracy(self):
+        """Test the accuracy of per_token_quant"""
+        for token in self.tokens:
+            for hiddenDim in self.hiddenDims:
+                with self.subTest(shape=[token, hiddenDim]):
+                    input = torch.rand(size=[token, hiddenDim], device=self.device, dtype=self.dtype) - 0.5
+                    y_real, scales_real, _ = ops.scaled_int8_quant(
+                        input.contiguous().cuda(self.device)
+                    )
+                    y_pred, scales_pred = per_token_quant_bf16_int8(input)
+                    self.assertTrue(
+                        error(scales_real, scales_pred) < 0.01,
+                        f"Accuracy test failed for size {token}, {hiddenDim}."
+                        f"scales_real={scales_real}, scales_pred={scales_pred}",
+                    )
+                    self.assertTrue(
+                        error(y_real, y_pred) < 0.01,
+                        f"Accuracy test failed for size {token}, {hiddenDim}. y_real={y_real}, y_pred={y_pred}",
+                    )
+
+    def test_performance(self):
+        """Test the performance of per_token_quant"""
+        for token in self.tokens:
+            for size in self.hiddenDims:
+                with self.subTest(shape=[token, size]):
+                    input = torch.rand(size=[token, size], device=self.device, dtype=self.dtype) - 0.5
+                    shape = [[token, size]]
+                    tflops = token * size / 1024 ** 4
+                    benchmark(per_token_quant_bf16_int8, shape, tflops, 100, input)
+                    benchmark(ops.scaled_int8_quant, shape, tflops, 100, input)
+
+
+if __name__ == "__main__":
+    unittest.main()

From a71b1b2659cb9fb3ecb15421ec6c36e013639e5d Mon Sep 17 00:00:00 2001
From: Xtra <571889291@qq.com>
Date: Tue, 24 Jun 2025 20:29:55 +0800
Subject: [PATCH 14/14] fix:add fabsf() to general kernel when compare max
 values (#943)

---
 .../csrc/quant/per_token_quantize_bf16_fp8.cu | 35 ++++++-------------
 .../quant/per_token_quantize_bf16_int8.cu     | 35 ++++++-------------
 lightllm-kernel/test/quant/fp8_quant_test.py  |  4 +--
 lightllm-kernel/test/quant/int8_quant_test.py |  4 +--
 4 files changed, 26 insertions(+), 52 deletions(-)

diff --git a/lightllm-kernel/csrc/quant/per_token_quantize_bf16_fp8.cu b/lightllm-kernel/csrc/quant/per_token_quantize_bf16_fp8.cu
index ba9a5877e..6e6a98596 100755
--- a/lightllm-kernel/csrc/quant/per_token_quantize_bf16_fp8.cu
+++ b/lightllm-kernel/csrc/quant/per_token_quantize_bf16_fp8.cu
@@ -13,7 +13,6 @@ __global__ void device_per_token_quant_bf16_to_fp8_general(
     const bf16_t* __restrict__ input,  // Input tensor in BF16 format
     fp8_e4m3_t* __restrict__ output,   // Output tensor in FP8 format
     fp32_t* __restrict__ scales,       // Output scales for each token
-    const int64_t M,                  // Number of rows in the input tensor
     const int64_t N
 ) {
     const int32_t bid = blockIdx.x;
@@ -38,7 +37,7 @@ __global__ void device_per_token_quant_bf16_to_fp8_general(
         workspace1[i] = local_bf16;
 
         fp32_t tmp = cvt_bf16_f32(local_bf16);
-        local_max = fmaxf(local_max, tmp);
+        local_max = fmaxf(local_max, fabsf(tmp));
     }
 
     // Reduce the maximum value across the block
@@ -71,7 +70,6 @@ __global__ void device_per_token_quant_bf16_to_fp8_vpt(
     const bf16_t* __restrict__ input,  // Input tensor in BF16 format
     fp8_e4m3_t* __restrict__ output,   // Output tensor in FP8 format
     fp32_t* __restrict__ scales,       // Output scales for each token
-    const int64_t M,                  // Number of rows in the input tensor
     const int32_t N
 ) {
     constexpr int32_t VPT = 8;
@@ -147,8 +145,7 @@ template<int32_t TPB, int32_t N>
 __global__ void device_per_token_quant_bf16_to_fp8(
     const bf16_t* __restrict__ input,  // Input tensor in BF16 format
     fp8_e4m3_t* __restrict__ output,   // Output tensor in FP8 format
-    fp32_t* __restrict__ scales,       // Output scales for each token
-    const int64_t M                  // Number of rows in the input tensor
+    fp32_t* __restrict__ scales       // Output scales for each token
 ) {
     constexpr int32_t VPT = 8;
 
@@ -243,8 +240,7 @@ void per_token_quant_bf16_fp8 (
             <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
                 PTR<bf16_t>(contiguous_input),
                 PTR<fp8_e4m3_t>(output),
-                PTR<fp32_t>(contiguous_scales),
-                M
+                PTR<fp32_t>(contiguous_scales)
             );
             break;
         case 32:
@@ -252,8 +248,7 @@ void per_token_quant_bf16_fp8 (
             <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
                 PTR<bf16_t>(contiguous_input),
                 PTR<fp8_e4m3_t>(output),
-                PTR<fp32_t>(contiguous_scales),
-                M
+                PTR<fp32_t>(contiguous_scales)
             );
             break;
         case 64:
@@ -261,8 +256,7 @@ void per_token_quant_bf16_fp8 (
             <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
                 PTR<bf16_t>(contiguous_input),
                 PTR<fp8_e4m3_t>(output),
-                PTR<fp32_t>(contiguous_scales),
-                M
+                PTR<fp32_t>(contiguous_scales)
             );
             break;
         case 512:
@@ -270,8 +264,7 @@ void per_token_quant_bf16_fp8 (
             <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
                 PTR<bf16_t>(contiguous_input),
                 PTR<fp8_e4m3_t>(output),
-                PTR<fp32_t>(contiguous_scales),
-                M
+                PTR<fp32_t>(contiguous_scales)
             );
             break;
         case 1024:
@@ -279,8 +272,7 @@ void per_token_quant_bf16_fp8 (
             <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
                 PTR<bf16_t>(contiguous_input),
                 PTR<fp8_e4m3_t>(output),
-                PTR<fp32_t>(contiguous_scales),
-                M
+                PTR<fp32_t>(contiguous_scales)
             );
             break;
         case 3200:
@@ -288,8 +280,7 @@ void per_token_quant_bf16_fp8 (
             <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
                 PTR<bf16_t>(contiguous_input),
                 PTR<fp8_e4m3_t>(output),
-                PTR<fp32_t>(contiguous_scales),
-                M
+                PTR<fp32_t>(contiguous_scales)
             );
             break;
         case 4096:
@@ -297,8 +288,7 @@ void per_token_quant_bf16_fp8 (
             <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
                 PTR<bf16_t>(contiguous_input),
                 PTR<fp8_e4m3_t>(output),
-                PTR<fp32_t>(contiguous_scales),
-                M
+                PTR<fp32_t>(contiguous_scales)
             );
             break;
         case 12800:
@@ -306,8 +296,7 @@ void per_token_quant_bf16_fp8 (
             <<<blocks, 256, 0, at::cuda::getCurrentCUDAStream()>>>(
                 PTR<bf16_t>(contiguous_input),
                 PTR<fp8_e4m3_t>(output),
-                PTR<fp32_t>(contiguous_scales),
-                M
+                PTR<fp32_t>(contiguous_scales)
             );
             break;
         default: {
@@ -319,7 +308,6 @@ void per_token_quant_bf16_fp8 (
                     PTR<bf16_t>(contiguous_input),
                     PTR<fp8_e4m3_t>(output),
                     PTR<fp32_t>(contiguous_scales),
-                    M,
                     N
                 );
             } else {
@@ -328,7 +316,6 @@ void per_token_quant_bf16_fp8 (
                     PTR<bf16_t>(contiguous_input),
                     PTR<fp8_e4m3_t>(output),
                     PTR<fp32_t>(contiguous_scales),
-                    M,
                     N
                 );
             }
@@ -339,4 +326,4 @@ void per_token_quant_bf16_fp8 (
 }
 
 } // namespace ops
-} // namespace lightllm
\ No newline at end of file
+} // namespace lightllm
diff --git a/lightllm-kernel/csrc/quant/per_token_quantize_bf16_int8.cu b/lightllm-kernel/csrc/quant/per_token_quantize_bf16_int8.cu
index 848f92580..0df97753c 100644
--- a/lightllm-kernel/csrc/quant/per_token_quantize_bf16_int8.cu
+++ b/lightllm-kernel/csrc/quant/per_token_quantize_bf16_int8.cu
@@ -13,7 +13,6 @@ __global__ void device_per_token_quant_bf16_to_int8_general(
     const bf16_t* __restrict__ input,  // Input tensor in BF16 format
     int8_t* __restrict__ output,   // Output tensor in INT8 format
     fp32_t* __restrict__ scales,       // Output scales for each token
-    const int64_t M,                  // Number of rows in the input tensor
     const int64_t N
 ) {
     const int32_t bid = blockIdx.x;
@@ -38,7 +37,7 @@ __global__ void device_per_token_quant_bf16_to_int8_general(
         workspace1[i] = local_bf16;
 
         fp32_t tmp = cvt_bf16_f32(local_bf16);
-        local_max = fmaxf(local_max, tmp);
+        local_max = fmaxf(local_max, fabsf(tmp));
     }
 
     // Reduce the maximum value across the block
@@ -71,7 +70,6 @@ __global__ void device_per_token_quant_bf16_to_int8_vpt(
     const bf16_t* __restrict__ input,  // Input tensor in BF16 format
     int8_t* __restrict__ output,   // Output tensor in INT8 format
     fp32_t* __restrict__ scales,       // Output scales for each token
-    const int64_t M,                  // Number of rows in the input tensor
     const int32_t N
 ) {
     constexpr int32_t VPT = 8;
@@ -145,8 +143,7 @@ template<int32_t TPB, int32_t N>
 __global__ void device_per_token_quant_bf16_to_int8(
     const bf16_t* __restrict__ input,  // Input tensor in BF16 format
     int8_t* __restrict__ output,   // Output tensor in INT8 format
-    fp32_t* __restrict__ scales,       // Output scales for each token
-    const int64_t M                  // Number of rows in the input tensor
+    fp32_t* __restrict__ scales       // Output scales for each token
 ) {
     constexpr int32_t VPT = 8;
 
@@ -239,8 +236,7 @@ void per_token_quant_bf16_int8 (
             <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
                 PTR<bf16_t>(contiguous_input),
                 PTR<int8_t>(output),
-                PTR<fp32_t>(contiguous_scales),
-                M
+                PTR<fp32_t>(contiguous_scales)
             );
             break;
         case 32:
@@ -248,8 +244,7 @@ void per_token_quant_bf16_int8 (
             <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
                 PTR<bf16_t>(contiguous_input),
                 PTR<int8_t>(output),
-                PTR<fp32_t>(contiguous_scales),
-                M
+                PTR<fp32_t>(contiguous_scales)
             );
             break;
         case 64:
@@ -257,8 +252,7 @@ void per_token_quant_bf16_int8 (
             <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
                 PTR<bf16_t>(contiguous_input),
                 PTR<int8_t>(output),
-                PTR<fp32_t>(contiguous_scales),
-                M
+                PTR<fp32_t>(contiguous_scales)
             );
             break;
         case 512:
@@ -266,8 +260,7 @@ void per_token_quant_bf16_int8 (
             <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
                 PTR<bf16_t>(contiguous_input),
                 PTR<int8_t>(output),
-                PTR<fp32_t>(contiguous_scales),
-                M
+                PTR<fp32_t>(contiguous_scales)
             );
             break;
         case 1024:
@@ -275,8 +268,7 @@ void per_token_quant_bf16_int8 (
             <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
                 PTR<bf16_t>(contiguous_input),
                 PTR<int8_t>(output),
-                PTR<fp32_t>(contiguous_scales),
-                M
+                PTR<fp32_t>(contiguous_scales)
             );
             break;
         case 3200:
@@ -284,8 +276,7 @@ void per_token_quant_bf16_int8 (
             <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
                 PTR<bf16_t>(contiguous_input),
                 PTR<int8_t>(output),
-                PTR<fp32_t>(contiguous_scales),
-                M
+                PTR<fp32_t>(contiguous_scales)
             );
             break;
         case 4096:
@@ -293,8 +284,7 @@ void per_token_quant_bf16_int8 (
             <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
                 PTR<bf16_t>(contiguous_input),
                 PTR<int8_t>(output),
-                PTR<fp32_t>(contiguous_scales),
-                M
+                PTR<fp32_t>(contiguous_scales)
             );
             break;
         case 12800:
@@ -302,8 +292,7 @@ void per_token_quant_bf16_int8 (
             <<<blocks, 256, 0, at::cuda::getCurrentCUDAStream()>>>(
                 PTR<bf16_t>(contiguous_input),
                 PTR<int8_t>(output),
-                PTR<fp32_t>(contiguous_scales),
-                M
+                PTR<fp32_t>(contiguous_scales)
             );
             break;
         default: {
@@ -315,7 +304,6 @@ void per_token_quant_bf16_int8 (
                     PTR<bf16_t>(contiguous_input),
                     PTR<int8_t>(output),
                     PTR<fp32_t>(contiguous_scales),
-                    M,
                     N
                 );
             } else {
@@ -324,7 +312,6 @@ void per_token_quant_bf16_int8 (
                     PTR<bf16_t>(contiguous_input),
                     PTR<int8_t>(output),
                     PTR<fp32_t>(contiguous_scales),
-                    M,
                     N
                 );
             }
@@ -335,4 +322,4 @@ void per_token_quant_bf16_int8 (
 }
 
 } // namespace ops
-} // namespace lightllm
\ No newline at end of file
+} // namespace lightllm
diff --git a/lightllm-kernel/test/quant/fp8_quant_test.py b/lightllm-kernel/test/quant/fp8_quant_test.py
index 48f50f7d8..e584e2fde 100755
--- a/lightllm-kernel/test/quant/fp8_quant_test.py
+++ b/lightllm-kernel/test/quant/fp8_quant_test.py
@@ -9,7 +9,7 @@ class TestQuantBF16(unittest.TestCase):
     def setUp(self):
         """Set up common test parameters."""
         self.tokens = [1024, 13325]
-        self.hiddenDims = [256, 511, 1023, 1024, 1025, 1032, 3200, 3201, 3208, 12800]
+        self.hiddenDims = [3, 256, 511, 1023, 1024, 1025, 1032, 3200, 3201, 3208, 12800]
         self.device = "cuda"
         self.dtype = torch.bfloat16
 
@@ -20,7 +20,7 @@ def test_accuracy(self):
                 with self.subTest(shape=[token, hiddenDim]):
                     input = torch.rand(size=[token, hiddenDim], device=self.device, dtype=self.dtype) - 0.5
                     y_real, scales_real = ops.scaled_fp8_quant(
-                        input.contiguous().cuda(self.device), scale=None, use_per_token_if_dynamic=True
+                        input.contiguous(), scale=None, use_per_token_if_dynamic=True
                     )
                     y_pred, scales_pred = per_token_quant_bf16_fp8(input)
                     self.assertTrue(
diff --git a/lightllm-kernel/test/quant/int8_quant_test.py b/lightllm-kernel/test/quant/int8_quant_test.py
index 5da9f08bf..1ab42546b 100644
--- a/lightllm-kernel/test/quant/int8_quant_test.py
+++ b/lightllm-kernel/test/quant/int8_quant_test.py
@@ -9,7 +9,7 @@ class TestQuantBF16(unittest.TestCase):
     def setUp(self):
         """Set up common test parameters."""
         self.tokens = [1024, 13325]
-        self.hiddenDims = [256, 257, 511, 1023, 1024, 1025, 1032, 3200, 3201, 3208, 12800]
+        self.hiddenDims = [3, 256, 257, 511, 1023, 1024, 1025, 1032, 3200, 3201, 3208, 12800]
         self.device = "cuda:2"
         self.dtype = torch.bfloat16
         torch.cuda.set_device(self.device)
@@ -21,7 +21,7 @@ def test_accuracy(self):
                 with self.subTest(shape=[token, hiddenDim]):
                     input = torch.rand(size=[token, hiddenDim], device=self.device, dtype=self.dtype) - 0.5
                     y_real, scales_real, _ = ops.scaled_int8_quant(
-                        input.contiguous().cuda(self.device)
+                        input.contiguous()
                     )
                     y_pred, scales_pred = per_token_quant_bf16_int8(input)
                     self.assertTrue(